path: root/contrib/libs/llvm12/lib/Transforms
author     orivej <orivej@yandex-team.ru>  2022-02-10 16:45:01 +0300
committer  Daniil Cherednik <dcherednik@yandex-team.ru>  2022-02-10 16:45:01 +0300
commit     2d37894b1b037cf24231090eda8589bbb44fb6fc (patch)
tree       be835aa92c6248212e705f25388ebafcf84bc7a1 /contrib/libs/llvm12/lib/Transforms
parent     718c552901d703c502ccbefdfc3c9028d608b947 (diff)
download   ydb-2d37894b1b037cf24231090eda8589bbb44fb6fc.tar.gz
Restoring authorship annotation for <orivej@yandex-team.ru>. Commit 2 of 2.
Diffstat (limited to 'contrib/libs/llvm12/lib/Transforms')
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp  744
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h  248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp  862
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make  38
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp  600
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make  36
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp  330
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp  2302
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp  4248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp  14100
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp  94
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp  416
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp  868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp  572
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp  350
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp  2246
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp  204
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp  328
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp  238
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp  3078
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp  2714
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp  920
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp  6380
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp  392
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp  1442
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp  276
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp  170
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp  248
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp  2012
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp  582
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp  378
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp  4484
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp  1902
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp  2610
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp  2758
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp  2354
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp  464
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp  186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp  3722
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp  176
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp  744
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp  288
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp  1092
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp  4404
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/IPO/ya.make  110
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp  4266
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp  6220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp  306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp  4422
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp  5052
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp  12026
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h  1468
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp  2886
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp  2888
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp  856
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp  2494
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp  5588
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp  2384
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp  2812
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp  4558
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp  7606
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make  64
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp  6774
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp  508
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h  588
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp  302
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp  4172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp  3234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp  2020
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp  2726
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp  882
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp  424
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp  2112
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp  256
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp  10020
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp  3674
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp  1036
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp  706
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp  1884
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp  1446
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp  158
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h  154
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc  194
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make  70
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h  284
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h  234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp  478
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h  164
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp  74
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h  186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp  282
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp  1312
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp  176
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp  4658
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp  338
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h  160
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp  184
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp  868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h  424
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make  52
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp  1490
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp  658
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp  412
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp  1178
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp  1966
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp  1648
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp  334
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp  4112
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp  788
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp  2776
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp  180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp  1100
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp  5314
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp  1106
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp  1852
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp  1752
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp  42
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp  3592
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp  3834
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp  2214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp  294
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp  5682
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp  4268
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp  48
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp  836
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp  520
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp  2160
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp  3220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp  3608
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp  514
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp  3186
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp  1394
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp  132
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp  2476
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp  3368
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp  234
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp  1516
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp  716
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp  11626
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp  1056
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp  2882
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp  3204
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp  1214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp  354
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp  352
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp  836
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp  180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp  3726
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp  172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp  216
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp  2316
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp  1868
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp  846
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp  1000
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp  8306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp  372
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp  1372
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp  5016
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp  150
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp  5776
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp  3874
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp  9268
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp  570
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp  1938
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp  2650
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp  6028
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp  518
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp  508
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp  1652
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp  678
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp  1420
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp  2026
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp  1676
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp  294
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Scalar/ya.make  182
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp  486
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp  304
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp  554
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp  1232
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp  2632
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp  960
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp  2632
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp  964
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp  330
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp  1180
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp  210
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp  496
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp  1728
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp  414
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp  3514
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp  812
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp  318
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp  998
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp  306
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp  328
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp  188
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp  1442
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp  656
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp  1090
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp  1920
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp  668
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp  384
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp  250
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp  326
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp  4752
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp  1346
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp  932
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp  1124
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp  5626
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp  1474
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp  1872
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp  1808
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp  1944
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp  1914
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp  2954
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp  548
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp  192
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp  934
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp  814
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp  232
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp  166
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp  640
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp  240
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp  1682
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp  1908
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp  952
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp  380
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp  214
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp  4540
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp  10980
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp  1928
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp  6938
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp  222
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp  568
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp  118
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp  70
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp  1174
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp  140
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp  412
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp  192
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp  122
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp  1220
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp  2254
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Utils/ya.make  172
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp  2608
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp  2456
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h  580
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp  14244
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp  13140
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h  310
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp  1820
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h  3528
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp  708
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h  142
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h  88
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp  492
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h  148
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp  918
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp  164
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h  66
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h  394
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp  256
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h  82
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp  1342
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp  84
-rw-r--r--  contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make  58
268 files changed, 242637 insertions(+), 242637 deletions(-)
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index cee0726d70..a7ae10d156 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -1,96 +1,96 @@
-//===- AggressiveInstCombine.cpp ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the aggressive expression pattern combiner classes.
-// Currently, it handles expression patterns for:
-// * Truncate instruction
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "AggressiveInstCombineInternal.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/AggressiveInstCombine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- AggressiveInstCombine.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the aggressive expression pattern combiner classes.
+// Currently, it handles expression patterns for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "AggressiveInstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/AggressiveInstCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "aggressive-instcombine"
-
-STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
-STATISTIC(NumGuardedRotates,
- "Number of guarded rotates transformed into funnel shifts");
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
+STATISTIC(NumGuardedRotates,
+ "Number of guarded rotates transformed into funnel shifts");
STATISTIC(NumGuardedFunnelShifts,
"Number of guarded funnel shifts transformed into funnel shifts");
-STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
-
-namespace {
-/// Contains expression pattern combiner logic.
-/// This class provides both the logic to combine expression patterns and
-/// combine them. It differs from InstCombiner class in that each pattern
-/// combiner runs only once as opposed to InstCombine's multi-iteration,
-/// which allows pattern combiner to have higher complexity than the O(1)
-/// required by the instruction combiner.
-class AggressiveInstCombinerLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
- initializeAggressiveInstCombinerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
- /// Run all expression pattern optimizations on the given /p F function.
- ///
- /// \param F function to optimize.
- /// \returns true if the IR is changed.
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
+STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
+
+namespace {
+/// Contains expression pattern combiner logic.
+/// This class provides both the logic to combine expression patterns and
+/// combine them. It differs from InstCombiner class in that each pattern
+/// combiner runs only once as opposed to InstCombine's multi-iteration,
+/// which allows pattern combiner to have higher complexity than the O(1)
+/// required by the instruction combiner.
+class AggressiveInstCombinerLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ AggressiveInstCombinerLegacyPass() : FunctionPass(ID) {
+ initializeAggressiveInstCombinerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Run all expression pattern optimizations on the given /p F function.
+ ///
+ /// \param F function to optimize.
+ /// \returns true if the IR is changed.
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
/// Match a pattern for a bitwise funnel/rotate operation that partially guards
/// against undefined behavior by branching around the funnel-shift/rotation
/// when the shift amount is 0.
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
- if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
- return false;
-
- // As with the one-use checks below, this is not strictly necessary, but we
- // are being cautious to avoid potential perf regressions on targets that
+ if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
+ return false;
+
+ // As with the one-use checks below, this is not strictly necessary, but we
+ // are being cautious to avoid potential perf regressions on targets that
// do not actually have a funnel/rotate instruction (where the funnel shift
// would be expanded back into math/shift/logic ops).
- if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
- return false;
-
+ if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
+ return false;
+
// Match V to funnel shift left/right and capture the source operands and
// shift amount.
auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
Value *&ShAmt) {
Value *SubAmt;
- unsigned Width = V->getType()->getScalarSizeInBits();
-
+ unsigned Width = V->getType()->getScalarSizeInBits();
+
// fshl(ShVal0, ShVal1, ShAmt)
// == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
if (match(V, m_OneUse(m_c_Or(
@@ -99,8 +99,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
m_Sub(m_SpecificInt(Width), m_Value(SubAmt))))))) {
if (ShAmt == SubAmt) // TODO: Use m_Specific
return Intrinsic::fshl;
- }
-
+ }
+
// fshr(ShVal0, ShVal1, ShAmt)
// == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
if (match(V,
@@ -109,19 +109,19 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
m_LShr(m_Value(ShVal1), m_Value(ShAmt)))))) {
if (ShAmt == SubAmt) // TODO: Use m_Specific
return Intrinsic::fshr;
- }
-
- return Intrinsic::not_intrinsic;
- };
-
+ }
+
+ return Intrinsic::not_intrinsic;
+ };
+
// One phi operand must be a funnel/rotate operation, and the other phi
// operand must be the source value of that funnel/rotate operation:
// phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
// phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
// phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
- PHINode &Phi = cast<PHINode>(I);
+ PHINode &Phi = cast<PHINode>(I);
unsigned FunnelOp = 0, GuardOp = 1;
- Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
+ Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
Value *ShVal0, *ShVal1, *ShAmt;
Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
if (IID == Intrinsic::not_intrinsic ||
@@ -131,33 +131,33 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
if (IID == Intrinsic::not_intrinsic ||
(IID == Intrinsic::fshl && ShVal0 != P0) ||
(IID == Intrinsic::fshr && ShVal1 != P0))
- return false;
- assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
- "Pattern must match funnel shift left or right");
+ return false;
+ assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
+ "Pattern must match funnel shift left or right");
std::swap(FunnelOp, GuardOp);
- }
-
- // The incoming block with our source operand must be the "guard" block.
+ }
+
+ // The incoming block with our source operand must be the "guard" block.
// That must contain a cmp+branch to avoid the funnel/rotate when the shift
// amount is equal to 0. The other incoming block is the block with the
// funnel/rotate.
BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
- Instruction *TermI = GuardBB->getTerminator();
+ Instruction *TermI = GuardBB->getTerminator();
// Ensure that the shift values dominate each block.
if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
return false;
- ICmpInst::Predicate Pred;
- BasicBlock *PhiBB = Phi.getParent();
+ ICmpInst::Predicate Pred;
+ BasicBlock *PhiBB = Phi.getParent();
if (!match(TermI, m_Br(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()),
m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
- return false;
-
- if (Pred != CmpInst::ICMP_EQ)
- return false;
-
+ return false;
+
+ if (Pred != CmpInst::ICMP_EQ)
+ return false;
+
IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
if (ShVal0 == ShVal1)
@@ -175,8 +175,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
ShVal0 = Builder.CreateFreeze(ShVal0);
}
- // We matched a variation of this IR pattern:
- // GuardBB:
+ // We matched a variation of this IR pattern:
+ // GuardBB:
// %cmp = icmp eq i32 %ShAmt, 0
// br i1 %cmp, label %PhiBB, label %FunnelBB
// FunnelBB:
@@ -184,280 +184,280 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
// %shr = lshr i32 %ShVal1, %sub
// %shl = shl i32 %ShVal0, %ShAmt
// %fsh = or i32 %shr, %shl
- // br label %PhiBB
- // PhiBB:
+ // br label %PhiBB
+ // PhiBB:
// %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
- // -->
+ // -->
// llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
- Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
+ Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
- return true;
-}
-
-/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
-/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
-/// of 'and' ops, then we also need to capture the fact that we saw an
-/// "and X, 1", so that's an extra return value for that case.
-struct MaskOps {
- Value *Root;
- APInt Mask;
- bool MatchAndChain;
- bool FoundAnd1;
-
- MaskOps(unsigned BitWidth, bool MatchAnds)
- : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
- MatchAndChain(MatchAnds), FoundAnd1(false) {}
-};
-
-/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
-/// chain of 'and' or 'or' instructions looking for shift ops of a common source
-/// value. Examples:
-/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
-/// returns { X, 0x129 }
-/// and (and (X >> 1), 1), (X >> 4)
-/// returns { X, 0x12 }
-static bool matchAndOrChain(Value *V, MaskOps &MOps) {
- Value *Op0, *Op1;
- if (MOps.MatchAndChain) {
- // Recurse through a chain of 'and' operands. This requires an extra check
- // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
- // in the chain to know that all of the high bits are cleared.
- if (match(V, m_And(m_Value(Op0), m_One()))) {
- MOps.FoundAnd1 = true;
- return matchAndOrChain(Op0, MOps);
- }
- if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
- return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
- } else {
- // Recurse through a chain of 'or' operands.
- if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
- return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
- }
-
- // We need a shift-right or a bare value representing a compare of bit 0 of
- // the original source operand.
- Value *Candidate;
+ return true;
+}
+
+/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
+/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
+/// of 'and' ops, then we also need to capture the fact that we saw an
+/// "and X, 1", so that's an extra return value for that case.
+struct MaskOps {
+ Value *Root;
+ APInt Mask;
+ bool MatchAndChain;
+ bool FoundAnd1;
+
+ MaskOps(unsigned BitWidth, bool MatchAnds)
+ : Root(nullptr), Mask(APInt::getNullValue(BitWidth)),
+ MatchAndChain(MatchAnds), FoundAnd1(false) {}
+};
+
+/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
+/// chain of 'and' or 'or' instructions looking for shift ops of a common source
+/// value. Examples:
+/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
+/// returns { X, 0x129 }
+/// and (and (X >> 1), 1), (X >> 4)
+/// returns { X, 0x12 }
+static bool matchAndOrChain(Value *V, MaskOps &MOps) {
+ Value *Op0, *Op1;
+ if (MOps.MatchAndChain) {
+ // Recurse through a chain of 'and' operands. This requires an extra check
+ // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
+ // in the chain to know that all of the high bits are cleared.
+ if (match(V, m_And(m_Value(Op0), m_One()))) {
+ MOps.FoundAnd1 = true;
+ return matchAndOrChain(Op0, MOps);
+ }
+ if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ } else {
+ // Recurse through a chain of 'or' operands.
+ if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
+ return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
+ }
+
+ // We need a shift-right or a bare value representing a compare of bit 0 of
+ // the original source operand.
+ Value *Candidate;
const APInt *BitIndex = nullptr;
if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
- Candidate = V;
-
- // Initialize result source operand.
- if (!MOps.Root)
- MOps.Root = Candidate;
-
- // The shift constant is out-of-range? This code hasn't been simplified.
+ Candidate = V;
+
+ // Initialize result source operand.
+ if (!MOps.Root)
+ MOps.Root = Candidate;
+
+ // The shift constant is out-of-range? This code hasn't been simplified.
if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
- return false;
-
- // Fill in the mask bit derived from the shift constant.
+ return false;
+
+ // Fill in the mask bit derived from the shift constant.
MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
- return MOps.Root == Candidate;
-}
-
-/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
-/// These will include a chain of 'or' or 'and'-shifted bits from a
-/// common source value:
-/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
-/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
-/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
-/// that differ only with a final 'not' of the result. We expect that final
-/// 'not' to be folded with the compare that we create here (invert predicate).
-static bool foldAnyOrAllBitsSet(Instruction &I) {
- // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
- // final "and X, 1" instruction must be the final op in the sequence.
- bool MatchAllBitsSet;
- if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
- MatchAllBitsSet = true;
- else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
- MatchAllBitsSet = false;
- else
- return false;
-
- MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
- if (MatchAllBitsSet) {
- if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
- return false;
- } else {
- if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
- return false;
- }
-
- // The pattern was found. Create a masked compare that replaces all of the
- // shift and logic ops.
- IRBuilder<> Builder(&I);
- Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
- Value *And = Builder.CreateAnd(MOps.Root, Mask);
- Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
- : Builder.CreateIsNotNull(And);
- Value *Zext = Builder.CreateZExt(Cmp, I.getType());
- I.replaceAllUsesWith(Zext);
- ++NumAnyOrAllBitsSet;
- return true;
-}
-
-// Try to recognize below function as popcount intrinsic.
-// This is the "best" algorithm from
-// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
-// Also used in TargetLowering::expandCTPOP().
-//
-// int popcount(unsigned int i) {
-// i = i - ((i >> 1) & 0x55555555);
-// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
-// i = ((i + (i >> 4)) & 0x0F0F0F0F);
-// return (i * 0x01010101) >> 24;
-// }
-static bool tryToRecognizePopCount(Instruction &I) {
- if (I.getOpcode() != Instruction::LShr)
- return false;
-
- Type *Ty = I.getType();
- if (!Ty->isIntOrIntVectorTy())
- return false;
-
- unsigned Len = Ty->getScalarSizeInBits();
- // FIXME: fix Len == 8 and other irregular type lengths.
- if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
- return false;
-
- APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
- APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
- APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
- APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
- APInt MaskShift = APInt(Len, Len - 8);
-
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *MulOp0;
- // Matching "(i * 0x01010101...) >> 24".
- if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
- match(Op1, m_SpecificInt(MaskShift))) {
- Value *ShiftOp0;
- // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
- if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
- m_Deferred(ShiftOp0)),
- m_SpecificInt(Mask0F)))) {
- Value *AndOp0;
- // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
- if (match(ShiftOp0,
- m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
- m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
- m_SpecificInt(Mask33))))) {
- Value *Root, *SubOp1;
- // Matching "i - ((i >> 1) & 0x55555555...)".
- if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
- match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
- m_SpecificInt(Mask55)))) {
- LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
- IRBuilder<> Builder(&I);
- Function *Func = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::ctpop, I.getType());
- I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
- ++NumPopCountRecognized;
- return true;
- }
- }
- }
- }
-
- return false;
-}
-
-/// This is the entry point for folds that could be implemented in regular
-/// InstCombine, but they are separated because they are not expected to
-/// occur frequently and/or have more than a constant-length pattern match.
-static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- // Do not delete instructions under here and invalidate the iterator.
- // Walk the block backwards for efficiency. We're matching a chain of
- // use->defs, so we're more likely to succeed by starting from the bottom.
- // Also, we want to avoid matching partial patterns.
- // TODO: It would be more efficient if we removed dead instructions
- // iteratively in this loop rather than waiting until the end.
- for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
- MadeChange |= foldAnyOrAllBitsSet(I);
+ return MOps.Root == Candidate;
+}
+
+/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
+/// These will include a chain of 'or' or 'and'-shifted bits from a
+/// common source value:
+/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
+/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
+/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
+/// that differ only with a final 'not' of the result. We expect that final
+/// 'not' to be folded with the compare that we create here (invert predicate).
+static bool foldAnyOrAllBitsSet(Instruction &I) {
+ // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
+ // final "and X, 1" instruction must be the final op in the sequence.
+ bool MatchAllBitsSet;
+ if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value())))
+ MatchAllBitsSet = true;
+ else if (match(&I, m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One())))
+ MatchAllBitsSet = false;
+ else
+ return false;
+
+ MaskOps MOps(I.getType()->getScalarSizeInBits(), MatchAllBitsSet);
+ if (MatchAllBitsSet) {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I), MOps) || !MOps.FoundAnd1)
+ return false;
+ } else {
+ if (!matchAndOrChain(cast<BinaryOperator>(&I)->getOperand(0), MOps))
+ return false;
+ }
+
+ // The pattern was found. Create a masked compare that replaces all of the
+ // shift and logic ops.
+ IRBuilder<> Builder(&I);
+ Constant *Mask = ConstantInt::get(I.getType(), MOps.Mask);
+ Value *And = Builder.CreateAnd(MOps.Root, Mask);
+ Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
+ : Builder.CreateIsNotNull(And);
+ Value *Zext = Builder.CreateZExt(Cmp, I.getType());
+ I.replaceAllUsesWith(Zext);
+ ++NumAnyOrAllBitsSet;
+ return true;
+}
+
+// Try to recognize below function as popcount intrinsic.
+// This is the "best" algorithm from
+// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+// Also used in TargetLowering::expandCTPOP().
+//
+// int popcount(unsigned int i) {
+// i = i - ((i >> 1) & 0x55555555);
+// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
+// i = ((i + (i >> 4)) & 0x0F0F0F0F);
+// return (i * 0x01010101) >> 24;
+// }
+static bool tryToRecognizePopCount(Instruction &I) {
+ if (I.getOpcode() != Instruction::LShr)
+ return false;
+
+ Type *Ty = I.getType();
+ if (!Ty->isIntOrIntVectorTy())
+ return false;
+
+ unsigned Len = Ty->getScalarSizeInBits();
+ // FIXME: fix Len == 8 and other irregular type lengths.
+ if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
+ return false;
+
+ APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
+ APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
+ APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
+ APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
+ APInt MaskShift = APInt(Len, Len - 8);
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *MulOp0;
+ // Matching "(i * 0x01010101...) >> 24".
+ if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
+ match(Op1, m_SpecificInt(MaskShift))) {
+ Value *ShiftOp0;
+ // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
+ if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
+ m_Deferred(ShiftOp0)),
+ m_SpecificInt(Mask0F)))) {
+ Value *AndOp0;
+ // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
+ if (match(ShiftOp0,
+ m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
+ m_And(m_LShr(m_Deferred(AndOp0), m_SpecificInt(2)),
+ m_SpecificInt(Mask33))))) {
+ Value *Root, *SubOp1;
+ // Matching "i - ((i >> 1) & 0x55555555...)".
+ if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
+ match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
+ m_SpecificInt(Mask55)))) {
+ LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
+ IRBuilder<> Builder(&I);
+ Function *Func = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::ctpop, I.getType());
+ I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+ ++NumPopCountRecognized;
+ return true;
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// This is the entry point for folds that could be implemented in regular
+/// InstCombine, but they are separated because they are not expected to
+/// occur frequently and/or have more than a constant-length pattern match.
+static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block backwards for efficiency. We're matching a chain of
+ // use->defs, so we're more likely to succeed by starting from the bottom.
+ // Also, we want to avoid matching partial patterns.
+ // TODO: It would be more efficient if we removed dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
+ MadeChange |= foldAnyOrAllBitsSet(I);
MadeChange |= foldGuardedFunnelShift(I, DT);
- MadeChange |= tryToRecognizePopCount(I);
- }
- }
-
- // We're done with transforms, so remove dead instructions.
- if (MadeChange)
- for (BasicBlock &BB : F)
- SimplifyInstructionsInBlock(&BB);
-
- return MadeChange;
-}
-
-/// This is the entry point for all transforms. Pass manager differences are
-/// handled in the callers of this function.
-static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
- bool MadeChange = false;
- const DataLayout &DL = F.getParent()->getDataLayout();
- TruncInstCombine TIC(TLI, DL, DT);
- MadeChange |= TIC.run(F);
- MadeChange |= foldUnusualPatterns(F, DT);
- return MadeChange;
-}
-
-void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
- AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
-bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return runImpl(F, TLI, DT);
-}
-
-PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, TLI, DT)) {
- // No changes, all analyses are preserved.
- return PreservedAnalyses::all();
- }
- // Mark all the analyses that instcombine updates as preserved.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char AggressiveInstCombinerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
- "aggressive-instcombine",
- "Combine pattern based expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
- "Combine pattern based expressions", false, false)
-
-// Initialization Routines
-void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
- initializeAggressiveInstCombinerLegacyPassPass(Registry);
-}
-
-void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
- initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
-}
-
-FunctionPass *llvm::createAggressiveInstCombinerPass() {
- return new AggressiveInstCombinerLegacyPass();
-}
-
-void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAggressiveInstCombinerPass());
-}
+ MadeChange |= tryToRecognizePopCount(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+static bool runImpl(Function &F, TargetLibraryInfo &TLI, DominatorTree &DT) {
+ bool MadeChange = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ TruncInstCombine TIC(TLI, DL, DT);
+ MadeChange |= TIC.run(F);
+ MadeChange |= foldUnusualPatterns(F, DT);
+ return MadeChange;
+}
+
+void AggressiveInstCombinerLegacyPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+bool AggressiveInstCombinerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return runImpl(F, TLI, DT);
+}
+
+PreservedAnalyses AggressiveInstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT)) {
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+ }
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char AggressiveInstCombinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AggressiveInstCombinerLegacyPass,
+ "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AggressiveInstCombinerLegacyPass, "aggressive-instcombine",
+ "Combine pattern based expressions", false, false)
+
+// Initialization Routines
+void llvm::initializeAggressiveInstCombine(PassRegistry &Registry) {
+ initializeAggressiveInstCombinerLegacyPassPass(Registry);
+}
+
+void LLVMInitializeAggressiveInstCombiner(LLVMPassRegistryRef R) {
+ initializeAggressiveInstCombinerLegacyPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createAggressiveInstCombinerPass() {
+ return new AggressiveInstCombinerLegacyPass();
+}
+
+void LLVMAddAggressiveInstCombinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveInstCombinerPass());
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 92620bde16..42bcadfc7d 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -1,124 +1,124 @@
-//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the instruction pattern combiner classes.
-// Currently, it handles pattern expressions for:
-// * Truncate instruction
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
-#define LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-
-using namespace llvm;
-
-//===----------------------------------------------------------------------===//
-// TruncInstCombine - looks for expression dags dominated by trunc instructions
-// and for each eligible dag, it will create a reduced bit-width expression and
-// replace the old expression with this new one and remove the old one.
-// Eligible expression dag is such that:
-// 1. Contains only supported instructions.
-// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
-// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
-// 4. All instructions in the dag must not have users outside the dag.
-// Only exception is for {ZExt, SExt}Inst with operand type equal to the
-// new reduced type chosen in (3).
-//
-// The motivation for this optimization is that evaluating and expression using
-// smaller bit-width is preferable, especially for vectorization where we can
-// fit more values in one vectorized instruction. In addition, this optimization
-// may decrease the number of cast instructions, but will not increase it.
-//===----------------------------------------------------------------------===//
-
-namespace llvm {
- class DataLayout;
- class DominatorTree;
- class Function;
- class Instruction;
- class TargetLibraryInfo;
- class TruncInst;
- class Type;
- class Value;
-
-class TruncInstCombine {
- TargetLibraryInfo &TLI;
- const DataLayout &DL;
- const DominatorTree &DT;
-
- /// List of all TruncInst instructions to be processed.
- SmallVector<TruncInst *, 4> Worklist;
-
- /// Current processed TruncInst instruction.
- TruncInst *CurrentTruncInst;
-
- /// Information per each instruction in the expression dag.
- struct Info {
- /// Number of LSBs that are needed to generate a valid expression.
- unsigned ValidBitWidth = 0;
- /// Minimum number of LSBs needed to generate the ValidBitWidth.
- unsigned MinBitWidth = 0;
- /// The reduced value generated to replace the old instruction.
- Value *NewValue = nullptr;
- };
- /// An ordered map representing expression dag post-dominated by current
- /// processed TruncInst. It maps each instruction in the dag to its Info
- /// structure. The map is ordered such that each instruction appears before
- /// all other instructions in the dag that uses it.
- MapVector<Instruction *, Info> InstInfoMap;
-
-public:
- TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
- const DominatorTree &DT)
- : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
-
- /// Perform TruncInst pattern optimization on given function.
- bool run(Function &F);
-
-private:
- /// Build expression dag dominated by the /p CurrentTruncInst and append it to
- /// the InstInfoMap container.
- ///
- /// \return true only if succeed to generate an eligible sub expression dag.
- bool buildTruncExpressionDag();
-
- /// Calculate the minimal allowed bit-width of the chain ending with the
- /// currently visited truncate's operand.
- ///
- /// \return minimum number of bits to which the chain ending with the
- /// truncate's operand can be shrunk to.
- unsigned getMinBitWidth();
-
- /// Build an expression dag dominated by the current processed TruncInst and
- /// Check if it is eligible to be reduced to a smaller type.
- ///
- /// \return the scalar version of the new type to be used for the reduced
- /// expression dag, or nullptr if the expression dag is not eligible
- /// to be reduced.
- Type *getBestTruncatedType();
-
- /// Given a \p V value and a \p SclTy scalar type return the generated reduced
- /// value of \p V based on the type \p SclTy.
- ///
- /// \param V value to be reduced.
- /// \param SclTy scalar version of new type to reduce to.
- /// \return the new reduced value.
- Value *getReducedOperand(Value *V, Type *SclTy);
-
- /// Create a new expression dag using the reduced \p SclTy type and replace
- /// the old expression dag with it. Also erase all instructions in the old
- /// dag, except those that are still needed outside the dag.
- ///
- /// \param SclTy scalar version of new type to reduce expression dag into.
- void ReduceExpressionDag(Type *SclTy);
-};
-} // end namespace llvm.
-
-#endif
+//===- AggressiveInstCombineInternal.h --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the instruction pattern combiner classes.
+// Currently, it handles pattern expressions for:
+// * Truncate instruction
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// TruncInstCombine - looks for expression dags dominated by trunc instructions
+// and for each eligible dag, it will create a reduced bit-width expression and
+// replace the old expression with this new one and remove the old one.
+// Eligible expression dag is such that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width (or Trunc type).
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to the
+// new reduced type chosen in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+ class DataLayout;
+ class DominatorTree;
+ class Function;
+ class Instruction;
+ class TargetLibraryInfo;
+ class TruncInst;
+ class Type;
+ class Value;
+
+class TruncInstCombine {
+ TargetLibraryInfo &TLI;
+ const DataLayout &DL;
+ const DominatorTree &DT;
+
+ /// List of all TruncInst instructions to be processed.
+ SmallVector<TruncInst *, 4> Worklist;
+
+ /// Currently processed TruncInst instruction.
+ TruncInst *CurrentTruncInst;
+
+ /// Information for each instruction in the expression dag.
+ struct Info {
+ /// Number of LSBs that are needed to generate a valid expression.
+ unsigned ValidBitWidth = 0;
+ /// Minimum number of LSBs needed to generate the ValidBitWidth.
+ unsigned MinBitWidth = 0;
+ /// The reduced value generated to replace the old instruction.
+ Value *NewValue = nullptr;
+ };
+ /// An ordered map representing the expression dag post-dominated by the
+ /// currently processed TruncInst. It maps each instruction in the dag to its
+ /// Info structure. The map is ordered such that each instruction appears
+ /// before all other instructions in the dag that use it.
+ MapVector<Instruction *, Info> InstInfoMap;
+
+public:
+ TruncInstCombine(TargetLibraryInfo &TLI, const DataLayout &DL,
+ const DominatorTree &DT)
+ : TLI(TLI), DL(DL), DT(DT), CurrentTruncInst(nullptr) {}
+
+ /// Perform TruncInst pattern optimization on given function.
+ bool run(Function &F);
+
+private:
+ /// Build the expression dag dominated by \p CurrentTruncInst and append it to
+ /// the InstInfoMap container.
+ ///
+ /// \return true only if an eligible sub-expression dag was generated.
+ bool buildTruncExpressionDag();
+
+ /// Calculate the minimal allowed bit-width of the chain ending with the
+ /// currently visited truncate's operand.
+ ///
+ /// \return the minimum number of bits to which the chain ending with the
+ /// truncate's operand can be shrunk.
+ unsigned getMinBitWidth();
+
+ /// Build an expression dag dominated by the currently processed TruncInst and
+ /// check if it is eligible to be reduced to a smaller type.
+ ///
+ /// \return the scalar version of the new type to be used for the reduced
+ /// expression dag, or nullptr if the expression dag is not eligible
+ /// to be reduced.
+ Type *getBestTruncatedType();
+
+ /// Given a \p V value and a \p SclTy scalar type, return the generated reduced
+ /// value of \p V based on the type \p SclTy.
+ ///
+ /// \param V value to be reduced.
+ /// \param SclTy scalar version of new type to reduce to.
+ /// \return the new reduced value.
+ Value *getReducedOperand(Value *V, Type *SclTy);
+
+ /// Create a new expression dag using the reduced \p SclTy type and replace
+ /// the old expression dag with it. Also erase all instructions in the old
+ /// dag, except those that are still needed outside the dag.
+ ///
+ /// \param SclTy scalar version of new type to reduce expression dag into.
+ void ReduceExpressionDag(Type *SclTy);
+};
+} // end namespace llvm.
+
+#endif
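
For orientation, here is a minimal, hypothetical C++ snippet (not part of this patch; the function name blend is assumed) showing the kind of pattern the header above targets: the byte operands are promoted to 32-bit integers, combined, and the result is truncated back to 8 bits, so the whole trunc-dominated expression dag can be evaluated in the narrower type.

#include <cstdint>

// Assumed example: in straightforward IR, A, B and C are zero-extended to
// i32, the add and mul are performed in i32, and the result is truncated to
// i8. That trunc-dominated dag is exactly what TruncInstCombine can shrink
// back to 8-bit arithmetic.
uint8_t blend(uint8_t A, uint8_t B, uint8_t C) {
  return static_cast<uint8_t>((A + B) * C);
}
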
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 193057aaab..16b82219e8 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -1,436 +1,436 @@
-//===- TruncInstCombine.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
-// for each eligible dag, it will create a reduced bit-width expression, replace
-// the old expression with this new one and remove the old expression.
-// Eligible expression dag is such that:
-// 1. Contains only supported instructions.
-// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
-// 3. Can be evaluated into type with reduced legal bit-width.
-// 4. All instructions in the dag must not have users outside the dag.
-// The only exception is for {ZExt, SExt}Inst with operand type equal to
-// the new reduced type evaluated in (3).
-//
-// The motivation for this optimization is that evaluating an expression using
-// a smaller bit-width is preferable, especially for vectorization where we can
-// fit more values in one vectorized instruction. In addition, this optimization
-// may decrease the number of cast instructions, but will not increase it.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AggressiveInstCombineInternal.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
+//===- TruncInstCombine.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// TruncInstCombine - looks for expression dags post-dominated by TruncInst and
+// for each eligible dag, it will create a reduced bit-width expression, replace
+// the old expression with this new one and remove the old expression.
+// Eligible expression dag is such that:
+// 1. Contains only supported instructions.
+// 2. Supported leaves: ZExtInst, SExtInst, TruncInst and Constant value.
+// 3. Can be evaluated into type with reduced legal bit-width.
+// 4. All instructions in the dag must not have users outside the dag.
+// The only exception is for {ZExt, SExt}Inst with operand type equal to
+// the new reduced type evaluated in (3).
+//
+// The motivation for this optimization is that evaluating an expression using
+// a smaller bit-width is preferable, especially for vectorization where we can
+// fit more values in one vectorized instruction. In addition, this optimization
+// may decrease the number of cast instructions, but will not increase it.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveInstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "aggressive-instcombine"
-
-STATISTIC(
- NumDAGsReduced,
- "Number of truncations eliminated by reducing bit width of expression DAG");
-STATISTIC(NumInstrsReduced,
- "Number of instructions whose bit width was reduced");
-
-/// Given an instruction and a container, it fills all the relevant operands of
-/// that instruction, with respect to the Trunc expression dag optimization.
-static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // These CastInsts are considered leaves of the evaluated expression; thus,
- // their operands are not relevant.
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- Ops.push_back(I->getOperand(0));
- Ops.push_back(I->getOperand(1));
- break;
- case Instruction::Select:
- Ops.push_back(I->getOperand(1));
- Ops.push_back(I->getOperand(2));
- break;
- default:
- llvm_unreachable("Unreachable!");
- }
-}
-
-bool TruncInstCombine::buildTruncExpressionDag() {
- SmallVector<Value *, 8> Worklist;
- SmallVector<Instruction *, 8> Stack;
- // Clear old expression dag.
- InstInfoMap.clear();
-
- Worklist.push_back(CurrentTruncInst->getOperand(0));
-
- while (!Worklist.empty()) {
- Value *Curr = Worklist.back();
-
- if (isa<Constant>(Curr)) {
- Worklist.pop_back();
- continue;
- }
-
- auto *I = dyn_cast<Instruction>(Curr);
- if (!I)
- return false;
-
- if (!Stack.empty() && Stack.back() == I) {
- // Already handled all instruction operands, can remove it from both the
- // Worklist and the Stack, and add it to the instruction info map.
- Worklist.pop_back();
- Stack.pop_back();
- // Insert I to the Info map.
- InstInfoMap.insert(std::make_pair(I, Info()));
- continue;
- }
-
- if (InstInfoMap.count(I)) {
- Worklist.pop_back();
- continue;
- }
-
- // Add the instruction to the stack before starting to handle its operands.
- Stack.push_back(I);
-
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // trunc(trunc(x)) -> trunc(x)
- // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
- // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
- // dest
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Select: {
- SmallVector<Value *, 2> Operands;
- getRelevantOperands(I, Operands);
+#include "llvm/IR/Instruction.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aggressive-instcombine"
+
+STATISTIC(
+ NumDAGsReduced,
+ "Number of truncations eliminated by reducing bit width of expression DAG");
+STATISTIC(NumInstrsReduced,
+ "Number of instructions whose bit width was reduced");
+
+/// Given an instruction and a container, it fills all the relevant operands of
+/// that instruction, with respect to the Trunc expression dag optimization.
+static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // These CastInsts are considered leaves of the evaluated expression; thus,
+ // their operands are not relevant.
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ Ops.push_back(I->getOperand(0));
+ Ops.push_back(I->getOperand(1));
+ break;
+ case Instruction::Select:
+ Ops.push_back(I->getOperand(1));
+ Ops.push_back(I->getOperand(2));
+ break;
+ default:
+ llvm_unreachable("Unreachable!");
+ }
+}
+
+bool TruncInstCombine::buildTruncExpressionDag() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+ // Clear old expression dag.
+ InstInfoMap.clear();
+
+ Worklist.push_back(CurrentTruncInst->getOperand(0));
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ auto *I = dyn_cast<Instruction>(Curr);
+ if (!I)
+ return false;
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and add it to the instruction info map.
+ Worklist.pop_back();
+ Stack.pop_back();
+ // Insert I to the Info map.
+ InstInfoMap.insert(std::make_pair(I, Info()));
+ continue;
+ }
+
+ if (InstInfoMap.count(I)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Add the instruction to the stack before starting to handle its operands.
+ Stack.push_back(I);
+
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(trunc(x)) -> trunc(x)
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new
+ // dest
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Select: {
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
append_range(Worklist, Operands);
- break;
- }
- default:
- // TODO: Can handle more cases here:
- // 1. shufflevector, extractelement, insertelement
- // 2. udiv, urem
- // 3. shl, lshr, ashr
- // 4. phi node (and loop handling)
- // ...
- return false;
- }
- }
- return true;
-}
-
-unsigned TruncInstCombine::getMinBitWidth() {
- SmallVector<Value *, 8> Worklist;
- SmallVector<Instruction *, 8> Stack;
-
- Value *Src = CurrentTruncInst->getOperand(0);
- Type *DstTy = CurrentTruncInst->getType();
- unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
- unsigned OrigBitWidth =
- CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
-
- if (isa<Constant>(Src))
- return TruncBitWidth;
-
- Worklist.push_back(Src);
- InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
-
- while (!Worklist.empty()) {
- Value *Curr = Worklist.back();
-
- if (isa<Constant>(Curr)) {
- Worklist.pop_back();
- continue;
- }
-
- // Otherwise, it must be an instruction.
- auto *I = cast<Instruction>(Curr);
-
- auto &Info = InstInfoMap[I];
-
- SmallVector<Value *, 2> Operands;
- getRelevantOperands(I, Operands);
-
- if (!Stack.empty() && Stack.back() == I) {
- // Already handled all instruction operands, can remove it from both the
- // Worklist and the Stack, and update MinBitWidth.
- Worklist.pop_back();
- Stack.pop_back();
- for (auto *Operand : Operands)
- if (auto *IOp = dyn_cast<Instruction>(Operand))
- Info.MinBitWidth =
- std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
- continue;
- }
-
- // Add the instruction to the stack before starting to handle its operands.
- Stack.push_back(I);
- unsigned ValidBitWidth = Info.ValidBitWidth;
-
- // Update minimum bit-width before handling its operands. This is required
- // when the instruction is part of a loop.
- Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
-
- for (auto *Operand : Operands)
- if (auto *IOp = dyn_cast<Instruction>(Operand)) {
- // If we already calculated the minimum bit-width for this valid
- // bit-width, or for a smaller valid bit-width, then just keep the
- // answer we already calculated.
- unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
- if (IOpBitwidth >= ValidBitWidth)
- continue;
- InstInfoMap[IOp].ValidBitWidth = ValidBitWidth;
- Worklist.push_back(IOp);
- }
- }
- unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
- assert(MinBitWidth >= TruncBitWidth);
-
- if (MinBitWidth > TruncBitWidth) {
- // In this case reducing expression with vector type might generate a new
- // vector type, which is not preferable as it might result in generating
- // sub-optimal code.
- if (DstTy->isVectorTy())
- return OrigBitWidth;
- // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
- Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
- // Update minimum bit-width with the new destination type bit-width if
- // succeeded to find such, otherwise, with original bit-width.
- MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
- } else { // MinBitWidth == TruncBitWidth
- // In this case the expression can be evaluated with the trunc instruction
- // destination type, and trunc instruction can be omitted. However, we
- // should not perform the evaluation if the original type is a legal scalar
- // type and the target type is illegal.
- bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
- bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
- if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
- return OrigBitWidth;
- }
- return MinBitWidth;
-}
-
-Type *TruncInstCombine::getBestTruncatedType() {
- if (!buildTruncExpressionDag())
- return nullptr;
-
- // We don't want to duplicate instructions, which isn't profitable. Thus, we
- // can't shrink something that has multiple users, unless all users are
- // post-dominated by the trunc instruction, i.e., were visited during the
- // expression evaluation.
- unsigned DesiredBitWidth = 0;
- for (auto Itr : InstInfoMap) {
- Instruction *I = Itr.first;
- if (I->hasOneUse())
- continue;
- bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
- for (auto *U : I->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
- if (!IsExtInst)
- return nullptr;
- // If this is an extension from the dest type, we can eliminate it,
- // even if it has multiple users. Thus, update the DesiredBitWidth and check
- // that all extension instructions agree on the same DesiredBitWidth.
- unsigned ExtInstBitWidth =
- I->getOperand(0)->getType()->getScalarSizeInBits();
- if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
- return nullptr;
- DesiredBitWidth = ExtInstBitWidth;
- }
- }
-
- unsigned OrigBitWidth =
- CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
-
- // Calculate the minimum bit-width allowed for shrinking the currently
- // visited truncate's operand.
- unsigned MinBitWidth = getMinBitWidth();
-
- // Check that we can shrink to a smaller bit-width than the original one and
- // that it matches the DesiredBitWidth, if such exists.
- if (MinBitWidth >= OrigBitWidth ||
- (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
- return nullptr;
-
- return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
-}
-
-/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
-/// for \p V, according to its type: if it is a vector type, return the vector
-/// version of \p Ty; otherwise, return \p Ty.
-static Type *getReducedType(Value *V, Type *Ty) {
- assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here:
+ // 1. shufflevector, extractelement, insertelement
+ // 2. udiv, urem
+ // 3. shl, lshr, ashr
+ // 4. phi node (and loop handling)
+ // ...
+ return false;
+ }
+ }
+ return true;
+}
+
+unsigned TruncInstCombine::getMinBitWidth() {
+ SmallVector<Value *, 8> Worklist;
+ SmallVector<Instruction *, 8> Stack;
+
+ Value *Src = CurrentTruncInst->getOperand(0);
+ Type *DstTy = CurrentTruncInst->getType();
+ unsigned TruncBitWidth = DstTy->getScalarSizeInBits();
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ if (isa<Constant>(Src))
+ return TruncBitWidth;
+
+ Worklist.push_back(Src);
+ InstInfoMap[cast<Instruction>(Src)].ValidBitWidth = TruncBitWidth;
+
+ while (!Worklist.empty()) {
+ Value *Curr = Worklist.back();
+
+ if (isa<Constant>(Curr)) {
+ Worklist.pop_back();
+ continue;
+ }
+
+ // Otherwise, it must be an instruction.
+ auto *I = cast<Instruction>(Curr);
+
+ auto &Info = InstInfoMap[I];
+
+ SmallVector<Value *, 2> Operands;
+ getRelevantOperands(I, Operands);
+
+ if (!Stack.empty() && Stack.back() == I) {
+ // Already handled all instruction operands, can remove it from both the
+ // Worklist and the Stack, and update MinBitWidth.
+ Worklist.pop_back();
+ Stack.pop_back();
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand))
+ Info.MinBitWidth =
+ std::max(Info.MinBitWidth, InstInfoMap[IOp].MinBitWidth);
+ continue;
+ }
+
+ // Add the instruction to the stack before starting to handle its operands.
+ Stack.push_back(I);
+ unsigned ValidBitWidth = Info.ValidBitWidth;
+
+ // Update minimum bit-width before handling its operands. This is required
+ // when the instruction is part of a loop.
+ Info.MinBitWidth = std::max(Info.MinBitWidth, Info.ValidBitWidth);
+
+ for (auto *Operand : Operands)
+ if (auto *IOp = dyn_cast<Instruction>(Operand)) {
+ // If we already calculated the minimum bit-width for this valid
+ // bit-width, or for a smaller valid bit-width, then just keep the
+ // answer we already calculated.
+ unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
+ if (IOpBitwidth >= ValidBitWidth)
+ continue;
+ InstInfoMap[IOp].ValidBitWidth = ValidBitWidth;
+ Worklist.push_back(IOp);
+ }
+ }
+ unsigned MinBitWidth = InstInfoMap.lookup(cast<Instruction>(Src)).MinBitWidth;
+ assert(MinBitWidth >= TruncBitWidth);
+
+ if (MinBitWidth > TruncBitWidth) {
+ // In this case reducing expression with vector type might generate a new
+ // vector type, which is not preferable as it might result in generating
+ // sub-optimal code.
+ if (DstTy->isVectorTy())
+ return OrigBitWidth;
+ // Use the smallest integer type in the range [MinBitWidth, OrigBitWidth).
+ Type *Ty = DL.getSmallestLegalIntType(DstTy->getContext(), MinBitWidth);
+ // Update minimum bit-width with the new destination type bit-width if
+ // succeeded to find such, otherwise, with original bit-width.
+ MinBitWidth = Ty ? Ty->getScalarSizeInBits() : OrigBitWidth;
+ } else { // MinBitWidth == TruncBitWidth
+ // In this case the expression can be evaluated with the trunc instruction
+ // destination type, and trunc instruction can be omitted. However, we
+ // should not perform the evaluation if the original type is a legal scalar
+ // type and the target type is illegal.
+ bool FromLegal = MinBitWidth == 1 || DL.isLegalInteger(OrigBitWidth);
+ bool ToLegal = MinBitWidth == 1 || DL.isLegalInteger(MinBitWidth);
+ if (!DstTy->isVectorTy() && FromLegal && !ToLegal)
+ return OrigBitWidth;
+ }
+ return MinBitWidth;
+}
+
+Type *TruncInstCombine::getBestTruncatedType() {
+ if (!buildTruncExpressionDag())
+ return nullptr;
+
+ // We don't want to duplicate instructions, which isn't profitable. Thus, we
+ // can't shrink something that has multiple users, unless all users are
+ // post-dominated by the trunc instruction, i.e., were visited during the
+ // expression evaluation.
+ unsigned DesiredBitWidth = 0;
+ for (auto Itr : InstInfoMap) {
+ Instruction *I = Itr.first;
+ if (I->hasOneUse())
+ continue;
+ bool IsExtInst = (isa<ZExtInst>(I) || isa<SExtInst>(I));
+ for (auto *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ if (UI != CurrentTruncInst && !InstInfoMap.count(UI)) {
+ if (!IsExtInst)
+ return nullptr;
+ // If this is an extension from the dest type, we can eliminate it,
+ // even if it has multiple users. Thus, update the DesiredBitWidth and check
+ // that all extension instructions agree on the same DesiredBitWidth.
+ unsigned ExtInstBitWidth =
+ I->getOperand(0)->getType()->getScalarSizeInBits();
+ if (DesiredBitWidth && DesiredBitWidth != ExtInstBitWidth)
+ return nullptr;
+ DesiredBitWidth = ExtInstBitWidth;
+ }
+ }
+
+ unsigned OrigBitWidth =
+ CurrentTruncInst->getOperand(0)->getType()->getScalarSizeInBits();
+
+ // Calculate the minimum bit-width allowed for shrinking the currently
+ // visited truncate's operand.
+ unsigned MinBitWidth = getMinBitWidth();
+
+ // Check that we can shrink to a smaller bit-width than the original one and
+ // that it matches the DesiredBitWidth, if such exists.
+ if (MinBitWidth >= OrigBitWidth ||
+ (DesiredBitWidth && DesiredBitWidth != MinBitWidth))
+ return nullptr;
+
+ return IntegerType::get(CurrentTruncInst->getContext(), MinBitWidth);
+}
+
+/// Given a reduced scalar type \p Ty and a \p V value, return a reduced type
+/// for \p V, according to its type: if it is a vector type, return the vector
+/// version of \p Ty; otherwise, return \p Ty.
+static Type *getReducedType(Value *V, Type *Ty) {
+ assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
if (auto *VTy = dyn_cast<VectorType>(V->getType()))
return VectorType::get(Ty, VTy->getElementCount());
- return Ty;
-}
-
-Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
- Type *Ty = getReducedType(V, SclTy);
- if (auto *C = dyn_cast<Constant>(V)) {
- C = ConstantExpr::getIntegerCast(C, Ty, false);
- // If we got a constantexpr back, try to simplify it with DL info.
- return ConstantFoldConstant(C, DL, &TLI);
- }
-
- auto *I = cast<Instruction>(V);
- Info Entry = InstInfoMap.lookup(I);
- assert(Entry.NewValue);
- return Entry.NewValue;
-}
-
-void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
- NumInstrsReduced += InstInfoMap.size();
- for (auto &Itr : InstInfoMap) { // Forward
- Instruction *I = Itr.first;
- TruncInstCombine::Info &NodeInfo = Itr.second;
-
- assert(!NodeInfo.NewValue && "Instruction has been evaluated");
-
- IRBuilder<> Builder(I);
- Value *Res = nullptr;
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt: {
- Type *Ty = getReducedType(I, SclTy);
- // If the source type of the cast is the type we're trying for then we can
- // just return the source. There's no need to insert it because it is not
- // new.
- if (I->getOperand(0)->getType() == Ty) {
- assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
- NodeInfo.NewValue = I->getOperand(0);
- continue;
- }
- // Otherwise, must be the same type of cast, so just reinsert a new one.
- // This also handles the case of zext(trunc(x)) -> zext(x).
- Res = Builder.CreateIntCast(I->getOperand(0), Ty,
- Opc == Instruction::SExt);
-
- // Update Worklist entries with new value if needed.
- // There are three possible changes to the Worklist:
- // 1. Update Old-TruncInst -> New-TruncInst.
- // 2. Remove Old-TruncInst (if New node is not TruncInst).
- // 3. Add New-TruncInst (if Old node was not TruncInst).
+ return Ty;
+}
+
+Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
+ Type *Ty = getReducedType(V, SclTy);
+ if (auto *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, false);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ return ConstantFoldConstant(C, DL, &TLI);
+ }
+
+ auto *I = cast<Instruction>(V);
+ Info Entry = InstInfoMap.lookup(I);
+ assert(Entry.NewValue);
+ return Entry.NewValue;
+}
+
+void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+ NumInstrsReduced += InstInfoMap.size();
+ for (auto &Itr : InstInfoMap) { // Forward
+ Instruction *I = Itr.first;
+ TruncInstCombine::Info &NodeInfo = Itr.second;
+
+ assert(!NodeInfo.NewValue && "Instruction has been evaluated");
+
+ IRBuilder<> Builder(I);
+ Value *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt: {
+ Type *Ty = getReducedType(I, SclTy);
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty) {
+ assert(!isa<TruncInst>(I) && "Cannot reach here with TruncInst");
+ NodeInfo.NewValue = I->getOperand(0);
+ continue;
+ }
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = Builder.CreateIntCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+
+ // Update Worklist entries with new value if needed.
+ // There are three possible changes to the Worklist:
+ // 1. Update Old-TruncInst -> New-TruncInst.
+ // 2. Remove Old-TruncInst (if New node is not TruncInst).
+ // 3. Add New-TruncInst (if Old node was not TruncInst).
auto *Entry = find(Worklist, I);
- if (Entry != Worklist.end()) {
- if (auto *NewCI = dyn_cast<TruncInst>(Res))
- *Entry = NewCI;
- else
- Worklist.erase(Entry);
- } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
- Worklist.push_back(NewCI);
- break;
- }
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
- Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
- Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
- break;
- }
- case Instruction::Select: {
- Value *Op0 = I->getOperand(0);
- Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
- Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
- Res = Builder.CreateSelect(Op0, LHS, RHS);
- break;
- }
- default:
- llvm_unreachable("Unhandled instruction");
- }
-
- NodeInfo.NewValue = Res;
- if (auto *ResI = dyn_cast<Instruction>(Res))
- ResI->takeName(I);
- }
-
- Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
- Type *DstTy = CurrentTruncInst->getType();
- if (Res->getType() != DstTy) {
- IRBuilder<> Builder(CurrentTruncInst);
- Res = Builder.CreateIntCast(Res, DstTy, false);
- if (auto *ResI = dyn_cast<Instruction>(Res))
- ResI->takeName(CurrentTruncInst);
- }
- CurrentTruncInst->replaceAllUsesWith(Res);
-
- // Erase old expression dag, which was replaced by the reduced expression dag.
- // We iterate backward, which means we visit the instruction before we visit
- // any of its operands; this way, when we get to an operand, we have already
- // removed the instructions (from the expression dag) that use it.
- CurrentTruncInst->eraseFromParent();
- for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
- // We still need to check that the instruction has no users before we erase
- // it, because a {SExt, ZExt}Inst instruction might have other users that were
- // not reduced; in that case, we need to keep that instruction.
- if (I->first->use_empty())
- I->first->eraseFromParent();
- }
-}
-
-bool TruncInstCombine::run(Function &F) {
- bool MadeIRChange = false;
-
- // Collect all TruncInst in the function into the Worklist for evaluating.
- for (auto &BB : F) {
- // Ignore unreachable basic block.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- for (auto &I : BB)
- if (auto *CI = dyn_cast<TruncInst>(&I))
- Worklist.push_back(CI);
- }
-
- // Process all TruncInst in the Worklist, for each instruction:
- // 1. Check if it dominates an eligible expression dag to be reduced.
- // 2. Create a reduced expression dag and replace the old one with it.
- while (!Worklist.empty()) {
- CurrentTruncInst = Worklist.pop_back_val();
-
- if (Type *NewDstSclTy = getBestTruncatedType()) {
- LLVM_DEBUG(
- dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
- "dominated by: "
- << CurrentTruncInst << '\n');
- ReduceExpressionDag(NewDstSclTy);
- ++NumDAGsReduced;
- MadeIRChange = true;
- }
- }
-
- return MadeIRChange;
-}
+ if (Entry != Worklist.end()) {
+ if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ *Entry = NewCI;
+ else
+ Worklist.erase(Entry);
+ } else if (auto *NewCI = dyn_cast<TruncInst>(Res))
+ Worklist.push_back(NewCI);
+ break;
+ }
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Value *LHS = getReducedOperand(I->getOperand(0), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(1), SclTy);
+ Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Op0 = I->getOperand(0);
+ Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
+ Res = Builder.CreateSelect(Op0, LHS, RHS);
+ break;
+ }
+ default:
+ llvm_unreachable("Unhandled instruction");
+ }
+
+ NodeInfo.NewValue = Res;
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(I);
+ }
+
+ Value *Res = getReducedOperand(CurrentTruncInst->getOperand(0), SclTy);
+ Type *DstTy = CurrentTruncInst->getType();
+ if (Res->getType() != DstTy) {
+ IRBuilder<> Builder(CurrentTruncInst);
+ Res = Builder.CreateIntCast(Res, DstTy, false);
+ if (auto *ResI = dyn_cast<Instruction>(Res))
+ ResI->takeName(CurrentTruncInst);
+ }
+ CurrentTruncInst->replaceAllUsesWith(Res);
+
+ // Erase old expression dag, which was replaced by the reduced expression dag.
+ // We iterate backward, which means we visit the instruction before we visit
+ // any of its operands; this way, when we get to an operand, we have already
+ // removed the instructions (from the expression dag) that use it.
+ CurrentTruncInst->eraseFromParent();
+ for (auto I = InstInfoMap.rbegin(), E = InstInfoMap.rend(); I != E; ++I) {
+ // We still need to check that the instruction has no users before we erase
+ // it, because a {SExt, ZExt}Inst instruction might have other users that were
+ // not reduced; in that case, we need to keep that instruction.
+ if (I->first->use_empty())
+ I->first->eraseFromParent();
+ }
+}
+
+bool TruncInstCombine::run(Function &F) {
+ bool MadeIRChange = false;
+
+ // Collect all TruncInst in the function into the Worklist for evaluating.
+ for (auto &BB : F) {
+ // Ignore unreachable basic block.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ for (auto &I : BB)
+ if (auto *CI = dyn_cast<TruncInst>(&I))
+ Worklist.push_back(CI);
+ }
+
+ // Process all TruncInst in the Worklist, for each instruction:
+ // 1. Check if it dominates an eligible expression dag to be reduced.
+ // 2. Create a reduced expression dag and replace the old one with it.
+ while (!Worklist.empty()) {
+ CurrentTruncInst = Worklist.pop_back_val();
+
+ if (Type *NewDstSclTy = getBestTruncatedType()) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: TruncInstCombine reducing type of expression dag "
+ "dominated by: "
+ << CurrentTruncInst << '\n');
+ ReduceExpressionDag(NewDstSclTy);
+ ++NumDAGsReduced;
+ MadeIRChange = true;
+ }
+ }
+
+ return MadeIRChange;
+}
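
The two traversals above (buildTruncExpressionDag and getMinBitWidth) share the same iterative post-order idiom: a value stays on the Worklist until it reappears at the top of the Stack, at which point all of its operands have been finished. The stripped-down sketch below illustrates that idiom using a hypothetical Node type and postOrderVisit helper rather than LLVM's Instruction; it is an editorial aid, not code from this patch.

#include <unordered_set>
#include <vector>

struct Node {
  std::vector<Node *> Operands;
};

// Visit every node of a DAG in post-order (operands before users), mirroring
// the Worklist/Stack pattern used by TruncInstCombine. The Done set plays the
// role InstInfoMap plays in the pass: it stops shared operands from being
// visited twice.
template <typename Visitor>
void postOrderVisit(Node *Root, Visitor Visit) {
  std::vector<Node *> Worklist{Root};
  std::vector<Node *> Stack;
  std::unordered_set<Node *> Done;
  while (!Worklist.empty()) {
    Node *Curr = Worklist.back();
    if (!Stack.empty() && Stack.back() == Curr) {
      // All operands handled: pop from both containers and record the node.
      Worklist.pop_back();
      Stack.pop_back();
      Done.insert(Curr);
      Visit(Curr);
      continue;
    }
    if (Done.count(Curr)) {
      // Shared operand already finished on another path through the dag.
      Worklist.pop_back();
      continue;
    }
    // First visit: leave it on the Worklist, remember it on the Stack, and
    // queue its operands so they are processed first.
    Stack.push_back(Curr);
    for (Node *Op : Curr->Operands)
      Worklist.push_back(Op);
  }
}
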
diff --git a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
index fced984059..c472a2054a 100644
--- a/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine/ya.make
@@ -1,36 +1,36 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AggressiveInstCombine.cpp
- TruncInstCombine.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AggressiveInstCombine.cpp
+ TruncInstCombine.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp b/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
index b102c7d2ce..96c083a144 100644
--- a/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/CFGuard/CFGuard.cpp
@@ -1,300 +1,300 @@
-//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the IR transform to add Microsoft's Control Flow Guard
-/// checks on Windows targets.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/CFGuard.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-using OperandBundleDef = OperandBundleDefT<Value *>;
-
-#define DEBUG_TYPE "cfguard"
-
-STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
-
-namespace {
-
-/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
-/// These checks ensure that the target address corresponds to the start of an
-/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86,
-/// ARM, and AArch64 targets use the CF_Check mechanism.
-class CFGuard : public FunctionPass {
-public:
- static char ID;
-
- enum Mechanism { CF_Check, CF_Dispatch };
-
- // Default constructor required for the INITIALIZE_PASS macro.
- CFGuard() : FunctionPass(ID) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- // By default, use the guard check mechanism.
- GuardMechanism = CF_Check;
- }
-
- // Recommended constructor used to specify the type of guard mechanism.
- CFGuard(Mechanism Var) : FunctionPass(ID) {
- initializeCFGuardPass(*PassRegistry::getPassRegistry());
- GuardMechanism = Var;
- }
-
- /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
- /// check mechanism. When the image is loaded, the loader puts the appropriate
- /// guard check function pointer in the __guard_check_icall_fptr global
- /// symbol. This checks that the target address is a valid address-taken
- /// function. The address of the target function is passed to the guard check
- /// function in an architecture-specific register (e.g. ECX on 32-bit X86,
-/// X15 on AArch64, and R0 on ARM). The guard check function has no return
-/// value (if the target is invalid, the guard check function will raise an
- /// error).
- ///
- /// For example, the following LLVM IR:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = call i32 %0()
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr
- /// %2 = bitcast i32 ()* %0 to i8*
- /// call cfguard_checkcc void %1(i8* %2)
- /// %3 = call i32 %0()
- /// \endcode
- ///
- /// For example, the following X86 assembly code:
- /// \code
- /// movl $_target_func, %eax
- /// calll *%eax
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// movl $_target_func, %ecx
- /// calll *___guard_check_icall_fptr
- /// calll *%ecx
- /// \endcode
- ///
- /// \param CB indirect call to instrument.
- void insertCFGuardCheck(CallBase *CB);
-
- /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
- /// dispatch mechanism. When the image is loaded, the loader puts the
- /// appropriate guard check function pointer in the
- /// __guard_dispatch_icall_fptr global symbol. This checks that the target
- /// address is a valid address-taken function and, if so, tail calls the
- /// target. The target address is passed in an architecture-specific register
- /// (e.g. RAX on X86_64), with all other arguments for the target function
- /// passed as usual.
- ///
- /// For example, the following LLVM IR:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = call i32 %0()
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// %func_ptr = alloca i32 ()*, align 8
- /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
- /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
- /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr
- /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ]
- /// \endcode
- ///
- /// For example, the following X86_64 assembly code:
- /// \code
- /// leaq target_func(%rip), %rax
- /// callq *%rax
- /// \endcode
- ///
- /// is transformed to:
- /// \code
- /// leaq target_func(%rip), %rax
- /// callq *__guard_dispatch_icall_fptr(%rip)
- /// \endcode
- ///
- /// \param CB indirect call to instrument.
- void insertCFGuardDispatch(CallBase *CB);
-
- bool doInitialization(Module &M) override;
- bool runOnFunction(Function &F) override;
-
-private:
- // Only add checks if the module has the cfguard=2 flag.
- int cfguard_module_flag = 0;
- Mechanism GuardMechanism = CF_Check;
- FunctionType *GuardFnType = nullptr;
- PointerType *GuardFnPtrType = nullptr;
- Constant *GuardFnGlobal = nullptr;
-};
-
-} // end anonymous namespace
-
-void CFGuard::insertCFGuardCheck(CallBase *CB) {
-
- assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
- "Only applicable for Windows targets");
- assert(CB->isIndirectCall() &&
- "Control Flow Guard checks can only be added to indirect calls");
-
- IRBuilder<> B(CB);
- Value *CalledOperand = CB->getCalledOperand();
-
- // Load the global symbol as a pointer to the check function.
- LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal);
-
- // Create new call instruction. The CFGuard check should always be a call,
- // even if the original CallBase is an Invoke or CallBr instruction.
- CallInst *GuardCheck =
- B.CreateCall(GuardFnType, GuardCheckLoad,
- {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())});
-
- // Ensure that the first argument is passed in the correct register
- // (e.g. ECX on 32-bit X86 targets).
- GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
-}
-
-void CFGuard::insertCFGuardDispatch(CallBase *CB) {
-
- assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
- "Only applicable for Windows targets");
- assert(CB->isIndirectCall() &&
- "Control Flow Guard checks can only be added to indirect calls");
-
- IRBuilder<> B(CB);
- Value *CalledOperand = CB->getCalledOperand();
- Type *CalledOperandType = CalledOperand->getType();
-
- // Cast the guard dispatch global to the type of the called operand.
- PointerType *PTy = PointerType::get(CalledOperandType, 0);
- if (GuardFnGlobal->getType() != PTy)
- GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy);
-
- // Load the global as a pointer to a function of the same type.
- LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal);
-
- // Add the original call target as a cfguardtarget operand bundle.
- SmallVector<llvm::OperandBundleDef, 1> Bundles;
- CB->getOperandBundlesAsDefs(Bundles);
- Bundles.emplace_back("cfguardtarget", CalledOperand);
-
- // Create a copy of the call/invoke instruction and add the new bundle.
- assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
- "Unknown indirect call type");
- CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
-
- // Change the target of the call to be the guard dispatch function.
- NewCB->setCalledOperand(GuardDispatchLoad);
-
- // Replace the original call/invoke with the new instruction.
- CB->replaceAllUsesWith(NewCB);
-
- // Delete the original call/invoke.
- CB->eraseFromParent();
-}
-
-bool CFGuard::doInitialization(Module &M) {
-
- // Check if this module has the cfguard flag and read its value.
- if (auto *MD =
- mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
- cfguard_module_flag = MD->getZExtValue();
-
- // Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
- return false;
-
- // Set up prototypes for the guard check and dispatch functions.
- GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()),
- {Type::getInt8PtrTy(M.getContext())}, false);
- GuardFnPtrType = PointerType::get(GuardFnType, 0);
-
- // Get or insert the guard check or dispatch global symbols.
- if (GuardMechanism == CF_Check) {
- GuardFnGlobal =
- M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType);
- } else {
- assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism");
- GuardFnGlobal =
- M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType);
- }
-
- return true;
-}
-
-bool CFGuard::runOnFunction(Function &F) {
-
- // Skip modules for which CFGuard checks have been disabled.
- if (cfguard_module_flag != 2)
- return false;
-
- SmallVector<CallBase *, 8> IndirectCalls;
-
- // Iterate over the instructions to find all indirect call/invoke/callbr
- // instructions. Make a separate list of pointers to indirect
- // call/invoke/callbr instructions because the original instructions will be
- // deleted as the checks are added.
- for (BasicBlock &BB : F.getBasicBlockList()) {
- for (Instruction &I : BB.getInstList()) {
- auto *CB = dyn_cast<CallBase>(&I);
- if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) {
- IndirectCalls.push_back(CB);
- CFGuardCounter++;
- }
- }
- }
-
- // If no checks are needed, return early.
- if (IndirectCalls.empty()) {
- return false;
- }
-
- // For each indirect call/invoke, add the appropriate dispatch or check.
- if (GuardMechanism == CF_Dispatch) {
- for (CallBase *CB : IndirectCalls) {
- insertCFGuardDispatch(CB);
- }
- } else {
- for (CallBase *CB : IndirectCalls) {
- insertCFGuardCheck(CB);
- }
- }
-
- return true;
-}
-
-char CFGuard::ID = 0;
-INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false)
-
-FunctionPass *llvm::createCFGuardCheckPass() {
- return new CFGuard(CFGuard::CF_Check);
-}
-
-FunctionPass *llvm::createCFGuardDispatchPass() {
- return new CFGuard(CFGuard::CF_Dispatch);
-}
+//===-- CFGuard.cpp - Control Flow Guard checks -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the IR transform to add Microsoft's Control Flow Guard
+/// checks on Windows targets.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/CFGuard.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+using OperandBundleDef = OperandBundleDefT<Value *>;
+
+#define DEBUG_TYPE "cfguard"
+
+STATISTIC(CFGuardCounter, "Number of Control Flow Guard checks added");
+
+namespace {
+
+/// Adds Control Flow Guard (CFG) checks on indirect function calls/invokes.
+/// These checks ensure that the target address corresponds to the start of an
+/// address-taken function. X86_64 targets use the CF_Dispatch mechanism. X86,
+/// ARM, and AArch64 targets use the CF_Check mechanism.
+class CFGuard : public FunctionPass {
+public:
+ static char ID;
+
+ enum Mechanism { CF_Check, CF_Dispatch };
+
+ // Default constructor required for the INITIALIZE_PASS macro.
+ CFGuard() : FunctionPass(ID) {
+ initializeCFGuardPass(*PassRegistry::getPassRegistry());
+ // By default, use the guard check mechanism.
+ GuardMechanism = CF_Check;
+ }
+
+ // Recommended constructor used to specify the type of guard mechanism.
+ CFGuard(Mechanism Var) : FunctionPass(ID) {
+ initializeCFGuardPass(*PassRegistry::getPassRegistry());
+ GuardMechanism = Var;
+ }
+
+ /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
+ /// check mechanism. When the image is loaded, the loader puts the appropriate
+ /// guard check function pointer in the __guard_check_icall_fptr global
+ /// symbol. This checks that the target address is a valid address-taken
+ /// function. The address of the target function is passed to the guard check
+ /// function in an architecture-specific register (e.g. ECX on 32-bit X86,
+/// X15 on AArch64, and R0 on ARM). The guard check function has no return
+/// value (if the target is invalid, the guard check function will raise an
+ /// error).
+ ///
+ /// For example, the following LLVM IR:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = call i32 %0()
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = load void (i8*)*, void (i8*)** @__guard_check_icall_fptr
+ /// %2 = bitcast i32 ()* %0 to i8*
+ /// call cfguard_checkcc void %1(i8* %2)
+ /// %3 = call i32 %0()
+ /// \endcode
+ ///
+ /// For example, the following X86 assembly code:
+ /// \code
+ /// movl $_target_func, %eax
+ /// calll *%eax
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// movl $_target_func, %ecx
+ /// calll *___guard_check_icall_fptr
+ /// calll *%ecx
+ /// \endcode
+ ///
+ /// \param CB indirect call to instrument.
+ void insertCFGuardCheck(CallBase *CB);
+
+ /// Inserts a Control Flow Guard (CFG) check on an indirect call using the CFG
+ /// dispatch mechanism. When the image is loaded, the loader puts the
+ /// appropriate guard check function pointer in the
+ /// __guard_dispatch_icall_fptr global symbol. This checks that the target
+ /// address is a valid address-taken function and, if so, tail calls the
+ /// target. The target address is passed in an architecture-specific register
+ /// (e.g. RAX on X86_64), with all other arguments for the target function
+ /// passed as usual.
+ ///
+ /// For example, the following LLVM IR:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = call i32 %0()
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// %func_ptr = alloca i32 ()*, align 8
+ /// store i32 ()* @target_func, i32 ()** %func_ptr, align 8
+ /// %0 = load i32 ()*, i32 ()** %func_ptr, align 8
+ /// %1 = load i32 ()*, i32 ()** @__guard_dispatch_icall_fptr
+ /// %2 = call i32 %1() [ "cfguardtarget"(i32 ()* %0) ]
+ /// \endcode
+ ///
+ /// For example, the following X86_64 assembly code:
+ /// \code
+ /// leaq target_func(%rip), %rax
+ /// callq *%rax
+ /// \endcode
+ ///
+ /// is transformed to:
+ /// \code
+ /// leaq target_func(%rip), %rax
+ /// callq *__guard_dispatch_icall_fptr(%rip)
+ /// \endcode
+ ///
+ /// \param CB indirect call to instrument.
+ void insertCFGuardDispatch(CallBase *CB);
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+private:
+ // Only add checks if the module has the cfguard=2 flag.
+ int cfguard_module_flag = 0;
+ Mechanism GuardMechanism = CF_Check;
+ FunctionType *GuardFnType = nullptr;
+ PointerType *GuardFnPtrType = nullptr;
+ Constant *GuardFnGlobal = nullptr;
+};
+
+} // end anonymous namespace
+
+void CFGuard::insertCFGuardCheck(CallBase *CB) {
+
+ assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
+ "Only applicable for Windows targets");
+ assert(CB->isIndirectCall() &&
+ "Control Flow Guard checks can only be added to indirect calls");
+
+ IRBuilder<> B(CB);
+ Value *CalledOperand = CB->getCalledOperand();
+
+ // Load the global symbol as a pointer to the check function.
+ LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFnGlobal);
+
+ // Create new call instruction. The CFGuard check should always be a call,
+ // even if the original CallBase is an Invoke or CallBr instruction.
+ CallInst *GuardCheck =
+ B.CreateCall(GuardFnType, GuardCheckLoad,
+ {B.CreateBitCast(CalledOperand, B.getInt8PtrTy())});
+
+ // Ensure that the first argument is passed in the correct register
+ // (e.g. ECX on 32-bit X86 targets).
+ GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
+}
+
+void CFGuard::insertCFGuardDispatch(CallBase *CB) {
+
+ assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
+ "Only applicable for Windows targets");
+ assert(CB->isIndirectCall() &&
+ "Control Flow Guard checks can only be added to indirect calls");
+
+ IRBuilder<> B(CB);
+ Value *CalledOperand = CB->getCalledOperand();
+ Type *CalledOperandType = CalledOperand->getType();
+
+ // Cast the guard dispatch global to the type of the called operand.
+ PointerType *PTy = PointerType::get(CalledOperandType, 0);
+ if (GuardFnGlobal->getType() != PTy)
+ GuardFnGlobal = ConstantExpr::getBitCast(GuardFnGlobal, PTy);
+
+ // Load the global as a pointer to a function of the same type.
+ LoadInst *GuardDispatchLoad = B.CreateLoad(CalledOperandType, GuardFnGlobal);
+
+ // Add the original call target as a cfguardtarget operand bundle.
+ SmallVector<llvm::OperandBundleDef, 1> Bundles;
+ CB->getOperandBundlesAsDefs(Bundles);
+ Bundles.emplace_back("cfguardtarget", CalledOperand);
+
+ // Create a copy of the call/invoke instruction and add the new bundle.
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
+
+ // Change the target of the call to be the guard dispatch function.
+ NewCB->setCalledOperand(GuardDispatchLoad);
+
+ // Replace the original call/invoke with the new instruction.
+ CB->replaceAllUsesWith(NewCB);
+
+ // Delete the original call/invoke.
+ CB->eraseFromParent();
+}
+
+bool CFGuard::doInitialization(Module &M) {
+
+ // Check if this module has the cfguard flag and read its value.
+ if (auto *MD =
+ mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
+ cfguard_module_flag = MD->getZExtValue();
+
+ // Skip modules for which CFGuard checks have been disabled.
+ if (cfguard_module_flag != 2)
+ return false;
+
+ // Set up prototypes for the guard check and dispatch functions.
+ GuardFnType = FunctionType::get(Type::getVoidTy(M.getContext()),
+ {Type::getInt8PtrTy(M.getContext())}, false);
+ GuardFnPtrType = PointerType::get(GuardFnType, 0);
+
+ // Get or insert the guard check or dispatch global symbols.
+ if (GuardMechanism == CF_Check) {
+ GuardFnGlobal =
+ M.getOrInsertGlobal("__guard_check_icall_fptr", GuardFnPtrType);
+ } else {
+ assert(GuardMechanism == CF_Dispatch && "Invalid CFGuard mechanism");
+ GuardFnGlobal =
+ M.getOrInsertGlobal("__guard_dispatch_icall_fptr", GuardFnPtrType);
+ }
+
+ return true;
+}
+
+bool CFGuard::runOnFunction(Function &F) {
+
+ // Skip modules for which CFGuard checks have been disabled.
+ if (cfguard_module_flag != 2)
+ return false;
+
+ SmallVector<CallBase *, 8> IndirectCalls;
+
+ // Iterate over the instructions to find all indirect call/invoke/callbr
+ // instructions. Make a separate list of pointers to indirect
+ // call/invoke/callbr instructions because the original instructions will be
+ // deleted as the checks are added.
+ for (BasicBlock &BB : F.getBasicBlockList()) {
+ for (Instruction &I : BB.getInstList()) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (CB && CB->isIndirectCall() && !CB->hasFnAttr("guard_nocf")) {
+ IndirectCalls.push_back(CB);
+ CFGuardCounter++;
+ }
+ }
+ }
+
+ // If no checks are needed, return early.
+ if (IndirectCalls.empty()) {
+ return false;
+ }
+
+ // For each indirect call/invoke, add the appropriate dispatch or check.
+ if (GuardMechanism == CF_Dispatch) {
+ for (CallBase *CB : IndirectCalls) {
+ insertCFGuardDispatch(CB);
+ }
+ } else {
+ for (CallBase *CB : IndirectCalls) {
+ insertCFGuardCheck(CB);
+ }
+ }
+
+ return true;
+}
+
+char CFGuard::ID = 0;
+INITIALIZE_PASS(CFGuard, "CFGuard", "CFGuard", false, false)
+
+FunctionPass *llvm::createCFGuardCheckPass() {
+ return new CFGuard(CFGuard::CF_Check);
+}
+
+FunctionPass *llvm::createCFGuardDispatchPass() {
+ return new CFGuard(CFGuard::CF_Dispatch);
+}
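
For context only (not part of the patch above), here is a minimal C++ sketch of how this pass might be driven outside a frontend: CFGuard::doInitialization only activates when the module's "cfguard" flag equals 2, and the two factory functions above select the check or dispatch mechanism. The driver function name, the Warning merge behavior for the flag, and the assumption of a Windows target triple are all illustrative choices, not taken from this diff.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/CFGuard.h"

using namespace llvm;

// Hypothetical driver: mark a Windows-targeted module as CFGuard-enabled and
// run the dispatch flavour of the pass over its functions.
static void runCFGuardDispatch(Module &M) {
  // CFGuard::doInitialization skips modules whose "cfguard" flag is not 2.
  // The Warning merge behavior here is an assumption for illustration.
  if (!M.getModuleFlag("cfguard"))
    M.addModuleFlag(Module::Warning, "cfguard", 2);

  legacy::FunctionPassManager FPM(&M);
  FPM.add(createCFGuardDispatchPass()); // or createCFGuardCheckPass()
  FPM.doInitialization();               // reads the module flag once
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F); // rewrites indirect calls through the guard dispatch pointer
  FPM.doFinalization();
}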
diff --git a/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make b/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
index fa6d03488d..37fe9ccc94 100644
--- a/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/CFGuard/ya.make
@@ -1,33 +1,33 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/CFGuard
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- CFGuard.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ CFGuard.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
index c7bb0803e3..532599b42e 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -1,51 +1,51 @@
-//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a custom inliner that handles only functions that
-// are marked as "always inline".
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/ADT/SetVector.h"
+//===- InlineAlways.cpp - Code to inline always_inline functions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a custom inliner that handles only functions that
+// are marked as "always inline".
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-PreservedAnalyses AlwaysInlinerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- // Add inline assumptions during code generation.
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+PreservedAnalyses AlwaysInlinerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ // Add inline assumptions during code generation.
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
auto &PSI = MAM.getResult<ProfileSummaryAnalysis>(M);
-
- SmallSetVector<CallBase *, 16> Calls;
- bool Changed = false;
- SmallVector<Function *, 16> InlinedFunctions;
+
+ SmallSetVector<CallBase *, 16> Calls;
+ bool Changed = false;
+ SmallVector<Function *, 16> InlinedFunctions;
for (Function &F : M) {
// When callee coroutine function is inlined into caller coroutine function
// before coro-split pass,
@@ -54,15 +54,15 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
if (F.isPresplitCoroutine())
continue;
- if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess()) {
- Calls.clear();
-
- for (User *U : F.users())
- if (auto *CB = dyn_cast<CallBase>(U))
- if (CB->getCalledFunction() == &F)
- Calls.insert(CB);
-
+ if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F).isSuccess()) {
+ Calls.clear();
+
+ for (User *U : F.users())
+ if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->getCalledFunction() == &F)
+ Calls.insert(CB);
+
for (CallBase *CB : Calls) {
Function *Caller = CB->getCaller();
OptimizationRemarkEmitter ORE(Caller);
@@ -75,7 +75,7 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
assert(OIC);
emitInlinedInto(ORE, CB->getDebugLoc(), CB->getParent(), F, *Caller,
*OIC, false, DEBUG_TYPE);
-
+
InlineFunctionInfo IFI(
/*cg=*/nullptr, GetAssumptionCache, &PSI,
&FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
@@ -92,104 +92,104 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
Changed = true;
}
- // Remember to try and delete this function afterward. This both avoids
- // re-walking the rest of the module and avoids dealing with any iterator
- // invalidation issues while deleting functions.
- InlinedFunctions.push_back(&F);
- }
+ // Remember to try and delete this function afterward. This both avoids
+ // re-walking the rest of the module and avoids dealing with any iterator
+ // invalidation issues while deleting functions.
+ InlinedFunctions.push_back(&F);
+ }
+ }
+
+ // Remove any live functions.
+ erase_if(InlinedFunctions, [&](Function *F) {
+ F->removeDeadConstantUsers();
+ return !F->isDefTriviallyDead();
+ });
+
+ // Delete the non-comdat ones from the module and also from our vector.
+ auto NonComdatBegin = partition(
+ InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
+ for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
+ M.getFunctionList().erase(F);
+ InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
+
+ if (!InlinedFunctions.empty()) {
+ // Now we just have the comdat functions. Filter out the ones whose comdats
+ // are not actually dead.
+ filterDeadComdatFunctions(M, InlinedFunctions);
+ // The remaining functions are actually dead.
+ for (Function *F : InlinedFunctions)
+ M.getFunctionList().erase(F);
+ }
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+namespace {
+
+/// Inliner pass which only handles "always inline" functions.
+///
+/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight \c Inliner
+/// base class to provide several facilities such as array alloca merging.
+class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
+
+public:
+ AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ AlwaysInlinerLegacyPass(bool InsertLifetime)
+ : LegacyInlinerBase(ID, InsertLifetime) {
+ initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Main run interface method. We override here to avoid calling skipSCC().
+ bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallBase &CB) override;
+
+ using llvm::Pass::doFinalization;
+ bool doFinalization(CallGraph &CG) override {
+ return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
}
-
- // Remove any live functions.
- erase_if(InlinedFunctions, [&](Function *F) {
- F->removeDeadConstantUsers();
- return !F->isDefTriviallyDead();
- });
-
- // Delete the non-comdat ones from the module and also from our vector.
- auto NonComdatBegin = partition(
- InlinedFunctions, [&](Function *F) { return F->hasComdat(); });
- for (Function *F : make_range(NonComdatBegin, InlinedFunctions.end()))
- M.getFunctionList().erase(F);
- InlinedFunctions.erase(NonComdatBegin, InlinedFunctions.end());
-
- if (!InlinedFunctions.empty()) {
- // Now we just have the comdat functions. Filter out the ones whose comdats
- // are not actually dead.
- filterDeadComdatFunctions(M, InlinedFunctions);
- // The remaining functions are actually dead.
- for (Function *F : InlinedFunctions)
- M.getFunctionList().erase(F);
- }
-
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-namespace {
-
-/// Inliner pass which only handles "always inline" functions.
-///
-/// Unlike the \c AlwaysInlinerPass, this uses the more heavyweight \c Inliner
-/// base class to provide several facilities such as array alloca merging.
-class AlwaysInlinerLegacyPass : public LegacyInlinerBase {
-
-public:
- AlwaysInlinerLegacyPass() : LegacyInlinerBase(ID, /*InsertLifetime*/ true) {
- initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- AlwaysInlinerLegacyPass(bool InsertLifetime)
- : LegacyInlinerBase(ID, InsertLifetime) {
- initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- /// Main run interface method. We override here to avoid calling skipSCC().
- bool runOnSCC(CallGraphSCC &SCC) override { return inlineCalls(SCC); }
-
- static char ID; // Pass identification, replacement for typeid
-
- InlineCost getInlineCost(CallBase &CB) override;
-
- using llvm::Pass::doFinalization;
- bool doFinalization(CallGraph &CG) override {
- return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
- }
-};
-}
-
-char AlwaysInlinerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
- "Inliner for always_inline functions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
- "Inliner for always_inline functions", false, false)
-
-Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
- return new AlwaysInlinerLegacyPass(InsertLifetime);
-}
-
-/// Get the inline cost for the always-inliner.
-///
-/// The always inliner *only* handles functions which are marked with the
-/// attribute to force inlining. As such, it is dramatically simpler and avoids
-/// using the powerful (but expensive) inline cost analysis. Instead it uses
-/// a very simple and boring direct walk of the instructions looking for
-/// impossible-to-inline constructs.
-///
-/// Note, it would be possible to go to some lengths to cache the information
-/// computed here, but as we only expect to do this for relatively few and
-/// small functions which have the explicit attribute to force inlining, it is
-/// likely not worth it in practice.
-InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
- Function *Callee = CB.getCalledFunction();
-
- // Only inline direct calls to functions with always-inline attributes
- // that are viable for inlining.
- if (!Callee)
- return InlineCost::getNever("indirect call");
-
+};
+}
+
+char AlwaysInlinerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AlwaysInlinerLegacyPass, "always-inline",
+ "Inliner for always_inline functions", false, false)
+
+Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
+ return new AlwaysInlinerLegacyPass(InsertLifetime);
+}
+
+/// Get the inline cost for the always-inliner.
+///
+/// The always inliner *only* handles functions which are marked with the
+/// attribute to force inlining. As such, it is dramatically simpler and avoids
+/// using the powerful (but expensive) inline cost analysis. Instead it uses
+/// a very simple and boring direct walk of the instructions looking for
+/// impossible-to-inline constructs.
+///
+/// Note, it would be possible to go to some lengths to cache the information
+/// computed here, but as we only expect to do this for relatively few and
+/// small functions which have the explicit attribute to force inlining, it is
+/// likely not worth it in practice.
+InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
+
+ // Only inline direct calls to functions with always-inline attributes
+ // that are viable for inlining.
+ if (!Callee)
+ return InlineCost::getNever("indirect call");
+
// When callee coroutine function is inlined into caller coroutine function
// before coro-split pass,
// coro-early pass can not handle this quiet well.
@@ -197,16 +197,16 @@ InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
if (Callee->isPresplitCoroutine())
return InlineCost::getNever("unsplited coroutine call");
- // FIXME: We shouldn't even get here for declarations.
- if (Callee->isDeclaration())
- return InlineCost::getNever("no definition");
-
- if (!CB.hasFnAttr(Attribute::AlwaysInline))
- return InlineCost::getNever("no alwaysinline attribute");
-
- auto IsViable = isInlineViable(*Callee);
- if (!IsViable.isSuccess())
- return InlineCost::getNever(IsViable.getFailureReason());
-
- return InlineCost::getAlways("always inliner");
-}
+ // FIXME: We shouldn't even get here for declarations.
+ if (Callee->isDeclaration())
+ return InlineCost::getNever("no definition");
+
+ if (!CB.hasFnAttr(Attribute::AlwaysInline))
+ return InlineCost::getNever("no alwaysinline attribute");
+
+ auto IsViable = isInlineViable(*Callee);
+ if (!IsViable.isSuccess())
+ return InlineCost::getNever(IsViable.getFailureReason());
+
+ return InlineCost::getAlways("always inliner");
+}
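
As a usage note (a sketch, not part of the diff): AlwaysInlinerPass::run above pulls its assumption-cache, profile-summary, and block-frequency results from the module analysis manager, so a standalone driver has to register the standard analyses before invoking it. The helper name below is hypothetical; the InsertLifetime flag matches the constructor's default.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"

using namespace llvm;

// Hypothetical helper: inline every call to an always_inline function in M.
static void inlineAlwaysInlineFunctions(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the analyses AlwaysInlinerPass::run queries (ProfileSummary,
  // AssumptionAnalysis, BlockFrequency via the function-analysis proxy).
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(AlwaysInlinerPass(/*InsertLifetime=*/true));
  MPM.run(M, MAM);
}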
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
index 47fea8047d..7998a1ae5c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -1,1175 +1,1175 @@
-//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass promotes "by reference" arguments to be "by value" arguments. In
-// practice, this means looking for internal functions that have pointer
-// arguments. If it can prove, through the use of alias analysis, that an
-// argument is *only* loaded, then it can pass the value into the function
-// instead of the address of the value. This can cause recursive simplification
-// of code and lead to the elimination of allocas (especially in C++ template
-// code like the STL).
-//
-// This pass also handles aggregate arguments that are passed into a function,
-// scalarizing them if the elements of the aggregate are only loaded. Note that
-// by default it refuses to scalarize aggregates which would require passing in
-// more than three operands to the function, because passing thousands of
-// operands for a large array or structure is unprofitable! This limit can be
-// configured or disabled, however.
-//
-// Note that this transformation could also be done for arguments that are only
-// stored to (returning the value instead), but does not currently. This case
-// would be best handled when and if LLVM begins supporting multiple return
-// values from functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ArgumentPromotion.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+//===- ArgumentPromotion.cpp - Promote by-reference arguments -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass promotes "by reference" arguments to be "by value" arguments. In
+// practice, this means looking for internal functions that have pointer
+// arguments. If it can prove, through the use of alias analysis, that an
+// argument is *only* loaded, then it can pass the value into the function
+// instead of the address of the value. This can cause recursive simplification
+// of code and lead to the elimination of allocas (especially in C++ template
+// code like the STL).
+//
+// This pass also handles aggregate arguments that are passed into a function,
+// scalarizing them if the elements of the aggregate are only loaded. Note that
+// by default it refuses to scalarize aggregates which would require passing in
+// more than three operands to the function, because passing thousands of
+// operands for a large array or structure is unprofitable! This limit can be
+// configured or disabled, however.
+//
+// Note that this transformation could also be done for arguments that are only
+// stored to (returning the value instead), but does not currently. This case
+// would be best handled when and if LLVM begins supporting multiple return
+// values from functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FormatVariadic.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <map>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "argpromotion"
-
-STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
-STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
-STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
-STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
-
-/// A vector used to hold the indices of a single GEP instruction
-using IndicesVector = std::vector<uint64_t>;
-
-/// DoPromotion - This method actually performs the promotion of the specified
-/// arguments, and returns the new function. At this point, we know that it's
-/// safe to do so.
-static Function *
-doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
- Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
- ReplaceCallSite) {
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has modified arguments.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type *> Params;
-
- using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
-
- // ScalarizedElements - If we are promoting a pointer that has elements
- // accessed out of it, keep track of which elements are accessed so that we
- // can add one argument for each.
- //
- // Arguments that are directly loaded will have a zero element value here, to
- // handle cases where there are both a direct load and GEP accesses.
- std::map<Argument *, ScalarizeTable> ScalarizedElements;
-
- // OriginalLoads - Keep track of a representative load instruction from the
- // original function so that we can tell the alias analysis implementation
- // what the new GEP/Load instructions we are inserting look like.
- // We need to keep the original loads for each argument and the elements
- // of the argument that are accessed.
- std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
-
- // Attribute - Keep track of the parameter attributes for the arguments
- // that we are *not* promoting. For the ones that we do promote, the parameter
- // attributes are lost
- SmallVector<AttributeSet, 8> ArgAttrVec;
- AttributeList PAL = F->getAttributes();
-
- // First, determine the new argument list
- unsigned ArgNo = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgNo) {
- if (ByValArgsToTransform.count(&*I)) {
- // Simple byval argument? Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "argpromotion"
+
+STATISTIC(NumArgumentsPromoted, "Number of pointer arguments promoted");
+STATISTIC(NumAggregatesPromoted, "Number of aggregate arguments promoted");
+STATISTIC(NumByValArgsPromoted, "Number of byval arguments promoted");
+STATISTIC(NumArgumentsDead, "Number of dead pointer args eliminated");
+
+/// A vector used to hold the indices of a single GEP instruction
+using IndicesVector = std::vector<uint64_t>;
+
+/// DoPromotion - This method actually performs the promotion of the specified
+/// arguments, and returns the new function. At this point, we know that it's
+/// safe to do so.
+static Function *
+doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
+ ReplaceCallSite) {
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has modified arguments.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type *> Params;
+
+ using ScalarizeTable = std::set<std::pair<Type *, IndicesVector>>;
+
+ // ScalarizedElements - If we are promoting a pointer that has elements
+ // accessed out of it, keep track of which elements are accessed so that we
+ // can add one argument for each.
+ //
+ // Arguments that are directly loaded will have a zero element value here, to
+ // handle cases where there are both a direct load and GEP accesses.
+ std::map<Argument *, ScalarizeTable> ScalarizedElements;
+
+ // OriginalLoads - Keep track of a representative load instruction from the
+ // original function so that we can tell the alias analysis implementation
+ // what the new GEP/Load instructions we are inserting look like.
+ // We need to keep the original loads for each argument and the elements
+ // of the argument that are accessed.
+ std::map<std::pair<Argument *, IndicesVector>, LoadInst *> OriginalLoads;
+
+ // Attribute - Keep track of the parameter attributes for the arguments
+ // that we are *not* promoting. For the ones that we do promote, the parameter
+ // attributes are lost
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ AttributeList PAL = F->getAttributes();
+
+ // First, determine the new argument list
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgNo) {
+ if (ByValArgsToTransform.count(&*I)) {
+ // Simple byval argument? Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
llvm::append_range(Params, STy->elements());
- ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
- AttributeSet());
- ++NumByValArgsPromoted;
- } else if (!ArgsToPromote.count(&*I)) {
- // Unchanged argument
- Params.push_back(I->getType());
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
- } else if (I->use_empty()) {
- // Dead argument (which are always marked as promotable)
- ++NumArgumentsDead;
- } else {
- // Okay, this is being promoted. This means that the only uses are loads
- // or GEPs which are only used by loads
-
- // In this table, we will track which indices are loaded from the argument
- // (where direct loads are tracked as no indices).
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ ArgAttrVec.insert(ArgAttrVec.end(), STy->getNumElements(),
+ AttributeSet());
+ ++NumByValArgsPromoted;
+ } else if (!ArgsToPromote.count(&*I)) {
+ // Unchanged argument
+ Params.push_back(I->getType());
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgNo));
+ } else if (I->use_empty()) {
+ // Dead argument (which are always marked as promotable)
+ ++NumArgumentsDead;
+ } else {
+ // Okay, this is being promoted. This means that the only uses are loads
+ // or GEPs which are only used by loads
+
+ // In this table, we will track which indices are loaded from the argument
+ // (where direct loads are tracked as no indices).
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
for (User *U : make_early_inc_range(I->users())) {
- Instruction *UI = cast<Instruction>(U);
- Type *SrcTy;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- SrcTy = L->getType();
- else
- SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
+ Instruction *UI = cast<Instruction>(U);
+ Type *SrcTy;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ SrcTy = L->getType();
+ else
+ SrcTy = cast<GetElementPtrInst>(UI)->getSourceElementType();
// Skip dead GEPs and remove them.
if (isa<GetElementPtrInst>(UI) && UI->use_empty()) {
UI->eraseFromParent();
continue;
}
- IndicesVector Indices;
- Indices.reserve(UI->getNumOperands() - 1);
- // Since loads will only have a single operand, and GEPs only a single
- // non-index operand, this will record direct loads without any indices,
- // and gep+loads with the GEP indices.
- for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
- II != IE; ++II)
- Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
- // GEPs with a single 0 index can be merged with direct loads
- if (Indices.size() == 1 && Indices.front() == 0)
- Indices.clear();
- ArgIndices.insert(std::make_pair(SrcTy, Indices));
- LoadInst *OrigLoad;
- if (LoadInst *L = dyn_cast<LoadInst>(UI))
- OrigLoad = L;
- else
- // Take any load, we will use it only to update Alias Analysis
- OrigLoad = cast<LoadInst>(UI->user_back());
- OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
- }
-
- // Add a parameter to the function for each element passed in.
- for (const auto &ArgIndex : ArgIndices) {
- // not allowed to dereference ->begin() if size() is 0
- Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType())->getElementType(),
- ArgIndex.second));
- ArgAttrVec.push_back(AttributeSet());
- assert(Params.back());
- }
-
- if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
- ++NumArgumentsPromoted;
- else
- ++NumAggregatesPromoted;
- }
- }
-
- Type *RetTy = FTy->getReturnType();
-
- // Construct the new function type using the new arguments.
- FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
-
- // Create the new function body and insert it into the module.
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
- F->getName());
- NF->copyAttributesFrom(F);
+ IndicesVector Indices;
+ Indices.reserve(UI->getNumOperands() - 1);
+ // Since loads will only have a single operand, and GEPs only a single
+ // non-index operand, this will record direct loads without any indices,
+ // and gep+loads with the GEP indices.
+ for (User::op_iterator II = UI->op_begin() + 1, IE = UI->op_end();
+ II != IE; ++II)
+ Indices.push_back(cast<ConstantInt>(*II)->getSExtValue());
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Indices.size() == 1 && Indices.front() == 0)
+ Indices.clear();
+ ArgIndices.insert(std::make_pair(SrcTy, Indices));
+ LoadInst *OrigLoad;
+ if (LoadInst *L = dyn_cast<LoadInst>(UI))
+ OrigLoad = L;
+ else
+ // Take any load, we will use it only to update Alias Analysis
+ OrigLoad = cast<LoadInst>(UI->user_back());
+ OriginalLoads[std::make_pair(&*I, Indices)] = OrigLoad;
+ }
+
+ // Add a parameter to the function for each element passed in.
+ for (const auto &ArgIndex : ArgIndices) {
+ // not allowed to dereference ->begin() if size() is 0
+ Params.push_back(GetElementPtrInst::getIndexedType(
+ cast<PointerType>(I->getType())->getElementType(),
+ ArgIndex.second));
+ ArgAttrVec.push_back(AttributeSet());
+ assert(Params.back());
+ }
+
+ if (ArgIndices.size() == 1 && ArgIndices.begin()->second.empty())
+ ++NumArgumentsPromoted;
+ else
+ ++NumAggregatesPromoted;
+ }
+ }
+
+ Type *RetTy = FTy->getReturnType();
+
+ // Construct the new function type using the new arguments.
+ FunctionType *NFTy = FunctionType::get(RetTy, Params, FTy->isVarArg());
+
+ // Create the new function body and insert it into the module.
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace(),
+ F->getName());
+ NF->copyAttributesFrom(F);
NF->copyMetadata(F, 0);
-
+
// The new function will have the !dbg metadata copied from the original
// function. The original function may not be deleted, and dbg metadata need
// to be unique so we need to drop it.
- F->setSubprogram(nullptr);
-
- LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
- << "From: " << *F);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrVec));
- ArgAttrVec.clear();
-
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in the loaded pointers.
- //
- SmallVector<Value *, 16> Args;
- while (!F->use_empty()) {
- CallBase &CB = cast<CallBase>(*F->user_back());
- assert(CB.getCalledFunction() == F);
- const AttributeList &CallPAL = CB.getAttributes();
- IRBuilder<NoFolder> IRB(&CB);
-
- // Loop over the operands, inserting GEP and loads in the caller as
- // appropriate.
- auto AI = CB.arg_begin();
- ArgNo = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++AI, ++ArgNo)
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- Args.push_back(*AI); // Unmodified argument
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
- } else if (ByValArgsToTransform.count(&*I)) {
- // Emit a GEP and load for each element of the struct.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {
- ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- auto *Idx =
- IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i));
- // TODO: Tell AA about the new values?
- Args.push_back(IRB.CreateLoad(STy->getElementType(i), Idx,
- Idx->getName() + ".val"));
- ArgAttrVec.push_back(AttributeSet());
- }
- } else if (!I->use_empty()) {
- // Non-dead argument: insert GEPs and loads as appropriate.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
- // Store the Value* version of the indices in here, but declare it now
- // for reuse.
- std::vector<Value *> Ops;
- for (const auto &ArgIndex : ArgIndices) {
- Value *V = *AI;
- LoadInst *OrigLoad =
- OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
- if (!ArgIndex.second.empty()) {
- Ops.reserve(ArgIndex.second.size());
- Type *ElTy = V->getType();
- for (auto II : ArgIndex.second) {
- // Use i32 to index structs, and i64 for others (pointers/arrays).
- // This satisfies GEP constraints.
- Type *IdxTy =
- (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
- : Type::getInt64Ty(F->getContext()));
- Ops.push_back(ConstantInt::get(IdxTy, II));
- // Keep track of the type we're currently indexing.
- if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
- ElTy = ElPTy->getElementType();
- else
- ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
- }
- // And create a GEP to extract those indices.
- V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
- Ops.clear();
- }
- // Since we're replacing a load make sure we take the alignment
- // of the previous load.
- LoadInst *newLoad =
- IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
- newLoad->setAlignment(OrigLoad->getAlign());
- // Transfer the AA info too.
- AAMDNodes AAInfo;
- OrigLoad->getAAMetadata(AAInfo);
- newLoad->setAAMetadata(AAInfo);
-
- Args.push_back(newLoad);
- ArgAttrVec.push_back(AttributeSet());
- }
- }
-
- // Push any varargs arguments on the list.
- for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
- Args.push_back(*AI);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCS = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", &CB);
- } else {
- auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", &CB);
- NewCall->setTailCallKind(cast<CallInst>(&CB)->getTailCallKind());
- NewCS = NewCall;
- }
- NewCS->setCallingConv(CB.getCallingConv());
- NewCS->setAttributes(
- AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
- CallPAL.getRetAttributes(), ArgAttrVec));
- NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- Args.clear();
- ArgAttrVec.clear();
-
- // Update the callgraph to know that the callsite has been transformed.
- if (ReplaceCallSite)
- (*ReplaceCallSite)(CB, *NewCS);
-
- if (!CB.use_empty()) {
- CB.replaceAllUsesWith(NewCS);
- NewCS->takeName(&CB);
- }
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB.eraseFromParent();
- }
-
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, also transferring over the names as well.
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin();
- I != E; ++I) {
- if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
- // If this is an unmodified argument, move the name and users over to the
- // new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- continue;
- }
-
- if (ByValArgsToTransform.count(&*I)) {
- // In the callee, we create an alloca, and store each of the new incoming
- // arguments into the alloca.
- Instruction *InsertPt = &NF->begin()->front();
-
- // Just add all the struct element types.
- Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- Value *TheAlloca = new AllocaInst(
- AgTy, DL.getAllocaAddrSpace(), nullptr,
- I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
- InsertPt);
- StructType *STy = cast<StructType>(AgTy);
- Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
- nullptr};
-
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
- Value *Idx = GetElementPtrInst::Create(
- AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
- InsertPt);
- I2->setName(I->getName() + "." + Twine(i));
- new StoreInst(&*I2++, Idx, InsertPt);
- }
-
- // Anything that used the arg should now use the alloca.
- I->replaceAllUsesWith(TheAlloca);
- TheAlloca->takeName(&*I);
-
- // If the alloca is used in a call, we must clear the tail flag since
- // the callee now uses an alloca from the caller.
- for (User *U : TheAlloca->users()) {
- CallInst *Call = dyn_cast<CallInst>(U);
- if (!Call)
- continue;
- Call->setTailCall(false);
- }
- continue;
- }
-
+ F->setSubprogram(nullptr);
+
+ LLVM_DEBUG(dbgs() << "ARG PROMOTION: Promoting to:" << *NF << "\n"
+ << "From: " << *F);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ NF->setAttributes(AttributeList::get(F->getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrVec));
+ ArgAttrVec.clear();
+
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in the loaded pointers.
+ //
+ SmallVector<Value *, 16> Args;
+ while (!F->use_empty()) {
+ CallBase &CB = cast<CallBase>(*F->user_back());
+ assert(CB.getCalledFunction() == F);
+ const AttributeList &CallPAL = CB.getAttributes();
+ IRBuilder<NoFolder> IRB(&CB);
+
+ // Loop over the operands, inserting GEP and loads in the caller as
+ // appropriate.
+ auto AI = CB.arg_begin();
+ ArgNo = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++AI, ++ArgNo)
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ Args.push_back(*AI); // Unmodified argument
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ } else if (ByValArgsToTransform.count(&*I)) {
+ // Emit a GEP and load for each element of the struct.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {
+ ConstantInt::get(Type::getInt32Ty(F->getContext()), 0), nullptr};
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ auto *Idx =
+ IRB.CreateGEP(STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i));
+ // TODO: Tell AA about the new values?
+ Args.push_back(IRB.CreateLoad(STy->getElementType(i), Idx,
+ Idx->getName() + ".val"));
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ } else if (!I->use_empty()) {
+ // Non-dead argument: insert GEPs and loads as appropriate.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+ // Store the Value* version of the indices in here, but declare it now
+ // for reuse.
+ std::vector<Value *> Ops;
+ for (const auto &ArgIndex : ArgIndices) {
+ Value *V = *AI;
+ LoadInst *OrigLoad =
+ OriginalLoads[std::make_pair(&*I, ArgIndex.second)];
+ if (!ArgIndex.second.empty()) {
+ Ops.reserve(ArgIndex.second.size());
+ Type *ElTy = V->getType();
+ for (auto II : ArgIndex.second) {
+ // Use i32 to index structs, and i64 for others (pointers/arrays).
+ // This satisfies GEP constraints.
+ Type *IdxTy =
+ (ElTy->isStructTy() ? Type::getInt32Ty(F->getContext())
+ : Type::getInt64Ty(F->getContext()));
+ Ops.push_back(ConstantInt::get(IdxTy, II));
+ // Keep track of the type we're currently indexing.
+ if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
+ ElTy = ElPTy->getElementType();
+ else
+ ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
+ }
+ // And create a GEP to extract those indices.
+ V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
+ Ops.clear();
+ }
+ // Since we're replacing a load make sure we take the alignment
+ // of the previous load.
+ LoadInst *newLoad =
+ IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
+ newLoad->setAlignment(OrigLoad->getAlign());
+ // Transfer the AA info too.
+ AAMDNodes AAInfo;
+ OrigLoad->getAAMetadata(AAInfo);
+ newLoad->setAAMetadata(AAInfo);
+
+ Args.push_back(newLoad);
+ ArgAttrVec.push_back(AttributeSet());
+ }
+ }
+
+ // Push any varargs arguments on the list.
+ for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
+ Args.push_back(*AI);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCS = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", &CB);
+ } else {
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", &CB);
+ NewCall->setTailCallKind(cast<CallInst>(&CB)->getTailCallKind());
+ NewCS = NewCall;
+ }
+ NewCS->setCallingConv(CB.getCallingConv());
+ NewCS->setAttributes(
+ AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
+ CallPAL.getRetAttributes(), ArgAttrVec));
+ NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ Args.clear();
+ ArgAttrVec.clear();
+
+ // Update the callgraph to know that the callsite has been transformed.
+ if (ReplaceCallSite)
+ (*ReplaceCallSite)(CB, *NewCS);
+
+ if (!CB.use_empty()) {
+ CB.replaceAllUsesWith(NewCS);
+ NewCS->takeName(&CB);
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB.eraseFromParent();
+ }
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments, also transferring over the names as well.
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I) {
+ if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
+ // If this is an unmodified argument, move the name and users over to the
+ // new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ continue;
+ }
+
+ if (ByValArgsToTransform.count(&*I)) {
+ // In the callee, we create an alloca, and store each of the new incoming
+ // arguments into the alloca.
+ Instruction *InsertPt = &NF->begin()->front();
+
+ // Just add all the struct element types.
+ Type *AgTy = cast<PointerType>(I->getType())->getElementType();
+ Value *TheAlloca = new AllocaInst(
+ AgTy, DL.getAllocaAddrSpace(), nullptr,
+ I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
+ InsertPt);
+ StructType *STy = cast<StructType>(AgTy);
+ Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
+ nullptr};
+
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ Idxs[1] = ConstantInt::get(Type::getInt32Ty(F->getContext()), i);
+ Value *Idx = GetElementPtrInst::Create(
+ AgTy, TheAlloca, Idxs, TheAlloca->getName() + "." + Twine(i),
+ InsertPt);
+ I2->setName(I->getName() + "." + Twine(i));
+ new StoreInst(&*I2++, Idx, InsertPt);
+ }
+
+ // Anything that used the arg should now use the alloca.
+ I->replaceAllUsesWith(TheAlloca);
+ TheAlloca->takeName(&*I);
+
+ // If the alloca is used in a call, we must clear the tail flag since
+ // the callee now uses an alloca from the caller.
+ for (User *U : TheAlloca->users()) {
+ CallInst *Call = dyn_cast<CallInst>(U);
+ if (!Call)
+ continue;
+ Call->setTailCall(false);
+ }
+ continue;
+ }
+
// There potentially are metadata uses for things like llvm.dbg.value.
// Replace them with undef, after handling the other regular uses.
auto RauwUndefMetadata = make_scope_exit(
[&]() { I->replaceAllUsesWith(UndefValue::get(I->getType())); });
- if (I->use_empty())
- continue;
-
- // Otherwise, if we promoted this argument, then all users are load
- // instructions (or GEPs with only load users), and all loads should be
- // using the new argument that we added.
- ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
-
- while (!I->use_empty()) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
- assert(ArgIndices.begin()->second.empty() &&
- "Load element should sort to front!");
- I2->setName(I->getName() + ".val");
- LI->replaceAllUsesWith(&*I2);
- LI->eraseFromParent();
- LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
- << "' in function '" << F->getName() << "'\n");
- } else {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
+ if (I->use_empty())
+ continue;
+
+ // Otherwise, if we promoted this argument, then all users are load
+ // instructions (or GEPs with only load users), and all loads should be
+ // using the new argument that we added.
+ ScalarizeTable &ArgIndices = ScalarizedElements[&*I];
+
+ while (!I->use_empty()) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I->user_back())) {
+ assert(ArgIndices.begin()->second.empty() &&
+ "Load element should sort to front!");
+ I2->setName(I->getName() + ".val");
+ LI->replaceAllUsesWith(&*I2);
+ LI->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "*** Promoted load of argument '" << I->getName()
+ << "' in function '" << F->getName() << "'\n");
+ } else {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I->user_back());
assert(!GEP->use_empty() &&
"GEPs without uses should be cleaned up already");
- IndicesVector Operands;
- Operands.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
-
- // GEPs with a single 0 index can be merged with direct loads
- if (Operands.size() == 1 && Operands.front() == 0)
- Operands.clear();
-
- Function::arg_iterator TheArg = I2;
- for (ScalarizeTable::iterator It = ArgIndices.begin();
- It->second != Operands; ++It, ++TheArg) {
- assert(It != ArgIndices.end() && "GEP not handled??");
- }
-
- TheArg->setName(formatv("{0}.{1:$[.]}.val", I->getName(),
- make_range(Operands.begin(), Operands.end())));
-
- LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
- << "' of function '" << NF->getName() << "'\n");
-
- // All of the uses must be load instructions. Replace them all with
- // the argument specified by ArgNo.
- while (!GEP->use_empty()) {
- LoadInst *L = cast<LoadInst>(GEP->user_back());
- L->replaceAllUsesWith(&*TheArg);
- L->eraseFromParent();
- }
- GEP->eraseFromParent();
- }
- }
- // Increment I2 past all of the arguments added for this promoted pointer.
- std::advance(I2, ArgIndices.size());
- }
-
- return NF;
-}
-
-/// Return true if we can prove that all callees pass in a valid pointer for the
-/// specified function argument.
-static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
- Function *Callee = Arg->getParent();
- const DataLayout &DL = Callee->getParent()->getDataLayout();
-
- unsigned ArgNo = Arg->getArgNo();
-
- // Look at all call sites of the function. At this point we know we only have
- // direct callees.
- for (User *U : Callee->users()) {
- CallBase &CB = cast<CallBase>(*U);
-
- if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL))
- return false;
- }
- return true;
-}
-
-/// Returns true if Prefix is a prefix of longer. That means, Longer has a size
-/// that is greater than or equal to the size of prefix, and each of the
-/// elements in Prefix is the same as the corresponding elements in Longer.
-///
-/// This means it also returns true when Prefix and Longer are equal!
-static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
- if (Prefix.size() > Longer.size())
- return false;
- return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
-}
-
-/// Checks if Indices, or a prefix of Indices, is in Set.
-static bool prefixIn(const IndicesVector &Indices,
- std::set<IndicesVector> &Set) {
- std::set<IndicesVector>::iterator Low;
- Low = Set.upper_bound(Indices);
- if (Low != Set.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This means
- // it points to a prefix of Indices (possibly Indices itself), if such
- // prefix exists.
- //
- // This load is safe if any prefix of its operands is safe to load.
- return Low != Set.end() && isPrefix(*Low, Indices);
-}
-
-/// Mark the given indices (ToMark) as safe in the given set of indices
-/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
-/// is already a prefix of Indices in Safe, Indices are implicitely marked safe
-/// already. Furthermore, any indices that Indices is itself a prefix of, are
-/// removed from Safe (since they are implicitely safe because of Indices now).
-static void markIndicesSafe(const IndicesVector &ToMark,
- std::set<IndicesVector> &Safe) {
- std::set<IndicesVector>::iterator Low;
- Low = Safe.upper_bound(ToMark);
- // Guard against the case where Safe is empty
- if (Low != Safe.begin())
- Low--;
- // Low is now the last element smaller than or equal to Indices. This
- // means it points to a prefix of Indices (possibly Indices itself), if
- // such prefix exists.
- if (Low != Safe.end()) {
- if (isPrefix(*Low, ToMark))
- // If there is already a prefix of these indices (or exactly these
- // indices) marked a safe, don't bother adding these indices
- return;
-
- // Increment Low, so we can use it as a "insert before" hint
- ++Low;
- }
- // Insert
- Low = Safe.insert(Low, ToMark);
- ++Low;
- // If there we're a prefix of longer index list(s), remove those
- std::set<IndicesVector>::iterator End = Safe.end();
- while (Low != End && isPrefix(ToMark, *Low)) {
- std::set<IndicesVector>::iterator Remove = Low;
- ++Low;
- Safe.erase(Remove);
- }
-}
-
-/// isSafeToPromoteArgument - As you might guess from the name of this method,
-/// it checks to see if it is both safe and useful to promote the argument.
-/// This method limits promotion of aggregates to only promote up to three
-/// elements of the aggregate in order to avoid exploding the number of
-/// arguments passed in.
-static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
- unsigned MaxElements) {
- using GEPIndicesSet = std::set<IndicesVector>;
-
- // Quick exit for unused arguments
- if (Arg->use_empty())
- return true;
-
- // We can only promote this argument if all of the uses are loads, or are GEP
- // instructions (with constant indices) that are subsequently loaded.
- //
- // Promoting the argument causes it to be loaded in the caller
- // unconditionally. This is only safe if we can prove that either the load
- // would have happened in the callee anyway (ie, there is a load in the entry
- // block) or the pointer passed in at every call site is guaranteed to be
- // valid.
- // In the former case, invalid loads can happen, but would have happened
- // anyway, in the latter case, invalid loads won't happen. This prevents us
- // from introducing an invalid load that wouldn't have happened in the
- // original code.
- //
- // This set will contain all sets of indices that are loaded in the entry
- // block, and thus are safe to unconditionally load in the caller.
- GEPIndicesSet SafeToUnconditionallyLoad;
-
- // This set contains all the sets of indices that we are planning to promote.
- // This makes it possible to limit the number of arguments added.
- GEPIndicesSet ToPromote;
-
- // If the pointer is always valid, any load with first index 0 is valid.
-
- if (ByValTy)
- SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
-
- // Whenever a new underlying type for the operand is found, make sure it's
- // consistent with the GEPs and loads we've already seen and, if necessary,
- // use it to see if all incoming pointers are valid (which implies the 0-index
- // is safe).
- Type *BaseTy = ByValTy;
- auto UpdateBaseTy = [&](Type *NewBaseTy) {
- if (BaseTy)
- return BaseTy == NewBaseTy;
-
- BaseTy = NewBaseTy;
- if (allCallersPassValidPointerForArgument(Arg, BaseTy)) {
- assert(SafeToUnconditionallyLoad.empty());
- SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
- }
-
- return true;
- };
-
- // First, iterate the entry block and mark loads of (geps of) arguments as
- // safe.
- BasicBlock &EntryBlock = Arg->getParent()->front();
- // Declare this here so we can reuse it
- IndicesVector Indices;
- for (Instruction &I : EntryBlock)
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- Value *V = LI->getPointerOperand();
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
- V = GEP->getPointerOperand();
- if (V == Arg) {
- // This load actually loads (part of) Arg? Check the indices then.
- Indices.reserve(GEP->getNumIndices());
- for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
- II != IE; ++II)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
- Indices.push_back(CI->getSExtValue());
- else
- // We found a non-constant GEP index for this argument? Bail out
- // right away, can't promote this argument at all.
- return false;
-
- if (!UpdateBaseTy(GEP->getSourceElementType()))
- return false;
-
- // Indices checked out, mark them as safe
- markIndicesSafe(Indices, SafeToUnconditionallyLoad);
- Indices.clear();
- }
- } else if (V == Arg) {
- // Direct loads are equivalent to a GEP with a single 0 index.
- markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
-
- if (BaseTy && LI->getType() != BaseTy)
- return false;
-
- BaseTy = LI->getType();
- }
- }
-
- // Now, iterate all uses of the argument to see if there are any uses that are
- // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
- SmallVector<LoadInst *, 16> Loads;
- IndicesVector Operands;
- for (Use &U : Arg->uses()) {
- User *UR = U.getUser();
- Operands.clear();
- if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
- // Don't hack volatile/atomic loads
- if (!LI->isSimple())
- return false;
- Loads.push_back(LI);
- // Direct loads are equivalent to a GEP with a zero index and then a load.
- Operands.push_back(0);
-
- if (!UpdateBaseTy(LI->getType()))
- return false;
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
- if (GEP->use_empty()) {
- // Dead GEPs cause trouble later. Just remove them if we run into
- // them.
+ IndicesVector Operands;
+ Operands.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ Operands.push_back(cast<ConstantInt>(*II)->getSExtValue());
+
+ // GEPs with a single 0 index can be merged with direct loads
+ if (Operands.size() == 1 && Operands.front() == 0)
+ Operands.clear();
+
+ Function::arg_iterator TheArg = I2;
+ for (ScalarizeTable::iterator It = ArgIndices.begin();
+ It->second != Operands; ++It, ++TheArg) {
+ assert(It != ArgIndices.end() && "GEP not handled??");
+ }
+
+ TheArg->setName(formatv("{0}.{1:$[.]}.val", I->getName(),
+ make_range(Operands.begin(), Operands.end())));
+
+ LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
+ << "' of function '" << NF->getName() << "'\n");
+
+ // All of the uses must be load instructions. Replace them all with
+ // the argument specified by ArgNo.
+ while (!GEP->use_empty()) {
+ LoadInst *L = cast<LoadInst>(GEP->user_back());
+ L->replaceAllUsesWith(&*TheArg);
+ L->eraseFromParent();
+ }
+ GEP->eraseFromParent();
+ }
+ }
+ // Increment I2 past all of the arguments added for this promoted pointer.
+ std::advance(I2, ArgIndices.size());
+ }
+
+ return NF;
+}
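For a promoted load through a GEP with constant indices [0, 2] on an argument named s, the formatv call above yields a replacement argument named s.0.2.val. A plain-C++ sketch of that naming scheme (hypothetical helper using std::string instead of llvm::formatv; illustrative only):

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Builds "<arg>.<idx0>.<idx1>...val", the name given to the scalar argument
// that replaces loads of those GEP indices.
static std::string promotedArgName(const std::string &Base,
                                   const std::vector<uint64_t> &Indices) {
  std::string Name = Base;
  for (uint64_t Idx : Indices)
    Name += "." + std::to_string(Idx);
  return Name + ".val";
}

int main() {
  std::cout << promotedArgName("s", {0, 2}) << "\n"; // prints "s.0.2.val"
  return 0;
}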
+
+/// Return true if we can prove that all callees pass in a valid pointer for the
+/// specified function argument.
+static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
+ Function *Callee = Arg->getParent();
+ const DataLayout &DL = Callee->getParent()->getDataLayout();
+
+ unsigned ArgNo = Arg->getArgNo();
+
+ // Look at all call sites of the function. At this point we know we only have
+ // direct callees.
+ for (User *U : Callee->users()) {
+ CallBase &CB = cast<CallBase>(*U);
+
+ if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL))
+ return false;
+ }
+ return true;
+}
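The check above has to hold at every direct call site. A small C++ sketch of the two caller shapes it distinguishes (hypothetical names, illustrative only): passing the address of a local object is trivially dereferenceable, while forwarding an arbitrary incoming pointer is not provably so.

struct Pair { int a, b; };

static int callee(const Pair *p) { return p ? p->a : 0; }

// Passing the address of a local object: the pointer is known dereferenceable.
int alwaysValid() { Pair Local{1, 2}; return callee(&Local); }

// Forwarding an arbitrary incoming pointer: nothing proves dereferenceability,
// so a check like the one above must conservatively answer "false".
int maybeNull(const Pair *Q) { return callee(Q); }

int main() { return (alwaysValid() == 1 && maybeNull(nullptr) == 0) ? 0 : 1; }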
+
+/// Returns true if Prefix is a prefix of Longer. That means Longer has a size
+/// that is greater than or equal to the size of Prefix, and each of the
+/// elements in Prefix is the same as the corresponding element in Longer.
+///
+/// This means it also returns true when Prefix and Longer are equal!
+static bool isPrefix(const IndicesVector &Prefix, const IndicesVector &Longer) {
+ if (Prefix.size() > Longer.size())
+ return false;
+ return std::equal(Prefix.begin(), Prefix.end(), Longer.begin());
+}
+
+/// Checks if Indices, or a prefix of Indices, is in Set.
+static bool prefixIn(const IndicesVector &Indices,
+ std::set<IndicesVector> &Set) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Set.upper_bound(Indices);
+ if (Low != Set.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to Indices. This means
+ // it points to a prefix of Indices (possibly Indices itself), if such
+ // prefix exists.
+ //
+ // This load is safe if any prefix of its operands is safe to load.
+ return Low != Set.end() && isPrefix(*Low, Indices);
+}
+
+/// Mark the given indices (ToMark) as safe in the given set of indices
+/// (Safe). Marking safe usually means adding ToMark to Safe. However, if there
+/// is already a prefix of ToMark in Safe, ToMark is implicitly marked safe
+/// already. Furthermore, any indices that ToMark is itself a prefix of are
+/// removed from Safe (since they are implicitly safe because of ToMark now).
+static void markIndicesSafe(const IndicesVector &ToMark,
+ std::set<IndicesVector> &Safe) {
+ std::set<IndicesVector>::iterator Low;
+ Low = Safe.upper_bound(ToMark);
+ // Guard against the case where Safe is empty
+ if (Low != Safe.begin())
+ Low--;
+ // Low is now the last element smaller than or equal to ToMark. This
+ // means it points to a prefix of ToMark (possibly ToMark itself), if
+ // such prefix exists.
+ if (Low != Safe.end()) {
+ if (isPrefix(*Low, ToMark))
+ // If there is already a prefix of these indices (or exactly these
+ // indices) marked as safe, don't bother adding these indices
+ return;
+
+ // Increment Low, so we can use it as an "insert before" hint
+ ++Low;
+ }
+ // Insert
+ Low = Safe.insert(Low, ToMark);
+ ++Low;
+ // If ToMark is a prefix of longer index list(s), remove those
+ std::set<IndicesVector>::iterator End = Safe.end();
+ while (Low != End && isPrefix(ToMark, *Low)) {
+ std::set<IndicesVector>::iterator Remove = Low;
+ ++Low;
+ Safe.erase(Remove);
+ }
+}
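A standalone, deliberately simplified equivalent of the prefix bookkeeping implemented by isPrefix/prefixIn/markIndicesSafe above, using a linear scan instead of the upper_bound hint so the resulting set contents are easy to check; the helper names and the O(n) formulation are illustrative only:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <iterator>
#include <set>
#include <vector>

using IndicesVector = std::vector<uint64_t>;

static bool isPrefixOf(const IndicesVector &P, const IndicesVector &L) {
  return P.size() <= L.size() && std::equal(P.begin(), P.end(), L.begin());
}

// Keep the set minimal and prefix-closed: skip ToMark if one of its prefixes
// (or ToMark itself) is already present, otherwise insert it and drop every
// longer entry that ToMark is a prefix of.
static void markSafe(const IndicesVector &ToMark, std::set<IndicesVector> &Safe) {
  for (const IndicesVector &S : Safe)
    if (isPrefixOf(S, ToMark))
      return;
  for (auto It = Safe.begin(); It != Safe.end();)
    It = isPrefixOf(ToMark, *It) ? Safe.erase(It) : std::next(It);
  Safe.insert(ToMark);
}

int main() {
  std::set<IndicesVector> Safe = {{0, 1}, {0, 2}, {1}};
  markSafe({0}, Safe); // {0} subsumes {0,1} and {0,2}
  assert((Safe == std::set<IndicesVector>{{0}, {1}}));
  markSafe({1, 3}, Safe); // no-op: prefix {1} is already marked safe
  assert((Safe == std::set<IndicesVector>{{0}, {1}}));
  return 0;
}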
+
+/// isSafeToPromoteArgument - As you might guess from the name of this method,
+/// it checks to see if it is both safe and useful to promote the argument.
+/// This method limits promotion of aggregates to only promote up to three
+/// elements of the aggregate in order to avoid exploding the number of
+/// arguments passed in.
+static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR,
+ unsigned MaxElements) {
+ using GEPIndicesSet = std::set<IndicesVector>;
+
+ // Quick exit for unused arguments
+ if (Arg->use_empty())
+ return true;
+
+ // We can only promote this argument if all of the uses are loads, or are GEP
+ // instructions (with constant indices) that are subsequently loaded.
+ //
+ // Promoting the argument causes it to be loaded in the caller
+ // unconditionally. This is only safe if we can prove that either the load
+ // would have happened in the callee anyway (i.e., there is a load in the entry
+ // block) or the pointer passed in at every call site is guaranteed to be
+ // valid.
+ // In the former case, invalid loads can happen, but would have happened
+ // anyway; in the latter case, invalid loads won't happen. This prevents us
+ // from introducing an invalid load that wouldn't have happened in the
+ // original code.
+ //
+ // This set will contain all sets of indices that are loaded in the entry
+ // block, and thus are safe to unconditionally load in the caller.
+ GEPIndicesSet SafeToUnconditionallyLoad;
+
+ // This set contains all the sets of indices that we are planning to promote.
+ // This makes it possible to limit the number of arguments added.
+ GEPIndicesSet ToPromote;
+
+ // If the pointer is always valid, any load with first index 0 is valid.
+
+ if (ByValTy)
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+
+ // Whenever a new underlying type for the operand is found, make sure it's
+ // consistent with the GEPs and loads we've already seen and, if necessary,
+ // use it to see if all incoming pointers are valid (which implies the 0-index
+ // is safe).
+ Type *BaseTy = ByValTy;
+ auto UpdateBaseTy = [&](Type *NewBaseTy) {
+ if (BaseTy)
+ return BaseTy == NewBaseTy;
+
+ BaseTy = NewBaseTy;
+ if (allCallersPassValidPointerForArgument(Arg, BaseTy)) {
+ assert(SafeToUnconditionallyLoad.empty());
+ SafeToUnconditionallyLoad.insert(IndicesVector(1, 0));
+ }
+
+ return true;
+ };
+
+ // First, iterate the entry block and mark loads of (geps of) arguments as
+ // safe.
+ BasicBlock &EntryBlock = Arg->getParent()->front();
+ // Declare this here so we can reuse it
+ IndicesVector Indices;
+ for (Instruction &I : EntryBlock)
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ Value *V = LI->getPointerOperand();
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(V)) {
+ V = GEP->getPointerOperand();
+ if (V == Arg) {
+ // This load actually loads (part of) Arg? Check the indices then.
+ Indices.reserve(GEP->getNumIndices());
+ for (User::op_iterator II = GEP->idx_begin(), IE = GEP->idx_end();
+ II != IE; ++II)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(*II))
+ Indices.push_back(CI->getSExtValue());
+ else
+ // We found a non-constant GEP index for this argument? Bail out
+ // right away, can't promote this argument at all.
+ return false;
+
+ if (!UpdateBaseTy(GEP->getSourceElementType()))
+ return false;
+
+ // Indices checked out, mark them as safe
+ markIndicesSafe(Indices, SafeToUnconditionallyLoad);
+ Indices.clear();
+ }
+ } else if (V == Arg) {
+ // Direct loads are equivalent to a GEP with a single 0 index.
+ markIndicesSafe(IndicesVector(1, 0), SafeToUnconditionallyLoad);
+
+ if (BaseTy && LI->getType() != BaseTy)
+ return false;
+
+ BaseTy = LI->getType();
+ }
+ }
+
+ // Now, iterate all uses of the argument to see if there are any uses that are
+ // not (GEP+)loads, or any (GEP+)loads that are not safe to promote.
+ SmallVector<LoadInst *, 16> Loads;
+ IndicesVector Operands;
+ for (Use &U : Arg->uses()) {
+ User *UR = U.getUser();
+ Operands.clear();
+ if (LoadInst *LI = dyn_cast<LoadInst>(UR)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ // Direct loads are equivalent to a GEP with a zero index and then a load.
+ Operands.push_back(0);
+
+ if (!UpdateBaseTy(LI->getType()))
+ return false;
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(UR)) {
+ if (GEP->use_empty()) {
+ // Dead GEPs cause trouble later. Just remove them if we run into
+ // them.
continue;
- }
-
- if (!UpdateBaseTy(GEP->getSourceElementType()))
- return false;
-
- // Ensure that all of the indices are constants.
- for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
- ++i)
- if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
- Operands.push_back(C->getSExtValue());
- else
- return false; // Not a constant operand GEP!
-
- // Ensure that the only users of the GEP are load instructions.
- for (User *GEPU : GEP->users())
- if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
- // Don't hack volatile/atomic loads
- if (!LI->isSimple())
- return false;
- Loads.push_back(LI);
- } else {
- // Other uses than load?
- return false;
- }
- } else {
- return false; // Not a load or a GEP.
- }
-
- // Now, see if it is safe to promote this load / loads of this GEP. Loading
- // is safe if Operands, or a prefix of Operands, is marked as safe.
- if (!prefixIn(Operands, SafeToUnconditionallyLoad))
- return false;
-
- // See if we are already promoting a load with these indices. If not, check
- // to make sure that we aren't promoting too many elements. If so, nothing
- // to do.
- if (ToPromote.find(Operands) == ToPromote.end()) {
- if (MaxElements > 0 && ToPromote.size() == MaxElements) {
- LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
- << Arg->getName()
- << "' because it would require adding more "
- << "than " << MaxElements
- << " arguments to the function.\n");
- // We limit aggregate promotion to only promoting up to a fixed number
- // of elements of the aggregate.
- return false;
- }
- ToPromote.insert(std::move(Operands));
- }
- }
-
- if (Loads.empty())
- return true; // No users, this is a dead argument.
-
- // Okay, now we know that the argument is only used by load instructions and
- // it is safe to unconditionally perform all of them. Use alias analysis to
- // check to see if the pointer is guaranteed to not be modified from entry of
- // the function to each of the load instructions.
-
- // Because there could be several/many load instructions, remember which
- // blocks we know to be transparent to the load.
- df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
-
- for (LoadInst *Load : Loads) {
- // Check to see if the load is invalidated from the start of the block to
- // the load itself.
- BasicBlock *BB = Load->getParent();
-
- MemoryLocation Loc = MemoryLocation::get(Load);
- if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod))
- return false; // Pointer is invalidated!
-
- // Now check every path from the entry block to the load for transparency.
- // To do this, we perform a depth first search on the inverse CFG from the
- // loading block.
- for (BasicBlock *P : predecessors(BB)) {
- for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
- if (AAR.canBasicBlockModify(*TranspBB, Loc))
- return false;
- }
- }
-
- // If the path from the entry of the function to each load is free of
- // instructions that potentially invalidate the load, we can make the
- // transformation!
- return true;
-}
-
-bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) {
- // There is no size information, so be conservative.
- if (!type->isSized())
- return false;
-
- // If the alloc size is not equal to the storage size, then there are padding
- // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
- if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
- return false;
-
- // FIXME: This isn't the right way to check for padding in vectors with
- // non-byte-size elements.
- if (VectorType *seqTy = dyn_cast<VectorType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
-
- // For array types, check for padding within members.
- if (ArrayType *seqTy = dyn_cast<ArrayType>(type))
- return isDenselyPacked(seqTy->getElementType(), DL);
-
- if (!isa<StructType>(type))
- return true;
-
- // Check for padding within and between elements of a struct.
- StructType *StructTy = cast<StructType>(type);
- const StructLayout *Layout = DL.getStructLayout(StructTy);
- uint64_t StartPos = 0;
- for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
- Type *ElTy = StructTy->getElementType(i);
- if (!isDenselyPacked(ElTy, DL))
- return false;
- if (StartPos != Layout->getElementOffsetInBits(i))
- return false;
- StartPos += DL.getTypeAllocSizeInBits(ElTy);
- }
-
- return true;
-}
-
-/// Checks if the padding bytes of an argument could be accessed.
-static bool canPaddingBeAccessed(Argument *arg) {
- assert(arg->hasByValAttr());
-
- // Track all the pointers to the argument to make sure they are not captured.
- SmallPtrSet<Value *, 16> PtrValues;
- PtrValues.insert(arg);
-
- // Track all of the stores.
- SmallVector<StoreInst *, 16> Stores;
-
- // Scan through the uses recursively to make sure the pointer is always used
- // sanely.
+ }
+
+ if (!UpdateBaseTy(GEP->getSourceElementType()))
+ return false;
+
+ // Ensure that all of the indices are constants.
+ for (User::op_iterator i = GEP->idx_begin(), e = GEP->idx_end(); i != e;
+ ++i)
+ if (ConstantInt *C = dyn_cast<ConstantInt>(*i))
+ Operands.push_back(C->getSExtValue());
+ else
+ return false; // Not a constant operand GEP!
+
+ // Ensure that the only users of the GEP are load instructions.
+ for (User *GEPU : GEP->users())
+ if (LoadInst *LI = dyn_cast<LoadInst>(GEPU)) {
+ // Don't hack volatile/atomic loads
+ if (!LI->isSimple())
+ return false;
+ Loads.push_back(LI);
+ } else {
+ // Other uses than load?
+ return false;
+ }
+ } else {
+ return false; // Not a load or a GEP.
+ }
+
+ // Now, see if it is safe to promote this load / loads of this GEP. Loading
+ // is safe if Operands, or a prefix of Operands, is marked as safe.
+ if (!prefixIn(Operands, SafeToUnconditionallyLoad))
+ return false;
+
+ // See if we are already promoting a load with these indices. If not, check
+ // to make sure that we aren't promoting too many elements. If so, nothing
+ // to do.
+ if (ToPromote.find(Operands) == ToPromote.end()) {
+ if (MaxElements > 0 && ToPromote.size() == MaxElements) {
+ LLVM_DEBUG(dbgs() << "argpromotion not promoting argument '"
+ << Arg->getName()
+ << "' because it would require adding more "
+ << "than " << MaxElements
+ << " arguments to the function.\n");
+ // We limit aggregate promotion to only promoting up to a fixed number
+ // of elements of the aggregate.
+ return false;
+ }
+ ToPromote.insert(std::move(Operands));
+ }
+ }
+
+ if (Loads.empty())
+ return true; // No users, this is a dead argument.
+
+ // Okay, now we know that the argument is only used by load instructions and
+ // it is safe to unconditionally perform all of them. Use alias analysis to
+ // check to see if the pointer is guaranteed to not be modified from entry of
+ // the function to each of the load instructions.
+
+ // Because there could be several/many load instructions, remember which
+ // blocks we know to be transparent to the load.
+ df_iterator_default_set<BasicBlock *, 16> TranspBlocks;
+
+ for (LoadInst *Load : Loads) {
+ // Check to see if the load is invalidated from the start of the block to
+ // the load itself.
+ BasicBlock *BB = Load->getParent();
+
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ if (AAR.canInstructionRangeModRef(BB->front(), *Load, Loc, ModRefInfo::Mod))
+ return false; // Pointer is invalidated!
+
+ // Now check every path from the entry block to the load for transparency.
+ // To do this, we perform a depth first search on the inverse CFG from the
+ // loading block.
+ for (BasicBlock *P : predecessors(BB)) {
+ for (BasicBlock *TranspBB : inverse_depth_first_ext(P, TranspBlocks))
+ if (AAR.canBasicBlockModify(*TranspBB, Loc))
+ return false;
+ }
+ }
+
+ // If the path from the entry of the function to each load is free of
+ // instructions that potentially invalidate the load, we can make the
+ // transformation!
+ return true;
+}
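The entry-block condition above can be seen on a small example, sketched in C++ rather than IR (hypothetical functions, illustrative only): promoting p in unconditional is always safe because the load executes on every path through the callee, whereas guarded only loads p on some paths, so hoisting the load to the callers needs the separate "all callers pass a valid pointer" proof.

// Loaded unconditionally in the entry block: callers can do the load instead.
static int unconditional(const int *p) { return *p + 1; }

// Loaded only when c is true: hoisting the load to callers could introduce a
// trap (e.g. p == nullptr together with c == false) that the original program
// never had, unless every caller is known to pass a dereferenceable pointer.
static int guarded(const int *p, bool c) { return c ? *p : 0; }

int main() {
  int x = 41;
  return unconditional(&x) + guarded(&x, false) == 42 ? 0 : 1;
}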
+
+bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) {
+ // There is no size information, so be conservative.
+ if (!type->isSized())
+ return false;
+
+ // If the alloc size is not equal to the storage size, then there are padding
+ // bytes. For x86_fp80 on x86-64, size: 80 alloc size: 128.
+ if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
+ return false;
+
+ // FIXME: This isn't the right way to check for padding in vectors with
+ // non-byte-size elements.
+ if (VectorType *seqTy = dyn_cast<VectorType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
+
+ // For array types, check for padding within members.
+ if (ArrayType *seqTy = dyn_cast<ArrayType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
+
+ if (!isa<StructType>(type))
+ return true;
+
+ // Check for padding within and between elements of a struct.
+ StructType *StructTy = cast<StructType>(type);
+ const StructLayout *Layout = DL.getStructLayout(StructTy);
+ uint64_t StartPos = 0;
+ for (unsigned i = 0, E = StructTy->getNumElements(); i < E; ++i) {
+ Type *ElTy = StructTy->getElementType(i);
+ if (!isDenselyPacked(ElTy, DL))
+ return false;
+ if (StartPos != Layout->getElementOffsetInBits(i))
+ return false;
+ StartPos += DL.getTypeAllocSizeInBits(ElTy);
+ }
+
+ return true;
+}
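As a worked example, assuming a typical x86-64 data layout (an assumption; the check itself only compares DataLayout sizes and offsets): a struct of two 32-bit ints is densely packed, while a 32-bit int followed by a 64-bit int is not, because the second member is aligned to offset 8 while the running StartPos is only 4 at that point.

#include <cstddef>
#include <cstdint>

struct Packed { int32_t a; int32_t b; }; // offsets 0 and 4, no padding
struct Padded { int32_t a; int64_t b; }; // 4 padding bytes before b on x86-64

// These hold under the common LP64/x86-64 ABI assumed above.
static_assert(sizeof(Packed) == 8, "no padding expected");
static_assert(offsetof(Padded, b) == 8 && sizeof(Padded) == 16,
              "alignment of b introduces padding");

int main() { return 0; }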
+
+/// Checks if the padding bytes of an argument could be accessed.
+static bool canPaddingBeAccessed(Argument *arg) {
+ assert(arg->hasByValAttr());
+
+ // Track all the pointers to the argument to make sure they are not captured.
+ SmallPtrSet<Value *, 16> PtrValues;
+ PtrValues.insert(arg);
+
+ // Track all of the stores.
+ SmallVector<StoreInst *, 16> Stores;
+
+ // Scan through the uses recursively to make sure the pointer is always used
+ // sanely.
SmallVector<Value *, 16> WorkList(arg->users());
- while (!WorkList.empty()) {
+ while (!WorkList.empty()) {
Value *V = WorkList.pop_back_val();
- if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
- if (PtrValues.insert(V).second)
+ if (isa<GetElementPtrInst>(V) || isa<PHINode>(V)) {
+ if (PtrValues.insert(V).second)
llvm::append_range(WorkList, V->users());
- } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
- Stores.push_back(Store);
- } else if (!isa<LoadInst>(V)) {
- return true;
- }
- }
-
- // Check to make sure the pointers aren't captured
- for (StoreInst *Store : Stores)
- if (PtrValues.count(Store->getValueOperand()))
- return true;
-
- return false;
-}
-
-bool ArgumentPromotionPass::areFunctionArgsABICompatible(
- const Function &F, const TargetTransformInfo &TTI,
- SmallPtrSetImpl<Argument *> &ArgsToPromote,
- SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
- for (const Use &U : F.uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB)
- return false;
- const Function *Caller = CB->getCaller();
- const Function *Callee = CB->getCalledFunction();
- if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
- !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
- return false;
- }
- return true;
-}
-
-/// PromoteArguments - This method checks the specified function to see if there
-/// are any promotable arguments and if it is safe to promote the function (for
-/// example, all callers are direct). If safe to promote some arguments, it
-/// calls the DoPromotion method.
-static Function *
-promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
- unsigned MaxElements,
- Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
- ReplaceCallSite,
- const TargetTransformInfo &TTI) {
- // Don't perform argument promotion for naked functions; otherwise we can end
- // up removing parameters that are seemingly 'not used' as they are referred
- // to in the assembly.
- if (F->hasFnAttribute(Attribute::Naked))
- return nullptr;
-
- // Make sure that it is local to this module.
- if (!F->hasLocalLinkage())
- return nullptr;
-
- // Don't promote arguments for variadic functions. Adding, removing, or
- // changing non-pack parameters can change the classification of pack
- // parameters. Frontends encode that classification at the call site in the
- // IR, while in the callee the classification is determined dynamically based
- // on the number of registers consumed so far.
- if (F->isVarArg())
- return nullptr;
-
- // Don't transform functions that receive inallocas, as the transformation may
- // not be safe depending on calling convention.
- if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
- return nullptr;
-
- // First check: see if there are any pointer arguments! If not, quick exit.
- SmallVector<Argument *, 16> PointerArgs;
- for (Argument &I : F->args())
- if (I.getType()->isPointerTy())
- PointerArgs.push_back(&I);
- if (PointerArgs.empty())
- return nullptr;
-
- // Second check: make sure that all callers are direct callers. We can't
- // transform functions that have indirect callers. Also see if the function
- // is self-recursive and check that target features are compatible.
- bool isSelfRecursive = false;
- for (Use &U : F->uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- // Must be a direct call.
- if (CB == nullptr || !CB->isCallee(&U))
- return nullptr;
-
- // Can't change signature of musttail callee
- if (CB->isMustTailCall())
- return nullptr;
-
- if (CB->getParent()->getParent() == F)
- isSelfRecursive = true;
- }
-
- // Can't change signature of musttail caller
- // FIXME: Support promoting whole chain of musttail functions
- for (BasicBlock &BB : *F)
- if (BB.getTerminatingMustTailCall())
- return nullptr;
-
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- AAResults &AAR = AARGetter(*F);
-
- // Check to see which arguments are promotable. If an argument is promotable,
- // add it to ArgsToPromote.
- SmallPtrSet<Argument *, 8> ArgsToPromote;
- SmallPtrSet<Argument *, 8> ByValArgsToTransform;
- for (Argument *PtrArg : PointerArgs) {
- Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
-
- // Replace sret attribute with noalias. This reduces register pressure by
- // avoiding a register copy.
- if (PtrArg->hasStructRetAttr()) {
- unsigned ArgNo = PtrArg->getArgNo();
- F->removeParamAttr(ArgNo, Attribute::StructRet);
- F->addParamAttr(ArgNo, Attribute::NoAlias);
- for (Use &U : F->uses()) {
- CallBase &CB = cast<CallBase>(*U.getUser());
- CB.removeParamAttr(ArgNo, Attribute::StructRet);
- CB.addParamAttr(ArgNo, Attribute::NoAlias);
- }
- }
-
- // If this is a byval argument, and if the aggregate type is small, just
- // pass the elements, which is always safe, if the passed value is densely
- // packed or if we can prove the padding bytes are never accessed.
- bool isSafeToPromote = PtrArg->hasByValAttr() &&
- (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
- !canPaddingBeAccessed(PtrArg));
- if (isSafeToPromote) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
- LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
- << PtrArg->getName()
- << "' because it would require adding more"
- << " than " << MaxElements
- << " arguments to the function.\n");
- continue;
- }
-
- // If all the elements are single-value types, we can promote it.
- bool AllSimple = true;
- for (const auto *EltTy : STy->elements()) {
- if (!EltTy->isSingleValueType()) {
- AllSimple = false;
- break;
- }
- }
-
- // Safe to transform, don't even bother trying to "promote" it.
- // Passing the elements as a scalar will allow sroa to hack on
- // the new alloca we introduce.
- if (AllSimple) {
- ByValArgsToTransform.insert(PtrArg);
- continue;
- }
- }
- }
-
- // If the argument is a recursive type and we're in a recursive
- // function, we could end up infinitely peeling the function argument.
- if (isSelfRecursive) {
- if (StructType *STy = dyn_cast<StructType>(AgTy)) {
- bool RecursiveType = false;
- for (const auto *EltTy : STy->elements()) {
- if (EltTy == PtrArg->getType()) {
- RecursiveType = true;
- break;
- }
- }
- if (RecursiveType)
- continue;
- }
- }
-
- // Otherwise, see if we can promote the pointer to its value.
- Type *ByValTy =
- PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
- if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
- ArgsToPromote.insert(PtrArg);
- }
-
- // No promotable pointer arguments.
- if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
- return nullptr;
-
- if (!ArgumentPromotionPass::areFunctionArgsABICompatible(
- *F, TTI, ArgsToPromote, ByValArgsToTransform))
- return nullptr;
-
- return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
-}
-
-PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- bool Changed = false, LocalChange;
-
- // Iterate until we stop promoting from this SCC.
- do {
- LocalChange = false;
-
- for (LazyCallGraph::Node &N : C) {
- Function &OldF = N.getFunction();
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
- // FIXME: This lambda must only be used with this function. We should
- // skip the lambda and just get the AA results directly.
- auto AARGetter = [&](Function &F) -> AAResults & {
- assert(&F == &OldF && "Called with an unexpected function!");
- return FAM.getResult<AAManager>(F);
- };
-
- const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
- Function *NewF =
- promoteArguments(&OldF, AARGetter, MaxElements, None, TTI);
- if (!NewF)
- continue;
- LocalChange = true;
-
- // Directly substitute the functions in the call graph. Note that this
- // requires the old function to be completely dead and completely
- // replaced by the new function. It does no call graph updates, it merely
- // swaps out the particular function mapped to a particular node in the
- // graph.
- C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
- OldF.eraseFromParent();
- }
-
- Changed |= LocalChange;
- } while (LocalChange);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
-struct ArgPromotion : public CallGraphSCCPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- explicit ArgPromotion(unsigned MaxElements = 3)
- : CallGraphSCCPass(ID), MaxElements(MaxElements) {
- initializeArgPromotionPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
-private:
- using llvm::Pass::doInitialization;
-
- bool doInitialization(CallGraph &CG) override;
-
- /// The maximum number of elements to expand, or 0 for unlimited.
- unsigned MaxElements;
-};
-
-} // end anonymous namespace
-
-char ArgPromotion::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
- "Promote 'by reference' arguments to scalars", false, false)
-
-Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
- return new ArgPromotion(MaxElements);
-}
-
-bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
-
- // Get the callgraph information that we need to update to reflect our
- // changes.
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-
- LegacyAARGetter AARGetter(*this);
-
- bool Changed = false, LocalChange;
-
- // Iterate until we stop promoting from this SCC.
- do {
- LocalChange = false;
- // Attempt to promote arguments from all functions in this SCC.
- for (CallGraphNode *OldNode : SCC) {
- Function *OldF = OldNode->getFunction();
- if (!OldF)
- continue;
-
- auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) {
- Function *Caller = OldCS.getParent()->getParent();
- CallGraphNode *NewCalleeNode =
- CG.getOrInsertFunction(NewCS.getCalledFunction());
- CallGraphNode *CallerNode = CG[Caller];
- CallerNode->replaceCallEdge(cast<CallBase>(OldCS),
- cast<CallBase>(NewCS), NewCalleeNode);
- };
-
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*OldF);
- if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
- {ReplaceCallSite}, TTI)) {
- LocalChange = true;
-
- // Update the call graph for the newly promoted function.
- CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
- NewNode->stealCalledFunctionsFrom(OldNode);
- if (OldNode->getNumReferences() == 0)
- delete CG.removeFunctionFromModule(OldNode);
- else
- OldF->setLinkage(Function::ExternalLinkage);
-
- // And update the SCC we're iterating as well.
- SCC.ReplaceNode(OldNode, NewNode);
- }
- }
- // Remember that we changed something.
- Changed |= LocalChange;
- } while (LocalChange);
-
- return Changed;
-}
-
-bool ArgPromotion::doInitialization(CallGraph &CG) {
- return CallGraphSCCPass::doInitialization(CG);
-}
+ } else if (StoreInst *Store = dyn_cast<StoreInst>(V)) {
+ Stores.push_back(Store);
+ } else if (!isa<LoadInst>(V)) {
+ return true;
+ }
+ }
+
+ // Check to make sure the pointers aren't captured
+ for (StoreInst *Store : Stores)
+ if (PtrValues.count(Store->getValueOperand()))
+ return true;
+
+ return false;
+}
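A sketch of the pattern the walk above is guarding against (hypothetical C++ code, illustrative only): once a pointer derived from the argument is stored somewhere as a value, the copy's bytes, padding included, may later be inspected through the escaped pointer, so the analysis has to report that padding can be accessed.

struct S { char c; int i; }; // 3 padding bytes between c and i

static const S *Escaped = nullptr;

// The tracked pointer (&Copy) is the *value* operand of a store: padding of
// the copy could later be read through Escaped, so the answer must be "true".
// (The dangling pointer is never dereferenced in this sketch.)
static void escapes(S Copy) { Escaped = &Copy; }

// Only loads through the pointer: the padding bytes can never be observed.
static int onlyLoads(const S *P) { return P->i; }

int main() {
  S Val{'x', 7};
  escapes(Val);
  return onlyLoads(&Val) == 7 ? 0 : 1;
}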
+
+bool ArgumentPromotionPass::areFunctionArgsABICompatible(
+ const Function &F, const TargetTransformInfo &TTI,
+ SmallPtrSetImpl<Argument *> &ArgsToPromote,
+ SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
+ for (const Use &U : F.uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB)
+ return false;
+ const Function *Caller = CB->getCaller();
+ const Function *Callee = CB->getCalledFunction();
+ if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
+ !TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
+ return false;
+ }
+ return true;
+}
+
+/// PromoteArguments - This method checks the specified function to see if there
+/// are any promotable arguments and if it is safe to promote the function (for
+/// example, all callers are direct). If safe to promote some arguments, it
+/// calls the DoPromotion method.
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+ unsigned MaxElements,
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
+ ReplaceCallSite,
+ const TargetTransformInfo &TTI) {
+ // Don't perform argument promotion for naked functions; otherwise we can end
+ // up removing parameters that are seemingly 'not used' as they are referred
+ // to in the assembly.
+ if (F->hasFnAttribute(Attribute::Naked))
+ return nullptr;
+
+ // Make sure that it is local to this module.
+ if (!F->hasLocalLinkage())
+ return nullptr;
+
+ // Don't promote arguments for variadic functions. Adding, removing, or
+ // changing non-pack parameters can change the classification of pack
+ // parameters. Frontends encode that classification at the call site in the
+ // IR, while in the callee the classification is determined dynamically based
+ // on the number of registers consumed so far.
+ if (F->isVarArg())
+ return nullptr;
+
+ // Don't transform functions that receive inallocas, as the transformation may
+ // not be safe depending on calling convention.
+ if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca))
+ return nullptr;
+
+ // First check: see if there are any pointer arguments! If not, quick exit.
+ SmallVector<Argument *, 16> PointerArgs;
+ for (Argument &I : F->args())
+ if (I.getType()->isPointerTy())
+ PointerArgs.push_back(&I);
+ if (PointerArgs.empty())
+ return nullptr;
+
+ // Second check: make sure that all callers are direct callers. We can't
+ // transform functions that have indirect callers. Also see if the function
+ // is self-recursive and check that target features are compatible.
+ bool isSelfRecursive = false;
+ for (Use &U : F->uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ // Must be a direct call.
+ if (CB == nullptr || !CB->isCallee(&U))
+ return nullptr;
+
+ // Can't change signature of musttail callee
+ if (CB->isMustTailCall())
+ return nullptr;
+
+ if (CB->getParent()->getParent() == F)
+ isSelfRecursive = true;
+ }
+
+ // Can't change signature of musttail caller
+ // FIXME: Support promoting whole chain of musttail functions
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return nullptr;
+
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ AAResults &AAR = AARGetter(*F);
+
+ // Check to see which arguments are promotable. If an argument is promotable,
+ // add it to ArgsToPromote.
+ SmallPtrSet<Argument *, 8> ArgsToPromote;
+ SmallPtrSet<Argument *, 8> ByValArgsToTransform;
+ for (Argument *PtrArg : PointerArgs) {
+ Type *AgTy = cast<PointerType>(PtrArg->getType())->getElementType();
+
+ // Replace sret attribute with noalias. This reduces register pressure by
+ // avoiding a register copy.
+ if (PtrArg->hasStructRetAttr()) {
+ unsigned ArgNo = PtrArg->getArgNo();
+ F->removeParamAttr(ArgNo, Attribute::StructRet);
+ F->addParamAttr(ArgNo, Attribute::NoAlias);
+ for (Use &U : F->uses()) {
+ CallBase &CB = cast<CallBase>(*U.getUser());
+ CB.removeParamAttr(ArgNo, Attribute::StructRet);
+ CB.addParamAttr(ArgNo, Attribute::NoAlias);
+ }
+ }
+
+ // If this is a byval argument, and if the aggregate type is small, just
+ // pass the elements, which is always safe, if the passed value is densely
+ // packed or if we can prove the padding bytes are never accessed.
+ bool isSafeToPromote = PtrArg->hasByValAttr() &&
+ (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
+ !canPaddingBeAccessed(PtrArg));
+ if (isSafeToPromote) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
+ LLVM_DEBUG(dbgs() << "argpromotion disable promoting argument '"
+ << PtrArg->getName()
+ << "' because it would require adding more"
+ << " than " << MaxElements
+ << " arguments to the function.\n");
+ continue;
+ }
+
+ // If all the elements are single-value types, we can promote it.
+ bool AllSimple = true;
+ for (const auto *EltTy : STy->elements()) {
+ if (!EltTy->isSingleValueType()) {
+ AllSimple = false;
+ break;
+ }
+ }
+
+ // Safe to transform, don't even bother trying to "promote" it.
+ // Passing the elements as a scalar will allow sroa to hack on
+ // the new alloca we introduce.
+ if (AllSimple) {
+ ByValArgsToTransform.insert(PtrArg);
+ continue;
+ }
+ }
+ }
+
+ // If the argument is a recursive type and we're in a recursive
+ // function, we could end up infinitely peeling the function argument.
+ if (isSelfRecursive) {
+ if (StructType *STy = dyn_cast<StructType>(AgTy)) {
+ bool RecursiveType = false;
+ for (const auto *EltTy : STy->elements()) {
+ if (EltTy == PtrArg->getType()) {
+ RecursiveType = true;
+ break;
+ }
+ }
+ if (RecursiveType)
+ continue;
+ }
+ }
+
+ // Otherwise, see if we can promote the pointer to its value.
+ Type *ByValTy =
+ PtrArg->hasByValAttr() ? PtrArg->getParamByValType() : nullptr;
+ if (isSafeToPromoteArgument(PtrArg, ByValTy, AAR, MaxElements))
+ ArgsToPromote.insert(PtrArg);
+ }
+
+ // No promotable pointer arguments.
+ if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
+ return nullptr;
+
+ if (!ArgumentPromotionPass::areFunctionArgsABICompatible(
+ *F, TTI, ArgsToPromote, ByValArgsToTransform))
+ return nullptr;
+
+ return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
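A minimal callee that passes every check above (internal linkage, direct callers only, no varargs, inalloca, or musttail, and only constant-index loads that execute unconditionally), sketched in C++; with two promotable loads it stays under the default MaxElements of 3, so the pass would typically rewrite it to take two ints instead of the pointer. Hypothetical example, not from this diff.

static int sumFirstTwo(const int *p) { return p[0] + p[1]; }

int main() {
  int Vals[2] = {1, 2};
  return sumFirstTwo(Vals) == 3 ? 0 : 1;
}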
+
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+
+ for (LazyCallGraph::Node &N : C) {
+ Function &OldF = N.getFunction();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // FIXME: This lambda must only be used with this function. We should
+ // skip the lambda and just get the AA results directly.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ assert(&F == &OldF && "Called with an unexpected function!");
+ return FAM.getResult<AAManager>(F);
+ };
+
+ const TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(OldF);
+ Function *NewF =
+ promoteArguments(&OldF, AARGetter, MaxElements, None, TTI);
+ if (!NewF)
+ continue;
+ LocalChange = true;
+
+ // Directly substitute the functions in the call graph. Note that this
+ // requires the old function to be completely dead and completely
+ // replaced by the new function. It does no call graph updates, it merely
+ // swaps out the particular function mapped to a particular node in the
+ // graph.
+ C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+ OldF.eraseFromParent();
+ }
+
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
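A sketch of scheduling this new-PM pass explicitly (assumes the LLVM 12 headers and the signatures shown here; for quick experiments, opt -passes=argpromotion runs the same pass):

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"

// Builds a module pipeline that runs argument promotion over each SCC in
// post order via the CGSCC-to-module adaptor.
static llvm::ModulePassManager buildArgPromoPipeline() {
  llvm::CGSCCPassManager CGPM;
  CGPM.addPass(llvm::ArgumentPromotionPass(/*MaxElements=*/3));
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
  return MPM;
}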
+
+namespace {
+
+/// ArgPromotion - The 'by reference' to 'by value' argument promotion pass.
+struct ArgPromotion : public CallGraphSCCPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ explicit ArgPromotion(unsigned MaxElements = 3)
+ : CallGraphSCCPass(ID), MaxElements(MaxElements) {
+ initializeArgPromotionPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+private:
+ using llvm::Pass::doInitialization;
+
+ bool doInitialization(CallGraph &CG) override;
+
+ /// The maximum number of elements to expand, or 0 for unlimited.
+ unsigned MaxElements;
+};
+
+} // end anonymous namespace
+
+char ArgPromotion::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ArgPromotion, "argpromotion",
+ "Promote 'by reference' arguments to scalars", false, false)
+
+Pass *llvm::createArgumentPromotionPass(unsigned MaxElements) {
+ return new ArgPromotion(MaxElements);
+}
+
+bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+
+ // Get the callgraph information that we need to update to reflect our
+ // changes.
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ LegacyAARGetter AARGetter(*this);
+
+ bool Changed = false, LocalChange;
+
+ // Iterate until we stop promoting from this SCC.
+ do {
+ LocalChange = false;
+ // Attempt to promote arguments from all functions in this SCC.
+ for (CallGraphNode *OldNode : SCC) {
+ Function *OldF = OldNode->getFunction();
+ if (!OldF)
+ continue;
+
+ auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) {
+ Function *Caller = OldCS.getParent()->getParent();
+ CallGraphNode *NewCalleeNode =
+ CG.getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = CG[Caller];
+ CallerNode->replaceCallEdge(cast<CallBase>(OldCS),
+ cast<CallBase>(NewCS), NewCalleeNode);
+ };
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*OldF);
+ if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+ {ReplaceCallSite}, TTI)) {
+ LocalChange = true;
+
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+ NewNode->stealCalledFunctionsFrom(OldNode);
+ if (OldNode->getNumReferences() == 0)
+ delete CG.removeFunctionFromModule(OldNode);
+ else
+ OldF->setLinkage(Function::ExternalLinkage);
+
+ // And update the SCC we're iterating as well.
+ SCC.ReplaceNode(OldNode, NewNode);
+ }
+ }
+ // Remember that we changed something.
+ Changed |= LocalChange;
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool ArgPromotion::doInitialization(CallGraph &CG) {
+ return CallGraphSCCPass::doInitialization(CG);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
index fa23176c17..03ad451350 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Attributor.cpp
@@ -1,82 +1,82 @@
-//===- Attributor.cpp - Module-wide attribute deduction -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements an interprocedural pass that deduces and/or propagates
-// attributes. This is done in an abstract interpretation style fixpoint
-// iteration. See the Attributor.h file comment and the class descriptions in
-// that file for more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Attributor.h"
-
+//===- Attributor.cpp - Module-wide attribute deduction -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an interprocedural pass that deduces and/or propagates
+// attributes. This is done in an abstract interpretation style fixpoint
+// iteration. See the Attributor.h file comment and the class descriptions in
+// that file for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Attributor.h"
+
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <cassert>
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
#include <string>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "attributor"
-
+
+using namespace llvm;
+
+#define DEBUG_TYPE "attributor"
+
DEBUG_COUNTER(ManifestDBGCounter, "attributor-manifest",
"Determine what attributes are manifested in the IR");
-STATISTIC(NumFnDeleted, "Number of functions deleted");
-STATISTIC(NumFnWithExactDefinition,
- "Number of functions with exact definitions");
-STATISTIC(NumFnWithoutExactDefinition,
- "Number of functions without exact definitions");
+STATISTIC(NumFnDeleted, "Number of functions deleted");
+STATISTIC(NumFnWithExactDefinition,
+ "Number of functions with exact definitions");
+STATISTIC(NumFnWithoutExactDefinition,
+ "Number of functions without exact definitions");
STATISTIC(NumFnShallowWrappersCreated, "Number of shallow wrappers created");
-STATISTIC(NumAttributesTimedOut,
- "Number of abstract attributes timed out before fixpoint");
-STATISTIC(NumAttributesValidFixpoint,
- "Number of abstract attributes in a valid fixpoint state");
-STATISTIC(NumAttributesManifested,
- "Number of abstract attributes manifested in IR");
-STATISTIC(NumAttributesFixedDueToRequiredDependences,
- "Number of abstract attributes fixed due to required dependences");
-
-// TODO: Determine a good default value.
-//
-// In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads
-// (when run with the first 5 abstract attributes). The results also indicate
-// that we never reach 32 iterations but always find a fixpoint sooner.
-//
-// This will become more evolved once we perform two interleaved fixpoint
-// iterations: bottom-up and top-down.
-static cl::opt<unsigned>
- MaxFixpointIterations("attributor-max-iterations", cl::Hidden,
- cl::desc("Maximal number of fixpoint iterations."),
- cl::init(32));
+STATISTIC(NumAttributesTimedOut,
+ "Number of abstract attributes timed out before fixpoint");
+STATISTIC(NumAttributesValidFixpoint,
+ "Number of abstract attributes in a valid fixpoint state");
+STATISTIC(NumAttributesManifested,
+ "Number of abstract attributes manifested in IR");
+STATISTIC(NumAttributesFixedDueToRequiredDependences,
+ "Number of abstract attributes fixed due to required dependences");
+
+// TODO: Determine a good default value.
+//
+// In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads
+// (when run with the first 5 abstract attributes). The results also indicate
+// that we never reach 32 iterations but always find a fixpoint sooner.
+//
+// This will become more evolved once we perform two interleaved fixpoint
+// iterations: bottom-up and top-down.
+static cl::opt<unsigned>
+ MaxFixpointIterations("attributor-max-iterations", cl::Hidden,
+ cl::desc("Maximal number of fixpoint iterations."),
+ cl::init(32));
static cl::opt<unsigned, true> MaxInitializationChainLengthX(
"attributor-max-initialization-chain-length", cl::Hidden,
@@ -85,24 +85,24 @@ static cl::opt<unsigned, true> MaxInitializationChainLengthX(
cl::location(MaxInitializationChainLength), cl::init(1024));
unsigned llvm::MaxInitializationChainLength;
-static cl::opt<bool> VerifyMaxFixpointIterations(
- "attributor-max-iterations-verify", cl::Hidden,
- cl::desc("Verify that max-iterations is a tight bound for a fixpoint"),
- cl::init(false));
-
-static cl::opt<bool> AnnotateDeclarationCallSites(
- "attributor-annotate-decl-cs", cl::Hidden,
- cl::desc("Annotate call sites of function declarations."), cl::init(false));
-
-static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion",
- cl::init(true), cl::Hidden);
-
-static cl::opt<bool>
- AllowShallowWrappers("attributor-allow-shallow-wrappers", cl::Hidden,
- cl::desc("Allow the Attributor to create shallow "
- "wrappers for non-exact definitions."),
- cl::init(false));
-
+static cl::opt<bool> VerifyMaxFixpointIterations(
+ "attributor-max-iterations-verify", cl::Hidden,
+ cl::desc("Verify that max-iterations is a tight bound for a fixpoint"),
+ cl::init(false));
+
+static cl::opt<bool> AnnotateDeclarationCallSites(
+ "attributor-annotate-decl-cs", cl::Hidden,
+ cl::desc("Annotate call sites of function declarations."), cl::init(false));
+
+static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion",
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+ AllowShallowWrappers("attributor-allow-shallow-wrappers", cl::Hidden,
+ cl::desc("Allow the Attributor to create shallow "
+ "wrappers for non-exact definitions."),
+ cl::init(false));
+
static cl::opt<bool>
AllowDeepWrapper("attributor-allow-deep-wrappers", cl::Hidden,
cl::desc("Allow the Attributor to use IP information "
@@ -111,12 +111,12 @@ static cl::opt<bool>
// These options can only used for debug builds.
#ifndef NDEBUG
-static cl::list<std::string>
- SeedAllowList("attributor-seed-allow-list", cl::Hidden,
+static cl::list<std::string>
+ SeedAllowList("attributor-seed-allow-list", cl::Hidden,
cl::desc("Comma seperated list of attribute names that are "
- "allowed to be seeded."),
- cl::ZeroOrMore, cl::CommaSeparated);
-
+ "allowed to be seeded."),
+ cl::ZeroOrMore, cl::CommaSeparated);
+
static cl::list<std::string> FunctionSeedAllowList(
"attributor-function-seed-allow-list", cl::Hidden,
cl::desc("Comma seperated list of function names that are "
@@ -141,194 +141,194 @@ static cl::opt<bool> PrintDependencies("attributor-print-dep", cl::Hidden,
cl::desc("Print attribute dependencies"),
cl::init(false));
-/// Logic operators for the change status enum class.
-///
-///{
+/// Logic operators for the change status enum class.
+///
+///{
ChangeStatus llvm::operator|(ChangeStatus L, ChangeStatus R) {
return L == ChangeStatus::CHANGED ? L : R;
-}
+}
ChangeStatus llvm::operator&(ChangeStatus L, ChangeStatus R) {
return L == ChangeStatus::UNCHANGED ? L : R;
-}
-///}
-
-/// Return true if \p New is equal or worse than \p Old.
-static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
- if (!Old.isIntAttribute())
- return true;
-
- return Old.getValueAsInt() >= New.getValueAsInt();
-}
-
-/// Return true if the information provided by \p Attr was added to the
-/// attribute list \p Attrs. This is only the case if it was not already present
-/// in \p Attrs at the position described by \p PK and \p AttrIdx.
-static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
- AttributeList &Attrs, int AttrIdx) {
-
- if (Attr.isEnumAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isStringAttribute()) {
- StringRef Kind = Attr.getKindAsString();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isIntAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
-
- llvm_unreachable("Expected enum or string attribute!");
-}
-
-Argument *IRPosition::getAssociatedArgument() const {
- if (getPositionKind() == IRP_ARGUMENT)
- return cast<Argument>(&getAnchorValue());
-
- // Not an Argument and no argument number means this is not a call site
- // argument, thus we cannot find a callback argument to return.
+}
+///}
+
+/// Return true if \p New is equal or worse than \p Old.
+static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
+ if (!Old.isIntAttribute())
+ return true;
+
+ return Old.getValueAsInt() >= New.getValueAsInt();
+}
+
+/// Return true if the information provided by \p Attr was added to the
+/// attribute list \p Attrs. This is only the case if it was not already present
+/// in \p Attrs at the position described by \p PK and \p AttrIdx.
+static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
+ AttributeList &Attrs, int AttrIdx) {
+
+ if (Attr.isEnumAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isStringAttribute()) {
+ StringRef Kind = Attr.getKindAsString();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isIntAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+
+ llvm_unreachable("Expected enum or string attribute!");
+}
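For integer attributes the helper above only lets a new value in when it is strictly stronger; e.g. trying to add dereferenceable(8) when the list already carries dereferenceable(16) is dropped, while dereferenceable(32) replaces it. A standalone sketch of that comparison rule (plain integers, not the LLVM Attribute API):

#include <cassert>
#include <cstdint>

// Mirrors the integer-attribute rule above: the incoming value only replaces
// the existing one when it is strictly larger, e.g. for dereferenceable(<n>).
static bool newValueIsEqualOrWorse(uint64_t NewVal, uint64_t OldVal) {
  return OldVal >= NewVal;
}

int main() {
  assert(newValueIsEqualOrWorse(8, 16));   // dereferenceable(16) is kept
  assert(!newValueIsEqualOrWorse(32, 16)); // upgraded to dereferenceable(32)
  return 0;
}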
+
+Argument *IRPosition::getAssociatedArgument() const {
+ if (getPositionKind() == IRP_ARGUMENT)
+ return cast<Argument>(&getAnchorValue());
+
+ // Not an Argument and no argument number means this is not a call site
+ // argument, thus we cannot find a callback argument to return.
int ArgNo = getCallSiteArgNo();
- if (ArgNo < 0)
- return nullptr;
-
- // Use abstract call sites to make the connection between the call site
- // values and the ones in callbacks. If a callback was found that makes use
- // of the underlying call site operand, we want the corresponding callback
- // callee argument and not the direct callee argument.
- Optional<Argument *> CBCandidateArg;
- SmallVector<const Use *, 4> CallbackUses;
- const auto &CB = cast<CallBase>(getAnchorValue());
- AbstractCallSite::getCallbackUses(CB, CallbackUses);
- for (const Use *U : CallbackUses) {
- AbstractCallSite ACS(U);
- assert(ACS && ACS.isCallbackCall());
- if (!ACS.getCalledFunction())
- continue;
-
- for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) {
-
- // Test if the underlying call site operand is argument number u of the
- // callback callee.
- if (ACS.getCallArgOperandNo(u) != ArgNo)
- continue;
-
- assert(ACS.getCalledFunction()->arg_size() > u &&
- "ACS mapped into var-args arguments!");
- if (CBCandidateArg.hasValue()) {
- CBCandidateArg = nullptr;
- break;
- }
- CBCandidateArg = ACS.getCalledFunction()->getArg(u);
- }
- }
-
- // If we found a unique callback candidate argument, return it.
- if (CBCandidateArg.hasValue() && CBCandidateArg.getValue())
- return CBCandidateArg.getValue();
-
- // If no callbacks were found, or none used the underlying call site operand
- // exclusively, use the direct callee argument if available.
- const Function *Callee = CB.getCalledFunction();
- if (Callee && Callee->arg_size() > unsigned(ArgNo))
- return Callee->getArg(ArgNo);
-
- return nullptr;
-}
-
-ChangeStatus AbstractAttribute::update(Attributor &A) {
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- if (getState().isAtFixpoint())
- return HasChanged;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Update: " << *this << "\n");
-
- HasChanged = updateImpl(A);
-
- LLVM_DEBUG(dbgs() << "[Attributor] Update " << HasChanged << " " << *this
- << "\n");
-
- return HasChanged;
-}
-
-ChangeStatus
-IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
- const ArrayRef<Attribute> &DeducedAttrs) {
- Function *ScopeFn = IRP.getAnchorScope();
- IRPosition::Kind PK = IRP.getPositionKind();
-
-  // In the following is some generic code that will manifest attributes in
- // DeducedAttrs if they improve the current IR. Due to the different
- // annotation positions we use the underlying AttributeList interface.
-
- AttributeList Attrs;
- switch (PK) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- return ChangeStatus::UNCHANGED;
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_FUNCTION:
- case IRPosition::IRP_RETURNED:
- Attrs = ScopeFn->getAttributes();
- break;
- case IRPosition::IRP_CALL_SITE:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- Attrs = cast<CallBase>(IRP.getAnchorValue()).getAttributes();
- break;
- }
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- LLVMContext &Ctx = IRP.getAnchorValue().getContext();
- for (const Attribute &Attr : DeducedAttrs) {
- if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
- continue;
-
- HasChanged = ChangeStatus::CHANGED;
- }
-
- if (HasChanged == ChangeStatus::UNCHANGED)
- return HasChanged;
-
- switch (PK) {
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_FUNCTION:
- case IRPosition::IRP_RETURNED:
- ScopeFn->setAttributes(Attrs);
- break;
- case IRPosition::IRP_CALL_SITE:
- case IRPosition::IRP_CALL_SITE_RETURNED:
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- cast<CallBase>(IRP.getAnchorValue()).setAttributes(Attrs);
- break;
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- break;
- }
-
- return HasChanged;
-}
-
-const IRPosition IRPosition::EmptyKey(DenseMapInfo<void *>::getEmptyKey());
-const IRPosition
- IRPosition::TombstoneKey(DenseMapInfo<void *>::getTombstoneKey());
-
-SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
- IRPositions.emplace_back(IRP);
-
+ if (ArgNo < 0)
+ return nullptr;
+
+ // Use abstract call sites to make the connection between the call site
+ // values and the ones in callbacks. If a callback was found that makes use
+ // of the underlying call site operand, we want the corresponding callback
+ // callee argument and not the direct callee argument.
+ Optional<Argument *> CBCandidateArg;
+ SmallVector<const Use *, 4> CallbackUses;
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
+ AbstractCallSite ACS(U);
+ assert(ACS && ACS.isCallbackCall());
+ if (!ACS.getCalledFunction())
+ continue;
+
+ for (unsigned u = 0, e = ACS.getNumArgOperands(); u < e; u++) {
+
+ // Test if the underlying call site operand is argument number u of the
+ // callback callee.
+ if (ACS.getCallArgOperandNo(u) != ArgNo)
+ continue;
+
+ assert(ACS.getCalledFunction()->arg_size() > u &&
+ "ACS mapped into var-args arguments!");
+ if (CBCandidateArg.hasValue()) {
+ CBCandidateArg = nullptr;
+ break;
+ }
+ CBCandidateArg = ACS.getCalledFunction()->getArg(u);
+ }
+ }
+
+ // If we found a unique callback candidate argument, return it.
+ if (CBCandidateArg.hasValue() && CBCandidateArg.getValue())
+ return CBCandidateArg.getValue();
+
+ // If no callbacks were found, or none used the underlying call site operand
+ // exclusively, use the direct callee argument if available.
+ const Function *Callee = CB.getCalledFunction();
+ if (Callee && Callee->arg_size() > unsigned(ArgNo))
+ return Callee->getArg(ArgNo);
+
+ return nullptr;
+}
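+
+// Editorial illustration (not part of the upstream sources): resolving the
+// formal argument behind a call site argument position. `CB` is assumed to be
+// a CallBase in scope; for broker calls with callback metadata the returned
+// argument belongs to the unique callback callee, if there is one.
+// \code
+//   IRPosition CSArgPos = IRPosition::callsite_argument(CB, /* ArgNo */ 0);
+//   if (Argument *Arg = CSArgPos.getAssociatedArgument())
+//     (void)Arg; // Formal parameter of the (callback) callee.
+// \endcode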
+
+ChangeStatus AbstractAttribute::update(Attributor &A) {
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ if (getState().isAtFixpoint())
+ return HasChanged;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Update: " << *this << "\n");
+
+ HasChanged = updateImpl(A);
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Update " << HasChanged << " " << *this
+ << "\n");
+
+ return HasChanged;
+}
+
+ChangeStatus
+IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
+ const ArrayRef<Attribute> &DeducedAttrs) {
+ Function *ScopeFn = IRP.getAnchorScope();
+ IRPosition::Kind PK = IRP.getPositionKind();
+
+  // In the following is some generic code that will manifest attributes in
+ // DeducedAttrs if they improve the current IR. Due to the different
+ // annotation positions we use the underlying AttributeList interface.
+
+ AttributeList Attrs;
+ switch (PK) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ return ChangeStatus::UNCHANGED;
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_FUNCTION:
+ case IRPosition::IRP_RETURNED:
+ Attrs = ScopeFn->getAttributes();
+ break;
+ case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ Attrs = cast<CallBase>(IRP.getAnchorValue()).getAttributes();
+ break;
+ }
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ LLVMContext &Ctx = IRP.getAnchorValue().getContext();
+ for (const Attribute &Attr : DeducedAttrs) {
+ if (!addIfNotExistent(Ctx, Attr, Attrs, IRP.getAttrIdx()))
+ continue;
+
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ if (HasChanged == ChangeStatus::UNCHANGED)
+ return HasChanged;
+
+ switch (PK) {
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_FUNCTION:
+ case IRPosition::IRP_RETURNED:
+ ScopeFn->setAttributes(Attrs);
+ break;
+ case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ cast<CallBase>(IRP.getAnchorValue()).setAttributes(Attrs);
+ break;
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ break;
+ }
+
+ return HasChanged;
+}
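+
+// Editorial illustration (not part of the upstream sources): manifesting a
+// deduced attribute through the generic helper above. `A`, `F`, and `Ctx` are
+// assumed to be an Attributor, a Function, and its LLVMContext.
+// \code
+//   ChangeStatus CS = IRAttributeManifest::manifestAttrs(
+//       A, IRPosition::function(F),
+//       {Attribute::get(Ctx, Attribute::NoUnwind)});
+//   // CS is CHANGED only if 'nounwind' was not already present.
+// \endcode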
+
+const IRPosition IRPosition::EmptyKey(DenseMapInfo<void *>::getEmptyKey());
+const IRPosition
+ IRPosition::TombstoneKey(DenseMapInfo<void *>::getTombstoneKey());
+
+SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
+ IRPositions.emplace_back(IRP);
+
   // Helper to determine if operand bundles on a call site are benign or
// potentially problematic. We handle only llvm.assume for now.
auto CanIgnoreOperandBundles = [](const CallBase &CB) {
@@ -336,843 +336,843 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
             cast<IntrinsicInst>(CB).getIntrinsicID() == Intrinsic::assume);
};
- const auto *CB = dyn_cast<CallBase>(&IRP.getAnchorValue());
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_FUNCTION:
- return;
- case IRPosition::IRP_ARGUMENT:
- case IRPosition::IRP_RETURNED:
- IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope()));
- return;
- case IRPosition::IRP_CALL_SITE:
- assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ const auto *CB = dyn_cast<CallBase>(&IRP.getAnchorValue());
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ case IRPosition::IRP_FUNCTION:
+ return;
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_RETURNED:
+ IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope()));
+ return;
+ case IRPosition::IRP_CALL_SITE:
+ assert(CB && "Expected call site!");
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB))
- if (const Function *Callee = CB->getCalledFunction())
- IRPositions.emplace_back(IRPosition::function(*Callee));
- return;
- case IRPosition::IRP_CALL_SITE_RETURNED:
- assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ if (const Function *Callee = CB->getCalledFunction())
+ IRPositions.emplace_back(IRPosition::function(*Callee));
+ return;
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ assert(CB && "Expected call site!");
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) {
- if (const Function *Callee = CB->getCalledFunction()) {
- IRPositions.emplace_back(IRPosition::returned(*Callee));
- IRPositions.emplace_back(IRPosition::function(*Callee));
- for (const Argument &Arg : Callee->args())
- if (Arg.hasReturnedAttr()) {
- IRPositions.emplace_back(
- IRPosition::callsite_argument(*CB, Arg.getArgNo()));
- IRPositions.emplace_back(
- IRPosition::value(*CB->getArgOperand(Arg.getArgNo())));
- IRPositions.emplace_back(IRPosition::argument(Arg));
- }
- }
- }
- IRPositions.emplace_back(IRPosition::callsite_function(*CB));
- return;
- case IRPosition::IRP_CALL_SITE_ARGUMENT: {
+ if (const Function *Callee = CB->getCalledFunction()) {
+ IRPositions.emplace_back(IRPosition::returned(*Callee));
+ IRPositions.emplace_back(IRPosition::function(*Callee));
+ for (const Argument &Arg : Callee->args())
+ if (Arg.hasReturnedAttr()) {
+ IRPositions.emplace_back(
+ IRPosition::callsite_argument(*CB, Arg.getArgNo()));
+ IRPositions.emplace_back(
+ IRPosition::value(*CB->getArgOperand(Arg.getArgNo())));
+ IRPositions.emplace_back(IRPosition::argument(Arg));
+ }
+ }
+ }
+ IRPositions.emplace_back(IRPosition::callsite_function(*CB));
+ return;
+ case IRPosition::IRP_CALL_SITE_ARGUMENT: {
assert(CB && "Expected call site!");
- // TODO: We need to look at the operand bundles similar to the redirection
- // in CallBase.
+ // TODO: We need to look at the operand bundles similar to the redirection
+ // in CallBase.
if (!CB->hasOperandBundles() || CanIgnoreOperandBundles(*CB)) {
- const Function *Callee = CB->getCalledFunction();
+ const Function *Callee = CB->getCalledFunction();
if (Callee) {
if (Argument *Arg = IRP.getAssociatedArgument())
IRPositions.emplace_back(IRPosition::argument(*Arg));
- IRPositions.emplace_back(IRPosition::function(*Callee));
+ IRPositions.emplace_back(IRPosition::function(*Callee));
}
- }
- IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue()));
- return;
- }
- }
-}
-
-bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
- bool IgnoreSubsumingPositions, Attributor *A) const {
- SmallVector<Attribute, 4> Attrs;
- for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
- for (Attribute::AttrKind AK : AKs)
- if (EquivIRP.getAttrsFromIRAttr(AK, Attrs))
- return true;
- // The first position returned by the SubsumingPositionIterator is
- // always the position itself. If we ignore subsuming positions we
- // are done after the first iteration.
- if (IgnoreSubsumingPositions)
- break;
- }
- if (A)
- for (Attribute::AttrKind AK : AKs)
- if (getAttrsFromAssumes(AK, Attrs, *A))
- return true;
- return false;
-}
-
-void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs,
- SmallVectorImpl<Attribute> &Attrs,
- bool IgnoreSubsumingPositions, Attributor *A) const {
- for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
- for (Attribute::AttrKind AK : AKs)
- EquivIRP.getAttrsFromIRAttr(AK, Attrs);
- // The first position returned by the SubsumingPositionIterator is
- // always the position itself. If we ignore subsuming positions we
- // are done after the first iteration.
- if (IgnoreSubsumingPositions)
- break;
- }
- if (A)
- for (Attribute::AttrKind AK : AKs)
- getAttrsFromAssumes(AK, Attrs, *A);
-}
-
-bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
- SmallVectorImpl<Attribute> &Attrs) const {
- if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
- return false;
-
- AttributeList AttrList;
- if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue()))
- AttrList = CB->getAttributes();
- else
- AttrList = getAssociatedFunction()->getAttributes();
-
- bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
- if (HasAttr)
- Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
- return HasAttr;
-}
-
-bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK,
- SmallVectorImpl<Attribute> &Attrs,
- Attributor &A) const {
- assert(getPositionKind() != IRP_INVALID && "Did expect a valid position!");
- Value &AssociatedValue = getAssociatedValue();
-
- const Assume2KnowledgeMap &A2K =
- A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK});
-
- // Check if we found any potential assume use, if not we don't need to create
- // explorer iterators.
- if (A2K.empty())
- return false;
-
- LLVMContext &Ctx = AssociatedValue.getContext();
- unsigned AttrsSize = Attrs.size();
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
- auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI());
- for (auto &It : A2K)
- if (Explorer.findInContextOf(It.first, EIt, EEnd))
- Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max));
- return AttrsSize != Attrs.size();
-}
-
-void IRPosition::verify() {
-#ifdef EXPENSIVE_CHECKS
- switch (getPositionKind()) {
- case IRP_INVALID:
- assert(!Enc.getOpaqueValue() &&
- "Expected a nullptr for an invalid position!");
- return;
- case IRP_FLOAT:
- assert((!isa<CallBase>(&getAssociatedValue()) &&
- !isa<Argument>(&getAssociatedValue())) &&
- "Expected specialized kind for call base and argument values!");
- return;
- case IRP_RETURNED:
- assert(isa<Function>(getAsValuePtr()) &&
- "Expected function for a 'returned' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE_RETURNED:
- assert((isa<CallBase>(getAsValuePtr())) &&
- "Expected call base for 'call site returned' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE:
- assert((isa<CallBase>(getAsValuePtr())) &&
- "Expected call base for 'call site function' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_FUNCTION:
- assert(isa<Function>(getAsValuePtr()) &&
- "Expected function for a 'function' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_ARGUMENT:
-    assert(isa<Argument>(getAsValuePtr()) &&
-           "Expected argument for an 'argument' position!");
- assert(getAsValuePtr() == &getAssociatedValue() &&
- "Associated value mismatch!");
- return;
- case IRP_CALL_SITE_ARGUMENT: {
- Use *U = getAsUsePtr();
- assert(U && "Expected use for a 'call site argument' position!");
- assert(isa<CallBase>(U->getUser()) &&
- "Expected call base user for a 'call site argument' position!");
- assert(cast<CallBase>(U->getUser())->isArgOperand(U) &&
- "Expected call base argument operand for a 'call site argument' "
- "position");
- assert(cast<CallBase>(U->getUser())->getArgOperandNo(U) ==
+ }
+ IRPositions.emplace_back(IRPosition::value(IRP.getAssociatedValue()));
+ return;
+ }
+ }
+}
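+
+// Editorial illustration (not part of the upstream sources): for a call site
+// argument position the sketch below visits the position itself, then the
+// matching callee argument (if it can be resolved), the callee function, and
+// finally the floating value position of the passed operand. `CSArgPos` is a
+// hypothetical call site argument position created elsewhere.
+// \code
+//   for (const IRPosition &Subsuming : SubsumingPositionIterator(CSArgPos))
+//     (void)Subsuming; // Consult attributes at each subsuming position.
+// \endcode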
+
+bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
+ bool IgnoreSubsumingPositions, Attributor *A) const {
+ SmallVector<Attribute, 4> Attrs;
+ for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
+ for (Attribute::AttrKind AK : AKs)
+ if (EquivIRP.getAttrsFromIRAttr(AK, Attrs))
+ return true;
+ // The first position returned by the SubsumingPositionIterator is
+ // always the position itself. If we ignore subsuming positions we
+ // are done after the first iteration.
+ if (IgnoreSubsumingPositions)
+ break;
+ }
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ if (getAttrsFromAssumes(AK, Attrs, *A))
+ return true;
+ return false;
+}
+
+void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs,
+ SmallVectorImpl<Attribute> &Attrs,
+ bool IgnoreSubsumingPositions, Attributor *A) const {
+ for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
+ for (Attribute::AttrKind AK : AKs)
+ EquivIRP.getAttrsFromIRAttr(AK, Attrs);
+ // The first position returned by the SubsumingPositionIterator is
+ // always the position itself. If we ignore subsuming positions we
+ // are done after the first iteration.
+ if (IgnoreSubsumingPositions)
+ break;
+ }
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ getAttrsFromAssumes(AK, Attrs, *A);
+}
+
+bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs) const {
+ if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
+ return false;
+
+ AttributeList AttrList;
+ if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue()))
+ AttrList = CB->getAttributes();
+ else
+ AttrList = getAssociatedFunction()->getAttributes();
+
+ bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
+ if (HasAttr)
+ Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
+ return HasAttr;
+}
+
+bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs,
+ Attributor &A) const {
+ assert(getPositionKind() != IRP_INVALID && "Did expect a valid position!");
+ Value &AssociatedValue = getAssociatedValue();
+
+ const Assume2KnowledgeMap &A2K =
+ A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK});
+
+ // Check if we found any potential assume use, if not we don't need to create
+ // explorer iterators.
+ if (A2K.empty())
+ return false;
+
+ LLVMContext &Ctx = AssociatedValue.getContext();
+ unsigned AttrsSize = Attrs.size();
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+ auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI());
+ for (auto &It : A2K)
+ if (Explorer.findInContextOf(It.first, EIt, EEnd))
+ Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max));
+ return AttrsSize != Attrs.size();
+}
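+
+// Editorial illustration (not part of the upstream sources): callers reach
+// the assume-based path above by passing an Attributor to hasAttr/getAttrs.
+// The knowledge map is fed from llvm.assume operand bundles (e.g. a
+// "nonnull"(%p) bundle), and such knowledge only counts if the assume lies in
+// the must-be-executed context of the position. `V` and `A` are assumed to be
+// a Value and an Attributor in scope.
+// \code
+//   bool KnownNonNull = IRPosition::value(V).hasAttr(
+//       {Attribute::NonNull}, /* IgnoreSubsumingPositions */ false, &A);
+// \endcode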
+
+void IRPosition::verify() {
+#ifdef EXPENSIVE_CHECKS
+ switch (getPositionKind()) {
+ case IRP_INVALID:
+ assert(!Enc.getOpaqueValue() &&
+ "Expected a nullptr for an invalid position!");
+ return;
+ case IRP_FLOAT:
+ assert((!isa<CallBase>(&getAssociatedValue()) &&
+ !isa<Argument>(&getAssociatedValue())) &&
+ "Expected specialized kind for call base and argument values!");
+ return;
+ case IRP_RETURNED:
+ assert(isa<Function>(getAsValuePtr()) &&
+ "Expected function for a 'returned' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE_RETURNED:
+ assert((isa<CallBase>(getAsValuePtr())) &&
+ "Expected call base for 'call site returned' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE:
+ assert((isa<CallBase>(getAsValuePtr())) &&
+ "Expected call base for 'call site function' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_FUNCTION:
+ assert(isa<Function>(getAsValuePtr()) &&
+ "Expected function for a 'function' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_ARGUMENT:
+    assert(isa<Argument>(getAsValuePtr()) &&
+           "Expected argument for an 'argument' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE_ARGUMENT: {
+ Use *U = getAsUsePtr();
+ assert(U && "Expected use for a 'call site argument' position!");
+ assert(isa<CallBase>(U->getUser()) &&
+ "Expected call base user for a 'call site argument' position!");
+ assert(cast<CallBase>(U->getUser())->isArgOperand(U) &&
+ "Expected call base argument operand for a 'call site argument' "
+ "position");
+ assert(cast<CallBase>(U->getUser())->getArgOperandNo(U) ==
unsigned(getCallSiteArgNo()) &&
- "Argument number mismatch!");
- assert(U->get() == &getAssociatedValue() && "Associated value mismatch!");
- return;
- }
- }
-#endif
-}
-
-Optional<Constant *>
-Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
- bool &UsedAssumedInformation) {
- const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
- AA, IRPosition::value(V), /* TrackDependence */ false);
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(*this);
- bool IsKnown = ValueSimplifyAA.isKnown();
- UsedAssumedInformation |= !IsKnown;
- if (!SimplifiedV.hasValue()) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return llvm::None;
- }
- if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) {
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return llvm::None;
- }
- Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue());
- if (CI && CI->getType() != V.getType()) {
-    // TODO: Check for a safe conversion.
- return nullptr;
- }
- if (CI)
- recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
- return CI;
-}
-
-Attributor::~Attributor() {
- // The abstract attributes are allocated via the BumpPtrAllocator Allocator,
- // thus we cannot delete them. We can, and want to, destruct them though.
+ "Argument number mismatch!");
+ assert(U->get() == &getAssociatedValue() && "Associated value mismatch!");
+ return;
+ }
+ }
+#endif
+}
+
+Optional<Constant *>
+Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
+ AA, IRPosition::value(V), /* TrackDependence */ false);
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(*this);
+ bool IsKnown = ValueSimplifyAA.isKnown();
+ UsedAssumedInformation |= !IsKnown;
+ if (!SimplifiedV.hasValue()) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
+ }
+ if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
+ }
+ Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue());
+ if (CI && CI->getType() != V.getType()) {
+    // TODO: Check for a safe conversion.
+ return nullptr;
+ }
+ if (CI)
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return CI;
+}
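+
+// Editorial illustration (not part of the upstream sources): typical use of
+// getAssumedConstant from an abstract attribute's update, mirroring the use
+// in checkForAllUses below. `A` is an Attributor, `V` a Value, and `*this`
+// the querying AbstractAttribute; all are assumed to be in scope.
+// \code
+//   bool UsedAssumedInformation = false;
+//   Optional<Constant *> C =
+//       A.getAssumedConstant(V, *this, UsedAssumedInformation);
+//   if (C.hasValue() && C.getValue())
+//     (void)C.getValue(); // V may be treated as this constant from here on.
+// \endcode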
+
+Attributor::~Attributor() {
+ // The abstract attributes are allocated via the BumpPtrAllocator Allocator,
+ // thus we cannot delete them. We can, and want to, destruct them though.
for (auto &DepAA : DG.SyntheticRoot.Deps) {
AbstractAttribute *AA = cast<AbstractAttribute>(DepAA.getPointer());
- AA->~AbstractAttribute();
- }
-}
-
-bool Attributor::isAssumedDead(const AbstractAttribute &AA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- const IRPosition &IRP = AA.getIRPosition();
- if (!Functions.count(IRP.getAnchorScope()))
- return false;
- return isAssumedDead(IRP, &AA, FnLivenessAA, CheckBBLivenessOnly, DepClass);
-}
-
-bool Attributor::isAssumedDead(const Use &U,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- Instruction *UserI = dyn_cast<Instruction>(U.getUser());
- if (!UserI)
- return isAssumedDead(IRPosition::value(*U.get()), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
-
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- // For call site argument uses we can check if the argument is
- // unused/dead.
- if (CB->isArgOperand(&U)) {
- const IRPosition &CSArgPos =
- IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
- return isAssumedDead(CSArgPos, QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
- }
- } else if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
- const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
- return isAssumedDead(RetPos, QueryingAA, FnLivenessAA, CheckBBLivenessOnly,
- DepClass);
- } else if (PHINode *PHI = dyn_cast<PHINode>(UserI)) {
- BasicBlock *IncomingBB = PHI->getIncomingBlock(U);
- return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
- }
-
- return isAssumedDead(IRPosition::value(*UserI), QueryingAA, FnLivenessAA,
- CheckBBLivenessOnly, DepClass);
-}
-
-bool Attributor::isAssumedDead(const Instruction &I,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- if (!FnLivenessAA)
- FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*I.getFunction()),
- QueryingAA,
- /* TrackDependence */ false);
-
- // If we have a context instruction and a liveness AA we use it.
- if (FnLivenessAA &&
- FnLivenessAA->getIRPosition().getAnchorScope() == I.getFunction() &&
- FnLivenessAA->isAssumedDead(&I)) {
- if (QueryingAA)
- recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
- return true;
- }
-
- if (CheckBBLivenessOnly)
- return false;
-
- const AAIsDead &IsDeadAA = getOrCreateAAFor<AAIsDead>(
- IRPosition::value(I), QueryingAA, /* TrackDependence */ false);
- // Don't check liveness for AAIsDead.
- if (QueryingAA == &IsDeadAA)
- return false;
-
- if (IsDeadAA.isAssumedDead()) {
- if (QueryingAA)
- recordDependence(IsDeadAA, *QueryingAA, DepClass);
- return true;
- }
-
- return false;
-}
-
-bool Attributor::isAssumedDead(const IRPosition &IRP,
- const AbstractAttribute *QueryingAA,
- const AAIsDead *FnLivenessAA,
- bool CheckBBLivenessOnly, DepClassTy DepClass) {
- Instruction *CtxI = IRP.getCtxI();
- if (CtxI &&
- isAssumedDead(*CtxI, QueryingAA, FnLivenessAA,
- /* CheckBBLivenessOnly */ true,
- CheckBBLivenessOnly ? DepClass : DepClassTy::OPTIONAL))
- return true;
-
- if (CheckBBLivenessOnly)
- return false;
-
- // If we haven't succeeded we query the specific liveness info for the IRP.
- const AAIsDead *IsDeadAA;
- if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE)
- IsDeadAA = &getOrCreateAAFor<AAIsDead>(
- IRPosition::callsite_returned(cast<CallBase>(IRP.getAssociatedValue())),
- QueryingAA, /* TrackDependence */ false);
- else
- IsDeadAA = &getOrCreateAAFor<AAIsDead>(IRP, QueryingAA,
- /* TrackDependence */ false);
- // Don't check liveness for AAIsDead.
- if (QueryingAA == IsDeadAA)
- return false;
-
- if (IsDeadAA->isAssumedDead()) {
- if (QueryingAA)
- recordDependence(*IsDeadAA, *QueryingAA, DepClass);
- return true;
- }
-
- return false;
-}
-
-bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
- const AbstractAttribute &QueryingAA,
- const Value &V, DepClassTy LivenessDepClass) {
-
- // Check the trivial case first as it catches void values.
- if (V.use_empty())
- return true;
-
- // If the value is replaced by another one, for now a constant, we do not have
- // uses. Note that this requires users of `checkForAllUses` to not recurse but
- // instead use the `follow` callback argument to look at transitive users,
- // however, that should be clear from the presence of the argument.
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- getAssumedConstant(V, QueryingAA, UsedAssumedInformation);
- if (C.hasValue() && C.getValue()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Value is simplified, uses skipped: " << V
- << " -> " << *C.getValue() << "\n");
- return true;
- }
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- SmallVector<const Use *, 16> Worklist;
- SmallPtrSet<const Use *, 16> Visited;
-
- for (const Use &U : V.uses())
- Worklist.push_back(&U);
-
- LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
- << " initial uses to check\n");
-
- const Function *ScopeFn = IRP.getAnchorScope();
- const auto *LivenessAA =
- ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn),
- /* TrackDependence */ false)
- : nullptr;
-
- while (!Worklist.empty()) {
- const Use *U = Worklist.pop_back_val();
- if (!Visited.insert(U).second)
- continue;
- LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
- << *U->getUser() << "\n");
- if (isAssumedDead(*U, &QueryingAA, LivenessAA,
- /* CheckBBLivenessOnly */ false, LivenessDepClass)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
- continue;
- }
- if (U->getUser()->isDroppable()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n");
- continue;
- }
-
- bool Follow = false;
- if (!Pred(*U, Follow))
- return false;
- if (!Follow)
- continue;
- for (const Use &UU : U->getUser()->uses())
- Worklist.push_back(&UU);
- }
-
- return true;
-}
-
-bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
- const AbstractAttribute &QueryingAA,
- bool RequireAllCallSites,
- bool &AllCallSitesKnown) {
- // We can try to determine information from
-  // the call sites. However, this is only possible if all call sites are known,
- // hence the function has internal linkage.
- const IRPosition &IRP = QueryingAA.getIRPosition();
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction) {
- LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP
- << "\n");
- AllCallSitesKnown = false;
- return false;
- }
-
- return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites,
- &QueryingAA, AllCallSitesKnown);
-}
-
-bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
- const Function &Fn,
- bool RequireAllCallSites,
- const AbstractAttribute *QueryingAA,
- bool &AllCallSitesKnown) {
- if (RequireAllCallSites && !Fn.hasLocalLinkage()) {
- LLVM_DEBUG(
- dbgs()
- << "[Attributor] Function " << Fn.getName()
- << " has no internal linkage, hence not all call sites are known\n");
- AllCallSitesKnown = false;
- return false;
- }
-
- // If we do not require all call sites we might not see all.
- AllCallSitesKnown = RequireAllCallSites;
-
- SmallVector<const Use *, 8> Uses(make_pointer_range(Fn.uses()));
- for (unsigned u = 0; u < Uses.size(); ++u) {
- const Use &U = *Uses[u];
- LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << *U << " in "
- << *U.getUser() << "\n");
- if (isAssumedDead(U, QueryingAA, nullptr, /* CheckBBLivenessOnly */ true)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
- continue;
- }
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
- if (CE->isCast() && CE->getType()->isPointerTy() &&
- CE->getType()->getPointerElementType()->isFunctionTy()) {
- for (const Use &CEU : CE->uses())
- Uses.push_back(&CEU);
- continue;
- }
- }
-
- AbstractCallSite ACS(&U);
- if (!ACS) {
- LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName()
- << " has non call site use " << *U.get() << " in "
- << *U.getUser() << "\n");
- // BlockAddress users are allowed.
- if (isa<BlockAddress>(U.getUser()))
- continue;
- return false;
- }
-
- const Use *EffectiveUse =
- ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
- if (!ACS.isCallee(EffectiveUse)) {
- if (!RequireAllCallSites)
- continue;
- LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser()
- << " is an invalid use of " << Fn.getName() << "\n");
- return false;
- }
-
- // Make sure the arguments that can be matched between the call site and the
-    // callee agree on their type. It is unlikely they do not and it doesn't
- // make sense for all attributes to know/care about this.
- assert(&Fn == ACS.getCalledFunction() && "Expected known callee");
- unsigned MinArgsParams =
- std::min(size_t(ACS.getNumArgOperands()), Fn.arg_size());
- for (unsigned u = 0; u < MinArgsParams; ++u) {
- Value *CSArgOp = ACS.getCallArgOperand(u);
- if (CSArgOp && Fn.getArg(u)->getType() != CSArgOp->getType()) {
- LLVM_DEBUG(
- dbgs() << "[Attributor] Call site / callee argument type mismatch ["
- << u << "@" << Fn.getName() << ": "
- << *Fn.getArg(u)->getType() << " vs. "
- << *ACS.getCallArgOperand(u)->getType() << "\n");
- return false;
- }
- }
-
- if (Pred(ACS))
- continue;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for "
- << *ACS.getInstruction() << "\n");
- return false;
- }
-
- return true;
-}
-
-bool Attributor::checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred,
- const AbstractAttribute &QueryingAA) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- // Since we need to provide return instructions we have to have an exact
- // definition.
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // If this is a call site query we use the call site specific return values
- // and liveness information.
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
- if (!AARetVal.getState().isValidState())
- return false;
-
- return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred);
-}
-
-bool Attributor::checkForAllReturnedValues(
- function_ref<bool(Value &)> Pred, const AbstractAttribute &QueryingAA) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
- if (!AARetVal.getState().isValidState())
- return false;
-
- return AARetVal.checkForAllReturnedValuesAndReturnInsts(
- [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) {
- return Pred(RV);
- });
-}
-
-static bool checkForAllInstructionsImpl(
- Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap,
- function_ref<bool(Instruction &)> Pred, const AbstractAttribute *QueryingAA,
- const AAIsDead *LivenessAA, const ArrayRef<unsigned> &Opcodes,
- bool CheckBBLivenessOnly = false) {
- for (unsigned Opcode : Opcodes) {
- // Check if we have instructions with this opcode at all first.
- auto *Insts = OpcodeInstMap.lookup(Opcode);
- if (!Insts)
- continue;
-
- for (Instruction *I : *Insts) {
- // Skip dead instructions.
- if (A && A->isAssumedDead(IRPosition::value(*I), QueryingAA, LivenessAA,
- CheckBBLivenessOnly))
- continue;
-
- if (!Pred(*I))
- return false;
- }
- }
- return true;
-}
-
-bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
- const AbstractAttribute &QueryingAA,
- const ArrayRef<unsigned> &Opcodes,
- bool CheckBBLivenessOnly) {
-
- const IRPosition &IRP = QueryingAA.getIRPosition();
- // Since we need to provide instructions we have to have an exact definition.
- const Function *AssociatedFunction = IRP.getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ AA->~AbstractAttribute();
+ }
+}
+
+bool Attributor::isAssumedDead(const AbstractAttribute &AA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ const IRPosition &IRP = AA.getIRPosition();
+ if (!Functions.count(IRP.getAnchorScope()))
+ return false;
+ return isAssumedDead(IRP, &AA, FnLivenessAA, CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Use &U,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+ if (!UserI)
+ return isAssumedDead(IRPosition::value(*U.get()), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ // For call site argument uses we can check if the argument is
+ // unused/dead.
+ if (CB->isArgOperand(&U)) {
+ const IRPosition &CSArgPos =
+ IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
+ return isAssumedDead(CSArgPos, QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
+ const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
+ return isAssumedDead(RetPos, QueryingAA, FnLivenessAA, CheckBBLivenessOnly,
+ DepClass);
+ } else if (PHINode *PHI = dyn_cast<PHINode>(UserI)) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(U);
+ return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+
+ return isAssumedDead(IRPosition::value(*UserI), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Instruction &I,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ if (!FnLivenessAA)
+ FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*I.getFunction()),
+ QueryingAA,
+ /* TrackDependence */ false);
+
+ // If we have a context instruction and a liveness AA we use it.
+ if (FnLivenessAA &&
+ FnLivenessAA->getIRPosition().getAnchorScope() == I.getFunction() &&
+ FnLivenessAA->isAssumedDead(&I)) {
+ if (QueryingAA)
+ recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ if (CheckBBLivenessOnly)
+ return false;
+
+ const AAIsDead &IsDeadAA = getOrCreateAAFor<AAIsDead>(
+ IRPosition::value(I), QueryingAA, /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == &IsDeadAA)
+ return false;
+
+ if (IsDeadAA.isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(IsDeadAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ return false;
+}
+
+bool Attributor::isAssumedDead(const IRPosition &IRP,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ Instruction *CtxI = IRP.getCtxI();
+ if (CtxI &&
+ isAssumedDead(*CtxI, QueryingAA, FnLivenessAA,
+ /* CheckBBLivenessOnly */ true,
+ CheckBBLivenessOnly ? DepClass : DepClassTy::OPTIONAL))
+ return true;
+
+ if (CheckBBLivenessOnly)
+ return false;
+
+ // If we haven't succeeded we query the specific liveness info for the IRP.
+ const AAIsDead *IsDeadAA;
+ if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE)
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(
+ IRPosition::callsite_returned(cast<CallBase>(IRP.getAssociatedValue())),
+ QueryingAA, /* TrackDependence */ false);
+ else
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(IRP, QueryingAA,
+ /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == IsDeadAA)
+ return false;
+
+ if (IsDeadAA->isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(*IsDeadAA, *QueryingAA, DepClass);
+ return true;
+ }
+
+ return false;
+}
+
+bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const Value &V, DepClassTy LivenessDepClass) {
+
+ // Check the trivial case first as it catches void values.
+ if (V.use_empty())
+ return true;
+
+ // If the value is replaced by another one, for now a constant, we do not have
+ // uses. Note that this requires users of `checkForAllUses` to not recurse but
+ // instead use the `follow` callback argument to look at transitive users,
+ // however, that should be clear from the presence of the argument.
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ getAssumedConstant(V, QueryingAA, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Value is simplified, uses skipped: " << V
+ << " -> " << *C.getValue() << "\n");
+ return true;
+ }
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ SmallVector<const Use *, 16> Worklist;
+ SmallPtrSet<const Use *, 16> Visited;
+
+ for (const Use &U : V.uses())
+ Worklist.push_back(&U);
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
+ << " initial uses to check\n");
+
+ const Function *ScopeFn = IRP.getAnchorScope();
+ const auto *LivenessAA =
+ ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn),
+ /* TrackDependence */ false)
+ : nullptr;
+
+ while (!Worklist.empty()) {
+ const Use *U = Worklist.pop_back_val();
+ if (!Visited.insert(U).second)
+ continue;
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
+ << *U->getUser() << "\n");
+ if (isAssumedDead(*U, &QueryingAA, LivenessAA,
+ /* CheckBBLivenessOnly */ false, LivenessDepClass)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (U->getUser()->isDroppable()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n");
+ continue;
+ }
+
+ bool Follow = false;
+ if (!Pred(*U, Follow))
+ return false;
+ if (!Follow)
+ continue;
+ for (const Use &UU : U->getUser()->uses())
+ Worklist.push_back(&UU);
+ }
+
+ return true;
+}
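+
+// Editorial illustration (not part of the upstream sources): a use visitor
+// for checkForAllUses. Setting `Follow` asks the walker to also enqueue the
+// uses of the current user. `A` is an Attributor, `*this` the querying
+// abstract attribute, and `V` the value whose uses are inspected.
+// \code
+//   auto UsePred = [](const Use &U, bool &Follow) {
+//     if (auto *SI = dyn_cast<StoreInst>(U.getUser()))
+//       return U.get() != SI->getValueOperand(); // bail if V is stored away
+//     Follow = isa<BitCastInst>(U.getUser());    // look through bitcasts
+//     return true;
+//   };
+//   bool AllUsesOK =
+//       A.checkForAllUses(UsePred, *this, V, DepClassTy::OPTIONAL);
+// \endcode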
+
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const AbstractAttribute &QueryingAA,
+ bool RequireAllCallSites,
+ bool &AllCallSitesKnown) {
+ // We can try to determine information from
+  // the call sites. However, this is only possible if all call sites are known,
+ // hence the function has internal linkage.
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction) {
+ LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP
+ << "\n");
+ AllCallSitesKnown = false;
+ return false;
+ }
+
+ return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites,
+ &QueryingAA, AllCallSitesKnown);
+}
+
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const Function &Fn,
+ bool RequireAllCallSites,
+ const AbstractAttribute *QueryingAA,
+ bool &AllCallSitesKnown) {
+ if (RequireAllCallSites && !Fn.hasLocalLinkage()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "[Attributor] Function " << Fn.getName()
+ << " has no internal linkage, hence not all call sites are known\n");
+ AllCallSitesKnown = false;
+ return false;
+ }
+
+ // If we do not require all call sites we might not see all.
+ AllCallSitesKnown = RequireAllCallSites;
+
+ SmallVector<const Use *, 8> Uses(make_pointer_range(Fn.uses()));
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use &U = *Uses[u];
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << *U << " in "
+ << *U.getUser() << "\n");
+ if (isAssumedDead(U, QueryingAA, nullptr, /* CheckBBLivenessOnly */ true)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
+ if (CE->isCast() && CE->getType()->isPointerTy() &&
+ CE->getType()->getPointerElementType()->isFunctionTy()) {
+ for (const Use &CEU : CE->uses())
+ Uses.push_back(&CEU);
+ continue;
+ }
+ }
+
+ AbstractCallSite ACS(&U);
+ if (!ACS) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName()
+ << " has non call site use " << *U.get() << " in "
+ << *U.getUser() << "\n");
+ // BlockAddress users are allowed.
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+ return false;
+ }
+
+ const Use *EffectiveUse =
+ ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
+ if (!ACS.isCallee(EffectiveUse)) {
+ if (!RequireAllCallSites)
+ continue;
+ LLVM_DEBUG(dbgs() << "[Attributor] User " << EffectiveUse->getUser()
+ << " is an invalid use of " << Fn.getName() << "\n");
+ return false;
+ }
+
+ // Make sure the arguments that can be matched between the call site and the
+    // callee agree on their type. It is unlikely they do not and it doesn't
+ // make sense for all attributes to know/care about this.
+ assert(&Fn == ACS.getCalledFunction() && "Expected known callee");
+ unsigned MinArgsParams =
+ std::min(size_t(ACS.getNumArgOperands()), Fn.arg_size());
+ for (unsigned u = 0; u < MinArgsParams; ++u) {
+ Value *CSArgOp = ACS.getCallArgOperand(u);
+ if (CSArgOp && Fn.getArg(u)->getType() != CSArgOp->getType()) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Call site / callee argument type mismatch ["
+ << u << "@" << Fn.getName() << ": "
+ << *Fn.getArg(u)->getType() << " vs. "
+ << *ACS.getCallArgOperand(u)->getType() << "\n");
+ return false;
+ }
+ }
+
+ if (Pred(ACS))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Call site callback failed for "
+ << *ACS.getInstruction() << "\n");
+ return false;
+ }
+
+ return true;
+}
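+
+// Editorial illustration (not part of the upstream sources): a call site
+// visitor for checkForAllCallSites. `A` is an Attributor and `*this` the
+// querying abstract attribute; the predicate must hold for every known call
+// site, and AllCallSitesKnown reports whether all of them were seen.
+// \code
+//   auto CallSitePred = [](AbstractCallSite ACS) {
+//     return !ACS.getInstruction()->getFunction()->hasFnAttribute(
+//         Attribute::OptimizeNone);
+//   };
+//   bool AllCallSitesKnown = false;
+//   bool AllOK = A.checkForAllCallSites(CallSitePred, *this,
+//                                       /* RequireAllCallSites */ true,
+//                                       AllCallSitesKnown);
+// \endcode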
+
+bool Attributor::checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred,
+ const AbstractAttribute &QueryingAA) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ // Since we need to provide return instructions we have to have an exact
+ // definition.
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // If this is a call site query we use the call site specific return values
+ // and liveness information.
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
+ if (!AARetVal.getState().isValidState())
+ return false;
+
+ return AARetVal.checkForAllReturnedValuesAndReturnInsts(Pred);
+}
+
+bool Attributor::checkForAllReturnedValues(
+ function_ref<bool(Value &)> Pred, const AbstractAttribute &QueryingAA) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &AARetVal = getAAFor<AAReturnedValues>(QueryingAA, QueryIRP);
+ if (!AARetVal.getState().isValidState())
+ return false;
+
+ return AARetVal.checkForAllReturnedValuesAndReturnInsts(
+ [&](Value &RV, const SmallSetVector<ReturnInst *, 4> &) {
+ return Pred(RV);
+ });
+}
+
+static bool checkForAllInstructionsImpl(
+ Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap,
+ function_ref<bool(Instruction &)> Pred, const AbstractAttribute *QueryingAA,
+ const AAIsDead *LivenessAA, const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly = false) {
+ for (unsigned Opcode : Opcodes) {
+ // Check if we have instructions with this opcode at all first.
+ auto *Insts = OpcodeInstMap.lookup(Opcode);
+ if (!Insts)
+ continue;
+
+ for (Instruction *I : *Insts) {
+ // Skip dead instructions.
+ if (A && A->isAssumedDead(IRPosition::value(*I), QueryingAA, LivenessAA,
+ CheckBBLivenessOnly))
+ continue;
+
+ if (!Pred(*I))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly) {
+
+ const IRPosition &IRP = QueryingAA.getIRPosition();
+ // Since we need to provide instructions we have to have an exact definition.
+ const Function *AssociatedFunction = IRP.getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
const auto *LivenessAA =
CheckBBLivenessOnly ? nullptr
: &(getAAFor<AAIsDead>(QueryingAA, QueryIRP,
/* TrackDependence */ false));
-
- auto &OpcodeInstMap =
- InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
- if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
+
+ auto &OpcodeInstMap =
+ InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
+ if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
LivenessAA, Opcodes, CheckBBLivenessOnly))
- return false;
-
- return true;
-}
-
-bool Attributor::checkForAllReadWriteInstructions(
- function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA) {
-
- const Function *AssociatedFunction =
- QueryingAA.getIRPosition().getAssociatedFunction();
- if (!AssociatedFunction)
- return false;
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
- const auto &LivenessAA =
- getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
-
- for (Instruction *I :
- InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) {
- // Skip dead instructions.
- if (isAssumedDead(IRPosition::value(*I), &QueryingAA, &LivenessAA))
- continue;
-
- if (!Pred(*I))
- return false;
- }
-
- return true;
-}
-
-void Attributor::runTillFixpoint() {
+ return false;
+
+ return true;
+}
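+
+// Editorial illustration (not part of the upstream sources): scanning all
+// call-like instructions of the associated function. `A` is an Attributor
+// and `*this` the querying abstract attribute; dead instructions are skipped
+// by the implementation above.
+// \code
+//   auto CheckCall = [](Instruction &I) {
+//     return !cast<CallBase>(I).isConvergent();
+//   };
+//   bool AllOK = A.checkForAllInstructions(
+//       CheckCall, *this,
+//       {(unsigned)Instruction::Invoke, (unsigned)Instruction::Call},
+//       /* CheckBBLivenessOnly */ false);
+// \endcode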
+
+bool Attributor::checkForAllReadWriteInstructions(
+ function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA) {
+
+ const Function *AssociatedFunction =
+ QueryingAA.getIRPosition().getAssociatedFunction();
+ if (!AssociatedFunction)
+ return false;
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
+ const auto &LivenessAA =
+ getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
+
+ for (Instruction *I :
+ InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) {
+ // Skip dead instructions.
+ if (isAssumedDead(IRPosition::value(*I), &QueryingAA, &LivenessAA))
+ continue;
+
+ if (!Pred(*I))
+ return false;
+ }
+
+ return true;
+}
+
+void Attributor::runTillFixpoint() {
TimeTraceScope TimeScope("Attributor::runTillFixpoint");
- LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized "
+ LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized "
<< DG.SyntheticRoot.Deps.size()
- << " abstract attributes.\n");
-
- // Now that all abstract attributes are collected and initialized we start
- // the abstract analysis.
-
- unsigned IterationCounter = 1;
-
- SmallVector<AbstractAttribute *, 32> ChangedAAs;
- SetVector<AbstractAttribute *> Worklist, InvalidAAs;
+ << " abstract attributes.\n");
+
+ // Now that all abstract attributes are collected and initialized we start
+ // the abstract analysis.
+
+ unsigned IterationCounter = 1;
+
+ SmallVector<AbstractAttribute *, 32> ChangedAAs;
+ SetVector<AbstractAttribute *> Worklist, InvalidAAs;
Worklist.insert(DG.SyntheticRoot.begin(), DG.SyntheticRoot.end());
-
- do {
- // Remember the size to determine new attributes.
+
+ do {
+ // Remember the size to determine new attributes.
size_t NumAAs = DG.SyntheticRoot.Deps.size();
- LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter
- << ", Worklist size: " << Worklist.size() << "\n");
-
- // For invalid AAs we can fix dependent AAs that have a required dependence,
- // thereby folding long dependence chains in a single step without the need
- // to run updates.
- for (unsigned u = 0; u < InvalidAAs.size(); ++u) {
- AbstractAttribute *InvalidAA = InvalidAAs[u];
-
- // Check the dependences to fast track invalidation.
- LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has "
- << InvalidAA->Deps.size()
- << " required & optional dependences\n");
- while (!InvalidAA->Deps.empty()) {
- const auto &Dep = InvalidAA->Deps.back();
- InvalidAA->Deps.pop_back();
+ LLVM_DEBUG(dbgs() << "\n\n[Attributor] #Iteration: " << IterationCounter
+ << ", Worklist size: " << Worklist.size() << "\n");
+
+ // For invalid AAs we can fix dependent AAs that have a required dependence,
+ // thereby folding long dependence chains in a single step without the need
+ // to run updates.
+ for (unsigned u = 0; u < InvalidAAs.size(); ++u) {
+ AbstractAttribute *InvalidAA = InvalidAAs[u];
+
+ // Check the dependences to fast track invalidation.
+ LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has "
+ << InvalidAA->Deps.size()
+ << " required & optional dependences\n");
+ while (!InvalidAA->Deps.empty()) {
+ const auto &Dep = InvalidAA->Deps.back();
+ InvalidAA->Deps.pop_back();
AbstractAttribute *DepAA = cast<AbstractAttribute>(Dep.getPointer());
- if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) {
- Worklist.insert(DepAA);
- continue;
- }
- DepAA->getState().indicatePessimisticFixpoint();
- assert(DepAA->getState().isAtFixpoint() && "Expected fixpoint state!");
- if (!DepAA->getState().isValidState())
- InvalidAAs.insert(DepAA);
- else
- ChangedAAs.push_back(DepAA);
- }
- }
-
- // Add all abstract attributes that are potentially dependent on one that
- // changed to the work list.
- for (AbstractAttribute *ChangedAA : ChangedAAs)
- while (!ChangedAA->Deps.empty()) {
+ if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) {
+ Worklist.insert(DepAA);
+ continue;
+ }
+ DepAA->getState().indicatePessimisticFixpoint();
+ assert(DepAA->getState().isAtFixpoint() && "Expected fixpoint state!");
+ if (!DepAA->getState().isValidState())
+ InvalidAAs.insert(DepAA);
+ else
+ ChangedAAs.push_back(DepAA);
+ }
+ }
+
+ // Add all abstract attributes that are potentially dependent on one that
+ // changed to the work list.
+ for (AbstractAttribute *ChangedAA : ChangedAAs)
+ while (!ChangedAA->Deps.empty()) {
Worklist.insert(
cast<AbstractAttribute>(ChangedAA->Deps.back().getPointer()));
- ChangedAA->Deps.pop_back();
- }
-
- LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter
- << ", Worklist+Dependent size: " << Worklist.size()
- << "\n");
-
- // Reset the changed and invalid set.
- ChangedAAs.clear();
- InvalidAAs.clear();
-
-    // Update all abstract attributes in the work list and record the ones that
- // changed.
- for (AbstractAttribute *AA : Worklist) {
- const auto &AAState = AA->getState();
- if (!AAState.isAtFixpoint())
- if (updateAA(*AA) == ChangeStatus::CHANGED)
- ChangedAAs.push_back(AA);
-
- // Use the InvalidAAs vector to propagate invalid states fast transitively
- // without requiring updates.
- if (!AAState.isValidState())
- InvalidAAs.insert(AA);
- }
-
- // Add attributes to the changed set if they have been created in the last
- // iteration.
+ ChangedAA->Deps.pop_back();
+ }
+
+ LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter
+ << ", Worklist+Dependent size: " << Worklist.size()
+ << "\n");
+
+ // Reset the changed and invalid set.
+ ChangedAAs.clear();
+ InvalidAAs.clear();
+
+    // Update all abstract attributes in the work list and record the ones that
+ // changed.
+ for (AbstractAttribute *AA : Worklist) {
+ const auto &AAState = AA->getState();
+ if (!AAState.isAtFixpoint())
+ if (updateAA(*AA) == ChangeStatus::CHANGED)
+ ChangedAAs.push_back(AA);
+
+ // Use the InvalidAAs vector to propagate invalid states fast transitively
+ // without requiring updates.
+ if (!AAState.isValidState())
+ InvalidAAs.insert(AA);
+ }
+
+ // Add attributes to the changed set if they have been created in the last
+ // iteration.
ChangedAAs.append(DG.SyntheticRoot.begin() + NumAAs,
DG.SyntheticRoot.end());
-
- // Reset the work list and repopulate with the changed abstract attributes.
- // Note that dependent ones are added above.
- Worklist.clear();
- Worklist.insert(ChangedAAs.begin(), ChangedAAs.end());
-
- } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations ||
- VerifyMaxFixpointIterations));
-
- LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
- << IterationCounter << "/" << MaxFixpointIterations
- << " iterations\n");
-
-  // Reset abstract attributes not settled in a sound fixpoint by now. This
- // happens when we stopped the fixpoint iteration early. Note that only the
- // ones marked as "changed" *and* the ones transitively depending on them
- // need to be reverted to a pessimistic state. Others might not be in a
- // fixpoint state but we can use the optimistic results for them anyway.
- SmallPtrSet<AbstractAttribute *, 32> Visited;
- for (unsigned u = 0; u < ChangedAAs.size(); u++) {
- AbstractAttribute *ChangedAA = ChangedAAs[u];
- if (!Visited.insert(ChangedAA).second)
- continue;
-
- AbstractState &State = ChangedAA->getState();
- if (!State.isAtFixpoint()) {
- State.indicatePessimisticFixpoint();
-
- NumAttributesTimedOut++;
- }
-
- while (!ChangedAA->Deps.empty()) {
+
+ // Reset the work list and repopulate with the changed abstract attributes.
+ // Note that dependent ones are added above.
+ Worklist.clear();
+ Worklist.insert(ChangedAAs.begin(), ChangedAAs.end());
+
+ } while (!Worklist.empty() && (IterationCounter++ < MaxFixpointIterations ||
+ VerifyMaxFixpointIterations));
+
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Fixpoint iteration done after: "
+ << IterationCounter << "/" << MaxFixpointIterations
+ << " iterations\n");
+
+  // Reset abstract attributes not settled in a sound fixpoint by now. This
+ // happens when we stopped the fixpoint iteration early. Note that only the
+ // ones marked as "changed" *and* the ones transitively depending on them
+ // need to be reverted to a pessimistic state. Others might not be in a
+ // fixpoint state but we can use the optimistic results for them anyway.
+ SmallPtrSet<AbstractAttribute *, 32> Visited;
+ for (unsigned u = 0; u < ChangedAAs.size(); u++) {
+ AbstractAttribute *ChangedAA = ChangedAAs[u];
+ if (!Visited.insert(ChangedAA).second)
+ continue;
+
+ AbstractState &State = ChangedAA->getState();
+ if (!State.isAtFixpoint()) {
+ State.indicatePessimisticFixpoint();
+
+ NumAttributesTimedOut++;
+ }
+
+ while (!ChangedAA->Deps.empty()) {
ChangedAAs.push_back(
cast<AbstractAttribute>(ChangedAA->Deps.back().getPointer()));
- ChangedAA->Deps.pop_back();
- }
- }
-
- LLVM_DEBUG({
- if (!Visited.empty())
- dbgs() << "\n[Attributor] Finalized " << Visited.size()
- << " abstract attributes.\n";
- });
-
- if (VerifyMaxFixpointIterations &&
- IterationCounter != MaxFixpointIterations) {
- errs() << "\n[Attributor] Fixpoint iteration done after: "
- << IterationCounter << "/" << MaxFixpointIterations
- << " iterations\n";
- llvm_unreachable("The fixpoint was not reached with exactly the number of "
- "specified iterations!");
- }
-}
-
-ChangeStatus Attributor::manifestAttributes() {
+ ChangedAA->Deps.pop_back();
+ }
+ }
+
+ LLVM_DEBUG({
+ if (!Visited.empty())
+ dbgs() << "\n[Attributor] Finalized " << Visited.size()
+ << " abstract attributes.\n";
+ });
+
+ if (VerifyMaxFixpointIterations &&
+ IterationCounter != MaxFixpointIterations) {
+ errs() << "\n[Attributor] Fixpoint iteration done after: "
+ << IterationCounter << "/" << MaxFixpointIterations
+ << " iterations\n";
+ llvm_unreachable("The fixpoint was not reached with exactly the number of "
+ "specified iterations!");
+ }
+}
+
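The loop restored above is a textbook worklist-driven fixpoint iteration: update every queued attribute, remember the ones that changed, re-seed the worklist with them and their dependents, and stop once nothing changes or the iteration budget runs out. A minimal standalone C++ sketch of only that pattern follows; Node, update(), and the iteration cap are hypothetical stand-ins for AbstractAttribute, updateAA(), and MaxFixpointIterations, not the real Attributor API.

// fixpoint_sketch.cpp -- illustrative pattern only.
#include <algorithm>
#include <cstdio>
#include <set>
#include <vector>

struct Node {
  int Value = 0;                  // current (optimistic) state
  std::vector<Node *> Dependents; // nodes to revisit whenever this one changes

  // Returns true if the state changed, mirroring ChangeStatus::CHANGED.
  bool update() {
    int Old = Value;
    Value = std::min(Value + 1, 3); // toy transfer function, fixpoint at 3
    return Value != Old;
  }
};

// Update everything in the worklist, then re-seed it with the changed nodes
// and their dependents until nothing changes or the budget is exhausted.
void runTillFixpoint(std::vector<Node *> Worklist, unsigned MaxIterations) {
  unsigned Iteration = 0;
  while (!Worklist.empty() && Iteration < MaxIterations) {
    ++Iteration;
    std::set<Node *> Next;
    for (Node *N : Worklist)
      if (N->update()) {
        Next.insert(N); // a changed node may change again
        Next.insert(N->Dependents.begin(), N->Dependents.end());
      }
    Worklist.assign(Next.begin(), Next.end());
  }
  std::printf("done after %u iterations\n", Iteration);
}

int main() {
  Node A, B;
  A.Dependents.push_back(&B); // B must be revisited when A changes
  runTillFixpoint({&A, &B}, 32);
}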
+ChangeStatus Attributor::manifestAttributes() {
TimeTraceScope TimeScope("Attributor::manifestAttributes");
size_t NumFinalAAs = DG.SyntheticRoot.Deps.size();
-
- unsigned NumManifested = 0;
- unsigned NumAtFixpoint = 0;
- ChangeStatus ManifestChange = ChangeStatus::UNCHANGED;
+
+ unsigned NumManifested = 0;
+ unsigned NumAtFixpoint = 0;
+ ChangeStatus ManifestChange = ChangeStatus::UNCHANGED;
for (auto &DepAA : DG.SyntheticRoot.Deps) {
AbstractAttribute *AA = cast<AbstractAttribute>(DepAA.getPointer());
- AbstractState &State = AA->getState();
-
- // If there is not already a fixpoint reached, we can now take the
- // optimistic state. This is correct because we enforced a pessimistic one
- // on abstract attributes that were transitively dependent on a changed one
- // already above.
- if (!State.isAtFixpoint())
- State.indicateOptimisticFixpoint();
-
- // If the state is invalid, we do not try to manifest it.
- if (!State.isValidState())
- continue;
-
- // Skip dead code.
- if (isAssumedDead(*AA, nullptr, /* CheckBBLivenessOnly */ true))
- continue;
+ AbstractState &State = AA->getState();
+
+ // If there is not already a fixpoint reached, we can now take the
+ // optimistic state. This is correct because we enforced a pessimistic one
+ // on abstract attributes that were transitively dependent on a changed one
+ // already above.
+ if (!State.isAtFixpoint())
+ State.indicateOptimisticFixpoint();
+
+ // If the state is invalid, we do not try to manifest it.
+ if (!State.isValidState())
+ continue;
+
+ // Skip dead code.
+ if (isAssumedDead(*AA, nullptr, /* CheckBBLivenessOnly */ true))
+ continue;
    // Check the manifest debug counter that allows skipping the manifestation
    // of AAs.
if (!DebugCounter::shouldExecute(ManifestDBGCounter))
continue;
- // Manifest the state and record if we changed the IR.
- ChangeStatus LocalChange = AA->manifest(*this);
- if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled())
- AA->trackStatistics();
- LLVM_DEBUG(dbgs() << "[Attributor] Manifest " << LocalChange << " : " << *AA
- << "\n");
-
- ManifestChange = ManifestChange | LocalChange;
-
- NumAtFixpoint++;
- NumManifested += (LocalChange == ChangeStatus::CHANGED);
- }
-
- (void)NumManifested;
- (void)NumAtFixpoint;
- LLVM_DEBUG(dbgs() << "\n[Attributor] Manifested " << NumManifested
- << " arguments while " << NumAtFixpoint
- << " were in a valid fixpoint state\n");
-
- NumAttributesManifested += NumManifested;
- NumAttributesValidFixpoint += NumAtFixpoint;
-
- (void)NumFinalAAs;
+ // Manifest the state and record if we changed the IR.
+ ChangeStatus LocalChange = AA->manifest(*this);
+ if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled())
+ AA->trackStatistics();
+ LLVM_DEBUG(dbgs() << "[Attributor] Manifest " << LocalChange << " : " << *AA
+ << "\n");
+
+ ManifestChange = ManifestChange | LocalChange;
+
+ NumAtFixpoint++;
+ NumManifested += (LocalChange == ChangeStatus::CHANGED);
+ }
+
+ (void)NumManifested;
+ (void)NumAtFixpoint;
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Manifested " << NumManifested
+ << " arguments while " << NumAtFixpoint
+ << " were in a valid fixpoint state\n");
+
+ NumAttributesManifested += NumManifested;
+ NumAttributesValidFixpoint += NumAtFixpoint;
+
+ (void)NumFinalAAs;
if (NumFinalAAs != DG.SyntheticRoot.Deps.size()) {
for (unsigned u = NumFinalAAs; u < DG.SyntheticRoot.Deps.size(); ++u)
errs() << "Unexpected abstract attribute: "
<< cast<AbstractAttribute>(DG.SyntheticRoot.Deps[u].getPointer())
- << " :: "
+ << " :: "
<< cast<AbstractAttribute>(DG.SyntheticRoot.Deps[u].getPointer())
->getIRPosition()
.getAssociatedValue()
- << "\n";
- llvm_unreachable("Expected the final number of abstract attributes to "
- "remain unchanged!");
- }
- return ManifestChange;
-}
-
+ << "\n";
+ llvm_unreachable("Expected the final number of abstract attributes to "
+ "remain unchanged!");
+ }
+ return ManifestChange;
+}
+
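manifestAttributes() folds the per-attribute results into one answer by OR-ing ChangeStatus values while skipping invalid or dead states. A minimal sketch of that accumulation, assuming toy stand-ins: Attr and its Valid flag play the role of AbstractAttribute and isValidState(), and the manifest() body is invented for illustration.

// manifest_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <vector>

enum class ChangeStatus { UNCHANGED = 0, CHANGED = 1 };
ChangeStatus operator|(ChangeStatus A, ChangeStatus B) {
  return (A == ChangeStatus::CHANGED || B == ChangeStatus::CHANGED)
             ? ChangeStatus::CHANGED
             : ChangeStatus::UNCHANGED;
}

struct Attr {
  bool Valid = true; // stand-in for isValidState()
  // Toy manifest: pretend every valid attribute changes the IR.
  ChangeStatus manifest() {
    return Valid ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
  }
};

int main() {
  std::vector<Attr> Attrs(4);
  Attrs[2].Valid = false; // invalid states are skipped, never manifested
  ChangeStatus Combined = ChangeStatus::UNCHANGED;
  for (Attr &A : Attrs) {
    if (!A.Valid)
      continue;
    Combined = Combined | A.manifest(); // accumulate IR changes
  }
  std::printf("changed: %d\n", Combined == ChangeStatus::CHANGED);
}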
void Attributor::identifyDeadInternalFunctions() {
// Identify dead internal functions and delete them. This happens outside
// the other fixpoint analysis as we might treat potentially dead functions
@@ -1215,133 +1215,133 @@ void Attributor::identifyDeadInternalFunctions() {
ToBeDeletedFunctions.insert(F);
}
-ChangeStatus Attributor::cleanupIR() {
+ChangeStatus Attributor::cleanupIR() {
TimeTraceScope TimeScope("Attributor::cleanupIR");
-  // Delete stuff at the end to avoid invalid references, in a nice order.
- LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
- << ToBeDeletedFunctions.size() << " functions and "
- << ToBeDeletedBlocks.size() << " blocks and "
- << ToBeDeletedInsts.size() << " instructions and "
- << ToBeChangedUses.size() << " uses\n");
-
- SmallVector<WeakTrackingVH, 32> DeadInsts;
- SmallVector<Instruction *, 32> TerminatorsToFold;
-
- for (auto &It : ToBeChangedUses) {
- Use *U = It.first;
- Value *NewV = It.second;
- Value *OldV = U->get();
-
- // Do not replace uses in returns if the value is a must-tail call we will
- // not delete.
- if (isa<ReturnInst>(U->getUser()))
- if (auto *CI = dyn_cast<CallInst>(OldV->stripPointerCasts()))
- if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI))
- continue;
-
- LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
- << " instead of " << *OldV << "\n");
- U->set(NewV);
- // Do not modify call instructions outside the SCC.
- if (auto *CB = dyn_cast<CallBase>(OldV))
- if (!Functions.count(CB->getCaller()))
- continue;
- if (Instruction *I = dyn_cast<Instruction>(OldV)) {
- CGModifiedFunctions.insert(I->getFunction());
- if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
- isInstructionTriviallyDead(I))
- DeadInsts.push_back(I);
- }
- if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
- Instruction *UserI = cast<Instruction>(U->getUser());
- if (isa<UndefValue>(NewV)) {
- ToBeChangedToUnreachableInsts.insert(UserI);
- } else {
- TerminatorsToFold.push_back(UserI);
- }
- }
- }
- for (auto &V : InvokeWithDeadSuccessor)
- if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
- bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
- bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
- bool Invoke2CallAllowed =
- !AAIsDead::mayCatchAsynchronousExceptions(*II->getFunction());
- assert((UnwindBBIsDead || NormalBBIsDead) &&
- "Invoke does not have dead successors!");
- BasicBlock *BB = II->getParent();
- BasicBlock *NormalDestBB = II->getNormalDest();
- if (UnwindBBIsDead) {
- Instruction *NormalNextIP = &NormalDestBB->front();
- if (Invoke2CallAllowed) {
- changeToCall(II);
- NormalNextIP = BB->getTerminator();
- }
- if (NormalBBIsDead)
- ToBeChangedToUnreachableInsts.insert(NormalNextIP);
- } else {
- assert(NormalBBIsDead && "Broken invariant!");
- if (!NormalDestBB->getUniquePredecessor())
- NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
- ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
- }
- }
- for (Instruction *I : TerminatorsToFold) {
- CGModifiedFunctions.insert(I->getFunction());
- ConstantFoldTerminator(I->getParent());
- }
- for (auto &V : ToBeChangedToUnreachableInsts)
- if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
- CGModifiedFunctions.insert(I->getFunction());
- changeToUnreachable(I, /* UseLLVMTrap */ false);
- }
-
- for (auto &V : ToBeDeletedInsts) {
- if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
- I->dropDroppableUses();
- CGModifiedFunctions.insert(I->getFunction());
- if (!I->getType()->isVoidTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- if (!isa<PHINode>(I) && isInstructionTriviallyDead(I))
- DeadInsts.push_back(I);
- else
- I->eraseFromParent();
- }
- }
-
- LLVM_DEBUG(dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size()
- << "\n");
-
- RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
-
- if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
- SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
- ToBeDeletedBBs.reserve(NumDeadBlocks);
- for (BasicBlock *BB : ToBeDeletedBlocks) {
- CGModifiedFunctions.insert(BB->getParent());
- ToBeDeletedBBs.push_back(BB);
- }
-    // We do not actually delete the blocks but squash them into a single
-    // unreachable instruction; untangling the branches that jump here is
-    // something we need to do in a more generic way.
- DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
- }
-
+  // Delete stuff at the end to avoid invalid references, in a nice order.
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
+ << ToBeDeletedFunctions.size() << " functions and "
+ << ToBeDeletedBlocks.size() << " blocks and "
+ << ToBeDeletedInsts.size() << " instructions and "
+ << ToBeChangedUses.size() << " uses\n");
+
+ SmallVector<WeakTrackingVH, 32> DeadInsts;
+ SmallVector<Instruction *, 32> TerminatorsToFold;
+
+ for (auto &It : ToBeChangedUses) {
+ Use *U = It.first;
+ Value *NewV = It.second;
+ Value *OldV = U->get();
+
+ // Do not replace uses in returns if the value is a must-tail call we will
+ // not delete.
+ if (isa<ReturnInst>(U->getUser()))
+ if (auto *CI = dyn_cast<CallInst>(OldV->stripPointerCasts()))
+ if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
+ << " instead of " << *OldV << "\n");
+ U->set(NewV);
+ // Do not modify call instructions outside the SCC.
+ if (auto *CB = dyn_cast<CallBase>(OldV))
+ if (!Functions.count(CB->getCaller()))
+ continue;
+ if (Instruction *I = dyn_cast<Instruction>(OldV)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
+ isInstructionTriviallyDead(I))
+ DeadInsts.push_back(I);
+ }
+ if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ if (isa<UndefValue>(NewV)) {
+ ToBeChangedToUnreachableInsts.insert(UserI);
+ } else {
+ TerminatorsToFold.push_back(UserI);
+ }
+ }
+ }
+ for (auto &V : InvokeWithDeadSuccessor)
+ if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
+ bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
+ bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
+ bool Invoke2CallAllowed =
+ !AAIsDead::mayCatchAsynchronousExceptions(*II->getFunction());
+ assert((UnwindBBIsDead || NormalBBIsDead) &&
+ "Invoke does not have dead successors!");
+ BasicBlock *BB = II->getParent();
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ if (UnwindBBIsDead) {
+ Instruction *NormalNextIP = &NormalDestBB->front();
+ if (Invoke2CallAllowed) {
+ changeToCall(II);
+ NormalNextIP = BB->getTerminator();
+ }
+ if (NormalBBIsDead)
+ ToBeChangedToUnreachableInsts.insert(NormalNextIP);
+ } else {
+ assert(NormalBBIsDead && "Broken invariant!");
+ if (!NormalDestBB->getUniquePredecessor())
+ NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
+ ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
+ }
+ }
+ for (Instruction *I : TerminatorsToFold) {
+ CGModifiedFunctions.insert(I->getFunction());
+ ConstantFoldTerminator(I->getParent());
+ }
+ for (auto &V : ToBeChangedToUnreachableInsts)
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ changeToUnreachable(I, /* UseLLVMTrap */ false);
+ }
+
+ for (auto &V : ToBeDeletedInsts) {
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ I->dropDroppableUses();
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!I->getType()->isVoidTy())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ if (!isa<PHINode>(I) && isInstructionTriviallyDead(I))
+ DeadInsts.push_back(I);
+ else
+ I->eraseFromParent();
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size()
+ << "\n");
+
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+
+ if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
+ SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
+ ToBeDeletedBBs.reserve(NumDeadBlocks);
+ for (BasicBlock *BB : ToBeDeletedBlocks) {
+ CGModifiedFunctions.insert(BB->getParent());
+ ToBeDeletedBBs.push_back(BB);
+ }
+    // We do not actually delete the blocks but squash them into a single
+    // unreachable instruction; untangling the branches that jump here is
+    // something we need to do in a more generic way.
+ DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
+ }
+
identifyDeadInternalFunctions();
-
- // Rewrite the functions as requested during manifest.
- ChangeStatus ManifestChange = rewriteFunctionSignatures(CGModifiedFunctions);
-
- for (Function *Fn : CGModifiedFunctions)
+
+ // Rewrite the functions as requested during manifest.
+ ChangeStatus ManifestChange = rewriteFunctionSignatures(CGModifiedFunctions);
+
+ for (Function *Fn : CGModifiedFunctions)
if (!ToBeDeletedFunctions.count(Fn))
CGUpdater.reanalyzeFunction(*Fn);
-
+
for (Function *Fn : ToBeDeletedFunctions) {
if (!Functions.count(Fn))
continue;
- CGUpdater.removeFunction(*Fn);
+ CGUpdater.removeFunction(*Fn);
}
-
+
if (!ToBeChangedUses.empty())
ManifestChange = ChangeStatus::CHANGED;
@@ -1363,27 +1363,27 @@ ChangeStatus Attributor::cleanupIR() {
if (!DeadInsts.empty())
ManifestChange = ChangeStatus::CHANGED;
- NumFnDeleted += ToBeDeletedFunctions.size();
-
+ NumFnDeleted += ToBeDeletedFunctions.size();
+
LLVM_DEBUG(dbgs() << "[Attributor] Deleted " << ToBeDeletedFunctions.size()
- << " functions after manifest.\n");
-
-#ifdef EXPENSIVE_CHECKS
- for (Function *F : Functions) {
- if (ToBeDeletedFunctions.count(F))
- continue;
- assert(!verifyFunction(*F, &errs()) && "Module verification failed!");
- }
-#endif
-
- return ManifestChange;
-}
-
-ChangeStatus Attributor::run() {
+ << " functions after manifest.\n");
+
+#ifdef EXPENSIVE_CHECKS
+ for (Function *F : Functions) {
+ if (ToBeDeletedFunctions.count(F))
+ continue;
+ assert(!verifyFunction(*F, &errs()) && "Module verification failed!");
+ }
+#endif
+
+ return ManifestChange;
+}
+
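cleanupIR() deliberately defers all deletions to a single batch at the end so earlier decisions never dereference already-freed IR. A small standard-library sketch of the same two-phase pattern; Inst, Program, and ToBeDeleted are illustrative names, not the Attributor's own containers.

// deferred_cleanup_sketch.cpp -- illustrative pattern only.
#include <algorithm>
#include <cstdio>
#include <memory>
#include <set>
#include <vector>

struct Inst { int Id; };

int main() {
  // Owning storage for the "instructions"; the analysis phase only records
  // what should go away instead of deleting eagerly.
  std::vector<std::unique_ptr<Inst>> Program;
  for (int I = 0; I < 5; ++I)
    Program.push_back(std::make_unique<Inst>(Inst{I}));

  std::set<const Inst *> ToBeDeleted; // filled while the "analysis" runs
  ToBeDeleted.insert(Program[1].get());
  ToBeDeleted.insert(Program[3].get());

  // Cleanup phase: erase everything in one batch at the very end so no
  // earlier step ever observes a dangling pointer.
  Program.erase(std::remove_if(Program.begin(), Program.end(),
                               [&](const std::unique_ptr<Inst> &P) {
                                 return ToBeDeleted.count(P.get()) != 0;
                               }),
                Program.end());
  std::printf("%zu instructions remain\n", Program.size());
}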
+ChangeStatus Attributor::run() {
TimeTraceScope TimeScope("Attributor::run");
Phase = AttributorPhase::UPDATE;
- runTillFixpoint();
+ runTillFixpoint();
// dump graphs on demand
if (DumpDepGraph)
@@ -1396,95 +1396,95 @@ ChangeStatus Attributor::run() {
DG.print();
Phase = AttributorPhase::MANIFEST;
- ChangeStatus ManifestChange = manifestAttributes();
+ ChangeStatus ManifestChange = manifestAttributes();
Phase = AttributorPhase::CLEANUP;
- ChangeStatus CleanupChange = cleanupIR();
+ ChangeStatus CleanupChange = cleanupIR();
- return ManifestChange | CleanupChange;
-}
-
-ChangeStatus Attributor::updateAA(AbstractAttribute &AA) {
+ return ManifestChange | CleanupChange;
+}
+
+ChangeStatus Attributor::updateAA(AbstractAttribute &AA) {
TimeTraceScope TimeScope(
AA.getName() + std::to_string(AA.getIRPosition().getPositionKind()) +
"::updateAA");
assert(Phase == AttributorPhase::UPDATE &&
"We can update AA only in the update stage!");
- // Use a new dependence vector for this update.
- DependenceVector DV;
- DependenceStack.push_back(&DV);
-
- auto &AAState = AA.getState();
- ChangeStatus CS = ChangeStatus::UNCHANGED;
- if (!isAssumedDead(AA, nullptr, /* CheckBBLivenessOnly */ true))
- CS = AA.update(*this);
-
- if (DV.empty()) {
- // If the attribute did not query any non-fix information, the state
- // will not change and we can indicate that right away.
- AAState.indicateOptimisticFixpoint();
- }
-
- if (!AAState.isAtFixpoint())
- rememberDependences();
-
-  // Verify the stack was used properly, that is, we pop the dependence vector
-  // we put there earlier.
- DependenceVector *PoppedDV = DependenceStack.pop_back_val();
- (void)PoppedDV;
- assert(PoppedDV == &DV && "Inconsistent usage of the dependence stack!");
-
- return CS;
-}
-
+ // Use a new dependence vector for this update.
+ DependenceVector DV;
+ DependenceStack.push_back(&DV);
+
+ auto &AAState = AA.getState();
+ ChangeStatus CS = ChangeStatus::UNCHANGED;
+ if (!isAssumedDead(AA, nullptr, /* CheckBBLivenessOnly */ true))
+ CS = AA.update(*this);
+
+ if (DV.empty()) {
+ // If the attribute did not query any non-fix information, the state
+ // will not change and we can indicate that right away.
+ AAState.indicateOptimisticFixpoint();
+ }
+
+ if (!AAState.isAtFixpoint())
+ rememberDependences();
+
+  // Verify the stack was used properly, that is, we pop the dependence vector
+  // we put there earlier.
+ DependenceVector *PoppedDV = DependenceStack.pop_back_val();
+ (void)PoppedDV;
+ assert(PoppedDV == &DV && "Inconsistent usage of the dependence stack!");
+
+ return CS;
+}
+
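updateAA() pushes a fresh dependence vector, runs the update, treats an empty vector as proof that the state can no longer change, and asserts that the vector it pops is the one it pushed. A compact sketch of that stack discipline; the integer IDs and updateOne() are invented for illustration and do not mirror the real AbstractAttribute types.

// dependence_stack_sketch.cpp -- illustrative pattern only.
#include <cassert>
#include <cstdio>
#include <vector>

struct Dep { int FromId, ToId; };

// Stack of per-update dependence vectors, mirroring DependenceStack above.
static std::vector<std::vector<Dep> *> DependenceStack;

void recordDependence(int FromId, int ToId) {
  if (DependenceStack.empty()) // outside of an update: nothing to track
    return;
  DependenceStack.back()->push_back({FromId, ToId});
}

void updateOne(int Id) {
  std::vector<Dep> DV;          // fresh vector used for exactly this update
  DependenceStack.push_back(&DV);

  recordDependence(Id, Id + 1); // the "update" queried another attribute

  if (DV.empty())               // nothing queried: state cannot change again
    std::printf("attribute %d reached an optimistic fixpoint\n", Id);

  assert(DependenceStack.back() == &DV && "inconsistent stack usage");
  DependenceStack.pop_back();   // pop the vector we pushed above
  std::printf("attribute %d recorded %zu dependences\n", Id, DV.size());
}

int main() { updateOne(0); }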
void Attributor::createShallowWrapper(Function &F) {
- assert(!F.isDeclaration() && "Cannot create a wrapper around a declaration!");
-
- Module &M = *F.getParent();
- LLVMContext &Ctx = M.getContext();
- FunctionType *FnTy = F.getFunctionType();
-
- Function *Wrapper =
- Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
- F.setName(""); // set the inside function anonymous
- M.getFunctionList().insert(F.getIterator(), Wrapper);
-
- F.setLinkage(GlobalValue::InternalLinkage);
-
- F.replaceAllUsesWith(Wrapper);
- assert(F.use_empty() && "Uses remained after wrapper was created!");
-
- // Move the COMDAT section to the wrapper.
- // TODO: Check if we need to keep it for F as well.
- Wrapper->setComdat(F.getComdat());
- F.setComdat(nullptr);
-
- // Copy all metadata and attributes but keep them on F as well.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- F.getAllMetadata(MDs);
- for (auto MDIt : MDs)
- Wrapper->addMetadata(MDIt.first, *MDIt.second);
- Wrapper->setAttributes(F.getAttributes());
-
- // Create the call in the wrapper.
- BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper);
-
- SmallVector<Value *, 8> Args;
+ assert(!F.isDeclaration() && "Cannot create a wrapper around a declaration!");
+
+ Module &M = *F.getParent();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FnTy = F.getFunctionType();
+
+ Function *Wrapper =
+ Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
+ F.setName(""); // set the inside function anonymous
+ M.getFunctionList().insert(F.getIterator(), Wrapper);
+
+ F.setLinkage(GlobalValue::InternalLinkage);
+
+ F.replaceAllUsesWith(Wrapper);
+ assert(F.use_empty() && "Uses remained after wrapper was created!");
+
+ // Move the COMDAT section to the wrapper.
+ // TODO: Check if we need to keep it for F as well.
+ Wrapper->setComdat(F.getComdat());
+ F.setComdat(nullptr);
+
+ // Copy all metadata and attributes but keep them on F as well.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F.getAllMetadata(MDs);
+ for (auto MDIt : MDs)
+ Wrapper->addMetadata(MDIt.first, *MDIt.second);
+ Wrapper->setAttributes(F.getAttributes());
+
+ // Create the call in the wrapper.
+ BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper);
+
+ SmallVector<Value *, 8> Args;
Argument *FArgIt = F.arg_begin();
- for (Argument &Arg : Wrapper->args()) {
- Args.push_back(&Arg);
- Arg.setName((FArgIt++)->getName());
- }
-
- CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
- CI->setTailCall(true);
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
- ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
-
+ for (Argument &Arg : Wrapper->args()) {
+ Args.push_back(&Arg);
+ Arg.setName((FArgIt++)->getName());
+ }
+
+ CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
+ CI->setTailCall(true);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
+ ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
+
NumFnShallowWrappersCreated++;
-}
-
+}
+
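createShallowWrapper() renames the original function, gives it internal linkage, and installs a new function with the old name whose only job is to forward all arguments via a tail call. The sketch below shows the shape of that wrapper in plain C++; AddImpl and Add are hypothetical examples, not LLVM entities.

// shallow_wrapper_sketch.cpp -- illustrative pattern only.
#include <cstdio>

// Think of this as F after it was renamed and made "internal" (static), and
// therefore free to be analysed and specialised aggressively.
static int AddImpl(int A, int B) { return A + B; }

// The shallow wrapper keeps the original, externally visible name and
// signature; its only job is to forward every argument to the real body.
int Add(int A, int B) {
  return AddImpl(A, B); // conceptually a tail call, as in createShallowWrapper
}

int main() { std::printf("%d\n", Add(2, 3)); }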
/// Make another copy of the function \p F such that the copied version has
/// internal linkage afterwards and can be analysed. Then we replace all uses
/// of the original function to the copied one
@@ -1535,106 +1535,106 @@ static Function *internalizeFunction(Function &F) {
return Copied;
}
-bool Attributor::isValidFunctionSignatureRewrite(
- Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
-
- auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
- // Forbid the call site to cast the function return type. If we need to
- // rewrite these functions we need to re-create a cast for the new call site
- // (if the old had uses).
- if (!ACS.getCalledFunction() ||
- ACS.getInstruction()->getType() !=
- ACS.getCalledFunction()->getReturnType())
- return false;
- // Forbid must-tail calls for now.
- return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
- };
-
- Function *Fn = Arg.getParent();
- // Avoid var-arg functions for now.
- if (Fn->isVarArg()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n");
- return false;
- }
-
- // Avoid functions with complicated argument passing semantics.
- AttributeList FnAttributeList = Fn->getAttributes();
- if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) ||
- FnAttributeList.hasAttrSomewhere(Attribute::StructRet) ||
- FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) ||
- FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) {
- LLVM_DEBUG(
- dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n");
- return false;
- }
-
- // Avoid callbacks for now.
- bool AllCallSitesKnown;
- if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr,
- AllCallSitesKnown)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n");
- return false;
- }
-
- auto InstPred = [](Instruction &I) {
- if (auto *CI = dyn_cast<CallInst>(&I))
- return !CI->isMustTailCall();
- return true;
- };
-
- // Forbid must-tail calls for now.
- // TODO:
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
- if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr,
- nullptr, {Instruction::Call})) {
- LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n");
- return false;
- }
-
- return true;
-}
-
-bool Attributor::registerFunctionSignatureRewrite(
- Argument &Arg, ArrayRef<Type *> ReplacementTypes,
- ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
- ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
- LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
- << Arg.getParent()->getName() << " with "
- << ReplacementTypes.size() << " replacements\n");
- assert(isValidFunctionSignatureRewrite(Arg, ReplacementTypes) &&
- "Cannot register an invalid rewrite");
-
- Function *Fn = Arg.getParent();
- SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
- ArgumentReplacementMap[Fn];
- if (ARIs.empty())
- ARIs.resize(Fn->arg_size());
-
- // If we have a replacement already with less than or equal new arguments,
- // ignore this request.
- std::unique_ptr<ArgumentReplacementInfo> &ARI = ARIs[Arg.getArgNo()];
- if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) {
- LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n");
- return false;
- }
-
- // If we have a replacement already but we like the new one better, delete
- // the old.
- ARI.reset();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
- << Arg.getParent()->getName() << " with "
- << ReplacementTypes.size() << " replacements\n");
-
- // Remember the replacement.
- ARI.reset(new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
- std::move(CalleeRepairCB),
- std::move(ACSRepairCB)));
-
- return true;
-}
-
-bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
+bool Attributor::isValidFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
+
+ auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
+ // Forbid the call site to cast the function return type. If we need to
+ // rewrite these functions we need to re-create a cast for the new call site
+ // (if the old had uses).
+ if (!ACS.getCalledFunction() ||
+ ACS.getInstruction()->getType() !=
+ ACS.getCalledFunction()->getReturnType())
+ return false;
+ // Forbid must-tail calls for now.
+ return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
+ };
+
+ Function *Fn = Arg.getParent();
+ // Avoid var-arg functions for now.
+ if (Fn->isVarArg()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite var-args functions\n");
+ return false;
+ }
+
+ // Avoid functions with complicated argument passing semantics.
+ AttributeList FnAttributeList = Fn->getAttributes();
+ if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::StructRet) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n");
+ return false;
+ }
+
+ // Avoid callbacks for now.
+ bool AllCallSitesKnown;
+ if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr,
+ AllCallSitesKnown)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n");
+ return false;
+ }
+
+ auto InstPred = [](Instruction &I) {
+ if (auto *CI = dyn_cast<CallInst>(&I))
+ return !CI->isMustTailCall();
+ return true;
+ };
+
+ // Forbid must-tail calls for now.
+ // TODO:
+ auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
+ if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr,
+ nullptr, {Instruction::Call})) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool Attributor::registerFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes,
+ ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
+ ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
+ assert(isValidFunctionSignatureRewrite(Arg, ReplacementTypes) &&
+ "Cannot register an invalid rewrite");
+
+ Function *Fn = Arg.getParent();
+ SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ ArgumentReplacementMap[Fn];
+ if (ARIs.empty())
+ ARIs.resize(Fn->arg_size());
+
+ // If we have a replacement already with less than or equal new arguments,
+ // ignore this request.
+ std::unique_ptr<ArgumentReplacementInfo> &ARI = ARIs[Arg.getArgNo()];
+ if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n");
+ return false;
+ }
+
+ // If we have a replacement already but we like the new one better, delete
+ // the old.
+ ARI.reset();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
+
+ // Remember the replacement.
+ ARI.reset(new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
+ std::move(CalleeRepairCB),
+ std::move(ACSRepairCB)));
+
+ return true;
+}
+
+bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
bool Result = true;
#ifndef NDEBUG
if (SeedAllowList.size() != 0)
@@ -1646,583 +1646,583 @@ bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
FunctionSeedAllowList.end(), Fn->getName());
#endif
return Result;
-}
-
-ChangeStatus Attributor::rewriteFunctionSignatures(
- SmallPtrSetImpl<Function *> &ModifiedFns) {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- for (auto &It : ArgumentReplacementMap) {
- Function *OldFn = It.getFirst();
-
- // Deleted functions do not require rewrites.
+}
+
+ChangeStatus Attributor::rewriteFunctionSignatures(
+ SmallPtrSetImpl<Function *> &ModifiedFns) {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ for (auto &It : ArgumentReplacementMap) {
+ Function *OldFn = It.getFirst();
+
+ // Deleted functions do not require rewrites.
if (!Functions.count(OldFn) || ToBeDeletedFunctions.count(OldFn))
- continue;
-
- const SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
- It.getSecond();
- assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!");
-
- SmallVector<Type *, 16> NewArgumentTypes;
- SmallVector<AttributeSet, 16> NewArgumentAttributes;
-
- // Collect replacement argument types and copy over existing attributes.
- AttributeList OldFnAttributeList = OldFn->getAttributes();
- for (Argument &Arg : OldFn->args()) {
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[Arg.getArgNo()]) {
- NewArgumentTypes.append(ARI->ReplacementTypes.begin(),
- ARI->ReplacementTypes.end());
- NewArgumentAttributes.append(ARI->getNumReplacementArgs(),
- AttributeSet());
- } else {
- NewArgumentTypes.push_back(Arg.getType());
- NewArgumentAttributes.push_back(
- OldFnAttributeList.getParamAttributes(Arg.getArgNo()));
- }
- }
-
- FunctionType *OldFnTy = OldFn->getFunctionType();
- Type *RetTy = OldFnTy->getReturnType();
-
- // Construct the new function type using the new arguments types.
- FunctionType *NewFnTy =
- FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg());
-
- LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName()
- << "' from " << *OldFn->getFunctionType() << " to "
- << *NewFnTy << "\n");
-
- // Create the new function body and insert it into the module.
- Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(),
- OldFn->getAddressSpace(), "");
- OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
- NewFn->takeName(OldFn);
- NewFn->copyAttributesFrom(OldFn);
-
- // Patch the pointer to LLVM function in debug info descriptor.
- NewFn->setSubprogram(OldFn->getSubprogram());
- OldFn->setSubprogram(nullptr);
-
- // Recompute the parameter attributes list based on the new arguments for
- // the function.
- LLVMContext &Ctx = OldFn->getContext();
- NewFn->setAttributes(AttributeList::get(
- Ctx, OldFnAttributeList.getFnAttributes(),
- OldFnAttributeList.getRetAttributes(), NewArgumentAttributes));
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NewFn->getBasicBlockList().splice(NewFn->begin(),
- OldFn->getBasicBlockList());
-
- // Fixup block addresses to reference new function.
- SmallVector<BlockAddress *, 8u> BlockAddresses;
- for (User *U : OldFn->users())
- if (auto *BA = dyn_cast<BlockAddress>(U))
- BlockAddresses.push_back(BA);
- for (auto *BA : BlockAddresses)
- BA->replaceAllUsesWith(BlockAddress::get(NewFn, BA->getBasicBlock()));
-
- // Set of all "call-like" instructions that invoke the old function mapped
- // to their new replacements.
- SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs;
-
- // Callback to create a new "call-like" instruction for a given one.
- auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) {
- CallBase *OldCB = cast<CallBase>(ACS.getInstruction());
- const AttributeList &OldCallAttributeList = OldCB->getAttributes();
-
- // Collect the new argument operands for the replacement call site.
- SmallVector<Value *, 16> NewArgOperands;
- SmallVector<AttributeSet, 16> NewArgOperandAttributes;
- for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) {
- unsigned NewFirstArgNum = NewArgOperands.size();
- (void)NewFirstArgNum; // only used inside assert.
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[OldArgNum]) {
- if (ARI->ACSRepairCB)
- ARI->ACSRepairCB(*ARI, ACS, NewArgOperands);
- assert(ARI->getNumReplacementArgs() + NewFirstArgNum ==
- NewArgOperands.size() &&
-                 "ACS repair callback did not provide as many operands as new "
- "types were registered!");
-        // TODO: Expose the attribute set to the ACS repair callback
- NewArgOperandAttributes.append(ARI->ReplacementTypes.size(),
- AttributeSet());
- } else {
- NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum));
- NewArgOperandAttributes.push_back(
- OldCallAttributeList.getParamAttributes(OldArgNum));
- }
- }
-
- assert(NewArgOperands.size() == NewArgOperandAttributes.size() &&
- "Mismatch # argument operands vs. # argument operand attributes!");
- assert(NewArgOperands.size() == NewFn->arg_size() &&
- "Mismatch # argument operands vs. # function arguments!");
-
- SmallVector<OperandBundleDef, 4> OperandBundleDefs;
- OldCB->getOperandBundlesAsDefs(OperandBundleDefs);
-
- // Create a new call or invoke instruction to replace the old one.
- CallBase *NewCB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) {
- NewCB =
- InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(),
- NewArgOperands, OperandBundleDefs, "", OldCB);
- } else {
- auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs,
- "", OldCB);
- NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind());
- NewCB = NewCI;
- }
-
- // Copy over various properties and the new attributes.
- NewCB->copyMetadata(*OldCB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- NewCB->setCallingConv(OldCB->getCallingConv());
- NewCB->takeName(OldCB);
- NewCB->setAttributes(AttributeList::get(
- Ctx, OldCallAttributeList.getFnAttributes(),
- OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes));
-
- CallSitePairs.push_back({OldCB, NewCB});
- return true;
- };
-
- // Use the CallSiteReplacementCreator to create replacement call sites.
- bool AllCallSitesKnown;
- bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn,
- true, nullptr, AllCallSitesKnown);
- (void)Success;
- assert(Success && "Assumed call site replacement to succeed!");
-
- // Rewire the arguments.
+ continue;
+
+ const SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ It.getSecond();
+ assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!");
+
+ SmallVector<Type *, 16> NewArgumentTypes;
+ SmallVector<AttributeSet, 16> NewArgumentAttributes;
+
+ // Collect replacement argument types and copy over existing attributes.
+ AttributeList OldFnAttributeList = OldFn->getAttributes();
+ for (Argument &Arg : OldFn->args()) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[Arg.getArgNo()]) {
+ NewArgumentTypes.append(ARI->ReplacementTypes.begin(),
+ ARI->ReplacementTypes.end());
+ NewArgumentAttributes.append(ARI->getNumReplacementArgs(),
+ AttributeSet());
+ } else {
+ NewArgumentTypes.push_back(Arg.getType());
+ NewArgumentAttributes.push_back(
+ OldFnAttributeList.getParamAttributes(Arg.getArgNo()));
+ }
+ }
+
+ FunctionType *OldFnTy = OldFn->getFunctionType();
+ Type *RetTy = OldFnTy->getReturnType();
+
+ // Construct the new function type using the new arguments types.
+ FunctionType *NewFnTy =
+ FunctionType::get(RetTy, NewArgumentTypes, OldFnTy->isVarArg());
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Function rewrite '" << OldFn->getName()
+ << "' from " << *OldFn->getFunctionType() << " to "
+ << *NewFnTy << "\n");
+
+ // Create the new function body and insert it into the module.
+ Function *NewFn = Function::Create(NewFnTy, OldFn->getLinkage(),
+ OldFn->getAddressSpace(), "");
+ OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn);
+ NewFn->takeName(OldFn);
+ NewFn->copyAttributesFrom(OldFn);
+
+ // Patch the pointer to LLVM function in debug info descriptor.
+ NewFn->setSubprogram(OldFn->getSubprogram());
+ OldFn->setSubprogram(nullptr);
+
+ // Recompute the parameter attributes list based on the new arguments for
+ // the function.
+ LLVMContext &Ctx = OldFn->getContext();
+ NewFn->setAttributes(AttributeList::get(
+ Ctx, OldFnAttributeList.getFnAttributes(),
+ OldFnAttributeList.getRetAttributes(), NewArgumentAttributes));
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NewFn->getBasicBlockList().splice(NewFn->begin(),
+ OldFn->getBasicBlockList());
+
+ // Fixup block addresses to reference new function.
+ SmallVector<BlockAddress *, 8u> BlockAddresses;
+ for (User *U : OldFn->users())
+ if (auto *BA = dyn_cast<BlockAddress>(U))
+ BlockAddresses.push_back(BA);
+ for (auto *BA : BlockAddresses)
+ BA->replaceAllUsesWith(BlockAddress::get(NewFn, BA->getBasicBlock()));
+
+ // Set of all "call-like" instructions that invoke the old function mapped
+ // to their new replacements.
+ SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs;
+
+ // Callback to create a new "call-like" instruction for a given one.
+ auto CallSiteReplacementCreator = [&](AbstractCallSite ACS) {
+ CallBase *OldCB = cast<CallBase>(ACS.getInstruction());
+ const AttributeList &OldCallAttributeList = OldCB->getAttributes();
+
+ // Collect the new argument operands for the replacement call site.
+ SmallVector<Value *, 16> NewArgOperands;
+ SmallVector<AttributeSet, 16> NewArgOperandAttributes;
+ for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) {
+ unsigned NewFirstArgNum = NewArgOperands.size();
+ (void)NewFirstArgNum; // only used inside assert.
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
+ if (ARI->ACSRepairCB)
+ ARI->ACSRepairCB(*ARI, ACS, NewArgOperands);
+ assert(ARI->getNumReplacementArgs() + NewFirstArgNum ==
+ NewArgOperands.size() &&
+                 "ACS repair callback did not provide as many operands as new "
+ "types were registered!");
+        // TODO: Expose the attribute set to the ACS repair callback
+ NewArgOperandAttributes.append(ARI->ReplacementTypes.size(),
+ AttributeSet());
+ } else {
+ NewArgOperands.push_back(ACS.getCallArgOperand(OldArgNum));
+ NewArgOperandAttributes.push_back(
+ OldCallAttributeList.getParamAttributes(OldArgNum));
+ }
+ }
+
+ assert(NewArgOperands.size() == NewArgOperandAttributes.size() &&
+ "Mismatch # argument operands vs. # argument operand attributes!");
+ assert(NewArgOperands.size() == NewFn->arg_size() &&
+ "Mismatch # argument operands vs. # function arguments!");
+
+ SmallVector<OperandBundleDef, 4> OperandBundleDefs;
+ OldCB->getOperandBundlesAsDefs(OperandBundleDefs);
+
+ // Create a new call or invoke instruction to replace the old one.
+ CallBase *NewCB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(OldCB)) {
+ NewCB =
+ InvokeInst::Create(NewFn, II->getNormalDest(), II->getUnwindDest(),
+ NewArgOperands, OperandBundleDefs, "", OldCB);
+ } else {
+ auto *NewCI = CallInst::Create(NewFn, NewArgOperands, OperandBundleDefs,
+ "", OldCB);
+ NewCI->setTailCallKind(cast<CallInst>(OldCB)->getTailCallKind());
+ NewCB = NewCI;
+ }
+
+ // Copy over various properties and the new attributes.
+ NewCB->copyMetadata(*OldCB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ NewCB->setCallingConv(OldCB->getCallingConv());
+ NewCB->takeName(OldCB);
+ NewCB->setAttributes(AttributeList::get(
+ Ctx, OldCallAttributeList.getFnAttributes(),
+ OldCallAttributeList.getRetAttributes(), NewArgOperandAttributes));
+
+ CallSitePairs.push_back({OldCB, NewCB});
+ return true;
+ };
+
+ // Use the CallSiteReplacementCreator to create replacement call sites.
+ bool AllCallSitesKnown;
+ bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn,
+ true, nullptr, AllCallSitesKnown);
+ (void)Success;
+ assert(Success && "Assumed call site replacement to succeed!");
+
+ // Rewire the arguments.
Argument *OldFnArgIt = OldFn->arg_begin();
Argument *NewFnArgIt = NewFn->arg_begin();
- for (unsigned OldArgNum = 0; OldArgNum < ARIs.size();
- ++OldArgNum, ++OldFnArgIt) {
- if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
- ARIs[OldArgNum]) {
- if (ARI->CalleeRepairCB)
- ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt);
- NewFnArgIt += ARI->ReplacementTypes.size();
- } else {
- NewFnArgIt->takeName(&*OldFnArgIt);
- OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt);
- ++NewFnArgIt;
- }
- }
-
- // Eliminate the instructions *after* we visited all of them.
- for (auto &CallSitePair : CallSitePairs) {
- CallBase &OldCB = *CallSitePair.first;
- CallBase &NewCB = *CallSitePair.second;
- assert(OldCB.getType() == NewCB.getType() &&
- "Cannot handle call sites with different types!");
- ModifiedFns.insert(OldCB.getFunction());
- CGUpdater.replaceCallSite(OldCB, NewCB);
- OldCB.replaceAllUsesWith(&NewCB);
- OldCB.eraseFromParent();
- }
-
- // Replace the function in the call graph (if any).
- CGUpdater.replaceFunctionWith(*OldFn, *NewFn);
-
- // If the old function was modified and needed to be reanalyzed, the new one
- // does now.
- if (ModifiedFns.erase(OldFn))
- ModifiedFns.insert(NewFn);
-
- Changed = ChangeStatus::CHANGED;
- }
-
- return Changed;
-}
-
-void InformationCache::initializeInformationCache(const Function &CF,
- FunctionInfo &FI) {
- // As we do not modify the function here we can remove the const
-  // without breaking implicit assumptions. At the end of the day, we could
- // initialize the cache eagerly which would look the same to the users.
- Function &F = const_cast<Function &>(CF);
-
- // Walk all instructions to find interesting instructions that might be
- // queried by abstract attributes during their initialization or update.
- // This has to happen before we create attributes.
-
- for (Instruction &I : instructions(&F)) {
- bool IsInterestingOpcode = false;
-
- // To allow easy access to all instructions in a function with a given
- // opcode we store them in the InfoCache. As not all opcodes are interesting
- // to concrete attributes we only cache the ones that are as identified in
- // the following switch.
- // Note: There are no concrete attributes now so this is initially empty.
- switch (I.getOpcode()) {
- default:
- assert(!isa<CallBase>(&I) &&
- "New call base instruction type needs to be known in the "
- "Attributor.");
- break;
- case Instruction::Call:
- // Calls are interesting on their own, additionally:
- // For `llvm.assume` calls we also fill the KnowledgeMap as we find them.
- // For `must-tail` calls we remember the caller and callee.
- if (IntrinsicInst *Assume = dyn_cast<IntrinsicInst>(&I)) {
- if (Assume->getIntrinsicID() == Intrinsic::assume)
- fillMapFromAssume(*Assume, KnowledgeMap);
- } else if (cast<CallInst>(I).isMustTailCall()) {
- FI.ContainsMustTailCall = true;
- if (const Function *Callee = cast<CallInst>(I).getCalledFunction())
- getFunctionInfo(*Callee).CalledViaMustTail = true;
- }
- LLVM_FALLTHROUGH;
- case Instruction::CallBr:
- case Instruction::Invoke:
- case Instruction::CleanupRet:
- case Instruction::CatchSwitch:
- case Instruction::AtomicRMW:
- case Instruction::AtomicCmpXchg:
- case Instruction::Br:
- case Instruction::Resume:
- case Instruction::Ret:
- case Instruction::Load:
- // The alignment of a pointer is interesting for loads.
- case Instruction::Store:
- // The alignment of a pointer is interesting for stores.
- IsInterestingOpcode = true;
- }
- if (IsInterestingOpcode) {
- auto *&Insts = FI.OpcodeInstMap[I.getOpcode()];
- if (!Insts)
- Insts = new (Allocator) InstructionVectorTy();
- Insts->push_back(&I);
- }
- if (I.mayReadOrWriteMemory())
- FI.RWInsts.push_back(&I);
- }
-
- if (F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F).isSuccess())
- InlineableFunctions.insert(&F);
-}
-
+ for (unsigned OldArgNum = 0; OldArgNum < ARIs.size();
+ ++OldArgNum, ++OldFnArgIt) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
+ if (ARI->CalleeRepairCB)
+ ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt);
+ NewFnArgIt += ARI->ReplacementTypes.size();
+ } else {
+ NewFnArgIt->takeName(&*OldFnArgIt);
+ OldFnArgIt->replaceAllUsesWith(&*NewFnArgIt);
+ ++NewFnArgIt;
+ }
+ }
+
+ // Eliminate the instructions *after* we visited all of them.
+ for (auto &CallSitePair : CallSitePairs) {
+ CallBase &OldCB = *CallSitePair.first;
+ CallBase &NewCB = *CallSitePair.second;
+ assert(OldCB.getType() == NewCB.getType() &&
+ "Cannot handle call sites with different types!");
+ ModifiedFns.insert(OldCB.getFunction());
+ CGUpdater.replaceCallSite(OldCB, NewCB);
+ OldCB.replaceAllUsesWith(&NewCB);
+ OldCB.eraseFromParent();
+ }
+
+ // Replace the function in the call graph (if any).
+ CGUpdater.replaceFunctionWith(*OldFn, *NewFn);
+
+ // If the old function was modified and needed to be reanalyzed, the new one
+ // does now.
+ if (ModifiedFns.erase(OldFn))
+ ModifiedFns.insert(NewFn);
+
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ return Changed;
+}
+
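The core bookkeeping in rewriteFunctionSignatures() is per-argument: an ArgumentReplacementInfo either expands one old argument into several replacement types or lets it pass through unchanged, and the same walk is repeated at every call site. A sketch of just that expansion step, with string type names, Replacement, and buildNewSignature() as illustrative stand-ins rather than the real data structures.

// signature_rewrite_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <string>
#include <vector>

// One entry per old argument: empty means "keep as is", otherwise the types
// that replace it (e.g. a pointer expanded into its loaded members).
using Replacement = std::vector<std::string>;

std::vector<std::string>
buildNewSignature(const std::vector<std::string> &OldArgs,
                  const std::vector<Replacement> &ARIs) {
  std::vector<std::string> NewArgs;
  for (size_t I = 0; I < OldArgs.size(); ++I) {
    if (!ARIs[I].empty())
      NewArgs.insert(NewArgs.end(), ARIs[I].begin(), ARIs[I].end());
    else
      NewArgs.push_back(OldArgs[I]); // untouched argument carries over
  }
  return NewArgs;
}

int main() {
  std::vector<std::string> OldArgs = {"i32", "ptr", "float"};
  std::vector<Replacement> ARIs = {{}, {"i32", "i64"}, {}}; // expand arg #1
  for (const std::string &T : buildNewSignature(OldArgs, ARIs))
    std::printf("%s ", T.c_str());
  std::printf("\n");
}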
+void InformationCache::initializeInformationCache(const Function &CF,
+ FunctionInfo &FI) {
+ // As we do not modify the function here we can remove the const
+  // without breaking implicit assumptions. At the end of the day, we could
+ // initialize the cache eagerly which would look the same to the users.
+ Function &F = const_cast<Function &>(CF);
+
+ // Walk all instructions to find interesting instructions that might be
+ // queried by abstract attributes during their initialization or update.
+ // This has to happen before we create attributes.
+
+ for (Instruction &I : instructions(&F)) {
+ bool IsInterestingOpcode = false;
+
+ // To allow easy access to all instructions in a function with a given
+ // opcode we store them in the InfoCache. As not all opcodes are interesting
+ // to concrete attributes we only cache the ones that are as identified in
+ // the following switch.
+ // Note: There are no concrete attributes now so this is initially empty.
+ switch (I.getOpcode()) {
+ default:
+ assert(!isa<CallBase>(&I) &&
+ "New call base instruction type needs to be known in the "
+ "Attributor.");
+ break;
+ case Instruction::Call:
+ // Calls are interesting on their own, additionally:
+ // For `llvm.assume` calls we also fill the KnowledgeMap as we find them.
+ // For `must-tail` calls we remember the caller and callee.
+ if (IntrinsicInst *Assume = dyn_cast<IntrinsicInst>(&I)) {
+ if (Assume->getIntrinsicID() == Intrinsic::assume)
+ fillMapFromAssume(*Assume, KnowledgeMap);
+ } else if (cast<CallInst>(I).isMustTailCall()) {
+ FI.ContainsMustTailCall = true;
+ if (const Function *Callee = cast<CallInst>(I).getCalledFunction())
+ getFunctionInfo(*Callee).CalledViaMustTail = true;
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::CallBr:
+ case Instruction::Invoke:
+ case Instruction::CleanupRet:
+ case Instruction::CatchSwitch:
+ case Instruction::AtomicRMW:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::Br:
+ case Instruction::Resume:
+ case Instruction::Ret:
+ case Instruction::Load:
+ // The alignment of a pointer is interesting for loads.
+ case Instruction::Store:
+ // The alignment of a pointer is interesting for stores.
+ IsInterestingOpcode = true;
+ }
+ if (IsInterestingOpcode) {
+ auto *&Insts = FI.OpcodeInstMap[I.getOpcode()];
+ if (!Insts)
+ Insts = new (Allocator) InstructionVectorTy();
+ Insts->push_back(&I);
+ }
+ if (I.mayReadOrWriteMemory())
+ FI.RWInsts.push_back(&I);
+ }
+
+ if (F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F).isSuccess())
+ InlineableFunctions.insert(&F);
+}
+
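initializeInformationCache() walks every instruction once and buckets only the "interesting" opcodes into OpcodeInstMap, so a later query for, say, all calls is a single lookup instead of a rescan. A tiny sketch of that bucketing; Opcode, Inst, and the chosen set of opcodes are invented for illustration.

// opcode_cache_sketch.cpp -- illustrative pattern only.
#include <cstdio>
#include <map>
#include <vector>

enum class Opcode { Call, Load, Store, Add };
struct Inst { Opcode Op; int Id; };

int main() {
  std::vector<Inst> Body = {{Opcode::Add, 0}, {Opcode::Call, 1},
                            {Opcode::Load, 2}, {Opcode::Call, 3}};

  // Cache only the opcodes the analysis cares about, keyed by opcode.
  std::map<Opcode, std::vector<const Inst *>> OpcodeInstMap;
  for (const Inst &I : Body)
    if (I.Op == Opcode::Call || I.Op == Opcode::Load || I.Op == Opcode::Store)
      OpcodeInstMap[I.Op].push_back(&I);

  std::printf("cached %zu call instructions\n",
              OpcodeInstMap[Opcode::Call].size());
}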
AAResults *InformationCache::getAAResultsForFunction(const Function &F) {
return AG.getAnalysis<AAManager>(F);
}
-InformationCache::FunctionInfo::~FunctionInfo() {
- // The instruction vectors are allocated using a BumpPtrAllocator, we need to
- // manually destroy them.
- for (auto &It : OpcodeInstMap)
- It.getSecond()->~InstructionVectorTy();
-}
-
-void Attributor::recordDependence(const AbstractAttribute &FromAA,
- const AbstractAttribute &ToAA,
- DepClassTy DepClass) {
- // If we are outside of an update, thus before the actual fixpoint iteration
- // started (= when we create AAs), we do not track dependences because we will
- // put all AAs into the initial worklist anyway.
- if (DependenceStack.empty())
- return;
- if (FromAA.getState().isAtFixpoint())
- return;
- DependenceStack.back()->push_back({&FromAA, &ToAA, DepClass});
-}
-
-void Attributor::rememberDependences() {
- assert(!DependenceStack.empty() && "No dependences to remember!");
-
- for (DepInfo &DI : *DependenceStack.back()) {
- auto &DepAAs = const_cast<AbstractAttribute &>(*DI.FromAA).Deps;
- DepAAs.push_back(AbstractAttribute::DepTy(
- const_cast<AbstractAttribute *>(DI.ToAA), unsigned(DI.DepClass)));
- }
-}
-
-void Attributor::identifyDefaultAbstractAttributes(Function &F) {
- if (!VisitedFunctions.insert(&F).second)
- return;
- if (F.isDeclaration())
- return;
-
- // In non-module runs we need to look at the call sites of a function to
- // determine if it is part of a must-tail call edge. This will influence what
- // attributes we can derive.
- InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F);
- if (!isModulePass() && !FI.CalledViaMustTail) {
- for (const Use &U : F.uses())
- if (const auto *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U) && CB->isMustTailCall())
- FI.CalledViaMustTail = true;
- }
-
- IRPosition FPos = IRPosition::function(F);
-
- // Check for dead BasicBlocks in every function.
- // We need dead instruction detection because we do not want to deal with
- // broken IR in which SSA rules do not apply.
- getOrCreateAAFor<AAIsDead>(FPos);
-
- // Every function might be "will-return".
- getOrCreateAAFor<AAWillReturn>(FPos);
-
- // Every function might contain instructions that cause "undefined behavior".
- getOrCreateAAFor<AAUndefinedBehavior>(FPos);
-
- // Every function can be nounwind.
- getOrCreateAAFor<AANoUnwind>(FPos);
-
- // Every function might be marked "nosync"
- getOrCreateAAFor<AANoSync>(FPos);
-
- // Every function might be "no-free".
- getOrCreateAAFor<AANoFree>(FPos);
-
- // Every function might be "no-return".
- getOrCreateAAFor<AANoReturn>(FPos);
-
- // Every function might be "no-recurse".
- getOrCreateAAFor<AANoRecurse>(FPos);
-
- // Every function might be "readnone/readonly/writeonly/...".
- getOrCreateAAFor<AAMemoryBehavior>(FPos);
-
- // Every function can be "readnone/argmemonly/inaccessiblememonly/...".
- getOrCreateAAFor<AAMemoryLocation>(FPos);
-
- // Every function might be applicable for Heap-To-Stack conversion.
- if (EnableHeapToStack)
- getOrCreateAAFor<AAHeapToStack>(FPos);
-
- // Return attributes are only appropriate if the return type is non void.
- Type *ReturnType = F.getReturnType();
- if (!ReturnType->isVoidTy()) {
- // Argument attribute "returned" --- Create only one per function even
- // though it is an argument attribute.
- getOrCreateAAFor<AAReturnedValues>(FPos);
-
- IRPosition RetPos = IRPosition::returned(F);
-
- // Every returned value might be dead.
- getOrCreateAAFor<AAIsDead>(RetPos);
-
- // Every function might be simplified.
- getOrCreateAAFor<AAValueSimplify>(RetPos);
-
+InformationCache::FunctionInfo::~FunctionInfo() {
+ // The instruction vectors are allocated using a BumpPtrAllocator, we need to
+ // manually destroy them.
+ for (auto &It : OpcodeInstMap)
+ It.getSecond()->~InstructionVectorTy();
+}
+
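FunctionInfo::~FunctionInfo() has to call the instruction-vector destructors by hand because the vectors live in a BumpPtrAllocator, which reclaims memory in bulk without running destructors. The sketch below reproduces that obligation with a hand-rolled arena; Arena and InstructionVector are illustrative stand-ins, not the LLVM classes.

// arena_dtor_sketch.cpp -- illustrative pattern only.
#include <cstddef>
#include <cstdio>
#include <new>
#include <vector>

struct Arena { // stand-in for a bump-pointer allocator
  alignas(std::max_align_t) unsigned char Buffer[1024];
  std::size_t Offset = 0;
  void *allocate(std::size_t N) {
    void *P = Buffer + Offset;
    Offset += N;
    return P; // memory is reclaimed wholesale, never per object
  }
};

struct InstructionVector {
  std::vector<int> Insts;
  ~InstructionVector() { std::printf("destroying cached instruction vector\n"); }
};

int main() {
  Arena A;
  // Placement-new into the arena: because the arena frees memory in bulk, the
  // destructor is *not* run automatically and has to be invoked by hand,
  // much like FunctionInfo::~FunctionInfo does for its OpcodeInstMap entries.
  auto *IV = new (A.allocate(sizeof(InstructionVector))) InstructionVector();
  IV->Insts.push_back(42);
  IV->~InstructionVector(); // manual destruction before the arena goes away
}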
+void Attributor::recordDependence(const AbstractAttribute &FromAA,
+ const AbstractAttribute &ToAA,
+ DepClassTy DepClass) {
+ // If we are outside of an update, thus before the actual fixpoint iteration
+ // started (= when we create AAs), we do not track dependences because we will
+ // put all AAs into the initial worklist anyway.
+ if (DependenceStack.empty())
+ return;
+ if (FromAA.getState().isAtFixpoint())
+ return;
+ DependenceStack.back()->push_back({&FromAA, &ToAA, DepClass});
+}
+
+void Attributor::rememberDependences() {
+ assert(!DependenceStack.empty() && "No dependences to remember!");
+
+ for (DepInfo &DI : *DependenceStack.back()) {
+ auto &DepAAs = const_cast<AbstractAttribute &>(*DI.FromAA).Deps;
+ DepAAs.push_back(AbstractAttribute::DepTy(
+ const_cast<AbstractAttribute *>(DI.ToAA), unsigned(DI.DepClass)));
+ }
+}
+
+void Attributor::identifyDefaultAbstractAttributes(Function &F) {
+ if (!VisitedFunctions.insert(&F).second)
+ return;
+ if (F.isDeclaration())
+ return;
+
+ // In non-module runs we need to look at the call sites of a function to
+ // determine if it is part of a must-tail call edge. This will influence what
+ // attributes we can derive.
+ InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F);
+ if (!isModulePass() && !FI.CalledViaMustTail) {
+ for (const Use &U : F.uses())
+ if (const auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U) && CB->isMustTailCall())
+ FI.CalledViaMustTail = true;
+ }
+
+ IRPosition FPos = IRPosition::function(F);
+
+ // Check for dead BasicBlocks in every function.
+ // We need dead instruction detection because we do not want to deal with
+ // broken IR in which SSA rules do not apply.
+ getOrCreateAAFor<AAIsDead>(FPos);
+
+ // Every function might be "will-return".
+ getOrCreateAAFor<AAWillReturn>(FPos);
+
+ // Every function might contain instructions that cause "undefined behavior".
+ getOrCreateAAFor<AAUndefinedBehavior>(FPos);
+
+ // Every function can be nounwind.
+ getOrCreateAAFor<AANoUnwind>(FPos);
+
+ // Every function might be marked "nosync"
+ getOrCreateAAFor<AANoSync>(FPos);
+
+ // Every function might be "no-free".
+ getOrCreateAAFor<AANoFree>(FPos);
+
+ // Every function might be "no-return".
+ getOrCreateAAFor<AANoReturn>(FPos);
+
+ // Every function might be "no-recurse".
+ getOrCreateAAFor<AANoRecurse>(FPos);
+
+ // Every function might be "readnone/readonly/writeonly/...".
+ getOrCreateAAFor<AAMemoryBehavior>(FPos);
+
+ // Every function can be "readnone/argmemonly/inaccessiblememonly/...".
+ getOrCreateAAFor<AAMemoryLocation>(FPos);
+
+ // Every function might be applicable for Heap-To-Stack conversion.
+ if (EnableHeapToStack)
+ getOrCreateAAFor<AAHeapToStack>(FPos);
+
+ // Return attributes are only appropriate if the return type is non void.
+ Type *ReturnType = F.getReturnType();
+ if (!ReturnType->isVoidTy()) {
+ // Argument attribute "returned" --- Create only one per function even
+ // though it is an argument attribute.
+ getOrCreateAAFor<AAReturnedValues>(FPos);
+
+ IRPosition RetPos = IRPosition::returned(F);
+
+ // Every returned value might be dead.
+ getOrCreateAAFor<AAIsDead>(RetPos);
+
+ // Every function might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(RetPos);
+
// Every returned value might be marked noundef.
getOrCreateAAFor<AANoUndef>(RetPos);
- if (ReturnType->isPointerTy()) {
-
- // Every function with pointer return type might be marked align.
- getOrCreateAAFor<AAAlign>(RetPos);
-
- // Every function with pointer return type might be marked nonnull.
- getOrCreateAAFor<AANonNull>(RetPos);
-
- // Every function with pointer return type might be marked noalias.
- getOrCreateAAFor<AANoAlias>(RetPos);
-
- // Every function with pointer return type might be marked
- // dereferenceable.
- getOrCreateAAFor<AADereferenceable>(RetPos);
- }
- }
-
- for (Argument &Arg : F.args()) {
- IRPosition ArgPos = IRPosition::argument(Arg);
-
- // Every argument might be simplified.
- getOrCreateAAFor<AAValueSimplify>(ArgPos);
-
- // Every argument might be dead.
- getOrCreateAAFor<AAIsDead>(ArgPos);
-
+ if (ReturnType->isPointerTy()) {
+
+ // Every function with pointer return type might be marked align.
+ getOrCreateAAFor<AAAlign>(RetPos);
+
+ // Every function with pointer return type might be marked nonnull.
+ getOrCreateAAFor<AANonNull>(RetPos);
+
+ // Every function with pointer return type might be marked noalias.
+ getOrCreateAAFor<AANoAlias>(RetPos);
+
+ // Every function with pointer return type might be marked
+ // dereferenceable.
+ getOrCreateAAFor<AADereferenceable>(RetPos);
+ }
+ }
+
+ for (Argument &Arg : F.args()) {
+ IRPosition ArgPos = IRPosition::argument(Arg);
+
+ // Every argument might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(ArgPos);
+
+ // Every argument might be dead.
+ getOrCreateAAFor<AAIsDead>(ArgPos);
+
// Every argument might be marked noundef.
getOrCreateAAFor<AANoUndef>(ArgPos);
- if (Arg.getType()->isPointerTy()) {
- // Every argument with pointer type might be marked nonnull.
- getOrCreateAAFor<AANonNull>(ArgPos);
-
- // Every argument with pointer type might be marked noalias.
- getOrCreateAAFor<AANoAlias>(ArgPos);
-
- // Every argument with pointer type might be marked dereferenceable.
- getOrCreateAAFor<AADereferenceable>(ArgPos);
-
- // Every argument with pointer type might be marked align.
- getOrCreateAAFor<AAAlign>(ArgPos);
-
- // Every argument with pointer type might be marked nocapture.
- getOrCreateAAFor<AANoCapture>(ArgPos);
-
- // Every argument with pointer type might be marked
- // "readnone/readonly/writeonly/..."
- getOrCreateAAFor<AAMemoryBehavior>(ArgPos);
-
- // Every argument with pointer type might be marked nofree.
- getOrCreateAAFor<AANoFree>(ArgPos);
-
- // Every argument with pointer type might be privatizable (or promotable)
- getOrCreateAAFor<AAPrivatizablePtr>(ArgPos);
- }
- }
-
- auto CallSitePred = [&](Instruction &I) -> bool {
- auto &CB = cast<CallBase>(I);
- IRPosition CBRetPos = IRPosition::callsite_returned(CB);
-
-    // Call sites might be dead if they have no side effects and no live
- // users. The return value might be dead if there are no live users.
- getOrCreateAAFor<AAIsDead>(CBRetPos);
-
- Function *Callee = CB.getCalledFunction();
- // TODO: Even if the callee is not known now we might be able to simplify
- // the call/callee.
- if (!Callee)
- return true;
-
- // Skip declarations except if annotations on their call sites were
- // explicitly requested.
- if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
- !Callee->hasMetadata(LLVMContext::MD_callback))
- return true;
-
- if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) {
-
- IRPosition CBRetPos = IRPosition::callsite_returned(CB);
-
- // Call site return integer values might be limited by a constant range.
- if (Callee->getReturnType()->isIntegerTy())
- getOrCreateAAFor<AAValueConstantRange>(CBRetPos);
- }
-
- for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
-
- IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
-
- // Every call site argument might be dead.
- getOrCreateAAFor<AAIsDead>(CBArgPos);
-
- // Call site argument might be simplified.
- getOrCreateAAFor<AAValueSimplify>(CBArgPos);
-
+ if (Arg.getType()->isPointerTy()) {
+ // Every argument with pointer type might be marked nonnull.
+ getOrCreateAAFor<AANonNull>(ArgPos);
+
+ // Every argument with pointer type might be marked noalias.
+ getOrCreateAAFor<AANoAlias>(ArgPos);
+
+ // Every argument with pointer type might be marked dereferenceable.
+ getOrCreateAAFor<AADereferenceable>(ArgPos);
+
+ // Every argument with pointer type might be marked align.
+ getOrCreateAAFor<AAAlign>(ArgPos);
+
+ // Every argument with pointer type might be marked nocapture.
+ getOrCreateAAFor<AANoCapture>(ArgPos);
+
+ // Every argument with pointer type might be marked
+ // "readnone/readonly/writeonly/..."
+ getOrCreateAAFor<AAMemoryBehavior>(ArgPos);
+
+ // Every argument with pointer type might be marked nofree.
+ getOrCreateAAFor<AANoFree>(ArgPos);
+
+ // Every argument with pointer type might be privatizable (or promotable)
+ getOrCreateAAFor<AAPrivatizablePtr>(ArgPos);
+ }
+ }
+
+ auto CallSitePred = [&](Instruction &I) -> bool {
+ auto &CB = cast<CallBase>(I);
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
+
+    // Call sites might be dead if they have no side effects and no live
+ // users. The return value might be dead if there are no live users.
+ getOrCreateAAFor<AAIsDead>(CBRetPos);
+
+ Function *Callee = CB.getCalledFunction();
+ // TODO: Even if the callee is not known now we might be able to simplify
+ // the call/callee.
+ if (!Callee)
+ return true;
+
+ // Skip declarations except if annotations on their call sites were
+ // explicitly requested.
+ if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
+ !Callee->hasMetadata(LLVMContext::MD_callback))
+ return true;
+
+ if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) {
+
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
+
+ // Call site return integer values might be limited by a constant range.
+ if (Callee->getReturnType()->isIntegerTy())
+ getOrCreateAAFor<AAValueConstantRange>(CBRetPos);
+ }
+
+ for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
+
+ IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
+
+ // Every call site argument might be dead.
+ getOrCreateAAFor<AAIsDead>(CBArgPos);
+
+ // Call site argument might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(CBArgPos);
+
// Every call site argument might be marked "noundef".
getOrCreateAAFor<AANoUndef>(CBArgPos);
- if (!CB.getArgOperand(I)->getType()->isPointerTy())
- continue;
-
- // Call site argument attribute "non-null".
- getOrCreateAAFor<AANonNull>(CBArgPos);
-
- // Call site argument attribute "nocapture".
- getOrCreateAAFor<AANoCapture>(CBArgPos);
-
- // Call site argument attribute "no-alias".
- getOrCreateAAFor<AANoAlias>(CBArgPos);
-
- // Call site argument attribute "dereferenceable".
- getOrCreateAAFor<AADereferenceable>(CBArgPos);
-
- // Call site argument attribute "align".
- getOrCreateAAFor<AAAlign>(CBArgPos);
-
- // Call site argument attribute
- // "readnone/readonly/writeonly/..."
- getOrCreateAAFor<AAMemoryBehavior>(CBArgPos);
-
- // Call site argument attribute "nofree".
- getOrCreateAAFor<AANoFree>(CBArgPos);
- }
- return true;
- };
-
- auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F);
- bool Success;
- Success = checkForAllInstructionsImpl(
- nullptr, OpcodeInstMap, CallSitePred, nullptr, nullptr,
- {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
- (unsigned)Instruction::Call});
- (void)Success;
- assert(Success && "Expected the check call to be successful!");
-
- auto LoadStorePred = [&](Instruction &I) -> bool {
- if (isa<LoadInst>(I))
- getOrCreateAAFor<AAAlign>(
- IRPosition::value(*cast<LoadInst>(I).getPointerOperand()));
- else
- getOrCreateAAFor<AAAlign>(
- IRPosition::value(*cast<StoreInst>(I).getPointerOperand()));
- return true;
- };
- Success = checkForAllInstructionsImpl(
- nullptr, OpcodeInstMap, LoadStorePred, nullptr, nullptr,
- {(unsigned)Instruction::Load, (unsigned)Instruction::Store});
- (void)Success;
- assert(Success && "Expected the check call to be successful!");
-}
-
-/// Helpers to ease debugging through output streams and print calls.
-///
-///{
-raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) {
- return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged");
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) {
- switch (AP) {
- case IRPosition::IRP_INVALID:
- return OS << "inv";
- case IRPosition::IRP_FLOAT:
- return OS << "flt";
- case IRPosition::IRP_RETURNED:
- return OS << "fn_ret";
- case IRPosition::IRP_CALL_SITE_RETURNED:
- return OS << "cs_ret";
- case IRPosition::IRP_FUNCTION:
- return OS << "fn";
- case IRPosition::IRP_CALL_SITE:
- return OS << "cs";
- case IRPosition::IRP_ARGUMENT:
- return OS << "arg";
- case IRPosition::IRP_CALL_SITE_ARGUMENT:
- return OS << "cs_arg";
- }
- llvm_unreachable("Unknown attribute position!");
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) {
- const Value &AV = Pos.getAssociatedValue();
- return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " ["
+ if (!CB.getArgOperand(I)->getType()->isPointerTy())
+ continue;
+
+ // Call site argument attribute "non-null".
+ getOrCreateAAFor<AANonNull>(CBArgPos);
+
+ // Call site argument attribute "nocapture".
+ getOrCreateAAFor<AANoCapture>(CBArgPos);
+
+ // Call site argument attribute "no-alias".
+ getOrCreateAAFor<AANoAlias>(CBArgPos);
+
+ // Call site argument attribute "dereferenceable".
+ getOrCreateAAFor<AADereferenceable>(CBArgPos);
+
+ // Call site argument attribute "align".
+ getOrCreateAAFor<AAAlign>(CBArgPos);
+
+ // Call site argument attribute
+ // "readnone/readonly/writeonly/..."
+ getOrCreateAAFor<AAMemoryBehavior>(CBArgPos);
+
+ // Call site argument attribute "nofree".
+ getOrCreateAAFor<AANoFree>(CBArgPos);
+ }
+ return true;
+ };
+
+ auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F);
+ bool Success;
+ Success = checkForAllInstructionsImpl(
+ nullptr, OpcodeInstMap, CallSitePred, nullptr, nullptr,
+ {(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
+ (unsigned)Instruction::Call});
+ (void)Success;
+ assert(Success && "Expected the check call to be successful!");
+
+ auto LoadStorePred = [&](Instruction &I) -> bool {
+ if (isa<LoadInst>(I))
+ getOrCreateAAFor<AAAlign>(
+ IRPosition::value(*cast<LoadInst>(I).getPointerOperand()));
+ else
+ getOrCreateAAFor<AAAlign>(
+ IRPosition::value(*cast<StoreInst>(I).getPointerOperand()));
+ return true;
+ };
+ Success = checkForAllInstructionsImpl(
+ nullptr, OpcodeInstMap, LoadStorePred, nullptr, nullptr,
+ {(unsigned)Instruction::Load, (unsigned)Instruction::Store});
+ (void)Success;
+ assert(Success && "Expected the check call to be successful!");
+}
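// A minimal sketch of how the seeding above is typically driven (assuming a
// module M plus an InformationCache and CallGraphUpdater set up as in
// runAttributorOnFunctions further below): register the functions, seed the
// default abstract attributes, then run the fixpoint iteration.
//
//   SetVector<Function *> Functions;
//   for (Function &Fn : M)
//     Functions.insert(&Fn);
//   Attributor A(Functions, InfoCache, CGUpdater);
//   for (Function *Fn : Functions)
//     A.identifyDefaultAbstractAttributes(*Fn);
//   ChangeStatus Changed = A.run();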
+
+/// Helpers to ease debugging through output streams and print calls.
+///
+///{
+raw_ostream &llvm::operator<<(raw_ostream &OS, ChangeStatus S) {
+ return OS << (S == ChangeStatus::CHANGED ? "changed" : "unchanged");
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, IRPosition::Kind AP) {
+ switch (AP) {
+ case IRPosition::IRP_INVALID:
+ return OS << "inv";
+ case IRPosition::IRP_FLOAT:
+ return OS << "flt";
+ case IRPosition::IRP_RETURNED:
+ return OS << "fn_ret";
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ return OS << "cs_ret";
+ case IRPosition::IRP_FUNCTION:
+ return OS << "fn";
+ case IRPosition::IRP_CALL_SITE:
+ return OS << "cs";
+ case IRPosition::IRP_ARGUMENT:
+ return OS << "arg";
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ return OS << "cs_arg";
+ }
+ llvm_unreachable("Unknown attribute position!");
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) {
+ const Value &AV = Pos.getAssociatedValue();
+ return OS << "{" << Pos.getPositionKind() << ":" << AV.getName() << " ["
<< Pos.getAnchorValue().getName() << "@" << Pos.getCallSiteArgNo()
<< "]}";
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) {
- OS << "range-state(" << S.getBitWidth() << ")<";
- S.getKnown().print(OS);
- OS << " / ";
- S.getAssumed().print(OS);
- OS << ">";
-
- return OS << static_cast<const AbstractState &>(S);
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) {
- return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? "fix" : ""));
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) {
- AA.print(OS);
- return OS;
-}
-
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) {
+ OS << "range-state(" << S.getBitWidth() << ")<";
+ S.getKnown().print(OS);
+ OS << " / ";
+ S.getAssumed().print(OS);
+ OS << ">";
+
+ return OS << static_cast<const AbstractState &>(S);
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractState &S) {
+ return OS << (!S.isValidState() ? "top" : (S.isAtFixpoint() ? "fix" : ""));
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const AbstractAttribute &AA) {
+ AA.print(OS);
+ return OS;
+}
+
raw_ostream &llvm::operator<<(raw_ostream &OS,
const PotentialConstantIntValuesState &S) {
OS << "set-state(< {";
@@ -2239,7 +2239,7 @@ raw_ostream &llvm::operator<<(raw_ostream &OS,
return OS;
}
-void AbstractAttribute::print(raw_ostream &OS) const {
+void AbstractAttribute::print(raw_ostream &OS) const {
OS << "[";
OS << getName();
OS << "] for CtxI ";
@@ -2253,7 +2253,7 @@ void AbstractAttribute::print(raw_ostream &OS) const {
OS << " at position " << getIRPosition() << " with state " << getAsStr()
<< '\n';
-}
+}
void AbstractAttribute::printWithDeps(raw_ostream &OS) const {
print(OS);
@@ -2266,32 +2266,32 @@ void AbstractAttribute::printWithDeps(raw_ostream &OS) const {
OS << '\n';
}
-///}
-
-/// ----------------------------------------------------------------------------
-/// Pass (Manager) Boilerplate
-/// ----------------------------------------------------------------------------
-
-static bool runAttributorOnFunctions(InformationCache &InfoCache,
- SetVector<Function *> &Functions,
- AnalysisGetter &AG,
- CallGraphUpdater &CGUpdater) {
- if (Functions.empty())
- return false;
-
- LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << Functions.size()
- << " functions.\n");
-
- // Create an Attributor and initially empty information cache that is filled
- // while we identify default attribute opportunities.
- Attributor A(Functions, InfoCache, CGUpdater);
-
- // Create shallow wrappers for all functions that are not IPO amendable
- if (AllowShallowWrappers)
- for (Function *F : Functions)
- if (!A.isFunctionIPOAmendable(*F))
+///}
+
+/// ----------------------------------------------------------------------------
+/// Pass (Manager) Boilerplate
+/// ----------------------------------------------------------------------------
+
+static bool runAttributorOnFunctions(InformationCache &InfoCache,
+ SetVector<Function *> &Functions,
+ AnalysisGetter &AG,
+ CallGraphUpdater &CGUpdater) {
+ if (Functions.empty())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << Functions.size()
+ << " functions.\n");
+
+ // Create an Attributor and initially empty information cache that is filled
+ // while we identify default attribute opportunities.
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ // Create shallow wrappers for all functions that are not IPO amendable
+ if (AllowShallowWrappers)
+ for (Function *F : Functions)
+ if (!A.isFunctionIPOAmendable(*F))
Attributor::createShallowWrapper(*F);
-
+
// Internalize non-exact functions
// TODO: for now we eagerly internalize functions without calculating the
// cost, we need a cost interface to determine whether internalizing
@@ -2316,36 +2316,36 @@ static bool runAttributorOnFunctions(InformationCache &InfoCache,
}
}
- for (Function *F : Functions) {
- if (F->hasExactDefinition())
- NumFnWithExactDefinition++;
- else
- NumFnWithoutExactDefinition++;
-
- // We look at internal functions only on-demand but if any use is not a
+ for (Function *F : Functions) {
+ if (F->hasExactDefinition())
+ NumFnWithExactDefinition++;
+ else
+ NumFnWithoutExactDefinition++;
+
+ // We look at internal functions only on-demand but if any use is not a
// direct call or outside the current set of analyzed functions, we have
// to do it eagerly.
- if (F->hasLocalLinkage()) {
- if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
- const auto *CB = dyn_cast<CallBase>(U.getUser());
- return CB && CB->isCallee(&U) &&
- Functions.count(const_cast<Function *>(CB->getCaller()));
- }))
- continue;
- }
-
- // Populate the Attributor with abstract attribute opportunities in the
- // function and the information cache with IR information.
- A.identifyDefaultAbstractAttributes(*F);
- }
-
- ChangeStatus Changed = A.run();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size()
- << " functions, result: " << Changed << ".\n");
- return Changed == ChangeStatus::CHANGED;
-}
-
+ if (F->hasLocalLinkage()) {
+ if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ return CB && CB->isCallee(&U) &&
+ Functions.count(const_cast<Function *>(CB->getCaller()));
+ }))
+ continue;
+ }
+
+ // Populate the Attributor with abstract attribute opportunities in the
+ // function and the information cache with IR information.
+ A.identifyDefaultAbstractAttributes(*F);
+ }
+
+ ChangeStatus Changed = A.run();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size()
+ << " functions, result: " << Changed << ".\n");
+ return Changed == ChangeStatus::CHANGED;
+}
+
void AADepGraph::viewGraph() { llvm::ViewGraph(this, "Dependency Graph"); }
void AADepGraph::dumpGraph() {
@@ -2375,54 +2375,54 @@ void AADepGraph::print() {
cast<AbstractAttribute>(DepAA.getPointer())->printWithDeps(outs());
}
-PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- AnalysisGetter AG(FAM);
-
- SetVector<Function *> Functions;
- for (Function &F : M)
- Functions.insert(&F);
-
- CallGraphUpdater CGUpdater;
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
- if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
- // FIXME: Think about passes we will preserve and add them here.
- return PreservedAnalyses::none();
- }
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
- AnalysisGetter AG(FAM);
-
- SetVector<Function *> Functions;
- for (LazyCallGraph::Node &N : C)
- Functions.insert(&N.getFunction());
-
- if (Functions.empty())
- return PreservedAnalyses::all();
-
- Module &M = *Functions.back()->getParent();
- CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, C, AM, UR);
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
- if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
- // FIXME: Think about passes we will preserve and add them here.
+PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
+ // FIXME: Think about passes we will preserve and add them here.
+ return PreservedAnalyses::none();
+ }
+ return PreservedAnalyses::all();
+}
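// Usage sketch for the new-pass-manager entry point above (assuming a
// ModuleAnalysisManager MAM wired up through a PassBuilder):
//
//   ModulePassManager MPM;
//   MPM.addPass(AttributorPass());
//   MPM.run(M, MAM);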
+
+PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (LazyCallGraph::Node &N : C)
+ Functions.insert(&N.getFunction());
+
+ if (Functions.empty())
+ return PreservedAnalyses::all();
+
+ Module &M = *Functions.back()->getParent();
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
+ // FIXME: Think about passes we will preserve and add them here.
PreservedAnalyses PA;
PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
return PA;
- }
- return PreservedAnalyses::all();
-}
-
+ }
+ return PreservedAnalyses::all();
+}
+
namespace llvm {
template <> struct GraphTraits<AADepGraphNode *> {
@@ -2468,93 +2468,93 @@ template <> struct DOTGraphTraits<AADepGraph *> : public DefaultDOTGraphTraits {
} // end namespace llvm
-namespace {
-
-struct AttributorLegacyPass : public ModulePass {
- static char ID;
-
- AttributorLegacyPass() : ModulePass(ID) {
- initializeAttributorLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- AnalysisGetter AG;
- SetVector<Function *> Functions;
- for (Function &F : M)
- Functions.insert(&F);
-
- CallGraphUpdater CGUpdater;
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
- return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // FIXME: Think about passes we will preserve and add them here.
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-struct AttributorCGSCCLegacyPass : public CallGraphSCCPass {
- static char ID;
-
- AttributorCGSCCLegacyPass() : CallGraphSCCPass(ID) {
- initializeAttributorCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override {
- if (skipSCC(SCC))
- return false;
-
- SetVector<Function *> Functions;
- for (CallGraphNode *CGN : SCC)
- if (Function *Fn = CGN->getFunction())
- if (!Fn->isDeclaration())
- Functions.insert(Fn);
-
- if (Functions.empty())
- return false;
-
- AnalysisGetter AG;
- CallGraph &CG = const_cast<CallGraph &>(SCC.getCallGraph());
+namespace {
+
+struct AttributorLegacyPass : public ModulePass {
+ static char ID;
+
+ AttributorLegacyPass() : ModulePass(ID) {
+ initializeAttributorLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AnalysisGetter AG;
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, SCC);
- Module &M = *Functions.back()->getParent();
- BumpPtrAllocator Allocator;
- InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
- return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // FIXME: Think about passes we will preserve and add them here.
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
-Pass *llvm::createAttributorCGSCCLegacyPass() {
- return new AttributorCGSCCLegacyPass();
-}
-
-char AttributorLegacyPass::ID = 0;
-char AttributorCGSCCLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor",
- "Deduce and propagate attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(AttributorLegacyPass, "attributor",
- "Deduce and propagate attributes", false, false)
-INITIALIZE_PASS_BEGIN(AttributorCGSCCLegacyPass, "attributor-cgscc",
- "Deduce and propagate attributes (CGSCC pass)", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(AttributorCGSCCLegacyPass, "attributor-cgscc",
- "Deduce and propagate attributes (CGSCC pass)", false,
- false)
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: Think about passes we will preserve and add them here.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+struct AttributorCGSCCLegacyPass : public CallGraphSCCPass {
+ static char ID;
+
+ AttributorCGSCCLegacyPass() : CallGraphSCCPass(ID) {
+ initializeAttributorCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override {
+ if (skipSCC(SCC))
+ return false;
+
+ SetVector<Function *> Functions;
+ for (CallGraphNode *CGN : SCC)
+ if (Function *Fn = CGN->getFunction())
+ if (!Fn->isDeclaration())
+ Functions.insert(Fn);
+
+ if (Functions.empty())
+ return false;
+
+ AnalysisGetter AG;
+ CallGraph &CG = const_cast<CallGraph &>(SCC.getCallGraph());
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, SCC);
+ Module &M = *Functions.back()->getParent();
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: Think about passes we will preserve and add them here.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
+Pass *llvm::createAttributorCGSCCLegacyPass() {
+ return new AttributorCGSCCLegacyPass();
+}
+
+char AttributorLegacyPass::ID = 0;
+char AttributorCGSCCLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor",
+ "Deduce and propagate attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(AttributorLegacyPass, "attributor",
+ "Deduce and propagate attributes", false, false)
+INITIALIZE_PASS_BEGIN(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
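// Usage sketch for the legacy pass-manager wiring above (a hypothetical
// driver; a Module M is assumed):
//
//   legacy::PassManager PM;
//   PM.add(llvm::createAttributorLegacyPass());
//   PM.run(M);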
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
index e83d2df7d2..d6127a8df6 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -1,52 +1,52 @@
-//===- AttributorAttributes.cpp - Attributes for Attributor deduction -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// See the Attributor.h file comment and the class descriptions in that file for
-// more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Attributor.h"
-
+//===- AttributorAttributes.cpp - Attributes for Attributor deduction -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// See the Attributor.h file comment and the class descriptions in that file for
+// more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Attributor.h"
+
#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO/ArgumentPromotion.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "attributor"
-
-static cl::opt<bool> ManifestInternal(
- "attributor-manifest-internal", cl::Hidden,
- cl::desc("Manifest Attributor internal string attributes."),
- cl::init(false));
-
-static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
- cl::Hidden);
-
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "attributor"
+
+static cl::opt<bool> ManifestInternal(
+ "attributor-manifest-internal", cl::Hidden,
+ cl::desc("Manifest Attributor internal string attributes."),
+ cl::init(false));
+
+static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
+ cl::Hidden);
+
template <>
unsigned llvm::PotentialConstantIntValuesState::MaxPotentialValues = 0;
@@ -57,1655 +57,1655 @@ static cl::opt<unsigned, true> MaxPotentialValues(
cl::location(llvm::PotentialConstantIntValuesState::MaxPotentialValues),
cl::init(7));
-STATISTIC(NumAAs, "Number of abstract attributes created");
-
-// Some helper macros to deal with statistics tracking.
-//
-// Usage:
-// For simple IR attribute tracking overload trackStatistics in the abstract
-// attribute and choose the right STATS_DECLTRACK_********* macro,
-// e.g.,:
-// void trackStatistics() const override {
-// STATS_DECLTRACK_ARG_ATTR(returned)
-// }
-// If there is a single "increment" side one can use the macro
-// STATS_DECLTRACK with a custom message. If there are multiple increment
-// sides, STATS_DECL and STATS_TRACK can also be used separately.
-//
-#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
- ("Number of " #TYPE " marked '" #NAME "'")
-#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
-#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
-#define STATS_DECL(NAME, TYPE, MSG) \
- STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
-#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
-#define STATS_DECLTRACK(NAME, TYPE, MSG) \
- { \
- STATS_DECL(NAME, TYPE, MSG) \
- STATS_TRACK(NAME, TYPE) \
- }
-#define STATS_DECLTRACK_ARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
-#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSArguments, \
- BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
-#define STATS_DECLTRACK_FN_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
-#define STATS_DECLTRACK_CS_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
-#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, FunctionReturn, \
- BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
-#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSReturn, \
- BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
-#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Floating, \
- ("Number of floating values known to be '" #NAME "'"))
-
-// Specialization of the operator<< for abstract attributes subclasses. This
-// disambiguates situations where multiple operators are applicable.
-namespace llvm {
-#define PIPE_OPERATOR(CLASS) \
- raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
- return OS << static_cast<const AbstractAttribute &>(AA); \
- }
-
-PIPE_OPERATOR(AAIsDead)
-PIPE_OPERATOR(AANoUnwind)
-PIPE_OPERATOR(AANoSync)
-PIPE_OPERATOR(AANoRecurse)
-PIPE_OPERATOR(AAWillReturn)
-PIPE_OPERATOR(AANoReturn)
-PIPE_OPERATOR(AAReturnedValues)
-PIPE_OPERATOR(AANonNull)
-PIPE_OPERATOR(AANoAlias)
-PIPE_OPERATOR(AADereferenceable)
-PIPE_OPERATOR(AAAlign)
-PIPE_OPERATOR(AANoCapture)
-PIPE_OPERATOR(AAValueSimplify)
-PIPE_OPERATOR(AANoFree)
-PIPE_OPERATOR(AAHeapToStack)
-PIPE_OPERATOR(AAReachability)
-PIPE_OPERATOR(AAMemoryBehavior)
-PIPE_OPERATOR(AAMemoryLocation)
-PIPE_OPERATOR(AAValueConstantRange)
-PIPE_OPERATOR(AAPrivatizablePtr)
-PIPE_OPERATOR(AAUndefinedBehavior)
+STATISTIC(NumAAs, "Number of abstract attributes created");
+
+// Some helper macros to deal with statistics tracking.
+//
+// Usage:
+// For simple IR attribute tracking overload trackStatistics in the abstract
+// attribute and choose the right STATS_DECLTRACK_********* macro,
+// e.g.,:
+// void trackStatistics() const override {
+// STATS_DECLTRACK_ARG_ATTR(returned)
+// }
+// If there is a single "increment" side one can use the macro
+// STATS_DECLTRACK with a custom message. If there are multiple increment
+// sides, STATS_DECL and STATS_TRACK can also be used separately.
+//
+#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
+ ("Number of " #TYPE " marked '" #NAME "'")
+#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
+#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
+#define STATS_DECL(NAME, TYPE, MSG) \
+ STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
+#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
+#define STATS_DECLTRACK(NAME, TYPE, MSG) \
+ { \
+ STATS_DECL(NAME, TYPE, MSG) \
+ STATS_TRACK(NAME, TYPE) \
+ }
+#define STATS_DECLTRACK_ARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
+#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSArguments, \
+ BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
+#define STATS_DECLTRACK_FN_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
+#define STATS_DECLTRACK_CS_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
+#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, FunctionReturn, \
+ BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
+#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSReturn, \
+ BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
+#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Floating, \
+ ("Number of floating values known to be '" #NAME "'"))
+
+// Specialization of the operator<< for abstract attributes subclasses. This
+// disambiguates situations where multiple operators are applicable.
+namespace llvm {
+#define PIPE_OPERATOR(CLASS) \
+ raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
+ return OS << static_cast<const AbstractAttribute &>(AA); \
+ }
+
+PIPE_OPERATOR(AAIsDead)
+PIPE_OPERATOR(AANoUnwind)
+PIPE_OPERATOR(AANoSync)
+PIPE_OPERATOR(AANoRecurse)
+PIPE_OPERATOR(AAWillReturn)
+PIPE_OPERATOR(AANoReturn)
+PIPE_OPERATOR(AAReturnedValues)
+PIPE_OPERATOR(AANonNull)
+PIPE_OPERATOR(AANoAlias)
+PIPE_OPERATOR(AADereferenceable)
+PIPE_OPERATOR(AAAlign)
+PIPE_OPERATOR(AANoCapture)
+PIPE_OPERATOR(AAValueSimplify)
+PIPE_OPERATOR(AANoFree)
+PIPE_OPERATOR(AAHeapToStack)
+PIPE_OPERATOR(AAReachability)
+PIPE_OPERATOR(AAMemoryBehavior)
+PIPE_OPERATOR(AAMemoryLocation)
+PIPE_OPERATOR(AAValueConstantRange)
+PIPE_OPERATOR(AAPrivatizablePtr)
+PIPE_OPERATOR(AAUndefinedBehavior)
PIPE_OPERATOR(AAPotentialValues)
PIPE_OPERATOR(AANoUndef)
-
-#undef PIPE_OPERATOR
-} // namespace llvm
-
-namespace {
-
-static Optional<ConstantInt *>
-getAssumedConstantInt(Attributor &A, const Value &V,
- const AbstractAttribute &AA,
- bool &UsedAssumedInformation) {
- Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
- if (C.hasValue())
- return dyn_cast_or_null<ConstantInt>(C.getValue());
- return llvm::None;
-}
-
-/// Get pointer operand of memory accessing instruction. If \p I is
-/// not a memory accessing instruction, return nullptr. If \p AllowVolatile
-/// is set to false and the instruction is volatile, return nullptr.
-static const Value *getPointerOperand(const Instruction *I,
- bool AllowVolatile) {
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- if (!AllowVolatile && LI->isVolatile())
- return nullptr;
- return LI->getPointerOperand();
- }
-
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (!AllowVolatile && SI->isVolatile())
- return nullptr;
- return SI->getPointerOperand();
- }
-
- if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!AllowVolatile && CXI->isVolatile())
- return nullptr;
- return CXI->getPointerOperand();
- }
-
- if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
- if (!AllowVolatile && RMWI->isVolatile())
- return nullptr;
- return RMWI->getPointerOperand();
- }
-
- return nullptr;
-}
-
-/// Helper function to create a pointer of type \p ResTy, based on \p Ptr, and
-/// advanced by \p Offset bytes. To aid later analysis the method tries to build
-/// getelementptr instructions that traverse the natural type of \p Ptr if
-/// possible. If that fails, the remaining offset is adjusted byte-wise, hence
-/// through a cast to i8*.
-///
-/// TODO: This could probably live somewhere more prominently if it doesn't
-/// already exist.
-static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset,
- IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
- assert(Offset >= 0 && "Negative offset not supported yet!");
- LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
- << "-bytes as " << *ResTy << "\n");
-
- // The initial type we are trying to traverse to get nice GEPs.
- Type *Ty = Ptr->getType();
-
- SmallVector<Value *, 4> Indices;
- std::string GEPName = Ptr->getName().str();
- while (Offset) {
- uint64_t Idx, Rem;
-
- if (auto *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- if (int64_t(SL->getSizeInBytes()) < Offset)
- break;
- Idx = SL->getElementContainingOffset(Offset);
- assert(Idx < STy->getNumElements() && "Offset calculation error!");
- Rem = Offset - SL->getElementOffset(Idx);
- Ty = STy->getElementType(Idx);
- } else if (auto *PTy = dyn_cast<PointerType>(Ty)) {
- Ty = PTy->getElementType();
- if (!Ty->isSized())
- break;
- uint64_t ElementSize = DL.getTypeAllocSize(Ty);
- assert(ElementSize && "Expected type with size!");
- Idx = Offset / ElementSize;
- Rem = Offset % ElementSize;
- } else {
- // Non-aggregate type, we cast and make byte-wise progress now.
- break;
- }
-
- LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
- << " Idx: " << Idx << " Rem: " << Rem << "\n");
-
- GEPName += "." + std::to_string(Idx);
- Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
- Offset = Rem;
- }
-
- // Create a GEP if we collected indices above.
- if (Indices.size())
- Ptr = IRB.CreateGEP(Ptr, Indices, GEPName);
-
- // If an offset is left we use byte-wise adjustment.
- if (Offset) {
- Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
- Ptr = IRB.CreateGEP(Ptr, IRB.getInt32(Offset),
- GEPName + ".b" + Twine(Offset));
- }
-
- // Ensure the result has the requested type.
- Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast");
-
- LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
- return Ptr;
-}
-
-/// Recursively visit all values that might become \p IRP at some point. This
-/// will be done by looking through cast instructions, selects, phis, and calls
-/// with the "returned" attribute. Once we cannot look through the value any
-/// further, the callback \p VisitValueCB is invoked and passed the current
-/// value, the \p State, and a flag to indicate if we stripped anything.
-/// Stripped means that we unpacked the value associated with \p IRP at least
-/// once. Note that the value used for the callback may still be the value
-/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
-/// we will never visit more values than specified by \p MaxValues.
-template <typename AAType, typename StateTy>
-static bool genericValueTraversal(
- Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
- function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
- VisitValueCB,
- const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16,
- function_ref<Value *(Value *)> StripCB = nullptr) {
-
- const AAIsDead *LivenessAA = nullptr;
- if (IRP.getAnchorScope())
- LivenessAA = &A.getAAFor<AAIsDead>(
- QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
- /* TrackDependence */ false);
- bool AnyDead = false;
-
- using Item = std::pair<Value *, const Instruction *>;
- SmallSet<Item, 16> Visited;
- SmallVector<Item, 16> Worklist;
- Worklist.push_back({&IRP.getAssociatedValue(), CtxI});
-
- int Iteration = 0;
- do {
- Item I = Worklist.pop_back_val();
- Value *V = I.first;
- CtxI = I.second;
- if (StripCB)
- V = StripCB(V);
-
- // Check if we should process the current value. To prevent endless
- // recursion keep a record of the values we followed!
- if (!Visited.insert(I).second)
- continue;
-
- // Make sure we limit the compile time for complex expressions.
- if (Iteration++ >= MaxValues)
- return false;
-
- // Explicitly look through calls with a "returned" attribute if we do
- // not have a pointer as stripPointerCasts only works on them.
- Value *NewV = nullptr;
- if (V->getType()->isPointerTy()) {
- NewV = V->stripPointerCasts();
- } else {
- auto *CB = dyn_cast<CallBase>(V);
- if (CB && CB->getCalledFunction()) {
- for (Argument &Arg : CB->getCalledFunction()->args())
- if (Arg.hasReturnedAttr()) {
- NewV = CB->getArgOperand(Arg.getArgNo());
- break;
- }
- }
- }
- if (NewV && NewV != V) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
-
- // Look through select instructions, visit both potential values.
- if (auto *SI = dyn_cast<SelectInst>(V)) {
- Worklist.push_back({SI->getTrueValue(), CtxI});
- Worklist.push_back({SI->getFalseValue(), CtxI});
- continue;
- }
-
- // Look through phi nodes, visit all live operands.
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- assert(LivenessAA &&
- "Expected liveness in the presence of instructions!");
- for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
- BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
- if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
- LivenessAA,
- /* CheckBBLivenessOnly */ true)) {
- AnyDead = true;
- continue;
- }
- Worklist.push_back(
- {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
- }
- continue;
- }
-
- if (UseValueSimplify && !isa<Constant>(V)) {
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- A.getAssumedConstant(*V, QueryingAA, UsedAssumedInformation);
- if (!C.hasValue())
- continue;
- if (Value *NewV = C.getValue()) {
- Worklist.push_back({NewV, CtxI});
- continue;
- }
- }
-
- // Once a leaf is reached we inform the user through the callback.
- if (!VisitValueCB(*V, CtxI, State, Iteration > 1))
- return false;
- } while (!Worklist.empty());
-
-  // If we actually used liveness information, we have to record a dependence.
- if (AnyDead)
- A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
- // All values have been visited.
- return true;
-}
-
-const Value *stripAndAccumulateMinimalOffsets(
- Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
- const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
- bool UseAssumed = false) {
-
- auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
- const IRPosition &Pos = IRPosition::value(V);
- // Only track dependence if we are going to use the assumed info.
- const AAValueConstantRange &ValueConstantRangeAA =
- A.getAAFor<AAValueConstantRange>(QueryingAA, Pos,
- /* TrackDependence */ UseAssumed);
- ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
- : ValueConstantRangeAA.getKnown();
- // We can only use the lower part of the range because the upper part can
- // be higher than what the value can really be.
- ROffset = Range.getSignedMin();
- return true;
- };
-
- return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
- AttributorAnalysis);
-}
-
-static const Value *getMinimalBaseOfAccsesPointerOperand(
- Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
- int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
- APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
- const Value *Base = stripAndAccumulateMinimalOffsets(
- A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
-
- BytesOffset = OffsetAPInt.getSExtValue();
- return Base;
-}
-
-static const Value *
-getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
- const DataLayout &DL,
- bool AllowNonInbounds = false) {
- const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
-
- return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
- AllowNonInbounds);
-}
-
-/// Helper function to clamp a state \p S of type \p StateType with the
-/// information in \p R and indicate/return if \p S did change (as-in update is
-/// required to be run again).
-template <typename StateType>
-ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
- auto Assumed = S.getAssumed();
- S ^= R;
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
-}
-
-/// Clamp the information known for all returned values of a function
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
- << QueryingAA << " into " << S << "\n");
-
- assert((QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_RETURNED ||
- QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED) &&
- "Can only clamp returned value states for a function returned or call "
- "site returned position!");
-
- // Use an optional state as there might not be any return values and we want
- // to join (IntegerState::operator&) the state of all there are.
- Optional<StateType> T;
-
- // Callback for each possibly returned value.
- auto CheckReturnValue = [&](Value &RV) -> bool {
- const IRPosition &RVPos = IRPosition::value(RV);
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
- LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
- << " @ " << RVPos << "\n");
+
+#undef PIPE_OPERATOR
+} // namespace llvm
+
+namespace {
+
+static Optional<ConstantInt *>
+getAssumedConstantInt(Attributor &A, const Value &V,
+ const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
+ if (C.hasValue())
+ return dyn_cast_or_null<ConstantInt>(C.getValue());
+ return llvm::None;
+}
+
+/// Get pointer operand of memory accessing instruction. If \p I is
+/// not a memory accessing instruction, return nullptr. If \p AllowVolatile
+/// is set to false and the instruction is volatile, return nullptr.
+static const Value *getPointerOperand(const Instruction *I,
+ bool AllowVolatile) {
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (!AllowVolatile && LI->isVolatile())
+ return nullptr;
+ return LI->getPointerOperand();
+ }
+
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (!AllowVolatile && SI->isVolatile())
+ return nullptr;
+ return SI->getPointerOperand();
+ }
+
+ if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!AllowVolatile && CXI->isVolatile())
+ return nullptr;
+ return CXI->getPointerOperand();
+ }
+
+ if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ if (!AllowVolatile && RMWI->isVolatile())
+ return nullptr;
+ return RMWI->getPointerOperand();
+ }
+
+ return nullptr;
+}
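// Usage sketch (hypothetical instruction I): volatile accesses are skipped
// unless explicitly allowed.
//
//   if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ false))
//     analyzeAccess(Ptr); // analyzeAccess is a placeholder for the caller.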
+
+/// Helper function to create a pointer of type \p ResTy, based on \p Ptr, and
+/// advanced by \p Offset bytes. To aid later analysis the method tries to build
+/// getelementptr instructions that traverse the natural type of \p Ptr if
+/// possible. If that fails, the remaining offset is adjusted byte-wise, hence
+/// through a cast to i8*.
+///
+/// TODO: This could probably live somewhere more prominently if it doesn't
+/// already exist.
+static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset,
+ IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
+ assert(Offset >= 0 && "Negative offset not supported yet!");
+ LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
+ << "-bytes as " << *ResTy << "\n");
+
+ // The initial type we are trying to traverse to get nice GEPs.
+ Type *Ty = Ptr->getType();
+
+ SmallVector<Value *, 4> Indices;
+ std::string GEPName = Ptr->getName().str();
+ while (Offset) {
+ uint64_t Idx, Rem;
+
+ if (auto *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (int64_t(SL->getSizeInBytes()) < Offset)
+ break;
+ Idx = SL->getElementContainingOffset(Offset);
+ assert(Idx < STy->getNumElements() && "Offset calculation error!");
+ Rem = Offset - SL->getElementOffset(Idx);
+ Ty = STy->getElementType(Idx);
+ } else if (auto *PTy = dyn_cast<PointerType>(Ty)) {
+ Ty = PTy->getElementType();
+ if (!Ty->isSized())
+ break;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ assert(ElementSize && "Expected type with size!");
+ Idx = Offset / ElementSize;
+ Rem = Offset % ElementSize;
+ } else {
+ // Non-aggregate type, we cast and make byte-wise progress now.
+ break;
+ }
+
+ LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
+ << " Idx: " << Idx << " Rem: " << Rem << "\n");
+
+ GEPName += "." + std::to_string(Idx);
+ Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
+ Offset = Rem;
+ }
+
+ // Create a GEP if we collected indices above.
+ if (Indices.size())
+ Ptr = IRB.CreateGEP(Ptr, Indices, GEPName);
+
+ // If an offset is left we use byte-wise adjustment.
+ if (Offset) {
+ Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
+ Ptr = IRB.CreateGEP(Ptr, IRB.getInt32(Offset),
+ GEPName + ".b" + Twine(Offset));
+ }
+
+ // Ensure the result has the requested type.
+ Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast");
+
+ LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
+ return Ptr;
+}
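// Worked example (hypothetical types, typical 64-bit DataLayout): for Ptr of
// type { i32, i32, i64 }* and Offset == 8, the loop above yields GEP indices
// (0, 2), i.e. the i64 field, with no byte-wise fixup; for Offset == 10 the
// remaining 2 bytes are added via the i8* GEP fallback before the final cast
// to ResTy.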
+
+/// Recursively visit all values that might become \p IRP at some point. This
+/// will be done by looking through cast instructions, selects, phis, and calls
+/// with the "returned" attribute. Once we cannot look through the value any
+/// further, the callback \p VisitValueCB is invoked and passed the current
+/// value, the \p State, and a flag to indicate if we stripped anything.
+/// Stripped means that we unpacked the value associated with \p IRP at least
+/// once. Note that the value used for the callback may still be the value
+/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
+/// we will never visit more values than specified by \p MaxValues.
+template <typename AAType, typename StateTy>
+static bool genericValueTraversal(
+ Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
+ function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
+ VisitValueCB,
+ const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16,
+ function_ref<Value *(Value *)> StripCB = nullptr) {
+
+ const AAIsDead *LivenessAA = nullptr;
+ if (IRP.getAnchorScope())
+ LivenessAA = &A.getAAFor<AAIsDead>(
+ QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
+ /* TrackDependence */ false);
+ bool AnyDead = false;
+
+ using Item = std::pair<Value *, const Instruction *>;
+ SmallSet<Item, 16> Visited;
+ SmallVector<Item, 16> Worklist;
+ Worklist.push_back({&IRP.getAssociatedValue(), CtxI});
+
+ int Iteration = 0;
+ do {
+ Item I = Worklist.pop_back_val();
+ Value *V = I.first;
+ CtxI = I.second;
+ if (StripCB)
+ V = StripCB(V);
+
+ // Check if we should process the current value. To prevent endless
+ // recursion keep a record of the values we followed!
+ if (!Visited.insert(I).second)
+ continue;
+
+ // Make sure we limit the compile time for complex expressions.
+ if (Iteration++ >= MaxValues)
+ return false;
+
+ // Explicitly look through calls with a "returned" attribute if we do
+ // not have a pointer as stripPointerCasts only works on them.
+ Value *NewV = nullptr;
+ if (V->getType()->isPointerTy()) {
+ NewV = V->stripPointerCasts();
+ } else {
+ auto *CB = dyn_cast<CallBase>(V);
+ if (CB && CB->getCalledFunction()) {
+ for (Argument &Arg : CB->getCalledFunction()->args())
+ if (Arg.hasReturnedAttr()) {
+ NewV = CB->getArgOperand(Arg.getArgNo());
+ break;
+ }
+ }
+ }
+ if (NewV && NewV != V) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+
+ // Look through select instructions, visit both potential values.
+ if (auto *SI = dyn_cast<SelectInst>(V)) {
+ Worklist.push_back({SI->getTrueValue(), CtxI});
+ Worklist.push_back({SI->getFalseValue(), CtxI});
+ continue;
+ }
+
+ // Look through phi nodes, visit all live operands.
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ assert(LivenessAA &&
+ "Expected liveness in the presence of instructions!");
+ for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
+ if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
+ LivenessAA,
+ /* CheckBBLivenessOnly */ true)) {
+ AnyDead = true;
+ continue;
+ }
+ Worklist.push_back(
+ {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
+ }
+ continue;
+ }
+
+ if (UseValueSimplify && !isa<Constant>(V)) {
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(*V, QueryingAA, UsedAssumedInformation);
+ if (!C.hasValue())
+ continue;
+ if (Value *NewV = C.getValue()) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+ }
+
+ // Once a leaf is reached we inform the user through the callback.
+ if (!VisitValueCB(*V, CtxI, State, Iteration > 1))
+ return false;
+ } while (!Worklist.empty());
+
+  // If we actually used liveness information, we have to record a dependence.
+ if (AnyDead)
+ A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
+
+ // All values have been visited.
+ return true;
+}
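// Caller sketch (StateTy and AAType stand for whatever the querying abstract
// attribute uses; IRP, QueryingAA, S, and CtxI are the parameters named above):
//
//   auto VisitValueCB = [&](Value &V, const Instruction *CtxI, StateTy &S,
//                           bool Stripped) -> bool {
//     // Inspect the underlying value V and update S; returning false aborts
//     // the traversal early.
//     return true;
//   };
//   genericValueTraversal<AAType, StateTy>(A, IRP, QueryingAA, S,
//                                          VisitValueCB, CtxI);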
+
+const Value *stripAndAccumulateMinimalOffsets(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
+ const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ bool UseAssumed = false) {
+
+ auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
+ const IRPosition &Pos = IRPosition::value(V);
+ // Only track dependence if we are going to use the assumed info.
+ const AAValueConstantRange &ValueConstantRangeAA =
+ A.getAAFor<AAValueConstantRange>(QueryingAA, Pos,
+ /* TrackDependence */ UseAssumed);
+ ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
+ : ValueConstantRangeAA.getKnown();
+ // We can only use the lower part of the range because the upper part can
+ // be higher than what the value can really be.
+ ROffset = Range.getSignedMin();
+ return true;
+ };
+
+ return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
+ AttributorAnalysis);
+}
+
+static const Value *getMinimalBaseOfAccsesPointerOperand(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
+ int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+ APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ const Value *Base = stripAndAccumulateMinimalOffsets(
+ A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
+
+ BytesOffset = OffsetAPInt.getSExtValue();
+ return Base;
+}
+
+static const Value *
+getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
+ const DataLayout &DL,
+ bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+
+ return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
+ AllowNonInbounds);
+}
+
+/// Helper function to clamp a state \p S of type \p StateType with the
+/// information in \p R and indicate/return if \p S did change (as-in update is
+/// required to be run again).
+template <typename StateType>
+ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
+ auto Assumed = S.getAssumed();
+ S ^= R;
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
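// Example (mirroring the updateImpl methods below): clamp a freshly computed
// state S into the attribute's current state and report whether it changed.
//
//   ChangeStatus Changed =
//       clampStateAndIndicateChange<StateType>(this->getState(), S);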
+
+/// Clamp the information known for all returned values of a function
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert((QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_RETURNED ||
+ QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED) &&
+ "Can only clamp returned value states for a function returned or call "
+ "site returned position!");
+
+ // Use an optional state as there might not be any return values and we want
+ // to join (IntegerState::operator&) the state of all there are.
+ Optional<StateType> T;
+
+ // Callback for each possibly returned value.
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
+ << " @ " << RVPos << "\n");
const StateType &AAS = AA.getState();
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
- << "\n");
- return T->isValidState();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
-}
-
-/// Helper class for generic deduction: return value -> returned position.
-template <typename AAType, typename BaseType,
- typename StateType = typename BaseType::StateType>
-struct AAReturnedFromReturnedValues : public BaseType {
- AAReturnedFromReturnedValues(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S(StateType::getBestState(this->getState()));
- clampReturnedValueStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all returned values, thus none are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-};
-
-/// Clamp the information known at all call sites for a given argument
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
- << QueryingAA << " into " << S << "\n");
-
- assert(QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_ARGUMENT &&
- "Can only clamp call site argument states for an argument position!");
-
- // Use an optional state as there might not be any call sites and we want
- // to join (IntegerState::operator&) the states of all there are.
- Optional<StateType> T;
-
- // The argument number which is also the call site argument number.
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
+
+/// Helper class for generic deduction: return value -> returned position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AAReturnedFromReturnedValues : public BaseType {
+ AAReturnedFromReturnedValues(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampReturnedValueStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all returned values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Clamp the information known at all call sites for a given argument
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert(QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_ARGUMENT &&
+ "Can only clamp call site argument states for an argument position!");
+
+ // Use an optional state as there might not be any call sites and we want
+ // to join (IntegerState::operator&) the states of all there are.
+ Optional<StateType> T;
+
+ // The argument number which is also the call site argument number.
unsigned ArgNo = QueryingAA.getIRPosition().getCallSiteArgNo();
-
- auto CallSiteCheck = [&](AbstractCallSite ACS) {
- const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
- // Check if a corresponding argument was found or if it is not associated
- // (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
- LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
- << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
+
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+ // Check if a corresponding argument was found or if it is not associated
+ // (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
+ << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
const StateType &AAS = AA.getState();
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
- << "\n");
- return T->isValidState();
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true,
- AllCallSitesKnown))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
-}
-
-/// Helper class for generic deduction: call site argument -> argument position.
-template <typename AAType, typename BaseType,
- typename StateType = typename AAType::StateType>
-struct AAArgumentFromCallSiteArguments : public BaseType {
- AAArgumentFromCallSiteArguments(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S(StateType::getBestState(this->getState()));
- clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all incoming values, thus none are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-};
-
-/// Helper class for generic replication: function returned -> cs returned.
-template <typename AAType, typename BaseType,
- typename StateType = typename BaseType::StateType>
-struct AACallSiteReturnedFromReturned : public BaseType {
- AACallSiteReturnedFromReturned(const IRPosition &IRP, Attributor &A)
- : BaseType(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- assert(this->getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED &&
- "Can only wrap function returned positions for call site returned "
- "positions!");
- auto &S = this->getState();
-
- const Function *AssociatedFunction =
- this->getIRPosition().getAssociatedFunction();
- if (!AssociatedFunction)
- return S.indicatePessimisticFixpoint();
-
- IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
- const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true,
+ AllCallSitesKnown))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
+
+/// Helper class for generic deduction: call site argument -> argument position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename AAType::StateType>
+struct AAArgumentFromCallSiteArguments : public BaseType {
+ AAArgumentFromCallSiteArguments(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all incoming values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Helper class for generic replication: function returned -> cs returned.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AACallSiteReturnedFromReturned : public BaseType {
+ AACallSiteReturnedFromReturned(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ assert(this->getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED &&
+ "Can only wrap function returned positions for call site returned "
+ "positions!");
+ auto &S = this->getState();
+
+ const Function *AssociatedFunction =
+ this->getIRPosition().getAssociatedFunction();
+ if (!AssociatedFunction)
+ return S.indicatePessimisticFixpoint();
+
+ IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
+ const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
return clampStateAndIndicateChange(S, AA.getState());
- }
-};
-
-/// Helper function to accumulate uses.
-template <class AAType, typename StateType = typename AAType::StateType>
-static void followUsesInContext(AAType &AA, Attributor &A,
- MustBeExecutedContextExplorer &Explorer,
- const Instruction *CtxI,
- SetVector<const Use *> &Uses,
- StateType &State) {
- auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
- for (unsigned u = 0; u < Uses.size(); ++u) {
- const Use *U = Uses[u];
- if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
- bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
- if (Found && AA.followUseInMBEC(A, U, UserI, State))
- for (const Use &Us : UserI->uses())
- Uses.insert(&Us);
- }
- }
-}
-
-/// Use the must-be-executed-context around \p I to add information into \p S.
-/// The AAType class is required to have a `followUseInMBEC` method with the
-/// following signature and behaviour:
-///
-/// bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I)
-/// U - Underlying use.
-/// I - The user of the \p U.
-/// Returns true if the value should be tracked transitively.
-///
-template <class AAType, typename StateType = typename AAType::StateType>
-static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
- Instruction &CtxI) {
-
- // Container for (transitive) uses of the associated value.
- SetVector<const Use *> Uses;
- for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
- Uses.insert(&U);
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- followUsesInContext<AAType>(AA, A, Explorer, &CtxI, Uses, S);
-
- if (S.isAtFixpoint())
- return;
-
- SmallVector<const BranchInst *, 4> BrInsts;
- auto Pred = [&](const Instruction *I) {
- if (const BranchInst *Br = dyn_cast<BranchInst>(I))
- if (Br->isConditional())
- BrInsts.push_back(Br);
- return true;
- };
-
- // Here, accumulate conditional branch instructions in the context. We
- // explore the child paths and collect the known states. The disjunction of
- // those states can be merged to its own state. Let ParentState_i be a state
- // to indicate the known information for an i-th branch instruction in the
- // context. ChildStates are created for its successors respectively.
- //
- // ParentS_1 = ChildS_{1, 1} /\ ChildS_{1, 2} /\ ... /\ ChildS_{1, n_1}
- // ParentS_2 = ChildS_{2, 1} /\ ChildS_{2, 2} /\ ... /\ ChildS_{2, n_2}
- // ...
- // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m}
- //
- // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m
- //
- // FIXME: Currently, recursive branches are not handled. For example, we
- // can't deduce that ptr must be dereferenced in below function.
- //
- // void f(int a, int b, int *ptr) {
- // if(a)
- // if (b) {
- // *ptr = 0;
- // } else {
- // *ptr = 1;
- // }
- // else {
- // if (b) {
- // *ptr = 0;
- // } else {
- // *ptr = 1;
- // }
- // }
- // }
-
- Explorer.checkForAllContext(&CtxI, Pred);
- for (const BranchInst *Br : BrInsts) {
- StateType ParentState;
-
- // The known state of the parent state is a conjunction of children's
- // known states so it is initialized with a best state.
- ParentState.indicateOptimisticFixpoint();
-
- for (const BasicBlock *BB : Br->successors()) {
- StateType ChildState;
-
- size_t BeforeSize = Uses.size();
- followUsesInContext(AA, A, Explorer, &BB->front(), Uses, ChildState);
-
- // Erase uses which only appear in the child.
- for (auto It = Uses.begin() + BeforeSize; It != Uses.end();)
- It = Uses.erase(It);
-
- ParentState &= ChildState;
- }
-
- // Use only known state.
- S += ParentState;
- }
-}
-
-/// -----------------------NoUnwind Function Attribute--------------------------
-
-struct AANoUnwindImpl : AANoUnwind {
- AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nounwind" : "may-unwind";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto Opcodes = {
- (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
- (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
- (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
-
- auto CheckForNoUnwind = [&](Instruction &I) {
- if (!I.mayThrow())
- return true;
-
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- const auto &NoUnwindAA =
- A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(*CB));
- return NoUnwindAA.isAssumedNoUnwind();
- }
- return false;
- };
-
- if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoUnwindFunction final : public AANoUnwindImpl {
- AANoUnwindFunction(const IRPosition &IRP, Attributor &A)
- : AANoUnwindImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
-};
-
-/// NoUnwind attribute deduction for call sites.
-struct AANoUnwindCallSite final : AANoUnwindImpl {
- AANoUnwindCallSite(const IRPosition &IRP, Attributor &A)
- : AANoUnwindImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoUnwindImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ }
+};
+
+/// Helper function to accumulate uses.
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInContext(AAType &AA, Attributor &A,
+ MustBeExecutedContextExplorer &Explorer,
+ const Instruction *CtxI,
+ SetVector<const Use *> &Uses,
+ StateType &State) {
+ auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use *U = Uses[u];
+ if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
+ bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
+ if (Found && AA.followUseInMBEC(A, U, UserI, State))
+ for (const Use &Us : UserI->uses())
+ Uses.insert(&Us);
+ }
+ }
+}
+
+/// Use the must-be-executed-context around \p I to add information into \p S.
+/// The AAType class is required to have a `followUseInMBEC` method with the
+/// following signature and behaviour:
+///
+/// bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I)
+/// U - Underlying use.
+/// I - The user of the \p U.
+/// Returns true if the value should be tracked transitively.
+///
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
+ Instruction &CtxI) {
+
+ // Container for (transitive) uses of the associated value.
+ SetVector<const Use *> Uses;
+ for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
+ Uses.insert(&U);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
+ followUsesInContext<AAType>(AA, A, Explorer, &CtxI, Uses, S);
+
+ if (S.isAtFixpoint())
+ return;
+
+ SmallVector<const BranchInst *, 4> BrInsts;
+ auto Pred = [&](const Instruction *I) {
+ if (const BranchInst *Br = dyn_cast<BranchInst>(I))
+ if (Br->isConditional())
+ BrInsts.push_back(Br);
+ return true;
+ };
+
+ // Here, accumulate conditional branch instructions in the context. We
+ // explore the child paths and collect the known states. The disjunction of
+ // those states can be merged to its own state. Let ParentState_i be a state
+ // to indicate the known information for an i-th branch instruction in the
+ // context. ChildStates are created for its successors respectively.
+ //
+ // ParentS_1 = ChildS_{1, 1} /\ ChildS_{1, 2} /\ ... /\ ChildS_{1, n_1}
+ // ParentS_2 = ChildS_{2, 1} /\ ChildS_{2, 2} /\ ... /\ ChildS_{2, n_2}
+ // ...
+ // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m}
+ //
+ // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m
+ //
+ // FIXME: Currently, recursive branches are not handled. For example, we
+ // can't deduce that ptr must be dereferenced in below function.
+ //
+ // void f(int a, int b, int *ptr) {
+ // if(a)
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // else {
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // }
+ // }
+
+ Explorer.checkForAllContext(&CtxI, Pred);
+ for (const BranchInst *Br : BrInsts) {
+ StateType ParentState;
+
+ // The known state of the parent state is a conjunction of children's
+ // known states so it is initialized with a best state.
+ ParentState.indicateOptimisticFixpoint();
+
+ for (const BasicBlock *BB : Br->successors()) {
+ StateType ChildState;
+
+ size_t BeforeSize = Uses.size();
+ followUsesInContext(AA, A, Explorer, &BB->front(), Uses, ChildState);
+
+ // Erase uses which only appear in the child.
+ for (auto It = Uses.begin() + BeforeSize; It != Uses.end();)
+ It = Uses.erase(It);
+
+ ParentState &= ChildState;
+ }
+
+ // Use only known state.
+ S += ParentState;
+ }
+}
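+
+// As a rough sketch (names and the state update are illustrative only), an
+// AAType that wants to be driven by followUsesInMBEC provides a member such as:
+//
+//   bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *UserI,
+//                        StateType &State) {
+//     // Fold whatever this use implies about the associated value into State,
+//     // then return true if the uses of UserI should be followed as well.
+//     return isa<LoadInst>(UserI) || isa<StoreInst>(UserI);
+//   }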
+
+/// -----------------------NoUnwind Function Attribute--------------------------
+
+struct AANoUnwindImpl : AANoUnwind {
+ AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nounwind" : "may-unwind";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto Opcodes = {
+ (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
+ (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
+ (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
+
+ auto CheckForNoUnwind = [&](Instruction &I) {
+ if (!I.mayThrow())
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &NoUnwindAA =
+ A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(*CB));
+ return NoUnwindAA.isAssumedNoUnwind();
+ }
+ return false;
+ };
+
+ if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+struct AANoUnwindFunction final : public AANoUnwindImpl {
+ AANoUnwindFunction(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
+};
+
+/// NoUnwind attribute deduction for call sites.
+struct AANoUnwindCallSite final : AANoUnwindImpl {
+ AANoUnwindCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoUnwindImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
-};
-
-/// --------------------- Function Return Values -------------------------------
-
-/// "Attribute" that collects all potential returned values and the return
-/// instructions that they arise from.
-///
-/// If there is a unique returned value R, the manifest method will:
-/// - mark R with the "returned" attribute, if R is an argument.
-class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
-
- /// Mapping of values potentially returned by the associated function to the
- /// return instructions that might return them.
- MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
-
- /// Mapping to remember the number of returned values for a call site such
- /// that we can avoid updates if nothing changed.
- DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
-
- /// Set of unresolved calls returned by the associated function.
- SmallSetVector<CallBase *, 4> UnresolvedCalls;
-
- /// State flags
- ///
- ///{
- bool IsFixed = false;
- bool IsValidState = true;
- ///}
-
-public:
- AAReturnedValuesImpl(const IRPosition &IRP, Attributor &A)
- : AAReturnedValues(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // Reset the state.
- IsFixed = false;
- IsValidState = true;
- ReturnedValues.clear();
-
- Function *F = getAssociatedFunction();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
+};
+
+/// --------------------- Function Return Values -------------------------------
+
+/// "Attribute" that collects all potential returned values and the return
+/// instructions that they arise from.
+///
+/// If there is a unique returned value R, the manifest method will:
+/// - mark R with the "returned" attribute, if R is an argument.
+class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
+
+ /// Mapping of values potentially returned by the associated function to the
+ /// return instructions that might return them.
+ MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
+
+ /// Mapping to remember the number of returned values for a call site such
+ /// that we can avoid updates if nothing changed.
+ DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
+
+ /// Set of unresolved calls returned by the associated function.
+ SmallSetVector<CallBase *, 4> UnresolvedCalls;
+
+ /// State flags
+ ///
+ ///{
+ bool IsFixed = false;
+ bool IsValidState = true;
+ ///}
+
+public:
+ AAReturnedValuesImpl(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValues(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // Reset the state.
+ IsFixed = false;
+ IsValidState = true;
+ ReturnedValues.clear();
+
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration()) {
- indicatePessimisticFixpoint();
- return;
- }
- assert(!F->getReturnType()->isVoidTy() &&
- "Did not expect a void return type!");
-
- // The map from instruction opcodes to those instructions in the function.
- auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
-
- // Look through all arguments, if one is marked as returned we are done.
- for (Argument &Arg : F->args()) {
- if (Arg.hasReturnedAttr()) {
- auto &ReturnInstSet = ReturnedValues[&Arg];
- if (auto *Insts = OpcodeInstMap.lookup(Instruction::Ret))
- for (Instruction *RI : *Insts)
- ReturnInstSet.insert(cast<ReturnInst>(RI));
-
- indicateOptimisticFixpoint();
- return;
- }
- }
-
- if (!A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override;
-
- /// See AbstractAttribute::getState(...).
- AbstractState &getState() override { return *this; }
-
- /// See AbstractAttribute::getState(...).
- const AbstractState &getState() const override { return *this; }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override;
-
- llvm::iterator_range<iterator> returned_values() override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- llvm::iterator_range<const_iterator> returned_values() const override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
- return UnresolvedCalls;
- }
-
- /// Return the number of potential return values, -1 if unknown.
- size_t getNumReturnValues() const override {
- return isValidState() ? ReturnedValues.size() : -1;
- }
-
- /// Return an assumed unique return value if a single candidate is found. If
- /// there cannot be one, return a nullptr. If it is not clear yet, return the
- /// Optional::NoneType.
- Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
-
- /// See AbstractState::checkForAllReturnedValues(...).
- bool checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
- const override;
-
- /// Pretty print the attribute similar to the IR representation.
- const std::string getAsStr() const override;
-
- /// See AbstractState::isAtFixpoint().
- bool isAtFixpoint() const override { return IsFixed; }
-
- /// See AbstractState::isValidState().
- bool isValidState() const override { return IsValidState; }
-
- /// See AbstractState::indicateOptimisticFixpoint(...).
- ChangeStatus indicateOptimisticFixpoint() override {
- IsFixed = true;
- return ChangeStatus::UNCHANGED;
- }
-
- ChangeStatus indicatePessimisticFixpoint() override {
- IsFixed = true;
- IsValidState = false;
- return ChangeStatus::CHANGED;
- }
-};
-
-ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- // Bookkeeping.
- assert(isValidState());
- STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
- "Number of functions with known return values");
-
- // Check if we have an assumed unique return value that we could manifest.
- Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
-
- if (!UniqueRV.hasValue() || !UniqueRV.getValue())
- return Changed;
-
- // Bookkeeping.
- STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
- "Number of functions with unique return");
-
- // Callback to replace the uses of CB with the constant C.
- auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) {
- if (CB.use_empty())
- return ChangeStatus::UNCHANGED;
- if (A.changeValueAfterManifest(CB, C))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- };
-
- // If the assumed unique return value is an argument, annotate it.
- if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
- if (UniqueRVArg->getType()->canLosslesslyBitCastTo(
- getAssociatedFunction()->getReturnType())) {
- getIRPosition() = IRPosition::argument(*UniqueRVArg);
- Changed = IRAttribute::manifest(A);
- }
- } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
- // We can replace the returned value with the unique returned constant.
- Value &AnchorValue = getAnchorValue();
- if (Function *F = dyn_cast<Function>(&AnchorValue)) {
- for (const Use &U : F->uses())
- if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- Constant *RVCCast =
- CB->getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
- Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
- }
- } else {
- assert(isa<CallBase>(AnchorValue) &&
- "Expected a function or call base anchor!");
- Constant *RVCCast =
- AnchorValue.getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
- Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
- }
- if (Changed == ChangeStatus::CHANGED)
- STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
- "Number of function returns replaced by constant return");
- }
-
- return Changed;
-}
-
-const std::string AAReturnedValuesImpl::getAsStr() const {
- return (isAtFixpoint() ? "returns(#" : "may-return(#") +
- (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
- ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
-}
-
-Optional<Value *>
-AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
- // If checkForAllReturnedValues provides a unique value, ignoring potential
- // undef values that can also be present, it is assumed to be the actual
- // return value and forwarded to the caller of this method. If there are
- // multiple, a nullptr is returned indicating there cannot be a unique
- // returned value.
- Optional<Value *> UniqueRV;
-
- auto Pred = [&](Value &RV) -> bool {
- // If we found a second returned value and neither the current nor the saved
- // one is an undef, there is no unique returned value. Undefs are special
- // since we can pretend they have any value.
- if (UniqueRV.hasValue() && UniqueRV != &RV &&
- !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
- UniqueRV = nullptr;
- return false;
- }
-
- // Do not overwrite a value with an undef.
- if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
- UniqueRV = &RV;
-
- return true;
- };
-
- if (!A.checkForAllReturnedValues(Pred, *this))
- UniqueRV = nullptr;
-
- return UniqueRV;
-}
-
-bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
- function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
- const {
- if (!isValidState())
- return false;
-
- // Check all returned values but ignore call sites as long as we have not
- // encountered an overdefined one during an update.
- for (auto &It : ReturnedValues) {
- Value *RV = It.first;
-
- CallBase *CB = dyn_cast<CallBase>(RV);
- if (CB && !UnresolvedCalls.count(CB))
- continue;
-
- if (!Pred(*RV, It.second))
- return false;
- }
-
- return true;
-}
-
-ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
- size_t NumUnresolvedCalls = UnresolvedCalls.size();
- bool Changed = false;
-
- // State used in the value traversals starting in returned values.
- struct RVState {
- // The map in which we collect return values -> return instrs.
- decltype(ReturnedValues) &RetValsMap;
- // The flag to indicate a change.
- bool &Changed;
- // The return instrs we come from.
- SmallSetVector<ReturnInst *, 4> RetInsts;
- };
-
- // Callback for a leaf value returned by the associated function.
- auto VisitValueCB = [](Value &Val, const Instruction *, RVState &RVS,
- bool) -> bool {
- auto Size = RVS.RetValsMap[&Val].size();
- RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
- bool Inserted = RVS.RetValsMap[&Val].size() != Size;
- RVS.Changed |= Inserted;
- LLVM_DEBUG({
- if (Inserted)
- dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
- << " => " << RVS.RetInsts.size() << "\n";
- });
- return true;
- };
-
- // Helper method to invoke the generic value traversal.
- auto VisitReturnedValue = [&](Value &RV, RVState &RVS,
- const Instruction *CtxI) {
- IRPosition RetValPos = IRPosition::value(RV);
- return genericValueTraversal<AAReturnedValues, RVState>(
- A, RetValPos, *this, RVS, VisitValueCB, CtxI,
- /* UseValueSimplify */ false);
- };
-
- // Callback for all "return instructions" live in the associated function.
- auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
- ReturnInst &Ret = cast<ReturnInst>(I);
- RVState RVS({ReturnedValues, Changed, {}});
- RVS.RetInsts.insert(&Ret);
- return VisitReturnedValue(*Ret.getReturnValue(), RVS, &I);
- };
-
- // Start by discovering returned values from all live return instructions in
- // the associated function.
- if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
- return indicatePessimisticFixpoint();
-
- // Once returned values "directly" present in the code are handled we try to
- // resolve returned calls. To avoid modifications to the ReturnedValues map
- // while we iterate over it we keep a record of potential new entries in a copy
- // map, NewRVsMap.
- decltype(ReturnedValues) NewRVsMap;
-
+ indicatePessimisticFixpoint();
+ return;
+ }
+ assert(!F->getReturnType()->isVoidTy() &&
+ "Did not expect a void return type!");
+
+ // The map from instruction opcodes to those instructions in the function.
+ auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
+
+ // Look through all arguments, if one is marked as returned we are done.
+ for (Argument &Arg : F->args()) {
+ if (Arg.hasReturnedAttr()) {
+ auto &ReturnInstSet = ReturnedValues[&Arg];
+ if (auto *Insts = OpcodeInstMap.lookup(Instruction::Ret))
+ for (Instruction *RI : *Insts)
+ ReturnInstSet.insert(cast<ReturnInst>(RI));
+
+ indicateOptimisticFixpoint();
+ return;
+ }
+ }
+
+ if (!A.isFunctionIPOAmendable(*F))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override;
+
+ /// See AbstractAttribute::getState(...).
+ AbstractState &getState() override { return *this; }
+
+ /// See AbstractAttribute::getState(...).
+ const AbstractState &getState() const override { return *this; }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ llvm::iterator_range<iterator> returned_values() override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ llvm::iterator_range<const_iterator> returned_values() const override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
+ return UnresolvedCalls;
+ }
+
+ /// Return the number of potential return values, -1 if unknown.
+ size_t getNumReturnValues() const override {
+ return isValidState() ? ReturnedValues.size() : -1;
+ }
+
+ /// Return an assumed unique return value if a single candidate is found. If
+ /// there cannot be one, return a nullptr. If it is not clear yet, return the
+ /// Optional::NoneType.
+ Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
+
+ /// See AbstractState::checkForAllReturnedValues(...).
+ bool checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const override;
+
+ /// Pretty print the attribute similar to the IR representation.
+ const std::string getAsStr() const override;
+
+ /// See AbstractState::isAtFixpoint().
+ bool isAtFixpoint() const override { return IsFixed; }
+
+ /// See AbstractState::isValidState().
+ bool isValidState() const override { return IsValidState; }
+
+ /// See AbstractState::indicateOptimisticFixpoint(...).
+ ChangeStatus indicateOptimisticFixpoint() override {
+ IsFixed = true;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ IsFixed = true;
+ IsValidState = false;
+ return ChangeStatus::CHANGED;
+ }
+};
+
+ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ // Bookkeeping.
+ assert(isValidState());
+ STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
+ "Number of functions with known return values");
+
+ // Check if we have an assumed unique return value that we could manifest.
+ Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
+
+ if (!UniqueRV.hasValue() || !UniqueRV.getValue())
+ return Changed;
+
+ // Bookkeeping.
+ STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
+ "Number of functions with unique return");
+
+ // Callback to replace the uses of CB with the constant C.
+ auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) {
+ if (CB.use_empty())
+ return ChangeStatus::UNCHANGED;
+ if (A.changeValueAfterManifest(CB, C))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ };
+
+ // If the assumed unique return value is an argument, annotate it.
+ if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
+ if (UniqueRVArg->getType()->canLosslesslyBitCastTo(
+ getAssociatedFunction()->getReturnType())) {
+ getIRPosition() = IRPosition::argument(*UniqueRVArg);
+ Changed = IRAttribute::manifest(A);
+ }
+ } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
+ // We can replace the returned value with the unique returned constant.
+ Value &AnchorValue = getAnchorValue();
+ if (Function *F = dyn_cast<Function>(&AnchorValue)) {
+ for (const Use &U : F->uses())
+ if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ Constant *RVCCast =
+ CB->getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
+ Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
+ }
+ } else {
+ assert(isa<CallBase>(AnchorValue) &&
+ "Expected a function or call base anchor!");
+ Constant *RVCCast =
+ AnchorValue.getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
+ Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
+ }
+ if (Changed == ChangeStatus::CHANGED)
+ STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
+ "Number of function returns replaced by constant return");
+ }
+
+ return Changed;
+}
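+
+// For intuition, a hedged example of what this manifest step can achieve: for a
+// function that merely forwards its argument, e.g.
+//
+//   static int *forward(int *P) { return P; }   // hypothetical example
+//
+// the assumed unique return value is the argument P, so the argument position is
+// annotated with the `returned` attribute; if instead all returns yield a single
+// constant, uses of calls to the function are rewritten to that constant.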
+
+const std::string AAReturnedValuesImpl::getAsStr() const {
+ return (isAtFixpoint() ? "returns(#" : "may-return(#") +
+ (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
+ ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
+}
+
+Optional<Value *>
+AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
+ // If checkForAllReturnedValues provides a unique value, ignoring potential
+ // undef values that can also be present, it is assumed to be the actual
+ // return value and forwarded to the caller of this method. If there are
+ // multiple, a nullptr is returned indicating there cannot be a unique
+ // returned value.
+ Optional<Value *> UniqueRV;
+
+ auto Pred = [&](Value &RV) -> bool {
+ // If we found a second returned value and neither the current nor the saved
+ // one is an undef, there is no unique returned value. Undefs are special
+ // since we can pretend they have any value.
+ if (UniqueRV.hasValue() && UniqueRV != &RV &&
+ !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
+ UniqueRV = nullptr;
+ return false;
+ }
+
+ // Do not overwrite a value with an undef.
+ if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
+ UniqueRV = &RV;
+
+ return true;
+ };
+
+ if (!A.checkForAllReturnedValues(Pred, *this))
+ UniqueRV = nullptr;
+
+ return UniqueRV;
+}
+
+bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const {
+ if (!isValidState())
+ return false;
+
+ // Check all returned values but ignore call sites as long as we have not
+ // encountered an overdefined one during an update.
+ for (auto &It : ReturnedValues) {
+ Value *RV = It.first;
+
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (CB && !UnresolvedCalls.count(CB))
+ continue;
+
+ if (!Pred(*RV, It.second))
+ return false;
+ }
+
+ return true;
+}
+
+ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
+ size_t NumUnresolvedCalls = UnresolvedCalls.size();
+ bool Changed = false;
+
+ // State used in the value traversals starting in returned values.
+ struct RVState {
+ // The map in which we collect return values -> return instrs.
+ decltype(ReturnedValues) &RetValsMap;
+ // The flag to indicate a change.
+ bool &Changed;
+ // The return instrs we come from.
+ SmallSetVector<ReturnInst *, 4> RetInsts;
+ };
+
+ // Callback for a leaf value returned by the associated function.
+ auto VisitValueCB = [](Value &Val, const Instruction *, RVState &RVS,
+ bool) -> bool {
+ auto Size = RVS.RetValsMap[&Val].size();
+ RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
+ bool Inserted = RVS.RetValsMap[&Val].size() != Size;
+ RVS.Changed |= Inserted;
+ LLVM_DEBUG({
+ if (Inserted)
+ dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
+ << " => " << RVS.RetInsts.size() << "\n";
+ });
+ return true;
+ };
+
+ // Helper method to invoke the generic value traversal.
+ auto VisitReturnedValue = [&](Value &RV, RVState &RVS,
+ const Instruction *CtxI) {
+ IRPosition RetValPos = IRPosition::value(RV);
+ return genericValueTraversal<AAReturnedValues, RVState>(
+ A, RetValPos, *this, RVS, VisitValueCB, CtxI,
+ /* UseValueSimplify */ false);
+ };
+
+ // Callback for all "return instructions" live in the associated function.
+ auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
+ ReturnInst &Ret = cast<ReturnInst>(I);
+ RVState RVS({ReturnedValues, Changed, {}});
+ RVS.RetInsts.insert(&Ret);
+ return VisitReturnedValue(*Ret.getReturnValue(), RVS, &I);
+ };
+
+ // Start by discovering returned values from all live return instructions in
+ // the associated function.
+ if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+
+ // Once returned values "directly" present in the code are handled we try to
+ // resolve returned calls. To avoid modifications to the ReturnedValues map
+ // while we iterate over it we keep a record of potential new entries in a copy
+ // map, NewRVsMap.
+ decltype(ReturnedValues) NewRVsMap;
+
auto HandleReturnValue = [&](Value *RV,
SmallSetVector<ReturnInst *, 4> &RIs) {
LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV << " by #"
<< RIs.size() << " RIs\n");
- CallBase *CB = dyn_cast<CallBase>(RV);
- if (!CB || UnresolvedCalls.count(CB))
- return;
-
- if (!CB->getCalledFunction()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const auto &RetValAA = A.getAAFor<AAReturnedValues>(
- *this, IRPosition::function(*CB->getCalledFunction()));
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
- << RetValAA << "\n");
-
- // Skip dead ends, that is, if we do not know anything about the returned
- // call we mark it as unresolved and it will stay that way.
- if (!RetValAA.getState().isValidState()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // Do not try to learn partial information. If the callee has unresolved
- // return values we will treat the call as unresolved/opaque.
- auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
- if (!RetValAAUnresolvedCalls.empty()) {
- UnresolvedCalls.insert(CB);
- return;
- }
-
- // Now check if we can track transitively returned values. If possible, that
- // is, if all return values can be represented in the current scope, do so.
- bool Unresolved = false;
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
- isa<Constant>(RetVal))
- continue;
- // Anything that did not fit in the above categories cannot be resolved,
- // mark the call as unresolved.
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
- "cannot be translated: "
- << *RetVal << "\n");
- UnresolvedCalls.insert(CB);
- Unresolved = true;
- break;
- }
-
- if (Unresolved)
- return;
-
- // Now track transitively returned values.
- unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
- if (NumRetAA == RetValAA.getNumReturnValues()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
- "changed since it was seen last\n");
- return;
- }
- NumRetAA = RetValAA.getNumReturnValues();
-
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
- // Arguments are mapped to call site operands and we begin the traversal
- // again.
- bool Unused = false;
- RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
- VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB);
- continue;
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (!CB || UnresolvedCalls.count(CB))
+ return;
+
+ if (!CB->getCalledFunction()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const auto &RetValAA = A.getAAFor<AAReturnedValues>(
+ *this, IRPosition::function(*CB->getCalledFunction()));
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
+ << RetValAA << "\n");
+
+ // Skip dead ends, that is, if we do not know anything about the returned
+ // call we mark it as unresolved and it will stay that way.
+ if (!RetValAA.getState().isValidState()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Do not try to learn partial information. If the callee has unresolved
+ // return values we will treat the call as unresolved/opaque.
+ auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
+ if (!RetValAAUnresolvedCalls.empty()) {
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Now check if we can track transitively returned values. If possible, that
+ // is, if all return values can be represented in the current scope, do so.
+ bool Unresolved = false;
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
+ isa<Constant>(RetVal))
+ continue;
+ // Anything that did not fit in the above categories cannot be resolved,
+ // mark the call as unresolved.
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
+ "cannot be translated: "
+ << *RetVal << "\n");
+ UnresolvedCalls.insert(CB);
+ Unresolved = true;
+ break;
+ }
+
+ if (Unresolved)
+ return;
+
+ // Now track transitively returned values.
+ unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
+ if (NumRetAA == RetValAA.getNumReturnValues()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
+ "changed since it was seen last\n");
+ return;
+ }
+ NumRetAA = RetValAA.getNumReturnValues();
+
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
+ // Arguments are mapped to call site operands and we begin the traversal
+ // again.
+ bool Unused = false;
+ RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
+ VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB);
+ continue;
}
if (isa<CallBase>(RetVal)) {
- // Call sites are resolved by the callee attribute over time, no need to
- // do anything for us.
- continue;
+ // Call sites are resolved by the callee attribute over time, no need to
+ // do anything for us.
+ continue;
}
if (isa<Constant>(RetVal)) {
- // Constants are valid everywhere, we can simply take them.
- NewRVsMap[RetVal].insert(RIs.begin(), RIs.end());
- continue;
- }
- }
- };
-
- for (auto &It : ReturnedValues)
- HandleReturnValue(It.first, It.second);
-
- // Because processing the new information can again lead to new return values
- // we have to be careful and iterate until this iteration is complete. The
- // idea is that we are in a stable state at the end of an update. All return
- // values have been handled and properly categorized. We might not update
- // again if we have not requested a non-fix attribute so we cannot "wait" for
- // the next update to analyze a new return value.
- while (!NewRVsMap.empty()) {
- auto It = std::move(NewRVsMap.back());
- NewRVsMap.pop_back();
-
- assert(!It.second.empty() && "Entry does not add anything.");
- auto &ReturnInsts = ReturnedValues[It.first];
- for (ReturnInst *RI : It.second)
- if (ReturnInsts.insert(RI)) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
- << *It.first << " => " << *RI << "\n");
- HandleReturnValue(It.first, ReturnInsts);
- Changed = true;
- }
- }
-
- Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
-}
-
-struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
- AAReturnedValuesFunction(const IRPosition &IRP, Attributor &A)
- : AAReturnedValuesImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
-};
-
-/// Returned values information for call sites.
-struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
- AAReturnedValuesCallSite(const IRPosition &IRP, Attributor &A)
- : AAReturnedValuesImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for returned values are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// ------------------------ NoSync Function Attribute -------------------------
-
-struct AANoSyncImpl : AANoSync {
- AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nosync" : "may-sync";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// Helper function used to determine whether an instruction is non-relaxed
- /// atomic. In other words, whether an atomic instruction has an ordering
- /// other than unordered or monotonic.
- static bool isNonRelaxedAtomic(Instruction *I);
-
- /// Helper function used to determine whether an instruction is volatile.
- static bool isVolatile(Instruction *I);
-
- /// Helper function used to check if an intrinsic is volatile (memcpy, memmove,
- /// memset).
- static bool isNoSyncIntrinsic(Instruction *I);
-};
-
-bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
- if (!I->isAtomic())
- return false;
-
- AtomicOrdering Ordering;
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- Ordering = cast<AtomicRMWInst>(I)->getOrdering();
- break;
- case Instruction::Store:
- Ordering = cast<StoreInst>(I)->getOrdering();
- break;
- case Instruction::Load:
- Ordering = cast<LoadInst>(I)->getOrdering();
- break;
- case Instruction::Fence: {
- auto *FI = cast<FenceInst>(I);
- if (FI->getSyncScopeID() == SyncScope::SingleThread)
- return false;
- Ordering = FI->getOrdering();
- break;
- }
- case Instruction::AtomicCmpXchg: {
- AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
- AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
- // Only if both orderings are relaxed can the operation be treated as
- // relaxed. Otherwise it is non-relaxed.
- if (Success != AtomicOrdering::Unordered &&
- Success != AtomicOrdering::Monotonic)
- return true;
- if (Failure != AtomicOrdering::Unordered &&
- Failure != AtomicOrdering::Monotonic)
- return true;
- return false;
- }
- default:
- llvm_unreachable(
- "New atomic operations need to be known in the attributor.");
- }
-
- // Relaxed.
- if (Ordering == AtomicOrdering::Unordered ||
- Ordering == AtomicOrdering::Monotonic)
- return false;
- return true;
-}
-
-/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
-/// FIXME: We should improve the handling of intrinsics.
-bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- /// Element wise atomic memory intrinsics can only be unordered,
- /// therefore nosync.
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memcpy_element_unordered_atomic:
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- if (!cast<MemIntrinsic>(II)->isVolatile())
- return true;
- return false;
- default:
- return false;
- }
- }
- return false;
-}
-
-bool AANoSyncImpl::isVolatile(Instruction *I) {
- assert(!isa<CallBase>(I) && "Calls should not be checked here");
-
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- return cast<AtomicRMWInst>(I)->isVolatile();
- case Instruction::Store:
- return cast<StoreInst>(I)->isVolatile();
- case Instruction::Load:
- return cast<LoadInst>(I)->isVolatile();
- case Instruction::AtomicCmpXchg:
- return cast<AtomicCmpXchgInst>(I)->isVolatile();
- default:
- return false;
- }
-}
-
-ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
-
- auto CheckRWInstForNoSync = [&](Instruction &I) {
- /// We are looking for volatile instructions or Non-Relaxed atomics.
- /// FIXME: We should improve the handling of intrinsics.
-
- if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
- return true;
-
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- if (CB->hasFnAttr(Attribute::NoSync))
- return true;
-
- const auto &NoSyncAA =
- A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(*CB));
- if (NoSyncAA.isAssumedNoSync())
- return true;
- return false;
- }
-
- if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
- return true;
-
- return false;
- };
-
- auto CheckForNoSync = [&](Instruction &I) {
- // At this point we handled all read/write effects and they are all
- // nosync, so they can be skipped.
- if (I.mayReadOrWriteMemory())
- return true;
-
- // non-convergent and readnone imply nosync.
- return !cast<CallBase>(I).isConvergent();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
- !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
-}
-
-struct AANoSyncFunction final : public AANoSyncImpl {
- AANoSyncFunction(const IRPosition &IRP, Attributor &A)
- : AANoSyncImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
-};
-
-/// NoSync attribute deduction for call sites.
-struct AANoSyncCallSite final : AANoSyncImpl {
- AANoSyncCallSite(const IRPosition &IRP, Attributor &A)
- : AANoSyncImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoSyncImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ // Constants are valid everywhere, we can simply take them.
+ NewRVsMap[RetVal].insert(RIs.begin(), RIs.end());
+ continue;
+ }
+ }
+ };
+
+ for (auto &It : ReturnedValues)
+ HandleReturnValue(It.first, It.second);
+
+ // Because processing the new information can again lead to new return values
+ // we have to be careful and iterate until this iteration is complete. The
+ // idea is that we are in a stable state at the end of an update. All return
+ // values have been handled and properly categorized. We might not update
+ // again if we have not requested a non-fix attribute so we cannot "wait" for
+ // the next update to analyze a new return value.
+ while (!NewRVsMap.empty()) {
+ auto It = std::move(NewRVsMap.back());
+ NewRVsMap.pop_back();
+
+ assert(!It.second.empty() && "Entry does not add anything.");
+ auto &ReturnInsts = ReturnedValues[It.first];
+ for (ReturnInst *RI : It.second)
+ if (ReturnInsts.insert(RI)) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
+ << *It.first << " => " << *RI << "\n");
+ HandleReturnValue(It.first, ReturnInsts);
+ Changed = true;
+ }
+ }
+
+ Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+}
+
+struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
+ AAReturnedValuesFunction(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
+};
+
+/// Returned values information for a call site.
+struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
+ AAReturnedValuesCallSite(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for returned values are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// ------------------------ NoSync Function Attribute -------------------------
+
+struct AANoSyncImpl : AANoSync {
+ AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nosync" : "may-sync";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// Helper function used to determine whether an instruction is a non-relaxed
+ /// atomic, i.e., an atomic instruction that does not have unordered or
+ /// monotonic ordering.
+ static bool isNonRelaxedAtomic(Instruction *I);
+
+ /// Helper function used to determine whether an instruction is volatile.
+ static bool isVolatile(Instruction *I);
+
+ /// Helper function used to check whether an intrinsic (currently only
+ /// memcpy, memmove, and memset, plus their element-wise atomic variants)
+ /// is nosync.
+ static bool isNoSyncIntrinsic(Instruction *I);
+};
+
+bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
+ if (!I->isAtomic())
+ return false;
+
+ AtomicOrdering Ordering;
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ Ordering = cast<AtomicRMWInst>(I)->getOrdering();
+ break;
+ case Instruction::Store:
+ Ordering = cast<StoreInst>(I)->getOrdering();
+ break;
+ case Instruction::Load:
+ Ordering = cast<LoadInst>(I)->getOrdering();
+ break;
+ case Instruction::Fence: {
+ auto *FI = cast<FenceInst>(I);
+ if (FI->getSyncScopeID() == SyncScope::SingleThread)
+ return false;
+ Ordering = FI->getOrdering();
+ break;
+ }
+ case Instruction::AtomicCmpXchg: {
+ AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
+ AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
+ // Only if both orderings are relaxed can the operation be treated as
+ // relaxed; otherwise it is non-relaxed.
+ if (Success != AtomicOrdering::Unordered &&
+ Success != AtomicOrdering::Monotonic)
+ return true;
+ if (Failure != AtomicOrdering::Unordered &&
+ Failure != AtomicOrdering::Monotonic)
+ return true;
+ return false;
+ }
+ default:
+ llvm_unreachable(
+ "New atomic operations need to be known in the attributor.");
+ }
+
+ // Relaxed.
+ if (Ordering == AtomicOrdering::Unordered ||
+ Ordering == AtomicOrdering::Monotonic)
+ return false;
+ return true;
+}
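+// Illustrative examples for the helper above (informal, not exhaustive):
+//   %v = load atomic i32, i32* %p monotonic, align 4   ; relaxed          -> false
+//   %v = load atomic i32, i32* %p acquire, align 4     ; non-relaxed      -> true
+//   fence syncscope("singlethread") seq_cst            ; single-threaded  -> false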
+
+/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
+/// FIXME: We should improve the handling of intrinsics.
+bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ /// Element-wise atomic memory intrinsics can only be unordered and are
+ /// therefore nosync.
+ case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (!cast<MemIntrinsic>(II)->isVolatile())
+ return true;
+ return false;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
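+// Illustrative examples for isNoSyncIntrinsic (informal):
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 8, i1 false)  -> true
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 8, i1 true)   -> false (volatile)
+//   any other intrinsic                                                    -> false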
+
+bool AANoSyncImpl::isVolatile(Instruction *I) {
+ assert(!isa<CallBase>(I) && "Calls should not be checked here");
+
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ return cast<AtomicRMWInst>(I)->isVolatile();
+ case Instruction::Store:
+ return cast<StoreInst>(I)->isVolatile();
+ case Instruction::Load:
+ return cast<LoadInst>(I)->isVolatile();
+ case Instruction::AtomicCmpXchg:
+ return cast<AtomicCmpXchgInst>(I)->isVolatile();
+ default:
+ return false;
+ }
+}
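+// Illustrative examples for isVolatile (informal): "load volatile i32, i32* %p"
+// yields true, a plain "load i32, i32* %p" yields false; only loads, stores,
+// cmpxchg and atomicrmw are inspected, everything else defaults to false.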
+
+ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
+
+ auto CheckRWInstForNoSync = [&](Instruction &I) {
+ /// We are looking for volatile instructions or Non-Relaxed atomics.
+ /// FIXME: We should improve the handling of intrinsics.
+
+ if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->hasFnAttr(Attribute::NoSync))
+ return true;
+
+ const auto &NoSyncAA =
+ A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(*CB));
+ if (NoSyncAA.isAssumedNoSync())
+ return true;
+ return false;
+ }
+
+ if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
+ return true;
+
+ return false;
+ };
+
+ auto CheckForNoSync = [&](Instruction &I) {
+ // At this point we handled all read/write effects and they are all
+ // nosync, so they can be skipped.
+ if (I.mayReadOrWriteMemory())
+ return true;
+
+ // non-convergent and readnone imply nosync.
+ return !cast<CallBase>(I).isConvergent();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
+ !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+}
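+// Minimal sketch of the deduction above (hypothetical IR, not taken from the
+// source):
+//   define void @f(i32* %p) {
+//     %v = load i32, i32* %p
+//     ret void
+//   }
+// All read/write effects are non-volatile and non-atomic and there are no
+// call-like instructions, so both checks succeed and the assumed "nosync"
+// state is kept.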
+
+struct AANoSyncFunction final : public AANoSyncImpl {
+ AANoSyncFunction(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
+};
+
+/// NoSync attribute deduction for a call site.
+struct AANoSyncCallSite final : AANoSyncImpl {
+ AANoSyncCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoSyncImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
-};
-
-/// ------------------------ No-Free Attributes ----------------------------
-
-struct AANoFreeImpl : public AANoFree {
- AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoFree = [&](Instruction &I) {
- const auto &CB = cast<CallBase>(I);
- if (CB.hasFnAttr(Attribute::NoFree))
- return true;
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(CB));
- return NoFreeAA.isAssumedNoFree();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nofree" : "may-free";
- }
-};
-
-struct AANoFreeFunction final : public AANoFreeImpl {
- AANoFreeFunction(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
-};
-
-/// NoFree attribute deduction for a call site.
-struct AANoFreeCallSite final : AANoFreeImpl {
- AANoFreeCallSite(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoFreeImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
+};
+
+/// ------------------------ No-Free Attributes ----------------------------
+
+struct AANoFreeImpl : public AANoFree {
+ AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoFree = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoFree))
+ return true;
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(CB));
+ return NoFreeAA.isAssumedNoFree();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nofree" : "may-free";
+ }
+};
+
+struct AANoFreeFunction final : public AANoFreeImpl {
+ AANoFreeFunction(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
+};
+
+/// NoFree attribute deduction for a call site.
+struct AANoFreeCallSite final : AANoFreeImpl {
+ AANoFreeCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoFreeImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
-};
-
-/// NoFree attribute for floating values.
-struct AANoFreeFloating : AANoFreeImpl {
- AANoFreeFloating(const IRPosition &IRP, Attributor &A)
- : AANoFreeImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
- if (NoFreeAA.isAssumedNoFree())
- return ChangeStatus::UNCHANGED;
-
- Value &AssociatedValue = getIRPosition().getAssociatedValue();
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (CB->isBundleOperand(&U))
- return false;
- if (!CB->isArgOperand(&U))
- return true;
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoFreeArg = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
- return NoFreeArg.isAssumedNoFree();
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- Follow = true;
- return true;
- }
- if (isa<ReturnInst>(UserI))
- return true;
-
- // Unknown user.
- return false;
- };
- if (!A.checkForAllUses(Pred, *this, AssociatedValue))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-/// NoFree attribute for a function argument.
-struct AANoFreeArgument final : AANoFreeFloating {
- AANoFreeArgument(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
-};
-
-/// NoFree attribute for call site arguments.
-struct AANoFreeCallSiteArgument final : AANoFreeFloating {
- AANoFreeCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
+};
+
+/// NoFree attribute for floating values.
+struct AANoFreeFloating : AANoFreeImpl {
+ AANoFreeFloating(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
+ if (NoFreeAA.isAssumedNoFree())
+ return ChangeStatus::UNCHANGED;
+
+ Value &AssociatedValue = getIRPosition().getAssociatedValue();
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isBundleOperand(&U))
+ return false;
+ if (!CB->isArgOperand(&U))
+ return true;
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoFreeArg = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+ return NoFreeArg.isAssumedNoFree();
+ }
+
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ Follow = true;
+ return true;
+ }
+ if (isa<ReturnInst>(UserI))
+ return true;
+
+ // Unknown user.
+ return false;
+ };
+ if (!A.checkForAllUses(Pred, *this, AssociatedValue))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
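+// Illustrative note on the use-walk above (informal): GEPs, bitcasts, PHIs and
+// selects are followed transitively, a call argument defers to that call site
+// argument's nofree information, and any other user of the pointer (e.g.
+// storing it to memory) counts as unknown and forces the pessimistic fixpoint.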
+
+/// NoFree attribute for a function argument.
+struct AANoFreeArgument final : AANoFreeFloating {
+ AANoFreeArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
+};
+
+/// NoFree attribute for call site arguments.
+struct AANoFreeCallSiteArgument final : AANoFreeFloating {
+ AANoFreeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
-};
-
-/// NoFree attribute for function return value.
-struct AANoFreeReturned final : AANoFreeFloating {
- AANoFreeReturned(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoFree attribute deduction for a call site return value.
-struct AANoFreeCallSiteReturned final : AANoFreeFloating {
- AANoFreeCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoFreeFloating(IRP, A) {}
-
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
-};
-
-/// ------------------------ NonNull Argument Attribute ------------------------
-static int64_t getKnownNonNullAndDerefBytesForUse(
- Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
- const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
- TrackUse = false;
-
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return 0;
-
- Type *PtrTy = UseV->getType();
- const Function *F = I->getFunction();
- bool NullPointerIsDefined =
- F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
- const DataLayout &DL = A.getInfoCache().getDL();
- if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (CB->isBundleOperand(U)) {
- if (RetainedKnowledge RK = getKnowledgeFromUse(
- U, {Attribute::NonNull, Attribute::Dereferenceable})) {
- IsNonNull |=
- (RK.AttrKind == Attribute::NonNull || !NullPointerIsDefined);
- return RK.ArgValue;
- }
- return 0;
- }
-
- if (CB->isCallee(U)) {
- IsNonNull |= !NullPointerIsDefined;
- return 0;
- }
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
- /* TrackDependence */ false);
- IsNonNull |= DerefAA.isKnownNonNull();
- return DerefAA.getKnownDereferenceableBytes();
- }
-
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into. We can try to be smart to avoid looking through things we do not
- // like for now, e.g., non-inbounds GEPs.
- if (isa<CastInst>(I)) {
- TrackUse = true;
- return 0;
- }
-
- if (isa<GetElementPtrInst>(I)) {
- TrackUse = true;
- return 0;
- }
-
- int64_t Offset;
- const Value *Base =
- getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
- if (Base) {
- if (Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
-
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- /// Corner case when an offset is 0.
- Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
- /*AllowNonInbounds*/ true);
- if (Base) {
- if (Offset == 0 && Base == &AssociatedValue &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- return 0;
-}
-
-struct AANonNullImpl : AANonNull {
- AANonNullImpl(const IRPosition &IRP, Attributor &A)
- : AANonNull(IRP, A),
- NullIsDefined(NullPointerIsDefined(
- getAnchorScope(),
- getAssociatedValue().getType()->getPointerAddressSpace())) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Value &V = getAssociatedValue();
- if (!NullIsDefined &&
- hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
+};
+
+/// NoFree attribute for function return value.
+struct AANoFreeReturned final : AANoFreeFloating {
+ AANoFreeReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoFree attribute deduction for a call site return value.
+struct AANoFreeCallSiteReturned final : AANoFreeFloating {
+ AANoFreeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ ChangeStatus manifest(Attributor &A) override {
+ return ChangeStatus::UNCHANGED;
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
+};
+
+/// ------------------------ NonNull Argument Attribute ------------------------
+static int64_t getKnownNonNullAndDerefBytesForUse(
+ Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
+ const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
+ TrackUse = false;
+
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return 0;
+
+ Type *PtrTy = UseV->getType();
+ const Function *F = I->getFunction();
+ bool NullPointerIsDefined =
+ F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
+ const DataLayout &DL = A.getInfoCache().getDL();
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U)) {
+ if (RetainedKnowledge RK = getKnowledgeFromUse(
+ U, {Attribute::NonNull, Attribute::Dereferenceable})) {
+ IsNonNull |=
+ (RK.AttrKind == Attribute::NonNull || !NullPointerIsDefined);
+ return RK.ArgValue;
+ }
+ return 0;
+ }
+
+ if (CB->isCallee(U)) {
+ IsNonNull |= !NullPointerIsDefined;
+ return 0;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ IsNonNull |= DerefAA.isKnownNonNull();
+ return DerefAA.getKnownDereferenceableBytes();
+ }
+
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into. We can try to be smart to avoid looking through things we do not
+ // like for now, e.g., non-inbounds GEPs.
+ if (isa<CastInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ if (isa<GetElementPtrInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ int64_t Offset;
+ const Value *Base =
+ getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
+ if (Base) {
+ if (Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
+
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ /// Corner case when an offset is 0.
+ Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
+ /*AllowNonInbounds*/ true);
+ if (Base) {
+ if (Offset == 0 && Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ return 0;
+}
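+// Illustrative sketch for the helper above (hypothetical use, assumes i32 is
+// 4 bytes in the data layout): for "store i32 0, i32* %arg" in an address
+// space where null is not defined, the access implies %arg is nonnull and
+// dereferenceable for at least 4 bytes; casts and GEPs merely set TrackUse so
+// the walk continues to the eventual access.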
+
+struct AANonNullImpl : AANonNull {
+ AANonNullImpl(const IRPosition &IRP, Attributor &A)
+ : AANonNull(IRP, A),
+ NullIsDefined(NullPointerIsDefined(
+ getAnchorScope(),
+ getAssociatedValue().getType()->getPointerAddressSpace())) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (!NullIsDefined &&
+ hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
/* IgnoreSubsumingPositions */ false, &A)) {
- indicateOptimisticFixpoint();
+ indicateOptimisticFixpoint();
return;
}
if (isa<ConstantPointerNull>(V)) {
- indicatePessimisticFixpoint();
+ indicatePessimisticFixpoint();
return;
}
-
+
AANonNull::initialize(A);
- bool CanBeNull = true;
+ bool CanBeNull = true;
if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull)) {
if (!CanBeNull) {
- indicateOptimisticFixpoint();
+ indicateOptimisticFixpoint();
return;
}
}
-
+
if (isa<GlobalValue>(&getAssociatedValue())) {
indicatePessimisticFixpoint();
return;
@@ -1713,293 +1713,293 @@ struct AANonNullImpl : AANonNull {
if (Instruction *CtxI = getCtxI())
followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AANonNull::StateType &State) {
- bool IsNonNull = false;
- bool TrackUse = false;
- getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
- IsNonNull, TrackUse);
- State.setKnown(IsNonNull);
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nonnull" : "may-null";
- }
-
- /// Flag to determine if the underlying value can be null and still allow
- /// valid accesses.
- const bool NullIsDefined;
-};
-
-/// NonNull attribute for a floating value.
-struct AANonNullFloating : public AANonNullImpl {
- AANonNullFloating(const IRPosition &IRP, Attributor &A)
- : AANonNullImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- DominatorTree *DT = nullptr;
- AssumptionCache *AC = nullptr;
- InformationCache &InfoCache = A.getInfoCache();
- if (const Function *Fn = getAnchorScope()) {
- DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
- AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
- }
-
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- AANonNull::StateType &T, bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
- if (!isKnownNonZero(&V, DL, 0, AC, CtxI, DT))
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AANonNull::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
+ IsNonNull, TrackUse);
+ State.setKnown(IsNonNull);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nonnull" : "may-null";
+ }
+
+ /// Flag to determine if the underlying value can be null and still allow
+ /// valid accesses.
+ const bool NullIsDefined;
+};
+
+/// NonNull attribute for a floating value.
+struct AANonNullFloating : public AANonNullImpl {
+ AANonNullFloating(const IRPosition &IRP, Attributor &A)
+ : AANonNullImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ DominatorTree *DT = nullptr;
+ AssumptionCache *AC = nullptr;
+ InformationCache &InfoCache = A.getInfoCache();
+ if (const Function *Fn = getAnchorScope()) {
+ DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
+ AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
+ }
+
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ AANonNull::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+ if (!isKnownNonZero(&V, DL, 0, AC, CtxI, DT))
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
const AANonNull::StateType &NS = AA.getState();
- T ^= NS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AANonNull, StateType>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function return value.
-struct AANonNullReturned final
+ T ^= NS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AANonNull, StateType>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function return value.
+struct AANonNullReturned final
: AAReturnedFromReturnedValues<AANonNull, AANonNull> {
- AANonNullReturned(const IRPosition &IRP, Attributor &A)
+ AANonNullReturned(const IRPosition &IRP, Attributor &A)
: AAReturnedFromReturnedValues<AANonNull, AANonNull>(IRP, A) {}
-
+
/// See AbstractAttribute::getAsStr().
const std::string getAsStr() const override {
return getAssumed() ? "nonnull" : "may-null";
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function argument.
-struct AANonNullArgument final
- : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl> {
- AANonNullArgument(const IRPosition &IRP, Attributor &A)
- : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl>(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
-};
-
-struct AANonNullCallSiteArgument final : AANonNullFloating {
- AANonNullCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANonNullFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
-};
-
-/// NonNull attribute for a call site return position.
-struct AANonNullCallSiteReturned final
- : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl> {
- AANonNullCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl>(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
-};
-
-/// ------------------------ No-Recurse Attributes ----------------------------
-
-struct AANoRecurseImpl : public AANoRecurse {
- AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "norecurse" : "may-recurse";
- }
-};
-
-struct AANoRecurseFunction final : AANoRecurseImpl {
- AANoRecurseFunction(const IRPosition &IRP, Attributor &A)
- : AANoRecurseImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
- if (const Function *F = getAnchorScope())
- if (A.getInfoCache().getSccSize(*F) != 1)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- // If all live call sites are known to be no-recurse, we are as well.
- auto CallSitePred = [&](AbstractCallSite ACS) {
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
- *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
- /* TrackDependence */ false, DepClassTy::OPTIONAL);
- return NoRecurseAA.isKnownNoRecurse();
- };
- bool AllCallSitesKnown;
- if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) {
- // If we know all call sites and all are known no-recurse, we are done.
- // If all known call sites, which might not be all that exist, are known
- // to be no-recurse, we are not done but we can continue to assume
- // no-recurse. If one of the call sites we have not visited will become
- // live, another update is triggered.
- if (AllCallSitesKnown)
- indicateOptimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- // If the above check does not hold anymore we look at the calls.
- auto CheckForNoRecurse = [&](Instruction &I) {
- const auto &CB = cast<CallBase>(I);
- if (CB.hasFnAttr(Attribute::NoRecurse))
- return true;
-
- const auto &NoRecurseAA =
- A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(CB));
- if (!NoRecurseAA.isAssumedNoRecurse())
- return false;
-
- // Recursion to the same function
- if (CB.getCalledFunction() == getAnchorScope())
- return false;
-
- return true;
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
-};
-
-/// NoRecurse attribute deduction for a call site.
-struct AANoRecurseCallSite final : AANoRecurseImpl {
- AANoRecurseCallSite(const IRPosition &IRP, Attributor &A)
- : AANoRecurseImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function argument.
+struct AANonNullArgument final
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl> {
+ AANonNullArgument(const IRPosition &IRP, Attributor &A)
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
+};
+
+struct AANonNullCallSiteArgument final : AANonNullFloating {
+ AANonNullCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANonNullFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
+};
+
+/// NonNull attribute for a call site return position.
+struct AANonNullCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl> {
+ AANonNullCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
+};
+
+/// ------------------------ No-Recurse Attributes ----------------------------
+
+struct AANoRecurseImpl : public AANoRecurse {
+ AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "norecurse" : "may-recurse";
+ }
+};
+
+struct AANoRecurseFunction final : AANoRecurseImpl {
+ AANoRecurseFunction(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ if (const Function *F = getAnchorScope())
+ if (A.getInfoCache().getSccSize(*F) != 1)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ // If all live call sites are known to be no-recurse, we are as well.
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
+ *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
+ /* TrackDependence */ false, DepClassTy::OPTIONAL);
+ return NoRecurseAA.isKnownNoRecurse();
+ };
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) {
+ // If we know all call sites and all are known no-recurse, we are done.
+ // If all known call sites, which might not be all that exist, are known
+ // to be no-recurse, we are not done but we can continue to assume
+ // no-recurse. If one of the call sites we have not visited will become
+ // live, another update is triggered.
+ if (AllCallSitesKnown)
+ indicateOptimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // If the above check does not hold anymore we look at the calls.
+ auto CheckForNoRecurse = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoRecurse))
+ return true;
+
+ const auto &NoRecurseAA =
+ A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(CB));
+ if (!NoRecurseAA.isAssumedNoRecurse())
+ return false;
+
+ // Recursion to the same function
+ if (CB.getCalledFunction() == getAnchorScope())
+ return false;
+
+ return true;
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
+};
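+// Illustrative note on the deduction above (informal): if every known call
+// site of the function is known to be norecurse, the attribute is fixed
+// optimistically; otherwise each call inside the function must be assumed
+// norecurse, and any direct self-call immediately blocks the deduction.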
+
+/// NoRecurse attribute deduction for a call site.
+struct AANoRecurseCallSite final : AANoRecurseImpl {
+ AANoRecurseCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
-};
-
-/// -------------------- Undefined-Behavior Attributes ------------------------
-
-struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
- AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
- : AAUndefinedBehavior(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- // through a pointer (i.e. also branches etc.)
- ChangeStatus updateImpl(Attributor &A) override {
- const size_t UBPrevSize = KnownUBInsts.size();
- const size_t NoUBPrevSize = AssumedNoUBInsts.size();
-
- auto InspectMemAccessInstForUB = [&](Instruction &I) {
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // If we reach here, we know we have an instruction
- // that accesses memory through a pointer operand,
- // for which getPointerOperand() should give it to us.
- const Value *PtrOp = getPointerOperand(&I, /* AllowVolatile */ true);
- assert(PtrOp &&
- "Expected pointer operand of memory accessing instruction");
-
- // Either we stopped and the appropriate action was taken,
- // or we got back a simplified value to continue.
- Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I);
- if (!SimplifiedPtrOp.hasValue())
- return true;
- const Value *PtrOpVal = SimplifiedPtrOp.getValue();
-
- // A memory access through a pointer is considered UB
- // only if the pointer has constant null value.
- // TODO: Expand it to not only check constant values.
- if (!isa<ConstantPointerNull>(PtrOpVal)) {
- AssumedNoUBInsts.insert(&I);
- return true;
- }
- const Type *PtrTy = PtrOpVal->getType();
-
- // Because we only consider instructions inside functions,
- // assume that a parent function exists.
- const Function *F = I.getFunction();
-
- // A memory access using constant null pointer is only considered UB
- // if null pointer is _not_ defined for the target platform.
- if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
- AssumedNoUBInsts.insert(&I);
- else
- KnownUBInsts.insert(&I);
- return true;
- };
-
- auto InspectBrInstForUB = [&](Instruction &I) {
- // A conditional branch instruction is considered UB if it has `undef`
- // condition.
-
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // We know we have a branch instruction.
- auto BrInst = cast<BranchInst>(&I);
-
- // Unconditional branches are never considered UB.
- if (BrInst->isUnconditional())
- return true;
-
- // Either we stopped and the appropriate action was taken,
- // or we got back a simplified value to continue.
- Optional<Value *> SimplifiedCond =
- stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
- if (!SimplifiedCond.hasValue())
- return true;
- AssumedNoUBInsts.insert(&I);
- return true;
- };
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
+};
+
+/// -------------------- Undefined-Behavior Attributes ------------------------
+
+struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
+ AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ // through a pointer (i.e. also branches etc.)
+ ChangeStatus updateImpl(Attributor &A) override {
+ const size_t UBPrevSize = KnownUBInsts.size();
+ const size_t NoUBPrevSize = AssumedNoUBInsts.size();
+
+ auto InspectMemAccessInstForUB = [&](Instruction &I) {
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // If we reach here, we know we have an instruction
+ // that accesses memory through a pointer operand,
+ // for which getPointerOperand() should give it to us.
+ const Value *PtrOp = getPointerOperand(&I, /* AllowVolatile */ true);
+ assert(PtrOp &&
+ "Expected pointer operand of memory accessing instruction");
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I);
+ if (!SimplifiedPtrOp.hasValue())
+ return true;
+ const Value *PtrOpVal = SimplifiedPtrOp.getValue();
+
+ // A memory access through a pointer is considered UB
+ // only if the pointer has constant null value.
+ // TODO: Expand it to not only check constant values.
+ if (!isa<ConstantPointerNull>(PtrOpVal)) {
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ }
+ const Type *PtrTy = PtrOpVal->getType();
+
+ // Because we only consider instructions inside functions,
+ // assume that a parent function exists.
+ const Function *F = I.getFunction();
+
+ // A memory access using constant null pointer is only considered UB
+ // if null pointer is _not_ defined for the target platform.
+ if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
+ AssumedNoUBInsts.insert(&I);
+ else
+ KnownUBInsts.insert(&I);
+ return true;
+ };
+
+ auto InspectBrInstForUB = [&](Instruction &I) {
+ // A conditional branch instruction is considered UB if it has `undef`
+ // condition.
+
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // We know we have a branch instruction.
+ auto BrInst = cast<BranchInst>(&I);
+
+ // Unconditional branches are never considered UB.
+ if (BrInst->isUnconditional())
+ return true;
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedCond =
+ stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
+ if (!SimplifiedCond.hasValue())
+ return true;
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ };
+
auto InspectCallSiteForUB = [&](Instruction &I) {
// Check whether a callsite always cause UB or not
@@ -2092,13 +2092,13 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
return true;
};
- A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
- {Instruction::Load, Instruction::Store,
- Instruction::AtomicCmpXchg,
- Instruction::AtomicRMW},
- /* CheckBBLivenessOnly */ true);
- A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
- /* CheckBBLivenessOnly */ true);
+ A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
+ {Instruction::Load, Instruction::Store,
+ Instruction::AtomicCmpXchg,
+ Instruction::AtomicRMW},
+ /* CheckBBLivenessOnly */ true);
+ A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
+ /* CheckBBLivenessOnly */ true);
A.checkForAllCallLikeInstructions(InspectCallSiteForUB, *this);
  // If the returned position of the anchor scope has noundef attribute, check
@@ -2115,575 +2115,575 @@ struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
}
}
- if (NoUBPrevSize != AssumedNoUBInsts.size() ||
- UBPrevSize != KnownUBInsts.size())
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- bool isKnownToCauseUB(Instruction *I) const override {
- return KnownUBInsts.count(I);
- }
-
- bool isAssumedToCauseUB(Instruction *I) const override {
- // In simple words, if an instruction is not in the set of instructions
- // assumed to _not_ cause UB, then it is assumed to cause UB (that includes
- // those in the KnownUBInsts set). The rest of the boilerplate is there to
- // ensure that it is one of the instructions we test for UB.
-
- switch (I->getOpcode()) {
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::AtomicCmpXchg:
- case Instruction::AtomicRMW:
- return !AssumedNoUBInsts.count(I);
- case Instruction::Br: {
- auto BrInst = cast<BranchInst>(I);
- if (BrInst->isUnconditional())
- return false;
- return !AssumedNoUBInsts.count(I);
- } break;
- default:
- return false;
- }
- return false;
- }
-
- ChangeStatus manifest(Attributor &A) override {
- if (KnownUBInsts.empty())
- return ChangeStatus::UNCHANGED;
- for (Instruction *I : KnownUBInsts)
- A.changeToUnreachableAfterManifest(I);
- return ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "undefined-behavior" : "no-ub";
- }
-
- /// Note: The correctness of this analysis depends on the fact that the
- /// following 2 sets will stop changing after some point.
- /// "Change" here means that their size changes.
- /// The size of each set is monotonically increasing
- /// (we only add items to them) and it is upper bounded by the number of
- /// instructions in the processed function (we can never save more
- /// elements in either set than this number). Hence, at some point,
- /// they will stop increasing.
- /// Consequently, at some point, both sets will have stopped
- /// changing, effectively making the analysis reach a fixpoint.
-
- /// Note: These 2 sets are disjoint and an instruction can be considered
- /// one of 3 things:
- /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
- /// the KnownUBInsts set.
- /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
- /// has a reason to assume it).
- /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
- /// could not find a reason to assume or prove that it can cause UB,
- /// hence it assumes it doesn't. We have a set for these instructions
- /// so that we don't reprocess them in every update.
- /// Note however that instructions in this set may cause UB.
-
-protected:
- /// A set of all live instructions _known_ to cause UB.
- SmallPtrSet<Instruction *, 8> KnownUBInsts;
-
-private:
- /// A set of all the (live) instructions that are assumed to _not_ cause UB.
- SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
-
- // Should be called on updates in which if we're processing an instruction
- // \p I that depends on a value \p V, one of the following has to happen:
- // - If the value is assumed, then stop.
- // - If the value is known but undef, then consider it UB.
- // - Otherwise, do specific processing with the simplified value.
- // We return None in the first 2 cases to signify that an appropriate
- // action was taken and the caller should stop.
- // Otherwise, we return the simplified value that the caller should
- // use for specific processing.
- Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
- Instruction *I) {
- const auto &ValueSimplifyAA =
- A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(A);
- if (!ValueSimplifyAA.isKnown()) {
- // Don't depend on assumed values.
- return llvm::None;
- }
- if (!SimplifiedV.hasValue()) {
- // If it is known (which we tested above) but it doesn't have a value,
- // then we can assume `undef` and hence the instruction is UB.
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- Value *Val = SimplifiedV.getValue();
- if (isa<UndefValue>(Val)) {
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- return Val;
- }
-};
-
-struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
- AAUndefinedBehaviorFunction(const IRPosition &IRP, Attributor &A)
- : AAUndefinedBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECL(UndefinedBehaviorInstruction, Instruction,
- "Number of instructions known to have UB");
- BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
- KnownUBInsts.size();
- }
-};
-
-/// ------------------------ Will-Return Attributes ----------------------------
-
-// Helper function that checks whether a function has any cycle for which we
-// do not know whether it is bounded.
-// Loops with a known maximum trip count are considered bounded; any other
-// cycle is not.
-static bool mayContainUnboundedCycle(Function &F, Attributor &A) {
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(F);
- LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(F);
- // If either SCEV or LoopInfo is not available for the function then we
- // assume any cycle to be an unbounded cycle.
- // We use scc_iterator, which uses Tarjan's algorithm, to find all the
- // maximal SCCs. To detect if there's a cycle, we only need to find the
- // maximal ones.
- if (!SE || !LI) {
- for (scc_iterator<Function *> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI)
- if (SCCI.hasCycle())
- return true;
- return false;
- }
-
- // If there's irreducible control, the function may contain non-loop cycles.
- if (mayContainIrreducibleControl(F, LI))
- return true;
-
- // Any loop that does not have a max trip count is considered an unbounded
- // cycle.
- for (auto *L : LI->getLoopsInPreorder()) {
- if (!SE->getSmallConstantMaxTripCount(L))
- return true;
- }
- return false;
-}
-
-struct AAWillReturnImpl : public AAWillReturn {
- AAWillReturnImpl(const IRPosition &IRP, Attributor &A)
- : AAWillReturn(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAWillReturn::initialize(A);
-
- Function *F = getAnchorScope();
+ if (NoUBPrevSize != AssumedNoUBInsts.size() ||
+ UBPrevSize != KnownUBInsts.size())
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ bool isKnownToCauseUB(Instruction *I) const override {
+ return KnownUBInsts.count(I);
+ }
+
+ bool isAssumedToCauseUB(Instruction *I) const override {
+ // In simple words, if an instruction is not in the set of instructions
+ // assumed to _not_ cause UB, then it is assumed to cause UB (that includes
+ // those in the KnownUBInsts set). The rest of the boilerplate is there to
+ // ensure that it is one of the instructions we test for UB.
+
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ return !AssumedNoUBInsts.count(I);
+ case Instruction::Br: {
+ auto BrInst = cast<BranchInst>(I);
+ if (BrInst->isUnconditional())
+ return false;
+ return !AssumedNoUBInsts.count(I);
+ } break;
+ default:
+ return false;
+ }
+ return false;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ if (KnownUBInsts.empty())
+ return ChangeStatus::UNCHANGED;
+ for (Instruction *I : KnownUBInsts)
+ A.changeToUnreachableAfterManifest(I);
+ return ChangeStatus::CHANGED;
+ }
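+ // Illustrative note (informal): a load or store through a pointer that
+ // simplifies to a constant null in an address space where null is not
+ // defined, or a conditional branch whose condition is known to be undef,
+ // ends up in KnownUBInsts and is turned into unreachable by manifest().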
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "undefined-behavior" : "no-ub";
+ }
+
+ /// Note: The correctness of this analysis depends on the fact that the
+ /// following 2 sets will stop changing after some point.
+ /// "Change" here means that their size changes.
+ /// The size of each set is monotonically increasing
+ /// (we only add items to them) and it is upper bounded by the number of
+ /// instructions in the processed function (we can never save more
+ /// elements in either set than this number). Hence, at some point,
+ /// they will stop increasing.
+ /// Consequently, at some point, both sets will have stopped
+ /// changing, effectively making the analysis reach a fixpoint.
+
+ /// Note: These 2 sets are disjoint and an instruction can be considered
+ /// one of 3 things:
+ /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
+ /// the KnownUBInsts set.
+ /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
+ /// has a reason to assume it).
+ /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
+ /// could not find a reason to assume or prove that it can cause UB,
+ /// hence it assumes it doesn't. We have a set for these instructions
+ /// so that we don't reprocess them in every update.
+ /// Note however that instructions in this set may cause UB.
+
+protected:
+ /// A set of all live instructions _known_ to cause UB.
+ SmallPtrSet<Instruction *, 8> KnownUBInsts;
+
+private:
+ /// A set of all the (live) instructions that are assumed to _not_ cause UB.
+ SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
+
+  // Should be called on updates in which, if we're processing an instruction
+  // \p I that depends on a value \p V, one of the following has to happen:
+ // - If the value is assumed, then stop.
+ // - If the value is known but undef, then consider it UB.
+ // - Otherwise, do specific processing with the simplified value.
+ // We return None in the first 2 cases to signify that an appropriate
+ // action was taken and the caller should stop.
+ // Otherwise, we return the simplified value that the caller should
+ // use for specific processing.
+ Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
+ Instruction *I) {
+ const auto &ValueSimplifyAA =
+ A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+ if (!ValueSimplifyAA.isKnown()) {
+ // Don't depend on assumed values.
+ return llvm::None;
+ }
+ if (!SimplifiedV.hasValue()) {
+ // If it is known (which we tested above) but it doesn't have a value,
+ // then we can assume `undef` and hence the instruction is UB.
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ Value *Val = SimplifiedV.getValue();
+ if (isa<UndefValue>(Val)) {
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ return Val;
+ }
+};
+
+struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
+ AAUndefinedBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECL(UndefinedBehaviorInstruction, Instruction,
+ "Number of instructions known to have UB");
+ BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
+ KnownUBInsts.size();
+ }
+};
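+
+// Usage sketch (illustrative only, not part of this pass's logic): assuming an
+// Attributor `A`, a querying abstract attribute `QueryingAA`, and a function
+// `F`, a client could ask the deduced attribute which instructions are known
+// to trigger UB, e.g.:
+//
+//   const auto &UBAA =
+//       A.getAAFor<AAUndefinedBehavior>(QueryingAA, IRPosition::function(F));
+//   for (BasicBlock &BB : F)
+//     for (Instruction &I : BB)
+//       if (UBAA.isKnownToCauseUB(&I))
+//         ; // manifest() will replace such instructions with `unreachable`.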
+
+/// ------------------------ Will-Return Attributes ----------------------------
+
+// Helper function that checks whether a function has any cycle that is not
+// known to be bounded.
+// Loops with a computable maximum trip count are considered bounded; any other
+// cycle is not.
+static bool mayContainUnboundedCycle(Function &F, Attributor &A) {
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(F);
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(F);
+  // If either SCEV or LoopInfo is not available for the function, we
+  // conservatively assume every cycle to be unbounded.
+  // We use scc_iterator, which uses Tarjan's algorithm to find all the maximal
+  // SCCs. To detect if there's a cycle, we only need to find the maximal ones.
+ if (!SE || !LI) {
+ for (scc_iterator<Function *> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI)
+ if (SCCI.hasCycle())
+ return true;
+ return false;
+ }
+
+ // If there's irreducible control, the function may contain non-loop cycles.
+ if (mayContainIrreducibleControl(F, LI))
+ return true;
+
+  // Any loop that does not have a max trip count is considered an unbounded cycle.
+ for (auto *L : LI->getLoopsInPreorder()) {
+ if (!SE->getSmallConstantMaxTripCount(L))
+ return true;
+ }
+ return false;
+}
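+
+// For intuition, an illustrative source-level example (with
+// `external_predicate` a hypothetical opaque function): a loop whose maximum
+// trip count ScalarEvolution can compute is treated as bounded, while a loop
+// without such a bound makes the enclosing function ineligible for
+// `willreturn` below.
+//
+//   int bounded() {                     // max trip count of 16 is computable
+//     int S = 0;
+//     for (int I = 0; I != 16; ++I)
+//       S += I;
+//     return S;
+//   }
+//   int unbounded() {                   // no computable max trip count
+//     while (external_predicate())
+//       ;
+//     return 0;
+//   }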
+
+struct AAWillReturnImpl : public AAWillReturn {
+ AAWillReturnImpl(const IRPosition &IRP, Attributor &A)
+ : AAWillReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAWillReturn::initialize(A);
+
+ Function *F = getAnchorScope();
if (!F || F->isDeclaration() || mayContainUnboundedCycle(*F, A))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForWillReturn = [&](Instruction &I) {
- IRPosition IPos = IRPosition::callsite_function(cast<CallBase>(I));
- const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
- if (WillReturnAA.isKnownWillReturn())
- return true;
- if (!WillReturnAA.isAssumedWillReturn())
- return false;
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
- return NoRecurseAA.isAssumedNoRecurse();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "willreturn" : "may-noreturn";
- }
-};
-
-struct AAWillReturnFunction final : AAWillReturnImpl {
- AAWillReturnFunction(const IRPosition &IRP, Attributor &A)
- : AAWillReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
-};
-
-/// WillReturn attribute deduction for a call site.
-struct AAWillReturnCallSite final : AAWillReturnImpl {
- AAWillReturnCallSite(const IRPosition &IRP, Attributor &A)
- : AAWillReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForWillReturn = [&](Instruction &I) {
+ IRPosition IPos = IRPosition::callsite_function(cast<CallBase>(I));
+ const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
+ if (WillReturnAA.isKnownWillReturn())
+ return true;
+ if (!WillReturnAA.isAssumedWillReturn())
+ return false;
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
+ return NoRecurseAA.isAssumedNoRecurse();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "willreturn" : "may-noreturn";
+ }
+};
+
+struct AAWillReturnFunction final : AAWillReturnImpl {
+ AAWillReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
+};
+
+/// WillReturn attribute deduction for a call site.
+struct AAWillReturnCallSite final : AAWillReturnImpl {
+ AAWillReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
AAWillReturn::initialize(A);
- Function *F = getAssociatedFunction();
+ Function *F = getAssociatedFunction();
if (!F || !A.isFunctionIPOAmendable(*F))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
-};
-
-/// -------------------AAReachability Attribute--------------------------
-
-struct AAReachabilityImpl : AAReachability {
- AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
- : AAReachability(IRP, A) {}
-
- const std::string getAsStr() const override {
- // TODO: Return the number of reachable queries.
- return "reachable";
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-};
-
-struct AAReachabilityFunction final : public AAReachabilityImpl {
- AAReachabilityFunction(const IRPosition &IRP, Attributor &A)
- : AAReachabilityImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
-};
-
-/// ------------------------ NoAlias Argument Attribute ------------------------
-
-struct AANoAliasImpl : AANoAlias {
- AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
- assert(getAssociatedType()->isPointerTy() &&
- "Noalias is a pointer attribute");
- }
-
- const std::string getAsStr() const override {
- return getAssumed() ? "noalias" : "may-alias";
- }
-};
-
-/// NoAlias attribute for a floating value.
-struct AANoAliasFloating final : AANoAliasImpl {
- AANoAliasFloating(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Value *Val = &getAssociatedValue();
- do {
- CastInst *CI = dyn_cast<CastInst>(Val);
- if (!CI)
- break;
- Value *Base = CI->getOperand(0);
- if (!Base->hasOneUse())
- break;
- Val = Base;
- } while (true);
-
- if (!Val->getType()->isPointerTy()) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (isa<AllocaInst>(Val))
- indicateOptimisticFixpoint();
- else if (isa<ConstantPointerNull>(Val) &&
- !NullPointerIsDefined(getAnchorScope(),
- Val->getType()->getPointerAddressSpace()))
- indicateOptimisticFixpoint();
- else if (Val != &getAssociatedValue()) {
- const auto &ValNoAliasAA =
- A.getAAFor<AANoAlias>(*this, IRPosition::value(*Val));
- if (ValNoAliasAA.isKnownNoAlias())
- indicateOptimisticFixpoint();
- }
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Implement this.
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(noalias)
- }
-};
-
-/// NoAlias attribute for an argument.
-struct AANoAliasArgument final
- : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
- using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
- AANoAliasArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Base::initialize(A);
- // See callsite argument attribute and callee argument attribute.
- if (hasAttr({Attribute::ByVal}))
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::update(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // We have to make sure no-alias on the argument does not break
- // synchronization when this is a callback argument, see also [1] below.
- // If synchronization cannot be affected, we delegate to the base updateImpl
- // function, otherwise we give up for now.
-
- // If the function is no-sync, no-alias cannot break synchronization.
- const auto &NoSyncAA = A.getAAFor<AANoSync>(
- *this, IRPosition::function_scope(getIRPosition()));
- if (NoSyncAA.isAssumedNoSync())
- return Base::updateImpl(A);
-
- // If the argument is read-only, no-alias cannot break synchronization.
- const auto &MemBehaviorAA =
- A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (MemBehaviorAA.isAssumedReadOnly())
- return Base::updateImpl(A);
-
- // If the argument is never passed through callbacks, no-alias cannot break
- // synchronization.
- bool AllCallSitesKnown;
- if (A.checkForAllCallSites(
- [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
- true, AllCallSitesKnown))
- return Base::updateImpl(A);
-
- // TODO: add no-alias but make sure it doesn't break synchronization by
- // introducing fake uses. See:
- // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
- // International Workshop on OpenMP 2018,
- // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
-
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
-};
-
-struct AANoAliasCallSiteArgument final : AANoAliasImpl {
- AANoAliasCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // See callsite argument attribute and callee argument attribute.
- const auto &CB = cast<CallBase>(getAnchorValue());
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
+};
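+
+// Usage sketch (illustrative only; `A`, `QueryingAA`, and `CB` are assumed to
+// be an Attributor, the querying abstract attribute, and a CallBase): whether
+// a particular call is guaranteed to eventually return can be queried at the
+// call-site position directly:
+//
+//   const auto &WRAA = A.getAAFor<AAWillReturn>(
+//       QueryingAA, IRPosition::callsite_function(CB));
+//   bool GuaranteedToReturn = WRAA.isKnownWillReturn();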
+
+/// -------------------AAReachability Attribute--------------------------
+
+struct AAReachabilityImpl : AAReachability {
+ AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
+ : AAReachability(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ // TODO: Return the number of reachable queries.
+ return "reachable";
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+};
+
+struct AAReachabilityFunction final : public AAReachabilityImpl {
+ AAReachabilityFunction(const IRPosition &IRP, Attributor &A)
+ : AAReachabilityImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
+};
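+
+// Usage sketch (illustrative only; names are assumed): the no-alias deduction
+// below only needs to reason about uses that may execute before the call site,
+// which it checks through this attribute roughly as follows, given a scope
+// function `ScopeFn`, a user instruction `UserI`, and a context instruction
+// `CtxI`:
+//
+//   const auto &RA = A.getAAFor<AAReachability>(
+//       QueryingAA, IRPosition::function(*ScopeFn));
+//   bool MayReachCtx = RA.isAssumedReachable(A, *UserI, *CtxI);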
+
+/// ------------------------ NoAlias Argument Attribute ------------------------
+
+struct AANoAliasImpl : AANoAlias {
+ AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
+ assert(getAssociatedType()->isPointerTy() &&
+ "Noalias is a pointer attribute");
+ }
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noalias" : "may-alias";
+ }
+};
+
+/// NoAlias attribute for a floating value.
+struct AANoAliasFloating final : AANoAliasImpl {
+ AANoAliasFloating(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Value *Val = &getAssociatedValue();
+ do {
+ CastInst *CI = dyn_cast<CastInst>(Val);
+ if (!CI)
+ break;
+ Value *Base = CI->getOperand(0);
+ if (!Base->hasOneUse())
+ break;
+ Val = Base;
+ } while (true);
+
+ if (!Val->getType()->isPointerTy()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (isa<AllocaInst>(Val))
+ indicateOptimisticFixpoint();
+ else if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val->getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ else if (Val != &getAssociatedValue()) {
+ const auto &ValNoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::value(*Val));
+ if (ValNoAliasAA.isKnownNoAlias())
+ indicateOptimisticFixpoint();
+ }
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Implement this.
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(noalias)
+ }
+};
+
+/// NoAlias attribute for an argument.
+struct AANoAliasArgument final
+ : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
+ AANoAliasArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ // See callsite argument attribute and callee argument attribute.
+ if (hasAttr({Attribute::ByVal}))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::update(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // We have to make sure no-alias on the argument does not break
+ // synchronization when this is a callback argument, see also [1] below.
+ // If synchronization cannot be affected, we delegate to the base updateImpl
+ // function, otherwise we give up for now.
+
+ // If the function is no-sync, no-alias cannot break synchronization.
+ const auto &NoSyncAA = A.getAAFor<AANoSync>(
+ *this, IRPosition::function_scope(getIRPosition()));
+ if (NoSyncAA.isAssumedNoSync())
+ return Base::updateImpl(A);
+
+ // If the argument is read-only, no-alias cannot break synchronization.
+ const auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (MemBehaviorAA.isAssumedReadOnly())
+ return Base::updateImpl(A);
+
+ // If the argument is never passed through callbacks, no-alias cannot break
+ // synchronization.
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(
+ [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
+ true, AllCallSitesKnown))
+ return Base::updateImpl(A);
+
+ // TODO: add no-alias but make sure it doesn't break synchronization by
+ // introducing fake uses. See:
+ // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
+ // International Workshop on OpenMP 2018,
+ // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
+};
+
+struct AANoAliasCallSiteArgument final : AANoAliasImpl {
+ AANoAliasCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // See callsite argument attribute and callee argument attribute.
+ const auto &CB = cast<CallBase>(getAnchorValue());
if (CB.paramHasAttr(getCallSiteArgNo(), Attribute::NoAlias))
- indicateOptimisticFixpoint();
- Value &Val = getAssociatedValue();
- if (isa<ConstantPointerNull>(Val) &&
- !NullPointerIsDefined(getAnchorScope(),
- Val.getType()->getPointerAddressSpace()))
- indicateOptimisticFixpoint();
- }
-
- /// Determine if the underlying value may alias with the call site argument
-  /// \p OtherArgNo of \p CB (= the underlying call site).
- bool mayAliasWithArgument(Attributor &A, AAResults *&AAR,
- const AAMemoryBehavior &MemBehaviorAA,
- const CallBase &CB, unsigned OtherArgNo) {
- // We do not need to worry about aliasing with the underlying IRP.
+ indicateOptimisticFixpoint();
+ Value &Val = getAssociatedValue();
+ if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val.getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ }
+
+ /// Determine if the underlying value may alias with the call site argument
+  /// \p OtherArgNo of \p CB (= the underlying call site).
+ bool mayAliasWithArgument(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const CallBase &CB, unsigned OtherArgNo) {
+ // We do not need to worry about aliasing with the underlying IRP.
if (this->getCalleeArgNo() == (int)OtherArgNo)
- return false;
-
- // If it is not a pointer or pointer vector we do not alias.
- const Value *ArgOp = CB.getArgOperand(OtherArgNo);
- if (!ArgOp->getType()->isPtrOrPtrVectorTy())
- return false;
-
- auto &CBArgMemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, IRPosition::callsite_argument(CB, OtherArgNo),
- /* TrackDependence */ false);
-
- // If the argument is readnone, there is no read-write aliasing.
- if (CBArgMemBehaviorAA.isAssumedReadNone()) {
- A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return false;
- }
-
- // If the argument is readonly and the underlying value is readonly, there
- // is no read-write aliasing.
- bool IsReadOnly = MemBehaviorAA.isAssumedReadOnly();
- if (CBArgMemBehaviorAA.isAssumedReadOnly() && IsReadOnly) {
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return false;
- }
-
- // We have to utilize actual alias analysis queries so we need the object.
- if (!AAR)
- AAR = A.getInfoCache().getAAResultsForFunction(*getAnchorScope());
-
- // Try to rule it out at the call site.
- bool IsAliasing = !AAR || !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
- LLVM_DEBUG(dbgs() << "[NoAliasCSArg] Check alias between "
- "callsite arguments: "
- << getAssociatedValue() << " " << *ArgOp << " => "
- << (IsAliasing ? "" : "no-") << "alias \n");
-
- return IsAliasing;
- }
-
- bool
- isKnownNoAliasDueToNoAliasPreservation(Attributor &A, AAResults *&AAR,
- const AAMemoryBehavior &MemBehaviorAA,
- const AANoAlias &NoAliasAA) {
- // We can deduce "noalias" if the following conditions hold.
- // (i) Associated value is assumed to be noalias in the definition.
- // (ii) Associated value is assumed to be no-capture in all the uses
- // possibly executed before this callsite.
- // (iii) There is no other pointer argument which could alias with the
- // value.
-
- bool AssociatedValueIsNoAliasAtDef = NoAliasAA.isAssumedNoAlias();
- if (!AssociatedValueIsNoAliasAtDef) {
- LLVM_DEBUG(dbgs() << "[AANoAlias] " << getAssociatedValue()
- << " is not no-alias at the definition\n");
- return false;
- }
-
- A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL);
-
- const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ return false;
+
+ // If it is not a pointer or pointer vector we do not alias.
+ const Value *ArgOp = CB.getArgOperand(OtherArgNo);
+ if (!ArgOp->getType()->isPtrOrPtrVectorTy())
+ return false;
+
+ auto &CBArgMemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_argument(CB, OtherArgNo),
+ /* TrackDependence */ false);
+
+ // If the argument is readnone, there is no read-write aliasing.
+ if (CBArgMemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // If the argument is readonly and the underlying value is readonly, there
+ // is no read-write aliasing.
+ bool IsReadOnly = MemBehaviorAA.isAssumedReadOnly();
+ if (CBArgMemBehaviorAA.isAssumedReadOnly() && IsReadOnly) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // We have to utilize actual alias analysis queries so we need the object.
+ if (!AAR)
+ AAR = A.getInfoCache().getAAResultsForFunction(*getAnchorScope());
+
+ // Try to rule it out at the call site.
+ bool IsAliasing = !AAR || !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
+ LLVM_DEBUG(dbgs() << "[NoAliasCSArg] Check alias between "
+ "callsite arguments: "
+ << getAssociatedValue() << " " << *ArgOp << " => "
+ << (IsAliasing ? "" : "no-") << "alias \n");
+
+ return IsAliasing;
+ }
+
+ bool
+ isKnownNoAliasDueToNoAliasPreservation(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const AANoAlias &NoAliasAA) {
+ // We can deduce "noalias" if the following conditions hold.
+ // (i) Associated value is assumed to be noalias in the definition.
+ // (ii) Associated value is assumed to be no-capture in all the uses
+ // possibly executed before this callsite.
+ // (iii) There is no other pointer argument which could alias with the
+ // value.
+
+ bool AssociatedValueIsNoAliasAtDef = NoAliasAA.isAssumedNoAlias();
+ if (!AssociatedValueIsNoAliasAtDef) {
+ LLVM_DEBUG(dbgs() << "[AANoAlias] " << getAssociatedValue()
+ << " is not no-alias at the definition\n");
+ return false;
+ }
+
+ A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL);
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
const Function *ScopeFn = VIRP.getAnchorScope();
- auto &NoCaptureAA =
- A.getAAFor<AANoCapture>(*this, VIRP, /* TrackDependence */ false);
- // Check whether the value is captured in the scope using AANoCapture.
- // Look at CFG and check only uses possibly executed before this
- // callsite.
- auto UsePred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
-
+ auto &NoCaptureAA =
+ A.getAAFor<AANoCapture>(*this, VIRP, /* TrackDependence */ false);
+ // Check whether the value is captured in the scope using AANoCapture.
+ // Look at CFG and check only uses possibly executed before this
+ // callsite.
+ auto UsePred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+
    // If UserI is the current instruction and there is a single potential use
    // of the value in UserI, we allow the use.
// TODO: We should inspect the operands and allow those that cannot alias
// with the value.
if (UserI == getCtxI() && UserI->getNumOperands() == 1)
- return true;
-
- if (ScopeFn) {
- const auto &ReachabilityAA =
- A.getAAFor<AAReachability>(*this, IRPosition::function(*ScopeFn));
-
+ return true;
+
+ if (ScopeFn) {
+ const auto &ReachabilityAA =
+ A.getAAFor<AAReachability>(*this, IRPosition::function(*ScopeFn));
+
if (!ReachabilityAA.isAssumedReachable(A, *UserI, *getCtxI()))
- return true;
-
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (CB->isArgOperand(&U)) {
-
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- if (NoCaptureAA.isAssumedNoCapture())
- return true;
- }
- }
- }
-
- // For cases which can potentially have more users
- if (isa<GetElementPtrInst>(U) || isa<BitCastInst>(U) || isa<PHINode>(U) ||
- isa<SelectInst>(U)) {
- Follow = true;
- return true;
- }
-
- LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n");
- return false;
- };
-
- if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) {
- LLVM_DEBUG(
- dbgs() << "[AANoAliasCSArg] " << getAssociatedValue()
- << " cannot be noalias as it is potentially captured\n");
- return false;
- }
- }
- A.recordDependence(NoCaptureAA, *this, DepClassTy::OPTIONAL);
-
- // Check there is no other pointer argument which could alias with the
- // value passed at this call site.
- // TODO: AbstractCallSite
- const auto &CB = cast<CallBase>(getAnchorValue());
- for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
- OtherArgNo++)
- if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
- return false;
-
- return true;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // If the argument is readnone we are done as there are no accesses via the
- // argument.
- auto &MemBehaviorAA =
- A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(),
- /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadNone()) {
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return ChangeStatus::UNCHANGED;
- }
-
- const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
- const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, VIRP,
- /* TrackDependence */ false);
-
- AAResults *AAR = nullptr;
- if (isKnownNoAliasDueToNoAliasPreservation(A, AAR, MemBehaviorAA,
- NoAliasAA)) {
- LLVM_DEBUG(
- dbgs() << "[AANoAlias] No-Alias deduced via no-alias preservation\n");
- return ChangeStatus::UNCHANGED;
- }
-
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
-};
-
-/// NoAlias attribute for function return value.
-struct AANoAliasReturned final : AANoAliasImpl {
- AANoAliasReturned(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
+ return true;
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isArgOperand(&U)) {
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (NoCaptureAA.isAssumedNoCapture())
+ return true;
+ }
+ }
+ }
+
+ // For cases which can potentially have more users
+ if (isa<GetElementPtrInst>(U) || isa<BitCastInst>(U) || isa<PHINode>(U) ||
+ isa<SelectInst>(U)) {
+ Follow = true;
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n");
+ return false;
+ };
+
+ if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAliasCSArg] " << getAssociatedValue()
+ << " cannot be noalias as it is potentially captured\n");
+ return false;
+ }
+ }
+ A.recordDependence(NoCaptureAA, *this, DepClassTy::OPTIONAL);
+
+ // Check there is no other pointer argument which could alias with the
+ // value passed at this call site.
+ // TODO: AbstractCallSite
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
+ OtherArgNo++)
+ if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
+ return false;
+
+ return true;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // If the argument is readnone we are done as there are no accesses via the
+ // argument.
+ auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(),
+ /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, VIRP,
+ /* TrackDependence */ false);
+
+ AAResults *AAR = nullptr;
+ if (isKnownNoAliasDueToNoAliasPreservation(A, AAR, MemBehaviorAA,
+ NoAliasAA)) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAlias] No-Alias deduced via no-alias preservation\n");
+ return ChangeStatus::UNCHANGED;
+ }
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
+};
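+
+// For intuition, an illustrative source-level example (hypothetical, not taken
+// from a test): in
+//
+//   void callee(int *P);
+//   void caller() {
+//     int Local;       // an alloca, `noalias` at its definition
+//     callee(&Local);  // &Local is not captured before the call and no other
+//   }                  // pointer argument can alias it, so this call-site
+//                      // argument can be marked `noalias`.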
+
+/// NoAlias attribute for function return value.
+struct AANoAliasReturned final : AANoAliasImpl {
+ AANoAliasReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoAliasImpl::initialize(A);
@@ -2692,371 +2692,371 @@ struct AANoAliasReturned final : AANoAliasImpl {
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::updateImpl(...).
- virtual ChangeStatus updateImpl(Attributor &A) override {
-
- auto CheckReturnValue = [&](Value &RV) -> bool {
- if (Constant *C = dyn_cast<Constant>(&RV))
- if (C->isNullValue() || isa<UndefValue>(C))
- return true;
-
- /// For now, we can only deduce noalias if we have call sites.
- /// FIXME: add more support.
- if (!isa<CallBase>(&RV))
- return false;
-
- const IRPosition &RVPos = IRPosition::value(RV);
- const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
- if (!NoAliasAA.isAssumedNoAlias())
- return false;
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
- return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
-};
-
-/// NoAlias attribute deduction for a call site return value.
-struct AANoAliasCallSiteReturned final : AANoAliasImpl {
- AANoAliasCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoAliasImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::updateImpl(...).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ if (Constant *C = dyn_cast<Constant>(&RV))
+ if (C->isNullValue() || isa<UndefValue>(C))
+ return true;
+
+ /// For now, we can only deduce noalias if we have call sites.
+ /// FIXME: add more support.
+ if (!isa<CallBase>(&RV))
+ return false;
+
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
+ if (!NoAliasAA.isAssumedNoAlias())
+ return false;
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
+ return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
+};
+
+/// NoAlias attribute deduction for a call site return value.
+struct AANoAliasCallSiteReturned final : AANoAliasImpl {
+ AANoAliasCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::returned(*F);
- auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::returned(*F);
+ auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
-};
-
-/// -------------------AAIsDead Function Attribute-----------------------
-
-struct AAIsDeadValueImpl : public AAIsDead {
- AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
-
- /// See AAIsDead::isAssumedDead().
- bool isAssumedDead() const override { return getAssumed(); }
-
- /// See AAIsDead::isKnownDead().
- bool isKnownDead() const override { return getKnown(); }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isAssumedDead(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- return I == getCtxI() && isAssumedDead();
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return isAssumedDead(I) && getKnown();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return isAssumedDead() ? "assumed-dead" : "assumed-live";
- }
-
- /// Check if all uses are assumed dead.
- bool areAllUsesAssumedDead(Attributor &A, Value &V) {
- auto UsePred = [&](const Use &U, bool &Follow) { return false; };
-    // Explicitly set the dependence class to required because we want a long
-    // chain of N dependent instructions to be considered live as soon as one
-    // of them is, without going through N update cycles. This is not required
-    // for correctness.
- return A.checkForAllUses(UsePred, *this, V, DepClassTy::REQUIRED);
- }
-
- /// Determine if \p I is assumed to be side-effect free.
- bool isAssumedSideEffectFree(Attributor &A, Instruction *I) {
- if (!I || wouldInstructionBeTriviallyDead(I))
- return true;
-
- auto *CB = dyn_cast<CallBase>(I);
- if (!CB || isa<IntrinsicInst>(CB))
- return false;
-
- const IRPosition &CallIRP = IRPosition::callsite_function(*CB);
- const auto &NoUnwindAA = A.getAndUpdateAAFor<AANoUnwind>(
- *this, CallIRP, /* TrackDependence */ false);
- if (!NoUnwindAA.isAssumedNoUnwind())
- return false;
- if (!NoUnwindAA.isKnownNoUnwind())
- A.recordDependence(NoUnwindAA, *this, DepClassTy::OPTIONAL);
-
- const auto &MemBehaviorAA = A.getAndUpdateAAFor<AAMemoryBehavior>(
- *this, CallIRP, /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadOnly()) {
- if (!MemBehaviorAA.isKnownReadOnly())
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return true;
- }
- return false;
- }
-};
-
-struct AAIsDeadFloating : public AAIsDeadValueImpl {
- AAIsDeadFloating(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue())) {
- indicatePessimisticFixpoint();
- return;
- }
-
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- if (!isAssumedSideEffectFree(A, I))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- if (!isAssumedSideEffectFree(A, I))
- return indicatePessimisticFixpoint();
-
- if (!areAllUsesAssumedDead(A, getAssociatedValue()))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Value &V = getAssociatedValue();
- if (auto *I = dyn_cast<Instruction>(&V)) {
-      // If we get here we basically know that all users are dead. We re-check
-      // isAssumedSideEffectFree because it may no longer hold, in which case
-      // only the users are dead but the instruction (= the call) itself is
-      // still needed.
- if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
- A.deleteAfterManifest(*I);
- return ChangeStatus::CHANGED;
- }
- }
- if (V.use_empty())
- return ChangeStatus::UNCHANGED;
-
- bool UsedAssumedInformation = false;
- Optional<Constant *> C =
- A.getAssumedConstant(V, *this, UsedAssumedInformation);
- if (C.hasValue() && C.getValue())
- return ChangeStatus::UNCHANGED;
-
- // Replace the value with undef as it is dead but keep droppable uses around
- // as they provide information we don't want to give up on just yet.
- UndefValue &UV = *UndefValue::get(V.getType());
- bool AnyChange =
-        A.changeValueAfterManifest(V, UV, /* ChangeDroppable */ false);
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(IsDead)
- }
-};
-
-struct AAIsDeadArgument : public AAIsDeadFloating {
- AAIsDeadArgument(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (!A.isFunctionIPOAmendable(*getAnchorScope()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = AAIsDeadFloating::manifest(A);
- Argument &Arg = *getAssociatedArgument();
- if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {}))
- if (A.registerFunctionSignatureRewrite(
- Arg, /* ReplacementTypes */ {},
- Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
- Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) {
- Arg.dropDroppableUses();
- return ChangeStatus::CHANGED;
- }
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
- AAIsDeadCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
-    //       sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
+};
+
+/// -------------------AAIsDead Function Attribute-----------------------
+
+struct AAIsDeadValueImpl : public AAIsDead {
+ AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override { return getAssumed(); }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return getKnown(); }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ return I == getCtxI() && isAssumedDead();
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return isAssumedDead(I) && getKnown();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead() ? "assumed-dead" : "assumed-live";
+ }
+
+ /// Check if all uses are assumed dead.
+ bool areAllUsesAssumedDead(Attributor &A, Value &V) {
+ auto UsePred = [&](const Use &U, bool &Follow) { return false; };
+    // Explicitly set the dependence class to required because we want a long
+    // chain of N dependent instructions to be considered live as soon as one
+    // of them is, without going through N update cycles. This is not required
+    // for correctness.
+ return A.checkForAllUses(UsePred, *this, V, DepClassTy::REQUIRED);
+ }
+
+ /// Determine if \p I is assumed to be side-effect free.
+ bool isAssumedSideEffectFree(Attributor &A, Instruction *I) {
+ if (!I || wouldInstructionBeTriviallyDead(I))
+ return true;
+
+ auto *CB = dyn_cast<CallBase>(I);
+ if (!CB || isa<IntrinsicInst>(CB))
+ return false;
+
+ const IRPosition &CallIRP = IRPosition::callsite_function(*CB);
+ const auto &NoUnwindAA = A.getAndUpdateAAFor<AANoUnwind>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (!NoUnwindAA.isAssumedNoUnwind())
+ return false;
+ if (!NoUnwindAA.isKnownNoUnwind())
+ A.recordDependence(NoUnwindAA, *this, DepClassTy::OPTIONAL);
+
+ const auto &MemBehaviorAA = A.getAndUpdateAAFor<AAMemoryBehavior>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadOnly()) {
+ if (!MemBehaviorAA.isKnownReadOnly())
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return true;
+ }
+ return false;
+ }
+};
+
+struct AAIsDeadFloating : public AAIsDeadValueImpl {
+ AAIsDeadFloating(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ return indicatePessimisticFixpoint();
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (auto *I = dyn_cast<Instruction>(&V)) {
+      // If we get here we basically know that all users are dead. We re-check
+      // isAssumedSideEffectFree because it may no longer hold, in which case
+      // only the users are dead but the instruction (= the call) itself is
+      // still needed.
+ if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
+ A.deleteAfterManifest(*I);
+ return ChangeStatus::CHANGED;
+ }
+ }
+ if (V.use_empty())
+ return ChangeStatus::UNCHANGED;
+
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(V, *this, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue())
+ return ChangeStatus::UNCHANGED;
+
+ // Replace the value with undef as it is dead but keep droppable uses around
+ // as they provide information we don't want to give up on just yet.
+ UndefValue &UV = *UndefValue::get(V.getType());
+ bool AnyChange =
+        A.changeValueAfterManifest(V, UV, /* ChangeDroppable */ false);
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(IsDead)
+ }
+};
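+
+// Usage sketch (illustrative only; `A`, `QueryingAA`, `V`, and `I` are assumed
+// names): other abstract attributes typically consult liveness before spending
+// work on a value or instruction, roughly:
+//
+//   const auto &LivenessAA =
+//       A.getAAFor<AAIsDead>(QueryingAA, IRPosition::value(V));
+//   if (LivenessAA.isAssumedDead(&I))
+//     ; // Skip I; it is assumed dead and may be deleted at manifest time.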
+
+struct AAIsDeadArgument : public AAIsDeadFloating {
+ AAIsDeadArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (!A.isFunctionIPOAmendable(*getAnchorScope()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = AAIsDeadFloating::manifest(A);
+ Argument &Arg = *getAssociatedArgument();
+ if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {}))
+ if (A.registerFunctionSignatureRewrite(
+ Arg, /* ReplacementTypes */ {},
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) {
+ Arg.dropDroppableUses();
+ return ChangeStatus::CHANGED;
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
+ AAIsDeadCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- CallBase &CB = cast<CallBase>(getAnchorValue());
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ CallBase &CB = cast<CallBase>(getAnchorValue());
Use &U = CB.getArgOperandUse(getCallSiteArgNo());
- assert(!isa<UndefValue>(U.get()) &&
- "Expected undef values to be filtered out!");
- UndefValue &UV = *UndefValue::get(U->getType());
- if (A.changeUseAfterManifest(U, UV))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
- AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {}
-
- /// See AAIsDead::isAssumedDead().
- bool isAssumedDead() const override {
- return AAIsDeadFloating::isAssumedDead() && IsAssumedSideEffectFree;
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue())) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // We track this separately as a secondary state.
- IsAssumedSideEffectFree = isAssumedSideEffectFree(A, getCtxI());
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- if (IsAssumedSideEffectFree && !isAssumedSideEffectFree(A, getCtxI())) {
- IsAssumedSideEffectFree = false;
- Changed = ChangeStatus::CHANGED;
- }
-
- if (!areAllUsesAssumedDead(A, getAssociatedValue()))
- return indicatePessimisticFixpoint();
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (IsAssumedSideEffectFree)
- STATS_DECLTRACK_CSRET_ATTR(IsDead)
- else
- STATS_DECLTRACK_CSRET_ATTR(UnusedResult)
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return isAssumedDead()
- ? "assumed-dead"
- : (getAssumed() ? "assumed-dead-users" : "assumed-live");
- }
-
-private:
- bool IsAssumedSideEffectFree;
-};
-
-struct AAIsDeadReturned : public AAIsDeadValueImpl {
- AAIsDeadReturned(const IRPosition &IRP, Attributor &A)
- : AAIsDeadValueImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- A.checkForAllInstructions([](Instruction &) { return true; }, *this,
- {Instruction::Ret});
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- if (ACS.isCallbackCall() || !ACS.getInstruction())
- return false;
- return areAllUsesAssumedDead(A, *ACS.getInstruction());
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(PredForCallSite, *this, true,
- AllCallSitesKnown))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // TODO: Rewrite the signature to return void?
- bool AnyChange = false;
- UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
- auto RetInstPred = [&](Instruction &I) {
- ReturnInst &RI = cast<ReturnInst>(I);
- if (!isa<UndefValue>(RI.getReturnValue()))
- AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
- return true;
- };
- A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
-};
-
-struct AAIsDeadFunction : public AAIsDead {
- AAIsDeadFunction(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- const Function *F = getAnchorScope();
- if (F && !F->isDeclaration()) {
+ assert(!isa<UndefValue>(U.get()) &&
+ "Expected undef values to be filtered out!");
+ UndefValue &UV = *UndefValue::get(U->getType());
+ if (A.changeUseAfterManifest(U, UV))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
+ AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override {
+ return AAIsDeadFloating::isAssumedDead() && IsAssumedSideEffectFree;
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // We track this separately as a secondary state.
+ IsAssumedSideEffectFree = isAssumedSideEffectFree(A, getCtxI());
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ if (IsAssumedSideEffectFree && !isAssumedSideEffectFree(A, getCtxI())) {
+ IsAssumedSideEffectFree = false;
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (IsAssumedSideEffectFree)
+ STATS_DECLTRACK_CSRET_ATTR(IsDead)
+ else
+ STATS_DECLTRACK_CSRET_ATTR(UnusedResult)
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead()
+ ? "assumed-dead"
+ : (getAssumed() ? "assumed-dead-users" : "assumed-live");
+ }
+
+private:
+ bool IsAssumedSideEffectFree;
+};
+
+struct AAIsDeadReturned : public AAIsDeadValueImpl {
+ AAIsDeadReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ A.checkForAllInstructions([](Instruction &) { return true; }, *this,
+ {Instruction::Ret});
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isCallbackCall() || !ACS.getInstruction())
+ return false;
+ return areAllUsesAssumedDead(A, *ACS.getInstruction());
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Rewrite the signature to return void?
+ bool AnyChange = false;
+ UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
+ auto RetInstPred = [&](Instruction &I) {
+ ReturnInst &RI = cast<ReturnInst>(I);
+ if (!isa<UndefValue>(RI.getReturnValue()))
+ AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
+ return true;
+ };
+ A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
+};
+
+struct AAIsDeadFunction : public AAIsDead {
+ AAIsDeadFunction(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ const Function *F = getAnchorScope();
+ if (F && !F->isDeclaration()) {
// We only want to compute liveness once. If the function is not part of
// the SCC, skip it.
if (A.isRunOn(*const_cast<Function *>(F))) {
@@ -3065,267 +3065,267 @@ struct AAIsDeadFunction : public AAIsDead {
} else {
indicatePessimisticFixpoint();
}
- }
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
- std::to_string(getAnchorScope()->size()) + "][#TBEP " +
- std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
- std::to_string(KnownDeadEnds.size()) + "]";
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function &F = *getAnchorScope();
-
- if (AssumedLiveBlocks.empty()) {
- A.deleteAfterManifest(F);
- return ChangeStatus::CHANGED;
- }
-
- // Flag to determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
-    // function allows catching asynchronous exceptions.
- bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
-
- KnownDeadEnds.set_union(ToBeExploredFrom);
- for (const Instruction *DeadEndI : KnownDeadEnds) {
- auto *CB = dyn_cast<CallBase>(DeadEndI);
- if (!CB)
- continue;
- const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
- *this, IRPosition::callsite_function(*CB), /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- bool MayReturn = !NoReturnAA.isAssumedNoReturn();
- if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
- continue;
-
- if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
- A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
- else
- A.changeToUnreachableAfterManifest(
- const_cast<Instruction *>(DeadEndI->getNextNode()));
- HasChanged = ChangeStatus::CHANGED;
- }
-
- STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
- for (BasicBlock &BB : F)
- if (!AssumedLiveBlocks.count(&BB)) {
- A.deleteAfterManifest(BB);
- ++BUILD_STAT_NAME(AAIsDead, BasicBlock);
- }
-
- return HasChanged;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
+ }
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
+ std::to_string(getAnchorScope()->size()) + "][#TBEP " +
+ std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
+ std::to_string(KnownDeadEnds.size()) + "]";
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function &F = *getAnchorScope();
+
+ if (AssumedLiveBlocks.empty()) {
+ A.deleteAfterManifest(F);
+ return ChangeStatus::CHANGED;
+ }
+
+ // Flag to determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+    // function allows catching asynchronous exceptions.
+ bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
+
+ KnownDeadEnds.set_union(ToBeExploredFrom);
+ for (const Instruction *DeadEndI : KnownDeadEnds) {
+ auto *CB = dyn_cast<CallBase>(DeadEndI);
+ if (!CB)
+ continue;
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ *this, IRPosition::callsite_function(*CB), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ bool MayReturn = !NoReturnAA.isAssumedNoReturn();
+ if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
+ continue;
+
+ if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
+ A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
+ else
+ A.changeToUnreachableAfterManifest(
+ const_cast<Instruction *>(DeadEndI->getNextNode()));
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
+ for (BasicBlock &BB : F)
+ if (!AssumedLiveBlocks.count(&BB)) {
+ A.deleteAfterManifest(BB);
+ ++BUILD_STAT_NAME(AAIsDead, BasicBlock);
+ }
+
+ return HasChanged;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
bool isEdgeDead(const BasicBlock *From, const BasicBlock *To) const override {
return !AssumedLiveEdges.count(std::make_pair(From, To));
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// Returns true if the function is assumed dead.
- bool isAssumedDead() const override { return false; }
-
- /// See AAIsDead::isKnownDead().
- bool isKnownDead() const override { return false; }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override {
- assert(BB->getParent() == getAnchorScope() &&
- "BB must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
- return !AssumedLiveBlocks.count(BB);
- }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override {
- return getKnown() && isAssumedDead(BB);
- }
-
- /// See AAIsDead::isAssumed(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- assert(I->getParent()->getParent() == getAnchorScope() &&
- "Instruction must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
-
-    // If it is not in AssumedLiveBlocks then it is for sure dead.
-    // Otherwise, it can still be after a noreturn call in a live block.
- if (!AssumedLiveBlocks.count(I->getParent()))
- return true;
-
- // If it is not after a liveness barrier it is live.
- const Instruction *PrevI = I->getPrevNode();
- while (PrevI) {
- if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
- return true;
- PrevI = PrevI->getPrevNode();
- }
- return false;
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return getKnown() && isAssumedDead(I);
- }
-
- /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
-  /// that internal functions called from \p BB should now be looked at.
- bool assumeLive(Attributor &A, const BasicBlock &BB) {
- if (!AssumedLiveBlocks.insert(&BB).second)
- return false;
-
- // We assume that all of BB is (probably) live now and if there are calls to
- // internal functions we will assume that those are now live as well. This
- // is a performance optimization for blocks with calls to a lot of internal
- // functions. It can however cause dead functions to be treated as live.
- for (const Instruction &I : BB)
- if (const auto *CB = dyn_cast<CallBase>(&I))
- if (const Function *F = CB->getCalledFunction())
- if (F->hasLocalLinkage())
- A.markLiveInternalFunction(*F);
- return true;
- }
-
- /// Collection of instructions that need to be explored again, e.g., we
- /// did assume they do not transfer control to (one of their) successors.
- SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
-
- /// Collection of instructions that are known to not transfer control.
- SmallSetVector<const Instruction *, 8> KnownDeadEnds;
-
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// Returns true if the function is assumed dead.
+ bool isAssumedDead() const override { return false; }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override {
+ assert(BB->getParent() == getAnchorScope() &&
+ "BB must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+ return !AssumedLiveBlocks.count(BB);
+ }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override {
+ return getKnown() && isAssumedDead(BB);
+ }
+
+ /// See AAIsDead::isAssumed(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ assert(I->getParent()->getParent() == getAnchorScope() &&
+ "Instruction must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+
+    // If it is not in AssumedLiveBlocks then it is for sure dead.
+    // Otherwise, it can still be after a noreturn call in a live block.
+ if (!AssumedLiveBlocks.count(I->getParent()))
+ return true;
+
+ // If it is not after a liveness barrier it is live.
+ const Instruction *PrevI = I->getPrevNode();
+ while (PrevI) {
+ if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
+ return true;
+ PrevI = PrevI->getPrevNode();
+ }
+ return false;
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return getKnown() && isAssumedDead(I);
+ }
+
+ /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
+  /// that internal functions called from \p BB should now be looked at.
+ bool assumeLive(Attributor &A, const BasicBlock &BB) {
+ if (!AssumedLiveBlocks.insert(&BB).second)
+ return false;
+
+ // We assume that all of BB is (probably) live now and if there are calls to
+ // internal functions we will assume that those are now live as well. This
+ // is a performance optimization for blocks with calls to a lot of internal
+ // functions. It can however cause dead functions to be treated as live.
+ for (const Instruction &I : BB)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const Function *F = CB->getCalledFunction())
+ if (F->hasLocalLinkage())
+ A.markLiveInternalFunction(*F);
+ return true;
+ }
+
+ /// Collection of instructions that need to be explored again, e.g., we
+ /// did assume they do not transfer control to (one of their) successors.
+ SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
+
+ /// Collection of instructions that are known to not transfer control.
+ SmallSetVector<const Instruction *, 8> KnownDeadEnds;
+
/// Collection of all assumed live edges
DenseSet<std::pair<const BasicBlock *, const BasicBlock *>> AssumedLiveEdges;
- /// Collection of all assumed live BasicBlocks.
- DenseSet<const BasicBlock *> AssumedLiveBlocks;
-};
-
-static bool
-identifyAliveSuccessors(Attributor &A, const CallBase &CB,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- const IRPosition &IPos = IRPosition::callsite_function(CB);
-
- const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
- AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (NoReturnAA.isAssumedNoReturn())
- return !NoReturnAA.isKnownNoReturn();
- if (CB.isTerminator())
- AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
- else
- AliveSuccessors.push_back(CB.getNextNode());
- return false;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation =
- identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
-
- // First, determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
-  // function allows catching asynchronous exceptions.
- if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- } else {
- const IRPosition &IPos = IRPosition::callsite_function(II);
- const auto &AANoUnw = A.getAndUpdateAAFor<AANoUnwind>(
- AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (AANoUnw.isAssumedNoUnwind()) {
- UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
- } else {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- }
- }
- return UsedAssumedInformation;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- if (BI.getNumSuccessors() == 1) {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- } else {
- Optional<ConstantInt *> CI = getAssumedConstantInt(
- A, *BI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume both edges are dead.
- } else if (CI.getValue()) {
- const BasicBlock *SuccBB =
- BI.getSuccessor(1 - CI.getValue()->getZExtValue());
- AliveSuccessors.push_back(&SuccBB->front());
- } else {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
- UsedAssumedInformation = false;
- }
- }
- return UsedAssumedInformation;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- Optional<ConstantInt *> CI =
- getAssumedConstantInt(A, *SI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume all edges are dead.
- } else if (CI.getValue()) {
- for (auto &CaseIt : SI.cases()) {
- if (CaseIt.getCaseValue() == CI.getValue()) {
- AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
- return UsedAssumedInformation;
- }
- }
- AliveSuccessors.push_back(&SI.getDefaultDest()->front());
- return UsedAssumedInformation;
- } else {
- for (const BasicBlock *SuccBB : successors(SI.getParent()))
- AliveSuccessors.push_back(&SuccBB->front());
- }
- return UsedAssumedInformation;
-}
-
-ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
- ChangeStatus Change = ChangeStatus::UNCHANGED;
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
- << getAnchorScope()->size() << "] BBs and "
- << ToBeExploredFrom.size() << " exploration points and "
- << KnownDeadEnds.size() << " known dead ends\n");
-
- // Copy and clear the list of instructions we need to explore from. It is
- // refilled with instructions the next update has to look at.
- SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
- ToBeExploredFrom.end());
- decltype(ToBeExploredFrom) NewToBeExploredFrom;
-
- SmallVector<const Instruction *, 8> AliveSuccessors;
- while (!Worklist.empty()) {
- const Instruction *I = Worklist.pop_back_val();
- LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
-
+ /// Collection of all assumed live BasicBlocks.
+ DenseSet<const BasicBlock *> AssumedLiveBlocks;
+};
+
+static bool
+identifyAliveSuccessors(Attributor &A, const CallBase &CB,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ const IRPosition &IPos = IRPosition::callsite_function(CB);
+
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoReturnAA.isAssumedNoReturn())
+ return !NoReturnAA.isKnownNoReturn();
+ if (CB.isTerminator())
+ AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
+ else
+ AliveSuccessors.push_back(CB.getNextNode());
+ return false;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation =
+ identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
+
+ // First, determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+  // function allows catching asynchronous exceptions.
+ if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ } else {
+ const IRPosition &IPos = IRPosition::callsite_function(II);
+ const auto &AANoUnw = A.getAndUpdateAAFor<AANoUnwind>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (AANoUnw.isAssumedNoUnwind()) {
+ UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
+ } else {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ if (BI.getNumSuccessors() == 1) {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ } else {
+ Optional<ConstantInt *> CI = getAssumedConstantInt(
+ A, *BI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume both edges are dead.
+ } else if (CI.getValue()) {
+ const BasicBlock *SuccBB =
+ BI.getSuccessor(1 - CI.getValue()->getZExtValue());
+ AliveSuccessors.push_back(&SuccBB->front());
+ } else {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
+ UsedAssumedInformation = false;
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ Optional<ConstantInt *> CI =
+ getAssumedConstantInt(A, *SI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume all edges are dead.
+ } else if (CI.getValue()) {
+ for (auto &CaseIt : SI.cases()) {
+ if (CaseIt.getCaseValue() == CI.getValue()) {
+ AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
+ return UsedAssumedInformation;
+ }
+ }
+ AliveSuccessors.push_back(&SI.getDefaultDest()->front());
+ return UsedAssumedInformation;
+ } else {
+ for (const BasicBlock *SuccBB : successors(SI.getParent()))
+ AliveSuccessors.push_back(&SuccBB->front());
+ }
+ return UsedAssumedInformation;
+}
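Both the branch and the switch handling above follow the same three-way pattern: if the condition has not been simplified yet, no successor is marked alive (optimistically, pending more information); if it folds to a constant, only the taken successor is alive; otherwise every successor is alive. The short sketch below restates that decision for a two-way branch; CondState and aliveBranchSuccessors are assumed, illustrative names rather than LLVM types.

#include <cstdio>
#include <vector>

// Three-state condition, mirroring the Optional<ConstantInt *> used above:
//  - Pending:  no simplified value yet   -> assume all edges dead for now
//  - Constant: condition folds to a bool -> only the taken edge is alive
//  - Unknown:  condition is not constant -> every successor is alive
enum class CondState { Pending, Constant, Unknown };

static std::vector<int> aliveBranchSuccessors(CondState State, bool Taken,
                                              int TrueSucc, int FalseSucc) {
  switch (State) {
  case CondState::Pending:
    return {}; // optimistically dead until more is known
  case CondState::Constant:
    return {Taken ? TrueSucc : FalseSucc};
  case CondState::Unknown:
    return {TrueSucc, FalseSucc};
  }
  return {};
}

int main() {
  for (int S : aliveBranchSuccessors(CondState::Constant, true, 1, 2))
    std::printf("alive successor: %d\n", S); // only block 1
  for (int S : aliveBranchSuccessors(CondState::Unknown, false, 1, 2))
    std::printf("alive successor: %d\n", S); // blocks 1 and 2
}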
+
+ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
+ << getAnchorScope()->size() << "] BBs and "
+ << ToBeExploredFrom.size() << " exploration points and "
+ << KnownDeadEnds.size() << " known dead ends\n");
+
+ // Copy and clear the list of instructions we need to explore from. It is
+ // refilled with instructions the next update has to look at.
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
+ ToBeExploredFrom.end());
+ decltype(ToBeExploredFrom) NewToBeExploredFrom;
+
+ SmallVector<const Instruction *, 8> AliveSuccessors;
+ while (!Worklist.empty()) {
+ const Instruction *I = Worklist.pop_back_val();
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
+
// Fast forward for uninteresting instructions. We could look for UB here
// though.
while (!I->isTerminator() && !isa<CallBase>(I)) {
@@ -3333,525 +3333,525 @@ ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
I = I->getNextNode();
}
- AliveSuccessors.clear();
-
- bool UsedAssumedInformation = false;
- switch (I->getOpcode()) {
- // TODO: look for (assumed) UB to backwards propagate "deadness".
- default:
+ AliveSuccessors.clear();
+
+ bool UsedAssumedInformation = false;
+ switch (I->getOpcode()) {
+ // TODO: look for (assumed) UB to backwards propagate "deadness".
+ default:
assert(I->isTerminator() &&
"Expected non-terminators to be handled already!");
for (const BasicBlock *SuccBB : successors(I->getParent()))
AliveSuccessors.push_back(&SuccBB->front());
- break;
- case Instruction::Call:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Invoke:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Br:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Switch:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
- *this, AliveSuccessors);
- break;
- }
-
- if (UsedAssumedInformation) {
- NewToBeExploredFrom.insert(I);
- } else {
- Change = ChangeStatus::CHANGED;
- if (AliveSuccessors.empty() ||
- (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
- KnownDeadEnds.insert(I);
- }
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
- << AliveSuccessors.size() << " UsedAssumedInformation: "
- << UsedAssumedInformation << "\n");
-
- for (const Instruction *AliveSuccessor : AliveSuccessors) {
- if (!I->isTerminator()) {
- assert(AliveSuccessors.size() == 1 &&
- "Non-terminator expected to have a single successor!");
- Worklist.push_back(AliveSuccessor);
- } else {
+ break;
+ case Instruction::Call:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Invoke:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Br:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Switch:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ }
+
+ if (UsedAssumedInformation) {
+ NewToBeExploredFrom.insert(I);
+ } else {
+ Change = ChangeStatus::CHANGED;
+ if (AliveSuccessors.empty() ||
+ (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
+ KnownDeadEnds.insert(I);
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
+ << AliveSuccessors.size() << " UsedAssumedInformation: "
+ << UsedAssumedInformation << "\n");
+
+ for (const Instruction *AliveSuccessor : AliveSuccessors) {
+ if (!I->isTerminator()) {
+ assert(AliveSuccessors.size() == 1 &&
+ "Non-terminator expected to have a single successor!");
+ Worklist.push_back(AliveSuccessor);
+ } else {
// record the assumed live edge
AssumedLiveEdges.insert(
std::make_pair(I->getParent(), AliveSuccessor->getParent()));
- if (assumeLive(A, *AliveSuccessor->getParent()))
- Worklist.push_back(AliveSuccessor);
- }
- }
- }
-
- ToBeExploredFrom = std::move(NewToBeExploredFrom);
-
- // If we know everything is live there is no need to query for liveness.
- // Instead, indicating a pessimistic fixpoint will cause the state to be
- // "invalid" and all queries to be answered conservatively without lookups.
-  // To be in this state we must (1) have finished the exploration, (2) not
-  // have discovered any non-trivial dead end, and (3) not have ruled
-  // unreachable code dead.
- if (ToBeExploredFrom.empty() &&
- getAnchorScope()->size() == AssumedLiveBlocks.size() &&
- llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
- return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
- }))
- return indicatePessimisticFixpoint();
- return Change;
-}
-
-/// Liveness information for a call site.
-struct AAIsDeadCallSite final : AAIsDeadFunction {
- AAIsDeadCallSite(const IRPosition &IRP, Attributor &A)
- : AAIsDeadFunction(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for liveness are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// -------------------- Dereferenceable Argument Attribute --------------------
-
-template <>
-ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
- const DerefState &R) {
- ChangeStatus CS0 =
- clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
- ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
- return CS0 | CS1;
-}
-
-struct AADereferenceableImpl : AADereferenceable {
- AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
- : AADereferenceable(IRP, A) {}
- using StateType = DerefState;
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
- Attrs, /* IgnoreSubsumingPositions */ false, &A);
- for (const Attribute &Attr : Attrs)
- takeKnownDerefBytesMaximum(Attr.getValueAsInt());
-
- const IRPosition &IRP = this->getIRPosition();
- NonNullAA = &A.getAAFor<AANonNull>(*this, IRP,
- /* TrackDependence */ false);
-
- bool CanBeNull;
- takeKnownDerefBytesMaximum(
- IRP.getAssociatedValue().getPointerDereferenceableBytes(
- A.getDataLayout(), CanBeNull));
-
- bool IsFnInterface = IRP.isFnInterfaceKind();
- Function *FnScope = IRP.getAnchorScope();
- if (IsFnInterface && (!FnScope || !A.isFunctionIPOAmendable(*FnScope))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (Instruction *CtxI = getCtxI())
- followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See AbstractAttribute::getState()
- /// {
- StateType &getState() override { return *this; }
- const StateType &getState() const override { return *this; }
- /// }
-
- /// Helper function for collecting accessed bytes in must-be-executed-context
- void addAccessedBytesForUse(Attributor &A, const Use *U, const Instruction *I,
- DerefState &State) {
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return;
-
- Type *PtrTy = UseV->getType();
- const DataLayout &DL = A.getDataLayout();
- int64_t Offset;
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Base == &getAssociatedValue() &&
- getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
- State.addAccessedBytes(Offset, Size);
- }
- }
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AADereferenceable::StateType &State) {
- bool IsNonNull = false;
- bool TrackUse = false;
- int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
- A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
- LLVM_DEBUG(dbgs() << "[AADereferenceable] Deref bytes: " << DerefBytes
- << " for instruction " << *I << "\n");
-
- addAccessedBytesForUse(A, U, I, State);
- State.takeKnownDerefBytesMaximum(DerefBytes);
- return TrackUse;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Change = AADereferenceable::manifest(A);
- if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
- removeAttrs({Attribute::DereferenceableOrNull});
- return ChangeStatus::CHANGED;
- }
- return Change;
- }
-
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- // TODO: Add *_globally support
- if (isAssumedNonNull())
- Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
- Ctx, getAssumedDereferenceableBytes()));
- else
- Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
- Ctx, getAssumedDereferenceableBytes()));
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- if (!getAssumedDereferenceableBytes())
- return "unknown-dereferenceable";
- return std::string("dereferenceable") +
- (isAssumedNonNull() ? "" : "_or_null") +
- (isAssumedGlobal() ? "_globally" : "") + "<" +
- std::to_string(getKnownDereferenceableBytes()) + "-" +
- std::to_string(getAssumedDereferenceableBytes()) + ">";
- }
-};
-
-/// Dereferenceable attribute for a floating value.
-struct AADereferenceableFloating : AADereferenceableImpl {
- AADereferenceableFloating(const IRPosition &IRP, Attributor &A)
- : AADereferenceableImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
- bool Stripped) -> bool {
- unsigned IdxWidth =
- DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
- APInt Offset(IdxWidth, 0);
- const Value *Base =
- stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
-
- const auto &AA =
- A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
- int64_t DerefBytes = 0;
- if (!Stripped && this == &AA) {
- // Use IR information if we did not strip anything.
- // TODO: track globally.
- bool CanBeNull;
- DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
- T.GlobalState.indicatePessimisticFixpoint();
- } else {
+ if (assumeLive(A, *AliveSuccessor->getParent()))
+ Worklist.push_back(AliveSuccessor);
+ }
+ }
+ }
+
+ ToBeExploredFrom = std::move(NewToBeExploredFrom);
+
+ // If we know everything is live there is no need to query for liveness.
+ // Instead, indicating a pessimistic fixpoint will cause the state to be
+ // "invalid" and all queries to be answered conservatively without lookups.
+  // To be in this state we must (1) have finished the exploration, (2) not
+  // have discovered any non-trivial dead end, and (3) not have ruled
+  // unreachable code dead.
+ if (ToBeExploredFrom.empty() &&
+ getAnchorScope()->size() == AssumedLiveBlocks.size() &&
+ llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
+ return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
+ }))
+ return indicatePessimisticFixpoint();
+ return Change;
+}
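The update above is an optimistic worklist exploration: only blocks reached from an already-live block become live, and instructions whose successor set relied on assumed (not yet known) information are re-queued via NewToBeExploredFrom for the next round. The toy sketch below reproduces that shape on a hand-built CFG of plain structs; Block and exploreLiveBlocks are made-up names, not LLVM types, and the re-queueing of assumed results is omitted for brevity.

#include <cstdio>
#include <set>
#include <vector>

struct Block {
  int Id;
  std::vector<const Block *> Succs; // successor blocks
};

// Optimistic liveness: only blocks reachable from the entry along edges we
// already consider live are marked live; everything else stays assumed dead.
static std::set<const Block *> exploreLiveBlocks(const Block &Entry) {
  std::set<const Block *> AssumedLive;
  std::vector<const Block *> Worklist{&Entry};
  while (!Worklist.empty()) {
    const Block *B = Worklist.back();
    Worklist.pop_back();
    if (!AssumedLive.insert(B).second)
      continue; // already explored
    for (const Block *Succ : B->Succs)
      Worklist.push_back(Succ);
  }
  return AssumedLive;
}

int main() {
  Block Exit{3, {}};
  Block Then{1, {&Exit}};
  Block Else{2, {&Exit}};  // never reached if the branch condition folded
  Block Entry{0, {&Then}}; // only the taken edge is recorded as live
  std::printf("assumed-live blocks: %zu\n", exploreLiveBlocks(Entry).size());
  // prints 3: Entry, Then and Exit; Else stays assumed dead
}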
+
+/// Liveness information for a call site.
+struct AAIsDeadCallSite final : AAIsDeadFunction {
+ AAIsDeadCallSite(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFunction(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for liveness are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// -------------------- Dereferenceable Argument Attribute --------------------
+
+template <>
+ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
+ const DerefState &R) {
+ ChangeStatus CS0 =
+ clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
+ ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
+ return CS0 | CS1;
+}
+
+struct AADereferenceableImpl : AADereferenceable {
+ AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
+ : AADereferenceable(IRP, A) {}
+ using StateType = DerefState;
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
+ Attrs, /* IgnoreSubsumingPositions */ false, &A);
+ for (const Attribute &Attr : Attrs)
+ takeKnownDerefBytesMaximum(Attr.getValueAsInt());
+
+ const IRPosition &IRP = this->getIRPosition();
+ NonNullAA = &A.getAAFor<AANonNull>(*this, IRP,
+ /* TrackDependence */ false);
+
+ bool CanBeNull;
+ takeKnownDerefBytesMaximum(
+ IRP.getAssociatedValue().getPointerDereferenceableBytes(
+ A.getDataLayout(), CanBeNull));
+
+ bool IsFnInterface = IRP.isFnInterfaceKind();
+ Function *FnScope = IRP.getAnchorScope();
+ if (IsFnInterface && (!FnScope || !A.isFunctionIPOAmendable(*FnScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::getState()
+ /// {
+ StateType &getState() override { return *this; }
+ const StateType &getState() const override { return *this; }
+ /// }
+
+ /// Helper function for collecting accessed bytes in must-be-executed-context
+ void addAccessedBytesForUse(Attributor &A, const Use *U, const Instruction *I,
+ DerefState &State) {
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return;
+
+ Type *PtrTy = UseV->getType();
+ const DataLayout &DL = A.getDataLayout();
+ int64_t Offset;
+ if (const Value *Base = getBasePointerOfAccessPointerOperand(
+ I, Offset, DL, /*AllowNonInbounds*/ true)) {
+ if (Base == &getAssociatedValue() &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ State.addAccessedBytes(Offset, Size);
+ }
+ }
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AADereferenceable::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
+ A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
+ LLVM_DEBUG(dbgs() << "[AADereferenceable] Deref bytes: " << DerefBytes
+ << " for instruction " << *I << "\n");
+
+ addAccessedBytesForUse(A, U, I, State);
+ State.takeKnownDerefBytesMaximum(DerefBytes);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Change = AADereferenceable::manifest(A);
+ if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
+ removeAttrs({Attribute::DereferenceableOrNull});
+ return ChangeStatus::CHANGED;
+ }
+ return Change;
+ }
+
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ // TODO: Add *_globally support
+ if (isAssumedNonNull())
+ Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ else
+ Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ if (!getAssumedDereferenceableBytes())
+ return "unknown-dereferenceable";
+ return std::string("dereferenceable") +
+ (isAssumedNonNull() ? "" : "_or_null") +
+ (isAssumedGlobal() ? "_globally" : "") + "<" +
+ std::to_string(getKnownDereferenceableBytes()) + "-" +
+ std::to_string(getAssumedDereferenceableBytes()) + ">";
+ }
+};
+
+/// Dereferenceable attribute for a floating value.
+struct AADereferenceableFloating : AADereferenceableImpl {
+ AADereferenceableFloating(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
+ bool Stripped) -> bool {
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
+ APInt Offset(IdxWidth, 0);
+ const Value *Base =
+ stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
+
+ const auto &AA =
+ A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
+ int64_t DerefBytes = 0;
+ if (!Stripped && this == &AA) {
+ // Use IR information if we did not strip anything.
+ // TODO: track globally.
+ bool CanBeNull;
+ DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
+ T.GlobalState.indicatePessimisticFixpoint();
+ } else {
const DerefState &DS = AA.getState();
- DerefBytes = DS.DerefBytesState.getAssumed();
- T.GlobalState &= DS.GlobalState;
- }
-
- // For now we do not try to "increase" dereferenceability due to negative
- // indices as we first have to come up with code to deal with loops and
-      // with overflows of the dereferenceable bytes.
- int64_t OffsetSExt = Offset.getSExtValue();
- if (OffsetSExt < 0)
- OffsetSExt = 0;
-
- T.takeAssumedDerefBytesMinimum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
-
- if (this == &AA) {
- if (!Stripped) {
- // If nothing was stripped IR information is all we got.
- T.takeKnownDerefBytesMaximum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
- T.indicatePessimisticFixpoint();
- } else if (OffsetSExt > 0) {
- // If something was stripped but there is circular reasoning we look
- // for the offset. If it is positive we basically decrease the
-          // dereferenceable bytes in a circular loop now, which will simply
- // drive them down to the known value in a very slow way which we
- // can accelerate.
- T.indicatePessimisticFixpoint();
- }
- }
-
- return T.isValidState();
- };
-
- DerefState T;
- if (!genericValueTraversal<AADereferenceable, DerefState>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a return value.
-struct AADereferenceableReturned final
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl> {
- AADereferenceableReturned(const IRPosition &IRP, Attributor &A)
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl>(
- IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for an argument
-struct AADereferenceableArgument final
- : AAArgumentFromCallSiteArguments<AADereferenceable,
- AADereferenceableImpl> {
- using Base =
- AAArgumentFromCallSiteArguments<AADereferenceable, AADereferenceableImpl>;
- AADereferenceableArgument(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a call site argument.
-struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
- AADereferenceableCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AADereferenceableFloating(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute deduction for a call site return value.
-struct AADereferenceableCallSiteReturned final
- : AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl> {
- using Base =
- AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl>;
- AADereferenceableCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(dereferenceable);
- }
-};
-
-// ------------------------ Align Argument Attribute ------------------------
-
-static unsigned getKnownAlignForUse(Attributor &A,
- AbstractAttribute &QueryingAA,
- Value &AssociatedValue, const Use *U,
- const Instruction *I, bool &TrackUse) {
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into.
- if (isa<CastInst>(I)) {
- // Follow all but ptr2int casts.
- TrackUse = !isa<PtrToIntInst>(I);
- return 0;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (GEP->hasAllConstantIndices()) {
- TrackUse = true;
- return 0;
- }
- }
-
- MaybeAlign MA;
- if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (CB->isBundleOperand(U) || CB->isCallee(U))
- return 0;
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
- /* TrackDependence */ false);
- MA = MaybeAlign(AlignAA.getKnownAlign());
- }
-
- const DataLayout &DL = A.getDataLayout();
- const Value *UseV = U->get();
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getPointerOperand() == UseV)
- MA = SI->getAlign();
- } else if (auto *LI = dyn_cast<LoadInst>(I)) {
- if (LI->getPointerOperand() == UseV)
- MA = LI->getAlign();
- }
-
- if (!MA || *MA <= 1)
- return 0;
-
- unsigned Alignment = MA->value();
- int64_t Offset;
-
- if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
- if (Base == &AssociatedValue) {
- // BasePointerAddr + Offset = Alignment * Q for some integer Q.
- // So we can say that the maximum power of two which is a divisor of
- // gcd(Offset, Alignment) is an alignment.
-
- uint32_t gcd =
- greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
- Alignment = llvm::PowerOf2Floor(gcd);
- }
- }
-
- return Alignment;
-}
-
-struct AAAlignImpl : AAAlign {
- AAAlignImpl(const IRPosition &IRP, Attributor &A) : AAAlign(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Alignment}, Attrs);
- for (const Attribute &Attr : Attrs)
- takeKnownMaximum(Attr.getValueAsInt());
-
- Value &V = getAssociatedValue();
-    // TODO: This is a HACK to avoid getPointerAlignment introducing a ptr2int
- // use of the function pointer. This was caused by D73131. We want to
- // avoid this for function pointers especially because we iterate
- // their uses and int2ptr is not handled. It is not a correctness
- // problem though!
- if (!V.getType()->getPointerElementType()->isFunctionTy())
- takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
-
- if (getIRPosition().isFnInterfaceKind() &&
- (!getAnchorScope() ||
- !A.isFunctionIPOAmendable(*getAssociatedFunction()))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- if (Instruction *CtxI = getCtxI())
- followUsesInMBEC(*this, A, getState(), *CtxI);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED;
-
- // Check for users that allow alignment annotations.
- Value &AssociatedValue = getAssociatedValue();
- for (const Use &U : AssociatedValue.uses()) {
- if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
- if (SI->getPointerOperand() == &AssociatedValue)
- if (SI->getAlignment() < getAssumedAlign()) {
- STATS_DECLTRACK(AAAlign, Store,
- "Number of times alignment added to a store");
- SI->setAlignment(Align(getAssumedAlign()));
- LoadStoreChanged = ChangeStatus::CHANGED;
- }
- } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
- if (LI->getPointerOperand() == &AssociatedValue)
- if (LI->getAlignment() < getAssumedAlign()) {
- LI->setAlignment(Align(getAssumedAlign()));
- STATS_DECLTRACK(AAAlign, Load,
- "Number of times alignment added to a load");
- LoadStoreChanged = ChangeStatus::CHANGED;
- }
- }
- }
-
- ChangeStatus Changed = AAAlign::manifest(A);
-
- Align InheritAlign =
- getAssociatedValue().getPointerAlignment(A.getDataLayout());
- if (InheritAlign >= getAssumedAlign())
- return LoadStoreChanged;
- return Changed | LoadStoreChanged;
- }
-
-  // TODO: Provide a helper to determine the implied ABI alignment and check
-  // that value in the existing manifest method and in a new one for
-  // AAAlignImpl to avoid making the alignment explicit if it did not improve.
-
- /// See AbstractAttribute::getDeducedAttributes
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (getAssumedAlign() > 1)
- Attrs.emplace_back(
- Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
- }
-
- /// See followUsesInMBEC
- bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
- AAAlign::StateType &State) {
- bool TrackUse = false;
-
- unsigned int KnownAlign =
- getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
- State.takeKnownMaximum(KnownAlign);
-
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
- "-" + std::to_string(getAssumedAlign()) + ">")
- : "unknown-align";
- }
-};
-
-/// Align attribute for a floating value.
-struct AAAlignFloating : AAAlignImpl {
- AAAlignFloating(const IRPosition &IRP, Attributor &A) : AAAlignImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](Value &V, const Instruction *,
- AAAlign::StateType &T, bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
+ DerefBytes = DS.DerefBytesState.getAssumed();
+ T.GlobalState &= DS.GlobalState;
+ }
+
+ // For now we do not try to "increase" dereferenceability due to negative
+ // indices as we first have to come up with code to deal with loops and
+      // with overflows of the dereferenceable bytes.
+ int64_t OffsetSExt = Offset.getSExtValue();
+ if (OffsetSExt < 0)
+ OffsetSExt = 0;
+
+ T.takeAssumedDerefBytesMinimum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+
+ if (this == &AA) {
+ if (!Stripped) {
+ // If nothing was stripped IR information is all we got.
+ T.takeKnownDerefBytesMaximum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+ T.indicatePessimisticFixpoint();
+ } else if (OffsetSExt > 0) {
+ // If something was stripped but there is circular reasoning we look
+ // for the offset. If it is positive we basically decrease the
+          // dereferenceable bytes in a circular loop now, which will simply
+ // drive them down to the known value in a very slow way which we
+ // can accelerate.
+ T.indicatePessimisticFixpoint();
+ }
+ }
+
+ return T.isValidState();
+ };
+
+ DerefState T;
+ if (!genericValueTraversal<AADereferenceable, DerefState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
+ }
+};
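The arithmetic in the visitor above reduces the dereferenceable bytes known for a base pointer by the non-negative offset that was stripped, clamping at zero and deliberately not growing dereferenceability for negative offsets. A worked version of just that clamping follows, with hypothetical names: 64 dereferenceable bytes at the base yield 48 at offset 16, while a negative offset leaves the 64 unchanged.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Dereferenceable bytes at (Base + Offset), given the bytes known for Base.
// Negative offsets are clamped to zero, mirroring the conservative handling
// above: we never "increase" dereferenceability through negative indices.
static int64_t derefBytesAtOffset(int64_t BaseDerefBytes, int64_t Offset) {
  int64_t Off = std::max<int64_t>(0, Offset);
  return std::max<int64_t>(0, BaseDerefBytes - Off);
}

int main() {
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, 16));  // 48
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, -8));  // 64 (clamped)
  std::printf("%lld\n", (long long)derefBytesAtOffset(64, 100)); // 0
}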
+
+/// Dereferenceable attribute for a return value.
+struct AADereferenceableReturned final
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl> {
+ AADereferenceableReturned(const IRPosition &IRP, Attributor &A)
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl>(
+ IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for an argument
+struct AADereferenceableArgument final
+ : AAArgumentFromCallSiteArguments<AADereferenceable,
+ AADereferenceableImpl> {
+ using Base =
+ AAArgumentFromCallSiteArguments<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for a call site argument.
+struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
+ AADereferenceableCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute deduction for a call site return value.
+struct AADereferenceableCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl> {
+ using Base =
+ AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(dereferenceable);
+ }
+};
+
+// ------------------------ Align Argument Attribute ------------------------
+
+static unsigned getKnownAlignForUse(Attributor &A,
+ AbstractAttribute &QueryingAA,
+ Value &AssociatedValue, const Use *U,
+ const Instruction *I, bool &TrackUse) {
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into.
+ if (isa<CastInst>(I)) {
+ // Follow all but ptr2int casts.
+ TrackUse = !isa<PtrToIntInst>(I);
+ return 0;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEP->hasAllConstantIndices()) {
+ TrackUse = true;
+ return 0;
+ }
+ }
+
+ MaybeAlign MA;
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U) || CB->isCallee(U))
+ return 0;
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ MA = MaybeAlign(AlignAA.getKnownAlign());
+ }
+
+ const DataLayout &DL = A.getDataLayout();
+ const Value *UseV = U->get();
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getPointerOperand() == UseV)
+ MA = SI->getAlign();
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getPointerOperand() == UseV)
+ MA = LI->getAlign();
+ }
+
+ if (!MA || *MA <= 1)
+ return 0;
+
+ unsigned Alignment = MA->value();
+ int64_t Offset;
+
+ if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
+ if (Base == &AssociatedValue) {
+ // BasePointerAddr + Offset = Alignment * Q for some integer Q.
+ // So we can say that the maximum power of two which is a divisor of
+ // gcd(Offset, Alignment) is an alignment.
+
+ uint32_t gcd =
+ greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
+ Alignment = llvm::PowerOf2Floor(gcd);
+ }
+ }
+
+ return Alignment;
+}
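The gcd step above encodes a small number-theoretic fact: if Base + Offset is accessed with a power-of-two alignment A, then Base + Offset = A * Q for some integer Q, so A divides Base + Offset and gcd(|Offset|, A) divides Base; that gcd is itself a power of two. The standalone sketch below (baseAlignmentFromAccess is an invented name) shows the computation: an 8-aligned access at offset 12 proves 4-byte alignment of the base, while offset 16 preserves the full 8.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <numeric>

// If (Base + Offset) is Alignment-aligned and Alignment is a power of two,
// then Base is aligned to gcd(|Offset|, Alignment), itself a power of two.
static uint64_t baseAlignmentFromAccess(int64_t Offset, uint64_t Alignment) {
  if (Offset == 0)
    return Alignment;
  return std::gcd(static_cast<uint64_t>(std::llabs(Offset)), Alignment);
}

int main() {
  // 8-aligned access of (Base + 12) => Base is at least 4-byte aligned.
  std::printf("%llu\n", (unsigned long long)baseAlignmentFromAccess(12, 8));
  // 8-aligned access of (Base + 16) => the full 8-byte alignment survives.
  std::printf("%llu\n", (unsigned long long)baseAlignmentFromAccess(16, 8));
}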
+
+struct AAAlignImpl : AAAlign {
+ AAAlignImpl(const IRPosition &IRP, Attributor &A) : AAAlign(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Alignment}, Attrs);
+ for (const Attribute &Attr : Attrs)
+ takeKnownMaximum(Attr.getValueAsInt());
+
+ Value &V = getAssociatedValue();
+    // TODO: This is a HACK to avoid getPointerAlignment introducing a ptr2int
+ // use of the function pointer. This was caused by D73131. We want to
+ // avoid this for function pointers especially because we iterate
+ // their uses and int2ptr is not handled. It is not a correctness
+ // problem though!
+ if (!V.getType()->getPointerElementType()->isFunctionTy())
+ takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
+
+ if (getIRPosition().isFnInterfaceKind() &&
+ (!getAnchorScope() ||
+ !A.isFunctionIPOAmendable(*getAssociatedFunction()))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED;
+
+ // Check for users that allow alignment annotations.
+ Value &AssociatedValue = getAssociatedValue();
+ for (const Use &U : AssociatedValue.uses()) {
+ if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
+ if (SI->getPointerOperand() == &AssociatedValue)
+ if (SI->getAlignment() < getAssumedAlign()) {
+ STATS_DECLTRACK(AAAlign, Store,
+ "Number of times alignment added to a store");
+ SI->setAlignment(Align(getAssumedAlign()));
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
+ if (LI->getPointerOperand() == &AssociatedValue)
+ if (LI->getAlignment() < getAssumedAlign()) {
+ LI->setAlignment(Align(getAssumedAlign()));
+ STATS_DECLTRACK(AAAlign, Load,
+ "Number of times alignment added to a load");
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ }
+ }
+
+ ChangeStatus Changed = AAAlign::manifest(A);
+
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ return LoadStoreChanged;
+ return Changed | LoadStoreChanged;
+ }
+
+  // TODO: Provide a helper to determine the implied ABI alignment and check
+  // that value in the existing manifest method and in a new one for
+  // AAAlignImpl to avoid making the alignment explicit if it did not improve.
+
+ /// See AbstractAttribute::getDeducedAttributes
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (getAssumedAlign() > 1)
+ Attrs.emplace_back(
+ Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AAAlign::StateType &State) {
+ bool TrackUse = false;
+
+ unsigned int KnownAlign =
+ getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
+ State.takeKnownMaximum(KnownAlign);
+
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
+ "-" + std::to_string(getAssumedAlign()) + ">")
+ : "unknown-align";
+ }
+};
+
+/// Align attribute for a floating value.
+struct AAAlignFloating : AAAlignImpl {
+ AAAlignFloating(const IRPosition &IRP, Attributor &A) : AAAlignImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAAlign::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
int64_t Offset;
unsigned Alignment = 1;
if (const Value *Base =
@@ -3867,37 +3867,37 @@ struct AAAlignFloating : AAAlignImpl {
} else {
Alignment = V.getPointerAlignment(DL).value();
}
- // Use only IR information if we did not strip anything.
+ // Use only IR information if we did not strip anything.
T.takeKnownMaximum(Alignment);
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
const AAAlign::StateType &DS = AA.getState();
- T ^= DS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
- VisitValueCB, getCtxI()))
- return indicatePessimisticFixpoint();
-
-    // TODO: If we know we visited all incoming values, thus none are assumed
-    // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
-};
-
-/// Align attribute for function return value.
-struct AAAlignReturned final
- : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
+ T ^= DS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
+ VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+    // TODO: If we know we visited all incoming values, thus none are assumed
+    // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
+};
+
+/// Align attribute for function return value.
+struct AAAlignReturned final
+ : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
using Base = AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>;
AAAlignReturned(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
Base::initialize(A);
@@ -3906,126 +3906,126 @@ struct AAAlignReturned final
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
-};
-
-/// Align attribute for function argument.
-struct AAAlignArgument final
- : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> {
- using Base = AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>;
- AAAlignArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // If the associated argument is involved in a must-tail call we give up
- // because we would need to keep the argument alignments of caller and
- // callee in-sync. Just does not seem worth the trouble right now.
- if (A.getInfoCache().isInvolvedInMustTailCall(*getAssociatedArgument()))
- return ChangeStatus::UNCHANGED;
- return Base::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
-};
-
-struct AAAlignCallSiteArgument final : AAAlignFloating {
- AAAlignCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAAlignFloating(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // If the associated argument is involved in a must-tail call we give up
- // because we would need to keep the argument alignments of caller and
- // callee in-sync. Just does not seem worth the trouble right now.
- if (Argument *Arg = getAssociatedArgument())
- if (A.getInfoCache().isInvolvedInMustTailCall(*Arg))
- return ChangeStatus::UNCHANGED;
- ChangeStatus Changed = AAAlignImpl::manifest(A);
- Align InheritAlign =
- getAssociatedValue().getPointerAlignment(A.getDataLayout());
- if (InheritAlign >= getAssumedAlign())
- Changed = ChangeStatus::UNCHANGED;
- return Changed;
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = AAAlignFloating::updateImpl(A);
- if (Argument *Arg = getAssociatedArgument()) {
- // We only take known information from the argument
- // so we do not need to track a dependence.
- const auto &ArgAlignAA = A.getAAFor<AAAlign>(
- *this, IRPosition::argument(*Arg), /* TrackDependence */ false);
- takeKnownMaximum(ArgAlignAA.getKnownAlign());
- }
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
-};
-
-/// Align attribute deduction for a call site return value.
-struct AAAlignCallSiteReturned final
- : AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl> {
- using Base = AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl>;
- AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Base::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
+};
+
+/// Align attribute for function argument.
+struct AAAlignArgument final
+ : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>;
+ AAAlignArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // If the associated argument is involved in a must-tail call we give up
+ // because we would need to keep the argument alignments of caller and
+ // callee in-sync. Just does not seem worth the trouble right now.
+ if (A.getInfoCache().isInvolvedInMustTailCall(*getAssociatedArgument()))
+ return ChangeStatus::UNCHANGED;
+ return Base::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
+};
+
+struct AAAlignCallSiteArgument final : AAAlignFloating {
+ AAAlignCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAAlignFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // If the associated argument is involved in a must-tail call we give up
+ // because we would need to keep the argument alignments of caller and
+ // callee in-sync. Just does not seem worth the trouble right now.
+ if (Argument *Arg = getAssociatedArgument())
+ if (A.getInfoCache().isInvolvedInMustTailCall(*Arg))
+ return ChangeStatus::UNCHANGED;
+ ChangeStatus Changed = AAAlignImpl::manifest(A);
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ Changed = ChangeStatus::UNCHANGED;
+ return Changed;
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = AAAlignFloating::updateImpl(A);
+ if (Argument *Arg = getAssociatedArgument()) {
+ // We only take known information from the argument
+ // so we do not need to track a dependence.
+ const auto &ArgAlignAA = A.getAAFor<AAAlign>(
+ *this, IRPosition::argument(*Arg), /* TrackDependence */ false);
+ takeKnownMaximum(ArgAlignAA.getKnownAlign());
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
+};
+
+/// Align attribute deduction for a call site return value.
+struct AAAlignCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl> {
+ using Base = AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl>;
+ AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
-};
-
-/// ------------------ Function No-Return Attribute ----------------------------
-struct AANoReturnImpl : public AANoReturn {
- AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoReturn::initialize(A);
- Function *F = getAssociatedFunction();
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
+};
+
+/// ------------------ Function No-Return Attribute ----------------------------
+struct AANoReturnImpl : public AANoReturn {
+ AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoReturn::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "noreturn" : "may-return";
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoReturn = [](Instruction &) { return false; };
- if (!A.checkForAllInstructions(CheckForNoReturn, *this,
- {(unsigned)Instruction::Ret}))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoReturnFunction final : AANoReturnImpl {
- AANoReturnFunction(const IRPosition &IRP, Attributor &A)
- : AANoReturnImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
-};
-
-/// NoReturn attribute deduction for call sites.
-struct AANoReturnCallSite final : AANoReturnImpl {
- AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
- : AANoReturnImpl(IRP, A) {}
-
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noreturn" : "may-return";
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoReturn = [](Instruction &) { return false; };
+ if (!A.checkForAllInstructions(CheckForNoReturn, *this,
+ {(unsigned)Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+};
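The noreturn deduction above is compact enough to be easy to misread: the callback handed to checkForAllInstructions rejects every return instruction, so the check only succeeds when no live return exists. A minimal stand-alone sketch of that rule, using illustrative names rather than the real Attributor API:

#include <vector>

// Illustrative stand-in (not the real checkForAllInstructions interface):
// a function may be assumed noreturn exactly when none of its live
// instructions is a return.
bool mayAssumeNoReturn(const std::vector<bool> &LiveInstIsReturn) {
  for (bool IsRet : LiveInstIsReturn)
    if (IsRet)
      return false; // a reachable return refutes noreturn
  return true;
}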
+
+struct AANoReturnFunction final : AANoReturnImpl {
+ AANoReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
+};
+
+/// NoReturn attribute deduction for call sites.
+struct AANoReturnCallSite final : AANoReturnImpl {
+ AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AANoReturnImpl::initialize(A);
@@ -4037,542 +4037,542 @@ struct AANoReturnCallSite final : AANoReturnImpl {
}
}
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
-};
-
-/// ----------------------- Variable Capturing ---------------------------------
-
-/// A class to hold the state for no-capture attributes.
-struct AANoCaptureImpl : public AANoCapture {
- AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
- indicateOptimisticFixpoint();
- return;
- }
- Function *AnchorScope = getAnchorScope();
- if (isFnInterfaceKind() &&
- (!AnchorScope || !A.isFunctionIPOAmendable(*AnchorScope))) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // You cannot "capture" null in the default address space.
- if (isa<ConstantPointerNull>(getAssociatedValue()) &&
- getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
- indicateOptimisticFixpoint();
- return;
- }
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
+};
+
+/// ----------------------- Variable Capturing ---------------------------------
+
+/// A class to hold the state for no-capture attributes.
+struct AANoCaptureImpl : public AANoCapture {
+ AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+ Function *AnchorScope = getAnchorScope();
+ if (isFnInterfaceKind() &&
+ (!AnchorScope || !A.isFunctionIPOAmendable(*AnchorScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // You cannot "capture" null in the default address space.
+ if (isa<ConstantPointerNull>(getAssociatedValue()) &&
+ getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+
const Function *F =
isArgumentPosition() ? getAssociatedFunction() : AnchorScope;
-
- // Check what state the associated function can actually capture.
- if (F)
- determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
- else
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...).
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (!isAssumedNoCaptureMaybeReturned())
- return;
-
+
+ // Check what state the associated function can actually capture.
+ if (F)
+ determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
+ else
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// see AbstractAttribute::isAssumedNoCaptureMaybeReturned(...).
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (!isAssumedNoCaptureMaybeReturned())
+ return;
+
if (isArgumentPosition()) {
- if (isAssumedNoCapture())
- Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
- else if (ManifestInternal)
- Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
- }
- }
-
- /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
- /// depending on the ability of the function associated with \p IRP to capture
- /// state in memory and through "returning/throwing", respectively.
- static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
- const Function &F,
- BitIntegerState &State) {
- // TODO: Once we have memory behavior attributes we should use them here.
-
- // If we know we cannot communicate or write to memory, we do not care about
- // ptr2int anymore.
- if (F.onlyReadsMemory() && F.doesNotThrow() &&
- F.getReturnType()->isVoidTy()) {
- State.addKnownBits(NO_CAPTURE);
- return;
- }
-
- // A function cannot capture state in memory if it only reads memory, it can
- // however return/throw state and the state might be influenced by the
- // pointer value, e.g., loading from a returned pointer might reveal a bit.
- if (F.onlyReadsMemory())
- State.addKnownBits(NOT_CAPTURED_IN_MEM);
-
- // A function cannot communicate state back if it does not throw
- // exceptions and does not return values.
- if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
- State.addKnownBits(NOT_CAPTURED_IN_RET);
-
- // Check existing "returned" attributes.
+ if (isAssumedNoCapture())
+ Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
+ else if (ManifestInternal)
+ Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
+ }
+ }
+
+ /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
+ /// depending on the ability of the function associated with \p IRP to capture
+ /// state in memory and through "returning/throwing", respectively.
+ static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
+ const Function &F,
+ BitIntegerState &State) {
+ // TODO: Once we have memory behavior attributes we should use them here.
+
+ // If we know we cannot communicate or write to memory, we do not care about
+ // ptr2int anymore.
+ if (F.onlyReadsMemory() && F.doesNotThrow() &&
+ F.getReturnType()->isVoidTy()) {
+ State.addKnownBits(NO_CAPTURE);
+ return;
+ }
+
+ // A function cannot capture state in memory if it only reads memory, it can
+ // however return/throw state and the state might be influenced by the
+ // pointer value, e.g., loading from a returned pointer might reveal a bit.
+ if (F.onlyReadsMemory())
+ State.addKnownBits(NOT_CAPTURED_IN_MEM);
+
+ // A function cannot communicate state back if it does not throw
+ // exceptions and does not return values.
+ if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+
+ // Check existing "returned" attributes.
int ArgNo = IRP.getCalleeArgNo();
- if (F.doesNotThrow() && ArgNo >= 0) {
- for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
- if (F.hasParamAttribute(u, Attribute::Returned)) {
- if (u == unsigned(ArgNo))
- State.removeAssumedBits(NOT_CAPTURED_IN_RET);
- else if (F.onlyReadsMemory())
- State.addKnownBits(NO_CAPTURE);
- else
- State.addKnownBits(NOT_CAPTURED_IN_RET);
- break;
- }
- }
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isKnownNoCapture())
- return "known not-captured";
- if (isAssumedNoCapture())
- return "assumed not-captured";
- if (isKnownNoCaptureMaybeReturned())
- return "known not-captured-maybe-returned";
- if (isAssumedNoCaptureMaybeReturned())
- return "assumed not-captured-maybe-returned";
- return "assumed-captured";
- }
-};
-
-/// Attributor-aware capture tracker.
-struct AACaptureUseTracker final : public CaptureTracker {
-
- /// Create a capture tracker that can look up in-flight abstract attributes
- /// through the Attributor \p A.
- ///
- /// If a use leads to a potential capture, \p CapturedInMemory is set and the
- /// search is stopped. If a use leads to a return instruction,
- /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
- /// If a use leads to a ptr2int which may capture the value,
- /// \p CapturedInInteger is set. If a use is found that is currently assumed
- /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
- /// set. All values in \p PotentialCopies are later tracked as well. For every
- /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
- /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
- /// conservatively set to true.
- AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
- const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
- SmallVectorImpl<const Value *> &PotentialCopies,
- unsigned &RemainingUsesToExplore)
- : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
- PotentialCopies(PotentialCopies),
- RemainingUsesToExplore(RemainingUsesToExplore) {}
-
- /// Determine if \p V may be captured. *Also updates the state!*
- bool valueMayBeCaptured(const Value *V) {
- if (V->getType()->isPointerTy()) {
- PointerMayBeCaptured(V, this);
- } else {
- State.indicatePessimisticFixpoint();
- }
- return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
- /// See CaptureTracker::tooManyUses().
- void tooManyUses() override {
- State.removeAssumedBits(AANoCapture::NO_CAPTURE);
- }
-
- bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
- if (CaptureTracker::isDereferenceableOrNull(O, DL))
- return true;
- const auto &DerefAA = A.getAAFor<AADereferenceable>(
- NoCaptureAA, IRPosition::value(*O), /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- return DerefAA.getAssumedDereferenceableBytes();
- }
-
- /// See CaptureTracker::captured(...).
- bool captured(const Use *U) override {
- Instruction *UInst = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
- << "\n");
-
- // Because we may reuse the tracker multiple times we keep track of the
- // number of explored uses ourselves as well.
- if (RemainingUsesToExplore-- == 0) {
- LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- // Deal with ptr2int by following uses.
- if (isa<PtrToIntInst>(UInst)) {
- LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
- return valueMayBeCaptured(UInst);
- }
-
- // Explicitly catch return instructions.
- if (isa<ReturnInst>(UInst))
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ true);
-
- // For now we only use special logic for call sites. However, the tracker
- // itself knows about a lot of other non-capturing cases already.
- auto *CB = dyn_cast<CallBase>(UInst);
- if (!CB || !CB->isArgOperand(U))
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
-
- unsigned ArgNo = CB->getArgOperandNo(U);
- const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo);
- // If we have an abstract no-capture attribute for the argument we can use
- // it to justify a non-capture attribute here. This allows recursion!
- auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
- if (ArgNoCaptureAA.isAssumedNoCapture())
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- addPotentialCopy(*CB);
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- }
-
- // Lastly, we could not find a reason no-capture can be assumed so we don't.
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- /// Register \p CB as a potential copy of the value we are checking.
- void addPotentialCopy(CallBase &CB) { PotentialCopies.push_back(&CB); }
-
- /// See CaptureTracker::shouldExplore(...).
- bool shouldExplore(const Use *U) override {
- // Check liveness and ignore droppable users.
- return !U->getUser()->isDroppable() &&
- !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA);
- }
-
- /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
- /// \p CapturedInRet, then return the appropriate value for use in the
- /// CaptureTracker::captured() interface.
- bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
- bool CapturedInRet) {
- LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
- << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
- if (CapturedInMem)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
- if (CapturedInInt)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
- if (CapturedInRet)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
- return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
-private:
- /// The attributor providing in-flight abstract attributes.
- Attributor &A;
-
- /// The abstract attribute currently updated.
- AANoCapture &NoCaptureAA;
-
- /// The abstract liveness state.
- const AAIsDead &IsDeadAA;
-
- /// The state currently updated.
- AANoCapture::StateType &State;
-
- /// Set of potential copies of the tracked value.
- SmallVectorImpl<const Value *> &PotentialCopies;
-
- /// Global counter to limit the number of explored uses.
- unsigned &RemainingUsesToExplore;
-};
-
-ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
- const IRPosition &IRP = getIRPosition();
+ if (F.doesNotThrow() && ArgNo >= 0) {
+ for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
+ if (F.hasParamAttribute(u, Attribute::Returned)) {
+ if (u == unsigned(ArgNo))
+ State.removeAssumedBits(NOT_CAPTURED_IN_RET);
+ else if (F.onlyReadsMemory())
+ State.addKnownBits(NO_CAPTURE);
+ else
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+ break;
+ }
+ }
+ }
+
+ /// See AbstractState::getAsStr().
+ const std::string getAsStr() const override {
+ if (isKnownNoCapture())
+ return "known not-captured";
+ if (isAssumedNoCapture())
+ return "assumed not-captured";
+ if (isKnownNoCaptureMaybeReturned())
+ return "known not-captured-maybe-returned";
+ if (isAssumedNoCaptureMaybeReturned())
+ return "assumed not-captured-maybe-returned";
+ return "assumed-captured";
+ }
+};
+
+/// Attributor-aware capture tracker.
+struct AACaptureUseTracker final : public CaptureTracker {
+
+ /// Create a capture tracker that can look up in-flight abstract attributes
+ /// through the Attributor \p A.
+ ///
+ /// If a use leads to a potential capture, \p CapturedInMemory is set and the
+ /// search is stopped. If a use leads to a return instruction,
+ /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
+ /// If a use leads to a ptr2int which may capture the value,
+ /// \p CapturedInInteger is set. If a use is found that is currently assumed
+ /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
+ /// set. All values in \p PotentialCopies are later tracked as well. For every
+ /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
+ /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
+ /// conservatively set to true.
+ AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
+ const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
+ SmallVectorImpl<const Value *> &PotentialCopies,
+ unsigned &RemainingUsesToExplore)
+ : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
+ PotentialCopies(PotentialCopies),
+ RemainingUsesToExplore(RemainingUsesToExplore) {}
+
+ /// Determine if \p V may be captured. *Also updates the state!*
+ bool valueMayBeCaptured(const Value *V) {
+ if (V->getType()->isPointerTy()) {
+ PointerMayBeCaptured(V, this);
+ } else {
+ State.indicatePessimisticFixpoint();
+ }
+ return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+ /// See CaptureTracker::tooManyUses().
+ void tooManyUses() override {
+ State.removeAssumedBits(AANoCapture::NO_CAPTURE);
+ }
+
+ bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
+ if (CaptureTracker::isDereferenceableOrNull(O, DL))
+ return true;
+ const auto &DerefAA = A.getAAFor<AADereferenceable>(
+ NoCaptureAA, IRPosition::value(*O), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ return DerefAA.getAssumedDereferenceableBytes();
+ }
+
+ /// See CaptureTracker::captured(...).
+ bool captured(const Use *U) override {
+ Instruction *UInst = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
+ << "\n");
+
+ // Because we may reuse the tracker multiple times we keep track of the
+ // number of explored uses ourselves as well.
+ if (RemainingUsesToExplore-- == 0) {
+ LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+ // Deal with ptr2int by following uses.
+ if (isa<PtrToIntInst>(UInst)) {
+ LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
+ return valueMayBeCaptured(UInst);
+ }
+
+ // Explicitly catch return instructions.
+ if (isa<ReturnInst>(UInst))
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ true);
+
+ // For now we only use special logic for call sites. However, the tracker
+ // itself knows about a lot of other non-capturing cases already.
+ auto *CB = dyn_cast<CallBase>(UInst);
+ if (!CB || !CB->isArgOperand(U))
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo);
+ // If we have an abstract no-capture attribute for the argument we can use
+ // it to justify a non-capture attribute here. This allows recursion!
+ auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
+ if (ArgNoCaptureAA.isAssumedNoCapture())
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ addPotentialCopy(*CB);
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ }
+
+ // Lastly, we could not find a reason no-capture can be assumed so we don't.
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+ /// Register \p CB as a potential copy of the value we are checking.
+ void addPotentialCopy(CallBase &CB) { PotentialCopies.push_back(&CB); }
+
+ /// See CaptureTracker::shouldExplore(...).
+ bool shouldExplore(const Use *U) override {
+ // Check liveness and ignore droppable users.
+ return !U->getUser()->isDroppable() &&
+ !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA);
+ }
+
+ /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
+ /// \p CapturedInRet, then return the appropriate value for use in the
+ /// CaptureTracker::captured() interface.
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
+ bool CapturedInRet) {
+ LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
+ << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
+ if (CapturedInMem)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
+ if (CapturedInInt)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
+ if (CapturedInRet)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
+ return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+private:
+ /// The attributor providing in-flight abstract attributes.
+ Attributor &A;
+
+ /// The abstract attribute currently updated.
+ AANoCapture &NoCaptureAA;
+
+ /// The abstract liveness state.
+ const AAIsDead &IsDeadAA;
+
+ /// The state currently updated.
+ AANoCapture::StateType &State;
+
+ /// Set of potential copies of the tracked value.
+ SmallVectorImpl<const Value *> &PotentialCopies;
+
+ /// Global counter to limit the number of explored uses.
+ unsigned &RemainingUsesToExplore;
+};
+
+ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
+ const IRPosition &IRP = getIRPosition();
const Value *V = isArgumentPosition() ? IRP.getAssociatedArgument()
: &IRP.getAssociatedValue();
- if (!V)
- return indicatePessimisticFixpoint();
-
- const Function *F =
+ if (!V)
+ return indicatePessimisticFixpoint();
+
+ const Function *F =
isArgumentPosition() ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
- assert(F && "Expected a function!");
- const IRPosition &FnPos = IRPosition::function(*F);
- const auto &IsDeadAA =
- A.getAAFor<AAIsDead>(*this, FnPos, /* TrackDependence */ false);
-
- AANoCapture::StateType T;
-
- // Readonly means we cannot capture through memory.
- const auto &FnMemAA =
- A.getAAFor<AAMemoryBehavior>(*this, FnPos, /* TrackDependence */ false);
- if (FnMemAA.isAssumedReadOnly()) {
- T.addKnownBits(NOT_CAPTURED_IN_MEM);
- if (FnMemAA.isKnownReadOnly())
- addKnownBits(NOT_CAPTURED_IN_MEM);
- else
- A.recordDependence(FnMemAA, *this, DepClassTy::OPTIONAL);
- }
-
- // Make sure all returned values are different than the underlying value.
- // TODO: we could do this in a more sophisticated way inside
- // AAReturnedValues, e.g., track all values that escape through returns
- // directly somehow.
- auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
- bool SeenConstant = false;
- for (auto &It : RVAA.returned_values()) {
- if (isa<Constant>(It.first)) {
- if (SeenConstant)
- return false;
- SeenConstant = true;
- } else if (!isa<Argument>(It.first) ||
- It.first == getAssociatedArgument())
- return false;
- }
- return true;
- };
-
- const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(
- *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (NoUnwindAA.isAssumedNoUnwind()) {
- bool IsVoidTy = F->getReturnType()->isVoidTy();
- const AAReturnedValues *RVAA =
- IsVoidTy ? nullptr
- : &A.getAAFor<AAReturnedValues>(*this, FnPos,
- /* TrackDependence */ true,
- DepClassTy::OPTIONAL);
- if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
- T.addKnownBits(NOT_CAPTURED_IN_RET);
- if (T.isKnown(NOT_CAPTURED_IN_MEM))
- return ChangeStatus::UNCHANGED;
- if (NoUnwindAA.isKnownNoUnwind() &&
- (IsVoidTy || RVAA->getState().isAtFixpoint())) {
- addKnownBits(NOT_CAPTURED_IN_RET);
- if (isKnown(NOT_CAPTURED_IN_MEM))
- return indicateOptimisticFixpoint();
- }
- }
- }
-
- // Use the CaptureTracker interface and logic with the specialized tracker,
- // defined in AACaptureUseTracker, that can look at in-flight abstract
- // attributes and directly updates the assumed state.
- SmallVector<const Value *, 4> PotentialCopies;
- unsigned RemainingUsesToExplore =
- getDefaultMaxUsesToExploreForCaptureTracking();
- AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
- RemainingUsesToExplore);
-
- // Check all potential copies of the associated value until we can assume
- // none will be captured or we have to assume at least one might be.
- unsigned Idx = 0;
- PotentialCopies.push_back(V);
- while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
- Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
-
- AANoCapture::StateType &S = getState();
- auto Assumed = S.getAssumed();
- S.intersectAssumedBits(T.getAssumed());
- if (!isAssumedNoCaptureMaybeReturned())
- return indicatePessimisticFixpoint();
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
-}
-
-/// NoCapture attribute for function arguments.
-struct AANoCaptureArgument final : AANoCaptureImpl {
- AANoCaptureArgument(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
-};
-
-/// NoCapture attribute for call site arguments.
-struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
- AANoCaptureCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (Argument *Arg = getAssociatedArgument())
- if (Arg->hasByValAttr())
- indicateOptimisticFixpoint();
- AANoCaptureImpl::initialize(A);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
+ assert(F && "Expected a function!");
+ const IRPosition &FnPos = IRPosition::function(*F);
+ const auto &IsDeadAA =
+ A.getAAFor<AAIsDead>(*this, FnPos, /* TrackDependence */ false);
+
+ AANoCapture::StateType T;
+
+ // Readonly means we cannot capture through memory.
+ const auto &FnMemAA =
+ A.getAAFor<AAMemoryBehavior>(*this, FnPos, /* TrackDependence */ false);
+ if (FnMemAA.isAssumedReadOnly()) {
+ T.addKnownBits(NOT_CAPTURED_IN_MEM);
+ if (FnMemAA.isKnownReadOnly())
+ addKnownBits(NOT_CAPTURED_IN_MEM);
+ else
+ A.recordDependence(FnMemAA, *this, DepClassTy::OPTIONAL);
+ }
+
+ // Make sure all returned values are different than the underlying value.
+ // TODO: we could do this in a more sophisticated way inside
+ // AAReturnedValues, e.g., track all values that escape through returns
+ // directly somehow.
+ auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
+ bool SeenConstant = false;
+ for (auto &It : RVAA.returned_values()) {
+ if (isa<Constant>(It.first)) {
+ if (SeenConstant)
+ return false;
+ SeenConstant = true;
+ } else if (!isa<Argument>(It.first) ||
+ It.first == getAssociatedArgument())
+ return false;
+ }
+ return true;
+ };
+
+ const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoUnwindAA.isAssumedNoUnwind()) {
+ bool IsVoidTy = F->getReturnType()->isVoidTy();
+ const AAReturnedValues *RVAA =
+ IsVoidTy ? nullptr
+ : &A.getAAFor<AAReturnedValues>(*this, FnPos,
+ /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
+ T.addKnownBits(NOT_CAPTURED_IN_RET);
+ if (T.isKnown(NOT_CAPTURED_IN_MEM))
+ return ChangeStatus::UNCHANGED;
+ if (NoUnwindAA.isKnownNoUnwind() &&
+ (IsVoidTy || RVAA->getState().isAtFixpoint())) {
+ addKnownBits(NOT_CAPTURED_IN_RET);
+ if (isKnown(NOT_CAPTURED_IN_MEM))
+ return indicateOptimisticFixpoint();
+ }
+ }
+ }
+
+ // Use the CaptureTracker interface and logic with the specialized tracker,
+ // defined in AACaptureUseTracker, that can look at in-flight abstract
+ // attributes and directly updates the assumed state.
+ SmallVector<const Value *, 4> PotentialCopies;
+ unsigned RemainingUsesToExplore =
+ getDefaultMaxUsesToExploreForCaptureTracking();
+ AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
+ RemainingUsesToExplore);
+
+ // Check all potential copies of the associated value until we can assume
+ // none will be captured or we have to assume at least one might be.
+ unsigned Idx = 0;
+ PotentialCopies.push_back(V);
+ while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
+ Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
+
+ AANoCapture::StateType &S = getState();
+ auto Assumed = S.getAssumed();
+ S.intersectAssumedBits(T.getAssumed());
+ if (!isAssumedNoCaptureMaybeReturned())
+ return indicatePessimisticFixpoint();
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
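updateImpl above drives the tracker with a plain worklist: the associated value seeds PotentialCopies, and every value that escapes only into a no-capture-maybe-returned call-site argument is appended and inspected in turn, until the list is exhausted or the assumed state drops below no-capture-maybe-returned. A sketch of that loop shape, with illustrative names in place of the Attributor types:

#include <cstddef>
#include <vector>

// Illustrative worklist (not the real AACaptureUseTracker types): inspect the
// root value and every registered potential copy until a capture is found.
template <typename ValueT, typename InspectFn>
bool anyCopyCaptured(ValueT Root, InspectFn MayBeCaptured) {
  std::vector<ValueT> Worklist{Root};
  for (std::size_t Idx = 0; Idx < Worklist.size(); ++Idx)
    if (MayBeCaptured(Worklist[Idx], Worklist)) // may append further copies
      return true;                              // stop at the first capture
  return false;                                 // no copy was captured
}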
+
+/// NoCapture attribute for function arguments.
+struct AANoCaptureArgument final : AANoCaptureImpl {
+ AANoCaptureArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
+};
+
+/// NoCapture attribute for call site arguments.
+struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
+ AANoCaptureCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (Argument *Arg = getAssociatedArgument())
+ if (Arg->hasByValAttr())
+ indicateOptimisticFixpoint();
+ AANoCaptureImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)};
-};
-
-/// NoCapture attribute for floating values.
-struct AANoCaptureFloating final : AANoCaptureImpl {
- AANoCaptureFloating(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(nocapture)
- }
-};
-
-/// NoCapture attribute for function return value.
-struct AANoCaptureReturned final : AANoCaptureImpl {
- AANoCaptureReturned(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoCapture attribute deduction for a call site return value.
-struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
- AANoCaptureCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AANoCaptureImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(nocapture)
- }
-};
-
-/// ------------------ Value Simplify Attribute ----------------------------
-struct AAValueSimplifyImpl : AAValueSimplify {
- AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
- : AAValueSimplify(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (getAssociatedValue().getType()->isVoidTy())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
- : "not-simple";
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// See AAValueSimplify::getAssumedSimplifiedValue()
- Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
- if (!getAssumed())
- return const_cast<Value *>(&getAssociatedValue());
- return SimplifiedAssociatedValue;
- }
-
- /// Helper function for querying AAValueSimplify and updating the candidate.
- /// \param QueryingValue Value trying to unify with SimplifiedValue
- /// \param AccumulatedSimplifiedValue Current simplification result.
- static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
- Value &QueryingValue,
- Optional<Value *> &AccumulatedSimplifiedValue) {
- // FIXME: Add typecast support.
-
- auto &ValueSimplifyAA = A.getAAFor<AAValueSimplify>(
- QueryingAA, IRPosition::value(QueryingValue));
-
- Optional<Value *> QueryingValueSimplified =
- ValueSimplifyAA.getAssumedSimplifiedValue(A);
-
- if (!QueryingValueSimplified.hasValue())
- return true;
-
- if (!QueryingValueSimplified.getValue())
- return false;
-
- Value &QueryingValueSimplifiedUnwrapped =
- *QueryingValueSimplified.getValue();
-
- if (AccumulatedSimplifiedValue.hasValue() &&
- !isa<UndefValue>(AccumulatedSimplifiedValue.getValue()) &&
- !isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
- return AccumulatedSimplifiedValue == QueryingValueSimplified;
- if (AccumulatedSimplifiedValue.hasValue() &&
- isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
- return true;
-
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << QueryingValue
- << " is assumed to be "
- << QueryingValueSimplifiedUnwrapped << "\n");
-
- AccumulatedSimplifiedValue = QueryingValueSimplified;
- return true;
- }
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)};
+};
+
+/// NoCapture attribute for floating values.
+struct AANoCaptureFloating final : AANoCaptureImpl {
+ AANoCaptureFloating(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(nocapture)
+ }
+};
+
+/// NoCapture attribute for function return value.
+struct AANoCaptureReturned final : AANoCaptureImpl {
+ AANoCaptureReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoCapture attribute deduction for a call site return value.
+struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
+ AANoCaptureCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(nocapture)
+ }
+};
+
+/// ------------------ Value Simplify Attribute ----------------------------
+struct AAValueSimplifyImpl : AAValueSimplify {
+ AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplify(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getAssociatedValue().getType()->isVoidTy())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
+ : "not-simple";
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// See AAValueSimplify::getAssumedSimplifiedValue()
+ Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
+ if (!getAssumed())
+ return const_cast<Value *>(&getAssociatedValue());
+ return SimplifiedAssociatedValue;
+ }
+
+ /// Helper function for querying AAValueSimplify and updating the candidate.
+ /// \param QueryingValue Value trying to unify with SimplifiedValue
+ /// \param AccumulatedSimplifiedValue Current simplification result.
+ static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
+ Value &QueryingValue,
+ Optional<Value *> &AccumulatedSimplifiedValue) {
+ // FIXME: Add typecast support.
+
+ auto &ValueSimplifyAA = A.getAAFor<AAValueSimplify>(
+ QueryingAA, IRPosition::value(QueryingValue));
+
+ Optional<Value *> QueryingValueSimplified =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+
+ if (!QueryingValueSimplified.hasValue())
+ return true;
+
+ if (!QueryingValueSimplified.getValue())
+ return false;
+
+ Value &QueryingValueSimplifiedUnwrapped =
+ *QueryingValueSimplified.getValue();
+
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ !isa<UndefValue>(AccumulatedSimplifiedValue.getValue()) &&
+ !isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return AccumulatedSimplifiedValue == QueryingValueSimplified;
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return true;
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << QueryingValue
+ << " is assumed to be "
+ << QueryingValueSimplifiedUnwrapped << "\n");
+
+ AccumulatedSimplifiedValue = QueryingValueSimplified;
+ return true;
+ }
+
 /// Returns whether a candidate was found or not.
template <typename AAType> bool askSimplifiedValueFor(Attributor &A) {
- if (!getAssociatedValue().getType()->isIntegerTy())
- return false;
-
+ if (!getAssociatedValue().getType()->isIntegerTy())
+ return false;
+
const auto &AA =
A.getAAFor<AAType>(*this, getIRPosition(), /* TrackDependence */ false);
-
+
Optional<ConstantInt *> COpt = AA.getAssumedConstantInt(A);
if (!COpt.hasValue()) {
- SimplifiedAssociatedValue = llvm::None;
+ SimplifiedAssociatedValue = llvm::None;
A.recordDependence(AA, *this, DepClassTy::OPTIONAL);
return true;
- }
+ }
if (auto *C = COpt.getValue()) {
SimplifiedAssociatedValue = C;
A.recordDependence(AA, *this, DepClassTy::OPTIONAL);
return true;
}
return false;
- }
-
+ }
+
bool askSimplifiedValueForOtherAAs(Attributor &A) {
if (askSimplifiedValueFor<AAValueConstantRange>(A))
return true;
@@ -4581,207 +4581,207 @@ struct AAValueSimplifyImpl : AAValueSimplify {
return false;
}
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
- // We can replace the AssociatedValue with the constant.
- if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *C
- << " :: " << *this << "\n");
- if (A.changeValueAfterManifest(V, *C))
- Changed = ChangeStatus::CHANGED;
- }
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- /// See AbstractState::indicatePessimisticFixpoint(...).
- ChangeStatus indicatePessimisticFixpoint() override {
- // NOTE: Associated value will be returned in a pessimistic fixpoint and is
- // regarded as known. That's why `indicateOptimisticFixpoint` is called.
- SimplifiedAssociatedValue = &getAssociatedValue();
- indicateOptimisticFixpoint();
- return ChangeStatus::CHANGED;
- }
-
-protected:
- // An assumed simplified value. Initially, it is set to Optional::None, which
- // means that the value is not clear under current assumption. If in the
- // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
- // returns the original associated value.
- Optional<Value *> SimplifiedAssociatedValue;
-};
-
-struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
- AAValueSimplifyArgument(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- void initialize(Attributor &A) override {
- AAValueSimplifyImpl::initialize(A);
- if (!getAnchorScope() || getAnchorScope()->isDeclaration())
- indicatePessimisticFixpoint();
- if (hasAttr({Attribute::InAlloca, Attribute::Preallocated,
- Attribute::StructRet, Attribute::Nest},
- /* IgnoreSubsumingPositions */ true))
- indicatePessimisticFixpoint();
-
- // FIXME: This is a hack to prevent us from propagating function pointers in
- // the new pass manager CGSCC pass as it creates call edges the
- // CallGraphUpdater cannot handle yet.
- Value &V = getAssociatedValue();
- if (V.getType()->isPointerTy() &&
- V.getType()->getPointerElementType()->isFunctionTy() &&
- !A.isModulePass())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // Byval is only replaceable if it is readonly; otherwise we would write into
- // the replaced value and not the copy that byval creates implicitly.
- Argument *Arg = getAssociatedArgument();
- if (Arg->hasByValAttr()) {
- // TODO: We probably need to verify synchronization is not an issue, e.g.,
- // there is no race by not copying a constant byval.
- const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (!MemAA.isAssumedReadOnly())
- return indicatePessimisticFixpoint();
- }
-
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- const IRPosition &ACSArgPos =
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ // We can replace the AssociatedValue with the constant.
+ if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *C
+ << " :: " << *this << "\n");
+ if (A.changeValueAfterManifest(V, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractState::indicatePessimisticFixpoint(...).
+ ChangeStatus indicatePessimisticFixpoint() override {
+ // NOTE: Associated value will be returned in a pessimistic fixpoint and is
+ // regarded as known. That's why `indicateOptimisticFixpoint` is called.
+ SimplifiedAssociatedValue = &getAssociatedValue();
+ indicateOptimisticFixpoint();
+ return ChangeStatus::CHANGED;
+ }
+
+protected:
+ // An assumed simplified value. Initially, it is set to Optional::None, which
+ // means that the value is not clear under current assumption. If in the
+ // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
+ // returns the original associated value.
+ Optional<Value *> SimplifiedAssociatedValue;
+};
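The Optional member above effectively encodes three states, which manifest() then interprets: an empty Optional means no incoming value has constrained the position yet (so undef may be substituted), a null pointer means simplification failed and nothing is touched, and a non-null pointer is the replacement candidate. A small sketch of that interpretation, with simplified types standing in for the LLVM ones:

#include <optional>
#include <string>

// Simplified stand-in for the tri-state meaning of SimplifiedAssociatedValue
// (LLVM's Optional<Value *> is replaced by std::optional<const void *>).
std::string describeSimplification(const std::optional<const void *> &S) {
  if (!S)
    return "unconstrained: manifest may substitute undef";
  if (!*S)
    return "not simplifiable: manifest leaves the value alone";
  return "simplified: manifest replaces uses with the constant candidate";
}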
+
+struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
+ AAValueSimplifyArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ AAValueSimplifyImpl::initialize(A);
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration())
+ indicatePessimisticFixpoint();
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated,
+ Attribute::StructRet, Attribute::Nest},
+ /* IgnoreSubsumingPositions */ true))
+ indicatePessimisticFixpoint();
+
+ // FIXME: This is a hack to prevent us from propagating function pointers in
+ // the new pass manager CGSCC pass as it creates call edges the
+ // CallGraphUpdater cannot handle yet.
+ Value &V = getAssociatedValue();
+ if (V.getType()->isPointerTy() &&
+ V.getType()->getPointerElementType()->isFunctionTy() &&
+ !A.isModulePass())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // Byval is only replaceable if it is readonly; otherwise we would write into
+ // the replaced value and not the copy that byval creates implicitly.
+ Argument *Arg = getAssociatedArgument();
+ if (Arg->hasByValAttr()) {
+ // TODO: We probably need to verify synchronization is not an issue, e.g.,
+ // there is no race by not copying a constant byval.
+ const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (!MemAA.isAssumedReadOnly())
+ return indicatePessimisticFixpoint();
+ }
+
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos =
IRPosition::callsite_argument(ACS, getCallSiteArgNo());
- // Check if a corresponding argument was found or if it is not
- // associated (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- // We can only propagate thread independent values through callbacks.
- // This is different to direct/indirect call sites because for them we
- // know the thread executing the caller and callee is the same. For
- // callbacks this is not guaranteed, thus a thread dependent value could
- // be different for the caller and callee, making it invalid to propagate.
- Value &ArgOp = ACSArgPos.getAssociatedValue();
- if (ACS.isCallbackCall())
- if (auto *C = dyn_cast<Constant>(&ArgOp))
- if (C->isThreadDependent())
- return false;
- return checkAndUpdate(A, *this, ArgOp, SimplifiedAssociatedValue);
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(PredForCallSite, *this, true,
- AllCallSitesKnown))
+ // Check if a corresponding argument was found or if it is not
+ // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // We can only propagate thread independent values through callbacks.
+ // This is different to direct/indirect call sites because for them we
+ // know the thread executing the caller and callee is the same. For
+ // callbacks this is not guaranteed, thus a thread dependent value could
+ // be different for the caller and callee, making it invalid to propagate.
+ Value &ArgOp = ACSArgPos.getAssociatedValue();
+ if (ACS.isCallbackCall())
+ if (auto *C = dyn_cast<Constant>(&ArgOp))
+ if (C->isThreadDependent())
+ return false;
+ return checkAndUpdate(A, *this, ArgOp, SimplifiedAssociatedValue);
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyReturned : AAValueSimplifyImpl {
- AAValueSimplifyReturned(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForReturned = [&](Value &V) {
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- if (!A.checkForAllReturnedValues(PredForReturned, *this))
+ return indicatePessimisticFixpoint();
+
+ // If a candidate was found in this update, return CHANGED.
+ return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+ ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyReturned : AAValueSimplifyImpl {
+ AAValueSimplifyReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForReturned = [&](Value &V) {
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ if (!A.checkForAllReturnedValues(PredForReturned, *this))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
- }
-
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
- auto PredForReturned =
- [&](Value &V, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
- // We can replace the AssociatedValue with the constant.
- if (&V == C || V.getType() != C->getType() || isa<UndefValue>(V))
- return true;
-
- for (ReturnInst *RI : RetInsts) {
- if (RI->getFunction() != getAnchorScope())
- continue;
- auto *RC = C;
- if (RC->getType() != RI->getReturnValue()->getType())
- RC = ConstantExpr::getBitCast(RC,
- RI->getReturnValue()->getType());
- LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *RC
- << " in " << *RI << " :: " << *this << "\n");
- if (A.changeUseAfterManifest(RI->getOperandUse(0), *RC))
- Changed = ChangeStatus::CHANGED;
- }
- return true;
- };
- A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this);
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFloating : AAValueSimplifyImpl {
- AAValueSimplifyFloating(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // FIXME: This might have exposed a SCC iterator update bug in the old PM.
- // Needs investigation.
- // AAValueSimplifyImpl::initialize(A);
- Value &V = getAnchorValue();
-
-    // TODO: add other stuff
- if (isa<Constant>(V))
- indicatePessimisticFixpoint();
- }
-
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ auto PredForReturned =
+ [&](Value &V, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
+ // We can replace the AssociatedValue with the constant.
+ if (&V == C || V.getType() != C->getType() || isa<UndefValue>(V))
+ return true;
+
+ for (ReturnInst *RI : RetInsts) {
+ if (RI->getFunction() != getAnchorScope())
+ continue;
+ auto *RC = C;
+ if (RC->getType() != RI->getReturnValue()->getType())
+ RC = ConstantExpr::getBitCast(RC,
+ RI->getReturnValue()->getType());
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *RC
+ << " in " << *RI << " :: " << *this << "\n");
+ if (A.changeUseAfterManifest(RI->getOperandUse(0), *RC))
+ Changed = ChangeStatus::CHANGED;
+ }
+ return true;
+ };
+ A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this);
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_simplify)
+ }
+};
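// Illustrative sketch (editorial, not taken from this patch): returned-value
// simplification. Names are hypothetical. When every return statement of a
// function yields the same constant, the returned position simplifies to it
// and the uses at call sites can be rewritten with that constant.
namespace value_simplify_returned_example {
static int answer(bool flag) {
  if (flag)
    return 42;
  return 42; // all returns agree, so the returned value simplifies to 42
}
int user() { return answer(true) + answer(false); } // may fold to 84
} // namespace value_simplify_returned_example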
+
+struct AAValueSimplifyFloating : AAValueSimplifyImpl {
+ AAValueSimplifyFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // FIXME: This might have exposed a SCC iterator update bug in the old PM.
+ // Needs investigation.
+ // AAValueSimplifyImpl::initialize(A);
+ Value &V = getAnchorValue();
+
+    // TODO: add other stuff
+ if (isa<Constant>(V))
+ indicatePessimisticFixpoint();
+ }
+
/// Check if \p ICmp is an equality comparison (==/!=) with at least one
/// nullptr. If so, try to simplify it using AANonNull on the other operand.
/// Return true if successful, in that case SimplifiedAssociatedValue will be
@@ -4843,1201 +4843,1201 @@ struct AAValueSimplifyFloating : AAValueSimplifyImpl {
return true;
}
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
ChangeStatus Changed;
if (checkForNullPtrCompare(A, dyn_cast<ICmpInst>(&getAnchorValue()),
Changed))
return Changed;
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
- bool Stripped) -> bool {
- auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
-        // TODO: Look at the instruction and check recursively.
-
- LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
- << "\n");
- return false;
- }
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- bool Dummy = false;
- if (!genericValueTraversal<AAValueSimplify, bool>(
- A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ false))
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
+ bool Stripped) -> bool {
+ auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+        // TODO: Look at the instruction and check recursively.
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
+ << "\n");
+ return false;
+ }
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ bool Dummy = false;
+ if (!genericValueTraversal<AAValueSimplify, bool>(
+ A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
if (!askSimplifiedValueForOtherAAs(A))
- return indicatePessimisticFixpoint();
-
-    // If a candidate was found in this update, return CHANGED.
-
-    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
-               ? ChangeStatus::UNCHANGED
-               : ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFunction : AAValueSimplifyImpl {
- AAValueSimplifyFunction(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SimplifiedAssociatedValue = &getAnchorValue();
- indicateOptimisticFixpoint();
- }
-  /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable(
- "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FN_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
- AAValueSimplifyCallSite(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyFunction(IRP, A) {}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
- AAValueSimplifyCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyReturned(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- return AAValueSimplifyImpl::manifest(A);
- }
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_simplify)
- }
-};
-struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
- AAValueSimplifyCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAValueSimplifyFloating(IRP, A) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (SimplifiedAssociatedValue.hasValue() &&
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- Value &V = getAssociatedValue();
- auto *C = SimplifiedAssociatedValue.hasValue()
- ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
- : UndefValue::get(V.getType());
- if (C) {
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyFunction : AAValueSimplifyImpl {
+ AAValueSimplifyFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SimplifiedAssociatedValue = &getAnchorValue();
+ indicateOptimisticFixpoint();
+ }
+  /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable(
+ "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FN_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
+ AAValueSimplifyCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFunction(IRP, A) {}
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
+ AAValueSimplifyCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyReturned(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ return AAValueSimplifyImpl::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_simplify)
+ }
+};
+struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
+ AAValueSimplifyCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
Use &U = cast<CallBase>(&getAnchorValue())
->getArgOperandUse(getCallSiteArgNo());
- // We can replace the AssociatedValue with the constant.
- if (&V != C && V.getType() == C->getType()) {
- if (A.changeUseAfterManifest(U, *C))
- Changed = ChangeStatus::CHANGED;
- }
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_simplify)
- }
-};
-
-/// ----------------------- Heap-To-Stack Conversion ---------------------------
-struct AAHeapToStackImpl : public AAHeapToStack {
- AAHeapToStackImpl(const IRPosition &IRP, Attributor &A)
- : AAHeapToStack(IRP, A) {}
-
- const std::string getAsStr() const override {
- return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
- }
-
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function *F = getAnchorScope();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- for (Instruction *MallocCall : MallocCalls) {
- // This malloc cannot be replaced.
- if (BadMallocCalls.count(MallocCall))
- continue;
-
- for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
- LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
- A.deleteAfterManifest(*FreeCall);
- HasChanged = ChangeStatus::CHANGED;
- }
-
- LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
- << "\n");
-
- Align Alignment;
- Constant *Size;
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
- auto *SizeT = cast<ConstantInt>(MallocCall->getOperand(1));
- APInt TotalSize = SizeT->getValue() * Num->getValue();
- Size =
- ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
- } else if (isAlignedAllocLikeFn(MallocCall, TLI)) {
- Size = cast<ConstantInt>(MallocCall->getOperand(1));
- Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
- ->getValue()
- .getZExtValue())
- .valueOrOne();
- } else {
- Size = cast<ConstantInt>(MallocCall->getOperand(0));
- }
-
- unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
- Instruction *AI =
- new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
- "", MallocCall->getNextNode());
-
- if (AI->getType() != MallocCall->getType())
- AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
- AI->getNextNode());
-
- A.changeValueAfterManifest(*MallocCall, *AI);
-
- if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
- auto *NBB = II->getNormalDest();
- BranchInst::Create(NBB, MallocCall->getParent());
- A.deleteAfterManifest(*MallocCall);
- } else {
- A.deleteAfterManifest(*MallocCall);
- }
-
- // Zero out the allocated memory if it was a calloc.
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
- AI->getNextNode());
- Value *Ops[] = {
- BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
- ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
-
- Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
- Module *M = F->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
- CallInst::Create(Fn, Ops, "", BI->getNextNode());
- }
- HasChanged = ChangeStatus::CHANGED;
- }
-
- return HasChanged;
- }
-
- /// Collection of all malloc calls in a function.
- SmallSetVector<Instruction *, 4> MallocCalls;
-
- /// Collection of malloc calls that cannot be converted.
- DenseSet<const Instruction *> BadMallocCalls;
-
- /// A map for each malloc call to the set of associated free calls.
- DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
-
- ChangeStatus updateImpl(Attributor &A) override;
-};
-
-ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
- const Function *F = getAnchorScope();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- auto FreeCheck = [&](Instruction &I) {
- const auto &Frees = FreesForMalloc.lookup(&I);
- if (Frees.size() != 1)
- return false;
- Instruction *UniqueFree = *Frees.begin();
- return Explorer.findInContextOf(UniqueFree, I.getNextNode());
- };
-
- auto UsesCheck = [&](Instruction &I) {
- bool ValidUsesOnly = true;
- bool MustUse = true;
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (isa<LoadInst>(UserI))
- return true;
- if (auto *SI = dyn_cast<StoreInst>(UserI)) {
- if (SI->getValueOperand() == U.get()) {
- LLVM_DEBUG(dbgs()
- << "[H2S] escaping store to memory: " << *UserI << "\n");
- ValidUsesOnly = false;
- } else {
- // A store into the malloc'ed memory is fine.
- }
- return true;
- }
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
- return true;
-        // Record the free call for this malloc.
- if (isFreeCall(UserI, TLI)) {
- if (MustUse) {
- FreesForMalloc[&I].insert(UserI);
- } else {
- LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
- << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- // If a callsite argument use is nofree, we are fine.
- const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- if (!NoCaptureAA.isAssumedNoCapture() ||
- !ArgNoFreeAA.isAssumedNoFree()) {
- LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
- Follow = true;
- return true;
- }
-      // Unknown user for which we cannot track uses further (in a way that
- // makes sense).
- LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
- ValidUsesOnly = false;
- return true;
- };
- A.checkForAllUses(Pred, *this, I);
- return ValidUsesOnly;
- };
-
- auto MallocCallocCheck = [&](Instruction &I) {
- if (BadMallocCalls.count(&I))
- return true;
-
- bool IsMalloc = isMallocLikeFn(&I, TLI);
- bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI);
- bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
- if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) {
- BadMallocCalls.insert(&I);
- return true;
- }
-
- if (IsMalloc) {
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (Size->getValue().ule(MaxHeapToStackSize))
- if (UsesCheck(I) || FreeCheck(I)) {
- MallocCalls.insert(&I);
- return true;
- }
- } else if (IsAlignedAllocLike && isa<ConstantInt>(I.getOperand(0))) {
- // Only if the alignment and sizes are constant.
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
- if (Size->getValue().ule(MaxHeapToStackSize))
- if (UsesCheck(I) || FreeCheck(I)) {
- MallocCalls.insert(&I);
- return true;
- }
- } else if (IsCalloc) {
- bool Overflow = false;
- if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
- if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
- .ule(MaxHeapToStackSize))
- if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
- MallocCalls.insert(&I);
- return true;
- }
- }
-
- BadMallocCalls.insert(&I);
- return true;
- };
-
- size_t NumBadMallocs = BadMallocCalls.size();
-
- A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
-
- if (NumBadMallocs != BadMallocCalls.size())
- return ChangeStatus::CHANGED;
-
- return ChangeStatus::UNCHANGED;
-}
-
-struct AAHeapToStackFunction final : public AAHeapToStackImpl {
- AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
- : AAHeapToStackImpl(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics().
- void trackStatistics() const override {
- STATS_DECL(
- MallocCalls, Function,
- "Number of malloc/calloc/aligned_alloc calls converted to allocas");
- for (auto *C : MallocCalls)
- if (!BadMallocCalls.count(C))
- ++BUILD_STAT_NAME(MallocCalls, Function);
- }
-};
-
-/// ----------------------- Privatizable Pointers ------------------------------
-struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
- AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
-
- ChangeStatus indicatePessimisticFixpoint() override {
- AAPrivatizablePtr::indicatePessimisticFixpoint();
- PrivatizableType = nullptr;
- return ChangeStatus::CHANGED;
- }
-
-  /// Identify the type we can choose for a private copy of the underlying
- /// argument. None means it is not clear yet, nullptr means there is none.
- virtual Optional<Type *> identifyPrivatizableType(Attributor &A) = 0;
-
- /// Return a privatizable type that encloses both T0 and T1.
- /// TODO: This is merely a stub for now as we should manage a mapping as well.
- Optional<Type *> combineTypes(Optional<Type *> T0, Optional<Type *> T1) {
- if (!T0.hasValue())
- return T1;
- if (!T1.hasValue())
- return T0;
- if (T0 == T1)
- return T0;
- return nullptr;
- }
-
- Optional<Type *> getPrivatizableType() const override {
- return PrivatizableType;
- }
-
- const std::string getAsStr() const override {
- return isAssumedPrivatizablePtr() ? "[priv]" : "[no-priv]";
- }
-
-protected:
- Optional<Type *> PrivatizableType;
-};
-
-// TODO: Do this for call site arguments (probably also other values) as well.
-
-struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
- AAPrivatizablePtrArgument(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrImpl(IRP, A) {}
-
- /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
- Optional<Type *> identifyPrivatizableType(Attributor &A) override {
- // If this is a byval argument and we know all the call sites (so we can
- // rewrite them), there is no need to check them explicitly.
- bool AllCallSitesKnown;
- if (getIRPosition().hasAttr(Attribute::ByVal) &&
- A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this,
- true, AllCallSitesKnown))
- return getAssociatedValue().getType()->getPointerElementType();
-
- Optional<Type *> Ty;
+ // We can replace the AssociatedValue with the constant.
+ if (&V != C && V.getType() == C->getType()) {
+ if (A.changeUseAfterManifest(U, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_simplify)
+ }
+};
+
+/// ----------------------- Heap-To-Stack Conversion ---------------------------
+struct AAHeapToStackImpl : public AAHeapToStack {
+ AAHeapToStackImpl(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStack(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ for (Instruction *MallocCall : MallocCalls) {
+ // This malloc cannot be replaced.
+ if (BadMallocCalls.count(MallocCall))
+ continue;
+
+ for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
+ LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
+ A.deleteAfterManifest(*FreeCall);
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
+ << "\n");
+
+ Align Alignment;
+ Constant *Size;
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
+ auto *SizeT = cast<ConstantInt>(MallocCall->getOperand(1));
+ APInt TotalSize = SizeT->getValue() * Num->getValue();
+ Size =
+ ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
+ } else if (isAlignedAllocLikeFn(MallocCall, TLI)) {
+ Size = cast<ConstantInt>(MallocCall->getOperand(1));
+ Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
+ ->getValue()
+ .getZExtValue())
+ .valueOrOne();
+ } else {
+ Size = cast<ConstantInt>(MallocCall->getOperand(0));
+ }
+
+ unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
+ Instruction *AI =
+ new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
+ "", MallocCall->getNextNode());
+
+ if (AI->getType() != MallocCall->getType())
+ AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
+ AI->getNextNode());
+
+ A.changeValueAfterManifest(*MallocCall, *AI);
+
+ if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
+ auto *NBB = II->getNormalDest();
+ BranchInst::Create(NBB, MallocCall->getParent());
+ A.deleteAfterManifest(*MallocCall);
+ } else {
+ A.deleteAfterManifest(*MallocCall);
+ }
+
+ // Zero out the allocated memory if it was a calloc.
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
+ AI->getNextNode());
+ Value *Ops[] = {
+ BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
+ ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
+
+ Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
+ Module *M = F->getParent();
+ Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
+ CallInst::Create(Fn, Ops, "", BI->getNextNode());
+ }
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ return HasChanged;
+ }
+
+ /// Collection of all malloc calls in a function.
+ SmallSetVector<Instruction *, 4> MallocCalls;
+
+ /// Collection of malloc calls that cannot be converted.
+ DenseSet<const Instruction *> BadMallocCalls;
+
+ /// A map for each malloc call to the set of associated free calls.
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
+
+ ChangeStatus updateImpl(Attributor &A) override;
+};
+
+ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
+ const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
+ auto FreeCheck = [&](Instruction &I) {
+ const auto &Frees = FreesForMalloc.lookup(&I);
+ if (Frees.size() != 1)
+ return false;
+ Instruction *UniqueFree = *Frees.begin();
+ return Explorer.findInContextOf(UniqueFree, I.getNextNode());
+ };
+
+ auto UsesCheck = [&](Instruction &I) {
+ bool ValidUsesOnly = true;
+ bool MustUse = true;
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (isa<LoadInst>(UserI))
+ return true;
+ if (auto *SI = dyn_cast<StoreInst>(UserI)) {
+ if (SI->getValueOperand() == U.get()) {
+ LLVM_DEBUG(dbgs()
+ << "[H2S] escaping store to memory: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ } else {
+ // A store into the malloc'ed memory is fine.
+ }
+ return true;
+ }
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
+ return true;
+        // Record the free call for this malloc.
+ if (isFreeCall(UserI, TLI)) {
+ if (MustUse) {
+ FreesForMalloc[&I].insert(UserI);
+ } else {
+ LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
+ << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ // If a callsite argument use is nofree, we are fine.
+ const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (!NoCaptureAA.isAssumedNoCapture() ||
+ !ArgNoFreeAA.isAssumedNoFree()) {
+ LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
+ Follow = true;
+ return true;
+ }
+      // Unknown user for which we cannot track uses further (in a way that
+ // makes sense).
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ return true;
+ };
+ A.checkForAllUses(Pred, *this, I);
+ return ValidUsesOnly;
+ };
+
+ auto MallocCallocCheck = [&](Instruction &I) {
+ if (BadMallocCalls.count(&I))
+ return true;
+
+ bool IsMalloc = isMallocLikeFn(&I, TLI);
+ bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI);
+ bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
+ if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) {
+ BadMallocCalls.insert(&I);
+ return true;
+ }
+
+ if (IsMalloc) {
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsAlignedAllocLike && isa<ConstantInt>(I.getOperand(0))) {
+ // Only if the alignment and sizes are constant.
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsCalloc) {
+ bool Overflow = false;
+ if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
+ .ule(MaxHeapToStackSize))
+ if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ }
+
+ BadMallocCalls.insert(&I);
+ return true;
+ };
+
+ size_t NumBadMallocs = BadMallocCalls.size();
+
+ A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
+
+ if (NumBadMallocs != BadMallocCalls.size())
+ return ChangeStatus::CHANGED;
+
+ return ChangeStatus::UNCHANGED;
+}
+
+struct AAHeapToStackFunction final : public AAHeapToStackImpl {
+ AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStackImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics().
+ void trackStatistics() const override {
+ STATS_DECL(
+ MallocCalls, Function,
+ "Number of malloc/calloc/aligned_alloc calls converted to allocas");
+ for (auto *C : MallocCalls)
+ if (!BadMallocCalls.count(C))
+ ++BUILD_STAT_NAME(MallocCalls, Function);
+ }
+};
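// Illustrative sketch (editorial, not taken from this patch), written as a
// standalone translation unit: the observable effect of the heap-to-stack
// rewrite at the C++ level. `use` is hypothetical and assumed to behave as a
// nocapture/nofree callee, so the buffer does not escape (cf. UsesCheck).
#include <cstdlib>
#include <cstring>
namespace heap_to_stack_example {
static void use(char *Buf, unsigned N) { Buf[0] = static_cast<char>(N); }
void beforeH2S() {
  char *P = static_cast<char *>(std::calloc(4, 4)); // 16 bytes, zeroed
  use(P, 16);
  std::free(P); // a unique free on every path pairs with the allocation
}
void afterH2S() {          // roughly what the manifest above produces
  char P[16];              // stack slot (alloca in the entry block)
  std::memset(P, 0, 16);   // calloc semantics preserved via a memset
  use(P, 16);
}                          // the paired free call is simply deleted
// A pointer that escapes (stored out, or passed to a capturing callee) fails
// UsesCheck and the allocation stays on the heap.
} // namespace heap_to_stack_example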
+
+/// ----------------------- Privatizable Pointers ------------------------------
+struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
+ AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ AAPrivatizablePtr::indicatePessimisticFixpoint();
+ PrivatizableType = nullptr;
+ return ChangeStatus::CHANGED;
+ }
+
+  /// Identify the type we can choose for a private copy of the underlying
+ /// argument. None means it is not clear yet, nullptr means there is none.
+ virtual Optional<Type *> identifyPrivatizableType(Attributor &A) = 0;
+
+ /// Return a privatizable type that encloses both T0 and T1.
+ /// TODO: This is merely a stub for now as we should manage a mapping as well.
+ Optional<Type *> combineTypes(Optional<Type *> T0, Optional<Type *> T1) {
+ if (!T0.hasValue())
+ return T1;
+ if (!T1.hasValue())
+ return T0;
+ if (T0 == T1)
+ return T0;
+ return nullptr;
+ }
+
+ Optional<Type *> getPrivatizableType() const override {
+ return PrivatizableType;
+ }
+
+ const std::string getAsStr() const override {
+ return isAssumedPrivatizablePtr() ? "[priv]" : "[no-priv]";
+ }
+
+protected:
+ Optional<Type *> PrivatizableType;
+};
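// Illustrative sketch (editorial, not taken from this patch): what pointer
// privatization aims for at the source level. Names are hypothetical. A
// pointer argument whose pointee type all call sites agree on can be replaced
// by its constituent values, with a private copy rebuilt inside the callee
// (compare createInitialization/createReplacementValues further below).
namespace privatizable_ptr_example {
struct Pair { int a; float b; };
static int before(const Pair *P) { return P->a; }  // pointer argument
static int after(int a, float b) {                 // expanded constituents
  Pair Local{a, b};                                // private copy in callee
  return Local.a;
}
int caller(Pair V) { return before(&V) + after(V.a, V.b); }
} // namespace privatizable_ptr_example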
+
+// TODO: Do this for call site arguments (probably also other values) as well.
+
+struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ // If this is a byval argument and we know all the call sites (so we can
+ // rewrite them), there is no need to check them explicitly.
+ bool AllCallSitesKnown;
+ if (getIRPosition().hasAttr(Attribute::ByVal) &&
+ A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this,
+ true, AllCallSitesKnown))
+ return getAssociatedValue().getType()->getPointerElementType();
+
+ Optional<Type *> Ty;
unsigned ArgNo = getIRPosition().getCallSiteArgNo();
-
- // Make sure the associated call site argument has the same type at all call
-    // sites and it is an allocation we know is safe to privatize; for now that
- // means we only allow alloca instructions.
- // TODO: We can additionally analyze the accesses in the callee to create
- // the type from that information instead. That is a little more
- // involved and will be done in a follow up patch.
- auto CallSiteCheck = [&](AbstractCallSite ACS) {
- IRPosition ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
-      // Check if a corresponding argument was found or if it is one not
- // associated (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- // Check that all call sites agree on a type.
- auto &PrivCSArgAA = A.getAAFor<AAPrivatizablePtr>(*this, ACSArgPos);
- Optional<Type *> CSTy = PrivCSArgAA.getPrivatizableType();
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: ";
- if (CSTy.hasValue() && CSTy.getValue())
- CSTy.getValue()->print(dbgs());
- else if (CSTy.hasValue())
- dbgs() << "<nullptr>";
- else
- dbgs() << "<none>";
- });
-
- Ty = combineTypes(Ty, CSTy);
-
- LLVM_DEBUG({
- dbgs() << " : New Type: ";
- if (Ty.hasValue() && Ty.getValue())
- Ty.getValue()->print(dbgs());
- else if (Ty.hasValue())
- dbgs() << "<nullptr>";
- else
- dbgs() << "<none>";
- dbgs() << "\n";
- });
-
- return !Ty.hasValue() || Ty.getValue();
- };
-
- if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown))
- return nullptr;
- return Ty;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- PrivatizableType = identifyPrivatizableType(A);
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- if (!PrivatizableType.getValue())
- return indicatePessimisticFixpoint();
-
- // The dependence is optional so we don't give up once we give up on the
- // alignment.
- A.getAAFor<AAAlign>(*this, IRPosition::value(getAssociatedValue()),
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
-
- // Avoid arguments with padding for now.
- if (!getIRPosition().hasAttr(Attribute::ByVal) &&
- !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(),
- A.getInfoCache().getDL())) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n");
- return indicatePessimisticFixpoint();
- }
-
- // Verify callee and caller agree on how the promoted argument would be
- // passed.
- // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
- // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
- // which doesn't require the arguments ArgumentPromotion wanted to pass.
- Function &Fn = *getIRPosition().getAnchorScope();
- SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
- ArgsToPromote.insert(getAssociatedArgument());
- const auto *TTI =
- A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
- if (!TTI ||
- !ArgumentPromotionPass::areFunctionArgsABICompatible(
- Fn, *TTI, ArgsToPromote, Dummy) ||
- ArgsToPromote.empty()) {
- LLVM_DEBUG(
- dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
- << Fn.getName() << "\n");
- return indicatePessimisticFixpoint();
- }
-
- // Collect the types that will replace the privatizable type in the function
- // signature.
- SmallVector<Type *, 16> ReplacementTypes;
- identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
-
- // Register a rewrite of the argument.
- Argument *Arg = getAssociatedArgument();
- if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Rewrite not valid\n");
- return indicatePessimisticFixpoint();
- }
-
- unsigned ArgNo = Arg->getArgNo();
-
- // Helper to check if for the given call site the associated argument is
- // passed to a callback where the privatization would be different.
- auto IsCompatiblePrivArgOfCallback = [&](CallBase &CB) {
- SmallVector<const Use *, 4> CallbackUses;
- AbstractCallSite::getCallbackUses(CB, CallbackUses);
- for (const Use *U : CallbackUses) {
- AbstractCallSite CBACS(U);
- assert(CBACS && CBACS.isCallbackCall());
- for (Argument &CBArg : CBACS.getCalledFunction()->args()) {
- int CBArgNo = CBACS.getCallArgOperandNo(CBArg);
-
- LLVM_DEBUG({
- dbgs()
- << "[AAPrivatizablePtr] Argument " << *Arg
-                << " check if it can be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "callback ("
- << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
- << ")\n[AAPrivatizablePtr] " << CBArg << " : "
- << CBACS.getCallArgOperand(CBArg) << " vs "
- << CB.getArgOperand(ArgNo) << "\n"
- << "[AAPrivatizablePtr] " << CBArg << " : "
- << CBACS.getCallArgOperandNo(CBArg) << " vs " << ArgNo << "\n";
- });
-
- if (CBArgNo != int(ArgNo))
- continue;
- const auto &CBArgPrivAA =
- A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(CBArg));
- if (CBArgPrivAA.isValidState()) {
- auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType();
- if (!CBArgPrivTy.hasValue())
- continue;
- if (CBArgPrivTy.getValue() == PrivatizableType)
- continue;
- }
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
- << " cannot be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "callback ("
- << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
- << ").\n[AAPrivatizablePtr] for which the argument "
- "privatization is not compatible.\n";
- });
- return false;
- }
- }
- return true;
- };
-
- // Helper to check if for the given call site the associated argument is
- // passed to a direct call where the privatization would be different.
- auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
- CallBase *DC = cast<CallBase>(ACS.getInstruction());
- int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
- assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
- "Expected a direct call operand for callback call operand");
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
-               << " check if it can be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "direct call of ("
- << DCArgNo << "@" << DC->getCalledFunction()->getName()
- << ").\n";
- });
-
- Function *DCCallee = DC->getCalledFunction();
- if (unsigned(DCArgNo) < DCCallee->arg_size()) {
- const auto &DCArgPrivAA = A.getAAFor<AAPrivatizablePtr>(
- *this, IRPosition::argument(*DCCallee->getArg(DCArgNo)));
- if (DCArgPrivAA.isValidState()) {
- auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType();
- if (!DCArgPrivTy.hasValue())
- return true;
- if (DCArgPrivTy.getValue() == PrivatizableType)
- return true;
- }
- }
-
- LLVM_DEBUG({
- dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
- << " cannot be privatized in the context of its parent ("
- << Arg->getParent()->getName()
- << ")\n[AAPrivatizablePtr] because it is an argument in a "
- "direct call of ("
- << ACS.getInstruction()->getCalledFunction()->getName()
- << ").\n[AAPrivatizablePtr] for which the argument "
- "privatization is not compatible.\n";
- });
- return false;
- };
-
- // Helper to check if the associated argument is used at the given abstract
- // call site in a way that is incompatible with the privatization assumed
- // here.
- auto IsCompatiblePrivArgOfOtherCallSite = [&](AbstractCallSite ACS) {
- if (ACS.isDirectCall())
- return IsCompatiblePrivArgOfCallback(*ACS.getInstruction());
- if (ACS.isCallbackCall())
- return IsCompatiblePrivArgOfDirectCS(ACS);
- return false;
- };
-
- bool AllCallSitesKnown;
- if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
- AllCallSitesKnown))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
-  /// Given a type to privatize \p PrivType, collect the constituents (which are
- /// used) in \p ReplacementTypes.
- static void
- identifyReplacementTypes(Type *PrivType,
- SmallVectorImpl<Type *> &ReplacementTypes) {
- // TODO: For now we expand the privatization type to the fullest which can
- // lead to dead arguments that need to be removed later.
- assert(PrivType && "Expected privatizable type!");
-
-    // Traverse the type, extract constituent types on the outermost level.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++)
- ReplacementTypes.push_back(PrivStructType->getElementType(u));
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
- ReplacementTypes.append(PrivArrayType->getNumElements(),
- PrivArrayType->getElementType());
- } else {
- ReplacementTypes.push_back(PrivType);
- }
- }
-
- /// Initialize \p Base according to the type \p PrivType at position \p IP.
- /// The values needed are taken from the arguments of \p F starting at
- /// position \p ArgNo.
- static void createInitialization(Type *PrivType, Value &Base, Function &F,
- unsigned ArgNo, Instruction &IP) {
- assert(PrivType && "Expected privatizable type!");
-
- IRBuilder<NoFolder> IRB(&IP);
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- // Traverse the type, build GEPs and stores.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
- Type *PointeeTy = PrivStructType->getElementType(u)->getPointerTo();
- Value *Ptr = constructPointer(
- PointeeTy, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
- new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
- }
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+
+ // Make sure the associated call site argument has the same type at all call
+    // sites and it is an allocation we know is safe to privatize; for now that
+ // means we only allow alloca instructions.
+ // TODO: We can additionally analyze the accesses in the callee to create
+ // the type from that information instead. That is a little more
+ // involved and will be done in a follow up patch.
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ IRPosition ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+      // Check if a corresponding argument was found or if it is one not
+ // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // Check that all call sites agree on a type.
+ auto &PrivCSArgAA = A.getAAFor<AAPrivatizablePtr>(*this, ACSArgPos);
+ Optional<Type *> CSTy = PrivCSArgAA.getPrivatizableType();
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: ";
+ if (CSTy.hasValue() && CSTy.getValue())
+ CSTy.getValue()->print(dbgs());
+ else if (CSTy.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ });
+
+ Ty = combineTypes(Ty, CSTy);
+
+ LLVM_DEBUG({
+ dbgs() << " : New Type: ";
+ if (Ty.hasValue() && Ty.getValue())
+ Ty.getValue()->print(dbgs());
+ else if (Ty.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ dbgs() << "\n";
+ });
+
+ return !Ty.hasValue() || Ty.getValue();
+ };
+
+ if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown))
+ return nullptr;
+ return Ty;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ // The dependence is optional so we don't give up once we give up on the
+ // alignment.
+ A.getAAFor<AAAlign>(*this, IRPosition::value(getAssociatedValue()),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+
+ // Avoid arguments with padding for now.
+ if (!getIRPosition().hasAttr(Attribute::ByVal) &&
+ !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(),
+ A.getInfoCache().getDL())) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Verify callee and caller agree on how the promoted argument would be
+ // passed.
+ // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
+ // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
+ // which doesn't require the arguments ArgumentPromotion wanted to pass.
+ Function &Fn = *getIRPosition().getAnchorScope();
+ SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
+ ArgsToPromote.insert(getAssociatedArgument());
+ const auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
+ if (!TTI ||
+ !ArgumentPromotionPass::areFunctionArgsABICompatible(
+ Fn, *TTI, ArgsToPromote, Dummy) ||
+ ArgsToPromote.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
+ << Fn.getName() << "\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Rewrite not valid\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ unsigned ArgNo = Arg->getArgNo();
+
+ // Helper to check if for the given call site the associated argument is
+ // passed to a callback where the privatization would be different.
+ auto IsCompatiblePrivArgOfCallback = [&](CallBase &CB) {
+ SmallVector<const Use *, 4> CallbackUses;
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
+ AbstractCallSite CBACS(U);
+ assert(CBACS && CBACS.isCallbackCall());
+ for (Argument &CBArg : CBACS.getCalledFunction()->args()) {
+ int CBArgNo = CBACS.getCallArgOperandNo(CBArg);
+
+ LLVM_DEBUG({
+ dbgs()
+ << "[AAPrivatizablePtr] Argument " << *Arg
+                << " check if it can be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ")\n[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperand(CBArg) << " vs "
+ << CB.getArgOperand(ArgNo) << "\n"
+ << "[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperandNo(CBArg) << " vs " << ArgNo << "\n";
+ });
+
+ if (CBArgNo != int(ArgNo))
+ continue;
+ const auto &CBArgPrivAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(CBArg));
+ if (CBArgPrivAA.isValidState()) {
+ auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType();
+ if (!CBArgPrivTy.hasValue())
+ continue;
+ if (CBArgPrivTy.getValue() == PrivatizableType)
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ }
+ }
+ return true;
+ };
+
+ // Helper to check if for the given call site the associated argument is
+ // passed to a direct call where the privatization would be different.
+ auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
+ CallBase *DC = cast<CallBase>(ACS.getInstruction());
+ int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
+ assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
+ "Expected a direct call operand for callback call operand");
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+               << " check if it can be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << DCArgNo << "@" << DC->getCalledFunction()->getName()
+ << ").\n";
+ });
+
+ Function *DCCallee = DC->getCalledFunction();
+ if (unsigned(DCArgNo) < DCCallee->arg_size()) {
+ const auto &DCArgPrivAA = A.getAAFor<AAPrivatizablePtr>(
+ *this, IRPosition::argument(*DCCallee->getArg(DCArgNo)));
+ if (DCArgPrivAA.isValidState()) {
+ auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType();
+ if (!DCArgPrivTy.hasValue())
+ return true;
+ if (DCArgPrivTy.getValue() == PrivatizableType)
+ return true;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << ACS.getInstruction()->getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ };
+
+ // Helper to check if the associated argument is used at the given abstract
+ // call site in a way that is incompatible with the privatization assumed
+ // here.
+ auto IsCompatiblePrivArgOfOtherCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isDirectCall())
+ return IsCompatiblePrivArgOfCallback(*ACS.getInstruction());
+ if (ACS.isCallbackCall())
+ return IsCompatiblePrivArgOfDirectCS(ACS);
+ return false;
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+  /// Given a type to privatize \p PrivType, collect the constituents (which are
+ /// used) in \p ReplacementTypes.
+ static void
+ identifyReplacementTypes(Type *PrivType,
+ SmallVectorImpl<Type *> &ReplacementTypes) {
+ // TODO: For now we expand the privatization type to the fullest which can
+ // lead to dead arguments that need to be removed later.
+ assert(PrivType && "Expected privatizable type!");
+
+    // Traverse the type, extract constituent types on the outermost level.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++)
+ ReplacementTypes.push_back(PrivStructType->getElementType(u));
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ ReplacementTypes.append(PrivArrayType->getNumElements(),
+ PrivArrayType->getElementType());
+ } else {
+ ReplacementTypes.push_back(PrivType);
+ }
+ }
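// Editorial note (not from this patch): examples of the expansion performed
// by identifyReplacementTypes above, written as LLVM IR types:
//   { i32, float }  ->  ReplacementTypes = [ i32, float ]
//   [4 x i8]        ->  ReplacementTypes = [ i8, i8, i8, i8 ]
//   double          ->  ReplacementTypes = [ double ]
// createInitialization and createReplacementValues below then store/load
// these constituents through GEPs at the matching struct/array offsets.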
+
+ /// Initialize \p Base according to the type \p PrivType at position \p IP.
+ /// The values needed are taken from the arguments of \p F starting at
+ /// position \p ArgNo.
+ static void createInitialization(Type *PrivType, Value &Base, Function &F,
+ unsigned ArgNo, Instruction &IP) {
+ assert(PrivType && "Expected privatizable type!");
+
+ IRBuilder<NoFolder> IRB(&IP);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Traverse the type, build GEPs and stores.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u)->getPointerTo();
+ Value *Ptr = constructPointer(
+ PointeeTy, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
Type *PointeeTy = PrivArrayType->getElementType();
Type *PointeePtrTy = PointeeTy->getPointerTo();
uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
- for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PointeePtrTy, &Base, u * PointeeTySize, IRB, DL);
- new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
- }
- } else {
- new StoreInst(F.getArg(ArgNo), &Base, &IP);
- }
- }
-
- /// Extract values from \p Base according to the type \p PrivType at the
- /// call position \p ACS. The values are appended to \p ReplacementValues.
- void createReplacementValues(Align Alignment, Type *PrivType,
- AbstractCallSite ACS, Value *Base,
- SmallVectorImpl<Value *> &ReplacementValues) {
- assert(Base && "Expected base value!");
- assert(PrivType && "Expected privatizable type!");
- Instruction *IP = ACS.getInstruction();
-
- IRBuilder<NoFolder> IRB(IP);
- const DataLayout &DL = IP->getModule()->getDataLayout();
-
- if (Base->getType()->getPointerElementType() != PrivType)
- Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
- "", ACS.getInstruction());
-
- // Traverse the type, build GEPs and loads.
- if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
- const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
- for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
- Type *PointeeTy = PrivStructType->getElementType(u);
- Value *Ptr =
- constructPointer(PointeeTy->getPointerTo(), Base,
- PrivStructLayout->getElementOffset(u), IRB, DL);
- LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
- Type *PointeeTy = PrivArrayType->getElementType();
- uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
- Type *PointeePtrTy = PointeeTy->getPointerTo();
- for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
- Value *Ptr =
- constructPointer(PointeePtrTy, Base, u * PointeeTySize, IRB, DL);
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, &Base, u * PointeeTySize, IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else {
+ new StoreInst(F.getArg(ArgNo), &Base, &IP);
+ }
+ }
+
+ /// Extract values from \p Base according to the type \p PrivType at the
+ /// call position \p ACS. The values are appended to \p ReplacementValues.
+ void createReplacementValues(Align Alignment, Type *PrivType,
+ AbstractCallSite ACS, Value *Base,
+ SmallVectorImpl<Value *> &ReplacementValues) {
+ assert(Base && "Expected base value!");
+ assert(PrivType && "Expected privatizable type!");
+ Instruction *IP = ACS.getInstruction();
+
+ IRBuilder<NoFolder> IRB(IP);
+ const DataLayout &DL = IP->getModule()->getDataLayout();
+
+ if (Base->getType()->getPointerElementType() != PrivType)
+ Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
+ "", ACS.getInstruction());
+
+ // Traverse the type, build GEPs and loads.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u);
+ Value *Ptr =
+ constructPointer(PointeeTy->getPointerTo(), Base,
+ PrivStructLayout->getElementOffset(u), IRB, DL);
+ LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ Type *PointeeTy = PrivArrayType->getElementType();
+ uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
+ Type *PointeePtrTy = PointeeTy->getPointerTo();
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, Base, u * PointeeTySize, IRB, DL);
LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- } else {
- LoadInst *L = new LoadInst(PrivType, Base, "", IP);
- L->setAlignment(Alignment);
- ReplacementValues.push_back(L);
- }
- }
-
- /// See AbstractAttribute::manifest(...)
- ChangeStatus manifest(Attributor &A) override {
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- assert(PrivatizableType.getValue() && "Expected privatizable type!");
-
- // Collect all tail calls in the function as we cannot allow new allocas to
- // escape into tail recursion.
- // TODO: Be smarter about new allocas escaping into tail calls.
- SmallVector<CallInst *, 16> TailCalls;
- if (!A.checkForAllInstructions(
- [&](Instruction &I) {
- CallInst &CI = cast<CallInst>(I);
- if (CI.isTailCall())
- TailCalls.push_back(&CI);
- return true;
- },
- *this, {Instruction::Call}))
- return ChangeStatus::UNCHANGED;
-
- Argument *Arg = getAssociatedArgument();
- // Query AAAlign attribute for alignment of associated argument to
- // determine the best alignment of loads.
- const auto &AlignAA = A.getAAFor<AAAlign>(*this, IRPosition::value(*Arg));
-
- // Callback to repair the associated function. A new alloca is placed at the
- // beginning and initialized with the values passed through arguments. The
- // new alloca replaces the use of the old pointer argument.
- Attributor::ArgumentReplacementInfo::CalleeRepairCBTy FnRepairCB =
- [=](const Attributor::ArgumentReplacementInfo &ARI,
- Function &ReplacementFn, Function::arg_iterator ArgIt) {
- BasicBlock &EntryBB = ReplacementFn.getEntryBlock();
- Instruction *IP = &*EntryBB.getFirstInsertionPt();
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else {
+ LoadInst *L = new LoadInst(PrivType, Base, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ }
+
+ /// See AbstractAttribute::manifest(...)
+ ChangeStatus manifest(Attributor &A) override {
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ assert(PrivatizableType.getValue() && "Expected privatizable type!");
+
+ // Collect all tail calls in the function as we cannot allow new allocas to
+ // escape into tail recursion.
+ // TODO: Be smarter about new allocas escaping into tail calls.
+ SmallVector<CallInst *, 16> TailCalls;
+ if (!A.checkForAllInstructions(
+ [&](Instruction &I) {
+ CallInst &CI = cast<CallInst>(I);
+ if (CI.isTailCall())
+ TailCalls.push_back(&CI);
+ return true;
+ },
+ *this, {Instruction::Call}))
+ return ChangeStatus::UNCHANGED;
+
+ Argument *Arg = getAssociatedArgument();
+ // Query AAAlign attribute for alignment of associated argument to
+ // determine the best alignment of loads.
+ const auto &AlignAA = A.getAAFor<AAAlign>(*this, IRPosition::value(*Arg));
+
+ // Callback to repair the associated function. A new alloca is placed at the
+ // beginning and initialized with the values passed through arguments. The
+ // new alloca replaces the use of the old pointer argument.
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy FnRepairCB =
+ [=](const Attributor::ArgumentReplacementInfo &ARI,
+ Function &ReplacementFn, Function::arg_iterator ArgIt) {
+ BasicBlock &EntryBB = ReplacementFn.getEntryBlock();
+ Instruction *IP = &*EntryBB.getFirstInsertionPt();
Instruction *AI = new AllocaInst(PrivatizableType.getValue(), 0,
Arg->getName() + ".priv", IP);
- createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn,
- ArgIt->getArgNo(), *IP);
+ createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn,
+ ArgIt->getArgNo(), *IP);
if (AI->getType() != Arg->getType())
AI =
BitCastInst::CreateBitOrPointerCast(AI, Arg->getType(), "", IP);
- Arg->replaceAllUsesWith(AI);
-
- for (CallInst *CI : TailCalls)
- CI->setTailCall(false);
- };
-
- // Callback to repair a call site of the associated function. The elements
- // of the privatizable type are loaded prior to the call and passed to the
- // new function version.
- Attributor::ArgumentReplacementInfo::ACSRepairCBTy ACSRepairCB =
- [=, &AlignAA](const Attributor::ArgumentReplacementInfo &ARI,
- AbstractCallSite ACS,
- SmallVectorImpl<Value *> &NewArgOperands) {
- // When no alignment is specified for the load instruction,
- // natural alignment is assumed.
- createReplacementValues(
- assumeAligned(AlignAA.getAssumedAlign()),
- PrivatizableType.getValue(), ACS,
- ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()),
- NewArgOperands);
- };
-
- // Collect the types that will replace the privatizable type in the function
- // signature.
- SmallVector<Type *, 16> ReplacementTypes;
- identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
-
- // Register a rewrite of the argument.
- if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes,
- std::move(FnRepairCB),
- std::move(ACSRepairCB)))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
- AAPrivatizablePtrFloating(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- virtual void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("AAPrivatizablePtr(Floating|Returned|CallSiteReturned)::"
- "updateImpl will not be called");
- }
-
- /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
- Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ Arg->replaceAllUsesWith(AI);
+
+ for (CallInst *CI : TailCalls)
+ CI->setTailCall(false);
+ };
+
+ // Callback to repair a call site of the associated function. The elements
+ // of the privatizable type are loaded prior to the call and passed to the
+ // new function version.
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy ACSRepairCB =
+ [=, &AlignAA](const Attributor::ArgumentReplacementInfo &ARI,
+ AbstractCallSite ACS,
+ SmallVectorImpl<Value *> &NewArgOperands) {
+ // When no alignment is specified for the load instruction,
+ // natural alignment is assumed.
+ createReplacementValues(
+ assumeAligned(AlignAA.getAssumedAlign()),
+ PrivatizableType.getValue(), ACS,
+ ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()),
+ NewArgOperands);
+ };
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes,
+ std::move(FnRepairCB),
+ std::move(ACSRepairCB)))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrFloating(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ virtual void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAPrivatizablePtr(Floating|Returned|CallSiteReturned)::"
+ "updateImpl will not be called");
+ }
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
Value *Obj = getUnderlyingObject(&getAssociatedValue());
- if (!Obj) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n");
- return nullptr;
- }
-
- if (auto *AI = dyn_cast<AllocaInst>(Obj))
- if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
- if (CI->isOne())
- return Obj->getType()->getPointerElementType();
- if (auto *Arg = dyn_cast<Argument>(Obj)) {
- auto &PrivArgAA =
- A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(*Arg));
- if (PrivArgAA.isAssumedPrivatizablePtr())
- return Obj->getType()->getPointerElementType();
- }
-
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid "
- "alloca nor privatizable argument: "
- << *Obj << "!\n");
- return nullptr;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrCallSiteArgument final
- : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (getIRPosition().hasAttr(Attribute::ByVal))
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- PrivatizableType = identifyPrivatizableType(A);
- if (!PrivatizableType.hasValue())
- return ChangeStatus::UNCHANGED;
- if (!PrivatizableType.getValue())
- return indicatePessimisticFixpoint();
-
- const IRPosition &IRP = getIRPosition();
- auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
- if (!NoCaptureAA.isAssumedNoCapture()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might be captured!\n");
- return indicatePessimisticFixpoint();
- }
-
- auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
- if (!NoAliasAA.isAssumedNoAlias()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might alias!\n");
- return indicatePessimisticFixpoint();
- }
-
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, IRP);
- if (!MemBehaviorAA.isAssumedReadOnly()) {
- LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer is written!\n");
- return indicatePessimisticFixpoint();
- }
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrCallSiteReturned final
- : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(privatizable_ptr);
- }
-};
-
-struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating {
- AAPrivatizablePtrReturned(const IRPosition &IRP, Attributor &A)
- : AAPrivatizablePtrFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: We can privatize more than arguments.
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
- }
-};
-
-/// -------------------- Memory Behavior Attributes ----------------------------
-/// Includes read-none, read-only, and write-only.
-/// ----------------------------------------------------------------------------
-struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
- AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehavior(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- getKnownStateFromValue(getIRPosition(), getState());
+ if (!Obj) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n");
+ return nullptr;
+ }
+
+ if (auto *AI = dyn_cast<AllocaInst>(Obj))
+ if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
+ if (CI->isOne())
+ return Obj->getType()->getPointerElementType();
+ if (auto *Arg = dyn_cast<Argument>(Obj)) {
+ auto &PrivArgAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(*Arg));
+ if (PrivArgAA.isAssumedPrivatizablePtr())
+ return Obj->getType()->getPointerElementType();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid "
+ "alloca nor privatizable argument: "
+ << *Obj << "!\n");
+ return nullptr;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteArgument final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getIRPosition().hasAttr(Attribute::ByVal))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ const IRPosition &IRP = getIRPosition();
+ auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
+ if (!NoCaptureAA.isAssumedNoCapture()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might be captured!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
+ if (!NoAliasAA.isAssumedNoAlias()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might alias!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, IRP);
+ if (!MemBehaviorAA.isAssumedReadOnly()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer is written!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteReturned final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
+ }
+};
+
+/// -------------------- Memory Behavior Attributes ----------------------------
+/// Includes read-none, read-only, and write-only.
+/// ----------------------------------------------------------------------------
+struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
+ AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(getIRPosition(), getState());
AAMemoryBehavior::initialize(A);
- }
-
- /// Return the memory behavior information encoded in the IR for \p IRP.
- static void getKnownStateFromValue(const IRPosition &IRP,
- BitIntegerState &State,
- bool IgnoreSubsumingPositions = false) {
- SmallVector<Attribute, 2> Attrs;
- IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
- for (const Attribute &Attr : Attrs) {
- switch (Attr.getKindAsEnum()) {
- case Attribute::ReadNone:
- State.addKnownBits(NO_ACCESSES);
- break;
- case Attribute::ReadOnly:
- State.addKnownBits(NO_WRITES);
- break;
- case Attribute::WriteOnly:
- State.addKnownBits(NO_READS);
- break;
- default:
- llvm_unreachable("Unexpected attribute!");
- }
- }
-
- if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
- if (!I->mayReadFromMemory())
- State.addKnownBits(NO_READS);
- if (!I->mayWriteToMemory())
- State.addKnownBits(NO_WRITES);
- }
- }
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- assert(Attrs.size() == 0);
- if (isAssumedReadNone())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
- else if (isAssumedReadOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
- else if (isAssumedWriteOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
- assert(Attrs.size() <= 1);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- if (hasAttr(Attribute::ReadNone, /* IgnoreSubsumingPositions */ true))
- return ChangeStatus::UNCHANGED;
-
- const IRPosition &IRP = getIRPosition();
-
- // Check if we would improve the existing attributes first.
- SmallVector<Attribute, 4> DeducedAttrs;
- getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
- if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
- return IRP.hasAttr(Attr.getKindAsEnum(),
- /* IgnoreSubsumingPositions */ true);
- }))
- return ChangeStatus::UNCHANGED;
-
- // Clear existing attributes.
- IRP.removeAttrs(AttrKinds);
-
- // Use the generic manifest method.
- return IRAttribute::manifest(A);
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isAssumedReadNone())
- return "readnone";
- if (isAssumedReadOnly())
- return "readonly";
- if (isAssumedWriteOnly())
- return "writeonly";
- return "may-read/write";
- }
-
- /// The set of IR attributes AAMemoryBehavior deals with.
- static const Attribute::AttrKind AttrKinds[3];
-};
-
-const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
- Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
-
-/// Memory behavior attribute for a floating value.
-struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
- AAMemoryBehaviorFloating(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
+ }
+
+ /// Return the memory behavior information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_ACCESSES);
+ break;
+ case Attribute::ReadOnly:
+ State.addKnownBits(NO_WRITES);
+ break;
+ case Attribute::WriteOnly:
+ State.addKnownBits(NO_READS);
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+
+ if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
+ if (!I->mayReadFromMemory())
+ State.addKnownBits(NO_READS);
+ if (!I->mayWriteToMemory())
+ State.addKnownBits(NO_WRITES);
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ else if (isAssumedReadOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
+ else if (isAssumedWriteOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ if (hasAttr(Attribute::ReadNone, /* IgnoreSubsumingPositions */ true))
+ return ChangeStatus::UNCHANGED;
+
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AbstractState::getAsStr().
+ const std::string getAsStr() const override {
+ if (isAssumedReadNone())
+ return "readnone";
+ if (isAssumedReadOnly())
+ return "readonly";
+ if (isAssumedWriteOnly())
+ return "writeonly";
+ return "may-read/write";
+ }
+
+ /// The set of IR attributes AAMemoryBehavior deals with.
+ static const Attribute::AttrKind AttrKinds[3];
+};
+
+const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
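The readnone/readonly/writeonly reasoning above runs on a two-level known/assumed bit lattice. The following is a simplified standalone sketch of that lattice, not the actual LLVM BitIntegerState API:

#include <cassert>
#include <cstdint>

// Simplified model: "known" bits are proven facts, "assumed" bits are
// optimistic and may only shrink, and known bits are never dropped from the
// assumed set.
struct MemBehaviorState {
  static constexpr uint8_t NO_READS = 1 << 0;
  static constexpr uint8_t NO_WRITES = 1 << 1;
  static constexpr uint8_t NO_ACCESSES = NO_READS | NO_WRITES;

  uint8_t Known = 0;             // seeded from existing IR attributes
  uint8_t Assumed = NO_ACCESSES; // start optimistic ("readnone")

  void addKnownBits(uint8_t Bits) { Known |= Bits; Assumed |= Bits; }
  void removeAssumedBits(uint8_t Bits) { Assumed = (Assumed & ~Bits) | Known; }
  void intersectAssumedBits(uint8_t Bits) { Assumed = (Assumed & Bits) | Known; }

  bool isAssumedReadOnly() const { return Assumed & NO_WRITES; }
  bool isAssumedWriteOnly() const { return Assumed & NO_READS; }
  bool isAssumedReadNone() const { return (Assumed & NO_ACCESSES) == NO_ACCESSES; }
};

int main() {
  MemBehaviorState S;
  S.removeAssumedBits(MemBehaviorState::NO_READS); // a load was observed
  assert(S.isAssumedReadOnly() && !S.isAssumedReadNone());
  return 0;
}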
+
+/// Memory behavior attribute for a floating value.
+struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFloating(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
addUsesOf(A, getAssociatedValue());
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FLOATING_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FLOATING_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FLOATING_ATTR(writeonly)
- }
-
-private:
- /// Return true if users of \p UserI might access the underlying
- /// variable/location described by \p U and should therefore be analyzed.
- bool followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI);
-
- /// Update the state according to the effect of use \p U in \p UserI.
- void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
-
-protected:
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FLOATING_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(writeonly)
+ }
+
+private:
+ /// Return true if users of \p UserI might access the underlying
+ /// variable/location described by \p U and should therefore be analyzed.
+ bool followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI);
+
+ /// Update the state according to the effect of use \p U in \p UserI.
+ void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
+
+protected:
/// Add the uses of \p V to the `Uses` set we look at during the update step.
void addUsesOf(Attributor &A, const Value &V);
- /// Container for (transitive) uses of the associated argument.
+ /// Container for (transitive) uses of the associated argument.
SmallVector<const Use *, 8> Uses;
/// Set to remember the uses we already traversed.
SmallPtrSet<const Use *, 8> Visited;
-};
-
-/// Memory behavior attribute for function argument.
-struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
- AAMemoryBehaviorArgument(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorFloating(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- const IRPosition &IRP = getIRPosition();
- // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
- // can query it when we use has/getAttr. That would allow us to reuse the
- // initialize of the base class here.
- bool HasByVal =
- IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
- getKnownStateFromValue(IRP, getState(),
- /* IgnoreSubsumingPositions */ HasByVal);
-
- // Initialize the use vector with all direct uses of the associated value.
- Argument *Arg = getAssociatedArgument();
- if (!Arg || !A.isFunctionIPOAmendable(*(Arg->getParent()))) {
- indicatePessimisticFixpoint();
- } else {
+};
+
+/// Memory behavior attribute for function argument.
+struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ const IRPosition &IRP = getIRPosition();
+ // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
+ // can query it when we use has/getAttr. That would allow us to reuse the
+ // initialize of the base class here.
+ bool HasByVal =
+ IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
+ getKnownStateFromValue(IRP, getState(),
+ /* IgnoreSubsumingPositions */ HasByVal);
+
+ // Initialize the use vector with all direct uses of the associated value.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg || !A.isFunctionIPOAmendable(*(Arg->getParent()))) {
+ indicatePessimisticFixpoint();
+ } else {
addUsesOf(A, *Arg);
- }
- }
-
- ChangeStatus manifest(Attributor &A) override {
- // TODO: Pointer arguments are not supported on vectors of pointers yet.
- if (!getAssociatedValue().getType()->isPointerTy())
- return ChangeStatus::UNCHANGED;
-
- // TODO: From readattrs.ll: "inalloca parameters are always
- // considered written"
- if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) {
- removeKnownBits(NO_WRITES);
- removeAssumedBits(NO_WRITES);
- }
- return AAMemoryBehaviorFloating::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_ARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_ARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_ARG_ATTR(writeonly)
- }
-};
-
-struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
- AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorArgument(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
+ }
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Pointer arguments are not supported on vectors of pointers yet.
+ if (!getAssociatedValue().getType()->isPointerTy())
+ return ChangeStatus::UNCHANGED;
+
+ // TODO: From readattrs.ll: "inalloca parameters are always
+ // considered written"
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) {
+ removeKnownBits(NO_WRITES);
+ removeAssumedBits(NO_WRITES);
+ }
+ return AAMemoryBehaviorFloating::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_ARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_ARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_ARG_ATTR(writeonly)
+ }
+};
+
+struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
+ AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorArgument(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
// If we don't have an associated attribute this is either a variadic call
// or an indirect call, either way, nothing to do here.
Argument *Arg = getAssociatedArgument();
if (!Arg) {
indicatePessimisticFixpoint();
return;
- }
+ }
if (Arg->hasByValAttr()) {
addKnownBits(NO_WRITES);
removeKnownBits(NO_READS);
removeAssumedBits(NO_READS);
}
- AAMemoryBehaviorArgument::initialize(A);
+ AAMemoryBehaviorArgument::initialize(A);
if (getAssociatedFunction()->isDeclaration())
indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
return clampStateAndIndicateChange(getState(), ArgAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CSARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CSARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CSARG_ATTR(writeonly)
- }
-};
-
-/// Memory behavior attribute for a call site return position.
-struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
- AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorFloating(IRP, A) {}
-
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CSARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CSARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CSARG_ATTR(writeonly)
+ }
+};
+
+/// Memory behavior attribute for a call site return position.
+struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
/// See AbstractAttribute::initialize(...).
void initialize(Attributor &A) override {
AAMemoryBehaviorImpl::initialize(A);
@@ -6046,181 +6046,181 @@ struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
indicatePessimisticFixpoint();
}
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // We do not annotate returned values.
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// An AA to represent the memory behavior function attributes.
-struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
- AAMemoryBehaviorFunction(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Function &F = cast<Function>(getAnchorValue());
- if (isAssumedReadNone()) {
- F.removeFnAttr(Attribute::ArgMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
- }
- return AAMemoryBehaviorImpl::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FN_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FN_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FN_ATTR(writeonly)
- }
-};
-
-/// AAMemoryBehavior attribute for call sites.
-struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
- AAMemoryBehaviorCallSite(const IRPosition &IRP, Attributor &A)
- : AAMemoryBehaviorImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // We do not annotate returned values.
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Function &F = cast<Function>(getAnchorValue());
+ if (isAssumedReadNone()) {
+ F.removeFnAttr(Attribute::ArgMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+ }
+ return AAMemoryBehaviorImpl::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FN_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FN_ATTR(writeonly)
+ }
+};
+
+/// AAMemoryBehavior attribute for call sites.
+struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
return clampStateAndIndicateChange(getState(), FnAA.getState());
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CS_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CS_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CS_ATTR(writeonly)
- }
-};
-
-ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
-
- // The current assumed state used to determine a change.
- auto AssumedState = getAssumed();
-
- auto CheckRWInst = [&](Instruction &I) {
- // If the instruction has an own memory behavior state, use it to restrict
- // the local state. No further analysis is required as the other memory
- // state is as optimistic as it gets.
- if (const auto *CB = dyn_cast<CallBase>(&I)) {
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, IRPosition::callsite_function(*CB));
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return !isAtFixpoint();
- }
-
- // Remove access kind modifiers if necessary.
- if (I.mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (I.mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
- return !isAtFixpoint();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
- return indicatePessimisticFixpoint();
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
-ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
-
- const IRPosition &IRP = getIRPosition();
- const IRPosition &FnPos = IRPosition::function_scope(IRP);
- AAMemoryBehavior::StateType &S = getState();
-
- // First, check the function scope. We take the known information and we avoid
- // work if the assumed information implies the current assumed information for
- // this attribute. This is valid for all but byval arguments.
- Argument *Arg = IRP.getAssociatedArgument();
- AAMemoryBehavior::base_t FnMemAssumedState =
- AAMemoryBehavior::StateType::getWorstState();
- if (!Arg || !Arg->hasByValAttr()) {
- const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(
- *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- FnMemAssumedState = FnMemAA.getAssumed();
- S.addKnownBits(FnMemAA.getKnown());
- if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
- return ChangeStatus::UNCHANGED;
- }
-
- // Make sure the value is not captured (except through "return"); if
- // it is, any information derived would be irrelevant anyway as we cannot
- // check the potential aliases introduced by the capture. However, there is
- // no need to fall back to anything less optimistic than the function state.
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- S.intersectAssumedBits(FnMemAssumedState);
- return ChangeStatus::CHANGED;
- }
-
- // The current assumed state used to determine a change.
- auto AssumedState = S.getAssumed();
-
- // Liveness information to exclude dead users.
- // TODO: Take the FnPos once we have call site specific liveness information.
- const auto &LivenessAA = A.getAAFor<AAIsDead>(
- *this, IRPosition::function(*IRP.getAssociatedFunction()),
- /* TrackDependence */ false);
-
- // Visit and expand uses until all are analyzed or a fixpoint is reached.
- for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
- const Use *U = Uses[i];
- Instruction *UserI = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
- << " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
- << "]\n");
- if (A.isAssumedDead(*U, this, &LivenessAA))
- continue;
-
- // Droppable users, e.g., llvm::assume, do not actually perform any action.
- if (UserI->isDroppable())
- continue;
-
- // Check if the users of UserI should also be visited.
- if (followUsersOfUseIn(A, U, UserI))
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CS_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CS_ATTR(writeonly)
+ }
+};
+
+ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+
+ auto CheckRWInst = [&](Instruction &I) {
+ // If the instruction has an own memory behavior state, use it to restrict
+ // the local state. No further analysis is required as the other memory
+ // state is as optimistic as it gets.
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_function(*CB));
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return !isAtFixpoint();
+ }
+
+ // Remove access kind modifiers if necessary.
+ if (I.mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (I.mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+ return !isAtFixpoint();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
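The function above follows the usual fixpoint-update shape: snapshot the assumed state, clamp it against every call site and read/write instruction, and report a change only if a bit was lost. A standalone caricature of that pattern, with hypothetical InstEffect inputs rather than LLVM IR:

#include <vector>

enum : unsigned { NO_READS = 1u, NO_WRITES = 2u };

struct InstEffect { bool MayRead, MayWrite; };

// Returns true ("CHANGED") iff clamping the assumed bits against the
// instruction effects removed something, mirroring the shape of
// AAMemoryBehaviorFunction::updateImpl.
bool update(unsigned &Assumed, const std::vector<InstEffect> &Insts) {
  unsigned Before = Assumed;
  for (const InstEffect &I : Insts) {
    if (I.MayRead)
      Assumed &= ~NO_READS;
    if (I.MayWrite)
      Assumed &= ~NO_WRITES;
    if (Assumed == 0) // already at the pessimistic fixpoint
      break;
  }
  return Assumed != Before;
}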
+
+ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
+
+ const IRPosition &IRP = getIRPosition();
+ const IRPosition &FnPos = IRPosition::function_scope(IRP);
+ AAMemoryBehavior::StateType &S = getState();
+
+ // First, check the function scope. We take the known information and we avoid
+ // work if the assumed information implies the current assumed information for
+ // this attribute. This is valid for all but byval arguments.
+ Argument *Arg = IRP.getAssociatedArgument();
+ AAMemoryBehavior::base_t FnMemAssumedState =
+ AAMemoryBehavior::StateType::getWorstState();
+ if (!Arg || !Arg->hasByValAttr()) {
+ const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ FnMemAssumedState = FnMemAA.getAssumed();
+ S.addKnownBits(FnMemAA.getKnown());
+ if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // Make sure the value is not captured (except through "return"); if
+ // it is, any information derived would be irrelevant anyway as we cannot
+ // check the potential aliases introduced by the capture. However, there is
+ // no need to fall back to anything less optimistic than the function state.
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ S.intersectAssumedBits(FnMemAssumedState);
+ return ChangeStatus::CHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = S.getAssumed();
+
+ // Liveness information to exclude dead users.
+ // TODO: Take the FnPos once we have call site specific liveness information.
+ const auto &LivenessAA = A.getAAFor<AAIsDead>(
+ *this, IRPosition::function(*IRP.getAssociatedFunction()),
+ /* TrackDependence */ false);
+
+ // Visit and expand uses until all are analyzed or a fixpoint is reached.
+ for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
+ const Use *U = Uses[i];
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
+ << " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
+ << "]\n");
+ if (A.isAssumedDead(*U, this, &LivenessAA))
+ continue;
+
+ // Droppable users, e.g., llvm::assume, do not actually perform any action.
+ if (UserI->isDroppable())
+ continue;
+
+ // Check if the users of UserI should also be visited.
+ if (followUsersOfUseIn(A, U, UserI))
addUsesOf(A, *UserI);
-
- // If UserI might touch memory we analyze the use in detail.
- if (UserI->mayReadOrWriteMemory())
- analyzeUseIn(A, U, UserI);
- }
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
+
+ // If UserI might touch memory we analyze the use in detail.
+ if (UserI->mayReadOrWriteMemory())
+ analyzeUseIn(A, U, UserI);
+ }
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
+
void AAMemoryBehaviorFloating::addUsesOf(Attributor &A, const Value &V) {
SmallVector<const Use *, 8> WL;
for (const Use &U : V.uses())
@@ -6243,320 +6243,320 @@ void AAMemoryBehaviorFloating::addUsesOf(Attributor &A, const Value &V) {
}
}
-bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- // The loaded value is unrelated to the pointer argument, no need to
- // follow the users of the load.
- if (isa<LoadInst>(UserI))
- return false;
-
- // By default we follow all uses assuming UserI might leak information on U;
- // we have special handling for call site operands though.
- const auto *CB = dyn_cast<CallBase>(UserI);
- if (!CB || !CB->isArgOperand(U))
- return true;
-
- // If the use is a call argument known not to be captured, the users of
- // the call do not need to be visited because they have to be unrelated to
- // the input. Note that this check is not trivial even though we disallow
- // general capturing of the underlying argument. The reason is that the
- // call might capture the argument "through return", which we allow and for
- // which we need to check call users.
- if (U->get()->getType()->isPointerTy()) {
- unsigned ArgNo = CB->getArgOperandNo(U);
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo),
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
- return !ArgNoCaptureAA.isAssumedNoCapture();
- }
-
- return true;
-}
-
-void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- assert(UserI->mayReadOrWriteMemory());
-
- switch (UserI->getOpcode()) {
- default:
- // TODO: Handle all atomics and other side-effect operations we know of.
- break;
- case Instruction::Load:
- // Loads cause the NO_READS property to disappear.
- removeAssumedBits(NO_READS);
- return;
-
- case Instruction::Store:
- // Stores cause the NO_WRITES property to disappear if the use is the
- // pointer operand. Note that we do assume that capturing was taken care of
- // somewhere else.
- if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
- removeAssumedBits(NO_WRITES);
- return;
-
- case Instruction::Call:
- case Instruction::CallBr:
- case Instruction::Invoke: {
- // For call sites we look at the argument memory behavior attribute (this
- // could be recursive!) in order to restrict our own state.
- const auto *CB = cast<CallBase>(UserI);
-
- // Give up on operand bundles.
- if (CB->isBundleOperand(U)) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // Calling a function does read the function pointer, maybe write it if the
- // function is self-modifying.
- if (CB->isCallee(U)) {
- removeAssumedBits(NO_READS);
- break;
- }
-
- // Adjust the possible access behavior based on the information on the
- // argument.
- IRPosition Pos;
- if (U->get()->getType()->isPointerTy())
- Pos = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(U));
- else
- Pos = IRPosition::callsite_function(*CB);
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, Pos,
- /* TrackDependence */ true, DepClassTy::OPTIONAL);
- // "assumed" has at most the same bits as the MemBehaviorAA assumed
- // and at least "known".
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return;
- }
- };
-
- // Generally, look at the "may-properties" and adjust the assumed state if we
- // did not trigger special handling before.
- if (UserI->mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (UserI->mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
-}
-
-} // namespace
-
-/// -------------------- Memory Locations Attributes ---------------------------
-/// Includes read-none, argmemonly, inaccessiblememonly,
-/// inaccessiblememorargmemonly
-/// ----------------------------------------------------------------------------
-
-std::string AAMemoryLocation::getMemoryLocationsAsStr(
- AAMemoryLocation::MemoryLocationsKind MLK) {
- if (0 == (MLK & AAMemoryLocation::NO_LOCATIONS))
- return "all memory";
- if (MLK == AAMemoryLocation::NO_LOCATIONS)
- return "no memory";
- std::string S = "memory:";
- if (0 == (MLK & AAMemoryLocation::NO_LOCAL_MEM))
- S += "stack,";
- if (0 == (MLK & AAMemoryLocation::NO_CONST_MEM))
- S += "constant,";
- if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_INTERNAL_MEM))
- S += "internal global,";
- if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_EXTERNAL_MEM))
- S += "external global,";
- if (0 == (MLK & AAMemoryLocation::NO_ARGUMENT_MEM))
- S += "argument,";
- if (0 == (MLK & AAMemoryLocation::NO_INACCESSIBLE_MEM))
- S += "inaccessible,";
- if (0 == (MLK & AAMemoryLocation::NO_MALLOCED_MEM))
- S += "malloced,";
- if (0 == (MLK & AAMemoryLocation::NO_UNKOWN_MEM))
- S += "unknown,";
- S.pop_back();
- return S;
-}
-
-namespace {
-struct AAMemoryLocationImpl : public AAMemoryLocation {
-
- AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- AccessKind2Accesses[u] = nullptr;
- }
-
- ~AAMemoryLocationImpl() {
- // The AccessSets are allocated via a BumpPtrAllocator, so we call
- // the destructors manually.
- for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
- if (AccessKind2Accesses[u])
- AccessKind2Accesses[u]->~AccessSet();
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- getKnownStateFromValue(A, getIRPosition(), getState());
+bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ // The loaded value is unrelated to the pointer argument, no need to
+ // follow the users of the load.
+ if (isa<LoadInst>(UserI))
+ return false;
+
+ // By default we follow all uses assuming UserI might leak information on U;
+ // we have special handling for call site operands though.
+ const auto *CB = dyn_cast<CallBase>(UserI);
+ if (!CB || !CB->isArgOperand(U))
+ return true;
+
+ // If the use is a call argument known not to be captured, the users of
+ // the call do not need to be visited because they have to be unrelated to
+ // the input. Note that this check is not trivial even though we disallow
+ // general capturing of the underlying argument. The reason is that the
+ // call might capture the argument "through return", which we allow and for
+ // which we need to check call users.
+ if (U->get()->getType()->isPointerTy()) {
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ return !ArgNoCaptureAA.isAssumedNoCapture();
+ }
+
+ return true;
+}
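Roughly, the decision above sorts users of a pointer argument as in the following source-level sketch; the helper declarations are hypothetical and assume nocapture has already been deduced for readsOnly's parameter:

// Hypothetical helpers, only to illustrate which users are followed.
extern void readsOnly(const int *P);
extern int *stash(int *P); // may capture / return its argument

int example(int *P) {
  int V = *P;        // load: users of the loaded value V are NOT followed
  readsOnly(P);      // nocapture call argument: users of the call not followed
  int *Q = stash(P); // may capture "through return": users of the call ARE
                     // followed, so the store below still counts against P
  *Q = V;            // this is what would clear NO_WRITES for P
  return V;
}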
+
+void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ assert(UserI->mayReadOrWriteMemory());
+
+ switch (UserI->getOpcode()) {
+ default:
+ // TODO: Handle all atomics and other side-effect operations we know of.
+ break;
+ case Instruction::Load:
+ // Loads cause the NO_READS property to disappear.
+ removeAssumedBits(NO_READS);
+ return;
+
+ case Instruction::Store:
+ // Stores cause the NO_WRITES property to disappear if the use is the
+ // pointer operand. Note that we do assume that capturing was taken care of
+ // somewhere else.
+ if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
+ removeAssumedBits(NO_WRITES);
+ return;
+
+ case Instruction::Call:
+ case Instruction::CallBr:
+ case Instruction::Invoke: {
+ // For call sites we look at the argument memory behavior attribute (this
+ // could be recursive!) in order to restrict our own state.
+ const auto *CB = cast<CallBase>(UserI);
+
+ // Give up on operand bundles.
+ if (CB->isBundleOperand(U)) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // Calling a function does read the function pointer, maybe write it if the
+ // function is self-modifying.
+ if (CB->isCallee(U)) {
+ removeAssumedBits(NO_READS);
+ break;
+ }
+
+ // Adjust the possible access behavior based on the information on the
+ // argument.
+ IRPosition Pos;
+ if (U->get()->getType()->isPointerTy())
+ Pos = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(U));
+ else
+ Pos = IRPosition::callsite_function(*CB);
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, Pos,
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ // "assumed" has at most the same bits as the MemBehaviorAA assumed
+ // and at least "known".
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return;
+ }
+ };
+
+ // Generally, look at the "may-properties" and adjust the assumed state if we
+ // did not trigger special handling before.
+ if (UserI->mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (UserI->mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+}
+
+} // namespace
+
+/// -------------------- Memory Locations Attributes ---------------------------
+/// Includes read-none, argmemonly, inaccessiblememonly,
+/// inaccessiblememorargmemonly
+/// ----------------------------------------------------------------------------
+
+std::string AAMemoryLocation::getMemoryLocationsAsStr(
+ AAMemoryLocation::MemoryLocationsKind MLK) {
+ if (0 == (MLK & AAMemoryLocation::NO_LOCATIONS))
+ return "all memory";
+ if (MLK == AAMemoryLocation::NO_LOCATIONS)
+ return "no memory";
+ std::string S = "memory:";
+ if (0 == (MLK & AAMemoryLocation::NO_LOCAL_MEM))
+ S += "stack,";
+ if (0 == (MLK & AAMemoryLocation::NO_CONST_MEM))
+ S += "constant,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_INTERNAL_MEM))
+ S += "internal global,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_EXTERNAL_MEM))
+ S += "external global,";
+ if (0 == (MLK & AAMemoryLocation::NO_ARGUMENT_MEM))
+ S += "argument,";
+ if (0 == (MLK & AAMemoryLocation::NO_INACCESSIBLE_MEM))
+ S += "inaccessible,";
+ if (0 == (MLK & AAMemoryLocation::NO_MALLOCED_MEM))
+ S += "malloced,";
+ if (0 == (MLK & AAMemoryLocation::NO_UNKOWN_MEM))
+ S += "unknown,";
+ S.pop_back();
+ return S;
+}
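The encoding is easy to misread: a set NO_* bit means the location is proven not accessed, so the string lists the complement. A trimmed-down analogue with a reduced location set, shown only to illustrate the expected output:

#include <iostream>
#include <string>

// Simplified stand-in for the real MemoryLocationsKind bit set.
enum : unsigned {
  NO_LOCAL = 1, NO_CONST = 2, NO_ARG = 4, NO_UNKNOWN = 8,
  NO_LOCATIONS = NO_LOCAL | NO_CONST | NO_ARG | NO_UNKNOWN
};

std::string asStr(unsigned MLK) {
  if ((MLK & NO_LOCATIONS) == 0) return "all memory";
  if (MLK == NO_LOCATIONS) return "no memory";
  std::string S = "memory:";
  if (!(MLK & NO_LOCAL)) S += "stack,";
  if (!(MLK & NO_CONST)) S += "constant,";
  if (!(MLK & NO_ARG)) S += "argument,";
  if (!(MLK & NO_UNKNOWN)) S += "unknown,";
  S.pop_back(); // drop the trailing comma
  return S;
}

int main() {
  // A function assumed to touch only its arguments and the stack:
  std::cout << asStr(NO_CONST | NO_UNKNOWN) << "\n"; // "memory:stack,argument"
}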
+
+namespace {
+struct AAMemoryLocationImpl : public AAMemoryLocation {
+
+ AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ AccessKind2Accesses[u] = nullptr;
+ }
+
+ ~AAMemoryLocationImpl() {
+ // The AccessSets are allocated via a BumpPtrAllocator, so we call
+ // the destructors manually.
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ if (AccessKind2Accesses[u])
+ AccessKind2Accesses[u]->~AccessSet();
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(A, getIRPosition(), getState());
AAMemoryLocation::initialize(A);
- }
-
- /// Return the memory behavior information encoded in the IR for \p IRP.
- static void getKnownStateFromValue(Attributor &A, const IRPosition &IRP,
- BitIntegerState &State,
- bool IgnoreSubsumingPositions = false) {
- // For internal functions we ignore `argmemonly` and
- // `inaccessiblememorargmemonly` as we might break them via interprocedural
- // constant propagation. It is unclear if this is the best way but it is
- // unlikely this will cause real performance problems. If we are deriving
- // attributes for the anchor function we even remove the attribute in
- // addition to ignoring it.
- bool UseArgMemOnly = true;
- Function *AnchorFn = IRP.getAnchorScope();
- if (AnchorFn && A.isRunOn(*AnchorFn))
- UseArgMemOnly = !AnchorFn->hasLocalLinkage();
-
- SmallVector<Attribute, 2> Attrs;
- IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
- for (const Attribute &Attr : Attrs) {
- switch (Attr.getKindAsEnum()) {
- case Attribute::ReadNone:
- State.addKnownBits(NO_LOCAL_MEM | NO_CONST_MEM);
- break;
- case Attribute::InaccessibleMemOnly:
- State.addKnownBits(inverseLocation(NO_INACCESSIBLE_MEM, true, true));
- break;
- case Attribute::ArgMemOnly:
- if (UseArgMemOnly)
- State.addKnownBits(inverseLocation(NO_ARGUMENT_MEM, true, true));
- else
- IRP.removeAttrs({Attribute::ArgMemOnly});
- break;
- case Attribute::InaccessibleMemOrArgMemOnly:
- if (UseArgMemOnly)
- State.addKnownBits(inverseLocation(
- NO_INACCESSIBLE_MEM | NO_ARGUMENT_MEM, true, true));
- else
- IRP.removeAttrs({Attribute::InaccessibleMemOrArgMemOnly});
- break;
- default:
- llvm_unreachable("Unexpected attribute!");
- }
- }
- }
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- assert(Attrs.size() == 0);
- if (isAssumedReadNone()) {
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
- } else if (getIRPosition().getPositionKind() == IRPosition::IRP_FUNCTION) {
- if (isAssumedInaccessibleMemOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::InaccessibleMemOnly));
- else if (isAssumedArgMemOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ArgMemOnly));
- else if (isAssumedInaccessibleOrArgMemOnly())
- Attrs.push_back(
- Attribute::get(Ctx, Attribute::InaccessibleMemOrArgMemOnly));
- }
- assert(Attrs.size() <= 1);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- // Check if we would improve the existing attributes first.
- SmallVector<Attribute, 4> DeducedAttrs;
- getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
- if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
- return IRP.hasAttr(Attr.getKindAsEnum(),
- /* IgnoreSubsumingPositions */ true);
- }))
- return ChangeStatus::UNCHANGED;
-
- // Clear existing attributes.
- IRP.removeAttrs(AttrKinds);
- if (isAssumedReadNone())
- IRP.removeAttrs(AAMemoryBehaviorImpl::AttrKinds);
-
- // Use the generic manifest method.
- return IRAttribute::manifest(A);
- }
-
- /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...).
- bool checkForAllAccessesToMemoryKind(
- function_ref<bool(const Instruction *, const Value *, AccessKind,
- MemoryLocationsKind)>
- Pred,
- MemoryLocationsKind RequestedMLK) const override {
- if (!isValidState())
- return false;
-
- MemoryLocationsKind AssumedMLK = getAssumedNotAccessedLocation();
- if (AssumedMLK == NO_LOCATIONS)
- return true;
-
- unsigned Idx = 0;
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS;
- CurMLK *= 2, ++Idx) {
- if (CurMLK & RequestedMLK)
- continue;
-
- if (const AccessSet *Accesses = AccessKind2Accesses[Idx])
- for (const AccessInfo &AI : *Accesses)
- if (!Pred(AI.I, AI.Ptr, AI.Kind, CurMLK))
- return false;
- }
-
- return true;
- }
-
- ChangeStatus indicatePessimisticFixpoint() override {
- // If we give up and indicate a pessimistic fixpoint this instruction will
- // become an access for all potential access kinds:
- // TODO: Add pointers for argmemonly and globals to improve the results of
- // checkForAllAccessesToMemoryKind.
- bool Changed = false;
- MemoryLocationsKind KnownMLK = getKnown();
- Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2)
- if (!(CurMLK & KnownMLK))
- updateStateAndAccessesMap(getState(), CurMLK, I, nullptr, Changed,
- getAccessKindFromInst(I));
- return AAMemoryLocation::indicatePessimisticFixpoint();
- }
-
-protected:
- /// Helper struct to tie together an instruction that has a read or write
- /// effect with the pointer it accesses (if any).
- struct AccessInfo {
-
- /// The instruction that caused the access.
- const Instruction *I;
-
- /// The base pointer that is accessed, or null if unknown.
- const Value *Ptr;
-
- /// The kind of access (read/write/read+write).
- AccessKind Kind;
-
- bool operator==(const AccessInfo &RHS) const {
- return I == RHS.I && Ptr == RHS.Ptr && Kind == RHS.Kind;
- }
- bool operator()(const AccessInfo &LHS, const AccessInfo &RHS) const {
- if (LHS.I != RHS.I)
- return LHS.I < RHS.I;
- if (LHS.Ptr != RHS.Ptr)
- return LHS.Ptr < RHS.Ptr;
- if (LHS.Kind != RHS.Kind)
- return LHS.Kind < RHS.Kind;
- return false;
- }
- };
-
- /// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
- /// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
- using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
- AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
-
+ }
+
+ /// Return the memory location information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(Attributor &A, const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ // For internal functions we ignore `argmemonly` and
+ // `inaccessiblememorargmemonly` as we might break them via interprocedural
+ // constant propagation. It is unclear if this is the best way, but it is
+ // unlikely this will cause real performance problems. If we are deriving
+ // attributes for the anchor function we even remove the attribute in
+ // addition to ignoring it.
+ bool UseArgMemOnly = true;
+ Function *AnchorFn = IRP.getAnchorScope();
+ if (AnchorFn && A.isRunOn(*AnchorFn))
+ UseArgMemOnly = !AnchorFn->hasLocalLinkage();
+
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_LOCAL_MEM | NO_CONST_MEM);
+ break;
+ case Attribute::InaccessibleMemOnly:
+ State.addKnownBits(inverseLocation(NO_INACCESSIBLE_MEM, true, true));
+ break;
+ case Attribute::ArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::ArgMemOnly});
+ break;
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(
+ NO_INACCESSIBLE_MEM | NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::InaccessibleMemOrArgMemOnly});
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone()) {
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ } else if (getIRPosition().getPositionKind() == IRPosition::IRP_FUNCTION) {
+ if (isAssumedInaccessibleMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::InaccessibleMemOnly));
+ else if (isAssumedArgMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ArgMemOnly));
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ Attrs.push_back(
+ Attribute::get(Ctx, Attribute::InaccessibleMemOrArgMemOnly));
+ }
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+ if (isAssumedReadNone())
+ IRP.removeAttrs(AAMemoryBehaviorImpl::AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...).
+ bool checkForAllAccessesToMemoryKind(
+ function_ref<bool(const Instruction *, const Value *, AccessKind,
+ MemoryLocationsKind)>
+ Pred,
+ MemoryLocationsKind RequestedMLK) const override {
+ if (!isValidState())
+ return false;
+
+ MemoryLocationsKind AssumedMLK = getAssumedNotAccessedLocation();
+ if (AssumedMLK == NO_LOCATIONS)
+ return true;
+
+ unsigned Idx = 0;
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS;
+ CurMLK *= 2, ++Idx) {
+ if (CurMLK & RequestedMLK)
+ continue;
+
+ if (const AccessSet *Accesses = AccessKind2Accesses[Idx])
+ for (const AccessInfo &AI : *Accesses)
+ if (!Pred(AI.I, AI.Ptr, AI.Kind, CurMLK))
+ return false;
+ }
+
+ return true;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ // If we give up and indicate a pessimistic fixpoint this instruction will
+ // become an access for all potential access kinds:
+ // TODO: Add pointers for argmemonly and globals to improve the results of
+ // checkForAllAccessesToMemoryKind.
+ bool Changed = false;
+ MemoryLocationsKind KnownMLK = getKnown();
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2)
+ if (!(CurMLK & KnownMLK))
+ updateStateAndAccessesMap(getState(), CurMLK, I, nullptr, Changed,
+ getAccessKindFromInst(I));
+ return AAMemoryLocation::indicatePessimisticFixpoint();
+ }
+
+protected:
+ /// Helper struct to tie together an instruction that has a read or write
+ /// effect with the pointer it accesses (if any).
+ struct AccessInfo {
+
+ /// The instruction that caused the access.
+ const Instruction *I;
+
+ /// The base pointer that is accessed, or null if unknown.
+ const Value *Ptr;
+
+ /// The kind of access (read/write/read+write).
+ AccessKind Kind;
+
+ bool operator==(const AccessInfo &RHS) const {
+ return I == RHS.I && Ptr == RHS.Ptr && Kind == RHS.Kind;
+ }
+ bool operator()(const AccessInfo &LHS, const AccessInfo &RHS) const {
+ if (LHS.I != RHS.I)
+ return LHS.I < RHS.I;
+ if (LHS.Ptr != RHS.Ptr)
+ return LHS.Ptr < RHS.Ptr;
+ if (LHS.Kind != RHS.Kind)
+ return LHS.Kind < RHS.Kind;
+ return false;
+ }
+ };
+
+ /// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
+ /// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
+ using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
+ AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
+
/// Categorize the pointer arguments of CB that might access memory in
/// AccessedLoc and update the state and access map accordingly.
void
@@ -6564,82 +6564,82 @@ protected:
AAMemoryLocation::StateType &AccessedLocs,
bool &Changed);
- /// Return the kind(s) of location that may be accessed by \p I.
- AAMemoryLocation::MemoryLocationsKind
- categorizeAccessedLocations(Attributor &A, Instruction &I, bool &Changed);
-
- /// Return the access kind as determined by \p I.
- AccessKind getAccessKindFromInst(const Instruction *I) {
- AccessKind AK = READ_WRITE;
- if (I) {
- AK = I->mayReadFromMemory() ? READ : NONE;
- AK = AccessKind(AK | (I->mayWriteToMemory() ? WRITE : NONE));
- }
- return AK;
- }
-
- /// Update the state \p State and the AccessKind2Accesses given that \p I is
- /// an access of kind \p AK to a \p MLK memory location with the access
- /// pointer \p Ptr.
- void updateStateAndAccessesMap(AAMemoryLocation::StateType &State,
- MemoryLocationsKind MLK, const Instruction *I,
- const Value *Ptr, bool &Changed,
- AccessKind AK = READ_WRITE) {
-
- assert(isPowerOf2_32(MLK) && "Expected a single location set!");
- auto *&Accesses = AccessKind2Accesses[llvm::Log2_32(MLK)];
- if (!Accesses)
- Accesses = new (Allocator) AccessSet();
- Changed |= Accesses->insert(AccessInfo{I, Ptr, AK}).second;
- State.removeAssumedBits(MLK);
- }
-
- /// Determine the underlying location kinds for \p Ptr, e.g., globals or
- /// arguments, and update the state and access map accordingly.
- void categorizePtrValue(Attributor &A, const Instruction &I, const Value &Ptr,
- AAMemoryLocation::StateType &State, bool &Changed);
-
- /// Used to allocate access sets.
- BumpPtrAllocator &Allocator;
-
- /// The set of IR attributes AAMemoryLocation deals with.
- static const Attribute::AttrKind AttrKinds[4];
-};
-
-const Attribute::AttrKind AAMemoryLocationImpl::AttrKinds[] = {
- Attribute::ReadNone, Attribute::InaccessibleMemOnly, Attribute::ArgMemOnly,
- Attribute::InaccessibleMemOrArgMemOnly};
-
-void AAMemoryLocationImpl::categorizePtrValue(
- Attributor &A, const Instruction &I, const Value &Ptr,
- AAMemoryLocation::StateType &State, bool &Changed) {
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize pointer locations for "
- << Ptr << " ["
- << getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
-
- auto StripGEPCB = [](Value *V) -> Value * {
- auto *GEP = dyn_cast<GEPOperator>(V);
- while (GEP) {
- V = GEP->getPointerOperand();
- GEP = dyn_cast<GEPOperator>(V);
- }
- return V;
- };
-
- auto VisitValueCB = [&](Value &V, const Instruction *,
- AAMemoryLocation::StateType &T,
- bool Stripped) -> bool {
+ /// Return the kind(s) of location that may be accessed by \p I.
+ AAMemoryLocation::MemoryLocationsKind
+ categorizeAccessedLocations(Attributor &A, Instruction &I, bool &Changed);
+
+ /// Return the access kind as determined by \p I.
+ AccessKind getAccessKindFromInst(const Instruction *I) {
+ AccessKind AK = READ_WRITE;
+ if (I) {
+ AK = I->mayReadFromMemory() ? READ : NONE;
+ AK = AccessKind(AK | (I->mayWriteToMemory() ? WRITE : NONE));
+ }
+ return AK;
+ }
+
+ /// Update the state \p State and the AccessKind2Accesses given that \p I is
+ /// an access of kind \p AK to a \p MLK memory location with the access
+ /// pointer \p Ptr.
+ void updateStateAndAccessesMap(AAMemoryLocation::StateType &State,
+ MemoryLocationsKind MLK, const Instruction *I,
+ const Value *Ptr, bool &Changed,
+ AccessKind AK = READ_WRITE) {
+
+ assert(isPowerOf2_32(MLK) && "Expected a single location set!");
+ auto *&Accesses = AccessKind2Accesses[llvm::Log2_32(MLK)];
+ if (!Accesses)
+ Accesses = new (Allocator) AccessSet();
+ Changed |= Accesses->insert(AccessInfo{I, Ptr, AK}).second;
+ State.removeAssumedBits(MLK);
+ }
+
+ /// Determine the underlying location kinds for \p Ptr, e.g., globals or
+ /// arguments, and update the state and access map accordingly.
+ void categorizePtrValue(Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed);
+
+ /// Used to allocate access sets.
+ BumpPtrAllocator &Allocator;
+
+ /// The set of IR attributes AAMemoryLocation deals with.
+ static const Attribute::AttrKind AttrKinds[4];
+};
+
+const Attribute::AttrKind AAMemoryLocationImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::InaccessibleMemOnly, Attribute::ArgMemOnly,
+ Attribute::InaccessibleMemOrArgMemOnly};
+
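
To make the bit bookkeeping above easier to follow, here is a minimal standalone sketch of the encoding it relies on: each "single" location kind is a power-of-two NO_* bit, so the log2 of that bit doubles as a dense index into a per-kind access table (the role AccessKind2Accesses plays via llvm::CTLog2/Log2_32). The enum values, helper names, and the integer access ids below are illustrative assumptions for this sketch, not LLVM code.

// Standalone model of the NO_* bit encoding (illustrative, not LLVM code).
#include <cassert>
#include <cstdint>
#include <vector>

enum MemLoc : uint32_t {
  NO_LOCAL_MEM = 1u << 0,
  NO_CONST_MEM = 1u << 1,
  NO_ARGUMENT_MEM = 1u << 2,
  NO_UNKNOWN_MEM = 1u << 3,
  NO_LOCATIONS = (1u << 4) - 1, // all "not accessed" bits set
};

// Bit position of a single set bit; stands in for llvm::Log2_32.
static unsigned log2u(uint32_t X) {
  unsigned Idx = 0;
  while (X >>= 1)
    ++Idx;
  return Idx;
}

int main() {
  // One access list per single location kind, indexed by bit position.
  std::vector<std::vector<int>> AccessesPerKind(4);
  uint32_t AssumedNotAccessed = NO_LOCATIONS;

  // Record an access (id 42) to argument memory: file it under the dense
  // index derived from the bit and clear the "not accessed" bit.
  uint32_t MLK = NO_ARGUMENT_MEM;
  assert((MLK & (MLK - 1)) == 0 && "expected a single location bit");
  AccessesPerKind[log2u(MLK)].push_back(42);
  AssumedNotAccessed &= ~MLK;

  assert(AssumedNotAccessed == (NO_LOCAL_MEM | NO_CONST_MEM | NO_UNKNOWN_MEM));
  return 0;
}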
+void AAMemoryLocationImpl::categorizePtrValue(
+ Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize pointer locations for "
+ << Ptr << " ["
+ << getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
+
+ auto StripGEPCB = [](Value *V) -> Value * {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ while (GEP) {
+ V = GEP->getPointerOperand();
+ GEP = dyn_cast<GEPOperator>(V);
+ }
+ return V;
+ };
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAMemoryLocation::StateType &T,
+ bool Stripped) -> bool {
// TODO: recognize the TBAA used for constant accesses.
- MemoryLocationsKind MLK = NO_LOCATIONS;
- assert(!isa<GEPOperator>(V) && "GEPs should have been stripped.");
- if (isa<UndefValue>(V))
- return true;
- if (auto *Arg = dyn_cast<Argument>(&V)) {
- if (Arg->hasByValAttr())
- MLK = NO_LOCAL_MEM;
- else
- MLK = NO_ARGUMENT_MEM;
- } else if (auto *GV = dyn_cast<GlobalValue>(&V)) {
+ MemoryLocationsKind MLK = NO_LOCATIONS;
+ assert(!isa<GEPOperator>(V) && "GEPs should have been stripped.");
+ if (isa<UndefValue>(V))
+ return true;
+ if (auto *Arg = dyn_cast<Argument>(&V)) {
+ if (Arg->hasByValAttr())
+ MLK = NO_LOCAL_MEM;
+ else
+ MLK = NO_ARGUMENT_MEM;
+ } else if (auto *GV = dyn_cast<GlobalValue>(&V)) {
// Reading constant memory is not treated as a read "effect" by the
// function attr pass, so we won't either. Constants defined by TBAA are
// similar. (We know we do not write it because it is constant.)
@@ -6647,52 +6647,52 @@ void AAMemoryLocationImpl::categorizePtrValue(
if (GVar->isConstant())
return true;
- if (GV->hasLocalLinkage())
- MLK = NO_GLOBAL_INTERNAL_MEM;
- else
- MLK = NO_GLOBAL_EXTERNAL_MEM;
- } else if (isa<ConstantPointerNull>(V) &&
- !NullPointerIsDefined(getAssociatedFunction(),
- V.getType()->getPointerAddressSpace())) {
- return true;
- } else if (isa<AllocaInst>(V)) {
- MLK = NO_LOCAL_MEM;
- } else if (const auto *CB = dyn_cast<CallBase>(&V)) {
- const auto &NoAliasAA =
- A.getAAFor<AANoAlias>(*this, IRPosition::callsite_returned(*CB));
- if (NoAliasAA.isAssumedNoAlias())
- MLK = NO_MALLOCED_MEM;
- else
- MLK = NO_UNKOWN_MEM;
- } else {
- MLK = NO_UNKOWN_MEM;
- }
-
- assert(MLK != NO_LOCATIONS && "No location specified!");
- updateStateAndAccessesMap(T, MLK, &I, &V, Changed,
- getAccessKindFromInst(&I));
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Ptr value cannot be categorized: "
- << V << " -> " << getMemoryLocationsAsStr(T.getAssumed())
- << "\n");
- return true;
- };
-
- if (!genericValueTraversal<AAMemoryLocation, AAMemoryLocation::StateType>(
- A, IRPosition::value(Ptr), *this, State, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ true,
- /* MaxValues */ 32, StripGEPCB)) {
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n");
- updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- } else {
- LLVM_DEBUG(
- dbgs()
- << "[AAMemoryLocation] Accessed locations with pointer locations: "
- << getMemoryLocationsAsStr(State.getAssumed()) << "\n");
- }
-}
-
+ if (GV->hasLocalLinkage())
+ MLK = NO_GLOBAL_INTERNAL_MEM;
+ else
+ MLK = NO_GLOBAL_EXTERNAL_MEM;
+ } else if (isa<ConstantPointerNull>(V) &&
+ !NullPointerIsDefined(getAssociatedFunction(),
+ V.getType()->getPointerAddressSpace())) {
+ return true;
+ } else if (isa<AllocaInst>(V)) {
+ MLK = NO_LOCAL_MEM;
+ } else if (const auto *CB = dyn_cast<CallBase>(&V)) {
+ const auto &NoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::callsite_returned(*CB));
+ if (NoAliasAA.isAssumedNoAlias())
+ MLK = NO_MALLOCED_MEM;
+ else
+ MLK = NO_UNKOWN_MEM;
+ } else {
+ MLK = NO_UNKOWN_MEM;
+ }
+
+ assert(MLK != NO_LOCATIONS && "No location specified!");
+ updateStateAndAccessesMap(T, MLK, &I, &V, Changed,
+ getAccessKindFromInst(&I));
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Ptr value cannot be categorized: "
+ << V << " -> " << getMemoryLocationsAsStr(T.getAssumed())
+ << "\n");
+ return true;
+ };
+
+ if (!genericValueTraversal<AAMemoryLocation, AAMemoryLocation::StateType>(
+ A, IRPosition::value(Ptr), *this, State, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ true,
+ /* MaxValues */ 32, StripGEPCB)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n");
+ updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ } else {
+ LLVM_DEBUG(
+ dbgs()
+ << "[AAMemoryLocation] Accessed locations with pointer locations: "
+ << getMemoryLocationsAsStr(State.getAssumed()) << "\n");
+ }
+}
+
void AAMemoryLocationImpl::categorizeArgumentPointerLocations(
Attributor &A, CallBase &CB, AAMemoryLocation::StateType &AccessedLocs,
bool &Changed) {
@@ -6717,689 +6717,689 @@ void AAMemoryLocationImpl::categorizeArgumentPointerLocations(
}
}
-AAMemoryLocation::MemoryLocationsKind
-AAMemoryLocationImpl::categorizeAccessedLocations(Attributor &A, Instruction &I,
- bool &Changed) {
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize accessed locations for "
- << I << "\n");
-
- AAMemoryLocation::StateType AccessedLocs;
- AccessedLocs.intersectAssumedBits(NO_LOCATIONS);
-
- if (auto *CB = dyn_cast<CallBase>(&I)) {
-
- // First check if we assume any memory access is visible.
- const auto &CBMemLocationAA =
- A.getAAFor<AAMemoryLocation>(*this, IRPosition::callsite_function(*CB));
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize call site: " << I
- << " [" << CBMemLocationAA << "]\n");
-
- if (CBMemLocationAA.isAssumedReadNone())
- return NO_LOCATIONS;
-
- if (CBMemLocationAA.isAssumedInaccessibleMemOnly()) {
- updateStateAndAccessesMap(AccessedLocs, NO_INACCESSIBLE_MEM, &I, nullptr,
- Changed, getAccessKindFromInst(&I));
- return AccessedLocs.getAssumed();
- }
-
- uint32_t CBAssumedNotAccessedLocs =
- CBMemLocationAA.getAssumedNotAccessedLocation();
-
- // Set the argmemonly and global bits as we handle them separately below.
- uint32_t CBAssumedNotAccessedLocsNoArgMem =
- CBAssumedNotAccessedLocs | NO_ARGUMENT_MEM | NO_GLOBAL_MEM;
-
- for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2) {
- if (CBAssumedNotAccessedLocsNoArgMem & CurMLK)
- continue;
- updateStateAndAccessesMap(AccessedLocs, CurMLK, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- }
-
- // Now handle global memory if it might be accessed. This is slightly tricky
- // as NO_GLOBAL_MEM has multiple bits set.
- bool HasGlobalAccesses = ((~CBAssumedNotAccessedLocs) & NO_GLOBAL_MEM);
- if (HasGlobalAccesses) {
- auto AccessPred = [&](const Instruction *, const Value *Ptr,
- AccessKind Kind, MemoryLocationsKind MLK) {
- updateStateAndAccessesMap(AccessedLocs, MLK, &I, Ptr, Changed,
- getAccessKindFromInst(&I));
- return true;
- };
- if (!CBMemLocationAA.checkForAllAccessesToMemoryKind(
- AccessPred, inverseLocation(NO_GLOBAL_MEM, false, false)))
- return AccessedLocs.getWorstState();
- }
-
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Accessed state before argument handling: "
- << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
-
- // Now handle argument memory if it might be accessed.
- bool HasArgAccesses = ((~CBAssumedNotAccessedLocs) & NO_ARGUMENT_MEM);
+AAMemoryLocation::MemoryLocationsKind
+AAMemoryLocationImpl::categorizeAccessedLocations(Attributor &A, Instruction &I,
+ bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize accessed locations for "
+ << I << "\n");
+
+ AAMemoryLocation::StateType AccessedLocs;
+ AccessedLocs.intersectAssumedBits(NO_LOCATIONS);
+
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+
+ // First check if we assume any memory access is visible.
+ const auto &CBMemLocationAA =
+ A.getAAFor<AAMemoryLocation>(*this, IRPosition::callsite_function(*CB));
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize call site: " << I
+ << " [" << CBMemLocationAA << "]\n");
+
+ if (CBMemLocationAA.isAssumedReadNone())
+ return NO_LOCATIONS;
+
+ if (CBMemLocationAA.isAssumedInaccessibleMemOnly()) {
+ updateStateAndAccessesMap(AccessedLocs, NO_INACCESSIBLE_MEM, &I, nullptr,
+ Changed, getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+ }
+
+ uint32_t CBAssumedNotAccessedLocs =
+ CBMemLocationAA.getAssumedNotAccessedLocation();
+
+ // Set the argmemonly and global bits as we handle them separately below.
+ uint32_t CBAssumedNotAccessedLocsNoArgMem =
+ CBAssumedNotAccessedLocs | NO_ARGUMENT_MEM | NO_GLOBAL_MEM;
+
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2) {
+ if (CBAssumedNotAccessedLocsNoArgMem & CurMLK)
+ continue;
+ updateStateAndAccessesMap(AccessedLocs, CurMLK, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ }
+
+ // Now handle global memory if it might be accessed. This is slightly tricky
+ // as NO_GLOBAL_MEM has multiple bits set.
+ bool HasGlobalAccesses = ((~CBAssumedNotAccessedLocs) & NO_GLOBAL_MEM);
+ if (HasGlobalAccesses) {
+ auto AccessPred = [&](const Instruction *, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(AccessedLocs, MLK, &I, Ptr, Changed,
+ getAccessKindFromInst(&I));
+ return true;
+ };
+ if (!CBMemLocationAA.checkForAllAccessesToMemoryKind(
+ AccessPred, inverseLocation(NO_GLOBAL_MEM, false, false)))
+ return AccessedLocs.getWorstState();
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state before argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ // Now handle argument memory if it might be accessed.
+ bool HasArgAccesses = ((~CBAssumedNotAccessedLocs) & NO_ARGUMENT_MEM);
if (HasArgAccesses)
categorizeArgumentPointerLocations(A, *CB, AccessedLocs, Changed);
-
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Accessed state after argument handling: "
- << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
-
- return AccessedLocs.getAssumed();
- }
-
- if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ true)) {
- LLVM_DEBUG(
- dbgs() << "[AAMemoryLocation] Categorize memory access with pointer: "
- << I << " [" << *Ptr << "]\n");
- categorizePtrValue(A, I, *Ptr, AccessedLocs, Changed);
- return AccessedLocs.getAssumed();
- }
-
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Failed to categorize instruction: "
- << I << "\n");
- updateStateAndAccessesMap(AccessedLocs, NO_UNKOWN_MEM, &I, nullptr, Changed,
- getAccessKindFromInst(&I));
- return AccessedLocs.getAssumed();
-}
-
-/// An AA to represent the memory behavior function attributes.
-struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
- AAMemoryLocationFunction(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocationImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override {
-
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, getIRPosition(), /* TrackDependence */ false);
- if (MemBehaviorAA.isAssumedReadNone()) {
- if (MemBehaviorAA.isKnownReadNone())
- return indicateOptimisticFixpoint();
- assert(isAssumedReadNone() &&
- "AAMemoryLocation was not read-none but AAMemoryBehavior was!");
- A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
- return ChangeStatus::UNCHANGED;
- }
-
- // The current assumed state used to determine a change.
- auto AssumedState = getAssumed();
- bool Changed = false;
-
- auto CheckRWInst = [&](Instruction &I) {
- MemoryLocationsKind MLK = categorizeAccessedLocations(A, I, Changed);
- LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
- << ": " << getMemoryLocationsAsStr(MLK) << "\n");
- removeAssumedBits(inverseLocation(MLK, false, false));
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state after argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ return AccessedLocs.getAssumed();
+ }
+
+ if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ true)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Categorize memory access with pointer: "
+ << I << " [" << *Ptr << "]\n");
+ categorizePtrValue(A, I, *Ptr, AccessedLocs, Changed);
+ return AccessedLocs.getAssumed();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Failed to categorize instruction: "
+ << I << "\n");
+ updateStateAndAccessesMap(AccessedLocs, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+}
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
+ AAMemoryLocationFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, getIRPosition(), /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ if (MemBehaviorAA.isKnownReadNone())
+ return indicateOptimisticFixpoint();
+ assert(isAssumedReadNone() &&
+ "AAMemoryLocation was not read-none but AAMemoryBehavior was!");
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+ bool Changed = false;
+
+ auto CheckRWInst = [&](Instruction &I) {
+ MemoryLocationsKind MLK = categorizeAccessedLocations(A, I, Changed);
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
+ << ": " << getMemoryLocationsAsStr(MLK) << "\n");
+ removeAssumedBits(inverseLocation(MLK, false, false));
// Stop once only the valid bit is set in the *not assumed location*, thus
// once we don't actually exclude any memory locations in the state.
return getAssumedNotAccessedLocation() != VALID_STATE;
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
- return indicatePessimisticFixpoint();
-
- Changed |= AssumedState != getAssumed();
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FN_ATTR(readnone)
- else if (isAssumedArgMemOnly())
- STATS_DECLTRACK_FN_ATTR(argmemonly)
- else if (isAssumedInaccessibleMemOnly())
- STATS_DECLTRACK_FN_ATTR(inaccessiblememonly)
- else if (isAssumedInaccessibleOrArgMemOnly())
- STATS_DECLTRACK_FN_ATTR(inaccessiblememorargmemonly)
- }
-};
-
-/// AAMemoryLocation attribute for call sites.
-struct AAMemoryLocationCallSite final : AAMemoryLocationImpl {
- AAMemoryLocationCallSite(const IRPosition &IRP, Attributor &A)
- : AAMemoryLocationImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryLocationImpl::initialize(A);
- Function *F = getAssociatedFunction();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ Changed |= AssumedState != getAssumed();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(argmemonly)
+ else if (isAssumedInaccessibleMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememonly)
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememorargmemonly)
+ }
+};
+
+/// AAMemoryLocation attribute for call sites.
+struct AAMemoryLocationCallSite final : AAMemoryLocationImpl {
+ AAMemoryLocationCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryLocationImpl::initialize(A);
+ Function *F = getAssociatedFunction();
if (!F || F->isDeclaration())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAMemoryLocation>(*this, FnPos);
- bool Changed = false;
- auto AccessPred = [&](const Instruction *I, const Value *Ptr,
- AccessKind Kind, MemoryLocationsKind MLK) {
- updateStateAndAccessesMap(getState(), MLK, I, Ptr, Changed,
- getAccessKindFromInst(I));
- return true;
- };
- if (!FnAA.checkForAllAccessesToMemoryKind(AccessPred, ALL_LOCATIONS))
- return indicatePessimisticFixpoint();
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CS_ATTR(readnone)
- }
-};
-
-/// ------------------ Value Constant Range Attribute -------------------------
-
-struct AAValueConstantRangeImpl : AAValueConstantRange {
- using StateType = IntegerRangeState;
- AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRange(IRP, A) {}
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- std::string Str;
- llvm::raw_string_ostream OS(Str);
- OS << "range(" << getBitWidth() << ")<";
- getKnown().print(OS);
- OS << " / ";
- getAssumed().print(OS);
- OS << ">";
- return OS.str();
- }
-
- /// Helper function to get a SCEV expr for the associated value at program
- /// point \p I.
- const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return nullptr;
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
- *getAnchorScope());
-
- if (!SE || !LI)
- return nullptr;
-
- const SCEV *S = SE->getSCEV(&getAssociatedValue());
- if (!I)
- return S;
-
- return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
- }
-
- /// Helper function to get a range from SCEV for the associated value at
- /// program point \p I.
- ConstantRange getConstantRangeFromSCEV(Attributor &A,
- const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- const SCEV *S = getSCEV(A, I);
- if (!SE || !S)
- return getWorstState(getBitWidth());
-
- return SE->getUnsignedRange(S);
- }
-
- /// Helper function to get a range from LVI for the associated value at
- /// program point \p I.
- ConstantRange
- getConstantRangeFromLVI(Attributor &A,
- const Instruction *CtxI = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- LazyValueInfo *LVI =
- A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
- *getAnchorScope());
-
- if (!LVI || !CtxI)
- return getWorstState(getBitWidth());
- return LVI->getConstantRange(&getAssociatedValue(),
- const_cast<Instruction *>(CtxI));
- }
-
- /// See AAValueConstantRange::getKnownConstantRange(..).
- ConstantRange
- getKnownConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- if (!CtxI || CtxI == getCtxI())
- return getKnown();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
- }
-
- /// See AAValueConstantRange::getAssumedConstantRange(..).
- ConstantRange
- getAssumedConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- // TODO: Make SCEV use Attributor assumption.
- // We may be able to bound a variable range via assumptions in
- // Attributor. E.g., if x is assumed to be in [1, 3] and y is known to
- // evolve to x^2 + x, then we can say that y is in [2, 12].
-
- if (!CtxI || CtxI == getCtxI())
- return getAssumed();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
- }
-
- /// See AbstractAttribute::initialize(..).
- void initialize(Attributor &A) override {
- // Intersect a range given by SCEV.
- intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
-
- // Intersect a range given by LVI.
- intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
- }
-
- /// Helper function to create MDNode for range metadata.
- static MDNode *
- getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
- const ConstantRange &AssumedConstantRange) {
- Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getLower())),
- ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getUpper()))};
- return MDNode::get(Ctx, LowAndHigh);
- }
-
- /// Return true if \p Assumed is included in \p KnownRanges.
- static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
-
- if (Assumed.isFullSet())
- return false;
-
- if (!KnownRanges)
- return true;
-
- // If multiple ranges are annotated in IR, we give up annotating the assumed
- // range for now.
-
- // TODO: If there exists a known range which contains the assumed range, we
- // can say the assumed range is better.
- if (KnownRanges->getNumOperands() > 2)
- return false;
-
- ConstantInt *Lower =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
- ConstantInt *Upper =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
-
- ConstantRange Known(Lower->getValue(), Upper->getValue());
- return Known.contains(Assumed) && Known != Assumed;
- }
-
- /// Helper function to set range metadata.
- static bool
- setRangeMetadataIfisBetterRange(Instruction *I,
- const ConstantRange &AssumedConstantRange) {
- auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
- if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
- if (!AssumedConstantRange.isEmptySet()) {
- I->setMetadata(LLVMContext::MD_range,
- getMDNodeForConstantRange(I->getType(), I->getContext(),
- AssumedConstantRange));
- return true;
- }
- }
- return false;
- }
-
- /// See AbstractAttribute::manifest()
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
- assert(!AssumedConstantRange.isFullSet() && "Invalid state");
-
- auto &V = getAssociatedValue();
- if (!AssumedConstantRange.isEmptySet() &&
- !AssumedConstantRange.isSingleElement()) {
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryLocation>(*this, FnPos);
+ bool Changed = false;
+ auto AccessPred = [&](const Instruction *I, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(getState(), MLK, I, Ptr, Changed,
+ getAccessKindFromInst(I));
+ return true;
+ };
+ if (!FnAA.checkForAllAccessesToMemoryKind(AccessPred, ALL_LOCATIONS))
+ return indicatePessimisticFixpoint();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ }
+};
+
+/// ------------------ Value Constant Range Attribute -------------------------
+
+struct AAValueConstantRangeImpl : AAValueConstantRange {
+ using StateType = IntegerRangeState;
+ AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRange(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ std::string Str;
+ llvm::raw_string_ostream OS(Str);
+ OS << "range(" << getBitWidth() << ")<";
+ getKnown().print(OS);
+ OS << " / ";
+ getAssumed().print(OS);
+ OS << ">";
+ return OS.str();
+ }
+
+ /// Helper function to get a SCEV expr for the associated value at program
+ /// point \p I.
+ const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return nullptr;
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
+ *getAnchorScope());
+
+ if (!SE || !LI)
+ return nullptr;
+
+ const SCEV *S = SE->getSCEV(&getAssociatedValue());
+ if (!I)
+ return S;
+
+ return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
+ }
+
+ /// Helper function to get a range from SCEV for the associated value at
+ /// program point \p I.
+ ConstantRange getConstantRangeFromSCEV(Attributor &A,
+ const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ const SCEV *S = getSCEV(A, I);
+ if (!SE || !S)
+ return getWorstState(getBitWidth());
+
+ return SE->getUnsignedRange(S);
+ }
+
+ /// Helper function to get a range from LVI for the associated value at
+ /// program point \p I.
+ ConstantRange
+ getConstantRangeFromLVI(Attributor &A,
+ const Instruction *CtxI = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ LazyValueInfo *LVI =
+ A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
+ *getAnchorScope());
+
+ if (!LVI || !CtxI)
+ return getWorstState(getBitWidth());
+ return LVI->getConstantRange(&getAssociatedValue(),
+ const_cast<Instruction *>(CtxI));
+ }
+
+ /// See AAValueConstantRange::getKnownConstantRange(..).
+ ConstantRange
+ getKnownConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+ if (!CtxI || CtxI == getCtxI())
+ return getKnown();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AAValueConstantRange::getAssumedConstantRange(..).
+ ConstantRange
+ getAssumedConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+ // TODO: Make SCEV use Attributor assumption.
+ // We may be able to bound a variable range via assumptions in
+ // Attributor. E.g., if x is assumed to be in [1, 3] and y is known to
+ // evolve to x^2 + x, then we can say that y is in [2, 12].
+
+ if (!CtxI || CtxI == getCtxI())
+ return getAssumed();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ // Intersect a range given by SCEV.
+ intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
+
+ // Intersect a range given by LVI.
+ intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
+ }
+
+ /// Helper function to create MDNode for range metadata.
+ static MDNode *
+ getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
+ const ConstantRange &AssumedConstantRange) {
+ Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getLower())),
+ ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getUpper()))};
+ return MDNode::get(Ctx, LowAndHigh);
+ }
+
+ /// Return true if \p Assumed is included in \p KnownRanges.
+ static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
+
+ if (Assumed.isFullSet())
+ return false;
+
+ if (!KnownRanges)
+ return true;
+
+ // If multiple ranges are annotated in IR, we give up annotating the assumed
+ // range for now.
+
+ // TODO: If there exists a known range which contains the assumed range, we
+ // can say the assumed range is better.
+ if (KnownRanges->getNumOperands() > 2)
+ return false;
+
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
+ ConstantInt *Upper =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
+
+ ConstantRange Known(Lower->getValue(), Upper->getValue());
+ return Known.contains(Assumed) && Known != Assumed;
+ }
+
+ /// Helper function to set range metadata.
+ static bool
+ setRangeMetadataIfisBetterRange(Instruction *I,
+ const ConstantRange &AssumedConstantRange) {
+ auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
+ if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
+ if (!AssumedConstantRange.isEmptySet()) {
+ I->setMetadata(LLVMContext::MD_range,
+ getMDNodeForConstantRange(I->getType(), I->getContext(),
+ AssumedConstantRange));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// See AbstractAttribute::manifest()
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
+ assert(!AssumedConstantRange.isFullSet() && "Invalid state");
+
+ auto &V = getAssociatedValue();
+ if (!AssumedConstantRange.isEmptySet() &&
+ !AssumedConstantRange.isSingleElement()) {
if (Instruction *I = dyn_cast<Instruction>(&V)) {
assert(I == getCtxI() && "Should not annotate an instruction which is "
"not the context instruction");
- if (isa<CallInst>(I) || isa<LoadInst>(I))
- if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
- Changed = ChangeStatus::CHANGED;
+ if (isa<CallInst>(I) || isa<LoadInst>(I))
+ if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
+ Changed = ChangeStatus::CHANGED;
}
- }
-
- return Changed;
- }
-};
-
-struct AAValueConstantRangeArgument final
- : AAArgumentFromCallSiteArguments<
- AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState> {
- using Base = AAArgumentFromCallSiteArguments<
- AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState>;
- AAValueConstantRangeArgument(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(..).
- void initialize(Attributor &A) override {
- if (!getAnchorScope() || getAnchorScope()->isDeclaration()) {
- indicatePessimisticFixpoint();
- } else {
- Base::initialize(A);
- }
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeReturned
- : AAReturnedFromReturnedValues<AAValueConstantRange,
- AAValueConstantRangeImpl> {
- using Base = AAReturnedFromReturnedValues<AAValueConstantRange,
- AAValueConstantRangeImpl>;
- AAValueConstantRangeReturned(const IRPosition &IRP, Attributor &A)
- : Base(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
- AAValueConstantRangeFloating(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeImpl(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAValueConstantRangeImpl::initialize(A);
- Value &V = getAssociatedValue();
-
- if (auto *C = dyn_cast<ConstantInt>(&V)) {
- unionAssumed(ConstantRange(C->getValue()));
- indicateOptimisticFixpoint();
- return;
- }
-
- if (isa<UndefValue>(&V)) {
- // Collapse the undef state to 0.
- unionAssumed(ConstantRange(APInt(getBitWidth(), 0)));
- indicateOptimisticFixpoint();
- return;
- }
-
+ }
+
+ return Changed;
+ }
+};
+
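
As a rough illustration of the isBetterRange/setRangeMetadataIfisBetterRange logic above, the following standalone sketch models the decision with a simplified half-open interval type in place of llvm::ConstantRange. The Interval type, its fields, and the explicit full-set flag are assumptions made for this example, not the real API.

// Standalone model of the "better range" test for !range metadata
// (illustrative, not LLVM code).
#include <cassert>
#include <cstdint>

struct Interval { // half-open [Lo, Hi), assumed non-wrapping for simplicity
  int64_t Lo, Hi;
  bool contains(const Interval &O) const { return Lo <= O.Lo && O.Hi <= Hi; }
  bool operator==(const Interval &O) const { return Lo == O.Lo && Hi == O.Hi; }
};

// Mirrors the shape of isBetterRange: a full-set assumed range is never
// better; with no existing metadata anything non-trivial is better; otherwise
// the assumed range must be strictly tighter than the known one.
static bool isBetterRange(const Interval &Assumed, const Interval *Known,
                          bool AssumedIsFullSet) {
  if (AssumedIsFullSet)
    return false;
  if (!Known)
    return true;
  return Known->contains(Assumed) && !(*Known == Assumed);
}

int main() {
  Interval Known{0, 100}, Assumed{10, 20};
  assert(isBetterRange(Assumed, &Known, /*AssumedIsFullSet=*/false));
  assert(!isBetterRange(Known, &Known, /*AssumedIsFullSet=*/false));
  assert(isBetterRange(Assumed, nullptr, /*AssumedIsFullSet=*/false));
  return 0;
}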
+struct AAValueConstantRangeArgument final
+ : AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState> {
+ using Base = AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState>;
+ AAValueConstantRangeArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration()) {
+ indicatePessimisticFixpoint();
+ } else {
+ Base::initialize(A);
+ }
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeReturned
+ : AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ using Base = AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl>;
+ AAValueConstantRangeReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
+ AAValueConstantRangeFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAValueConstantRangeImpl::initialize(A);
+ Value &V = getAssociatedValue();
+
+ if (auto *C = dyn_cast<ConstantInt>(&V)) {
+ unionAssumed(ConstantRange(C->getValue()));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ if (isa<UndefValue>(&V)) {
+ // Collapse the undef state to 0.
+ unionAssumed(ConstantRange(APInt(getBitWidth(), 0)));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
if (isa<CallBase>(&V))
return;
- if (isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<CastInst>(&V))
- return;
- // If it is a load instruction with range metadata, use it.
- if (LoadInst *LI = dyn_cast<LoadInst>(&V))
- if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
- return;
- }
-
- // We can work with PHI and select instructions as we traverse their operands
- // during update.
- if (isa<SelectInst>(V) || isa<PHINode>(V))
- return;
-
- // Otherwise we give up.
- indicatePessimisticFixpoint();
-
- LLVM_DEBUG(dbgs() << "[AAValueConstantRange] We give up: "
- << getAssociatedValue() << "\n");
- }
-
- bool calculateBinaryOperator(
- Attributor &A, BinaryOperator *BinOp, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
- // TODO: Allow non integers as well.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return false;
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- QuerriedAAs.push_back(&LHSAA);
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
-
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
- QuerriedAAs.push_back(&RHSAA);
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
-
- T.unionAssumed(AssumedRange);
-
- // TODO: Track a known state too.
-
- return T.isValidState();
- }
-
- bool calculateCastInst(
- Attributor &A, CastInst *CastI, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- assert(CastI->getNumOperands() == 1 && "Expected cast to be unary!");
- // TODO: Allow non integers as well.
- Value &OpV = *CastI->getOperand(0);
- if (!OpV.getType()->isIntegerTy())
- return false;
-
- auto &OpAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(OpV));
- QuerriedAAs.push_back(&OpAA);
- T.unionAssumed(
- OpAA.getAssumed().castOp(CastI->getOpcode(), getState().getBitWidth()));
- return T.isValidState();
- }
-
- bool
- calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
- const Instruction *CtxI,
- SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
- Value *LHS = CmpI->getOperand(0);
- Value *RHS = CmpI->getOperand(1);
- // TODO: Allow non integers as well.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return false;
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- QuerriedAAs.push_back(&LHSAA);
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
- QuerriedAAs.push_back(&RHSAA);
-
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- // If one of them is empty set, we can't decide.
- if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
- return true;
-
- bool MustTrue = false, MustFalse = false;
-
- auto AllowedRegion =
- ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
-
- auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
- CmpI->getPredicate(), RHSAARange);
-
- if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
- MustFalse = true;
-
- if (SatisfyingRegion.contains(LHSAARange))
- MustTrue = true;
-
- assert((!MustTrue || !MustFalse) &&
- "Either MustTrue or MustFalse should be false!");
-
- if (MustTrue)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
- else if (MustFalse)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
- else
- T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
-
- LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
- << " " << RHSAA << "\n");
-
- // TODO: Track a known state too.
- return T.isValidState();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
- IntegerRangeState &T, bool Stripped) -> bool {
- Instruction *I = dyn_cast<Instruction>(&V);
- if (!I || isa<CallBase>(I)) {
-
- // If the value is not an instruction, we query the AA from the Attributor.
- const auto &AA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
-
- // The clamp operator is not used here so that the program point CtxI is utilized.
- T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
-
- return T.isValidState();
- }
-
- SmallVector<const AAValueConstantRange *, 4> QuerriedAAs;
- if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
- if (!calculateBinaryOperator(A, BinOp, T, CtxI, QuerriedAAs))
- return false;
- } else if (auto *CmpI = dyn_cast<CmpInst>(I)) {
- if (!calculateCmpInst(A, CmpI, T, CtxI, QuerriedAAs))
- return false;
- } else if (auto *CastI = dyn_cast<CastInst>(I)) {
- if (!calculateCastInst(A, CastI, T, CtxI, QuerriedAAs))
- return false;
- } else {
- // Give up with other instructions.
- // TODO: Add other instructions
-
- T.indicatePessimisticFixpoint();
- return false;
- }
-
- // Catch circular reasoning in a pessimistic way for now.
- // TODO: Check how the range evolves and if we stripped anything, see also
- // AADereferenceable or AAAlign for similar situations.
- for (const AAValueConstantRange *QueriedAA : QuerriedAAs) {
- if (QueriedAA != this)
- continue;
- // If we are in a steady state we do not need to worry.
- if (T.getAssumed() == getState().getAssumed())
- continue;
- T.indicatePessimisticFixpoint();
- }
-
- return T.isValidState();
- };
-
- IntegerRangeState T(getBitWidth());
-
- if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
- A, getIRPosition(), *this, T, VisitValueCB, getCtxI(),
- /* UseValueSimplify */ false))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
- AAValueConstantRangeFunction(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeImpl(IRP, A) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
- "not be called");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
- AAValueConstantRangeCallSite(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeFunction(IRP, A) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSiteReturned
- : AACallSiteReturnedFromReturned<AAValueConstantRange,
- AAValueConstantRangeImpl> {
- AAValueConstantRangeCallSiteReturned(const IRPosition &IRP, Attributor &A)
- : AACallSiteReturnedFromReturned<AAValueConstantRange,
- AAValueConstantRangeImpl>(IRP, A) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // If it is a call instruction with range metadata, use the metadata.
- if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
- if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
-
- AAValueConstantRangeImpl::initialize(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_range)
- }
-};
-struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
- AAValueConstantRangeCallSiteArgument(const IRPosition &IRP, Attributor &A)
- : AAValueConstantRangeFloating(IRP, A) {}
-
+ if (isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<CastInst>(&V))
+ return;
+ // If it is a load instruction with range metadata, use it.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&V))
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+ return;
+ }
+
+ // We can work with PHI and select instructions as we traverse their operands
+ // during update.
+ if (isa<SelectInst>(V) || isa<PHINode>(V))
+ return;
+
+ // Otherwise we give up.
+ indicatePessimisticFixpoint();
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] We give up: "
+ << getAssociatedValue() << "\n");
+ }
+
+ bool calculateBinaryOperator(
+ Attributor &A, BinaryOperator *BinOp, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ // TODO: Allow non integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
+
+ T.unionAssumed(AssumedRange);
+
+ // TODO: Track a known state too.
+
+ return T.isValidState();
+ }
+
+ bool calculateCastInst(
+ Attributor &A, CastInst *CastI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ assert(CastI->getNumOperands() == 1 && "Expected cast to be unary!");
+ // TODO: Allow non integers as well.
+ Value &OpV = *CastI->getOperand(0);
+ if (!OpV.getType()->isIntegerTy())
+ return false;
+
+ auto &OpAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(OpV));
+ QuerriedAAs.push_back(&OpAA);
+ T.unionAssumed(
+ OpAA.getAssumed().castOp(CastI->getOpcode(), getState().getBitWidth()));
+ return T.isValidState();
+ }
+
+ bool
+ calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = CmpI->getOperand(0);
+ Value *RHS = CmpI->getOperand(1);
+ // TODO: Allow non integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ // If one of them is empty set, we can't decide.
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
+ return true;
+
+ bool MustTrue = false, MustFalse = false;
+
+ auto AllowedRegion =
+ ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
+
+ auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
+ CmpI->getPredicate(), RHSAARange);
+
+ if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
+ MustFalse = true;
+
+ if (SatisfyingRegion.contains(LHSAARange))
+ MustTrue = true;
+
+ assert((!MustTrue || !MustFalse) &&
+ "Either MustTrue or MustFalse should be false!");
+
+ if (MustTrue)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
+ else if (MustFalse)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
+ else
+ T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
+ << " " << RHSAA << "\n");
+
+ // TODO: Track a known state too.
+ return T.isValidState();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ IntegerRangeState &T, bool Stripped) -> bool {
+ Instruction *I = dyn_cast<Instruction>(&V);
+ if (!I || isa<CallBase>(I)) {
+
+ // If the value is not an instruction, we query the AA from the Attributor.
+ const auto &AA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
+
+ // The clamp operator is not used here so that the program point CtxI is utilized.
+ T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
+
+ return T.isValidState();
+ }
+
+ SmallVector<const AAValueConstantRange *, 4> QuerriedAAs;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
+ if (!calculateBinaryOperator(A, BinOp, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CmpI = dyn_cast<CmpInst>(I)) {
+ if (!calculateCmpInst(A, CmpI, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CastI = dyn_cast<CastInst>(I)) {
+ if (!calculateCastInst(A, CastI, T, CtxI, QuerriedAAs))
+ return false;
+ } else {
+        // Give up on other instructions.
+        // TODO: Handle other instruction kinds.
+
+ T.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ // Catch circular reasoning in a pessimistic way for now.
+ // TODO: Check how the range evolves and if we stripped anything, see also
+ // AADereferenceable or AAAlign for similar situations.
+ for (const AAValueConstantRange *QueriedAA : QuerriedAAs) {
+ if (QueriedAA != this)
+ continue;
+        // If we are in a steady state we do not need to worry.
+ if (T.getAssumed() == getState().getAssumed())
+ continue;
+ T.indicatePessimisticFixpoint();
+ }
+
+ return T.isValidState();
+ };
+
+ IntegerRangeState T(getBitWidth());
+
+ if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
+ AAValueConstantRangeFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+  /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
+ "not be called");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
+ AAValueConstantRangeCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFunction(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSiteReturned
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ AAValueConstantRangeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // If it is a load instruction with range metadata, use the metadata.
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
+ if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+
+ AAValueConstantRangeImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_range)
+ }
+};
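For reference, a standalone sketch (not part of this commit) of how !range metadata is turned into a ConstantRange via getConstantRangeFromMetadata, the helper initialize() above uses; the interval is arbitrary:

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  MDBuilder MDB(Ctx);
  // !range metadata encoding the half-open interval [0, 100).
  MDNode *RangeMD = MDB.createRange(APInt(32, 0), APInt(32, 100));
  // The same helper the initialize() above uses to seed the known range.
  ConstantRange CR = getConstantRangeFromMetadata(*RangeMD);
  CR.print(errs());
  errs() << "\n";
  return 0;
}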
+struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
+ AAValueConstantRangeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFloating(IRP, A) {}
+
/// See AbstractAttribute::manifest()
ChangeStatus manifest(Attributor &A) override {
return ChangeStatus::UNCHANGED;
}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_range)
- }
-};
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_range)
+ }
+};
/// ------------------ Potential Values Attribute -------------------------
@@ -8025,157 +8025,157 @@ struct AANoUndefCallSiteReturned final
/// See AbstractAttribute::trackStatistics()
void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noundef) }
};
-} // namespace
-
-const char AAReturnedValues::ID = 0;
-const char AANoUnwind::ID = 0;
-const char AANoSync::ID = 0;
-const char AANoFree::ID = 0;
-const char AANonNull::ID = 0;
-const char AANoRecurse::ID = 0;
-const char AAWillReturn::ID = 0;
-const char AAUndefinedBehavior::ID = 0;
-const char AANoAlias::ID = 0;
-const char AAReachability::ID = 0;
-const char AANoReturn::ID = 0;
-const char AAIsDead::ID = 0;
-const char AADereferenceable::ID = 0;
-const char AAAlign::ID = 0;
-const char AANoCapture::ID = 0;
-const char AAValueSimplify::ID = 0;
-const char AAHeapToStack::ID = 0;
-const char AAPrivatizablePtr::ID = 0;
-const char AAMemoryBehavior::ID = 0;
-const char AAMemoryLocation::ID = 0;
-const char AAValueConstantRange::ID = 0;
+} // namespace
+
+const char AAReturnedValues::ID = 0;
+const char AANoUnwind::ID = 0;
+const char AANoSync::ID = 0;
+const char AANoFree::ID = 0;
+const char AANonNull::ID = 0;
+const char AANoRecurse::ID = 0;
+const char AAWillReturn::ID = 0;
+const char AAUndefinedBehavior::ID = 0;
+const char AANoAlias::ID = 0;
+const char AAReachability::ID = 0;
+const char AANoReturn::ID = 0;
+const char AAIsDead::ID = 0;
+const char AADereferenceable::ID = 0;
+const char AAAlign::ID = 0;
+const char AANoCapture::ID = 0;
+const char AAValueSimplify::ID = 0;
+const char AAHeapToStack::ID = 0;
+const char AAPrivatizablePtr::ID = 0;
+const char AAMemoryBehavior::ID = 0;
+const char AAMemoryLocation::ID = 0;
+const char AAValueConstantRange::ID = 0;
const char AAPotentialValues::ID = 0;
const char AANoUndef::ID = 0;
-
-// Macro magic to create the static generator function for attributes that
-// follow the naming scheme.
-
-#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
- case IRPosition::PK: \
- llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
-
-#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
- case IRPosition::PK: \
- AA = new (A.Allocator) CLASS##SUFFIX(IRP, A); \
- ++NumAAs; \
- break;
-
-#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- } \
- return *AA; \
- }
-
-#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- } \
- return *AA; \
- }
-
-#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
-
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
-
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
+
+// Macro magic to create the static generator function for attributes that
+// follow the naming scheme.
+
+#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
+ case IRPosition::PK: \
+ llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
+
+#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
+ case IRPosition::PK: \
+ AA = new (A.Allocator) CLASS##SUFFIX(IRP, A); \
+ ++NumAAs; \
+ break;
+
+#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
+
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPotentialValues)
CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUndef)
-
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
-
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
-
-CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
-
-#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef SWITCH_PK_CREATE
-#undef SWITCH_PK_INV
+
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
+
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
+
+CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
+
+#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef SWITCH_PK_CREATE
+#undef SWITCH_PK_INV
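For orientation, this is roughly what CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind) expands to, taken in the context of this file; the elided invalid-position cases each expand to the same llvm_unreachable pattern as IRP_INVALID:

// Roughly the expansion of
// CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind). The elided
// kinds (IRP_FLOAT, IRP_ARGUMENT, IRP_RETURNED, IRP_CALL_SITE_RETURNED,
// IRP_CALL_SITE_ARGUMENT) follow the IRP_INVALID pattern.
AANoUnwind &AANoUnwind::createForPosition(const IRPosition &IRP, Attributor &A) {
  AANoUnwind *AA = nullptr;
  switch (IRP.getPositionKind()) {
  case IRPosition::IRP_INVALID:
    llvm_unreachable("Cannot create AANoUnwind for a invalid position!");
  // ... other invalid position kinds elided ...
  case IRPosition::IRP_FUNCTION:
    AA = new (A.Allocator) AANoUnwindFunction(IRP, A);
    ++NumAAs;
    break;
  case IRPosition::IRP_CALL_SITE:
    AA = new (A.Allocator) AANoUnwindCallSite(IRP, A);
    ++NumAAs;
    break;
  }
  return *AA;
}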
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
index 73c0791e1a..b49a92ad16 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/BarrierNoopPass.cpp
@@ -1,47 +1,47 @@
-//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// NOTE: DO NOT USE THIS IF AVOIDABLE
-//
-// This pass is a nonce pass intended to allow manipulation of the implicitly
-// nesting pass manager. For example, it can be used to cause a CGSCC pass
-// manager to be closed prior to running a new collection of function passes.
-//
-// FIXME: This is a huge HACK. This should be removed when the pass manager's
-// nesting is made explicit instead of implicit.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-using namespace llvm;
-
-namespace {
-/// A nonce module pass used to place a barrier in a pass manager.
-///
-/// There is no mechanism for ending a CGSCC pass manager once one is started.
-/// This prevents extension points from having clear deterministic ordering
-/// when they are phrased as non-module passes.
-class BarrierNoop : public ModulePass {
-public:
- static char ID; // Pass identification.
-
- BarrierNoop() : ModulePass(ID) {
- initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override { return false; }
-};
-}
-
-ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
-
-char BarrierNoop::ID = 0;
-INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
- false, false)
+//===- BarrierNoopPass.cpp - A barrier pass for the pass manager ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// NOTE: DO NOT USE THIS IF AVOIDABLE
+//
+// This pass is a nonce pass intended to allow manipulation of the implicitly
+// nesting pass manager. For example, it can be used to cause a CGSCC pass
+// manager to be closed prior to running a new collection of function passes.
+//
+// FIXME: This is a huge HACK. This should be removed when the pass manager's
+// nesting is made explicit instead of implicit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+namespace {
+/// A nonce module pass used to place a barrier in a pass manager.
+///
+/// There is no mechanism for ending a CGSCC pass manager once one is started.
+/// This prevents extension points from having clear deterministic ordering
+/// when they are phrased as non-module passes.
+class BarrierNoop : public ModulePass {
+public:
+ static char ID; // Pass identification.
+
+ BarrierNoop() : ModulePass(ID) {
+ initializeBarrierNoopPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override { return false; }
+};
+}
+
+ModulePass *llvm::createBarrierNoopPass() { return new BarrierNoop(); }
+
+char BarrierNoop::ID = 0;
+INITIALIZE_PASS(BarrierNoop, "barrier", "A No-Op Barrier Pass",
+ false, false)
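For context, a minimal sketch of how this barrier is typically inserted with the legacy pass manager; the surrounding pipeline is a placeholder:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO.h"
using namespace llvm;

static void buildPipeline(legacy::PassManager &PM) {
  // CGSCC passes queued before this point share one implicit CGSCC manager.
  PM.add(createBarrierNoopPass()); // close that manager here
  // Function or module passes added afterwards start from a fresh manager.
}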
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
index 0cff82113a..c6e222a096 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/BlockExtractor.cpp
@@ -1,61 +1,61 @@
-//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass extracts the specified basic blocks from the module into their
-// own functions.
-//
-//===----------------------------------------------------------------------===//
-
+//===- BlockExtractor.cpp - Extracts blocks into their own functions ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts the specified basic blocks from the module into their
+// own functions.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/BlockExtractor.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "block-extractor"
-
-STATISTIC(NumExtracted, "Number of basic blocks extracted");
-
-static cl::opt<std::string> BlockExtractorFile(
- "extract-blocks-file", cl::value_desc("filename"),
- cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
-
-cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
- cl::desc("Erase the existing functions"),
- cl::Hidden);
-namespace {
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "block-extractor"
+
+STATISTIC(NumExtracted, "Number of basic blocks extracted");
+
+static cl::opt<std::string> BlockExtractorFile(
+ "extract-blocks-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of basic blocks to extract"), cl::Hidden);
+
+cl::opt<bool> BlockExtractorEraseFuncs("extract-blocks-erase-funcs",
+ cl::desc("Erase the existing functions"),
+ cl::Hidden);
+namespace {
class BlockExtractor {
public:
BlockExtractor(bool EraseFunctions) : EraseFunctions(EraseFunctions) {}
bool runOnModule(Module &M);
- void init(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
- &GroupsOfBlocksToExtract) {
- for (const SmallVectorImpl<BasicBlock *> &GroupOfBlocks :
- GroupsOfBlocksToExtract) {
- SmallVector<BasicBlock *, 16> NewGroup;
- NewGroup.append(GroupOfBlocks.begin(), GroupOfBlocks.end());
- GroupsOfBlocks.emplace_back(NewGroup);
- }
- if (!BlockExtractorFile.empty())
- loadFile();
- }
-
+ void init(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
+ &GroupsOfBlocksToExtract) {
+ for (const SmallVectorImpl<BasicBlock *> &GroupOfBlocks :
+ GroupsOfBlocksToExtract) {
+ SmallVector<BasicBlock *, 16> NewGroup;
+ NewGroup.append(GroupOfBlocks.begin(), GroupOfBlocks.end());
+ GroupsOfBlocks.emplace_back(NewGroup);
+ }
+ if (!BlockExtractorFile.empty())
+ loadFile();
+ }
+
private:
SmallVector<SmallVector<BasicBlock *, 16>, 4> GroupsOfBlocks;
bool EraseFunctions;
@@ -71,181 +71,181 @@ class BlockExtractorLegacyPass : public ModulePass {
BlockExtractor BE;
bool runOnModule(Module &M) override;
-public:
- static char ID;
+public:
+ static char ID;
BlockExtractorLegacyPass(const SmallVectorImpl<BasicBlock *> &BlocksToExtract,
bool EraseFunctions)
: ModulePass(ID), BE(EraseFunctions) {
- // We want one group per element of the input list.
- SmallVector<SmallVector<BasicBlock *, 16>, 4> MassagedGroupsOfBlocks;
- for (BasicBlock *BB : BlocksToExtract) {
- SmallVector<BasicBlock *, 16> NewGroup;
- NewGroup.push_back(BB);
- MassagedGroupsOfBlocks.push_back(NewGroup);
- }
+ // We want one group per element of the input list.
+ SmallVector<SmallVector<BasicBlock *, 16>, 4> MassagedGroupsOfBlocks;
+ for (BasicBlock *BB : BlocksToExtract) {
+ SmallVector<BasicBlock *, 16> NewGroup;
+ NewGroup.push_back(BB);
+ MassagedGroupsOfBlocks.push_back(NewGroup);
+ }
BE.init(MassagedGroupsOfBlocks);
- }
-
+ }
+
BlockExtractorLegacyPass(const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
&GroupsOfBlocksToExtract,
bool EraseFunctions)
: ModulePass(ID), BE(EraseFunctions) {
BE.init(GroupsOfBlocksToExtract);
- }
-
+ }
+
BlockExtractorLegacyPass()
: BlockExtractorLegacyPass(SmallVector<BasicBlock *, 0>(), false) {}
};
-
-} // end anonymous namespace
-
+
+} // end anonymous namespace
+
char BlockExtractorLegacyPass::ID = 0;
INITIALIZE_PASS(BlockExtractorLegacyPass, "extract-blocks",
- "Extract basic blocks from module", false, false)
-
+ "Extract basic blocks from module", false, false)
+
ModulePass *llvm::createBlockExtractorPass() {
return new BlockExtractorLegacyPass();
}
-ModulePass *llvm::createBlockExtractorPass(
- const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<BasicBlock *> &BlocksToExtract, bool EraseFunctions) {
return new BlockExtractorLegacyPass(BlocksToExtract, EraseFunctions);
-}
-ModulePass *llvm::createBlockExtractorPass(
- const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
- &GroupsOfBlocksToExtract,
- bool EraseFunctions) {
+}
+ModulePass *llvm::createBlockExtractorPass(
+ const SmallVectorImpl<SmallVector<BasicBlock *, 16>>
+ &GroupsOfBlocksToExtract,
+ bool EraseFunctions) {
return new BlockExtractorLegacyPass(GroupsOfBlocksToExtract, EraseFunctions);
-}
-
-/// Gets all of the blocks specified in the input file.
-void BlockExtractor::loadFile() {
- auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
- if (ErrOrBuf.getError())
- report_fatal_error("BlockExtractor couldn't load the file.");
- // Read the file.
- auto &Buf = *ErrOrBuf;
- SmallVector<StringRef, 16> Lines;
- Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- for (const auto &Line : Lines) {
- SmallVector<StringRef, 4> LineSplit;
- Line.split(LineSplit, ' ', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- if (LineSplit.empty())
- continue;
- if (LineSplit.size()!=2)
- report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'");
- SmallVector<StringRef, 4> BBNames;
- LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1,
- /*KeepEmpty=*/false);
- if (BBNames.empty())
- report_fatal_error("Missing bbs name");
- BlocksByName.push_back(
- {std::string(LineSplit[0]), {BBNames.begin(), BBNames.end()}});
- }
-}
-
-/// Extracts the landing pads to make sure all of them have only one
-/// predecessor.
-void BlockExtractor::splitLandingPadPreds(Function &F) {
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (!isa<InvokeInst>(&I))
- continue;
- InvokeInst *II = cast<InvokeInst>(&I);
- BasicBlock *Parent = II->getParent();
- BasicBlock *LPad = II->getUnwindDest();
-
- // Look through the landing pad's predecessors. If one of them ends in an
- // 'invoke', then we want to split the landing pad.
- bool Split = false;
- for (auto PredBB : predecessors(LPad)) {
- if (PredBB->isLandingPad() && PredBB != Parent &&
- isa<InvokeInst>(Parent->getTerminator())) {
- Split = true;
- break;
- }
- }
-
- if (!Split)
- continue;
-
- SmallVector<BasicBlock *, 2> NewBBs;
- SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
- }
- }
-}
-
-bool BlockExtractor::runOnModule(Module &M) {
-
- bool Changed = false;
-
- // Get all the functions.
- SmallVector<Function *, 4> Functions;
- for (Function &F : M) {
- splitLandingPadPreds(F);
- Functions.push_back(&F);
- }
-
- // Get all the blocks specified in the input file.
- unsigned NextGroupIdx = GroupsOfBlocks.size();
- GroupsOfBlocks.resize(NextGroupIdx + BlocksByName.size());
- for (const auto &BInfo : BlocksByName) {
- Function *F = M.getFunction(BInfo.first);
- if (!F)
- report_fatal_error("Invalid function name specified in the input file");
- for (const auto &BBInfo : BInfo.second) {
- auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
- return BB.getName().equals(BBInfo);
- });
- if (Res == F->end())
- report_fatal_error("Invalid block name specified in the input file");
- GroupsOfBlocks[NextGroupIdx].push_back(&*Res);
- }
- ++NextGroupIdx;
- }
-
- // Extract each group of basic blocks.
- for (auto &BBs : GroupsOfBlocks) {
- SmallVector<BasicBlock *, 32> BlocksToExtractVec;
- for (BasicBlock *BB : BBs) {
- // Check if the module contains BB.
- if (BB->getParent()->getParent() != &M)
- report_fatal_error("Invalid basic block");
- LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
- << BB->getParent()->getName() << ":" << BB->getName()
- << "\n");
- BlocksToExtractVec.push_back(BB);
- if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
- BlocksToExtractVec.push_back(II->getUnwindDest());
- ++NumExtracted;
- Changed = true;
- }
- CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent());
- Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC);
- if (F)
- LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName()
- << "' in: " << F->getName() << '\n');
- else
- LLVM_DEBUG(dbgs() << "Failed to extract for group '"
- << (*BBs.begin())->getName() << "'\n");
- }
-
- // Erase the functions.
- if (EraseFunctions || BlockExtractorEraseFuncs) {
- for (Function *F : Functions) {
- LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
- << "\n");
- F->deleteBody();
- }
- // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
- for (Function &F : M)
- F.setLinkage(GlobalValue::ExternalLinkage);
- Changed = true;
- }
-
- return Changed;
-}
+}
+
+/// Gets all of the blocks specified in the input file.
+void BlockExtractor::loadFile() {
+ auto ErrOrBuf = MemoryBuffer::getFile(BlockExtractorFile);
+ if (ErrOrBuf.getError())
+ report_fatal_error("BlockExtractor couldn't load the file.");
+ // Read the file.
+ auto &Buf = *ErrOrBuf;
+ SmallVector<StringRef, 16> Lines;
+ Buf->getBuffer().split(Lines, '\n', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ for (const auto &Line : Lines) {
+ SmallVector<StringRef, 4> LineSplit;
+ Line.split(LineSplit, ' ', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ if (LineSplit.empty())
+ continue;
+ if (LineSplit.size()!=2)
+ report_fatal_error("Invalid line format, expecting lines like: 'funcname bb1[;bb2..]'");
+ SmallVector<StringRef, 4> BBNames;
+ LineSplit[1].split(BBNames, ';', /*MaxSplit=*/-1,
+ /*KeepEmpty=*/false);
+ if (BBNames.empty())
+ report_fatal_error("Missing bbs name");
+ BlocksByName.push_back(
+ {std::string(LineSplit[0]), {BBNames.begin(), BBNames.end()}});
+ }
+}
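As an illustration, a hypothetical input file for -extract-blocks-file would follow the 'funcname bb1[;bb2..]' format parsed above, one group of blocks per line (the function and block names below are made up):

foo entry;if.then
bar for.body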
+
+/// Extracts the landing pads to make sure all of them have only one
+/// predecessor.
+void BlockExtractor::splitLandingPadPreds(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (!isa<InvokeInst>(&I))
+ continue;
+ InvokeInst *II = cast<InvokeInst>(&I);
+ BasicBlock *Parent = II->getParent();
+ BasicBlock *LPad = II->getUnwindDest();
+
+ // Look through the landing pad's predecessors. If one of them ends in an
+ // 'invoke', then we want to split the landing pad.
+ bool Split = false;
+ for (auto PredBB : predecessors(LPad)) {
+ if (PredBB->isLandingPad() && PredBB != Parent &&
+ isa<InvokeInst>(Parent->getTerminator())) {
+ Split = true;
+ break;
+ }
+ }
+
+ if (!Split)
+ continue;
+
+ SmallVector<BasicBlock *, 2> NewBBs;
+ SplitLandingPadPredecessors(LPad, Parent, ".1", ".2", NewBBs);
+ }
+ }
+}
+
+bool BlockExtractor::runOnModule(Module &M) {
+
+ bool Changed = false;
+
+ // Get all the functions.
+ SmallVector<Function *, 4> Functions;
+ for (Function &F : M) {
+ splitLandingPadPreds(F);
+ Functions.push_back(&F);
+ }
+
+ // Get all the blocks specified in the input file.
+ unsigned NextGroupIdx = GroupsOfBlocks.size();
+ GroupsOfBlocks.resize(NextGroupIdx + BlocksByName.size());
+ for (const auto &BInfo : BlocksByName) {
+ Function *F = M.getFunction(BInfo.first);
+ if (!F)
+ report_fatal_error("Invalid function name specified in the input file");
+ for (const auto &BBInfo : BInfo.second) {
+ auto Res = llvm::find_if(*F, [&](const BasicBlock &BB) {
+ return BB.getName().equals(BBInfo);
+ });
+ if (Res == F->end())
+ report_fatal_error("Invalid block name specified in the input file");
+ GroupsOfBlocks[NextGroupIdx].push_back(&*Res);
+ }
+ ++NextGroupIdx;
+ }
+
+ // Extract each group of basic blocks.
+ for (auto &BBs : GroupsOfBlocks) {
+ SmallVector<BasicBlock *, 32> BlocksToExtractVec;
+ for (BasicBlock *BB : BBs) {
+ // Check if the module contains BB.
+ if (BB->getParent()->getParent() != &M)
+ report_fatal_error("Invalid basic block");
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Extracting "
+ << BB->getParent()->getName() << ":" << BB->getName()
+ << "\n");
+ BlocksToExtractVec.push_back(BB);
+ if (const InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ BlocksToExtractVec.push_back(II->getUnwindDest());
+ ++NumExtracted;
+ Changed = true;
+ }
+ CodeExtractorAnalysisCache CEAC(*BBs[0]->getParent());
+ Function *F = CodeExtractor(BlocksToExtractVec).extractCodeRegion(CEAC);
+ if (F)
+ LLVM_DEBUG(dbgs() << "Extracted group '" << (*BBs.begin())->getName()
+ << "' in: " << F->getName() << '\n');
+ else
+ LLVM_DEBUG(dbgs() << "Failed to extract for group '"
+ << (*BBs.begin())->getName() << "'\n");
+ }
+
+ // Erase the functions.
+ if (EraseFunctions || BlockExtractorEraseFuncs) {
+ for (Function *F : Functions) {
+ LLVM_DEBUG(dbgs() << "BlockExtractor: Trying to delete " << F->getName()
+ << "\n");
+ F->deleteBody();
+ }
+ // Set linkage as ExternalLinkage to avoid erasing unreachable functions.
+ for (Function &F : M)
+ F.setLinkage(GlobalValue::ExternalLinkage);
+ Changed = true;
+ }
+
+ return Changed;
+}
bool BlockExtractorLegacyPass::runOnModule(Module &M) {
return BE.runOnModule(M);
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
index 778e017f4d..74f11fa309 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -1,434 +1,434 @@
-//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that attaches !callees metadata to
-// indirect call sites. For a given call site, the metadata, if present,
-// indicates the set of functions the call site could possibly target at
-// run-time. This metadata is added to indirect call sites when the set of
-// possible targets can be determined by analysis and is known to be small. The
-// analysis driving the transformation is similar to constant propagation and
-// makes uses of the generic sparse propagation solver.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/CalledValuePropagation.h"
-#include "llvm/Analysis/SparsePropagation.h"
-#include "llvm/Analysis/ValueLatticeUtils.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "called-value-propagation"
-
-/// The maximum number of functions to track per lattice value. Once the number
-/// of functions a call site can possibly target exceeds this threshold, its
-/// lattice value becomes overdefined. The number of possible lattice values is
-/// bounded by Ch(F, M), where F is the number of functions in the module and M
-/// is MaxFunctionsPerValue. As such, this value should be kept very small. We
-/// likely can't do anything useful for call sites with a large number of
-/// possible targets, anyway.
-static cl::opt<unsigned> MaxFunctionsPerValue(
- "cvp-max-functions-per-value", cl::Hidden, cl::init(4),
- cl::desc("The maximum number of functions to track per lattice value"));
-
-namespace {
-/// To enable interprocedural analysis, we assign LLVM values to the following
-/// groups. The register group represents SSA registers, the return group
-/// represents the return values of functions, and the memory group represents
-/// in-memory values. An LLVM Value can technically be in more than one group.
-/// It's necessary to distinguish these groups so we can, for example, track a
-/// global variable separately from the value stored at its location.
-enum class IPOGrouping { Register, Return, Memory };
-
-/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings.
-using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>;
-
-/// The lattice value type used by our custom lattice function. It holds the
-/// lattice state, and a set of functions.
-class CVPLatticeVal {
-public:
- /// The states of the lattice values. Only the FunctionSet state is
- /// interesting. It indicates the set of functions to which an LLVM value may
- /// refer.
- enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked };
-
- /// Comparator for sorting the functions set. We want to keep the order
- /// deterministic for testing, etc.
- struct Compare {
- bool operator()(const Function *LHS, const Function *RHS) const {
- return LHS->getName() < RHS->getName();
- }
- };
-
- CVPLatticeVal() : LatticeState(Undefined) {}
- CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
- CVPLatticeVal(std::vector<Function *> &&Functions)
- : LatticeState(FunctionSet), Functions(std::move(Functions)) {
- assert(llvm::is_sorted(this->Functions, Compare()));
- }
-
- /// Get a reference to the functions held by this lattice value. The number
- /// of functions will be zero for states other than FunctionSet.
- const std::vector<Function *> &getFunctions() const {
- return Functions;
- }
-
- /// Returns true if the lattice value is in the FunctionSet state.
- bool isFunctionSet() const { return LatticeState == FunctionSet; }
-
- bool operator==(const CVPLatticeVal &RHS) const {
- return LatticeState == RHS.LatticeState && Functions == RHS.Functions;
- }
-
- bool operator!=(const CVPLatticeVal &RHS) const {
- return LatticeState != RHS.LatticeState || Functions != RHS.Functions;
- }
-
-private:
- /// Holds the state this lattice value is in.
- CVPLatticeStateTy LatticeState;
-
- /// Holds functions indicating the possible targets of call sites. This set
- /// is empty for lattice values in the undefined, overdefined, and untracked
- /// states. The maximum size of the set is controlled by
- /// MaxFunctionsPerValue. Since most LLVM values are expected to be in
- /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
- /// small and efficiently copyable.
- // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
- std::vector<Function *> Functions;
-};
-
-/// The custom lattice function used by the generic sparse propagation solver.
-/// It handles merging lattice values and computing new lattice values for
-/// constants, arguments, values returned from trackable functions, and values
-/// located in trackable global variables. It also computes the lattice values
-/// that change as a result of executing instructions.
-class CVPLatticeFunc
- : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> {
-public:
- CVPLatticeFunc()
- : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined),
- CVPLatticeVal(CVPLatticeVal::Overdefined),
- CVPLatticeVal(CVPLatticeVal::Untracked)) {}
-
- /// Compute and return a CVPLatticeVal for the given CVPLatticeKey.
- CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override {
- switch (Key.getInt()) {
- case IPOGrouping::Register:
- if (isa<Instruction>(Key.getPointer())) {
- return getUndefVal();
- } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) {
- if (canTrackArgumentsInterprocedurally(A->getParent()))
- return getUndefVal();
- } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) {
- return computeConstant(C);
- }
- return getOverdefinedVal();
- case IPOGrouping::Memory:
- case IPOGrouping::Return:
- if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) {
- if (canTrackGlobalVariableInterprocedurally(GV))
- return computeConstant(GV->getInitializer());
- } else if (auto *F = cast<Function>(Key.getPointer()))
- if (canTrackReturnsInterprocedurally(F))
- return getUndefVal();
- }
- return getOverdefinedVal();
- }
-
- /// Merge the two given lattice values. The interesting cases are merging two
- /// FunctionSet values and a FunctionSet value with an Undefined value. For
- /// these cases, we simply union the function sets. If the size of the union
- /// is greater than the maximum functions we track, the merged value is
- /// overdefined.
- CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override {
- if (X == getOverdefinedVal() || Y == getOverdefinedVal())
- return getOverdefinedVal();
- if (X == getUndefVal() && Y == getUndefVal())
- return getUndefVal();
- std::vector<Function *> Union;
- std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
- Y.getFunctions().begin(), Y.getFunctions().end(),
- std::back_inserter(Union), CVPLatticeVal::Compare{});
- if (Union.size() > MaxFunctionsPerValue)
- return getOverdefinedVal();
- return CVPLatticeVal(std::move(Union));
- }
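A standalone illustration (plain C++, not LLVM code) of the sorted set_union-with-comparator pattern used by MergeValues above; both inputs must already be sorted by the same comparator:

#include <algorithm>
#include <iterator>
#include <string>
#include <vector>

int main() {
  // Both inputs are sorted by the comparator; duplicates appear once in Union.
  auto Less = [](const std::string &L, const std::string &R) { return L < R; };
  std::vector<std::string> X = {"bar", "foo"};
  std::vector<std::string> Y = {"baz", "foo"};
  std::vector<std::string> Union;
  std::set_union(X.begin(), X.end(), Y.begin(), Y.end(),
                 std::back_inserter(Union), Less);
  // Union == {"bar", "baz", "foo"}
  return 0;
}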
-
- /// Compute the lattice values that change as a result of executing the given
- /// instruction. The changed values are stored in \p ChangedValues. We handle
- /// just a few kinds of instructions since we're only propagating values that
- /// can be called.
- void ComputeInstructionState(
- Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Invoke:
- return visitCallBase(cast<CallBase>(I), ChangedValues, SS);
- case Instruction::Load:
- return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
- case Instruction::Ret:
- return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS);
- case Instruction::Select:
- return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS);
- case Instruction::Store:
- return visitStore(*cast<StoreInst>(&I), ChangedValues, SS);
- default:
- return visitInst(I, ChangedValues, SS);
- }
- }
-
- /// Print the given CVPLatticeVal to the specified stream.
- void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override {
- if (LV == getUndefVal())
- OS << "Undefined ";
- else if (LV == getOverdefinedVal())
- OS << "Overdefined";
- else if (LV == getUntrackedVal())
- OS << "Untracked ";
- else
- OS << "FunctionSet";
- }
-
- /// Print the given CVPLatticeKey to the specified stream.
- void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override {
- if (Key.getInt() == IPOGrouping::Register)
- OS << "<reg> ";
- else if (Key.getInt() == IPOGrouping::Memory)
- OS << "<mem> ";
- else if (Key.getInt() == IPOGrouping::Return)
- OS << "<ret> ";
- if (isa<Function>(Key.getPointer()))
- OS << Key.getPointer()->getName();
- else
- OS << *Key.getPointer();
- }
-
- /// We collect a set of indirect calls when visiting call sites. This method
- /// returns a reference to that set.
- SmallPtrSetImpl<CallBase *> &getIndirectCalls() { return IndirectCalls; }
-
-private:
- /// Holds the indirect calls we encounter during the analysis. We will attach
- /// metadata to these calls after the analysis indicating the functions the
- /// calls can possibly target.
- SmallPtrSet<CallBase *, 32> IndirectCalls;
-
- /// Compute a new lattice value for the given constant. The constant, after
- /// stripping any pointer casts, should be a Function. We ignore null
- /// pointers as an optimization, since calling these values is undefined
- /// behavior.
- CVPLatticeVal computeConstant(Constant *C) {
- if (isa<ConstantPointerNull>(C))
- return CVPLatticeVal(CVPLatticeVal::FunctionSet);
- if (auto *F = dyn_cast<Function>(C->stripPointerCasts()))
- return CVPLatticeVal({F});
- return getOverdefinedVal();
- }
-
- /// Handle return instructions. The function's return state is the merge of
- /// the returned value state and the function's return state.
- void visitReturn(ReturnInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- Function *F = I.getParent()->getParent();
- if (F->getReturnType()->isVoidTy())
- return;
- auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register);
- auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
- ChangedValues[RetF] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
- }
-
- /// Handle call sites. The state of a called function's formal arguments is
-  /// the merge of the argument state with the call site's corresponding actual
- /// argument state. The call site state is the merge of the call site state
- /// with the returned value state of the called function.
- void visitCallBase(CallBase &CB,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- Function *F = CB.getCalledFunction();
- auto RegI = CVPLatticeKey(&CB, IPOGrouping::Register);
-
- // If this is an indirect call, save it so we can quickly revisit it when
- // attaching metadata.
- if (!F)
- IndirectCalls.insert(&CB);
-
- // If we can't track the function's return values, there's nothing to do.
- if (!F || !canTrackReturnsInterprocedurally(F)) {
-      // Void return; no need to create and update the CVPLattice state as no
-      // one can use it.
- if (CB.getType()->isVoidTy())
- return;
- ChangedValues[RegI] = getOverdefinedVal();
- return;
- }
-
- // Inform the solver that the called function is executable, and perform
- // the merges for the arguments and return value.
- SS.MarkBlockExecutable(&F->front());
- auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
- for (Argument &A : F->args()) {
- auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
- auto RegActual =
- CVPLatticeKey(CB.getArgOperand(A.getArgNo()), IPOGrouping::Register);
- ChangedValues[RegFormal] =
- MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
- }
-
-    // Void return; no need to create and update the CVPLattice state as no one
-    // can use it.
- if (CB.getType()->isVoidTy())
- return;
-
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
- }
-
- /// Handle select instructions. The select instruction state is the merge the
- /// true and false value states.
- void visitSelect(SelectInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register);
- auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register);
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegT), SS.getValueState(RegF));
- }
-
- /// Handle load instructions. If the pointer operand of the load is a global
- /// variable, we attempt to track the value. The loaded value state is the
- /// merge of the loaded value state with the global variable state.
- void visitLoad(LoadInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) {
- auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
- ChangedValues[RegI] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
- } else {
- ChangedValues[RegI] = getOverdefinedVal();
- }
- }
-
- /// Handle store instructions. If the pointer operand of the store is a
- /// global variable, we attempt to track the value. The global variable state
- /// is the merge of the stored value state with the global variable state.
- void visitStore(StoreInst &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand());
- if (!GV)
- return;
- auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register);
- auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
- ChangedValues[MemGV] =
- MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
- }
-
- /// Handle all other instructions. All other instructions are marked
- /// overdefined.
- void visitInst(Instruction &I,
- DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
- SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- // Simply bail if this instruction has no user.
- if (I.use_empty())
- return;
- auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
- ChangedValues[RegI] = getOverdefinedVal();
- }
-};
-} // namespace
-
-namespace llvm {
-/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver
-/// must translate between LatticeKeys and LLVM Values when adding Values to
-/// its work list and inspecting the state of control-flow related values.
-template <> struct LatticeKeyInfo<CVPLatticeKey> {
- static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) {
- return Key.getPointer();
- }
- static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) {
- return CVPLatticeKey(V, IPOGrouping::Register);
- }
-};
-} // namespace llvm
-
-static bool runCVP(Module &M) {
- // Our custom lattice function and generic sparse propagation solver.
- CVPLatticeFunc Lattice;
- SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice);
-
- // For each function in the module, if we can't track its arguments, let the
- // generic solver assume it is executable.
- for (Function &F : M)
- if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F))
- Solver.MarkBlockExecutable(&F.front());
-
-  // Solve our custom lattice. In doing so, we will also build a set of
- // indirect call sites.
- Solver.Solve();
-
- // Attach metadata to the indirect call sites that were collected indicating
- // the set of functions they can possibly target.
- bool Changed = false;
- MDBuilder MDB(M.getContext());
- for (CallBase *C : Lattice.getIndirectCalls()) {
- auto RegI = CVPLatticeKey(C->getCalledOperand(), IPOGrouping::Register);
- CVPLatticeVal LV = Solver.getExistingValueState(RegI);
- if (!LV.isFunctionSet() || LV.getFunctions().empty())
- continue;
- MDNode *Callees = MDB.createCallees(LV.getFunctions());
- C->setMetadata(LLVMContext::MD_callees, Callees);
- Changed = true;
- }
-
- return Changed;
-}
-
-PreservedAnalyses CalledValuePropagationPass::run(Module &M,
- ModuleAnalysisManager &) {
- runCVP(M);
- return PreservedAnalyses::all();
-}
-
-namespace {
-class CalledValuePropagationLegacyPass : public ModulePass {
-public:
- static char ID;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- CalledValuePropagationLegacyPass() : ModulePass(ID) {
- initializeCalledValuePropagationLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return runCVP(M);
- }
-};
-} // namespace
-
-char CalledValuePropagationLegacyPass::ID = 0;
-INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation",
- "Called Value Propagation", false, false)
-
-ModulePass *llvm::createCalledValuePropagationPass() {
- return new CalledValuePropagationLegacyPass();
-}
+//===- CalledValuePropagation.cpp - Propagate called values -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that attaches !callees metadata to
+// indirect call sites. For a given call site, the metadata, if present,
+// indicates the set of functions the call site could possibly target at
+// run-time. This metadata is added to indirect call sites when the set of
+// possible targets can be determined by analysis and is known to be small. The
+// analysis driving the transformation is similar to constant propagation and
+// makes uses of the generic sparse propagation solver.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CalledValuePropagation.h"
+#include "llvm/Analysis/SparsePropagation.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "called-value-propagation"
+
+/// The maximum number of functions to track per lattice value. Once the number
+/// of functions a call site can possibly target exceeds this threshold, its
+/// lattice value becomes overdefined. The number of possible lattice values is
+/// bounded by Ch(F, M), where F is the number of functions in the module and M
+/// is MaxFunctionsPerValue. As such, this value should be kept very small. We
+/// likely can't do anything useful for call sites with a large number of
+/// possible targets, anyway.
+static cl::opt<unsigned> MaxFunctionsPerValue(
+ "cvp-max-functions-per-value", cl::Hidden, cl::init(4),
+ cl::desc("The maximum number of functions to track per lattice value"));
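Assuming an input module test.ll, an illustrative way to exercise this threshold with the legacy pass manager is:

  opt -called-value-propagation -cvp-max-functions-per-value=2 -S test.ll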
+
+namespace {
+/// To enable interprocedural analysis, we assign LLVM values to the following
+/// groups. The register group represents SSA registers, the return group
+/// represents the return values of functions, and the memory group represents
+/// in-memory values. An LLVM Value can technically be in more than one group.
+/// It's necessary to distinguish these groups so we can, for example, track a
+/// global variable separately from the value stored at its location.
+enum class IPOGrouping { Register, Return, Memory };
+
+/// Our LatticeKeys are PointerIntPairs composed of LLVM values and groupings.
+using CVPLatticeKey = PointerIntPair<Value *, 2, IPOGrouping>;
+
+/// The lattice value type used by our custom lattice function. It holds the
+/// lattice state, and a set of functions.
+class CVPLatticeVal {
+public:
+ /// The states of the lattice values. Only the FunctionSet state is
+ /// interesting. It indicates the set of functions to which an LLVM value may
+ /// refer.
+ enum CVPLatticeStateTy { Undefined, FunctionSet, Overdefined, Untracked };
+
+ /// Comparator for sorting the functions set. We want to keep the order
+ /// deterministic for testing, etc.
+ struct Compare {
+ bool operator()(const Function *LHS, const Function *RHS) const {
+ return LHS->getName() < RHS->getName();
+ }
+ };
+
+ CVPLatticeVal() : LatticeState(Undefined) {}
+ CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
+ CVPLatticeVal(std::vector<Function *> &&Functions)
+ : LatticeState(FunctionSet), Functions(std::move(Functions)) {
+ assert(llvm::is_sorted(this->Functions, Compare()));
+ }
+
+ /// Get a reference to the functions held by this lattice value. The number
+ /// of functions will be zero for states other than FunctionSet.
+ const std::vector<Function *> &getFunctions() const {
+ return Functions;
+ }
+
+ /// Returns true if the lattice value is in the FunctionSet state.
+ bool isFunctionSet() const { return LatticeState == FunctionSet; }
+
+ bool operator==(const CVPLatticeVal &RHS) const {
+ return LatticeState == RHS.LatticeState && Functions == RHS.Functions;
+ }
+
+ bool operator!=(const CVPLatticeVal &RHS) const {
+ return LatticeState != RHS.LatticeState || Functions != RHS.Functions;
+ }
+
+private:
+ /// Holds the state this lattice value is in.
+ CVPLatticeStateTy LatticeState;
+
+ /// Holds functions indicating the possible targets of call sites. This set
+ /// is empty for lattice values in the undefined, overdefined, and untracked
+ /// states. The maximum size of the set is controlled by
+ /// MaxFunctionsPerValue. Since most LLVM values are expected to be in
+ /// uninteresting states (i.e., overdefined), CVPLatticeVal objects should be
+ /// small and efficiently copyable.
+ // FIXME: This could be a TinyPtrVector and/or merge with LatticeState.
+ std::vector<Function *> Functions;
+};
+
+/// The custom lattice function used by the generic sparse propagation solver.
+/// It handles merging lattice values and computing new lattice values for
+/// constants, arguments, values returned from trackable functions, and values
+/// located in trackable global variables. It also computes the lattice values
+/// that change as a result of executing instructions.
+class CVPLatticeFunc
+ : public AbstractLatticeFunction<CVPLatticeKey, CVPLatticeVal> {
+public:
+ CVPLatticeFunc()
+ : AbstractLatticeFunction(CVPLatticeVal(CVPLatticeVal::Undefined),
+ CVPLatticeVal(CVPLatticeVal::Overdefined),
+ CVPLatticeVal(CVPLatticeVal::Untracked)) {}
+
+ /// Compute and return a CVPLatticeVal for the given CVPLatticeKey.
+ CVPLatticeVal ComputeLatticeVal(CVPLatticeKey Key) override {
+ switch (Key.getInt()) {
+ case IPOGrouping::Register:
+ if (isa<Instruction>(Key.getPointer())) {
+ return getUndefVal();
+ } else if (auto *A = dyn_cast<Argument>(Key.getPointer())) {
+ if (canTrackArgumentsInterprocedurally(A->getParent()))
+ return getUndefVal();
+ } else if (auto *C = dyn_cast<Constant>(Key.getPointer())) {
+ return computeConstant(C);
+ }
+ return getOverdefinedVal();
+ case IPOGrouping::Memory:
+ case IPOGrouping::Return:
+ if (auto *GV = dyn_cast<GlobalVariable>(Key.getPointer())) {
+ if (canTrackGlobalVariableInterprocedurally(GV))
+ return computeConstant(GV->getInitializer());
+ } else if (auto *F = cast<Function>(Key.getPointer()))
+ if (canTrackReturnsInterprocedurally(F))
+ return getUndefVal();
+ }
+ return getOverdefinedVal();
+ }
+
+ /// Merge the two given lattice values. The interesting cases are merging two
+ /// FunctionSet values and a FunctionSet value with an Undefined value. For
+ /// these cases, we simply union the function sets. If the size of the union
+ /// is greater than the maximum number of functions we track, the merged value is
+ /// overdefined.
+ CVPLatticeVal MergeValues(CVPLatticeVal X, CVPLatticeVal Y) override {
+ if (X == getOverdefinedVal() || Y == getOverdefinedVal())
+ return getOverdefinedVal();
+ if (X == getUndefVal() && Y == getUndefVal())
+ return getUndefVal();
+ std::vector<Function *> Union;
+ std::set_union(X.getFunctions().begin(), X.getFunctions().end(),
+ Y.getFunctions().begin(), Y.getFunctions().end(),
+ std::back_inserter(Union), CVPLatticeVal::Compare{});
+ if (Union.size() > MaxFunctionsPerValue)
+ return getOverdefinedVal();
+ return CVPLatticeVal(std::move(Union));
+ }
+
+ /// Compute the lattice values that change as a result of executing the given
+ /// instruction. The changed values are stored in \p ChangedValues. We handle
+ /// just a few kinds of instructions since we're only propagating values that
+ /// can be called.
+ void ComputeInstructionState(
+ Instruction &I, DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke:
+ return visitCallBase(cast<CallBase>(I), ChangedValues, SS);
+ case Instruction::Load:
+ return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
+ case Instruction::Ret:
+ return visitReturn(*cast<ReturnInst>(&I), ChangedValues, SS);
+ case Instruction::Select:
+ return visitSelect(*cast<SelectInst>(&I), ChangedValues, SS);
+ case Instruction::Store:
+ return visitStore(*cast<StoreInst>(&I), ChangedValues, SS);
+ default:
+ return visitInst(I, ChangedValues, SS);
+ }
+ }
+
+ /// Print the given CVPLatticeVal to the specified stream.
+ void PrintLatticeVal(CVPLatticeVal LV, raw_ostream &OS) override {
+ if (LV == getUndefVal())
+ OS << "Undefined ";
+ else if (LV == getOverdefinedVal())
+ OS << "Overdefined";
+ else if (LV == getUntrackedVal())
+ OS << "Untracked ";
+ else
+ OS << "FunctionSet";
+ }
+
+ /// Print the given CVPLatticeKey to the specified stream.
+ void PrintLatticeKey(CVPLatticeKey Key, raw_ostream &OS) override {
+ if (Key.getInt() == IPOGrouping::Register)
+ OS << "<reg> ";
+ else if (Key.getInt() == IPOGrouping::Memory)
+ OS << "<mem> ";
+ else if (Key.getInt() == IPOGrouping::Return)
+ OS << "<ret> ";
+ if (isa<Function>(Key.getPointer()))
+ OS << Key.getPointer()->getName();
+ else
+ OS << *Key.getPointer();
+ }
+
+ /// We collect a set of indirect calls when visiting call sites. This method
+ /// returns a reference to that set.
+ SmallPtrSetImpl<CallBase *> &getIndirectCalls() { return IndirectCalls; }
+
+private:
+ /// Holds the indirect calls we encounter during the analysis. We will attach
+ /// metadata to these calls after the analysis indicating the functions the
+ /// calls can possibly target.
+ SmallPtrSet<CallBase *, 32> IndirectCalls;
+
+ /// Compute a new lattice value for the given constant. The constant, after
+ /// stripping any pointer casts, should be a Function. We ignore null
+ /// pointers as an optimization, since calling these values is undefined
+ /// behavior.
+ CVPLatticeVal computeConstant(Constant *C) {
+ if (isa<ConstantPointerNull>(C))
+ return CVPLatticeVal(CVPLatticeVal::FunctionSet);
+ if (auto *F = dyn_cast<Function>(C->stripPointerCasts()))
+ return CVPLatticeVal({F});
+ return getOverdefinedVal();
+ }
+
+ /// Handle return instructions. The function's return state is the merge of
+ /// the returned value state and the function's current return state.
+ void visitReturn(ReturnInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ Function *F = I.getParent()->getParent();
+ if (F->getReturnType()->isVoidTy())
+ return;
+ auto RegI = CVPLatticeKey(I.getReturnValue(), IPOGrouping::Register);
+ auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+ ChangedValues[RetF] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+ }
+
+ /// Handle call sites. The state of a called function's formal arguments is
+ /// the merge of the argument state with the call site's corresponding actual
+ /// argument state. The call site state is the merge of the call site state
+ /// with the returned value state of the called function.
+ void visitCallBase(CallBase &CB,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ Function *F = CB.getCalledFunction();
+ auto RegI = CVPLatticeKey(&CB, IPOGrouping::Register);
+
+ // If this is an indirect call, save it so we can quickly revisit it when
+ // attaching metadata.
+ if (!F)
+ IndirectCalls.insert(&CB);
+
+ // If we can't track the function's return values, there's nothing to do.
+ if (!F || !canTrackReturnsInterprocedurally(F)) {
+ // Void return; no need to create or update CVPLattice state, as no one
+ // can use it.
+ if (CB.getType()->isVoidTy())
+ return;
+ ChangedValues[RegI] = getOverdefinedVal();
+ return;
+ }
+
+ // Inform the solver that the called function is executable, and perform
+ // the merges for the arguments and return value.
+ SS.MarkBlockExecutable(&F->front());
+ auto RetF = CVPLatticeKey(F, IPOGrouping::Return);
+ for (Argument &A : F->args()) {
+ auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
+ auto RegActual =
+ CVPLatticeKey(CB.getArgOperand(A.getArgNo()), IPOGrouping::Register);
+ ChangedValues[RegFormal] =
+ MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
+ }
+
+ // Void return; no need to create or update CVPLattice state, as no one can
+ // use it.
+ if (CB.getType()->isVoidTy())
+ return;
+
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(RetF));
+ }
+
+ /// Handle select instructions. The select instruction state is the merge of the
+ /// true and false value states.
+ void visitSelect(SelectInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ auto RegT = CVPLatticeKey(I.getTrueValue(), IPOGrouping::Register);
+ auto RegF = CVPLatticeKey(I.getFalseValue(), IPOGrouping::Register);
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegT), SS.getValueState(RegF));
+ }
+
+ /// Handle load instructions. If the pointer operand of the load is a global
+ /// variable, we attempt to track the value. The loaded value state is the
+ /// merge of the loaded value state with the global variable state.
+ void visitLoad(LoadInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ if (auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand())) {
+ auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+ ChangedValues[RegI] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+ } else {
+ ChangedValues[RegI] = getOverdefinedVal();
+ }
+ }
+
+ /// Handle store instructions. If the pointer operand of the store is a
+ /// global variable, we attempt to track the value. The global variable state
+ /// is the merge of the stored value state with the global variable state.
+ void visitStore(StoreInst &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ auto *GV = dyn_cast<GlobalVariable>(I.getPointerOperand());
+ if (!GV)
+ return;
+ auto RegI = CVPLatticeKey(I.getValueOperand(), IPOGrouping::Register);
+ auto MemGV = CVPLatticeKey(GV, IPOGrouping::Memory);
+ ChangedValues[MemGV] =
+ MergeValues(SS.getValueState(RegI), SS.getValueState(MemGV));
+ }
+
+ /// Handle all other instructions. All other instructions are marked
+ /// overdefined.
+ void visitInst(Instruction &I,
+ DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
+ // Simply bail if this instruction has no user.
+ if (I.use_empty())
+ return;
+ auto RegI = CVPLatticeKey(&I, IPOGrouping::Register);
+ ChangedValues[RegI] = getOverdefinedVal();
+ }
+};
+} // namespace
+
+namespace llvm {
+/// A specialization of LatticeKeyInfo for CVPLatticeKeys. The generic solver
+/// must translate between LatticeKeys and LLVM Values when adding Values to
+/// its work list and inspecting the state of control-flow related values.
+template <> struct LatticeKeyInfo<CVPLatticeKey> {
+ static inline Value *getValueFromLatticeKey(CVPLatticeKey Key) {
+ return Key.getPointer();
+ }
+ static inline CVPLatticeKey getLatticeKeyFromValue(Value *V) {
+ return CVPLatticeKey(V, IPOGrouping::Register);
+ }
+};
+} // namespace llvm
+
+static bool runCVP(Module &M) {
+ // Our custom lattice function and generic sparse propagation solver.
+ CVPLatticeFunc Lattice;
+ SparseSolver<CVPLatticeKey, CVPLatticeVal> Solver(&Lattice);
+
+ // For each function in the module, if we can't track its arguments, let the
+ // generic solver assume it is executable.
+ for (Function &F : M)
+ if (!F.isDeclaration() && !canTrackArgumentsInterprocedurally(&F))
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Solve our custom lattice. In doing so, we will also build a set of
+ // indirect call sites.
+ Solver.Solve();
+
+ // Attach metadata to the indirect call sites that were collected indicating
+ // the set of functions they can possibly target.
+ bool Changed = false;
+ MDBuilder MDB(M.getContext());
+ for (CallBase *C : Lattice.getIndirectCalls()) {
+ auto RegI = CVPLatticeKey(C->getCalledOperand(), IPOGrouping::Register);
+ CVPLatticeVal LV = Solver.getExistingValueState(RegI);
+ if (!LV.isFunctionSet() || LV.getFunctions().empty())
+ continue;
+ MDNode *Callees = MDB.createCallees(LV.getFunctions());
+ C->setMetadata(LLVMContext::MD_callees, Callees);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses CalledValuePropagationPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ runCVP(M);
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class CalledValuePropagationLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ CalledValuePropagationLegacyPass() : ModulePass(ID) {
+ initializeCalledValuePropagationLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return runCVP(M);
+ }
+};
+} // namespace
+
+char CalledValuePropagationLegacyPass::ID = 0;
+INITIALIZE_PASS(CalledValuePropagationLegacyPass, "called-value-propagation",
+ "Called Value Propagation", false, false)
+
+ModulePass *llvm::createCalledValuePropagationPass() {
+ return new CalledValuePropagationLegacyPass();
+}
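
As a rough illustration of how a later pass might consume the !callees metadata this file attaches, here is a minimal sketch against the LLVM 12 C++ API; the helper name possibleCallees is hypothetical and not part of this patch.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
using namespace llvm;

// Collect the possible targets recorded on an indirect call, if any.
static SmallVector<Function *, 4> possibleCallees(CallBase &CB) {
  SmallVector<Function *, 4> Callees;
  if (MDNode *MD = CB.getMetadata(LLVMContext::MD_callees))
    for (const MDOperand &Op : MD->operands())
      // Each operand wraps a Function constant; the pass emits them sorted
      // by name to keep the output deterministic.
      if (auto *CAM = dyn_cast_or_null<ConstantAsMetadata>(Op.get()))
        if (auto *F = dyn_cast<Function>(CAM->getValue()))
          Callees.push_back(F);
  return Callees;
}

An empty result simply means no !callees metadata was attached, for example because the lattice value went overdefined or the target set exceeded cvp-max-functions-per-value.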
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
index 41f4f4da81..8e81f4bad4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ConstantMerge.cpp
@@ -1,288 +1,288 @@
-//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interface to a pass that merges duplicate global
-// constants together into a single constant that is shared. This is useful
-// because some passes (e.g., TraceValues) insert a lot of string constants into
-// the program, regardless of whether or not an existing string is available.
-//
-// Algorithm: ConstantMerge is designed to build up a map of available constants
-// and eliminate duplicates when it is initialized.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ConstantMerge.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "constmerge"
-
-STATISTIC(NumIdenticalMerged, "Number of identical global constants merged");
-
-/// Find values that are marked as llvm.used.
-static void FindUsedValues(GlobalVariable *LLVMUsed,
- SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
- if (!LLVMUsed) return;
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
-
- for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
- Value *Operand = Inits->getOperand(i)->stripPointerCasts();
- GlobalValue *GV = cast<GlobalValue>(Operand);
- UsedValues.insert(GV);
- }
-}
-
-// True if A is better than B.
-static bool IsBetterCanonical(const GlobalVariable &A,
- const GlobalVariable &B) {
- if (!A.hasLocalLinkage() && B.hasLocalLinkage())
- return true;
-
- if (A.hasLocalLinkage() && !B.hasLocalLinkage())
- return false;
-
- return A.hasGlobalUnnamedAddr();
-}
-
-static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- GV->getAllMetadata(MDs);
- for (const auto &V : MDs)
- if (V.first != LLVMContext::MD_dbg)
- return true;
- return false;
-}
-
-static void copyDebugLocMetadata(const GlobalVariable *From,
- GlobalVariable *To) {
- SmallVector<DIGlobalVariableExpression *, 1> MDs;
- From->getDebugInfo(MDs);
- for (auto MD : MDs)
- To->addDebugInfo(MD);
-}
-
-static Align getAlign(GlobalVariable *GV) {
- return GV->getAlign().getValueOr(
- GV->getParent()->getDataLayout().getPreferredAlign(GV));
-}
-
-static bool
-isUnmergeableGlobal(GlobalVariable *GV,
- const SmallPtrSetImpl<const GlobalValue *> &UsedGlobals) {
- // Only process constants with initializers in the default address space.
- return !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
- GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
+//===- ConstantMerge.cpp - Merge duplicate global constants ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the interface to a pass that merges duplicate global
+// constants together into a single constant that is shared. This is useful
+// because some passes (e.g., TraceValues) insert a lot of string constants into
+// the program, regardless of whether or not an existing string is available.
+//
+// Algorithm: ConstantMerge is designed to build up a map of available constants
+// and eliminate duplicates when it is initialized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ConstantMerge.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "constmerge"
+
+STATISTIC(NumIdenticalMerged, "Number of identical global constants merged");
+
+/// Find values that are marked as llvm.used.
+static void FindUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) {
+ Value *Operand = Inits->getOperand(i)->stripPointerCasts();
+ GlobalValue *GV = cast<GlobalValue>(Operand);
+ UsedValues.insert(GV);
+ }
+}
+
+// True if A is better than B.
+static bool IsBetterCanonical(const GlobalVariable &A,
+ const GlobalVariable &B) {
+ if (!A.hasLocalLinkage() && B.hasLocalLinkage())
+ return true;
+
+ if (A.hasLocalLinkage() && !B.hasLocalLinkage())
+ return false;
+
+ return A.hasGlobalUnnamedAddr();
+}
+
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ GV->getAllMetadata(MDs);
+ for (const auto &V : MDs)
+ if (V.first != LLVMContext::MD_dbg)
+ return true;
+ return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+ GlobalVariable *To) {
+ SmallVector<DIGlobalVariableExpression *, 1> MDs;
+ From->getDebugInfo(MDs);
+ for (auto MD : MDs)
+ To->addDebugInfo(MD);
+}
+
+static Align getAlign(GlobalVariable *GV) {
+ return GV->getAlign().getValueOr(
+ GV->getParent()->getDataLayout().getPreferredAlign(GV));
+}
+
+static bool
+isUnmergeableGlobal(GlobalVariable *GV,
+ const SmallPtrSetImpl<const GlobalValue *> &UsedGlobals) {
+ // Only process constants with initializers in the default address space.
+ return !GV->isConstant() || !GV->hasDefinitiveInitializer() ||
+ GV->getType()->getAddressSpace() != 0 || GV->hasSection() ||
// Don't touch thread-local variables.
GV->isThreadLocal() ||
- // Don't touch values marked with attribute(used).
- UsedGlobals.count(GV);
-}
-
-enum class CanMerge { No, Yes };
-static CanMerge makeMergeable(GlobalVariable *Old, GlobalVariable *New) {
- if (!Old->hasGlobalUnnamedAddr() && !New->hasGlobalUnnamedAddr())
- return CanMerge::No;
- if (hasMetadataOtherThanDebugLoc(Old))
- return CanMerge::No;
- assert(!hasMetadataOtherThanDebugLoc(New));
- if (!Old->hasGlobalUnnamedAddr())
- New->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
- return CanMerge::Yes;
-}
-
-static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) {
- Constant *NewConstant = New;
-
- LLVM_DEBUG(dbgs() << "Replacing global: @" << Old->getName() << " -> @"
- << New->getName() << "\n");
-
- // Bump the alignment if necessary.
- if (Old->getAlign() || New->getAlign())
- New->setAlignment(std::max(getAlign(Old), getAlign(New)));
-
- copyDebugLocMetadata(Old, New);
- Old->replaceAllUsesWith(NewConstant);
-
- // Delete the global value from the module.
- assert(Old->hasLocalLinkage() &&
- "Refusing to delete an externally visible global variable.");
- Old->eraseFromParent();
-}
-
-static bool mergeConstants(Module &M) {
- // Find all the globals that are marked "used". These cannot be merged.
- SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
- FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
- FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
-
- // Map unique constants to globals.
- DenseMap<Constant *, GlobalVariable *> CMap;
-
- SmallVector<std::pair<GlobalVariable *, GlobalVariable *>, 32>
- SameContentReplacements;
-
- size_t ChangesMade = 0;
- size_t OldChangesMade = 0;
-
- // Iterate constant merging while we are still making progress. Merging two
- // constants together may allow us to merge other constants together if the
- // second level constants have initializers which point to the globals that
- // were just merged.
- while (true) {
- // Find the canonical constants others will be merged with.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
- // If this GV is dead, remove it.
- GV->removeDeadConstantUsers();
- if (GV->use_empty() && GV->hasLocalLinkage()) {
- GV->eraseFromParent();
- ++ChangesMade;
- continue;
- }
-
- if (isUnmergeableGlobal(GV, UsedGlobals))
- continue;
-
- // This transformation is legal for weak ODR globals in the sense it
- // doesn't change semantics, but we really don't want to perform it
- // anyway; it's likely to pessimize code generation, and some tools
- // (like the Darwin linker in cases involving CFString) don't expect it.
- if (GV->isWeakForLinker())
- continue;
-
- // Don't touch globals with metadata other than !dbg.
- if (hasMetadataOtherThanDebugLoc(GV))
- continue;
-
- Constant *Init = GV->getInitializer();
-
- // Check to see if the initializer is already known.
- GlobalVariable *&Slot = CMap[Init];
-
- // If this is the first constant we find or if the old one is local,
- // replace with the current one. If the current is externally visible
- // it cannot be replaced, but it can be the canonical constant we merge with.
- bool FirstConstantFound = !Slot;
- if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) {
- Slot = GV;
- LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName()
- << (FirstConstantFound ? "\n" : " (updated)\n"));
- }
- }
-
- // Identify all globals that can be merged together, filling in the
- // SameContentReplacements vector. We cannot do the replacement in this pass
- // because doing so may cause initializers of other globals to be rewritten,
- // invalidating the Constant* pointers in CMap.
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
-
- if (isUnmergeableGlobal(GV, UsedGlobals))
- continue;
-
- // We can only replace constants with local linkage.
- if (!GV->hasLocalLinkage())
- continue;
-
- Constant *Init = GV->getInitializer();
-
- // Check to see if the initializer is already known.
- auto Found = CMap.find(Init);
- if (Found == CMap.end())
- continue;
-
- GlobalVariable *Slot = Found->second;
- if (Slot == GV)
- continue;
-
- if (makeMergeable(GV, Slot) == CanMerge::No)
- continue;
-
- // Make all uses of the duplicate constant use the canonical version.
- LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @"
- << Slot->getName() << "\n");
- SameContentReplacements.push_back(std::make_pair(GV, Slot));
- }
-
- // Now that we have figured out which replacements must be made, do them all
- // now. This avoids invalidating the pointers in CMap, which are unneeded
- // now.
- for (unsigned i = 0, e = SameContentReplacements.size(); i != e; ++i) {
- GlobalVariable *Old = SameContentReplacements[i].first;
- GlobalVariable *New = SameContentReplacements[i].second;
- replace(M, Old, New);
- ++ChangesMade;
- ++NumIdenticalMerged;
- }
-
- if (ChangesMade == OldChangesMade)
- break;
- OldChangesMade = ChangesMade;
-
- SameContentReplacements.clear();
- CMap.clear();
- }
-
- return ChangesMade;
-}
-
-PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
- if (!mergeConstants(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct ConstantMergeLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- ConstantMergeLegacyPass() : ModulePass(ID) {
- initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // For this pass, process all of the globals in the module, eliminating
- // duplicate constants.
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return mergeConstants(M);
- }
-};
-
-} // end anonymous namespace
-
-char ConstantMergeLegacyPass::ID = 0;
-
-INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
- "Merge Duplicate Global Constants", false, false)
-
-ModulePass *llvm::createConstantMergePass() {
- return new ConstantMergeLegacyPass();
-}
+ // Don't touch values marked with attribute(used).
+ UsedGlobals.count(GV);
+}
+
+enum class CanMerge { No, Yes };
+static CanMerge makeMergeable(GlobalVariable *Old, GlobalVariable *New) {
+ if (!Old->hasGlobalUnnamedAddr() && !New->hasGlobalUnnamedAddr())
+ return CanMerge::No;
+ if (hasMetadataOtherThanDebugLoc(Old))
+ return CanMerge::No;
+ assert(!hasMetadataOtherThanDebugLoc(New));
+ if (!Old->hasGlobalUnnamedAddr())
+ New->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+ return CanMerge::Yes;
+}
+
+static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) {
+ Constant *NewConstant = New;
+
+ LLVM_DEBUG(dbgs() << "Replacing global: @" << Old->getName() << " -> @"
+ << New->getName() << "\n");
+
+ // Bump the alignment if necessary.
+ if (Old->getAlign() || New->getAlign())
+ New->setAlignment(std::max(getAlign(Old), getAlign(New)));
+
+ copyDebugLocMetadata(Old, New);
+ Old->replaceAllUsesWith(NewConstant);
+
+ // Delete the global value from the module.
+ assert(Old->hasLocalLinkage() &&
+ "Refusing to delete an externally visible global variable.");
+ Old->eraseFromParent();
+}
+
+static bool mergeConstants(Module &M) {
+ // Find all the globals that are marked "used". These cannot be merged.
+ SmallPtrSet<const GlobalValue*, 8> UsedGlobals;
+ FindUsedValues(M.getGlobalVariable("llvm.used"), UsedGlobals);
+ FindUsedValues(M.getGlobalVariable("llvm.compiler.used"), UsedGlobals);
+
+ // Map unique constants to globals.
+ DenseMap<Constant *, GlobalVariable *> CMap;
+
+ SmallVector<std::pair<GlobalVariable *, GlobalVariable *>, 32>
+ SameContentReplacements;
+
+ size_t ChangesMade = 0;
+ size_t OldChangesMade = 0;
+
+ // Iterate constant merging while we are still making progress. Merging two
+ // constants together may allow us to merge other constants together if the
+ // second level constants have initializers which point to the globals that
+ // were just merged.
+ while (true) {
+ // Find the canonical constants others will be merged with.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ // If this GV is dead, remove it.
+ GV->removeDeadConstantUsers();
+ if (GV->use_empty() && GV->hasLocalLinkage()) {
+ GV->eraseFromParent();
+ ++ChangesMade;
+ continue;
+ }
+
+ if (isUnmergeableGlobal(GV, UsedGlobals))
+ continue;
+
+ // This transformation is legal for weak ODR globals in the sense it
+ // doesn't change semantics, but we really don't want to perform it
+ // anyway; it's likely to pessimize code generation, and some tools
+ // (like the Darwin linker in cases involving CFString) don't expect it.
+ if (GV->isWeakForLinker())
+ continue;
+
+ // Don't touch globals with metadata other than !dbg.
+ if (hasMetadataOtherThanDebugLoc(GV))
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ GlobalVariable *&Slot = CMap[Init];
+
+ // If this is the first constant we find or if the old one is local,
+ // replace with the current one. If the current is externally visible
+ // it cannot be replaced, but it can be the canonical constant we merge with.
+ bool FirstConstantFound = !Slot;
+ if (FirstConstantFound || IsBetterCanonical(*GV, *Slot)) {
+ Slot = GV;
+ LLVM_DEBUG(dbgs() << "Cmap[" << *Init << "] = " << GV->getName()
+ << (FirstConstantFound ? "\n" : " (updated)\n"));
+ }
+ }
+
+ // Identify all globals that can be merged together, filling in the
+ // SameContentReplacements vector. We cannot do the replacement in this pass
+ // because doing so may cause initializers of other globals to be rewritten,
+ // invalidating the Constant* pointers in CMap.
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+
+ if (isUnmergeableGlobal(GV, UsedGlobals))
+ continue;
+
+ // We can only replace constants with local linkage.
+ if (!GV->hasLocalLinkage())
+ continue;
+
+ Constant *Init = GV->getInitializer();
+
+ // Check to see if the initializer is already known.
+ auto Found = CMap.find(Init);
+ if (Found == CMap.end())
+ continue;
+
+ GlobalVariable *Slot = Found->second;
+ if (Slot == GV)
+ continue;
+
+ if (makeMergeable(GV, Slot) == CanMerge::No)
+ continue;
+
+ // Make all uses of the duplicate constant use the canonical version.
+ LLVM_DEBUG(dbgs() << "Will replace: @" << GV->getName() << " -> @"
+ << Slot->getName() << "\n");
+ SameContentReplacements.push_back(std::make_pair(GV, Slot));
+ }
+
+ // Now that we have figured out which replacements must be made, do them all
+ // now. This avoids invalidating the pointers in CMap, which are unneeded
+ // now.
+ for (unsigned i = 0, e = SameContentReplacements.size(); i != e; ++i) {
+ GlobalVariable *Old = SameContentReplacements[i].first;
+ GlobalVariable *New = SameContentReplacements[i].second;
+ replace(M, Old, New);
+ ++ChangesMade;
+ ++NumIdenticalMerged;
+ }
+
+ if (ChangesMade == OldChangesMade)
+ break;
+ OldChangesMade = ChangesMade;
+
+ SameContentReplacements.clear();
+ CMap.clear();
+ }
+
+ return ChangesMade;
+}
+
+PreservedAnalyses ConstantMergePass::run(Module &M, ModuleAnalysisManager &) {
+ if (!mergeConstants(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct ConstantMergeLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ ConstantMergeLegacyPass() : ModulePass(ID) {
+ initializeConstantMergeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // For this pass, process all of the globals in the module, eliminating
+ // duplicate constants.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return mergeConstants(M);
+ }
+};
+
+} // end anonymous namespace
+
+char ConstantMergeLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ConstantMergeLegacyPass, "constmerge",
+ "Merge Duplicate Global Constants", false, false)
+
+ModulePass *llvm::createConstantMergePass() {
+ return new ConstantMergeLegacyPass();
+}
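
A quick way to exercise the pass outside the legacy pass manager is to schedule ConstantMergePass directly; this is a minimal sketch, assuming a Module M that already contains, say, two identical private unnamed_addr string constants the pass can collapse into one.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/ConstantMerge.h"
using namespace llvm;

static void runConstantMerge(Module &M) {
  PassBuilder PB;
  ModuleAnalysisManager MAM;
  // Registers PassInstrumentationAnalysis and friends so MPM.run() can query them.
  PB.registerModuleAnalyses(MAM);
  ModulePassManager MPM;
  MPM.addPass(ConstantMergePass());
  MPM.run(M, MAM);
}

Only constants with local linkage are ever deleted; an externally visible duplicate can at most serve as the canonical copy that the local duplicates are folded into.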
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
index 88b9cc5fe4..2fe9a59ad2 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -1,175 +1,175 @@
-//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass exports all llvm.bitsets found in the module in the form of a
-// __cfi_check function, which can be used to verify cross-DSO call targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/CrossDSOCFI.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "cross-dso-cfi"
-
-STATISTIC(NumTypeIds, "Number of unique type identifiers");
-
-namespace {
-
-struct CrossDSOCFI : public ModulePass {
- static char ID;
- CrossDSOCFI() : ModulePass(ID) {
- initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
- }
-
- MDNode *VeryLikelyWeights;
-
- ConstantInt *extractNumericTypeId(MDNode *MD);
- void buildCFICheck(Module &M);
- bool runOnModule(Module &M) override;
-};
-
-} // anonymous namespace
-
-INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
- false)
-INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
-char CrossDSOCFI::ID = 0;
-
-ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
-
-/// Extracts a numeric type identifier from an MDNode containing type metadata.
-ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
- // This check excludes vtables for classes inside anonymous namespaces.
- auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
- if (!TM)
- return nullptr;
- auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
- if (!C) return nullptr;
- // We are looking for i64 constants.
- if (C->getBitWidth() != 64) return nullptr;
-
- return C;
-}
-
-/// buildCFICheck - emits __cfi_check for the current module.
-void CrossDSOCFI::buildCFICheck(Module &M) {
- // FIXME: verify that __cfi_check ends up near the end of the code section,
- // but before the jump slots created in LowerTypeTests.
- SetVector<uint64_t> TypeIds;
- SmallVector<MDNode *, 2> Types;
- for (GlobalObject &GO : M.global_objects()) {
- Types.clear();
- GO.getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types)
- if (ConstantInt *TypeId = extractNumericTypeId(Type))
- TypeIds.insert(TypeId->getZExtValue());
- }
-
- NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
- if (CfiFunctionsMD) {
- for (auto Func : CfiFunctionsMD->operands()) {
- assert(Func->getNumOperands() >= 2);
- for (unsigned I = 2; I < Func->getNumOperands(); ++I)
- if (ConstantInt *TypeId =
- extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
- TypeIds.insert(TypeId->getZExtValue());
- }
- }
-
- LLVMContext &Ctx = M.getContext();
- FunctionCallee C = M.getOrInsertFunction(
- "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
- Function *F = cast<Function>(C.getCallee());
- // Take over the existing function. The frontend emits a weak stub so that the
- // linker knows about the symbol; this pass replaces the function body.
- F->deleteBody();
- F->setAlignment(Align(4096));
-
- Triple T(M.getTargetTriple());
- if (T.isARM() || T.isThumb())
- F->addFnAttr("target-features", "+thumb-mode");
-
- auto args = F->arg_begin();
- Value &CallSiteTypeId = *(args++);
- CallSiteTypeId.setName("CallSiteTypeId");
- Value &Addr = *(args++);
- Addr.setName("Addr");
- Value &CFICheckFailData = *(args++);
- CFICheckFailData.setName("CFICheckFailData");
- assert(args == F->arg_end());
-
- BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
- BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
-
- BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
- IRBuilder<> IRBFail(TrapBB);
- FunctionCallee CFICheckFailFn =
- M.getOrInsertFunction("__cfi_check_fail", Type::getVoidTy(Ctx),
- Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
- IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
- IRBFail.CreateBr(ExitBB);
-
- IRBuilder<> IRBExit(ExitBB);
- IRBExit.CreateRetVoid();
-
- IRBuilder<> IRB(BB);
- SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
- for (uint64_t TypeId : TypeIds) {
- ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
- BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
- IRBuilder<> IRBTest(TestBB);
- Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
-
- Value *Test = IRBTest.CreateCall(
- BitsetTestFn, {&Addr, MetadataAsValue::get(
- Ctx, ConstantAsMetadata::get(CaseTypeId))});
- BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
- BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
-
- SI->addCase(CaseTypeId, TestBB);
- ++NumTypeIds;
- }
-}
-
-bool CrossDSOCFI::runOnModule(Module &M) {
- VeryLikelyWeights =
- MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
- if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
- return false;
- buildCFICheck(M);
- return true;
-}
-
-PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
- CrossDSOCFI Impl;
- bool Changed = Impl.runOnModule(M);
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+//===-- CrossDSOCFI.cpp - Externalize this module's CFI checks ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass exports all llvm.bitsets found in the module in the form of a
+// __cfi_check function, which can be used to verify cross-DSO call targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/CrossDSOCFI.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "cross-dso-cfi"
+
+STATISTIC(NumTypeIds, "Number of unique type identifiers");
+
+namespace {
+
+struct CrossDSOCFI : public ModulePass {
+ static char ID;
+ CrossDSOCFI() : ModulePass(ID) {
+ initializeCrossDSOCFIPass(*PassRegistry::getPassRegistry());
+ }
+
+ MDNode *VeryLikelyWeights;
+
+ ConstantInt *extractNumericTypeId(MDNode *MD);
+ void buildCFICheck(Module &M);
+ bool runOnModule(Module &M) override;
+};
+
+} // anonymous namespace
+
+INITIALIZE_PASS_BEGIN(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false,
+ false)
+INITIALIZE_PASS_END(CrossDSOCFI, "cross-dso-cfi", "Cross-DSO CFI", false, false)
+char CrossDSOCFI::ID = 0;
+
+ModulePass *llvm::createCrossDSOCFIPass() { return new CrossDSOCFI; }
+
+/// Extracts a numeric type identifier from an MDNode containing type metadata.
+ConstantInt *CrossDSOCFI::extractNumericTypeId(MDNode *MD) {
+ // This check excludes vtables for classes inside anonymous namespaces.
+ auto TM = dyn_cast<ValueAsMetadata>(MD->getOperand(1));
+ if (!TM)
+ return nullptr;
+ auto C = dyn_cast_or_null<ConstantInt>(TM->getValue());
+ if (!C) return nullptr;
+ // We are looking for i64 constants.
+ if (C->getBitWidth() != 64) return nullptr;
+
+ return C;
+}
+
+/// buildCFICheck - emits __cfi_check for the current module.
+void CrossDSOCFI::buildCFICheck(Module &M) {
+ // FIXME: verify that __cfi_check ends up near the end of the code section,
+ // but before the jump slots created in LowerTypeTests.
+ SetVector<uint64_t> TypeIds;
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalObject &GO : M.global_objects()) {
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types)
+ if (ConstantInt *TypeId = extractNumericTypeId(Type))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto Func : CfiFunctionsMD->operands()) {
+ assert(Func->getNumOperands() >= 2);
+ for (unsigned I = 2; I < Func->getNumOperands(); ++I)
+ if (ConstantInt *TypeId =
+ extractNumericTypeId(cast<MDNode>(Func->getOperand(I).get())))
+ TypeIds.insert(TypeId->getZExtValue());
+ }
+ }
+
+ LLVMContext &Ctx = M.getContext();
+ FunctionCallee C = M.getOrInsertFunction(
+ "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
+ Function *F = cast<Function>(C.getCallee());
+ // Take over the existing function. The frontend emits a weak stub so that the
+ // linker knows about the symbol; this pass replaces the function body.
+ F->deleteBody();
+ F->setAlignment(Align(4096));
+
+ Triple T(M.getTargetTriple());
+ if (T.isARM() || T.isThumb())
+ F->addFnAttr("target-features", "+thumb-mode");
+
+ auto args = F->arg_begin();
+ Value &CallSiteTypeId = *(args++);
+ CallSiteTypeId.setName("CallSiteTypeId");
+ Value &Addr = *(args++);
+ Addr.setName("Addr");
+ Value &CFICheckFailData = *(args++);
+ CFICheckFailData.setName("CFICheckFailData");
+ assert(args == F->arg_end());
+
+ BasicBlock *BB = BasicBlock::Create(Ctx, "entry", F);
+ BasicBlock *ExitBB = BasicBlock::Create(Ctx, "exit", F);
+
+ BasicBlock *TrapBB = BasicBlock::Create(Ctx, "fail", F);
+ IRBuilder<> IRBFail(TrapBB);
+ FunctionCallee CFICheckFailFn =
+ M.getOrInsertFunction("__cfi_check_fail", Type::getVoidTy(Ctx),
+ Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
+ IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
+ IRBFail.CreateBr(ExitBB);
+
+ IRBuilder<> IRBExit(ExitBB);
+ IRBExit.CreateRetVoid();
+
+ IRBuilder<> IRB(BB);
+ SwitchInst *SI = IRB.CreateSwitch(&CallSiteTypeId, TrapBB, TypeIds.size());
+ for (uint64_t TypeId : TypeIds) {
+ ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
+ BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
+ IRBuilder<> IRBTest(TestBB);
+ Function *BitsetTestFn = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ Value *Test = IRBTest.CreateCall(
+ BitsetTestFn, {&Addr, MetadataAsValue::get(
+ Ctx, ConstantAsMetadata::get(CaseTypeId))});
+ BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
+ BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
+
+ SI->addCase(CaseTypeId, TestBB);
+ ++NumTypeIds;
+ }
+}
+
+bool CrossDSOCFI::runOnModule(Module &M) {
+ VeryLikelyWeights =
+ MDBuilder(M.getContext()).createBranchWeights((1U << 20) - 1, 1);
+ if (M.getModuleFlag("Cross-DSO CFI") == nullptr)
+ return false;
+ buildCFICheck(M);
+ return true;
+}
+
+PreservedAnalyses CrossDSOCFIPass::run(Module &M, ModuleAnalysisManager &AM) {
+ CrossDSOCFI Impl;
+ bool Changed = Impl.runOnModule(M);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
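
For reference, the pass is a no-op unless the module carries the "Cross-DSO CFI" module flag checked in runOnModule above. A minimal sketch of opting a module in from C++ (normally the frontend does this, typically clang when cross-DSO CFI sanitization is enabled):

#include "llvm/IR/Module.h"
using namespace llvm;

static void enableCrossDsoCfi(Module &M) {
  // buildCFICheck() only runs when this flag is present; Override behavior is
  // assumed here to mirror what the frontend emits for this flag.
  M.addModuleFlag(Module::Override, "Cross-DSO CFI", 1);
}

With the flag set, running the pass emits a 4096-byte-aligned __cfi_check that switches on the incoming type id and branches to __cfi_check_fail for addresses that fail the llvm.type.test.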
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
index bfb1a83473..0b763e423f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -1,1124 +1,1124 @@
-//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass deletes dead arguments from internal functions. Dead argument
-// elimination removes arguments which are directly dead, as well as arguments
-// only passed into function calls as dead arguments of other functions. This
-// pass also deletes dead return values in a similar way.
-//
-// This pass is often useful as a cleanup pass to run after aggressive
-// interprocedural passes, which add possibly-dead arguments or return values.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "deadargelim"
-
-STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
-STATISTIC(NumRetValsEliminated , "Number of unused return values removed");
-STATISTIC(NumArgumentsReplacedWithUndef,
- "Number of unread args replaced with undef");
-
-namespace {
-
- /// DAE - The dead argument elimination pass.
- class DAE : public ModulePass {
- protected:
- // DAH uses this to specify a different ID.
- explicit DAE(char &ID) : ModulePass(ID) {}
-
- public:
- static char ID; // Pass identification, replacement for typeid
-
- DAE() : ModulePass(ID) {
- initializeDAEPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- DeadArgumentEliminationPass DAEP(ShouldHackArguments());
- ModuleAnalysisManager DummyMAM;
- PreservedAnalyses PA = DAEP.run(M, DummyMAM);
- return !PA.areAllPreserved();
- }
-
- virtual bool ShouldHackArguments() const { return false; }
- };
-
-} // end anonymous namespace
-
-char DAE::ID = 0;
-
-INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
-
-namespace {
-
- /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
- /// deletes arguments to functions which are external. This is only for use
- /// by bugpoint.
- struct DAH : public DAE {
- static char ID;
-
- DAH() : DAE(ID) {}
-
- bool ShouldHackArguments() const override { return true; }
- };
-
-} // end anonymous namespace
-
-char DAH::ID = 0;
-
-INITIALIZE_PASS(DAH, "deadarghaX0r",
- "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
- false, false)
-
-/// createDeadArgEliminationPass - This pass removes arguments from functions
-/// which are not used by the body of the function.
-ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
-
-ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
-
-/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
-/// llvm.vastart is never called, the varargs list is dead for the function.
-bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
- assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
- if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
-
- // Ensure that the function is only directly called.
- if (Fn.hasAddressTaken())
- return false;
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (Fn.hasFnAttribute(Attribute::Naked)) {
- return false;
- }
-
- // Okay, we know we can transform this function if safe. Scan its body
- // looking for calls marked musttail or calls to llvm.vastart.
- for (BasicBlock &BB : Fn) {
- for (Instruction &I : BB) {
- CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI)
- continue;
- if (CI->isMustTailCall())
- return false;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- if (II->getIntrinsicID() == Intrinsic::vastart)
- return false;
- }
- }
- }
-
- // If we get here, there are no calls to llvm.vastart in the function body;
- // remove the "..." and adjust all the calls.
-
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but doesn't have isVarArg set.
- FunctionType *FTy = Fn.getFunctionType();
-
- std::vector<Type *> Params(FTy->param_begin(), FTy->param_end());
- FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
- Params, false);
- unsigned NumArgs = Params.size();
-
- // Create the new function body and insert it into the module...
- Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace());
- NF->copyAttributesFrom(&Fn);
- NF->setComdat(Fn.getComdat());
- Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
- NF->takeName(&Fn);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in a smaller number of arguments into the new function.
- //
- std::vector<Value *> Args;
- for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
- CallBase *CB = dyn_cast<CallBase>(*I++);
- if (!CB)
- continue;
-
- // Pass all the same arguments.
- Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
-
- // Drop any attributes that were on the vararg arguments.
- AttributeList PAL = CB->getAttributes();
- if (!PAL.isEmpty()) {
- SmallVector<AttributeSet, 8> ArgAttrs;
- for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
- ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
- PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
- PAL.getRetAttributes(), ArgAttrs);
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB->getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCB = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
- NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", CB);
- } else {
- NewCB = CallInst::Create(NF, Args, OpBundles, "", CB);
- cast<CallInst>(NewCB)->setTailCallKind(
- cast<CallInst>(CB)->getTailCallKind());
- }
- NewCB->setCallingConv(CB->getCallingConv());
- NewCB->setAttributes(PAL);
- NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
-
- Args.clear();
-
- if (!CB->use_empty())
- CB->replaceAllUsesWith(NewCB);
-
- NewCB->takeName(CB);
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB->eraseFromParent();
- }
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments, transferring over the names as well. While we're at
- // it, remove the dead arguments from the DeadArguments list.
- for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
- I2 = NF->arg_begin(); I != E; ++I, ++I2) {
- // Move the name and users over to the new version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- }
-
- // Clone metadata from the old function, including debug info descriptor.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- Fn.getAllMetadata(MDs);
- for (auto MD : MDs)
- NF->addMetadata(MD.first, *MD.second);
-
- // Fix up any BlockAddresses that refer to the function.
- Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
- // Delete the bitcast that we just created, so that NF does not
- // appear to be address-taken.
- NF->removeDeadConstantUsers();
- // Finally, nuke the old function.
- Fn.eraseFromParent();
- return true;
-}
-
-/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
-/// arguments that are unused, and changes the caller parameters to be undefined
-/// instead.
-bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
- // We cannot change the arguments if this TU does not define the function or
- // if the linker may choose a function body from another TU, even if the
- // nominal linkage indicates that other copies of the function have the same
- // semantics. In the below example, the dead load from %p may not have been
- // eliminated from the linker-chosen copy of f, so replacing %p with undef
- // in callers may introduce undefined behavior.
- //
- // define linkonce_odr void @f(i32* %p) {
- // %v = load i32 %p
- // ret void
- // }
- if (!Fn.hasExactDefinition())
- return false;
-
- // Functions with local linkage should already have been handled, except the
- // fragile (variadic) ones which we can improve here.
- if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
- return false;
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (Fn.hasFnAttribute(Attribute::Naked))
- return false;
-
- if (Fn.use_empty())
- return false;
-
- SmallVector<unsigned, 8> UnusedArgs;
- bool Changed = false;
-
- for (Argument &Arg : Fn.args()) {
- if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
+//===- DeadArgumentElimination.cpp - Eliminate dead arguments -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass deletes dead arguments from internal functions. Dead argument
+// elimination removes arguments which are directly dead, as well as arguments
+// only passed into function calls as dead arguments of other functions. This
+// pass also deletes dead return values in a similar way.
+//
+// This pass is often useful as a cleanup pass to run after aggressive
+// interprocedural passes, which add possibly-dead arguments or return values.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/DeadArgumentElimination.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "deadargelim"
+
+STATISTIC(NumArgumentsEliminated, "Number of unread args removed");
+STATISTIC(NumRetValsEliminated , "Number of unused return values removed");
+STATISTIC(NumArgumentsReplacedWithUndef,
+ "Number of unread args replaced with undef");
+
+namespace {
+
+ /// DAE - The dead argument elimination pass.
+ class DAE : public ModulePass {
+ protected:
+ // DAH uses this to specify a different ID.
+ explicit DAE(char &ID) : ModulePass(ID) {}
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DAE() : ModulePass(ID) {
+ initializeDAEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ DeadArgumentEliminationPass DAEP(ShouldHackArguments());
+ ModuleAnalysisManager DummyMAM;
+ PreservedAnalyses PA = DAEP.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ virtual bool ShouldHackArguments() const { return false; }
+ };
+
+} // end anonymous namespace
+
+char DAE::ID = 0;
+
+INITIALIZE_PASS(DAE, "deadargelim", "Dead Argument Elimination", false, false)
+
+namespace {
+
+ /// DAH - DeadArgumentHacking pass - Same as dead argument elimination, but
+ /// deletes arguments to functions which are external. This is only for use
+ /// by bugpoint.
+ struct DAH : public DAE {
+ static char ID;
+
+ DAH() : DAE(ID) {}
+
+ bool ShouldHackArguments() const override { return true; }
+ };
+
+} // end anonymous namespace
+
+char DAH::ID = 0;
+
+INITIALIZE_PASS(DAH, "deadarghaX0r",
+ "Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)",
+ false, false)
+
+/// createDeadArgEliminationPass - This pass removes arguments from functions
+/// which are not used by the body of the function.
+ModulePass *llvm::createDeadArgEliminationPass() { return new DAE(); }
+
+ModulePass *llvm::createDeadArgHackingPass() { return new DAH(); }
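+
+// Minimal usage sketch (illustrative, not part of this file): the legacy pass
+// can be scheduled through a legacy::PassManager, e.g.
+//
+//   legacy::PassManager PM;
+//   PM.add(createDeadArgEliminationPass());
+//   PM.run(M);  // M is an llvm::Module
+//
+// while the new pass manager runs DeadArgumentEliminationPass directly from a
+// ModulePassManager.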
+
+/// DeleteDeadVarargs - If this is a function that takes a ... list, and if
+/// llvm.vastart is never called, the varargs list is dead for the function.
+bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
+ assert(Fn.getFunctionType()->isVarArg() && "Function isn't varargs!");
+ if (Fn.isDeclaration() || !Fn.hasLocalLinkage()) return false;
+
+ // Ensure that the function is only directly called.
+ if (Fn.hasAddressTaken())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked)) {
+ return false;
+ }
+
+ // Okay, we know we can transform this function if safe. Scan its body
+ // looking for calls marked musttail or calls to llvm.vastart.
+ for (BasicBlock &BB : Fn) {
+ for (Instruction &I : BB) {
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+ if (CI->isMustTailCall())
+ return false;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ return false;
+ }
+ }
+ }
+
+ // If we get here, there are no calls to llvm.vastart in the function body;
+ // remove the "..." and adjust all the calls.
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but doesn't have isVarArg set.
+ FunctionType *FTy = Fn.getFunctionType();
+
+ std::vector<Type *> Params(FTy->param_begin(), FTy->param_end());
+ FunctionType *NFTy = FunctionType::get(FTy->getReturnType(),
+ Params, false);
+ unsigned NumArgs = Params.size();
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, Fn.getLinkage(), Fn.getAddressSpace());
+ NF->copyAttributesFrom(&Fn);
+ NF->setComdat(Fn.getComdat());
+ Fn.getParent()->getFunctionList().insert(Fn.getIterator(), NF);
+ NF->takeName(&Fn);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass in a smaller number of arguments into the new function.
+ //
+ std::vector<Value *> Args;
+ for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
+ CallBase *CB = dyn_cast<CallBase>(*I++);
+ if (!CB)
+ continue;
+
+ // Pass all the same arguments.
+ Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+ // Drop any attributes that were on the vararg arguments.
+ AttributeList PAL = CB->getAttributes();
+ if (!PAL.isEmpty()) {
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
+ ArgAttrs.push_back(PAL.getParamAttributes(ArgNo));
+ PAL = AttributeList::get(Fn.getContext(), PAL.getFnAttributes(),
+ PAL.getRetAttributes(), ArgAttrs);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB);
+ } else {
+ NewCB = CallInst::Create(NF, Args, OpBundles, "", CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(CB)->getTailCallKind());
+ }
+ NewCB->setCallingConv(CB->getCallingConv());
+ NewCB->setAttributes(PAL);
+ NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+ Args.clear();
+
+ if (!CB->use_empty())
+ CB->replaceAllUsesWith(NewCB);
+
+ NewCB->takeName(CB);
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB->eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), Fn.getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over to
+ // the new arguments and transferring the names over as well. While we're at
+ // it, remove the dead arguments from the DeadArguments list.
+ for (Function::arg_iterator I = Fn.arg_begin(), E = Fn.arg_end(),
+ I2 = NF->arg_begin(); I != E; ++I, ++I2) {
+ // Move the name and users over to the new version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ }
+
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ Fn.getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
+
+ // Fix up any BlockAddresses that refer to the function.
+ Fn.replaceAllUsesWith(ConstantExpr::getBitCast(NF, Fn.getType()));
+ // Delete the bitcast that we just created, so that NF does not
+ // appear to be address-taken.
+ NF->removeDeadConstantUsers();
+ // Finally, nuke the old function.
+ Fn.eraseFromParent();
+ return true;
+}
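+
+// Illustrative sketch (an assumed example): an internal varargs function that
+// never calls llvm.va_start, such as
+//
+//   define internal void @log(i8* %fmt, ...) { ret void }
+//   call void (i8*, ...) @log(i8* %p, i32 7)
+//
+// is rewritten by DeleteDeadVarargs to drop the "..." and the extra call
+// operands:
+//
+//   define internal void @log(i8* %fmt) { ret void }
+//   call void @log(i8* %p)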
+
+/// RemoveDeadArgumentsFromCallers - Checks if the given function has any
+/// arguments that are unused, and replaces the corresponding call-site
+/// arguments with undef instead.
+bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
+ // We cannot change the arguments if this TU does not define the function or
+ // if the linker may choose a function body from another TU, even if the
+ // nominal linkage indicates that other copies of the function have the same
+ // semantics. In the below example, the dead load from %p may not have been
+ // eliminated from the linker-chosen copy of f, so replacing %p with undef
+ // in callers may introduce undefined behavior.
+ //
+ // define linkonce_odr void @f(i32* %p) {
+ // %v = load i32 %p
+ // ret void
+ // }
+ if (!Fn.hasExactDefinition())
+ return false;
+
+ // Functions with local linkage should already have been handled, except the
+ // fragile (variadic) ones which we can improve here.
+ if (Fn.hasLocalLinkage() && !Fn.getFunctionType()->isVarArg())
+ return false;
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (Fn.hasFnAttribute(Attribute::Naked))
+ return false;
+
+ if (Fn.use_empty())
+ return false;
+
+ SmallVector<unsigned, 8> UnusedArgs;
+ bool Changed = false;
+
+ for (Argument &Arg : Fn.args()) {
+ if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
!Arg.hasPassPointeeByValueCopyAttr()) {
- if (Arg.isUsedByMetadata()) {
- Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
- Changed = true;
- }
- UnusedArgs.push_back(Arg.getArgNo());
- }
- }
-
- if (UnusedArgs.empty())
- return false;
-
- for (Use &U : Fn.uses()) {
- CallBase *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB || !CB->isCallee(&U))
- continue;
-
- // Now go through all unused args and replace them with "undef".
- for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
- unsigned ArgNo = UnusedArgs[I];
-
- Value *Arg = CB->getArgOperand(ArgNo);
- CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType()));
- ++NumArgumentsReplacedWithUndef;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// Convenience function that returns the number of return values. It returns 0
-/// for void functions and 1 for functions not returning a struct or array. It
-/// returns the number of elements for functions returning a struct or array.
-static unsigned NumRetVals(const Function *F) {
- Type *RetTy = F->getReturnType();
- if (RetTy->isVoidTy())
- return 0;
- else if (StructType *STy = dyn_cast<StructType>(RetTy))
- return STy->getNumElements();
- else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
- return ATy->getNumElements();
- else
- return 1;
-}
-
-/// Returns the sub-type a function will return at a given Idx. Should
-/// correspond to the result type of an ExtractValue instruction executed with
-/// just that one Idx (i.e. only top-level structure is considered).
-static Type *getRetComponentType(const Function *F, unsigned Idx) {
- Type *RetTy = F->getReturnType();
- assert(!RetTy->isVoidTy() && "void type has no subtype");
-
- if (StructType *STy = dyn_cast<StructType>(RetTy))
- return STy->getElementType(Idx);
- else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
- return ATy->getElementType();
- else
- return RetTy;
-}
-
-/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
-/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
-/// liveness of Use.
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
- UseVector &MaybeLiveUses) {
- // We're live if our use or its Function is already marked as live.
- if (IsLive(Use))
- return Live;
-
- // We're maybe live otherwise, but remember that we must become live if
- // Use becomes live.
- MaybeLiveUses.push_back(Use);
- return MaybeLive;
-}
-
-/// SurveyUse - This looks at a single use of an argument or return value
-/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
-/// if it causes the used value to become MaybeLive.
-///
-/// RetValNum is the return value number to use when this use is used in a
-/// return instruction. This is used in the recursion; you should always leave
-/// it at 0.
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
- unsigned RetValNum) {
- const User *V = U->getUser();
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
- // The value is returned from a function. It's only live when the
- // function's return value is live. We use RetValNum here, for the case
- // that U is really a use of an insertvalue instruction that uses the
- // original Use.
- const Function *F = RI->getParent()->getParent();
- if (RetValNum != -1U) {
- RetOrArg Use = CreateRet(F, RetValNum);
- // We might be live, depending on the liveness of Use.
- return MarkIfNotLive(Use, MaybeLiveUses);
- } else {
- DeadArgumentEliminationPass::Liveness Result = MaybeLive;
- for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) {
- RetOrArg Use = CreateRet(F, Ri);
- // We might be live, depending on the liveness of Use. If any
- // sub-value is live, then the entire value is considered live. This
- // is a conservative choice, and better tracking is possible.
- DeadArgumentEliminationPass::Liveness SubResult =
- MarkIfNotLive(Use, MaybeLiveUses);
- if (Result != Live)
- Result = SubResult;
- }
- return Result;
- }
- }
- if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
- if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
- && IV->hasIndices())
- // The use we are examining is inserted into an aggregate. Our liveness
- // depends on all uses of that aggregate, but if it is used as a return
- // value, only the index at which we were inserted counts.
- RetValNum = *IV->idx_begin();
-
- // Note that if we are used as the aggregate operand to the insertvalue,
- // we don't change RetValNum, but do survey all our uses.
-
- Liveness Result = MaybeLive;
- for (const Use &UU : IV->uses()) {
- Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
- if (Result == Live)
- break;
- }
- return Result;
- }
-
- if (const auto *CB = dyn_cast<CallBase>(V)) {
- const Function *F = CB->getCalledFunction();
- if (F) {
- // Used in a direct call.
-
- // The function argument is live if it is used as a bundle operand.
- if (CB->isBundleOperand(U))
- return Live;
-
- // Find the argument number. We know for sure that this use is an
- // argument, since if it were the called function this would be an
- // indirect call, and we know we can't be looking at a value of the
- // label type (for the invoke instruction).
- unsigned ArgNo = CB->getArgOperandNo(U);
-
- if (ArgNo >= F->getFunctionType()->getNumParams())
- // The value is passed in through a vararg! Must be live.
- return Live;
-
- assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) &&
- "Argument is not where we expected it");
-
- // Value passed to a normal call. It's only live when the corresponding
- // argument to the called function turns out live.
- RetOrArg Use = CreateArg(F, ArgNo);
- return MarkIfNotLive(Use, MaybeLiveUses);
- }
- }
- // Used in any other way? Value must be live.
- return Live;
-}
-
-/// SurveyUses - This looks at all the uses of the given value and returns the
-/// Liveness deduced from those uses.
-///
-/// Adds all uses that cause the result to be MaybeLive to MaybeLiveUses. If
-/// the result is Live, MaybeLiveUses might be modified but its content should
-/// be ignored (since it might not be complete).
-DeadArgumentEliminationPass::Liveness
-DeadArgumentEliminationPass::SurveyUses(const Value *V,
- UseVector &MaybeLiveUses) {
- // Assume it's dead (which will only hold if there are no uses at all).
- Liveness Result = MaybeLive;
- // Check each use.
- for (const Use &U : V->uses()) {
- Result = SurveyUse(&U, MaybeLiveUses);
- if (Result == Live)
- break;
- }
- return Result;
-}
-
-// SurveyFunction - This performs the initial survey of the specified function,
-// checking whether it uses any of its incoming arguments or whether
-// any callers use the return value. This fills in the LiveValues set and Uses
-// map.
-//
-// We consider arguments of non-internal functions to be intrinsically alive as
-// well as arguments to functions which have their "address taken".
-void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
- // Functions with inalloca/preallocated parameters are expecting args in a
- // particular register and memory layout.
- if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
- F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
- MarkLive(F);
- return;
- }
-
- // Don't touch naked functions. The assembly might be using an argument, or
- // otherwise rely on the frame layout in a way that this analysis will not
- // see.
- if (F.hasFnAttribute(Attribute::Naked)) {
- MarkLive(F);
- return;
- }
-
- unsigned RetCount = NumRetVals(&F);
-
- // Assume all return values are dead
- using RetVals = SmallVector<Liveness, 5>;
-
- RetVals RetValLiveness(RetCount, MaybeLive);
-
- using RetUses = SmallVector<UseVector, 5>;
-
- // These vectors map each return value to the uses that make it MaybeLive, so
- // we can add those to the Uses map if the return value really turns out to be
- // MaybeLive. Initialized to a list of RetCount empty lists.
- RetUses MaybeLiveRetUses(RetCount);
-
- bool HasMustTailCalls = false;
-
- for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
- if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
- != F.getFunctionType()->getReturnType()) {
- // We don't support old style multiple return values.
- MarkLive(F);
- return;
- }
- }
-
- // If we have any returns of `musttail` results, the signature can't
- // change.
- if (BB->getTerminatingMustTailCall() != nullptr)
- HasMustTailCalls = true;
- }
-
- if (HasMustTailCalls) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail calls\n");
- }
-
- if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
- MarkLive(F);
- return;
- }
-
- LLVM_DEBUG(
- dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
- << F.getName() << "\n");
- // Keep track of the number of live retvals, so we can skip checks once all
- // of them turn out to be live.
- unsigned NumLiveRetVals = 0;
-
- bool HasMustTailCallers = false;
-
- // Loop all uses of the function.
- for (const Use &U : F.uses()) {
- // If the function is PASSED IN as an argument, its address has been
- // taken.
- const auto *CB = dyn_cast<CallBase>(U.getUser());
- if (!CB || !CB->isCallee(&U)) {
- MarkLive(F);
- return;
- }
-
- // The number of arguments for a `musttail` call must match the number of
- // arguments of the caller.
- if (CB->isMustTailCall())
- HasMustTailCallers = true;
-
- // If we end up here, we are looking at a direct call to our function.
-
- // Now, check how our return value(s) is/are used in this caller. Don't
- // bother checking return values if all of them are live already.
- if (NumLiveRetVals == RetCount)
- continue;
-
- // Check all uses of the return value.
- for (const Use &U : CB->uses()) {
- if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
- // This use uses a part of our return value; survey the uses of
- // that part and store the results for this index only.
- unsigned Idx = *Ext->idx_begin();
- if (RetValLiveness[Idx] != Live) {
- RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
- if (RetValLiveness[Idx] == Live)
- NumLiveRetVals++;
- }
- } else {
- // Used by something other than extractvalue. Survey, but assume that the
- // result applies to all sub-values.
- UseVector MaybeLiveAggregateUses;
- if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
- NumLiveRetVals = RetCount;
- RetValLiveness.assign(RetCount, Live);
- break;
- } else {
- for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
- if (RetValLiveness[Ri] != Live)
- MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(),
- MaybeLiveAggregateUses.end());
- }
- }
- }
- }
- }
-
- if (HasMustTailCallers) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
- << " has musttail callers\n");
- }
-
- // Now we've inspected all callers, record the liveness of our return values.
- for (unsigned Ri = 0; Ri != RetCount; ++Ri)
- MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]);
-
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
- << F.getName() << "\n");
-
- // Now, check all of our arguments.
- unsigned ArgI = 0;
- UseVector MaybeLiveArgUses;
- for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end();
- AI != E; ++AI, ++ArgI) {
- Liveness Result;
- if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
- HasMustTailCalls) {
- // Variadic functions will already have a va_arg function expanded inside
- // them, making them potentially very sensitive to ABI changes resulting
- // from removing arguments entirely, so don't. For example, AArch64 handles
- // register and stack HFAs very differently, and this is reflected in the
- // IR which has already been generated.
- //
- // `musttail` calls to this function restrict argument removal attempts.
- // The signature of the caller must match the signature of the function.
- //
- // `musttail` calls in this function prevent us from changing its
- // signature.
- Result = Live;
- } else {
- // See what the effect of this use is (recording any uses that cause
- // MaybeLive in MaybeLiveArgUses).
- Result = SurveyUses(&*AI, MaybeLiveArgUses);
- }
-
- // Mark the result.
- MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses);
- // Clear the vector again for the next iteration.
- MaybeLiveArgUses.clear();
- }
-}
-
-/// MarkValue - This function marks the liveness of RA depending on L. If L is
-/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
-/// such that RA will be marked live if any use in MaybeLiveUses gets marked
-/// live later on.
-void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
- const UseVector &MaybeLiveUses) {
- switch (L) {
- case Live:
- MarkLive(RA);
- break;
- case MaybeLive:
- assert(!IsLive(RA) && "Use is already live!");
- for (const auto &MaybeLiveUse : MaybeLiveUses) {
- if (IsLive(MaybeLiveUse)) {
- // A use is live, so this value is live.
- MarkLive(RA);
- break;
- } else {
- // Note any uses of this value, so this value can be
- // marked live whenever one of the uses becomes live.
- Uses.insert(std::make_pair(MaybeLiveUse, RA));
- }
- }
- break;
- }
-}
-
-/// MarkLive - Mark the given Function as alive, meaning that it cannot be
-/// changed in any way. Additionally,
-/// mark any values that are used as this function's parameters or by its return
-/// values (according to Uses) live as well.
-void DeadArgumentEliminationPass::MarkLive(const Function &F) {
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
- << F.getName() << "\n");
- // Mark the function as live.
- LiveFunctions.insert(&F);
- // Mark all arguments as live.
- for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI)
- PropagateLiveness(CreateArg(&F, ArgI));
- // Mark all return values as live.
- for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri)
- PropagateLiveness(CreateRet(&F, Ri));
-}
-
-/// MarkLive - Mark the given return value or argument as live. Additionally,
-/// mark any values that are used by this value (according to Uses) live as
-/// well.
-void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
- if (IsLive(RA))
- return; // Already marked Live.
-
- LiveValues.insert(RA);
-
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
- << RA.getDescription() << " live\n");
- PropagateLiveness(RA);
-}
-
-bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) {
- return LiveFunctions.count(RA.F) || LiveValues.count(RA);
-}
-
-/// PropagateLiveness - Given that RA is a live value, propagate its liveness
-/// to any other values it uses (according to Uses).
-void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
- // We don't use upper_bound (or equal_range) here, because our recursive call
- // to ourselves is likely to cause the upper_bound (which is the first value
- // not belonging to RA) to become erased and the iterator invalidated.
- UseMap::iterator Begin = Uses.lower_bound(RA);
- UseMap::iterator E = Uses.end();
- UseMap::iterator I;
- for (I = Begin; I != E && I->first == RA; ++I)
- MarkLive(I->second);
-
- // Erase RA from the Uses map (from the lower bound to wherever we ended up
- // after the loop).
- Uses.erase(Begin, I);
-}
-
-// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
-// that are not in LiveValues. Transform the function and all of the callers of
-// the function to not have these arguments and return values.
-//
-bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
- // Don't modify fully live functions
- if (LiveFunctions.count(F))
- return false;
-
- // Start by computing a new prototype for the function, which is the same as
- // the old function, but has fewer arguments and a different return type.
- FunctionType *FTy = F->getFunctionType();
- std::vector<Type*> Params;
-
- // Keep track of whether we have a live 'returned' argument.
- bool HasLiveReturnedArg = false;
-
- // Set up to build a new list of parameter attributes.
- SmallVector<AttributeSet, 8> ArgAttrVec;
- const AttributeList &PAL = F->getAttributes();
-
- // Remember which arguments are still alive.
- SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
- // Construct the new parameter list from non-dead arguments. Also construct
- // a new set of parameter attributes to correspond. Skip the first parameter
- // attribute, since that belongs to the return value.
- unsigned ArgI = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
- ++I, ++ArgI) {
- RetOrArg Arg = CreateArg(F, ArgI);
- if (LiveValues.erase(Arg)) {
- Params.push_back(I->getType());
- ArgAlive[ArgI] = true;
- ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
- HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
- } else {
- ++NumArgumentsEliminated;
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
- << ArgI << " (" << I->getName() << ") from "
- << F->getName() << "\n");
- }
- }
-
- // Find out the new return value.
- Type *RetTy = FTy->getReturnType();
- Type *NRetTy = nullptr;
- unsigned RetCount = NumRetVals(F);
-
- // -1 means unused, other numbers are the new index
- SmallVector<int, 5> NewRetIdxs(RetCount, -1);
- std::vector<Type*> RetTypes;
-
- // If there is a function with a live 'returned' argument but a dead return
- // value, then there are two possible actions:
- // 1) Eliminate the return value and take off the 'returned' attribute on the
- // argument.
- // 2) Retain the 'returned' attribute and treat the return value (but not the
- // entire function) as live so that it is not eliminated.
- //
- // It's not clear in the general case which option is more profitable because,
- // even in the absence of explicit uses of the return value, code generation
- // is free to use the 'returned' attribute to do things like eliding
- // save/restores of registers across calls. Whether or not this happens is
- // target and ABI-specific as well as depending on the amount of register
- // pressure, so there's no good way for an IR-level pass to figure this out.
- //
- // Fortunately, the only places where 'returned' is currently generated by
- // the FE are places where 'returned' is basically free and almost always a
- // performance win, so the second option can simply always be used for now.
- //
- // This should be revisited if 'returned' is ever applied more liberally.
- if (RetTy->isVoidTy() || HasLiveReturnedArg) {
- NRetTy = RetTy;
- } else {
- // Look at each of the original return values individually.
- for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
- RetOrArg Ret = CreateRet(F, Ri);
- if (LiveValues.erase(Ret)) {
- RetTypes.push_back(getRetComponentType(F, Ri));
- NewRetIdxs[Ri] = RetTypes.size() - 1;
- } else {
- ++NumRetValsEliminated;
- LLVM_DEBUG(
- dbgs() << "DeadArgumentEliminationPass - Removing return value "
- << Ri << " from " << F->getName() << "\n");
- }
- }
- if (RetTypes.size() > 1) {
- // More than one return type? Reduce it down to size.
- if (StructType *STy = dyn_cast<StructType>(RetTy)) {
- // Make the new struct packed if we used to return a packed struct
- // already.
- NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
- } else {
- assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
- NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
- }
- } else if (RetTypes.size() == 1)
- // One return type? Just a simple value then, but only if we didn't use to
- // return a struct with that simple value before.
- NRetTy = RetTypes.front();
- else if (RetTypes.empty())
- // No return types? Make it void, but only if we didn't use to return {}.
- NRetTy = Type::getVoidTy(F->getContext());
- }
-
- assert(NRetTy && "No new return type found?");
-
- // The existing function return attributes.
- AttrBuilder RAttrs(PAL.getRetAttributes());
-
- // Remove any incompatible attributes, but only if we removed all return
- // values. Otherwise, ensure that we don't have any conflicting attributes
- // here. Currently, this should not be possible, but special handling might be
- // required when new return value attributes are added.
- if (NRetTy->isVoidTy())
- RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
- else
- assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
- "Return attributes no longer compatible?");
-
- AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
-
- // Strip allocsize attributes. They might refer to the deleted arguments.
- AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
- F->getContext(), Attribute::AllocSize);
-
- // Reconstruct the AttributesList based on the vector we constructed.
- assert(ArgAttrVec.size() == Params.size());
- AttributeList NewPAL =
- AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
-
- // Create the new function type based on the recomputed parameters.
- FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
-
- // No change?
- if (NFTy == FTy)
- return false;
-
- // Create the new function body and insert it into the module...
- Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace());
- NF->copyAttributesFrom(F);
- NF->setComdat(F->getComdat());
- NF->setAttributes(NewPAL);
- // Insert the new function before the old function, so we won't be processing
- // it again.
- F->getParent()->getFunctionList().insert(F->getIterator(), NF);
- NF->takeName(F);
-
- // Loop over all of the callers of the function, transforming the call sites
- // to pass in a smaller number of arguments into the new function.
- std::vector<Value*> Args;
- while (!F->use_empty()) {
- CallBase &CB = cast<CallBase>(*F->user_back());
-
- ArgAttrVec.clear();
- const AttributeList &CallPAL = CB.getAttributes();
-
- // Adjust the call return attributes in case the function was changed to
- // return void.
- AttrBuilder RAttrs(CallPAL.getRetAttributes());
- RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
- AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
-
- // Declare these outside of the loops, so we can reuse them for the second
- // loop, which loops over the varargs.
- auto I = CB.arg_begin();
- unsigned Pi = 0;
- // Loop over those operands, corresponding to the normal arguments to the
- // original function, and add those that are still alive.
- for (unsigned E = FTy->getNumParams(); Pi != E; ++I, ++Pi)
- if (ArgAlive[Pi]) {
- Args.push_back(*I);
- // Get original parameter attributes, but skip return attributes.
- AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
- if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
- // If the return type has changed, then get rid of 'returned' on the
- // call site. The alternative is to make all 'returned' attributes on
- // call sites keep the return value alive just like 'returned'
- // attributes on function declarations, but it's less clearly a win and
- // this is not an expected case anyway.
- ArgAttrVec.push_back(AttributeSet::get(
- F->getContext(),
- AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
- } else {
- // Otherwise, use the original attributes.
- ArgAttrVec.push_back(Attrs);
- }
- }
-
- // Push any varargs arguments on the list. Don't forget their attributes.
- for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
- Args.push_back(*I);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
- }
-
- // Reconstruct the AttributesList based on the vector we constructed.
- assert(ArgAttrVec.size() == Args.size());
-
- // Again, be sure to remove any allocsize attributes, since their indices
- // may now be incorrect.
- AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
- F->getContext(), Attribute::AllocSize);
-
- AttributeList NewCallPAL = AttributeList::get(
- F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCB = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", CB.getParent());
- } else {
- NewCB = CallInst::Create(NFTy, NF, Args, OpBundles, "", &CB);
- cast<CallInst>(NewCB)->setTailCallKind(
- cast<CallInst>(&CB)->getTailCallKind());
- }
- NewCB->setCallingConv(CB.getCallingConv());
- NewCB->setAttributes(NewCallPAL);
- NewCB->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
- Args.clear();
- ArgAttrVec.clear();
-
- if (!CB.use_empty() || CB.isUsedByMetadata()) {
- if (NewCB->getType() == CB.getType()) {
- // Return type not changed? Just replace users then.
- CB.replaceAllUsesWith(NewCB);
- NewCB->takeName(&CB);
- } else if (NewCB->getType()->isVoidTy()) {
- // If the return value is dead, replace any uses of it with undef
- // (any non-debug value uses will get removed later on).
- if (!CB.getType()->isX86_MMXTy())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- } else {
- assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
- "Return type changed, but not into a void. The old return type"
- " must have been a struct or an array!");
- Instruction *InsertPt = &CB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- BasicBlock *NewEdge =
- SplitEdge(NewCB->getParent(), II->getNormalDest());
- InsertPt = &*NewEdge->getFirstInsertionPt();
- }
-
- // We used to return a struct or array. Instead of doing smart stuff
- // with all the uses, we will just rebuild it using extract/insertvalue
- // chaining and let instcombine clean that up.
- //
- // Start out building up our return value from undef
- Value *RetVal = UndefValue::get(RetTy);
- for (unsigned Ri = 0; Ri != RetCount; ++Ri)
- if (NewRetIdxs[Ri] != -1) {
- Value *V;
- IRBuilder<NoFolder> IRB(InsertPt);
- if (RetTypes.size() > 1)
- // We are still returning a struct, so extract the value from our
- // return value
- V = IRB.CreateExtractValue(NewCB, NewRetIdxs[Ri], "newret");
- else
- // We are now returning a single element, so just insert that
- V = NewCB;
- // Insert the value at the old position
- RetVal = IRB.CreateInsertValue(RetVal, V, Ri, "oldret");
- }
- // Now, replace all uses of the old call instruction with the return
- // struct we built
- CB.replaceAllUsesWith(RetVal);
- NewCB->takeName(&CB);
- }
- }
-
- // Finally, remove the old call from the program, reducing the use-count of
- // F.
- CB.eraseFromParent();
- }
-
- // Since we have now created the new function, splice the body of the old
- // function right into the new function, leaving the old rotting hulk of the
- // function empty.
- NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
-
- // Loop over the argument list, transferring uses of the old arguments over to
- // the new arguments and transferring the names over as well.
- ArgI = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin();
- I != E; ++I, ++ArgI)
- if (ArgAlive[ArgI]) {
- // If this is a live argument, move the name and users over to the new
- // version.
- I->replaceAllUsesWith(&*I2);
- I2->takeName(&*I);
- ++I2;
- } else {
- // If this argument is dead, replace any uses of it with undef
- // (any non-debug value uses will get removed later on).
- if (!I->getType()->isX86_MMXTy())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- }
-
- // If we change the return value of the function we must rewrite any return
- // instructions. Check this now.
- if (F->getReturnType() != NF->getReturnType())
- for (BasicBlock &BB : *NF)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
- IRBuilder<NoFolder> IRB(RI);
- Value *RetVal = nullptr;
-
- if (!NFTy->getReturnType()->isVoidTy()) {
- assert(RetTy->isStructTy() || RetTy->isArrayTy());
- // The original return value was a struct or array; insert
- // extractvalue/insertvalue chains to extract only the values we need
- // to return and insert them into our new result.
- // This does generate messy code, but we'll leave it to instcombine to
- // clean that up.
- Value *OldRet = RI->getOperand(0);
- // Start out building up our return value from undef
- RetVal = UndefValue::get(NRetTy);
- for (unsigned RetI = 0; RetI != RetCount; ++RetI)
- if (NewRetIdxs[RetI] != -1) {
- Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret");
-
- if (RetTypes.size() > 1) {
- // We're still returning a struct, so reinsert the value into
- // our new return value at the new index
-
- RetVal = IRB.CreateInsertValue(RetVal, EV, NewRetIdxs[RetI],
- "newret");
- } else {
- // We are now only returning a simple value, so just return the
- // extracted value.
- RetVal = EV;
- }
- }
- }
- // Replace the return instruction with one returning the new return
- // value (possibly 0 if we became void).
- auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
- NewRet->setDebugLoc(RI->getDebugLoc());
- BB.getInstList().erase(RI);
- }
-
- // Clone metadata from the old function, including the debug info descriptor.
- SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
- F->getAllMetadata(MDs);
- for (auto MD : MDs)
- NF->addMetadata(MD.first, *MD.second);
-
- // Now that the old function is dead, delete it.
- F->eraseFromParent();
-
- return true;
-}
-
-PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
- ModuleAnalysisManager &) {
- bool Changed = false;
-
- // First pass: Do a simple check to see if any functions can have their "..."
- // removed. We can do this if they never call va_start. This loop cannot be
- // fused with the next loop, because deleting a function invalidates
- // information computed while surveying other functions.
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function &F = *I++;
- if (F.getFunctionType()->isVarArg())
- Changed |= DeleteDeadVarargs(F);
- }
-
- // Second phase: loop through the module, determining which arguments are live.
- // We assume all arguments are dead unless proven otherwise (allowing us to
- // determine that dead arguments passed into recursive functions are dead).
- //
- LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
- for (auto &F : M)
- SurveyFunction(F);
-
- // Now, remove all dead arguments and return values from each function in
- // turn.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- // Increment now, because the function will probably get removed (i.e.
- // replaced by a new one).
- Function *F = &*I++;
- Changed |= RemoveDeadStuffFromFunction(F);
- }
-
- // Finally, look for any unused parameters in functions with non-local
- // linkage and replace the passed in parameters with undef.
- for (auto &F : M)
- Changed |= RemoveDeadArgumentsFromCallers(F);
-
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+ if (Arg.isUsedByMetadata()) {
+ Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
+ Changed = true;
+ }
+ UnusedArgs.push_back(Arg.getArgNo());
+ }
+ }
+
+ if (UnusedArgs.empty())
+ return false;
+
+ for (Use &U : Fn.uses()) {
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U))
+ continue;
+
+ // Now go through all unused args and replace them with "undef".
+ for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
+ unsigned ArgNo = UnusedArgs[I];
+
+ Value *Arg = CB->getArgOperand(ArgNo);
+ CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType()));
+ ++NumArgumentsReplacedWithUndef;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
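+
+// Illustrative sketch (an assumed example): for an exactly-defined function
+// whose parameter is never read,
+//
+//   define void @f(i32 %unused) { ret void }
+//   call void @f(i32 %x)
+//
+// the call site is rewritten to pass undef for the dead parameter,
+//
+//   call void @f(i32 undef)
+//
+// while the signature of @f itself is left unchanged.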
+
+/// Convenience function that returns the number of return values. It returns 0
+/// for void functions and 1 for functions not returning a struct or array. It
+/// returns the number of elements for functions returning a struct or array.
+static unsigned NumRetVals(const Function *F) {
+ Type *RetTy = F->getReturnType();
+ if (RetTy->isVoidTy())
+ return 0;
+ else if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getNumElements();
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getNumElements();
+ else
+ return 1;
+}
+
+/// Returns the sub-type a function will return at a given Idx. Should
+/// correspond to the result type of an ExtractValue instruction executed with
+/// just that one Idx (i.e. only top-level structure is considered).
+static Type *getRetComponentType(const Function *F, unsigned Idx) {
+ Type *RetTy = F->getReturnType();
+ assert(!RetTy->isVoidTy() && "void type has no subtype");
+
+ if (StructType *STy = dyn_cast<StructType>(RetTy))
+ return STy->getElementType(Idx);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(RetTy))
+ return ATy->getElementType();
+ else
+ return RetTy;
+}
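+
+// Illustrative example (assumed): for
+//
+//   declare { i32, float } @h()
+//
+// NumRetVals returns 2, getRetComponentType(F, 0) is i32 and
+// getRetComponentType(F, 1) is float; for a plain i32 return they yield 1 and
+// i32, and for a void return NumRetVals is 0.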
+
+/// MarkIfNotLive - This checks Use for liveness in LiveValues. If Use is not
+/// live, it adds Use to the MaybeLiveUses argument. Returns the determined
+/// liveness of Use.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::MarkIfNotLive(RetOrArg Use,
+ UseVector &MaybeLiveUses) {
+ // We're live if our use or its Function is already marked as live.
+ if (IsLive(Use))
+ return Live;
+
+ // We're maybe live otherwise, but remember that we must become live if
+ // Use becomes live.
+ MaybeLiveUses.push_back(Use);
+ return MaybeLive;
+}
+
+/// SurveyUse - This looks at a single use of an argument or return value
+/// and determines if it should be alive or not. Adds this use to MaybeLiveUses
+/// if it causes the used value to become MaybeLive.
+///
+/// RetValNum is the return value number to use when this use is used in a
+/// return instruction. This is used in the recursion; you should always leave
+/// it at 0.
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
+ unsigned RetValNum) {
+ const User *V = U->getUser();
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(V)) {
+ // The value is returned from a function. It's only live when the
+ // function's return value is live. We use RetValNum here, for the case
+ // that U is really a use of an insertvalue instruction that uses the
+ // original Use.
+ const Function *F = RI->getParent()->getParent();
+ if (RetValNum != -1U) {
+ RetOrArg Use = CreateRet(F, RetValNum);
+ // We might be live, depending on the liveness of Use.
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ } else {
+ DeadArgumentEliminationPass::Liveness Result = MaybeLive;
+ for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) {
+ RetOrArg Use = CreateRet(F, Ri);
+ // We might be live, depending on the liveness of Use. If any
+ // sub-value is live, then the entire value is considered live. This
+ // is a conservative choice, and better tracking is possible.
+ DeadArgumentEliminationPass::Liveness SubResult =
+ MarkIfNotLive(Use, MaybeLiveUses);
+ if (Result != Live)
+ Result = SubResult;
+ }
+ return Result;
+ }
+ }
+ if (const InsertValueInst *IV = dyn_cast<InsertValueInst>(V)) {
+ if (U->getOperandNo() != InsertValueInst::getAggregateOperandIndex()
+ && IV->hasIndices())
+ // The use we are examining is inserted into an aggregate. Our liveness
+ // depends on all uses of that aggregate, but if it is used as a return
+ // value, only the index at which we were inserted counts.
+ RetValNum = *IV->idx_begin();
+
+ // Note that if we are used as the aggregate operand to the insertvalue,
+ // we don't change RetValNum, but do survey all our uses.
+
+ Liveness Result = MaybeLive;
+ for (const Use &UU : IV->uses()) {
+ Result = SurveyUse(&UU, MaybeLiveUses, RetValNum);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+ }
+
+ if (const auto *CB = dyn_cast<CallBase>(V)) {
+ const Function *F = CB->getCalledFunction();
+ if (F) {
+ // Used in a direct call.
+
+ // The function argument is live if it is used as a bundle operand.
+ if (CB->isBundleOperand(U))
+ return Live;
+
+ // Find the argument number. We know for sure that this use is an
+ // argument, since if it were the called function this would be an
+ // indirect call, and we know we can't be looking at a value of the
+ // label type (for the invoke instruction).
+ unsigned ArgNo = CB->getArgOperandNo(U);
+
+ if (ArgNo >= F->getFunctionType()->getNumParams())
+ // The value is passed in through a vararg! Must be live.
+ return Live;
+
+ assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) &&
+ "Argument is not where we expected it");
+
+ // Value passed to a normal call. It's only live when the corresponding
+ // argument to the called function turns out live.
+ RetOrArg Use = CreateArg(F, ArgNo);
+ return MarkIfNotLive(Use, MaybeLiveUses);
+ }
+ }
+ // Used in any other way? Value must be live.
+ return Live;
+}
+
+/// SurveyUses - This looks at all the uses of the given value and returns the
+/// Liveness deduced from those uses.
+///
+/// Adds all uses that cause the result to be MaybeLive to MaybeLiveUses. If
+/// the result is Live, MaybeLiveUses might be modified but its content should
+/// be ignored (since it might not be complete).
+DeadArgumentEliminationPass::Liveness
+DeadArgumentEliminationPass::SurveyUses(const Value *V,
+ UseVector &MaybeLiveUses) {
+ // Assume it's dead (which will only hold if there are no uses at all).
+ Liveness Result = MaybeLive;
+ // Check each use.
+ for (const Use &U : V->uses()) {
+ Result = SurveyUse(&U, MaybeLiveUses);
+ if (Result == Live)
+ break;
+ }
+ return Result;
+}
+
+// SurveyFunction - This performs the initial survey of the specified function,
+// checking whether it uses any of its incoming arguments or whether
+// any callers use the return value. This fills in the LiveValues set and Uses
+// map.
+//
+// We consider arguments of non-internal functions to be intrinsically alive as
+// well as arguments to functions which have their "address taken".
+void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
+ // Functions with inalloca/preallocated parameters are expecting args in a
+ // particular register and memory layout.
+ if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ MarkLive(F);
+ return;
+ }
+
+ // Don't touch naked functions. The assembly might be using an argument, or
+ // otherwise rely on the frame layout in a way that this analysis will not
+ // see.
+ if (F.hasFnAttribute(Attribute::Naked)) {
+ MarkLive(F);
+ return;
+ }
+
+ unsigned RetCount = NumRetVals(&F);
+
+ // Assume all return values are dead
+ using RetVals = SmallVector<Liveness, 5>;
+
+ RetVals RetValLiveness(RetCount, MaybeLive);
+
+ using RetUses = SmallVector<UseVector, 5>;
+
+ // These vectors map each return value to the uses that make it MaybeLive, so
+ // we can add those to the Uses map if the return value really turns out to be
+ // MaybeLive. Initialized to a list of RetCount empty lists.
+ RetUses MaybeLiveRetUses(RetCount);
+
+ bool HasMustTailCalls = false;
+
+ for (Function::const_iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ if (const ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
+ if (RI->getNumOperands() != 0 && RI->getOperand(0)->getType()
+ != F.getFunctionType()->getReturnType()) {
+ // We don't support old style multiple return values.
+ MarkLive(F);
+ return;
+ }
+ }
+
+ // If we have any returns of `musttail` results, the signature can't
+ // change.
+ if (BB->getTerminatingMustTailCall() != nullptr)
+ HasMustTailCalls = true;
+ }
+
+ if (HasMustTailCalls) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail calls\n");
+ }
+
+ if (!F.hasLocalLinkage() && (!ShouldHackArguments || F.isIntrinsic())) {
+ MarkLive(F);
+ return;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Inspecting callers for fn: "
+ << F.getName() << "\n");
+ // Keep track of the number of live retvals, so we can skip checks once all
+ // of them turn out to be live.
+ unsigned NumLiveRetVals = 0;
+
+ bool HasMustTailCallers = false;
+
+ // Loop all uses of the function.
+ for (const Use &U : F.uses()) {
+ // If the function is PASSED IN as an argument, its address has been
+ // taken.
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U)) {
+ MarkLive(F);
+ return;
+ }
+
+ // The number of arguments for a `musttail` call must match the number of
+ // arguments of the caller.
+ if (CB->isMustTailCall())
+ HasMustTailCallers = true;
+
+ // If we end up here, we are looking at a direct call to our function.
+
+ // Now, check how our return value(s) is/are used in this caller. Don't
+ // bother checking return values if all of them are live already.
+ if (NumLiveRetVals == RetCount)
+ continue;
+
+ // Check all uses of the return value.
+ for (const Use &U : CB->uses()) {
+ if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
+ // This use uses a part of our return value; survey the uses of
+ // that part and store the results for this index only.
+ unsigned Idx = *Ext->idx_begin();
+ if (RetValLiveness[Idx] != Live) {
+ RetValLiveness[Idx] = SurveyUses(Ext, MaybeLiveRetUses[Idx]);
+ if (RetValLiveness[Idx] == Live)
+ NumLiveRetVals++;
+ }
+ } else {
+ // Used by something other than extractvalue. Survey, but assume that the
+ // result applies to all sub-values.
+ UseVector MaybeLiveAggregateUses;
+ if (SurveyUse(&U, MaybeLiveAggregateUses) == Live) {
+ NumLiveRetVals = RetCount;
+ RetValLiveness.assign(RetCount, Live);
+ break;
+ } else {
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ if (RetValLiveness[Ri] != Live)
+ MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(),
+ MaybeLiveAggregateUses.end());
+ }
+ }
+ }
+ }
+ }
+
+ if (HasMustTailCallers) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - " << F.getName()
+ << " has musttail callers\n");
+ }
+
+ // Now we've inspected all callers, record the liveness of our return values.
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]);
+
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
+ << F.getName() << "\n");
+
+ // Now, check all of our arguments.
+ unsigned ArgI = 0;
+ UseVector MaybeLiveArgUses;
+ for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end();
+ AI != E; ++AI, ++ArgI) {
+ Liveness Result;
+ if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
+ HasMustTailCalls) {
+ // Variadic functions will already have a va_arg function expanded inside
+ // them, making them potentially very sensitive to ABI changes resulting
+ // from removing arguments entirely, so don't. For example, AArch64 handles
+ // register and stack HFAs very differently, and this is reflected in the
+ // IR which has already been generated.
+ //
+ // `musttail` calls to this function restrict argument removal attempts.
+ // The signature of the caller must match the signature of the function.
+ //
+ // `musttail` calls in this function prevent us from changing its
+ // signature.
+ Result = Live;
+ } else {
+ // See what the effect of this use is (recording any uses that cause
+ // MaybeLive in MaybeLiveArgUses).
+ Result = SurveyUses(&*AI, MaybeLiveArgUses);
+ }
+
+ // Mark the result.
+ MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses);
+ // Clear the vector again for the next iteration.
+ MaybeLiveArgUses.clear();
+ }
+}
+
+/// MarkValue - This function marks the liveness of RA depending on L. If L is
+/// MaybeLive, it also takes all uses in MaybeLiveUses and records them in Uses,
+/// such that RA will be marked live if any use in MaybeLiveUses gets marked
+/// live later on.
+void DeadArgumentEliminationPass::MarkValue(const RetOrArg &RA, Liveness L,
+ const UseVector &MaybeLiveUses) {
+ switch (L) {
+ case Live:
+ MarkLive(RA);
+ break;
+ case MaybeLive:
+ assert(!IsLive(RA) && "Use is already live!");
+ for (const auto &MaybeLiveUse : MaybeLiveUses) {
+ if (IsLive(MaybeLiveUse)) {
+ // A use is live, so this value is live.
+ MarkLive(RA);
+ break;
+ } else {
+ // Note any uses of this value, so this value can be
+ // marked live whenever one of the uses becomes live.
+ Uses.insert(std::make_pair(MaybeLiveUse, RA));
+ }
+ }
+ break;
+ }
+}
+
+/// MarkLive - Mark the given Function as alive, meaning that it cannot be
+/// changed in any way. Additionally,
+/// mark any values that are used as this function's parameters or by its return
+/// values (according to Uses) live as well.
+void DeadArgumentEliminationPass::MarkLive(const Function &F) {
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Intrinsically live fn: "
+ << F.getName() << "\n");
+ // Mark the function as live.
+ LiveFunctions.insert(&F);
+ // Mark all arguments as live.
+ for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI)
+ PropagateLiveness(CreateArg(&F, ArgI));
+ // Mark all return values as live.
+ for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri)
+ PropagateLiveness(CreateRet(&F, Ri));
+}
+
+/// MarkLive - Mark the given return value or argument as live. Additionally,
+/// mark any values that are used by this value (according to Uses) live as
+/// well.
+void DeadArgumentEliminationPass::MarkLive(const RetOrArg &RA) {
+ if (IsLive(RA))
+ return; // Already marked Live.
+
+ LiveValues.insert(RA);
+
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Marking "
+ << RA.getDescription() << " live\n");
+ PropagateLiveness(RA);
+}
+
+bool DeadArgumentEliminationPass::IsLive(const RetOrArg &RA) {
+ return LiveFunctions.count(RA.F) || LiveValues.count(RA);
+}
+
+/// PropagateLiveness - Given that RA is a live value, propagate its liveness
+/// to any other values it uses (according to Uses).
+void DeadArgumentEliminationPass::PropagateLiveness(const RetOrArg &RA) {
+ // We don't use upper_bound (or equal_range) here, because our recursive call
+ // to ourselves is likely to cause the upper_bound (which is the first value
+ // not belonging to RA) to become erased and the iterator invalidated.
+ UseMap::iterator Begin = Uses.lower_bound(RA);
+ UseMap::iterator E = Uses.end();
+ UseMap::iterator I;
+ for (I = Begin; I != E && I->first == RA; ++I)
+ MarkLive(I->second);
+
+ // Erase RA from the Uses map (from the lower bound to wherever we ended up
+ // after the loop).
+ Uses.erase(Begin, I);
+}
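+
+// Illustrative sketch (an assumed example): if argument 0 of @g is only ever
+// passed as argument 1 of @f, the survey records the pair
+// (arg 1 of @f -> arg 0 of @g) in Uses; if arg 1 of @f is later marked live,
+// PropagateLiveness visits that entry and MarkLive makes arg 0 of @g live as
+// well.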
+
+// RemoveDeadStuffFromFunction - Remove any arguments and return values from F
+// that are not in LiveValues. Transform the function and all of the callers of
+// the function to not have these arguments and return values.
+//
+bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
+ // Don't modify fully live functions
+ if (LiveFunctions.count(F))
+ return false;
+
+ // Start by computing a new prototype for the function, which is the same as
+ // the old function, but has fewer arguments and a different return type.
+ FunctionType *FTy = F->getFunctionType();
+ std::vector<Type*> Params;
+
+ // Keep track of whether we have a live 'returned' argument.
+ bool HasLiveReturnedArg = false;
+
+ // Set up to build a new list of parameter attributes.
+ SmallVector<AttributeSet, 8> ArgAttrVec;
+ const AttributeList &PAL = F->getAttributes();
+
+ // Remember which arguments are still alive.
+ SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
+ // Construct the new parameter list from non-dead arguments. Also construct
+ // a new set of parameter attributes to correspond. Skip the first parameter
+ // attribute, since that belongs to the return value.
+ unsigned ArgI = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgI) {
+ RetOrArg Arg = CreateArg(F, ArgI);
+ if (LiveValues.erase(Arg)) {
+ Params.push_back(I->getType());
+ ArgAlive[ArgI] = true;
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
+ } else {
+ ++NumArgumentsEliminated;
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
+ << ArgI << " (" << I->getName() << ") from "
+ << F->getName() << "\n");
+ }
+ }
+
+ // Find out the new return value.
+ Type *RetTy = FTy->getReturnType();
+ Type *NRetTy = nullptr;
+ unsigned RetCount = NumRetVals(F);
+
+ // -1 means unused, other numbers are the new index
+ SmallVector<int, 5> NewRetIdxs(RetCount, -1);
+ std::vector<Type*> RetTypes;
+
+ // If there is a function with a live 'returned' argument but a dead return
+ // value, then there are two possible actions:
+ // 1) Eliminate the return value and take off the 'returned' attribute on the
+ // argument.
+ // 2) Retain the 'returned' attribute and treat the return value (but not the
+ // entire function) as live so that it is not eliminated.
+ //
+ // It's not clear in the general case which option is more profitable because,
+ // even in the absence of explicit uses of the return value, code generation
+ // is free to use the 'returned' attribute to do things like eliding
+ // save/restores of registers across calls. Whether or not this happens is
+ // target and ABI-specific as well as depending on the amount of register
+ // pressure, so there's no good way for an IR-level pass to figure this out.
+ //
+ // Fortunately, the only places where 'returned' is currently generated by
+ // the FE are places where 'returned' is basically free and almost always a
+ // performance win, so the second option can just be used always for now.
+ //
+ // This should be revisited if 'returned' is ever applied more liberally.
+ if (RetTy->isVoidTy() || HasLiveReturnedArg) {
+ NRetTy = RetTy;
+ } else {
+ // Look at each of the original return values individually.
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ RetOrArg Ret = CreateRet(F, Ri);
+ if (LiveValues.erase(Ret)) {
+ RetTypes.push_back(getRetComponentType(F, Ri));
+ NewRetIdxs[Ri] = RetTypes.size() - 1;
+ } else {
+ ++NumRetValsEliminated;
+ LLVM_DEBUG(
+ dbgs() << "DeadArgumentEliminationPass - Removing return value "
+ << Ri << " from " << F->getName() << "\n");
+ }
+ }
+ if (RetTypes.size() > 1) {
+ // More than one return type? Reduce it down to size.
+ if (StructType *STy = dyn_cast<StructType>(RetTy)) {
+ // Make the new struct packed if we used to return a packed struct
+ // already.
+ NRetTy = StructType::get(STy->getContext(), RetTypes, STy->isPacked());
+ } else {
+ assert(isa<ArrayType>(RetTy) && "unexpected multi-value return");
+ NRetTy = ArrayType::get(RetTypes[0], RetTypes.size());
+ }
+ } else if (RetTypes.size() == 1)
+ // One return type? Just a simple value then, but only if we didn't use to
+ // return a struct with that simple value before.
+ NRetTy = RetTypes.front();
+ else if (RetTypes.empty())
+ // No return types? Make it void, but only if we didn't use to return {}.
+ NRetTy = Type::getVoidTy(F->getContext());
+ }
+
+ assert(NRetTy && "No new return type found?");
+
+ // The existing function return attributes.
+ AttrBuilder RAttrs(PAL.getRetAttributes());
+
+ // Remove any incompatible attributes, but only if we removed all return
+ // values. Otherwise, ensure that we don't have any conflicting attributes
+ // here. Currently, this should not be possible, but special handling might be
+ // required when new return value attributes are added.
+ if (NRetTy->isVoidTy())
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ else
+ assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+ "Return attributes no longer compatible?");
+
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Strip allocsize attributes. They might refer to the deleted arguments.
+ AttributeSet FnAttrs = PAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Params.size());
+ AttributeList NewPAL =
+ AttributeList::get(F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
+
+ // Create the new function type based on the recomputed parameters.
+ FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
+
+ // No change?
+ if (NFTy == FTy)
+ return false;
+
+ // Create the new function body and insert it into the module...
+ Function *NF = Function::Create(NFTy, F->getLinkage(), F->getAddressSpace());
+ NF->copyAttributesFrom(F);
+ NF->setComdat(F->getComdat());
+ NF->setAttributes(NewPAL);
+ // Insert the new function before the old function, so we won't be processing
+ // it again.
+ F->getParent()->getFunctionList().insert(F->getIterator(), NF);
+ NF->takeName(F);
+
+ // Loop over all of the callers of the function, transforming the call sites
+ // to pass a smaller number of arguments to the new function.
+ std::vector<Value*> Args;
+ while (!F->use_empty()) {
+ CallBase &CB = cast<CallBase>(*F->user_back());
+
+ ArgAttrVec.clear();
+ const AttributeList &CallPAL = CB.getAttributes();
+
+ // Adjust the call return attributes in case the function was changed to
+ // return void.
+ AttrBuilder RAttrs(CallPAL.getRetAttributes());
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+ AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs);
+
+ // Declare these outside of the loops, so we can reuse them for the second
+ // loop, which loops over the varargs.
+ auto I = CB.arg_begin();
+ unsigned Pi = 0;
+ // Loop over those operands, corresponding to the normal arguments to the
+ // original function, and add those that are still alive.
+ for (unsigned E = FTy->getNumParams(); Pi != E; ++I, ++Pi)
+ if (ArgAlive[Pi]) {
+ Args.push_back(*I);
+ // Get original parameter attributes, but skip return attributes.
+ AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
+ if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
+ // If the return type has changed, then get rid of 'returned' on the
+ // call site. The alternative is to make all 'returned' attributes on
+ // call sites keep the return value alive, just like 'returned'
+ // attributes on function declarations, but that is less clearly a win
+ // and this is not an expected case anyway.
+ ArgAttrVec.push_back(AttributeSet::get(
+ F->getContext(),
+ AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+ } else {
+ // Otherwise, use the original attributes.
+ ArgAttrVec.push_back(Attrs);
+ }
+ }
+
+ // Push any varargs arguments on the list. Don't forget their attributes.
+ for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
+ Args.push_back(*I);
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
+ }
+
+ // Reconstruct the AttributesList based on the vector we constructed.
+ assert(ArgAttrVec.size() == Args.size());
+
+ // Again, be sure to remove any allocsize attributes, since their indices
+ // may now be incorrect.
+ AttributeSet FnAttrs = CallPAL.getFnAttributes().removeAttribute(
+ F->getContext(), Attribute::AllocSize);
+
+ AttributeList NewCallPAL = AttributeList::get(
+ F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB.getParent());
+ } else {
+ NewCB = CallInst::Create(NFTy, NF, Args, OpBundles, "", &CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(&CB)->getTailCallKind());
+ }
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(NewCallPAL);
+ NewCB->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+ Args.clear();
+ ArgAttrVec.clear();
+
+ if (!CB.use_empty() || CB.isUsedByMetadata()) {
+ if (NewCB->getType() == CB.getType()) {
+ // Return type not changed? Just replace users then.
+ CB.replaceAllUsesWith(NewCB);
+ NewCB->takeName(&CB);
+ } else if (NewCB->getType()->isVoidTy()) {
+ // If the return value is dead, replace any uses of it with undef
+ // (any non-debug value uses will get removed later on).
+ if (!CB.getType()->isX86_MMXTy())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ } else {
+ assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
+ "Return type changed, but not into a void. The old return type"
+ " must have been a struct or an array!");
+ Instruction *InsertPt = &CB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BasicBlock *NewEdge =
+ SplitEdge(NewCB->getParent(), II->getNormalDest());
+ InsertPt = &*NewEdge->getFirstInsertionPt();
+ }
+
+ // We used to return a struct or array. Instead of doing smart stuff
+ // with all the uses, we will just rebuild it using extract/insertvalue
+ // chaining and let instcombine clean that up.
+ //
+ // Start out building up our return value from undef
+ Value *RetVal = UndefValue::get(RetTy);
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ if (NewRetIdxs[Ri] != -1) {
+ Value *V;
+ IRBuilder<NoFolder> IRB(InsertPt);
+ if (RetTypes.size() > 1)
+ // We are still returning a struct, so extract the value from our
+ // return value
+ V = IRB.CreateExtractValue(NewCB, NewRetIdxs[Ri], "newret");
+ else
+ // We are now returning a single element, so just insert that
+ V = NewCB;
+ // Insert the value at the old position
+ RetVal = IRB.CreateInsertValue(RetVal, V, Ri, "oldret");
+ }
+ // Now, replace all uses of the old call instruction with the return
+ // struct we built
+ CB.replaceAllUsesWith(RetVal);
+ NewCB->takeName(&CB);
+ }
+ }
+
+ // Finally, remove the old call from the program, reducing the use-count of
+ // F.
+ CB.eraseFromParent();
+ }
+
+ // Since we have now created the new function, splice the body of the old
+ // function right into the new function, leaving the old rotting hulk of the
+ // function empty.
+ NF->getBasicBlockList().splice(NF->begin(), F->getBasicBlockList());
+
+ // Loop over the argument list, transferring uses of the old arguments over
+ // to the new arguments, and transferring the names over as well.
+ ArgI = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
+ I2 = NF->arg_begin();
+ I != E; ++I, ++ArgI)
+ if (ArgAlive[ArgI]) {
+ // If this is a live argument, move the name and users over to the new
+ // version.
+ I->replaceAllUsesWith(&*I2);
+ I2->takeName(&*I);
+ ++I2;
+ } else {
+ // If this argument is dead, replace any uses of it with undef
+ // (any non-debug value uses will get removed later on).
+ if (!I->getType()->isX86_MMXTy())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ // If we change the return value of the function we must rewrite any return
+ // instructions. Check this now.
+ if (F->getReturnType() != NF->getReturnType())
+ for (BasicBlock &BB : *NF)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ IRBuilder<NoFolder> IRB(RI);
+ Value *RetVal = nullptr;
+
+ if (!NFTy->getReturnType()->isVoidTy()) {
+ assert(RetTy->isStructTy() || RetTy->isArrayTy());
+ // The original return value was a struct or array, insert
+ // extractvalue/insertvalue chains to extract only the values we need
+ // to return and insert them into our new result.
+ // This does generate messy code, but we'll leave it to instcombine to
+ // clean that up.
+ Value *OldRet = RI->getOperand(0);
+ // Start out building up our return value from undef
+ RetVal = UndefValue::get(NRetTy);
+ for (unsigned RetI = 0; RetI != RetCount; ++RetI)
+ if (NewRetIdxs[RetI] != -1) {
+ Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret");
+
+ if (RetTypes.size() > 1) {
+ // We're still returning a struct, so reinsert the value into
+ // our new return value at the new index
+
+ RetVal = IRB.CreateInsertValue(RetVal, EV, NewRetIdxs[RetI],
+ "newret");
+ } else {
+ // We are now only returning a simple value, so just return the
+ // extracted value.
+ RetVal = EV;
+ }
+ }
+ }
+ // Replace the return instruction with one returning the new return
+ // value (possibly 0 if we became void).
+ auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
+ NewRet->setDebugLoc(RI->getDebugLoc());
+ BB.getInstList().erase(RI);
+ }
+
+ // Clone metadata from the old function, including the debug info descriptor.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F->getAllMetadata(MDs);
+ for (auto MD : MDs)
+ NF->addMetadata(MD.first, *MD.second);
+
+ // Now that the old function is dead, delete it.
+ F->eraseFromParent();
+
+ return true;
+}
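
The extractvalue/insertvalue rebuilding used above (start from undef, re-insert only the surviving components) can be seen in isolation in the following sketch. It builds a throwaway function with a hypothetical name in a fresh module; it illustrates the idiom, it is not part of the pass:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("dae-demo", Ctx);
  Type *I32 = Type::getInt32Ty(Ctx);
  // The "old" return type had two elements; pretend only element 1 survived.
  StructType *OldRetTy = StructType::get(Ctx, {I32, I32});

  FunctionType *FTy = FunctionType::get(OldRetTy, /*isVarArg=*/false);
  Function *F =
      Function::Create(FTy, Function::ExternalLinkage, "dae_demo", &M);
  IRBuilder<NoFolder> IRB(BasicBlock::Create(Ctx, "entry", F));

  // Stand-in for the value produced by the slimmed-down call.
  Value *NewRet = ConstantInt::get(I32, 42);

  // Rebuild the old aggregate from undef; dead slots simply stay undef.
  Value *RetVal = UndefValue::get(OldRetTy);
  RetVal = IRB.CreateInsertValue(RetVal, NewRet, 1, "oldret");
  IRB.CreateRet(RetVal);

  verifyFunction(*F, &errs());
  F->print(outs());
  return 0;
}
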
+
+PreservedAnalyses DeadArgumentEliminationPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ bool Changed = false;
+
+ // First pass: Do a simple check to see if any functions can have their "..."
+ // removed. We can do this if they never call va_start. This loop cannot be
+ // fused with the next loop, because deleting a function invalidates
+ // information computed while surveying other functions.
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Deleting dead varargs\n");
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function &F = *I++;
+ if (F.getFunctionType()->isVarArg())
+ Changed |= DeleteDeadVarargs(F);
+ }
+
+ // Second phase: loop through the module, determining which arguments are
+ // live. We assume all arguments are dead unless proven otherwise (allowing us
+ // to determine that dead arguments passed into recursive functions are dead).
+ //
+ LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Determining liveness\n");
+ for (auto &F : M)
+ SurveyFunction(F);
+
+ // Now, remove all dead arguments and return values from each function in
+ // turn.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ // Increment now, because the function will probably get removed (i.e.,
+ // replaced by a new one).
+ Function *F = &*I++;
+ Changed |= RemoveDeadStuffFromFunction(F);
+ }
+
+ // Finally, look for any unused parameters in functions with non-local
+ // linkage and replace the passed in parameters with undef.
+ for (auto &F : M)
+ Changed |= RemoveDeadArgumentsFromCallers(F);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
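
As a usage sketch (not part of this file), the pass can be driven through the new pass manager roughly as follows; the helper name and the assumption that M is an already-parsed, valid module are hypothetical:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/DeadArgumentElimination.h"

using namespace llvm;

// Run DeadArgumentEliminationPass on M with a freshly built analysis stack.
void runDeadArgElim(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(DeadArgumentEliminationPass());
  MPM.run(M, MAM);
}
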
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
index 1fc6114af3..7f138d206f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ElimAvailExtern.cpp
@@ -1,102 +1,102 @@
-//===- ElimAvailExtern.cpp - DCE unreachable internal functions -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transform is designed to eliminate available external global
-// definitions from the program, turning them into declarations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ElimAvailExtern.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "elim-avail-extern"
-
-STATISTIC(NumFunctions, "Number of functions removed");
-STATISTIC(NumVariables, "Number of global variables removed");
-
-static bool eliminateAvailableExternally(Module &M) {
- bool Changed = false;
-
- // Drop initializers of available externally global variables.
- for (GlobalVariable &GV : M.globals()) {
- if (!GV.hasAvailableExternallyLinkage())
- continue;
- if (GV.hasInitializer()) {
- Constant *Init = GV.getInitializer();
- GV.setInitializer(nullptr);
- if (isSafeToDestroyConstant(Init))
- Init->destroyConstant();
- }
- GV.removeDeadConstantUsers();
- GV.setLinkage(GlobalValue::ExternalLinkage);
- NumVariables++;
- Changed = true;
- }
-
- // Drop the bodies of available externally functions.
- for (Function &F : M) {
- if (!F.hasAvailableExternallyLinkage())
- continue;
- if (!F.isDeclaration())
- // This will set the linkage to external
- F.deleteBody();
- F.removeDeadConstantUsers();
- NumFunctions++;
- Changed = true;
- }
-
- return Changed;
-}
-
-PreservedAnalyses
-EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
- if (!eliminateAvailableExternally(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct EliminateAvailableExternallyLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
- initializeEliminateAvailableExternallyLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- // run - Do the EliminateAvailableExternally pass on the specified module,
- // optionally updating the specified callgraph to reflect the changes.
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- return eliminateAvailableExternally(M);
- }
-};
-
-} // end anonymous namespace
-
-char EliminateAvailableExternallyLegacyPass::ID = 0;
-
-INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
- "Eliminate Available Externally Globals", false, false)
-
-ModulePass *llvm::createEliminateAvailableExternallyPass() {
- return new EliminateAvailableExternallyLegacyPass();
-}
+//===- ElimAvailExtern.cpp - Convert available_externally to decls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate available_externally global
+// definitions from the program, turning them into declarations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ElimAvailExtern.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "elim-avail-extern"
+
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+
+static bool eliminateAvailableExternally(Module &M) {
+ bool Changed = false;
+
+ // Drop initializers of available externally global variables.
+ for (GlobalVariable &GV : M.globals()) {
+ if (!GV.hasAvailableExternallyLinkage())
+ continue;
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ GV.removeDeadConstantUsers();
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ NumVariables++;
+ Changed = true;
+ }
+
+ // Drop the bodies of available externally functions.
+ for (Function &F : M) {
+ if (!F.hasAvailableExternallyLinkage())
+ continue;
+ if (!F.isDeclaration())
+ // This will set the linkage to external
+ F.deleteBody();
+ F.removeDeadConstantUsers();
+ NumFunctions++;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+EliminateAvailableExternallyPass::run(Module &M, ModuleAnalysisManager &) {
+ if (!eliminateAvailableExternally(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct EliminateAvailableExternallyLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ EliminateAvailableExternallyLegacyPass() : ModulePass(ID) {
+ initializeEliminateAvailableExternallyLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the EliminateAvailableExternally pass on the specified module,
+ // optionally updating the specified callgraph to reflect the changes.
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ return eliminateAvailableExternally(M);
+ }
+};
+
+} // end anonymous namespace
+
+char EliminateAvailableExternallyLegacyPass::ID = 0;
+
+INITIALIZE_PASS(EliminateAvailableExternallyLegacyPass, "elim-avail-extern",
+ "Eliminate Available Externally Globals", false, false)
+
+ModulePass *llvm::createEliminateAvailableExternallyPass() {
+ return new EliminateAvailableExternallyLegacyPass();
+}
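
For reference, a minimal sketch of invoking this pass programmatically through the legacy pass manager; the wrapper function name is hypothetical and M is assumed to be a valid module:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

using namespace llvm;

// Turn every available_externally definition in M into a bare declaration.
void dropAvailableExternally(Module &M) {
  legacy::PassManager PM;
  PM.add(createEliminateAvailableExternallyPass());
  PM.run(M);
}
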
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
index 2958fb0308..b45766a8e7 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ExtractGV.cpp
@@ -1,164 +1,164 @@
-//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass extracts global values
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include <algorithm>
-using namespace llvm;
-
-/// Make sure GV is visible from both modules. Delete is true if it is
-/// being deleted from this module.
-/// This also makes sure GV cannot be dropped so that references from
-/// the split module remain valid.
-static void makeVisible(GlobalValue &GV, bool Delete) {
- bool Local = GV.hasLocalLinkage();
- if (Local || Delete) {
- GV.setLinkage(GlobalValue::ExternalLinkage);
- if (Local)
- GV.setVisibility(GlobalValue::HiddenVisibility);
- return;
- }
-
- if (!GV.hasLinkOnceLinkage()) {
- assert(!GV.isDiscardableIfUnused());
- return;
- }
-
- // Map linkonce* to weak* so that llvm doesn't drop this GV.
- switch(GV.getLinkage()) {
- default:
- llvm_unreachable("Unexpected linkage");
- case GlobalValue::LinkOnceAnyLinkage:
- GV.setLinkage(GlobalValue::WeakAnyLinkage);
- return;
- case GlobalValue::LinkOnceODRLinkage:
- GV.setLinkage(GlobalValue::WeakODRLinkage);
- return;
- }
-}
-
-namespace {
- /// A pass to extract specific global values and their dependencies.
- class GVExtractorPass : public ModulePass {
- SetVector<GlobalValue *> Named;
- bool deleteStuff;
- bool keepConstInit;
- public:
- static char ID; // Pass identification, replacement for typeid
-
- /// If deleteS is true, this pass deletes the specified global values.
- /// Otherwise, it deletes as much of the module as possible, except for the
- /// global values specified.
- explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
- bool deleteS = true, bool keepConstInit = false)
- : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS),
- keepConstInit(keepConstInit) {}
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // Visit the global inline asm.
- if (!deleteStuff)
- M.setModuleInlineAsm("");
-
- // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
- // implementation could figure out which GlobalValues are actually
- // referenced by the Named set, and which GlobalValues in the rest of
- // the module are referenced by the NamedSet, and get away with leaving
- // more internal and private things internal and private. But for now,
- // be conservative and simple.
-
- // Visit the GlobalVariables.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- bool Delete =
- deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration() &&
- (!I->isConstant() || !keepConstInit);
- if (!Delete) {
- if (I->hasAvailableExternallyLinkage())
- continue;
- if (I->getName() == "llvm.global_ctors")
- continue;
- }
-
- makeVisible(*I, Delete);
-
- if (Delete) {
- // Make this a declaration and drop it's comdat.
- I->setInitializer(nullptr);
- I->setComdat(nullptr);
- }
- }
-
- // Visit the Functions.
- for (Function &F : M) {
- bool Delete =
- deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
- if (!Delete) {
- if (F.hasAvailableExternallyLinkage())
- continue;
- }
-
- makeVisible(F, Delete);
-
- if (Delete) {
- // Make this a declaration and drop it's comdat.
- F.deleteBody();
- F.setComdat(nullptr);
- }
- }
-
- // Visit the Aliases.
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- Module::alias_iterator CurI = I;
- ++I;
-
- bool Delete = deleteStuff == (bool)Named.count(&*CurI);
- makeVisible(*CurI, Delete);
-
- if (Delete) {
- Type *Ty = CurI->getValueType();
-
- CurI->removeFromParent();
- llvm::Value *Declaration;
- if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
- Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
- CurI->getAddressSpace(),
- CurI->getName(), &M);
-
- } else {
- Declaration =
- new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
- nullptr, CurI->getName());
-
- }
- CurI->replaceAllUsesWith(Declaration);
- delete &*CurI;
- }
- }
-
- return true;
- }
- };
-
- char GVExtractorPass::ID = 0;
-}
-
-ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
- bool deleteFn, bool keepConstInit) {
- return new GVExtractorPass(GVs, deleteFn, keepConstInit);
-}
+//===-- ExtractGV.cpp - Global Value extraction pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass extracts global values
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include <algorithm>
+using namespace llvm;
+
+/// Make sure GV is visible from both modules. Delete is true if it is
+/// being deleted from this module.
+/// This also makes sure GV cannot be dropped so that references from
+/// the split module remain valid.
+static void makeVisible(GlobalValue &GV, bool Delete) {
+ bool Local = GV.hasLocalLinkage();
+ if (Local || Delete) {
+ GV.setLinkage(GlobalValue::ExternalLinkage);
+ if (Local)
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ return;
+ }
+
+ if (!GV.hasLinkOnceLinkage()) {
+ assert(!GV.isDiscardableIfUnused());
+ return;
+ }
+
+ // Map linkonce* to weak* so that llvm doesn't drop this GV.
+ switch(GV.getLinkage()) {
+ default:
+ llvm_unreachable("Unexpected linkage");
+ case GlobalValue::LinkOnceAnyLinkage:
+ GV.setLinkage(GlobalValue::WeakAnyLinkage);
+ return;
+ case GlobalValue::LinkOnceODRLinkage:
+ GV.setLinkage(GlobalValue::WeakODRLinkage);
+ return;
+ }
+}
+
+namespace {
+ /// A pass to extract specific global values and their dependencies.
+ class GVExtractorPass : public ModulePass {
+ SetVector<GlobalValue *> Named;
+ bool deleteStuff;
+ bool keepConstInit;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ /// If deleteS is true, this pass deletes the specified global values.
+ /// Otherwise, it deletes as much of the module as possible, except for the
+ /// global values specified.
+ explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
+ bool deleteS = true, bool keepConstInit = false)
+ : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS),
+ keepConstInit(keepConstInit) {}
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // Visit the global inline asm.
+ if (!deleteStuff)
+ M.setModuleInlineAsm("");
+
+ // For simplicity, just give all GlobalValues ExternalLinkage. A trickier
+ // implementation could figure out which GlobalValues are actually
+ // referenced by the Named set, and which GlobalValues in the rest of
+ // the module are referenced by the NamedSet, and get away with leaving
+ // more internal and private things internal and private. But for now,
+ // be conservative and simple.
+
+ // Visit the GlobalVariables.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration() &&
+ (!I->isConstant() || !keepConstInit);
+ if (!Delete) {
+ if (I->hasAvailableExternallyLinkage())
+ continue;
+ if (I->getName() == "llvm.global_ctors")
+ continue;
+ }
+
+ makeVisible(*I, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ I->setInitializer(nullptr);
+ I->setComdat(nullptr);
+ }
+ }
+
+ // Visit the Functions.
+ for (Function &F : M) {
+ bool Delete =
+ deleteStuff == (bool)Named.count(&F) && !F.isDeclaration();
+ if (!Delete) {
+ if (F.hasAvailableExternallyLinkage())
+ continue;
+ }
+
+ makeVisible(F, Delete);
+
+ if (Delete) {
+ // Make this a declaration and drop its comdat.
+ F.deleteBody();
+ F.setComdat(nullptr);
+ }
+ }
+
+ // Visit the Aliases.
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ Module::alias_iterator CurI = I;
+ ++I;
+
+ bool Delete = deleteStuff == (bool)Named.count(&*CurI);
+ makeVisible(*CurI, Delete);
+
+ if (Delete) {
+ Type *Ty = CurI->getValueType();
+
+ CurI->removeFromParent();
+ llvm::Value *Declaration;
+ if (FunctionType *FTy = dyn_cast<FunctionType>(Ty)) {
+ Declaration = Function::Create(FTy, GlobalValue::ExternalLinkage,
+ CurI->getAddressSpace(),
+ CurI->getName(), &M);
+
+ } else {
+ Declaration =
+ new GlobalVariable(M, Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, CurI->getName());
+
+ }
+ CurI->replaceAllUsesWith(Declaration);
+ delete &*CurI;
+ }
+ }
+
+ return true;
+ }
+ };
+
+ char GVExtractorPass::ID = 0;
+}
+
+ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
+ bool deleteFn, bool keepConstInit) {
+ return new GVExtractorPass(GVs, deleteFn, keepConstInit);
+}
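
A small usage sketch for the factory function above (roughly what llvm-extract does): keep a single named function and strip the rest of the module. The helper name is hypothetical; the factory signature is the one declared in llvm/Transforms/IPO.h:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include <vector>

using namespace llvm;

// Keep only the named function (plus whatever it references); delete the rest.
bool extractSingleFunction(Module &M, StringRef Name) {
  GlobalValue *GV = M.getFunction(Name);
  if (!GV)
    return false;
  std::vector<GlobalValue *> Keep{GV};
  legacy::PassManager PM;
  // deleteFn = false: the listed values are kept, everything else is dropped.
  PM.add(createGVExtractionPass(Keep, /*deleteFn=*/false));
  PM.run(M);
  return true;
}
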
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
index f0aa837e30..1a8bb225a6 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ForceFunctionAttrs.cpp
@@ -1,31 +1,31 @@
-//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "forceattrs"
-
-static cl::list<std::string>
- ForceAttributes("force-attribute", cl::Hidden,
- cl::desc("Add an attribute to a function. This should be a "
- "pair of 'function-name:attribute-name', for "
- "example -force-attribute=foo:noinline. This "
- "option can be specified multiple times."));
-
+//===- ForceFunctionAttrs.cpp - Force function attrs for debugging --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "forceattrs"
+
+static cl::list<std::string>
+ ForceAttributes("force-attribute", cl::Hidden,
+ cl::desc("Add an attribute to a function. This should be a "
+ "pair of 'function-name:attribute-name', for "
+ "example -force-attribute=foo:noinline. This "
+ "option can be specified multiple times."));
+
static cl::list<std::string> ForceRemoveAttributes(
"force-remove-attribute", cl::Hidden,
cl::desc("Remove an attribute from a function. This should be a "
@@ -33,73 +33,73 @@ static cl::list<std::string> ForceRemoveAttributes(
"example -force-remove-attribute=foo:noinline. This "
"option can be specified multiple times."));
-static Attribute::AttrKind parseAttrKind(StringRef Kind) {
- return StringSwitch<Attribute::AttrKind>(Kind)
- .Case("alwaysinline", Attribute::AlwaysInline)
- .Case("builtin", Attribute::Builtin)
- .Case("cold", Attribute::Cold)
- .Case("convergent", Attribute::Convergent)
- .Case("inlinehint", Attribute::InlineHint)
- .Case("jumptable", Attribute::JumpTable)
- .Case("minsize", Attribute::MinSize)
- .Case("naked", Attribute::Naked)
- .Case("nobuiltin", Attribute::NoBuiltin)
- .Case("noduplicate", Attribute::NoDuplicate)
- .Case("noimplicitfloat", Attribute::NoImplicitFloat)
- .Case("noinline", Attribute::NoInline)
- .Case("nonlazybind", Attribute::NonLazyBind)
- .Case("noredzone", Attribute::NoRedZone)
- .Case("noreturn", Attribute::NoReturn)
- .Case("nocf_check", Attribute::NoCfCheck)
- .Case("norecurse", Attribute::NoRecurse)
- .Case("nounwind", Attribute::NoUnwind)
- .Case("optforfuzzing", Attribute::OptForFuzzing)
- .Case("optnone", Attribute::OptimizeNone)
- .Case("optsize", Attribute::OptimizeForSize)
- .Case("readnone", Attribute::ReadNone)
- .Case("readonly", Attribute::ReadOnly)
- .Case("argmemonly", Attribute::ArgMemOnly)
- .Case("returns_twice", Attribute::ReturnsTwice)
- .Case("safestack", Attribute::SafeStack)
- .Case("shadowcallstack", Attribute::ShadowCallStack)
- .Case("sanitize_address", Attribute::SanitizeAddress)
- .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
- .Case("sanitize_memory", Attribute::SanitizeMemory)
- .Case("sanitize_thread", Attribute::SanitizeThread)
- .Case("sanitize_memtag", Attribute::SanitizeMemTag)
- .Case("speculative_load_hardening", Attribute::SpeculativeLoadHardening)
- .Case("ssp", Attribute::StackProtect)
- .Case("sspreq", Attribute::StackProtectReq)
- .Case("sspstrong", Attribute::StackProtectStrong)
- .Case("strictfp", Attribute::StrictFP)
- .Case("uwtable", Attribute::UWTable)
- .Default(Attribute::None);
-}
-
-/// If F has any forced attributes given on the command line, add them.
+static Attribute::AttrKind parseAttrKind(StringRef Kind) {
+ return StringSwitch<Attribute::AttrKind>(Kind)
+ .Case("alwaysinline", Attribute::AlwaysInline)
+ .Case("builtin", Attribute::Builtin)
+ .Case("cold", Attribute::Cold)
+ .Case("convergent", Attribute::Convergent)
+ .Case("inlinehint", Attribute::InlineHint)
+ .Case("jumptable", Attribute::JumpTable)
+ .Case("minsize", Attribute::MinSize)
+ .Case("naked", Attribute::Naked)
+ .Case("nobuiltin", Attribute::NoBuiltin)
+ .Case("noduplicate", Attribute::NoDuplicate)
+ .Case("noimplicitfloat", Attribute::NoImplicitFloat)
+ .Case("noinline", Attribute::NoInline)
+ .Case("nonlazybind", Attribute::NonLazyBind)
+ .Case("noredzone", Attribute::NoRedZone)
+ .Case("noreturn", Attribute::NoReturn)
+ .Case("nocf_check", Attribute::NoCfCheck)
+ .Case("norecurse", Attribute::NoRecurse)
+ .Case("nounwind", Attribute::NoUnwind)
+ .Case("optforfuzzing", Attribute::OptForFuzzing)
+ .Case("optnone", Attribute::OptimizeNone)
+ .Case("optsize", Attribute::OptimizeForSize)
+ .Case("readnone", Attribute::ReadNone)
+ .Case("readonly", Attribute::ReadOnly)
+ .Case("argmemonly", Attribute::ArgMemOnly)
+ .Case("returns_twice", Attribute::ReturnsTwice)
+ .Case("safestack", Attribute::SafeStack)
+ .Case("shadowcallstack", Attribute::ShadowCallStack)
+ .Case("sanitize_address", Attribute::SanitizeAddress)
+ .Case("sanitize_hwaddress", Attribute::SanitizeHWAddress)
+ .Case("sanitize_memory", Attribute::SanitizeMemory)
+ .Case("sanitize_thread", Attribute::SanitizeThread)
+ .Case("sanitize_memtag", Attribute::SanitizeMemTag)
+ .Case("speculative_load_hardening", Attribute::SpeculativeLoadHardening)
+ .Case("ssp", Attribute::StackProtect)
+ .Case("sspreq", Attribute::StackProtectReq)
+ .Case("sspstrong", Attribute::StackProtectStrong)
+ .Case("strictfp", Attribute::StrictFP)
+ .Case("uwtable", Attribute::UWTable)
+ .Default(Attribute::None);
+}
+
+/// If F has any forced attributes given on the command line, add them.
/// If F has any forced remove attributes given on the command line, remove
/// them. When both force and force-remove are given to a function, the latter
/// takes precedence.
static void forceAttributes(Function &F) {
auto ParseFunctionAndAttr = [&](StringRef S) {
auto Kind = Attribute::None;
- auto KV = StringRef(S).split(':');
- if (KV.first != F.getName())
+ auto KV = StringRef(S).split(':');
+ if (KV.first != F.getName())
return Kind;
Kind = parseAttrKind(KV.second);
- if (Kind == Attribute::None) {
- LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
- << " unknown or not handled!\n");
- }
+ if (Kind == Attribute::None) {
+ LLVM_DEBUG(dbgs() << "ForcedAttribute: " << KV.second
+ << " unknown or not handled!\n");
+ }
return Kind;
};
for (auto &S : ForceAttributes) {
auto Kind = ParseFunctionAndAttr(S);
if (Kind == Attribute::None || F.hasFnAttribute(Kind))
- continue;
- F.addFnAttr(Kind);
- }
+ continue;
+ F.addFnAttr(Kind);
+ }
for (auto &S : ForceRemoveAttributes) {
auto Kind = ParseFunctionAndAttr(S);
@@ -107,49 +107,49 @@ static void forceAttributes(Function &F) {
continue;
F.removeFnAttr(Kind);
}
-}
-
+}
+
static bool hasForceAttributes() {
return !ForceAttributes.empty() || !ForceRemoveAttributes.empty();
}
-PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
- ModuleAnalysisManager &) {
+PreservedAnalyses ForceFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &) {
if (!hasForceAttributes())
- return PreservedAnalyses::all();
-
- for (Function &F : M.functions())
+ return PreservedAnalyses::all();
+
+ for (Function &F : M.functions())
forceAttributes(F);
-
- // Just conservatively invalidate analyses, this isn't likely to be important.
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct ForceFunctionAttrsLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- ForceFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeForceFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
+
+ // Just conservatively invalidate analyses; this isn't likely to be important.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct ForceFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ ForceFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeForceFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
if (!hasForceAttributes())
- return false;
-
- for (Function &F : M.functions())
+ return false;
+
+ for (Function &F : M.functions())
forceAttributes(F);
-
- // Conservatively assume we changed something.
- return true;
- }
-};
-}
-
-char ForceFunctionAttrsLegacyPass::ID = 0;
-INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
- "Force set function attributes", false, false)
-
-Pass *llvm::createForceFunctionAttrsLegacyPass() {
- return new ForceFunctionAttrsLegacyPass();
-}
+
+ // Conservatively assume we changed something.
+ return true;
+ }
+};
+}
+
+char ForceFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS(ForceFunctionAttrsLegacyPass, "forceattrs",
+ "Force set function attributes", false, false)
+
+Pass *llvm::createForceFunctionAttrsLegacyPass() {
+ return new ForceFunctionAttrsLegacyPass();
+}
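
To illustrate the 'function-name:attribute-name' format these options accept, here is a small sketch that applies one such entry directly with the IR API. The helper name is hypothetical, and only two attribute spellings are handled, unlike the full parseAttrKind table above:

#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Apply one "function-name:attribute-name" entry, e.g. "foo:noinline".
bool applyForcedAttribute(Module &M, StringRef Spec) {
  auto KV = Spec.split(':'); // "foo:noinline" -> ("foo", "noinline")
  Function *F = M.getFunction(KV.first);
  if (!F)
    return false;
  Attribute::AttrKind Kind = StringSwitch<Attribute::AttrKind>(KV.second)
                                 .Case("noinline", Attribute::NoInline)
                                 .Case("alwaysinline", Attribute::AlwaysInline)
                                 .Default(Attribute::None);
  if (Kind == Attribute::None || F->hasFnAttribute(Kind))
    return false;
  F->addFnAttr(Kind);
  return true;
}
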
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
index 86a30355bb..6730824e86 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -1,154 +1,154 @@
-//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file implements interprocedural passes which walk the
-/// call-graph deducing and/or propagating function attributes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
+//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements interprocedural passes which walk the
+/// call-graph deducing and/or propagating function attributes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include <cassert>
-#include <iterator>
-#include <map>
-#include <vector>
-
-using namespace llvm;
-
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include <cassert>
+#include <iterator>
+#include <map>
+#include <vector>
+
+using namespace llvm;
+
#define DEBUG_TYPE "function-attrs"
-
-STATISTIC(NumReadNone, "Number of functions marked readnone");
-STATISTIC(NumReadOnly, "Number of functions marked readonly");
-STATISTIC(NumWriteOnly, "Number of functions marked writeonly");
-STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
-STATISTIC(NumReturned, "Number of arguments marked returned");
-STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
-STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
-STATISTIC(NumNoAlias, "Number of function returns marked noalias");
-STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
-STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
-STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
-STATISTIC(NumNoFree, "Number of functions marked as nofree");
+
+STATISTIC(NumReadNone, "Number of functions marked readnone");
+STATISTIC(NumReadOnly, "Number of functions marked readonly");
+STATISTIC(NumWriteOnly, "Number of functions marked writeonly");
+STATISTIC(NumNoCapture, "Number of arguments marked nocapture");
+STATISTIC(NumReturned, "Number of arguments marked returned");
+STATISTIC(NumReadNoneArg, "Number of arguments marked readnone");
+STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly");
+STATISTIC(NumNoAlias, "Number of function returns marked noalias");
+STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
+STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
+STATISTIC(NumNoUnwind, "Number of functions marked as nounwind");
+STATISTIC(NumNoFree, "Number of functions marked as nofree");
STATISTIC(NumWillReturn, "Number of functions marked as willreturn");
-
-static cl::opt<bool> EnableNonnullArgPropagation(
- "enable-nonnull-arg-prop", cl::init(true), cl::Hidden,
- cl::desc("Try to propagate nonnull argument attributes from callsites to "
- "caller functions."));
-
-static cl::opt<bool> DisableNoUnwindInference(
- "disable-nounwind-inference", cl::Hidden,
- cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
-
-static cl::opt<bool> DisableNoFreeInference(
- "disable-nofree-inference", cl::Hidden,
- cl::desc("Stop inferring nofree attribute during function-attrs pass"));
-
-namespace {
-
-using SCCNodeSet = SmallSetVector<Function *, 8>;
-
-} // end anonymous namespace
-
-/// Returns the memory access attribute for function F using AAR for AA results,
-/// where SCCNodes is the current SCC.
-///
-/// If ThisBody is true, this function may examine the function body and will
-/// return a result pertaining to this copy of the function. If it is false, the
-/// result will be based only on AA results for the function declaration; it
-/// will be assumed that some other (perhaps less optimized) version of the
-/// function may be selected at link time.
-static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
- AAResults &AAR,
- const SCCNodeSet &SCCNodes) {
- FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
- if (MRB == FMRB_DoesNotAccessMemory)
- // Already perfect!
- return MAK_ReadNone;
-
- if (!ThisBody) {
- if (AliasAnalysis::onlyReadsMemory(MRB))
- return MAK_ReadOnly;
-
- if (AliasAnalysis::doesNotReadMemory(MRB))
- return MAK_WriteOnly;
-
- // Conservatively assume it reads and writes to memory.
- return MAK_MayWrite;
- }
-
- // Scan the function body for instructions that may read or write memory.
- bool ReadsMemory = false;
- bool WritesMemory = false;
- for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
- Instruction *I = &*II;
-
- // Some instructions can be ignored even if they read or write memory.
- // Detect these now, skipping to the next instruction if one is found.
- if (auto *Call = dyn_cast<CallBase>(I)) {
- // Ignore calls to functions in the same SCC, as long as the call sites
- // don't have operand bundles. Calls with operand bundles are allowed to
- // have memory effects not described by the memory effects of the call
- // target.
- if (!Call->hasOperandBundles() && Call->getCalledFunction() &&
- SCCNodes.count(Call->getCalledFunction()))
- continue;
- FunctionModRefBehavior MRB = AAR.getModRefBehavior(Call);
- ModRefInfo MRI = createModRefInfo(MRB);
-
- // If the call doesn't access memory, we're done.
- if (isNoModRef(MRI))
- continue;
-
+
+static cl::opt<bool> EnableNonnullArgPropagation(
+ "enable-nonnull-arg-prop", cl::init(true), cl::Hidden,
+ cl::desc("Try to propagate nonnull argument attributes from callsites to "
+ "caller functions."));
+
+static cl::opt<bool> DisableNoUnwindInference(
+ "disable-nounwind-inference", cl::Hidden,
+ cl::desc("Stop inferring nounwind attribute during function-attrs pass"));
+
+static cl::opt<bool> DisableNoFreeInference(
+ "disable-nofree-inference", cl::Hidden,
+ cl::desc("Stop inferring nofree attribute during function-attrs pass"));
+
+namespace {
+
+using SCCNodeSet = SmallSetVector<Function *, 8>;
+
+} // end anonymous namespace
+
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+ AAResults &AAR,
+ const SCCNodeSet &SCCNodes) {
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
+ if (MRB == FMRB_DoesNotAccessMemory)
+ // Already perfect!
+ return MAK_ReadNone;
+
+ if (!ThisBody) {
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return MAK_ReadOnly;
+
+ if (AliasAnalysis::doesNotReadMemory(MRB))
+ return MAK_WriteOnly;
+
+ // Conservatively assume it reads and writes to memory.
+ return MAK_MayWrite;
+ }
+
+ // Scan the function body for instructions that may read or write memory.
+ bool ReadsMemory = false;
+ bool WritesMemory = false;
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+
+ // Some instructions can be ignored even if they read or write memory.
+ // Detect these now, skipping to the next instruction if one is found.
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ // Ignore calls to functions in the same SCC, as long as the call sites
+ // don't have operand bundles. Calls with operand bundles are allowed to
+ // have memory effects not described by the memory effects of the call
+ // target.
+ if (!Call->hasOperandBundles() && Call->getCalledFunction() &&
+ SCCNodes.count(Call->getCalledFunction()))
+ continue;
+ FunctionModRefBehavior MRB = AAR.getModRefBehavior(Call);
+ ModRefInfo MRI = createModRefInfo(MRB);
+
+ // If the call doesn't access memory, we're done.
+ if (isNoModRef(MRI))
+ continue;
+
// A pseudo probe call shouldn't change any function attribute since it
// doesn't translate to a real instruction. It comes with a memory access
// tag to prevent itself being removed by optimizations and not block
@@ -156,1158 +156,1158 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (isa<PseudoProbeInst>(I))
continue;
- if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
- // The call could access any memory. If that includes writes, note it.
- if (isModSet(MRI))
- WritesMemory = true;
- // If it reads, note it.
- if (isRefSet(MRI))
- ReadsMemory = true;
- continue;
- }
-
- // Check whether all pointer arguments point to local memory, and
- // ignore calls that only access local memory.
- for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
- Value *Arg = *CI;
- if (!Arg->getType()->isPtrOrPtrVectorTy())
- continue;
-
- AAMDNodes AAInfo;
- I->getAAMetadata(AAInfo);
+ if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ // The call could access any memory. If that includes writes, note it.
+ if (isModSet(MRI))
+ WritesMemory = true;
+ // If it reads, note it.
+ if (isRefSet(MRI))
+ ReadsMemory = true;
+ continue;
+ }
+
+ // Check whether all pointer arguments point to local memory, and
+ // ignore calls that only access local memory.
+ for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
+ Value *Arg = *CI;
+ if (!Arg->getType()->isPtrOrPtrVectorTy())
+ continue;
+
+ AAMDNodes AAInfo;
+ I->getAAMetadata(AAInfo);
MemoryLocation Loc = MemoryLocation::getBeforeOrAfter(Arg, AAInfo);
-
- // Skip accesses to local or constant memory as they don't impact the
- // externally visible mod/ref behavior.
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
-
- if (isModSet(MRI))
- // Writes non-local memory.
- WritesMemory = true;
- if (isRefSet(MRI))
- // Ok, it reads non-local memory.
- ReadsMemory = true;
- }
- continue;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads from local memory. (Atomic is okay here.)
- if (!LI->isVolatile()) {
- MemoryLocation Loc = MemoryLocation::get(LI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Ignore non-volatile stores to local memory. (Atomic is okay here.)
- if (!SI->isVolatile()) {
- MemoryLocation Loc = MemoryLocation::get(SI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
- } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
- // Ignore vaargs on local memory.
- MemoryLocation Loc = MemoryLocation::get(VI);
- if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
- continue;
- }
-
- // Any remaining instructions need to be taken seriously! Check if they
- // read or write memory.
- //
- // Writes memory, remember that.
- WritesMemory |= I->mayWriteToMemory();
-
- // If this instruction may read memory, remember that.
- ReadsMemory |= I->mayReadFromMemory();
- }
-
- if (WritesMemory) {
- if (!ReadsMemory)
- return MAK_WriteOnly;
- else
- return MAK_MayWrite;
- }
-
- return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
-}
-
-MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
- AAResults &AAR) {
- return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
-}
-
-/// Deduce readonly/readnone attributes for the SCC.
-template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
- // Check if any of the functions in the SCC read or write memory. If they
- // write memory then they can't be marked readnone or readonly.
- bool ReadsMemory = false;
- bool WritesMemory = false;
- for (Function *F : SCCNodes) {
- // Call the callable parameter to look up AA results for this function.
- AAResults &AAR = AARGetter(*F);
-
- // Non-exact function definitions may not be selected at link time, and an
- // alternative version that writes to memory may be selected. See the
- // comment on GlobalValue::isDefinitionExact for more details.
- switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
- AAR, SCCNodes)) {
- case MAK_MayWrite:
- return false;
- case MAK_ReadOnly:
- ReadsMemory = true;
- break;
- case MAK_WriteOnly:
- WritesMemory = true;
- break;
- case MAK_ReadNone:
- // Nothing to do!
- break;
- }
- }
-
- // If the SCC contains both functions that read and functions that write, then
- // we cannot add readonly attributes.
- if (ReadsMemory && WritesMemory)
- return false;
-
- // Success! Functions in this SCC do not access memory, or only read memory.
- // Give them the appropriate attribute.
- bool MadeChange = false;
-
- for (Function *F : SCCNodes) {
- if (F->doesNotAccessMemory())
- // Already perfect!
- continue;
-
- if (F->onlyReadsMemory() && ReadsMemory)
- // No change.
- continue;
-
- if (F->doesNotReadMemory() && WritesMemory)
- continue;
-
- MadeChange = true;
-
- // Clear out any existing attributes.
+
+ // Skip accesses to local or constant memory as they don't impact the
+ // externally visible mod/ref behavior.
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+
+ if (isModSet(MRI))
+ // Writes non-local memory.
+ WritesMemory = true;
+ if (isRefSet(MRI))
+ // Ok, it reads non-local memory.
+ ReadsMemory = true;
+ }
+ continue;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads from local memory. (Atomic is okay here.)
+ if (!LI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(LI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Ignore non-volatile stores to local memory. (Atomic is okay here.)
+ if (!SI->isVolatile()) {
+ MemoryLocation Loc = MemoryLocation::get(SI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+ } else if (VAArgInst *VI = dyn_cast<VAArgInst>(I)) {
+ // Ignore vaargs on local memory.
+ MemoryLocation Loc = MemoryLocation::get(VI);
+ if (AAR.pointsToConstantMemory(Loc, /*OrLocal=*/true))
+ continue;
+ }
+
+ // Any remaining instructions need to be taken seriously! Check if they
+ // read or write memory.
+ //
+ // Writes memory, remember that.
+ WritesMemory |= I->mayWriteToMemory();
+
+ // If this instruction may read memory, remember that.
+ ReadsMemory |= I->mayReadFromMemory();
+ }
+
+ if (WritesMemory) {
+ if (!ReadsMemory)
+ return MAK_WriteOnly;
+ else
+ return MAK_MayWrite;
+ }
+
+ return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
+}
+
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+ AAResults &AAR) {
+ return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
+/// Deduce readonly/readnone attributes for the SCC.
+template <typename AARGetterT>
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
+ // Check if any of the functions in the SCC read or write memory. If they
+ // write memory then they can't be marked readnone or readonly.
+ bool ReadsMemory = false;
+ bool WritesMemory = false;
+ for (Function *F : SCCNodes) {
+ // Call the callable parameter to look up AA results for this function.
+ AAResults &AAR = AARGetter(*F);
+
+ // Non-exact function definitions may not be selected at link time, and an
+ // alternative version that writes to memory may be selected. See the
+ // comment on GlobalValue::isDefinitionExact for more details.
+ switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+ AAR, SCCNodes)) {
+ case MAK_MayWrite:
+ return false;
+ case MAK_ReadOnly:
+ ReadsMemory = true;
+ break;
+ case MAK_WriteOnly:
+ WritesMemory = true;
+ break;
+ case MAK_ReadNone:
+ // Nothing to do!
+ break;
+ }
+ }
+
+ // If the SCC contains both functions that read and functions that write, then
+ // we cannot add readonly attributes.
+ if (ReadsMemory && WritesMemory)
+ return false;
+
+ // Success! Functions in this SCC do not access memory, or only read memory.
+ // Give them the appropriate attribute.
+ bool MadeChange = false;
+
+ for (Function *F : SCCNodes) {
+ if (F->doesNotAccessMemory())
+ // Already perfect!
+ continue;
+
+ if (F->onlyReadsMemory() && ReadsMemory)
+ // No change.
+ continue;
+
+ if (F->doesNotReadMemory() && WritesMemory)
+ continue;
+
+ MadeChange = true;
+
+ // Clear out any existing attributes.
AttrBuilder AttrsToRemove;
AttrsToRemove.addAttribute(Attribute::ReadOnly);
AttrsToRemove.addAttribute(Attribute::ReadNone);
AttrsToRemove.addAttribute(Attribute::WriteOnly);
-
- if (!WritesMemory && !ReadsMemory) {
- // Clear out any "access range attributes" if readnone was deduced.
+
+ if (!WritesMemory && !ReadsMemory) {
+ // Clear out any "access range attributes" if readnone was deduced.
AttrsToRemove.addAttribute(Attribute::ArgMemOnly);
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOnly);
AttrsToRemove.addAttribute(Attribute::InaccessibleMemOrArgMemOnly);
- }
+ }
F->removeAttributes(AttributeList::FunctionIndex, AttrsToRemove);
-
- // Add in the new attribute.
- if (WritesMemory && !ReadsMemory)
- F->addFnAttr(Attribute::WriteOnly);
- else
- F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
-
- if (WritesMemory && !ReadsMemory)
- ++NumWriteOnly;
- else if (ReadsMemory)
- ++NumReadOnly;
- else
- ++NumReadNone;
- }
-
- return MadeChange;
-}
-
-namespace {
-
-/// For a given pointer Argument, this retains a list of Arguments of functions
-/// in the same SCC that the pointer data flows into. We use this to build an
-/// SCC of the arguments.
-struct ArgumentGraphNode {
- Argument *Definition;
- SmallVector<ArgumentGraphNode *, 4> Uses;
-};
-
-class ArgumentGraph {
- // We store pointers to ArgumentGraphNode objects, so it's important
- // that they not move around upon insert.
- using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>;
-
- ArgumentMapTy ArgumentMap;
-
- // There is no root node for the argument graph, in fact:
- // void f(int *x, int *y) { if (...) f(x, y); }
- // is an example where the graph is disconnected. The SCCIterator requires a
- // single entry point, so we maintain a fake ("synthetic") root node that
- // uses every node. Because the graph is directed and nothing points into
- // the root, it will not participate in any SCCs (except for its own).
- ArgumentGraphNode SyntheticRoot;
-
-public:
- ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
-
- using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator;
-
- iterator begin() { return SyntheticRoot.Uses.begin(); }
- iterator end() { return SyntheticRoot.Uses.end(); }
- ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
-
- ArgumentGraphNode *operator[](Argument *A) {
- ArgumentGraphNode &Node = ArgumentMap[A];
- Node.Definition = A;
- SyntheticRoot.Uses.push_back(&Node);
- return &Node;
- }
-};
-
-/// This tracker checks whether callees are in the SCC, and if so it does not
-/// consider that a capture, instead adding it to the "Uses" list and
-/// continuing with the analysis.
-struct ArgumentUsesTracker : public CaptureTracker {
- ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {}
-
- void tooManyUses() override { Captured = true; }
-
- bool captured(const Use *U) override {
- CallBase *CB = dyn_cast<CallBase>(U->getUser());
- if (!CB) {
- Captured = true;
- return true;
- }
-
- Function *F = CB->getCalledFunction();
- if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
- Captured = true;
- return true;
- }
-
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account for
- // these.
-
- unsigned UseIndex =
- std::distance(const_cast<const Use *>(CB->arg_begin()), U);
-
- assert(UseIndex < CB->data_operands_size() &&
- "Indirect function calls should have been filtered above!");
-
- if (UseIndex >= CB->getNumArgOperands()) {
- // Data operand, but not an argument operand -- must be a bundle operand
- assert(CB->hasOperandBundles() && "Must be!");
-
- // CaptureTracking told us that we're being captured by an operand bundle
- // use. In this case it does not matter if the callee is within our SCC
- // or not -- we've been captured in some unknown way, and we have to be
- // conservative.
- Captured = true;
- return true;
- }
-
- if (UseIndex >= F->arg_size()) {
- assert(F->isVarArg() && "More params than args in non-varargs call");
- Captured = true;
- return true;
- }
-
- Uses.push_back(&*std::next(F->arg_begin(), UseIndex));
- return false;
- }
-
- // True only if certainly captured (used outside our SCC).
- bool Captured = false;
-
- // Uses within our SCC.
- SmallVector<Argument *, 4> Uses;
-
- const SCCNodeSet &SCCNodes;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct GraphTraits<ArgumentGraphNode *> {
- using NodeRef = ArgumentGraphNode *;
- using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator;
-
- static NodeRef getEntryNode(NodeRef A) { return A; }
- static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
- static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
-};
-
-template <>
-struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
- static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
-
- static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
- return AG->begin();
- }
-
- static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
-};
-
-} // end namespace llvm
-
-/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
-static Attribute::AttrKind
-determinePointerReadAttrs(Argument *A,
- const SmallPtrSet<Argument *, 8> &SCCNodes) {
- SmallVector<Use *, 32> Worklist;
- SmallPtrSet<Use *, 32> Visited;
-
- // inalloca arguments are always clobbered by the call.
- if (A->hasInAllocaAttr() || A->hasPreallocatedAttr())
- return Attribute::None;
-
- bool IsRead = false;
- // We don't need to track IsWritten. If A is written to, return immediately.
-
- for (Use &U : A->uses()) {
- Visited.insert(&U);
- Worklist.push_back(&U);
- }
-
- while (!Worklist.empty()) {
- Use *U = Worklist.pop_back_val();
- Instruction *I = cast<Instruction>(U->getUser());
-
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::PHI:
- case Instruction::Select:
- case Instruction::AddrSpaceCast:
- // The original value is not read/written via this if the new value isn't.
- for (Use &UU : I->uses())
- if (Visited.insert(&UU).second)
- Worklist.push_back(&UU);
- break;
-
- case Instruction::Call:
- case Instruction::Invoke: {
- bool Captures = true;
-
- if (I->getType()->isVoidTy())
- Captures = false;
-
- auto AddUsersToWorklistIfCapturing = [&] {
- if (Captures)
- for (Use &UU : I->uses())
- if (Visited.insert(&UU).second)
- Worklist.push_back(&UU);
- };
-
- CallBase &CB = cast<CallBase>(*I);
- if (CB.doesNotAccessMemory()) {
- AddUsersToWorklistIfCapturing();
- continue;
- }
-
- Function *F = CB.getCalledFunction();
- if (!F) {
- if (CB.onlyReadsMemory()) {
- IsRead = true;
- AddUsersToWorklistIfCapturing();
- continue;
- }
- return Attribute::None;
- }
-
- // Note: the callee and the two successor blocks *follow* the argument
- // operands. This means there is no need to adjust UseIndex to account
- // for these.
-
- unsigned UseIndex = std::distance(CB.arg_begin(), U);
-
- // U cannot be the callee operand use: since we're exploring the
- // transitive uses of an Argument, having such a use be a callee would
- // imply the call site is an indirect call or invoke; and we'd take the
- // early exit above.
- assert(UseIndex < CB.data_operands_size() &&
- "Data operand use expected!");
-
- bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
-
- if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
- assert(F->isVarArg() && "More params than args in non-varargs call");
- return Attribute::None;
- }
-
- Captures &= !CB.doesNotCapture(UseIndex);
-
- // Since the optimizer (by design) cannot see the data flow corresponding
- // to an operand bundle use, these cannot participate in the optimistic SCC
- // analysis. Instead, we model the operand bundle uses as arguments in a
- // call to a function external to the SCC.
- if (IsOperandBundleUse ||
- !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
-
- // The accessors used on call site here do the right thing for calls and
- // invokes with operand bundles.
-
- if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
- return Attribute::None;
- if (!CB.doesNotAccessMemory(UseIndex))
- IsRead = true;
- }
-
- AddUsersToWorklistIfCapturing();
- break;
- }
-
- case Instruction::Load:
- // A volatile load has side effects beyond what the readonly attribute
- // allows us to assume.
- if (cast<LoadInst>(I)->isVolatile())
- return Attribute::None;
-
- IsRead = true;
- break;
-
- case Instruction::ICmp:
- case Instruction::Ret:
- break;
-
- default:
- return Attribute::None;
- }
- }
-
- return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
-}
-
-/// Deduce returned attributes for the SCC.
-static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
- // Check each function in turn, determining if an argument is always returned.
- for (Function *F : SCCNodes) {
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- continue;
-
- if (F->getReturnType()->isVoidTy())
- continue;
-
- // There is nothing to do if an argument is already marked as 'returned'.
- if (llvm::any_of(F->args(),
- [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
- continue;
-
- auto FindRetArg = [&]() -> Value * {
- Value *RetArg = nullptr;
- for (BasicBlock &BB : *F)
- if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
- // Note that stripPointerCasts should look through functions with
- // returned arguments.
- Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
- if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
- return nullptr;
-
- if (!RetArg)
- RetArg = RetVal;
- else if (RetArg != RetVal)
- return nullptr;
- }
-
- return RetArg;
- };
-
- if (Value *RetArg = FindRetArg()) {
- auto *A = cast<Argument>(RetArg);
- A->addAttr(Attribute::Returned);
- ++NumReturned;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// If a callsite has arguments that are also arguments to the parent function,
-/// try to propagate attributes from the callsite's arguments to the parent's
-/// arguments. This may be important because inlining can cause information loss
-/// when attribute knowledge disappears with the inlined call.
-static bool addArgumentAttrsFromCallsites(Function &F) {
- if (!EnableNonnullArgPropagation)
- return false;
-
- bool Changed = false;
-
- // For an argument attribute to transfer from a callsite to the parent, the
- // call must be guaranteed to execute every time the parent is called.
- // Conservatively, just check for calls in the entry block that are guaranteed
- // to execute.
- // TODO: This could be enhanced by testing if the callsite post-dominates the
- // entry block or by doing simple forward walks or backward walks to the
- // callsite.
- BasicBlock &Entry = F.getEntryBlock();
- for (Instruction &I : Entry) {
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (auto *CalledFunc = CB->getCalledFunction()) {
- for (auto &CSArg : CalledFunc->args()) {
+
+ // Add in the new attribute.
+ if (WritesMemory && !ReadsMemory)
+ F->addFnAttr(Attribute::WriteOnly);
+ else
+ F->addFnAttr(ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
+
+ if (WritesMemory && !ReadsMemory)
+ ++NumWriteOnly;
+ else if (ReadsMemory)
+ ++NumReadOnly;
+ else
+ ++NumReadNone;
+ }
+
+ return MadeChange;
+}
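
For illustration only, here is a minimal hypothetical source-level function (invented name) of the kind the deduction above targets: its body contains nothing but loads, so checkFunctionMemoryAccess would classify it as read-only and addReadAttrs could mark the function readonly.

    // Hypothetical example: the loop only loads through 'n'; there are no
    // stores and no calls that may write, so the body is read-only.
    struct Node { Node *next; };

    int length(const Node *n) {
      int count = 0;
      for (const Node *p = n; p; p = p->next)   // loads only
        ++count;
      return count;
    }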
+
+namespace {
+
+/// For a given pointer Argument, this retains a list of Arguments of functions
+/// in the same SCC that the pointer data flows into. We use this to build an
+/// SCC of the arguments.
+struct ArgumentGraphNode {
+ Argument *Definition;
+ SmallVector<ArgumentGraphNode *, 4> Uses;
+};
+
+class ArgumentGraph {
+ // We store pointers to ArgumentGraphNode objects, so it's important
+ // that they not move around upon insert.
+ using ArgumentMapTy = std::map<Argument *, ArgumentGraphNode>;
+
+ ArgumentMapTy ArgumentMap;
+
+ // There is no root node for the argument graph, in fact:
+ // void f(int *x, int *y) { if (...) f(x, y); }
+ // is an example where the graph is disconnected. The SCCIterator requires a
+ // single entry point, so we maintain a fake ("synthetic") root node that
+ // uses every node. Because the graph is directed and nothing points into
+ // the root, it will not participate in any SCCs (except for its own).
+ ArgumentGraphNode SyntheticRoot;
+
+public:
+ ArgumentGraph() { SyntheticRoot.Definition = nullptr; }
+
+ using iterator = SmallVectorImpl<ArgumentGraphNode *>::iterator;
+
+ iterator begin() { return SyntheticRoot.Uses.begin(); }
+ iterator end() { return SyntheticRoot.Uses.end(); }
+ ArgumentGraphNode *getEntryNode() { return &SyntheticRoot; }
+
+ ArgumentGraphNode *operator[](Argument *A) {
+ ArgumentGraphNode &Node = ArgumentMap[A];
+ Node.Definition = A;
+ SyntheticRoot.Uses.push_back(&Node);
+ return &Node;
+ }
+};
+
+/// This tracker checks whether callees are in the SCC, and if so it does not
+/// consider that a capture, instead adding it to the "Uses" list and
+/// continuing with the analysis.
+struct ArgumentUsesTracker : public CaptureTracker {
+ ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {}
+
+ void tooManyUses() override { Captured = true; }
+
+ bool captured(const Use *U) override {
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (!CB) {
+ Captured = true;
+ return true;
+ }
+
+ Function *F = CB->getCalledFunction();
+ if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
+ Captured = true;
+ return true;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account for
+ // these.
+
+ unsigned UseIndex =
+ std::distance(const_cast<const Use *>(CB->arg_begin()), U);
+
+ assert(UseIndex < CB->data_operands_size() &&
+ "Indirect function calls should have been filtered above!");
+
+ if (UseIndex >= CB->getNumArgOperands()) {
+ // Data operand, but not an argument operand -- must be a bundle operand
+ assert(CB->hasOperandBundles() && "Must be!");
+
+ // CaptureTracking told us that we're being captured by an operand bundle
+ // use. In this case it does not matter if the callee is within our SCC
+ // or not -- we've been captured in some unknown way, and we have to be
+ // conservative.
+ Captured = true;
+ return true;
+ }
+
+ if (UseIndex >= F->arg_size()) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ Captured = true;
+ return true;
+ }
+
+ Uses.push_back(&*std::next(F->arg_begin(), UseIndex));
+ return false;
+ }
+
+ // True only if certainly captured (used outside our SCC).
+ bool Captured = false;
+
+ // Uses within our SCC.
+ SmallVector<Argument *, 4> Uses;
+
+ const SCCNodeSet &SCCNodes;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct GraphTraits<ArgumentGraphNode *> {
+ using NodeRef = ArgumentGraphNode *;
+ using ChildIteratorType = SmallVectorImpl<ArgumentGraphNode *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef A) { return A; }
+ static ChildIteratorType child_begin(NodeRef N) { return N->Uses.begin(); }
+ static ChildIteratorType child_end(NodeRef N) { return N->Uses.end(); }
+};
+
+template <>
+struct GraphTraits<ArgumentGraph *> : public GraphTraits<ArgumentGraphNode *> {
+ static NodeRef getEntryNode(ArgumentGraph *AG) { return AG->getEntryNode(); }
+
+ static ChildIteratorType nodes_begin(ArgumentGraph *AG) {
+ return AG->begin();
+ }
+
+ static ChildIteratorType nodes_end(ArgumentGraph *AG) { return AG->end(); }
+};
+
+} // end namespace llvm
+
+/// Returns Attribute::None, Attribute::ReadOnly or Attribute::ReadNone.
+static Attribute::AttrKind
+determinePointerReadAttrs(Argument *A,
+ const SmallPtrSet<Argument *, 8> &SCCNodes) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ // inalloca arguments are always clobbered by the call.
+ if (A->hasInAllocaAttr() || A->hasPreallocatedAttr())
+ return Attribute::None;
+
+ bool IsRead = false;
+ // We don't need to track IsWritten. If A is written to, return immediately.
+
+ for (Use &U : A->uses()) {
+ Visited.insert(&U);
+ Worklist.push_back(&U);
+ }
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ // The original value is not read/written via this if the new value isn't.
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ break;
+
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ bool Captures = true;
+
+ if (I->getType()->isVoidTy())
+ Captures = false;
+
+ auto AddUsersToWorklistIfCapturing = [&] {
+ if (Captures)
+ for (Use &UU : I->uses())
+ if (Visited.insert(&UU).second)
+ Worklist.push_back(&UU);
+ };
+
+ CallBase &CB = cast<CallBase>(*I);
+ if (CB.doesNotAccessMemory()) {
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+
+ Function *F = CB.getCalledFunction();
+ if (!F) {
+ if (CB.onlyReadsMemory()) {
+ IsRead = true;
+ AddUsersToWorklistIfCapturing();
+ continue;
+ }
+ return Attribute::None;
+ }
+
+ // Note: the callee and the two successor blocks *follow* the argument
+ // operands. This means there is no need to adjust UseIndex to account
+ // for these.
+
+ unsigned UseIndex = std::distance(CB.arg_begin(), U);
+
+ // U cannot be the callee operand use: since we're exploring the
+ // transitive uses of an Argument, having such a use be a callee would
+ // imply the call site is an indirect call or invoke; and we'd take the
+ // early exit above.
+ assert(UseIndex < CB.data_operands_size() &&
+ "Data operand use expected!");
+
+ bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
+
+ if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
+ assert(F->isVarArg() && "More params than args in non-varargs call");
+ return Attribute::None;
+ }
+
+ Captures &= !CB.doesNotCapture(UseIndex);
+
+ // Since the optimizer (by design) cannot see the data flow corresponding
+ // to an operand bundle use, these cannot participate in the optimistic SCC
+ // analysis. Instead, we model the operand bundle uses as arguments in a
+ // call to a function external to the SCC.
+ if (IsOperandBundleUse ||
+ !SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
+
+ // The accessors used on call site here do the right thing for calls and
+ // invokes with operand bundles.
+
+ if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
+ return Attribute::None;
+ if (!CB.doesNotAccessMemory(UseIndex))
+ IsRead = true;
+ }
+
+ AddUsersToWorklistIfCapturing();
+ break;
+ }
+
+ case Instruction::Load:
+ // A volatile load has side effects beyond what the readonly attribute
+ // allows us to assume.
+ if (cast<LoadInst>(I)->isVolatile())
+ return Attribute::None;
+
+ IsRead = true;
+ break;
+
+ case Instruction::ICmp:
+ case Instruction::Ret:
+ break;
+
+ default:
+ return Attribute::None;
+ }
+ }
+
+ return IsRead ? Attribute::ReadOnly : Attribute::ReadNone;
+}
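
A hypothetical illustration of the per-argument classification above (invented names): 'p' is only loaded from, while 'q' is only compared and never dereferenced, so they would come back as ReadOnly and ReadNone respectively.

    // 'p': reached only by a load            -> Attribute::ReadOnly
    // 'q': reached only by an icmp, no loads -> Attribute::ReadNone
    int first_unless_same(const int *p, const int *q) {
      if (p == q)   // pointer comparison; no memory access through q
        return 0;
      return *p;    // read through p; never written
    }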
+
+/// Deduce returned attributes for the SCC.
+static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ // Check each function in turn, determining if an argument is always returned.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ if (F->getReturnType()->isVoidTy())
+ continue;
+
+ // There is nothing to do if an argument is already marked as 'returned'.
+ if (llvm::any_of(F->args(),
+ [](const Argument &Arg) { return Arg.hasReturnedAttr(); }))
+ continue;
+
+ auto FindRetArg = [&]() -> Value * {
+ Value *RetArg = nullptr;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ // Note that stripPointerCasts should look through functions with
+ // returned arguments.
+ Value *RetVal = Ret->getReturnValue()->stripPointerCasts();
+ if (!isa<Argument>(RetVal) || RetVal->getType() != F->getReturnType())
+ return nullptr;
+
+ if (!RetArg)
+ RetArg = RetVal;
+ else if (RetArg != RetVal)
+ return nullptr;
+ }
+
+ return RetArg;
+ };
+
+ if (Value *RetArg = FindRetArg()) {
+ auto *A = cast<Argument>(RetArg);
+ A->addAttr(Attribute::Returned);
+ ++NumReturned;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
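
A hypothetical source-level example for the deduction above (invented name): every return statement yields the same pointer argument, so 'dst' would be a candidate for the 'returned' attribute.

    // Each ReturnInst returns 'dst' itself, so FindRetArg would settle on a
    // single argument and it could be marked Attribute::Returned.
    char *mark(char *dst, bool addDot) {
      if (!addDot)
        return dst;   // returns the argument
      *dst = '.';
      return dst;     // returns the same argument again
    }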
+
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
+static bool addArgumentAttrsFromCallsites(Function &F) {
+ if (!EnableNonnullArgPropagation)
+ return false;
+
+ bool Changed = false;
+
+ // For an argument attribute to transfer from a callsite to the parent, the
+ // call must be guaranteed to execute every time the parent is called.
+ // Conservatively, just check for calls in the entry block that are guaranteed
+ // to execute.
+ // TODO: This could be enhanced by testing if the callsite post-dominates the
+ // entry block or by doing simple forward walks or backward walks to the
+ // callsite.
+ BasicBlock &Entry = F.getEntryBlock();
+ for (Instruction &I : Entry) {
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (auto *CalledFunc = CB->getCalledFunction()) {
+ for (auto &CSArg : CalledFunc->args()) {
if (!CSArg.hasNonNullAttr(/* AllowUndefOrPoison */ false))
- continue;
-
- // If the non-null callsite argument operand is an argument to 'F'
- // (the caller) and the call is guaranteed to execute, then the value
- // must be non-null throughout 'F'.
- auto *FArg = dyn_cast<Argument>(CB->getArgOperand(CSArg.getArgNo()));
- if (FArg && !FArg->hasNonNullAttr()) {
- FArg->addAttr(Attribute::NonNull);
- Changed = true;
- }
- }
- }
- }
- if (!isGuaranteedToTransferExecutionToSuccessor(&I))
- break;
- }
-
- return Changed;
-}
-
-static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
- assert((R == Attribute::ReadOnly || R == Attribute::ReadNone)
- && "Must be a Read attribute.");
- assert(A && "Argument must not be null.");
-
- // If the argument already has the attribute, nothing needs to be done.
- if (A->hasAttribute(R))
- return false;
-
- // Otherwise, remove potentially conflicting attributes, add the new one,
- // and update statistics.
- A->removeAttr(Attribute::WriteOnly);
- A->removeAttr(Attribute::ReadOnly);
- A->removeAttr(Attribute::ReadNone);
- A->addAttr(R);
- R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
- return true;
-}
-
-/// Deduce nocapture attributes for the SCC.
-static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
- bool Changed = false;
-
- ArgumentGraph AG;
-
- // Check each function in turn, determining which pointer arguments are not
- // captured.
- for (Function *F : SCCNodes) {
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- continue;
-
- Changed |= addArgumentAttrsFromCallsites(*F);
-
- // Functions that are readonly (or readnone) and nounwind and don't return
- // a value can't capture arguments. Don't analyze them.
- if (F->onlyReadsMemory() && F->doesNotThrow() &&
- F->getReturnType()->isVoidTy()) {
- for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
- ++A) {
- if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
- }
- continue;
- }
-
- for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
- ++A) {
- if (!A->getType()->isPointerTy())
- continue;
- bool HasNonLocalUses = false;
- if (!A->hasNoCaptureAttr()) {
- ArgumentUsesTracker Tracker(SCCNodes);
- PointerMayBeCaptured(&*A, &Tracker);
- if (!Tracker.Captured) {
- if (Tracker.Uses.empty()) {
- // If it's trivially not captured, mark it nocapture now.
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- } else {
- // If it's not trivially captured and not trivially not captured,
- // then it must be calling into another function in our SCC. Save
- // its particulars for Argument-SCC analysis later.
- ArgumentGraphNode *Node = AG[&*A];
- for (Argument *Use : Tracker.Uses) {
- Node->Uses.push_back(AG[Use]);
- if (Use != &*A)
- HasNonLocalUses = true;
- }
- }
- }
- // Otherwise, it's captured. Don't bother doing SCC analysis on it.
- }
- if (!HasNonLocalUses && !A->onlyReadsMemory()) {
- // Can we determine that it's readonly/readnone without doing an SCC?
- // Note that we don't allow any calls at all here, or else our result
- // will be dependent on the iteration order through the functions in the
- // SCC.
- SmallPtrSet<Argument *, 8> Self;
- Self.insert(&*A);
- Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
- if (R != Attribute::None)
- Changed = addReadAttr(A, R);
- }
- }
- }
-
- // The graph we've collected is partial because we stopped scanning for
- // argument uses once we solved the argument trivially. These partial nodes
- // show up as ArgumentGraphNode objects with an empty Uses list, and for
- // these nodes the final decision about whether they capture has already been
- // made. If the definition doesn't have a 'nocapture' attribute by now, it
- // captures.
-
- for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
- const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
- if (ArgumentSCC.size() == 1) {
- if (!ArgumentSCC[0]->Definition)
- continue; // synthetic root node
-
- // e.g. "void f(int* x) { if (...) f(x); }"
- if (ArgumentSCC[0]->Uses.size() == 1 &&
- ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
- Argument *A = ArgumentSCC[0]->Definition;
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
- continue;
- }
-
- bool SCCCaptured = false;
- for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
- I != E && !SCCCaptured; ++I) {
- ArgumentGraphNode *Node = *I;
- if (Node->Uses.empty()) {
- if (!Node->Definition->hasNoCaptureAttr())
- SCCCaptured = true;
- }
- }
- if (SCCCaptured)
- continue;
-
- SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
- // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
- // quickly looking up whether a given Argument is in this ArgumentSCC.
- for (ArgumentGraphNode *I : ArgumentSCC) {
- ArgumentSCCNodes.insert(I->Definition);
- }
-
- for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
- I != E && !SCCCaptured; ++I) {
- ArgumentGraphNode *N = *I;
- for (ArgumentGraphNode *Use : N->Uses) {
- Argument *A = Use->Definition;
- if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
- continue;
- SCCCaptured = true;
- break;
- }
- }
- if (SCCCaptured)
- continue;
-
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- A->addAttr(Attribute::NoCapture);
- ++NumNoCapture;
- Changed = true;
- }
-
- // We also want to compute readonly/readnone. With a small number of false
- // negatives, we can assume that any pointer which is captured isn't going
- // to be provably readonly or readnone, since by definition we can't
- // analyze all uses of a captured pointer.
- //
- // The false negatives happen when the pointer is captured by a function
- // that promises readonly/readnone behaviour on the pointer, then the
- // pointer's lifetime ends before anything that writes to arbitrary memory.
- // Also, a readonly/readnone pointer may be returned, but returning a
- // pointer is capturing it.
-
- Attribute::AttrKind ReadAttr = Attribute::ReadNone;
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
- if (K == Attribute::ReadNone)
- continue;
- if (K == Attribute::ReadOnly) {
- ReadAttr = Attribute::ReadOnly;
- continue;
- }
- ReadAttr = K;
- break;
- }
-
- if (ReadAttr != Attribute::None) {
- for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
- Argument *A = ArgumentSCC[i]->Definition;
- Changed = addReadAttr(A, ReadAttr);
- }
- }
- }
-
- return Changed;
-}
-
-/// Tests whether a function is "malloc-like".
-///
-/// A function is "malloc-like" if it returns either null or a pointer that
-/// doesn't alias any other pointer visible to the caller.
-static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
- SmallSetVector<Value *, 8> FlowsToReturn;
- for (BasicBlock &BB : *F)
- if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
- FlowsToReturn.insert(Ret->getReturnValue());
-
- for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
- Value *RetVal = FlowsToReturn[i];
-
- if (Constant *C = dyn_cast<Constant>(RetVal)) {
- if (!C->isNullValue() && !isa<UndefValue>(C))
- return false;
-
- continue;
- }
-
- if (isa<Argument>(RetVal))
- return false;
-
- if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
- switch (RVI->getOpcode()) {
- // Extend the analysis by looking upwards.
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::AddrSpaceCast:
- FlowsToReturn.insert(RVI->getOperand(0));
- continue;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(RVI);
- FlowsToReturn.insert(SI->getTrueValue());
- FlowsToReturn.insert(SI->getFalseValue());
- continue;
- }
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(RVI);
- for (Value *IncValue : PN->incoming_values())
- FlowsToReturn.insert(IncValue);
- continue;
- }
-
- // Check whether the pointer came from an allocation.
- case Instruction::Alloca:
- break;
- case Instruction::Call:
- case Instruction::Invoke: {
- CallBase &CB = cast<CallBase>(*RVI);
- if (CB.hasRetAttr(Attribute::NoAlias))
- break;
- if (CB.getCalledFunction() && SCCNodes.count(CB.getCalledFunction()))
- break;
- LLVM_FALLTHROUGH;
- }
- default:
- return false; // Did not come from an allocation.
- }
-
- if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
- return false;
- }
-
- return true;
-}
-
-/// Deduce noalias attributes for the SCC.
-static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
- // Check each function in turn, determining which functions return noalias
- // pointers.
- for (Function *F : SCCNodes) {
- // Already noalias.
- if (F->returnDoesNotAlias())
- continue;
-
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- return false;
-
- // We annotate noalias return values, which are only applicable to
- // pointer types.
- if (!F->getReturnType()->isPointerTy())
- continue;
-
- if (!isFunctionMallocLike(F, SCCNodes))
- return false;
- }
-
- bool MadeChange = false;
- for (Function *F : SCCNodes) {
- if (F->returnDoesNotAlias() ||
- !F->getReturnType()->isPointerTy())
- continue;
-
- F->setReturnDoesNotAlias();
- ++NumNoAlias;
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-/// Tests whether this function is known to not return null.
-///
-/// Requires that the function returns a pointer.
-///
-/// Returns true if it believes the function will not return a null, and sets
-/// \p Speculative based on whether the returned conclusion is a speculative
-/// conclusion due to SCC calls.
-static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
- bool &Speculative) {
- assert(F->getReturnType()->isPointerTy() &&
- "nonnull only meaningful on pointer types");
- Speculative = false;
-
- SmallSetVector<Value *, 8> FlowsToReturn;
- for (BasicBlock &BB : *F)
- if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
- FlowsToReturn.insert(Ret->getReturnValue());
-
- auto &DL = F->getParent()->getDataLayout();
-
- for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
- Value *RetVal = FlowsToReturn[i];
-
- // If this value is locally known to be non-null, we're good
- if (isKnownNonZero(RetVal, DL))
- continue;
-
- // Otherwise, we need to look upwards since we can't make any local
- // conclusions.
- Instruction *RVI = dyn_cast<Instruction>(RetVal);
- if (!RVI)
- return false;
- switch (RVI->getOpcode()) {
- // Extend the analysis by looking upwards.
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::AddrSpaceCast:
- FlowsToReturn.insert(RVI->getOperand(0));
- continue;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(RVI);
- FlowsToReturn.insert(SI->getTrueValue());
- FlowsToReturn.insert(SI->getFalseValue());
- continue;
- }
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(RVI);
- for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- FlowsToReturn.insert(PN->getIncomingValue(i));
- continue;
- }
- case Instruction::Call:
- case Instruction::Invoke: {
- CallBase &CB = cast<CallBase>(*RVI);
- Function *Callee = CB.getCalledFunction();
- // A call to a node within the SCC is assumed to return null until
- // proven otherwise
- if (Callee && SCCNodes.count(Callee)) {
- Speculative = true;
- continue;
- }
- return false;
- }
- default:
- return false; // Unknown source, may be null
- };
- llvm_unreachable("should have either continued or returned");
- }
-
- return true;
-}
-
-/// Deduce nonnull attributes for the SCC.
-static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
- // Speculate that all functions in the SCC return only nonnull
- // pointers. We may refute this as we analyze functions.
- bool SCCReturnsNonNull = true;
-
- bool MadeChange = false;
-
- // Check each function in turn, determining which functions return nonnull
- // pointers.
- for (Function *F : SCCNodes) {
- // Already nonnull.
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull))
- continue;
-
- // We can infer and propagate function attributes only when we know that the
- // definition we'll get at link time is *exactly* the definition we see now.
- // For more details, see GlobalValue::mayBeDerefined.
- if (!F->hasExactDefinition())
- return false;
-
- // We annotate nonnull return values, which are only applicable to
- // pointer types.
- if (!F->getReturnType()->isPointerTy())
- continue;
-
- bool Speculative = false;
- if (isReturnNonNull(F, SCCNodes, Speculative)) {
- if (!Speculative) {
- // Mark the function eagerly since we may discover a function
- // which prevents us from speculating about the entire SCC
- LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
- << " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- ++NumNonNullReturn;
- MadeChange = true;
- }
- continue;
- }
- // At least one function returns something which could be null, can't
- // speculate any more.
- SCCReturnsNonNull = false;
- }
-
- if (SCCReturnsNonNull) {
- for (Function *F : SCCNodes) {
- if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
- Attribute::NonNull) ||
- !F->getReturnType()->isPointerTy())
- continue;
-
- LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
- F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
- ++NumNonNullReturn;
- MadeChange = true;
- }
- }
-
- return MadeChange;
-}
-
-namespace {
-
-/// Collects a set of attribute inference requests and performs them all in one
-/// go on a single SCC Node. Inference involves scanning function bodies
-/// looking for instructions that violate attribute assumptions.
-/// As soon as all the bodies are fine we are free to set the attribute.
-/// Customization of inference for individual attributes is performed by
-/// providing a handful of predicates for each attribute.
-class AttributeInferer {
-public:
- /// Describes a request for inference of a single attribute.
- struct InferenceDescriptor {
-
- /// Returns true if this function does not have to be handled.
- /// The general intent of this predicate is to provide an optimization
- /// for functions that do not need this attribute inference at all
- /// (say, for functions that already have the attribute).
- std::function<bool(const Function &)> SkipFunction;
-
- /// Returns true if this instruction violates attribute assumptions.
- std::function<bool(Instruction &)> InstrBreaksAttribute;
-
- /// Sets the inferred attribute for this function.
- std::function<void(Function &)> SetAttribute;
-
- /// Attribute we derive.
- Attribute::AttrKind AKind;
-
- /// If true, only "exact" definitions can be used to infer this attribute.
- /// See GlobalValue::isDefinitionExact.
- bool RequiresExactDefinition;
-
- InferenceDescriptor(Attribute::AttrKind AK,
- std::function<bool(const Function &)> SkipFunc,
- std::function<bool(Instruction &)> InstrScan,
- std::function<void(Function &)> SetAttr,
- bool ReqExactDef)
- : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
- SetAttribute(SetAttr), AKind(AK),
- RequiresExactDefinition(ReqExactDef) {}
- };
-
-private:
- SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
-
-public:
- void registerAttrInference(InferenceDescriptor AttrInference) {
- InferenceDescriptors.push_back(AttrInference);
- }
-
- bool run(const SCCNodeSet &SCCNodes);
-};
-
-/// Perform all the requested attribute inference actions according to the
-/// attribute predicates stored before.
-bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
- SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
- // Go through all the functions in SCC and check corresponding attribute
- // assumptions for each of them. Attributes that are invalid for this SCC
- // will be removed from InferInSCC.
- for (Function *F : SCCNodes) {
-
- // No attributes whose assumptions are still valid - done.
- if (InferInSCC.empty())
- return false;
-
- // Check if our attributes ever need scanning/can be scanned.
- llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
- if (ID.SkipFunction(*F))
- return false;
-
- // Remove from further inference (invalidate) when visiting a function
- // that has no instructions to scan/has an unsuitable definition.
- return F->isDeclaration() ||
- (ID.RequiresExactDefinition && !F->hasExactDefinition());
- });
-
- // For each attribute still in InferInSCC that doesn't explicitly skip F,
- // set up the F instructions scan to verify assumptions of the attribute.
- SmallVector<InferenceDescriptor, 4> InferInThisFunc;
- llvm::copy_if(
- InferInSCC, std::back_inserter(InferInThisFunc),
- [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
-
- if (InferInThisFunc.empty())
- continue;
-
- // Start instruction scan.
- for (Instruction &I : instructions(*F)) {
- llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
- if (!ID.InstrBreaksAttribute(I))
- return false;
- // Remove attribute from further inference on any other functions
- // because attribute assumptions have just been violated.
- llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
- return D.AKind == ID.AKind;
- });
- // Remove attribute from the rest of current instruction scan.
- return true;
- });
-
- if (InferInThisFunc.empty())
- break;
- }
- }
-
- if (InferInSCC.empty())
- return false;
-
- bool Changed = false;
- for (Function *F : SCCNodes)
- // At this point InferInSCC contains only functions that were either:
- // - explicitly skipped from scan/inference, or
- // - verified to have no instructions that break attribute assumptions.
- // Hence we just go and force the attribute for all non-skipped functions.
- for (auto &ID : InferInSCC) {
- if (ID.SkipFunction(*F))
- continue;
- Changed = true;
- ID.SetAttribute(*F);
- }
- return Changed;
-}
-
+ continue;
+
+ // If the non-null callsite argument operand is an argument to 'F'
+ // (the caller) and the call is guaranteed to execute, then the value
+ // must be non-null throughout 'F'.
+ auto *FArg = dyn_cast<Argument>(CB->getArgOperand(CSArg.getArgNo()));
+ if (FArg && !FArg->hasNonNullAttr()) {
+ FArg->addAttr(Attribute::NonNull);
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ }
+
+ return Changed;
+}
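
A hedged sketch of the propagation above; the callee and its annotation are assumptions, not part of this code. If the caller's entry block unconditionally passes its own argument to a callee whose matching parameter already carries a non-null guarantee at the IR level, the caller's argument can inherit nonnull.

    // Assumed: 'log_line' is defined elsewhere and its parameter is marked
    // non-null (e.g. via the GNU nonnull attribute) in the emitted IR.
    void log_line(const char *msg) __attribute__((nonnull(1)));

    void handle(const char *msg) {
      log_line(msg);   // guaranteed to execute from the entry block, so 'msg'
                       // in 'handle' is a candidate for nonnull as well
    }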
+
+static bool addReadAttr(Argument *A, Attribute::AttrKind R) {
+ assert((R == Attribute::ReadOnly || R == Attribute::ReadNone)
+ && "Must be a Read attribute.");
+ assert(A && "Argument must not be null.");
+
+ // If the argument already has the attribute, nothing needs to be done.
+ if (A->hasAttribute(R))
+ return false;
+
+ // Otherwise, remove potentially conflicting attributes, add the new one,
+ // and update statistics.
+ A->removeAttr(Attribute::WriteOnly);
+ A->removeAttr(Attribute::ReadOnly);
+ A->removeAttr(Attribute::ReadNone);
+ A->addAttr(R);
+ R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
+ return true;
+}
+
+/// Deduce nocapture attributes for the SCC.
+static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
+ bool Changed = false;
+
+ ArgumentGraph AG;
+
+ // Check each function in turn, determining which pointer arguments are not
+ // captured.
+ for (Function *F : SCCNodes) {
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ continue;
+
+ Changed |= addArgumentAttrsFromCallsites(*F);
+
+ // Functions that are readonly (or readnone) and nounwind and don't return
+ // a value can't capture arguments. Don't analyze them.
+ if (F->onlyReadsMemory() && F->doesNotThrow() &&
+ F->getReturnType()->isVoidTy()) {
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ }
+ continue;
+ }
+
+ for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
+ ++A) {
+ if (!A->getType()->isPointerTy())
+ continue;
+ bool HasNonLocalUses = false;
+ if (!A->hasNoCaptureAttr()) {
+ ArgumentUsesTracker Tracker(SCCNodes);
+ PointerMayBeCaptured(&*A, &Tracker);
+ if (!Tracker.Captured) {
+ if (Tracker.Uses.empty()) {
+ // If it's trivially not captured, mark it nocapture now.
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ } else {
+ // If it's not trivially captured and not trivially not captured,
+ // then it must be calling into another function in our SCC. Save
+ // its particulars for Argument-SCC analysis later.
+ ArgumentGraphNode *Node = AG[&*A];
+ for (Argument *Use : Tracker.Uses) {
+ Node->Uses.push_back(AG[Use]);
+ if (Use != &*A)
+ HasNonLocalUses = true;
+ }
+ }
+ }
+ // Otherwise, it's captured. Don't bother doing SCC analysis on it.
+ }
+ if (!HasNonLocalUses && !A->onlyReadsMemory()) {
+ // Can we determine that it's readonly/readnone without doing an SCC?
+ // Note that we don't allow any calls at all here, or else our result
+ // will be dependent on the iteration order through the functions in the
+ // SCC.
+ SmallPtrSet<Argument *, 8> Self;
+ Self.insert(&*A);
+ Attribute::AttrKind R = determinePointerReadAttrs(&*A, Self);
+ if (R != Attribute::None)
+ Changed = addReadAttr(A, R);
+ }
+ }
+ }
+
+ // The graph we've collected is partial because we stopped scanning for
+ // argument uses once we solved the argument trivially. These partial nodes
+ // show up as ArgumentGraphNode objects with an empty Uses list, and for
+ // these nodes the final decision about whether they capture has already been
+ // made. If the definition doesn't have a 'nocapture' attribute by now, it
+ // captures.
+
+ for (scc_iterator<ArgumentGraph *> I = scc_begin(&AG); !I.isAtEnd(); ++I) {
+ const std::vector<ArgumentGraphNode *> &ArgumentSCC = *I;
+ if (ArgumentSCC.size() == 1) {
+ if (!ArgumentSCC[0]->Definition)
+ continue; // synthetic root node
+
+ // e.g. "void f(int* x) { if (...) f(x); }"
+ if (ArgumentSCC[0]->Uses.size() == 1 &&
+ ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
+ Argument *A = ArgumentSCC[0]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+ continue;
+ }
+
+ bool SCCCaptured = false;
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *Node = *I;
+ if (Node->Uses.empty()) {
+ if (!Node->Definition->hasNoCaptureAttr())
+ SCCCaptured = true;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ SmallPtrSet<Argument *, 8> ArgumentSCCNodes;
+ // Fill ArgumentSCCNodes with the elements of the ArgumentSCC. Used for
+ // quickly looking up whether a given Argument is in this ArgumentSCC.
+ for (ArgumentGraphNode *I : ArgumentSCC) {
+ ArgumentSCCNodes.insert(I->Definition);
+ }
+
+ for (auto I = ArgumentSCC.begin(), E = ArgumentSCC.end();
+ I != E && !SCCCaptured; ++I) {
+ ArgumentGraphNode *N = *I;
+ for (ArgumentGraphNode *Use : N->Uses) {
+ Argument *A = Use->Definition;
+ if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A))
+ continue;
+ SCCCaptured = true;
+ break;
+ }
+ }
+ if (SCCCaptured)
+ continue;
+
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ A->addAttr(Attribute::NoCapture);
+ ++NumNoCapture;
+ Changed = true;
+ }
+
+ // We also want to compute readonly/readnone. With a small number of false
+ // negatives, we can assume that any pointer which is captured isn't going
+ // to be provably readonly or readnone, since by definition we can't
+ // analyze all uses of a captured pointer.
+ //
+ // The false negatives happen when the pointer is captured by a function
+ // that promises readonly/readnone behaviour on the pointer, then the
+ // pointer's lifetime ends before anything that writes to arbitrary memory.
+ // Also, a readonly/readnone pointer may be returned, but returning a
+ // pointer is capturing it.
+
+ Attribute::AttrKind ReadAttr = Attribute::ReadNone;
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ Attribute::AttrKind K = determinePointerReadAttrs(A, ArgumentSCCNodes);
+ if (K == Attribute::ReadNone)
+ continue;
+ if (K == Attribute::ReadOnly) {
+ ReadAttr = Attribute::ReadOnly;
+ continue;
+ }
+ ReadAttr = K;
+ break;
+ }
+
+ if (ReadAttr != Attribute::None) {
+ for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
+ Argument *A = ArgumentSCC[i]->Definition;
+ Changed = addReadAttr(A, ReadAttr);
+ }
+ }
+ }
+
+ return Changed;
+}
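
For the nocapture half of the deduction above, a small hypothetical example (invented name): the pointer is dereferenced, but its value is never stored, returned, or passed to an unknown callee, so it does not escape.

    // 'p' is only used to load from and store through; the pointer value
    // itself never escapes, so it is a candidate for Attribute::NoCapture.
    void bump(int *p) {
      *p += 1;
    }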
+
+/// Tests whether a function is "malloc-like".
+///
+/// A function is "malloc-like" if it returns either null or a pointer that
+/// doesn't alias any other pointer visible to the caller.
+static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ if (Constant *C = dyn_cast<Constant>(RetVal)) {
+ if (!C->isNullValue() && !isa<UndefValue>(C))
+ return false;
+
+ continue;
+ }
+
+ if (isa<Argument>(RetVal))
+ return false;
+
+ if (Instruction *RVI = dyn_cast<Instruction>(RetVal))
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (Value *IncValue : PN->incoming_values())
+ FlowsToReturn.insert(IncValue);
+ continue;
+ }
+
+ // Check whether the pointer came from an allocation.
+ case Instruction::Alloca:
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallBase &CB = cast<CallBase>(*RVI);
+ if (CB.hasRetAttr(Attribute::NoAlias))
+ break;
+ if (CB.getCalledFunction() && SCCNodes.count(CB.getCalledFunction()))
+ break;
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false; // Did not come from an allocation.
+ }
+
+ if (PointerMayBeCaptured(RetVal, false, /*StoreCaptures=*/false))
+ return false;
+ }
+
+ return true;
+}
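
A hypothetical function that would pass the malloc-like test above, assuming malloc is recognized as a noalias-returning call (as it normally is): every value flowing to a return is either null or a fresh allocation.

    #include <cstdlib>

    // Returns either null or the result of a noalias call (malloc), so the
    // function itself is a candidate for a noalias ("malloc-like") return.
    int *make_buffer(unsigned n) {
      if (n == 0)
        return nullptr;
      return static_cast<int *>(std::malloc(n * sizeof(int)));
    }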
+
+/// Deduce noalias attributes for the SCC.
+static bool addNoAliasAttrs(const SCCNodeSet &SCCNodes) {
+ // Check each function in turn, determining which functions return noalias
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already noalias.
+ if (F->returnDoesNotAlias())
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate noalias return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ if (!isFunctionMallocLike(F, SCCNodes))
+ return false;
+ }
+
+ bool MadeChange = false;
+ for (Function *F : SCCNodes) {
+ if (F->returnDoesNotAlias() ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ F->setReturnDoesNotAlias();
+ ++NumNoAlias;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// Tests whether this function is known to not return null.
+///
+/// Requires that the function returns a pointer.
+///
+/// Returns true if it believes the function will not return a null, and sets
+/// \p Speculative based on whether the returned conclusion is a speculative
+/// conclusion due to SCC calls.
+static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
+ bool &Speculative) {
+ assert(F->getReturnType()->isPointerTy() &&
+ "nonnull only meaningful on pointer types");
+ Speculative = false;
+
+ SmallSetVector<Value *, 8> FlowsToReturn;
+ for (BasicBlock &BB : *F)
+ if (auto *Ret = dyn_cast<ReturnInst>(BB.getTerminator()))
+ FlowsToReturn.insert(Ret->getReturnValue());
+
+ auto &DL = F->getParent()->getDataLayout();
+
+ for (unsigned i = 0; i != FlowsToReturn.size(); ++i) {
+ Value *RetVal = FlowsToReturn[i];
+
+ // If this value is locally known to be non-null, we're good
+ if (isKnownNonZero(RetVal, DL))
+ continue;
+
+ // Otherwise, we need to look upwards since we can't make any local
+ // conclusions.
+ Instruction *RVI = dyn_cast<Instruction>(RetVal);
+ if (!RVI)
+ return false;
+ switch (RVI->getOpcode()) {
+ // Extend the analysis by looking upwards.
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::AddrSpaceCast:
+ FlowsToReturn.insert(RVI->getOperand(0));
+ continue;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(RVI);
+ FlowsToReturn.insert(SI->getTrueValue());
+ FlowsToReturn.insert(SI->getFalseValue());
+ continue;
+ }
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(RVI);
+ for (int i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ FlowsToReturn.insert(PN->getIncomingValue(i));
+ continue;
+ }
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ CallBase &CB = cast<CallBase>(*RVI);
+ Function *Callee = CB.getCalledFunction();
+ // A call to a node within the SCC is assumed to return null until
+ // proven otherwise
+ if (Callee && SCCNodes.count(Callee)) {
+ Speculative = true;
+ continue;
+ }
+ return false;
+ }
+ default:
+ return false; // Unknown source, may be null
+ };
+ llvm_unreachable("should have either continued or returned");
+ }
+
+ return true;
+}
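
A hypothetical example for the non-null return test above (invented name): the only value flowing to the return is the address of a static object, which is locally known to be non-null, so no speculation about other SCC members is needed.

    // The address of a static is known non-null locally, so isReturnNonNull
    // would succeed without setting Speculative.
    int *shared_counter() {
      static int counter = 0;
      return &counter;
    }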
+
+/// Deduce nonnull attributes for the SCC.
+static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
+ // Speculate that all functions in the SCC return only nonnull
+ // pointers. We may refute this as we analyze functions.
+ bool SCCReturnsNonNull = true;
+
+ bool MadeChange = false;
+
+ // Check each function in turn, determining which functions return nonnull
+ // pointers.
+ for (Function *F : SCCNodes) {
+ // Already nonnull.
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull))
+ continue;
+
+ // We can infer and propagate function attributes only when we know that the
+ // definition we'll get at link time is *exactly* the definition we see now.
+ // For more details, see GlobalValue::mayBeDerefined.
+ if (!F->hasExactDefinition())
+ return false;
+
+ // We annotate nonnull return values, which are only applicable to
+ // pointer types.
+ if (!F->getReturnType()->isPointerTy())
+ continue;
+
+ bool Speculative = false;
+ if (isReturnNonNull(F, SCCNodes, Speculative)) {
+ if (!Speculative) {
+ // Mark the function eagerly since we may discover a function
+ // which prevents us from speculating about the entire SCC
+ LLVM_DEBUG(dbgs() << "Eagerly marking " << F->getName()
+ << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ continue;
+ }
+ // At least one function returns something which could be null; we can't
+ // speculate any more.
+ SCCReturnsNonNull = false;
+ }
+
+ if (SCCReturnsNonNull) {
+ for (Function *F : SCCNodes) {
+ if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
+ Attribute::NonNull) ||
+ !F->getReturnType()->isPointerTy())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
+ F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+ ++NumNonNullReturn;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
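Illustration only (names invented): a sketch of the speculative case handled above. The two functions form one SCC and return each other's results, so each return is at first only speculatively non-null; because neither ever produces a null, the whole SCC survives the scan and both functions are marked in the second loop.

static int Storage[2];

int *pick_b(int n);

// Mutually recursive returns: the call back into the SCC is treated as
// speculatively non-null, and the fallback returns a global's address.
int *pick_a(int n) { return n > 0 ? pick_b(n - 1) : &Storage[0]; }
int *pick_b(int n) { return n > 0 ? pick_a(n - 1) : &Storage[1]; }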
+
+namespace {
+
+/// Collects a set of attribute inference requests and performs them all in one
+/// go on a single SCC Node. Inference involves scanning function bodies
+/// looking for instructions that violate attribute assumptions.
+/// As soon as all the bodies are fine we are free to set the attribute.
+/// Customization of inference for individual attributes is performed by
+/// providing a handful of predicates for each attribute.
+class AttributeInferer {
+public:
+ /// Describes a request for inference of a single attribute.
+ struct InferenceDescriptor {
+
+ /// Returns true if this function does not have to be handled.
+ /// General intent for this predicate is to provide an optimization
+ /// for functions that do not need this attribute inference at all
+ /// (say, for functions that already have the attribute).
+ std::function<bool(const Function &)> SkipFunction;
+
+ /// Returns true if this instruction violates attribute assumptions.
+ std::function<bool(Instruction &)> InstrBreaksAttribute;
+
+ /// Sets the inferred attribute for this function.
+ std::function<void(Function &)> SetAttribute;
+
+ /// Attribute we derive.
+ Attribute::AttrKind AKind;
+
+ /// If true, only "exact" definitions can be used to infer this attribute.
+ /// See GlobalValue::isDefinitionExact.
+ bool RequiresExactDefinition;
+
+ InferenceDescriptor(Attribute::AttrKind AK,
+ std::function<bool(const Function &)> SkipFunc,
+ std::function<bool(Instruction &)> InstrScan,
+ std::function<void(Function &)> SetAttr,
+ bool ReqExactDef)
+ : SkipFunction(SkipFunc), InstrBreaksAttribute(InstrScan),
+ SetAttribute(SetAttr), AKind(AK),
+ RequiresExactDefinition(ReqExactDef) {}
+ };
+
+private:
+ SmallVector<InferenceDescriptor, 4> InferenceDescriptors;
+
+public:
+ void registerAttrInference(InferenceDescriptor AttrInference) {
+ InferenceDescriptors.push_back(AttrInference);
+ }
+
+ bool run(const SCCNodeSet &SCCNodes);
+};
+
+/// Perform all the requested attribute inference actions according to the
+/// attribute predicates stored before.
+bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
+ SmallVector<InferenceDescriptor, 4> InferInSCC = InferenceDescriptors;
+ // Go through all the functions in SCC and check corresponding attribute
+ // assumptions for each of them. Attributes that are invalid for this SCC
+ // will be removed from InferInSCC.
+ for (Function *F : SCCNodes) {
+
+ // No attributes whose assumptions are still valid - done.
+ if (InferInSCC.empty())
+ return false;
+
+ // Check if our attributes ever need scanning/can be scanned.
+ llvm::erase_if(InferInSCC, [F](const InferenceDescriptor &ID) {
+ if (ID.SkipFunction(*F))
+ return false;
+
+ // Remove from further inference (invalidate) when visiting a function
+ // that has no instructions to scan/has an unsuitable definition.
+ return F->isDeclaration() ||
+ (ID.RequiresExactDefinition && !F->hasExactDefinition());
+ });
+
+ // For each attribute still in InferInSCC that doesn't explicitly skip F,
+ // set up an instruction scan of F to verify that attribute's assumptions.
+ SmallVector<InferenceDescriptor, 4> InferInThisFunc;
+ llvm::copy_if(
+ InferInSCC, std::back_inserter(InferInThisFunc),
+ [F](const InferenceDescriptor &ID) { return !ID.SkipFunction(*F); });
+
+ if (InferInThisFunc.empty())
+ continue;
+
+ // Start instruction scan.
+ for (Instruction &I : instructions(*F)) {
+ llvm::erase_if(InferInThisFunc, [&](const InferenceDescriptor &ID) {
+ if (!ID.InstrBreaksAttribute(I))
+ return false;
+ // Remove attribute from further inference on any other functions
+ // because attribute assumptions have just been violated.
+ llvm::erase_if(InferInSCC, [&ID](const InferenceDescriptor &D) {
+ return D.AKind == ID.AKind;
+ });
+ // Remove attribute from the rest of current instruction scan.
+ return true;
+ });
+
+ if (InferInThisFunc.empty())
+ break;
+ }
+ }
+
+ if (InferInSCC.empty())
+ return false;
+
+ bool Changed = false;
+ for (Function *F : SCCNodes)
+ // At this point InferInSCC contains only functions that were either:
+ // - explicitly skipped from scan/inference, or
+ // - verified to have no instructions that break attribute assumptions.
+ // Hence we just go and force the attribute for all non-skipped functions.
+ for (auto &ID : InferInSCC) {
+ if (ID.SkipFunction(*F))
+ continue;
+ Changed = true;
+ ID.SetAttribute(*F);
+ }
+ return Changed;
+}
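A standalone sketch (plain C++, not LLVM code; all names invented) of the scan-and-invalidate pattern used above: each descriptor is dropped as soon as one scanned item violates its predicate, and whatever survives the scan is applied at the end.

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

struct Descriptor {
  std::function<bool(int)> Breaks; // does this item violate the assumption?
  std::function<void()> Apply;     // applied only if nothing violated it
};

int main() {
  std::vector<int> Items = {2, 4, 7, 8};
  std::vector<Descriptor> Pending = {
      {[](int V) { return V % 2 != 0; }, [] { std::cout << "all even\n"; }},
      {[](int V) { return V > 100; }, [] { std::cout << "all small\n"; }}};

  for (int V : Items)
    Pending.erase(std::remove_if(Pending.begin(), Pending.end(),
                                 [&](const Descriptor &D) { return D.Breaks(V); }),
                  Pending.end());

  for (const Descriptor &D : Pending)
    D.Apply(); // prints "all small"; "all even" was invalidated by 7
}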
+
struct SCCNodesResult {
SCCNodeSet SCCNodes;
bool HasUnknownCall;
};
-} // end anonymous namespace
-
-/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNonConvergent(Instruction &I,
- const SCCNodeSet &SCCNodes) {
- const CallBase *CB = dyn_cast<CallBase>(&I);
- // Breaks the non-convergent assumption if CB is a convergent call to a function
- // not in the SCC.
- return CB && CB->isConvergent() &&
- SCCNodes.count(CB->getCalledFunction()) == 0;
-}
-
-/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
- if (!I.mayThrow())
- return false;
- if (const auto *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- // I is a may-throw call to a function inside our SCC. This doesn't
- // invalidate our current working assumption that the SCC is no-throw; we
- // just have to scan that other function.
+} // end anonymous namespace
+
+/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonConvergent(Instruction &I,
+ const SCCNodeSet &SCCNodes) {
+ const CallBase *CB = dyn_cast<CallBase>(&I);
+ // Breaks the non-convergent assumption if CB is a convergent call to a function
+ // not in the SCC.
+ return CB && CB->isConvergent() &&
+ SCCNodes.count(CB->getCalledFunction()) == 0;
+}
+
+/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
+ if (!I.mayThrow())
+ return false;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // I is a may-throw call to a function inside our SCC. This doesn't
+ // invalidate our current working assumption that the SCC is no-throw; we
+ // just have to scan that other function.
if (SCCNodes.contains(Callee))
- return false;
- }
- }
- return true;
-}
-
-/// Helper for NoFree inference predicate InstrBreaksAttribute.
-static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
- CallBase *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- return false;
-
- Function *Callee = CB->getCalledFunction();
- if (!Callee)
- return true;
-
- if (Callee->doesNotFreeMemory())
- return false;
-
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Helper for NoFree inference predicate InstrBreaksAttribute.
+static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ return false;
+
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee)
+ return true;
+
+ if (Callee->doesNotFreeMemory())
+ return false;
+
if (SCCNodes.contains(Callee))
- return false;
-
- return true;
-}
-
+ return false;
+
+ return true;
+}
+
/// Attempt to remove convergent function attribute when possible.
-///
-/// Returns true if any changes to function attributes were made.
+///
+/// Returns true if any changes to function attributes were made.
static bool inferConvergent(const SCCNodeSet &SCCNodes) {
- AttributeInferer AI;
-
- // Request to remove the convergent attribute from all functions in the SCC
- // if every callsite within the SCC is not convergent (except for calls
- // to functions within the SCC).
- // Note: Removal of the attr from the callsites will happen in
- // InstCombineCalls separately.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::Convergent,
- // Skip non-convergent functions.
- [](const Function &F) { return !F.isConvergent(); },
- // Instructions that break non-convergent assumption.
- [SCCNodes](Instruction &I) {
- return InstrBreaksNonConvergent(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
- << "\n");
- F.setNotConvergent();
- },
- /* RequiresExactDefinition= */ false});
+ AttributeInferer AI;
+
+ // Request to remove the convergent attribute from all functions in the SCC
+ // if every callsite within the SCC is not convergent (except for calls
+ // to functions within the SCC).
+ // Note: Removal of the attr from the callsites will happen in
+ // InstCombineCalls separately.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::Convergent,
+ // Skip non-convergent functions.
+ [](const Function &F) { return !F.isConvergent(); },
+ // Instructions that break non-convergent assumption.
+ [SCCNodes](Instruction &I) {
+ return InstrBreaksNonConvergent(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from fn " << F.getName()
+ << "\n");
+ F.setNotConvergent();
+ },
+ /* RequiresExactDefinition= */ false});
// Perform all the requested attribute inference actions.
return AI.run(SCCNodes);
}
-
+
/// Infer attributes from all functions in the SCC by scanning every
/// instruction for compliance to the attribute assumptions. Currently it
/// does:
@@ -1317,86 +1317,86 @@ static bool inferConvergent(const SCCNodeSet &SCCNodes) {
static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
AttributeInferer AI;
- if (!DisableNoUnwindInference)
- // Request to infer nounwind attribute for all the functions in the SCC if
- // every callsite within the SCC is not throwing (except for calls to
- // functions within the SCC). Note that nounwind attribute suffers from
- // derefinement - results may change depending on how functions are
- // optimized. Thus it can be inferred only from exact definitions.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::NoUnwind,
- // Skip non-throwing functions.
- [](const Function &F) { return F.doesNotThrow(); },
- // Instructions that break non-throwing assumption.
- [&SCCNodes](Instruction &I) {
- return InstrBreaksNonThrowing(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs()
- << "Adding nounwind attr to fn " << F.getName() << "\n");
- F.setDoesNotThrow();
- ++NumNoUnwind;
- },
- /* RequiresExactDefinition= */ true});
-
- if (!DisableNoFreeInference)
- // Request to infer nofree attribute for all the functions in the SCC if
- // every callsite within the SCC does not directly or indirectly free
- // memory (except for calls to functions within the SCC). Note that nofree
- // attribute suffers from derefinement - results may change depending on
- // how functions are optimized. Thus it can be inferred only from exact
- // definitions.
- AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
- Attribute::NoFree,
- // Skip functions known not to free memory.
- [](const Function &F) { return F.doesNotFreeMemory(); },
- // Instructions that break non-deallocating assumption.
- [&SCCNodes](Instruction &I) {
- return InstrBreaksNoFree(I, SCCNodes);
- },
- [](Function &F) {
- LLVM_DEBUG(dbgs()
- << "Adding nofree attr to fn " << F.getName() << "\n");
- F.setDoesNotFreeMemory();
- ++NumNoFree;
- },
- /* RequiresExactDefinition= */ true});
-
- // Perform all the requested attribute inference actions.
- return AI.run(SCCNodes);
-}
-
-static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
- // Try and identify functions that do not recurse.
-
- // If the SCC contains multiple nodes we know for sure there is recursion.
- if (SCCNodes.size() != 1)
- return false;
-
- Function *F = *SCCNodes.begin();
- if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
- return false;
-
- // If all of the calls in F are identifiable and are to norecurse functions, F
- // is norecurse. This check also detects self-recursion as F is not currently
- // marked norecurse, so any call from F to F will not be marked norecurse.
- for (auto &BB : *F)
- for (auto &I : BB.instructionsWithoutDebug())
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee == F || !Callee->doesNotRecurse())
- // Function calls a potentially recursive function.
- return false;
- }
-
- // Every call was to a non-recursive function other than this function, and
- // we have no indirect recursion as the SCC size is one. This function cannot
- // recurse.
+ if (!DisableNoUnwindInference)
+ // Request to infer nounwind attribute for all the functions in the SCC if
+ // every callsite within the SCC is not throwing (except for calls to
+ // functions within the SCC). Note that nounwind attribute suffers from
+ // derefinement - results may change depending on how functions are
+ // optimized. Thus it can be inferred only from exact definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoUnwind,
+ // Skip non-throwing functions.
+ [](const Function &F) { return F.doesNotThrow(); },
+ // Instructions that break non-throwing assumption.
+ [&SCCNodes](Instruction &I) {
+ return InstrBreaksNonThrowing(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nounwind attr to fn " << F.getName() << "\n");
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ if (!DisableNoFreeInference)
+ // Request to infer nofree attribute for all the functions in the SCC if
+ // every callsite within the SCC does not directly or indirectly free
+ // memory (except for calls to functions within the SCC). Note that nofree
+ // attribute suffers from derefinement - results may change depending on
+ // how functions are optimized. Thus it can be inferred only from exact
+ // definitions.
+ AI.registerAttrInference(AttributeInferer::InferenceDescriptor{
+ Attribute::NoFree,
+ // Skip functions known not to free memory.
+ [](const Function &F) { return F.doesNotFreeMemory(); },
+ // Instructions that break non-deallocating assumption.
+ [&SCCNodes](Instruction &I) {
+ return InstrBreaksNoFree(I, SCCNodes);
+ },
+ [](Function &F) {
+ LLVM_DEBUG(dbgs()
+ << "Adding nofree attr to fn " << F.getName() << "\n");
+ F.setDoesNotFreeMemory();
+ ++NumNoFree;
+ },
+ /* RequiresExactDefinition= */ true});
+
+ // Perform all the requested attribute inference actions.
+ return AI.run(SCCNodes);
+}
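Illustration only (names invented): source-level examples of the two predicates registered above. The first function contains no may-throw and no freeing instructions, so neither assumption is ever broken while scanning it; the second trips both.

#include <cstdlib>
#include <stdexcept>

// Nothing here may throw or free memory, so the nounwind/nofree descriptors
// are never invalidated while this body is scanned.
int quiet(int x) { return x * x; }

// The throw breaks the non-throwing assumption and the free() call breaks the
// non-deallocating one, so neither attribute can be inferred for this SCC.
int noisy(int *p) {
  if (!p)
    throw std::runtime_error("null");
  std::free(p);
  return 0;
}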
+
+static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
+ // Try and identify functions that do not recurse.
+
+ // If the SCC contains multiple nodes we know for sure there is recursion.
+ if (SCCNodes.size() != 1)
+ return false;
+
+ Function *F = *SCCNodes.begin();
+ if (!F || !F->hasExactDefinition() || F->doesNotRecurse())
+ return false;
+
+ // If all of the calls in F are identifiable and are to norecurse functions, F
+ // is norecurse. This check also detects self-recursion as F is not currently
+ // marked norecurse, so any call from F to F will not be marked norecurse.
+ for (auto &BB : *F)
+ for (auto &I : BB.instructionsWithoutDebug())
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee == F || !Callee->doesNotRecurse())
+ // Function calls a potentially recursive function.
+ return false;
+ }
+
+ // Every call was to a non-recursive function other than this function, and
+ // we have no indirect recursion as the SCC size is one. This function cannot
+ // recurse.
F->setDoesNotRecurse();
++NumNoRecurse;
return true;
-}
-
+}
+
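Illustration only (names invented): a sketch of the single-node-SCC reasoning above.

// leaf() makes no calls at all, so it is trivially norecurse.
int leaf(int x) { return x + 1; }

// helper() sits in a single-node SCC and calls only norecurse functions, so
// the loop above finds nothing suspicious and can mark it norecurse too.
int helper(int x) { return leaf(x) * 2; }

// self() calls itself; the Callee == F check above bails out immediately.
int self(int x) { return x > 0 ? self(x - 1) : 0; }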
static bool instructionDoesNotReturn(Instruction &I) {
if (auto *CB = dyn_cast<CallBase>(&I)) {
Function *Callee = CB->getCalledFunction();
@@ -1501,220 +1501,220 @@ static SCCNodesResult createSCCNodeSet(ArrayRef<Function *> Functions) {
return Res;
}
-template <typename AARGetterT>
+template <typename AARGetterT>
static bool deriveAttrsInPostOrder(ArrayRef<Function *> Functions,
AARGetterT &&AARGetter) {
SCCNodesResult Nodes = createSCCNodeSet(Functions);
- bool Changed = false;
-
- // Bail if the SCC only contains optnone functions.
+ bool Changed = false;
+
+ // Bail if the SCC only contains optnone functions.
if (Nodes.SCCNodes.empty())
- return Changed;
-
+ return Changed;
+
Changed |= addArgumentReturnedAttrs(Nodes.SCCNodes);
Changed |= addReadAttrs(Nodes.SCCNodes, AARGetter);
Changed |= addArgumentAttrs(Nodes.SCCNodes);
Changed |= inferConvergent(Nodes.SCCNodes);
Changed |= addNoReturnAttrs(Nodes.SCCNodes);
Changed |= addWillReturn(Nodes.SCCNodes);
-
- // If we have no external nodes participating in the SCC, we can deduce some
- // more precise attributes as well.
+
+ // If we have no external nodes participating in the SCC, we can deduce some
+ // more precise attributes as well.
if (!Nodes.HasUnknownCall) {
Changed |= addNoAliasAttrs(Nodes.SCCNodes);
Changed |= addNonNullAttrs(Nodes.SCCNodes);
Changed |= inferAttrsFromFunctionBodies(Nodes.SCCNodes);
Changed |= addNoRecurseAttrs(Nodes.SCCNodes);
- }
-
- return Changed;
-}
-
-PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG,
- CGSCCUpdateResult &) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
-
- // We pass a lambda into functions to wire them up to the analysis manager
- // for getting function analyses.
- auto AARGetter = [&](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- };
-
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ // We pass a lambda into functions to wire them up to the analysis manager
+ // for getting function analyses.
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+
SmallVector<Function *, 8> Functions;
- for (LazyCallGraph::Node &N : C) {
+ for (LazyCallGraph::Node &N : C) {
Functions.push_back(&N.getFunction());
- }
-
+ }
+
if (deriveAttrsInPostOrder(Functions, AARGetter))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
- initializePostOrderFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char PostOrderFunctionAttrsLegacyPass::ID = 0;
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+struct PostOrderFunctionAttrsLegacyPass : public CallGraphSCCPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ PostOrderFunctionAttrsLegacyPass() : CallGraphSCCPass(ID) {
+ initializePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char PostOrderFunctionAttrsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(PostOrderFunctionAttrsLegacyPass, "function-attrs",
- "Deduce function attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+ "Deduce function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(PostOrderFunctionAttrsLegacyPass, "function-attrs",
- "Deduce function attributes", false, false)
-
-Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
- return new PostOrderFunctionAttrsLegacyPass();
-}
-
-template <typename AARGetterT>
-static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
+ "Deduce function attributes", false, false)
+
+Pass *llvm::createPostOrderFunctionAttrsLegacyPass() {
+ return new PostOrderFunctionAttrsLegacyPass();
+}
+
+template <typename AARGetterT>
+static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
SmallVector<Function *, 8> Functions;
- for (CallGraphNode *I : SCC) {
+ for (CallGraphNode *I : SCC) {
Functions.push_back(I->getFunction());
- }
-
+ }
+
return deriveAttrsInPostOrder(Functions, AARGetter);
-}
-
-bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
- return runImpl(SCC, LegacyAARGetter(*this));
-}
-
-namespace {
-
-struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
- // Pass identification, replacement for typeid
- static char ID;
-
- ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeReversePostOrderFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<CallGraphWrapperPass>();
- AU.addPreserved<CallGraphWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
-
+}
+
+bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return runImpl(SCC, LegacyAARGetter(*this));
+}
+
+namespace {
+
+struct ReversePostOrderFunctionAttrsLegacyPass : public ModulePass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ ReversePostOrderFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<CallGraphWrapperPass>();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ReversePostOrderFunctionAttrsLegacyPass::ID = 0;
+
INITIALIZE_PASS_BEGIN(ReversePostOrderFunctionAttrsLegacyPass,
"rpo-function-attrs", "Deduce function attributes in RPO",
false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
INITIALIZE_PASS_END(ReversePostOrderFunctionAttrsLegacyPass,
"rpo-function-attrs", "Deduce function attributes in RPO",
false, false)
-
-Pass *llvm::createReversePostOrderFunctionAttrsPass() {
- return new ReversePostOrderFunctionAttrsLegacyPass();
-}
-
-static bool addNoRecurseAttrsTopDown(Function &F) {
- // We check the preconditions for the function prior to calling this to avoid
- // the cost of building up a reversible post-order list. We assert them here
- // to make sure none of the invariants this relies on were violated.
- assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
- assert(!F.doesNotRecurse() &&
- "This function has already been deduced as norecurs!");
- assert(F.hasInternalLinkage() &&
- "Can only do top-down deduction for internal linkage functions!");
-
- // If F is internal and all of its uses are calls from non-recursive
- // functions, then none of its calls could in fact recurse without going
- // through a function marked norecurse, and so we can mark this function too
- // as norecurse. Note that the uses must actually be calls -- otherwise
- // a pointer to this function could be returned from a norecurse function but
- // this function could be recursively (indirectly) called. Note that this
- // also detects if F is directly recursive as F is not yet marked as
- // a norecurse function.
- for (auto *U : F.users()) {
- auto *I = dyn_cast<Instruction>(U);
- if (!I)
- return false;
- CallBase *CB = dyn_cast<CallBase>(I);
- if (!CB || !CB->getParent()->getParent()->doesNotRecurse())
- return false;
- }
+
+Pass *llvm::createReversePostOrderFunctionAttrsPass() {
+ return new ReversePostOrderFunctionAttrsLegacyPass();
+}
+
+static bool addNoRecurseAttrsTopDown(Function &F) {
+ // We check the preconditions for the function prior to calling this to avoid
+ // the cost of building up a reversible post-order list. We assert them here
+ // to make sure none of the invariants this relies on were violated.
+ assert(!F.isDeclaration() && "Cannot deduce norecurse without a definition!");
+ assert(!F.doesNotRecurse() &&
+ "This function has already been deduced as norecurs!");
+ assert(F.hasInternalLinkage() &&
+ "Can only do top-down deduction for internal linkage functions!");
+
+ // If F is internal and all of its uses are calls from non-recursive
+ // functions, then none of its calls could in fact recurse without going
+ // through a function marked norecurse, and so we can mark this function too
+ // as norecurse. Note that the uses must actually be calls -- otherwise
+ // a pointer to this function could be returned from a norecurse function but
+ // this function could be recursively (indirectly) called. Note that this
+ // also detects if F is directly recursive as F is not yet marked as
+ // a norecurse function.
+ for (auto *U : F.users()) {
+ auto *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (!CB || !CB->getParent()->getParent()->doesNotRecurse())
+ return false;
+ }
F.setDoesNotRecurse();
++NumNoRecurse;
return true;
-}
-
-static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
- // We only have a post-order SCC traversal (because SCCs are inherently
- // discovered in post-order), so we accumulate them in a vector and then walk
- // it in reverse. This is simpler than using the RPO iterator infrastructure
- // because we need to combine SCC detection and the PO walk of the call
- // graph. We can also cheat egregiously because we're primarily interested in
- // synthesizing norecurse and so we can only save the singular SCCs as SCCs
- // with multiple functions in them will clearly be recursive.
- SmallVector<Function *, 16> Worklist;
- for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
- if (I->size() != 1)
- continue;
-
- Function *F = I->front()->getFunction();
- if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
- F->hasInternalLinkage())
- Worklist.push_back(F);
- }
-
- bool Changed = false;
- for (auto *F : llvm::reverse(Worklist))
- Changed |= addNoRecurseAttrsTopDown(*F);
-
- return Changed;
-}
-
-bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
-
- return deduceFunctionAttributeInRPO(M, CG);
-}
-
-PreservedAnalyses
-ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &CG = AM.getResult<CallGraphAnalysis>(M);
-
- if (!deduceFunctionAttributeInRPO(M, CG))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<CallGraphAnalysis>();
- return PA;
-}
+}
+
+static bool deduceFunctionAttributeInRPO(Module &M, CallGraph &CG) {
+ // We only have a post-order SCC traversal (because SCCs are inherently
+ // discovered in post-order), so we accumulate them in a vector and then walk
+ // it in reverse. This is simpler than using the RPO iterator infrastructure
+ // because we need to combine SCC detection and the PO walk of the call
+ // graph. We can also cheat egregiously because we're primarily interested in
+ // synthesizing norecurse and so we can only save the singular SCCs as SCCs
+ // with multiple functions in them will clearly be recursive.
+ SmallVector<Function *, 16> Worklist;
+ for (scc_iterator<CallGraph *> I = scc_begin(&CG); !I.isAtEnd(); ++I) {
+ if (I->size() != 1)
+ continue;
+
+ Function *F = I->front()->getFunction();
+ if (F && !F->isDeclaration() && !F->doesNotRecurse() &&
+ F->hasInternalLinkage())
+ Worklist.push_back(F);
+ }
+
+ bool Changed = false;
+ for (auto *F : llvm::reverse(Worklist))
+ Changed |= addNoRecurseAttrsTopDown(*F);
+
+ return Changed;
+}
+
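Illustration only (names invented, and the norecurse status of the caller is assumed rather than derived here): the kind of case the top-down walk above is for.

extern void log_line(const char *Msg); // assumed external, unanalyzable callee

namespace {
// helper() calls an unknown external function, so the bottom-up SCC pass
// cannot prove it norecurse on its own.
void helper() { log_line("tick"); }
} // namespace

// Assume driver() already carries norecurse and is helper()'s only user via a
// direct call; then no call chain can re-enter helper(), and the top-down walk
// may mark the internal-linkage helper() norecurse as well.
void driver() { helper(); }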
+bool ReversePostOrderFunctionAttrsLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+
+ return deduceFunctionAttributeInRPO(M, CG);
+}
+
+PreservedAnalyses
+ReversePostOrderFunctionAttrsPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &CG = AM.getResult<CallGraphAnalysis>(M);
+
+ if (!deduceFunctionAttributeInRPO(M, CG))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
index f99358e70b..18343030bc 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/FunctionImport.cpp
@@ -1,558 +1,558 @@
-//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements Function import based on summaries.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/FunctionImport.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/IR/AutoUpgrade.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Linker/IRMover.h"
-#include "llvm/Object/ModuleSymbolTable.h"
-#include "llvm/Object/SymbolicFile.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO/Internalize.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <memory>
-#include <set>
-#include <string>
-#include <system_error>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "function-import"
-
-STATISTIC(NumImportedFunctionsThinLink,
- "Number of functions thin link decided to import");
-STATISTIC(NumImportedHotFunctionsThinLink,
- "Number of hot functions thin link decided to import");
-STATISTIC(NumImportedCriticalFunctionsThinLink,
- "Number of critical functions thin link decided to import");
-STATISTIC(NumImportedGlobalVarsThinLink,
- "Number of global variables thin link decided to import");
-STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
-STATISTIC(NumImportedGlobalVars,
- "Number of global variables imported in backend");
-STATISTIC(NumImportedModules, "Number of modules imported from");
-STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
-STATISTIC(NumLiveSymbols, "Number of live symbols in index");
-
-/// Limit on instruction count of imported functions.
-static cl::opt<unsigned> ImportInstrLimit(
- "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
- cl::desc("Only import functions with less than N instructions"));
-
-static cl::opt<int> ImportCutoff(
- "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
- cl::desc("Only import first N functions if N>=0 (default -1)"));
-
-static cl::opt<float>
- ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
- cl::Hidden, cl::value_desc("x"),
- cl::desc("As we import functions, multiply the "
- "`import-instr-limit` threshold by this factor "
- "before processing newly imported functions"));
-
-static cl::opt<float> ImportHotInstrFactor(
- "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
- cl::value_desc("x"),
- cl::desc("As we import functions called from hot callsite, multiply the "
- "`import-instr-limit` threshold by this factor "
- "before processing newly imported functions"));
-
-static cl::opt<float> ImportHotMultiplier(
- "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"),
- cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
-
-static cl::opt<float> ImportCriticalMultiplier(
- "import-critical-multiplier", cl::init(100.0), cl::Hidden,
- cl::value_desc("x"),
- cl::desc(
- "Multiply the `import-instr-limit` threshold for critical callsites"));
-
-// FIXME: This multiplier was not really tuned up.
-static cl::opt<float> ImportColdMultiplier(
- "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
- cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
-
-static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
- cl::desc("Print imported functions"));
-
-static cl::opt<bool> PrintImportFailures(
- "print-import-failures", cl::init(false), cl::Hidden,
- cl::desc("Print information for functions rejected for importing"));
-
-static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
- cl::desc("Compute dead symbols"));
-
-static cl::opt<bool> EnableImportMetadata(
+//===- FunctionImport.cpp - ThinLTO Summary-based Function Import ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Function import based on summaries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/IR/AutoUpgrade.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Linker/IRMover.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "function-import"
+
+STATISTIC(NumImportedFunctionsThinLink,
+ "Number of functions thin link decided to import");
+STATISTIC(NumImportedHotFunctionsThinLink,
+ "Number of hot functions thin link decided to import");
+STATISTIC(NumImportedCriticalFunctionsThinLink,
+ "Number of critical functions thin link decided to import");
+STATISTIC(NumImportedGlobalVarsThinLink,
+ "Number of global variables thin link decided to import");
+STATISTIC(NumImportedFunctions, "Number of functions imported in backend");
+STATISTIC(NumImportedGlobalVars,
+ "Number of global variables imported in backend");
+STATISTIC(NumImportedModules, "Number of modules imported from");
+STATISTIC(NumDeadSymbols, "Number of dead stripped symbols in index");
+STATISTIC(NumLiveSymbols, "Number of live symbols in index");
+
+/// Limit on instruction count of imported functions.
+static cl::opt<unsigned> ImportInstrLimit(
+ "import-instr-limit", cl::init(100), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import functions with less than N instructions"));
+
+static cl::opt<int> ImportCutoff(
+ "import-cutoff", cl::init(-1), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Only import first N functions if N>=0 (default -1)"));
+
+static cl::opt<float>
+ ImportInstrFactor("import-instr-evolution-factor", cl::init(0.7),
+ cl::Hidden, cl::value_desc("x"),
+ cl::desc("As we import functions, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotInstrFactor(
+ "import-hot-evolution-factor", cl::init(1.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc("As we import functions called from hot callsite, multiply the "
+ "`import-instr-limit` threshold by this factor "
+ "before processing newly imported functions"));
+
+static cl::opt<float> ImportHotMultiplier(
+ "import-hot-multiplier", cl::init(10.0), cl::Hidden, cl::value_desc("x"),
+ cl::desc("Multiply the `import-instr-limit` threshold for hot callsites"));
+
+static cl::opt<float> ImportCriticalMultiplier(
+ "import-critical-multiplier", cl::init(100.0), cl::Hidden,
+ cl::value_desc("x"),
+ cl::desc(
+ "Multiply the `import-instr-limit` threshold for critical callsites"));
+
+// FIXME: This multiplier was not really tuned up.
+static cl::opt<float> ImportColdMultiplier(
+ "import-cold-multiplier", cl::init(0), cl::Hidden, cl::value_desc("N"),
+ cl::desc("Multiply the `import-instr-limit` threshold for cold callsites"));
+
+static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
+ cl::desc("Print imported functions"));
+
+static cl::opt<bool> PrintImportFailures(
+ "print-import-failures", cl::init(false), cl::Hidden,
+ cl::desc("Print information for functions rejected for importing"));
+
+static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
+ cl::desc("Compute dead symbols"));
+
+static cl::opt<bool> EnableImportMetadata(
"enable-import-metadata", cl::init(false), cl::Hidden,
cl::desc("Enable import metadata like 'thinlto_src_module'"));
-
-/// Summary file to use for function importing when using -function-import from
-/// the command line.
-static cl::opt<std::string>
- SummaryFile("summary-file",
- cl::desc("The summary file to use for function importing."));
-
-/// Used when testing importing from distributed indexes via opt
-/// -function-import.
-static cl::opt<bool>
- ImportAllIndex("import-all-index",
- cl::desc("Import all external functions in index."));
-
-// Lazily load a module from \p FileName in \p Context.
-static std::unique_ptr<Module> loadFile(const std::string &FileName,
- LLVMContext &Context) {
- SMDiagnostic Err;
- LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
- // Metadata isn't loaded until functions are imported, to minimize
- // the memory overhead.
- std::unique_ptr<Module> Result =
- getLazyIRFileModule(FileName, Err, Context,
- /* ShouldLazyLoadMetadata = */ true);
- if (!Result) {
- Err.print("function-import", errs());
- report_fatal_error("Abort");
- }
-
- return Result;
-}
-
-/// Given a list of possible callee implementations for a call site, select one
-/// that fits the \p Threshold.
-///
-/// FIXME: select "best" instead of first that fits. But what is "best"?
-/// - The smallest: more likely to be inlined.
-/// - The one with the least outgoing edges (already well optimized).
-/// - One from a module already being imported from in order to reduce the
-/// number of source modules parsed/linked.
-/// - One that has PGO data attached.
-/// - [insert your fancy metric here]
-static const GlobalValueSummary *
-selectCallee(const ModuleSummaryIndex &Index,
- ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
- unsigned Threshold, StringRef CallerModulePath,
- FunctionImporter::ImportFailureReason &Reason,
- GlobalValue::GUID GUID) {
- Reason = FunctionImporter::ImportFailureReason::None;
- auto It = llvm::find_if(
- CalleeSummaryList,
- [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
- auto *GVSummary = SummaryPtr.get();
- if (!Index.isGlobalValueLive(GVSummary)) {
- Reason = FunctionImporter::ImportFailureReason::NotLive;
- return false;
- }
-
- // For SamplePGO, in computeImportForFunction the OriginalId
- // may have been used to locate the callee summary list (See
- // comment there).
- // The mapping from OriginalId to GUID may return a GUID
- // that corresponds to a static variable. Filter it out here.
- // This can happen when
- // 1) There is a call to a library function which is not defined
- // in the index.
- // 2) There is a static variable with the OriginalGUID identical
- // to the GUID of the library function in 1);
- // When this happens, the logic for SamplePGO kicks in and
- // the static variable in 2) will be found, which needs to be
- // filtered out.
- if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) {
- Reason = FunctionImporter::ImportFailureReason::GlobalVar;
- return false;
- }
- if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) {
- Reason = FunctionImporter::ImportFailureReason::InterposableLinkage;
- // There is no point in importing these, we can't inline them
- return false;
- }
-
- auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject());
-
- // If this is a local function, make sure we import the copy
- // in the caller's module. The only time a local function can
- // share an entry in the index is if there is a local with the same name
- // in another module that had the same source file name (in a different
- // directory), where each was compiled in its own directory so there
- // was no distinguishing path.
- // However, do the import from another module if there is only one
- // entry in the list - in that case this must be a reference due
- // to indirect call profile data, since a function pointer can point to
- // a local in another module.
- if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
- CalleeSummaryList.size() > 1 &&
- Summary->modulePath() != CallerModulePath) {
- Reason =
- FunctionImporter::ImportFailureReason::LocalLinkageNotInModule;
- return false;
- }
-
- if ((Summary->instCount() > Threshold) &&
- !Summary->fflags().AlwaysInline) {
- Reason = FunctionImporter::ImportFailureReason::TooLarge;
- return false;
- }
-
- // Skip if it isn't legal to import (e.g. may reference unpromotable
- // locals).
- if (Summary->notEligibleToImport()) {
- Reason = FunctionImporter::ImportFailureReason::NotEligible;
- return false;
- }
-
- // Don't bother importing if we can't inline it anyway.
- if (Summary->fflags().NoInline) {
- Reason = FunctionImporter::ImportFailureReason::NoInline;
- return false;
- }
-
- return true;
- });
- if (It == CalleeSummaryList.end())
- return nullptr;
-
- return cast<GlobalValueSummary>(It->get());
-}
-
-namespace {
-
+
+/// Summary file to use for function importing when using -function-import from
+/// the command line.
+static cl::opt<std::string>
+ SummaryFile("summary-file",
+ cl::desc("The summary file to use for function importing."));
+
+/// Used when testing importing from distributed indexes via opt
+/// -function-import.
+static cl::opt<bool>
+ ImportAllIndex("import-all-index",
+ cl::desc("Import all external functions in index."));
+
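A small standalone sketch (plain C++, names invented) of how the options declared above combine at their defaults: the base -import-instr-limit of 100 is scaled per callsite, so hot callsites tolerate callees of up to 1000 instructions and critical ones up to 10000.

#include <iostream>

int main() {
  const unsigned ImportInstrLimit = 100;         // -import-instr-limit default
  const float ImportHotMultiplier = 10.0f;       // -import-hot-multiplier default
  const float ImportCriticalMultiplier = 100.0f; // -import-critical-multiplier default

  std::cout << "hot callsite threshold: "
            << ImportInstrLimit * ImportHotMultiplier << "\n"
            << "critical callsite threshold: "
            << ImportInstrLimit * ImportCriticalMultiplier << "\n";
}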
+// Lazily load a module from \p FileName in \p Context.
+static std::unique_ptr<Module> loadFile(const std::string &FileName,
+ LLVMContext &Context) {
+ SMDiagnostic Err;
+ LLVM_DEBUG(dbgs() << "Loading '" << FileName << "'\n");
+ // Metadata isn't loaded until functions are imported, to minimize
+ // the memory overhead.
+ std::unique_ptr<Module> Result =
+ getLazyIRFileModule(FileName, Err, Context,
+ /* ShouldLazyLoadMetadata = */ true);
+ if (!Result) {
+ Err.print("function-import", errs());
+ report_fatal_error("Abort");
+ }
+
+ return Result;
+}
+
+/// Given a list of possible callee implementations for a call site, select one
+/// that fits the \p Threshold.
+///
+/// FIXME: select "best" instead of first that fits. But what is "best"?
+/// - The smallest: more likely to be inlined.
+/// - The one with the least outgoing edges (already well optimized).
+/// - One from a module already being imported from in order to reduce the
+/// number of source modules parsed/linked.
+/// - One that has PGO data attached.
+/// - [insert your fancy metric here]
+static const GlobalValueSummary *
+selectCallee(const ModuleSummaryIndex &Index,
+ ArrayRef<std::unique_ptr<GlobalValueSummary>> CalleeSummaryList,
+ unsigned Threshold, StringRef CallerModulePath,
+ FunctionImporter::ImportFailureReason &Reason,
+ GlobalValue::GUID GUID) {
+ Reason = FunctionImporter::ImportFailureReason::None;
+ auto It = llvm::find_if(
+ CalleeSummaryList,
+ [&](const std::unique_ptr<GlobalValueSummary> &SummaryPtr) {
+ auto *GVSummary = SummaryPtr.get();
+ if (!Index.isGlobalValueLive(GVSummary)) {
+ Reason = FunctionImporter::ImportFailureReason::NotLive;
+ return false;
+ }
+
+ // For SamplePGO, in computeImportForFunction the OriginalId
+ // may have been used to locate the callee summary list (See
+ // comment there).
+ // The mapping from OriginalId to GUID may return a GUID
+ // that corresponds to a static variable. Filter it out here.
+ // This can happen when
+ // 1) There is a call to a library function which is not defined
+ // in the index.
+ // 2) There is a static variable with the OriginalGUID identical
+ // to the GUID of the library function in 1);
+ // When this happens, the logic for SamplePGO kicks in and
+ // the static variable in 2) will be found, which needs to be
+ // filtered out.
+ if (GVSummary->getSummaryKind() == GlobalValueSummary::GlobalVarKind) {
+ Reason = FunctionImporter::ImportFailureReason::GlobalVar;
+ return false;
+ }
+ if (GlobalValue::isInterposableLinkage(GVSummary->linkage())) {
+ Reason = FunctionImporter::ImportFailureReason::InterposableLinkage;
+ // There is no point in importing these, we can't inline them
+ return false;
+ }
+
+ auto *Summary = cast<FunctionSummary>(GVSummary->getBaseObject());
+
+ // If this is a local function, make sure we import the copy
+ // in the caller's module. The only time a local function can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ // However, do the import from another module if there is only one
+ // entry in the list - in that case this must be a reference due
+ // to indirect call profile data, since a function pointer can point to
+ // a local in another module.
+ if (GlobalValue::isLocalLinkage(Summary->linkage()) &&
+ CalleeSummaryList.size() > 1 &&
+ Summary->modulePath() != CallerModulePath) {
+ Reason =
+ FunctionImporter::ImportFailureReason::LocalLinkageNotInModule;
+ return false;
+ }
+
+ if ((Summary->instCount() > Threshold) &&
+ !Summary->fflags().AlwaysInline) {
+ Reason = FunctionImporter::ImportFailureReason::TooLarge;
+ return false;
+ }
+
+ // Skip if it isn't legal to import (e.g. may reference unpromotable
+ // locals).
+ if (Summary->notEligibleToImport()) {
+ Reason = FunctionImporter::ImportFailureReason::NotEligible;
+ return false;
+ }
+
+ // Don't bother importing if we can't inline it anyway.
+ if (Summary->fflags().NoInline) {
+ Reason = FunctionImporter::ImportFailureReason::NoInline;
+ return false;
+ }
+
+ return true;
+ });
+ if (It == CalleeSummaryList.end())
+ return nullptr;
+
+ return cast<GlobalValueSummary>(It->get());
+}
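A standalone sketch (plain C++, not LLVM code) of the first-that-fits policy the FIXME above describes: candidates are scanned in order and the first summary whose size is within the threshold wins, even if a smaller one appears later in the list.

#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  const std::vector<unsigned> CandidateSizes = {250, 80, 40}; // instruction counts
  const unsigned Threshold = 100;

  auto It = std::find_if(CandidateSizes.begin(), CandidateSizes.end(),
                         [&](unsigned Size) { return Size <= Threshold; });
  if (It != CandidateSizes.end())
    std::cout << "selected candidate of size " << *It << "\n"; // prints 80, not 40
}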
+
+namespace {
+
using EdgeInfo =
std::tuple<const GlobalValueSummary *, unsigned /* Threshold */>;
-
-} // anonymous namespace
-
-static ValueInfo
-updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
- if (!VI.getSummaryList().empty())
- return VI;
- // For SamplePGO, the indirect call targets for local functions will
- // have its original name annotated in profile. We try to find the
- // corresponding PGOFuncName as the GUID.
- // FIXME: Consider updating the edges in the graph after building
- // it, rather than needing to perform this mapping on each walk.
- auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
- if (GUID == 0)
- return ValueInfo();
- return Index.getValueInfo(GUID);
-}
-
-static void computeImportForReferencedGlobals(
+
+} // anonymous namespace
+
+static ValueInfo
+updateValueInfoForIndirectCalls(const ModuleSummaryIndex &Index, ValueInfo VI) {
+ if (!VI.getSummaryList().empty())
+ return VI;
+ // For SamplePGO, the indirect call targets for local functions will
+ // have its original name annotated in profile. We try to find the
+ // corresponding PGOFuncName as the GUID.
+ // FIXME: Consider updating the edges in the graph after building
+ // it, rather than needing to perform this mapping on each walk.
+ auto GUID = Index.getGUIDFromOriginalID(VI.getGUID());
+ if (GUID == 0)
+ return ValueInfo();
+ return Index.getValueInfo(GUID);
+}
+
+static void computeImportForReferencedGlobals(
const GlobalValueSummary &Summary, const ModuleSummaryIndex &Index,
- const GVSummaryMapTy &DefinedGVSummaries,
+ const GVSummaryMapTy &DefinedGVSummaries,
SmallVectorImpl<EdgeInfo> &Worklist,
- FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
- for (auto &VI : Summary.refs()) {
- if (DefinedGVSummaries.count(VI.getGUID())) {
- LLVM_DEBUG(
- dbgs() << "Ref ignored! Target already in destination module.\n");
- continue;
- }
-
- LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
-
- // If this is a local variable, make sure we import the copy
- // in the caller's module. The only time a local variable can
- // share an entry in the index is if there is a local with the same name
- // in another module that had the same source file name (in a different
- // directory), where each was compiled in its own directory so there
- // was no distinguishing path.
- auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool {
- return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
- RefSummary->modulePath() != Summary.modulePath();
- };
-
- for (auto &RefSummary : VI.getSummaryList())
- if (isa<GlobalVarSummary>(RefSummary.get()) &&
- Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) &&
- !LocalNotInModule(RefSummary.get())) {
- auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
- // Only update stat and exports if we haven't already imported this
- // variable.
- if (!ILI.second)
- break;
- NumImportedGlobalVarsThinLink++;
- // Any references made by this variable will be marked exported later,
- // in ComputeCrossModuleImport, after import decisions are complete,
- // which is more efficient than adding them here.
- if (ExportLists)
- (*ExportLists)[RefSummary->modulePath()].insert(VI);
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists) {
+ for (auto &VI : Summary.refs()) {
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(
+ dbgs() << "Ref ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << " ref -> " << VI << "\n");
+
+ // If this is a local variable, make sure we import the copy
+ // in the caller's module. The only time a local variable can
+ // share an entry in the index is if there is a local with the same name
+ // in another module that had the same source file name (in a different
+ // directory), where each was compiled in its own directory so there
+ // was no distinguishing path.
+ auto LocalNotInModule = [&](const GlobalValueSummary *RefSummary) -> bool {
+ return GlobalValue::isLocalLinkage(RefSummary->linkage()) &&
+ RefSummary->modulePath() != Summary.modulePath();
+ };
+
+ for (auto &RefSummary : VI.getSummaryList())
+ if (isa<GlobalVarSummary>(RefSummary.get()) &&
+ Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) &&
+ !LocalNotInModule(RefSummary.get())) {
+ auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
+ // Only update stat and exports if we haven't already imported this
+ // variable.
+ if (!ILI.second)
+ break;
+ NumImportedGlobalVarsThinLink++;
+ // Any references made by this variable will be marked exported later,
+ // in ComputeCrossModuleImport, after import decisions are complete,
+ // which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[RefSummary->modulePath()].insert(VI);
// If variable is not writeonly we attempt to recursively analyze
// its references in order to import referenced constants.
if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get())))
Worklist.emplace_back(RefSummary.get(), 0);
- break;
- }
- }
-}
-
-static const char *
-getFailureName(FunctionImporter::ImportFailureReason Reason) {
- switch (Reason) {
- case FunctionImporter::ImportFailureReason::None:
- return "None";
- case FunctionImporter::ImportFailureReason::GlobalVar:
- return "GlobalVar";
- case FunctionImporter::ImportFailureReason::NotLive:
- return "NotLive";
- case FunctionImporter::ImportFailureReason::TooLarge:
- return "TooLarge";
- case FunctionImporter::ImportFailureReason::InterposableLinkage:
- return "InterposableLinkage";
- case FunctionImporter::ImportFailureReason::LocalLinkageNotInModule:
- return "LocalLinkageNotInModule";
- case FunctionImporter::ImportFailureReason::NotEligible:
- return "NotEligible";
- case FunctionImporter::ImportFailureReason::NoInline:
- return "NoInline";
- }
- llvm_unreachable("invalid reason");
-}
-
-/// Compute the list of functions to import for a given caller. Mark these
-/// imported functions and the symbols they reference in their source module as
-/// exported from their source module.
-static void computeImportForFunction(
- const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
- const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
- SmallVectorImpl<EdgeInfo> &Worklist,
- FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists,
- FunctionImporter::ImportThresholdsTy &ImportThresholds) {
- computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries,
+ break;
+ }
+ }
+}
+
+static const char *
+getFailureName(FunctionImporter::ImportFailureReason Reason) {
+ switch (Reason) {
+ case FunctionImporter::ImportFailureReason::None:
+ return "None";
+ case FunctionImporter::ImportFailureReason::GlobalVar:
+ return "GlobalVar";
+ case FunctionImporter::ImportFailureReason::NotLive:
+ return "NotLive";
+ case FunctionImporter::ImportFailureReason::TooLarge:
+ return "TooLarge";
+ case FunctionImporter::ImportFailureReason::InterposableLinkage:
+ return "InterposableLinkage";
+ case FunctionImporter::ImportFailureReason::LocalLinkageNotInModule:
+ return "LocalLinkageNotInModule";
+ case FunctionImporter::ImportFailureReason::NotEligible:
+ return "NotEligible";
+ case FunctionImporter::ImportFailureReason::NoInline:
+ return "NoInline";
+ }
+ llvm_unreachable("invalid reason");
+}
+
+/// Compute the list of functions to import for a given caller. Mark these
+/// imported functions and the symbols they reference in their source module as
+/// exported from their source module.
+static void computeImportForFunction(
+ const FunctionSummary &Summary, const ModuleSummaryIndex &Index,
+ const unsigned Threshold, const GVSummaryMapTy &DefinedGVSummaries,
+ SmallVectorImpl<EdgeInfo> &Worklist,
+ FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists,
+ FunctionImporter::ImportThresholdsTy &ImportThresholds) {
+ computeImportForReferencedGlobals(Summary, Index, DefinedGVSummaries,
Worklist, ImportList, ExportLists);
- static int ImportCount = 0;
- for (auto &Edge : Summary.calls()) {
- ValueInfo VI = Edge.first;
- LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
- << "\n");
-
- if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
- LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
- << " reached.\n");
- continue;
- }
-
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- continue;
-
- if (DefinedGVSummaries.count(VI.getGUID())) {
- LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
- continue;
- }
-
- auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
- if (Hotness == CalleeInfo::HotnessType::Hot)
- return ImportHotMultiplier;
- if (Hotness == CalleeInfo::HotnessType::Cold)
- return ImportColdMultiplier;
- if (Hotness == CalleeInfo::HotnessType::Critical)
- return ImportCriticalMultiplier;
- return 1.0;
- };
-
- const auto NewThreshold =
- Threshold * GetBonusMultiplier(Edge.second.getHotness());
-
- auto IT = ImportThresholds.insert(std::make_pair(
- VI.getGUID(), std::make_tuple(NewThreshold, nullptr, nullptr)));
- bool PreviouslyVisited = !IT.second;
- auto &ProcessedThreshold = std::get<0>(IT.first->second);
- auto &CalleeSummary = std::get<1>(IT.first->second);
- auto &FailureInfo = std::get<2>(IT.first->second);
-
- bool IsHotCallsite =
- Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
- bool IsCriticalCallsite =
- Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
-
- const FunctionSummary *ResolvedCalleeSummary = nullptr;
- if (CalleeSummary) {
- assert(PreviouslyVisited);
- // Since the traversal of the call graph is DFS, we can revisit a function
- // a second time with a higher threshold. In this case, it is added back
- // to the worklist with the new threshold (so that its own callee chains
- // can be considered with the higher threshold).
- if (NewThreshold <= ProcessedThreshold) {
- LLVM_DEBUG(
- dbgs() << "ignored! Target was already imported with Threshold "
- << ProcessedThreshold << "\n");
- continue;
- }
- // Update with new larger threshold.
- ProcessedThreshold = NewThreshold;
- ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
- } else {
- // If we already rejected importing a callee at the same or higher
- // threshold, don't waste time calling selectCallee.
- if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
- LLVM_DEBUG(
- dbgs() << "ignored! Target was already rejected with Threshold "
- << ProcessedThreshold << "\n");
- if (PrintImportFailures) {
- assert(FailureInfo &&
- "Expected FailureInfo for previously rejected candidate");
- FailureInfo->Attempts++;
- }
- continue;
- }
-
- FunctionImporter::ImportFailureReason Reason;
- CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
- Summary.modulePath(), Reason, VI.getGUID());
- if (!CalleeSummary) {
- // Update with new larger threshold if this was a retry (otherwise
- // we would have already inserted with NewThreshold above). Also
- // update failure info if requested.
- if (PreviouslyVisited) {
- ProcessedThreshold = NewThreshold;
- if (PrintImportFailures) {
- assert(FailureInfo &&
- "Expected FailureInfo for previously rejected candidate");
- FailureInfo->Reason = Reason;
- FailureInfo->Attempts++;
- FailureInfo->MaxHotness =
- std::max(FailureInfo->MaxHotness, Edge.second.getHotness());
- }
- } else if (PrintImportFailures) {
- assert(!FailureInfo &&
- "Expected no FailureInfo for newly rejected candidate");
- FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
- VI, Edge.second.getHotness(), Reason, 1);
- }
- LLVM_DEBUG(
- dbgs() << "ignored! No qualifying callee with summary found.\n");
- continue;
- }
-
- // "Resolve" the summary
- CalleeSummary = CalleeSummary->getBaseObject();
- ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
-
- assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
- (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
- "selectCallee() didn't honor the threshold");
-
- auto ExportModulePath = ResolvedCalleeSummary->modulePath();
- auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
- // We previously decided to import this GUID definition if it was already
- // inserted in the set of imports from the exporting module.
- bool PreviouslyImported = !ILI.second;
- if (!PreviouslyImported) {
- NumImportedFunctionsThinLink++;
- if (IsHotCallsite)
- NumImportedHotFunctionsThinLink++;
- if (IsCriticalCallsite)
- NumImportedCriticalFunctionsThinLink++;
- }
-
- // Any calls/references made by this function will be marked exported
- // later, in ComputeCrossModuleImport, after import decisions are
- // complete, which is more efficient than adding them here.
- if (ExportLists)
- (*ExportLists)[ExportModulePath].insert(VI);
- }
-
- auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
- // Adjust the threshold for next level of imported functions.
- // The threshold is different for hot callsites because we can then
- // inline chains of hot calls.
- if (IsHotCallsite)
- return Threshold * ImportHotInstrFactor;
- return Threshold * ImportInstrFactor;
- };
-
- const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
-
- ImportCount++;
-
- // Insert the newly imported function to the worklist.
+ static int ImportCount = 0;
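+ // Running count of accepted import candidates, checked against the
+ // import-cutoff debugging option below to optionally stop importing early.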
+ for (auto &Edge : Summary.calls()) {
+ ValueInfo VI = Edge.first;
+ LLVM_DEBUG(dbgs() << " edge -> " << VI << " Threshold:" << Threshold
+ << "\n");
+
+ if (ImportCutoff >= 0 && ImportCount >= ImportCutoff) {
+ LLVM_DEBUG(dbgs() << "ignored! import-cutoff value of " << ImportCutoff
+ << " reached.\n");
+ continue;
+ }
+
+ VI = updateValueInfoForIndirectCalls(Index, VI);
+ if (!VI)
+ continue;
+
+ if (DefinedGVSummaries.count(VI.getGUID())) {
+ LLVM_DEBUG(dbgs() << "ignored! Target already in destination module.\n");
+ continue;
+ }
+
+ auto GetBonusMultiplier = [](CalleeInfo::HotnessType Hotness) -> float {
+ if (Hotness == CalleeInfo::HotnessType::Hot)
+ return ImportHotMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Cold)
+ return ImportColdMultiplier;
+ if (Hotness == CalleeInfo::HotnessType::Critical)
+ return ImportCriticalMultiplier;
+ return 1.0;
+ };
+
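+ // Scale the threshold by the callsite hotness bonus; selectCallee later
+ // checks the callee's instruction count against this threshold.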
+ const auto NewThreshold =
+ Threshold * GetBonusMultiplier(Edge.second.getHotness());
+
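+ // Each cached entry tracks the highest threshold processed so far, the
+ // callee summary once an import has been selected, and failure info for
+ // rejected candidates (used when printing import failures).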
+ auto IT = ImportThresholds.insert(std::make_pair(
+ VI.getGUID(), std::make_tuple(NewThreshold, nullptr, nullptr)));
+ bool PreviouslyVisited = !IT.second;
+ auto &ProcessedThreshold = std::get<0>(IT.first->second);
+ auto &CalleeSummary = std::get<1>(IT.first->second);
+ auto &FailureInfo = std::get<2>(IT.first->second);
+
+ bool IsHotCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Hot;
+ bool IsCriticalCallsite =
+ Edge.second.getHotness() == CalleeInfo::HotnessType::Critical;
+
+ const FunctionSummary *ResolvedCalleeSummary = nullptr;
+ if (CalleeSummary) {
+ assert(PreviouslyVisited);
+ // Since the traversal of the call graph is DFS, we can revisit a function
+ // a second time with a higher threshold. In this case, it is added back
+ // to the worklist with the new threshold (so that its own callee chains
+ // can be considered with the higher threshold).
+ if (NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already imported with Threshold "
+ << ProcessedThreshold << "\n");
+ continue;
+ }
+ // Update with new larger threshold.
+ ProcessedThreshold = NewThreshold;
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+ } else {
+ // If we already rejected importing a callee at the same or higher
+ // threshold, don't waste time calling selectCallee.
+ if (PreviouslyVisited && NewThreshold <= ProcessedThreshold) {
+ LLVM_DEBUG(
+ dbgs() << "ignored! Target was already rejected with Threshold "
+ << ProcessedThreshold << "\n");
+ if (PrintImportFailures) {
+ assert(FailureInfo &&
+ "Expected FailureInfo for previously rejected candidate");
+ FailureInfo->Attempts++;
+ }
+ continue;
+ }
+
+ FunctionImporter::ImportFailureReason Reason;
+ CalleeSummary = selectCallee(Index, VI.getSummaryList(), NewThreshold,
+ Summary.modulePath(), Reason, VI.getGUID());
+ if (!CalleeSummary) {
+ // Update with new larger threshold if this was a retry (otherwise
+ // we would have already inserted with NewThreshold above). Also
+ // update failure info if requested.
+ if (PreviouslyVisited) {
+ ProcessedThreshold = NewThreshold;
+ if (PrintImportFailures) {
+ assert(FailureInfo &&
+ "Expected FailureInfo for previously rejected candidate");
+ FailureInfo->Reason = Reason;
+ FailureInfo->Attempts++;
+ FailureInfo->MaxHotness =
+ std::max(FailureInfo->MaxHotness, Edge.second.getHotness());
+ }
+ } else if (PrintImportFailures) {
+ assert(!FailureInfo &&
+ "Expected no FailureInfo for newly rejected candidate");
+ FailureInfo = std::make_unique<FunctionImporter::ImportFailureInfo>(
+ VI, Edge.second.getHotness(), Reason, 1);
+ }
+ LLVM_DEBUG(
+ dbgs() << "ignored! No qualifying callee with summary found.\n");
+ continue;
+ }
+
+ // "Resolve" the summary
+ CalleeSummary = CalleeSummary->getBaseObject();
+ ResolvedCalleeSummary = cast<FunctionSummary>(CalleeSummary);
+
+ assert((ResolvedCalleeSummary->fflags().AlwaysInline ||
+ (ResolvedCalleeSummary->instCount() <= NewThreshold)) &&
+ "selectCallee() didn't honor the threshold");
+
+ auto ExportModulePath = ResolvedCalleeSummary->modulePath();
+ auto ILI = ImportList[ExportModulePath].insert(VI.getGUID());
+ // We previously decided to import this GUID definition if it was already
+ // inserted in the set of imports from the exporting module.
+ bool PreviouslyImported = !ILI.second;
+ if (!PreviouslyImported) {
+ NumImportedFunctionsThinLink++;
+ if (IsHotCallsite)
+ NumImportedHotFunctionsThinLink++;
+ if (IsCriticalCallsite)
+ NumImportedCriticalFunctionsThinLink++;
+ }
+
+ // Any calls/references made by this function will be marked exported
+ // later, in ComputeCrossModuleImport, after import decisions are
+ // complete, which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[ExportModulePath].insert(VI);
+ }
+
+ auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
+ // Adjust the threshold for next level of imported functions.
+ // The threshold is different for hot callsites because we can then
+ // inline chains of hot calls.
+ if (IsHotCallsite)
+ return Threshold * ImportHotInstrFactor;
+ return Threshold * ImportInstrFactor;
+ };
+
+ const auto AdjThreshold = GetAdjustedThreshold(Threshold, IsHotCallsite);
+
+ ImportCount++;
+
+ // Insert the newly imported function to the worklist.
Worklist.emplace_back(ResolvedCalleeSummary, AdjThreshold);
- }
-}
-
-/// Given the list of globals defined in a module, compute the list of imports
-/// as well as the list of "exports", i.e. the list of symbols referenced from
-/// another module (that may require promotion).
-static void ComputeImportForModule(
- const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
- StringRef ModName, FunctionImporter::ImportMapTy &ImportList,
- StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
- // Worklist contains the list of functions imported in this module, for which
- // we will analyse the callees and may import further down the call graph.
- SmallVector<EdgeInfo, 128> Worklist;
- FunctionImporter::ImportThresholdsTy ImportThresholds;
-
- // Populate the worklist with the import for the functions in the current
- // module
- for (auto &GVSummary : DefinedGVSummaries) {
-#ifndef NDEBUG
- // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
- // so this map lookup (and possibly others) can be avoided.
- auto VI = Index.getValueInfo(GVSummary.first);
-#endif
- if (!Index.isGlobalValueLive(GVSummary.second)) {
- LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
- continue;
- }
- auto *FuncSummary =
- dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject());
- if (!FuncSummary)
- // Skip import for global variables
- continue;
- LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
- computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
- DefinedGVSummaries, Worklist, ImportList,
- ExportLists, ImportThresholds);
- }
-
- // Process the newly imported functions and add callees to the worklist.
- while (!Worklist.empty()) {
+ }
+}
+
+/// Given the list of globals defined in a module, compute the list of imports
+/// as well as the list of "exports", i.e. the list of symbols referenced from
+/// another module (that may require promotion).
+static void ComputeImportForModule(
+ const GVSummaryMapTy &DefinedGVSummaries, const ModuleSummaryIndex &Index,
+ StringRef ModName, FunctionImporter::ImportMapTy &ImportList,
+ StringMap<FunctionImporter::ExportSetTy> *ExportLists = nullptr) {
+ // Worklist contains the list of functions imported in this module, for which
+ // we will analyse the callees and may import further down the call graph.
+ SmallVector<EdgeInfo, 128> Worklist;
+ FunctionImporter::ImportThresholdsTy ImportThresholds;
+
+ // Populate the worklist with the import for the functions in the current
+ // module
+ for (auto &GVSummary : DefinedGVSummaries) {
+#ifndef NDEBUG
+ // FIXME: Change the GVSummaryMapTy to hold ValueInfo instead of GUID
+ // so this map lookup (and possibly others) can be avoided.
+ auto VI = Index.getValueInfo(GVSummary.first);
+#endif
+ if (!Index.isGlobalValueLive(GVSummary.second)) {
+ LLVM_DEBUG(dbgs() << "Ignores Dead GUID: " << VI << "\n");
+ continue;
+ }
+ auto *FuncSummary =
+ dyn_cast<FunctionSummary>(GVSummary.second->getBaseObject());
+ if (!FuncSummary)
+ // Skip import for global variables
+ continue;
+ LLVM_DEBUG(dbgs() << "Initialize import for " << VI << "\n");
+ computeImportForFunction(*FuncSummary, Index, ImportInstrLimit,
+ DefinedGVSummaries, Worklist, ImportList,
+ ExportLists, ImportThresholds);
+ }
+
+ // Process the newly imported functions and add callees to the worklist.
+ while (!Worklist.empty()) {
auto GVInfo = Worklist.pop_back_val();
auto *Summary = std::get<0>(GVInfo);
auto Threshold = std::get<1>(GVInfo);
-
+
if (auto *FS = dyn_cast<FunctionSummary>(Summary))
computeImportForFunction(*FS, Index, Threshold, DefinedGVSummaries,
Worklist, ImportList, ExportLists,
@@ -560,823 +560,823 @@ static void ComputeImportForModule(
else
computeImportForReferencedGlobals(*Summary, Index, DefinedGVSummaries,
Worklist, ImportList, ExportLists);
- }
-
- // Print stats about functions considered but rejected for importing
- // when requested.
- if (PrintImportFailures) {
- dbgs() << "Missed imports into module " << ModName << "\n";
- for (auto &I : ImportThresholds) {
- auto &ProcessedThreshold = std::get<0>(I.second);
- auto &CalleeSummary = std::get<1>(I.second);
- auto &FailureInfo = std::get<2>(I.second);
- if (CalleeSummary)
- continue; // We are going to import.
- assert(FailureInfo);
- FunctionSummary *FS = nullptr;
- if (!FailureInfo->VI.getSummaryList().empty())
- FS = dyn_cast<FunctionSummary>(
- FailureInfo->VI.getSummaryList()[0]->getBaseObject());
- dbgs() << FailureInfo->VI
- << ": Reason = " << getFailureName(FailureInfo->Reason)
- << ", Threshold = " << ProcessedThreshold
- << ", Size = " << (FS ? (int)FS->instCount() : -1)
- << ", MaxHotness = " << getHotnessName(FailureInfo->MaxHotness)
- << ", Attempts = " << FailureInfo->Attempts << "\n";
- }
- }
-}
-
-#ifndef NDEBUG
-static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) {
- auto SL = VI.getSummaryList();
- return SL.empty()
- ? false
- : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
-}
-
-static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
- GlobalValue::GUID G) {
- if (const auto &VI = Index.getValueInfo(G))
- return isGlobalVarSummary(Index, VI);
- return false;
-}
-
-template <class T>
-static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
- T &Cont) {
- unsigned NumGVS = 0;
- for (auto &V : Cont)
- if (isGlobalVarSummary(Index, V))
- ++NumGVS;
- return NumGVS;
-}
-#endif
-
-#ifndef NDEBUG
-static bool
-checkVariableImport(const ModuleSummaryIndex &Index,
- StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
-
- DenseSet<GlobalValue::GUID> FlattenedImports;
-
- for (auto &ImportPerModule : ImportLists)
- for (auto &ExportPerModule : ImportPerModule.second)
- FlattenedImports.insert(ExportPerModule.second.begin(),
- ExportPerModule.second.end());
-
- // Checks that all GUIDs of read/writeonly vars we see in export lists
- // are also in the import lists. Otherwise we may face linker undefs,
- // because readonly and writeonly vars are internalized in their
- // source modules.
- auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) {
- auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
- Index.findSummaryInModule(VI, ModulePath));
- return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS));
- };
-
- for (auto &ExportPerModule : ExportLists)
- for (auto &VI : ExportPerModule.second)
- if (!FlattenedImports.count(VI.getGUID()) &&
- IsReadOrWriteOnlyVar(ExportPerModule.first(), VI))
- return false;
-
- return true;
-}
-#endif
-
-/// Compute all the import and export for every module using the Index.
-void llvm::ComputeCrossModuleImport(
- const ModuleSummaryIndex &Index,
- const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- StringMap<FunctionImporter::ImportMapTy> &ImportLists,
- StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
- // For each module that has functions defined, compute the import/export lists.
- for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
- auto &ImportList = ImportLists[DefinedGVSummaries.first()];
- LLVM_DEBUG(dbgs() << "Computing import for Module '"
- << DefinedGVSummaries.first() << "'\n");
- ComputeImportForModule(DefinedGVSummaries.second, Index,
- DefinedGVSummaries.first(), ImportList,
- &ExportLists);
- }
-
- // When computing imports we only added the variables and functions being
- // imported to the export list. We also need to mark any references and calls
- // they make as exported as well. We do this here, as it is more efficient
- // since we may import the same values multiple times into different modules
- // during the import computation.
- for (auto &ELI : ExportLists) {
- FunctionImporter::ExportSetTy NewExports;
- const auto &DefinedGVSummaries =
- ModuleToDefinedGVSummaries.lookup(ELI.first());
- for (auto &EI : ELI.second) {
- // Find the copy defined in the exporting module so that we can mark the
- // values it references in that specific definition as exported.
- // Below we will add all references and called values, without regard to
- // whether they are also defined in this module. We subsequently prune the
- // list to only include those defined in the exporting module, see comment
- // there as to why.
- auto DS = DefinedGVSummaries.find(EI.getGUID());
- // Anything marked exported during the import computation must have been
- // defined in the exporting module.
- assert(DS != DefinedGVSummaries.end());
- auto *S = DS->getSecond();
- S = S->getBaseObject();
- if (auto *GVS = dyn_cast<GlobalVarSummary>(S)) {
- // Export referenced functions and variables. We don't export/promote
- // objects referenced by writeonly variable initializer, because
- // we convert such variables initializers to "zeroinitializer".
- // See processGlobalForThinLTO.
- if (!Index.isWriteOnly(GVS))
- for (const auto &VI : GVS->refs())
- NewExports.insert(VI);
- } else {
- auto *FS = cast<FunctionSummary>(S);
- for (auto &Edge : FS->calls())
- NewExports.insert(Edge.first);
- for (auto &Ref : FS->refs())
- NewExports.insert(Ref);
- }
- }
- // Prune list computed above to only include values defined in the exporting
- // module. We do this after the above insertion since we may hit the same
- // ref/call target multiple times in above loop, and it is more efficient to
- // avoid a set lookup each time.
- for (auto EI = NewExports.begin(); EI != NewExports.end();) {
- if (!DefinedGVSummaries.count(EI->getGUID()))
- NewExports.erase(EI++);
- else
- ++EI;
- }
- ELI.second.insert(NewExports.begin(), NewExports.end());
- }
-
- assert(checkVariableImport(Index, ImportLists, ExportLists));
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
- << " modules:\n");
- for (auto &ModuleImports : ImportLists) {
- auto ModName = ModuleImports.first();
- auto &Exports = ExportLists[ModName];
- unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
- LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
- << Exports.size() - NumGVS << " functions and " << NumGVS
- << " vars. Imports from " << ModuleImports.second.size()
- << " modules.\n");
- for (auto &Src : ModuleImports.second) {
- auto SrcModName = Src.first();
- unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
- LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
- << " functions imported from " << SrcModName << "\n");
- LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
- << " global vars imported from " << SrcModName << "\n");
- }
- }
-#endif
-}
-
-#ifndef NDEBUG
-static void dumpImportListForModule(const ModuleSummaryIndex &Index,
- StringRef ModulePath,
- FunctionImporter::ImportMapTy &ImportList) {
- LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
- << ImportList.size() << " modules.\n");
- for (auto &Src : ImportList) {
- auto SrcModName = Src.first();
- unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
- LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
- << " functions imported from " << SrcModName << "\n");
- LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
- << SrcModName << "\n");
- }
-}
-#endif
-
-/// Compute all the imports for the given module in the Index.
-void llvm::ComputeCrossModuleImportForModule(
- StringRef ModulePath, const ModuleSummaryIndex &Index,
- FunctionImporter::ImportMapTy &ImportList) {
- // Collect the list of functions this module defines.
- // GUID -> Summary
- GVSummaryMapTy FunctionSummaryMap;
- Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
-
- // Compute the import list for this module.
- LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
- ComputeImportForModule(FunctionSummaryMap, Index, ModulePath, ImportList);
-
-#ifndef NDEBUG
- dumpImportListForModule(Index, ModulePath, ImportList);
-#endif
-}
-
-// Mark all external summaries in Index for import into the given module.
-// Used for distributed builds using a distributed index.
-void llvm::ComputeCrossModuleImportForModuleFromIndex(
- StringRef ModulePath, const ModuleSummaryIndex &Index,
- FunctionImporter::ImportMapTy &ImportList) {
- for (auto &GlobalList : Index) {
- // Ignore entries for undefined references.
- if (GlobalList.second.SummaryList.empty())
- continue;
-
- auto GUID = GlobalList.first;
- assert(GlobalList.second.SummaryList.size() == 1 &&
- "Expected individual combined index to have one summary per GUID");
- auto &Summary = GlobalList.second.SummaryList[0];
- // Skip the summaries for the importing module. These are included to
- // e.g. record required linkage changes.
- if (Summary->modulePath() == ModulePath)
- continue;
- // Add an entry to provoke importing by thinBackend.
- ImportList[Summary->modulePath()].insert(GUID);
- }
-#ifndef NDEBUG
- dumpImportListForModule(Index, ModulePath, ImportList);
-#endif
-}
-
-void llvm::computeDeadSymbols(
- ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
- function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
- assert(!Index.withGlobalValueDeadStripping());
- if (!ComputeDead)
- return;
- if (GUIDPreservedSymbols.empty())
- // Don't do anything when nothing is live; this is friendly to tests.
- return;
- unsigned LiveSymbols = 0;
- SmallVector<ValueInfo, 128> Worklist;
- Worklist.reserve(GUIDPreservedSymbols.size() * 2);
- for (auto GUID : GUIDPreservedSymbols) {
- ValueInfo VI = Index.getValueInfo(GUID);
- if (!VI)
- continue;
- for (auto &S : VI.getSummaryList())
- S->setLive(true);
- }
-
- // Add values flagged in the index as live roots to the worklist.
- for (const auto &Entry : Index) {
- auto VI = Index.getValueInfo(Entry);
- for (auto &S : Entry.second.SummaryList)
- if (S->isLive()) {
- LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
- Worklist.push_back(VI);
- ++LiveSymbols;
- break;
- }
- }
-
- // Make value live and add it to the worklist if it was not live before.
- auto visit = [&](ValueInfo VI, bool IsAliasee) {
- // FIXME: If we knew which edges were created for indirect call profiles,
- // we could skip them here. Any that are live should be reached via
- // other edges, e.g. reference edges. Otherwise, using a profile collected
- // on a slightly different binary might provoke preserving, importing
- // and ultimately promoting calls to functions not linked into this
- // binary, which increases the binary size unnecessarily. Note that
- // if this code changes, the importer needs to change so that edges
- // to functions marked dead are skipped.
- VI = updateValueInfoForIndirectCalls(Index, VI);
- if (!VI)
- return;
-
- if (llvm::any_of(VI.getSummaryList(),
- [](const std::unique_ptr<llvm::GlobalValueSummary> &S) {
- return S->isLive();
- }))
- return;
-
- // We only keep live symbols that are known to be non-prevailing if any are
- // available_externally, linkonceodr, weakodr. Those symbols are discarded
- // later in the EliminateAvailableExternally pass and setting them to
- // not-live could break downstreams users of liveness information (PR36483)
- // or limit optimization opportunities.
- if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
- bool KeepAliveLinkage = false;
- bool Interposable = false;
- for (auto &S : VI.getSummaryList()) {
- if (S->linkage() == GlobalValue::AvailableExternallyLinkage ||
- S->linkage() == GlobalValue::WeakODRLinkage ||
- S->linkage() == GlobalValue::LinkOnceODRLinkage)
- KeepAliveLinkage = true;
- else if (GlobalValue::isInterposableLinkage(S->linkage()))
- Interposable = true;
- }
-
- if (!IsAliasee) {
- if (!KeepAliveLinkage)
- return;
-
- if (Interposable)
- report_fatal_error(
- "Interposable and available_externally/linkonce_odr/weak_odr "
- "symbol");
- }
- }
-
- for (auto &S : VI.getSummaryList())
- S->setLive(true);
- ++LiveSymbols;
- Worklist.push_back(VI);
- };
-
- while (!Worklist.empty()) {
- auto VI = Worklist.pop_back_val();
- for (auto &Summary : VI.getSummaryList()) {
+ }
+
+ // Print stats about functions considered but rejected for importing
+ // when requested.
+ if (PrintImportFailures) {
+ dbgs() << "Missed imports into module " << ModName << "\n";
+ for (auto &I : ImportThresholds) {
+ auto &ProcessedThreshold = std::get<0>(I.second);
+ auto &CalleeSummary = std::get<1>(I.second);
+ auto &FailureInfo = std::get<2>(I.second);
+ if (CalleeSummary)
+ continue; // We are going to import.
+ assert(FailureInfo);
+ FunctionSummary *FS = nullptr;
+ if (!FailureInfo->VI.getSummaryList().empty())
+ FS = dyn_cast<FunctionSummary>(
+ FailureInfo->VI.getSummaryList()[0]->getBaseObject());
+ dbgs() << FailureInfo->VI
+ << ": Reason = " << getFailureName(FailureInfo->Reason)
+ << ", Threshold = " << ProcessedThreshold
+ << ", Size = " << (FS ? (int)FS->instCount() : -1)
+ << ", MaxHotness = " << getHotnessName(FailureInfo->MaxHotness)
+ << ", Attempts = " << FailureInfo->Attempts << "\n";
+ }
+ }
+}
+
+#ifndef NDEBUG
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index, ValueInfo VI) {
+ auto SL = VI.getSummaryList();
+ return SL.empty()
+ ? false
+ : SL[0]->getSummaryKind() == GlobalValueSummary::GlobalVarKind;
+}
+
+static bool isGlobalVarSummary(const ModuleSummaryIndex &Index,
+ GlobalValue::GUID G) {
+ if (const auto &VI = Index.getValueInfo(G))
+ return isGlobalVarSummary(Index, VI);
+ return false;
+}
+
+template <class T>
+static unsigned numGlobalVarSummaries(const ModuleSummaryIndex &Index,
+ T &Cont) {
+ unsigned NumGVS = 0;
+ for (auto &V : Cont)
+ if (isGlobalVarSummary(Index, V))
+ ++NumGVS;
+ return NumGVS;
+}
+#endif
+
+#ifndef NDEBUG
+static bool
+checkVariableImport(const ModuleSummaryIndex &Index,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+
+ DenseSet<GlobalValue::GUID> FlattenedImports;
+
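+ // Flatten every per-module import list into a single set of GUIDs that are
+ // imported somewhere, so the export check below is a simple set lookup.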
+ for (auto &ImportPerModule : ImportLists)
+ for (auto &ExportPerModule : ImportPerModule.second)
+ FlattenedImports.insert(ExportPerModule.second.begin(),
+ ExportPerModule.second.end());
+
+ // Checks that all GUIDs of read/writeonly vars we see in export lists
+ // are also in the import lists. Otherwise we may face linker undefs,
+ // because readonly and writeonly vars are internalized in their
+ // source modules.
+ auto IsReadOrWriteOnlyVar = [&](StringRef ModulePath, const ValueInfo &VI) {
+ auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
+ Index.findSummaryInModule(VI, ModulePath));
+ return GVS && (Index.isReadOnly(GVS) || Index.isWriteOnly(GVS));
+ };
+
+ for (auto &ExportPerModule : ExportLists)
+ for (auto &VI : ExportPerModule.second)
+ if (!FlattenedImports.count(VI.getGUID()) &&
+ IsReadOrWriteOnlyVar(ExportPerModule.first(), VI))
+ return false;
+
+ return true;
+}
+#endif
+
+/// Compute all the import and export for every module using the Index.
+void llvm::ComputeCrossModuleImport(
+ const ModuleSummaryIndex &Index,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ StringMap<FunctionImporter::ImportMapTy> &ImportLists,
+ StringMap<FunctionImporter::ExportSetTy> &ExportLists) {
+ // For each module that has functions defined, compute the import/export lists.
+ for (auto &DefinedGVSummaries : ModuleToDefinedGVSummaries) {
+ auto &ImportList = ImportLists[DefinedGVSummaries.first()];
+ LLVM_DEBUG(dbgs() << "Computing import for Module '"
+ << DefinedGVSummaries.first() << "'\n");
+ ComputeImportForModule(DefinedGVSummaries.second, Index,
+ DefinedGVSummaries.first(), ImportList,
+ &ExportLists);
+ }
+
+ // When computing imports we only added the variables and functions being
+ // imported to the export list. We also need to mark any references and calls
+ // they make as exported as well. We do this here, as it is more efficient
+ // since we may import the same values multiple times into different modules
+ // during the import computation.
+ for (auto &ELI : ExportLists) {
+ FunctionImporter::ExportSetTy NewExports;
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ELI.first());
+ for (auto &EI : ELI.second) {
+ // Find the copy defined in the exporting module so that we can mark the
+ // values it references in that specific definition as exported.
+ // Below we will add all references and called values, without regard to
+ // whether they are also defined in this module. We subsequently prune the
+ // list to only include those defined in the exporting module, see comment
+ // there as to why.
+ auto DS = DefinedGVSummaries.find(EI.getGUID());
+ // Anything marked exported during the import computation must have been
+ // defined in the exporting module.
+ assert(DS != DefinedGVSummaries.end());
+ auto *S = DS->getSecond();
+ S = S->getBaseObject();
+ if (auto *GVS = dyn_cast<GlobalVarSummary>(S)) {
+ // Export referenced functions and variables. We don't export/promote
+ // objects referenced by writeonly variable initializer, because
+ // we convert such variables initializers to "zeroinitializer".
+ // See processGlobalForThinLTO.
+ if (!Index.isWriteOnly(GVS))
+ for (const auto &VI : GVS->refs())
+ NewExports.insert(VI);
+ } else {
+ auto *FS = cast<FunctionSummary>(S);
+ for (auto &Edge : FS->calls())
+ NewExports.insert(Edge.first);
+ for (auto &Ref : FS->refs())
+ NewExports.insert(Ref);
+ }
+ }
+ // Prune list computed above to only include values defined in the exporting
+ // module. We do this after the above insertion since we may hit the same
+ // ref/call target multiple times in above loop, and it is more efficient to
+ // avoid a set lookup each time.
+ for (auto EI = NewExports.begin(); EI != NewExports.end();) {
+ if (!DefinedGVSummaries.count(EI->getGUID()))
+ NewExports.erase(EI++);
+ else
+ ++EI;
+ }
+ ELI.second.insert(NewExports.begin(), NewExports.end());
+ }
+
+ assert(checkVariableImport(Index, ImportLists, ExportLists));
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Import/Export lists for " << ImportLists.size()
+ << " modules:\n");
+ for (auto &ModuleImports : ImportLists) {
+ auto ModName = ModuleImports.first();
+ auto &Exports = ExportLists[ModName];
+ unsigned NumGVS = numGlobalVarSummaries(Index, Exports);
+ LLVM_DEBUG(dbgs() << "* Module " << ModName << " exports "
+ << Exports.size() - NumGVS << " functions and " << NumGVS
+ << " vars. Imports from " << ModuleImports.second.size()
+ << " modules.\n");
+ for (auto &Src : ModuleImports.second) {
+ auto SrcModName = Src.first();
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod
+ << " global vars imported from " << SrcModName << "\n");
+ }
+ }
+#endif
+}
+
+#ifndef NDEBUG
+static void dumpImportListForModule(const ModuleSummaryIndex &Index,
+ StringRef ModulePath,
+ FunctionImporter::ImportMapTy &ImportList) {
+ LLVM_DEBUG(dbgs() << "* Module " << ModulePath << " imports from "
+ << ImportList.size() << " modules.\n");
+ for (auto &Src : ImportList) {
+ auto SrcModName = Src.first();
+ unsigned NumGVSPerMod = numGlobalVarSummaries(Index, Src.second);
+ LLVM_DEBUG(dbgs() << " - " << Src.second.size() - NumGVSPerMod
+ << " functions imported from " << SrcModName << "\n");
+ LLVM_DEBUG(dbgs() << " - " << NumGVSPerMod << " vars imported from "
+ << SrcModName << "\n");
+ }
+}
+#endif
+
+/// Compute all the imports for the given module in the Index.
+void llvm::ComputeCrossModuleImportForModule(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+ // Collect the list of functions this module defines.
+ // GUID -> Summary
+ GVSummaryMapTy FunctionSummaryMap;
+ Index.collectDefinedFunctionsForModule(ModulePath, FunctionSummaryMap);
+
+ // Compute the import list for this module.
+ LLVM_DEBUG(dbgs() << "Computing import for Module '" << ModulePath << "'\n");
+ ComputeImportForModule(FunctionSummaryMap, Index, ModulePath, ImportList);
+
+#ifndef NDEBUG
+ dumpImportListForModule(Index, ModulePath, ImportList);
+#endif
+}
+
+// Mark all external summaries in Index for import into the given module.
+// Used for distributed builds using a distributed index.
+void llvm::ComputeCrossModuleImportForModuleFromIndex(
+ StringRef ModulePath, const ModuleSummaryIndex &Index,
+ FunctionImporter::ImportMapTy &ImportList) {
+ for (auto &GlobalList : Index) {
+ // Ignore entries for undefined references.
+ if (GlobalList.second.SummaryList.empty())
+ continue;
+
+ auto GUID = GlobalList.first;
+ assert(GlobalList.second.SummaryList.size() == 1 &&
+ "Expected individual combined index to have one summary per GUID");
+ auto &Summary = GlobalList.second.SummaryList[0];
+ // Skip the summaries for the importing module. These are included to
+ // e.g. record required linkage changes.
+ if (Summary->modulePath() == ModulePath)
+ continue;
+ // Add an entry to provoke importing by thinBackend.
+ ImportList[Summary->modulePath()].insert(GUID);
+ }
+#ifndef NDEBUG
+ dumpImportListForModule(Index, ModulePath, ImportList);
+#endif
+}
+
+void llvm::computeDeadSymbols(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing) {
+ assert(!Index.withGlobalValueDeadStripping());
+ if (!ComputeDead)
+ return;
+ if (GUIDPreservedSymbols.empty())
+ // Don't do anything when nothing is live; this is friendly to tests.
+ return;
+ unsigned LiveSymbols = 0;
+ SmallVector<ValueInfo, 128> Worklist;
+ Worklist.reserve(GUIDPreservedSymbols.size() * 2);
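+ // Mark every copy of the preserved symbols live; together with the live
+ // roots gathered below, they seed the liveness traversal.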
+ for (auto GUID : GUIDPreservedSymbols) {
+ ValueInfo VI = Index.getValueInfo(GUID);
+ if (!VI)
+ continue;
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ }
+
+ // Add values flagged in the index as live roots to the worklist.
+ for (const auto &Entry : Index) {
+ auto VI = Index.getValueInfo(Entry);
+ for (auto &S : Entry.second.SummaryList)
+ if (S->isLive()) {
+ LLVM_DEBUG(dbgs() << "Live root: " << VI << "\n");
+ Worklist.push_back(VI);
+ ++LiveSymbols;
+ break;
+ }
+ }
+
+ // Make value live and add it to the worklist if it was not live before.
+ auto visit = [&](ValueInfo VI, bool IsAliasee) {
+ // FIXME: If we knew which edges were created for indirect call profiles,
+ // we could skip them here. Any that are live should be reached via
+ // other edges, e.g. reference edges. Otherwise, using a profile collected
+ // on a slightly different binary might provoke preserving, importing
+ // and ultimately promoting calls to functions not linked into this
+ // binary, which increases the binary size unnecessarily. Note that
+ // if this code changes, the importer needs to change so that edges
+ // to functions marked dead are skipped.
+ VI = updateValueInfoForIndirectCalls(Index, VI);
+ if (!VI)
+ return;
+
+ if (llvm::any_of(VI.getSummaryList(),
+ [](const std::unique_ptr<llvm::GlobalValueSummary> &S) {
+ return S->isLive();
+ }))
+ return;
+
+ // We only keep live symbols that are known to be non-prevailing if any are
+ // available_externally, linkonceodr, weakodr. Those symbols are discarded
+ // later in the EliminateAvailableExternally pass and setting them to
+ // not-live could break downstreams users of liveness information (PR36483)
+ // or limit optimization opportunities.
+ if (isPrevailing(VI.getGUID()) == PrevailingType::No) {
+ bool KeepAliveLinkage = false;
+ bool Interposable = false;
+ for (auto &S : VI.getSummaryList()) {
+ if (S->linkage() == GlobalValue::AvailableExternallyLinkage ||
+ S->linkage() == GlobalValue::WeakODRLinkage ||
+ S->linkage() == GlobalValue::LinkOnceODRLinkage)
+ KeepAliveLinkage = true;
+ else if (GlobalValue::isInterposableLinkage(S->linkage()))
+ Interposable = true;
+ }
+
+ if (!IsAliasee) {
+ if (!KeepAliveLinkage)
+ return;
+
+ if (Interposable)
+ report_fatal_error(
+ "Interposable and available_externally/linkonce_odr/weak_odr "
+ "symbol");
+ }
+ }
+
+ for (auto &S : VI.getSummaryList())
+ S->setLive(true);
+ ++LiveSymbols;
+ Worklist.push_back(VI);
+ };
+
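+ // Transitively propagate liveness: walk references, calls and aliasees from
+ // the live roots, marking every summary reached.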
+ while (!Worklist.empty()) {
+ auto VI = Worklist.pop_back_val();
+ for (auto &Summary : VI.getSummaryList()) {
Summary->setLive(true);
- if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
- // If this is an alias, visit the aliasee VI to ensure that all copies
- // are marked live and it is added to the worklist for further
- // processing of its references.
- visit(AS->getAliaseeVI(), true);
- continue;
- }
- for (auto Ref : Summary->refs())
- visit(Ref, false);
- if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
- for (auto Call : FS->calls())
- visit(Call.first, false);
- }
- }
- Index.setWithGlobalValueDeadStripping();
-
- unsigned DeadSymbols = Index.size() - LiveSymbols;
- LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
- << " symbols Dead \n");
- NumDeadSymbols += DeadSymbols;
- NumLiveSymbols += LiveSymbols;
-}
-
-// Compute dead symbols and propagate constants in combined index.
-void llvm::computeDeadSymbolsWithConstProp(
- ModuleSummaryIndex &Index,
- const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
- function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
- bool ImportEnabled) {
- computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
- if (ImportEnabled)
- Index.propagateAttributes(GUIDPreservedSymbols);
-}
-
-/// Compute the set of summaries needed for a ThinLTO backend compilation of
-/// \p ModulePath.
-void llvm::gatherImportedSummariesForModule(
- StringRef ModulePath,
- const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
- const FunctionImporter::ImportMapTy &ImportList,
- std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
- // Include all summaries from the importing module.
- ModuleToSummariesForIndex[std::string(ModulePath)] =
- ModuleToDefinedGVSummaries.lookup(ModulePath);
- // Include summaries for imports.
- for (auto &ILI : ImportList) {
- auto &SummariesForIndex =
- ModuleToSummariesForIndex[std::string(ILI.first())];
- const auto &DefinedGVSummaries =
- ModuleToDefinedGVSummaries.lookup(ILI.first());
- for (auto &GI : ILI.second) {
- const auto &DS = DefinedGVSummaries.find(GI);
- assert(DS != DefinedGVSummaries.end() &&
- "Expected a defined summary for imported global value");
- SummariesForIndex[GI] = DS->second;
- }
- }
-}
-
-/// Emit the files \p ModulePath will import from into \p OutputFilename.
-std::error_code llvm::EmitImportsFiles(
- StringRef ModulePath, StringRef OutputFilename,
- const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
- std::error_code EC;
- raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
- if (EC)
- return EC;
- for (auto &ILI : ModuleToSummariesForIndex)
- // The ModuleToSummariesForIndex map includes an entry for the current
- // Module (needed for writing out the index files). We don't want to
- // include it in the imports file, however, so filter it out.
- if (ILI.first != ModulePath)
- ImportsOS << ILI.first << "\n";
- return std::error_code();
-}
-
-bool llvm::convertToDeclaration(GlobalValue &GV) {
- LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
- << "\n");
- if (Function *F = dyn_cast<Function>(&GV)) {
- F->deleteBody();
- F->clearMetadata();
- F->setComdat(nullptr);
- } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- V->setInitializer(nullptr);
- V->setLinkage(GlobalValue::ExternalLinkage);
- V->clearMetadata();
- V->setComdat(nullptr);
- } else {
- GlobalValue *NewGV;
- if (GV.getValueType()->isFunctionTy())
- NewGV =
- Function::Create(cast<FunctionType>(GV.getValueType()),
- GlobalValue::ExternalLinkage, GV.getAddressSpace(),
- "", GV.getParent());
- else
- NewGV =
- new GlobalVariable(*GV.getParent(), GV.getValueType(),
- /*isConstant*/ false, GlobalValue::ExternalLinkage,
- /*init*/ nullptr, "",
- /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
- GV.getType()->getAddressSpace());
- NewGV->takeName(&GV);
- GV.replaceAllUsesWith(NewGV);
- return false;
- }
- if (!GV.isImplicitDSOLocal())
- GV.setDSOLocal(false);
- return true;
-}
-
-/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis.
-void llvm::thinLTOResolvePrevailingInModule(
- Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
- auto updateLinkage = [&](GlobalValue &GV) {
- // See if the global summary analysis computed a new resolved linkage.
- const auto &GS = DefinedGlobals.find(GV.getGUID());
- if (GS == DefinedGlobals.end())
- return;
- auto NewLinkage = GS->second->linkage();
- if (NewLinkage == GV.getLinkage())
- return;
- if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
- // Don't internalize anything here, because the code below
- // lacks necessary correctness checks. Leave this job to
- // the LLVM 'internalize' pass.
- GlobalValue::isLocalLinkage(NewLinkage) ||
- // In case it was dead and already converted to declaration.
- GV.isDeclaration())
- return;
-
- // Check for a non-prevailing def that has interposable linkage
- // (e.g. non-odr weak or linkonce). In that case we can't simply
- // convert to available_externally, since it would lose the
- // interposable property and possibly get inlined. Simply drop
- // the definition in that case.
- if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
- GlobalValue::isInterposableLinkage(GV.getLinkage())) {
- if (!convertToDeclaration(GV))
- // FIXME: Change this to collect replaced GVs and later erase
- // them from the parent module once thinLTOResolvePrevailingGUID is
- // changed to enable this for aliases.
- llvm_unreachable("Expected GV to be converted");
- } else {
- // If all copies of the original symbol had global unnamed addr and
- // linkonce_odr linkage, it should be an auto hide symbol. In that case
- // the thin link would have marked it as CanAutoHide. Add hidden visibility
- // to the symbol to preserve the property.
- if (NewLinkage == GlobalValue::WeakODRLinkage &&
- GS->second->canAutoHide()) {
- assert(GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr());
- GV.setVisibility(GlobalValue::HiddenVisibility);
- }
-
- LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
- << "` from " << GV.getLinkage() << " to " << NewLinkage
- << "\n");
- GV.setLinkage(NewLinkage);
- }
- // Remove declarations from comdats, including available_externally
- // as this is a declaration for the linker, and will be dropped eventually.
- // It is illegal for comdats to contain declarations.
- auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
- GO->setComdat(nullptr);
- };
-
- // Process functions and globals now
- for (auto &GV : TheModule)
- updateLinkage(GV);
- for (auto &GV : TheModule.globals())
- updateLinkage(GV);
- for (auto &GV : TheModule.aliases())
- updateLinkage(GV);
-}
-
-/// Run internalization on \p TheModule based on summary analysis.
-void llvm::thinLTOInternalizeModule(Module &TheModule,
- const GVSummaryMapTy &DefinedGlobals) {
- // Declare a callback for the internalize pass that will ask for every
- // candidate GlobalValue if it can be internalized or not.
- auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
- // Lookup the linkage recorded in the summaries during global analysis.
- auto GS = DefinedGlobals.find(GV.getGUID());
- if (GS == DefinedGlobals.end()) {
- // Must have been promoted (possibly conservatively). Find original
- // name so that we can access the correct summary and see if it can
- // be internalized again.
- // FIXME: Eventually we should control promotion instead of promoting
- // and internalizing again.
- StringRef OrigName =
- ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
- std::string OrigId = GlobalValue::getGlobalIdentifier(
- OrigName, GlobalValue::InternalLinkage,
- TheModule.getSourceFileName());
- GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
- if (GS == DefinedGlobals.end()) {
- // Also check the original non-promoted non-globalized name. In some
- // cases a preempted weak value is linked in as a local copy because
- // it is referenced by an alias (IRLinker::linkGlobalValueProto).
- // In that case, since it was originally not a local value, it was
- // recorded in the index using the original name.
- // FIXME: This may not be needed once PR27866 is fixed.
- GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
- assert(GS != DefinedGlobals.end());
- }
- }
- return !GlobalValue::isLocalLinkage(GS->second->linkage());
- };
-
- // FIXME: See if we can just internalize directly here via linkage changes
- // based on the index, rather than invoking internalizeModule.
- internalizeModule(TheModule, MustPreserveGV);
-}
-
-/// Make alias a clone of its aliasee.
-static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
- Function *Fn = cast<Function>(GA->getBaseObject());
-
- ValueToValueMapTy VMap;
- Function *NewFn = CloneFunction(Fn, VMap);
- // Clone should use the original alias's linkage, visibility and name, and we
- // ensure all uses of alias instead use the new clone (casted if necessary).
- NewFn->setLinkage(GA->getLinkage());
- NewFn->setVisibility(GA->getVisibility());
- GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType()));
- NewFn->takeName(GA);
- return NewFn;
-}
-
-// Internalize values that we marked with specific attribute
-// in processGlobalForThinLTO.
-static void internalizeGVsAfterImport(Module &M) {
- for (auto &GV : M.globals())
- // Skip GVs which have been converted to declarations
- // by dropDeadSymbols.
- if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) {
- GV.setLinkage(GlobalValue::InternalLinkage);
- GV.setVisibility(GlobalValue::DefaultVisibility);
- }
-}
-
-// Automatically import functions in Module \p DestModule based on the summaries
-// index.
-Expected<bool> FunctionImporter::importFunctions(
- Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
- LLVM_DEBUG(dbgs() << "Starting import for Module "
- << DestModule.getModuleIdentifier() << "\n");
- unsigned ImportedCount = 0, ImportedGVCount = 0;
-
- IRMover Mover(DestModule);
- // Do the actual import of functions now, one Module at a time
- std::set<StringRef> ModuleNameOrderedList;
- for (auto &FunctionsToImportPerModule : ImportList) {
- ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
- }
- for (auto &Name : ModuleNameOrderedList) {
- // Get the module for the import
- const auto &FunctionsToImportPerModule = ImportList.find(Name);
- assert(FunctionsToImportPerModule != ImportList.end());
- Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
- if (!SrcModuleOrErr)
- return SrcModuleOrErr.takeError();
- std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
- assert(&DestModule.getContext() == &SrcModule->getContext() &&
- "Context mismatch");
-
- // If modules were created with lazy metadata loading, materialize it
- // now, before linking it (otherwise this will be a noop).
- if (Error Err = SrcModule->materializeMetadata())
- return std::move(Err);
-
- auto &ImportGUIDs = FunctionsToImportPerModule->second;
- // Find the globals to import
- SetVector<GlobalValue *> GlobalsToImport;
- for (Function &F : *SrcModule) {
- if (!F.hasName())
- continue;
- auto GUID = F.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
- << GUID << " " << F.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = F.materialize())
- return std::move(Err);
- if (EnableImportMetadata) {
- // Add 'thinlto_src_module' metadata for statistics and debugging.
- F.setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
- }
- GlobalsToImport.insert(&F);
- }
- }
- for (GlobalVariable &GV : SrcModule->globals()) {
- if (!GV.hasName())
- continue;
- auto GUID = GV.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
- << GUID << " " << GV.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = GV.materialize())
- return std::move(Err);
- ImportedGVCount += GlobalsToImport.insert(&GV);
- }
- }
- for (GlobalAlias &GA : SrcModule->aliases()) {
- if (!GA.hasName())
- continue;
- auto GUID = GA.getGUID();
- auto Import = ImportGUIDs.count(GUID);
- LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
- << GUID << " " << GA.getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (Import) {
- if (Error Err = GA.materialize())
- return std::move(Err);
- // Import alias as a copy of its aliasee.
- GlobalObject *Base = GA.getBaseObject();
- if (Error Err = Base->materialize())
- return std::move(Err);
- auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
- LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
- << " " << Base->getName() << " from "
- << SrcModule->getSourceFileName() << "\n");
- if (EnableImportMetadata) {
- // Add 'thinlto_src_module' metadata for statistics and debugging.
- Fn->setMetadata(
- "thinlto_src_module",
- MDNode::get(DestModule.getContext(),
- {MDString::get(DestModule.getContext(),
- SrcModule->getSourceFileName())}));
- }
- GlobalsToImport.insert(Fn);
- }
- }
-
- // Upgrade debug info after we're done materializing all the globals and we
- // have loaded all the required metadata!
- UpgradeDebugInfo(*SrcModule);
-
- // Set the partial sample profile ratio in the profile summary module flag
- // of the imported source module, if applicable, so that the profile summary
- // module flag will match with that of the destination module when it's
- // imported.
- SrcModule->setPartialSampleProfileRatio(Index);
-
- // Link in the specified functions.
- if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
- &GlobalsToImport))
- return true;
-
- if (PrintImports) {
- for (const auto *GV : GlobalsToImport)
- dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
- << " from " << SrcModule->getSourceFileName() << "\n";
- }
-
- if (Error Err = Mover.move(
- std::move(SrcModule), GlobalsToImport.getArrayRef(),
- [](GlobalValue &, IRMover::ValueAdder) {},
- /*IsPerformingImport=*/true))
- report_fatal_error("Function Import: link error: " +
- toString(std::move(Err)));
-
- ImportedCount += GlobalsToImport.size();
- NumImportedModules++;
- }
-
- internalizeGVsAfterImport(DestModule);
-
- NumImportedFunctions += (ImportedCount - ImportedGVCount);
- NumImportedGlobalVars += ImportedGVCount;
-
- LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
- << " functions for Module "
- << DestModule.getModuleIdentifier() << "\n");
- LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
- << " global variables for Module "
- << DestModule.getModuleIdentifier() << "\n");
- return ImportedCount;
-}
-
-static bool doImportingForModule(Module &M) {
- if (SummaryFile.empty())
- report_fatal_error("error: -function-import requires -summary-file\n");
- Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
- getModuleSummaryIndexForFile(SummaryFile);
- if (!IndexPtrOrErr) {
- logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
- "Error loading file '" + SummaryFile + "': ");
- return false;
- }
- std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
-
- // First step is collecting the import list.
- FunctionImporter::ImportMapTy ImportList;
- // If requested, simply import all functions in the index. This is used
- // when testing distributed backend handling via the opt tool, when
- // we have distributed indexes containing exactly the summaries to import.
- if (ImportAllIndex)
- ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index,
- ImportList);
- else
- ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
- ImportList);
-
- // Conservatively mark all internal values as promoted. This interface is
- // only used when doing importing via the function importing pass. The pass
- // is only enabled when testing importing via the 'opt' tool, which does
- // not do the ThinLink that would normally determine what values to promote.
- for (auto &I : *Index) {
- for (auto &S : I.second.SummaryList) {
- if (GlobalValue::isLocalLinkage(S->linkage()))
- S->setLinkage(GlobalValue::ExternalLinkage);
- }
- }
-
- // Next we need to promote to global scope and rename any local values that
- // are potentially exported to other modules.
+ if (auto *AS = dyn_cast<AliasSummary>(Summary.get())) {
+ // If this is an alias, visit the aliasee VI to ensure that all copies
+ // are marked live and it is added to the worklist for further
+ // processing of its references.
+ visit(AS->getAliaseeVI(), true);
+ continue;
+ }
+ for (auto Ref : Summary->refs())
+ visit(Ref, false);
+ if (auto *FS = dyn_cast<FunctionSummary>(Summary.get()))
+ for (auto Call : FS->calls())
+ visit(Call.first, false);
+ }
+ }
+ Index.setWithGlobalValueDeadStripping();
+
+ unsigned DeadSymbols = Index.size() - LiveSymbols;
+ LLVM_DEBUG(dbgs() << LiveSymbols << " symbols Live, and " << DeadSymbols
+ << " symbols Dead \n");
+ NumDeadSymbols += DeadSymbols;
+ NumLiveSymbols += LiveSymbols;
+}
+
+// Compute dead symbols and propagate constants in combined index.
+void llvm::computeDeadSymbolsWithConstProp(
+ ModuleSummaryIndex &Index,
+ const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols,
+ function_ref<PrevailingType(GlobalValue::GUID)> isPrevailing,
+ bool ImportEnabled) {
+ computeDeadSymbols(Index, GUIDPreservedSymbols, isPrevailing);
+ if (ImportEnabled)
+ Index.propagateAttributes(GUIDPreservedSymbols);
+}
+
+/// Compute the set of summaries needed for a ThinLTO backend compilation of
+/// \p ModulePath.
+void llvm::gatherImportedSummariesForModule(
+ StringRef ModulePath,
+ const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
+ const FunctionImporter::ImportMapTy &ImportList,
+ std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ // Include all summaries from the importing module.
+ ModuleToSummariesForIndex[std::string(ModulePath)] =
+ ModuleToDefinedGVSummaries.lookup(ModulePath);
+ // Include summaries for imports.
+ for (auto &ILI : ImportList) {
+ auto &SummariesForIndex =
+ ModuleToSummariesForIndex[std::string(ILI.first())];
+ const auto &DefinedGVSummaries =
+ ModuleToDefinedGVSummaries.lookup(ILI.first());
+ for (auto &GI : ILI.second) {
+ const auto &DS = DefinedGVSummaries.find(GI);
+ assert(DS != DefinedGVSummaries.end() &&
+ "Expected a defined summary for imported global value");
+ SummariesForIndex[GI] = DS->second;
+ }
+ }
+}
+
+/// Emit the files \p ModulePath will import from into \p OutputFilename.
+std::error_code llvm::EmitImportsFiles(
+ StringRef ModulePath, StringRef OutputFilename,
+ const std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
+ std::error_code EC;
+ raw_fd_ostream ImportsOS(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
+ if (EC)
+ return EC;
+ for (auto &ILI : ModuleToSummariesForIndex)
+ // The ModuleToSummariesForIndex map includes an entry for the current
+ // Module (needed for writing out the index files). We don't want to
+ // include it in the imports file, however, so filter it out.
+ if (ILI.first != ModulePath)
+ ImportsOS << ILI.first << "\n";
+ return std::error_code();
+}
+
+bool llvm::convertToDeclaration(GlobalValue &GV) {
+ LLVM_DEBUG(dbgs() << "Converting to a declaration: `" << GV.getName()
+ << "\n");
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ F->deleteBody();
+ F->clearMetadata();
+ F->setComdat(nullptr);
+ } else if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ V->setInitializer(nullptr);
+ V->setLinkage(GlobalValue::ExternalLinkage);
+ V->clearMetadata();
+ V->setComdat(nullptr);
+ } else {
+ GlobalValue *NewGV;
+ if (GV.getValueType()->isFunctionTy())
+ NewGV =
+ Function::Create(cast<FunctionType>(GV.getValueType()),
+ GlobalValue::ExternalLinkage, GV.getAddressSpace(),
+ "", GV.getParent());
+ else
+ NewGV =
+ new GlobalVariable(*GV.getParent(), GV.getValueType(),
+ /*isConstant*/ false, GlobalValue::ExternalLinkage,
+ /*init*/ nullptr, "",
+ /*insertbefore*/ nullptr, GV.getThreadLocalMode(),
+ GV.getType()->getAddressSpace());
+ NewGV->takeName(&GV);
+ GV.replaceAllUsesWith(NewGV);
+ return false;
+ }
+ if (!GV.isImplicitDSOLocal())
+ GV.setDSOLocal(false);
+ return true;
+}
+
+/// Fixup prevailing symbol linkages in \p TheModule based on summary analysis.
+void llvm::thinLTOResolvePrevailingInModule(
+ Module &TheModule, const GVSummaryMapTy &DefinedGlobals) {
+ auto updateLinkage = [&](GlobalValue &GV) {
+ // See if the global summary analysis computed a new resolved linkage.
+ const auto &GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end())
+ return;
+ auto NewLinkage = GS->second->linkage();
+ if (NewLinkage == GV.getLinkage())
+ return;
+ if (GlobalValue::isLocalLinkage(GV.getLinkage()) ||
+ // Don't internalize anything here, because the code below
+ // lacks necessary correctness checks. Leave this job to
+ // LLVM 'internalize' pass.
+ GlobalValue::isLocalLinkage(NewLinkage) ||
+ // In case it was dead and already converted to declaration.
+ GV.isDeclaration())
+ return;
+
+ // Check for a non-prevailing def that has interposable linkage
+ // (e.g. non-odr weak or linkonce). In that case we can't simply
+ // convert to available_externally, since it would lose the
+ // interposable property and possibly get inlined. Simply drop
+ // the definition in that case.
+ if (GlobalValue::isAvailableExternallyLinkage(NewLinkage) &&
+ GlobalValue::isInterposableLinkage(GV.getLinkage())) {
+ if (!convertToDeclaration(GV))
+ // FIXME: Change this to collect replaced GVs and later erase
+ // them from the parent module once thinLTOResolvePrevailingGUID is
+ // changed to enable this for aliases.
+ llvm_unreachable("Expected GV to be converted");
+ } else {
+ // If all copies of the original symbol had global unnamed addr and
+ // linkonce_odr linkage, it should be an auto hide symbol. In that case
+ // the thin link would have marked it as CanAutoHide. Add hidden visibility
+ // to the symbol to preserve the property.
+ if (NewLinkage == GlobalValue::WeakODRLinkage &&
+ GS->second->canAutoHide()) {
+ assert(GV.hasLinkOnceODRLinkage() && GV.hasGlobalUnnamedAddr());
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ LLVM_DEBUG(dbgs() << "ODR fixing up linkage for `" << GV.getName()
+ << "` from " << GV.getLinkage() << " to " << NewLinkage
+ << "\n");
+ GV.setLinkage(NewLinkage);
+ }
+ // Remove declarations from comdats, including available_externally
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast_or_null<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat())
+ GO->setComdat(nullptr);
+ };
+
+  // Process functions, globals and aliases now.
+ for (auto &GV : TheModule)
+ updateLinkage(GV);
+ for (auto &GV : TheModule.globals())
+ updateLinkage(GV);
+ for (auto &GV : TheModule.aliases())
+ updateLinkage(GV);
+}
+
+/// Run internalization on \p TheModule based on summary analysis.
+void llvm::thinLTOInternalizeModule(Module &TheModule,
+ const GVSummaryMapTy &DefinedGlobals) {
+ // Declare a callback for the internalize pass that will ask for every
+ // candidate GlobalValue if it can be internalized or not.
+ auto MustPreserveGV = [&](const GlobalValue &GV) -> bool {
+ // Lookup the linkage recorded in the summaries during global analysis.
+ auto GS = DefinedGlobals.find(GV.getGUID());
+ if (GS == DefinedGlobals.end()) {
+ // Must have been promoted (possibly conservatively). Find original
+ // name so that we can access the correct summary and see if it can
+ // be internalized again.
+ // FIXME: Eventually we should control promotion instead of promoting
+ // and internalizing again.
+ StringRef OrigName =
+ ModuleSummaryIndex::getOriginalNameBeforePromote(GV.getName());
+ std::string OrigId = GlobalValue::getGlobalIdentifier(
+ OrigName, GlobalValue::InternalLinkage,
+ TheModule.getSourceFileName());
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+ if (GS == DefinedGlobals.end()) {
+ // Also check the original non-promoted non-globalized name. In some
+ // cases a preempted weak value is linked in as a local copy because
+ // it is referenced by an alias (IRLinker::linkGlobalValueProto).
+ // In that case, since it was originally not a local value, it was
+ // recorded in the index using the original name.
+ // FIXME: This may not be needed once PR27866 is fixed.
+ GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+ assert(GS != DefinedGlobals.end());
+ }
+ }
+ return !GlobalValue::isLocalLinkage(GS->second->linkage());
+ };
+
+ // FIXME: See if we can just internalize directly here via linkage changes
+ // based on the index, rather than invoking internalizeModule.
+ internalizeModule(TheModule, MustPreserveGV);
+}
+
+/// Make alias a clone of its aliasee.
+static Function *replaceAliasWithAliasee(Module *SrcModule, GlobalAlias *GA) {
+ Function *Fn = cast<Function>(GA->getBaseObject());
+
+ ValueToValueMapTy VMap;
+ Function *NewFn = CloneFunction(Fn, VMap);
+ // Clone should use the original alias's linkage, visibility and name, and we
+ // ensure all uses of alias instead use the new clone (casted if necessary).
+ NewFn->setLinkage(GA->getLinkage());
+ NewFn->setVisibility(GA->getVisibility());
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewFn, GA->getType()));
+ NewFn->takeName(GA);
+ return NewFn;
+}
+
+// Internalize values that we marked with a specific attribute
+// in processGlobalForThinLTO.
+static void internalizeGVsAfterImport(Module &M) {
+ for (auto &GV : M.globals())
+ // Skip GVs which have been converted to declarations
+ // by dropDeadSymbols.
+ if (!GV.isDeclaration() && GV.hasAttribute("thinlto-internalize")) {
+ GV.setLinkage(GlobalValue::InternalLinkage);
+ GV.setVisibility(GlobalValue::DefaultVisibility);
+ }
+}
+
+// Automatically import functions in Module \p DestModule based on the summaries
+// index.
+Expected<bool> FunctionImporter::importFunctions(
+ Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
+ LLVM_DEBUG(dbgs() << "Starting import for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ unsigned ImportedCount = 0, ImportedGVCount = 0;
+
+ IRMover Mover(DestModule);
+ // Do the actual import of functions now, one Module at a time
+ std::set<StringRef> ModuleNameOrderedList;
+ for (auto &FunctionsToImportPerModule : ImportList) {
+ ModuleNameOrderedList.insert(FunctionsToImportPerModule.first());
+ }
+ for (auto &Name : ModuleNameOrderedList) {
+ // Get the module for the import
+ const auto &FunctionsToImportPerModule = ImportList.find(Name);
+ assert(FunctionsToImportPerModule != ImportList.end());
+ Expected<std::unique_ptr<Module>> SrcModuleOrErr = ModuleLoader(Name);
+ if (!SrcModuleOrErr)
+ return SrcModuleOrErr.takeError();
+ std::unique_ptr<Module> SrcModule = std::move(*SrcModuleOrErr);
+ assert(&DestModule.getContext() == &SrcModule->getContext() &&
+ "Context mismatch");
+
+ // If modules were created with lazy metadata loading, materialize it
+ // now, before linking it (otherwise this will be a noop).
+ if (Error Err = SrcModule->materializeMetadata())
+ return std::move(Err);
+
+ auto &ImportGUIDs = FunctionsToImportPerModule->second;
+ // Find the globals to import
+ SetVector<GlobalValue *> GlobalsToImport;
+ for (Function &F : *SrcModule) {
+ if (!F.hasName())
+ continue;
+ auto GUID = F.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing function "
+ << GUID << " " << F.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = F.materialize())
+ return std::move(Err);
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ F.setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(&F);
+ }
+ }
+ for (GlobalVariable &GV : SrcModule->globals()) {
+ if (!GV.hasName())
+ continue;
+ auto GUID = GV.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing global "
+ << GUID << " " << GV.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = GV.materialize())
+ return std::move(Err);
+ ImportedGVCount += GlobalsToImport.insert(&GV);
+ }
+ }
+ for (GlobalAlias &GA : SrcModule->aliases()) {
+ if (!GA.hasName())
+ continue;
+ auto GUID = GA.getGUID();
+ auto Import = ImportGUIDs.count(GUID);
+ LLVM_DEBUG(dbgs() << (Import ? "Is" : "Not") << " importing alias "
+ << GUID << " " << GA.getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (Import) {
+ if (Error Err = GA.materialize())
+ return std::move(Err);
+ // Import alias as a copy of its aliasee.
+ GlobalObject *Base = GA.getBaseObject();
+ if (Error Err = Base->materialize())
+ return std::move(Err);
+ auto *Fn = replaceAliasWithAliasee(SrcModule.get(), &GA);
+ LLVM_DEBUG(dbgs() << "Is importing aliasee fn " << Base->getGUID()
+ << " " << Base->getName() << " from "
+ << SrcModule->getSourceFileName() << "\n");
+ if (EnableImportMetadata) {
+ // Add 'thinlto_src_module' metadata for statistics and debugging.
+ Fn->setMetadata(
+ "thinlto_src_module",
+ MDNode::get(DestModule.getContext(),
+ {MDString::get(DestModule.getContext(),
+ SrcModule->getSourceFileName())}));
+ }
+ GlobalsToImport.insert(Fn);
+ }
+ }
+
+ // Upgrade debug info after we're done materializing all the globals and we
+ // have loaded all the required metadata!
+ UpgradeDebugInfo(*SrcModule);
+
+ // Set the partial sample profile ratio in the profile summary module flag
+ // of the imported source module, if applicable, so that the profile summary
+ // module flag will match with that of the destination module when it's
+ // imported.
+ SrcModule->setPartialSampleProfileRatio(Index);
+
+ // Link in the specified functions.
+ if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
+ &GlobalsToImport))
+ return true;
+
+ if (PrintImports) {
+ for (const auto *GV : GlobalsToImport)
+ dbgs() << DestModule.getSourceFileName() << ": Import " << GV->getName()
+ << " from " << SrcModule->getSourceFileName() << "\n";
+ }
+
+ if (Error Err = Mover.move(
+ std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
+ report_fatal_error("Function Import: link error: " +
+ toString(std::move(Err)));
+
+ ImportedCount += GlobalsToImport.size();
+ NumImportedModules++;
+ }
+
+ internalizeGVsAfterImport(DestModule);
+
+ NumImportedFunctions += (ImportedCount - ImportedGVCount);
+ NumImportedGlobalVars += ImportedGVCount;
+
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedCount - ImportedGVCount
+ << " functions for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ LLVM_DEBUG(dbgs() << "Imported " << ImportedGVCount
+ << " global variables for Module "
+ << DestModule.getModuleIdentifier() << "\n");
+ return ImportedCount;
+}
+
+static bool doImportingForModule(Module &M) {
+ if (SummaryFile.empty())
+ report_fatal_error("error: -function-import requires -summary-file\n");
+ Expected<std::unique_ptr<ModuleSummaryIndex>> IndexPtrOrErr =
+ getModuleSummaryIndexForFile(SummaryFile);
+ if (!IndexPtrOrErr) {
+ logAllUnhandledErrors(IndexPtrOrErr.takeError(), errs(),
+ "Error loading file '" + SummaryFile + "': ");
+ return false;
+ }
+ std::unique_ptr<ModuleSummaryIndex> Index = std::move(*IndexPtrOrErr);
+
+ // First step is collecting the import list.
+ FunctionImporter::ImportMapTy ImportList;
+ // If requested, simply import all functions in the index. This is used
+ // when testing distributed backend handling via the opt tool, when
+ // we have distributed indexes containing exactly the summaries to import.
+ if (ImportAllIndex)
+ ComputeCrossModuleImportForModuleFromIndex(M.getModuleIdentifier(), *Index,
+ ImportList);
+ else
+ ComputeCrossModuleImportForModule(M.getModuleIdentifier(), *Index,
+ ImportList);
+
+ // Conservatively mark all internal values as promoted. This interface is
+ // only used when doing importing via the function importing pass. The pass
+ // is only enabled when testing importing via the 'opt' tool, which does
+ // not do the ThinLink that would normally determine what values to promote.
+ for (auto &I : *Index) {
+ for (auto &S : I.second.SummaryList) {
+ if (GlobalValue::isLocalLinkage(S->linkage()))
+ S->setLinkage(GlobalValue::ExternalLinkage);
+ }
+ }
+
+ // Next we need to promote to global scope and rename any local values that
+ // are potentially exported to other modules.
if (renameModuleForThinLTO(M, *Index, /*ClearDSOLocalOnDeclarations=*/false,
- /*GlobalsToImport=*/nullptr)) {
- errs() << "Error renaming module\n";
- return false;
- }
-
- // Perform the import now.
- auto ModuleLoader = [&M](StringRef Identifier) {
- return loadFile(std::string(Identifier), M.getContext());
- };
- FunctionImporter Importer(*Index, ModuleLoader,
- /*ClearDSOLocalOnDeclarations=*/false);
- Expected<bool> Result = Importer.importFunctions(M, ImportList);
-
- // FIXME: Probably need to propagate Errors through the pass manager.
- if (!Result) {
- logAllUnhandledErrors(Result.takeError(), errs(),
- "Error importing module: ");
- return false;
- }
-
- return *Result;
-}
-
-namespace {
-
-/// Pass that performs cross-module function import provided a summary file.
-class FunctionImportLegacyPass : public ModulePass {
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit FunctionImportLegacyPass() : ModulePass(ID) {}
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Function Importing"; }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return doImportingForModule(M);
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses FunctionImportPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!doImportingForModule(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-char FunctionImportLegacyPass::ID = 0;
-INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
- "Summary Based Function Import", false, false)
-
-namespace llvm {
-
-Pass *createFunctionImportPass() {
- return new FunctionImportLegacyPass();
-}
-
-} // end namespace llvm
+ /*GlobalsToImport=*/nullptr)) {
+ errs() << "Error renaming module\n";
+ return false;
+ }
+
+ // Perform the import now.
+ auto ModuleLoader = [&M](StringRef Identifier) {
+ return loadFile(std::string(Identifier), M.getContext());
+ };
+ FunctionImporter Importer(*Index, ModuleLoader,
+ /*ClearDSOLocalOnDeclarations=*/false);
+ Expected<bool> Result = Importer.importFunctions(M, ImportList);
+
+ // FIXME: Probably need to propagate Errors through the pass manager.
+ if (!Result) {
+ logAllUnhandledErrors(Result.takeError(), errs(),
+ "Error importing module: ");
+ return false;
+ }
+
+ return *Result;
+}
+
+namespace {
+
+/// Pass that performs cross-module function import provided a summary file.
+class FunctionImportLegacyPass : public ModulePass {
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit FunctionImportLegacyPass() : ModulePass(ID) {}
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Function Importing"; }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return doImportingForModule(M);
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses FunctionImportPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!doImportingForModule(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char FunctionImportLegacyPass::ID = 0;
+INITIALIZE_PASS(FunctionImportLegacyPass, "function-import",
+ "Summary Based Function Import", false, false)
+
+namespace llvm {
+
+Pass *createFunctionImportPass() {
+ return new FunctionImportLegacyPass();
+}
+
+} // end namespace llvm
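A minimal sketch of how the exported entry points above compose in a per-module ThinLTO-style backend, assuming the thin link has already produced the combined summary Index, this module's ImportList, and its DefinedGlobals map, and that the caller supplies a module-loader callback (FunctionImporter::ModuleLoaderTy from FunctionImport.h). It follows the same resolve-prevailing / import / internalize order that doImportingForModule uses; the helper name and parameter bundling are illustrative, not in-tree API:

// Sketch only: Index, ImportList, DefinedGlobals and LoadModule are assumed
// to be supplied by the thin link / the caller.
#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
using namespace llvm;

static Expected<bool>
runBackendSketch(Module &M, ModuleSummaryIndex &Index,
                 const FunctionImporter::ImportMapTy &ImportList,
                 const GVSummaryMapTy &DefinedGlobals,
                 FunctionImporter::ModuleLoaderTy LoadModule) {
  // Apply the linkage decisions recorded in the summary before importing,
  // so non-prevailing copies are dropped or demoted first.
  thinLTOResolvePrevailingInModule(M, DefinedGlobals);

  // Import the functions, globals and aliases selected for this module.
  FunctionImporter Importer(Index, std::move(LoadModule),
                            /*ClearDSOLocalOnDeclarations=*/false);
  Expected<bool> Changed = Importer.importFunctions(M, ImportList);
  if (!Changed)
    return Changed.takeError();

  // Internalize whatever the summary says is not exported.
  thinLTOInternalizeModule(M, DefinedGlobals);
  return *Changed;
}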
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
index 6322e51552..fb4cb23b83 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalDCE.cpp
@@ -1,460 +1,460 @@
-//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transform is designed to eliminate unreachable internal globals from the
-// program. It uses an aggressive algorithm, searching out globals that are
-// known to be alive. After it finds all of the globals which are needed, it
-// deletes whatever is left over. This allows it to delete recursive chunks of
-// the program which are unreachable.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalDCE.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "globaldce"
-
-static cl::opt<bool>
- ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore,
- cl::desc("Enable virtual function elimination"));
-
-STATISTIC(NumAliases , "Number of global aliases removed");
-STATISTIC(NumFunctions, "Number of functions removed");
-STATISTIC(NumIFuncs, "Number of indirect functions removed");
-STATISTIC(NumVariables, "Number of global variables removed");
-STATISTIC(NumVFuncs, "Number of virtual functions removed");
-
-namespace {
- class GlobalDCELegacyPass : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- GlobalDCELegacyPass() : ModulePass(ID) {
- initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // run - Do the GlobalDCE pass on the specified module, optionally updating
- // the specified callgraph to reflect the changes.
- //
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // We need a minimally functional dummy module analysis manager. It needs
- // to at least know about the possibility of proxying a function analysis
- // manager.
- FunctionAnalysisManager DummyFAM;
- ModuleAnalysisManager DummyMAM;
- DummyMAM.registerPass(
- [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
-
- auto PA = Impl.run(M, DummyMAM);
- return !PA.areAllPreserved();
- }
-
- private:
- GlobalDCEPass Impl;
- };
-}
-
-char GlobalDCELegacyPass::ID = 0;
-INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
- "Dead Global Elimination", false, false)
-
-// Public interface to the GlobalDCEPass.
-ModulePass *llvm::createGlobalDCEPass() {
- return new GlobalDCELegacyPass();
-}
-
-/// Returns true if F is effectively empty.
-static bool isEmptyFunction(Function *F) {
- BasicBlock &Entry = F->getEntryBlock();
- for (auto &I : Entry) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- if (auto *RI = dyn_cast<ReturnInst>(&I))
- return !RI->getReturnValue();
- break;
- }
- return false;
-}
-
-/// Compute the set of GlobalValues that depend on V.
-/// The recursion stops as soon as a GlobalValue is met.
-void GlobalDCEPass::ComputeDependencies(Value *V,
- SmallPtrSetImpl<GlobalValue *> &Deps) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- Function *Parent = I->getParent()->getParent();
- Deps.insert(Parent);
- } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
- Deps.insert(GV);
- } else if (auto *CE = dyn_cast<Constant>(V)) {
- // Avoid walking the whole tree of a big ConstantExprs multiple times.
- auto Where = ConstantDependenciesCache.find(CE);
- if (Where != ConstantDependenciesCache.end()) {
- auto const &K = Where->second;
- Deps.insert(K.begin(), K.end());
- } else {
- SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
- for (User *CEUser : CE->users())
- ComputeDependencies(CEUser, LocalDeps);
- Deps.insert(LocalDeps.begin(), LocalDeps.end());
- }
- }
-}
-
-void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
- SmallPtrSet<GlobalValue *, 8> Deps;
- for (User *User : GV.users())
- ComputeDependencies(User, Deps);
- Deps.erase(&GV); // Remove self-reference.
- for (GlobalValue *GVU : Deps) {
- // If this is a dep from a vtable to a virtual function, and we have
- // complete information about all virtual call sites which could call
-    // through this vtable, then skip it, because the call site information will
- // be more precise.
- if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) {
- LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> "
- << GV.getName() << "\n");
- continue;
- }
- GVDependencies[GVU].insert(&GV);
- }
-}
-
-/// Mark the GlobalValue as live.
-void GlobalDCEPass::MarkLive(GlobalValue &GV,
- SmallVectorImpl<GlobalValue *> *Updates) {
- auto const Ret = AliveGlobals.insert(&GV);
- if (!Ret.second)
- return;
-
- if (Updates)
- Updates->push_back(&GV);
- if (Comdat *C = GV.getComdat()) {
- for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
- MarkLive(*CM.second, Updates); // Recursion depth is only two because only
- // globals in the same comdat are visited.
- }
- }
-}
-
-void GlobalDCEPass::ScanVTables(Module &M) {
- SmallVector<MDNode *, 2> Types;
- LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n");
-
- auto *LTOPostLinkMD =
- cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink"));
- bool LTOPostLink =
- LTOPostLinkMD &&
- (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0);
-
- for (GlobalVariable &GV : M.globals()) {
- Types.clear();
- GV.getMetadata(LLVMContext::MD_type, Types);
- if (GV.isDeclaration() || Types.empty())
- continue;
-
- // Use the typeid metadata on the vtable to build a mapping from typeids to
- // the list of (GV, offset) pairs which are the possible vtables for that
- // typeid.
- for (MDNode *Type : Types) {
- Metadata *TypeID = Type->getOperand(1).get();
-
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
-
- TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset));
- }
-
- // If the type corresponding to the vtable is private to this translation
- // unit, we know that we can see all virtual functions which might use it,
- // so VFE is safe.
- if (auto GO = dyn_cast<GlobalObject>(&GV)) {
- GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility();
- if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit ||
- (LTOPostLink &&
- TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) {
- LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n");
- VFESafeVTables.insert(&GV);
- }
- }
- }
-}
-
-void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
- uint64_t CallOffset) {
- for (auto &VTableInfo : TypeIdMap[TypeId]) {
- GlobalVariable *VTable = VTableInfo.first;
- uint64_t VTableOffset = VTableInfo.second;
-
- Constant *Ptr =
- getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset,
- *Caller->getParent());
- if (!Ptr) {
- LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n");
- VFESafeVTables.erase(VTable);
- return;
- }
-
- auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!Callee) {
- LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n");
- VFESafeVTables.erase(VTable);
- return;
- }
-
- LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> "
- << Callee->getName() << "\n");
- GVDependencies[Caller].insert(Callee);
- }
-}
-
-void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
- LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
-
- if (!TypeCheckedLoadFunc)
- return;
-
- for (auto U : TypeCheckedLoadFunc->users()) {
- auto CI = dyn_cast<CallInst>(U);
- if (!CI)
- continue;
-
- auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- Value *TypeIdValue = CI->getArgOperand(2);
- auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
-
- if (Offset) {
- ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue());
- } else {
- // type.checked.load with a non-constant offset, so assume every entry in
- // every matching vtable is used.
- for (auto &VTableInfo : TypeIdMap[TypeId]) {
- VFESafeVTables.erase(VTableInfo.first);
- }
- }
- }
-}
-
-void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) {
- if (!ClEnableVFE)
- return;
-
- // If the Virtual Function Elim module flag is present and set to zero, then
- // the vcall_visibility metadata was inserted for another optimization (WPD)
- // and we may not have type checked loads on all accesses to the vtable.
- // Don't attempt VFE in that case.
- auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
- M.getModuleFlag("Virtual Function Elim"));
- if (!Val || Val->getZExtValue() == 0)
- return;
-
- ScanVTables(M);
-
- if (VFESafeVTables.empty())
- return;
-
- ScanTypeCheckedLoadIntrinsics(M);
-
- LLVM_DEBUG(
- dbgs() << "VFE safe vtables:\n";
- for (auto *VTable : VFESafeVTables)
- dbgs() << " " << VTable->getName() << "\n";
- );
-}
-
-PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
- bool Changed = false;
-
- // The algorithm first computes the set L of global variables that are
- // trivially live. Then it walks the initialization of these variables to
- // compute the globals used to initialize them, which effectively builds a
- // directed graph where nodes are global variables, and an edge from A to B
- // means B is used to initialize A. Finally, it propagates the liveness
-  // information through the graph starting from the nodes in L. Nodes not
- // marked as alive are discarded.
-
- // Remove empty functions from the global ctors list.
- Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
-
- // Collect the set of members for each comdat.
- for (Function &F : M)
- if (Comdat *C = F.getComdat())
- ComdatMembers.insert(std::make_pair(C, &F));
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GV));
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GA));
-
- // Add dependencies between virtual call sites and the virtual functions they
- // might call, if we have that information.
- AddVirtualFunctionDependencies(M);
-
- // Loop over the module, adding globals which are obviously necessary.
- for (GlobalObject &GO : M.global_objects()) {
- Changed |= RemoveUnusedGlobalValue(GO);
- // Functions with external linkage are needed if they have a body.
- // Externally visible & appending globals are needed, if they have an
- // initializer.
- if (!GO.isDeclaration())
- if (!GO.isDiscardableIfUnused())
- MarkLive(GO);
-
- UpdateGVDependencies(GO);
- }
-
- // Compute direct dependencies of aliases.
- for (GlobalAlias &GA : M.aliases()) {
- Changed |= RemoveUnusedGlobalValue(GA);
- // Externally visible aliases are needed.
- if (!GA.isDiscardableIfUnused())
- MarkLive(GA);
-
- UpdateGVDependencies(GA);
- }
-
- // Compute direct dependencies of ifuncs.
- for (GlobalIFunc &GIF : M.ifuncs()) {
- Changed |= RemoveUnusedGlobalValue(GIF);
- // Externally visible ifuncs are needed.
- if (!GIF.isDiscardableIfUnused())
- MarkLive(GIF);
-
- UpdateGVDependencies(GIF);
- }
-
- // Propagate liveness from collected Global Values through the computed
- // dependencies.
- SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
- AliveGlobals.end()};
- while (!NewLiveGVs.empty()) {
- GlobalValue *LGV = NewLiveGVs.pop_back_val();
- for (auto *GVD : GVDependencies[LGV])
- MarkLive(*GVD, &NewLiveGVs);
- }
-
- // Now that all globals which are needed are in the AliveGlobals set, we loop
- // through the program, deleting those which are not alive.
- //
-
- // The first pass is to drop initializers of global variables which are dead.
- std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals
- for (GlobalVariable &GV : M.globals())
- if (!AliveGlobals.count(&GV)) {
- DeadGlobalVars.push_back(&GV); // Keep track of dead globals
- if (GV.hasInitializer()) {
- Constant *Init = GV.getInitializer();
- GV.setInitializer(nullptr);
- if (isSafeToDestroyConstant(Init))
- Init->destroyConstant();
- }
- }
-
- // The second pass drops the bodies of functions which are dead...
- std::vector<Function *> DeadFunctions;
- for (Function &F : M)
- if (!AliveGlobals.count(&F)) {
-      DeadFunctions.push_back(&F); // Keep track of dead functions
- if (!F.isDeclaration())
- F.deleteBody();
- }
-
- // The third pass drops targets of aliases which are dead...
- std::vector<GlobalAlias*> DeadAliases;
- for (GlobalAlias &GA : M.aliases())
- if (!AliveGlobals.count(&GA)) {
- DeadAliases.push_back(&GA);
- GA.setAliasee(nullptr);
- }
-
- // The fourth pass drops targets of ifuncs which are dead...
- std::vector<GlobalIFunc*> DeadIFuncs;
- for (GlobalIFunc &GIF : M.ifuncs())
- if (!AliveGlobals.count(&GIF)) {
- DeadIFuncs.push_back(&GIF);
- GIF.setResolver(nullptr);
- }
-
- // Now that all interferences have been dropped, delete the actual objects
- // themselves.
- auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
- RemoveUnusedGlobalValue(*GV);
- GV->eraseFromParent();
- Changed = true;
- };
-
- NumFunctions += DeadFunctions.size();
- for (Function *F : DeadFunctions) {
- if (!F->use_empty()) {
- // Virtual functions might still be referenced by one or more vtables,
- // but if we've proven them to be unused then it's safe to replace the
- // virtual function pointers with null, allowing us to remove the
- // function itself.
- ++NumVFuncs;
- F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType()));
- }
- EraseUnusedGlobalValue(F);
- }
-
- NumVariables += DeadGlobalVars.size();
- for (GlobalVariable *GV : DeadGlobalVars)
- EraseUnusedGlobalValue(GV);
-
- NumAliases += DeadAliases.size();
- for (GlobalAlias *GA : DeadAliases)
- EraseUnusedGlobalValue(GA);
-
- NumIFuncs += DeadIFuncs.size();
- for (GlobalIFunc *GIF : DeadIFuncs)
- EraseUnusedGlobalValue(GIF);
-
- // Make sure that all memory is released
- AliveGlobals.clear();
- ConstantDependenciesCache.clear();
- GVDependencies.clear();
- ComdatMembers.clear();
- TypeIdMap.clear();
- VFESafeVTables.clear();
-
- if (Changed)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
-// GlobalValue, looking for the constant pointer ref that may be pointing to it.
-// If found, check to see if the constant pointer ref is safe to destroy, and if
-// so, nuke it. This will reduce the reference count on the global value, which
-// might make it deader.
-//
-bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
- if (GV.use_empty())
- return false;
- GV.removeDeadConstantUsers();
- return GV.use_empty();
-}
+//===-- GlobalDCE.cpp - DCE unreachable internal functions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transform is designed to eliminate unreachable internal globals from the
+// program. It uses an aggressive algorithm, searching out globals that are
+// known to be alive. After it finds all of the globals which are needed, it
+// deletes whatever is left over. This allows it to delete recursive chunks of
+// the program which are unreachable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globaldce"
+
+static cl::opt<bool>
+ ClEnableVFE("enable-vfe", cl::Hidden, cl::init(true), cl::ZeroOrMore,
+ cl::desc("Enable virtual function elimination"));
+
+STATISTIC(NumAliases , "Number of global aliases removed");
+STATISTIC(NumFunctions, "Number of functions removed");
+STATISTIC(NumIFuncs, "Number of indirect functions removed");
+STATISTIC(NumVariables, "Number of global variables removed");
+STATISTIC(NumVFuncs, "Number of virtual functions removed");
+
+namespace {
+ class GlobalDCELegacyPass : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ GlobalDCELegacyPass() : ModulePass(ID) {
+ initializeGlobalDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // run - Do the GlobalDCE pass on the specified module, optionally updating
+ // the specified callgraph to reflect the changes.
+ //
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // We need a minimally functional dummy module analysis manager. It needs
+ // to at least know about the possibility of proxying a function analysis
+ // manager.
+ FunctionAnalysisManager DummyFAM;
+ ModuleAnalysisManager DummyMAM;
+ DummyMAM.registerPass(
+ [&] { return FunctionAnalysisManagerModuleProxy(DummyFAM); });
+
+ auto PA = Impl.run(M, DummyMAM);
+ return !PA.areAllPreserved();
+ }
+
+ private:
+ GlobalDCEPass Impl;
+ };
+}
+
+char GlobalDCELegacyPass::ID = 0;
+INITIALIZE_PASS(GlobalDCELegacyPass, "globaldce",
+ "Dead Global Elimination", false, false)
+
+// Public interface to the GlobalDCEPass.
+ModulePass *llvm::createGlobalDCEPass() {
+ return new GlobalDCELegacyPass();
+}
+
+/// Returns true if F is effectively empty.
+static bool isEmptyFunction(Function *F) {
+ BasicBlock &Entry = F->getEntryBlock();
+ for (auto &I : Entry) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (auto *RI = dyn_cast<ReturnInst>(&I))
+ return !RI->getReturnValue();
+ break;
+ }
+ return false;
+}
+
+/// Compute the set of GlobalValues that depend on V.
+/// The recursion stops as soon as a GlobalValue is met.
+void GlobalDCEPass::ComputeDependencies(Value *V,
+ SmallPtrSetImpl<GlobalValue *> &Deps) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Function *Parent = I->getParent()->getParent();
+ Deps.insert(Parent);
+ } else if (auto *GV = dyn_cast<GlobalValue>(V)) {
+ Deps.insert(GV);
+ } else if (auto *CE = dyn_cast<Constant>(V)) {
+ // Avoid walking the whole tree of a big ConstantExprs multiple times.
+ auto Where = ConstantDependenciesCache.find(CE);
+ if (Where != ConstantDependenciesCache.end()) {
+ auto const &K = Where->second;
+ Deps.insert(K.begin(), K.end());
+ } else {
+ SmallPtrSetImpl<GlobalValue *> &LocalDeps = ConstantDependenciesCache[CE];
+ for (User *CEUser : CE->users())
+ ComputeDependencies(CEUser, LocalDeps);
+ Deps.insert(LocalDeps.begin(), LocalDeps.end());
+ }
+ }
+}
+
+void GlobalDCEPass::UpdateGVDependencies(GlobalValue &GV) {
+ SmallPtrSet<GlobalValue *, 8> Deps;
+ for (User *User : GV.users())
+ ComputeDependencies(User, Deps);
+ Deps.erase(&GV); // Remove self-reference.
+ for (GlobalValue *GVU : Deps) {
+ // If this is a dep from a vtable to a virtual function, and we have
+ // complete information about all virtual call sites which could call
+    // through this vtable, then skip it, because the call site information will
+ // be more precise.
+ if (VFESafeVTables.count(GVU) && isa<Function>(&GV)) {
+ LLVM_DEBUG(dbgs() << "Ignoring dep " << GVU->getName() << " -> "
+ << GV.getName() << "\n");
+ continue;
+ }
+ GVDependencies[GVU].insert(&GV);
+ }
+}
+
+/// Mark the GlobalValue as live.
+void GlobalDCEPass::MarkLive(GlobalValue &GV,
+ SmallVectorImpl<GlobalValue *> *Updates) {
+ auto const Ret = AliveGlobals.insert(&GV);
+ if (!Ret.second)
+ return;
+
+ if (Updates)
+ Updates->push_back(&GV);
+ if (Comdat *C = GV.getComdat()) {
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ MarkLive(*CM.second, Updates); // Recursion depth is only two because only
+ // globals in the same comdat are visited.
+ }
+ }
+}
+
+void GlobalDCEPass::ScanVTables(Module &M) {
+ SmallVector<MDNode *, 2> Types;
+ LLVM_DEBUG(dbgs() << "Building type info -> vtable map\n");
+
+ auto *LTOPostLinkMD =
+ cast_or_null<ConstantAsMetadata>(M.getModuleFlag("LTOPostLink"));
+ bool LTOPostLink =
+ LTOPostLinkMD &&
+ (cast<ConstantInt>(LTOPostLinkMD->getValue())->getZExtValue() != 0);
+
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (GV.isDeclaration() || Types.empty())
+ continue;
+
+ // Use the typeid metadata on the vtable to build a mapping from typeids to
+ // the list of (GV, offset) pairs which are the possible vtables for that
+ // typeid.
+ for (MDNode *Type : Types) {
+ Metadata *TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert(std::make_pair(&GV, Offset));
+ }
+
+ // If the type corresponding to the vtable is private to this translation
+ // unit, we know that we can see all virtual functions which might use it,
+ // so VFE is safe.
+ if (auto GO = dyn_cast<GlobalObject>(&GV)) {
+ GlobalObject::VCallVisibility TypeVis = GO->getVCallVisibility();
+ if (TypeVis == GlobalObject::VCallVisibilityTranslationUnit ||
+ (LTOPostLink &&
+ TypeVis == GlobalObject::VCallVisibilityLinkageUnit)) {
+ LLVM_DEBUG(dbgs() << GV.getName() << " is safe for VFE\n");
+ VFESafeVTables.insert(&GV);
+ }
+ }
+ }
+}
+
+void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId,
+ uint64_t CallOffset) {
+ for (auto &VTableInfo : TypeIdMap[TypeId]) {
+ GlobalVariable *VTable = VTableInfo.first;
+ uint64_t VTableOffset = VTableInfo.second;
+
+ Constant *Ptr =
+ getPointerAtOffset(VTable->getInitializer(), VTableOffset + CallOffset,
+ *Caller->getParent());
+ if (!Ptr) {
+ LLVM_DEBUG(dbgs() << "can't find pointer in vtable!\n");
+ VFESafeVTables.erase(VTable);
+ return;
+ }
+
+ auto Callee = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!Callee) {
+ LLVM_DEBUG(dbgs() << "vtable entry is not function pointer!\n");
+ VFESafeVTables.erase(VTable);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "vfunc dep " << Caller->getName() << " -> "
+ << Callee->getName() << "\n");
+ GVDependencies[Caller].insert(Callee);
+ }
+}
+
+void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) {
+ LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n");
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+
+ if (!TypeCheckedLoadFunc)
+ return;
+
+ for (auto U : TypeCheckedLoadFunc->users()) {
+ auto CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ auto *Offset = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ Value *TypeIdValue = CI->getArgOperand(2);
+ auto *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ if (Offset) {
+ ScanVTableLoad(CI->getFunction(), TypeId, Offset->getZExtValue());
+ } else {
+ // type.checked.load with a non-constant offset, so assume every entry in
+ // every matching vtable is used.
+ for (auto &VTableInfo : TypeIdMap[TypeId]) {
+ VFESafeVTables.erase(VTableInfo.first);
+ }
+ }
+ }
+}
+
+void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) {
+ if (!ClEnableVFE)
+ return;
+
+ // If the Virtual Function Elim module flag is present and set to zero, then
+ // the vcall_visibility metadata was inserted for another optimization (WPD)
+ // and we may not have type checked loads on all accesses to the vtable.
+ // Don't attempt VFE in that case.
+ auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
+ M.getModuleFlag("Virtual Function Elim"));
+ if (!Val || Val->getZExtValue() == 0)
+ return;
+
+ ScanVTables(M);
+
+ if (VFESafeVTables.empty())
+ return;
+
+ ScanTypeCheckedLoadIntrinsics(M);
+
+ LLVM_DEBUG(
+ dbgs() << "VFE safe vtables:\n";
+ for (auto *VTable : VFESafeVTables)
+ dbgs() << " " << VTable->getName() << "\n";
+ );
+}
+
+PreservedAnalyses GlobalDCEPass::run(Module &M, ModuleAnalysisManager &MAM) {
+ bool Changed = false;
+
+ // The algorithm first computes the set L of global variables that are
+ // trivially live. Then it walks the initialization of these variables to
+ // compute the globals used to initialize them, which effectively builds a
+ // directed graph where nodes are global variables, and an edge from A to B
+ // means B is used to initialize A. Finally, it propagates the liveness
+  // information through the graph starting from the nodes in L. Nodes not
+ // marked as alive are discarded.
+
+ // Remove empty functions from the global ctors list.
+ Changed |= optimizeGlobalCtorsList(M, isEmptyFunction);
+
+ // Collect the set of members for each comdat.
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+
+ // Add dependencies between virtual call sites and the virtual functions they
+ // might call, if we have that information.
+ AddVirtualFunctionDependencies(M);
+
+ // Loop over the module, adding globals which are obviously necessary.
+ for (GlobalObject &GO : M.global_objects()) {
+ Changed |= RemoveUnusedGlobalValue(GO);
+ // Functions with external linkage are needed if they have a body.
+ // Externally visible & appending globals are needed, if they have an
+ // initializer.
+ if (!GO.isDeclaration())
+ if (!GO.isDiscardableIfUnused())
+ MarkLive(GO);
+
+ UpdateGVDependencies(GO);
+ }
+
+ // Compute direct dependencies of aliases.
+ for (GlobalAlias &GA : M.aliases()) {
+ Changed |= RemoveUnusedGlobalValue(GA);
+ // Externally visible aliases are needed.
+ if (!GA.isDiscardableIfUnused())
+ MarkLive(GA);
+
+ UpdateGVDependencies(GA);
+ }
+
+ // Compute direct dependencies of ifuncs.
+ for (GlobalIFunc &GIF : M.ifuncs()) {
+ Changed |= RemoveUnusedGlobalValue(GIF);
+ // Externally visible ifuncs are needed.
+ if (!GIF.isDiscardableIfUnused())
+ MarkLive(GIF);
+
+ UpdateGVDependencies(GIF);
+ }
+
+ // Propagate liveness from collected Global Values through the computed
+ // dependencies.
+ SmallVector<GlobalValue *, 8> NewLiveGVs{AliveGlobals.begin(),
+ AliveGlobals.end()};
+ while (!NewLiveGVs.empty()) {
+ GlobalValue *LGV = NewLiveGVs.pop_back_val();
+ for (auto *GVD : GVDependencies[LGV])
+ MarkLive(*GVD, &NewLiveGVs);
+ }
+
+ // Now that all globals which are needed are in the AliveGlobals set, we loop
+ // through the program, deleting those which are not alive.
+ //
+
+ // The first pass is to drop initializers of global variables which are dead.
+ std::vector<GlobalVariable *> DeadGlobalVars; // Keep track of dead globals
+ for (GlobalVariable &GV : M.globals())
+ if (!AliveGlobals.count(&GV)) {
+ DeadGlobalVars.push_back(&GV); // Keep track of dead globals
+ if (GV.hasInitializer()) {
+ Constant *Init = GV.getInitializer();
+ GV.setInitializer(nullptr);
+ if (isSafeToDestroyConstant(Init))
+ Init->destroyConstant();
+ }
+ }
+
+ // The second pass drops the bodies of functions which are dead...
+ std::vector<Function *> DeadFunctions;
+ for (Function &F : M)
+ if (!AliveGlobals.count(&F)) {
+      DeadFunctions.push_back(&F); // Keep track of dead functions
+ if (!F.isDeclaration())
+ F.deleteBody();
+ }
+
+ // The third pass drops targets of aliases which are dead...
+ std::vector<GlobalAlias*> DeadAliases;
+ for (GlobalAlias &GA : M.aliases())
+ if (!AliveGlobals.count(&GA)) {
+ DeadAliases.push_back(&GA);
+ GA.setAliasee(nullptr);
+ }
+
+ // The fourth pass drops targets of ifuncs which are dead...
+ std::vector<GlobalIFunc*> DeadIFuncs;
+ for (GlobalIFunc &GIF : M.ifuncs())
+ if (!AliveGlobals.count(&GIF)) {
+ DeadIFuncs.push_back(&GIF);
+ GIF.setResolver(nullptr);
+ }
+
+ // Now that all interferences have been dropped, delete the actual objects
+ // themselves.
+ auto EraseUnusedGlobalValue = [&](GlobalValue *GV) {
+ RemoveUnusedGlobalValue(*GV);
+ GV->eraseFromParent();
+ Changed = true;
+ };
+
+ NumFunctions += DeadFunctions.size();
+ for (Function *F : DeadFunctions) {
+ if (!F->use_empty()) {
+ // Virtual functions might still be referenced by one or more vtables,
+ // but if we've proven them to be unused then it's safe to replace the
+ // virtual function pointers with null, allowing us to remove the
+ // function itself.
+ ++NumVFuncs;
+ F->replaceNonMetadataUsesWith(ConstantPointerNull::get(F->getType()));
+ }
+ EraseUnusedGlobalValue(F);
+ }
+
+ NumVariables += DeadGlobalVars.size();
+ for (GlobalVariable *GV : DeadGlobalVars)
+ EraseUnusedGlobalValue(GV);
+
+ NumAliases += DeadAliases.size();
+ for (GlobalAlias *GA : DeadAliases)
+ EraseUnusedGlobalValue(GA);
+
+ NumIFuncs += DeadIFuncs.size();
+ for (GlobalIFunc *GIF : DeadIFuncs)
+ EraseUnusedGlobalValue(GIF);
+
+ // Make sure that all memory is released
+ AliveGlobals.clear();
+ ConstantDependenciesCache.clear();
+ GVDependencies.clear();
+ ComdatMembers.clear();
+ TypeIdMap.clear();
+ VFESafeVTables.clear();
+
+ if (Changed)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+// RemoveUnusedGlobalValue - Loop over all of the uses of the specified
+// GlobalValue, looking for the constant pointer ref that may be pointing to it.
+// If found, check to see if the constant pointer ref is safe to destroy, and if
+// so, nuke it. This will reduce the reference count on the global value, which
+// might make it deader.
+//
+bool GlobalDCEPass::RemoveUnusedGlobalValue(GlobalValue &GV) {
+ if (GV.use_empty())
+ return false;
+ GV.removeDeadConstantUsers();
+ return GV.use_empty();
+}
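The legacy wrapper above already shows the only analysis-manager plumbing GlobalDCEPass needs: a module analysis manager that knows how to proxy a function analysis manager. The same setup works when driving the new-PM pass directly; a standalone sketch (the helper name is illustrative):

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/IPO/GlobalDCE.h"
using namespace llvm;

// Run GlobalDCEPass over M and report whether anything was eliminated.
static bool runGlobalDCESketch(Module &M) {
  FunctionAnalysisManager FAM;
  ModuleAnalysisManager MAM;
  // Mirror the proxy registration done by the legacy wrapper above.
  MAM.registerPass([&] { return FunctionAnalysisManagerModuleProxy(FAM); });
  PreservedAnalyses PA = GlobalDCEPass().run(M, MAM);
  return !PA.areAllPreserved();
}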
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
index ecc0634a9e..223a05e8ea 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1,3204 +1,3204 @@
-//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms simple global variables that never have their address
-// taken. If obviously true, it marks read/write globals as constant, deletes
-// variables only stored to, etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalOpt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "globalopt"
-
-STATISTIC(NumMarked , "Number of globals marked constant");
-STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr");
-STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
-STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
-STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
-STATISTIC(NumDeleted , "Number of globals deleted");
-STATISTIC(NumGlobUses , "Number of global uses devirtualized");
-STATISTIC(NumLocalized , "Number of globals localized");
-STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
-STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
-STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
-STATISTIC(NumNestRemoved , "Number of nest attributes removed");
-STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
-STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
-STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
-STATISTIC(NumInternalFunc, "Number of internal functions");
-STATISTIC(NumColdCC, "Number of functions marked coldcc");
-
-static cl::opt<bool>
- EnableColdCCStressTest("enable-coldcc-stress-test",
- cl::desc("Enable stress test of coldcc by adding "
- "calling conv to all internal functions."),
- cl::init(false), cl::Hidden);
-
-static cl::opt<int> ColdCCRelFreq(
- "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
- cl::desc(
- "Maximum block frequency, expressed as a percentage of caller's "
- "entry frequency, for a call site to be considered cold for enabling"
- "coldcc"));
-
-/// Is this global variable possibly used by a leak checker as a root? If so,
-/// we might not really want to eliminate the stores to it.
-static bool isLeakCheckerRoot(GlobalVariable *GV) {
- // A global variable is a root if it is a pointer, or could plausibly contain
- // a pointer. There are two challenges; one is that we could have a struct
-  // that has an inner member which is a pointer. We recurse through the type to
- // detect these (up to a point). The other is that we may actually be a union
- // of a pointer and another type, and so our LLVM type is an integer which
- // gets converted into a pointer, or our type is an [i8 x #] with a pointer
- // potentially contained here.
-
- if (GV->hasPrivateLinkage())
- return false;
-
- SmallVector<Type *, 4> Types;
- Types.push_back(GV->getValueType());
-
- unsigned Limit = 20;
- do {
- Type *Ty = Types.pop_back_val();
- switch (Ty->getTypeID()) {
- default: break;
- case Type::PointerTyID:
- return true;
- case Type::FixedVectorTyID:
- case Type::ScalableVectorTyID:
- if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
- return true;
- break;
- case Type::ArrayTyID:
- Types.push_back(cast<ArrayType>(Ty)->getElementType());
- break;
- case Type::StructTyID: {
- StructType *STy = cast<StructType>(Ty);
- if (STy->isOpaque()) return true;
- for (StructType::element_iterator I = STy->element_begin(),
- E = STy->element_end(); I != E; ++I) {
- Type *InnerTy = *I;
- if (isa<PointerType>(InnerTy)) return true;
- if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
- isa<VectorType>(InnerTy))
- Types.push_back(InnerTy);
- }
- break;
- }
- }
- if (--Limit == 0) return true;
- } while (!Types.empty());
- return false;
-}
-
-/// Given a value that is stored to a global but never read, determine whether
-/// it's safe to remove the store and the chain of computation that feeds the
-/// store.
-static bool IsSafeComputationToRemove(
- Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- do {
- if (isa<Constant>(V))
- return true;
- if (!V->hasOneUse())
- return false;
- if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
- isa<GlobalValue>(V))
- return false;
- if (isAllocationFn(V, GetTLI))
- return true;
-
- Instruction *I = cast<Instruction>(V);
- if (I->mayHaveSideEffects())
- return false;
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (!GEP->hasAllConstantIndices())
- return false;
- } else if (I->getNumOperands() != 1) {
- return false;
- }
-
- V = I->getOperand(0);
- } while (true);
-}
-
-/// This GV is a pointer root. Loop over all users of the global and clean up
-/// any that obviously don't assign the global a dynamically allocated value.
-static bool
-CleanupPointerRootUsers(GlobalVariable *GV,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- // A brief explanation of leak checkers. The goal is to find bugs where
- // pointers are forgotten, causing an accumulating growth in memory
- // usage over time. The common strategy for leak checkers is to explicitly
- // allow the memory pointed to by globals at exit. This is popular because it
- // also solves another problem where the main thread of a C++ program may shut
- // down before other threads that are still expecting to use those globals. To
- // handle that case, we expect the program may create a singleton and never
- // destroy it.
-
- bool Changed = false;
-
- // If Dead[n].first is the only use of a malloc result, we can delete its
- // chain of computation and the store to the global in Dead[n].second.
- SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
-
- // Constants can't be pointers to dynamically allocated memory.
- for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
- UI != E;) {
- User *U = *UI++;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- Value *V = SI->getValueOperand();
- if (isa<Constant>(V)) {
- Changed = true;
- SI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, SI));
- }
- } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
- if (isa<Constant>(MSI->getValue())) {
- Changed = true;
- MSI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, MSI));
- }
- } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
- GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
- if (MemSrc && MemSrc->isConstant()) {
- Changed = true;
- MTI->eraseFromParent();
- } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
- if (I->hasOneUse())
- Dead.push_back(std::make_pair(I, MTI));
- }
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- if (CE->use_empty()) {
- CE->destroyConstant();
- Changed = true;
- }
- } else if (Constant *C = dyn_cast<Constant>(U)) {
- if (isSafeToDestroyConstant(C)) {
- C->destroyConstant();
- // This could have invalidated UI, start over from scratch.
- Dead.clear();
- CleanupPointerRootUsers(GV, GetTLI);
- return true;
- }
- }
- }
-
- for (int i = 0, e = Dead.size(); i != e; ++i) {
- if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) {
- Dead[i].second->eraseFromParent();
- Instruction *I = Dead[i].first;
- do {
- if (isAllocationFn(I, GetTLI))
- break;
- Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
- if (!J)
- break;
- I->eraseFromParent();
- I = J;
- } while (true);
- I->eraseFromParent();
+//===- GlobalOpt.cpp - Optimize Global Variables --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms simple global variables that never have their address
+// taken. If obviously true, it marks read/write globals as constant, deletes
+// variables only stored to, etc.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/GlobalOpt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "globalopt"
+
+STATISTIC(NumMarked , "Number of globals marked constant");
+STATISTIC(NumUnnamed , "Number of globals marked unnamed_addr");
+STATISTIC(NumSRA , "Number of aggregate globals broken into scalars");
+STATISTIC(NumHeapSRA , "Number of heap objects SRA'd");
+STATISTIC(NumSubstitute,"Number of globals with initializers stored into them");
+STATISTIC(NumDeleted , "Number of globals deleted");
+STATISTIC(NumGlobUses , "Number of global uses devirtualized");
+STATISTIC(NumLocalized , "Number of globals localized");
+STATISTIC(NumShrunkToBool , "Number of global vars shrunk to booleans");
+STATISTIC(NumFastCallFns , "Number of functions converted to fastcc");
+STATISTIC(NumCtorsEvaluated, "Number of static ctors evaluated");
+STATISTIC(NumNestRemoved , "Number of nest attributes removed");
+STATISTIC(NumAliasesResolved, "Number of global aliases resolved");
+STATISTIC(NumAliasesRemoved, "Number of global aliases eliminated");
+STATISTIC(NumCXXDtorsRemoved, "Number of global C++ destructors removed");
+STATISTIC(NumInternalFunc, "Number of internal functions");
+STATISTIC(NumColdCC, "Number of functions marked coldcc");
+
+static cl::opt<bool>
+ EnableColdCCStressTest("enable-coldcc-stress-test",
+ cl::desc("Enable stress test of coldcc by adding "
+ "calling conv to all internal functions."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<int> ColdCCRelFreq(
+ "coldcc-rel-freq", cl::Hidden, cl::init(2), cl::ZeroOrMore,
+ cl::desc(
+ "Maximum block frequency, expressed as a percentage of caller's "
+ "entry frequency, for a call site to be considered cold for enabling"
+ "coldcc"));
+
+/// Is this global variable possibly used by a leak checker as a root? If so,
+/// we might not really want to eliminate the stores to it.
+static bool isLeakCheckerRoot(GlobalVariable *GV) {
+ // A global variable is a root if it is a pointer, or could plausibly contain
+ // a pointer. There are two challenges; one is that we could have a struct
+ // that has an inner member which is a pointer. We recurse through the type to
+ // detect these (up to a point). The other is that we may actually be a union
+ // of a pointer and another type, and so our LLVM type is an integer which
+ // gets converted into a pointer, or our type is an [i8 x #] with a pointer
+ // potentially contained here.
+
+ if (GV->hasPrivateLinkage())
+ return false;
+
+ SmallVector<Type *, 4> Types;
+ Types.push_back(GV->getValueType());
+
+ unsigned Limit = 20;
+ do {
+ Type *Ty = Types.pop_back_val();
+ switch (Ty->getTypeID()) {
+ default: break;
+ case Type::PointerTyID:
+ return true;
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
+ if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
+ return true;
+ break;
+ case Type::ArrayTyID:
+ Types.push_back(cast<ArrayType>(Ty)->getElementType());
+ break;
+ case Type::StructTyID: {
+ StructType *STy = cast<StructType>(Ty);
+ if (STy->isOpaque()) return true;
+ for (StructType::element_iterator I = STy->element_begin(),
+ E = STy->element_end(); I != E; ++I) {
+ Type *InnerTy = *I;
+ if (isa<PointerType>(InnerTy)) return true;
+ if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
+ isa<VectorType>(InnerTy))
+ Types.push_back(InnerTy);
+ }
+ break;
+ }
+ }
+ if (--Limit == 0) return true;
+ } while (!Types.empty());
+ return false;
+}
+
+/// Given a value that is stored to a global but never read, determine whether
+/// it's safe to remove the store and the chain of computation that feeds the
+/// store.
+static bool IsSafeComputationToRemove(
+ Value *V, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ do {
+ if (isa<Constant>(V))
+ return true;
+ if (!V->hasOneUse())
+ return false;
+ if (isa<LoadInst>(V) || isa<InvokeInst>(V) || isa<Argument>(V) ||
+ isa<GlobalValue>(V))
+ return false;
+ if (isAllocationFn(V, GetTLI))
+ return true;
+
+ Instruction *I = cast<Instruction>(V);
+ if (I->mayHaveSideEffects())
+ return false;
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllConstantIndices())
+ return false;
+ } else if (I->getNumOperands() != 1) {
+ return false;
+ }
+
+ V = I->getOperand(0);
+ } while (true);
+}
+
+/// This GV is a pointer root. Loop over all users of the global and clean up
+/// any that obviously don't assign the global a dynamically allocated value.
+static bool
+CleanupPointerRootUsers(GlobalVariable *GV,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ // A brief explanation of leak checkers. The goal is to find bugs where
+ // pointers are forgotten, causing an accumulating growth in memory
+ // usage over time. The common strategy for leak checkers is to explicitly
+ // allow the memory pointed to by globals at exit. This is popular because it
+ // also solves another problem where the main thread of a C++ program may shut
+ // down before other threads that are still expecting to use those globals. To
+ // handle that case, we expect the program may create a singleton and never
+ // destroy it.
+
+ bool Changed = false;
+
+ // If Dead[n].first is the only use of a malloc result, we can delete its
+ // chain of computation and the store to the global in Dead[n].second.
+ SmallVector<std::pair<Instruction *, Instruction *>, 32> Dead;
+
+ // Constants can't be pointers to dynamically allocated memory.
+ for (Value::user_iterator UI = GV->user_begin(), E = GV->user_end();
+ UI != E;) {
+ User *U = *UI++;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Value *V = SI->getValueOperand();
+ if (isa<Constant>(V)) {
+ Changed = true;
+ SI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, SI));
+ }
+ } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(U)) {
+ if (isa<Constant>(MSI->getValue())) {
+ Changed = true;
+ MSI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MSI->getValue())) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MSI));
+ }
+ } else if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U)) {
+ GlobalVariable *MemSrc = dyn_cast<GlobalVariable>(MTI->getSource());
+ if (MemSrc && MemSrc->isConstant()) {
+ Changed = true;
+ MTI->eraseFromParent();
+ } else if (Instruction *I = dyn_cast<Instruction>(MemSrc)) {
+ if (I->hasOneUse())
+ Dead.push_back(std::make_pair(I, MTI));
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ // This could have invalidated UI, start over from scratch.
+ Dead.clear();
+ CleanupPointerRootUsers(GV, GetTLI);
+ return true;
+ }
+ }
+ }
+
+ for (int i = 0, e = Dead.size(); i != e; ++i) {
+ if (IsSafeComputationToRemove(Dead[i].first, GetTLI)) {
+ Dead[i].second->eraseFromParent();
+ Instruction *I = Dead[i].first;
+ do {
+ if (isAllocationFn(I, GetTLI))
+ break;
+ Instruction *J = dyn_cast<Instruction>(I->getOperand(0));
+ if (!J)
+ break;
+ I->eraseFromParent();
+ I = J;
+ } while (true);
+ I->eraseFromParent();
Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// We just marked GV constant. Loop over all users of the global, cleaning up
-/// the obvious ones. This is largely just a quick scan over the use list to
-/// clean up the easy and obvious cruft. This returns true if it made a change.
-static bool CleanupConstantGlobalUsers(
- Value *V, Constant *Init, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
- // Note that we need to use a weak value handle for the worklist items. When
- // we delete a constant array, we may also be holding a pointer to one of its
- // elements (or an element of one of its elements if we're dealing with an
- // array of arrays) in the worklist.
+ }
+ }
+
+ return Changed;
+}
+
+/// We just marked GV constant. Loop over all users of the global, cleaning up
+/// the obvious ones. This is largely just a quick scan over the use list to
+/// clean up the easy and obvious cruft. This returns true if it made a change.
+static bool CleanupConstantGlobalUsers(
+ Value *V, Constant *Init, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+ // Note that we need to use a weak value handle for the worklist items. When
+ // we delete a constant array, we may also be holding a pointer to one of its
+ // elements (or an element of one of its elements if we're dealing with an
+ // array of arrays) in the worklist.
SmallVector<WeakTrackingVH, 8> WorkList(V->users());
- while (!WorkList.empty()) {
- Value *UV = WorkList.pop_back_val();
- if (!UV)
- continue;
-
- User *U = cast<User>(UV);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (Init) {
- // Replace the load with the initializer.
- LI->replaceAllUsesWith(Init);
- LI->eraseFromParent();
- Changed = true;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // Store must be unreachable or storing Init into the global.
- SI->eraseFromParent();
- Changed = true;
- } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- Constant *SubInit = nullptr;
- if (Init)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
- Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
- } else if ((CE->getOpcode() == Instruction::BitCast &&
- CE->getType()->isPointerTy()) ||
- CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Pointer cast, delete any stores and memsets to the global.
- Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
- }
-
- if (CE->use_empty()) {
- CE->destroyConstant();
- Changed = true;
- }
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
- // Do not transform "gepinst (gep constexpr (GV))" here, because forming
- // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
- // and will invalidate our notion of what Init is.
- Constant *SubInit = nullptr;
- if (!isa<ConstantExpr>(GEP->getOperand(0))) {
- ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
- ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
- if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
- SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
-
- // If the initializer is an all-null value and we have an inbounds GEP,
- // we already know what the result of any load from that GEP is.
- // TODO: Handle splats.
- if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
- SubInit = Constant::getNullValue(GEP->getResultElementType());
- }
- Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);
-
- if (GEP->use_empty()) {
- GEP->eraseFromParent();
- Changed = true;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
- if (MI->getRawDest() == V) {
- MI->eraseFromParent();
- Changed = true;
- }
-
- } else if (Constant *C = dyn_cast<Constant>(U)) {
- // If we have a chain of dead constantexprs or other things dangling from
- // us, and if they are all dead, nuke them without remorse.
- if (isSafeToDestroyConstant(C)) {
- C->destroyConstant();
- CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
- return true;
- }
- }
- }
- return Changed;
-}
-
-static bool isSafeSROAElementUse(Value *V);
-
-/// Return true if the specified GEP is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAGEP(User *U) {
- // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
- // don't like < 3 operand CE's, and we don't like non-constant integer
- // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
- // value of C.
- if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
- !cast<Constant>(U->getOperand(1))->isNullValue())
- return false;
-
- gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
- ++GEPI; // Skip over the pointer index.
-
- // For all other levels we require that the indices are constant and in range.
- // In particular, consider: A[0][i]. We cannot know that the user isn't doing
- // invalid things like allowing i to index an out-of-range subscript that
- // accesses A[1]. This can also happen between different members of a struct
- // in llvm IR.
- for (; GEPI != E; ++GEPI) {
- if (GEPI.isStruct())
- continue;
-
- ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
- if (!IdxVal || (GEPI.isBoundedSequential() &&
- IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
- return false;
- }
-
- return llvm::all_of(U->users(),
- [](User *UU) { return isSafeSROAElementUse(UU); });
-}
-
-/// Return true if the specified instruction is a safe user of a derived
-/// expression from a global that we want to SROA.
-static bool isSafeSROAElementUse(Value *V) {
- // We might have a dead and dangling constant hanging off of here.
- if (Constant *C = dyn_cast<Constant>(V))
- return isSafeToDestroyConstant(C);
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // Loads are ok.
- if (isa<LoadInst>(I)) return true;
-
- // Stores *to* the pointer are ok.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getOperand(0) != V;
-
- // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
- return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
-}
-
-/// Look at all uses of the global and decide whether it is safe for us to
-/// perform this transformation.
-static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
- for (User *U : GV->users()) {
- // The user of the global must be a GEP Inst or a ConstantExpr GEP.
- if (!isa<GetElementPtrInst>(U) &&
- (!isa<ConstantExpr>(U) ||
- cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
- return false;
-
- // Check that the GEP and its users are safe to SRA.
- if (!isSafeSROAGEP(U))
- return false;
- }
-
- return true;
-}
-
-static bool IsSRASequential(Type *T) {
- return isa<ArrayType>(T) || isa<VectorType>(T);
-}
-static uint64_t GetSRASequentialNumElements(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getNumElements();
- return cast<FixedVectorType>(T)->getNumElements();
-}
-static Type *GetSRASequentialElementType(Type *T) {
- if (ArrayType *AT = dyn_cast<ArrayType>(T))
- return AT->getElementType();
- return cast<VectorType>(T)->getElementType();
-}
-static bool CanDoGlobalSRA(GlobalVariable *GV) {
- Constant *Init = GV->getInitializer();
-
- if (isa<StructType>(Init->getType())) {
- // nothing to check
- } else if (IsSRASequential(Init->getType())) {
- if (GetSRASequentialNumElements(Init->getType()) > 16 &&
- GV->hasNUsesOrMore(16))
- return false; // It's not worth it.
- } else
- return false;
-
- return GlobalUsersSafeToSRA(GV);
-}
-
-/// Copy over the debug info for a variable to its SRA replacements.
-static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
- uint64_t FragmentOffsetInBits,
- uint64_t FragmentSizeInBits,
- uint64_t VarSize) {
- SmallVector<DIGlobalVariableExpression *, 1> GVs;
- GV->getDebugInfo(GVs);
- for (auto *GVE : GVs) {
- DIVariable *Var = GVE->getVariable();
- DIExpression *Expr = GVE->getExpression();
- // If the FragmentSize is smaller than the variable,
- // emit a fragment expression.
- if (FragmentSizeInBits < VarSize) {
- if (auto E = DIExpression::createFragmentExpression(
- Expr, FragmentOffsetInBits, FragmentSizeInBits))
- Expr = *E;
- else
- return;
- }
- auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
- NGV->addDebugInfo(NGVE);
- }
-}
-
-/// Perform scalar replacement of aggregates on the specified global variable.
-/// This opens the door for other optimizations by exposing the behavior of the
-/// program in a more fine-grained way. We have determined that this
-/// transformation is safe already. We return the first global variable we
-/// insert so that the caller can reprocess it.
-static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
- // Make sure this global only has simple uses that we can SRA.
- if (!CanDoGlobalSRA(GV))
- return nullptr;
-
- assert(GV->hasLocalLinkage());
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- uint64_t VarSize = DL.getTypeSizeInBits(Ty);
-
- std::map<unsigned, GlobalVariable *> NewGlobals;
-
- // Get the alignment of the global, either explicit or target-specific.
- Align StartAlignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
-
- // Loop over all users and create replacement variables for used aggregate
- // elements.
- for (User *GEP : GV->users()) {
- assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
- Instruction::GetElementPtr) ||
- isa<GetElementPtrInst>(GEP)) &&
- "NonGEP CE's are not SRAable!");
-
- // Ignore operand 1, which has to be zero or else the program is quite
- // broken (undefined). Get operand 2, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (NewGlobals.count(ElementIdx) == 1)
- continue; // we've already created a replacement variable
- assert(NewGlobals.count(ElementIdx) == 0);
-
- Type *ElTy = nullptr;
- if (StructType *STy = dyn_cast<StructType>(Ty))
- ElTy = STy->getElementType(ElementIdx);
- else
- ElTy = GetSRASequentialElementType(Ty);
- assert(ElTy);
-
- Constant *In = Init->getAggregateElement(ElementIdx);
- assert(In && "Couldn't get element of initializer?");
-
- GlobalVariable *NGV = new GlobalVariable(
- ElTy, false, GlobalVariable::InternalLinkage, In,
- GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NGV->setExternallyInitialized(GV->isExternallyInitialized());
- NGV->copyAttributesFrom(GV);
- NewGlobals.insert(std::make_pair(ElementIdx, NGV));
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout &Layout = *DL.getStructLayout(STy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
- Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
- if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
- NGV->setAlignment(NewAlign);
-
- // Copy over the debug info for the variable.
- uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
- uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
- } else {
- uint64_t EltSize = DL.getTypeAllocSize(ElTy);
- Align EltAlign = DL.getABITypeAlign(ElTy);
- uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
-
- // Calculate the known alignment of the field. If the original aggregate
- // had 256 byte alignment for example, something might depend on that:
- // propagate info to each field.
- Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
- if (NewAlign > EltAlign)
- NGV->setAlignment(NewAlign);
- transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
- FragmentSizeInBits, VarSize);
- }
- }
-
- if (NewGlobals.empty())
- return nullptr;
-
- Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
- for (auto NewGlobalVar : NewGlobals)
- Globals.push_back(NewGlobalVar.second);
-
- LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
-
- Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
-
- // Loop over all of the uses of the global, replacing the constantexpr geps,
- // with smaller constantexpr geps or direct references.
- while (!GV->use_empty()) {
- User *GEP = GV->user_back();
- assert(((isa<ConstantExpr>(GEP) &&
- cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
- isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
-
- // Ignore operand 1, which has to be zero or else the program is quite
- // broken (undefined). Get operand 2, which is the structure or array
- // index.
- unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- assert(NewGlobals.count(ElementIdx) == 1);
-
- Value *NewPtr = NewGlobals[ElementIdx];
- Type *NewTy = NewGlobals[ElementIdx]->getValueType();
-
- // Form a shorter GEP if needed.
- if (GEP->getNumOperands() > 3) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
- SmallVector<Constant*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
- Idxs.push_back(CE->getOperand(i));
- NewPtr =
- ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
- } else {
- GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
- SmallVector<Value*, 8> Idxs;
- Idxs.push_back(NullInt);
- for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
- Idxs.push_back(GEPI->getOperand(i));
- NewPtr = GetElementPtrInst::Create(
- NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
- GEPI);
- }
- }
- GEP->replaceAllUsesWith(NewPtr);
-
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
- GEPI->eraseFromParent();
- else
- cast<ConstantExpr>(GEP)->destroyConstant();
- }
-
- // Delete the old global, now that it is dead.
- Globals.erase(GV);
- ++NumSRA;
-
- assert(NewGlobals.size() > 0);
- return NewGlobals.begin()->second;
-}
-
-/// Return true if all users of the specified value will trap if the value is
-/// dynamically null. PHIs keeps track of any phi nodes we've seen to avoid
-/// reprocessing them.
-static bool AllUsesOfValueWillTrapIfNull(const Value *V,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users()) {
- if (const Instruction *I = dyn_cast<Instruction>(U)) {
- // If null pointer is considered valid, then all uses are non-trapping.
- // Non address-space 0 globals have already been pruned by the caller.
- if (NullPointerIsDefined(I->getFunction()))
- return false;
- }
- if (isa<LoadInst>(U)) {
- // Will trap.
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Storing the value.
- }
- } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
- if (CI->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Not calling the ptr
- }
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
- if (II->getCalledOperand() != V) {
- //cerr << "NONTRAPPING USE: " << *U;
- return false; // Not calling the ptr
- }
- } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
- if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
- } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
- } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
- // If we've already seen this phi node, ignore it, it has already been
- // checked.
- if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
- return false;
- } else {
- //cerr << "NONTRAPPING USE: " << *U;
- return false;
- }
- }
- return true;
-}
-
-/// Return true if all uses of any loads from GV will trap if the loaded value
-/// is null. Note that this also permits comparisons of the loaded value
-/// against null, as a special case.
-static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
- for (const User *U : GV->users())
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- SmallPtrSet<const PHINode*, 8> PHIs;
- if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
- return false;
- } else if (isa<StoreInst>(U)) {
- // Ignore stores to the global.
- } else {
- // We don't know or understand this user, bail out.
- //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
- return false;
- }
- return true;
-}
-
-static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
- bool Changed = false;
- for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
- Instruction *I = cast<Instruction>(*UI++);
- // Uses are non-trapping if null pointer is considered valid.
- // Non address-space 0 globals are already pruned by the caller.
- if (NullPointerIsDefined(I->getFunction()))
- return false;
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LI->setOperand(0, NewV);
- Changed = true;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getOperand(1) == V) {
- SI->setOperand(1, NewV);
- Changed = true;
- }
- } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- CallBase *CB = cast<CallBase>(I);
- if (CB->getCalledOperand() == V) {
- // Calling through the pointer! Turn into a direct call, but be careful
- // that the pointer is not also being passed as an argument.
- CB->setCalledOperand(NewV);
- Changed = true;
- bool PassedAsArg = false;
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
- if (CB->getArgOperand(i) == V) {
- PassedAsArg = true;
- CB->setArgOperand(i, NewV);
- }
-
- if (PassedAsArg) {
- // Being passed as an argument also. Be careful to not invalidate UI!
- UI = V->user_begin();
- }
- }
- } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Changed |= OptimizeAwayTrappingUsesOfValue(CI,
- ConstantExpr::getCast(CI->getOpcode(),
- NewV, CI->getType()));
- if (CI->use_empty()) {
- Changed = true;
- CI->eraseFromParent();
- }
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
- // Should handle GEP here.
- SmallVector<Constant*, 8> Idxs;
- Idxs.reserve(GEPI->getNumOperands()-1);
- for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
- i != e; ++i)
- if (Constant *C = dyn_cast<Constant>(*i))
- Idxs.push_back(C);
- else
- break;
- if (Idxs.size() == GEPI->getNumOperands()-1)
- Changed |= OptimizeAwayTrappingUsesOfValue(
- GEPI, ConstantExpr::getGetElementPtr(GEPI->getSourceElementType(),
- NewV, Idxs));
- if (GEPI->use_empty()) {
- Changed = true;
- GEPI->eraseFromParent();
- }
- }
- }
-
- return Changed;
-}
-
-/// The specified global has only one non-null value stored into it. If there
-/// are uses of the loaded value that would trap if the loaded value is
-/// dynamically null, then we know that they cannot be reachable with a null
-/// value, so we can optimize away the load.
-static bool OptimizeAwayTrappingUsesOfLoads(
- GlobalVariable *GV, Constant *LV, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
-
- // Keep track of whether we are able to remove all the uses of the global
- // other than the store that defines it.
- bool AllNonStoreUsesGone = true;
-
- // Replace all uses of loads with uses of uses of the stored value.
- for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
- User *GlobalUser = *GUI++;
- if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
- Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
- // If we were able to delete all uses of the loads
- if (LI->use_empty()) {
- LI->eraseFromParent();
- Changed = true;
- } else {
- AllNonStoreUsesGone = false;
- }
- } else if (isa<StoreInst>(GlobalUser)) {
- // Ignore the store that stores "LV" to the global.
- assert(GlobalUser->getOperand(1) == GV &&
- "Must be storing *to* the global");
- } else {
- AllNonStoreUsesGone = false;
-
- // If we get here we could have other crazy uses that are transitively
- // loaded.
- assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
- isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
- isa<BitCastInst>(GlobalUser) ||
- isa<GetElementPtrInst>(GlobalUser)) &&
- "Only expect load and stores!");
- }
- }
-
- if (Changed) {
- LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
- << "\n");
- ++NumGlobUses;
- }
-
- // If we nuked all of the loads, then none of the stores are needed either,
- // nor is the global.
- if (AllNonStoreUsesGone) {
- if (isLeakCheckerRoot(GV)) {
- Changed |= CleanupPointerRootUsers(GV, GetTLI);
- } else {
- Changed = true;
- CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
- }
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
- Changed = true;
- GV->eraseFromParent();
- ++NumDeleted;
- }
- }
- return Changed;
-}
-
-/// Walk the use list of V, constant folding all of the instructions that are
-/// foldable.
-static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
- if (Instruction *I = dyn_cast<Instruction>(*UI++))
- if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
- I->replaceAllUsesWith(NewC);
-
- // Advance UI to the next non-I use to avoid invalidating it!
- // Instructions could multiply use V.
- while (UI != E && *UI == I)
- ++UI;
- if (isInstructionTriviallyDead(I, TLI))
- I->eraseFromParent();
- }
-}
-
-/// This function takes the specified global variable, and transforms the
-/// program as if it always contained the result of the specified malloc.
-/// Because it is always the result of the specified malloc, there is no reason
-/// to actually DO the malloc. Instead, turn the malloc into a global, and
-/// rewrite any loads of GV as uses of the new global.
-static GlobalVariable *
-OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
- ConstantInt *NElements, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
- << '\n');
-
- Type *GlobalType;
- if (NElements->getZExtValue() == 1)
- GlobalType = AllocTy;
- else
- // If we have an array allocation, the global variable is of array type.
- GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
-
- // Create the new global variable. The contents of the malloc'd memory are
- // undefined, so initialize with an undef value.
- GlobalVariable *NewGV = new GlobalVariable(
- *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
- UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
- GV->getThreadLocalMode());
-
- // If there are bitcast users of the malloc (which is typical, usually we have
- // a malloc + bitcast) then replace them with uses of the new global. Update
- // other users to use the global as well.
- BitCastInst *TheBC = nullptr;
- while (!CI->use_empty()) {
- Instruction *User = cast<Instruction>(CI->user_back());
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
- if (BCI->getType() == NewGV->getType()) {
- BCI->replaceAllUsesWith(NewGV);
- BCI->eraseFromParent();
- } else {
- BCI->setOperand(0, NewGV);
- }
- } else {
- if (!TheBC)
- TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
- User->replaceUsesOfWith(CI, TheBC);
- }
- }
-
- Constant *RepValue = NewGV;
- if (NewGV->getType() != GV->getValueType())
- RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
-
- // If there is a comparison against null, we will insert a global bool to
- // keep track of whether the global was initialized yet or not.
- GlobalVariable *InitBool =
- new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
- GlobalValue::InternalLinkage,
- ConstantInt::getFalse(GV->getContext()),
- GV->getName()+".init", GV->getThreadLocalMode());
- bool InitBoolUsed = false;
-
- // Loop over all uses of GV, processing them in turn.
- while (!GV->use_empty()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
- // The global is initialized when the store to it occurs.
- new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
- Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
- SI->eraseFromParent();
- continue;
- }
-
- LoadInst *LI = cast<LoadInst>(GV->user_back());
- while (!LI->use_empty()) {
- Use &LoadUse = *LI->use_begin();
- ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
- if (!ICI) {
- LoadUse = RepValue;
- continue;
- }
-
- // Replace the cmp X, 0 with a use of the bool value.
- // Sink the load to where the compare was, if atomic rules allow us to.
- Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
- InitBool->getName() + ".val", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(),
- LI->isUnordered() ? (Instruction *)ICI : LI);
- InitBoolUsed = true;
- switch (ICI->getPredicate()) {
- default: llvm_unreachable("Unknown ICmp Predicate!");
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT: // X < null -> always false
- LV = ConstantInt::getFalse(GV->getContext());
- break;
- case ICmpInst::ICMP_ULE:
- case ICmpInst::ICMP_SLE:
- case ICmpInst::ICMP_EQ:
- LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
- break;
- case ICmpInst::ICMP_NE:
- case ICmpInst::ICMP_UGE:
- case ICmpInst::ICMP_SGE:
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT:
- break; // no change.
- }
- ICI->replaceAllUsesWith(LV);
- ICI->eraseFromParent();
- }
- LI->eraseFromParent();
- }
-
- // If the initialization boolean was used, insert it, otherwise delete it.
- if (!InitBoolUsed) {
- while (!InitBool->use_empty()) // Delete initializations
- cast<StoreInst>(InitBool->user_back())->eraseFromParent();
- delete InitBool;
- } else
- GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
-
- // Now that the GV is dead, nuke it and the malloc.
- GV->eraseFromParent();
- CI->eraseFromParent();
-
- // To further other optimizations, loop over all users of NewGV and try to
- // constant prop them. This will promote GEP instructions with constant
- // indices into GEP constant-exprs, which will allow global-opt to hack on it.
- ConstantPropUsersOf(NewGV, DL, TLI);
- if (RepValue != NewGV)
- ConstantPropUsersOf(RepValue, DL, TLI);
-
- return NewGV;
-}
-
-/// Scan the use-list of V checking to make sure that there are no complex uses
-/// of V. We permit simple things like dereferencing the pointer, but not
-/// storing through the address, unless it is to the specified global.
-static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
- const GlobalVariable *GV,
- SmallPtrSetImpl<const PHINode*> &PHIs) {
- for (const User *U : V->users()) {
- const Instruction *Inst = cast<Instruction>(U);
-
- if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
- continue; // Fine, ignore.
- }
-
- if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
- return false; // Storing the pointer itself... bad.
- continue; // Otherwise, storing through it, or storing into GV... fine.
- }
-
- // Must index into the array and into the struct.
- if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
- return false;
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
- // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
- // cycles.
- if (PHIs.insert(PN).second)
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
- return false;
- continue;
- }
-
- if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
- return false;
- continue;
- }
-
- return false;
- }
- return true;
-}
-
-/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
-/// allocation into loads from the global and uses of the resultant pointer.
- /// Further, delete the store into GV. This assumes that these values pass the
-/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
-static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
- GlobalVariable *GV) {
- while (!Alloc->use_empty()) {
- Instruction *U = cast<Instruction>(*Alloc->user_begin());
- Instruction *InsertPt = U;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // If this is the store of the allocation into the global, remove it.
- if (SI->getOperand(1) == GV) {
- SI->eraseFromParent();
- continue;
- }
- } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
- // Insert the load in the corresponding predecessor, not right before the
- // PHI.
- InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
- } else if (isa<BitCastInst>(U)) {
- // Must be bitcast between the malloc and store to initialize the global.
- ReplaceUsesOfMallocWithGlobal(U, GV);
- U->eraseFromParent();
- continue;
- } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- // If this is a "GEP bitcast" and the user is a store to the global, then
- // just process it as a bitcast.
- if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
- if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
- if (SI->getOperand(1) == GV) {
- // Must be bitcast GEP between the malloc and store to initialize
- // the global.
- ReplaceUsesOfMallocWithGlobal(GEPI, GV);
- GEPI->eraseFromParent();
- continue;
- }
- }
-
- // Insert a load from the global, and use it instead of the malloc.
- Value *NL =
- new LoadInst(GV->getValueType(), GV, GV->getName() + ".val", InsertPt);
- U->replaceUsesOfWith(Alloc, NL);
- }
-}
-
-/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
-/// perform heap SRA on. This permits GEP's that index through the array and
-/// struct field, icmps of null, and PHIs.
-static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
- SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
- SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
- // We permit two users of the load: setcc comparing against the null
- // pointer, and a getelementptr of a specific form.
- for (const User *U : V->users()) {
- const Instruction *UI = cast<Instruction>(U);
-
- // Comparison against null is ok.
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) {
- if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
- return false;
- continue;
- }
-
- // getelementptr is also ok, but only a simple form.
- if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
- // Must index into the array and into the struct.
- if (GEPI->getNumOperands() < 3)
- return false;
-
- // Otherwise the GEP is ok.
- continue;
- }
-
- if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
- if (!LoadUsingPHIsPerLoad.insert(PN).second)
- // This means some phi nodes are dependent on each other.
- // Avoid infinite looping!
- return false;
- if (!LoadUsingPHIs.insert(PN).second)
- // If we have already analyzed this PHI, then it is safe.
- continue;
-
- // Make sure all uses of the PHI are simple enough to transform.
- if (!LoadUsesSimpleEnoughForHeapSRA(PN,
- LoadUsingPHIs, LoadUsingPHIsPerLoad))
- return false;
-
- continue;
- }
-
- // Otherwise we don't know what this is, not ok.
- return false;
- }
-
- return true;
-}
-
-/// If all users of values loaded from GV are simple enough to perform HeapSRA,
-/// return true.
-static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
- Instruction *StoredVal) {
- SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
- SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
- for (const User *U : GV->users())
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
- LoadUsingPHIsPerLoad))
- return false;
- LoadUsingPHIsPerLoad.clear();
- }
-
- // If we reach here, we know that all uses of the loads and transitive uses
- // (through PHI nodes) are simple enough to transform. However, we don't know
- // that all inputs to the PHI nodes are in the same equivalence sets.
- // Check to verify that all operands of the PHIs are either PHIs that can be
- // transformed, loads from GV, or MI itself.
- for (const PHINode *PN : LoadUsingPHIs) {
- for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
- Value *InVal = PN->getIncomingValue(op);
-
- // PHI of the stored value itself is ok.
- if (InVal == StoredVal) continue;
-
- if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
- // One of the PHIs in our set is (optimistically) ok.
- if (LoadUsingPHIs.count(InPN))
- continue;
- return false;
- }
-
- // Load from GV is ok.
- if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
- if (LI->getOperand(0) == GV)
- continue;
-
- // UNDEF? NULL?
-
- // Anything else is rejected.
- return false;
- }
- }
-
- return true;
-}
-
-static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
- std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];
-
- if (FieldNo >= FieldVals.size())
- FieldVals.resize(FieldNo+1);
-
- // If we already have this value, just reuse the previously scalarized
- // version.
- if (Value *FieldVal = FieldVals[FieldNo])
- return FieldVal;
-
- // Depending on what instruction this is, we have several cases.
- Value *Result;
- if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
- // This is a scalarized version of the load from the global. Just create
- // a new Load of the scalarized global.
- Value *V = GetHeapSROAValue(LI->getOperand(0), FieldNo,
- InsertedScalarizedValues, PHIsToRewrite);
- Result = new LoadInst(V->getType()->getPointerElementType(), V,
- LI->getName() + ".f" + Twine(FieldNo), LI);
- } else {
- PHINode *PN = cast<PHINode>(V);
- // PN's type is pointer to struct. Make a new PHI of pointer to struct
- // field.
-
- PointerType *PTy = cast<PointerType>(PN->getType());
- StructType *ST = cast<StructType>(PTy->getElementType());
-
- unsigned AS = PTy->getAddressSpace();
- PHINode *NewPN =
- PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
- PN->getNumIncomingValues(),
- PN->getName()+".f"+Twine(FieldNo), PN);
- Result = NewPN;
- PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
- }
-
- return FieldVals[FieldNo] = Result;
-}
-
-/// Given a load instruction and a value derived from the load, rewrite the
-/// derived value to use the HeapSRoA'd load.
-static void RewriteHeapSROALoadUser(Instruction *LoadUser,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
- // If this is a comparison against null, handle it.
- if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
- assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
- // If we have a setcc of the loaded pointer, we can use a setcc of any
- // field.
- Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
- InsertedScalarizedValues, PHIsToRewrite);
-
- Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
- Constant::getNullValue(NPtr->getType()),
- SCI->getName());
- SCI->replaceAllUsesWith(New);
- SCI->eraseFromParent();
- return;
- }
-
- // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
- assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
- && "Unexpected GEPI!");
-
- // Load the pointer for this field.
- unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
- Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
- InsertedScalarizedValues, PHIsToRewrite);
-
- // Create the new GEP idx vector.
- SmallVector<Value*, 8> GEPIdx;
- GEPIdx.push_back(GEPI->getOperand(1));
- GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
-
- Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
- GEPI->getName(), GEPI);
- GEPI->replaceAllUsesWith(NGEPI);
- GEPI->eraseFromParent();
- return;
- }
-
- // Recursively transform the users of PHI nodes. This will lazily create the
- // PHIs that are needed for individual elements. Keep track of what PHIs we
- // see in InsertedScalarizedValues so that we don't get infinite loops (very
- // antisocial). If the PHI is already in InsertedScalarizedValues, it has
- // already been seen first by another load, so its uses have already been
- // processed.
- PHINode *PN = cast<PHINode>(LoadUser);
- if (!InsertedScalarizedValues.insert(std::make_pair(PN,
- std::vector<Value *>())).second)
- return;
-
- // If this is the first time we've seen this PHI, recursively process all
- // users.
- for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
- }
-}
-
-/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
-/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
-/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
-static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
- DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
- std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
- for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
- RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
- }
-
- if (Load->use_empty()) {
- Load->eraseFromParent();
- InsertedScalarizedValues.erase(Load);
- }
-}
-
-/// CI is an allocation of an array of structures. Break it up into multiple
-/// allocations of arrays of the fields.
-static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
- Value *NElems, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
- << '\n');
- Type *MAT = getMallocAllocatedType(CI, TLI);
- StructType *STy = cast<StructType>(MAT);
-
- // There is guaranteed to be at least one use of the malloc (storing
- // it into GV). If there are other uses, change them to be uses of
- // the global to simplify later code. This also deletes the store
- // into GV.
- ReplaceUsesOfMallocWithGlobal(CI, GV);
-
- // Okay, at this point, there are no users of the malloc. Insert N
- // new mallocs at the same place as CI, and N globals.
- std::vector<Value *> FieldGlobals;
- std::vector<Value *> FieldMallocs;
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- unsigned AS = GV->getType()->getPointerAddressSpace();
- for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
- Type *FieldTy = STy->getElementType(FieldNo);
- PointerType *PFieldTy = PointerType::get(FieldTy, AS);
-
- GlobalVariable *NGV = new GlobalVariable(
- *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
- Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
- nullptr, GV->getThreadLocalMode());
- NGV->copyAttributesFrom(GV);
- FieldGlobals.push_back(NGV);
-
- unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
- if (StructType *ST = dyn_cast<StructType>(FieldTy))
- TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
- Type *IntPtrTy = DL.getIntPtrType(CI->getType());
- Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
- ConstantInt::get(IntPtrTy, TypeSize),
- NElems, OpBundles, nullptr,
- CI->getName() + ".f" + Twine(FieldNo));
- FieldMallocs.push_back(NMI);
- new StoreInst(NMI, NGV, CI);
- }
-
- // The tricky aspect of this transformation is handling the case when malloc
- // fails. In the original code, malloc failing would set the result pointer
- // of malloc to null. In this case, some mallocs could succeed and others
- // could fail. As such, we emit code that looks like this:
- // F0 = malloc(field0)
- // F1 = malloc(field1)
- // F2 = malloc(field2)
- // if (F0 == 0 || F1 == 0 || F2 == 0) {
- // if (F0) { free(F0); F0 = 0; }
- // if (F1) { free(F1); F1 = 0; }
- // if (F2) { free(F2); F2 = 0; }
- // }
- // The malloc can also fail if its argument is too large.
- Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
- Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
- ConstantZero, "isneg");
- for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
- Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
- Constant::getNullValue(FieldMallocs[i]->getType()),
- "isnull");
- RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
- }
-
- // Split the basic block at the old malloc.
- BasicBlock *OrigBB = CI->getParent();
- BasicBlock *ContBB =
- OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
-
- // Create the block to check the first condition. Put all these blocks at the
- // end of the function as they are unlikely to be executed.
- BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
- "malloc_ret_null",
- OrigBB->getParent());
-
- // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
- // branch on RunningOr.
- OrigBB->getTerminator()->eraseFromParent();
- BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
-
- // Within the NullPtrBlock, we need to emit a comparison and branch for each
- // pointer, because some may be null while others are not.
- for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- Value *GVVal =
- new LoadInst(cast<GlobalVariable>(FieldGlobals[i])->getValueType(),
- FieldGlobals[i], "tmp", NullPtrBlock);
- Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
- Constant::getNullValue(GVVal->getType()));
- BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
- OrigBB->getParent());
- BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
- OrigBB->getParent());
- Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
- Cmp, NullPtrBlock);
-
- // Fill in FreeBlock.
- CallInst::CreateFree(GVVal, OpBundles, BI);
- new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
- FreeBlock);
- BranchInst::Create(NextBlock, FreeBlock);
-
- NullPtrBlock = NextBlock;
- }
-
- BranchInst::Create(ContBB, NullPtrBlock);
-
- // CI is no longer needed, remove it.
- CI->eraseFromParent();
-
- /// As we process loads, if we can't immediately update all uses of the load,
- /// keep track of what scalarized loads are inserted for a given load.
- DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
- InsertedScalarizedValues[GV] = FieldGlobals;
-
- std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;
-
- // Okay, the malloc site is completely handled. All of the uses of GV are now
- // loads, and all uses of those loads are simple. Rewrite them to use loads
- // of the per-field globals instead.
- for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
- Instruction *User = cast<Instruction>(*UI++);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
- RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
- continue;
- }
-
- // Must be a store of null.
- StoreInst *SI = cast<StoreInst>(User);
- assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
- "Unexpected heap-sra user!");
-
- // Insert a store of null into each global.
- for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
- Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
- Constant *Null = Constant::getNullValue(ValTy);
- new StoreInst(Null, FieldGlobals[i], SI);
- }
- // Erase the original store.
- SI->eraseFromParent();
- }
-
- // While we have PHIs that are interesting to rewrite, do it.
- while (!PHIsToRewrite.empty()) {
- PHINode *PN = PHIsToRewrite.back().first;
- unsigned FieldNo = PHIsToRewrite.back().second;
- PHIsToRewrite.pop_back();
- PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
- assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
-
- // Add all the incoming values. This can materialize more phis.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *InVal = PN->getIncomingValue(i);
- InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
- PHIsToRewrite);
- FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
- }
- }
-
- // Drop all inter-phi links and any loads that made it this far.
- for (DenseMap<Value *, std::vector<Value *>>::iterator
- I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
- I != E; ++I) {
- if (PHINode *PN = dyn_cast<PHINode>(I->first))
- PN->dropAllReferences();
- else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
- LI->dropAllReferences();
- }
-
- // Delete all the phis and loads now that inter-references are dead.
- for (DenseMap<Value *, std::vector<Value *>>::iterator
- I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
- I != E; ++I) {
- if (PHINode *PN = dyn_cast<PHINode>(I->first))
- PN->eraseFromParent();
- else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
- LI->eraseFromParent();
- }
-
- // The old global is now dead, remove it.
- GV->eraseFromParent();
-
- ++NumHeapSRA;
- return cast<GlobalVariable>(FieldGlobals[0]);
-}
-
-/// This function is called when we see a pointer global variable with a single
-/// value stored into it that is a malloc or a cast of a malloc.
-static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
- Type *AllocTy,
- AtomicOrdering Ordering,
- const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // If this is a malloc of an abstract type, don't touch it.
- if (!AllocTy->isSized())
- return false;
-
- // We can't optimize this global unless all uses of it are *known* to be
- // of the malloc value, not of the null initializer value (consider a use
- // that compares the global's value against zero to see if the malloc has
- // been reached). To do this, we check to see if all uses of the global
- // would trap if the global were null: this proves that they must all
- // happen after the malloc.
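- // For illustration only (hypothetical IR, simplified): a use such as
- //   %p = load i8*, i8** @g
- //   %c = icmp eq i8* %p, null   ; does not trap when %p is null
- // blocks this optimization, whereas
- //   %v = load i8, i8* %p        ; would trap when %p is null
- // is acceptable, since it proves the use happens after the malloc.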
- if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
- return false;
-
- // We can't optimize this if the malloc itself is used in a complex way,
- // for example, being stored into multiple globals. This allows the
- // malloc to be stored into the specified global, loaded, icmp'd, and
- // GEP'd. These are all uses we could transform to use the global
- // instead.
- SmallPtrSet<const PHINode*, 8> PHIs;
- if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
- return false;
-
- // If we have a global that is only initialized with a fixed size malloc,
- // transform the program to use global memory instead of malloc'd memory.
- // This eliminates dynamic allocation, avoids an indirection accessing the
- // data, and exposes the resultant global to further GlobalOpt.
- // We cannot optimize the malloc if we cannot determine malloc array size.
- Value *NElems = getMallocArraySize(CI, DL, TLI, true);
- if (!NElems)
- return false;
-
- if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
- // Restrict this transformation to only working on small allocations
- // (2048 bytes currently), as we don't want to introduce a 16M global or
- // something.
- if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
- OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
- return true;
- }
-
- // If the allocation is an array of structures, consider transforming this
- // into multiple malloc'd arrays, one for each field. This is basically
- // SRoA for malloc'd memory.
-
- if (Ordering != AtomicOrdering::NotAtomic)
- return false;
-
- // If this is an allocation of a fixed size array of structs, analyze as a
- // variable size array. malloc [100 x struct],1 -> malloc struct, 100
- if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
- if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
- AllocTy = AT->getElementType();
-
- StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
- if (!AllocSTy)
- return false;
-
- // If the structure has an unreasonable number of fields, leave it
- // alone.
- if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
- AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
-
- // If this is a fixed size array, transform the Malloc to be an alloc of
- // structs. malloc [100 x struct],1 -> malloc struct, 100
- if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
- Type *IntPtrTy = DL.getIntPtrType(CI->getType());
- unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
- Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
- Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- Instruction *Malloc =
- CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
- OpBundles, nullptr, CI->getName());
- Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
- CI->replaceAllUsesWith(Cast);
- CI->eraseFromParent();
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
- CI = cast<CallInst>(BCI->getOperand(0));
- else
- CI = cast<CallInst>(Malloc);
- }
-
- PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
- TLI);
- return true;
- }
-
- return false;
-}
-
-// Try to optimize globals based on the knowledge that only one value (besides
-// its initializer) is ever stored to the global.
-static bool
-optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
- AtomicOrdering Ordering, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- // Ignore no-op GEPs and bitcasts.
- StoredOnceVal = StoredOnceVal->stripPointerCasts();
-
- // If we are dealing with a pointer global that is initialized to null and
- // only has one (non-null) value stored into it, then we can optimize any
- // users of the loaded value (often calls and loads) that would trap if the
- // value was null.
- if (GV->getInitializer()->getType()->isPointerTy() &&
- GV->getInitializer()->isNullValue() &&
- !NullPointerIsDefined(
- nullptr /* F */,
- GV->getInitializer()->getType()->getPointerAddressSpace())) {
- if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
- if (GV->getInitializer()->getType() != SOVC->getType())
- SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
-
- // Optimize away any trapping uses of the loaded value.
- if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
- return true;
- } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
- auto *TLI = &GetTLI(*CI->getFunction());
- Type *MallocType = getMallocAllocatedType(CI, TLI);
- if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
- Ordering, DL, TLI))
- return true;
- }
- }
-
- return false;
-}
-
-/// At this point, we have learned that the only two values ever stored into GV
-/// are its initializer and OtherVal. See if we can shrink the global into a
-/// boolean and select between the two values whenever it is used. This exposes
-/// the values to other scalar optimizations.
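-/// For illustration only (hypothetical IR, simplified): a global such as
-///   @g = internal global i32 10
-/// that only ever has 10 (its initializer) or 20 stored into it can become
-///   @g.b = internal global i1 false
-/// with stores rewritten to store i1 false/true and loads rewritten to
-///   %b = load i1, i1* @g.b
-///   %v = select i1 %b, i32 20, i32 10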
-static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
- Type *GVElType = GV->getValueType();
-
- // If GVElType is already i1, it is already shrunk. If the type of the GV is
- // an FP value, pointer or vector, don't do this optimization because a select
- // between them is very expensive and unlikely to lead to later
- // simplification. In these cases, we typically end up with "cond ? v1 : v2"
- // where v1 and v2 both require constant pool loads, a big loss.
- if (GVElType == Type::getInt1Ty(GV->getContext()) ||
- GVElType->isFloatingPointTy() ||
- GVElType->isPointerTy() || GVElType->isVectorTy())
- return false;
-
- // Walk the use list of the global seeing if all the uses are load or store.
- // If there is anything else, bail out.
- for (User *U : GV->users())
- if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
- return false;
-
- LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
-
- // Create the new global, initializing it to false.
- GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
- false,
- GlobalValue::InternalLinkage,
- ConstantInt::getFalse(GV->getContext()),
- GV->getName()+".b",
- GV->getThreadLocalMode(),
- GV->getType()->getAddressSpace());
- NewGV->copyAttributesFrom(GV);
- GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
-
- Constant *InitVal = GV->getInitializer();
- assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
- "No reason to shrink to bool!");
-
- SmallVector<DIGlobalVariableExpression *, 1> GVs;
- GV->getDebugInfo(GVs);
-
- // If initialized to zero and storing one into the global, we can use a cast
- // instead of a select to synthesize the desired value.
- bool IsOneZero = false;
- bool EmitOneOrZero = true;
- auto *CI = dyn_cast<ConstantInt>(OtherVal);
- if (CI && CI->getValue().getActiveBits() <= 64) {
- IsOneZero = InitVal->isNullValue() && CI->isOne();
-
- auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer());
- if (CIInit && CIInit->getValue().getActiveBits() <= 64) {
- uint64_t ValInit = CIInit->getZExtValue();
- uint64_t ValOther = CI->getZExtValue();
- uint64_t ValMinus = ValOther - ValInit;
-
- for(auto *GVe : GVs){
- DIGlobalVariable *DGV = GVe->getVariable();
- DIExpression *E = GVe->getExpression();
- const DataLayout &DL = GV->getParent()->getDataLayout();
- unsigned SizeInOctets =
- DL.getTypeAllocSizeInBits(NewGV->getType()->getElementType()) / 8;
-
- // It is expected that the address of the optimized global variable is
- // on top of the stack. After the optimization, the value of that
- // variable will be either 0 (the initial value) or 1 (the other value).
- // The following expression returns a constant integer value that
- // depends on the value at the global object's address:
- // val * (ValOther - ValInit) + ValInit:
- // DW_OP_deref DW_OP_constu <ValMinus>
- // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
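- // For illustration only (hypothetical values): with ValInit = 10 and
- // ValOther = 25, ValMinus = 15, so the expression evaluates to
- // 0 * 15 + 10 = 10 when the bool is 0 and to 1 * 15 + 10 = 25 when it
- // is 1, recovering the original values for the debugger.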
- SmallVector<uint64_t, 12> Ops = {
- dwarf::DW_OP_deref_size, SizeInOctets,
- dwarf::DW_OP_constu, ValMinus,
- dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
- dwarf::DW_OP_plus};
- bool WithStackValue = true;
- E = DIExpression::prependOpcodes(E, Ops, WithStackValue);
- DIGlobalVariableExpression *DGVE =
- DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
- NewGV->addDebugInfo(DGVE);
- }
- EmitOneOrZero = false;
- }
- }
-
- if (EmitOneOrZero) {
- // FIXME: This will only emit the address for the debugger; the value
- // written there will only ever be 0 or 1.
- for(auto *GV : GVs)
- NewGV->addDebugInfo(GV);
- }
-
- while (!GV->use_empty()) {
- Instruction *UI = cast<Instruction>(GV->user_back());
- if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Change the store into a boolean store.
- bool StoringOther = SI->getOperand(0) == OtherVal;
- // Only do this if we weren't storing a loaded value.
- Value *StoreVal;
- if (StoringOther || SI->getOperand(0) == InitVal) {
- StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
- StoringOther);
- } else {
- // Otherwise, we are storing a previously loaded copy. To do this,
- // change the copy from copying the original value to just copying the
- // bool.
- Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
-
- // If we've already replaced the input, StoredVal will be a cast or
- // select instruction. If not, it will be a load of the original
- // global.
- if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
- assert(LI->getOperand(0) == GV && "Not a copy!");
- // Insert a new load, to preserve the saved value.
- StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(), LI);
- } else {
- assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
- "This is not a form that we understand!");
- StoreVal = StoredVal->getOperand(0);
- assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
- }
- }
- StoreInst *NSI =
- new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
- SI->getSyncScopeID(), SI);
- NSI->setDebugLoc(SI->getDebugLoc());
- } else {
- // Change the load into a load of bool then a select.
- LoadInst *LI = cast<LoadInst>(UI);
- LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, Align(1),
- LI->getOrdering(), LI->getSyncScopeID(), LI);
- Instruction *NSI;
- if (IsOneZero)
- NSI = new ZExtInst(NLI, LI->getType(), "", LI);
- else
- NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
- NSI->takeName(LI);
- // Since LI is split into two instructions, NLI and NSI both inherit the
- // same DebugLoc
- NLI->setDebugLoc(LI->getDebugLoc());
- NSI->setDebugLoc(LI->getDebugLoc());
- LI->replaceAllUsesWith(NSI);
- }
- UI->eraseFromParent();
- }
-
- // Retain the name of the old global variable. People who are debugging their
- // programs may expect these variables to be named the same.
- NewGV->takeName(GV);
- GV->eraseFromParent();
- return true;
-}
-
-static bool deleteIfDead(
- GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- GV.removeDeadConstantUsers();
-
- if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
- return false;
-
- if (const Comdat *C = GV.getComdat())
- if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
- return false;
-
- bool Dead;
- if (auto *F = dyn_cast<Function>(&GV))
- Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
- else
- Dead = GV.use_empty();
- if (!Dead)
- return false;
-
- LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
- GV.eraseFromParent();
- ++NumDeleted;
- return true;
-}
-
-static bool isPointerValueDeadOnEntryToFunction(
- const Function *F, GlobalValue *GV,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- // Find all uses of GV. We expect them all to be in F, and if we can't
- // identify any of the uses we bail out.
- //
- // On each of these uses, identify if the memory that GV points to is
- // used/required/live at the start of the function. If it is not, for example
- // if the first thing the function does is store to the GV, the GV can
- // possibly be demoted.
- //
- // We don't do an exhaustive search for memory operations - simply look
- // through bitcasts as they're quite common and benign.
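- // For illustration only (hypothetical IR, simplified): if the only uses of
- // @g inside F are
- //   store i32 0, i32* @g
- //   %v = load i32, i32* @g   ; dominated by the store above
- // then the value of @g at function entry is never observed, and @g is a
- // candidate for demotion into F.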
- const DataLayout &DL = GV->getParent()->getDataLayout();
- SmallVector<LoadInst *, 4> Loads;
- SmallVector<StoreInst *, 4> Stores;
- for (auto *U : GV->users()) {
- if (Operator::getOpcode(U) == Instruction::BitCast) {
- for (auto *UU : U->users()) {
- if (auto *LI = dyn_cast<LoadInst>(UU))
- Loads.push_back(LI);
- else if (auto *SI = dyn_cast<StoreInst>(UU))
- Stores.push_back(SI);
- else
- return false;
- }
- continue;
- }
-
- Instruction *I = dyn_cast<Instruction>(U);
- if (!I)
- return false;
- assert(I->getParent()->getParent() == F);
-
- if (auto *LI = dyn_cast<LoadInst>(I))
- Loads.push_back(LI);
- else if (auto *SI = dyn_cast<StoreInst>(I))
- Stores.push_back(SI);
- else
- return false;
- }
-
- // We have identified all uses of GV into loads and stores. Now check if all
- // of them are known not to depend on the value of the global at the function
- // entry point. We do this by ensuring that every load is dominated by at
- // least one store.
- auto &DT = LookupDomTree(*const_cast<Function *>(F));
-
- // The below check is quadratic. Check we're not going to do too many tests.
- // FIXME: Even though this will always have worst-case quadratic time, we
- // could put effort into minimizing the average time by putting stores that
- // have been shown to dominate at least one load at the beginning of the
- // Stores array, making subsequent dominance checks more likely to succeed
- // early.
- //
- // The threshold here is fairly large because global->local demotion is a
- // very powerful optimization should it fire.
- const unsigned Threshold = 100;
- if (Loads.size() * Stores.size() > Threshold)
- return false;
-
- for (auto *L : Loads) {
- auto *LTy = L->getType();
- if (none_of(Stores, [&](const StoreInst *S) {
- auto *STy = S->getValueOperand()->getType();
- // The load is only dominated by the store if DomTree says so
- // and the number of bits loaded in L is less than or equal to
- // the number of bits stored in S.
- return DT.dominates(S, L) &&
+ while (!WorkList.empty()) {
+ Value *UV = WorkList.pop_back_val();
+ if (!UV)
+ continue;
+
+ User *U = cast<User>(UV);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (Init) {
+ // Replace the load with the initializer.
+ LI->replaceAllUsesWith(Init);
+ LI->eraseFromParent();
+ Changed = true;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // Store must be unreachable or storing Init into the global.
+ SI->eraseFromParent();
+ Changed = true;
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ Constant *SubInit = nullptr;
+ if (Init)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+ Changed |= CleanupConstantGlobalUsers(CE, SubInit, DL, GetTLI);
+ } else if ((CE->getOpcode() == Instruction::BitCast &&
+ CE->getType()->isPointerTy()) ||
+ CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Pointer cast, delete any stores and memsets to the global.
+ Changed |= CleanupConstantGlobalUsers(CE, nullptr, DL, GetTLI);
+ }
+
+ if (CE->use_empty()) {
+ CE->destroyConstant();
+ Changed = true;
+ }
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ // Do not transform "gepinst (gep constexpr (GV))" here, because forming
+ // "gepconstexpr (gep constexpr (GV))" will cause the two gep's to fold
+ // and will invalidate our notion of what Init is.
+ Constant *SubInit = nullptr;
+ if (!isa<ConstantExpr>(GEP->getOperand(0))) {
+ ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(
+ ConstantFoldInstruction(GEP, DL, &GetTLI(*GEP->getFunction())));
+ if (Init && CE && CE->getOpcode() == Instruction::GetElementPtr)
+ SubInit = ConstantFoldLoadThroughGEPConstantExpr(Init, CE);
+
+ // If the initializer is an all-null value and we have an inbounds GEP,
+ // we already know what the result of any load from that GEP is.
+ // TODO: Handle splats.
+ if (Init && isa<ConstantAggregateZero>(Init) && GEP->isInBounds())
+ SubInit = Constant::getNullValue(GEP->getResultElementType());
+ }
+ Changed |= CleanupConstantGlobalUsers(GEP, SubInit, DL, GetTLI);
+
+ if (GEP->use_empty()) {
+ GEP->eraseFromParent();
+ Changed = true;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U)) { // memset/cpy/mv
+ if (MI->getRawDest() == V) {
+ MI->eraseFromParent();
+ Changed = true;
+ }
+
+ } else if (Constant *C = dyn_cast<Constant>(U)) {
+ // If we have a chain of dead constantexprs or other things dangling from
+ // us, and if they are all dead, nuke them without remorse.
+ if (isSafeToDestroyConstant(C)) {
+ C->destroyConstant();
+ CleanupConstantGlobalUsers(V, Init, DL, GetTLI);
+ return true;
+ }
+ }
+ }
+ return Changed;
+}
+
+static bool isSafeSROAElementUse(Value *V);
+
+/// Return true if the specified GEP is a safe user of a derived
+/// expression from a global that we want to SROA.
+static bool isSafeSROAGEP(User *U) {
+ // Check to see if this ConstantExpr GEP is SRA'able. In particular, we
+ // don't like < 3 operand CE's, and we don't like non-constant integer
+ // indices. This enforces that all uses are 'gep GV, 0, C, ...' for some
+ // value of C.
+ if (U->getNumOperands() < 3 || !isa<Constant>(U->getOperand(1)) ||
+ !cast<Constant>(U->getOperand(1))->isNullValue())
+ return false;
+
+ gep_type_iterator GEPI = gep_type_begin(U), E = gep_type_end(U);
+ ++GEPI; // Skip over the pointer index.
+
+ // For all other levels we require that the indices are constant and in range.
+ // In particular, consider: A[0][i]. We cannot know that the user isn't doing
+ // invalid things like allowing i to index an out-of-range subscript that
+ // accesses A[1]. This can also happen between different members of a struct
+ // in llvm IR.
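+ // For illustration only (hypothetical IR, simplified):
+ //   getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 2   ; accepted
+ //   getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i  ; rejected
+ // The second form is rejected because %i is not a constant in-range index.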
+ for (; GEPI != E; ++GEPI) {
+ if (GEPI.isStruct())
+ continue;
+
+ ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPI.getOperand());
+ if (!IdxVal || (GEPI.isBoundedSequential() &&
+ IdxVal->getZExtValue() >= GEPI.getSequentialNumElements()))
+ return false;
+ }
+
+ return llvm::all_of(U->users(),
+ [](User *UU) { return isSafeSROAElementUse(UU); });
+}
+
+/// Return true if the specified instruction is a safe user of a derived
+/// expression from a global that we want to SROA.
+static bool isSafeSROAElementUse(Value *V) {
+ // We might have a dead and dangling constant hanging off of here.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return isSafeToDestroyConstant(C);
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Loads are ok.
+ if (isa<LoadInst>(I)) return true;
+
+ // Stores *to* the pointer are ok.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getOperand(0) != V;
+
+ // Otherwise, it must be a GEP. Check it and its users are safe to SRA.
+ return isa<GetElementPtrInst>(I) && isSafeSROAGEP(I);
+}
+
+/// Look at all uses of the global and decide whether it is safe for us to
+/// perform this transformation.
+static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
+ for (User *U : GV->users()) {
+ // The user of the global must be a GEP Inst or a ConstantExpr GEP.
+ if (!isa<GetElementPtrInst>(U) &&
+ (!isa<ConstantExpr>(U) ||
+ cast<ConstantExpr>(U)->getOpcode() != Instruction::GetElementPtr))
+ return false;
+
+ // Check that the GEP and its users are safe to SRA.
+ if (!isSafeSROAGEP(U))
+ return false;
+ }
+
+ return true;
+}
+
+static bool IsSRASequential(Type *T) {
+ return isa<ArrayType>(T) || isa<VectorType>(T);
+}
+static uint64_t GetSRASequentialNumElements(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getNumElements();
+ return cast<FixedVectorType>(T)->getNumElements();
+}
+static Type *GetSRASequentialElementType(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getElementType();
+ return cast<VectorType>(T)->getElementType();
+}
+static bool CanDoGlobalSRA(GlobalVariable *GV) {
+ Constant *Init = GV->getInitializer();
+
+ if (isa<StructType>(Init->getType())) {
+ // nothing to check
+ } else if (IsSRASequential(Init->getType())) {
+ if (GetSRASequentialNumElements(Init->getType()) > 16 &&
+ GV->hasNUsesOrMore(16))
+ return false; // It's not worth it.
+ } else
+ return false;
+
+ return GlobalUsersSafeToSRA(GV);
+}
+
+/// Copy over the debug info for a variable to its SRA replacements.
+static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
+ uint64_t FragmentOffsetInBits,
+ uint64_t FragmentSizeInBits,
+ uint64_t VarSize) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ GV->getDebugInfo(GVs);
+ for (auto *GVE : GVs) {
+ DIVariable *Var = GVE->getVariable();
+ DIExpression *Expr = GVE->getExpression();
+ // If the FragmentSize is smaller than the variable,
+ // emit a fragment expression.
+ if (FragmentSizeInBits < VarSize) {
+ if (auto E = DIExpression::createFragmentExpression(
+ Expr, FragmentOffsetInBits, FragmentSizeInBits))
+ Expr = *E;
+ else
+ return;
+ }
+ auto *NGVE = DIGlobalVariableExpression::get(GVE->getContext(), Var, Expr);
+ NGV->addDebugInfo(NGVE);
+ }
+}
+
+/// Perform scalar replacement of aggregates on the specified global variable.
+/// This opens the door for other optimizations by exposing the behavior of the
+/// program in a more fine-grained way. We have determined that this
+/// transformation is safe already. We return the first global variable we
+/// insert so that the caller can reprocess it.
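+/// For illustration only (hypothetical IR, simplified):
+///   @g = internal global { i32, double } zeroinitializer
+/// whose only uses are GEPs of the form "gep @g, 0, C" can be split into
+///   @g.0 = internal global i32 0
+///   @g.1 = internal global double 0.0
+/// with each GEP rewritten to reference the matching new global.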
+static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
+ // Make sure this global only has simple uses that we can SRA.
+ if (!CanDoGlobalSRA(GV))
+ return nullptr;
+
+ assert(GV->hasLocalLinkage());
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ uint64_t VarSize = DL.getTypeSizeInBits(Ty);
+
+ std::map<unsigned, GlobalVariable *> NewGlobals;
+
+ // Get the alignment of the global, either explicit or target-specific.
+ Align StartAlignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
+
+ // Loop over all users and create replacement variables for used aggregate
+ // elements.
+ for (User *GEP : GV->users()) {
+ assert(((isa<ConstantExpr>(GEP) && cast<ConstantExpr>(GEP)->getOpcode() ==
+ Instruction::GetElementPtr) ||
+ isa<GetElementPtrInst>(GEP)) &&
+ "NonGEP CE's are not SRAable!");
+
+ // Ignore the first index operand (operand 1), which has to be zero or else
+ // the program is quite broken (undefined). Get the 2nd operand, which is the
+ // structure or array index.
+ unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (NewGlobals.count(ElementIdx) == 1)
+ continue; // We've already created a replacement variable.
+ assert(NewGlobals.count(ElementIdx) == 0);
+
+ Type *ElTy = nullptr;
+ if (StructType *STy = dyn_cast<StructType>(Ty))
+ ElTy = STy->getElementType(ElementIdx);
+ else
+ ElTy = GetSRASequentialElementType(Ty);
+ assert(ElTy);
+
+ Constant *In = Init->getAggregateElement(ElementIdx);
+ assert(In && "Couldn't get element of initializer?");
+
+ GlobalVariable *NGV = new GlobalVariable(
+ ElTy, false, GlobalVariable::InternalLinkage, In,
+ GV->getName() + "." + Twine(ElementIdx), GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NGV->setExternallyInitialized(GV->isExternallyInitialized());
+ NGV->copyAttributesFrom(GV);
+ NewGlobals.insert(std::make_pair(ElementIdx, NGV));
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout &Layout = *DL.getStructLayout(STy);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
+ Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
+ if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
+ NGV->setAlignment(NewAlign);
+
+ // Copy over the debug info for the variable.
+ uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
+ uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size, VarSize);
+ } else {
+ uint64_t EltSize = DL.getTypeAllocSize(ElTy);
+ Align EltAlign = DL.getABITypeAlign(ElTy);
+ uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
+
+ // Calculate the known alignment of the field. If the original aggregate
+ // had 256 byte alignment for example, something might depend on that:
+ // propagate info to each field.
+ Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
+ if (NewAlign > EltAlign)
+ NGV->setAlignment(NewAlign);
+ transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
+ FragmentSizeInBits, VarSize);
+ }
+ }
+
+ if (NewGlobals.empty())
+ return nullptr;
+
+ Module::GlobalListType &Globals = GV->getParent()->getGlobalList();
+ for (auto NewGlobalVar : NewGlobals)
+ Globals.push_back(NewGlobalVar.second);
+
+ LLVM_DEBUG(dbgs() << "PERFORMING GLOBAL SRA ON: " << *GV << "\n");
+
+ Constant *NullInt =Constant::getNullValue(Type::getInt32Ty(GV->getContext()));
+
+ // Loop over all of the uses of the global, replacing the constantexpr geps,
+ // with smaller constantexpr geps or direct references.
+ while (!GV->use_empty()) {
+ User *GEP = GV->user_back();
+ assert(((isa<ConstantExpr>(GEP) &&
+ cast<ConstantExpr>(GEP)->getOpcode()==Instruction::GetElementPtr)||
+ isa<GetElementPtrInst>(GEP)) && "NonGEP CE's are not SRAable!");
+
+ // Ignore the first index operand (operand 1), which has to be zero or else
+ // the program is quite broken (undefined). Get the 2nd operand, which is the
+ // structure or array index.
+ unsigned ElementIdx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ assert(NewGlobals.count(ElementIdx) == 1);
+
+ Value *NewPtr = NewGlobals[ElementIdx];
+ Type *NewTy = NewGlobals[ElementIdx]->getValueType();
+
+ // Form a shorter GEP if needed.
+ if (GEP->getNumOperands() > 3) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEP)) {
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = CE->getNumOperands(); i != e; ++i)
+ Idxs.push_back(CE->getOperand(i));
+ NewPtr =
+ ConstantExpr::getGetElementPtr(NewTy, cast<Constant>(NewPtr), Idxs);
+ } else {
+ GetElementPtrInst *GEPI = cast<GetElementPtrInst>(GEP);
+ SmallVector<Value*, 8> Idxs;
+ Idxs.push_back(NullInt);
+ for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i)
+ Idxs.push_back(GEPI->getOperand(i));
+ NewPtr = GetElementPtrInst::Create(
+ NewTy, NewPtr, Idxs, GEPI->getName() + "." + Twine(ElementIdx),
+ GEPI);
+ }
+ }
+ GEP->replaceAllUsesWith(NewPtr);
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(GEP))
+ GEPI->eraseFromParent();
+ else
+ cast<ConstantExpr>(GEP)->destroyConstant();
+ }
+
+ // Delete the old global, now that it is dead.
+ Globals.erase(GV);
+ ++NumSRA;
+
+ assert(NewGlobals.size() > 0);
+ return NewGlobals.begin()->second;
+}
+
+/// Return true if all users of the specified value will trap if the value is
+/// dynamically null. The PHIs set keeps track of any phi nodes we've seen, to
+/// avoid reprocessing them.
+static bool AllUsesOfValueWillTrapIfNull(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users()) {
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ // If null pointer is considered valid, then all uses are non-trapping.
+ // Non address-space 0 globals have already been pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ }
+ if (isa<LoadInst>(U)) {
+ // Will trap.
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Storing the value.
+ }
+ } else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
+ if (CI->getCalledOperand() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
+ if (II->getCalledOperand() != V) {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false; // Not calling the ptr
+ }
+ } else if (const BitCastInst *CI = dyn_cast<BitCastInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(CI, PHIs)) return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (!AllUsesOfValueWillTrapIfNull(GEPI, PHIs)) return false;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If we've already seen this phi node, ignore it, it has already been
+ // checked.
+ if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
+ return false;
+ } else {
+ //cerr << "NONTRAPPING USE: " << *U;
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Return true if all uses of any loads from GV will trap if the loaded value
+/// is null. Note that this also permits comparisons of the loaded value
+/// against null, as a special case.
+static bool AllUsesOfLoadedValueWillTrapIfNull(const GlobalVariable *GV) {
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!AllUsesOfValueWillTrapIfNull(LI, PHIs))
+ return false;
+ } else if (isa<StoreInst>(U)) {
+ // Ignore stores to the global.
+ } else {
+ // We don't know or understand this user, bail out.
+ //cerr << "UNKNOWN USER OF GLOBAL!: " << *U;
+ return false;
+ }
+ return true;
+}
+
+static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
+ bool Changed = false;
+ for (auto UI = V->user_begin(), E = V->user_end(); UI != E; ) {
+ Instruction *I = cast<Instruction>(*UI++);
+ // Uses are non-trapping if null pointer is considered valid.
+ // Non address-space 0 globals are already pruned by the caller.
+ if (NullPointerIsDefined(I->getFunction()))
+ return false;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setOperand(0, NewV);
+ Changed = true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) == V) {
+ SI->setOperand(1, NewV);
+ Changed = true;
+ }
+ } else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ CallBase *CB = cast<CallBase>(I);
+ if (CB->getCalledOperand() == V) {
+ // Calling through the pointer! Turn into a direct call, but be careful
+ // that the pointer is not also being passed as an argument.
+ CB->setCalledOperand(NewV);
+ Changed = true;
+ bool PassedAsArg = false;
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->getArgOperand(i) == V) {
+ PassedAsArg = true;
+ CB->setArgOperand(i, NewV);
+ }
+
+ if (PassedAsArg) {
+ // Being passed as an argument also. Be careful to not invalidate UI!
+ UI = V->user_begin();
+ }
+ }
+ } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(CI,
+ ConstantExpr::getCast(CI->getOpcode(),
+ NewV, CI->getType()));
+ if (CI->use_empty()) {
+ Changed = true;
+ CI->eraseFromParent();
+ }
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ // Should handle GEP here.
+ SmallVector<Constant*, 8> Idxs;
+ Idxs.reserve(GEPI->getNumOperands()-1);
+ for (User::op_iterator i = GEPI->op_begin() + 1, e = GEPI->op_end();
+ i != e; ++i)
+ if (Constant *C = dyn_cast<Constant>(*i))
+ Idxs.push_back(C);
+ else
+ break;
+ if (Idxs.size() == GEPI->getNumOperands()-1)
+ Changed |= OptimizeAwayTrappingUsesOfValue(
+ GEPI, ConstantExpr::getGetElementPtr(GEPI->getSourceElementType(),
+ NewV, Idxs));
+ if (GEPI->use_empty()) {
+ Changed = true;
+ GEPI->eraseFromParent();
+ }
+ }
+ }
+
+ return Changed;
+}
+
+/// The specified global has only one non-null value stored into it. If there
+/// are uses of the loaded value that would trap if the loaded value is
+/// dynamically null, then we know that they cannot be reached with a null
+/// value, so we can optimize away the load.
+static bool OptimizeAwayTrappingUsesOfLoads(
+ GlobalVariable *GV, Constant *LV, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+
+ // Keep track of whether we are able to remove all the uses of the global
+ // other than the store that defines it.
+ bool AllNonStoreUsesGone = true;
+
+ // Replace all uses of loads with uses of uses of the stored value.
+ for (Value::user_iterator GUI = GV->user_begin(), E = GV->user_end(); GUI != E;){
+ User *GlobalUser = *GUI++;
+ if (LoadInst *LI = dyn_cast<LoadInst>(GlobalUser)) {
+ Changed |= OptimizeAwayTrappingUsesOfValue(LI, LV);
+ // If we were able to delete all uses of the loads
+ if (LI->use_empty()) {
+ LI->eraseFromParent();
+ Changed = true;
+ } else {
+ AllNonStoreUsesGone = false;
+ }
+ } else if (isa<StoreInst>(GlobalUser)) {
+ // Ignore the store that stores "LV" to the global.
+ assert(GlobalUser->getOperand(1) == GV &&
+ "Must be storing *to* the global");
+ } else {
+ AllNonStoreUsesGone = false;
+
+ // If we get here we could have other crazy uses that are transitively
+ // loaded.
+ assert((isa<PHINode>(GlobalUser) || isa<SelectInst>(GlobalUser) ||
+ isa<ConstantExpr>(GlobalUser) || isa<CmpInst>(GlobalUser) ||
+ isa<BitCastInst>(GlobalUser) ||
+ isa<GetElementPtrInst>(GlobalUser)) &&
+ "Only expect load and stores!");
+ }
+ }
+
+ if (Changed) {
+ LLVM_DEBUG(dbgs() << "OPTIMIZED LOADS FROM STORED ONCE POINTER: " << *GV
+ << "\n");
+ ++NumGlobUses;
+ }
+
+ // If we nuked all of the loads, then none of the stores are needed either,
+ // nor is the global.
+ if (AllNonStoreUsesGone) {
+ if (isLeakCheckerRoot(GV)) {
+ Changed |= CleanupPointerRootUsers(GV, GetTLI);
+ } else {
+ Changed = true;
+ CleanupConstantGlobalUsers(GV, nullptr, DL, GetTLI);
+ }
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** GLOBAL NOW DEAD!\n");
+ Changed = true;
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ }
+ return Changed;
+}
+
+/// Walk the use list of V, constant folding all of the instructions that are
+/// foldable.
+static void ConstantPropUsersOf(Value *V, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ for (Value::user_iterator UI = V->user_begin(), E = V->user_end(); UI != E; )
+ if (Instruction *I = dyn_cast<Instruction>(*UI++))
+ if (Constant *NewC = ConstantFoldInstruction(I, DL, TLI)) {
+ I->replaceAllUsesWith(NewC);
+
+ // Advance UI to the next non-I use to avoid invalidating it!
+ // Instructions could multiply use V.
+ while (UI != E && *UI == I)
+ ++UI;
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ }
+}
+
+/// This function takes the specified global variable, and transforms the
+/// program as if it always contained the result of the specified malloc.
+/// Because it is always the result of the specified malloc, there is no reason
+/// to actually DO the malloc. Instead, turn the malloc into a global, and
+/// rewrite any loads of GV as uses of the new global.
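+/// For illustration only (hypothetical IR, simplified): for
+///   @g = internal global i32* null
+/// whose single stored value is the result of "malloc(4)", the allocation is
+/// replaced by
+///   @g.body = internal global i32 undef
+/// and, if @g is ever compared against null, an extra
+///   @g.init = internal global i1 false
+/// records whether the store (the "initialization") has happened yet.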
+static GlobalVariable *
+OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
+ ConstantInt *NElements, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(errs() << "PROMOTING GLOBAL: " << *GV << " CALL = " << *CI
+ << '\n');
+
+ Type *GlobalType;
+ if (NElements->getZExtValue() == 1)
+ GlobalType = AllocTy;
+ else
+ // If we have an array allocation, the global variable has array type.
+ GlobalType = ArrayType::get(AllocTy, NElements->getZExtValue());
+
+ // Create the new global variable. The contents of the malloc'd memory is
+ // undefined, so initialize with an undef value.
+ GlobalVariable *NewGV = new GlobalVariable(
+ *GV->getParent(), GlobalType, false, GlobalValue::InternalLinkage,
+ UndefValue::get(GlobalType), GV->getName() + ".body", nullptr,
+ GV->getThreadLocalMode());
+
+ // If there are bitcast users of the malloc (which is typical, usually we have
+ // a malloc + bitcast) then replace them with uses of the new global. Update
+ // other users to use the global as well.
+ BitCastInst *TheBC = nullptr;
+ while (!CI->use_empty()) {
+ Instruction *User = cast<Instruction>(CI->user_back());
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(User)) {
+ if (BCI->getType() == NewGV->getType()) {
+ BCI->replaceAllUsesWith(NewGV);
+ BCI->eraseFromParent();
+ } else {
+ BCI->setOperand(0, NewGV);
+ }
+ } else {
+ if (!TheBC)
+ TheBC = new BitCastInst(NewGV, CI->getType(), "newgv", CI);
+ User->replaceUsesOfWith(CI, TheBC);
+ }
+ }
+
+ Constant *RepValue = NewGV;
+ if (NewGV->getType() != GV->getValueType())
+ RepValue = ConstantExpr::getBitCast(RepValue, GV->getValueType());
+
+ // If there is a comparison against null, we will insert a global bool to
+ // keep track of whether the global was initialized yet or not.
+ GlobalVariable *InitBool =
+ new GlobalVariable(Type::getInt1Ty(GV->getContext()), false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".init", GV->getThreadLocalMode());
+ bool InitBoolUsed = false;
+
+ // Loop over all uses of GV, processing them in turn.
+ while (!GV->use_empty()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
+ // The global is initialized when the store to it occurs.
+ new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
+ Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
+ SI->eraseFromParent();
+ continue;
+ }
+
+ LoadInst *LI = cast<LoadInst>(GV->user_back());
+ while (!LI->use_empty()) {
+ Use &LoadUse = *LI->use_begin();
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LoadUse.getUser());
+ if (!ICI) {
+ LoadUse = RepValue;
+ continue;
+ }
+
+ // Replace the cmp X, 0 with a use of the bool value.
+ // Sink the load to where the compare was, if atomic rules allow us to.
+ Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
+ InitBool->getName() + ".val", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(),
+ LI->isUnordered() ? (Instruction *)ICI : LI);
+ InitBoolUsed = true;
+ switch (ICI->getPredicate()) {
+ default: llvm_unreachable("Unknown ICmp Predicate!");
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT: // X < null -> always false
+ LV = ConstantInt::getFalse(GV->getContext());
+ break;
+ case ICmpInst::ICMP_ULE:
+ case ICmpInst::ICMP_SLE:
+ case ICmpInst::ICMP_EQ:
+ LV = BinaryOperator::CreateNot(LV, "notinit", ICI);
+ break;
+ case ICmpInst::ICMP_NE:
+ case ICmpInst::ICMP_UGE:
+ case ICmpInst::ICMP_SGE:
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ break; // no change.
+ }
+ ICI->replaceAllUsesWith(LV);
+ ICI->eraseFromParent();
+ }
+ LI->eraseFromParent();
+ }
+
+ // If the initialization boolean was used, insert it, otherwise delete it.
+ if (!InitBoolUsed) {
+ while (!InitBool->use_empty()) // Delete initializations
+ cast<StoreInst>(InitBool->user_back())->eraseFromParent();
+ delete InitBool;
+ } else
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), InitBool);
+
+ // Now that the GV is dead, nuke it and the malloc.
+ GV->eraseFromParent();
+ CI->eraseFromParent();
+
+ // To further other optimizations, loop over all users of NewGV and try to
+ // constant prop them. This will promote GEP instructions with constant
+ // indices into GEP constant-exprs, which will allow global-opt to hack on it.
+ ConstantPropUsersOf(NewGV, DL, TLI);
+ if (RepValue != NewGV)
+ ConstantPropUsersOf(RepValue, DL, TLI);
+
+ return NewGV;
+}
+
+/// Scan the use-list of V checking to make sure that there are no complex uses
+/// of V. We permit simple things like dereferencing the pointer, but not
+/// storing through the address, unless it is to the specified global.
+static bool ValueIsOnlyUsedLocallyOrStoredToOneGlobal(const Instruction *V,
+ const GlobalVariable *GV,
+ SmallPtrSetImpl<const PHINode*> &PHIs) {
+ for (const User *U : V->users()) {
+ const Instruction *Inst = cast<Instruction>(U);
+
+ if (isa<LoadInst>(Inst) || isa<CmpInst>(Inst)) {
+ continue; // Fine, ignore.
+ }
+
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getOperand(0) == V && SI->getOperand(1) != GV)
+ return false; // Storing the pointer itself... bad.
+ continue; // Otherwise, storing through it, or storing into GV... fine.
+ }
+
+ // Must index into the array and into the struct.
+ if (isa<GetElementPtrInst>(Inst) && Inst->getNumOperands() >= 3) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(Inst, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(Inst)) {
+ // PHIs are ok if all uses are ok. Don't infinitely recurse through PHI
+ // cycles.
+ if (PHIs.insert(PN).second)
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(PN, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Inst)) {
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(BCI, GV, PHIs))
+ return false;
+ continue;
+ }
+
+ return false;
+ }
+ return true;
+}
+
+/// The Alloc pointer is stored into GV somewhere. Transform all uses of the
+/// allocation into loads from the global and uses of the resultant pointer.
+/// Further, delete the store into GV. This assumes that these values pass the
+/// 'ValueIsOnlyUsedLocallyOrStoredToOneGlobal' predicate.
+static void ReplaceUsesOfMallocWithGlobal(Instruction *Alloc,
+ GlobalVariable *GV) {
+ while (!Alloc->use_empty()) {
+ Instruction *U = cast<Instruction>(*Alloc->user_begin());
+ Instruction *InsertPt = U;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If this is the store of the allocation into the global, remove it.
+ if (SI->getOperand(1) == GV) {
+ SI->eraseFromParent();
+ continue;
+ }
+ } else if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // Insert the load in the corresponding predecessor, not right before the
+ // PHI.
+ InsertPt = PN->getIncomingBlock(*Alloc->use_begin())->getTerminator();
+ } else if (isa<BitCastInst>(U)) {
+ // Must be bitcast between the malloc and store to initialize the global.
+ ReplaceUsesOfMallocWithGlobal(U, GV);
+ U->eraseFromParent();
+ continue;
+ } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ // If this is a "GEP bitcast" and the user is a store to the global, then
+ // just process it as a bitcast.
+ if (GEPI->hasAllZeroIndices() && GEPI->hasOneUse())
+ if (StoreInst *SI = dyn_cast<StoreInst>(GEPI->user_back()))
+ if (SI->getOperand(1) == GV) {
+ // Must be bitcast GEP between the malloc and store to initialize
+ // the global.
+ ReplaceUsesOfMallocWithGlobal(GEPI, GV);
+ GEPI->eraseFromParent();
+ continue;
+ }
+ }
+
+ // Insert a load from the global, and use it instead of the malloc.
+ Value *NL =
+ new LoadInst(GV->getValueType(), GV, GV->getName() + ".val", InsertPt);
+ U->replaceUsesOfWith(Alloc, NL);
+ }
+}
+
+/// Verify that all uses of V (a load, or a phi of a load) are simple enough to
+/// perform heap SRA on. This permits GEP's that index through the array and
+/// struct field, icmps of null, and PHIs.
+static bool LoadUsesSimpleEnoughForHeapSRA(const Value *V,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIs,
+ SmallPtrSetImpl<const PHINode*> &LoadUsingPHIsPerLoad) {
+ // We permit two users of the load: setcc comparing against the null
+ // pointer, and a getelementptr of a specific form.
+ for (const User *U : V->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+
+ // Comparison against null is ok.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UI)) {
+ if (!isa<ConstantPointerNull>(ICI->getOperand(1)))
+ return false;
+ continue;
+ }
+
+ // getelementptr is also ok, but only a simple form.
+ if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(UI)) {
+ // Must index into the array and into the struct.
+ if (GEPI->getNumOperands() < 3)
+ return false;
+
+ // Otherwise the GEP is ok.
+ continue;
+ }
+
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ if (!LoadUsingPHIsPerLoad.insert(PN).second)
+ // This means some phi nodes are dependent on each other.
+ // Avoid infinite looping!
+ return false;
+ if (!LoadUsingPHIs.insert(PN).second)
+ // If we have already analyzed this PHI, then it is safe.
+ continue;
+
+ // Make sure all uses of the PHI are simple enough to transform.
+ if (!LoadUsesSimpleEnoughForHeapSRA(PN,
+ LoadUsingPHIs, LoadUsingPHIsPerLoad))
+ return false;
+
+ continue;
+ }
+
+ // Otherwise we don't know what this is, not ok.
+ return false;
+ }
+
+ return true;
+}
+
+/// If all users of values loaded from GV are simple enough to perform HeapSRA,
+/// return true.
+static bool AllGlobalLoadUsesSimpleEnoughForHeapSRA(const GlobalVariable *GV,
+ Instruction *StoredVal) {
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIs;
+ SmallPtrSet<const PHINode*, 32> LoadUsingPHIsPerLoad;
+ for (const User *U : GV->users())
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ if (!LoadUsesSimpleEnoughForHeapSRA(LI, LoadUsingPHIs,
+ LoadUsingPHIsPerLoad))
+ return false;
+ LoadUsingPHIsPerLoad.clear();
+ }
+
+ // If we reach here, we know that all uses of the loads and transitive uses
+ // (through PHI nodes) are simple enough to transform. However, we don't know
+ // that all inputs to the PHI nodes are in the same equivalence sets.
+ // Check to verify that all operands of the PHIs are either PHIs that can be
+ // transformed, loads from GV, or the stored value itself.
+ for (const PHINode *PN : LoadUsingPHIs) {
+ for (unsigned op = 0, e = PN->getNumIncomingValues(); op != e; ++op) {
+ Value *InVal = PN->getIncomingValue(op);
+
+ // PHI of the stored value itself is ok.
+ if (InVal == StoredVal) continue;
+
+ if (const PHINode *InPN = dyn_cast<PHINode>(InVal)) {
+ // One of the PHIs in our set is (optimistically) ok.
+ if (LoadUsingPHIs.count(InPN))
+ continue;
+ return false;
+ }
+
+ // Load from GV is ok.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(InVal))
+ if (LI->getOperand(0) == GV)
+ continue;
+
+ // UNDEF? NULL?
+
+ // Anything else is rejected.
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static Value *GetHeapSROAValue(Value *V, unsigned FieldNo,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
+ std::vector<Value *> &FieldVals = InsertedScalarizedValues[V];
+
+ if (FieldNo >= FieldVals.size())
+ FieldVals.resize(FieldNo+1);
+
+ // If we already have this value, just reuse the previously scalarized
+ // version.
+ if (Value *FieldVal = FieldVals[FieldNo])
+ return FieldVal;
+
+ // Depending on what instruction this is, we have several cases.
+ Value *Result;
+ if (LoadInst *LI = dyn_cast<LoadInst>(V)) {
+ // This is a scalarized version of the load from the global. Just create
+ // a new Load of the scalarized global.
+ Value *V = GetHeapSROAValue(LI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+ Result = new LoadInst(V->getType()->getPointerElementType(), V,
+ LI->getName() + ".f" + Twine(FieldNo), LI);
+ } else {
+ PHINode *PN = cast<PHINode>(V);
+ // PN's type is pointer to struct. Make a new PHI of pointer to struct
+ // field.
+
+ PointerType *PTy = cast<PointerType>(PN->getType());
+ StructType *ST = cast<StructType>(PTy->getElementType());
+
+ unsigned AS = PTy->getAddressSpace();
+ PHINode *NewPN =
+ PHINode::Create(PointerType::get(ST->getElementType(FieldNo), AS),
+ PN->getNumIncomingValues(),
+ PN->getName()+".f"+Twine(FieldNo), PN);
+ Result = NewPN;
+ PHIsToRewrite.push_back(std::make_pair(PN, FieldNo));
+ }
+
+ return FieldVals[FieldNo] = Result;
+}
+
+/// Given a load instruction and a value derived from the load, rewrite the
+/// derived value to use the HeapSRoA'd load.
+static void RewriteHeapSROALoadUser(Instruction *LoadUser,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned>> &PHIsToRewrite) {
+ // If this is a comparison against null, handle it.
+ if (ICmpInst *SCI = dyn_cast<ICmpInst>(LoadUser)) {
+ assert(isa<ConstantPointerNull>(SCI->getOperand(1)));
+ // If we have a setcc of the loaded pointer, we can use a setcc of any
+ // field.
+ Value *NPtr = GetHeapSROAValue(SCI->getOperand(0), 0,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ Value *New = new ICmpInst(SCI, SCI->getPredicate(), NPtr,
+ Constant::getNullValue(NPtr->getType()),
+ SCI->getName());
+ SCI->replaceAllUsesWith(New);
+ SCI->eraseFromParent();
+ return;
+ }
+
+ // Handle 'getelementptr Ptr, Idx, i32 FieldNo ...'
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(LoadUser)) {
+ assert(GEPI->getNumOperands() >= 3 && isa<ConstantInt>(GEPI->getOperand(2))
+ && "Unexpected GEPI!");
+
+ // Load the pointer for this field.
+ unsigned FieldNo = cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+ Value *NewPtr = GetHeapSROAValue(GEPI->getOperand(0), FieldNo,
+ InsertedScalarizedValues, PHIsToRewrite);
+
+ // Create the new GEP idx vector.
+ SmallVector<Value*, 8> GEPIdx;
+ GEPIdx.push_back(GEPI->getOperand(1));
+ GEPIdx.append(GEPI->op_begin()+3, GEPI->op_end());
+
+ Value *NGEPI = GetElementPtrInst::Create(GEPI->getResultElementType(), NewPtr, GEPIdx,
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NGEPI);
+ GEPI->eraseFromParent();
+ return;
+ }
+
+ // Recursively transform the users of PHI nodes. This will lazily create the
+ // PHIs that are needed for individual elements. Keep track of what PHIs we
+ // see in InsertedScalarizedValues so that we don't get infinite loops (very
+ // antisocial). If the PHI is already in InsertedScalarizedValues, it has
+ // already been seen first by another load, so its uses have already been
+ // processed.
+ PHINode *PN = cast<PHINode>(LoadUser);
+ if (!InsertedScalarizedValues.insert(std::make_pair(PN,
+ std::vector<Value *>())).second)
+ return;
+
+ // If this is the first time we've seen this PHI, recursively process all
+ // users.
+ for (auto UI = PN->user_begin(), E = PN->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+}
+
+/// We are performing Heap SRoA on a global. Ptr is a value loaded from the
+/// global. Eliminate all uses of Ptr, making them use FieldGlobals instead.
+/// All uses of loaded values satisfy AllGlobalLoadUsesSimpleEnoughForHeapSRA.
+static void RewriteUsesOfLoadForHeapSRoA(LoadInst *Load,
+ DenseMap<Value *, std::vector<Value *>> &InsertedScalarizedValues,
+ std::vector<std::pair<PHINode *, unsigned> > &PHIsToRewrite) {
+ for (auto UI = Load->user_begin(), E = Load->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+ RewriteHeapSROALoadUser(User, InsertedScalarizedValues, PHIsToRewrite);
+ }
+
+ if (Load->use_empty()) {
+ Load->eraseFromParent();
+ InsertedScalarizedValues.erase(Load);
+ }
+}
+
+/// CI is an allocation of an array of structures. Break it up into multiple
+/// allocations of arrays of the fields.
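+/// For illustration only (hypothetical IR, simplified): a malloc of
+///   [N x { i32, double }]
+/// stored into @g becomes two mallocs, one of N i32s and one of N doubles,
+/// whose results are stored into new pointer globals @g.f0 and @g.f1.
+/// Every "gep Ptr, I, 0" / "gep Ptr, I, 1" on the loaded pointer is then
+/// rewritten against the corresponding field allocation.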
+static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI,
+ Value *NElems, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(dbgs() << "SROA HEAP ALLOC: " << *GV << " MALLOC = " << *CI
+ << '\n');
+ Type *MAT = getMallocAllocatedType(CI, TLI);
+ StructType *STy = cast<StructType>(MAT);
+
+ // There is guaranteed to be at least one use of the malloc (storing
+ // it into GV). If there are other uses, change them to be uses of
+ // the global to simplify later code. This also deletes the store
+ // into GV.
+ ReplaceUsesOfMallocWithGlobal(CI, GV);
+
+ // Okay, at this point, there are no users of the malloc. Insert N
+ // new mallocs at the same place as CI, and N globals.
+ std::vector<Value *> FieldGlobals;
+ std::vector<Value *> FieldMallocs;
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ unsigned AS = GV->getType()->getPointerAddressSpace();
+ for (unsigned FieldNo = 0, e = STy->getNumElements(); FieldNo != e;++FieldNo){
+ Type *FieldTy = STy->getElementType(FieldNo);
+ PointerType *PFieldTy = PointerType::get(FieldTy, AS);
+
+ GlobalVariable *NGV = new GlobalVariable(
+ *GV->getParent(), PFieldTy, false, GlobalValue::InternalLinkage,
+ Constant::getNullValue(PFieldTy), GV->getName() + ".f" + Twine(FieldNo),
+ nullptr, GV->getThreadLocalMode());
+ NGV->copyAttributesFrom(GV);
+ FieldGlobals.push_back(NGV);
+
+ unsigned TypeSize = DL.getTypeAllocSize(FieldTy);
+ if (StructType *ST = dyn_cast<StructType>(FieldTy))
+ TypeSize = DL.getStructLayout(ST)->getSizeInBytes();
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy,
+ ConstantInt::get(IntPtrTy, TypeSize),
+ NElems, OpBundles, nullptr,
+ CI->getName() + ".f" + Twine(FieldNo));
+ FieldMallocs.push_back(NMI);
+ new StoreInst(NMI, NGV, CI);
+ }
+
+ // The tricky aspect of this transformation is handling the case when malloc
+ // fails. In the original code, malloc failing would set the result pointer
+ // of malloc to null. In this case, some mallocs could succeed and others
+ // could fail. As such, we emit code that looks like this:
+ // F0 = malloc(field0)
+ // F1 = malloc(field1)
+ // F2 = malloc(field2)
+ // if (F0 == 0 || F1 == 0 || F2 == 0) {
+ // if (F0) { free(F0); F0 = 0; }
+ // if (F1) { free(F1); F1 = 0; }
+ // if (F2) { free(F2); F2 = 0; }
+ // }
+ // The malloc can also fail if its argument is too large.
+ Constant *ConstantZero = ConstantInt::get(CI->getArgOperand(0)->getType(), 0);
+ Value *RunningOr = new ICmpInst(CI, ICmpInst::ICMP_SLT, CI->getArgOperand(0),
+ ConstantZero, "isneg");
+ for (unsigned i = 0, e = FieldMallocs.size(); i != e; ++i) {
+ Value *Cond = new ICmpInst(CI, ICmpInst::ICMP_EQ, FieldMallocs[i],
+ Constant::getNullValue(FieldMallocs[i]->getType()),
+ "isnull");
+ RunningOr = BinaryOperator::CreateOr(RunningOr, Cond, "tmp", CI);
+ }
+
+ // Split the basic block at the old malloc.
+ BasicBlock *OrigBB = CI->getParent();
+ BasicBlock *ContBB =
+ OrigBB->splitBasicBlock(CI->getIterator(), "malloc_cont");
+
+ // Create the block to check the first condition. Put all these blocks at the
+ // end of the function as they are unlikely to be executed.
+ BasicBlock *NullPtrBlock = BasicBlock::Create(OrigBB->getContext(),
+ "malloc_ret_null",
+ OrigBB->getParent());
+
+ // Remove the uncond branch from OrigBB to ContBB, turning it into a cond
+ // branch on RunningOr.
+ OrigBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(NullPtrBlock, ContBB, RunningOr, OrigBB);
+
+ // Within the NullPtrBlock, we need to emit a comparison and branch for each
+ // pointer, because some may be null while others are not.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Value *GVVal =
+ new LoadInst(cast<GlobalVariable>(FieldGlobals[i])->getValueType(),
+ FieldGlobals[i], "tmp", NullPtrBlock);
+ Value *Cmp = new ICmpInst(*NullPtrBlock, ICmpInst::ICMP_NE, GVVal,
+ Constant::getNullValue(GVVal->getType()));
+ BasicBlock *FreeBlock = BasicBlock::Create(Cmp->getContext(), "free_it",
+ OrigBB->getParent());
+ BasicBlock *NextBlock = BasicBlock::Create(Cmp->getContext(), "next",
+ OrigBB->getParent());
+ Instruction *BI = BranchInst::Create(FreeBlock, NextBlock,
+ Cmp, NullPtrBlock);
+
+ // Fill in FreeBlock.
+ CallInst::CreateFree(GVVal, OpBundles, BI);
+ new StoreInst(Constant::getNullValue(GVVal->getType()), FieldGlobals[i],
+ FreeBlock);
+ BranchInst::Create(NextBlock, FreeBlock);
+
+ NullPtrBlock = NextBlock;
+ }
+
+ BranchInst::Create(ContBB, NullPtrBlock);
+
+ // CI is no longer needed, remove it.
+ CI->eraseFromParent();
+
+ /// As we process loads, if we can't immediately update all uses of the load,
+ /// keep track of what scalarized loads are inserted for a given load.
+ DenseMap<Value *, std::vector<Value *>> InsertedScalarizedValues;
+ InsertedScalarizedValues[GV] = FieldGlobals;
+
+ std::vector<std::pair<PHINode *, unsigned>> PHIsToRewrite;
+
+ // Okay, the malloc site is completely handled. All of the uses of GV are now
+ // loads, and all uses of those loads are simple. Rewrite them to use loads
+ // of the per-field globals instead.
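+ // For example (roughly), code that loaded the struct pointer from GV and
+ // then GEP'd to field 1 is redirected to load the field-1 pointer from the
+ // new per-field global GV.f1 instead.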
+ for (auto UI = GV->user_begin(), E = GV->user_end(); UI != E;) {
+ Instruction *User = cast<Instruction>(*UI++);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
+ RewriteUsesOfLoadForHeapSRoA(LI, InsertedScalarizedValues, PHIsToRewrite);
+ continue;
+ }
+
+ // Must be a store of null.
+ StoreInst *SI = cast<StoreInst>(User);
+ assert(isa<ConstantPointerNull>(SI->getOperand(0)) &&
+ "Unexpected heap-sra user!");
+
+ // Insert a store of null into each global.
+ for (unsigned i = 0, e = FieldGlobals.size(); i != e; ++i) {
+ Type *ValTy = cast<GlobalValue>(FieldGlobals[i])->getValueType();
+ Constant *Null = Constant::getNullValue(ValTy);
+ new StoreInst(Null, FieldGlobals[i], SI);
+ }
+ // Erase the original store.
+ SI->eraseFromParent();
+ }
+
+ // While we have PHIs that are interesting to rewrite, do it.
+ while (!PHIsToRewrite.empty()) {
+ PHINode *PN = PHIsToRewrite.back().first;
+ unsigned FieldNo = PHIsToRewrite.back().second;
+ PHIsToRewrite.pop_back();
+ PHINode *FieldPN = cast<PHINode>(InsertedScalarizedValues[PN][FieldNo]);
+ assert(FieldPN->getNumIncomingValues() == 0 &&"Already processed this phi");
+
+ // Add all the incoming values. This can materialize more phis.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ InVal = GetHeapSROAValue(InVal, FieldNo, InsertedScalarizedValues,
+ PHIsToRewrite);
+ FieldPN->addIncoming(InVal, PN->getIncomingBlock(i));
+ }
+ }
+
+ // Drop all inter-phi links and any loads that made it this far.
+ for (DenseMap<Value *, std::vector<Value *>>::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->dropAllReferences();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->dropAllReferences();
+ }
+
+ // Delete all the phis and loads now that inter-references are dead.
+ for (DenseMap<Value *, std::vector<Value *>>::iterator
+ I = InsertedScalarizedValues.begin(), E = InsertedScalarizedValues.end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I->first))
+ PN->eraseFromParent();
+ else if (LoadInst *LI = dyn_cast<LoadInst>(I->first))
+ LI->eraseFromParent();
+ }
+
+ // The old global is now dead, remove it.
+ GV->eraseFromParent();
+
+ ++NumHeapSRA;
+ return cast<GlobalVariable>(FieldGlobals[0]);
+}
+
+/// This function is called when we see a pointer global variable with a single
+/// value stored into it that is a malloc or a cast of malloc.
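+/// A rough sketch of the pattern handled here (hypothetical IR):
+///   @g = internal global %T* null
+///   %mem = call i8* @malloc(i64 %size)
+///   %p = bitcast i8* %mem to %T*
+///   store %T* %p, %T** @g
+/// If every use of the value loaded from @g would trap when the pointer is
+/// null, the allocation can be folded into a global (or split per field).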
+static bool tryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, CallInst *CI,
+ Type *AllocTy,
+ AtomicOrdering Ordering,
+ const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // If this is a malloc of an abstract type, don't touch it.
+ if (!AllocTy->isSized())
+ return false;
+
+ // We can't optimize this global unless all uses of it are *known* to be
+ // of the malloc value, not of the null initializer value (consider a use
+ // that compares the global's value against zero to see if the malloc has
+ // been reached). To do this, we check to see if all uses of the global
+ // would trap if the global were null: this proves that they must all
+ // happen after the malloc.
+ if (!AllUsesOfLoadedValueWillTrapIfNull(GV))
+ return false;
+
+ // We can't optimize this if the malloc itself is used in a complex way,
+ // for example, being stored into multiple globals. This allows the
+ // malloc to be stored into the specified global, loaded, icmp'd, and
+ // GEP'd. These are all things we could transform to use the global
+ // for.
+ SmallPtrSet<const PHINode*, 8> PHIs;
+ if (!ValueIsOnlyUsedLocallyOrStoredToOneGlobal(CI, GV, PHIs))
+ return false;
+
+ // If we have a global that is only initialized with a fixed size malloc,
+ // transform the program to use global memory instead of malloc'd memory.
+ // This eliminates dynamic allocation, avoids an indirection accessing the
+ // data, and exposes the resultant global to further GlobalOpt.
+ // We cannot optimize the malloc if we cannot determine malloc array size.
+ Value *NElems = getMallocArraySize(CI, DL, TLI, true);
+ if (!NElems)
+ return false;
+
+ if (ConstantInt *NElements = dyn_cast<ConstantInt>(NElems))
+ // Restrict this transformation to only working on small allocations
+ // (2048 bytes currently), as we don't want to introduce a 16M global or
+ // something.
+ if (NElements->getZExtValue() * DL.getTypeAllocSize(AllocTy) < 2048) {
+ OptimizeGlobalAddressOfMalloc(GV, CI, AllocTy, NElements, DL, TLI);
+ return true;
+ }
+
+ // If the allocation is an array of structures, consider transforming this
+ // into multiple malloc'd arrays, one for each field. This is basically
+ // SRoA for malloc'd memory.
+
+ if (Ordering != AtomicOrdering::NotAtomic)
+ return false;
+
+ // If this is an allocation of a fixed size array of structs, analyze as a
+ // variable size array. malloc [100 x struct],1 -> malloc struct, 100
+ if (NElems == ConstantInt::get(CI->getArgOperand(0)->getType(), 1))
+ if (ArrayType *AT = dyn_cast<ArrayType>(AllocTy))
+ AllocTy = AT->getElementType();
+
+ StructType *AllocSTy = dyn_cast<StructType>(AllocTy);
+ if (!AllocSTy)
+ return false;
+
+ // If the structure has an unreasonable number of fields, leave it
+ // alone.
+ if (AllocSTy->getNumElements() <= 16 && AllocSTy->getNumElements() != 0 &&
+ AllGlobalLoadUsesSimpleEnoughForHeapSRA(GV, CI)) {
+
+ // If this is a fixed size array, transform the Malloc to be an alloc of
+ // structs. malloc [100 x struct],1 -> malloc struct, 100
+ if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) {
+ Type *IntPtrTy = DL.getIntPtrType(CI->getType());
+ unsigned TypeSize = DL.getStructLayout(AllocSTy)->getSizeInBytes();
+ Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize);
+ Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements());
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Instruction *Malloc =
+ CallInst::CreateMalloc(CI, IntPtrTy, AllocSTy, AllocSize, NumElements,
+ OpBundles, nullptr, CI->getName());
+ Instruction *Cast = new BitCastInst(Malloc, CI->getType(), "tmp", CI);
+ CI->replaceAllUsesWith(Cast);
+ CI->eraseFromParent();
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Malloc))
+ CI = cast<CallInst>(BCI->getOperand(0));
+ else
+ CI = cast<CallInst>(Malloc);
+ }
+
+ PerformHeapAllocSRoA(GV, CI, getMallocArraySize(CI, DL, TLI, true), DL,
+ TLI);
+ return true;
+ }
+
+ return false;
+}
+
+// Try to optimize globals based on the knowledge that only one value (besides
+// its initializer) is ever stored to the global.
+static bool
+optimizeOnceStoredGlobal(GlobalVariable *GV, Value *StoredOnceVal,
+ AtomicOrdering Ordering, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ // Ignore no-op GEPs and bitcasts.
+ StoredOnceVal = StoredOnceVal->stripPointerCasts();
+
+ // If we are dealing with a pointer global that is initialized to null and
+ // only has one (non-null) value stored into it, then we can optimize any
+ // users of the loaded value (often calls and loads) that would trap if the
+ // value was null.
+ if (GV->getInitializer()->getType()->isPointerTy() &&
+ GV->getInitializer()->isNullValue() &&
+ !NullPointerIsDefined(
+ nullptr /* F */,
+ GV->getInitializer()->getType()->getPointerAddressSpace())) {
+ if (Constant *SOVC = dyn_cast<Constant>(StoredOnceVal)) {
+ if (GV->getInitializer()->getType() != SOVC->getType())
+ SOVC = ConstantExpr::getBitCast(SOVC, GV->getInitializer()->getType());
+
+ // Optimize away any trapping uses of the loaded value.
+ if (OptimizeAwayTrappingUsesOfLoads(GV, SOVC, DL, GetTLI))
+ return true;
+ } else if (CallInst *CI = extractMallocCall(StoredOnceVal, GetTLI)) {
+ auto *TLI = &GetTLI(*CI->getFunction());
+ Type *MallocType = getMallocAllocatedType(CI, TLI);
+ if (MallocType && tryToOptimizeStoreOfMallocToGlobal(GV, CI, MallocType,
+ Ordering, DL, TLI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/// At this point, we have learned that the only two values ever stored into GV
+/// are its initializer and OtherVal. See if we can shrink the global into a
+/// boolean and select between the two values whenever it is used. This exposes
+/// the values to other scalar optimizations.
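+/// For example (hypothetical IR), if only 0 and 42 are ever stored to @g:
+///   @g = internal global i32 0        becomes  @g.b = internal global i1 false
+///   store i32 42, i32* @g             becomes  store i1 true, i1* @g.b
+///   %v = load i32, i32* @g            becomes  %b = load i1, i1* @g.b
+///                                              %v = select i1 %b, i32 42, i32 0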
+static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
+ Type *GVElType = GV->getValueType();
+
+ // If GVElType is already i1, it is already shrunk. If the type of the GV is
+ // an FP value, pointer or vector, don't do this optimization because a select
+ // between them is very expensive and unlikely to lead to later
+ // simplification. In these cases, we typically end up with "cond ? v1 : v2"
+ // where v1 and v2 both require constant pool loads, a big loss.
+ if (GVElType == Type::getInt1Ty(GV->getContext()) ||
+ GVElType->isFloatingPointTy() ||
+ GVElType->isPointerTy() || GVElType->isVectorTy())
+ return false;
+
+ // Walk the use list of the global seeing if all the uses are load or store.
+ // If there is anything else, bail out.
+ for (User *U : GV->users())
+ if (!isa<LoadInst>(U) && !isa<StoreInst>(U))
+ return false;
+
+ LLVM_DEBUG(dbgs() << " *** SHRINKING TO BOOL: " << *GV << "\n");
+
+ // Create the new global, initializing it to false.
+ GlobalVariable *NewGV = new GlobalVariable(Type::getInt1Ty(GV->getContext()),
+ false,
+ GlobalValue::InternalLinkage,
+ ConstantInt::getFalse(GV->getContext()),
+ GV->getName()+".b",
+ GV->getThreadLocalMode(),
+ GV->getType()->getAddressSpace());
+ NewGV->copyAttributesFrom(GV);
+ GV->getParent()->getGlobalList().insert(GV->getIterator(), NewGV);
+
+ Constant *InitVal = GV->getInitializer();
+ assert(InitVal->getType() != Type::getInt1Ty(GV->getContext()) &&
+ "No reason to shrink to bool!");
+
+ SmallVector<DIGlobalVariableExpression *, 1> GVs;
+ GV->getDebugInfo(GVs);
+
+ // If initialized to zero and storing one into the global, we can use a cast
+ // instead of a select to synthesize the desired value.
+ bool IsOneZero = false;
+ bool EmitOneOrZero = true;
+ auto *CI = dyn_cast<ConstantInt>(OtherVal);
+ if (CI && CI->getValue().getActiveBits() <= 64) {
+ IsOneZero = InitVal->isNullValue() && CI->isOne();
+
+ auto *CIInit = dyn_cast<ConstantInt>(GV->getInitializer());
+ if (CIInit && CIInit->getValue().getActiveBits() <= 64) {
+ uint64_t ValInit = CIInit->getZExtValue();
+ uint64_t ValOther = CI->getZExtValue();
+ uint64_t ValMinus = ValOther - ValInit;
+
+ for(auto *GVe : GVs){
+ DIGlobalVariable *DGV = GVe->getVariable();
+ DIExpression *E = GVe->getExpression();
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ unsigned SizeInOctets =
+ DL.getTypeAllocSizeInBits(NewGV->getType()->getElementType()) / 8;
+
+ // It is expected that the address of the optimized global variable is
+ // on top of the stack. After optimization, the value of that variable
+ // will be either 0 for the initial value or 1 for the other value. The
+ // following expression should return a constant integer value depending
+ // on the value at the global object's address:
+ // val * (ValOther - ValInit) + ValInit:
+ // DW_OP_deref DW_OP_constu <ValMinus>
+ // DW_OP_mul DW_OP_constu <ValInit> DW_OP_plus DW_OP_stack_value
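+ // For instance, with ValInit == 0 and ValOther == 42, the expression
+ // evaluates to either 0 or 42 depending on the i1 stored in the new global.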
+ SmallVector<uint64_t, 12> Ops = {
+ dwarf::DW_OP_deref_size, SizeInOctets,
+ dwarf::DW_OP_constu, ValMinus,
+ dwarf::DW_OP_mul, dwarf::DW_OP_constu, ValInit,
+ dwarf::DW_OP_plus};
+ bool WithStackValue = true;
+ E = DIExpression::prependOpcodes(E, Ops, WithStackValue);
+ DIGlobalVariableExpression *DGVE =
+ DIGlobalVariableExpression::get(NewGV->getContext(), DGV, E);
+ NewGV->addDebugInfo(DGVE);
+ }
+ EmitOneOrZero = false;
+ }
+ }
+
+ if (EmitOneOrZero) {
+ // FIXME: This will only emit the address for the debugger; the value
+ // written there will only be 0 or 1.
+ for(auto *GV : GVs)
+ NewGV->addDebugInfo(GV);
+ }
+
+ while (!GV->use_empty()) {
+ Instruction *UI = cast<Instruction>(GV->user_back());
+ if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Change the store into a boolean store.
+ bool StoringOther = SI->getOperand(0) == OtherVal;
+ // Only do this if we weren't storing a loaded value.
+ Value *StoreVal;
+ if (StoringOther || SI->getOperand(0) == InitVal) {
+ StoreVal = ConstantInt::get(Type::getInt1Ty(GV->getContext()),
+ StoringOther);
+ } else {
+ // Otherwise, we are storing a previously loaded copy. To do this,
+ // change the copy from copying the original value to just copying the
+ // bool.
+ Instruction *StoredVal = cast<Instruction>(SI->getOperand(0));
+
+ // If we've already replaced the input, StoredVal will be a cast or
+ // select instruction. If not, it will be a load of the original
+ // global.
+ if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
+ assert(LI->getOperand(0) == GV && "Not a copy!");
+ // Insert a new load, to preserve the saved value.
+ StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
+ LI->getName() + ".b", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ } else {
+ assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
+ "This is not a form that we understand!");
+ StoreVal = StoredVal->getOperand(0);
+ assert(isa<LoadInst>(StoreVal) && "Not a load of NewGV!");
+ }
+ }
+ StoreInst *NSI =
+ new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
+ SI->getSyncScopeID(), SI);
+ NSI->setDebugLoc(SI->getDebugLoc());
+ } else {
+ // Change the load into a load of bool then a select.
+ LoadInst *LI = cast<LoadInst>(UI);
+ LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
+ LI->getName() + ".b", false, Align(1),
+ LI->getOrdering(), LI->getSyncScopeID(), LI);
+ Instruction *NSI;
+ if (IsOneZero)
+ NSI = new ZExtInst(NLI, LI->getType(), "", LI);
+ else
+ NSI = SelectInst::Create(NLI, OtherVal, InitVal, "", LI);
+ NSI->takeName(LI);
+ // Since LI is split into two instructions, NLI and NSI both inherit the
+ // same DebugLoc
+ NLI->setDebugLoc(LI->getDebugLoc());
+ NSI->setDebugLoc(LI->getDebugLoc());
+ LI->replaceAllUsesWith(NSI);
+ }
+ UI->eraseFromParent();
+ }
+
+ // Retain the name of the old global variable. People who are debugging their
+ // programs may expect these variables to be named the same.
+ NewGV->takeName(GV);
+ GV->eraseFromParent();
+ return true;
+}
+
+static bool deleteIfDead(
+ GlobalValue &GV, SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ GV.removeDeadConstantUsers();
+
+ if (!GV.isDiscardableIfUnused() && !GV.isDeclaration())
+ return false;
+
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.hasLocalLinkage() && NotDiscardableComdats.count(C))
+ return false;
+
+ bool Dead;
+ if (auto *F = dyn_cast<Function>(&GV))
+ Dead = (F->isDeclaration() && F->use_empty()) || F->isDefTriviallyDead();
+ else
+ Dead = GV.use_empty();
+ if (!Dead)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "GLOBAL DEAD: " << GV << "\n");
+ GV.eraseFromParent();
+ ++NumDeleted;
+ return true;
+}
+
+static bool isPointerValueDeadOnEntryToFunction(
+ const Function *F, GlobalValue *GV,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ // Find all uses of GV. We expect them all to be in F, and if we can't
+ // identify any of the uses we bail out.
+ //
+ // On each of these uses, identify if the memory that GV points to is
+ // used/required/live at the start of the function. If it is not, for example
+ // if the first thing the function does is store to the GV, the GV can
+ // possibly be demoted.
+ //
+ // We don't do an exhaustive search for memory operations - simply look
+ // through bitcasts as they're quite common and benign.
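+ // As a rough example (hypothetical C), consider a global used only as
+ // function-local scratch:
+ //   static int g;  void f() { g = compute(); ...; use(g); }
+ // Every load of g inside f is dominated by a store, so the memory is dead
+ // on entry to f and the global can (possibly) be demoted to a local alloca.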
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+ for (auto *U : GV->users()) {
+ if (Operator::getOpcode(U) == Instruction::BitCast) {
+ for (auto *UU : U->users()) {
+ if (auto *LI = dyn_cast<LoadInst>(UU))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(UU))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+ continue;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(U);
+ if (!I)
+ return false;
+ assert(I->getParent()->getParent() == F);
+
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ Loads.push_back(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ Stores.push_back(SI);
+ else
+ return false;
+ }
+
+ // We have identified all uses of GV into loads and stores. Now check if all
+ // of them are known not to depend on the value of the global at the function
+ // entry point. We do this by ensuring that every load is dominated by at
+ // least one store.
+ auto &DT = LookupDomTree(*const_cast<Function *>(F));
+
+ // The below check is quadratic. Check we're not going to do too many tests.
+ // FIXME: Even though this will always have worst-case quadratic time, we
+ // could put effort into minimizing the average time by putting stores that
+ // have been shown to dominate at least one load at the beginning of the
+ // Stores array, making subsequent dominance checks more likely to succeed
+ // early.
+ //
+ // The threshold here is fairly large because global->local demotion is a
+ // very powerful optimization should it fire.
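+ // For instance, with a threshold of 100, a global with 10 loads and 10
+ // stores (100 dominance queries) is still examined, while 11 loads and 10
+ // stores would bail out immediately.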
+ const unsigned Threshold = 100;
+ if (Loads.size() * Stores.size() > Threshold)
+ return false;
+
+ for (auto *L : Loads) {
+ auto *LTy = L->getType();
+ if (none_of(Stores, [&](const StoreInst *S) {
+ auto *STy = S->getValueOperand()->getType();
+ // The load is only dominated by the store if DomTree says so
+ // and the number of bits loaded in L is less than or equal to
+ // the number of bits stored in S.
+ return DT.dominates(S, L) &&
DL.getTypeStoreSize(LTy).getFixedSize() <=
DL.getTypeStoreSize(STy).getFixedSize();
- }))
- return false;
- }
- // All loads have known dependences inside F, so the global can be localized.
- return true;
-}
-
-/// C may have non-instruction users. Can all of those users be turned into
-/// instructions?
-static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
- // We don't do this exhaustively. The most common pattern that we really need
- // to care about is a constant GEP or constant bitcast - so just looking
- // through one single ConstantExpr.
- //
- // The set of constants that this function returns true for must be able to be
- // handled by makeAllConstantUsesInstructions.
- for (auto *U : C->users()) {
- if (isa<Instruction>(U))
- continue;
- if (!isa<ConstantExpr>(U))
- // Non instruction, non-constantexpr user; cannot convert this.
- return false;
- for (auto *UU : U->users())
- if (!isa<Instruction>(UU))
- // A constantexpr used by another constant. We don't try and recurse any
- // further but just bail out at this point.
- return false;
- }
-
- return true;
-}
-
-/// C may have non-instruction users, and
-/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the
-/// non-instruction users to instructions.
-static void makeAllConstantUsesInstructions(Constant *C) {
- SmallVector<ConstantExpr*,4> Users;
- for (auto *U : C->users()) {
- if (isa<ConstantExpr>(U))
- Users.push_back(cast<ConstantExpr>(U));
- else
- // We should never get here; allNonInstructionUsersCanBeMadeInstructions
- // should not have returned true for C.
- assert(
- isa<Instruction>(U) &&
- "Can't transform non-constantexpr non-instruction to instruction!");
- }
-
- SmallVector<Value*,4> UUsers;
- for (auto *U : Users) {
- UUsers.clear();
+ }))
+ return false;
+ }
+ // All loads have known dependences inside F, so the global can be localized.
+ return true;
+}
+
+/// C may have non-instruction users. Can all of those users be turned into
+/// instructions?
+static bool allNonInstructionUsersCanBeMadeInstructions(Constant *C) {
+ // We don't do this exhaustively. The most common pattern that we really need
+ // to care about is a constant GEP or constant bitcast - so just looking
+ // through one single ConstantExpr.
+ //
+ // The set of constants that this function returns true for must be able to be
+ // handled by makeAllConstantUsesInstructions.
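+ // For instance (hypothetical IR), a user such as
+ //   store i32 0, i32* getelementptr inbounds ([4 x i32], [4 x i32]* @g, i64 0, i64 1)
+ // is a ConstantExpr GEP of @g whose only users are instructions, so it can
+ // later be turned into an explicit GEP instruction before each use.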
+ for (auto *U : C->users()) {
+ if (isa<Instruction>(U))
+ continue;
+ if (!isa<ConstantExpr>(U))
+ // Non instruction, non-constantexpr user; cannot convert this.
+ return false;
+ for (auto *UU : U->users())
+ if (!isa<Instruction>(UU))
+ // A constantexpr used by another constant. We don't try and recurse any
+ // further but just bail out at this point.
+ return false;
+ }
+
+ return true;
+}
+
+/// C may have non-instruction users, and
+/// allNonInstructionUsersCanBeMadeInstructions has returned true. Convert the
+/// non-instruction users to instructions.
+static void makeAllConstantUsesInstructions(Constant *C) {
+ SmallVector<ConstantExpr*,4> Users;
+ for (auto *U : C->users()) {
+ if (isa<ConstantExpr>(U))
+ Users.push_back(cast<ConstantExpr>(U));
+ else
+ // We should never get here; allNonInstructionUsersCanBeMadeInstructions
+ // should not have returned true for C.
+ assert(
+ isa<Instruction>(U) &&
+ "Can't transform non-constantexpr non-instruction to instruction!");
+ }
+
+ SmallVector<Value*,4> UUsers;
+ for (auto *U : Users) {
+ UUsers.clear();
append_range(UUsers, U->users());
- for (auto *UU : UUsers) {
- Instruction *UI = cast<Instruction>(UU);
- Instruction *NewU = U->getAsInstruction();
- NewU->insertBefore(UI);
- UI->replaceUsesOfWith(U, NewU);
- }
- // We've replaced all the uses, so destroy the constant. (destroyConstant
- // will update value handles and metadata.)
- U->destroyConstant();
- }
-}
-
-/// Analyze the specified global variable and optimize
-/// it if possible. If we make a change, return true.
-static bool
-processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- auto &DL = GV->getParent()->getDataLayout();
- // If this is a first class global and has only one accessing function and
- // this function is non-recursive, we replace the global with a local alloca
- // in this function.
- //
- // NOTE: It doesn't make sense to promote non-single-value types since we
- // are just replacing static memory with stack memory.
- //
- // If the global is in different address space, don't bring it to stack.
- if (!GS.HasMultipleAccessingFunctions &&
- GS.AccessingFunction &&
- GV->getValueType()->isSingleValueType() &&
- GV->getType()->getAddressSpace() == 0 &&
- !GV->isExternallyInitialized() &&
- allNonInstructionUsersCanBeMadeInstructions(GV) &&
- GS.AccessingFunction->doesNotRecurse() &&
- isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
- LookupDomTree)) {
- const DataLayout &DL = GV->getParent()->getDataLayout();
-
- LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
- Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
- ->getEntryBlock().begin());
- Type *ElemTy = GV->getValueType();
- // FIXME: Pass Global's alignment when globals have alignment
- AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
- GV->getName(), &FirstI);
- if (!isa<UndefValue>(GV->getInitializer()))
- new StoreInst(GV->getInitializer(), Alloca, &FirstI);
-
- makeAllConstantUsesInstructions(GV);
-
- GV->replaceAllUsesWith(Alloca);
- GV->eraseFromParent();
- ++NumLocalized;
- return true;
- }
-
+ for (auto *UU : UUsers) {
+ Instruction *UI = cast<Instruction>(UU);
+ Instruction *NewU = U->getAsInstruction();
+ NewU->insertBefore(UI);
+ UI->replaceUsesOfWith(U, NewU);
+ }
+ // We've replaced all the uses, so destroy the constant. (destroyConstant
+ // will update value handles and metadata.)
+ U->destroyConstant();
+ }
+}
+
+/// Analyze the specified global variable and optimize
+/// it if possible. If we make a change, return true.
+static bool
+processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ auto &DL = GV->getParent()->getDataLayout();
+ // If this is a first class global and has only one accessing function and
+ // this function is non-recursive, we replace the global with a local alloca
+ // in this function.
+ //
+ // NOTE: It doesn't make sense to promote non-single-value types since we
+ // are just replacing static memory with stack memory.
+ //
+ // If the global is in different address space, don't bring it to stack.
+ if (!GS.HasMultipleAccessingFunctions &&
+ GS.AccessingFunction &&
+ GV->getValueType()->isSingleValueType() &&
+ GV->getType()->getAddressSpace() == 0 &&
+ !GV->isExternallyInitialized() &&
+ allNonInstructionUsersCanBeMadeInstructions(GV) &&
+ GS.AccessingFunction->doesNotRecurse() &&
+ isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
+ LookupDomTree)) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+
+ LLVM_DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
+ Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
+ ->getEntryBlock().begin());
+ Type *ElemTy = GV->getValueType();
+ // FIXME: Pass Global's alignment when globals have alignment
+ AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
+ GV->getName(), &FirstI);
+ if (!isa<UndefValue>(GV->getInitializer()))
+ new StoreInst(GV->getInitializer(), Alloca, &FirstI);
+
+ makeAllConstantUsesInstructions(GV);
+
+ GV->replaceAllUsesWith(Alloca);
+ GV->eraseFromParent();
+ ++NumLocalized;
+ return true;
+ }
+
bool Changed = false;
- // If the global is never loaded (but may be stored to), it is dead.
- // Delete it now.
- if (!GS.IsLoaded) {
- LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
-
- if (isLeakCheckerRoot(GV)) {
- // Delete any constant stores to the global.
- Changed = CleanupPointerRootUsers(GV, GetTLI);
- } else {
- // Delete any stores we can find to the global. We may not be able to
- // make it completely dead though.
- Changed =
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
- }
-
- // If the global is dead now, delete it.
- if (GV->use_empty()) {
- GV->eraseFromParent();
- ++NumDeleted;
- Changed = true;
- }
- return Changed;
-
- }
- if (GS.StoredType <= GlobalStatus::InitializerStored) {
- LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
-
- // Don't actually mark a global constant if it's atomic because atomic loads
- // are implemented by a trivial cmpxchg in some edge-cases and that usually
- // requires write access to the variable even if it's not actually changed.
+ // If the global is never loaded (but may be stored to), it is dead.
+ // Delete it now.
+ if (!GS.IsLoaded) {
+ LLVM_DEBUG(dbgs() << "GLOBAL NEVER LOADED: " << *GV << "\n");
+
+ if (isLeakCheckerRoot(GV)) {
+ // Delete any constant stores to the global.
+ Changed = CleanupPointerRootUsers(GV, GetTLI);
+ } else {
+ // Delete any stores we can find to the global. We may not be able to
+ // make it completely dead though.
+ Changed =
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+ }
+
+ // If the global is dead now, delete it.
+ if (GV->use_empty()) {
+ GV->eraseFromParent();
+ ++NumDeleted;
+ Changed = true;
+ }
+ return Changed;
+
+ }
+ if (GS.StoredType <= GlobalStatus::InitializerStored) {
+ LLVM_DEBUG(dbgs() << "MARKING CONSTANT: " << *GV << "\n");
+
+ // Don't actually mark a global constant if it's atomic because atomic loads
+ // are implemented by a trivial cmpxchg in some edge-cases and that usually
+ // requires write access to the variable even if it's not actually changed.
if (GS.Ordering == AtomicOrdering::NotAtomic) {
assert(!GV->isConstant() && "Expected a non-constant global");
- GV->setConstant(true);
+ GV->setConstant(true);
Changed = true;
}
-
- // Clean up any obviously simplifiable users now.
+
+ // Clean up any obviously simplifiable users now.
Changed |= CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
-
- // If the global is dead now, just nuke it.
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
- << "all users and delete global!\n");
- GV->eraseFromParent();
- ++NumDeleted;
- return true;
- }
-
- // Fall through to the next check; see if we can optimize further.
- ++NumMarked;
- }
- if (!GV->getInitializer()->getType()->isSingleValueType()) {
- const DataLayout &DL = GV->getParent()->getDataLayout();
- if (SRAGlobal(GV, DL))
- return true;
- }
- if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
- // If the initial value for the global was an undef value, and if only
- // one other value was stored into it, we can just change the
- // initializer to be the stored value, then delete all stores to the
- // global. This allows us to mark it constant.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
- if (isa<UndefValue>(GV->getInitializer())) {
- // Change the initial value here.
- GV->setInitializer(SOVConstant);
-
- // Clean up any obviously simplifiable users now.
- CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
-
- if (GV->use_empty()) {
- LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
- << "simplify all users and delete global!\n");
- GV->eraseFromParent();
- ++NumDeleted;
- }
- ++NumSubstitute;
- return true;
- }
-
- // Try to optimize globals based on the knowledge that only one value
- // (besides its initializer) is ever stored to the global.
- if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
- GetTLI))
- return true;
-
- // Otherwise, if the global was not a boolean, we can shrink it to be a
- // boolean.
- if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
- if (GS.Ordering == AtomicOrdering::NotAtomic) {
- if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
- ++NumShrunkToBool;
- return true;
- }
- }
- }
- }
-
+
+ // If the global is dead now, just nuke it.
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** Marking constant allowed us to simplify "
+ << "all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ return true;
+ }
+
+ // Fall through to the next check; see if we can optimize further.
+ ++NumMarked;
+ }
+ if (!GV->getInitializer()->getType()->isSingleValueType()) {
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (SRAGlobal(GV, DL))
+ return true;
+ }
+ if (GS.StoredType == GlobalStatus::StoredOnce && GS.StoredOnceValue) {
+ // If the initial value for the global was an undef value, and if only
+ // one other value was stored into it, we can just change the
+ // initializer to be the stored value, then delete all stores to the
+ // global. This allows us to mark it constant.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue))
+ if (isa<UndefValue>(GV->getInitializer())) {
+ // Change the initial value here.
+ GV->setInitializer(SOVConstant);
+
+ // Clean up any obviously simplifiable users now.
+ CleanupConstantGlobalUsers(GV, GV->getInitializer(), DL, GetTLI);
+
+ if (GV->use_empty()) {
+ LLVM_DEBUG(dbgs() << " *** Substituting initializer allowed us to "
+ << "simplify all users and delete global!\n");
+ GV->eraseFromParent();
+ ++NumDeleted;
+ }
+ ++NumSubstitute;
+ return true;
+ }
+
+ // Try to optimize globals based on the knowledge that only one value
+ // (besides its initializer) is ever stored to the global.
+ if (optimizeOnceStoredGlobal(GV, GS.StoredOnceValue, GS.Ordering, DL,
+ GetTLI))
+ return true;
+
+ // Otherwise, if the global was not a boolean, we can shrink it to be a
+ // boolean.
+ if (Constant *SOVConstant = dyn_cast<Constant>(GS.StoredOnceValue)) {
+ if (GS.Ordering == AtomicOrdering::NotAtomic) {
+ if (TryToShrinkGlobalToBoolean(GV, SOVConstant)) {
+ ++NumShrunkToBool;
+ return true;
+ }
+ }
+ }
+ }
+
return Changed;
-}
-
-/// Analyze the specified global variable and optimize it if possible. If we
-/// make a change, return true.
-static bool
-processGlobal(GlobalValue &GV,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- if (GV.getName().startswith("llvm."))
- return false;
-
- GlobalStatus GS;
-
- if (GlobalStatus::analyzeGlobal(&GV, GS))
- return false;
-
- bool Changed = false;
- if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
- auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
- : GlobalValue::UnnamedAddr::Local;
- if (NewUnnamedAddr != GV.getUnnamedAddr()) {
- GV.setUnnamedAddr(NewUnnamedAddr);
- NumUnnamed++;
- Changed = true;
- }
- }
-
- // Do more involved optimizations if the global is internal.
- if (!GV.hasLocalLinkage())
- return Changed;
-
- auto *GVar = dyn_cast<GlobalVariable>(&GV);
- if (!GVar)
- return Changed;
-
- if (GVar->isConstant() || !GVar->hasInitializer())
- return Changed;
-
- return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
-}
-
-/// Walk all of the direct calls of the specified function, changing them to
-/// FastCC.
-static void ChangeCalleesToFastCall(Function *F) {
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
- }
-}
-
-static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
- Attribute::AttrKind A) {
- unsigned AttrIndex;
- if (Attrs.hasAttrSomewhere(A, &AttrIndex))
- return Attrs.removeAttribute(C, AttrIndex, A);
- return Attrs;
-}
-
-static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
- F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A));
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- CallBase *CB = cast<CallBase>(U);
- CB->setAttributes(StripAttr(F->getContext(), CB->getAttributes(), A));
- }
-}
-
-/// Return true if this is a calling convention that we'd like to change. The
-/// idea here is that we don't want to mess with the convention if the user
-/// explicitly requested something with performance implications like coldcc,
-/// GHC, or anyregcc.
-static bool hasChangeableCC(Function *F) {
- CallingConv::ID CC = F->getCallingConv();
-
- // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
- if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
- return false;
-
- // FIXME: Change CC for the whole chain of musttail calls when possible.
- //
- // Can't change the CC of a function that either has musttail calls or is a
- // musttail callee itself.
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- CallInst* CI = dyn_cast<CallInst>(U);
- if (!CI)
- continue;
-
- if (CI->isMustTailCall())
- return false;
- }
-
- for (BasicBlock &BB : *F)
- if (BB.getTerminatingMustTailCall())
- return false;
-
- return true;
-}
-
-/// Return true if the block containing the call site has a BlockFrequency of
-/// less than ColdCCRelFreq% of the entry block.
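-/// For example, with ColdCCRelFreq == 2, a call site whose block frequency is
-/// below 2% of its caller's entry-block frequency is treated as cold.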
-static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
- const BranchProbability ColdProb(ColdCCRelFreq, 100);
- auto *CallSiteBB = CB.getParent();
- auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
- auto CallerEntryFreq =
- CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
- return CallSiteFreq < CallerEntryFreq * ColdProb;
-}
-
-// This function checks if the input function F is cold at all call sites. It
-// also looks at each call site's containing function, returning false if the
-// caller function contains other non-cold calls. The input vector AllCallsCold
-// contains a list of functions that only have call sites in cold blocks.
-static bool
-isValidCandidateForColdCC(Function &F,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- const std::vector<Function *> &AllCallsCold) {
-
- if (F.user_empty())
- return false;
-
- for (User *U : F.users()) {
- if (isa<BlockAddress>(U))
- continue;
-
- CallBase &CB = cast<CallBase>(*U);
- Function *CallerFunc = CB.getParent()->getParent();
- BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
- if (!isColdCallSite(CB, CallerBFI))
- return false;
+}
+
+/// Analyze the specified global variable and optimize it if possible. If we
+/// make a change, return true.
+static bool
+processGlobal(GlobalValue &GV,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ if (GV.getName().startswith("llvm."))
+ return false;
+
+ GlobalStatus GS;
+
+ if (GlobalStatus::analyzeGlobal(&GV, GS))
+ return false;
+
+ bool Changed = false;
+ if (!GS.IsCompared && !GV.hasGlobalUnnamedAddr()) {
+ auto NewUnnamedAddr = GV.hasLocalLinkage() ? GlobalValue::UnnamedAddr::Global
+ : GlobalValue::UnnamedAddr::Local;
+ if (NewUnnamedAddr != GV.getUnnamedAddr()) {
+ GV.setUnnamedAddr(NewUnnamedAddr);
+ NumUnnamed++;
+ Changed = true;
+ }
+ }
+
+ // Do more involved optimizations if the global is internal.
+ if (!GV.hasLocalLinkage())
+ return Changed;
+
+ auto *GVar = dyn_cast<GlobalVariable>(&GV);
+ if (!GVar)
+ return Changed;
+
+ if (GVar->isConstant() || !GVar->hasInitializer())
+ return Changed;
+
+ return processInternalGlobal(GVar, GS, GetTLI, LookupDomTree) || Changed;
+}
+
+/// Walk all of the direct calls of the specified function, changing them to
+/// FastCC.
+static void ChangeCalleesToFastCall(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
+ }
+}
+
+static AttributeList StripAttr(LLVMContext &C, AttributeList Attrs,
+ Attribute::AttrKind A) {
+ unsigned AttrIndex;
+ if (Attrs.hasAttrSomewhere(A, &AttrIndex))
+ return Attrs.removeAttribute(C, AttrIndex, A);
+ return Attrs;
+}
+
+static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
+ F->setAttributes(StripAttr(F->getContext(), F->getAttributes(), A));
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallBase *CB = cast<CallBase>(U);
+ CB->setAttributes(StripAttr(F->getContext(), CB->getAttributes(), A));
+ }
+}
+
+/// Return true if this is a calling convention that we'd like to change. The
+/// idea here is that we don't want to mess with the convention if the user
+/// explicitly requested something with performance implications like coldcc,
+/// GHC, or anyregcc.
+static bool hasChangeableCC(Function *F) {
+ CallingConv::ID CC = F->getCallingConv();
+
+ // FIXME: Is it worth transforming x86_stdcallcc and x86_fastcallcc?
+ if (CC != CallingConv::C && CC != CallingConv::X86_ThisCall)
+ return false;
+
+ // FIXME: Change CC for the whole chain of musttail calls when possible.
+ //
+ // Can't change the CC of a function that either has musttail calls or is a
+ // musttail callee itself.
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ CallInst* CI = dyn_cast<CallInst>(U);
+ if (!CI)
+ continue;
+
+ if (CI->isMustTailCall())
+ return false;
+ }
+
+ for (BasicBlock &BB : *F)
+ if (BB.getTerminatingMustTailCall())
+ return false;
+
+ return true;
+}
+
+/// Return true if the block containing the call site has a BlockFrequency of
+/// less than ColdCCRelFreq% of the entry block.
+static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
+ const BranchProbability ColdProb(ColdCCRelFreq, 100);
+ auto *CallSiteBB = CB.getParent();
+ auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
+ auto CallerEntryFreq =
+ CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
+ return CallSiteFreq < CallerEntryFreq * ColdProb;
+}
+
+// This function checks if the input function F is cold at all call sites. It
+// also looks at each call site's containing function, returning false if the
+// caller function contains other non-cold calls. The input vector AllCallsCold
+// contains a list of functions that only have call sites in cold blocks.
+static bool
+isValidCandidateForColdCC(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ const std::vector<Function *> &AllCallsCold) {
+
+ if (F.user_empty())
+ return false;
+
+ for (User *U : F.users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+
+ CallBase &CB = cast<CallBase>(*U);
+ Function *CallerFunc = CB.getParent()->getParent();
+ BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
+ if (!isColdCallSite(CB, CallerBFI))
+ return false;
if (!llvm::is_contained(AllCallsCold, CallerFunc))
- return false;
- }
- return true;
-}
-
-static void changeCallSitesToColdCC(Function *F) {
- for (User *U : F->users()) {
- if (isa<BlockAddress>(U))
- continue;
- cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
- }
-}
-
-// This function iterates over all the call instructions in the input Function
-// and checks that all call sites are in cold blocks and are allowed to use the
-// coldcc calling convention.
-static bool
-hasOnlyColdCalls(Function &F,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
- for (BasicBlock &BB : F) {
- for (Instruction &I : BB) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // Skip over inline asm instructions since they aren't function calls.
- if (CI->isInlineAsm())
- continue;
- Function *CalledFn = CI->getCalledFunction();
- if (!CalledFn)
- return false;
- if (!CalledFn->hasLocalLinkage())
- return false;
- // Skip over intrinsics since they won't remain as function calls.
- if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
- continue;
- // Check if it's valid to use coldcc calling convention.
- if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
- CalledFn->hasAddressTaken())
- return false;
- BlockFrequencyInfo &CallerBFI = GetBFI(F);
- if (!isColdCallSite(*CI, CallerBFI))
- return false;
- }
- }
- }
- return true;
-}
-
-static bool hasMustTailCallers(Function *F) {
- for (User *U : F->users()) {
- CallBase *CB = dyn_cast<CallBase>(U);
- if (!CB) {
- assert(isa<BlockAddress>(U) &&
- "Expected either CallBase or BlockAddress");
- continue;
- }
- if (CB->isMustTailCall())
- return true;
- }
- return false;
-}
-
-static bool hasInvokeCallers(Function *F) {
- for (User *U : F->users())
- if (isa<InvokeInst>(U))
- return true;
- return false;
-}
-
-static void RemovePreallocated(Function *F) {
- RemoveAttribute(F, Attribute::Preallocated);
-
- auto *M = F->getParent();
-
- IRBuilder<> Builder(M->getContext());
-
- // Cannot modify users() while iterating over it, so make a copy.
- SmallVector<User *, 4> PreallocatedCalls(F->users());
- for (User *U : PreallocatedCalls) {
- CallBase *CB = dyn_cast<CallBase>(U);
- if (!CB)
- continue;
-
- assert(
- !CB->isMustTailCall() &&
- "Shouldn't call RemotePreallocated() on a musttail preallocated call");
- // Create copy of call without "preallocated" operand bundle.
- SmallVector<OperandBundleDef, 1> OpBundles;
- CB->getOperandBundlesAsDefs(OpBundles);
- CallBase *PreallocatedSetup = nullptr;
- for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
- if (It->getTag() == "preallocated") {
- PreallocatedSetup = cast<CallBase>(*It->input_begin());
- OpBundles.erase(It);
- break;
- }
- }
- assert(PreallocatedSetup && "Did not find preallocated bundle");
- uint64_t ArgCount =
- cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
-
- assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
- "Unknown indirect call type");
- CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
- CB->replaceAllUsesWith(NewCB);
- NewCB->takeName(CB);
- CB->eraseFromParent();
-
- Builder.SetInsertPoint(PreallocatedSetup);
- auto *StackSave =
- Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
-
- Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
- Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
- StackSave);
-
- // Replace @llvm.call.preallocated.arg() with alloca.
- // Cannot modify users() while iterating over it, so make a copy.
- // @llvm.call.preallocated.arg() can be called with the same index multiple
- // times. So for each @llvm.call.preallocated.arg(), we see if we have
- // already created a Value* for the index, and if not, create an alloca and
- // bitcast right after the @llvm.call.preallocated.setup() so that it
- // dominates all uses.
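- // Roughly (hypothetical IR), a pair such as
- //   %t = call token @llvm.call.preallocated.setup(i32 1)
- //   %a = call i8* @llvm.call.preallocated.arg(token %t, i32 0) preallocated(%ty)
- // becomes an alloca of %ty (bitcast to i8*) placed right after the setup
- // call so that it dominates every use of %a.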
- SmallVector<Value *, 2> ArgAllocas(ArgCount);
- SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
- for (auto *User : PreallocatedArgs) {
- auto *UseCall = cast<CallBase>(User);
- assert(UseCall->getCalledFunction()->getIntrinsicID() ==
- Intrinsic::call_preallocated_arg &&
- "preallocated token use was not a llvm.call.preallocated.arg");
- uint64_t AllocArgIndex =
- cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
- Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
- if (!AllocaReplacement) {
- auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
- auto *ArgType = UseCall
- ->getAttribute(AttributeList::FunctionIndex,
- Attribute::Preallocated)
- .getValueAsType();
- auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
- Builder.SetInsertPoint(InsertBefore);
- auto *Alloca =
- Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
- auto *BitCast = Builder.CreateBitCast(
- Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
- ArgAllocas[AllocArgIndex] = BitCast;
- AllocaReplacement = BitCast;
- }
-
- UseCall->replaceAllUsesWith(AllocaReplacement);
- UseCall->eraseFromParent();
- }
- // Remove @llvm.call.preallocated.setup().
- cast<Instruction>(PreallocatedSetup)->eraseFromParent();
- }
-}
-
-static bool
-OptimizeFunctions(Module &M,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
-
- bool Changed = false;
-
- std::vector<Function *> AllCallsCold;
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
- Function *F = &*FI++;
- if (hasOnlyColdCalls(*F, GetBFI))
- AllCallsCold.push_back(F);
- }
-
- // Optimize functions.
- for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
- Function *F = &*FI++;
-
- // Don't perform global opt pass on naked functions; we don't want fast
- // calling conventions for naked functions.
- if (F->hasFnAttribute(Attribute::Naked))
- continue;
-
- // Functions without names cannot be referenced outside this module.
- if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
- F->setLinkage(GlobalValue::InternalLinkage);
-
- if (deleteIfDead(*F, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- // LLVM's definition of dominance allows instructions that are cyclic
- // in unreachable blocks, e.g.:
- // %pat = select i1 %condition, @global, i16* %pat
- // because any instruction dominates an instruction in a block that's
- // not reachable from entry.
- // So, remove unreachable blocks from the function, because a) there's
- // no point in analyzing them and b) GlobalOpt should otherwise grow
- // some more complicated logic to break these cycles.
- // Removing unreachable blocks might invalidate the dominator so we
- // recalculate it.
- if (!F->isDeclaration()) {
- if (removeUnreachableBlocks(*F)) {
- auto &DT = LookupDomTree(*F);
- DT.recalculate(*F);
- Changed = true;
- }
- }
-
- Changed |= processGlobal(*F, GetTLI, LookupDomTree);
-
- if (!F->hasLocalLinkage())
- continue;
-
- // If we have an inalloca parameter that we can safely remove the
- // inalloca attribute from, do so. This unlocks optimizations that
- // wouldn't be safe in the presence of inalloca.
- // FIXME: We should also hoist alloca affected by this to the entry
- // block if possible.
- if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
- !F->hasAddressTaken() && !hasMustTailCallers(F)) {
- RemoveAttribute(F, Attribute::InAlloca);
- Changed = true;
- }
-
- // FIXME: handle invokes
- // FIXME: handle musttail
- if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
- if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
- !hasInvokeCallers(F)) {
- RemovePreallocated(F);
- Changed = true;
- }
- continue;
- }
-
- if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
- NumInternalFunc++;
- TargetTransformInfo &TTI = GetTTI(*F);
- // Change the calling convention to coldcc if either stress testing is
- // enabled or the target would like to use coldcc on functions which are
- // cold at all call sites and the callers contain no other non coldcc
- // calls.
- if (EnableColdCCStressTest ||
- (TTI.useColdCCForColdCall(*F) &&
- isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
- F->setCallingConv(CallingConv::Cold);
- changeCallSitesToColdCC(F);
- Changed = true;
- NumColdCC++;
- }
- }
-
- if (hasChangeableCC(F) && !F->isVarArg() &&
- !F->hasAddressTaken()) {
- // If this function has a calling convention worth changing, is not a
- // varargs function, and is only called directly, promote it to use the
- // Fast calling convention.
- F->setCallingConv(CallingConv::Fast);
- ChangeCalleesToFastCall(F);
- ++NumFastCallFns;
- Changed = true;
- }
-
- if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
- !F->hasAddressTaken()) {
- // The function is not used by a trampoline intrinsic, so it is safe
- // to remove the 'nest' attribute.
- RemoveAttribute(F, Attribute::Nest);
- ++NumNestRemoved;
- Changed = true;
- }
- }
- return Changed;
-}
-
-static bool
-OptimizeGlobalVars(Module &M,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- bool Changed = false;
-
- for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
- GVI != E; ) {
- GlobalVariable *GV = &*GVI++;
- // Global variables without names cannot be referenced outside this module.
- if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
- GV->setLinkage(GlobalValue::InternalLinkage);
- // Simplify the initializer.
- if (GV->hasInitializer())
- if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
- auto &DL = M.getDataLayout();
- // TLI is not used in the case of a Constant, so use default nullptr
- // for that optional parameter, since we don't have a Function to
- // provide GetTLI anyway.
- Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
- if (New != C)
- GV->setInitializer(New);
- }
-
- if (deleteIfDead(*GV, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
- }
- return Changed;
-}
-
-/// Evaluate a piece of a constantexpr store into a global initializer. This
-/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
-/// GEP operands of Addr [0, OpNo) have been stepped into.
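-/// For example (hypothetically), storing the constant 7 through
-///   getelementptr ([2 x [3 x i32]], [2 x [3 x i32]]* @g, i64 0, i64 1, i64 2)
-/// rebuilds @g's initializer with element [1][2] replaced by 7.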
-static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
- ConstantExpr *Addr, unsigned OpNo) {
- // Base case of the recursion.
- if (OpNo == Addr->getNumOperands()) {
- assert(Val->getType() == Init->getType() && "Type mismatch!");
- return Val;
- }
-
- SmallVector<Constant*, 32> Elts;
- if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
- // Break up the constant into its elements.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- // Replace the element that we are supposed to.
- ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
- unsigned Idx = CU->getZExtValue();
- assert(Idx < STy->getNumElements() && "Struct index out of range!");
- Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
-
- // Return the modified struct.
- return ConstantStruct::get(STy, Elts);
- }
-
- ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- uint64_t NumElts;
- if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
-
- // Break up the array into elements.
- for (uint64_t i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
-
- assert(CI->getZExtValue() < NumElts);
- Elts[CI->getZExtValue()] =
- EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
-
- if (Init->getType()->isArrayTy())
- return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
- return ConstantVector::get(Elts);
-}
-
-/// We have decided that Addr (which satisfies the predicate
-/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
-static void CommitValueTo(Constant *Val, Constant *Addr) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- assert(GV->hasInitializer());
- GV->setInitializer(Val);
- return;
- }
-
- ConstantExpr *CE = cast<ConstantExpr>(Addr);
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
-}
-
-/// Given a map of address -> value, where addresses are expected to be some form
-/// of either a global or a constant GEP, set the initializer for the address to
-/// be the value. This performs mostly the same function as CommitValueTo()
-/// and EvaluateStoreInto() but is optimized to be more efficient for the common
-/// case where the set of addresses are GEPs sharing the same underlying global,
-/// processing the GEPs in batches rather than individually.
-///
-/// To give an example, consider the following C++ code adapted from the clang
-/// regression tests:
-/// struct S {
-/// int n = 10;
-/// int m = 2 * n;
-/// S(int a) : n(a) {}
-/// };
-///
-/// template<typename T>
-/// struct U {
-/// T *r = &q;
-/// T q = 42;
-/// U *p = this;
-/// };
-///
-/// U<S> e;
-///
-/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
-/// the outer struct, while also initializing the inner 'q' structs 'n' and 'm'
-/// members. This batch algorithm will simply use general CommitValueTo() method
-/// to handle the complex nested S struct initialization of 'q', before
-/// processing the outermost members in a single batch. Using CommitValueTo() to
-/// handle members in the outer struct is inefficient when the struct/array is
-/// very large, as we end up creating and destroying constant arrays for each
-/// initialization.
-/// For the above case, we expect the following IR to be generated:
-///
-/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
-/// %struct.S = type { i32, i32 }
-/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
-/// i64 0, i32 1),
-/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
-/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
-/// constant expression, while the other two elements of @e are "simple".
-static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
- SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
- SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
- SimpleCEs.reserve(Mem.size());
-
- for (const auto &I : Mem) {
- if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
- GVs.push_back(std::make_pair(GV, I.second));
- } else {
- ConstantExpr *GEP = cast<ConstantExpr>(I.first);
- // We don't handle the deeply recursive case using the batch method.
- if (GEP->getNumOperands() > 3)
- ComplexCEs.push_back(std::make_pair(GEP, I.second));
- else
- SimpleCEs.push_back(std::make_pair(GEP, I.second));
- }
- }
-
- // The algorithm below doesn't handle cases like nested structs, so use the
- // slower fully general method if we have to.
- for (auto ComplexCE : ComplexCEs)
- CommitValueTo(ComplexCE.second, ComplexCE.first);
-
- for (auto GVPair : GVs) {
- assert(GVPair.first->hasInitializer());
- GVPair.first->setInitializer(GVPair.second);
- }
-
- if (SimpleCEs.empty())
- return;
-
- // We cache a single global's initializer elements in the case where the
- // subsequent address/val pair uses the same one. This avoids throwing away and
- // rebuilding the constant struct/vector/array just because one element is
- // modified at a time.
- SmallVector<Constant *, 32> Elts;
- Elts.reserve(SimpleCEs.size());
- GlobalVariable *CurrentGV = nullptr;
-
- auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
- Constant *Init = GV->getInitializer();
- Type *Ty = Init->getType();
- if (Update) {
- if (CurrentGV) {
- assert(CurrentGV && "Expected a GV to commit to!");
- Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
- // We have a valid cache that needs to be committed.
- if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
- else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
- CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
- else
- CurrentGV->setInitializer(ConstantVector::get(Elts));
- }
- if (CurrentGV == GV)
- return;
- // Need to clear and set up cache for new initializer.
- CurrentGV = GV;
- Elts.clear();
- unsigned NumElts;
- if (auto *STy = dyn_cast<StructType>(Ty))
- NumElts = STy->getNumElements();
- else if (auto *ATy = dyn_cast<ArrayType>(Ty))
- NumElts = ATy->getNumElements();
- else
- NumElts = cast<FixedVectorType>(Ty)->getNumElements();
- for (unsigned i = 0, e = NumElts; i != e; ++i)
- Elts.push_back(Init->getAggregateElement(i));
- }
- };
-
- for (auto CEPair : SimpleCEs) {
- ConstantExpr *GEP = CEPair.first;
- Constant *Val = CEPair.second;
-
- GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
- commitAndSetupCache(GV, GV != CurrentGV);
- ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
- Elts[CI->getZExtValue()] = Val;
- }
- // The last initializer in the list needs to be committed, others
- // will be committed on a new initializer being processed.
- commitAndSetupCache(CurrentGV, true);
-}
-
-/// Evaluate static constructors in the function, if we can. Return true if we
-/// can, false otherwise.
-static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
- TargetLibraryInfo *TLI) {
- // Call the function.
- Evaluator Eval(DL, TLI);
- Constant *RetValDummy;
- bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
- SmallVector<Constant*, 0>());
-
- if (EvalSuccess) {
- ++NumCtorsEvaluated;
-
- // We succeeded at evaluation: commit the result.
- LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
- << F->getName() << "' to "
- << Eval.getMutatedMemory().size() << " stores.\n");
- BatchCommitValueTo(Eval.getMutatedMemory());
- for (GlobalVariable *GV : Eval.getInvariants())
- GV->setConstant(true);
- }
-
- return EvalSuccess;
-}
-
-static int compareNames(Constant *const *A, Constant *const *B) {
- Value *AStripped = (*A)->stripPointerCasts();
- Value *BStripped = (*B)->stripPointerCasts();
- return AStripped->getName().compare(BStripped->getName());
-}
-
-static void setUsedInitializer(GlobalVariable &V,
- const SmallPtrSetImpl<GlobalValue *> &Init) {
- if (Init.empty()) {
- V.eraseFromParent();
- return;
- }
-
- // Type of pointer to the array of pointers.
- PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
-
- SmallVector<Constant *, 8> UsedArray;
- for (GlobalValue *GV : Init) {
- Constant *Cast
- = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
- UsedArray.push_back(Cast);
- }
- // Sort to get deterministic order.
- array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
- ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
-
- Module *M = V.getParent();
- V.removeFromParent();
- GlobalVariable *NV =
- new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
- ConstantArray::get(ATy, UsedArray), "");
- NV->takeName(&V);
- NV->setSection("llvm.metadata");
- delete &V;
-}
-
-namespace {
-
-/// An easy to access representation of llvm.used and llvm.compiler.used.
-class LLVMUsed {
- SmallPtrSet<GlobalValue *, 8> Used;
- SmallPtrSet<GlobalValue *, 8> CompilerUsed;
- GlobalVariable *UsedV;
- GlobalVariable *CompilerUsedV;
-
-public:
- LLVMUsed(Module &M) {
- UsedV = collectUsedGlobalVariables(M, Used, false);
- CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
- }
-
- using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
- using used_iterator_range = iterator_range<iterator>;
-
- iterator usedBegin() { return Used.begin(); }
- iterator usedEnd() { return Used.end(); }
-
- used_iterator_range used() {
- return used_iterator_range(usedBegin(), usedEnd());
- }
-
- iterator compilerUsedBegin() { return CompilerUsed.begin(); }
- iterator compilerUsedEnd() { return CompilerUsed.end(); }
-
- used_iterator_range compilerUsed() {
- return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
- }
-
- bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
-
- bool compilerUsedCount(GlobalValue *GV) const {
- return CompilerUsed.count(GV);
- }
-
- bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
- bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
- bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
-
- bool compilerUsedInsert(GlobalValue *GV) {
- return CompilerUsed.insert(GV).second;
- }
-
- void syncVariablesAndSets() {
- if (UsedV)
- setUsedInitializer(*UsedV, Used);
- if (CompilerUsedV)
- setUsedInitializer(*CompilerUsedV, CompilerUsed);
- }
-};
-
-} // end anonymous namespace
-
-static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
- if (GA.use_empty()) // No use at all.
- return false;
-
- assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
- "We should have removed the duplicated "
- "element from llvm.compiler.used");
- if (!GA.hasOneUse())
- // Strictly more than one use. So at least one is not in llvm.used and
- // llvm.compiler.used.
- return true;
-
- // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
- return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
-}
-
-static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
- const LLVMUsed &U) {
- unsigned N = 2;
- assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
- "We should have removed the duplicated "
- "element from llvm.compiler.used");
- if (U.usedCount(&V) || U.compilerUsedCount(&V))
- ++N;
- return V.hasNUsesOrMore(N);
-}
-
-static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
- if (!GA.hasLocalLinkage())
- return true;
-
- return U.usedCount(&GA) || U.compilerUsedCount(&GA);
-}
-
-static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
- bool &RenameTarget) {
- RenameTarget = false;
- bool Ret = false;
- if (hasUseOtherThanLLVMUsed(GA, U))
- Ret = true;
-
- // If the alias is externally visible, we may still be able to simplify it.
- if (!mayHaveOtherReferences(GA, U))
- return Ret;
-
- // If the aliasee has internal linkage, give it the name and linkage
- // of the alias, and delete the alias. This turns:
- // define internal ... @f(...)
- // @a = alias ... @f
- // into:
- // define ... @a(...)
- Constant *Aliasee = GA.getAliasee();
- GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
- if (!Target->hasLocalLinkage())
- return Ret;
-
- // Do not perform the transform if multiple aliases potentially target the
- // aliasee. This check also ensures that it is safe to replace the section
- // and other attributes of the aliasee with those of the alias.
- if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
- return Ret;
-
- RenameTarget = true;
- return true;
-}
-
-static bool
-OptimizeGlobalAliases(Module &M,
- SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
- bool Changed = false;
- LLVMUsed Used(M);
-
- for (GlobalValue *GV : Used.used())
- Used.compilerUsedErase(GV);
-
- for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E;) {
- GlobalAlias *J = &*I++;
-
- // Aliases without names cannot be referenced outside this module.
- if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
- J->setLinkage(GlobalValue::InternalLinkage);
-
- if (deleteIfDead(*J, NotDiscardableComdats)) {
- Changed = true;
- continue;
- }
-
- // If the alias can change at link time, nothing can be done - bail out.
- if (J->isInterposable())
- continue;
-
- Constant *Aliasee = J->getAliasee();
- GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
- // We can't trivially replace the alias with the aliasee if the aliasee is
- // non-trivial in some way.
- // TODO: Try to handle non-zero GEPs of local aliasees.
- if (!Target)
- continue;
- Target->removeDeadConstantUsers();
-
- // Make all users of the alias use the aliasee instead.
- bool RenameTarget;
- if (!hasUsesToReplace(*J, Used, RenameTarget))
- continue;
-
- J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
- ++NumAliasesResolved;
- Changed = true;
-
- if (RenameTarget) {
- // Give the aliasee the name, linkage and other attributes of the alias.
- Target->takeName(&*J);
- Target->setLinkage(J->getLinkage());
- Target->setDSOLocal(J->isDSOLocal());
- Target->setVisibility(J->getVisibility());
- Target->setDLLStorageClass(J->getDLLStorageClass());
-
- if (Used.usedErase(&*J))
- Used.usedInsert(Target);
-
- if (Used.compilerUsedErase(&*J))
- Used.compilerUsedInsert(Target);
- } else if (mayHaveOtherReferences(*J, Used))
- continue;
-
- // Delete the alias.
- M.getAliasList().erase(J);
- ++NumAliasesRemoved;
- Changed = true;
- }
-
- Used.syncVariablesAndSets();
-
- return Changed;
-}
-
-static Function *
-FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
-  // Hack to get a default TLI before we have an actual Function.
- auto FuncIter = M.begin();
- if (FuncIter == M.end())
- return nullptr;
- auto *TLI = &GetTLI(*FuncIter);
-
- LibFunc F = LibFunc_cxa_atexit;
- if (!TLI->has(F))
- return nullptr;
-
- Function *Fn = M.getFunction(TLI->getName(F));
- if (!Fn)
- return nullptr;
-
- // Now get the actual TLI for Fn.
- TLI = &GetTLI(*Fn);
-
- // Make sure that the function has the correct prototype.
- if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
- return nullptr;
-
- return Fn;
-}
-
-/// Returns whether the given function is an empty C++ destructor and can
-/// therefore be eliminated.
-/// Note that we assume that other optimization passes have already simplified
-/// the code so we simply check for 'ret'.
-static bool cxxDtorIsEmpty(const Function &Fn) {
- // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
- // nounwind, but that doesn't seem worth doing.
- if (Fn.isDeclaration())
- return false;
-
- for (auto &I : Fn.getEntryBlock()) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- if (isa<ReturnInst>(I))
- return true;
- break;
- }
- return false;
-}
-
-static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
- /// Itanium C++ ABI p3.3.5:
- ///
- /// After constructing a global (or local static) object, that will require
- /// destruction on exit, a termination function is registered as follows:
- ///
- /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
- ///
- /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
- /// call f(p) when DSO d is unloaded, before all such termination calls
- /// registered before this one. It returns zero if registration is
- /// successful, nonzero on failure.
-
- // This pass will look for calls to __cxa_atexit where the function is trivial
- // and remove them.
- bool Changed = false;
-
- for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
- I != E;) {
- // We're only interested in calls. Theoretically, we could handle invoke
- // instructions as well, but neither llvm-gcc nor clang generate invokes
- // to __cxa_atexit.
- CallInst *CI = dyn_cast<CallInst>(*I++);
- if (!CI)
- continue;
-
- Function *DtorFn =
- dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
- if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
- continue;
-
- // Just remove the call.
- CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
- CI->eraseFromParent();
-
- ++NumCXXDtorsRemoved;
-
- Changed |= true;
- }
-
- return Changed;
-}
-
-static bool optimizeGlobalsInModule(
- Module &M, const DataLayout &DL,
- function_ref<TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI,
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
- bool Changed = false;
- bool LocalChange = true;
- while (LocalChange) {
- LocalChange = false;
-
- NotDiscardableComdats.clear();
- for (const GlobalVariable &GV : M.globals())
- if (const Comdat *C = GV.getComdat())
- if (!GV.isDiscardableIfUnused() || !GV.use_empty())
- NotDiscardableComdats.insert(C);
- for (Function &F : M)
- if (const Comdat *C = F.getComdat())
- if (!F.isDefTriviallyDead())
- NotDiscardableComdats.insert(C);
- for (GlobalAlias &GA : M.aliases())
- if (const Comdat *C = GA.getComdat())
- if (!GA.isDiscardableIfUnused() || !GA.use_empty())
- NotDiscardableComdats.insert(C);
-
- // Delete functions that are trivially dead, ccc -> fastcc
- LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree,
- NotDiscardableComdats);
-
- // Optimize global_ctors list.
- LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
- return EvaluateStaticConstructor(F, DL, &GetTLI(*F));
- });
-
- // Optimize non-address-taken globals.
- LocalChange |=
- OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);
-
- // Resolve aliases, when possible.
- LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
-
- // Try to remove trivial global destructors if they are not removed
- // already.
- Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
- if (CXAAtExitFn)
- LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
-
- Changed |= LocalChange;
- }
-
- // TODO: Move all global ctors functions to the end of the module for code
- // layout.
-
- return Changed;
-}
-
-PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &DL = M.getDataLayout();
- auto &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-
-struct GlobalOptLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- GlobalOptLegacyPass() : ModulePass(ID) {
- initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- auto &DL = M.getDataLayout();
- auto LookupDomTree = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
-
- auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
- return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
-
- return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI,
- LookupDomTree);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char GlobalOptLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
- "Global Variable Optimizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
- "Global Variable Optimizer", false, false)
-
-ModulePass *llvm::createGlobalOptimizerPass() {
- return new GlobalOptLegacyPass();
-}
+ return false;
+ }
+ return true;
+}
+
+static void changeCallSitesToColdCC(Function *F) {
+ for (User *U : F->users()) {
+ if (isa<BlockAddress>(U))
+ continue;
+ cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
+ }
+}
+
+// This function iterates over all the call instructions in the input Function
+// and checks that all call sites are in cold blocks and are allowed to use the
+// coldcc calling convention.
+static bool
+hasOnlyColdCalls(Function &F,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+        // Skip over inline asm instructions since they aren't function calls.
+ if (CI->isInlineAsm())
+ continue;
+ Function *CalledFn = CI->getCalledFunction();
+ if (!CalledFn)
+ return false;
+ if (!CalledFn->hasLocalLinkage())
+ return false;
+        // Skip over intrinsics since they won't remain as function calls.
+ if (CalledFn->getIntrinsicID() != Intrinsic::not_intrinsic)
+ continue;
+ // Check if it's valid to use coldcc calling convention.
+ if (!hasChangeableCC(CalledFn) || CalledFn->isVarArg() ||
+ CalledFn->hasAddressTaken())
+ return false;
+ BlockFrequencyInfo &CallerBFI = GetBFI(F);
+ if (!isColdCallSite(*CI, CallerBFI))
+ return false;
+ }
+ }
+ }
+ return true;
+}
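For illustration, a minimal and hypothetical C++ sketch of the pattern the coldcc logic above looks for: an internal, non-variadic function whose address is never taken and whose only call site sits on a branch hinted (here via __builtin_expect) as cold. The names are invented, and whether the calling convention actually changes depends on the target's useColdCCForColdCall() answer and on BlockFrequencyInfo, so treat this purely as a sketch of a plausible candidate.

    // cold_candidate.cpp (hypothetical example)
    //   clang++ -O2 -c cold_candidate.cpp
    // 'report_overflow' has internal linkage, is not variadic, and its address
    // is never taken, so hasChangeableCC() would allow retagging it; its single
    // call site sits in a block that BFI should consider cold.
    #include <cstdio>

    namespace {
    void report_overflow(long value) {              // plausible coldcc candidate
      std::fprintf(stderr, "overflow on %ld\n", value);
    }
    } // namespace

    long accumulate(const long *data, int n) {
      long sum = 0;
      for (int i = 0; i < n; ++i) {
        if (__builtin_expect(__builtin_add_overflow(sum, data[i], &sum), 0))
          report_overflow(data[i]);                 // only call site, cold path
      }
      return sum;
    }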
+
+static bool hasMustTailCallers(Function *F) {
+ for (User *U : F->users()) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB) {
+ assert(isa<BlockAddress>(U) &&
+ "Expected either CallBase or BlockAddress");
+ continue;
+ }
+ if (CB->isMustTailCall())
+ return true;
+ }
+ return false;
+}
+
+static bool hasInvokeCallers(Function *F) {
+ for (User *U : F->users())
+ if (isa<InvokeInst>(U))
+ return true;
+ return false;
+}
+
+static void RemovePreallocated(Function *F) {
+ RemoveAttribute(F, Attribute::Preallocated);
+
+ auto *M = F->getParent();
+
+ IRBuilder<> Builder(M->getContext());
+
+ // Cannot modify users() while iterating over it, so make a copy.
+ SmallVector<User *, 4> PreallocatedCalls(F->users());
+ for (User *U : PreallocatedCalls) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB)
+ continue;
+
+ assert(
+ !CB->isMustTailCall() &&
+ "Shouldn't call RemotePreallocated() on a musttail preallocated call");
+ // Create copy of call without "preallocated" operand bundle.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+ CallBase *PreallocatedSetup = nullptr;
+ for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
+ if (It->getTag() == "preallocated") {
+ PreallocatedSetup = cast<CallBase>(*It->input_begin());
+ OpBundles.erase(It);
+ break;
+ }
+ }
+ assert(PreallocatedSetup && "Did not find preallocated bundle");
+ uint64_t ArgCount =
+ cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
+
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
+ CB->replaceAllUsesWith(NewCB);
+ NewCB->takeName(CB);
+ CB->eraseFromParent();
+
+ Builder.SetInsertPoint(PreallocatedSetup);
+ auto *StackSave =
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+ Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+ StackSave);
+
+ // Replace @llvm.call.preallocated.arg() with alloca.
+ // Cannot modify users() while iterating over it, so make a copy.
+ // @llvm.call.preallocated.arg() can be called with the same index multiple
+ // times. So for each @llvm.call.preallocated.arg(), we see if we have
+ // already created a Value* for the index, and if not, create an alloca and
+ // bitcast right after the @llvm.call.preallocated.setup() so that it
+ // dominates all uses.
+ SmallVector<Value *, 2> ArgAllocas(ArgCount);
+ SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
+ for (auto *User : PreallocatedArgs) {
+ auto *UseCall = cast<CallBase>(User);
+ assert(UseCall->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_arg &&
+ "preallocated token use was not a llvm.call.preallocated.arg");
+ uint64_t AllocArgIndex =
+ cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
+ Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
+ if (!AllocaReplacement) {
+ auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
+ auto *ArgType = UseCall
+ ->getAttribute(AttributeList::FunctionIndex,
+ Attribute::Preallocated)
+ .getValueAsType();
+ auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
+ Builder.SetInsertPoint(InsertBefore);
+ auto *Alloca =
+ Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
+ auto *BitCast = Builder.CreateBitCast(
+ Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
+ ArgAllocas[AllocArgIndex] = BitCast;
+ AllocaReplacement = BitCast;
+ }
+
+ UseCall->replaceAllUsesWith(AllocaReplacement);
+ UseCall->eraseFromParent();
+ }
+ // Remove @llvm.call.preallocated.setup().
+ cast<Instruction>(PreallocatedSetup)->eraseFromParent();
+ }
+}
+
+static bool
+OptimizeFunctions(Module &M,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+
+ bool Changed = false;
+
+ std::vector<Function *> AllCallsCold;
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E;) {
+ Function *F = &*FI++;
+ if (hasOnlyColdCalls(*F, GetBFI))
+ AllCallsCold.push_back(F);
+ }
+
+ // Optimize functions.
+ for (Module::iterator FI = M.begin(), E = M.end(); FI != E; ) {
+ Function *F = &*FI++;
+
+    // Don't perform the global opt pass on naked functions; we don't want
+    // fast calling conventions for naked functions.
+ if (F->hasFnAttribute(Attribute::Naked))
+ continue;
+
+ // Functions without names cannot be referenced outside this module.
+ if (!F->hasName() && !F->isDeclaration() && !F->hasLocalLinkage())
+ F->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*F, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ // LLVM's definition of dominance allows instructions that are cyclic
+ // in unreachable blocks, e.g.:
+ // %pat = select i1 %condition, @global, i16* %pat
+ // because any instruction dominates an instruction in a block that's
+ // not reachable from entry.
+ // So, remove unreachable blocks from the function, because a) there's
+ // no point in analyzing them and b) GlobalOpt should otherwise grow
+ // some more complicated logic to break these cycles.
+ // Removing unreachable blocks might invalidate the dominator so we
+ // recalculate it.
+ if (!F->isDeclaration()) {
+ if (removeUnreachableBlocks(*F)) {
+ auto &DT = LookupDomTree(*F);
+ DT.recalculate(*F);
+ Changed = true;
+ }
+ }
+
+ Changed |= processGlobal(*F, GetTLI, LookupDomTree);
+
+ if (!F->hasLocalLinkage())
+ continue;
+
+ // If we have an inalloca parameter that we can safely remove the
+ // inalloca attribute from, do so. This unlocks optimizations that
+ // wouldn't be safe in the presence of inalloca.
+ // FIXME: We should also hoist alloca affected by this to the entry
+ // block if possible.
+ if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
+ !F->hasAddressTaken() && !hasMustTailCallers(F)) {
+ RemoveAttribute(F, Attribute::InAlloca);
+ Changed = true;
+ }
+
+ // FIXME: handle invokes
+ // FIXME: handle musttail
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
+ !hasInvokeCallers(F)) {
+ RemovePreallocated(F);
+ Changed = true;
+ }
+ continue;
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
+ NumInternalFunc++;
+ TargetTransformInfo &TTI = GetTTI(*F);
+ // Change the calling convention to coldcc if either stress testing is
+ // enabled or the target would like to use coldcc on functions which are
+      // cold at all call sites and the callers contain no other non-coldcc
+ // calls.
+ if (EnableColdCCStressTest ||
+ (TTI.useColdCCForColdCall(*F) &&
+ isValidCandidateForColdCC(*F, GetBFI, AllCallsCold))) {
+ F->setCallingConv(CallingConv::Cold);
+ changeCallSitesToColdCC(F);
+ Changed = true;
+ NumColdCC++;
+ }
+ }
+
+ if (hasChangeableCC(F) && !F->isVarArg() &&
+ !F->hasAddressTaken()) {
+ // If this function has a calling convention worth changing, is not a
+ // varargs function, and is only called directly, promote it to use the
+ // Fast calling convention.
+ F->setCallingConv(CallingConv::Fast);
+ ChangeCalleesToFastCall(F);
+ ++NumFastCallFns;
+ Changed = true;
+ }
+
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Nest) &&
+ !F->hasAddressTaken()) {
+ // The function is not used by a trampoline intrinsic, so it is safe
+ // to remove the 'nest' attribute.
+ RemoveAttribute(F, Attribute::Nest);
+ ++NumNestRemoved;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
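Similarly, a minimal hypothetical sketch of the ccc -> fastcc promotion performed above: a file-static helper that is only ever called directly, never has its address taken, and takes a fixed argument list can have its calling convention and all of its call sites switched to CallingConv::Fast. In practice the inliner may consume such a tiny helper before GlobalOpt sees it; the sketch only illustrates the conditions checked above.

    // fastcc_candidate.cpp (hypothetical example)
    // 'clamp_index' has local linkage, a fixed argument list, and its address
    // is never stored or compared, so GlobalOpt may retag the function and its
    // direct call sites with the Fast calling convention.
    static int clamp_index(int i, int n) {
      if (i < 0)
        return 0;
      return i < n ? i : n - 1;
    }

    int lookup(const int *table, int n, int i) {
      return table[clamp_index(i, n)];    // direct call; address not taken
    }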
+
+static bool
+OptimizeGlobalVars(Module &M,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ bool Changed = false;
+
+ for (Module::global_iterator GVI = M.global_begin(), E = M.global_end();
+ GVI != E; ) {
+ GlobalVariable *GV = &*GVI++;
+ // Global variables without names cannot be referenced outside this module.
+ if (!GV->hasName() && !GV->isDeclaration() && !GV->hasLocalLinkage())
+ GV->setLinkage(GlobalValue::InternalLinkage);
+ // Simplify the initializer.
+ if (GV->hasInitializer())
+ if (auto *C = dyn_cast<Constant>(GV->getInitializer())) {
+ auto &DL = M.getDataLayout();
+ // TLI is not used in the case of a Constant, so use default nullptr
+ // for that optional parameter, since we don't have a Function to
+ // provide GetTLI anyway.
+ Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
+ if (New != C)
+ GV->setInitializer(New);
+ }
+
+ if (deleteIfDead(*GV, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ Changed |= processGlobal(*GV, GetTLI, LookupDomTree);
+ }
+ return Changed;
+}
+
+/// Evaluate a piece of a constantexpr store into a global initializer. This
+/// returns 'Init' modified to reflect 'Val' stored into it. At this point, the
+/// GEP operands of Addr [0, OpNo) have been stepped into.
+static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
+ ConstantExpr *Addr, unsigned OpNo) {
+ // Base case of the recursion.
+ if (OpNo == Addr->getNumOperands()) {
+ assert(Val->getType() == Init->getType() && "Type mismatch!");
+ return Val;
+ }
+
+ SmallVector<Constant*, 32> Elts;
+ if (StructType *STy = dyn_cast<StructType>(Init->getType())) {
+ // Break up the constant into its elements.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ // Replace the element that we are supposed to.
+ ConstantInt *CU = cast<ConstantInt>(Addr->getOperand(OpNo));
+ unsigned Idx = CU->getZExtValue();
+ assert(Idx < STy->getNumElements() && "Struct index out of range!");
+ Elts[Idx] = EvaluateStoreInto(Elts[Idx], Val, Addr, OpNo+1);
+
+ // Return the modified struct.
+ return ConstantStruct::get(STy, Elts);
+ }
+
+ ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
+ uint64_t NumElts;
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
+
+ // Break up the array into elements.
+ for (uint64_t i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+
+ assert(CI->getZExtValue() < NumElts);
+ Elts[CI->getZExtValue()] =
+ EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
+
+ if (Init->getType()->isArrayTy())
+ return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
+ return ConstantVector::get(Elts);
+}
+
+/// We have decided that Addr (which satisfies the predicate
+/// isSimpleEnoughPointerToCommit) should get Val as its value. Make it happen.
+static void CommitValueTo(Constant *Val, Constant *Addr) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ assert(GV->hasInitializer());
+ GV->setInitializer(Val);
+ return;
+ }
+
+ ConstantExpr *CE = cast<ConstantExpr>(Addr);
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ GV->setInitializer(EvaluateStoreInto(GV->getInitializer(), Val, CE, 2));
+}
+
+/// Given a map of address -> value, where addresses are expected to be some form
+/// of either a global or a constant GEP, set the initializer for the address to
+/// be the value. This performs mostly the same function as CommitValueTo()
+/// and EvaluateStoreInto() but is optimized to be more efficient for the common
+/// case where the set of addresses are GEPs sharing the same underlying global,
+/// processing the GEPs in batches rather than individually.
+///
+/// To give an example, consider the following C++ code adapted from the clang
+/// regression tests:
+/// struct S {
+/// int n = 10;
+/// int m = 2 * n;
+/// S(int a) : n(a) {}
+/// };
+///
+/// template<typename T>
+/// struct U {
+/// T *r = &q;
+/// T q = 42;
+/// U *p = this;
+/// };
+///
+/// U<S> e;
+///
+/// The global static constructor for 'e' will need to initialize 'r' and 'p' of
+/// the outer struct, while also initializing the 'n' and 'm' members of the
+/// inner 'q' struct. This batch algorithm will simply use the general
+/// CommitValueTo() method to handle the complex nested initialization of the
+/// S struct 'q', before processing the outermost members in a single batch.
+/// Using CommitValueTo() for each member of the outer struct is inefficient
+/// when the struct/array is very large, as we end up creating and destroying
+/// constant arrays for each initialization.
+/// For the above case, we expect the following IR to be generated:
+///
+/// %struct.U = type { %struct.S*, %struct.S, %struct.U* }
+/// %struct.S = type { i32, i32 }
+/// @e = global %struct.U { %struct.S* gep inbounds (%struct.U, %struct.U* @e,
+/// i64 0, i32 1),
+/// %struct.S { i32 42, i32 84 }, %struct.U* @e }
+/// The %struct.S { i32 42, i32 84 } inner initializer is treated as a complex
+/// constant expression, while the other two elements of @e are "simple".
+static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
+ SmallVector<std::pair<GlobalVariable*, Constant*>, 32> GVs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> ComplexCEs;
+ SmallVector<std::pair<ConstantExpr*, Constant*>, 32> SimpleCEs;
+ SimpleCEs.reserve(Mem.size());
+
+ for (const auto &I : Mem) {
+ if (auto *GV = dyn_cast<GlobalVariable>(I.first)) {
+ GVs.push_back(std::make_pair(GV, I.second));
+ } else {
+ ConstantExpr *GEP = cast<ConstantExpr>(I.first);
+ // We don't handle the deeply recursive case using the batch method.
+ if (GEP->getNumOperands() > 3)
+ ComplexCEs.push_back(std::make_pair(GEP, I.second));
+ else
+ SimpleCEs.push_back(std::make_pair(GEP, I.second));
+ }
+ }
+
+ // The algorithm below doesn't handle cases like nested structs, so use the
+ // slower fully general method if we have to.
+ for (auto ComplexCE : ComplexCEs)
+ CommitValueTo(ComplexCE.second, ComplexCE.first);
+
+ for (auto GVPair : GVs) {
+ assert(GVPair.first->hasInitializer());
+ GVPair.first->setInitializer(GVPair.second);
+ }
+
+ if (SimpleCEs.empty())
+ return;
+
+ // We cache a single global's initializer elements in the case where the
+ // subsequent address/val pair uses the same one. This avoids throwing away and
+ // rebuilding the constant struct/vector/array just because one element is
+ // modified at a time.
+ SmallVector<Constant *, 32> Elts;
+ Elts.reserve(SimpleCEs.size());
+ GlobalVariable *CurrentGV = nullptr;
+
+ auto commitAndSetupCache = [&](GlobalVariable *GV, bool Update) {
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (Update) {
+ if (CurrentGV) {
+ assert(CurrentGV && "Expected a GV to commit to!");
+ Type *CurrentInitTy = CurrentGV->getInitializer()->getType();
+ // We have a valid cache that needs to be committed.
+ if (StructType *STy = dyn_cast<StructType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantStruct::get(STy, Elts));
+ else if (ArrayType *ArrTy = dyn_cast<ArrayType>(CurrentInitTy))
+ CurrentGV->setInitializer(ConstantArray::get(ArrTy, Elts));
+ else
+ CurrentGV->setInitializer(ConstantVector::get(Elts));
+ }
+ if (CurrentGV == GV)
+ return;
+ // Need to clear and set up cache for new initializer.
+ CurrentGV = GV;
+ Elts.clear();
+ unsigned NumElts;
+ if (auto *STy = dyn_cast<StructType>(Ty))
+ NumElts = STy->getNumElements();
+ else if (auto *ATy = dyn_cast<ArrayType>(Ty))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = cast<FixedVectorType>(Ty)->getNumElements();
+ for (unsigned i = 0, e = NumElts; i != e; ++i)
+ Elts.push_back(Init->getAggregateElement(i));
+ }
+ };
+
+ for (auto CEPair : SimpleCEs) {
+ ConstantExpr *GEP = CEPair.first;
+ Constant *Val = CEPair.second;
+
+ GlobalVariable *GV = cast<GlobalVariable>(GEP->getOperand(0));
+ commitAndSetupCache(GV, GV != CurrentGV);
+ ConstantInt *CI = cast<ConstantInt>(GEP->getOperand(2));
+ Elts[CI->getZExtValue()] = Val;
+ }
+ // The last initializer in the list needs to be committed, others
+ // will be committed on a new initializer being processed.
+ commitAndSetupCache(CurrentGV, true);
+}
+
+/// Evaluate static constructors in the function, if we can. Return true if we
+/// can, false otherwise.
+static bool EvaluateStaticConstructor(Function *F, const DataLayout &DL,
+ TargetLibraryInfo *TLI) {
+ // Call the function.
+ Evaluator Eval(DL, TLI);
+ Constant *RetValDummy;
+ bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
+ SmallVector<Constant*, 0>());
+
+ if (EvalSuccess) {
+ ++NumCtorsEvaluated;
+
+ // We succeeded at evaluation: commit the result.
+ LLVM_DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
+ << F->getName() << "' to "
+ << Eval.getMutatedMemory().size() << " stores.\n");
+ BatchCommitValueTo(Eval.getMutatedMemory());
+ for (GlobalVariable *GV : Eval.getInvariants())
+ GV->setConstant(true);
+ }
+
+ return EvalSuccess;
+}
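To make the evaluator's job concrete, here is a small, hypothetical C++ example of a dynamic initializer it can typically fold: every store performed by the constructor targets the global being constructed with compile-time-computable values, so the mutated memory can be committed back into the initializer by BatchCommitValueTo() and the entry dropped from llvm.global_ctors. The exact outcome depends on the optimization level and on the Evaluator's limits.

    // static_ctor_eval.cpp (hypothetical example)
    // The constructor only writes i*i into 'table' itself, so the whole
    // dynamic initializer can be simulated and 'table' ends up with a
    // constant aggregate initializer instead of a runtime constructor.
    struct LookupTable {
      int squares[8];
      LookupTable() {
        for (int i = 0; i != 8; ++i)
          squares[i] = i * i;
      }
    };

    LookupTable table;                 // candidate for static evaluation

    int square_of(int i) { return table.squares[i]; }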
+
+static int compareNames(Constant *const *A, Constant *const *B) {
+ Value *AStripped = (*A)->stripPointerCasts();
+ Value *BStripped = (*B)->stripPointerCasts();
+ return AStripped->getName().compare(BStripped->getName());
+}
+
+static void setUsedInitializer(GlobalVariable &V,
+ const SmallPtrSetImpl<GlobalValue *> &Init) {
+ if (Init.empty()) {
+ V.eraseFromParent();
+ return;
+ }
+
+ // Type of pointer to the array of pointers.
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(V.getContext(), 0);
+
+ SmallVector<Constant *, 8> UsedArray;
+ for (GlobalValue *GV : Init) {
+ Constant *Cast
+ = ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV, Int8PtrTy);
+ UsedArray.push_back(Cast);
+ }
+ // Sort to get deterministic order.
+ array_pod_sort(UsedArray.begin(), UsedArray.end(), compareNames);
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, UsedArray.size());
+
+ Module *M = V.getParent();
+ V.removeFromParent();
+ GlobalVariable *NV =
+ new GlobalVariable(*M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, UsedArray), "");
+ NV->takeName(&V);
+ NV->setSection("llvm.metadata");
+ delete &V;
+}
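For reference, a hedged example of where the llvm.used array rebuilt above typically comes from: with clang, the GCC-style used attribute normally records the annotated symbol in @llvm.used, an appending-linkage array of i8* kept in the "llvm.metadata" section, which is exactly the shape setUsedInitializer() recreates. The symbol name below is invented.

    // llvm_used_example.cpp (hypothetical example)
    // The 'used' attribute typically adds a pointer to 'build_tag' to
    // @llvm.used, keeping the string alive even though nothing in the
    // module reads it.
    __attribute__((used))
    static const char build_tag[] = "demo-build-2022";

    int main() { return 0; }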
+
+namespace {
+
+/// An easy to access representation of llvm.used and llvm.compiler.used.
+class LLVMUsed {
+ SmallPtrSet<GlobalValue *, 8> Used;
+ SmallPtrSet<GlobalValue *, 8> CompilerUsed;
+ GlobalVariable *UsedV;
+ GlobalVariable *CompilerUsedV;
+
+public:
+ LLVMUsed(Module &M) {
+ UsedV = collectUsedGlobalVariables(M, Used, false);
+ CompilerUsedV = collectUsedGlobalVariables(M, CompilerUsed, true);
+ }
+
+ using iterator = SmallPtrSet<GlobalValue *, 8>::iterator;
+ using used_iterator_range = iterator_range<iterator>;
+
+ iterator usedBegin() { return Used.begin(); }
+ iterator usedEnd() { return Used.end(); }
+
+ used_iterator_range used() {
+ return used_iterator_range(usedBegin(), usedEnd());
+ }
+
+ iterator compilerUsedBegin() { return CompilerUsed.begin(); }
+ iterator compilerUsedEnd() { return CompilerUsed.end(); }
+
+ used_iterator_range compilerUsed() {
+ return used_iterator_range(compilerUsedBegin(), compilerUsedEnd());
+ }
+
+ bool usedCount(GlobalValue *GV) const { return Used.count(GV); }
+
+ bool compilerUsedCount(GlobalValue *GV) const {
+ return CompilerUsed.count(GV);
+ }
+
+ bool usedErase(GlobalValue *GV) { return Used.erase(GV); }
+ bool compilerUsedErase(GlobalValue *GV) { return CompilerUsed.erase(GV); }
+ bool usedInsert(GlobalValue *GV) { return Used.insert(GV).second; }
+
+ bool compilerUsedInsert(GlobalValue *GV) {
+ return CompilerUsed.insert(GV).second;
+ }
+
+ void syncVariablesAndSets() {
+ if (UsedV)
+ setUsedInitializer(*UsedV, Used);
+ if (CompilerUsedV)
+ setUsedInitializer(*CompilerUsedV, CompilerUsed);
+ }
+};
+
+} // end anonymous namespace
+
+static bool hasUseOtherThanLLVMUsed(GlobalAlias &GA, const LLVMUsed &U) {
+ if (GA.use_empty()) // No use at all.
+ return false;
+
+ assert((!U.usedCount(&GA) || !U.compilerUsedCount(&GA)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (!GA.hasOneUse())
+ // Strictly more than one use. So at least one is not in llvm.used and
+ // llvm.compiler.used.
+ return true;
+
+ // Exactly one use. Check if it is in llvm.used or llvm.compiler.used.
+ return !U.usedCount(&GA) && !U.compilerUsedCount(&GA);
+}
+
+static bool hasMoreThanOneUseOtherThanLLVMUsed(GlobalValue &V,
+ const LLVMUsed &U) {
+ unsigned N = 2;
+ assert((!U.usedCount(&V) || !U.compilerUsedCount(&V)) &&
+ "We should have removed the duplicated "
+ "element from llvm.compiler.used");
+ if (U.usedCount(&V) || U.compilerUsedCount(&V))
+ ++N;
+ return V.hasNUsesOrMore(N);
+}
+
+static bool mayHaveOtherReferences(GlobalAlias &GA, const LLVMUsed &U) {
+ if (!GA.hasLocalLinkage())
+ return true;
+
+ return U.usedCount(&GA) || U.compilerUsedCount(&GA);
+}
+
+static bool hasUsesToReplace(GlobalAlias &GA, const LLVMUsed &U,
+ bool &RenameTarget) {
+ RenameTarget = false;
+ bool Ret = false;
+ if (hasUseOtherThanLLVMUsed(GA, U))
+ Ret = true;
+
+ // If the alias is externally visible, we may still be able to simplify it.
+ if (!mayHaveOtherReferences(GA, U))
+ return Ret;
+
+ // If the aliasee has internal linkage, give it the name and linkage
+ // of the alias, and delete the alias. This turns:
+ // define internal ... @f(...)
+ // @a = alias ... @f
+ // into:
+ // define ... @a(...)
+ Constant *Aliasee = GA.getAliasee();
+ GlobalValue *Target = cast<GlobalValue>(Aliasee->stripPointerCasts());
+ if (!Target->hasLocalLinkage())
+ return Ret;
+
+ // Do not perform the transform if multiple aliases potentially target the
+ // aliasee. This check also ensures that it is safe to replace the section
+ // and other attributes of the aliasee with those of the alias.
+ if (hasMoreThanOneUseOtherThanLLVMUsed(*Target, U))
+ return Ret;
+
+ RenameTarget = true;
+ return true;
+}
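As a concrete, hypothetical instance of the alias pattern described in the comment above, consider GCC-style alias attributes on an ELF toolchain: the externally visible name is an alias for an internal definition, so once the alias's uses are accounted for, OptimizeGlobalAliases() can rename the internal function to the alias's name, copy its linkage and visibility, and drop the alias. Whether the attribute is accepted for an internal aliasee is toolchain-dependent, so treat this as a sketch.

    // alias_resolution.cpp (hypothetical example; ELF, clang or GCC)
    extern "C" {
    // Internal definition: the aliasee with local linkage.
    static void do_work_impl(void) {
      // ... real implementation ...
    }
    // Externally visible alias for the internal definition.
    void do_work(void) __attribute__((alias("do_work_impl")));
    }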
+
+static bool
+OptimizeGlobalAliases(Module &M,
+ SmallPtrSetImpl<const Comdat *> &NotDiscardableComdats) {
+ bool Changed = false;
+ LLVMUsed Used(M);
+
+ for (GlobalValue *GV : Used.used())
+ Used.compilerUsedErase(GV);
+
+ for (Module::alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E;) {
+ GlobalAlias *J = &*I++;
+
+ // Aliases without names cannot be referenced outside this module.
+ if (!J->hasName() && !J->isDeclaration() && !J->hasLocalLinkage())
+ J->setLinkage(GlobalValue::InternalLinkage);
+
+ if (deleteIfDead(*J, NotDiscardableComdats)) {
+ Changed = true;
+ continue;
+ }
+
+ // If the alias can change at link time, nothing can be done - bail out.
+ if (J->isInterposable())
+ continue;
+
+ Constant *Aliasee = J->getAliasee();
+ GlobalValue *Target = dyn_cast<GlobalValue>(Aliasee->stripPointerCasts());
+ // We can't trivially replace the alias with the aliasee if the aliasee is
+ // non-trivial in some way.
+ // TODO: Try to handle non-zero GEPs of local aliasees.
+ if (!Target)
+ continue;
+ Target->removeDeadConstantUsers();
+
+ // Make all users of the alias use the aliasee instead.
+ bool RenameTarget;
+ if (!hasUsesToReplace(*J, Used, RenameTarget))
+ continue;
+
+ J->replaceAllUsesWith(ConstantExpr::getBitCast(Aliasee, J->getType()));
+ ++NumAliasesResolved;
+ Changed = true;
+
+ if (RenameTarget) {
+ // Give the aliasee the name, linkage and other attributes of the alias.
+ Target->takeName(&*J);
+ Target->setLinkage(J->getLinkage());
+ Target->setDSOLocal(J->isDSOLocal());
+ Target->setVisibility(J->getVisibility());
+ Target->setDLLStorageClass(J->getDLLStorageClass());
+
+ if (Used.usedErase(&*J))
+ Used.usedInsert(Target);
+
+ if (Used.compilerUsedErase(&*J))
+ Used.compilerUsedInsert(Target);
+ } else if (mayHaveOtherReferences(*J, Used))
+ continue;
+
+ // Delete the alias.
+ M.getAliasList().erase(J);
+ ++NumAliasesRemoved;
+ Changed = true;
+ }
+
+ Used.syncVariablesAndSets();
+
+ return Changed;
+}
+
+static Function *
+FindCXAAtExit(Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+  // Hack to get a default TLI before we have an actual Function.
+ auto FuncIter = M.begin();
+ if (FuncIter == M.end())
+ return nullptr;
+ auto *TLI = &GetTLI(*FuncIter);
+
+ LibFunc F = LibFunc_cxa_atexit;
+ if (!TLI->has(F))
+ return nullptr;
+
+ Function *Fn = M.getFunction(TLI->getName(F));
+ if (!Fn)
+ return nullptr;
+
+ // Now get the actual TLI for Fn.
+ TLI = &GetTLI(*Fn);
+
+ // Make sure that the function has the correct prototype.
+ if (!TLI->getLibFunc(*Fn, F) || F != LibFunc_cxa_atexit)
+ return nullptr;
+
+ return Fn;
+}
+
+/// Returns whether the given function is an empty C++ destructor and can
+/// therefore be eliminated.
+/// Note that we assume that other optimization passes have already simplified
+/// the code so we simply check for 'ret'.
+static bool cxxDtorIsEmpty(const Function &Fn) {
+ // FIXME: We could eliminate C++ destructors if they're readonly/readnone and
+ // nounwind, but that doesn't seem worth doing.
+ if (Fn.isDeclaration())
+ return false;
+
+ for (auto &I : Fn.getEntryBlock()) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ if (isa<ReturnInst>(I))
+ return true;
+ break;
+ }
+ return false;
+}
+
+static bool OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
+ /// Itanium C++ ABI p3.3.5:
+ ///
+ /// After constructing a global (or local static) object, that will require
+ /// destruction on exit, a termination function is registered as follows:
+ ///
+ /// extern "C" int __cxa_atexit ( void (*f)(void *), void *p, void *d );
+ ///
+ /// This registration, e.g. __cxa_atexit(f,p,d), is intended to cause the
+ /// call f(p) when DSO d is unloaded, before all such termination calls
+ /// registered before this one. It returns zero if registration is
+ /// successful, nonzero on failure.
+
+ // This pass will look for calls to __cxa_atexit where the function is trivial
+ // and remove them.
+ bool Changed = false;
+
+ for (auto I = CXAAtExitFn->user_begin(), E = CXAAtExitFn->user_end();
+ I != E;) {
+ // We're only interested in calls. Theoretically, we could handle invoke
+ // instructions as well, but neither llvm-gcc nor clang generate invokes
+ // to __cxa_atexit.
+ CallInst *CI = dyn_cast<CallInst>(*I++);
+ if (!CI)
+ continue;
+
+ Function *DtorFn =
+ dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
+ if (!DtorFn || !cxxDtorIsEmpty(*DtorFn))
+ continue;
+
+ // Just remove the call.
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ CI->eraseFromParent();
+
+ ++NumCXXDtorsRemoved;
+
+ Changed |= true;
+ }
+
+ return Changed;
+}
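A hedged C++ example of what this removal targets: a global object whose destructor body is empty (or becomes empty after earlier simplification), so the Itanium-ABI registration, roughly __cxa_atexit(&Session::~Session, &g_session, &__dso_handle), does nothing useful at exit and the call can be deleted. The class name is invented.

    // empty_dtor.cpp (hypothetical example)
    struct Session {
      int id = 0;
      ~Session() {}      // reduces to a bare 'ret'; cxxDtorIsEmpty() accepts it
    };

    Session g_session;    // registered via __cxa_atexit during static init

    int session_id() { return g_session.id; }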
+
+static bool optimizeGlobalsInModule(
+ Module &M, const DataLayout &DL,
+ function_ref<TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ SmallPtrSet<const Comdat *, 8> NotDiscardableComdats;
+ bool Changed = false;
+ bool LocalChange = true;
+ while (LocalChange) {
+ LocalChange = false;
+
+ NotDiscardableComdats.clear();
+ for (const GlobalVariable &GV : M.globals())
+ if (const Comdat *C = GV.getComdat())
+ if (!GV.isDiscardableIfUnused() || !GV.use_empty())
+ NotDiscardableComdats.insert(C);
+ for (Function &F : M)
+ if (const Comdat *C = F.getComdat())
+ if (!F.isDefTriviallyDead())
+ NotDiscardableComdats.insert(C);
+ for (GlobalAlias &GA : M.aliases())
+ if (const Comdat *C = GA.getComdat())
+ if (!GA.isDiscardableIfUnused() || !GA.use_empty())
+ NotDiscardableComdats.insert(C);
+
+ // Delete functions that are trivially dead, ccc -> fastcc
+ LocalChange |= OptimizeFunctions(M, GetTLI, GetTTI, GetBFI, LookupDomTree,
+ NotDiscardableComdats);
+
+ // Optimize global_ctors list.
+ LocalChange |= optimizeGlobalCtorsList(M, [&](Function *F) {
+ return EvaluateStaticConstructor(F, DL, &GetTLI(*F));
+ });
+
+ // Optimize non-address-taken globals.
+ LocalChange |=
+ OptimizeGlobalVars(M, GetTLI, LookupDomTree, NotDiscardableComdats);
+
+ // Resolve aliases, when possible.
+ LocalChange |= OptimizeGlobalAliases(M, NotDiscardableComdats);
+
+ // Try to remove trivial global destructors if they are not removed
+ // already.
+ Function *CXAAtExitFn = FindCXAAtExit(M, GetTLI);
+ if (CXAAtExitFn)
+ LocalChange |= OptimizeEmptyGlobalCXXDtors(CXAAtExitFn);
+
+ Changed |= LocalChange;
+ }
+
+ // TODO: Move all global ctors functions to the end of the module for code
+ // layout.
+
+ return Changed;
+}
+
+PreservedAnalyses GlobalOptPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &DL = M.getDataLayout();
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree &{
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI, LookupDomTree))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+
+struct GlobalOptLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ GlobalOptLegacyPass() : ModulePass(ID) {
+ initializeGlobalOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto &DL = M.getDataLayout();
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ return optimizeGlobalsInModule(M, DL, GetTLI, GetTTI, GetBFI,
+ LookupDomTree);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char GlobalOptLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GlobalOptLegacyPass, "globalopt",
+ "Global Variable Optimizer", false, false)
+
+ModulePass *llvm::createGlobalOptimizerPass() {
+ return new GlobalOptLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
index 289099af3a..365b269dc3 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/GlobalSplit.cpp
@@ -1,196 +1,196 @@
-//===- GlobalSplit.cpp - global variable splitter -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass uses inrange annotations on GEP indices to split globals where
-// beneficial. Clang currently attaches these annotations to references to
-// virtual table globals under the Itanium ABI for the benefit of the
-// whole-program virtual call optimization and control flow integrity passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/GlobalSplit.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/IPO.h"
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-static bool splitGlobal(GlobalVariable &GV) {
- // If the address of the global is taken outside of the module, we cannot
- // apply this transformation.
- if (!GV.hasLocalLinkage())
- return false;
-
- // We currently only know how to split ConstantStructs.
- auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
- if (!Init)
- return false;
-
- // Verify that each user of the global is an inrange getelementptr constant.
- // From this it follows that any loads from or stores to that global must use
- // a pointer derived from an inrange getelementptr constant, which is
- // sufficient to allow us to apply the splitting transform.
- for (User *U : GV.users()) {
- if (!isa<Constant>(U))
- return false;
-
- auto *GEP = dyn_cast<GEPOperator>(U);
- if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
- !isa<ConstantInt>(GEP->getOperand(1)) ||
- !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
- !isa<ConstantInt>(GEP->getOperand(2)))
- return false;
- }
-
- SmallVector<MDNode *, 2> Types;
- GV.getMetadata(LLVMContext::MD_type, Types);
-
- const DataLayout &DL = GV.getParent()->getDataLayout();
- const StructLayout *SL = DL.getStructLayout(Init->getType());
-
- IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
-
- std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
- for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
- // Build a global representing this split piece.
- auto *SplitGV =
- new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
- GV.isConstant(), GlobalValue::PrivateLinkage,
- Init->getOperand(I), GV.getName() + "." + utostr(I));
- SplitGlobals[I] = SplitGV;
-
- unsigned SplitBegin = SL->getElementOffset(I);
- unsigned SplitEnd = (I == Init->getNumOperands() - 1)
- ? SL->getSizeInBytes()
- : SL->getElementOffset(I + 1);
-
- // Rebuild type metadata, adjusting by the split offset.
- // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
- for (MDNode *Type : Types) {
- uint64_t ByteOffset = cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- // Type metadata may be attached one byte after the end of the vtable, for
-      // classes without virtual methods in the Itanium ABI. AFAIK, it is never
- // attached to the first byte of a vtable. Subtract one to get the right
- // slice.
- // This is making an assumption that vtable groups are the only kinds of
- // global variables that !type metadata can be attached to, and that they
- // are either Itanium ABI vtable groups or contain a single vtable (i.e.
- // Microsoft ABI vtables).
- uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
- if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
- continue;
- SplitGV->addMetadata(
- LLVMContext::MD_type,
- *MDNode::get(GV.getContext(),
- {ConstantAsMetadata::get(
- ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
- Type->getOperand(1)}));
- }
-
- if (GV.hasMetadata(LLVMContext::MD_vcall_visibility))
- SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
- }
-
- for (User *U : GV.users()) {
- auto *GEP = cast<GEPOperator>(U);
- unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
- if (I >= SplitGlobals.size())
- continue;
-
- SmallVector<Value *, 4> Ops;
- Ops.push_back(ConstantInt::get(Int32Ty, 0));
- for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
- Ops.push_back(GEP->getOperand(I));
-
- auto *NewGEP = ConstantExpr::getGetElementPtr(
- SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
- GEP->isInBounds());
- GEP->replaceAllUsesWith(NewGEP);
- }
-
- // Finally, remove the original global. Any remaining uses refer to invalid
- // elements of the global, so replace with undef.
- if (!GV.use_empty())
- GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
- GV.eraseFromParent();
- return true;
-}
-
-static bool splitGlobals(Module &M) {
- // First, see if the module uses either of the llvm.type.test or
- // llvm.type.checked.load intrinsics, which indicates that splitting globals
- // may be beneficial.
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
- (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
- return false;
-
- bool Changed = false;
- for (auto I = M.global_begin(); I != M.global_end();) {
- GlobalVariable &GV = *I;
- ++I;
- Changed |= splitGlobal(GV);
- }
- return Changed;
-}
-
-namespace {
-
-struct GlobalSplit : public ModulePass {
- static char ID;
-
- GlobalSplit() : ModulePass(ID) {
- initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return splitGlobals(M);
- }
-};
-
-} // end anonymous namespace
-
-char GlobalSplit::ID = 0;
-
-INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
-
-ModulePass *llvm::createGlobalSplitPass() {
- return new GlobalSplit;
-}
-
-PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!splitGlobals(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+//===- GlobalSplit.cpp - global variable splitter -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass uses inrange annotations on GEP indices to split globals where
+// beneficial. Clang currently attaches these annotations to references to
+// virtual table globals under the Itanium ABI for the benefit of the
+// whole-program virtual call optimization and control flow integrity passes.
+//
+//===----------------------------------------------------------------------===//
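To ground the description above, a heavily hedged C++ sketch of the kind of global this pass splits: a vtable emitted for a polymorphic class when whole-program vtable optimization or CFI is enabled. With flags along the lines of -flto -fvisibility=hidden -fsanitize=cfi-vcall (exact spelling varies by toolchain), clang attaches !type metadata to the vtable and references it through inrange getelementptr constants; if LTO internalization then gives the vtable local linkage, it matches the preconditions splitGlobal() checks. None of this is guaranteed for a particular build, so treat it purely as an illustration.

    // vtable_split.cpp (hypothetical example)
    struct Shape {
      virtual ~Shape() {}
      virtual double area() const { return 0.0; }
    };

    struct Circle final : Shape {
      explicit Circle(double r) : radius(r) {}
      double area() const override { return 3.14159 * radius * radius; }
      double radius;
    };

    // Indirect call through the vtable; with CFI this is preceded by a
    // type test against the !type metadata on the (possibly split) vtable.
    double area_of(const Shape &s) { return s.area(); }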
+
+#include "llvm/Transforms/IPO/GlobalSplit.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/IPO.h"
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+static bool splitGlobal(GlobalVariable &GV) {
+ // If the address of the global is taken outside of the module, we cannot
+ // apply this transformation.
+ if (!GV.hasLocalLinkage())
+ return false;
+
+ // We currently only know how to split ConstantStructs.
+ auto *Init = dyn_cast_or_null<ConstantStruct>(GV.getInitializer());
+ if (!Init)
+ return false;
+
+ // Verify that each user of the global is an inrange getelementptr constant.
+ // From this it follows that any loads from or stores to that global must use
+ // a pointer derived from an inrange getelementptr constant, which is
+ // sufficient to allow us to apply the splitting transform.
+ for (User *U : GV.users()) {
+ if (!isa<Constant>(U))
+ return false;
+
+ auto *GEP = dyn_cast<GEPOperator>(U);
+ if (!GEP || !GEP->getInRangeIndex() || *GEP->getInRangeIndex() != 1 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ !isa<ConstantInt>(GEP->getOperand(2)))
+ return false;
+ }
+
+ SmallVector<MDNode *, 2> Types;
+ GV.getMetadata(LLVMContext::MD_type, Types);
+
+ const DataLayout &DL = GV.getParent()->getDataLayout();
+ const StructLayout *SL = DL.getStructLayout(Init->getType());
+
+ IntegerType *Int32Ty = Type::getInt32Ty(GV.getContext());
+
+ std::vector<GlobalVariable *> SplitGlobals(Init->getNumOperands());
+ for (unsigned I = 0; I != Init->getNumOperands(); ++I) {
+ // Build a global representing this split piece.
+ auto *SplitGV =
+ new GlobalVariable(*GV.getParent(), Init->getOperand(I)->getType(),
+ GV.isConstant(), GlobalValue::PrivateLinkage,
+ Init->getOperand(I), GV.getName() + "." + utostr(I));
+ SplitGlobals[I] = SplitGV;
+
+ unsigned SplitBegin = SL->getElementOffset(I);
+ unsigned SplitEnd = (I == Init->getNumOperands() - 1)
+ ? SL->getSizeInBytes()
+ : SL->getElementOffset(I + 1);
+
+ // Rebuild type metadata, adjusting by the split offset.
+ // FIXME: See if we can use DW_OP_piece to preserve debug metadata here.
+ for (MDNode *Type : Types) {
+ uint64_t ByteOffset = cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ // Type metadata may be attached one byte after the end of the vtable, for
+ // classes without virtual methods in Itanium ABI. AFAIK, it is never
+ // attached to the first byte of a vtable. Subtract one to get the right
+ // slice.
+ // This is making an assumption that vtable groups are the only kinds of
+ // global variables that !type metadata can be attached to, and that they
+ // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+ // Microsoft ABI vtables).
+ uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+ if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
+ continue;
+ SplitGV->addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(GV.getContext(),
+ {ConstantAsMetadata::get(
+ ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
+ Type->getOperand(1)}));
+ }
+
+ if (GV.hasMetadata(LLVMContext::MD_vcall_visibility))
+ SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
+ }
+
+ for (User *U : GV.users()) {
+ auto *GEP = cast<GEPOperator>(U);
+ unsigned I = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
+ if (I >= SplitGlobals.size())
+ continue;
+
+ SmallVector<Value *, 4> Ops;
+ Ops.push_back(ConstantInt::get(Int32Ty, 0));
+ for (unsigned I = 3; I != GEP->getNumOperands(); ++I)
+ Ops.push_back(GEP->getOperand(I));
+
+ auto *NewGEP = ConstantExpr::getGetElementPtr(
+ SplitGlobals[I]->getInitializer()->getType(), SplitGlobals[I], Ops,
+ GEP->isInBounds());
+ GEP->replaceAllUsesWith(NewGEP);
+ }
+
+ // Finally, remove the original global. Any remaining uses refer to invalid
+ // elements of the global, so replace with undef.
+ if (!GV.use_empty())
+ GV.replaceAllUsesWith(UndefValue::get(GV.getType()));
+ GV.eraseFromParent();
+ return true;
+}
+
+static bool splitGlobals(Module &M) {
+ // First, see if the module uses either of the llvm.type.test or
+ // llvm.type.checked.load intrinsics, which indicates that splitting globals
+ // may be beneficial.
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ bool Changed = false;
+ for (auto I = M.global_begin(); I != M.global_end();) {
+ GlobalVariable &GV = *I;
+ ++I;
+ Changed |= splitGlobal(GV);
+ }
+ return Changed;
+}
+
+namespace {
+
+struct GlobalSplit : public ModulePass {
+ static char ID;
+
+ GlobalSplit() : ModulePass(ID) {
+ initializeGlobalSplitPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return splitGlobals(M);
+ }
+};
+
+} // end anonymous namespace
+
+char GlobalSplit::ID = 0;
+
+INITIALIZE_PASS(GlobalSplit, "globalsplit", "Global splitter", false, false)
+
+ModulePass *llvm::createGlobalSplitPass() {
+ return new GlobalSplit;
+}
+
+PreservedAnalyses GlobalSplitPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!splitGlobals(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
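
For context on the restored file: splitGlobal above rewrites a local ConstantStruct vtable group into one private global per element (named <original>.<index>), and the transform is exposed both as the legacy "globalsplit" pass and via GlobalSplitPass in the new pass manager. A minimal, illustrative driver sketch, not taken from the file above and assuming only stock LLVM 12 new-pass-manager headers, with an invented function name:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/GlobalSplit.h"

// Sketch: run just the global splitter over a module, roughly what
// `opt -passes=globalsplit` would do.
static void runGlobalSplitOnly(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;

  // Register the stock analyses; GlobalSplitPass itself only needs the
  // module analysis manager to satisfy the pass-manager plumbing.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::GlobalSplitPass());
  MPM.run(M, MAM);
}

Note that splitGlobals bails out early unless the module actually uses llvm.type.test or llvm.type.checked.load, so running the pass on ordinary IR is a no-op.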
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
index 35dcaf85db..aa708ee520 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -1,91 +1,91 @@
-//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// The goal of hot/cold splitting is to improve the memory locality of code.
-/// The splitting pass does this by identifying cold blocks and moving them into
-/// separate functions.
-///
-/// When the splitting pass finds a cold block (referred to as "the sink"), it
-/// grows a maximal cold region around that block. The maximal region contains
-/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
-/// cold as the sink. Once a region is found, it's split out of the original
-/// function provided it's profitable to do so.
-///
-/// [*] In practice, there is some added complexity because some blocks are not
-/// safe to extract.
-///
-/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
-/// TODO: Reorder outlined functions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/HotColdSplitting.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
+//===- HotColdSplitting.cpp -- Outline Cold Regions -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The goal of hot/cold splitting is to improve the memory locality of code.
+/// The splitting pass does this by identifying cold blocks and moving them into
+/// separate functions.
+///
+/// When the splitting pass finds a cold block (referred to as "the sink"), it
+/// grows a maximal cold region around that block. The maximal region contains
+/// all blocks (post-)dominated by the sink [*]. In theory, these blocks are as
+/// cold as the sink. Once a region is found, it's split out of the original
+/// function provided it's profitable to do so.
+///
+/// [*] In practice, there is some added complexity because some blocks are not
+/// safe to extract.
+///
+/// TODO: Use the PM to get domtrees, and preserve BFI/BPI.
+/// TODO: Reorder outlined functions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/HotColdSplitting.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
#include <limits>
-#include <cassert>
+#include <cassert>
#include <string>
-
-#define DEBUG_TYPE "hotcoldsplit"
-
-STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
-STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
-
-using namespace llvm;
-
+
+#define DEBUG_TYPE "hotcoldsplit"
+
+STATISTIC(NumColdRegionsFound, "Number of cold regions found.");
+STATISTIC(NumColdRegionsOutlined, "Number of cold regions outlined.");
+
+using namespace llvm;
+
static cl::opt<bool> EnableStaticAnalysis("hot-cold-static-analysis",
cl::init(true), cl::Hidden);
-
-static cl::opt<int>
- SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
- cl::desc("Base penalty for splitting cold code (as a "
- "multiple of TCC_Basic)"));
-
+
+static cl::opt<int>
+ SplittingThreshold("hotcoldsplit-threshold", cl::init(2), cl::Hidden,
+ cl::desc("Base penalty for splitting cold code (as a "
+ "multiple of TCC_Basic)"));
+
static cl::opt<bool> EnableColdSection(
"enable-cold-section", cl::init(false), cl::Hidden,
cl::desc("Enable placement of extracted cold functions"
@@ -101,187 +101,187 @@ static cl::opt<int> MaxParametersForSplit(
"hotcoldsplit-max-params", cl::init(4), cl::Hidden,
cl::desc("Maximum number of parameters for a split function"));
-namespace {
-// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
-// this function unless you modify the MBB version as well.
-//
-/// A no successor, non-return block probably ends in unreachable and is cold.
-/// Also consider a block that ends in an indirect branch to be a return block,
-/// since many targets use plain indirect branches to return.
-bool blockEndsInUnreachable(const BasicBlock &BB) {
- if (!succ_empty(&BB))
- return false;
- if (BB.empty())
- return true;
- const Instruction *I = BB.getTerminator();
- return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
-}
-
-bool unlikelyExecuted(BasicBlock &BB) {
- // Exception handling blocks are unlikely executed.
- if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
- return true;
-
- // The block is cold if it calls/invokes a cold function. However, do not
- // mark sanitizer traps as cold.
- for (Instruction &I : BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
- return true;
-
- // The block is cold if it has an unreachable terminator, unless it's
- // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
- if (blockEndsInUnreachable(BB)) {
- if (auto *CI =
- dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
- if (CI->hasFnAttr(Attribute::NoReturn))
- return false;
- return true;
- }
-
- return false;
-}
-
-/// Check whether it's safe to outline \p BB.
-static bool mayExtractBlock(const BasicBlock &BB) {
- // EH pads are unsafe to outline because doing so breaks EH type tables. It
- // follows that invoke instructions cannot be extracted, because CodeExtractor
- // requires unwind destinations to be within the extraction region.
- //
- // Resumes that are not reachable from a cleanup landing pad are considered to
- // be unreachable. It’s not safe to split them out either.
- auto Term = BB.getTerminator();
- return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
- !isa<ResumeInst>(Term);
-}
-
-/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
-/// If \p UpdateEntryCount is true (set when this is a new split function and
-/// module has profile data), set entry count to 0 to ensure treated as cold.
-/// Return true if the function is changed.
-static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
- assert(!F.hasOptNone() && "Can't mark this cold");
- bool Changed = false;
- if (!F.hasFnAttribute(Attribute::Cold)) {
- F.addFnAttr(Attribute::Cold);
- Changed = true;
- }
- if (!F.hasFnAttribute(Attribute::MinSize)) {
- F.addFnAttr(Attribute::MinSize);
- Changed = true;
- }
- if (UpdateEntryCount) {
- // Set the entry count to 0 to ensure it is placed in the unlikely text
- // section when function sections are enabled.
- F.setEntryCount(0);
- Changed = true;
- }
-
- return Changed;
-}
-
-class HotColdSplittingLegacyPass : public ModulePass {
-public:
- static char ID;
- HotColdSplittingLegacyPass() : ModulePass(ID) {
- initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addUsedIfAvailable<AssumptionCacheTracker>();
- }
-
- bool runOnModule(Module &M) override;
-};
-
-} // end anonymous namespace
-
-/// Check whether \p F is inherently cold.
-bool HotColdSplitting::isFunctionCold(const Function &F) const {
- if (F.hasFnAttribute(Attribute::Cold))
- return true;
-
- if (F.getCallingConv() == CallingConv::Cold)
- return true;
-
- if (PSI->isFunctionEntryCold(&F))
- return true;
-
- return false;
-}
-
-// Returns false if the function should not be considered for hot-cold split
-// optimization.
-bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
- if (F.hasFnAttribute(Attribute::AlwaysInline))
- return false;
-
- if (F.hasFnAttribute(Attribute::NoInline))
- return false;
-
- // A function marked `noreturn` may contain unreachable terminators: these
- // should not be considered cold, as the function may be a trampoline.
- if (F.hasFnAttribute(Attribute::NoReturn))
- return false;
-
- if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
- F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
- F.hasFnAttribute(Attribute::SanitizeThread) ||
- F.hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- return true;
-}
-
-/// Get the benefit score of outlining \p Region.
+namespace {
+// Same as blockEndsInUnreachable in CodeGen/BranchFolding.cpp. Do not modify
+// this function unless you modify the MBB version as well.
+//
+/// A no successor, non-return block probably ends in unreachable and is cold.
+/// Also consider a block that ends in an indirect branch to be a return block,
+/// since many targets use plain indirect branches to return.
+bool blockEndsInUnreachable(const BasicBlock &BB) {
+ if (!succ_empty(&BB))
+ return false;
+ if (BB.empty())
+ return true;
+ const Instruction *I = BB.getTerminator();
+ return !(isa<ReturnInst>(I) || isa<IndirectBrInst>(I));
+}
+
+bool unlikelyExecuted(BasicBlock &BB) {
+ // Exception handling blocks are unlikely executed.
+ if (BB.isEHPad() || isa<ResumeInst>(BB.getTerminator()))
+ return true;
+
+ // The block is cold if it calls/invokes a cold function. However, do not
+ // mark sanitizer traps as cold.
+ for (Instruction &I : BB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
+ return true;
+
+ // The block is cold if it has an unreachable terminator, unless it's
+ // preceded by a call to a (possibly warm) noreturn call (e.g. longjmp).
+ if (blockEndsInUnreachable(BB)) {
+ if (auto *CI =
+ dyn_cast_or_null<CallInst>(BB.getTerminator()->getPrevNode()))
+ if (CI->hasFnAttr(Attribute::NoReturn))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+/// Check whether it's safe to outline \p BB.
+static bool mayExtractBlock(const BasicBlock &BB) {
+ // EH pads are unsafe to outline because doing so breaks EH type tables. It
+ // follows that invoke instructions cannot be extracted, because CodeExtractor
+ // requires unwind destinations to be within the extraction region.
+ //
+ // Resumes that are not reachable from a cleanup landing pad are considered to
+ // be unreachable. It’s not safe to split them out either.
+ auto Term = BB.getTerminator();
+ return !BB.hasAddressTaken() && !BB.isEHPad() && !isa<InvokeInst>(Term) &&
+ !isa<ResumeInst>(Term);
+}
+
+/// Mark \p F cold. Based on this assumption, also optimize it for minimum size.
+/// If \p UpdateEntryCount is true (set when this is a new split function and
+/// module has profile data), set entry count to 0 to ensure treated as cold.
+/// Return true if the function is changed.
+static bool markFunctionCold(Function &F, bool UpdateEntryCount = false) {
+ assert(!F.hasOptNone() && "Can't mark this cold");
+ bool Changed = false;
+ if (!F.hasFnAttribute(Attribute::Cold)) {
+ F.addFnAttr(Attribute::Cold);
+ Changed = true;
+ }
+ if (!F.hasFnAttribute(Attribute::MinSize)) {
+ F.addFnAttr(Attribute::MinSize);
+ Changed = true;
+ }
+ if (UpdateEntryCount) {
+ // Set the entry count to 0 to ensure it is placed in the unlikely text
+ // section when function sections are enabled.
+ F.setEntryCount(0);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+class HotColdSplittingLegacyPass : public ModulePass {
+public:
+ static char ID;
+ HotColdSplittingLegacyPass() : ModulePass(ID) {
+ initializeHotColdSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addUsedIfAvailable<AssumptionCacheTracker>();
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // end anonymous namespace
+
+/// Check whether \p F is inherently cold.
+bool HotColdSplitting::isFunctionCold(const Function &F) const {
+ if (F.hasFnAttribute(Attribute::Cold))
+ return true;
+
+ if (F.getCallingConv() == CallingConv::Cold)
+ return true;
+
+ if (PSI->isFunctionEntryCold(&F))
+ return true;
+
+ return false;
+}
+
+// Returns false if the function should not be considered for hot-cold split
+// optimization.
+bool HotColdSplitting::shouldOutlineFrom(const Function &F) const {
+ if (F.hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
+ if (F.hasFnAttribute(Attribute::NoInline))
+ return false;
+
+ // A function marked `noreturn` may contain unreachable terminators: these
+ // should not be considered cold, as the function may be a trampoline.
+ if (F.hasFnAttribute(Attribute::NoReturn))
+ return false;
+
+ if (F.hasFnAttribute(Attribute::SanitizeAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeHWAddress) ||
+ F.hasFnAttribute(Attribute::SanitizeThread) ||
+ F.hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ return true;
+}
+
+/// Get the benefit score of outlining \p Region.
static InstructionCost getOutliningBenefit(ArrayRef<BasicBlock *> Region,
TargetTransformInfo &TTI) {
- // Sum up the code size costs of non-terminator instructions. Tight coupling
- // with \ref getOutliningPenalty is needed to model the costs of terminators.
+ // Sum up the code size costs of non-terminator instructions. Tight coupling
+ // with \ref getOutliningPenalty is needed to model the costs of terminators.
InstructionCost Benefit = 0;
- for (BasicBlock *BB : Region)
- for (Instruction &I : BB->instructionsWithoutDebug())
- if (&I != BB->getTerminator())
- Benefit +=
- TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
-
- return Benefit;
-}
-
-/// Get the penalty score for outlining \p Region.
-static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
- unsigned NumInputs, unsigned NumOutputs) {
- int Penalty = SplittingThreshold;
- LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
-
- // If the splitting threshold is set at or below zero, skip the usual
- // profitability check.
- if (SplittingThreshold <= 0)
- return Penalty;
-
- // Find the number of distinct exit blocks for the region. Use a conservative
- // check to determine whether control returns from the region.
- bool NoBlocksReturn = true;
- SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
- for (BasicBlock *BB : Region) {
- // If a block has no successors, only assume it does not return if it's
- // unreachable.
- if (succ_empty(BB)) {
- NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
- continue;
- }
-
- for (BasicBlock *SuccBB : successors(BB)) {
+ for (BasicBlock *BB : Region)
+ for (Instruction &I : BB->instructionsWithoutDebug())
+ if (&I != BB->getTerminator())
+ Benefit +=
+ TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
+
+ return Benefit;
+}
+
+/// Get the penalty score for outlining \p Region.
+static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
+ unsigned NumInputs, unsigned NumOutputs) {
+ int Penalty = SplittingThreshold;
+ LLVM_DEBUG(dbgs() << "Applying penalty for splitting: " << Penalty << "\n");
+
+ // If the splitting threshold is set at or below zero, skip the usual
+ // profitability check.
+ if (SplittingThreshold <= 0)
+ return Penalty;
+
+ // Find the number of distinct exit blocks for the region. Use a conservative
+ // check to determine whether control returns from the region.
+ bool NoBlocksReturn = true;
+ SmallPtrSet<BasicBlock *, 2> SuccsOutsideRegion;
+ for (BasicBlock *BB : Region) {
+ // If a block has no successors, only assume it does not return if it's
+ // unreachable.
+ if (succ_empty(BB)) {
+ NoBlocksReturn &= isa<UnreachableInst>(BB->getTerminator());
+ continue;
+ }
+
+ for (BasicBlock *SuccBB : successors(BB)) {
if (!is_contained(Region, SuccBB)) {
- NoBlocksReturn = false;
- SuccsOutsideRegion.insert(SuccBB);
- }
- }
- }
-
+ NoBlocksReturn = false;
+ SuccsOutsideRegion.insert(SuccBB);
+ }
+ }
+ }
+
// Count the number of phis in exit blocks with >= 2 incoming values from the
// outlining region. These phis are split (\ref severSplitPHINodesOfExits),
// and new outputs are created to supply the split phis. CodeExtractor can't
@@ -324,473 +324,473 @@ static int getOutliningPenalty(ArrayRef<BasicBlock *> Region,
const int CostForRegionOutput = 3 * TargetTransformInfo::TCC_Basic;
Penalty += CostForRegionOutput * NumOutputsAndSplitPhis;
- // Apply a `noreturn` bonus.
- if (NoBlocksReturn) {
- LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
- << " non-returning terminators\n");
- Penalty -= Region.size();
- }
-
- // Apply a penalty for having more than one successor outside of the region.
- // This penalty accounts for the switch needed in the caller.
+ // Apply a `noreturn` bonus.
+ if (NoBlocksReturn) {
+ LLVM_DEBUG(dbgs() << "Applying bonus for: " << Region.size()
+ << " non-returning terminators\n");
+ Penalty -= Region.size();
+ }
+
+ // Apply a penalty for having more than one successor outside of the region.
+ // This penalty accounts for the switch needed in the caller.
if (SuccsOutsideRegion.size() > 1) {
- LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
- << " non-region successors\n");
- Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
- }
-
- return Penalty;
-}
-
-Function *HotColdSplitting::extractColdRegion(
- const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
- DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
- OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
- assert(!Region.empty());
-
- // TODO: Pass BFI and BPI to update profile information.
- CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
- /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
- /* AllowAlloca */ false,
- /* Suffix */ "cold." + std::to_string(Count));
-
- // Perform a simple cost/benefit analysis to decide whether or not to permit
- // splitting.
- SetVector<Value *> Inputs, Outputs, Sinks;
- CE.findInputsOutputs(Inputs, Outputs, Sinks);
+ LLVM_DEBUG(dbgs() << "Applying penalty for: " << SuccsOutsideRegion.size()
+ << " non-region successors\n");
+ Penalty += (SuccsOutsideRegion.size() - 1) * TargetTransformInfo::TCC_Basic;
+ }
+
+ return Penalty;
+}
+
+Function *HotColdSplitting::extractColdRegion(
+ const BlockSequence &Region, const CodeExtractorAnalysisCache &CEAC,
+ DominatorTree &DT, BlockFrequencyInfo *BFI, TargetTransformInfo &TTI,
+ OptimizationRemarkEmitter &ORE, AssumptionCache *AC, unsigned Count) {
+ assert(!Region.empty());
+
+ // TODO: Pass BFI and BPI to update profile information.
+ CodeExtractor CE(Region, &DT, /* AggregateArgs */ false, /* BFI */ nullptr,
+ /* BPI */ nullptr, AC, /* AllowVarArgs */ false,
+ /* AllowAlloca */ false,
+ /* Suffix */ "cold." + std::to_string(Count));
+
+ // Perform a simple cost/benefit analysis to decide whether or not to permit
+ // splitting.
+ SetVector<Value *> Inputs, Outputs, Sinks;
+ CE.findInputsOutputs(Inputs, Outputs, Sinks);
InstructionCost OutliningBenefit = getOutliningBenefit(Region, TTI);
- int OutliningPenalty =
- getOutliningPenalty(Region, Inputs.size(), Outputs.size());
- LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
- << ", penalty = " << OutliningPenalty << "\n");
+ int OutliningPenalty =
+ getOutliningPenalty(Region, Inputs.size(), Outputs.size());
+ LLVM_DEBUG(dbgs() << "Split profitability: benefit = " << OutliningBenefit
+ << ", penalty = " << OutliningPenalty << "\n");
if (!OutliningBenefit.isValid() || OutliningBenefit <= OutliningPenalty)
- return nullptr;
-
- Function *OrigF = Region[0]->getParent();
- if (Function *OutF = CE.extractCodeRegion(CEAC)) {
- User *U = *OutF->user_begin();
- CallInst *CI = cast<CallInst>(U);
- NumColdRegionsOutlined++;
- if (TTI.useColdCCForColdCall(*OutF)) {
- OutF->setCallingConv(CallingConv::Cold);
- CI->setCallingConv(CallingConv::Cold);
- }
- CI->setIsNoInline();
-
+ return nullptr;
+
+ Function *OrigF = Region[0]->getParent();
+ if (Function *OutF = CE.extractCodeRegion(CEAC)) {
+ User *U = *OutF->user_begin();
+ CallInst *CI = cast<CallInst>(U);
+ NumColdRegionsOutlined++;
+ if (TTI.useColdCCForColdCall(*OutF)) {
+ OutF->setCallingConv(CallingConv::Cold);
+ CI->setCallingConv(CallingConv::Cold);
+ }
+ CI->setIsNoInline();
+
if (EnableColdSection)
OutF->setSection(ColdSectionName);
else {
if (OrigF->hasSection())
OutF->setSection(OrigF->getSection());
}
-
- markFunctionCold(*OutF, BFI != nullptr);
-
- LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
- &*Region[0]->begin())
- << ore::NV("Original", OrigF) << " split cold code into "
- << ore::NV("Split", OutF);
- });
- return OutF;
- }
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &*Region[0]->begin())
- << "Failed to extract region at block "
- << ore::NV("Block", Region.front());
- });
- return nullptr;
-}
-
-/// A pair of (basic block, score).
-using BlockTy = std::pair<BasicBlock *, unsigned>;
-
-namespace {
-/// A maximal outlining region. This contains all blocks post-dominated by a
-/// sink block, the sink block itself, and all blocks dominated by the sink.
-/// If sink-predecessors and sink-successors cannot be extracted in one region,
-/// the static constructor returns a list of suitable extraction regions.
-class OutliningRegion {
- /// A list of (block, score) pairs. A block's score is non-zero iff it's a
- /// viable sub-region entry point. Blocks with higher scores are better entry
- /// points (i.e. they are more distant ancestors of the sink block).
- SmallVector<BlockTy, 0> Blocks = {};
-
- /// The suggested entry point into the region. If the region has multiple
- /// entry points, all blocks within the region may not be reachable from this
- /// entry point.
- BasicBlock *SuggestedEntryPoint = nullptr;
-
- /// Whether the entire function is cold.
- bool EntireFunctionCold = false;
-
- /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
- static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
- return mayExtractBlock(BB) ? Score : 0;
- }
-
- /// These scores should be lower than the score for predecessor blocks,
- /// because regions starting at predecessor blocks are typically larger.
- static constexpr unsigned ScoreForSuccBlock = 1;
- static constexpr unsigned ScoreForSinkBlock = 1;
-
- OutliningRegion(const OutliningRegion &) = delete;
- OutliningRegion &operator=(const OutliningRegion &) = delete;
-
-public:
- OutliningRegion() = default;
- OutliningRegion(OutliningRegion &&) = default;
- OutliningRegion &operator=(OutliningRegion &&) = default;
-
- static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- std::vector<OutliningRegion> Regions;
- SmallPtrSet<BasicBlock *, 4> RegionBlocks;
-
- Regions.emplace_back();
- OutliningRegion *ColdRegion = &Regions.back();
-
- auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
- RegionBlocks.insert(BB);
- ColdRegion->Blocks.emplace_back(BB, Score);
- };
-
- // The ancestor farthest-away from SinkBB, and also post-dominated by it.
- unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
- ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
- unsigned BestScore = SinkScore;
-
- // Visit SinkBB's ancestors using inverse DFS.
- auto PredIt = ++idf_begin(&SinkBB);
- auto PredEnd = idf_end(&SinkBB);
- while (PredIt != PredEnd) {
- BasicBlock &PredBB = **PredIt;
- bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
-
- // If the predecessor is cold and has no predecessors, the entire
- // function must be cold.
- if (SinkPostDom && pred_empty(&PredBB)) {
- ColdRegion->EntireFunctionCold = true;
- return Regions;
- }
-
- // If SinkBB does not post-dominate a predecessor, do not mark the
- // predecessor (or any of its predecessors) cold.
- if (!SinkPostDom || !mayExtractBlock(PredBB)) {
- PredIt.skipChildren();
- continue;
- }
-
- // Keep track of the post-dominated ancestor farthest away from the sink.
- // The path length is always >= 2, ensuring that predecessor blocks are
- // considered as entry points before the sink block.
- unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
- if (PredScore > BestScore) {
- ColdRegion->SuggestedEntryPoint = &PredBB;
- BestScore = PredScore;
- }
-
- addBlockToRegion(&PredBB, PredScore);
- ++PredIt;
- }
-
- // If the sink can be added to the cold region, do so. It's considered as
- // an entry point before any sink-successor blocks.
- //
- // Otherwise, split cold sink-successor blocks using a separate region.
- // This satisfies the requirement that all extraction blocks other than the
- // first have predecessors within the extraction region.
- if (mayExtractBlock(SinkBB)) {
- addBlockToRegion(&SinkBB, SinkScore);
- if (pred_empty(&SinkBB)) {
- ColdRegion->EntireFunctionCold = true;
- return Regions;
- }
- } else {
- Regions.emplace_back();
- ColdRegion = &Regions.back();
- BestScore = 0;
- }
-
- // Find all successors of SinkBB dominated by SinkBB using DFS.
- auto SuccIt = ++df_begin(&SinkBB);
- auto SuccEnd = df_end(&SinkBB);
- while (SuccIt != SuccEnd) {
- BasicBlock &SuccBB = **SuccIt;
- bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
-
- // Don't allow the backwards & forwards DFSes to mark the same block.
- bool DuplicateBlock = RegionBlocks.count(&SuccBB);
-
- // If SinkBB does not dominate a successor, do not mark the successor (or
- // any of its successors) cold.
- if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
- SuccIt.skipChildren();
- continue;
- }
-
- unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
- if (SuccScore > BestScore) {
- ColdRegion->SuggestedEntryPoint = &SuccBB;
- BestScore = SuccScore;
- }
-
- addBlockToRegion(&SuccBB, SuccScore);
- ++SuccIt;
- }
-
- return Regions;
- }
-
- /// Whether this region has nothing to extract.
- bool empty() const { return !SuggestedEntryPoint; }
-
- /// The blocks in this region.
- ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
-
- /// Whether the entire function containing this region is cold.
- bool isEntireFunctionCold() const { return EntireFunctionCold; }
-
- /// Remove a sub-region from this region and return it as a block sequence.
- BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
- assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
-
- // Remove blocks dominated by the suggested entry point from this region.
- // During the removal, identify the next best entry point into the region.
- // Ensure that the first extracted block is the suggested entry point.
- BlockSequence SubRegion = {SuggestedEntryPoint};
- BasicBlock *NextEntryPoint = nullptr;
- unsigned NextScore = 0;
- auto RegionEndIt = Blocks.end();
- auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
- BasicBlock *BB = Block.first;
- unsigned Score = Block.second;
- bool InSubRegion =
- BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
- if (!InSubRegion && Score > NextScore) {
- NextEntryPoint = BB;
- NextScore = Score;
- }
- if (InSubRegion && BB != SuggestedEntryPoint)
- SubRegion.push_back(BB);
- return InSubRegion;
- });
- Blocks.erase(RegionStartIt, RegionEndIt);
-
- // Update the suggested entry point.
- SuggestedEntryPoint = NextEntryPoint;
-
- return SubRegion;
- }
-};
-} // namespace
-
-bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
- bool Changed = false;
-
- // The set of cold blocks.
- SmallPtrSet<BasicBlock *, 4> ColdBlocks;
-
- // The worklist of non-intersecting regions left to outline.
- SmallVector<OutliningRegion, 2> OutliningWorklist;
-
- // Set up an RPO traversal. Experimentally, this performs better (outlines
- // more) than a PO traversal, because we prevent region overlap by keeping
- // the first region to contain a block.
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- // Calculate domtrees lazily. This reduces compile-time significantly.
- std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDominatorTree> PDT;
-
- // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
- // reduces compile-time significantly. TODO: When we *do* use BFI, we should
- // be able to salvage its domtrees instead of recomputing them.
- BlockFrequencyInfo *BFI = nullptr;
- if (HasProfileSummary)
- BFI = GetBFI(F);
-
- TargetTransformInfo &TTI = GetTTI(F);
- OptimizationRemarkEmitter &ORE = (*GetORE)(F);
- AssumptionCache *AC = LookupAC(F);
-
- // Find all cold regions.
- for (BasicBlock *BB : RPOT) {
- // This block is already part of some outlining region.
- if (ColdBlocks.count(BB))
- continue;
-
- bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
+
+ markFunctionCold(*OutF, BFI != nullptr);
+
+ LLVM_DEBUG(llvm::dbgs() << "Outlined Region: " << *OutF);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "HotColdSplit",
+ &*Region[0]->begin())
+ << ore::NV("Original", OrigF) << " split cold code into "
+ << ore::NV("Split", OutF);
+ });
+ return OutF;
+ }
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &*Region[0]->begin())
+ << "Failed to extract region at block "
+ << ore::NV("Block", Region.front());
+ });
+ return nullptr;
+}
+
+/// A pair of (basic block, score).
+using BlockTy = std::pair<BasicBlock *, unsigned>;
+
+namespace {
+/// A maximal outlining region. This contains all blocks post-dominated by a
+/// sink block, the sink block itself, and all blocks dominated by the sink.
+/// If sink-predecessors and sink-successors cannot be extracted in one region,
+/// the static constructor returns a list of suitable extraction regions.
+class OutliningRegion {
+ /// A list of (block, score) pairs. A block's score is non-zero iff it's a
+ /// viable sub-region entry point. Blocks with higher scores are better entry
+ /// points (i.e. they are more distant ancestors of the sink block).
+ SmallVector<BlockTy, 0> Blocks = {};
+
+ /// The suggested entry point into the region. If the region has multiple
+ /// entry points, all blocks within the region may not be reachable from this
+ /// entry point.
+ BasicBlock *SuggestedEntryPoint = nullptr;
+
+ /// Whether the entire function is cold.
+ bool EntireFunctionCold = false;
+
+ /// If \p BB is a viable entry point, return \p Score. Return 0 otherwise.
+ static unsigned getEntryPointScore(BasicBlock &BB, unsigned Score) {
+ return mayExtractBlock(BB) ? Score : 0;
+ }
+
+ /// These scores should be lower than the score for predecessor blocks,
+ /// because regions starting at predecessor blocks are typically larger.
+ static constexpr unsigned ScoreForSuccBlock = 1;
+ static constexpr unsigned ScoreForSinkBlock = 1;
+
+ OutliningRegion(const OutliningRegion &) = delete;
+ OutliningRegion &operator=(const OutliningRegion &) = delete;
+
+public:
+ OutliningRegion() = default;
+ OutliningRegion(OutliningRegion &&) = default;
+ OutliningRegion &operator=(OutliningRegion &&) = default;
+
+ static std::vector<OutliningRegion> create(BasicBlock &SinkBB,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ std::vector<OutliningRegion> Regions;
+ SmallPtrSet<BasicBlock *, 4> RegionBlocks;
+
+ Regions.emplace_back();
+ OutliningRegion *ColdRegion = &Regions.back();
+
+ auto addBlockToRegion = [&](BasicBlock *BB, unsigned Score) {
+ RegionBlocks.insert(BB);
+ ColdRegion->Blocks.emplace_back(BB, Score);
+ };
+
+ // The ancestor farthest-away from SinkBB, and also post-dominated by it.
+ unsigned SinkScore = getEntryPointScore(SinkBB, ScoreForSinkBlock);
+ ColdRegion->SuggestedEntryPoint = (SinkScore > 0) ? &SinkBB : nullptr;
+ unsigned BestScore = SinkScore;
+
+ // Visit SinkBB's ancestors using inverse DFS.
+ auto PredIt = ++idf_begin(&SinkBB);
+ auto PredEnd = idf_end(&SinkBB);
+ while (PredIt != PredEnd) {
+ BasicBlock &PredBB = **PredIt;
+ bool SinkPostDom = PDT.dominates(&SinkBB, &PredBB);
+
+ // If the predecessor is cold and has no predecessors, the entire
+ // function must be cold.
+ if (SinkPostDom && pred_empty(&PredBB)) {
+ ColdRegion->EntireFunctionCold = true;
+ return Regions;
+ }
+
+ // If SinkBB does not post-dominate a predecessor, do not mark the
+ // predecessor (or any of its predecessors) cold.
+ if (!SinkPostDom || !mayExtractBlock(PredBB)) {
+ PredIt.skipChildren();
+ continue;
+ }
+
+ // Keep track of the post-dominated ancestor farthest away from the sink.
+ // The path length is always >= 2, ensuring that predecessor blocks are
+ // considered as entry points before the sink block.
+ unsigned PredScore = getEntryPointScore(PredBB, PredIt.getPathLength());
+ if (PredScore > BestScore) {
+ ColdRegion->SuggestedEntryPoint = &PredBB;
+ BestScore = PredScore;
+ }
+
+ addBlockToRegion(&PredBB, PredScore);
+ ++PredIt;
+ }
+
+ // If the sink can be added to the cold region, do so. It's considered as
+ // an entry point before any sink-successor blocks.
+ //
+ // Otherwise, split cold sink-successor blocks using a separate region.
+ // This satisfies the requirement that all extraction blocks other than the
+ // first have predecessors within the extraction region.
+ if (mayExtractBlock(SinkBB)) {
+ addBlockToRegion(&SinkBB, SinkScore);
+ if (pred_empty(&SinkBB)) {
+ ColdRegion->EntireFunctionCold = true;
+ return Regions;
+ }
+ } else {
+ Regions.emplace_back();
+ ColdRegion = &Regions.back();
+ BestScore = 0;
+ }
+
+ // Find all successors of SinkBB dominated by SinkBB using DFS.
+ auto SuccIt = ++df_begin(&SinkBB);
+ auto SuccEnd = df_end(&SinkBB);
+ while (SuccIt != SuccEnd) {
+ BasicBlock &SuccBB = **SuccIt;
+ bool SinkDom = DT.dominates(&SinkBB, &SuccBB);
+
+ // Don't allow the backwards & forwards DFSes to mark the same block.
+ bool DuplicateBlock = RegionBlocks.count(&SuccBB);
+
+ // If SinkBB does not dominate a successor, do not mark the successor (or
+ // any of its successors) cold.
+ if (DuplicateBlock || !SinkDom || !mayExtractBlock(SuccBB)) {
+ SuccIt.skipChildren();
+ continue;
+ }
+
+ unsigned SuccScore = getEntryPointScore(SuccBB, ScoreForSuccBlock);
+ if (SuccScore > BestScore) {
+ ColdRegion->SuggestedEntryPoint = &SuccBB;
+ BestScore = SuccScore;
+ }
+
+ addBlockToRegion(&SuccBB, SuccScore);
+ ++SuccIt;
+ }
+
+ return Regions;
+ }
+
+ /// Whether this region has nothing to extract.
+ bool empty() const { return !SuggestedEntryPoint; }
+
+ /// The blocks in this region.
+ ArrayRef<std::pair<BasicBlock *, unsigned>> blocks() const { return Blocks; }
+
+ /// Whether the entire function containing this region is cold.
+ bool isEntireFunctionCold() const { return EntireFunctionCold; }
+
+ /// Remove a sub-region from this region and return it as a block sequence.
+ BlockSequence takeSingleEntrySubRegion(DominatorTree &DT) {
+ assert(!empty() && !isEntireFunctionCold() && "Nothing to extract");
+
+ // Remove blocks dominated by the suggested entry point from this region.
+ // During the removal, identify the next best entry point into the region.
+ // Ensure that the first extracted block is the suggested entry point.
+ BlockSequence SubRegion = {SuggestedEntryPoint};
+ BasicBlock *NextEntryPoint = nullptr;
+ unsigned NextScore = 0;
+ auto RegionEndIt = Blocks.end();
+ auto RegionStartIt = remove_if(Blocks, [&](const BlockTy &Block) {
+ BasicBlock *BB = Block.first;
+ unsigned Score = Block.second;
+ bool InSubRegion =
+ BB == SuggestedEntryPoint || DT.dominates(SuggestedEntryPoint, BB);
+ if (!InSubRegion && Score > NextScore) {
+ NextEntryPoint = BB;
+ NextScore = Score;
+ }
+ if (InSubRegion && BB != SuggestedEntryPoint)
+ SubRegion.push_back(BB);
+ return InSubRegion;
+ });
+ Blocks.erase(RegionStartIt, RegionEndIt);
+
+ // Update the suggested entry point.
+ SuggestedEntryPoint = NextEntryPoint;
+
+ return SubRegion;
+ }
+};
+} // namespace
+
+bool HotColdSplitting::outlineColdRegions(Function &F, bool HasProfileSummary) {
+ bool Changed = false;
+
+ // The set of cold blocks.
+ SmallPtrSet<BasicBlock *, 4> ColdBlocks;
+
+ // The worklist of non-intersecting regions left to outline.
+ SmallVector<OutliningRegion, 2> OutliningWorklist;
+
+ // Set up an RPO traversal. Experimentally, this performs better (outlines
+ // more) than a PO traversal, because we prevent region overlap by keeping
+ // the first region to contain a block.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate domtrees lazily. This reduces compile-time significantly.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<PostDominatorTree> PDT;
+
+ // Calculate BFI lazily (it's only used to query ProfileSummaryInfo). This
+ // reduces compile-time significantly. TODO: When we *do* use BFI, we should
+ // be able to salvage its domtrees instead of recomputing them.
+ BlockFrequencyInfo *BFI = nullptr;
+ if (HasProfileSummary)
+ BFI = GetBFI(F);
+
+ TargetTransformInfo &TTI = GetTTI(F);
+ OptimizationRemarkEmitter &ORE = (*GetORE)(F);
+ AssumptionCache *AC = LookupAC(F);
+
+ // Find all cold regions.
+ for (BasicBlock *BB : RPOT) {
+ // This block is already part of some outlining region.
+ if (ColdBlocks.count(BB))
+ continue;
+
+ bool Cold = (BFI && PSI->isColdBlock(BB, BFI)) ||
(EnableStaticAnalysis && unlikelyExecuted(*BB));
- if (!Cold)
- continue;
-
- LLVM_DEBUG({
- dbgs() << "Found a cold block:\n";
- BB->dump();
- });
-
- if (!DT)
- DT = std::make_unique<DominatorTree>(F);
- if (!PDT)
- PDT = std::make_unique<PostDominatorTree>(F);
-
- auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
- for (OutliningRegion &Region : Regions) {
- if (Region.empty())
- continue;
-
- if (Region.isEntireFunctionCold()) {
- LLVM_DEBUG(dbgs() << "Entire function is cold\n");
- return markFunctionCold(F);
- }
-
- // If this outlining region intersects with another, drop the new region.
- //
- // TODO: It's theoretically possible to outline more by only keeping the
- // largest region which contains a block, but the extra bookkeeping to do
- // this is tricky/expensive.
- bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
- return !ColdBlocks.insert(Block.first).second;
- });
- if (RegionsOverlap)
- continue;
-
- OutliningWorklist.emplace_back(std::move(Region));
- ++NumColdRegionsFound;
- }
- }
-
- if (OutliningWorklist.empty())
- return Changed;
-
- // Outline single-entry cold regions, splitting up larger regions as needed.
- unsigned OutlinedFunctionID = 1;
- // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
- CodeExtractorAnalysisCache CEAC(F);
- do {
- OutliningRegion Region = OutliningWorklist.pop_back_val();
- assert(!Region.empty() && "Empty outlining region in worklist");
- do {
- BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
- LLVM_DEBUG({
- dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
- for (BasicBlock *BB : SubRegion)
- BB->dump();
- });
-
- Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
- ORE, AC, OutlinedFunctionID);
- if (Outlined) {
- ++OutlinedFunctionID;
- Changed = true;
- }
- } while (!Region.empty());
- } while (!OutliningWorklist.empty());
-
- return Changed;
-}
-
-bool HotColdSplitting::run(Module &M) {
- bool Changed = false;
- bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
- for (auto It = M.begin(), End = M.end(); It != End; ++It) {
- Function &F = *It;
-
- // Do not touch declarations.
- if (F.isDeclaration())
- continue;
-
- // Do not modify `optnone` functions.
- if (F.hasOptNone())
- continue;
-
- // Detect inherently cold functions and mark them as such.
- if (isFunctionCold(F)) {
- Changed |= markFunctionCold(F);
- continue;
- }
-
- if (!shouldOutlineFrom(F)) {
- LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
- continue;
- }
-
- LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
- Changed |= outlineColdRegions(F, HasProfileSummary);
- }
- return Changed;
-}
-
-bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto GTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
- auto GBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&ORE](Function &F) -> OptimizationRemarkEmitter & {
- ORE.reset(new OptimizationRemarkEmitter(&F));
- return *ORE.get();
- };
- auto LookupAC = [this](Function &F) -> AssumptionCache * {
- if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
- return ACT->lookupAssumptionCache(F);
- return nullptr;
- };
-
- return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
-}
-
-PreservedAnalyses
-HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
- return FAM.getCachedResult<AssumptionAnalysis>(F);
- };
-
- auto GBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- std::function<TargetTransformInfo &(Function &)> GTTI =
- [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
- [&ORE](Function &F) -> OptimizationRemarkEmitter & {
- ORE.reset(new OptimizationRemarkEmitter(&F));
- return *ORE.get();
- };
-
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-char HotColdSplittingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
- "Hot Cold Splitting", false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
- "Hot Cold Splitting", false, false)
-
-ModulePass *llvm::createHotColdSplittingPass() {
- return new HotColdSplittingLegacyPass();
-}
+ if (!Cold)
+ continue;
+
+ LLVM_DEBUG({
+ dbgs() << "Found a cold block:\n";
+ BB->dump();
+ });
+
+ if (!DT)
+ DT = std::make_unique<DominatorTree>(F);
+ if (!PDT)
+ PDT = std::make_unique<PostDominatorTree>(F);
+
+ auto Regions = OutliningRegion::create(*BB, *DT, *PDT);
+ for (OutliningRegion &Region : Regions) {
+ if (Region.empty())
+ continue;
+
+ if (Region.isEntireFunctionCold()) {
+ LLVM_DEBUG(dbgs() << "Entire function is cold\n");
+ return markFunctionCold(F);
+ }
+
+ // If this outlining region intersects with another, drop the new region.
+ //
+ // TODO: It's theoretically possible to outline more by only keeping the
+ // largest region which contains a block, but the extra bookkeeping to do
+ // this is tricky/expensive.
+ bool RegionsOverlap = any_of(Region.blocks(), [&](const BlockTy &Block) {
+ return !ColdBlocks.insert(Block.first).second;
+ });
+ if (RegionsOverlap)
+ continue;
+
+ OutliningWorklist.emplace_back(std::move(Region));
+ ++NumColdRegionsFound;
+ }
+ }
+
+ if (OutliningWorklist.empty())
+ return Changed;
+
+ // Outline single-entry cold regions, splitting up larger regions as needed.
+ unsigned OutlinedFunctionID = 1;
+ // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
+ CodeExtractorAnalysisCache CEAC(F);
+ do {
+ OutliningRegion Region = OutliningWorklist.pop_back_val();
+ assert(!Region.empty() && "Empty outlining region in worklist");
+ do {
+ BlockSequence SubRegion = Region.takeSingleEntrySubRegion(*DT);
+ LLVM_DEBUG({
+ dbgs() << "Hot/cold splitting attempting to outline these blocks:\n";
+ for (BasicBlock *BB : SubRegion)
+ BB->dump();
+ });
+
+ Function *Outlined = extractColdRegion(SubRegion, CEAC, *DT, BFI, TTI,
+ ORE, AC, OutlinedFunctionID);
+ if (Outlined) {
+ ++OutlinedFunctionID;
+ Changed = true;
+ }
+ } while (!Region.empty());
+ } while (!OutliningWorklist.empty());
+
+ return Changed;
+}
+
+bool HotColdSplitting::run(Module &M) {
+ bool Changed = false;
+ bool HasProfileSummary = (M.getProfileSummary(/* IsCS */ false) != nullptr);
+ for (auto It = M.begin(), End = M.end(); It != End; ++It) {
+ Function &F = *It;
+
+ // Do not touch declarations.
+ if (F.isDeclaration())
+ continue;
+
+ // Do not modify `optnone` functions.
+ if (F.hasOptNone())
+ continue;
+
+ // Detect inherently cold functions and mark them as such.
+ if (isFunctionCold(F)) {
+ Changed |= markFunctionCold(F);
+ continue;
+ }
+
+ if (!shouldOutlineFrom(F)) {
+ LLVM_DEBUG(llvm::dbgs() << "Skipping " << F.getName() << "\n");
+ continue;
+ }
+
+ LLVM_DEBUG(llvm::dbgs() << "Outlining in " << F.getName() << "\n");
+ Changed |= outlineColdRegions(F, HasProfileSummary);
+ }
+ return Changed;
+}
+
+bool HotColdSplittingLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto GTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+ auto GBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+ [&ORE](Function &F) -> OptimizationRemarkEmitter & {
+ ORE.reset(new OptimizationRemarkEmitter(&F));
+ return *ORE.get();
+ };
+ auto LookupAC = [this](Function &F) -> AssumptionCache * {
+ if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
+ return ACT->lookupAssumptionCache(F);
+ return nullptr;
+ };
+
+ return HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M);
+}
+
+PreservedAnalyses
+HotColdSplittingPass::run(Module &M, ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto LookupAC = [&FAM](Function &F) -> AssumptionCache * {
+ return FAM.getCachedResult<AssumptionAnalysis>(F);
+ };
+
+ auto GBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ std::function<TargetTransformInfo &(Function &)> GTTI =
+ [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ std::function<OptimizationRemarkEmitter &(Function &)> GetORE =
+ [&ORE](Function &F) -> OptimizationRemarkEmitter & {
+ ORE.reset(new OptimizationRemarkEmitter(&F));
+ return *ORE.get();
+ };
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (HotColdSplitting(PSI, GBFI, GTTI, &GetORE, LookupAC).run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+char HotColdSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(HotColdSplittingLegacyPass, "hotcoldsplit",
+ "Hot Cold Splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(HotColdSplittingLegacyPass, "hotcoldsplit",
+ "Hot Cold Splitting", false, false)
+
+ModulePass *llvm::createHotColdSplittingPass() {
+ return new HotColdSplittingLegacyPass();
+}
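
At the source level, one of the triggers unlikelyExecuted looks for in this file is a call to a function carrying the cold attribute (blocks ending in unreachable, EH pads, and profile-cold blocks are the other sources; sanitizer traps tagged with !nosanitize metadata are excluded). An illustrative C++ fragment, assuming Clang/GCC attribute syntax and with invented function names:

// Hypothetical user code: the error path calls a 'cold' function, so its
// block is treated as unlikely and becomes a candidate sink for outlining
// into a separate function (typically named <parent>.cold.<N>).
__attribute__((cold, noinline)) void logAndAbort(const char *Msg);

int parseHeader(const unsigned char *Buf, int Len) {
  if (Len < 4) {                       // rarely taken error path
    logAndAbort("truncated header");   // cold call => block considered cold
    return -1;
  }
  return Buf[0] | (Buf[1] << 8);       // hot path stays in parseHeader
}

Whether such a region is actually extracted is then decided by the getOutliningBenefit/getOutliningPenalty comparison above, tunable via -hotcoldsplit-threshold and -hot-cold-static-analysis, while -enable-cold-section controls the section the extracted function is placed in; in a new-pass-manager pipeline the transform itself is added with MPM.addPass(HotColdSplittingPass()).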
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
index 8b670b6c98..f4c12dd7f4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/IPO.cpp
@@ -1,142 +1,142 @@
-//===-- IPO.cpp -----------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the common infrastructure (including C bindings) for
-// libLLVMIPO.a, which implements several transformations over the LLVM
-// intermediate representation.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm-c/Transforms/IPO.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/AlwaysInliner.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-
-using namespace llvm;
-
-void llvm::initializeIPO(PassRegistry &Registry) {
- initializeOpenMPOptLegacyPassPass(Registry);
- initializeArgPromotionPass(Registry);
+//===-- IPO.cpp -----------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common infrastructure (including C bindings) for
+// libLLVMIPO.a, which implements several transformations over the LLVM
+// intermediate representation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Transforms/IPO.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+
+using namespace llvm;
+
+void llvm::initializeIPO(PassRegistry &Registry) {
+ initializeOpenMPOptLegacyPassPass(Registry);
+ initializeArgPromotionPass(Registry);
initializeAnnotation2MetadataLegacyPass(Registry);
- initializeCalledValuePropagationLegacyPassPass(Registry);
- initializeConstantMergeLegacyPassPass(Registry);
- initializeCrossDSOCFIPass(Registry);
- initializeDAEPass(Registry);
- initializeDAHPass(Registry);
- initializeForceFunctionAttrsLegacyPassPass(Registry);
- initializeGlobalDCELegacyPassPass(Registry);
- initializeGlobalOptLegacyPassPass(Registry);
- initializeGlobalSplitPass(Registry);
- initializeHotColdSplittingLegacyPassPass(Registry);
+ initializeCalledValuePropagationLegacyPassPass(Registry);
+ initializeConstantMergeLegacyPassPass(Registry);
+ initializeCrossDSOCFIPass(Registry);
+ initializeDAEPass(Registry);
+ initializeDAHPass(Registry);
+ initializeForceFunctionAttrsLegacyPassPass(Registry);
+ initializeGlobalDCELegacyPassPass(Registry);
+ initializeGlobalOptLegacyPassPass(Registry);
+ initializeGlobalSplitPass(Registry);
+ initializeHotColdSplittingLegacyPassPass(Registry);
initializeIROutlinerLegacyPassPass(Registry);
- initializeAlwaysInlinerLegacyPassPass(Registry);
- initializeSimpleInlinerPass(Registry);
- initializeInferFunctionAttrsLegacyPassPass(Registry);
- initializeInternalizeLegacyPassPass(Registry);
+ initializeAlwaysInlinerLegacyPassPass(Registry);
+ initializeSimpleInlinerPass(Registry);
+ initializeInferFunctionAttrsLegacyPassPass(Registry);
+ initializeInternalizeLegacyPassPass(Registry);
initializeLoopExtractorLegacyPassPass(Registry);
initializeBlockExtractorLegacyPassPass(Registry);
- initializeSingleLoopExtractorPass(Registry);
- initializeLowerTypeTestsPass(Registry);
- initializeMergeFunctionsLegacyPassPass(Registry);
- initializePartialInlinerLegacyPassPass(Registry);
- initializeAttributorLegacyPassPass(Registry);
- initializeAttributorCGSCCLegacyPassPass(Registry);
- initializePostOrderFunctionAttrsLegacyPassPass(Registry);
- initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
- initializePruneEHPass(Registry);
- initializeIPSCCPLegacyPassPass(Registry);
- initializeStripDeadPrototypesLegacyPassPass(Registry);
- initializeStripSymbolsPass(Registry);
- initializeStripDebugDeclarePass(Registry);
- initializeStripDeadDebugInfoPass(Registry);
- initializeStripNonDebugSymbolsPass(Registry);
- initializeBarrierNoopPass(Registry);
- initializeEliminateAvailableExternallyLegacyPassPass(Registry);
- initializeSampleProfileLoaderLegacyPassPass(Registry);
- initializeFunctionImportLegacyPassPass(Registry);
- initializeWholeProgramDevirtPass(Registry);
-}
-
-void LLVMInitializeIPO(LLVMPassRegistryRef R) {
- initializeIPO(*unwrap(R));
-}
-
-void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createArgumentPromotionPass());
-}
-
-void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCalledValuePropagationPass());
-}
-
-void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createConstantMergePass());
-}
-
-void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadArgEliminationPass());
-}
-
-void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
-}
-
-void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createFunctionInliningPass());
-}
-
-void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
-}
-
-void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGlobalDCEPass());
-}
-
-void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGlobalOptimizerPass());
-}
-
-void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPruneEHPass());
-}
-
-void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createIPSCCPPass());
-}
-
-void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMergeFunctionsPass());
-}
-
-void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
- auto PreserveMain = [=](const GlobalValue &GV) {
- return AllButMain && GV.getName() == "main";
- };
- unwrap(PM)->add(createInternalizePass(PreserveMain));
-}
-
-void LLVMAddInternalizePassWithMustPreservePredicate(
- LLVMPassManagerRef PM,
- void *Context,
- LLVMBool (*Pred)(LLVMValueRef, void *)) {
- unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) {
- return Pred(wrap(&GV), Context) == 0 ? false : true;
- }));
-}
-
-void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createStripDeadPrototypesPass());
-}
-
-void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createStripSymbolsPass());
-}
+ initializeSingleLoopExtractorPass(Registry);
+ initializeLowerTypeTestsPass(Registry);
+ initializeMergeFunctionsLegacyPassPass(Registry);
+ initializePartialInlinerLegacyPassPass(Registry);
+ initializeAttributorLegacyPassPass(Registry);
+ initializeAttributorCGSCCLegacyPassPass(Registry);
+ initializePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
+ initializePruneEHPass(Registry);
+ initializeIPSCCPLegacyPassPass(Registry);
+ initializeStripDeadPrototypesLegacyPassPass(Registry);
+ initializeStripSymbolsPass(Registry);
+ initializeStripDebugDeclarePass(Registry);
+ initializeStripDeadDebugInfoPass(Registry);
+ initializeStripNonDebugSymbolsPass(Registry);
+ initializeBarrierNoopPass(Registry);
+ initializeEliminateAvailableExternallyLegacyPassPass(Registry);
+ initializeSampleProfileLoaderLegacyPassPass(Registry);
+ initializeFunctionImportLegacyPassPass(Registry);
+ initializeWholeProgramDevirtPass(Registry);
+}
+
+void LLVMInitializeIPO(LLVMPassRegistryRef R) {
+ initializeIPO(*unwrap(R));
+}
+
+void LLVMAddArgumentPromotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createArgumentPromotionPass());
+}
+
+void LLVMAddCalledValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCalledValuePropagationPass());
+}
+
+void LLVMAddConstantMergePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createConstantMergePass());
+}
+
+void LLVMAddDeadArgEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadArgEliminationPass());
+}
+
+void LLVMAddFunctionAttrsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPostOrderFunctionAttrsLegacyPass());
+}
+
+void LLVMAddFunctionInliningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createFunctionInliningPass());
+}
+
+void LLVMAddAlwaysInlinerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(llvm::createAlwaysInlinerLegacyPass());
+}
+
+void LLVMAddGlobalDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalDCEPass());
+}
+
+void LLVMAddGlobalOptimizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGlobalOptimizerPass());
+}
+
+void LLVMAddPruneEHPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPruneEHPass());
+}
+
+void LLVMAddIPSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIPSCCPPass());
+}
+
+void LLVMAddMergeFunctionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergeFunctionsPass());
+}
+
+void LLVMAddInternalizePass(LLVMPassManagerRef PM, unsigned AllButMain) {
+ auto PreserveMain = [=](const GlobalValue &GV) {
+ return AllButMain && GV.getName() == "main";
+ };
+ unwrap(PM)->add(createInternalizePass(PreserveMain));
+}
+
+void LLVMAddInternalizePassWithMustPreservePredicate(
+ LLVMPassManagerRef PM,
+ void *Context,
+ LLVMBool (*Pred)(LLVMValueRef, void *)) {
+ unwrap(PM)->add(createInternalizePass([=](const GlobalValue &GV) {
+ return Pred(wrap(&GV), Context) == 0 ? false : true;
+ }));
+}
+
+void LLVMAddStripDeadPrototypesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripDeadPrototypesPass());
+}
+
+void LLVMAddStripSymbolsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createStripSymbolsPass());
+}
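
A minimal sketch of driving the C bindings defined above, assuming an existing LLVMModuleRef; the pass selection and wrapper name are illustrative:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/IPO.h"

    // Illustrative helper: Mod is assumed to be an existing LLVMModuleRef.
    void runIPOPipeline(LLVMModuleRef Mod) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddInternalizePass(PM, /*AllButMain=*/1); // keep only "main" external
      LLVMAddFunctionAttrsPass(PM);                 // post-order function attrs
      LLVMAddGlobalDCEPass(PM);                     // drop unreferenced globals
      LLVMRunPassManager(PM, Mod);                  // nonzero if the module changed
      LLVMDisposePassManager(PM);
    }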
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
index 327d411ea4..685f8f7d7a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/InferFunctionAttrs.cpp
@@ -1,85 +1,85 @@
-//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "inferattrs"
-
-static bool inferAllPrototypeAttributes(
- Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
- bool Changed = false;
-
- for (Function &F : M.functions())
- // We only infer things using the prototype and the name; we don't need
- // definitions.
- if (F.isDeclaration() && !F.hasOptNone())
- Changed |= inferLibFuncAttributes(F, GetTLI(F));
-
- return Changed;
-}
-
-PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- if (!inferAllPrototypeAttributes(M, GetTLI))
- // If we didn't infer anything, preserve all analyses.
- return PreservedAnalyses::all();
-
- // Otherwise, we may have changed fundamental function attributes, so clear
- // out all the passes.
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct InferFunctionAttrsLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
- InferFunctionAttrsLegacyPass() : ModulePass(ID) {
- initializeInferFunctionAttrsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return inferAllPrototypeAttributes(M, GetTLI);
- }
-};
-}
-
-char InferFunctionAttrsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs",
- "Infer set function attributes", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs",
- "Infer set function attributes", false, false)
-
-Pass *llvm::createInferFunctionAttrsLegacyPass() {
- return new InferFunctionAttrsLegacyPass();
-}
+//===- InferFunctionAttrs.cpp - Infer implicit function attributes --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "inferattrs"
+
+static bool inferAllPrototypeAttributes(
+ Module &M, function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
+ bool Changed = false;
+
+ for (Function &F : M.functions())
+ // We only infer things using the prototype and the name; we don't need
+ // definitions.
+ if (F.isDeclaration() && !F.hasOptNone())
+ Changed |= inferLibFuncAttributes(F, GetTLI(F));
+
+ return Changed;
+}
+
+PreservedAnalyses InferFunctionAttrsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ if (!inferAllPrototypeAttributes(M, GetTLI))
+ // If we didn't infer anything, preserve all analyses.
+ return PreservedAnalyses::all();
+
+ // Otherwise, we may have changed fundamental function attributes, so clear
+ // out all the passes.
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct InferFunctionAttrsLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ InferFunctionAttrsLegacyPass() : ModulePass(ID) {
+ initializeInferFunctionAttrsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ return inferAllPrototypeAttributes(M, GetTLI);
+ }
+};
+}
+
+char InferFunctionAttrsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(InferFunctionAttrsLegacyPass, "inferattrs",
+ "Infer set function attributes", false, false)
+
+Pass *llvm::createInferFunctionAttrsLegacyPass() {
+ return new InferFunctionAttrsLegacyPass();
+}
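
A minimal sketch of running the new-PM InferFunctionAttrsPass over a module, assuming PassBuilder's default analysis registrations (which make TargetLibraryAnalysis reachable through the function-analysis proxy); the wrapper name is illustrative:

    #include "llvm/IR/Module.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/InferFunctionAttrs.h"

    // Illustrative helper: M is assumed to be an existing llvm::Module.
    void runInferAttrs(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      MPM.addPass(llvm::InferFunctionAttrsPass());
      MPM.run(M, MAM);
    }

crossRegisterProxies is what lets this module pass reach per-function results such as TargetLibraryAnalysis via the FunctionAnalysisManagerModuleProxy used in run() above.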
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
index 51659f659c..76f1d0c54d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/InlineSimple.cpp
@@ -1,124 +1,124 @@
-//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements bottom-up inlining of functions into their callers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Inliner.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-namespace {
-
-/// Actual inliner pass implementation.
-///
-/// The common implementation of the inlining logic is shared between this
-/// inliner pass and the always inliner pass. The two passes use different cost
-/// analyses to determine when to inline.
-class SimpleInliner : public LegacyInlinerBase {
-
- InlineParams Params;
-
-public:
- SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- explicit SimpleInliner(InlineParams Params)
- : LegacyInlinerBase(ID), Params(std::move(Params)) {
- initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
- }
-
- static char ID; // Pass identification, replacement for typeid
-
- InlineCost getInlineCost(CallBase &CB) override {
- Function *Callee = CB.getCalledFunction();
- TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
-
- bool RemarksEnabled = false;
- const auto &BBs = CB.getCaller()->getBasicBlockList();
- if (!BBs.empty()) {
- auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
- if (DI.isEnabled())
- RemarksEnabled = true;
- }
- OptimizationRemarkEmitter ORE(CB.getCaller());
-
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
- return llvm::getInlineCost(CB, Params, TTI, GetAssumptionCache, GetTLI,
- /*GetBFI=*/nullptr, PSI,
- RemarksEnabled ? &ORE : nullptr);
- }
-
- bool runOnSCC(CallGraphSCC &SCC) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-
-private:
- TargetTransformInfoWrapperPass *TTIWP;
-
-};
-
-} // end anonymous namespace
-
-char SimpleInliner::ID = 0;
-INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
- false, false)
-
-Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
-
-Pass *llvm::createFunctionInliningPass(int Threshold) {
- return new SimpleInliner(llvm::getInlineParams(Threshold));
-}
-
-Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
- unsigned SizeOptLevel,
- bool DisableInlineHotCallSite) {
- auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
- if (DisableInlineHotCallSite)
- Param.HotCallSiteThreshold = 0;
- return new SimpleInliner(Param);
-}
-
-Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
- return new SimpleInliner(Params);
-}
-
-bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- return LegacyInlinerBase::runOnSCC(SCC);
-}
-
-void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- LegacyInlinerBase::getAnalysisUsage(AU);
-}
+//===- InlineSimple.cpp - Code to perform simple function inlining --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bottom-up inlining of functions into their callers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Inliner.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+namespace {
+
+/// Actual inliner pass implementation.
+///
+/// The common implementation of the inlining logic is shared between this
+/// inliner pass and the always inliner pass. The two passes use different cost
+/// analyses to determine when to inline.
+class SimpleInliner : public LegacyInlinerBase {
+
+ InlineParams Params;
+
+public:
+ SimpleInliner() : LegacyInlinerBase(ID), Params(llvm::getInlineParams()) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit SimpleInliner(InlineParams Params)
+ : LegacyInlinerBase(ID), Params(std::move(Params)) {
+ initializeSimpleInlinerPass(*PassRegistry::getPassRegistry());
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+
+ InlineCost getInlineCost(CallBase &CB) override {
+ Function *Callee = CB.getCalledFunction();
+ TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+
+ bool RemarksEnabled = false;
+ const auto &BBs = CB.getCaller()->getBasicBlockList();
+ if (!BBs.empty()) {
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
+ if (DI.isEnabled())
+ RemarksEnabled = true;
+ }
+ OptimizationRemarkEmitter ORE(CB.getCaller());
+
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return llvm::getInlineCost(CB, Params, TTI, GetAssumptionCache, GetTLI,
+ /*GetBFI=*/nullptr, PSI,
+ RemarksEnabled ? &ORE : nullptr);
+ }
+
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ TargetTransformInfoWrapperPass *TTIWP;
+
+};
+
+} // end anonymous namespace
+
+char SimpleInliner::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleInliner, "inline", "Function Integration/Inlining",
+ false, false)
+
+Pass *llvm::createFunctionInliningPass() { return new SimpleInliner(); }
+
+Pass *llvm::createFunctionInliningPass(int Threshold) {
+ return new SimpleInliner(llvm::getInlineParams(Threshold));
+}
+
+Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
+ unsigned SizeOptLevel,
+ bool DisableInlineHotCallSite) {
+ auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+ if (DisableInlineHotCallSite)
+ Param.HotCallSiteThreshold = 0;
+ return new SimpleInliner(Param);
+}
+
+Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
+ return new SimpleInliner(Params);
+}
+
+bool SimpleInliner::runOnSCC(CallGraphSCC &SCC) {
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ return LegacyInlinerBase::runOnSCC(SCC);
+}
+
+void SimpleInliner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ LegacyInlinerBase::getAnalysisUsage(AU);
+}
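
A minimal sketch of building the legacy inliner with an explicit threshold through createFunctionInliningPass, assuming an existing llvm::Module; the threshold value and wrapper name are illustrative:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Illustrative helper: M is assumed to be an existing llvm::Module. The
    // threshold value is an arbitrary example, not a recommended setting.
    void runSimpleInliner(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createFunctionInliningPass(/*Threshold=*/275));
      PM.run(M);
    }

As the overload above shows, this routes through llvm::getInlineParams(Threshold), so only the base threshold is overridden while the other InlineParams keep their defaults.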
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
index 133a6e2a85..e91b6c9b1d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Inliner.cpp
@@ -1,658 +1,658 @@
-//===- Inliner.cpp - Code common to all inliners --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the mechanics required to implement inlining without
-// missing any calls and updating the call graph. The decisions of which calls
-// are profitable to inline are implemented elsewhere.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Inliner.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CGSCCPassManager.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineAdvisor.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
+//===- Inliner.cpp - Code common to all inliners --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the mechanics required to implement inlining without
+// missing any calls and updating the call graph. The decisions of which calls
+// are profitable to inline are implemented elsewhere.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Inliner.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/Utils/ImportedFunctionsInliningStatistics.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <functional>
-#include <sstream>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inline"
-
-STATISTIC(NumInlined, "Number of functions inlined");
-STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
-STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
-STATISTIC(NumMergedAllocas, "Number of allocas merged together");
-
-/// Flag to disable manual alloca merging.
-///
-/// Merging of allocas was originally done as a stack-size saving technique
-/// prior to LLVM's code generator having support for stack coloring based on
-/// lifetime markers. It is now in the process of being removed. To experiment
-/// with disabling it and relying fully on lifetime marker based stack
-/// coloring, you can pass this flag to LLVM.
-static cl::opt<bool>
- DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
- cl::init(false), cl::Hidden);
-
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <functional>
+#include <sstream>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline"
+
+STATISTIC(NumInlined, "Number of functions inlined");
+STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
+STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
+STATISTIC(NumMergedAllocas, "Number of allocas merged together");
+
+/// Flag to disable manual alloca merging.
+///
+/// Merging of allocas was originally done as a stack-size saving technique
+/// prior to LLVM's code generator having support for stack coloring based on
+/// lifetime markers. It is now in the process of being removed. To experiment
+/// with disabling it and relying fully on lifetime marker based stack
+/// coloring, you can pass this flag to LLVM.
+static cl::opt<bool>
+ DisableInlinedAllocaMerging("disable-inlined-alloca-merging",
+ cl::init(false), cl::Hidden);
+
extern cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats;
-
+
static cl::opt<std::string> CGSCCInlineReplayFile(
"cgscc-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
"Optimization remarks file containing inline remarks to be replayed "
"by inlining from cgscc inline remarks."),
cl::Hidden);
-
-LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
-
-LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
- : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
-
-/// For this class, we declare that we require and preserve the call graph.
-/// If the derived class implements this method, it should
-/// always explicitly call the implementation here.
-void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getAAResultsAnalysisUsage(AU);
- CallGraphSCCPass::getAnalysisUsage(AU);
-}
-
-using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
-
-/// Look at all of the allocas that we inlined through this call site. If we
-/// have already inlined other allocas through other calls into this function,
-/// then we know that they have disjoint lifetimes and that we can merge them.
-///
-/// There are many heuristics possible for merging these allocas, and the
-/// different options have different tradeoffs. One thing that we *really*
-/// don't want to hurt is SRoA: once inlining happens, often allocas are no
-/// longer address taken and so they can be promoted.
-///
-/// Our "solution" for that is to only merge allocas whose outermost type is an
-/// array type. These are usually not promoted because someone is using a
-/// variable index into them. These are also often the most important ones to
-/// merge.
-///
-/// A better solution would be to have real memory lifetime markers in the IR
-/// and not have the inliner do any merging of allocas at all. This would
-/// allow the backend to do proper stack slot coloring of all allocas that
-/// *actually make it to the backend*, which is really what we want.
-///
-/// Because we don't have this information, we do this simple and useful hack.
-static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas,
- int InlineHistory) {
- SmallPtrSet<AllocaInst *, 16> UsedAllocas;
-
- // When processing our SCC, check to see if the call site was inlined from
- // some other call site. For example, if we're processing "A" in this code:
- // A() { B() }
- // B() { x = alloca ... C() }
- // C() { y = alloca ... }
- // Assume that C was not inlined into B initially, and so we're processing A
- // and decide to inline B into A. Doing this makes an alloca available for
- // reuse and makes a callsite (C) available for inlining. When we process
- // the C call site we don't want to do any alloca merging between X and Y
- // because their scopes are not disjoint. We could make this smarter by
- // keeping track of the inline history for each alloca in the
- // InlinedArrayAllocas but this isn't likely to be a significant win.
- if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
- return;
-
- // Loop over all the allocas we have so far and see if they can be merged with
- // a previously inlined alloca. If not, remember that we had it.
- for (unsigned AllocaNo = 0, E = IFI.StaticAllocas.size(); AllocaNo != E;
- ++AllocaNo) {
- AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
-
- // Don't bother trying to merge array allocations (they will usually be
- // canonicalized to be an allocation *of* an array), or allocations whose
- // type is not itself an array (because we're afraid of pessimizing SRoA).
- ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
- if (!ATy || AI->isArrayAllocation())
- continue;
-
- // Get the list of all available allocas for this array type.
- std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
-
- // Loop over the allocas in AllocasForType to see if we can reuse one. Note
- // that we have to be careful not to reuse the same "available" alloca for
- // multiple different allocas that we just inlined, we use the 'UsedAllocas'
- // set to keep track of which "available" allocas are being used by this
- // function. Also, AllocasForType can be empty of course!
- bool MergedAwayAlloca = false;
- for (AllocaInst *AvailableAlloca : AllocasForType) {
- Align Align1 = AI->getAlign();
- Align Align2 = AvailableAlloca->getAlign();
-
- // The available alloca has to be in the right function, not in some other
- // function in this SCC.
- if (AvailableAlloca->getParent() != AI->getParent())
- continue;
-
- // If the inlined function already uses this alloca then we can't reuse
- // it.
- if (!UsedAllocas.insert(AvailableAlloca).second)
- continue;
-
- // Otherwise, we *can* reuse it, RAUW AI into AvailableAlloca and declare
- // success!
- LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
- << "\n\t\tINTO: " << *AvailableAlloca << '\n');
-
- // Move affected dbg.declare calls immediately after the new alloca to
- // avoid the situation when a dbg.declare precedes its alloca.
- if (auto *L = LocalAsMetadata::getIfExists(AI))
- if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
- for (User *U : MDV->users())
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
- DDI->moveBefore(AvailableAlloca->getNextNode());
-
- AI->replaceAllUsesWith(AvailableAlloca);
-
- if (Align1 > Align2)
- AvailableAlloca->setAlignment(AI->getAlign());
-
- AI->eraseFromParent();
- MergedAwayAlloca = true;
- ++NumMergedAllocas;
- IFI.StaticAllocas[AllocaNo] = nullptr;
- break;
- }
-
- // If we already nuked the alloca, we're done with it.
- if (MergedAwayAlloca)
- continue;
-
- // If we were unable to merge away the alloca either because there are no
- // allocas of the right type available or because we reused them all
- // already, remember that this alloca came from an inlined function and mark
- // it used so we don't reuse it for other allocas from this inline
- // operation.
- AllocasForType.push_back(AI);
- UsedAllocas.insert(AI);
- }
-}
-
-/// If it is possible to inline the specified call site,
-/// do so and update the CallGraph for this operation.
-///
-/// This function also does some basic book-keeping to update the IR. The
-/// InlinedArrayAllocas map keeps track of any allocas that are already
-/// available from other functions inlined into the caller. If we are able to
-/// inline this call site we attempt to reuse already available allocas or add
-/// any new allocas to the set if not possible.
-static InlineResult inlineCallIfPossible(
- CallBase &CB, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
- bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
- ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
- Function *Callee = CB.getCalledFunction();
- Function *Caller = CB.getCaller();
-
- AAResults &AAR = AARGetter(*Callee);
-
- // Try to inline the function. Get the list of static allocas that were
- // inlined.
- InlineResult IR = InlineFunction(CB, IFI, &AAR, InsertLifetime);
- if (!IR.isSuccess())
- return IR;
-
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.recordInline(*Caller, *Callee);
-
- AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
-
- if (!DisableInlinedAllocaMerging)
- mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
-
- return IR; // success
-}
-
-/// Return true if the specified inline history ID
-/// indicates an inline history that includes the specified function.
-static bool inlineHistoryIncludes(
- Function *F, int InlineHistoryID,
- const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
- while (InlineHistoryID != -1) {
- assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
- "Invalid inline history ID");
- if (InlineHistory[InlineHistoryID].first == F)
- return true;
- InlineHistoryID = InlineHistory[InlineHistoryID].second;
- }
- return false;
-}
-
-bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.setModuleInfo(CG.getModule());
- return false; // No changes to CallGraph.
-}
-
-bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
- return inlineCalls(SCC);
-}
-
-static bool
-inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
- std::function<AssumptionCache &(Function &)> GetAssumptionCache,
- ProfileSummaryInfo *PSI,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- bool InsertLifetime,
- function_ref<InlineCost(CallBase &CB)> GetInlineCost,
- function_ref<AAResults &(Function &)> AARGetter,
- ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
- SmallPtrSet<Function *, 8> SCCFunctions;
- LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
- for (CallGraphNode *Node : SCC) {
- Function *F = Node->getFunction();
- if (F)
- SCCFunctions.insert(F);
- LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
- }
-
- // Scan through and identify all call sites ahead of time so that we only
- // inline call sites in the original functions, not call sites that result
- // from inlining other functions.
- SmallVector<std::pair<CallBase *, int>, 16> CallSites;
-
- // When inlining a callee produces new call sites, we want to keep track of
- // the fact that they were inlined from the callee. This allows us to avoid
- // infinite inlining in some obscure cases. To represent this, we use an
- // index into the InlineHistory vector.
- SmallVector<std::pair<Function *, int>, 8> InlineHistory;
-
- for (CallGraphNode *Node : SCC) {
- Function *F = Node->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- OptimizationRemarkEmitter ORE(F);
- for (BasicBlock &BB : *F)
- for (Instruction &I : BB) {
- auto *CB = dyn_cast<CallBase>(&I);
- // If this isn't a call, or it is a call to an intrinsic, it can
- // never be inlined.
- if (!CB || isa<IntrinsicInst>(I))
- continue;
-
- // If this is a direct call to an external function, we can never inline
- // it. If it is an indirect call, inlining may resolve it to be a
- // direct call, so we keep it.
- if (Function *Callee = CB->getCalledFunction())
- if (Callee->isDeclaration()) {
- using namespace ore;
-
- setInlineRemark(*CB, "unavailable definition");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CB->getCaller())
- << " because its definition is unavailable"
- << setIsVerbose();
- });
- continue;
- }
-
- CallSites.push_back(std::make_pair(CB, -1));
- }
- }
-
- LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
-
- // If there are no calls in this function, exit early.
- if (CallSites.empty())
- return false;
-
- // Now that we have all of the call sites, move the ones to functions in the
- // current SCC to the end of the list.
- unsigned FirstCallInSCC = CallSites.size();
- for (unsigned I = 0; I < FirstCallInSCC; ++I)
- if (Function *F = CallSites[I].first->getCalledFunction())
- if (SCCFunctions.count(F))
- std::swap(CallSites[I--], CallSites[--FirstCallInSCC]);
-
- InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, GetAssumptionCache, PSI);
-
- // Now that we have all of the call sites, loop over them and inline them if
- // it looks profitable to do so.
- bool Changed = false;
- bool LocalChange;
- do {
- LocalChange = false;
- // Iterate over the outer loop because inlining functions can cause indirect
- // calls to become direct calls.
- // CallSites may be modified inside the loop, so a range-based for loop cannot be used.
- for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
- auto &P = CallSites[CSi];
- CallBase &CB = *P.first;
- const int InlineHistoryID = P.second;
-
- Function *Caller = CB.getCaller();
- Function *Callee = CB.getCalledFunction();
-
- // We can only inline direct calls to non-declarations.
- if (!Callee || Callee->isDeclaration())
- continue;
-
- bool IsTriviallyDead = isInstructionTriviallyDead(&CB, &GetTLI(*Caller));
-
- if (!IsTriviallyDead) {
- // If this call site was obtained by inlining another function, verify
- // that the inline history for this call site does not include the callee
- // itself. If so, we'd be recursively inlining the same function,
- // which would provide the same callsites, which would cause us to
- // infinitely inline.
- if (InlineHistoryID != -1 &&
- inlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(CB, "recursive");
- continue;
- }
- }
-
- // FIXME for new PM: because of the old PM we currently generate ORE and
- // in turn BFI on demand. With the new PM, the ORE dependency should
- // just become a regular analysis dependency.
- OptimizationRemarkEmitter ORE(Caller);
-
- auto OIC = shouldInline(CB, GetInlineCost, ORE);
- // If the policy determines that we should not inline this call site,
- // move on to the next one.
- if (!OIC)
- continue;
-
- // If this call site is dead and it is to a readonly function, we should
- // just delete the call instead of trying to inline it, regardless of
- // size. This happens because IPSCCP propagates the result out of the
- // call and then we're left with the dead call.
- if (IsTriviallyDead) {
- LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << CB << "\n");
- // Update the call graph by deleting the edge from Callee to Caller.
- setInlineRemark(CB, "trivially dead");
- CG[Caller]->removeCallEdgeFor(CB);
- CB.eraseFromParent();
- ++NumCallsDeleted;
- } else {
- // Get DebugLoc to report. CB will be invalid after Inliner.
- DebugLoc DLoc = CB.getDebugLoc();
- BasicBlock *Block = CB.getParent();
-
- // Attempt to inline the function.
- using namespace ore;
-
- InlineResult IR = inlineCallIfPossible(
- CB, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
- InsertLifetime, AARGetter, ImportedFunctionsStats);
- if (!IR.isSuccess()) {
- setInlineRemark(CB, std::string(IR.getFailureReason()) + "; " +
- inlineCostStr(*OIC));
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
- Block)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller) << ": "
- << NV("Reason", IR.getFailureReason());
- });
- continue;
- }
- ++NumInlined;
-
- emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
-
- // If inlining this function gave us any new call sites, throw them
- // onto our worklist to process. They are useful inline candidates.
- if (!InlineInfo.InlinedCalls.empty()) {
- // Create a new inline history entry for this, so that we remember
- // that these new callsites came about due to inlining Callee.
- int NewHistoryID = InlineHistory.size();
- InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
-
-#ifndef NDEBUG
- // Make sure there are no duplicates in the inline candidates. This could
- // happen when a callsite is simplified to reuse the return value
- // of another callsite during function cloning, thus the other
- // callsite will be reconsidered here.
- DenseSet<CallBase *> DbgCallSites;
- for (auto &II : CallSites)
- DbgCallSites.insert(II.first);
-#endif
-
- for (Value *Ptr : InlineInfo.InlinedCalls) {
-#ifndef NDEBUG
- assert(DbgCallSites.count(dyn_cast<CallBase>(Ptr)) == 0);
-#endif
- CallSites.push_back(
- std::make_pair(dyn_cast<CallBase>(Ptr), NewHistoryID));
- }
- }
- }
-
- // If we inlined or deleted the last possible call site to the function,
- // delete the function body now.
- if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
- // TODO: Can remove if in SCC now.
- !SCCFunctions.count(Callee) &&
- // The function may be apparently dead, but if there are indirect
- // callgraph references to the node, we cannot delete it yet, this
- // could invalidate the CGSCC iterator.
- CG[Callee]->getNumReferences() == 0) {
- LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
- << Callee->getName() << "\n");
- CallGraphNode *CalleeNode = CG[Callee];
-
- // Remove any call graph edges from the callee to its callees.
- CalleeNode->removeAllCalledFunctions();
-
- // Remove the callee's node from the call graph and delete it.
- delete CG.removeFunctionFromModule(CalleeNode);
- ++NumDeleted;
- }
-
- // Remove this call site from the list. If possible, use
- // swap/pop_back for efficiency, but do not use it if doing so would
- // move a call site to a function in this SCC before the
- // 'FirstCallInSCC' barrier.
- if (SCC.isSingular()) {
- CallSites[CSi] = CallSites.back();
- CallSites.pop_back();
- } else {
- CallSites.erase(CallSites.begin() + CSi);
- }
- --CSi;
-
- Changed = true;
- LocalChange = true;
- }
- } while (LocalChange);
-
- return Changed;
-}
-
-bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- ACT = &getAnalysis<AssumptionCacheTracker>();
- PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
- return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
- return inlineCallsImpl(
- SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime,
- [&](CallBase &CB) { return getInlineCost(CB); }, LegacyAARGetter(*this),
- ImportedFunctionsStats);
-}
-
-/// Remove now-dead linkonce functions at the end of
-/// processing to avoid breaking the SCC traversal.
-bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
- if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
- ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
- InlinerFunctionImportStatsOpts::Verbose);
- return removeDeadFunctions(CG);
-}
-
-/// Remove dead functions that are not included in DNR (Do Not Remove) list.
-bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
- bool AlwaysInlineOnly) {
- SmallVector<CallGraphNode *, 16> FunctionsToRemove;
- SmallVector<Function *, 16> DeadFunctionsInComdats;
-
- auto RemoveCGN = [&](CallGraphNode *CGN) {
- // Remove any call graph edges from the function to its callees.
- CGN->removeAllCalledFunctions();
-
- // Remove any edges from the external node to the function's call graph
- // node. These edges might have been made irrelevant due to
- // optimization of the program.
- CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
-
- // Remove the callee's node from the call graph and delete it.
- FunctionsToRemove.push_back(CGN);
- };
-
- // Scan for all of the functions, looking for ones that should now be removed
- // from the program. Insert the dead ones in the FunctionsToRemove set.
- for (const auto &I : CG) {
- CallGraphNode *CGN = I.second.get();
- Function *F = CGN->getFunction();
- if (!F || F->isDeclaration())
- continue;
-
- // Handle the case when this function is called and we only want to care
- // about always-inline functions. This is a bit of a hack to share code
- // between here and the InlineAlways pass.
- if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
- continue;
-
- // If the only remaining users of the function are dead constants, remove
- // them.
- F->removeDeadConstantUsers();
-
- if (!F->isDefTriviallyDead())
- continue;
-
- // It is unsafe to drop a function with discardable linkage from a COMDAT
- // without also dropping the other members of the COMDAT.
- // The inliner doesn't visit non-function entities which are in COMDAT
- // groups so it is unsafe to do so *unless* the linkage is local.
- if (!F->hasLocalLinkage()) {
- if (F->hasComdat()) {
- DeadFunctionsInComdats.push_back(F);
- continue;
- }
- }
-
- RemoveCGN(CGN);
- }
- if (!DeadFunctionsInComdats.empty()) {
- // Filter out the functions whose comdats remain alive.
- filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
- // Remove the rest.
- for (Function *F : DeadFunctionsInComdats)
- RemoveCGN(CG[F]);
- }
-
- if (FunctionsToRemove.empty())
- return false;
-
- // Now that we know which functions to delete, do so. We didn't want to do
- // this inline, because that would invalidate our CallGraph::iterator
- // objects. :(
- //
- // Note that it doesn't matter that we are iterating over a non-stable order
- // here to do this, it doesn't matter which order the functions are deleted
- // in.
- array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
- FunctionsToRemove.erase(
- std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
- FunctionsToRemove.end());
- for (CallGraphNode *CGN : FunctionsToRemove) {
- delete CG.removeFunctionFromModule(CGN);
- ++NumDeleted;
- }
- return true;
-}
-
-InlineAdvisor &
-InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
- FunctionAnalysisManager &FAM, Module &M) {
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
+
+LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
+ : CallGraphSCCPass(ID), InsertLifetime(InsertLifetime) {}
+
+/// For this class, we declare that we require and preserve the call graph.
+/// If the derived class implements this method, it should
+/// always explicitly call the implementation here.
+void LegacyInlinerBase::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getAAResultsAnalysisUsage(AU);
+ CallGraphSCCPass::getAnalysisUsage(AU);
+}
+
+using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
+
+/// Look at all of the allocas that we inlined through this call site. If we
+/// have already inlined other allocas through other calls into this function,
+/// then we know that they have disjoint lifetimes and that we can merge them.
+///
+/// There are many heuristics possible for merging these allocas, and the
+/// different options have different tradeoffs. One thing that we *really*
+/// don't want to hurt is SRoA: once inlining happens, often allocas are no
+/// longer address taken and so they can be promoted.
+///
+/// Our "solution" for that is to only merge allocas whose outermost type is an
+/// array type. These are usually not promoted because someone is using a
+/// variable index into them. These are also often the most important ones to
+/// merge.
+///
+/// A better solution would be to have real memory lifetime markers in the IR
+/// and not have the inliner do any merging of allocas at all. This would
+/// allow the backend to do proper stack slot coloring of all allocas that
+/// *actually make it to the backend*, which is really what we want.
+///
+/// Because we don't have this information, we do this simple and useful hack.
+static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas,
+ int InlineHistory) {
+ SmallPtrSet<AllocaInst *, 16> UsedAllocas;
+
+ // When processing our SCC, check to see if the call site was inlined from
+ // some other call site. For example, if we're processing "A" in this code:
+ // A() { B() }
+ // B() { x = alloca ... C() }
+ // C() { y = alloca ... }
+ // Assume that C was not inlined into B initially, and so we're processing A
+ // and decide to inline B into A. Doing this makes an alloca available for
+ // reuse and makes a callsite (C) available for inlining. When we process
+ // the C call site we don't want to do any alloca merging between X and Y
+ // because their scopes are not disjoint. We could make this smarter by
+ // keeping track of the inline history for each alloca in the
+ // InlinedArrayAllocas but this isn't likely to be a significant win.
+ if (InlineHistory != -1) // Only do merging for top-level call sites in SCC.
+ return;
+
+ // Loop over all the allocas we have so far and see if they can be merged with
+ // a previously inlined alloca. If not, remember that we had it.
+ for (unsigned AllocaNo = 0, E = IFI.StaticAllocas.size(); AllocaNo != E;
+ ++AllocaNo) {
+ AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
+
+ // Don't bother trying to merge array allocations (they will usually be
+ // canonicalized to be an allocation *of* an array), or allocations whose
+ // type is not itself an array (because we're afraid of pessimizing SRoA).
+ ArrayType *ATy = dyn_cast<ArrayType>(AI->getAllocatedType());
+ if (!ATy || AI->isArrayAllocation())
+ continue;
+
+ // Get the list of all available allocas for this array type.
+ std::vector<AllocaInst *> &AllocasForType = InlinedArrayAllocas[ATy];
+
+ // Loop over the allocas in AllocasForType to see if we can reuse one. Note
+ // that we have to be careful not to reuse the same "available" alloca for
+ // multiple different allocas that we just inlined, we use the 'UsedAllocas'
+ // set to keep track of which "available" allocas are being used by this
+ // function. Also, AllocasForType can be empty of course!
+ bool MergedAwayAlloca = false;
+ for (AllocaInst *AvailableAlloca : AllocasForType) {
+ Align Align1 = AI->getAlign();
+ Align Align2 = AvailableAlloca->getAlign();
+
+ // The available alloca has to be in the right function, not in some other
+ // function in this SCC.
+ if (AvailableAlloca->getParent() != AI->getParent())
+ continue;
+
+ // If the inlined function already uses this alloca then we can't reuse
+ // it.
+ if (!UsedAllocas.insert(AvailableAlloca).second)
+ continue;
+
+ // Otherwise, we *can* reuse it: RAUW AI into AvailableAlloca and declare
+ // success!
+ LLVM_DEBUG(dbgs() << " ***MERGED ALLOCA: " << *AI
+ << "\n\t\tINTO: " << *AvailableAlloca << '\n');
+
+ // Move affected dbg.declare calls immediately after the new alloca to
+ // avoid the situation when a dbg.declare precedes its alloca.
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(U))
+ DDI->moveBefore(AvailableAlloca->getNextNode());
+
+ AI->replaceAllUsesWith(AvailableAlloca);
+
+ if (Align1 > Align2)
+ AvailableAlloca->setAlignment(AI->getAlign());
+
+ AI->eraseFromParent();
+ MergedAwayAlloca = true;
+ ++NumMergedAllocas;
+ IFI.StaticAllocas[AllocaNo] = nullptr;
+ break;
+ }
+
+ // If we already nuked the alloca, we're done with it.
+ if (MergedAwayAlloca)
+ continue;
+
+ // If we were unable to merge away the alloca either because there are no
+ // allocas of the right type available or because we reused them all
+ // already, remember that this alloca came from an inlined function and mark
+ // it used so we don't reuse it for other allocas from this inline
+ // operation.
+ AllocasForType.push_back(AI);
+ UsedAllocas.insert(AI);
+ }
+}
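+
+// Illustrative example (editorial note, not from the original source): if two
+// different callees each containing "%buf = alloca [64 x i32]" are inlined
+// into the same caller at top level, the second inlined %buf is RAUW'd into
+// the first by the routine above, so the caller keeps a single [64 x i32]
+// stack slot instead of two slots whose lifetimes never overlap.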
+
+/// If it is possible to inline the specified call site,
+/// do so and update the CallGraph for this operation.
+///
+/// This function also does some basic book-keeping to update the IR. The
+/// InlinedArrayAllocas map keeps track of any allocas that are already
+/// available from other functions inlined into the caller. If we are able to
+/// inline this call site we attempt to reuse already available allocas or add
+/// any new allocas to the set if not possible.
+static InlineResult inlineCallIfPossible(
+ CallBase &CB, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
+ bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
+
+ AAResults &AAR = AARGetter(*Callee);
+
+ // Try to inline the function. Get the list of static allocas that were
+ // inlined.
+ InlineResult IR = InlineFunction(CB, IFI, &AAR, InsertLifetime);
+ if (!IR.isSuccess())
+ return IR;
+
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.recordInline(*Caller, *Callee);
+
+ AttributeFuncs::mergeAttributesForInlining(*Caller, *Callee);
+
+ if (!DisableInlinedAllocaMerging)
+ mergeInlinedArrayAllocas(Caller, IFI, InlinedArrayAllocas, InlineHistory);
+
+ return IR; // success
+}
+
+/// Return true if the specified inline history ID
+/// indicates an inline history that includes the specified function.
+static bool inlineHistoryIncludes(
+ Function *F, int InlineHistoryID,
+ const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
+ while (InlineHistoryID != -1) {
+ assert(unsigned(InlineHistoryID) < InlineHistory.size() &&
+ "Invalid inline history ID");
+ if (InlineHistory[InlineHistoryID].first == F)
+ return true;
+ InlineHistoryID = InlineHistory[InlineHistoryID].second;
+ }
+ return false;
+}
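+
+// Illustrative example (editorial note, hypothetical functions B and C): with
+//   InlineHistory = { {B, -1}, {C, 0} }
+// history ID 1 encodes "a call site introduced by inlining C, whose own call
+// site came from inlining B at top level". inlineHistoryIncludes(B, 1, ...)
+// walks 1 -> 0 -> -1, finds B at index 0, and returns true, so re-inlining B
+// along this chain is rejected as recursive.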
+
+bool LegacyInlinerBase::doInitialization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.setModuleInfo(CG.getModule());
+ return false; // No changes to CallGraph.
+}
+
+bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
+ return inlineCalls(SCC);
+}
+
+static bool
+inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ ProfileSummaryInfo *PSI,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ bool InsertLifetime,
+ function_ref<InlineCost(CallBase &CB)> GetInlineCost,
+ function_ref<AAResults &(Function &)> AARGetter,
+ ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
+ SmallPtrSet<Function *, 8> SCCFunctions;
+ LLVM_DEBUG(dbgs() << "Inliner visiting SCC:");
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (F)
+ SCCFunctions.insert(F);
+ LLVM_DEBUG(dbgs() << " " << (F ? F->getName() : "INDIRECTNODE"));
+ }
+
+ // Scan through and identify all call sites ahead of time so that we only
+ // inline call sites in the original functions, not call sites that result
+ // from inlining other functions.
+ SmallVector<std::pair<CallBase *, int>, 16> CallSites;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 8> InlineHistory;
+
+ for (CallGraphNode *Node : SCC) {
+ Function *F = Node->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ OptimizationRemarkEmitter ORE(F);
+ for (BasicBlock &BB : *F)
+ for (Instruction &I : BB) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ // If this isn't a call, or it is a call to an intrinsic, it can
+ // never be inlined.
+ if (!CB || isa<IntrinsicInst>(I))
+ continue;
+
+ // If this is a direct call to an external function, we can never inline
+ // it. If it is an indirect call, inlining may resolve it to be a
+ // direct call, so we keep it.
+ if (Function *Callee = CB->getCalledFunction())
+ if (Callee->isDeclaration()) {
+ using namespace ore;
+
+ setInlineRemark(*CB, "unavailable definition");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CB->getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ continue;
+ }
+
+ CallSites.push_back(std::make_pair(CB, -1));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << ": " << CallSites.size() << " call sites.\n");
+
+ // If there are no calls in this SCC, exit early.
+ if (CallSites.empty())
+ return false;
+
+ // Now that we have all of the call sites, move the ones to functions in the
+ // current SCC to the end of the list.
+ unsigned FirstCallInSCC = CallSites.size();
+ for (unsigned I = 0; I < FirstCallInSCC; ++I)
+ if (Function *F = CallSites[I].first->getCalledFunction())
+ if (SCCFunctions.count(F))
+ std::swap(CallSites[I--], CallSites[--FirstCallInSCC]);
+
+ InlinedArrayAllocasTy InlinedArrayAllocas;
+ InlineFunctionInfo InlineInfo(&CG, GetAssumptionCache, PSI);
+
+ // Now that we have all of the call sites, loop over them and inline them if
+ // it looks profitable to do so.
+ bool Changed = false;
+ bool LocalChange;
+ do {
+ LocalChange = false;
+ // Iterate over the outer loop because inlining functions can cause indirect
+ // calls to become direct calls.
+ // CallSites may be modified inside the loop, so a ranged for loop cannot
+ // be used.
+ for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
+ auto &P = CallSites[CSi];
+ CallBase &CB = *P.first;
+ const int InlineHistoryID = P.second;
+
+ Function *Caller = CB.getCaller();
+ Function *Callee = CB.getCalledFunction();
+
+ // We can only inline direct calls to non-declarations.
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ bool IsTriviallyDead = isInstructionTriviallyDead(&CB, &GetTLI(*Caller));
+
+ if (!IsTriviallyDead) {
+ // If this call site was obtained by inlining another function, verify
+ // that the inline path for the call site did not include the callee
+ // itself. If so, we'd be recursively inlining the same function,
+ // which would provide the same callsites, which would cause us to
+ // infinitely inline.
+ if (InlineHistoryID != -1 &&
+ inlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(CB, "recursive");
+ continue;
+ }
+ }
+
+ // FIXME for new PM: because of the old PM we currently generate ORE and
+ // in turn BFI on demand. With the new PM, the ORE dependency should
+ // just become a regular analysis dependency.
+ OptimizationRemarkEmitter ORE(Caller);
+
+ auto OIC = shouldInline(CB, GetInlineCost, ORE);
+ // If the policy determines that we should not inline this call site,
+ // move on to the next one.
+ if (!OIC)
+ continue;
+
+ // If this call site is dead and it is to a readonly function, we should
+ // just delete the call instead of trying to inline it, regardless of
+ // size. This happens because IPSCCP propagates the result out of the
+ // call and then we're left with the dead call.
+ if (IsTriviallyDead) {
+ LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << CB << "\n");
+ // Update the call graph by deleting the edge from Callee to Caller.
+ setInlineRemark(CB, "trivially dead");
+ CG[Caller]->removeCallEdgeFor(CB);
+ CB.eraseFromParent();
+ ++NumCallsDeleted;
+ } else {
+ // Get DebugLoc to report. CB will be invalid after inlining.
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
+
+ // Attempt to inline the function.
+ using namespace ore;
+
+ InlineResult IR = inlineCallIfPossible(
+ CB, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
+ InsertLifetime, AARGetter, ImportedFunctionsStats);
+ if (!IR.isSuccess()) {
+ setInlineRemark(CB, std::string(IR.getFailureReason()) + "; " +
+ inlineCostStr(*OIC));
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
+ Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller) << ": "
+ << NV("Reason", IR.getFailureReason());
+ });
+ continue;
+ }
+ ++NumInlined;
+
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+
+ // If inlining this function gave us any new call sites, throw them
+ // onto our worklist to process. They are useful inline candidates.
+ if (!InlineInfo.InlinedCalls.empty()) {
+ // Create a new inline history entry for this, so that we remember
+ // that these new callsites came about due to inlining Callee.
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
+
+#ifndef NDEBUG
+ // Make sure there are no duplicates in the inline candidates. This
+ // could happen when a callsite is simplified to reuse the return
+ // value of another callsite during function cloning; the other
+ // callsite will then be reconsidered here.
+ DenseSet<CallBase *> DbgCallSites;
+ for (auto &II : CallSites)
+ DbgCallSites.insert(II.first);
+#endif
+
+ for (Value *Ptr : InlineInfo.InlinedCalls) {
+#ifndef NDEBUG
+ assert(DbgCallSites.count(dyn_cast<CallBase>(Ptr)) == 0);
+#endif
+ CallSites.push_back(
+ std::make_pair(dyn_cast<CallBase>(Ptr), NewHistoryID));
+ }
+ }
+ }
+
+ // If we inlined or deleted the last possible call site to the function,
+ // delete the function body now.
+ if (Callee && Callee->use_empty() && Callee->hasLocalLinkage() &&
+ // TODO: Can remove if in SCC now.
+ !SCCFunctions.count(Callee) &&
+ // The function may be apparently dead, but if there are indirect
+ // callgraph references to the node, we cannot delete it yet, as this
+ // could invalidate the CGSCC iterator.
+ CG[Callee]->getNumReferences() == 0) {
+ LLVM_DEBUG(dbgs() << " -> Deleting dead function: "
+ << Callee->getName() << "\n");
+ CallGraphNode *CalleeNode = CG[Callee];
+
+ // Remove any call graph edges from the callee to its callees.
+ CalleeNode->removeAllCalledFunctions();
+
+ // Remove the node for the callee from the call graph and delete it.
+ delete CG.removeFunctionFromModule(CalleeNode);
+ ++NumDeleted;
+ }
+
+ // Remove this call site from the list. If possible, use
+ // swap/pop_back for efficiency, but do not use it if doing so would
+ // move a call site to a function in this SCC before the
+ // 'FirstCallInSCC' barrier.
+ if (SCC.isSingular()) {
+ CallSites[CSi] = CallSites.back();
+ CallSites.pop_back();
+ } else {
+ CallSites.erase(CallSites.begin() + CSi);
+ }
+ --CSi;
+
+ Changed = true;
+ LocalChange = true;
+ }
+ } while (LocalChange);
+
+ return Changed;
+}
+
+bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+ return inlineCallsImpl(
+ SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime,
+ [&](CallBase &CB) { return getInlineCost(CB); }, LegacyAARGetter(*this),
+ ImportedFunctionsStats);
+}
+
+/// Remove now-dead linkonce functions at the end of
+/// processing to avoid breaking the SCC traversal.
+bool LegacyInlinerBase::doFinalization(CallGraph &CG) {
+ if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
+ ImportedFunctionsStats.dump(InlinerFunctionImportStats ==
+ InlinerFunctionImportStatsOpts::Verbose);
+ return removeDeadFunctions(CG);
+}
+
+/// Remove dead functions that are not included in the DNR (Do Not Remove) list.
+bool LegacyInlinerBase::removeDeadFunctions(CallGraph &CG,
+ bool AlwaysInlineOnly) {
+ SmallVector<CallGraphNode *, 16> FunctionsToRemove;
+ SmallVector<Function *, 16> DeadFunctionsInComdats;
+
+ auto RemoveCGN = [&](CallGraphNode *CGN) {
+ // Remove any call graph edges from the function to its callees.
+ CGN->removeAllCalledFunctions();
+
+ // Remove any edges from the external node to the function's call graph
+ // node. These edges might have been made irrelevant due to
+ // optimization of the program.
+ CG.getExternalCallingNode()->removeAnyCallEdgeTo(CGN);
+
+ // Queue the node for removal from the call graph; it is deleted below.
+ FunctionsToRemove.push_back(CGN);
+ };
+
+ // Scan for all of the functions, looking for ones that should now be removed
+ // from the program. Insert the dead ones in the FunctionsToRemove set.
+ for (const auto &I : CG) {
+ CallGraphNode *CGN = I.second.get();
+ Function *F = CGN->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+
+ // Handle the case when this function is called and we only want to care
+ // about always-inline functions. This is a bit of a hack to share code
+ // between here and the InlineAlways pass.
+ if (AlwaysInlineOnly && !F->hasFnAttribute(Attribute::AlwaysInline))
+ continue;
+
+ // If the only remaining users of the function are dead constants, remove
+ // them.
+ F->removeDeadConstantUsers();
+
+ if (!F->isDefTriviallyDead())
+ continue;
+
+ // It is unsafe to drop a function with discardable linkage from a COMDAT
+ // without also dropping the other members of the COMDAT.
+ // The inliner doesn't visit non-function entities which are in COMDAT
+ // groups so it is unsafe to do so *unless* the linkage is local.
+ if (!F->hasLocalLinkage()) {
+ if (F->hasComdat()) {
+ DeadFunctionsInComdats.push_back(F);
+ continue;
+ }
+ }
+
+ RemoveCGN(CGN);
+ }
+ if (!DeadFunctionsInComdats.empty()) {
+ // Filter out the functions whose comdats remain alive.
+ filterDeadComdatFunctions(CG.getModule(), DeadFunctionsInComdats);
+ // Remove the rest.
+ for (Function *F : DeadFunctionsInComdats)
+ RemoveCGN(CG[F]);
+ }
+
+ if (FunctionsToRemove.empty())
+ return false;
+
+ // Now that we know which functions to delete, do so. We didn't want to do
+ // this inline, because that would invalidate our CallGraph::iterator
+ // objects. :(
+ //
+ // Note that it doesn't matter that we are iterating over a non-stable order
+ // here, because the order in which the functions are deleted is
+ // irrelevant.
+ array_pod_sort(FunctionsToRemove.begin(), FunctionsToRemove.end());
+ FunctionsToRemove.erase(
+ std::unique(FunctionsToRemove.begin(), FunctionsToRemove.end()),
+ FunctionsToRemove.end());
+ for (CallGraphNode *CGN : FunctionsToRemove) {
+ delete CG.removeFunctionFromModule(CGN);
+ ++NumDeleted;
+ }
+ return true;
+}
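+
+// Usage sketch (editorial assumption): a mandatory-only inliner such as the
+// legacy always-inliner would reuse this helper with AlwaysInlineOnly=true so
+// that only now-unreferenced alwaysinline functions are dropped, e.g.:
+//   removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);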
+
+InlineAdvisor &
+InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
+ FunctionAnalysisManager &FAM, Module &M) {
if (OwnedAdvisor)
return *OwnedAdvisor;
- auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
- if (!IAA) {
- // It should still be possible to run the inliner as a stand-alone SCC pass,
- // for test scenarios. In that case, we default to the
- // DefaultInlineAdvisor, which doesn't need to keep state between SCC pass
- // runs. It also uses just the default InlineParams.
- // In this case, we need to use the provided FAM, which is valid for the
- // duration of the inliner pass, and thus the lifetime of the owned advisor.
- // The one we would get from the MAM can be invalidated as a result of the
- // inliner's activity.
+ auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+ if (!IAA) {
+ // It should still be possible to run the inliner as a stand-alone SCC pass,
+ // for test scenarios. In that case, we default to the
+ // DefaultInlineAdvisor, which doesn't need to keep state between SCC pass
+ // runs. It also uses just the default InlineParams.
+ // In this case, we need to use the provided FAM, which is valid for the
+ // duration of the inliner pass, and thus the lifetime of the owned advisor.
+ // The one we would get from the MAM can be invalidated as a result of the
+ // inliner's activity.
OwnedAdvisor =
std::make_unique<DefaultInlineAdvisor>(M, FAM, getInlineParams());
@@ -663,376 +663,376 @@ InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
/*EmitRemarks=*/true);
return *OwnedAdvisor;
- }
- assert(IAA->getAdvisor() &&
- "Expected a present InlineAdvisorAnalysis also have an "
- "InlineAdvisor initialized");
- return *IAA->getAdvisor();
-}
-
-PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
- CGSCCAnalysisManager &AM, LazyCallGraph &CG,
- CGSCCUpdateResult &UR) {
- const auto &MAMProxy =
- AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
- bool Changed = false;
-
- assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
- Module &M = *InitialC.begin()->getFunction().getParent();
- ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
- .getManager();
-
- InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
- Advisor.onPassEntry();
-
- auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
-
- // We use a single common worklist for calls across the entire SCC. We
- // process these in-order and append new calls introduced during inlining to
- // the end.
- //
- // Note that this particular order of processing is actually critical to
- // avoid very bad behaviors. Consider *highly connected* call graphs where
+ }
+ assert(IAA->getAdvisor() &&
+ "Expected a present InlineAdvisorAnalysis also have an "
+ "InlineAdvisor initialized");
+ return *IAA->getAdvisor();
+}
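+
+// Illustrative sketch (editorial, guarded out of the build): running the
+// inliner as a stand-alone CGSCC pass, roughly what `opt -passes='cgscc(inline)'`
+// sets up. This exercises the owned DefaultInlineAdvisor fallback above,
+// because no InlineAdvisorAnalysis result is cached in the module analysis
+// manager.
+#if 0
+ModulePassManager MPM;
+MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(InlinerPass()));
+#endif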
+
+PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
+ CGSCCAnalysisManager &AM, LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ const auto &MAMProxy =
+ AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
+ bool Changed = false;
+
+ assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
+ Module &M = *InitialC.begin()->getFunction().getParent();
+ ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+
+ InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
+ Advisor.onPassEntry();
+
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
+
+ // We use a single common worklist for calls across the entire SCC. We
+ // process these in-order and append new calls introduced during inlining to
+ // the end.
+ //
+ // Note that this particular order of processing is actually critical to
+ // avoid very bad behaviors. Consider *highly connected* call graphs where
// each function contains a small amount of code and a couple of calls to
- // other functions. Because the LLVM inliner is fundamentally a bottom-up
- // inliner, it can handle gracefully the fact that these all appear to be
- // reasonable inlining candidates as it will flatten things until they become
- // too big to inline, and then move on and flatten another batch.
- //
- // However, when processing call edges *within* an SCC we cannot rely on this
- // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
- // functions we can end up incrementally inlining N calls into each of
- // N functions because each incremental inlining decision looks good and we
- // don't have a topological ordering to prevent explosions.
- //
- // To compensate for this, we don't process transitive edges made immediate
- // by inlining until we've done one pass of inlining across the entire SCC.
- // Large, highly connected SCCs still lead to some amount of code bloat in
- // this model, but it is uniformly spread across all the functions in the SCC
- // and eventually they all become too large to inline, rather than
- // incrementally making a single function grow in a super-linear fashion.
- SmallVector<std::pair<CallBase *, int>, 16> Calls;
-
- // Populate the initial list of calls in this SCC.
- for (auto &N : InitialC) {
- auto &ORE =
- FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
- // We want to generally process call sites top-down in order for
- // simplifications stemming from replacing the call with the returned value
- // after inlining to be visible to subsequent inlining decisions.
- // FIXME: Using instructions sequence is a really bad way to do this.
- // Instead we should do an actual RPO walk of the function body.
- for (Instruction &I : instructions(N.getFunction()))
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *Callee = CB->getCalledFunction()) {
- if (!Callee->isDeclaration())
- Calls.push_back({CB, -1});
- else if (!isa<IntrinsicInst>(I)) {
- using namespace ore;
- setInlineRemark(*CB, "unavailable definition");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
- << NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CB->getCaller())
- << " because its definition is unavailable"
- << setIsVerbose();
- });
- }
- }
- }
- if (Calls.empty())
- return PreservedAnalyses::all();
-
+ // other functions. Because the LLVM inliner is fundamentally a bottom-up
+ // inliner, it can handle gracefully the fact that these all appear to be
+ // reasonable inlining candidates as it will flatten things until they become
+ // too big to inline, and then move on and flatten another batch.
+ //
+ // However, when processing call edges *within* an SCC we cannot rely on this
+ // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+ // functions we can end up incrementally inlining N calls into each of
+ // N functions because each incremental inlining decision looks good and we
+ // don't have a topological ordering to prevent explosions.
+ //
+ // To compensate for this, we don't process transitive edges made immediate
+ // by inlining until we've done one pass of inlining across the entire SCC.
+ // Large, highly connected SCCs still lead to some amount of code bloat in
+ // this model, but it is uniformly spread across all the functions in the SCC
+ // and eventually they all become too large to inline, rather than
+ // incrementally making a single function grow in a super-linear fashion.
+ SmallVector<std::pair<CallBase *, int>, 16> Calls;
+
+ // Populate the initial list of calls in this SCC.
+ for (auto &N : InitialC) {
+ auto &ORE =
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(N.getFunction());
+ // We want to generally process call sites top-down in order for
+ // simplifications stemming from replacing the call with the returned value
+ // after inlining to be visible to subsequent inlining decisions.
+ // FIXME: Using instructions sequence is a really bad way to do this.
+ // Instead we should do an actual RPO walk of the function body.
+ for (Instruction &I : instructions(N.getFunction()))
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction()) {
+ if (!Callee->isDeclaration())
+ Calls.push_back({CB, -1});
+ else if (!isa<IntrinsicInst>(I)) {
+ using namespace ore;
+ setInlineRemark(*CB, "unavailable definition");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", CB->getCaller())
+ << " because its definition is unavailable"
+ << setIsVerbose();
+ });
+ }
+ }
+ }
+ if (Calls.empty())
+ return PreservedAnalyses::all();
+
// Capture updatable variable for the current SCC.
- auto *C = &InitialC;
-
- // When inlining a callee produces new call sites, we want to keep track of
- // the fact that they were inlined from the callee. This allows us to avoid
- // infinite inlining in some obscure cases. To represent this, we use an
- // index into the InlineHistory vector.
- SmallVector<std::pair<Function *, int>, 16> InlineHistory;
-
- // Track a set vector of inlined callees so that we can augment the caller
- // with all of their edges in the call graph before pruning out the ones that
- // got simplified away.
- SmallSetVector<Function *, 4> InlinedCallees;
-
- // Track the dead functions to delete once finished with inlining calls. We
- // defer deleting these to make it easier to handle the call graph updates.
- SmallVector<Function *, 4> DeadFunctions;
-
- // Loop forward over all of the calls. Note that we cannot cache the size as
- // inlining can introduce new calls that need to be processed.
- for (int I = 0; I < (int)Calls.size(); ++I) {
- // We expect the calls to typically be batched with sequences of calls that
- // have the same caller, so we first set up some shared infrastructure for
- // this caller. We also do any pruning we can at this layer on the caller
- // alone.
- Function &F = *Calls[I].first->getCaller();
- LazyCallGraph::Node &N = *CG.lookup(F);
- if (CG.lookupSCC(N) != C)
- continue;
-
- LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
-
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- // Now process as many calls as we have within this caller in the sequence.
- // We bail out as soon as the caller has to change so we can update the
- // call graph and prepare the context of that new caller.
- bool DidInline = false;
- for (; I < (int)Calls.size() && Calls[I].first->getCaller() == &F; ++I) {
- auto &P = Calls[I];
- CallBase *CB = P.first;
- const int InlineHistoryID = P.second;
- Function &Callee = *CB->getCalledFunction();
-
- if (InlineHistoryID != -1 &&
- inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(*CB, "recursive");
- continue;
- }
-
- // Check if this inlining may break apart an SCC that has
- // already been split once before. In that case, inlining here may
- // trigger infinite inlining, much like is prevented within the inliner
- // itself by the InlineHistory above, but spread across CGSCC iterations
- // and thus hidden from the full inline history.
- if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
- UR.InlinedInternalEdges.count({&N, C})) {
- LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
- "previously split out of this SCC by inlining: "
- << F.getName() << " -> " << Callee.getName() << "\n");
- setInlineRemark(*CB, "recursive SCC split");
- continue;
- }
-
+ auto *C = &InitialC;
+
+ // When inlining a callee produces new call sites, we want to keep track of
+ // the fact that they were inlined from the callee. This allows us to avoid
+ // infinite inlining in some obscure cases. To represent this, we use an
+ // index into the InlineHistory vector.
+ SmallVector<std::pair<Function *, int>, 16> InlineHistory;
+
+ // Track a set vector of inlined callees so that we can augment the caller
+ // with all of their edges in the call graph before pruning out the ones that
+ // got simplified away.
+ SmallSetVector<Function *, 4> InlinedCallees;
+
+ // Track the dead functions to delete once finished with inlining calls. We
+ // defer deleting these to make it easier to handle the call graph updates.
+ SmallVector<Function *, 4> DeadFunctions;
+
+ // Loop forward over all of the calls. Note that we cannot cache the size as
+ // inlining can introduce new calls that need to be processed.
+ for (int I = 0; I < (int)Calls.size(); ++I) {
+ // We expect the calls to typically be batched with sequences of calls that
+ // have the same caller, so we first set up some shared infrastructure for
+ // this caller. We also do any pruning we can at this layer on the caller
+ // alone.
+ Function &F = *Calls[I].first->getCaller();
+ LazyCallGraph::Node &N = *CG.lookup(F);
+ if (CG.lookupSCC(N) != C)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ // Now process as many calls as we have within this caller in the sequence.
+ // We bail out as soon as the caller has to change so we can update the
+ // call graph and prepare the context of that new caller.
+ bool DidInline = false;
+ for (; I < (int)Calls.size() && Calls[I].first->getCaller() == &F; ++I) {
+ auto &P = Calls[I];
+ CallBase *CB = P.first;
+ const int InlineHistoryID = P.second;
+ Function &Callee = *CB->getCalledFunction();
+
+ if (InlineHistoryID != -1 &&
+ inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(*CB, "recursive");
+ continue;
+ }
+
+ // Check if this inlining may break apart an SCC that has
+ // already been split once before. In that case, inlining here may
+ // trigger infinite inlining, much like is prevented within the inliner
+ // itself by the InlineHistory above, but spread across CGSCC iterations
+ // and thus hidden from the full inline history.
+ if (CG.lookupSCC(*CG.lookup(Callee)) == C &&
+ UR.InlinedInternalEdges.count({&N, C})) {
+ LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
+ "previously split out of this SCC by inlining: "
+ << F.getName() << " -> " << Callee.getName() << "\n");
+ setInlineRemark(*CB, "recursive SCC split");
+ continue;
+ }
+
auto Advice = Advisor.getAdvice(*CB, OnlyMandatory);
- // Check whether we want to inline this callsite.
- if (!Advice->isInliningRecommended()) {
- Advice->recordUnattemptedInlining();
- continue;
- }
-
- // Setup the data structure used to plumb customization into the
- // `InlineFunction` routine.
- InlineFunctionInfo IFI(
- /*cg=*/nullptr, GetAssumptionCache, PSI,
- &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
- &FAM.getResult<BlockFrequencyAnalysis>(Callee));
-
+ // Check whether we want to inline this callsite.
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
+ continue;
+ }
+
+ // Setup the data structure used to plumb customization into the
+ // `InlineFunction` routine.
+ InlineFunctionInfo IFI(
+ /*cg=*/nullptr, GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
+ &FAM.getResult<BlockFrequencyAnalysis>(Callee));
+
InlineResult IR =
InlineFunction(*CB, IFI, &FAM.getResult<AAManager>(*CB->getCaller()));
- if (!IR.isSuccess()) {
- Advice->recordUnsuccessfulInlining(IR);
- continue;
- }
-
- DidInline = true;
- InlinedCallees.insert(&Callee);
- ++NumInlined;
-
- // Add any new callsites to defined functions to the worklist.
- if (!IFI.InlinedCallSites.empty()) {
- int NewHistoryID = InlineHistory.size();
- InlineHistory.push_back({&Callee, InlineHistoryID});
-
- for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
- Function *NewCallee = ICB->getCalledFunction();
- if (!NewCallee) {
- // Try to promote an indirect (virtual) call without waiting for
- // the post-inline cleanup and the next DevirtSCCRepeatedPass
- // iteration because the next iteration may not happen and we may
- // miss inlining it.
- if (tryPromoteCall(*ICB))
- NewCallee = ICB->getCalledFunction();
- }
- if (NewCallee)
- if (!NewCallee->isDeclaration())
- Calls.push_back({ICB, NewHistoryID});
- }
- }
-
- // Merge the attributes based on the inlining.
- AttributeFuncs::mergeAttributesForInlining(F, Callee);
-
- // For local functions, check whether this makes the callee trivially
- // dead. In that case, we can drop the body of the function eagerly
- // which may reduce the number of callers of other functions to one,
- // changing inline cost thresholds.
- bool CalleeWasDeleted = false;
- if (Callee.hasLocalLinkage()) {
- // To check this we also need to nuke any dead constant uses (perhaps
- // made dead by this operation on other functions).
- Callee.removeDeadConstantUsers();
- if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
- Calls.erase(
- std::remove_if(Calls.begin() + I + 1, Calls.end(),
- [&](const std::pair<CallBase *, int> &Call) {
- return Call.first->getCaller() == &Callee;
- }),
- Calls.end());
- // Clear the body and queue the function itself for deletion when we
- // finish inlining and call graph updates.
- // Note that after this point, it is an error to do anything other
- // than use the callee's address or delete it.
- Callee.dropAllReferences();
+ if (!IR.isSuccess()) {
+ Advice->recordUnsuccessfulInlining(IR);
+ continue;
+ }
+
+ DidInline = true;
+ InlinedCallees.insert(&Callee);
+ ++NumInlined;
+
+ // Add any new callsites to defined functions to the worklist.
+ if (!IFI.InlinedCallSites.empty()) {
+ int NewHistoryID = InlineHistory.size();
+ InlineHistory.push_back({&Callee, InlineHistoryID});
+
+ for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
+ Function *NewCallee = ICB->getCalledFunction();
+ if (!NewCallee) {
+ // Try to promote an indirect (virtual) call without waiting for
+ // the post-inline cleanup and the next DevirtSCCRepeatedPass
+ // iteration because the next iteration may not happen and we may
+ // miss inlining it.
+ if (tryPromoteCall(*ICB))
+ NewCallee = ICB->getCalledFunction();
+ }
+ if (NewCallee)
+ if (!NewCallee->isDeclaration())
+ Calls.push_back({ICB, NewHistoryID});
+ }
+ }
+
+ // Merge the attributes based on the inlining.
+ AttributeFuncs::mergeAttributesForInlining(F, Callee);
+
+ // For local functions, check whether this makes the callee trivially
+ // dead. In that case, we can drop the body of the function eagerly
+ // which may reduce the number of callers of other functions to one,
+ // changing inline cost thresholds.
+ bool CalleeWasDeleted = false;
+ if (Callee.hasLocalLinkage()) {
+ // To check this we also need to nuke any dead constant uses (perhaps
+ // made dead by this operation on other functions).
+ Callee.removeDeadConstantUsers();
+ if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
+ Calls.erase(
+ std::remove_if(Calls.begin() + I + 1, Calls.end(),
+ [&](const std::pair<CallBase *, int> &Call) {
+ return Call.first->getCaller() == &Callee;
+ }),
+ Calls.end());
+ // Clear the body and queue the function itself for deletion when we
+ // finish inlining and call graph updates.
+ // Note that after this point, it is an error to do anything other
+ // than use the callee's address or delete it.
+ Callee.dropAllReferences();
assert(!is_contained(DeadFunctions, &Callee) &&
- "Cannot put cause a function to become dead twice!");
- DeadFunctions.push_back(&Callee);
- CalleeWasDeleted = true;
- }
- }
- if (CalleeWasDeleted)
- Advice->recordInliningWithCalleeDeleted();
- else
- Advice->recordInlining();
- }
-
- // Back the call index up by one to put us in a good position to go around
- // the outer loop.
- --I;
-
- if (!DidInline)
- continue;
- Changed = true;
-
- // At this point, since we have made changes we have at least removed
- // a call instruction. However, in the process we do some incremental
- // simplification of the surrounding code. This simplification can
- // essentially do all of the same things as a function pass and we can
- // re-use the exact same logic for updating the call graph to reflect the
- // change.
-
- // Inside the update, we also update the FunctionAnalysisManager in the
- // proxy for this particular SCC. We do this as the SCC may have changed and
- // as we're going to mutate this particular function we want to make sure
- // the proxy is in place to forward any invalidation events.
- LazyCallGraph::SCC *OldC = C;
+ "Cannot put cause a function to become dead twice!");
+ DeadFunctions.push_back(&Callee);
+ CalleeWasDeleted = true;
+ }
+ }
+ if (CalleeWasDeleted)
+ Advice->recordInliningWithCalleeDeleted();
+ else
+ Advice->recordInlining();
+ }
+
+ // Back the call index up by one to put us in a good position to go around
+ // the outer loop.
+ --I;
+
+ if (!DidInline)
+ continue;
+ Changed = true;
+
+ // At this point, since we have made changes we have at least removed
+ // a call instruction. However, in the process we do some incremental
+ // simplification of the surrounding code. This simplification can
+ // essentially do all of the same things as a function pass and we can
+ // re-use the exact same logic for updating the call graph to reflect the
+ // change.
+
+ // Inside the update, we also update the FunctionAnalysisManager in the
+ // proxy for this particular SCC. We do this as the SCC may have changed and
+ // as we're going to mutate this particular function we want to make sure
+ // the proxy is in place to forward any invalidation events.
+ LazyCallGraph::SCC *OldC = C;
C = &updateCGAndAnalysisManagerForCGSCCPass(CG, *C, N, AM, UR, FAM);
- LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
-
- // If this causes an SCC to split apart into multiple smaller SCCs, there
- // is a subtle risk we need to prepare for. Other transformations may
- // expose an "infinite inlining" opportunity later, and because of the SCC
- // mutation, we will revisit this function and potentially re-inline. If we
- // do, and that re-inlining also has the potential to mutate the SCC
- // structure, the infinite inlining problem can manifest through infinite
- // SCC splits and merges. To avoid this, we capture the originating caller
- // node and the SCC containing the call edge. This is a slight over
- // approximation of the possible inlining decisions that must be avoided,
- // but is relatively efficient to store. We use C != OldC to know when
- // a new SCC is generated and the original SCC may be generated via merge
- // in later iterations.
- //
- // It is also possible that even if no new SCC is generated
- // (i.e., C == OldC), the original SCC could be split and then merged
- // into the same one as itself, and the original SCC will be added into
- // UR.CWorklist again; we want to catch such cases too.
- //
- // FIXME: This seems like a very heavyweight way of retaining the inline
- // history, we should look for a more efficient way of tracking it.
- if ((C != OldC || UR.CWorklist.count(OldC)) &&
- llvm::any_of(InlinedCallees, [&](Function *Callee) {
- return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
- })) {
- LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
- "retaining this to avoid infinite inlining.\n");
- UR.InlinedInternalEdges.insert({&N, OldC});
- }
- InlinedCallees.clear();
- }
-
- // Now that we've finished inlining all of the calls across this SCC, delete
- // all of the trivially dead functions, updating the call graph and the CGSCC
- // pass manager in the process.
- //
- // Note that this walks a pointer set which has non-deterministic order but
- // that is OK as all we do is delete things and add pointers to unordered
- // sets.
- for (Function *DeadF : DeadFunctions) {
- // Get the necessary information out of the call graph and nuke the
- // function there. Also, clear out any cached analyses.
- auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
- FAM.clear(*DeadF, DeadF->getName());
- AM.clear(DeadC, DeadC.getName());
- auto &DeadRC = DeadC.getOuterRefSCC();
- CG.removeDeadFunction(*DeadF);
-
- // Mark the relevant parts of the call graph as invalid so we don't visit
- // them.
- UR.InvalidatedSCCs.insert(&DeadC);
- UR.InvalidatedRefSCCs.insert(&DeadRC);
-
- // And delete the actual function from the module.
- // The Advisor may use Function pointers to efficiently index various
- // internal maps, e.g. for memoization. Function cleanup passes like
- // argument promotion create new functions. It is possible for a new
- // function to be allocated at the address of a deleted function. We could
- // index using names, but that's inefficient. Alternatively, we let the
- // Advisor free the functions when it sees fit.
- DeadF->getBasicBlockList().clear();
- M.getFunctionList().remove(DeadF);
-
- ++NumDeleted;
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- // Even if we change the IR, we update the core CGSCC data structures and so
- // can preserve the proxy to the function analysis manager.
- PreservedAnalyses PA;
- PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
- return PA;
-}
-
-ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
- bool Debugging,
+ LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
+
+ // If this causes an SCC to split apart into multiple smaller SCCs, there
+ // is a subtle risk we need to prepare for. Other transformations may
+ // expose an "infinite inlining" opportunity later, and because of the SCC
+ // mutation, we will revisit this function and potentially re-inline. If we
+ // do, and that re-inlining also has the potential to mutate the SCC
+ // structure, the infinite inlining problem can manifest through infinite
+ // SCC splits and merges. To avoid this, we capture the originating caller
+ // node and the SCC containing the call edge. This is a slight over
+ // approximation of the possible inlining decisions that must be avoided,
+ // but is relatively efficient to store. We use C != OldC to know when
+ // a new SCC is generated and the original SCC may be generated via merge
+ // in later iterations.
+ //
+ // It is also possible that even if no new SCC is generated
+ // (i.e., C == OldC), the original SCC could be split and then merged
+ // into the same one as itself, and the original SCC will be added into
+ // UR.CWorklist again; we want to catch such cases too.
+ //
+ // FIXME: This seems like a very heavyweight way of retaining the inline
+ // history, we should look for a more efficient way of tracking it.
+ if ((C != OldC || UR.CWorklist.count(OldC)) &&
+ llvm::any_of(InlinedCallees, [&](Function *Callee) {
+ return CG.lookupSCC(*CG.lookup(*Callee)) == OldC;
+ })) {
+ LLVM_DEBUG(dbgs() << "Inlined an internal call edge and split an SCC, "
+ "retaining this to avoid infinite inlining.\n");
+ UR.InlinedInternalEdges.insert({&N, OldC});
+ }
+ InlinedCallees.clear();
+ }
+
+ // Now that we've finished inlining all of the calls across this SCC, delete
+ // all of the trivially dead functions, updating the call graph and the CGSCC
+ // pass manager in the process.
+ //
+ // Note that this walks a pointer set which has non-deterministic order but
+ // that is OK as all we do is delete things and add pointers to unordered
+ // sets.
+ for (Function *DeadF : DeadFunctions) {
+ // Get the necessary information out of the call graph and nuke the
+ // function there. Also, clear out any cached analyses.
+ auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
+ FAM.clear(*DeadF, DeadF->getName());
+ AM.clear(DeadC, DeadC.getName());
+ auto &DeadRC = DeadC.getOuterRefSCC();
+ CG.removeDeadFunction(*DeadF);
+
+ // Mark the relevant parts of the call graph as invalid so we don't visit
+ // them.
+ UR.InvalidatedSCCs.insert(&DeadC);
+ UR.InvalidatedRefSCCs.insert(&DeadRC);
+
+ // And delete the actual function from the module.
+ // The Advisor may use Function pointers to efficiently index various
+ // internal maps, e.g. for memoization. Function cleanup passes like
+ // argument promotion create new functions. It is possible for a new
+ // function to be allocated at the address of a deleted function. We could
+ // index using names, but that's inefficient. Alternatively, we let the
+ // Advisor free the functions when it sees fit.
+ DeadF->getBasicBlockList().clear();
+ M.getFunctionList().remove(DeadF);
+
+ ++NumDeleted;
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // Even if we change the IR, we update the core CGSCC data structures and so
+ // can preserve the proxy to the function analysis manager.
+ PreservedAnalyses PA;
+ PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
+ return PA;
+}
+
+ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
+ bool Debugging,
bool MandatoryFirst,
- InliningAdvisorMode Mode,
- unsigned MaxDevirtIterations)
- : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
- PM(Debugging), MPM(Debugging) {
- // Run the inliner first. The theory is that we are walking bottom-up and so
- // the callees have already been fully optimized, and we want to inline them
- // into the callers so that our optimizations can reflect that.
- // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
- // because it makes profile annotation in the backend inaccurate.
+ InliningAdvisorMode Mode,
+ unsigned MaxDevirtIterations)
+ : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
+ PM(Debugging), MPM(Debugging) {
+ // Run the inliner first. The theory is that we are walking bottom-up and so
+ // the callees have already been fully optimized, and we want to inline them
+ // into the callers so that our optimizations can reflect that.
+ // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
+ // because it makes profile annotation in the backend inaccurate.
if (MandatoryFirst)
PM.addPass(InlinerPass(/*OnlyMandatory*/ true));
- PM.addPass(InlinerPass());
-}
-
-PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
+ PM.addPass(InlinerPass());
+}
+
+PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
if (!IAA.tryCreate(Params, Mode, CGSCCInlineReplayFile)) {
- M.getContext().emitError(
- "Could not setup Inlining Advisor for the requested "
- "mode and/or options");
- return PreservedAnalyses::all();
- }
-
- // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
- // to detect when we devirtualize indirect calls and iterate the SCC passes
- // in that case to try and catch knock-on inlining or function attrs
- // opportunities. Then we add it to the module pipeline by walking the SCCs
- // in postorder (or bottom-up).
- // If MaxDevirtIterations is 0, we just don't use the devirtualization
- // wrapper.
- if (MaxDevirtIterations == 0)
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(PM)));
- else
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
- auto Ret = MPM.run(M, MAM);
-
- IAA.clear();
- return Ret;
-}
+ M.getContext().emitError(
+ "Could not setup Inlining Advisor for the requested "
+ "mode and/or options");
+ return PreservedAnalyses::all();
+ }
+
+ // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+ // to detect when we devirtualize indirect calls and iterate the SCC passes
+ // in that case to try and catch knock-on inlining or function attrs
+ // opportunities. Then we add it to the module pipeline by walking the SCCs
+ // in postorder (or bottom-up).
+ // If MaxDevirtIterations is 0, we just don't use the devirtualization
+ // wrapper.
+ if (MaxDevirtIterations == 0)
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(PM)));
+ else
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
+ auto Ret = MPM.run(M, MAM);
+
+ IAA.clear();
+ return Ret;
+}
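+
+// Illustrative sketch (editorial, guarded out of the build): constructing a
+// module pipeline around ModuleInlinerWrapperPass. The helper name and the
+// parameter values below are assumptions chosen only for the example.
+#if 0
+static ModulePassManager buildExampleInlinerPipeline() {
+  ModulePassManager MPM;
+  InlineParams Params = getInlineParams(/*OptLevel=*/2, /*SizeOptLevel=*/0);
+  MPM.addPass(ModuleInlinerWrapperPass(Params, /*Debugging=*/false,
+                                       /*MandatoryFirst=*/true,
+                                       InliningAdvisorMode::Default,
+                                       /*MaxDevirtIterations=*/4));
+  return MPM;
+}
+#endif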
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
index 77c13436a5..e1644819af 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/Internalize.cpp
@@ -1,291 +1,291 @@
-//===-- Internalize.cpp - Mark functions internal -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loops over all of the functions and variables in the input module.
-// If the function or variable does not need to be preserved according to the
-// client supplied callback, it is marked as internal.
-//
-// This transformation would not be legal in a regular compilation, but it gets
-// extra information from the linker about what is safe.
-//
-// For example, internalizing a function with external linkage is safe only if
-// we are told that it is used solely from within this module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/Internalize.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "internalize"
-
-STATISTIC(NumAliases, "Number of aliases internalized");
-STATISTIC(NumFunctions, "Number of functions internalized");
-STATISTIC(NumGlobals, "Number of global vars internalized");
-
-// APIFile - A file which contains a list of symbols that should not be marked
-// internal.
-static cl::opt<std::string>
- APIFile("internalize-public-api-file", cl::value_desc("filename"),
- cl::desc("A file containing list of symbol names to preserve"));
-
-// APIList - A list of symbols that should not be marked internal.
-static cl::list<std::string>
- APIList("internalize-public-api-list", cl::value_desc("list"),
- cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
-
-namespace {
-// Helper to load an API list to preserve from file and expose it as a functor
-// for internalization.
-class PreserveAPIList {
-public:
- PreserveAPIList() {
- if (!APIFile.empty())
- LoadFile(APIFile);
- ExternalNames.insert(APIList.begin(), APIList.end());
- }
-
- bool operator()(const GlobalValue &GV) {
- return ExternalNames.count(GV.getName());
- }
-
-private:
- // Contains the set of symbols loaded from file
- StringSet<> ExternalNames;
-
- void LoadFile(StringRef Filename) {
- // Load the APIFile...
- ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
- MemoryBuffer::getFile(Filename);
- if (!Buf) {
- errs() << "WARNING: Internalize couldn't load file '" << Filename
- << "'! Continuing as if it's empty.\n";
- return; // Just continue as if the file were empty
- }
- for (line_iterator I(*Buf->get(), true), E; I != E; ++I)
- ExternalNames.insert(*I);
- }
-};
-} // end anonymous namespace
-
-bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
- // Function must be defined here
- if (GV.isDeclaration())
- return true;
-
- // Available externally is really just a "declaration with a body".
- if (GV.hasAvailableExternallyLinkage())
- return true;
-
- // Assume that dllexported symbols are referenced elsewhere
- if (GV.hasDLLExportStorageClass())
- return true;
-
- // Already local, has nothing to do.
- if (GV.hasLocalLinkage())
- return false;
-
- // Check some special cases
- if (AlwaysPreserved.count(GV.getName()))
- return true;
-
- return MustPreserveGV(GV);
-}
-
-bool InternalizePass::maybeInternalize(
- GlobalValue &GV, const DenseSet<const Comdat *> &ExternalComdats) {
- if (Comdat *C = GV.getComdat()) {
- if (ExternalComdats.count(C))
- return false;
-
- // If a comdat is not externally visible we can drop it.
- if (auto GO = dyn_cast<GlobalObject>(&GV))
- GO->setComdat(nullptr);
-
- if (GV.hasLocalLinkage())
- return false;
- } else {
- if (GV.hasLocalLinkage())
- return false;
-
- if (shouldPreserveGV(GV))
- return false;
- }
-
- GV.setVisibility(GlobalValue::DefaultVisibility);
- GV.setLinkage(GlobalValue::InternalLinkage);
- return true;
-}
-
-// If GV is part of a comdat and is externally visible, keep track of its
-// comdat so that we don't internalize any of its members.
-void InternalizePass::checkComdatVisibility(
- GlobalValue &GV, DenseSet<const Comdat *> &ExternalComdats) {
- Comdat *C = GV.getComdat();
- if (!C)
- return;
-
- if (shouldPreserveGV(GV))
- ExternalComdats.insert(C);
-}
-
-bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
- bool Changed = false;
- CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
-
- SmallPtrSet<GlobalValue *, 8> Used;
- collectUsedGlobalVariables(M, Used, false);
-
- // Collect comdat visibility information for the module.
- DenseSet<const Comdat *> ExternalComdats;
- if (!M.getComdatSymbolTable().empty()) {
- for (Function &F : M)
- checkComdatVisibility(F, ExternalComdats);
- for (GlobalVariable &GV : M.globals())
- checkComdatVisibility(GV, ExternalComdats);
- for (GlobalAlias &GA : M.aliases())
- checkComdatVisibility(GA, ExternalComdats);
- }
-
- // We must assume that globals in llvm.used have a reference that not even
- // the linker can see, so we don't internalize them.
- // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
- // linker can drop those symbols. If this pass is running as part of LTO,
- // one might think that it could just drop llvm.compiler.used. The problem
- // is that even in LTO llvm doesn't see every reference. For example,
- // we don't see references from function local inline assembly. To be
- // conservative, we internalize symbols in llvm.compiler.used, but we
- // keep llvm.compiler.used so that the symbol is not deleted by llvm.
- for (GlobalValue *V : Used) {
- AlwaysPreserved.insert(V->getName());
- }
-
- // Mark all functions not in the api as internal.
- for (Function &I : M) {
- if (!maybeInternalize(I, ExternalComdats))
- continue;
- Changed = true;
-
- if (ExternalNode)
- // Remove a callgraph edge from the external node to this function.
- ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
-
- ++NumFunctions;
- LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
- }
-
- // Never internalize the llvm.used symbol. It is used to implement
- // attribute((used)).
- // FIXME: Shouldn't this just filter on llvm.metadata section??
- AlwaysPreserved.insert("llvm.used");
- AlwaysPreserved.insert("llvm.compiler.used");
-
- // Never internalize anchors used by the machine module info, else the info
- // won't find them. (see MachineModuleInfo.)
- AlwaysPreserved.insert("llvm.global_ctors");
- AlwaysPreserved.insert("llvm.global_dtors");
- AlwaysPreserved.insert("llvm.global.annotations");
-
- // Never internalize symbols code-gen inserts.
- // FIXME: We should probably add this (and the __stack_chk_guard) via some
- // type of call-back in CodeGen.
- AlwaysPreserved.insert("__stack_chk_fail");
- AlwaysPreserved.insert("__stack_chk_guard");
-
- // Mark all global variables with initializers that are not in the api as
- // internal as well.
- for (auto &GV : M.globals()) {
- if (!maybeInternalize(GV, ExternalComdats))
- continue;
- Changed = true;
-
- ++NumGlobals;
- LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
- }
-
- // Mark all aliases that are not in the api as internal as well.
- for (auto &GA : M.aliases()) {
- if (!maybeInternalize(GA, ExternalComdats))
- continue;
- Changed = true;
-
- ++NumAliases;
- LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
- }
-
- return Changed;
-}
-
-InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
-
-PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<CallGraphAnalysis>();
- return PA;
-}
-
-namespace {
-class InternalizeLegacyPass : public ModulePass {
-  // Client-supplied callback to control whether a symbol must be preserved.
- std::function<bool(const GlobalValue &)> MustPreserveGV;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
-
- InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
- : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
- initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- CallGraphWrapperPass *CGPass =
- getAnalysisIfAvailable<CallGraphWrapperPass>();
- CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
- return internalizeModule(M, MustPreserveGV, CG);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<CallGraphWrapperPass>();
- }
-};
-}
-
-char InternalizeLegacyPass::ID = 0;
-INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
- "Internalize Global Symbols", false, false)
-
-ModulePass *llvm::createInternalizePass() {
- return new InternalizeLegacyPass();
-}
-
-ModulePass *llvm::createInternalizePass(
- std::function<bool(const GlobalValue &)> MustPreserveGV) {
- return new InternalizeLegacyPass(std::move(MustPreserveGV));
-}
+//===-- Internalize.cpp - Mark functions internal -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions and variables in the input module.
+// If the function or variable does not need to be preserved according to the
+// client supplied callback, it is marked as internal.
+//
+// This transformation would not be legal in a regular compilation, but it gets
+// extra information from the linker about what is safe.
+//
+// For example, internalizing a function with external linkage is safe only if
+// we are told that it is used solely from within this module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Internalize.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "internalize"
+
+STATISTIC(NumAliases, "Number of aliases internalized");
+STATISTIC(NumFunctions, "Number of functions internalized");
+STATISTIC(NumGlobals, "Number of global vars internalized");
+
+// APIFile - A file which contains a list of symbols that should not be marked
+// internal (i.e. that should be preserved).
+static cl::opt<std::string>
+ APIFile("internalize-public-api-file", cl::value_desc("filename"),
+ cl::desc("A file containing list of symbol names to preserve"));
+
+// APIList - A list of symbols that should not be marked internal.
+static cl::list<std::string>
+ APIList("internalize-public-api-list", cl::value_desc("list"),
+ cl::desc("A list of symbol names to preserve"), cl::CommaSeparated);
+
+namespace {
+// Helper to load an API list to preserve from file and expose it as a functor
+// for internalization.
+class PreserveAPIList {
+public:
+ PreserveAPIList() {
+ if (!APIFile.empty())
+ LoadFile(APIFile);
+ ExternalNames.insert(APIList.begin(), APIList.end());
+ }
+
+ bool operator()(const GlobalValue &GV) {
+ return ExternalNames.count(GV.getName());
+ }
+
+private:
+ // Contains the set of symbols loaded from file
+ StringSet<> ExternalNames;
+
+ void LoadFile(StringRef Filename) {
+ // Load the APIFile...
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
+ MemoryBuffer::getFile(Filename);
+ if (!Buf) {
+ errs() << "WARNING: Internalize couldn't load file '" << Filename
+ << "'! Continuing as if it's empty.\n";
+ return; // Just continue as if the file were empty
+ }
+ for (line_iterator I(*Buf->get(), true), E; I != E; ++I)
+ ExternalNames.insert(*I);
+ }
+};
+} // end anonymous namespace
+
+bool InternalizePass::shouldPreserveGV(const GlobalValue &GV) {
+ // Function must be defined here
+ if (GV.isDeclaration())
+ return true;
+
+ // Available externally is really just a "declaration with a body".
+ if (GV.hasAvailableExternallyLinkage())
+ return true;
+
+ // Assume that dllexported symbols are referenced elsewhere
+ if (GV.hasDLLExportStorageClass())
+ return true;
+
+ // Already local, has nothing to do.
+ if (GV.hasLocalLinkage())
+ return false;
+
+ // Check some special cases
+ if (AlwaysPreserved.count(GV.getName()))
+ return true;
+
+ return MustPreserveGV(GV);
+}
+
+bool InternalizePass::maybeInternalize(
+ GlobalValue &GV, const DenseSet<const Comdat *> &ExternalComdats) {
+ if (Comdat *C = GV.getComdat()) {
+ if (ExternalComdats.count(C))
+ return false;
+
+ // If a comdat is not externally visible we can drop it.
+ if (auto GO = dyn_cast<GlobalObject>(&GV))
+ GO->setComdat(nullptr);
+
+ if (GV.hasLocalLinkage())
+ return false;
+ } else {
+ if (GV.hasLocalLinkage())
+ return false;
+
+ if (shouldPreserveGV(GV))
+ return false;
+ }
+
+ GV.setVisibility(GlobalValue::DefaultVisibility);
+ GV.setLinkage(GlobalValue::InternalLinkage);
+ return true;
+}
+
+// If GV is part of a comdat and is externally visible, keep track of its
+// comdat so that we don't internalize any of its members.
+void InternalizePass::checkComdatVisibility(
+ GlobalValue &GV, DenseSet<const Comdat *> &ExternalComdats) {
+ Comdat *C = GV.getComdat();
+ if (!C)
+ return;
+
+ if (shouldPreserveGV(GV))
+ ExternalComdats.insert(C);
+}
+
+bool InternalizePass::internalizeModule(Module &M, CallGraph *CG) {
+ bool Changed = false;
+ CallGraphNode *ExternalNode = CG ? CG->getExternalCallingNode() : nullptr;
+
+ SmallPtrSet<GlobalValue *, 8> Used;
+ collectUsedGlobalVariables(M, Used, false);
+
+  // Collect comdat visibility information for the module.
+ DenseSet<const Comdat *> ExternalComdats;
+ if (!M.getComdatSymbolTable().empty()) {
+ for (Function &F : M)
+ checkComdatVisibility(F, ExternalComdats);
+ for (GlobalVariable &GV : M.globals())
+ checkComdatVisibility(GV, ExternalComdats);
+ for (GlobalAlias &GA : M.aliases())
+ checkComdatVisibility(GA, ExternalComdats);
+ }
+
+ // We must assume that globals in llvm.used have a reference that not even
+ // the linker can see, so we don't internalize them.
+ // For llvm.compiler.used the situation is a bit fuzzy. The assembler and
+ // linker can drop those symbols. If this pass is running as part of LTO,
+ // one might think that it could just drop llvm.compiler.used. The problem
+ // is that even in LTO llvm doesn't see every reference. For example,
+ // we don't see references from function local inline assembly. To be
+ // conservative, we internalize symbols in llvm.compiler.used, but we
+ // keep llvm.compiler.used so that the symbol is not deleted by llvm.
+ for (GlobalValue *V : Used) {
+ AlwaysPreserved.insert(V->getName());
+ }
+
+ // Mark all functions not in the api as internal.
+ for (Function &I : M) {
+ if (!maybeInternalize(I, ExternalComdats))
+ continue;
+ Changed = true;
+
+ if (ExternalNode)
+ // Remove a callgraph edge from the external node to this function.
+ ExternalNode->removeOneAbstractEdgeTo((*CG)[&I]);
+
+ ++NumFunctions;
+ LLVM_DEBUG(dbgs() << "Internalizing func " << I.getName() << "\n");
+ }
+
+ // Never internalize the llvm.used symbol. It is used to implement
+ // attribute((used)).
+ // FIXME: Shouldn't this just filter on llvm.metadata section??
+ AlwaysPreserved.insert("llvm.used");
+ AlwaysPreserved.insert("llvm.compiler.used");
+
+ // Never internalize anchors used by the machine module info, else the info
+ // won't find them. (see MachineModuleInfo.)
+ AlwaysPreserved.insert("llvm.global_ctors");
+ AlwaysPreserved.insert("llvm.global_dtors");
+ AlwaysPreserved.insert("llvm.global.annotations");
+
+ // Never internalize symbols code-gen inserts.
+ // FIXME: We should probably add this (and the __stack_chk_guard) via some
+ // type of call-back in CodeGen.
+ AlwaysPreserved.insert("__stack_chk_fail");
+ AlwaysPreserved.insert("__stack_chk_guard");
+
+ // Mark all global variables with initializers that are not in the api as
+ // internal as well.
+ for (auto &GV : M.globals()) {
+ if (!maybeInternalize(GV, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumGlobals;
+ LLVM_DEBUG(dbgs() << "Internalized gvar " << GV.getName() << "\n");
+ }
+
+ // Mark all aliases that are not in the api as internal as well.
+ for (auto &GA : M.aliases()) {
+ if (!maybeInternalize(GA, ExternalComdats))
+ continue;
+ Changed = true;
+
+ ++NumAliases;
+ LLVM_DEBUG(dbgs() << "Internalized alias " << GA.getName() << "\n");
+ }
+
+ return Changed;
+}
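
A minimal caller-side sketch of the llvm.used behaviour handled above, not part of this patch: anything a hypothetical pinSymbol() helper appends to llvm.used ends up in the Used set collected by internalizeModule() and is therefore added to AlwaysPreserved rather than internalized.

    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/ModuleUtils.h"

    // Hypothetical helper: globals placed on llvm.used survive internalization.
    static void pinSymbol(llvm::Module &M, llvm::GlobalValue &GV) {
      llvm::appendToUsed(M, {&GV});
    }
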
+
+InternalizePass::InternalizePass() : MustPreserveGV(PreserveAPIList()) {}
+
+PreservedAnalyses InternalizePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!internalizeModule(M, AM.getCachedResult<CallGraphAnalysis>(M)))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<CallGraphAnalysis>();
+ return PA;
+}
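
For orientation, a small new-pass-manager driver sketch follows. It is illustrative only; runInternalize is a hypothetical function, "main" is an assumed symbol to keep external, and it relies on the callback-taking InternalizePass constructor declared in Internalize.h plus the usual PassBuilder analysis registration.

    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/Analysis/LoopAnalysisManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/IPO/Internalize.h"

    static void runInternalize(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      llvm::ModulePassManager MPM;
      // Preserve only "main"; every other defined symbol becomes internal.
      MPM.addPass(llvm::InternalizePass(
          [](const llvm::GlobalValue &GV) { return GV.getName() == "main"; }));
      MPM.run(M, MAM);
    }
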
+
+namespace {
+class InternalizeLegacyPass : public ModulePass {
+  // Client-supplied callback to control whether a symbol must be preserved.
+ std::function<bool(const GlobalValue &)> MustPreserveGV;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ InternalizeLegacyPass() : ModulePass(ID), MustPreserveGV(PreserveAPIList()) {}
+
+ InternalizeLegacyPass(std::function<bool(const GlobalValue &)> MustPreserveGV)
+ : ModulePass(ID), MustPreserveGV(std::move(MustPreserveGV)) {
+ initializeInternalizeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ CallGraphWrapperPass *CGPass =
+ getAnalysisIfAvailable<CallGraphWrapperPass>();
+ CallGraph *CG = CGPass ? &CGPass->getCallGraph() : nullptr;
+ return internalizeModule(M, MustPreserveGV, CG);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<CallGraphWrapperPass>();
+ }
+};
+}
+
+char InternalizeLegacyPass::ID = 0;
+INITIALIZE_PASS(InternalizeLegacyPass, "internalize",
+ "Internalize Global Symbols", false, false)
+
+ModulePass *llvm::createInternalizePass() {
+ return new InternalizeLegacyPass();
+}
+
+ModulePass *llvm::createInternalizePass(
+ std::function<bool(const GlobalValue &)> MustPreserveGV) {
+ return new InternalizeLegacyPass(std::move(MustPreserveGV));
+}
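
The legacy entry points above can be driven the same way. A hedged sketch of a legacy-pass-manager client (internalizeLegacy is a hypothetical function and ExportedNames an assumed caller-supplied StringSet<>):

    #include "llvm/ADT/StringSet.h"
    #include "llvm/IR/GlobalValue.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    static void internalizeLegacy(llvm::Module &M,
                                  const llvm::StringSet<> &ExportedNames) {
      llvm::legacy::PassManager PM;
      // Keep every symbol whose name appears in ExportedNames.
      PM.add(llvm::createInternalizePass([&](const llvm::GlobalValue &GV) {
        return ExportedNames.count(GV.getName()) > 0;
      }));
      PM.run(M);
    }

When no callback is supplied, the default-constructed pass falls back to the PreserveAPIList functor, which honors the -internalize-public-api-file and -internalize-public-api-list options defined near the top of this file.
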
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
index 79cfa45924..a497c0390b 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/LoopExtractor.cpp
@@ -1,55 +1,55 @@
-//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A pass wrapper around the ExtractLoop() scalar transformation to extract each
-// top-level loop into its own new function. If the loop is the ONLY loop in a
-// given function, it is not touched. This is a pass most useful for debugging
-// via bugpoint.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopExtractor.cpp - Extract each loop into a new function ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A pass wrapper around the ExtractLoop() scalar transformation to extract each
+// top-level loop into its own new function. If the loop is the ONLY loop in a
+// given function, it is not touched. This is a pass most useful for debugging
+// via bugpoint.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/LoopExtractor.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include <fstream>
-#include <set>
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-extract"
-
-STATISTIC(NumExtracted, "Number of loops extracted");
-
-namespace {
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include <fstream>
+#include <set>
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-extract"
+
+STATISTIC(NumExtracted, "Number of loops extracted");
+
+namespace {
struct LoopExtractorLegacyPass : public ModulePass {
static char ID; // Pass identification, replacement for typeid
-
+
unsigned NumLoops;
-
+
explicit LoopExtractorLegacyPass(unsigned NumLoops = ~0)
: ModulePass(ID), NumLoops(NumLoops) {
initializeLoopExtractorLegacyPassPass(*PassRegistry::getPassRegistry());
}
-
+
bool runOnModule(Module &M) override;
-
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(BreakCriticalEdgesID);
AU.addRequired<DominatorTreeWrapperPass>();
@@ -59,7 +59,7 @@ struct LoopExtractorLegacyPass : public ModulePass {
AU.addUsedIfAvailable<AssumptionCacheTracker>();
}
};
-
+
struct LoopExtractor {
explicit LoopExtractor(
unsigned NumLoops,
@@ -70,7 +70,7 @@ struct LoopExtractor {
LookupLoopInfo(LookupLoopInfo),
LookupAssumptionCache(LookupAssumptionCache) {}
bool runOnModule(Module &M);
-
+
private:
// The number of natural loops to extract from the program into functions.
unsigned NumLoops;
@@ -89,35 +89,35 @@ private:
char LoopExtractorLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopExtractorLegacyPass, "loop-extract",
- "Extract loops into new functions", false, false)
-INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ "Extract loops into new functions", false, false)
+INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopExtractorLegacyPass, "loop-extract",
- "Extract loops into new functions", false, false)
-
-namespace {
- /// SingleLoopExtractor - For bugpoint.
+ "Extract loops into new functions", false, false)
+
+namespace {
+ /// SingleLoopExtractor - For bugpoint.
struct SingleLoopExtractor : public LoopExtractorLegacyPass {
static char ID; // Pass identification, replacement for typeid
SingleLoopExtractor() : LoopExtractorLegacyPass(1) {}
};
-} // End anonymous namespace
-
-char SingleLoopExtractor::ID = 0;
-INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
- "Extract at most one loop into a new function", false, false)
-
-// createLoopExtractorPass - This pass extracts all natural loops from the
-// program into a function if it can.
-//
+} // End anonymous namespace
+
+char SingleLoopExtractor::ID = 0;
+INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
+ "Extract at most one loop into a new function", false, false)
+
+// createLoopExtractorPass - This pass extracts all natural loops from the
+// program into a function if it can.
+//
Pass *llvm::createLoopExtractorPass() { return new LoopExtractorLegacyPass(); }
-
+
bool LoopExtractorLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
+ if (skipModule(M))
+ return false;
+
bool Changed = false;
auto LookupDomTree = [this](Function &F) -> DominatorTree & {
return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
@@ -136,132 +136,132 @@ bool LoopExtractorLegacyPass::runOnModule(Module &M) {
}
bool LoopExtractor::runOnModule(Module &M) {
- if (M.empty())
- return false;
-
- if (!NumLoops)
- return false;
-
- bool Changed = false;
-
- // The end of the function list may change (new functions will be added at the
- // end), so we run from the first to the current last.
- auto I = M.begin(), E = --M.end();
- while (true) {
- Function &F = *I;
-
- Changed |= runOnFunction(F);
- if (!NumLoops)
- break;
-
- // If this is the last function.
- if (I == E)
- break;
-
- ++I;
- }
- return Changed;
-}
-
-bool LoopExtractor::runOnFunction(Function &F) {
- // Do not modify `optnone` functions.
- if (F.hasOptNone())
- return false;
-
- if (F.empty())
- return false;
-
- bool Changed = false;
+ if (M.empty())
+ return false;
+
+ if (!NumLoops)
+ return false;
+
+ bool Changed = false;
+
+ // The end of the function list may change (new functions will be added at the
+ // end), so we run from the first to the current last.
+ auto I = M.begin(), E = --M.end();
+ while (true) {
+ Function &F = *I;
+
+ Changed |= runOnFunction(F);
+ if (!NumLoops)
+ break;
+
+ // If this is the last function.
+ if (I == E)
+ break;
+
+ ++I;
+ }
+ return Changed;
+}
+
+bool LoopExtractor::runOnFunction(Function &F) {
+ // Do not modify `optnone` functions.
+ if (F.hasOptNone())
+ return false;
+
+ if (F.empty())
+ return false;
+
+ bool Changed = false;
LoopInfo &LI = LookupLoopInfo(F);
-
- // If there are no loops in the function.
- if (LI.empty())
- return Changed;
-
+
+ // If there are no loops in the function.
+ if (LI.empty())
+ return Changed;
+
DominatorTree &DT = LookupDomTree(F);
-
- // If there is more than one top-level loop in this function, extract all of
- // the loops.
- if (std::next(LI.begin()) != LI.end())
- return Changed | extractLoops(LI.begin(), LI.end(), LI, DT);
-
- // Otherwise there is exactly one top-level loop.
- Loop *TLL = *LI.begin();
-
- // If the loop is in LoopSimplify form, then extract it only if this function
- // is more than a minimal wrapper around the loop.
- if (TLL->isLoopSimplifyForm()) {
- bool ShouldExtractLoop = false;
-
- // Extract the loop if the entry block doesn't branch to the loop header.
- Instruction *EntryTI = F.getEntryBlock().getTerminator();
- if (!isa<BranchInst>(EntryTI) ||
- !cast<BranchInst>(EntryTI)->isUnconditional() ||
- EntryTI->getSuccessor(0) != TLL->getHeader()) {
- ShouldExtractLoop = true;
- } else {
- // Check to see if any exits from the loop are more than just return
- // blocks.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- TLL->getExitBlocks(ExitBlocks);
- for (auto *ExitBlock : ExitBlocks)
- if (!isa<ReturnInst>(ExitBlock->getTerminator())) {
- ShouldExtractLoop = true;
- break;
- }
- }
-
- if (ShouldExtractLoop)
- return Changed | extractLoop(TLL, LI, DT);
- }
-
- // Okay, this function is a minimal container around the specified loop.
- // If we extract the loop, we will continue to just keep extracting it
- // infinitely... so don't extract it. However, if the loop contains any
- // sub-loops, extract them.
- return Changed | extractLoops(TLL->begin(), TLL->end(), LI, DT);
-}
-
-bool LoopExtractor::extractLoops(Loop::iterator From, Loop::iterator To,
- LoopInfo &LI, DominatorTree &DT) {
- bool Changed = false;
- SmallVector<Loop *, 8> Loops;
-
- // Save the list of loops, as it may change.
- Loops.assign(From, To);
- for (Loop *L : Loops) {
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->isLoopSimplifyForm())
- continue;
-
- Changed |= extractLoop(L, LI, DT);
- if (!NumLoops)
- break;
- }
- return Changed;
-}
-
-bool LoopExtractor::extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT) {
- assert(NumLoops != 0);
- Function &Func = *L->getHeader()->getParent();
+
+ // If there is more than one top-level loop in this function, extract all of
+ // the loops.
+ if (std::next(LI.begin()) != LI.end())
+ return Changed | extractLoops(LI.begin(), LI.end(), LI, DT);
+
+ // Otherwise there is exactly one top-level loop.
+ Loop *TLL = *LI.begin();
+
+ // If the loop is in LoopSimplify form, then extract it only if this function
+ // is more than a minimal wrapper around the loop.
+ if (TLL->isLoopSimplifyForm()) {
+ bool ShouldExtractLoop = false;
+
+ // Extract the loop if the entry block doesn't branch to the loop header.
+ Instruction *EntryTI = F.getEntryBlock().getTerminator();
+ if (!isa<BranchInst>(EntryTI) ||
+ !cast<BranchInst>(EntryTI)->isUnconditional() ||
+ EntryTI->getSuccessor(0) != TLL->getHeader()) {
+ ShouldExtractLoop = true;
+ } else {
+ // Check to see if any exits from the loop are more than just return
+ // blocks.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ TLL->getExitBlocks(ExitBlocks);
+ for (auto *ExitBlock : ExitBlocks)
+ if (!isa<ReturnInst>(ExitBlock->getTerminator())) {
+ ShouldExtractLoop = true;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop)
+ return Changed | extractLoop(TLL, LI, DT);
+ }
+
+ // Okay, this function is a minimal container around the specified loop.
+ // If we extract the loop, we will continue to just keep extracting it
+ // infinitely... so don't extract it. However, if the loop contains any
+ // sub-loops, extract them.
+ return Changed | extractLoops(TLL->begin(), TLL->end(), LI, DT);
+}
+
+bool LoopExtractor::extractLoops(Loop::iterator From, Loop::iterator To,
+ LoopInfo &LI, DominatorTree &DT) {
+ bool Changed = false;
+ SmallVector<Loop *, 8> Loops;
+
+ // Save the list of loops, as it may change.
+ Loops.assign(From, To);
+ for (Loop *L : Loops) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ continue;
+
+ Changed |= extractLoop(L, LI, DT);
+ if (!NumLoops)
+ break;
+ }
+ return Changed;
+}
+
+bool LoopExtractor::extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT) {
+ assert(NumLoops != 0);
+ Function &Func = *L->getHeader()->getParent();
AssumptionCache *AC = LookupAssumptionCache(Func);
- CodeExtractorAnalysisCache CEAC(Func);
- CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
- if (Extractor.extractCodeRegion(CEAC)) {
- LI.erase(L);
- --NumLoops;
- ++NumExtracted;
- return true;
- }
- return false;
-}
-
-// createSingleLoopExtractorPass - This pass extracts one natural loop from the
-// program into a function if it can. This is used by bugpoint.
-//
-Pass *llvm::createSingleLoopExtractorPass() {
- return new SingleLoopExtractor();
-}
+ CodeExtractorAnalysisCache CEAC(Func);
+ CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
+ if (Extractor.extractCodeRegion(CEAC)) {
+ LI.erase(L);
+ --NumLoops;
+ ++NumExtracted;
+ return true;
+ }
+ return false;
+}
+
+// createSingleLoopExtractorPass - This pass extracts one natural loop from the
+// program into a function if it can. This is used by bugpoint.
+//
+Pass *llvm::createSingleLoopExtractorPass() {
+ return new SingleLoopExtractor();
+}
PreservedAnalyses LoopExtractorPass::run(Module &M, ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
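
As with the internalize pass, the legacy createLoopExtractorPass() entry point above is the simplest way to drive loop extraction programmatically. An illustrative sketch (extractAllLoops is a hypothetical driver function, not part of this patch); the legacy pass manager schedules the required BreakCriticalEdges and LoopSimplify passes automatically via the getAnalysisUsage() shown above.

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    // Extract every top-level loop in M into its own new function.
    static void extractAllLoops(llvm::Module &M) {
      llvm::legacy::PassManager PM;
      PM.add(llvm::createLoopExtractorPass());
      PM.run(M);
    }
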
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
index 96a4dfd176..8bd3036f1f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -1,2260 +1,2260 @@
-//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
-// It also ensures that globals are properly laid out for the
-// llvm.icall.branch.funnel intrinsic.
-// See http://llvm.org/docs/TypeMetadata.html for more information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/LowerTypeTests.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/PointerUnion.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IR/ModuleSummaryIndexYAML.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/TrailingObjects.h"
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <set>
-#include <string>
-#include <system_error>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace lowertypetests;
-
-#define DEBUG_TYPE "lowertypetests"
-
-STATISTIC(ByteArraySizeBits, "Byte array size in bits");
-STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
-STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
-STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
-STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
-
-static cl::opt<bool> AvoidReuse(
- "lowertypetests-avoid-reuse",
- cl::desc("Try to avoid reuse of byte array addresses using aliases"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<PassSummaryAction> ClSummaryAction(
- "lowertypetests-summary-action",
- cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
- clEnumValN(PassSummaryAction::Import, "import",
- "Import typeid resolutions from summary and globals"),
- clEnumValN(PassSummaryAction::Export, "export",
- "Export typeid resolutions to summary and globals")),
- cl::Hidden);
-
-static cl::opt<std::string> ClReadSummary(
- "lowertypetests-read-summary",
- cl::desc("Read summary from given YAML file before running pass"),
- cl::Hidden);
-
-static cl::opt<std::string> ClWriteSummary(
- "lowertypetests-write-summary",
- cl::desc("Write summary to given YAML file after running pass"),
- cl::Hidden);
-
-bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
- if (Offset < ByteOffset)
- return false;
-
- if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
- return false;
-
- uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
- if (BitOffset >= BitSize)
- return false;
-
- return Bits.count(BitOffset);
-}
-
-void BitSetInfo::print(raw_ostream &OS) const {
- OS << "offset " << ByteOffset << " size " << BitSize << " align "
- << (1 << AlignLog2);
-
- if (isAllOnes()) {
- OS << " all-ones\n";
- return;
- }
-
- OS << " { ";
- for (uint64_t B : Bits)
- OS << B << ' ';
- OS << "}\n";
-}
-
-BitSetInfo BitSetBuilder::build() {
- if (Min > Max)
- Min = 0;
-
- // Normalize each offset against the minimum observed offset, and compute
- // the bitwise OR of each of the offsets. The number of trailing zeros
- // in the mask gives us the log2 of the alignment of all offsets, which
- // allows us to compress the bitset by only storing one bit per aligned
- // address.
- uint64_t Mask = 0;
- for (uint64_t &Offset : Offsets) {
- Offset -= Min;
- Mask |= Offset;
- }
-
- BitSetInfo BSI;
- BSI.ByteOffset = Min;
-
- BSI.AlignLog2 = 0;
- if (Mask != 0)
- BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
-
- // Build the compressed bitset while normalizing the offsets against the
- // computed alignment.
- BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
- for (uint64_t Offset : Offsets) {
- Offset >>= BSI.AlignLog2;
- BSI.Bits.insert(Offset);
- }
-
- return BSI;
-}
-
-void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
- // Create a new fragment to hold the layout for F.
- Fragments.emplace_back();
- std::vector<uint64_t> &Fragment = Fragments.back();
- uint64_t FragmentIndex = Fragments.size() - 1;
-
- for (auto ObjIndex : F) {
- uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
- if (OldFragmentIndex == 0) {
- // We haven't seen this object index before, so just add it to the current
- // fragment.
- Fragment.push_back(ObjIndex);
- } else {
- // This index belongs to an existing fragment. Copy the elements of the
- // old fragment into this one and clear the old fragment. We don't update
- // the fragment map just yet, this ensures that any further references to
- // indices from the old fragment in this fragment do not insert any more
- // indices.
- std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
+//===- LowerTypeTests.cpp - type metadata lowering pass -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers type metadata and calls to the llvm.type.test intrinsic.
+// It also ensures that globals are properly laid out for the
+// llvm.icall.branch.funnel intrinsic.
+// See http://llvm.org/docs/TypeMetadata.html for more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TrailingObjects.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace lowertypetests;
+
+#define DEBUG_TYPE "lowertypetests"
+
+STATISTIC(ByteArraySizeBits, "Byte array size in bits");
+STATISTIC(ByteArraySizeBytes, "Byte array size in bytes");
+STATISTIC(NumByteArraysCreated, "Number of byte arrays created");
+STATISTIC(NumTypeTestCallsLowered, "Number of type test calls lowered");
+STATISTIC(NumTypeIdDisjointSets, "Number of disjoint sets of type identifiers");
+
+static cl::opt<bool> AvoidReuse(
+ "lowertypetests-avoid-reuse",
+ cl::desc("Try to avoid reuse of byte array addresses using aliases"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "lowertypetests-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "lowertypetests-read-summary",
+ cl::desc("Read summary from given YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "lowertypetests-write-summary",
+ cl::desc("Write summary to given YAML file after running pass"),
+ cl::Hidden);
+
+bool BitSetInfo::containsGlobalOffset(uint64_t Offset) const {
+ if (Offset < ByteOffset)
+ return false;
+
+ if ((Offset - ByteOffset) % (uint64_t(1) << AlignLog2) != 0)
+ return false;
+
+ uint64_t BitOffset = (Offset - ByteOffset) >> AlignLog2;
+ if (BitOffset >= BitSize)
+ return false;
+
+ return Bits.count(BitOffset);
+}
+
+void BitSetInfo::print(raw_ostream &OS) const {
+ OS << "offset " << ByteOffset << " size " << BitSize << " align "
+ << (1 << AlignLog2);
+
+ if (isAllOnes()) {
+ OS << " all-ones\n";
+ return;
+ }
+
+ OS << " { ";
+ for (uint64_t B : Bits)
+ OS << B << ' ';
+ OS << "}\n";
+}
+
+BitSetInfo BitSetBuilder::build() {
+ if (Min > Max)
+ Min = 0;
+
+ // Normalize each offset against the minimum observed offset, and compute
+ // the bitwise OR of each of the offsets. The number of trailing zeros
+ // in the mask gives us the log2 of the alignment of all offsets, which
+ // allows us to compress the bitset by only storing one bit per aligned
+ // address.
+ uint64_t Mask = 0;
+ for (uint64_t &Offset : Offsets) {
+ Offset -= Min;
+ Mask |= Offset;
+ }
+
+ BitSetInfo BSI;
+ BSI.ByteOffset = Min;
+
+ BSI.AlignLog2 = 0;
+ if (Mask != 0)
+ BSI.AlignLog2 = countTrailingZeros(Mask, ZB_Undefined);
+
+ // Build the compressed bitset while normalizing the offsets against the
+ // computed alignment.
+ BSI.BitSize = ((Max - Min) >> BSI.AlignLog2) + 1;
+ for (uint64_t Offset : Offsets) {
+ Offset >>= BSI.AlignLog2;
+ BSI.Bits.insert(Offset);
+ }
+
+ return BSI;
+}
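
A worked example (editorial, not part of this patch) of the compression build() performs, using the BitSetBuilder/BitSetInfo API declared in LowerTypeTests.h; bitSetExample is a hypothetical test function:

    #include "llvm/Transforms/IPO/LowerTypeTests.h"
    #include <cassert>

    static void bitSetExample() {
      llvm::lowertypetests::BitSetBuilder BSB;
      BSB.addOffset(0);
      BSB.addOffset(8);
      BSB.addOffset(16);
      llvm::lowertypetests::BitSetInfo BSI = BSB.build();
      // Mask = 0 | 8 | 16 = 24, so AlignLog2 = countTrailingZeros(24) = 3,
      // BitSize = ((16 - 0) >> 3) + 1 = 3, Bits = {0, 1, 2}, ByteOffset = 0.
      assert(BSI.AlignLog2 == 3 && BSI.BitSize == 3);
      assert(BSI.containsGlobalOffset(8));
      assert(!BSI.containsGlobalOffset(4)); // 4 is not 8-byte aligned
    }
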
+
+void GlobalLayoutBuilder::addFragment(const std::set<uint64_t> &F) {
+ // Create a new fragment to hold the layout for F.
+ Fragments.emplace_back();
+ std::vector<uint64_t> &Fragment = Fragments.back();
+ uint64_t FragmentIndex = Fragments.size() - 1;
+
+ for (auto ObjIndex : F) {
+ uint64_t OldFragmentIndex = FragmentMap[ObjIndex];
+ if (OldFragmentIndex == 0) {
+ // We haven't seen this object index before, so just add it to the current
+ // fragment.
+ Fragment.push_back(ObjIndex);
+ } else {
+ // This index belongs to an existing fragment. Copy the elements of the
+ // old fragment into this one and clear the old fragment. We don't update
+      // the fragment map just yet; this ensures that any further references to
+ // indices from the old fragment in this fragment do not insert any more
+ // indices.
+ std::vector<uint64_t> &OldFragment = Fragments[OldFragmentIndex];
llvm::append_range(Fragment, OldFragment);
- OldFragment.clear();
- }
- }
-
- // Update the fragment map to point our object indices to this fragment.
- for (uint64_t ObjIndex : Fragment)
- FragmentMap[ObjIndex] = FragmentIndex;
-}
-
-void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
- uint64_t BitSize, uint64_t &AllocByteOffset,
- uint8_t &AllocMask) {
- // Find the smallest current allocation.
- unsigned Bit = 0;
- for (unsigned I = 1; I != BitsPerByte; ++I)
- if (BitAllocs[I] < BitAllocs[Bit])
- Bit = I;
-
- AllocByteOffset = BitAllocs[Bit];
-
- // Add our size to it.
- unsigned ReqSize = AllocByteOffset + BitSize;
- BitAllocs[Bit] = ReqSize;
- if (Bytes.size() < ReqSize)
- Bytes.resize(ReqSize);
-
- // Set our bits.
- AllocMask = 1 << Bit;
- for (uint64_t B : Bits)
- Bytes[AllocByteOffset + B] |= AllocMask;
-}
-
-bool lowertypetests::isJumpTableCanonical(Function *F) {
- if (F->isDeclarationForLinker())
- return false;
- auto *CI = mdconst::extract_or_null<ConstantInt>(
- F->getParent()->getModuleFlag("CFI Canonical Jump Tables"));
- if (!CI || CI->getZExtValue() != 0)
- return true;
- return F->hasFnAttribute("cfi-canonical-jump-table");
-}
-
-namespace {
-
-struct ByteArrayInfo {
- std::set<uint64_t> Bits;
- uint64_t BitSize;
- GlobalVariable *ByteArray;
- GlobalVariable *MaskGlobal;
- uint8_t *MaskPtr = nullptr;
-};
-
-/// A POD-like structure that we use to store a global reference together with
-/// its metadata types. In this pass we frequently need to query the set of
-/// metadata types referenced by a global, which at the IR level is an expensive
-/// operation involving a map lookup; this data structure helps to reduce the
-/// number of times we need to do this lookup.
-class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
- friend TrailingObjects;
-
- GlobalObject *GO;
- size_t NTypes;
-
- // For functions: true if the jump table is canonical. This essentially means
- // whether the canonical address (i.e. the symbol table entry) of the function
- // is provided by the local jump table. This is normally the same as whether
- // the function is defined locally, but if canonical jump tables are disabled
- // by the user then the jump table never provides a canonical definition.
- bool IsJumpTableCanonical;
-
- // For functions: true if this function is either defined or used in a thinlto
- // module and its jumptable entry needs to be exported to thinlto backends.
- bool IsExported;
-
- size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
-
-public:
- static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
- bool IsJumpTableCanonical, bool IsExported,
- ArrayRef<MDNode *> Types) {
- auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
- totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
- GTM->GO = GO;
- GTM->NTypes = Types.size();
- GTM->IsJumpTableCanonical = IsJumpTableCanonical;
- GTM->IsExported = IsExported;
- std::uninitialized_copy(Types.begin(), Types.end(),
- GTM->getTrailingObjects<MDNode *>());
- return GTM;
- }
-
- GlobalObject *getGlobal() const {
- return GO;
- }
-
- bool isJumpTableCanonical() const {
- return IsJumpTableCanonical;
- }
-
- bool isExported() const {
- return IsExported;
- }
-
- ArrayRef<MDNode *> types() const {
- return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
- }
-};
-
-struct ICallBranchFunnel final
- : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
- static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
- ArrayRef<GlobalTypeMember *> Targets,
- unsigned UniqueId) {
- auto *Call = static_cast<ICallBranchFunnel *>(
- Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
- alignof(ICallBranchFunnel)));
- Call->CI = CI;
- Call->UniqueId = UniqueId;
- Call->NTargets = Targets.size();
- std::uninitialized_copy(Targets.begin(), Targets.end(),
- Call->getTrailingObjects<GlobalTypeMember *>());
- return Call;
- }
-
- CallInst *CI;
- ArrayRef<GlobalTypeMember *> targets() const {
- return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
- }
-
- unsigned UniqueId;
-
-private:
- size_t NTargets;
-};
-
-struct ScopedSaveAliaseesAndUsed {
- Module &M;
- SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed;
- std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases;
-
- ScopedSaveAliaseesAndUsed(Module &M) : M(M) {
- // The users of this class want to replace all function references except
- // for aliases and llvm.used/llvm.compiler.used with references to a jump
- // table. We avoid replacing aliases in order to avoid introducing a double
- // indirection (or an alias pointing to a declaration in ThinLTO mode), and
- // we avoid replacing llvm.used/llvm.compiler.used because these global
- // variables describe properties of the global, not the jump table (besides,
-    // offset references to the jump table in llvm.used are invalid).
- // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly
- // indirect) users", so what we do is save the list of globals referenced by
- // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW
- // replace the aliasees and then set them back to their original values at
- // the end.
- if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false))
- GV->eraseFromParent();
- if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true))
- GV->eraseFromParent();
-
- for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) {
- // FIXME: This should look past all aliases not just interposable ones,
- // see discussion on D65118.
- if (auto *F =
- dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts()))
- FunctionAliases.push_back({&GIS, F});
- }
- }
-
- ~ScopedSaveAliaseesAndUsed() {
- appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
- appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
- CompilerUsed.end()));
-
- for (auto P : FunctionAliases)
- P.first->setIndirectSymbol(
- ConstantExpr::getBitCast(P.second, P.first->getType()));
- }
-};
-
-class LowerTypeTestsModule {
- Module &M;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
- // Set when the client has invoked this to simply drop all type test assume
- // sequences.
- bool DropTypeTests;
-
- Triple::ArchType Arch;
- Triple::OSType OS;
- Triple::ObjectFormatType ObjectFormat;
-
- IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
- IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
- PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
- ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
- IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
- PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
- IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
- IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
-
- // Indirect function call index assignment counter for WebAssembly
- uint64_t IndirectIndex = 1;
-
- // Mapping from type identifiers to the call sites that test them, as well as
- // whether the type identifier needs to be exported to ThinLTO backends as
- // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
- struct TypeIdUserInfo {
- std::vector<CallInst *> CallSites;
- bool IsExported = false;
- };
- DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
-
- /// This structure describes how to lower type tests for a particular type
- /// identifier. It is either built directly from the global analysis (during
- /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
- /// identifier summaries and external symbol references (in ThinLTO backends).
- struct TypeIdLowering {
- TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
-
- /// All except Unsat: the start address within the combined global.
- Constant *OffsetedGlobal;
-
- /// ByteArray, Inline, AllOnes: log2 of the required global alignment
- /// relative to the start address.
- Constant *AlignLog2;
-
- /// ByteArray, Inline, AllOnes: one less than the size of the memory region
- /// covering members of this type identifier as a multiple of 2^AlignLog2.
- Constant *SizeM1;
-
- /// ByteArray: the byte array to test the address against.
- Constant *TheByteArray;
-
- /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
- Constant *BitMask;
-
- /// Inline: the bit mask to test the address against.
- Constant *InlineBits;
- };
-
- std::vector<ByteArrayInfo> ByteArrayInfos;
-
- Function *WeakInitializerFn = nullptr;
-
- bool shouldExportConstantsAsAbsoluteSymbols();
- uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
- TypeIdLowering importTypeId(StringRef TypeId);
- void importTypeTest(CallInst *CI);
- void importFunction(Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase);
-
- BitSetInfo
- buildBitSet(Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
- ByteArrayInfo *createByteArray(BitSetInfo &BSI);
- void allocateByteArrays();
- Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
- Value *BitOffset);
- void lowerTypeTestCalls(
- ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
- Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
- const TypeIdLowering &TIL);
-
- void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals);
- unsigned getJumpTableEntrySize();
- Type *getJumpTableEntryType();
- void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
- Triple::ArchType JumpTableArch,
- SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
- void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
- void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Functions);
- void
- buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
- ArrayRef<GlobalTypeMember *> Globals,
- ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
-
- void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT,
- bool IsJumpTableCanonical);
- void moveInitializerToModuleConstructor(GlobalVariable *GV);
- void findGlobalVariableUsersOf(Constant *C,
- SmallSetVector<GlobalVariable *, 8> &Out);
-
- void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
-
- /// replaceCfiUses - Go through the uses list for this definition
- /// and make each use point to "V" instead of "this" when the use is outside
- /// the block. 'This's use list is expected to have at least one element.
- /// Unlike replaceAllUsesWith this function skips blockaddr and direct call
- /// uses.
- void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical);
-
- /// replaceDirectCalls - Go through the uses list for this definition and
- /// replace each use, which is a direct function call.
- void replaceDirectCalls(Value *Old, Value *New);
-
-public:
- LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary,
- bool DropTypeTests);
-
- bool lower();
-
- // Lower the module using the action and summary passed as command line
- // arguments. For testing purposes only.
- static bool runForTesting(Module &M);
-};
-
-struct LowerTypeTests : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
- bool DropTypeTests;
-
- LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- LowerTypeTests(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {
- initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (UseCommandLine)
- return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
- .lower();
- }
-};
-
-} // end anonymous namespace
-
-char LowerTypeTests::ID = 0;
-
-INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
- false)
-
-ModulePass *
-llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary,
- bool DropTypeTests) {
- return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
-}
-
-/// Build a bit set for TypeId using the object layouts in
-/// GlobalLayout.
-BitSetInfo LowerTypeTestsModule::buildBitSet(
- Metadata *TypeId,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
- BitSetBuilder BSB;
-
- // Compute the byte offset of each address associated with this type
- // identifier.
- for (auto &GlobalAndOffset : GlobalLayout) {
- for (MDNode *Type : GlobalAndOffset.first->types()) {
- if (Type->getOperand(1) != TypeId)
- continue;
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- BSB.addOffset(GlobalAndOffset.second + Offset);
- }
- }
-
- return BSB.build();
-}
-
-/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
-/// Bits. This pattern matches to the bt instruction on x86.
-static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
- Value *BitOffset) {
- auto BitsType = cast<IntegerType>(Bits->getType());
- unsigned BitWidth = BitsType->getBitWidth();
-
- BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
- Value *BitIndex =
- B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
- Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
- Value *MaskedBits = B.CreateAnd(Bits, BitMask);
- return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
-}
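
A plain C++ mirror (editorial sketch, hypothetical maskedBitTest helper) of the IR the function above emits, to make the modulo-bit-width probe concrete:

    #include <cstdint>

    // For a 32-bit Bits constant and BitOffset = 37: BitIndex = 37 & 31 = 5,
    // BitMask = 1 << 5, so the test reads bit 5 of Bits - the same single-bit
    // probe that x86 lowers to a BT instruction.
    static bool maskedBitTest(uint32_t Bits, uint64_t BitOffset) {
      uint32_t BitIndex = static_cast<uint32_t>(BitOffset) & 31u;
      uint32_t BitMask = 1u << BitIndex;
      return (Bits & BitMask) != 0;
    }
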
-
-ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
- // Create globals to stand in for byte arrays and masks. These never actually
- // get initialized, we RAUW and erase them later in allocateByteArrays() once
- // we know the offset and mask to use.
- auto ByteArrayGlobal = new GlobalVariable(
- M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
- auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
- GlobalValue::PrivateLinkage, nullptr);
-
- ByteArrayInfos.emplace_back();
- ByteArrayInfo *BAI = &ByteArrayInfos.back();
-
- BAI->Bits = BSI.Bits;
- BAI->BitSize = BSI.BitSize;
- BAI->ByteArray = ByteArrayGlobal;
- BAI->MaskGlobal = MaskGlobal;
- return BAI;
-}
-
-void LowerTypeTestsModule::allocateByteArrays() {
- llvm::stable_sort(ByteArrayInfos,
- [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
- return BAI1.BitSize > BAI2.BitSize;
- });
-
- std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
-
- ByteArrayBuilder BAB;
- for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
- ByteArrayInfo *BAI = &ByteArrayInfos[I];
-
- uint8_t Mask;
- BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
-
- BAI->MaskGlobal->replaceAllUsesWith(
- ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
- BAI->MaskGlobal->eraseFromParent();
- if (BAI->MaskPtr)
- *BAI->MaskPtr = Mask;
- }
-
- Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
- auto ByteArray =
- new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
- GlobalValue::PrivateLinkage, ByteArrayConst);
-
- for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
- ByteArrayInfo *BAI = &ByteArrayInfos[I];
-
- Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
- ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
- Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(
- ByteArrayConst->getType(), ByteArray, Idxs);
-
- // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
- // that the pc-relative displacement is folded into the lea instead of the
- // test instruction getting another displacement.
- GlobalAlias *Alias = GlobalAlias::create(
- Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
- BAI->ByteArray->replaceAllUsesWith(Alias);
- BAI->ByteArray->eraseFromParent();
- }
-
- ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
- BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
- BAB.BitAllocs[6] + BAB.BitAllocs[7];
- ByteArraySizeBytes = BAB.Bytes.size();
-}
-
-/// Build a test that bit BitOffset is set in the type identifier that was
-/// lowered to TIL, which must be either an Inline or a ByteArray.
-Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
- const TypeIdLowering &TIL,
- Value *BitOffset) {
- if (TIL.TheKind == TypeTestResolution::Inline) {
- // If the bit set is sufficiently small, we can avoid a load by bit testing
- // a constant.
- return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
- } else {
- Constant *ByteArray = TIL.TheByteArray;
- if (AvoidReuse && !ImportSummary) {
- // Each use of the byte array uses a different alias. This makes the
- // backend less likely to reuse previously computed byte array addresses,
- // improving the security of the CFI mechanism based on this pass.
- // This won't work when importing because TheByteArray is external.
- ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
- "bits_use", ByteArray, &M);
- }
-
- Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
- Value *Byte = B.CreateLoad(Int8Ty, ByteAddr);
-
- Value *ByteAndMask =
- B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
- return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
- }
-}
-
-static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
- Value *V, uint64_t COffset) {
- if (auto GV = dyn_cast<GlobalObject>(V)) {
- SmallVector<MDNode *, 2> Types;
- GV->getMetadata(LLVMContext::MD_type, Types);
- for (MDNode *Type : Types) {
- if (Type->getOperand(1) != TypeId)
- continue;
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
- if (COffset == Offset)
- return true;
- }
- return false;
- }
-
- if (auto GEP = dyn_cast<GEPOperator>(V)) {
- APInt APOffset(DL.getPointerSizeInBits(0), 0);
- bool Result = GEP->accumulateConstantOffset(DL, APOffset);
- if (!Result)
- return false;
- COffset += APOffset.getZExtValue();
- return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
- }
-
- if (auto Op = dyn_cast<Operator>(V)) {
- if (Op->getOpcode() == Instruction::BitCast)
- return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
-
- if (Op->getOpcode() == Instruction::Select)
- return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
- isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
- }
-
- return false;
-}
-
-/// Lower a llvm.type.test call to its implementation. Returns the value to
-/// replace the call with.
-Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
- const TypeIdLowering &TIL) {
- // Delay lowering if the resolution is currently unknown.
- if (TIL.TheKind == TypeTestResolution::Unknown)
- return nullptr;
- if (TIL.TheKind == TypeTestResolution::Unsat)
- return ConstantInt::getFalse(M.getContext());
-
- Value *Ptr = CI->getArgOperand(0);
- const DataLayout &DL = M.getDataLayout();
- if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
- return ConstantInt::getTrue(M.getContext());
-
- BasicBlock *InitialBB = CI->getParent();
-
- IRBuilder<> B(CI);
-
- Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
-
- Constant *OffsetedGlobalAsInt =
- ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
- if (TIL.TheKind == TypeTestResolution::Single)
- return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
-
- Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
-
- // We need to check that the offset both falls within our range and is
- // suitably aligned. We can check both properties at the same time by
- // performing a right rotate by log2(alignment) followed by an integer
- // comparison against the bitset size. The rotate will move the lower
- // order bits that need to be zero into the higher order bits of the
- // result, causing the comparison to fail if they are nonzero. The rotate
- // also conveniently gives us a bit offset to use during the load from
- // the bitset.
- Value *OffsetSHR =
- B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
- Value *OffsetSHL = B.CreateShl(
- PtrOffset, ConstantExpr::getZExt(
- ConstantExpr::getSub(
- ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
- TIL.AlignLog2),
- IntPtrTy));
- Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
-
- Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
-
- // If the bit set is all ones, testing against it is unnecessary.
- if (TIL.TheKind == TypeTestResolution::AllOnes)
- return OffsetInRange;
-
- // See if the intrinsic is used in the following common pattern:
- // br(llvm.type.test(...), thenbb, elsebb)
- // where nothing happens between the type test and the br.
- // If so, create slightly simpler IR.
- if (CI->hasOneUse())
- if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
- if (CI->getNextNode() == Br) {
- BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
- BasicBlock *Else = Br->getSuccessor(1);
- BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
- NewBr->setMetadata(LLVMContext::MD_prof,
- Br->getMetadata(LLVMContext::MD_prof));
- ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
-
- // Update phis in Else resulting from InitialBB being split
- for (auto &Phi : Else->phis())
- Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB);
-
- IRBuilder<> ThenB(CI);
- return createBitSetTest(ThenB, TIL, BitOffset);
- }
-
- IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
-
- // Now that we know that the offset is in range and aligned, load the
- // appropriate bit from the bitset.
- Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
-
- // The value we want is 0 if we came directly from the initial block
- // (having failed the range or alignment checks), or the loaded bit if
- // we came from the block in which we loaded it.
- B.SetInsertPoint(CI);
- PHINode *P = B.CreatePHI(Int1Ty, 2);
- P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
- P->addIncoming(Bit, ThenB.GetInsertBlock());
- return P;
-}
-
-/// Given a disjoint set of type identifiers and globals, lay out the globals,
-/// build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
- // Build a new global with the combined contents of the referenced globals.
- // This global is a struct whose even-indexed elements contain the original
- // contents of the referenced globals and whose odd-indexed elements contain
- // any padding required to align the next element to the next power of 2 plus
- // any additional padding required to meet its alignment requirements.
- std::vector<Constant *> GlobalInits;
- const DataLayout &DL = M.getDataLayout();
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
- Align MaxAlign;
- uint64_t CurOffset = 0;
- uint64_t DesiredPadding = 0;
- for (GlobalTypeMember *G : Globals) {
- auto *GV = cast<GlobalVariable>(G->getGlobal());
- Align Alignment =
- DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
- MaxAlign = std::max(MaxAlign, Alignment);
- uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Alignment);
- GlobalLayout[G] = GVOffset;
- if (GVOffset != 0) {
- uint64_t Padding = GVOffset - CurOffset;
- GlobalInits.push_back(
- ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
- }
-
- GlobalInits.push_back(GV->getInitializer());
- uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
- CurOffset = GVOffset + InitSize;
-
- // Compute the amount of padding that we'd like for the next element.
- DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize;
-
-    // Experiments with different caps, using Chromium on both x64 and ARM64,
-    // have shown that the 32-byte cap generates the smallest binary on both
-    // platforms, while the various caps yield similar performance.
- // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
- if (DesiredPadding > 32)
- DesiredPadding = alignTo(InitSize, 32) - InitSize;
- }
-
- Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
- auto *CombinedGlobal =
- new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
- GlobalValue::PrivateLinkage, NewInit);
- CombinedGlobal->setAlignment(MaxAlign);
-
- StructType *NewTy = cast<StructType>(NewInit->getType());
- lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
-
- // Build aliases pointing to offsets into the combined global for each
- // global from which we built the combined global, and replace references
- // to the original globals with references to the aliases.
- for (unsigned I = 0; I != Globals.size(); ++I) {
- GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
-
- // Multiply by 2 to account for padding elements.
- Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, I * 2)};
- Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
- NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
- assert(GV->getType()->getAddressSpace() == 0);
- GlobalAlias *GAlias =
- GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
- "", CombinedGlobalElemPtr, &M);
- GAlias->setVisibility(GV->getVisibility());
- GAlias->takeName(GV);
- GV->replaceAllUsesWith(GAlias);
- GV->eraseFromParent();
- }
-}
-
-bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() {
- return (Arch == Triple::x86 || Arch == Triple::x86_64) &&
- ObjectFormat == Triple::ELF;
-}
-
-/// Export the given type identifier so that ThinLTO backends may import it.
-/// Type identifiers are exported by adding coarse-grained information about how
-/// to test the type identifier to the summary, and creating symbols in the
-/// object file (aliases and absolute symbols) containing fine-grained
-/// information about the type identifier.
-///
-/// Returns a pointer to the location in which to store the bitmask, if
-/// applicable.
-uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId,
- const TypeIdLowering &TIL) {
- TypeTestResolution &TTRes =
- ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
- TTRes.TheKind = TIL.TheKind;
-
- auto ExportGlobal = [&](StringRef Name, Constant *C) {
- GlobalAlias *GA =
- GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
- "__typeid_" + TypeId + "_" + Name, C, &M);
- GA->setVisibility(GlobalValue::HiddenVisibility);
- };
-
- auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) {
- if (shouldExportConstantsAsAbsoluteSymbols())
- ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy));
- else
- Storage = cast<ConstantInt>(C)->getZExtValue();
- };
-
- if (TIL.TheKind != TypeTestResolution::Unsat)
- ExportGlobal("global_addr", TIL.OffsetedGlobal);
-
- if (TIL.TheKind == TypeTestResolution::ByteArray ||
- TIL.TheKind == TypeTestResolution::Inline ||
- TIL.TheKind == TypeTestResolution::AllOnes) {
- ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2);
- ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1);
-
- uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
- if (TIL.TheKind == TypeTestResolution::Inline)
- TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
- else
- TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
- }
-
- if (TIL.TheKind == TypeTestResolution::ByteArray) {
- ExportGlobal("byte_array", TIL.TheByteArray);
- if (shouldExportConstantsAsAbsoluteSymbols())
- ExportGlobal("bit_mask", TIL.BitMask);
- else
- return &TTRes.BitMask;
- }
-
- if (TIL.TheKind == TypeTestResolution::Inline)
- ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits);
-
- return nullptr;
-}
-
-LowerTypeTestsModule::TypeIdLowering
-LowerTypeTestsModule::importTypeId(StringRef TypeId) {
- const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
- if (!TidSummary)
- return {}; // Unsat: no globals match this type id.
- const TypeTestResolution &TTRes = TidSummary->TTRes;
-
- TypeIdLowering TIL;
- TIL.TheKind = TTRes.TheKind;
-
- auto ImportGlobal = [&](StringRef Name) {
- // Give the global a type of length 0 so that it is not assumed not to alias
- // with any other global.
- Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(),
- Int8Arr0Ty);
- if (auto *GV = dyn_cast<GlobalVariable>(C))
- GV->setVisibility(GlobalValue::HiddenVisibility);
- C = ConstantExpr::getBitCast(C, Int8PtrTy);
- return C;
- };
-
- auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth,
- Type *Ty) {
- if (!shouldExportConstantsAsAbsoluteSymbols()) {
- Constant *C =
- ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const);
- if (!isa<IntegerType>(Ty))
- C = ConstantExpr::getIntToPtr(C, Ty);
- return C;
- }
-
- Constant *C = ImportGlobal(Name);
- auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
- if (isa<IntegerType>(Ty))
- C = ConstantExpr::getPtrToInt(C, Ty);
- if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
- return C;
-
- auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(M.getContext(), {MinC, MaxC}));
- };
- if (AbsWidth == IntPtrTy->getBitWidth())
- SetAbsRange(~0ull, ~0ull); // Full set.
- else
- SetAbsRange(0, 1ull << AbsWidth);
- return C;
- };
-
- if (TIL.TheKind != TypeTestResolution::Unsat)
- TIL.OffsetedGlobal = ImportGlobal("global_addr");
-
- if (TIL.TheKind == TypeTestResolution::ByteArray ||
- TIL.TheKind == TypeTestResolution::Inline ||
- TIL.TheKind == TypeTestResolution::AllOnes) {
- TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty);
- TIL.SizeM1 =
- ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy);
- }
-
- if (TIL.TheKind == TypeTestResolution::ByteArray) {
- TIL.TheByteArray = ImportGlobal("byte_array");
- TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy);
- }
-
- if (TIL.TheKind == TypeTestResolution::Inline)
- TIL.InlineBits = ImportConstant(
- "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth,
- TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
-
- return TIL;
-}
-
-void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
- auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!TypeIdMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
-
- auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
- // If this is a local unpromoted type, which doesn't have a metadata string,
- // treat as Unknown and delay lowering, so that we can still utilize it for
- // later optimizations.
- if (!TypeIdStr)
- return;
-
- TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
- Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
- if (Lowered) {
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
- }
-}
-
-// ThinLTO backend: the function F has a jump table entry; update this module
-// accordingly. isJumpTableCanonical describes the type of the jump table entry.
-void LowerTypeTestsModule::importFunction(
- Function *F, bool isJumpTableCanonical,
- std::vector<GlobalAlias *> &AliasesToErase) {
- assert(F->getType()->getAddressSpace() == 0);
-
- GlobalValue::VisibilityTypes Visibility = F->getVisibility();
- std::string Name = std::string(F->getName());
-
- if (F->isDeclarationForLinker() && isJumpTableCanonical) {
-    // Non-dso_local functions may be overridden at run time,
-    // so don't short-circuit them.
- if (F->isDSOLocal()) {
- Function *RealF = Function::Create(F->getFunctionType(),
- GlobalValue::ExternalLinkage,
- F->getAddressSpace(),
- Name + ".cfi", &M);
- RealF->setVisibility(GlobalVariable::HiddenVisibility);
- replaceDirectCalls(F, RealF);
- }
- return;
- }
-
- Function *FDecl;
- if (!isJumpTableCanonical) {
- // Either a declaration of an external function or a reference to a locally
- // defined jump table.
- FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), Name + ".cfi_jt", &M);
- FDecl->setVisibility(GlobalValue::HiddenVisibility);
- } else {
- F->setName(Name + ".cfi");
- F->setLinkage(GlobalValue::ExternalLinkage);
- FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), Name, &M);
- FDecl->setVisibility(Visibility);
- Visibility = GlobalValue::HiddenVisibility;
-
- // Delete aliases pointing to this function, they'll be re-created in the
- // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed
- // will want to reset the aliasees first.
- for (auto &U : F->uses()) {
- if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
- Function *AliasDecl = Function::Create(
- F->getFunctionType(), GlobalValue::ExternalLinkage,
- F->getAddressSpace(), "", &M);
- AliasDecl->takeName(A);
- A->replaceAllUsesWith(AliasDecl);
- AliasesToErase.push_back(A);
- }
- }
- }
-
- if (F->hasExternalWeakLinkage())
- replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical);
- else
- replaceCfiUses(F, FDecl, isJumpTableCanonical);
-
- // Set visibility late because it's used in replaceCfiUses() to determine
-  // whether uses need to be replaced.
- F->setVisibility(Visibility);
-}
-
-void LowerTypeTestsModule::lowerTypeTestCalls(
- ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
- const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
- CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
-
- // For each type identifier in this disjoint set...
- for (Metadata *TypeId : TypeIds) {
- // Build the bitset.
- BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
- LLVM_DEBUG({
- if (auto MDS = dyn_cast<MDString>(TypeId))
- dbgs() << MDS->getString() << ": ";
- else
- dbgs() << "<unnamed>: ";
- BSI.print(dbgs());
- });
-
- ByteArrayInfo *BAI = nullptr;
- TypeIdLowering TIL;
- TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
- Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
- TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
- TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
- if (BSI.isAllOnes()) {
- TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
- : TypeTestResolution::AllOnes;
- } else if (BSI.BitSize <= 64) {
- TIL.TheKind = TypeTestResolution::Inline;
- uint64_t InlineBits = 0;
- for (auto Bit : BSI.Bits)
- InlineBits |= uint64_t(1) << Bit;
- if (InlineBits == 0)
- TIL.TheKind = TypeTestResolution::Unsat;
- else
- TIL.InlineBits = ConstantInt::get(
- (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
- } else {
- TIL.TheKind = TypeTestResolution::ByteArray;
- ++NumByteArraysCreated;
- BAI = createByteArray(BSI);
- TIL.TheByteArray = BAI->ByteArray;
- TIL.BitMask = BAI->MaskGlobal;
- }
-
- TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
-
- if (TIUI.IsExported) {
- uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
- if (BAI)
- BAI->MaskPtr = MaskPtr;
- }
-
- // Lower each call to llvm.type.test for this type identifier.
- for (CallInst *CI : TIUI.CallSites) {
- ++NumTypeTestCallsLowered;
- Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
- if (Lowered) {
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
- }
- }
- }
-}
-
-void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
- if (Type->getNumOperands() != 2)
- report_fatal_error("All operands of type metadata must have 2 elements");
-
- if (GO->isThreadLocal())
- report_fatal_error("Bit set element may not be thread-local");
- if (isa<GlobalVariable>(GO) && GO->hasSection())
- report_fatal_error(
- "A member of a type identifier may not have an explicit section");
-
- // FIXME: We previously checked that global var member of a type identifier
- // must be a definition, but the IR linker may leave type metadata on
- // declarations. We should restore this check after fixing PR31759.
-
- auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
- if (!OffsetConstMD)
- report_fatal_error("Type offset must be a constant");
- auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
- if (!OffsetInt)
- report_fatal_error("Type offset must be an integer constant");
-}
-
-static const unsigned kX86JumpTableEntrySize = 8;
-static const unsigned kARMJumpTableEntrySize = 4;
+ OldFragment.clear();
+ }
+ }
+
+ // Update the fragment map to point our object indices to this fragment.
+ for (uint64_t ObjIndex : Fragment)
+ FragmentMap[ObjIndex] = FragmentIndex;
+}
+
+void ByteArrayBuilder::allocate(const std::set<uint64_t> &Bits,
+ uint64_t BitSize, uint64_t &AllocByteOffset,
+ uint8_t &AllocMask) {
+ // Find the smallest current allocation.
+ unsigned Bit = 0;
+ for (unsigned I = 1; I != BitsPerByte; ++I)
+ if (BitAllocs[I] < BitAllocs[Bit])
+ Bit = I;
+
+ AllocByteOffset = BitAllocs[Bit];
+
+ // Add our size to it.
+ unsigned ReqSize = AllocByteOffset + BitSize;
+ BitAllocs[Bit] = ReqSize;
+ if (Bytes.size() < ReqSize)
+ Bytes.resize(ReqSize);
+
+ // Set our bits.
+ AllocMask = 1 << Bit;
+ for (uint64_t B : Bits)
+ Bytes[AllocByteOffset + B] |= AllocMask;
+}
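
A standalone sketch (plain C++, not part of the pass; every name below is made up) of the allocation scheme above: up to eight bit sets share one byte array, each set claims the bit lane whose running byte count is currently smallest, and membership is later tested as (Bytes[ByteOffset + BitIndex] & Mask) != 0, which is what createBitSetTest() emits further down.

#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Miniature ByteArrayBuilder: 8 independent bit lanes share one byte vector.
struct TinyByteArrayBuilder {
  std::vector<uint8_t> Bytes;
  uint64_t BitAllocs[8] = {};

  void allocate(const std::set<uint64_t> &Bits, uint64_t BitSize,
                uint64_t &ByteOffset, uint8_t &Mask) {
    unsigned Lane = 0;
    for (unsigned I = 1; I != 8; ++I) // lane with the smallest allocation wins
      if (BitAllocs[I] < BitAllocs[Lane])
        Lane = I;
    ByteOffset = BitAllocs[Lane];
    BitAllocs[Lane] = ByteOffset + BitSize;
    if (Bytes.size() < BitAllocs[Lane])
      Bytes.resize(BitAllocs[Lane]);
    Mask = uint8_t(1) << Lane;
    for (uint64_t B : Bits)
      Bytes[ByteOffset + B] |= Mask; // mark the members of this bit set
  }
};

int main() {
  TinyByteArrayBuilder BAB;
  uint64_t Off1, Off2;
  uint8_t Mask1, Mask2;
  BAB.allocate({0, 2, 5}, 6, Off1, Mask1); // first set -> lane 0, mask 0x01
  BAB.allocate({1, 3}, 4, Off2, Mask2);    // second set -> lane 1, mask 0x02
  assert((BAB.Bytes[Off1 + 2] & Mask1) != 0); // offset 2 is in the first set
  assert((BAB.Bytes[Off2 + 2] & Mask2) == 0); // but not in the second one
  return 0;
}
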
+
+bool lowertypetests::isJumpTableCanonical(Function *F) {
+ if (F->isDeclarationForLinker())
+ return false;
+ auto *CI = mdconst::extract_or_null<ConstantInt>(
+ F->getParent()->getModuleFlag("CFI Canonical Jump Tables"));
+ if (!CI || CI->getZExtValue() != 0)
+ return true;
+ return F->hasFnAttribute("cfi-canonical-jump-table");
+}
+
+namespace {
+
+struct ByteArrayInfo {
+ std::set<uint64_t> Bits;
+ uint64_t BitSize;
+ GlobalVariable *ByteArray;
+ GlobalVariable *MaskGlobal;
+ uint8_t *MaskPtr = nullptr;
+};
+
+/// A POD-like structure that we use to store a global reference together with
+/// its metadata types. In this pass we frequently need to query the set of
+/// metadata types referenced by a global, which at the IR level is an expensive
+/// operation involving a map lookup; this data structure helps to reduce the
+/// number of times we need to do this lookup.
+class GlobalTypeMember final : TrailingObjects<GlobalTypeMember, MDNode *> {
+ friend TrailingObjects;
+
+ GlobalObject *GO;
+ size_t NTypes;
+
+ // For functions: true if the jump table is canonical. This essentially means
+ // whether the canonical address (i.e. the symbol table entry) of the function
+ // is provided by the local jump table. This is normally the same as whether
+ // the function is defined locally, but if canonical jump tables are disabled
+ // by the user then the jump table never provides a canonical definition.
+ bool IsJumpTableCanonical;
+
+ // For functions: true if this function is either defined or used in a thinlto
+ // module and its jumptable entry needs to be exported to thinlto backends.
+ bool IsExported;
+
+ size_t numTrailingObjects(OverloadToken<MDNode *>) const { return NTypes; }
+
+public:
+ static GlobalTypeMember *create(BumpPtrAllocator &Alloc, GlobalObject *GO,
+ bool IsJumpTableCanonical, bool IsExported,
+ ArrayRef<MDNode *> Types) {
+ auto *GTM = static_cast<GlobalTypeMember *>(Alloc.Allocate(
+ totalSizeToAlloc<MDNode *>(Types.size()), alignof(GlobalTypeMember)));
+ GTM->GO = GO;
+ GTM->NTypes = Types.size();
+ GTM->IsJumpTableCanonical = IsJumpTableCanonical;
+ GTM->IsExported = IsExported;
+ std::uninitialized_copy(Types.begin(), Types.end(),
+ GTM->getTrailingObjects<MDNode *>());
+ return GTM;
+ }
+
+ GlobalObject *getGlobal() const {
+ return GO;
+ }
+
+ bool isJumpTableCanonical() const {
+ return IsJumpTableCanonical;
+ }
+
+ bool isExported() const {
+ return IsExported;
+ }
+
+ ArrayRef<MDNode *> types() const {
+ return makeArrayRef(getTrailingObjects<MDNode *>(), NTypes);
+ }
+};
+
+struct ICallBranchFunnel final
+ : TrailingObjects<ICallBranchFunnel, GlobalTypeMember *> {
+ static ICallBranchFunnel *create(BumpPtrAllocator &Alloc, CallInst *CI,
+ ArrayRef<GlobalTypeMember *> Targets,
+ unsigned UniqueId) {
+ auto *Call = static_cast<ICallBranchFunnel *>(
+ Alloc.Allocate(totalSizeToAlloc<GlobalTypeMember *>(Targets.size()),
+ alignof(ICallBranchFunnel)));
+ Call->CI = CI;
+ Call->UniqueId = UniqueId;
+ Call->NTargets = Targets.size();
+ std::uninitialized_copy(Targets.begin(), Targets.end(),
+ Call->getTrailingObjects<GlobalTypeMember *>());
+ return Call;
+ }
+
+ CallInst *CI;
+ ArrayRef<GlobalTypeMember *> targets() const {
+ return makeArrayRef(getTrailingObjects<GlobalTypeMember *>(), NTargets);
+ }
+
+ unsigned UniqueId;
+
+private:
+ size_t NTargets;
+};
+
+struct ScopedSaveAliaseesAndUsed {
+ Module &M;
+ SmallPtrSet<GlobalValue *, 16> Used, CompilerUsed;
+ std::vector<std::pair<GlobalIndirectSymbol *, Function *>> FunctionAliases;
+
+ ScopedSaveAliaseesAndUsed(Module &M) : M(M) {
+ // The users of this class want to replace all function references except
+ // for aliases and llvm.used/llvm.compiler.used with references to a jump
+ // table. We avoid replacing aliases in order to avoid introducing a double
+ // indirection (or an alias pointing to a declaration in ThinLTO mode), and
+ // we avoid replacing llvm.used/llvm.compiler.used because these global
+ // variables describe properties of the global, not the jump table (besides,
+    // offset references to the jump table in llvm.used are invalid).
+ // Unfortunately, LLVM doesn't have a "RAUW except for these (possibly
+ // indirect) users", so what we do is save the list of globals referenced by
+ // llvm.used/llvm.compiler.used and aliases, erase the used lists, let RAUW
+ // replace the aliasees and then set them back to their original values at
+ // the end.
+ if (GlobalVariable *GV = collectUsedGlobalVariables(M, Used, false))
+ GV->eraseFromParent();
+ if (GlobalVariable *GV = collectUsedGlobalVariables(M, CompilerUsed, true))
+ GV->eraseFromParent();
+
+ for (auto &GIS : concat<GlobalIndirectSymbol>(M.aliases(), M.ifuncs())) {
+ // FIXME: This should look past all aliases not just interposable ones,
+ // see discussion on D65118.
+ if (auto *F =
+ dyn_cast<Function>(GIS.getIndirectSymbol()->stripPointerCasts()))
+ FunctionAliases.push_back({&GIS, F});
+ }
+ }
+
+ ~ScopedSaveAliaseesAndUsed() {
+ appendToUsed(M, std::vector<GlobalValue *>(Used.begin(), Used.end()));
+ appendToCompilerUsed(M, std::vector<GlobalValue *>(CompilerUsed.begin(),
+ CompilerUsed.end()));
+
+ for (auto P : FunctionAliases)
+ P.first->setIndirectSymbol(
+ ConstantExpr::getBitCast(P.second, P.first->getType()));
+ }
+};
+
+class LowerTypeTestsModule {
+ Module &M;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ // Set when the client has invoked this to simply drop all type test assume
+ // sequences.
+ bool DropTypeTests;
+
+ Triple::ArchType Arch;
+ Triple::OSType OS;
+ Triple::ObjectFormatType ObjectFormat;
+
+ IntegerType *Int1Ty = Type::getInt1Ty(M.getContext());
+ IntegerType *Int8Ty = Type::getInt8Ty(M.getContext());
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
+ ArrayType *Int8Arr0Ty = ArrayType::get(Type::getInt8Ty(M.getContext()), 0);
+ IntegerType *Int32Ty = Type::getInt32Ty(M.getContext());
+ PointerType *Int32PtrTy = PointerType::getUnqual(Int32Ty);
+ IntegerType *Int64Ty = Type::getInt64Ty(M.getContext());
+ IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(M.getContext(), 0);
+
+ // Indirect function call index assignment counter for WebAssembly
+ uint64_t IndirectIndex = 1;
+
+ // Mapping from type identifiers to the call sites that test them, as well as
+ // whether the type identifier needs to be exported to ThinLTO backends as
+ // part of the regular LTO phase of the ThinLTO pipeline (see exportTypeId).
+ struct TypeIdUserInfo {
+ std::vector<CallInst *> CallSites;
+ bool IsExported = false;
+ };
+ DenseMap<Metadata *, TypeIdUserInfo> TypeIdUsers;
+
+ /// This structure describes how to lower type tests for a particular type
+ /// identifier. It is either built directly from the global analysis (during
+ /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
+ /// identifier summaries and external symbol references (in ThinLTO backends).
+ struct TypeIdLowering {
+ TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
+
+ /// All except Unsat: the start address within the combined global.
+ Constant *OffsetedGlobal;
+
+ /// ByteArray, Inline, AllOnes: log2 of the required global alignment
+ /// relative to the start address.
+ Constant *AlignLog2;
+
+ /// ByteArray, Inline, AllOnes: one less than the size of the memory region
+ /// covering members of this type identifier as a multiple of 2^AlignLog2.
+ Constant *SizeM1;
+
+ /// ByteArray: the byte array to test the address against.
+ Constant *TheByteArray;
+
+ /// ByteArray: the bit mask to apply to bytes loaded from the byte array.
+ Constant *BitMask;
+
+ /// Inline: the bit mask to test the address against.
+ Constant *InlineBits;
+ };
+
+ std::vector<ByteArrayInfo> ByteArrayInfos;
+
+ Function *WeakInitializerFn = nullptr;
+
+ bool shouldExportConstantsAsAbsoluteSymbols();
+ uint8_t *exportTypeId(StringRef TypeId, const TypeIdLowering &TIL);
+ TypeIdLowering importTypeId(StringRef TypeId);
+ void importTypeTest(CallInst *CI);
+ void importFunction(Function *F, bool isJumpTableCanonical,
+ std::vector<GlobalAlias *> &AliasesToErase);
+
+ BitSetInfo
+ buildBitSet(Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ ByteArrayInfo *createByteArray(BitSetInfo &BSI);
+ void allocateByteArrays();
+ Value *createBitSetTest(IRBuilder<> &B, const TypeIdLowering &TIL,
+ Value *BitOffset);
+ void lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout);
+ Value *lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL);
+
+ void buildBitSetsFromGlobalVariables(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals);
+ unsigned getJumpTableEntrySize();
+ Type *getJumpTableEntryType();
+ void createJumpTableEntry(raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ Triple::ArchType JumpTableArch,
+ SmallVectorImpl<Value *> &AsmArgs, Function *Dest);
+ void verifyTypeMDNode(GlobalObject *GO, MDNode *Type);
+ void buildBitSetsFromFunctions(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsNative(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void buildBitSetsFromFunctionsWASM(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Functions);
+ void
+ buildBitSetsFromDisjointSet(ArrayRef<Metadata *> TypeIds,
+ ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels);
+
+ void replaceWeakDeclarationWithJumpTablePtr(Function *F, Constant *JT,
+ bool IsJumpTableCanonical);
+ void moveInitializerToModuleConstructor(GlobalVariable *GV);
+ void findGlobalVariableUsersOf(Constant *C,
+ SmallSetVector<GlobalVariable *, 8> &Out);
+
+ void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
+
+ /// replaceCfiUses - Go through the uses list for this definition
+ /// and make each use point to "V" instead of "this" when the use is outside
+ /// the block. 'This's use list is expected to have at least one element.
+ /// Unlike replaceAllUsesWith this function skips blockaddr and direct call
+ /// uses.
+ void replaceCfiUses(Function *Old, Value *New, bool IsJumpTableCanonical);
+
+ /// replaceDirectCalls - Go through the uses list for this definition and
+ /// replace each use, which is a direct function call.
+ void replaceDirectCalls(Value *Old, Value *New);
+
+public:
+ LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests);
+
+ bool lower();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool runForTesting(Module &M);
+};
+
+struct LowerTypeTests : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+ bool DropTypeTests;
+
+ LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {
+ initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (UseCommandLine)
+ return LowerTypeTestsModule::runForTesting(M);
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
+ .lower();
+ }
+};
+
+} // end anonymous namespace
+
+char LowerTypeTests::ID = 0;
+
+INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
+ false)
+
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests) {
+ return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
+}
+
+/// Build a bit set for TypeId using the object layouts in
+/// GlobalLayout.
+BitSetInfo LowerTypeTestsModule::buildBitSet(
+ Metadata *TypeId,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ BitSetBuilder BSB;
+
+ // Compute the byte offset of each address associated with this type
+ // identifier.
+ for (auto &GlobalAndOffset : GlobalLayout) {
+ for (MDNode *Type : GlobalAndOffset.first->types()) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ BSB.addOffset(GlobalAndOffset.second + Offset);
+ }
+ }
+
+ return BSB.build();
+}
+
+/// Build a test that bit BitOffset mod sizeof(Bits)*8 is set in
+/// Bits. This pattern matches to the bt instruction on x86.
+static Value *createMaskedBitTest(IRBuilder<> &B, Value *Bits,
+ Value *BitOffset) {
+ auto BitsType = cast<IntegerType>(Bits->getType());
+ unsigned BitWidth = BitsType->getBitWidth();
+
+ BitOffset = B.CreateZExtOrTrunc(BitOffset, BitsType);
+ Value *BitIndex =
+ B.CreateAnd(BitOffset, ConstantInt::get(BitsType, BitWidth - 1));
+ Value *BitMask = B.CreateShl(ConstantInt::get(BitsType, 1), BitIndex);
+ Value *MaskedBits = B.CreateAnd(Bits, BitMask);
+ return B.CreateICmpNE(MaskedBits, ConstantInt::get(BitsType, 0));
+}
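
A scalar sketch (illustrative only; the helper below is not part of the pass) of what the IR built by createMaskedBitTest computes; on x86 the shift/and/compare sequence typically lowers to a single bt instruction.

#include <cassert>
#include <cstdint>

// Test bit (BitOffset mod BitWidth) of the inline constant Bits.
static bool maskedBitTest(uint64_t Bits, uint64_t BitOffset) {
  const unsigned BitWidth = 64; // width of the Bits constant
  uint64_t BitIndex = BitOffset & (BitWidth - 1);
  uint64_t BitMask = uint64_t(1) << BitIndex;
  return (Bits & BitMask) != 0;
}

int main() {
  uint64_t Bits = 0b1011; // members at bit offsets 0, 1 and 3
  assert(maskedBitTest(Bits, 1));
  assert(!maskedBitTest(Bits, 2));
  assert(maskedBitTest(Bits, 3 + 64)); // offsets wrap modulo the bit width
  return 0;
}
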
+
+ByteArrayInfo *LowerTypeTestsModule::createByteArray(BitSetInfo &BSI) {
+ // Create globals to stand in for byte arrays and masks. These never actually
+ // get initialized, we RAUW and erase them later in allocateByteArrays() once
+ // we know the offset and mask to use.
+ auto ByteArrayGlobal = new GlobalVariable(
+ M, Int8Ty, /*isConstant=*/true, GlobalValue::PrivateLinkage, nullptr);
+ auto MaskGlobal = new GlobalVariable(M, Int8Ty, /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, nullptr);
+
+ ByteArrayInfos.emplace_back();
+ ByteArrayInfo *BAI = &ByteArrayInfos.back();
+
+ BAI->Bits = BSI.Bits;
+ BAI->BitSize = BSI.BitSize;
+ BAI->ByteArray = ByteArrayGlobal;
+ BAI->MaskGlobal = MaskGlobal;
+ return BAI;
+}
+
+void LowerTypeTestsModule::allocateByteArrays() {
+ llvm::stable_sort(ByteArrayInfos,
+ [](const ByteArrayInfo &BAI1, const ByteArrayInfo &BAI2) {
+ return BAI1.BitSize > BAI2.BitSize;
+ });
+
+ std::vector<uint64_t> ByteArrayOffsets(ByteArrayInfos.size());
+
+ ByteArrayBuilder BAB;
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ uint8_t Mask;
+ BAB.allocate(BAI->Bits, BAI->BitSize, ByteArrayOffsets[I], Mask);
+
+ BAI->MaskGlobal->replaceAllUsesWith(
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int8Ty, Mask), Int8PtrTy));
+ BAI->MaskGlobal->eraseFromParent();
+ if (BAI->MaskPtr)
+ *BAI->MaskPtr = Mask;
+ }
+
+ Constant *ByteArrayConst = ConstantDataArray::get(M.getContext(), BAB.Bytes);
+ auto ByteArray =
+ new GlobalVariable(M, ByteArrayConst->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, ByteArrayConst);
+
+ for (unsigned I = 0; I != ByteArrayInfos.size(); ++I) {
+ ByteArrayInfo *BAI = &ByteArrayInfos[I];
+
+ Constant *Idxs[] = {ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, ByteArrayOffsets[I])};
+ Constant *GEP = ConstantExpr::getInBoundsGetElementPtr(
+ ByteArrayConst->getType(), ByteArray, Idxs);
+
+ // Create an alias instead of RAUW'ing the gep directly. On x86 this ensures
+ // that the pc-relative displacement is folded into the lea instead of the
+ // test instruction getting another displacement.
+ GlobalAlias *Alias = GlobalAlias::create(
+ Int8Ty, 0, GlobalValue::PrivateLinkage, "bits", GEP, &M);
+ BAI->ByteArray->replaceAllUsesWith(Alias);
+ BAI->ByteArray->eraseFromParent();
+ }
+
+ ByteArraySizeBits = BAB.BitAllocs[0] + BAB.BitAllocs[1] + BAB.BitAllocs[2] +
+ BAB.BitAllocs[3] + BAB.BitAllocs[4] + BAB.BitAllocs[5] +
+ BAB.BitAllocs[6] + BAB.BitAllocs[7];
+ ByteArraySizeBytes = BAB.Bytes.size();
+}
+
+/// Build a test that bit BitOffset is set in the type identifier that was
+/// lowered to TIL, which must be either an Inline or a ByteArray.
+Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
+ const TypeIdLowering &TIL,
+ Value *BitOffset) {
+ if (TIL.TheKind == TypeTestResolution::Inline) {
+ // If the bit set is sufficiently small, we can avoid a load by bit testing
+ // a constant.
+ return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
+ } else {
+ Constant *ByteArray = TIL.TheByteArray;
+ if (AvoidReuse && !ImportSummary) {
+ // Each use of the byte array uses a different alias. This makes the
+ // backend less likely to reuse previously computed byte array addresses,
+ // improving the security of the CFI mechanism based on this pass.
+ // This won't work when importing because TheByteArray is external.
+ ByteArray = GlobalAlias::create(Int8Ty, 0, GlobalValue::PrivateLinkage,
+ "bits_use", ByteArray, &M);
+ }
+
+ Value *ByteAddr = B.CreateGEP(Int8Ty, ByteArray, BitOffset);
+ Value *Byte = B.CreateLoad(Int8Ty, ByteAddr);
+
+ Value *ByteAndMask =
+ B.CreateAnd(Byte, ConstantExpr::getPtrToInt(TIL.BitMask, Int8Ty));
+ return B.CreateICmpNE(ByteAndMask, ConstantInt::get(Int8Ty, 0));
+ }
+}
+
+static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
+ Value *V, uint64_t COffset) {
+ if (auto GV = dyn_cast<GlobalObject>(V)) {
+ SmallVector<MDNode *, 2> Types;
+ GV->getMetadata(LLVMContext::MD_type, Types);
+ for (MDNode *Type : Types) {
+ if (Type->getOperand(1) != TypeId)
+ continue;
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+ if (COffset == Offset)
+ return true;
+ }
+ return false;
+ }
+
+ if (auto GEP = dyn_cast<GEPOperator>(V)) {
+ APInt APOffset(DL.getPointerSizeInBits(0), 0);
+ bool Result = GEP->accumulateConstantOffset(DL, APOffset);
+ if (!Result)
+ return false;
+ COffset += APOffset.getZExtValue();
+ return isKnownTypeIdMember(TypeId, DL, GEP->getPointerOperand(), COffset);
+ }
+
+ if (auto Op = dyn_cast<Operator>(V)) {
+ if (Op->getOpcode() == Instruction::BitCast)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(0), COffset);
+
+ if (Op->getOpcode() == Instruction::Select)
+ return isKnownTypeIdMember(TypeId, DL, Op->getOperand(1), COffset) &&
+ isKnownTypeIdMember(TypeId, DL, Op->getOperand(2), COffset);
+ }
+
+ return false;
+}
+
+/// Lower a llvm.type.test call to its implementation. Returns the value to
+/// replace the call with.
+Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
+ const TypeIdLowering &TIL) {
+ // Delay lowering if the resolution is currently unknown.
+ if (TIL.TheKind == TypeTestResolution::Unknown)
+ return nullptr;
+ if (TIL.TheKind == TypeTestResolution::Unsat)
+ return ConstantInt::getFalse(M.getContext());
+
+ Value *Ptr = CI->getArgOperand(0);
+ const DataLayout &DL = M.getDataLayout();
+ if (isKnownTypeIdMember(TypeId, DL, Ptr, 0))
+ return ConstantInt::getTrue(M.getContext());
+
+ BasicBlock *InitialBB = CI->getParent();
+
+ IRBuilder<> B(CI);
+
+ Value *PtrAsInt = B.CreatePtrToInt(Ptr, IntPtrTy);
+
+ Constant *OffsetedGlobalAsInt =
+ ConstantExpr::getPtrToInt(TIL.OffsetedGlobal, IntPtrTy);
+ if (TIL.TheKind == TypeTestResolution::Single)
+ return B.CreateICmpEQ(PtrAsInt, OffsetedGlobalAsInt);
+
+ Value *PtrOffset = B.CreateSub(PtrAsInt, OffsetedGlobalAsInt);
+
+ // We need to check that the offset both falls within our range and is
+ // suitably aligned. We can check both properties at the same time by
+ // performing a right rotate by log2(alignment) followed by an integer
+ // comparison against the bitset size. The rotate will move the lower
+ // order bits that need to be zero into the higher order bits of the
+ // result, causing the comparison to fail if they are nonzero. The rotate
+ // also conveniently gives us a bit offset to use during the load from
+ // the bitset.
+ Value *OffsetSHR =
+ B.CreateLShr(PtrOffset, ConstantExpr::getZExt(TIL.AlignLog2, IntPtrTy));
+ Value *OffsetSHL = B.CreateShl(
+ PtrOffset, ConstantExpr::getZExt(
+ ConstantExpr::getSub(
+ ConstantInt::get(Int8Ty, DL.getPointerSizeInBits(0)),
+ TIL.AlignLog2),
+ IntPtrTy));
+ Value *BitOffset = B.CreateOr(OffsetSHR, OffsetSHL);
+
+ Value *OffsetInRange = B.CreateICmpULE(BitOffset, TIL.SizeM1);
+
+ // If the bit set is all ones, testing against it is unnecessary.
+ if (TIL.TheKind == TypeTestResolution::AllOnes)
+ return OffsetInRange;
+
+ // See if the intrinsic is used in the following common pattern:
+ // br(llvm.type.test(...), thenbb, elsebb)
+ // where nothing happens between the type test and the br.
+ // If so, create slightly simpler IR.
+ if (CI->hasOneUse())
+ if (auto *Br = dyn_cast<BranchInst>(*CI->user_begin()))
+ if (CI->getNextNode() == Br) {
+ BasicBlock *Then = InitialBB->splitBasicBlock(CI->getIterator());
+ BasicBlock *Else = Br->getSuccessor(1);
+ BranchInst *NewBr = BranchInst::Create(Then, Else, OffsetInRange);
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ Br->getMetadata(LLVMContext::MD_prof));
+ ReplaceInstWithInst(InitialBB->getTerminator(), NewBr);
+
+ // Update phis in Else resulting from InitialBB being split
+ for (auto &Phi : Else->phis())
+ Phi.addIncoming(Phi.getIncomingValueForBlock(Then), InitialBB);
+
+ IRBuilder<> ThenB(CI);
+ return createBitSetTest(ThenB, TIL, BitOffset);
+ }
+
+ IRBuilder<> ThenB(SplitBlockAndInsertIfThen(OffsetInRange, CI, false));
+
+ // Now that we know that the offset is in range and aligned, load the
+ // appropriate bit from the bitset.
+ Value *Bit = createBitSetTest(ThenB, TIL, BitOffset);
+
+ // The value we want is 0 if we came directly from the initial block
+ // (having failed the range or alignment checks), or the loaded bit if
+ // we came from the block in which we loaded it.
+ B.SetInsertPoint(CI);
+ PHINode *P = B.CreatePHI(Int1Ty, 2);
+ P->addIncoming(ConstantInt::get(Int1Ty, 0), InitialBB);
+ P->addIncoming(Bit, ThenB.GetInsertBlock());
+ return P;
+}
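
The rotate trick described in the comments above can be checked with a small standalone model (illustrative only; offsetInRange and its parameters are stand-ins for the IR values built by lowerTypeTestCall, a 64-bit pointer width is assumed, and AlignLog2 is assumed to be strictly between 0 and the pointer width so that both shifts are well-defined).

#include <cassert>
#include <cstdint>

// Scalar model of the combined range-and-alignment check. AlignLog2 and
// SizeM1 correspond to TIL.AlignLog2 and TIL.SizeM1.
static bool offsetInRange(uint64_t PtrOffset, unsigned AlignLog2,
                          uint64_t SizeM1, uint64_t &BitOffset) {
  const unsigned PtrBits = 64;
  // Right rotate by AlignLog2: any misaligned low bits land in the high bits,
  // so the unsigned comparison against SizeM1 fails for misaligned pointers.
  BitOffset =
      (PtrOffset >> AlignLog2) | (PtrOffset << (PtrBits - AlignLog2));
  return BitOffset <= SizeM1;
}

int main() {
  uint64_t Bit;
  // Four 8-byte aligned members: AlignLog2 = 3, SizeM1 = 3.
  assert(offsetInRange(16, 3, 3, Bit) && Bit == 2); // aligned and in range
  assert(!offsetInRange(40, 3, 3, Bit));            // aligned but out of range
  assert(!offsetInRange(12, 3, 3, Bit));            // misaligned offset
  return 0;
}
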
+
+/// Given a disjoint set of type identifiers and globals, lay out the globals,
+/// build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals) {
+ // Build a new global with the combined contents of the referenced globals.
+ // This global is a struct whose even-indexed elements contain the original
+ // contents of the referenced globals and whose odd-indexed elements contain
+ // any padding required to align the next element to the next power of 2 plus
+ // any additional padding required to meet its alignment requirements.
+ std::vector<Constant *> GlobalInits;
+ const DataLayout &DL = M.getDataLayout();
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ Align MaxAlign;
+ uint64_t CurOffset = 0;
+ uint64_t DesiredPadding = 0;
+ for (GlobalTypeMember *G : Globals) {
+ auto *GV = cast<GlobalVariable>(G->getGlobal());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ MaxAlign = std::max(MaxAlign, Alignment);
+ uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Alignment);
+ GlobalLayout[G] = GVOffset;
+ if (GVOffset != 0) {
+ uint64_t Padding = GVOffset - CurOffset;
+ GlobalInits.push_back(
+ ConstantAggregateZero::get(ArrayType::get(Int8Ty, Padding)));
+ }
+
+ GlobalInits.push_back(GV->getInitializer());
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+ CurOffset = GVOffset + InitSize;
+
+ // Compute the amount of padding that we'd like for the next element.
+ DesiredPadding = NextPowerOf2(InitSize - 1) - InitSize;
+
+    // Experiments with different caps, using Chromium on both x64 and ARM64,
+    // have shown that the 32-byte cap generates the smallest binary on both
+    // platforms, while the various caps yield similar performance.
+ // (see https://lists.llvm.org/pipermail/llvm-dev/2018-July/124694.html)
+ if (DesiredPadding > 32)
+ DesiredPadding = alignTo(InitSize, 32) - InitSize;
+ }
+
+ Constant *NewInit = ConstantStruct::getAnon(M.getContext(), GlobalInits);
+ auto *CombinedGlobal =
+ new GlobalVariable(M, NewInit->getType(), /*isConstant=*/true,
+ GlobalValue::PrivateLinkage, NewInit);
+ CombinedGlobal->setAlignment(MaxAlign);
+
+ StructType *NewTy = cast<StructType>(NewInit->getType());
+ lowerTypeTestCalls(TypeIds, CombinedGlobal, GlobalLayout);
+
+ // Build aliases pointing to offsets into the combined global for each
+ // global from which we built the combined global, and replace references
+ // to the original globals with references to the aliases.
+ for (unsigned I = 0; I != Globals.size(); ++I) {
+ GlobalVariable *GV = cast<GlobalVariable>(Globals[I]->getGlobal());
+
+ // Multiply by 2 to account for padding elements.
+ Constant *CombinedGlobalIdxs[] = {ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, I * 2)};
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getGetElementPtr(
+ NewInit->getType(), CombinedGlobal, CombinedGlobalIdxs);
+ assert(GV->getType()->getAddressSpace() == 0);
+ GlobalAlias *GAlias =
+ GlobalAlias::create(NewTy->getElementType(I * 2), 0, GV->getLinkage(),
+ "", CombinedGlobalElemPtr, &M);
+ GAlias->setVisibility(GV->getVisibility());
+ GAlias->takeName(GV);
+ GV->replaceAllUsesWith(GAlias);
+ GV->eraseFromParent();
+ }
+}
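
The layout arithmetic in the loop above can be traced with a short standalone example (made-up sizes and alignments; alignTo and nextPowerOf2 below imitate the LLVM helpers used by the pass).

#include <cassert>
#include <cstdint>

static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}
static uint64_t nextPowerOf2(uint64_t A) { // smallest power of 2 > A
  uint64_t P = 1;
  while (P <= A)
    P <<= 1;
  return P;
}

int main() {
  // Hypothetical globals as {size, alignment} pairs.
  struct { uint64_t Size, Align; } Globals[] = {{4, 4}, {24, 8}, {1, 1}};
  uint64_t CurOffset = 0, DesiredPadding = 0, Offsets[3];
  for (int I = 0; I != 3; ++I) {
    uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Globals[I].Align);
    Offsets[I] = GVOffset;
    CurOffset = GVOffset + Globals[I].Size;
    DesiredPadding = nextPowerOf2(Globals[I].Size - 1) - Globals[I].Size;
    if (DesiredPadding > 32) // cap chosen from the Chromium experiments
      DesiredPadding = alignTo(Globals[I].Size, 32) - Globals[I].Size;
  }
  assert(Offsets[0] == 0);  // first global, already aligned
  assert(Offsets[1] == 8);  // 8-byte alignment right after the 4-byte global
  assert(Offsets[2] == 40); // the 24-byte global asked for 8 bytes of padding
  return 0;
}
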
+
+bool LowerTypeTestsModule::shouldExportConstantsAsAbsoluteSymbols() {
+ return (Arch == Triple::x86 || Arch == Triple::x86_64) &&
+ ObjectFormat == Triple::ELF;
+}
+
+/// Export the given type identifier so that ThinLTO backends may import it.
+/// Type identifiers are exported by adding coarse-grained information about how
+/// to test the type identifier to the summary, and creating symbols in the
+/// object file (aliases and absolute symbols) containing fine-grained
+/// information about the type identifier.
+///
+/// Returns a pointer to the location in which to store the bitmask, if
+/// applicable.
+uint8_t *LowerTypeTestsModule::exportTypeId(StringRef TypeId,
+ const TypeIdLowering &TIL) {
+ TypeTestResolution &TTRes =
+ ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
+ TTRes.TheKind = TIL.TheKind;
+
+ auto ExportGlobal = [&](StringRef Name, Constant *C) {
+ GlobalAlias *GA =
+ GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ "__typeid_" + TypeId + "_" + Name, C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+ };
+
+ auto ExportConstant = [&](StringRef Name, uint64_t &Storage, Constant *C) {
+ if (shouldExportConstantsAsAbsoluteSymbols())
+ ExportGlobal(Name, ConstantExpr::getIntToPtr(C, Int8PtrTy));
+ else
+ Storage = cast<ConstantInt>(C)->getZExtValue();
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ ExportGlobal("global_addr", TIL.OffsetedGlobal);
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ ExportConstant("align", TTRes.AlignLog2, TIL.AlignLog2);
+ ExportConstant("size_m1", TTRes.SizeM1, TIL.SizeM1);
+
+ uint64_t BitSize = cast<ConstantInt>(TIL.SizeM1)->getZExtValue() + 1;
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TTRes.SizeM1BitWidth = (BitSize <= 32) ? 5 : 6;
+ else
+ TTRes.SizeM1BitWidth = (BitSize <= 128) ? 7 : 32;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ ExportGlobal("byte_array", TIL.TheByteArray);
+ if (shouldExportConstantsAsAbsoluteSymbols())
+ ExportGlobal("bit_mask", TIL.BitMask);
+ else
+ return &TTRes.BitMask;
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ ExportConstant("inline_bits", TTRes.InlineBits, TIL.InlineBits);
+
+ return nullptr;
+}
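
Putting the naming scheme together: a ThinLTO backend importing this type identifier looks for hidden symbols of the form "__typeid_<TypeId>_<field>". The sketch below is illustrative only; "_ZTS1A" is an arbitrary example type id, and which fields actually exist depends on TTRes.TheKind and on shouldExportConstantsAsAbsoluteSymbols().

#include <iostream>
#include <string>

// Compose the symbol name exportTypeId() creates for one field.
static std::string exportedSymbolName(const std::string &TypeId,
                                      const std::string &Field) {
  return "__typeid_" + TypeId + "_" + Field;
}

int main() {
  const char *Fields[] = {"global_addr", "align",    "size_m1",
                          "byte_array",  "bit_mask", "inline_bits"};
  for (const char *F : Fields) // prints e.g. __typeid__ZTS1A_global_addr
    std::cout << exportedSymbolName("_ZTS1A", F) << "\n";
  return 0;
}
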
+
+LowerTypeTestsModule::TypeIdLowering
+LowerTypeTestsModule::importTypeId(StringRef TypeId) {
+ const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+ if (!TidSummary)
+ return {}; // Unsat: no globals match this type id.
+ const TypeTestResolution &TTRes = TidSummary->TTRes;
+
+ TypeIdLowering TIL;
+ TIL.TheKind = TTRes.TheKind;
+
+ auto ImportGlobal = [&](StringRef Name) {
+ // Give the global a type of length 0 so that it is not assumed not to alias
+ // with any other global.
+ Constant *C = M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(),
+ Int8Arr0Ty);
+ if (auto *GV = dyn_cast<GlobalVariable>(C))
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ C = ConstantExpr::getBitCast(C, Int8PtrTy);
+ return C;
+ };
+
+ auto ImportConstant = [&](StringRef Name, uint64_t Const, unsigned AbsWidth,
+ Type *Ty) {
+ if (!shouldExportConstantsAsAbsoluteSymbols()) {
+ Constant *C =
+ ConstantInt::get(isa<IntegerType>(Ty) ? Ty : Int64Ty, Const);
+ if (!isa<IntegerType>(Ty))
+ C = ConstantExpr::getIntToPtr(C, Ty);
+ return C;
+ }
+
+ Constant *C = ImportGlobal(Name);
+ auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+ if (isa<IntegerType>(Ty))
+ C = ConstantExpr::getPtrToInt(C, Ty);
+ if (GV->getMetadata(LLVMContext::MD_absolute_symbol))
+ return C;
+
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+ };
+
+ if (TIL.TheKind != TypeTestResolution::Unsat)
+ TIL.OffsetedGlobal = ImportGlobal("global_addr");
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray ||
+ TIL.TheKind == TypeTestResolution::Inline ||
+ TIL.TheKind == TypeTestResolution::AllOnes) {
+ TIL.AlignLog2 = ImportConstant("align", TTRes.AlignLog2, 8, Int8Ty);
+ TIL.SizeM1 =
+ ImportConstant("size_m1", TTRes.SizeM1, TTRes.SizeM1BitWidth, IntPtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::ByteArray) {
+ TIL.TheByteArray = ImportGlobal("byte_array");
+ TIL.BitMask = ImportConstant("bit_mask", TTRes.BitMask, 8, Int8PtrTy);
+ }
+
+ if (TIL.TheKind == TypeTestResolution::Inline)
+ TIL.InlineBits = ImportConstant(
+ "inline_bits", TTRes.InlineBits, 1 << TTRes.SizeM1BitWidth,
+ TTRes.SizeM1BitWidth <= 5 ? Int32Ty : Int64Ty);
+
+ return TIL;
+}
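
The !absolute_symbol ranges attached by SetAbsRange above can be summarized with a small model (illustrative only; a 64-bit pointer width is assumed): an AbsWidth equal to the pointer width is encoded as the {-1, -1} "full set" sentinel, anything narrower as the half-open range [0, 2^AbsWidth).

#include <cassert>
#include <cstdint>
#include <utility>

// Model of the metadata range for an imported constant of the given width.
static std::pair<uint64_t, uint64_t> absSymbolRange(unsigned AbsWidth) {
  const unsigned PtrBits = 64;  // assumed pointer width
  if (AbsWidth == PtrBits)
    return {~0ull, ~0ull};      // sentinel meaning "full set"
  return {0, 1ull << AbsWidth}; // [0, 2^AbsWidth)
}

int main() {
  auto R = absSymbolRange(8); // e.g. the 8-bit "align" constant
  assert(R.first == 0 && R.second == 256);
  assert(absSymbolRange(64).first == ~0ull); // full pointer width
  return 0;
}
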
+
+void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+
+ auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ // If this is a local unpromoted type, which doesn't have a metadata string,
+ // treat as Unknown and delay lowering, so that we can still utilize it for
+ // later optimizations.
+ if (!TypeIdStr)
+ return;
+
+ TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
+ Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
+}
+
+// ThinLTO backend: the function F has a jump table entry; update this module
+// accordingly. isJumpTableCanonical describes the type of the jump table entry.
+void LowerTypeTestsModule::importFunction(
+ Function *F, bool isJumpTableCanonical,
+ std::vector<GlobalAlias *> &AliasesToErase) {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalValue::VisibilityTypes Visibility = F->getVisibility();
+ std::string Name = std::string(F->getName());
+
+ if (F->isDeclarationForLinker() && isJumpTableCanonical) {
+    // Non-dso_local functions may be overridden at run time,
+    // so don't short-circuit them.
+ if (F->isDSOLocal()) {
+ Function *RealF = Function::Create(F->getFunctionType(),
+ GlobalValue::ExternalLinkage,
+ F->getAddressSpace(),
+ Name + ".cfi", &M);
+ RealF->setVisibility(GlobalVariable::HiddenVisibility);
+ replaceDirectCalls(F, RealF);
+ }
+ return;
+ }
+
+ Function *FDecl;
+ if (!isJumpTableCanonical) {
+ // Either a declaration of an external function or a reference to a locally
+ // defined jump table.
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), Name + ".cfi_jt", &M);
+ FDecl->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ F->setName(Name + ".cfi");
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ FDecl = Function::Create(F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), Name, &M);
+ FDecl->setVisibility(Visibility);
+ Visibility = GlobalValue::HiddenVisibility;
+
+ // Delete aliases pointing to this function, they'll be re-created in the
+ // merged output. Don't do it yet though because ScopedSaveAliaseesAndUsed
+ // will want to reset the aliasees first.
+ for (auto &U : F->uses()) {
+ if (auto *A = dyn_cast<GlobalAlias>(U.getUser())) {
+ Function *AliasDecl = Function::Create(
+ F->getFunctionType(), GlobalValue::ExternalLinkage,
+ F->getAddressSpace(), "", &M);
+ AliasDecl->takeName(A);
+ A->replaceAllUsesWith(AliasDecl);
+ AliasesToErase.push_back(A);
+ }
+ }
+ }
+
+ if (F->hasExternalWeakLinkage())
+ replaceWeakDeclarationWithJumpTablePtr(F, FDecl, isJumpTableCanonical);
+ else
+ replaceCfiUses(F, FDecl, isJumpTableCanonical);
+
+ // Set visibility late because it's used in replaceCfiUses() to determine
+  // whether uses need to be replaced.
+ F->setVisibility(Visibility);
+}
+
+void LowerTypeTestsModule::lowerTypeTestCalls(
+ ArrayRef<Metadata *> TypeIds, Constant *CombinedGlobalAddr,
+ const DenseMap<GlobalTypeMember *, uint64_t> &GlobalLayout) {
+ CombinedGlobalAddr = ConstantExpr::getBitCast(CombinedGlobalAddr, Int8PtrTy);
+
+ // For each type identifier in this disjoint set...
+ for (Metadata *TypeId : TypeIds) {
+ // Build the bitset.
+ BitSetInfo BSI = buildBitSet(TypeId, GlobalLayout);
+ LLVM_DEBUG({
+ if (auto MDS = dyn_cast<MDString>(TypeId))
+ dbgs() << MDS->getString() << ": ";
+ else
+ dbgs() << "<unnamed>: ";
+ BSI.print(dbgs());
+ });
+
+ ByteArrayInfo *BAI = nullptr;
+ TypeIdLowering TIL;
+ TIL.OffsetedGlobal = ConstantExpr::getGetElementPtr(
+ Int8Ty, CombinedGlobalAddr, ConstantInt::get(IntPtrTy, BSI.ByteOffset)),
+ TIL.AlignLog2 = ConstantInt::get(Int8Ty, BSI.AlignLog2);
+ TIL.SizeM1 = ConstantInt::get(IntPtrTy, BSI.BitSize - 1);
+ if (BSI.isAllOnes()) {
+ TIL.TheKind = (BSI.BitSize == 1) ? TypeTestResolution::Single
+ : TypeTestResolution::AllOnes;
+ } else if (BSI.BitSize <= 64) {
+ TIL.TheKind = TypeTestResolution::Inline;
+ uint64_t InlineBits = 0;
+ for (auto Bit : BSI.Bits)
+ InlineBits |= uint64_t(1) << Bit;
+ if (InlineBits == 0)
+ TIL.TheKind = TypeTestResolution::Unsat;
+ else
+ TIL.InlineBits = ConstantInt::get(
+ (BSI.BitSize <= 32) ? Int32Ty : Int64Ty, InlineBits);
+ } else {
+ TIL.TheKind = TypeTestResolution::ByteArray;
+ ++NumByteArraysCreated;
+ BAI = createByteArray(BSI);
+ TIL.TheByteArray = BAI->ByteArray;
+ TIL.BitMask = BAI->MaskGlobal;
+ }
+
+ TypeIdUserInfo &TIUI = TypeIdUsers[TypeId];
+
+ if (TIUI.IsExported) {
+ uint8_t *MaskPtr = exportTypeId(cast<MDString>(TypeId)->getString(), TIL);
+ if (BAI)
+ BAI->MaskPtr = MaskPtr;
+ }
+
+ // Lower each call to llvm.type.test for this type identifier.
+ for (CallInst *CI : TIUI.CallSites) {
+ ++NumTypeTestCallsLowered;
+ Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
+ }
+ }
+}
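
A condensed sketch (illustrative, over a plain std::set instead of the pass's BitSetInfo) of how the resolution kind is chosen above: a single member becomes Single, an all-ones set becomes AllOnes, anything that fits in 64 bits becomes Inline, and everything else falls back to a ByteArray.

#include <cassert>
#include <cstdint>
#include <set>

enum class Kind { Single, AllOnes, Inline, ByteArray, Unsat };

// Mirror of the decision tree in lowerTypeTestCalls.
static Kind chooseKind(const std::set<uint64_t> &Bits, uint64_t BitSize) {
  bool AllOnes = Bits.size() == BitSize; // every offset in range is a member
  if (AllOnes)
    return BitSize == 1 ? Kind::Single : Kind::AllOnes;
  if (BitSize <= 64) {
    uint64_t InlineBits = 0;
    for (uint64_t Bit : Bits)
      InlineBits |= uint64_t(1) << Bit;
    return InlineBits == 0 ? Kind::Unsat : Kind::Inline;
  }
  return Kind::ByteArray;
}

int main() {
  assert(chooseKind({0}, 1) == Kind::Single);         // one member only
  assert(chooseKind({0, 1, 2}, 3) == Kind::AllOnes);  // every slot occupied
  assert(chooseKind({0, 2}, 3) == Kind::Inline);      // sparse but small
  assert(chooseKind({0, 2}, 100) == Kind::ByteArray); // too wide to inline
  return 0;
}
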
+
+void LowerTypeTestsModule::verifyTypeMDNode(GlobalObject *GO, MDNode *Type) {
+ if (Type->getNumOperands() != 2)
+ report_fatal_error("All operands of type metadata must have 2 elements");
+
+ if (GO->isThreadLocal())
+ report_fatal_error("Bit set element may not be thread-local");
+ if (isa<GlobalVariable>(GO) && GO->hasSection())
+ report_fatal_error(
+ "A member of a type identifier may not have an explicit section");
+
+ // FIXME: We previously checked that global var member of a type identifier
+ // must be a definition, but the IR linker may leave type metadata on
+ // declarations. We should restore this check after fixing PR31759.
+
+ auto OffsetConstMD = dyn_cast<ConstantAsMetadata>(Type->getOperand(0));
+ if (!OffsetConstMD)
+ report_fatal_error("Type offset must be a constant");
+ auto OffsetInt = dyn_cast<ConstantInt>(OffsetConstMD->getValue());
+ if (!OffsetInt)
+ report_fatal_error("Type offset must be an integer constant");
+}
+
+static const unsigned kX86JumpTableEntrySize = 8;
+static const unsigned kARMJumpTableEntrySize = 4;
static const unsigned kARMBTIJumpTableEntrySize = 8;
-
-unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
- switch (Arch) {
- case Triple::x86:
- case Triple::x86_64:
- return kX86JumpTableEntrySize;
- case Triple::arm:
- case Triple::thumb:
+
+unsigned LowerTypeTestsModule::getJumpTableEntrySize() {
+ switch (Arch) {
+ case Triple::x86:
+ case Triple::x86_64:
+ return kX86JumpTableEntrySize;
+ case Triple::arm:
+ case Triple::thumb:
return kARMJumpTableEntrySize;
- case Triple::aarch64:
+ case Triple::aarch64:
if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
M.getModuleFlag("branch-target-enforcement")))
if (BTE->getZExtValue())
return kARMBTIJumpTableEntrySize;
- return kARMJumpTableEntrySize;
- default:
- report_fatal_error("Unsupported architecture for jump tables");
- }
-}
-
-// Create a jump table entry for the target. This consists of an instruction
-// sequence containing a relative branch to Dest. Appends inline asm text,
-// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
-void LowerTypeTestsModule::createJumpTableEntry(
- raw_ostream &AsmOS, raw_ostream &ConstraintOS,
- Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs,
- Function *Dest) {
- unsigned ArgIndex = AsmArgs.size();
-
- if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) {
- AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
- AsmOS << "int3\nint3\nint3\n";
+ return kARMJumpTableEntrySize;
+ default:
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+}
+
+// Create a jump table entry for the target. This consists of an instruction
+// sequence containing a relative branch to Dest. Appends inline asm text,
+// constraints and arguments to AsmOS, ConstraintOS and AsmArgs.
+void LowerTypeTestsModule::createJumpTableEntry(
+ raw_ostream &AsmOS, raw_ostream &ConstraintOS,
+ Triple::ArchType JumpTableArch, SmallVectorImpl<Value *> &AsmArgs,
+ Function *Dest) {
+ unsigned ArgIndex = AsmArgs.size();
+
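+  // Each destination is passed through the "s" (symbolic immediate) constraint;
+  // on x86 the ${N:c} operand modifier drops the immediate prefix so the bare
+  // symbol can be used as a branch target.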
+ if (JumpTableArch == Triple::x86 || JumpTableArch == Triple::x86_64) {
+ AsmOS << "jmp ${" << ArgIndex << ":c}@plt\n";
+ AsmOS << "int3\nint3\nint3\n";
} else if (JumpTableArch == Triple::arm) {
- AsmOS << "b $" << ArgIndex << "\n";
+ AsmOS << "b $" << ArgIndex << "\n";
} else if (JumpTableArch == Triple::aarch64) {
if (const auto *BTE = mdconst::extract_or_null<ConstantInt>(
Dest->getParent()->getModuleFlag("branch-target-enforcement")))
if (BTE->getZExtValue())
AsmOS << "bti c\n";
AsmOS << "b $" << ArgIndex << "\n";
- } else if (JumpTableArch == Triple::thumb) {
- AsmOS << "b.w $" << ArgIndex << "\n";
- } else {
- report_fatal_error("Unsupported architecture for jump tables");
- }
-
- ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
- AsmArgs.push_back(Dest);
-}
-
-Type *LowerTypeTestsModule::getJumpTableEntryType() {
- return ArrayType::get(Int8Ty, getJumpTableEntrySize());
-}
-
-/// Given a disjoint set of type identifiers and functions, build the bit sets
-/// and lower the llvm.type.test calls, architecture dependently.
-void LowerTypeTestsModule::buildBitSetsFromFunctions(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
- Arch == Triple::thumb || Arch == Triple::aarch64)
- buildBitSetsFromFunctionsNative(TypeIds, Functions);
- else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
- buildBitSetsFromFunctionsWASM(TypeIds, Functions);
- else
- report_fatal_error("Unsupported architecture for jump tables");
-}
-
-void LowerTypeTestsModule::moveInitializerToModuleConstructor(
- GlobalVariable *GV) {
- if (WeakInitializerFn == nullptr) {
- WeakInitializerFn = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()),
- /* IsVarArg */ false),
- GlobalValue::InternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- "__cfi_global_var_init", &M);
- BasicBlock *BB =
- BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
- ReturnInst::Create(M.getContext(), BB);
- WeakInitializerFn->setSection(
- ObjectFormat == Triple::MachO
- ? "__TEXT,__StaticInit,regular,pure_instructions"
- : ".text.startup");
- // This code is equivalent to relocation application, and should run at the
- // earliest possible time (i.e. with the highest priority).
- appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
- }
-
- IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
- GV->setConstant(false);
- IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlign());
- GV->setInitializer(Constant::getNullValue(GV->getValueType()));
-}
-
-void LowerTypeTestsModule::findGlobalVariableUsersOf(
- Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
- for (auto *U : C->users()){
- if (auto *GV = dyn_cast<GlobalVariable>(U))
- Out.insert(GV);
- else if (auto *C2 = dyn_cast<Constant>(U))
- findGlobalVariableUsersOf(C2, Out);
- }
-}
-
-// Replace all uses of F with (F ? JT : 0).
-void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
- Function *F, Constant *JT, bool IsJumpTableCanonical) {
- // The target expression can not appear in a constant initializer on most
- // (all?) targets. Switch to a runtime initializer.
- SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
- findGlobalVariableUsersOf(F, GlobalVarUsers);
- for (auto GV : GlobalVarUsers)
- moveInitializerToModuleConstructor(GV);
-
- // Can not RAUW F with an expression that uses F. Replace with a temporary
- // placeholder first.
- Function *PlaceholderFn =
- Function::Create(cast<FunctionType>(F->getValueType()),
- GlobalValue::ExternalWeakLinkage,
- F->getAddressSpace(), "", &M);
- replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical);
-
- Constant *Target = ConstantExpr::getSelect(
- ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
- Constant::getNullValue(F->getType())),
- JT, Constant::getNullValue(F->getType()));
- PlaceholderFn->replaceAllUsesWith(Target);
- PlaceholderFn->eraseFromParent();
-}
-
-static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) {
- Attribute TFAttr = F->getFnAttribute("target-features");
+ } else if (JumpTableArch == Triple::thumb) {
+ AsmOS << "b.w $" << ArgIndex << "\n";
+ } else {
+ report_fatal_error("Unsupported architecture for jump tables");
+ }
+
+ ConstraintOS << (ArgIndex > 0 ? ",s" : "s");
+ AsmArgs.push_back(Dest);
+}
+
+Type *LowerTypeTestsModule::getJumpTableEntryType() {
+ return ArrayType::get(Int8Ty, getJumpTableEntrySize());
+}
+
+/// Given a disjoint set of type identifiers and functions, build the bit sets
+/// and lower the llvm.type.test calls, architecture dependently.
+void LowerTypeTestsModule::buildBitSetsFromFunctions(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ if (Arch == Triple::x86 || Arch == Triple::x86_64 || Arch == Triple::arm ||
+ Arch == Triple::thumb || Arch == Triple::aarch64)
+ buildBitSetsFromFunctionsNative(TypeIds, Functions);
+ else if (Arch == Triple::wasm32 || Arch == Triple::wasm64)
+ buildBitSetsFromFunctionsWASM(TypeIds, Functions);
+ else
+ report_fatal_error("Unsupported architecture for jump tables");
+}
+
+void LowerTypeTestsModule::moveInitializerToModuleConstructor(
+ GlobalVariable *GV) {
+ if (WeakInitializerFn == nullptr) {
+ WeakInitializerFn = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::InternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ "__cfi_global_var_init", &M);
+ BasicBlock *BB =
+ BasicBlock::Create(M.getContext(), "entry", WeakInitializerFn);
+ ReturnInst::Create(M.getContext(), BB);
+ WeakInitializerFn->setSection(
+ ObjectFormat == Triple::MachO
+ ? "__TEXT,__StaticInit,regular,pure_instructions"
+ : ".text.startup");
+ // This code is equivalent to relocation application, and should run at the
+ // earliest possible time (i.e. with the highest priority).
+ appendToGlobalCtors(M, WeakInitializerFn, /* Priority */ 0);
+ }
+
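+  // Store the original initializer at startup; the static initializer itself
+  // is replaced with zero below, so the value is only produced at run time.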
+ IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
+ GV->setConstant(false);
+ IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlign());
+ GV->setInitializer(Constant::getNullValue(GV->getValueType()));
+}
+
+void LowerTypeTestsModule::findGlobalVariableUsersOf(
+ Constant *C, SmallSetVector<GlobalVariable *, 8> &Out) {
+  for (auto *U : C->users()) {
+ if (auto *GV = dyn_cast<GlobalVariable>(U))
+ Out.insert(GV);
+ else if (auto *C2 = dyn_cast<Constant>(U))
+ findGlobalVariableUsersOf(C2, Out);
+ }
+}
+
+// Replace all uses of F with (F ? JT : 0).
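+// F is an extern_weak declaration that may resolve to null at link time, so the
+// jump table address must only be substituted where F is actually defined.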
+void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr(
+ Function *F, Constant *JT, bool IsJumpTableCanonical) {
+ // The target expression can not appear in a constant initializer on most
+ // (all?) targets. Switch to a runtime initializer.
+ SmallSetVector<GlobalVariable *, 8> GlobalVarUsers;
+ findGlobalVariableUsersOf(F, GlobalVarUsers);
+ for (auto GV : GlobalVarUsers)
+ moveInitializerToModuleConstructor(GV);
+
+ // Can not RAUW F with an expression that uses F. Replace with a temporary
+ // placeholder first.
+ Function *PlaceholderFn =
+ Function::Create(cast<FunctionType>(F->getValueType()),
+ GlobalValue::ExternalWeakLinkage,
+ F->getAddressSpace(), "", &M);
+ replaceCfiUses(F, PlaceholderFn, IsJumpTableCanonical);
+
+ Constant *Target = ConstantExpr::getSelect(
+ ConstantExpr::getICmp(CmpInst::ICMP_NE, F,
+ Constant::getNullValue(F->getType())),
+ JT, Constant::getNullValue(F->getType()));
+ PlaceholderFn->replaceAllUsesWith(Target);
+ PlaceholderFn->eraseFromParent();
+}
+
+static bool isThumbFunction(Function *F, Triple::ArchType ModuleArch) {
+ Attribute TFAttr = F->getFnAttribute("target-features");
if (TFAttr.isValid()) {
- SmallVector<StringRef, 6> Features;
- TFAttr.getValueAsString().split(Features, ',');
- for (StringRef Feature : Features) {
- if (Feature == "-thumb-mode")
- return false;
- else if (Feature == "+thumb-mode")
- return true;
- }
- }
-
- return ModuleArch == Triple::thumb;
-}
-
-// Each jump table must be either ARM or Thumb as a whole for the bit-test math
-// to work. Pick one that matches the majority of members to minimize interop
-// veneers inserted by the linker.
-static Triple::ArchType
-selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions,
- Triple::ArchType ModuleArch) {
- if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb)
- return ModuleArch;
-
- unsigned ArmCount = 0, ThumbCount = 0;
- for (const auto GTM : Functions) {
- if (!GTM->isJumpTableCanonical()) {
- // PLT stubs are always ARM.
- // FIXME: This is the wrong heuristic for non-canonical jump tables.
- ++ArmCount;
- continue;
- }
-
- Function *F = cast<Function>(GTM->getGlobal());
- ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount);
- }
-
- return ArmCount > ThumbCount ? Triple::arm : Triple::thumb;
-}
-
-void LowerTypeTestsModule::createJumpTable(
- Function *F, ArrayRef<GlobalTypeMember *> Functions) {
- std::string AsmStr, ConstraintStr;
- raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
- SmallVector<Value *, 16> AsmArgs;
- AsmArgs.reserve(Functions.size() * 2);
-
- Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch);
-
- for (unsigned I = 0; I != Functions.size(); ++I)
- createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
- cast<Function>(Functions[I]->getGlobal()));
-
- // Align the whole table by entry size.
- F->setAlignment(Align(getJumpTableEntrySize()));
- // Skip prologue.
- // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
- // Luckily, this function does not get any prologue even without the
- // attribute.
- if (OS != Triple::Win32)
- F->addFnAttr(Attribute::Naked);
- if (JumpTableArch == Triple::arm)
- F->addFnAttr("target-features", "-thumb-mode");
- if (JumpTableArch == Triple::thumb) {
- F->addFnAttr("target-features", "+thumb-mode");
- // Thumb jump table assembly needs Thumb2. The following attribute is added
- // by Clang for -march=armv7.
- F->addFnAttr("target-cpu", "cortex-a8");
- }
+ SmallVector<StringRef, 6> Features;
+ TFAttr.getValueAsString().split(Features, ',');
+ for (StringRef Feature : Features) {
+ if (Feature == "-thumb-mode")
+ return false;
+ else if (Feature == "+thumb-mode")
+ return true;
+ }
+ }
+
+ return ModuleArch == Triple::thumb;
+}
+
+// Each jump table must be either ARM or Thumb as a whole for the bit-test math
+// to work. Pick one that matches the majority of members to minimize interop
+// veneers inserted by the linker.
+static Triple::ArchType
+selectJumpTableArmEncoding(ArrayRef<GlobalTypeMember *> Functions,
+ Triple::ArchType ModuleArch) {
+ if (ModuleArch != Triple::arm && ModuleArch != Triple::thumb)
+ return ModuleArch;
+
+ unsigned ArmCount = 0, ThumbCount = 0;
+ for (const auto GTM : Functions) {
+ if (!GTM->isJumpTableCanonical()) {
+ // PLT stubs are always ARM.
+ // FIXME: This is the wrong heuristic for non-canonical jump tables.
+ ++ArmCount;
+ continue;
+ }
+
+ Function *F = cast<Function>(GTM->getGlobal());
+ ++(isThumbFunction(F, ModuleArch) ? ThumbCount : ArmCount);
+ }
+
+ return ArmCount > ThumbCount ? Triple::arm : Triple::thumb;
+}
+
+void LowerTypeTestsModule::createJumpTable(
+ Function *F, ArrayRef<GlobalTypeMember *> Functions) {
+ std::string AsmStr, ConstraintStr;
+ raw_string_ostream AsmOS(AsmStr), ConstraintOS(ConstraintStr);
+ SmallVector<Value *, 16> AsmArgs;
+ AsmArgs.reserve(Functions.size() * 2);
+
+ Triple::ArchType JumpTableArch = selectJumpTableArmEncoding(Functions, Arch);
+
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ createJumpTableEntry(AsmOS, ConstraintOS, JumpTableArch, AsmArgs,
+ cast<Function>(Functions[I]->getGlobal()));
+
+ // Align the whole table by entry size.
+ F->setAlignment(Align(getJumpTableEntrySize()));
+ // Skip prologue.
+ // Disabled on win32 due to https://llvm.org/bugs/show_bug.cgi?id=28641#c3.
+ // Luckily, this function does not get any prologue even without the
+ // attribute.
+ if (OS != Triple::Win32)
+ F->addFnAttr(Attribute::Naked);
+ if (JumpTableArch == Triple::arm)
+ F->addFnAttr("target-features", "-thumb-mode");
+ if (JumpTableArch == Triple::thumb) {
+ F->addFnAttr("target-features", "+thumb-mode");
+ // Thumb jump table assembly needs Thumb2. The following attribute is added
+ // by Clang for -march=armv7.
+ F->addFnAttr("target-cpu", "cortex-a8");
+ }
if (JumpTableArch == Triple::aarch64) {
F->addFnAttr("branch-target-enforcement", "false");
F->addFnAttr("sign-return-address", "none");
}
- // Make sure we don't emit .eh_frame for this function.
- F->addFnAttr(Attribute::NoUnwind);
-
- BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
- IRBuilder<> IRB(BB);
-
- SmallVector<Type *, 16> ArgTypes;
- ArgTypes.reserve(AsmArgs.size());
- for (const auto &Arg : AsmArgs)
- ArgTypes.push_back(Arg->getType());
- InlineAsm *JumpTableAsm =
- InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
- AsmOS.str(), ConstraintOS.str(),
- /*hasSideEffects=*/true);
-
- IRB.CreateCall(JumpTableAsm, AsmArgs);
- IRB.CreateUnreachable();
-}
-
-/// Given a disjoint set of type identifiers and functions, build a jump table
-/// for the functions, build the bit sets and lower the llvm.type.test calls.
-void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- // Unlike the global bitset builder, the function bitset builder cannot
- // re-arrange functions in a particular order and base its calculations on the
- // layout of the functions' entry points, as we have no idea how large a
- // particular function will end up being (the size could even depend on what
- // this pass does!) Instead, we build a jump table, which is a block of code
- // consisting of one branch instruction for each of the functions in the bit
- // set that branches to the target function, and redirect any taken function
- // addresses to the corresponding jump table entry. In the object file's
- // symbol table, the symbols for the target functions also refer to the jump
- // table entries, so that addresses taken outside the module will pass any
- // verification done inside the module.
- //
- // In more concrete terms, suppose we have three functions f, g, h which are
- // of the same type, and a function foo that returns their addresses:
- //
- // f:
- // mov 0, %eax
- // ret
- //
- // g:
- // mov 1, %eax
- // ret
- //
- // h:
- // mov 2, %eax
- // ret
- //
- // foo:
- // mov f, %eax
- // mov g, %edx
- // mov h, %ecx
- // ret
- //
- // We output the jump table as module-level inline asm string. The end result
- // will (conceptually) look like this:
- //
- // f = .cfi.jumptable
- // g = .cfi.jumptable + 4
- // h = .cfi.jumptable + 8
- // .cfi.jumptable:
- // jmp f.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- // jmp g.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- // jmp h.cfi ; 5 bytes
- // int3 ; 1 byte
- // int3 ; 1 byte
- // int3 ; 1 byte
- //
- // f.cfi:
- // mov 0, %eax
- // ret
- //
- // g.cfi:
- // mov 1, %eax
- // ret
- //
- // h.cfi:
- // mov 2, %eax
- // ret
- //
- // foo:
- // mov f, %eax
- // mov g, %edx
- // mov h, %ecx
- // ret
- //
- // Because the addresses of f, g, h are evenly spaced at a power of 2, in the
- // normal case the check can be carried out using the same kind of simple
- // arithmetic that we normally use for globals.
-
- // FIXME: find a better way to represent the jumptable in the IR.
- assert(!Functions.empty());
-
- // Build a simple layout based on the regular layout of jump tables.
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
- unsigned EntrySize = getJumpTableEntrySize();
- for (unsigned I = 0; I != Functions.size(); ++I)
- GlobalLayout[Functions[I]] = I * EntrySize;
-
- Function *JumpTableFn =
- Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
- /* IsVarArg */ false),
- GlobalValue::PrivateLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- ".cfi.jumptable", &M);
- ArrayType *JumpTableType =
- ArrayType::get(getJumpTableEntryType(), Functions.size());
- auto JumpTable =
- ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
-
- lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
-
- {
- ScopedSaveAliaseesAndUsed S(M);
-
- // Build aliases pointing to offsets into the jump table, and replace
- // references to the original functions with references to the aliases.
- for (unsigned I = 0; I != Functions.size(); ++I) {
- Function *F = cast<Function>(Functions[I]->getGlobal());
- bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical();
-
- Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
- ConstantExpr::getInBoundsGetElementPtr(
- JumpTableType, JumpTable,
- ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
- ConstantInt::get(IntPtrTy, I)}),
- F->getType());
- if (Functions[I]->isExported()) {
- if (IsJumpTableCanonical) {
- ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
- } else {
- GlobalAlias *JtAlias = GlobalAlias::create(
- F->getValueType(), 0, GlobalValue::ExternalLinkage,
- F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
- JtAlias->setVisibility(GlobalValue::HiddenVisibility);
- ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
- }
- }
- if (!IsJumpTableCanonical) {
- if (F->hasExternalWeakLinkage())
- replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr,
- IsJumpTableCanonical);
- else
- replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical);
- } else {
- assert(F->getType()->getAddressSpace() == 0);
-
- GlobalAlias *FAlias =
- GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "",
- CombinedGlobalElemPtr, &M);
- FAlias->setVisibility(F->getVisibility());
- FAlias->takeName(F);
- if (FAlias->hasName())
- F->setName(FAlias->getName() + ".cfi");
- replaceCfiUses(F, FAlias, IsJumpTableCanonical);
- if (!F->hasLocalLinkage())
- F->setVisibility(GlobalVariable::HiddenVisibility);
- }
- }
- }
-
- createJumpTable(JumpTableFn, Functions);
-}
-
-/// Assign a dummy layout using an incrementing counter, tag each function
-/// with its index represented as metadata, and lower each type test to an
-/// integer range comparison. During generation of the indirect function call
-/// table in the backend, it will assign the given indexes.
-/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
-/// been finalized.
-void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
- assert(!Functions.empty());
-
- // Build consecutive monotonic integer ranges for each call target set
- DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
-
- for (GlobalTypeMember *GTM : Functions) {
- Function *F = cast<Function>(GTM->getGlobal());
-
- // Skip functions that are not address taken, to avoid bloating the table
- if (!F->hasAddressTaken())
- continue;
-
- // Store metadata with the index for each function
- MDNode *MD = MDNode::get(F->getContext(),
- ArrayRef<Metadata *>(ConstantAsMetadata::get(
- ConstantInt::get(Int64Ty, IndirectIndex))));
- F->setMetadata("wasm.index", MD);
-
- // Assign the counter value
- GlobalLayout[GTM] = IndirectIndex++;
- }
-
- // The indirect function table index space starts at zero, so pass a NULL
- // pointer as the subtracted "jump table" offset.
- lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
- GlobalLayout);
-}
-
-void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
- ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
- ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
- DenseMap<Metadata *, uint64_t> TypeIdIndices;
- for (unsigned I = 0; I != TypeIds.size(); ++I)
- TypeIdIndices[TypeIds[I]] = I;
-
- // For each type identifier, build a set of indices that refer to members of
- // the type identifier.
- std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
- unsigned GlobalIndex = 0;
- DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
- for (GlobalTypeMember *GTM : Globals) {
- for (MDNode *Type : GTM->types()) {
- // Type = { offset, type identifier }
- auto I = TypeIdIndices.find(Type->getOperand(1));
- if (I != TypeIdIndices.end())
- TypeMembers[I->second].insert(GlobalIndex);
- }
- GlobalIndices[GTM] = GlobalIndex;
- GlobalIndex++;
- }
-
- for (ICallBranchFunnel *JT : ICallBranchFunnels) {
- TypeMembers.emplace_back();
- std::set<uint64_t> &TMSet = TypeMembers.back();
- for (GlobalTypeMember *T : JT->targets())
- TMSet.insert(GlobalIndices[T]);
- }
-
- // Order the sets of indices by size. The GlobalLayoutBuilder works best
- // when given small index sets first.
- llvm::stable_sort(TypeMembers, [](const std::set<uint64_t> &O1,
- const std::set<uint64_t> &O2) {
- return O1.size() < O2.size();
- });
-
- // Create a GlobalLayoutBuilder and provide it with index sets as layout
- // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
- // close together as possible.
- GlobalLayoutBuilder GLB(Globals.size());
- for (auto &&MemSet : TypeMembers)
- GLB.addFragment(MemSet);
-
- // Build a vector of globals with the computed layout.
- bool IsGlobalSet =
- Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal());
- std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size());
- auto OGTMI = OrderedGTMs.begin();
- for (auto &&F : GLB.Fragments) {
- for (auto &&Offset : F) {
- if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal()))
- report_fatal_error("Type identifier may not contain both global "
- "variables and functions");
- *OGTMI++ = Globals[Offset];
- }
- }
-
- // Build the bitsets from this disjoint set.
- if (IsGlobalSet)
- buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs);
- else
- buildBitSetsFromFunctions(TypeIds, OrderedGTMs);
-}
-
-/// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(
- Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
- : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary),
- DropTypeTests(DropTypeTests) {
- assert(!(ExportSummary && ImportSummary));
- Triple TargetTriple(M.getTargetTriple());
- Arch = TargetTriple.getArch();
- OS = TargetTriple.getOS();
- ObjectFormat = TargetTriple.getObjectFormat();
-}
-
-bool LowerTypeTestsModule::runForTesting(Module &M) {
- ModuleSummaryIndex Summary(/*HaveGVs=*/false);
-
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
-
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> Summary;
- ExitOnErr(errorCodeToError(In.error()));
- }
-
- bool Changed =
- LowerTypeTestsModule(
- M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr,
- /*DropTypeTests*/ false)
- .lower();
-
- if (!ClWriteSummary.empty()) {
- ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
- ": ");
- std::error_code EC;
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
- ExitOnErr(errorCodeToError(EC));
-
- yaml::Output Out(OS);
- Out << Summary;
- }
-
- return Changed;
-}
-
-static bool isDirectCall(Use& U) {
- auto *Usr = dyn_cast<CallInst>(U.getUser());
- if (Usr) {
- auto *CB = dyn_cast<CallBase>(Usr);
- if (CB && CB->isCallee(&U))
- return true;
- }
- return false;
-}
-
-void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
- bool IsJumpTableCanonical) {
- SmallSetVector<Constant *, 4> Constants;
- auto UI = Old->use_begin(), E = Old->use_end();
- for (; UI != E;) {
- Use &U = *UI;
- ++UI;
-
- // Skip block addresses
- if (isa<BlockAddress>(U.getUser()))
- continue;
-
- // Skip direct calls to externally defined or non-dso_local functions
- if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical))
- continue;
-
- // Must handle Constants specially, we cannot call replaceUsesOfWith on a
- // constant because they are uniqued.
- if (auto *C = dyn_cast<Constant>(U.getUser())) {
- if (!isa<GlobalValue>(C)) {
- // Save unique users to avoid processing operand replacement
- // more than once.
- Constants.insert(C);
- continue;
- }
- }
-
- U.set(New);
- }
-
- // Process operand replacement of saved constants.
- for (auto *C : Constants)
- C->handleOperandChange(Old, New);
-}
-
-void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
- Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); });
-}
-
-bool LowerTypeTestsModule::lower() {
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
-
- if (DropTypeTests && TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- // Find and erase llvm.assume intrinsics for this llvm.type.test call.
- for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) {
- if (auto *AssumeCI = dyn_cast<CallInst>((*CIU++).getUser())) {
- Function *F = AssumeCI->getCalledFunction();
- if (F && F->getIntrinsicID() == Intrinsic::assume)
- AssumeCI->eraseFromParent();
- }
- }
- CI->eraseFromParent();
- }
-
- // We have deleted the type intrinsics, so we no longer have enough
- // information to reason about the liveness of virtual function pointers
- // in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- return true;
- }
-
- // If only some of the modules were split, we cannot correctly perform
- // this transformation. We already checked for the presense of type tests
- // with partially split modules during the thin link, and would have emitted
- // an error if any were found, so here we can simply return.
- if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
- (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
- return false;
-
- Function *ICallBranchFunnelFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
- if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
- (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
- !ExportSummary && !ImportSummary)
- return false;
-
- if (ImportSummary) {
- if (TypeTestFunc) {
- for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
- UI != UE;) {
- auto *CI = cast<CallInst>((*UI++).getUser());
- importTypeTest(CI);
- }
- }
-
- if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
- report_fatal_error(
- "unexpected call to llvm.icall.branch.funnel during import phase");
-
- SmallVector<Function *, 8> Defs;
- SmallVector<Function *, 8> Decls;
- for (auto &F : M) {
- // CFI functions are either external, or promoted. A local function may
- // have the same name, but it's not the one we are looking for.
- if (F.hasLocalLinkage())
- continue;
- if (ImportSummary->cfiFunctionDefs().count(std::string(F.getName())))
- Defs.push_back(&F);
- else if (ImportSummary->cfiFunctionDecls().count(
- std::string(F.getName())))
- Decls.push_back(&F);
- }
-
- std::vector<GlobalAlias *> AliasesToErase;
- {
- ScopedSaveAliaseesAndUsed S(M);
- for (auto F : Defs)
- importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase);
- for (auto F : Decls)
- importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase);
- }
- for (GlobalAlias *GA : AliasesToErase)
- GA->eraseFromParent();
-
- return true;
- }
-
- // Equivalence class set containing type identifiers and the globals that
- // reference them. This is used to partition the set of type identifiers in
- // the module into disjoint sets.
- using GlobalClassesTy = EquivalenceClasses<
- PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
- GlobalClassesTy GlobalClasses;
-
- // Verify the type metadata and build a few data structures to let us
- // efficiently enumerate the type identifiers associated with a global:
- // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
- // of associated type metadata) and a mapping from type identifiers to their
- // list of GlobalTypeMembers and last observed index in the list of globals.
- // The indices will be used later to deterministically order the list of type
- // identifiers.
- BumpPtrAllocator Alloc;
- struct TIInfo {
- unsigned UniqueId;
- std::vector<GlobalTypeMember *> RefGlobals;
- };
- DenseMap<Metadata *, TIInfo> TypeIdInfo;
- unsigned CurUniqueId = 0;
- SmallVector<MDNode *, 2> Types;
-
- // Cross-DSO CFI emits jumptable entries for exported functions as well as
- // address taken functions in case they are address taken in other modules.
- const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
-
- struct ExportedFunctionInfo {
- CfiFunctionLinkage Linkage;
- MDNode *FuncMD; // {name, linkage, type[, type...]}
- };
- DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
- if (ExportSummary) {
- // A set of all functions that are address taken by a live global object.
- DenseSet<GlobalValue::GUID> AddressTaken;
- for (auto &I : *ExportSummary)
- for (auto &GVS : I.second.SummaryList)
- if (GVS->isLive())
- for (auto &Ref : GVS->refs())
- AddressTaken.insert(Ref.getGUID());
-
- NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
- if (CfiFunctionsMD) {
- for (auto FuncMD : CfiFunctionsMD->operands()) {
- assert(FuncMD->getNumOperands() >= 2);
- StringRef FunctionName =
- cast<MDString>(FuncMD->getOperand(0))->getString();
- CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
- cast<ConstantAsMetadata>(FuncMD->getOperand(1))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
- const GlobalValue::GUID GUID = GlobalValue::getGUID(
- GlobalValue::dropLLVMManglingEscape(FunctionName));
- // Do not emit jumptable entries for functions that are not-live and
- // have no live references (and are not exported with cross-DSO CFI.)
- if (!ExportSummary->isGUIDLive(GUID))
- continue;
- if (!AddressTaken.count(GUID)) {
- if (!CrossDsoCfi || Linkage != CFL_Definition)
- continue;
-
- bool Exported = false;
- if (auto VI = ExportSummary->getValueInfo(GUID))
- for (auto &GVS : VI.getSummaryList())
- if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
- Exported = true;
-
- if (!Exported)
- continue;
- }
- auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
- if (!P.second && P.first->second.Linkage != CFL_Definition)
- P.first->second = {Linkage, FuncMD};
- }
-
- for (const auto &P : ExportedFunctions) {
- StringRef FunctionName = P.first;
- CfiFunctionLinkage Linkage = P.second.Linkage;
- MDNode *FuncMD = P.second.FuncMD;
- Function *F = M.getFunction(FunctionName);
- if (F && F->hasLocalLinkage()) {
- // Locally defined function that happens to have the same name as a
- // function defined in a ThinLTO module. Rename it to move it out of
- // the way of the external reference that we're about to create.
- // Note that setName will find a unique name for the function, so even
- // if there is an existing function with the suffix there won't be a
- // name collision.
- F->setName(F->getName() + ".1");
- F = nullptr;
- }
-
- if (!F)
- F = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalVariable::ExternalLinkage,
- M.getDataLayout().getProgramAddressSpace(), FunctionName, &M);
-
- // If the function is available_externally, remove its definition so
- // that it is handled the same way as a declaration. Later we will try
- // to create an alias using this function's linkage, which will fail if
- // the linkage is available_externally. This will also result in us
- // following the code path below to replace the type metadata.
- if (F->hasAvailableExternallyLinkage()) {
- F->setLinkage(GlobalValue::ExternalLinkage);
- F->deleteBody();
- F->setComdat(nullptr);
- F->clearMetadata();
- }
-
- // Update the linkage for extern_weak declarations when a definition
- // exists.
- if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
- F->setLinkage(GlobalValue::ExternalLinkage);
-
- // If the function in the full LTO module is a declaration, replace its
- // type metadata with the type metadata we found in cfi.functions. That
- // metadata is presumed to be more accurate than the metadata attached
- // to the declaration.
- if (F->isDeclaration()) {
- if (Linkage == CFL_WeakDeclaration)
- F->setLinkage(GlobalValue::ExternalWeakLinkage);
-
- F->eraseMetadata(LLVMContext::MD_type);
- for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
- F->addMetadata(LLVMContext::MD_type,
- *cast<MDNode>(FuncMD->getOperand(I).get()));
- }
- }
- }
- }
-
- DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
- for (GlobalObject &GO : M.global_objects()) {
- if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
- continue;
-
- Types.clear();
- GO.getMetadata(LLVMContext::MD_type, Types);
-
- bool IsJumpTableCanonical = false;
- bool IsExported = false;
- if (Function *F = dyn_cast<Function>(&GO)) {
- IsJumpTableCanonical = isJumpTableCanonical(F);
- if (ExportedFunctions.count(F->getName())) {
- IsJumpTableCanonical |=
- ExportedFunctions[F->getName()].Linkage == CFL_Definition;
- IsExported = true;
- // TODO: The logic here checks only that the function is address taken,
- // not that the address takers are live. This can be updated to check
- // their liveness and emit fewer jumptable entries once monolithic LTO
- // builds also emit summaries.
- } else if (!F->hasAddressTaken()) {
- if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage())
- continue;
- }
- }
-
- auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical,
- IsExported, Types);
- GlobalTypeMembers[&GO] = GTM;
- for (MDNode *Type : Types) {
- verifyTypeMDNode(&GO, Type);
- auto &Info = TypeIdInfo[Type->getOperand(1)];
- Info.UniqueId = ++CurUniqueId;
- Info.RefGlobals.push_back(GTM);
- }
- }
-
- auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
- // Add the call site to the list of call sites for this type identifier. We
- // also use TypeIdUsers to keep track of whether we have seen this type
- // identifier before. If we have, we don't need to re-add the referenced
- // globals to the equivalence class.
- auto Ins = TypeIdUsers.insert({TypeId, {}});
- if (Ins.second) {
- // Add the type identifier to the equivalence class.
- GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
- GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
-
- // Add the referenced globals to the type identifier's equivalence class.
- for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
- CurSet = GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
- }
-
- return Ins.first->second;
- };
-
- if (TypeTestFunc) {
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
-
- auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
- if (!TypeIdMDVal)
- report_fatal_error("Second argument of llvm.type.test must be metadata");
- auto TypeId = TypeIdMDVal->getMetadata();
- AddTypeIdUse(TypeId).CallSites.push_back(CI);
- }
- }
-
- if (ICallBranchFunnelFunc) {
- for (const Use &U : ICallBranchFunnelFunc->uses()) {
- if (Arch != Triple::x86_64)
- report_fatal_error(
- "llvm.icall.branch.funnel not supported on this target");
-
- auto CI = cast<CallInst>(U.getUser());
-
- std::vector<GlobalTypeMember *> Targets;
- if (CI->getNumArgOperands() % 2 != 1)
- report_fatal_error("number of arguments should be odd");
-
- GlobalClassesTy::member_iterator CurSet;
- for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
- int64_t Offset;
- auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
- CI->getOperand(I), Offset, M.getDataLayout()));
- if (!Base)
- report_fatal_error(
- "Expected branch funnel operand to be global value");
-
- GlobalTypeMember *GTM = GlobalTypeMembers[Base];
- Targets.push_back(GTM);
- GlobalClassesTy::member_iterator NewSet =
- GlobalClasses.findLeader(GlobalClasses.insert(GTM));
- if (I == 1)
- CurSet = NewSet;
- else
- CurSet = GlobalClasses.unionSets(CurSet, NewSet);
- }
-
- GlobalClasses.unionSets(
- CurSet, GlobalClasses.findLeader(
- GlobalClasses.insert(ICallBranchFunnel::create(
- Alloc, CI, Targets, ++CurUniqueId))));
- }
- }
-
- if (ExportSummary) {
- DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
- for (auto &P : TypeIdInfo) {
- if (auto *TypeId = dyn_cast<MDString>(P.first))
- MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
- TypeId);
- }
-
- for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- if (!ExportSummary->isGlobalValueLive(S.get()))
- continue;
- if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
- for (GlobalValue::GUID G : FS->type_tests())
- for (Metadata *MD : MetadataByGUID[G])
- AddTypeIdUse(MD).IsExported = true;
- }
- }
- }
-
- if (GlobalClasses.empty())
- return false;
-
- // Build a list of disjoint sets ordered by their maximum global index for
- // determinism.
- std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
- for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
- E = GlobalClasses.end();
- I != E; ++I) {
- if (!I->isLeader())
- continue;
- ++NumTypeIdDisjointSets;
-
- unsigned MaxUniqueId = 0;
- for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
- MI != GlobalClasses.member_end(); ++MI) {
- if (auto *MD = MI->dyn_cast<Metadata *>())
- MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
- else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
- MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
- }
- Sets.emplace_back(I, MaxUniqueId);
- }
- llvm::sort(Sets,
- [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
- const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
- return S1.second < S2.second;
- });
-
- // For each disjoint set we found...
- for (const auto &S : Sets) {
- // Build the list of type identifiers in this disjoint set.
- std::vector<Metadata *> TypeIds;
- std::vector<GlobalTypeMember *> Globals;
- std::vector<ICallBranchFunnel *> ICallBranchFunnels;
- for (GlobalClassesTy::member_iterator MI =
- GlobalClasses.member_begin(S.first);
- MI != GlobalClasses.member_end(); ++MI) {
- if (MI->is<Metadata *>())
- TypeIds.push_back(MI->get<Metadata *>());
- else if (MI->is<GlobalTypeMember *>())
- Globals.push_back(MI->get<GlobalTypeMember *>());
- else
- ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
- }
-
- // Order type identifiers by unique ID for determinism. This ordering is
- // stable as there is a one-to-one mapping between metadata and unique IDs.
- llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) {
- return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
- });
-
- // Same for the branch funnels.
- llvm::sort(ICallBranchFunnels,
- [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
- return F1->UniqueId < F2->UniqueId;
- });
-
- // Build bitsets for this disjoint set.
- buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
- }
-
- allocateByteArrays();
-
- // Parse alias data to replace stand-in function declarations for aliases
- // with an alias to the intended target.
- if (ExportSummary) {
- if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
- for (auto AliasMD : AliasesMD->operands()) {
- assert(AliasMD->getNumOperands() >= 4);
- StringRef AliasName =
- cast<MDString>(AliasMD->getOperand(0))->getString();
- StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
-
- if (!ExportedFunctions.count(Aliasee) ||
- ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
- !M.getNamedAlias(Aliasee))
- continue;
-
- GlobalValue::VisibilityTypes Visibility =
- static_cast<GlobalValue::VisibilityTypes>(
- cast<ConstantAsMetadata>(AliasMD->getOperand(2))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
- bool Weak =
- static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
- ->getValue()
- ->getUniqueInteger()
- .getZExtValue());
-
- auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
- Alias->setVisibility(Visibility);
- if (Weak)
- Alias->setLinkage(GlobalValue::WeakAnyLinkage);
-
- if (auto *F = M.getFunction(AliasName)) {
- Alias->takeName(F);
- F->replaceAllUsesWith(Alias);
- F->eraseFromParent();
- } else {
- Alias->setName(AliasName);
- }
- }
- }
- }
-
- // Emit .symver directives for exported functions, if they exist.
- if (ExportSummary) {
- if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
- for (auto Symver : SymversMD->operands()) {
- assert(Symver->getNumOperands() >= 2);
- StringRef SymbolName =
- cast<MDString>(Symver->getOperand(0))->getString();
- StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
-
- if (!ExportedFunctions.count(SymbolName))
- continue;
-
- M.appendModuleInlineAsm(
- (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
- }
- }
- }
-
- return true;
-}
-
-PreservedAnalyses LowerTypeTestsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
+ // Make sure we don't emit .eh_frame for this function.
+ F->addFnAttr(Attribute::NoUnwind);
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "entry", F);
+ IRBuilder<> IRB(BB);
+
+ SmallVector<Type *, 16> ArgTypes;
+ ArgTypes.reserve(AsmArgs.size());
+ for (const auto &Arg : AsmArgs)
+ ArgTypes.push_back(Arg->getType());
+ InlineAsm *JumpTableAsm =
+ InlineAsm::get(FunctionType::get(IRB.getVoidTy(), ArgTypes, false),
+ AsmOS.str(), ConstraintOS.str(),
+ /*hasSideEffects=*/true);
+
+ IRB.CreateCall(JumpTableAsm, AsmArgs);
+ IRB.CreateUnreachable();
+}
+
+/// Given a disjoint set of type identifiers and functions, build a jump table
+/// for the functions, build the bit sets and lower the llvm.type.test calls.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ // Unlike the global bitset builder, the function bitset builder cannot
+ // re-arrange functions in a particular order and base its calculations on the
+ // layout of the functions' entry points, as we have no idea how large a
+ // particular function will end up being (the size could even depend on what
+ // this pass does!) Instead, we build a jump table, which is a block of code
+ // consisting of one branch instruction for each of the functions in the bit
+ // set that branches to the target function, and redirect any taken function
+ // addresses to the corresponding jump table entry. In the object file's
+ // symbol table, the symbols for the target functions also refer to the jump
+ // table entries, so that addresses taken outside the module will pass any
+ // verification done inside the module.
+ //
+ // In more concrete terms, suppose we have three functions f, g, h which are
+ // of the same type, and a function foo that returns their addresses:
+ //
+ // f:
+ // mov 0, %eax
+ // ret
+ //
+ // g:
+ // mov 1, %eax
+ // ret
+ //
+ // h:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+  // We output the jump table as a module-level inline asm string. The end result
+ // will (conceptually) look like this:
+ //
+ // f = .cfi.jumptable
+ // g = .cfi.jumptable + 4
+ // h = .cfi.jumptable + 8
+ // .cfi.jumptable:
+ // jmp f.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp g.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // jmp h.cfi ; 5 bytes
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ // int3 ; 1 byte
+ //
+ // f.cfi:
+ // mov 0, %eax
+ // ret
+ //
+ // g.cfi:
+ // mov 1, %eax
+ // ret
+ //
+ // h.cfi:
+ // mov 2, %eax
+ // ret
+ //
+ // foo:
+ // mov f, %eax
+ // mov g, %edx
+ // mov h, %ecx
+ // ret
+ //
+ // Because the addresses of f, g, h are evenly spaced at a power of 2, in the
+ // normal case the check can be carried out using the same kind of simple
+ // arithmetic that we normally use for globals.
+
+ // FIXME: find a better way to represent the jumptable in the IR.
+ assert(!Functions.empty());
+
+ // Build a simple layout based on the regular layout of jump tables.
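+  // Function I is placed at byte offset I * EntrySize within the jump table.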
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+ unsigned EntrySize = getJumpTableEntrySize();
+ for (unsigned I = 0; I != Functions.size(); ++I)
+ GlobalLayout[Functions[I]] = I * EntrySize;
+
+ Function *JumpTableFn =
+ Function::Create(FunctionType::get(Type::getVoidTy(M.getContext()),
+ /* IsVarArg */ false),
+ GlobalValue::PrivateLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ ".cfi.jumptable", &M);
+ ArrayType *JumpTableType =
+ ArrayType::get(getJumpTableEntryType(), Functions.size());
+ auto JumpTable =
+ ConstantExpr::getPointerCast(JumpTableFn, JumpTableType->getPointerTo(0));
+
+ lowerTypeTestCalls(TypeIds, JumpTable, GlobalLayout);
+
+ {
+ ScopedSaveAliaseesAndUsed S(M);
+
+ // Build aliases pointing to offsets into the jump table, and replace
+ // references to the original functions with references to the aliases.
+ for (unsigned I = 0; I != Functions.size(); ++I) {
+ Function *F = cast<Function>(Functions[I]->getGlobal());
+ bool IsJumpTableCanonical = Functions[I]->isJumpTableCanonical();
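+      // Address of this function's slot in the jump table, cast back to the
+      // original function pointer type.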
+
+ Constant *CombinedGlobalElemPtr = ConstantExpr::getBitCast(
+ ConstantExpr::getInBoundsGetElementPtr(
+ JumpTableType, JumpTable,
+ ArrayRef<Constant *>{ConstantInt::get(IntPtrTy, 0),
+ ConstantInt::get(IntPtrTy, I)}),
+ F->getType());
+ if (Functions[I]->isExported()) {
+ if (IsJumpTableCanonical) {
+ ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
+ } else {
+ GlobalAlias *JtAlias = GlobalAlias::create(
+ F->getValueType(), 0, GlobalValue::ExternalLinkage,
+ F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
+ JtAlias->setVisibility(GlobalValue::HiddenVisibility);
+ ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
+ }
+ }
+ if (!IsJumpTableCanonical) {
+ if (F->hasExternalWeakLinkage())
+ replaceWeakDeclarationWithJumpTablePtr(F, CombinedGlobalElemPtr,
+ IsJumpTableCanonical);
+ else
+ replaceCfiUses(F, CombinedGlobalElemPtr, IsJumpTableCanonical);
+ } else {
+ assert(F->getType()->getAddressSpace() == 0);
+
+ GlobalAlias *FAlias =
+ GlobalAlias::create(F->getValueType(), 0, F->getLinkage(), "",
+ CombinedGlobalElemPtr, &M);
+ FAlias->setVisibility(F->getVisibility());
+ FAlias->takeName(F);
+ if (FAlias->hasName())
+ F->setName(FAlias->getName() + ".cfi");
+ replaceCfiUses(F, FAlias, IsJumpTableCanonical);
+ if (!F->hasLocalLinkage())
+ F->setVisibility(GlobalVariable::HiddenVisibility);
+ }
+ }
+ }
+
+ createJumpTable(JumpTableFn, Functions);
+}
+
+/// Assign a dummy layout using an incrementing counter, tag each function
+/// with its index represented as metadata, and lower each type test to an
+/// integer range comparison. During generation of the indirect function call
+/// table in the backend, it will assign the given indexes.
+/// Note: Dynamic linking is not supported, as the WebAssembly ABI has not yet
+/// been finalized.
+void LowerTypeTestsModule::buildBitSetsFromFunctionsWASM(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Functions) {
+ assert(!Functions.empty());
+
+ // Build consecutive monotonic integer ranges for each call target set
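+  // Because each type's members receive consecutive indices, every type test
+  // can be lowered to a simple integer range comparison (see function comment).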
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalLayout;
+
+ for (GlobalTypeMember *GTM : Functions) {
+ Function *F = cast<Function>(GTM->getGlobal());
+
+ // Skip functions that are not address taken, to avoid bloating the table
+ if (!F->hasAddressTaken())
+ continue;
+
+ // Store metadata with the index for each function
+ MDNode *MD = MDNode::get(F->getContext(),
+ ArrayRef<Metadata *>(ConstantAsMetadata::get(
+ ConstantInt::get(Int64Ty, IndirectIndex))));
+ F->setMetadata("wasm.index", MD);
+
+ // Assign the counter value
+ GlobalLayout[GTM] = IndirectIndex++;
+ }
+
+ // The indirect function table index space starts at zero, so pass a NULL
+ // pointer as the subtracted "jump table" offset.
+ lowerTypeTestCalls(TypeIds, ConstantPointerNull::get(Int32PtrTy),
+ GlobalLayout);
+}
+
+void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
+ ArrayRef<Metadata *> TypeIds, ArrayRef<GlobalTypeMember *> Globals,
+ ArrayRef<ICallBranchFunnel *> ICallBranchFunnels) {
+ DenseMap<Metadata *, uint64_t> TypeIdIndices;
+ for (unsigned I = 0; I != TypeIds.size(); ++I)
+ TypeIdIndices[TypeIds[I]] = I;
+
+ // For each type identifier, build a set of indices that refer to members of
+ // the type identifier.
+ std::vector<std::set<uint64_t>> TypeMembers(TypeIds.size());
+ unsigned GlobalIndex = 0;
+ DenseMap<GlobalTypeMember *, uint64_t> GlobalIndices;
+ for (GlobalTypeMember *GTM : Globals) {
+ for (MDNode *Type : GTM->types()) {
+ // Type = { offset, type identifier }
+ auto I = TypeIdIndices.find(Type->getOperand(1));
+ if (I != TypeIdIndices.end())
+ TypeMembers[I->second].insert(GlobalIndex);
+ }
+ GlobalIndices[GTM] = GlobalIndex;
+ GlobalIndex++;
+ }
+
+ for (ICallBranchFunnel *JT : ICallBranchFunnels) {
+ TypeMembers.emplace_back();
+ std::set<uint64_t> &TMSet = TypeMembers.back();
+ for (GlobalTypeMember *T : JT->targets())
+ TMSet.insert(GlobalIndices[T]);
+ }
+
+ // Order the sets of indices by size. The GlobalLayoutBuilder works best
+ // when given small index sets first.
+ llvm::stable_sort(TypeMembers, [](const std::set<uint64_t> &O1,
+ const std::set<uint64_t> &O2) {
+ return O1.size() < O2.size();
+ });
+
+ // Create a GlobalLayoutBuilder and provide it with index sets as layout
+ // fragments. The GlobalLayoutBuilder tries to lay out members of fragments as
+ // close together as possible.
+ GlobalLayoutBuilder GLB(Globals.size());
+ for (auto &&MemSet : TypeMembers)
+ GLB.addFragment(MemSet);
+
+ // Build a vector of globals with the computed layout.
+ bool IsGlobalSet =
+ Globals.empty() || isa<GlobalVariable>(Globals[0]->getGlobal());
+ std::vector<GlobalTypeMember *> OrderedGTMs(Globals.size());
+ auto OGTMI = OrderedGTMs.begin();
+ for (auto &&F : GLB.Fragments) {
+ for (auto &&Offset : F) {
+ if (IsGlobalSet != isa<GlobalVariable>(Globals[Offset]->getGlobal()))
+ report_fatal_error("Type identifier may not contain both global "
+ "variables and functions");
+ *OGTMI++ = Globals[Offset];
+ }
+ }
+
+ // Build the bitsets from this disjoint set.
+ if (IsGlobalSet)
+ buildBitSetsFromGlobalVariables(TypeIds, OrderedGTMs);
+ else
+ buildBitSetsFromFunctions(TypeIds, OrderedGTMs);
+}
+
+/// Lower all type tests in this module.
+LowerTypeTestsModule::LowerTypeTestsModule(
+ Module &M, ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+ DropTypeTests(DropTypeTests) {
+ assert(!(ExportSummary && ImportSummary));
+ Triple TargetTriple(M.getTargetTriple());
+ Arch = TargetTriple.getArch();
+ OS = TargetTriple.getOS();
+ ObjectFormat = TargetTriple.getObjectFormat();
+}
+
+bool LowerTypeTestsModule::runForTesting(Module &M) {
+ ModuleSummaryIndex Summary(/*HaveGVs=*/false);
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+
+ bool Changed =
+ LowerTypeTestsModule(
+ M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr,
+ /*DropTypeTests*/ false)
+ .lower();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
+ ": ");
+ std::error_code EC;
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
+ ExitOnErr(errorCodeToError(EC));
+
+ yaml::Output Out(OS);
+ Out << Summary;
+ }
+
+ return Changed;
+}
+
+static bool isDirectCall(Use& U) {
+ auto *Usr = dyn_cast<CallInst>(U.getUser());
+ if (Usr) {
+ auto *CB = dyn_cast<CallBase>(Usr);
+ if (CB && CB->isCallee(&U))
+ return true;
+ }
+ return false;
+}
+
+void LowerTypeTestsModule::replaceCfiUses(Function *Old, Value *New,
+ bool IsJumpTableCanonical) {
+ SmallSetVector<Constant *, 4> Constants;
+ auto UI = Old->use_begin(), E = Old->use_end();
+ for (; UI != E;) {
+ Use &U = *UI;
+ ++UI;
+
+ // Skip block addresses
+ if (isa<BlockAddress>(U.getUser()))
+ continue;
+
+ // Skip direct calls to externally defined or non-dso_local functions
+ if (isDirectCall(U) && (Old->isDSOLocal() || !IsJumpTableCanonical))
+ continue;
+
+    // Constants must be handled specially; we cannot call replaceUsesOfWith on
+    // a constant because constants are uniqued.
+ if (auto *C = dyn_cast<Constant>(U.getUser())) {
+ if (!isa<GlobalValue>(C)) {
+ // Save unique users to avoid processing operand replacement
+ // more than once.
+ Constants.insert(C);
+ continue;
+ }
+ }
+
+ U.set(New);
+ }
+
+ // Process operand replacement of saved constants.
+ for (auto *C : Constants)
+ C->handleOperandChange(Old, New);
+}
+
+void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
+ Old->replaceUsesWithIf(New, [](Use &U) { return isDirectCall(U); });
+}
+
+bool LowerTypeTestsModule::lower() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+
+ if (DropTypeTests && TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ // Find and erase llvm.assume intrinsics for this llvm.type.test call.
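+      // The assumes consume the test's i1 result, so they must be erased
+      // before the llvm.type.test call itself can be removed.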
+ for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) {
+ if (auto *AssumeCI = dyn_cast<CallInst>((*CIU++).getUser())) {
+ Function *F = AssumeCI->getCalledFunction();
+ if (F && F->getIntrinsicID() == Intrinsic::assume)
+ AssumeCI->eraseFromParent();
+ }
+ }
+ CI->eraseFromParent();
+ }
+
+ // We have deleted the type intrinsics, so we no longer have enough
+ // information to reason about the liveness of virtual function pointers
+ // in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ return true;
+ }
+
+ // If only some of the modules were split, we cannot correctly perform
+  // this transformation. We already checked for the presence of type tests
+ // with partially split modules during the thin link, and would have emitted
+ // an error if any were found, so here we can simply return.
+ if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
+ (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
+ return false;
+
+ Function *ICallBranchFunnelFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
+ if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
+ (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) &&
+ !ExportSummary && !ImportSummary)
+ return false;
+
+ if (ImportSummary) {
+ if (TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ importTypeTest(CI);
+ }
+ }
+
+ if (ICallBranchFunnelFunc && !ICallBranchFunnelFunc->use_empty())
+ report_fatal_error(
+ "unexpected call to llvm.icall.branch.funnel during import phase");
+
+ SmallVector<Function *, 8> Defs;
+ SmallVector<Function *, 8> Decls;
+ for (auto &F : M) {
+      // CFI functions are either external or promoted. A local function may
+      // have the same name, but it's not the one we are looking for.
+ if (F.hasLocalLinkage())
+ continue;
+ if (ImportSummary->cfiFunctionDefs().count(std::string(F.getName())))
+ Defs.push_back(&F);
+ else if (ImportSummary->cfiFunctionDecls().count(
+ std::string(F.getName())))
+ Decls.push_back(&F);
+ }
+
+ std::vector<GlobalAlias *> AliasesToErase;
+ {
+ ScopedSaveAliaseesAndUsed S(M);
+ for (auto F : Defs)
+ importFunction(F, /*isJumpTableCanonical*/ true, AliasesToErase);
+ for (auto F : Decls)
+ importFunction(F, /*isJumpTableCanonical*/ false, AliasesToErase);
+ }
+ for (GlobalAlias *GA : AliasesToErase)
+ GA->eraseFromParent();
+
+ return true;
+ }
+
+ // Equivalence class set containing type identifiers and the globals that
+ // reference them. This is used to partition the set of type identifiers in
+ // the module into disjoint sets.
+ using GlobalClassesTy = EquivalenceClasses<
+ PointerUnion<GlobalTypeMember *, Metadata *, ICallBranchFunnel *>>;
+ GlobalClassesTy GlobalClasses;
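For orientation, llvm::EquivalenceClasses is a union-find container: unionSets merges the classes of its two arguments, and iteration visits one leader per disjoint set. A small self-contained sketch, with integers standing in for the Metadata*/GlobalTypeMember* pointers used here:

#include "llvm/ADT/EquivalenceClasses.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  EquivalenceClasses<int> EC;
  EC.unionSets(1, 2); // {1, 2}
  EC.unionSets(3, 4); // {3, 4}
  EC.unionSets(2, 3); // {1, 2, 3, 4}
  EC.insert(5);       // {5} stays a singleton class
  for (auto I = EC.begin(), E = EC.end(); I != E; ++I) {
    if (!I->isLeader())
      continue; // visit each disjoint set exactly once
    outs() << "class:";
    for (auto MI = EC.member_begin(I); MI != EC.member_end(); ++MI)
      outs() << ' ' << *MI;
    outs() << '\n';
  }
  return 0;
}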
+
+ // Verify the type metadata and build a few data structures to let us
+ // efficiently enumerate the type identifiers associated with a global:
+ // a list of GlobalTypeMembers (a GlobalObject stored alongside a vector
+ // of associated type metadata) and a mapping from type identifiers to their
+ // list of GlobalTypeMembers and last observed index in the list of globals.
+ // The indices will be used later to deterministically order the list of type
+ // identifiers.
+ BumpPtrAllocator Alloc;
+ struct TIInfo {
+ unsigned UniqueId;
+ std::vector<GlobalTypeMember *> RefGlobals;
+ };
+ DenseMap<Metadata *, TIInfo> TypeIdInfo;
+ unsigned CurUniqueId = 0;
+ SmallVector<MDNode *, 2> Types;
+
+ // Cross-DSO CFI emits jumptable entries for exported functions as well as
+ // address taken functions in case they are address taken in other modules.
+ const bool CrossDsoCfi = M.getModuleFlag("Cross-DSO CFI") != nullptr;
+
+ struct ExportedFunctionInfo {
+ CfiFunctionLinkage Linkage;
+ MDNode *FuncMD; // {name, linkage, type[, type...]}
+ };
+ DenseMap<StringRef, ExportedFunctionInfo> ExportedFunctions;
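As a hedged illustration of the {name, linkage, type[, type...]} layout parsed below: a sketch that builds one such operand by hand. The function name, the type-id string, and the use of 0 for CFL_Definition are assumptions made for the example, not values taken from this patch.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Append one entry to the module-level "cfi.functions" named metadata,
// mirroring the layout the loop below reads back.
static void addCfiFunctionEntry(Module &M) {
  LLVMContext &Ctx = M.getContext();
  NamedMDNode *CfiFunctionsMD = M.getOrInsertNamedMetadata("cfi.functions");
  Metadata *TypeEntry = MDNode::get(
      Ctx, {ConstantAsMetadata::get(ConstantInt::get(Type::getInt64Ty(Ctx), 0)),
            MDString::get(Ctx, "_ZTSFvvE")}); // hypothetical type id
  Metadata *Ops[] = {
      MDString::get(Ctx, "my_exported_fn"), // name (hypothetical)
      ConstantAsMetadata::get(
          ConstantInt::get(Type::getInt8Ty(Ctx), 0)), // 0 == CFL_Definition
      TypeEntry};
  CfiFunctionsMD->addOperand(MDNode::get(Ctx, Ops));
}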
+ if (ExportSummary) {
+ // A set of all functions that are address taken by a live global object.
+ DenseSet<GlobalValue::GUID> AddressTaken;
+ for (auto &I : *ExportSummary)
+ for (auto &GVS : I.second.SummaryList)
+ if (GVS->isLive())
+ for (auto &Ref : GVS->refs())
+ AddressTaken.insert(Ref.getGUID());
+
+ NamedMDNode *CfiFunctionsMD = M.getNamedMetadata("cfi.functions");
+ if (CfiFunctionsMD) {
+ for (auto FuncMD : CfiFunctionsMD->operands()) {
+ assert(FuncMD->getNumOperands() >= 2);
+ StringRef FunctionName =
+ cast<MDString>(FuncMD->getOperand(0))->getString();
+ CfiFunctionLinkage Linkage = static_cast<CfiFunctionLinkage>(
+ cast<ConstantAsMetadata>(FuncMD->getOperand(1))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ const GlobalValue::GUID GUID = GlobalValue::getGUID(
+ GlobalValue::dropLLVMManglingEscape(FunctionName));
+ // Do not emit jumptable entries for functions that are not-live and
+ // have no live references (and are not exported with cross-DSO CFI.)
+ if (!ExportSummary->isGUIDLive(GUID))
+ continue;
+ if (!AddressTaken.count(GUID)) {
+ if (!CrossDsoCfi || Linkage != CFL_Definition)
+ continue;
+
+ bool Exported = false;
+ if (auto VI = ExportSummary->getValueInfo(GUID))
+ for (auto &GVS : VI.getSummaryList())
+ if (GVS->isLive() && !GlobalValue::isLocalLinkage(GVS->linkage()))
+ Exported = true;
+
+ if (!Exported)
+ continue;
+ }
+ auto P = ExportedFunctions.insert({FunctionName, {Linkage, FuncMD}});
+ if (!P.second && P.first->second.Linkage != CFL_Definition)
+ P.first->second = {Linkage, FuncMD};
+ }
+
+ for (const auto &P : ExportedFunctions) {
+ StringRef FunctionName = P.first;
+ CfiFunctionLinkage Linkage = P.second.Linkage;
+ MDNode *FuncMD = P.second.FuncMD;
+ Function *F = M.getFunction(FunctionName);
+ if (F && F->hasLocalLinkage()) {
+ // Locally defined function that happens to have the same name as a
+ // function defined in a ThinLTO module. Rename it to move it out of
+ // the way of the external reference that we're about to create.
+ // Note that setName will find a unique name for the function, so even
+ // if there is an existing function with the suffix there won't be a
+ // name collision.
+ F->setName(F->getName() + ".1");
+ F = nullptr;
+ }
+
+ if (!F)
+ F = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalVariable::ExternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(), FunctionName, &M);
+
+ // If the function is available_externally, remove its definition so
+ // that it is handled the same way as a declaration. Later we will try
+ // to create an alias using this function's linkage, which will fail if
+ // the linkage is available_externally. This will also result in us
+ // following the code path below to replace the type metadata.
+ if (F->hasAvailableExternallyLinkage()) {
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ F->deleteBody();
+ F->setComdat(nullptr);
+ F->clearMetadata();
+ }
+
+ // Update the linkage for extern_weak declarations when a definition
+ // exists.
+ if (Linkage == CFL_Definition && F->hasExternalWeakLinkage())
+ F->setLinkage(GlobalValue::ExternalLinkage);
+
+ // If the function in the full LTO module is a declaration, replace its
+ // type metadata with the type metadata we found in cfi.functions. That
+ // metadata is presumed to be more accurate than the metadata attached
+ // to the declaration.
+ if (F->isDeclaration()) {
+ if (Linkage == CFL_WeakDeclaration)
+ F->setLinkage(GlobalValue::ExternalWeakLinkage);
+
+ F->eraseMetadata(LLVMContext::MD_type);
+ for (unsigned I = 2; I < FuncMD->getNumOperands(); ++I)
+ F->addMetadata(LLVMContext::MD_type,
+ *cast<MDNode>(FuncMD->getOperand(I).get()));
+ }
+ }
+ }
+ }
+
+ DenseMap<GlobalObject *, GlobalTypeMember *> GlobalTypeMembers;
+ for (GlobalObject &GO : M.global_objects()) {
+ if (isa<GlobalVariable>(GO) && GO.isDeclarationForLinker())
+ continue;
+
+ Types.clear();
+ GO.getMetadata(LLVMContext::MD_type, Types);
+
+ bool IsJumpTableCanonical = false;
+ bool IsExported = false;
+ if (Function *F = dyn_cast<Function>(&GO)) {
+ IsJumpTableCanonical = isJumpTableCanonical(F);
+ if (ExportedFunctions.count(F->getName())) {
+ IsJumpTableCanonical |=
+ ExportedFunctions[F->getName()].Linkage == CFL_Definition;
+ IsExported = true;
+ // TODO: The logic here checks only that the function is address taken,
+ // not that the address takers are live. This can be updated to check
+ // their liveness and emit fewer jumptable entries once monolithic LTO
+ // builds also emit summaries.
+ } else if (!F->hasAddressTaken()) {
+ if (!CrossDsoCfi || !IsJumpTableCanonical || F->hasLocalLinkage())
+ continue;
+ }
+ }
+
+ auto *GTM = GlobalTypeMember::create(Alloc, &GO, IsJumpTableCanonical,
+ IsExported, Types);
+ GlobalTypeMembers[&GO] = GTM;
+ for (MDNode *Type : Types) {
+ verifyTypeMDNode(&GO, Type);
+ auto &Info = TypeIdInfo[Type->getOperand(1)];
+ Info.UniqueId = ++CurUniqueId;
+ Info.RefGlobals.push_back(GTM);
+ }
+ }
+
+ auto AddTypeIdUse = [&](Metadata *TypeId) -> TypeIdUserInfo & {
+ // Add the call site to the list of call sites for this type identifier. We
+ // also use TypeIdUsers to keep track of whether we have seen this type
+ // identifier before. If we have, we don't need to re-add the referenced
+ // globals to the equivalence class.
+ auto Ins = TypeIdUsers.insert({TypeId, {}});
+ if (Ins.second) {
+ // Add the type identifier to the equivalence class.
+ GlobalClassesTy::iterator GCI = GlobalClasses.insert(TypeId);
+ GlobalClassesTy::member_iterator CurSet = GlobalClasses.findLeader(GCI);
+
+ // Add the referenced globals to the type identifier's equivalence class.
+ for (GlobalTypeMember *GTM : TypeIdInfo[TypeId].RefGlobals)
+ CurSet = GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(GlobalClasses.insert(GTM)));
+ }
+
+ return Ins.first->second;
+ };
+
+ if (TypeTestFunc) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+
+ auto TypeIdMDVal = dyn_cast<MetadataAsValue>(CI->getArgOperand(1));
+ if (!TypeIdMDVal)
+ report_fatal_error("Second argument of llvm.type.test must be metadata");
+ auto TypeId = TypeIdMDVal->getMetadata();
+ AddTypeIdUse(TypeId).CallSites.push_back(CI);
+ }
+ }
+
+ if (ICallBranchFunnelFunc) {
+ for (const Use &U : ICallBranchFunnelFunc->uses()) {
+ if (Arch != Triple::x86_64)
+ report_fatal_error(
+ "llvm.icall.branch.funnel not supported on this target");
+
+ auto CI = cast<CallInst>(U.getUser());
+
+ std::vector<GlobalTypeMember *> Targets;
+ if (CI->getNumArgOperands() % 2 != 1)
+ report_fatal_error("number of arguments should be odd");
+
+ GlobalClassesTy::member_iterator CurSet;
+ for (unsigned I = 1; I != CI->getNumArgOperands(); I += 2) {
+ int64_t Offset;
+ auto *Base = dyn_cast<GlobalObject>(GetPointerBaseWithConstantOffset(
+ CI->getOperand(I), Offset, M.getDataLayout()));
+ if (!Base)
+ report_fatal_error(
+ "Expected branch funnel operand to be global value");
+
+ GlobalTypeMember *GTM = GlobalTypeMembers[Base];
+ Targets.push_back(GTM);
+ GlobalClassesTy::member_iterator NewSet =
+ GlobalClasses.findLeader(GlobalClasses.insert(GTM));
+ if (I == 1)
+ CurSet = NewSet;
+ else
+ CurSet = GlobalClasses.unionSets(CurSet, NewSet);
+ }
+
+ GlobalClasses.unionSets(
+ CurSet, GlobalClasses.findLeader(
+ GlobalClasses.insert(ICallBranchFunnel::create(
+ Alloc, CI, Targets, ++CurUniqueId))));
+ }
+ }
+
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdInfo) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ if (!ExportSummary->isGlobalValueLive(S.get()))
+ continue;
+ if (auto *FS = dyn_cast<FunctionSummary>(S->getBaseObject()))
+ for (GlobalValue::GUID G : FS->type_tests())
+ for (Metadata *MD : MetadataByGUID[G])
+ AddTypeIdUse(MD).IsExported = true;
+ }
+ }
+ }
+
+ if (GlobalClasses.empty())
+ return false;
+
+ // Build a list of disjoint sets ordered by their maximum global index for
+ // determinism.
+ std::vector<std::pair<GlobalClassesTy::iterator, unsigned>> Sets;
+ for (GlobalClassesTy::iterator I = GlobalClasses.begin(),
+ E = GlobalClasses.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+ ++NumTypeIdDisjointSets;
+
+ unsigned MaxUniqueId = 0;
+ for (GlobalClassesTy::member_iterator MI = GlobalClasses.member_begin(I);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if (auto *MD = MI->dyn_cast<Metadata *>())
+ MaxUniqueId = std::max(MaxUniqueId, TypeIdInfo[MD].UniqueId);
+ else if (auto *BF = MI->dyn_cast<ICallBranchFunnel *>())
+ MaxUniqueId = std::max(MaxUniqueId, BF->UniqueId);
+ }
+ Sets.emplace_back(I, MaxUniqueId);
+ }
+ llvm::sort(Sets,
+ [](const std::pair<GlobalClassesTy::iterator, unsigned> &S1,
+ const std::pair<GlobalClassesTy::iterator, unsigned> &S2) {
+ return S1.second < S2.second;
+ });
+
+ // For each disjoint set we found...
+ for (const auto &S : Sets) {
+ // Build the list of type identifiers in this disjoint set.
+ std::vector<Metadata *> TypeIds;
+ std::vector<GlobalTypeMember *> Globals;
+ std::vector<ICallBranchFunnel *> ICallBranchFunnels;
+ for (GlobalClassesTy::member_iterator MI =
+ GlobalClasses.member_begin(S.first);
+ MI != GlobalClasses.member_end(); ++MI) {
+ if (MI->is<Metadata *>())
+ TypeIds.push_back(MI->get<Metadata *>());
+ else if (MI->is<GlobalTypeMember *>())
+ Globals.push_back(MI->get<GlobalTypeMember *>());
+ else
+ ICallBranchFunnels.push_back(MI->get<ICallBranchFunnel *>());
+ }
+
+ // Order type identifiers by unique ID for determinism. This ordering is
+ // stable as there is a one-to-one mapping between metadata and unique IDs.
+ llvm::sort(TypeIds, [&](Metadata *M1, Metadata *M2) {
+ return TypeIdInfo[M1].UniqueId < TypeIdInfo[M2].UniqueId;
+ });
+
+ // Same for the branch funnels.
+ llvm::sort(ICallBranchFunnels,
+ [&](ICallBranchFunnel *F1, ICallBranchFunnel *F2) {
+ return F1->UniqueId < F2->UniqueId;
+ });
+
+ // Build bitsets for this disjoint set.
+ buildBitSetsFromDisjointSet(TypeIds, Globals, ICallBranchFunnels);
+ }
+
+ allocateByteArrays();
+
+ // Parse alias data to replace stand-in function declarations for aliases
+ // with an alias to the intended target.
+ if (ExportSummary) {
+ if (NamedMDNode *AliasesMD = M.getNamedMetadata("aliases")) {
+ for (auto AliasMD : AliasesMD->operands()) {
+ assert(AliasMD->getNumOperands() >= 4);
+ StringRef AliasName =
+ cast<MDString>(AliasMD->getOperand(0))->getString();
+ StringRef Aliasee = cast<MDString>(AliasMD->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(Aliasee) ||
+ ExportedFunctions[Aliasee].Linkage != CFL_Definition ||
+ !M.getNamedAlias(Aliasee))
+ continue;
+
+ GlobalValue::VisibilityTypes Visibility =
+ static_cast<GlobalValue::VisibilityTypes>(
+ cast<ConstantAsMetadata>(AliasMD->getOperand(2))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+ bool Weak =
+ static_cast<bool>(cast<ConstantAsMetadata>(AliasMD->getOperand(3))
+ ->getValue()
+ ->getUniqueInteger()
+ .getZExtValue());
+
+ auto *Alias = GlobalAlias::create("", M.getNamedAlias(Aliasee));
+ Alias->setVisibility(Visibility);
+ if (Weak)
+ Alias->setLinkage(GlobalValue::WeakAnyLinkage);
+
+ if (auto *F = M.getFunction(AliasName)) {
+ Alias->takeName(F);
+ F->replaceAllUsesWith(Alias);
+ F->eraseFromParent();
+ } else {
+ Alias->setName(AliasName);
+ }
+ }
+ }
+ }
+
+ // Emit .symver directives for exported functions, if they exist.
+ if (ExportSummary) {
+ if (NamedMDNode *SymversMD = M.getNamedMetadata("symvers")) {
+ for (auto Symver : SymversMD->operands()) {
+ assert(Symver->getNumOperands() >= 2);
+ StringRef SymbolName =
+ cast<MDString>(Symver->getOperand(0))->getString();
+ StringRef Alias = cast<MDString>(Symver->getOperand(1))->getString();
+
+ if (!ExportedFunctions.count(SymbolName))
+ continue;
+
+ M.appendModuleInlineAsm(
+ (llvm::Twine(".symver ") + SymbolName + ", " + Alias).str());
+ }
+ }
+ }
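The .symver emission above relies only on module-level inline assembly. A minimal sketch of that mechanism, with hypothetical symbol names, assuming LLVM 12 headers:

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("symver_demo", Ctx);
  // The directive is stored as module asm and emitted verbatim into the
  // object file, binding foo_impl to the versioned name foo@@V1.
  M.appendModuleInlineAsm(".symver foo_impl, foo@@V1");
  M.print(outs(), /*AAW=*/nullptr);
  return 0;
}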
+
+ return true;
+}
+
+PreservedAnalyses LowerTypeTestsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
bool Changed;
if (UseCommandLine)
Changed = LowerTypeTestsModule::runForTesting(M);
@@ -2262,7 +2262,7 @@ PreservedAnalyses LowerTypeTestsPass::run(Module &M,
Changed =
LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
.lower();
- if (!Changed)
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
index 7ec8de7715..ec5d86b72a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/MergeFunctions.cpp
@@ -1,955 +1,955 @@
-//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass looks for equivalent functions that are mergeable and folds them.
-//
-// An order relation is defined on the set of functions. It is established
-// through a special function comparison procedure that returns
-// 0 when the functions are equal,
-// -1 when the left function is less than the right function, and
-// 1 in the opposite case. We need a total ordering, so we need to maintain
-// four properties on the functions set:
-// a <= a (reflexivity)
-// if a <= b and b <= a then a = b (antisymmetry)
-// if a <= b and b <= c then a <= c (transitivity).
-// for all a and b: a <= b or b <= a (totality).
-//
-// The comparison iterates through each instruction in each basic block.
-// Functions are kept in a binary tree. For each new function F we perform
-// a lookup in the binary tree.
-// In practice it works the following way:
-// -- We define a Function* container class with a custom "operator<"
-//    (FunctionPtr).
-// -- "FunctionPtr" instances are stored in a std::set, so every
-//    std::set::insert operation gives a result in log(N) time.
-//
-// As an optimization, a hash of the function structure is calculated first, and
-// two functions are only compared if they have the same hash. This hash is
-// cheap to compute, and has the property that if function F == G according to
-// the comparison function, then hash(F) == hash(G). This consistency property
-// is critical to ensuring all possible merging opportunities are exploited.
-// Collisions in the hash affect the speed of the pass but not the correctness
-// or determinism of the resulting transformation.
-//
-// When a match is found the functions are folded. If both functions are
-// overridable, we move the functionality into a new internal function and
-// leave two overridable thunks to it.
-//
-//===----------------------------------------------------------------------===//
-//
-// Future work:
-//
-// * virtual functions.
-//
-// Many functions have their address taken by the virtual function table for
-// the object they belong to. However, as long as it's only used for a lookup
-// and call, this is irrelevant, and we'd like to fold such functions.
-//
-// * be smarter about bitcasts.
-//
-// In order to fold functions, we will sometimes add either bitcast instructions
-// or bitcast constant expressions. Unfortunately, this can confound further
-// analysis since the two functions differ where one has a bitcast and the
-// other doesn't. We should learn to look through bitcasts.
-//
-// * Compare complex types with pointer types inside.
-// * Compare cross-reference cases.
-// * Compare complex expressions.
-//
-// All three issues above could be described as the ability to prove that
-// fA == fB == fC == fE == fF == fG in the example below:
-//
-// void fA() {
-// fB();
-// }
-// void fB() {
-// fA();
-// }
-//
-// void fE() {
-// fF();
-// }
-// void fF() {
-// fG();
-// }
-// void fG() {
-// fE();
-// }
-//
-// The simplest cross-reference case (fA <--> fB) was implemented in previous
-// versions of MergeFunctions, though it occurred in only two function pairs
-// in the test-suite (which counts >50k functions).
-// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A)
-// could cover many more cases.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/MergeFunctions.h"
-#include "llvm/Transforms/Utils/FunctionComparator.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mergefunc"
-
-STATISTIC(NumFunctionsMerged, "Number of functions merged");
-STATISTIC(NumThunksWritten, "Number of thunks generated");
-STATISTIC(NumAliasesWritten, "Number of aliases generated");
-STATISTIC(NumDoubleWeak, "Number of new functions created");
-
-static cl::opt<unsigned> NumFunctionsForSanityCheck(
- "mergefunc-sanity",
- cl::desc("How many functions in module could be used for "
- "MergeFunctions pass sanity check. "
- "'0' disables this check. Works only with '-debug' key."),
- cl::init(0), cl::Hidden);
-
-// Under option -mergefunc-preserve-debug-info we:
-// - Do not create a new function for a thunk.
-// - Retain the debug info for a thunk's parameters (and associated
-// instructions for the debug info) from the entry block.
-// Note: -debug will display the algorithm at work.
-// - Create debug-info for the call (to the shared implementation) made by
-// a thunk and its return value.
-// - Erase the rest of the function, retaining the (minimally sized) entry
-// block to create a thunk.
-// - Preserve a thunk's call site to point to the thunk even when both occur
-// within the same translation unit, to aid debuggability. Note that this
-// behaviour differs from the underlying -mergefunc implementation which
-// modifies the thunk's call site to point to the shared implementation
-// when both occur within the same translation unit.
-static cl::opt<bool>
- MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
- cl::init(false),
- cl::desc("Preserve debug info in thunk when mergefunc "
- "transformations are made."));
-
-static cl::opt<bool>
- MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden,
- cl::init(false),
- cl::desc("Allow mergefunc to create aliases"));
-
-namespace {
-
-class FunctionNode {
- mutable AssertingVH<Function> F;
- FunctionComparator::FunctionHash Hash;
-
-public:
- // Note the hash is recalculated potentially multiple times, but it is cheap.
- FunctionNode(Function *F)
- : F(F), Hash(FunctionComparator::functionHash(*F)) {}
-
- Function *getFunc() const { return F; }
- FunctionComparator::FunctionHash getHash() const { return Hash; }
-
- /// Replace the reference to the function F by the function G, assuming their
- /// implementations are equal.
- void replaceBy(Function *G) const {
- F = G;
- }
-};
-
-/// MergeFunctions finds functions which will generate identical machine code,
-/// by considering all pointer types to be equivalent. Once identified,
-/// MergeFunctions will fold them by replacing a call to one with a call to a
-/// bitcast of the other.
-class MergeFunctions {
-public:
- MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) {
- }
-
- bool runOnModule(Module &M);
-
-private:
- // The function comparison operator is provided here so that FunctionNodes do
- // not need to become larger with another pointer.
- class FunctionNodeCmp {
- GlobalNumberState* GlobalNumbers;
-
- public:
- FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
-
- bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
- // Order first by hashes, then full function comparison.
- if (LHS.getHash() != RHS.getHash())
- return LHS.getHash() < RHS.getHash();
- FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers);
- return FCmp.compare() == -1;
- }
- };
- using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>;
-
- GlobalNumberState GlobalNumbers;
-
- /// A work queue of functions that may have been modified and should be
- /// analyzed again.
- std::vector<WeakTrackingVH> Deferred;
-
-#ifndef NDEBUG
- /// Checks the rules of the order relation introduced on the function set.
- /// Returns true if the sanity check passed, and false if it failed.
- bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
-#endif
-
- /// Insert a ComparableFunction into the FnTree, or merge it away if it's
- /// equal to one that's already present.
- bool insert(Function *NewFunction);
-
- /// Remove a Function from the FnTree and queue it up for a second sweep of
- /// analysis.
- void remove(Function *F);
-
- /// Find the functions that use this Value and remove them from FnTree and
- /// queue the functions.
- void removeUsers(Value *V);
-
- /// Replace all direct calls of Old with calls of New. Will bitcast New if
- /// necessary to make types match.
- void replaceDirectCallers(Function *Old, Function *New);
-
- /// Merge two equivalent functions. Upon completion, G may be deleted, or may
- /// be converted into a thunk. In either case, it should never be visited
- /// again.
- void mergeTwoFunctions(Function *F, Function *G);
-
- /// Fill PDIUnrelatedWL with instructions from the entry block that are
- /// unrelated to parameter related debug info.
- void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
- std::vector<Instruction *> &PDIUnrelatedWL);
-
- /// Erase the rest of the CFG (i.e. barring the entry block).
- void eraseTail(Function *G);
-
- /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
- /// parameter debug info, from the entry block.
- void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
-
- /// Replace G with a simple tail call to bitcast(F). Also (unless
- /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
- /// delete G.
- void writeThunk(Function *F, Function *G);
-
- // Replace G with an alias to F (deleting function G)
- void writeAlias(Function *F, Function *G);
-
- // Replace G with an alias to F if possible, otherwise with a thunk to F.
- // Returns false if neither is possible.
- bool writeThunkOrAlias(Function *F, Function *G);
-
- /// Replace function F with function G in the function tree.
- void replaceFunctionInTree(const FunctionNode &FN, Function *G);
-
- /// The set of all distinct functions. Use the insert() and remove() methods
- /// to modify it. The map allows efficient lookup and deferring of Functions.
- FnTreeType FnTree;
-
- // Map functions to the iterators of the FunctionNode which contains them
- // in the FnTree. This must be updated carefully whenever the FnTree is
- // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
- // dangling iterators into FnTree. The invariant that preserves this is that
- // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
- DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
-};
-
-class MergeFunctionsLegacyPass : public ModulePass {
-public:
- static char ID;
-
- MergeFunctionsLegacyPass(): ModulePass(ID) {
- initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- MergeFunctions MF;
- return MF.runOnModule(M);
- }
-};
-
-} // end anonymous namespace
-
-char MergeFunctionsLegacyPass::ID = 0;
-INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc",
- "Merge Functions", false, false)
-
-ModulePass *llvm::createMergeFunctionsPass() {
- return new MergeFunctionsLegacyPass();
-}
-
-PreservedAnalyses MergeFunctionsPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- MergeFunctions MF;
- if (!MF.runOnModule(M))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-#ifndef NDEBUG
-bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
- if (const unsigned Max = NumFunctionsForSanityCheck) {
- unsigned TripleNumber = 0;
- bool Valid = true;
-
- dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
-
- unsigned i = 0;
- for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
- E = Worklist.end();
- I != E && i < Max; ++I, ++i) {
- unsigned j = i;
- for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
- ++J, ++j) {
- Function *F1 = cast<Function>(*I);
- Function *F2 = cast<Function>(*J);
- int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
- int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();
-
- // If F1 <= F2, then F2 >= F1, otherwise report failure.
- if (Res1 != -Res2) {
- dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
- << "\n";
- dbgs() << *F1 << '\n' << *F2 << '\n';
- Valid = false;
- }
-
- if (Res1 == 0)
- continue;
-
- unsigned k = j;
- for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
- ++k, ++K, ++TripleNumber) {
- if (K == J)
- continue;
-
- Function *F3 = cast<Function>(*K);
- int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare();
- int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();
-
- bool Transitive = true;
-
- if (Res1 != 0 && Res1 == Res4) {
- // F1 > F2, F2 > F3 => F1 > F3
- Transitive = Res3 == Res1;
- } else if (Res3 != 0 && Res3 == -Res4) {
- // F1 > F3, F3 > F2 => F1 > F2
- Transitive = Res3 == Res1;
- } else if (Res4 != 0 && -Res3 == Res4) {
- // F2 > F3, F3 > F1 => F2 > F1
- Transitive = Res4 == -Res1;
- }
-
- if (!Transitive) {
- dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
- << TripleNumber << "\n";
- dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
- << Res4 << "\n";
- dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
- Valid = false;
- }
- }
- }
- }
-
- dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
- return Valid;
- }
- return true;
-}
-#endif
-
-/// Check whether \p F is eligible for function merging.
-static bool isEligibleForMerging(Function &F) {
- return !F.isDeclaration() && !F.hasAvailableExternallyLinkage();
-}
-
-bool MergeFunctions::runOnModule(Module &M) {
- bool Changed = false;
-
- // All functions in the module, ordered by hash. Functions with a unique
- // hash value are easily eliminated.
- std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
- HashedFuncs;
- for (Function &Func : M) {
- if (isEligibleForMerging(Func)) {
- HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
- }
- }
-
- llvm::stable_sort(HashedFuncs, less_first());
-
- auto S = HashedFuncs.begin();
- for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
- // If the hash value matches the previous value or the next one, we must
- // consider merging it. Otherwise it is dropped and never considered again.
- if ((I != S && std::prev(I)->first == I->first) ||
- (std::next(I) != IE && std::next(I)->first == I->first) ) {
- Deferred.push_back(WeakTrackingVH(I->second));
- }
- }
-
- do {
- std::vector<WeakTrackingVH> Worklist;
- Deferred.swap(Worklist);
-
- LLVM_DEBUG(doSanityCheck(Worklist));
-
- LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
- LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
-
- // Insert functions and merge them.
- for (WeakTrackingVH &I : Worklist) {
- if (!I)
- continue;
- Function *F = cast<Function>(I);
- if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
- Changed |= insert(F);
- }
- }
- LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
- } while (!Deferred.empty());
-
- FnTree.clear();
- FNodesInTree.clear();
- GlobalNumbers.clear();
-
- return Changed;
-}
-
-// Replace direct callers of Old with New.
-void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
- Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
- for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
- Use *U = &*UI;
- ++UI;
- CallBase *CB = dyn_cast<CallBase>(U->getUser());
- if (CB && CB->isCallee(U)) {
- // Do not copy attributes from the called function to the call-site.
- // Function comparison ensures that the attributes are the same up to
- // type congruences in byval(), in which case we need to keep the byval
- // type of the call-site, not the callee function.
- remove(CB->getFunction());
- U->set(BitcastNew);
- }
- }
-}
-
-// Helper for writeThunk: selects the proper bitcast operation,
-// but is a bit simpler than CastInst::getCastOpcode.
-static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
- Type *SrcTy = V->getType();
- if (SrcTy->isStructTy()) {
- assert(DestTy->isStructTy());
- assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
- Value *Result = UndefValue::get(DestTy);
- for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
- Value *Element = createCast(
- Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
- DestTy->getStructElementType(I));
-
- Result =
- Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
- }
- return Result;
- }
- assert(!DestTy->isStructTy());
- if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
- return Builder.CreateIntToPtr(V, DestTy);
- else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
- return Builder.CreatePtrToInt(V, DestTy);
- else
- return Builder.CreateBitCast(V, DestTy);
-}
-
-// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
-// parameter debug info, from the entry block.
-void MergeFunctions::eraseInstsUnrelatedToPDI(
- std::vector<Instruction *> &PDIUnrelatedWL) {
- LLVM_DEBUG(
- dbgs() << " Erasing instructions (in reverse order of appearance in "
- "entry block) unrelated to parameter debug info from entry "
- "block: {\n");
- while (!PDIUnrelatedWL.empty()) {
- Instruction *I = PDIUnrelatedWL.back();
- LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- I->eraseFromParent();
- PDIUnrelatedWL.pop_back();
- }
- LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
- "debug info from entry block. \n");
-}
-
-// Reduce G to its entry block.
-void MergeFunctions::eraseTail(Function *G) {
- std::vector<BasicBlock *> WorklistBB;
- for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
- BBI != BBE; ++BBI) {
- BBI->dropAllReferences();
- WorklistBB.push_back(&*BBI);
- }
- while (!WorklistBB.empty()) {
- BasicBlock *BB = WorklistBB.back();
- BB->eraseFromParent();
- WorklistBB.pop_back();
- }
-}
-
-// We are interested in the following instructions from the entry block as being
-// related to parameter debug info:
-// - @llvm.dbg.declare
-// - stores from the incoming parameters to locations on the stack-frame
-// - allocas that create these locations on the stack-frame
-// - @llvm.dbg.value
-// - the entry block's terminator
-// The rest are unrelated to debug info for the parameters; fill up
-// PDIUnrelatedWL with such instructions.
-void MergeFunctions::filterInstsUnrelatedToPDI(
- BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
- std::set<Instruction *> PDIRelated;
- for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
- BI != BIE; ++BI) {
- if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
- LLVM_DEBUG(dbgs() << " Deciding: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- DILocalVariable *DILocVar = DVI->getVariable();
- if (DILocVar->isParameter()) {
- LLVM_DEBUG(dbgs() << " Include (parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
- LLVM_DEBUG(dbgs() << " Deciding: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- DILocalVariable *DILocVar = DDI->getVariable();
- if (DILocVar->isParameter()) {
- LLVM_DEBUG(dbgs() << " Parameter: ");
- LLVM_DEBUG(DILocVar->print(dbgs()));
- AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
- if (AI) {
- LLVM_DEBUG(dbgs() << " Processing alloca users: ");
- LLVM_DEBUG(dbgs() << "\n");
- for (User *U : AI->users()) {
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (Value *Arg = SI->getValueOperand()) {
- if (dyn_cast<Argument>(Arg)) {
- LLVM_DEBUG(dbgs() << " Include: ");
- LLVM_DEBUG(AI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(AI);
- LLVM_DEBUG(dbgs() << " Include (parameter): ");
- LLVM_DEBUG(SI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(SI);
- LLVM_DEBUG(dbgs() << " Include: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(SI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- } else {
- LLVM_DEBUG(dbgs() << " Defer: ");
- LLVM_DEBUG(U->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- } else {
- LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else {
- LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
- LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIRelated.insert(&*BI);
- } else {
- LLVM_DEBUG(dbgs() << " Defer: ");
- LLVM_DEBUG(BI->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- LLVM_DEBUG(
- dbgs()
- << " Report parameter debug info related/related instructions: {\n");
- for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
- BI != BE; ++BI) {
-
- Instruction *I = &*BI;
- if (PDIRelated.find(I) == PDIRelated.end()) {
- LLVM_DEBUG(dbgs() << " !PDIRelated: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- PDIUnrelatedWL.push_back(I);
- } else {
- LLVM_DEBUG(dbgs() << " PDIRelated: ");
- LLVM_DEBUG(I->print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- }
- LLVM_DEBUG(dbgs() << " }\n");
-}
-
-/// Whether this function may be replaced by a forwarding thunk.
-static bool canCreateThunkFor(Function *F) {
- if (F->isVarArg())
- return false;
-
- // Don't merge tiny functions using a thunk, since it can just end up
- // making the function larger.
- if (F->size() == 1) {
- if (F->front().size() <= 2) {
- LLVM_DEBUG(dbgs() << "canCreateThunkFor: " << F->getName()
- << " is too small to bother creating a thunk for\n");
- return false;
- }
- }
- return true;
-}
-
-// Replace G with a simple tail call to bitcast(F). Also (unless
-// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
-// delete G. Under MergeFunctionsPDI, we use G itself for creating
-// the thunk as we preserve the debug info (and associated instructions)
-// from G's entry block pertaining to G's incoming arguments which are
-// passed on as corresponding arguments in the call that G makes to F.
-// For better debuggability, under MergeFunctionsPDI, we do not modify G's
-// call sites to point to F even when within the same translation unit.
-void MergeFunctions::writeThunk(Function *F, Function *G) {
- BasicBlock *GEntryBlock = nullptr;
- std::vector<Instruction *> PDIUnrelatedWL;
- BasicBlock *BB = nullptr;
- Function *NewG = nullptr;
- if (MergeFunctionsPDI) {
- LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
- "function as thunk; retain original: "
- << G->getName() << "()\n");
- GEntryBlock = &G->getEntryBlock();
- LLVM_DEBUG(
- dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
- "debug info for "
- << G->getName() << "() {\n");
- filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
- GEntryBlock->getTerminator()->eraseFromParent();
- BB = GEntryBlock;
- } else {
- NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
- G->getAddressSpace(), "", G->getParent());
- NewG->setComdat(G->getComdat());
- BB = BasicBlock::Create(F->getContext(), "", NewG);
- }
-
- IRBuilder<> Builder(BB);
- Function *H = MergeFunctionsPDI ? G : NewG;
- SmallVector<Value *, 16> Args;
- unsigned i = 0;
- FunctionType *FFTy = F->getFunctionType();
- for (Argument &AI : H->args()) {
- Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
- ++i;
- }
-
- CallInst *CI = Builder.CreateCall(F, Args);
- ReturnInst *RI = nullptr;
- CI->setTailCall();
- CI->setCallingConv(F->getCallingConv());
- CI->setAttributes(F->getAttributes());
- if (H->getReturnType()->isVoidTy()) {
- RI = Builder.CreateRetVoid();
- } else {
- RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
- }
-
- if (MergeFunctionsPDI) {
- DISubprogram *DIS = G->getSubprogram();
- if (DIS) {
+//===- MergeFunctions.cpp - Merge identical functions ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for equivalent functions that are mergeable and folds them.
+//
+// An order relation is defined on the set of functions. It is established
+// through a special function comparison procedure that returns
+// 0 when the functions are equal,
+// -1 when the left function is less than the right function, and
+// 1 in the opposite case. We need a total ordering, so we need to maintain
+// four properties on the functions set:
+// a <= a (reflexivity)
+// if a <= b and b <= a then a = b (antisymmetry)
+// if a <= b and b <= c then a <= c (transitivity).
+// for all a and b: a <= b or b <= a (totality).
+//
+// The comparison iterates through each instruction in each basic block.
+// Functions are kept in a binary tree. For each new function F we perform
+// a lookup in the binary tree.
+// In practice it works the following way:
+// -- We define a Function* container class with a custom "operator<"
+//    (FunctionPtr).
+// -- "FunctionPtr" instances are stored in a std::set, so every
+//    std::set::insert operation gives a result in log(N) time.
+//
+// As an optimization, a hash of the function structure is calculated first, and
+// two functions are only compared if they have the same hash. This hash is
+// cheap to compute, and has the property that if function F == G according to
+// the comparison function, then hash(F) == hash(G). This consistency property
+// is critical to ensuring all possible merging opportunities are exploited.
+// Collisions in the hash affect the speed of the pass but not the correctness
+// or determinism of the resulting transformation.
+//
+// When a match is found the functions are folded. If both functions are
+// overridable, we move the functionality into a new internal function and
+// leave two overridable thunks to it.
+//
+//===----------------------------------------------------------------------===//
+//
+// Future work:
+//
+// * virtual functions.
+//
+// Many functions have their address taken by the virtual function table for
+// the object they belong to. However, as long as it's only used for a lookup
+// and call, this is irrelevant, and we'd like to fold such functions.
+//
+// * be smarter about bitcasts.
+//
+// In order to fold functions, we will sometimes add either bitcast instructions
+// or bitcast constant expressions. Unfortunately, this can confound further
+// analysis since the two functions differ where one has a bitcast and the
+// other doesn't. We should learn to look through bitcasts.
+//
+// * Compare complex types with pointer types inside.
+// * Compare cross-reference cases.
+// * Compare complex expressions.
+//
+// All three issues above could be described as the ability to prove that
+// fA == fB == fC == fE == fF == fG in the example below:
+//
+// void fA() {
+// fB();
+// }
+// void fB() {
+// fA();
+// }
+//
+// void fE() {
+// fF();
+// }
+// void fF() {
+// fG();
+// }
+// void fG() {
+// fE();
+// }
+//
+// The simplest cross-reference case (fA <--> fB) was implemented in previous
+// versions of MergeFunctions, though it occurred in only two function pairs
+// in the test-suite (which counts >50k functions).
+// The ability to detect complex cross-referencing (e.g.: A->B->C->D->A)
+// could cover many more cases.
+//
+//===----------------------------------------------------------------------===//
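To make the ordering scheme in the comment above concrete: a toy, self-contained sketch in which strings stand in for functions, a three-way comparison returns 0/-1/1, and a std::set built on the derived operator< provides the log(N) insert-and-detect-duplicate behaviour described for FunctionPtr. This is only an analogy; the real pass compares IR, not strings.

#include <iostream>
#include <set>
#include <string>

// Three-way comparison: 0 when equal, -1 when L orders before R, 1 otherwise.
static int compareLike(const std::string &L, const std::string &R) {
  if (L == R)
    return 0;
  return L < R ? -1 : 1;
}

// Strict weak ordering derived from the three-way comparison.
struct CmpAdapter {
  bool operator()(const std::string &L, const std::string &R) const {
    return compareLike(L, R) == -1;
  }
};

int main() {
  std::set<std::string, CmpAdapter> Tree;
  for (const char *F : {"fA", "fB", "fA"}) {
    bool Inserted = Tree.insert(F).second;
    std::cout << F << (Inserted ? ": new entry\n" : ": duplicate, would be merged\n");
  }
  return 0;
}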
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mergefunc"
+
+STATISTIC(NumFunctionsMerged, "Number of functions merged");
+STATISTIC(NumThunksWritten, "Number of thunks generated");
+STATISTIC(NumAliasesWritten, "Number of aliases generated");
+STATISTIC(NumDoubleWeak, "Number of new functions created");
+
+static cl::opt<unsigned> NumFunctionsForSanityCheck(
+ "mergefunc-sanity",
+ cl::desc("How many functions in module could be used for "
+ "MergeFunctions pass sanity check. "
+ "'0' disables this check. Works only with '-debug' key."),
+ cl::init(0), cl::Hidden);
+
+// Under option -mergefunc-preserve-debug-info we:
+// - Do not create a new function for a thunk.
+// - Retain the debug info for a thunk's parameters (and associated
+// instructions for the debug info) from the entry block.
+// Note: -debug will display the algorithm at work.
+// - Create debug-info for the call (to the shared implementation) made by
+// a thunk and its return value.
+// - Erase the rest of the function, retaining the (minimally sized) entry
+// block to create a thunk.
+// - Preserve a thunk's call site to point to the thunk even when both occur
+// within the same translation unit, to aid debuggability. Note that this
+// behaviour differs from the underlying -mergefunc implementation which
+// modifies the thunk's call site to point to the shared implementation
+// when both occur within the same translation unit.
+static cl::opt<bool>
+ MergeFunctionsPDI("mergefunc-preserve-debug-info", cl::Hidden,
+ cl::init(false),
+ cl::desc("Preserve debug info in thunk when mergefunc "
+ "transformations are made."));
+
+static cl::opt<bool>
+ MergeFunctionsAliases("mergefunc-use-aliases", cl::Hidden,
+ cl::init(false),
+ cl::desc("Allow mergefunc to create aliases"));
+
+namespace {
+
+class FunctionNode {
+ mutable AssertingVH<Function> F;
+ FunctionComparator::FunctionHash Hash;
+
+public:
+ // Note the hash is recalculated potentially multiple times, but it is cheap.
+ FunctionNode(Function *F)
+ : F(F), Hash(FunctionComparator::functionHash(*F)) {}
+
+ Function *getFunc() const { return F; }
+ FunctionComparator::FunctionHash getHash() const { return Hash; }
+
+ /// Replace the reference to the function F by the function G, assuming their
+ /// implementations are equal.
+ void replaceBy(Function *G) const {
+ F = G;
+ }
+};
+
+/// MergeFunctions finds functions which will generate identical machine code,
+/// by considering all pointer types to be equivalent. Once identified,
+/// MergeFunctions will fold them by replacing a call to one with a call to a
+/// bitcast of the other.
+class MergeFunctions {
+public:
+ MergeFunctions() : FnTree(FunctionNodeCmp(&GlobalNumbers)) {
+ }
+
+ bool runOnModule(Module &M);
+
+private:
+ // The function comparison operator is provided here so that FunctionNodes do
+ // not need to become larger with another pointer.
+ class FunctionNodeCmp {
+ GlobalNumberState* GlobalNumbers;
+
+ public:
+ FunctionNodeCmp(GlobalNumberState* GN) : GlobalNumbers(GN) {}
+
+ bool operator()(const FunctionNode &LHS, const FunctionNode &RHS) const {
+ // Order first by hashes, then full function comparison.
+ if (LHS.getHash() != RHS.getHash())
+ return LHS.getHash() < RHS.getHash();
+ FunctionComparator FCmp(LHS.getFunc(), RHS.getFunc(), GlobalNumbers);
+ return FCmp.compare() == -1;
+ }
+ };
+ using FnTreeType = std::set<FunctionNode, FunctionNodeCmp>;
+
+ GlobalNumberState GlobalNumbers;
+
+ /// A work queue of functions that may have been modified and should be
+ /// analyzed again.
+ std::vector<WeakTrackingVH> Deferred;
+
+#ifndef NDEBUG
+ /// Checks the rules of the order relation introduced on the function set.
+ /// Returns true if the sanity check passed, and false if it failed.
+ bool doSanityCheck(std::vector<WeakTrackingVH> &Worklist);
+#endif
+
+ /// Insert a ComparableFunction into the FnTree, or merge it away if it's
+ /// equal to one that's already present.
+ bool insert(Function *NewFunction);
+
+ /// Remove a Function from the FnTree and queue it up for a second sweep of
+ /// analysis.
+ void remove(Function *F);
+
+ /// Find the functions that use this Value and remove them from FnTree and
+ /// queue the functions.
+ void removeUsers(Value *V);
+
+ /// Replace all direct calls of Old with calls of New. Will bitcast New if
+ /// necessary to make types match.
+ void replaceDirectCallers(Function *Old, Function *New);
+
+ /// Merge two equivalent functions. Upon completion, G may be deleted, or may
+ /// be converted into a thunk. In either case, it should never be visited
+ /// again.
+ void mergeTwoFunctions(Function *F, Function *G);
+
+ /// Fill PDIUnrelatedWL with instructions from the entry block that are
+ /// unrelated to parameter related debug info.
+ void filterInstsUnrelatedToPDI(BasicBlock *GEntryBlock,
+ std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Erase the rest of the CFG (i.e. barring the entry block).
+ void eraseTail(Function *G);
+
+ /// Erase the instructions in PDIUnrelatedWL as they are unrelated to the
+ /// parameter debug info, from the entry block.
+ void eraseInstsUnrelatedToPDI(std::vector<Instruction *> &PDIUnrelatedWL);
+
+ /// Replace G with a simple tail call to bitcast(F). Also (unless
+ /// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F),
+ /// delete G.
+ void writeThunk(Function *F, Function *G);
+
+ // Replace G with an alias to F (deleting function G)
+ void writeAlias(Function *F, Function *G);
+
+ // Replace G with an alias to F if possible, otherwise with a thunk to F.
+ // Returns false if neither is possible.
+ bool writeThunkOrAlias(Function *F, Function *G);
+
+ /// Replace function F with function G in the function tree.
+ void replaceFunctionInTree(const FunctionNode &FN, Function *G);
+
+ /// The set of all distinct functions. Use the insert() and remove() methods
+ /// to modify it. The map allows efficient lookup and deferring of Functions.
+ FnTreeType FnTree;
+
+ // Map functions to the iterators of the FunctionNode which contains them
+ // in the FnTree. This must be updated carefully whenever the FnTree is
+ // modified, i.e. in insert(), remove(), and replaceFunctionInTree(), to avoid
+ // dangling iterators into FnTree. The invariant that preserves this is that
+ // there is exactly one mapping F -> FN for each FunctionNode FN in FnTree.
+ DenseMap<AssertingVH<Function>, FnTreeType::iterator> FNodesInTree;
+};
+
+class MergeFunctionsLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ MergeFunctionsLegacyPass(): ModulePass(ID) {
+ initializeMergeFunctionsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ MergeFunctions MF;
+ return MF.runOnModule(M);
+ }
+};
+
+} // end anonymous namespace
+
+char MergeFunctionsLegacyPass::ID = 0;
+INITIALIZE_PASS(MergeFunctionsLegacyPass, "mergefunc",
+ "Merge Functions", false, false)
+
+ModulePass *llvm::createMergeFunctionsPass() {
+ return new MergeFunctionsLegacyPass();
+}
+
+PreservedAnalyses MergeFunctionsPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ MergeFunctions MF;
+ if (!MF.runOnModule(M))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+#ifndef NDEBUG
+bool MergeFunctions::doSanityCheck(std::vector<WeakTrackingVH> &Worklist) {
+ if (const unsigned Max = NumFunctionsForSanityCheck) {
+ unsigned TripleNumber = 0;
+ bool Valid = true;
+
+ dbgs() << "MERGEFUNC-SANITY: Started for first " << Max << " functions.\n";
+
+ unsigned i = 0;
+ for (std::vector<WeakTrackingVH>::iterator I = Worklist.begin(),
+ E = Worklist.end();
+ I != E && i < Max; ++I, ++i) {
+ unsigned j = i;
+ for (std::vector<WeakTrackingVH>::iterator J = I; J != E && j < Max;
+ ++J, ++j) {
+ Function *F1 = cast<Function>(*I);
+ Function *F2 = cast<Function>(*J);
+ int Res1 = FunctionComparator(F1, F2, &GlobalNumbers).compare();
+ int Res2 = FunctionComparator(F2, F1, &GlobalNumbers).compare();
+
+ // If F1 <= F2, then F2 >= F1, otherwise report failure.
+ if (Res1 != -Res2) {
+ dbgs() << "MERGEFUNC-SANITY: Non-symmetric; triple: " << TripleNumber
+ << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n';
+ Valid = false;
+ }
+
+ if (Res1 == 0)
+ continue;
+
+ unsigned k = j;
+ for (std::vector<WeakTrackingVH>::iterator K = J; K != E && k < Max;
+ ++k, ++K, ++TripleNumber) {
+ if (K == J)
+ continue;
+
+ Function *F3 = cast<Function>(*K);
+ int Res3 = FunctionComparator(F1, F3, &GlobalNumbers).compare();
+ int Res4 = FunctionComparator(F2, F3, &GlobalNumbers).compare();
+
+ bool Transitive = true;
+
+ if (Res1 != 0 && Res1 == Res4) {
+ // F1 > F2, F2 > F3 => F1 > F3
+ Transitive = Res3 == Res1;
+ } else if (Res3 != 0 && Res3 == -Res4) {
+ // F1 > F3, F3 > F2 => F1 > F2
+ Transitive = Res3 == Res1;
+ } else if (Res4 != 0 && -Res3 == Res4) {
+ // F2 > F3, F3 > F1 => F2 > F1
+ Transitive = Res4 == -Res1;
+ }
+
+ if (!Transitive) {
+ dbgs() << "MERGEFUNC-SANITY: Non-transitive; triple: "
+ << TripleNumber << "\n";
+ dbgs() << "Res1, Res3, Res4: " << Res1 << ", " << Res3 << ", "
+ << Res4 << "\n";
+ dbgs() << *F1 << '\n' << *F2 << '\n' << *F3 << '\n';
+ Valid = false;
+ }
+ }
+ }
+ }
+
+ dbgs() << "MERGEFUNC-SANITY: " << (Valid ? "Passed." : "Failed.") << "\n";
+ return Valid;
+ }
+ return true;
+}
+#endif
+
+/// Check whether \p F is eligible for function merging.
+static bool isEligibleForMerging(Function &F) {
+ return !F.isDeclaration() && !F.hasAvailableExternallyLinkage();
+}
+
+bool MergeFunctions::runOnModule(Module &M) {
+ bool Changed = false;
+
+ // All functions in the module, ordered by hash. Functions with a unique
+ // hash value are easily eliminated.
+ std::vector<std::pair<FunctionComparator::FunctionHash, Function *>>
+ HashedFuncs;
+ for (Function &Func : M) {
+ if (isEligibleForMerging(Func)) {
+ HashedFuncs.push_back({FunctionComparator::functionHash(Func), &Func});
+ }
+ }
+
+ llvm::stable_sort(HashedFuncs, less_first());
+
+ auto S = HashedFuncs.begin();
+ for (auto I = HashedFuncs.begin(), IE = HashedFuncs.end(); I != IE; ++I) {
+ // If the hash value matches the previous value or the next one, we must
+ // consider merging it. Otherwise it is dropped and never considered again.
+ if ((I != S && std::prev(I)->first == I->first) ||
+ (std::next(I) != IE && std::next(I)->first == I->first) ) {
+ Deferred.push_back(WeakTrackingVH(I->second));
+ }
+ }
+
+ do {
+ std::vector<WeakTrackingVH> Worklist;
+ Deferred.swap(Worklist);
+
+ LLVM_DEBUG(doSanityCheck(Worklist));
+
+ LLVM_DEBUG(dbgs() << "size of module: " << M.size() << '\n');
+ LLVM_DEBUG(dbgs() << "size of worklist: " << Worklist.size() << '\n');
+
+ // Insert functions and merge them.
+ for (WeakTrackingVH &I : Worklist) {
+ if (!I)
+ continue;
+ Function *F = cast<Function>(I);
+ if (!F->isDeclaration() && !F->hasAvailableExternallyLinkage()) {
+ Changed |= insert(F);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "size of FnTree: " << FnTree.size() << '\n');
+ } while (!Deferred.empty());
+
+ FnTree.clear();
+ FNodesInTree.clear();
+ GlobalNumbers.clear();
+
+ return Changed;
+}
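A plain-C++ sketch of the hash pre-filter performed by runOnModule above: hash every candidate, stable-sort by hash, and keep only entries whose hash matches a neighbour, since a unique hash can never take part in a merge. std::hash of a hypothetical body string stands in for FunctionComparator::functionHash.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <iostream>
#include <iterator>
#include <string>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<std::size_t, std::string>> Hashed = {
      {std::hash<std::string>{}("ret 0"), "fA"},
      {std::hash<std::string>{}("ret 0"), "fB"},
      {std::hash<std::string>{}("ret 1"), "fC"},
  };
  std::stable_sort(Hashed.begin(), Hashed.end(),
                   [](const auto &L, const auto &R) { return L.first < R.first; });
  for (auto I = Hashed.begin(); I != Hashed.end(); ++I) {
    // Only entries whose hash matches the previous or next one are deferred
    // for the expensive full comparison.
    bool MatchesPrev = I != Hashed.begin() && std::prev(I)->first == I->first;
    bool MatchesNext = std::next(I) != Hashed.end() && std::next(I)->first == I->first;
    std::cout << I->second
              << (MatchesPrev || MatchesNext ? ": kept for full comparison\n"
                                             : ": unique hash, dropped\n");
  }
  return 0;
}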
+
+// Replace direct callers of Old with New.
+void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
+ Constant *BitcastNew = ConstantExpr::getBitCast(New, Old->getType());
+ for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
+ Use *U = &*UI;
+ ++UI;
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (CB && CB->isCallee(U)) {
+ // Do not copy attributes from the called function to the call-site.
+ // Function comparison ensures that the attributes are the same up to
+ // type congruences in byval(), in which case we need to keep the byval
+ // type of the call-site, not the callee function.
+ remove(CB->getFunction());
+ U->set(BitcastNew);
+ }
+ }
+}
+
+// Helper for writeThunk: selects the proper bitcast operation,
+// but is a bit simpler than CastInst::getCastOpcode.
+static Value *createCast(IRBuilder<> &Builder, Value *V, Type *DestTy) {
+ Type *SrcTy = V->getType();
+ if (SrcTy->isStructTy()) {
+ assert(DestTy->isStructTy());
+ assert(SrcTy->getStructNumElements() == DestTy->getStructNumElements());
+ Value *Result = UndefValue::get(DestTy);
+ for (unsigned int I = 0, E = SrcTy->getStructNumElements(); I < E; ++I) {
+ Value *Element = createCast(
+ Builder, Builder.CreateExtractValue(V, makeArrayRef(I)),
+ DestTy->getStructElementType(I));
+
+ Result =
+ Builder.CreateInsertValue(Result, Element, makeArrayRef(I));
+ }
+ return Result;
+ }
+ assert(!DestTy->isStructTy());
+ if (SrcTy->isIntegerTy() && DestTy->isPointerTy())
+ return Builder.CreateIntToPtr(V, DestTy);
+ else if (SrcTy->isPointerTy() && DestTy->isIntegerTy())
+ return Builder.CreatePtrToInt(V, DestTy);
+ else
+ return Builder.CreateBitCast(V, DestTy);
+}
+
+// Erase the instructions in PDIUnrelatedWL from the entry block; they are
+// unrelated to the parameter debug info.
+void MergeFunctions::eraseInstsUnrelatedToPDI(
+ std::vector<Instruction *> &PDIUnrelatedWL) {
+ LLVM_DEBUG(
+ dbgs() << " Erasing instructions (in reverse order of appearance in "
+ "entry block) unrelated to parameter debug info from entry "
+ "block: {\n");
+ while (!PDIUnrelatedWL.empty()) {
+ Instruction *I = PDIUnrelatedWL.back();
+ LLVM_DEBUG(dbgs() << " Deleting Instruction: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ I->eraseFromParent();
+ PDIUnrelatedWL.pop_back();
+ }
+ LLVM_DEBUG(dbgs() << " } // Done erasing instructions unrelated to parameter "
+ "debug info from entry block. \n");
+}
+
+// Reduce G to its entry block.
+void MergeFunctions::eraseTail(Function *G) {
+ std::vector<BasicBlock *> WorklistBB;
+ for (Function::iterator BBI = std::next(G->begin()), BBE = G->end();
+ BBI != BBE; ++BBI) {
+ BBI->dropAllReferences();
+ WorklistBB.push_back(&*BBI);
+ }
+ while (!WorklistBB.empty()) {
+ BasicBlock *BB = WorklistBB.back();
+ BB->eraseFromParent();
+ WorklistBB.pop_back();
+ }
+}
+
+// We are interested in the following instructions from the entry block as being
+// related to parameter debug info:
+// - @llvm.dbg.declare
+// - stores from the incoming parameters to locations on the stack-frame
+// - allocas that create these locations on the stack-frame
+// - @llvm.dbg.value
+// - the entry block's terminator
+// The rest are unrelated to debug info for the parameters; fill up
+// PDIUnrelatedWL with such instructions.
+void MergeFunctions::filterInstsUnrelatedToPDI(
+ BasicBlock *GEntryBlock, std::vector<Instruction *> &PDIUnrelatedWL) {
+ std::set<Instruction *> PDIRelated;
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BIE = GEntryBlock->end();
+ BI != BIE; ++BI) {
+ if (auto *DVI = dyn_cast<DbgValueInst>(&*BI)) {
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DVI->getVariable();
+ if (DILocVar->isParameter()) {
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else if (auto *DDI = dyn_cast<DbgDeclareInst>(&*BI)) {
+ LLVM_DEBUG(dbgs() << " Deciding: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ DILocalVariable *DILocVar = DDI->getVariable();
+ if (DILocVar->isParameter()) {
+ LLVM_DEBUG(dbgs() << " Parameter: ");
+ LLVM_DEBUG(DILocVar->print(dbgs()));
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ if (AI) {
+ LLVM_DEBUG(dbgs() << " Processing alloca users: ");
+ LLVM_DEBUG(dbgs() << "\n");
+ for (User *U : AI->users()) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (Value *Arg = SI->getValueOperand()) {
+ if (dyn_cast<Argument>(Arg)) {
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(AI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(AI);
+ LLVM_DEBUG(dbgs() << " Include (parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(SI);
+ LLVM_DEBUG(dbgs() << " Include: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(SI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(U->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (alloca NULL): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " Delete (!parameter): ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ } else if (BI->isTerminator() && &*BI == GEntryBlock->getTerminator()) {
+ LLVM_DEBUG(dbgs() << " Will Include Terminator: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIRelated.insert(&*BI);
+ } else {
+ LLVM_DEBUG(dbgs() << " Defer: ");
+ LLVM_DEBUG(BI->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ LLVM_DEBUG(
+ dbgs()
+      << " Report parameter debug info related/unrelated instructions: {\n");
+ for (BasicBlock::iterator BI = GEntryBlock->begin(), BE = GEntryBlock->end();
+ BI != BE; ++BI) {
+
+ Instruction *I = &*BI;
+ if (PDIRelated.find(I) == PDIRelated.end()) {
+ LLVM_DEBUG(dbgs() << " !PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ PDIUnrelatedWL.push_back(I);
+ } else {
+ LLVM_DEBUG(dbgs() << " PDIRelated: ");
+ LLVM_DEBUG(I->print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << " }\n");
+}
+
+/// Whether this function may be replaced by a forwarding thunk.
+static bool canCreateThunkFor(Function *F) {
+ if (F->isVarArg())
+ return false;
+
+ // Don't merge tiny functions using a thunk, since it can just end up
+ // making the function larger.
+ if (F->size() == 1) {
+ if (F->front().size() <= 2) {
+ LLVM_DEBUG(dbgs() << "canCreateThunkFor: " << F->getName()
+ << " is too small to bother creating a thunk for\n");
+ return false;
+ }
+ }
+ return true;
+}
+
+// Replace G with a simple tail call to bitcast(F). Also (unless
+// MergeFunctionsPDI holds) replace direct uses of G with bitcast(F) and
+// delete G. Under MergeFunctionsPDI, we use G itself for creating
+// the thunk as we preserve the debug info (and associated instructions)
+// from G's entry block pertaining to G's incoming arguments which are
+// passed on as corresponding arguments in the call that G makes to F.
+// For better debuggability, under MergeFunctionsPDI, we do not modify G's
+// call sites to point to F even when within the same translation unit.
+void MergeFunctions::writeThunk(Function *F, Function *G) {
+ BasicBlock *GEntryBlock = nullptr;
+ std::vector<Instruction *> PDIUnrelatedWL;
+ BasicBlock *BB = nullptr;
+ Function *NewG = nullptr;
+ if (MergeFunctionsPDI) {
+ LLVM_DEBUG(dbgs() << "writeThunk: (MergeFunctionsPDI) Do not create a new "
+ "function as thunk; retain original: "
+ << G->getName() << "()\n");
+ GEntryBlock = &G->getEntryBlock();
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) filter parameter related "
+ "debug info for "
+ << G->getName() << "() {\n");
+ filterInstsUnrelatedToPDI(GEntryBlock, PDIUnrelatedWL);
+ GEntryBlock->getTerminator()->eraseFromParent();
+ BB = GEntryBlock;
+ } else {
+ NewG = Function::Create(G->getFunctionType(), G->getLinkage(),
+ G->getAddressSpace(), "", G->getParent());
+ NewG->setComdat(G->getComdat());
+ BB = BasicBlock::Create(F->getContext(), "", NewG);
+ }
+
+ IRBuilder<> Builder(BB);
+ Function *H = MergeFunctionsPDI ? G : NewG;
+ SmallVector<Value *, 16> Args;
+ unsigned i = 0;
+ FunctionType *FFTy = F->getFunctionType();
+ for (Argument &AI : H->args()) {
+ Args.push_back(createCast(Builder, &AI, FFTy->getParamType(i)));
+ ++i;
+ }
+
+ CallInst *CI = Builder.CreateCall(F, Args);
+ ReturnInst *RI = nullptr;
+ CI->setTailCall();
+ CI->setCallingConv(F->getCallingConv());
+ CI->setAttributes(F->getAttributes());
+ if (H->getReturnType()->isVoidTy()) {
+ RI = Builder.CreateRetVoid();
+ } else {
+ RI = Builder.CreateRet(createCast(Builder, CI, H->getReturnType()));
+ }
+
+ if (MergeFunctionsPDI) {
+ DISubprogram *DIS = G->getSubprogram();
+ if (DIS) {
DebugLoc CIDbgLoc =
DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
DebugLoc RIDbgLoc =
DILocation::get(DIS->getContext(), DIS->getScopeLine(), 0, DIS);
- CI->setDebugLoc(CIDbgLoc);
- RI->setDebugLoc(RIDbgLoc);
- } else {
- LLVM_DEBUG(
- dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
- << G->getName() << "()\n");
- }
- eraseTail(G);
- eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
- LLVM_DEBUG(
- dbgs() << "} // End of parameter related debug info filtering for: "
- << G->getName() << "()\n");
- } else {
- NewG->copyAttributesFrom(G);
- NewG->takeName(G);
- removeUsers(G);
- G->replaceAllUsesWith(NewG);
- G->eraseFromParent();
- }
-
- LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
- ++NumThunksWritten;
-}
-
-// Whether this function may be replaced by an alias
-static bool canCreateAliasFor(Function *F) {
- if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr())
- return false;
-
- // We should only see linkages supported by aliases here
- assert(F->hasLocalLinkage() || F->hasExternalLinkage()
- || F->hasWeakLinkage() || F->hasLinkOnceLinkage());
- return true;
-}
-
-// Replace G with an alias to F (deleting function G)
-void MergeFunctions::writeAlias(Function *F, Function *G) {
- Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
- PointerType *PtrType = G->getType();
- auto *GA = GlobalAlias::create(
- PtrType->getElementType(), PtrType->getAddressSpace(),
- G->getLinkage(), "", BitcastF, G->getParent());
-
- F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment())));
- GA->takeName(G);
- GA->setVisibility(G->getVisibility());
- GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- removeUsers(G);
- G->replaceAllUsesWith(GA);
- G->eraseFromParent();
-
- LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
- ++NumAliasesWritten;
-}
-
-// Replace G with an alias to F if possible, or a thunk to F if
-// profitable. Returns false if neither is the case.
-bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
- if (canCreateAliasFor(G)) {
- writeAlias(F, G);
- return true;
- }
- if (canCreateThunkFor(F)) {
- writeThunk(F, G);
- return true;
- }
- return false;
-}
-
-// Merge two equivalent functions. Upon completion, Function G is deleted.
-void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
- if (F->isInterposable()) {
- assert(G->isInterposable());
-
- // Both writeThunkOrAlias() calls below must succeed, either because we can
- // create aliases for G and NewF, or because a thunk for F is profitable.
- // F here has the same signature as NewF below, so that's what we check.
- if (!canCreateThunkFor(F) &&
- (!canCreateAliasFor(F) || !canCreateAliasFor(G)))
- return;
-
- // Make them both thunks to the same internal function.
- Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
- F->getAddressSpace(), "", F->getParent());
- NewF->copyAttributesFrom(F);
- NewF->takeName(F);
- removeUsers(F);
- F->replaceAllUsesWith(NewF);
-
- MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment()));
-
- writeThunkOrAlias(F, G);
- writeThunkOrAlias(F, NewF);
-
- F->setAlignment(MaxAlignment);
- F->setLinkage(GlobalValue::PrivateLinkage);
- ++NumDoubleWeak;
- ++NumFunctionsMerged;
- } else {
-    // For better debuggability, under MergeFunctionsPDI, we do not modify G's
- // call sites to point to F even when within the same translation unit.
- if (!G->isInterposable() && !MergeFunctionsPDI) {
- if (G->hasGlobalUnnamedAddr()) {
- // G might have been a key in our GlobalNumberState, and it's illegal
- // to replace a key in ValueMap<GlobalValue *> with a non-global.
- GlobalNumbers.erase(G);
- // If G's address is not significant, replace it entirely.
- Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
- removeUsers(G);
- G->replaceAllUsesWith(BitcastF);
- } else {
- // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
- // above).
- replaceDirectCallers(G, F);
- }
- }
-
- // If G was internal then we may have replaced all uses of G with F. If so,
- // stop here and delete G. There's no need for a thunk. (See note on
- // MergeFunctionsPDI above).
- if (G->isDiscardableIfUnused() && G->use_empty() && !MergeFunctionsPDI) {
- G->eraseFromParent();
- ++NumFunctionsMerged;
- return;
- }
-
- if (writeThunkOrAlias(F, G)) {
- ++NumFunctionsMerged;
- }
- }
-}
-
-/// Replace function F by function G.
-void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
- Function *G) {
- Function *F = FN.getFunc();
- assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 &&
- "The two functions must be equal");
-
- auto I = FNodesInTree.find(F);
- assert(I != FNodesInTree.end() && "F should be in FNodesInTree");
- assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G");
-
- FnTreeType::iterator IterToFNInFnTree = I->second;
- assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree.");
- // Remove F -> FN and insert G -> FN
- FNodesInTree.erase(I);
- FNodesInTree.insert({G, IterToFNInFnTree});
- // Replace F with G in FN, which is stored inside the FnTree.
- FN.replaceBy(G);
-}
-
-// Ordering for functions that are equal under FunctionComparator
-static bool isFuncOrderCorrect(const Function *F, const Function *G) {
- if (F->isInterposable() != G->isInterposable()) {
- // Strong before weak, because the weak function may call the strong
- // one, but not the other way around.
- return !F->isInterposable();
- }
- if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
- // External before local, because we definitely have to keep the external
- // function, but may be able to drop the local one.
- return !F->hasLocalLinkage();
- }
- // Impose a total order (by name) on the replacement of functions. This is
- // important when operating on more than one module independently to prevent
- // cycles of thunks calling each other when the modules are linked together.
- return F->getName() <= G->getName();
-}
-
-// Insert a ComparableFunction into the FnTree, or merge it away if equal to one
-// that was already inserted.
-bool MergeFunctions::insert(Function *NewFunction) {
- std::pair<FnTreeType::iterator, bool> Result =
- FnTree.insert(FunctionNode(NewFunction));
-
- if (Result.second) {
- assert(FNodesInTree.count(NewFunction) == 0);
- FNodesInTree.insert({NewFunction, Result.first});
- LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
- << '\n');
- return false;
- }
-
- const FunctionNode &OldF = *Result.first;
-
- if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
- // Swap the two functions.
- Function *F = OldF.getFunc();
- replaceFunctionInTree(*Result.first, NewFunction);
- NewFunction = F;
- assert(OldF.getFunc() != F && "Must have swapped the functions.");
- }
-
- LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
- << " == " << NewFunction->getName() << '\n');
-
- Function *DeleteF = NewFunction;
- mergeTwoFunctions(OldF.getFunc(), DeleteF);
- return true;
-}
-
-// Remove a function from FnTree. If it was already in FnTree, add
-// it to Deferred so that we'll look at it in the next round.
-void MergeFunctions::remove(Function *F) {
- auto I = FNodesInTree.find(F);
- if (I != FNodesInTree.end()) {
- LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
- FnTree.erase(I->second);
- // I->second has been invalidated, remove it from the FNodesInTree map to
- // preserve the invariant.
- FNodesInTree.erase(I);
- Deferred.emplace_back(F);
- }
-}
-
-// For each instruction used by the value, remove() the function that contains
-// the instruction. This should happen right before a call to RAUW.
-void MergeFunctions::removeUsers(Value *V) {
- for (User *U : V->users())
- if (auto *I = dyn_cast<Instruction>(U))
- remove(I->getFunction());
-}
+ CI->setDebugLoc(CIDbgLoc);
+ RI->setDebugLoc(RIDbgLoc);
+ } else {
+ LLVM_DEBUG(
+ dbgs() << "writeThunk: (MergeFunctionsPDI) No DISubprogram for "
+ << G->getName() << "()\n");
+ }
+ eraseTail(G);
+ eraseInstsUnrelatedToPDI(PDIUnrelatedWL);
+ LLVM_DEBUG(
+ dbgs() << "} // End of parameter related debug info filtering for: "
+ << G->getName() << "()\n");
+ } else {
+ NewG->copyAttributesFrom(G);
+ NewG->takeName(G);
+ removeUsers(G);
+ G->replaceAllUsesWith(NewG);
+ G->eraseFromParent();
+ }
+
+ LLVM_DEBUG(dbgs() << "writeThunk: " << H->getName() << '\n');
+ ++NumThunksWritten;
+}
+
+// Whether this function may be replaced by an alias
+static bool canCreateAliasFor(Function *F) {
+ if (!MergeFunctionsAliases || !F->hasGlobalUnnamedAddr())
+ return false;
+
+ // We should only see linkages supported by aliases here
+ assert(F->hasLocalLinkage() || F->hasExternalLinkage()
+ || F->hasWeakLinkage() || F->hasLinkOnceLinkage());
+ return true;
+}
+
+// Replace G with an alias to F (deleting function G)
+void MergeFunctions::writeAlias(Function *F, Function *G) {
+ Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+ PointerType *PtrType = G->getType();
+ auto *GA = GlobalAlias::create(
+ PtrType->getElementType(), PtrType->getAddressSpace(),
+ G->getLinkage(), "", BitcastF, G->getParent());
+
+ F->setAlignment(MaybeAlign(std::max(F->getAlignment(), G->getAlignment())));
+ GA->takeName(G);
+ GA->setVisibility(G->getVisibility());
+ GA->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ removeUsers(G);
+ G->replaceAllUsesWith(GA);
+ G->eraseFromParent();
+
+ LLVM_DEBUG(dbgs() << "writeAlias: " << GA->getName() << '\n');
+ ++NumAliasesWritten;
+}
+
+// Replace G with an alias to F if possible, or a thunk to F if
+// profitable. Returns false if neither is the case.
+bool MergeFunctions::writeThunkOrAlias(Function *F, Function *G) {
+ if (canCreateAliasFor(G)) {
+ writeAlias(F, G);
+ return true;
+ }
+ if (canCreateThunkFor(F)) {
+ writeThunk(F, G);
+ return true;
+ }
+ return false;
+}
+
+// Merge two equivalent functions. Upon completion, Function G is deleted.
+void MergeFunctions::mergeTwoFunctions(Function *F, Function *G) {
+ if (F->isInterposable()) {
+ assert(G->isInterposable());
+
+ // Both writeThunkOrAlias() calls below must succeed, either because we can
+ // create aliases for G and NewF, or because a thunk for F is profitable.
+ // F here has the same signature as NewF below, so that's what we check.
+ if (!canCreateThunkFor(F) &&
+ (!canCreateAliasFor(F) || !canCreateAliasFor(G)))
+ return;
+
+ // Make them both thunks to the same internal function.
+ Function *NewF = Function::Create(F->getFunctionType(), F->getLinkage(),
+ F->getAddressSpace(), "", F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->takeName(F);
+ removeUsers(F);
+ F->replaceAllUsesWith(NewF);
+
+ MaybeAlign MaxAlignment(std::max(G->getAlignment(), NewF->getAlignment()));
+
+ writeThunkOrAlias(F, G);
+ writeThunkOrAlias(F, NewF);
+
+ F->setAlignment(MaxAlignment);
+ F->setLinkage(GlobalValue::PrivateLinkage);
+ ++NumDoubleWeak;
+ ++NumFunctionsMerged;
+ } else {
+    // For better debuggability, under MergeFunctionsPDI, we do not modify G's
+ // call sites to point to F even when within the same translation unit.
+ if (!G->isInterposable() && !MergeFunctionsPDI) {
+ if (G->hasGlobalUnnamedAddr()) {
+ // G might have been a key in our GlobalNumberState, and it's illegal
+ // to replace a key in ValueMap<GlobalValue *> with a non-global.
+ GlobalNumbers.erase(G);
+ // If G's address is not significant, replace it entirely.
+ Constant *BitcastF = ConstantExpr::getBitCast(F, G->getType());
+ removeUsers(G);
+ G->replaceAllUsesWith(BitcastF);
+ } else {
+ // Redirect direct callers of G to F. (See note on MergeFunctionsPDI
+ // above).
+ replaceDirectCallers(G, F);
+ }
+ }
+
+ // If G was internal then we may have replaced all uses of G with F. If so,
+ // stop here and delete G. There's no need for a thunk. (See note on
+ // MergeFunctionsPDI above).
+ if (G->isDiscardableIfUnused() && G->use_empty() && !MergeFunctionsPDI) {
+ G->eraseFromParent();
+ ++NumFunctionsMerged;
+ return;
+ }
+
+ if (writeThunkOrAlias(F, G)) {
+ ++NumFunctionsMerged;
+ }
+ }
+}
+
+/// Replace function F by function G.
+void MergeFunctions::replaceFunctionInTree(const FunctionNode &FN,
+ Function *G) {
+ Function *F = FN.getFunc();
+ assert(FunctionComparator(F, G, &GlobalNumbers).compare() == 0 &&
+ "The two functions must be equal");
+
+ auto I = FNodesInTree.find(F);
+ assert(I != FNodesInTree.end() && "F should be in FNodesInTree");
+ assert(FNodesInTree.count(G) == 0 && "FNodesInTree should not contain G");
+
+ FnTreeType::iterator IterToFNInFnTree = I->second;
+ assert(&(*IterToFNInFnTree) == &FN && "F should map to FN in FNodesInTree.");
+ // Remove F -> FN and insert G -> FN
+ FNodesInTree.erase(I);
+ FNodesInTree.insert({G, IterToFNInFnTree});
+ // Replace F with G in FN, which is stored inside the FnTree.
+ FN.replaceBy(G);
+}
+
+// Ordering for functions that are equal under FunctionComparator
+static bool isFuncOrderCorrect(const Function *F, const Function *G) {
+ if (F->isInterposable() != G->isInterposable()) {
+ // Strong before weak, because the weak function may call the strong
+ // one, but not the other way around.
+ return !F->isInterposable();
+ }
+ if (F->hasLocalLinkage() != G->hasLocalLinkage()) {
+ // External before local, because we definitely have to keep the external
+ // function, but may be able to drop the local one.
+ return !F->hasLocalLinkage();
+ }
+ // Impose a total order (by name) on the replacement of functions. This is
+ // important when operating on more than one module independently to prevent
+ // cycles of thunks calling each other when the modules are linked together.
+ return F->getName() <= G->getName();
+}
+
+// Insert a ComparableFunction into the FnTree, or merge it away if equal to one
+// that was already inserted.
+bool MergeFunctions::insert(Function *NewFunction) {
+ std::pair<FnTreeType::iterator, bool> Result =
+ FnTree.insert(FunctionNode(NewFunction));
+
+ if (Result.second) {
+ assert(FNodesInTree.count(NewFunction) == 0);
+ FNodesInTree.insert({NewFunction, Result.first});
+ LLVM_DEBUG(dbgs() << "Inserting as unique: " << NewFunction->getName()
+ << '\n');
+ return false;
+ }
+
+ const FunctionNode &OldF = *Result.first;
+
+ if (!isFuncOrderCorrect(OldF.getFunc(), NewFunction)) {
+ // Swap the two functions.
+ Function *F = OldF.getFunc();
+ replaceFunctionInTree(*Result.first, NewFunction);
+ NewFunction = F;
+ assert(OldF.getFunc() != F && "Must have swapped the functions.");
+ }
+
+ LLVM_DEBUG(dbgs() << " " << OldF.getFunc()->getName()
+ << " == " << NewFunction->getName() << '\n');
+
+ Function *DeleteF = NewFunction;
+ mergeTwoFunctions(OldF.getFunc(), DeleteF);
+ return true;
+}
+
+// Remove a function from FnTree. If it was already in FnTree, add
+// it to Deferred so that we'll look at it in the next round.
+void MergeFunctions::remove(Function *F) {
+ auto I = FNodesInTree.find(F);
+ if (I != FNodesInTree.end()) {
+ LLVM_DEBUG(dbgs() << "Deferred " << F->getName() << ".\n");
+ FnTree.erase(I->second);
+ // I->second has been invalidated, remove it from the FNodesInTree map to
+ // preserve the invariant.
+ FNodesInTree.erase(I);
+ Deferred.emplace_back(F);
+ }
+}
+
+// For each instruction used by the value, remove() the function that contains
+// the instruction. This should happen right before a call to RAUW.
+void MergeFunctions::removeUsers(Value *V) {
+ for (User *U : V->users())
+ if (auto *I = dyn_cast<Instruction>(U))
+ remove(I->getFunction());
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
index f213859928..a5ba6edb9a 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -1,323 +1,323 @@
-//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// OpenMP specific optimizations:
-//
-// - Deduplication of runtime calls, e.g., omp_get_thread_num.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/OpenMPOpt.h"
-
-#include "llvm/ADT/EnumeratedArray.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations:
+//
+// - Deduplication of runtime calls, e.g., omp_get_thread_num.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/OpenMPOpt.h"
+
+#include "llvm/ADT/EnumeratedArray.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Attributor.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
-
-using namespace llvm;
-using namespace omp;
-
-#define DEBUG_TYPE "openmp-opt"
-
-static cl::opt<bool> DisableOpenMPOptimizations(
- "openmp-opt-disable", cl::ZeroOrMore,
- cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
- cl::init(false));
-
+
+using namespace llvm;
+using namespace omp;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> DisableOpenMPOptimizations(
+ "openmp-opt-disable", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
+ cl::init(false));
+
static cl::opt<bool> EnableParallelRegionMerging(
"openmp-opt-enable-merging", cl::ZeroOrMore,
cl::desc("Enable the OpenMP region merging optimization."), cl::Hidden,
cl::init(false));
-static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
- cl::Hidden);
-static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
- cl::init(false), cl::Hidden);
-
+static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
+ cl::Hidden);
+static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
+ cl::init(false), cl::Hidden);
+
static cl::opt<bool> HideMemoryTransferLatency(
"openmp-hide-memory-transfer-latency",
cl::desc("[WIP] Tries to hide the latency of host to device memory"
" transfers"),
cl::Hidden, cl::init(false));
-STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
- "Number of OpenMP runtime calls deduplicated");
-STATISTIC(NumOpenMPParallelRegionsDeleted,
- "Number of OpenMP parallel regions deleted");
-STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
- "Number of OpenMP runtime functions identified");
-STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
- "Number of OpenMP runtime function uses identified");
-STATISTIC(NumOpenMPTargetRegionKernels,
- "Number of OpenMP target region entry points (=kernels) identified");
-STATISTIC(
- NumOpenMPParallelRegionsReplacedInGPUStateMachine,
- "Number of OpenMP parallel regions replaced with ID in GPU state machines");
+STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
+ "Number of OpenMP runtime calls deduplicated");
+STATISTIC(NumOpenMPParallelRegionsDeleted,
+ "Number of OpenMP parallel regions deleted");
+STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
+ "Number of OpenMP runtime functions identified");
+STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
+ "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPTargetRegionKernels,
+ "Number of OpenMP target region entry points (=kernels) identified");
+STATISTIC(
+ NumOpenMPParallelRegionsReplacedInGPUStateMachine,
+ "Number of OpenMP parallel regions replaced with ID in GPU state machines");
STATISTIC(NumOpenMPParallelRegionsMerged,
"Number of OpenMP parallel regions merged");
-
-#if !defined(NDEBUG)
-static constexpr auto TAG = "[" DEBUG_TYPE "]";
-#endif
-
-namespace {
-
-struct AAICVTracker;
-
-/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
-/// Attributor runs.
-struct OMPInformationCache : public InformationCache {
- OMPInformationCache(Module &M, AnalysisGetter &AG,
- BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
- SmallPtrSetImpl<Kernel> &Kernels)
- : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
- Kernels(Kernels) {
-
- OMPBuilder.initialize();
- initializeRuntimeFunctions();
- initializeInternalControlVars();
- }
-
- /// Generic information that describes an internal control variable.
- struct InternalControlVarInfo {
- /// The kind, as described by InternalControlVar enum.
- InternalControlVar Kind;
-
- /// The name of the ICV.
- StringRef Name;
-
- /// Environment variable associated with this ICV.
- StringRef EnvVarName;
-
- /// Initial value kind.
- ICVInitValue InitKind;
-
- /// Initial value.
- ConstantInt *InitValue;
-
- /// Setter RTL function associated with this ICV.
- RuntimeFunction Setter;
-
- /// Getter RTL function associated with this ICV.
- RuntimeFunction Getter;
-
- /// RTL Function corresponding to the override clause of this ICV
- RuntimeFunction Clause;
- };
-
- /// Generic information that describes a runtime function
- struct RuntimeFunctionInfo {
-
- /// The kind, as described by the RuntimeFunction enum.
- RuntimeFunction Kind;
-
- /// The name of the function.
- StringRef Name;
-
- /// Flag to indicate a variadic function.
- bool IsVarArg;
-
- /// The return type of the function.
- Type *ReturnType;
-
- /// The argument types of the function.
- SmallVector<Type *, 8> ArgumentTypes;
-
- /// The declaration if available.
- Function *Declaration = nullptr;
-
- /// Uses of this runtime function per function containing the use.
- using UseVector = SmallVector<Use *, 16>;
-
- /// Clear UsesMap for runtime function.
- void clearUsesMap() { UsesMap.clear(); }
-
- /// Boolean conversion that is true if the runtime function was found.
- operator bool() const { return Declaration; }
-
- /// Return the vector of uses in function \p F.
- UseVector &getOrCreateUseVector(Function *F) {
- std::shared_ptr<UseVector> &UV = UsesMap[F];
- if (!UV)
- UV = std::make_shared<UseVector>();
- return *UV;
- }
-
- /// Return the vector of uses in function \p F or `nullptr` if there are
- /// none.
- const UseVector *getUseVector(Function &F) const {
- auto I = UsesMap.find(&F);
- if (I != UsesMap.end())
- return I->second.get();
- return nullptr;
- }
-
- /// Return how many functions contain uses of this runtime function.
- size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
-
- /// Return the number of arguments (or the minimal number for variadic
- /// functions).
- size_t getNumArgs() const { return ArgumentTypes.size(); }
-
- /// Run the callback \p CB on each use and forget the use if the result is
- /// true. The callback will be fed the function in which the use was
-    /// encountered as its second argument.
- void foreachUse(SmallVectorImpl<Function *> &SCC,
- function_ref<bool(Use &, Function &)> CB) {
- for (Function *F : SCC)
- foreachUse(CB, F);
- }
-
- /// Run the callback \p CB on each use within the function \p F and forget
- /// the use if the result is true.
- void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
- SmallVector<unsigned, 8> ToBeDeleted;
- ToBeDeleted.clear();
-
- unsigned Idx = 0;
- UseVector &UV = getOrCreateUseVector(F);
-
- for (Use *U : UV) {
- if (CB(*U, *F))
- ToBeDeleted.push_back(Idx);
- ++Idx;
- }
-
- // Remove the to-be-deleted indices in reverse order as prior
- // modifications will not modify the smaller indices.
- while (!ToBeDeleted.empty()) {
- unsigned Idx = ToBeDeleted.pop_back_val();
- UV[Idx] = UV.back();
- UV.pop_back();
- }
- }
-
- private:
- /// Map from functions to all uses of this runtime function contained in
- /// them.
- DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
- };
-
- /// An OpenMP-IR-Builder instance
- OpenMPIRBuilder OMPBuilder;
-
- /// Map from runtime function kind to the runtime function description.
- EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
- RuntimeFunction::OMPRTL___last>
- RFIs;
-
- /// Map from ICV kind to the ICV description.
- EnumeratedArray<InternalControlVarInfo, InternalControlVar,
- InternalControlVar::ICV___last>
- ICVs;
-
- /// Helper to initialize all internal control variable information for those
- /// defined in OMPKinds.def.
- void initializeInternalControlVars() {
-#define ICV_RT_SET(_Name, RTL) \
- { \
- auto &ICV = ICVs[_Name]; \
- ICV.Setter = RTL; \
- }
-#define ICV_RT_GET(Name, RTL) \
- { \
- auto &ICV = ICVs[Name]; \
- ICV.Getter = RTL; \
- }
-#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
- { \
- auto &ICV = ICVs[Enum]; \
- ICV.Name = _Name; \
- ICV.Kind = Enum; \
- ICV.InitKind = Init; \
- ICV.EnvVarName = _EnvVarName; \
- switch (ICV.InitKind) { \
- case ICV_IMPLEMENTATION_DEFINED: \
- ICV.InitValue = nullptr; \
- break; \
- case ICV_ZERO: \
- ICV.InitValue = ConstantInt::get( \
- Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
- break; \
- case ICV_FALSE: \
- ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
- break; \
- case ICV_LAST: \
- break; \
- } \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- }
-
- /// Returns true if the function declaration \p F matches the runtime
- /// function types, that is, return type \p RTFRetType, and argument types
- /// \p RTFArgTypes.
- static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
- SmallVector<Type *, 8> &RTFArgTypes) {
- // TODO: We should output information to the user (under debug output
- // and via remarks).
-
- if (!F)
- return false;
- if (F->getReturnType() != RTFRetType)
- return false;
- if (F->arg_size() != RTFArgTypes.size())
- return false;
-
- auto RTFTyIt = RTFArgTypes.begin();
- for (Argument &Arg : F->args()) {
- if (Arg.getType() != *RTFTyIt)
- return false;
-
- ++RTFTyIt;
- }
-
- return true;
- }
-
- // Helper to collect all uses of the declaration in the UsesMap.
- unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
- unsigned NumUses = 0;
- if (!RFI.Declaration)
- return NumUses;
- OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
-
- if (CollectStats) {
- NumOpenMPRuntimeFunctionsIdentified += 1;
- NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
- }
-
- // TODO: We directly convert uses into proper calls and unknown uses.
- for (Use &U : RFI.Declaration->uses()) {
- if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
- if (ModuleSlice.count(UserI->getFunction())) {
- RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
- ++NumUses;
- }
- } else {
- RFI.getOrCreateUseVector(nullptr).push_back(&U);
- ++NumUses;
- }
- }
- return NumUses;
- }
-
+
+#if !defined(NDEBUG)
+static constexpr auto TAG = "[" DEBUG_TYPE "]";
+#endif
+
+namespace {
+
+struct AAICVTracker;
+
+/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
+/// Attributor runs.
+struct OMPInformationCache : public InformationCache {
+ OMPInformationCache(Module &M, AnalysisGetter &AG,
+ BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
+ SmallPtrSetImpl<Kernel> &Kernels)
+ : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
+ Kernels(Kernels) {
+
+ OMPBuilder.initialize();
+ initializeRuntimeFunctions();
+ initializeInternalControlVars();
+ }
+
+ /// Generic information that describes an internal control variable.
+ struct InternalControlVarInfo {
+ /// The kind, as described by InternalControlVar enum.
+ InternalControlVar Kind;
+
+ /// The name of the ICV.
+ StringRef Name;
+
+ /// Environment variable associated with this ICV.
+ StringRef EnvVarName;
+
+ /// Initial value kind.
+ ICVInitValue InitKind;
+
+ /// Initial value.
+ ConstantInt *InitValue;
+
+ /// Setter RTL function associated with this ICV.
+ RuntimeFunction Setter;
+
+ /// Getter RTL function associated with this ICV.
+ RuntimeFunction Getter;
+
+ /// RTL Function corresponding to the override clause of this ICV
+ RuntimeFunction Clause;
+ };
+
+ /// Generic information that describes a runtime function
+ struct RuntimeFunctionInfo {
+
+ /// The kind, as described by the RuntimeFunction enum.
+ RuntimeFunction Kind;
+
+ /// The name of the function.
+ StringRef Name;
+
+ /// Flag to indicate a variadic function.
+ bool IsVarArg;
+
+ /// The return type of the function.
+ Type *ReturnType;
+
+ /// The argument types of the function.
+ SmallVector<Type *, 8> ArgumentTypes;
+
+ /// The declaration if available.
+ Function *Declaration = nullptr;
+
+ /// Uses of this runtime function per function containing the use.
+ using UseVector = SmallVector<Use *, 16>;
+
+ /// Clear UsesMap for runtime function.
+ void clearUsesMap() { UsesMap.clear(); }
+
+ /// Boolean conversion that is true if the runtime function was found.
+ operator bool() const { return Declaration; }
+
+ /// Return the vector of uses in function \p F.
+ UseVector &getOrCreateUseVector(Function *F) {
+ std::shared_ptr<UseVector> &UV = UsesMap[F];
+ if (!UV)
+ UV = std::make_shared<UseVector>();
+ return *UV;
+ }
+
+ /// Return the vector of uses in function \p F or `nullptr` if there are
+ /// none.
+ const UseVector *getUseVector(Function &F) const {
+ auto I = UsesMap.find(&F);
+ if (I != UsesMap.end())
+ return I->second.get();
+ return nullptr;
+ }
+
+ /// Return how many functions contain uses of this runtime function.
+ size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
+
+ /// Return the number of arguments (or the minimal number for variadic
+ /// functions).
+ size_t getNumArgs() const { return ArgumentTypes.size(); }
+
+ /// Run the callback \p CB on each use and forget the use if the result is
+ /// true. The callback will be fed the function in which the use was
+    /// encountered as its second argument.
+ void foreachUse(SmallVectorImpl<Function *> &SCC,
+ function_ref<bool(Use &, Function &)> CB) {
+ for (Function *F : SCC)
+ foreachUse(CB, F);
+ }
+
+ /// Run the callback \p CB on each use within the function \p F and forget
+ /// the use if the result is true.
+ void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
+ SmallVector<unsigned, 8> ToBeDeleted;
+ ToBeDeleted.clear();
+
+ unsigned Idx = 0;
+ UseVector &UV = getOrCreateUseVector(F);
+
+ for (Use *U : UV) {
+ if (CB(*U, *F))
+ ToBeDeleted.push_back(Idx);
+ ++Idx;
+ }
+
+ // Remove the to-be-deleted indices in reverse order as prior
+ // modifications will not modify the smaller indices.
+ while (!ToBeDeleted.empty()) {
+ unsigned Idx = ToBeDeleted.pop_back_val();
+ UV[Idx] = UV.back();
+ UV.pop_back();
+ }
+ }
+
+ private:
+ /// Map from functions to all uses of this runtime function contained in
+ /// them.
+ DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
+ };
+
+ /// An OpenMP-IR-Builder instance
+ OpenMPIRBuilder OMPBuilder;
+
+ /// Map from runtime function kind to the runtime function description.
+ EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
+ RuntimeFunction::OMPRTL___last>
+ RFIs;
+
+ /// Map from ICV kind to the ICV description.
+ EnumeratedArray<InternalControlVarInfo, InternalControlVar,
+ InternalControlVar::ICV___last>
+ ICVs;
+
+ /// Helper to initialize all internal control variable information for those
+ /// defined in OMPKinds.def.
+ void initializeInternalControlVars() {
+#define ICV_RT_SET(_Name, RTL) \
+ { \
+ auto &ICV = ICVs[_Name]; \
+ ICV.Setter = RTL; \
+ }
+#define ICV_RT_GET(Name, RTL) \
+ { \
+ auto &ICV = ICVs[Name]; \
+ ICV.Getter = RTL; \
+ }
+#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
+ { \
+ auto &ICV = ICVs[Enum]; \
+ ICV.Name = _Name; \
+ ICV.Kind = Enum; \
+ ICV.InitKind = Init; \
+ ICV.EnvVarName = _EnvVarName; \
+ switch (ICV.InitKind) { \
+ case ICV_IMPLEMENTATION_DEFINED: \
+ ICV.InitValue = nullptr; \
+ break; \
+ case ICV_ZERO: \
+ ICV.InitValue = ConstantInt::get( \
+ Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
+ break; \
+ case ICV_FALSE: \
+ ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
+ break; \
+ case ICV_LAST: \
+ break; \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+
+ /// Returns true if the function declaration \p F matches the runtime
+ /// function types, that is, return type \p RTFRetType, and argument types
+ /// \p RTFArgTypes.
+ static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
+ SmallVector<Type *, 8> &RTFArgTypes) {
+ // TODO: We should output information to the user (under debug output
+ // and via remarks).
+
+ if (!F)
+ return false;
+ if (F->getReturnType() != RTFRetType)
+ return false;
+ if (F->arg_size() != RTFArgTypes.size())
+ return false;
+
+ auto RTFTyIt = RTFArgTypes.begin();
+ for (Argument &Arg : F->args()) {
+ if (Arg.getType() != *RTFTyIt)
+ return false;
+
+ ++RTFTyIt;
+ }
+
+ return true;
+ }
+
+ // Helper to collect all uses of the declaration in the UsesMap.
+ unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
+ unsigned NumUses = 0;
+ if (!RFI.Declaration)
+ return NumUses;
+ OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
+
+ if (CollectStats) {
+ NumOpenMPRuntimeFunctionsIdentified += 1;
+ NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
+ }
+
+ // TODO: We directly convert uses into proper calls and unknown uses.
+ for (Use &U : RFI.Declaration->uses()) {
+ if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
+ if (ModuleSlice.count(UserI->getFunction())) {
+ RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
+ ++NumUses;
+ }
+ } else {
+ RFI.getOrCreateUseVector(nullptr).push_back(&U);
+ ++NumUses;
+ }
+ }
+ return NumUses;
+ }
+
// Helper function to recollect uses of a runtime function.
void recollectUsesForFunction(RuntimeFunction RTF) {
auto &RFI = RFIs[RTF];
@@ -325,73 +325,73 @@ struct OMPInformationCache : public InformationCache {
collectUses(RFI, /*CollectStats*/ false);
}
- // Helper function to recollect uses of all runtime functions.
- void recollectUses() {
+ // Helper function to recollect uses of all runtime functions.
+ void recollectUses() {
for (int Idx = 0; Idx < RFIs.size(); ++Idx)
recollectUsesForFunction(static_cast<RuntimeFunction>(Idx));
- }
-
- /// Helper to initialize all runtime function information for those defined
- /// in OpenMPKinds.def.
- void initializeRuntimeFunctions() {
- Module &M = *((*ModuleSlice.begin())->getParent());
-
- // Helper macros for handling __VA_ARGS__ in OMP_RTL
-#define OMP_TYPE(VarName, ...) \
- Type *VarName = OMPBuilder.VarName; \
- (void)VarName;
-
-#define OMP_ARRAY_TYPE(VarName, ...) \
- ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
- (void)VarName##Ty; \
- PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
- (void)VarName##PtrTy;
-
-#define OMP_FUNCTION_TYPE(VarName, ...) \
- FunctionType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
-
-#define OMP_STRUCT_TYPE(VarName, ...) \
- StructType *VarName = OMPBuilder.VarName; \
- (void)VarName; \
- PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
- (void)VarName##Ptr;
-
-#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
- { \
- SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
- Function *F = M.getFunction(_Name); \
- if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
- auto &RFI = RFIs[_Enum]; \
- RFI.Kind = _Enum; \
- RFI.Name = _Name; \
- RFI.IsVarArg = _IsVarArg; \
- RFI.ReturnType = OMPBuilder._ReturnType; \
- RFI.ArgumentTypes = std::move(ArgsTypes); \
- RFI.Declaration = F; \
- unsigned NumUses = collectUses(RFI); \
- (void)NumUses; \
- LLVM_DEBUG({ \
- dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
- << " found\n"; \
- if (RFI.Declaration) \
- dbgs() << TAG << "-> got " << NumUses << " uses in " \
- << RFI.getNumFunctionsWithUses() \
- << " different functions.\n"; \
- }); \
- } \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
-
- // TODO: We should attach the attributes defined in OMPKinds.def.
- }
-
- /// Collection of known kernels (\see Kernel) in the module.
- SmallPtrSetImpl<Kernel> &Kernels;
-};
-
+ }
+
+ /// Helper to initialize all runtime function information for those defined
+ /// in OpenMPKinds.def.
+ void initializeRuntimeFunctions() {
+ Module &M = *((*ModuleSlice.begin())->getParent());
+
+ // Helper macros for handling __VA_ARGS__ in OMP_RTL
+#define OMP_TYPE(VarName, ...) \
+ Type *VarName = OMPBuilder.VarName; \
+ (void)VarName;
+
+#define OMP_ARRAY_TYPE(VarName, ...) \
+ ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
+ (void)VarName##Ty; \
+ PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
+ (void)VarName##PtrTy;
+
+#define OMP_FUNCTION_TYPE(VarName, ...) \
+ FunctionType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_STRUCT_TYPE(VarName, ...) \
+ StructType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
+ { \
+ SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
+ Function *F = M.getFunction(_Name); \
+ if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
+ auto &RFI = RFIs[_Enum]; \
+ RFI.Kind = _Enum; \
+ RFI.Name = _Name; \
+ RFI.IsVarArg = _IsVarArg; \
+ RFI.ReturnType = OMPBuilder._ReturnType; \
+ RFI.ArgumentTypes = std::move(ArgsTypes); \
+ RFI.Declaration = F; \
+ unsigned NumUses = collectUses(RFI); \
+ (void)NumUses; \
+ LLVM_DEBUG({ \
+ dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
+ << " found\n"; \
+ if (RFI.Declaration) \
+ dbgs() << TAG << "-> got " << NumUses << " uses in " \
+ << RFI.getNumFunctionsWithUses() \
+ << " different functions.\n"; \
+ }); \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+
+ // TODO: We should attach the attributes defined in OMPKinds.def.
+ }
+
+ /// Collection of known kernels (\see Kernel) in the module.
+ SmallPtrSetImpl<Kernel> &Kernels;
+};
+
/// Used to map the values physically (in the IR) stored in an offload
/// array, to a vector in memory.
struct OffloadArray {
@@ -477,122 +477,122 @@ private:
}
};
-struct OpenMPOpt {
-
- using OptimizationRemarkGetter =
- function_ref<OptimizationRemarkEmitter &(Function *)>;
-
- OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
- OptimizationRemarkGetter OREGetter,
- OMPInformationCache &OMPInfoCache, Attributor &A)
- : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
- OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
-
+struct OpenMPOpt {
+
+ using OptimizationRemarkGetter =
+ function_ref<OptimizationRemarkEmitter &(Function *)>;
+
+ OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
+ OptimizationRemarkGetter OREGetter,
+ OMPInformationCache &OMPInfoCache, Attributor &A)
+ : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
+ OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
+
/// Check if any remarks are enabled for openmp-opt
bool remarksEnabled() {
auto &Ctx = M.getContext();
return Ctx.getDiagHandlerPtr()->isAnyRemarkEnabled(DEBUG_TYPE);
}
- /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
- bool run() {
- if (SCC.empty())
- return false;
-
- bool Changed = false;
-
- LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
- << " functions in a slice with "
- << OMPInfoCache.ModuleSlice.size() << " functions\n");
-
- if (PrintICVValues)
- printICVs();
- if (PrintOpenMPKernels)
- printKernels();
-
- Changed |= rewriteDeviceCodeStateMachine();
-
- Changed |= runAttributor();
-
- // Recollect uses, in case Attributor deleted any.
- OMPInfoCache.recollectUses();
-
+ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
+ bool run() {
+ if (SCC.empty())
+ return false;
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
+ << " functions in a slice with "
+ << OMPInfoCache.ModuleSlice.size() << " functions\n");
+
+ if (PrintICVValues)
+ printICVs();
+ if (PrintOpenMPKernels)
+ printKernels();
+
+ Changed |= rewriteDeviceCodeStateMachine();
+
+ Changed |= runAttributor();
+
+ // Recollect uses, in case Attributor deleted any.
+ OMPInfoCache.recollectUses();
+
Changed |= deleteParallelRegions();
if (HideMemoryTransferLatency)
Changed |= hideMemTransfersLatency();
if (remarksEnabled())
analysisGlobalization();
- Changed |= deduplicateRuntimeCalls();
+ Changed |= deduplicateRuntimeCalls();
if (EnableParallelRegionMerging) {
if (mergeParallelRegions()) {
deduplicateRuntimeCalls();
Changed = true;
}
}
-
- return Changed;
- }
-
- /// Print initial ICV values for testing.
- /// FIXME: This should be done from the Attributor once it is added.
- void printICVs() const {
+
+ return Changed;
+ }
+
+ /// Print initial ICV values for testing.
+ /// FIXME: This should be done from the Attributor once it is added.
+ void printICVs() const {
InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel,
ICV_proc_bind};
-
- for (Function *F : OMPInfoCache.ModuleSlice) {
- for (auto ICV : ICVs) {
- auto ICVInfo = OMPInfoCache.ICVs[ICV];
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
- << " Value: "
- << (ICVInfo.InitValue
- ? ICVInfo.InitValue->getValue().toString(10, true)
- : "IMPLEMENTATION_DEFINED");
- };
-
- emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
- }
- }
- }
-
- /// Print OpenMP GPU kernels for testing.
- void printKernels() const {
- for (Function *F : SCC) {
- if (!OMPInfoCache.Kernels.count(F))
- continue;
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP GPU kernel "
- << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
- };
-
- emitRemarkOnFunction(F, "OpenMPGPU", Remark);
- }
- }
-
- /// Return the call if \p U is a callee use in a regular call. If \p RFI is
-  /// given, it has to be the callee or a nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(U.getUser());
- if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
- (!RFI || CI->getCalledFunction() == RFI->Declaration))
- return CI;
- return nullptr;
- }
-
-  /// Return the call if \p V is a regular call. If \p RFI is given, it has to
-  /// be the callee or a nullptr is returned.
- static CallInst *getCallIfRegularCall(
- Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
- CallInst *CI = dyn_cast<CallInst>(&V);
- if (CI && !CI->hasOperandBundles() &&
- (!RFI || CI->getCalledFunction() == RFI->Declaration))
- return CI;
- return nullptr;
- }
-
-private:
+
+ for (Function *F : OMPInfoCache.ModuleSlice) {
+ for (auto ICV : ICVs) {
+ auto ICVInfo = OMPInfoCache.ICVs[ICV];
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
+ << " Value: "
+ << (ICVInfo.InitValue
+ ? ICVInfo.InitValue->getValue().toString(10, true)
+ : "IMPLEMENTATION_DEFINED");
+ };
+
+ emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
+ }
+ }
+ }
+
+ /// Print OpenMP GPU kernels for testing.
+ void printKernels() const {
+ for (Function *F : SCC) {
+ if (!OMPInfoCache.Kernels.count(F))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP GPU kernel "
+ << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
+ };
+
+ emitRemarkOnFunction(F, "OpenMPGPU", Remark);
+ }
+ }
+
+ /// Return the call if \p U is a callee use in a regular call. If \p RFI is
+  /// given, it has to be the callee or a nullptr is returned.
+ static CallInst *getCallIfRegularCall(
+ Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(U.getUser());
+ if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+  /// Return the call if \p V is a regular call. If \p RFI is given, it has to
+  /// be the callee or a nullptr is returned.
+ static CallInst *getCallIfRegularCall(
+ Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(&V);
+ if (CI && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+private:
/// Merge parallel regions when it is safe.
bool mergeParallelRegions() {
const unsigned CallbackCalleeOperand = 2;
@@ -981,101 +981,101 @@ private:
return Changed;
}
- /// Try to delete parallel regions if possible.
- bool deleteParallelRegions() {
- const unsigned CallbackCalleeOperand = 2;
-
- OMPInformationCache::RuntimeFunctionInfo &RFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
-
- if (!RFI.Declaration)
- return false;
-
- bool Changed = false;
- auto DeleteCallCB = [&](Use &U, Function &) {
- CallInst *CI = getCallIfRegularCall(U);
- if (!CI)
- return false;
- auto *Fn = dyn_cast<Function>(
- CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
- if (!Fn)
- return false;
- if (!Fn->onlyReadsMemory())
- return false;
- if (!Fn->hasFnAttribute(Attribute::WillReturn))
- return false;
-
- LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
- << CI->getCaller()->getName() << "\n");
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region in "
- << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
- << " deleted";
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
- Remark);
-
- CGUpdater.removeCallSite(*CI);
- CI->eraseFromParent();
- Changed = true;
- ++NumOpenMPParallelRegionsDeleted;
- return true;
- };
-
- RFI.foreachUse(SCC, DeleteCallCB);
-
- return Changed;
- }
-
- /// Try to eliminate runtime calls by reusing existing ones.
- bool deduplicateRuntimeCalls() {
- bool Changed = false;
-
- RuntimeFunction DeduplicableRuntimeCallIDs[] = {
- OMPRTL_omp_get_num_threads,
- OMPRTL_omp_in_parallel,
- OMPRTL_omp_get_cancellation,
- OMPRTL_omp_get_thread_limit,
- OMPRTL_omp_get_supported_active_levels,
- OMPRTL_omp_get_level,
- OMPRTL_omp_get_ancestor_thread_num,
- OMPRTL_omp_get_team_size,
- OMPRTL_omp_get_active_level,
- OMPRTL_omp_in_final,
- OMPRTL_omp_get_proc_bind,
- OMPRTL_omp_get_num_places,
- OMPRTL_omp_get_num_procs,
- OMPRTL_omp_get_place_num,
- OMPRTL_omp_get_partition_num_places,
- OMPRTL_omp_get_partition_place_nums};
-
- // Global-tid is handled separately.
- SmallSetVector<Value *, 16> GTIdArgs;
- collectGlobalThreadIdArguments(GTIdArgs);
- LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
- << " global thread ID arguments\n");
-
- for (Function *F : SCC) {
- for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
+ /// Try to delete parallel regions if possible.
+ bool deleteParallelRegions() {
+ const unsigned CallbackCalleeOperand = 2;
+
+ OMPInformationCache::RuntimeFunctionInfo &RFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+ if (!RFI.Declaration)
+ return false;
+
+ bool Changed = false;
+ auto DeleteCallCB = [&](Use &U, Function &) {
+ CallInst *CI = getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+ auto *Fn = dyn_cast<Function>(
+ CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
+ if (!Fn)
+ return false;
+ if (!Fn->onlyReadsMemory())
+ return false;
+ if (!Fn->hasFnAttribute(Attribute::WillReturn))
+ return false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
+ << CI->getCaller()->getName() << "\n");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region in "
+ << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
+ << " deleted";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
+ Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->eraseFromParent();
+ Changed = true;
+ ++NumOpenMPParallelRegionsDeleted;
+ return true;
+ };
+
+ RFI.foreachUse(SCC, DeleteCallCB);
+
+ return Changed;
+ }
+
+ /// Try to eliminate runtime calls by reusing existing ones.
+ bool deduplicateRuntimeCalls() {
+ bool Changed = false;
+
+ RuntimeFunction DeduplicableRuntimeCallIDs[] = {
+ OMPRTL_omp_get_num_threads,
+ OMPRTL_omp_in_parallel,
+ OMPRTL_omp_get_cancellation,
+ OMPRTL_omp_get_thread_limit,
+ OMPRTL_omp_get_supported_active_levels,
+ OMPRTL_omp_get_level,
+ OMPRTL_omp_get_ancestor_thread_num,
+ OMPRTL_omp_get_team_size,
+ OMPRTL_omp_get_active_level,
+ OMPRTL_omp_in_final,
+ OMPRTL_omp_get_proc_bind,
+ OMPRTL_omp_get_num_places,
+ OMPRTL_omp_get_num_procs,
+ OMPRTL_omp_get_place_num,
+ OMPRTL_omp_get_partition_num_places,
+ OMPRTL_omp_get_partition_place_nums};
+
+ // Global-tid is handled separately.
+ SmallSetVector<Value *, 16> GTIdArgs;
+ collectGlobalThreadIdArguments(GTIdArgs);
+ LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
+ << " global thread ID arguments\n");
+
+ for (Function *F : SCC) {
+ for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
Changed |= deduplicateRuntimeCalls(
*F, OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
-
- // __kmpc_global_thread_num is special as we can replace it with an
- // argument in enough cases to make it worth trying.
- Value *GTIdArg = nullptr;
- for (Argument &Arg : F->args())
- if (GTIdArgs.count(&Arg)) {
- GTIdArg = &Arg;
- break;
- }
- Changed |= deduplicateRuntimeCalls(
- *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
- }
-
- return Changed;
- }
-
+
+ // __kmpc_global_thread_num is special as we can replace it with an
+ // argument in enough cases to make it worth trying.
+ Value *GTIdArg = nullptr;
+ for (Argument &Arg : F->args())
+ if (GTIdArgs.count(&Arg)) {
+ GTIdArg = &Arg;
+ break;
+ }
+ Changed |= deduplicateRuntimeCalls(
+ *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
+ }
+
+ return Changed;
+ }
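
The runtime functions listed above are queries whose result cannot change within a single function invocation, which is what makes collapsing repeated calls into one legal. A small self-contained model of that idea, caching the first result of an idempotent query instead of re-entering the runtime (plain C++, not the pass's IR rewriting):

#include <functional>
#include <iostream>
#include <optional>

// Minimal model: within one function, a query such as omp_get_num_threads()
// is assumed to return the same value every time, so the first call's result
// can serve every later call site. Names are illustrative only.
class DedupedQuery {
  std::function<int()> Runtime;    // the real runtime entry point
  std::optional<int> CachedResult; // stands in for the one surviving call

public:
  explicit DedupedQuery(std::function<int()> RT) : Runtime(std::move(RT)) {}

  int get() {
    if (!CachedResult)
      CachedResult = Runtime(); // a single real call ...
    return *CachedResult;       // ... feeds every later "call site"
  }
};

int main() {
  int RuntimeCalls = 0;
  DedupedQuery NumThreads([&RuntimeCalls] { ++RuntimeCalls; return 8; });
  int Sum = NumThreads.get() + NumThreads.get() + NumThreads.get();
  std::cout << "sum " << Sum << " with " << RuntimeCalls << " runtime call(s)\n";
  return 0;
}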
+
/// Tries to hide the latency of runtime calls that involve host to
/// device memory transfers by splitting them into their "issue" and "wait"
/// versions. The "issue" is moved upwards as much as possible. The "wait" is
@@ -1293,294 +1293,294 @@ private:
return true;
}
- static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
- bool GlobalOnly, bool &SingleChoice) {
- if (CurrentIdent == NextIdent)
- return CurrentIdent;
-
- // TODO: Figure out how to actually combine multiple debug locations. For
- // now we just keep an existing one if there is a single choice.
- if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
- SingleChoice = !CurrentIdent;
- return NextIdent;
- }
- return nullptr;
- }
-
- /// Return a `struct ident_t*` value that represents the ones used in the
- /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
- /// return a local `struct ident_t*`. For now, if we cannot find a suitable
- /// return value we create one from scratch. We also do not yet combine
- /// information, e.g., the source locations, see combinedIdentStruct.
- Value *
- getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
- Function &F, bool GlobalOnly) {
- bool SingleChoice = true;
- Value *Ident = nullptr;
- auto CombineIdentStruct = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || &F != &Caller)
- return false;
- Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
- /* GlobalOnly */ true, SingleChoice);
- return false;
- };
- RFI.foreachUse(SCC, CombineIdentStruct);
-
- if (!Ident || !SingleChoice) {
- // The IRBuilder uses the insertion block to get to the module, this is
- // unfortunate but we work around it for now.
- if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
- OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
- &F.getEntryBlock(), F.getEntryBlock().begin()));
- // Create a fallback location if none was found.
- // TODO: Use the debug locations of the calls instead.
- Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
- Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
- }
- return Ident;
- }
-
- /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
- /// \p ReplVal if given.
- bool deduplicateRuntimeCalls(Function &F,
- OMPInformationCache::RuntimeFunctionInfo &RFI,
- Value *ReplVal = nullptr) {
- auto *UV = RFI.getUseVector(F);
- if (!UV || UV->size() + (ReplVal != nullptr) < 2)
- return false;
-
- LLVM_DEBUG(
- dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
- << (ReplVal ? " with an existing value\n" : "\n") << "\n");
-
- assert((!ReplVal || (isa<Argument>(ReplVal) &&
- cast<Argument>(ReplVal)->getParent() == &F)) &&
- "Unexpected replacement value!");
-
- // TODO: Use dominance to find a good position instead.
- auto CanBeMoved = [this](CallBase &CB) {
- unsigned NumArgs = CB.getNumArgOperands();
- if (NumArgs == 0)
- return true;
- if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
- return false;
- for (unsigned u = 1; u < NumArgs; ++u)
- if (isa<Instruction>(CB.getArgOperand(u)))
- return false;
- return true;
- };
-
- if (!ReplVal) {
- for (Use *U : *UV)
- if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
- if (!CanBeMoved(*CI))
- continue;
-
- auto Remark = [&](OptimizationRemark OR) {
- auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
- return OR << "OpenMP runtime call "
- << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
- << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
-
- CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
- ReplVal = CI;
- break;
- }
- if (!ReplVal)
- return false;
- }
-
- // If we use a call as a replacement value we need to make sure the ident is
- // valid at the new location. For now we just pick a global one, either
- // existing and used by one of the calls, or created from scratch.
- if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
- if (CI->getNumArgOperands() > 0 &&
- CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
- Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
- /* GlobalOnly */ true);
- CI->setArgOperand(0, Ident);
- }
- }
-
- bool Changed = false;
- auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
- CallInst *CI = getCallIfRegularCall(U, &RFI);
- if (!CI || CI == ReplVal || &F != &Caller)
- return false;
- assert(CI->getCaller() == &F && "Unexpected call!");
-
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "OpenMP runtime call "
- << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
- };
- emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
-
- CGUpdater.removeCallSite(*CI);
- CI->replaceAllUsesWith(ReplVal);
- CI->eraseFromParent();
- ++NumOpenMPRuntimeCallsDeduplicated;
- Changed = true;
- return true;
- };
- RFI.foreachUse(SCC, ReplaceAndDeleteCB);
-
- return Changed;
- }
-
- /// Collect arguments that represent the global thread id in \p GTIdArgs.
- void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
- // TODO: Below we basically perform a fixpoint iteration with a pessimistic
- // initialization. We could define an AbstractAttribute instead and
- // run the Attributor here once it can be run as an SCC pass.
-
- // Helper to check the argument \p ArgNo at all call sites of \p F for
- // a GTId.
- auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
- if (!F.hasLocalLinkage())
- return false;
- for (Use &U : F.uses()) {
- if (CallInst *CI = getCallIfRegularCall(U)) {
- Value *ArgOp = CI->getArgOperand(ArgNo);
- if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
- getCallIfRegularCall(
- *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
- continue;
- }
- return false;
- }
- return true;
- };
-
- // Helper to identify uses of a GTId as GTId arguments.
- auto AddUserArgs = [&](Value &GTId) {
- for (Use &U : GTId.uses())
- if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
- if (CI->isArgOperand(&U))
- if (Function *Callee = CI->getCalledFunction())
- if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
- GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
- };
-
- // The argument users of __kmpc_global_thread_num calls are GTIds.
- OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
-
- GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
- if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
- AddUserArgs(*CI);
- return false;
- });
-
- // Transitively search for more arguments by looking at the users of the
- // ones we know already. During the search the GTIdArgs vector is extended
- // so we cannot cache the size nor can we use a range based for.
- for (unsigned u = 0; u < GTIdArgs.size(); ++u)
- AddUserArgs(*GTIdArgs[u]);
- }
-
- /// Kernel (=GPU) optimizations and utility functions
- ///
- ///{{
-
- /// Check if \p F is a kernel, hence entry point for target offloading.
- bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
-
- /// Cache to remember the unique kernel for a function.
- DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
-
- /// Find the unique kernel that will execute \p F, if any.
- Kernel getUniqueKernelFor(Function &F);
-
- /// Find the unique kernel that will execute \p I, if any.
- Kernel getUniqueKernelFor(Instruction &I) {
- return getUniqueKernelFor(*I.getFunction());
- }
-
- /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
- /// the cases where we can avoid taking the address of a function.
- bool rewriteDeviceCodeStateMachine();
-
- ///
- ///}}
-
- /// Emit a remark generically
- ///
- /// This template function can be used to generically emit a remark. The
- /// RemarkKind should be one of the following:
- /// - OptimizationRemark to indicate a successful optimization attempt
- /// - OptimizationRemarkMissed to report a failed optimization attempt
- /// - OptimizationRemarkAnalysis to provide additional information about an
- /// optimization attempt
- ///
- /// The remark is built using a callback function provided by the caller that
- /// takes a RemarkKind as input and returns a RemarkKind.
- template <typename RemarkKind,
- typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
- void emitRemark(Instruction *Inst, StringRef RemarkName,
- RemarkCallBack &&RemarkCB) const {
- Function *F = Inst->getParent()->getParent();
- auto &ORE = OREGetter(F);
-
- ORE.emit(
- [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
- }
-
- /// Emit a remark on a function. Since only OptimizationRemark supports this,
- /// it can't be made generic.
- void
- emitRemarkOnFunction(Function *F, StringRef RemarkName,
- function_ref<OptimizationRemark(OptimizationRemark &&)>
- &&RemarkCB) const {
- auto &ORE = OREGetter(F);
-
- ORE.emit([&]() {
- return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
- });
- }
-
- /// The underlying module.
- Module &M;
-
- /// The SCC we are operating on.
- SmallVectorImpl<Function *> &SCC;
-
- /// Callback to update the call graph, the first argument is a removed call,
- /// the second an optional replacement call.
- CallGraphUpdater &CGUpdater;
-
- /// Callback to get an OptimizationRemarkEmitter from a Function *
- OptimizationRemarkGetter OREGetter;
-
- /// OpenMP-specific information cache. Also used for Attributor runs.
- OMPInformationCache &OMPInfoCache;
-
- /// Attributor instance.
- Attributor &A;
-
- /// Helper function to run Attributor on SCC.
- bool runAttributor() {
- if (SCC.empty())
- return false;
-
- registerAAs();
-
- ChangeStatus Changed = A.run();
-
- LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
- << " functions, result: " << Changed << ".\n");
-
- return Changed == ChangeStatus::CHANGED;
- }
-
- /// Populate the Attributor with abstract attribute opportunities in the
- /// function.
- void registerAAs() {
+ static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
+ bool GlobalOnly, bool &SingleChoice) {
+ if (CurrentIdent == NextIdent)
+ return CurrentIdent;
+
+ // TODO: Figure out how to actually combine multiple debug locations. For
+ // now we just keep an existing one if there is a single choice.
+ if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
+ SingleChoice = !CurrentIdent;
+ return NextIdent;
+ }
+ return nullptr;
+ }
+
+ /// Return a `struct ident_t*` value that represents the ones used in the
+ /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
+ /// return a local `struct ident_t*`. For now, if we cannot find a suitable
+ /// return value we create one from scratch. We also do not yet combine
+ /// information, e.g., the source locations, see combinedIdentStruct.
+ Value *
+ getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Function &F, bool GlobalOnly) {
+ bool SingleChoice = true;
+ Value *Ident = nullptr;
+ auto CombineIdentStruct = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || &F != &Caller)
+ return false;
+ Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
+ /* GlobalOnly */ true, SingleChoice);
+ return false;
+ };
+ RFI.foreachUse(SCC, CombineIdentStruct);
+
+ if (!Ident || !SingleChoice) {
+ // The IRBuilder uses the insertion block to get to the module, this is
+ // unfortunate but we work around it for now.
+ if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
+ OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
+ &F.getEntryBlock(), F.getEntryBlock().begin()));
+ // Create a fallback location if none was found.
+ // TODO: Use the debug locations of the calls instead.
+ Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
+ Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
+ }
+ return Ident;
+ }
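
getCombinedIdentFromCallUsesIn keeps a candidate `ident_t*` only when it is the single choice across all call sites and otherwise builds a default source location. A simplified, standalone version of that "single choice or fallback" selection, with strings standing in for the ident values:

#include <iostream>
#include <string>
#include <vector>

// Keep a candidate only if every call site agrees on it (or offers nothing);
// a second distinct candidate forces the fallback. This mirrors the
// SingleChoice bookkeeping above in a reduced form.
static std::string combineIdents(const std::vector<std::string> &Candidates,
                                 const std::string &Fallback) {
  std::string Chosen;
  bool SingleChoice = true;
  for (const std::string &Next : Candidates) {
    if (Next == Chosen)
      continue;
    SingleChoice = Chosen.empty(); // a second distinct value kills uniqueness
    Chosen = Next;
  }
  return (!Chosen.empty() && SingleChoice) ? Chosen : Fallback;
}

int main() {
  std::cout << combineIdents({"loc.A", "loc.A"}, "loc.default") << "\n"; // loc.A
  std::cout << combineIdents({"loc.A", "loc.B"}, "loc.default") << "\n"; // loc.default
  return 0;
}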
+
+ /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
+ /// \p ReplVal if given.
+ bool deduplicateRuntimeCalls(Function &F,
+ OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Value *ReplVal = nullptr) {
+ auto *UV = RFI.getUseVector(F);
+ if (!UV || UV->size() + (ReplVal != nullptr) < 2)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
+ << (ReplVal ? " with an existing value\n" : "\n") << "\n");
+
+ assert((!ReplVal || (isa<Argument>(ReplVal) &&
+ cast<Argument>(ReplVal)->getParent() == &F)) &&
+ "Unexpected replacement value!");
+
+ // TODO: Use dominance to find a good position instead.
+ auto CanBeMoved = [this](CallBase &CB) {
+ unsigned NumArgs = CB.getNumArgOperands();
+ if (NumArgs == 0)
+ return true;
+ if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
+ return false;
+ for (unsigned u = 1; u < NumArgs; ++u)
+ if (isa<Instruction>(CB.getArgOperand(u)))
+ return false;
+ return true;
+ };
+
+ if (!ReplVal) {
+ for (Use *U : *UV)
+ if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
+ if (!CanBeMoved(*CI))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
+ << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
+
+ CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
+ ReplVal = CI;
+ break;
+ }
+ if (!ReplVal)
+ return false;
+ }
+
+ // If we use a call as a replacement value we need to make sure the ident is
+ // valid at the new location. For now we just pick a global one, either
+ // existing and used by one of the calls, or created from scratch.
+ if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
+ if (CI->getNumArgOperands() > 0 &&
+ CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
+ Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
+ /* GlobalOnly */ true);
+ CI->setArgOperand(0, Ident);
+ }
+ }
+
+ bool Changed = false;
+ auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || CI == ReplVal || &F != &Caller)
+ return false;
+ assert(CI->getCaller() == &F && "Unexpected call!");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->replaceAllUsesWith(ReplVal);
+ CI->eraseFromParent();
+ ++NumOpenMPRuntimeCallsDeduplicated;
+ Changed = true;
+ return true;
+ };
+ RFI.foreachUse(SCC, ReplaceAndDeleteCB);
+
+ return Changed;
+ }
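
One detail worth calling out is the CanBeMoved check: a representative call may be hoisted to the function entry only if all of its arguments are already available there. A reduced model of that test follows; the operand kinds are illustrative placeholders, and the special handling of the leading ident operand is omitted:

#include <iostream>
#include <vector>

// A call can be hoisted to the function entry only if every argument is
// already available there, i.e. no argument is produced by an instruction
// inside the body. Operand kinds are placeholders, not LLVM value classes.
enum class OperandKind { Constant, FunctionArgument, Instruction };

static bool canHoistToEntry(const std::vector<OperandKind> &Args) {
  for (OperandKind K : Args)
    if (K == OperandKind::Instruction)
      return false; // defined later in the body; hoisting would break it
  return true;
}

int main() {
  std::cout << canHoistToEntry({OperandKind::Constant,
                                OperandKind::FunctionArgument}) // 1
            << canHoistToEntry({OperandKind::Instruction})      // 0
            << "\n";
  return 0;
}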
+
+ /// Collect arguments that represent the global thread id in \p GTIdArgs.
+ void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
+ // TODO: Below we basically perform a fixpoint iteration with a pessimistic
+ // initialization. We could define an AbstractAttribute instead and
+ // run the Attributor here once it can be run as an SCC pass.
+
+ // Helper to check the argument \p ArgNo at all call sites of \p F for
+ // a GTId.
+ auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
+ if (!F.hasLocalLinkage())
+ return false;
+ for (Use &U : F.uses()) {
+ if (CallInst *CI = getCallIfRegularCall(U)) {
+ Value *ArgOp = CI->getArgOperand(ArgNo);
+ if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
+ getCallIfRegularCall(
+ *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
+ continue;
+ }
+ return false;
+ }
+ return true;
+ };
+
+ // Helper to identify uses of a GTId as GTId arguments.
+ auto AddUserArgs = [&](Value &GTId) {
+ for (Use &U : GTId.uses())
+ if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
+ if (CI->isArgOperand(&U))
+ if (Function *Callee = CI->getCalledFunction())
+ if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
+ GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
+ };
+
+ // The argument users of __kmpc_global_thread_num calls are GTIds.
+ OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
+
+ GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
+ if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
+ AddUserArgs(*CI);
+ return false;
+ });
+
+ // Transitively search for more arguments by looking at the users of the
+ // ones we know already. During the search the GTIdArgs vector is extended
+ // so we cannot cache the size nor can we use a range based for.
+ for (unsigned u = 0; u < GTIdArgs.size(); ++u)
+ AddUserArgs(*GTIdArgs[u]);
+ }
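
The collection above is a pessimistic fixpoint: seed the set with direct results of __kmpc_global_thread_num, then repeatedly add callee parameters that are only ever fed known thread IDs until nothing new appears. A toy worklist version of that transitive closure over a made-up flow graph:

#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Toy flow graph: value -> callee parameters it is passed to. Edges are
  // assumed to be the only way those parameters receive values, matching
  // what CallArgOpIsGTId verifies for internal functions.
  std::map<std::string, std::vector<std::string>> FlowsTo = {
      {"tid.call", {"outlined.gtid"}},     // a __kmpc_global_thread_num result
      {"outlined.gtid", {"helper.gtid"}}}; // forwarded one level deeper

  std::vector<std::string> Worklist = {"tid.call"}; // the seeds
  std::set<std::string> GTIds(Worklist.begin(), Worklist.end());

  // The set only grows, so indexing while appending is safe and terminates.
  for (std::size_t I = 0; I < Worklist.size(); ++I)
    for (const std::string &Next : FlowsTo[Worklist[I]])
      if (GTIds.insert(Next).second)
        Worklist.push_back(Next);

  for (const std::string &V : GTIds)
    std::cout << V << " is a global thread id\n";
  return 0;
}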
+
+ /// Kernel (=GPU) optimizations and utility functions
+ ///
+ ///{{
+
+ /// Check if \p F is a kernel, hence entry point for target offloading.
+ bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
+
+ /// Cache to remember the unique kernel for a function.
+ DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
+
+ /// Find the unique kernel that will execute \p F, if any.
+ Kernel getUniqueKernelFor(Function &F);
+
+ /// Find the unique kernel that will execute \p I, if any.
+ Kernel getUniqueKernelFor(Instruction &I) {
+ return getUniqueKernelFor(*I.getFunction());
+ }
+
+ /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
+ /// the cases where we can avoid taking the address of a function.
+ bool rewriteDeviceCodeStateMachine();
+
+ ///
+ ///}}
+
+ /// Emit a remark generically
+ ///
+ /// This template function can be used to generically emit a remark. The
+ /// RemarkKind should be one of the following:
+ /// - OptimizationRemark to indicate a successful optimization attempt
+ /// - OptimizationRemarkMissed to report a failed optimization attempt
+ /// - OptimizationRemarkAnalysis to provide additional information about an
+ /// optimization attempt
+ ///
+ /// The remark is built using a callback function provided by the caller that
+ /// takes a RemarkKind as input and returns a RemarkKind.
+ template <typename RemarkKind,
+ typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
+ void emitRemark(Instruction *Inst, StringRef RemarkName,
+ RemarkCallBack &&RemarkCB) const {
+ Function *F = Inst->getParent()->getParent();
+ auto &ORE = OREGetter(F);
+
+ ORE.emit(
+ [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
+ }
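
The point of taking a callback rather than a finished message is that the remark text is only built when the emitter is actually enabled. A tiny generic model of that lazy-emission pattern (Remark and Emitter are placeholders, not the LLVM remark machinery):

#include <iostream>
#include <string>

// Minimal model of callback-based remark emission: the message is assembled
// by a caller-supplied callback that only runs if emission is enabled.
struct Remark {
  std::string Pass, Name, Text;
  Remark &operator<<(const std::string &S) {
    Text += S;
    return *this;
  }
};

class Emitter {
  bool Enabled;

public:
  explicit Emitter(bool E) : Enabled(E) {}

  template <typename CallbackT> void emit(CallbackT &&CB) {
    if (!Enabled)
      return; // the callback, and all its string building, is skipped
    Remark R = CB();
    std::cout << "[" << R.Pass << ":" << R.Name << "] " << R.Text << "\n";
  }
};

int main() {
  Emitter ORE(/*Enabled=*/true);
  ORE.emit([]() -> Remark {
    Remark R{"openmp-opt", "Demo"};
    R << "OpenMP runtime call " << "deduplicated";
    return R;
  });
  return 0;
}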
+
+ /// Emit a remark on a function. Since only OptimizationRemark supports this,
+ /// it can't be made generic.
+ void
+ emitRemarkOnFunction(Function *F, StringRef RemarkName,
+ function_ref<OptimizationRemark(OptimizationRemark &&)>
+ &&RemarkCB) const {
+ auto &ORE = OREGetter(F);
+
+ ORE.emit([&]() {
+ return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
+ });
+ }
+
+ /// The underlying module.
+ Module &M;
+
+ /// The SCC we are operating on.
+ SmallVectorImpl<Function *> &SCC;
+
+ /// Callback to update the call graph, the first argument is a removed call,
+ /// the second an optional replacement call.
+ CallGraphUpdater &CGUpdater;
+
+ /// Callback to get an OptimizationRemarkEmitter from a Function *
+ OptimizationRemarkGetter OREGetter;
+
+ /// OpenMP-specific information cache. Also used for Attributor runs.
+ OMPInformationCache &OMPInfoCache;
+
+ /// Attributor instance.
+ Attributor &A;
+
+ /// Helper function to run Attributor on SCC.
+ bool runAttributor() {
+ if (SCC.empty())
+ return false;
+
+ registerAAs();
+
+ ChangeStatus Changed = A.run();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
+ << " functions, result: " << Changed << ".\n");
+
+ return Changed == ChangeStatus::CHANGED;
+ }
+
+ /// Populate the Attributor with abstract attribute opportunities in the
+ /// function.
+ void registerAAs() {
if (SCC.empty())
return;
-
+
// Create CallSite AA for all Getters.
for (int Idx = 0; Idx < OMPInfoCache.ICVs.size() - 1; ++Idx) {
auto ICVInfo = OMPInfoCache.ICVs[static_cast<InternalControlVar>(Idx)];
@@ -1600,29 +1600,29 @@ private:
};
GetterRFI.foreachUse(SCC, CreateAA);
- }
- }
-};
-
-Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
- if (!OMPInfoCache.ModuleSlice.count(&F))
- return nullptr;
-
- // Use a scope to keep the lifetime of the CachedKernel short.
- {
- Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
- if (CachedKernel)
- return *CachedKernel;
-
- // TODO: We should use an AA to create an (optimistic and callback
- // call-aware) call graph. For now we stick to simple patterns that
- // are less powerful, basically the worst fixpoint.
- if (isKernel(F)) {
- CachedKernel = Kernel(&F);
- return *CachedKernel;
- }
-
- CachedKernel = nullptr;
+ }
+ }
+};
+
+Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
+ if (!OMPInfoCache.ModuleSlice.count(&F))
+ return nullptr;
+
+ // Use a scope to keep the lifetime of the CachedKernel short.
+ {
+ Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
+ if (CachedKernel)
+ return *CachedKernel;
+
+ // TODO: We should use an AA to create an (optimistic and callback
+ // call-aware) call graph. For now we stick to simple patterns that
+ // are less powerful, basically the worst fixpoint.
+ if (isKernel(F)) {
+ CachedKernel = Kernel(&F);
+ return *CachedKernel;
+ }
+
+ CachedKernel = nullptr;
if (!F.hasLocalLinkage()) {
// See https://openmp.llvm.org/remarks/OptimizationRemarks.html
@@ -1631,206 +1631,206 @@ Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
};
emitRemarkOnFunction(&F, "OMP100", Remark);
- return nullptr;
+ return nullptr;
}
- }
-
- auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
- if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
- // Allow use in equality comparisons.
- if (Cmp->isEquality())
- return getUniqueKernelFor(*Cmp);
- return nullptr;
- }
- if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
- // Allow direct calls.
- if (CB->isCallee(&U))
- return getUniqueKernelFor(*CB);
- // Allow the use in __kmpc_kernel_prepare_parallel calls.
- if (Function *Callee = CB->getCalledFunction())
- if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
- return getUniqueKernelFor(*CB);
- return nullptr;
- }
- // Disallow every other use.
- return nullptr;
- };
-
- // TODO: In the future we want to track more than just a unique kernel.
- SmallPtrSet<Kernel, 2> PotentialKernels;
+ }
+
+ auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
+ if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+ // Allow use in equality comparisons.
+ if (Cmp->isEquality())
+ return getUniqueKernelFor(*Cmp);
+ return nullptr;
+ }
+ if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
+ // Allow direct calls.
+ if (CB->isCallee(&U))
+ return getUniqueKernelFor(*CB);
+ // Allow the use in __kmpc_kernel_prepare_parallel calls.
+ if (Function *Callee = CB->getCalledFunction())
+ if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
+ return getUniqueKernelFor(*CB);
+ return nullptr;
+ }
+ // Disallow every other use.
+ return nullptr;
+ };
+
+ // TODO: In the future we want to track more than just a unique kernel.
+ SmallPtrSet<Kernel, 2> PotentialKernels;
OMPInformationCache::foreachUse(F, [&](const Use &U) {
- PotentialKernels.insert(GetUniqueKernelForUse(U));
- });
-
- Kernel K = nullptr;
- if (PotentialKernels.size() == 1)
- K = *PotentialKernels.begin();
-
- // Cache the result.
- UniqueKernelMap[&F] = K;
-
- return K;
-}
-
-bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
- OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
- OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
-
- bool Changed = false;
- if (!KernelPrepareParallelRFI)
- return Changed;
-
- for (Function *F : SCC) {
-
- // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
- // all.
- bool UnknownUse = false;
- bool KernelPrepareUse = false;
- unsigned NumDirectCalls = 0;
-
- SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
+ PotentialKernels.insert(GetUniqueKernelForUse(U));
+ });
+
+ Kernel K = nullptr;
+ if (PotentialKernels.size() == 1)
+ K = *PotentialKernels.begin();
+
+ // Cache the result.
+ UniqueKernelMap[&F] = K;
+
+ return K;
+}
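
The lookup above memoizes per function and uses the Optional map value to distinguish "never computed" from "computed, but there is no unique kernel". A generic sketch of that caching shape, with strings instead of Function and Kernel pointers and an empty string as the "no unique kernel" sentinel:

#include <iostream>
#include <map>
#include <optional>
#include <string>

// Cache with a three-way state per key: no entry / entry known to have no
// unique kernel / entry naming the unique kernel. Names are illustrative.
class UniqueKernelCache {
  std::map<std::string, std::optional<std::string>> Cache;
  int Recomputations = 0;

  std::string compute(const std::string &Fn) {
    ++Recomputations; // pretend this walked all uses of Fn
    return Fn == "parallel.body" ? "kernel_A" : "";
  }

public:
  std::string get(const std::string &Fn) {
    std::optional<std::string> &Slot = Cache[Fn];
    if (!Slot)
      Slot = compute(Fn); // negative answers are cached too
    return *Slot;
  }
  int recomputations() const { return Recomputations; }
};

int main() {
  UniqueKernelCache C;
  std::cout << C.get("parallel.body") << "\n"; // computed
  std::cout << C.get("parallel.body") << "\n"; // served from the cache
  std::cout << "computed " << C.recomputations() << " time(s)\n";
  return 0;
}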
+
+bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
+ OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
+
+ bool Changed = false;
+ if (!KernelPrepareParallelRFI)
+ return Changed;
+
+ for (Function *F : SCC) {
+
+ // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
+ // all.
+ bool UnknownUse = false;
+ bool KernelPrepareUse = false;
+ unsigned NumDirectCalls = 0;
+
+ SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
OMPInformationCache::foreachUse(*F, [&](Use &U) {
- if (auto *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- ++NumDirectCalls;
- return;
- }
-
- if (isa<ICmpInst>(U.getUser())) {
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
- *U.getUser(), &KernelPrepareParallelRFI)) {
- KernelPrepareUse = true;
- ToBeReplacedStateMachineUses.push_back(&U);
- return;
- }
- UnknownUse = true;
- });
-
- // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
- // use.
- if (!KernelPrepareUse)
- continue;
-
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Found a parallel region that is called in a target "
- "region but not part of a combined target construct nor "
- "nested inside a target construct without intermediate "
- "code. This can lead to excessive register usage for "
- "unrelated target regions in the same translation unit "
- "due to spurious call edges assumed by ptxas.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
- }
-
- // If this ever hits, we should investigate.
- // TODO: Checking the number of uses is not a necessary restriction and
- // should be lifted.
- if (UnknownUse || NumDirectCalls != 1 ||
- ToBeReplacedStateMachineUses.size() != 2) {
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region is used in "
- << (UnknownUse ? "unknown" : "unexpected")
- << " ways; will not attempt to rewrite the state machine.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
- }
- continue;
- }
-
- // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
- // up if the function is not called from a unique kernel.
- Kernel K = getUniqueKernelFor(*F);
- if (!K) {
- {
- auto Remark = [&](OptimizationRemark OR) {
- return OR << "Parallel region is not known to be called from a "
- "unique single target region, maybe the surrounding "
- "function has external linkage?; will not attempt to "
- "rewrite the state machine use.";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
- Remark);
- }
- continue;
- }
-
- // We now know F is a parallel body function called only from the kernel K.
- // We also identified the state machine uses in which we replace the
- // function pointer by a new global symbol for identification purposes. This
- // ensures only direct calls to the function are left.
-
- {
- auto RemarkParalleRegion = [&](OptimizationRemark OR) {
- return OR << "Specialize parallel region that is only reached from a "
- "single target region to avoid spurious call edges and "
- "excessive register usage in other target regions. "
- "(parallel region ID: "
- << ore::NV("OpenMPParallelRegion", F->getName())
- << ", kernel ID: "
- << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
- };
- emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
- RemarkParalleRegion);
- auto RemarkKernel = [&](OptimizationRemark OR) {
- return OR << "Target region containing the parallel region that is "
- "specialized. (parallel region ID: "
- << ore::NV("OpenMPParallelRegion", F->getName())
- << ", kernel ID: "
- << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
- };
- emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
- }
-
- Module &M = *F->getParent();
- Type *Int8Ty = Type::getInt8Ty(M.getContext());
-
- auto *ID = new GlobalVariable(
- M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
- UndefValue::get(Int8Ty), F->getName() + ".ID");
-
- for (Use *U : ToBeReplacedStateMachineUses)
- U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
-
- ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
-
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Abstract Attribute for tracking ICV values.
-struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
- using Base = StateWrapper<BooleanState, AbstractAttribute>;
- AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
-
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ ++NumDirectCalls;
+ return;
+ }
+
+ if (isa<ICmpInst>(U.getUser())) {
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+ *U.getUser(), &KernelPrepareParallelRFI)) {
+ KernelPrepareUse = true;
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ UnknownUse = true;
+ });
+
+ // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+ // use.
+ if (!KernelPrepareUse)
+ continue;
+
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Found a parallel region that is called in a target "
+ "region but not part of a combined target construct nor "
+ "nested inside a target construct without intermediate "
+ "code. This can lead to excessive register usage for "
+ "unrelated target regions in the same translation unit "
+ "due to spurious call edges assumed by ptxas.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+
+ // If this ever hits, we should investigate.
+ // TODO: Checking the number of uses is not a necessary restriction and
+ // should be lifted.
+ if (UnknownUse || NumDirectCalls != 1 ||
+ ToBeReplacedStateMachineUses.size() != 2) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is used in "
+ << (UnknownUse ? "unknown" : "unexpected")
+ << " ways; will not attempt to rewrite the state machine.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+ continue;
+ }
+
+ // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
+ // up if the function is not called from a unique kernel.
+ Kernel K = getUniqueKernelFor(*F);
+ if (!K) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is not known to be called from a "
+ "unique single target region, maybe the surrounding "
+ "function has external linkage?; will not attempt to "
+ "rewrite the state machine use.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernesl",
+ Remark);
+ }
+ continue;
+ }
+
+ // We now know F is a parallel body function called only from the kernel K.
+ // We also identified the state machine uses in which we replace the
+ // function pointer by a new global symbol for identification purposes. This
+ // ensures only direct calls to the function are left.
+
+ {
+ auto RemarkParalleRegion = [&](OptimizationRemark OR) {
+ return OR << "Specialize parallel region that is only reached from a "
+ "single target region to avoid spurious call edges and "
+ "excessive register usage in other target regions. "
+ "(parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+ RemarkParalleRegion);
+ auto RemarkKernel = [&](OptimizationRemark OR) {
+ return OR << "Target region containing the parallel region that is "
+ "specialized. (parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+ }
+
+ Module &M = *F->getParent();
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
+
+ auto *ID = new GlobalVariable(
+ M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
+ UndefValue::get(Int8Ty), F->getName() + ".ID");
+
+ for (Use *U : ToBeReplacedStateMachineUses)
+ U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
+
+ ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
+
+ Changed = true;
+ }
+
+ return Changed;
+}
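
The effect of the rewrite is that the state machine's pointer comparisons test a dedicated one-byte token instead of the parallel region's real address, so the only remaining uses of the function are direct calls. A self-contained model of that address-as-identifier swap (ordinary host C++ rather than the generated GPU code):

#include <iostream>

// The state machine only needs the parallel region's address as an identity
// token, so a dedicated one-byte global can stand in for it; the function
// itself is then only reached through direct calls.
static void parallel_body() { std::cout << "parallel body runs\n"; }

// Stands in for the private global <function name>.ID created by the pass.
static char parallel_body_ID;

static void state_machine(const void *WorkID) {
  if (WorkID == &parallel_body_ID) // identity check against the token ...
    parallel_body();               // ... followed by a direct call only
}

int main() {
  state_machine(&parallel_body_ID);
  return 0;
}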
+
+/// Abstract Attribute for tracking ICV values.
+struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
void initialize(Attributor &A) override {
Function *F = getAnchorScope();
if (!F || !A.isFunctionIPOAmendable(*F))
indicatePessimisticFixpoint();
}
- /// Returns true if value is assumed to be tracked.
- bool isAssumedTracked() const { return getAssumed(); }
-
- /// Returns true if value is known to be tracked.
- bool isKnownTracked() const { return getAssumed(); }
-
- /// Create an abstract attribute view for the position \p IRP.
- static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
-
- /// Return the value with which \p I can be replaced for specific \p ICV.
+ /// Returns true if value is assumed to be tracked.
+ bool isAssumedTracked() const { return getAssumed(); }
+
+ /// Returns true if value is known to be tracked.
+ bool isKnownTracked() const { return getAssumed(); }
+
+ /// Create an abstract attribute view for the position \p IRP.
+ static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
+
+ /// Return the value with which \p I can be replaced for specific \p ICV.
virtual Optional<Value *> getReplacementValue(InternalControlVar ICV,
const Instruction *I,
Attributor &A) const {
return None;
}
-
+
/// Return an assumed unique ICV value if a single candidate is found. If
/// there cannot be one, return a nullptr. If it is not clear yet, return the
/// Optional::NoneType.
@@ -1841,64 +1841,64 @@ struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
// this array will only grow with time.
InternalControlVar TrackableICVs[1] = {ICV_nthreads};
- /// See AbstractAttribute::getName()
- const std::string getName() const override { return "AAICVTracker"; }
-
- /// See AbstractAttribute::getIdAddr()
- const char *getIdAddr() const override { return &ID; }
-
- /// This function should return true if the type of the \p AA is AAICVTracker
- static bool classof(const AbstractAttribute *AA) {
- return (AA->getIdAddr() == &ID);
- }
-
- static const char ID;
-};
-
-struct AAICVTrackerFunction : public AAICVTracker {
- AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
- : AAICVTracker(IRP, A) {}
-
- // FIXME: come up with better string.
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAICVTracker"; }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is AAICVTracker
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ static const char ID;
+};
+
+struct AAICVTrackerFunction : public AAICVTracker {
+ AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
+ : AAICVTracker(IRP, A) {}
+
+ // FIXME: come up with better string.
const std::string getAsStr() const override { return "ICVTrackerFunction"; }
-
- // FIXME: come up with some stats.
- void trackStatistics() const override {}
-
+
+ // FIXME: come up with some stats.
+ void trackStatistics() const override {}
+
/// We don't manifest anything for this AA.
- ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus manifest(Attributor &A) override {
return ChangeStatus::UNCHANGED;
- }
-
- // Map of ICV to their values at specific program point.
+ }
+
+ // Map of ICV to their values at specific program point.
EnumeratedArray<DenseMap<Instruction *, Value *>, InternalControlVar,
- InternalControlVar::ICV___last>
+ InternalControlVar::ICV___last>
ICVReplacementValuesMap;
-
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
-
- Function *F = getAnchorScope();
-
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
-
- for (InternalControlVar ICV : TrackableICVs) {
- auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
-
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+
+ Function *F = getAnchorScope();
+
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ for (InternalControlVar ICV : TrackableICVs) {
+ auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
+
auto &ValuesMap = ICVReplacementValuesMap[ICV];
- auto TrackValues = [&](Use &U, Function &) {
- CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
- if (!CI)
- return false;
-
- // FIXME: handle setters with more than one argument.
- /// Track new value.
+ auto TrackValues = [&](Use &U, Function &) {
+ CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+
+ // FIXME: handle setters with more than one argument.
+ /// Track new value.
if (ValuesMap.insert(std::make_pair(CI, CI->getArgOperand(0))).second)
- HasChanged = ChangeStatus::CHANGED;
-
- return false;
- };
-
+ HasChanged = ChangeStatus::CHANGED;
+
+ return false;
+ };
+
auto CallCheck = [&](Instruction &I) {
Optional<Value *> ReplVal = getValueForCall(A, &I, ICV);
if (ReplVal.hasValue() &&
@@ -1909,7 +1909,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
};
// Track all changes of an ICV.
- SetterRFI.foreachUse(TrackValues, F);
+ SetterRFI.foreachUse(TrackValues, F);
A.checkForAllInstructions(CallCheck, *this, {Instruction::Call},
/* CheckBBLivenessOnly */ true);
@@ -1919,26 +1919,26 @@ struct AAICVTrackerFunction : public AAICVTracker {
Instruction *Entry = &F->getEntryBlock().front();
if (HasChanged == ChangeStatus::CHANGED && !ValuesMap.count(Entry))
ValuesMap.insert(std::make_pair(Entry, nullptr));
- }
-
- return HasChanged;
- }
-
+ }
+
+ return HasChanged;
+ }
+
/// Helper to check if \p I is a call and get the value for it if it is
/// unique.
Optional<Value *> getValueForCall(Attributor &A, const Instruction *I,
InternalControlVar &ICV) const {
-
+
const auto *CB = dyn_cast<CallBase>(I);
if (!CB || CB->hasFnAttr("no_openmp") ||
CB->hasFnAttr("no_openmp_routines"))
return None;
- auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
- auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
Function *CalledFunction = CB->getCalledFunction();
-
+
// Indirect call, assume ICV changes.
if (CalledFunction == nullptr)
return nullptr;
@@ -1947,7 +1947,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
if (CalledFunction == SetterRFI.Declaration) {
if (ICVReplacementValuesMap[ICV].count(I))
return ICVReplacementValuesMap[ICV].lookup(I);
-
+
return nullptr;
}
@@ -2006,11 +2006,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
// If we found a new value, we can't know the icv value anymore.
if (NewReplVal.hasValue())
if (ReplVal != NewReplVal)
- return nullptr;
-
+ return nullptr;
+
break;
- }
-
+ }
+
Optional<Value *> NewReplVal = getValueForCall(A, CurrInst, ICV);
if (!NewReplVal.hasValue())
continue;
@@ -2025,7 +2025,7 @@ struct AAICVTrackerFunction : public AAICVTracker {
// We found a new value, we can't know the icv value anymore.
if (ReplVal != NewReplVal)
return nullptr;
- }
+ }
// If we are in the same BB and we have a value, we are done.
if (CurrBB == I->getParent() && ReplVal.hasValue())
@@ -2035,11 +2035,11 @@ struct AAICVTrackerFunction : public AAICVTracker {
for (const BasicBlock *Pred : predecessors(CurrBB))
if (const Instruction *Terminator = Pred->getTerminator())
Worklist.push_back(Terminator);
- }
-
+ }
+
return ReplVal;
- }
-};
+ }
+};
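
AAICVTrackerFunction records, per ICV, the value written at each setter call and later answers getter queries from that record. The sketch below models only the straight-line case for the nthreads ICV, with call names as strings; the real pass additionally walks predecessor blocks and gives up on calls it cannot see through:

#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Straight-line trace of runtime calls: setters record the nthreads ICV,
  // the getter is answered from the recorded value instead of the runtime.
  std::vector<std::pair<std::string, int>> Trace = {
      {"omp_set_num_threads", 4},
      {"omp_set_num_threads", 8},
      {"omp_get_max_threads", 0}}; // query; the int is unused here

  std::optional<int> Known; // unknown until the first tracked setter
  for (const auto &[Call, Arg] : Trace) {
    if (Call == "omp_set_num_threads")
      Known = Arg; // track the most recent ICV value
    else if (Call == "omp_get_max_threads")
      std::cout << (Known ? "folded to " + std::to_string(*Known)
                          : std::string("unknown, keep the call"))
                << "\n";
  }
  return 0;
}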
struct AAICVTrackerFunctionReturned : AAICVTracker {
AAICVTrackerFunctionReturned(const IRPosition &IRP, Attributor &A)
@@ -2231,52 +2231,52 @@ struct AAICVTrackerCallSiteReturned : AAICVTracker {
return Changed;
}
};
-} // namespace
-
-const char AAICVTracker::ID = 0;
-
-AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
- Attributor &A) {
- AAICVTracker *AA = nullptr;
- switch (IRP.getPositionKind()) {
- case IRPosition::IRP_INVALID:
- case IRPosition::IRP_FLOAT:
- case IRPosition::IRP_ARGUMENT:
+} // namespace
+
+const char AAICVTracker::ID = 0;
+
+AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ AAICVTracker *AA = nullptr;
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ case IRPosition::IRP_ARGUMENT:
case IRPosition::IRP_CALL_SITE_ARGUMENT:
llvm_unreachable("ICVTracker can only be created for function position!");
- case IRPosition::IRP_RETURNED:
+ case IRPosition::IRP_RETURNED:
AA = new (A.Allocator) AAICVTrackerFunctionReturned(IRP, A);
break;
- case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
AA = new (A.Allocator) AAICVTrackerCallSiteReturned(IRP, A);
break;
- case IRPosition::IRP_CALL_SITE:
+ case IRPosition::IRP_CALL_SITE:
AA = new (A.Allocator) AAICVTrackerCallSite(IRP, A);
break;
- case IRPosition::IRP_FUNCTION:
- AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
- break;
- }
-
- return *AA;
-}
-
-PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
- CGSCCAnalysisManager &AM,
- LazyCallGraph &CG, CGSCCUpdateResult &UR) {
- if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
- return PreservedAnalyses::all();
-
- if (DisableOpenMPOptimizations)
- return PreservedAnalyses::all();
-
- SmallVector<Function *, 16> SCC;
+ case IRPosition::IRP_FUNCTION:
+ AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
+ break;
+ }
+
+ return *AA;
+}
+
+PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG, CGSCCUpdateResult &UR) {
+ if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
+ return PreservedAnalyses::all();
+
+ if (DisableOpenMPOptimizations)
+ return PreservedAnalyses::all();
+
+ SmallVector<Function *, 16> SCC;
// If there are kernels in the module, we have to run on all SCC's.
bool SCCIsInteresting = !OMPInModule.getKernels().empty();
for (LazyCallGraph::Node &N : C) {
Function *Fn = &N.getFunction();
SCC.push_back(Fn);
-
+
// Do we already know that the SCC contains kernels,
// or that OpenMP functions are called from this SCC?
if (SCCIsInteresting)
@@ -2286,63 +2286,63 @@ PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
}
if (!SCCIsInteresting || SCC.empty())
- return PreservedAnalyses::all();
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
-
- AnalysisGetter AG(FAM);
-
- auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
-
- CallGraphUpdater CGUpdater;
- CGUpdater.initialize(CG, C, AM, UR);
-
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- BumpPtrAllocator Allocator;
- OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ Functions, OMPInModule.getKernels());
-
- Attributor A(Functions, InfoCache, CGUpdater);
-
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- bool Changed = OMPOpt.run();
- if (Changed)
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-struct OpenMPOptLegacyPass : public CallGraphSCCPass {
- CallGraphUpdater CGUpdater;
- OpenMPInModule OMPInModule;
- static char ID;
-
- OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
- initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- CallGraphSCCPass::getAnalysisUsage(AU);
- }
-
- bool doInitialization(CallGraph &CG) override {
- // Disable the pass if there is no OpenMP (runtime call) in the module.
- containsOpenMP(CG.getModule(), OMPInModule);
- return false;
- }
-
- bool runOnSCC(CallGraphSCC &CGSCC) override {
- if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
- return false;
- if (DisableOpenMPOptimizations || skipSCC(CGSCC))
- return false;
-
- SmallVector<Function *, 16> SCC;
+ return PreservedAnalyses::all();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ AnalysisGetter AG(FAM);
+
+ auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ bool Changed = OMPOpt.run();
+ if (Changed)
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+struct OpenMPOptLegacyPass : public CallGraphSCCPass {
+ CallGraphUpdater CGUpdater;
+ OpenMPInModule OMPInModule;
+ static char ID;
+
+ OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
+ initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(CallGraph &CG) override {
+ // Disable the pass if there is no OpenMP (runtime call) in the module.
+ containsOpenMP(CG.getModule(), OMPInModule);
+ return false;
+ }
+
+ bool runOnSCC(CallGraphSCC &CGSCC) override {
+ if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
+ return false;
+ if (DisableOpenMPOptimizations || skipSCC(CGSCC))
+ return false;
+
+ SmallVector<Function *, 16> SCC;
// If there are kernels in the module, we have to run on all SCC's.
bool SCCIsInteresting = !OMPInModule.getKernels().empty();
for (CallGraphNode *CGN : CGSCC) {
@@ -2350,7 +2350,7 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
if (!Fn || Fn->isDeclaration())
continue;
SCC.push_back(Fn);
-
+
// Do we already know that the SCC contains kernels,
// or that OpenMP functions are called from this SCC?
if (SCCIsInteresting)
@@ -2360,100 +2360,100 @@ struct OpenMPOptLegacyPass : public CallGraphSCCPass {
}
if (!SCCIsInteresting || SCC.empty())
- return false;
-
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
- CGUpdater.initialize(CG, CGSCC);
-
- // Maintain a map of functions to avoid rebuilding the ORE
- DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
- auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
- std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
- if (!ORE)
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- AnalysisGetter AG;
- SetVector<Function *> Functions(SCC.begin(), SCC.end());
- BumpPtrAllocator Allocator;
- OMPInformationCache InfoCache(
- *(Functions.back()->getParent()), AG, Allocator,
- /*CGSCC*/ Functions, OMPInModule.getKernels());
-
- Attributor A(Functions, InfoCache, CGUpdater);
-
- OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
- return OMPOpt.run();
- }
-
- bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
-};
-
-} // end anonymous namespace
-
-void OpenMPInModule::identifyKernels(Module &M) {
-
- NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
- if (!MD)
- return;
-
- for (auto *Op : MD->operands()) {
- if (Op->getNumOperands() < 2)
- continue;
- MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
- if (!KindID || KindID->getString() != "kernel")
- continue;
-
- Function *KernelFn =
- mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
- if (!KernelFn)
- continue;
-
- ++NumOpenMPTargetRegionKernels;
-
- Kernels.insert(KernelFn);
- }
-}
-
-bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
- if (OMPInModule.isKnown())
- return OMPInModule;
-
+ return false;
+
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ CGUpdater.initialize(CG, CGSCC);
+
+ // Maintain a map of functions to avoid rebuilding the ORE
+ DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
+ auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
+ std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
+ if (!ORE)
+ ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
+
+ AnalysisGetter AG;
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(
+ *(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ return OMPOpt.run();
+ }
+
+ bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
+};
+
+} // end anonymous namespace
+
+void OpenMPInModule::identifyKernels(Module &M) {
+
+ NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+ if (!MD)
+ return;
+
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+ if (!KindID || KindID->getString() != "kernel")
+ continue;
+
+ Function *KernelFn =
+ mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
+ if (!KernelFn)
+ continue;
+
+ ++NumOpenMPTargetRegionKernels;
+
+ Kernels.insert(KernelFn);
+ }
+}
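
identifyKernels walks the nvvm.annotations metadata and records every function whose annotation kind is the string "kernel". A toy version of that scan over a hand-written annotation list (the struct stands in for a metadata operand):

#include <iostream>
#include <set>
#include <string>
#include <vector>

int main() {
  // Each entry stands in for one nvvm.annotations operand: the annotated
  // function and the annotation kind string.
  struct Annotation {
    std::string Fn, Kind;
  };
  std::vector<Annotation> Annotations = {
      {"__omp_offloading_main_l12", "kernel"},
      {"helper_fn", "maxntidx"}}; // unrelated annotation, ignored

  std::set<std::string> Kernels;
  for (const Annotation &A : Annotations)
    if (A.Kind == "kernel")
      Kernels.insert(A.Fn);

  for (const std::string &K : Kernels)
    std::cout << K << " is a kernel entry point\n";
  return 0;
}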
+
+bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
+ if (OMPInModule.isKnown())
+ return OMPInModule;
+
auto RecordFunctionsContainingUsesOf = [&](Function *F) {
for (User *U : F->users())
if (auto *I = dyn_cast<Instruction>(U))
OMPInModule.FuncsWithOMPRuntimeCalls.insert(I->getFunction());
};
- // MSVC doesn't like long if-else chains for some reason and instead just
- // issues an error. Work around it..
- do {
-#define OMP_RTL(_Enum, _Name, ...) \
+ // MSVC doesn't like long if-else chains for some reason and instead just
+ // issues an error. Work around it..
+ do {
+#define OMP_RTL(_Enum, _Name, ...) \
if (Function *F = M.getFunction(_Name)) { \
RecordFunctionsContainingUsesOf(F); \
- OMPInModule = true; \
- }
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- } while (false);
-
- // Identify kernels once. TODO: We should split the OMPInformationCache into a
- // module and an SCC part. The kernel information, among other things, could
- // go into the module part.
- if (OMPInModule.isKnown() && OMPInModule) {
- OMPInModule.identifyKernels(M);
- return true;
- }
-
- return OMPInModule = false;
-}
-
-char OpenMPOptLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
- "OpenMP specific optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
- "OpenMP specific optimizations", false, false)
-
-Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
+ OMPInModule = true; \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ } while (false);
+
+ // Identify kernels once. TODO: We should split the OMPInformationCache into a
+ // module and an SCC part. The kernel information, among other things, could
+ // go into the module part.
+ if (OMPInModule.isKnown() && OMPInModule) {
+ OMPInModule.identifyKernels(M);
+ return true;
+ }
+
+ return OMPInModule = false;
+}
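
The OMP_RTL expansion above is an X-macro: OMPKinds.def lists every runtime entry point once, and including it with a locally defined macro turns that table into a chain of module lookups (the do/while merely placates MSVC). A generic, self-contained illustration of the same technique with a made-up three-entry table:

#include <iostream>
#include <set>
#include <string>

// The table is written once; expanding it with different macro definitions
// generates different code. Here the table is inlined instead of living in a
// separate .def file, purely for illustration.
#define DEMO_RTL_TABLE(X)                                                      \
  X(__kmpc_fork_call)                                                          \
  X(omp_get_num_threads)                                                       \
  X(omp_set_num_threads)

static bool moduleContainsOpenMP(const std::set<std::string> &DeclaredFns) {
  bool Found = false;
#define DEMO_RTL(Name)                                                         \
  if (DeclaredFns.count(#Name))                                                \
    Found = true;
  DEMO_RTL_TABLE(DEMO_RTL)
#undef DEMO_RTL
  return Found;
}

int main() {
  std::cout << moduleContainsOpenMP({"printf", "omp_get_num_threads"}) // 1
            << moduleContainsOpenMP({"printf"})                        // 0
            << "\n";
  return 0;
}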
+
+char OpenMPOptLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
index e0a77e26b2..2bbf4bf110 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PartialInlining.cpp
@@ -1,880 +1,880 @@
-//===- PartialInlining.cpp - Inline parts of functions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs partial inlining, typically by inlining an if statement
-// that surrounds the body of the function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/PartialInlining.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <iterator>
-#include <memory>
-#include <tuple>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "partial-inlining"
-
-STATISTIC(NumPartialInlined,
- "Number of callsites functions partially inlined into.");
-STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
- "cold outlined regions were partially "
- "inlined into its caller(s).");
-STATISTIC(NumColdRegionsFound,
- "Number of cold single entry/exit regions found.");
-STATISTIC(NumColdRegionsOutlined,
- "Number of cold single entry/exit regions outlined.");
-
-// Command line option to disable partial-inlining. The default is false:
-static cl::opt<bool>
- DisablePartialInlining("disable-partial-inlining", cl::init(false),
- cl::Hidden, cl::desc("Disable partial inlining"));
-// Command line option to disable multi-region partial-inlining. The default is
-// false:
-static cl::opt<bool> DisableMultiRegionPartialInline(
- "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
- cl::desc("Disable multi-region partial inlining"));
-
-// Command line option to force outlining in regions with live exit variables.
-// The default is false:
-static cl::opt<bool>
- ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
- cl::desc("Force outline regions with live exits"));
-
-// Command line option to enable marking outline functions with Cold Calling
-// Convention. The default is false:
-static cl::opt<bool>
- MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
- cl::desc("Mark outline function calls with ColdCC"));
-
-// This is an option used by testing:
-static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
- cl::init(false), cl::ZeroOrMore,
- cl::ReallyHidden,
- cl::desc("Skip Cost Analysis"));
-// Used to determine if a cold region is worth outlining based on
-// its inlining cost compared to the original function. Default is set at 10%.
-// ie. if the cold region reduces the inlining cost of the original function by
-// at least 10%.
-static cl::opt<float> MinRegionSizeRatio(
- "min-region-size-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum ratio comparing relative sizes of each "
- "outline candidate and original function"));
-// Used to tune the minimum number of execution counts needed in the predecessor
-// block to the cold edge. ie. confidence interval.
-static cl::opt<unsigned>
- MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
- cl::desc("Minimum block executions to consider "
- "its BranchProbabilityInfo valid"));
-// Used to determine when an edge is considered cold. Default is set to 10%. ie.
-// if the branch probability is 10% or less, then it is deemed as 'cold'.
-static cl::opt<float> ColdBranchRatio(
- "cold-branch-ratio", cl::init(0.1), cl::Hidden,
- cl::desc("Minimum BranchProbability to consider a region cold."));
-
-static cl::opt<unsigned> MaxNumInlineBlocks(
- "max-num-inline-blocks", cl::init(5), cl::Hidden,
- cl::desc("Max number of blocks to be partially inlined"));
-
-// Command line option to set the maximum number of partial inlining allowed
-// for the module. The default value of -1 means no limit.
-static cl::opt<int> MaxNumPartialInlining(
- "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of partial inlining. The default is unlimited"));
-
-// Used only when PGO or user annotated branch data is absent. It is
-// the least value that is used to weigh the outline region. If BFI
-// produces larger value, the BFI value will be used.
-static cl::opt<int>
- OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Relative frequency of outline region to "
- "the entry block"));
-
-static cl::opt<unsigned> ExtraOutliningPenalty(
- "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
- cl::desc("A debug option to add additional penalty to the computed one."));
-
-namespace {
-
-struct FunctionOutliningInfo {
- FunctionOutliningInfo() = default;
-
- // Returns the number of blocks to be inlined including all blocks
- // in Entries and one return block.
+//===- PartialInlining.cpp - Inline parts of functions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs partial inlining, typically by inlining an if statement
+// that surrounds the body of the function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PartialInlining.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partial-inlining"
+
+STATISTIC(NumPartialInlined,
+ "Number of callsites functions partially inlined into.");
+STATISTIC(NumColdOutlinePartialInlined, "Number of times functions with "
+ "cold outlined regions were partially "
+ "inlined into its caller(s).");
+STATISTIC(NumColdRegionsFound,
+ "Number of cold single entry/exit regions found.");
+STATISTIC(NumColdRegionsOutlined,
+ "Number of cold single entry/exit regions outlined.");
+
+// Command line option to disable partial-inlining. The default is false:
+static cl::opt<bool>
+ DisablePartialInlining("disable-partial-inlining", cl::init(false),
+ cl::Hidden, cl::desc("Disable partial inlining"));
+// Command line option to disable multi-region partial-inlining. The default is
+// false:
+static cl::opt<bool> DisableMultiRegionPartialInline(
+ "disable-mr-partial-inlining", cl::init(false), cl::Hidden,
+ cl::desc("Disable multi-region partial inlining"));
+
+// Command line option to force outlining in regions with live exit variables.
+// The default is false:
+static cl::opt<bool>
+ ForceLiveExit("pi-force-live-exit-outline", cl::init(false), cl::Hidden,
+ cl::desc("Force outline regions with live exits"));
+
+// Command line option to enable marking outline functions with Cold Calling
+// Convention. The default is false:
+static cl::opt<bool>
+ MarkOutlinedColdCC("pi-mark-coldcc", cl::init(false), cl::Hidden,
+ cl::desc("Mark outline function calls with ColdCC"));
+
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+ cl::init(false), cl::ZeroOrMore,
+ cl::ReallyHidden,
+ cl::desc("Skip Cost Analysis"));
+// Used to determine if a cold region is worth outlining based on
+// its inlining cost compared to the original function. Default is set at 10%.
+// ie. if the cold region reduces the inlining cost of the original function by
+// at least 10%.
+static cl::opt<float> MinRegionSizeRatio(
+ "min-region-size-ratio", cl::init(0.1), cl::Hidden,
+ cl::desc("Minimum ratio comparing relative sizes of each "
+ "outline candidate and original function"));
+// Used to tune the minimum number of execution counts needed in the predecessor
+// block to the cold edge. ie. confidence interval.
+static cl::opt<unsigned>
+ MinBlockCounterExecution("min-block-execution", cl::init(100), cl::Hidden,
+ cl::desc("Minimum block executions to consider "
+ "its BranchProbabilityInfo valid"));
+// Used to determine when an edge is considered cold. Default is set to 10%. ie.
+// if the branch probability is 10% or less, then it is deemed as 'cold'.
+static cl::opt<float> ColdBranchRatio(
+ "cold-branch-ratio", cl::init(0.1), cl::Hidden,
+ cl::desc("Minimum BranchProbability to consider a region cold."));
+
+static cl::opt<unsigned> MaxNumInlineBlocks(
+ "max-num-inline-blocks", cl::init(5), cl::Hidden,
+ cl::desc("Max number of blocks to be partially inlined"));
+
+// Command line option to set the maximum number of partial inlining allowed
+// for the module. The default value of -1 means no limit.
+static cl::opt<int> MaxNumPartialInlining(
+ "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of partial inlining. The default is unlimited"));
+
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces larger value, the BFI value will be used.
+static cl::opt<int>
+ OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Relative frequency of outline region to "
+ "the entry block"));
+
+static cl::opt<unsigned> ExtraOutliningPenalty(
+ "partial-inlining-extra-penalty", cl::init(0), cl::Hidden,
+ cl::desc("A debug option to add additional penalty to the computed one."));
+
+namespace {
+
+struct FunctionOutliningInfo {
+ FunctionOutliningInfo() = default;
+
+ // Returns the number of blocks to be inlined including all blocks
+ // in Entries and one return block.
unsigned getNumInlinedBlocks() const { return Entries.size() + 1; }
-
- // A set of blocks including the function entry that guard
- // the region to be outlined.
- SmallVector<BasicBlock *, 4> Entries;
-
- // The return block that is not included in the outlined region.
- BasicBlock *ReturnBlock = nullptr;
-
- // The dominating block of the region to be outlined.
- BasicBlock *NonReturnBlock = nullptr;
-
- // The set of blocks in Entries that are predecessors to ReturnBlock
- SmallVector<BasicBlock *, 4> ReturnBlockPreds;
-};
-
-struct FunctionOutliningMultiRegionInfo {
- FunctionOutliningMultiRegionInfo()
- : ORI() {}
-
- // Container for outline regions
- struct OutlineRegionInfo {
- OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
- BasicBlock *EntryBlock, BasicBlock *ExitBlock,
- BasicBlock *ReturnBlock)
- : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
- ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
- SmallVector<BasicBlock *, 8> Region;
- BasicBlock *EntryBlock;
- BasicBlock *ExitBlock;
- BasicBlock *ReturnBlock;
- };
-
- SmallVector<OutlineRegionInfo, 4> ORI;
-};
-
-struct PartialInlinerImpl {
-
- PartialInlinerImpl(
- function_ref<AssumptionCache &(Function &)> GetAC,
- function_ref<AssumptionCache *(Function &)> LookupAC,
- function_ref<TargetTransformInfo &(Function &)> GTTI,
- function_ref<const TargetLibraryInfo &(Function &)> GTLI,
- ProfileSummaryInfo &ProfSI,
- function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
- : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
- GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
-
- bool run(Module &M);
- // Main part of the transformation that calls helper functions to find
- // outlining candidates, clone & outline the function, and attempt to
- // partially inline the resulting function. Returns true if
- // inlining was successful, false otherwise. Also returns the outline
- // function (only if we partially inlined early returns) as there is a
- // possibility to further "peel" early return statements that were left in the
- // outline function due to code size.
+
+ // A set of blocks including the function entry that guard
+ // the region to be outlined.
+ SmallVector<BasicBlock *, 4> Entries;
+
+ // The return block that is not included in the outlined region.
+ BasicBlock *ReturnBlock = nullptr;
+
+ // The dominating block of the region to be outlined.
+ BasicBlock *NonReturnBlock = nullptr;
+
+ // The set of blocks in Entries that are predecessors to ReturnBlock
+ SmallVector<BasicBlock *, 4> ReturnBlockPreds;
+};
+
+struct FunctionOutliningMultiRegionInfo {
+ FunctionOutliningMultiRegionInfo()
+ : ORI() {}
+
+ // Container for outline regions
+ struct OutlineRegionInfo {
+ OutlineRegionInfo(ArrayRef<BasicBlock *> Region,
+ BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+ BasicBlock *ReturnBlock)
+ : Region(Region.begin(), Region.end()), EntryBlock(EntryBlock),
+ ExitBlock(ExitBlock), ReturnBlock(ReturnBlock) {}
+ SmallVector<BasicBlock *, 8> Region;
+ BasicBlock *EntryBlock;
+ BasicBlock *ExitBlock;
+ BasicBlock *ReturnBlock;
+ };
+
+ SmallVector<OutlineRegionInfo, 4> ORI;
+};
+
+struct PartialInlinerImpl {
+
+ PartialInlinerImpl(
+ function_ref<AssumptionCache &(Function &)> GetAC,
+ function_ref<AssumptionCache *(Function &)> LookupAC,
+ function_ref<TargetTransformInfo &(Function &)> GTTI,
+ function_ref<const TargetLibraryInfo &(Function &)> GTLI,
+ ProfileSummaryInfo &ProfSI,
+ function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
+ : GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
+ GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
+
+ bool run(Module &M);
+ // Main part of the transformation that calls helper functions to find
+ // outlining candidates, clone & outline the function, and attempt to
+ // partially inline the resulting function. Returns true if
+ // inlining was successful, false otherwise. Also returns the outline
+ // function (only if we partially inlined early returns) as there is a
+ // possibility to further "peel" early return statements that were left in the
+ // outline function due to code size.
std::pair<bool, Function *> unswitchFunction(Function &F);
-
- // This class speculatively clones the function to be partial inlined.
- // At the end of partial inlining, the remaining callsites to the cloned
- // function that are not partially inlined will be fixed up to reference
- // the original function, and the cloned function will be erased.
- struct FunctionCloner {
- // Two constructors, one for single region outlining, the other for
- // multi-region outlining.
- FunctionCloner(Function *F, FunctionOutliningInfo *OI,
- OptimizationRemarkEmitter &ORE,
+
+ // This class speculatively clones the function to be partial inlined.
+ // At the end of partial inlining, the remaining callsites to the cloned
+ // function that are not partially inlined will be fixed up to reference
+ // the original function, and the cloned function will be erased.
+ struct FunctionCloner {
+ // Two constructors, one for single region outlining, the other for
+ // multi-region outlining.
+ FunctionCloner(Function *F, FunctionOutliningInfo *OI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
- FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
- OptimizationRemarkEmitter &ORE,
+ FunctionCloner(Function *F, FunctionOutliningMultiRegionInfo *OMRI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI);
- ~FunctionCloner();
-
- // Prepare for function outlining: making sure there is only
- // one incoming edge from the extracted/outlined region to
- // the return block.
+ ~FunctionCloner();
+
+ // Prepare for function outlining: making sure there is only
+ // one incoming edge from the extracted/outlined region to
+ // the return block.
void normalizeReturnBlock() const;
-
- // Do function outlining for cold regions.
- bool doMultiRegionFunctionOutlining();
- // Do function outlining for region after early return block(s).
- // NOTE: For vararg functions that do the vararg handling in the outlined
- // function, we temporarily generate IR that does not properly
- // forward varargs to the outlined function. Calling InlineFunction
- // will update calls to the outlined functions to properly forward
- // the varargs.
- Function *doSingleRegionFunctionOutlining();
-
- Function *OrigFunc = nullptr;
- Function *ClonedFunc = nullptr;
-
- typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
- // Keep track of Outlined Functions and the basic block they're called from.
- SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
-
- // ClonedFunc is inlined in one of its callers after function
- // outlining.
- bool IsFunctionInlined = false;
- // The cost of the region to be outlined.
- int OutlinedRegionCost = 0;
- // ClonedOI is specific to outlining non-early return blocks.
- std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
- // ClonedOMRI is specific to outlining cold regions.
- std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
- std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
- OptimizationRemarkEmitter &ORE;
- function_ref<AssumptionCache *(Function &)> LookupAC;
+
+ // Do function outlining for cold regions.
+ bool doMultiRegionFunctionOutlining();
+ // Do function outlining for region after early return block(s).
+ // NOTE: For vararg functions that do the vararg handling in the outlined
+ // function, we temporarily generate IR that does not properly
+ // forward varargs to the outlined function. Calling InlineFunction
+ // will update calls to the outlined functions to properly forward
+ // the varargs.
+ Function *doSingleRegionFunctionOutlining();
+
+ Function *OrigFunc = nullptr;
+ Function *ClonedFunc = nullptr;
+
+ typedef std::pair<Function *, BasicBlock *> FuncBodyCallerPair;
+ // Keep track of Outlined Functions and the basic block they're called from.
+ SmallVector<FuncBodyCallerPair, 4> OutlinedFunctions;
+
+ // ClonedFunc is inlined in one of its callers after function
+ // outlining.
+ bool IsFunctionInlined = false;
+ // The cost of the region to be outlined.
+ int OutlinedRegionCost = 0;
+ // ClonedOI is specific to outlining non-early return blocks.
+ std::unique_ptr<FunctionOutliningInfo> ClonedOI = nullptr;
+ // ClonedOMRI is specific to outlining cold regions.
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> ClonedOMRI = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> ClonedFuncBFI = nullptr;
+ OptimizationRemarkEmitter &ORE;
+ function_ref<AssumptionCache *(Function &)> LookupAC;
function_ref<TargetTransformInfo &(Function &)> GetTTI;
- };
-
-private:
- int NumPartialInlining = 0;
- function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
- function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
- function_ref<TargetTransformInfo &(Function &)> GetTTI;
- function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
- function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
- ProfileSummaryInfo &PSI;
-
- // Return the frequency of the OutliningCallBB relative to F's entry point.
- // The result is no larger than 1 and is represented using BP.
- // (Note that the outlined region's 'head' block can only have incoming
- // edges from the guarding entry blocks).
+ };
+
+private:
+ int NumPartialInlining = 0;
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
+ function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
+ function_ref<TargetTransformInfo &(Function &)> GetTTI;
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
+ ProfileSummaryInfo &PSI;
+
+ // Return the frequency of the OutliningCallBB relative to F's entry point.
+ // The result is no larger than 1 and is represented using BP.
+ // (Note that the outlined region's 'head' block can only have incoming
+ // edges from the guarding entry blocks).
BranchProbability
getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) const;
-
- // Return true if the callee of CB should be partially inlined with
- // profit.
- bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost,
+
+ // Return true if the callee of CB should be partially inlined with
+ // profit.
+ bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
+ BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE) const;
-
- // Try to inline DuplicateFunction (cloned from F with a call to
- // the OutlinedFunction) into its callers. Return true
- // if there is any successful inlining.
- bool tryPartialInline(FunctionCloner &Cloner);
-
- // Compute the mapping from each use site of DuplicateFunction to the enclosing
- // BB's profile count.
+
+ // Try to inline DuplicateFunction (cloned from F with a call to
+ // the OutlinedFunction) into its callers. Return true
+ // if there is any successful inlining.
+ bool tryPartialInline(FunctionCloner &Cloner);
+
+ // Compute the mapping from each use site of DuplicateFunction to the enclosing
+ // BB's profile count.
void
computeCallsiteToProfCountMap(Function *DuplicateFunction,
DenseMap<User *, uint64_t> &SiteCountMap) const;
-
+
bool isLimitReached() const {
- return (MaxNumPartialInlining != -1 &&
- NumPartialInlining >= MaxNumPartialInlining);
- }
-
- static CallBase *getSupportedCallBase(User *U) {
- if (isa<CallInst>(U) || isa<InvokeInst>(U))
- return cast<CallBase>(U);
- llvm_unreachable("All uses must be calls");
- return nullptr;
- }
-
+ return (MaxNumPartialInlining != -1 &&
+ NumPartialInlining >= MaxNumPartialInlining);
+ }
+
+ static CallBase *getSupportedCallBase(User *U) {
+ if (isa<CallInst>(U) || isa<InvokeInst>(U))
+ return cast<CallBase>(U);
+ llvm_unreachable("All uses must be calls");
+ return nullptr;
+ }
+
static CallBase *getOneCallSiteTo(Function &F) {
User *User = *F.user_begin();
- return getSupportedCallBase(User);
- }
-
+ return getSupportedCallBase(User);
+ }
+
std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function &F) const {
- CallBase *CB = getOneCallSiteTo(F);
- DebugLoc DLoc = CB->getDebugLoc();
- BasicBlock *Block = CB->getParent();
- return std::make_tuple(DLoc, Block);
- }
-
- // Returns the costs associated with function outlining:
- // - The first value is the non-weighted runtime cost for making the call
- //   to the outlined function, including the additional setup cost in the
- // outlined function itself;
- // - The second value is the estimated size of the new call sequence in
- // basic block Cloner.OutliningCallBB;
+ CallBase *CB = getOneCallSiteTo(F);
+ DebugLoc DLoc = CB->getDebugLoc();
+ BasicBlock *Block = CB->getParent();
+ return std::make_tuple(DLoc, Block);
+ }
+
+ // Returns the costs associated with function outlining:
+ // - The first value is the non-weighted runtime cost for making the call
+ //   to the outlined function, including the additional setup cost in the
+ // outlined function itself;
+ // - The second value is the estimated size of the new call sequence in
+ // basic block Cloner.OutliningCallBB;
std::tuple<int, int> computeOutliningCosts(FunctionCloner &Cloner) const;
-
- // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
- // approximate both the size and runtime cost (Note that in the current
- // inline cost analysis, there is no clear distinction there either).
+
+ // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+ // approximate both the size and runtime cost (Note that in the current
+ // inline cost analysis, there is no clear distinction there either).
static int computeBBInlineCost(BasicBlock *BB, TargetTransformInfo *TTI);
-
+
std::unique_ptr<FunctionOutliningInfo>
computeOutliningInfo(Function &F) const;
- std::unique_ptr<FunctionOutliningMultiRegionInfo>
+ std::unique_ptr<FunctionOutliningMultiRegionInfo>
computeOutliningColdRegionsInfo(Function &F,
OptimizationRemarkEmitter &ORE) const;
-};
-
-struct PartialInlinerLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
-
- PartialInlinerLegacyPass() : ModulePass(ID) {
- initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
- TargetTransformInfoWrapperPass *TTIWP =
- &getAnalysis<TargetTransformInfoWrapperPass>();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- };
-
- auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
- return ACT->lookupAssumptionCache(F);
- };
-
- auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- };
-
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
-
- return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI)
- .run(M);
- }
-};
-
-} // end anonymous namespace
-
-std::unique_ptr<FunctionOutliningMultiRegionInfo>
+};
+
+struct PartialInlinerLegacyPass : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PartialInlinerLegacyPass() : ModulePass(ID) {
+ initializePartialInlinerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
+ TargetTransformInfoWrapperPass *TTIWP =
+ &getAnalysis<TargetTransformInfoWrapperPass>();
+ ProfileSummaryInfo &PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ };
+
+ auto LookupAssumptionCache = [ACT](Function &F) -> AssumptionCache * {
+ return ACT->lookupAssumptionCache(F);
+ };
+
+ auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ };
+
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI)
+ .run(M);
+ }
+};
+
+} // end anonymous namespace
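As a point of reference (not part of the diff): the new-pass-manager wrapper, PartialInlinerPass from llvm/Transforms/IPO/PartialInlining.h, forwards to the same PartialInlinerImpl::run. A minimal sketch of driving it directly, assuming a standalone analysis setup:

  #include "llvm/IR/Module.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/IPO/PartialInlining.h"

  // Hypothetical driver: run partial inlining on its own via the new pass manager.
  void runPartialInlinerOn(llvm::Module &M) {
    llvm::PassBuilder PB;
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::ModulePassManager MPM;
    MPM.addPass(llvm::PartialInlinerPass()); // pulls ProfileSummary, TTI, etc. from the managers
    MPM.run(M, MAM);
  }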
+
+std::unique_ptr<FunctionOutliningMultiRegionInfo>
PartialInlinerImpl::computeOutliningColdRegionsInfo(
Function &F, OptimizationRemarkEmitter &ORE) const {
BasicBlock *EntryBlock = &F.front();
-
+
DominatorTree DT(F);
- LoopInfo LI(DT);
+ LoopInfo LI(DT);
BranchProbabilityInfo BPI(F, LI);
- std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
- BlockFrequencyInfo *BFI;
- if (!GetBFI) {
+ std::unique_ptr<BlockFrequencyInfo> ScopedBFI;
+ BlockFrequencyInfo *BFI;
+ if (!GetBFI) {
ScopedBFI.reset(new BlockFrequencyInfo(F, BPI, LI));
- BFI = ScopedBFI.get();
- } else
+ BFI = ScopedBFI.get();
+ } else
BFI = &(GetBFI(F));
-
- // Return if we don't have profiling information.
- if (!PSI.hasInstrumentationProfile())
- return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
-
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
- std::make_unique<FunctionOutliningMultiRegionInfo>();
-
- auto IsSingleExit =
- [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
- BasicBlock *ExitBlock = nullptr;
- for (auto *Block : BlockList) {
- for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
- if (!is_contained(BlockList, *SI)) {
- if (ExitBlock) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
- &SI->front())
- << "Region dominated by "
- << ore::NV("Block", BlockList.front()->getName())
- << " has more than one region exit edge.";
- });
- return nullptr;
+
+ // Return if we don't have profiling information.
+ if (!PSI.hasInstrumentationProfile())
+ return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
+
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
+ std::make_unique<FunctionOutliningMultiRegionInfo>();
+
+ auto IsSingleExit =
+ [&ORE](SmallVectorImpl<BasicBlock *> &BlockList) -> BasicBlock * {
+ BasicBlock *ExitBlock = nullptr;
+ for (auto *Block : BlockList) {
+ for (auto SI = succ_begin(Block); SI != succ_end(Block); ++SI) {
+ if (!is_contained(BlockList, *SI)) {
+ if (ExitBlock) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiExitRegion",
+ &SI->front())
+ << "Region dominated by "
+ << ore::NV("Block", BlockList.front()->getName())
+ << " has more than one region exit edge.";
+ });
+ return nullptr;
}
ExitBlock = Block;
- }
- }
- }
- return ExitBlock;
- };
-
- auto BBProfileCount = [BFI](BasicBlock *BB) {
- return BFI->getBlockProfileCount(BB)
- ? BFI->getBlockProfileCount(BB).getValue()
- : 0;
- };
-
- // Use the same computeBBInlineCost function to compute the cost savings of
- // outlining the candidate region.
+ }
+ }
+ }
+ return ExitBlock;
+ };
+
+ auto BBProfileCount = [BFI](BasicBlock *BB) {
+ return BFI->getBlockProfileCount(BB)
+ ? BFI->getBlockProfileCount(BB).getValue()
+ : 0;
+ };
+
+ // Use the same computeBBInlineCost function to compute the cost savings of
+ // outlining the candidate region.
TargetTransformInfo *FTTI = &GetTTI(F);
- int OverallFunctionCost = 0;
+ int OverallFunctionCost = 0;
for (auto &BB : F)
OverallFunctionCost += computeBBInlineCost(&BB, FTTI);
-
+
LLVM_DEBUG(dbgs() << "OverallFunctionCost = " << OverallFunctionCost
<< "\n";);
- int MinOutlineRegionCost =
- static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
- BranchProbability MinBranchProbability(
- static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
- MinBlockCounterExecution);
- bool ColdCandidateFound = false;
- BasicBlock *CurrEntry = EntryBlock;
- std::vector<BasicBlock *> DFS;
- DenseMap<BasicBlock *, bool> VisitedMap;
- DFS.push_back(CurrEntry);
- VisitedMap[CurrEntry] = true;
-
- // Use Depth First Search on the basic blocks to find CFG edges that are
- // considered cold.
- // Cold regions considered must also have their inline cost compared to the
- // overall inline cost of the original function. The region is outlined only
- // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
- // more.
- while (!DFS.empty()) {
+ int MinOutlineRegionCost =
+ static_cast<int>(OverallFunctionCost * MinRegionSizeRatio);
+ BranchProbability MinBranchProbability(
+ static_cast<int>(ColdBranchRatio * MinBlockCounterExecution),
+ MinBlockCounterExecution);
+ bool ColdCandidateFound = false;
+ BasicBlock *CurrEntry = EntryBlock;
+ std::vector<BasicBlock *> DFS;
+ DenseMap<BasicBlock *, bool> VisitedMap;
+ DFS.push_back(CurrEntry);
+ VisitedMap[CurrEntry] = true;
+
+ // Use Depth First Search on the basic blocks to find CFG edges that are
+ // considered cold.
+ // Cold regions considered must also have their inline cost compared to the
+ // overall inline cost of the original function. The region is outlined only
+ // if it reduces the inline cost of the function by 'MinOutlineRegionCost' or
+ // more.
+ while (!DFS.empty()) {
auto *ThisBB = DFS.back();
- DFS.pop_back();
- // Only consider regions with predecessor blocks that are considered
- // not-cold (default: part of the top 99.99% of all block counters)
- // AND greater than our minimum block execution count (default: 100).
+ DFS.pop_back();
+ // Only consider regions with predecessor blocks that are considered
+ // not-cold (default: part of the top 99.99% of all block counters)
+ // AND greater than our minimum block execution count (default: 100).
if (PSI.isColdBlock(ThisBB, BFI) ||
BBProfileCount(ThisBB) < MinBlockCounterExecution)
- continue;
+ continue;
for (auto SI = succ_begin(ThisBB); SI != succ_end(ThisBB); ++SI) {
- if (VisitedMap[*SI])
- continue;
- VisitedMap[*SI] = true;
- DFS.push_back(*SI);
- // If branch isn't cold, we skip to the next one.
+ if (VisitedMap[*SI])
+ continue;
+ VisitedMap[*SI] = true;
+ DFS.push_back(*SI);
+ // If branch isn't cold, we skip to the next one.
BranchProbability SuccProb = BPI.getEdgeProbability(ThisBB, *SI);
- if (SuccProb > MinBranchProbability)
- continue;
+ if (SuccProb > MinBranchProbability)
+ continue;
LLVM_DEBUG(dbgs() << "Found cold edge: " << ThisBB->getName() << "->"
<< SI->getName()
<< "\nBranch Probability = " << SuccProb << "\n";);
- SmallVector<BasicBlock *, 8> DominateVector;
- DT.getDescendants(*SI, DominateVector);
+ SmallVector<BasicBlock *, 8> DominateVector;
+ DT.getDescendants(*SI, DominateVector);
assert(!DominateVector.empty() &&
"SI should be reachable and have at least itself as descendant");
- // We can only outline single entry regions (for now).
+ // We can only outline single entry regions (for now).
if (!DominateVector.front()->hasNPredecessors(1)) {
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
<< " doesn't have a single predecessor in the "
"dominator tree\n";);
- continue;
+ continue;
}
- BasicBlock *ExitBlock = nullptr;
- // We can only outline single exit regions (for now).
+ BasicBlock *ExitBlock = nullptr;
+ // We can only outline single exit regions (for now).
if (!(ExitBlock = IsSingleExit(DominateVector))) {
LLVM_DEBUG(dbgs() << "ABORT: Block " << SI->getName()
<< " doesn't have a unique successor\n";);
- continue;
+ continue;
}
- int OutlineRegionCost = 0;
- for (auto *BB : DominateVector)
+ int OutlineRegionCost = 0;
+ for (auto *BB : DominateVector)
OutlineRegionCost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
-
+
LLVM_DEBUG(dbgs() << "OutlineRegionCost = " << OutlineRegionCost
<< "\n";);
-
+
if (!SkipCostAnalysis && OutlineRegionCost < MinOutlineRegionCost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
- &SI->front())
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly",
+ &SI->front())
<< ore::NV("Callee", &F)
<< " inline cost-savings smaller than "
- << ore::NV("Cost", MinOutlineRegionCost);
- });
+ << ore::NV("Cost", MinOutlineRegionCost);
+ });
LLVM_DEBUG(dbgs() << "ABORT: Outline region cost is smaller than "
<< MinOutlineRegionCost << "\n";);
- continue;
- }
-
- // For now, ignore blocks that belong to a SISE region that is a
- // candidate for outlining. In the future, we may want to look
- // at inner regions because the outer region may have live-exit
- // variables.
- for (auto *BB : DominateVector)
- VisitedMap[BB] = true;
-
- // ReturnBlock here means the block after the outline call
- BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
- DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
- OutliningInfo->ORI.push_back(RegInfo);
+ continue;
+ }
+
+ // For now, ignore blocks that belong to a SISE region that is a
+ // candidate for outlining. In the future, we may want to look
+ // at inner regions because the outer region may have live-exit
+ // variables.
+ for (auto *BB : DominateVector)
+ VisitedMap[BB] = true;
+
+ // ReturnBlock here means the block after the outline call
+ BasicBlock *ReturnBlock = ExitBlock->getSingleSuccessor();
+ FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegInfo(
+ DominateVector, DominateVector.front(), ExitBlock, ReturnBlock);
+ OutliningInfo->ORI.push_back(RegInfo);
LLVM_DEBUG(dbgs() << "Found Cold Candidate starting at block: "
<< DominateVector.front()->getName() << "\n";);
- ColdCandidateFound = true;
- NumColdRegionsFound++;
- }
- }
+ ColdCandidateFound = true;
+ NumColdRegionsFound++;
+ }
+ }
- if (ColdCandidateFound)
- return OutliningInfo;
+ if (ColdCandidateFound)
+ return OutliningInfo;
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
-}
-
-std::unique_ptr<FunctionOutliningInfo>
+}
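To make the default thresholds used above concrete, a small self-contained illustration (the overall cost number is hypothetical; only the arithmetic mirrors the code):

  #include <cstdio>

  int main() {
    // Hypothetical function cost; the cl::opt defaults above are 0.1, 0.1 and 100.
    const int      OverallFunctionCost      = 400; // sum of computeBBInlineCost over F
    const float    MinRegionSizeRatio       = 0.1f;
    const float    ColdBranchRatio          = 0.1f;
    const unsigned MinBlockCounterExecution = 100;

    // A region is only worth outlining if it carries at least 10% of the cost.
    int MinOutlineRegionCost =
        static_cast<int>(OverallFunctionCost * MinRegionSizeRatio); // 40

    // An edge is "cold" if its branch probability is at most 10/100 = 10%,
    // and only predecessor blocks executed at least 100 times are considered.
    int MinBranchProbNumerator =
        static_cast<int>(ColdBranchRatio * MinBlockCounterExecution); // 10

    std::printf("min region cost %d, cold edge <= %d/%u\n",
                MinOutlineRegionCost, MinBranchProbNumerator,
                MinBlockCounterExecution);
    return 0;
  }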
+
+std::unique_ptr<FunctionOutliningInfo>
PartialInlinerImpl::computeOutliningInfo(Function &F) const {
BasicBlock *EntryBlock = &F.front();
- BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
- if (!BR || BR->isUnconditional())
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Returns true if Succ is BB's successor
- auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
- return is_contained(successors(BB), Succ);
- };
-
- auto IsReturnBlock = [](BasicBlock *BB) {
- Instruction *TI = BB->getTerminator();
- return isa<ReturnInst>(TI);
- };
-
- auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsReturnBlock(Succ1))
- return std::make_tuple(Succ1, Succ2);
- if (IsReturnBlock(Succ2))
- return std::make_tuple(Succ2, Succ1);
-
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
-
- // Detect a triangular shape:
- auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
- if (IsSuccessor(Succ1, Succ2))
- return std::make_tuple(Succ1, Succ2);
- if (IsSuccessor(Succ2, Succ1))
- return std::make_tuple(Succ2, Succ1);
-
- return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
- };
-
- std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
- std::make_unique<FunctionOutliningInfo>();
-
- BasicBlock *CurrEntry = EntryBlock;
- bool CandidateFound = false;
- do {
- // The number of blocks to be inlined has already reached
- // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
- // disables partial inlining for the function.
+ BranchInst *BR = dyn_cast<BranchInst>(EntryBlock->getTerminator());
+ if (!BR || BR->isUnconditional())
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Returns true if Succ is BB's successor
+ auto IsSuccessor = [](BasicBlock *Succ, BasicBlock *BB) {
+ return is_contained(successors(BB), Succ);
+ };
+
+ auto IsReturnBlock = [](BasicBlock *BB) {
+ Instruction *TI = BB->getTerminator();
+ return isa<ReturnInst>(TI);
+ };
+
+ auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsReturnBlock(Succ1))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsReturnBlock(Succ2))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ // Detect a triangular shape:
+ auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
+ if (IsSuccessor(Succ1, Succ2))
+ return std::make_tuple(Succ1, Succ2);
+ if (IsSuccessor(Succ2, Succ1))
+ return std::make_tuple(Succ2, Succ1);
+
+ return std::make_tuple<BasicBlock *, BasicBlock *>(nullptr, nullptr);
+ };
+
+ std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
+ std::make_unique<FunctionOutliningInfo>();
+
+ BasicBlock *CurrEntry = EntryBlock;
+ bool CandidateFound = false;
+ do {
+ // The number of blocks to be inlined has already reached
+ // the limit. When MaxNumInlineBlocks is set to 0 or 1, this
+ // disables partial inlining for the function.
if (OutliningInfo->getNumInlinedBlocks() >= MaxNumInlineBlocks)
- break;
-
- if (succ_size(CurrEntry) != 2)
- break;
-
- BasicBlock *Succ1 = *succ_begin(CurrEntry);
- BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
-
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
-
- if (ReturnBlock) {
- OutliningInfo->Entries.push_back(CurrEntry);
- OutliningInfo->ReturnBlock = ReturnBlock;
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- CandidateFound = true;
- break;
- }
-
+ break;
+
+ if (succ_size(CurrEntry) != 2)
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(CurrEntry);
+ BasicBlock *Succ2 = *(succ_begin(CurrEntry) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+
+ if (ReturnBlock) {
+ OutliningInfo->Entries.push_back(CurrEntry);
+ OutliningInfo->ReturnBlock = ReturnBlock;
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ CandidateFound = true;
+ break;
+ }
+
BasicBlock *CommSucc, *OtherSucc;
- std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
-
- if (!CommSucc)
- break;
-
- OutliningInfo->Entries.push_back(CurrEntry);
- CurrEntry = OtherSucc;
- } while (true);
-
- if (!CandidateFound)
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Do sanity check of the entries: there should not
- // be any successors (not in the entry set) other than
- // {ReturnBlock, NonReturnBlock}
+ std::tie(CommSucc, OtherSucc) = GetCommonSucc(Succ1, Succ2);
+
+ if (!CommSucc)
+ break;
+
+ OutliningInfo->Entries.push_back(CurrEntry);
+ CurrEntry = OtherSucc;
+ } while (true);
+
+ if (!CandidateFound)
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Do sanity check of the entries: there should not
+ // be any successors (not in the entry set) other than
+ // {ReturnBlock, NonReturnBlock}
assert(OutliningInfo->Entries[0] == &F.front() &&
- "Function Entry must be the first in Entries vector");
- DenseSet<BasicBlock *> Entries;
- for (BasicBlock *E : OutliningInfo->Entries)
- Entries.insert(E);
-
- // Returns true if BB has a predecessor which is not
- // in the Entries set.
- auto HasNonEntryPred = [Entries](BasicBlock *BB) {
+ "Function Entry must be the first in Entries vector");
+ DenseSet<BasicBlock *> Entries;
+ for (BasicBlock *E : OutliningInfo->Entries)
+ Entries.insert(E);
+
+ // Returns true if BB has a predecessor which is not
+ // in the Entries set.
+ auto HasNonEntryPred = [Entries](BasicBlock *BB) {
for (auto *Pred : predecessors(BB)) {
- if (!Entries.count(Pred))
- return true;
- }
- return false;
- };
- auto CheckAndNormalizeCandidate =
- [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
- for (BasicBlock *E : OutliningInfo->Entries) {
+ if (!Entries.count(Pred))
+ return true;
+ }
+ return false;
+ };
+ auto CheckAndNormalizeCandidate =
+ [Entries, HasNonEntryPred](FunctionOutliningInfo *OutliningInfo) {
+ for (BasicBlock *E : OutliningInfo->Entries) {
for (auto *Succ : successors(E)) {
- if (Entries.count(Succ))
- continue;
- if (Succ == OutliningInfo->ReturnBlock)
- OutliningInfo->ReturnBlockPreds.push_back(E);
- else if (Succ != OutliningInfo->NonReturnBlock)
- return false;
- }
- // There should not be any outside incoming edges either:
- if (HasNonEntryPred(E))
- return false;
- }
- return true;
- };
-
- if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
- return std::unique_ptr<FunctionOutliningInfo>();
-
- // Now further growing the candidate's inlining region by
- // peeling off dominating blocks from the outlining region:
+ if (Entries.count(Succ))
+ continue;
+ if (Succ == OutliningInfo->ReturnBlock)
+ OutliningInfo->ReturnBlockPreds.push_back(E);
+ else if (Succ != OutliningInfo->NonReturnBlock)
+ return false;
+ }
+ // There should not be any outside incoming edges either:
+ if (HasNonEntryPred(E))
+ return false;
+ }
+ return true;
+ };
+
+ if (!CheckAndNormalizeCandidate(OutliningInfo.get()))
+ return std::unique_ptr<FunctionOutliningInfo>();
+
+ // Now further growing the candidate's inlining region by
+ // peeling off dominating blocks from the outlining region:
while (OutliningInfo->getNumInlinedBlocks() < MaxNumInlineBlocks) {
- BasicBlock *Cand = OutliningInfo->NonReturnBlock;
- if (succ_size(Cand) != 2)
- break;
-
- if (HasNonEntryPred(Cand))
- break;
-
- BasicBlock *Succ1 = *succ_begin(Cand);
- BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
-
- BasicBlock *ReturnBlock, *NonReturnBlock;
- std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
- if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
- break;
-
- if (NonReturnBlock->getSinglePredecessor() != Cand)
- break;
-
- // Now grow and update OutliningInfo:
- OutliningInfo->Entries.push_back(Cand);
- OutliningInfo->NonReturnBlock = NonReturnBlock;
- OutliningInfo->ReturnBlockPreds.push_back(Cand);
- Entries.insert(Cand);
- }
-
- return OutliningInfo;
-}
-
-// Check if there is PGO data or user annotated branch data:
+ BasicBlock *Cand = OutliningInfo->NonReturnBlock;
+ if (succ_size(Cand) != 2)
+ break;
+
+ if (HasNonEntryPred(Cand))
+ break;
+
+ BasicBlock *Succ1 = *succ_begin(Cand);
+ BasicBlock *Succ2 = *(succ_begin(Cand) + 1);
+
+ BasicBlock *ReturnBlock, *NonReturnBlock;
+ std::tie(ReturnBlock, NonReturnBlock) = GetReturnBlock(Succ1, Succ2);
+ if (!ReturnBlock || ReturnBlock != OutliningInfo->ReturnBlock)
+ break;
+
+ if (NonReturnBlock->getSinglePredecessor() != Cand)
+ break;
+
+    // Now grow and update OutliningInfo:
+ OutliningInfo->Entries.push_back(Cand);
+ OutliningInfo->NonReturnBlock = NonReturnBlock;
+ OutliningInfo->ReturnBlockPreds.push_back(Cand);
+ Entries.insert(Cand);
+ }
+
+ return OutliningInfo;
+}
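For readers less familiar with the single-region heuristic above, here is a hypothetical source-level function with the shape it recognizes: a cheap guard whose taken arm returns, followed by a heavier body that becomes the outlined region (illustrative only).

  #include <cstdio>

  // Entry block: conditional branch; the early-return side is the ReturnBlock,
  // the heavy remainder is the NonReturnBlock region that gets outlined, and
  // only the guard ends up inlined into callers.
  void maybeLog(bool Verbose, const char *Msg) {
    if (!Verbose)
      return;
    std::printf("verbose: %s\n", Msg);
  }

  int main() {
    maybeLog(false, "skipped");
    maybeLog(true, "printed");
    return 0;
  }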
+
+// Check if there is PGO data or user annotated branch data:
static bool hasProfileData(const Function &F, const FunctionOutliningInfo &OI) {
if (F.hasProfileData())
- return true;
- // Now check if any of the entry block has MD_prof data:
+ return true;
+ // Now check if any of the entry block has MD_prof data:
for (auto *E : OI.Entries) {
- BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
- if (!BR || BR->isUnconditional())
- continue;
- uint64_t T, F;
- if (BR->extractProfMetadata(T, F))
- return true;
- }
- return false;
-}
-
+ BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+ if (!BR || BR->isUnconditional())
+ continue;
+ uint64_t T, F;
+ if (BR->extractProfMetadata(T, F))
+ return true;
+ }
+ return false;
+}
+
BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq(
FunctionCloner &Cloner) const {
- BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
- auto EntryFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
- auto OutliningCallFreq =
- Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
- // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
- // we outlined any regions, so we may encounter situations where the
- // OutliningCallFreq is *slightly* bigger than the EntryFreq.
+ BasicBlock *OutliningCallBB = Cloner.OutlinedFunctions.back().second;
+ auto EntryFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(&Cloner.ClonedFunc->getEntryBlock());
+ auto OutliningCallFreq =
+ Cloner.ClonedFuncBFI->getBlockFreq(OutliningCallBB);
+ // FIXME Hackery needed because ClonedFuncBFI is based on the function BEFORE
+ // we outlined any regions, so we may encounter situations where the
+ // OutliningCallFreq is *slightly* bigger than the EntryFreq.
if (OutliningCallFreq.getFrequency() > EntryFreq.getFrequency())
- OutliningCallFreq = EntryFreq;
+ OutliningCallFreq = EntryFreq;
+
+ auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
+ OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
- auto OutlineRegionRelFreq = BranchProbability::getBranchProbability(
- OutliningCallFreq.getFrequency(), EntryFreq.getFrequency());
-
if (hasProfileData(*Cloner.OrigFunc, *Cloner.ClonedOI.get()))
- return OutlineRegionRelFreq;
-
- // When profile data is not available, we need to be conservative in
- // estimating the overall savings. Static branch prediction can usually
- // guess the branch direction right (taken/non-taken), but the guessed
- // branch probability is usually not biased enough. In case when the
- // outlined region is predicted to be likely, its probability needs
- // to be made higher (more biased) to not under-estimate the cost of
- // function outlining. On the other hand, if the outlined region
- // is predicted to be less likely, the predicted probability is usually
- // higher than the actual. For instance, the actual probability of the
- // less likely target is only 5%, but the guessed probability can be
- // 40%. In the latter case, there is no need for further adjustment.
- // FIXME: add an option for this.
- if (OutlineRegionRelFreq < BranchProbability(45, 100))
- return OutlineRegionRelFreq;
-
- OutlineRegionRelFreq = std::max(
- OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
-
- return OutlineRegionRelFreq;
-}
-
-bool PartialInlinerImpl::shouldPartialInline(
- CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
+ return OutlineRegionRelFreq;
+
+ // When profile data is not available, we need to be conservative in
+ // estimating the overall savings. Static branch prediction can usually
+ // guess the branch direction right (taken/non-taken), but the guessed
+ // branch probability is usually not biased enough. In case when the
+ // outlined region is predicted to be likely, its probability needs
+ // to be made higher (more biased) to not under-estimate the cost of
+ // function outlining. On the other hand, if the outlined region
+ // is predicted to be less likely, the predicted probability is usually
+ // higher than the actual. For instance, the actual probability of the
+ // less likely target is only 5%, but the guessed probability can be
+ // 40%. In the latter case, there is no need for further adjustment.
+ // FIXME: add an option for this.
+ if (OutlineRegionRelFreq < BranchProbability(45, 100))
+ return OutlineRegionRelFreq;
+
+ OutlineRegionRelFreq = std::max(
+ OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100));
+
+ return OutlineRegionRelFreq;
+}
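A small illustration of the no-profile adjustment performed above, using hypothetical statically guessed frequencies; only the 45% cutoff and the -outline-region-freq-percent floor mirror the code:

  #include <algorithm>
  #include <cstdio>

  int main() {
    const int OutlineRegionFreqPercent = 75; // cl::opt default above
    // Guessed relative frequency of the outlined call vs. the entry block.
    for (int GuessedPercent : {30, 60, 90}) {
      // Below 45% the guess is trusted; otherwise it is raised to at least 75%
      // so the outlining overhead is not under-estimated without profile data.
      int UsedPercent = (GuessedPercent < 45)
                            ? GuessedPercent
                            : std::max(GuessedPercent, OutlineRegionFreqPercent);
      std::printf("guessed %2d%% -> used %2d%%\n", GuessedPercent, UsedPercent);
    }
    return 0;
  }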
+
+bool PartialInlinerImpl::shouldPartialInline(
+ CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE) const {
- using namespace ore;
-
- Function *Callee = CB.getCalledFunction();
- assert(Callee == Cloner.ClonedFunc);
-
- if (SkipCostAnalysis)
- return isInlineViable(*Callee).isSuccess();
-
- Function *Caller = CB.getCaller();
- auto &CalleeTTI = GetTTI(*Callee);
- bool RemarksEnabled =
- Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
- DEBUG_TYPE);
- InlineCost IC =
- getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
- GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
-
- if (IC.isAlways()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
- << NV("Callee", Cloner.OrigFunc)
- << " should always be fully inlined, not partially";
- });
- return false;
- }
-
- if (IC.isNever()) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller)
- << " because it should never be inlined (cost=never)";
- });
- return false;
- }
-
- if (!IC) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " because too costly to inline (cost="
- << NV("Cost", IC.getCost()) << ", threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return false;
- }
- const DataLayout &DL = Caller->getParent()->getDataLayout();
-
- // The savings of eliminating the call:
- int NonWeightedSavings = getCallsiteCost(CB, DL);
- BlockFrequency NormWeightedSavings(NonWeightedSavings);
-
- // If the weighted savings are smaller than the weighted cost, return false.
- if (NormWeightedSavings < WeightedOutliningRcost) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
- &CB)
- << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
- << NV("Caller", Caller) << " runtime overhead (overhead="
- << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
- << ", savings="
- << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
- << ")"
- << " of making the outlined call is too high";
- });
-
- return false;
- }
-
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
- << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
- << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
- << " (threshold="
- << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
- });
- return true;
-}
-
-// TODO: Ideally we should share Inliner's InlineCost Analysis code.
-// For now use a simplified version. The returned 'InlineCost' will be used
-// to estimate the size cost as well as runtime cost of the BB.
+ using namespace ore;
+
+ Function *Callee = CB.getCalledFunction();
+ assert(Callee == Cloner.ClonedFunc);
+
+ if (SkipCostAnalysis)
+ return isInlineViable(*Callee).isSuccess();
+
+ Function *Caller = CB.getCaller();
+ auto &CalleeTTI = GetTTI(*Callee);
+ bool RemarksEnabled =
+ Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
+ DEBUG_TYPE);
+ InlineCost IC =
+ getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
+ GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
+
+ if (IC.isAlways()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
+ << NV("Callee", Cloner.OrigFunc)
+ << " should always be fully inlined, not partially";
+ });
+ return false;
+ }
+
+ if (IC.isNever()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller)
+ << " because it should never be inlined (cost=never)";
+ });
+ return false;
+ }
+
+ if (!IC) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " because too costly to inline (cost="
+ << NV("Cost", IC.getCost()) << ", threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+ });
+ return false;
+ }
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // The savings of eliminating the call:
+ int NonWeightedSavings = getCallsiteCost(CB, DL);
+ BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+ // If the weighted savings are smaller than the weighted cost, return false.
+ if (NormWeightedSavings < WeightedOutliningRcost) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
+ &CB)
+ << NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
+ << NV("Caller", Caller) << " runtime overhead (overhead="
+ << NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
+ << ", savings="
+ << NV("Savings", (unsigned)NormWeightedSavings.getFrequency())
+ << ")"
+ << " of making the outlined call is too high";
+ });
+
+ return false;
+ }
+
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
+ << NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
+ << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
+ << " (threshold="
+ << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")";
+ });
+ return true;
+}
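An illustrative back-of-the-envelope version of the final profitability check above. All numbers are hypothetical, and the scaling of the outlining overhead by the relative frequency of the outlining call block is done by the caller before this function runs:

  #include <cstdio>

  int main() {
    const unsigned CallsiteSavings          = 25;   // getCallsiteCost(CB, DL): cost removed by inlining
    const unsigned OutliningRuntimeOverhead = 60;   // cost of calling the outlined function
    const double   OutlineRegionRelFreq     = 0.75; // how often the outlined call executes

    const double WeightedOutliningRcost =
        OutliningRuntimeOverhead * OutlineRegionRelFreq;                // 45
    const bool Profitable = CallsiteSavings >= WeightedOutliningRcost;  // 25 < 45 -> no

    std::printf("savings %u vs weighted overhead %.0f -> %s\n",
                CallsiteSavings, WeightedOutliningRcost,
                Profitable ? "partially inline" : "skip this callsite");
    return 0;
  }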
+
+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as runtime cost of the BB.
int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
TargetTransformInfo *TTI) {
- int InlineCost = 0;
- const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- // Skip free instructions.
- switch (I.getOpcode()) {
- case Instruction::BitCast:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::Alloca:
- case Instruction::PHI:
- continue;
- case Instruction::GetElementPtr:
- if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
- continue;
- break;
- default:
- break;
- }
-
- if (I.isLifetimeStartOrEnd())
- continue;
-
+ int InlineCost = 0;
+ const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip free instructions.
+ switch (I.getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Alloca:
+ case Instruction::PHI:
+ continue;
+ case Instruction::GetElementPtr:
+ if (cast<GetElementPtrInst>(&I)->hasAllZeroIndices())
+ continue;
+ break;
+ default:
+ break;
+ }
+
+ if (I.isLifetimeStartOrEnd())
+ continue;
+
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
Intrinsic::ID IID = II->getIntrinsicID();
SmallVector<Type *, 4> Tys;
@@ -890,657 +890,657 @@ int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB,
continue;
}
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- InlineCost += getCallsiteCost(*CI, DL);
- continue;
- }
-
- if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- InlineCost += getCallsiteCost(*II, DL);
- continue;
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
- InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
- continue;
- }
- InlineCost += InlineConstants::InstrCost;
- }
- return InlineCost;
-}
-
-std::tuple<int, int>
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ InlineCost += getCallsiteCost(*CI, DL);
+ continue;
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ InlineCost += getCallsiteCost(*II, DL);
+ continue;
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(&I)) {
+ InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+ continue;
+ }
+ InlineCost += InlineConstants::InstrCost;
+ }
+ return InlineCost;
+}
+
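The per-block cost model above charges nothing for "free" instructions (bitcasts, phis, allocas, zero-index GEPs, lifetime markers), the call-site cost for calls and invokes, (NumCases + 1) * InstrCost for switches, and one InstrCost for everything else. A compact mock of that accumulation in plain C++ (the enum, struct, and the value 5 used for InstrCost are stand-ins, not LLVM definitions):

    #include <vector>

    namespace sketch {
    constexpr int InstrCost = 5; // stand-in for InlineConstants::InstrCost

    enum class Kind { Free, Call, Switch, Other };
    struct Instr { Kind K; int CallsiteCost; int NumSwitchCases; };

    int blockInlineCost(const std::vector<Instr> &Block) {
      int Cost = 0;
      for (const Instr &I : Block) {
        switch (I.K) {
        case Kind::Free:   break;                                      // bitcast, phi, alloca, ...
        case Kind::Call:   Cost += I.CallsiteCost; break;              // calls/invokes
        case Kind::Switch: Cost += (I.NumSwitchCases + 1) * InstrCost; break;
        case Kind::Other:  Cost += InstrCost; break;
        }
      }
      return Cost;
    }
    } // namespace sketch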
+std::tuple<int, int>
PartialInlinerImpl::computeOutliningCosts(FunctionCloner &Cloner) const {
- int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
- for (auto FuncBBPair : Cloner.OutlinedFunctions) {
- Function *OutlinedFunc = FuncBBPair.first;
- BasicBlock* OutliningCallBB = FuncBBPair.second;
- // Now compute the cost of the call sequence to the outlined function
- // 'OutlinedFunction' in BB 'OutliningCallBB':
+ int OutliningFuncCallCost = 0, OutlinedFunctionCost = 0;
+ for (auto FuncBBPair : Cloner.OutlinedFunctions) {
+ Function *OutlinedFunc = FuncBBPair.first;
+ BasicBlock* OutliningCallBB = FuncBBPair.second;
+ // Now compute the cost of the call sequence to the outlined function
+ // 'OutlinedFunction' in BB 'OutliningCallBB':
auto *OutlinedFuncTTI = &GetTTI(*OutlinedFunc);
OutliningFuncCallCost +=
computeBBInlineCost(OutliningCallBB, OutlinedFuncTTI);
-
- // Now compute the cost of the extracted/outlined function itself:
- for (BasicBlock &BB : *OutlinedFunc)
+
+ // Now compute the cost of the extracted/outlined function itself:
+ for (BasicBlock &BB : *OutlinedFunc)
OutlinedFunctionCost += computeBBInlineCost(&BB, OutlinedFuncTTI);
- }
- assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
- "Outlined function cost should be no less than the outlined region");
-
- // The code extractor introduces a new root and exit stub blocks with
- // additional unconditional branches. Those branches will be eliminated
- // later with bb layout. The cost should be adjusted accordingly:
- OutlinedFunctionCost -=
- 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
-
- int OutliningRuntimeOverhead =
- OutliningFuncCallCost +
- (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
- ExtraOutliningPenalty;
-
- return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
-}
-
-// Create the callsite to profile count map which is
-// used to update the original function's entry count,
-// after the function is partially inlined into the callsite.
-void PartialInlinerImpl::computeCallsiteToProfCountMap(
- Function *DuplicateFunction,
+ }
+ assert(OutlinedFunctionCost >= Cloner.OutlinedRegionCost &&
+ "Outlined function cost should be no less than the outlined region");
+
+ // The code extractor introduces a new root and exit stub blocks with
+ // additional unconditional branches. Those branches will be eliminated
+ // later with bb layout. The cost should be adjusted accordingly:
+ OutlinedFunctionCost -=
+ 2 * InlineConstants::InstrCost * Cloner.OutlinedFunctions.size();
+
+ int OutliningRuntimeOverhead =
+ OutliningFuncCallCost +
+ (OutlinedFunctionCost - Cloner.OutlinedRegionCost) +
+ ExtraOutliningPenalty;
+
+ return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead);
+}
+
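The runtime overhead returned above is the cost of the call sequence plus whatever the extracted function costs beyond the original region, plus a tunable penalty, after discounting the two unconditional branches introduced by the extractor's root and exit stubs. A worked sketch with invented numbers:

    #include <cassert>

    int outliningRuntimeOverheadSketch() {
      const int InstrCost = 5;         // stand-in for InlineConstants::InstrCost
      int OutliningFuncCallCost = 25;  // cost of the call sequence to the outlined function
      int OutlinedFunctionCost  = 140; // cost of the extracted function body
      int OutlinedRegionCost    = 120; // cost of the region as it sat in the original function
      int ExtraOutliningPenalty = 0;   // tunable penalty (a cl::opt in this file)
      int NumOutlinedFunctions  = 1;

      // The new root/exit stubs add two unconditional branches that later basic
      // block layout removes, so they are not charged.
      OutlinedFunctionCost -= 2 * InstrCost * NumOutlinedFunctions;
      assert(OutlinedFunctionCost >= OutlinedRegionCost);

      return OutliningFuncCallCost +
             (OutlinedFunctionCost - OutlinedRegionCost) + ExtraOutliningPenalty;
    }

With these numbers the overhead is 25 + (130 - 120) + 0 = 35, i.e. the extra work a caller pays at runtime for taking the outlined path.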
+// Create the callsite to profile count map which is
+// used to update the original function's entry count,
+// after the function is partially inlined into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+ Function *DuplicateFunction,
DenseMap<User *, uint64_t> &CallSiteToProfCountMap) const {
- std::vector<User *> Users(DuplicateFunction->user_begin(),
- DuplicateFunction->user_end());
- Function *CurrentCaller = nullptr;
- std::unique_ptr<BlockFrequencyInfo> TempBFI;
- BlockFrequencyInfo *CurrentCallerBFI = nullptr;
-
- auto ComputeCurrBFI = [&,this](Function *Caller) {
- // For the old pass manager:
- if (!GetBFI) {
- DominatorTree DT(*Caller);
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*Caller, LI);
- TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
- CurrentCallerBFI = TempBFI.get();
- } else {
- // New pass manager:
- CurrentCallerBFI = &(GetBFI(*Caller));
- }
- };
-
- for (User *User : Users) {
- CallBase *CB = getSupportedCallBase(User);
- Function *Caller = CB->getCaller();
- if (CurrentCaller != Caller) {
- CurrentCaller = Caller;
- ComputeCurrBFI(Caller);
- } else {
- assert(CurrentCallerBFI && "CallerBFI is not set");
- }
- BasicBlock *CallBB = CB->getParent();
- auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
- if (Count)
- CallSiteToProfCountMap[User] = *Count;
- else
- CallSiteToProfCountMap[User] = 0;
- }
-}
-
-PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
+ std::vector<User *> Users(DuplicateFunction->user_begin(),
+ DuplicateFunction->user_end());
+ Function *CurrentCaller = nullptr;
+ std::unique_ptr<BlockFrequencyInfo> TempBFI;
+ BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+ auto ComputeCurrBFI = [&,this](Function *Caller) {
+ // For the old pass manager:
+ if (!GetBFI) {
+ DominatorTree DT(*Caller);
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*Caller, LI);
+ TempBFI.reset(new BlockFrequencyInfo(*Caller, BPI, LI));
+ CurrentCallerBFI = TempBFI.get();
+ } else {
+ // New pass manager:
+ CurrentCallerBFI = &(GetBFI(*Caller));
+ }
+ };
+
+ for (User *User : Users) {
+ CallBase *CB = getSupportedCallBase(User);
+ Function *Caller = CB->getCaller();
+ if (CurrentCaller != Caller) {
+ CurrentCaller = Caller;
+ ComputeCurrBFI(Caller);
+ } else {
+ assert(CurrentCallerBFI && "CallerBFI is not set");
+ }
+ BasicBlock *CallBB = CB->getParent();
+ auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+ if (Count)
+ CallSiteToProfCountMap[User] = *Count;
+ else
+ CallSiteToProfCountMap[User] = 0;
+ }
+}
+
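The map above records one profile count per call-site user, taken from the block containing the call, and recomputes block-frequency information only when the caller changes from one user to the next. A simplified mock of that bookkeeping with plain structs (no LLVM types):

    #include <cstdint>
    #include <map>
    #include <vector>

    struct CallSiteSketch { int CallerId; bool HasCount; uint64_t BlockCount; };

    std::map<int, uint64_t> callSiteToCount(const std::vector<CallSiteSketch> &Sites) {
      std::map<int, uint64_t> Result;
      int CurrentCaller = -1;
      for (int I = 0; I < (int)Sites.size(); ++I) {
        if (Sites[I].CallerId != CurrentCaller)
          CurrentCaller = Sites[I].CallerId;     // the real code (re)computes BFI here
        Result[I] = Sites[I].HasCount ? Sites[I].BlockCount : 0; // missing counts become 0
      }
      return Result;
    }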
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningInfo *OI, OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOI = std::make_unique<FunctionOutliningInfo>();
-
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
-
- ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
- ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
+ ClonedOI = std::make_unique<FunctionOutliningInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ ClonedOI->ReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+ ClonedOI->NonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
for (BasicBlock *BB : OI->Entries)
- ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
-
- for (BasicBlock *E : OI->ReturnBlockPreds) {
- BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
- ClonedOI->ReturnBlockPreds.push_back(NewE);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
-}
-
-PartialInlinerImpl::FunctionCloner::FunctionCloner(
- Function *F, FunctionOutliningMultiRegionInfo *OI,
- OptimizationRemarkEmitter &ORE,
+ ClonedOI->Entries.push_back(cast<BasicBlock>(VMap[BB]));
+
+ for (BasicBlock *E : OI->ReturnBlockPreds) {
+ BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
+ ClonedOI->ReturnBlockPreds.push_back(NewE);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
+PartialInlinerImpl::FunctionCloner::FunctionCloner(
+ Function *F, FunctionOutliningMultiRegionInfo *OI,
+ OptimizationRemarkEmitter &ORE,
function_ref<AssumptionCache *(Function &)> LookupAC,
function_ref<TargetTransformInfo &(Function &)> GetTTI)
: OrigFunc(F), ORE(ORE), LookupAC(LookupAC), GetTTI(GetTTI) {
- ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
-
- // Clone the function, so that we can hack away on it.
- ValueToValueMapTy VMap;
- ClonedFunc = CloneFunction(F, VMap);
-
- // Go through all Outline Candidate Regions and update all BasicBlock
- // information.
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- OI->ORI) {
- SmallVector<BasicBlock *, 8> Region;
+ ClonedOMRI = std::make_unique<FunctionOutliningMultiRegionInfo>();
+
+ // Clone the function, so that we can hack away on it.
+ ValueToValueMapTy VMap;
+ ClonedFunc = CloneFunction(F, VMap);
+
+ // Go through all Outline Candidate Regions and update all BasicBlock
+ // information.
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ OI->ORI) {
+ SmallVector<BasicBlock *, 8> Region;
for (BasicBlock *BB : RegionInfo.Region)
- Region.push_back(cast<BasicBlock>(VMap[BB]));
-
- BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
- BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
- BasicBlock *NewReturnBlock = nullptr;
- if (RegionInfo.ReturnBlock)
- NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
- FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
- Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
- ClonedOMRI->ORI.push_back(MappedRegionInfo);
- }
- // Go ahead and update all uses to the duplicate, so that we can just
- // use the inliner functionality when we're done hacking.
- F->replaceAllUsesWith(ClonedFunc);
-}
-
+ Region.push_back(cast<BasicBlock>(VMap[BB]));
+
+ BasicBlock *NewEntryBlock = cast<BasicBlock>(VMap[RegionInfo.EntryBlock]);
+ BasicBlock *NewExitBlock = cast<BasicBlock>(VMap[RegionInfo.ExitBlock]);
+ BasicBlock *NewReturnBlock = nullptr;
+ if (RegionInfo.ReturnBlock)
+ NewReturnBlock = cast<BasicBlock>(VMap[RegionInfo.ReturnBlock]);
+ FunctionOutliningMultiRegionInfo::OutlineRegionInfo MappedRegionInfo(
+ Region, NewEntryBlock, NewExitBlock, NewReturnBlock);
+ ClonedOMRI->ORI.push_back(MappedRegionInfo);
+ }
+ // Go ahead and update all uses to the duplicate, so that we can just
+ // use the inliner functionality when we're done hacking.
+ F->replaceAllUsesWith(ClonedFunc);
+}
+
void PartialInlinerImpl::FunctionCloner::normalizeReturnBlock() const {
auto GetFirstPHI = [](BasicBlock *BB) {
- BasicBlock::iterator I = BB->begin();
- PHINode *FirstPhi = nullptr;
- while (I != BB->end()) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi)
- break;
- if (!FirstPhi) {
- FirstPhi = Phi;
- break;
- }
- }
- return FirstPhi;
- };
-
- // Shouldn't need to normalize PHIs if we're not outlining non-early return
- // blocks.
- if (!ClonedOI)
- return;
-
- // Special hackery is needed with PHI nodes that have inputs from more than
- // one extracted block. For simplicity, just split the PHIs into a two-level
- // sequence of PHIs, some of which will go in the extracted region, and some
- // of which will go outside.
- BasicBlock *PreReturn = ClonedOI->ReturnBlock;
- // only split block when necessary:
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+
+ // Shouldn't need to normalize PHIs if we're not outlining non-early return
+ // blocks.
+ if (!ClonedOI)
+ return;
+
+ // Special hackery is needed with PHI nodes that have inputs from more than
+ // one extracted block. For simplicity, just split the PHIs into a two-level
+ // sequence of PHIs, some of which will go in the extracted region, and some
+ // of which will go outside.
+ BasicBlock *PreReturn = ClonedOI->ReturnBlock;
+ // only split block when necessary:
PHINode *FirstPhi = GetFirstPHI(PreReturn);
- unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
-
- if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
- return;
-
- auto IsTrivialPhi = [](PHINode *PN) -> Value * {
- Value *CommonValue = PN->getIncomingValue(0);
- if (all_of(PN->incoming_values(),
- [&](Value *V) { return V == CommonValue; }))
- return CommonValue;
- return nullptr;
- };
-
- ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
- ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
- BasicBlock::iterator I = PreReturn->begin();
- Instruction *Ins = &ClonedOI->ReturnBlock->front();
- SmallVector<Instruction *, 4> DeadPhis;
- while (I != PreReturn->end()) {
- PHINode *OldPhi = dyn_cast<PHINode>(I);
- if (!OldPhi)
- break;
-
- PHINode *RetPhi =
- PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
- OldPhi->replaceAllUsesWith(RetPhi);
- Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
-
- RetPhi->addIncoming(&*I, PreReturn);
- for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
- RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
- OldPhi->removeIncomingValue(E);
- }
-
-  // After splitting the incoming values, the old phi may become trivial.
-  // Keeping the trivial phi can introduce a definition inside the outlined
-  // region that is live-out, causing unnecessary overhead (load, store,
-  // arg passing, etc.).
- if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
- OldPhi->replaceAllUsesWith(OldPhiVal);
- DeadPhis.push_back(OldPhi);
- }
- ++I;
- }
- for (auto *DP : DeadPhis)
- DP->eraseFromParent();
-
+ unsigned NumPredsFromEntries = ClonedOI->ReturnBlockPreds.size();
+
+ if (!FirstPhi || FirstPhi->getNumIncomingValues() <= NumPredsFromEntries + 1)
+ return;
+
+ auto IsTrivialPhi = [](PHINode *PN) -> Value * {
+ Value *CommonValue = PN->getIncomingValue(0);
+ if (all_of(PN->incoming_values(),
+ [&](Value *V) { return V == CommonValue; }))
+ return CommonValue;
+ return nullptr;
+ };
+
+ ClonedOI->ReturnBlock = ClonedOI->ReturnBlock->splitBasicBlock(
+ ClonedOI->ReturnBlock->getFirstNonPHI()->getIterator());
+ BasicBlock::iterator I = PreReturn->begin();
+ Instruction *Ins = &ClonedOI->ReturnBlock->front();
+ SmallVector<Instruction *, 4> DeadPhis;
+ while (I != PreReturn->end()) {
+ PHINode *OldPhi = dyn_cast<PHINode>(I);
+ if (!OldPhi)
+ break;
+
+ PHINode *RetPhi =
+ PHINode::Create(OldPhi->getType(), NumPredsFromEntries + 1, "", Ins);
+ OldPhi->replaceAllUsesWith(RetPhi);
+ Ins = ClonedOI->ReturnBlock->getFirstNonPHI();
+
+ RetPhi->addIncoming(&*I, PreReturn);
+ for (BasicBlock *E : ClonedOI->ReturnBlockPreds) {
+ RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(E), E);
+ OldPhi->removeIncomingValue(E);
+ }
+
+  // After splitting the incoming values, the old phi may become trivial.
+  // Keeping the trivial phi can introduce a definition inside the outlined
+  // region that is live-out, causing unnecessary overhead (load, store,
+  // arg passing, etc.).
+ if (auto *OldPhiVal = IsTrivialPhi(OldPhi)) {
+ OldPhi->replaceAllUsesWith(OldPhiVal);
+ DeadPhis.push_back(OldPhi);
+ }
+ ++I;
+ }
+ for (auto *DP : DeadPhis)
+ DP->eraseFromParent();
+
for (auto *E : ClonedOI->ReturnBlockPreds)
- E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
-}
-
-bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
-
+ E->getTerminator()->replaceUsesOfWith(PreReturn, ClonedOI->ReturnBlock);
+}
+
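The IsTrivialPhi helper above treats a phi as trivial when every incoming value is the same, so the phi can simply be replaced by that value and dropped. A standalone sketch of the same check over plain value ids:

    #include <optional>
    #include <vector>

    std::optional<int> trivialPhiValue(const std::vector<int> &IncomingValueIds) {
      if (IncomingValueIds.empty())
        return std::nullopt;
      for (int Id : IncomingValueIds)
        if (Id != IncomingValueIds.front())
          return std::nullopt;           // at least two distinct inputs: keep the phi
      return IncomingValueIds.front();   // all inputs agree: the phi is redundant
    }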
+bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
+
auto ComputeRegionCost = [&](SmallVectorImpl<BasicBlock *> &Region) {
- int Cost = 0;
- for (BasicBlock* BB : Region)
+ int Cost = 0;
+ for (BasicBlock* BB : Region)
Cost += computeBBInlineCost(BB, &GetTTI(*BB->getParent()));
- return Cost;
- };
-
- assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
-
- if (ClonedOMRI->ORI.empty())
- return false;
-
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
-
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
-
- // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
-
- SetVector<Value *> Inputs, Outputs, Sinks;
- for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
- ClonedOMRI->ORI) {
- int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
-
- CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI,
- LookupAC(*RegionInfo.EntryBlock->getParent()),
- /* AllowVarargs */ false);
-
- CE.findInputsOutputs(Inputs, Outputs, Sinks);
-
+ return Cost;
+ };
+
+ assert(ClonedOMRI && "Expecting OutlineInfo for multi region outline");
+
+ if (ClonedOMRI->ORI.empty())
+ return false;
+
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Cache and recycle the CodeExtractor analysis to avoid O(n^2) compile-time.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+
+ SetVector<Value *> Inputs, Outputs, Sinks;
+ for (FunctionOutliningMultiRegionInfo::OutlineRegionInfo RegionInfo :
+ ClonedOMRI->ORI) {
+ int CurrentOutlinedRegionCost = ComputeRegionCost(RegionInfo.Region);
+
+ CodeExtractor CE(RegionInfo.Region, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI,
+ LookupAC(*RegionInfo.EntryBlock->getParent()),
+ /* AllowVarargs */ false);
+
+ CE.findInputsOutputs(Inputs, Outputs, Sinks);
+
LLVM_DEBUG({
- dbgs() << "inputs: " << Inputs.size() << "\n";
- dbgs() << "outputs: " << Outputs.size() << "\n";
- for (Value *value : Inputs)
- dbgs() << "value used in func: " << *value << "\n";
- for (Value *output : Outputs)
- dbgs() << "instr used in func: " << *output << "\n";
+ dbgs() << "inputs: " << Inputs.size() << "\n";
+ dbgs() << "outputs: " << Outputs.size() << "\n";
+ for (Value *value : Inputs)
+ dbgs() << "value used in func: " << *value << "\n";
+ for (Value *output : Outputs)
+ dbgs() << "instr used in func: " << *output << "\n";
});
- // Do not extract regions that have live exit variables.
- if (Outputs.size() > 0 && !ForceLiveExit)
- continue;
-
+ // Do not extract regions that have live exit variables.
+ if (Outputs.size() > 0 && !ForceLiveExit)
+ continue;
+
if (Function *OutlinedFunc = CE.extractCodeRegion(CEAC)) {
CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc);
- BasicBlock *OutliningCallBB = OCS->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
- NumColdRegionsOutlined++;
- OutlinedRegionCost += CurrentOutlinedRegionCost;
-
- if (MarkOutlinedColdCC) {
- OutlinedFunc->setCallingConv(CallingConv::Cold);
- OCS->setCallingConv(CallingConv::Cold);
- }
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &RegionInfo.Region.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", RegionInfo.Region.front());
- });
- }
-
- return !OutlinedFunctions.empty();
-}
-
-Function *
-PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
-  // Returns true if the block is to be partially inlined into the caller
- // (i.e. not to be extracted to the out of line function)
- auto ToBeInlined = [&, this](BasicBlock *BB) {
- return BB == ClonedOI->ReturnBlock ||
+ BasicBlock *OutliningCallBB = OCS->getParent();
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
+ NumColdRegionsOutlined++;
+ OutlinedRegionCost += CurrentOutlinedRegionCost;
+
+ if (MarkOutlinedColdCC) {
+ OutlinedFunc->setCallingConv(CallingConv::Cold);
+ OCS->setCallingConv(CallingConv::Cold);
+ }
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &RegionInfo.Region.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", RegionInfo.Region.front());
+ });
+ }
+
+ return !OutlinedFunctions.empty();
+}
+
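Each candidate region above is costed and then either extracted or skipped; regions with live exit values are skipped unless extraction is explicitly forced, and the cost of every region that is actually extracted is accumulated. A small sketch of that loop over mock regions:

    #include <vector>

    struct RegionSketch { int NumLiveExitValues; int Cost; };

    int totalOutlinedCost(const std::vector<RegionSketch> &Regions, bool ForceLiveExit) {
      int Outlined = 0;
      for (const RegionSketch &R : Regions) {
        if (R.NumLiveExitValues > 0 && !ForceLiveExit)
          continue;           // live-out values would need extra argument plumbing
        Outlined += R.Cost;   // the real code runs CodeExtractor here
      }
      return Outlined;
    }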
+Function *
+PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
+  // Returns true if the block is to be partially inlined into the caller
+ // (i.e. not to be extracted to the out of line function)
+ auto ToBeInlined = [&, this](BasicBlock *BB) {
+ return BB == ClonedOI->ReturnBlock ||
llvm::is_contained(ClonedOI->Entries, BB);
- };
-
- assert(ClonedOI && "Expecting OutlineInfo for single region outline");
- // The CodeExtractor needs a dominator tree.
- DominatorTree DT;
- DT.recalculate(*ClonedFunc);
-
- // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
- LoopInfo LI(DT);
- BranchProbabilityInfo BPI(*ClonedFunc, LI);
- ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
-
- // Gather up the blocks that we're going to extract.
- std::vector<BasicBlock *> ToExtract;
+ };
+
+ assert(ClonedOI && "Expecting OutlineInfo for single region outline");
+ // The CodeExtractor needs a dominator tree.
+ DominatorTree DT;
+ DT.recalculate(*ClonedFunc);
+
+ // Manually calculate a BlockFrequencyInfo and BranchProbabilityInfo.
+ LoopInfo LI(DT);
+ BranchProbabilityInfo BPI(*ClonedFunc, LI);
+ ClonedFuncBFI.reset(new BlockFrequencyInfo(*ClonedFunc, BPI, LI));
+
+ // Gather up the blocks that we're going to extract.
+ std::vector<BasicBlock *> ToExtract;
auto *ClonedFuncTTI = &GetTTI(*ClonedFunc);
- ToExtract.push_back(ClonedOI->NonReturnBlock);
+ ToExtract.push_back(ClonedOI->NonReturnBlock);
OutlinedRegionCost += PartialInlinerImpl::computeBBInlineCost(
ClonedOI->NonReturnBlock, ClonedFuncTTI);
- for (BasicBlock &BB : *ClonedFunc)
- if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
- ToExtract.push_back(&BB);
- // FIXME: the code extractor may hoist/sink more code
- // into the outlined function which may make the outlining
- // overhead (the difference of the outlined function cost
- // and OutliningRegionCost) look larger.
+ for (BasicBlock &BB : *ClonedFunc)
+ if (!ToBeInlined(&BB) && &BB != ClonedOI->NonReturnBlock) {
+ ToExtract.push_back(&BB);
+ // FIXME: the code extractor may hoist/sink more code
+ // into the outlined function which may make the outlining
+ // overhead (the difference of the outlined function cost
+ // and OutliningRegionCost) look larger.
OutlinedRegionCost += computeBBInlineCost(&BB, ClonedFuncTTI);
- }
-
- // Extract the body of the if.
- CodeExtractorAnalysisCache CEAC(*ClonedFunc);
- Function *OutlinedFunc =
- CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
- ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
- /* AllowVarargs */ true)
- .extractCodeRegion(CEAC);
-
- if (OutlinedFunc) {
- BasicBlock *OutliningCallBB =
+ }
+
+ // Extract the body of the if.
+ CodeExtractorAnalysisCache CEAC(*ClonedFunc);
+ Function *OutlinedFunc =
+ CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false,
+ ClonedFuncBFI.get(), &BPI, LookupAC(*ClonedFunc),
+ /* AllowVarargs */ true)
+ .extractCodeRegion(CEAC);
+
+ if (OutlinedFunc) {
+ BasicBlock *OutliningCallBB =
PartialInlinerImpl::getOneCallSiteTo(*OutlinedFunc)->getParent();
- assert(OutliningCallBB->getParent() == ClonedFunc);
- OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
- } else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
- &ToExtract.front()->front())
- << "Failed to extract region at block "
- << ore::NV("Block", ToExtract.front());
- });
-
- return OutlinedFunc;
-}
-
-PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
- // Ditch the duplicate, since we're done with it, and rewrite all remaining
- // users (function pointers, etc.) back to the original function.
- ClonedFunc->replaceAllUsesWith(OrigFunc);
- ClonedFunc->eraseFromParent();
- if (!IsFunctionInlined) {
- // Remove each function that was speculatively created if there is no
- // reference.
- for (auto FuncBBPair : OutlinedFunctions) {
- Function *Func = FuncBBPair.first;
- Func->eraseFromParent();
- }
- }
-}
-
+ assert(OutliningCallBB->getParent() == ClonedFunc);
+ OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
+ } else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExtractFailed",
+ &ToExtract.front()->front())
+ << "Failed to extract region at block "
+ << ore::NV("Block", ToExtract.front());
+ });
+
+ return OutlinedFunc;
+}
+
+PartialInlinerImpl::FunctionCloner::~FunctionCloner() {
+ // Ditch the duplicate, since we're done with it, and rewrite all remaining
+ // users (function pointers, etc.) back to the original function.
+ ClonedFunc->replaceAllUsesWith(OrigFunc);
+ ClonedFunc->eraseFromParent();
+ if (!IsFunctionInlined) {
+ // Remove each function that was speculatively created if there is no
+ // reference.
+ for (auto FuncBBPair : OutlinedFunctions) {
+ Function *Func = FuncBBPair.first;
+ Func->eraseFromParent();
+ }
+ }
+}
+
std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function &F) {
if (F.hasAddressTaken())
- return {false, nullptr};
-
- // Let inliner handle it
+ return {false, nullptr};
+
+ // Let inliner handle it
if (F.hasFnAttribute(Attribute::AlwaysInline))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (F.hasFnAttribute(Attribute::NoInline))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (PSI.isFunctionEntryCold(&F))
- return {false, nullptr};
-
+ return {false, nullptr};
+
if (F.users().empty())
- return {false, nullptr};
-
+ return {false, nullptr};
+
OptimizationRemarkEmitter ORE(&F);
-
- // Only try to outline cold regions if we have a profile summary, which
- // implies we have profiling information.
+
+ // Only try to outline cold regions if we have a profile summary, which
+ // implies we have profiling information.
if (PSI.hasProfileSummary() && F.hasProfileData() &&
- !DisableMultiRegionPartialInline) {
- std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
- computeOutliningColdRegionsInfo(F, ORE);
- if (OMRI) {
+ !DisableMultiRegionPartialInline) {
+ std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
+ computeOutliningColdRegionsInfo(F, ORE);
+ if (OMRI) {
FunctionCloner Cloner(&F, OMRI.get(), ORE, LookupAssumptionCache, GetTTI);
-
+
LLVM_DEBUG({
- dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
- dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
- << "\n";
+ dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
+ dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
+ << "\n";
});
- bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
-
- if (DidOutline) {
+ bool DidOutline = Cloner.doMultiRegionFunctionOutlining();
+
+ if (DidOutline) {
LLVM_DEBUG({
- dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
- Cloner.ClonedFunc->print(dbgs());
- dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
+ dbgs() << ">>>>>> Outlined (Cloned) Function >>>>>>\n";
+ Cloner.ClonedFunc->print(dbgs());
+ dbgs() << "<<<<<< Outlined (Cloned) Function <<<<<<\n";
});
-
- if (tryPartialInline(Cloner))
- return {true, nullptr};
- }
- }
- }
-
- // Fall-thru to regular partial inlining if we:
- // i) can't find any cold regions to outline, or
- // ii) can't inline the outlined function anywhere.
- std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
- if (!OI)
- return {false, nullptr};
-
+
+ if (tryPartialInline(Cloner))
+ return {true, nullptr};
+ }
+ }
+ }
+
+ // Fall-thru to regular partial inlining if we:
+ // i) can't find any cold regions to outline, or
+ // ii) can't inline the outlined function anywhere.
+ std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);
+ if (!OI)
+ return {false, nullptr};
+
FunctionCloner Cloner(&F, OI.get(), ORE, LookupAssumptionCache, GetTTI);
Cloner.normalizeReturnBlock();
-
- Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
-
- if (!OutlinedFunction)
- return {false, nullptr};
-
+
+ Function *OutlinedFunction = Cloner.doSingleRegionFunctionOutlining();
+
+ if (!OutlinedFunction)
+ return {false, nullptr};
+
if (tryPartialInline(Cloner))
- return {true, OutlinedFunction};
-
- return {false, nullptr};
-}
-
-bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
- if (Cloner.OutlinedFunctions.empty())
- return false;
-
- int SizeCost = 0;
- BlockFrequency WeightedRcost;
- int NonWeightedRcost;
- std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
-
- // Only calculate RelativeToEntryFreq when we are doing single region
- // outlining.
- BranchProbability RelativeToEntryFreq;
+ return {true, OutlinedFunction};
+
+ return {false, nullptr};
+}
+
+bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
+ if (Cloner.OutlinedFunctions.empty())
+ return false;
+
+ int SizeCost = 0;
+ BlockFrequency WeightedRcost;
+ int NonWeightedRcost;
+ std::tie(SizeCost, NonWeightedRcost) = computeOutliningCosts(Cloner);
+
+ // Only calculate RelativeToEntryFreq when we are doing single region
+ // outlining.
+ BranchProbability RelativeToEntryFreq;
if (Cloner.ClonedOI)
- RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
+ RelativeToEntryFreq = getOutliningCallBBRelativeFreq(Cloner);
else
- // RelativeToEntryFreq doesn't make sense when we have more than one
- // outlined call because each call will have a different relative frequency
- // to the entry block. We can consider using the average, but the
- // usefulness of that information is questionable. For now, assume we never
- // execute the calls to outlined functions.
- RelativeToEntryFreq = BranchProbability(0, 1);
-
- WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
-
-  // If the call sequence(s) to the outlined function(s) are larger than the sum
-  // of the original outlined region size(s), outlining does not increase the
-  // chances of inlining the function (the inliner uses the size increase to
-  // model the cost of inlining a callee).
- if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- DebugLoc DLoc;
- BasicBlock *Block;
+ // RelativeToEntryFreq doesn't make sense when we have more than one
+ // outlined call because each call will have a different relative frequency
+ // to the entry block. We can consider using the average, but the
+ // usefulness of that information is questionable. For now, assume we never
+ // execute the calls to outlined functions.
+ RelativeToEntryFreq = BranchProbability(0, 1);
+
+ WeightedRcost = BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq;
+
+  // If the call sequence(s) to the outlined function(s) are larger than the sum
+  // of the original outlined region size(s), outlining does not increase the
+  // chances of inlining the function (the inliner uses the size increase to
+  // model the cost of inlining a callee).
+ if (!SkipCostAnalysis && Cloner.OutlinedRegionCost < SizeCost) {
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ DebugLoc DLoc;
+ BasicBlock *Block;
std::tie(DLoc, Block) = getOneDebugLoc(*Cloner.ClonedFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
- DLoc, Block)
- << ore::NV("Function", Cloner.OrigFunc)
- << " not partially inlined into callers (Original Size = "
- << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
- << ", Size of call sequence to outlined function = "
- << ore::NV("NewSize", SizeCost) << ")";
- });
- return false;
- }
-
- assert(Cloner.OrigFunc->users().empty() &&
- "F's users should all be replaced!");
-
- std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
- Cloner.ClonedFunc->user_end());
-
- DenseMap<User *, uint64_t> CallSiteToProfCountMap;
- auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
- if (CalleeEntryCount)
- computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
-
- uint64_t CalleeEntryCountV =
- (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
-
- bool AnyInline = false;
- for (User *User : Users) {
- CallBase *CB = getSupportedCallBase(User);
-
+ OrigFuncORE.emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+ DLoc, Block)
+ << ore::NV("Function", Cloner.OrigFunc)
+ << " not partially inlined into callers (Original Size = "
+ << ore::NV("OutlinedRegionOriginalSize", Cloner.OutlinedRegionCost)
+ << ", Size of call sequence to outlined function = "
+ << ore::NV("NewSize", SizeCost) << ")";
+ });
+ return false;
+ }
+
+ assert(Cloner.OrigFunc->users().empty() &&
+ "F's users should all be replaced!");
+
+ std::vector<User *> Users(Cloner.ClonedFunc->user_begin(),
+ Cloner.ClonedFunc->user_end());
+
+ DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+ auto CalleeEntryCount = Cloner.OrigFunc->getEntryCount();
+ if (CalleeEntryCount)
+ computeCallsiteToProfCountMap(Cloner.ClonedFunc, CallSiteToProfCountMap);
+
+ uint64_t CalleeEntryCountV =
+ (CalleeEntryCount ? CalleeEntryCount.getCount() : 0);
+
+ bool AnyInline = false;
+ for (User *User : Users) {
+ CallBase *CB = getSupportedCallBase(User);
+
if (isLimitReached())
- continue;
-
- OptimizationRemarkEmitter CallerORE(CB->getCaller());
- if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
- continue;
-
- // Construct remark before doing the inlining, as after successful inlining
- // the callsite is removed.
- OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
- OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
- << ore::NV("Caller", CB->getCaller());
-
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
- // We can only forward varargs when we outlined a single region, else we
- // bail on vararg functions.
- if (!InlineFunction(*CB, IFI, nullptr, true,
- (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
- : nullptr))
- .isSuccess())
- continue;
-
- CallerORE.emit(OR);
-
- // Now update the entry count:
- if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
- uint64_t CallSiteCount = CallSiteToProfCountMap[User];
- CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
- }
-
- AnyInline = true;
- NumPartialInlining++;
- // Update the stats
- if (Cloner.ClonedOI)
- NumPartialInlined++;
- else
- NumColdOutlinePartialInlined++;
- }
-
- if (AnyInline) {
- Cloner.IsFunctionInlined = true;
- if (CalleeEntryCount)
- Cloner.OrigFunc->setEntryCount(
- CalleeEntryCount.setCount(CalleeEntryCountV));
- OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
- OrigFuncORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
- << "Partially inlined into at least one caller";
- });
- }
-
- return AnyInline;
-}
-
-bool PartialInlinerImpl::run(Module &M) {
- if (DisablePartialInlining)
- return false;
-
- std::vector<Function *> Worklist;
- Worklist.reserve(M.size());
- for (Function &F : M)
- if (!F.use_empty() && !F.isDeclaration())
- Worklist.push_back(&F);
-
- bool Changed = false;
- while (!Worklist.empty()) {
- Function *CurrFunc = Worklist.back();
- Worklist.pop_back();
-
- if (CurrFunc->use_empty())
- continue;
-
- bool Recursive = false;
- for (User *U : CurrFunc->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent()->getParent() == CurrFunc) {
- Recursive = true;
- break;
- }
- if (Recursive)
- continue;
-
+ continue;
+
+ OptimizationRemarkEmitter CallerORE(CB->getCaller());
+ if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
+ continue;
+
+ // Construct remark before doing the inlining, as after successful inlining
+ // the callsite is removed.
+ OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
+ OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
+ << ore::NV("Caller", CB->getCaller());
+
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
+ // We can only forward varargs when we outlined a single region, else we
+ // bail on vararg functions.
+ if (!InlineFunction(*CB, IFI, nullptr, true,
+ (Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
+ : nullptr))
+ .isSuccess())
+ continue;
+
+ CallerORE.emit(OR);
+
+ // Now update the entry count:
+ if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+ uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+ CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+ }
+
+ AnyInline = true;
+ NumPartialInlining++;
+ // Update the stats
+ if (Cloner.ClonedOI)
+ NumPartialInlined++;
+ else
+ NumColdOutlinePartialInlined++;
+ }
+
+ if (AnyInline) {
+ Cloner.IsFunctionInlined = true;
+ if (CalleeEntryCount)
+ Cloner.OrigFunc->setEntryCount(
+ CalleeEntryCount.setCount(CalleeEntryCountV));
+ OptimizationRemarkEmitter OrigFuncORE(Cloner.OrigFunc);
+ OrigFuncORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", Cloner.OrigFunc)
+ << "Partially inlined into at least one caller";
+ });
+ }
+
+ return AnyInline;
+}
+
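Two pieces of arithmetic drive the logic above: the runtime cost is weighted by the relative frequency of the block containing the outlining call, and the callee's entry count is reduced by the profile counts of the call sites that were successfully partially inlined. A plain-integer sketch of both (helper names invented for the example):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    uint64_t weightedRuntimeCost(uint64_t NonWeightedRcost, uint64_t CallBBFreq,
                                 uint64_t EntryFreq) {
      // Mirrors BlockFrequency(NonWeightedRcost) * RelativeToEntryFreq.
      return EntryFreq ? NonWeightedRcost * CallBBFreq / EntryFreq : 0;
    }

    uint64_t remainingEntryCount(uint64_t CalleeEntryCount,
                                 const std::vector<uint64_t> &InlinedCallSiteCounts) {
      for (uint64_t CallSiteCount : InlinedCallSiteCounts)
        CalleeEntryCount -= std::min(CalleeEntryCount, CallSiteCount); // never underflow
      return CalleeEntryCount;
    }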
+bool PartialInlinerImpl::run(Module &M) {
+ if (DisablePartialInlining)
+ return false;
+
+ std::vector<Function *> Worklist;
+ Worklist.reserve(M.size());
+ for (Function &F : M)
+ if (!F.use_empty() && !F.isDeclaration())
+ Worklist.push_back(&F);
+
+ bool Changed = false;
+ while (!Worklist.empty()) {
+ Function *CurrFunc = Worklist.back();
+ Worklist.pop_back();
+
+ if (CurrFunc->use_empty())
+ continue;
+
+ bool Recursive = false;
+ for (User *U : CurrFunc->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent()->getParent() == CurrFunc) {
+ Recursive = true;
+ break;
+ }
+ if (Recursive)
+ continue;
+
std::pair<bool, Function *> Result = unswitchFunction(*CurrFunc);
- if (Result.second)
- Worklist.push_back(Result.second);
- Changed |= Result.first;
- }
-
- return Changed;
-}
-
-char PartialInlinerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
- "Partial Inliner", false, false)
-
-ModulePass *llvm::createPartialInliningPass() {
- return new PartialInlinerLegacyPass();
-}
-
-PreservedAnalyses PartialInlinerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
-
- auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
- return FAM.getCachedResult<AssumptionAnalysis>(F);
- };
-
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
- GetTLI, PSI, GetBFI)
- .run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
+ if (Result.second)
+ Worklist.push_back(Result.second);
+ Changed |= Result.first;
+ }
+
+ return Changed;
+}
+
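The driver above is a plain worklist: functions with no remaining uses or with direct recursion are skipped, and a freshly outlined function is pushed back onto the worklist so it can be considered as a candidate in its own right. A compact mock of that loop:

    #include <functional>
    #include <string>
    #include <utility>
    #include <vector>

    struct CandidateSketch { std::string Name; bool Recursive; };

    bool runWorklistSketch(
        std::vector<CandidateSketch> Worklist,
        const std::function<std::pair<bool, CandidateSketch>(CandidateSketch &)> &Unswitch) {
      bool Changed = false;
      while (!Worklist.empty()) {
        CandidateSketch F = Worklist.back();
        Worklist.pop_back();
        if (F.Recursive)
          continue; // partial inlining of directly recursive functions is not attempted
        auto [DidInline, NewFunc] = Unswitch(F);
        if (!NewFunc.Name.empty())
          Worklist.push_back(NewFunc); // outlined functions become candidates too
        Changed |= DidInline;
      }
      return Changed;
    }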
+char PartialInlinerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
+ "Partial Inliner", false, false)
+
+ModulePass *llvm::createPartialInliningPass() {
+ return new PartialInlinerLegacyPass();
+}
+
+PreservedAnalyses PartialInlinerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+
+ auto LookupAssumptionCache = [&FAM](Function &F) -> AssumptionCache * {
+ return FAM.getCachedResult<AssumptionAnalysis>(F);
+ };
+
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI, GetBFI)
+ .run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
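As a rough usage note, the pass is registered under the name partial-inliner, so it can typically be invoked as opt -partial-inliner (legacy pass manager) or opt -passes=partial-inliner (new pass manager); exact spellings may differ between LLVM releases. Programmatically, one minimal way to exercise the legacy pass is to add it to a legacy::PassManager, as in the sketch below (the module is empty, so the run is effectively a no-op):

    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/IPO.h"

    int main() {
      llvm::LLVMContext Ctx;
      llvm::Module M("demo", Ctx);               // normally parsed from IR instead
      llvm::legacy::PassManager PM;
      PM.add(llvm::createPartialInliningPass()); // the factory defined above
      PM.run(M);
      return 0;
    }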
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
index 520456e912..068328391d 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -1,311 +1,311 @@
-//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the PassManagerBuilder class, which is used to set up a
-// "standard" optimization sequence suitable for languages like C and C++.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/PassManagerBuilder.h"
-#include "llvm-c/Transforms/PassManagerBuilder.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
-#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ScopedNoAliasAA.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/Attributor.h"
-#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
-#include "llvm/Transforms/Scalar/LICM.h"
-#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/Transforms/Vectorize/VectorCombine.h"
-
-using namespace llvm;
-
+//===- PassManagerBuilder.cpp - Build Standard Pass -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the PassManagerBuilder class, which is used to set up a
+// "standard" optimization sequence suitable for languages like C and C++.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/InferFunctionAttrs.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+
+using namespace llvm;
+
cl::opt<bool> RunPartialInlining("enable-partial-inlining", cl::init(false),
cl::Hidden, cl::ZeroOrMore,
cl::desc("Run Partial inlinining pass"));
-
-static cl::opt<bool>
-UseGVNAfterVectorization("use-gvn-after-vectorization",
- cl::init(false), cl::Hidden,
- cl::desc("Run GVN instead of Early CSE after vectorization passes"));
-
+
+static cl::opt<bool>
+UseGVNAfterVectorization("use-gvn-after-vectorization",
+ cl::init(false), cl::Hidden,
+ cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+
cl::opt<bool> ExtraVectorizerPasses(
- "extra-vectorizer-passes", cl::init(false), cl::Hidden,
- cl::desc("Run cleanup optimization passes after vectorization."));
-
-static cl::opt<bool>
-RunLoopRerolling("reroll-loops", cl::Hidden,
- cl::desc("Run the loop rerolling pass"));
-
+ "extra-vectorizer-passes", cl::init(false), cl::Hidden,
+ cl::desc("Run cleanup optimization passes after vectorization."));
+
+static cl::opt<bool>
+RunLoopRerolling("reroll-loops", cl::Hidden,
+ cl::desc("Run the loop rerolling pass"));
+
cl::opt<bool> RunNewGVN("enable-newgvn", cl::init(false), cl::Hidden,
cl::desc("Run the NewGVN pass"));
-
-// Experimental option to use CFL-AA
-enum class CFLAAType { None, Steensgaard, Andersen, Both };
+
+// Experimental option to use CFL-AA
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
static cl::opt<::CFLAAType>
UseCFLAA("use-cfl-aa", cl::init(::CFLAAType::None), cl::Hidden,
- cl::desc("Enable the new, experimental CFL alias analysis"),
+ cl::desc("Enable the new, experimental CFL alias analysis"),
cl::values(clEnumValN(::CFLAAType::None, "none", "Disable CFL-AA"),
clEnumValN(::CFLAAType::Steensgaard, "steens",
- "Enable unification-based CFL-AA"),
+ "Enable unification-based CFL-AA"),
clEnumValN(::CFLAAType::Andersen, "anders",
- "Enable inclusion-based CFL-AA"),
+ "Enable inclusion-based CFL-AA"),
clEnumValN(::CFLAAType::Both, "both",
- "Enable both variants of CFL-AA")));
-
-static cl::opt<bool> EnableLoopInterchange(
- "enable-loopinterchange", cl::init(false), cl::Hidden,
- cl::desc("Enable the new, experimental LoopInterchange Pass"));
-
+ "Enable both variants of CFL-AA")));
+
+static cl::opt<bool> EnableLoopInterchange(
+ "enable-loopinterchange", cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental LoopInterchange Pass"));
+
cl::opt<bool> EnableUnrollAndJam("enable-unroll-and-jam", cl::init(false),
cl::Hidden,
cl::desc("Enable Unroll And Jam Pass"));
-
+
cl::opt<bool> EnableLoopFlatten("enable-loop-flatten", cl::init(false),
cl::Hidden,
cl::desc("Enable the LoopFlatten Pass"));
-static cl::opt<bool>
- EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable preparation for ThinLTO."));
-
-static cl::opt<bool>
- EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
- cl::desc("Enable performing ThinLTO."));
-
-cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
- cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
-
+static cl::opt<bool>
+ EnablePrepareForThinLTO("prepare-for-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable preparation for ThinLTO."));
+
+static cl::opt<bool>
+ EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
+ cl::desc("Enable performing ThinLTO."));
+
+cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
+ cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
+
cl::opt<bool> EnableIROutliner("ir-outliner", cl::init(false), cl::Hidden,
cl::desc("Enable ir outliner pass"));
-static cl::opt<bool> UseLoopVersioningLICM(
- "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
- cl::desc("Enable the experimental Loop Versioning LICM pass"));
-
+static cl::opt<bool> UseLoopVersioningLICM(
+ "enable-loop-versioning-licm", cl::init(false), cl::Hidden,
+ cl::desc("Enable the experimental Loop Versioning LICM pass"));
+
cl::opt<bool>
- DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
- cl::desc("Disable pre-instrumentation inliner"));
-
+ DisablePreInliner("disable-preinline", cl::init(false), cl::Hidden,
+ cl::desc("Disable pre-instrumentation inliner"));
+
cl::opt<int> PreInlineThreshold(
- "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
- cl::desc("Control the amount of inlining in pre-instrumentation inliner "
- "(default = 75)"));
-
+ "preinline-threshold", cl::Hidden, cl::init(75), cl::ZeroOrMore,
+ cl::desc("Control the amount of inlining in pre-instrumentation inliner "
+ "(default = 75)"));
+
cl::opt<bool>
EnableGVNHoist("enable-gvn-hoist", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN hoisting pass (default = off)"));
-
-static cl::opt<bool>
- DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
- cl::Hidden,
- cl::desc("Disable shrink-wrap library calls"));
-
-static cl::opt<bool> EnableSimpleLoopUnswitch(
- "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
- cl::desc("Enable the simple loop unswitch pass. Also enables independent "
- "cleanup passes integrated into the loop pass manager pipeline."));
-
+
+static cl::opt<bool>
+ DisableLibCallsShrinkWrap("disable-libcalls-shrinkwrap", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable shrink-wrap library calls"));
+
+static cl::opt<bool> EnableSimpleLoopUnswitch(
+ "enable-simple-loop-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Enable the simple loop unswitch pass. Also enables independent "
+ "cleanup passes integrated into the loop pass manager pipeline."));
+
cl::opt<bool>
EnableGVNSink("enable-gvn-sink", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN sinking pass (default = off)"));
-
-// This option is used in simplifying testing SampleFDO optimizations for
-// profile loading.
+
+// This option is used in simplifying testing SampleFDO optimizations for
+// profile loading.
cl::opt<bool>
- EnableCHR("enable-chr", cl::init(true), cl::Hidden,
- cl::desc("Enable control height reduction optimization (CHR)"));
-
-cl::opt<bool> FlattenedProfileUsed(
- "flattened-profile-used", cl::init(false), cl::Hidden,
- cl::desc("Indicate the sample profile being used is flattened, i.e., "
-    "no inline hierarchy exists in the profile. "));
-
-cl::opt<bool> EnableOrderFileInstrumentation(
- "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
- cl::desc("Enable order file instrumentation (default = off)"));
-
+ EnableCHR("enable-chr", cl::init(true), cl::Hidden,
+ cl::desc("Enable control height reduction optimization (CHR)"));
+
+cl::opt<bool> FlattenedProfileUsed(
+ "flattened-profile-used", cl::init(false), cl::Hidden,
+ cl::desc("Indicate the sample profile being used is flattened, i.e., "
+    "no inline hierarchy exists in the profile. "));
+
+cl::opt<bool> EnableOrderFileInstrumentation(
+ "enable-order-file-instrumentation", cl::init(false), cl::Hidden,
+ cl::desc("Enable order file instrumentation (default = off)"));
+
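All of the flags above follow the same llvm::cl pattern: a static cl::opt with a name, default value, visibility, and description, read later as a plain value. A minimal self-contained sketch (the flag name here is invented for illustration):

    #include "llvm/Support/CommandLine.h"

    static llvm::cl::opt<bool> EnableDemoFeature(
        "enable-demo-feature", llvm::cl::init(false), llvm::cl::Hidden,
        llvm::cl::desc("Example flag declared in the same style as the options above"));

    int main(int argc, char **argv) {
      llvm::cl::ParseCommandLineOptions(argc, argv);
      return EnableDemoFeature ? 0 : 1; // cl::opt<bool> converts to its value
    }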
cl::opt<bool> EnableMatrix(
"enable-matrix", cl::init(false), cl::Hidden,
cl::desc("Enable lowering of the matrix intrinsics"));
-
+
cl::opt<bool> EnableConstraintElimination(
"enable-constraint-elimination", cl::init(false), cl::Hidden,
cl::desc(
"Enable pass to eliminate conditions based on linear constraints."));
-cl::opt<AttributorRunOption> AttributorRun(
- "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
- cl::desc("Enable the attributor inter-procedural deduction pass."),
- cl::values(clEnumValN(AttributorRunOption::ALL, "all",
- "enable all attributor runs"),
- clEnumValN(AttributorRunOption::MODULE, "module",
- "enable module-wide attributor runs"),
- clEnumValN(AttributorRunOption::CGSCC, "cgscc",
- "enable call graph SCC attributor runs"),
- clEnumValN(AttributorRunOption::NONE, "none",
- "disable attributor runs")));
-
-extern cl::opt<bool> EnableKnowledgeRetention;
-
-PassManagerBuilder::PassManagerBuilder() {
- OptLevel = 2;
- SizeLevel = 0;
- LibraryInfo = nullptr;
- Inliner = nullptr;
- DisableUnrollLoops = false;
- SLPVectorize = false;
- LoopVectorize = true;
- LoopsInterleaved = true;
- RerollLoops = RunLoopRerolling;
- NewGVN = RunNewGVN;
- LicmMssaOptCap = SetLicmMssaOptCap;
- LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
- DisableGVNLoadPRE = false;
- ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
- VerifyInput = false;
- VerifyOutput = false;
- MergeFunctions = false;
- PrepareForLTO = false;
- EnablePGOInstrGen = false;
- EnablePGOCSInstrGen = false;
- EnablePGOCSInstrUse = false;
- PGOInstrGen = "";
- PGOInstrUse = "";
- PGOSampleUse = "";
- PrepareForThinLTO = EnablePrepareForThinLTO;
- PerformThinLTO = EnablePerformThinLTO;
- DivergentTarget = false;
- CallGraphProfile = true;
-}
-
-PassManagerBuilder::~PassManagerBuilder() {
- delete LibraryInfo;
- delete Inliner;
-}
-
-/// Set of global extensions, automatically added as part of the standard set.
-static ManagedStatic<
- SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy,
- PassManagerBuilder::ExtensionFn,
- PassManagerBuilder::GlobalExtensionID>,
- 8>>
- GlobalExtensions;
-static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter;
-
-/// Check if GlobalExtensions is constructed and not empty.
-/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
-/// the construction of the object.
-static bool GlobalExtensionsNotEmpty() {
- return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
-}
-
-PassManagerBuilder::GlobalExtensionID
-PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty,
- PassManagerBuilder::ExtensionFn Fn) {
- auto ExtensionID = GlobalExtensionsCounter++;
- GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID));
- return ExtensionID;
-}
-
-void PassManagerBuilder::removeGlobalExtension(
- PassManagerBuilder::GlobalExtensionID ExtensionID) {
- // RegisterStandardPasses may try to call this function after GlobalExtensions
- // has already been destroyed; doing so should not generate an error.
- if (!GlobalExtensions.isConstructed())
- return;
-
- auto GlobalExtension =
- llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) {
- return std::get<2>(elem) == ExtensionID;
- });
- assert(GlobalExtension != GlobalExtensions->end() &&
- "The extension ID to be removed should always be valid.");
-
- GlobalExtensions->erase(GlobalExtension);
-}
-
-void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
- Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
-}
-
-void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
- legacy::PassManagerBase &PM) const {
- if (GlobalExtensionsNotEmpty()) {
- for (auto &Ext : *GlobalExtensions) {
- if (std::get<0>(Ext) == ETy)
- std::get<1>(Ext)(*this, PM);
- }
- }
- for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
- if (Extensions[i].first == ETy)
- Extensions[i].second(*this, PM);
-}
-
-void PassManagerBuilder::addInitialAliasAnalysisPasses(
- legacy::PassManagerBase &PM) const {
- switch (UseCFLAA) {
+cl::opt<AttributorRunOption> AttributorRun(
+ "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
+ cl::desc("Enable the attributor inter-procedural deduction pass."),
+ cl::values(clEnumValN(AttributorRunOption::ALL, "all",
+ "enable all attributor runs"),
+ clEnumValN(AttributorRunOption::MODULE, "module",
+ "enable module-wide attributor runs"),
+ clEnumValN(AttributorRunOption::CGSCC, "cgscc",
+ "enable call graph SCC attributor runs"),
+ clEnumValN(AttributorRunOption::NONE, "none",
+ "disable attributor runs")));
+
+extern cl::opt<bool> EnableKnowledgeRetention;
+
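In a standalone tool that links this library, the cl::opt declarations above become ordinary command-line flags once cl::ParseCommandLineOptions has run. A minimal sketch follows (the tool name and arguments are illustrative, not part of this file):

#include "llvm/Support/CommandLine.h"

int main(int argc, char **argv) {
  // e.g.: mytool -enable-matrix -enable-constraint-elimination foo.ll
  llvm::cl::ParseCommandLineOptions(argc, argv, "pipeline flag demo\n");
  return 0;
}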
+PassManagerBuilder::PassManagerBuilder() {
+ OptLevel = 2;
+ SizeLevel = 0;
+ LibraryInfo = nullptr;
+ Inliner = nullptr;
+ DisableUnrollLoops = false;
+ SLPVectorize = false;
+ LoopVectorize = true;
+ LoopsInterleaved = true;
+ RerollLoops = RunLoopRerolling;
+ NewGVN = RunNewGVN;
+ LicmMssaOptCap = SetLicmMssaOptCap;
+ LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
+ DisableGVNLoadPRE = false;
+ ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
+ VerifyInput = false;
+ VerifyOutput = false;
+ MergeFunctions = false;
+ PrepareForLTO = false;
+ EnablePGOInstrGen = false;
+ EnablePGOCSInstrGen = false;
+ EnablePGOCSInstrUse = false;
+ PGOInstrGen = "";
+ PGOInstrUse = "";
+ PGOSampleUse = "";
+ PrepareForThinLTO = EnablePrepareForThinLTO;
+ PerformThinLTO = EnablePerformThinLTO;
+ DivergentTarget = false;
+ CallGraphProfile = true;
+}
+
+PassManagerBuilder::~PassManagerBuilder() {
+ delete LibraryInfo;
+ delete Inliner;
+}
+
+/// Set of global extensions, automatically added as part of the standard set.
+static ManagedStatic<
+ SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy,
+ PassManagerBuilder::ExtensionFn,
+ PassManagerBuilder::GlobalExtensionID>,
+ 8>>
+ GlobalExtensions;
+static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter;
+
+/// Check if GlobalExtensions is constructed and not empty.
+/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
+/// the construction of the object.
+static bool GlobalExtensionsNotEmpty() {
+ return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
+}
+
+PassManagerBuilder::GlobalExtensionID
+PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty,
+ PassManagerBuilder::ExtensionFn Fn) {
+ auto ExtensionID = GlobalExtensionsCounter++;
+ GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID));
+ return ExtensionID;
+}
+
+void PassManagerBuilder::removeGlobalExtension(
+ PassManagerBuilder::GlobalExtensionID ExtensionID) {
+ // RegisterStandardPasses may try to call this function after GlobalExtensions
+ // has already been destroyed; doing so should not generate an error.
+ if (!GlobalExtensions.isConstructed())
+ return;
+
+ auto GlobalExtension =
+ llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) {
+ return std::get<2>(elem) == ExtensionID;
+ });
+ assert(GlobalExtension != GlobalExtensions->end() &&
+ "The extension ID to be removed should always be valid.");
+
+ GlobalExtensions->erase(GlobalExtension);
+}
+
+void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
+ Extensions.push_back(std::make_pair(Ty, std::move(Fn)));
+}
+
+void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
+ legacy::PassManagerBase &PM) const {
+ if (GlobalExtensionsNotEmpty()) {
+ for (auto &Ext : *GlobalExtensions) {
+ if (std::get<0>(Ext) == ETy)
+ std::get<1>(Ext)(*this, PM);
+ }
+ }
+ for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
+ if (Extensions[i].first == ETy)
+ Extensions[i].second(*this, PM);
+}
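A minimal sketch of how out-of-tree code typically feeds these extension points: RegisterStandardPasses (declared next to PassManagerBuilder) wraps addGlobalExtension() and keeps the returned GlobalExtensionID so the callback can be unregistered again on destruction. createMyPeepholePass is a hypothetical factory used only for illustration.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

static void addMyPeephole(const llvm::PassManagerBuilder &Builder,
                          llvm::legacy::PassManagerBase &PM) {
  PM.add(createMyPeepholePass()); // hypothetical pass factory
}

static llvm::RegisterStandardPasses
    RegisterMyPeephole(llvm::PassManagerBuilder::EP_Peephole, addMyPeephole);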
+
+void PassManagerBuilder::addInitialAliasAnalysisPasses(
+ legacy::PassManagerBase &PM) const {
+ switch (UseCFLAA) {
case ::CFLAAType::Steensgaard:
- PM.add(createCFLSteensAAWrapperPass());
- break;
+ PM.add(createCFLSteensAAWrapperPass());
+ break;
case ::CFLAAType::Andersen:
- PM.add(createCFLAndersAAWrapperPass());
- break;
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
case ::CFLAAType::Both:
- PM.add(createCFLSteensAAWrapperPass());
- PM.add(createCFLAndersAAWrapperPass());
- break;
- default:
- break;
- }
-
- // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
- // BasicAliasAnalysis wins if they disagree. This is intended to help
- // support "obvious" type-punning idioms.
- PM.add(createTypeBasedAAWrapperPass());
- PM.add(createScopedNoAliasAAWrapperPass());
-}
-
-void PassManagerBuilder::populateFunctionPassManager(
- legacy::FunctionPassManager &FPM) {
- addExtensionsToPM(EP_EarlyAsPossible, FPM);
- FPM.add(createEntryExitInstrumenterPass());
-
- // Add LibraryInfo if we have some.
- if (LibraryInfo)
- FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
+ PM.add(createCFLSteensAAWrapperPass());
+ PM.add(createCFLAndersAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ PM.add(createTypeBasedAAWrapperPass());
+ PM.add(createScopedNoAliasAAWrapperPass());
+}
+
+void PassManagerBuilder::populateFunctionPassManager(
+ legacy::FunctionPassManager &FPM) {
+ addExtensionsToPM(EP_EarlyAsPossible, FPM);
+ FPM.add(createEntryExitInstrumenterPass());
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ FPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
// The backends do not handle matrix intrinsics currently.
// Make sure they are also lowered in O0.
// FIXME: A lightweight version of the pass should run in the backend
@@ -313,34 +313,34 @@ void PassManagerBuilder::populateFunctionPassManager(
if (EnableMatrix && OptLevel == 0)
FPM.add(createLowerMatrixIntrinsicsMinimalPass());
- if (OptLevel == 0) return;
-
- addInitialAliasAnalysisPasses(FPM);
-
- FPM.add(createCFGSimplificationPass());
- FPM.add(createSROAPass());
- FPM.add(createEarlyCSEPass());
- FPM.add(createLowerExpectIntrinsicPass());
-}
-
-// Do PGO instrumentation generation or use pass as the option specified.
-void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
- bool IsCS = false) {
- if (IsCS) {
- if (!EnablePGOCSInstrGen && !EnablePGOCSInstrUse)
- return;
- } else if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
- return;
-
- // Perform the preinline and cleanup passes for O1 and above.
- // We will not do this inline for context sensitive PGO (when IsCS is true).
+ if (OptLevel == 0) return;
+
+ addInitialAliasAnalysisPasses(FPM);
+
+ FPM.add(createCFGSimplificationPass());
+ FPM.add(createSROAPass());
+ FPM.add(createEarlyCSEPass());
+ FPM.add(createLowerExpectIntrinsicPass());
+}
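A minimal sketch (hypothetical driver code, assuming a Module M has already been parsed) of how a frontend usually drives the populate*() entry points defined in this file:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"

static void runLegacyPipeline(llvm::Module &M) {
  llvm::PassManagerBuilder PMB;
  PMB.OptLevel = 2;
  PMB.SizeLevel = 0;
  PMB.Inliner = llvm::createFunctionInliningPass(
      PMB.OptLevel, PMB.SizeLevel, /*DisableInlineHotCallSite=*/false);

  llvm::legacy::FunctionPassManager FPM(&M);
  llvm::legacy::PassManager MPM;
  PMB.populateFunctionPassManager(FPM);
  PMB.populateModulePassManager(MPM);

  FPM.doInitialization();
  for (llvm::Function &F : M)
    FPM.run(F);
  FPM.doFinalization();
  MPM.run(M);
}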
+
+// Add the PGO instrumentation generation or use passes, as specified by the options.
+void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
+ bool IsCS = false) {
+ if (IsCS) {
+ if (!EnablePGOCSInstrGen && !EnablePGOCSInstrUse)
+ return;
+ } else if (!EnablePGOInstrGen && PGOInstrUse.empty() && PGOSampleUse.empty())
+ return;
+
+ // Perform the preinline and cleanup passes for O1 and above.
+ // We will not do this inline for context sensitive PGO (when IsCS is true).
if (OptLevel > 0 && !DisablePreInliner && PGOSampleUse.empty() && !IsCS) {
- // Create preinline pass. We construct an InlineParams object and specify
- // the threshold here to avoid the command line options of the regular
- // inliner to influence pre-inlining. The only fields of InlineParams we
- // care about are DefaultThreshold and HintThreshold.
- InlineParams IP;
- IP.DefaultThreshold = PreInlineThreshold;
+ // Create preinline pass. We construct an InlineParams object and specify
+ // the threshold here to prevent the command line options of the regular
+ // inliner from influencing pre-inlining. The only fields of InlineParams we
+ // care about are DefaultThreshold and HintThreshold.
+ InlineParams IP;
+ IP.DefaultThreshold = PreInlineThreshold;
// FIXME: The hint threshold has the same value used by the regular inliner
// when not optimizing for size. This should probably be lowered after
// performance testing.
@@ -348,476 +348,476 @@ void PassManagerBuilder::addPGOInstrPasses(legacy::PassManagerBase &MPM,
// the instrumented binary unusably large. Even if PreInlineThreshold is not
// the correct threshold for -Oz, it is better than not running the preinliner.
IP.HintThreshold = SizeLevel > 0 ? PreInlineThreshold : 325;
-
- MPM.add(createFunctionInliningPass(IP));
- MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createInstructionCombiningPass()); // Combine silly seq's
- addExtensionsToPM(EP_Peephole, MPM);
- }
- if ((EnablePGOInstrGen && !IsCS) || (EnablePGOCSInstrGen && IsCS)) {
- MPM.add(createPGOInstrumentationGenLegacyPass(IsCS));
- // Add the profile lowering pass.
- InstrProfOptions Options;
- if (!PGOInstrGen.empty())
- Options.InstrProfileOutput = PGOInstrGen;
- Options.DoCounterPromotion = true;
- Options.UseBFIInPromotion = IsCS;
- MPM.add(createLoopRotatePass());
- MPM.add(createInstrProfilingLegacyPass(Options, IsCS));
- }
- if (!PGOInstrUse.empty())
- MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse, IsCS));
- // Indirect call promotion that promotes intra-module targets only.
- // For ThinLTO this is done earlier due to interactions with globalopt
- // for imported functions. We don't run this at -O0.
- if (OptLevel > 0 && !IsCS)
- MPM.add(
- createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
-}
-void PassManagerBuilder::addFunctionSimplificationPasses(
- legacy::PassManagerBase &MPM) {
- // Start of function pass.
- // Break up aggregate allocas, using SSAUpdater.
- assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
- MPM.add(createSROAPass());
- MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
- if (EnableKnowledgeRetention)
- MPM.add(createAssumeSimplifyPass());
-
- if (OptLevel > 1) {
- if (EnableGVNHoist)
- MPM.add(createGVNHoistPass());
- if (EnableGVNSink) {
- MPM.add(createGVNSinkPass());
- MPM.add(createCFGSimplificationPass());
- }
- }
-
+
+ MPM.add(createFunctionInliningPass(IP));
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createInstructionCombiningPass()); // Combine silly seq's
+ addExtensionsToPM(EP_Peephole, MPM);
+ }
+ if ((EnablePGOInstrGen && !IsCS) || (EnablePGOCSInstrGen && IsCS)) {
+ MPM.add(createPGOInstrumentationGenLegacyPass(IsCS));
+ // Add the profile lowering pass.
+ InstrProfOptions Options;
+ if (!PGOInstrGen.empty())
+ Options.InstrProfileOutput = PGOInstrGen;
+ Options.DoCounterPromotion = true;
+ Options.UseBFIInPromotion = IsCS;
+ MPM.add(createLoopRotatePass());
+ MPM.add(createInstrProfilingLegacyPass(Options, IsCS));
+ }
+ if (!PGOInstrUse.empty())
+ MPM.add(createPGOInstrumentationUseLegacyPass(PGOInstrUse, IsCS));
+ // Indirect call promotion that promotes intra-module targets only.
+ // For ThinLTO this is done earlier due to interactions with globalopt
+ // for imported functions. We don't run this at -O0.
+ if (OptLevel > 0 && !IsCS)
+ MPM.add(
+ createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
+}
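A hedged sketch of how a driver might set the builder fields this function consumes; the profile file names are purely illustrative.

static void configurePGO(llvm::PassManagerBuilder &PMB, bool Instrument) {
  if (Instrument) {
    PMB.EnablePGOInstrGen = true;
    PMB.PGOInstrGen = "default_%m.profraw"; // illustrative raw-profile output
  } else {
    PMB.PGOInstrUse = "merged.profdata";    // illustrative .profdata path
  }
}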
+void PassManagerBuilder::addFunctionSimplificationPasses(
+ legacy::PassManagerBase &MPM) {
+ // Start of function pass.
+ // Break up aggregate allocas, using SSAUpdater.
+ assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
+ MPM.add(createSROAPass());
+ MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
+ if (EnableKnowledgeRetention)
+ MPM.add(createAssumeSimplifyPass());
+
+ if (OptLevel > 1) {
+ if (EnableGVNHoist)
+ MPM.add(createGVNHoistPass());
+ if (EnableGVNSink) {
+ MPM.add(createGVNSinkPass());
+ MPM.add(createCFGSimplificationPass());
+ }
+ }
+
if (EnableConstraintElimination)
MPM.add(createConstraintEliminationPass());
- if (OptLevel > 1) {
- // Speculative execution if the target has divergent branches; otherwise nop.
- MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
-
- MPM.add(createJumpThreadingPass()); // Thread jumps.
- MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
- }
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- // Combine silly seq's
- if (OptLevel > 2)
- MPM.add(createAggressiveInstCombinerPass());
- MPM.add(createInstructionCombiningPass());
- if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
- MPM.add(createLibCallsShrinkWrapPass());
- addExtensionsToPM(EP_Peephole, MPM);
-
- // Optimize memory intrinsic calls based on the profiled size information.
- if (SizeLevel == 0)
- MPM.add(createPGOMemOPSizeOptLegacyPass());
-
- // TODO: Investigate the cost/benefit of tail call elimination on debugging.
- if (OptLevel > 1)
- MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- MPM.add(createReassociatePass()); // Reassociate expressions
-
- // Begin the loop pass pipeline.
- if (EnableSimpleLoopUnswitch) {
- // The simple loop unswitch pass relies on separate cleanup passes. Schedule
- // them first so when we re-process a loop they run before other loop
- // passes.
- MPM.add(createLoopInstSimplifyPass());
- MPM.add(createLoopSimplifyCFGPass());
- }
- // Rotate Loop - disable header duplication at -Oz
+ if (OptLevel > 1) {
+ // Speculative execution if the target has divergent branches; otherwise nop.
+ MPM.add(createSpeculativeExecutionIfHasBranchDivergencePass());
+
+ MPM.add(createJumpThreadingPass()); // Thread jumps.
+ MPM.add(createCorrelatedValuePropagationPass()); // Propagate conditionals
+ }
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Combine silly seq's
+ if (OptLevel > 2)
+ MPM.add(createAggressiveInstCombinerPass());
+ MPM.add(createInstructionCombiningPass());
+ if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
+ MPM.add(createLibCallsShrinkWrapPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+
+ // Optimize memory intrinsic calls based on the profiled size information.
+ if (SizeLevel == 0)
+ MPM.add(createPGOMemOPSizeOptLegacyPass());
+
+ // TODO: Investigate the cost/benefit of tail call elimination on debugging.
+ if (OptLevel > 1)
+ MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ MPM.add(createReassociatePass()); // Reassociate expressions
+
+ // Begin the loop pass pipeline.
+ if (EnableSimpleLoopUnswitch) {
+ // The simple loop unswitch pass relies on separate cleanup passes. Schedule
+ // them first so when we re-process a loop they run before other loop
+ // passes.
+ MPM.add(createLoopInstSimplifyPass());
+ MPM.add(createLoopSimplifyCFGPass());
+ }
+ // Rotate Loop - disable header duplication at -Oz
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
- // TODO: Investigate promotion cap for O1.
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- if (EnableSimpleLoopUnswitch)
- MPM.add(createSimpleLoopUnswitchLegacyPass());
- else
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
- // FIXME: We break the loop pass pipeline here in order to do full
- // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
- // need for this.
- MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
- // We resume loop passes creating a second loop pipeline here.
+ // TODO: Investigate promotion cap for O1.
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ if (EnableSimpleLoopUnswitch)
+ MPM.add(createSimpleLoopUnswitchLegacyPass());
+ else
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ // FIXME: We break the loop pass pipeline here in order to do full
+ // simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
+ // need for this.
+ MPM.add(createCFGSimplificationPass());
+ MPM.add(createInstructionCombiningPass());
+ // We resume loop passes creating a second loop pipeline here.
if (EnableLoopFlatten) {
MPM.add(createLoopFlattenPass()); // Flatten loops
MPM.add(createLoopSimplifyCFGPass());
}
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
- MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
- addExtensionsToPM(EP_LateLoopOptimizations, MPM);
- MPM.add(createLoopDeletionPass()); // Delete dead loops
-
- if (EnableLoopInterchange)
- MPM.add(createLoopInterchangePass()); // Interchange loops
-
+ MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
+ addExtensionsToPM(EP_LateLoopOptimizations, MPM);
+ MPM.add(createLoopDeletionPass()); // Delete dead loops
+
+ if (EnableLoopInterchange)
+ MPM.add(createLoopInterchangePass()); // Interchange loops
+
// Unroll small loops and perform peeling.
- MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
- addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
- // This ends the loop pass pipelines.
-
+ MPM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+ addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
+ // This ends the loop pass pipelines.
+
// Break up allocas that may now be splittable after loop unrolling.
MPM.add(createSROAPass());
- if (OptLevel > 1) {
- MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
- MPM.add(NewGVN ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
- }
- MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
- MPM.add(createSCCPPass()); // Constant prop with SCCP
-
+ if (OptLevel > 1) {
+ MPM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds
+ MPM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies
+ }
+ MPM.add(createMemCpyOptPass()); // Remove memcpy / form memset
+ MPM.add(createSCCPPass()); // Constant prop with SCCP
+
if (EnableConstraintElimination)
MPM.add(createConstraintEliminationPass());
- // Delete dead bit computations (instcombine runs after to fold away the dead
- // computations, and then ADCE will run later to exploit any new DCE
- // opportunities that creates).
- MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations
-
- // Run instcombine after redundancy elimination to exploit opportunities
- // opened up by them.
- MPM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, MPM);
- if (OptLevel > 1) {
- MPM.add(createJumpThreadingPass()); // Thread jumps
- MPM.add(createCorrelatedValuePropagationPass());
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ MPM.add(createBitTrackingDCEPass()); // Delete dead bit computations
+
+ // Run instcombine after redundancy elimination to exploit opportunities
+ // opened up by them.
+ MPM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+ if (OptLevel > 1) {
+ MPM.add(createJumpThreadingPass()); // Thread jumps
+ MPM.add(createCorrelatedValuePropagationPass());
}
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
// TODO: Investigate if this is too expensive at O1.
if (OptLevel > 1) {
- MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
-
- if (RerollLoops)
- MPM.add(createLoopRerollPass());
-
- MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
- // Clean up after everything.
- MPM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, MPM);
-
- if (EnableCHR && OptLevel >= 3 &&
- (!PGOInstrUse.empty() || !PGOSampleUse.empty() || EnablePGOCSInstrGen))
- MPM.add(createControlHeightReductionLegacyPass());
-}
-
-void PassManagerBuilder::populateModulePassManager(
- legacy::PassManagerBase &MPM) {
- // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
- // is handled separately, so just check this is not the ThinLTO post-link.
- bool DefaultOrPreLinkPipeline = !PerformThinLTO;
-
+ MPM.add(createDeadStoreEliminationPass()); // Delete dead stores
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ addExtensionsToPM(EP_ScalarOptimizerLate, MPM);
+
+ if (RerollLoops)
+ MPM.add(createLoopRerollPass());
+
+ MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
+ // Clean up after everything.
+ MPM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, MPM);
+
+ if (EnableCHR && OptLevel >= 3 &&
+ (!PGOInstrUse.empty() || !PGOSampleUse.empty() || EnablePGOCSInstrGen))
+ MPM.add(createControlHeightReductionLegacyPass());
+}
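The hooks used above (EP_Peephole, EP_LateLoopOptimizations, EP_LoopOptimizerEnd, EP_ScalarOptimizerLate) can also be attached per-builder rather than globally, via addExtension(). A hedged sketch, where createMyLoopAnalysisPrinter is a hypothetical factory:

static void hookLoopOptimizerEnd(llvm::PassManagerBuilder &PMB) {
  PMB.addExtension(llvm::PassManagerBuilder::EP_LoopOptimizerEnd,
                   [](const llvm::PassManagerBuilder &,
                      llvm::legacy::PassManagerBase &PM) {
                     PM.add(createMyLoopAnalysisPrinter()); // hypothetical pass
                   });
}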
+
+void PassManagerBuilder::populateModulePassManager(
+ legacy::PassManagerBase &MPM) {
+ // Whether this is a default or *LTO pre-link pipeline. The FullLTO post-link
+ // is handled separately, so just check this is not the ThinLTO post-link.
+ bool DefaultOrPreLinkPipeline = !PerformThinLTO;
+
MPM.add(createAnnotation2MetadataLegacyPass());
- if (!PGOSampleUse.empty()) {
- MPM.add(createPruneEHPass());
- // In ThinLTO mode, when flattened profile is used, all the available
- // profile information will be annotated in PreLink phase so there is
- // no need to load the profile again in PostLink.
- if (!(FlattenedProfileUsed && PerformThinLTO))
- MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
- }
-
- // Allow forcing function attributes as a debugging and tuning aid.
- MPM.add(createForceFunctionAttrsLegacyPass());
-
- // If all optimizations are disabled, just run the always-inline pass and,
- // if enabled, the function merging pass.
- if (OptLevel == 0) {
- addPGOInstrPasses(MPM);
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
- }
-
- // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
- // creates a CGSCC pass manager, but we don't want to add extensions into
- // that pass manager. To prevent this we insert a no-op module pass to reset
- // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
- // builds. The function merging pass is
- if (MergeFunctions)
- MPM.add(createMergeFunctionsPass());
- else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
- MPM.add(createBarrierNoopPass());
-
- if (PerformThinLTO) {
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- // Drop available_externally and unreferenced globals. This is necessary
- // with ThinLTO in order to avoid leaving undefined references to dead
- // globals in the object file.
- MPM.add(createEliminateAvailableExternallyPass());
- MPM.add(createGlobalDCEPass());
- }
-
- addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
-
- if (PrepareForLTO || PrepareForThinLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- // This has to be done after we add the extensions to the pass manager
- // as there could be passes (e.g. Adddress sanitizer) which introduce
- // new unnamed globals.
- MPM.add(createNameAnonGlobalPass());
- }
+ if (!PGOSampleUse.empty()) {
+ MPM.add(createPruneEHPass());
+ // In ThinLTO mode, when a flattened profile is used, all the available
+ // profile information will be annotated in the PreLink phase, so there is
+ // no need to load the profile again in the PostLink phase.
+ if (!(FlattenedProfileUsed && PerformThinLTO))
+ MPM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ MPM.add(createForceFunctionAttrsLegacyPass());
+
+ // If all optimizations are disabled, just run the always-inline pass and,
+ // if enabled, the function merging pass.
+ if (OptLevel == 0) {
+ addPGOInstrPasses(MPM);
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ // FIXME: The BarrierNoopPass is a HACK! The inliner pass above implicitly
+ // creates a CGSCC pass manager, but we don't want to add extensions into
+ // that pass manager. To prevent this we insert a no-op module pass to reset
+ // the pass manager to get the same behavior as EP_OptimizerLast in non-O0
+ // builds. The function merging pass is
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+ else if (GlobalExtensionsNotEmpty() || !Extensions.empty())
+ MPM.add(createBarrierNoopPass());
+
+ if (PerformThinLTO) {
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+ // Drop available_externally and unreferenced globals. This is necessary
+ // with ThinLTO in order to avoid leaving undefined references to dead
+ // globals in the object file.
+ MPM.add(createEliminateAvailableExternallyPass());
+ MPM.add(createGlobalDCEPass());
+ }
+
+ addExtensionsToPM(EP_EnabledOnOptLevel0, MPM);
+
+ if (PrepareForLTO || PrepareForThinLTO) {
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to export them in the summary.
+ // This has to be done after we add the extensions to the pass manager
+ // as there could be passes (e.g. Address Sanitizer) which introduce
+ // new unnamed globals.
+ MPM.add(createNameAnonGlobalPass());
+ }
MPM.add(createAnnotationRemarksLegacyPass());
- return;
- }
-
- // Add LibraryInfo if we have some.
- if (LibraryInfo)
- MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- addInitialAliasAnalysisPasses(MPM);
-
- // For ThinLTO there are two passes of indirect call promotion. The
- // first is during the compile phase when PerformThinLTO=false and
- // intra-module indirect call targets are promoted. The second is during
- // the ThinLTO backend when PerformThinLTO=true, when we promote imported
- // inter-module indirect calls. For that we perform indirect call promotion
- // earlier in the pass pipeline, here before globalopt. Otherwise imported
- // available_externally functions look unreferenced and are removed.
- if (PerformThinLTO) {
- MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
- !PGOSampleUse.empty()));
- MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
- }
-
- // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
- // as it will change the CFG too much to make the 2nd profile annotation
- // in backend more difficult.
- bool PrepareForThinLTOUsingPGOSampleProfile =
- PrepareForThinLTO && !PGOSampleUse.empty();
- if (PrepareForThinLTOUsingPGOSampleProfile)
- DisableUnrollLoops = true;
-
- // Infer attributes about declarations if possible.
- MPM.add(createInferFunctionAttrsLegacyPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- if (AttributorRun & AttributorRunOption::MODULE)
- MPM.add(createAttributorLegacyPass());
-
- addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
-
- if (OptLevel > 2)
- MPM.add(createCallSiteSplittingPass());
-
- MPM.add(createIPSCCPPass()); // IP SCCP
- MPM.add(createCalledValuePropagationPass());
-
- MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
- // Promote any localized global vars.
- MPM.add(createPromoteMemoryToRegisterPass());
-
- MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
-
- MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
- addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
-
- // For SamplePGO in ThinLTO compile phase, we do not want to do indirect
- // call promotion as it will change the CFG too much to make the 2nd
- // profile annotation in backend more difficult.
- // PGO instrumentation is added during the compile phase for ThinLTO, do
- // not run it a second time
- if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile)
- addPGOInstrPasses(MPM);
-
- // Create profile COMDAT variables. Lld linker wants to see all variables
- // before the LTO/ThinLTO link since it needs to resolve symbols/comdats.
- if (!PerformThinLTO && EnablePGOCSInstrGen)
- MPM.add(createPGOInstrumentationGenCreateVarLegacyPass(PGOInstrGen));
-
- // We add a module alias analysis pass here. In part due to bugs in the
- // analysis infrastructure this "works" in that the analysis stays alive
- // for the entire SCC pass run below.
- MPM.add(createGlobalsAAWrapperPass());
-
- // Start of CallGraph SCC passes.
- MPM.add(createPruneEHPass()); // Remove dead EH info
- bool RunInliner = false;
- if (Inliner) {
- MPM.add(Inliner);
- Inliner = nullptr;
- RunInliner = true;
- }
-
- // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
- if (AttributorRun & AttributorRunOption::CGSCC)
- MPM.add(createAttributorCGSCCLegacyPass());
-
- // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
- // there are no OpenMP runtime calls present in the module.
- if (OptLevel > 1)
- MPM.add(createOpenMPOptLegacyPass());
-
- MPM.add(createPostOrderFunctionAttrsLegacyPass());
- if (OptLevel > 2)
- MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
-
- addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
- addFunctionSimplificationPasses(MPM);
-
- // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
- // pass manager that we are specifically trying to avoid. To prevent this
- // we must insert a no-op module pass to reset the pass manager.
- MPM.add(createBarrierNoopPass());
-
- if (RunPartialInlining)
- MPM.add(createPartialInliningPass());
-
- if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
- // Remove avail extern fns and globals definitions if we aren't
- // compiling an object file for later LTO. For LTO we want to preserve
- // these so they are eligible for inlining at link-time. Note if they
- // are unreferenced they will be removed by GlobalDCE later, so
- // this only impacts referenced available externally globals.
- // Eventually they will be suppressed during codegen, but eliminating
- // here enables more opportunity for GlobalDCE as it may make
- // globals referenced by available external functions dead
- // and saves running remaining passes on the eliminated functions.
- MPM.add(createEliminateAvailableExternallyPass());
-
- // CSFDO instrumentation and use pass. Don't invoke this for Prepare pass
- // for LTO and ThinLTO -- The actual pass will be called after all inlines
- // are performed.
- // Need to do this after COMDAT variables have been eliminated,
- // (i.e. after EliminateAvailableExternallyPass).
- if (!(PrepareForLTO || PrepareForThinLTO))
- addPGOInstrPasses(MPM, /* IsCS */ true);
-
- if (EnableOrderFileInstrumentation)
- MPM.add(createInstrOrderFilePass());
-
- MPM.add(createReversePostOrderFunctionAttrsPass());
-
- // The inliner performs some kind of dead code elimination as it goes,
- // but there are cases that are not really caught by it. We might
- // at some point consider teaching the inliner about them, but it
- // is OK for now to run GlobalOpt + GlobalDCE in tandem as their
- // benefits generally outweight the cost, making the whole pipeline
- // faster.
- if (RunInliner) {
- MPM.add(createGlobalOptimizerPass());
- MPM.add(createGlobalDCEPass());
- }
-
- // If we are planning to perform ThinLTO later, let's not bloat the code with
- // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
- // during ThinLTO and perform the rest of the optimizations afterward.
- if (PrepareForThinLTO) {
- // Ensure we perform any last passes, but do so before renaming anonymous
- // globals in case the passes add any.
- addExtensionsToPM(EP_OptimizerLast, MPM);
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to export them in the summary.
- MPM.add(createNameAnonGlobalPass());
- return;
- }
-
- if (PerformThinLTO)
- // Optimize globals now when performing ThinLTO, this enables more
- // optimizations later.
- MPM.add(createGlobalOptimizerPass());
-
- // Scheduling LoopVersioningLICM when inlining is over, because after that
- // we may see more accurate aliasing. Reason to run this late is that too
- // early versioning may prevent further inlining due to increase of code
- // size. By placing it just after inlining other optimizations which runs
- // later might get benefit of no-alias assumption in clone loop.
- if (UseLoopVersioningLICM) {
- MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- // We add a fresh GlobalsModRef run at this point. This is particularly
- // useful as the above will have inlined, DCE'ed, and function-attr
- // propagated everything. We should at this point have a reasonably minimal
- // and richly annotated call graph. By computing aliasing and mod/ref
- // information for all local globals here, the late loop passes and notably
- // the vectorizer will be able to use them to help recognize vectorizable
- // memory operations.
- //
- // Note that this relies on a bug in the pass manager which preserves
- // a module analysis into a function pass pipeline (and throughout it) so
- // long as the first function pass doesn't invalidate the module analysis.
- // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
- // this to work. Fortunately, it is trivial to preserve AliasAnalysis
- // (doing nothing preserves it as it is required to be conservatively
- // correct in the face of IR changes).
- MPM.add(createGlobalsAAWrapperPass());
-
- MPM.add(createFloat2IntPass());
- MPM.add(createLowerConstantIntrinsicsPass());
-
- if (EnableMatrix) {
- MPM.add(createLowerMatrixIntrinsicsPass());
- // CSE the pointer arithmetic of the column vectors. This allows alias
- // analysis to establish no-aliasing between loads and stores of different
- // columns of the same matrix.
- MPM.add(createEarlyCSEPass(false));
- }
-
- addExtensionsToPM(EP_VectorizerStart, MPM);
-
- // Re-rotate loops in all our loop nests. These may have fallout out of
- // rotated form due to GVN or other transformations, and the vectorizer relies
- // on the rotated form. Disable header duplication at -Oz.
+ return;
+ }
+
+ // Add LibraryInfo if we have some.
+ if (LibraryInfo)
+ MPM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ addInitialAliasAnalysisPasses(MPM);
+
+ // For ThinLTO there are two passes of indirect call promotion. The
+ // first is during the compile phase when PerformThinLTO=false and
+ // intra-module indirect call targets are promoted. The second is during
+ // the ThinLTO backend when PerformThinLTO=true, when we promote imported
+ // inter-module indirect calls. For that we perform indirect call promotion
+ // earlier in the pass pipeline, here before globalopt. Otherwise imported
+ // available_externally functions look unreferenced and are removed.
+ if (PerformThinLTO) {
+ MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+ !PGOSampleUse.empty()));
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+ }
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to unroll
+ // loops, as that would change the CFG too much and make the 2nd profile
+ // annotation in the backend more difficult.
+ bool PrepareForThinLTOUsingPGOSampleProfile =
+ PrepareForThinLTO && !PGOSampleUse.empty();
+ if (PrepareForThinLTOUsingPGOSampleProfile)
+ DisableUnrollLoops = true;
+
+ // Infer attributes about declarations if possible.
+ MPM.add(createInferFunctionAttrsLegacyPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ if (AttributorRun & AttributorRunOption::MODULE)
+ MPM.add(createAttributorLegacyPass());
+
+ addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
+
+ if (OptLevel > 2)
+ MPM.add(createCallSiteSplittingPass());
+
+ MPM.add(createIPSCCPPass()); // IP SCCP
+ MPM.add(createCalledValuePropagationPass());
+
+ MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
+ // Promote any localized global vars.
+ MPM.add(createPromoteMemoryToRegisterPass());
+
+ MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
+
+ MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
+
+ // For SamplePGO in the ThinLTO compile phase, we do not want to do indirect
+ // call promotion, as that would change the CFG too much and make the 2nd
+ // profile annotation in the backend more difficult.
+ // PGO instrumentation is added during the compile phase for ThinLTO; do
+ // not run it a second time.
+ if (DefaultOrPreLinkPipeline && !PrepareForThinLTOUsingPGOSampleProfile)
+ addPGOInstrPasses(MPM);
+
+ // Create profile COMDAT variables. Lld linker wants to see all variables
+ // before the LTO/ThinLTO link since it needs to resolve symbols/comdats.
+ if (!PerformThinLTO && EnablePGOCSInstrGen)
+ MPM.add(createPGOInstrumentationGenCreateVarLegacyPass(PGOInstrGen));
+
+ // We add a module alias analysis pass here. In part due to bugs in the
+ // analysis infrastructure this "works" in that the analysis stays alive
+ // for the entire SCC pass run below.
+ MPM.add(createGlobalsAAWrapperPass());
+
+ // Start of CallGraph SCC passes.
+ MPM.add(createPruneEHPass()); // Remove dead EH info
+ bool RunInliner = false;
+ if (Inliner) {
+ MPM.add(Inliner);
+ Inliner = nullptr;
+ RunInliner = true;
+ }
+
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ MPM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ MPM.add(createOpenMPOptLegacyPass());
+
+ MPM.add(createPostOrderFunctionAttrsLegacyPass());
+ if (OptLevel > 2)
+ MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
+
+ addExtensionsToPM(EP_CGSCCOptimizerLate, MPM);
+ addFunctionSimplificationPasses(MPM);
+
+ // FIXME: This is a HACK! The inliner pass above implicitly creates a CGSCC
+ // pass manager that we are specifically trying to avoid. To prevent this
+ // we must insert a no-op module pass to reset the pass manager.
+ MPM.add(createBarrierNoopPass());
+
+ if (RunPartialInlining)
+ MPM.add(createPartialInliningPass());
+
+ if (OptLevel > 1 && !PrepareForLTO && !PrepareForThinLTO)
+ // Remove avail extern fns and globals definitions if we aren't
+ // compiling an object file for later LTO. For LTO we want to preserve
+ // these so they are eligible for inlining at link-time. Note if they
+ // are unreferenced they will be removed by GlobalDCE later, so
+ // this only impacts referenced available externally globals.
+ // Eventually they will be suppressed during codegen, but eliminating
+ // here enables more opportunity for GlobalDCE as it may make
+ // globals referenced by available external functions dead
+ // and saves running remaining passes on the eliminated functions.
+ MPM.add(createEliminateAvailableExternallyPass());
+
+ // CSFDO instrumentation and use pass. Don't invoke this for Prepare pass
+ // for LTO and ThinLTO -- The actual pass will be called after all inlines
+ // are performed.
+ // Need to do this after COMDAT variables have been eliminated,
+ // (i.e. after EliminateAvailableExternallyPass).
+ if (!(PrepareForLTO || PrepareForThinLTO))
+ addPGOInstrPasses(MPM, /* IsCS */ true);
+
+ if (EnableOrderFileInstrumentation)
+ MPM.add(createInstrOrderFilePass());
+
+ MPM.add(createReversePostOrderFunctionAttrsPass());
+
+ // The inliner performs some kind of dead code elimination as it goes,
+ // but there are cases that are not really caught by it. We might
+ // at some point consider teaching the inliner about them, but it
+ // is OK for now to run GlobalOpt + GlobalDCE in tandem as their
+ // benefits generally outweigh the cost, making the whole pipeline
+ // faster.
+ if (RunInliner) {
+ MPM.add(createGlobalOptimizerPass());
+ MPM.add(createGlobalDCEPass());
+ }
+
+ // If we are planning to perform ThinLTO later, let's not bloat the code with
+ // unrolling/vectorization/... now. We'll first run the inliner + CGSCC passes
+ // during ThinLTO and perform the rest of the optimizations afterward.
+ if (PrepareForThinLTO) {
+ // Ensure we perform any last passes, but do so before renaming anonymous
+ // globals in case the passes add any.
+ addExtensionsToPM(EP_OptimizerLast, MPM);
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to export them in the summary.
+ MPM.add(createNameAnonGlobalPass());
+ return;
+ }
+
+ if (PerformThinLTO)
+ // Optimize globals now when performing ThinLTO, this enables more
+ // optimizations later.
+ MPM.add(createGlobalOptimizerPass());
+
+ // Schedule LoopVersioningLICM once inlining is over, because after that we
+ // may see more accurate aliasing. The reason to run this late is that too
+ // early versioning may prevent further inlining due to the increase in code
+ // size. By placing it just after inlining, later optimizations can benefit
+ // from the no-alias assumption in the cloned loop.
+ if (UseLoopVersioningLICM) {
+ MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ // We add a fresh GlobalsModRef run at this point. This is particularly
+ // useful as the above will have inlined, DCE'ed, and function-attr
+ // propagated everything. We should at this point have a reasonably minimal
+ // and richly annotated call graph. By computing aliasing and mod/ref
+ // information for all local globals here, the late loop passes and notably
+ // the vectorizer will be able to use them to help recognize vectorizable
+ // memory operations.
+ //
+ // Note that this relies on a bug in the pass manager which preserves
+ // a module analysis into a function pass pipeline (and throughout it) so
+ // long as the first function pass doesn't invalidate the module analysis.
+ // Thus both Float2Int and LoopRotate have to preserve AliasAnalysis for
+ // this to work. Fortunately, it is trivial to preserve AliasAnalysis
+ // (doing nothing preserves it as it is required to be conservatively
+ // correct in the face of IR changes).
+ MPM.add(createGlobalsAAWrapperPass());
+
+ MPM.add(createFloat2IntPass());
+ MPM.add(createLowerConstantIntrinsicsPass());
+
+ if (EnableMatrix) {
+ MPM.add(createLowerMatrixIntrinsicsPass());
+ // CSE the pointer arithmetic of the column vectors. This allows alias
+ // analysis to establish no-aliasing between loads and stores of different
+ // columns of the same matrix.
+ MPM.add(createEarlyCSEPass(false));
+ }
+
+ addExtensionsToPM(EP_VectorizerStart, MPM);
+
+ // Re-rotate loops in all our loop nests. These may have fallen out of
+ // rotated form due to GVN or other transformations, and the vectorizer relies
+ // on the rotated form. Disable header duplication at -Oz.
MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO));
-
- // Distribute loops to allow partial vectorization. I.e. isolate dependences
- // into separate loop that would otherwise inhibit vectorization. This is
- // currently only performed for loops marked with the metadata
- // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
- MPM.add(createLoopDistributePass());
-
- MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
-
- // Eliminate loads by forwarding stores from the previous iteration to loads
- // of the current iteration.
- MPM.add(createLoopLoadEliminationPass());
-
- // FIXME: Because of #pragma vectorize enable, the passes below are always
- // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
- // on -O1 and no #pragma is found). Would be good to have these two passes
- // as function calls, so that we can only pass them when the vectorizer
- // changed the code.
- MPM.add(createInstructionCombiningPass());
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- // At higher optimization levels, try to clean up any runtime overlap and
- // alignment checks inserted by the vectorizer. We want to track correllated
- // runtime checks for two inner loops in the same outer loop, fold any
- // common computations, hoist loop-invariant aspects out of any outer loop,
- // and unswitch the runtime checks if possible. Once hoisted, we may have
- // dead (or speculatable) control flows or more combining opportunities.
- MPM.add(createEarlyCSEPass());
- MPM.add(createCorrelatedValuePropagationPass());
- MPM.add(createInstructionCombiningPass());
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
- MPM.add(createCFGSimplificationPass());
- MPM.add(createInstructionCombiningPass());
- }
-
- // Cleanup after loop vectorization, etc. Simplification passes like CVP and
- // GVN, loop transforms, and others have already run, so it's now better to
- // convert to more optimized IR using more aggressive simplify CFG options.
- // The extra sinking transform can create larger basic blocks, so do this
- // before SLP vectorization.
+
+ // Distribute loops to allow partial vectorization. I.e. isolate dependences
+ // into separate loop that would otherwise inhibit vectorization. This is
+ // currently only performed for loops marked with the metadata
+ // llvm.loop.distribute=true or when -enable-loop-distribute is specified.
+ MPM.add(createLoopDistributePass());
+
+ MPM.add(createLoopVectorizePass(!LoopsInterleaved, !LoopVectorize));
+
+ // Eliminate loads by forwarding stores from the previous iteration to loads
+ // of the current iteration.
+ MPM.add(createLoopLoadEliminationPass());
+
+ // FIXME: Because of #pragma vectorize enable, the passes below are always
+ // inserted in the pipeline, even when the vectorizer doesn't run (ex. when
+ // on -O1 and no #pragma is found). Would be good to have these two passes
+ // as function calls, so that we can only pass them when the vectorizer
+ // changed the code.
+ MPM.add(createInstructionCombiningPass());
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ // At higher optimization levels, try to clean up any runtime overlap and
+ // alignment checks inserted by the vectorizer. We want to track correlated
+ // runtime checks for two inner loops in the same outer loop, fold any
+ // common computations, hoist loop-invariant aspects out of any outer loop,
+ // and unswitch the runtime checks if possible. Once hoisted, we may have
+ // dead (or speculatable) control flows or more combining opportunities.
+ MPM.add(createEarlyCSEPass());
+ MPM.add(createCorrelatedValuePropagationPass());
+ MPM.add(createInstructionCombiningPass());
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
+ MPM.add(createCFGSimplificationPass());
+ MPM.add(createInstructionCombiningPass());
+ }
+
+ // Cleanup after loop vectorization, etc. Simplification passes like CVP and
+ // GVN, loop transforms, and others have already run, so it's now better to
+ // convert to more optimized IR using more aggressive simplify CFG options.
+ // The extra sinking transform can create larger basic blocks, so do this
+ // before SLP vectorization.
// FIXME: study whether hoisting and/or sinking of common instructions should
// be delayed until after SLP vectorizer.
MPM.add(createCFGSimplificationPass(SimplifyCFGOptions()
@@ -826,464 +826,464 @@ void PassManagerBuilder::populateModulePassManager(
.needCanonicalLoops(false)
.hoistCommonInsts(true)
.sinkCommonInsts(true)));
-
- if (SLPVectorize) {
- MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
- if (OptLevel > 1 && ExtraVectorizerPasses) {
- MPM.add(createEarlyCSEPass());
- }
- }
-
- // Enhance/cleanup vector code.
- MPM.add(createVectorCombinePass());
-
- addExtensionsToPM(EP_Peephole, MPM);
- MPM.add(createInstructionCombiningPass());
-
- if (EnableUnrollAndJam && !DisableUnrollLoops) {
- // Unroll and Jam. We do this before unroll but need to be in a separate
- // loop pass manager in order for the outer loop to be processed by
- // unroll and jam before the inner loop is unrolled.
- MPM.add(createLoopUnrollAndJamPass(OptLevel));
- }
-
- // Unroll small loops
- MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
-
- if (!DisableUnrollLoops) {
- // LoopUnroll may generate some redundency to cleanup.
- MPM.add(createInstructionCombiningPass());
-
- // Runtime unrolling will introduce runtime check in loop prologue. If the
- // unrolled loop is a inner loop, then the prologue will be inside the
- // outer loop. LICM pass can help to promote the runtime check out if the
- // checked value is loop invariant.
- MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- }
-
- MPM.add(createWarnMissedTransformationsPass());
-
- // After vectorization and unrolling, assume intrinsics may tell us more
- // about pointer alignments.
- MPM.add(createAlignmentFromAssumptionsPass());
-
- // FIXME: We shouldn't bother with this anymore.
- MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
-
- // GlobalOpt already deletes dead functions and globals, at -O2 try a
- // late pass of GlobalDCE. It is capable of deleting dead cycles.
- if (OptLevel > 1) {
- MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
- MPM.add(createConstantMergePass()); // Merge dup global constants
- }
-
- // See comment in the new PM for justification of scheduling splitting at
- // this stage (\ref buildModuleSimplificationPipeline).
- if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
- MPM.add(createHotColdSplittingPass());
-
+
+ if (SLPVectorize) {
+ MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ if (OptLevel > 1 && ExtraVectorizerPasses) {
+ MPM.add(createEarlyCSEPass());
+ }
+ }
+
+ // Enhance/cleanup vector code.
+ MPM.add(createVectorCombinePass());
+
+ addExtensionsToPM(EP_Peephole, MPM);
+ MPM.add(createInstructionCombiningPass());
+
+ if (EnableUnrollAndJam && !DisableUnrollLoops) {
+ // Unroll and Jam. We do this before unroll but need to be in a separate
+ // loop pass manager in order for the outer loop to be processed by
+ // unroll and jam before the inner loop is unrolled.
+ MPM.add(createLoopUnrollAndJamPass(OptLevel));
+ }
+
+ // Unroll small loops
+ MPM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+
+ if (!DisableUnrollLoops) {
+ // LoopUnroll may generate some redundancy to clean up.
+ MPM.add(createInstructionCombiningPass());
+
+ // Runtime unrolling will introduce a runtime check in the loop prologue. If
+ // the unrolled loop is an inner loop, then the prologue will be inside the
+ // outer loop. The LICM pass can help to promote the runtime check out if the
+ // checked value is loop invariant.
+ MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ }
+
+ MPM.add(createWarnMissedTransformationsPass());
+
+ // After vectorization and unrolling, assume intrinsics may tell us more
+ // about pointer alignments.
+ MPM.add(createAlignmentFromAssumptionsPass());
+
+ // FIXME: We shouldn't bother with this anymore.
+ MPM.add(createStripDeadPrototypesPass()); // Get rid of dead prototypes
+
+ // GlobalOpt already deletes dead functions and globals, at -O2 try a
+ // late pass of GlobalDCE. It is capable of deleting dead cycles.
+ if (OptLevel > 1) {
+ MPM.add(createGlobalDCEPass()); // Remove dead fns and globals.
+ MPM.add(createConstantMergePass()); // Merge dup global constants
+ }
+
+ // See comment in the new PM for justification of scheduling splitting at
+ // this stage (\ref buildModuleSimplificationPipeline).
+ if (EnableHotColdSplit && !(PrepareForLTO || PrepareForThinLTO))
+ MPM.add(createHotColdSplittingPass());
+
if (EnableIROutliner)
MPM.add(createIROutlinerPass());
- if (MergeFunctions)
- MPM.add(createMergeFunctionsPass());
-
- // Add Module flag "CG Profile" based on Branch Frequency Information.
- if (CallGraphProfile)
- MPM.add(createCGProfileLegacyPass());
-
- // LoopSink pass sinks instructions hoisted by LICM, which serves as a
- // canonicalization pass that enables other optimizations. As a result,
- // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
- // result too early.
- MPM.add(createLoopSinkPass());
- // Get rid of LCSSA nodes.
- MPM.add(createInstSimplifyLegacyPass());
-
- // This hoists/decomposes div/rem ops. It should run after other sink/hoist
- // passes to avoid re-sinking, but before SimplifyCFG because it can allow
- // flattening of blocks.
- MPM.add(createDivRemPairsPass());
-
- // LoopSink (and other loop passes since the last simplifyCFG) might have
- // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
- MPM.add(createCFGSimplificationPass());
-
- addExtensionsToPM(EP_OptimizerLast, MPM);
-
- if (PrepareForLTO) {
- MPM.add(createCanonicalizeAliasesPass());
- // Rename anon globals to be able to handle them in the summary
- MPM.add(createNameAnonGlobalPass());
- }
+ if (MergeFunctions)
+ MPM.add(createMergeFunctionsPass());
+
+ // Add Module flag "CG Profile" based on Branch Frequency Information.
+ if (CallGraphProfile)
+ MPM.add(createCGProfileLegacyPass());
+
+ // LoopSink pass sinks instructions hoisted by LICM, which serves as a
+ // canonicalization pass that enables other optimizations. As a result,
+ // LoopSink pass needs to be a very late IR pass to avoid undoing LICM
+ // result too early.
+ MPM.add(createLoopSinkPass());
+ // Get rid of LCSSA nodes.
+ MPM.add(createInstSimplifyLegacyPass());
+
+ // This hoists/decomposes div/rem ops. It should run after other sink/hoist
+ // passes to avoid re-sinking, but before SimplifyCFG because it can allow
+ // flattening of blocks.
+ MPM.add(createDivRemPairsPass());
+
+ // LoopSink (and other loop passes since the last simplifyCFG) might have
+ // resulted in single-entry-single-exit or empty blocks. Clean up the CFG.
+ MPM.add(createCFGSimplificationPass());
+
+ addExtensionsToPM(EP_OptimizerLast, MPM);
+
+ if (PrepareForLTO) {
+ MPM.add(createCanonicalizeAliasesPass());
+ // Rename anon globals to be able to handle them in the summary
+ MPM.add(createNameAnonGlobalPass());
+ }
MPM.add(createAnnotationRemarksLegacyPass());
-}
-
-void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
- // Load sample profile before running the LTO optimization pipeline.
- if (!PGOSampleUse.empty()) {
- PM.add(createPruneEHPass());
- PM.add(createSampleProfileLoaderPass(PGOSampleUse));
- }
-
- // Remove unused virtual tables to improve the quality of code generated by
- // whole-program devirtualization and bitset lowering.
- PM.add(createGlobalDCEPass());
-
- // Provide AliasAnalysis services for optimizations.
- addInitialAliasAnalysisPasses(PM);
-
- // Allow forcing function attributes as a debugging and tuning aid.
- PM.add(createForceFunctionAttrsLegacyPass());
-
- // Infer attributes about declarations if possible.
- PM.add(createInferFunctionAttrsLegacyPass());
-
- if (OptLevel > 1) {
- // Split call-site with more constrained arguments.
- PM.add(createCallSiteSplittingPass());
-
- // Indirect call promotion. This should promote all the targets that are
- // left by the earlier promotion pass that promotes intra-module targets.
- // This two-step promotion is to save the compile time. For LTO, it should
- // produce the same result as if we only do promotion here.
- PM.add(
- createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
-
- // Propagate constants at call sites into the functions they call. This
- // opens opportunities for globalopt (and inlining) by substituting function
- // pointers passed as arguments to direct uses of functions.
- PM.add(createIPSCCPPass());
-
- // Attach metadata to indirect call sites indicating the set of functions
- // they may target at run-time. This should follow IPSCCP.
- PM.add(createCalledValuePropagationPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- if (AttributorRun & AttributorRunOption::MODULE)
- PM.add(createAttributorLegacyPass());
- }
-
- // Infer attributes about definitions. The readnone attribute in particular is
- // required for virtual constant propagation.
- PM.add(createPostOrderFunctionAttrsLegacyPass());
- PM.add(createReversePostOrderFunctionAttrsPass());
-
- // Split globals using inrange annotations on GEP indices. This can help
- // improve the quality of generated code when virtual constant propagation or
- // control flow integrity are enabled.
- PM.add(createGlobalSplitPass());
-
- // Apply whole-program devirtualization and virtual constant propagation.
- PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
-
- // That's all we need at opt level 1.
- if (OptLevel == 1)
- return;
-
- // Now that we internalized some globals, see if we can hack on them!
- PM.add(createGlobalOptimizerPass());
- // Promote any localized global vars.
- PM.add(createPromoteMemoryToRegisterPass());
-
- // Linking modules together can lead to duplicated global constants, only
- // keep one copy of each constant.
- PM.add(createConstantMergePass());
-
- // Remove unused arguments from functions.
- PM.add(createDeadArgEliminationPass());
-
- // Reduce the code after globalopt and ipsccp. Both can open up significant
- // simplification opportunities, and both can propagate functions through
- // function pointers. When this happens, we often have to resolve varargs
- // calls, etc, so let instcombine do this.
- if (OptLevel > 2)
- PM.add(createAggressiveInstCombinerPass());
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
-
- // Inline small functions
- bool RunInliner = Inliner;
- if (RunInliner) {
- PM.add(Inliner);
- Inliner = nullptr;
- }
-
- PM.add(createPruneEHPass()); // Remove dead EH info.
-
- // CSFDO instrumentation and use pass.
- addPGOInstrPasses(PM, /* IsCS */ true);
-
- // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
- if (AttributorRun & AttributorRunOption::CGSCC)
- PM.add(createAttributorCGSCCLegacyPass());
-
- // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
- // there are no OpenMP runtime calls present in the module.
- if (OptLevel > 1)
- PM.add(createOpenMPOptLegacyPass());
-
- // Optimize globals again if we ran the inliner.
- if (RunInliner)
- PM.add(createGlobalOptimizerPass());
- PM.add(createGlobalDCEPass()); // Remove dead functions.
-
- // If we didn't decide to inline a function, check to see if we can
- // transform it to pass arguments by value instead of by reference.
- PM.add(createArgumentPromotionPass());
-
- // The IPO passes may leave cruft around. Clean up after them.
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
+}
+
+void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
+ // Load sample profile before running the LTO optimization pipeline.
+ if (!PGOSampleUse.empty()) {
+ PM.add(createPruneEHPass());
+ PM.add(createSampleProfileLoaderPass(PGOSampleUse));
+ }
+
+ // Remove unused virtual tables to improve the quality of code generated by
+ // whole-program devirtualization and bitset lowering.
+ PM.add(createGlobalDCEPass());
+
+ // Provide AliasAnalysis services for optimizations.
+ addInitialAliasAnalysisPasses(PM);
+
+ // Allow forcing function attributes as a debugging and tuning aid.
+ PM.add(createForceFunctionAttrsLegacyPass());
+
+ // Infer attributes about declarations if possible.
+ PM.add(createInferFunctionAttrsLegacyPass());
+
+ if (OptLevel > 1) {
+ // Split call-site with more constrained arguments.
+ PM.add(createCallSiteSplittingPass());
+
+ // Indirect call promotion. This should promote all the targets that are
+ // left by the earlier promotion pass that promotes intra-module targets.
+    // This two-step promotion is done to save compile time. For LTO, it should
+ // produce the same result as if we only do promotion here.
+ PM.add(
+ createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
+
+ // Propagate constants at call sites into the functions they call. This
+ // opens opportunities for globalopt (and inlining) by substituting function
+ // pointers passed as arguments to direct uses of functions.
+ PM.add(createIPSCCPPass());
+
+ // Attach metadata to indirect call sites indicating the set of functions
+ // they may target at run-time. This should follow IPSCCP.
+ PM.add(createCalledValuePropagationPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ if (AttributorRun & AttributorRunOption::MODULE)
+ PM.add(createAttributorLegacyPass());
+ }
+
+ // Infer attributes about definitions. The readnone attribute in particular is
+ // required for virtual constant propagation.
+ PM.add(createPostOrderFunctionAttrsLegacyPass());
+ PM.add(createReversePostOrderFunctionAttrsPass());
+
+ // Split globals using inrange annotations on GEP indices. This can help
+ // improve the quality of generated code when virtual constant propagation or
+ // control flow integrity are enabled.
+ PM.add(createGlobalSplitPass());
+
+ // Apply whole-program devirtualization and virtual constant propagation.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+
+ // That's all we need at opt level 1.
+ if (OptLevel == 1)
+ return;
+
+ // Now that we internalized some globals, see if we can hack on them!
+ PM.add(createGlobalOptimizerPass());
+ // Promote any localized global vars.
+ PM.add(createPromoteMemoryToRegisterPass());
+
+ // Linking modules together can lead to duplicated global constants, only
+ // keep one copy of each constant.
+ PM.add(createConstantMergePass());
+
+ // Remove unused arguments from functions.
+ PM.add(createDeadArgEliminationPass());
+
+ // Reduce the code after globalopt and ipsccp. Both can open up significant
+ // simplification opportunities, and both can propagate functions through
+ // function pointers. When this happens, we often have to resolve varargs
+ // calls, etc, so let instcombine do this.
+ if (OptLevel > 2)
+ PM.add(createAggressiveInstCombinerPass());
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
+
+ // Inline small functions
+ bool RunInliner = Inliner;
+ if (RunInliner) {
+ PM.add(Inliner);
+ Inliner = nullptr;
+ }
+
+ PM.add(createPruneEHPass()); // Remove dead EH info.
+
+ // CSFDO instrumentation and use pass.
+ addPGOInstrPasses(PM, /* IsCS */ true);
+
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ PM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ PM.add(createOpenMPOptLegacyPass());
+
+ // Optimize globals again if we ran the inliner.
+ if (RunInliner)
+ PM.add(createGlobalOptimizerPass());
+ PM.add(createGlobalDCEPass()); // Remove dead functions.
+
+ // If we didn't decide to inline a function, check to see if we can
+ // transform it to pass arguments by value instead of by reference.
+ PM.add(createArgumentPromotionPass());
+
+ // The IPO passes may leave cruft around. Clean up after them.
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));
-
- // Break up allocas
- PM.add(createSROAPass());
-
- // LTO provides additional opportunities for tailcall elimination due to
- // link-time inlining, and visibility of nocapture attribute.
- if (OptLevel > 1)
- PM.add(createTailCallEliminationPass());
-
- // Infer attributes on declarations, call sites, arguments, etc.
- PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
- // Run a few AA driven optimizations here and now, to cleanup the code.
- PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
-
- PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
- PM.add(NewGVN ? createNewGVNPass()
- : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
- PM.add(createMemCpyOptPass()); // Remove dead memcpys.
-
- // Nuke dead stores.
- PM.add(createDeadStoreEliminationPass());
+
+ // Break up allocas
+ PM.add(createSROAPass());
+
+ // LTO provides additional opportunities for tailcall elimination due to
+ // link-time inlining, and visibility of nocapture attribute.
+ if (OptLevel > 1)
+ PM.add(createTailCallEliminationPass());
+
+ // Infer attributes on declarations, call sites, arguments, etc.
+ PM.add(createPostOrderFunctionAttrsLegacyPass()); // Add nocapture.
+ // Run a few AA driven optimizations here and now, to cleanup the code.
+ PM.add(createGlobalsAAWrapperPass()); // IP alias analysis.
+
+ PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
+ PM.add(NewGVN ? createNewGVNPass()
+ : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies.
+ PM.add(createMemCpyOptPass()); // Remove dead memcpys.
+
+ // Nuke dead stores.
+ PM.add(createDeadStoreEliminationPass());
PM.add(createMergedLoadStoreMotionPass()); // Merge ld/st in diamonds.
-
- // More loops are countable; try to optimize them.
+
+ // More loops are countable; try to optimize them.
if (EnableLoopFlatten)
PM.add(createLoopFlattenPass());
- PM.add(createIndVarSimplifyPass());
- PM.add(createLoopDeletionPass());
- if (EnableLoopInterchange)
- PM.add(createLoopInterchangePass());
-
+ PM.add(createIndVarSimplifyPass());
+ PM.add(createLoopDeletionPass());
+ if (EnableLoopInterchange)
+ PM.add(createLoopInterchangePass());
+
if (EnableConstraintElimination)
PM.add(createConstraintEliminationPass());
// Unroll small loops and perform peeling.
- PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
+ PM.add(createSimpleLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
PM.add(createLoopDistributePass());
- PM.add(createLoopVectorizePass(true, !LoopVectorize));
- // The vectorizer may have significantly shortened a loop body; unroll again.
- PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
- ForgetAllSCEVInLoopUnroll));
-
- PM.add(createWarnMissedTransformationsPass());
-
- // Now that we've optimized loops (in particular loop induction variables),
- // we may have exposed more scalar opportunities. Run parts of the scalar
- // optimizer again at this point.
- PM.add(createInstructionCombiningPass()); // Initial cleanup
+ PM.add(createLoopVectorizePass(true, !LoopVectorize));
+ // The vectorizer may have significantly shortened a loop body; unroll again.
+ PM.add(createLoopUnrollPass(OptLevel, DisableUnrollLoops,
+ ForgetAllSCEVInLoopUnroll));
+
+ PM.add(createWarnMissedTransformationsPass());
+
+ // Now that we've optimized loops (in particular loop induction variables),
+ // we may have exposed more scalar opportunities. Run parts of the scalar
+ // optimizer again at this point.
+ PM.add(createInstructionCombiningPass()); // Initial cleanup
PM.add(createCFGSimplificationPass(SimplifyCFGOptions() // if-convert
.hoistCommonInsts(true)));
- PM.add(createSCCPPass()); // Propagate exposed constants
- PM.add(createInstructionCombiningPass()); // Clean up again
- PM.add(createBitTrackingDCEPass());
-
- // More scalar chains could be vectorized due to more alias information
- if (SLPVectorize)
- PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
-
- PM.add(createVectorCombinePass()); // Clean up partial vectorization.
-
- // After vectorization, assume intrinsics may tell us more about pointer
- // alignments.
- PM.add(createAlignmentFromAssumptionsPass());
-
- // Cleanup and simplify the code after the scalar optimizations.
- PM.add(createInstructionCombiningPass());
- addExtensionsToPM(EP_Peephole, PM);
-
+ PM.add(createSCCPPass()); // Propagate exposed constants
+ PM.add(createInstructionCombiningPass()); // Clean up again
+ PM.add(createBitTrackingDCEPass());
+
+ // More scalar chains could be vectorized due to more alias information
+ if (SLPVectorize)
+ PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+
+ PM.add(createVectorCombinePass()); // Clean up partial vectorization.
+
+ // After vectorization, assume intrinsics may tell us more about pointer
+ // alignments.
+ PM.add(createAlignmentFromAssumptionsPass());
+
+ // Cleanup and simplify the code after the scalar optimizations.
+ PM.add(createInstructionCombiningPass());
+ addExtensionsToPM(EP_Peephole, PM);
+
PM.add(createJumpThreadingPass(/*FreezeSelectCond*/ true));
-}
-
-void PassManagerBuilder::addLateLTOOptimizationPasses(
- legacy::PassManagerBase &PM) {
- // See comment in the new PM for justification of scheduling splitting at
- // this stage (\ref buildLTODefaultPipeline).
- if (EnableHotColdSplit)
- PM.add(createHotColdSplittingPass());
-
- // Delete basic blocks, which optimization passes may have killed.
+}
+
+void PassManagerBuilder::addLateLTOOptimizationPasses(
+ legacy::PassManagerBase &PM) {
+ // See comment in the new PM for justification of scheduling splitting at
+ // this stage (\ref buildLTODefaultPipeline).
+ if (EnableHotColdSplit)
+ PM.add(createHotColdSplittingPass());
+
+ // Delete basic blocks, which optimization passes may have killed.
PM.add(
createCFGSimplificationPass(SimplifyCFGOptions().hoistCommonInsts(true)));
-
- // Drop bodies of available externally objects to improve GlobalDCE.
- PM.add(createEliminateAvailableExternallyPass());
-
- // Now that we have optimized the program, discard unreachable functions.
- PM.add(createGlobalDCEPass());
-
- // FIXME: this is profitable (for compiler time) to do at -O0 too, but
- // currently it damages debug info.
- if (MergeFunctions)
- PM.add(createMergeFunctionsPass());
-}
-
-void PassManagerBuilder::populateThinLTOPassManager(
- legacy::PassManagerBase &PM) {
- PerformThinLTO = true;
- if (LibraryInfo)
- PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- if (VerifyInput)
- PM.add(createVerifierPass());
-
- if (ImportSummary) {
- // This pass imports type identifier resolutions for whole-program
- // devirtualization and CFI. It must run early because other passes may
- // disturb the specific instruction patterns that these passes look for,
- // creating dependencies on resolutions that may not appear in the summary.
- //
- // For example, GVN may transform the pattern assume(type.test) appearing in
- // two basic blocks into assume(phi(type.test, type.test)), which would
- // transform a dependency on a WPD resolution into a dependency on a type
- // identifier resolution for CFI.
- //
- // Also, WPD has access to more precise information than ICP and can
- // devirtualize more effectively, so it should operate on the IR first.
- PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
- PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
- }
-
- populateModulePassManager(PM);
-
- if (VerifyOutput)
- PM.add(createVerifierPass());
- PerformThinLTO = false;
-}
-
-void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
- if (LibraryInfo)
- PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
-
- if (VerifyInput)
- PM.add(createVerifierPass());
-
- addExtensionsToPM(EP_FullLinkTimeOptimizationEarly, PM);
-
- if (OptLevel != 0)
- addLTOOptimizationPasses(PM);
- else {
- // The whole-program-devirt pass needs to run at -O0 because only it knows
- // about the llvm.type.checked.load intrinsic: it needs to both lower the
- // intrinsic itself and handle it in the summary.
- PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
- }
-
- // Create a function that performs CFI checks for cross-DSO calls with targets
- // in the current module.
- PM.add(createCrossDSOCFIPass());
-
- // Lower type metadata and the type.test intrinsic. This pass supports Clang's
- // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
- // link time if CFI is enabled. The pass does nothing if CFI is disabled.
- PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
- // Run a second time to clean up any type tests left behind by WPD for use
- // in ICP (which is performed earlier than this in the regular LTO pipeline).
- PM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
-
- if (OptLevel != 0)
- addLateLTOOptimizationPasses(PM);
-
- addExtensionsToPM(EP_FullLinkTimeOptimizationLast, PM);
-
+
+ // Drop bodies of available externally objects to improve GlobalDCE.
+ PM.add(createEliminateAvailableExternallyPass());
+
+ // Now that we have optimized the program, discard unreachable functions.
+ PM.add(createGlobalDCEPass());
+
+ // FIXME: this is profitable (for compiler time) to do at -O0 too, but
+ // currently it damages debug info.
+ if (MergeFunctions)
+ PM.add(createMergeFunctionsPass());
+}
+
+void PassManagerBuilder::populateThinLTOPassManager(
+ legacy::PassManagerBase &PM) {
+ PerformThinLTO = true;
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ if (ImportSummary) {
+ // This pass imports type identifier resolutions for whole-program
+ // devirtualization and CFI. It must run early because other passes may
+ // disturb the specific instruction patterns that these passes look for,
+ // creating dependencies on resolutions that may not appear in the summary.
+ //
+ // For example, GVN may transform the pattern assume(type.test) appearing in
+ // two basic blocks into assume(phi(type.test, type.test)), which would
+ // transform a dependency on a WPD resolution into a dependency on a type
+ // identifier resolution for CFI.
+ //
+ // Also, WPD has access to more precise information than ICP and can
+ // devirtualize more effectively, so it should operate on the IR first.
+ PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+ PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+ }
+
+ populateModulePassManager(PM);
+
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+ PerformThinLTO = false;
+}
+
+void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
+ if (LibraryInfo)
+ PM.add(new TargetLibraryInfoWrapperPass(*LibraryInfo));
+
+ if (VerifyInput)
+ PM.add(createVerifierPass());
+
+ addExtensionsToPM(EP_FullLinkTimeOptimizationEarly, PM);
+
+ if (OptLevel != 0)
+ addLTOOptimizationPasses(PM);
+ else {
+ // The whole-program-devirt pass needs to run at -O0 because only it knows
+ // about the llvm.type.checked.load intrinsic: it needs to both lower the
+ // intrinsic itself and handle it in the summary.
+ PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
+ }
+
+ // Create a function that performs CFI checks for cross-DSO calls with targets
+ // in the current module.
+ PM.add(createCrossDSOCFIPass());
+
+ // Lower type metadata and the type.test intrinsic. This pass supports Clang's
+ // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
+ // link time if CFI is enabled. The pass does nothing if CFI is disabled.
+ PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO pipeline).
+ PM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+
+ if (OptLevel != 0)
+ addLateLTOOptimizationPasses(PM);
+
+ addExtensionsToPM(EP_FullLinkTimeOptimizationLast, PM);
+
PM.add(createAnnotationRemarksLegacyPass());
- if (VerifyOutput)
- PM.add(createVerifierPass());
-}
-
-LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
- PassManagerBuilder *PMB = new PassManagerBuilder();
- return wrap(PMB);
-}
-
-void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
- PassManagerBuilder *Builder = unwrap(PMB);
- delete Builder;
-}
-
-void
-LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
- unsigned OptLevel) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->OptLevel = OptLevel;
-}
-
-void
-LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
- unsigned SizeLevel) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->SizeLevel = SizeLevel;
-}
-
-void
-LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- // NOTE: The DisableUnitAtATime switch has been removed.
-}
-
-void
-LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->DisableUnrollLoops = Value;
-}
-
-void
-LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
- LLVMBool Value) {
- // NOTE: The simplify-libcalls pass has been removed.
-}
-
-void
-LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
- unsigned Threshold) {
- PassManagerBuilder *Builder = unwrap(PMB);
- Builder->Inliner = createFunctionInliningPass(Threshold);
-}
-
-void
-LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
- Builder->populateFunctionPassManager(*FPM);
-}
-
-void
-LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::PassManagerBase *MPM = unwrap(PM);
- Builder->populateModulePassManager(*MPM);
-}
-
-void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
- LLVMPassManagerRef PM,
- LLVMBool Internalize,
- LLVMBool RunInliner) {
- PassManagerBuilder *Builder = unwrap(PMB);
- legacy::PassManagerBase *LPM = unwrap(PM);
-
- // A small backwards compatibility hack. populateLTOPassManager used to take
-  // a RunInliner option.
- if (RunInliner && !Builder->Inliner)
- Builder->Inliner = createFunctionInliningPass();
-
- Builder->populateLTOPassManager(*LPM);
-}
+ if (VerifyOutput)
+ PM.add(createVerifierPass());
+}
+
+LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
+ PassManagerBuilder *PMB = new PassManagerBuilder();
+ return wrap(PMB);
+}
+
+void LLVMPassManagerBuilderDispose(LLVMPassManagerBuilderRef PMB) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ delete Builder;
+}
+
+void
+LLVMPassManagerBuilderSetOptLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned OptLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->OptLevel = OptLevel;
+}
+
+void
+LLVMPassManagerBuilderSetSizeLevel(LLVMPassManagerBuilderRef PMB,
+ unsigned SizeLevel) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->SizeLevel = SizeLevel;
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnitAtATime(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ // NOTE: The DisableUnitAtATime switch has been removed.
+}
+
+void
+LLVMPassManagerBuilderSetDisableUnrollLoops(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->DisableUnrollLoops = Value;
+}
+
+void
+LLVMPassManagerBuilderSetDisableSimplifyLibCalls(LLVMPassManagerBuilderRef PMB,
+ LLVMBool Value) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void
+LLVMPassManagerBuilderUseInlinerWithThreshold(LLVMPassManagerBuilderRef PMB,
+ unsigned Threshold) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ Builder->Inliner = createFunctionInliningPass(Threshold);
+}
+
+void
+LLVMPassManagerBuilderPopulateFunctionPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::FunctionPassManager *FPM = unwrap<legacy::FunctionPassManager>(PM);
+ Builder->populateFunctionPassManager(*FPM);
+}
+
+void
+LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *MPM = unwrap(PM);
+ Builder->populateModulePassManager(*MPM);
+}
+
+void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB,
+ LLVMPassManagerRef PM,
+ LLVMBool Internalize,
+ LLVMBool RunInliner) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ legacy::PassManagerBase *LPM = unwrap(PM);
+
+ // A small backwards compatibility hack. populateLTOPassManager used to take
+  // a RunInliner option.
+ if (RunInliner && !Builder->Inliner)
+ Builder->Inliner = createFunctionInliningPass();
+
+ Builder->populateLTOPassManager(*LPM);
+}
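
For reference, the C API wrappers above are typically driven like the following minimal sketch. It is not part of this diff; the module handle M and the threshold 225 (the usual -O2 inlining threshold) are assumptions for illustration only.

// Hedged usage sketch for the LLVM-C PassManagerBuilder wrappers above.
// Assumes an LLVMModuleRef M obtained elsewhere (e.g. via LLVMParseIRInContext).
#include "llvm-c/Core.h"
#include "llvm-c/Transforms/PassManagerBuilder.h"

static void optimizeModule(LLVMModuleRef M) {
  LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
  LLVMPassManagerBuilderSetOptLevel(PMB, 2);               // roughly -O2
  LLVMPassManagerBuilderUseInlinerWithThreshold(PMB, 225); // enable the inliner
  LLVMPassManagerRef MPM = LLVMCreatePassManager();
  LLVMPassManagerBuilderPopulateModulePassManager(PMB, MPM);
  LLVMRunPassManager(MPM, M);                              // run the populated pipeline
  LLVMDisposePassManager(MPM);
  LLVMPassManagerBuilderDispose(PMB);
}
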
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
index 0e50d45979..3f3b18771c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/PruneEH.cpp
@@ -1,264 +1,264 @@
-//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a simple interprocedural pass which walks the
-// call-graph, turning invoke instructions into calls, iff the callee cannot
-// throw an exception, and marking functions 'nounwind' if they cannot throw.
-// It implements this as a bottom-up traversal of the call-graph.
-//
-//===----------------------------------------------------------------------===//
-
+//===- PruneEH.cpp - Pass which deletes unused exception handlers ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a simple interprocedural pass which walks the
+// call-graph, turning invoke instructions into calls, iff the callee cannot
+// throw an exception, and marking functions 'nounwind' if they cannot throw.
+// It implements this as a bottom-up traversal of the call-graph.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "prune-eh"
-
-STATISTIC(NumRemoved, "Number of invokes removed");
-STATISTIC(NumUnreach, "Number of noreturn calls optimized");
-
-namespace {
- struct PruneEH : public CallGraphSCCPass {
- static char ID; // Pass identification, replacement for typeid
- PruneEH() : CallGraphSCCPass(ID) {
- initializePruneEHPass(*PassRegistry::getPassRegistry());
- }
-
- // runOnSCC - Analyze the SCC, performing the transformation if possible.
- bool runOnSCC(CallGraphSCC &SCC) override;
- };
-}
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "prune-eh"
+
+STATISTIC(NumRemoved, "Number of invokes removed");
+STATISTIC(NumUnreach, "Number of noreturn calls optimized");
+
+namespace {
+ struct PruneEH : public CallGraphSCCPass {
+ static char ID; // Pass identification, replacement for typeid
+ PruneEH() : CallGraphSCCPass(ID) {
+ initializePruneEHPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnSCC - Analyze the SCC, performing the transformation if possible.
+ bool runOnSCC(CallGraphSCC &SCC) override;
+ };
+}
static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU);
static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU);
-
-char PruneEH::ID = 0;
-INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
- "Remove unused exception handling info", false, false)
-INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
-INITIALIZE_PASS_END(PruneEH, "prune-eh",
- "Remove unused exception handling info", false, false)
-
-Pass *llvm::createPruneEHPass() { return new PruneEH(); }
-
+
+char PruneEH::ID = 0;
+INITIALIZE_PASS_BEGIN(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(PruneEH, "prune-eh",
+ "Remove unused exception handling info", false, false)
+
+Pass *llvm::createPruneEHPass() { return new PruneEH(); }
+
static bool runImpl(CallGraphUpdater &CGU, SetVector<Function *> &Functions) {
#ifndef NDEBUG
for (auto *F : Functions)
assert(F && "null Function");
#endif
- bool MadeChange = false;
-
- // First pass, scan all of the functions in the SCC, simplifying them
- // according to what we know.
+ bool MadeChange = false;
+
+ // First pass, scan all of the functions in the SCC, simplifying them
+ // according to what we know.
for (Function *F : Functions)
MadeChange |= SimplifyFunction(F, CGU);
-
- // Next, check to see if any callees might throw or if there are any external
- // functions in this SCC: if so, we cannot prune any functions in this SCC.
- // Definitions that are weak and not declared non-throwing might be
- // overridden at linktime with something that throws, so assume that.
- // If this SCC includes the unwind instruction, we KNOW it throws, so
- // obviously the SCC might throw.
- //
- bool SCCMightUnwind = false, SCCMightReturn = false;
+
+ // Next, check to see if any callees might throw or if there are any external
+ // functions in this SCC: if so, we cannot prune any functions in this SCC.
+ // Definitions that are weak and not declared non-throwing might be
+ // overridden at linktime with something that throws, so assume that.
+ // If this SCC includes the unwind instruction, we KNOW it throws, so
+ // obviously the SCC might throw.
+ //
+ bool SCCMightUnwind = false, SCCMightReturn = false;
for (Function *F : Functions) {
if (!F->hasExactDefinition()) {
- SCCMightUnwind |= !F->doesNotThrow();
- SCCMightReturn |= !F->doesNotReturn();
- } else {
- bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
- bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
- // Determine if we should scan for InlineAsm in a naked function as it
- // is the only way to return without a ReturnInst. Only do this for
- // no-inline functions as functions which may be inlined cannot
- // meaningfully return via assembly.
- bool CheckReturnViaAsm = CheckReturn &&
- F->hasFnAttribute(Attribute::Naked) &&
- F->hasFnAttribute(Attribute::NoInline);
-
- if (!CheckUnwind && !CheckReturn)
- continue;
-
- for (const BasicBlock &BB : *F) {
- const Instruction *TI = BB.getTerminator();
- if (CheckUnwind && TI->mayThrow()) {
- SCCMightUnwind = true;
- } else if (CheckReturn && isa<ReturnInst>(TI)) {
- SCCMightReturn = true;
- }
-
- for (const Instruction &I : BB) {
- if ((!CheckUnwind || SCCMightUnwind) &&
- (!CheckReturnViaAsm || SCCMightReturn))
- break;
-
- // Check to see if this function performs an unwind or calls an
- // unwinding function.
- if (CheckUnwind && !SCCMightUnwind && I.mayThrow()) {
- bool InstMightUnwind = true;
- if (const auto *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- // If the callee is outside our current SCC then we may throw
- // because it might. If it is inside, do nothing.
+ SCCMightUnwind |= !F->doesNotThrow();
+ SCCMightReturn |= !F->doesNotReturn();
+ } else {
+ bool CheckUnwind = !SCCMightUnwind && !F->doesNotThrow();
+ bool CheckReturn = !SCCMightReturn && !F->doesNotReturn();
+ // Determine if we should scan for InlineAsm in a naked function as it
+ // is the only way to return without a ReturnInst. Only do this for
+ // no-inline functions as functions which may be inlined cannot
+ // meaningfully return via assembly.
+ bool CheckReturnViaAsm = CheckReturn &&
+ F->hasFnAttribute(Attribute::Naked) &&
+ F->hasFnAttribute(Attribute::NoInline);
+
+ if (!CheckUnwind && !CheckReturn)
+ continue;
+
+ for (const BasicBlock &BB : *F) {
+ const Instruction *TI = BB.getTerminator();
+ if (CheckUnwind && TI->mayThrow()) {
+ SCCMightUnwind = true;
+ } else if (CheckReturn && isa<ReturnInst>(TI)) {
+ SCCMightReturn = true;
+ }
+
+ for (const Instruction &I : BB) {
+ if ((!CheckUnwind || SCCMightUnwind) &&
+ (!CheckReturnViaAsm || SCCMightReturn))
+ break;
+
+ // Check to see if this function performs an unwind or calls an
+ // unwinding function.
+ if (CheckUnwind && !SCCMightUnwind && I.mayThrow()) {
+ bool InstMightUnwind = true;
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ // If the callee is outside our current SCC then we may throw
+ // because it might. If it is inside, do nothing.
if (Functions.contains(Callee))
- InstMightUnwind = false;
- }
- }
- SCCMightUnwind |= InstMightUnwind;
- }
- if (CheckReturnViaAsm && !SCCMightReturn)
- if (const auto *CB = dyn_cast<CallBase>(&I))
- if (const auto *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()))
- if (IA->hasSideEffects())
- SCCMightReturn = true;
- }
+ InstMightUnwind = false;
+ }
+ }
+ SCCMightUnwind |= InstMightUnwind;
+ }
+ if (CheckReturnViaAsm && !SCCMightReturn)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const auto *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()))
+ if (IA->hasSideEffects())
+ SCCMightReturn = true;
+ }
}
- if (SCCMightUnwind && SCCMightReturn)
- break;
- }
- }
-
- // If the SCC doesn't unwind or doesn't throw, note this fact.
- if (!SCCMightUnwind || !SCCMightReturn)
+ if (SCCMightUnwind && SCCMightReturn)
+ break;
+ }
+ }
+
+ // If the SCC doesn't unwind or doesn't throw, note this fact.
+ if (!SCCMightUnwind || !SCCMightReturn)
for (Function *F : Functions) {
- if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
- F->addFnAttr(Attribute::NoUnwind);
- MadeChange = true;
- }
-
- if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) {
- F->addFnAttr(Attribute::NoReturn);
- MadeChange = true;
- }
- }
-
+ if (!SCCMightUnwind && !F->hasFnAttribute(Attribute::NoUnwind)) {
+ F->addFnAttr(Attribute::NoUnwind);
+ MadeChange = true;
+ }
+
+ if (!SCCMightReturn && !F->hasFnAttribute(Attribute::NoReturn)) {
+ F->addFnAttr(Attribute::NoReturn);
+ MadeChange = true;
+ }
+ }
+
for (Function *F : Functions) {
- // Convert any invoke instructions to non-throwing functions in this node
- // into call instructions with a branch. This makes the exception blocks
- // dead.
+ // Convert any invoke instructions to non-throwing functions in this node
+ // into call instructions with a branch. This makes the exception blocks
+ // dead.
MadeChange |= SimplifyFunction(F, CGU);
- }
-
- return MadeChange;
-}
-
-bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
- if (skipSCC(SCC))
- return false;
+ }
+
+ return MadeChange;
+}
+
+bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
+ if (skipSCC(SCC))
+ return false;
SetVector<Function *> Functions;
for (auto &N : SCC) {
if (auto *F = N->getFunction())
Functions.insert(F);
}
- CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
CallGraphUpdater CGU;
CGU.initialize(CG, SCC);
return runImpl(CGU, Functions);
-}
-
-
-// SimplifyFunction - Given information about callees, simplify the specified
-// function if we have invokes to non-unwinding functions or code after calls to
-// no-return functions.
+}
+
+
+// SimplifyFunction - Given information about callees, simplify the specified
+// function if we have invokes to non-unwinding functions or code after calls to
+// no-return functions.
static bool SimplifyFunction(Function *F, CallGraphUpdater &CGU) {
- bool MadeChange = false;
- for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
- if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) {
- BasicBlock *UnwindBlock = II->getUnwindDest();
- removeUnwindEdge(&*BB);
-
- // If the unwind block is now dead, nuke it.
- if (pred_empty(UnwindBlock))
+ bool MadeChange = false;
+ for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+ if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(F)) {
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ removeUnwindEdge(&*BB);
+
+ // If the unwind block is now dead, nuke it.
+ if (pred_empty(UnwindBlock))
DeleteBasicBlock(UnwindBlock, CGU); // Delete the new BB.
-
- ++NumRemoved;
- MadeChange = true;
- }
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
- if (CallInst *CI = dyn_cast<CallInst>(I++))
- if (CI->doesNotReturn() && !CI->isMustTailCall() &&
- !isa<UnreachableInst>(I)) {
- // This call calls a function that cannot return. Insert an
- // unreachable instruction after it and simplify the code. Do this
- // by splitting the BB, adding the unreachable, then deleting the
- // new BB.
- BasicBlock *New = BB->splitBasicBlock(I);
-
- // Remove the uncond branch and add an unreachable.
- BB->getInstList().pop_back();
- new UnreachableInst(BB->getContext(), &*BB);
-
+
+ ++NumRemoved;
+ MadeChange = true;
+ }
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; )
+ if (CallInst *CI = dyn_cast<CallInst>(I++))
+ if (CI->doesNotReturn() && !CI->isMustTailCall() &&
+ !isa<UnreachableInst>(I)) {
+ // This call calls a function that cannot return. Insert an
+ // unreachable instruction after it and simplify the code. Do this
+ // by splitting the BB, adding the unreachable, then deleting the
+ // new BB.
+ BasicBlock *New = BB->splitBasicBlock(I);
+
+ // Remove the uncond branch and add an unreachable.
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB->getContext(), &*BB);
+
DeleteBasicBlock(New, CGU); // Delete the new BB.
- MadeChange = true;
- ++NumUnreach;
- break;
- }
- }
-
- return MadeChange;
-}
-
-/// DeleteBasicBlock - remove the specified basic block from the program,
-/// updating the callgraph to reflect any now-obsolete edges due to calls that
-/// exist in the BB.
+ MadeChange = true;
+ ++NumUnreach;
+ break;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// DeleteBasicBlock - remove the specified basic block from the program,
+/// updating the callgraph to reflect any now-obsolete edges due to calls that
+/// exist in the BB.
static void DeleteBasicBlock(BasicBlock *BB, CallGraphUpdater &CGU) {
- assert(pred_empty(BB) && "BB is not dead!");
-
- Instruction *TokenInst = nullptr;
-
- for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
- --I;
-
- if (I->getType()->isTokenTy()) {
- TokenInst = &*I;
- break;
- }
-
- if (auto *Call = dyn_cast<CallBase>(&*I)) {
- const Function *Callee = Call->getCalledFunction();
- if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
+ assert(pred_empty(BB) && "BB is not dead!");
+
+ Instruction *TokenInst = nullptr;
+
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; ) {
+ --I;
+
+ if (I->getType()->isTokenTy()) {
+ TokenInst = &*I;
+ break;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(&*I)) {
+ const Function *Callee = Call->getCalledFunction();
+ if (!Callee || !Intrinsic::isLeaf(Callee->getIntrinsicID()))
CGU.removeCallSite(*Call);
- else if (!Callee->isIntrinsic())
+ else if (!Callee->isIntrinsic())
CGU.removeCallSite(*Call);
- }
-
- if (!I->use_empty())
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- }
-
- if (TokenInst) {
- if (!TokenInst->isTerminator())
- changeToUnreachable(TokenInst->getNextNode(), /*UseLLVMTrap=*/false);
- } else {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
-
- for (unsigned i = 0, e = Succs.size(); i != e; ++i)
- Succs[i]->removePredecessor(BB);
-
- BB->eraseFromParent();
- }
-}
+ }
+
+ if (!I->use_empty())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+
+ if (TokenInst) {
+ if (!TokenInst->isTerminator())
+ changeToUnreachable(TokenInst->getNextNode(), /*UseLLVMTrap=*/false);
+ } else {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i)
+ Succs[i]->removePredecessor(BB);
+
+ BB->eraseFromParent();
+ }
+}
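
As a rough, stand-alone illustration of the prune-eh pass above (not part of this diff), the transformation can be run by itself through the legacy pass manager; how the Module M is produced (e.g. with parseIRFile) is left out as an assumption.

// Hedged sketch: run only prune-eh on a module. Invokes of provably nounwind
// callees get rewritten into plain calls, as implemented in PruneEH.cpp above.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"

using namespace llvm;

static void runPruneEHOnly(Module &M) {
  legacy::PassManager PM;
  PM.add(createPruneEHPass()); // CallGraphSCCPass defined above
  PM.run(M);
}
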
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
index c8be482716..fdffffba0c 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SCCP.cpp
@@ -1,93 +1,93 @@
-#include "llvm/Transforms/IPO/SCCP.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Scalar/SCCP.h"
-
-using namespace llvm;
-
-PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
- const DataLayout &DL = M.getDataLayout();
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn {
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- return {
- std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)),
- &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)};
- };
-
- if (!runIPSCCP(M, DL, GetTLI, getAnalysis))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- PA.preserve<FunctionAnalysisManagerModuleProxy>();
- return PA;
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// IPSCCP Class - This class implements interprocedural Sparse Conditional
-/// Constant Propagation.
-///
-class IPSCCPLegacyPass : public ModulePass {
-public:
- static char ID;
-
- IPSCCPLegacyPass() : ModulePass(ID) {
- initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
- const DataLayout &DL = M.getDataLayout();
- auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn {
- DominatorTree &DT =
- this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- return {
- std::make_unique<PredicateInfo>(
- F, DT,
- this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- F)),
- nullptr, // We cannot preserve the DT or PDT with the legacy pass
- nullptr}; // manager, so set them to nullptr.
- };
-
- return runIPSCCP(M, DL, GetTLI, getAnalysis);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char IPSCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
- "Interprocedural Sparse Conditional Constant Propagation",
- false, false)
-
-// createIPSCCPPass - This is the public interface to this file.
-ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
+#include "llvm/Transforms/IPO/SCCP.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Scalar/SCCP.h"
+
+using namespace llvm;
+
+PreservedAnalyses IPSCCPPass::run(Module &M, ModuleAnalysisManager &AM) {
+ const DataLayout &DL = M.getDataLayout();
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto getAnalysis = [&FAM](Function &F) -> AnalysisResultsForFn {
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ return {
+ std::make_unique<PredicateInfo>(F, DT, FAM.getResult<AssumptionAnalysis>(F)),
+ &DT, FAM.getCachedResult<PostDominatorTreeAnalysis>(F)};
+ };
+
+ if (!runIPSCCP(M, DL, GetTLI, getAnalysis))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ PA.preserve<FunctionAnalysisManagerModuleProxy>();
+ return PA;
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// IPSCCP Class - This class implements interprocedural Sparse Conditional
+/// Constant Propagation.
+///
+class IPSCCPLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ IPSCCPLegacyPass() : ModulePass(ID) {
+ initializeIPSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+ const DataLayout &DL = M.getDataLayout();
+ auto GetTLI = [this](Function &F) -> const TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto getAnalysis = [this](Function &F) -> AnalysisResultsForFn {
+ DominatorTree &DT =
+ this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ return {
+ std::make_unique<PredicateInfo>(
+ F, DT,
+ this->getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ F)),
+ nullptr, // We cannot preserve the DT or PDT with the legacy pass
+ nullptr}; // manager, so set them to nullptr.
+ };
+
+ return runIPSCCP(M, DL, GetTLI, getAnalysis);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char IPSCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(IPSCCPLegacyPass, "ipsccp",
+ "Interprocedural Sparse Conditional Constant Propagation",
+ false, false)
+
+// createIPSCCPPass - This is the public interface to this file.
+ModulePass *llvm::createIPSCCPPass() { return new IPSCCPLegacyPass(); }
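
For context (again not part of this diff), the new-pass-manager IPSCCPPass defined above can be scheduled with the usual PassBuilder boilerplate; where the Module M comes from is assumed.

// Hedged sketch: run IPSCCP via the new pass manager. The analysis-manager
// setup mirrors what PassBuilder normally does for a full pipeline.
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/IPO/SCCP.h"

using namespace llvm;

static void runIPSCCP(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(IPSCCPPass()); // the pass whose run() method appears above
  MPM.run(M, MAM);
}
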
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
index e2a097bfaa..a6a419bfe7 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SampleProfile.cpp
@@ -1,116 +1,116 @@
-//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SampleProfileLoader transformation. This pass
-// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
-// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
-// profile information in the given profile.
-//
-// This pass generates branch weight annotations on the IR:
-//
-// - prof: Represents branch weights. This annotation is added to branches
-// to indicate the weights of each edge coming out of the branch.
-// The weight of each edge is the weight of the target block for
-// that edge. The weight of a block B is computed as the maximum
-// number of samples found in B.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/SampleProfile.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
+//===- SampleProfile.cpp - Incorporate sample profiles into the IR --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SampleProfileLoader transformation. This pass
+// reads a profile file generated by a sampling profiler (e.g. Linux Perf -
+// http://perf.wiki.kernel.org/) and generates IR metadata to reflect the
+// profile information in the given profile.
+//
+// This pass generates branch weight annotations on the IR:
+//
+// - prof: Represents branch weights. This annotation is added to branches
+// to indicate the weights of each edge coming out of the branch.
+// The weight of each edge is the weight of the target block for
+// that edge. The weight of a block B is computed as the maximum
+// number of samples found in B.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SampleProfile.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/InlineAdvisor.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ReplayInlineAdvisor.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/SampleProf.h"
-#include "llvm/ProfileData/SampleProfReader.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/SampleProf.h"
+#include "llvm/ProfileData/SampleProfReader.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/SampleContextTracker.h"
#include "llvm/Transforms/IPO/SampleProfileProbe.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <functional>
-#include <limits>
-#include <map>
-#include <memory>
-#include <queue>
-#include <string>
-#include <system_error>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace sampleprof;
-using ProfileCount = Function::ProfileCount;
-#define DEBUG_TYPE "sample-profile"
-#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
-
-STATISTIC(NumCSInlined,
- "Number of functions inlined with context sensitive profile");
-STATISTIC(NumCSNotInlined,
- "Number of functions not inlined with context sensitive profile");
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <functional>
+#include <limits>
+#include <map>
+#include <memory>
+#include <queue>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace sampleprof;
+using ProfileCount = Function::ProfileCount;
+#define DEBUG_TYPE "sample-profile"
+#define CSINLINE_DEBUG DEBUG_TYPE "-inline"
+
+STATISTIC(NumCSInlined,
+ "Number of functions inlined with context sensitive profile");
+STATISTIC(NumCSNotInlined,
+ "Number of functions not inlined with context sensitive profile");
STATISTIC(NumMismatchedProfile,
"Number of functions with CFG mismatched profile");
STATISTIC(NumMatchedProfile, "Number of functions with CFG matched profile");
STATISTIC(NumDuplicatedInlinesite,
"Number of inlined callsites with a partial distribution factor");
-
+
STATISTIC(NumCSInlinedHitMinLimit,
"Number of functions with FDO inline stopped due to min size limit");
STATISTIC(NumCSInlinedHitMaxLimit,
@@ -119,64 +119,64 @@ STATISTIC(
NumCSInlinedHitGrowthLimit,
"Number of functions with FDO inline stopped due to growth size limit");
-// Command line option to specify the file to read samples from. This is
-// mainly used for debugging.
-static cl::opt<std::string> SampleProfileFile(
- "sample-profile-file", cl::init(""), cl::value_desc("filename"),
- cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
-
-// The named file contains a set of transformations that may have been applied
-// to the symbol names between the program from which the sample data was
-// collected and the current program's symbols.
-static cl::opt<std::string> SampleProfileRemappingFile(
- "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
- cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
-
-static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
- "sample-profile-max-propagate-iterations", cl::init(100),
- cl::desc("Maximum number of iterations to go through when propagating "
- "sample block/edge weights through the CFG."));
-
-static cl::opt<unsigned> SampleProfileRecordCoverage(
- "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
- cl::desc("Emit a warning if less than N% of records in the input profile "
- "are matched to the IR."));
-
-static cl::opt<unsigned> SampleProfileSampleCoverage(
- "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
- cl::desc("Emit a warning if less than N% of samples in the input profile "
- "are matched to the IR."));
-
-static cl::opt<bool> NoWarnSampleUnused(
- "no-warn-sample-unused", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn off/on warnings about function with "
- "samples but without debug information to use those samples. "));
-
-static cl::opt<bool> ProfileSampleAccurate(
- "profile-sample-accurate", cl::Hidden, cl::init(false),
- cl::desc("If the sample profile is accurate, we will mark all un-sampled "
- "callsite and function as having 0 samples. Otherwise, treat "
- "un-sampled callsites and functions conservatively as unknown. "));
-
-static cl::opt<bool> ProfileAccurateForSymsInList(
- "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
- cl::init(true),
- cl::desc("For symbols in profile symbol list, regard their profiles to "
- "be accurate. It may be overriden by profile-sample-accurate. "));
-
-static cl::opt<bool> ProfileMergeInlinee(
- "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
- cl::desc("Merge past inlinee's profile to outline version if sample "
- "profile loader decided not to inline a call site. It will "
- "only be enabled when top-down order of profile loading is "
- "enabled. "));
-
-static cl::opt<bool> ProfileTopDownLoad(
- "sample-profile-top-down-load", cl::Hidden, cl::init(true),
- cl::desc("Do profile annotation and inlining for functions in top-down "
- "order of call graph during sample profile loading. It only "
- "works for new pass manager. "));
-
+// Command line option to specify the file to read samples from. This is
+// mainly used for debugging.
+static cl::opt<std::string> SampleProfileFile(
+ "sample-profile-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile file loaded by -sample-profile"), cl::Hidden);
+
+// The named file contains a set of transformations that may have been applied
+// to the symbol names between the program from which the sample data was
+// collected and the current program's symbols.
+static cl::opt<std::string> SampleProfileRemappingFile(
+ "sample-profile-remapping-file", cl::init(""), cl::value_desc("filename"),
+ cl::desc("Profile remapping file loaded by -sample-profile"), cl::Hidden);
+
+static cl::opt<unsigned> SampleProfileMaxPropagateIterations(
+ "sample-profile-max-propagate-iterations", cl::init(100),
+ cl::desc("Maximum number of iterations to go through when propagating "
+ "sample block/edge weights through the CFG."));
+
+static cl::opt<unsigned> SampleProfileRecordCoverage(
+ "sample-profile-check-record-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of records in the input profile "
+ "are matched to the IR."));
+
+static cl::opt<unsigned> SampleProfileSampleCoverage(
+ "sample-profile-check-sample-coverage", cl::init(0), cl::value_desc("N"),
+ cl::desc("Emit a warning if less than N% of samples in the input profile "
+ "are matched to the IR."));
+
+static cl::opt<bool> NoWarnSampleUnused(
+ "no-warn-sample-unused", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on warnings about function with "
+ "samples but without debug information to use those samples. "));
+
+static cl::opt<bool> ProfileSampleAccurate(
+ "profile-sample-accurate", cl::Hidden, cl::init(false),
+ cl::desc("If the sample profile is accurate, we will mark all un-sampled "
+ "callsite and function as having 0 samples. Otherwise, treat "
+ "un-sampled callsites and functions conservatively as unknown. "));
+
+static cl::opt<bool> ProfileAccurateForSymsInList(
+ "profile-accurate-for-symsinlist", cl::Hidden, cl::ZeroOrMore,
+ cl::init(true),
+ cl::desc("For symbols in profile symbol list, regard their profiles to "
+ "be accurate. It may be overriden by profile-sample-accurate. "));
+
+static cl::opt<bool> ProfileMergeInlinee(
+ "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
+ cl::desc("Merge past inlinee's profile to outline version if sample "
+ "profile loader decided not to inline a call site. It will "
+ "only be enabled when top-down order of profile loading is "
+ "enabled. "));
+
+static cl::opt<bool> ProfileTopDownLoad(
+ "sample-profile-top-down-load", cl::Hidden, cl::init(true),
+ cl::desc("Do profile annotation and inlining for functions in top-down "
+ "order of call graph during sample profile loading. It only "
+ "works for new pass manager. "));
+
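For readers unfamiliar with LLVM's command-line machinery, the minimal sketch below shows how an option like the ones re-added above is declared and read. It is illustrative only and not part of the patch; the flag name and default value are invented.

#include "llvm/Support/CommandLine.h"

// Hypothetical flag, declared the same way as the options above; the name
// and default are made up purely for illustration.
static llvm::cl::opt<unsigned> SketchSampleThreshold(
    "sketch-sample-threshold", llvm::cl::init(100), llvm::cl::Hidden,
    llvm::cl::desc("Hypothetical threshold used only in this sketch."));

// Reading the flag is an implicit conversion to its value type:
//   if (Count >= SketchSampleThreshold) { ... }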
static cl::opt<bool> UseProfileIndirectCallEdges(
"use-profile-indirect-call-edges", cl::init(true), cl::Hidden,
cl::desc("Considering indirect call samples from profile when top-down "
@@ -187,11 +187,11 @@ static cl::opt<bool> UseProfileTopDownOrder(
cl::desc("Process functions in one SCC in a top-down order "
"based on the input profile."));
-static cl::opt<bool> ProfileSizeInline(
- "sample-profile-inline-size", cl::Hidden, cl::init(false),
- cl::desc("Inline cold call sites in profile loader if it's beneficial "
- "for code size."));
-
+static cl::opt<bool> ProfileSizeInline(
+ "sample-profile-inline-size", cl::Hidden, cl::init(false),
+ cl::desc("Inline cold call sites in profile loader if it's beneficial "
+ "for code size."));
+
static cl::opt<int> ProfileInlineGrowthLimit(
"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
cl::desc("The size growth ratio limit for proirity-based sample profile "
@@ -224,10 +224,10 @@ static cl::opt<bool> CallsitePrioritizedInline(
cl::desc("Use call site prioritized inlining for sample profile loader."
"Currently only CSSPGO is supported."));
-static cl::opt<int> SampleColdCallSiteThreshold(
- "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
- cl::desc("Threshold for inlining cold callsites"));
-
+static cl::opt<int> SampleColdCallSiteThreshold(
+ "sample-profile-cold-inline-threshold", cl::Hidden, cl::init(45),
+ cl::desc("Threshold for inlining cold callsites"));
+
static cl::opt<std::string> ProfileInlineReplayFile(
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
cl::desc(
@@ -235,137 +235,137 @@ static cl::opt<std::string> ProfileInlineReplayFile(
"by inlining from sample profile loader."),
cl::Hidden);
-namespace {
-
-using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
-using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
-using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
-using EdgeWeightMap = DenseMap<Edge, uint64_t>;
-using BlockEdgeMap =
- DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
-
-class SampleProfileLoader;
-
-class SampleCoverageTracker {
-public:
- SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
-
- bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
- uint32_t Discriminator, uint64_t Samples);
- unsigned computeCoverage(unsigned Used, unsigned Total) const;
- unsigned countUsedRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
- unsigned countBodyRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
- uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
- uint64_t countBodySamples(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const;
-
- void clear() {
- SampleCoverage.clear();
- TotalUsedSamples = 0;
- }
-
-private:
- using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
- using FunctionSamplesCoverageMap =
- DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
-
- /// Coverage map for sampling records.
- ///
- /// This map keeps a record of sampling records that have been matched to
- /// an IR instruction. This is used to detect some form of staleness in
- /// profiles (see flag -sample-profile-check-coverage).
- ///
- /// Each entry in the map corresponds to a FunctionSamples instance. This is
- /// another map that counts how many times the sample record at the
- /// given location has been used.
- FunctionSamplesCoverageMap SampleCoverage;
-
- /// Number of samples used from the profile.
- ///
- /// When a sampling record is used for the first time, the samples from
- /// that record are added to this accumulator. Coverage is later computed
- /// based on the total number of samples available in this function and
- /// its callsites.
- ///
- /// Note that this accumulator tracks samples used from a single function
- /// and all the inlined callsites. Strictly, we should have a map of counters
- /// keyed by FunctionSamples pointers, but these stats are cleared after
- /// every function, so we just need to keep a single counter.
- uint64_t TotalUsedSamples = 0;
-
- SampleProfileLoader &SPLoader;
-};
-
-class GUIDToFuncNameMapper {
-public:
- GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
- DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
- : CurrentReader(Reader), CurrentModule(M),
- CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
- if (!CurrentReader.useMD5())
- return;
-
- for (const auto &F : CurrentModule) {
- StringRef OrigName = F.getName();
- CurrentGUIDToFuncNameMap.insert(
- {Function::getGUID(OrigName), OrigName});
-
- // Local-to-global variable promotion used by optimizations like ThinLTO
- // renames the variable and adds a suffix like ".llvm.xxx" to the
- // original local name. In the sample profile, such suffixes of function
- // names are all stripped. Since the mapper may be built in the
- // post-thin-link phase, after variable promotion has been done, we
- // also need to add the function name without the suffix to the
- // GUIDToFuncNameMap.
- StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
- if (CanonName != OrigName)
- CurrentGUIDToFuncNameMap.insert(
- {Function::getGUID(CanonName), CanonName});
- }
-
- // Update GUIDToFuncNameMap for each function including inlinees.
- SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
- }
-
- ~GUIDToFuncNameMapper() {
- if (!CurrentReader.useMD5())
- return;
-
- CurrentGUIDToFuncNameMap.clear();
-
- // Reset the GUIDToFuncNameMap of each function as it is no
- // longer valid at this point.
- SetGUIDToFuncNameMapForAll(nullptr);
- }
-
-private:
- void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
- std::queue<FunctionSamples *> FSToUpdate;
- for (auto &IFS : CurrentReader.getProfiles()) {
- FSToUpdate.push(&IFS.second);
- }
-
- while (!FSToUpdate.empty()) {
- FunctionSamples *FS = FSToUpdate.front();
- FSToUpdate.pop();
- FS->GUIDToFuncNameMap = Map;
- for (const auto &ICS : FS->getCallsiteSamples()) {
- const FunctionSamplesMap &FSMap = ICS.second;
- for (auto &IFS : FSMap) {
- FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
- FSToUpdate.push(&FS);
- }
- }
- }
- }
-
- SampleProfileReader &CurrentReader;
- Module &CurrentModule;
- DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
-};
-
+namespace {
+
+using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
+using EquivalenceClassMap = DenseMap<const BasicBlock *, const BasicBlock *>;
+using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+using BlockEdgeMap =
+ DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
+
+class SampleProfileLoader;
+
+class SampleCoverageTracker {
+public:
+ SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){};
+
+ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset,
+ uint32_t Discriminator, uint64_t Samples);
+ unsigned computeCoverage(unsigned Used, unsigned Total) const;
+ unsigned countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ unsigned countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+ uint64_t getTotalUsedSamples() const { return TotalUsedSamples; }
+ uint64_t countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const;
+
+ void clear() {
+ SampleCoverage.clear();
+ TotalUsedSamples = 0;
+ }
+
+private:
+ using BodySampleCoverageMap = std::map<LineLocation, unsigned>;
+ using FunctionSamplesCoverageMap =
+ DenseMap<const FunctionSamples *, BodySampleCoverageMap>;
+
+ /// Coverage map for sampling records.
+ ///
+ /// This map keeps a record of sampling records that have been matched to
+ /// an IR instruction. This is used to detect some form of staleness in
+ /// profiles (see flag -sample-profile-check-coverage).
+ ///
+ /// Each entry in the map corresponds to a FunctionSamples instance. This is
+ /// another map that counts how many times the sample record at the
+ /// given location has been used.
+ FunctionSamplesCoverageMap SampleCoverage;
+
+ /// Number of samples used from the profile.
+ ///
+ /// When a sampling record is used for the first time, the samples from
+ /// that record are added to this accumulator. Coverage is later computed
+ /// based on the total number of samples available in this function and
+ /// its callsites.
+ ///
+ /// Note that this accumulator tracks samples used from a single function
+ /// and all the inlined callsites. Strictly, we should have a map of counters
+ /// keyed by FunctionSamples pointers, but these stats are cleared after
+ /// every function, so we just need to keep a single counter.
+ uint64_t TotalUsedSamples = 0;
+
+ SampleProfileLoader &SPLoader;
+};
+
+class GUIDToFuncNameMapper {
+public:
+ GUIDToFuncNameMapper(Module &M, SampleProfileReader &Reader,
+ DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
+ : CurrentReader(Reader), CurrentModule(M),
+ CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
+ if (!CurrentReader.useMD5())
+ return;
+
+ for (const auto &F : CurrentModule) {
+ StringRef OrigName = F.getName();
+ CurrentGUIDToFuncNameMap.insert(
+ {Function::getGUID(OrigName), OrigName});
+
+ // Local-to-global variable promotion used by optimizations like ThinLTO
+ // renames the variable and adds a suffix like ".llvm.xxx" to the
+ // original local name. In the sample profile, such suffixes of function
+ // names are all stripped. Since the mapper may be built in the
+ // post-thin-link phase, after variable promotion has been done, we
+ // also need to add the function name without the suffix to the
+ // GUIDToFuncNameMap.
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+ if (CanonName != OrigName)
+ CurrentGUIDToFuncNameMap.insert(
+ {Function::getGUID(CanonName), CanonName});
+ }
+
+ // Update GUIDToFuncNameMap for each function including inlinees.
+ SetGUIDToFuncNameMapForAll(&CurrentGUIDToFuncNameMap);
+ }
+
+ ~GUIDToFuncNameMapper() {
+ if (!CurrentReader.useMD5())
+ return;
+
+ CurrentGUIDToFuncNameMap.clear();
+
+ // Reset the GUIDToFuncNameMap of each function as it is no
+ // longer valid at this point.
+ SetGUIDToFuncNameMapForAll(nullptr);
+ }
+
+private:
+ void SetGUIDToFuncNameMapForAll(DenseMap<uint64_t, StringRef> *Map) {
+ std::queue<FunctionSamples *> FSToUpdate;
+ for (auto &IFS : CurrentReader.getProfiles()) {
+ FSToUpdate.push(&IFS.second);
+ }
+
+ while (!FSToUpdate.empty()) {
+ FunctionSamples *FS = FSToUpdate.front();
+ FSToUpdate.pop();
+ FS->GUIDToFuncNameMap = Map;
+ for (const auto &ICS : FS->getCallsiteSamples()) {
+ const FunctionSamplesMap &FSMap = ICS.second;
+ for (auto &IFS : FSMap) {
+ FunctionSamples &FS = const_cast<FunctionSamples &>(IFS.second);
+ FSToUpdate.push(&FS);
+ }
+ }
+ }
+ }
+
+ SampleProfileReader &CurrentReader;
+ Module &CurrentModule;
+ DenseMap<uint64_t, StringRef> &CurrentGUIDToFuncNameMap;
+};
+
// Inline candidate used by iterative callsite prioritized inliner
struct InlineCandidate {
CallBase *CallInstr;
@@ -398,50 +398,50 @@ using CandidateQueue =
PriorityQueue<InlineCandidate, std::vector<InlineCandidate>,
CandidateComparer>;
-/// Sample profile pass.
-///
-/// This pass reads profile data from the file specified by
-/// -sample-profile-file and annotates every affected function with the
-/// profile information found in that file.
-class SampleProfileLoader {
-public:
- SampleProfileLoader(
+/// Sample profile pass.
+///
+/// This pass reads profile data from the file specified by
+/// -sample-profile-file and annotates every affected function with the
+/// profile information found in that file.
+class SampleProfileLoader {
+public:
+ SampleProfileLoader(
StringRef Name, StringRef RemapName, ThinOrFullLTOPhase LTOPhase,
- std::function<AssumptionCache &(Function &)> GetAssumptionCache,
- std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI)
- : GetAC(std::move(GetAssumptionCache)),
- GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
- CoverageTracker(*this), Filename(std::string(Name)),
+ std::function<AssumptionCache &(Function &)> GetAssumptionCache,
+ std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI)
+ : GetAC(std::move(GetAssumptionCache)),
+ GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
+ CoverageTracker(*this), Filename(std::string(Name)),
RemappingFilename(std::string(RemapName)), LTOPhase(LTOPhase) {}
-
+
bool doInitialization(Module &M, FunctionAnalysisManager *FAM = nullptr);
- bool runOnModule(Module &M, ModuleAnalysisManager *AM,
- ProfileSummaryInfo *_PSI, CallGraph *CG);
-
- void dump() { Reader->dump(); }
-
-protected:
- friend class SampleCoverageTracker;
-
- bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
- unsigned getFunctionLoc(Function &F);
- bool emitAnnotations(Function &F);
- ErrorOr<uint64_t> getInstWeight(const Instruction &I);
+ bool runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI, CallGraph *CG);
+
+ void dump() { Reader->dump(); }
+
+protected:
+ friend class SampleCoverageTracker;
+
+ bool runOnFunction(Function &F, ModuleAnalysisManager *AM);
+ unsigned getFunctionLoc(Function &F);
+ bool emitAnnotations(Function &F);
+ ErrorOr<uint64_t> getInstWeight(const Instruction &I);
ErrorOr<uint64_t> getProbeWeight(const Instruction &I);
- ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
- const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
- std::vector<const FunctionSamples *>
- findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
- mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
- const FunctionSamples *findFunctionSamples(const Instruction &I) const;
+ ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
+ const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
+ std::vector<const FunctionSamples *>
+ findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
+ mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
+ const FunctionSamples *findFunctionSamples(const Instruction &I) const;
// Attempt to promote indirect call and also inline the promoted call
bool tryPromoteAndInlineCandidate(
Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
uint64_t &Sum, DenseSet<Instruction *> &PromotedInsns,
SmallVector<CallBase *, 8> *InlinedCallSites = nullptr);
- bool inlineHotFunctions(Function &F,
- DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+ bool inlineHotFunctions(Function &F,
+ DenseSet<GlobalValue::GUID> &InlinedGUIDs);
InlineCost shouldInlineCandidate(InlineCandidate &Candidate);
bool getInlineCandidate(InlineCandidate *NewCandidate, CallBase *CB);
bool
@@ -450,442 +450,442 @@ protected:
bool
inlineHotFunctionsWithPriority(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
- // Inline cold/small functions in addition to hot ones
- bool shouldInlineColdCallee(CallBase &CallInst);
- void emitOptimizationRemarksForInlineCandidates(
- const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
- bool Hot);
- void printEdgeWeight(raw_ostream &OS, Edge E);
- void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
- void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
- bool computeBlockWeights(Function &F);
- void findEquivalenceClasses(Function &F);
- template <bool IsPostDom>
- void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
-
- void propagateWeights(Function &F);
- uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
- void buildEdges(Function &F);
- std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
+ // Inline cold/small functions in addition to hot ones
+ bool shouldInlineColdCallee(CallBase &CallInst);
+ void emitOptimizationRemarksForInlineCandidates(
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
+ bool Hot);
+ void printEdgeWeight(raw_ostream &OS, Edge E);
+ void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
+ void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
+ bool computeBlockWeights(Function &F);
+ void findEquivalenceClasses(Function &F);
+ template <bool IsPostDom>
+ void findEquivalencesFor(BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree);
+
+ void propagateWeights(Function &F);
+ uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
+ void buildEdges(Function &F);
+ std::vector<Function *> buildFunctionOrder(Module &M, CallGraph *CG);
void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples);
void replaceCallGraphEdges(CallGraph &CG, StringMap<Function *> &SymbolMap);
- bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
- void computeDominanceAndLoopInfo(Function &F);
- void clearFunctionData();
- bool callsiteIsHot(const FunctionSamples *CallsiteFS,
- ProfileSummaryInfo *PSI);
-
- /// Map basic blocks to their computed weights.
- ///
- /// The weight of a basic block is defined to be the maximum
- /// of all the instruction weights in that block.
- BlockWeightMap BlockWeights;
-
- /// Map edges to their computed weights.
- ///
- /// Edge weights are computed by propagating basic block weights in
- /// SampleProfile::propagateWeights.
- EdgeWeightMap EdgeWeights;
-
- /// Set of visited blocks during propagation.
- SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
-
- /// Set of visited edges during propagation.
- SmallSet<Edge, 32> VisitedEdges;
-
- /// Equivalence classes for block weights.
- ///
- /// Two blocks BB1 and BB2 are in the same equivalence class if they
- /// dominate and post-dominate each other, and they are in the same loop
- /// nest. When this happens, the two blocks are guaranteed to execute
- /// the same number of times.
- EquivalenceClassMap EquivalenceClass;
-
- /// Map from function name to Function *. Used to find the function from
- /// the function name. If the function name contains a suffix, an additional
- /// entry is added to map from the stripped name to the function when the
- /// mapping is one-to-one.
- StringMap<Function *> SymbolMap;
-
- /// Dominance, post-dominance and loop information.
- std::unique_ptr<DominatorTree> DT;
- std::unique_ptr<PostDominatorTree> PDT;
- std::unique_ptr<LoopInfo> LI;
-
- std::function<AssumptionCache &(Function &)> GetAC;
- std::function<TargetTransformInfo &(Function &)> GetTTI;
- std::function<const TargetLibraryInfo &(Function &)> GetTLI;
-
- /// Predecessors for each basic block in the CFG.
- BlockEdgeMap Predecessors;
-
- /// Successors for each basic block in the CFG.
- BlockEdgeMap Successors;
-
- SampleCoverageTracker CoverageTracker;
-
- /// Profile reader object.
- std::unique_ptr<SampleProfileReader> Reader;
-
+ bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
+ void computeDominanceAndLoopInfo(Function &F);
+ void clearFunctionData();
+ bool callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI);
+
+ /// Map basic blocks to their computed weights.
+ ///
+ /// The weight of a basic block is defined to be the maximum
+ /// of all the instruction weights in that block.
+ BlockWeightMap BlockWeights;
+
+ /// Map edges to their computed weights.
+ ///
+ /// Edge weights are computed by propagating basic block weights in
+ /// SampleProfile::propagateWeights.
+ EdgeWeightMap EdgeWeights;
+
+ /// Set of visited blocks during propagation.
+ SmallPtrSet<const BasicBlock *, 32> VisitedBlocks;
+
+ /// Set of visited edges during propagation.
+ SmallSet<Edge, 32> VisitedEdges;
+
+ /// Equivalence classes for block weights.
+ ///
+ /// Two blocks BB1 and BB2 are in the same equivalence class if they
+ /// dominate and post-dominate each other, and they are in the same loop
+ /// nest. When this happens, the two blocks are guaranteed to execute
+ /// the same number of times.
+ EquivalenceClassMap EquivalenceClass;
+
+ /// Map from function name to Function *. Used to find the function from
+ /// the function name. If the function name contains a suffix, an additional
+ /// entry is added to map from the stripped name to the function when the
+ /// mapping is one-to-one.
+ StringMap<Function *> SymbolMap;
+
+ /// Dominance, post-dominance and loop information.
+ std::unique_ptr<DominatorTree> DT;
+ std::unique_ptr<PostDominatorTree> PDT;
+ std::unique_ptr<LoopInfo> LI;
+
+ std::function<AssumptionCache &(Function &)> GetAC;
+ std::function<TargetTransformInfo &(Function &)> GetTTI;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
+
+ /// Predecessors for each basic block in the CFG.
+ BlockEdgeMap Predecessors;
+
+ /// Successors for each basic block in the CFG.
+ BlockEdgeMap Successors;
+
+ SampleCoverageTracker CoverageTracker;
+
+ /// Profile reader object.
+ std::unique_ptr<SampleProfileReader> Reader;
+
/// Profile tracker for different context.
std::unique_ptr<SampleContextTracker> ContextTracker;
- /// Samples collected for the body of this function.
- FunctionSamples *Samples = nullptr;
-
- /// Name of the profile file to load.
- std::string Filename;
-
- /// Name of the profile remapping file to load.
- std::string RemappingFilename;
-
- /// Flag indicating whether the profile input loaded successfully.
- bool ProfileIsValid = false;
-
+ /// Samples collected for the body of this function.
+ FunctionSamples *Samples = nullptr;
+
+ /// Name of the profile file to load.
+ std::string Filename;
+
+ /// Name of the profile remapping file to load.
+ std::string RemappingFilename;
+
+ /// Flag indicating whether the profile input loaded successfully.
+ bool ProfileIsValid = false;
+
/// Flag indicating whether input profile is context-sensitive
bool ProfileIsCS = false;
/// Flag indicating which LTO/ThinLTO phase the pass is invoked in.
- ///
+ ///
/// We need to know the LTO phase because for example in ThinLTOPrelink
/// phase, in annotation, we should not promote indirect calls. Instead,
/// we will mark GUIDs that needs to be annotated to the function.
ThinOrFullLTOPhase LTOPhase;
-
- /// Profile Summary Info computed from sample profile.
- ProfileSummaryInfo *PSI = nullptr;
-
- /// Profile symbol list tells whether a function name appears in the binary
- /// used to generate the current profile.
- std::unique_ptr<ProfileSymbolList> PSL;
-
- /// Total number of samples collected in this profile.
- ///
- /// This is the sum of all the samples collected in all the functions executed
- /// at runtime.
- uint64_t TotalCollectedSamples = 0;
-
- /// Optimization Remark Emitter used to emit diagnostic remarks.
- OptimizationRemarkEmitter *ORE = nullptr;
-
- // Information recorded when we declined to inline a call site because we
- // determined it was too cold, accumulated for each callee function.
- // Initially this is just the entry count.
- struct NotInlinedProfileInfo {
- uint64_t entryCount;
- };
- DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
-
- // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
- // all the function symbols defined or declared in current module.
- DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
-
- // All the Names used in FunctionSamples including outline function
- // names, inline instance names and call target names.
- StringSet<> NamesInProfile;
-
- // For symbols in the profile symbol list, whether to regard their profiles
- // as accurate. This is mainly decided by the existence of a profile symbol
- // list and the -profile-accurate-for-symsinlist flag, but it can be
- // overridden by -profile-sample-accurate or the profile-sample-accurate
- // attribute.
- bool ProfAccForSymsInList;
+
+ /// Profile Summary Info computed from sample profile.
+ ProfileSummaryInfo *PSI = nullptr;
+
+ /// Profile symbol list tells whether a function name appears in the binary
+ /// used to generate the current profile.
+ std::unique_ptr<ProfileSymbolList> PSL;
+
+ /// Total number of samples collected in this profile.
+ ///
+ /// This is the sum of all the samples collected in all the functions executed
+ /// at runtime.
+ uint64_t TotalCollectedSamples = 0;
+
+ /// Optimization Remark Emitter used to emit diagnostic remarks.
+ OptimizationRemarkEmitter *ORE = nullptr;
+
+ // Information recorded when we declined to inline a call site because we
+ // determined it was too cold, accumulated for each callee function.
+ // Initially this is just the entry count.
+ struct NotInlinedProfileInfo {
+ uint64_t entryCount;
+ };
+ DenseMap<Function *, NotInlinedProfileInfo> notInlinedCallInfo;
+
+ // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for
+ // all the function symbols defined or declared in current module.
+ DenseMap<uint64_t, StringRef> GUIDToFuncNameMap;
+
+ // All the Names used in FunctionSamples including outline function
+ // names, inline instance names and call target names.
+ StringSet<> NamesInProfile;
+
+ // For symbols in the profile symbol list, whether to regard their profiles
+ // as accurate. This is mainly decided by the existence of a profile symbol
+ // list and the -profile-accurate-for-symsinlist flag, but it can be
+ // overridden by -profile-sample-accurate or the profile-sample-accurate
+ // attribute.
+ bool ProfAccForSymsInList;
// External inline advisor used to replay inline decision from remarks.
std::unique_ptr<ReplayInlineAdvisor> ExternalInlineAdvisor;
// A pseudo probe helper to correlate the imported sample counts.
std::unique_ptr<PseudoProbeManager> ProbeManager;
-};
-
-class SampleProfileLoaderLegacyPass : public ModulePass {
-public:
- // Class identification, replacement for typeinfo
- static char ID;
-
+};
+
+class SampleProfileLoaderLegacyPass : public ModulePass {
+public:
+ // Class identification, replacement for typeinfo
+ static char ID;
+
SampleProfileLoaderLegacyPass(
StringRef Name = SampleProfileFile,
ThinOrFullLTOPhase LTOPhase = ThinOrFullLTOPhase::None)
- : ModulePass(ID), SampleLoader(
+ : ModulePass(ID), SampleLoader(
Name, SampleProfileRemappingFile, LTOPhase,
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- },
- [&](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- },
- [&](Function &F) -> TargetLibraryInfo & {
- return TLIWP->getTLI(F);
- }) {
- initializeSampleProfileLoaderLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void dump() { SampleLoader.dump(); }
-
- bool doInitialization(Module &M) override {
- return SampleLoader.doInitialization(M);
- }
-
- StringRef getPassName() const override { return "Sample profile pass"; }
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-
-private:
- SampleProfileLoader SampleLoader;
- AssumptionCacheTracker *ACT = nullptr;
- TargetTransformInfoWrapperPass *TTIWP = nullptr;
- TargetLibraryInfoWrapperPass *TLIWP = nullptr;
-};
-
-} // end anonymous namespace
-
-/// Return true if the given callsite is hot with respect to the hot cutoff threshold.
-///
-/// Functions that were inlined in the original binary will be represented
-/// in the inline stack in the sample profile. If the profile shows that
-/// the original inline decision was "good" (i.e., the callsite is executed
-/// frequently), then we will recreate the inline decision and apply the
-/// profile from the inlined callsite.
-///
-/// To decide whether an inlined callsite is hot, we compare the callsite
-/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
-/// callsite is regarded as hot if the count is above the cutoff value.
-///
-/// When ProfileAccurateForSymsInList is enabled and a profile symbol list
-/// is present, functions that are in the list but have no profile will be
-/// regarded as cold, and much less inlining will happen in the CGSCC inlining
-/// pass. We therefore lower the hot criteria here to allow more early
-/// inlining for warm callsites, which helps performance.
-bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
- ProfileSummaryInfo *PSI) {
- if (!CallsiteFS)
- return false; // The callsite was not inlined in the original binary.
-
- assert(PSI && "PSI is expected to be non null");
- uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
- if (ProfAccForSymsInList)
- return !PSI->isColdCount(CallsiteTotalSamples);
- else
- return PSI->isHotCount(CallsiteTotalSamples);
-}
-
-/// Mark as used the sample record for the given function samples at
-/// (LineOffset, Discriminator).
-///
-/// \returns true if this is the first time we mark the given record.
-bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
- uint32_t LineOffset,
- uint32_t Discriminator,
- uint64_t Samples) {
- LineLocation Loc(LineOffset, Discriminator);
- unsigned &Count = SampleCoverage[FS][Loc];
- bool FirstTime = (++Count == 1);
- if (FirstTime)
- TotalUsedSamples += Samples;
- return FirstTime;
-}
-
-/// Return the number of sample records that were applied from this profile.
-///
-/// This count does not include records from cold inlined callsites.
-unsigned
-SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- auto I = SampleCoverage.find(FS);
-
- // The size of the coverage map for FS represents the number of records
- // that were marked used at least once.
- unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
-
- // If there are inlined callsites in this function, count the samples found
- // in the respective bodies. However, do not bother counting callees with 0
- // total samples; these are callees that were never invoked at runtime.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Count += countUsedRecords(CalleeSamples, PSI);
- }
-
- return Count;
-}
-
-/// Return the number of sample records in the body of this profile.
-///
-/// This count does not include records from cold inlined callsites.
-unsigned
-SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- unsigned Count = FS->getBodySamples().size();
-
- // Only count records in hot callsites.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Count += countBodyRecords(CalleeSamples, PSI);
- }
-
- return Count;
-}
-
-/// Return the number of samples collected in the body of this profile.
-///
-/// This count does not include samples from cold inlined callsites.
-uint64_t
-SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
- ProfileSummaryInfo *PSI) const {
- uint64_t Total = 0;
- for (const auto &I : FS->getBodySamples())
- Total += I.second.getSamples();
-
- // Only count samples in hot callsites.
- for (const auto &I : FS->getCallsiteSamples())
- for (const auto &J : I.second) {
- const FunctionSamples *CalleeSamples = &J.second;
- if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
- Total += countBodySamples(CalleeSamples, PSI);
- }
-
- return Total;
-}
-
-/// Return the fraction of sample records used in this profile.
-///
-/// The returned value is an unsigned integer in the range 0-100 indicating
-/// the percentage of sample records that were used while applying this
-/// profile to the associated function.
-unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
- unsigned Total) const {
- assert(Used <= Total &&
- "number of used records cannot exceed the total number of records");
- return Total > 0 ? Used * 100 / Total : 100;
-}
-
-/// Clear all the per-function data used to load samples and propagate weights.
-void SampleProfileLoader::clearFunctionData() {
- BlockWeights.clear();
- EdgeWeights.clear();
- VisitedBlocks.clear();
- VisitedEdges.clear();
- EquivalenceClass.clear();
- DT = nullptr;
- PDT = nullptr;
- LI = nullptr;
- Predecessors.clear();
- Successors.clear();
- CoverageTracker.clear();
-}
-
-#ifndef NDEBUG
-/// Print the weight of edge \p E on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param E Edge to print.
-void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
- OS << "weight[" << E.first->getName() << "->" << E.second->getName()
- << "]: " << EdgeWeights[E] << "\n";
-}
-
-/// Print the equivalence class of block \p BB on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param BB Block to print.
-void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
- const BasicBlock *BB) {
- const BasicBlock *Equiv = EquivalenceClass[BB];
- OS << "equivalence[" << BB->getName()
- << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
-}
-
-/// Print the weight of block \p BB on stream \p OS.
-///
-/// \param OS Stream to emit the output to.
-/// \param BB Block to print.
-void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
- const BasicBlock *BB) const {
- const auto &I = BlockWeights.find(BB);
- uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
- OS << "weight[" << BB->getName() << "]: " << W << "\n";
-}
-#endif
-
-/// Get the weight for an instruction.
-///
-/// The "weight" of an instruction \p Inst is the number of samples
-/// collected on that instruction at runtime. To retrieve it, we
-/// need to compute the line number of \p Inst relative to the start of its
-/// function. We use HeaderLineno to compute the offset. We then
-/// look up the samples collected for \p Inst using BodySamples.
-///
-/// \param Inst Instruction to query.
-///
-/// \returns the weight of \p Inst.
-ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ },
+ [&](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ },
+ [&](Function &F) -> TargetLibraryInfo & {
+ return TLIWP->getTLI(F);
+ }) {
+ initializeSampleProfileLoaderLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void dump() { SampleLoader.dump(); }
+
+ bool doInitialization(Module &M) override {
+ return SampleLoader.doInitialization(M);
+ }
+
+ StringRef getPassName() const override { return "Sample profile pass"; }
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+
+private:
+ SampleProfileLoader SampleLoader;
+ AssumptionCacheTracker *ACT = nullptr;
+ TargetTransformInfoWrapperPass *TTIWP = nullptr;
+ TargetLibraryInfoWrapperPass *TLIWP = nullptr;
+};
+
+} // end anonymous namespace
+
+/// Return true if the given callsite is hot with respect to the hot cutoff threshold.
+///
+/// Functions that were inlined in the original binary will be represented
+/// in the inline stack in the sample profile. If the profile shows that
+/// the original inline decision was "good" (i.e., the callsite is executed
+/// frequently), then we will recreate the inline decision and apply the
+/// profile from the inlined callsite.
+///
+/// To decide whether an inlined callsite is hot, we compare the callsite
+/// sample count with the hot cutoff computed by ProfileSummaryInfo; the
+/// callsite is regarded as hot if the count is above the cutoff value.
+///
+/// When ProfileAccurateForSymsInList is enabled and a profile symbol list
+/// is present, functions that are in the list but have no profile will be
+/// regarded as cold, and much less inlining will happen in the CGSCC inlining
+/// pass. We therefore lower the hot criteria here to allow more early
+/// inlining for warm callsites, which helps performance.
+bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS,
+ ProfileSummaryInfo *PSI) {
+ if (!CallsiteFS)
+ return false; // The callsite was not inlined in the original binary.
+
+ assert(PSI && "PSI is expected to be non null");
+ uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples();
+ if (ProfAccForSymsInList)
+ return !PSI->isColdCount(CallsiteTotalSamples);
+ else
+ return PSI->isHotCount(CallsiteTotalSamples);
+}
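A minimal sketch of the hotness decision re-added above, with invented cutoff values standing in for what ProfileSummaryInfo would compute; it is not part of the patch and the names are hypothetical.

#include <cstdint>

// Sketch only: approximates callsiteIsHot() with made-up cutoffs instead of
// a real ProfileSummaryInfo.
static bool sketchCallsiteIsHot(uint64_t CallsiteTotalSamples,
                                bool ProfAccForSymsInList) {
  const uint64_t AssumedColdCutoff = 10;  // hypothetical cold-count cutoff
  const uint64_t AssumedHotCutoff = 1000; // hypothetical hot-count cutoff
  if (ProfAccForSymsInList)
    return CallsiteTotalSamples > AssumedColdCutoff; // merely "not cold"
  return CallsiteTotalSamples >= AssumedHotCutoff;   // must clear the hot bar
}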
+
+/// Mark as used the sample record for the given function samples at
+/// (LineOffset, Discriminator).
+///
+/// \returns true if this is the first time we mark the given record.
+bool SampleCoverageTracker::markSamplesUsed(const FunctionSamples *FS,
+ uint32_t LineOffset,
+ uint32_t Discriminator,
+ uint64_t Samples) {
+ LineLocation Loc(LineOffset, Discriminator);
+ unsigned &Count = SampleCoverage[FS][Loc];
+ bool FirstTime = (++Count == 1);
+ if (FirstTime)
+ TotalUsedSamples += Samples;
+ return FirstTime;
+}
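The bookkeeping above can be illustrated with a stripped-down coverage map: records are keyed by (LineOffset, Discriminator) and samples accumulate only the first time a record is marked. The sketch below uses hypothetical names and is not part of the patch.

#include <cstdint>
#include <map>
#include <utility>

// Sketch-only stand-ins for the coverage map and the used-samples counter.
static std::map<std::pair<uint32_t, uint32_t>, unsigned> SketchCoverage;
static uint64_t SketchTotalUsedSamples = 0;

static bool sketchMarkSamplesUsed(uint32_t LineOffset, uint32_t Discriminator,
                                  uint64_t Samples) {
  unsigned &Count = SketchCoverage[{LineOffset, Discriminator}];
  bool FirstTime = (++Count == 1);
  if (FirstTime)
    SketchTotalUsedSamples += Samples; // samples are only counted once
  return FirstTime;
}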
+
+/// Return the number of sample records that were applied from this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ auto I = SampleCoverage.find(FS);
+
+ // The size of the coverage map for FS represents the number of records
+ // that were marked used at least once.
+ unsigned Count = (I != SampleCoverage.end()) ? I->second.size() : 0;
+
+ // If there are inlined callsites in this function, count the samples found
+ // in the respective bodies. However, do not bother counting callees with 0
+ // total samples; these are callees that were never invoked at runtime.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Count += countUsedRecords(CalleeSamples, PSI);
+ }
+
+ return Count;
+}
+
+/// Return the number of sample records in the body of this profile.
+///
+/// This count does not include records from cold inlined callsites.
+unsigned
+SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ unsigned Count = FS->getBodySamples().size();
+
+ // Only count records in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Count += countBodyRecords(CalleeSamples, PSI);
+ }
+
+ return Count;
+}
+
+/// Return the number of samples collected in the body of this profile.
+///
+/// This count does not include samples from cold inlined callsites.
+uint64_t
+SampleCoverageTracker::countBodySamples(const FunctionSamples *FS,
+ ProfileSummaryInfo *PSI) const {
+ uint64_t Total = 0;
+ for (const auto &I : FS->getBodySamples())
+ Total += I.second.getSamples();
+
+ // Only count samples in hot callsites.
+ for (const auto &I : FS->getCallsiteSamples())
+ for (const auto &J : I.second) {
+ const FunctionSamples *CalleeSamples = &J.second;
+ if (SPLoader.callsiteIsHot(CalleeSamples, PSI))
+ Total += countBodySamples(CalleeSamples, PSI);
+ }
+
+ return Total;
+}
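A sketch of the recursive accumulation performed by the three counting helpers above, with the profile tree reduced to a bare struct and the hotness check replaced by an invented cutoff; it is not part of the patch.

#include <cstdint>
#include <vector>

// Sketch only: a drastically simplified stand-in for FunctionSamples.
struct SketchSamples {
  uint64_t BodyTotal = 0;
  std::vector<SketchSamples> InlinedCallsites;
};

static uint64_t sketchCountBodySamples(const SketchSamples &FS) {
  uint64_t Total = FS.BodyTotal;
  for (const auto &Callee : FS.InlinedCallsites)
    if (Callee.BodyTotal > 10) // invented stand-in for callsiteIsHot()
      Total += sketchCountBodySamples(Callee);
  return Total;
}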
+
+/// Return the fraction of sample records used in this profile.
+///
+/// The returned value is an unsigned integer in the range 0-100 indicating
+/// the percentage of sample records that were used while applying this
+/// profile to the associated function.
+unsigned SampleCoverageTracker::computeCoverage(unsigned Used,
+ unsigned Total) const {
+ assert(Used <= Total &&
+ "number of used records cannot exceed the total number of records");
+ return Total > 0 ? Used * 100 / Total : 100;
+}
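A worked example of the percentage formula above, assuming plain integer division as in the code; it is not part of the patch.

// 45 used records out of 60 total gives 45 * 100 / 60 == 75, i.e. 75%
// coverage, which would trip a -sample-profile-check-record-coverage=80
// warning.
static_assert(45 * 100 / 60 == 75, "75% record coverage in this example");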
+
+/// Clear all the per-function data used to load samples and propagate weights.
+void SampleProfileLoader::clearFunctionData() {
+ BlockWeights.clear();
+ EdgeWeights.clear();
+ VisitedBlocks.clear();
+ VisitedEdges.clear();
+ EquivalenceClass.clear();
+ DT = nullptr;
+ PDT = nullptr;
+ LI = nullptr;
+ Predecessors.clear();
+ Successors.clear();
+ CoverageTracker.clear();
+}
+
+#ifndef NDEBUG
+/// Print the weight of edge \p E on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param E Edge to print.
+void SampleProfileLoader::printEdgeWeight(raw_ostream &OS, Edge E) {
+ OS << "weight[" << E.first->getName() << "->" << E.second->getName()
+ << "]: " << EdgeWeights[E] << "\n";
+}
+
+/// Print the equivalence class of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockEquivalence(raw_ostream &OS,
+ const BasicBlock *BB) {
+ const BasicBlock *Equiv = EquivalenceClass[BB];
+ OS << "equivalence[" << BB->getName()
+ << "]: " << ((Equiv) ? EquivalenceClass[BB]->getName() : "NONE") << "\n";
+}
+
+/// Print the weight of block \p BB on stream \p OS.
+///
+/// \param OS Stream to emit the output to.
+/// \param BB Block to print.
+void SampleProfileLoader::printBlockWeight(raw_ostream &OS,
+ const BasicBlock *BB) const {
+ const auto &I = BlockWeights.find(BB);
+ uint64_t W = (I == BlockWeights.end() ? 0 : I->second);
+ OS << "weight[" << BB->getName() << "]: " << W << "\n";
+}
+#endif
+
+/// Get the weight for an instruction.
+///
+/// The "weight" of an instruction \p Inst is the number of samples
+/// collected on that instruction at runtime. To retrieve it, we
+/// need to compute the line number of \p Inst relative to the start of its
+/// function. We use HeaderLineno to compute the offset. We then
+/// look up the samples collected for \p Inst using BodySamples.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the weight of \p Inst.
+ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
if (FunctionSamples::ProfileIsProbeBased)
return getProbeWeight(Inst);
- const DebugLoc &DLoc = Inst.getDebugLoc();
- if (!DLoc)
- return std::error_code();
-
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (!FS)
- return std::error_code();
-
- // Ignore all intrinsics, phi nodes and branch instructions.
- // Branch and phi node instructions usually carry debug info from sources
- // outside of the basic block they reside in, so we ignore them during annotation.
- if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
- return std::error_code();
-
- // If a direct call/invoke instruction is inlined in profile
- // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
- // it means that the inlined callsite has no sample, thus the call
- // instruction should have 0 count.
+ const DebugLoc &DLoc = Inst.getDebugLoc();
+ if (!DLoc)
+ return std::error_code();
+
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (!FS)
+ return std::error_code();
+
+ // Ignore all intrinsics, phi nodes and branch instructions.
+ // Branch and phi node instructions usually carry debug info from sources
+ // outside of the basic block they reside in, so we ignore them during annotation.
+ if (isa<BranchInst>(Inst) || isa<IntrinsicInst>(Inst) || isa<PHINode>(Inst))
+ return std::error_code();
+
+ // If a direct call/invoke instruction is inlined in profile
+ // (findCalleeFunctionSamples returns non-empty result), but not inlined here,
+ // it means that the inlined callsite has no sample, thus the call
+ // instruction should have 0 count.
if (!ProfileIsCS)
if (const auto *CB = dyn_cast<CallBase>(&Inst))
if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
return 0;
-
- const DILocation *DIL = DLoc;
- uint32_t LineOffset = FunctionSamples::getOffset(DIL);
- uint32_t Discriminator = DIL->getBaseDiscriminator();
- ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
- if (R) {
- bool FirstMark =
- CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
- if (FirstMark) {
- ORE->emit([&]() {
- OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
- Remark << "Applied " << ore::NV("NumSamples", *R);
- Remark << " samples from profile (offset: ";
- Remark << ore::NV("LineOffset", LineOffset);
- if (Discriminator) {
- Remark << ".";
- Remark << ore::NV("Discriminator", Discriminator);
- }
- Remark << ")";
- return Remark;
- });
- }
- LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
- << DIL->getBaseDiscriminator() << ":" << Inst
- << " (line offset: " << LineOffset << "."
- << DIL->getBaseDiscriminator() << " - weight: " << R.get()
- << ")\n");
- }
- return R;
-}
-
+
+ const DILocation *DIL = DLoc;
+ uint32_t LineOffset = FunctionSamples::getOffset(DIL);
+ uint32_t Discriminator = DIL->getBaseDiscriminator();
+ ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
+ if (R) {
+ bool FirstMark =
+ CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
+ if (FirstMark) {
+ ORE->emit([&]() {
+ OptimizationRemarkAnalysis Remark(DEBUG_TYPE, "AppliedSamples", &Inst);
+ Remark << "Applied " << ore::NV("NumSamples", *R);
+ Remark << " samples from profile (offset: ";
+ Remark << ore::NV("LineOffset", LineOffset);
+ if (Discriminator) {
+ Remark << ".";
+ Remark << ore::NV("Discriminator", Discriminator);
+ }
+ Remark << ")";
+ return Remark;
+ });
+ }
+ LLVM_DEBUG(dbgs() << " " << DLoc.getLine() << "."
+ << DIL->getBaseDiscriminator() << ":" << Inst
+ << " (line offset: " << LineOffset << "."
+ << DIL->getBaseDiscriminator() << " - weight: " << R.get()
+ << ")\n");
+ }
+ return R;
+}
+
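A minimal sketch of the (line offset, discriminator) lookup described above, with an invented sample table; unlike the real code, misses simply return 0 rather than an error. It is not part of the patch.

#include <cstdint>
#include <map>
#include <utility>

// Sketch only: body samples keyed by (line offset from the function header,
// discriminator). Table contents are made up.
static uint64_t sketchInstWeight(uint32_t InstLine, uint32_t HeaderLine,
                                 uint32_t Discriminator) {
  static const std::map<std::pair<uint32_t, uint32_t>, uint64_t> BodySamples =
      {{{2, 0}, 120}, {{3, 0}, 40}, {{3, 1}, 15}};
  uint32_t LineOffset = InstLine - HeaderLine; // assumes InstLine >= HeaderLine
  auto It = BodySamples.find({LineOffset, Discriminator});
  return It == BodySamples.end() ? 0 : It->second;
}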
ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
assert(FunctionSamples::ProfileIsProbeBased &&
"Profile is not pseudo probe based");
@@ -931,96 +931,96 @@ ErrorOr<uint64_t> SampleProfileLoader::getProbeWeight(const Instruction &Inst) {
return R;
}
-/// Compute the weight of a basic block.
-///
-/// The weight of basic block \p BB is the maximum weight of all the
-/// instructions in BB.
-///
-/// \param BB The basic block to query.
-///
-/// \returns the weight for \p BB.
-ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
- uint64_t Max = 0;
- bool HasWeight = false;
- for (auto &I : BB->getInstList()) {
- const ErrorOr<uint64_t> &R = getInstWeight(I);
- if (R) {
- Max = std::max(Max, R.get());
- HasWeight = true;
- }
- }
- return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
-}
-
-/// Compute and store the weights of every basic block.
-///
-/// This populates the BlockWeights map by computing
-/// the weights of every basic block in the CFG.
-///
-/// \param F The function to query.
-bool SampleProfileLoader::computeBlockWeights(Function &F) {
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "Block weights\n");
- for (const auto &BB : F) {
- ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
- if (Weight) {
- BlockWeights[&BB] = Weight.get();
- VisitedBlocks.insert(&BB);
- Changed = true;
- }
- LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
- }
-
- return Changed;
-}
-
-/// Get the FunctionSamples for a call instruction.
-///
-/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
-/// instance that the call instruction calls into. It contains
-/// all samples that reside in the inlined instance. We first find the
-/// inlined instance in which the call instruction is from, then we
-/// traverse its children to find the callsite with the matching
-/// location.
-///
-/// \param Inst Call/Invoke instruction to query.
-///
-/// \returns The FunctionSamples pointer to the inlined instance.
-const FunctionSamples *
-SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
- const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL) {
- return nullptr;
- }
-
- StringRef CalleeName;
+/// Compute the weight of a basic block.
+///
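A minimal sketch of the suffix stripping discussed in the mapper above; the real helper is FunctionSamples::getCanonicalFnName, and this stand-in only handles the ".llvm.<hash>" case added by local-to-global promotion. It is not part of the patch.

#include <string>

// Sketch only: drop a ".llvm.<hash>" suffix, if present, to recover the
// name used in the sample profile.
static std::string sketchCanonicalName(const std::string &Name) {
  std::string::size_type Pos = Name.find(".llvm.");
  return Pos == std::string::npos ? Name : Name.substr(0, Pos);
}

// sketchCanonicalName("foo.llvm.123456") == "foo"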
+/// The weight of basic block \p BB is the maximum weight of all the
+/// instructions in BB.
+///
+/// \param BB The basic block to query.
+///
+/// \returns the weight for \p BB.
+ErrorOr<uint64_t> SampleProfileLoader::getBlockWeight(const BasicBlock *BB) {
+ uint64_t Max = 0;
+ bool HasWeight = false;
+ for (auto &I : BB->getInstList()) {
+ const ErrorOr<uint64_t> &R = getInstWeight(I);
+ if (R) {
+ Max = std::max(Max, R.get());
+ HasWeight = true;
+ }
+ }
+ return HasWeight ? ErrorOr<uint64_t>(Max) : std::error_code();
+}
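A minimal sketch of the block-weight rule above: take the maximum over the instructions that have a weight, and leave the block unknown otherwise. It is not part of the patch.

#include <algorithm>
#include <cstdint>
#include <optional>
#include <vector>

// Sketch only: each element is an instruction's weight, or empty if the
// instruction had no usable sample record.
static std::optional<uint64_t>
sketchBlockWeight(const std::vector<std::optional<uint64_t>> &InstWeights) {
  std::optional<uint64_t> Max;
  for (const auto &W : InstWeights)
    if (W)
      Max = std::max(Max.value_or(0), *W);
  return Max;
}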
+
+/// Compute and store the weights of every basic block.
+///
+/// This populates the BlockWeights map by computing
+/// the weights of every basic block in the CFG.
+///
+/// \param F The function to query.
+bool SampleProfileLoader::computeBlockWeights(Function &F) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Block weights\n");
+ for (const auto &BB : F) {
+ ErrorOr<uint64_t> Weight = getBlockWeight(&BB);
+ if (Weight) {
+ BlockWeights[&BB] = Weight.get();
+ VisitedBlocks.insert(&BB);
+ Changed = true;
+ }
+ LLVM_DEBUG(printBlockWeight(dbgs(), &BB));
+ }
+
+ return Changed;
+}
+
+/// Get the FunctionSamples for a call instruction.
+///
+/// The FunctionSamples of a call/invoke instruction \p Inst is the inlined
+/// instance that the call instruction calls into. It contains
+/// all samples that reside in the inlined instance. We first find the
+/// inlined instance in which the call instruction is from, then we
+/// traverse its children to find the callsite with the matching
+/// location.
+///
+/// \param Inst Call/Invoke instruction to query.
+///
+/// \returns The FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL) {
+ return nullptr;
+ }
+
+ StringRef CalleeName;
if (Function *Callee = Inst.getCalledFunction())
CalleeName = FunctionSamples::getCanonicalFnName(*Callee);
-
+
if (ProfileIsCS)
return ContextTracker->getCalleeContextSamplesFor(Inst, CalleeName);
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (FS == nullptr)
- return nullptr;
-
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return nullptr;
+
return FS->findFunctionSamplesAt(FunctionSamples::getCallSiteIdentifier(DIL),
CalleeName, Reader->getRemapper());
-}
-
-/// Returns a vector of FunctionSamples that are the indirect call targets
-/// of \p Inst. The vector is sorted by the total number of samples. Stores
-/// the total call count of the indirect call in \p Sum.
-std::vector<const FunctionSamples *>
-SampleProfileLoader::findIndirectCallFunctionSamples(
- const Instruction &Inst, uint64_t &Sum) const {
- const DILocation *DIL = Inst.getDebugLoc();
- std::vector<const FunctionSamples *> R;
-
- if (!DIL) {
- return R;
- }
-
+}
+
+/// Returns a vector of FunctionSamples that are the indirect call targets
+/// of \p Inst. The vector is sorted by the total number of samples. Stores
+/// the total call count of the indirect call in \p Sum.
+std::vector<const FunctionSamples *>
+SampleProfileLoader::findIndirectCallFunctionSamples(
+ const Instruction &Inst, uint64_t &Sum) const {
+ const DILocation *DIL = Inst.getDebugLoc();
+ std::vector<const FunctionSamples *> R;
+
+ if (!DIL) {
+ return R;
+ }
+
auto FSCompare = [](const FunctionSamples *L, const FunctionSamples *R) {
assert(L && R && "Expect non-null FunctionSamples");
if (L->getEntrySamples() != R->getEntrySamples())
@@ -1046,50 +1046,50 @@ SampleProfileLoader::findIndirectCallFunctionSamples(
return R;
}
- const FunctionSamples *FS = findFunctionSamples(Inst);
- if (FS == nullptr)
- return R;
-
+ const FunctionSamples *FS = findFunctionSamples(Inst);
+ if (FS == nullptr)
+ return R;
+
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
- Sum = 0;
- if (T)
- for (const auto &T_C : T.get())
- Sum += T_C.second;
+ Sum = 0;
+ if (T)
+ for (const auto &T_C : T.get())
+ Sum += T_C.second;
if (const FunctionSamplesMap *M = FS->findFunctionSamplesMapAt(CallSite)) {
- if (M->empty())
- return R;
- for (const auto &NameFS : *M) {
- Sum += NameFS.second.getEntrySamples();
- R.push_back(&NameFS.second);
- }
+ if (M->empty())
+ return R;
+ for (const auto &NameFS : *M) {
+ Sum += NameFS.second.getEntrySamples();
+ R.push_back(&NameFS.second);
+ }
llvm::sort(R, FSCompare);
- }
- return R;
-}
-
-/// Get the FunctionSamples for an instruction.
-///
-/// The FunctionSamples of an instruction \p Inst is the inlined instance
-/// that the instruction comes from. We traverse the inline stack
-/// of that instruction, and match it with the tree nodes in the profile.
-///
-/// \param Inst Instruction to query.
-///
-/// \returns the FunctionSamples pointer to the inlined instance.
-const FunctionSamples *
-SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
+ }
+ return R;
+}
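
The ordering used above (targets sorted by sample count, with Sum accumulating the total call count) can be restated as a small self-contained sketch; the ToyCalleeProfile struct and function name below are hypothetical stand-ins for FunctionSamples, not the actual interfaces.

#include <algorithm>
#include <cstdint>
#include <string>
#include <vector>

struct ToyCalleeProfile {
  std::string Name;
  uint64_t EntrySamples;
};

// Accumulate the total call count into Sum and return the candidate callee
// profiles sorted by entry samples, largest first (name breaks ties so the
// order is deterministic).
std::vector<ToyCalleeProfile>
toySortIndirectTargets(std::vector<ToyCalleeProfile> Targets, uint64_t &Sum) {
  Sum = 0;
  for (const auto &T : Targets)
    Sum += T.EntrySamples;
  std::sort(Targets.begin(), Targets.end(),
            [](const ToyCalleeProfile &L, const ToyCalleeProfile &R) {
              if (L.EntrySamples != R.EntrySamples)
                return L.EntrySamples > R.EntrySamples;
              return L.Name < R.Name;
            });
  return Targets;
}
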
+
+/// Get the FunctionSamples for an instruction.
+///
+/// The FunctionSamples of an instruction \p Inst is the inlined instance
+/// that the instruction comes from. We traverse the inline stack
+/// of that instruction, and match it with the tree nodes in the profile.
+///
+/// \param Inst Instruction to query.
+///
+/// \returns the FunctionSamples pointer to the inlined instance.
+const FunctionSamples *
+SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
if (FunctionSamples::ProfileIsProbeBased) {
Optional<PseudoProbe> Probe = extractProbe(Inst);
if (!Probe)
return nullptr;
}
- const DILocation *DIL = Inst.getDebugLoc();
- if (!DIL)
- return Samples;
-
- auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
+ const DILocation *DIL = Inst.getDebugLoc();
+ if (!DIL)
+ return Samples;
+
+ auto it = DILocation2SampleMap.try_emplace(DIL,nullptr);
if (it.second) {
if (ProfileIsCS)
it.first->second = ContextTracker->getContextSamplesFor(DIL);
@@ -1097,9 +1097,9 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
it.first->second =
Samples->findFunctionSamples(DIL, Reader->getRemapper());
}
- return it.first->second;
-}
-
+ return it.first->second;
+}
+
/// Attempt to promote indirect call and also inline the promoted call.
///
/// \param F Caller function.
@@ -1158,175 +1158,175 @@ bool SampleProfileLoader::tryPromoteAndInlineCandidate(
LLVM_DEBUG(dbgs() << "\nFailed to promote indirect call to "
<< Candidate.CalleeSamples->getFuncName() << " because "
<< Reason << "\n");
- }
- return false;
-}
-
-bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
- if (!ProfileSizeInline)
- return false;
-
- Function *Callee = CallInst.getCalledFunction();
- if (Callee == nullptr)
- return false;
-
- InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
- GetAC, GetTLI);
-
+ }
+ return false;
+}
+
+bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
+ if (!ProfileSizeInline)
+ return false;
+
+ Function *Callee = CallInst.getCalledFunction();
+ if (Callee == nullptr)
+ return false;
+
+ InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
+ GetAC, GetTLI);
+
if (Cost.isNever())
return false;
if (Cost.isAlways())
return true;
- return Cost.getCost() <= SampleColdCallSiteThreshold;
-}
-
-void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
- const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
- bool Hot) {
- for (auto I : Candidates) {
- Function *CalledFunction = I->getCalledFunction();
- if (CalledFunction) {
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
- I->getDebugLoc(), I->getParent())
- << "previous inlining reattempted for "
- << (Hot ? "hotness: '" : "size: '")
- << ore::NV("Callee", CalledFunction) << "' into '"
- << ore::NV("Caller", &F) << "'");
- }
- }
-}
-
-/// Iteratively inline hot callsites of a function.
-///
-/// Iteratively traverse all callsites of the function \p F, and determine
-/// whether the corresponding inlined instance exists and is hot in the
-/// profile. If it is hot enough, inline the callsite and add the new
-/// callsites of the callee into the caller. If the call is an indirect call,
-/// first promote it to a direct call. Each indirect call is limited to a
-/// single target.
-///
-/// \param F function to perform iterative inlining.
-/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
-/// inlined in the profiled binary.
-///
-/// \returns True if any inlining happened.
-bool SampleProfileLoader::inlineHotFunctions(
- Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
- DenseSet<Instruction *> PromotedInsns;
-
- // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
- // Profile symbol list is ignored when profile-sample-accurate is on.
- assert((!ProfAccForSymsInList ||
- (!ProfileSampleAccurate &&
- !F.hasFnAttribute("profile-sample-accurate"))) &&
- "ProfAccForSymsInList should be false when profile-sample-accurate "
- "is enabled");
-
+ return Cost.getCost() <= SampleColdCallSiteThreshold;
+}
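
The decision above reduces to a three-way check on the inline cost; the sketch below restates it with hypothetical types (ToyCostKind, plain ints for the cost and threshold) rather than the real InlineCost interface.

// Never/Always verdicts short-circuit; otherwise a cold callsite is inlined
// only when its cost stays within the size-oriented cold threshold.
enum class ToyCostKind { Never, Always, Variable };

bool toyShouldInlineCold(ToyCostKind Kind, int Cost, int ColdThreshold) {
  if (Kind == ToyCostKind::Never)
    return false;
  if (Kind == ToyCostKind::Always)
    return true;
  return Cost <= ColdThreshold;
}
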
+
+void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
+ bool Hot) {
+ for (auto I : Candidates) {
+ Function *CalledFunction = I->getCalledFunction();
+ if (CalledFunction) {
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
+ I->getDebugLoc(), I->getParent())
+ << "previous inlining reattempted for "
+ << (Hot ? "hotness: '" : "size: '")
+ << ore::NV("Callee", CalledFunction) << "' into '"
+ << ore::NV("Caller", &F) << "'");
+ }
+ }
+}
+
+/// Iteratively inline hot callsites of a function.
+///
+/// Iteratively traverse all callsites of the function \p F, and determine
+/// whether the corresponding inlined instance exists and is hot in the
+/// profile. If it is hot enough, inline the callsite and add the new
+/// callsites of the callee into the caller. If the call is an indirect call,
+/// first promote it to a direct call. Each indirect call is limited to a
+/// single target.
+///
+/// \param F function to perform iterative inlining.
+/// \param InlinedGUIDs a set to be updated to include all GUIDs that are
+/// inlined in the profiled binary.
+///
+/// \returns True if any inlining happened.
+bool SampleProfileLoader::inlineHotFunctions(
+ Function &F, DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+ DenseSet<Instruction *> PromotedInsns;
+
+ // ProfAccForSymsInList is used in callsiteIsHot. The assertion makes sure
+ // Profile symbol list is ignored when profile-sample-accurate is on.
+ assert((!ProfAccForSymsInList ||
+ (!ProfileSampleAccurate &&
+ !F.hasFnAttribute("profile-sample-accurate"))) &&
+ "ProfAccForSymsInList should be false when profile-sample-accurate "
+ "is enabled");
+
DenseMap<CallBase *, const FunctionSamples *> LocalNotInlinedCallSites;
- bool Changed = false;
+ bool Changed = false;
bool LocalChanged = true;
while (LocalChanged) {
LocalChanged = false;
- SmallVector<CallBase *, 10> CIS;
- for (auto &BB : F) {
- bool Hot = false;
- SmallVector<CallBase *, 10> AllCandidates;
- SmallVector<CallBase *, 10> ColdCandidates;
- for (auto &I : BB.getInstList()) {
- const FunctionSamples *FS = nullptr;
- if (auto *CB = dyn_cast<CallBase>(&I)) {
- if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
+ SmallVector<CallBase *, 10> CIS;
+ for (auto &BB : F) {
+ bool Hot = false;
+ SmallVector<CallBase *, 10> AllCandidates;
+ SmallVector<CallBase *, 10> ColdCandidates;
+ for (auto &I : BB.getInstList()) {
+ const FunctionSamples *FS = nullptr;
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
assert((!FunctionSamples::UseMD5 || FS->GUIDToFuncNameMap) &&
"GUIDToFuncNameMap has to be populated");
- AllCandidates.push_back(CB);
+ AllCandidates.push_back(CB);
if (FS->getEntrySamples() > 0 || ProfileIsCS)
LocalNotInlinedCallSites.try_emplace(CB, FS);
- if (callsiteIsHot(FS, PSI))
- Hot = true;
- else if (shouldInlineColdCallee(*CB))
- ColdCandidates.push_back(CB);
- }
- }
- }
+ if (callsiteIsHot(FS, PSI))
+ Hot = true;
+ else if (shouldInlineColdCallee(*CB))
+ ColdCandidates.push_back(CB);
+ }
+ }
+ }
if (Hot || ExternalInlineAdvisor) {
- CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
- emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
- } else {
- CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
- emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
- }
- }
- for (CallBase *I : CIS) {
- Function *CalledFunction = I->getCalledFunction();
+ CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
+ emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
+ } else {
+ CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
+ emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
+ }
+ }
+ for (CallBase *I : CIS) {
+ Function *CalledFunction = I->getCalledFunction();
InlineCandidate Candidate = {
I,
LocalNotInlinedCallSites.count(I) ? LocalNotInlinedCallSites[I]
: nullptr,
0 /* dummy count */, 1.0 /* dummy distribution factor */};
- // Do not inline recursive calls.
- if (CalledFunction == &F)
- continue;
- if (I->isIndirectCall()) {
- if (PromotedInsns.count(I))
- continue;
- uint64_t Sum;
- for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
+ // Do not inline recursive calls.
+ if (CalledFunction == &F)
+ continue;
+ if (I->isIndirectCall()) {
+ if (PromotedInsns.count(I))
+ continue;
+ uint64_t Sum;
+ for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
uint64_t SumOrigin = Sum;
if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
- PSI->getOrCompHotCountThreshold());
- continue;
- }
- if (!callsiteIsHot(FS, PSI))
- continue;
-
+ FS->findInlinedFunctions(InlinedGUIDs, F.getParent(),
+ PSI->getOrCompHotCountThreshold());
+ continue;
+ }
+ if (!callsiteIsHot(FS, PSI))
+ continue;
+
Candidate = {I, FS, FS->getEntrySamples(), 1.0};
if (tryPromoteAndInlineCandidate(F, Candidate, SumOrigin, Sum,
PromotedInsns)) {
LocalNotInlinedCallSites.erase(I);
LocalChanged = true;
- }
- }
- } else if (CalledFunction && CalledFunction->getSubprogram() &&
- !CalledFunction->isDeclaration()) {
+ }
+ }
+ } else if (CalledFunction && CalledFunction->getSubprogram() &&
+ !CalledFunction->isDeclaration()) {
if (tryInlineCandidate(Candidate)) {
LocalNotInlinedCallSites.erase(I);
- LocalChanged = true;
- }
+ LocalChanged = true;
+ }
} else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
- findCalleeFunctionSamples(*I)->findInlinedFunctions(
- InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
- }
- }
+ findCalleeFunctionSamples(*I)->findInlinedFunctions(
+ InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold());
+ }
+ }
Changed |= LocalChanged;
- }
-
+ }
+
   // For CS profile, the profile of a not-inlined context will be merged when
   // the base profile is being retrieved.
if (ProfileIsCS)
return Changed;
- // Accumulate not inlined callsite information into notInlinedSamples
+ // Accumulate not inlined callsite information into notInlinedSamples
for (const auto &Pair : LocalNotInlinedCallSites) {
- CallBase *I = Pair.getFirst();
- Function *Callee = I->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- continue;
-
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
- I->getDebugLoc(), I->getParent())
- << "previous inlining not repeated: '"
- << ore::NV("Callee", Callee) << "' into '"
- << ore::NV("Caller", &F) << "'");
-
- ++NumCSNotInlined;
- const FunctionSamples *FS = Pair.getSecond();
- if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
- continue;
- }
-
- if (ProfileMergeInlinee) {
+ CallBase *I = Pair.getFirst();
+ Function *Callee = I->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "NotInline",
+ I->getDebugLoc(), I->getParent())
+ << "previous inlining not repeated: '"
+ << ore::NV("Callee", Callee) << "' into '"
+ << ore::NV("Caller", &F) << "'");
+
+ ++NumCSNotInlined;
+ const FunctionSamples *FS = Pair.getSecond();
+ if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) {
+ continue;
+ }
+
+ if (ProfileMergeInlinee) {
// A function call can be replicated by optimizations like callsite
// splitting or jump threading and the replicates end up sharing the
// sample nested callee profile instead of slicing the original inlinee's
@@ -1337,22 +1337,22 @@ bool SampleProfileLoader::inlineHotFunctions(
// don't have head samples.
const_cast<FunctionSamples *>(FS)->addHeadSamples(
FS->getEntrySamples());
-
+
// Note that we have to do the merge right after processing function.
// This allows OutlineFS's profile to be used for annotation during
// top-down processing of functions' annotation.
FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee);
OutlineFS->merge(*FS);
}
- } else {
- auto pair =
- notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
- pair.first->second.entryCount += FS->getEntrySamples();
- }
- }
- return Changed;
-}
-
+ } else {
+ auto pair =
+ notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0});
+ pair.first->second.entryCount += FS->getEntrySamples();
+ }
+ }
+ return Changed;
+}
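
The per-block candidate selection in the loop above can be summarized on its own: if any profiled callsite in the block is hot (or an external inline advisor is installed), every profiled callsite of that block is queued, otherwise only the cold ones that pass the size check are. The ToyCallSite record and helper below are illustrative assumptions, not part of the pass.

#include <vector>

struct ToyCallSite {
  int Id;
  bool Hot;        // result of callsiteIsHot on its inline-instance profile
  bool InlineCold; // result of shouldInlineColdCallee
};

std::vector<ToyCallSite>
toySelectCandidates(const std::vector<ToyCallSite> &BlockCandidates,
                    bool HasExternalAdvisor) {
  bool AnyHot = HasExternalAdvisor;
  std::vector<ToyCallSite> Cold;
  for (const auto &CS : BlockCandidates) {
    if (CS.Hot)
      AnyHot = true;
    else if (CS.InlineCold)
      Cold.push_back(CS);
  }
  // Hot blocks queue every candidate; cold blocks only the size-worthy ones.
  return AnyHot ? BlockCandidates : Cold;
}
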
+
bool SampleProfileLoader::tryInlineCandidate(
InlineCandidate &Candidate, SmallVector<CallBase *, 8> *InlinedCallSites) {
@@ -1613,428 +1613,428 @@ bool SampleProfileLoader::inlineHotFunctionsWithPriority(
return Changed;
}
-/// Find equivalence classes for the given block.
-///
-/// This finds all the blocks that are guaranteed to execute the same
-/// number of times as \p BB1. To do this, it traverses all the
-/// descendants of \p BB1 in the dominator or post-dominator tree.
-///
-/// A block BB2 will be in the same equivalence class as \p BB1 if
-/// the following holds:
-///
-/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
-/// is a descendant of \p BB1 in the dominator tree, then BB2 should
-/// dominate BB1 in the post-dominator tree.
-///
-/// 2- Both BB2 and \p BB1 must be in the same loop.
-///
-/// For every block BB2 that meets those two requirements, we set BB2's
-/// equivalence class to \p BB1.
-///
-/// \param BB1 Block to check.
-/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
-/// \param DomTree Opposite dominator tree. If \p Descendants is filled
-/// with blocks from \p BB1's dominator tree, then
-/// this is the post-dominator tree, and vice versa.
-template <bool IsPostDom>
-void SampleProfileLoader::findEquivalencesFor(
- BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
- DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
- const BasicBlock *EC = EquivalenceClass[BB1];
- uint64_t Weight = BlockWeights[EC];
- for (const auto *BB2 : Descendants) {
- bool IsDomParent = DomTree->dominates(BB2, BB1);
- bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
- if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
- EquivalenceClass[BB2] = EC;
- // If BB2 is visited, then the entire EC should be marked as visited.
- if (VisitedBlocks.count(BB2)) {
- VisitedBlocks.insert(EC);
- }
-
- // If BB2 is heavier than BB1, make BB2 have the same weight
- // as BB1.
- //
- // Note that we don't worry about the opposite situation here
- // (when BB2 is lighter than BB1). We will deal with this
- // during the propagation phase. Right now, we just want to
- // make sure that BB1 has the largest weight of all the
- // members of its equivalence set.
- Weight = std::max(Weight, BlockWeights[BB2]);
- }
- }
- if (EC == &EC->getParent()->getEntryBlock()) {
- BlockWeights[EC] = Samples->getHeadSamples() + 1;
- } else {
- BlockWeights[EC] = Weight;
- }
-}
-
-/// Find equivalence classes.
-///
-/// Since samples may be missing from blocks, we can fill in the gaps by setting
-/// the weights of all the blocks in the same equivalence class to the same
-/// weight. To compute the concept of equivalence, we use dominance and loop
-/// information. Two blocks B1 and B2 are in the same equivalence class if B1
-/// dominates B2, B2 post-dominates B1 and both are in the same loop.
-///
-/// \param F The function to query.
-void SampleProfileLoader::findEquivalenceClasses(Function &F) {
- SmallVector<BasicBlock *, 8> DominatedBBs;
- LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
- // Find equivalence sets based on dominance and post-dominance information.
- for (auto &BB : F) {
- BasicBlock *BB1 = &BB;
-
- // Compute BB1's equivalence class once.
- if (EquivalenceClass.count(BB1)) {
- LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
- continue;
- }
-
- // By default, blocks are in their own equivalence class.
- EquivalenceClass[BB1] = BB1;
-
- // Traverse all the blocks dominated by BB1. We are looking for
- // every basic block BB2 such that:
- //
- // 1- BB1 dominates BB2.
- // 2- BB2 post-dominates BB1.
- // 3- BB1 and BB2 are in the same loop nest.
- //
- // If all those conditions hold, it means that BB2 is executed
- // as many times as BB1, so they are placed in the same equivalence
- // class by making BB2's equivalence class be BB1.
- DominatedBBs.clear();
- DT->getDescendants(BB1, DominatedBBs);
- findEquivalencesFor(BB1, DominatedBBs, PDT.get());
-
- LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
- }
-
- // Assign weights to equivalence classes.
- //
- // All the basic blocks in the same equivalence class will execute
- // the same number of times. Since we know that the head block in
- // each equivalence class has the largest weight, assign that weight
- // to all the blocks in that equivalence class.
- LLVM_DEBUG(
- dbgs() << "\nAssign the same weight to all blocks in the same class\n");
- for (auto &BI : F) {
- const BasicBlock *BB = &BI;
- const BasicBlock *EquivBB = EquivalenceClass[BB];
- if (BB != EquivBB)
- BlockWeights[BB] = BlockWeights[EquivBB];
- LLVM_DEBUG(printBlockWeight(dbgs(), BB));
- }
-}
-
-/// Visit the given edge to decide if it has a valid weight.
-///
-/// If \p E has not been visited before, we copy to \p UnknownEdge
-/// and increment the count of unknown edges.
-///
-/// \param E Edge to visit.
-/// \param NumUnknownEdges Current number of unknown edges.
-/// \param UnknownEdge Set if E has not been visited before.
-///
-/// \returns E's weight, if known. Otherwise, return 0.
-uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
- Edge *UnknownEdge) {
- if (!VisitedEdges.count(E)) {
- (*NumUnknownEdges)++;
- *UnknownEdge = E;
- return 0;
- }
-
- return EdgeWeights[E];
-}
-
-/// Propagate weights through incoming/outgoing edges.
-///
-/// If the weight of a basic block is known, and there is only one edge
-/// with an unknown weight, we can calculate the weight of that edge.
-///
-/// Similarly, if all the edges have a known count, we can calculate the
-/// count of the basic block, if needed.
-///
-/// \param F Function to process.
-/// \param UpdateBlockCount Whether we should update basic block counts that
-///                         have already been annotated.
-///
-/// \returns True if new weights were assigned to edges or blocks.
-bool SampleProfileLoader::propagateThroughEdges(Function &F,
- bool UpdateBlockCount) {
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
- for (const auto &BI : F) {
- const BasicBlock *BB = &BI;
- const BasicBlock *EC = EquivalenceClass[BB];
-
- // Visit all the predecessor and successor edges to determine
- // which ones have a weight assigned already. Note that it doesn't
- // matter that we only keep track of a single unknown edge. The
- // only case we are interested in handling is when only a single
- // edge is unknown (see setEdgeOrBlockWeight).
- for (unsigned i = 0; i < 2; i++) {
- uint64_t TotalWeight = 0;
- unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
- Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
-
- if (i == 0) {
- // First, visit all predecessor edges.
- NumTotalEdges = Predecessors[BB].size();
- for (auto *Pred : Predecessors[BB]) {
- Edge E = std::make_pair(Pred, BB);
- TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
- if (E.first == E.second)
- SelfReferentialEdge = E;
- }
- if (NumTotalEdges == 1) {
- SingleEdge = std::make_pair(Predecessors[BB][0], BB);
- }
- } else {
- // On the second round, visit all successor edges.
- NumTotalEdges = Successors[BB].size();
- for (auto *Succ : Successors[BB]) {
- Edge E = std::make_pair(BB, Succ);
- TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
- }
- if (NumTotalEdges == 1) {
- SingleEdge = std::make_pair(BB, Successors[BB][0]);
- }
- }
-
- // After visiting all the edges, there are three cases that we
- // can handle immediately:
- //
- // - All the edge weights are known (i.e., NumUnknownEdges == 0).
- // In this case, we simply check that the sum of all the edges
- // is the same as BB's weight. If not, we change BB's weight
- // to match. Additionally, if BB had not been visited before,
- // we mark it visited.
- //
- // - Only one edge is unknown and BB has already been visited.
- // In this case, we can compute the weight of the edge by
- // subtracting the total block weight from all the known
-  //     edge weights. If the known edges weigh more than BB, then the
-  //     weight of the last remaining edge is set to zero.
- //
- // - There exists a self-referential edge and the weight of BB is
- // known. In this case, this edge can be based on BB's weight.
- // We add up all the other known edges and set the weight on
- // the self-referential edge as we did in the previous case.
- //
- // In any other case, we must continue iterating. Eventually,
- // all edges will get a weight, or iteration will stop when
- // it reaches SampleProfileMaxPropagateIterations.
- if (NumUnknownEdges <= 1) {
- uint64_t &BBWeight = BlockWeights[EC];
- if (NumUnknownEdges == 0) {
- if (!VisitedBlocks.count(EC)) {
- // If we already know the weight of all edges, the weight of the
- // basic block can be computed. It should be no larger than the sum
- // of all edge weights.
- if (TotalWeight > BBWeight) {
- BBWeight = TotalWeight;
- Changed = true;
- LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
- << " known. Set weight for block: ";
- printBlockWeight(dbgs(), BB););
- }
- } else if (NumTotalEdges == 1 &&
- EdgeWeights[SingleEdge] < BlockWeights[EC]) {
- // If there is only one edge for the visited basic block, use the
- // block weight to adjust edge weight if edge weight is smaller.
- EdgeWeights[SingleEdge] = BlockWeights[EC];
- Changed = true;
- }
- } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
- // If there is a single unknown edge and the block has been
- // visited, then we can compute E's weight.
- if (BBWeight >= TotalWeight)
- EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
- else
- EdgeWeights[UnknownEdge] = 0;
- const BasicBlock *OtherEC;
- if (i == 0)
- OtherEC = EquivalenceClass[UnknownEdge.first];
- else
- OtherEC = EquivalenceClass[UnknownEdge.second];
-          // An edge weight should never exceed the weights of the BBs it connects.
- if (VisitedBlocks.count(OtherEC) &&
- EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
- EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
- VisitedEdges.insert(UnknownEdge);
- Changed = true;
- LLVM_DEBUG(dbgs() << "Set weight for edge: ";
- printEdgeWeight(dbgs(), UnknownEdge));
- }
- } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
-        // If a block weighs 0, all its in/out edges should weigh 0.
- if (i == 0) {
- for (auto *Pred : Predecessors[BB]) {
- Edge E = std::make_pair(Pred, BB);
- EdgeWeights[E] = 0;
- VisitedEdges.insert(E);
- }
- } else {
- for (auto *Succ : Successors[BB]) {
- Edge E = std::make_pair(BB, Succ);
- EdgeWeights[E] = 0;
- VisitedEdges.insert(E);
- }
- }
- } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
- uint64_t &BBWeight = BlockWeights[BB];
- // We have a self-referential edge and the weight of BB is known.
- if (BBWeight >= TotalWeight)
- EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
- else
- EdgeWeights[SelfReferentialEdge] = 0;
- VisitedEdges.insert(SelfReferentialEdge);
- Changed = true;
- LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
- printEdgeWeight(dbgs(), SelfReferentialEdge));
- }
- if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
- BlockWeights[EC] = TotalWeight;
- VisitedBlocks.insert(EC);
- Changed = true;
- }
- }
- }
-
- return Changed;
-}
-
-/// Build in/out edge lists for each basic block in the CFG.
-///
-/// We are interested in unique edges. If a block B1 has multiple
-/// edges to another block B2, we only add a single B1->B2 edge.
-void SampleProfileLoader::buildEdges(Function &F) {
- for (auto &BI : F) {
- BasicBlock *B1 = &BI;
-
- // Add predecessors for B1.
- SmallPtrSet<BasicBlock *, 16> Visited;
- if (!Predecessors[B1].empty())
- llvm_unreachable("Found a stale predecessors list in a basic block.");
- for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
- BasicBlock *B2 = *PI;
- if (Visited.insert(B2).second)
- Predecessors[B1].push_back(B2);
- }
-
- // Add successors for B1.
- Visited.clear();
- if (!Successors[B1].empty())
- llvm_unreachable("Found a stale successors list in a basic block.");
- for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
- BasicBlock *B2 = *SI;
- if (Visited.insert(B2).second)
- Successors[B1].push_back(B2);
- }
- }
-}
-
-/// Returns the CallTargetMap \p M sorted by count in descending order.
-static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(
- const SampleRecord::CallTargetMap & M) {
- SmallVector<InstrProfValueData, 2> R;
- for (const auto &I : SampleRecord::SortCallTargets(M)) {
- R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
- }
- return R;
-}
-
-/// Propagate weights into edges
-///
-/// The following rules are applied to every block BB in the CFG:
-///
-/// - If BB has a single predecessor/successor, then the weight
-/// of that edge is the weight of the block.
-///
-/// - If all incoming or outgoing edges are known except one, and the
-/// weight of the block is already known, the weight of the unknown
-/// edge will be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than BB's weight,
-/// we set the unknown edge weight to zero.
-///
-/// - If there is a self-referential edge, and the weight of the block is
-/// known, the weight for that edge is set to the weight of the block
-/// minus the weight of the other incoming edges to that block (if
-/// known).
-void SampleProfileLoader::propagateWeights(Function &F) {
- bool Changed = true;
- unsigned I = 0;
-
- // If BB weight is larger than its corresponding loop's header BB weight,
- // use the BB weight to replace the loop header BB weight.
- for (auto &BI : F) {
- BasicBlock *BB = &BI;
- Loop *L = LI->getLoopFor(BB);
- if (!L) {
- continue;
- }
- BasicBlock *Header = L->getHeader();
- if (Header && BlockWeights[BB] > BlockWeights[Header]) {
- BlockWeights[Header] = BlockWeights[BB];
- }
- }
-
- // Before propagation starts, build, for each block, a list of
- // unique predecessors and successors. This is necessary to handle
- // identical edges in multiway branches. Since we visit all blocks and all
- // edges of the CFG, it is cleaner to build these lists once at the start
- // of the pass.
- buildEdges(F);
-
- // Propagate until we converge or we go past the iteration limit.
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
- }
-
- // The first propagation propagates BB counts from annotated BBs to unknown
-  // BBs. The 2nd propagation pass resets edge weights, and uses all BB weights
- // to propagate edge weights.
- VisitedEdges.clear();
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, false);
- }
-
-  // The 3rd propagation pass allows adjusting annotated BB weights that are
- // obviously wrong.
- Changed = true;
- while (Changed && I++ < SampleProfileMaxPropagateIterations) {
- Changed = propagateThroughEdges(F, true);
- }
-
- // Generate MD_prof metadata for every branch instruction using the
- // edge weights computed during propagation.
- LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
- LLVMContext &Ctx = F.getContext();
- MDBuilder MDB(Ctx);
- for (auto &BI : F) {
- BasicBlock *BB = &BI;
-
- if (BlockWeights[BB]) {
- for (auto &I : BB->getInstList()) {
- if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
- continue;
- if (!cast<CallBase>(I).getCalledFunction()) {
- const DebugLoc &DLoc = I.getDebugLoc();
- if (!DLoc)
- continue;
- const DILocation *DIL = DLoc;
- const FunctionSamples *FS = findFunctionSamples(I);
- if (!FS)
- continue;
+/// Find equivalence classes for the given block.
+///
+/// This finds all the blocks that are guaranteed to execute the same
+/// number of times as \p BB1. To do this, it traverses all the
+/// descendants of \p BB1 in the dominator or post-dominator tree.
+///
+/// A block BB2 will be in the same equivalence class as \p BB1 if
+/// the following holds:
+///
+/// 1- \p BB1 is a descendant of BB2 in the opposite tree. So, if BB2
+/// is a descendant of \p BB1 in the dominator tree, then BB2 should
+/// dominate BB1 in the post-dominator tree.
+///
+/// 2- Both BB2 and \p BB1 must be in the same loop.
+///
+/// For every block BB2 that meets those two requirements, we set BB2's
+/// equivalence class to \p BB1.
+///
+/// \param BB1 Block to check.
+/// \param Descendants Descendants of \p BB1 in either the dom or pdom tree.
+/// \param DomTree Opposite dominator tree. If \p Descendants is filled
+/// with blocks from \p BB1's dominator tree, then
+/// this is the post-dominator tree, and vice versa.
+template <bool IsPostDom>
+void SampleProfileLoader::findEquivalencesFor(
+ BasicBlock *BB1, ArrayRef<BasicBlock *> Descendants,
+ DominatorTreeBase<BasicBlock, IsPostDom> *DomTree) {
+ const BasicBlock *EC = EquivalenceClass[BB1];
+ uint64_t Weight = BlockWeights[EC];
+ for (const auto *BB2 : Descendants) {
+ bool IsDomParent = DomTree->dominates(BB2, BB1);
+ bool IsInSameLoop = LI->getLoopFor(BB1) == LI->getLoopFor(BB2);
+ if (BB1 != BB2 && IsDomParent && IsInSameLoop) {
+ EquivalenceClass[BB2] = EC;
+ // If BB2 is visited, then the entire EC should be marked as visited.
+ if (VisitedBlocks.count(BB2)) {
+ VisitedBlocks.insert(EC);
+ }
+
+ // If BB2 is heavier than BB1, make BB2 have the same weight
+ // as BB1.
+ //
+ // Note that we don't worry about the opposite situation here
+ // (when BB2 is lighter than BB1). We will deal with this
+ // during the propagation phase. Right now, we just want to
+ // make sure that BB1 has the largest weight of all the
+ // members of its equivalence set.
+ Weight = std::max(Weight, BlockWeights[BB2]);
+ }
+ }
+ if (EC == &EC->getParent()->getEntryBlock()) {
+ BlockWeights[EC] = Samples->getHeadSamples() + 1;
+ } else {
+ BlockWeights[EC] = Weight;
+ }
+}
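
The membership test behind the equivalence classes, stated in isolation: BB2 joins BB1's class when BB1 dominates BB2, BB2 post-dominates BB1, and both sit in the same loop. The toy representation below (string block names, relation sets, loop ids) is an assumption made for the sketch, not how the pass stores this information.

#include <map>
#include <set>
#include <string>
#include <utility>

struct ToyCFGInfo {
  std::set<std::pair<std::string, std::string>> Dominates;     // (A, B): A dominates B
  std::set<std::pair<std::string, std::string>> PostDominates; // (A, B): A post-dominates B
  std::map<std::string, int> LoopId;                           // innermost loop per block
};

bool toySameEquivalenceClass(const ToyCFGInfo &CFG, const std::string &BB1,
                             const std::string &BB2) {
  return CFG.Dominates.count({BB1, BB2}) &&
         CFG.PostDominates.count({BB2, BB1}) &&
         CFG.LoopId.at(BB1) == CFG.LoopId.at(BB2);
}
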
+
+/// Find equivalence classes.
+///
+/// Since samples may be missing from blocks, we can fill in the gaps by setting
+/// the weights of all the blocks in the same equivalence class to the same
+/// weight. To compute the concept of equivalence, we use dominance and loop
+/// information. Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// \param F The function to query.
+void SampleProfileLoader::findEquivalenceClasses(Function &F) {
+ SmallVector<BasicBlock *, 8> DominatedBBs;
+ LLVM_DEBUG(dbgs() << "\nBlock equivalence classes\n");
+ // Find equivalence sets based on dominance and post-dominance information.
+ for (auto &BB : F) {
+ BasicBlock *BB1 = &BB;
+
+ // Compute BB1's equivalence class once.
+ if (EquivalenceClass.count(BB1)) {
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ continue;
+ }
+
+ // By default, blocks are in their own equivalence class.
+ EquivalenceClass[BB1] = BB1;
+
+ // Traverse all the blocks dominated by BB1. We are looking for
+ // every basic block BB2 such that:
+ //
+ // 1- BB1 dominates BB2.
+ // 2- BB2 post-dominates BB1.
+ // 3- BB1 and BB2 are in the same loop nest.
+ //
+ // If all those conditions hold, it means that BB2 is executed
+ // as many times as BB1, so they are placed in the same equivalence
+ // class by making BB2's equivalence class be BB1.
+ DominatedBBs.clear();
+ DT->getDescendants(BB1, DominatedBBs);
+ findEquivalencesFor(BB1, DominatedBBs, PDT.get());
+
+ LLVM_DEBUG(printBlockEquivalence(dbgs(), BB1));
+ }
+
+ // Assign weights to equivalence classes.
+ //
+ // All the basic blocks in the same equivalence class will execute
+ // the same number of times. Since we know that the head block in
+ // each equivalence class has the largest weight, assign that weight
+ // to all the blocks in that equivalence class.
+ LLVM_DEBUG(
+ dbgs() << "\nAssign the same weight to all blocks in the same class\n");
+ for (auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EquivBB = EquivalenceClass[BB];
+ if (BB != EquivBB)
+ BlockWeights[BB] = BlockWeights[EquivBB];
+ LLVM_DEBUG(printBlockWeight(dbgs(), BB));
+ }
+}
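
Once classes are formed, every member simply inherits the weight of its class head. A standalone restatement with hypothetical string keys (the map types here are an assumption for the sketch):

#include <cstdint>
#include <map>
#include <string>

// ClassOf maps each block to the head of its equivalence class; after the
// loop above, every member takes the head's (maximal) weight.
void toyAssignClassWeights(const std::map<std::string, std::string> &ClassOf,
                           std::map<std::string, uint64_t> &Weights) {
  for (const auto &Entry : ClassOf)
    Weights[Entry.first] = Weights[Entry.second];
}
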
+
+/// Visit the given edge to decide if it has a valid weight.
+///
+/// If \p E has not been visited before, we copy to \p UnknownEdge
+/// and increment the count of unknown edges.
+///
+/// \param E Edge to visit.
+/// \param NumUnknownEdges Current number of unknown edges.
+/// \param UnknownEdge Set if E has not been visited before.
+///
+/// \returns E's weight, if known. Otherwise, return 0.
+uint64_t SampleProfileLoader::visitEdge(Edge E, unsigned *NumUnknownEdges,
+ Edge *UnknownEdge) {
+ if (!VisitedEdges.count(E)) {
+ (*NumUnknownEdges)++;
+ *UnknownEdge = E;
+ return 0;
+ }
+
+ return EdgeWeights[E];
+}
+
+/// Propagate weights through incoming/outgoing edges.
+///
+/// If the weight of a basic block is known, and there is only one edge
+/// with an unknown weight, we can calculate the weight of that edge.
+///
+/// Similarly, if all the edges have a known count, we can calculate the
+/// count of the basic block, if needed.
+///
+/// \param F Function to process.
+/// \param UpdateBlockCount Whether we should update basic block counts that
+///                         have already been annotated.
+///
+/// \returns True if new weights were assigned to edges or blocks.
+bool SampleProfileLoader::propagateThroughEdges(Function &F,
+ bool UpdateBlockCount) {
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "\nPropagation through edges\n");
+ for (const auto &BI : F) {
+ const BasicBlock *BB = &BI;
+ const BasicBlock *EC = EquivalenceClass[BB];
+
+ // Visit all the predecessor and successor edges to determine
+ // which ones have a weight assigned already. Note that it doesn't
+ // matter that we only keep track of a single unknown edge. The
+ // only case we are interested in handling is when only a single
+ // edge is unknown (see setEdgeOrBlockWeight).
+ for (unsigned i = 0; i < 2; i++) {
+ uint64_t TotalWeight = 0;
+ unsigned NumUnknownEdges = 0, NumTotalEdges = 0;
+ Edge UnknownEdge, SelfReferentialEdge, SingleEdge;
+
+ if (i == 0) {
+ // First, visit all predecessor edges.
+ NumTotalEdges = Predecessors[BB].size();
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ if (E.first == E.second)
+ SelfReferentialEdge = E;
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(Predecessors[BB][0], BB);
+ }
+ } else {
+ // On the second round, visit all successor edges.
+ NumTotalEdges = Successors[BB].size();
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ TotalWeight += visitEdge(E, &NumUnknownEdges, &UnknownEdge);
+ }
+ if (NumTotalEdges == 1) {
+ SingleEdge = std::make_pair(BB, Successors[BB][0]);
+ }
+ }
+
+ // After visiting all the edges, there are three cases that we
+ // can handle immediately:
+ //
+ // - All the edge weights are known (i.e., NumUnknownEdges == 0).
+ // In this case, we simply check that the sum of all the edges
+ // is the same as BB's weight. If not, we change BB's weight
+ // to match. Additionally, if BB had not been visited before,
+ // we mark it visited.
+ //
+ // - Only one edge is unknown and BB has already been visited.
+ // In this case, we can compute the weight of the edge by
+ // subtracting the total block weight from all the known
+  //     edge weights. If the known edges weigh more than BB, then the
+  //     weight of the last remaining edge is set to zero.
+ //
+ // - There exists a self-referential edge and the weight of BB is
+ // known. In this case, this edge can be based on BB's weight.
+ // We add up all the other known edges and set the weight on
+ // the self-referential edge as we did in the previous case.
+ //
+ // In any other case, we must continue iterating. Eventually,
+ // all edges will get a weight, or iteration will stop when
+ // it reaches SampleProfileMaxPropagateIterations.
+ if (NumUnknownEdges <= 1) {
+ uint64_t &BBWeight = BlockWeights[EC];
+ if (NumUnknownEdges == 0) {
+ if (!VisitedBlocks.count(EC)) {
+ // If we already know the weight of all edges, the weight of the
+ // basic block can be computed. It should be no larger than the sum
+ // of all edge weights.
+ if (TotalWeight > BBWeight) {
+ BBWeight = TotalWeight;
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "All edge weights for " << BB->getName()
+ << " known. Set weight for block: ";
+ printBlockWeight(dbgs(), BB););
+ }
+ } else if (NumTotalEdges == 1 &&
+ EdgeWeights[SingleEdge] < BlockWeights[EC]) {
+ // If there is only one edge for the visited basic block, use the
+ // block weight to adjust edge weight if edge weight is smaller.
+ EdgeWeights[SingleEdge] = BlockWeights[EC];
+ Changed = true;
+ }
+ } else if (NumUnknownEdges == 1 && VisitedBlocks.count(EC)) {
+ // If there is a single unknown edge and the block has been
+ // visited, then we can compute E's weight.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[UnknownEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[UnknownEdge] = 0;
+ const BasicBlock *OtherEC;
+ if (i == 0)
+ OtherEC = EquivalenceClass[UnknownEdge.first];
+ else
+ OtherEC = EquivalenceClass[UnknownEdge.second];
+          // An edge weight should never exceed the weights of the BBs it connects.
+ if (VisitedBlocks.count(OtherEC) &&
+ EdgeWeights[UnknownEdge] > BlockWeights[OtherEC])
+ EdgeWeights[UnknownEdge] = BlockWeights[OtherEC];
+ VisitedEdges.insert(UnknownEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set weight for edge: ";
+ printEdgeWeight(dbgs(), UnknownEdge));
+ }
+ } else if (VisitedBlocks.count(EC) && BlockWeights[EC] == 0) {
+        // If a block weighs 0, all its in/out edges should weigh 0.
+ if (i == 0) {
+ for (auto *Pred : Predecessors[BB]) {
+ Edge E = std::make_pair(Pred, BB);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ } else {
+ for (auto *Succ : Successors[BB]) {
+ Edge E = std::make_pair(BB, Succ);
+ EdgeWeights[E] = 0;
+ VisitedEdges.insert(E);
+ }
+ }
+ } else if (SelfReferentialEdge.first && VisitedBlocks.count(EC)) {
+ uint64_t &BBWeight = BlockWeights[BB];
+ // We have a self-referential edge and the weight of BB is known.
+ if (BBWeight >= TotalWeight)
+ EdgeWeights[SelfReferentialEdge] = BBWeight - TotalWeight;
+ else
+ EdgeWeights[SelfReferentialEdge] = 0;
+ VisitedEdges.insert(SelfReferentialEdge);
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Set self-referential edge weight to: ";
+ printEdgeWeight(dbgs(), SelfReferentialEdge));
+ }
+ if (UpdateBlockCount && !VisitedBlocks.count(EC) && TotalWeight > 0) {
+ BlockWeights[EC] = TotalWeight;
+ VisitedBlocks.insert(EC);
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
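
The central arithmetic of the single-unknown-edge case, isolated as a small example: with the block weight known and all but one edge known, the missing edge receives the remainder, clamped at zero. The helper name is hypothetical.

#include <cstdint>
#include <vector>

uint64_t toySolveUnknownEdge(uint64_t BlockWeight,
                             const std::vector<uint64_t> &KnownEdgeWeights) {
  uint64_t Known = 0;
  for (uint64_t W : KnownEdgeWeights)
    Known += W;
  // Clamp at zero when the known edges already exceed the block weight.
  return BlockWeight >= Known ? BlockWeight - Known : 0;
}

// Example: BlockWeight = 100 with known incoming edges {60, 25} leaves 15
// for the single unknown edge; {70, 50} would clamp it to 0.
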
+
+/// Build in/out edge lists for each basic block in the CFG.
+///
+/// We are interested in unique edges. If a block B1 has multiple
+/// edges to another block B2, we only add a single B1->B2 edge.
+void SampleProfileLoader::buildEdges(Function &F) {
+ for (auto &BI : F) {
+ BasicBlock *B1 = &BI;
+
+ // Add predecessors for B1.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ if (!Predecessors[B1].empty())
+ llvm_unreachable("Found a stale predecessors list in a basic block.");
+ for (pred_iterator PI = pred_begin(B1), PE = pred_end(B1); PI != PE; ++PI) {
+ BasicBlock *B2 = *PI;
+ if (Visited.insert(B2).second)
+ Predecessors[B1].push_back(B2);
+ }
+
+ // Add successors for B1.
+ Visited.clear();
+ if (!Successors[B1].empty())
+ llvm_unreachable("Found a stale successors list in a basic block.");
+ for (succ_iterator SI = succ_begin(B1), SE = succ_end(B1); SI != SE; ++SI) {
+ BasicBlock *B2 = *SI;
+ if (Visited.insert(B2).second)
+ Successors[B1].push_back(B2);
+ }
+ }
+}
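
Deduplication of multiway-branch edges is the only subtlety in buildEdges; a minimal standalone equivalent using an insert-returns-bool visited set (names are illustrative):

#include <set>
#include <string>
#include <vector>

// Keep only the first occurrence of each target block, so two switch arms
// jumping to the same block contribute a single CFG edge.
std::vector<std::string>
toyUniqueTargets(const std::vector<std::string> &RawTargets) {
  std::set<std::string> Visited;
  std::vector<std::string> Unique;
  for (const auto &B : RawTargets)
    if (Visited.insert(B).second)
      Unique.push_back(B);
  return Unique;
}
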
+
+/// Returns the CallTargetMap \p M sorted by count in descending order.
+static SmallVector<InstrProfValueData, 2> GetSortedValueDataFromCallTargets(
+ const SampleRecord::CallTargetMap & M) {
+ SmallVector<InstrProfValueData, 2> R;
+ for (const auto &I : SampleRecord::SortCallTargets(M)) {
+ R.emplace_back(InstrProfValueData{FunctionSamples::getGUID(I.first), I.second});
+ }
+ return R;
+}
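
The same descending-by-count ordering, shown on plain (GUID, count) pairs; InstrProfValueData and the GUID hashing are replaced here by assumed simple types.

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using ToyValueData = std::pair<uint64_t, uint64_t>; // (callee GUID, count)

std::vector<ToyValueData>
toySortCallTargets(std::vector<ToyValueData> Targets) {
  std::sort(Targets.begin(), Targets.end(),
            [](const ToyValueData &L, const ToyValueData &R) {
              return L.second > R.second; // larger counts first
            });
  return Targets;
}
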
+
+/// Propagate weights into edges
+///
+/// The following rules are applied to every block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all incoming or outgoing edges are known except one, and the
+/// weight of the block is already known, the weight of the unknown
+/// edge will be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+void SampleProfileLoader::propagateWeights(Function &F) {
+ bool Changed = true;
+ unsigned I = 0;
+
+ // If BB weight is larger than its corresponding loop's header BB weight,
+ // use the BB weight to replace the loop header BB weight.
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+ Loop *L = LI->getLoopFor(BB);
+ if (!L) {
+ continue;
+ }
+ BasicBlock *Header = L->getHeader();
+ if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+ BlockWeights[Header] = BlockWeights[BB];
+ }
+ }
+
+ // Before propagation starts, build, for each block, a list of
+ // unique predecessors and successors. This is necessary to handle
+ // identical edges in multiway branches. Since we visit all blocks and all
+ // edges of the CFG, it is cleaner to build these lists once at the start
+ // of the pass.
+ buildEdges(F);
+
+ // Propagate until we converge or we go past the iteration limit.
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+ // The first propagation propagates BB counts from annotated BBs to unknown
+  // BBs. The 2nd propagation pass resets edge weights, and uses all BB weights
+ // to propagate edge weights.
+ VisitedEdges.clear();
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, false);
+ }
+
+  // The 3rd propagation pass allows adjusting annotated BB weights that are
+ // obviously wrong.
+ Changed = true;
+ while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+ Changed = propagateThroughEdges(F, true);
+ }
+
+ // Generate MD_prof metadata for every branch instruction using the
+ // edge weights computed during propagation.
+ LLVM_DEBUG(dbgs() << "\nPropagation complete. Setting branch weights\n");
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder MDB(Ctx);
+ for (auto &BI : F) {
+ BasicBlock *BB = &BI;
+
+ if (BlockWeights[BB]) {
+ for (auto &I : BB->getInstList()) {
+ if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
+ continue;
+ if (!cast<CallBase>(I).getCalledFunction()) {
+ const DebugLoc &DLoc = I.getDebugLoc();
+ if (!DLoc)
+ continue;
+ const DILocation *DIL = DLoc;
+ const FunctionSamples *FS = findFunctionSamples(I);
+ if (!FS)
+ continue;
auto CallSite = FunctionSamples::getCallSiteIdentifier(DIL);
auto T = FS->findCallTargetMapAt(CallSite);
- if (!T || T.get().empty())
- continue;
+ if (!T || T.get().empty())
+ continue;
// Prorate the callsite counts to reflect what is already done to the
          // callsite, such as ICP or callsite cloning.
if (FunctionSamples::ProfileIsProbeBased) {
@@ -2043,167 +2043,167 @@ void SampleProfileLoader::propagateWeights(Function &F) {
T = SampleRecord::adjustCallTargets(T.get(), Probe->Factor);
}
}
- SmallVector<InstrProfValueData, 2> SortedCallTargets =
- GetSortedValueDataFromCallTargets(T.get());
- uint64_t Sum;
- findIndirectCallFunctionSamples(I, Sum);
- annotateValueSite(*I.getParent()->getParent()->getParent(), I,
- SortedCallTargets, Sum, IPVK_IndirectCallTarget,
- SortedCallTargets.size());
- } else if (!isa<IntrinsicInst>(&I)) {
- I.setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(
- {static_cast<uint32_t>(BlockWeights[BB])}));
- }
- }
- }
- Instruction *TI = BB->getTerminator();
- if (TI->getNumSuccessors() == 1)
- continue;
- if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
- continue;
-
- DebugLoc BranchLoc = TI->getDebugLoc();
- LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
- << ((BranchLoc) ? Twine(BranchLoc.getLine())
- : Twine("<UNKNOWN LOCATION>"))
- << ".\n");
- SmallVector<uint32_t, 4> Weights;
- uint32_t MaxWeight = 0;
- Instruction *MaxDestInst;
- for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
- BasicBlock *Succ = TI->getSuccessor(I);
- Edge E = std::make_pair(BB, Succ);
- uint64_t Weight = EdgeWeights[E];
- LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
- // Use uint32_t saturated arithmetic to adjust the incoming weights,
- // if needed. Sample counts in profiles are 64-bit unsigned values,
- // but internally branch weights are expressed as 32-bit values.
- if (Weight > std::numeric_limits<uint32_t>::max()) {
- LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
- Weight = std::numeric_limits<uint32_t>::max();
- }
-      // Weight is incremented by one to avoid propagation errors introduced by
- // 0 weights.
- Weights.push_back(static_cast<uint32_t>(Weight + 1));
- if (Weight != 0) {
- if (Weight > MaxWeight) {
- MaxWeight = Weight;
- MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
- }
- }
- }
-
- uint64_t TempWeight;
- // Only set weights if there is at least one non-zero weight.
- // In any other case, let the analyzer set weights.
- // Do not set weights if the weights are present. In ThinLTO, the profile
- // annotation is done twice. If the first annotation already set the
- // weights, the second pass does not need to set it.
- if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
- LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
- TI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(Weights));
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
- << "most popular destination for conditional branches at "
- << ore::NV("CondBranchesLoc", BranchLoc);
- });
- } else {
- LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
- }
- }
-}
-
-/// Get the line number for the function header.
-///
-/// This looks up function \p F in the current compilation unit and
-/// retrieves the line number where the function is defined. This is
-/// line 0 for all the samples read from the profile file. Every line
-/// number is relative to this line.
-///
-/// \param F Function object to query.
-///
-/// \returns the line number where \p F is defined. If it returns 0,
-/// it means that there is no debug information available for \p F.
-unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
- if (DISubprogram *S = F.getSubprogram())
- return S->getLine();
-
- if (NoWarnSampleUnused)
- return 0;
-
- // If the start of \p F is missing, emit a diagnostic to inform the user
- // about the missed opportunity.
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- "No debug information found in function " + F.getName() +
- ": Function profile not used",
- DS_Warning));
- return 0;
-}
-
-void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
- DT.reset(new DominatorTree);
- DT->recalculate(F);
-
- PDT.reset(new PostDominatorTree(F));
-
- LI.reset(new LoopInfo);
- LI->analyze(*DT);
-}
-
-/// Generate branch weight metadata for all branches in \p F.
-///
-/// Branch weights are computed out of instruction samples using a
-/// propagation heuristic. Propagation proceeds in 3 phases:
-///
-/// 1- Assignment of block weights. All the basic blocks in the function
-/// are initial assigned the same weight as their most frequently
-/// executed instruction.
-///
-/// 2- Creation of equivalence classes. Since samples may be missing from
-/// blocks, we can fill in the gaps by setting the weights of all the
-/// blocks in the same equivalence class to the same weight. To compute
-/// the concept of equivalence, we use dominance and loop information.
-/// Two blocks B1 and B2 are in the same equivalence class if B1
-/// dominates B2, B2 post-dominates B1 and both are in the same loop.
-///
-/// 3- Propagation of block weights into edges. This uses a simple
-/// propagation heuristic. The following rules are applied to every
-/// block BB in the CFG:
-///
-/// - If BB has a single predecessor/successor, then the weight
-/// of that edge is the weight of the block.
-///
-/// - If all the edges are known except one, and the weight of the
-/// block is already known, the weight of the unknown edge will
-/// be the weight of the block minus the sum of all the known
-/// edges. If the sum of all the known edges is larger than BB's weight,
-/// we set the unknown edge weight to zero.
-///
-/// - If there is a self-referential edge, and the weight of the block is
-/// known, the weight for that edge is set to the weight of the block
-/// minus the weight of the other incoming edges to that block (if
-/// known).
-///
-/// Since this propagation is not guaranteed to finalize for every CFG, we
-/// only allow it to proceed for a limited number of iterations (controlled
-/// by -sample-profile-max-propagate-iterations).
-///
-/// FIXME: Try to replace this propagation heuristic with a scheme
-/// that is guaranteed to finalize. A work-list approach similar to
-/// the standard value propagation algorithm used by SSA-CCP might
-/// work here.
-///
-/// Once all the branch weights are computed, we emit the MD_prof
-/// metadata on BB using the computed values for each of its branches.
-///
-/// \param F The function to query.
-///
-/// \returns true if \p F was modified. Returns false, otherwise.
-bool SampleProfileLoader::emitAnnotations(Function &F) {
- bool Changed = false;
-
+ SmallVector<InstrProfValueData, 2> SortedCallTargets =
+ GetSortedValueDataFromCallTargets(T.get());
+ uint64_t Sum;
+ findIndirectCallFunctionSamples(I, Sum);
+ annotateValueSite(*I.getParent()->getParent()->getParent(), I,
+ SortedCallTargets, Sum, IPVK_IndirectCallTarget,
+ SortedCallTargets.size());
+ } else if (!isa<IntrinsicInst>(&I)) {
+ I.setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(
+ {static_cast<uint32_t>(BlockWeights[BB])}));
+ }
+ }
+ }
+ Instruction *TI = BB->getTerminator();
+ if (TI->getNumSuccessors() == 1)
+ continue;
+ if (!isa<BranchInst>(TI) && !isa<SwitchInst>(TI))
+ continue;
+
+ DebugLoc BranchLoc = TI->getDebugLoc();
+ LLVM_DEBUG(dbgs() << "\nGetting weights for branch at line "
+ << ((BranchLoc) ? Twine(BranchLoc.getLine())
+ : Twine("<UNKNOWN LOCATION>"))
+ << ".\n");
+ SmallVector<uint32_t, 4> Weights;
+ uint32_t MaxWeight = 0;
+ Instruction *MaxDestInst;
+ for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ Edge E = std::make_pair(BB, Succ);
+ uint64_t Weight = EdgeWeights[E];
+ LLVM_DEBUG(dbgs() << "\t"; printEdgeWeight(dbgs(), E));
+ // Use uint32_t saturated arithmetic to adjust the incoming weights,
+ // if needed. Sample counts in profiles are 64-bit unsigned values,
+ // but internally branch weights are expressed as 32-bit values.
+ if (Weight > std::numeric_limits<uint32_t>::max()) {
+ LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
+ Weight = std::numeric_limits<uint32_t>::max();
+ }
+      // Weight is incremented by one to avoid propagation errors introduced by
+ // 0 weights.
+ Weights.push_back(static_cast<uint32_t>(Weight + 1));
+ if (Weight != 0) {
+ if (Weight > MaxWeight) {
+ MaxWeight = Weight;
+ MaxDestInst = Succ->getFirstNonPHIOrDbgOrLifetime();
+ }
+ }
+ }
+
+ uint64_t TempWeight;
+ // Only set weights if there is at least one non-zero weight.
+ // In any other case, let the analyzer set weights.
+ // Do not set weights if the weights are present. In ThinLTO, the profile
+ // annotation is done twice. If the first annotation already set the
+ // weights, the second pass does not need to set it.
+ if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
+ LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
+ TI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(Weights));
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
+ << "most popular destination for conditional branches at "
+ << ore::NV("CondBranchesLoc", BranchLoc);
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << "SKIPPED. All branch weights are zero.\n");
+ }
+ }
+}
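
The branch-weight emission above narrows 64-bit sample counts into 32-bit MD_prof weights and offsets each by one so that zero-count edges stay representable. Below is a self-contained sketch of that conversion; it caps one below UINT32_MAX so the +1 offset cannot wrap, which is a simplification rather than a copy of the pass's exact saturation.

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

std::vector<uint32_t>
toyBranchWeights(const std::vector<uint64_t> &EdgeCounts) {
  std::vector<uint32_t> Weights;
  Weights.reserve(EdgeCounts.size());
  for (uint64_t W : EdgeCounts) {
    // Saturate into the 32-bit range, leaving room for the +1 offset.
    uint64_t Capped =
        std::min<uint64_t>(W, std::numeric_limits<uint32_t>::max() - 1);
    Weights.push_back(static_cast<uint32_t>(Capped + 1));
  }
  return Weights;
}
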
+
+/// Get the line number for the function header.
+///
+/// This looks up function \p F in the current compilation unit and
+/// retrieves the line number where the function is defined. This is
+/// line 0 for all the samples read from the profile file. Every line
+/// number is relative to this line.
+///
+/// \param F Function object to query.
+///
+/// \returns the line number where \p F is defined. If it returns 0,
+/// it means that there is no debug information available for \p F.
+unsigned SampleProfileLoader::getFunctionLoc(Function &F) {
+ if (DISubprogram *S = F.getSubprogram())
+ return S->getLine();
+
+ if (NoWarnSampleUnused)
+ return 0;
+
+ // If the start of \p F is missing, emit a diagnostic to inform the user
+ // about the missed opportunity.
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ "No debug information found in function " + F.getName() +
+ ": Function profile not used",
+ DS_Warning));
+ return 0;
+}
+
+void SampleProfileLoader::computeDominanceAndLoopInfo(Function &F) {
+ DT.reset(new DominatorTree);
+ DT->recalculate(F);
+
+ PDT.reset(new PostDominatorTree(F));
+
+ LI.reset(new LoopInfo);
+ LI->analyze(*DT);
+}
+
+/// Generate branch weight metadata for all branches in \p F.
+///
+/// Branch weights are computed out of instruction samples using a
+/// propagation heuristic. Propagation proceeds in 3 phases:
+///
+/// 1- Assignment of block weights. All the basic blocks in the function
+///    are initially assigned the same weight as their most frequently
+/// executed instruction.
+///
+/// 2- Creation of equivalence classes. Since samples may be missing from
+/// blocks, we can fill in the gaps by setting the weights of all the
+/// blocks in the same equivalence class to the same weight. To compute
+/// the concept of equivalence, we use dominance and loop information.
+/// Two blocks B1 and B2 are in the same equivalence class if B1
+/// dominates B2, B2 post-dominates B1 and both are in the same loop.
+///
+/// 3- Propagation of block weights into edges. This uses a simple
+/// propagation heuristic. The following rules are applied to every
+/// block BB in the CFG:
+///
+/// - If BB has a single predecessor/successor, then the weight
+/// of that edge is the weight of the block.
+///
+/// - If all the edges are known except one, and the weight of the
+/// block is already known, the weight of the unknown edge will
+/// be the weight of the block minus the sum of all the known
+/// edges. If the sum of all the known edges is larger than BB's weight,
+/// we set the unknown edge weight to zero.
+///
+/// - If there is a self-referential edge, and the weight of the block is
+/// known, the weight for that edge is set to the weight of the block
+/// minus the weight of the other incoming edges to that block (if
+/// known).
+///
+/// Since this propagation is not guaranteed to finalize for every CFG, we
+/// only allow it to proceed for a limited number of iterations (controlled
+/// by -sample-profile-max-propagate-iterations).
+///
+/// FIXME: Try to replace this propagation heuristic with a scheme
+/// that is guaranteed to finalize. A work-list approach similar to
+/// the standard value propagation algorithm used by SSA-CCP might
+/// work here.
+///
+/// Once all the branch weights are computed, we emit the MD_prof
+/// metadata on BB using the computed values for each of its branches.
+///
+/// \param F The function to query.
+///
+/// \returns true if \p F was modified. Returns false, otherwise.
+bool SampleProfileLoader::emitAnnotations(Function &F) {
+ bool Changed = false;
+
if (FunctionSamples::ProfileIsProbeBased) {
if (!ProbeManager->profileIsValid(F, *Samples)) {
LLVM_DEBUG(
@@ -2216,80 +2216,80 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
} else {
if (getFunctionLoc(F) == 0)
return false;
-
+
LLVM_DEBUG(dbgs() << "Line number for the first instruction in "
<< F.getName() << ": " << getFunctionLoc(F) << "\n");
}
-
- DenseSet<GlobalValue::GUID> InlinedGUIDs;
+
+ DenseSet<GlobalValue::GUID> InlinedGUIDs;
if (ProfileIsCS && CallsitePrioritizedInline)
Changed |= inlineHotFunctionsWithPriority(F, InlinedGUIDs);
else
Changed |= inlineHotFunctions(F, InlinedGUIDs);
-
- // Compute basic block weights.
- Changed |= computeBlockWeights(F);
-
- if (Changed) {
- // Add an entry count to the function using the samples gathered at the
- // function entry.
- // Sets the GUIDs that are inlined in the profiled binary. This is used
- // for ThinLink to make correct liveness analysis, and also make the IR
- // match the profiled binary before annotation.
- F.setEntryCount(
- ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
- &InlinedGUIDs);
-
- // Compute dominance and loop info needed for propagation.
- computeDominanceAndLoopInfo(F);
-
- // Find equivalence classes.
- findEquivalenceClasses(F);
-
- // Propagate weights to all edges.
- propagateWeights(F);
- }
-
- // If coverage checking was requested, compute it now.
- if (SampleProfileRecordCoverage) {
- unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
- unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
- unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
- if (Coverage < SampleProfileRecordCoverage) {
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- F.getSubprogram()->getFilename(), getFunctionLoc(F),
- Twine(Used) + " of " + Twine(Total) + " available profile records (" +
- Twine(Coverage) + "%) were applied",
- DS_Warning));
- }
- }
-
- if (SampleProfileSampleCoverage) {
- uint64_t Used = CoverageTracker.getTotalUsedSamples();
- uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
- unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
- if (Coverage < SampleProfileSampleCoverage) {
- F.getContext().diagnose(DiagnosticInfoSampleProfile(
- F.getSubprogram()->getFilename(), getFunctionLoc(F),
- Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
- Twine(Coverage) + "%) were applied",
- DS_Warning));
- }
- }
- return Changed;
-}
-
-char SampleProfileLoaderLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
- "Sample Profile loader", false, false)
-
+
+ // Compute basic block weights.
+ Changed |= computeBlockWeights(F);
+
+ if (Changed) {
+ // Add an entry count to the function using the samples gathered at the
+ // function entry.
+ // Sets the GUIDs that are inlined in the profiled binary. This is used
+    // for ThinLink to perform correct liveness analysis, and also to make the IR
+ // match the profiled binary before annotation.
+ F.setEntryCount(
+ ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+ &InlinedGUIDs);
+
+ // Compute dominance and loop info needed for propagation.
+ computeDominanceAndLoopInfo(F);
+
+ // Find equivalence classes.
+ findEquivalenceClasses(F);
+
+ // Propagate weights to all edges.
+ propagateWeights(F);
+ }
+
+ // If coverage checking was requested, compute it now.
+ if (SampleProfileRecordCoverage) {
+ unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI);
+ unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileRecordCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile records (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+
+ if (SampleProfileSampleCoverage) {
+ uint64_t Used = CoverageTracker.getTotalUsedSamples();
+ uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI);
+ unsigned Coverage = CoverageTracker.computeCoverage(Used, Total);
+ if (Coverage < SampleProfileSampleCoverage) {
+ F.getContext().diagnose(DiagnosticInfoSampleProfile(
+ F.getSubprogram()->getFilename(), getFunctionLoc(F),
+ Twine(Used) + " of " + Twine(Total) + " available profile samples (" +
+ Twine(Coverage) + "%) were applied",
+ DS_Warning));
+ }
+ }
+ return Changed;
+}
+
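One of the propagation rules documented above emitAnnotations, "all edges known except one", reduces to saturating arithmetic: the unknown edge gets the block weight minus the sum of the known edges, clamped at zero. A self-contained sketch of just that rule, separate from the pass's actual edge bookkeeping:

#include <cstdint>
#include <numeric>
#include <vector>

// Weight for the single unknown edge of a block whose other edges are known.
static uint64_t inferUnknownEdgeWeight(uint64_t BlockWeight,
                                       const std::vector<uint64_t> &KnownEdges) {
  uint64_t KnownSum =
      std::accumulate(KnownEdges.begin(), KnownEdges.end(), uint64_t(0));
  return KnownSum >= BlockWeight ? 0 : BlockWeight - KnownSum;
}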
+char SampleProfileLoaderLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
+ "Sample Profile loader", false, false)
+
// Add inlined profile call edges to the call graph.
void SampleProfileLoader::addCallGraphEdges(CallGraph &CG,
const FunctionSamples &Samples) {
@@ -2329,28 +2329,28 @@ void SampleProfileLoader::replaceCallGraphEdges(
}
}
-std::vector<Function *>
-SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
- std::vector<Function *> FunctionOrderList;
- FunctionOrderList.reserve(M.size());
-
- if (!ProfileTopDownLoad || CG == nullptr) {
- if (ProfileMergeInlinee) {
- // Disable ProfileMergeInlinee if profile is not loaded in top down order,
- // because the profile for a function may be used for the profile
- // annotation of its outline copy before the profile merging of its
- // non-inlined inline instances, and that is not the way how
- // ProfileMergeInlinee is supposed to work.
- ProfileMergeInlinee = false;
- }
-
- for (Function &F : M)
- if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
- FunctionOrderList.push_back(&F);
- return FunctionOrderList;
- }
-
- assert(&CG->getModule() == &M);
+std::vector<Function *>
+SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
+ std::vector<Function *> FunctionOrderList;
+ FunctionOrderList.reserve(M.size());
+
+ if (!ProfileTopDownLoad || CG == nullptr) {
+ if (ProfileMergeInlinee) {
+ // Disable ProfileMergeInlinee if profile is not loaded in top down order,
+ // because the profile for a function may be used for the profile
+ // annotation of its outline copy before the profile merging of its
+      // non-inlined inline instances, which is not how ProfileMergeInlinee
+      // is supposed to work.
+ ProfileMergeInlinee = false;
+ }
+
+ for (Function &F : M)
+ if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
+ FunctionOrderList.push_back(&F);
+ return FunctionOrderList;
+ }
+
+ assert(&CG->getModule() == &M);
// Add indirect call edges from profile to augment the static call graph.
// Functions will be processed in a top-down order defined by the static call
@@ -2414,14 +2414,14 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
}
}
- scc_iterator<CallGraph *> CGI = scc_begin(CG);
- while (!CGI.isAtEnd()) {
+ scc_iterator<CallGraph *> CGI = scc_begin(CG);
+ while (!CGI.isAtEnd()) {
uint64_t Start = FunctionOrderList.size();
for (CallGraphNode *Node : *CGI) {
auto *F = Node->getFunction();
- if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
- FunctionOrderList.push_back(F);
- }
+ if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
+ FunctionOrderList.push_back(F);
+ }
// Sort nodes in SCC based on the profile top-down order.
if (!ProfileOrderMap.empty()) {
@@ -2432,9 +2432,9 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
});
}
- ++CGI;
- }
-
+ ++CGI;
+ }
+
LLVM_DEBUG({
dbgs() << "Function processing order:\n";
for (auto F : reverse(FunctionOrderList)) {
@@ -2442,41 +2442,41 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
}
});
- std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
- return FunctionOrderList;
-}
-
+ std::reverse(FunctionOrderList.begin(), FunctionOrderList.end());
+ return FunctionOrderList;
+}
+
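buildFunctionOrder above leans on the fact that scc_begin walks call-graph SCCs in bottom-up post-order, so collecting functions in visit order and reversing the list yields the top-down order the loader wants. A stripped-down sketch of that idea, without the profile-based SCC reordering or the "use-sample-profile" filtering:

#include <algorithm>
#include <vector>
#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/CallGraph.h"

static std::vector<llvm::Function *> topDownOrder(llvm::CallGraph &CG) {
  std::vector<llvm::Function *> Order;
  // Bottom-up visit: callees are reached before their callers.
  for (llvm::scc_iterator<llvm::CallGraph *> I = llvm::scc_begin(&CG);
       !I.isAtEnd(); ++I)
    for (llvm::CallGraphNode *Node : *I)
      if (llvm::Function *F = Node->getFunction())
        Order.push_back(F);
  // Reverse so callers come before their callees (top-down).
  std::reverse(Order.begin(), Order.end());
  return Order;
}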
bool SampleProfileLoader::doInitialization(Module &M,
FunctionAnalysisManager *FAM) {
- auto &Ctx = M.getContext();
-
- auto ReaderOrErr =
- SampleProfileReader::create(Filename, Ctx, RemappingFilename);
- if (std::error_code EC = ReaderOrErr.getError()) {
- std::string Msg = "Could not open profile: " + EC.message();
- Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
- return false;
- }
- Reader = std::move(ReaderOrErr.get());
+ auto &Ctx = M.getContext();
+
+ auto ReaderOrErr =
+ SampleProfileReader::create(Filename, Ctx, RemappingFilename);
+ if (std::error_code EC = ReaderOrErr.getError()) {
+ std::string Msg = "Could not open profile: " + EC.message();
+ Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
+ return false;
+ }
+ Reader = std::move(ReaderOrErr.get());
Reader->setSkipFlatProf(LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink);
- Reader->collectFuncsFrom(M);
+ Reader->collectFuncsFrom(M);
if (std::error_code EC = Reader->read()) {
std::string Msg = "profile reading failed: " + EC.message();
Ctx.diagnose(DiagnosticInfoSampleProfile(Filename, Msg));
return false;
}
- PSL = Reader->getProfileSymbolList();
-
- // While profile-sample-accurate is on, ignore symbol list.
- ProfAccForSymsInList =
- ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
- if (ProfAccForSymsInList) {
- NamesInProfile.clear();
- if (auto NameTable = Reader->getNameTable())
- NamesInProfile.insert(NameTable->begin(), NameTable->end());
- }
-
+ PSL = Reader->getProfileSymbolList();
+
+ // While profile-sample-accurate is on, ignore symbol list.
+ ProfAccForSymsInList =
+ ProfileAccurateForSymsInList && PSL && !ProfileSampleAccurate;
+ if (ProfAccForSymsInList) {
+ NamesInProfile.clear();
+ if (auto NameTable = Reader->getNameTable())
+ NamesInProfile.insert(NameTable->begin(), NameTable->end());
+ }
+
if (FAM && !ProfileInlineReplayFile.empty()) {
ExternalInlineAdvisor = std::make_unique<ReplayInlineAdvisor>(
M, *FAM, Ctx, /*OriginalAdvisor=*/nullptr, ProfileInlineReplayFile,
@@ -2512,51 +2512,51 @@ bool SampleProfileLoader::doInitialization(Module &M,
}
}
- return true;
-}
-
-ModulePass *llvm::createSampleProfileLoaderPass() {
- return new SampleProfileLoaderLegacyPass();
-}
-
-ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
- return new SampleProfileLoaderLegacyPass(Name);
-}
-
-bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
- ProfileSummaryInfo *_PSI, CallGraph *CG) {
- GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
-
- PSI = _PSI;
- if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
- M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
- ProfileSummary::PSK_Sample);
- PSI->refresh();
- }
- // Compute the total number of samples collected in this profile.
- for (const auto &I : Reader->getProfiles())
- TotalCollectedSamples += I.second.getTotalSamples();
-
+ return true;
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass() {
+ return new SampleProfileLoaderLegacyPass();
+}
+
+ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
+ return new SampleProfileLoaderLegacyPass(Name);
+}
+
+bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
+ ProfileSummaryInfo *_PSI, CallGraph *CG) {
+ GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
+
+ PSI = _PSI;
+ if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
+ M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
+ ProfileSummary::PSK_Sample);
+ PSI->refresh();
+ }
+ // Compute the total number of samples collected in this profile.
+ for (const auto &I : Reader->getProfiles())
+ TotalCollectedSamples += I.second.getTotalSamples();
+
auto Remapper = Reader->getRemapper();
- // Populate the symbol map.
- for (const auto &N_F : M.getValueSymbolTable()) {
- StringRef OrigName = N_F.getKey();
- Function *F = dyn_cast<Function>(N_F.getValue());
- if (F == nullptr)
- continue;
- SymbolMap[OrigName] = F;
- auto pos = OrigName.find('.');
- if (pos != StringRef::npos) {
- StringRef NewName = OrigName.substr(0, pos);
- auto r = SymbolMap.insert(std::make_pair(NewName, F));
- // Failiing to insert means there is already an entry in SymbolMap,
- // thus there are multiple functions that are mapped to the same
- // stripped name. In this case of name conflicting, set the value
- // to nullptr to avoid confusion.
- if (!r.second)
- r.first->second = nullptr;
+ // Populate the symbol map.
+ for (const auto &N_F : M.getValueSymbolTable()) {
+ StringRef OrigName = N_F.getKey();
+ Function *F = dyn_cast<Function>(N_F.getValue());
+ if (F == nullptr)
+ continue;
+ SymbolMap[OrigName] = F;
+ auto pos = OrigName.find('.');
+ if (pos != StringRef::npos) {
+ StringRef NewName = OrigName.substr(0, pos);
+ auto r = SymbolMap.insert(std::make_pair(NewName, F));
+      // Failing to insert means there is already an entry in SymbolMap,
+      // thus there are multiple functions that are mapped to the same
+      // stripped name. In case of such a name conflict, set the value
+ // to nullptr to avoid confusion.
+ if (!r.second)
+ r.first->second = nullptr;
OrigName = NewName;
- }
+ }
// Insert the remapped names into SymbolMap.
if (Remapper) {
if (auto MapName = Remapper->lookUpNameInProfile(OrigName)) {
@@ -2565,129 +2565,129 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
SymbolMap.insert(std::make_pair(*MapName, F));
}
}
- }
-
- bool retval = false;
- for (auto F : buildFunctionOrder(M, CG)) {
- assert(!F->isDeclaration());
- clearFunctionData();
- retval |= runOnFunction(*F, AM);
- }
-
- // Account for cold calls not inlined....
+ }
+
+ bool retval = false;
+ for (auto F : buildFunctionOrder(M, CG)) {
+ assert(!F->isDeclaration());
+ clearFunctionData();
+ retval |= runOnFunction(*F, AM);
+ }
+
+ // Account for cold calls not inlined....
if (!ProfileIsCS)
for (const std::pair<Function *, NotInlinedProfileInfo> &pair :
notInlinedCallInfo)
updateProfileCallee(pair.first, pair.second.entryCount);
-
- return retval;
-}
-
-bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
- ACT = &getAnalysis<AssumptionCacheTracker>();
- TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
- TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
-}
-
-bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
+
+ return retval;
+}
+
+bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
+ ACT = &getAnalysis<AssumptionCacheTracker>();
+ TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
+}
+
+bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) {
LLVM_DEBUG(dbgs() << "\n\nProcessing Function " << F.getName() << "\n");
- DILocation2SampleMap.clear();
- // By default the entry count is initialized to -1, which will be treated
- // conservatively by getEntryCount as the same as unknown (None). This is
- // to avoid newly added code to be treated as cold. If we have samples
- // this will be overwritten in emitAnnotations.
- uint64_t initialEntryCount = -1;
-
- ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
- if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
- // initialize all the function entry counts to 0. It means all the
- // functions without profile will be regarded as cold.
- initialEntryCount = 0;
- // profile-sample-accurate is a user assertion which has a higher precedence
- // than symbol list. When profile-sample-accurate is on, ignore symbol list.
- ProfAccForSymsInList = false;
- }
-
- // PSL -- profile symbol list include all the symbols in sampled binary.
- // If ProfileAccurateForSymsInList is enabled, PSL is used to treat
- // old functions without samples being cold, without having to worry
- // about new and hot functions being mistakenly treated as cold.
- if (ProfAccForSymsInList) {
- // Initialize the entry count to 0 for functions in the list.
- if (PSL->contains(F.getName()))
- initialEntryCount = 0;
-
- // Function in the symbol list but without sample will be regarded as
- // cold. To minimize the potential negative performance impact it could
- // have, we want to be a little conservative here saying if a function
- // shows up in the profile, no matter as outline function, inline instance
- // or call targets, treat the function as not being cold. This will handle
- // the cases such as most callsites of a function are inlined in sampled
- // binary but not inlined in current build (because of source code drift,
- // imprecise debug information, or the callsites are all cold individually
- // but not cold accumulatively...), so the outline function showing up as
- // cold in sampled binary will actually not be cold after current build.
- StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
- if (NamesInProfile.count(CanonName))
- initialEntryCount = -1;
- }
-
+ DILocation2SampleMap.clear();
+ // By default the entry count is initialized to -1, which will be treated
+ // conservatively by getEntryCount as the same as unknown (None). This is
+  // to avoid newly added code being treated as cold. If we have samples,
+  // this will be overwritten in emitAnnotations.
+ uint64_t initialEntryCount = -1;
+
+ ProfAccForSymsInList = ProfileAccurateForSymsInList && PSL;
+ if (ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) {
+    // Initialize all the function entry counts to 0, which means all the
+    // functions without a profile will be regarded as cold.
+ initialEntryCount = 0;
+    // profile-sample-accurate is a user assertion which takes precedence over
+    // the symbol list. When profile-sample-accurate is on, ignore the symbol list.
+ ProfAccForSymsInList = false;
+ }
+
+  // PSL -- the profile symbol list includes all the symbols in the sampled
+  // binary. If ProfileAccurateForSymsInList is enabled, PSL is used to treat
+  // old functions without samples as cold, without having to worry
+ // about new and hot functions being mistakenly treated as cold.
+ if (ProfAccForSymsInList) {
+ // Initialize the entry count to 0 for functions in the list.
+ if (PSL->contains(F.getName()))
+ initialEntryCount = 0;
+
+    // A function that is in the symbol list but has no samples will be
+    // regarded as cold. To minimize the potential negative performance
+    // impact, we are a little conservative here: if a function shows up in
+    // the profile at all -- as an outline function, an inline instance, or a
+    // call target -- treat it as not cold. This handles cases where most
+    // callsites of a function are inlined in the sampled binary but not in
+    // the current build (because of source code drift, imprecise debug
+    // information, or callsites that are individually cold but not cold in
+    // aggregate), so an outline function that appears cold in the sampled
+    // binary may actually not be cold in the current build.
+ StringRef CanonName = FunctionSamples::getCanonicalFnName(F);
+ if (NamesInProfile.count(CanonName))
+ initialEntryCount = -1;
+ }
+
// Initialize entry count when the function has no existing entry
// count value.
if (!F.getEntryCount().hasValue())
F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real));
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- if (AM) {
- auto &FAM =
- AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
- .getManager();
- ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- } else {
- OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
- ORE = OwnedORE.get();
- }
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+ if (AM) {
+ auto &FAM =
+ AM->getResult<FunctionAnalysisManagerModuleProxy>(*F.getParent())
+ .getManager();
+ ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ } else {
+ OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
+ ORE = OwnedORE.get();
+ }
if (ProfileIsCS)
Samples = ContextTracker->getBaseSamplesFor(F);
else
Samples = Reader->getSamplesFor(F);
- if (Samples && !Samples->empty())
- return emitAnnotations(F);
- return false;
-}
-
-PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
- return FAM.getResult<AssumptionAnalysis>(F);
- };
- auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
- auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
-
- SampleProfileLoader SampleLoader(
- ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
- ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
- : ProfileRemappingFileName,
+ if (Samples && !Samples->empty())
+ return emitAnnotations(F);
+ return false;
+}
+
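The entry-count policy spelled out in the comments of runOnFunction above boils down to a three-way decision: unknown by default, cold when the user asserts an accurate profile, and cold for listed-but-unsampled functions. A simplified sketch; the boolean parameters are hypothetical and collapse several checks the pass actually performs:

#include <cstdint>

// -1 is treated as "unknown" by getEntryCount; 0 marks the function cold.
static uint64_t chooseInitialEntryCount(bool ProfileSampleAccurate,
                                        bool InProfileSymbolList,
                                        bool AppearsInProfile) {
  if (ProfileSampleAccurate)
    return 0; // the user asserts the profile is complete: no samples => cold
  if (InProfileSymbolList && !AppearsInProfile)
    return 0; // the sampled binary knew the symbol, yet it has no samples
  return static_cast<uint64_t>(-1); // unknown; annotation may override later
}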
+PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+ auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ SampleProfileLoader SampleLoader(
+ ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
+ ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
+ : ProfileRemappingFileName,
LTOPhase, GetAssumptionCache, GetTTI, GetTLI);
-
+
if (!SampleLoader.doInitialization(M, &FAM))
- return PreservedAnalyses::all();
-
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
- CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
- if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
+ return PreservedAnalyses::all();
+
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
+ if (!SampleLoader.runOnModule(M, &AM, PSI, &CG))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
index 1e9cbeac6d..655a7a4049 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -1,88 +1,88 @@
-//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass loops over all of the functions in the input module, looking for
-// dead declarations and removes them. Dead declarations are declarations of
-// functions for which no implementation is available (i.e., declarations for
-// unused library functions).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "strip-dead-prototypes"
-
-STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
-
-static bool stripDeadPrototypes(Module &M) {
- bool MadeChange = false;
-
- // Erase dead function prototypes.
- for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
- Function *F = &*I++;
- // Function must be a prototype and unused.
- if (F->isDeclaration() && F->use_empty()) {
- F->eraseFromParent();
- ++NumDeadPrototypes;
- MadeChange = true;
- }
- }
-
- // Erase dead global var prototypes.
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ) {
- GlobalVariable *GV = &*I++;
- // Global must be a prototype and unused.
- if (GV->isDeclaration() && GV->use_empty())
- GV->eraseFromParent();
- }
-
- // Return an indication of whether we changed anything or not.
- return MadeChange;
-}
-
-PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
- ModuleAnalysisManager &) {
- if (stripDeadPrototypes(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-class StripDeadPrototypesLegacyPass : public ModulePass {
-public:
- static char ID; // Pass identification, replacement for typeid
- StripDeadPrototypesLegacyPass() : ModulePass(ID) {
- initializeStripDeadPrototypesLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- return stripDeadPrototypes(M);
- }
-};
-
-} // end anonymous namespace
-
-char StripDeadPrototypesLegacyPass::ID = 0;
-INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes",
- "Strip Unused Function Prototypes", false, false)
-
-ModulePass *llvm::createStripDeadPrototypesPass() {
- return new StripDeadPrototypesLegacyPass();
-}
+//===-- StripDeadPrototypes.cpp - Remove unused function declarations ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass loops over all of the functions in the input module, looking for
+// dead declarations and removing them. Dead declarations are declarations of
+// functions for which no implementation is available (i.e., declarations for
+// unused library functions).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/StripDeadPrototypes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "strip-dead-prototypes"
+
+STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
+
+static bool stripDeadPrototypes(Module &M) {
+ bool MadeChange = false;
+
+ // Erase dead function prototypes.
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ) {
+ Function *F = &*I++;
+ // Function must be a prototype and unused.
+ if (F->isDeclaration() && F->use_empty()) {
+ F->eraseFromParent();
+ ++NumDeadPrototypes;
+ MadeChange = true;
+ }
+ }
+
+ // Erase dead global var prototypes.
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ) {
+ GlobalVariable *GV = &*I++;
+ // Global must be a prototype and unused.
+ if (GV->isDeclaration() && GV->use_empty())
+ GV->eraseFromParent();
+ }
+
+ // Return an indication of whether we changed anything or not.
+ return MadeChange;
+}
+
+PreservedAnalyses StripDeadPrototypesPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ if (stripDeadPrototypes(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class StripDeadPrototypesLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ StripDeadPrototypesLegacyPass() : ModulePass(ID) {
+ initializeStripDeadPrototypesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ return stripDeadPrototypes(M);
+ }
+};
+
+} // end anonymous namespace
+
+char StripDeadPrototypesLegacyPass::ID = 0;
+INITIALIZE_PASS(StripDeadPrototypesLegacyPass, "strip-dead-prototypes",
+ "Strip Unused Function Prototypes", false, false)
+
+ModulePass *llvm::createStripDeadPrototypesPass() {
+ return new StripDeadPrototypesLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
index d35f785a31..4fc71847a0 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/StripSymbols.cpp
@@ -1,382 +1,382 @@
-//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The StripSymbols transformation implements code stripping. Specifically, it
-// can delete:
-//
-// * names for virtual registers
-// * symbols for internal globals and functions
-// * debug information
-//
-// Note that this transformation makes code much less readable, so it should
-// only be used in situations where the 'strip' utility would be used, such as
-// reducing code size or making it harder to reverse engineer code.
-//
-//===----------------------------------------------------------------------===//
-
+//===- StripSymbols.cpp - Strip symbols and debug info from a module ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The StripSymbols transformation implements code stripping. Specifically, it
+// can delete:
+//
+// * names for virtual registers
+// * symbols for internal globals and functions
+// * debug information
+//
+// Note that this transformation makes code much less readable, so it should
+// only be used in situations where the 'strip' utility would be used, such as
+// reducing code size or making it harder to reverse engineer code.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/IPO/StripSymbols.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/TypeFinder.h"
-#include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-namespace {
- class StripSymbols : public ModulePass {
- bool OnlyDebugInfo;
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripSymbols(bool ODI = false)
- : ModulePass(ID), OnlyDebugInfo(ODI) {
- initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripNonDebugSymbols : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripNonDebugSymbols()
- : ModulePass(ID) {
- initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripDebugDeclare : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripDebugDeclare()
- : ModulePass(ID) {
- initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-
- class StripDeadDebugInfo : public ModulePass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit StripDeadDebugInfo()
- : ModulePass(ID) {
- initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-}
-
-char StripSymbols::ID = 0;
-INITIALIZE_PASS(StripSymbols, "strip",
- "Strip all symbols from a module", false, false)
-
-ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
- return new StripSymbols(OnlyDebugInfo);
-}
-
-char StripNonDebugSymbols::ID = 0;
-INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
- "Strip all symbols, except dbg symbols, from a module",
- false, false)
-
-ModulePass *llvm::createStripNonDebugSymbolsPass() {
- return new StripNonDebugSymbols();
-}
-
-char StripDebugDeclare::ID = 0;
-INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
- "Strip all llvm.dbg.declare intrinsics", false, false)
-
-ModulePass *llvm::createStripDebugDeclarePass() {
- return new StripDebugDeclare();
-}
-
-char StripDeadDebugInfo::ID = 0;
-INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
- "Strip debug info for unused symbols", false, false)
-
-ModulePass *llvm::createStripDeadDebugInfoPass() {
- return new StripDeadDebugInfo();
-}
-
-/// OnlyUsedBy - Return true if V is only used by Usr.
-static bool OnlyUsedBy(Value *V, Value *Usr) {
- for (User *U : V->users())
- if (U != Usr)
- return false;
-
- return true;
-}
-
-static void RemoveDeadConstant(Constant *C) {
- assert(C->use_empty() && "Constant is not dead!");
- SmallPtrSet<Constant*, 4> Operands;
- for (Value *Op : C->operands())
- if (OnlyUsedBy(Op, C))
- Operands.insert(cast<Constant>(Op));
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
- GV->eraseFromParent();
- } else if (!isa<Function>(C)) {
- // FIXME: Why does the type of the constant matter here?
- if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType()) ||
- isa<VectorType>(C->getType()))
- C->destroyConstant();
- }
-
- // If the constant referenced anything, see if we can delete it as well.
- for (Constant *O : Operands)
- RemoveDeadConstant(O);
-}
-
-// Strip the symbol table of its names.
-//
-static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
- for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
- Value *V = VI->getValue();
- ++VI;
- if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
- if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
- // Set name to "", removing from symbol table!
- V->setName("");
- }
- }
-}
-
-// Strip any named types of their names.
-static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
- TypeFinder StructTypes;
- StructTypes.run(M, false);
-
- for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
- StructType *STy = StructTypes[i];
- if (STy->isLiteral() || STy->getName().empty()) continue;
-
- if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
- continue;
-
- STy->setName("");
- }
-}
-
-/// Find values that are marked as llvm.used.
-static void findUsedValues(GlobalVariable *LLVMUsed,
- SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
- if (!LLVMUsed) return;
- UsedValues.insert(LLVMUsed);
-
- ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
-
- for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
- if (GlobalValue *GV =
- dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
- UsedValues.insert(GV);
-}
-
-/// StripSymbolNames - Strip symbol names.
-static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
-
- SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
- findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
- findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
-
- for (Module::global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
- if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
- I->setName(""); // Internal symbols can't participate in linkage
- }
-
- for (Function &I : M) {
- if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
- if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
- I.setName(""); // Internal symbols can't participate in linkage
- if (auto *Symtab = I.getValueSymbolTable())
- StripSymtab(*Symtab, PreserveDbgInfo);
- }
-
- // Remove all names from types.
- StripTypeNames(M, PreserveDbgInfo);
-
- return true;
-}
-
-bool StripSymbols::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- bool Changed = false;
- Changed |= StripDebugInfo(M);
- if (!OnlyDebugInfo)
- Changed |= StripSymbolNames(M, false);
- return Changed;
-}
-
-bool StripNonDebugSymbols::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- return StripSymbolNames(M, true);
-}
-
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+namespace {
+ class StripSymbols : public ModulePass {
+ bool OnlyDebugInfo;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripSymbols(bool ODI = false)
+ : ModulePass(ID), OnlyDebugInfo(ODI) {
+ initializeStripSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripNonDebugSymbols : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripNonDebugSymbols()
+ : ModulePass(ID) {
+ initializeStripNonDebugSymbolsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDebugDeclare : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDebugDeclare()
+ : ModulePass(ID) {
+ initializeStripDebugDeclarePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+
+ class StripDeadDebugInfo : public ModulePass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit StripDeadDebugInfo()
+ : ModulePass(ID) {
+ initializeStripDeadDebugInfoPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+ };
+}
+
+char StripSymbols::ID = 0;
+INITIALIZE_PASS(StripSymbols, "strip",
+ "Strip all symbols from a module", false, false)
+
+ModulePass *llvm::createStripSymbolsPass(bool OnlyDebugInfo) {
+ return new StripSymbols(OnlyDebugInfo);
+}
+
+char StripNonDebugSymbols::ID = 0;
+INITIALIZE_PASS(StripNonDebugSymbols, "strip-nondebug",
+ "Strip all symbols, except dbg symbols, from a module",
+ false, false)
+
+ModulePass *llvm::createStripNonDebugSymbolsPass() {
+ return new StripNonDebugSymbols();
+}
+
+char StripDebugDeclare::ID = 0;
+INITIALIZE_PASS(StripDebugDeclare, "strip-debug-declare",
+ "Strip all llvm.dbg.declare intrinsics", false, false)
+
+ModulePass *llvm::createStripDebugDeclarePass() {
+ return new StripDebugDeclare();
+}
+
+char StripDeadDebugInfo::ID = 0;
+INITIALIZE_PASS(StripDeadDebugInfo, "strip-dead-debug-info",
+ "Strip debug info for unused symbols", false, false)
+
+ModulePass *llvm::createStripDeadDebugInfoPass() {
+ return new StripDeadDebugInfo();
+}
+
+/// OnlyUsedBy - Return true if V is only used by Usr.
+static bool OnlyUsedBy(Value *V, Value *Usr) {
+ for (User *U : V->users())
+ if (U != Usr)
+ return false;
+
+ return true;
+}
+
+static void RemoveDeadConstant(Constant *C) {
+ assert(C->use_empty() && "Constant is not dead!");
+ SmallPtrSet<Constant*, 4> Operands;
+ for (Value *Op : C->operands())
+ if (OnlyUsedBy(Op, C))
+ Operands.insert(cast<Constant>(Op));
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
+ GV->eraseFromParent();
+ } else if (!isa<Function>(C)) {
+ // FIXME: Why does the type of the constant matter here?
+ if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType()) ||
+ isa<VectorType>(C->getType()))
+ C->destroyConstant();
+ }
+
+ // If the constant referenced anything, see if we can delete it as well.
+ for (Constant *O : Operands)
+ RemoveDeadConstant(O);
+}
+
+// Strip the symbol table of its names.
+//
+static void StripSymtab(ValueSymbolTable &ST, bool PreserveDbgInfo) {
+ for (ValueSymbolTable::iterator VI = ST.begin(), VE = ST.end(); VI != VE; ) {
+ Value *V = VI->getValue();
+ ++VI;
+ if (!isa<GlobalValue>(V) || cast<GlobalValue>(V)->hasLocalLinkage()) {
+ if (!PreserveDbgInfo || !V->getName().startswith("llvm.dbg"))
+ // Set name to "", removing from symbol table!
+ V->setName("");
+ }
+ }
+}
+
+// Strip any named types of their names.
+static void StripTypeNames(Module &M, bool PreserveDbgInfo) {
+ TypeFinder StructTypes;
+ StructTypes.run(M, false);
+
+ for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+ StructType *STy = StructTypes[i];
+ if (STy->isLiteral() || STy->getName().empty()) continue;
+
+ if (PreserveDbgInfo && STy->getName().startswith("llvm.dbg"))
+ continue;
+
+ STy->setName("");
+ }
+}
+
+/// Find values that are marked as llvm.used.
+static void findUsedValues(GlobalVariable *LLVMUsed,
+ SmallPtrSetImpl<const GlobalValue*> &UsedValues) {
+ if (!LLVMUsed) return;
+ UsedValues.insert(LLVMUsed);
+
+ ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer());
+
+ for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i)
+ if (GlobalValue *GV =
+ dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts()))
+ UsedValues.insert(GV);
+}
+
+/// StripSymbolNames - Strip symbol names.
+static bool StripSymbolNames(Module &M, bool PreserveDbgInfo) {
+
+ SmallPtrSet<const GlobalValue*, 8> llvmUsedValues;
+ findUsedValues(M.getGlobalVariable("llvm.used"), llvmUsedValues);
+ findUsedValues(M.getGlobalVariable("llvm.compiler.used"), llvmUsedValues);
+
+ for (Module::global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ if (I->hasLocalLinkage() && llvmUsedValues.count(&*I) == 0)
+ if (!PreserveDbgInfo || !I->getName().startswith("llvm.dbg"))
+ I->setName(""); // Internal symbols can't participate in linkage
+ }
+
+ for (Function &I : M) {
+ if (I.hasLocalLinkage() && llvmUsedValues.count(&I) == 0)
+ if (!PreserveDbgInfo || !I.getName().startswith("llvm.dbg"))
+ I.setName(""); // Internal symbols can't participate in linkage
+ if (auto *Symtab = I.getValueSymbolTable())
+ StripSymtab(*Symtab, PreserveDbgInfo);
+ }
+
+ // Remove all names from types.
+ StripTypeNames(M, PreserveDbgInfo);
+
+ return true;
+}
+
+bool StripSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ bool Changed = false;
+ Changed |= StripDebugInfo(M);
+ if (!OnlyDebugInfo)
+ Changed |= StripSymbolNames(M, false);
+ return Changed;
+}
+
+bool StripNonDebugSymbols::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return StripSymbolNames(M, true);
+}
+
static bool stripDebugDeclareImpl(Module &M) {
-
- Function *Declare = M.getFunction("llvm.dbg.declare");
- std::vector<Constant*> DeadConstants;
-
- if (Declare) {
- while (!Declare->use_empty()) {
- CallInst *CI = cast<CallInst>(Declare->user_back());
- Value *Arg1 = CI->getArgOperand(0);
- Value *Arg2 = CI->getArgOperand(1);
- assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
- CI->eraseFromParent();
- if (Arg1->use_empty()) {
- if (Constant *C = dyn_cast<Constant>(Arg1))
- DeadConstants.push_back(C);
- else
- RecursivelyDeleteTriviallyDeadInstructions(Arg1);
- }
- if (Arg2->use_empty())
- if (Constant *C = dyn_cast<Constant>(Arg2))
- DeadConstants.push_back(C);
- }
- Declare->eraseFromParent();
- }
-
- while (!DeadConstants.empty()) {
- Constant *C = DeadConstants.back();
- DeadConstants.pop_back();
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
- if (GV->hasLocalLinkage())
- RemoveDeadConstant(GV);
- } else
- RemoveDeadConstant(C);
- }
-
- return true;
-}
-
+
+ Function *Declare = M.getFunction("llvm.dbg.declare");
+ std::vector<Constant*> DeadConstants;
+
+ if (Declare) {
+ while (!Declare->use_empty()) {
+ CallInst *CI = cast<CallInst>(Declare->user_back());
+ Value *Arg1 = CI->getArgOperand(0);
+ Value *Arg2 = CI->getArgOperand(1);
+ assert(CI->use_empty() && "llvm.dbg intrinsic should have void result");
+ CI->eraseFromParent();
+ if (Arg1->use_empty()) {
+ if (Constant *C = dyn_cast<Constant>(Arg1))
+ DeadConstants.push_back(C);
+ else
+ RecursivelyDeleteTriviallyDeadInstructions(Arg1);
+ }
+ if (Arg2->use_empty())
+ if (Constant *C = dyn_cast<Constant>(Arg2))
+ DeadConstants.push_back(C);
+ }
+ Declare->eraseFromParent();
+ }
+
+ while (!DeadConstants.empty()) {
+ Constant *C = DeadConstants.back();
+ DeadConstants.pop_back();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
+ if (GV->hasLocalLinkage())
+ RemoveDeadConstant(GV);
+ } else
+ RemoveDeadConstant(C);
+ }
+
+ return true;
+}
+
bool StripDebugDeclare::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
+ if (skipModule(M))
+ return false;
return stripDebugDeclareImpl(M);
}
-
+
static bool stripDeadDebugInfoImpl(Module &M) {
- bool Changed = false;
-
- LLVMContext &C = M.getContext();
-
- // Find all debug info in F. This is actually overkill in terms of what we
- // want to do, but we want to try and be as resilient as possible in the face
- // of potential debug info changes by using the formal interfaces given to us
- // as much as possible.
- DebugInfoFinder F;
- F.processModule(M);
-
- // For each compile unit, find the live set of global variables/functions and
- // replace the current list of potentially dead global variables/functions
- // with the live list.
- SmallVector<Metadata *, 64> LiveGlobalVariables;
- DenseSet<DIGlobalVariableExpression *> VisitedSet;
-
- std::set<DIGlobalVariableExpression *> LiveGVs;
- for (GlobalVariable &GV : M.globals()) {
- SmallVector<DIGlobalVariableExpression *, 1> GVEs;
- GV.getDebugInfo(GVEs);
- for (auto *GVE : GVEs)
- LiveGVs.insert(GVE);
- }
-
- std::set<DICompileUnit *> LiveCUs;
- // Any CU referenced from a subprogram is live.
- for (DISubprogram *SP : F.subprograms()) {
- if (SP->getUnit())
- LiveCUs.insert(SP->getUnit());
- }
-
- bool HasDeadCUs = false;
- for (DICompileUnit *DIC : F.compile_units()) {
- // Create our live global variable list.
- bool GlobalVariableChange = false;
- for (auto *DIG : DIC->getGlobalVariables()) {
- if (DIG->getExpression() && DIG->getExpression()->isConstant())
- LiveGVs.insert(DIG);
-
- // Make sure we only visit each global variable only once.
- if (!VisitedSet.insert(DIG).second)
- continue;
-
- // If a global variable references DIG, the global variable is live.
- if (LiveGVs.count(DIG))
- LiveGlobalVariables.push_back(DIG);
- else
- GlobalVariableChange = true;
- }
-
- if (!LiveGlobalVariables.empty())
- LiveCUs.insert(DIC);
- else if (!LiveCUs.count(DIC))
- HasDeadCUs = true;
-
- // If we found dead global variables, replace the current global
- // variable list with our new live global variable list.
- if (GlobalVariableChange) {
- DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
- Changed = true;
- }
-
- // Reset lists for the next iteration.
- LiveGlobalVariables.clear();
- }
-
- if (HasDeadCUs) {
- // Delete the old node and replace it with a new one
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
- NMD->clearOperands();
- if (!LiveCUs.empty()) {
- for (DICompileUnit *CU : LiveCUs)
- NMD->addOperand(CU);
- }
- Changed = true;
- }
-
- return Changed;
-}
+ bool Changed = false;
+
+ LLVMContext &C = M.getContext();
+
+ // Find all debug info in F. This is actually overkill in terms of what we
+  // want to do, but we want to try to be as resilient as possible in the face
+ // of potential debug info changes by using the formal interfaces given to us
+ // as much as possible.
+ DebugInfoFinder F;
+ F.processModule(M);
+
+ // For each compile unit, find the live set of global variables/functions and
+ // replace the current list of potentially dead global variables/functions
+ // with the live list.
+ SmallVector<Metadata *, 64> LiveGlobalVariables;
+ DenseSet<DIGlobalVariableExpression *> VisitedSet;
+
+ std::set<DIGlobalVariableExpression *> LiveGVs;
+ for (GlobalVariable &GV : M.globals()) {
+ SmallVector<DIGlobalVariableExpression *, 1> GVEs;
+ GV.getDebugInfo(GVEs);
+ for (auto *GVE : GVEs)
+ LiveGVs.insert(GVE);
+ }
+
+ std::set<DICompileUnit *> LiveCUs;
+ // Any CU referenced from a subprogram is live.
+ for (DISubprogram *SP : F.subprograms()) {
+ if (SP->getUnit())
+ LiveCUs.insert(SP->getUnit());
+ }
+
+ bool HasDeadCUs = false;
+ for (DICompileUnit *DIC : F.compile_units()) {
+ // Create our live global variable list.
+ bool GlobalVariableChange = false;
+ for (auto *DIG : DIC->getGlobalVariables()) {
+ if (DIG->getExpression() && DIG->getExpression()->isConstant())
+ LiveGVs.insert(DIG);
+
+      // Make sure we visit each global variable only once.
+ if (!VisitedSet.insert(DIG).second)
+ continue;
+
+ // If a global variable references DIG, the global variable is live.
+ if (LiveGVs.count(DIG))
+ LiveGlobalVariables.push_back(DIG);
+ else
+ GlobalVariableChange = true;
+ }
+
+ if (!LiveGlobalVariables.empty())
+ LiveCUs.insert(DIC);
+ else if (!LiveCUs.count(DIC))
+ HasDeadCUs = true;
+
+ // If we found dead global variables, replace the current global
+ // variable list with our new live global variable list.
+ if (GlobalVariableChange) {
+ DIC->replaceGlobalVariables(MDTuple::get(C, LiveGlobalVariables));
+ Changed = true;
+ }
+
+ // Reset lists for the next iteration.
+ LiveGlobalVariables.clear();
+ }
+
+ if (HasDeadCUs) {
+ // Delete the old node and replace it with a new one
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+ NMD->clearOperands();
+ if (!LiveCUs.empty()) {
+ for (DICompileUnit *CU : LiveCUs)
+ NMD->addOperand(CU);
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
/// Remove any debug info for global variables/functions in the given module for
/// which said global variable/function no longer exists (i.e. is null).
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
index c29ea77791..1b1e91cafa 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -1,144 +1,144 @@
-//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that synthesizes entry counts for
-// functions and attaches !prof metadata to functions with the synthesized
-// counts. The presence of !prof metadata with counter name set to
-// 'synthesized_function_entry_count' indicate that the value of the counter is
-// an estimation of the likely execution count of the function. This transform
-// is applied only in non PGO mode as functions get 'real' profile-based
-// function entry counts in the PGO mode.
-//
-// The transformation works by first assigning some initial values to the entry
-// counts of all functions and then doing a top-down traversal of the
-// callgraph-scc to propagate the counts. For each function the set of callsites
-// and their relative block frequency is gathered. The relative block frequency
-// multiplied by the entry count of the caller and added to the callee's entry
-// count. For non-trivial SCCs, the new counts are computed from the previous
-// counts and updated in one shot.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/SyntheticCountsUtils.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-using Scaled64 = ScaledNumber<uint64_t>;
-using ProfileCount = Function::ProfileCount;
-
-#define DEBUG_TYPE "synthetic-counts-propagation"
-
-/// Initial synthetic count assigned to functions.
-cl::opt<int>
- InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
- cl::ZeroOrMore,
- cl::desc("Initial value of synthetic entry count."));
-
-/// Initial synthetic count assigned to inline functions.
-static cl::opt<int> InlineSyntheticCount(
- "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
- cl::desc("Initial synthetic entry count for inline functions."));
-
-/// Initial synthetic count assigned to cold functions.
-static cl::opt<int> ColdSyntheticCount(
- "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
- cl::desc("Initial synthetic entry count for cold functions."));
-
-// Assign initial synthetic entry counts to functions.
-static void
-initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
- auto MayHaveIndirectCalls = [](Function &F) {
- for (auto *U : F.users()) {
- if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
- return true;
- }
- return false;
- };
-
- for (Function &F : M) {
- uint64_t InitialCount = InitialSyntheticCount;
- if (F.isDeclaration())
- continue;
- if (F.hasFnAttribute(Attribute::AlwaysInline) ||
- F.hasFnAttribute(Attribute::InlineHint)) {
- // Use a higher value for inline functions to account for the fact that
- // these are usually beneficial to inline.
- InitialCount = InlineSyntheticCount;
- } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
- // Local functions without inline hints get counts only through
- // propagation.
- InitialCount = 0;
- } else if (F.hasFnAttribute(Attribute::Cold) ||
- F.hasFnAttribute(Attribute::NoInline)) {
- // Use a lower value for noinline and cold functions.
- InitialCount = ColdSyntheticCount;
- }
- SetCount(&F, InitialCount);
- }
-}
-
-PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
- ModuleAnalysisManager &MAM) {
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- DenseMap<Function *, Scaled64> Counts;
- // Set initial entry counts.
- initializeCounts(
- M, [&](Function *F, uint64_t Count) { Counts[F] = Scaled64(Count, 0); });
-
- // Edge includes information about the source. Hence ignore the first
- // parameter.
- auto GetCallSiteProfCount = [&](const CallGraphNode *,
- const CallGraphNode::CallRecord &Edge) {
- Optional<Scaled64> Res = None;
- if (!Edge.first)
- return Res;
- CallBase &CB = *cast<CallBase>(*Edge.first);
- Function *Caller = CB.getCaller();
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
-
- // Now compute the callsite count from relative frequency and
- // entry count:
- BasicBlock *CSBB = CB.getParent();
- Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
- Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
- BBCount /= EntryFreq;
- BBCount *= Counts[Caller];
- return Optional<Scaled64>(BBCount);
- };
-
- CallGraph CG(M);
-  // Propagate the entry counts on the callgraph.
- SyntheticCountsUtils<const CallGraph *>::propagate(
- &CG, GetCallSiteProfCount, [&](const CallGraphNode *N, Scaled64 New) {
- auto F = N->getFunction();
- if (!F || F->isDeclaration())
- return;
-
- Counts[F] += New;
- });
-
- // Set the counts as metadata.
- for (auto Entry : Counts) {
- Entry.first->setEntryCount(ProfileCount(
- Entry.second.template toInt<uint64_t>(), Function::PCT_Synthetic));
- }
-
- return PreservedAnalyses::all();
-}
+//=- SyntheticCountsPropagation.cpp - Propagate function counts --*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that synthesizes entry counts for
+// functions and attaches !prof metadata to functions with the synthesized
+// counts. The presence of !prof metadata with counter name set to
+// 'synthesized_function_entry_count' indicates that the value of the counter
+// is an estimate of the likely execution count of the function. This transform
+// is applied only in non-PGO mode, as functions get 'real' profile-based
+// function entry counts in PGO mode.
+//
+// The transformation works by first assigning some initial values to the entry
+// counts of all functions and then doing a top-down traversal of the
+// callgraph SCCs to propagate the counts. For each function, the set of
+// callsites and their relative block frequencies is gathered. The relative
+// block frequency is multiplied by the entry count of the caller and added to
+// the callee's entry count. For non-trivial SCCs, the new counts are computed
+// from the previous counts and updated in one shot.
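+//
+// For example (an illustrative sketch, not taken from any particular module):
+// if a caller has a synthetic entry count of 100 and contains a callsite whose
+// block frequency is twice the entry block frequency, that callsite contributes
+// 100 * 2 = 200 to the callee's entry count, in addition to the callee's own
+// initial count.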
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/SyntheticCountsPropagation.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/SyntheticCountsUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using Scaled64 = ScaledNumber<uint64_t>;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "synthetic-counts-propagation"
+
+/// Initial synthetic count assigned to functions.
+cl::opt<int>
+ InitialSyntheticCount("initial-synthetic-count", cl::Hidden, cl::init(10),
+ cl::ZeroOrMore,
+ cl::desc("Initial value of synthetic entry count."));
+
+/// Initial synthetic count assigned to inline functions.
+static cl::opt<int> InlineSyntheticCount(
+ "inline-synthetic-count", cl::Hidden, cl::init(15), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for inline functions."));
+
+/// Initial synthetic count assigned to cold functions.
+static cl::opt<int> ColdSyntheticCount(
+ "cold-synthetic-count", cl::Hidden, cl::init(5), cl::ZeroOrMore,
+ cl::desc("Initial synthetic entry count for cold functions."));
+
+// Assign initial synthetic entry counts to functions.
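+//
+// With the default option values this amounts to (illustrative summary, in the
+// order the checks below are applied): always_inline/inlinehint functions
+// start at 15, internal functions that are only ever called directly start at
+// 0, cold or noinline functions start at 5, and everything else starts at 10.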
+static void
+initializeCounts(Module &M, function_ref<void(Function *, uint64_t)> SetCount) {
+ auto MayHaveIndirectCalls = [](Function &F) {
+ for (auto *U : F.users()) {
+ if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
+ return true;
+ }
+ return false;
+ };
+
+ for (Function &F : M) {
+ uint64_t InitialCount = InitialSyntheticCount;
+ if (F.isDeclaration())
+ continue;
+ if (F.hasFnAttribute(Attribute::AlwaysInline) ||
+ F.hasFnAttribute(Attribute::InlineHint)) {
+ // Use a higher value for inline functions to account for the fact that
+ // these are usually beneficial to inline.
+ InitialCount = InlineSyntheticCount;
+ } else if (F.hasLocalLinkage() && !MayHaveIndirectCalls(F)) {
+ // Local functions without inline hints get counts only through
+ // propagation.
+ InitialCount = 0;
+ } else if (F.hasFnAttribute(Attribute::Cold) ||
+ F.hasFnAttribute(Attribute::NoInline)) {
+ // Use a lower value for noinline and cold functions.
+ InitialCount = ColdSyntheticCount;
+ }
+ SetCount(&F, InitialCount);
+ }
+}
+
+PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ DenseMap<Function *, Scaled64> Counts;
+ // Set initial entry counts.
+ initializeCounts(
+ M, [&](Function *F, uint64_t Count) { Counts[F] = Scaled64(Count, 0); });
+
+ // Edge includes information about the source. Hence ignore the first
+ // parameter.
+ auto GetCallSiteProfCount = [&](const CallGraphNode *,
+ const CallGraphNode::CallRecord &Edge) {
+ Optional<Scaled64> Res = None;
+ if (!Edge.first)
+ return Res;
+ CallBase &CB = *cast<CallBase>(*Edge.first);
+ Function *Caller = CB.getCaller();
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
+
+ // Now compute the callsite count from relative frequency and
+ // entry count:
+ BasicBlock *CSBB = CB.getParent();
+ Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
+ Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
+ BBCount /= EntryFreq;
+ BBCount *= Counts[Caller];
+ return Optional<Scaled64>(BBCount);
+ };
+
+ CallGraph CG(M);
+  // Propagate the entry counts on the callgraph.
+ SyntheticCountsUtils<const CallGraph *>::propagate(
+ &CG, GetCallSiteProfCount, [&](const CallGraphNode *N, Scaled64 New) {
+ auto F = N->getFunction();
+ if (!F || F->isDeclaration())
+ return;
+
+ Counts[F] += New;
+ });
+
+ // Set the counts as metadata.
+ for (auto Entry : Counts) {
+ Entry.first->setEntryCount(ProfileCount(
+ Entry.second.template toInt<uint64_t>(), Function::PCT_Synthetic));
+ }
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 24891b3392..225b4fe95f 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -1,549 +1,549 @@
-//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/ModuleSummaryAnalysis.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+//===- ThinLTOBitcodeWriter.cpp - Bitcode writing pass for ThinLTO --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Object/ModuleSymbolTable.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/IPO/FunctionImport.h"
-#include "llvm/Transforms/IPO/LowerTypeTests.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-using namespace llvm;
-
-namespace {
-
-// Promote each local-linkage entity defined by ExportM and used by ImportM by
-// changing visibility and appending the given ModuleId.
-void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
- SetVector<GlobalValue *> &PromoteExtra) {
- DenseMap<const Comdat *, Comdat *> RenamedComdats;
- for (auto &ExportGV : ExportM.global_values()) {
- if (!ExportGV.hasLocalLinkage())
- continue;
-
- auto Name = ExportGV.getName();
- GlobalValue *ImportGV = nullptr;
- if (!PromoteExtra.count(&ExportGV)) {
- ImportGV = ImportM.getNamedValue(Name);
- if (!ImportGV)
- continue;
- ImportGV->removeDeadConstantUsers();
- if (ImportGV->use_empty()) {
- ImportGV->eraseFromParent();
- continue;
- }
- }
-
- std::string NewName = (Name + ModuleId).str();
-
- if (const auto *C = ExportGV.getComdat())
- if (C->getName() == Name)
- RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
-
- ExportGV.setName(NewName);
- ExportGV.setLinkage(GlobalValue::ExternalLinkage);
- ExportGV.setVisibility(GlobalValue::HiddenVisibility);
-
- if (ImportGV) {
- ImportGV->setName(NewName);
- ImportGV->setVisibility(GlobalValue::HiddenVisibility);
- }
- }
-
- if (!RenamedComdats.empty())
- for (auto &GO : ExportM.global_objects())
- if (auto *C = GO.getComdat()) {
- auto Replacement = RenamedComdats.find(C);
- if (Replacement != RenamedComdats.end())
- GO.setComdat(Replacement->second);
- }
-}
-
-// Promote all internal (i.e. distinct) type ids used by the module by replacing
-// them with external type ids formed using the module id.
-//
-// Note that this needs to be done before we clone the module because each clone
-// will receive its own set of distinct metadata nodes.
-void promoteTypeIds(Module &M, StringRef ModuleId) {
- DenseMap<Metadata *, Metadata *> LocalToGlobal;
- auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
- Metadata *MD =
- cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
-
- if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
- Metadata *&GlobalMD = LocalToGlobal[MD];
- if (!GlobalMD) {
- std::string NewName = (Twine(LocalToGlobal.size()) + ModuleId).str();
- GlobalMD = MDString::get(M.getContext(), NewName);
- }
-
- CI->setArgOperand(ArgNo,
- MetadataAsValue::get(M.getContext(), GlobalMD));
- }
- };
-
- if (Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
- for (const Use &U : TypeTestFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
- ExternalizeTypeId(CI, 1);
- }
- }
-
- if (Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
- for (const Use &U : TypeCheckedLoadFunc->uses()) {
- auto CI = cast<CallInst>(U.getUser());
- ExternalizeTypeId(CI, 2);
- }
- }
-
- for (GlobalObject &GO : M.global_objects()) {
- SmallVector<MDNode *, 1> MDs;
- GO.getMetadata(LLVMContext::MD_type, MDs);
-
- GO.eraseMetadata(LLVMContext::MD_type);
- for (auto MD : MDs) {
- auto I = LocalToGlobal.find(MD->getOperand(1));
- if (I == LocalToGlobal.end()) {
- GO.addMetadata(LLVMContext::MD_type, *MD);
- continue;
- }
- GO.addMetadata(
- LLVMContext::MD_type,
- *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
- }
- }
-}
-
-// Drop unused globals, and drop type information from function declarations.
-// FIXME: If we made functions typeless then there would be no need to do this.
-void simplifyExternals(Module &M) {
- FunctionType *EmptyFT =
- FunctionType::get(Type::getVoidTy(M.getContext()), false);
-
- for (auto I = M.begin(), E = M.end(); I != E;) {
- Function &F = *I++;
- if (F.isDeclaration() && F.use_empty()) {
- F.eraseFromParent();
- continue;
- }
-
- if (!F.isDeclaration() || F.getFunctionType() == EmptyFT ||
- // Changing the type of an intrinsic may invalidate the IR.
- F.getName().startswith("llvm."))
- continue;
-
- Function *NewF =
- Function::Create(EmptyFT, GlobalValue::ExternalLinkage,
- F.getAddressSpace(), "", &M);
- NewF->setVisibility(F.getVisibility());
- NewF->takeName(&F);
- F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
- F.eraseFromParent();
- }
-
- for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
- GlobalVariable &GV = *I++;
- if (GV.isDeclaration() && GV.use_empty()) {
- GV.eraseFromParent();
- continue;
- }
- }
-}
-
-static void
-filterModule(Module *M,
- function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
- std::vector<GlobalValue *> V;
- for (GlobalValue &GV : M->global_values())
- if (!ShouldKeepDefinition(&GV))
- V.push_back(&GV);
-
- for (GlobalValue *GV : V)
- if (!convertToDeclaration(*GV))
- GV->eraseFromParent();
-}
-
-void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
- if (auto *F = dyn_cast<Function>(C))
- return Fn(F);
- if (isa<GlobalValue>(C))
- return;
- for (Value *Op : C->operands())
- forEachVirtualFunction(cast<Constant>(Op), Fn);
-}
-
-// If it's possible to split M into regular and thin LTO parts, do so and write
-// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
-// regular LTO bitcode file to OS.
-void splitAndWriteThinLTOBitcode(
- raw_ostream &OS, raw_ostream *ThinLinkOS,
- function_ref<AAResults &(Function &)> AARGetter, Module &M) {
- std::string ModuleId = getUniqueModuleId(&M);
- if (ModuleId.empty()) {
-    // We couldn't generate a module ID for this module, so write it out as a
- // regular LTO module with an index for summary-based dead stripping.
- ProfileSummaryInfo PSI(M);
- M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
- WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
-
- if (ThinLinkOS)
- // We don't have a ThinLTO part, but still write the module to the
- // ThinLinkOS if requested so that the expected output file is produced.
- WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
- &Index);
-
- return;
- }
-
- promoteTypeIds(M, ModuleId);
-
- // Returns whether a global or its associated global has attached type
- // metadata. The former may participate in CFI or whole-program
-  // devirtualization, so it needs to appear in the merged module instead of
- // the thin LTO module. Similarly, globals that are associated with globals
- // with type metadata need to appear in the merged module because they will
- // reference the global's section directly.
- auto HasTypeMetadata = [](const GlobalObject *GO) {
- if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated))
- if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0)))
- if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue()))
- if (AssocGO->hasMetadata(LLVMContext::MD_type))
- return true;
- return GO->hasMetadata(LLVMContext::MD_type);
- };
-
- // Collect the set of virtual functions that are eligible for virtual constant
- // propagation. Each eligible function must not access memory, must return
- // an integer of width <=64 bits, must take at least one argument, must not
- // use its first argument (assumed to be "this") and all arguments other than
- // the first one must be of <=64 bit integer type.
- //
- // Note that we test whether this copy of the function is readnone, rather
- // than testing function attributes, which must hold for any copy of the
- // function, even a less optimized version substituted at link time. This is
- // sound because the virtual constant propagation optimizations effectively
- // inline all implementations of the virtual function into each call site,
- // rather than using function attributes to perform local optimization.
- DenseSet<const Function *> EligibleVirtualFns;
- // If any member of a comdat lives in MergedM, put all members of that
- // comdat in MergedM to keep the comdat together.
- DenseSet<const Comdat *> MergedMComdats;
- for (GlobalVariable &GV : M.globals())
- if (HasTypeMetadata(&GV)) {
- if (const auto *C = GV.getComdat())
- MergedMComdats.insert(C);
- forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
- auto *RT = dyn_cast<IntegerType>(F->getReturnType());
- if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
- !F->arg_begin()->use_empty())
- return;
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/Transforms/IPO/LowerTypeTests.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+using namespace llvm;
+
+namespace {
+
+// Promote each local-linkage entity defined by ExportM and used by ImportM by
+// changing visibility and appending the given ModuleId.
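+//
+// For example (names here are hypothetical): an internal global @foo defined
+// in ExportM and referenced from ImportM is renamed to @foo<ModuleId>, given
+// external linkage and hidden visibility, and the matching declaration in
+// ImportM is renamed and made hidden as well.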
+void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId,
+ SetVector<GlobalValue *> &PromoteExtra) {
+ DenseMap<const Comdat *, Comdat *> RenamedComdats;
+ for (auto &ExportGV : ExportM.global_values()) {
+ if (!ExportGV.hasLocalLinkage())
+ continue;
+
+ auto Name = ExportGV.getName();
+ GlobalValue *ImportGV = nullptr;
+ if (!PromoteExtra.count(&ExportGV)) {
+ ImportGV = ImportM.getNamedValue(Name);
+ if (!ImportGV)
+ continue;
+ ImportGV->removeDeadConstantUsers();
+ if (ImportGV->use_empty()) {
+ ImportGV->eraseFromParent();
+ continue;
+ }
+ }
+
+ std::string NewName = (Name + ModuleId).str();
+
+ if (const auto *C = ExportGV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
+
+ ExportGV.setName(NewName);
+ ExportGV.setLinkage(GlobalValue::ExternalLinkage);
+ ExportGV.setVisibility(GlobalValue::HiddenVisibility);
+
+ if (ImportGV) {
+ ImportGV->setName(NewName);
+ ImportGV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+ }
+
+ if (!RenamedComdats.empty())
+ for (auto &GO : ExportM.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
+}
+
+// Promote all internal (i.e. distinct) type ids used by the module by replacing
+// them with external type ids formed using the module id.
+//
+// Note that this needs to be done before we clone the module because each clone
+// will receive its own set of distinct metadata nodes.
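+//
+// For example (illustrative): a distinct MDNode type id referenced from an
+// llvm.type.test or llvm.type.checked.load call is rewritten to a string type
+// id of the form <counter><ModuleId>, so both halves of the split module end
+// up referring to the same, non-distinct id.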
+void promoteTypeIds(Module &M, StringRef ModuleId) {
+ DenseMap<Metadata *, Metadata *> LocalToGlobal;
+ auto ExternalizeTypeId = [&](CallInst *CI, unsigned ArgNo) {
+ Metadata *MD =
+ cast<MetadataAsValue>(CI->getArgOperand(ArgNo))->getMetadata();
+
+ if (isa<MDNode>(MD) && cast<MDNode>(MD)->isDistinct()) {
+ Metadata *&GlobalMD = LocalToGlobal[MD];
+ if (!GlobalMD) {
+ std::string NewName = (Twine(LocalToGlobal.size()) + ModuleId).str();
+ GlobalMD = MDString::get(M.getContext(), NewName);
+ }
+
+ CI->setArgOperand(ArgNo,
+ MetadataAsValue::get(M.getContext(), GlobalMD));
+ }
+ };
+
+ if (Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test))) {
+ for (const Use &U : TypeTestFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 1);
+ }
+ }
+
+ if (Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) {
+ for (const Use &U : TypeCheckedLoadFunc->uses()) {
+ auto CI = cast<CallInst>(U.getUser());
+ ExternalizeTypeId(CI, 2);
+ }
+ }
+
+ for (GlobalObject &GO : M.global_objects()) {
+ SmallVector<MDNode *, 1> MDs;
+ GO.getMetadata(LLVMContext::MD_type, MDs);
+
+ GO.eraseMetadata(LLVMContext::MD_type);
+ for (auto MD : MDs) {
+ auto I = LocalToGlobal.find(MD->getOperand(1));
+ if (I == LocalToGlobal.end()) {
+ GO.addMetadata(LLVMContext::MD_type, *MD);
+ continue;
+ }
+ GO.addMetadata(
+ LLVMContext::MD_type,
+ *MDNode::get(M.getContext(), {MD->getOperand(0), I->second}));
+ }
+ }
+}
+
+// Drop unused globals, and drop type information from function declarations.
+// FIXME: If we made functions typeless then there would be no need to do this.
+void simplifyExternals(Module &M) {
+ FunctionType *EmptyFT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), false);
+
+ for (auto I = M.begin(), E = M.end(); I != E;) {
+ Function &F = *I++;
+ if (F.isDeclaration() && F.use_empty()) {
+ F.eraseFromParent();
+ continue;
+ }
+
+ if (!F.isDeclaration() || F.getFunctionType() == EmptyFT ||
+ // Changing the type of an intrinsic may invalidate the IR.
+ F.getName().startswith("llvm."))
+ continue;
+
+ Function *NewF =
+ Function::Create(EmptyFT, GlobalValue::ExternalLinkage,
+ F.getAddressSpace(), "", &M);
+ NewF->setVisibility(F.getVisibility());
+ NewF->takeName(&F);
+ F.replaceAllUsesWith(ConstantExpr::getBitCast(NewF, F.getType()));
+ F.eraseFromParent();
+ }
+
+ for (auto I = M.global_begin(), E = M.global_end(); I != E;) {
+ GlobalVariable &GV = *I++;
+ if (GV.isDeclaration() && GV.use_empty()) {
+ GV.eraseFromParent();
+ continue;
+ }
+ }
+}
+
+static void
+filterModule(Module *M,
+ function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+ std::vector<GlobalValue *> V;
+ for (GlobalValue &GV : M->global_values())
+ if (!ShouldKeepDefinition(&GV))
+ V.push_back(&GV);
+
+ for (GlobalValue *GV : V)
+ if (!convertToDeclaration(*GV))
+ GV->eraseFromParent();
+}
+
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+ if (auto *F = dyn_cast<Function>(C))
+ return Fn(F);
+ if (isa<GlobalValue>(C))
+ return;
+ for (Value *Op : C->operands())
+ forEachVirtualFunction(cast<Constant>(Op), Fn);
+}
+
+// If it's possible to split M into regular and thin LTO parts, do so and write
+// a multi-module bitcode file with the two parts to OS. Otherwise, write only a
+// regular LTO bitcode file to OS.
+void splitAndWriteThinLTOBitcode(
+ raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter, Module &M) {
+ std::string ModuleId = getUniqueModuleId(&M);
+ if (ModuleId.empty()) {
+    // We couldn't generate a module ID for this module, so write it out as a
+ // regular LTO module with an index for summary-based dead stripping.
+ ProfileSummaryInfo PSI(M);
+ M.addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, &Index);
+
+ if (ThinLinkOS)
+ // We don't have a ThinLTO part, but still write the module to the
+ // ThinLinkOS if requested so that the expected output file is produced.
+ WriteBitcodeToFile(M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+ &Index);
+
+ return;
+ }
+
+ promoteTypeIds(M, ModuleId);
+
+ // Returns whether a global or its associated global has attached type
+ // metadata. The former may participate in CFI or whole-program
+  // devirtualization, so it needs to appear in the merged module instead of
+ // the thin LTO module. Similarly, globals that are associated with globals
+ // with type metadata need to appear in the merged module because they will
+ // reference the global's section directly.
+ auto HasTypeMetadata = [](const GlobalObject *GO) {
+ if (MDNode *MD = GO->getMetadata(LLVMContext::MD_associated))
+ if (auto *AssocVM = dyn_cast_or_null<ValueAsMetadata>(MD->getOperand(0)))
+ if (auto *AssocGO = dyn_cast<GlobalObject>(AssocVM->getValue()))
+ if (AssocGO->hasMetadata(LLVMContext::MD_type))
+ return true;
+ return GO->hasMetadata(LLVMContext::MD_type);
+ };
+
+ // Collect the set of virtual functions that are eligible for virtual constant
+ // propagation. Each eligible function must not access memory, must return
+ // an integer of width <=64 bits, must take at least one argument, must not
+ // use its first argument (assumed to be "this") and all arguments other than
+ // the first one must be of <=64 bit integer type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
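+  //
+  // For example (illustrative): a readnone implementation such as
+  //   uint32_t Impl::getKind() const { return 3; }
+  // (which never touches its implicit 'this' argument) is eligible, whereas an
+  // implementation that returns a pointer, takes a 128-bit integer argument,
+  // or reads memory is not.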
+ DenseSet<const Function *> EligibleVirtualFns;
+ // If any member of a comdat lives in MergedM, put all members of that
+ // comdat in MergedM to keep the comdat together.
+ DenseSet<const Comdat *> MergedMComdats;
+ for (GlobalVariable &GV : M.globals())
+ if (HasTypeMetadata(&GV)) {
+ if (const auto *C = GV.getComdat())
+ MergedMComdats.insert(C);
+ forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+ auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+ if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+ !F->arg_begin()->use_empty())
+ return;
for (auto &Arg : drop_begin(F->args())) {
- auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
- if (!ArgT || ArgT->getBitWidth() > 64)
- return;
- }
- if (!F->isDeclaration() &&
- computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
- EligibleVirtualFns.insert(F);
- });
- }
-
- ValueToValueMapTy VMap;
- std::unique_ptr<Module> MergedM(
- CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
- if (const auto *C = GV->getComdat())
- if (MergedMComdats.count(C))
- return true;
- if (auto *F = dyn_cast<Function>(GV))
- return EligibleVirtualFns.count(F);
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
- return HasTypeMetadata(GVar);
- return false;
- }));
- StripDebugInfo(*MergedM);
- MergedM->setModuleInlineAsm("");
-
- for (Function &F : *MergedM)
- if (!F.isDeclaration()) {
- // Reset the linkage of all functions eligible for virtual constant
- // propagation. The canonical definitions live in the thin LTO module so
- // that they can be imported.
- F.setLinkage(GlobalValue::AvailableExternallyLinkage);
- F.setComdat(nullptr);
- }
-
- SetVector<GlobalValue *> CfiFunctions;
- for (auto &F : M)
- if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
- CfiFunctions.insert(&F);
-
- // Remove all globals with type metadata, globals with comdats that live in
- // MergedM, and aliases pointing to such globals from the thin LTO module.
- filterModule(&M, [&](const GlobalValue *GV) {
- if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
- if (HasTypeMetadata(GVar))
- return false;
- if (const auto *C = GV->getComdat())
- if (MergedMComdats.count(C))
- return false;
- return true;
- });
-
- promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
- promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
-
- auto &Ctx = MergedM->getContext();
- SmallVector<MDNode *, 8> CfiFunctionMDs;
- for (auto V : CfiFunctions) {
- Function &F = *cast<Function>(V);
- SmallVector<MDNode *, 2> Types;
- F.getMetadata(LLVMContext::MD_type, Types);
-
- SmallVector<Metadata *, 4> Elts;
- Elts.push_back(MDString::get(Ctx, F.getName()));
- CfiFunctionLinkage Linkage;
- if (lowertypetests::isJumpTableCanonical(&F))
- Linkage = CFL_Definition;
- else if (F.hasExternalWeakLinkage())
- Linkage = CFL_WeakDeclaration;
- else
- Linkage = CFL_Declaration;
- Elts.push_back(ConstantAsMetadata::get(
- llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
+ auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+ if (!ArgT || ArgT->getBitWidth() > 64)
+ return;
+ }
+ if (!F->isDeclaration() &&
+ computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+ EligibleVirtualFns.insert(F);
+ });
+ }
+
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MergedM(
+ CloneModule(M, VMap, [&](const GlobalValue *GV) -> bool {
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return true;
+ if (auto *F = dyn_cast<Function>(GV))
+ return EligibleVirtualFns.count(F);
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ return HasTypeMetadata(GVar);
+ return false;
+ }));
+ StripDebugInfo(*MergedM);
+ MergedM->setModuleInlineAsm("");
+
+ for (Function &F : *MergedM)
+ if (!F.isDeclaration()) {
+ // Reset the linkage of all functions eligible for virtual constant
+ // propagation. The canonical definitions live in the thin LTO module so
+ // that they can be imported.
+ F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+ F.setComdat(nullptr);
+ }
+
+ SetVector<GlobalValue *> CfiFunctions;
+ for (auto &F : M)
+ if ((!F.hasLocalLinkage() || F.hasAddressTaken()) && HasTypeMetadata(&F))
+ CfiFunctions.insert(&F);
+
+ // Remove all globals with type metadata, globals with comdats that live in
+ // MergedM, and aliases pointing to such globals from the thin LTO module.
+ filterModule(&M, [&](const GlobalValue *GV) {
+ if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+ if (HasTypeMetadata(GVar))
+ return false;
+ if (const auto *C = GV->getComdat())
+ if (MergedMComdats.count(C))
+ return false;
+ return true;
+ });
+
+ promoteInternals(*MergedM, M, ModuleId, CfiFunctions);
+ promoteInternals(M, *MergedM, ModuleId, CfiFunctions);
+
+ auto &Ctx = MergedM->getContext();
+ SmallVector<MDNode *, 8> CfiFunctionMDs;
+ for (auto V : CfiFunctions) {
+ Function &F = *cast<Function>(V);
+ SmallVector<MDNode *, 2> Types;
+ F.getMetadata(LLVMContext::MD_type, Types);
+
+ SmallVector<Metadata *, 4> Elts;
+ Elts.push_back(MDString::get(Ctx, F.getName()));
+ CfiFunctionLinkage Linkage;
+ if (lowertypetests::isJumpTableCanonical(&F))
+ Linkage = CFL_Definition;
+ else if (F.hasExternalWeakLinkage())
+ Linkage = CFL_WeakDeclaration;
+ else
+ Linkage = CFL_Declaration;
+ Elts.push_back(ConstantAsMetadata::get(
+ llvm::ConstantInt::get(Type::getInt8Ty(Ctx), Linkage)));
append_range(Elts, Types);
- CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
- }
-
-  if (!CfiFunctionMDs.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
- for (auto MD : CfiFunctionMDs)
- NMD->addOperand(MD);
- }
-
- SmallVector<MDNode *, 8> FunctionAliases;
- for (auto &A : M.aliases()) {
- if (!isa<Function>(A.getAliasee()))
- continue;
-
- auto *F = cast<Function>(A.getAliasee());
-
- Metadata *Elts[] = {
- MDString::get(Ctx, A.getName()),
- MDString::get(Ctx, F->getName()),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
- ConstantAsMetadata::get(
- ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
- };
-
- FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
- }
-
- if (!FunctionAliases.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
- for (auto MD : FunctionAliases)
- NMD->addOperand(MD);
- }
-
- SmallVector<MDNode *, 8> Symvers;
- ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
- Function *F = M.getFunction(Name);
- if (!F || F->use_empty())
- return;
-
- Symvers.push_back(MDTuple::get(
- Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
- });
-
- if (!Symvers.empty()) {
- NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
- for (auto MD : Symvers)
- NMD->addOperand(MD);
- }
-
- simplifyExternals(*MergedM);
-
- // FIXME: Try to re-use BSI and PFI from the original module here.
- ProfileSummaryInfo PSI(M);
- ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
-
- // Mark the merged module as requiring full LTO. We still want an index for
- // it though, so that it can participate in summary-based dead stripping.
- MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
- ModuleSummaryIndex MergedMIndex =
- buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
-
- SmallVector<char, 0> Buffer;
-
- BitcodeWriter W(Buffer);
- // Save the module hash produced for the full bitcode, which will
- // be used in the backends, and use that in the minimized bitcode
- // produced for the full link.
- ModuleHash ModHash = {{0}};
- W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
- /*GenerateHash=*/true, &ModHash);
- W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
- W.writeSymtab();
- W.writeStrtab();
- OS << Buffer;
-
- // If a minimized bitcode module was requested for the thin link, only
-  // the information that is needed by the thin link will be written to the
- // given OS (the merged module will be written as usual).
- if (ThinLinkOS) {
- Buffer.clear();
- BitcodeWriter W2(Buffer);
- StripDebugInfo(M);
- W2.writeThinLinkBitcode(M, Index, ModHash);
- W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
- &MergedMIndex);
- W2.writeSymtab();
- W2.writeStrtab();
- *ThinLinkOS << Buffer;
- }
-}
-
-// Check whether LTO unit splitting has been enabled.
-bool enableSplitLTOUnit(Module &M) {
- bool EnableSplitLTOUnit = false;
- if (auto *MD = mdconst::extract_or_null<ConstantInt>(
- M.getModuleFlag("EnableSplitLTOUnit")))
- EnableSplitLTOUnit = MD->getZExtValue();
- return EnableSplitLTOUnit;
-}
-
-// Returns whether this module needs to be split because it uses type metadata.
-bool hasTypeMetadata(Module &M) {
- for (auto &GO : M.global_objects()) {
- if (GO.hasMetadata(LLVMContext::MD_type))
- return true;
- }
- return false;
-}
-
-void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
- function_ref<AAResults &(Function &)> AARGetter,
- Module &M, const ModuleSummaryIndex *Index) {
- std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
- // See if this module has any type metadata. If so, we try to split it
- // or at least promote type ids to enable WPD.
- if (hasTypeMetadata(M)) {
- if (enableSplitLTOUnit(M))
- return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
- // Promote type ids as needed for index-based WPD.
- std::string ModuleId = getUniqueModuleId(&M);
- if (!ModuleId.empty()) {
- promoteTypeIds(M, ModuleId);
- // Need to rebuild the index so that it contains type metadata
- // for the newly promoted type ids.
- // FIXME: Probably should not bother building the index at all
- // in the caller of writeThinLTOBitcode (which does so via the
- // ModuleSummaryIndexAnalysis pass), since we have to rebuild it
- // anyway whenever there is type metadata (here or in
- // splitAndWriteThinLTOBitcode). Just always build it once via the
- // buildModuleSummaryIndex when Module(s) are ready.
- ProfileSummaryInfo PSI(M);
- NewIndex = std::make_unique<ModuleSummaryIndex>(
- buildModuleSummaryIndex(M, nullptr, &PSI));
- Index = NewIndex.get();
- }
- }
-
- // Write it out as an unsplit ThinLTO module.
-
- // Save the module hash produced for the full bitcode, which will
- // be used in the backends, and use that in the minimized bitcode
- // produced for the full link.
- ModuleHash ModHash = {{0}};
- WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
- /*GenerateHash=*/true, &ModHash);
- // If a minimized bitcode module was requested for the thin link, only
-  // the information that is needed by the thin link will be written to the
- // given OS.
- if (ThinLinkOS && Index)
- WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
-}
-
-class WriteThinLTOBitcode : public ModulePass {
- raw_ostream &OS; // raw_ostream to print on
- // The output stream on which to emit a minimized module for use
- // just in the thin link, if requested.
- raw_ostream *ThinLinkOS;
-
-public:
- static char ID; // Pass identification, replacement for typeid
- WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
- initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
- }
-
- explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
- : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
- initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
-
- bool runOnModule(Module &M) override {
- const ModuleSummaryIndex *Index =
- &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
- writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
- return true;
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ModuleSummaryIndexWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-} // anonymous namespace
-
-char WriteThinLTOBitcode::ID = 0;
-INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
- "Write ThinLTO Bitcode", false, true)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
- "Write ThinLTO Bitcode", false, true)
-
-ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
- raw_ostream *ThinLinkOS) {
- return new WriteThinLTOBitcode(Str, ThinLinkOS);
-}
-
-PreservedAnalyses
-llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- writeThinLTOBitcode(OS, ThinLinkOS,
- [&FAM](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- },
- M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
- return PreservedAnalyses::all();
-}
+ CfiFunctionMDs.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+  if (!CfiFunctionMDs.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("cfi.functions");
+ for (auto MD : CfiFunctionMDs)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> FunctionAliases;
+ for (auto &A : M.aliases()) {
+ if (!isa<Function>(A.getAliasee()))
+ continue;
+
+ auto *F = cast<Function>(A.getAliasee());
+
+ Metadata *Elts[] = {
+ MDString::get(Ctx, A.getName()),
+ MDString::get(Ctx, F->getName()),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.getVisibility())),
+ ConstantAsMetadata::get(
+ ConstantInt::get(Type::getInt8Ty(Ctx), A.isWeakForLinker())),
+ };
+
+ FunctionAliases.push_back(MDTuple::get(Ctx, Elts));
+ }
+
+ if (!FunctionAliases.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("aliases");
+ for (auto MD : FunctionAliases)
+ NMD->addOperand(MD);
+ }
+
+ SmallVector<MDNode *, 8> Symvers;
+ ModuleSymbolTable::CollectAsmSymvers(M, [&](StringRef Name, StringRef Alias) {
+ Function *F = M.getFunction(Name);
+ if (!F || F->use_empty())
+ return;
+
+ Symvers.push_back(MDTuple::get(
+ Ctx, {MDString::get(Ctx, Name), MDString::get(Ctx, Alias)}));
+ });
+
+ if (!Symvers.empty()) {
+ NamedMDNode *NMD = MergedM->getOrInsertNamedMetadata("symvers");
+ for (auto MD : Symvers)
+ NMD->addOperand(MD);
+ }
+
+ simplifyExternals(*MergedM);
+
+ // FIXME: Try to re-use BSI and PFI from the original module here.
+ ProfileSummaryInfo PSI(M);
+ ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);
+
+ // Mark the merged module as requiring full LTO. We still want an index for
+ // it though, so that it can participate in summary-based dead stripping.
+ MergedM->addModuleFlag(Module::Error, "ThinLTO", uint32_t(0));
+ ModuleSummaryIndex MergedMIndex =
+ buildModuleSummaryIndex(*MergedM, nullptr, &PSI);
+
+ SmallVector<char, 0> Buffer;
+
+ BitcodeWriter W(Buffer);
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ W.writeModule(M, /*ShouldPreserveUseListOrder=*/false, &Index,
+ /*GenerateHash=*/true, &ModHash);
+ W.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false, &MergedMIndex);
+ W.writeSymtab();
+ W.writeStrtab();
+ OS << Buffer;
+
+ // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by the thin link will be written to the
+ // given OS (the merged module will be written as usual).
+ if (ThinLinkOS) {
+ Buffer.clear();
+ BitcodeWriter W2(Buffer);
+ StripDebugInfo(M);
+ W2.writeThinLinkBitcode(M, Index, ModHash);
+ W2.writeModule(*MergedM, /*ShouldPreserveUseListOrder=*/false,
+ &MergedMIndex);
+ W2.writeSymtab();
+ W2.writeStrtab();
+ *ThinLinkOS << Buffer;
+ }
+}
+
+// Check whether LTO unit splitting has been enabled.
+bool enableSplitLTOUnit(Module &M) {
+ bool EnableSplitLTOUnit = false;
+ if (auto *MD = mdconst::extract_or_null<ConstantInt>(
+ M.getModuleFlag("EnableSplitLTOUnit")))
+ EnableSplitLTOUnit = MD->getZExtValue();
+ return EnableSplitLTOUnit;
+}
+
+// Returns whether this module needs to be split because it uses type metadata.
+bool hasTypeMetadata(Module &M) {
+ for (auto &GO : M.global_objects()) {
+ if (GO.hasMetadata(LLVMContext::MD_type))
+ return true;
+ }
+ return false;
+}
+
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+ function_ref<AAResults &(Function &)> AARGetter,
+ Module &M, const ModuleSummaryIndex *Index) {
+ std::unique_ptr<ModuleSummaryIndex> NewIndex = nullptr;
+ // See if this module has any type metadata. If so, we try to split it
+ // or at least promote type ids to enable WPD.
+ if (hasTypeMetadata(M)) {
+ if (enableSplitLTOUnit(M))
+ return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
+ // Promote type ids as needed for index-based WPD.
+ std::string ModuleId = getUniqueModuleId(&M);
+ if (!ModuleId.empty()) {
+ promoteTypeIds(M, ModuleId);
+ // Need to rebuild the index so that it contains type metadata
+ // for the newly promoted type ids.
+ // FIXME: Probably should not bother building the index at all
+ // in the caller of writeThinLTOBitcode (which does so via the
+ // ModuleSummaryIndexAnalysis pass), since we have to rebuild it
+ // anyway whenever there is type metadata (here or in
+ // splitAndWriteThinLTOBitcode). Just always build it once via the
+ // buildModuleSummaryIndex when Module(s) are ready.
+ ProfileSummaryInfo PSI(M);
+ NewIndex = std::make_unique<ModuleSummaryIndex>(
+ buildModuleSummaryIndex(M, nullptr, &PSI));
+ Index = NewIndex.get();
+ }
+ }
+
+ // Write it out as an unsplit ThinLTO module.
+
+ // Save the module hash produced for the full bitcode, which will
+ // be used in the backends, and use that in the minimized bitcode
+ // produced for the full link.
+ ModuleHash ModHash = {{0}};
+ WriteBitcodeToFile(M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
+ /*GenerateHash=*/true, &ModHash);
+ // If a minimized bitcode module was requested for the thin link, only
+  // the information that is needed by the thin link will be written to the
+ // given OS.
+ if (ThinLinkOS && Index)
+ WriteThinLinkBitcodeToFile(M, *ThinLinkOS, *Index, ModHash);
+}
+
+class WriteThinLTOBitcode : public ModulePass {
+ raw_ostream &OS; // raw_ostream to print on
+ // The output stream on which to emit a minimized module for use
+ // just in the thin link, if requested.
+ raw_ostream *ThinLinkOS;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+ : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
+ initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ThinLTO Bitcode Writer"; }
+
+ bool runOnModule(Module &M) override {
+ const ModuleSummaryIndex *Index =
+ &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
+ writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
+ return true;
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ModuleSummaryIndexWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // anonymous namespace
+
+char WriteThinLTOBitcode::ID = 0;
+INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
+ "Write ThinLTO Bitcode", false, true)
+
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+ raw_ostream *ThinLinkOS) {
+ return new WriteThinLTOBitcode(Str, ThinLinkOS);
+}
+
+PreservedAnalyses
+llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ writeThinLTOBitcode(OS, ThinLinkOS,
+ [&FAM](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ },
+ M, &AM.getResult<ModuleSummaryIndexAnalysis>(M));
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp b/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
index d515fe9ed9..cf1ff405c4 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -1,2216 +1,2216 @@
-//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements whole program optimization of virtual calls in cases
-// where we know (via !type metadata) that the list of callees is fixed. This
-// includes the following:
-// - Single implementation devirtualization: if a virtual call has a single
-// possible callee, replace all calls with a direct call to that callee.
-// - Virtual constant propagation: if the virtual function's return type is an
-// integer <=64 bits and all possible callees are readnone, for each class and
-// each list of constant arguments: evaluate the function, store the return
-// value alongside the virtual table, and rewrite each virtual call as a load
-// from the virtual table.
-// - Uniform return value optimization: if the conditions for virtual constant
-// propagation hold and each function returns the same constant value, replace
-// each virtual call with that constant.
-// - Unique return value optimization for i1 return values: if the conditions
-// for virtual constant propagation hold and a single vtable's function
-// returns 0, or a single vtable's function returns 1, replace each virtual
-// call with a comparison of the vptr against that vtable's address.
-//
-// This pass is intended to be used during the regular and thin LTO pipelines:
-//
-// During regular LTO, the pass determines the best optimization for each
-// virtual call and applies the resolutions directly to virtual calls that are
-// eligible for virtual call optimization (i.e. calls that use either of the
-// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics).
-//
-// During hybrid Regular/ThinLTO, the pass operates in two phases:
-// - Export phase: this is run during the thin link over a single merged module
-// that contains all vtables with !type metadata that participate in the link.
-// The pass computes a resolution for each virtual call and stores it in the
-// type identifier summary.
-// - Import phase: this is run during the thin backends over the individual
-// modules. The pass applies the resolutions previously computed during the
-//   export phase to each eligible virtual call.
-//
-// During ThinLTO, the pass operates in two phases:
-// - Export phase: this is run during the thin link over the index which
-// contains a summary of all vtables with !type metadata that participate in
-// the link. It computes a resolution for each virtual call and stores it in
-// the type identifier summary. Only single implementation devirtualization
-// is supported.
-// - Import phase: (same as with hybrid case above).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/iterator_range.h"
+//===- WholeProgramDevirt.cpp - Whole program virtual call optimization ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements whole program optimization of virtual calls in cases
+// where we know (via !type metadata) that the list of callees is fixed. This
+// includes the following:
+// - Single implementation devirtualization: if a virtual call has a single
+// possible callee, replace all calls with a direct call to that callee.
+// - Virtual constant propagation: if the virtual function's return type is an
+// integer <=64 bits and all possible callees are readnone, for each class and
+// each list of constant arguments: evaluate the function, store the return
+// value alongside the virtual table, and rewrite each virtual call as a load
+// from the virtual table.
+// - Uniform return value optimization: if the conditions for virtual constant
+// propagation hold and each function returns the same constant value, replace
+// each virtual call with that constant.
+// - Unique return value optimization for i1 return values: if the conditions
+// for virtual constant propagation hold and a single vtable's function
+// returns 0, or a single vtable's function returns 1, replace each virtual
+// call with a comparison of the vptr against that vtable's address.
+//
+// This pass is intended to be used during the regular and thin LTO pipelines:
+//
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either of the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics).
+//
+// During hybrid Regular/ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+// that contains all vtables with !type metadata that participate in the link.
+// The pass computes a resolution for each virtual call and stores it in the
+// type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+// modules. The pass applies the resolutions previously computed during the
+//   export phase to each eligible virtual call.
+//
+// During ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over the index which
+// contains a summary of all vtables with !type metadata that participate in
+// the link. It computes a resolution for each virtual call and stores it in
+// the type identifier summary. Only single implementation devirtualization
+// is supported.
+// - Import phase: (same as with hybrid case above).
+//
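+// For example (an illustrative sketch; the class and values are hypothetical):
+//
+//   struct A { virtual int f() { return 42; } };
+//   int g(A *p) { return p->f(); }
+//
+// If A::f is the only possible callee of the call in g, single implementation
+// devirtualization turns p->f() into a direct call to A::f; and because A::f
+// is readnone and every possible callee returns the same constant, the uniform
+// return value optimization can replace the call with the constant 42.
+//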
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/Bitcode/BitcodeWriter.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndexYAML.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Errc.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/GlobPattern.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/Transforms/IPO/FunctionAttrs.h"
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include <algorithm>
-#include <cstddef>
-#include <map>
-#include <set>
-#include <string>
-
-using namespace llvm;
-using namespace wholeprogramdevirt;
-
-#define DEBUG_TYPE "wholeprogramdevirt"
-
-static cl::opt<PassSummaryAction> ClSummaryAction(
- "wholeprogramdevirt-summary-action",
- cl::desc("What to do with the summary when running this pass"),
- cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
- clEnumValN(PassSummaryAction::Import, "import",
- "Import typeid resolutions from summary and globals"),
- clEnumValN(PassSummaryAction::Export, "export",
- "Export typeid resolutions to summary and globals")),
- cl::Hidden);
-
-static cl::opt<std::string> ClReadSummary(
- "wholeprogramdevirt-read-summary",
- cl::desc(
- "Read summary from given bitcode or YAML file before running pass"),
- cl::Hidden);
-
-static cl::opt<std::string> ClWriteSummary(
- "wholeprogramdevirt-write-summary",
- cl::desc("Write summary to given bitcode or YAML file after running pass. "
- "Output file format is deduced from extension: *.bc means writing "
- "bitcode, otherwise YAML"),
- cl::Hidden);
-
-static cl::opt<unsigned>
- ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
- cl::init(10), cl::ZeroOrMore,
- cl::desc("Maximum number of call targets per "
- "call site to enable branch funnels"));
-
-static cl::opt<bool>
- PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden,
- cl::init(false), cl::ZeroOrMore,
- cl::desc("Print index-based devirtualization messages"));
-
-/// Provide a way to force enable whole program visibility in tests.
-/// This is needed to support legacy tests that don't contain
-/// !vcall_visibility metadata (the mere presence of type tests
-/// previously implied hidden visibility).
-cl::opt<bool>
- WholeProgramVisibility("whole-program-visibility", cl::init(false),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("Enable whole program visibility"));
-
-/// Provide a way to force disable whole program visibility for debugging or
-/// workarounds when it has been enabled via the linker.
-cl::opt<bool> DisableWholeProgramVisibility(
- "disable-whole-program-visibility", cl::init(false), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("Disable whole program visibility (overrides enabling options)"));
-
-/// Provide a way to prevent certain functions from being devirtualized.
-cl::list<std::string>
- SkipFunctionNames("wholeprogramdevirt-skip",
- cl::desc("Prevent function(s) from being devirtualized"),
- cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated);
-
-namespace {
-struct PatternList {
- std::vector<GlobPattern> Patterns;
- template <class T> void init(const T &StringList) {
- for (const auto &S : StringList)
- if (Expected<GlobPattern> Pat = GlobPattern::create(S))
- Patterns.push_back(std::move(*Pat));
- }
- bool match(StringRef S) {
- for (const GlobPattern &P : Patterns)
- if (P.match(S))
- return true;
- return false;
- }
-};
-} // namespace
-
-// Find the minimum offset that we may store a value of size Size bits at. If
-// IsAfter is set, look for an offset after the object, otherwise look for an
-// offset before the object.
-uint64_t
-wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
- bool IsAfter, uint64_t Size) {
- // Find a minimum offset taking into account only vtable sizes.
- uint64_t MinByte = 0;
- for (const VirtualCallTarget &Target : Targets) {
- if (IsAfter)
- MinByte = std::max(MinByte, Target.minAfterBytes());
- else
- MinByte = std::max(MinByte, Target.minBeforeBytes());
- }
-
- // Build a vector of arrays of bytes covering, for each target, a slice of the
- // used region (see AccumBitVector::BytesUsed in
- // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
- // this aligns the used regions to start at MinByte.
- //
- // In this example, A, B and C are vtables, # is a byte already allocated for
- // a virtual function pointer, AAAA... (etc.) are the used regions for the
- // vtables and Offset(X) is the value computed for the Offset variable below
- // for X.
- //
- // Offset(A)
- // | |
- // |MinByte
- // A: ################AAAAAAAA|AAAAAAAA
- // B: ########BBBBBBBBBBBBBBBB|BBBB
- // C: ########################|CCCCCCCCCCCCCCCC
- // | Offset(B) |
- //
- // This code produces the slices of A, B and C that appear after the divider
- // at MinByte.
- std::vector<ArrayRef<uint8_t>> Used;
- for (const VirtualCallTarget &Target : Targets) {
- ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
- : Target.TM->Bits->Before.BytesUsed;
- uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
- : MinByte - Target.minBeforeBytes();
-
- // Disregard used regions that are smaller than Offset. These are
- // effectively all-free regions that do not need to be checked.
- if (VTUsed.size() > Offset)
- Used.push_back(VTUsed.slice(Offset));
- }
-
- if (Size == 1) {
- // Find a free bit in each member of Used.
- for (unsigned I = 0;; ++I) {
- uint8_t BitsUsed = 0;
- for (auto &&B : Used)
- if (I < B.size())
- BitsUsed |= B[I];
- if (BitsUsed != 0xff)
- return (MinByte + I) * 8 +
- countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
- }
- } else {
- // Find a free (Size/8) byte region in each member of Used.
- // FIXME: see if alignment helps.
- for (unsigned I = 0;; ++I) {
- for (auto &&B : Used) {
- unsigned Byte = 0;
- while ((I + Byte) < B.size() && Byte < (Size / 8)) {
- if (B[I + Byte])
- goto NextI;
- ++Byte;
- }
- }
- return (MinByte + I) * 8;
- NextI:;
- }
- }
-}
-
-void wholeprogramdevirt::setBeforeReturnValues(
- MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
- unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
- if (BitWidth == 1)
- OffsetByte = -(AllocBefore / 8 + 1);
- else
- OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
- OffsetBit = AllocBefore % 8;
-
- for (VirtualCallTarget &Target : Targets) {
- if (BitWidth == 1)
- Target.setBeforeBit(AllocBefore);
- else
- Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
- }
-}
-
-void wholeprogramdevirt::setAfterReturnValues(
- MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
- unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
- if (BitWidth == 1)
- OffsetByte = AllocAfter / 8;
- else
- OffsetByte = (AllocAfter + 7) / 8;
- OffsetBit = AllocAfter % 8;
-
- for (VirtualCallTarget &Target : Targets) {
- if (BitWidth == 1)
- Target.setAfterBit(AllocAfter);
- else
- Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
- }
-}
-
-VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
- : Fn(Fn), TM(TM),
- IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()), WasDevirt(false) {}
-
-namespace {
-
-// A slot in a set of virtual tables. The TypeID identifies the set of virtual
-// tables, and the ByteOffset is the offset in bytes from the address point to
-// the virtual function pointer.
-struct VTableSlot {
- Metadata *TypeID;
- uint64_t ByteOffset;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<VTableSlot> {
- static VTableSlot getEmptyKey() {
- return {DenseMapInfo<Metadata *>::getEmptyKey(),
- DenseMapInfo<uint64_t>::getEmptyKey()};
- }
- static VTableSlot getTombstoneKey() {
- return {DenseMapInfo<Metadata *>::getTombstoneKey(),
- DenseMapInfo<uint64_t>::getTombstoneKey()};
- }
- static unsigned getHashValue(const VTableSlot &I) {
- return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
- DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
- }
- static bool isEqual(const VTableSlot &LHS,
- const VTableSlot &RHS) {
- return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
- }
-};
-
-template <> struct DenseMapInfo<VTableSlotSummary> {
- static VTableSlotSummary getEmptyKey() {
- return {DenseMapInfo<StringRef>::getEmptyKey(),
- DenseMapInfo<uint64_t>::getEmptyKey()};
- }
- static VTableSlotSummary getTombstoneKey() {
- return {DenseMapInfo<StringRef>::getTombstoneKey(),
- DenseMapInfo<uint64_t>::getTombstoneKey()};
- }
- static unsigned getHashValue(const VTableSlotSummary &I) {
- return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^
- DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
- }
- static bool isEqual(const VTableSlotSummary &LHS,
- const VTableSlotSummary &RHS) {
- return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
- }
-};
-
-} // end namespace llvm
-
-namespace {
-
-// A virtual call site. VTable is the loaded virtual table pointer, and CS is
-// the indirect virtual call.
-struct VirtualCallSite {
- Value *VTable = nullptr;
- CallBase &CB;
-
- // If non-null, this field points to the associated unsafe use count stored in
- // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
- // of that field for details.
- unsigned *NumUnsafeUses = nullptr;
-
- void
- emitRemark(const StringRef OptName, const StringRef TargetName,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
- Function *F = CB.getCaller();
- DebugLoc DLoc = CB.getDebugLoc();
- BasicBlock *Block = CB.getParent();
-
- using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
- << NV("Optimization", OptName)
- << ": devirtualized a call to "
- << NV("FunctionName", TargetName));
- }
-
- void replaceAndErase(
- const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- Value *New) {
- if (RemarksEnabled)
- emitRemark(OptName, TargetName, OREGetter);
- CB.replaceAllUsesWith(New);
- if (auto *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst::Create(II->getNormalDest(), &CB);
- II->getUnwindDest()->removePredecessor(II->getParent());
- }
- CB.eraseFromParent();
- // This use is no longer unsafe.
- if (NumUnsafeUses)
- --*NumUnsafeUses;
- }
-};
-
-// Call site information collected for a specific VTableSlot and possibly a list
-// of constant integer arguments. The grouping by arguments is handled by the
-// VTableSlotInfo class.
-struct CallSiteInfo {
- /// The set of call sites for this slot. Used during regular LTO and the
- /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
- /// call sites that appear in the merged module itself); in each of these
- /// cases we are directly operating on the call sites at the IR level.
- std::vector<VirtualCallSite> CallSites;
-
- /// Whether all call sites represented by this CallSiteInfo, including those
- /// in summaries, have been devirtualized. This starts off as true because a
- /// default constructed CallSiteInfo represents no call sites.
- bool AllCallSitesDevirted = true;
-
- // These fields are used during the export phase of ThinLTO and reflect
- // information collected from function summaries.
-
- /// Whether any function summary contains an llvm.assume(llvm.type.test) for
- /// this slot.
- bool SummaryHasTypeTestAssumeUsers = false;
-
- /// CFI-specific: a vector containing the list of function summaries that use
- /// the llvm.type.checked.load intrinsic and therefore will require
- /// resolutions for llvm.type.test in order to implement CFI checks if
- /// devirtualization was unsuccessful. If devirtualization was successful, the
- /// pass will clear this vector by calling markDevirt(). If at the end of the
- /// pass the vector is non-empty, we will need to add a use of llvm.type.test
- /// to each of the function summaries in the vector.
- std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
- std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
-
- bool isExported() const {
- return SummaryHasTypeTestAssumeUsers ||
- !SummaryTypeCheckedLoadUsers.empty();
- }
-
- void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
- SummaryTypeCheckedLoadUsers.push_back(FS);
- AllCallSitesDevirted = false;
- }
-
- void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
- SummaryTypeTestAssumeUsers.push_back(FS);
- SummaryHasTypeTestAssumeUsers = true;
- AllCallSitesDevirted = false;
- }
-
- void markDevirt() {
- AllCallSitesDevirted = true;
-
- // As explained in the comment for SummaryTypeCheckedLoadUsers.
- SummaryTypeCheckedLoadUsers.clear();
- }
-};
-
-// Call site information collected for a specific VTableSlot.
-struct VTableSlotInfo {
- // The set of call sites which do not have all constant integer arguments
- // (excluding "this").
- CallSiteInfo CSInfo;
-
- // The set of call sites with all constant integer arguments (excluding
- // "this"), grouped by argument list.
- std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
-
- void addCallSite(Value *VTable, CallBase &CB, unsigned *NumUnsafeUses);
-
-private:
- CallSiteInfo &findCallSiteInfo(CallBase &CB);
-};
-
-CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallBase &CB) {
- std::vector<uint64_t> Args;
- auto *CBType = dyn_cast<IntegerType>(CB.getType());
- if (!CBType || CBType->getBitWidth() > 64 || CB.arg_empty())
- return CSInfo;
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GlobPattern.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include <algorithm>
+#include <cstddef>
+#include <map>
+#include <set>
+#include <string>
+
+using namespace llvm;
+using namespace wholeprogramdevirt;
+
+#define DEBUG_TYPE "wholeprogramdevirt"
+
+static cl::opt<PassSummaryAction> ClSummaryAction(
+ "wholeprogramdevirt-summary-action",
+ cl::desc("What to do with the summary when running this pass"),
+ cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+ clEnumValN(PassSummaryAction::Import, "import",
+ "Import typeid resolutions from summary and globals"),
+ clEnumValN(PassSummaryAction::Export, "export",
+ "Export typeid resolutions to summary and globals")),
+ cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+ "wholeprogramdevirt-read-summary",
+ cl::desc(
+ "Read summary from given bitcode or YAML file before running pass"),
+ cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+ "wholeprogramdevirt-write-summary",
+ cl::desc("Write summary to given bitcode or YAML file after running pass. "
+ "Output file format is deduced from extension: *.bc means writing "
+ "bitcode, otherwise YAML"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ ClThreshold("wholeprogramdevirt-branch-funnel-threshold", cl::Hidden,
+ cl::init(10), cl::ZeroOrMore,
+ cl::desc("Maximum number of call targets per "
+ "call site to enable branch funnels"));
+
+static cl::opt<bool>
+ PrintSummaryDevirt("wholeprogramdevirt-print-index-based", cl::Hidden,
+ cl::init(false), cl::ZeroOrMore,
+ cl::desc("Print index-based devirtualization messages"));
+
+/// Provide a way to force enable whole program visibility in tests.
+/// This is needed to support legacy tests that don't contain
+/// !vcall_visibility metadata (the mere presence of type tests
+/// previously implied hidden visibility).
+cl::opt<bool>
+ WholeProgramVisibility("whole-program-visibility", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable whole program visibility"));
+
+/// Provide a way to force disable whole program visibility for debugging or
+/// workarounds when it has been enabled via the linker.
+cl::opt<bool> DisableWholeProgramVisibility(
+ "disable-whole-program-visibility", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("Disable whole program visibility (overrides enabling options)"));
+
+/// Provide a way to prevent certain functions from being devirtualized.
+cl::list<std::string>
+ SkipFunctionNames("wholeprogramdevirt-skip",
+ cl::desc("Prevent function(s) from being devirtualized"),
+ cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated);
+
+namespace {
+struct PatternList {
+ std::vector<GlobPattern> Patterns;
+ template <class T> void init(const T &StringList) {
+ for (const auto &S : StringList)
+ if (Expected<GlobPattern> Pat = GlobPattern::create(S))
+ Patterns.push_back(std::move(*Pat));
+ }
+ bool match(StringRef S) {
+ for (const GlobPattern &P : Patterns)
+ if (P.match(S))
+ return true;
+ return false;
+ }
+};
+} // namespace
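+
+// Illustrative sketch (not part of the pass) of how the
+// -wholeprogramdevirt-skip globs above are consumed: DevirtModule and
+// DevirtIndex call FunctionsToSkip.init(SkipFunctionNames) in their
+// constructors and FunctionsToSkip.match(Fn->getName()) when filtering call
+// targets. The names in the snippet below are hypothetical.
+#if 0
+  PatternList Skip;
+  Skip.init(std::vector<std::string>{"_ZN3Foo*", "*bar*"}); // glob patterns
+  bool Skipped = Skip.match("_ZN3Foo3runEv");               // true
+#endif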
+
+// Find the minimum offset that we may store a value of size Size bits at. If
+// IsAfter is set, look for an offset after the object, otherwise look for an
+// offset before the object.
+uint64_t
+wholeprogramdevirt::findLowestOffset(ArrayRef<VirtualCallTarget> Targets,
+ bool IsAfter, uint64_t Size) {
+ // Find a minimum offset taking into account only vtable sizes.
+ uint64_t MinByte = 0;
+ for (const VirtualCallTarget &Target : Targets) {
+ if (IsAfter)
+ MinByte = std::max(MinByte, Target.minAfterBytes());
+ else
+ MinByte = std::max(MinByte, Target.minBeforeBytes());
+ }
+
+ // Build a vector of arrays of bytes covering, for each target, a slice of the
+ // used region (see AccumBitVector::BytesUsed in
+ // llvm/Transforms/IPO/WholeProgramDevirt.h) starting at MinByte. Effectively,
+ // this aligns the used regions to start at MinByte.
+ //
+ // In this example, A, B and C are vtables, # is a byte already allocated for
+ // a virtual function pointer, AAAA... (etc.) are the used regions for the
+ // vtables and Offset(X) is the value computed for the Offset variable below
+ // for X.
+ //
+ // Offset(A)
+ // | |
+ // |MinByte
+ // A: ################AAAAAAAA|AAAAAAAA
+ // B: ########BBBBBBBBBBBBBBBB|BBBB
+ // C: ########################|CCCCCCCCCCCCCCCC
+ // | Offset(B) |
+ //
+ // This code produces the slices of A, B and C that appear after the divider
+ // at MinByte.
+ std::vector<ArrayRef<uint8_t>> Used;
+ for (const VirtualCallTarget &Target : Targets) {
+ ArrayRef<uint8_t> VTUsed = IsAfter ? Target.TM->Bits->After.BytesUsed
+ : Target.TM->Bits->Before.BytesUsed;
+ uint64_t Offset = IsAfter ? MinByte - Target.minAfterBytes()
+ : MinByte - Target.minBeforeBytes();
+
+ // Disregard used regions that are smaller than Offset. These are
+ // effectively all-free regions that do not need to be checked.
+ if (VTUsed.size() > Offset)
+ Used.push_back(VTUsed.slice(Offset));
+ }
+
+ if (Size == 1) {
+ // Find a free bit in each member of Used.
+ for (unsigned I = 0;; ++I) {
+ uint8_t BitsUsed = 0;
+ for (auto &&B : Used)
+ if (I < B.size())
+ BitsUsed |= B[I];
+ if (BitsUsed != 0xff)
+ return (MinByte + I) * 8 +
+ countTrailingZeros(uint8_t(~BitsUsed), ZB_Undefined);
+ }
+ } else {
+ // Find a free (Size/8) byte region in each member of Used.
+ // FIXME: see if alignment helps.
+ for (unsigned I = 0;; ++I) {
+ for (auto &&B : Used) {
+ unsigned Byte = 0;
+ while ((I + Byte) < B.size() && Byte < (Size / 8)) {
+ if (B[I + Byte])
+ goto NextI;
+ ++Byte;
+ }
+ }
+ return (MinByte + I) * 8;
+ NextI:;
+ }
+ }
+}
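+
+// Worked example (illustrative only): with IsAfter set, Size == 1, and two
+// targets whose sliced "after" regions are {0xff, 0x0f} and {0xff}, byte 0 is
+// fully used in both, while byte 1 has only its low four bits used (by the
+// first target), so the function returns (MinByte + 1) * 8 + 4, the lowest
+// bit offset that is free in every vtable's accumulated bit vector.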
+
+void wholeprogramdevirt::setBeforeReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocBefore,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = -(AllocBefore / 8 + 1);
+ else
+ OffsetByte = -((AllocBefore + 7) / 8 + (BitWidth + 7) / 8);
+ OffsetBit = AllocBefore % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setBeforeBit(AllocBefore);
+ else
+ Target.setBeforeBytes(AllocBefore, (BitWidth + 7) / 8);
+ }
+}
+
+void wholeprogramdevirt::setAfterReturnValues(
+ MutableArrayRef<VirtualCallTarget> Targets, uint64_t AllocAfter,
+ unsigned BitWidth, int64_t &OffsetByte, uint64_t &OffsetBit) {
+ if (BitWidth == 1)
+ OffsetByte = AllocAfter / 8;
+ else
+ OffsetByte = (AllocAfter + 7) / 8;
+ OffsetBit = AllocAfter % 8;
+
+ for (VirtualCallTarget &Target : Targets) {
+ if (BitWidth == 1)
+ Target.setAfterBit(AllocAfter);
+ else
+ Target.setAfterBytes(AllocAfter, (BitWidth + 7) / 8);
+ }
+}
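+
+// Worked example (illustrative only) of the byte/bit split above: for
+// AllocAfter == 12 and BitWidth == 1, setAfterReturnValues yields
+// OffsetByte == 12 / 8 == 1 and OffsetBit == 12 % 8 == 4, i.e. bit 4 of the
+// second byte after the end of the vtable. For BitWidth == 32 it instead
+// reserves whole bytes, with OffsetByte == (12 + 7) / 8 == 2.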
+
+VirtualCallTarget::VirtualCallTarget(Function *Fn, const TypeMemberInfo *TM)
+ : Fn(Fn), TM(TM),
+ IsBigEndian(Fn->getParent()->getDataLayout().isBigEndian()), WasDevirt(false) {}
+
+namespace {
+
+// A slot in a set of virtual tables. The TypeID identifies the set of virtual
+// tables, and the ByteOffset is the offset in bytes from the address point to
+// the virtual function pointer.
+struct VTableSlot {
+ Metadata *TypeID;
+ uint64_t ByteOffset;
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<VTableSlot> {
+ static VTableSlot getEmptyKey() {
+ return {DenseMapInfo<Metadata *>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlot getTombstoneKey() {
+ return {DenseMapInfo<Metadata *>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlot &I) {
+ return DenseMapInfo<Metadata *>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlot &LHS,
+ const VTableSlot &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+template <> struct DenseMapInfo<VTableSlotSummary> {
+ static VTableSlotSummary getEmptyKey() {
+ return {DenseMapInfo<StringRef>::getEmptyKey(),
+ DenseMapInfo<uint64_t>::getEmptyKey()};
+ }
+ static VTableSlotSummary getTombstoneKey() {
+ return {DenseMapInfo<StringRef>::getTombstoneKey(),
+ DenseMapInfo<uint64_t>::getTombstoneKey()};
+ }
+ static unsigned getHashValue(const VTableSlotSummary &I) {
+ return DenseMapInfo<StringRef>::getHashValue(I.TypeID) ^
+ DenseMapInfo<uint64_t>::getHashValue(I.ByteOffset);
+ }
+ static bool isEqual(const VTableSlotSummary &LHS,
+ const VTableSlotSummary &RHS) {
+ return LHS.TypeID == RHS.TypeID && LHS.ByteOffset == RHS.ByteOffset;
+ }
+};
+
+} // end namespace llvm
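+
+// Illustrative sketch (hypothetical, not part of the pass): the DenseMapInfo
+// specializations above are what let VTableSlot and VTableSlotSummary act as
+// DenseMap/MapVector keys, e.g. for the CallSlots maps declared further down.
+#if 0
+  DenseMap<VTableSlot, unsigned> SlotIds;
+  Metadata *TypeId = nullptr;              // stands in for a type-id MDString
+  SlotIds[{TypeId, /*ByteOffset=*/0}] = 0; // keyed by (type id, byte offset)
+#endif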
+
+namespace {
+
+// A virtual call site. VTable is the loaded virtual table pointer, and CS is
+// the indirect virtual call.
+struct VirtualCallSite {
+ Value *VTable = nullptr;
+ CallBase &CB;
+
+ // If non-null, this field points to the associated unsafe use count stored in
+ // the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
+ // of that field for details.
+ unsigned *NumUnsafeUses = nullptr;
+
+ void
+ emitRemark(const StringRef OptName, const StringRef TargetName,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
+ Function *F = CB.getCaller();
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
+
+ using namespace ore;
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
+ << NV("Optimization", OptName)
+ << ": devirtualized a call to "
+ << NV("FunctionName", TargetName));
+ }
+
+ void replaceAndErase(
+ const StringRef OptName, const StringRef TargetName, bool RemarksEnabled,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ Value *New) {
+ if (RemarksEnabled)
+ emitRemark(OptName, TargetName, OREGetter);
+ CB.replaceAllUsesWith(New);
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst::Create(II->getNormalDest(), &CB);
+ II->getUnwindDest()->removePredecessor(II->getParent());
+ }
+ CB.eraseFromParent();
+ // This use is no longer unsafe.
+ if (NumUnsafeUses)
+ --*NumUnsafeUses;
+ }
+};
+
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+ /// The set of call sites for this slot. Used during regular LTO and the
+ /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+ /// call sites that appear in the merged module itself); in each of these
+ /// cases we are directly operating on the call sites at the IR level.
+ std::vector<VirtualCallSite> CallSites;
+
+ /// Whether all call sites represented by this CallSiteInfo, including those
+ /// in summaries, have been devirtualized. This starts off as true because a
+ /// default constructed CallSiteInfo represents no call sites.
+ bool AllCallSitesDevirted = true;
+
+ // These fields are used during the export phase of ThinLTO and reflect
+ // information collected from function summaries.
+
+ /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+ /// this slot.
+ bool SummaryHasTypeTestAssumeUsers = false;
+
+ /// CFI-specific: a vector containing the list of function summaries that use
+ /// the llvm.type.checked.load intrinsic and therefore will require
+ /// resolutions for llvm.type.test in order to implement CFI checks if
+ /// devirtualization was unsuccessful. If devirtualization was successful, the
+ /// pass will clear this vector by calling markDevirt(). If at the end of the
+ /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+ /// to each of the function summaries in the vector.
+ std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+ std::vector<FunctionSummary *> SummaryTypeTestAssumeUsers;
+
+ bool isExported() const {
+ return SummaryHasTypeTestAssumeUsers ||
+ !SummaryTypeCheckedLoadUsers.empty();
+ }
+
+ void addSummaryTypeCheckedLoadUser(FunctionSummary *FS) {
+ SummaryTypeCheckedLoadUsers.push_back(FS);
+ AllCallSitesDevirted = false;
+ }
+
+ void addSummaryTypeTestAssumeUser(FunctionSummary *FS) {
+ SummaryTypeTestAssumeUsers.push_back(FS);
+ SummaryHasTypeTestAssumeUsers = true;
+ AllCallSitesDevirted = false;
+ }
+
+ void markDevirt() {
+ AllCallSitesDevirted = true;
+
+ // As explained in the comment for SummaryTypeCheckedLoadUsers.
+ SummaryTypeCheckedLoadUsers.clear();
+ }
+};
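+
+// Illustrative sketch (hypothetical driver, not part of the pass) of the
+// bookkeeping above: recording a summary user marks the slot as not fully
+// devirtualized and, for checked loads, queues the summary so an
+// llvm.type.test use can be added back if needed; markDevirt() resets both.
+// FS below stands for some FunctionSummary pointer.
+#if 0
+  CallSiteInfo CSI;                      // AllCallSitesDevirted starts true
+  CSI.addSummaryTypeCheckedLoadUser(FS); // now false, FS queued
+  CSI.markDevirt();                      // devirtualized, queue cleared
+#endif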
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+ // The set of call sites which do not have all constant integer arguments
+ // (excluding "this").
+ CallSiteInfo CSInfo;
+
+ // The set of call sites with all constant integer arguments (excluding
+ // "this"), grouped by argument list.
+ std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+ void addCallSite(Value *VTable, CallBase &CB, unsigned *NumUnsafeUses);
+
+private:
+ CallSiteInfo &findCallSiteInfo(CallBase &CB);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallBase &CB) {
+ std::vector<uint64_t> Args;
+ auto *CBType = dyn_cast<IntegerType>(CB.getType());
+ if (!CBType || CBType->getBitWidth() > 64 || CB.arg_empty())
+ return CSInfo;
for (auto &&Arg : drop_begin(CB.args())) {
- auto *CI = dyn_cast<ConstantInt>(Arg);
- if (!CI || CI->getBitWidth() > 64)
- return CSInfo;
- Args.push_back(CI->getZExtValue());
- }
- return ConstCSInfo[Args];
-}
-
-void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
- unsigned *NumUnsafeUses) {
- auto &CSI = findCallSiteInfo(CB);
- CSI.AllCallSitesDevirted = false;
- CSI.CallSites.push_back({VTable, CB, NumUnsafeUses});
-}
-
-struct DevirtModule {
- Module &M;
- function_ref<AAResults &(Function &)> AARGetter;
- function_ref<DominatorTree &(Function &)> LookupDomTree;
-
- ModuleSummaryIndex *ExportSummary;
- const ModuleSummaryIndex *ImportSummary;
-
- IntegerType *Int8Ty;
- PointerType *Int8PtrTy;
- IntegerType *Int32Ty;
- IntegerType *Int64Ty;
- IntegerType *IntPtrTy;
- /// Sizeless array type, used for imported vtables. This provides a signal
- /// to analyzers that these imports may alias, as they do for example
- /// when multiple unique return values occur in the same vtable.
- ArrayType *Int8Arr0Ty;
-
- bool RemarksEnabled;
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
-
- MapVector<VTableSlot, VTableSlotInfo> CallSlots;
-
- // This map keeps track of the number of "unsafe" uses of a loaded function
- // pointer. The key is the associated llvm.type.test intrinsic call generated
- // by this pass. An unsafe use is one that calls the loaded function pointer
- // directly. Every time we eliminate an unsafe use (for example, by
- // devirtualizing it or by applying virtual constant propagation), we
- // decrement the value stored in this map. If a value reaches zero, we can
- // eliminate the type check by RAUWing the associated llvm.type.test call with
- // true.
- std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
- PatternList FunctionsToSkip;
-
- DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree,
- ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
- ExportSummary(ExportSummary), ImportSummary(ImportSummary),
- Int8Ty(Type::getInt8Ty(M.getContext())),
- Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
- Int32Ty(Type::getInt32Ty(M.getContext())),
- Int64Ty(Type::getInt64Ty(M.getContext())),
- IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
- Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
- RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
- assert(!(ExportSummary && ImportSummary));
- FunctionsToSkip.init(SkipFunctionNames);
- }
-
- bool areRemarksEnabled();
-
- void
- scanTypeTestUsers(Function *TypeTestFunc,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
- void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
-
- void buildTypeIdentifierMap(
- std::vector<VTableBits> &Bits,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
- bool
- tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
- const std::set<TypeMemberInfo> &TypeMemberInfos,
- uint64_t ByteOffset);
-
- void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
- bool &IsExported);
- bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res);
-
- void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
- bool &IsExported);
- void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot);
-
- bool tryEvaluateFunctionsWithArgs(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<uint64_t> Args);
-
- void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- uint64_t TheRetVal);
- bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res);
-
- // Returns the global symbol name that is used to export information about the
- // given vtable slot and list of arguments.
- std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name);
-
- bool shouldExportConstantsAsAbsoluteSymbols();
-
- // This function is called during the export phase to create a symbol
- // definition containing information about the given vtable slot and list of
- // arguments.
- void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
- Constant *C);
- void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
- uint32_t Const, uint32_t &Storage);
-
- // This function is called during the import phase to create a reference to
- // the symbol definition created during the export phase.
- Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name);
- Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, IntegerType *IntTy,
- uint32_t Storage);
-
- Constant *getMemberAddr(const TypeMemberInfo *M);
-
- void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
- Constant *UniqueMemberAddr);
- bool tryUniqueRetValOpt(unsigned BitWidth,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res,
- VTableSlot Slot, ArrayRef<uint64_t> Args);
-
- void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
- Constant *Byte, Constant *Bit);
- bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot);
-
- void rebuildGlobal(VTableBits &B);
-
- // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
- void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
-
- // If we were able to eliminate all unsafe uses for a type checked load,
- // eliminate the associated type tests by replacing them with true.
- void removeRedundantTypeTests();
-
- bool run();
-
- // Lower the module using the action and summary passed as command line
- // arguments. For testing purposes only.
- static bool
- runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree);
-};
-
-struct DevirtIndex {
- ModuleSummaryIndex &ExportSummary;
- // The set in which to record GUIDs exported from their module by
- // devirtualization, used by client to ensure they are not internalized.
- std::set<GlobalValue::GUID> &ExportedGUIDs;
- // A map in which to record the information necessary to locate the WPD
- // resolution for local targets in case they are exported by cross module
- // importing.
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap;
-
- MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots;
-
- PatternList FunctionsToSkip;
-
- DevirtIndex(
- ModuleSummaryIndex &ExportSummary,
- std::set<GlobalValue::GUID> &ExportedGUIDs,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap)
- : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs),
- LocalWPDTargetsMap(LocalWPDTargetsMap) {
- FunctionsToSkip.init(SkipFunctionNames);
- }
-
- bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot,
- const TypeIdCompatibleVtableInfo TIdInfo,
- uint64_t ByteOffset);
-
- bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
- VTableSlotSummary &SlotSummary,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res,
- std::set<ValueInfo> &DevirtTargets);
-
- void run();
-};
-
-struct WholeProgramDevirt : public ModulePass {
- static char ID;
-
- bool UseCommandLine = false;
-
- ModuleSummaryIndex *ExportSummary = nullptr;
- const ModuleSummaryIndex *ImportSummary = nullptr;
-
- WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary) {
- initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- if (skipModule(M))
- return false;
-
- // In the new pass manager, we can request the optimization
- // remark emitter pass on a per-function-basis, which the
- // OREGetter will do for us.
- // In the old pass manager, this is harder, so we just build
- // an optimization remark emitter on the fly, when we need it.
- std::unique_ptr<OptimizationRemarkEmitter> ORE;
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- ORE = std::make_unique<OptimizationRemarkEmitter>(F);
- return *ORE;
- };
-
- auto LookupDomTree = [this](Function &F) -> DominatorTree & {
- return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
-
- if (UseCommandLine)
- return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
- LookupDomTree);
-
- return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
- ExportSummary, ImportSummary)
- .run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
- "Whole program devirtualization", false, false)
-char WholeProgramDevirt::ID = 0;
-
-ModulePass *
-llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary) {
- return new WholeProgramDevirt(ExportSummary, ImportSummary);
-}
-
-PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto AARGetter = [&](Function &F) -> AAResults & {
- return FAM.getResult<AAManager>(F);
- };
- auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
- return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
- };
- auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
- return FAM.getResult<DominatorTreeAnalysis>(F);
- };
+ auto *CI = dyn_cast<ConstantInt>(Arg);
+ if (!CI || CI->getBitWidth() > 64)
+ return CSInfo;
+ Args.push_back(CI->getZExtValue());
+ }
+ return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
+ unsigned *NumUnsafeUses) {
+ auto &CSI = findCallSiteInfo(CB);
+ CSI.AllCallSitesDevirted = false;
+ CSI.CallSites.push_back({VTable, CB, NumUnsafeUses});
+}
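+
+// Illustrative example of the grouping performed by findCallSiteInfo above:
+// a virtual call whose return type is an integer no wider than 64 bits and
+// whose arguments after "this" are all constant integers, e.g. p->f(1, 2),
+// is filed under ConstCSInfo[{1, 2}]; any other call site for the slot falls
+// back to the shared CSInfo bucket.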
+
+struct DevirtModule {
+ Module &M;
+ function_ref<AAResults &(Function &)> AARGetter;
+ function_ref<DominatorTree &(Function &)> LookupDomTree;
+
+ ModuleSummaryIndex *ExportSummary;
+ const ModuleSummaryIndex *ImportSummary;
+
+ IntegerType *Int8Ty;
+ PointerType *Int8PtrTy;
+ IntegerType *Int32Ty;
+ IntegerType *Int64Ty;
+ IntegerType *IntPtrTy;
+ /// Sizeless array type, used for imported vtables. This provides a signal
+ /// to analyzers that these imports may alias, as they do for example
+ /// when multiple unique return values occur in the same vtable.
+ ArrayType *Int8Arr0Ty;
+
+ bool RemarksEnabled;
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
+
+ MapVector<VTableSlot, VTableSlotInfo> CallSlots;
+
+ // This map keeps track of the number of "unsafe" uses of a loaded function
+ // pointer. The key is the associated llvm.type.test intrinsic call generated
+ // by this pass. An unsafe use is one that calls the loaded function pointer
+ // directly. Every time we eliminate an unsafe use (for example, by
+ // devirtualizing it or by applying virtual constant propagation), we
+ // decrement the value stored in this map. If a value reaches zero, we can
+ // eliminate the type check by RAUWing the associated llvm.type.test call with
+ // true.
+ std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
+ PatternList FunctionsToSkip;
+
+ DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree,
+ ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : M(M), AARGetter(AARGetter), LookupDomTree(LookupDomTree),
+ ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+ Int8Ty(Type::getInt8Ty(M.getContext())),
+ Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
+ Int32Ty(Type::getInt32Ty(M.getContext())),
+ Int64Ty(Type::getInt64Ty(M.getContext())),
+ IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
+ RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
+ assert(!(ExportSummary && ImportSummary));
+ FunctionsToSkip.init(SkipFunctionNames);
+ }
+
+ bool areRemarksEnabled();
+
+ void
+ scanTypeTestUsers(Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
+
+ void buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
+ bool
+ tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos,
+ uint64_t ByteOffset);
+
+ void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+ bool &IsExported);
+ bool trySingleImplDevirt(ModuleSummaryIndex *ExportSummary,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res);
+
+ void applyICallBranchFunnel(VTableSlotInfo &SlotInfo, Constant *JT,
+ bool &IsExported);
+ void tryICallBranchFunnel(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
+ bool tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args);
+
+ void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal);
+ bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res);
+
+ // Returns the global symbol name that is used to export information about the
+ // given vtable slot and list of arguments.
+ std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+
+ bool shouldExportConstantsAsAbsoluteSymbols();
+
+ // This function is called during the export phase to create a symbol
+ // definition containing information about the given vtable slot and list of
+ // arguments.
+ void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ Constant *C);
+ void exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+ uint32_t Const, uint32_t &Storage);
+
+ // This function is called during the import phase to create a reference to
+ // the symbol definition created during the export phase.
+ Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name);
+ Constant *importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, IntegerType *IntTy,
+ uint32_t Storage);
+
+ Constant *getMemberAddr(const TypeMemberInfo *M);
+
+ void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+ Constant *UniqueMemberAddr);
+ bool tryUniqueRetValOpt(unsigned BitWidth,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+ void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit);
+ bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot);
+
+ void rebuildGlobal(VTableBits &B);
+
+ // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+ void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+ // If we were able to eliminate all unsafe uses for a type checked load,
+ // eliminate the associated type tests by replacing them with true.
+ void removeRedundantTypeTests();
+
+ bool run();
+
+ // Lower the module using the action and summary passed as command line
+ // arguments. For testing purposes only.
+ static bool
+ runForTesting(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree);
+};
+
+struct DevirtIndex {
+ ModuleSummaryIndex &ExportSummary;
+ // The set in which to record GUIDs exported from their module by
+ // devirtualization, used by client to ensure they are not internalized.
+ std::set<GlobalValue::GUID> &ExportedGUIDs;
+ // A map in which to record the information necessary to locate the WPD
+ // resolution for local targets in case they are exported by cross module
+ // importing.
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap;
+
+ MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots;
+
+ PatternList FunctionsToSkip;
+
+ DevirtIndex(
+ ModuleSummaryIndex &ExportSummary,
+ std::set<GlobalValue::GUID> &ExportedGUIDs,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap)
+ : ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs),
+ LocalWPDTargetsMap(LocalWPDTargetsMap) {
+ FunctionsToSkip.init(SkipFunctionNames);
+ }
+
+ bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot,
+ const TypeIdCompatibleVtableInfo TIdInfo,
+ uint64_t ByteOffset);
+
+ bool trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
+ VTableSlotSummary &SlotSummary,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res,
+ std::set<ValueInfo> &DevirtTargets);
+
+ void run();
+};
+
+struct WholeProgramDevirt : public ModulePass {
+ static char ID;
+
+ bool UseCommandLine = false;
+
+ ModuleSummaryIndex *ExportSummary = nullptr;
+ const ModuleSummaryIndex *ImportSummary = nullptr;
+
+ WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary)
+ : ModulePass(ID), ExportSummary(ExportSummary),
+ ImportSummary(ImportSummary) {
+ initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ if (skipModule(M))
+ return false;
+
+ // In the new pass manager, we can request the optimization
+ // remark emitter pass on a per-function-basis, which the
+ // OREGetter will do for us.
+ // In the old pass manager, this is harder, so we just build
+ // an optimization remark emitter on the fly, when we need it.
+ std::unique_ptr<OptimizationRemarkEmitter> ORE;
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
+
+ auto LookupDomTree = [this](Function &F) -> DominatorTree & {
+ return this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+
+ if (UseCommandLine)
+ return DevirtModule::runForTesting(M, LegacyAARGetter(*this), OREGetter,
+ LookupDomTree);
+
+ return DevirtModule(M, LegacyAARGetter(*this), OREGetter, LookupDomTree,
+ ExportSummary, ImportSummary)
+ .run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+ "Whole program devirtualization", false, false)
+char WholeProgramDevirt::ID = 0;
+
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+ const ModuleSummaryIndex *ImportSummary) {
+ return new WholeProgramDevirt(ExportSummary, ImportSummary);
+}
+
+PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto AARGetter = [&](Function &F) -> AAResults & {
+ return FAM.getResult<AAManager>(F);
+ };
+ auto OREGetter = [&](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+ auto LookupDomTree = [&FAM](Function &F) -> DominatorTree & {
+ return FAM.getResult<DominatorTreeAnalysis>(F);
+ };
if (UseCommandLine) {
if (DevirtModule::runForTesting(M, AARGetter, OREGetter, LookupDomTree))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
}
- if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
- ImportSummary)
- .run())
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-// Enable whole program visibility if enabled by client (e.g. linker) or
-// internal option, and not force disabled.
-static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
- return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) &&
- !DisableWholeProgramVisibility;
-}
-
-namespace llvm {
-
-/// If whole program visibility asserted, then upgrade all public vcall
-/// visibility metadata on vtable definitions to linkage unit visibility in
-/// Module IR (for regular or hybrid LTO).
-void updateVCallVisibilityInModule(Module &M,
- bool WholeProgramVisibilityEnabledInLTO) {
- if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
- return;
- for (GlobalVariable &GV : M.globals())
- // Add linkage unit visibility to any variable with type metadata, which are
- // the vtable definitions. We won't have an existing vcall_visibility
- // metadata on vtable definitions with public visibility.
- if (GV.hasMetadata(LLVMContext::MD_type) &&
- GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
- GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
-}
-
-/// If whole program visibility asserted, then upgrade all public vcall
-/// visibility metadata on vtable definition summaries to linkage unit
-/// visibility in Module summary index (for ThinLTO).
-void updateVCallVisibilityInIndex(ModuleSummaryIndex &Index,
- bool WholeProgramVisibilityEnabledInLTO) {
- if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
- return;
- for (auto &P : Index) {
- for (auto &S : P.second.SummaryList) {
- auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
- if (!GVar || GVar->vTableFuncs().empty() ||
- GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
- continue;
- GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
- }
- }
-}
-
-void runWholeProgramDevirtOnIndex(
- ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
- DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run();
-}
-
-void updateIndexWPDForExports(
- ModuleSummaryIndex &Summary,
- function_ref<bool(StringRef, ValueInfo)> isExported,
- std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
- for (auto &T : LocalWPDTargetsMap) {
- auto &VI = T.first;
- // This was enforced earlier during trySingleImplDevirt.
- assert(VI.getSummaryList().size() == 1 &&
- "Devirt of local target has more than one copy");
- auto &S = VI.getSummaryList()[0];
- if (!isExported(S->modulePath(), VI))
- continue;
-
- // It's been exported by a cross module import.
- for (auto &SlotSummary : T.second) {
- auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID);
- assert(TIdSum);
- auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset);
- assert(WPDRes != TIdSum->WPDRes.end());
- WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
- WPDRes->second.SingleImplName,
- Summary.getModuleHash(S->modulePath()));
- }
- }
-}
-
-} // end namespace llvm
-
-static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
- // Check that summary index contains regular LTO module when performing
- // export to prevent occasional use of index from pure ThinLTO compilation
- // (-fno-split-lto-module). This kind of summary index is passed to
- // DevirtIndex::run, not to DevirtModule::run used by opt/runForTesting.
- const auto &ModPaths = Summary->modulePaths();
- if (ClSummaryAction != PassSummaryAction::Import &&
- ModPaths.find(ModuleSummaryIndex::getRegularLTOModuleName()) ==
- ModPaths.end())
- return createStringError(
- errc::invalid_argument,
- "combined summary should contain Regular LTO module");
- return ErrorSuccess();
-}
-
-bool DevirtModule::runForTesting(
- Module &M, function_ref<AAResults &(Function &)> AARGetter,
- function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
- function_ref<DominatorTree &(Function &)> LookupDomTree) {
- std::unique_ptr<ModuleSummaryIndex> Summary =
- std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
-
- // Handle the command-line summary arguments. This code is for testing
- // purposes only, so we handle errors directly.
- if (!ClReadSummary.empty()) {
- ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
- ": ");
- auto ReadSummaryFile =
- ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
- if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
- getModuleSummaryIndex(*ReadSummaryFile)) {
- Summary = std::move(*SummaryOrErr);
- ExitOnErr(checkCombinedSummaryForTesting(Summary.get()));
- } else {
- // Try YAML if we've failed with bitcode.
- consumeError(SummaryOrErr.takeError());
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> *Summary;
- ExitOnErr(errorCodeToError(In.error()));
- }
- }
-
- bool Changed =
- DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
- ClSummaryAction == PassSummaryAction::Export ? Summary.get()
- : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? Summary.get()
- : nullptr)
- .run();
-
- if (!ClWriteSummary.empty()) {
- ExitOnError ExitOnErr(
- "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
- std::error_code EC;
- if (StringRef(ClWriteSummary).endswith(".bc")) {
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_None);
- ExitOnErr(errorCodeToError(EC));
- WriteIndexToFile(*Summary, OS);
- } else {
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
- ExitOnErr(errorCodeToError(EC));
- yaml::Output Out(OS);
- Out << *Summary;
- }
- }
-
- return Changed;
-}
-
-void DevirtModule::buildTypeIdentifierMap(
- std::vector<VTableBits> &Bits,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
- DenseMap<GlobalVariable *, VTableBits *> GVToBits;
- Bits.reserve(M.getGlobalList().size());
- SmallVector<MDNode *, 2> Types;
- for (GlobalVariable &GV : M.globals()) {
- Types.clear();
- GV.getMetadata(LLVMContext::MD_type, Types);
- if (GV.isDeclaration() || Types.empty())
- continue;
-
- VTableBits *&BitsPtr = GVToBits[&GV];
- if (!BitsPtr) {
- Bits.emplace_back();
- Bits.back().GV = &GV;
- Bits.back().ObjectSize =
- M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
- BitsPtr = &Bits.back();
- }
-
- for (MDNode *Type : Types) {
- auto TypeID = Type->getOperand(1).get();
-
- uint64_t Offset =
- cast<ConstantInt>(
- cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
- ->getZExtValue();
-
- TypeIdMap[TypeID].insert({BitsPtr, Offset});
- }
- }
-}
-
-bool DevirtModule::tryFindVirtualCallTargets(
- std::vector<VirtualCallTarget> &TargetsForSlot,
- const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
- for (const TypeMemberInfo &TM : TypeMemberInfos) {
- if (!TM.Bits->GV->isConstant())
- return false;
-
- // We cannot perform whole program devirtualization analysis on a vtable
- // with public LTO visibility.
- if (TM.Bits->GV->getVCallVisibility() ==
- GlobalObject::VCallVisibilityPublic)
- return false;
-
- Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
- TM.Offset + ByteOffset, M);
- if (!Ptr)
- return false;
-
- auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!Fn)
- return false;
-
- if (FunctionsToSkip.match(Fn->getName()))
- return false;
-
- // We can disregard __cxa_pure_virtual as a possible call target, as
- // calls to pure virtuals are UB.
- if (Fn->getName() == "__cxa_pure_virtual")
- continue;
-
- TargetsForSlot.push_back({Fn, &TM});
- }
-
- // Give up if we couldn't find any targets.
- return !TargetsForSlot.empty();
-}
-
-bool DevirtIndex::tryFindVirtualCallTargets(
- std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
- uint64_t ByteOffset) {
- for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
- // Find the first non-available_externally linkage vtable initializer.
- // We can have multiple available_externally, linkonce_odr and weak_odr
- // vtable initializers, however we want to skip available_externally as they
- // do not have type metadata attached, and therefore the summary will not
- // contain any vtable functions. We can also have multiple external
- // vtable initializers in the case of comdats, which we cannot check here.
- // The linker should give an error in this case.
- //
- // Also, handle the case of same-named local Vtables with the same path
- // and therefore the same GUID. This can happen if there isn't enough
- // distinguishing path when compiling the source file. In that case we
- // conservatively return false early.
- const GlobalVarSummary *VS = nullptr;
- bool LocalFound = false;
- for (auto &S : P.VTableVI.getSummaryList()) {
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (LocalFound)
- return false;
- LocalFound = true;
- }
- if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) {
- VS = cast<GlobalVarSummary>(S->getBaseObject());
- // We cannot perform whole program devirtualization analysis on a vtable
- // with public LTO visibility.
- if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
- return false;
- }
- }
- if (!VS->isLive())
- continue;
- for (auto VTP : VS->vTableFuncs()) {
- if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset)
- continue;
-
- TargetsForSlot.push_back(VTP.FuncVI);
- }
- }
-
- // Give up if we couldn't find any targets.
- return !TargetsForSlot.empty();
-}
-
-void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
- Constant *TheFn, bool &IsExported) {
+ if (!DevirtModule(M, AARGetter, OREGetter, LookupDomTree, ExportSummary,
+ ImportSummary)
+ .run())
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+// Enable whole program visibility if enabled by client (e.g. linker) or
+// internal option, and not force disabled.
+static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
+ return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) &&
+ !DisableWholeProgramVisibility;
+}
+
+namespace llvm {
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definitions to linkage unit visibility in
+/// Module IR (for regular or hybrid LTO).
+void updateVCallVisibilityInModule(Module &M,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (GlobalVariable &GV : M.globals())
+    // Add linkage unit visibility to any variables with type metadata, which
+    // are the vtable definitions. We won't have existing vcall_visibility
+    // metadata on vtable definitions with public visibility.
+ if (GV.hasMetadata(LLVMContext::MD_type) &&
+ GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
+}
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definition summaries to linkage unit
+/// visibility in Module summary index (for ThinLTO).
+void updateVCallVisibilityInIndex(ModuleSummaryIndex &Index,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (auto &P : Index) {
+ for (auto &S : P.second.SummaryList) {
+ auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
+ if (!GVar || GVar->vTableFuncs().empty() ||
+ GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
+ continue;
+ GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
+ }
+ }
+}
+
+void runWholeProgramDevirtOnIndex(
+ ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
+ DevirtIndex(Summary, ExportedGUIDs, LocalWPDTargetsMap).run();
+}
+
+void updateIndexWPDForExports(
+ ModuleSummaryIndex &Summary,
+ function_ref<bool(StringRef, ValueInfo)> isExported,
+ std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
+ for (auto &T : LocalWPDTargetsMap) {
+ auto &VI = T.first;
+ // This was enforced earlier during trySingleImplDevirt.
+ assert(VI.getSummaryList().size() == 1 &&
+ "Devirt of local target has more than one copy");
+ auto &S = VI.getSummaryList()[0];
+ if (!isExported(S->modulePath(), VI))
+ continue;
+
+ // It's been exported by a cross module import.
+ for (auto &SlotSummary : T.second) {
+ auto *TIdSum = Summary.getTypeIdSummary(SlotSummary.TypeID);
+ assert(TIdSum);
+ auto WPDRes = TIdSum->WPDRes.find(SlotSummary.ByteOffset);
+ assert(WPDRes != TIdSum->WPDRes.end());
+ WPDRes->second.SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
+ WPDRes->second.SingleImplName,
+ Summary.getModuleHash(S->modulePath()));
+ }
+ }
+}
+
+} // end namespace llvm
+
+static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
+  // Check that the summary index contains a regular LTO module when
+  // performing an export, to prevent accidental use of an index from a pure
+  // ThinLTO compilation (-fno-split-lto-module). This kind of summary index
+  // is passed to DevirtIndex::run, not to DevirtModule::run used by
+  // opt/runForTesting.
+ const auto &ModPaths = Summary->modulePaths();
+ if (ClSummaryAction != PassSummaryAction::Import &&
+ ModPaths.find(ModuleSummaryIndex::getRegularLTOModuleName()) ==
+ ModPaths.end())
+ return createStringError(
+ errc::invalid_argument,
+ "combined summary should contain Regular LTO module");
+ return ErrorSuccess();
+}
+
+bool DevirtModule::runForTesting(
+ Module &M, function_ref<AAResults &(Function &)> AARGetter,
+ function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
+ function_ref<DominatorTree &(Function &)> LookupDomTree) {
+ std::unique_ptr<ModuleSummaryIndex> Summary =
+ std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
+
+ // Handle the command-line summary arguments. This code is for testing
+ // purposes only, so we handle errors directly.
+ if (!ClReadSummary.empty()) {
+ ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+ ": ");
+ auto ReadSummaryFile =
+ ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+ if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
+ getModuleSummaryIndex(*ReadSummaryFile)) {
+ Summary = std::move(*SummaryOrErr);
+ ExitOnErr(checkCombinedSummaryForTesting(Summary.get()));
+ } else {
+ // Try YAML if we've failed with bitcode.
+ consumeError(SummaryOrErr.takeError());
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> *Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
+ }
+
+ bool Changed =
+ DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
+ ClSummaryAction == PassSummaryAction::Export ? Summary.get()
+ : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? Summary.get()
+ : nullptr)
+ .run();
+
+ if (!ClWriteSummary.empty()) {
+ ExitOnError ExitOnErr(
+ "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+ std::error_code EC;
+ if (StringRef(ClWriteSummary).endswith(".bc")) {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_None);
+ ExitOnErr(errorCodeToError(EC));
+ WriteIndexToFile(*Summary, OS);
+ } else {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
+ ExitOnErr(errorCodeToError(EC));
+ yaml::Output Out(OS);
+ Out << *Summary;
+ }
+ }
+
+ return Changed;
+}
+
+void DevirtModule::buildTypeIdentifierMap(
+ std::vector<VTableBits> &Bits,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ DenseMap<GlobalVariable *, VTableBits *> GVToBits;
+ Bits.reserve(M.getGlobalList().size());
+ SmallVector<MDNode *, 2> Types;
+ for (GlobalVariable &GV : M.globals()) {
+ Types.clear();
+ GV.getMetadata(LLVMContext::MD_type, Types);
+ if (GV.isDeclaration() || Types.empty())
+ continue;
+
+ VTableBits *&BitsPtr = GVToBits[&GV];
+ if (!BitsPtr) {
+ Bits.emplace_back();
+ Bits.back().GV = &GV;
+ Bits.back().ObjectSize =
+ M.getDataLayout().getTypeAllocSize(GV.getInitializer()->getType());
+ BitsPtr = &Bits.back();
+ }
+
+ for (MDNode *Type : Types) {
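+      // Each !type annotation is a (byte offset, type identifier) pair:
+      // operand 0 holds the offset of the address point and operand 1 the
+      // type id.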
+ auto TypeID = Type->getOperand(1).get();
+
+ uint64_t Offset =
+ cast<ConstantInt>(
+ cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
+ ->getZExtValue();
+
+ TypeIdMap[TypeID].insert({BitsPtr, Offset});
+ }
+ }
+}
+
+bool DevirtModule::tryFindVirtualCallTargets(
+ std::vector<VirtualCallTarget> &TargetsForSlot,
+ const std::set<TypeMemberInfo> &TypeMemberInfos, uint64_t ByteOffset) {
+ for (const TypeMemberInfo &TM : TypeMemberInfos) {
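+    // Only constant vtables can be analyzed: reading a function pointer out
+    // of the initializer is only valid if the vtable is never written.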
+ if (!TM.Bits->GV->isConstant())
+ return false;
+
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (TM.Bits->GV->getVCallVisibility() ==
+ GlobalObject::VCallVisibilityPublic)
+ return false;
+
+ Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
+ TM.Offset + ByteOffset, M);
+ if (!Ptr)
+ return false;
+
+ auto Fn = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!Fn)
+ return false;
+
+ if (FunctionsToSkip.match(Fn->getName()))
+ return false;
+
+ // We can disregard __cxa_pure_virtual as a possible call target, as
+ // calls to pure virtuals are UB.
+ if (Fn->getName() == "__cxa_pure_virtual")
+ continue;
+
+ TargetsForSlot.push_back({Fn, &TM});
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+bool DevirtIndex::tryFindVirtualCallTargets(
+ std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
+ uint64_t ByteOffset) {
+ for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
+ // Find the first non-available_externally linkage vtable initializer.
+ // We can have multiple available_externally, linkonce_odr and weak_odr
+    // vtable initializers; however, we want to skip available_externally as they
+ // do not have type metadata attached, and therefore the summary will not
+ // contain any vtable functions. We can also have multiple external
+ // vtable initializers in the case of comdats, which we cannot check here.
+ // The linker should give an error in this case.
+ //
+ // Also, handle the case of same-named local Vtables with the same path
+ // and therefore the same GUID. This can happen if there isn't enough
+ // distinguishing path when compiling the source file. In that case we
+ // conservatively return false early.
+ const GlobalVarSummary *VS = nullptr;
+ bool LocalFound = false;
+ for (auto &S : P.VTableVI.getSummaryList()) {
+ if (GlobalValue::isLocalLinkage(S->linkage())) {
+ if (LocalFound)
+ return false;
+ LocalFound = true;
+ }
+ if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) {
+ VS = cast<GlobalVarSummary>(S->getBaseObject());
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ return false;
+ }
+ }
+ if (!VS->isLive())
+ continue;
+ for (auto VTP : VS->vTableFuncs()) {
+ if (VTP.VTableOffset != P.AddressPointOffset + ByteOffset)
+ continue;
+
+ TargetsForSlot.push_back(VTP.FuncVI);
+ }
+ }
+
+ // Give up if we couldn't find any targets.
+ return !TargetsForSlot.empty();
+}
+
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+ Constant *TheFn, bool &IsExported) {
// Don't devirtualize function if we're told to skip it
// in -wholeprogramdevirt-skip.
if (FunctionsToSkip.match(TheFn->stripPointerCasts()->getName()))
return;
- auto Apply = [&](CallSiteInfo &CSInfo) {
- for (auto &&VCallSite : CSInfo.CallSites) {
- if (RemarksEnabled)
- VCallSite.emitRemark("single-impl",
- TheFn->stripPointerCasts()->getName(), OREGetter);
- VCallSite.CB.setCalledOperand(ConstantExpr::getBitCast(
- TheFn, VCallSite.CB.getCalledOperand()->getType()));
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
- }
- if (CSInfo.isExported())
- IsExported = true;
- CSInfo.markDevirt();
- };
- Apply(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- Apply(P.second);
-}
-
-static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
- // We can't add calls if we haven't seen a definition
- if (Callee.getSummaryList().empty())
- return false;
-
- // Insert calls into the summary index so that the devirtualized targets
- // are eligible for import.
- // FIXME: Annotate type tests with hotness. For now, mark these as hot
- // to better ensure we have the opportunity to inline them.
- bool IsExported = false;
- auto &S = Callee.getSummaryList()[0];
- CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0);
- auto AddCalls = [&](CallSiteInfo &CSInfo) {
- for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) {
- FS->addCall({Callee, CI});
- IsExported |= S->modulePath() != FS->modulePath();
- }
- for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) {
- FS->addCall({Callee, CI});
- IsExported |= S->modulePath() != FS->modulePath();
- }
- };
- AddCalls(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- AddCalls(P.second);
- return IsExported;
-}
-
-bool DevirtModule::trySingleImplDevirt(
- ModuleSummaryIndex *ExportSummary,
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res) {
- // See if the program contains a single implementation of this virtual
- // function.
- Function *TheFn = TargetsForSlot[0].Fn;
- for (auto &&Target : TargetsForSlot)
- if (TheFn != Target.Fn)
- return false;
-
- // If so, update each call site to call that implementation directly.
- if (RemarksEnabled)
- TargetsForSlot[0].WasDevirt = true;
-
- bool IsExported = false;
- applySingleImplDevirt(SlotInfo, TheFn, IsExported);
- if (!IsExported)
- return false;
-
- // If the only implementation has local linkage, we must promote to external
- // to make it visible to thin LTO objects. We can only get here during the
- // ThinLTO export phase.
- if (TheFn->hasLocalLinkage()) {
- std::string NewName = (TheFn->getName() + "$merged").str();
-
- // Since we are renaming the function, any comdats with the same name must
- // also be renamed. This is required when targeting COFF, as the comdat name
- // must match one of the names of the symbols in the comdat.
- if (Comdat *C = TheFn->getComdat()) {
- if (C->getName() == TheFn->getName()) {
- Comdat *NewC = M.getOrInsertComdat(NewName);
- NewC->setSelectionKind(C->getSelectionKind());
- for (GlobalObject &GO : M.global_objects())
- if (GO.getComdat() == C)
- GO.setComdat(NewC);
- }
- }
-
- TheFn->setLinkage(GlobalValue::ExternalLinkage);
- TheFn->setVisibility(GlobalValue::HiddenVisibility);
- TheFn->setName(NewName);
- }
- if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
- // Any needed promotion of 'TheFn' has already been done during
-    // LTO unit split, so we can ignore the return value of AddCalls.
- AddCalls(SlotInfo, TheFnVI);
-
- Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
- Res->SingleImplName = std::string(TheFn->getName());
-
- return true;
-}
-
-bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
- VTableSlotSummary &SlotSummary,
- VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res,
- std::set<ValueInfo> &DevirtTargets) {
- // See if the program contains a single implementation of this virtual
- // function.
- auto TheFn = TargetsForSlot[0];
- for (auto &&Target : TargetsForSlot)
- if (TheFn != Target)
- return false;
-
-  // Don't devirtualize if we don't have a target definition.
- auto Size = TheFn.getSummaryList().size();
- if (!Size)
- return false;
-
- // Don't devirtualize function if we're told to skip it
- // in -wholeprogramdevirt-skip.
- if (FunctionsToSkip.match(TheFn.name()))
- return false;
-
- // If the summary list contains multiple summaries where at least one is
- // a local, give up, as we won't know which (possibly promoted) name to use.
- for (auto &S : TheFn.getSummaryList())
- if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
- return false;
-
- // Collect functions devirtualized at least for one call site for stats.
- if (PrintSummaryDevirt)
- DevirtTargets.insert(TheFn);
-
- auto &S = TheFn.getSummaryList()[0];
- bool IsExported = AddCalls(SlotInfo, TheFn);
- if (IsExported)
- ExportedGUIDs.insert(TheFn.getGUID());
-
- // Record in summary for use in devirtualization during the ThinLTO import
- // step.
- Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
- if (GlobalValue::isLocalLinkage(S->linkage())) {
- if (IsExported)
-      // If the target is a local function and we are exporting it by
- // devirtualizing a call in another module, we need to record the
- // promoted name.
- Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
- TheFn.name(), ExportSummary.getModuleHash(S->modulePath()));
- else {
- LocalWPDTargetsMap[TheFn].push_back(SlotSummary);
- Res->SingleImplName = std::string(TheFn.name());
- }
- } else
- Res->SingleImplName = std::string(TheFn.name());
-
-  // Name will be empty if this thin link is driven off of a serialized
-  // combined index (e.g. llvm-lto). However, WPD is not supported/invoked for
-  // the legacy LTO API anyway.
- assert(!Res->SingleImplName.empty());
-
- return true;
-}
-
-void DevirtModule::tryICallBranchFunnel(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot) {
- Triple T(M.getTargetTriple());
- if (T.getArch() != Triple::x86_64)
- return;
-
- if (TargetsForSlot.size() > ClThreshold)
- return;
-
- bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
- if (!HasNonDevirt)
- for (auto &P : SlotInfo.ConstCSInfo)
- if (!P.second.AllCallSitesDevirted) {
- HasNonDevirt = true;
- break;
- }
-
- if (!HasNonDevirt)
- return;
-
- FunctionType *FT =
- FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
- Function *JT;
- if (isa<MDString>(Slot.TypeID)) {
- JT = Function::Create(FT, Function::ExternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- getGlobalName(Slot, {}, "branch_funnel"), &M);
- JT->setVisibility(GlobalValue::HiddenVisibility);
- } else {
- JT = Function::Create(FT, Function::InternalLinkage,
- M.getDataLayout().getProgramAddressSpace(),
- "branch_funnel", &M);
- }
- JT->addAttribute(1, Attribute::Nest);
-
- std::vector<Value *> JTArgs;
- JTArgs.push_back(JT->arg_begin());
- for (auto &T : TargetsForSlot) {
- JTArgs.push_back(getMemberAddr(T.TM));
- JTArgs.push_back(T.Fn);
- }
-
- BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
- Function *Intr =
- Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
-
- auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
- CI->setTailCallKind(CallInst::TCK_MustTail);
- ReturnInst::Create(M.getContext(), nullptr, BB);
-
- bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
- if (IsExported)
- Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
-}
-
-void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
- Constant *JT, bool &IsExported) {
- auto Apply = [&](CallSiteInfo &CSInfo) {
- if (CSInfo.isExported())
- IsExported = true;
- if (CSInfo.AllCallSitesDevirted)
- return;
- for (auto &&VCallSite : CSInfo.CallSites) {
- CallBase &CB = VCallSite.CB;
-
- // Jump tables are only profitable if the retpoline mitigation is enabled.
- Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features");
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ if (RemarksEnabled)
+ VCallSite.emitRemark("single-impl",
+ TheFn->stripPointerCasts()->getName(), OREGetter);
+ VCallSite.CB.setCalledOperand(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CB.getCalledOperand()->getType()));
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ if (CSInfo.isExported())
+ IsExported = true;
+ CSInfo.markDevirt();
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
+static bool AddCalls(VTableSlotInfo &SlotInfo, const ValueInfo &Callee) {
+ // We can't add calls if we haven't seen a definition
+ if (Callee.getSummaryList().empty())
+ return false;
+
+ // Insert calls into the summary index so that the devirtualized targets
+ // are eligible for import.
+ // FIXME: Annotate type tests with hotness. For now, mark these as hot
+ // to better ensure we have the opportunity to inline them.
+ bool IsExported = false;
+ auto &S = Callee.getSummaryList()[0];
+ CalleeInfo CI(CalleeInfo::HotnessType::Hot, /* RelBF = */ 0);
+ auto AddCalls = [&](CallSiteInfo &CSInfo) {
+ for (auto *FS : CSInfo.SummaryTypeCheckedLoadUsers) {
+ FS->addCall({Callee, CI});
+ IsExported |= S->modulePath() != FS->modulePath();
+ }
+ for (auto *FS : CSInfo.SummaryTypeTestAssumeUsers) {
+ FS->addCall({Callee, CI});
+ IsExported |= S->modulePath() != FS->modulePath();
+ }
+ };
+ AddCalls(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ AddCalls(P.second);
+ return IsExported;
+}
+
+bool DevirtModule::trySingleImplDevirt(
+ ModuleSummaryIndex *ExportSummary,
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ Function *TheFn = TargetsForSlot[0].Fn;
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target.Fn)
+ return false;
+
+ // If so, update each call site to call that implementation directly.
+ if (RemarksEnabled)
+ TargetsForSlot[0].WasDevirt = true;
+
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+ if (!IsExported)
+ return false;
+
+ // If the only implementation has local linkage, we must promote to external
+ // to make it visible to thin LTO objects. We can only get here during the
+ // ThinLTO export phase.
+ if (TheFn->hasLocalLinkage()) {
+ std::string NewName = (TheFn->getName() + "$merged").str();
+
+ // Since we are renaming the function, any comdats with the same name must
+ // also be renamed. This is required when targeting COFF, as the comdat name
+ // must match one of the names of the symbols in the comdat.
+ if (Comdat *C = TheFn->getComdat()) {
+ if (C->getName() == TheFn->getName()) {
+ Comdat *NewC = M.getOrInsertComdat(NewName);
+ NewC->setSelectionKind(C->getSelectionKind());
+ for (GlobalObject &GO : M.global_objects())
+ if (GO.getComdat() == C)
+ GO.setComdat(NewC);
+ }
+ }
+
+ TheFn->setLinkage(GlobalValue::ExternalLinkage);
+ TheFn->setVisibility(GlobalValue::HiddenVisibility);
+ TheFn->setName(NewName);
+ }
+ if (ValueInfo TheFnVI = ExportSummary->getValueInfo(TheFn->getGUID()))
+ // Any needed promotion of 'TheFn' has already been done during
+    // LTO unit split, so we can ignore the return value of AddCalls.
+ AddCalls(SlotInfo, TheFnVI);
+
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ Res->SingleImplName = std::string(TheFn->getName());
+
+ return true;
+}
+
+bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
+ VTableSlotSummary &SlotSummary,
+ VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res,
+ std::set<ValueInfo> &DevirtTargets) {
+ // See if the program contains a single implementation of this virtual
+ // function.
+ auto TheFn = TargetsForSlot[0];
+ for (auto &&Target : TargetsForSlot)
+ if (TheFn != Target)
+ return false;
+
+  // Don't devirtualize if we don't have a target definition.
+ auto Size = TheFn.getSummaryList().size();
+ if (!Size)
+ return false;
+
+ // Don't devirtualize function if we're told to skip it
+ // in -wholeprogramdevirt-skip.
+ if (FunctionsToSkip.match(TheFn.name()))
+ return false;
+
+ // If the summary list contains multiple summaries where at least one is
+ // a local, give up, as we won't know which (possibly promoted) name to use.
+ for (auto &S : TheFn.getSummaryList())
+ if (GlobalValue::isLocalLinkage(S->linkage()) && Size > 1)
+ return false;
+
+ // Collect functions devirtualized at least for one call site for stats.
+ if (PrintSummaryDevirt)
+ DevirtTargets.insert(TheFn);
+
+ auto &S = TheFn.getSummaryList()[0];
+ bool IsExported = AddCalls(SlotInfo, TheFn);
+ if (IsExported)
+ ExportedGUIDs.insert(TheFn.getGUID());
+
+ // Record in summary for use in devirtualization during the ThinLTO import
+ // step.
+ Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+ if (GlobalValue::isLocalLinkage(S->linkage())) {
+ if (IsExported)
+      // If the target is a local function and we are exporting it by
+ // devirtualizing a call in another module, we need to record the
+ // promoted name.
+ Res->SingleImplName = ModuleSummaryIndex::getGlobalNameForLocal(
+ TheFn.name(), ExportSummary.getModuleHash(S->modulePath()));
+ else {
+ LocalWPDTargetsMap[TheFn].push_back(SlotSummary);
+ Res->SingleImplName = std::string(TheFn.name());
+ }
+ } else
+ Res->SingleImplName = std::string(TheFn.name());
+
+  // Name will be empty if this thin link is driven off of a serialized
+  // combined index (e.g. llvm-lto). However, WPD is not supported/invoked for
+  // the legacy LTO API anyway.
+ assert(!Res->SingleImplName.empty());
+
+ return true;
+}
+
+void DevirtModule::tryICallBranchFunnel(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ Triple T(M.getTargetTriple());
+ if (T.getArch() != Triple::x86_64)
+ return;
+
+ if (TargetsForSlot.size() > ClThreshold)
+ return;
+
+ bool HasNonDevirt = !SlotInfo.CSInfo.AllCallSitesDevirted;
+ if (!HasNonDevirt)
+ for (auto &P : SlotInfo.ConstCSInfo)
+ if (!P.second.AllCallSitesDevirted) {
+ HasNonDevirt = true;
+ break;
+ }
+
+ if (!HasNonDevirt)
+ return;
+
+ FunctionType *FT =
+ FunctionType::get(Type::getVoidTy(M.getContext()), {Int8PtrTy}, true);
+ Function *JT;
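+  // For an MDString type id the funnel gets a deterministic, hidden external
+  // name via getGlobalName so that importing modules can reference it;
+  // otherwise it stays internal to this module.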
+ if (isa<MDString>(Slot.TypeID)) {
+ JT = Function::Create(FT, Function::ExternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ getGlobalName(Slot, {}, "branch_funnel"), &M);
+ JT->setVisibility(GlobalValue::HiddenVisibility);
+ } else {
+ JT = Function::Create(FT, Function::InternalLinkage,
+ M.getDataLayout().getProgramAddressSpace(),
+ "branch_funnel", &M);
+ }
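+  // The funnel's first parameter carries the vtable address in the nest
+  // register; see applyICallBranchFunnel below.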
+ JT->addAttribute(1, Attribute::Nest);
+
+ std::vector<Value *> JTArgs;
+ JTArgs.push_back(JT->arg_begin());
+ for (auto &T : TargetsForSlot) {
+ JTArgs.push_back(getMemberAddr(T.TM));
+ JTArgs.push_back(T.Fn);
+ }
+
+ BasicBlock *BB = BasicBlock::Create(M.getContext(), "", JT, nullptr);
+ Function *Intr =
+ Intrinsic::getDeclaration(&M, llvm::Intrinsic::icall_branch_funnel, {});
+
+ auto *CI = CallInst::Create(Intr, JTArgs, "", BB);
+ CI->setTailCallKind(CallInst::TCK_MustTail);
+ ReturnInst::Create(M.getContext(), nullptr, BB);
+
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ if (IsExported)
+ Res->TheKind = WholeProgramDevirtResolution::BranchFunnel;
+}
+
+void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
+ Constant *JT, bool &IsExported) {
+ auto Apply = [&](CallSiteInfo &CSInfo) {
+ if (CSInfo.isExported())
+ IsExported = true;
+ if (CSInfo.AllCallSitesDevirted)
+ return;
+ for (auto &&VCallSite : CSInfo.CallSites) {
+ CallBase &CB = VCallSite.CB;
+
+ // Jump tables are only profitable if the retpoline mitigation is enabled.
+ Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features");
if (!FSAttr.isValid() ||
- !FSAttr.getValueAsString().contains("+retpoline"))
- continue;
-
- if (RemarksEnabled)
- VCallSite.emitRemark("branch-funnel",
- JT->stripPointerCasts()->getName(), OREGetter);
-
- // Pass the address of the vtable in the nest register, which is r10 on
- // x86_64.
- std::vector<Type *> NewArgs;
- NewArgs.push_back(Int8PtrTy);
+ !FSAttr.getValueAsString().contains("+retpoline"))
+ continue;
+
+ if (RemarksEnabled)
+ VCallSite.emitRemark("branch-funnel",
+ JT->stripPointerCasts()->getName(), OREGetter);
+
+ // Pass the address of the vtable in the nest register, which is r10 on
+ // x86_64.
+ std::vector<Type *> NewArgs;
+ NewArgs.push_back(Int8PtrTy);
append_range(NewArgs, CB.getFunctionType()->params());
- FunctionType *NewFT =
- FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs,
- CB.getFunctionType()->isVarArg());
- PointerType *NewFTPtr = PointerType::getUnqual(NewFT);
-
- IRBuilder<> IRB(&CB);
- std::vector<Value *> Args;
- Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
+ FunctionType *NewFT =
+ FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs,
+ CB.getFunctionType()->isVarArg());
+ PointerType *NewFTPtr = PointerType::getUnqual(NewFT);
+
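+      // Rebuild the call to go through the funnel: the vtable pointer is
+      // prepended to the arguments and the funnel is bitcast to the widened
+      // function type.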
+ IRBuilder<> IRB(&CB);
+ std::vector<Value *> Args;
+ Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
llvm::append_range(Args, CB.args());
-
- CallBase *NewCS = nullptr;
- if (isa<CallInst>(CB))
- NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args);
- else
- NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr),
- cast<InvokeInst>(CB).getNormalDest(),
- cast<InvokeInst>(CB).getUnwindDest(), Args);
- NewCS->setCallingConv(CB.getCallingConv());
-
- AttributeList Attrs = CB.getAttributes();
- std::vector<AttributeSet> NewArgAttrs;
- NewArgAttrs.push_back(AttributeSet::get(
- M.getContext(), ArrayRef<Attribute>{Attribute::get(
- M.getContext(), Attribute::Nest)}));
- for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
- NewArgAttrs.push_back(Attrs.getParamAttributes(I));
- NewCS->setAttributes(
- AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs));
-
- CB.replaceAllUsesWith(NewCS);
- CB.eraseFromParent();
-
- // This use is no longer unsafe.
- if (VCallSite.NumUnsafeUses)
- --*VCallSite.NumUnsafeUses;
- }
- // Don't mark as devirtualized because there may be callers compiled without
- // retpoline mitigation, which would mean that they are lowered to
- // llvm.type.test and therefore require an llvm.type.test resolution for the
- // type identifier.
- };
- Apply(SlotInfo.CSInfo);
- for (auto &P : SlotInfo.ConstCSInfo)
- Apply(P.second);
-}
-
-bool DevirtModule::tryEvaluateFunctionsWithArgs(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- ArrayRef<uint64_t> Args) {
- // Evaluate each function and store the result in each target's RetVal
- // field.
- for (VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.Fn->arg_size() != Args.size() + 1)
- return false;
-
- Evaluator Eval(M.getDataLayout(), nullptr);
- SmallVector<Constant *, 2> EvalArgs;
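-    // Pass a null 'this' pointer; the caller has already verified that none
-    // of the targets use their first argument.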
- EvalArgs.push_back(
- Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
- for (unsigned I = 0; I != Args.size(); ++I) {
- auto *ArgTy = dyn_cast<IntegerType>(
- Target.Fn->getFunctionType()->getParamType(I + 1));
- if (!ArgTy)
- return false;
- EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
- }
-
- Constant *RetVal;
- if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
- !isa<ConstantInt>(RetVal))
- return false;
- Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
- }
- return true;
-}
-
-void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- uint64_t TheRetVal) {
- for (auto Call : CSInfo.CallSites)
- Call.replaceAndErase(
- "uniform-ret-val", FnName, RemarksEnabled, OREGetter,
- ConstantInt::get(cast<IntegerType>(Call.CB.getType()), TheRetVal));
- CSInfo.markDevirt();
-}
-
-bool DevirtModule::tryUniformRetValOpt(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
- WholeProgramDevirtResolution::ByArg *Res) {
- // Uniform return value optimization. If all functions return the same
- // constant, replace all calls with that constant.
- uint64_t TheRetVal = TargetsForSlot[0].RetVal;
- for (const VirtualCallTarget &Target : TargetsForSlot)
- if (Target.RetVal != TheRetVal)
- return false;
-
- if (CSInfo.isExported()) {
- Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
- Res->Info = TheRetVal;
- }
-
- applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
- return true;
-}
-
-std::string DevirtModule::getGlobalName(VTableSlot Slot,
- ArrayRef<uint64_t> Args,
- StringRef Name) {
- std::string FullName = "__typeid_";
- raw_string_ostream OS(FullName);
- OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
- for (uint64_t Arg : Args)
- OS << '_' << Arg;
- OS << '_' << Name;
- return OS.str();
-}
-
-bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() {
- Triple T(M.getTargetTriple());
- return T.isX86() && T.getObjectFormat() == Triple::ELF;
-}
-
-void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, Constant *C) {
- GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
- getGlobalName(Slot, Args, Name), C, &M);
- GA->setVisibility(GlobalValue::HiddenVisibility);
-}
-
-void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, uint32_t Const,
- uint32_t &Storage) {
- if (shouldExportConstantsAsAbsoluteSymbols()) {
- exportGlobal(
- Slot, Args, Name,
- ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy));
- return;
- }
-
- Storage = Const;
-}
-
-Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name) {
- Constant *C =
- M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Arr0Ty);
- auto *GV = dyn_cast<GlobalVariable>(C);
- if (GV)
- GV->setVisibility(GlobalValue::HiddenVisibility);
- return C;
-}
-
-Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
- StringRef Name, IntegerType *IntTy,
- uint32_t Storage) {
- if (!shouldExportConstantsAsAbsoluteSymbols())
- return ConstantInt::get(IntTy, Storage);
-
- Constant *C = importGlobal(Slot, Args, Name);
- auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
- C = ConstantExpr::getPtrToInt(C, IntTy);
-
- // We only need to set metadata if the global is newly created, in which
- // case it would not have hidden visibility.
- if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
- return C;
-
- auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
- auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
- auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
- GV->setMetadata(LLVMContext::MD_absolute_symbol,
- MDNode::get(M.getContext(), {MinC, MaxC}));
- };
- unsigned AbsWidth = IntTy->getBitWidth();
- if (AbsWidth == IntPtrTy->getBitWidth())
- SetAbsRange(~0ull, ~0ull); // Full set.
- else
- SetAbsRange(0, 1ull << AbsWidth);
- return C;
-}
-
-void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
- bool IsOne,
- Constant *UniqueMemberAddr) {
- for (auto &&Call : CSInfo.CallSites) {
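-    // Replace the call with a comparison of the vtable pointer against the
-    // unique member address; the zext of that comparison reproduces the
-    // virtual function's return value.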
- IRBuilder<> B(&Call.CB);
- Value *Cmp =
- B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable,
- B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType()));
- Cmp = B.CreateZExt(Cmp, Call.CB.getType());
- Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
- Cmp);
- }
- CSInfo.markDevirt();
-}
-
-Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
- Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
- return ConstantExpr::getGetElementPtr(Int8Ty, C,
- ConstantInt::get(Int64Ty, M->Offset));
-}
-
-bool DevirtModule::tryUniqueRetValOpt(
- unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
- CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
- VTableSlot Slot, ArrayRef<uint64_t> Args) {
- // IsOne controls whether we look for a 0 or a 1.
- auto tryUniqueRetValOptFor = [&](bool IsOne) {
- const TypeMemberInfo *UniqueMember = nullptr;
- for (const VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.RetVal == (IsOne ? 1 : 0)) {
- if (UniqueMember)
- return false;
- UniqueMember = Target.TM;
- }
- }
-
- // We should have found a unique member or bailed out by now. We already
- // checked for a uniform return value in tryUniformRetValOpt.
- assert(UniqueMember);
-
- Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
- if (CSInfo.isExported()) {
- Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
- Res->Info = IsOne;
-
- exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
- }
-
- // Replace each call with the comparison.
- applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
- UniqueMemberAddr);
-
- // Update devirtualization statistics for targets.
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
-
- return true;
- };
-
- if (BitWidth == 1) {
- if (tryUniqueRetValOptFor(true))
- return true;
- if (tryUniqueRetValOptFor(false))
- return true;
- }
- return false;
-}
-
-void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
- Constant *Byte, Constant *Bit) {
- for (auto Call : CSInfo.CallSites) {
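-    // Replace the call with a load of the precomputed value stored at
-    // vtable + Byte; i1 return values are instead tested against the Bit
-    // mask.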
- auto *RetType = cast<IntegerType>(Call.CB.getType());
- IRBuilder<> B(&Call.CB);
- Value *Addr =
- B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
- if (RetType->getBitWidth() == 1) {
- Value *Bits = B.CreateLoad(Int8Ty, Addr);
- Value *BitsAndBit = B.CreateAnd(Bits, Bit);
- auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
- Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
- OREGetter, IsBitSet);
- } else {
- Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
- Value *Val = B.CreateLoad(RetType, ValAddr);
- Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled,
- OREGetter, Val);
- }
- }
- CSInfo.markDevirt();
-}
-
-bool DevirtModule::tryVirtualConstProp(
- MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
- WholeProgramDevirtResolution *Res, VTableSlot Slot) {
- // This only works if the function returns an integer.
- auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
- if (!RetType)
- return false;
- unsigned BitWidth = RetType->getBitWidth();
- if (BitWidth > 64)
- return false;
-
- // Make sure that each function is defined, does not access memory, takes at
- // least one argument, does not use its first argument (which we assume is
- // 'this'), and has the same return type.
- //
- // Note that we test whether this copy of the function is readnone, rather
- // than testing function attributes, which must hold for any copy of the
- // function, even a less optimized version substituted at link time. This is
- // sound because the virtual constant propagation optimizations effectively
- // inline all implementations of the virtual function into each call site,
- // rather than using function attributes to perform local optimization.
- for (VirtualCallTarget &Target : TargetsForSlot) {
- if (Target.Fn->isDeclaration() ||
- computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
- MAK_ReadNone ||
- Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
- Target.Fn->getReturnType() != RetType)
- return false;
- }
-
- for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
- if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
- continue;
-
- WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
- if (Res)
- ResByArg = &Res->ResByArg[CSByConstantArg.first];
-
- if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
- continue;
-
- if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
- ResByArg, Slot, CSByConstantArg.first))
- continue;
-
- // Find an allocation offset in bits in all vtables associated with the
- // type.
- uint64_t AllocBefore =
- findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
- uint64_t AllocAfter =
- findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
-
- // Calculate the total amount of padding needed to store a value at both
- // ends of the object.
- uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
- for (auto &&Target : TargetsForSlot) {
- TotalPaddingBefore += std::max<int64_t>(
- (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
- TotalPaddingAfter += std::max<int64_t>(
- (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
- }
-
- // If the amount of padding is too large, give up.
- // FIXME: do something smarter here.
- if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
- continue;
-
- // Calculate the offset to the value as a (possibly negative) byte offset
- // and (if applicable) a bit offset, and store the values in the targets.
- int64_t OffsetByte;
- uint64_t OffsetBit;
- if (TotalPaddingBefore <= TotalPaddingAfter)
- setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
- OffsetBit);
- else
- setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
- OffsetBit);
-
- if (RemarksEnabled)
- for (auto &&Target : TargetsForSlot)
- Target.WasDevirt = true;
-
- if (CSByConstantArg.second.isExported()) {
- ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
- exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte,
- ResByArg->Byte);
- exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit,
- ResByArg->Bit);
- }
-
- // Rewrite each call to a load from OffsetByte/OffsetBit.
- Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
- Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
- applyVirtualConstProp(CSByConstantArg.second,
- TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
- }
- return true;
-}
-
-void DevirtModule::rebuildGlobal(VTableBits &B) {
- if (B.Before.Bytes.empty() && B.After.Bytes.empty())
- return;
-
- // Align the before byte array to the global's minimum alignment so that we
- // don't break any alignment requirements on the global.
- Align Alignment = M.getDataLayout().getValueOrABITypeAlignment(
- B.GV->getAlign(), B.GV->getValueType());
- B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment));
-
- // Before was stored in reverse order; flip it now.
- for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
- std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
-
- // Build an anonymous global containing the before bytes, followed by the
- // original initializer, followed by the after bytes.
- auto NewInit = ConstantStruct::getAnon(
- {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
- B.GV->getInitializer(),
- ConstantDataArray::get(M.getContext(), B.After.Bytes)});
- auto NewGV =
- new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
- GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
- NewGV->setSection(B.GV->getSection());
- NewGV->setComdat(B.GV->getComdat());
- NewGV->setAlignment(MaybeAlign(B.GV->getAlignment()));
-
- // Copy the original vtable's metadata to the anonymous global, adjusting
- // offsets as required.
- NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
-
- // Build an alias named after the original global, pointing at the second
- // element (the original initializer).
- auto Alias = GlobalAlias::create(
- B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
- ConstantExpr::getGetElementPtr(
- NewInit->getType(), NewGV,
- ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, 1)}),
- &M);
- Alias->setVisibility(B.GV->getVisibility());
- Alias->takeName(B.GV);
-
- B.GV->replaceAllUsesWith(Alias);
- B.GV->eraseFromParent();
-}
-
-bool DevirtModule::areRemarksEnabled() {
- const auto &FL = M.getFunctionList();
- for (const Function &Fn : FL) {
- const auto &BBL = Fn.getBasicBlockList();
- if (BBL.empty())
- continue;
- auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
- return DI.isEnabled();
- }
- return false;
-}
-
-void DevirtModule::scanTypeTestUsers(
- Function *TypeTestFunc,
- DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
- // Find all virtual calls via a virtual table pointer %p under an assumption
- // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
- // points to a member of the type identifier %md. Group calls by (type ID,
- // offset) pair (effectively the identity of the virtual function) and store
- // to CallSlots.
- for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
- I != E;) {
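-    // Grab the user and advance the use iterator now; the type test call may
-    // be erased below by RemoveTypeTestAssumes.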
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
- if (!CI)
- continue;
-
- // Search for virtual calls based on %p and add them to DevirtCalls.
- SmallVector<DevirtCallSite, 1> DevirtCalls;
- SmallVector<CallInst *, 1> Assumes;
- auto &DT = LookupDomTree(*CI->getFunction());
- findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
-
- Metadata *TypeId =
- cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
- // If we found any, add them to CallSlots.
- if (!Assumes.empty()) {
- Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
- for (DevirtCallSite Call : DevirtCalls)
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB, nullptr);
- }
-
- auto RemoveTypeTestAssumes = [&]() {
- // We no longer need the assumes or the type test.
- for (auto Assume : Assumes)
- Assume->eraseFromParent();
- // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
- // may use the vtable argument later.
- if (CI->use_empty())
- CI->eraseFromParent();
- };
-
- // At this point we could remove all type test assume sequences, as they
- // were originally inserted for WPD. However, we can keep these in the
- // code stream for later analysis (e.g. to help drive more efficient ICP
- // sequences). They will eventually be removed by a second LowerTypeTests
- // invocation that cleans them up. In order to do this correctly, the first
- // LowerTypeTests invocation needs to know that they have "Unknown" type
- // test resolution, so that they aren't treated as Unsat and lowered to
- // False, which will break any uses on assumes. Below we remove any type
- // test assumes that will not be treated as Unknown by LTT.
-
- // The type test assumes will be treated by LTT as Unsat if the type id is
- // not used on a global (in which case it has no entry in the TypeIdMap).
- if (!TypeIdMap.count(TypeId))
- RemoveTypeTestAssumes();
-
- // For ThinLTO importing, we need to remove the type test assumes if this is
- // an MDString type id without a corresponding TypeIdSummary. Any
- // non-MDString type ids are ignored and treated as Unknown by LTT, so their
- // type test assumes can be kept. If the MDString type id is missing a
- // TypeIdSummary (e.g. because there was no use on a vcall, preventing the
- // exporting phase of WPD from analyzing it), then it would be treated as
- // Unsat by LTT and we need to remove its type test assumes here. If not
- // used on a vcall we don't need them for later optimization use in any
- // case.
- else if (ImportSummary && isa<MDString>(TypeId)) {
- const TypeIdSummary *TidSummary =
- ImportSummary->getTypeIdSummary(cast<MDString>(TypeId)->getString());
- if (!TidSummary)
- RemoveTypeTestAssumes();
- else
- // If one was created it should not be Unsat, because if we reached here
- // the type id was used on a global.
- assert(TidSummary->TTRes.TheKind != TypeTestResolution::Unsat);
- }
- }
-}
-
-void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
- Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
-
- for (auto I = TypeCheckedLoadFunc->use_begin(),
- E = TypeCheckedLoadFunc->use_end();
- I != E;) {
- auto CI = dyn_cast<CallInst>(I->getUser());
- ++I;
- if (!CI)
- continue;
-
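-    // llvm.type.checked.load takes the vtable pointer, a byte offset and a
-    // type id, and produces the loaded function pointer together with an i1
-    // type test result.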
- Value *Ptr = CI->getArgOperand(0);
- Value *Offset = CI->getArgOperand(1);
- Value *TypeIdValue = CI->getArgOperand(2);
- Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
-
- SmallVector<DevirtCallSite, 1> DevirtCalls;
- SmallVector<Instruction *, 1> LoadedPtrs;
- SmallVector<Instruction *, 1> Preds;
- bool HasNonCallUses = false;
- auto &DT = LookupDomTree(*CI->getFunction());
- findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
- HasNonCallUses, CI, DT);
-
- // Start by generating "pessimistic" code that explicitly loads the function
- // pointer from the vtable and performs the type check. If possible, we will
- // eliminate the load and the type check later.
-
- // If possible, only generate the load at the point where it is used.
- // This helps avoid unnecessary spills.
- IRBuilder<> LoadB(
- (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
- Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
- Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
- Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
-
- for (Instruction *LoadedPtr : LoadedPtrs) {
- LoadedPtr->replaceAllUsesWith(LoadedValue);
- LoadedPtr->eraseFromParent();
- }
-
- // Likewise for the type test.
- IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
- CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
-
- for (Instruction *Pred : Preds) {
- Pred->replaceAllUsesWith(TypeTestCall);
- Pred->eraseFromParent();
- }
-
- // We have already erased any extractvalue instructions that refer to the
- // intrinsic call, but the intrinsic may have other non-extractvalue uses
- // (although this is unlikely). In that case, explicitly build a pair and
- // RAUW it.
- if (!CI->use_empty()) {
- Value *Pair = UndefValue::get(CI->getType());
- IRBuilder<> B(CI);
- Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
- Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
- CI->replaceAllUsesWith(Pair);
- }
-
- // The number of unsafe uses is initially the number of uses.
- auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
- NumUnsafeUses = DevirtCalls.size();
-
- // If the function pointer has a non-call user, we cannot eliminate the type
- // check, as one of those users may eventually call the pointer. Increment
- // the unsafe use count to make sure it cannot reach zero.
- if (HasNonCallUses)
- ++NumUnsafeUses;
- for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB,
- &NumUnsafeUses);
- }
-
- CI->eraseFromParent();
- }
-}
-
-void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
- auto *TypeId = dyn_cast<MDString>(Slot.TypeID);
- if (!TypeId)
- return;
- const TypeIdSummary *TidSummary =
- ImportSummary->getTypeIdSummary(TypeId->getString());
- if (!TidSummary)
- return;
- auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
- if (ResI == TidSummary->WPDRes.end())
- return;
- const WholeProgramDevirtResolution &Res = ResI->second;
-
- if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
- assert(!Res.SingleImplName.empty());
- // The type of the function in the declaration is irrelevant because every
- // call site will cast it to the correct type.
- Constant *SingleImpl =
- cast<Constant>(M.getOrInsertFunction(Res.SingleImplName,
- Type::getVoidTy(M.getContext()))
- .getCallee());
-
- // This is the import phase so we should not be exporting anything.
- bool IsExported = false;
- applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
- assert(!IsExported);
- }
-
- for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
- auto I = Res.ResByArg.find(CSByConstantArg.first);
- if (I == Res.ResByArg.end())
- continue;
- auto &ResByArg = I->second;
- // FIXME: We should figure out what to do about the "function name" argument
- // to the apply* functions, as the function names are unavailable during the
- // importing phase. For now we just pass the empty string. This does not
- // impact correctness because the function names are just used for remarks.
- switch (ResByArg.TheKind) {
- case WholeProgramDevirtResolution::ByArg::UniformRetVal:
- applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
- break;
- case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
- Constant *UniqueMemberAddr =
- importGlobal(Slot, CSByConstantArg.first, "unique_member");
- applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
- UniqueMemberAddr);
- break;
- }
- case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
- Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte",
- Int32Ty, ResByArg.Byte);
- Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
- ResByArg.Bit);
- applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
- break;
- }
- default:
- break;
- }
- }
-
- if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
- // The type of the function is irrelevant, because it's bitcast at calls
- // anyhow.
- Constant *JT = cast<Constant>(
- M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
- Type::getVoidTy(M.getContext()))
- .getCallee());
- bool IsExported = false;
- applyICallBranchFunnel(SlotInfo, JT, IsExported);
- assert(!IsExported);
- }
-}
-
-void DevirtModule::removeRedundantTypeTests() {
- auto True = ConstantInt::getTrue(M.getContext());
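-  // A zero count means every virtual call guarded by this type test was
-  // devirtualized, so the test can be folded to true.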
- for (auto &&U : NumUnsafeUsesForTypeTest) {
- if (U.second == 0) {
- U.first->replaceAllUsesWith(True);
- U.first->eraseFromParent();
- }
- }
-}
-
-bool DevirtModule::run() {
- // If only some of the modules were split, we cannot correctly perform
-  // this transformation. We already checked for the presence of type tests
- // with partially split modules during the thin link, and would have emitted
- // an error if any were found, so here we can simply return.
- if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
- (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
- return false;
-
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
- Function *TypeCheckedLoadFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
- Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
-
- // Normally if there are no users of the devirtualization intrinsics in the
- // module, this pass has nothing to do. But if we are exporting, we also need
- // to handle any users that appear only in the function summaries.
- if (!ExportSummary &&
- (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
- AssumeFunc->use_empty()) &&
- (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
- return false;
-
- // Rebuild type metadata into a map for easy lookup.
- std::vector<VTableBits> Bits;
- DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
- buildTypeIdentifierMap(Bits, TypeIdMap);
-
- if (TypeTestFunc && AssumeFunc)
- scanTypeTestUsers(TypeTestFunc, TypeIdMap);
-
- if (TypeCheckedLoadFunc)
- scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
-
- if (ImportSummary) {
- for (auto &S : CallSlots)
- importResolution(S.first, S.second);
-
- removeRedundantTypeTests();
-
-    // We have lowered or deleted the type intrinsics, so we will no
- // longer have enough information to reason about the liveness of virtual
- // function pointers in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- // The rest of the code is only necessary when exporting or during regular
- // LTO, so we are done.
- return true;
- }
-
- if (TypeIdMap.empty())
- return true;
-
- // Collect information from summary about which calls to try to devirtualize.
- if (ExportSummary) {
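-    // Summaries refer to type ids by GUID, so build a map back from GUID to
-    // the MDString type ids used in this module's metadata.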
- DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
- for (auto &P : TypeIdMap) {
- if (auto *TypeId = dyn_cast<MDString>(P.first))
- MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
- TypeId);
- }
-
- for (auto &P : *ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- auto *FS = dyn_cast<FunctionSummary>(S.get());
- if (!FS)
- continue;
- // FIXME: Only add live functions.
- for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VF.GUID]) {
- CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_test_assume_const_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
- CallSlots[{MD, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_checked_load_const_vcalls()) {
- for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
- CallSlots[{MD, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeCheckedLoadUser(FS);
- }
- }
- }
- }
- }
-
- // For each (type, offset) pair:
- bool DidVirtualConstProp = false;
- std::map<std::string, Function*> DevirtTargets;
- for (auto &S : CallSlots) {
- // Search each of the members of the type identifier for the virtual
- // function implementation at offset S.first.ByteOffset, and add to
- // TargetsForSlot.
- std::vector<VirtualCallTarget> TargetsForSlot;
- WholeProgramDevirtResolution *Res = nullptr;
- const std::set<TypeMemberInfo> &TypeMemberInfos = TypeIdMap[S.first.TypeID];
- if (ExportSummary && isa<MDString>(S.first.TypeID) &&
- TypeMemberInfos.size())
- // For any type id used on a global's type metadata, create the type id
- // summary resolution regardless of whether we can devirtualize, so that
- // lower type tests knows the type id is not Unsat. If it was not used on
- // a global's type metadata, the TypeIdMap entry set will be empty, and
- // we don't want to create an entry (with the default Unknown type
- // resolution), which can prevent detection of the Unsat.
- Res = &ExportSummary
- ->getOrInsertTypeIdSummary(
- cast<MDString>(S.first.TypeID)->getString())
- .WPDRes[S.first.ByteOffset];
- if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
- S.first.ByteOffset)) {
-
- if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
- DidVirtualConstProp |=
- tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
-
- tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
- }
-
- // Collect functions devirtualized at least for one call site for stats.
- if (RemarksEnabled)
- for (const auto &T : TargetsForSlot)
- if (T.WasDevirt)
- DevirtTargets[std::string(T.Fn->getName())] = T.Fn;
- }
-
- // CFI-specific: if we are exporting and any llvm.type.checked.load
- // intrinsics were *not* devirtualized, we need to add the resulting
- // llvm.type.test intrinsics to the function summaries so that the
- // LowerTypeTests pass will export them.
- if (ExportSummary && isa<MDString>(S.first.TypeID)) {
- auto GUID =
- GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
- for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
- FS->addTypeTest(GUID);
- for (auto &CCS : S.second.ConstCSInfo)
- for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
- FS->addTypeTest(GUID);
- }
- }
-
- if (RemarksEnabled) {
- // Generate remarks for each devirtualized function.
- for (const auto &DT : DevirtTargets) {
- Function *F = DT.second;
-
- using namespace ore;
- OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
- << "devirtualized "
- << NV("FunctionName", DT.first));
- }
- }
-
- removeRedundantTypeTests();
-
- // Rebuild each global we touched as part of virtual constant propagation to
- // include the before and after bytes.
- if (DidVirtualConstProp)
- for (VTableBits &B : Bits)
- rebuildGlobal(B);
-
- // We have lowered or deleted the type instrinsics, so we will no
- // longer have enough information to reason about the liveness of virtual
- // function pointers in GlobalDCE.
- for (GlobalVariable &GV : M.globals())
- GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
-
- return true;
-}
-
-void DevirtIndex::run() {
- if (ExportSummary.typeIdCompatibleVtableMap().empty())
- return;
-
- DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
- for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
- NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first);
- }
-
- // Collect information from summary about which calls to try to devirtualize.
- for (auto &P : ExportSummary) {
- for (auto &S : P.second.SummaryList) {
- auto *FS = dyn_cast<FunctionSummary>(S.get());
- if (!FS)
- continue;
- // FIXME: Only add live functions.
- for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
- for (StringRef Name : NameByGUID[VF.GUID]) {
- CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
- for (StringRef Name : NameByGUID[VF.GUID]) {
- CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_test_assume_const_vcalls()) {
- for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
- CallSlots[{Name, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeTestAssumeUser(FS);
- }
- }
- for (const FunctionSummary::ConstVCall &VC :
- FS->type_checked_load_const_vcalls()) {
- for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
- CallSlots[{Name, VC.VFunc.Offset}]
- .ConstCSInfo[VC.Args]
- .addSummaryTypeCheckedLoadUser(FS);
- }
- }
- }
- }
-
- std::set<ValueInfo> DevirtTargets;
- // For each (type, offset) pair:
- for (auto &S : CallSlots) {
- // Search each of the members of the type identifier for the virtual
- // function implementation at offset S.first.ByteOffset, and add to
- // TargetsForSlot.
- std::vector<ValueInfo> TargetsForSlot;
- auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID);
- assert(TidSummary);
- // Create the type id summary resolution regardlness of whether we can
- // devirtualize, so that lower type tests knows the type id is used on
- // a global and not Unsat.
- WholeProgramDevirtResolution *Res =
- &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
- .WPDRes[S.first.ByteOffset];
- if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary,
- S.first.ByteOffset)) {
-
- if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res,
- DevirtTargets))
- continue;
- }
- }
-
- // Optionally have the thin link print message for each devirtualized
- // function.
- if (PrintSummaryDevirt)
- for (const auto &DT : DevirtTargets)
- errs() << "Devirtualized call to " << DT << "\n";
-}
+
+ CallBase *NewCS = nullptr;
+ if (isa<CallInst>(CB))
+ NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args);
+ else
+ NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr),
+ cast<InvokeInst>(CB).getNormalDest(),
+ cast<InvokeInst>(CB).getUnwindDest(), Args);
+ NewCS->setCallingConv(CB.getCallingConv());
+
+ AttributeList Attrs = CB.getAttributes();
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgAttrs.push_back(AttributeSet::get(
+ M.getContext(), ArrayRef<Attribute>{Attribute::get(
+ M.getContext(), Attribute::Nest)}));
+ for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
+ NewArgAttrs.push_back(Attrs.getParamAttributes(I));
+ NewCS->setAttributes(
+ AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs));
+
+ CB.replaceAllUsesWith(NewCS);
+ CB.eraseFromParent();
+
+ // This use is no longer unsafe.
+ if (VCallSite.NumUnsafeUses)
+ --*VCallSite.NumUnsafeUses;
+ }
+ // Don't mark as devirtualized because there may be callers compiled without
+ // retpoline mitigation, which would mean that they are lowered to
+ // llvm.type.test and therefore require an llvm.type.test resolution for the
+ // type identifier.
+ };
+ Apply(SlotInfo.CSInfo);
+ for (auto &P : SlotInfo.ConstCSInfo)
+ Apply(P.second);
+}
+
+bool DevirtModule::tryEvaluateFunctionsWithArgs(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ ArrayRef<uint64_t> Args) {
+ // Evaluate each function and store the result in each target's RetVal
+ // field.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->arg_size() != Args.size() + 1)
+ return false;
+
+ Evaluator Eval(M.getDataLayout(), nullptr);
+ SmallVector<Constant *, 2> EvalArgs;
+ EvalArgs.push_back(
+ Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
+ for (unsigned I = 0; I != Args.size(); ++I) {
+ auto *ArgTy = dyn_cast<IntegerType>(
+ Target.Fn->getFunctionType()->getParamType(I + 1));
+ if (!ArgTy)
+ return false;
+ EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+ }
+
+ Constant *RetVal;
+ if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
+ !isa<ConstantInt>(RetVal))
+ return false;
+ Target.RetVal = cast<ConstantInt>(RetVal)->getZExtValue();
+ }
+ return true;
+}
+
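+// Replace every call site in CSInfo with the constant return value common to
+// all targets, then mark the call sites as devirtualized.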
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ uint64_t TheRetVal) {
+ for (auto Call : CSInfo.CallSites)
+ Call.replaceAndErase(
+ "uniform-ret-val", FnName, RemarksEnabled, OREGetter,
+ ConstantInt::get(cast<IntegerType>(Call.CB.getType()), TheRetVal));
+ CSInfo.markDevirt();
+}
+
+bool DevirtModule::tryUniformRetValOpt(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+ WholeProgramDevirtResolution::ByArg *Res) {
+ // Uniform return value optimization. If all functions return the same
+ // constant, replace all calls with that constant.
+ uint64_t TheRetVal = TargetsForSlot[0].RetVal;
+ for (const VirtualCallTarget &Target : TargetsForSlot)
+ if (Target.RetVal != TheRetVal)
+ return false;
+
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+ Res->Info = TheRetVal;
+ }
+
+ applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+ return true;
+}
+
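+// Build the name of the symbol used to export or import a resolution for this
+// slot: "__typeid_" followed by the type id, the byte offset, any constant
+// call arguments, and a resolution-specific suffix.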
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+ ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ std::string FullName = "__typeid_";
+ raw_string_ostream OS(FullName);
+ OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+ for (uint64_t Arg : Args)
+ OS << '_' << Arg;
+ OS << '_' << Name;
+ return OS.str();
+}
+
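+// Whether to export constants as absolute symbols; this is only done for
+// x86 ELF targets.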
+bool DevirtModule::shouldExportConstantsAsAbsoluteSymbols() {
+ Triple T(M.getTargetTriple());
+ return T.isX86() && T.getObjectFormat() == Triple::ELF;
+}
+
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, Constant *C) {
+ GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+ getGlobalName(Slot, Args, Name), C, &M);
+ GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
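+// Export a constant either as an absolute symbol or, when absolute symbols
+// are not used, by recording it in the summary field referenced by Storage.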
+void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, uint32_t Const,
+ uint32_t &Storage) {
+ if (shouldExportConstantsAsAbsoluteSymbols()) {
+ exportGlobal(
+ Slot, Args, Name,
+ ConstantExpr::getIntToPtr(ConstantInt::get(Int32Ty, Const), Int8PtrTy));
+ return;
+ }
+
+ Storage = Const;
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name) {
+ Constant *C =
+ M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Arr0Ty);
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ if (GV)
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ return C;
+}
+
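+// Import a constant written by exportConstant: either read it back from the
+// summary storage or reference the exported symbol, attaching
+// !absolute_symbol metadata describing its value range.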
+Constant *DevirtModule::importConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
+ StringRef Name, IntegerType *IntTy,
+ uint32_t Storage) {
+ if (!shouldExportConstantsAsAbsoluteSymbols())
+ return ConstantInt::get(IntTy, Storage);
+
+ Constant *C = importGlobal(Slot, Args, Name);
+ auto *GV = cast<GlobalVariable>(C->stripPointerCasts());
+ C = ConstantExpr::getPtrToInt(C, IntTy);
+
+ // We only need to set metadata if the global is newly created, in which
+ // case it would not have hidden visibility.
+ if (GV->hasMetadata(LLVMContext::MD_absolute_symbol))
+ return C;
+
+ auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+ auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+ auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+ GV->setMetadata(LLVMContext::MD_absolute_symbol,
+ MDNode::get(M.getContext(), {MinC, MaxC}));
+ };
+ unsigned AbsWidth = IntTy->getBitWidth();
+ if (AbsWidth == IntPtrTy->getBitWidth())
+ SetAbsRange(~0ull, ~0ull); // Full set.
+ else
+ SetAbsRange(0, 1ull << AbsWidth);
+ return C;
+}
+
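+// Replace each call with a comparison of its vtable pointer against the
+// address of the unique member that returns the sought value, zero-extended
+// to the call's return type.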
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+ bool IsOne,
+ Constant *UniqueMemberAddr) {
+ for (auto &&Call : CSInfo.CallSites) {
+ IRBuilder<> B(&Call.CB);
+ Value *Cmp =
+ B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable,
+ B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType()));
+ Cmp = B.CreateZExt(Cmp, Call.CB.getType());
+ Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
+ Cmp);
+ }
+ CSInfo.markDevirt();
+}
+
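+// Compute the address of a type member as a byte offset into its vtable
+// global.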
+Constant *DevirtModule::getMemberAddr(const TypeMemberInfo *M) {
+ Constant *C = ConstantExpr::getBitCast(M->Bits->GV, Int8PtrTy);
+ return ConstantExpr::getGetElementPtr(Int8Ty, C,
+ ConstantInt::get(Int64Ty, M->Offset));
+}
+
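+// Unique return value optimization: for i1 return types, if exactly one
+// target returns 1 (or 0), each call can be replaced with a comparison of the
+// vtable pointer against that member's address.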
+bool DevirtModule::tryUniqueRetValOpt(
+ unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+ CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+ VTableSlot Slot, ArrayRef<uint64_t> Args) {
+ // IsOne controls whether we look for a 0 or a 1.
+ auto tryUniqueRetValOptFor = [&](bool IsOne) {
+ const TypeMemberInfo *UniqueMember = nullptr;
+ for (const VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.RetVal == (IsOne ? 1 : 0)) {
+ if (UniqueMember)
+ return false;
+ UniqueMember = Target.TM;
+ }
+ }
+
+ // We should have found a unique member or bailed out by now. We already
+ // checked for a uniform return value in tryUniformRetValOpt.
+ assert(UniqueMember);
+
+ Constant *UniqueMemberAddr = getMemberAddr(UniqueMember);
+ if (CSInfo.isExported()) {
+ Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+ Res->Info = IsOne;
+
+ exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
+ }
+
+ // Replace each call with the comparison.
+ applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+ UniqueMemberAddr);
+
+ // Update devirtualization statistics for targets.
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ return true;
+ };
+
+ if (BitWidth == 1) {
+ if (tryUniqueRetValOptFor(true))
+ return true;
+ if (tryUniqueRetValOptFor(false))
+ return true;
+ }
+ return false;
+}
+
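+// Rewrite each call as a load of the propagated constant stored alongside the
+// vtable: i1 returns become a masked byte load, wider integers a load of the
+// full return type at the given byte offset.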
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+ Constant *Byte, Constant *Bit) {
+ for (auto Call : CSInfo.CallSites) {
+ auto *RetType = cast<IntegerType>(Call.CB.getType());
+ IRBuilder<> B(&Call.CB);
+ Value *Addr =
+ B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
+ if (RetType->getBitWidth() == 1) {
+ Value *Bits = B.CreateLoad(Int8Ty, Addr);
+ Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+ auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+ Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+ OREGetter, IsBitSet);
+ } else {
+ Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+ Value *Val = B.CreateLoad(RetType, ValAddr);
+ Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled,
+ OREGetter, Val);
+ }
+ }
+ CSInfo.markDevirt();
+}
+
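+// Virtual constant propagation: when each target can be constant-evaluated
+// for a given set of constant call arguments, fold the result into the call
+// sites, either directly (uniform/unique return values) or by storing the
+// per-vtable results next to each vtable and loading them at the calls.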
+bool DevirtModule::tryVirtualConstProp(
+ MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+ WholeProgramDevirtResolution *Res, VTableSlot Slot) {
+ // This only works if the function returns an integer.
+ auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
+ if (!RetType)
+ return false;
+ unsigned BitWidth = RetType->getBitWidth();
+ if (BitWidth > 64)
+ return false;
+
+ // Make sure that each function is defined, does not access memory, takes at
+ // least one argument, does not use its first argument (which we assume is
+ // 'this'), and has the same return type.
+ //
+ // Note that we test whether this copy of the function is readnone, rather
+ // than testing function attributes, which must hold for any copy of the
+ // function, even a less optimized version substituted at link time. This is
+ // sound because the virtual constant propagation optimizations effectively
+ // inline all implementations of the virtual function into each call site,
+ // rather than using function attributes to perform local optimization.
+ for (VirtualCallTarget &Target : TargetsForSlot) {
+ if (Target.Fn->isDeclaration() ||
+ computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+ MAK_ReadNone ||
+ Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
+ Target.Fn->getReturnType() != RetType)
+ return false;
+ }
+
+ for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
+ if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
+ continue;
+
+ WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+ if (Res)
+ ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+ if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
+ continue;
+
+ if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+ ResByArg, Slot, CSByConstantArg.first))
+ continue;
+
+ // Find an allocation offset in bits in all vtables associated with the
+ // type.
+ uint64_t AllocBefore =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/false, BitWidth);
+ uint64_t AllocAfter =
+ findLowestOffset(TargetsForSlot, /*IsAfter=*/true, BitWidth);
+
+ // Calculate the total amount of padding needed to store a value at both
+ // ends of the object.
+ uint64_t TotalPaddingBefore = 0, TotalPaddingAfter = 0;
+ for (auto &&Target : TargetsForSlot) {
+ TotalPaddingBefore += std::max<int64_t>(
+ (AllocBefore + 7) / 8 - Target.allocatedBeforeBytes() - 1, 0);
+ TotalPaddingAfter += std::max<int64_t>(
+ (AllocAfter + 7) / 8 - Target.allocatedAfterBytes() - 1, 0);
+ }
+
+ // If the amount of padding is too large, give up.
+ // FIXME: do something smarter here.
+ if (std::min(TotalPaddingBefore, TotalPaddingAfter) > 128)
+ continue;
+
+ // Calculate the offset to the value as a (possibly negative) byte offset
+ // and (if applicable) a bit offset, and store the values in the targets.
+ int64_t OffsetByte;
+ uint64_t OffsetBit;
+ if (TotalPaddingBefore <= TotalPaddingAfter)
+ setBeforeReturnValues(TargetsForSlot, AllocBefore, BitWidth, OffsetByte,
+ OffsetBit);
+ else
+ setAfterReturnValues(TargetsForSlot, AllocAfter, BitWidth, OffsetByte,
+ OffsetBit);
+
+ if (RemarksEnabled)
+ for (auto &&Target : TargetsForSlot)
+ Target.WasDevirt = true;
+
+ if (CSByConstantArg.second.isExported()) {
+ ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+ exportConstant(Slot, CSByConstantArg.first, "byte", OffsetByte,
+ ResByArg->Byte);
+ exportConstant(Slot, CSByConstantArg.first, "bit", 1ULL << OffsetBit,
+ ResByArg->Bit);
+ }
+
+ // Rewrite each call to a load from OffsetByte/OffsetBit.
+ Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+ Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+ applyVirtualConstProp(CSByConstantArg.second,
+ TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
+ }
+ return true;
+}
+
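+// Rebuild a vtable global whose Before/After byte arrays were populated by
+// virtual constant propagation: create a new private global containing the
+// extra bytes around the original initializer, and alias the original name to
+// the initializer within it.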
+void DevirtModule::rebuildGlobal(VTableBits &B) {
+ if (B.Before.Bytes.empty() && B.After.Bytes.empty())
+ return;
+
+ // Align the before byte array to the global's minimum alignment so that we
+ // don't break any alignment requirements on the global.
+ Align Alignment = M.getDataLayout().getValueOrABITypeAlignment(
+ B.GV->getAlign(), B.GV->getValueType());
+ B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment));
+
+ // Before was stored in reverse order; flip it now.
+ for (size_t I = 0, Size = B.Before.Bytes.size(); I != Size / 2; ++I)
+ std::swap(B.Before.Bytes[I], B.Before.Bytes[Size - 1 - I]);
+
+ // Build an anonymous global containing the before bytes, followed by the
+ // original initializer, followed by the after bytes.
+ auto NewInit = ConstantStruct::getAnon(
+ {ConstantDataArray::get(M.getContext(), B.Before.Bytes),
+ B.GV->getInitializer(),
+ ConstantDataArray::get(M.getContext(), B.After.Bytes)});
+ auto NewGV =
+ new GlobalVariable(M, NewInit->getType(), B.GV->isConstant(),
+ GlobalVariable::PrivateLinkage, NewInit, "", B.GV);
+ NewGV->setSection(B.GV->getSection());
+ NewGV->setComdat(B.GV->getComdat());
+ NewGV->setAlignment(MaybeAlign(B.GV->getAlignment()));
+
+ // Copy the original vtable's metadata to the anonymous global, adjusting
+ // offsets as required.
+ NewGV->copyMetadata(B.GV, B.Before.Bytes.size());
+
+ // Build an alias named after the original global, pointing at the second
+ // element (the original initializer).
+ auto Alias = GlobalAlias::create(
+ B.GV->getInitializer()->getType(), 0, B.GV->getLinkage(), "",
+ ConstantExpr::getGetElementPtr(
+ NewInit->getType(), NewGV,
+ ArrayRef<Constant *>{ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, 1)}),
+ &M);
+ Alias->setVisibility(B.GV->getVisibility());
+ Alias->takeName(B.GV);
+
+ B.GV->replaceAllUsesWith(Alias);
+ B.GV->eraseFromParent();
+}
+
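+// Determine whether optimization remarks are enabled by constructing a remark
+// against the first function in the module that has a body.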
+bool DevirtModule::areRemarksEnabled() {
+ const auto &FL = M.getFunctionList();
+ for (const Function &Fn : FL) {
+ const auto &BBL = Fn.getBasicBlockList();
+ if (BBL.empty())
+ continue;
+ auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
+ return DI.isEnabled();
+ }
+ return false;
+}
+
+void DevirtModule::scanTypeTestUsers(
+ Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
+ // Find all virtual calls via a virtual table pointer %p under an assumption
+ // of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
+ // points to a member of the type identifier %md. Group calls by (type ID,
+ // offset) pair (effectively the identity of the virtual function) and store
+ // to CallSlots.
+ for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ // Search for virtual calls based on %p and add them to DevirtCalls.
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<CallInst *, 1> Assumes;
+ auto &DT = LookupDomTree(*CI->getFunction());
+ findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+
+ Metadata *TypeId =
+ cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
+ // If we found any, add them to CallSlots.
+ if (!Assumes.empty()) {
+ Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
+ for (DevirtCallSite Call : DevirtCalls)
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB, nullptr);
+ }
+
+ auto RemoveTypeTestAssumes = [&]() {
+ // We no longer need the assumes or the type test.
+ for (auto Assume : Assumes)
+ Assume->eraseFromParent();
+ // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
+ // may use the vtable argument later.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+ };
+
+ // At this point we could remove all type test assume sequences, as they
+ // were originally inserted for WPD. However, we can keep these in the
+ // code stream for later analysis (e.g. to help drive more efficient ICP
+ // sequences). They will eventually be removed by a second LowerTypeTests
+ // invocation that cleans them up. In order to do this correctly, the first
+ // LowerTypeTests invocation needs to know that they have "Unknown" type
+ // test resolution, so that they aren't treated as Unsat and lowered to
+ // False, which will break any uses on assumes. Below we remove any type
+    // False, which would break any uses of the assumes. Below we remove any
+    // type test assumes that will not be treated as Unknown by LTT.
+ // The type test assumes will be treated by LTT as Unsat if the type id is
+ // not used on a global (in which case it has no entry in the TypeIdMap).
+ if (!TypeIdMap.count(TypeId))
+ RemoveTypeTestAssumes();
+
+ // For ThinLTO importing, we need to remove the type test assumes if this is
+ // an MDString type id without a corresponding TypeIdSummary. Any
+ // non-MDString type ids are ignored and treated as Unknown by LTT, so their
+ // type test assumes can be kept. If the MDString type id is missing a
+ // TypeIdSummary (e.g. because there was no use on a vcall, preventing the
+ // exporting phase of WPD from analyzing it), then it would be treated as
+ // Unsat by LTT and we need to remove its type test assumes here. If not
+ // used on a vcall we don't need them for later optimization use in any
+ // case.
+ else if (ImportSummary && isa<MDString>(TypeId)) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(TypeId)->getString());
+ if (!TidSummary)
+ RemoveTypeTestAssumes();
+ else
+ // If one was created it should not be Unsat, because if we reached here
+ // the type id was used on a global.
+ assert(TidSummary->TTRes.TheKind != TypeTestResolution::Unsat);
+ }
+ }
+}
+
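+// Lower each llvm.type.checked.load to an explicit vtable load plus an
+// llvm.type.test, and record the devirtualizable call sites in CallSlots,
+// tracking the number of unsafe (non-devirtualized) uses of each type test.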
+void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
+ Function *TypeTestFunc = Intrinsic::getDeclaration(&M, Intrinsic::type_test);
+
+ for (auto I = TypeCheckedLoadFunc->use_begin(),
+ E = TypeCheckedLoadFunc->use_end();
+ I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI)
+ continue;
+
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Offset = CI->getArgOperand(1);
+ Value *TypeIdValue = CI->getArgOperand(2);
+ Metadata *TypeId = cast<MetadataAsValue>(TypeIdValue)->getMetadata();
+
+ SmallVector<DevirtCallSite, 1> DevirtCalls;
+ SmallVector<Instruction *, 1> LoadedPtrs;
+ SmallVector<Instruction *, 1> Preds;
+ bool HasNonCallUses = false;
+ auto &DT = LookupDomTree(*CI->getFunction());
+ findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+ HasNonCallUses, CI, DT);
+
+ // Start by generating "pessimistic" code that explicitly loads the function
+ // pointer from the vtable and performs the type check. If possible, we will
+ // eliminate the load and the type check later.
+
+ // If possible, only generate the load at the point where it is used.
+ // This helps avoid unnecessary spills.
+ IRBuilder<> LoadB(
+ (LoadedPtrs.size() == 1 && !HasNonCallUses) ? LoadedPtrs[0] : CI);
+ Value *GEP = LoadB.CreateGEP(Int8Ty, Ptr, Offset);
+ Value *GEPPtr = LoadB.CreateBitCast(GEP, PointerType::getUnqual(Int8PtrTy));
+ Value *LoadedValue = LoadB.CreateLoad(Int8PtrTy, GEPPtr);
+
+ for (Instruction *LoadedPtr : LoadedPtrs) {
+ LoadedPtr->replaceAllUsesWith(LoadedValue);
+ LoadedPtr->eraseFromParent();
+ }
+
+ // Likewise for the type test.
+ IRBuilder<> CallB((Preds.size() == 1 && !HasNonCallUses) ? Preds[0] : CI);
+ CallInst *TypeTestCall = CallB.CreateCall(TypeTestFunc, {Ptr, TypeIdValue});
+
+ for (Instruction *Pred : Preds) {
+ Pred->replaceAllUsesWith(TypeTestCall);
+ Pred->eraseFromParent();
+ }
+
+ // We have already erased any extractvalue instructions that refer to the
+ // intrinsic call, but the intrinsic may have other non-extractvalue uses
+ // (although this is unlikely). In that case, explicitly build a pair and
+ // RAUW it.
+ if (!CI->use_empty()) {
+ Value *Pair = UndefValue::get(CI->getType());
+ IRBuilder<> B(CI);
+ Pair = B.CreateInsertValue(Pair, LoadedValue, {0});
+ Pair = B.CreateInsertValue(Pair, TypeTestCall, {1});
+ CI->replaceAllUsesWith(Pair);
+ }
+
+ // The number of unsafe uses is initially the number of uses.
+ auto &NumUnsafeUses = NumUnsafeUsesForTypeTest[TypeTestCall];
+ NumUnsafeUses = DevirtCalls.size();
+
+ // If the function pointer has a non-call user, we cannot eliminate the type
+ // check, as one of those users may eventually call the pointer. Increment
+ // the unsafe use count to make sure it cannot reach zero.
+ if (HasNonCallUses)
+ ++NumUnsafeUses;
+ for (DevirtCallSite Call : DevirtCalls) {
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB,
+ &NumUnsafeUses);
+ }
+
+ CI->eraseFromParent();
+ }
+}
+
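+// Apply a resolution imported from the summary index (computed during the
+// thin link) to the call sites of this vtable slot.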
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+ auto *TypeId = dyn_cast<MDString>(Slot.TypeID);
+ if (!TypeId)
+ return;
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(TypeId->getString());
+ if (!TidSummary)
+ return;
+ auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+ if (ResI == TidSummary->WPDRes.end())
+ return;
+ const WholeProgramDevirtResolution &Res = ResI->second;
+
+ if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+ assert(!Res.SingleImplName.empty());
+ // The type of the function in the declaration is irrelevant because every
+ // call site will cast it to the correct type.
+ Constant *SingleImpl =
+ cast<Constant>(M.getOrInsertFunction(Res.SingleImplName,
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+
+ // This is the import phase so we should not be exporting anything.
+ bool IsExported = false;
+ applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+ assert(!IsExported);
+ }
+
+ for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+ auto I = Res.ResByArg.find(CSByConstantArg.first);
+ if (I == Res.ResByArg.end())
+ continue;
+ auto &ResByArg = I->second;
+ // FIXME: We should figure out what to do about the "function name" argument
+ // to the apply* functions, as the function names are unavailable during the
+ // importing phase. For now we just pass the empty string. This does not
+ // impact correctness because the function names are just used for remarks.
+ switch (ResByArg.TheKind) {
+ case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+ applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+ break;
+ case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+ Constant *UniqueMemberAddr =
+ importGlobal(Slot, CSByConstantArg.first, "unique_member");
+ applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+ UniqueMemberAddr);
+ break;
+ }
+ case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+ Constant *Byte = importConstant(Slot, CSByConstantArg.first, "byte",
+ Int32Ty, ResByArg.Byte);
+ Constant *Bit = importConstant(Slot, CSByConstantArg.first, "bit", Int8Ty,
+ ResByArg.Bit);
+ applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ if (Res.TheKind == WholeProgramDevirtResolution::BranchFunnel) {
+ // The type of the function is irrelevant, because it's bitcast at calls
+ // anyhow.
+ Constant *JT = cast<Constant>(
+ M.getOrInsertFunction(getGlobalName(Slot, {}, "branch_funnel"),
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+ bool IsExported = false;
+ applyICallBranchFunnel(SlotInfo, JT, IsExported);
+ assert(!IsExported);
+ }
+}
+
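+// Replace type tests whose uses were all devirtualized (no unsafe uses
+// remain) with true and erase them.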
+void DevirtModule::removeRedundantTypeTests() {
+ auto True = ConstantInt::getTrue(M.getContext());
+ for (auto &&U : NumUnsafeUsesForTypeTest) {
+ if (U.second == 0) {
+ U.first->replaceAllUsesWith(True);
+ U.first->eraseFromParent();
+ }
+ }
+}
+
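+// Module-level entry point: scan the type intrinsics, then either apply
+// imported resolutions (ThinLTO import phase) or compute and apply new ones,
+// and finally clean up redundant type tests and vcall visibility metadata.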
+bool DevirtModule::run() {
+ // If only some of the modules were split, we cannot correctly perform
+  // this transformation. We already checked for the presence of type tests
+ // with partially split modules during the thin link, and would have emitted
+ // an error if any were found, so here we can simply return.
+ if ((ExportSummary && ExportSummary->partiallySplitLTOUnits()) ||
+ (ImportSummary && ImportSummary->partiallySplitLTOUnits()))
+ return false;
+
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+ Function *TypeCheckedLoadFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
+ Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
+
+ // Normally if there are no users of the devirtualization intrinsics in the
+ // module, this pass has nothing to do. But if we are exporting, we also need
+ // to handle any users that appear only in the function summaries.
+ if (!ExportSummary &&
+ (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+ AssumeFunc->use_empty()) &&
+ (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
+ return false;
+
+ // Rebuild type metadata into a map for easy lookup.
+ std::vector<VTableBits> Bits;
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
+ buildTypeIdentifierMap(Bits, TypeIdMap);
+
+ if (TypeTestFunc && AssumeFunc)
+ scanTypeTestUsers(TypeTestFunc, TypeIdMap);
+
+ if (TypeCheckedLoadFunc)
+ scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
+
+ if (ImportSummary) {
+ for (auto &S : CallSlots)
+ importResolution(S.first, S.second);
+
+ removeRedundantTypeTests();
+
+    // We have lowered or deleted the type intrinsics, so we will no
+ // longer have enough information to reason about the liveness of virtual
+ // function pointers in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ // The rest of the code is only necessary when exporting or during regular
+ // LTO, so we are done.
+ return true;
+ }
+
+ if (TypeIdMap.empty())
+ return true;
+
+ // Collect information from summary about which calls to try to devirtualize.
+ if (ExportSummary) {
+ DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+ for (auto &P : TypeIdMap) {
+ if (auto *TypeId = dyn_cast<MDString>(P.first))
+ MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+ TypeId);
+ }
+
+ for (auto &P : *ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+ CallSlots[{MD, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+ CallSlots[{MD, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ }
+ }
+ }
+
+ // For each (type, offset) pair:
+ bool DidVirtualConstProp = false;
+ std::map<std::string, Function*> DevirtTargets;
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<VirtualCallTarget> TargetsForSlot;
+ WholeProgramDevirtResolution *Res = nullptr;
+ const std::set<TypeMemberInfo> &TypeMemberInfos = TypeIdMap[S.first.TypeID];
+ if (ExportSummary && isa<MDString>(S.first.TypeID) &&
+ TypeMemberInfos.size())
+ // For any type id used on a global's type metadata, create the type id
+ // summary resolution regardless of whether we can devirtualize, so that
+ // lower type tests knows the type id is not Unsat. If it was not used on
+ // a global's type metadata, the TypeIdMap entry set will be empty, and
+ // we don't want to create an entry (with the default Unknown type
+ // resolution), which can prevent detection of the Unsat.
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
+ S.first.ByteOffset)) {
+
+ if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
+ DidVirtualConstProp |=
+ tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first);
+
+ tryICallBranchFunnel(TargetsForSlot, S.second, Res, S.first);
+ }
+
+      // Collect functions devirtualized for at least one call site, for stats.
+ if (RemarksEnabled)
+ for (const auto &T : TargetsForSlot)
+ if (T.WasDevirt)
+ DevirtTargets[std::string(T.Fn->getName())] = T.Fn;
+ }
+
+ // CFI-specific: if we are exporting and any llvm.type.checked.load
+ // intrinsics were *not* devirtualized, we need to add the resulting
+ // llvm.type.test intrinsics to the function summaries so that the
+ // LowerTypeTests pass will export them.
+ if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+ auto GUID =
+ GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+ for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ for (auto &CCS : S.second.ConstCSInfo)
+ for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+ FS->addTypeTest(GUID);
+ }
+ }
+
+ if (RemarksEnabled) {
+ // Generate remarks for each devirtualized function.
+ for (const auto &DT : DevirtTargets) {
+ Function *F = DT.second;
+
+ using namespace ore;
+ OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, "Devirtualized", F)
+ << "devirtualized "
+ << NV("FunctionName", DT.first));
+ }
+ }
+
+ removeRedundantTypeTests();
+
+ // Rebuild each global we touched as part of virtual constant propagation to
+ // include the before and after bytes.
+ if (DidVirtualConstProp)
+ for (VTableBits &B : Bits)
+ rebuildGlobal(B);
+
+  // We have lowered or deleted the type intrinsics, so we will no
+ // longer have enough information to reason about the liveness of virtual
+ // function pointers in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ return true;
+}
+
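+// Index-only entry point: devirtualize using only the combined summary, as is
+// done during the thin link.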
+void DevirtIndex::run() {
+ if (ExportSummary.typeIdCompatibleVtableMap().empty())
+ return;
+
+ DenseMap<GlobalValue::GUID, std::vector<StringRef>> NameByGUID;
+ for (auto &P : ExportSummary.typeIdCompatibleVtableMap()) {
+ NameByGUID[GlobalValue::getGUID(P.first)].push_back(P.first);
+ }
+
+ // Collect information from summary about which calls to try to devirtualize.
+ for (auto &P : ExportSummary) {
+ for (auto &S : P.second.SummaryList) {
+ auto *FS = dyn_cast<FunctionSummary>(S.get());
+ if (!FS)
+ continue;
+ // FIXME: Only add live functions.
+ for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+ for (StringRef Name : NameByGUID[VF.GUID]) {
+ CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+ for (StringRef Name : NameByGUID[VF.GUID]) {
+ CallSlots[{Name, VF.Offset}].CSInfo.addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_test_assume_const_vcalls()) {
+ for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
+ CallSlots[{Name, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeTestAssumeUser(FS);
+ }
+ }
+ for (const FunctionSummary::ConstVCall &VC :
+ FS->type_checked_load_const_vcalls()) {
+ for (StringRef Name : NameByGUID[VC.VFunc.GUID]) {
+ CallSlots[{Name, VC.VFunc.Offset}]
+ .ConstCSInfo[VC.Args]
+ .addSummaryTypeCheckedLoadUser(FS);
+ }
+ }
+ }
+ }
+
+ std::set<ValueInfo> DevirtTargets;
+ // For each (type, offset) pair:
+ for (auto &S : CallSlots) {
+ // Search each of the members of the type identifier for the virtual
+ // function implementation at offset S.first.ByteOffset, and add to
+ // TargetsForSlot.
+ std::vector<ValueInfo> TargetsForSlot;
+ auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID);
+ assert(TidSummary);
+    // Create the type id summary resolution regardless of whether we can
+ // devirtualize, so that lower type tests knows the type id is used on
+ // a global and not Unsat.
+ WholeProgramDevirtResolution *Res =
+ &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
+ .WPDRes[S.first.ByteOffset];
+ if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary,
+ S.first.ByteOffset)) {
+
+ if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res,
+ DevirtTargets))
+ continue;
+ }
+ }
+
+ // Optionally have the thin link print message for each devirtualized
+ // function.
+ if (PrintSummaryDevirt)
+ for (const auto &DT : DevirtTargets)
+ errs() << "Devirtualized call to " << DT << "\n";
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/IPO/ya.make b/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
index 8e38815374..5b078050fe 100644
--- a/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/IPO/ya.make
@@ -1,17 +1,17 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -30,59 +30,59 @@ PEERDIR(
contrib/libs/llvm12/lib/Transforms/Scalar
contrib/libs/llvm12/lib/Transforms/Utils
contrib/libs/llvm12/lib/Transforms/Vectorize
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/IPO
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AlwaysInliner.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AlwaysInliner.cpp
Annotation2Metadata.cpp
- ArgumentPromotion.cpp
- Attributor.cpp
- AttributorAttributes.cpp
- BarrierNoopPass.cpp
- BlockExtractor.cpp
- CalledValuePropagation.cpp
- ConstantMerge.cpp
- CrossDSOCFI.cpp
- DeadArgumentElimination.cpp
- ElimAvailExtern.cpp
- ExtractGV.cpp
- ForceFunctionAttrs.cpp
- FunctionAttrs.cpp
- FunctionImport.cpp
- GlobalDCE.cpp
- GlobalOpt.cpp
- GlobalSplit.cpp
- HotColdSplitting.cpp
- IPO.cpp
+ ArgumentPromotion.cpp
+ Attributor.cpp
+ AttributorAttributes.cpp
+ BarrierNoopPass.cpp
+ BlockExtractor.cpp
+ CalledValuePropagation.cpp
+ ConstantMerge.cpp
+ CrossDSOCFI.cpp
+ DeadArgumentElimination.cpp
+ ElimAvailExtern.cpp
+ ExtractGV.cpp
+ ForceFunctionAttrs.cpp
+ FunctionAttrs.cpp
+ FunctionImport.cpp
+ GlobalDCE.cpp
+ GlobalOpt.cpp
+ GlobalSplit.cpp
+ HotColdSplitting.cpp
+ IPO.cpp
IROutliner.cpp
- InferFunctionAttrs.cpp
- InlineSimple.cpp
- Inliner.cpp
- Internalize.cpp
- LoopExtractor.cpp
- LowerTypeTests.cpp
- MergeFunctions.cpp
- OpenMPOpt.cpp
- PartialInlining.cpp
- PassManagerBuilder.cpp
- PruneEH.cpp
- SCCP.cpp
+ InferFunctionAttrs.cpp
+ InlineSimple.cpp
+ Inliner.cpp
+ Internalize.cpp
+ LoopExtractor.cpp
+ LowerTypeTests.cpp
+ MergeFunctions.cpp
+ OpenMPOpt.cpp
+ PartialInlining.cpp
+ PassManagerBuilder.cpp
+ PruneEH.cpp
+ SCCP.cpp
SampleContextTracker.cpp
- SampleProfile.cpp
+ SampleProfile.cpp
SampleProfileProbe.cpp
- StripDeadPrototypes.cpp
- StripSymbols.cpp
- SyntheticCountsPropagation.cpp
- ThinLTOBitcodeWriter.cpp
- WholeProgramDevirt.cpp
-)
-
-END()
+ StripDeadPrototypes.cpp
+ StripSymbols.cpp
+ SyntheticCountsPropagation.cpp
+ ThinLTOBitcodeWriter.cpp
+ WholeProgramDevirt.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index f63a508659..bacb868989 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -1,929 +1,929 @@
-//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for add, fadd, sub, and fsub.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/AlignOf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
+//===- InstCombineAddSub.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for add, fadd, sub, and fsub.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-namespace {
-
- /// Class representing coefficient of floating-point addend.
- /// This class needs to be highly efficient, which is especially true for
- /// the constructor. As of I write this comment, the cost of the default
- /// constructor is merely 4-byte-store-zero (Assuming compiler is able to
- /// perform write-merging).
- ///
- class FAddendCoef {
- public:
- // The constructor has to initialize a APFloat, which is unnecessary for
- // most addends which have coefficient either 1 or -1. So, the constructor
- // is expensive. In order to avoid the cost of the constructor, we should
- // reuse some instances whenever possible. The pre-created instances
- // FAddCombine::Add[0-5] embodies this idea.
- FAddendCoef() = default;
- ~FAddendCoef();
-
- // If possible, don't define operator+/operator- etc because these
- // operators inevitably call FAddendCoef's constructor which is not cheap.
- void operator=(const FAddendCoef &A);
- void operator+=(const FAddendCoef &A);
- void operator*=(const FAddendCoef &S);
-
- void set(short C) {
- assert(!insaneIntVal(C) && "Insane coefficient");
- IsFp = false; IntVal = C;
- }
-
- void set(const APFloat& C);
-
- void negate();
-
- bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
- Value *getValue(Type *) const;
-
- bool isOne() const { return isInt() && IntVal == 1; }
- bool isTwo() const { return isInt() && IntVal == 2; }
- bool isMinusOne() const { return isInt() && IntVal == -1; }
- bool isMinusTwo() const { return isInt() && IntVal == -2; }
-
- private:
- bool insaneIntVal(int V) { return V > 4 || V < -4; }
-
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+namespace {
+
+ /// Class representing coefficient of floating-point addend.
+ /// This class needs to be highly efficient, which is especially true for
+  /// the constructor. As of this writing, the cost of the default
+  /// constructor is merely a 4-byte store of zero (assuming the compiler is
+  /// able to perform write-merging).
+ ///
+ class FAddendCoef {
+ public:
+    // The constructor has to initialize an APFloat, which is unnecessary for
+    // most addends, whose coefficient is either 1 or -1. So the constructor
+ // is expensive. In order to avoid the cost of the constructor, we should
+ // reuse some instances whenever possible. The pre-created instances
+ // FAddCombine::Add[0-5] embodies this idea.
+ FAddendCoef() = default;
+ ~FAddendCoef();
+
+    // If possible, don't define operator+/operator- etc. because these
+    // operators inevitably call FAddendCoef's constructor, which is not cheap.
+ void operator=(const FAddendCoef &A);
+ void operator+=(const FAddendCoef &A);
+ void operator*=(const FAddendCoef &S);
+
+ void set(short C) {
+ assert(!insaneIntVal(C) && "Insane coefficient");
+ IsFp = false; IntVal = C;
+ }
+
+ void set(const APFloat& C);
+
+ void negate();
+
+ bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); }
+ Value *getValue(Type *) const;
+
+ bool isOne() const { return isInt() && IntVal == 1; }
+ bool isTwo() const { return isInt() && IntVal == 2; }
+ bool isMinusOne() const { return isInt() && IntVal == -1; }
+ bool isMinusTwo() const { return isInt() && IntVal == -2; }
+
+ private:
+ bool insaneIntVal(int V) { return V > 4 || V < -4; }
+
APFloat *getFpValPtr() { return reinterpret_cast<APFloat *>(&FpValBuf); }
-
+
const APFloat *getFpValPtr() const {
return reinterpret_cast<const APFloat *>(&FpValBuf);
}
-
- const APFloat &getFpVal() const {
- assert(IsFp && BufHasFpVal && "Incorret state");
- return *getFpValPtr();
- }
-
- APFloat &getFpVal() {
- assert(IsFp && BufHasFpVal && "Incorret state");
- return *getFpValPtr();
- }
-
- bool isInt() const { return !IsFp; }
-
- // If the coefficient is represented by an integer, promote it to a
- // floating point.
- void convertToFpType(const fltSemantics &Sem);
-
- // Construct an APFloat from a signed integer.
- // TODO: We should get rid of this function when APFloat can be constructed
- // from an *SIGNED* integer.
- APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
-
- bool IsFp = false;
-
- // True iff FpValBuf contains an instance of APFloat.
- bool BufHasFpVal = false;
-
- // The integer coefficient of an individual addend is either 1 or -1,
- // and we try to simplify at most 4 addends from neighboring at most
- // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt
- // is overkill of this end.
- short IntVal = 0;
-
- AlignedCharArrayUnion<APFloat> FpValBuf;
- };
-
- /// FAddend is used to represent floating-point addend. An addend is
- /// represented as <C, V>, where the V is a symbolic value, and C is a
- /// constant coefficient. A constant addend is represented as <C, 0>.
- class FAddend {
- public:
- FAddend() = default;
-
- void operator+=(const FAddend &T) {
- assert((Val == T.Val) && "Symbolic-values disagree");
- Coeff += T.Coeff;
- }
-
- Value *getSymVal() const { return Val; }
- const FAddendCoef &getCoef() const { return Coeff; }
-
- bool isConstant() const { return Val == nullptr; }
- bool isZero() const { return Coeff.isZero(); }
-
- void set(short Coefficient, Value *V) {
- Coeff.set(Coefficient);
- Val = V;
- }
- void set(const APFloat &Coefficient, Value *V) {
- Coeff.set(Coefficient);
- Val = V;
- }
- void set(const ConstantFP *Coefficient, Value *V) {
- Coeff.set(Coefficient->getValueAPF());
- Val = V;
- }
-
- void negate() { Coeff.negate(); }
-
- /// Drill down the U-D chain one step to find the definition of V, and
- /// try to break the definition into one or two addends.
- static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
-
- /// Similar to FAddend::drillDownOneStep() except that the value being
- /// splitted is the addend itself.
- unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
-
- private:
- void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
-
- // This addend has the value of "Coeff * Val".
- Value *Val = nullptr;
- FAddendCoef Coeff;
- };
-
- /// FAddCombine is the class for optimizing an unsafe fadd/fsub along
- /// with its neighboring at most two instructions.
- ///
- class FAddCombine {
- public:
- FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {}
-
- Value *simplify(Instruction *FAdd);
-
- private:
- using AddendVect = SmallVector<const FAddend *, 4>;
-
- Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
-
- /// Convert given addend to a Value
- Value *createAddendVal(const FAddend &A, bool& NeedNeg);
-
- /// Return the number of instructions needed to emit the N-ary addition.
- unsigned calcInstrNumber(const AddendVect& Vect);
-
- Value *createFSub(Value *Opnd0, Value *Opnd1);
- Value *createFAdd(Value *Opnd0, Value *Opnd1);
- Value *createFMul(Value *Opnd0, Value *Opnd1);
- Value *createFNeg(Value *V);
- Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
- void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
-
- // Debugging stuff are clustered here.
- #ifndef NDEBUG
- unsigned CreateInstrNum;
- void initCreateInstNum() { CreateInstrNum = 0; }
- void incCreateInstNum() { CreateInstrNum++; }
- #else
- void initCreateInstNum() {}
- void incCreateInstNum() {}
- #endif
-
- InstCombiner::BuilderTy &Builder;
- Instruction *Instr = nullptr;
- };
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-//
-// Implementation of
-// {FAddendCoef, FAddend, FAddition, FAddCombine}.
-//
-//===----------------------------------------------------------------------===//
-FAddendCoef::~FAddendCoef() {
- if (BufHasFpVal)
- getFpValPtr()->~APFloat();
-}
-
-void FAddendCoef::set(const APFloat& C) {
- APFloat *P = getFpValPtr();
-
- if (isInt()) {
- // As the buffer is meanless byte stream, we cannot call
- // APFloat::operator=().
- new(P) APFloat(C);
- } else
- *P = C;
-
- IsFp = BufHasFpVal = true;
-}
-
-void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
- if (!isInt())
- return;
-
- APFloat *P = getFpValPtr();
- if (IntVal > 0)
- new(P) APFloat(Sem, IntVal);
- else {
- new(P) APFloat(Sem, 0 - IntVal);
- P->changeSign();
- }
- IsFp = BufHasFpVal = true;
-}
-
-APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
- if (Val >= 0)
- return APFloat(Sem, Val);
-
- APFloat T(Sem, 0 - Val);
- T.changeSign();
-
- return T;
-}
-
-void FAddendCoef::operator=(const FAddendCoef &That) {
- if (That.isInt())
- set(That.IntVal);
- else
- set(That.getFpVal());
-}
-
-void FAddendCoef::operator+=(const FAddendCoef &That) {
- RoundingMode RndMode = RoundingMode::NearestTiesToEven;
- if (isInt() == That.isInt()) {
- if (isInt())
- IntVal += That.IntVal;
- else
- getFpVal().add(That.getFpVal(), RndMode);
- return;
- }
-
- if (isInt()) {
- const APFloat &T = That.getFpVal();
- convertToFpType(T.getSemantics());
- getFpVal().add(T, RndMode);
- return;
- }
-
- APFloat &T = getFpVal();
- T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
-}
-
-void FAddendCoef::operator*=(const FAddendCoef &That) {
- if (That.isOne())
- return;
-
- if (That.isMinusOne()) {
- negate();
- return;
- }
-
- if (isInt() && That.isInt()) {
- int Res = IntVal * (int)That.IntVal;
- assert(!insaneIntVal(Res) && "Insane int value");
- IntVal = Res;
- return;
- }
-
- const fltSemantics &Semantic =
- isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
-
- if (isInt())
- convertToFpType(Semantic);
- APFloat &F0 = getFpVal();
-
- if (That.isInt())
- F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
- APFloat::rmNearestTiesToEven);
- else
- F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
-}
-
-void FAddendCoef::negate() {
- if (isInt())
- IntVal = 0 - IntVal;
- else
- getFpVal().changeSign();
-}
-
-Value *FAddendCoef::getValue(Type *Ty) const {
- return isInt() ?
- ConstantFP::get(Ty, float(IntVal)) :
- ConstantFP::get(Ty->getContext(), getFpVal());
-}
-
-// The definition of <Val> Addends
-// =========================================
-// A + B <1, A>, <1,B>
-// A - B <1, A>, <1,B>
-// 0 - B <-1, B>
-// C * A, <C, A>
-// A + C <1, A> <C, NULL>
-// 0 +/- 0 <0, NULL> (corner case)
-//
-// Legend: A and B are not constant, C is constant
-unsigned FAddend::drillValueDownOneStep
- (Value *Val, FAddend &Addend0, FAddend &Addend1) {
- Instruction *I = nullptr;
- if (!Val || !(I = dyn_cast<Instruction>(Val)))
- return 0;
-
- unsigned Opcode = I->getOpcode();
-
- if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
- ConstantFP *C0, *C1;
- Value *Opnd0 = I->getOperand(0);
- Value *Opnd1 = I->getOperand(1);
- if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
- Opnd0 = nullptr;
-
- if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
- Opnd1 = nullptr;
-
- if (Opnd0) {
- if (!C0)
- Addend0.set(1, Opnd0);
- else
- Addend0.set(C0, nullptr);
- }
-
- if (Opnd1) {
- FAddend &Addend = Opnd0 ? Addend1 : Addend0;
- if (!C1)
- Addend.set(1, Opnd1);
- else
- Addend.set(C1, nullptr);
- if (Opcode == Instruction::FSub)
- Addend.negate();
- }
-
- if (Opnd0 || Opnd1)
- return Opnd0 && Opnd1 ? 2 : 1;
-
- // Both operands are zero. Weird!
- Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
- return 1;
- }
-
- if (I->getOpcode() == Instruction::FMul) {
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
- Addend0.set(C, V1);
- return 1;
- }
-
- if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
- Addend0.set(C, V0);
- return 1;
- }
- }
-
- return 0;
-}
-
-// Try to break *this* addend into two addends. e.g. Suppose this addend is
-// <2.3, V>, and V = X + Y, by calling this function, we obtain two addends,
-// i.e. <2.3, X> and <2.3, Y>.
-unsigned FAddend::drillAddendDownOneStep
- (FAddend &Addend0, FAddend &Addend1) const {
- if (isConstant())
- return 0;
-
- unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
- if (!BreakNum || Coeff.isOne())
- return BreakNum;
-
- Addend0.Scale(Coeff);
-
- if (BreakNum == 2)
- Addend1.Scale(Coeff);
-
- return BreakNum;
-}
-
-Value *FAddCombine::simplify(Instruction *I) {
- assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
- "Expected 'reassoc'+'nsz' instruction");
-
- // Currently we are not able to handle vector type.
- if (I->getType()->isVectorTy())
- return nullptr;
-
- assert((I->getOpcode() == Instruction::FAdd ||
- I->getOpcode() == Instruction::FSub) && "Expect add/sub");
-
- // Save the instruction before calling other member-functions.
- Instr = I;
-
- FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
-
- unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
-
- // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
- unsigned Opnd0_ExpNum = 0;
- unsigned Opnd1_ExpNum = 0;
-
- if (!Opnd0.isConstant())
- Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
-
- // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
- if (OpndNum == 2 && !Opnd1.isConstant())
- Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
-
- // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
- if (Opnd0_ExpNum && Opnd1_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd0_0);
- AllOpnds.push_back(&Opnd1_0);
- if (Opnd0_ExpNum == 2)
- AllOpnds.push_back(&Opnd0_1);
- if (Opnd1_ExpNum == 2)
- AllOpnds.push_back(&Opnd1_1);
-
- // Compute instruction quota. We should save at least one instruction.
- unsigned InstQuota = 0;
-
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
- (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
-
- if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
- return R;
- }
-
- if (OpndNum != 2) {
- // The input instruction is : "I=0.0 +/- V". If the "V" were able to be
- // splitted into two addends, say "V = X - Y", the instruction would have
- // been optimized into "I = Y - X" in the previous steps.
- //
- const FAddendCoef &CE = Opnd0.getCoef();
- return CE.isOne() ? Opnd0.getSymVal() : nullptr;
- }
-
- // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
- if (Opnd1_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd0);
- AllOpnds.push_back(&Opnd1_0);
- if (Opnd1_ExpNum == 2)
- AllOpnds.push_back(&Opnd1_1);
-
- if (Value *R = simplifyFAdd(AllOpnds, 1))
- return R;
- }
-
- // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
- if (Opnd0_ExpNum) {
- AddendVect AllOpnds;
- AllOpnds.push_back(&Opnd1);
- AllOpnds.push_back(&Opnd0_0);
- if (Opnd0_ExpNum == 2)
- AllOpnds.push_back(&Opnd0_1);
-
- if (Value *R = simplifyFAdd(AllOpnds, 1))
- return R;
- }
-
- return nullptr;
-}
-
-Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
- unsigned AddendNum = Addends.size();
- assert(AddendNum <= 4 && "Too many addends");
-
- // For saving intermediate results;
- unsigned NextTmpIdx = 0;
- FAddend TmpResult[3];
-
- // Points to the constant addend of the resulting simplified expression.
- // If the resulting expr has constant-addend, this constant-addend is
- // desirable to reside at the top of the resulting expression tree. Placing
- // constant close to super-expr(s) will potentially reveal some optimization
- // opportunities in super-expr(s).
- const FAddend *ConstAdd = nullptr;
-
- // Simplified addends are placed in <SimpVect>.
- AddendVect SimpVect;
-
- // The outer loop works on one symbolic-value at a time. Suppose the input
- // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
- // The symbolic-values will be processed in this order: x, y, z.
- for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
-
- const FAddend *ThisAddend = Addends[SymIdx];
- if (!ThisAddend) {
- // This addend was processed before.
- continue;
- }
-
- Value *Val = ThisAddend->getSymVal();
- unsigned StartIdx = SimpVect.size();
- SimpVect.push_back(ThisAddend);
-
- // The inner loop collects addends sharing the same symbolic value; these
- // addends will later be folded into a single addend. Following the above
- // example, if the symbolic value "y" is being processed, the inner loop
- // will collect the two addends "<b1,y>" and "<b2,y>", which will later be
- // folded into "<b1+b2, y>".
- for (unsigned SameSymIdx = SymIdx + 1;
- SameSymIdx < AddendNum; SameSymIdx++) {
- const FAddend *T = Addends[SameSymIdx];
- if (T && T->getSymVal() == Val) {
- // Set null such that next iteration of the outer loop will not process
- // this addend again.
- Addends[SameSymIdx] = nullptr;
- SimpVect.push_back(T);
- }
- }
-
- // If multiple addends share same symbolic value, fold them together.
- if (StartIdx + 1 != SimpVect.size()) {
- FAddend &R = TmpResult[NextTmpIdx ++];
- R = *SimpVect[StartIdx];
- for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
- R += *SimpVect[Idx];
-
- // Pop all addends being folded and push the resulting folded addend.
- SimpVect.resize(StartIdx);
- if (Val) {
- if (!R.isZero()) {
- SimpVect.push_back(&R);
- }
- } else {
- // Don't push constant addend at this time. It will be the last element
- // of <SimpVect>.
- ConstAdd = &R;
- }
- }
- }
-
- assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
- "out-of-bound access");
-
- if (ConstAdd)
- SimpVect.push_back(ConstAdd);
-
- Value *Result;
- if (!SimpVect.empty())
- Result = createNaryFAdd(SimpVect, InstrQuota);
- else {
- // The addition is folded to 0.0.
- Result = ConstantFP::get(Instr->getType(), 0.0);
- }
-
- return Result;
-}
-
-Value *FAddCombine::createNaryFAdd
- (const AddendVect &Opnds, unsigned InstrQuota) {
- assert(!Opnds.empty() && "Expect at least one addend");
-
- // Step 1: Check if the # of instructions needed exceeds the quota.
-
- unsigned InstrNeeded = calcInstrNumber(Opnds);
- if (InstrNeeded > InstrQuota)
- return nullptr;
-
- initCreateInstNum();
-
- // step 2: Emit the N-ary addition.
- // Note that at most three instructions are involved in Fadd-InstCombine: the
- // addition in question, and at most two neighboring instructions.
- // The resulting optimized addition should have at least one less instruction
- // than the original addition expression tree. This implies that the resulting
- // N-ary addition has at most two instructions, and we don't need to worry
- // about tree-height when constructing the N-ary addition.
-
- Value *LastVal = nullptr;
- bool LastValNeedNeg = false;
-
- // Iterate the addends, creating fadd/fsub using adjacent two addends.
- for (const FAddend *Opnd : Opnds) {
- bool NeedNeg;
- Value *V = createAddendVal(*Opnd, NeedNeg);
- if (!LastVal) {
- LastVal = V;
- LastValNeedNeg = NeedNeg;
- continue;
- }
-
- if (LastValNeedNeg == NeedNeg) {
- LastVal = createFAdd(LastVal, V);
- continue;
- }
-
- if (LastValNeedNeg)
- LastVal = createFSub(V, LastVal);
- else
- LastVal = createFSub(LastVal, V);
-
- LastValNeedNeg = false;
- }
-
- if (LastValNeedNeg) {
- LastVal = createFNeg(LastVal);
- }
-
-#ifndef NDEBUG
- assert(CreateInstrNum == InstrNeeded &&
- "Inconsistent in instruction numbers");
-#endif
-
- return LastVal;
-}
-
-Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFSub(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-Value *FAddCombine::createFNeg(Value *V) {
- Value *NewV = Builder.CreateFNeg(V);
- if (Instruction *I = dyn_cast<Instruction>(NewV))
- createInstPostProc(I, true); // fneg's don't receive instruction numbers.
- return NewV;
-}
-
-Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
- Value *V = Builder.CreateFMul(Opnd0, Opnd1);
- if (Instruction *I = dyn_cast<Instruction>(V))
- createInstPostProc(I);
- return V;
-}
-
-void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
- NewInstr->setDebugLoc(Instr->getDebugLoc());
-
- // Keep track of the number of instructions created.
- if (!NoNumber)
- incCreateInstNum();
-
- // Propagate fast-math flags
- NewInstr->setFastMathFlags(Instr->getFastMathFlags());
-}
-
-// Return the number of instructions needed to emit the N-ary addition.
-// NOTE: Keep this function in sync with createAddendVal().
-unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
- unsigned OpndNum = Opnds.size();
- unsigned InstrNeeded = OpndNum - 1;
-
- // The number of addends in the form of "(-1)*x".
- unsigned NegOpndNum = 0;
-
- // Adjust the number of instructions needed to emit the N-ary add.
- for (const FAddend *Opnd : Opnds) {
- if (Opnd->isConstant())
- continue;
-
- // The constant check above is really for a few special constant
- // coefficients.
- if (isa<UndefValue>(Opnd->getSymVal()))
- continue;
-
- const FAddendCoef &CE = Opnd->getCoef();
- if (CE.isMinusOne() || CE.isMinusTwo())
- NegOpndNum++;
-
- // Let the addend be "c * x". If "c == +/-1", the value of the addend
- // is immediately available; otherwise, it needs exactly one instruction
- // to evaluate the value.
- if (!CE.isMinusOne() && !CE.isOne())
- InstrNeeded++;
- }
- return InstrNeeded;
-}
-
-// Input Addend Value NeedNeg(output)
-// ================================================================
-// Constant C C false
-// <+/-1, V> V coefficient is -1
-// <2/-2, V> "fadd V, V" coefficient is -2
-// <C, V> "fmul V, C" false
-//
-// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
-Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
- const FAddendCoef &Coeff = Opnd.getCoef();
-
- if (Opnd.isConstant()) {
- NeedNeg = false;
- return Coeff.getValue(Instr->getType());
- }
-
- Value *OpndVal = Opnd.getSymVal();
-
- if (Coeff.isMinusOne() || Coeff.isOne()) {
- NeedNeg = Coeff.isMinusOne();
- return OpndVal;
- }
-
- if (Coeff.isTwo() || Coeff.isMinusTwo()) {
- NeedNeg = Coeff.isMinusTwo();
- return createFAdd(OpndVal, OpndVal);
- }
-
- NeedNeg = false;
- return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
-}
-
-// Checks if any operand is negative and we can convert add to sub.
-// This function checks for the following negative patterns
-// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
-// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
-// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
-static Value *checkForNegativeOperand(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-
- // This function creates 2 instructions to replace ADD, so we need at least
- // one of LHS or RHS to have one use to ensure the transform is profitable.
- if (!LHS->hasOneUse() && !RHS->hasOneUse())
- return nullptr;
-
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
- const APInt *C1 = nullptr, *C2 = nullptr;
-
- // if ONE is on other side, swap
- if (match(RHS, m_Add(m_Value(X), m_One())))
- std::swap(LHS, RHS);
-
- if (match(LHS, m_Add(m_Value(X), m_One()))) {
- // if XOR on other side, swap
- if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
- std::swap(X, RHS);
-
- if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
- // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
- // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
- if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
- Value *NewAnd = Builder.CreateAnd(Z, *C1);
- return Builder.CreateSub(RHS, NewAnd, "sub");
- } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
- // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
- // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
- Value *NewOr = Builder.CreateOr(Z, ~(*C1));
- return Builder.CreateSub(RHS, NewOr, "sub");
- }
- }
- }
-
- // Restore LHS and RHS
- LHS = I.getOperand(0);
- RHS = I.getOperand(1);
-
- // if XOR is on other side, swap
- if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
- std::swap(LHS, RHS);
-
- // C1 is odd, so C2 == C1 - 1 is even
- // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
- // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
- if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
- if (C1->countTrailingZeros() == 0)
- if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
- Value *NewOr = Builder.CreateOr(Z, ~(*C2));
- return Builder.CreateSub(RHS, NewOr, "sub");
- }
- return nullptr;
-}
-
-/// Wrapping flags may allow combining constants separated by an extend.
-static Instruction *foldNoWrapAdd(BinaryOperator &Add,
- InstCombiner::BuilderTy &Builder) {
- Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
- Type *Ty = Add.getType();
- Constant *Op1C;
- if (!match(Op1, m_Constant(Op1C)))
- return nullptr;
-
- // Try this match first because it results in an add in the narrow type.
- // (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1)))
- Value *X;
- const APInt *C1, *C2;
- if (match(Op1, m_APInt(C1)) &&
- match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
- C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
- Constant *NewC =
- ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
- return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
- }
-
- // More general combining of constants in the wide type.
- // (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
- Constant *NarrowC;
- if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
- Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty);
- Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
- Value *WideX = Builder.CreateSExt(X, Ty);
- return BinaryOperator::CreateAdd(WideX, NewC);
- }
- // (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
- if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
- Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty);
- Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
- Value *WideX = Builder.CreateZExt(X, Ty);
- return BinaryOperator::CreateAdd(WideX, NewC);
- }
-
- return nullptr;
-}
-
+
+ const APFloat &getFpVal() const {
+ assert(IsFp && BufHasFpVal && "Incorrect state");
+ return *getFpValPtr();
+ }
+
+ APFloat &getFpVal() {
+ assert(IsFp && BufHasFpVal && "Incorrect state");
+ return *getFpValPtr();
+ }
+
+ bool isInt() const { return !IsFp; }
+
+ // If the coefficient is represented by an integer, promote it to a
+ // floating point.
+ void convertToFpType(const fltSemantics &Sem);
+
+ // Construct an APFloat from a signed integer.
+ // TODO: We should get rid of this function when APFloat can be constructed
+ // from a *SIGNED* integer.
+ APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val);
+
+ bool IsFp = false;
+
+ // True iff FpValBuf contains an instance of APFloat.
+ bool BufHasFpVal = false;
+
+ // The integer coefficient of an individual addend is either 1 or -1,
+ // and we try to simplify at most 4 addends coming from at most two
+ // neighboring instructions. So the range of <IntVal> falls in [-4, 4];
+ // APInt would be overkill for this purpose.
+ short IntVal = 0;
+
+ AlignedCharArrayUnion<APFloat> FpValBuf;
+ };
+
+ /// FAddend is used to represent a floating-point addend. An addend is
+ /// represented as <C, V>, where V is a symbolic value and C is a
+ /// constant coefficient. A constant addend is represented as <C, 0>.
+ class FAddend {
+ public:
+ FAddend() = default;
+
+ void operator+=(const FAddend &T) {
+ assert((Val == T.Val) && "Symbolic-values disagree");
+ Coeff += T.Coeff;
+ }
+
+ Value *getSymVal() const { return Val; }
+ const FAddendCoef &getCoef() const { return Coeff; }
+
+ bool isConstant() const { return Val == nullptr; }
+ bool isZero() const { return Coeff.isZero(); }
+
+ void set(short Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const APFloat &Coefficient, Value *V) {
+ Coeff.set(Coefficient);
+ Val = V;
+ }
+ void set(const ConstantFP *Coefficient, Value *V) {
+ Coeff.set(Coefficient->getValueAPF());
+ Val = V;
+ }
+
+ void negate() { Coeff.negate(); }
+
+ /// Drill down the U-D chain one step to find the definition of V, and
+ /// try to break the definition into one or two addends.
+ static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1);
+
+ /// Similar to FAddend::drillValueDownOneStep() except that the value
+ /// being split is the addend itself.
+ unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const;
+
+ private:
+ void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; }
+
+ // This addend has the value of "Coeff * Val".
+ Value *Val = nullptr;
+ FAddendCoef Coeff;
+ };
+
+ /// FAddCombine is the class for optimizing an unsafe fadd/fsub together
+ /// with at most two of its neighboring instructions.
+ ///
+ class FAddCombine {
+ public:
+ FAddCombine(InstCombiner::BuilderTy &B) : Builder(B) {}
+
+ Value *simplify(Instruction *FAdd);
+
+ private:
+ using AddendVect = SmallVector<const FAddend *, 4>;
+
+ Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota);
+
+ /// Convert given addend to a Value
+ Value *createAddendVal(const FAddend &A, bool& NeedNeg);
+
+ /// Return the number of instructions needed to emit the N-ary addition.
+ unsigned calcInstrNumber(const AddendVect& Vect);
+
+ Value *createFSub(Value *Opnd0, Value *Opnd1);
+ Value *createFAdd(Value *Opnd0, Value *Opnd1);
+ Value *createFMul(Value *Opnd0, Value *Opnd1);
+ Value *createFNeg(Value *V);
+ Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota);
+ void createInstPostProc(Instruction *NewInst, bool NoNumber = false);
+
+ // Debugging stuff is clustered here.
+ #ifndef NDEBUG
+ unsigned CreateInstrNum;
+ void initCreateInstNum() { CreateInstrNum = 0; }
+ void incCreateInstNum() { CreateInstrNum++; }
+ #else
+ void initCreateInstNum() {}
+ void incCreateInstNum() {}
+ #endif
+
+ InstCombiner::BuilderTy &Builder;
+ Instruction *Instr = nullptr;
+ };
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of
+// {FAddendCoef, FAddend, FAddition, FAddCombine}.
+//
+//===----------------------------------------------------------------------===//
+FAddendCoef::~FAddendCoef() {
+ if (BufHasFpVal)
+ getFpValPtr()->~APFloat();
+}
+
+void FAddendCoef::set(const APFloat& C) {
+ APFloat *P = getFpValPtr();
+
+ if (isInt()) {
+ // As the buffer is a meaningless byte stream, we cannot call
+ // APFloat::operator=().
+ new(P) APFloat(C);
+ } else
+ *P = C;
+
+ IsFp = BufHasFpVal = true;
+}
+
+void FAddendCoef::convertToFpType(const fltSemantics &Sem) {
+ if (!isInt())
+ return;
+
+ APFloat *P = getFpValPtr();
+ if (IntVal > 0)
+ new(P) APFloat(Sem, IntVal);
+ else {
+ new(P) APFloat(Sem, 0 - IntVal);
+ P->changeSign();
+ }
+ IsFp = BufHasFpVal = true;
+}
+
+APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) {
+ if (Val >= 0)
+ return APFloat(Sem, Val);
+
+ APFloat T(Sem, 0 - Val);
+ T.changeSign();
+
+ return T;
+}
+
+void FAddendCoef::operator=(const FAddendCoef &That) {
+ if (That.isInt())
+ set(That.IntVal);
+ else
+ set(That.getFpVal());
+}
+
+void FAddendCoef::operator+=(const FAddendCoef &That) {
+ RoundingMode RndMode = RoundingMode::NearestTiesToEven;
+ if (isInt() == That.isInt()) {
+ if (isInt())
+ IntVal += That.IntVal;
+ else
+ getFpVal().add(That.getFpVal(), RndMode);
+ return;
+ }
+
+ if (isInt()) {
+ const APFloat &T = That.getFpVal();
+ convertToFpType(T.getSemantics());
+ getFpVal().add(T, RndMode);
+ return;
+ }
+
+ APFloat &T = getFpVal();
+ T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode);
+}
+
+void FAddendCoef::operator*=(const FAddendCoef &That) {
+ if (That.isOne())
+ return;
+
+ if (That.isMinusOne()) {
+ negate();
+ return;
+ }
+
+ if (isInt() && That.isInt()) {
+ int Res = IntVal * (int)That.IntVal;
+ assert(!insaneIntVal(Res) && "Insane int value");
+ IntVal = Res;
+ return;
+ }
+
+ const fltSemantics &Semantic =
+ isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics();
+
+ if (isInt())
+ convertToFpType(Semantic);
+ APFloat &F0 = getFpVal();
+
+ if (That.isInt())
+ F0.multiply(createAPFloatFromInt(Semantic, That.IntVal),
+ APFloat::rmNearestTiesToEven);
+ else
+ F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven);
+}
+
+void FAddendCoef::negate() {
+ if (isInt())
+ IntVal = 0 - IntVal;
+ else
+ getFpVal().changeSign();
+}
+
+Value *FAddendCoef::getValue(Type *Ty) const {
+ return isInt() ?
+ ConstantFP::get(Ty, float(IntVal)) :
+ ConstantFP::get(Ty->getContext(), getFpVal());
+}
+
+// The definition of <Val> Addends
+// =========================================
+// A + B <1, A>, <1,B>
+// A - B <1, A>, <-1, B>
+// 0 - B <-1, B>
+// C * A, <C, A>
+// A + C <1, A> <C, NULL>
+// 0 +/- 0 <0, NULL> (corner case)
+//
+// Legend: A and B are not constant, C is constant
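+//
+// For illustration (editor's examples, not exhaustive):
+//   "%t = fsub float %x, %y"  decomposes into <1, %x> and <-1, %y>
+//   "%t = fmul float %x, 4.0" decomposes into the single addend <4.0, %x>
+//   "%t = fadd float %x, 2.0" decomposes into <1, %x> and <2.0, NULL>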
+unsigned FAddend::drillValueDownOneStep
+ (Value *Val, FAddend &Addend0, FAddend &Addend1) {
+ Instruction *I = nullptr;
+ if (!Val || !(I = dyn_cast<Instruction>(Val)))
+ return 0;
+
+ unsigned Opcode = I->getOpcode();
+
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub) {
+ ConstantFP *C0, *C1;
+ Value *Opnd0 = I->getOperand(0);
+ Value *Opnd1 = I->getOperand(1);
+ if ((C0 = dyn_cast<ConstantFP>(Opnd0)) && C0->isZero())
+ Opnd0 = nullptr;
+
+ if ((C1 = dyn_cast<ConstantFP>(Opnd1)) && C1->isZero())
+ Opnd1 = nullptr;
+
+ if (Opnd0) {
+ if (!C0)
+ Addend0.set(1, Opnd0);
+ else
+ Addend0.set(C0, nullptr);
+ }
+
+ if (Opnd1) {
+ FAddend &Addend = Opnd0 ? Addend1 : Addend0;
+ if (!C1)
+ Addend.set(1, Opnd1);
+ else
+ Addend.set(C1, nullptr);
+ if (Opcode == Instruction::FSub)
+ Addend.negate();
+ }
+
+ if (Opnd0 || Opnd1)
+ return Opnd0 && Opnd1 ? 2 : 1;
+
+ // Both operands are zero. Weird!
+ Addend0.set(APFloat(C0->getValueAPF().getSemantics()), nullptr);
+ return 1;
+ }
+
+ if (I->getOpcode() == Instruction::FMul) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V0)) {
+ Addend0.set(C, V1);
+ return 1;
+ }
+
+ if (ConstantFP *C = dyn_cast<ConstantFP>(V1)) {
+ Addend0.set(C, V0);
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+// Try to break *this* addend into two addends. E.g., suppose this addend is
+// <2.3, V> and V = X + Y; calling this function yields the two addends
+// <2.3, X> and <2.3, Y>.
+unsigned FAddend::drillAddendDownOneStep
+ (FAddend &Addend0, FAddend &Addend1) const {
+ if (isConstant())
+ return 0;
+
+ unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1);
+ if (!BreakNum || Coeff.isOne())
+ return BreakNum;
+
+ Addend0.Scale(Coeff);
+
+ if (BreakNum == 2)
+ Addend1.Scale(Coeff);
+
+ return BreakNum;
+}
+
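+// Editor's sketch of the overall flow (assuming every instruction involved
+// carries the 'reassoc' and 'nsz' flags and %a/%b each have a single use):
+//   %a = fadd fast float %x, 1.0
+//   %b = fadd fast float %x, 2.0
+//   %r = fadd fast float %a, %b
+// %r decomposes into the addends <1,%x>, <1.0,NULL>, <1,%x>, <2.0,NULL>;
+// simplifyFAdd() folds them to <2,%x> and <3.0,NULL>, so %r can be rebuilt
+// as "fadd (fadd %x, %x), 3.0" -- one instruction fewer than the original
+// three-instruction tree.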
+Value *FAddCombine::simplify(Instruction *I) {
+ assert(I->hasAllowReassoc() && I->hasNoSignedZeros() &&
+ "Expected 'reassoc'+'nsz' instruction");
+
+ // Currently we are not able to handle vector types.
+ if (I->getType()->isVectorTy())
+ return nullptr;
+
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expect add/sub");
+
+ // Save the instruction before calling other member-functions.
+ Instr = I;
+
+ FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1;
+
+ unsigned OpndNum = FAddend::drillValueDownOneStep(I, Opnd0, Opnd1);
+
+ // Step 1: Expand the 1st addend into Opnd0_0 and Opnd0_1.
+ unsigned Opnd0_ExpNum = 0;
+ unsigned Opnd1_ExpNum = 0;
+
+ if (!Opnd0.isConstant())
+ Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1);
+
+ // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1.
+ if (OpndNum == 2 && !Opnd1.isConstant())
+ Opnd1_ExpNum = Opnd1.drillAddendDownOneStep(Opnd1_0, Opnd1_1);
+
+ // Step 3: Try to optimize Opnd0_0 + Opnd0_1 + Opnd1_0 + Opnd1_1
+ if (Opnd0_ExpNum && Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0_0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ // Compute instruction quota. We should save at least one instruction.
+ unsigned InstQuota = 0;
+
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) &&
+ (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1;
+
+ if (Value *R = simplifyFAdd(AllOpnds, InstQuota))
+ return R;
+ }
+
+ if (OpndNum != 2) {
+ // The input instruction is "I = 0.0 +/- V". If "V" could have been
+ // split into two addends, say "V = X - Y", the instruction would have
+ // been optimized into "I = Y - X" in the previous steps.
+ //
+ const FAddendCoef &CE = Opnd0.getCoef();
+ return CE.isOne() ? Opnd0.getSymVal() : nullptr;
+ }
+
+ // step 4: Try to optimize Opnd0 + Opnd1_0 [+ Opnd1_1]
+ if (Opnd1_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd0);
+ AllOpnds.push_back(&Opnd1_0);
+ if (Opnd1_ExpNum == 2)
+ AllOpnds.push_back(&Opnd1_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ // step 5: Try to optimize Opnd1 + Opnd0_0 [+ Opnd0_1]
+ if (Opnd0_ExpNum) {
+ AddendVect AllOpnds;
+ AllOpnds.push_back(&Opnd1);
+ AllOpnds.push_back(&Opnd0_0);
+ if (Opnd0_ExpNum == 2)
+ AllOpnds.push_back(&Opnd0_1);
+
+ if (Value *R = simplifyFAdd(AllOpnds, 1))
+ return R;
+ }
+
+ return nullptr;
+}
+
+Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) {
+ unsigned AddendNum = Addends.size();
+ assert(AddendNum <= 4 && "Too many addends");
+
+ // For saving intermediate results;
+ unsigned NextTmpIdx = 0;
+ FAddend TmpResult[3];
+
+ // Points to the constant addend of the resulting simplified expression.
+ // If the resulting expr has constant-addend, this constant-addend is
+ // desirable to reside at the top of the resulting expression tree. Placing
+ // constant close to super-expr(s) will potentially reveal some optimization
+ // opportunities in super-expr(s).
+ const FAddend *ConstAdd = nullptr;
+
+ // Simplified addends are placed in <SimpVect>.
+ AddendVect SimpVect;
+
+ // The outer loop works on one symbolic-value at a time. Suppose the input
+ // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ...
+ // The symbolic-values will be processed in this order: x, y, z.
+ for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) {
+
+ const FAddend *ThisAddend = Addends[SymIdx];
+ if (!ThisAddend) {
+ // This addend was processed before.
+ continue;
+ }
+
+ Value *Val = ThisAddend->getSymVal();
+ unsigned StartIdx = SimpVect.size();
+ SimpVect.push_back(ThisAddend);
+
+ // The inner loop collects addends sharing the same symbolic value; these
+ // addends will later be folded into a single addend. Following the above
+ // example, if the symbolic value "y" is being processed, the inner loop
+ // will collect the two addends "<b1,y>" and "<b2,y>", which will later be
+ // folded into "<b1+b2, y>".
+ for (unsigned SameSymIdx = SymIdx + 1;
+ SameSymIdx < AddendNum; SameSymIdx++) {
+ const FAddend *T = Addends[SameSymIdx];
+ if (T && T->getSymVal() == Val) {
+ // Set null such that next iteration of the outer loop will not process
+ // this addend again.
+ Addends[SameSymIdx] = nullptr;
+ SimpVect.push_back(T);
+ }
+ }
+
+ // If multiple addends share same symbolic value, fold them together.
+ if (StartIdx + 1 != SimpVect.size()) {
+ FAddend &R = TmpResult[NextTmpIdx ++];
+ R = *SimpVect[StartIdx];
+ for (unsigned Idx = StartIdx + 1; Idx < SimpVect.size(); Idx++)
+ R += *SimpVect[Idx];
+
+ // Pop all addends being folded and push the resulting folded addend.
+ SimpVect.resize(StartIdx);
+ if (Val) {
+ if (!R.isZero()) {
+ SimpVect.push_back(&R);
+ }
+ } else {
+ // Don't push constant addend at this time. It will be the last element
+ // of <SimpVect>.
+ ConstAdd = &R;
+ }
+ }
+ }
+
+ assert((NextTmpIdx <= array_lengthof(TmpResult) + 1) &&
+ "out-of-bound access");
+
+ if (ConstAdd)
+ SimpVect.push_back(ConstAdd);
+
+ Value *Result;
+ if (!SimpVect.empty())
+ Result = createNaryFAdd(SimpVect, InstrQuota);
+ else {
+ // The addition is folded to 0.0.
+ Result = ConstantFP::get(Instr->getType(), 0.0);
+ }
+
+ return Result;
+}
+
+Value *FAddCombine::createNaryFAdd
+ (const AddendVect &Opnds, unsigned InstrQuota) {
+ assert(!Opnds.empty() && "Expect at least one addend");
+
+ // Step 1: Check if the # of instructions needed exceeds the quota.
+
+ unsigned InstrNeeded = calcInstrNumber(Opnds);
+ if (InstrNeeded > InstrQuota)
+ return nullptr;
+
+ initCreateInstNum();
+
+ // step 2: Emit the N-ary addition.
+ // Note that at most three instructions are involved in Fadd-InstCombine: the
+ // addition in question, and at most two neighboring instructions.
+ // The resulting optimized addition should have at least one less instruction
+ // than the original addition expression tree. This implies that the resulting
+ // N-ary addition has at most two instructions, and we don't need to worry
+ // about tree-height when constructing the N-ary addition.
+
+ Value *LastVal = nullptr;
+ bool LastValNeedNeg = false;
+
+ // Iterate the addends, creating fadd/fsub using adjacent two addends.
+ for (const FAddend *Opnd : Opnds) {
+ bool NeedNeg;
+ Value *V = createAddendVal(*Opnd, NeedNeg);
+ if (!LastVal) {
+ LastVal = V;
+ LastValNeedNeg = NeedNeg;
+ continue;
+ }
+
+ if (LastValNeedNeg == NeedNeg) {
+ LastVal = createFAdd(LastVal, V);
+ continue;
+ }
+
+ if (LastValNeedNeg)
+ LastVal = createFSub(V, LastVal);
+ else
+ LastVal = createFSub(LastVal, V);
+
+ LastValNeedNeg = false;
+ }
+
+ if (LastValNeedNeg) {
+ LastVal = createFNeg(LastVal);
+ }
+
+#ifndef NDEBUG
+ assert(CreateInstrNum == InstrNeeded &&
+ "Inconsistent in instruction numbers");
+#endif
+
+ return LastVal;
+}
+
+Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFSub(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFNeg(Value *V) {
+ Value *NewV = Builder.CreateFNeg(V);
+ if (Instruction *I = dyn_cast<Instruction>(NewV))
+ createInstPostProc(I, true); // fneg's don't receive instruction numbers.
+ return NewV;
+}
+
+Value *FAddCombine::createFAdd(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFAdd(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) {
+ Value *V = Builder.CreateFMul(Opnd0, Opnd1);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ createInstPostProc(I);
+ return V;
+}
+
+void FAddCombine::createInstPostProc(Instruction *NewInstr, bool NoNumber) {
+ NewInstr->setDebugLoc(Instr->getDebugLoc());
+
+ // Keep track of the number of instructions created.
+ if (!NoNumber)
+ incCreateInstNum();
+
+ // Propagate fast-math flags
+ NewInstr->setFastMathFlags(Instr->getFastMathFlags());
+}
+
+// Return the number of instructions needed to emit the N-ary addition.
+// NOTE: Keep this function in sync with createAddendVal().
+unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
+ unsigned OpndNum = Opnds.size();
+ unsigned InstrNeeded = OpndNum - 1;
+
+ // The number of addends in the form of "(-1)*x".
+ unsigned NegOpndNum = 0;
+
+ // Adjust the number of instructions needed to emit the N-ary add.
+ for (const FAddend *Opnd : Opnds) {
+ if (Opnd->isConstant())
+ continue;
+
+ // The constant check above is really for a few special constant
+ // coefficients.
+ if (isa<UndefValue>(Opnd->getSymVal()))
+ continue;
+
+ const FAddendCoef &CE = Opnd->getCoef();
+ if (CE.isMinusOne() || CE.isMinusTwo())
+ NegOpndNum++;
+
+ // Let the addend be "c * x". If "c == +/-1", the value of the addend
+ // is immediately available; otherwise, it needs exactly one instruction
+ // to evaluate the value.
+ if (!CE.isMinusOne() && !CE.isOne())
+ InstrNeeded++;
+ }
+ return InstrNeeded;
+}
+
+// Input Addend Value NeedNeg(output)
+// ================================================================
+// Constant C C false
+// <+/-1, V> V coefficient is -1
+// <2/-2, V> "fadd V, V" coefficient is -2
+// <C, V> "fmul V, C" false
+//
+// NOTE: Keep this function in sync with FAddCombine::calcInstrNumber.
+Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
+ const FAddendCoef &Coeff = Opnd.getCoef();
+
+ if (Opnd.isConstant()) {
+ NeedNeg = false;
+ return Coeff.getValue(Instr->getType());
+ }
+
+ Value *OpndVal = Opnd.getSymVal();
+
+ if (Coeff.isMinusOne() || Coeff.isOne()) {
+ NeedNeg = Coeff.isMinusOne();
+ return OpndVal;
+ }
+
+ if (Coeff.isTwo() || Coeff.isMinusTwo()) {
+ NeedNeg = Coeff.isMinusTwo();
+ return createFAdd(OpndVal, OpndVal);
+ }
+
+ NeedNeg = false;
+ return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
+}
+
+// Checks if any operand is negative and we can convert add to sub.
+// This function checks for the following negative patterns
+// ADD(XOR(OR(Z, NOT(C)), C)), 1) == NEG(AND(Z, C))
+// ADD(XOR(AND(Z, C), C), 1) == NEG(OR(Z, ~C))
+// XOR(AND(Z, C), (C + 1)) == NEG(OR(Z, ~C)) if C is even
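+//
+// Editor's example with C == 3:
+//   ((Z & 3) ^ 3) + 1 == -(Z | -4), so "add (add (xor (and Z, 3), 3), 1), R"
+//   becomes "sub R, (or Z, -4)".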
+static Value *checkForNegativeOperand(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+ // This function creates 2 instructions to replace ADD, so we need at least
+ // one of LHS or RHS to have one use to ensure the transform is profitable.
+ if (!LHS->hasOneUse() && !RHS->hasOneUse())
+ return nullptr;
+
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+ const APInt *C1 = nullptr, *C2 = nullptr;
+
+ // if ONE is on other side, swap
+ if (match(RHS, m_Add(m_Value(X), m_One())))
+ std::swap(LHS, RHS);
+
+ if (match(LHS, m_Add(m_Value(X), m_One()))) {
+ // if XOR on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(X, RHS);
+
+ if (match(X, m_Xor(m_Value(Y), m_APInt(C1)))) {
+ // X = XOR(Y, C1), Y = OR(Z, C2), C2 = NOT(C1) ==> X == NOT(AND(Z, C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, AND(Z, C1))
+ if (match(Y, m_Or(m_Value(Z), m_APInt(C2))) && (*C2 == ~(*C1))) {
+ Value *NewAnd = Builder.CreateAnd(Z, *C1);
+ return Builder.CreateSub(RHS, NewAnd, "sub");
+ } else if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && (*C1 == *C2)) {
+ // X = XOR(Y, C1), Y = AND(Z, C2), C2 == C1 ==> X == NOT(OR(Z, ~C1))
+ // ADD(ADD(X, 1), RHS) == ADD(X, ADD(RHS, 1)) == SUB(RHS, OR(Z, ~C1))
+ Value *NewOr = Builder.CreateOr(Z, ~(*C1));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ }
+ }
+
+ // Restore LHS and RHS
+ LHS = I.getOperand(0);
+ RHS = I.getOperand(1);
+
+ // if XOR is on other side, swap
+ if (match(RHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ std::swap(LHS, RHS);
+
+ // C1 is odd, so C2 == C1 - 1 is even
+ // LHS = XOR(Y, C1), Y = AND(Z, C2), C1 == (C2 + 1) => LHS == NEG(OR(Z, ~C2))
+ // ADD(LHS, RHS) == SUB(RHS, OR(Z, ~C2))
+ if (match(LHS, m_Xor(m_Value(Y), m_APInt(C1))))
+ if (C1->countTrailingZeros() == 0)
+ if (match(Y, m_And(m_Value(Z), m_APInt(C2))) && *C1 == (*C2 + 1)) {
+ Value *NewOr = Builder.CreateOr(Z, ~(*C2));
+ return Builder.CreateSub(RHS, NewOr, "sub");
+ }
+ return nullptr;
+}
+
+/// Wrapping flags may allow combining constants separated by an extend.
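+/// For instance (editor's illustration of the first pattern below):
+/// "(zext i32 (add nuw i8 %x, 10)) + -5" can be narrowed to
+/// "zext i32 (add nuw i8 %x, 5)".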
+static Instruction *foldNoWrapAdd(BinaryOperator &Add,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ Type *Ty = Add.getType();
+ Constant *Op1C;
+ if (!match(Op1, m_Constant(Op1C)))
+ return nullptr;
+
+ // Try this match first because it results in an add in the narrow type.
+ // (zext (X +nuw C2)) + C1 --> zext (X + (C2 + trunc(C1)))
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op1, m_APInt(C1)) &&
+ match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2))))) &&
+ C1->isNegative() && C1->sge(-C2->sext(C1->getBitWidth()))) {
+ Constant *NewC =
+ ConstantInt::get(X->getType(), *C2 + C1->trunc(C2->getBitWidth()));
+ return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty);
+ }
+
+ // More general combining of constants in the wide type.
+ // (sext (X +nsw NarrowC)) + C --> (sext X) + (sext(NarrowC) + C)
+ Constant *NarrowC;
+ if (match(Op0, m_OneUse(m_SExt(m_NSWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ Constant *WideC = ConstantExpr::getSExt(NarrowC, Ty);
+ Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
+ Value *WideX = Builder.CreateSExt(X, Ty);
+ return BinaryOperator::CreateAdd(WideX, NewC);
+ }
+ // (zext (X +nuw NarrowC)) + C --> (zext X) + (zext(NarrowC) + C)
+ if (match(Op0, m_OneUse(m_ZExt(m_NUWAdd(m_Value(X), m_Constant(NarrowC)))))) {
+ Constant *WideC = ConstantExpr::getZExt(NarrowC, Ty);
+ Constant *NewC = ConstantExpr::getAdd(WideC, Op1C);
+ Value *WideX = Builder.CreateZExt(X, Ty);
+ return BinaryOperator::CreateAdd(WideX, NewC);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
- Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
- Constant *Op1C;
- if (!match(Op1, m_Constant(Op1C)))
- return nullptr;
-
- if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
- return NV;
-
- Value *X;
- Constant *Op00C;
-
- // add (sub C1, X), C2 --> sub (add C1, C2), X
- if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X))))
- return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X);
-
- Value *Y;
-
- // add (sub X, Y), -1 --> add (not Y), X
- if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) &&
- match(Op1, m_AllOnes()))
- return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X);
-
- // zext(bool) + C -> bool ? C + 1 : C
- if (match(Op0, m_ZExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1);
+ Constant *Op1C;
+ if (!match(Op1, m_Constant(Op1C)))
+ return nullptr;
+
+ if (Instruction *NV = foldBinOpIntoSelectOrPhi(Add))
+ return NV;
+
+ Value *X;
+ Constant *Op00C;
+
+ // add (sub C1, X), C2 --> sub (add C1, C2), X
+ if (match(Op0, m_Sub(m_Constant(Op00C), m_Value(X))))
+ return BinaryOperator::CreateSub(ConstantExpr::getAdd(Op00C, Op1C), X);
+
+ Value *Y;
+
+ // add (sub X, Y), -1 --> add (not Y), X
+ if (match(Op0, m_OneUse(m_Sub(m_Value(X), m_Value(Y)))) &&
+ match(Op1, m_AllOnes()))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(Y), X);
+
+ // zext(bool) + C -> bool ? C + 1 : C
+ if (match(Op0, m_ZExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, InstCombiner::AddOne(Op1C), Op1);
- // sext(bool) + C -> bool ? C - 1 : C
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
+ // sext(bool) + C -> bool ? C - 1 : C
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
return SelectInst::Create(X, InstCombiner::SubOne(Op1C), Op1);
-
- // ~X + C --> (C-1) - X
- if (match(Op0, m_Not(m_Value(X))))
+
+ // ~X + C --> (C-1) - X
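+ // (e.g. "~x + 10" becomes "9 - x"; editor's illustration)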
+ if (match(Op0, m_Not(m_Value(X))))
return BinaryOperator::CreateSub(InstCombiner::SubOne(Op1C), X);
-
- const APInt *C;
- if (!match(Op1, m_APInt(C)))
- return nullptr;
-
- // (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
- const APInt *C2;
- if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C)
- return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2));
-
- if (C->isSignMask()) {
- // If wrapping is not allowed, then the addition must set the sign bit:
- // X + (signmask) --> X | signmask
- if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
- return BinaryOperator::CreateOr(Op0, Op1);
-
- // If wrapping is allowed, then the addition flips the sign bit of LHS:
- // X + (signmask) --> X ^ signmask
- return BinaryOperator::CreateXor(Op0, Op1);
- }
-
- // Is this add the last step in a convoluted sext?
- // add(zext(xor i16 X, -32768), -32768) --> sext X
- Type *Ty = Add.getType();
- if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
- C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
- return CastInst::Create(Instruction::SExt, X, Ty);
-
+
+ const APInt *C;
+ if (!match(Op1, m_APInt(C)))
+ return nullptr;
+
+ // (X | C2) + C --> (X | C2) ^ C2 iff (C2 == -C)
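+ // (e.g. "(x | 8) + -8" becomes "(x | 8) ^ 8"; the 'or' guarantees the bits
+ // of C2 are set, so subtracting -C == C2 simply clears them -- editor's note)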
+ const APInt *C2;
+ if (match(Op0, m_Or(m_Value(), m_APInt(C2))) && *C2 == -*C)
+ return BinaryOperator::CreateXor(Op0, ConstantInt::get(Add.getType(), *C2));
+
+ if (C->isSignMask()) {
+ // If wrapping is not allowed, then the addition must set the sign bit:
+ // X + (signmask) --> X | signmask
+ if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap())
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // If wrapping is allowed, then the addition flips the sign bit of LHS:
+ // X + (signmask) --> X ^ signmask
+ return BinaryOperator::CreateXor(Op0, Op1);
+ }
+
+ // Is this add the last step in a convoluted sext?
+ // add(zext(xor i16 X, -32768), -32768) --> sext X
+ Type *Ty = Add.getType();
+ if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) &&
+ C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C)
+ return CastInst::Create(Instruction::SExt, X, Ty);
+
if (match(Op0, m_Xor(m_Value(X), m_APInt(C2)))) {
// (X ^ signmask) + C --> (X + (signmask ^ C))
if (C2->isSignMask())
@@ -957,26 +957,26 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
}
}
- if (C->isOneValue() && Op0->hasOneUse()) {
- // add (sext i1 X), 1 --> zext (not X)
- // TODO: The smallest IR representation is (select X, 0, 1), and that would
- // not require the one-use check. But we need to remove a transform in
- // visitSelect and make sure that IR value tracking for select is equal or
- // better than for these ops.
- if (match(Op0, m_SExt(m_Value(X))) &&
- X->getType()->getScalarSizeInBits() == 1)
- return new ZExtInst(Builder.CreateNot(X), Ty);
-
- // Shifts and add used to flip and mask off the low bit:
- // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
- const APInt *C3;
- if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
- C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
- Value *NotX = Builder.CreateNot(X);
- return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
- }
- }
-
+ if (C->isOneValue() && Op0->hasOneUse()) {
+ // add (sext i1 X), 1 --> zext (not X)
+ // TODO: The smallest IR representation is (select X, 0, 1), and that would
+ // not require the one-use check. But we need to remove a transform in
+ // visitSelect and make sure that IR value tracking for select is equal or
+ // better than for these ops.
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ X->getType()->getScalarSizeInBits() == 1)
+ return new ZExtInst(Builder.CreateNot(X), Ty);
+
+ // Shifts and add used to flip and mask off the low bit:
+ // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1
+ const APInt *C3;
+ if (match(Op0, m_AShr(m_Shl(m_Value(X), m_APInt(C2)), m_APInt(C3))) &&
+ C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) {
+ Value *NotX = Builder.CreateNot(X);
+ return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1));
+ }
+ }
+
// If all bits affected by the add are included in a high-bit-mask, do the
// add before the mask op:
// (X & 0xFF00) + xx00 --> (X + xx00) & 0xFF00
@@ -986,261 +986,261 @@ Instruction *InstCombinerImpl::foldAddWithConstant(BinaryOperator &Add) {
return BinaryOperator::CreateAnd(NewAdd, ConstantInt::get(Ty, *C2));
}
- return nullptr;
-}
-
-// Matches multiplication expression Op * C where C is a constant. Returns the
-// constant value in C and the other operand in Op. Returns true if such a
-// match is found.
-static bool MatchMul(Value *E, Value *&Op, APInt &C) {
- const APInt *AI;
- if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
- C = APInt(AI->getBitWidth(), 1);
- C <<= *AI;
- return true;
- }
- return false;
-}
-
-// Matches remainder expression Op % C where C is a constant. Returns the
-// constant value in C and the other operand in Op. Returns the signedness of
-// the remainder operation in IsSigned. Returns true if such a match is
-// found.
-static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
- const APInt *AI;
- IsSigned = false;
- if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
- IsSigned = true;
- C = *AI;
- return true;
- }
- if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
- C = *AI + 1;
- return true;
- }
- return false;
-}
-
-// Matches division expression Op / C with the given signedness as indicated
-// by IsSigned, where C is a constant. Returns the constant value in C and the
-// other operand in Op. Returns true if such a match is found.
-static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
- const APInt *AI;
- if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (!IsSigned) {
- if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
- C = *AI;
- return true;
- }
- if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
- C = APInt(AI->getBitWidth(), 1);
- C <<= *AI;
- return true;
- }
- }
- return false;
-}
-
-// Returns whether C0 * C1 with the given signedness overflows.
-static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
- bool overflow;
- if (IsSigned)
- (void)C0.smul_ov(C1, overflow);
- else
- (void)C0.umul_ov(C1, overflow);
- return overflow;
-}
-
-// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
-// does not overflow.
+ return nullptr;
+}
+
+// Matches multiplication expression Op * C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns true if such a
+// match is found.
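+// (A left-shift also counts: "shl i32 %x, 3" matches with Op == %x and
+// C == 8 -- editor's note.)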
+static bool MatchMul(Value *E, Value *&Op, APInt &C) {
+ const APInt *AI;
+ if (match(E, m_Mul(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_Shl(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ return false;
+}
+
+// Matches remainder expression Op % C where C is a constant. Returns the
+// constant value in C and the other operand in Op. Returns the signedness of
+// the remainder operation in IsSigned. Returns true if such a match is
+// found.
+static bool MatchRem(Value *E, Value *&Op, APInt &C, bool &IsSigned) {
+ const APInt *AI;
+ IsSigned = false;
+ if (match(E, m_SRem(m_Value(Op), m_APInt(AI)))) {
+ IsSigned = true;
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_URem(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_And(m_Value(Op), m_APInt(AI))) && (*AI + 1).isPowerOf2()) {
+ C = *AI + 1;
+ return true;
+ }
+ return false;
+}
+
+// Matches division expression Op / C with the given signedness as indicated
+// by IsSigned, where C is a constant. Returns the constant value in C and the
+// other operand in Op. Returns true if such a match is found.
+static bool MatchDiv(Value *E, Value *&Op, APInt &C, bool IsSigned) {
+ const APInt *AI;
+ if (IsSigned && match(E, m_SDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (!IsSigned) {
+ if (match(E, m_UDiv(m_Value(Op), m_APInt(AI)))) {
+ C = *AI;
+ return true;
+ }
+ if (match(E, m_LShr(m_Value(Op), m_APInt(AI)))) {
+ C = APInt(AI->getBitWidth(), 1);
+ C <<= *AI;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns whether C0 * C1 with the given signedness overflows.
+static bool MulWillOverflow(APInt &C0, APInt &C1, bool IsSigned) {
+ bool overflow;
+ if (IsSigned)
+ (void)C0.smul_ov(C1, overflow);
+ else
+ (void)C0.umul_ov(C1, overflow);
+ return overflow;
+}
+
+// Simplifies X % C0 + (( X / C0 ) % C1) * C0 to X % (C0 * C1), where (C0 * C1)
+// does not overflow.
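+// Editor's example (unsigned case): "(x urem 4) + ((x udiv 4) urem 8) * 4"
+// simplifies to "x urem 32", since 4 * 8 == 32 does not overflow.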
Value *InstCombinerImpl::SimplifyAddWithRemainder(BinaryOperator &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Value *X, *MulOpV;
- APInt C0, MulOpC;
- bool IsSigned;
- // Match I = X % C0 + MulOpV * C0
- if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
- (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
- C0 == MulOpC) {
- Value *RemOpV;
- APInt C1;
- bool Rem2IsSigned;
- // Match MulOpC = RemOpV % C1
- if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
- IsSigned == Rem2IsSigned) {
- Value *DivOpV;
- APInt DivOpC;
- // Match RemOpV = X / C0
- if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
- C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
- Value *NewDivisor = ConstantInt::get(X->getType(), C0 * C1);
- return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
- : Builder.CreateURem(X, NewDivisor, "urem");
- }
- }
- }
-
- return nullptr;
-}
-
-/// Fold
-/// (1 << NBits) - 1
-/// Into:
-/// ~(-(1 << NBits))
-/// Because a 'not' is better for bit-tracking analysis and other transforms
-/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
-static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *NBits;
- if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
- return nullptr;
-
- Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
- Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
- // Be wary of constant folding.
- if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
- // Always NSW. But NUW propagates from `add`.
- BOp->setHasNoSignedWrap();
- BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- }
-
- return BinaryOperator::CreateNot(NotMask, I.getName());
-}
-
-static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
- assert(I.getOpcode() == Instruction::Add && "Expecting add instruction");
- Type *Ty = I.getType();
- auto getUAddSat = [&]() {
- return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty);
- };
-
- // add (umin X, ~Y), Y --> uaddsat X, Y
- Value *X, *Y;
- if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))),
- m_Deferred(Y))))
- return CallInst::Create(getUAddSat(), { X, Y });
-
- // add (umin X, ~C), C --> uaddsat X, C
- const APInt *C, *NotC;
- if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) &&
- *C == ~*NotC)
- return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) });
-
- return nullptr;
-}
-
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Value *X, *MulOpV;
+ APInt C0, MulOpC;
+ bool IsSigned;
+ // Match I = X % C0 + MulOpV * C0
+ if (((MatchRem(LHS, X, C0, IsSigned) && MatchMul(RHS, MulOpV, MulOpC)) ||
+ (MatchRem(RHS, X, C0, IsSigned) && MatchMul(LHS, MulOpV, MulOpC))) &&
+ C0 == MulOpC) {
+ Value *RemOpV;
+ APInt C1;
+ bool Rem2IsSigned;
+ // Match MulOpC = RemOpV % C1
+ if (MatchRem(MulOpV, RemOpV, C1, Rem2IsSigned) &&
+ IsSigned == Rem2IsSigned) {
+ Value *DivOpV;
+ APInt DivOpC;
+ // Match RemOpV = X / C0
+ if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
+ C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
+ Value *NewDivisor = ConstantInt::get(X->getType(), C0 * C1);
+ return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
+ : Builder.CreateURem(X, NewDivisor, "urem");
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold
+/// (1 << NBits) - 1
+/// Into:
+/// ~(-(1 << NBits))
+/// Because a 'not' is better for bit-tracking analysis and other transforms
+/// than an 'add'. The new shl is always nsw, and is nuw if the old `add` was.
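+/// Editor's illustration: "add (shl i32 1, %n), -1" becomes
+/// "xor (shl i32 -1, %n), -1", i.e. ~(-1 << %n), which equals the same
+/// low-bit mask (1 << %n) - 1.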
+static Instruction *canonicalizeLowbitMask(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *NBits;
+ if (!match(&I, m_Add(m_OneUse(m_Shl(m_One(), m_Value(NBits))), m_AllOnes())))
+ return nullptr;
+
+ Constant *MinusOne = Constant::getAllOnesValue(NBits->getType());
+ Value *NotMask = Builder.CreateShl(MinusOne, NBits, "notmask");
+ // Be wary of constant folding.
+ if (auto *BOp = dyn_cast<BinaryOperator>(NotMask)) {
+ // Always NSW. But NUW propagates from `add`.
+ BOp->setHasNoSignedWrap();
+ BOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ }
+
+ return BinaryOperator::CreateNot(NotMask, I.getName());
+}
+
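+// Editor's note on the fold below: "add (umin X, ~Y), Y" cannot overflow,
+// because umin(X, ~Y) <= ~Y == UINT_MAX - Y. When X <= ~Y the sum is exactly
+// X + Y; otherwise it is (UINT_MAX - Y) + Y == UINT_MAX, which matches the
+// semantics of llvm.uadd.sat(X, Y).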
+static Instruction *foldToUnsignedSaturatedAdd(BinaryOperator &I) {
+ assert(I.getOpcode() == Instruction::Add && "Expecting add instruction");
+ Type *Ty = I.getType();
+ auto getUAddSat = [&]() {
+ return Intrinsic::getDeclaration(I.getModule(), Intrinsic::uadd_sat, Ty);
+ };
+
+ // add (umin X, ~Y), Y --> uaddsat X, Y
+ Value *X, *Y;
+ if (match(&I, m_c_Add(m_c_UMin(m_Value(X), m_Not(m_Value(Y))),
+ m_Deferred(Y))))
+ return CallInst::Create(getUAddSat(), { X, Y });
+
+ // add (umin X, ~C), C --> uaddsat X, C
+ const APInt *C, *NotC;
+ if (match(&I, m_Add(m_UMin(m_Value(X), m_APInt(NotC)), m_APInt(C))) &&
+ *C == ~*NotC)
+ return CallInst::Create(getUAddSat(), { X, ConstantInt::get(Ty, *C) });
+
+ return nullptr;
+}
+
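+// Editor's sketch of the (add/or, untruncated) shape handled below:
+//   %skip = sub i32 32, %nbits
+//   %hi   = lshr i32 %x, %skip
+//   %neg  = icmp slt i32 %x, 0
+//   %ones = shl i32 -1, %nbits
+//   %sign = select i1 %neg, i32 %ones, i32 0
+//   %r    = or i32 %hi, %sign
+// is rewritten to "%r = ashr i32 %x, %skip": the arithmetic shift both
+// extracts the high %nbits bits and sign-extends them.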
Instruction *InstCombinerImpl::
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
BinaryOperator &I) {
- assert((I.getOpcode() == Instruction::Add ||
- I.getOpcode() == Instruction::Or ||
- I.getOpcode() == Instruction::Sub) &&
- "Expecting add/or/sub instruction");
-
- // We have a subtraction/addition between a (potentially truncated) *logical*
- // right-shift of X and a "select".
- Value *X, *Select;
- Instruction *LowBitsToSkip, *Extract;
- if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
- m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
- m_Instruction(Extract))),
- m_Value(Select))))
- return nullptr;
-
- // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS.
- if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select)
- return nullptr;
-
- Type *XTy = X->getType();
- bool HadTrunc = I.getType() != XTy;
-
- // If there was a truncation of extracted value, then we'll need to produce
- // one extra instruction, so we need to ensure one instruction will go away.
- if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Extraction should extract high NBits bits, with shift amount calculated as:
- // low bits to skip = shift bitwidth - high bits to extract
- // The shift amount itself may be extended, and we need to look past zero-ext
- // when matching NBits, that will matter for matching later.
- Constant *C;
- Value *NBits;
- if (!match(
- LowBitsToSkip,
- m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
- !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(C->getType()->getScalarSizeInBits(),
- X->getType()->getScalarSizeInBits()))))
- return nullptr;
-
- // Sign-extending value can be zero-extended if we `sub`tract it,
- // or sign-extended otherwise.
- auto SkipExtInMagic = [&I](Value *&V) {
- if (I.getOpcode() == Instruction::Sub)
- match(V, m_ZExtOrSelf(m_Value(V)));
- else
- match(V, m_SExtOrSelf(m_Value(V)));
- };
-
- // Now, finally validate the sign-extending magic.
- // `select` itself may be appropriately extended, look past that.
- SkipExtInMagic(Select);
-
- ICmpInst::Predicate Pred;
- const APInt *Thr;
- Value *SignExtendingValue, *Zero;
- bool ShouldSignext;
- // It must be a select between two values we will later establish to be a
- // sign-extending value and a zero constant. The condition guarding the
- // sign-extension must be based on a sign bit of the same X we had in `lshr`.
- if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)),
- m_Value(SignExtendingValue), m_Value(Zero))) ||
- !isSignBitCheck(Pred, *Thr, ShouldSignext))
- return nullptr;
-
- // icmp-select pair is commutative.
- if (!ShouldSignext)
- std::swap(SignExtendingValue, Zero);
-
- // If we should not perform sign-extension then we must add/or/subtract zero.
- if (!match(Zero, m_Zero()))
- return nullptr;
- // Otherwise, it should be some constant, left-shifted by the same NBits we
- // had in `lshr`. Said left-shift can also be appropriately extended.
- // Again, we must look past zero-ext when looking for NBits.
- SkipExtInMagic(SignExtendingValue);
- Constant *SignExtendingValueBaseConstant;
- if (!match(SignExtendingValue,
- m_Shl(m_Constant(SignExtendingValueBaseConstant),
- m_ZExtOrSelf(m_Specific(NBits)))))
- return nullptr;
- // If we `sub`, then the constant should be one, else it should be all-ones.
- if (I.getOpcode() == Instruction::Sub
- ? !match(SignExtendingValueBaseConstant, m_One())
- : !match(SignExtendingValueBaseConstant, m_AllOnes()))
- return nullptr;
-
- auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip,
- Extract->getName() + ".sext");
- NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness.
- if (!HadTrunc)
- return NewAShr;
-
- Builder.Insert(NewAShr);
- return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
-}
-
+ assert((I.getOpcode() == Instruction::Add ||
+ I.getOpcode() == Instruction::Or ||
+ I.getOpcode() == Instruction::Sub) &&
+ "Expecting add/or/sub instruction");
+
+ // We have a subtraction/addition between a (potentially truncated) *logical*
+ // right-shift of X and a "select".
+ Value *X, *Select;
+ Instruction *LowBitsToSkip, *Extract;
+ if (!match(&I, m_c_BinOp(m_TruncOrSelf(m_CombineAnd(
+ m_LShr(m_Value(X), m_Instruction(LowBitsToSkip)),
+ m_Instruction(Extract))),
+ m_Value(Select))))
+ return nullptr;
+
+ // `add`/`or` is commutative; but for `sub`, "select" *must* be on RHS.
+ if (I.getOpcode() == Instruction::Sub && I.getOperand(1) != Select)
+ return nullptr;
+
+ Type *XTy = X->getType();
+ bool HadTrunc = I.getType() != XTy;
+
+ // If there was a truncation of extracted value, then we'll need to produce
+ // one extra instruction, so we need to ensure one instruction will go away.
+ if (HadTrunc && !match(&I, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Extraction should extract high NBits bits, with shift amount calculated as:
+ // low bits to skip = shift bitwidth - high bits to extract
+ // The shift amount itself may be extended, and we need to look past zero-ext
+ // when matching NBits, that will matter for matching later.
+ Constant *C;
+ Value *NBits;
+ if (!match(
+ LowBitsToSkip,
+ m_ZExtOrSelf(m_Sub(m_Constant(C), m_ZExtOrSelf(m_Value(NBits))))) ||
+ !match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(C->getType()->getScalarSizeInBits(),
+ X->getType()->getScalarSizeInBits()))))
+ return nullptr;
+
+ // Sign-extending value can be zero-extended if we `sub`tract it,
+ // or sign-extended otherwise.
+ auto SkipExtInMagic = [&I](Value *&V) {
+ if (I.getOpcode() == Instruction::Sub)
+ match(V, m_ZExtOrSelf(m_Value(V)));
+ else
+ match(V, m_SExtOrSelf(m_Value(V)));
+ };
+
+ // Now, finally validate the sign-extending magic.
+ // `select` itself may be appropriately extended, look past that.
+ SkipExtInMagic(Select);
+
+ ICmpInst::Predicate Pred;
+ const APInt *Thr;
+ Value *SignExtendingValue, *Zero;
+ bool ShouldSignext;
+ // It must be a select between two values we will later establish to be a
+ // sign-extending value and a zero constant. The condition guarding the
+ // sign-extension must be based on a sign bit of the same X we had in `lshr`.
+ if (!match(Select, m_Select(m_ICmp(Pred, m_Specific(X), m_APInt(Thr)),
+ m_Value(SignExtendingValue), m_Value(Zero))) ||
+ !isSignBitCheck(Pred, *Thr, ShouldSignext))
+ return nullptr;
+
+ // icmp-select pair is commutative.
+ if (!ShouldSignext)
+ std::swap(SignExtendingValue, Zero);
+
+ // If we should not perform sign-extension then we must add/or/subtract zero.
+ if (!match(Zero, m_Zero()))
+ return nullptr;
+ // Otherwise, it should be some constant, left-shifted by the same NBits we
+ // had in `lshr`. Said left-shift can also be appropriately extended.
+ // Again, we must look past zero-ext when looking for NBits.
+ SkipExtInMagic(SignExtendingValue);
+ Constant *SignExtendingValueBaseConstant;
+ if (!match(SignExtendingValue,
+ m_Shl(m_Constant(SignExtendingValueBaseConstant),
+ m_ZExtOrSelf(m_Specific(NBits)))))
+ return nullptr;
+ // If we `sub`, then the constant should be one, else it should be all-ones.
+ if (I.getOpcode() == Instruction::Sub
+ ? !match(SignExtendingValueBaseConstant, m_One())
+ : !match(SignExtendingValueBaseConstant, m_AllOnes()))
+ return nullptr;
+
+ auto *NewAShr = BinaryOperator::CreateAShr(X, LowBitsToSkip,
+ Extract->getName() + ".sext");
+ NewAShr->copyIRFlags(Extract); // Preserve `exact`-ness.
+ if (!HadTrunc)
+ return NewAShr;
+
+ Builder.Insert(NewAShr);
+ return TruncInst::CreateTruncOrBitCast(NewAShr, I.getType());
+}
+
/// This is a specialization of a more general transform from
/// SimplifyUsingDistributiveLaws. If that code can be made to work optimally
/// for multi-use cases or propagating nsw/nuw, then we would not need this.
@@ -1279,161 +1279,161 @@ static Instruction *factorizeMathWithShlOps(BinaryOperator &I,
}
Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
- if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // (A*B)+(A*C) -> A*(B+C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
+ if (Value *V = SimplifyAddInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // (A*B)+(A*C) -> A*(B+C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
if (Instruction *R = factorizeMathWithShlOps(I, Builder))
return R;
- if (Instruction *X = foldAddWithConstant(I))
- return X;
-
- if (Instruction *X = foldNoWrapAdd(I, Builder))
- return X;
-
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Type *Ty = I.getType();
- if (Ty->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateXor(LHS, RHS);
-
- // X + X --> X << 1
- if (LHS == RHS) {
- auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1));
- Shl->setHasNoSignedWrap(I.hasNoSignedWrap());
- Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- return Shl;
- }
-
- Value *A, *B;
- if (match(LHS, m_Neg(m_Value(A)))) {
- // -A + -B --> -(A + B)
- if (match(RHS, m_Neg(m_Value(B))))
- return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B));
-
- // -A + B --> B - A
- return BinaryOperator::CreateSub(RHS, A);
- }
-
- // A + -B --> A - B
- if (match(RHS, m_Neg(m_Value(B))))
- return BinaryOperator::CreateSub(LHS, B);
-
- if (Value *V = checkForNegativeOperand(I, Builder))
- return replaceInstUsesWith(I, V);
-
- // (A + 1) + ~B --> A - B
- // ~B + (A + 1) --> A - B
- // (~B + A) + 1 --> A - B
- // (A + ~B) + 1 --> A - B
- if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) ||
- match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
- return BinaryOperator::CreateSub(A, B);
-
- // (A + RHS) + RHS --> A + (RHS << 1)
- if (match(LHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(RHS)))))
- return BinaryOperator::CreateAdd(A, Builder.CreateShl(RHS, 1, "reass.add"));
-
- // LHS + (A + LHS) --> A + (LHS << 1)
- if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
- return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
-
- // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
- if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
-
- // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
- const APInt *C1, *C2;
- if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
- APInt one(C2->getBitWidth(), 1);
- APInt minusC1 = -(*C1);
- if (minusC1 == (one << *C2)) {
- Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
- return BinaryOperator::CreateSRem(RHS, NewRHS);
- }
- }
-
- // A+B --> A|B iff A and B have no bits set in common.
- if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
- return BinaryOperator::CreateOr(LHS, RHS);
-
- // add (select X 0 (sub n A)) A --> select X A n
- {
- SelectInst *SI = dyn_cast<SelectInst>(LHS);
- Value *A = RHS;
- if (!SI) {
- SI = dyn_cast<SelectInst>(RHS);
- A = LHS;
- }
- if (SI && SI->hasOneUse()) {
- Value *TV = SI->getTrueValue();
- Value *FV = SI->getFalseValue();
- Value *N;
-
- // Can we fold the add into the argument of the select?
- // We check both true and false select arguments for a matching subtract.
- if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
- // Fold the add into the true select value.
- return SelectInst::Create(SI->getCondition(), N, A);
-
- if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
- // Fold the add into the false select value.
- return SelectInst::Create(SI->getCondition(), A, N);
- }
- }
-
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- // (add (xor A, B) (and A, B)) --> (or A, B)
- // (add (and A, B) (xor A, B)) --> (or A, B)
- if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
- m_c_And(m_Deferred(A), m_Deferred(B)))))
- return BinaryOperator::CreateOr(A, B);
-
- // (add (or A, B) (and A, B)) --> (add A, B)
- // (add (and A, B) (or A, B)) --> (add A, B)
- if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
- m_c_And(m_Deferred(A), m_Deferred(B))))) {
- // Replacing operands in-place to preserve nuw/nsw flags.
- replaceOperand(I, 0, A);
- replaceOperand(I, 1, B);
- return &I;
- }
-
- // TODO(jingyue): Consider willNotOverflowSignedAdd and
- // willNotOverflowUnsignedAdd to reduce the number of invocations of
- // computeKnownBits.
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- if (Instruction *V = canonicalizeLowbitMask(I, Builder))
- return V;
-
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I))
- return SatAdd;
-
+ if (Instruction *X = foldAddWithConstant(I))
+ return X;
+
+ if (Instruction *X = foldNoWrapAdd(I, Builder))
+ return X;
+
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Type *Ty = I.getType();
+ if (Ty->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(LHS, RHS);
+
+ // X + X --> X << 1
+ if (LHS == RHS) {
+ auto *Shl = BinaryOperator::CreateShl(LHS, ConstantInt::get(Ty, 1));
+ Shl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ Shl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ return Shl;
+ }
+
+ Value *A, *B;
+ if (match(LHS, m_Neg(m_Value(A)))) {
+ // -A + -B --> -(A + B)
+ if (match(RHS, m_Neg(m_Value(B))))
+ return BinaryOperator::CreateNeg(Builder.CreateAdd(A, B));
+
+ // -A + B --> B - A
+ return BinaryOperator::CreateSub(RHS, A);
+ }
+
+ // A + -B --> A - B
+ if (match(RHS, m_Neg(m_Value(B))))
+ return BinaryOperator::CreateSub(LHS, B);
+
+ if (Value *V = checkForNegativeOperand(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ // (A + 1) + ~B --> A - B
+ // ~B + (A + 1) --> A - B
+ // (~B + A) + 1 --> A - B
+ // (A + ~B) + 1 --> A - B
+ if (match(&I, m_c_BinOp(m_Add(m_Value(A), m_One()), m_Not(m_Value(B)))) ||
+ match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
+ return BinaryOperator::CreateSub(A, B);
+
+ // (A + RHS) + RHS --> A + (RHS << 1)
+ if (match(LHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(RHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(RHS, 1, "reass.add"));
+
+ // LHS + (A + LHS) --> A + (LHS << 1)
+ if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
+
+ // X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
+ if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
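+
+ // A quick numeric check of the remainder recombination above (illustrative
+ // only; the constants are hypothetical): with X = 137, C0 = 10, C1 = 5,
+ // 137 % 10 + ((137 / 10) % 5) * 10 == 7 + 3 * 10 == 37 == 137 % 50.
+ static_assert(137 % 10 + ((137 / 10) % 5) * 10 == 137 % (10 * 5),
+ "X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1) for X=137, C0=10, C1=5");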
+
+ // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
+ const APInt *C1, *C2;
+ if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
+ APInt one(C2->getBitWidth(), 1);
+ APInt minusC1 = -(*C1);
+ if (minusC1 == (one << *C2)) {
+ Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
+ return BinaryOperator::CreateSRem(RHS, NewRHS);
+ }
+ }
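+
+ // Concrete instance of the srem fold above (illustrative, hand-picked
+ // values): with C1 = -4 and C2 = 2 we have -C1 == 1 << C2 == 4, and
+ // ((X s/ -4) << 2) + X equals X s% 4 for either sign of X. The << 2 is
+ // spelled * 4 below to keep the check in plain, well-defined C++.
+ static_assert((11 / -4) * 4 + 11 == 11 % 4 &&
+ (-11 / -4) * 4 + (-11) == (-11) % 4,
+ "((X s/ C1) << C2) + X == X s% -C1 when -C1 == 1 << C2");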
+
+ // A+B --> A|B iff A and B have no bits set in common.
+ if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
+ return BinaryOperator::CreateOr(LHS, RHS);
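+
+ // For instance (illustrative constants): disjoint bit masks cannot produce
+ // a carry, so their sum and their bitwise or coincide.
+ static_assert((0xF0u + 0x0Fu) == (0xF0u | 0x0Fu),
+ "add equals or when no bits are set in common");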
+
+ // add (select X 0 (sub n A)) A --> select X A n
+ {
+ SelectInst *SI = dyn_cast<SelectInst>(LHS);
+ Value *A = RHS;
+ if (!SI) {
+ SI = dyn_cast<SelectInst>(RHS);
+ A = LHS;
+ }
+ if (SI && SI->hasOneUse()) {
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ Value *N;
+
+ // Can we fold the add into the argument of the select?
+ // We check both true and false select arguments for a matching subtract.
+ if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the true select value.
+ return SelectInst::Create(SI->getCondition(), N, A);
+
+ if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A))))
+ // Fold the add into the false select value.
+ return SelectInst::Create(SI->getCondition(), A, N);
+ }
+ }
+
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ // (add (xor A, B) (and A, B)) --> (or A, B)
+ // (add (and A, B) (xor A, B)) --> (or A, B)
+ if (match(&I, m_c_BinOp(m_Xor(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B)))))
+ return BinaryOperator::CreateOr(A, B);
+
+ // (add (or A, B) (and A, B)) --> (add A, B)
+ // (add (and A, B) (or A, B)) --> (add A, B)
+ if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_c_And(m_Deferred(A), m_Deferred(B))))) {
+ // Replacing operands in-place to preserve nuw/nsw flags.
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, B);
+ return &I;
+ }
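+
+ // Both identities can be spot-checked with hypothetical constants A = 12,
+ // B = 10: xor keeps the bits set in exactly one operand, 'and' keeps the
+ // bits set in both, and 'or' keeps the bits set in at least one, so
+ // (A ^ B) + (A & B) == (A | B) and (A | B) + (A & B) == A + B.
+ static_assert((12 ^ 10) + (12 & 10) == (12 | 10) &&
+ (12 | 10) + (12 & 10) == 12 + 10,
+ "xor/and and or/and sums recombine into or and add");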
+
+ // TODO(jingyue): Consider willNotOverflowSignedAdd and
+ // willNotOverflowUnsignedAdd to reduce the number of invocations of
+ // computeKnownBits.
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ if (Instruction *V = canonicalizeLowbitMask(I, Builder))
+ return V;
+
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ if (Instruction *SatAdd = foldToUnsignedSaturatedAdd(I))
+ return SatAdd;
+
// usub.sat(A, B) + B => umax(A, B)
if (match(&I, m_c_BinOp(
m_OneUse(m_Intrinsic<Intrinsic::usub_sat>(m_Value(A), m_Value(B))),
@@ -1442,286 +1442,286 @@ Instruction *InstCombinerImpl::visitAdd(BinaryOperator &I) {
Builder.CreateIntrinsic(Intrinsic::umax, {I.getType()}, {A, B}));
}
- return Changed ? &I : nullptr;
-}
-
-/// Eliminate an op from a linear interpolation (lerp) pattern.
-static Instruction *factorizeLerp(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *X, *Y, *Z;
- if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y),
- m_OneUse(m_FSub(m_FPOne(),
- m_Value(Z))))),
- m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z))))))
- return nullptr;
-
- // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants]
- Value *XY = Builder.CreateFSubFMF(X, Y, &I);
- Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I);
- return BinaryOperator::CreateFAddFMF(Y, MulZ, &I);
-}
-
-/// Factor a common operand out of fadd/fsub of fmul/fdiv.
-static Instruction *factorizeFAddFSub(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert((I.getOpcode() == Instruction::FAdd ||
- I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub");
- assert(I.hasAllowReassoc() && I.hasNoSignedZeros() &&
- "FP factorization requires FMF");
-
- if (Instruction *Lerp = factorizeLerp(I, Builder))
- return Lerp;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X, *Y, *Z;
- bool IsFMul;
- if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
- (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
- match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
- IsFMul = true;
- else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
- match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
- IsFMul = false;
- else
- return nullptr;
-
- // (X * Z) + (Y * Z) --> (X + Y) * Z
- // (X * Z) - (Y * Z) --> (X - Y) * Z
- // (X / Z) + (Y / Z) --> (X + Y) / Z
- // (X / Z) - (Y / Z) --> (X - Y) / Z
- bool IsFAdd = I.getOpcode() == Instruction::FAdd;
- Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I)
- : Builder.CreateFSubFMF(X, Y, &I);
-
- // Bail out if we just created a denormal constant.
- // TODO: This is copied from a previous implementation. Is it necessary?
- const APFloat *C;
- if (match(XY, m_APFloat(C)) && !C->isNormal())
- return nullptr;
-
- return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I)
- : BinaryOperator::CreateFDivFMF(XY, Z, &I);
-}
-
+ return Changed ? &I : nullptr;
+}
+
+/// Eliminate an op from a linear interpolation (lerp) pattern.
+static Instruction *factorizeLerp(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X, *Y, *Z;
+ if (!match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_Value(Y),
+ m_OneUse(m_FSub(m_FPOne(),
+ m_Value(Z))))),
+ m_OneUse(m_c_FMul(m_Value(X), m_Deferred(Z))))))
+ return nullptr;
+
+ // (Y * (1.0 - Z)) + (X * Z) --> Y + Z * (X - Y) [8 commuted variants]
+ Value *XY = Builder.CreateFSubFMF(X, Y, &I);
+ Value *MulZ = Builder.CreateFMulFMF(Z, XY, &I);
+ return BinaryOperator::CreateFAddFMF(Y, MulZ, &I);
+}
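+
+// A numeric sanity check of the lerp refactoring (illustrative values, all
+// exactly representable, so the reassociated form is bit-identical here; in
+// general the fold leans on the reassoc/nsz fast-math flags):
+// Y*(1-Z) + X*Z == Y + Z*(X-Y) with Y=2, X=10, Z=0.25 gives 4.0 on both sides.
+static_assert(2.0 * (1.0 - 0.25) + 10.0 * 0.25 == 2.0 + 0.25 * (10.0 - 2.0),
+ "lerp forms agree for exactly representable constants");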
+
+/// Factor a common operand out of fadd/fsub of fmul/fdiv.
+static Instruction *factorizeFAddFSub(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert((I.getOpcode() == Instruction::FAdd ||
+ I.getOpcode() == Instruction::FSub) && "Expecting fadd/fsub");
+ assert(I.hasAllowReassoc() && I.hasNoSignedZeros() &&
+ "FP factorization requires FMF");
+
+ if (Instruction *Lerp = factorizeLerp(I, Builder))
+ return Lerp;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X, *Y, *Z;
+ bool IsFMul;
+ if ((match(Op0, m_OneUse(m_FMul(m_Value(X), m_Value(Z)))) &&
+ match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))) ||
+ (match(Op0, m_OneUse(m_FMul(m_Value(Z), m_Value(X)))) &&
+ match(Op1, m_OneUse(m_c_FMul(m_Value(Y), m_Specific(Z))))))
+ IsFMul = true;
+ else if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Z)))) &&
+ match(Op1, m_OneUse(m_FDiv(m_Value(Y), m_Specific(Z)))))
+ IsFMul = false;
+ else
+ return nullptr;
+
+ // (X * Z) + (Y * Z) --> (X + Y) * Z
+ // (X * Z) - (Y * Z) --> (X - Y) * Z
+ // (X / Z) + (Y / Z) --> (X + Y) / Z
+ // (X / Z) - (Y / Z) --> (X - Y) / Z
+ bool IsFAdd = I.getOpcode() == Instruction::FAdd;
+ Value *XY = IsFAdd ? Builder.CreateFAddFMF(X, Y, &I)
+ : Builder.CreateFSubFMF(X, Y, &I);
+
+ // Bail out if we just created a denormal constant.
+ // TODO: This is copied from a previous implementation. Is it necessary?
+ const APFloat *C;
+ if (match(XY, m_APFloat(C)) && !C->isNormal())
+ return nullptr;
+
+ return IsFMul ? BinaryOperator::CreateFMulFMF(XY, Z, &I)
+ : BinaryOperator::CreateFDivFMF(XY, Z, &I);
+}
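+
+// Example of the factorization with hypothetical, exactly representable
+// constants (the real transform is guarded by the reassoc/nsz checks above):
+// (3*2) + (5*2) == (3+5)*2 and (3/2) + (5/2) == (3+5)/2.
+static_assert(3.0 * 2.0 + 5.0 * 2.0 == (3.0 + 5.0) * 2.0 &&
+ 3.0 / 2.0 + 5.0 / 2.0 == (3.0 + 5.0) / 2.0,
+ "factoring the common fmul/fdiv operand out of an fadd");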
+
Instruction *InstCombinerImpl::visitFAdd(BinaryOperator &I) {
- if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
- return FoldedFAdd;
-
- // (-X) + Y --> Y - X
- Value *X, *Y;
- if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y))))
- return BinaryOperator::CreateFSubFMF(Y, X, &I);
-
- // Similar to above, but look through fmul/fdiv for the negated term.
- // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants]
- Value *Z;
- if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))),
- m_Value(Z)))) {
- Value *XY = Builder.CreateFMulFMF(X, Y, &I);
- return BinaryOperator::CreateFSubFMF(Z, XY, &I);
- }
- // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants]
- // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants]
- if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))),
- m_Value(Z))) ||
- match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))),
- m_Value(Z)))) {
- Value *XY = Builder.CreateFDivFMF(X, Y, &I);
- return BinaryOperator::CreateFSubFMF(Z, XY, &I);
- }
-
- // Check for (fadd double (sitofp x), y), see if we can merge this into an
- // integer add followed by a promotion.
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
- Value *LHSIntVal = LHSConv->getOperand(0);
- Type *FPType = LHSConv->getType();
-
- // TODO: This check is overly conservative. In many cases known bits
- // analysis can tell us that the result of the addition has less significant
- // bits than the integer type can hold.
- auto IsValidPromotion = [](Type *FTy, Type *ITy) {
- Type *FScalarTy = FTy->getScalarType();
- Type *IScalarTy = ITy->getScalarType();
-
- // Do we have enough bits in the significand to represent the result of
- // the integer addition?
- unsigned MaxRepresentableBits =
- APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
- return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
- };
-
- // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
- // ... if the constant fits in the integer value. This is useful for things
- // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
- // requires a constant pool load, and generally allows the add to be better
- // instcombined.
- if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
- if (IsValidPromotion(FPType, LHSIntVal->getType())) {
- Constant *CI =
- ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
- if (LHSConv->hasOneUse() &&
- ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
- willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
- return new SIToFPInst(NewAdd, I.getType());
- }
- }
-
- // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
- if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
- Value *RHSIntVal = RHSConv->getOperand(0);
- // It's enough to check LHS types only because we require int types to
- // be the same for this transform.
- if (IsValidPromotion(FPType, LHSIntVal->getType())) {
- // Only do this if x/y have the same type, if at least one of them has a
- // single use (so we don't increase the number of int->fp conversions),
- // and if the integer add will not overflow.
- if (LHSIntVal->getType() == RHSIntVal->getType() &&
- (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
- willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
- // Insert the new integer add.
- Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
- return new SIToFPInst(NewAdd, I.getType());
- }
- }
- }
- }
-
- // Handle specials cases for FAdd with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
- if (Instruction *F = factorizeFAddFSub(I, Builder))
- return F;
- if (Value *V = FAddCombine(Builder).simplify(&I))
- return replaceInstUsesWith(I, V);
- }
-
- return nullptr;
-}
-
-/// Optimize pointer differences into the same array into a size. Consider:
-/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
-/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
+ if (Value *V = SimplifyFAddInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *FoldedFAdd = foldBinOpIntoSelectOrPhi(I))
+ return FoldedFAdd;
+
+ // (-X) + Y --> Y - X
+ Value *X, *Y;
+ if (match(&I, m_c_FAdd(m_FNeg(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateFSubFMF(Y, X, &I);
+
+ // Similar to above, but look through fmul/fdiv for the negated term.
+ // (-X * Y) + Z --> Z - (X * Y) [4 commuted variants]
+ Value *Z;
+ if (match(&I, m_c_FAdd(m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))),
+ m_Value(Z)))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ return BinaryOperator::CreateFSubFMF(Z, XY, &I);
+ }
+ // (-X / Y) + Z --> Z - (X / Y) [2 commuted variants]
+ // (X / -Y) + Z --> Z - (X / Y) [2 commuted variants]
+ if (match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y))),
+ m_Value(Z))) ||
+ match(&I, m_c_FAdd(m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))),
+ m_Value(Z)))) {
+ Value *XY = Builder.CreateFDivFMF(X, Y, &I);
+ return BinaryOperator::CreateFSubFMF(Z, XY, &I);
+ }
+
+ // Check for (fadd double (sitofp x), y) and see if we can merge this into an
+ // integer add followed by a promotion.
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+ Value *LHSIntVal = LHSConv->getOperand(0);
+ Type *FPType = LHSConv->getType();
+
+ // TODO: This check is overly conservative. In many cases known bits
+ // analysis can tell us that the result of the addition has fewer significant
+ // bits than the integer type can hold.
+ auto IsValidPromotion = [](Type *FTy, Type *ITy) {
+ Type *FScalarTy = FTy->getScalarType();
+ Type *IScalarTy = ITy->getScalarType();
+
+ // Do we have enough bits in the significand to represent the result of
+ // the integer addition?
+ unsigned MaxRepresentableBits =
+ APFloat::semanticsPrecision(FScalarTy->getFltSemantics());
+ return IScalarTy->getIntegerBitWidth() <= MaxRepresentableBits;
+ };
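+
+ // For example (assuming IEEE semantics; the numbers are for illustration
+ // only): a double carries a 53-bit significand, so promoting an i32 add
+ // (32 <= 53) is exact, while an i64 add (64 > 53) could drop low bits and
+ // is rejected.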
+
+ // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
+ // ... if the constant fits in the integer value. This is useful for things
+ // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
+ // requires a constant pool load, and generally allows the add to be better
+ // instcombined.
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS))
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ Constant *CI =
+ ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
+ if (LHSConv->hasOneUse() &&
+ ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
+ willNotOverflowSignedAdd(LHSIntVal, CI, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, CI, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+
+ // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
+ if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
+ Value *RHSIntVal = RHSConv->getOperand(0);
+ // It's enough to check LHS types only because we require int types to
+ // be the same for this transform.
+ if (IsValidPromotion(FPType, LHSIntVal->getType())) {
+ // Only do this if x/y have the same type, if at least one of them has a
+ // single use (so we don't increase the number of int->fp conversions),
+ // and if the integer add will not overflow.
+ if (LHSIntVal->getType() == RHSIntVal->getType() &&
+ (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
+ willNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
+ // Insert the new integer add.
+ Value *NewAdd = Builder.CreateNSWAdd(LHSIntVal, RHSIntVal, "addconv");
+ return new SIToFPInst(NewAdd, I.getType());
+ }
+ }
+ }
+ }
+
+ // Handle special cases for FAdd with selects feeding the operation
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, LHS, RHS))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
+ if (Instruction *F = factorizeFAddFSub(I, Builder))
+ return F;
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+ }
+
+ return nullptr;
+}
+
+/// Optimize pointer differences within the same array into a size. Consider:
+/// &A[10] - &A[0]: we should compile this to "10". LHS/RHS are the pointer
+/// operands to the ptrtoint instructions for the LHS/RHS of the subtract.
Value *InstCombinerImpl::OptimizePointerDifference(Value *LHS, Value *RHS,
Type *Ty, bool IsNUW) {
- // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
- // this.
- bool Swapped = false;
- GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
+ // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize
+ // this.
+ bool Swapped = false;
+ GEPOperator *GEP1 = nullptr, *GEP2 = nullptr;
if (!isa<GEPOperator>(LHS) && isa<GEPOperator>(RHS)) {
std::swap(LHS, RHS);
Swapped = true;
}
-
+
// Require at least one GEP with a common base pointer on both sides.
if (auto *LHSGEP = dyn_cast<GEPOperator>(LHS)) {
- // (gep X, ...) - X
- if (LHSGEP->getOperand(0) == RHS) {
- GEP1 = LHSGEP;
+ // (gep X, ...) - X
+ if (LHSGEP->getOperand(0) == RHS) {
+ GEP1 = LHSGEP;
} else if (auto *RHSGEP = dyn_cast<GEPOperator>(RHS)) {
- // (gep X, ...) - (gep X, ...)
- if (LHSGEP->getOperand(0)->stripPointerCasts() ==
+ // (gep X, ...) - (gep X, ...)
+ if (LHSGEP->getOperand(0)->stripPointerCasts() ==
RHSGEP->getOperand(0)->stripPointerCasts()) {
GEP1 = LHSGEP;
- GEP2 = RHSGEP;
- }
- }
- }
-
- if (!GEP1)
- return nullptr;
-
- if (GEP2) {
- // (gep X, ...) - (gep X, ...)
- //
- // Avoid duplicating the arithmetic if there are more than one non-constant
- // indices between the two GEPs and either GEP has a non-constant index and
- // multiple users. If zero non-constant index, the result is a constant and
- // there is no duplication. If one non-constant index, the result is an add
- // or sub with a constant, which is no larger than the original code, and
- // there's no duplicated arithmetic, even if either GEP has multiple
- // users. If more than one non-constant indices combined, as long as the GEP
- // with at least one non-constant index doesn't have multiple users, there
- // is no duplication.
- unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
- unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
- if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
- ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
- (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
- return nullptr;
- }
- }
-
- // Emit the offset of the GEP and an intptr_t.
- Value *Result = EmitGEPOffset(GEP1);
-
- // If this is a single inbounds GEP and the original sub was nuw,
+ GEP2 = RHSGEP;
+ }
+ }
+ }
+
+ if (!GEP1)
+ return nullptr;
+
+ if (GEP2) {
+ // (gep X, ...) - (gep X, ...)
+ //
+ // Avoid duplicating the arithmetic if there is more than one non-constant
+ // index between the two GEPs and either GEP has a non-constant index and
+ // multiple users. With zero non-constant indices, the result is a constant
+ // and there is no duplication. With one non-constant index, the result is
+ // an add or sub with a constant, which is no larger than the original code,
+ // and there's no duplicated arithmetic, even if either GEP has multiple
+ // users. With more than one non-constant index, there is still no
+ // duplication as long as the GEP with at least one non-constant index
+ // doesn't have multiple users.
+ unsigned NumNonConstantIndices1 = GEP1->countNonConstantIndices();
+ unsigned NumNonConstantIndices2 = GEP2->countNonConstantIndices();
+ if (NumNonConstantIndices1 + NumNonConstantIndices2 > 1 &&
+ ((NumNonConstantIndices1 > 0 && !GEP1->hasOneUse()) ||
+ (NumNonConstantIndices2 > 0 && !GEP2->hasOneUse()))) {
+ return nullptr;
+ }
+ }
+
+ // Emit the offset of the GEP and an intptr_t.
+ Value *Result = EmitGEPOffset(GEP1);
+
+ // If this is a single inbounds GEP and the original sub was nuw,
// then the final multiplication is also nuw.
if (auto *I = dyn_cast<Instruction>(Result))
if (IsNUW && !GEP2 && !Swapped && GEP1->isInBounds() &&
I->getOpcode() == Instruction::Mul)
I->setHasNoUnsignedWrap();
-
+
// If we have a 2nd GEP of the same base pointer, subtract the offsets.
// If both GEPs are inbounds, then the subtract does not have signed overflow.
- if (GEP2) {
- Value *Offset = EmitGEPOffset(GEP2);
+ if (GEP2) {
+ Value *Offset = EmitGEPOffset(GEP2);
Result = Builder.CreateSub(Result, Offset, "gepdiff", /* NUW */ false,
GEP1->isInBounds() && GEP2->isInBounds());
- }
-
- // If we have p - gep(p, ...) then we have to negate the result.
- if (Swapped)
- Result = Builder.CreateNeg(Result, "diff.neg");
-
- return Builder.CreateIntCast(Result, Ty, true);
-}
-
+ }
+
+ // If we have p - gep(p, ...) then we have to negate the result.
+ if (Swapped)
+ Result = Builder.CreateNeg(Result, "diff.neg");
+
+ return Builder.CreateIntCast(Result, Ty, true);
+}
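+
+// Sketch of the arithmetic this performs (hypothetical indices, assuming a
+// 4-byte i32 in the data layout): for "gep i32, %A, 10" minus "gep i32, %A, 3"
+// the emitted byte offsets are 40 and 12, so the reported difference is 28.
+static_assert(10 * 4 - 3 * 4 == 28, "difference of gep byte offsets");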
+
Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
- if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // If this is a 'B = x-(-A)', change to B = x+A.
- // We deal with this without involving Negator to preserve NSW flag.
- if (Value *V = dyn_castNegVal(Op1)) {
- BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
-
- if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
- assert(BO->getOpcode() == Instruction::Sub &&
- "Expected a subtraction operator!");
- if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
- Res->setHasNoSignedWrap(true);
- } else {
- if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
- Res->setHasNoSignedWrap(true);
- }
-
- return Res;
- }
-
+ if (Value *V = SimplifySubInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // If this is a 'B = x-(-A)', change to B = x+A.
+ // We deal with this without involving Negator to preserve NSW flag.
+ if (Value *V = dyn_castNegVal(Op1)) {
+ BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
+
+ if (const auto *BO = dyn_cast<BinaryOperator>(Op1)) {
+ assert(BO->getOpcode() == Instruction::Sub &&
+ "Expected a subtraction operator!");
+ if (BO->hasNoSignedWrap() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ } else {
+ if (cast<Constant>(Op1)->isNotMinSignedValue() && I.hasNoSignedWrap())
+ Res->setHasNoSignedWrap(true);
+ }
+
+ return Res;
+ }
+
// Try this before Negator to preserve NSW flag.
if (Instruction *R = factorizeMathWithShlOps(I, Builder))
return R;
@@ -1735,144 +1735,144 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
}
- auto TryToNarrowDeduceFlags = [this, &I, &Op0, &Op1]() -> Instruction * {
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- return Changed ? &I : nullptr;
- };
-
- // First, let's try to interpret `sub a, b` as `add a, (sub 0, b)`,
- // and let's try to sink `(sub 0, b)` into `b` itself. But only if this isn't
- // a pure negation used by a select that looks like abs/nabs.
- bool IsNegation = match(Op0, m_ZeroInt());
- if (!IsNegation || none_of(I.users(), [&I, Op1](const User *U) {
- const Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI)
- return false;
- return match(UI,
- m_Select(m_Value(), m_Specific(Op1), m_Specific(&I))) ||
- match(UI, m_Select(m_Value(), m_Specific(&I), m_Specific(Op1)));
- })) {
- if (Value *NegOp1 = Negator::Negate(IsNegation, Op1, *this))
- return BinaryOperator::CreateAdd(NegOp1, Op0);
- }
- if (IsNegation)
- return TryToNarrowDeduceFlags(); // Should have been handled in Negator!
-
- // (A*B)-(A*C) -> A*(B-C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (I.getType()->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateXor(Op0, Op1);
-
- // Replace (-1 - A) with (~A).
- if (match(Op0, m_AllOnes()))
- return BinaryOperator::CreateNot(Op1);
-
- // (~X) - (~Y) --> Y - X
- Value *X, *Y;
- if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
- return BinaryOperator::CreateSub(Y, X);
-
- // (X + -1) - Y --> ~Y + X
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
- return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
-
- // Reassociate sub/add sequences to create more add instructions and
- // reduce dependency chains:
- // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
- Value *Z;
- if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
- m_Value(Z))))) {
- Value *XZ = Builder.CreateAdd(X, Z);
- Value *YW = Builder.CreateAdd(Y, Op1);
- return BinaryOperator::CreateSub(XZ, YW);
- }
-
- auto m_AddRdx = [](Value *&Vec) {
+ auto TryToNarrowDeduceFlags = [this, &I, &Op0, &Op1]() -> Instruction * {
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+ };
+
+ // First, let's try to interpret `sub a, b` as `add a, (sub 0, b)`,
+ // and let's try to sink `(sub 0, b)` into `b` itself. But only if this isn't
+ // a pure negation used by a select that looks like abs/nabs.
+ bool IsNegation = match(Op0, m_ZeroInt());
+ if (!IsNegation || none_of(I.users(), [&I, Op1](const User *U) {
+ const Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI)
+ return false;
+ return match(UI,
+ m_Select(m_Value(), m_Specific(Op1), m_Specific(&I))) ||
+ match(UI, m_Select(m_Value(), m_Specific(&I), m_Specific(Op1)));
+ })) {
+ if (Value *NegOp1 = Negator::Negate(IsNegation, Op1, *this))
+ return BinaryOperator::CreateAdd(NegOp1, Op0);
+ }
+ if (IsNegation)
+ return TryToNarrowDeduceFlags(); // Should have been handled in Negator!
+
+ // (A*B)-(A*C) -> A*(B-C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateXor(Op0, Op1);
+
+ // Replace (-1 - A) with (~A).
+ if (match(Op0, m_AllOnes()))
+ return BinaryOperator::CreateNot(Op1);
+
+ // (~X) - (~Y) --> Y - X
+ Value *X, *Y;
+ if (match(Op0, m_Not(m_Value(X))) && match(Op1, m_Not(m_Value(Y))))
+ return BinaryOperator::CreateSub(Y, X);
+
+ // (X + -1) - Y --> ~Y + X
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
+ return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
+
+ // Reassociate sub/add sequences to create more add instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateAdd(X, Z);
+ Value *YW = Builder.CreateAdd(Y, Op1);
+ return BinaryOperator::CreateSub(XZ, YW);
+ }
+
+ auto m_AddRdx = [](Value *&Vec) {
return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_add>(m_Value(Vec)));
- };
- Value *V0, *V1;
- if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
- V0->getType() == V1->getType()) {
- // Difference of sums is sum of differences:
- // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
- Value *Sub = Builder.CreateSub(V0, V1);
+ };
+ Value *V0, *V1;
+ if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
+ Value *Sub = Builder.CreateSub(V0, V1);
Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_add,
{Sub->getType()}, {Sub});
- return replaceInstUsesWith(I, Rdx);
- }
-
- if (Constant *C = dyn_cast<Constant>(Op0)) {
- Value *X;
- if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- // C - (zext bool) --> bool ? C - 1 : C
+ return replaceInstUsesWith(I, Rdx);
+ }
+
+ if (Constant *C = dyn_cast<Constant>(Op0)) {
+ Value *X;
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ // C - (zext bool) --> bool ? C - 1 : C
return SelectInst::Create(X, InstCombiner::SubOne(C), C);
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- // C - (sext bool) --> bool ? C + 1 : C
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ // C - (sext bool) --> bool ? C + 1 : C
return SelectInst::Create(X, InstCombiner::AddOne(C), C);
-
- // C - ~X == X + (1+C)
- if (match(Op1, m_Not(m_Value(X))))
+
+ // C - ~X == X + (1+C)
+ if (match(Op1, m_Not(m_Value(X))))
return BinaryOperator::CreateAdd(X, InstCombiner::AddOne(C));
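+
+ // Spot check with hypothetical values C = 10, X = 3: since ~X == -X - 1,
+ // C - ~X == C + X + 1 == X + (1 + C), i.e. 10 - (-4) == 3 + 11 == 14.
+ static_assert(10 - (-3 - 1) == 3 + (1 + 10), "C - ~X == X + (1+C)");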
-
- // Try to fold constant sub into select arguments.
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- // Try to fold constant sub into PHI values.
- if (PHINode *PN = dyn_cast<PHINode>(Op1))
- if (Instruction *R = foldOpIntoPhi(I, PN))
- return R;
-
- Constant *C2;
-
- // C-(C2-X) --> X+(C-C2)
+
+ // Try to fold constant sub into select arguments.
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ // Try to fold constant sub into PHI values.
+ if (PHINode *PN = dyn_cast<PHINode>(Op1))
+ if (Instruction *R = foldOpIntoPhi(I, PN))
+ return R;
+
+ Constant *C2;
+
+ // C-(C2-X) --> X+(C-C2)
if (match(Op1, m_Sub(m_ImmConstant(C2), m_Value(X))))
- return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
- }
-
- const APInt *Op0C;
- if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
- // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
- // zero.
- KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
- if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
- return BinaryOperator::CreateXor(Op1, Op0);
- }
-
- {
- Value *Y;
- // X-(X+Y) == -Y X-(Y+X) == -Y
- if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
- return BinaryOperator::CreateNeg(Y);
-
- // (X-Y)-X == -Y
- if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNeg(Y);
- }
-
- // (sub (or A, B) (and A, B)) --> (xor A, B)
- {
- Value *A, *B;
- if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
- }
-
+ return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
+ }
+
+ const APInt *Op0C;
+ if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
+ // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
+ // zero.
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
+ return BinaryOperator::CreateXor(Op1, Op0);
+ }
+
+ {
+ Value *Y;
+ // X-(X+Y) == -Y X-(Y+X) == -Y
+ if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+
+ // (X-Y)-X == -Y
+ if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Y);
+ }
+
+ // (sub (or A, B) (and A, B)) --> (xor A, B)
+ {
+ Value *A, *B;
+ if (match(Op1, m_And(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+ }
+
// (sub (add A, B) (or A, B)) --> (and A, B)
{
Value *A, *B;
@@ -1889,184 +1889,184 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateOr(A, B);
}
- // (sub (and A, B) (or A, B)) --> neg (xor A, B)
- {
- Value *A, *B;
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
- (Op0->hasOneUse() || Op1->hasOneUse()))
- return BinaryOperator::CreateNeg(Builder.CreateXor(A, B));
- }
-
- // (sub (or A, B), (xor A, B)) --> (and A, B)
- {
- Value *A, *B;
- if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
- }
-
- // (sub (xor A, B) (or A, B)) --> neg (and A, B)
- {
- Value *A, *B;
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
- (Op0->hasOneUse() || Op1->hasOneUse()))
- return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B));
- }
-
- {
- Value *Y;
- // ((X | Y) - X) --> (~X & Y)
- if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(
- Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
- }
-
- {
- // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1))
- Value *X;
- if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1),
- m_OneUse(m_Neg(m_Value(X))))))) {
- return BinaryOperator::CreateNeg(Builder.CreateAnd(
- Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType()))));
- }
- }
-
- {
- // (sub (and Op1, C), Op1) --> neg (and Op1, ~C)
- Constant *C;
- if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) {
- return BinaryOperator::CreateNeg(
- Builder.CreateAnd(Op1, Builder.CreateNot(C)));
- }
- }
-
- {
- // If we have a subtraction between some value and a select between
- // said value and something else, sink subtraction into select hands, i.e.:
- // sub (select %Cond, %TrueVal, %FalseVal), %Op1
- // ->
- // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1)
- // or
- // sub %Op0, (select %Cond, %TrueVal, %FalseVal)
- // ->
- // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal)
- // This will result in select between new subtraction and 0.
- auto SinkSubIntoSelect =
- [Ty = I.getType()](Value *Select, Value *OtherHandOfSub,
- auto SubBuilder) -> Instruction * {
- Value *Cond, *TrueVal, *FalseVal;
- if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal),
- m_Value(FalseVal)))))
- return nullptr;
- if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal)
- return nullptr;
- // While it is really tempting to just create two subtractions and let
- // InstCombine fold one of those to 0, it isn't possible to do so
- // because of worklist visitation order. So ugly it is.
- bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal;
- Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal);
- Constant *Zero = Constant::getNullValue(Ty);
- SelectInst *NewSel =
- SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub,
- OtherHandOfSubIsTrueVal ? NewSub : Zero);
- // Preserve prof metadata if any.
- NewSel->copyMetadata(cast<Instruction>(*Select));
- return NewSel;
- };
- if (Instruction *NewSel = SinkSubIntoSelect(
- /*Select=*/Op0, /*OtherHandOfSub=*/Op1,
- [Builder = &Builder, Op1](Value *OtherHandOfSelect) {
- return Builder->CreateSub(OtherHandOfSelect,
- /*OtherHandOfSub=*/Op1);
- }))
- return NewSel;
- if (Instruction *NewSel = SinkSubIntoSelect(
- /*Select=*/Op1, /*OtherHandOfSub=*/Op0,
- [Builder = &Builder, Op0](Value *OtherHandOfSelect) {
- return Builder->CreateSub(/*OtherHandOfSub=*/Op0,
- OtherHandOfSelect);
- }))
- return NewSel;
- }
-
- // (X - (X & Y)) --> (X & ~Y)
- if (match(Op1, m_c_And(m_Specific(Op0), m_Value(Y))) &&
- (Op1->hasOneUse() || isa<Constant>(Y)))
- return BinaryOperator::CreateAnd(
- Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
-
- {
- // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
- // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
- // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
- // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
- // So long as O here is freely invertible, this will be neutral or a win.
- Value *LHS, *RHS, *A;
- Value *NotA = Op0, *MinMax = Op1;
- SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
- if (!SelectPatternResult::isMinOrMax(SPF)) {
- NotA = Op1;
- MinMax = Op0;
- SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
- }
- if (SelectPatternResult::isMinOrMax(SPF) &&
- match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
- if (NotA == LHS)
- std::swap(LHS, RHS);
- // LHS is now O above and expected to have at least 2 uses (the min/max)
- // NotA is epected to have 2 uses from the min/max and 1 from the sub.
- if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
- !NotA->hasNUsesOrMore(4)) {
- // Note: We don't generate the inverse max/min, just create the not of
- // it and let other folds do the rest.
- Value *Not = Builder.CreateNot(MinMax);
- if (NotA == Op0)
- return BinaryOperator::CreateSub(Not, A);
- else
- return BinaryOperator::CreateSub(A, Not);
- }
- }
- }
-
- // Optimize pointer differences into the same array into a size. Consider:
- // &A[10] - &A[0]: we should compile this to "10".
- Value *LHSOp, *RHSOp;
- if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
- match(Op1, m_PtrToInt(m_Value(RHSOp))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
- I.hasNoUnsignedWrap()))
- return replaceInstUsesWith(I, Res);
-
- // trunc(p)-trunc(q) -> trunc(p-q)
- if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
- match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
- if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
- /* IsNUW */ false))
- return replaceInstUsesWith(I, Res);
-
- // Canonicalize a shifty way to code absolute value to the common pattern.
- // There are 2 potential commuted variants.
- // We're relying on the fact that we only do this transform when the shift has
- // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
- // instructions).
- Value *A;
- const APInt *ShAmt;
- Type *Ty = I.getType();
- if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
- match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
- // B = ashr i32 A, 31 ; smear the sign bit
- // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
- // --> (A < 0) ? -A : A
- Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- // Copy the nuw/nsw flags from the sub to the negate.
- Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
- I.hasNoSignedWrap());
- return SelectInst::Create(Cmp, Neg, A);
- }
-
+ // (sub (and A, B) (or A, B)) --> neg (xor A, B)
+ {
+ Value *A, *B;
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))
+ return BinaryOperator::CreateNeg(Builder.CreateXor(A, B));
+ }
+
+ // (sub (or A, B), (xor A, B)) --> (and A, B)
+ {
+ Value *A, *B;
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ // (sub (xor A, B) (or A, B)) --> neg (and A, B)
+ {
+ Value *A, *B;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))
+ return BinaryOperator::CreateNeg(Builder.CreateAnd(A, B));
+ }
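+
+ // The three or/xor/and differences above can be spot-checked with
+ // hypothetical constants A = 12, B = 10, where (A|B) == 14, (A^B) == 6 and
+ // (A&B) == 8.
+ static_assert((12 | 10) - (12 ^ 10) == (12 & 10) &&
+ (12 & 10) - (12 | 10) == -(12 ^ 10) &&
+ (12 ^ 10) - (12 | 10) == -(12 & 10),
+ "or/xor/and subtraction identities");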
+
+ {
+ Value *Y;
+ // ((X | Y) - X) --> (~X & Y)
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(
+ Y, Builder.CreateNot(Op1, Op1->getName() + ".not"));
+ }
+
+ {
+ // (sub (and Op1, (neg X)), Op1) --> neg (and Op1, (add X, -1))
+ Value *X;
+ if (match(Op0, m_OneUse(m_c_And(m_Specific(Op1),
+ m_OneUse(m_Neg(m_Value(X))))))) {
+ return BinaryOperator::CreateNeg(Builder.CreateAnd(
+ Op1, Builder.CreateAdd(X, Constant::getAllOnesValue(I.getType()))));
+ }
+ }
+
+ {
+ // (sub (and Op1, C), Op1) --> neg (and Op1, ~C)
+ Constant *C;
+ if (match(Op0, m_OneUse(m_And(m_Specific(Op1), m_Constant(C))))) {
+ return BinaryOperator::CreateNeg(
+ Builder.CreateAnd(Op1, Builder.CreateNot(C)));
+ }
+ }
+
+ {
+ // If we have a subtraction between some value and a select between
+ // said value and something else, sink subtraction into select hands, i.e.:
+ // sub (select %Cond, %TrueVal, %FalseVal), %Op1
+ // ->
+ // select %Cond, (sub %TrueVal, %Op1), (sub %FalseVal, %Op1)
+ // or
+ // sub %Op0, (select %Cond, %TrueVal, %FalseVal)
+ // ->
+ // select %Cond, (sub %Op0, %TrueVal), (sub %Op0, %FalseVal)
+ // This will result in select between new subtraction and 0.
+ auto SinkSubIntoSelect =
+ [Ty = I.getType()](Value *Select, Value *OtherHandOfSub,
+ auto SubBuilder) -> Instruction * {
+ Value *Cond, *TrueVal, *FalseVal;
+ if (!match(Select, m_OneUse(m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_Value(FalseVal)))))
+ return nullptr;
+ if (OtherHandOfSub != TrueVal && OtherHandOfSub != FalseVal)
+ return nullptr;
+ // While it is really tempting to just create two subtractions and let
+ // InstCombine fold one of those to 0, it isn't possible to do so
+ // because of worklist visitation order. So ugly it is.
+ bool OtherHandOfSubIsTrueVal = OtherHandOfSub == TrueVal;
+ Value *NewSub = SubBuilder(OtherHandOfSubIsTrueVal ? FalseVal : TrueVal);
+ Constant *Zero = Constant::getNullValue(Ty);
+ SelectInst *NewSel =
+ SelectInst::Create(Cond, OtherHandOfSubIsTrueVal ? Zero : NewSub,
+ OtherHandOfSubIsTrueVal ? NewSub : Zero);
+ // Preserve prof metadata if any.
+ NewSel->copyMetadata(cast<Instruction>(*Select));
+ return NewSel;
+ };
+ if (Instruction *NewSel = SinkSubIntoSelect(
+ /*Select=*/Op0, /*OtherHandOfSub=*/Op1,
+ [Builder = &Builder, Op1](Value *OtherHandOfSelect) {
+ return Builder->CreateSub(OtherHandOfSelect,
+ /*OtherHandOfSub=*/Op1);
+ }))
+ return NewSel;
+ if (Instruction *NewSel = SinkSubIntoSelect(
+ /*Select=*/Op1, /*OtherHandOfSub=*/Op0,
+ [Builder = &Builder, Op0](Value *OtherHandOfSelect) {
+ return Builder->CreateSub(/*OtherHandOfSub=*/Op0,
+ OtherHandOfSelect);
+ }))
+ return NewSel;
+ }
+
+ // (X - (X & Y)) --> (X & ~Y)
+ if (match(Op1, m_c_And(m_Specific(Op0), m_Value(Y))) &&
+ (Op1->hasOneUse() || isa<Constant>(Y)))
+ return BinaryOperator::CreateAnd(
+ Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
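+
+ // E.g. (hypothetical constants X = 13, Y = 11): X & Y == 9 and 13 - 9 == 4,
+ // which is 13 & ~11 -- i.e. Y's bits cleared out of X.
+ static_assert(13u - (13u & 11u) == (13u & ~11u), "X - (X & Y) == X & ~Y");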
+
+ {
+ // ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
+ // ~A - Min/Max(O, ~A) -> Max/Min(A, ~O) - A
+ // Min/Max(~A, O) - ~A -> A - Max/Min(A, ~O)
+ // Min/Max(O, ~A) - ~A -> A - Max/Min(A, ~O)
+ // So long as O here is freely invertible, this will be neutral or a win.
+ Value *LHS, *RHS, *A;
+ Value *NotA = Op0, *MinMax = Op1;
+ SelectPatternFlavor SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
+ if (!SelectPatternResult::isMinOrMax(SPF)) {
+ NotA = Op1;
+ MinMax = Op0;
+ SPF = matchSelectPattern(MinMax, LHS, RHS).Flavor;
+ }
+ if (SelectPatternResult::isMinOrMax(SPF) &&
+ match(NotA, m_Not(m_Value(A))) && (NotA == LHS || NotA == RHS)) {
+ if (NotA == LHS)
+ std::swap(LHS, RHS);
+ // LHS is now O above and expected to have at least 2 uses (the min/max)
+ // NotA is expected to have 2 uses from the min/max and 1 from the sub.
+ if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
+ !NotA->hasNUsesOrMore(4)) {
+ // Note: We don't generate the inverse max/min, just create the not of
+ // it and let other folds do the rest.
+ Value *Not = Builder.CreateNot(MinMax);
+ if (NotA == Op0)
+ return BinaryOperator::CreateSub(Not, A);
+ else
+ return BinaryOperator::CreateSub(A, Not);
+ }
+ }
+ }
+
+ // Optimize pointer differences within the same array into a size. Consider:
+ // &A[10] - &A[0]: we should compile this to "10".
+ Value *LHSOp, *RHSOp;
+ if (match(Op0, m_PtrToInt(m_Value(LHSOp))) &&
+ match(Op1, m_PtrToInt(m_Value(RHSOp))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+ I.hasNoUnsignedWrap()))
+ return replaceInstUsesWith(I, Res);
+
+ // trunc(p)-trunc(q) -> trunc(p-q)
+ if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) &&
+ match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp)))))
+ if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType(),
+ /* IsNUW */ false))
+ return replaceInstUsesWith(I, Res);
+
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 2 potential commuted variants.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the xor has exactly 1 use (otherwise, we might increase
+ // instructions).
+ Value *A;
+ const APInt *ShAmt;
+ Type *Ty = I.getType();
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Xor(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // sub (xor A, B), B ; flip bits if negative and subtract -1 (add 1)
+ // --> (A < 0) ? -A : A
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the sub to the negate.
+ Value *Neg = Builder.CreateNeg(A, "", I.hasNoUnsignedWrap(),
+ I.hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
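+
+ // Worked example of the shifty-abs pattern (illustrative i32 values): for
+ // A = -7 the smeared sign bit B is -1; xor with all-ones gives -A - 1 == 6,
+ // and subtracting B adds the 1 back, yielding |A| == 7. For non-negative A
+ // both steps are no-ops.
+ static_assert((-(-7) - 1) - (-1) == 7, "xor/sub sign-smear abs of -7 is 7");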
+
// If we are subtracting a low-bit masked subset of some value from an add
// of that same value with no low bits changed, that is clearing some low bits
// of the sum:
@@ -2081,238 +2081,238 @@ Instruction *InstCombinerImpl::visitSub(BinaryOperator &I) {
return BinaryOperator::CreateAnd(Op0, ConstantInt::get(Ty, ~(*AndC)));
}
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- return TryToNarrowDeduceFlags();
-}
-
-/// This eliminates floating-point negation in either 'fneg(X)' or
-/// 'fsub(-0.0, X)' form by combining into a constant operand.
-static Instruction *foldFNegIntoConstant(Instruction &I) {
- Value *X;
- Constant *C;
-
- // Fold negation into constant operand. This is limited with one-use because
- // fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
- // -(X * C) --> X * (-C)
- // FIXME: It's arguable whether these should be m_OneUse or not. The current
- // belief is that the FNeg allows for better reassociation opportunities.
- if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
- // -(X / C) --> X / (-C)
- if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
- // -(C / X) --> (-C) / X
- if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
- return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
-
- // With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
- // -(X + C) --> -X + -C --> -C - X
- if (I.hasNoSignedZeros() &&
- match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
- return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
-
- return nullptr;
-}
-
-static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
- InstCombiner::BuilderTy &Builder) {
- Value *FNeg;
- if (!match(&I, m_FNeg(m_Value(FNeg))))
- return nullptr;
-
- Value *X, *Y;
- if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
-
- if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
-
- return nullptr;
-}
-
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ return TryToNarrowDeduceFlags();
+}
+
+/// This eliminates floating-point negation in either 'fneg(X)' or
+/// 'fsub(-0.0, X)' form by combining into a constant operand.
+static Instruction *foldFNegIntoConstant(Instruction &I) {
+ Value *X;
+ Constant *C;
+
+ // Fold negation into constant operand. This is limited with one-use because
+ // fneg is assumed better for analysis and cheaper in codegen than fmul/fdiv.
+ // -(X * C) --> X * (-C)
+ // FIXME: It's arguable whether these should be m_OneUse or not. The current
+ // belief is that the FNeg allows for better reassociation opportunities.
+ if (match(&I, m_FNeg(m_OneUse(m_FMul(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
+ // -(X / C) --> X / (-C)
+ if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
+ // -(C / X) --> (-C) / X
+ if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ // With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
+ // -(X + C) --> -X + -C --> -C - X
+ if (I.hasNoSignedZeros() &&
+ match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ return nullptr;
+}
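+
+// For example (hypothetical, exactly representable constants): folding the
+// negation into the constant gives -(3.0 * 2.5) == 3.0 * -2.5 and
+// -(3.0 / 2.0) == 3.0 / -2.0.
+static_assert(-(3.0 * 2.5) == 3.0 * -2.5 && -(3.0 / 2.0) == 3.0 / -2.0,
+ "fneg folded into the constant operand of fmul/fdiv");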
+
+static Instruction *hoistFNegAboveFMulFDiv(Instruction &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *FNeg;
+ if (!match(&I, m_FNeg(m_Value(FNeg))))
+ return nullptr;
+
+ Value *X, *Y;
+ if (match(FNeg, m_OneUse(m_FMul(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFMulFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
+
+ if (match(FNeg, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFDivFMF(Builder.CreateFNegFMF(X, &I), Y, &I);
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitFNeg(UnaryOperator &I) {
- Value *Op = I.getOperand(0);
-
- if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
+ Value *Op = I.getOperand(0);
+
+ if (Value *V = SimplifyFNegInst(Op, I.getFastMathFlags(),
getSimplifyQuery().getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldFNegIntoConstant(I))
- return X;
-
- Value *X, *Y;
-
- // If we can ignore the sign of zeros: -(X - Y) --> (Y - X)
- if (I.hasNoSignedZeros() &&
- match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y)))))
- return BinaryOperator::CreateFSubFMF(Y, X, &I);
-
- if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
- return R;
-
- return nullptr;
-}
-
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldFNegIntoConstant(I))
+ return X;
+
+ Value *X, *Y;
+
+ // If we can ignore the sign of zeros: -(X - Y) --> (Y - X)
+ if (I.hasNoSignedZeros() &&
+ match(Op, m_OneUse(m_FSub(m_Value(X), m_Value(Y)))))
+ return BinaryOperator::CreateFSubFMF(Y, X, &I);
+
+ if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
+ return R;
+
+ return nullptr;
+}
+
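
Illustration (not part of the change above): the nsz requirement on the -(X - Y) --> (Y - X) fold is exactly about the sign of a zero result; a minimal standalone check:

#include <cassert>
#include <cmath>

int main() {
  double X = 1.25, Y = 0.25, Z = 2.0;
  assert(-(X - Y) == (Y - X));      // agrees whenever the result is nonzero
  assert(std::signbit(-(Z - Z)) &&  // -(2.0 - 2.0) is -0.0, but
         !std::signbit(Z - Z));     // 2.0 - 2.0 is +0.0
  return 0;
}
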
Instruction *InstCombinerImpl::visitFSub(BinaryOperator &I) {
- if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
+ if (Value *V = SimplifyFSubInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
getSimplifyQuery().getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Subtraction from -0.0 is the canonical form of fneg.
- // fsub -0.0, X ==> fneg X
- // fsub nsz 0.0, X ==> fneg nsz X
- //
-  // FIXME: This matcher does not respect FTZ or DAZ yet:
- // fsub -0.0, Denorm ==> +-0
- // fneg Denorm ==> -Denorm
- Value *Op;
- if (match(&I, m_FNeg(m_Value(Op))))
- return UnaryOperator::CreateFNegFMF(Op, &I);
-
- if (Instruction *X = foldFNegIntoConstant(I))
- return X;
-
- if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
- return R;
-
- Value *X, *Y;
- Constant *C;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
- // Canonicalize to fadd to make analysis easier.
- // This can also help codegen because fadd is commutative.
- // Note that if this fsub was really an fneg, the fadd with -0.0 will get
- // killed later. We still limit that particular transform with 'hasOneUse'
- // because an fneg is assumed better/cheaper than a generic fsub.
- if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
- if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
- Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
- return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
- }
- }
-
- // (-X) - Op1 --> -(X + Op1)
- if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
- match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
- Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
- return UnaryOperator::CreateFNegFMF(FAdd, &I);
- }
-
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *NV = FoldOpIntoSelect(I, SI))
- return NV;
-
- // X - C --> X + (-C)
- // But don't transform constant expressions because there's an inverse fold
- // for X + (-Y) --> X - Y.
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Subtraction from -0.0 is the canonical form of fneg.
+ // fsub -0.0, X ==> fneg X
+ // fsub nsz 0.0, X ==> fneg nsz X
+ //
+  // FIXME: This matcher does not respect FTZ or DAZ yet:
+ // fsub -0.0, Denorm ==> +-0
+ // fneg Denorm ==> -Denorm
+ Value *Op;
+ if (match(&I, m_FNeg(m_Value(Op))))
+ return UnaryOperator::CreateFNegFMF(Op, &I);
+
+ if (Instruction *X = foldFNegIntoConstant(I))
+ return X;
+
+ if (Instruction *R = hoistFNegAboveFMulFDiv(I, Builder))
+ return R;
+
+ Value *X, *Y;
+ Constant *C;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ // If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
+ // Canonicalize to fadd to make analysis easier.
+ // This can also help codegen because fadd is commutative.
+ // Note that if this fsub was really an fneg, the fadd with -0.0 will get
+ // killed later. We still limit that particular transform with 'hasOneUse'
+ // because an fneg is assumed better/cheaper than a generic fsub.
+ if (I.hasNoSignedZeros() || CannotBeNegativeZero(Op0, SQ.TLI)) {
+ if (match(Op1, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *NewSub = Builder.CreateFSubFMF(Y, X, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, NewSub, &I);
+ }
+ }
+
+ // (-X) - Op1 --> -(X + Op1)
+ if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
+ match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
+ Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
+ return UnaryOperator::CreateFNegFMF(FAdd, &I);
+ }
+
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *NV = FoldOpIntoSelect(I, SI))
+ return NV;
+
+ // X - C --> X + (-C)
+ // But don't transform constant expressions because there's an inverse fold
+ // for X + (-Y) --> X - Y.
if (match(Op1, m_ImmConstant(C)))
- return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
-
- // X - (-Y) --> X + Y
- if (match(Op1, m_FNeg(m_Value(Y))))
- return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
-
- // Similar to above, but look through a cast of the negated value:
- // X - (fptrunc(-Y)) --> X + fptrunc(Y)
- Type *Ty = I.getType();
- if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
- return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
-
- // X - (fpext(-Y)) --> X + fpext(Y)
- if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
- return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
-
- // Similar to above, but look through fmul/fdiv of the negated value:
- // Op0 - (-X * Y) --> Op0 + (X * Y)
- // Op0 - (Y * -X) --> Op0 + (X * Y)
- if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) {
- Value *FMul = Builder.CreateFMulFMF(X, Y, &I);
- return BinaryOperator::CreateFAddFMF(Op0, FMul, &I);
- }
- // Op0 - (-X / Y) --> Op0 + (X / Y)
- // Op0 - (X / -Y) --> Op0 + (X / Y)
- if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) ||
- match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) {
- Value *FDiv = Builder.CreateFDivFMF(X, Y, &I);
- return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I);
- }
-
- // Handle special cases for FSub with selects feeding the operation
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
- // (Y - X) - Y --> -X
- if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
- return UnaryOperator::CreateFNegFMF(X, &I);
-
- // Y - (X + Y) --> -X
- // Y - (Y + X) --> -X
- if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
- return UnaryOperator::CreateFNegFMF(X, &I);
-
- // (X * C) - X --> X * (C - 1.0)
- if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
- Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
- return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
- }
- // X - (X * C) --> X * (1.0 - C)
- if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
- Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
- return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
- }
-
- // Reassociate fsub/fadd sequences to create more fadd instructions and
- // reduce dependency chains:
- // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
- Value *Z;
- if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
- m_Value(Z))))) {
- Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
- Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
- return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
- }
-
- auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
+ return BinaryOperator::CreateFAddFMF(Op0, ConstantExpr::getFNeg(C), &I);
+
+ // X - (-Y) --> X + Y
+ if (match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateFAddFMF(Op0, Y, &I);
+
+ // Similar to above, but look through a cast of the negated value:
+ // X - (fptrunc(-Y)) --> X + fptrunc(Y)
+ Type *Ty = I.getType();
+ if (match(Op1, m_OneUse(m_FPTrunc(m_FNeg(m_Value(Y))))))
+ return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPTrunc(Y, Ty), &I);
+
+ // X - (fpext(-Y)) --> X + fpext(Y)
+ if (match(Op1, m_OneUse(m_FPExt(m_FNeg(m_Value(Y))))))
+ return BinaryOperator::CreateFAddFMF(Op0, Builder.CreateFPExt(Y, Ty), &I);
+
+ // Similar to above, but look through fmul/fdiv of the negated value:
+ // Op0 - (-X * Y) --> Op0 + (X * Y)
+ // Op0 - (Y * -X) --> Op0 + (X * Y)
+ if (match(Op1, m_OneUse(m_c_FMul(m_FNeg(m_Value(X)), m_Value(Y))))) {
+ Value *FMul = Builder.CreateFMulFMF(X, Y, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, FMul, &I);
+ }
+ // Op0 - (-X / Y) --> Op0 + (X / Y)
+ // Op0 - (X / -Y) --> Op0 + (X / Y)
+ if (match(Op1, m_OneUse(m_FDiv(m_FNeg(m_Value(X)), m_Value(Y)))) ||
+ match(Op1, m_OneUse(m_FDiv(m_Value(X), m_FNeg(m_Value(Y)))))) {
+ Value *FDiv = Builder.CreateFDivFMF(X, Y, &I);
+ return BinaryOperator::CreateFAddFMF(Op0, FDiv, &I);
+ }
+
+ // Handle special cases for FSub with selects feeding the operation
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
+ // (Y - X) - Y --> -X
+ if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
+ return UnaryOperator::CreateFNegFMF(X, &I);
+
+ // Y - (X + Y) --> -X
+ // Y - (Y + X) --> -X
+ if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
+ return UnaryOperator::CreateFNegFMF(X, &I);
+
+ // (X * C) - X --> X * (C - 1.0)
+ if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
+ Constant *CSubOne = ConstantExpr::getFSub(C, ConstantFP::get(Ty, 1.0));
+ return BinaryOperator::CreateFMulFMF(Op1, CSubOne, &I);
+ }
+ // X - (X * C) --> X * (1.0 - C)
+ if (match(Op1, m_FMul(m_Specific(Op0), m_Constant(C)))) {
+ Constant *OneSubC = ConstantExpr::getFSub(ConstantFP::get(Ty, 1.0), C);
+ return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
+ }
+
+ // Reassociate fsub/fadd sequences to create more fadd instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
+ Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
+ }
+
+ auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
return m_OneUse(m_Intrinsic<Intrinsic::vector_reduce_fadd>(m_Value(Sum),
m_Value(Vec)));
- };
- Value *A0, *A1, *V0, *V1;
- if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
- V0->getType() == V1->getType()) {
- // Difference of sums is sum of differences:
- // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
- Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
+ };
+ Value *A0, *A1, *V0, *V1;
+ if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
+ Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
Value *Rdx = Builder.CreateIntrinsic(Intrinsic::vector_reduce_fadd,
{Sub->getType()}, {A0, Sub}, &I);
- return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
- }
-
- if (Instruction *F = factorizeFAddFSub(I, Builder))
- return F;
-
- // TODO: This performs reassociative folds for FP ops. Some fraction of the
- // functionality has been subsumed by simple pattern matching here and in
- // InstSimplify. We should let a dedicated reassociation pass handle more
- // complex pattern matching and remove this from InstCombine.
- if (Value *V = FAddCombine(Builder).simplify(&I))
- return replaceInstUsesWith(I, V);
-
- // (X - Y) - Op1 --> X - (Y + Op1)
- if (match(Op0, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
- Value *FAdd = Builder.CreateFAddFMF(Y, Op1, &I);
- return BinaryOperator::CreateFSubFMF(X, FAdd, &I);
- }
- }
-
- return nullptr;
-}
+ return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
+ }
+
+ if (Instruction *F = factorizeFAddFSub(I, Builder))
+ return F;
+
+ // TODO: This performs reassociative folds for FP ops. Some fraction of the
+ // functionality has been subsumed by simple pattern matching here and in
+ // InstSimplify. We should let a dedicated reassociation pass handle more
+ // complex pattern matching and remove this from InstCombine.
+ if (Value *V = FAddCombine(Builder).simplify(&I))
+ return replaceInstUsesWith(I, V);
+
+ // (X - Y) - Op1 --> X - (Y + Op1)
+ if (match(Op0, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *FAdd = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(X, FAdd, &I);
+ }
+ }
+
+ return nullptr;
+}
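
Illustration (not part of the change above): two of the scalar facts the fsub folds lean on, checked bit-for-bit with ordinary doubles, assuming the default FP environment (no FTZ/DAZ, as the FIXME in the code notes). The sameBits helper is made up here for the comparison:

#include <cassert>
#include <cstdint>
#include <cstring>

// Bitwise comparison so +0.0 and -0.0 are told apart.
static bool sameBits(double A, double B) {
  std::uint64_t IA, IB;
  std::memcpy(&IA, &A, sizeof A);
  std::memcpy(&IB, &B, sizeof B);
  return IA == IB;
}

int main() {
  const double Samples[] = {0.0, -0.0, 1.0, -3.75, 1e-300, -1e300};
  for (double X : Samples) {
    assert(sameBits(-0.0 - X, -X)); // fsub -0.0, X ==> fneg X
    assert(X - 2.5 == X + (-2.5));  // X - C ==> X + (-C)
  }
  return 0;
}
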
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index fcf09f9216..85a7abe211 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -1,1632 +1,1632 @@
-//===- InstCombineAndOrXor.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitAnd, visitOr, and visitXor functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineAndOrXor.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitAnd, visitOr, and visitXor functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
-/// a four bit mask.
-static unsigned getFCmpCode(FCmpInst::Predicate CC) {
- assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
- "Unexpected FCmp predicate!");
- // Take advantage of the bit pattern of FCmpInst::Predicate here.
- // U L G E
- static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
- static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
- static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
- static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
- static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
- static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
- static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
- static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
- static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
- static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
- static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
- static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
- static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
- static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
- static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
- static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
- return CC;
-}
-
-/// This is the complement of getICmpCode, which turns an opcode and two
-/// operands into either a constant true or false, or a brand new ICmp
-/// instruction. The sign is passed in to determine which kind of predicate to
-/// use in the new icmp instruction.
-static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate NewPred;
- if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred))
- return TorF;
- return Builder.CreateICmp(NewPred, LHS, RHS);
-}
-
-/// This is the complement of getFCmpCode, which turns an opcode and two
-/// operands into either a FCmp instruction, or a true/false constant.
-static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- const auto Pred = static_cast<FCmpInst::Predicate>(Code);
- assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
- "Unexpected FCmp predicate!");
- if (Pred == FCmpInst::FCMP_FALSE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
- if (Pred == FCmpInst::FCMP_TRUE)
- return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
- return Builder.CreateFCmp(Pred, LHS, RHS);
-}
-
-/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
-/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
-/// \param I Binary operator to transform.
-/// \return Pointer to node that must replace the original binary operator, or
-/// null pointer if no transformation was made.
-static Value *SimplifyBSwap(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
-
- Value *OldLHS = I.getOperand(0);
- Value *OldRHS = I.getOperand(1);
-
- Value *NewLHS;
- if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
- return nullptr;
-
- Value *NewRHS;
- const APInt *C;
-
- if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
- // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
- if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
- return nullptr;
- // NewRHS initialized by the matcher.
- } else if (match(OldRHS, m_APInt(C))) {
- // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
- if (!OldLHS->hasOneUse())
- return nullptr;
- NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
- } else
- return nullptr;
-
- Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
- Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
- I.getType());
- return Builder.CreateCall(F, BinOp);
-}
-
-/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
-/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates
-/// whether to treat V, Lo, and Hi as signed or not.
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into
+/// a four bit mask.
+static unsigned getFCmpCode(FCmpInst::Predicate CC) {
+ assert(FCmpInst::FCMP_FALSE <= CC && CC <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ // Take advantage of the bit pattern of FCmpInst::Predicate here.
+ // U L G E
+ static_assert(FCmpInst::FCMP_FALSE == 0, ""); // 0 0 0 0
+ static_assert(FCmpInst::FCMP_OEQ == 1, ""); // 0 0 0 1
+ static_assert(FCmpInst::FCMP_OGT == 2, ""); // 0 0 1 0
+ static_assert(FCmpInst::FCMP_OGE == 3, ""); // 0 0 1 1
+ static_assert(FCmpInst::FCMP_OLT == 4, ""); // 0 1 0 0
+ static_assert(FCmpInst::FCMP_OLE == 5, ""); // 0 1 0 1
+ static_assert(FCmpInst::FCMP_ONE == 6, ""); // 0 1 1 0
+ static_assert(FCmpInst::FCMP_ORD == 7, ""); // 0 1 1 1
+ static_assert(FCmpInst::FCMP_UNO == 8, ""); // 1 0 0 0
+ static_assert(FCmpInst::FCMP_UEQ == 9, ""); // 1 0 0 1
+ static_assert(FCmpInst::FCMP_UGT == 10, ""); // 1 0 1 0
+ static_assert(FCmpInst::FCMP_UGE == 11, ""); // 1 0 1 1
+ static_assert(FCmpInst::FCMP_ULT == 12, ""); // 1 1 0 0
+ static_assert(FCmpInst::FCMP_ULE == 13, ""); // 1 1 0 1
+ static_assert(FCmpInst::FCMP_UNE == 14, ""); // 1 1 1 0
+ static_assert(FCmpInst::FCMP_TRUE == 15, ""); // 1 1 1 1
+ return CC;
+}
+
+/// This is the complement of getICmpCode, which turns an opcode and two
+/// operands into either a constant true or false, or a brand new ICmp
+/// instruction. The sign is passed in to determine which kind of predicate to
+/// use in the new icmp instruction.
+static Value *getNewICmpValue(unsigned Code, bool Sign, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate NewPred;
+ if (Constant *TorF = getPredForICmpCode(Code, Sign, LHS->getType(), NewPred))
+ return TorF;
+ return Builder.CreateICmp(NewPred, LHS, RHS);
+}
+
+/// This is the complement of getFCmpCode, which turns an opcode and two
+/// operands into either a FCmp instruction, or a true/false constant.
+static Value *getFCmpValue(unsigned Code, Value *LHS, Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ const auto Pred = static_cast<FCmpInst::Predicate>(Code);
+ assert(FCmpInst::FCMP_FALSE <= Pred && Pred <= FCmpInst::FCMP_TRUE &&
+ "Unexpected FCmp predicate!");
+ if (Pred == FCmpInst::FCMP_FALSE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
+ if (Pred == FCmpInst::FCMP_TRUE)
+ return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 1);
+ return Builder.CreateFCmp(Pred, LHS, RHS);
+}
+
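
Illustration (not part of the change above): the four-bit U/L/G/E encoding turns predicate algebra into bit operations, e.g. ORing two codes yields the code for "either predicate holds". A standalone check against the values in the static_asserts above:

#include <cassert>

int main() {
  // Same numbering as the static_asserts in getFCmpCode: U L G E bits.
  enum { OEQ = 1, OGT = 2, OLT = 4, OLE = 5, ONE = 6, ORD = 7,
         UNO = 8, UEQ = 9, TRUE_ = 15 };
  assert((OLT | OGT) == ONE);   // "less or greater" is ordered not-equal
  assert((OLT | OEQ) == OLE);   // "less or equal"
  assert((OEQ | UNO) == UEQ);   // adding the unordered bit
  assert((ORD | UNO) == TRUE_); // ordered or unordered is always true
  return 0;
}
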
+/// Transform BITWISE_OP(BSWAP(A),BSWAP(B)) or
+/// BITWISE_OP(BSWAP(A), Constant) to BSWAP(BITWISE_OP(A, B))
+/// \param I Binary operator to transform.
+/// \return Pointer to node that must replace the original binary operator, or
+/// null pointer if no transformation was made.
+static Value *SimplifyBSwap(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bswap simplifying");
+
+ Value *OldLHS = I.getOperand(0);
+ Value *OldRHS = I.getOperand(1);
+
+ Value *NewLHS;
+ if (!match(OldLHS, m_BSwap(m_Value(NewLHS))))
+ return nullptr;
+
+ Value *NewRHS;
+ const APInt *C;
+
+ if (match(OldRHS, m_BSwap(m_Value(NewRHS)))) {
+ // OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
+ if (!OldLHS->hasOneUse() && !OldRHS->hasOneUse())
+ return nullptr;
+ // NewRHS initialized by the matcher.
+ } else if (match(OldRHS, m_APInt(C))) {
+ // OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
+ if (!OldLHS->hasOneUse())
+ return nullptr;
+ NewRHS = ConstantInt::get(I.getType(), C->byteSwap());
+ } else
+ return nullptr;
+
+ Value *BinOp = Builder.CreateBinOp(I.getOpcode(), NewLHS, NewRHS);
+ Function *F = Intrinsic::getDeclaration(I.getModule(), Intrinsic::bswap,
+ I.getType());
+ return Builder.CreateCall(F, BinOp);
+}
+
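
Illustration (not part of the change above): the rewrite is sound because a byte swap is a pure byte permutation, so it distributes over bytewise bitwise ops. A standalone check with a hand-rolled 32-bit swap (the bswap32 helper name is made up here):

#include <cassert>
#include <cstdint>

static std::uint32_t bswap32(std::uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) |
         ((V << 8) & 0x00FF0000u) | (V << 24);
}

int main() {
  std::uint32_t X = 0x12345678u, Y = 0xA0B0C0D0u, C = 0x000000FFu;
  assert((bswap32(X) & bswap32(Y)) == bswap32(X & Y)); // OP(BSWAP, BSWAP)
  assert((bswap32(X) | bswap32(Y)) == bswap32(X | Y));
  assert((bswap32(X) ^ bswap32(Y)) == bswap32(X ^ Y));
  assert((bswap32(X) & C) == bswap32(X & bswap32(C))); // constant operand case
  return 0;
}
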
+/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise
+/// (V < Lo || V >= Hi). This method expects that Lo < Hi. IsSigned indicates
+/// whether to treat V, Lo, and Hi as signed or not.
Value *InstCombinerImpl::insertRangeTest(Value *V, const APInt &Lo,
const APInt &Hi, bool isSigned,
bool Inside) {
- assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) &&
- "Lo is not < Hi in range emission code!");
-
- Type *Ty = V->getType();
-
- // V >= Min && V < Hi --> V < Hi
- // V < Min || V >= Hi --> V >= Hi
- ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
- if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
- Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
- return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
- }
-
- // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
- // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
- Value *VMinusLo =
- Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
- Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
- return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
-}
-
-/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
-/// that can be simplified.
-/// One of A and B is considered the mask. The other is the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum contains
-/// only "Mask", then both A and B can be considered masks. If A is the mask,
-/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
-/// If both A and C are constants, this proof is also easy.
-/// For the following explanations, we assume that A is the mask.
-///
-/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
-/// bits of A are set in B.
-/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
-///
-/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
-/// bits of A are cleared in B.
-/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
-///
-/// "Mixed" declares that (A & B) == C and C might or might not contain any
-/// number of one bits and zero bits.
-/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
-///
-/// "Not" means that in above descriptions "==" should be replaced by "!=".
-/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
-///
-/// If the mask A contains a single bit, then the following is equivalent:
-/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
-/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
-enum MaskedICmpType {
- AMask_AllOnes = 1,
- AMask_NotAllOnes = 2,
- BMask_AllOnes = 4,
- BMask_NotAllOnes = 8,
- Mask_AllZeros = 16,
- Mask_NotAllZeros = 32,
- AMask_Mixed = 64,
- AMask_NotMixed = 128,
- BMask_Mixed = 256,
- BMask_NotMixed = 512
-};
-
-/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
-/// satisfies.
-static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
- ICmpInst::Predicate Pred) {
- ConstantInt *ACst = dyn_cast<ConstantInt>(A);
- ConstantInt *BCst = dyn_cast<ConstantInt>(B);
- ConstantInt *CCst = dyn_cast<ConstantInt>(C);
- bool IsEq = (Pred == ICmpInst::ICMP_EQ);
- bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
- bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
- unsigned MaskVal = 0;
- if (CCst && CCst->isZero()) {
- // if C is zero, then both A and B qualify as mask
- MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
- : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
- if (IsAPow2)
- MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
- : (AMask_AllOnes | AMask_Mixed));
- if (IsBPow2)
- MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
- : (BMask_AllOnes | BMask_Mixed));
- return MaskVal;
- }
-
- if (A == C) {
- MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
- : (AMask_NotAllOnes | AMask_NotMixed));
- if (IsAPow2)
- MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
- : (Mask_AllZeros | AMask_Mixed));
- } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
- MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
- }
-
- if (B == C) {
- MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
- : (BMask_NotAllOnes | BMask_NotMixed));
- if (IsBPow2)
- MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
- : (Mask_AllZeros | BMask_Mixed));
- } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
- MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
- }
-
- return MaskVal;
-}
-
-/// Convert an analysis of a masked ICmp into its equivalent if all boolean
-/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
-/// is adjacent to the corresponding normal flag (recording ==), this just
-/// involves swapping those bits over.
-static unsigned conjugateICmpMask(unsigned Mask) {
- unsigned NewMask;
- NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
- AMask_Mixed | BMask_Mixed))
- << 1;
-
- NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
- AMask_NotMixed | BMask_NotMixed))
- >> 1;
-
- return NewMask;
-}
-
-// Adapts the external decomposeBitTestICmp for local use.
-static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred,
- Value *&X, Value *&Y, Value *&Z) {
- APInt Mask;
- if (!llvm::decomposeBitTestICmp(LHS, RHS, Pred, X, Mask))
- return false;
-
- Y = ConstantInt::get(X->getType(), Mask);
- Z = ConstantInt::get(X->getType(), 0);
- return true;
-}
-
-/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
-/// Return the pattern classes (from MaskedICmpType) for the left hand side and
-/// the right hand side as a pair.
-/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
-/// and PredR are their predicates, respectively.
-static
-Optional<std::pair<unsigned, unsigned>>
-getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
- Value *&D, Value *&E, ICmpInst *LHS,
- ICmpInst *RHS,
- ICmpInst::Predicate &PredL,
- ICmpInst::Predicate &PredR) {
- // vectors are not (yet?) supported. Don't support pointers either.
- if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
- !RHS->getOperand(0)->getType()->isIntegerTy())
- return None;
-
- // Here comes the tricky part:
- // LHS might be of the form L11 & L12 == X, X == L21 & L22,
- // and L11 & L12 == L21 & L22. The same goes for RHS.
- // Now we must find those components L** and R**, that are equal, so
- // that we can extract the parameters A, B, C, D, and E for the canonical
-  // form above.
- Value *L1 = LHS->getOperand(0);
- Value *L2 = LHS->getOperand(1);
- Value *L11, *L12, *L21, *L22;
- // Check whether the icmp can be decomposed into a bit test.
- if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) {
- L21 = L22 = L1 = nullptr;
- } else {
- // Look for ANDs in the LHS icmp.
- if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
- // Any icmp can be viewed as being trivially masked; if it allows us to
- // remove one, it's worth it.
- L11 = L1;
- L12 = Constant::getAllOnesValue(L1->getType());
- }
-
- if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
- L21 = L2;
- L22 = Constant::getAllOnesValue(L2->getType());
- }
- }
-
-  // Bail if LHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(PredL))
- return None;
-
- Value *R1 = RHS->getOperand(0);
- Value *R2 = RHS->getOperand(1);
- Value *R11, *R12;
- bool Ok = false;
- if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) {
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- } else {
- return None;
- }
- E = R2;
- R1 = nullptr;
- Ok = true;
- } else {
- if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
- // As before, model no mask as a trivial mask if it'll let us do an
- // optimization.
- R11 = R1;
- R12 = Constant::getAllOnesValue(R1->getType());
- }
-
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- E = R2;
- Ok = true;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- E = R2;
- Ok = true;
- }
- }
-
-  // Bail if RHS was an icmp that can't be decomposed into an equality.
- if (!ICmpInst::isEquality(PredR))
- return None;
-
- // Look for ANDs on the right side of the RHS icmp.
- if (!Ok) {
- if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
- R11 = R2;
- R12 = Constant::getAllOnesValue(R2->getType());
- }
-
- if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
- A = R11;
- D = R12;
- E = R1;
- Ok = true;
- } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
- A = R12;
- D = R11;
- E = R1;
- Ok = true;
- } else {
- return None;
- }
- }
- if (!Ok)
- return None;
-
- if (L11 == A) {
- B = L12;
- C = L2;
- } else if (L12 == A) {
- B = L11;
- C = L2;
- } else if (L21 == A) {
- B = L22;
- C = L1;
- } else if (L22 == A) {
- B = L21;
- C = L1;
- }
-
- unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
- unsigned RightType = getMaskedICmpType(A, D, E, PredR);
- return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType));
-}
-
-/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
-/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
-/// and the right hand side is of type BMask_Mixed. For example,
-/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
+ assert((isSigned ? Lo.slt(Hi) : Lo.ult(Hi)) &&
+ "Lo is not < Hi in range emission code!");
+
+ Type *Ty = V->getType();
+
+ // V >= Min && V < Hi --> V < Hi
+ // V < Min || V >= Hi --> V >= Hi
+ ICmpInst::Predicate Pred = Inside ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+ if (isSigned ? Lo.isMinSignedValue() : Lo.isMinValue()) {
+ Pred = isSigned ? ICmpInst::getSignedPredicate(Pred) : Pred;
+ return Builder.CreateICmp(Pred, V, ConstantInt::get(Ty, Hi));
+ }
+
+ // V >= Lo && V < Hi --> V - Lo u< Hi - Lo
+ // V < Lo || V >= Hi --> V - Lo u>= Hi - Lo
+ Value *VMinusLo =
+ Builder.CreateSub(V, ConstantInt::get(Ty, Lo), V->getName() + ".off");
+ Constant *HiMinusLo = ConstantInt::get(Ty, Hi - Lo);
+ return Builder.CreateICmp(Pred, VMinusLo, HiMinusLo);
+}
+
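
Illustration (not part of the change above): the "V - Lo u< Hi - Lo" trick can be verified exhaustively at 8 bits, since unsigned wrap-around maps [Lo, Hi) onto [0, Hi - Lo) and everything else beyond it:

#include <cassert>
#include <cstdint>

int main() {
  const std::uint8_t Lo = 37, Hi = 201;   // arbitrary Lo < Hi
  for (unsigned V = 0; V <= 255; ++V) {
    bool Inside = V >= Lo && V < Hi;
    std::uint8_t Off = static_cast<std::uint8_t>(V - Lo); // wraps like 'sub'
    assert(Inside == (Off < static_cast<std::uint8_t>(Hi - Lo)));
  }
  return 0;
}
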
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
+///
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+/// Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
+///
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+/// Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+/// Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+/// Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
+///
+/// If the mask A contains a single bit, then the following is equivalent:
+/// (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
+/// (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
+enum MaskedICmpType {
+ AMask_AllOnes = 1,
+ AMask_NotAllOnes = 2,
+ BMask_AllOnes = 4,
+ BMask_NotAllOnes = 8,
+ Mask_AllZeros = 16,
+ Mask_NotAllZeros = 32,
+ AMask_Mixed = 64,
+ AMask_NotMixed = 128,
+ BMask_Mixed = 256,
+ BMask_NotMixed = 512
+};
+
+/// Return the set of patterns (from MaskedICmpType) that (icmp SCC (A & B), C)
+/// satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+ ICmpInst::Predicate Pred) {
+ ConstantInt *ACst = dyn_cast<ConstantInt>(A);
+ ConstantInt *BCst = dyn_cast<ConstantInt>(B);
+ ConstantInt *CCst = dyn_cast<ConstantInt>(C);
+ bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+ bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+ bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+ unsigned MaskVal = 0;
+ if (CCst && CCst->isZero()) {
+ // if C is zero, then both A and B qualify as mask
+ MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+ : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+ : (AMask_AllOnes | AMask_Mixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+ : (BMask_AllOnes | BMask_Mixed));
+ return MaskVal;
+ }
+
+ if (A == C) {
+ MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+ : (AMask_NotAllOnes | AMask_NotMixed));
+ if (IsAPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+ : (Mask_AllZeros | AMask_Mixed));
+ } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+ MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
+ }
+
+ if (B == C) {
+ MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+ : (BMask_NotAllOnes | BMask_NotMixed));
+ if (IsBPow2)
+ MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+ : (Mask_AllZeros | BMask_Mixed));
+ } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+ MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+ }
+
+ return MaskVal;
+}
+
+/// Convert an analysis of a masked ICmp into its equivalent if all boolean
+/// operations had the opposite sense. Since each "NotXXX" flag (recording !=)
+/// is adjacent to the corresponding normal flag (recording ==), this just
+/// involves swapping those bits over.
+static unsigned conjugateICmpMask(unsigned Mask) {
+ unsigned NewMask;
+ NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+ AMask_Mixed | BMask_Mixed))
+ << 1;
+
+ NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+ AMask_NotMixed | BMask_NotMixed))
+ >> 1;
+
+ return NewMask;
+}
+
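
Illustration (not part of the change above): because every NotXXX flag sits one bit above its positive twin, conjugation is a paired bit swap and therefore an involution. A standalone restatement of the same computation:

#include <cassert>

int main() {
  enum {
    AMask_AllOnes = 1,  AMask_NotAllOnes = 2,
    BMask_AllOnes = 4,  BMask_NotAllOnes = 8,
    Mask_AllZeros = 16, Mask_NotAllZeros = 32,
    AMask_Mixed = 64,   AMask_NotMixed = 128,
    BMask_Mixed = 256,  BMask_NotMixed = 512
  };
  auto Conjugate = [](unsigned Mask) {
    unsigned New = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
                            AMask_Mixed | BMask_Mixed)) << 1;
    New |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
                    AMask_NotMixed | BMask_NotMixed)) >> 1;
    return New;
  };
  assert(Conjugate(AMask_AllOnes) == AMask_NotAllOnes);
  assert(Conjugate(Mask_NotAllZeros) == Mask_AllZeros);
  assert(Conjugate(BMask_Mixed | AMask_NotMixed) ==
         (BMask_NotMixed | AMask_Mixed));
  for (unsigned M = 0; M < 1024; ++M)     // conjugation is an involution
    assert(Conjugate(Conjugate(M)) == M);
  return 0;
}
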
+// Adapts the external decomposeBitTestICmp for local use.
+static bool decomposeBitTestICmp(Value *LHS, Value *RHS, CmpInst::Predicate &Pred,
+ Value *&X, Value *&Y, Value *&Z) {
+ APInt Mask;
+ if (!llvm::decomposeBitTestICmp(LHS, RHS, Pred, X, Mask))
+ return false;
+
+ Y = ConstantInt::get(X->getType(), Mask);
+ Z = ConstantInt::get(X->getType(), 0);
+ return true;
+}
+
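
Illustration (not part of the change above): the point of decomposing an icmp into a bit test is that sign-style comparisons are single-bit mask checks, which then feed the mask reasoning below. A standalone check of the underlying equivalences (whether the helper recognizes exactly these forms is an implementation detail of llvm::decomposeBitTestICmp):

#include <cassert>
#include <cstdint>

int main() {
  for (int V = -128; V <= 127; ++V) {
    std::int8_t X = static_cast<std::int8_t>(V);
    std::uint8_t U = static_cast<std::uint8_t>(X);
    assert((X < 0)  == ((U & 0x80u) != 0)); // x s< 0   <=>  (x & signbit) != 0
    assert((X > -1) == ((U & 0x80u) == 0)); // x s> -1  <=>  (x & signbit) == 0
  }
  return 0;
}
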
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the pattern classes (from MaskedICmpType) for the left hand side and
+/// the right hand side as a pair.
+/// LHS and RHS are the left hand side and the right hand side ICmps and PredL
+/// and PredR are their predicates, respectively.
+static
+Optional<std::pair<unsigned, unsigned>>
+getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+ Value *&D, Value *&E, ICmpInst *LHS,
+ ICmpInst *RHS,
+ ICmpInst::Predicate &PredL,
+ ICmpInst::Predicate &PredR) {
+ // vectors are not (yet?) supported. Don't support pointers either.
+ if (!LHS->getOperand(0)->getType()->isIntegerTy() ||
+ !RHS->getOperand(0)->getType()->isIntegerTy())
+ return None;
+
+ // Here comes the tricky part:
+ // LHS might be of the form L11 & L12 == X, X == L21 & L22,
+ // and L11 & L12 == L21 & L22. The same goes for RHS.
+ // Now we must find those components L** and R**, that are equal, so
+ // that we can extract the parameters A, B, C, D, and E for the canonical
+  // form above.
+ Value *L1 = LHS->getOperand(0);
+ Value *L2 = LHS->getOperand(1);
+ Value *L11, *L12, *L21, *L22;
+ // Check whether the icmp can be decomposed into a bit test.
+ if (decomposeBitTestICmp(L1, L2, PredL, L11, L12, L2)) {
+ L21 = L22 = L1 = nullptr;
+ } else {
+ // Look for ANDs in the LHS icmp.
+ if (!match(L1, m_And(m_Value(L11), m_Value(L12)))) {
+ // Any icmp can be viewed as being trivially masked; if it allows us to
+ // remove one, it's worth it.
+ L11 = L1;
+ L12 = Constant::getAllOnesValue(L1->getType());
+ }
+
+ if (!match(L2, m_And(m_Value(L21), m_Value(L22)))) {
+ L21 = L2;
+ L22 = Constant::getAllOnesValue(L2->getType());
+ }
+ }
+
+  // Bail if LHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredL))
+ return None;
+
+ Value *R1 = RHS->getOperand(0);
+ Value *R2 = RHS->getOperand(1);
+ Value *R11, *R12;
+ bool Ok = false;
+ if (decomposeBitTestICmp(R1, R2, PredR, R11, R12, R2)) {
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ } else {
+ return None;
+ }
+ E = R2;
+ R1 = nullptr;
+ Ok = true;
+ } else {
+ if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
+ // As before, model no mask as a trivial mask if it'll let us do an
+ // optimization.
+ R11 = R1;
+ R12 = Constant::getAllOnesValue(R1->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R2;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R2;
+ Ok = true;
+ }
+ }
+
+  // Bail if RHS was an icmp that can't be decomposed into an equality.
+ if (!ICmpInst::isEquality(PredR))
+ return None;
+
+ // Look for ANDs on the right side of the RHS icmp.
+ if (!Ok) {
+ if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
+ R11 = R2;
+ R12 = Constant::getAllOnesValue(R2->getType());
+ }
+
+ if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
+ A = R11;
+ D = R12;
+ E = R1;
+ Ok = true;
+ } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
+ A = R12;
+ D = R11;
+ E = R1;
+ Ok = true;
+ } else {
+ return None;
+ }
+ }
+ if (!Ok)
+ return None;
+
+ if (L11 == A) {
+ B = L12;
+ C = L2;
+ } else if (L12 == A) {
+ B = L11;
+ C = L2;
+ } else if (L21 == A) {
+ B = L22;
+ C = L1;
+ } else if (L22 == A) {
+ B = L21;
+ C = L1;
+ }
+
+ unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+ unsigned RightType = getMaskedICmpType(A, D, E, PredR);
+ return Optional<std::pair<unsigned, unsigned>>(std::make_pair(LeftType, RightType));
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side is of type Mask_NotAllZeros
+/// and the right hand side is of type BMask_Mixed. For example,
+/// (icmp (A & 12) != 0) & (icmp (A & 15) == 8) -> (icmp (A & 15) == 8).
static Value *foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
InstCombiner::BuilderTy &Builder) {
- // We are given the canonical form:
- // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
- // where D & E == E.
- //
- // If IsAnd is false, we get it in negated form:
- // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
- // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
- //
-  // We currently handle the case where B, C, D, and E are constant.
- //
+ // We are given the canonical form:
+ // (icmp ne (A & B), 0) & (icmp eq (A & D), E).
+ // where D & E == E.
+ //
+ // If IsAnd is false, we get it in negated form:
+ // (icmp eq (A & B), 0) | (icmp ne (A & D), E) ->
+ // !((icmp ne (A & B), 0) & (icmp eq (A & D), E)).
+ //
+  // We currently handle the case where B, C, D, and E are constant.
+ //
ConstantInt *BCst, *CCst, *DCst, *ECst;
if (!match(B, m_ConstantInt(BCst)) || !match(C, m_ConstantInt(CCst)) ||
!match(D, m_ConstantInt(DCst)) || !match(E, m_ConstantInt(ECst)))
- return nullptr;
-
- ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
-
- // Update E to the canonical form when D is a power of two and RHS is
- // canonicalized as,
- // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
- // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
- if (PredR != NewCC)
- ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
-
-  // If B or D is zero, skip: LHS or RHS can then be trivially folded by
-  // other folding rules, and this pattern won't apply any more.
- if (BCst->getValue() == 0 || DCst->getValue() == 0)
- return nullptr;
-
- // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
- // deduce anything from it.
- // For example,
- // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
- if ((BCst->getValue() & DCst->getValue()) == 0)
- return nullptr;
-
- // If the following two conditions are met:
- //
- // 1. mask B covers only a single bit that's not covered by mask D, that is,
- // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
- // B and D has only one bit set) and,
- //
- // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
- // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
- //
- // then that single bit in B must be one and thus the whole expression can be
- // folded to
- // (A & (B | D)) == (B & (B ^ D)) | E.
- //
- // For example,
- // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
- // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
- if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
- (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
- APInt BorD = BCst->getValue() | DCst->getValue();
- APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
- ECst->getValue();
- Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
- Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
- Value *NewAnd = Builder.CreateAnd(A, NewMask);
- return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
- }
-
- auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
- return (C1->getValue() & C2->getValue()) == C1->getValue();
- };
- auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
- return (C1->getValue() & C2->getValue()) == C2->getValue();
- };
-
- // In the following, we consider only the cases where B is a superset of D, B
- // is a subset of D, or B == D because otherwise there's at least one bit
- // covered by B but not D, in which case we can't deduce much from it, so
- // no folding (aside from the single must-be-one bit case right above.)
- // For example,
- // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
- if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
- return nullptr;
-
- // At this point, either B is a superset of D, B is a subset of D or B == D.
-
- // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
- // and the whole expression becomes false (or true if negated), otherwise, no
- // folding.
- // For example,
- // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
- // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
- if (ECst->isZero()) {
- if (IsSubSetOrEqual(BCst, DCst))
- return ConstantInt::get(LHS->getType(), !IsAnd);
- return nullptr;
- }
-
- // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
- // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
- // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
- // RHS. For example,
- // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- if (IsSuperSetOrEqual(BCst, DCst))
- return RHS;
- // Otherwise, B is a subset of D. If B and E have a common bit set,
- // ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
- // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
- assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
- if ((BCst->getValue() & ECst->getValue()) != 0)
- return RHS;
- // Otherwise, LHS and RHS contradict and the whole expression becomes false
- // (or true if negated.) For example,
- // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
- // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
- return ConstantInt::get(LHS->getType(), !IsAnd);
-}
-
-/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
-/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
-/// aren't of the common mask pattern type.
-static Value *foldLogOpOfMaskedICmpsAsymmetric(
+ return nullptr;
+
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+
+ // Update E to the canonical form when D is a power of two and RHS is
+ // canonicalized as,
+ // (icmp ne (A & D), 0) -> (icmp eq (A & D), D) or
+ // (icmp ne (A & D), D) -> (icmp eq (A & D), 0).
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+  // If B or D is zero, skip: LHS or RHS can then be trivially folded by
+  // other folding rules, and this pattern won't apply any more.
+ if (BCst->getValue() == 0 || DCst->getValue() == 0)
+ return nullptr;
+
+ // If B and D don't intersect, ie. (B & D) == 0, no folding because we can't
+ // deduce anything from it.
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if ((BCst->getValue() & DCst->getValue()) == 0)
+ return nullptr;
+
+ // If the following two conditions are met:
+ //
+ // 1. mask B covers only a single bit that's not covered by mask D, that is,
+ // (B & (B ^ D)) is a power of 2 (in other words, B minus the intersection of
+ // B and D has only one bit set) and,
+ //
+ // 2. RHS (and E) indicates that the rest of B's bits are zero (in other
+ // words, the intersection of B and D is zero), that is, ((B & D) & E) == 0
+ //
+ // then that single bit in B must be one and thus the whole expression can be
+ // folded to
+ // (A & (B | D)) == (B & (B ^ D)) | E.
+ //
+ // For example,
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
+ if ((((BCst->getValue() & DCst->getValue()) & ECst->getValue()) == 0) &&
+ (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())).isPowerOf2()) {
+ APInt BorD = BCst->getValue() | DCst->getValue();
+ APInt BandBxorDorE = (BCst->getValue() & (BCst->getValue() ^ DCst->getValue())) |
+ ECst->getValue();
+ Value *NewMask = ConstantInt::get(BCst->getType(), BorD);
+ Value *NewMaskedValue = ConstantInt::get(BCst->getType(), BandBxorDorE);
+ Value *NewAnd = Builder.CreateAnd(A, NewMask);
+ return Builder.CreateICmp(NewCC, NewAnd, NewMaskedValue);
+ }
+
+ auto IsSubSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C1->getValue();
+ };
+ auto IsSuperSetOrEqual = [](ConstantInt *C1, ConstantInt *C2) {
+ return (C1->getValue() & C2->getValue()) == C2->getValue();
+ };
+
+ // In the following, we consider only the cases where B is a superset of D, B
+ // is a subset of D, or B == D because otherwise there's at least one bit
+ // covered by B but not D, in which case we can't deduce much from it, so
+ // no folding (aside from the single must-be-one bit case right above.)
+ // For example,
+ // (icmp ne (A & 14), 0) & (icmp eq (A & 3), 1) -> no folding.
+ if (!IsSubSetOrEqual(BCst, DCst) && !IsSuperSetOrEqual(BCst, DCst))
+ return nullptr;
+
+ // At this point, either B is a superset of D, B is a subset of D or B == D.
+
+ // If E is zero, if B is a subset of (or equal to) D, LHS and RHS contradict
+ // and the whole expression becomes false (or true if negated), otherwise, no
+ // folding.
+ // For example,
+ // (icmp ne (A & 3), 0) & (icmp eq (A & 7), 0) -> false.
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 3), 0) -> no folding.
+ if (ECst->isZero()) {
+ if (IsSubSetOrEqual(BCst, DCst))
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+ return nullptr;
+ }
+
+ // At this point, B, D, E aren't zero and (B & D) == B, (B & D) == D or B ==
+ // D. If B is a superset of (or equal to) D, since E is not zero, LHS is
+ // subsumed by RHS (RHS implies LHS.) So the whole expression becomes
+ // RHS. For example,
+ // (icmp ne (A & 255), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ // (icmp ne (A & 15), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ if (IsSuperSetOrEqual(BCst, DCst))
+ return RHS;
+ // Otherwise, B is a subset of D. If B and E have a common bit set,
+ // ie. (B & E) != 0, then LHS is subsumed by RHS. For example.
+ // (icmp ne (A & 12), 0) & (icmp eq (A & 15), 8) -> (icmp eq (A & 15), 8).
+ assert(IsSubSetOrEqual(BCst, DCst) && "Precondition due to above code");
+ if ((BCst->getValue() & ECst->getValue()) != 0)
+ return RHS;
+ // Otherwise, LHS and RHS contradict and the whole expression becomes false
+ // (or true if negated.) For example,
+ // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false.
+ // (icmp ne (A & 6), 0) & (icmp eq (A & 15), 8) -> false.
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+}
+
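
Illustration (not part of the change above): the concrete folds quoted in the comments can be verified exhaustively over 8-bit values of A:

#include <cassert>

int main() {
  for (unsigned A = 0; A < 256; ++A) {
    // (icmp ne (A & 12), 0) & (icmp eq (A & 7), 1) -> (icmp eq (A & 15), 9)
    assert((((A & 12) != 0) && ((A & 7) == 1)) == ((A & 15) == 9));
    // (icmp ne (A & 15), 0) & (icmp eq (A & 7), 0) -> (icmp eq (A & 15), 8)
    assert((((A & 15) != 0) && ((A & 7) == 0)) == ((A & 15) == 8));
    // (icmp ne (A & 7), 0) & (icmp eq (A & 15), 8) -> false
    assert(!(((A & 7) != 0) && ((A & 15) == 8)));
  }
  return 0;
}
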
+/// Try to fold (icmp(A & B) ==/!= 0) &/| (icmp(A & D) ==/!= E) into a single
+/// (icmp(A & X) ==/!= Y), where the left-hand side and the right hand side
+/// aren't of the common mask pattern type.
+static Value *foldLogOpOfMaskedICmpsAsymmetric(
ICmpInst *LHS, ICmpInst *RHS, bool IsAnd, Value *A, Value *B, Value *C,
Value *D, Value *E, ICmpInst::Predicate PredL, ICmpInst::Predicate PredR,
unsigned LHSMask, unsigned RHSMask, InstCombiner::BuilderTy &Builder) {
- assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
- "Expected equality predicates for masked type of icmps.");
- // Handle Mask_NotAllZeros-BMask_Mixed cases.
- // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
- // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
- // which gets swapped to
- // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
- if (!IsAnd) {
- LHSMask = conjugateICmpMask(LHSMask);
- RHSMask = conjugateICmpMask(RHSMask);
- }
- if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
- if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
- LHS, RHS, IsAnd, A, B, C, D, E,
- PredL, PredR, Builder)) {
- return V;
- }
- } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
- if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
- RHS, LHS, IsAnd, A, D, E, B, C,
- PredR, PredL, Builder)) {
- return V;
- }
- }
- return nullptr;
-}
-
-/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
-/// into a single (icmp(A & X) ==/!= Y).
-static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ // Handle Mask_NotAllZeros-BMask_Mixed cases.
+ // (icmp ne/eq (A & B), C) &/| (icmp eq/ne (A & D), E), or
+ // (icmp eq/ne (A & B), C) &/| (icmp ne/eq (A & D), E)
+ // which gets swapped to
+ // (icmp ne/eq (A & D), E) &/| (icmp eq/ne (A & B), C).
+ if (!IsAnd) {
+ LHSMask = conjugateICmpMask(LHSMask);
+ RHSMask = conjugateICmpMask(RHSMask);
+ }
+ if ((LHSMask & Mask_NotAllZeros) && (RHSMask & BMask_Mixed)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ LHS, RHS, IsAnd, A, B, C, D, E,
+ PredL, PredR, Builder)) {
+ return V;
+ }
+ } else if ((LHSMask & BMask_Mixed) && (RHSMask & Mask_NotAllZeros)) {
+ if (Value *V = foldLogOpOfMaskedICmps_NotAllZeros_BMask_Mixed(
+ RHS, LHS, IsAnd, A, D, E, B, C,
+ PredR, PredL, Builder)) {
+ return V;
+ }
+ }
+ return nullptr;
+}
+
+/// Try to fold (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
+/// into a single (icmp(A & X) ==/!= Y).
+static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
InstCombiner::BuilderTy &Builder) {
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- Optional<std::pair<unsigned, unsigned>> MaskPair =
- getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
- if (!MaskPair)
- return nullptr;
- assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
- "Expected equality predicates for masked type of icmps.");
- unsigned LHSMask = MaskPair->first;
- unsigned RHSMask = MaskPair->second;
- unsigned Mask = LHSMask & RHSMask;
- if (Mask == 0) {
- // Even if the two sides don't share a common pattern, check if folding can
- // still happen.
- if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
- LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
- Builder))
- return V;
- return nullptr;
- }
-
- // In full generality:
- // (icmp (A & B) Op C) | (icmp (A & D) Op E)
- // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
- //
- // If the latter can be converted into (icmp (A & X) Op Y) then the former is
- // equivalent to (icmp (A & X) !Op Y).
- //
- // Therefore, we can pretend for the rest of this function that we're dealing
- // with the conjunction, provided we flip the sense of any comparisons (both
- // input and output).
-
- // In most cases we're going to produce an EQ for the "&&" case.
- ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
- if (!IsAnd) {
- // Convert the masking analysis into its equivalent with negated
- // comparisons.
- Mask = conjugateICmpMask(Mask);
- }
-
- if (Mask & Mask_AllZeros) {
- // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
- // -> (icmp eq (A & (B|D)), 0)
- Value *NewOr = Builder.CreateOr(B, D);
- Value *NewAnd = Builder.CreateAnd(A, NewOr);
- // We can't use C as zero because we might actually handle
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // with B and D, having a single bit set.
- Value *Zero = Constant::getNullValue(A->getType());
- return Builder.CreateICmp(NewCC, NewAnd, Zero);
- }
- if (Mask & BMask_AllOnes) {
- // (icmp eq (A & B), B) & (icmp eq (A & D), D)
- // -> (icmp eq (A & (B|D)), (B|D))
- Value *NewOr = Builder.CreateOr(B, D);
- Value *NewAnd = Builder.CreateAnd(A, NewOr);
- return Builder.CreateICmp(NewCC, NewAnd, NewOr);
- }
- if (Mask & AMask_AllOnes) {
- // (icmp eq (A & B), A) & (icmp eq (A & D), A)
- // -> (icmp eq (A & (B&D)), A)
- Value *NewAnd1 = Builder.CreateAnd(B, D);
- Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
- return Builder.CreateICmp(NewCC, NewAnd2, A);
- }
-
- // Remaining cases assume at least that B and D are constant, and depend on
- // their actual values. This isn't strictly necessary, just a "handle the
- // easy cases for now" decision.
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Optional<std::pair<unsigned, unsigned>> MaskPair =
+ getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+ if (!MaskPair)
+ return nullptr;
+ assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+ "Expected equality predicates for masked type of icmps.");
+ unsigned LHSMask = MaskPair->first;
+ unsigned RHSMask = MaskPair->second;
+ unsigned Mask = LHSMask & RHSMask;
+ if (Mask == 0) {
+ // Even if the two sides don't share a common pattern, check if folding can
+ // still happen.
+ if (Value *V = foldLogOpOfMaskedICmpsAsymmetric(
+ LHS, RHS, IsAnd, A, B, C, D, E, PredL, PredR, LHSMask, RHSMask,
+ Builder))
+ return V;
+ return nullptr;
+ }
+
+ // In full generality:
+ // (icmp (A & B) Op C) | (icmp (A & D) Op E)
+ // == ![ (icmp (A & B) !Op C) & (icmp (A & D) !Op E) ]
+ //
+ // If the latter can be converted into (icmp (A & X) Op Y) then the former is
+ // equivalent to (icmp (A & X) !Op Y).
+ //
+ // Therefore, we can pretend for the rest of this function that we're dealing
+ // with the conjunction, provided we flip the sense of any comparisons (both
+ // input and output).
+
+ // In most cases we're going to produce an EQ for the "&&" case.
+ ICmpInst::Predicate NewCC = IsAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ if (!IsAnd) {
+ // Convert the masking analysis into its equivalent with negated
+ // comparisons.
+ Mask = conjugateICmpMask(Mask);
+ }
+
+ if (Mask & Mask_AllZeros) {
+ // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
+ // -> (icmp eq (A & (B|D)), 0)
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ // We can't use C as zero because we might actually handle
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+    // with B and D having a single bit set.
+ Value *Zero = Constant::getNullValue(A->getType());
+ return Builder.CreateICmp(NewCC, NewAnd, Zero);
+ }
+ if (Mask & BMask_AllOnes) {
+ // (icmp eq (A & B), B) & (icmp eq (A & D), D)
+ // -> (icmp eq (A & (B|D)), (B|D))
+ Value *NewOr = Builder.CreateOr(B, D);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr);
+ }
+ if (Mask & AMask_AllOnes) {
+ // (icmp eq (A & B), A) & (icmp eq (A & D), A)
+ // -> (icmp eq (A & (B&D)), A)
+ Value *NewAnd1 = Builder.CreateAnd(B, D);
+ Value *NewAnd2 = Builder.CreateAnd(A, NewAnd1);
+ return Builder.CreateICmp(NewCC, NewAnd2, A);
+ }
+
+ // Remaining cases assume at least that B and D are constant, and depend on
+ // their actual values. This isn't strictly necessary, just a "handle the
+ // easy cases for now" decision.
ConstantInt *BCst, *DCst;
if (!match(B, m_ConstantInt(BCst)) || !match(D, m_ConstantInt(DCst)))
- return nullptr;
-
- if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
- // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
- // Only valid if one of the masks is a superset of the other (check "B&D" is
- // the same as either B or D).
- APInt NewMask = BCst->getValue() & DCst->getValue();
-
- if (NewMask == BCst->getValue())
- return LHS;
- else if (NewMask == DCst->getValue())
- return RHS;
- }
-
- if (Mask & AMask_NotAllOnes) {
- // (icmp ne (A & B), B) & (icmp ne (A & D), D)
- // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
- // Only valid if one of the masks is a superset of the other (check "B|D" is
- // the same as either B or D).
- APInt NewMask = BCst->getValue() | DCst->getValue();
-
- if (NewMask == BCst->getValue())
- return LHS;
- else if (NewMask == DCst->getValue())
- return RHS;
- }
-
- if (Mask & BMask_Mixed) {
- // (icmp eq (A & B), C) & (icmp eq (A & D), E)
- // We already know that B & C == C && D & E == E.
- // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
- // C and E, which are shared by both the mask B and the mask D, don't
- // contradict, then we can transform to
- // -> (icmp eq (A & (B|D)), (C|E))
- // Currently, we only handle the case of B, C, D, and E being constant.
- // We can't simply use C and E because we might actually handle
- // (icmp ne (A & B), B) & (icmp eq (A & D), D)
-    // with B and D having a single bit set.
+ return nullptr;
+
+ if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
+ // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
+ // Only valid if one of the masks is a superset of the other (check "B&D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() & DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & AMask_NotAllOnes) {
+ // (icmp ne (A & B), B) & (icmp ne (A & D), D)
+ // -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
+ // Only valid if one of the masks is a superset of the other (check "B|D" is
+ // the same as either B or D).
+ APInt NewMask = BCst->getValue() | DCst->getValue();
+
+ if (NewMask == BCst->getValue())
+ return LHS;
+ else if (NewMask == DCst->getValue())
+ return RHS;
+ }
+
+ if (Mask & BMask_Mixed) {
+ // (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ // We already know that B & C == C && D & E == E.
+ // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
+ // C and E, which are shared by both the mask B and the mask D, don't
+ // contradict, then we can transform to
+ // -> (icmp eq (A & (B|D)), (C|E))
+ // Currently, we only handle the case of B, C, D, and E being constant.
+ // We can't simply use C and E because we might actually handle
+ // (icmp ne (A & B), B) & (icmp eq (A & D), D)
+    // with B and D having a single bit set.
ConstantInt *CCst, *ECst;
if (!match(C, m_ConstantInt(CCst)) || !match(E, m_ConstantInt(ECst)))
- return nullptr;
- if (PredL != NewCC)
- CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
- if (PredR != NewCC)
- ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
-
- // If there is a conflict, we should actually return a false for the
- // whole construct.
- if (((BCst->getValue() & DCst->getValue()) &
- (CCst->getValue() ^ ECst->getValue())).getBoolValue())
- return ConstantInt::get(LHS->getType(), !IsAnd);
-
- Value *NewOr1 = Builder.CreateOr(B, D);
- Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
- Value *NewAnd = Builder.CreateAnd(A, NewOr1);
- return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
- }
-
- return nullptr;
-}
-
-/// Try to fold a signed range check with lower bound 0 to an unsigned icmp.
-/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
-/// If \p Inverted is true then the check is for the inverted range, e.g.
-/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+ return nullptr;
+ if (PredL != NewCC)
+ CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
+ if (PredR != NewCC)
+ ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
+ // If there is a conflict, we should actually return a false for the
+ // whole construct.
+ if (((BCst->getValue() & DCst->getValue()) &
+ (CCst->getValue() ^ ECst->getValue())).getBoolValue())
+ return ConstantInt::get(LHS->getType(), !IsAnd);
+
+ Value *NewOr1 = Builder.CreateOr(B, D);
+ Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
+ Value *NewAnd = Builder.CreateAnd(A, NewOr1);
+ return Builder.CreateICmp(NewCC, NewAnd, NewOr2);
+ }
+
+ return nullptr;
+}
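+
+// Standalone illustrative sketch (the helper name and the masks 0x30/0x0C are
+// made up; not part of the original sources): the two simplest identities
+// folded above, brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkMaskedICmpFoldExamples() {
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    // Mask_AllZeros: (A & B) == 0 && (A & D) == 0  <=>  (A & (B|D)) == 0
+    assert((((X & 0x30) == 0) && ((X & 0x0C) == 0)) == ((X & 0x3C) == 0));
+    // BMask_AllOnes: (A & B) == B && (A & D) == D  <=>  (A & (B|D)) == (B|D)
+    assert((((X & 0x30) == 0x30) && ((X & 0x0C) == 0x0C)) ==
+           ((X & 0x3C) == 0x3C));
+  }
+}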
+
+/// Try to fold a signed range check with lower bound 0 to an unsigned icmp.
+/// Example: (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+/// If \p Inverted is true then the check is for the inverted range, e.g.
+/// (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
Value *InstCombinerImpl::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
bool Inverted) {
- // Check the lower range comparison, e.g. x >= 0
- // InstCombine already ensured that if there is a constant it's on the RHS.
- ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
- if (!RangeStart)
- return nullptr;
-
- ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
- Cmp0->getPredicate());
-
- // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
- if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
- (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
- return nullptr;
-
- ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
- Cmp1->getPredicate());
-
- Value *Input = Cmp0->getOperand(0);
- Value *RangeEnd;
- if (Cmp1->getOperand(0) == Input) {
- // For the upper range compare we have: icmp x, n
- RangeEnd = Cmp1->getOperand(1);
- } else if (Cmp1->getOperand(1) == Input) {
- // For the upper range compare we have: icmp n, x
- RangeEnd = Cmp1->getOperand(0);
- Pred1 = ICmpInst::getSwappedPredicate(Pred1);
- } else {
- return nullptr;
- }
-
- // Check the upper range comparison, e.g. x < n
- ICmpInst::Predicate NewPred;
- switch (Pred1) {
- case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
- case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
- default: return nullptr;
- }
-
- // This simplification is only valid if the upper range is not negative.
- KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
- if (!Known.isNonNegative())
- return nullptr;
-
- if (Inverted)
- NewPred = ICmpInst::getInversePredicate(NewPred);
-
- return Builder.CreateICmp(NewPred, Input, RangeEnd);
-}
-
-static Value *
-foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
- bool JoinedByAnd,
- InstCombiner::BuilderTy &Builder) {
- Value *X = LHS->getOperand(0);
- if (X != RHS->getOperand(0))
- return nullptr;
-
- const APInt *C1, *C2;
- if (!match(LHS->getOperand(1), m_APInt(C1)) ||
- !match(RHS->getOperand(1), m_APInt(C2)))
- return nullptr;
-
- // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
- ICmpInst::Predicate Pred = LHS->getPredicate();
- if (Pred != RHS->getPredicate())
- return nullptr;
- if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
- return nullptr;
- if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
- // The larger unsigned constant goes on the right.
- if (C1->ugt(*C2))
- std::swap(C1, C2);
-
- APInt Xor = *C1 ^ *C2;
- if (Xor.isPowerOf2()) {
- // If LHSC and RHSC differ by only one bit, then set that bit in X and
- // compare against the larger constant:
- // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
- // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
- // We choose an 'or' with a Pow2 constant rather than the inverse mask with
- // 'and' because that may lead to smaller codegen from a smaller constant.
- Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
- return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
- }
-
- // Special case: get the ordering right when the values wrap around zero.
-  // I.e., we assumed the constants were unsigned when swapping earlier.
- if (C1->isNullValue() && C2->isAllOnesValue())
- std::swap(C1, C2);
-
- if (*C1 == *C2 - 1) {
- // (X == 13 || X == 14) --> X - 13 <=u 1
- // (X != 13 && X != 14) --> X - 13 >u 1
- // An 'add' is the canonical IR form, so favor that over a 'sub'.
- Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
- auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
- return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
- }
-
- return nullptr;
-}
-
-// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
-// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // Check the lower range comparison, e.g. x >= 0
+ // InstCombine already ensured that if there is a constant it's on the RHS.
+ ConstantInt *RangeStart = dyn_cast<ConstantInt>(Cmp0->getOperand(1));
+ if (!RangeStart)
+ return nullptr;
+
+ ICmpInst::Predicate Pred0 = (Inverted ? Cmp0->getInversePredicate() :
+ Cmp0->getPredicate());
+
+ // Accept x > -1 or x >= 0 (after potentially inverting the predicate).
+ if (!((Pred0 == ICmpInst::ICMP_SGT && RangeStart->isMinusOne()) ||
+ (Pred0 == ICmpInst::ICMP_SGE && RangeStart->isZero())))
+ return nullptr;
+
+ ICmpInst::Predicate Pred1 = (Inverted ? Cmp1->getInversePredicate() :
+ Cmp1->getPredicate());
+
+ Value *Input = Cmp0->getOperand(0);
+ Value *RangeEnd;
+ if (Cmp1->getOperand(0) == Input) {
+ // For the upper range compare we have: icmp x, n
+ RangeEnd = Cmp1->getOperand(1);
+ } else if (Cmp1->getOperand(1) == Input) {
+ // For the upper range compare we have: icmp n, x
+ RangeEnd = Cmp1->getOperand(0);
+ Pred1 = ICmpInst::getSwappedPredicate(Pred1);
+ } else {
+ return nullptr;
+ }
+
+ // Check the upper range comparison, e.g. x < n
+ ICmpInst::Predicate NewPred;
+ switch (Pred1) {
+ case ICmpInst::ICMP_SLT: NewPred = ICmpInst::ICMP_ULT; break;
+ case ICmpInst::ICMP_SLE: NewPred = ICmpInst::ICMP_ULE; break;
+ default: return nullptr;
+ }
+
+ // This simplification is only valid if the upper range is not negative.
+ KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1);
+ if (!Known.isNonNegative())
+ return nullptr;
+
+ if (Inverted)
+ NewPred = ICmpInst::getInversePredicate(NewPred);
+
+ return Builder.CreateICmp(NewPred, Input, RangeEnd);
+}
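+
+// Self-contained sketch (the helper name and the bound 1000 are made up; not
+// part of the original sources): the signed-to-unsigned range-check identity
+// above, (X s>= 0) && (X s< N)  <=>  X u< N for a non-negative N,
+// brute-forced over all 16-bit bit patterns.
+#include <cassert>
+#include <cstdint>
+static void checkRangeCheckExample() {
+  const int N = 1000; // upper bound; must be non-negative for the fold
+  for (uint32_t Bits = 0; Bits <= 0xFFFF; ++Bits) {
+    // Decode the 16-bit pattern as a signed (two's complement) value.
+    int X = (Bits < 0x8000) ? static_cast<int>(Bits)
+                            : static_cast<int>(Bits) - 0x10000;
+    bool SignedCheck = (X >= 0) && (X < N);               // (sge, slt) pair
+    bool UnsignedCheck = Bits < static_cast<uint32_t>(N); // icmp ult x, n
+    assert(SignedCheck == UnsignedCheck);
+  }
+}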
+
+static Value *
+foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
+ bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = LHS->getOperand(0);
+ if (X != RHS->getOperand(0))
+ return nullptr;
+
+ const APInt *C1, *C2;
+ if (!match(LHS->getOperand(1), m_APInt(C1)) ||
+ !match(RHS->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // We only handle (X != C1 && X != C2) and (X == C1 || X == C2).
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
+ // The larger unsigned constant goes on the right.
+ if (C1->ugt(*C2))
+ std::swap(C1, C2);
+
+ APInt Xor = *C1 ^ *C2;
+ if (Xor.isPowerOf2()) {
+ // If LHSC and RHSC differ by only one bit, then set that bit in X and
+ // compare against the larger constant:
+ // (X == C1 || X == C2) --> (X | (C1 ^ C2)) == C2
+ // (X != C1 && X != C2) --> (X | (C1 ^ C2)) != C2
+ // We choose an 'or' with a Pow2 constant rather than the inverse mask with
+ // 'and' because that may lead to smaller codegen from a smaller constant.
+ Value *Or = Builder.CreateOr(X, ConstantInt::get(X->getType(), Xor));
+ return Builder.CreateICmp(Pred, Or, ConstantInt::get(X->getType(), *C2));
+ }
+
+ // Special case: get the ordering right when the values wrap around zero.
+  // I.e., we assumed the constants were unsigned when swapping earlier.
+ if (C1->isNullValue() && C2->isAllOnesValue())
+ std::swap(C1, C2);
+
+ if (*C1 == *C2 - 1) {
+ // (X == 13 || X == 14) --> X - 13 <=u 1
+ // (X != 13 && X != 14) --> X - 13 >u 1
+ // An 'add' is the canonical IR form, so favor that over a 'sub'.
+ Value *Add = Builder.CreateAdd(X, ConstantInt::get(X->getType(), -(*C1)));
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_ULE;
+ return Builder.CreateICmp(NewPred, Add, ConstantInt::get(X->getType(), 1));
+ }
+
+ return nullptr;
+}
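+
+// Self-contained sketch (the helper name and the constants 8, 12, 13, 14 are
+// made up; not part of the original sources): the two constant-pair folds
+// above, brute-forced over all 16-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkEqualityCmpsWithConstantsExamples() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t X = static_cast<uint16_t>(V);
+    // C1 ^ C2 is a power of two: (X == 8 || X == 12)  <=>  (X | 4) == 12
+    assert(((X == 8) || (X == 12)) == ((X | 4) == 12));
+    // Consecutive constants: (X == 13 || X == 14)  <=>  (X - 13) u<= 1
+    assert(((X == 13) || (X == 14)) ==
+           (static_cast<uint16_t>(X - 13) <= 1));
+  }
+}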
+
+// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
Value *InstCombinerImpl::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS,
ICmpInst *RHS,
BinaryOperator &Logic) {
- bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
- assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
- "Wrong opcode");
- ICmpInst::Predicate Pred = LHS->getPredicate();
- if (Pred != RHS->getPredicate())
- return nullptr;
- if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
- return nullptr;
- if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
+ bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
+ assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
+ "Wrong opcode");
+ ICmpInst::Predicate Pred = LHS->getPredicate();
+ if (Pred != RHS->getPredicate())
+ return nullptr;
+ if (JoinedByAnd && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+ if (!JoinedByAnd && Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
if (!match(LHS->getOperand(1), m_Zero()) ||
!match(RHS->getOperand(1), m_Zero()))
- return nullptr;
-
- Value *A, *B, *C, *D;
- if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
- match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
- if (A == D || B == D)
- std::swap(C, D);
- if (B == C)
- std::swap(A, B);
-
- if (A == C &&
- isKnownToBeAPowerOfTwo(B, false, 0, &Logic) &&
- isKnownToBeAPowerOfTwo(D, false, 0, &Logic)) {
- Value *Mask = Builder.CreateOr(B, D);
- Value *Masked = Builder.CreateAnd(A, Mask);
- auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
- return Builder.CreateICmp(NewPred, Masked, Mask);
- }
- }
-
- return nullptr;
-}
-
-/// General pattern:
-/// X & Y
-///
-/// Where Y is checking that all the high bits (covered by a mask 4294967168)
-/// are uniform, i.e. %arg & 4294967168 can be either 4294967168 or 0
-/// Pattern can be one of:
-/// %t = add i32 %arg, 128
-/// %r = icmp ult i32 %t, 256
-/// Or
-/// %t0 = shl i32 %arg, 24
-/// %t1 = ashr i32 %t0, 24
-/// %r = icmp eq i32 %t1, %arg
-/// Or
-/// %t0 = trunc i32 %arg to i8
-/// %t1 = sext i8 %t0 to i32
-/// %r = icmp eq i32 %t1, %arg
-/// This pattern is a signed truncation check.
-///
-/// And X is checking that some bit in that same mask is zero.
-/// I.e. can be one of:
-/// %r = icmp sgt i32 %arg, -1
-/// Or
-/// %t = and i32 %arg, 2147483648
-/// %r = icmp eq i32 %t, 0
-///
-/// Since we are checking that all the bits in that mask are the same,
-/// and a particular bit is zero, what we are really checking is that all the
-/// masked bits are zero.
-/// So this should be transformed to:
-/// %r = icmp ult i32 %arg, 128
-static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
- Instruction &CxtI,
- InstCombiner::BuilderTy &Builder) {
- assert(CxtI.getOpcode() == Instruction::And);
-
- // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two)
- auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
- APInt &SignBitMask) -> bool {
- CmpInst::Predicate Pred;
- const APInt *I01, *I1; // powers of two; I1 == I01 << 1
- if (!(match(ICmp,
- m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
- Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
- return false;
- // Which bit is the new sign bit as per the 'signed truncation' pattern?
- SignBitMask = *I01;
- return true;
- };
-
- // One icmp needs to be 'signed truncation check'.
- // We need to match this first, else we will mismatch commutative cases.
- Value *X1;
- APInt HighestBit;
- ICmpInst *OtherICmp;
- if (tryToMatchSignedTruncationCheck(ICmp1, X1, HighestBit))
- OtherICmp = ICmp0;
- else if (tryToMatchSignedTruncationCheck(ICmp0, X1, HighestBit))
- OtherICmp = ICmp1;
- else
- return nullptr;
-
- assert(HighestBit.isPowerOf2() && "expected to be power of two (non-zero)");
-
- // Try to match/decompose into: icmp eq (X & Mask), 0
- auto tryToDecompose = [](ICmpInst *ICmp, Value *&X,
- APInt &UnsetBitsMask) -> bool {
- CmpInst::Predicate Pred = ICmp->getPredicate();
- // Can it be decomposed into icmp eq (X & Mask), 0 ?
- if (llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1),
- Pred, X, UnsetBitsMask,
- /*LookThroughTrunc=*/false) &&
- Pred == ICmpInst::ICMP_EQ)
- return true;
- // Is it icmp eq (X & Mask), 0 already?
- const APInt *Mask;
- if (match(ICmp, m_ICmp(Pred, m_And(m_Value(X), m_APInt(Mask)), m_Zero())) &&
- Pred == ICmpInst::ICMP_EQ) {
- UnsetBitsMask = *Mask;
- return true;
- }
- return false;
- };
-
- // And the other icmp needs to be decomposable into a bit test.
- Value *X0;
- APInt UnsetBitsMask;
- if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask))
- return nullptr;
-
- assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense.");
-
- // Are they working on the same value?
- Value *X;
- if (X1 == X0) {
- // Ok as is.
- X = X1;
- } else if (match(X0, m_Trunc(m_Specific(X1)))) {
- UnsetBitsMask = UnsetBitsMask.zext(X1->getType()->getScalarSizeInBits());
- X = X1;
- } else
- return nullptr;
-
- // So which bits should be uniform as per the 'signed truncation check'?
- // (all the bits starting with (i.e. including) HighestBit)
- APInt SignBitsMask = ~(HighestBit - 1U);
-
- // UnsetBitsMask must have some common bits with SignBitsMask,
- if (!UnsetBitsMask.intersects(SignBitsMask))
- return nullptr;
-
- // Does UnsetBitsMask contain any bits outside of SignBitsMask?
- if (!UnsetBitsMask.isSubsetOf(SignBitsMask)) {
- APInt OtherHighestBit = (~UnsetBitsMask) + 1U;
- if (!OtherHighestBit.isPowerOf2())
- return nullptr;
- HighestBit = APIntOps::umin(HighestBit, OtherHighestBit);
- }
- // Else, if it does not, then all is ok as-is.
-
- // %r = icmp ult %X, SignBit
- return Builder.CreateICmpULT(X, ConstantInt::get(X->getType(), HighestBit),
- CxtI.getName() + ".simplified");
-}
-
-/// Reduce a pair of compares that check if a value has exactly 1 bit set.
-static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
- InstCombiner::BuilderTy &Builder) {
- // Handle 'and' / 'or' commutation: make the equality check the first operand.
- if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE)
- std::swap(Cmp0, Cmp1);
- else if (!JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(Cmp0, Cmp1);
-
- // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
- CmpInst::Predicate Pred0, Pred1;
- Value *X;
- if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(2))) &&
- Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
- Value *CtPop = Cmp1->getOperand(0);
- return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
- }
- // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
- if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
- match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
- m_SpecificInt(1))) &&
- Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
- Value *CtPop = Cmp1->getOperand(0);
- return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
- }
- return nullptr;
-}
-
-/// Commuted variants are assumed to be handled by calling this function again
-/// with the parameters swapped.
-static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
- ICmpInst *UnsignedICmp, bool IsAnd,
- const SimplifyQuery &Q,
- InstCombiner::BuilderTy &Builder) {
- Value *ZeroCmpOp;
- ICmpInst::Predicate EqPred;
- if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) ||
- !ICmpInst::isEquality(EqPred))
- return nullptr;
-
- auto IsKnownNonZero = [&](Value *V) {
- return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
- };
-
- ICmpInst::Predicate UnsignedPred;
-
- Value *A, *B;
- if (match(UnsignedICmp,
- m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) &&
- match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) &&
- (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) {
- auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) {
- if (!IsKnownNonZero(NonZero))
- std::swap(NonZero, Other);
- return IsKnownNonZero(NonZero);
- };
-
- // Given ZeroCmpOp = (A + B)
- // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A
- // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A
- //
- // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff
- // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff
- // with X being the value (A/B) that is known to be non-zero,
-    // and Y being the remaining value.
- if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
- IsAnd)
- return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE &&
- IsAnd && GetKnownNonZeroAndOther(B, A))
- return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd)
- return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
- if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd && GetKnownNonZeroAndOther(B, A))
- return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
- }
-
- Value *Base, *Offset;
- if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset))))
- return nullptr;
-
- if (!match(UnsignedICmp,
- m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) ||
- !ICmpInst::isUnsigned(UnsignedPred))
- return nullptr;
-
- // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset
- // (no overflow and not null)
- if ((UnsignedPred == ICmpInst::ICMP_UGE ||
- UnsignedPred == ICmpInst::ICMP_UGT) &&
- EqPred == ICmpInst::ICMP_NE && IsAnd)
- return Builder.CreateICmpUGT(Base, Offset);
-
- // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset
- // (overflow or null)
- if ((UnsignedPred == ICmpInst::ICMP_ULE ||
- UnsignedPred == ICmpInst::ICMP_ULT) &&
- EqPred == ICmpInst::ICMP_EQ && !IsAnd)
- return Builder.CreateICmpULE(Base, Offset);
-
- // Base <= Offset && (Base - Offset) != 0 --> Base < Offset
- if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
- IsAnd)
- return Builder.CreateICmpULT(Base, Offset);
-
- // Base > Offset || (Base - Offset) == 0 --> Base >= Offset
- if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
- !IsAnd)
- return Builder.CreateICmpUGE(Base, Offset);
-
- return nullptr;
-}
-
-/// Reduce logic-of-compares with equality to a constant by substituting a
-/// common operand with the constant. Callers are expected to call this with
-/// Cmp0/Cmp1 switched to handle logic op commutativity.
-static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
- BinaryOperator &Logic,
- InstCombiner::BuilderTy &Builder,
- const SimplifyQuery &Q) {
- bool IsAnd = Logic.getOpcode() == Instruction::And;
- assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
-
- // Match an equality compare with a non-poison constant as Cmp0.
- // Also, give up if the compare can be constant-folded to avoid looping.
- ICmpInst::Predicate Pred0;
- Value *X;
- Constant *C;
- if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
- !isGuaranteedNotToBeUndefOrPoison(C) || isa<Constant>(X))
- return nullptr;
- if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) ||
- (!IsAnd && Pred0 != ICmpInst::ICMP_NE))
- return nullptr;
-
- // The other compare must include a common operand (X). Canonicalize the
- // common operand as operand 1 (Pred1 is swapped if the common operand was
- // operand 0).
- Value *Y;
- ICmpInst::Predicate Pred1;
- if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Deferred(X))))
- return nullptr;
-
- // Replace variable with constant value equivalence to remove a variable use:
- // (X == C) && (Y Pred1 X) --> (X == C) && (Y Pred1 C)
- // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C)
- // Can think of the 'or' substitution with the 'and' bool equivalent:
- // A || B --> A || (!A && B)
- Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q);
- if (!SubstituteCmp) {
- // If we need to create a new instruction, require that the old compare can
- // be removed.
- if (!Cmp1->hasOneUse())
- return nullptr;
- SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
- }
- return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
-}
-
-/// Fold (icmp)&(icmp) if possible.
+ return nullptr;
+
+ Value *A, *B, *C, *D;
+ if (match(LHS->getOperand(0), m_And(m_Value(A), m_Value(B))) &&
+ match(RHS->getOperand(0), m_And(m_Value(C), m_Value(D)))) {
+ if (A == D || B == D)
+ std::swap(C, D);
+ if (B == C)
+ std::swap(A, B);
+
+ if (A == C &&
+ isKnownToBeAPowerOfTwo(B, false, 0, &Logic) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &Logic)) {
+ Value *Mask = Builder.CreateOr(B, D);
+ Value *Masked = Builder.CreateAnd(A, Mask);
+ auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
+ return Builder.CreateICmp(NewPred, Masked, Mask);
+ }
+ }
+
+ return nullptr;
+}
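+
+// Standalone sketch (the helper name and the single-bit masks K1 = 0x10,
+// K2 = 0x02 are made up; not part of the original sources): the one-bit-mask
+// folds above, brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkAndOrOfICmpsOfAndWithPow2Example() {
+  const uint32_t K1 = 0x10, K2 = 0x02, Both = K1 | K2;
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    // (!iszero(A & K1) & !iszero(A & K2))  <=>  (A & (K1|K2)) == (K1|K2)
+    assert((((X & K1) != 0) && ((X & K2) != 0)) == ((X & Both) == Both));
+    // (iszero(A & K1) | iszero(A & K2))    <=>  (A & (K1|K2)) != (K1|K2)
+    assert((((X & K1) == 0) || ((X & K2) == 0)) == ((X & Both) != Both));
+  }
+}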
+
+/// General pattern:
+/// X & Y
+///
+/// Where Y is checking that all the high bits (covered by a mask 4294967168)
+/// are uniform, i.e. %arg & 4294967168 can be either 4294967168 or 0
+/// Pattern can be one of:
+/// %t = add i32 %arg, 128
+/// %r = icmp ult i32 %t, 256
+/// Or
+/// %t0 = shl i32 %arg, 24
+/// %t1 = ashr i32 %t0, 24
+/// %r = icmp eq i32 %t1, %arg
+/// Or
+/// %t0 = trunc i32 %arg to i8
+/// %t1 = sext i8 %t0 to i32
+/// %r = icmp eq i32 %t1, %arg
+/// This pattern is a signed truncation check.
+///
+/// And X is checking that some bit in that same mask is zero.
+/// I.e. can be one of:
+/// %r = icmp sgt i32 %arg, -1
+/// Or
+/// %t = and i32 %arg, 2147483648
+/// %r = icmp eq i32 %t, 0
+///
+/// Since we are checking that all the bits in that mask are the same,
+/// and a particular bit is zero, what we are really checking is that all the
+/// masked bits are zero.
+/// So this should be transformed to:
+/// %r = icmp ult i32 %arg, 128
+static Value *foldSignedTruncationCheck(ICmpInst *ICmp0, ICmpInst *ICmp1,
+ Instruction &CxtI,
+ InstCombiner::BuilderTy &Builder) {
+ assert(CxtI.getOpcode() == Instruction::And);
+
+ // Match icmp ult (add %arg, C01), C1 (C1 == C01 << 1; powers of two)
+ auto tryToMatchSignedTruncationCheck = [](ICmpInst *ICmp, Value *&X,
+ APInt &SignBitMask) -> bool {
+ CmpInst::Predicate Pred;
+ const APInt *I01, *I1; // powers of two; I1 == I01 << 1
+ if (!(match(ICmp,
+ m_ICmp(Pred, m_Add(m_Value(X), m_Power2(I01)), m_Power2(I1))) &&
+ Pred == ICmpInst::ICMP_ULT && I1->ugt(*I01) && I01->shl(1) == *I1))
+ return false;
+ // Which bit is the new sign bit as per the 'signed truncation' pattern?
+ SignBitMask = *I01;
+ return true;
+ };
+
+ // One icmp needs to be 'signed truncation check'.
+ // We need to match this first, else we will mismatch commutative cases.
+ Value *X1;
+ APInt HighestBit;
+ ICmpInst *OtherICmp;
+ if (tryToMatchSignedTruncationCheck(ICmp1, X1, HighestBit))
+ OtherICmp = ICmp0;
+ else if (tryToMatchSignedTruncationCheck(ICmp0, X1, HighestBit))
+ OtherICmp = ICmp1;
+ else
+ return nullptr;
+
+ assert(HighestBit.isPowerOf2() && "expected to be power of two (non-zero)");
+
+ // Try to match/decompose into: icmp eq (X & Mask), 0
+ auto tryToDecompose = [](ICmpInst *ICmp, Value *&X,
+ APInt &UnsetBitsMask) -> bool {
+ CmpInst::Predicate Pred = ICmp->getPredicate();
+ // Can it be decomposed into icmp eq (X & Mask), 0 ?
+ if (llvm::decomposeBitTestICmp(ICmp->getOperand(0), ICmp->getOperand(1),
+ Pred, X, UnsetBitsMask,
+ /*LookThroughTrunc=*/false) &&
+ Pred == ICmpInst::ICMP_EQ)
+ return true;
+ // Is it icmp eq (X & Mask), 0 already?
+ const APInt *Mask;
+ if (match(ICmp, m_ICmp(Pred, m_And(m_Value(X), m_APInt(Mask)), m_Zero())) &&
+ Pred == ICmpInst::ICMP_EQ) {
+ UnsetBitsMask = *Mask;
+ return true;
+ }
+ return false;
+ };
+
+ // And the other icmp needs to be decomposable into a bit test.
+ Value *X0;
+ APInt UnsetBitsMask;
+ if (!tryToDecompose(OtherICmp, X0, UnsetBitsMask))
+ return nullptr;
+
+ assert(!UnsetBitsMask.isNullValue() && "empty mask makes no sense.");
+
+ // Are they working on the same value?
+ Value *X;
+ if (X1 == X0) {
+ // Ok as is.
+ X = X1;
+ } else if (match(X0, m_Trunc(m_Specific(X1)))) {
+ UnsetBitsMask = UnsetBitsMask.zext(X1->getType()->getScalarSizeInBits());
+ X = X1;
+ } else
+ return nullptr;
+
+ // So which bits should be uniform as per the 'signed truncation check'?
+ // (all the bits starting with (i.e. including) HighestBit)
+ APInt SignBitsMask = ~(HighestBit - 1U);
+
+ // UnsetBitsMask must have some common bits with SignBitsMask,
+ if (!UnsetBitsMask.intersects(SignBitsMask))
+ return nullptr;
+
+ // Does UnsetBitsMask contain any bits outside of SignBitsMask?
+ if (!UnsetBitsMask.isSubsetOf(SignBitsMask)) {
+ APInt OtherHighestBit = (~UnsetBitsMask) + 1U;
+ if (!OtherHighestBit.isPowerOf2())
+ return nullptr;
+ HighestBit = APIntOps::umin(HighestBit, OtherHighestBit);
+ }
+ // Else, if it does not, then all is ok as-is.
+
+ // %r = icmp ult %X, SignBit
+ return Builder.CreateICmpULT(X, ConstantInt::get(X->getType(), HighestBit),
+ CxtI.getName() + ".simplified");
+}
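+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the signed-truncation-check fold above, in the 16-bit analogue
+// of the i32 example from the comment:
+//   Y: %arg & 0xFF80 is uniform, matched as (add %arg, 128) u< 256
+//   X: the sign bit of %arg is clear
+//   X & Y  <=>  %arg u< 128
+// Brute-forced over all 16-bit inputs.
+#include <cassert>
+#include <cstdint>
+static void checkSignedTruncationCheckExample() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t Arg = static_cast<uint16_t>(V);
+    bool Y = static_cast<uint16_t>(Arg + 128) < 256; // high bits uniform
+    bool X = (Arg & 0x8000) == 0;                    // sign bit is zero
+    assert((X && Y) == (Arg < 128));
+  }
+}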
+
+/// Reduce a pair of compares that check if a value has exactly 1 bit set.
+static Value *foldIsPowerOf2(ICmpInst *Cmp0, ICmpInst *Cmp1, bool JoinedByAnd,
+ InstCombiner::BuilderTy &Builder) {
+ // Handle 'and' / 'or' commutation: make the equality check the first operand.
+ if (JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_NE)
+ std::swap(Cmp0, Cmp1);
+ else if (!JoinedByAnd && Cmp1->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Cmp0, Cmp1);
+
+ // (X != 0) && (ctpop(X) u< 2) --> ctpop(X) == 1
+ CmpInst::Predicate Pred0, Pred1;
+ Value *X;
+ if (JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(2))) &&
+ Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT) {
+ Value *CtPop = Cmp1->getOperand(0);
+ return Builder.CreateICmpEQ(CtPop, ConstantInt::get(CtPop->getType(), 1));
+ }
+ // (X == 0) || (ctpop(X) u> 1) --> ctpop(X) != 1
+ if (!JoinedByAnd && match(Cmp0, m_ICmp(Pred0, m_Value(X), m_ZeroInt())) &&
+ match(Cmp1, m_ICmp(Pred1, m_Intrinsic<Intrinsic::ctpop>(m_Specific(X)),
+ m_SpecificInt(1))) &&
+ Pred0 == ICmpInst::ICMP_EQ && Pred1 == ICmpInst::ICMP_UGT) {
+ Value *CtPop = Cmp1->getOperand(0);
+ return Builder.CreateICmpNE(CtPop, ConstantInt::get(CtPop->getType(), 1));
+ }
+ return nullptr;
+}
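+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the is-power-of-2 folds above, with std::bitset standing in for
+// the ctpop intrinsic, brute-forced over all 16-bit inputs.
+#include <bitset>
+#include <cassert>
+#include <cstdint>
+static void checkIsPowerOf2Examples() {
+  for (uint32_t X = 0; X <= 0xFFFF; ++X) {
+    unsigned Pop = static_cast<unsigned>(std::bitset<16>(X).count());
+    // (X != 0) && (ctpop(X) u< 2)  -->  ctpop(X) == 1
+    assert(((X != 0) && (Pop < 2)) == (Pop == 1));
+    // (X == 0) || (ctpop(X) u> 1)  -->  ctpop(X) != 1
+    assert(((X == 0) || (Pop > 1)) == (Pop != 1));
+  }
+}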
+
+/// Commuted variants are assumed to be handled by calling this function again
+/// with the parameters swapped.
+static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
+ ICmpInst *UnsignedICmp, bool IsAnd,
+ const SimplifyQuery &Q,
+ InstCombiner::BuilderTy &Builder) {
+ Value *ZeroCmpOp;
+ ICmpInst::Predicate EqPred;
+ if (!match(ZeroICmp, m_ICmp(EqPred, m_Value(ZeroCmpOp), m_Zero())) ||
+ !ICmpInst::isEquality(EqPred))
+ return nullptr;
+
+ auto IsKnownNonZero = [&](Value *V) {
+ return isKnownNonZero(V, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT);
+ };
+
+ ICmpInst::Predicate UnsignedPred;
+
+ Value *A, *B;
+ if (match(UnsignedICmp,
+ m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) &&
+ match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) &&
+ (ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) {
+ auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) {
+ if (!IsKnownNonZero(NonZero))
+ std::swap(NonZero, Other);
+ return IsKnownNonZero(NonZero);
+ };
+
+ // Given ZeroCmpOp = (A + B)
+ // ZeroCmpOp <= A && ZeroCmpOp != 0 --> (0-B) < A
+ // ZeroCmpOp > A || ZeroCmpOp == 0 --> (0-B) >= A
+ //
+ // ZeroCmpOp < A && ZeroCmpOp != 0 --> (0-X) < Y iff
+ // ZeroCmpOp >= A || ZeroCmpOp == 0 --> (0-X) >= Y iff
+ // with X being the value (A/B) that is known to be non-zero,
+    // and Y being the remaining value.
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd)
+ return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd && GetKnownNonZeroAndOther(B, A))
+ return Builder.CreateICmpULT(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd)
+ return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
+ if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd && GetKnownNonZeroAndOther(B, A))
+ return Builder.CreateICmpUGE(Builder.CreateNeg(B), A);
+ }
+
+ Value *Base, *Offset;
+ if (!match(ZeroCmpOp, m_Sub(m_Value(Base), m_Value(Offset))))
+ return nullptr;
+
+ if (!match(UnsignedICmp,
+ m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) ||
+ !ICmpInst::isUnsigned(UnsignedPred))
+ return nullptr;
+
+ // Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset
+ // (no overflow and not null)
+ if ((UnsignedPred == ICmpInst::ICMP_UGE ||
+ UnsignedPred == ICmpInst::ICMP_UGT) &&
+ EqPred == ICmpInst::ICMP_NE && IsAnd)
+ return Builder.CreateICmpUGT(Base, Offset);
+
+ // Base <=/< Offset || (Base - Offset) == 0 <--> Base <= Offset
+ // (overflow or null)
+ if ((UnsignedPred == ICmpInst::ICMP_ULE ||
+ UnsignedPred == ICmpInst::ICMP_ULT) &&
+ EqPred == ICmpInst::ICMP_EQ && !IsAnd)
+ return Builder.CreateICmpULE(Base, Offset);
+
+ // Base <= Offset && (Base - Offset) != 0 --> Base < Offset
+ if (UnsignedPred == ICmpInst::ICMP_ULE && EqPred == ICmpInst::ICMP_NE &&
+ IsAnd)
+ return Builder.CreateICmpULT(Base, Offset);
+
+ // Base > Offset || (Base - Offset) == 0 --> Base >= Offset
+ if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
+ !IsAnd)
+ return Builder.CreateICmpUGE(Base, Offset);
+
+ return nullptr;
+}
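+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): the unsigned underflow/overflow identities folded above,
+// brute-forced over all pairs of 8-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkUnsignedUnderflowCheckExamples() {
+  for (uint32_t A = 0; A <= 0xFF; ++A) {
+    for (uint32_t B = 0; B <= 0xFF; ++B) {
+      uint32_t Sum = (A + B) & 0xFF;  // 8-bit wrapping add
+      uint32_t Diff = (A - B) & 0xFF; // 8-bit wrapping sub
+      uint32_t NegB = (0 - B) & 0xFF; // 8-bit negation
+      // (A+B) u<= A && (A+B) != 0  -->  (0-B) u< A
+      assert(((Sum <= A) && (Sum != 0)) == (NegB < A));
+      // Base u>= Offset && (Base - Offset) != 0  -->  Base u> Offset
+      assert(((A >= B) && (Diff != 0)) == (A > B));
+      // Base u< Offset || (Base - Offset) == 0   -->  Base u<= Offset
+      assert(((A < B) || (Diff == 0)) == (A <= B));
+    }
+  }
+}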
+
+/// Reduce logic-of-compares with equality to a constant by substituting a
+/// common operand with the constant. Callers are expected to call this with
+/// Cmp0/Cmp1 switched to handle logic op commutativity.
+static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ BinaryOperator &Logic,
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery &Q) {
+ bool IsAnd = Logic.getOpcode() == Instruction::And;
+ assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
+
+ // Match an equality compare with a non-poison constant as Cmp0.
+ // Also, give up if the compare can be constant-folded to avoid looping.
+ ICmpInst::Predicate Pred0;
+ Value *X;
+ Constant *C;
+ if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
+ !isGuaranteedNotToBeUndefOrPoison(C) || isa<Constant>(X))
+ return nullptr;
+ if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) ||
+ (!IsAnd && Pred0 != ICmpInst::ICMP_NE))
+ return nullptr;
+
+ // The other compare must include a common operand (X). Canonicalize the
+ // common operand as operand 1 (Pred1 is swapped if the common operand was
+ // operand 0).
+ Value *Y;
+ ICmpInst::Predicate Pred1;
+ if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Deferred(X))))
+ return nullptr;
+
+ // Replace variable with constant value equivalence to remove a variable use:
+ // (X == C) && (Y Pred1 X) --> (X == C) && (Y Pred1 C)
+ // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C)
+ // Can think of the 'or' substitution with the 'and' bool equivalent:
+ // A || B --> A || (!A && B)
+ Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q);
+ if (!SubstituteCmp) {
+ // If we need to create a new instruction, require that the old compare can
+ // be removed.
+ if (!Cmp1->hasOneUse())
+ return nullptr;
+ SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
+ }
+ return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
+}
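+
+// Standalone sketch (the helper name, the constant 42, and the u< predicate
+// are made up; not part of the original sources): the substitution above.
+// Once the equality side pins X to the constant, the other compare can use
+// the constant directly. Brute-forced over all pairs of 8-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkConstEqSubstitutionExample() {
+  const uint32_t C = 42;
+  for (uint32_t X = 0; X <= 0xFF; ++X) {
+    for (uint32_t Y = 0; Y <= 0xFF; ++Y) {
+      // (X == C) && (Y u< X)  <=>  (X == C) && (Y u< C)
+      assert(((X == C) && (Y < X)) == ((X == C) && (Y < C)));
+      // (X != C) || (Y u< X)  <=>  (X != C) || (Y u< C)
+      assert(((X != C) || (Y < X)) == ((X != C) || (Y < C)));
+    }
+  }
+}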
+
+/// Fold (icmp)&(icmp) if possible.
Value *InstCombinerImpl::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &And) {
- const SimplifyQuery Q = SQ.getWithInstruction(&And);
-
- // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
- // if K1 and K2 are a one-bit mask.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, And))
- return V;
-
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
- // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
- if (predicatesFoldable(PredL, PredR)) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
- return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
- }
- }
-
- // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
- return V;
-
- if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q))
- return V;
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q))
- return V;
-
- // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
- if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
- return V;
-
- // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
- if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
- return V;
-
- if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
- return V;
-
- if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder))
- return V;
-
- if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder))
- return V;
-
- if (Value *X =
- foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder))
- return X;
- if (Value *X =
- foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder))
- return X;
-
- // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
- Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+ const SimplifyQuery Q = SQ.getWithInstruction(&And);
+
+ // Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
+ // if K1 and K2 are a one-bit mask.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, And))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+ if (predicatesFoldable(PredL, PredR)) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) & getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
+ }
+ }
+
+ // handle (roughly): (icmp eq (A & B), C) & (icmp eq (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
+ return V;
+
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q))
+ return V;
+
+ // E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
+ return V;
+
+ // E.g. (icmp slt x, n) & (icmp sge x, 0) --> icmp ult x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/false))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
+ return V;
+
+ if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder))
+ return V;
+
+ if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder))
+ return V;
+
+ if (Value *X =
+ foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/true, Q, Builder))
+ return X;
+ if (Value *X =
+ foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/true, Q, Builder))
+ return X;
+
+ // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
+ Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
ConstantInt *LHSC, *RHSC;
if (!match(LHS->getOperand(1), m_ConstantInt(LHSC)) ||
!match(RHS->getOperand(1), m_ConstantInt(RHSC)))
- return nullptr;
-
- if (LHSC == RHSC && PredL == PredR) {
- // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
- // where C is a power of 2 or
- // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
- if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
- (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
- Value *NewOr = Builder.CreateOr(LHS0, RHS0);
- return Builder.CreateICmp(PredL, NewOr, LHSC);
- }
- }
-
- // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
- // where CMAX is the all ones value for the truncated type,
- // iff the lower bits of C2 and CA are zero.
- if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
- RHS->hasOneUse()) {
- Value *V;
- ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
-
- // (trunc x) == C1 & (and x, CA) == C2
- // (and x, CA) == C2 & (trunc x) == C1
- if (match(RHS0, m_Trunc(m_Value(V))) &&
- match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
- SmallC = RHSC;
- BigC = LHSC;
- } else if (match(LHS0, m_Trunc(m_Value(V))) &&
- match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
- SmallC = LHSC;
- BigC = RHSC;
- }
-
- if (SmallC && BigC) {
- unsigned BigBitSize = BigC->getType()->getBitWidth();
- unsigned SmallBitSize = SmallC->getType()->getBitWidth();
-
- // Check that the low bits are zero.
- APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
- if ((Low & AndC->getValue()).isNullValue() &&
- (Low & BigC->getValue()).isNullValue()) {
- Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
- APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
- Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
- return Builder.CreateICmp(PredL, NewAnd, NewVal);
- }
- }
- }
-
- // From here on, we only handle:
- // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) & (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and and'ing the result
- // together. Because of the above check, we know that we only have
-  // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
-  // (from the icmp folding check above) that the two constants
-  // are not equal and that the larger constant is on the RHS.
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_ULT:
- // (X != 13 & X u< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpULT(LHS0, LHSC);
- if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X != 13 & X u< 15) -> no change
- case ICmpInst::ICMP_SLT:
- // (X != 13 & X s< 14) -> X < 13
- if (LHSC->getValue() == (RHSC->getValue() - 1))
- return Builder.CreateICmpSLT(LHS0, LHSC);
- // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1))
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X != 13 & X s< 15) -> no change
- case ICmpInst::ICMP_NE:
- // Potential folds for this case should already be handled.
- break;
- }
- break;
- case ICmpInst::ICMP_UGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X u> 13 & X != 14) -> X u> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1)
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- break; // (X u> 13 & X != 15) -> no change
- case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- false, true);
- }
- break;
- case ICmpInst::ICMP_SGT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_NE:
- // (X s> 13 & X != 14) -> X s> 14
- if (RHSC->getValue() == (LHSC->getValue() + 1))
- return Builder.CreateICmp(PredL, LHS0, RHSC);
- // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1)
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
- true, true);
- break; // (X s> 13 & X != 15) -> no change
- case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
- true);
- }
- break;
- }
-
- return nullptr;
-}
-
+ return nullptr;
+
+ if (LHSC == RHSC && PredL == PredR) {
+ // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
+ // where C is a power of 2 or
+ // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
+ if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+ (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+ Value *NewOr = Builder.CreateOr(LHS0, RHS0);
+ return Builder.CreateICmp(PredL, NewOr, LHSC);
+ }
+ }
+
+ // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
+ // where CMAX is the all ones value for the truncated type,
+ // iff the lower bits of C2 and CA are zero.
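+  // For example (made-up constants), with x:i32 truncated to i8 (CMAX=0xFF),
+  // C1 = 5, CA = 0xFFFF0000, C2 = 0x10000 (low 8 bits of CA and C2 are 0):
+  //   (trunc x) == 5 & (x & 0xFFFF0000) == 0x10000
+  //     -> (x & 0xFFFF00FF) == 0x10005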
+ if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+ RHS->hasOneUse()) {
+ Value *V;
+ ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
+
+ // (trunc x) == C1 & (and x, CA) == C2
+ // (and x, CA) == C2 & (trunc x) == C1
+ if (match(RHS0, m_Trunc(m_Value(V))) &&
+ match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = RHSC;
+ BigC = LHSC;
+ } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+ match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+ SmallC = LHSC;
+ BigC = RHSC;
+ }
+
+ if (SmallC && BigC) {
+ unsigned BigBitSize = BigC->getType()->getBitWidth();
+ unsigned SmallBitSize = SmallC->getType()->getBitWidth();
+
+ // Check that the low bits are zero.
+ APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
+ if ((Low & AndC->getValue()).isNullValue() &&
+ (Low & BigC->getValue()).isNullValue()) {
+ Value *NewAnd = Builder.CreateAnd(V, Low | AndC->getValue());
+ APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+ Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+ return Builder.CreateICmp(PredL, NewAnd, NewVal);
+ }
+ }
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) & (sgt x, C2).
+ if (!predicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and and'ing the result
+ // together. Because of the above check, we know that we only have
+  // icmp eq, icmp ne, icmp [su]lt, and icmp [su]gt here. We also know
+  // (from the icmp folding check above) that the two constants
+  // are not equal and that the larger constant is on the RHS.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_ULT:
+ // (X != 13 & X u< 14) -> X < 13
+ if (LHSC->getValue() == (RHSC->getValue() - 1))
+ return Builder.CreateICmpULT(LHS0, LHSC);
+ if (LHSC->isZero()) // (X != 0 & X u< C) -> X-1 u< C-1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ break; // (X != 13 & X u< 15) -> no change
+ case ICmpInst::ICMP_SLT:
+ // (X != 13 & X s< 14) -> X < 13
+ if (LHSC->getValue() == (RHSC->getValue() - 1))
+ return Builder.CreateICmpSLT(LHS0, LHSC);
+ // (X != INT_MIN & X s< C) -> X-(INT_MIN+1) u< (C-(INT_MIN+1))
+ if (LHSC->isMinValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ true, true);
+ break; // (X != 13 & X s< 15) -> no change
+ case ICmpInst::ICMP_NE:
+ // Potential folds for this case should already be handled.
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_UGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ // (X u> 13 & X != 14) -> X u> 14
+ if (RHSC->getValue() == (LHSC->getValue() + 1))
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ // X u> C & X != UINT_MAX -> (X-(C+1)) u< UINT_MAX-(C+1)
+ if (RHSC->isMaxValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ break; // (X u> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) u< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ false, true);
+ }
+ break;
+ case ICmpInst::ICMP_SGT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_NE:
+ // (X s> 13 & X != 14) -> X s> 14
+ if (RHSC->getValue() == (LHSC->getValue() + 1))
+ return Builder.CreateICmp(PredL, LHS0, RHSC);
+ // X s> C & X != INT_MAX -> (X-(C+1)) u< INT_MAX-(C+1)
+ if (RHSC->isMaxValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
+ true, true);
+ break; // (X s> 13 & X != 15) -> no change
+ case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) u< 1
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+ true);
+ }
+ break;
+ }
+
+ return nullptr;
+}
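+
+// Standalone sketch (the helper name is made up; not part of the original
+// sources): two of the constant-range folds performed by foldAndOfICmps
+// above, brute-forced over all 16-bit values.
+#include <cassert>
+#include <cstdint>
+static void checkAndOfICmpsRangeExamples() {
+  for (uint32_t V = 0; V <= 0xFFFF; ++V) {
+    uint16_t X = static_cast<uint16_t>(V);
+    // (X != 13 & X u< 14)  ->  X u< 13
+    assert(((X != 13) && (X < 14)) == (X < 13));
+    // (X u> 13 & X u< 15)  ->  (X - 14) u< 1
+    assert(((X > 13) && (X < 15)) == (static_cast<uint16_t>(X - 14) < 1));
+  }
+}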
+
Value *InstCombinerImpl::foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS,
bool IsAnd) {
- Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
- Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
- FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
-
- if (LHS0 == RHS1 && RHS0 == LHS1) {
- // Swap RHS operands to match LHS.
- PredR = FCmpInst::getSwappedPredicate(PredR);
- std::swap(RHS0, RHS1);
- }
-
- // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
- // Suppose the relation between x and y is R, where R is one of
- // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
- // testing the desired relations.
- //
- // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
- // bool(R & CC0) && bool(R & CC1)
- // = bool((R & CC0) & (R & CC1))
- // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
- //
- // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
- // bool(R & CC0) || bool(R & CC1)
- // = bool((R & CC0) | (R & CC1))
- // = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
- if (LHS0 == RHS0 && LHS1 == RHS1) {
- unsigned FCmpCodeL = getFCmpCode(PredL);
- unsigned FCmpCodeR = getFCmpCode(PredR);
- unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;
- return getFCmpValue(NewPred, LHS0, LHS1, Builder);
- }
-
- if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
- (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
- if (LHS0->getType() != RHS0->getType())
- return nullptr;
-
- // FCmp canonicalization ensures that (fcmp ord/uno X, X) and
- // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
- if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
- // Ignore the constants because they are obviously not NANs:
- // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
- // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
- return Builder.CreateFCmp(PredL, LHS0, RHS0);
- }
-
- return nullptr;
-}
-
-/// This is a limited reassociation for a special case (see above) where we are
-/// checking if two values are either both NAN (unordered) or not-NAN (ordered).
-/// This could be handled more generally in '-reassociation', but it seems like
-/// an unlikely pattern for a large number of logic ops and fcmps.
-static Instruction *reassociateFCmps(BinaryOperator &BO,
- InstCombiner::BuilderTy &Builder) {
- Instruction::BinaryOps Opcode = BO.getOpcode();
- assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
- "Expecting and/or op for fcmp transform");
-
- // There are 4 commuted variants of the pattern. Canonicalize operands of this
- // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
- Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
- FCmpInst::Predicate Pred;
- if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
- std::swap(Op0, Op1);
-
- // Match inner binop and the predicate for combining 2 NAN checks into 1.
- BinaryOperator *BO1;
- FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
- : FCmpInst::FCMP_UNO;
- if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
- !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
- return nullptr;
-
- // The inner logic op must have a matching fcmp operand.
- Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y;
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
- std::swap(BO10, BO11);
-
- if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
- Pred != NanPred || X->getType() != Y->getType())
- return nullptr;
-
- // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
- // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z
- Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
- if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
- // Intersect FMF from the 2 source fcmps.
- NewFCmpInst->copyIRFlags(Op0);
- NewFCmpInst->andIRFlags(BO10);
- }
- return BinaryOperator::Create(Opcode, NewFCmp, BO11);
-}
-
-/// Match De Morgan's Laws:
-/// (~A & ~B) == (~(A | B))
-/// (~A | ~B) == (~(A & B))
-static Instruction *matchDeMorgansLaws(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- auto Opcode = I.getOpcode();
- assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
- "Trying to match De Morgan's Laws with something other than and/or");
-
- // Flip the logic operation.
- Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
-
- Value *A, *B;
- if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
- match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ FCmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+
+ if (LHS0 == RHS1 && RHS0 == LHS1) {
+ // Swap RHS operands to match LHS.
+ PredR = FCmpInst::getSwappedPredicate(PredR);
+ std::swap(RHS0, RHS1);
+ }
+
+ // Simplify (fcmp cc0 x, y) & (fcmp cc1 x, y).
+ // Suppose the relation between x and y is R, where R is one of
+ // U(1000), L(0100), G(0010) or E(0001), and CC0 and CC1 are the bitmasks for
+ // testing the desired relations.
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) && bool(R & CC1)
+ // = bool((R & CC0) & (R & CC1))
+ // = bool(R & (CC0 & CC1)) <= by re-association, commutation, and idempotency
+ //
+ // Since (R & CC0) and (R & CC1) are either R or 0, we actually have this:
+ // bool(R & CC0) || bool(R & CC1)
+ // = bool((R & CC0) | (R & CC1))
+ // = bool(R & (CC0 | CC1)) <= by reversed distribution (contribution? ;)
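+  // For example: olt is L (0100) and ogt is G (0010), so
+  //   (fcmp olt x, y) & (fcmp ogt x, y) --> mask 0000, i.e. always false, and
+  //   (fcmp olt x, y) | (fcmp ogt x, y) --> mask 0110, i.e. fcmp one x, y.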
+ if (LHS0 == RHS0 && LHS1 == RHS1) {
+ unsigned FCmpCodeL = getFCmpCode(PredL);
+ unsigned FCmpCodeR = getFCmpCode(PredR);
+ unsigned NewPred = IsAnd ? FCmpCodeL & FCmpCodeR : FCmpCodeL | FCmpCodeR;
+ return getFCmpValue(NewPred, LHS0, LHS1, Builder);
+ }
+
+ if ((PredL == FCmpInst::FCMP_ORD && PredR == FCmpInst::FCMP_ORD && IsAnd) ||
+ (PredL == FCmpInst::FCMP_UNO && PredR == FCmpInst::FCMP_UNO && !IsAnd)) {
+ if (LHS0->getType() != RHS0->getType())
+ return nullptr;
+
+ // FCmp canonicalization ensures that (fcmp ord/uno X, X) and
+ // (fcmp ord/uno X, C) will be transformed to (fcmp X, +0.0).
+ if (match(LHS1, m_PosZeroFP()) && match(RHS1, m_PosZeroFP()))
+ // Ignore the constants because they are obviously not NANs:
+ // (fcmp ord x, 0.0) & (fcmp ord y, 0.0) -> (fcmp ord x, y)
+ // (fcmp uno x, 0.0) | (fcmp uno y, 0.0) -> (fcmp uno x, y)
+ return Builder.CreateFCmp(PredL, LHS0, RHS0);
+ }
+
+ return nullptr;
+}
+
+/// This is a limited reassociation for a special case (see above) where we are
+/// checking if two values are either both NAN (unordered) or not-NAN (ordered).
+/// This could be handled more generally in '-reassociation', but it seems like
+/// an unlikely pattern for a large number of logic ops and fcmps.
+static Instruction *reassociateFCmps(BinaryOperator &BO,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::BinaryOps Opcode = BO.getOpcode();
+ assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ "Expecting and/or op for fcmp transform");
+
+ // There are 4 commuted variants of the pattern. Canonicalize operands of this
+ // logic op so an fcmp is operand 0 and a matching logic op is operand 1.
+ Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1), *X;
+ FCmpInst::Predicate Pred;
+ if (match(Op1, m_FCmp(Pred, m_Value(), m_AnyZeroFP())))
+ std::swap(Op0, Op1);
+
+ // Match inner binop and the predicate for combining 2 NAN checks into 1.
+ BinaryOperator *BO1;
+ FCmpInst::Predicate NanPred = Opcode == Instruction::And ? FCmpInst::FCMP_ORD
+ : FCmpInst::FCMP_UNO;
+ if (!match(Op0, m_FCmp(Pred, m_Value(X), m_AnyZeroFP())) || Pred != NanPred ||
+ !match(Op1, m_BinOp(BO1)) || BO1->getOpcode() != Opcode)
+ return nullptr;
+
+ // The inner logic op must have a matching fcmp operand.
+ Value *BO10 = BO1->getOperand(0), *BO11 = BO1->getOperand(1), *Y;
+ if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
+ Pred != NanPred || X->getType() != Y->getType())
+ std::swap(BO10, BO11);
+
+ if (!match(BO10, m_FCmp(Pred, m_Value(Y), m_AnyZeroFP())) ||
+ Pred != NanPred || X->getType() != Y->getType())
+ return nullptr;
+
+ // and (fcmp ord X, 0), (and (fcmp ord Y, 0), Z) --> and (fcmp ord X, Y), Z
+ // or (fcmp uno X, 0), (or (fcmp uno Y, 0), Z) --> or (fcmp uno X, Y), Z
+ Value *NewFCmp = Builder.CreateFCmp(Pred, X, Y);
+ if (auto *NewFCmpInst = dyn_cast<FCmpInst>(NewFCmp)) {
+ // Intersect FMF from the 2 source fcmps.
+ NewFCmpInst->copyIRFlags(Op0);
+ NewFCmpInst->andIRFlags(BO10);
+ }
+ return BinaryOperator::Create(Opcode, NewFCmp, BO11);
+}
+
+/// Match De Morgan's Laws:
+/// (~A & ~B) == (~(A | B))
+/// (~A | ~B) == (~(A & B))
+static Instruction *matchDeMorgansLaws(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ auto Opcode = I.getOpcode();
+ assert((Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ "Trying to match De Morgan's Laws with something other than and/or");
+
+ // Flip the logic operation.
+ Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And;
+
+ Value *A, *B;
+ if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) &&
+ match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) &&
!InstCombiner::isFreeToInvert(A, A->hasOneUse()) &&
!InstCombiner::isFreeToInvert(B, B->hasOneUse())) {
- Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
- return BinaryOperator::CreateNot(AndOr);
- }
-
- return nullptr;
-}
-
+ Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan");
+ return BinaryOperator::CreateNot(AndOr);
+ }
+
+ return nullptr;
+}
+
bool InstCombinerImpl::shouldOptimizeCast(CastInst *CI) {
- Value *CastSrc = CI->getOperand(0);
-
- // Noop casts and casts of constants should be eliminated trivially.
- if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
- return false;
-
- // If this cast is paired with another cast that can be eliminated, we prefer
- // to have it eliminated.
- if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
- if (isEliminableCastPair(PrecedingCI, CI))
- return false;
-
- return true;
-}
-
-/// Fold {and,or,xor} (cast X), C.
-static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
- InstCombiner::BuilderTy &Builder) {
- Constant *C = dyn_cast<Constant>(Logic.getOperand(1));
- if (!C)
- return nullptr;
-
- auto LogicOpc = Logic.getOpcode();
- Type *DestTy = Logic.getType();
- Type *SrcTy = Cast->getSrcTy();
-
- // Move the logic operation ahead of a zext or sext if the constant is
- // unchanged in the smaller source type. Performing the logic in a smaller
- // type may provide more information to later folds, and the smaller logic
- // instruction may be cheaper (particularly in the case of vectors).
- Value *X;
- if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
- Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
- Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
- if (ZextTruncC == C) {
- // LogicOpc (zext X), C --> zext (LogicOpc X, C)
- Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
- return new ZExtInst(NewOp, DestTy);
- }
- }
-
- if (match(Cast, m_OneUse(m_SExt(m_Value(X))))) {
- Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
- Constant *SextTruncC = ConstantExpr::getSExt(TruncC, DestTy);
- if (SextTruncC == C) {
- // LogicOpc (sext X), C --> sext (LogicOpc X, C)
- Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
- return new SExtInst(NewOp, DestTy);
- }
- }
-
- return nullptr;
-}
-
-/// Fold {and,or,xor} (cast X), Y.
+ Value *CastSrc = CI->getOperand(0);
+
+ // Noop casts and casts of constants should be eliminated trivially.
+ if (CI->getSrcTy() == CI->getDestTy() || isa<Constant>(CastSrc))
+ return false;
+
+ // If this cast is paired with another cast that can be eliminated, we prefer
+ // to have it eliminated.
+ if (const auto *PrecedingCI = dyn_cast<CastInst>(CastSrc))
+ if (isEliminableCastPair(PrecedingCI, CI))
+ return false;
+
+ return true;
+}
+
+/// Fold {and,or,xor} (cast X), C.
+static Instruction *foldLogicCastConstant(BinaryOperator &Logic, CastInst *Cast,
+ InstCombiner::BuilderTy &Builder) {
+ Constant *C = dyn_cast<Constant>(Logic.getOperand(1));
+ if (!C)
+ return nullptr;
+
+ auto LogicOpc = Logic.getOpcode();
+ Type *DestTy = Logic.getType();
+ Type *SrcTy = Cast->getSrcTy();
+
+ // Move the logic operation ahead of a zext or sext if the constant is
+ // unchanged in the smaller source type. Performing the logic in a smaller
+ // type may provide more information to later folds, and the smaller logic
+ // instruction may be cheaper (particularly in the case of vectors).
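+  // For example: and (zext i8 %x to i32), 15 --> zext (and i8 %x, 15) to i32,
+  // since the constant 15 is unchanged by the trunc/zext round-trip through i8.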
+ Value *X;
+ if (match(Cast, m_OneUse(m_ZExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *ZextTruncC = ConstantExpr::getZExt(TruncC, DestTy);
+ if (ZextTruncC == C) {
+ // LogicOpc (zext X), C --> zext (LogicOpc X, C)
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
+ return new ZExtInst(NewOp, DestTy);
+ }
+ }
+
+ if (match(Cast, m_OneUse(m_SExt(m_Value(X))))) {
+ Constant *TruncC = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *SextTruncC = ConstantExpr::getSExt(TruncC, DestTy);
+ if (SextTruncC == C) {
+ // LogicOpc (sext X), C --> sext (LogicOpc X, C)
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, X, TruncC);
+ return new SExtInst(NewOp, DestTy);
+ }
+ }
+
+ return nullptr;
+}
+
+/// Fold {and,or,xor} (cast X), Y.
Instruction *InstCombinerImpl::foldCastedBitwiseLogic(BinaryOperator &I) {
- auto LogicOpc = I.getOpcode();
- assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- CastInst *Cast0 = dyn_cast<CastInst>(Op0);
- if (!Cast0)
- return nullptr;
-
- // This must be a cast from an integer or integer vector source type to allow
- // transformation of the logic operation to the source type.
- Type *DestTy = I.getType();
- Type *SrcTy = Cast0->getSrcTy();
- if (!SrcTy->isIntOrIntVectorTy())
- return nullptr;
-
- if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
- return Ret;
-
- CastInst *Cast1 = dyn_cast<CastInst>(Op1);
- if (!Cast1)
- return nullptr;
-
- // Both operands of the logic operation are casts. The casts must be of the
- // same type for reduction.
- auto CastOpcode = Cast0->getOpcode();
- if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
- return nullptr;
-
- Value *Cast0Src = Cast0->getOperand(0);
- Value *Cast1Src = Cast1->getOperand(0);
-
- // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
- if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
- Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
- I.getName());
- return CastInst::Create(CastOpcode, NewOp, DestTy);
- }
-
- // For now, only 'and'/'or' have optimizations after this.
- if (LogicOpc == Instruction::Xor)
- return nullptr;
-
- // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
- ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
- if (ICmp0 && ICmp1) {
- Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
- : foldOrOfICmps(ICmp0, ICmp1, I);
- if (Res)
- return CastInst::Create(CastOpcode, Res, DestTy);
- return nullptr;
- }
-
- // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
- // cast is otherwise not optimizable. This happens for vector sexts.
- FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
- FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
- if (FCmp0 && FCmp1)
- if (Value *R = foldLogicOfFCmps(FCmp0, FCmp1, LogicOpc == Instruction::And))
- return CastInst::Create(CastOpcode, R, DestTy);
-
- return nullptr;
-}
-
-static Instruction *foldAndToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::And);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // Operand complexity canonicalization guarantees that the 'or' is Op0.
- // (A | B) & ~(A & B) --> A ^ B
- // (A | B) & ~(B & A) --> A ^ B
- if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
- m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A | ~B) & (~A | B) --> ~(A ^ B)
- // (A | ~B) & (B | ~A) --> ~(A ^ B)
- // (~B | A) & (~A | B) --> ~(A ^ B)
- // (~B | A) & (B | ~A) --> ~(A ^ B)
- if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
- m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- return nullptr;
-}
-
-static Instruction *foldOrToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::Or);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // Operand complexity canonicalization guarantees that the 'and' is Op0.
- // (A & B) | ~(A | B) --> ~(A ^ B)
- // (A & B) | ~(B | A) --> ~(A ^ B)
- if (Op0->hasOneUse() || Op1->hasOneUse())
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
+ auto LogicOpc = I.getOpcode();
+ assert(I.isBitwiseLogicOp() && "Unexpected opcode for bitwise logic folding");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ CastInst *Cast0 = dyn_cast<CastInst>(Op0);
+ if (!Cast0)
+ return nullptr;
+
+ // This must be a cast from an integer or integer vector source type to allow
+ // transformation of the logic operation to the source type.
+ Type *DestTy = I.getType();
+ Type *SrcTy = Cast0->getSrcTy();
+ if (!SrcTy->isIntOrIntVectorTy())
+ return nullptr;
+
+ if (Instruction *Ret = foldLogicCastConstant(I, Cast0, Builder))
+ return Ret;
+
+ CastInst *Cast1 = dyn_cast<CastInst>(Op1);
+ if (!Cast1)
+ return nullptr;
+
+ // Both operands of the logic operation are casts. The casts must be of the
+ // same type for reduction.
+ auto CastOpcode = Cast0->getOpcode();
+ if (CastOpcode != Cast1->getOpcode() || SrcTy != Cast1->getSrcTy())
+ return nullptr;
+
+ Value *Cast0Src = Cast0->getOperand(0);
+ Value *Cast1Src = Cast1->getOperand(0);
+
+ // fold logic(cast(A), cast(B)) -> cast(logic(A, B))
+ if (shouldOptimizeCast(Cast0) && shouldOptimizeCast(Cast1)) {
+ Value *NewOp = Builder.CreateBinOp(LogicOpc, Cast0Src, Cast1Src,
+ I.getName());
+ return CastInst::Create(CastOpcode, NewOp, DestTy);
+ }
+
+ // For now, only 'and'/'or' have optimizations after this.
+ if (LogicOpc == Instruction::Xor)
+ return nullptr;
+
+ // If this is logic(cast(icmp), cast(icmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ ICmpInst *ICmp0 = dyn_cast<ICmpInst>(Cast0Src);
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(Cast1Src);
+ if (ICmp0 && ICmp1) {
+ Value *Res = LogicOpc == Instruction::And ? foldAndOfICmps(ICmp0, ICmp1, I)
+ : foldOrOfICmps(ICmp0, ICmp1, I);
+ if (Res)
+ return CastInst::Create(CastOpcode, Res, DestTy);
+ return nullptr;
+ }
+
+ // If this is logic(cast(fcmp), cast(fcmp)), try to fold this even if the
+ // cast is otherwise not optimizable. This happens for vector sexts.
+ FCmpInst *FCmp0 = dyn_cast<FCmpInst>(Cast0Src);
+ FCmpInst *FCmp1 = dyn_cast<FCmpInst>(Cast1Src);
+ if (FCmp0 && FCmp1)
+ if (Value *R = foldLogicOfFCmps(FCmp0, FCmp1, LogicOpc == Instruction::And))
+ return CastInst::Create(CastOpcode, R, DestTy);
+
+ return nullptr;
+}
+
+static Instruction *foldAndToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::And);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'or' is Op0.
+ // (A | B) & ~(A & B) --> A ^ B
+ // (A | B) & ~(B & A) --> A ^ B
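+  // (Bitwise check: when A == B both sides are 0; when A != B both sides are 1.)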
+ if (match(&I, m_BinOp(m_Or(m_Value(A), m_Value(B)),
+ m_Not(m_c_And(m_Deferred(A), m_Deferred(B))))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) & (~A | B) --> ~(A ^ B)
+ // (A | ~B) & (B | ~A) --> ~(A ^ B)
+ // (~B | A) & (~A | B) --> ~(A ^ B)
+ // (~B | A) & (B | ~A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(&I, m_BinOp(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
+static Instruction *foldOrToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Or);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // Operand complexity canonicalization guarantees that the 'and' is Op0.
+ // (A & B) | ~(A | B) --> ~(A ^ B)
+ // (A & B) | ~(B | A) --> ~(A ^ B)
+ if (Op0->hasOneUse() || Op1->hasOneUse())
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
// Operand complexity canonicalization guarantees that the 'xor' is Op0.
// (A ^ B) | ~(A | B) --> ~(A & B)
// (A ^ B) | ~(B | A) --> ~(A & B)
@@ -1635,98 +1635,98 @@ static Instruction *foldOrToXor(BinaryOperator &I,
match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B)))))
return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
- // (A & ~B) | (~A & B) --> A ^ B
- // (A & ~B) | (B & ~A) --> A ^ B
- // (~B & A) | (~A & B) --> A ^ B
- // (~B & A) | (B & ~A) --> A ^ B
- if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
- return BinaryOperator::CreateXor(A, B);
-
- return nullptr;
-}
-
-/// Return true if a constant shift amount is always less than the specified
-/// bit-width. If not, the shift could create poison in the narrower type.
-static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
+ // (A & ~B) | (~A & B) --> A ^ B
+ // (A & ~B) | (B & ~A) --> A ^ B
+ // (~B & A) | (~A & B) --> A ^ B
+ // (~B & A) | (B & ~A) --> A ^ B
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))
+ return BinaryOperator::CreateXor(A, B);
+
+ return nullptr;
+}
+
+/// Return true if a constant shift amount is always less than the specified
+/// bit-width. If not, the shift could create poison in the narrower type.
+static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
APInt Threshold(C->getType()->getScalarSizeInBits(), BitWidth);
return match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold));
-}
-
-/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
-/// a common zext operand: and (binop (zext X), C), (zext X).
+}
+
+/// Try to use narrower ops (sink zext ops) for an 'and' with binop operand and
+/// a common zext operand: and (binop (zext X), C), (zext X).
Instruction *InstCombinerImpl::narrowMaskedBinOp(BinaryOperator &And) {
- // This transform could also apply to {or, and, xor}, but there are better
- // folds for those cases, so we don't expect those patterns here. AShr is not
- // handled because it should always be transformed to LShr in this sequence.
- // The subtract transform is different because it has a constant on the left.
- // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
- Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
- Constant *C;
- if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
- !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
- return nullptr;
-
- Value *X;
- if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
- return nullptr;
-
- Type *Ty = And.getType();
- if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
- return nullptr;
-
- // If we're narrowing a shift, the shift amount must be safe (less than the
- // width) in the narrower type. If the shift amount is greater, instsimplify
- // usually handles that case, but we can't guarantee/assert it.
- Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
- if (Opc == Instruction::LShr || Opc == Instruction::Shl)
- if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
- return nullptr;
-
- // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
- // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
- Value *NewC = ConstantExpr::getTrunc(C, X->getType());
- Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
- : Builder.CreateBinOp(Opc, X, NewC);
- return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
-}
-
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+ // This transform could also apply to {or, and, xor}, but there are better
+ // folds for those cases, so we don't expect those patterns here. AShr is not
+ // handled because it should always be transformed to LShr in this sequence.
+ // The subtract transform is different because it has a constant on the left.
+ // Add/mul commute the constant to RHS; sub with constant RHS becomes add.
+ Value *Op0 = And.getOperand(0), *Op1 = And.getOperand(1);
+ Constant *C;
+ if (!match(Op0, m_OneUse(m_Add(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Mul(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_LShr(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Shl(m_Specific(Op1), m_Constant(C)))) &&
+ !match(Op0, m_OneUse(m_Sub(m_Constant(C), m_Specific(Op1)))))
+ return nullptr;
+
+ Value *X;
+ if (!match(Op1, m_ZExt(m_Value(X))) || Op1->hasNUsesOrMore(3))
+ return nullptr;
+
+ Type *Ty = And.getType();
+ if (!isa<VectorType>(Ty) && !shouldChangeType(Ty, X->getType()))
+ return nullptr;
+
+ // If we're narrowing a shift, the shift amount must be safe (less than the
+ // width) in the narrower type. If the shift amount is greater, instsimplify
+ // usually handles that case, but we can't guarantee/assert it.
+ Instruction::BinaryOps Opc = cast<BinaryOperator>(Op0)->getOpcode();
+ if (Opc == Instruction::LShr || Opc == Instruction::Shl)
+ if (!canNarrowShiftAmt(C, X->getType()->getScalarSizeInBits()))
+ return nullptr;
+
+ // and (sub C, (zext X)), (zext X) --> zext (and (sub C', X), X)
+ // and (binop (zext X), C), (zext X) --> zext (and (binop X, C'), X)
+ Value *NewC = ConstantExpr::getTrunc(C, X->getType());
+ Value *NewBO = Opc == Instruction::Sub ? Builder.CreateBinOp(Opc, NewC, X)
+ : Builder.CreateBinOp(Opc, X, NewC);
+ return new ZExtInst(Builder.CreateAnd(NewBO, X), Ty);
+}
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
Type *Ty = I.getType();
- if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Do this before using distributive laws to catch simple and/or/not patterns.
- if (Instruction *Xor = foldAndToXor(I, Builder))
- return Xor;
-
- // (A|B)&(A|C) -> A|(B&C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyAndInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldAndToXor(I, Builder))
+ return Xor;
+
+ // (A|B)&(A|C) -> A|(B&C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Value *X, *Y;
if (match(Op0, m_OneUse(m_LogicalShift(m_One(), m_Value(X)))) &&
@@ -1737,61 +1737,61 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return new ZExtInst(IsZero, Ty);
}
- const APInt *C;
- if (match(Op1, m_APInt(C))) {
- const APInt *XorC;
- if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_APInt(XorC))))) {
- // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ const APInt *XorC;
+ if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_APInt(XorC))))) {
+ // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
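+      // e.g. with C1 = 0b1100 and C2 = 0b1010:
+      //   (X ^ 0b1100) & 0b1010 == (X & 0b1010) ^ 0b1000,
+      // because '&' distributes over '^'.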
Constant *NewC = ConstantInt::get(Ty, *C & *XorC);
- Value *And = Builder.CreateAnd(X, Op1);
- And->takeName(Op0);
- return BinaryOperator::CreateXor(And, NewC);
- }
-
- const APInt *OrC;
- if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) {
- // (X | C1) & C2 --> (X & C2^(C1&C2)) | (C1&C2)
- // NOTE: This reduces the number of bits set in the & mask, which
- // can expose opportunities for store narrowing for scalars.
- // NOTE: SimplifyDemandedBits should have already removed bits from C1
- // that aren't set in C2. Meaning we can replace (C1&C2) with C1 in
- // above, but this feels safer.
- APInt Together = *C & *OrC;
+ Value *And = Builder.CreateAnd(X, Op1);
+ And->takeName(Op0);
+ return BinaryOperator::CreateXor(And, NewC);
+ }
+
+ const APInt *OrC;
+ if (match(Op0, m_OneUse(m_Or(m_Value(X), m_APInt(OrC))))) {
+ // (X | C1) & C2 --> (X & C2^(C1&C2)) | (C1&C2)
+ // NOTE: This reduces the number of bits set in the & mask, which
+ // can expose opportunities for store narrowing for scalars.
+ // NOTE: SimplifyDemandedBits should have already removed bits from C1
+ // that aren't set in C2. Meaning we can replace (C1&C2) with C1 in
+ // above, but this feels safer.
+ APInt Together = *C & *OrC;
Value *And = Builder.CreateAnd(X, ConstantInt::get(Ty, Together ^ *C));
- And->takeName(Op0);
+ And->takeName(Op0);
return BinaryOperator::CreateOr(And, ConstantInt::get(Ty, Together));
- }
-
- // If the mask is only needed on one incoming arm, push the 'and' op up.
- if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) ||
- match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- APInt NotAndMask(~(*C));
- BinaryOperator::BinaryOps BinOp = cast<BinaryOperator>(Op0)->getOpcode();
- if (MaskedValueIsZero(X, NotAndMask, 0, &I)) {
- // Not masking anything out for the LHS, move mask to RHS.
- // and ({x}or X, Y), C --> {x}or X, (and Y, C)
- Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked");
- return BinaryOperator::Create(BinOp, X, NewRHS);
- }
- if (!isa<Constant>(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) {
- // Not masking anything out for the RHS, move mask to LHS.
- // and ({x}or X, Y), C --> {x}or (and X, C), Y
- Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked");
- return BinaryOperator::Create(BinOp, NewLHS, Y);
- }
- }
+ }
+
+ // If the mask is only needed on one incoming arm, push the 'and' op up.
+ if (match(Op0, m_OneUse(m_Xor(m_Value(X), m_Value(Y)))) ||
+ match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ APInt NotAndMask(~(*C));
+ BinaryOperator::BinaryOps BinOp = cast<BinaryOperator>(Op0)->getOpcode();
+ if (MaskedValueIsZero(X, NotAndMask, 0, &I)) {
+ // Not masking anything out for the LHS, move mask to RHS.
+ // and ({x}or X, Y), C --> {x}or X, (and Y, C)
+ Value *NewRHS = Builder.CreateAnd(Y, Op1, Y->getName() + ".masked");
+ return BinaryOperator::Create(BinOp, X, NewRHS);
+ }
+ if (!isa<Constant>(Y) && MaskedValueIsZero(Y, NotAndMask, 0, &I)) {
+ // Not masking anything out for the RHS, move mask to LHS.
+ // and ({x}or X, Y), C --> {x}or (and X, C), Y
+ Value *NewLHS = Builder.CreateAnd(X, Op1, X->getName() + ".masked");
+ return BinaryOperator::Create(BinOp, NewLHS, Y);
+ }
+ }
unsigned Width = Ty->getScalarSizeInBits();
- const APInt *ShiftC;
- if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) {
- if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) {
- // We are clearing high bits that were potentially set by sext+ashr:
- // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC
+ const APInt *ShiftC;
+ if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) {
+ if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) {
+ // We are clearing high bits that were potentially set by sext+ashr:
+ // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC
Value *Sext = Builder.CreateSExt(X, Ty);
Constant *ShAmtC = ConstantInt::get(Ty, ShiftC->zext(Width));
- return BinaryOperator::CreateLShr(Sext, ShAmtC);
- }
- }
+ return BinaryOperator::CreateLShr(Sext, ShAmtC);
+ }
+ }
const APInt *AddC;
if (match(Op0, m_Add(m_Value(X), m_APInt(AddC)))) {
@@ -1812,48 +1812,48 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
return BinaryOperator::CreateXor(NewAnd, Op1);
}
}
- }
-
+ }
+
ConstantInt *AndRHS;
if (match(Op1, m_ConstantInt(AndRHS))) {
- const APInt &AndRHSMask = AndRHS->getValue();
-
- // Optimize a variety of ((val OP C1) & C2) combinations...
- if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
- // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
- // of X and OP behaves well when given trunc(C1) and X.
+ const APInt &AndRHSMask = AndRHS->getValue();
+
+ // Optimize a variety of ((val OP C1) & C2) combinations...
+ if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+ // ((C1 OP zext(X)) & C2) -> zext((C1-X) & C2) if C2 fits in the bitwidth
+ // of X and OP behaves well when given trunc(C1) and X.
// TODO: Do this for vectors by using m_APInt instead of m_ConstantInt.
- switch (Op0I->getOpcode()) {
- default:
- break;
- case Instruction::Xor:
- case Instruction::Or:
- case Instruction::Mul:
- case Instruction::Add:
- case Instruction::Sub:
- Value *X;
- ConstantInt *C1;
- // TODO: The one use restrictions could be relaxed a little if the AND
- // is going to be removed.
- if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
- m_ConstantInt(C1))))) {
- if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
- auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
- Value *BinOp;
- Value *Op0LHS = Op0I->getOperand(0);
- if (isa<ZExtInst>(Op0LHS))
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
- else
- BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
- auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
- auto *And = Builder.CreateAnd(BinOp, TruncC2);
+ switch (Op0I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Xor:
+ case Instruction::Or:
+ case Instruction::Mul:
+ case Instruction::Add:
+ case Instruction::Sub:
+ Value *X;
+ ConstantInt *C1;
+ // TODO: The one use restrictions could be relaxed a little if the AND
+ // is going to be removed.
+ if (match(Op0I, m_OneUse(m_c_BinOp(m_OneUse(m_ZExt(m_Value(X))),
+ m_ConstantInt(C1))))) {
+ if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
+ auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
+ Value *BinOp;
+ Value *Op0LHS = Op0I->getOperand(0);
+ if (isa<ZExtInst>(Op0LHS))
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), X, TruncC1);
+ else
+ BinOp = Builder.CreateBinOp(Op0I->getOpcode(), TruncC1, X);
+ auto *TruncC2 = ConstantExpr::getTrunc(AndRHS, X->getType());
+ auto *And = Builder.CreateAnd(BinOp, TruncC2);
return new ZExtInst(And, Ty);
- }
- }
- }
- }
+ }
+ }
+ }
+ }
}
-
+
if (match(&I, m_And(m_OneUse(m_Shl(m_ZExt(m_Value(X)), m_Value(Y))),
m_SignMask())) &&
match(Y, m_SpecificInt_ICMP(
@@ -1871,26 +1871,26 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
SanitizedSignMask =
Constant::mergeUndefsWith(SanitizedSignMask, cast<Constant>(Y));
return BinaryOperator::CreateAnd(SExt, SanitizedSignMask);
- }
-
- if (Instruction *Z = narrowMaskedBinOp(I))
- return Z;
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
- return DeMorgan;
-
- {
- Value *A, *B, *C;
- // A & (A ^ B) --> A & ~B
- if (match(Op1, m_OneUse(m_c_Xor(m_Specific(Op0), m_Value(B)))))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(B));
- // (A ^ B) & A --> A & ~B
- if (match(Op0, m_OneUse(m_c_Xor(m_Specific(Op1), m_Value(B)))))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(B));
-
+ }
+
+ if (Instruction *Z = narrowMaskedBinOp(I))
+ return Z;
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ {
+ Value *A, *B, *C;
+ // A & (A ^ B) --> A & ~B
+ if (match(Op1, m_OneUse(m_c_Xor(m_Specific(Op0), m_Value(B)))))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(B));
+ // (A ^ B) & A --> A & ~B
+ if (match(Op0, m_OneUse(m_c_Xor(m_Specific(Op1), m_Value(B)))))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(B));
+
// A & ~(A ^ B) --> A & B
if (match(Op1, m_Not(m_c_Xor(m_Specific(Op0), m_Value(B)))))
return BinaryOperator::CreateAnd(Op0, B);
@@ -1898,166 +1898,166 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) {
if (match(Op0, m_Not(m_c_Xor(m_Specific(Op1), m_Value(B)))))
return BinaryOperator::CreateAnd(Op1, B);
- // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
- if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
-
- // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
- if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
- if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
-
- // (A | B) & ((~A) ^ B) -> (A & B)
- // (A | B) & (B ^ (~A)) -> (A & B)
- // (B | A) & ((~A) ^ B) -> (A & B)
- // (B | A) & (B ^ (~A)) -> (A & B)
- if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
-
- // ((~A) ^ B) & (A | B) -> (A & B)
- // ((~A) ^ B) & (B | A) -> (A & B)
- // (B ^ (~A)) & (A | B) -> (A & B)
- // (B ^ (~A)) & (B | A) -> (A & B)
- if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
- match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateAnd(A, B);
- }
-
- {
- ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
- ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
- if (LHS && RHS)
- if (Value *Res = foldAndOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, Res);
-
- // TODO: Make this recursive; it's a little tricky because an arbitrary
- // number of 'and' instructions might have to be created.
- if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
- }
- if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
- }
- }
-
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = foldLogicOfFCmps(LHS, RHS, true))
- return replaceInstUsesWith(I, Res);
-
- if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
- return FoldedFCmps;
-
- if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
- return CastedAnd;
-
- // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>.
- Value *A;
- if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
+ // (A ^ B) & ((B ^ C) ^ A) -> (A ^ B) & ~C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ if (Op1->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(C));
+
+ // ((A ^ C) ^ B) & (B ^ A) -> (B ^ A) & ~C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ if (Op0->hasOneUse() || isFreeToInvert(C, C->hasOneUse()))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(C));
+
+ // (A | B) & ((~A) ^ B) -> (A & B)
+ // (A | B) & (B ^ (~A)) -> (A & B)
+ // (B | A) & ((~A) ^ B) -> (A & B)
+ // (B | A) & (B ^ (~A)) -> (A & B)
+ if (match(Op1, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+
+ // ((~A) ^ B) & (A | B) -> (A & B)
+ // ((~A) ^ B) & (B | A) -> (A & B)
+ // (B ^ (~A)) & (A | B) -> (A & B)
+ // (B ^ (~A)) & (B | A) -> (A & B)
+ if (match(Op0, m_c_Xor(m_Not(m_Value(A)), m_Value(B))) &&
+ match(Op1, m_c_Or(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateAnd(A, B);
+ }
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldAndOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'and' instructions might have to be created.
+ if (LHS && match(Op1, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_And(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldAndOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateAnd(Res, X));
+ }
+ }
+
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldLogicOfFCmps(LHS, RHS, true))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
+ return FoldedFCmps;
+
+ if (Instruction *CastedAnd = foldCastedBitwiseLogic(I))
+ return CastedAnd;
+
+ // and(sext(A), B) / and(B, sext(A)) --> A ? B : 0, where A is i1 or <N x i1>.
+ Value *A;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op1, Constant::getNullValue(Ty));
- if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
return SelectInst::Create(A, Op0, Constant::getNullValue(Ty));
-
- // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0.
+
+ // and(ashr(subNSW(Y, X), ScalarSizeInBits(Y)-1), X) --> X s> Y ? X : 0.
if (match(&I, m_c_And(m_OneUse(m_AShr(
m_NSWSub(m_Value(Y), m_Value(X)),
m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
m_Deferred(X)))) {
Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
return SelectInst::Create(NewICmpInst, X, ConstantInt::getNullValue(Ty));
- }
-
+ }
+
// (~x) & y --> ~(x | (~y)) iff that gets rid of inversions
if (sinkNotIntoOtherHandOfAndOrOr(I))
return &I;
- return nullptr;
-}
-
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::matchBSwapOrBitReverse(BinaryOperator &Or,
bool MatchBSwaps,
bool MatchBitReversals) {
- assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
- Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
-
- // Look through zero extends.
- if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
- Op0 = Ext->getOperand(0);
-
- if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
- Op1 = Ext->getOperand(0);
-
- // (A | B) | C and A | (B | C) -> bswap if possible.
+ assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+ Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
+
+ // Look through zero extends.
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op0))
+ Op0 = Ext->getOperand(0);
+
+ if (Instruction *Ext = dyn_cast<ZExtInst>(Op1))
+ Op1 = Ext->getOperand(0);
+
+ // (A | B) | C and A | (B | C) -> bswap if possible.
bool OrWithOrs = match(Op0, m_Or(m_Value(), m_Value())) ||
match(Op1, m_Or(m_Value(), m_Value()));
-
+
// (A >> B) | C and (A << B) | C -> bswap if possible.
bool OrWithShifts = match(Op0, m_LogicalShift(m_Value(), m_Value())) ||
match(Op1, m_LogicalShift(m_Value(), m_Value()));
-
+
// (A & B) | C and A | (B & C) -> bswap if possible.
bool OrWithAnds = match(Op0, m_And(m_Value(), m_Value())) ||
match(Op1, m_And(m_Value(), m_Value()));
-
+
// fshl(A,B,C) | D and A | fshl(B,C,D) -> bswap if possible.
// fshr(A,B,C) | D and A | fshr(B,C,D) -> bswap if possible.
bool OrWithFunnels = match(Op0, m_FShl(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShr(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShl(m_Value(), m_Value(), m_Value())) ||
match(Op0, m_FShr(m_Value(), m_Value(), m_Value()));
-
+
// TODO: Do we need all these filtering checks or should we just rely on
// recognizeBSwapOrBitReverseIdiom + collectBitParts to reject them quickly?
if (!OrWithOrs && !OrWithShifts && !OrWithAnds && !OrWithFunnels)
- return nullptr;
-
+ return nullptr;
+
SmallVector<Instruction *, 4> Insts;
if (!recognizeBSwapOrBitReverseIdiom(&Or, MatchBSwaps, MatchBitReversals,
Insts))
- return nullptr;
- Instruction *LastInst = Insts.pop_back_val();
- LastInst->removeFromParent();
-
- for (auto *Inst : Insts)
- Worklist.push(Inst);
- return LastInst;
-}
-
+ return nullptr;
+ Instruction *LastInst = Insts.pop_back_val();
+ LastInst->removeFromParent();
+
+ for (auto *Inst : Insts)
+ Worklist.push(Inst);
+ return LastInst;
+}
+
/// Match UB-safe variants of the funnel shift intrinsic.
static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
- // TODO: Can we reduce the code duplication between this and the related
- // rotate matching code under visitSelect and visitTrunc?
- unsigned Width = Or.getType()->getScalarSizeInBits();
-
+ // TODO: Can we reduce the code duplication between this and the related
+ // rotate matching code under visitSelect and visitTrunc?
+ unsigned Width = Or.getType()->getScalarSizeInBits();
+
// First, find an or'd pair of opposite shifts:
// or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1)
- BinaryOperator *Or0, *Or1;
- if (!match(Or.getOperand(0), m_BinOp(Or0)) ||
- !match(Or.getOperand(1), m_BinOp(Or1)))
- return nullptr;
-
+ BinaryOperator *Or0, *Or1;
+ if (!match(Or.getOperand(0), m_BinOp(Or0)) ||
+ !match(Or.getOperand(1), m_BinOp(Or1)))
+ return nullptr;
+
Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -2067,7 +2067,7 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
+
// Match the shift amount operands for a funnel shift pattern. This always
// matches a subtraction on the R operand.
auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
@@ -2105,327 +2105,327 @@ static Instruction *matchFunnelShift(Instruction &Or, InstCombinerImpl &IC) {
if (!isPowerOf2_32(Width))
return nullptr;
- // The shift amount may be masked with negation:
- // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
- Value *X;
- unsigned Mask = Width - 1;
- if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
- match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
- return X;
-
- // Similar to above, but the shift amount may be extended after masking,
- // so return the extended value as the parameter for the intrinsic.
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
- m_SpecificInt(Mask))))
- return L;
-
+ // The shift amount may be masked with negation:
+ // (shl ShVal, (X & (Width - 1))) | (lshr ShVal, ((-X) & (Width - 1)))
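+    // e.g. with Width == 32: X == 8 gives shift amounts 8 and (-8 & 31) == 24,
+    // which together cover all 32 bits (a rotate by 8); X == 0 degenerates to
+    // ShVal | ShVal == ShVal, i.e. a rotate by 0.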
+ Value *X;
+ unsigned Mask = Width - 1;
+ if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+ match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+ return X;
+
+ // Similar to above, but the shift amount may be extended after masking,
+ // so return the extended value as the parameter for the intrinsic.
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R, m_And(m_Neg(m_ZExt(m_And(m_Specific(X), m_SpecificInt(Mask)))),
+ m_SpecificInt(Mask))))
+ return L;
+
if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
return L;
- return nullptr;
- };
-
- Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
+ return nullptr;
+ };
+
+ Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, Width);
bool IsFshl = true; // Sub on LSHR.
- if (!ShAmt) {
- ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
+ if (!ShAmt) {
+ ShAmt = matchShiftAmount(ShAmt1, ShAmt0, Width);
IsFshl = false; // Sub on SHL.
- }
- if (!ShAmt)
- return nullptr;
-
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType());
+ }
+ if (!ShAmt)
+ return nullptr;
+
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Or.getModule(), IID, Or.getType());
return IntrinsicInst::Create(F, {ShVal0, ShVal1, ShAmt});
-}
-
-/// Attempt to combine or(zext(x),shl(zext(y),bw/2)) concat packing patterns.
-static Instruction *matchOrConcat(Instruction &Or,
- InstCombiner::BuilderTy &Builder) {
- assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
- Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
- Type *Ty = Or.getType();
-
- unsigned Width = Ty->getScalarSizeInBits();
- if ((Width & 1) != 0)
- return nullptr;
- unsigned HalfWidth = Width / 2;
-
- // Canonicalize zext (lower half) to LHS.
- if (!isa<ZExtInst>(Op0))
- std::swap(Op0, Op1);
-
- // Find lower/upper half.
- Value *LowerSrc, *ShlVal, *UpperSrc;
- const APInt *C;
- if (!match(Op0, m_OneUse(m_ZExt(m_Value(LowerSrc)))) ||
- !match(Op1, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(C)))) ||
- !match(ShlVal, m_OneUse(m_ZExt(m_Value(UpperSrc)))))
- return nullptr;
- if (*C != HalfWidth || LowerSrc->getType() != UpperSrc->getType() ||
- LowerSrc->getType()->getScalarSizeInBits() != HalfWidth)
- return nullptr;
-
- auto ConcatIntrinsicCalls = [&](Intrinsic::ID id, Value *Lo, Value *Hi) {
- Value *NewLower = Builder.CreateZExt(Lo, Ty);
- Value *NewUpper = Builder.CreateZExt(Hi, Ty);
- NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
- Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
- Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty);
- return Builder.CreateCall(F, BinOp);
- };
-
- // BSWAP: Push the concat down, swapping the lower/upper sources.
- // concat(bswap(x),bswap(y)) -> bswap(concat(x,y))
- Value *LowerBSwap, *UpperBSwap;
- if (match(LowerSrc, m_BSwap(m_Value(LowerBSwap))) &&
- match(UpperSrc, m_BSwap(m_Value(UpperBSwap))))
- return ConcatIntrinsicCalls(Intrinsic::bswap, UpperBSwap, LowerBSwap);
-
- // BITREVERSE: Push the concat down, swapping the lower/upper sources.
- // concat(bitreverse(x),bitreverse(y)) -> bitreverse(concat(x,y))
- Value *LowerBRev, *UpperBRev;
- if (match(LowerSrc, m_BitReverse(m_Value(LowerBRev))) &&
- match(UpperSrc, m_BitReverse(m_Value(UpperBRev))))
- return ConcatIntrinsicCalls(Intrinsic::bitreverse, UpperBRev, LowerBRev);
-
- return nullptr;
-}
-
-/// If all elements of two constant vectors are 0/-1 and inverses, return true.
-static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
+}
+
+/// Attempt to combine or(zext(x),shl(zext(y),bw/2)) concat packing patterns.
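+/// For example, an i32 built from two i16 halves has the form
+///   or (zext(lo)), (shl (zext(hi)), 16)
+/// i.e. the concatenation of hi and lo; the folds below push bswap/bitreverse
+/// through such a concatenation.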
+static Instruction *matchOrConcat(Instruction &Or,
+ InstCombiner::BuilderTy &Builder) {
+ assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+ Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
+ Type *Ty = Or.getType();
+
+ unsigned Width = Ty->getScalarSizeInBits();
+ if ((Width & 1) != 0)
+ return nullptr;
+ unsigned HalfWidth = Width / 2;
+
+ // Canonicalize zext (lower half) to LHS.
+ if (!isa<ZExtInst>(Op0))
+ std::swap(Op0, Op1);
+
+ // Find lower/upper half.
+ Value *LowerSrc, *ShlVal, *UpperSrc;
+ const APInt *C;
+ if (!match(Op0, m_OneUse(m_ZExt(m_Value(LowerSrc)))) ||
+ !match(Op1, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(C)))) ||
+ !match(ShlVal, m_OneUse(m_ZExt(m_Value(UpperSrc)))))
+ return nullptr;
+ if (*C != HalfWidth || LowerSrc->getType() != UpperSrc->getType() ||
+ LowerSrc->getType()->getScalarSizeInBits() != HalfWidth)
+ return nullptr;
+
+ auto ConcatIntrinsicCalls = [&](Intrinsic::ID id, Value *Lo, Value *Hi) {
+ Value *NewLower = Builder.CreateZExt(Lo, Ty);
+ Value *NewUpper = Builder.CreateZExt(Hi, Ty);
+ NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
+ Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
+ Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty);
+ return Builder.CreateCall(F, BinOp);
+ };
+
+ // BSWAP: Push the concat down, swapping the lower/upper sources.
+ // concat(bswap(x),bswap(y)) -> bswap(concat(x,y))
+ Value *LowerBSwap, *UpperBSwap;
+ if (match(LowerSrc, m_BSwap(m_Value(LowerBSwap))) &&
+ match(UpperSrc, m_BSwap(m_Value(UpperBSwap))))
+ return ConcatIntrinsicCalls(Intrinsic::bswap, UpperBSwap, LowerBSwap);
+
+ // BITREVERSE: Push the concat down, swapping the lower/upper sources.
+ // concat(bitreverse(x),bitreverse(y)) -> bitreverse(concat(x,y))
+ Value *LowerBRev, *UpperBRev;
+ if (match(LowerSrc, m_BitReverse(m_Value(LowerBRev))) &&
+ match(UpperSrc, m_BitReverse(m_Value(UpperBRev))))
+ return ConcatIntrinsicCalls(Intrinsic::bitreverse, UpperBRev, LowerBRev);
+
+ return nullptr;
+}
+
+/// If all elements of two constant vectors are 0/-1 and inverses, return true.
+static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
unsigned NumElts = cast<FixedVectorType>(C1->getType())->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *EltC1 = C1->getAggregateElement(i);
- Constant *EltC2 = C2->getAggregateElement(i);
- if (!EltC1 || !EltC2)
- return false;
-
- // One element must be all ones, and the other must be all zeros.
- if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
- (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
- return false;
- }
- return true;
-}
-
-/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
-/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
-/// B, it can be used as the condition operand of a select instruction.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *EltC1 = C1->getAggregateElement(i);
+ Constant *EltC2 = C2->getAggregateElement(i);
+ if (!EltC1 || !EltC2)
+ return false;
+
+ // One element must be all ones, and the other must be all zeros.
+ if (!((match(EltC1, m_Zero()) && match(EltC2, m_AllOnes())) ||
+ (match(EltC2, m_Zero()) && match(EltC1, m_AllOnes()))))
+ return false;
+ }
+ return true;
+}
+
+/// We have an expression of the form (A & C) | (B & D). If A is a scalar or
+/// vector composed of all-zeros or all-ones values and is the bitwise 'not' of
+/// B, it can be used as the condition operand of a select instruction.
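+/// For example, if A == sext(i1 %c) (so A is all-zeros or all-ones) and
+/// B == ~A, then (A & C) | (B & D) yields C when %c is true and D when %c is
+/// false, which is exactly 'select %c, C, D'.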
Value *InstCombinerImpl::getSelectCondition(Value *A, Value *B) {
- // Step 1: We may have peeked through bitcasts in the caller.
- // Exit immediately if we don't have (vector) integer types.
- Type *Ty = A->getType();
- if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- // Step 2: We need 0 or all-1's bitmasks.
- if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
- return nullptr;
-
- // Step 3: If B is the 'not' value of A, we have our answer.
- if (match(A, m_Not(m_Specific(B)))) {
- // If these are scalars or vectors of i1, A can be used directly.
- if (Ty->isIntOrIntVectorTy(1))
- return A;
- return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
- }
-
- // If both operands are constants, see if the constants are inverse bitmasks.
- Constant *AConst, *BConst;
- if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
- if (AConst == ConstantExpr::getNot(BConst))
- return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
-
- // Look for more complex patterns. The 'not' op may be hidden behind various
- // casts. Look through sexts and bitcasts to find the booleans.
- Value *Cond;
- Value *NotB;
- if (match(A, m_SExt(m_Value(Cond))) &&
- Cond->getType()->isIntOrIntVectorTy(1) &&
- match(B, m_OneUse(m_Not(m_Value(NotB))))) {
- NotB = peekThroughBitcast(NotB, true);
- if (match(NotB, m_SExt(m_Specific(Cond))))
- return Cond;
- }
-
- // All scalar (and most vector) possibilities should be handled now.
- // Try more matches that only apply to non-splat constant vectors.
- if (!Ty->isVectorTy())
- return nullptr;
-
- // If both operands are xor'd with constants using the same sexted boolean
- // operand, see if the constants are inverse bitmasks.
- // TODO: Use ConstantExpr::getNot()?
- if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
- match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
- Cond->getType()->isIntOrIntVectorTy(1) &&
- areInverseVectorBitmasks(AConst, BConst)) {
- AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
- return Builder.CreateXor(Cond, AConst);
- }
- return nullptr;
-}
-
-/// We have an expression of the form (A & C) | (B & D). Try to simplify this
-/// to "A' ? C : D", where A' is a boolean or vector of booleans.
+ // Step 1: We may have peeked through bitcasts in the caller.
+ // Exit immediately if we don't have (vector) integer types.
+ Type *Ty = A->getType();
+ if (!Ty->isIntOrIntVectorTy() || !B->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Step 2: We need 0 or all-1's bitmasks.
+ if (ComputeNumSignBits(A) != Ty->getScalarSizeInBits())
+ return nullptr;
+
+ // Step 3: If B is the 'not' value of A, we have our answer.
+ if (match(A, m_Not(m_Specific(B)))) {
+ // If these are scalars or vectors of i1, A can be used directly.
+ if (Ty->isIntOrIntVectorTy(1))
+ return A;
+ return Builder.CreateTrunc(A, CmpInst::makeCmpResultType(Ty));
+ }
+
+ // If both operands are constants, see if the constants are inverse bitmasks.
+ Constant *AConst, *BConst;
+ if (match(A, m_Constant(AConst)) && match(B, m_Constant(BConst)))
+ if (AConst == ConstantExpr::getNot(BConst))
+ return Builder.CreateZExtOrTrunc(A, CmpInst::makeCmpResultType(Ty));
+
+ // Look for more complex patterns. The 'not' op may be hidden behind various
+ // casts. Look through sexts and bitcasts to find the booleans.
+ Value *Cond;
+ Value *NotB;
+ if (match(A, m_SExt(m_Value(Cond))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ match(B, m_OneUse(m_Not(m_Value(NotB))))) {
+ NotB = peekThroughBitcast(NotB, true);
+ if (match(NotB, m_SExt(m_Specific(Cond))))
+ return Cond;
+ }
+
+ // All scalar (and most vector) possibilities should be handled now.
+ // Try more matches that only apply to non-splat constant vectors.
+ if (!Ty->isVectorTy())
+ return nullptr;
+
+ // If both operands are xor'd with constants using the same sexted boolean
+ // operand, see if the constants are inverse bitmasks.
+ // TODO: Use ConstantExpr::getNot()?
+ if (match(A, (m_Xor(m_SExt(m_Value(Cond)), m_Constant(AConst)))) &&
+ match(B, (m_Xor(m_SExt(m_Specific(Cond)), m_Constant(BConst)))) &&
+ Cond->getType()->isIntOrIntVectorTy(1) &&
+ areInverseVectorBitmasks(AConst, BConst)) {
+ AConst = ConstantExpr::getTrunc(AConst, CmpInst::makeCmpResultType(Ty));
+ return Builder.CreateXor(Cond, AConst);
+ }
+ return nullptr;
+}
+
+/// We have an expression of the form (A & C) | (B & D). Try to simplify this
+/// to "A' ? C : D", where A' is a boolean or vector of booleans.
Value *InstCombinerImpl::matchSelectFromAndOr(Value *A, Value *C, Value *B,
Value *D) {
- // The potential condition of the select may be bitcasted. In that case, look
- // through its bitcast and the corresponding bitcast of the 'not' condition.
- Type *OrigType = A->getType();
- A = peekThroughBitcast(A, true);
- B = peekThroughBitcast(B, true);
- if (Value *Cond = getSelectCondition(A, B)) {
- // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
- // The bitcasts will either all exist or all not exist. The builder will
- // not create unnecessary casts if the types already match.
- Value *BitcastC = Builder.CreateBitCast(C, A->getType());
- Value *BitcastD = Builder.CreateBitCast(D, A->getType());
- Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
- return Builder.CreateBitCast(Select, OrigType);
- }
-
- return nullptr;
-}
-
-/// Fold (icmp)|(icmp) if possible.
+ // The potential condition of the select may be bitcasted. In that case, look
+ // through its bitcast and the corresponding bitcast of the 'not' condition.
+ Type *OrigType = A->getType();
+ A = peekThroughBitcast(A, true);
+ B = peekThroughBitcast(B, true);
+ if (Value *Cond = getSelectCondition(A, B)) {
+ // ((bc Cond) & C) | ((bc ~Cond) & D) --> bc (select Cond, (bc C), (bc D))
+ // The bitcasts will either all exist or all not exist. The builder will
+ // not create unnecessary casts if the types already match.
+ Value *BitcastC = Builder.CreateBitCast(C, A->getType());
+ Value *BitcastD = Builder.CreateBitCast(D, A->getType());
+ Value *Select = Builder.CreateSelect(Cond, BitcastC, BitcastD);
+ return Builder.CreateBitCast(Select, OrigType);
+ }
+
+ return nullptr;
+}
+
+/// Fold (icmp)|(icmp) if possible.
Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &Or) {
- const SimplifyQuery Q = SQ.getWithInstruction(&Or);
-
- // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
- // if K1 and K2 are one-bit masks.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, Or))
- return V;
-
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ const SimplifyQuery Q = SQ.getWithInstruction(&Or);
+
+ // Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
+ // if K1 and K2 are one-bit masks.
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, Or))
+ return V;
+
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
Value *LHS1 = LHS->getOperand(1), *RHS1 = RHS->getOperand(1);
auto *LHSC = dyn_cast<ConstantInt>(LHS1);
auto *RHSC = dyn_cast<ConstantInt>(RHS1);
-
- // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
- // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
- // The original condition actually refers to the following two ranges:
- // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
- // We can fold these two ranges if:
- // 1) C1 and C2 are unsigned greater than C3.
- // 2) The two ranges are separated.
- // 3) C1 ^ C2 is a one-bit mask.
- // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
- // This implies all values in the two ranges differ by exactly one bit.
- if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
- PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
- LHSC->getType() == RHSC->getType() &&
- LHSC->getValue() == (RHSC->getValue())) {
-
+
+ // Fold (icmp ult/ule (A + C1), C3) | (icmp ult/ule (A + C2), C3)
+ // --> (icmp ult/ule ((A & ~(C1 ^ C2)) + max(C1, C2)), C3)
+ // The original condition actually refers to the following two ranges:
+ // [MAX_UINT-C1+1, MAX_UINT-C1+1+C3] and [MAX_UINT-C2+1, MAX_UINT-C2+1+C3]
+ // We can fold these two ranges if:
+ // 1) C1 and C2 are unsigned greater than C3.
+ // 2) The two ranges are separated.
+ // 3) C1 ^ C2 is a one-bit mask.
+ // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit masks.
+ // This implies all values in the two ranges differ by exactly one bit.
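+ // Illustrative instance (hypothetical i8 operands):
+ //   (icmp ult (A + 4), 2) | (icmp ult (A + 12), 2)
+ // has C1 = 4, C2 = 12, C3 = 2; C1 ^ C2 = 8 is a one-bit mask, so this becomes
+ //   icmp ult ((A & ~8) + 12), 2
+ // which, like the original, is true exactly for A in {244, 245, 252, 253}.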
+ if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+ PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+ LHSC->getType() == RHSC->getType() &&
+ LHSC->getValue() == (RHSC->getValue())) {
+
Value *AddOpnd;
- ConstantInt *LAddC, *RAddC;
+ ConstantInt *LAddC, *RAddC;
if (match(LHS0, m_Add(m_Value(AddOpnd), m_ConstantInt(LAddC))) &&
match(RHS0, m_Add(m_Specific(AddOpnd), m_ConstantInt(RAddC))) &&
- LAddC->getValue().ugt(LHSC->getValue()) &&
- RAddC->getValue().ugt(LHSC->getValue())) {
-
- APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+ LAddC->getValue().ugt(LHSC->getValue()) &&
+ RAddC->getValue().ugt(LHSC->getValue())) {
+
+ APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
if (DiffC.isPowerOf2()) {
- ConstantInt *MaxAddC = nullptr;
- if (LAddC->getValue().ult(RAddC->getValue()))
- MaxAddC = RAddC;
- else
- MaxAddC = LAddC;
-
- APInt RRangeLow = -RAddC->getValue();
- APInt RRangeHigh = RRangeLow + LHSC->getValue();
- APInt LRangeLow = -LAddC->getValue();
- APInt LRangeHigh = LRangeLow + LHSC->getValue();
- APInt LowRangeDiff = RRangeLow ^ LRangeLow;
- APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
- APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
- : RRangeLow - LRangeLow;
-
- if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
- RangeDiff.ugt(LHSC->getValue())) {
- Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
-
+ ConstantInt *MaxAddC = nullptr;
+ if (LAddC->getValue().ult(RAddC->getValue()))
+ MaxAddC = RAddC;
+ else
+ MaxAddC = LAddC;
+
+ APInt RRangeLow = -RAddC->getValue();
+ APInt RRangeHigh = RRangeLow + LHSC->getValue();
+ APInt LRangeLow = -LAddC->getValue();
+ APInt LRangeHigh = LRangeLow + LHSC->getValue();
+ APInt LowRangeDiff = RRangeLow ^ LRangeLow;
+ APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
+ APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
+ : RRangeLow - LRangeLow;
+
+ if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
+ RangeDiff.ugt(LHSC->getValue())) {
+ Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
+
Value *NewAnd = Builder.CreateAnd(AddOpnd, MaskC);
- Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
- return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
- }
- }
- }
- }
-
- // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
- if (predicatesFoldable(PredL, PredR)) {
+ Value *NewAdd = Builder.CreateAdd(NewAnd, MaxAddC);
+ return Builder.CreateICmp(LHS->getPredicate(), NewAdd, LHSC);
+ }
+ }
+ }
+ }
+
+ // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+ if (predicatesFoldable(PredL, PredR)) {
if (LHS0 == RHS1 && LHS1 == RHS0)
- LHS->swapOperands();
+ LHS->swapOperands();
if (LHS0 == RHS0 && LHS1 == RHS1) {
- unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ unsigned Code = getICmpCode(LHS) | getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
return getNewICmpValue(Code, IsSigned, LHS0, LHS1, Builder);
- }
- }
-
- // handle (roughly):
- // (icmp ne (A & B), C) | (icmp ne (A & D), E)
- if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
- return V;
-
- if (LHS->hasOneUse() || RHS->hasOneUse()) {
- // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
- // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
- Value *A = nullptr, *B = nullptr;
+ }
+ }
+
+ // handle (roughly):
+ // (icmp ne (A & B), C) | (icmp ne (A & D), E)
+ if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
+ return V;
+
+ if (LHS->hasOneUse() || RHS->hasOneUse()) {
+ // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
+ // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
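+ // Rough justification: if B == 0 then B-1 wraps to the all-ones value, so
+ // A u<= B-1 is trivially true and covers the 'icmp eq' side; otherwise
+ // A u<= B-1 is the same as A u< B.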
+ Value *A = nullptr, *B = nullptr;
if (PredL == ICmpInst::ICMP_EQ && match(LHS1, m_Zero())) {
- B = LHS0;
+ B = LHS0;
if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS1)
- A = RHS0;
- else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
+ A = RHS0;
+ else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
A = RHS1;
- }
- // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
- // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ }
+ // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
+ // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
else if (PredR == ICmpInst::ICMP_EQ && match(RHS1, m_Zero())) {
- B = RHS0;
+ B = RHS0;
if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS1)
- A = LHS0;
+ A = LHS0;
else if (PredL == ICmpInst::ICMP_UGT && RHS0 == LHS0)
A = LHS1;
- }
+ }
if (A && B && B->getType()->isIntOrIntVectorTy())
- return Builder.CreateICmp(
- ICmpInst::ICMP_UGE,
+ return Builder.CreateICmp(
+ ICmpInst::ICMP_UGE,
Builder.CreateAdd(B, Constant::getAllOnesValue(B->getType())), A);
- }
-
- if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q))
- return V;
- if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q))
- return V;
-
- // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
- if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
- return V;
-
- // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
- if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
- return V;
-
- if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
- return V;
-
- if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder))
- return V;
-
- if (Value *X =
- foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder))
- return X;
- if (Value *X =
- foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder))
- return X;
-
+ }
+
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q))
+ return V;
+
+ // E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
+ return V;
+
+ // E.g. (icmp sgt x, n) | (icmp slt x, 0) --> icmp ugt x, n
+ if (Value *V = simplifyRangeCheck(RHS, LHS, /*Inverted=*/true))
+ return V;
+
+ if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, false, Builder))
+ return V;
+
+ if (Value *V = foldIsPowerOf2(LHS, RHS, false /* JoinedByAnd */, Builder))
+ return V;
+
+ if (Value *X =
+ foldUnsignedUnderflowCheck(LHS, RHS, /*IsAnd=*/false, Q, Builder))
+ return X;
+ if (Value *X =
+ foldUnsignedUnderflowCheck(RHS, LHS, /*IsAnd=*/false, Q, Builder))
+ return X;
+
// (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
// TODO: Remove this when foldLogOpOfMaskedICmps can handle vectors.
if (PredL == ICmpInst::ICMP_NE && match(LHS1, m_Zero()) &&
@@ -2437,666 +2437,666 @@ Value *InstCombinerImpl::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Constant::getNullValue(NewOr->getType()));
}
- // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
- if (!LHSC || !RHSC)
- return nullptr;
-
- // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
- // iff C2 + CA == C1.
- if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
- ConstantInt *AddC;
- if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
- if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
- return Builder.CreateICmpULE(LHS0, LHSC);
- }
-
- // From here on, we only handle:
- // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
- if (LHS0 != RHS0)
- return nullptr;
-
- // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
- if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
- PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
- PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
- PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
- return nullptr;
-
- // We can't fold (ugt x, C) | (sgt x, C2).
- if (!predicatesFoldable(PredL, PredR))
- return nullptr;
-
- // Ensure that the larger constant is on the RHS.
- bool ShouldSwap;
- if (CmpInst::isSigned(PredL) ||
- (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
- ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
- else
- ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
-
- if (ShouldSwap) {
- std::swap(LHS, RHS);
- std::swap(LHSC, RHSC);
- std::swap(PredL, PredR);
- }
-
- // At this point, we know we have two icmp instructions
- // comparing a value against two constants and or'ing the result
- // together. Because of the above check, we know that we only have
- // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
- // icmp folding check above), that the two constants are not
- // equal.
- assert(LHSC != RHSC && "Compares not folded above?");
-
- switch (PredL) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // Potential folds for this case should already be handled.
- break;
- case ICmpInst::ICMP_UGT:
- // (X == 0 || X u> C) -> (X-1) u>= C
- if (LHSC->isMinValue(false))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- false, false);
- // (X == 13 | X u> 14) -> no change
- break;
- case ICmpInst::ICMP_SGT:
- // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN
- if (LHSC->isMinValue(true))
- return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
- true, false);
- // (X == 13 | X s> 14) -> no change
- break;
- }
- break;
- case ICmpInst::ICMP_ULT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
- // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C
- if (RHSC->isMaxValue(false))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- false, false);
- break;
- case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
- false, false);
- }
- break;
- case ICmpInst::ICMP_SLT:
- switch (PredR) {
- default:
- llvm_unreachable("Unknown integer condition code!");
- case ICmpInst::ICMP_EQ:
- // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C
- if (RHSC->isMaxValue(true))
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
- true, false);
- // (X s< 13 | X == 14) -> no change
- break;
- case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2
- assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
- return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
- false);
- }
- break;
- }
- return nullptr;
-}
-
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+ // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
+ if (!LHSC || !RHSC)
+ return nullptr;
+
+ // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
+ // iff C2 + CA == C1.
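+ // Illustrative instance (hypothetical constants CA = 5, C1 = 10, C2 = 5):
+ // (X + 5) u< 10 covers X + 5 in [0, 9] and X == 5 adds the single point
+ // X + 5 == 10, so the union is exactly (X + 5) u<= 10.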
+ if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+ ConstantInt *AddC;
+ if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+ if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+ return Builder.CreateICmpULE(LHS0, LHSC);
+ }
+
+ // From here on, we only handle:
+ // (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
+ if (LHS0 != RHS0)
+ return nullptr;
+
+ // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+ if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+ PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+ PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+ PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
+ return nullptr;
+
+ // We can't fold (ugt x, C) | (sgt x, C2).
+ if (!predicatesFoldable(PredL, PredR))
+ return nullptr;
+
+ // Ensure that the larger constant is on the RHS.
+ bool ShouldSwap;
+ if (CmpInst::isSigned(PredL) ||
+ (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+ ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
+ else
+ ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
+
+ if (ShouldSwap) {
+ std::swap(LHS, RHS);
+ std::swap(LHSC, RHSC);
+ std::swap(PredL, PredR);
+ }
+
+ // At this point, we know we have two icmp instructions
+ // comparing a value against two constants and or'ing the result
+ // together. Because of the above check, we know that we only have
+ // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+ // icmp folding check above), that the two constants are not
+ // equal.
+ assert(LHSC != RHSC && "Compares not folded above?");
+
+ switch (PredL) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ // Potential folds for this case should already be handled.
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (X == 0 || X u> C) -> (X-1) u>= C
+ if (LHSC->isMinValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
+ false, false);
+ // (X == 13 | X u> 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (X == INT_MIN || X s> C) -> (X-(INT_MIN+1)) u>= C-INT_MIN
+ if (LHSC->isMinValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue() + 1,
+ true, false);
+ // (X == 13 | X s> 14) -> no change
+ break;
+ }
+ break;
+ case ICmpInst::ICMP_ULT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
+ // (X u< C || X == UINT_MAX) => (X-C) u>= UINT_MAX-C
+ if (RHSC->isMaxValue(false))
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
+ false, false);
+ break;
+ case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(false) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
+ false, false);
+ }
+ break;
+ case ICmpInst::ICMP_SLT:
+ switch (PredR) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ICmpInst::ICMP_EQ:
+ // (X s< C || X == INT_MAX) => (X-C) u>= INT_MAX-C
+ if (RHSC->isMaxValue(true))
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue(),
+ true, false);
+ // (X s< 13 | X == 14) -> no change
+ break;
+ case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) u> 2
+ assert(!RHSC->isMaxValue(true) && "Missed icmp simplification");
+ return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+ false);
+ }
+ break;
+ }
+ return nullptr;
+}
+
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) {
- if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Do this before using distributive laws to catch simple and/or/not patterns.
- if (Instruction *Xor = foldOrToXor(I, Builder))
- return Xor;
-
- // (A&B)|(A&C) -> A&(B|C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
+ if (Value *V = SimplifyOrInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Do this before using distributive laws to catch simple and/or/not patterns.
+ if (Instruction *Xor = foldOrToXor(I, Builder))
+ return Xor;
+
+ // (A&B)|(A&C) -> A&(B|C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
if (Instruction *BSwap = matchBSwapOrBitReverse(I, /*MatchBSwaps*/ true,
/*MatchBitReversals*/ false))
- return BSwap;
-
+ return BSwap;
+
if (Instruction *Funnel = matchFunnelShift(I, *this))
return Funnel;
-
- if (Instruction *Concat = matchOrConcat(I, Builder))
- return replaceInstUsesWith(I, Concat);
-
- Value *X, *Y;
- const APInt *CV;
- if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
- !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) {
- // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
- // The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
- Value *Or = Builder.CreateOr(X, Y);
- return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
- }
-
- // (A & C)|(B & D)
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *A, *B, *C, *D;
- if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
- match(Op1, m_And(m_Value(B), m_Value(D)))) {
+
+ if (Instruction *Concat = matchOrConcat(I, Builder))
+ return replaceInstUsesWith(I, Concat);
+
+ Value *X, *Y;
+ const APInt *CV;
+ if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
+ !CV->isAllOnesValue() && MaskedValueIsZero(Y, *CV, 0, &I)) {
+ // (X ^ C) | Y -> (X | Y) ^ C iff Y & C == 0
+ // The check for a 'not' op is for efficiency (if Y is known zero --> ~X).
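+ // Rough justification: on bits where C is 0 both sides reduce to X | Y, and
+ // on bits where C is 1 we know Y is 0, so both sides reduce to ~X.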
+ Value *Or = Builder.CreateOr(X, Y);
+ return BinaryOperator::CreateXor(Or, ConstantInt::get(I.getType(), *CV));
+ }
+
+ // (A & C)|(B & D)
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+ match(Op1, m_And(m_Value(B), m_Value(D)))) {
// (A & C1)|(B & C2)
ConstantInt *C1, *C2;
if (match(C, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2))) {
- Value *V1 = nullptr, *V2 = nullptr;
- if ((C1->getValue() & C2->getValue()).isNullValue()) {
- // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
- // iff (C1&C2) == 0 and (N&~C1) == 0
- if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == B &&
- MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
- (V2 == B &&
- MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(A,
- Builder.getInt(C1->getValue()|C2->getValue()));
- // Or commutes, try both ways.
- if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
- ((V1 == A &&
- MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
- (V2 == A &&
- MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
- return BinaryOperator::CreateAnd(B,
- Builder.getInt(C1->getValue()|C2->getValue()));
-
- // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
- // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
- ConstantInt *C3 = nullptr, *C4 = nullptr;
- if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
- (C3->getValue() & ~C1->getValue()).isNullValue() &&
- match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
- (C4->getValue() & ~C2->getValue()).isNullValue()) {
- V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
- return BinaryOperator::CreateAnd(V2,
- Builder.getInt(C1->getValue()|C2->getValue()));
- }
- }
-
- if (C1->getValue() == ~C2->getValue()) {
- Value *X;
-
- // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2
- if (match(A, m_c_Or(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2
- if (match(B, m_c_Or(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A);
-
- // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2
- if (match(A, m_c_Xor(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B);
- // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2
- if (match(B, m_c_Xor(m_Specific(A), m_Value(X))))
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A);
- }
- }
-
- // Don't try to form a select if it's unlikely that we'll get rid of at
- // least one of the operands. A select is generally more expensive than the
- // 'or' that it is replacing.
- if (Op0->hasOneUse() || Op1->hasOneUse()) {
- // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
- if (Value *V = matchSelectFromAndOr(A, C, B, D))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(A, C, D, B))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, B, D))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(C, A, D, B))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, A, C))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(B, D, C, A))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, A, C))
- return replaceInstUsesWith(I, V);
- if (Value *V = matchSelectFromAndOr(D, B, C, A))
- return replaceInstUsesWith(I, V);
- }
- }
-
- // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
- if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
- return BinaryOperator::CreateOr(Op0, C);
-
- // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
- if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
- if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
- return BinaryOperator::CreateOr(Op1, C);
-
- // ((B | C) & A) | B -> B | (A & C)
- if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
- return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
-
- if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
- return DeMorgan;
-
- // Canonicalize xor to the RHS.
- bool SwappedForXor = false;
- if (match(Op0, m_Xor(m_Value(), m_Value()))) {
- std::swap(Op0, Op1);
- SwappedForXor = true;
- }
-
- // A | ( A ^ B) -> A | B
- // A | (~A ^ B) -> A | ~B
- // (A & B) | (A ^ B)
- if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
- if (Op0 == A || Op0 == B)
- return BinaryOperator::CreateOr(A, B);
-
- if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
- match(Op0, m_And(m_Specific(B), m_Specific(A))))
- return BinaryOperator::CreateOr(A, B);
-
- if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder.CreateNot(B, B->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
- Value *Not = Builder.CreateNot(A, A->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
- }
-
- // A | ~(A | B) -> A | ~B
- // A | ~(A ^ B) -> A | ~B
- if (match(Op1, m_Not(m_Value(A))))
- if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
- if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
- Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
- B->getOpcode() == Instruction::Xor)) {
- Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
- B->getOperand(0);
- Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
- return BinaryOperator::CreateOr(Not, Op0);
- }
-
- if (SwappedForXor)
- std::swap(Op0, Op1);
-
- {
- ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
- ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
- if (LHS && RHS)
- if (Value *Res = foldOrOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, Res);
-
- // TODO: Make this recursive; it's a little tricky because an arbitrary
- // number of 'or' instructions might have to be created.
- Value *X, *Y;
- if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
- }
- if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
- if (auto *Cmp = dyn_cast<ICmpInst>(X))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
- if (auto *Cmp = dyn_cast<ICmpInst>(Y))
- if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
- return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
- }
- }
-
- if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
- if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
- if (Value *Res = foldLogicOfFCmps(LHS, RHS, false))
- return replaceInstUsesWith(I, Res);
-
- if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
- return FoldedFCmps;
-
- if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
- return CastedOr;
-
- // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
- if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
- if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
- A->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
-
- // Note: If we've gotten to the point of visiting the outer OR, then the
- // inner one couldn't be simplified. If it was a constant, then it won't
- // be simplified by a later pass either, so we try swapping the inner/outer
- // ORs in the hopes that we'll be able to simplify it this way.
- // (X|C) | V --> (X|V) | C
- ConstantInt *CI;
+ Value *V1 = nullptr, *V2 = nullptr;
+ if ((C1->getValue() & C2->getValue()).isNullValue()) {
+ // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
+ // iff (C1&C2) == 0 and (N&~C1) == 0
+ if (match(A, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == B &&
+ MaskedValueIsZero(V2, ~C1->getValue(), 0, &I)) || // (V|N)
+ (V2 == B &&
+ MaskedValueIsZero(V1, ~C1->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(A,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ // Or commutes, try both ways.
+ if (match(B, m_Or(m_Value(V1), m_Value(V2))) &&
+ ((V1 == A &&
+ MaskedValueIsZero(V2, ~C2->getValue(), 0, &I)) || // (V|N)
+ (V2 == A &&
+ MaskedValueIsZero(V1, ~C2->getValue(), 0, &I)))) // (N|V)
+ return BinaryOperator::CreateAnd(B,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+
+ // ((V|C3)&C1) | ((V|C4)&C2) --> (V|C3|C4)&(C1|C2)
+ // iff (C1&C2) == 0 and (C3&~C1) == 0 and (C4&~C2) == 0.
+ ConstantInt *C3 = nullptr, *C4 = nullptr;
+ if (match(A, m_Or(m_Value(V1), m_ConstantInt(C3))) &&
+ (C3->getValue() & ~C1->getValue()).isNullValue() &&
+ match(B, m_Or(m_Specific(V1), m_ConstantInt(C4))) &&
+ (C4->getValue() & ~C2->getValue()).isNullValue()) {
+ V2 = Builder.CreateOr(V1, ConstantExpr::getOr(C3, C4), "bitfield");
+ return BinaryOperator::CreateAnd(V2,
+ Builder.getInt(C1->getValue()|C2->getValue()));
+ }
+ }
+
+ if (C1->getValue() == ~C2->getValue()) {
+ Value *X;
+
+ // ((X|B)&C1)|(B&C2) -> (X&C1) | B iff C1 == ~C2
+ if (match(A, m_c_Or(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, C1), B);
+ // (A&C2)|((X|A)&C1) -> (X&C2) | A iff C1 == ~C2
+ if (match(B, m_c_Or(m_Specific(A), m_Value(X))))
+ return BinaryOperator::CreateOr(Builder.CreateAnd(X, C2), A);
+
+ // ((X^B)&C1)|(B&C2) -> (X&C1) ^ B iff C1 == ~C2
+ if (match(A, m_c_Xor(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, C1), B);
+ // (A&C2)|((X^A)&C1) -> (X&C2) ^ A iff C1 == ~C2
+ if (match(B, m_c_Xor(m_Specific(A), m_Value(X))))
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, C2), A);
+ }
+ }
+
+ // Don't try to form a select if it's unlikely that we'll get rid of at
+ // least one of the operands. A select is generally more expensive than the
+ // 'or' that it is replacing.
+ if (Op0->hasOneUse() || Op1->hasOneUse()) {
+ // (Cond & C) | (~Cond & D) -> Cond ? C : D, and commuted variants.
+ if (Value *V = matchSelectFromAndOr(A, C, B, D))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(A, C, D, B))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, B, D))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(C, A, D, B))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, A, C))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(B, D, C, A))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, A, C))
+ return replaceInstUsesWith(I, V);
+ if (Value *V = matchSelectFromAndOr(D, B, C, A))
+ return replaceInstUsesWith(I, V);
+ }
+ }
+
+ // (A ^ B) | ((B ^ C) ^ A) -> (A ^ B) | C
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))))
+ if (match(Op1, m_Xor(m_Xor(m_Specific(B), m_Value(C)), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op0, C);
+
+ // ((A ^ C) ^ B) | (B ^ A) -> (B ^ A) | C
+ if (match(Op0, m_Xor(m_Xor(m_Value(A), m_Value(C)), m_Value(B))))
+ if (match(Op1, m_Xor(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(Op1, C);
+
+ // ((B | C) & A) | B -> B | (A & C)
+ if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A))))
+ return BinaryOperator::CreateOr(Op1, Builder.CreateAnd(A, C));
+
+ if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
+ return DeMorgan;
+
+ // Canonicalize xor to the RHS.
+ bool SwappedForXor = false;
+ if (match(Op0, m_Xor(m_Value(), m_Value()))) {
+ std::swap(Op0, Op1);
+ SwappedForXor = true;
+ }
+
+ // A | ( A ^ B) -> A | B
+ // A | (~A ^ B) -> A | ~B
+ // (A & B) | (A ^ B)
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+ if (Op0 == A || Op0 == B)
+ return BinaryOperator::CreateOr(A, B);
+
+ if (match(Op0, m_And(m_Specific(A), m_Specific(B))) ||
+ match(Op0, m_And(m_Specific(B), m_Specific(A))))
+ return BinaryOperator::CreateOr(A, B);
+
+ if (Op1->hasOneUse() && match(A, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(B, B->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ if (Op1->hasOneUse() && match(B, m_Not(m_Specific(Op0)))) {
+ Value *Not = Builder.CreateNot(A, A->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+ }
+
+ // A | ~(A | B) -> A | ~B
+ // A | ~(A ^ B) -> A | ~B
+ if (match(Op1, m_Not(m_Value(A))))
+ if (BinaryOperator *B = dyn_cast<BinaryOperator>(A))
+ if ((Op0 == B->getOperand(0) || Op0 == B->getOperand(1)) &&
+ Op1->hasOneUse() && (B->getOpcode() == Instruction::Or ||
+ B->getOpcode() == Instruction::Xor)) {
+ Value *NotOp = Op0 == B->getOperand(0) ? B->getOperand(1) :
+ B->getOperand(0);
+ Value *Not = Builder.CreateNot(NotOp, NotOp->getName() + ".not");
+ return BinaryOperator::CreateOr(Not, Op0);
+ }
+
+ if (SwappedForXor)
+ std::swap(Op0, Op1);
+
+ {
+ ICmpInst *LHS = dyn_cast<ICmpInst>(Op0);
+ ICmpInst *RHS = dyn_cast<ICmpInst>(Op1);
+ if (LHS && RHS)
+ if (Value *Res = foldOrOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, Res);
+
+ // TODO: Make this recursive; it's a little tricky because an arbitrary
+ // number of 'or' instructions might have to be created.
+ Value *X, *Y;
+ if (LHS && match(Op1, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(LHS, Cmp, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ if (RHS && match(Op0, m_OneUse(m_Or(m_Value(X), m_Value(Y))))) {
+ if (auto *Cmp = dyn_cast<ICmpInst>(X))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, Y));
+ if (auto *Cmp = dyn_cast<ICmpInst>(Y))
+ if (Value *Res = foldOrOfICmps(Cmp, RHS, I))
+ return replaceInstUsesWith(I, Builder.CreateOr(Res, X));
+ }
+ }
+
+ if (FCmpInst *LHS = dyn_cast<FCmpInst>(I.getOperand(0)))
+ if (FCmpInst *RHS = dyn_cast<FCmpInst>(I.getOperand(1)))
+ if (Value *Res = foldLogicOfFCmps(LHS, RHS, false))
+ return replaceInstUsesWith(I, Res);
+
+ if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder))
+ return FoldedFCmps;
+
+ if (Instruction *CastedOr = foldCastedBitwiseLogic(I))
+ return CastedOr;
+
+ // or(sext(A), B) / or(B, sext(A)) --> A ? -1 : B, where A is i1 or <N x i1>.
+ if (match(Op0, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op1);
+ if (match(Op1, m_OneUse(m_SExt(m_Value(A)))) &&
+ A->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(A, ConstantInt::getSigned(I.getType(), -1), Op0);
+
+ // Note: If we've gotten to the point of visiting the outer OR, then the
+ // inner one couldn't be simplified. If it was a constant, then it won't
+ // be simplified by a later pass either, so we try swapping the inner/outer
+ // ORs in the hopes that we'll be able to simplify it this way.
+ // (X|C) | V --> (X|V) | C
+ ConstantInt *CI;
if (Op0->hasOneUse() && !match(Op1, m_ConstantInt()) &&
- match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) {
- Value *Inner = Builder.CreateOr(A, Op1);
- Inner->takeName(Op0);
- return BinaryOperator::CreateOr(Inner, CI);
- }
-
- // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
- // Since this OR statement hasn't been optimized further yet, we hope
- // that this transformation will allow the new ORs to be optimized.
- {
- Value *X = nullptr, *Y = nullptr;
- if (Op0->hasOneUse() && Op1->hasOneUse() &&
- match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
- match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
- Value *orTrue = Builder.CreateOr(A, C);
- Value *orFalse = Builder.CreateOr(B, D);
- return SelectInst::Create(X, orTrue, orFalse);
- }
- }
-
+ match(Op0, m_Or(m_Value(A), m_ConstantInt(CI)))) {
+ Value *Inner = Builder.CreateOr(A, Op1);
+ Inner->takeName(Op0);
+ return BinaryOperator::CreateOr(Inner, CI);
+ }
+
+ // Change (or (bool?A:B),(bool?C:D)) --> (bool?(or A,C):(or B,D))
+ // Since this OR statement hasn't been optimized further yet, we hope
+ // that this transformation will allow the new ORs to be optimized.
+ {
+ Value *X = nullptr, *Y = nullptr;
+ if (Op0->hasOneUse() && Op1->hasOneUse() &&
+ match(Op0, m_Select(m_Value(X), m_Value(A), m_Value(B))) &&
+ match(Op1, m_Select(m_Value(Y), m_Value(C), m_Value(D))) && X == Y) {
+ Value *orTrue = Builder.CreateOr(A, C);
+ Value *orFalse = Builder.CreateOr(B, D);
+ return SelectInst::Create(X, orTrue, orFalse);
+ }
+ }
+
// or(ashr(subNSW(Y, X), ScalarSizeInBits(Y) - 1), X) --> X s> Y ? -1 : X.
- {
- Value *X, *Y;
- Type *Ty = I.getType();
+ {
+ Value *X, *Y;
+ Type *Ty = I.getType();
if (match(&I, m_c_Or(m_OneUse(m_AShr(
m_NSWSub(m_Value(Y), m_Value(X)),
m_SpecificInt(Ty->getScalarSizeInBits() - 1))),
m_Deferred(X)))) {
- Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
+ Value *NewICmpInst = Builder.CreateICmpSGT(X, Y);
Value *AllOnes = ConstantInt::getAllOnesValue(Ty);
return SelectInst::Create(NewICmpInst, AllOnes, X);
- }
- }
-
- if (Instruction *V =
- canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
- return V;
-
- CmpInst::Predicate Pred;
- Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
- // Check if the OR weakens the overflow condition for umul.with.overflow by
- // treating any non-zero result as overflow. In that case, we overflow if both
- // umul.with.overflow operands are != 0, since in that case the result can
- // only be 0 if the multiplication overflows.
- if (match(&I,
- m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
- m_Value(Ov)),
- m_CombineAnd(m_ICmp(Pred,
- m_CombineAnd(m_ExtractValue<0>(
- m_Deferred(UMulWithOv)),
- m_Value(Mul)),
- m_ZeroInt()),
- m_Value(MulIsNotZero)))) &&
- (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
- Pred == CmpInst::ICMP_NE) {
- Value *A, *B;
- if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
- m_Value(A), m_Value(B)))) {
- Value *NotNullA = Builder.CreateIsNotNull(A);
- Value *NotNullB = Builder.CreateIsNotNull(B);
- return BinaryOperator::CreateAnd(NotNullA, NotNullB);
- }
- }
-
+ }
+ }
+
+ if (Instruction *V =
+ canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
+ return V;
+
+ CmpInst::Predicate Pred;
+ Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
+ // Check if the OR weakens the overflow condition for umul.with.overflow by
+ // treating any non-zero result as overflow. In that case, we overflow if both
+ // umul.with.overflow operands are != 0, since in that case the result can
+ // only be 0 if the multiplication overflows.
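+ // Illustrative sketch (hypothetical i32 IR):
+ //   %m   = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
+ //   %ov  = extractvalue { i32, i1 } %m, 1
+ //   %mul = extractvalue { i32, i1 } %m, 0
+ //   %nz  = icmp ne i32 %mul, 0
+ //   %r   = or i1 %ov, %nz
+ // becomes the 'and' of (icmp ne i32 %a, 0) and (icmp ne i32 %b, 0).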
+ if (match(&I,
+ m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
+ m_Value(Ov)),
+ m_CombineAnd(m_ICmp(Pred,
+ m_CombineAnd(m_ExtractValue<0>(
+ m_Deferred(UMulWithOv)),
+ m_Value(Mul)),
+ m_ZeroInt()),
+ m_Value(MulIsNotZero)))) &&
+ (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
+ Pred == CmpInst::ICMP_NE) {
+ Value *A, *B;
+ if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
+ m_Value(A), m_Value(B)))) {
+ Value *NotNullA = Builder.CreateIsNotNull(A);
+ Value *NotNullB = Builder.CreateIsNotNull(B);
+ return BinaryOperator::CreateAnd(NotNullA, NotNullB);
+ }
+ }
+
// (~x) | y --> ~(x & (~y)) iff that gets rid of inversions
if (sinkNotIntoOtherHandOfAndOrOr(I))
return &I;
- return nullptr;
-}
-
-/// A ^ B can be specified using other logic ops in a variety of patterns. We
-/// can fold these early and efficiently by morphing an existing instruction.
-static Instruction *foldXorToXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.getOpcode() == Instruction::Xor);
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- Value *A, *B;
-
- // There are 4 commuted variants for each of the basic patterns.
-
- // (A & B) ^ (A | B) -> A ^ B
- // (A & B) ^ (B | A) -> A ^ B
- // (A | B) ^ (A & B) -> A ^ B
- // (A | B) ^ (B & A) -> A ^ B
- if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
- m_c_Or(m_Deferred(A), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A | ~B) ^ (~A | B) -> A ^ B
- // (~B | A) ^ (~A | B) -> A ^ B
- // (~A | B) ^ (A | ~B) -> A ^ B
- // (B | ~A) ^ (A | ~B) -> A ^ B
- if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
- m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // (A & ~B) ^ (~A & B) -> A ^ B
- // (~B & A) ^ (~A & B) -> A ^ B
- // (~A & B) ^ (A & ~B) -> A ^ B
- // (B & ~A) ^ (A & ~B) -> A ^ B
- if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
- m_c_And(m_Not(m_Deferred(A)), m_Deferred(B)))))
- return BinaryOperator::CreateXor(A, B);
-
- // For the remaining cases we need to get rid of one of the operands.
- if (!Op0->hasOneUse() && !Op1->hasOneUse())
- return nullptr;
-
- // (A | B) ^ ~(A & B) -> ~(A ^ B)
- // (A | B) ^ ~(B & A) -> ~(A ^ B)
- // (A & B) ^ ~(A | B) -> ~(A ^ B)
- // (A & B) ^ ~(B | A) -> ~(A ^ B)
- // Complexity sorting ensures the not will be on the right side.
- if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
- (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- return nullptr;
-}
-
+ return nullptr;
+}
+
+/// A ^ B can be specified using other logic ops in a variety of patterns. We
+/// can fold these early and efficiently by morphing an existing instruction.
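+// Quick check of the first pattern below, (A & B) ^ (A | B) -> A ^ B: per bit,
+// if A and B agree then (A & B) and (A | B) agree too and the xor gives 0; if
+// they differ then (A & B) = 0 and (A | B) = 1 and the xor gives 1, which is
+// exactly A ^ B.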
+static Instruction *foldXorToXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.getOpcode() == Instruction::Xor);
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ Value *A, *B;
+
+ // There are 4 commuted variants for each of the basic patterns.
+
+ // (A & B) ^ (A | B) -> A ^ B
+ // (A & B) ^ (B | A) -> A ^ B
+ // (A | B) ^ (A & B) -> A ^ B
+ // (A | B) ^ (B & A) -> A ^ B
+ if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
+ m_c_Or(m_Deferred(A), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A | ~B) ^ (~A | B) -> A ^ B
+ // (~B | A) ^ (~A | B) -> A ^ B
+ // (~A | B) ^ (A | ~B) -> A ^ B
+ // (B | ~A) ^ (A | ~B) -> A ^ B
+ if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // (A & ~B) ^ (~A & B) -> A ^ B
+ // (~B & A) ^ (~A & B) -> A ^ B
+ // (~A & B) ^ (A & ~B) -> A ^ B
+ // (B & ~A) ^ (A & ~B) -> A ^ B
+ if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
+ m_c_And(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
+
+ // For the remaining cases we need to get rid of one of the operands.
+ if (!Op0->hasOneUse() && !Op1->hasOneUse())
+ return nullptr;
+
+ // (A | B) ^ ~(A & B) -> ~(A ^ B)
+ // (A | B) ^ ~(B & A) -> ~(A ^ B)
+ // (A & B) ^ ~(A | B) -> ~(A ^ B)
+ // (A & B) ^ ~(B | A) -> ~(A ^ B)
+ // Complexity sorting ensures the not will be on the right side.
+ if ((match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_And(m_Specific(A), m_Specific(B))))) ||
+ (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_Not(m_c_Or(m_Specific(A), m_Specific(B))))))
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ return nullptr;
+}
+
Value *InstCombinerImpl::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
BinaryOperator &I) {
- assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
- I.getOperand(1) == RHS && "Should be 'xor' with these operands");
-
- if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
- if (LHS->getOperand(0) == RHS->getOperand(1) &&
- LHS->getOperand(1) == RHS->getOperand(0))
- LHS->swapOperands();
- if (LHS->getOperand(0) == RHS->getOperand(0) &&
- LHS->getOperand(1) == RHS->getOperand(1)) {
- // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
- Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
- unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
- bool IsSigned = LHS->isSigned() || RHS->isSigned();
- return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
- }
- }
-
- // TODO: This can be generalized to compares of non-signbits using
- // decomposeBitTestICmp(). It could be enhanced more by using (something like)
- // foldLogOpOfMaskedICmps().
- ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
- Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
- Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
- if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
- LHS0->getType() == RHS0->getType() &&
- LHS0->getType()->isIntOrIntVectorTy()) {
- // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
- // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
- Value *Zero = ConstantInt::getNullValue(LHS0->getType());
- return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
- }
- // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
- // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
- if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
- PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
- (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
- PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
- Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
- return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
- }
- }
-
- // Instead of trying to imitate the folds for and/or, decompose this 'xor'
- // into those logic ops. That is, try to turn this into an and-of-icmps
- // because we have many folds for that pattern.
- //
- // This is based on a truth table definition of xor:
- // X ^ Y --> (X | Y) & !(X & Y)
- if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
- // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
- // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
- if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
- // TODO: Independently handle cases where the 'and' side is a constant.
- ICmpInst *X = nullptr, *Y = nullptr;
- if (OrICmp == LHS && AndICmp == RHS) {
- // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y
- X = LHS;
- Y = RHS;
- }
- if (OrICmp == RHS && AndICmp == LHS) {
- // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X
- X = RHS;
- Y = LHS;
- }
- if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) {
- // Invert the predicate of 'Y', thus inverting its output.
- Y->setPredicate(Y->getInversePredicate());
- // So, are there other uses of Y?
- if (!Y->hasOneUse()) {
- // We need to adapt other uses of Y though. Get a value that matches
- // the original value of Y before inversion. While this increases
- // immediate instruction count, we have just ensured that all the
- // users are freely-invertible, so that 'not' *will* get folded away.
- BuilderTy::InsertPointGuard Guard(Builder);
- // Set insertion point to right after the Y.
- Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator()));
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- // Replace all uses of Y (excluding the one in NotY!) with NotY.
- Worklist.pushUsersToWorkList(*Y);
- Y->replaceUsesWithIf(NotY,
- [NotY](Use &U) { return U.getUser() != NotY; });
- }
- // All done.
- return Builder.CreateAnd(LHS, RHS);
- }
- }
- }
-
- return nullptr;
-}
-
-/// If we have a masked merge, in the canonical form of:
-/// (assuming that A only has one use.)
-/// | A | |B|
-/// ((x ^ y) & M) ^ y
-/// | D |
-/// * If M is inverted:
-/// | D |
-/// ((x ^ y) & ~M) ^ y
-/// We can canonicalize by swapping the final xor operand
-/// to eliminate the 'not' of the mask.
-/// ((x ^ y) & M) ^ x
-/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
-/// because that shortens the dependency chain and improves analysis:
-/// (x & M) | (y & ~M)
-static Instruction *visitMaskedMerge(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *B, *X, *D;
- Value *M;
- if (!match(&I, m_c_Xor(m_Value(B),
- m_OneUse(m_c_And(
- m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
- m_Value(D)),
- m_Value(M))))))
- return nullptr;
-
- Value *NotM;
- if (match(M, m_Not(m_Value(NotM)))) {
- // De-invert the mask and swap the value in B part.
- Value *NewA = Builder.CreateAnd(D, NotM);
- return BinaryOperator::CreateXor(NewA, X);
- }
-
- Constant *C;
- if (D->hasOneUse() && match(M, m_Constant(C))) {
- // Propagating undef is unsafe. Clamp undef elements to -1.
- Type *EltTy = C->getType()->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
- // Unfold.
- Value *LHS = Builder.CreateAnd(X, C);
- Value *NotC = Builder.CreateNot(C);
- Value *RHS = Builder.CreateAnd(B, NotC);
- return BinaryOperator::CreateOr(LHS, RHS);
- }
-
- return nullptr;
-}
-
-// Transform
-// ~(x ^ y)
-// into:
-// (~x) ^ y
-// or into
-// x ^ (~y)
-static Instruction *sinkNotIntoXor(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *X, *Y;
- // FIXME: one-use check is not needed in general, but currently we are unable
- // to fold 'not' into 'icmp', if that 'icmp' has multiple uses. (D35182)
- if (!match(&I, m_Not(m_OneUse(m_Xor(m_Value(X), m_Value(Y))))))
- return nullptr;
-
- // We only want to do the transform if it is free to do.
+ assert(I.getOpcode() == Instruction::Xor && I.getOperand(0) == LHS &&
+ I.getOperand(1) == RHS && "Should be 'xor' with these operands");
+
+ if (predicatesFoldable(LHS->getPredicate(), RHS->getPredicate())) {
+ if (LHS->getOperand(0) == RHS->getOperand(1) &&
+ LHS->getOperand(1) == RHS->getOperand(0))
+ LHS->swapOperands();
+ if (LHS->getOperand(0) == RHS->getOperand(0) &&
+ LHS->getOperand(1) == RHS->getOperand(1)) {
+ // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+ Value *Op0 = LHS->getOperand(0), *Op1 = LHS->getOperand(1);
+ unsigned Code = getICmpCode(LHS) ^ getICmpCode(RHS);
+ bool IsSigned = LHS->isSigned() || RHS->isSigned();
+ return getNewICmpValue(Code, IsSigned, Op0, Op1, Builder);
+ }
+ }
+
+ // TODO: This can be generalized to compares of non-signbits using
+ // decomposeBitTestICmp(). It could be enhanced more by using (something like)
+ // foldLogOpOfMaskedICmps().
+ ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+ Value *LHS0 = LHS->getOperand(0), *LHS1 = LHS->getOperand(1);
+ Value *RHS0 = RHS->getOperand(0), *RHS1 = RHS->getOperand(1);
+ if ((LHS->hasOneUse() || RHS->hasOneUse()) &&
+ LHS0->getType() == RHS0->getType() &&
+ LHS0->getType()->isIntOrIntVectorTy()) {
+ // (X > -1) ^ (Y > -1) --> (X ^ Y) < 0
+ // (X < 0) ^ (Y < 0) --> (X ^ Y) < 0
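+ // Rough reasoning: each compare tests the sign bit of its operand, and the
+ // xor of the two tests is true exactly when the sign bits differ, i.e. when
+ // the sign bit of X ^ Y is set.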
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero()))) {
+ Value *Zero = ConstantInt::getNullValue(LHS0->getType());
+ return Builder.CreateICmpSLT(Builder.CreateXor(LHS0, RHS0), Zero);
+ }
+ // (X > -1) ^ (Y < 0) --> (X ^ Y) > -1
+ // (X < 0) ^ (Y > -1) --> (X ^ Y) > -1
+ if ((PredL == CmpInst::ICMP_SGT && match(LHS1, m_AllOnes()) &&
+ PredR == CmpInst::ICMP_SLT && match(RHS1, m_Zero())) ||
+ (PredL == CmpInst::ICMP_SLT && match(LHS1, m_Zero()) &&
+ PredR == CmpInst::ICMP_SGT && match(RHS1, m_AllOnes()))) {
+ Value *MinusOne = ConstantInt::getAllOnesValue(LHS0->getType());
+ return Builder.CreateICmpSGT(Builder.CreateXor(LHS0, RHS0), MinusOne);
+ }
+ }
+
+ // Instead of trying to imitate the folds for and/or, decompose this 'xor'
+ // into those logic ops. That is, try to turn this into an and-of-icmps
+ // because we have many folds for that pattern.
+ //
+ // This is based on a truth table definition of xor:
+ // X ^ Y --> (X | Y) & !(X & Y)
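+ // Quick check: for booleans, X ^ Y is true exactly when X and Y differ, i.e.
+ // when at least one is true (X | Y) and not both are true (!(X & Y)).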
+ if (Value *OrICmp = SimplifyBinOp(Instruction::Or, LHS, RHS, SQ)) {
+ // TODO: If OrICmp is true, then the definition of xor simplifies to !(X&Y).
+ // TODO: If OrICmp is false, the whole thing is false (InstSimplify?).
+ if (Value *AndICmp = SimplifyBinOp(Instruction::And, LHS, RHS, SQ)) {
+ // TODO: Independently handle cases where the 'and' side is a constant.
+ ICmpInst *X = nullptr, *Y = nullptr;
+ if (OrICmp == LHS && AndICmp == RHS) {
+ // (LHS | RHS) & !(LHS & RHS) --> LHS & !RHS --> X & !Y
+ X = LHS;
+ Y = RHS;
+ }
+ if (OrICmp == RHS && AndICmp == LHS) {
+ // !(LHS & RHS) & (LHS | RHS) --> !LHS & RHS --> !Y & X
+ X = RHS;
+ Y = LHS;
+ }
+ if (X && Y && (Y->hasOneUse() || canFreelyInvertAllUsersOf(Y, &I))) {
+ // Invert the predicate of 'Y', thus inverting its output.
+ Y->setPredicate(Y->getInversePredicate());
+ // So, are there other uses of Y?
+ if (!Y->hasOneUse()) {
+ // We need to adapt other uses of Y though. Get a value that matches
+ // the original value of Y before inversion. While this increases
+ // immediate instruction count, we have just ensured that all the
+ // users are freely-invertible, so that 'not' *will* get folded away.
+ BuilderTy::InsertPointGuard Guard(Builder);
+ // Set insertion point to right after the Y.
+ Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator()));
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ // Replace all uses of Y (excluding the one in NotY!) with NotY.
+ Worklist.pushUsersToWorkList(*Y);
+ Y->replaceUsesWithIf(NotY,
+ [NotY](Use &U) { return U.getUser() != NotY; });
+ }
+ // All done.
+ return Builder.CreateAnd(LHS, RHS);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// If we have a masked merge, in the canonical form of:
+/// (assuming that A only has one use.)
+///   |     A     |  |B|
+///   ((x ^ y) & M) ^ y
+///    |  D  |
+/// * If M is inverted:
+///    |  D  |
+///   ((x ^ y) & ~M) ^ y
+/// We can canonicalize by swapping the final xor operand
+/// to eliminate the 'not' of the mask.
+/// ((x ^ y) & M) ^ x
+/// * If M is a constant, and D has one use, we transform to 'and' / 'or' ops
+/// because that shortens the dependency chain and improves analysis:
+/// (x & M) | (y & ~M)
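+/// As an illustration (hypothetical i8 values with a constant mask M = 15),
+/// the constant-mask case rewrites, roughly:
+///   %d = xor i8 %x, %y
+///   %a = and i8 %d, 15
+///   %r = xor i8 %a, %y      ; low nibble from %x, high nibble from %y
+/// into:
+///   %lo = and i8 %x, 15
+///   %hi = and i8 %y, -16
+///   %r  = or i8 %lo, %hi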
+static Instruction *visitMaskedMerge(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *B, *X, *D;
+ Value *M;
+ if (!match(&I, m_c_Xor(m_Value(B),
+ m_OneUse(m_c_And(
+ m_CombineAnd(m_c_Xor(m_Deferred(B), m_Value(X)),
+ m_Value(D)),
+ m_Value(M))))))
+ return nullptr;
+
+ Value *NotM;
+ if (match(M, m_Not(m_Value(NotM)))) {
+ // De-invert the mask and swap the value in B part.
+ Value *NewA = Builder.CreateAnd(D, NotM);
+ return BinaryOperator::CreateXor(NewA, X);
+ }
+
+ Constant *C;
+ if (D->hasOneUse() && match(M, m_Constant(C))) {
+ // Propagating undef is unsafe. Clamp undef elements to -1.
+ Type *EltTy = C->getType()->getScalarType();
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
+ // Unfold.
+ Value *LHS = Builder.CreateAnd(X, C);
+ Value *NotC = Builder.CreateNot(C);
+ Value *RHS = Builder.CreateAnd(B, NotC);
+ return BinaryOperator::CreateOr(LHS, RHS);
+ }
+
+ return nullptr;
+}
+
+// Transform
+// ~(x ^ y)
+// into:
+// (~x) ^ y
+// or into
+// x ^ (~y)
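+// For illustration (hypothetical values): if x is itself an inverted value,
+// say %x = xor i32 %a, -1, then ~(%x ^ %y) is rewritten as (~%x) ^ %y, which
+// subsequently simplifies to a single xor i32 %a, %y.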
+static Instruction *sinkNotIntoXor(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X, *Y;
+ // FIXME: one-use check is not needed in general, but currently we are unable
+ // to fold 'not' into 'icmp', if that 'icmp' has multiple uses. (D35182)
+ if (!match(&I, m_Not(m_OneUse(m_Xor(m_Value(X), m_Value(Y))))))
+ return nullptr;
+
+ // We only want to do the transform if it is free to do.
if (InstCombiner::isFreeToInvert(X, X->hasOneUse())) {
- // Ok, good.
+ // Ok, good.
} else if (InstCombiner::isFreeToInvert(Y, Y->hasOneUse())) {
- std::swap(X, Y);
- } else
- return nullptr;
-
- Value *NotX = Builder.CreateNot(X, X->getName() + ".not");
- return BinaryOperator::CreateXor(NotX, Y, I.getName() + ".demorgan");
-}
-
+ std::swap(X, Y);
+ } else
+ return nullptr;
+
+ Value *NotX = Builder.CreateNot(X, X->getName() + ".not");
+ return BinaryOperator::CreateXor(NotX, Y, I.getName() + ".demorgan");
+}
+
// Transform
// z = (~x) &/| y
// into:
@@ -3139,118 +3139,118 @@ bool InstCombinerImpl::sinkNotIntoOtherHandOfAndOrOr(BinaryOperator &I) {
return true;
}
-// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
-// here. We should standardize that construct where it is needed or choose some
-// other way to ensure that commutated variants of patterns are not missed.
+// FIXME: We use commutative matchers (m_c_*) for some, but not all, matches
+// here. We should standardize that construct where it is needed or choose some
+// other way to ensure that commutated variants of patterns are not missed.
Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
- if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *NewXor = foldXorToXor(I, Builder))
- return NewXor;
-
- // (A&B)^(A&C) -> A&(B^C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- if (Value *V = SimplifyBSwap(I, Builder))
- return replaceInstUsesWith(I, V);
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyXorInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *NewXor = foldXorToXor(I, Builder))
+ return NewXor;
+
+ // (A&B)^(A&C) -> A&(B^C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ if (Value *V = SimplifyBSwap(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
-
- // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
- // This is a special case in haveNoCommonBitsSet, but the computeKnownBits
- // calls in there are unnecessary as SimplifyDemandedInstructionBits should
- // have already taken care of those cases.
- Value *M;
- if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
- m_c_And(m_Deferred(M), m_Value()))))
- return BinaryOperator::CreateOr(Op0, Op1);
-
- // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
- Value *X, *Y;
-
- // We must eliminate the and/or (one-use) for these transforms to not increase
- // the instruction count.
- // ~(~X & Y) --> (X | ~Y)
- // ~(Y & ~X) --> (X | ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- return BinaryOperator::CreateOr(X, NotY);
- }
- // ~(~X | Y) --> (X & ~Y)
- // ~(Y | ~X) --> (X & ~Y)
- if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
- Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
- return BinaryOperator::CreateAnd(X, NotY);
- }
-
- if (Instruction *Xor = visitMaskedMerge(I, Builder))
- return Xor;
-
- // Is this a 'not' (~) fed by a binary operator?
- BinaryOperator *NotVal;
- if (match(&I, m_Not(m_BinOp(NotVal)))) {
- if (NotVal->getOpcode() == Instruction::And ||
- NotVal->getOpcode() == Instruction::Or) {
- // Apply DeMorgan's Law when inverts are free:
- // ~(X & Y) --> (~X | ~Y)
- // ~(X | Y) --> (~X & ~Y)
- if (isFreeToInvert(NotVal->getOperand(0),
- NotVal->getOperand(0)->hasOneUse()) &&
- isFreeToInvert(NotVal->getOperand(1),
- NotVal->getOperand(1)->hasOneUse())) {
- Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
- Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
- if (NotVal->getOpcode() == Instruction::And)
- return BinaryOperator::CreateOr(NotX, NotY);
- return BinaryOperator::CreateAnd(NotX, NotY);
- }
- }
-
- // ~(~X >>s Y) --> (X >>s Y)
- if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
- return BinaryOperator::CreateAShr(X, Y);
-
- // If we are inverting a right-shifted constant, we may be able to eliminate
- // the 'not' by inverting the constant and using the opposite shift type.
- // Canonicalization rules ensure that only a negative constant uses 'ashr',
- // but we must check that in case that transform has not fired yet.
-
- // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
- Constant *C;
- if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_Negative())) {
- // We matched a negative constant, so propagating undef is unsafe.
- // Clamp undef elements to -1.
+
+ // Fold (X & M) ^ (Y & ~M) -> (X & M) | (Y & ~M)
+ // This is a special case in haveNoCommonBitsSet, but the computeKnownBits
+ // calls in there are unnecessary as SimplifyDemandedInstructionBits should
+ // have already taken care of those cases.
+ Value *M;
+ if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(M)), m_Value()),
+ m_c_And(m_Deferred(M), m_Value()))))
+ return BinaryOperator::CreateOr(Op0, Op1);
+
+ // Apply DeMorgan's Law for 'nand' / 'nor' logic with an inverted operand.
+ Value *X, *Y;
+
+ // We must eliminate the and/or (one-use) for these transforms to not increase
+ // the instruction count.
+ // ~(~X & Y) --> (X | ~Y)
+ // ~(Y & ~X) --> (X | ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_And(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateOr(X, NotY);
+ }
+ // ~(~X | Y) --> (X & ~Y)
+ // ~(Y | ~X) --> (X & ~Y)
+ if (match(&I, m_Not(m_OneUse(m_c_Or(m_Not(m_Value(X)), m_Value(Y)))))) {
+ Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
+ return BinaryOperator::CreateAnd(X, NotY);
+ }
+
+ if (Instruction *Xor = visitMaskedMerge(I, Builder))
+ return Xor;
+
+ // Is this a 'not' (~) fed by a binary operator?
+ BinaryOperator *NotVal;
+ if (match(&I, m_Not(m_BinOp(NotVal)))) {
+ if (NotVal->getOpcode() == Instruction::And ||
+ NotVal->getOpcode() == Instruction::Or) {
+ // Apply DeMorgan's Law when inverts are free:
+ // ~(X & Y) --> (~X | ~Y)
+ // ~(X | Y) --> (~X & ~Y)
+ if (isFreeToInvert(NotVal->getOperand(0),
+ NotVal->getOperand(0)->hasOneUse()) &&
+ isFreeToInvert(NotVal->getOperand(1),
+ NotVal->getOperand(1)->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(NotVal->getOperand(0), "notlhs");
+ Value *NotY = Builder.CreateNot(NotVal->getOperand(1), "notrhs");
+ if (NotVal->getOpcode() == Instruction::And)
+ return BinaryOperator::CreateOr(NotX, NotY);
+ return BinaryOperator::CreateAnd(NotX, NotY);
+ }
+ }
+
+ // ~(~X >>s Y) --> (X >>s Y)
+ if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y))))
+ return BinaryOperator::CreateAShr(X, Y);
+
+ // If we are inverting a right-shifted constant, we may be able to eliminate
+ // the 'not' by inverting the constant and using the opposite shift type.
+ // Canonicalization rules ensure that only a negative constant uses 'ashr',
+ // but we must check that in case that transform has not fired yet.
+
+ // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
+ Constant *C;
+ if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_Negative())) {
+ // We matched a negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to -1.
Type *EltTy = Ty->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
- return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y);
- }
-
- // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
- if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_NonNegative())) {
- // We matched a non-negative constant, so propagating undef is unsafe.
- // Clamp undef elements to 0.
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
+ return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y);
+ }
+
+ // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
+ if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
+ match(C, m_NonNegative())) {
+ // We matched a non-negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to 0.
Type *EltTy = Ty->getScalarType();
- C = Constant::replaceUndefsWith(C, ConstantInt::getNullValue(EltTy));
- return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
- }
-
+ C = Constant::replaceUndefsWith(C, ConstantInt::getNullValue(EltTy));
+ return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
+ }
+
// ~(X + C) --> ~C - X
if (match(NotVal, m_c_Add(m_Value(X), m_ImmConstant(C))))
return BinaryOperator::CreateSub(ConstantExpr::getNot(C), X);
@@ -3265,46 +3265,46 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
if (match(NotVal, m_c_Add(m_Not(m_Value(X)), m_Value(Y))))
return BinaryOperator::CreateWithCopiedFlags(Instruction::Sub, X, Y,
NotVal);
- }
-
- // Use DeMorgan and reassociation to eliminate a 'not' op.
- Constant *C1;
- if (match(Op1, m_Constant(C1))) {
- Constant *C2;
- if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
- // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
- Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2));
- return BinaryOperator::CreateXor(And, ConstantExpr::getNot(C1));
- }
- if (match(Op0, m_OneUse(m_And(m_Not(m_Value(X)), m_Constant(C2))))) {
- // (~X & C2) ^ C1 --> ((X | ~C2) ^ -1) ^ C1 --> (X | ~C2) ^ ~C1
- Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2));
- return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1));
- }
- }
-
- // not (cmp A, B) = !cmp A, B
- CmpInst::Predicate Pred;
- if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
- cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
- return replaceInstUsesWith(I, Op0);
- }
-
- {
- const APInt *RHSC;
- if (match(Op1, m_APInt(RHSC))) {
- Value *X;
- const APInt *C;
+ }
+
+ // Use DeMorgan and reassociation to eliminate a 'not' op.
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ if (match(Op0, m_OneUse(m_Or(m_Not(m_Value(X)), m_Constant(C2))))) {
+ // (~X | C2) ^ C1 --> ((X & ~C2) ^ -1) ^ C1 --> (X & ~C2) ^ ~C1
+ Value *And = Builder.CreateAnd(X, ConstantExpr::getNot(C2));
+ return BinaryOperator::CreateXor(And, ConstantExpr::getNot(C1));
+ }
+ if (match(Op0, m_OneUse(m_And(m_Not(m_Value(X)), m_Constant(C2))))) {
+ // (~X & C2) ^ C1 --> ((X | ~C2) ^ -1) ^ C1 --> (X | ~C2) ^ ~C1
+ Value *Or = Builder.CreateOr(X, ConstantExpr::getNot(C2));
+ return BinaryOperator::CreateXor(Or, ConstantExpr::getNot(C1));
+ }
+ }
+
+ // not (cmp A, B) = !cmp A, B
+ CmpInst::Predicate Pred;
+ if (match(&I, m_Not(m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))))) {
+ cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+ return replaceInstUsesWith(I, Op0);
+ }
+
+ {
+ const APInt *RHSC;
+ if (match(Op1, m_APInt(RHSC))) {
+ Value *X;
+ const APInt *C;
// (C - X) ^ signmaskC --> (C + signmaskC) - X
if (RHSC->isSignMask() && match(Op0, m_Sub(m_APInt(C), m_Value(X))))
return BinaryOperator::CreateSub(ConstantInt::get(Ty, *C + *RHSC), X);
-
+
// (X + C) ^ signmaskC --> X + (C + signmaskC)
if (RHSC->isSignMask() && match(Op0, m_Add(m_Value(X), m_APInt(C))))
return BinaryOperator::CreateAdd(X, ConstantInt::get(Ty, *C + *RHSC));
// (X | C) ^ RHSC --> X ^ (C ^ RHSC) iff X & C == 0
- if (match(Op0, m_Or(m_Value(X), m_APInt(C))) &&
+ if (match(Op0, m_Or(m_Value(X), m_APInt(C))) &&
MaskedValueIsZero(X, *C, 0, &I))
return BinaryOperator::CreateXor(X, ConstantInt::get(Ty, *C ^ *RHSC));
@@ -3315,7 +3315,7 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
*RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).shl(*C)) {
Value *NotX = Builder.CreateNot(X);
return BinaryOperator::CreateShl(NotX, ConstantInt::get(Ty, *C));
- }
+ }
// (X >>u C) ^ RHSC --> ~X >>u C
if (match(Op0, m_OneUse(m_LShr(m_Value(X), m_APInt(C)))) &&
*RHSC == APInt::getAllOnesValue(Ty->getScalarSizeInBits()).lshr(*C)) {
@@ -3325,9 +3325,9 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
// TODO: We could handle 'ashr' here as well. That would be matching
// a 'not' op and moving it before the shift. Doing that requires
// preventing the inverse fold in canShiftBinOpWithConstantRHS().
- }
- }
-
+ }
+ }
+
// FIXME: This should not be limited to scalar (pull into APInt match above).
{
Value *X;
@@ -3345,62 +3345,62 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
Opnd0->takeName(cast<Instruction>(Op0));
Opnd0->setDebugLoc(I.getDebugLoc());
return BinaryOperator::CreateXor(Opnd0, ConstantInt::get(Ty, FoldConst));
- }
- }
-
- if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
- return FoldedLogic;
-
- // Y ^ (X | Y) --> X & ~Y
- // Y ^ (Y | X) --> X & ~Y
- if (match(Op1, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op0)))))
- return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op0));
- // (X | Y) ^ Y --> X & ~Y
- // (Y | X) ^ Y --> X & ~Y
- if (match(Op0, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op1));
-
- // Y ^ (X & Y) --> ~X & Y
- // Y ^ (Y & X) --> ~X & Y
- if (match(Op1, m_OneUse(m_c_And(m_Value(X), m_Specific(Op0)))))
- return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(X));
- // (X & Y) ^ Y --> ~X & Y
- // (Y & X) ^ Y --> ~X & Y
- // Canonical form is (X & C) ^ C; don't touch that.
- // TODO: A 'not' op is better for analysis and codegen, but demanded bits must
- // be fixed to prefer that (otherwise we get infinite looping).
- if (!match(Op1, m_Constant()) &&
- match(Op0, m_OneUse(m_c_And(m_Value(X), m_Specific(Op1)))))
- return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(X));
-
- Value *A, *B, *C;
- // (A ^ B) ^ (A | C) --> (~A & C) ^ B -- There are 4 commuted variants.
- if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
- m_OneUse(m_c_Or(m_Deferred(A), m_Value(C))))))
- return BinaryOperator::CreateXor(
- Builder.CreateAnd(Builder.CreateNot(A), C), B);
-
- // (A ^ B) ^ (B | C) --> (~B & C) ^ A -- There are 4 commuted variants.
- if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
- m_OneUse(m_c_Or(m_Deferred(B), m_Value(C))))))
- return BinaryOperator::CreateXor(
- Builder.CreateAnd(Builder.CreateNot(B), C), A);
-
- // (A & B) ^ (A ^ B) -> (A | B)
- if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
- // (A ^ B) ^ (A & B) -> (A | B)
- if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
- match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
- return BinaryOperator::CreateOr(A, B);
-
- // (A & ~B) ^ ~A -> ~(A & B)
- // (~B & A) ^ ~A -> ~(A & B)
- if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Not(m_Specific(A))))
- return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
-
+ }
+ }
+
+ if (Instruction *FoldedLogic = foldBinOpIntoSelectOrPhi(I))
+ return FoldedLogic;
+
+ // Y ^ (X | Y) --> X & ~Y
+ // Y ^ (Y | X) --> X & ~Y
+ if (match(Op1, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op0));
+ // (X | Y) ^ Y --> X & ~Y
+ // (Y | X) ^ Y --> X & ~Y
+ if (match(Op0, m_OneUse(m_c_Or(m_Value(X), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(X, Builder.CreateNot(Op1));
+
+ // Y ^ (X & Y) --> ~X & Y
+ // Y ^ (Y & X) --> ~X & Y
+ if (match(Op1, m_OneUse(m_c_And(m_Value(X), m_Specific(Op0)))))
+ return BinaryOperator::CreateAnd(Op0, Builder.CreateNot(X));
+ // (X & Y) ^ Y --> ~X & Y
+ // (Y & X) ^ Y --> ~X & Y
+ // Canonical form is (X & C) ^ C; don't touch that.
+ // TODO: A 'not' op is better for analysis and codegen, but demanded bits must
+ // be fixed to prefer that (otherwise we get infinite looping).
+ if (!match(Op1, m_Constant()) &&
+ match(Op0, m_OneUse(m_c_And(m_Value(X), m_Specific(Op1)))))
+ return BinaryOperator::CreateAnd(Op1, Builder.CreateNot(X));
+
+ Value *A, *B, *C;
+ // (A ^ B) ^ (A | C) --> (~A & C) ^ B -- There are 4 commuted variants.
+ if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
+ m_OneUse(m_c_Or(m_Deferred(A), m_Value(C))))))
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(A), C), B);
+
+ // (A ^ B) ^ (B | C) --> (~B & C) ^ A -- There are 4 commuted variants.
+ if (match(&I, m_c_Xor(m_OneUse(m_Xor(m_Value(A), m_Value(B))),
+ m_OneUse(m_c_Or(m_Deferred(B), m_Value(C))))))
+ return BinaryOperator::CreateXor(
+ Builder.CreateAnd(Builder.CreateNot(B), C), A);
+
+ // (A & B) ^ (A ^ B) -> (A | B)
+ if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+ // (A ^ B) ^ (A & B) -> (A | B)
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+ match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
+ return BinaryOperator::CreateOr(A, B);
+
+ // (A & ~B) ^ ~A -> ~(A & B)
+ // (~B & A) ^ ~A -> ~(A & B)
+ if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Not(m_Specific(A))))
+ return BinaryOperator::CreateNot(Builder.CreateAnd(A, B));
+
// (~A & B) ^ A --> A | B -- There are 4 commuted variants.
if (match(&I, m_c_Xor(m_c_And(m_Not(m_Value(A)), m_Value(B)), m_Deferred(A))))
return BinaryOperator::CreateOr(A, B);
@@ -3420,90 +3420,90 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
}
}
- if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
- if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
- if (Value *V = foldXorOfICmps(LHS, RHS, I))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
- return CastedXor;
-
- // Canonicalize a shifty way to code absolute value to the common pattern.
- // There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
- // We're relying on the fact that we only do this transform when the shift has
- // exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
- // instructions).
- if (Op0->hasNUses(2))
- std::swap(Op0, Op1);
-
- const APInt *ShAmt;
- if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
- Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
- match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
- // B = ashr i32 A, 31 ; smear the sign bit
- // xor (add A, B), B ; add -1 and flip bits if negative
- // --> (A < 0) ? -A : A
- Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
- // Copy the nuw/nsw flags from the add to the negate.
- auto *Add = cast<BinaryOperator>(Op0);
- Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
- Add->hasNoSignedWrap());
- return SelectInst::Create(Cmp, Neg, A);
- }
-
- // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
- //
- // %notx = xor i32 %x, -1
- // %cmp1 = icmp sgt i32 %notx, %y
- // %smax = select i1 %cmp1, i32 %notx, i32 %y
- // %res = xor i32 %smax, -1
- // =>
- // %noty = xor i32 %y, -1
- // %cmp2 = icmp slt %x, %noty
- // %res = select i1 %cmp2, i32 %x, i32 %noty
- //
- // The same applies to smin/umax/umin.
- if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
- if (SelectPatternResult::isMinOrMax(SPF)) {
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
- Value *NotY = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
- }
-
- // It's possible we get here before the not has been simplified, so make
- // sure the input to the not isn't freely invertible.
- if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
- Value *NotX = Builder.CreateNot(LHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
- }
-
- // If both sides are freely invertible, then we can get rid of the xor
- // completely.
- if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
- isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
- Value *NotLHS = Builder.CreateNot(LHS);
- Value *NotRHS = Builder.CreateNot(RHS);
- return SelectInst::Create(
- Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
- NotLHS, NotRHS);
- }
- }
-
+ if (auto *LHS = dyn_cast<ICmpInst>(I.getOperand(0)))
+ if (auto *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+ if (Value *V = foldXorOfICmps(LHS, RHS, I))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *CastedXor = foldCastedBitwiseLogic(I))
+ return CastedXor;
+
+ // Canonicalize a shifty way to code absolute value to the common pattern.
+ // There are 4 potential commuted variants. Move the 'ashr' candidate to Op1.
+ // We're relying on the fact that we only do this transform when the shift has
+ // exactly 2 uses and the add has exactly 1 use (otherwise, we might increase
+ // instructions).
+ if (Op0->hasNUses(2))
+ std::swap(Op0, Op1);
+
+ const APInt *ShAmt;
+ if (match(Op1, m_AShr(m_Value(A), m_APInt(ShAmt))) &&
+ Op1->hasNUses(2) && *ShAmt == Ty->getScalarSizeInBits() - 1 &&
+ match(Op0, m_OneUse(m_c_Add(m_Specific(A), m_Specific(Op1))))) {
+ // B = ashr i32 A, 31 ; smear the sign bit
+ // xor (add A, B), B ; add -1 and flip bits if negative
+ // --> (A < 0) ? -A : A
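+    // For illustration with a hypothetical A = -5 (i32): B == -1, A + B == -6,
+    // and -6 ^ -1 == 5 == |A|; for A >= 0, B == 0 and A is left unchanged.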
+ Value *Cmp = Builder.CreateICmpSLT(A, ConstantInt::getNullValue(Ty));
+ // Copy the nuw/nsw flags from the add to the negate.
+ auto *Add = cast<BinaryOperator>(Op0);
+ Value *Neg = Builder.CreateNeg(A, "", Add->hasNoUnsignedWrap(),
+ Add->hasNoSignedWrap());
+ return SelectInst::Create(Cmp, Neg, A);
+ }
+
+ // Eliminate a bitwise 'not' op of 'not' min/max by inverting the min/max:
+ //
+ // %notx = xor i32 %x, -1
+ // %cmp1 = icmp sgt i32 %notx, %y
+ // %smax = select i1 %cmp1, i32 %notx, i32 %y
+ // %res = xor i32 %smax, -1
+ // =>
+ // %noty = xor i32 %y, -1
+ // %cmp2 = icmp slt %x, %noty
+ // %res = select i1 %cmp2, i32 %x, i32 %noty
+ //
+ // The same applies to smin/umax/umin.
+ if (match(Op1, m_AllOnes()) && Op0->hasOneUse()) {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, LHS, RHS).Flavor;
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(LHS, m_Not(m_Value(X))) && !isFreeToInvert(X, X->hasOneUse())) {
+ Value *NotY = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), X, NotY), X, NotY);
+ }
+
+ // It's possible we get here before the not has been simplified, so make
+ // sure the input to the not isn't freely invertible.
+ if (match(RHS, m_Not(m_Value(Y))) && !isFreeToInvert(Y, Y->hasOneUse())) {
+ Value *NotX = Builder.CreateNot(LHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotX, Y), NotX, Y);
+ }
+
+ // If both sides are freely invertible, then we can get rid of the xor
+ // completely.
+ if (isFreeToInvert(LHS, !LHS->hasNUsesOrMore(3)) &&
+ isFreeToInvert(RHS, !RHS->hasNUsesOrMore(3))) {
+ Value *NotLHS = Builder.CreateNot(LHS);
+ Value *NotRHS = Builder.CreateNot(RHS);
+ return SelectInst::Create(
+ Builder.CreateICmp(getInverseMinMaxPred(SPF), NotLHS, NotRHS),
+ NotLHS, NotRHS);
+ }
+ }
+
// Pull 'not' into operands of select if both operands are one-use compares
 // or one is a one-use compare and the other is a constant.
- // Inverting the predicates eliminates the 'not' operation.
- // Example:
+ // Inverting the predicates eliminates the 'not' operation.
+ // Example:
 // not (select ?, (cmp TPred, ?, ?), (cmp FPred, ?, ?)) -->
- // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
+ // select ?, (cmp InvTPred, ?, ?), (cmp InvFPred, ?, ?)
 // not (select ?, (cmp TPred, ?, ?), true) -->
// select ?, (cmp InvTPred, ?, ?), false
- if (auto *Sel = dyn_cast<SelectInst>(Op0)) {
+ if (auto *Sel = dyn_cast<SelectInst>(Op0)) {
Value *TV = Sel->getTrueValue();
Value *FV = Sel->getFalseValue();
auto *CmpT = dyn_cast<CmpInst>(TV);
@@ -3519,14 +3519,14 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
CmpF->setPredicate(CmpF->getInversePredicate());
else
Sel->setFalseValue(ConstantExpr::getNot(cast<Constant>(FV)));
- return replaceInstUsesWith(I, Sel);
- }
- }
- }
-
- if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
- return NewXor;
-
+ return replaceInstUsesWith(I, Sel);
+ }
+ }
+ }
+
+ if (Instruction *NewXor = sinkNotIntoXor(I, Builder))
+ return NewXor;
+
// Otherwise, if all else failed, try to hoist the xor-by-constant:
// (X ^ C) ^ Y --> (X ^ Y) ^ C
// Just like we do in other places, we completely avoid the fold
@@ -3537,5 +3537,5 @@ Instruction *InstCombinerImpl::visitXor(BinaryOperator &I) {
m_Value(Y))))
return BinaryOperator::CreateXor(Builder.CreateXor(X, Y), C1);
- return nullptr;
-}
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
index e9115e2eae..495493aab4 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
@@ -1,159 +1,159 @@
-//===- InstCombineAtomicRMW.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for atomic rmw instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/IR/Instructions.h"
+//===- InstCombineAtomicRMW.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for atomic rmw instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-
-namespace {
-/// Return true if and only if the given instruction does not modify the memory
-/// location referenced. Note that an idempotent atomicrmw may still have
-/// ordering effects on nearby instructions, or be volatile.
-/// TODO: Common w/ the version in AtomicExpandPass, and change the term used.
-/// Idempotent is confusing in this context.
-bool isIdempotentRMW(AtomicRMWInst& RMWI) {
- if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::FAdd: // -0.0
- return CF->isZero() && CF->isNegative();
- case AtomicRMWInst::FSub: // +0.0
- return CF->isZero() && !CF->isNegative();
- default:
- return false;
- };
-
- auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
- if(!C)
- return false;
-
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::Add:
- case AtomicRMWInst::Sub:
- case AtomicRMWInst::Or:
- case AtomicRMWInst::Xor:
- return C->isZero();
- case AtomicRMWInst::And:
- return C->isMinusOne();
- case AtomicRMWInst::Min:
- return C->isMaxValue(true);
- case AtomicRMWInst::Max:
- return C->isMinValue(true);
- case AtomicRMWInst::UMin:
- return C->isMaxValue(false);
- case AtomicRMWInst::UMax:
- return C->isMinValue(false);
- default:
- return false;
- }
-}
-
-/// Return true if the given instruction always produces a value in memory
-/// equivalent to its value operand.
-bool isSaturating(AtomicRMWInst& RMWI) {
- if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
- switch(RMWI.getOperation()) {
- case AtomicRMWInst::FAdd:
- case AtomicRMWInst::FSub:
- return CF->isNaN();
- default:
- return false;
- };
-
- auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
- if(!C)
- return false;
-
- switch(RMWI.getOperation()) {
- default:
- return false;
- case AtomicRMWInst::Xchg:
- return true;
- case AtomicRMWInst::Or:
- return C->isAllOnesValue();
- case AtomicRMWInst::And:
- return C->isZero();
- case AtomicRMWInst::Min:
- return C->isMinValue(true);
- case AtomicRMWInst::Max:
- return C->isMaxValue(true);
- case AtomicRMWInst::UMin:
- return C->isMinValue(false);
- case AtomicRMWInst::UMax:
- return C->isMaxValue(false);
- };
-}
+
+using namespace llvm;
+
+namespace {
+/// Return true if and only if the given instruction does not modify the memory
+/// location referenced. Note that an idempotent atomicrmw may still have
+/// ordering effects on nearby instructions, or be volatile.
+/// TODO: Common w/ the version in AtomicExpandPass, and change the term used.
+/// Idempotent is confusing in this context.
+bool isIdempotentRMW(AtomicRMWInst& RMWI) {
+ if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::FAdd: // -0.0
+ return CF->isZero() && CF->isNegative();
+ case AtomicRMWInst::FSub: // +0.0
+ return CF->isZero() && !CF->isNegative();
+ default:
+ return false;
+ };
+
+ auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
+ if(!C)
+ return false;
+
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ return C->isZero();
+ case AtomicRMWInst::And:
+ return C->isMinusOne();
+ case AtomicRMWInst::Min:
+ return C->isMaxValue(true);
+ case AtomicRMWInst::Max:
+ return C->isMinValue(true);
+ case AtomicRMWInst::UMin:
+ return C->isMaxValue(false);
+ case AtomicRMWInst::UMax:
+ return C->isMinValue(false);
+ default:
+ return false;
+ }
+}
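+// For illustration (hypothetical pointer %p): "atomicrmw add i32* %p, i32 0
+// monotonic" and "atomicrmw and i32* %p, i32 -1 monotonic" both leave the
+// stored value unchanged, so they are treated as idempotent here (modulo
+// their ordering effects).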
+
+/// Return true if the given instruction always produces a value in memory
+/// equivalent to its value operand.
+bool isSaturating(AtomicRMWInst& RMWI) {
+ if (auto CF = dyn_cast<ConstantFP>(RMWI.getValOperand()))
+ switch(RMWI.getOperation()) {
+ case AtomicRMWInst::FAdd:
+ case AtomicRMWInst::FSub:
+ return CF->isNaN();
+ default:
+ return false;
+ };
+
+ auto C = dyn_cast<ConstantInt>(RMWI.getValOperand());
+ if(!C)
+ return false;
+
+ switch(RMWI.getOperation()) {
+ default:
+ return false;
+ case AtomicRMWInst::Xchg:
+ return true;
+ case AtomicRMWInst::Or:
+ return C->isAllOnesValue();
+ case AtomicRMWInst::And:
+ return C->isZero();
+ case AtomicRMWInst::Min:
+ return C->isMinValue(true);
+ case AtomicRMWInst::Max:
+ return C->isMaxValue(true);
+ case AtomicRMWInst::UMin:
+ return C->isMinValue(false);
+ case AtomicRMWInst::UMax:
+ return C->isMaxValue(false);
+ };
+}
} // namespace
-
+
Instruction *InstCombinerImpl::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
-
- // Volatile RMWs perform a load and a store, so we cannot replace this by just a
- // load or just a store. We chose not to canonicalize out of general paranoia
+
+ // Volatile RMWs perform a load and a store, so we cannot replace this by just a
+ // load or just a store. We chose not to canonicalize out of general paranoia
// about user expectations around volatile.
- if (RMWI.isVolatile())
- return nullptr;
-
- // Any atomicrmw op which produces a known result in memory can be
- // replaced w/an atomicrmw xchg.
- if (isSaturating(RMWI) &&
- RMWI.getOperation() != AtomicRMWInst::Xchg) {
- RMWI.setOperation(AtomicRMWInst::Xchg);
- return &RMWI;
- }
-
- AtomicOrdering Ordering = RMWI.getOrdering();
- assert(Ordering != AtomicOrdering::NotAtomic &&
- Ordering != AtomicOrdering::Unordered &&
- "AtomicRMWs don't make sense with Unordered or NotAtomic");
-
- // Any atomicrmw xchg with no uses can be converted to an atomic store if the
+ if (RMWI.isVolatile())
+ return nullptr;
+
+ // Any atomicrmw op which produces a known result in memory can be
+ // replaced w/an atomicrmw xchg.
+ if (isSaturating(RMWI) &&
+ RMWI.getOperation() != AtomicRMWInst::Xchg) {
+ RMWI.setOperation(AtomicRMWInst::Xchg);
+ return &RMWI;
+ }
+
+ AtomicOrdering Ordering = RMWI.getOrdering();
+ assert(Ordering != AtomicOrdering::NotAtomic &&
+ Ordering != AtomicOrdering::Unordered &&
+ "AtomicRMWs don't make sense with Unordered or NotAtomic");
+
+ // Any atomicrmw xchg with no uses can be converted to an atomic store if the
// ordering is compatible.
- if (RMWI.getOperation() == AtomicRMWInst::Xchg &&
- RMWI.use_empty()) {
- if (Ordering != AtomicOrdering::Release &&
- Ordering != AtomicOrdering::Monotonic)
- return nullptr;
- auto *SI = new StoreInst(RMWI.getValOperand(),
- RMWI.getPointerOperand(), &RMWI);
- SI->setAtomic(Ordering, RMWI.getSyncScopeID());
- SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
- return eraseInstFromFunction(RMWI);
- }
-
- if (!isIdempotentRMW(RMWI))
- return nullptr;
-
- // We chose to canonicalize all idempotent operations to a single
- // operation code and constant. This makes it easier for the rest of the
- // optimizer to match. The choices of or w/0 and fadd w/-0.0 are
+ if (RMWI.getOperation() == AtomicRMWInst::Xchg &&
+ RMWI.use_empty()) {
+ if (Ordering != AtomicOrdering::Release &&
+ Ordering != AtomicOrdering::Monotonic)
+ return nullptr;
+ auto *SI = new StoreInst(RMWI.getValOperand(),
+ RMWI.getPointerOperand(), &RMWI);
+ SI->setAtomic(Ordering, RMWI.getSyncScopeID());
+ SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
+ return eraseInstFromFunction(RMWI);
+ }
+
+ if (!isIdempotentRMW(RMWI))
+ return nullptr;
+
+ // We chose to canonicalize all idempotent operations to a single
+ // operation code and constant. This makes it easier for the rest of the
+ // optimizer to match. The choices of or w/0 and fadd w/-0.0 are
// arbitrary.
- if (RMWI.getType()->isIntegerTy() &&
- RMWI.getOperation() != AtomicRMWInst::Or) {
- RMWI.setOperation(AtomicRMWInst::Or);
- return replaceOperand(RMWI, 1, ConstantInt::get(RMWI.getType(), 0));
- } else if (RMWI.getType()->isFloatingPointTy() &&
- RMWI.getOperation() != AtomicRMWInst::FAdd) {
- RMWI.setOperation(AtomicRMWInst::FAdd);
- return replaceOperand(RMWI, 1, ConstantFP::getNegativeZero(RMWI.getType()));
- }
-
- // Check if the required ordering is compatible with an atomic load.
- if (Ordering != AtomicOrdering::Acquire &&
- Ordering != AtomicOrdering::Monotonic)
- return nullptr;
-
- LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
- false, DL.getABITypeAlign(RMWI.getType()),
- Ordering, RMWI.getSyncScopeID());
- return Load;
-}
+ if (RMWI.getType()->isIntegerTy() &&
+ RMWI.getOperation() != AtomicRMWInst::Or) {
+ RMWI.setOperation(AtomicRMWInst::Or);
+ return replaceOperand(RMWI, 1, ConstantInt::get(RMWI.getType(), 0));
+ } else if (RMWI.getType()->isFloatingPointTy() &&
+ RMWI.getOperation() != AtomicRMWInst::FAdd) {
+ RMWI.setOperation(AtomicRMWInst::FAdd);
+ return replaceOperand(RMWI, 1, ConstantFP::getNegativeZero(RMWI.getType()));
+ }
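+  // For illustration (hypothetical pointer %p): an idempotent
+  // "atomicrmw add i32* %p, i32 0 monotonic" is rewritten above to
+  // "atomicrmw or i32* %p, i32 0 monotonic"; a later visit then reaches the
+  // code below and replaces it with a plain atomic load of %p.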
+
+ // Check if the required ordering is compatible with an atomic load.
+ if (Ordering != AtomicOrdering::Acquire &&
+ Ordering != AtomicOrdering::Monotonic)
+ return nullptr;
+
+ LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
+ false, DL.getABITypeAlign(RMWI.getType()),
+ Ordering, RMWI.getSyncScopeID());
+ return Load;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
index a1fd5f4c4c..5482b944e3 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -1,646 +1,646 @@
-//===- InstCombineCalls.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitCall, visitInvoke, and visitCallBr functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FloatingPointMode.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
+//===- InstCombineCalls.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitCall, visitInvoke, and visitCallBr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IntrinsicsAMDGPU.h"
-#include "llvm/IR/IntrinsicsARM.h"
-#include "llvm/IR/IntrinsicsHexagon.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NumSimplified, "Number of library calls simplified");
-
-static cl::opt<unsigned> GuardWideningWindow(
- "instcombine-guard-widening-window",
- cl::init(3),
- cl::desc("How wide an instruction window to bypass looking for "
- "another guard"));
-
-/// Return the specified type promoted as it would be to pass through a va_arg
-/// area.
-static Type *getPromotedType(Type *Ty) {
- if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
- if (ITy->getBitWidth() < 32)
- return Type::getInt32Ty(Ty->getContext());
- }
- return Ty;
-}
-
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
+static cl::opt<unsigned> GuardWideningWindow(
+ "instcombine-guard-widening-window",
+ cl::init(3),
+ cl::desc("How wide an instruction window to bypass looking for "
+ "another guard"));
+
+/// Return the specified type promoted as it would be to pass through a va_arg
+/// area.
+static Type *getPromotedType(Type *Ty) {
+ if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
+ if (ITy->getBitWidth() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+ }
+ return Ty;
+}
+
Instruction *InstCombinerImpl::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
- Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
- MaybeAlign CopyDstAlign = MI->getDestAlign();
- if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
- MI->setDestAlignment(DstAlign);
- return MI;
- }
-
- Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
- MaybeAlign CopySrcAlign = MI->getSourceAlign();
- if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
- MI->setSourceAlignment(SrcAlign);
- return MI;
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(MI->getDest())) {
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
- return MI;
- }
-
- // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
- // load/store.
- ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
- if (!MemOpLength) return nullptr;
-
- // Source and destination pointer types are always "i8*" for the intrinsic. See
- // if the size is something we can handle with a single primitive load/store.
- // A single load+store correctly handles overlapping memory in the memmove
- // case.
- uint64_t Size = MemOpLength->getLimitedValue();
- assert(Size && "0-sized memory transferring should be removed already.");
-
- if (Size > 8 || (Size&(Size-1)))
- return nullptr; // If not 1/2/4/8 bytes, exit.
-
- // If it is an atomic op and the alignment is less than the size, then we
- // would introduce an unaligned memory access, which would later be turned
- // into a libcall in CodeGen. That is not an evident performance gain, so
- // disable the transform for now.
- if (isa<AtomicMemTransferInst>(MI))
- if (*CopyDstAlign < Size || *CopySrcAlign < Size)
- return nullptr;
-
- // Use an integer load+store unless we can find something better.
- unsigned SrcAddrSp =
- cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
- unsigned DstAddrSp =
- cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
-
- IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
- Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
- Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
-
- // If the memcpy has metadata describing the members, see if we can get the
- // TBAA tag describing our copy.
- MDNode *CopyMD = nullptr;
- if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
- CopyMD = M;
- } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
- if (M->getNumOperands() == 3 && M->getOperand(0) &&
- mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
- mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
- M->getOperand(1) &&
- mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
- mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
- Size &&
- M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
- CopyMD = cast<MDNode>(M->getOperand(2));
- }
-
- Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
- Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
- LoadInst *L = Builder.CreateLoad(IntType, Src);
- // Alignment from the mem intrinsic will be better, so use it.
- L->setAlignment(*CopySrcAlign);
- if (CopyMD)
- L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
- MDNode *LoopMemParallelMD =
- MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
- if (LoopMemParallelMD)
- L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
- if (AccessGroupMD)
- L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
-
- StoreInst *S = Builder.CreateStore(L, Dest);
- // Alignment from the mem intrinsic will be better, so use it.
- S->setAlignment(*CopyDstAlign);
- if (CopyMD)
- S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
- if (LoopMemParallelMD)
- S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
- if (AccessGroupMD)
- S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
-
- if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
- // non-atomics can be volatile
- L->setVolatile(MT->isVolatile());
- S->setVolatile(MT->isVolatile());
- }
- if (isa<AtomicMemTransferInst>(MI)) {
- // atomics have to be unordered
- L->setOrdering(AtomicOrdering::Unordered);
- S->setOrdering(AtomicOrdering::Unordered);
- }
-
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MemOpLength->getType()));
- return MI;
-}
-
+ Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ MaybeAlign CopyDstAlign = MI->getDestAlign();
+ if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
+ MI->setDestAlignment(DstAlign);
+ return MI;
+ }
+
+ Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ MaybeAlign CopySrcAlign = MI->getSourceAlign();
+ if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
+ MI->setSourceAlignment(SrcAlign);
+ return MI;
+ }
+
+ // If we have a store to a location which is known constant, we can conclude
+ // that the store must be storing the constant value (else the memory
+ // wouldn't be constant), and this must be a noop.
+ if (AA->pointsToConstantMemory(MI->getDest())) {
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+ return MI;
+ }
+
+ // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
+ // load/store.
+ ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
+ if (!MemOpLength) return nullptr;
+
+ // Source and destination pointer types are always "i8*" for the intrinsic. See
+ // if the size is something we can handle with a single primitive load/store.
+ // A single load+store correctly handles overlapping memory in the memmove
+ // case.
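+  // For illustration (hypothetical operands, Size == 4):
+  //   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %d, i8* %s, i64 4, i1 false)
+  // is rewritten, roughly, as bitcasts of %s and %d to i32*, a single i32
+  // load from the source, and a matching i32 store to the destination; the
+  // memcpy's length is then set to 0 so the intrinsic dies on a later pass
+  // over the worklist.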
+ uint64_t Size = MemOpLength->getLimitedValue();
+ assert(Size && "0-sized memory transferring should be removed already.");
+
+ if (Size > 8 || (Size&(Size-1)))
+ return nullptr; // If not 1/2/4/8 bytes, exit.
+
+ // If it is an atomic op and the alignment is less than the size, then we
+ // would introduce an unaligned memory access, which would later be turned
+ // into a libcall in CodeGen. That is not an evident performance gain, so
+ // disable the transform for now.
+ if (isa<AtomicMemTransferInst>(MI))
+ if (*CopyDstAlign < Size || *CopySrcAlign < Size)
+ return nullptr;
+
+ // Use an integer load+store unless we can find something better.
+ unsigned SrcAddrSp =
+ cast<PointerType>(MI->getArgOperand(1)->getType())->getAddressSpace();
+ unsigned DstAddrSp =
+ cast<PointerType>(MI->getArgOperand(0)->getType())->getAddressSpace();
+
+ IntegerType* IntType = IntegerType::get(MI->getContext(), Size<<3);
+ Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
+ Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
+
+ // If the memcpy has metadata describing the members, see if we can get the
+ // TBAA tag describing our copy.
+ MDNode *CopyMD = nullptr;
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa)) {
+ CopyMD = M;
+ } else if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 && M->getOperand(0) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(0))->isZero() &&
+ M->getOperand(1) &&
+ mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+ mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+ Size &&
+ M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
+ }
+
+ Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
+ Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
+ LoadInst *L = Builder.CreateLoad(IntType, Src);
+ // Alignment from the mem intrinsic will be better, so use it.
+ L->setAlignment(*CopySrcAlign);
+ if (CopyMD)
+ L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ MDNode *LoopMemParallelMD =
+ MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ if (LoopMemParallelMD)
+ L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ MDNode *AccessGroupMD = MI->getMetadata(LLVMContext::MD_access_group);
+ if (AccessGroupMD)
+ L->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
+
+ StoreInst *S = Builder.CreateStore(L, Dest);
+ // Alignment from the mem intrinsic will be better, so use it.
+ S->setAlignment(*CopyDstAlign);
+ if (CopyMD)
+ S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
+ if (LoopMemParallelMD)
+ S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
+ if (AccessGroupMD)
+ S->setMetadata(LLVMContext::MD_access_group, AccessGroupMD);
+
+ if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+ // non-atomics can be volatile
+ L->setVolatile(MT->isVolatile());
+ S->setVolatile(MT->isVolatile());
+ }
+ if (isa<AtomicMemTransferInst>(MI)) {
+ // atomics have to be unordered
+ L->setOrdering(AtomicOrdering::Unordered);
+ S->setOrdering(AtomicOrdering::Unordered);
+ }
+
+ // Set the size of the copy to 0, it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MemOpLength->getType()));
+ return MI;
+}
+
Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) {
- const Align KnownAlignment =
- getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- MaybeAlign MemSetAlign = MI->getDestAlign();
- if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
- MI->setDestAlignment(KnownAlignment);
- return MI;
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(MI->getDest())) {
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
- return MI;
- }
-
- // Extract the length and alignment and fill if they are constant.
- ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
- ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
- if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
- return nullptr;
- const uint64_t Len = LenC->getLimitedValue();
- assert(Len && "0-sized memory setting should be removed already.");
- const Align Alignment = assumeAligned(MI->getDestAlignment());
-
- // If it is an atomic and alignment is less than the size then we will
- // introduce the unaligned memory access which will be later transformed
- // into libcall in CodeGen. This is not evident performance gain so disable
- // it now.
- if (isa<AtomicMemSetInst>(MI))
- if (Alignment < Len)
- return nullptr;
-
- // memset(s,c,n) -> store s, c (for n=1,2,4,8)
- if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
- Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
-
- Value *Dest = MI->getDest();
- unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
- Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
- Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
-
- // Extract the fill value and store.
- uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
- StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
- MI->isVolatile());
- S->setAlignment(Alignment);
- if (isa<AtomicMemSetInst>(MI))
- S->setOrdering(AtomicOrdering::Unordered);
-
- // Set the size of the copy to 0, it will be deleted on the next iteration.
- MI->setLength(Constant::getNullValue(LenC->getType()));
- return MI;
- }
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Narrow width by halfs excluding zero/undef lanes
+ const Align KnownAlignment =
+ getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
+ MaybeAlign MemSetAlign = MI->getDestAlign();
+ if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
+ MI->setDestAlignment(KnownAlignment);
+ return MI;
+ }
+
+  // If we have a store to a location that is known to be constant, we can
+  // conclude that the store must be storing the constant value (otherwise the
+  // memory would not be constant), so the store must be a no-op.
+ if (AA->pointsToConstantMemory(MI->getDest())) {
+    // Set the size of the copy to 0; it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(MI->getLength()->getType()));
+ return MI;
+ }
+
+ // Extract the length and alignment and fill if they are constant.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(MI->getLength());
+ ConstantInt *FillC = dyn_cast<ConstantInt>(MI->getValue());
+ if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8))
+ return nullptr;
+ const uint64_t Len = LenC->getLimitedValue();
+ assert(Len && "0-sized memory setting should be removed already.");
+ const Align Alignment = assumeAligned(MI->getDestAlignment());
+
+  // If it is atomic and the alignment is less than the size, we would
+  // introduce an unaligned memory access that is later transformed into a
+  // libcall in CodeGen. That is not an evident performance gain, so disable
+  // the transform for now.
+ if (isa<AtomicMemSetInst>(MI))
+ if (Alignment < Len)
+ return nullptr;
+
+ // memset(s,c,n) -> store s, c (for n=1,2,4,8)
+ if (Len <= 8 && isPowerOf2_32((uint32_t)Len)) {
+ Type *ITy = IntegerType::get(MI->getContext(), Len*8); // n=1 -> i8.
+
+ Value *Dest = MI->getDest();
+ unsigned DstAddrSp = cast<PointerType>(Dest->getType())->getAddressSpace();
+ Type *NewDstPtrTy = PointerType::get(ITy, DstAddrSp);
+ Dest = Builder.CreateBitCast(Dest, NewDstPtrTy);
+
+ // Extract the fill value and store.
+ uint64_t Fill = FillC->getZExtValue()*0x0101010101010101ULL;
+ StoreInst *S = Builder.CreateStore(ConstantInt::get(ITy, Fill), Dest,
+ MI->isVolatile());
+ S->setAlignment(Alignment);
+ if (isa<AtomicMemSetInst>(MI))
+ S->setOrdering(AtomicOrdering::Unordered);
+
+    // Set the size of the copy to 0; it will be deleted on the next iteration.
+ MI->setLength(Constant::getNullValue(LenC->getType()));
+ return MI;
+ }
+
+ return nullptr;
+}
+
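A small standalone check of the byte-replication trick used for the fill value above (FillC->getZExtValue() * 0x0101010101010101ULL); this is illustrative only and not code from the pass.

    #include <cassert>
    #include <cstdint>

    // Multiplying a byte by 0x0101010101010101 copies it into all eight bytes,
    // so memset(p, 0xAB, 8) can become a single 64-bit store of the splat.
    static uint64_t splatFillByte(uint8_t fill) {
      return uint64_t(fill) * 0x0101010101010101ULL;
    }

    int main() {
      assert(splatFillByte(0xAB) == 0xABABABABABABABABULL);
      assert(splatFillByte(0x00) == 0x0000000000000000ULL);
    }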
+// TODO, Obvious Missing Transforms:
+// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombinerImpl::simplifyMaskedLoad(IntrinsicInst &II) {
- Value *LoadPtr = II.getArgOperand(0);
- const Align Alignment =
- cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
-
- // If the mask is all ones or undefs, this is a plain vector load of the 1st
- // argument.
- if (maskIsAllOneOrUndef(II.getArgOperand(2)))
- return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
- "unmaskedload");
-
- // If we can unconditionally load from this address, replace with a
- // load/select idiom. TODO: use DT for context sensitive query
+ Value *LoadPtr = II.getArgOperand(0);
+ const Align Alignment =
+ cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
+
+ // If the mask is all ones or undefs, this is a plain vector load of the 1st
+ // argument.
+ if (maskIsAllOneOrUndef(II.getArgOperand(2)))
+ return Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
+ "unmaskedload");
+
+ // If we can unconditionally load from this address, replace with a
+ // load/select idiom. TODO: use DT for context sensitive query
if (isDereferenceablePointer(LoadPtr, II.getType(),
II.getModule()->getDataLayout(), &II, nullptr)) {
- Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
- "unmaskedload");
- return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
- }
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane -> store
-// * Narrow width by halfs excluding zero/undef lanes
+ Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
+ "unmaskedload");
+ return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
+ }
+
+ return nullptr;
+}
+
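A scalar model of the load/select idiom above, assuming the whole vector is known dereferenceable so every lane can be loaded unconditionally; the template and names below are a hypothetical sketch, not the vector intrinsic itself.

    #include <array>
    #include <cstddef>

    template <typename T, std::size_t N>
    std::array<T, N> maskedLoadAsSelect(const T *ptr,
                                        const std::array<bool, N> &mask,
                                        const std::array<T, N> &passthru) {
      std::array<T, N> loaded{};
      for (std::size_t i = 0; i < N; ++i)
        loaded[i] = ptr[i];                              // unconditional load
      std::array<T, N> result{};
      for (std::size_t i = 0; i < N; ++i)
        result[i] = mask[i] ? loaded[i] : passthru[i];   // the select
      return result;
    }

    int main() {
      int data[4] = {1, 2, 3, 4};
      std::array<bool, 4> mask{true, false, true, false};
      std::array<int, 4> passthru{9, 9, 9, 9};
      auto r = maskedLoadAsSelect<int, 4>(data, mask, passthru);
      return r[1] == 9 ? 0 : 1;  // masked-off lane takes the passthru value
    }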
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane -> store
+// * Narrow width by halfs excluding zero/undef lanes
Instruction *InstCombinerImpl::simplifyMaskedStore(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
- if (!ConstMask)
- return nullptr;
-
- // If the mask is all zeros, this instruction does nothing.
- if (ConstMask->isNullValue())
- return eraseInstFromFunction(II);
-
- // If the mask is all ones, this is a plain vector store of the 1st argument.
- if (ConstMask->isAllOnesValue()) {
- Value *StorePtr = II.getArgOperand(1);
- Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
- return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
- }
-
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, this instruction does nothing.
+ if (ConstMask->isNullValue())
+ return eraseInstFromFunction(II);
+
+ // If the mask is all ones, this is a plain vector store of the 1st argument.
+ if (ConstMask->isAllOnesValue()) {
+ Value *StorePtr = II.getArgOperand(1);
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
+ return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
+ }
+
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
- // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
- APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
- APInt UndefElts(DemandedElts.getBitWidth(), 0);
+ // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
+ APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
+ APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts))
- return replaceOperand(II, 0, V);
-
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane load -> load
-// * Dereferenceable address & few lanes -> scalarize speculative load/selects
-// * Adjacent vector addresses -> masked.load
-// * Narrow width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar load
-// * Vector incrementing address -> vector masked load
+ return replaceOperand(II, 0, V);
+
+ return nullptr;
+}
+
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane load -> load
+// * Dereferenceable address & few lanes -> scalarize speculative load/selects
+// * Adjacent vector addresses -> masked.load
+// * Narrow width by halfs excluding zero/undef lanes
+// * Vector splat address w/known mask -> scalar load
+// * Vector incrementing address -> vector masked load
Instruction *InstCombinerImpl::simplifyMaskedGather(IntrinsicInst &II) {
- return nullptr;
-}
-
-// TODO, Obvious Missing Transforms:
-// * Single constant active lane -> store
-// * Adjacent vector addresses -> masked.store
-// * Narrow store width by halfs excluding zero/undef lanes
-// * Vector splat address w/known mask -> scalar store
-// * Vector incrementing address -> vector masked store
+ return nullptr;
+}
+
+// TODO, Obvious Missing Transforms:
+// * Single constant active lane -> store
+// * Adjacent vector addresses -> masked.store
+// * Narrow store width by halfs excluding zero/undef lanes
+// * Vector splat address w/known mask -> scalar store
+// * Vector incrementing address -> vector masked store
Instruction *InstCombinerImpl::simplifyMaskedScatter(IntrinsicInst &II) {
- auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
- if (!ConstMask)
- return nullptr;
-
- // If the mask is all zeros, a scatter does nothing.
- if (ConstMask->isNullValue())
- return eraseInstFromFunction(II);
-
+ auto *ConstMask = dyn_cast<Constant>(II.getArgOperand(3));
+ if (!ConstMask)
+ return nullptr;
+
+ // If the mask is all zeros, a scatter does nothing.
+ if (ConstMask->isNullValue())
+ return eraseInstFromFunction(II);
+
if (isa<ScalableVectorType>(ConstMask->getType()))
return nullptr;
- // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
- APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
- APInt UndefElts(DemandedElts.getBitWidth(), 0);
+ // Use masked off lanes to simplify operands via SimplifyDemandedVectorElts
+ APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
+ APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(0), DemandedElts, UndefElts))
- return replaceOperand(II, 0, V);
+ return replaceOperand(II, 0, V);
if (Value *V =
SimplifyDemandedVectorElts(II.getOperand(1), DemandedElts, UndefElts))
- return replaceOperand(II, 1, V);
-
- return nullptr;
-}
-
-/// This function transforms launder.invariant.group and strip.invariant.group
-/// like:
-/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
-/// launder(strip(%x)) -> launder(%x)
-/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
-/// strip(launder(%x)) -> strip(%x)
-/// This is legal because it preserves the most recent information about
-/// the presence or absence of invariant.group.
-static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
+ return replaceOperand(II, 1, V);
+
+ return nullptr;
+}
+
+/// This function transforms launder.invariant.group and strip.invariant.group
+/// like:
+/// launder(launder(%x)) -> launder(%x) (the result is not the argument)
+/// launder(strip(%x)) -> launder(%x)
+/// strip(strip(%x)) -> strip(%x) (the result is not the argument)
+/// strip(launder(%x)) -> strip(%x)
+/// This is legal because it preserves the most recent information about
+/// the presence or absence of invariant.group.
+static Instruction *simplifyInvariantGroupIntrinsic(IntrinsicInst &II,
InstCombinerImpl &IC) {
- auto *Arg = II.getArgOperand(0);
- auto *StrippedArg = Arg->stripPointerCasts();
- auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
- if (StrippedArg == StrippedInvariantGroupsArg)
- return nullptr; // No launders/strips to remove.
-
- Value *Result = nullptr;
-
- if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
- Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
- else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
- Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
- else
- llvm_unreachable(
- "simplifyInvariantGroupIntrinsic only handles launder and strip");
- if (Result->getType()->getPointerAddressSpace() !=
- II.getType()->getPointerAddressSpace())
- Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
- if (Result->getType() != II.getType())
- Result = IC.Builder.CreateBitCast(Result, II.getType());
-
- return cast<Instruction>(Result);
-}
-
+ auto *Arg = II.getArgOperand(0);
+ auto *StrippedArg = Arg->stripPointerCasts();
+ auto *StrippedInvariantGroupsArg = Arg->stripPointerCastsAndInvariantGroups();
+ if (StrippedArg == StrippedInvariantGroupsArg)
+ return nullptr; // No launders/strips to remove.
+
+ Value *Result = nullptr;
+
+ if (II.getIntrinsicID() == Intrinsic::launder_invariant_group)
+ Result = IC.Builder.CreateLaunderInvariantGroup(StrippedInvariantGroupsArg);
+ else if (II.getIntrinsicID() == Intrinsic::strip_invariant_group)
+ Result = IC.Builder.CreateStripInvariantGroup(StrippedInvariantGroupsArg);
+ else
+ llvm_unreachable(
+ "simplifyInvariantGroupIntrinsic only handles launder and strip");
+ if (Result->getType()->getPointerAddressSpace() !=
+ II.getType()->getPointerAddressSpace())
+ Result = IC.Builder.CreateAddrSpaceCast(Result, II.getType());
+ if (Result->getType() != II.getType())
+ Result = IC.Builder.CreateBitCast(Result, II.getType());
+
+ return cast<Instruction>(Result);
+}
+
static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombinerImpl &IC) {
- assert((II.getIntrinsicID() == Intrinsic::cttz ||
- II.getIntrinsicID() == Intrinsic::ctlz) &&
- "Expected cttz or ctlz intrinsic");
- bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
- Value *Op0 = II.getArgOperand(0);
- Value *X;
- // ctlz(bitreverse(x)) -> cttz(x)
- // cttz(bitreverse(x)) -> ctlz(x)
- if (match(Op0, m_BitReverse(m_Value(X)))) {
- Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
- Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
- return CallInst::Create(F, {X, II.getArgOperand(1)});
- }
-
- if (IsTZ) {
- // cttz(-x) -> cttz(x)
- if (match(Op0, m_Neg(m_Value(X))))
- return IC.replaceOperand(II, 0, X);
-
- // cttz(abs(x)) -> cttz(x)
- // cttz(nabs(x)) -> cttz(x)
- Value *Y;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS)
- return IC.replaceOperand(II, 0, X);
+ assert((II.getIntrinsicID() == Intrinsic::cttz ||
+ II.getIntrinsicID() == Intrinsic::ctlz) &&
+ "Expected cttz or ctlz intrinsic");
+ bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz;
+ Value *Op0 = II.getArgOperand(0);
+ Value *X;
+ // ctlz(bitreverse(x)) -> cttz(x)
+ // cttz(bitreverse(x)) -> ctlz(x)
+ if (match(Op0, m_BitReverse(m_Value(X)))) {
+ Intrinsic::ID ID = IsTZ ? Intrinsic::ctlz : Intrinsic::cttz;
+ Function *F = Intrinsic::getDeclaration(II.getModule(), ID, II.getType());
+ return CallInst::Create(F, {X, II.getArgOperand(1)});
+ }
+
+ if (IsTZ) {
+ // cttz(-x) -> cttz(x)
+ if (match(Op0, m_Neg(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
+
+ // cttz(abs(x)) -> cttz(x)
+ // cttz(nabs(x)) -> cttz(x)
+ Value *Y;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return IC.replaceOperand(II, 0, X);
if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return IC.replaceOperand(II, 0, X);
- }
-
- KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
-
- // Create a mask for bits above (ctlz) or below (cttz) the first known one.
- unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
- : Known.countMaxLeadingZeros();
- unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
- : Known.countMinLeadingZeros();
-
- // If all bits above (ctlz) or below (cttz) the first known one are known
- // zero, this value is constant.
- // FIXME: This should be in InstSimplify because we're replacing an
- // instruction with a constant.
- if (PossibleZeros == DefiniteZeros) {
- auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
- return IC.replaceInstUsesWith(II, C);
- }
-
- // If the input to cttz/ctlz is known to be non-zero,
- // then change the 'ZeroIsUndef' parameter to 'true'
- // because we know the zero behavior can't affect the result.
- if (!Known.One.isNullValue() ||
- isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
- &IC.getDominatorTree())) {
- if (!match(II.getArgOperand(1), m_One()))
- return IC.replaceOperand(II, 1, IC.Builder.getTrue());
- }
-
- // Add range metadata since known bits can't completely reflect what we know.
- // TODO: Handle splat vectors.
- auto *IT = dyn_cast<IntegerType>(Op0->getType());
- if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
- ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
- II.setMetadata(LLVMContext::MD_range,
- MDNode::get(II.getContext(), LowAndHigh));
- return &II;
- }
-
- return nullptr;
-}
-
+ }
+
+ KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
+
+ // Create a mask for bits above (ctlz) or below (cttz) the first known one.
+ unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros()
+ : Known.countMaxLeadingZeros();
+ unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros()
+ : Known.countMinLeadingZeros();
+
+ // If all bits above (ctlz) or below (cttz) the first known one are known
+ // zero, this value is constant.
+ // FIXME: This should be in InstSimplify because we're replacing an
+ // instruction with a constant.
+ if (PossibleZeros == DefiniteZeros) {
+ auto *C = ConstantInt::get(Op0->getType(), DefiniteZeros);
+ return IC.replaceInstUsesWith(II, C);
+ }
+
+ // If the input to cttz/ctlz is known to be non-zero,
+ // then change the 'ZeroIsUndef' parameter to 'true'
+ // because we know the zero behavior can't affect the result.
+ if (!Known.One.isNullValue() ||
+ isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
+ &IC.getDominatorTree())) {
+ if (!match(II.getArgOperand(1), m_One()))
+ return IC.replaceOperand(II, 1, IC.Builder.getTrue());
+ }
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ // TODO: Handle splat vectors.
+ auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ if (IT && IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, DefiniteZeros)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, PossibleZeros + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
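A brute-force check, on plain 8-bit integers, of the ctlz(bitreverse(x)) == cttz(x) identity behind the swap above; zero is excluded since the intrinsics' zero behavior is a separate question, and the helper names are made up.

    #include <cassert>
    #include <cstdint>

    static uint8_t bitreverse8(uint8_t x) {
      uint8_t r = 0;
      for (int i = 0; i < 8; ++i)
        r = uint8_t((r << 1) | ((x >> i) & 1));
      return r;
    }
    static unsigned cttz8(uint8_t x) {           // precondition: x != 0
      unsigned n = 0;
      while (!(x & 1)) { x = uint8_t(x >> 1); ++n; }
      return n;
    }
    static unsigned ctlz8(uint8_t x) {           // precondition: x != 0
      unsigned n = 0;
      while (!(x & 0x80)) { x = uint8_t(x << 1); ++n; }
      return n;
    }

    int main() {
      for (unsigned v = 1; v < 256; ++v)
        assert(ctlz8(bitreverse8(uint8_t(v))) == cttz8(uint8_t(v)));
    }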
static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
- assert(II.getIntrinsicID() == Intrinsic::ctpop &&
- "Expected ctpop intrinsic");
- Type *Ty = II.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- Value *Op0 = II.getArgOperand(0);
- Value *X;
-
- // ctpop(bitreverse(x)) -> ctpop(x)
- // ctpop(bswap(x)) -> ctpop(x)
- if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
- return IC.replaceOperand(II, 0, X);
-
- // ctpop(x | -x) -> bitwidth - cttz(x, false)
- if (Op0->hasOneUse() &&
- match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
- Function *F =
- Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
- auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
- auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
- return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
- }
-
- // ctpop(~x & (x - 1)) -> cttz(x, false)
- if (match(Op0,
- m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
- Function *F =
- Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
- return CallInst::Create(F, {X, IC.Builder.getFalse()});
- }
-
- // FIXME: Try to simplify vectors of integers.
- auto *IT = dyn_cast<IntegerType>(Ty);
- if (!IT)
- return nullptr;
-
- KnownBits Known(BitWidth);
- IC.computeKnownBits(Op0, Known, 0, &II);
-
- unsigned MinCount = Known.countMinPopulation();
- unsigned MaxCount = Known.countMaxPopulation();
-
- // Add range metadata since known bits can't completely reflect what we know.
- if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
- Metadata *LowAndHigh[] = {
- ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
- ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
- II.setMetadata(LLVMContext::MD_range,
- MDNode::get(II.getContext(), LowAndHigh));
- return &II;
- }
-
- return nullptr;
-}
-
-/// Convert a table lookup to shufflevector if the mask is constant.
-/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
-/// which case we could lower the shufflevector with rev64 instructions
-/// as it's actually a byte reverse.
-static Value *simplifyNeonTbl1(const IntrinsicInst &II,
- InstCombiner::BuilderTy &Builder) {
- // Bail out if the mask is not a constant.
- auto *C = dyn_cast<Constant>(II.getArgOperand(1));
- if (!C)
- return nullptr;
-
+ assert(II.getIntrinsicID() == Intrinsic::ctpop &&
+ "Expected ctpop intrinsic");
+ Type *Ty = II.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ Value *Op0 = II.getArgOperand(0);
+ Value *X;
+
+ // ctpop(bitreverse(x)) -> ctpop(x)
+ // ctpop(bswap(x)) -> ctpop(x)
+ if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
+
+ // ctpop(x | -x) -> bitwidth - cttz(x, false)
+ if (Op0->hasOneUse() &&
+ match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
+ auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
+ return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
+ }
+
+ // ctpop(~x & (x - 1)) -> cttz(x, false)
+ if (match(Op0,
+ m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ return CallInst::Create(F, {X, IC.Builder.getFalse()});
+ }
+
+ // FIXME: Try to simplify vectors of integers.
+ auto *IT = dyn_cast<IntegerType>(Ty);
+ if (!IT)
+ return nullptr;
+
+ KnownBits Known(BitWidth);
+ IC.computeKnownBits(Op0, Known, 0, &II);
+
+ unsigned MinCount = Known.countMinPopulation();
+ unsigned MaxCount = Known.countMaxPopulation();
+
+ // Add range metadata since known bits can't completely reflect what we know.
+ if (IT->getBitWidth() != 1 && !II.getMetadata(LLVMContext::MD_range)) {
+ Metadata *LowAndHigh[] = {
+ ConstantAsMetadata::get(ConstantInt::get(IT, MinCount)),
+ ConstantAsMetadata::get(ConstantInt::get(IT, MaxCount + 1))};
+ II.setMetadata(LLVMContext::MD_range,
+ MDNode::get(II.getContext(), LowAndHigh));
+ return &II;
+ }
+
+ return nullptr;
+}
+
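An illustrative check of the two ctpop folds above on 16-bit values: ctpop(x | -x) == bitwidth - cttz(x) and ctpop(~x & (x - 1)) == cttz(x), for nonzero x so cttz is well defined. The helpers are written out by hand here; the pass itself works on the IR intrinsics.

    #include <cassert>
    #include <cstdint>

    static unsigned popcount16(uint16_t x) {
      unsigned n = 0;
      for (; x; x &= uint16_t(x - 1)) ++n;       // clear lowest set bit each step
      return n;
    }
    static unsigned cttz16(uint16_t x) {         // precondition: x != 0
      unsigned n = 0;
      while (!(x & 1)) { x = uint16_t(x >> 1); ++n; }
      return n;
    }

    int main() {
      for (unsigned v = 1; v <= 0xFFFF; ++v) {
        uint16_t x = uint16_t(v);
        uint16_t negx = uint16_t(-x);            // two's complement negate
        assert(popcount16(uint16_t(x | negx)) == 16 - cttz16(x));
        assert(popcount16(uint16_t(~x & uint16_t(x - 1))) == cttz16(x));
      }
    }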
+/// Convert a table lookup to shufflevector if the mask is constant.
+/// This could benefit tbl1 if the mask is { 7,6,5,4,3,2,1,0 }, in
+/// which case we could lower the shufflevector with rev64 instructions
+/// as it's actually a byte reverse.
+static Value *simplifyNeonTbl1(const IntrinsicInst &II,
+ InstCombiner::BuilderTy &Builder) {
+ // Bail out if the mask is not a constant.
+ auto *C = dyn_cast<Constant>(II.getArgOperand(1));
+ if (!C)
+ return nullptr;
+
auto *VecTy = cast<FixedVectorType>(II.getType());
- unsigned NumElts = VecTy->getNumElements();
-
- // Only perform this transformation for <8 x i8> vector types.
- if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
- return nullptr;
-
- int Indexes[8];
-
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *COp = C->getAggregateElement(I);
-
- if (!COp || !isa<ConstantInt>(COp))
- return nullptr;
-
- Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
-
- // Make sure the mask indices are in range.
- if ((unsigned)Indexes[I] >= NumElts)
- return nullptr;
- }
-
- auto *V1 = II.getArgOperand(0);
- auto *V2 = Constant::getNullValue(V1->getType());
- return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
-}
-
-// Returns true iff the 2 intrinsics have the same operands, limiting the
-// comparison to the first NumOperands.
-static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
- unsigned NumOperands) {
- assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
- assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
- for (unsigned i = 0; i < NumOperands; i++)
- if (I.getArgOperand(i) != E.getArgOperand(i))
- return false;
- return true;
-}
-
-// Remove trivially empty start/end intrinsic ranges, i.e. a start
-// immediately followed by an end (ignoring debuginfo or other
-// start/end intrinsics in between). As this handles only the most trivial
-// cases, tracking the nesting level is not needed:
-//
-// call @llvm.foo.start(i1 0)
-// call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
-// call @llvm.foo.end(i1 0)
-// call @llvm.foo.end(i1 0) ; &I
+ unsigned NumElts = VecTy->getNumElements();
+
+ // Only perform this transformation for <8 x i8> vector types.
+ if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
+ return nullptr;
+
+ int Indexes[8];
+
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *COp = C->getAggregateElement(I);
+
+ if (!COp || !isa<ConstantInt>(COp))
+ return nullptr;
+
+ Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
+
+ // Make sure the mask indices are in range.
+ if ((unsigned)Indexes[I] >= NumElts)
+ return nullptr;
+ }
+
+ auto *V1 = II.getArgOperand(0);
+ auto *V2 = Constant::getNullValue(V1->getType());
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
+}
+
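A scalar sketch of what the constant-mask case above becomes: with every mask byte known and in range, the tbl1 lookup is just a fixed permutation of the eight input bytes (the shufflevector), and the mask {7,6,5,4,3,2,1,0} in particular is a byte reverse. Names are illustrative only.

    #include <array>
    #include <cstdint>

    static std::array<uint8_t, 8> tbl1WithConstMask(const std::array<uint8_t, 8> &v,
                                                    const std::array<uint8_t, 8> &mask) {
      std::array<uint8_t, 8> out{};
      for (int i = 0; i < 8; ++i)
        out[i] = v[mask[i]];   // the code above verifies mask[i] < 8 first
      return out;
    }

    int main() {
      std::array<uint8_t, 8> v{1, 2, 3, 4, 5, 6, 7, 8};
      std::array<uint8_t, 8> rev{7, 6, 5, 4, 3, 2, 1, 0};
      auto out = tbl1WithConstMask(v, rev);      // byte-reversed v
      return out[0] == 8 ? 0 : 1;
    }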
+// Returns true iff the 2 intrinsics have the same operands, limiting the
+// comparison to the first NumOperands.
+static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
+ unsigned NumOperands) {
+ assert(I.getNumArgOperands() >= NumOperands && "Not enough operands");
+ assert(E.getNumArgOperands() >= NumOperands && "Not enough operands");
+ for (unsigned i = 0; i < NumOperands; i++)
+ if (I.getArgOperand(i) != E.getArgOperand(i))
+ return false;
+ return true;
+}
+
+// Remove trivially empty start/end intrinsic ranges, i.e. a start
+// immediately followed by an end (ignoring debuginfo or other
+// start/end intrinsics in between). As this handles only the most trivial
+// cases, tracking the nesting level is not needed:
+//
+// call @llvm.foo.start(i1 0)
+// call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
+// call @llvm.foo.end(i1 0)
+// call @llvm.foo.end(i1 0) ; &I
static bool
removeTriviallyEmptyRange(IntrinsicInst &EndI, InstCombinerImpl &IC,
std::function<bool(const IntrinsicInst &)> IsStart) {
- // We start from the end intrinsic and scan backwards, so that InstCombine
- // has already processed (and potentially removed) all the instructions
- // before the end intrinsic.
- BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
- for (; BI != BE; ++BI) {
- if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
- if (isa<DbgInfoIntrinsic>(I) ||
- I->getIntrinsicID() == EndI.getIntrinsicID())
- continue;
- if (IsStart(*I)) {
- if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
- IC.eraseInstFromFunction(*I);
- IC.eraseInstFromFunction(EndI);
- return true;
- }
- // Skip start intrinsics that don't pair with this end intrinsic.
- continue;
- }
- }
- break;
- }
-
- return false;
-}
-
+ // We start from the end intrinsic and scan backwards, so that InstCombine
+ // has already processed (and potentially removed) all the instructions
+ // before the end intrinsic.
+ BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
+ for (; BI != BE; ++BI) {
+ if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
+ if (isa<DbgInfoIntrinsic>(I) ||
+ I->getIntrinsicID() == EndI.getIntrinsicID())
+ continue;
+ if (IsStart(*I)) {
+ if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
+ IC.eraseInstFromFunction(*I);
+ IC.eraseInstFromFunction(EndI);
+ return true;
+ }
+ // Skip start intrinsics that don't pair with this end intrinsic.
+ continue;
+ }
+ }
+ break;
+ }
+
+ return false;
+}
+
Instruction *InstCombinerImpl::visitVAEndInst(VAEndInst &I) {
- removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
- return I.getIntrinsicID() == Intrinsic::vastart ||
- I.getIntrinsicID() == Intrinsic::vacopy;
- });
- return nullptr;
-}
-
+ removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::vastart ||
+ I.getIntrinsicID() == Intrinsic::vacopy;
+ });
+ return nullptr;
+}
+
static CallInst *canonicalizeConstantArg0ToArg1(CallInst &Call) {
- assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
- Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
- if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
- Call.setArgOperand(0, Arg1);
- Call.setArgOperand(1, Arg0);
- return &Call;
- }
- return nullptr;
-}
-
+ assert(Call.getNumArgOperands() > 1 && "Need at least 2 args to swap");
+ Value *Arg0 = Call.getArgOperand(0), *Arg1 = Call.getArgOperand(1);
+ if (isa<Constant>(Arg0) && !isa<Constant>(Arg1)) {
+ Call.setArgOperand(0, Arg1);
+ Call.setArgOperand(1, Arg0);
+ return &Call;
+ }
+ return nullptr;
+}
+
/// Creates a result tuple for an overflow intrinsic \p II with a given
/// \p Result and a constant \p Overflow value.
static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
@@ -653,15 +653,15 @@ static Instruction *createOverflowTuple(IntrinsicInst *II, Value *Result,
Instruction *
InstCombinerImpl::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
- WithOverflowInst *WO = cast<WithOverflowInst>(II);
- Value *OperationResult = nullptr;
- Constant *OverflowResult = nullptr;
- if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
- WO->getRHS(), *WO, OperationResult, OverflowResult))
+ WithOverflowInst *WO = cast<WithOverflowInst>(II);
+ Value *OperationResult = nullptr;
+ Constant *OverflowResult = nullptr;
+ if (OptimizeOverflowCheck(WO->getBinaryOp(), WO->isSigned(), WO->getLHS(),
+ WO->getRHS(), *WO, OperationResult, OverflowResult))
return createOverflowTuple(WO, OperationResult, OverflowResult);
- return nullptr;
-}
-
+ return nullptr;
+}
+
static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
const DataLayout &DL, AssumptionCache *AC,
DominatorTree *DT) {
@@ -675,126 +675,126 @@ static Optional<bool> getKnownSign(Value *Op, Instruction *CxtI,
ICmpInst::ICMP_SLT, Op, Constant::getNullValue(Op->getType()), CxtI, DL);
}
-/// CallInst simplification. This mostly only handles folding of intrinsic
-/// instructions. For normal calls, it allows visitCallBase to do the heavy
-/// lifting.
+/// CallInst simplification. This mostly only handles folding of intrinsic
+/// instructions. For normal calls, it allows visitCallBase to do the heavy
+/// lifting.
Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
- // Don't try to simplify calls without uses. It will not do anything useful,
- // but will result in the following folds being skipped.
- if (!CI.use_empty())
- if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
- return replaceInstUsesWith(CI, V);
-
- if (isFreeCall(&CI, &TLI))
- return visitFree(CI);
-
- // If the caller function is nounwind, mark the call as nounwind, even if the
- // callee isn't.
- if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
- CI.setDoesNotThrow();
- return &CI;
- }
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
- if (!II) return visitCallBase(CI);
-
- // For atomic unordered mem intrinsics if len is not a positive or
- // not a multiple of element size then behavior is undefined.
- if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II))
- if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength()))
- if (NumBytes->getSExtValue() < 0 ||
- (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) {
- CreateNonTerminatorUnreachable(AMI);
- assert(AMI->getType()->isVoidTy() &&
- "non void atomic unordered mem intrinsic");
- return eraseInstFromFunction(*AMI);
- }
-
- // Intrinsics cannot occur in an invoke or a callbr, so handle them here
- // instead of in visitCallBase.
- if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
- bool Changed = false;
-
- // memmove/cpy/set of zero bytes is a noop.
- if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
- if (NumBytes->isNullValue())
- return eraseInstFromFunction(CI);
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
- if (CI->getZExtValue() == 1) {
- // Replace the instruction with just byte operations. We would
- // transform other cases to loads/stores, but we don't know if
- // alignment is sufficient.
- }
- }
-
- // No other transformations apply to volatile transfers.
- if (auto *M = dyn_cast<MemIntrinsic>(MI))
- if (M->isVolatile())
- return nullptr;
-
- // If we have a memmove and the source operation is a constant global,
- // then the source and dest pointers can't alias, so we can change this
- // into a call to memcpy.
- if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
- if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
- if (GVSrc->isConstant()) {
- Module *M = CI.getModule();
- Intrinsic::ID MemCpyID =
- isa<AtomicMemMoveInst>(MMI)
- ? Intrinsic::memcpy_element_unordered_atomic
- : Intrinsic::memcpy;
- Type *Tys[3] = { CI.getArgOperand(0)->getType(),
- CI.getArgOperand(1)->getType(),
- CI.getArgOperand(2)->getType() };
- CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
- Changed = true;
- }
- }
-
- if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
- // memmove(x,x,size) -> noop.
- if (MTI->getSource() == MTI->getDest())
- return eraseInstFromFunction(CI);
- }
-
- // If we can determine a pointer alignment that is bigger than currently
- // set, update the alignment.
- if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
- if (Instruction *I = SimplifyAnyMemTransfer(MTI))
- return I;
- } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
- if (Instruction *I = SimplifyAnyMemSet(MSI))
- return I;
- }
-
- if (Changed) return II;
- }
-
- // For fixed width vector result intrinsics, use the generic demanded vector
- // support.
- if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
- auto VWidth = IIFVTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
- if (V != II)
- return replaceInstUsesWith(*II, V);
- return II;
- }
- }
-
+ // Don't try to simplify calls without uses. It will not do anything useful,
+ // but will result in the following folds being skipped.
+ if (!CI.use_empty())
+ if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
+
+ if (isFreeCall(&CI, &TLI))
+ return visitFree(CI);
+
+ // If the caller function is nounwind, mark the call as nounwind, even if the
+ // callee isn't.
+ if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
+ CI.setDoesNotThrow();
+ return &CI;
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+ if (!II) return visitCallBase(CI);
+
+  // For atomic unordered mem intrinsics, if the length is not positive or is
+  // not a multiple of the element size, then the behavior is undefined.
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II))
+ if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength()))
+ if (NumBytes->getSExtValue() < 0 ||
+ (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) {
+ CreateNonTerminatorUnreachable(AMI);
+ assert(AMI->getType()->isVoidTy() &&
+ "non void atomic unordered mem intrinsic");
+ return eraseInstFromFunction(*AMI);
+ }
+
+ // Intrinsics cannot occur in an invoke or a callbr, so handle them here
+ // instead of in visitCallBase.
+ if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
+ bool Changed = false;
+
+ // memmove/cpy/set of zero bytes is a noop.
+ if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+ if (NumBytes->isNullValue())
+ return eraseInstFromFunction(CI);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+ if (CI->getZExtValue() == 1) {
+ // Replace the instruction with just byte operations. We would
+ // transform other cases to loads/stores, but we don't know if
+ // alignment is sufficient.
+ }
+ }
+
+ // No other transformations apply to volatile transfers.
+ if (auto *M = dyn_cast<MemIntrinsic>(MI))
+ if (M->isVolatile())
+ return nullptr;
+
+ // If we have a memmove and the source operation is a constant global,
+ // then the source and dest pointers can't alias, so we can change this
+ // into a call to memcpy.
+ if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
+ if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+ if (GVSrc->isConstant()) {
+ Module *M = CI.getModule();
+ Intrinsic::ID MemCpyID =
+ isa<AtomicMemMoveInst>(MMI)
+ ? Intrinsic::memcpy_element_unordered_atomic
+ : Intrinsic::memcpy;
+ Type *Tys[3] = { CI.getArgOperand(0)->getType(),
+ CI.getArgOperand(1)->getType(),
+ CI.getArgOperand(2)->getType() };
+ CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
+ Changed = true;
+ }
+ }
+
+ if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ // memmove(x,x,size) -> noop.
+ if (MTI->getSource() == MTI->getDest())
+ return eraseInstFromFunction(CI);
+ }
+
+ // If we can determine a pointer alignment that is bigger than currently
+ // set, update the alignment.
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemTransfer(MTI))
+ return I;
+ } else if (auto *MSI = dyn_cast<AnyMemSetInst>(MI)) {
+ if (Instruction *I = SimplifyAnyMemSet(MSI))
+ return I;
+ }
+
+ if (Changed) return II;
+ }
+
+ // For fixed width vector result intrinsics, use the generic demanded vector
+ // support.
+ if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
+ auto VWidth = IIFVTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+ if (V != II)
+ return replaceInstUsesWith(*II, V);
+ return II;
+ }
+ }
+
if (II->isCommutative()) {
if (CallInst *NewCall = canonicalizeConstantArg0ToArg1(CI))
return NewCall;
}
-
- Intrinsic::ID IID = II->getIntrinsicID();
- switch (IID) {
- case Intrinsic::objectsize:
- if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
- return replaceInstUsesWith(CI, V);
- return nullptr;
+
+ Intrinsic::ID IID = II->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::objectsize:
+ if (Value *V = lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/false))
+ return replaceInstUsesWith(CI, V);
+ return nullptr;
case Intrinsic::abs: {
Value *IIOperand = II->getArgOperand(0);
bool IntMinIsPoison = cast<Constant>(II->getArgOperand(1))->isOneValue();
@@ -854,444 +854,444 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
}
break;
}
- case Intrinsic::bswap: {
- Value *IIOperand = II->getArgOperand(0);
- Value *X = nullptr;
-
- // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
- if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
+ case Intrinsic::bswap: {
+ Value *IIOperand = II->getArgOperand(0);
+ Value *X = nullptr;
+
+ // bswap(trunc(bswap(x))) -> trunc(lshr(x, c))
+ if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) {
unsigned C = X->getType()->getScalarSizeInBits() -
IIOperand->getType()->getScalarSizeInBits();
- Value *CV = ConstantInt::get(X->getType(), C);
- Value *V = Builder.CreateLShr(X, CV);
- return new TruncInst(V, IIOperand->getType());
- }
- break;
- }
- case Intrinsic::masked_load:
- if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
- return replaceInstUsesWith(CI, SimplifiedMaskedOp);
- break;
- case Intrinsic::masked_store:
- return simplifyMaskedStore(*II);
- case Intrinsic::masked_gather:
- return simplifyMaskedGather(*II);
- case Intrinsic::masked_scatter:
- return simplifyMaskedScatter(*II);
- case Intrinsic::launder_invariant_group:
- case Intrinsic::strip_invariant_group:
- if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
- return replaceInstUsesWith(*II, SkippedBarrier);
- break;
- case Intrinsic::powi:
- if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
- // 0 and 1 are handled in instsimplify
- // powi(x, -1) -> 1/x
- if (Power->isMinusOne())
+ Value *CV = ConstantInt::get(X->getType(), C);
+ Value *V = Builder.CreateLShr(X, CV);
+ return new TruncInst(V, IIOperand->getType());
+ }
+ break;
+ }
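An illustrative check, on ordinary integers rather than IR, of the bswap fold above for a 32-bit source truncated to 16 bits, where c = 32 - 16 = 16: bswap16(trunc16(bswap32(x))) == trunc16(x >> 16). The bswap helpers are hand-written stand-ins.

    #include <cassert>
    #include <cstdint>

    static uint32_t bswap32(uint32_t x) {
      return (x >> 24) | ((x >> 8) & 0x0000FF00u) |
             ((x << 8) & 0x00FF0000u) | (x << 24);
    }
    static uint16_t bswap16(uint16_t x) {
      return uint16_t((x >> 8) | (x << 8));
    }

    int main() {
      uint32_t samples[] = {0x12345678u, 0xDEADBEEFu, 0x00000001u, 0xFF000000u};
      for (uint32_t x : samples)
        assert(bswap16(uint16_t(bswap32(x))) == uint16_t(x >> 16));
    }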
+ case Intrinsic::masked_load:
+ if (Value *SimplifiedMaskedOp = simplifyMaskedLoad(*II))
+ return replaceInstUsesWith(CI, SimplifiedMaskedOp);
+ break;
+ case Intrinsic::masked_store:
+ return simplifyMaskedStore(*II);
+ case Intrinsic::masked_gather:
+ return simplifyMaskedGather(*II);
+ case Intrinsic::masked_scatter:
+ return simplifyMaskedScatter(*II);
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ if (auto *SkippedBarrier = simplifyInvariantGroupIntrinsic(*II, *this))
+ return replaceInstUsesWith(*II, SkippedBarrier);
+ break;
+ case Intrinsic::powi:
+ if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) {
+ // 0 and 1 are handled in instsimplify
+ // powi(x, -1) -> 1/x
+ if (Power->isMinusOne())
return BinaryOperator::CreateFDivFMF(ConstantFP::get(CI.getType(), 1.0),
II->getArgOperand(0), II);
- // powi(x, 2) -> x*x
- if (Power->equalsInt(2))
+ // powi(x, 2) -> x*x
+ if (Power->equalsInt(2))
return BinaryOperator::CreateFMulFMF(II->getArgOperand(0),
II->getArgOperand(0), II);
- }
- break;
-
- case Intrinsic::cttz:
- case Intrinsic::ctlz:
- if (auto *I = foldCttzCtlz(*II, *this))
- return I;
- break;
-
- case Intrinsic::ctpop:
- if (auto *I = foldCtpop(*II, *this))
- return I;
- break;
-
- case Intrinsic::fshl:
- case Intrinsic::fshr: {
- Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
- Type *Ty = II->getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- Constant *ShAmtC;
+ }
+ break;
+
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ if (auto *I = foldCttzCtlz(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::ctpop:
+ if (auto *I = foldCtpop(*II, *this))
+ return I;
+ break;
+
+ case Intrinsic::fshl:
+ case Intrinsic::fshr: {
+ Value *Op0 = II->getArgOperand(0), *Op1 = II->getArgOperand(1);
+ Type *Ty = II->getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ Constant *ShAmtC;
if (match(II->getArgOperand(2), m_ImmConstant(ShAmtC)) &&
!ShAmtC->containsConstantExpression()) {
- // Canonicalize a shift amount constant operand to modulo the bit-width.
- Constant *WidthC = ConstantInt::get(Ty, BitWidth);
- Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
- if (ModuloC != ShAmtC)
- return replaceOperand(*II, 2, ModuloC);
-
- assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
- ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
- "Shift amount expected to be modulo bitwidth");
-
- // Canonicalize funnel shift right by constant to funnel shift left. This
- // is not entirely arbitrary. For historical reasons, the backend may
- // recognize rotate left patterns but miss rotate right patterns.
- if (IID == Intrinsic::fshr) {
- // fshr X, Y, C --> fshl X, Y, (BitWidth - C)
- Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
- Module *Mod = II->getModule();
- Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
- return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
- }
- assert(IID == Intrinsic::fshl &&
- "All funnel shifts by simple constants should go left");
-
- // fshl(X, 0, C) --> shl X, C
- // fshl(X, undef, C) --> shl X, C
- if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
- return BinaryOperator::CreateShl(Op0, ShAmtC);
-
- // fshl(0, X, C) --> lshr X, (BW-C)
- // fshl(undef, X, C) --> lshr X, (BW-C)
- if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
- return BinaryOperator::CreateLShr(Op1,
- ConstantExpr::getSub(WidthC, ShAmtC));
-
- // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
- if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
- Module *Mod = II->getModule();
- Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
- return CallInst::Create(Bswap, { Op0 });
- }
- }
-
- // Left or right might be masked.
- if (SimplifyDemandedInstructionBits(*II))
- return &CI;
-
- // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
- // so only the low bits of the shift amount are demanded if the bitwidth is
- // a power-of-2.
- if (!isPowerOf2_32(BitWidth))
- break;
- APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
- KnownBits Op2Known(BitWidth);
- if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
- return &CI;
- break;
- }
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::sadd_with_overflow: {
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
-
- // Given 2 constant operands whose sum does not overflow:
- // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
- // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
- Value *X;
- const APInt *C0, *C1;
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- bool IsSigned = IID == Intrinsic::sadd_with_overflow;
- bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
- : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
- if (HasNWAdd && match(Arg1, m_APInt(C1))) {
- bool Overflow;
- APInt NewC =
- IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
- if (!Overflow)
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- IID, X, ConstantInt::get(Arg1->getType(), NewC)));
- }
- break;
- }
-
- case Intrinsic::umul_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::usub_with_overflow:
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
- break;
-
- case Intrinsic::ssub_with_overflow: {
- if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
- return I;
-
- Constant *C;
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- // Given a constant C that is not the minimum signed value
- // for an integer of a given bit width:
- //
- // ssubo X, C -> saddo X, -C
- if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
- Value *NegVal = ConstantExpr::getNeg(C);
- // Build a saddo call that is equivalent to the discovered
- // ssubo call.
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
- Arg0, NegVal));
- }
-
- break;
- }
-
- case Intrinsic::uadd_sat:
- case Intrinsic::sadd_sat:
- case Intrinsic::usub_sat:
- case Intrinsic::ssub_sat: {
- SaturatingInst *SI = cast<SaturatingInst>(II);
- Type *Ty = SI->getType();
- Value *Arg0 = SI->getLHS();
- Value *Arg1 = SI->getRHS();
-
- // Make use of known overflow information.
- OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
- Arg0, Arg1, SI);
- switch (OR) {
- case OverflowResult::MayOverflow:
- break;
- case OverflowResult::NeverOverflows:
- if (SI->isSigned())
- return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
- else
- return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
- case OverflowResult::AlwaysOverflowsLow: {
- unsigned BitWidth = Ty->getScalarSizeInBits();
- APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
- return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
- }
- case OverflowResult::AlwaysOverflowsHigh: {
- unsigned BitWidth = Ty->getScalarSizeInBits();
- APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
- return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
- }
- }
-
- // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
- Constant *C;
- if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
- C->isNotMinSignedValue()) {
- Value *NegVal = ConstantExpr::getNeg(C);
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- Intrinsic::sadd_sat, Arg0, NegVal));
- }
-
- // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
- // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
- // if Val and Val2 have the same sign
- if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
- Value *X;
- const APInt *Val, *Val2;
- APInt NewVal;
- bool IsUnsigned =
- IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
- if (Other->getIntrinsicID() == IID &&
- match(Arg1, m_APInt(Val)) &&
- match(Other->getArgOperand(0), m_Value(X)) &&
- match(Other->getArgOperand(1), m_APInt(Val2))) {
- if (IsUnsigned)
- NewVal = Val->uadd_sat(*Val2);
- else if (Val->isNonNegative() == Val2->isNonNegative()) {
- bool Overflow;
- NewVal = Val->sadd_ov(*Val2, Overflow);
- if (Overflow) {
- // Both adds together may add more than SignedMaxValue
- // without saturating the final result.
- break;
- }
- } else {
- // Cannot fold saturated addition with different signs.
- break;
- }
-
- return replaceInstUsesWith(
- *II, Builder.CreateBinaryIntrinsic(
- IID, X, ConstantInt::get(II->getType(), NewVal)));
- }
- }
- break;
- }
-
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- case Intrinsic::minimum:
- case Intrinsic::maximum: {
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
- Value *X, *Y;
- if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
- (Arg0->hasOneUse() || Arg1->hasOneUse())) {
- // If both operands are negated, invert the call and negate the result:
- // min(-X, -Y) --> -(max(X, Y))
- // max(-X, -Y) --> -(min(X, Y))
- Intrinsic::ID NewIID;
- switch (IID) {
- case Intrinsic::maxnum:
- NewIID = Intrinsic::minnum;
- break;
- case Intrinsic::minnum:
- NewIID = Intrinsic::maxnum;
- break;
- case Intrinsic::maximum:
- NewIID = Intrinsic::minimum;
- break;
- case Intrinsic::minimum:
- NewIID = Intrinsic::maximum;
- break;
- default:
- llvm_unreachable("unexpected intrinsic ID");
- }
- Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
- Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall);
- FNeg->copyIRFlags(II);
- return FNeg;
- }
-
- // m(m(X, C2), C1) -> m(X, C)
- const APFloat *C1, *C2;
- if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
- if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
- ((match(M->getArgOperand(0), m_Value(X)) &&
- match(M->getArgOperand(1), m_APFloat(C2))) ||
- (match(M->getArgOperand(1), m_Value(X)) &&
- match(M->getArgOperand(0), m_APFloat(C2))))) {
- APFloat Res(0.0);
- switch (IID) {
- case Intrinsic::maxnum:
- Res = maxnum(*C1, *C2);
- break;
- case Intrinsic::minnum:
- Res = minnum(*C1, *C2);
- break;
- case Intrinsic::maximum:
- Res = maximum(*C1, *C2);
- break;
- case Intrinsic::minimum:
- Res = minimum(*C1, *C2);
- break;
- default:
- llvm_unreachable("unexpected intrinsic ID");
- }
- Instruction *NewCall = Builder.CreateBinaryIntrinsic(
- IID, X, ConstantFP::get(Arg0->getType(), Res), II);
- // TODO: Conservatively intersecting FMF. If Res == C2, the transform
- // was a simplification (so Arg0 and its original flags could
- // propagate?)
- NewCall->andIRFlags(M);
- return replaceInstUsesWith(*II, NewCall);
- }
- }
-
- Value *ExtSrc0;
- Value *ExtSrc1;
-
- // minnum (fpext x), (fpext y) -> minnum x, y
- // maxnum (fpext x), (fpext y) -> maxnum x, y
- if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) &&
- match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) &&
- ExtSrc0->getType() == ExtSrc1->getType()) {
- Function *F = Intrinsic::getDeclaration(
- II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()});
- CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 });
- NewCall->copyFastMathFlags(II);
- NewCall->takeName(II);
- return new FPExtInst(NewCall, II->getType());
- }
-
- break;
- }
- case Intrinsic::fmuladd: {
- // Canonicalize fast fmuladd to the separate fmul + fadd.
- if (II->isFast()) {
- BuilderTy::FastMathFlagGuard Guard(Builder);
- Builder.setFastMathFlags(II->getFastMathFlags());
- Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
- II->getArgOperand(1));
- Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
- Add->takeName(II);
- return replaceInstUsesWith(*II, Add);
- }
-
- // Try to simplify the underlying FMul.
- if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
- II->getFastMathFlags(),
- SQ.getWithInstruction(II))) {
- auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
- FAdd->copyFastMathFlags(II);
- return FAdd;
- }
-
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::fma: {
- // fma fneg(x), fneg(y), z -> fma x, y, z
- Value *Src0 = II->getArgOperand(0);
- Value *Src1 = II->getArgOperand(1);
- Value *X, *Y;
- if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
- replaceOperand(*II, 0, X);
- replaceOperand(*II, 1, Y);
- return II;
- }
-
- // fma fabs(x), fabs(x), z -> fma x, x, z
- if (match(Src0, m_FAbs(m_Value(X))) &&
- match(Src1, m_FAbs(m_Specific(X)))) {
- replaceOperand(*II, 0, X);
- replaceOperand(*II, 1, X);
- return II;
- }
-
- // Try to simplify the underlying FMul. We can only apply simplifications
- // that do not require rounding.
- if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
- II->getFastMathFlags(),
- SQ.getWithInstruction(II))) {
- auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
- FAdd->copyFastMathFlags(II);
- return FAdd;
- }
-
- // fma x, y, 0 -> fmul x, y
- // This is always valid for -0.0, but requires nsz for +0.0 as
- // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
- if (match(II->getArgOperand(2), m_NegZeroFP()) ||
- (match(II->getArgOperand(2), m_PosZeroFP()) &&
- II->getFastMathFlags().noSignedZeros()))
- return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
-
- break;
- }
- case Intrinsic::copysign: {
+ // Canonicalize a shift amount constant operand to modulo the bit-width.
+ Constant *WidthC = ConstantInt::get(Ty, BitWidth);
+ Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
+ if (ModuloC != ShAmtC)
+ return replaceOperand(*II, 2, ModuloC);
+
+ assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
+ ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
+ "Shift amount expected to be modulo bitwidth");
+
+ // Canonicalize funnel shift right by constant to funnel shift left. This
+ // is not entirely arbitrary. For historical reasons, the backend may
+ // recognize rotate left patterns but miss rotate right patterns.
+ if (IID == Intrinsic::fshr) {
+ // fshr X, Y, C --> fshl X, Y, (BitWidth - C)
+ Constant *LeftShiftC = ConstantExpr::getSub(WidthC, ShAmtC);
+ Module *Mod = II->getModule();
+ Function *Fshl = Intrinsic::getDeclaration(Mod, Intrinsic::fshl, Ty);
+ return CallInst::Create(Fshl, { Op0, Op1, LeftShiftC });
+ }
+ assert(IID == Intrinsic::fshl &&
+ "All funnel shifts by simple constants should go left");
+
+ // fshl(X, 0, C) --> shl X, C
+ // fshl(X, undef, C) --> shl X, C
+ if (match(Op1, m_ZeroInt()) || match(Op1, m_Undef()))
+ return BinaryOperator::CreateShl(Op0, ShAmtC);
+
+ // fshl(0, X, C) --> lshr X, (BW-C)
+ // fshl(undef, X, C) --> lshr X, (BW-C)
+ if (match(Op0, m_ZeroInt()) || match(Op0, m_Undef()))
+ return BinaryOperator::CreateLShr(Op1,
+ ConstantExpr::getSub(WidthC, ShAmtC));
+
+ // fshl i16 X, X, 8 --> bswap i16 X (reduce to more-specific form)
+ if (Op0 == Op1 && BitWidth == 16 && match(ShAmtC, m_SpecificInt(8))) {
+ Module *Mod = II->getModule();
+ Function *Bswap = Intrinsic::getDeclaration(Mod, Intrinsic::bswap, Ty);
+ return CallInst::Create(Bswap, { Op0 });
+ }
+ }
+
+ // Left or right might be masked.
+ if (SimplifyDemandedInstructionBits(*II))
+ return &CI;
+
+ // The shift amount (operand 2) of a funnel shift is modulo the bitwidth,
+ // so only the low bits of the shift amount are demanded if the bitwidth is
+ // a power-of-2.
+ if (!isPowerOf2_32(BitWidth))
+ break;
+ APInt Op2Demanded = APInt::getLowBitsSet(BitWidth, Log2_32_Ceil(BitWidth));
+ KnownBits Op2Known(BitWidth);
+ if (SimplifyDemandedBits(II, 2, Op2Demanded, Op2Known))
+ return &CI;
+ break;
+ }
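A small check of the funnel-shift canonicalization above for the rotate case (fshl/fshr with both inputs equal): rotr(x, c) == rotl(x, 32 - c) for 0 < c < 32, mirroring "fshr X, Y, C --> fshl X, Y, (BitWidth - C)". The rotate helpers are written out by hand for the sketch.

    #include <cassert>
    #include <cstdint>

    static uint32_t rotl32(uint32_t x, unsigned c) { return (x << c) | (x >> (32 - c)); }
    static uint32_t rotr32(uint32_t x, unsigned c) { return (x >> c) | (x << (32 - c)); }

    int main() {
      uint32_t x = 0xCAFEBABEu;
      for (unsigned c = 1; c < 32; ++c)          // c = 0 and c = 32 avoided (shift UB)
        assert(rotr32(x, c) == rotl32(x, 32 - c));
    }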
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow: {
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+
+ // Given 2 constant operands whose sum does not overflow:
+ // uaddo (X +nuw C0), C1 -> uaddo X, C0 + C1
+ // saddo (X +nsw C0), C1 -> saddo X, C0 + C1
+ Value *X;
+ const APInt *C0, *C1;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ bool IsSigned = IID == Intrinsic::sadd_with_overflow;
+ bool HasNWAdd = IsSigned ? match(Arg0, m_NSWAdd(m_Value(X), m_APInt(C0)))
+ : match(Arg0, m_NUWAdd(m_Value(X), m_APInt(C0)));
+ if (HasNWAdd && match(Arg1, m_APInt(C1))) {
+ bool Overflow;
+ APInt NewC =
+ IsSigned ? C1->sadd_ov(*C0, Overflow) : C1->uadd_ov(*C0, Overflow);
+ if (!Overflow)
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantInt::get(Arg1->getType(), NewC)));
+ }
+ break;
+ }
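
The uaddo/saddo constant-merging fold above is only legal when the two constants can themselves be added without overflow, which is exactly what the APInt uadd_ov/sadd_ov calls check. A small standalone sketch of the same test for the unsigned case, using the GCC/Clang __builtin_add_overflow builtin purely as an illustration (canMergeUAddConstants is a made-up helper name, not part of InstCombine):

    #include <cstdint>
    #include <cstdio>

    // uaddo (X +nuw C0), C1 --> uaddo X, (C0 + C1) is only sound when
    // C0 + C1 itself does not wrap; otherwise the merged constant is wrong.
    static bool canMergeUAddConstants(uint32_t C0, uint32_t C1, uint32_t &Merged) {
      return !__builtin_add_overflow(C0, C1, &Merged);
    }

    int main() {
      uint32_t Merged = 0;
      std::printf("%d\n", canMergeUAddConstants(10u, 20u, Merged));           // 1: fold to uaddo X, 30
      std::printf("%d\n", canMergeUAddConstants(0xFFFFFFF0u, 0x20u, Merged)); // 0: keep both adds
      return 0;
    }
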
+
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+ break;
+
+ case Intrinsic::ssub_with_overflow: {
+ if (Instruction *I = foldIntrinsicWithOverflowCommon(II))
+ return I;
+
+ Constant *C;
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ // Given a constant C that is not the minimum signed value
+ // for an integer of a given bit width:
+ //
+ // ssubo X, C -> saddo X, -C
+ if (match(Arg1, m_Constant(C)) && C->isNotMinSignedValue()) {
+ Value *NegVal = ConstantExpr::getNeg(C);
+ // Build a saddo call that is equivalent to the discovered
+ // ssubo call.
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow,
+ Arg0, NegVal));
+ }
+
+ break;
+ }
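
The ssubo rewrite relies on X - C and X + (-C) overflowing in exactly the same cases, which holds whenever C is not the minimum signed value (so that -C is representable). A hedged standalone sketch of that equivalence using GCC/Clang overflow builtins, not the InstCombine code path itself:

    #include <cassert>
    #include <climits>

    int main() {
      int X = 40, C = 7, Sub = 0, Add = 0;
      // ssubo X, C --> saddo X, -C is valid because both overflow in exactly
      // the same cases, provided C != INT_MIN so that -C is representable.
      assert(C != INT_MIN);
      bool SubOv = __builtin_sub_overflow(X, C, &Sub);
      bool AddOv = __builtin_add_overflow(X, -C, &Add);
      assert(SubOv == AddOv && Sub == Add);
      return 0;
    }
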
+
+ case Intrinsic::uadd_sat:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::usub_sat:
+ case Intrinsic::ssub_sat: {
+ SaturatingInst *SI = cast<SaturatingInst>(II);
+ Type *Ty = SI->getType();
+ Value *Arg0 = SI->getLHS();
+ Value *Arg1 = SI->getRHS();
+
+ // Make use of known overflow information.
+ OverflowResult OR = computeOverflow(SI->getBinaryOp(), SI->isSigned(),
+ Arg0, Arg1, SI);
+ switch (OR) {
+ case OverflowResult::MayOverflow:
+ break;
+ case OverflowResult::NeverOverflows:
+ if (SI->isSigned())
+ return BinaryOperator::CreateNSW(SI->getBinaryOp(), Arg0, Arg1);
+ else
+ return BinaryOperator::CreateNUW(SI->getBinaryOp(), Arg0, Arg1);
+ case OverflowResult::AlwaysOverflowsLow: {
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ APInt Min = APSInt::getMinValue(BitWidth, !SI->isSigned());
+ return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Min));
+ }
+ case OverflowResult::AlwaysOverflowsHigh: {
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ APInt Max = APSInt::getMaxValue(BitWidth, !SI->isSigned());
+ return replaceInstUsesWith(*SI, ConstantInt::get(Ty, Max));
+ }
+ }
+
+ // ssub.sat(X, C) -> sadd.sat(X, -C) if C != MIN
+ Constant *C;
+ if (IID == Intrinsic::ssub_sat && match(Arg1, m_Constant(C)) &&
+ C->isNotMinSignedValue()) {
+ Value *NegVal = ConstantExpr::getNeg(C);
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ Intrinsic::sadd_sat, Arg0, NegVal));
+ }
+
+ // sat(sat(X + Val2) + Val) -> sat(X + (Val+Val2))
+ // sat(sat(X - Val2) - Val) -> sat(X - (Val+Val2))
+ // if Val and Val2 have the same sign
+ if (auto *Other = dyn_cast<IntrinsicInst>(Arg0)) {
+ Value *X;
+ const APInt *Val, *Val2;
+ APInt NewVal;
+ bool IsUnsigned =
+ IID == Intrinsic::uadd_sat || IID == Intrinsic::usub_sat;
+ if (Other->getIntrinsicID() == IID &&
+ match(Arg1, m_APInt(Val)) &&
+ match(Other->getArgOperand(0), m_Value(X)) &&
+ match(Other->getArgOperand(1), m_APInt(Val2))) {
+ if (IsUnsigned)
+ NewVal = Val->uadd_sat(*Val2);
+ else if (Val->isNonNegative() == Val2->isNonNegative()) {
+ bool Overflow;
+ NewVal = Val->sadd_ov(*Val2, Overflow);
+ if (Overflow) {
+ // Both adds together may add more than SignedMaxValue
+ // without saturating the final result.
+ break;
+ }
+ } else {
+ // Cannot fold saturated addition with different signs.
+ break;
+ }
+
+ return replaceInstUsesWith(
+ *II, Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantInt::get(II->getType(), NewVal)));
+ }
+ }
+ break;
+ }
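
For the nested-saturation fold, the unsigned case can always merge the two constants with a saturating add, because clamping after each step or once at the end gives the same result. A small reference sketch over 8-bit values that checks this identity exhaustively in X (uadd_sat8 is a helper written here for illustration only):

    #include <cassert>
    #include <cstdint>

    // Reference 8-bit unsigned saturating add (clamps to 255 on overflow).
    static uint8_t uadd_sat8(uint8_t A, uint8_t B) {
      unsigned Sum = unsigned(A) + unsigned(B);
      return Sum > 0xFFu ? uint8_t(0xFF) : uint8_t(Sum);
    }

    int main() {
      // uadd.sat(uadd.sat(X, C2), C1) == uadd.sat(X, uadd.sat(C1, C2)):
      // saturating early or late produces the same clamped result.
      for (unsigned X = 0; X <= 0xFFu; ++X)
        for (unsigned C = 0; C <= 0xFFu; C += 17)
          assert(uadd_sat8(uadd_sat8(uint8_t(X), uint8_t(C)), 100) ==
                 uadd_sat8(uint8_t(X), uadd_sat8(100, uint8_t(C))));
      return 0;
    }
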
+
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximum: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+ Value *X, *Y;
+ if (match(Arg0, m_FNeg(m_Value(X))) && match(Arg1, m_FNeg(m_Value(Y))) &&
+ (Arg0->hasOneUse() || Arg1->hasOneUse())) {
+ // If both operands are negated, invert the call and negate the result:
+ // min(-X, -Y) --> -(max(X, Y))
+ // max(-X, -Y) --> -(min(X, Y))
+ Intrinsic::ID NewIID;
+ switch (IID) {
+ case Intrinsic::maxnum:
+ NewIID = Intrinsic::minnum;
+ break;
+ case Intrinsic::minnum:
+ NewIID = Intrinsic::maxnum;
+ break;
+ case Intrinsic::maximum:
+ NewIID = Intrinsic::minimum;
+ break;
+ case Intrinsic::minimum:
+ NewIID = Intrinsic::maximum;
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
+ Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall);
+ FNeg->copyIRFlags(II);
+ return FNeg;
+ }
+
+ // m(m(X, C2), C1) -> m(X, C)
+ const APFloat *C1, *C2;
+ if (auto *M = dyn_cast<IntrinsicInst>(Arg0)) {
+ if (M->getIntrinsicID() == IID && match(Arg1, m_APFloat(C1)) &&
+ ((match(M->getArgOperand(0), m_Value(X)) &&
+ match(M->getArgOperand(1), m_APFloat(C2))) ||
+ (match(M->getArgOperand(1), m_Value(X)) &&
+ match(M->getArgOperand(0), m_APFloat(C2))))) {
+ APFloat Res(0.0);
+ switch (IID) {
+ case Intrinsic::maxnum:
+ Res = maxnum(*C1, *C2);
+ break;
+ case Intrinsic::minnum:
+ Res = minnum(*C1, *C2);
+ break;
+ case Intrinsic::maximum:
+ Res = maximum(*C1, *C2);
+ break;
+ case Intrinsic::minimum:
+ Res = minimum(*C1, *C2);
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic ID");
+ }
+ Instruction *NewCall = Builder.CreateBinaryIntrinsic(
+ IID, X, ConstantFP::get(Arg0->getType(), Res), II);
+ // TODO: Conservatively intersecting FMF. If Res == C2, the transform
+ // was a simplification (so Arg0 and its original flags could
+ // propagate?)
+ NewCall->andIRFlags(M);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+ }
+
+ Value *ExtSrc0;
+ Value *ExtSrc1;
+
+ // minnum (fpext x), (fpext y) -> minnum x, y
+ // maxnum (fpext x), (fpext y) -> maxnum x, y
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) &&
+ match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) &&
+ ExtSrc0->getType() == ExtSrc1->getType()) {
+ Function *F = Intrinsic::getDeclaration(
+ II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()});
+ CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 });
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return new FPExtInst(NewCall, II->getType());
+ }
+
+ break;
+ }
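
The negation fold above uses the identity max(-X, -Y) == -(min(X, Y)) and its dual. The C library's fmin/fmax have essentially the minnum/maxnum quiet-NaN behaviour, so a quick standalone check of the identity on ordinary values looks like this (illustrative only):

    #include <cassert>
    #include <cmath>

    int main() {
      double X = 1.5, Y = -2.25;
      // maxnum(-X, -Y) --> -(minnum(X, Y)) and minnum(-X, -Y) --> -(maxnum(X, Y)).
      assert(std::fmax(-X, -Y) == -std::fmin(X, Y));
      assert(std::fmin(-X, -Y) == -std::fmax(X, Y));
      return 0;
    }
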
+ case Intrinsic::fmuladd: {
+ // Canonicalize fast fmuladd to the separate fmul + fadd.
+ if (II->isFast()) {
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ Builder.setFastMathFlags(II->getFastMathFlags());
+ Value *Mul = Builder.CreateFMul(II->getArgOperand(0),
+ II->getArgOperand(1));
+ Value *Add = Builder.CreateFAdd(Mul, II->getArgOperand(2));
+ Add->takeName(II);
+ return replaceInstUsesWith(*II, Add);
+ }
+
+ // Try to simplify the underlying FMul.
+ if (Value *V = SimplifyFMulInst(II->getArgOperand(0), II->getArgOperand(1),
+ II->getFastMathFlags(),
+ SQ.getWithInstruction(II))) {
+ auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::fma: {
+ // fma fneg(x), fneg(y), z -> fma x, y, z
+ Value *Src0 = II->getArgOperand(0);
+ Value *Src1 = II->getArgOperand(1);
+ Value *X, *Y;
+ if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, Y);
+ return II;
+ }
+
+ // fma fabs(x), fabs(x), z -> fma x, x, z
+ if (match(Src0, m_FAbs(m_Value(X))) &&
+ match(Src1, m_FAbs(m_Specific(X)))) {
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, X);
+ return II;
+ }
+
+ // Try to simplify the underlying FMul. We can only apply simplifications
+ // that do not require rounding.
+ if (Value *V = SimplifyFMAFMul(II->getArgOperand(0), II->getArgOperand(1),
+ II->getFastMathFlags(),
+ SQ.getWithInstruction(II))) {
+ auto *FAdd = BinaryOperator::CreateFAdd(V, II->getArgOperand(2));
+ FAdd->copyFastMathFlags(II);
+ return FAdd;
+ }
+
+ // fma x, y, 0 -> fmul x, y
+ // This is always valid for -0.0, but requires nsz for +0.0 as
+ // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
+ if (match(II->getArgOperand(2), m_NegZeroFP()) ||
+ (match(II->getArgOperand(2), m_PosZeroFP()) &&
+ II->getFastMathFlags().noSignedZeros()))
+ return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
+
+ break;
+ }
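
The fma-with-zero fold depends on signed-zero behaviour: adding -0.0 never changes the sign of the product, while adding +0.0 can turn a -0.0 product into +0.0, which is why the +0.0 form needs the nsz flag. A standalone sketch demonstrating that corner case with std::fma (an illustration of the rule, not the transform itself):

    #include <cassert>
    #include <cmath>

    int main() {
      double X = -0.25, Y = 0.0;   // the product X * Y is -0.0
      // fma x, y, -0.0 --> fmul x, y: adding -0.0 never flips the product's sign.
      assert(std::signbit(std::fma(X, Y, -0.0)) == std::signbit(X * Y));
      // fma x, y, +0.0 differs without nsz: -0.0 + 0.0 rounds to +0.0, so the
      // sign bit of the result no longer matches the plain multiply.
      assert(std::signbit(std::fma(X, Y, +0.0)) != std::signbit(X * Y));
      return 0;
    }
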
+ case Intrinsic::copysign: {
Value *Mag = II->getArgOperand(0), *Sign = II->getArgOperand(1);
if (SignBitMustBeZero(Sign, &TLI)) {
- // If we know that the sign argument is positive, reduce to FABS:
+ // If we know that the sign argument is positive, reduce to FABS:
// copysign Mag, +Sign --> fabs Mag
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Mag, II);
- return replaceInstUsesWith(*II, Fabs);
- }
- // TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
- const APFloat *C;
+ return replaceInstUsesWith(*II, Fabs);
+ }
+ // TODO: There should be a ValueTracking sibling like SignBitMustBeOne.
+ const APFloat *C;
if (match(Sign, m_APFloat(C)) && C->isNegative()) {
- // If we know that the sign argument is negative, reduce to FNABS:
+ // If we know that the sign argument is negative, reduce to FNABS:
// copysign Mag, -Sign --> fneg (fabs Mag)
Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, Mag, II);
- return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
- }
-
- // Propagate sign argument through nested calls:
+ return replaceInstUsesWith(*II, Builder.CreateFNegFMF(Fabs, II));
+ }
+
+ // Propagate sign argument through nested calls:
// copysign Mag, (copysign ?, X) --> copysign Mag, X
Value *X;
if (match(Sign, m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(X))))
return replaceOperand(*II, 1, X);
-
+
// Peek through changes of magnitude's sign-bit. This call rewrites those:
// copysign (fabs X), Sign --> copysign X, Sign
// copysign (fneg X), Sign --> copysign X, Sign
if (match(Mag, m_FAbs(m_Value(X))) || match(Mag, m_FNeg(m_Value(X))))
return replaceOperand(*II, 0, X);
- break;
- }
- case Intrinsic::fabs: {
+ break;
+ }
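
The copysign reductions use the fact that a known-positive sign operand yields fabs, a known-negative one yields the negated fabs, and only the sign of a nested copysign matters. A quick standalone check with the C library equivalents of the intrinsics:

    #include <cassert>
    #include <cmath>

    int main() {
      double Mag = -3.5;
      // copysign Mag, +Sign --> fabs Mag
      assert(std::copysign(Mag, 2.0) == std::fabs(Mag));
      // copysign Mag, -Sign --> fneg (fabs Mag)
      assert(std::copysign(Mag, -2.0) == -std::fabs(Mag));
      // copysign Mag, (copysign ?, X) --> copysign Mag, X
      assert(std::copysign(Mag, std::copysign(7.0, -1.0)) == std::copysign(Mag, -1.0));
      return 0;
    }
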
+ case Intrinsic::fabs: {
Value *Cond, *TVal, *FVal;
- if (match(II->getArgOperand(0),
+ if (match(II->getArgOperand(0),
m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))) {
// fabs (select Cond, TrueC, FalseC) --> select Cond, AbsT, AbsF
if (isa<Constant>(TVal) && isa<Constant>(FVal)) {
@@ -1305,276 +1305,276 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
// fabs (select Cond, TVal, -TVal) --> fabs TVal
if (match(FVal, m_FNeg(m_Specific(TVal))))
return replaceOperand(*II, 0, TVal);
- }
-
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::ceil:
- case Intrinsic::floor:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::trunc: {
- Value *ExtSrc;
- if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
- // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
- Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
- return new FPExtInst(NarrowII, II->getType());
- }
- break;
- }
- case Intrinsic::cos:
- case Intrinsic::amdgcn_cos: {
- Value *X;
- Value *Src = II->getArgOperand(0);
- if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
- // cos(-x) -> cos(x)
- // cos(fabs(x)) -> cos(x)
- return replaceOperand(*II, 0, X);
- }
- break;
- }
- case Intrinsic::sin: {
- Value *X;
- if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
- // sin(-x) --> -sin(x)
- Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
- Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin);
- FNeg->copyFastMathFlags(II);
- return FNeg;
- }
- break;
- }
-
- case Intrinsic::arm_neon_vtbl1:
- case Intrinsic::aarch64_neon_tbl1:
- if (Value *V = simplifyNeonTbl1(*II, Builder))
- return replaceInstUsesWith(*II, V);
- break;
-
- case Intrinsic::arm_neon_vmulls:
- case Intrinsic::arm_neon_vmullu:
- case Intrinsic::aarch64_neon_smull:
- case Intrinsic::aarch64_neon_umull: {
- Value *Arg0 = II->getArgOperand(0);
- Value *Arg1 = II->getArgOperand(1);
-
- // Handle mul by zero first:
- if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
- return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
- }
-
- // Check for constant LHS & RHS - in this case we just simplify.
- bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
- IID == Intrinsic::aarch64_neon_umull);
- VectorType *NewVT = cast<VectorType>(II->getType());
- if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
- if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
- CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
- CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
-
- return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
- }
-
- // Couldn't simplify - canonicalize constant to the RHS.
- std::swap(Arg0, Arg1);
- }
-
- // Handle mul by one:
- if (Constant *CV1 = dyn_cast<Constant>(Arg1))
- if (ConstantInt *Splat =
- dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
- if (Splat->isOne())
- return CastInst::CreateIntegerCast(Arg0, II->getType(),
- /*isSigned=*/!Zext);
-
- break;
- }
- case Intrinsic::arm_neon_aesd:
- case Intrinsic::arm_neon_aese:
- case Intrinsic::aarch64_crypto_aesd:
- case Intrinsic::aarch64_crypto_aese: {
- Value *DataArg = II->getArgOperand(0);
- Value *KeyArg = II->getArgOperand(1);
-
- // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
- Value *Data, *Key;
- if (match(KeyArg, m_ZeroInt()) &&
- match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
- replaceOperand(*II, 0, Data);
- replaceOperand(*II, 1, Key);
- return II;
- }
- break;
- }
- case Intrinsic::hexagon_V6_vandvrt:
- case Intrinsic::hexagon_V6_vandvrt_128B: {
- // Simplify Q -> V -> Q conversion.
- if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- Intrinsic::ID ID0 = Op0->getIntrinsicID();
- if (ID0 != Intrinsic::hexagon_V6_vandqrt &&
- ID0 != Intrinsic::hexagon_V6_vandqrt_128B)
- break;
- Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1);
- uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue();
- uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue();
- // Check if every byte has common bits in Bytes and Mask.
- uint64_t C = Bytes1 & Mask1;
- if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000))
- return replaceInstUsesWith(*II, Op0->getArgOperand(0));
- }
- break;
- }
- case Intrinsic::stackrestore: {
- // If the save is right next to the restore, remove the restore. This can
- // happen when variable allocas are DCE'd.
- if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
- if (SS->getIntrinsicID() == Intrinsic::stacksave) {
- // Skip over debug info.
- if (SS->getNextNonDebugInstruction() == II) {
- return eraseInstFromFunction(CI);
- }
- }
- }
-
- // Scan down this block to see if there is another stack restore in the
- // same block without an intervening call/alloca.
- BasicBlock::iterator BI(II);
- Instruction *TI = II->getParent()->getTerminator();
- bool CannotRemove = false;
- for (++BI; &*BI != TI; ++BI) {
- if (isa<AllocaInst>(BI)) {
- CannotRemove = true;
- break;
- }
- if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
- if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
- // If there is a stackrestore below this one, remove this one.
- if (II2->getIntrinsicID() == Intrinsic::stackrestore)
- return eraseInstFromFunction(CI);
-
- // Bail if we cross over an intrinsic with side effects, such as
- // llvm.stacksave, or llvm.read_register.
- if (II2->mayHaveSideEffects()) {
- CannotRemove = true;
- break;
- }
- } else {
- // If we found a non-intrinsic call, we can't remove the stack
- // restore.
- CannotRemove = true;
- break;
- }
- }
- }
-
- // If the stack restore is in a return, resume, or unwind block and if there
- // are no allocas or calls between the restore and the return, nuke the
- // restore.
- if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
- return eraseInstFromFunction(CI);
- break;
- }
- case Intrinsic::lifetime_end:
-    // ASan needs to poison memory to detect invalid accesses, which are
-    // possible even for an empty lifetime range.
- if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
- II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
- II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
- break;
-
- if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) {
- return I.getIntrinsicID() == Intrinsic::lifetime_start;
- }))
- return nullptr;
- break;
- case Intrinsic::assume: {
- Value *IIOperand = II->getArgOperand(0);
+ }
+
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::trunc: {
+ Value *ExtSrc;
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc))))) {
+ // Narrow the call: intrinsic (fpext x) -> fpext (intrinsic x)
+ Value *NarrowII = Builder.CreateUnaryIntrinsic(IID, ExtSrc, II);
+ return new FPExtInst(NarrowII, II->getType());
+ }
+ break;
+ }
+ case Intrinsic::cos:
+ case Intrinsic::amdgcn_cos: {
+ Value *X;
+ Value *Src = II->getArgOperand(0);
+ if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
+ // cos(-x) -> cos(x)
+ // cos(fabs(x)) -> cos(x)
+ return replaceOperand(*II, 0, X);
+ }
+ break;
+ }
+ case Intrinsic::sin: {
+ Value *X;
+ if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
+ // sin(-x) --> -sin(x)
+ Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin);
+ FNeg->copyFastMathFlags(II);
+ return FNeg;
+ }
+ break;
+ }
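
The trigonometric folds rely on cosine being even and sine being odd. A tiny standalone sketch that prints both sides of each rewrite; typical libm implementations preserve these symmetries bit-for-bit, though that is an implementation detail rather than a guarantee:

    #include <cmath>
    #include <cstdio>

    int main() {
      double X = 0.75;
      // cos(-x) --> cos(x), cos(fabs(x)) --> cos(x), sin(-x) --> -sin(x)
      std::printf("%a %a\n", std::cos(-X), std::cos(X));
      std::printf("%a %a\n", std::cos(std::fabs(-X)), std::cos(X));
      std::printf("%a %a\n", std::sin(-X), -std::sin(X));
      return 0;
    }
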
+
+ case Intrinsic::arm_neon_vtbl1:
+ case Intrinsic::aarch64_neon_tbl1:
+ if (Value *V = simplifyNeonTbl1(*II, Builder))
+ return replaceInstUsesWith(*II, V);
+ break;
+
+ case Intrinsic::arm_neon_vmulls:
+ case Intrinsic::arm_neon_vmullu:
+ case Intrinsic::aarch64_neon_smull:
+ case Intrinsic::aarch64_neon_umull: {
+ Value *Arg0 = II->getArgOperand(0);
+ Value *Arg1 = II->getArgOperand(1);
+
+ // Handle mul by zero first:
+ if (isa<ConstantAggregateZero>(Arg0) || isa<ConstantAggregateZero>(Arg1)) {
+ return replaceInstUsesWith(CI, ConstantAggregateZero::get(II->getType()));
+ }
+
+ // Check for constant LHS & RHS - in this case we just simplify.
+ bool Zext = (IID == Intrinsic::arm_neon_vmullu ||
+ IID == Intrinsic::aarch64_neon_umull);
+ VectorType *NewVT = cast<VectorType>(II->getType());
+ if (Constant *CV0 = dyn_cast<Constant>(Arg0)) {
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1)) {
+ CV0 = ConstantExpr::getIntegerCast(CV0, NewVT, /*isSigned=*/!Zext);
+ CV1 = ConstantExpr::getIntegerCast(CV1, NewVT, /*isSigned=*/!Zext);
+
+ return replaceInstUsesWith(CI, ConstantExpr::getMul(CV0, CV1));
+ }
+
+ // Couldn't simplify - canonicalize constant to the RHS.
+ std::swap(Arg0, Arg1);
+ }
+
+ // Handle mul by one:
+ if (Constant *CV1 = dyn_cast<Constant>(Arg1))
+ if (ConstantInt *Splat =
+ dyn_cast_or_null<ConstantInt>(CV1->getSplatValue()))
+ if (Splat->isOne())
+ return CastInst::CreateIntegerCast(Arg0, II->getType(),
+ /*isSigned=*/!Zext);
+
+ break;
+ }
+ case Intrinsic::arm_neon_aesd:
+ case Intrinsic::arm_neon_aese:
+ case Intrinsic::aarch64_crypto_aesd:
+ case Intrinsic::aarch64_crypto_aese: {
+ Value *DataArg = II->getArgOperand(0);
+ Value *KeyArg = II->getArgOperand(1);
+
+ // Try to use the builtin XOR in AESE and AESD to eliminate a prior XOR
+ Value *Data, *Key;
+ if (match(KeyArg, m_ZeroInt()) &&
+ match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
+ replaceOperand(*II, 0, Data);
+ replaceOperand(*II, 1, Key);
+ return II;
+ }
+ break;
+ }
+ case Intrinsic::hexagon_V6_vandvrt:
+ case Intrinsic::hexagon_V6_vandvrt_128B: {
+ // Simplify Q -> V -> Q conversion.
+ if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ Intrinsic::ID ID0 = Op0->getIntrinsicID();
+ if (ID0 != Intrinsic::hexagon_V6_vandqrt &&
+ ID0 != Intrinsic::hexagon_V6_vandqrt_128B)
+ break;
+ Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1);
+ uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue();
+ uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue();
+ // Check if every byte has common bits in Bytes and Mask.
+ uint64_t C = Bytes1 & Mask1;
+ if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000))
+ return replaceInstUsesWith(*II, Op0->getArgOperand(0));
+ }
+ break;
+ }
+ case Intrinsic::stackrestore: {
+ // If the save is right next to the restore, remove the restore. This can
+ // happen when variable allocas are DCE'd.
+ if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+ // Skip over debug info.
+ if (SS->getNextNonDebugInstruction() == II) {
+ return eraseInstFromFunction(CI);
+ }
+ }
+ }
+
+ // Scan down this block to see if there is another stack restore in the
+ // same block without an intervening call/alloca.
+ BasicBlock::iterator BI(II);
+ Instruction *TI = II->getParent()->getTerminator();
+ bool CannotRemove = false;
+ for (++BI; &*BI != TI; ++BI) {
+ if (isa<AllocaInst>(BI)) {
+ CannotRemove = true;
+ break;
+ }
+ if (CallInst *BCI = dyn_cast<CallInst>(BI)) {
+ if (auto *II2 = dyn_cast<IntrinsicInst>(BCI)) {
+ // If there is a stackrestore below this one, remove this one.
+ if (II2->getIntrinsicID() == Intrinsic::stackrestore)
+ return eraseInstFromFunction(CI);
+
+ // Bail if we cross over an intrinsic with side effects, such as
+ // llvm.stacksave, or llvm.read_register.
+ if (II2->mayHaveSideEffects()) {
+ CannotRemove = true;
+ break;
+ }
+ } else {
+ // If we found a non-intrinsic call, we can't remove the stack
+ // restore.
+ CannotRemove = true;
+ break;
+ }
+ }
+ }
+
+ // If the stack restore is in a return, resume, or unwind block and if there
+ // are no allocas or calls between the restore and the return, nuke the
+ // restore.
+ if (!CannotRemove && (isa<ReturnInst>(TI) || isa<ResumeInst>(TI)))
+ return eraseInstFromFunction(CI);
+ break;
+ }
+ case Intrinsic::lifetime_end:
+    // ASan needs to poison memory to detect invalid accesses, which are
+    // possible even for an empty lifetime range.
+ if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+ II->getFunction()->hasFnAttribute(Attribute::SanitizeMemory) ||
+ II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
+ break;
+
+ if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::lifetime_start;
+ }))
+ return nullptr;
+ break;
+ case Intrinsic::assume: {
+ Value *IIOperand = II->getArgOperand(0);
SmallVector<OperandBundleDef, 4> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
bool HasOpBundles = !OpBundles.empty();
- // Remove an assume if it is followed by an identical assume.
- // TODO: Do we need this? Unless there are conflicting assumptions, the
-    // computeKnownBits(IIOperand) below eliminates redundant assumes.
- Instruction *Next = II->getNextNonDebugInstruction();
+ // Remove an assume if it is followed by an identical assume.
+ // TODO: Do we need this? Unless there are conflicting assumptions, the
+    // computeKnownBits(IIOperand) below eliminates redundant assumes.
+ Instruction *Next = II->getNextNonDebugInstruction();
if (HasOpBundles &&
match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))) &&
!cast<IntrinsicInst>(Next)->hasOperandBundles())
- return eraseInstFromFunction(CI);
-
- // Canonicalize assume(a && b) -> assume(a); assume(b);
- // Note: New assumption intrinsics created here are registered by
- // the InstCombineIRInserter object.
- FunctionType *AssumeIntrinsicTy = II->getFunctionType();
- Value *AssumeIntrinsic = II->getCalledOperand();
- Value *A, *B;
+ return eraseInstFromFunction(CI);
+
+ // Canonicalize assume(a && b) -> assume(a); assume(b);
+ // Note: New assumption intrinsics created here are registered by
+ // the InstCombineIRInserter object.
+ FunctionType *AssumeIntrinsicTy = II->getFunctionType();
+ Value *AssumeIntrinsic = II->getCalledOperand();
+ Value *A, *B;
if (match(IIOperand, m_LogicalAnd(m_Value(A), m_Value(B)))) {
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, OpBundles,
II->getName());
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
- return eraseInstFromFunction(*II);
- }
- // assume(!(a || b)) -> assume(!a); assume(!b);
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
+ return eraseInstFromFunction(*II);
+ }
+ // assume(!(a || b)) -> assume(!a); assume(!b);
if (match(IIOperand, m_Not(m_LogicalOr(m_Value(A), m_Value(B))))) {
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
Builder.CreateNot(A), OpBundles, II->getName());
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
- Builder.CreateNot(B), II->getName());
- return eraseInstFromFunction(*II);
- }
-
- // assume( (load addr) != null ) -> add 'nonnull' metadata to load
- // (if assume is valid at the load)
- CmpInst::Predicate Pred;
- Instruction *LHS;
- if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
- Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
- LHS->getType()->isPointerTy() &&
- isValidAssumeForContext(II, LHS, &DT)) {
- MDNode *MD = MDNode::get(II->getContext(), None);
- LHS->setMetadata(LLVMContext::MD_nonnull, MD);
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
+ Builder.CreateNot(B), II->getName());
+ return eraseInstFromFunction(*II);
+ }
+
+ // assume( (load addr) != null ) -> add 'nonnull' metadata to load
+ // (if assume is valid at the load)
+ CmpInst::Predicate Pred;
+ Instruction *LHS;
+ if (match(IIOperand, m_ICmp(Pred, m_Instruction(LHS), m_Zero())) &&
+ Pred == ICmpInst::ICMP_NE && LHS->getOpcode() == Instruction::Load &&
+ LHS->getType()->isPointerTy() &&
+ isValidAssumeForContext(II, LHS, &DT)) {
+ MDNode *MD = MDNode::get(II->getContext(), None);
+ LHS->setMetadata(LLVMContext::MD_nonnull, MD);
if (!HasOpBundles)
return eraseInstFromFunction(*II);
-
- // TODO: apply nonnull return attributes to calls and invokes
- // TODO: apply range metadata for range check patterns?
- }
-
- // If there is a dominating assume with the same condition as this one,
- // then this one is redundant, and should be removed.
- KnownBits Known(1);
- computeKnownBits(IIOperand, Known, 0, II);
- if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II))
- return eraseInstFromFunction(*II);
-
- // Update the cache of affected values for this assumption (we might be
- // here because we just simplified the condition).
- AC.updateAffectedValues(II);
- break;
- }
+
+ // TODO: apply nonnull return attributes to calls and invokes
+ // TODO: apply range metadata for range check patterns?
+ }
+
+ // If there is a dominating assume with the same condition as this one,
+ // then this one is redundant, and should be removed.
+ KnownBits Known(1);
+ computeKnownBits(IIOperand, Known, 0, II);
+ if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II))
+ return eraseInstFromFunction(*II);
+
+ // Update the cache of affected values for this assumption (we might be
+ // here because we just simplified the condition).
+ AC.updateAffectedValues(II);
+ break;
+ }
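
The assume(a && b) canonicalization simply splits one assumption into two so that later queries can use each condition independently. At the source level the closest analogue is Clang's __builtin_assume, so the rewrite corresponds roughly to the following sketch (function names are illustrative, and this requires Clang):

    // Before canonicalization: one combined assumption.
    void before(int a, int b) {
      __builtin_assume(a > 0 && b > 0);
    }

    // After canonicalization: two independent assumptions, which are easier
    // for analyses such as computeKnownBits to consume.
    void after(int a, int b) {
      __builtin_assume(a > 0);
      __builtin_assume(b > 0);
    }

    int main() {
      before(1, 2);
      after(1, 2);
      return 0;
    }
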
case Intrinsic::experimental_gc_statepoint: {
GCStatepointInst &GCSP = *cast<GCStatepointInst>(II);
SmallPtrSet<Value *, 32> LiveGcValues;
for (const GCRelocateInst *Reloc : GCSP.getGCRelocates()) {
GCRelocateInst &GCR = *const_cast<GCRelocateInst *>(Reloc);
-
+
// Remove the relocation if unused.
if (GCR.use_empty()) {
eraseInstFromFunction(GCR);
continue;
}
-
+
Value *DerivedPtr = GCR.getDerivedPtr();
Value *BasePtr = GCR.getBasePtr();
-
+
// Undef is undef, even after relocation.
if (isa<UndefValue>(DerivedPtr) || isa<UndefValue>(BasePtr)) {
replaceInstUsesWith(GCR, UndefValue::get(GCR.getType()));
eraseInstFromFunction(GCR);
continue;
}
-
+
if (auto *PT = dyn_cast<PointerType>(GCR.getType())) {
// The relocation of null will be null for most any collector.
// TODO: provide a hook for this in GCStrategy. There might be some
@@ -1585,7 +1585,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
eraseInstFromFunction(GCR);
continue;
}
-
+
// isKnownNonNull -> nonnull attribute
if (!GCR.hasRetAttr(Attribute::NonNull) &&
isKnownNonZero(DerivedPtr, DL, 0, &AC, II, &DT)) {
@@ -1594,18 +1594,18 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
Worklist.pushUsersToWorkList(GCR);
}
}
-
+
// If we have two copies of the same pointer in the statepoint argument
// list, canonicalize to one. This may let us common gc.relocates.
if (GCR.getBasePtr() == GCR.getDerivedPtr() &&
GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) {
auto *OpIntTy = GCR.getOperand(2)->getType();
GCR.setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
- }
-
+ }
+
// TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
// Canonicalize on the type from the uses to the defs
-
+
// TODO: relocate((gep p, C, C2, ...)) -> gep(relocate(p), C, C2, ...)
LiveGcValues.insert(BasePtr);
LiveGcValues.insert(DerivedPtr);
@@ -1649,40 +1649,40 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
else
return InvokeInst::CreateWithReplacedBundle(cast<InvokeInst>(II),
NewBundle);
- break;
- }
- case Intrinsic::experimental_guard: {
- // Is this guard followed by another guard? We scan forward over a small
- // fixed window of instructions to handle common cases with conditions
- // computed between guards.
- Instruction *NextInst = II->getNextNonDebugInstruction();
- for (unsigned i = 0; i < GuardWideningWindow; i++) {
- // Note: Using context-free form to avoid compile time blow up
- if (!isSafeToSpeculativelyExecute(NextInst))
- break;
- NextInst = NextInst->getNextNonDebugInstruction();
- }
- Value *NextCond = nullptr;
- if (match(NextInst,
- m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
- Value *CurrCond = II->getArgOperand(0);
-
-      // Remove a guard that is immediately preceded by an identical guard.
- // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
- if (CurrCond != NextCond) {
- Instruction *MoveI = II->getNextNonDebugInstruction();
- while (MoveI != NextInst) {
- auto *Temp = MoveI;
- MoveI = MoveI->getNextNonDebugInstruction();
- Temp->moveBefore(II);
- }
- replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
- }
- eraseInstFromFunction(*NextInst);
- return II;
- }
- break;
- }
+ break;
+ }
+ case Intrinsic::experimental_guard: {
+ // Is this guard followed by another guard? We scan forward over a small
+ // fixed window of instructions to handle common cases with conditions
+ // computed between guards.
+ Instruction *NextInst = II->getNextNonDebugInstruction();
+ for (unsigned i = 0; i < GuardWideningWindow; i++) {
+ // Note: Using context-free form to avoid compile time blow up
+ if (!isSafeToSpeculativelyExecute(NextInst))
+ break;
+ NextInst = NextInst->getNextNonDebugInstruction();
+ }
+ Value *NextCond = nullptr;
+ if (match(NextInst,
+ m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+ Value *CurrCond = II->getArgOperand(0);
+
+      // Remove a guard that is immediately preceded by an identical guard.
+ // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+ if (CurrCond != NextCond) {
+ Instruction *MoveI = II->getNextNonDebugInstruction();
+ while (MoveI != NextInst) {
+ auto *Temp = MoveI;
+ MoveI = MoveI->getNextNonDebugInstruction();
+ Temp->moveBefore(II);
+ }
+ replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
+ }
+ eraseInstFromFunction(*NextInst);
+ return II;
+ }
+ break;
+ }
case Intrinsic::experimental_vector_insert: {
Value *Vec = II->getArgOperand(0);
Value *SubVec = II->getArgOperand(1);
@@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(CI);
}
break;
- }
+ }
case Intrinsic::experimental_vector_extract: {
Value *Vec = II->getArgOperand(0);
Value *Idx = II->getArgOperand(1);
@@ -1786,804 +1786,804 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
break;
}
}
- return visitCallBase(*II);
-}
-
-// Fence instruction simplification
+ return visitCallBase(*II);
+}
+
+// Fence instruction simplification
Instruction *InstCombinerImpl::visitFenceInst(FenceInst &FI) {
- // Remove identical consecutive fences.
- Instruction *Next = FI.getNextNonDebugInstruction();
- if (auto *NFI = dyn_cast<FenceInst>(Next))
- if (FI.isIdenticalTo(NFI))
- return eraseInstFromFunction(FI);
- return nullptr;
-}
-
-// InvokeInst simplification
+ // Remove identical consecutive fences.
+ Instruction *Next = FI.getNextNonDebugInstruction();
+ if (auto *NFI = dyn_cast<FenceInst>(Next))
+ if (FI.isIdenticalTo(NFI))
+ return eraseInstFromFunction(FI);
+ return nullptr;
+}
+
+// InvokeInst simplification
Instruction *InstCombinerImpl::visitInvokeInst(InvokeInst &II) {
- return visitCallBase(II);
-}
-
-// CallBrInst simplification
+ return visitCallBase(II);
+}
+
+// CallBrInst simplification
Instruction *InstCombinerImpl::visitCallBrInst(CallBrInst &CBI) {
- return visitCallBase(CBI);
-}
-
-/// If this cast does not affect the value passed through the varargs area, we
-/// can eliminate the use of the cast.
-static bool isSafeToEliminateVarargsCast(const CallBase &Call,
- const DataLayout &DL,
- const CastInst *const CI,
- const int ix) {
- if (!CI->isLosslessCast())
- return false;
-
- // If this is a GC intrinsic, avoid munging types. We need types for
- // statepoint reconstruction in SelectionDAG.
- // TODO: This is probably something which should be expanded to all
- // intrinsics since the entire point of intrinsics is that
- // they are understandable by the optimizer.
- if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
- isa<GCResultInst>(Call))
- return false;
-
- // The size of ByVal or InAlloca arguments is derived from the type, so we
- // can't change to a type with a different size. If the size were
- // passed explicitly we could avoid this check.
- if (!Call.isPassPointeeByValueArgument(ix))
- return true;
-
- Type* SrcTy =
- cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
- Type *DstTy = Call.isByValArgument(ix)
- ? Call.getParamByValType(ix)
- : cast<PointerType>(CI->getType())->getElementType();
- if (!SrcTy->isSized() || !DstTy->isSized())
- return false;
- if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
- return false;
- return true;
-}
-
+ return visitCallBase(CBI);
+}
+
+/// If this cast does not affect the value passed through the varargs area, we
+/// can eliminate the use of the cast.
+static bool isSafeToEliminateVarargsCast(const CallBase &Call,
+ const DataLayout &DL,
+ const CastInst *const CI,
+ const int ix) {
+ if (!CI->isLosslessCast())
+ return false;
+
+ // If this is a GC intrinsic, avoid munging types. We need types for
+ // statepoint reconstruction in SelectionDAG.
+ // TODO: This is probably something which should be expanded to all
+ // intrinsics since the entire point of intrinsics is that
+ // they are understandable by the optimizer.
+ if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call))
+ return false;
+
+ // The size of ByVal or InAlloca arguments is derived from the type, so we
+ // can't change to a type with a different size. If the size were
+ // passed explicitly we could avoid this check.
+ if (!Call.isPassPointeeByValueArgument(ix))
+ return true;
+
+ Type* SrcTy =
+ cast<PointerType>(CI->getOperand(0)->getType())->getElementType();
+ Type *DstTy = Call.isByValArgument(ix)
+ ? Call.getParamByValType(ix)
+ : cast<PointerType>(CI->getType())->getElementType();
+ if (!SrcTy->isSized() || !DstTy->isSized())
+ return false;
+ if (DL.getTypeAllocSize(SrcTy) != DL.getTypeAllocSize(DstTy))
+ return false;
+ return true;
+}
+
Instruction *InstCombinerImpl::tryOptimizeCall(CallInst *CI) {
- if (!CI->getCalledFunction()) return nullptr;
-
- auto InstCombineRAUW = [this](Instruction *From, Value *With) {
- replaceInstUsesWith(*From, With);
- };
- auto InstCombineErase = [this](Instruction *I) {
- eraseInstFromFunction(*I);
- };
- LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
- InstCombineErase);
- if (Value *With = Simplifier.optimizeCall(CI, Builder)) {
- ++NumSimplified;
- return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
- }
-
- return nullptr;
-}
-
-static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
- // Strip off at most one level of pointer casts, looking for an alloca. This
- // is good enough in practice and simpler than handling any number of casts.
- Value *Underlying = TrampMem->stripPointerCasts();
- if (Underlying != TrampMem &&
- (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
- return nullptr;
- if (!isa<AllocaInst>(Underlying))
- return nullptr;
-
- IntrinsicInst *InitTrampoline = nullptr;
- for (User *U : TrampMem->users()) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
- if (!II)
- return nullptr;
- if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
- if (InitTrampoline)
- // More than one init_trampoline writes to this value. Give up.
- return nullptr;
- InitTrampoline = II;
- continue;
- }
- if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
- // Allow any number of calls to adjust.trampoline.
- continue;
- return nullptr;
- }
-
- // No call to init.trampoline found.
- if (!InitTrampoline)
- return nullptr;
-
- // Check that the alloca is being used in the expected way.
- if (InitTrampoline->getOperand(0) != TrampMem)
- return nullptr;
-
- return InitTrampoline;
-}
-
-static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
- Value *TrampMem) {
-  // Visit all the previous instructions in the basic block, and try to find an
- // init.trampoline which has a direct path to the adjust.trampoline.
- for (BasicBlock::iterator I = AdjustTramp->getIterator(),
- E = AdjustTramp->getParent()->begin();
- I != E;) {
- Instruction *Inst = &*--I;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
- II->getOperand(0) == TrampMem)
- return II;
- if (Inst->mayWriteToMemory())
- return nullptr;
- }
- return nullptr;
-}
-
-// Given a call to llvm.adjust.trampoline, find and return the corresponding
-// call to llvm.init.trampoline if the call to the trampoline can be optimized
-// to a direct call to a function. Otherwise return NULL.
-static IntrinsicInst *findInitTrampoline(Value *Callee) {
- Callee = Callee->stripPointerCasts();
- IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
- if (!AdjustTramp ||
- AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
- return nullptr;
-
- Value *TrampMem = AdjustTramp->getOperand(0);
-
- if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
- return IT;
- if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
- return IT;
- return nullptr;
-}
-
-static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
- unsigned NumArgs = Call.getNumArgOperands();
- ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
- ConstantInt *Op1C =
- (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
- // Bail out if the allocation size is zero (or an invalid alignment of zero
- // with aligned_alloc).
- if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
- return;
-
- if (isMallocLikeFn(&Call, TLI) && Op0C) {
- if (isOpNewLikeFn(&Call, TLI))
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableBytes(
- Call.getContext(), Op0C->getZExtValue()));
- else
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op0C->getZExtValue()));
- } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- // Add alignment attribute if alignment is a power of two constant.
- if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) {
- uint64_t AlignmentVal = Op0C->getZExtValue();
- if (llvm::isPowerOf2_64(AlignmentVal))
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithAlignment(Call.getContext(),
- Align(AlignmentVal)));
- }
- } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Op1C->getZExtValue()));
- } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
- bool Overflow;
- const APInt &N = Op0C->getValue();
- APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
- if (!Overflow)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Size.getZExtValue()));
- } else if (isStrdupLikeFn(&Call, TLI)) {
- uint64_t Len = GetStringLength(Call.getOperand(0));
- if (Len) {
- // strdup
- if (NumArgs == 1)
- Call.addAttribute(AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), Len));
- // strndup
- else if (NumArgs == 2 && Op1C)
- Call.addAttribute(
- AttributeList::ReturnIndex,
- Attribute::getWithDereferenceableOrNullBytes(
- Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
- }
- }
-}
-
-/// Improvements for call, callbr and invoke instructions.
+ if (!CI->getCalledFunction()) return nullptr;
+
+ auto InstCombineRAUW = [this](Instruction *From, Value *With) {
+ replaceInstUsesWith(*From, With);
+ };
+ auto InstCombineErase = [this](Instruction *I) {
+ eraseInstFromFunction(*I);
+ };
+ LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
+ InstCombineErase);
+ if (Value *With = Simplifier.optimizeCall(CI, Builder)) {
+ ++NumSimplified;
+ return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
+ }
+
+ return nullptr;
+}
+
+static IntrinsicInst *findInitTrampolineFromAlloca(Value *TrampMem) {
+ // Strip off at most one level of pointer casts, looking for an alloca. This
+ // is good enough in practice and simpler than handling any number of casts.
+ Value *Underlying = TrampMem->stripPointerCasts();
+ if (Underlying != TrampMem &&
+ (!Underlying->hasOneUse() || Underlying->user_back() != TrampMem))
+ return nullptr;
+ if (!isa<AllocaInst>(Underlying))
+ return nullptr;
+
+ IntrinsicInst *InitTrampoline = nullptr;
+ for (User *U : TrampMem->users()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ if (!II)
+ return nullptr;
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline) {
+ if (InitTrampoline)
+ // More than one init_trampoline writes to this value. Give up.
+ return nullptr;
+ InitTrampoline = II;
+ continue;
+ }
+ if (II->getIntrinsicID() == Intrinsic::adjust_trampoline)
+ // Allow any number of calls to adjust.trampoline.
+ continue;
+ return nullptr;
+ }
+
+ // No call to init.trampoline found.
+ if (!InitTrampoline)
+ return nullptr;
+
+ // Check that the alloca is being used in the expected way.
+ if (InitTrampoline->getOperand(0) != TrampMem)
+ return nullptr;
+
+ return InitTrampoline;
+}
+
+static IntrinsicInst *findInitTrampolineFromBB(IntrinsicInst *AdjustTramp,
+ Value *TrampMem) {
+  // Visit all the previous instructions in the basic block, and try to find an
+ // init.trampoline which has a direct path to the adjust.trampoline.
+ for (BasicBlock::iterator I = AdjustTramp->getIterator(),
+ E = AdjustTramp->getParent()->begin();
+ I != E;) {
+ Instruction *Inst = &*--I;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::init_trampoline &&
+ II->getOperand(0) == TrampMem)
+ return II;
+ if (Inst->mayWriteToMemory())
+ return nullptr;
+ }
+ return nullptr;
+}
+
+// Given a call to llvm.adjust.trampoline, find and return the corresponding
+// call to llvm.init.trampoline if the call to the trampoline can be optimized
+// to a direct call to a function. Otherwise return NULL.
+static IntrinsicInst *findInitTrampoline(Value *Callee) {
+ Callee = Callee->stripPointerCasts();
+ IntrinsicInst *AdjustTramp = dyn_cast<IntrinsicInst>(Callee);
+ if (!AdjustTramp ||
+ AdjustTramp->getIntrinsicID() != Intrinsic::adjust_trampoline)
+ return nullptr;
+
+ Value *TrampMem = AdjustTramp->getOperand(0);
+
+ if (IntrinsicInst *IT = findInitTrampolineFromAlloca(TrampMem))
+ return IT;
+ if (IntrinsicInst *IT = findInitTrampolineFromBB(AdjustTramp, TrampMem))
+ return IT;
+ return nullptr;
+}
+
+static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
+ unsigned NumArgs = Call.getNumArgOperands();
+ ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
+ ConstantInt *Op1C =
+ (NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
+ // Bail out if the allocation size is zero (or an invalid alignment of zero
+ // with aligned_alloc).
+ if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
+ return;
+
+ if (isMallocLikeFn(&Call, TLI) && Op0C) {
+ if (isOpNewLikeFn(&Call, TLI))
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableBytes(
+ Call.getContext(), Op0C->getZExtValue()));
+ else
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op0C->getZExtValue()));
+ } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
+ // Add alignment attribute if alignment is a power of two constant.
+ if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) {
+ uint64_t AlignmentVal = Op0C->getZExtValue();
+ if (llvm::isPowerOf2_64(AlignmentVal))
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
+ }
+ } else if (isReallocLikeFn(&Call, TLI) && Op1C) {
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
+ } else if (isCallocLikeFn(&Call, TLI) && Op0C && Op1C) {
+ bool Overflow;
+ const APInt &N = Op0C->getValue();
+ APInt Size = N.umul_ov(Op1C->getValue(), Overflow);
+ if (!Overflow)
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Size.getZExtValue()));
+ } else if (isStrdupLikeFn(&Call, TLI)) {
+ uint64_t Len = GetStringLength(Call.getOperand(0));
+ if (Len) {
+ // strdup
+ if (NumArgs == 1)
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Len));
+ // strndup
+ else if (NumArgs == 2 && Op1C)
+ Call.addAttribute(
+ AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), std::min(Len, Op1C->getZExtValue() + 1)));
+ }
+ }
+}
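
annotateAnyAllocSite attaches dereferenceable_or_null(N * Size) to calloc-like calls only when the element count and size multiply without overflow, mirroring the umul_ov check above. A standalone sketch of that size computation using the GCC/Clang __builtin_mul_overflow builtin (totalAllocSize is an illustrative helper, not an LLVM function):

    #include <cstdint>
    #include <cstdio>

    // A calloc-like result is dereferenceable_or_null for N * Size bytes only
    // if that product does not overflow.
    static bool totalAllocSize(uint64_t N, uint64_t Size, uint64_t &Bytes) {
      return !__builtin_mul_overflow(N, Size, &Bytes);
    }

    int main() {
      uint64_t Bytes = 0;
      if (totalAllocSize(16, 32, Bytes))
        std::printf("annotate dereferenceable_or_null(%llu)\n",
                    (unsigned long long)Bytes);
      if (!totalAllocSize(UINT64_MAX, 8, Bytes))
        std::printf("overflow: no annotation\n");
      return 0;
    }
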
+
+/// Improvements for call, callbr and invoke instructions.
Instruction *InstCombinerImpl::visitCallBase(CallBase &Call) {
- if (isAllocationFn(&Call, &TLI))
- annotateAnyAllocSite(Call, &TLI);
-
- bool Changed = false;
-
- // Mark any parameters that are known to be non-null with the nonnull
- // attribute. This is helpful for inlining calls to functions with null
- // checks on their arguments.
- SmallVector<unsigned, 4> ArgNos;
- unsigned ArgNo = 0;
-
- for (Value *V : Call.args()) {
- if (V->getType()->isPointerTy() &&
- !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
- isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
- ArgNos.push_back(ArgNo);
- ArgNo++;
- }
-
- assert(ArgNo == Call.arg_size() && "sanity check");
-
- if (!ArgNos.empty()) {
- AttributeList AS = Call.getAttributes();
- LLVMContext &Ctx = Call.getContext();
- AS = AS.addParamAttribute(Ctx, ArgNos,
- Attribute::get(Ctx, Attribute::NonNull));
- Call.setAttributes(AS);
- Changed = true;
- }
-
- // If the callee is a pointer to a function, attempt to move any casts to the
- // arguments of the call/callbr/invoke.
- Value *Callee = Call.getCalledOperand();
- if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
- return nullptr;
-
- if (Function *CalleeF = dyn_cast<Function>(Callee)) {
- // Remove the convergent attr on calls when the callee is not convergent.
- if (Call.isConvergent() && !CalleeF->isConvergent() &&
- !CalleeF->isIntrinsic()) {
- LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
- << "\n");
- Call.setNotConvergent();
- return &Call;
- }
-
- // If the call and callee calling conventions don't match, this call must
- // be unreachable, as the call is undefined.
- if (CalleeF->getCallingConv() != Call.getCallingConv() &&
- // Only do this for calls to a function with a body. A prototype may
- // not actually end up matching the implementation's calling conv for a
- // variety of reasons (e.g. it may be written in assembly).
- !CalleeF->isDeclaration()) {
- Instruction *OldCall = &Call;
- CreateNonTerminatorUnreachable(OldCall);
+ if (isAllocationFn(&Call, &TLI))
+ annotateAnyAllocSite(Call, &TLI);
+
+ bool Changed = false;
+
+ // Mark any parameters that are known to be non-null with the nonnull
+ // attribute. This is helpful for inlining calls to functions with null
+ // checks on their arguments.
+ SmallVector<unsigned, 4> ArgNos;
+ unsigned ArgNo = 0;
+
+ for (Value *V : Call.args()) {
+ if (V->getType()->isPointerTy() &&
+ !Call.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ isKnownNonZero(V, DL, 0, &AC, &Call, &DT))
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == Call.arg_size() && "sanity check");
+
+ if (!ArgNos.empty()) {
+ AttributeList AS = Call.getAttributes();
+ LLVMContext &Ctx = Call.getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ Call.setAttributes(AS);
+ Changed = true;
+ }
+
+ // If the callee is a pointer to a function, attempt to move any casts to the
+ // arguments of the call/callbr/invoke.
+ Value *Callee = Call.getCalledOperand();
+ if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
+ return nullptr;
+
+ if (Function *CalleeF = dyn_cast<Function>(Callee)) {
+ // Remove the convergent attr on calls when the callee is not convergent.
+ if (Call.isConvergent() && !CalleeF->isConvergent() &&
+ !CalleeF->isIntrinsic()) {
+ LLVM_DEBUG(dbgs() << "Removing convergent attr from instr " << Call
+ << "\n");
+ Call.setNotConvergent();
+ return &Call;
+ }
+
+ // If the call and callee calling conventions don't match, this call must
+ // be unreachable, as the call is undefined.
+ if (CalleeF->getCallingConv() != Call.getCallingConv() &&
+ // Only do this for calls to a function with a body. A prototype may
+ // not actually end up matching the implementation's calling conv for a
+ // variety of reasons (e.g. it may be written in assembly).
+ !CalleeF->isDeclaration()) {
+ Instruction *OldCall = &Call;
+ CreateNonTerminatorUnreachable(OldCall);
// If OldCall does not return void then replaceInstUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!OldCall->getType()->isVoidTy())
- replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
- if (isa<CallInst>(OldCall))
- return eraseInstFromFunction(*OldCall);
-
-      // We cannot remove an invoke or a callbr because it would change the
-      // CFG; just change the callee to a null pointer.
- cast<CallBase>(OldCall)->setCalledFunction(
- CalleeF->getFunctionType(),
- Constant::getNullValue(CalleeF->getType()));
- return nullptr;
- }
- }
-
- if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(Call.getFunction())) ||
- isa<UndefValue>(Callee)) {
+ // This allows ValueHandlers and custom metadata to adjust itself.
+ if (!OldCall->getType()->isVoidTy())
+ replaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType()));
+ if (isa<CallInst>(OldCall))
+ return eraseInstFromFunction(*OldCall);
+
+      // We cannot remove an invoke or a callbr because it would change the
+      // CFG; just change the callee to a null pointer.
+ cast<CallBase>(OldCall)->setCalledFunction(
+ CalleeF->getFunctionType(),
+ Constant::getNullValue(CalleeF->getType()));
+ return nullptr;
+ }
+ }
+
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(Call.getFunction())) ||
+ isa<UndefValue>(Callee)) {
// If Call does not return void then replaceInstUsesWith undef.
- // This allows ValueHandlers and custom metadata to adjust itself.
- if (!Call.getType()->isVoidTy())
- replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
-
- if (Call.isTerminator()) {
- // Can't remove an invoke or callbr because we cannot change the CFG.
- return nullptr;
- }
-
- // This instruction is not reachable, just remove it.
- CreateNonTerminatorUnreachable(&Call);
- return eraseInstFromFunction(Call);
- }
-
- if (IntrinsicInst *II = findInitTrampoline(Callee))
- return transformCallThroughTrampoline(Call, *II);
-
- PointerType *PTy = cast<PointerType>(Callee->getType());
- FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
- if (FTy->isVarArg()) {
- int ix = FTy->getNumParams();
- // See if we can optimize any arguments passed through the varargs area of
- // the call.
- for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
- I != E; ++I, ++ix) {
- CastInst *CI = dyn_cast<CastInst>(*I);
- if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
- replaceUse(*I, CI->getOperand(0));
-
- // Update the byval type to match the argument type.
- if (Call.isByValArgument(ix)) {
- Call.removeParamAttr(ix, Attribute::ByVal);
- Call.addParamAttr(
- ix, Attribute::getWithByValType(
- Call.getContext(),
- CI->getOperand(0)->getType()->getPointerElementType()));
- }
- Changed = true;
- }
- }
- }
-
- if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
- // Inline asm calls cannot throw - mark them 'nounwind'.
- Call.setDoesNotThrow();
- Changed = true;
- }
-
-  // Try to optimize the call if possible; we require DataLayout for most of
- // this. None of these calls are seen as possibly dead so go ahead and
- // delete the instruction now.
- if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
- Instruction *I = tryOptimizeCall(CI);
- // If we changed something return the result, etc. Otherwise let
- // the fallthrough check.
- if (I) return eraseInstFromFunction(*I);
- }
-
- if (!Call.use_empty() && !Call.isMustTailCall())
- if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
- Type *CallTy = Call.getType();
- Type *RetArgTy = ReturnedArg->getType();
- if (RetArgTy->canLosslesslyBitCastTo(CallTy))
- return replaceInstUsesWith(
- Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
- }
-
- if (isAllocLikeFn(&Call, &TLI))
- return visitAllocSite(Call);
-
- return Changed ? &Call : nullptr;
-}
-
-/// If the callee is a constexpr cast of a function, attempt to move the cast to
-/// the arguments of the call/callbr/invoke.
+ // This allows ValueHandlers and custom metadata to adjust itself.
+ if (!Call.getType()->isVoidTy())
+ replaceInstUsesWith(Call, UndefValue::get(Call.getType()));
+
+ if (Call.isTerminator()) {
+ // Can't remove an invoke or callbr because we cannot change the CFG.
+ return nullptr;
+ }
+
+ // This instruction is not reachable, just remove it.
+ CreateNonTerminatorUnreachable(&Call);
+ return eraseInstFromFunction(Call);
+ }
+
+ if (IntrinsicInst *II = findInitTrampoline(Callee))
+ return transformCallThroughTrampoline(Call, *II);
+
+ PointerType *PTy = cast<PointerType>(Callee->getType());
+ FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+ if (FTy->isVarArg()) {
+ int ix = FTy->getNumParams();
+ // See if we can optimize any arguments passed through the varargs area of
+ // the call.
+ for (auto I = Call.arg_begin() + FTy->getNumParams(), E = Call.arg_end();
+ I != E; ++I, ++ix) {
+ CastInst *CI = dyn_cast<CastInst>(*I);
+ if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
+ replaceUse(*I, CI->getOperand(0));
+
+ // Update the byval type to match the argument type.
+ if (Call.isByValArgument(ix)) {
+ Call.removeParamAttr(ix, Attribute::ByVal);
+ Call.addParamAttr(
+ ix, Attribute::getWithByValType(
+ Call.getContext(),
+ CI->getOperand(0)->getType()->getPointerElementType()));
+ }
+ Changed = true;
+ }
+ }
+ }
+
+ if (isa<InlineAsm>(Callee) && !Call.doesNotThrow()) {
+ // Inline asm calls cannot throw - mark them 'nounwind'.
+ Call.setDoesNotThrow();
+ Changed = true;
+ }
+
+  // Try to optimize the call if possible; we require DataLayout for most of
+ // this. None of these calls are seen as possibly dead so go ahead and
+ // delete the instruction now.
+ if (CallInst *CI = dyn_cast<CallInst>(&Call)) {
+ Instruction *I = tryOptimizeCall(CI);
+ // If we changed something return the result, etc. Otherwise let
+ // the fallthrough check.
+ if (I) return eraseInstFromFunction(*I);
+ }
+
+ if (!Call.use_empty() && !Call.isMustTailCall())
+ if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
+ Type *CallTy = Call.getType();
+ Type *RetArgTy = ReturnedArg->getType();
+ if (RetArgTy->canLosslesslyBitCastTo(CallTy))
+ return replaceInstUsesWith(
+ Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
+ }
+
+ if (isAllocLikeFn(&Call, &TLI))
+ return visitAllocSite(Call);
+
+ return Changed ? &Call : nullptr;
+}
+
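A note on the returned-argument fold a few lines up (getReturnedArgOperand / canLosslesslyBitCastTo): when an argument carries the 'returned' attribute, every use of the call's result can be rewired to a (possibly bit-cast) copy of that argument while the call itself is kept for its side effects. A minimal plain-C++ analogy of that rewrite, with a made-up callee identityWithLogging standing in for such a function (no LLVM API is used here):

#include <cassert>
#include <cstdio>

// Stand-in for a callee whose first argument is documented to be returned
// unchanged (the analogue of an LLVM parameter with the 'returned' attribute).
static int identityWithLogging(int v) {
  std::printf("side effect, v = %d\n", v);
  return v;
}

int main() {
  int x = 42;
  int before = identityWithLogging(x);   // user reads the call's result
  identityWithLogging(x);                // after the fold: call kept for effects,
  int after = x;                         // the use reads the argument directly
  assert(before == after);
  return 0;
}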
+/// If the callee is a constexpr cast of a function, attempt to move the cast to
+/// the arguments of the call/callbr/invoke.
bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) {
- auto *Callee =
- dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
- if (!Callee)
- return false;
-
- // If this is a call to a thunk function, don't remove the cast. Thunks are
- // used to transparently forward all incoming parameters and outgoing return
- // values, so it's important to leave the cast in place.
- if (Callee->hasFnAttribute("thunk"))
- return false;
-
- // If this is a musttail call, the callee's prototype must match the caller's
- // prototype with the exception of pointee types. The code below doesn't
- // implement that, so we can't do this transform.
- // TODO: Do the transform if it only requires adding pointer casts.
- if (Call.isMustTailCall())
- return false;
-
- Instruction *Caller = &Call;
- const AttributeList &CallerPAL = Call.getAttributes();
-
- // Okay, this is a cast from a function to a different type. Unless doing so
- // would cause a type conversion of one of our arguments, change this call to
- // be a direct call with arguments casted to the appropriate types.
- FunctionType *FT = Callee->getFunctionType();
- Type *OldRetTy = Caller->getType();
- Type *NewRetTy = FT->getReturnType();
-
- // Check to see if we are changing the return type...
- if (OldRetTy != NewRetTy) {
-
- if (NewRetTy->isStructTy())
- return false; // TODO: Handle multiple return values.
-
- if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
- if (Callee->isDeclaration())
- return false; // Cannot transform this return value.
-
- if (!Caller->use_empty() &&
- // void -> non-void is handled specially
- !NewRetTy->isVoidTy())
- return false; // Cannot transform this return value.
- }
-
- if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
- if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
- return false; // Attribute not compatible with transformed value.
- }
-
- // If the callbase is an invoke/callbr instruction, and the return value is
- // used by a PHI node in a successor, we cannot change the return type of
- // the call because there is no place to put the cast instruction (without
- // breaking the critical edge). Bail out in this case.
- if (!Caller->use_empty()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
- for (User *U : II->users())
- if (PHINode *PN = dyn_cast<PHINode>(U))
- if (PN->getParent() == II->getNormalDest() ||
- PN->getParent() == II->getUnwindDest())
- return false;
- // FIXME: Be conservative for callbr to avoid a quadratic search.
- if (isa<CallBrInst>(Caller))
- return false;
- }
- }
-
- unsigned NumActualArgs = Call.arg_size();
- unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
-
- // Prevent us turning:
- // declare void @takes_i32_inalloca(i32* inalloca)
- // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
- //
- // into:
- // call void @takes_i32_inalloca(i32* null)
- //
- // Similarly, avoid folding away bitcasts of byval calls.
- if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
- Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
- Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
- return false;
-
- auto AI = Call.arg_begin();
- for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
- Type *ParamTy = FT->getParamType(i);
- Type *ActTy = (*AI)->getType();
-
- if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
- return false; // Cannot transform this parameter value.
-
- if (AttrBuilder(CallerPAL.getParamAttributes(i))
- .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
- return false; // Attribute not compatible with transformed value.
-
- if (Call.isInAllocaArgument(i))
- return false; // Cannot transform to and from inalloca.
-
+ auto *Callee =
+ dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
+ if (!Callee)
+ return false;
+
+ // If this is a call to a thunk function, don't remove the cast. Thunks are
+ // used to transparently forward all incoming parameters and outgoing return
+ // values, so it's important to leave the cast in place.
+ if (Callee->hasFnAttribute("thunk"))
+ return false;
+
+ // If this is a musttail call, the callee's prototype must match the caller's
+ // prototype with the exception of pointee types. The code below doesn't
+ // implement that, so we can't do this transform.
+ // TODO: Do the transform if it only requires adding pointer casts.
+ if (Call.isMustTailCall())
+ return false;
+
+ Instruction *Caller = &Call;
+ const AttributeList &CallerPAL = Call.getAttributes();
+
+ // Okay, this is a cast from a function to a different type. Unless doing so
+ // would cause a type conversion of one of our arguments, change this call to
+ // be a direct call with arguments casted to the appropriate types.
+ FunctionType *FT = Callee->getFunctionType();
+ Type *OldRetTy = Caller->getType();
+ Type *NewRetTy = FT->getReturnType();
+
+ // Check to see if we are changing the return type...
+ if (OldRetTy != NewRetTy) {
+
+ if (NewRetTy->isStructTy())
+ return false; // TODO: Handle multiple return values.
+
+ if (!CastInst::isBitOrNoopPointerCastable(NewRetTy, OldRetTy, DL)) {
+ if (Callee->isDeclaration())
+ return false; // Cannot transform this return value.
+
+ if (!Caller->use_empty() &&
+ // void -> non-void is handled specially
+ !NewRetTy->isVoidTy())
+ return false; // Cannot transform this return value.
+ }
+
+ if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
+ return false; // Attribute not compatible with transformed value.
+ }
+
+ // If the callbase is an invoke/callbr instruction, and the return value is
+ // used by a PHI node in a successor, we cannot change the return type of
+ // the call because there is no place to put the cast instruction (without
+ // breaking the critical edge). Bail out in this case.
+ if (!Caller->use_empty()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+ for (User *U : II->users())
+ if (PHINode *PN = dyn_cast<PHINode>(U))
+ if (PN->getParent() == II->getNormalDest() ||
+ PN->getParent() == II->getUnwindDest())
+ return false;
+ // FIXME: Be conservative for callbr to avoid a quadratic search.
+ if (isa<CallBrInst>(Caller))
+ return false;
+ }
+ }
+
+ unsigned NumActualArgs = Call.arg_size();
+ unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+ // Prevent us turning:
+ // declare void @takes_i32_inalloca(i32* inalloca)
+ // call void bitcast (void (i32*)* @takes_i32_inalloca to void (i32)*)(i32 0)
+ //
+ // into:
+ // call void @takes_i32_inalloca(i32* null)
+ //
+ // Similarly, avoid folding away bitcasts of byval calls.
+ if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
+ return false;
+
+ auto AI = Call.arg_begin();
+ for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+ Type *ActTy = (*AI)->getType();
+
+ if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
+ return false; // Cannot transform this parameter value.
+
+ if (AttrBuilder(CallerPAL.getParamAttributes(i))
+ .overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
+ return false; // Attribute not compatible with transformed value.
+
+ if (Call.isInAllocaArgument(i))
+ return false; // Cannot transform to and from inalloca.
+
if (CallerPAL.hasParamAttribute(i, Attribute::SwiftError))
return false;
- // If the parameter is passed as a byval argument, then we have to have a
- // sized type and the sized type has to have the same size as the old type.
- if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
- PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
- if (!ParamPTy || !ParamPTy->getElementType()->isSized())
- return false;
-
- Type *CurElTy = Call.getParamByValType(i);
- if (DL.getTypeAllocSize(CurElTy) !=
- DL.getTypeAllocSize(ParamPTy->getElementType()))
- return false;
- }
- }
-
- if (Callee->isDeclaration()) {
- // Do not delete arguments unless we have a function body.
- if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
- return false;
-
- // If the callee is just a declaration, don't change the varargsness of the
- // call. We don't want to introduce a varargs call where one doesn't
- // already exist.
- PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
- if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
- return false;
-
- // If both the callee and the cast type are varargs, we still have to make
- // sure the number of fixed parameters are the same or we have the same
- // ABI issues as if we introduce a varargs call.
- if (FT->isVarArg() &&
- cast<FunctionType>(APTy->getElementType())->isVarArg() &&
- FT->getNumParams() !=
- cast<FunctionType>(APTy->getElementType())->getNumParams())
- return false;
- }
-
- if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
- !CallerPAL.isEmpty()) {
- // In this case we have more arguments than the new function type, but we
- // won't be dropping them. Check that these extra arguments have attributes
- // that are compatible with being a vararg call argument.
- unsigned SRetIdx;
- if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
- SRetIdx > FT->getNumParams())
- return false;
- }
-
- // Okay, we decided that this is a safe thing to do: go ahead and start
- // inserting cast instructions as necessary.
- SmallVector<Value *, 8> Args;
- SmallVector<AttributeSet, 8> ArgAttrs;
- Args.reserve(NumActualArgs);
- ArgAttrs.reserve(NumActualArgs);
-
- // Get any return attributes.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
-
- // If the return value is not being used, the type may not be compatible
- // with the existing attributes. Wipe out any problematic attributes.
- RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
-
- LLVMContext &Ctx = Call.getContext();
- AI = Call.arg_begin();
- for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
- Type *ParamTy = FT->getParamType(i);
-
- Value *NewArg = *AI;
- if ((*AI)->getType() != ParamTy)
- NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
- Args.push_back(NewArg);
-
- // Add any parameter attributes.
- if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
- AttrBuilder AB(CallerPAL.getParamAttributes(i));
- AB.addByValAttr(NewArg->getType()->getPointerElementType());
- ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
- } else
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
- }
-
- // If the function takes more arguments than the call was taking, add them
- // now.
- for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
- Args.push_back(Constant::getNullValue(FT->getParamType(i)));
- ArgAttrs.push_back(AttributeSet());
- }
-
- // If we are removing arguments to the function, emit an obnoxious warning.
- if (FT->getNumParams() < NumActualArgs) {
- // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
- if (FT->isVarArg()) {
- // Add all of the arguments in their promoted form to the arg list.
- for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
- Type *PTy = getPromotedType((*AI)->getType());
- Value *NewArg = *AI;
- if (PTy != (*AI)->getType()) {
- // Must promote to pass through va_arg area!
- Instruction::CastOps opcode =
- CastInst::getCastOpcode(*AI, false, PTy, false);
- NewArg = Builder.CreateCast(opcode, *AI, PTy);
- }
- Args.push_back(NewArg);
-
- // Add any parameter attributes.
- ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
- }
- }
- }
-
- AttributeSet FnAttrs = CallerPAL.getFnAttributes();
-
- if (NewRetTy->isVoidTy())
- Caller->setName(""); // Void type should not have a name.
-
- assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
- "missing argument attributes");
- AttributeList NewCallerPAL = AttributeList::get(
- Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- Call.getOperandBundlesAsDefs(OpBundles);
-
- CallBase *NewCall;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
- II->getUnwindDest(), Args, OpBundles);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
- NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
- CBI->getIndirectDests(), Args, OpBundles);
- } else {
- NewCall = Builder.CreateCall(Callee, Args, OpBundles);
- cast<CallInst>(NewCall)->setTailCallKind(
- cast<CallInst>(Caller)->getTailCallKind());
- }
- NewCall->takeName(Caller);
- NewCall->setCallingConv(Call.getCallingConv());
- NewCall->setAttributes(NewCallerPAL);
-
- // Preserve prof metadata if any.
- NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});
-
- // Insert a cast of the return type as necessary.
- Instruction *NC = NewCall;
- Value *NV = NC;
- if (OldRetTy != NV->getType() && !Caller->use_empty()) {
- if (!NV->getType()->isVoidTy()) {
- NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
- NC->setDebugLoc(Caller->getDebugLoc());
-
- // If this is an invoke/callbr instruction, we should insert it after the
- // first non-phi instruction in the normal successor block.
- if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
- BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
- InsertNewInstBefore(NC, *I);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
- BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
- InsertNewInstBefore(NC, *I);
- } else {
- // Otherwise, it's a call, just insert cast right after the call.
- InsertNewInstBefore(NC, *Caller);
- }
- Worklist.pushUsersToWorkList(*Caller);
- } else {
- NV = UndefValue::get(Caller->getType());
- }
- }
-
- if (!Caller->use_empty())
- replaceInstUsesWith(*Caller, NV);
- else if (Caller->hasValueHandle()) {
- if (OldRetTy == NV->getType())
- ValueHandleBase::ValueIsRAUWd(Caller, NV);
- else
- // We cannot call ValueIsRAUWd with a different type, and the
- // actual tracked value will disappear.
- ValueHandleBase::ValueIsDeleted(Caller);
- }
-
- eraseInstFromFunction(*Caller);
- return true;
-}
-
-/// Turn a call to a function created by init_trampoline / adjust_trampoline
-/// intrinsic pair into a direct call to the underlying function.
-Instruction *
+ // If the parameter is passed as a byval argument, then we have to have a
+ // sized type and the sized type has to have the same size as the old type.
+ if (ParamTy != ActTy && CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
+ if (!ParamPTy || !ParamPTy->getElementType()->isSized())
+ return false;
+
+ Type *CurElTy = Call.getParamByValType(i);
+ if (DL.getTypeAllocSize(CurElTy) !=
+ DL.getTypeAllocSize(ParamPTy->getElementType()))
+ return false;
+ }
+ }
+
+ if (Callee->isDeclaration()) {
+ // Do not delete arguments unless we have a function body.
+ if (FT->getNumParams() < NumActualArgs && !FT->isVarArg())
+ return false;
+
+ // If the callee is just a declaration, don't change the varargsness of the
+ // call. We don't want to introduce a varargs call where one doesn't
+ // already exist.
+ PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
+ if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
+ return false;
+
+ // If both the callee and the cast type are varargs, we still have to make
+ // sure the number of fixed parameters are the same or we have the same
+ // ABI issues as if we introduce a varargs call.
+ if (FT->isVarArg() &&
+ cast<FunctionType>(APTy->getElementType())->isVarArg() &&
+ FT->getNumParams() !=
+ cast<FunctionType>(APTy->getElementType())->getNumParams())
+ return false;
+ }
+
+ if (FT->getNumParams() < NumActualArgs && FT->isVarArg() &&
+ !CallerPAL.isEmpty()) {
+ // In this case we have more arguments than the new function type, but we
+ // won't be dropping them. Check that these extra arguments have attributes
+ // that are compatible with being a vararg call argument.
+ unsigned SRetIdx;
+ if (CallerPAL.hasAttrSomewhere(Attribute::StructRet, &SRetIdx) &&
+ SRetIdx > FT->getNumParams())
+ return false;
+ }
+
+ // Okay, we decided that this is a safe thing to do: go ahead and start
+ // inserting cast instructions as necessary.
+ SmallVector<Value *, 8> Args;
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ Args.reserve(NumActualArgs);
+ ArgAttrs.reserve(NumActualArgs);
+
+ // Get any return attributes.
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+
+ // If the return value is not being used, the type may not be compatible
+ // with the existing attributes. Wipe out any problematic attributes.
+ RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy));
+
+ LLVMContext &Ctx = Call.getContext();
+ AI = Call.arg_begin();
+ for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+ Type *ParamTy = FT->getParamType(i);
+
+ Value *NewArg = *AI;
+ if ((*AI)->getType() != ParamTy)
+ NewArg = Builder.CreateBitOrPointerCast(*AI, ParamTy);
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ if (CallerPAL.hasParamAttribute(i, Attribute::ByVal)) {
+ AttrBuilder AB(CallerPAL.getParamAttributes(i));
+ AB.addByValAttr(NewArg->getType()->getPointerElementType());
+ ArgAttrs.push_back(AttributeSet::get(Ctx, AB));
+ } else
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+
+ // If the function takes more arguments than the call was taking, add them
+ // now.
+ for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i) {
+ Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+ ArgAttrs.push_back(AttributeSet());
+ }
+
+ // If we are removing arguments to the function, emit an obnoxious warning.
+ if (FT->getNumParams() < NumActualArgs) {
+ // TODO: if (!FT->isVarArg()) this call may be unreachable. PR14722
+ if (FT->isVarArg()) {
+ // Add all of the arguments in their promoted form to the arg list.
+ for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+ Type *PTy = getPromotedType((*AI)->getType());
+ Value *NewArg = *AI;
+ if (PTy != (*AI)->getType()) {
+ // Must promote to pass through va_arg area!
+ Instruction::CastOps opcode =
+ CastInst::getCastOpcode(*AI, false, PTy, false);
+ NewArg = Builder.CreateCast(opcode, *AI, PTy);
+ }
+ Args.push_back(NewArg);
+
+ // Add any parameter attributes.
+ ArgAttrs.push_back(CallerPAL.getParamAttributes(i));
+ }
+ }
+ }
+
+ AttributeSet FnAttrs = CallerPAL.getFnAttributes();
+
+ if (NewRetTy->isVoidTy())
+ Caller->setName(""); // Void type should not have a name.
+
+ assert((ArgAttrs.size() == FT->getNumParams() || FT->isVarArg()) &&
+ "missing argument attributes");
+ AttributeList NewCallerPAL = AttributeList::get(
+ Ctx, FnAttrs, AttributeSet::get(Ctx, RAttrs), ArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call.getOperandBundlesAsDefs(OpBundles);
+
+ CallBase *NewCall;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ NewCall = Builder.CreateInvoke(Callee, II->getNormalDest(),
+ II->getUnwindDest(), Args, OpBundles);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
+ NewCall = Builder.CreateCallBr(Callee, CBI->getDefaultDest(),
+ CBI->getIndirectDests(), Args, OpBundles);
+ } else {
+ NewCall = Builder.CreateCall(Callee, Args, OpBundles);
+ cast<CallInst>(NewCall)->setTailCallKind(
+ cast<CallInst>(Caller)->getTailCallKind());
+ }
+ NewCall->takeName(Caller);
+ NewCall->setCallingConv(Call.getCallingConv());
+ NewCall->setAttributes(NewCallerPAL);
+
+ // Preserve prof metadata if any.
+ NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});
+
+ // Insert a cast of the return type as necessary.
+ Instruction *NC = NewCall;
+ Value *NV = NC;
+ if (OldRetTy != NV->getType() && !Caller->use_empty()) {
+ if (!NV->getType()->isVoidTy()) {
+ NV = NC = CastInst::CreateBitOrPointerCast(NC, OldRetTy);
+ NC->setDebugLoc(Caller->getDebugLoc());
+
+ // If this is an invoke/callbr instruction, we should insert it after the
+ // first non-phi instruction in the normal successor block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+ BasicBlock::iterator I = II->getNormalDest()->getFirstInsertionPt();
+ InsertNewInstBefore(NC, *I);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(Caller)) {
+ BasicBlock::iterator I = CBI->getDefaultDest()->getFirstInsertionPt();
+ InsertNewInstBefore(NC, *I);
+ } else {
+ // Otherwise, it's a call, just insert cast right after the call.
+ InsertNewInstBefore(NC, *Caller);
+ }
+ Worklist.pushUsersToWorkList(*Caller);
+ } else {
+ NV = UndefValue::get(Caller->getType());
+ }
+ }
+
+ if (!Caller->use_empty())
+ replaceInstUsesWith(*Caller, NV);
+ else if (Caller->hasValueHandle()) {
+ if (OldRetTy == NV->getType())
+ ValueHandleBase::ValueIsRAUWd(Caller, NV);
+ else
+ // We cannot call ValueIsRAUWd with a different type, and the
+ // actual tracked value will disappear.
+ ValueHandleBase::ValueIsDeleted(Caller);
+ }
+
+ eraseInstFromFunction(*Caller);
+ return true;
+}
+
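transformConstExprCastCall above undoes a call made through a bitcast of the function itself by casting the arguments (and, if needed, the return value) instead, and it only fires when those casts are bit- or no-op pointer casts. A rough plain-C++ analogy of the pointer case, with a made-up callee byteLen; this sketches the shape of the rewrite, not the IR-level legality checks:

#include <cassert>
#include <cstddef>

// Made-up callee: the "real" function, whose parameter type differs from the
// pointer type the caller holds only by a no-op pointer cast.
static std::size_t byteLen(const unsigned char *p) {
  std::size_t n = 0;
  while (p[n] != 0) ++n;
  return n;
}

int main() {
  const char text[] = "abc";
  // Instead of calling through a function pointer of the "wrong" type
  // (the pattern the transform removes), cast the argument to the real
  // parameter type and call the function directly.
  std::size_t n = byteLen(reinterpret_cast<const unsigned char *>(text));
  assert(n == 3);
  return 0;
}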
+/// Turn a call to a function created by init_trampoline / adjust_trampoline
+/// intrinsic pair into a direct call to the underlying function.
+Instruction *
InstCombinerImpl::transformCallThroughTrampoline(CallBase &Call,
IntrinsicInst &Tramp) {
- Value *Callee = Call.getCalledOperand();
- Type *CalleeTy = Callee->getType();
- FunctionType *FTy = Call.getFunctionType();
- AttributeList Attrs = Call.getAttributes();
-
- // If the call already has the 'nest' attribute somewhere then give up -
- // otherwise 'nest' would occur twice after splicing in the chain.
- if (Attrs.hasAttrSomewhere(Attribute::Nest))
- return nullptr;
-
- Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
- FunctionType *NestFTy = NestF->getFunctionType();
-
- AttributeList NestAttrs = NestF->getAttributes();
- if (!NestAttrs.isEmpty()) {
- unsigned NestArgNo = 0;
- Type *NestTy = nullptr;
- AttributeSet NestAttr;
-
- // Look for a parameter marked with the 'nest' attribute.
- for (FunctionType::param_iterator I = NestFTy->param_begin(),
- E = NestFTy->param_end();
- I != E; ++NestArgNo, ++I) {
- AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
- if (AS.hasAttribute(Attribute::Nest)) {
- // Record the parameter type and any other attributes.
- NestTy = *I;
- NestAttr = AS;
- break;
- }
- }
-
- if (NestTy) {
- std::vector<Value*> NewArgs;
- std::vector<AttributeSet> NewArgAttrs;
- NewArgs.reserve(Call.arg_size() + 1);
- NewArgAttrs.reserve(Call.arg_size());
-
- // Insert the nest argument into the call argument list, which may
- // mean appending it. Likewise for attributes.
-
- {
- unsigned ArgNo = 0;
- auto I = Call.arg_begin(), E = Call.arg_end();
- do {
- if (ArgNo == NestArgNo) {
- // Add the chain argument and attributes.
- Value *NestVal = Tramp.getArgOperand(2);
- if (NestVal->getType() != NestTy)
- NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
- NewArgs.push_back(NestVal);
- NewArgAttrs.push_back(NestAttr);
- }
-
- if (I == E)
- break;
-
- // Add the original argument and attributes.
- NewArgs.push_back(*I);
- NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
-
- ++ArgNo;
- ++I;
- } while (true);
- }
-
- // The trampoline may have been bitcast to a bogus type (FTy).
- // Handle this by synthesizing a new function type, equal to FTy
- // with the chain parameter inserted.
-
- std::vector<Type*> NewTypes;
- NewTypes.reserve(FTy->getNumParams()+1);
-
- // Insert the chain's type into the list of parameter types, which may
- // mean appending it.
- {
- unsigned ArgNo = 0;
- FunctionType::param_iterator I = FTy->param_begin(),
- E = FTy->param_end();
-
- do {
- if (ArgNo == NestArgNo)
- // Add the chain's type.
- NewTypes.push_back(NestTy);
-
- if (I == E)
- break;
-
- // Add the original type.
- NewTypes.push_back(*I);
-
- ++ArgNo;
- ++I;
- } while (true);
- }
-
- // Replace the trampoline call with a direct call. Let the generic
- // code sort out any function type mismatches.
- FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
- FTy->isVarArg());
- Constant *NewCallee =
- NestF->getType() == PointerType::getUnqual(NewFTy) ?
- NestF : ConstantExpr::getBitCast(NestF,
- PointerType::getUnqual(NewFTy));
- AttributeList NewPAL =
- AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), NewArgAttrs);
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- Call.getOperandBundlesAsDefs(OpBundles);
-
- Instruction *NewCaller;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
- NewCaller = InvokeInst::Create(NewFTy, NewCallee,
- II->getNormalDest(), II->getUnwindDest(),
- NewArgs, OpBundles);
- cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
- cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
- } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
- NewCaller =
- CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
- CBI->getIndirectDests(), NewArgs, OpBundles);
- cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
- cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
- } else {
- NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
- cast<CallInst>(NewCaller)->setTailCallKind(
- cast<CallInst>(Call).getTailCallKind());
- cast<CallInst>(NewCaller)->setCallingConv(
- cast<CallInst>(Call).getCallingConv());
- cast<CallInst>(NewCaller)->setAttributes(NewPAL);
- }
- NewCaller->setDebugLoc(Call.getDebugLoc());
-
- return NewCaller;
- }
- }
-
- // Replace the trampoline call with a direct call. Since there is no 'nest'
- // parameter, there is no need to adjust the argument list. Let the generic
- // code sort out any function type mismatches.
- Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
- Call.setCalledFunction(FTy, NewCallee);
- return &Call;
-}
+ Value *Callee = Call.getCalledOperand();
+ Type *CalleeTy = Callee->getType();
+ FunctionType *FTy = Call.getFunctionType();
+ AttributeList Attrs = Call.getAttributes();
+
+ // If the call already has the 'nest' attribute somewhere then give up -
+ // otherwise 'nest' would occur twice after splicing in the chain.
+ if (Attrs.hasAttrSomewhere(Attribute::Nest))
+ return nullptr;
+
+ Function *NestF = cast<Function>(Tramp.getArgOperand(1)->stripPointerCasts());
+ FunctionType *NestFTy = NestF->getFunctionType();
+
+ AttributeList NestAttrs = NestF->getAttributes();
+ if (!NestAttrs.isEmpty()) {
+ unsigned NestArgNo = 0;
+ Type *NestTy = nullptr;
+ AttributeSet NestAttr;
+
+ // Look for a parameter marked with the 'nest' attribute.
+ for (FunctionType::param_iterator I = NestFTy->param_begin(),
+ E = NestFTy->param_end();
+ I != E; ++NestArgNo, ++I) {
+ AttributeSet AS = NestAttrs.getParamAttributes(NestArgNo);
+ if (AS.hasAttribute(Attribute::Nest)) {
+ // Record the parameter type and any other attributes.
+ NestTy = *I;
+ NestAttr = AS;
+ break;
+ }
+ }
+
+ if (NestTy) {
+ std::vector<Value*> NewArgs;
+ std::vector<AttributeSet> NewArgAttrs;
+ NewArgs.reserve(Call.arg_size() + 1);
+ NewArgAttrs.reserve(Call.arg_size());
+
+ // Insert the nest argument into the call argument list, which may
+ // mean appending it. Likewise for attributes.
+
+ {
+ unsigned ArgNo = 0;
+ auto I = Call.arg_begin(), E = Call.arg_end();
+ do {
+ if (ArgNo == NestArgNo) {
+ // Add the chain argument and attributes.
+ Value *NestVal = Tramp.getArgOperand(2);
+ if (NestVal->getType() != NestTy)
+ NestVal = Builder.CreateBitCast(NestVal, NestTy, "nest");
+ NewArgs.push_back(NestVal);
+ NewArgAttrs.push_back(NestAttr);
+ }
+
+ if (I == E)
+ break;
+
+ // Add the original argument and attributes.
+ NewArgs.push_back(*I);
+ NewArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // The trampoline may have been bitcast to a bogus type (FTy).
+ // Handle this by synthesizing a new function type, equal to FTy
+ // with the chain parameter inserted.
+
+ std::vector<Type*> NewTypes;
+ NewTypes.reserve(FTy->getNumParams()+1);
+
+ // Insert the chain's type into the list of parameter types, which may
+ // mean appending it.
+ {
+ unsigned ArgNo = 0;
+ FunctionType::param_iterator I = FTy->param_begin(),
+ E = FTy->param_end();
+
+ do {
+ if (ArgNo == NestArgNo)
+ // Add the chain's type.
+ NewTypes.push_back(NestTy);
+
+ if (I == E)
+ break;
+
+ // Add the original type.
+ NewTypes.push_back(*I);
+
+ ++ArgNo;
+ ++I;
+ } while (true);
+ }
+
+ // Replace the trampoline call with a direct call. Let the generic
+ // code sort out any function type mismatches.
+ FunctionType *NewFTy = FunctionType::get(FTy->getReturnType(), NewTypes,
+ FTy->isVarArg());
+ Constant *NewCallee =
+ NestF->getType() == PointerType::getUnqual(NewFTy) ?
+ NestF : ConstantExpr::getBitCast(NestF,
+ PointerType::getUnqual(NewFTy));
+ AttributeList NewPAL =
+ AttributeList::get(FTy->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), NewArgAttrs);
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Call.getOperandBundlesAsDefs(OpBundles);
+
+ Instruction *NewCaller;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Call)) {
+ NewCaller = InvokeInst::Create(NewFTy, NewCallee,
+ II->getNormalDest(), II->getUnwindDest(),
+ NewArgs, OpBundles);
+ cast<InvokeInst>(NewCaller)->setCallingConv(II->getCallingConv());
+ cast<InvokeInst>(NewCaller)->setAttributes(NewPAL);
+ } else if (CallBrInst *CBI = dyn_cast<CallBrInst>(&Call)) {
+ NewCaller =
+ CallBrInst::Create(NewFTy, NewCallee, CBI->getDefaultDest(),
+ CBI->getIndirectDests(), NewArgs, OpBundles);
+ cast<CallBrInst>(NewCaller)->setCallingConv(CBI->getCallingConv());
+ cast<CallBrInst>(NewCaller)->setAttributes(NewPAL);
+ } else {
+ NewCaller = CallInst::Create(NewFTy, NewCallee, NewArgs, OpBundles);
+ cast<CallInst>(NewCaller)->setTailCallKind(
+ cast<CallInst>(Call).getTailCallKind());
+ cast<CallInst>(NewCaller)->setCallingConv(
+ cast<CallInst>(Call).getCallingConv());
+ cast<CallInst>(NewCaller)->setAttributes(NewPAL);
+ }
+ NewCaller->setDebugLoc(Call.getDebugLoc());
+
+ return NewCaller;
+ }
+ }
+
+ // Replace the trampoline call with a direct call. Since there is no 'nest'
+ // parameter, there is no need to adjust the argument list. Let the generic
+ // code sort out any function type mismatches.
+ Constant *NewCallee = ConstantExpr::getBitCast(NestF, CalleeTy);
+ Call.setCalledFunction(FTy, NewCallee);
+ return &Call;
+}
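Before the diff moves on to InstCombineCasts.cpp: the argument-splicing loop in transformCallThroughTrampoline inserts the trampoline's chain value at the position of the callee's 'nest' parameter, appending it when that position is one past the last original argument. A small standard-C++ model of just that splice (the helper spliceNestArg and its string arguments are invented for illustration; nothing here is LLVM API):

#include <cassert>
#include <string>
#include <vector>

// Model of splicing the trampoline's chain value into the call's argument
// list at the index of the callee's 'nest' parameter, mirroring the
// do/while loop in the function above.
static std::vector<std::string> spliceNestArg(std::vector<std::string> args,
                                              unsigned nestArgNo,
                                              const std::string &chain) {
  std::vector<std::string> out;
  out.reserve(args.size() + 1);
  unsigned argNo = 0;
  auto it = args.begin();
  while (true) {
    if (argNo == nestArgNo)
      out.push_back(chain);          // add the chain argument here
    if (it == args.end())
      break;
    out.push_back(*it);              // add the original argument
    ++argNo;
    ++it;
  }
  return out;
}

int main() {
  std::vector<std::string> args = {"a", "b"};
  assert((spliceNestArg(args, 0, "nest") ==
          std::vector<std::string>{"nest", "a", "b"}));
  assert((spliceNestArg(args, 2, "nest") ==
          std::vector<std::string>{"a", "b", "nest"}));
  return 0;
}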
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
index d0bb02568d..07e68c4441 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -1,99 +1,99 @@
-//===- InstCombineCasts.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for cast operations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- InstCombineCasts.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for cast operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <numeric>
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Analyze 'Val', seeing if it is a simple linear expression.
-/// If so, decompose it, returning some value X, such that Val is
-/// X*Scale+Offset.
-///
-static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
- uint64_t &Offset) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
- Offset = CI->getZExtValue();
- Scale = 0;
- return ConstantInt::get(Val->getType(), 0);
- }
-
- if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
- // Cannot look past anything that might overflow.
- OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
- if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
- Scale = 1;
- Offset = 0;
- return Val;
- }
-
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
- if (I->getOpcode() == Instruction::Shl) {
- // This is a value scaled by '1 << the shift amt'.
- Scale = UINT64_C(1) << RHS->getZExtValue();
- Offset = 0;
- return I->getOperand(0);
- }
-
- if (I->getOpcode() == Instruction::Mul) {
- // This value is scaled by 'RHS'.
- Scale = RHS->getZExtValue();
- Offset = 0;
- return I->getOperand(0);
- }
-
- if (I->getOpcode() == Instruction::Add) {
- // We have X+C. Check to see if we really have (X*C2)+C1,
- // where C1 is divisible by C2.
- unsigned SubScale;
- Value *SubVal =
- decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
- Offset += RHS->getZExtValue();
- Scale = SubScale;
- return SubVal;
- }
- }
- }
-
- // Otherwise, we can't look past this.
- Scale = 1;
- Offset = 0;
- return Val;
-}
-
-/// If we find a cast of an allocation instruction, try to eliminate the cast by
-/// moving the type information into the alloc.
+#include <numeric>
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Analyze 'Val', seeing if it is a simple linear expression.
+/// If so, decompose it, returning some value X, such that Val is
+/// X*Scale+Offset.
+///
+static Value *decomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+ uint64_t &Offset) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ Offset = CI->getZExtValue();
+ Scale = 0;
+ return ConstantInt::get(Val->getType(), 0);
+ }
+
+ if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) {
+ // Cannot look past anything that might overflow.
+ OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val);
+ if (OBI && !OBI->hasNoUnsignedWrap() && !OBI->hasNoSignedWrap()) {
+ Scale = 1;
+ Offset = 0;
+ return Val;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ if (I->getOpcode() == Instruction::Shl) {
+ // This is a value scaled by '1 << the shift amt'.
+ Scale = UINT64_C(1) << RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Mul) {
+ // This value is scaled by 'RHS'.
+ Scale = RHS->getZExtValue();
+ Offset = 0;
+ return I->getOperand(0);
+ }
+
+ if (I->getOpcode() == Instruction::Add) {
+ // We have X+C. Check to see if we really have (X*C2)+C1,
+ // where C1 is divisible by C2.
+ unsigned SubScale;
+ Value *SubVal =
+ decomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+ Offset += RHS->getZExtValue();
+ Scale = SubScale;
+ return SubVal;
+ }
+ }
+ }
+
+ // Otherwise, we can't look past this.
+ Scale = 1;
+ Offset = 0;
+ return Val;
+}
+
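decomposeSimpleLinearExpr above rewrites a value as X*Scale+Offset by peeling shl-by-constant, mul-by-constant and add-of-constant nodes. The toy model below mirrors those cases on a hand-rolled Expr tree; it is a sketch only (no LLVM types, and it skips the overflow-flag and divisibility caveats the real code handles):

#include <cassert>
#include <cstdint>

// Toy expression tree standing in for LLVM Value/BinaryOperator.
struct Expr {
  enum Kind { Leaf, Const, Shl, Mul, Add } kind;
  std::uint64_t c;        // constant operand (or the constant itself)
  const Expr *sub;        // non-constant operand
};

// Returns the "X" of Val == X*Scale + Offset, filling in Scale and Offset.
static const Expr *decompose(const Expr *v, std::uint64_t &scale,
                             std::uint64_t &offset) {
  switch (v->kind) {
  case Expr::Const:
    scale = 0; offset = v->c; return nullptr;          // pure constant
  case Expr::Shl:
    scale = std::uint64_t(1) << v->c; offset = 0; return v->sub;
  case Expr::Mul:
    scale = v->c; offset = 0; return v->sub;
  case Expr::Add: {                                    // X*C2 + C1
    const Expr *x = decompose(v->sub, scale, offset);
    offset += v->c;
    return x;
  }
  default:
    scale = 1; offset = 0; return v;                   // opaque leaf
  }
}

int main() {
  Expr n{Expr::Leaf, 0, nullptr};
  Expr shl{Expr::Shl, 2, &n};      // n << 2
  Expr add{Expr::Add, 8, &shl};    // (n << 2) + 8
  std::uint64_t scale = 0, offset = 0;
  const Expr *x = decompose(&add, scale, offset);
  assert(x == &n && scale == 4 && offset == 8);
  return 0;
}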
+/// If we find a cast of an allocation instruction, try to eliminate the cast by
+/// moving the type information into the alloc.
Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
- PointerType *PTy = cast<PointerType>(CI.getType());
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(&AI);
-
- // Get the type really allocated and the type casted to.
- Type *AllocElTy = AI.getAllocatedType();
- Type *CastElTy = PTy->getElementType();
- if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
-
+ PointerType *PTy = cast<PointerType>(CI.getType());
+
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(&AI);
+
+ // Get the type really allocated and the type casted to.
+ Type *AllocElTy = AI.getAllocatedType();
+ Type *CastElTy = PTy->getElementType();
+ if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
+
// This optimisation does not work for cases where the cast type
// is scalable and the allocated type is not. This because we need to
// know how many times the casted type fits into the allocated type.
@@ -106,441 +106,441 @@ Instruction *InstCombinerImpl::PromoteCastOfAllocation(BitCastInst &CI,
bool CastIsScalable = isa<ScalableVectorType>(CastElTy);
if (AllocIsScalable != CastIsScalable) return nullptr;
- Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
- Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
- if (CastElTyAlign < AllocElTyAlign) return nullptr;
-
- // If the allocation has multiple uses, only promote it if we are strictly
- // increasing the alignment of the resultant allocation. If we keep it the
- // same, we open the door to infinite loops of various kinds.
- if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
-
+ Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
+ Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
+ if (CastElTyAlign < AllocElTyAlign) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we are strictly
+ // increasing the alignment of the resultant allocation. If we keep it the
+ // same, we open the door to infinite loops of various kinds.
+ if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return nullptr;
+
// The alloc and cast types should be either both fixed or both scalable.
uint64_t AllocElTySize = DL.getTypeAllocSize(AllocElTy).getKnownMinSize();
uint64_t CastElTySize = DL.getTypeAllocSize(CastElTy).getKnownMinSize();
- if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
-
- // If the allocation has multiple uses, only promote it if we're not
- // shrinking the amount of memory being allocated.
+ if (CastElTySize == 0 || AllocElTySize == 0) return nullptr;
+
+ // If the allocation has multiple uses, only promote it if we're not
+ // shrinking the amount of memory being allocated.
uint64_t AllocElTyStoreSize = DL.getTypeStoreSize(AllocElTy).getKnownMinSize();
uint64_t CastElTyStoreSize = DL.getTypeStoreSize(CastElTy).getKnownMinSize();
- if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
-
- // See if we can satisfy the modulus by pulling a scale out of the array
- // size argument.
- unsigned ArraySizeScale;
- uint64_t ArrayOffset;
- Value *NumElements = // See if the array size is a decomposable linear expr.
- decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
-
- // If we can now satisfy the modulus, by using a non-1 scale, we really can
- // do the xform.
- if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
- (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr;
-
+ if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return nullptr;
+
+ // See if we can satisfy the modulus by pulling a scale out of the array
+ // size argument.
+ unsigned ArraySizeScale;
+ uint64_t ArrayOffset;
+ Value *NumElements = // See if the array size is a decomposable linear expr.
+ decomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+
+ // If we can now satisfy the modulus, by using a non-1 scale, we really can
+ // do the xform.
+ if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+ (AllocElTySize*ArrayOffset ) % CastElTySize != 0) return nullptr;
+
// We don't currently support arrays of scalable types.
assert(!AllocIsScalable || (ArrayOffset == 1 && ArraySizeScale == 0));
- unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
- Value *Amt = nullptr;
- if (Scale == 1) {
- Amt = NumElements;
- } else {
- Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
- // Insert before the alloca, not before the cast.
- Amt = Builder.CreateMul(Amt, NumElements);
- }
-
- if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
- Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
- Offset, true);
- Amt = Builder.CreateAdd(Amt, Off);
- }
-
- AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
- New->setAlignment(AI.getAlign());
- New->takeName(&AI);
- New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
-
- // If the allocation has multiple real uses, insert a cast and change all
- // things that used it to use the new cast. This will also hack on CI, but it
- // will die soon.
- if (!AI.hasOneUse()) {
- // New is the allocation instruction, pointer typed. AI is the original
- // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
- Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
- replaceInstUsesWith(AI, NewCast);
- eraseInstFromFunction(AI);
- }
- return replaceInstUsesWith(CI, New);
-}
-
-/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
-/// true for, actually insert the code to evaluate the expression.
+ unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+ Value *Amt = nullptr;
+ if (Scale == 1) {
+ Amt = NumElements;
+ } else {
+ Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
+ // Insert before the alloca, not before the cast.
+ Amt = Builder.CreateMul(Amt, NumElements);
+ }
+
+ if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+ Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
+ Offset, true);
+ Amt = Builder.CreateAdd(Amt, Off);
+ }
+
+ AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
+ New->setAlignment(AI.getAlign());
+ New->takeName(&AI);
+ New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
+
+ // If the allocation has multiple real uses, insert a cast and change all
+ // things that used it to use the new cast. This will also hack on CI, but it
+ // will die soon.
+ if (!AI.hasOneUse()) {
+ // New is the allocation instruction, pointer typed. AI is the original
+ // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+ Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
+ replaceInstUsesWith(AI, NewCast);
+ eraseInstFromFunction(AI);
+ }
+ return replaceInstUsesWith(CI, New);
+}
+
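The rescaling arithmetic in PromoteCastOfAllocation only fires when both products divide evenly by the cast element size; with those checks passing, an i8 allocation of 4*n+8 bytes viewed through an i32* becomes an i32 allocation of n+2 elements. A plain-integer check of that arithmetic, with the element sizes for this hypothetical i8/i32 case written out by hand:

#include <cassert>
#include <cstdint>

int main() {
  // alloca i8, i32 (4*n + 8) bitcast to i32*: element sizes in bytes.
  const std::uint64_t allocElSize = 1;     // size of i8
  const std::uint64_t castElSize = 4;      // size of i32
  const std::uint64_t arraySizeScale = 4, arrayOffset = 8; // from the decomposition
  // The transform bails unless both products divide evenly.
  assert((allocElSize * arraySizeScale) % castElSize == 0);
  assert((allocElSize * arrayOffset) % castElSize == 0);
  const std::uint64_t scale = (allocElSize * arraySizeScale) / castElSize;  // 1
  const std::uint64_t offset = (allocElSize * arrayOffset) / castElSize;    // 2
  // New allocation: alloca i32, i32 (scale*n + offset), i.e. n + 2 elements,
  // which covers exactly the same number of bytes for every n.
  for (std::uint64_t n = 0; n < 16; ++n)
    assert((scale * n + offset) * castElSize ==
           (arraySizeScale * n + arrayOffset) * allocElSize);
  return 0;
}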
+/// Given an expression that CanEvaluateTruncated or CanEvaluateSExtd returns
+/// true for, actually insert the code to evaluate the expression.
Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty,
bool isSigned) {
- if (Constant *C = dyn_cast<Constant>(V)) {
- C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
- // If we got a constantexpr back, try to simplify it with DL info.
- return ConstantFoldConstant(C, DL, &TLI);
- }
-
- // Otherwise, it must be an instruction.
- Instruction *I = cast<Instruction>(V);
- Instruction *Res = nullptr;
- unsigned Opc = I->getOpcode();
- switch (Opc) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::Shl:
- case Instruction::UDiv:
- case Instruction::URem: {
- Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
- Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
- Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // If the source type of the cast is the type we're trying for then we can
- // just return the source. There's no need to insert it because it is not
- // new.
- if (I->getOperand(0)->getType() == Ty)
- return I->getOperand(0);
-
- // Otherwise, must be the same type of cast, so just reinsert a new one.
- // This also handles the case of zext(trunc(x)) -> zext(x).
- Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
- Opc == Instruction::SExt);
- break;
- case Instruction::Select: {
- Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
- Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
- Res = SelectInst::Create(I->getOperand(0), True, False);
- break;
- }
- case Instruction::PHI: {
- PHINode *OPN = cast<PHINode>(I);
- PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
- for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
- Value *V =
- EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
- NPN->addIncoming(V, OPN->getIncomingBlock(i));
- }
- Res = NPN;
- break;
- }
- default:
- // TODO: Can handle more cases here.
- llvm_unreachable("Unreachable!");
- }
-
- Res->takeName(I);
- return InsertNewInstWith(Res, *I);
-}
-
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
+ // If we got a constantexpr back, try to simplify it with DL info.
+ return ConstantFoldConstant(C, DL, &TLI);
+ }
+
+ // Otherwise, it must be an instruction.
+ Instruction *I = cast<Instruction>(V);
+ Instruction *Res = nullptr;
+ unsigned Opc = I->getOpcode();
+ switch (Opc) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl:
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
+ Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS);
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // If the source type of the cast is the type we're trying for then we can
+ // just return the source. There's no need to insert it because it is not
+ // new.
+ if (I->getOperand(0)->getType() == Ty)
+ return I->getOperand(0);
+
+ // Otherwise, must be the same type of cast, so just reinsert a new one.
+ // This also handles the case of zext(trunc(x)) -> zext(x).
+ Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty,
+ Opc == Instruction::SExt);
+ break;
+ case Instruction::Select: {
+ Value *True = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
+ Value *False = EvaluateInDifferentType(I->getOperand(2), Ty, isSigned);
+ Res = SelectInst::Create(I->getOperand(0), True, False);
+ break;
+ }
+ case Instruction::PHI: {
+ PHINode *OPN = cast<PHINode>(I);
+ PHINode *NPN = PHINode::Create(Ty, OPN->getNumIncomingValues());
+ for (unsigned i = 0, e = OPN->getNumIncomingValues(); i != e; ++i) {
+ Value *V =
+ EvaluateInDifferentType(OPN->getIncomingValue(i), Ty, isSigned);
+ NPN->addIncoming(V, OPN->getIncomingBlock(i));
+ }
+ Res = NPN;
+ break;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ llvm_unreachable("Unreachable!");
+ }
+
+ Res->takeName(I);
+ return InsertNewInstWith(Res, *I);
+}
+
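EvaluateInDifferentType rebuilds the whole expression directly in the narrow type; for the wrapping operators this is sound because truncation commutes with them. A quick plain-integer check of that property, with uint32_t standing in for the wide type and uint16_t for the narrow one:

#include <cassert>
#include <cstdint>

int main() {
  for (std::uint32_t x = 0; x < 200000; x += 977) {
    for (std::uint32_t y = 0; y < 200000; y += 1009) {
      // Evaluate wide, then truncate the final result...
      std::uint16_t wideThenTrunc = static_cast<std::uint16_t>((x + y) * y);
      // ...equals evaluating every step on already-truncated operands,
      // which is what the narrow rebuild produces for Add/Mul and friends.
      std::uint16_t nx = static_cast<std::uint16_t>(x);
      std::uint16_t ny = static_cast<std::uint16_t>(y);
      std::uint16_t sum = static_cast<std::uint16_t>(nx + ny);
      std::uint16_t narrow =
          static_cast<std::uint16_t>(static_cast<std::uint32_t>(sum) * ny);
      assert(wideThenTrunc == narrow);
    }
  }
  return 0;
}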
Instruction::CastOps
InstCombinerImpl::isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2) {
- Type *SrcTy = CI1->getSrcTy();
- Type *MidTy = CI1->getDestTy();
- Type *DstTy = CI2->getDestTy();
-
- Instruction::CastOps firstOp = CI1->getOpcode();
- Instruction::CastOps secondOp = CI2->getOpcode();
- Type *SrcIntPtrTy =
- SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
- Type *MidIntPtrTy =
- MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
- Type *DstIntPtrTy =
- DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
- unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
- DstTy, SrcIntPtrTy, MidIntPtrTy,
- DstIntPtrTy);
-
- // We don't want to form an inttoptr or ptrtoint that converts to an integer
- // type that differs from the pointer size.
- if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
- (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
- Res = 0;
-
- return Instruction::CastOps(Res);
-}
-
-/// Implement the transforms common to all CastInst visitors.
+ Type *SrcTy = CI1->getSrcTy();
+ Type *MidTy = CI1->getDestTy();
+ Type *DstTy = CI2->getDestTy();
+
+ Instruction::CastOps firstOp = CI1->getOpcode();
+ Instruction::CastOps secondOp = CI2->getOpcode();
+ Type *SrcIntPtrTy =
+ SrcTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(SrcTy) : nullptr;
+ Type *MidIntPtrTy =
+ MidTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(MidTy) : nullptr;
+ Type *DstIntPtrTy =
+ DstTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(DstTy) : nullptr;
+ unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy,
+ DstTy, SrcIntPtrTy, MidIntPtrTy,
+ DstIntPtrTy);
+
+ // We don't want to form an inttoptr or ptrtoint that converts to an integer
+ // type that differs from the pointer size.
+ if ((Res == Instruction::IntToPtr && SrcTy != DstIntPtrTy) ||
+ (Res == Instruction::PtrToInt && DstTy != SrcIntPtrTy))
+ Res = 0;
+
+ return Instruction::CastOps(Res);
+}
+
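isEliminableCastPair asks CastInst's pairing table whether two stacked casts collapse into one, and then vetoes inttoptr/ptrtoint results whose integer width differs from the pointer size (a guard a plain-integer example cannot show). Two of the classic eliminable pairs, checked with ordinary integer conversions:

#include <cassert>
#include <cstdint>

int main() {
  for (std::uint32_t x = 0; x < 100000; x += 97) {
    // zext(zext(v)): i8 -> i16 -> i32 folds to a single i8 -> i32 zext.
    std::uint8_t v = static_cast<std::uint8_t>(x);
    assert(static_cast<std::uint32_t>(static_cast<std::uint16_t>(v)) ==
           static_cast<std::uint32_t>(v));
    // trunc(trunc(x)): i32 -> i16 -> i8 folds to a single i32 -> i8 trunc.
    assert(static_cast<std::uint8_t>(static_cast<std::uint16_t>(x)) ==
           static_cast<std::uint8_t>(x));
  }
  return 0;
}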
+/// Implement the transforms common to all CastInst visitors.
Instruction *InstCombinerImpl::commonCastTransforms(CastInst &CI) {
- Value *Src = CI.getOperand(0);
-
- // Try to eliminate a cast of a cast.
- if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
- if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
- // The first cast (CSrc) is eliminable so we need to fix up or replace
- // the second cast (CI). CSrc will then have a good chance of being dead.
- auto *Ty = CI.getType();
- auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
- // Point debug users of the dying cast to the new one.
- if (CSrc->hasOneUse())
- replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
- return Res;
- }
- }
-
- if (auto *Sel = dyn_cast<SelectInst>(Src)) {
- // We are casting a select. Try to fold the cast into the select if the
- // select does not have a compare instruction with matching operand types
- // or the select is likely better done in a narrow type.
- // Creating a select with operands that are different sizes than its
- // condition may inhibit other folds and lead to worse codegen.
- auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
- if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType() ||
- (CI.getOpcode() == Instruction::Trunc &&
- shouldChangeType(CI.getSrcTy(), CI.getType()))) {
- if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
- replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
- return NV;
- }
- }
- }
-
- // If we are casting a PHI, then fold the cast into the PHI.
- if (auto *PN = dyn_cast<PHINode>(Src)) {
- // Don't do this if it would create a PHI node with an illegal type from a
- // legal type.
- if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
- shouldChangeType(CI.getSrcTy(), CI.getType()))
- if (Instruction *NV = foldOpIntoPhi(CI, PN))
- return NV;
- }
-
- return nullptr;
-}
-
-/// Constants and extensions/truncates from the destination type are always
-/// free to be evaluated in that type. This is a helper for canEvaluate*.
-static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
- if (isa<Constant>(V))
- return true;
- Value *X;
- if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
- X->getType() == Ty)
- return true;
-
- return false;
-}
-
-/// Filter out values that we can not evaluate in the destination type for free.
-/// This is a helper for canEvaluate*.
-static bool canNotEvaluateInType(Value *V, Type *Ty) {
- assert(!isa<Constant>(V) && "Constant should already be handled.");
- if (!isa<Instruction>(V))
- return true;
- // We don't extend or shrink something that has multiple uses -- doing so
- // would require duplicating the instruction which isn't profitable.
- if (!V->hasOneUse())
- return true;
-
- return false;
-}
-
-/// Return true if we can evaluate the specified expression tree as type Ty
-/// instead of its larger type, and arrive with the same value.
-/// This is used by code that tries to eliminate truncates.
-///
-/// Ty will always be a type smaller than V. We should return true if trunc(V)
-/// can be computed by computing V in the smaller type. If V is an instruction,
-/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
-/// makes sense if x and y can be efficiently truncated.
-///
-/// This function works on both vectors and scalars.
-///
+ Value *Src = CI.getOperand(0);
+
+ // Try to eliminate a cast of a cast.
+ if (auto *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast
+ if (Instruction::CastOps NewOpc = isEliminableCastPair(CSrc, &CI)) {
+ // The first cast (CSrc) is eliminable so we need to fix up or replace
+ // the second cast (CI). CSrc will then have a good chance of being dead.
+ auto *Ty = CI.getType();
+ auto *Res = CastInst::Create(NewOpc, CSrc->getOperand(0), Ty);
+ // Point debug users of the dying cast to the new one.
+ if (CSrc->hasOneUse())
+ replaceAllDbgUsesWith(*CSrc, *Res, CI, DT);
+ return Res;
+ }
+ }
+
+ if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+ // We are casting a select. Try to fold the cast into the select if the
+ // select does not have a compare instruction with matching operand types
+ // or the select is likely better done in a narrow type.
+ // Creating a select with operands that are different sizes than its
+ // condition may inhibit other folds and lead to worse codegen.
+ auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
+ if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType() ||
+ (CI.getOpcode() == Instruction::Trunc &&
+ shouldChangeType(CI.getSrcTy(), CI.getType()))) {
+ if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
+ replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
+ return NV;
+ }
+ }
+ }
+
+ // If we are casting a PHI, then fold the cast into the PHI.
+ if (auto *PN = dyn_cast<PHINode>(Src)) {
+ // Don't do this if it would create a PHI node with an illegal type from a
+ // legal type.
+ if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
+ shouldChangeType(CI.getSrcTy(), CI.getType()))
+ if (Instruction *NV = foldOpIntoPhi(CI, PN))
+ return NV;
+ }
+
+ return nullptr;
+}
+
+/// Constants and extensions/truncates from the destination type are always
+/// free to be evaluated in that type. This is a helper for canEvaluate*.
+static bool canAlwaysEvaluateInType(Value *V, Type *Ty) {
+ if (isa<Constant>(V))
+ return true;
+ Value *X;
+ if ((match(V, m_ZExtOrSExt(m_Value(X))) || match(V, m_Trunc(m_Value(X)))) &&
+ X->getType() == Ty)
+ return true;
+
+ return false;
+}
+
+/// Filter out values that we can not evaluate in the destination type for free.
+/// This is a helper for canEvaluate*.
+static bool canNotEvaluateInType(Value *V, Type *Ty) {
+ assert(!isa<Constant>(V) && "Constant should already be handled.");
+ if (!isa<Instruction>(V))
+ return true;
+ // We don't extend or shrink something that has multiple uses -- doing so
+ // would require duplicating the instruction which isn't profitable.
+ if (!V->hasOneUse())
+ return true;
+
+ return false;
+}
+
+/// Return true if we can evaluate the specified expression tree as type Ty
+/// instead of its larger type, and arrive with the same value.
+/// This is used by code that tries to eliminate truncates.
+///
+/// Ty will always be a type smaller than V. We should return true if trunc(V)
+/// can be computed by computing V in the smaller type. If V is an instruction,
+/// then trunc(inst(x,y)) can be computed as inst(trunc(x),trunc(y)), which only
+/// makes sense if x and y can be efficiently truncated.
+///
+/// This function works on both vectors and scalars.
+///
static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC,
- Instruction *CxtI) {
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- Type *OrigTy = V->getType();
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // These operators can all arbitrarily be extended or truncated.
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
-
- case Instruction::UDiv:
- case Instruction::URem: {
- // UDiv and URem can be truncated if all the truncated bits are zero.
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
- APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
- IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
- break;
- }
- case Instruction::Shl: {
- // If we are truncating the result of this SHL, and if it's a shift of an
-    // in-range amount, we can always perform a SHL in a smaller type.
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- if (AmtKnownBits.getMaxValue().ult(BitWidth))
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- break;
- }
- case Instruction::LShr: {
- // If this is a truncate of a logical shr, we can truncate it to a smaller
- // lshr iff we know that the bits we would otherwise be shifting in are
- // already zeros.
- // TODO: It is enough to check that the bits we would be shifting in are
- // zero - use AmtKnownBits.getMaxValue().
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
- if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
- IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- }
- break;
- }
- case Instruction::AShr: {
- // If this is a truncate of an arithmetic shr, we can truncate it to a
-    // smaller ashr iff we know that all the bits between the sign bit of the
-    // original type and the sign bit of the truncate type are the same.
-    // TODO: It is enough to check that the bits we would be shifting in are
-    // similar to the sign bit of the truncate type.
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- KnownBits AmtKnownBits =
- llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
- unsigned ShiftedBits = OrigBitWidth - BitWidth;
- if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
- ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
- canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
- break;
- }
- case Instruction::Trunc:
- // trunc(trunc(x)) -> trunc(x)
- return true;
- case Instruction::ZExt:
- case Instruction::SExt:
- // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
- // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
- return true;
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
- canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
- }
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))
- return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- break;
- }
-
- return false;
-}
-
-/// Given a vector that is bitcast to an integer, optionally logically
-/// right-shifted, and truncated, convert it to an extractelement.
-/// Example (big endian):
-/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
-/// --->
-/// extractelement <4 x i32> %X, 1
+ Instruction *CxtI) {
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ Type *OrigTy = V->getType();
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // These operators can all arbitrarily be extended or truncated.
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+
+ case Instruction::UDiv:
+ case Instruction::URem: {
+ // UDiv and URem can be truncated if all the truncated bits are zero.
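+    // E.g. (sketch): 'udiv i32 %x, %y' can be evaluated in i16 only if bits
+    // 16..31 of both operands are known zero; Mask below selects those bits.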
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ assert(BitWidth < OrigBitWidth && "Unexpected bitwidths!");
+ APInt Mask = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (IC.MaskedValueIsZero(I->getOperand(0), Mask, 0, CxtI) &&
+ IC.MaskedValueIsZero(I->getOperand(1), Mask, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ }
+ break;
+ }
+ case Instruction::Shl: {
+ // If we are truncating the result of this SHL, and if it's a shift of an
+    // in-range amount, we can always perform a SHL in a smaller type.
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ if (AmtKnownBits.getMaxValue().ult(BitWidth))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ break;
+ }
+ case Instruction::LShr: {
+ // If this is a truncate of a logical shr, we can truncate it to a smaller
+ // lshr iff we know that the bits we would otherwise be shifting in are
+ // already zeros.
+ // TODO: It is enough to check that the bits we would be shifting in are
+ // zero - use AmtKnownBits.getMaxValue().
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ }
+ break;
+ }
+ case Instruction::AShr: {
+ // If this is a truncate of an arithmetic shr, we can truncate it to a
+    // smaller ashr iff we know that all the bits between the sign bit of the
+    // original type and the sign bit of the truncate type are the same.
+    // TODO: It is enough to check that the bits we would be shifting in are
+    // similar to the sign bit of the truncate type.
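+    // E.g. (sketch): 'trunc (ashr i32 %x, %c) to i16' can be evaluated as an
+    // i16 ashr when %x has more than 16 sign bits and %c is known to be < 16.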
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ unsigned ShiftedBits = OrigBitWidth - BitWidth;
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
+ break;
+ }
+ case Instruction::Trunc:
+ // trunc(trunc(x)) -> trunc(x)
+ return true;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // trunc(ext(x)) -> ext(x) if the source type is smaller than the new dest
+ // trunc(ext(x)) -> trunc(x) if the source type is larger than the new dest
+ return true;
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ return canEvaluateTruncated(SI->getTrueValue(), Ty, IC, CxtI) &&
+ canEvaluateTruncated(SI->getFalseValue(), Ty, IC, CxtI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateTruncated(IncValue, Ty, IC, CxtI))
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
+/// Given a vector that is bitcast to an integer, optionally logically
+/// right-shifted, and truncated, convert it to an extractelement.
+/// Example (big endian):
+/// trunc (lshr (bitcast <4 x i32> %X to i128), 32) to i32
+/// --->
+/// extractelement <4 x i32> %X, 1
static Instruction *foldVecTruncToExtElt(TruncInst &Trunc,
InstCombinerImpl &IC) {
- Value *TruncOp = Trunc.getOperand(0);
- Type *DestType = Trunc.getType();
- if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
- return nullptr;
-
- Value *VecInput = nullptr;
- ConstantInt *ShiftVal = nullptr;
- if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
- m_LShr(m_BitCast(m_Value(VecInput)),
- m_ConstantInt(ShiftVal)))) ||
- !isa<VectorType>(VecInput->getType()))
- return nullptr;
-
- VectorType *VecType = cast<VectorType>(VecInput->getType());
- unsigned VecWidth = VecType->getPrimitiveSizeInBits();
- unsigned DestWidth = DestType->getPrimitiveSizeInBits();
- unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
-
- if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
- return nullptr;
-
- // If the element type of the vector doesn't match the result type,
- // bitcast it to a vector type that we can extract from.
- unsigned NumVecElts = VecWidth / DestWidth;
- if (VecType->getElementType() != DestType) {
- VecType = FixedVectorType::get(DestType, NumVecElts);
- VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
- }
-
- unsigned Elt = ShiftAmount / DestWidth;
- if (IC.getDataLayout().isBigEndian())
- Elt = NumVecElts - 1 - Elt;
-
- return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
-}
-
+ Value *TruncOp = Trunc.getOperand(0);
+ Type *DestType = Trunc.getType();
+ if (!TruncOp->hasOneUse() || !isa<IntegerType>(DestType))
+ return nullptr;
+
+ Value *VecInput = nullptr;
+ ConstantInt *ShiftVal = nullptr;
+ if (!match(TruncOp, m_CombineOr(m_BitCast(m_Value(VecInput)),
+ m_LShr(m_BitCast(m_Value(VecInput)),
+ m_ConstantInt(ShiftVal)))) ||
+ !isa<VectorType>(VecInput->getType()))
+ return nullptr;
+
+ VectorType *VecType = cast<VectorType>(VecInput->getType());
+ unsigned VecWidth = VecType->getPrimitiveSizeInBits();
+ unsigned DestWidth = DestType->getPrimitiveSizeInBits();
+ unsigned ShiftAmount = ShiftVal ? ShiftVal->getZExtValue() : 0;
+
+ if ((VecWidth % DestWidth != 0) || (ShiftAmount % DestWidth != 0))
+ return nullptr;
+
+ // If the element type of the vector doesn't match the result type,
+ // bitcast it to a vector type that we can extract from.
+ unsigned NumVecElts = VecWidth / DestWidth;
+ if (VecType->getElementType() != DestType) {
+ VecType = FixedVectorType::get(DestType, NumVecElts);
+ VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
+ }
+
+ unsigned Elt = ShiftAmount / DestWidth;
+ if (IC.getDataLayout().isBigEndian())
+ Elt = NumVecElts - 1 - Elt;
+
+ return ExtractElementInst::Create(VecInput, IC.Builder.getInt32(Elt));
+}
+
/// Funnel/Rotate left/right may occur in a wider type than necessary because of
/// type promotion rules. Try to narrow the inputs and convert to funnel shift.
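/// E.g. (illustrative): with an i8 value zero-extended to i32,
///   %or = or i32 (shl i32 %zx, %amt), (lshr i32 %zx, (sub i32 8, %amt))
///   %r  = trunc i32 %or to i8
/// is a rotate left and becomes a call to llvm.fshl.i8 on the narrowed values.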
Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
- assert((isa<VectorType>(Trunc.getSrcTy()) ||
- shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
- "Don't narrow to an illegal scalar type");
-
- // Bail out on strange types. It is possible to handle some of these patterns
- // even with non-power-of-2 sizes, but it is not a likely scenario.
- Type *DestTy = Trunc.getType();
- unsigned NarrowWidth = DestTy->getScalarSizeInBits();
- if (!isPowerOf2_32(NarrowWidth))
- return nullptr;
-
+ assert((isa<VectorType>(Trunc.getSrcTy()) ||
+ shouldChangeType(Trunc.getSrcTy(), Trunc.getType())) &&
+ "Don't narrow to an illegal scalar type");
+
+ // Bail out on strange types. It is possible to handle some of these patterns
+ // even with non-power-of-2 sizes, but it is not a likely scenario.
+ Type *DestTy = Trunc.getType();
+ unsigned NarrowWidth = DestTy->getScalarSizeInBits();
+ if (!isPowerOf2_32(NarrowWidth))
+ return nullptr;
+
// First, find an or'd pair of opposite shifts:
// trunc (or (lshr ShVal0, ShAmt0), (shl ShVal1, ShAmt1))
BinaryOperator *Or0, *Or1;
if (!match(Trunc.getOperand(0), m_OneUse(m_Or(m_BinOp(Or0), m_BinOp(Or1)))))
- return nullptr;
-
+ return nullptr;
+
Value *ShVal0, *ShVal1, *ShAmt0, *ShAmt1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(ShVal0), m_Value(ShAmt0)))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(ShVal1), m_Value(ShAmt1)))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(ShVal0, ShAmt0), lshr(ShVal1, ShAmt1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -550,303 +550,303 @@ Instruction *InstCombinerImpl::narrowFunnelShift(TruncInst &Trunc) {
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
+
// Match the shift amount operands for a funnel/rotate pattern. This always
// matches a subtraction on the R operand.
auto matchShiftAmount = [&](Value *L, Value *R, unsigned Width) -> Value * {
- // The shift amounts may add up to the narrow bit width:
+ // The shift amounts may add up to the narrow bit width:
// (shl ShVal0, L) | (lshr ShVal1, Width - L)
- if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
- return L;
-
+ if (match(R, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(L)))))
+ return L;
+
// The following patterns currently only work for rotation patterns.
// TODO: Add more general funnel-shift compatible patterns.
if (ShVal0 != ShVal1)
return nullptr;
- // The shift amount may be masked with negation:
+ // The shift amount may be masked with negation:
// (shl ShVal0, (X & (Width - 1))) | (lshr ShVal1, ((-X) & (Width - 1)))
- Value *X;
- unsigned Mask = Width - 1;
- if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
- match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
- return X;
-
- // Same as above, but the shift amount may be extended after masking:
- if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
- match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
- return X;
-
- return nullptr;
- };
-
- Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
+ Value *X;
+ unsigned Mask = Width - 1;
+ if (match(L, m_And(m_Value(X), m_SpecificInt(Mask))) &&
+ match(R, m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask))))
+ return X;
+
+ // Same as above, but the shift amount may be extended after masking:
+ if (match(L, m_ZExt(m_And(m_Value(X), m_SpecificInt(Mask)))) &&
+ match(R, m_ZExt(m_And(m_Neg(m_Specific(X)), m_SpecificInt(Mask)))))
+ return X;
+
+ return nullptr;
+ };
+
+ Value *ShAmt = matchShiftAmount(ShAmt0, ShAmt1, NarrowWidth);
bool IsFshl = true; // Sub on LSHR.
- if (!ShAmt) {
- ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
+ if (!ShAmt) {
+ ShAmt = matchShiftAmount(ShAmt1, ShAmt0, NarrowWidth);
IsFshl = false; // Sub on SHL.
- }
- if (!ShAmt)
- return nullptr;
-
- // The shifted value must have high zeros in the wide type. Typically, this
- // will be a zext, but it could also be the result of an 'and' or 'shift'.
- unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
- APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
+ }
+ if (!ShAmt)
+ return nullptr;
+
+ // The shifted value must have high zeros in the wide type. Typically, this
+ // will be a zext, but it could also be the result of an 'and' or 'shift'.
+ unsigned WideWidth = Trunc.getSrcTy()->getScalarSizeInBits();
+ APInt HiBitMask = APInt::getHighBitsSet(WideWidth, WideWidth - NarrowWidth);
if (!MaskedValueIsZero(ShVal0, HiBitMask, 0, &Trunc) ||
!MaskedValueIsZero(ShVal1, HiBitMask, 0, &Trunc))
- return nullptr;
-
- // We have an unnecessarily wide rotate!
+ return nullptr;
+
+ // We have an unnecessarily wide rotate!
// trunc (or (lshr ShVal0, ShAmt), (shl ShVal1, BitWidth - ShAmt))
- // Narrow the inputs and convert to funnel shift intrinsic:
- // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
- Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
+ // Narrow the inputs and convert to funnel shift intrinsic:
+ // llvm.fshl.i8(trunc(ShVal), trunc(ShVal), trunc(ShAmt))
+ Value *NarrowShAmt = Builder.CreateTrunc(ShAmt, DestTy);
Value *X, *Y;
X = Y = Builder.CreateTrunc(ShVal0, DestTy);
if (ShVal0 != ShVal1)
Y = Builder.CreateTrunc(ShVal1, DestTy);
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Trunc.getModule(), IID, DestTy);
return IntrinsicInst::Create(F, {X, Y, NarrowShAmt});
-}
-
-/// Try to narrow the width of math or bitwise logic instructions by pulling a
-/// truncate ahead of binary operators.
-/// TODO: Transforms for truncated shifts should be moved into here.
+}
+
+/// Try to narrow the width of math or bitwise logic instructions by pulling a
+/// truncate ahead of binary operators.
+/// TODO: Transforms for truncated shifts should be moved into here.
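+/// E.g. (illustrative): 'trunc (add i32 %x, 42) to i8' can be rewritten as
+/// 'add i8 (trunc %x), 42', and the original wide add then becomes dead.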
Instruction *InstCombinerImpl::narrowBinOp(TruncInst &Trunc) {
- Type *SrcTy = Trunc.getSrcTy();
- Type *DestTy = Trunc.getType();
- if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
- return nullptr;
-
- BinaryOperator *BinOp;
- if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(BinOp))))
- return nullptr;
-
- Value *BinOp0 = BinOp->getOperand(0);
- Value *BinOp1 = BinOp->getOperand(1);
- switch (BinOp->getOpcode()) {
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul: {
- Constant *C;
- if (match(BinOp0, m_Constant(C))) {
- // trunc (binop C, X) --> binop (trunc C', X)
- Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
- Value *TruncX = Builder.CreateTrunc(BinOp1, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), NarrowC, TruncX);
- }
- if (match(BinOp1, m_Constant(C))) {
- // trunc (binop X, C) --> binop (trunc X, C')
- Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
- Value *TruncX = Builder.CreateTrunc(BinOp0, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), TruncX, NarrowC);
- }
- Value *X;
- if (match(BinOp0, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
- // trunc (binop (ext X), Y) --> binop X, (trunc Y)
- Value *NarrowOp1 = Builder.CreateTrunc(BinOp1, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), X, NarrowOp1);
- }
- if (match(BinOp1, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
- // trunc (binop Y, (ext X)) --> binop (trunc Y), X
- Value *NarrowOp0 = Builder.CreateTrunc(BinOp0, DestTy);
- return BinaryOperator::Create(BinOp->getOpcode(), NarrowOp0, X);
- }
- break;
- }
-
- default: break;
- }
-
+ Type *SrcTy = Trunc.getSrcTy();
+ Type *DestTy = Trunc.getType();
+ if (!isa<VectorType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
+ return nullptr;
+
+ BinaryOperator *BinOp;
+ if (!match(Trunc.getOperand(0), m_OneUse(m_BinOp(BinOp))))
+ return nullptr;
+
+ Value *BinOp0 = BinOp->getOperand(0);
+ Value *BinOp1 = BinOp->getOperand(1);
+ switch (BinOp->getOpcode()) {
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul: {
+ Constant *C;
+ if (match(BinOp0, m_Constant(C))) {
+ // trunc (binop C, X) --> binop (trunc C', X)
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *TruncX = Builder.CreateTrunc(BinOp1, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), NarrowC, TruncX);
+ }
+ if (match(BinOp1, m_Constant(C))) {
+ // trunc (binop X, C) --> binop (trunc X, C')
+ Constant *NarrowC = ConstantExpr::getTrunc(C, DestTy);
+ Value *TruncX = Builder.CreateTrunc(BinOp0, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), TruncX, NarrowC);
+ }
+ Value *X;
+ if (match(BinOp0, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
+ // trunc (binop (ext X), Y) --> binop X, (trunc Y)
+ Value *NarrowOp1 = Builder.CreateTrunc(BinOp1, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), X, NarrowOp1);
+ }
+ if (match(BinOp1, m_ZExtOrSExt(m_Value(X))) && X->getType() == DestTy) {
+ // trunc (binop Y, (ext X)) --> binop (trunc Y), X
+ Value *NarrowOp0 = Builder.CreateTrunc(BinOp0, DestTy);
+ return BinaryOperator::Create(BinOp->getOpcode(), NarrowOp0, X);
+ }
+ break;
+ }
+
+ default: break;
+ }
+
if (Instruction *NarrowOr = narrowFunnelShift(Trunc))
- return NarrowOr;
-
- return nullptr;
-}
-
-/// Try to narrow the width of a splat shuffle. This could be generalized to any
-/// shuffle with a constant operand, but we limit the transform to avoid
-/// creating a shuffle type that targets may not be able to lower effectively.
-static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
- InstCombiner::BuilderTy &Builder) {
- auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
- if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
- is_splat(Shuf->getShuffleMask()) &&
- Shuf->getType() == Shuf->getOperand(0)->getType()) {
- // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
- Constant *NarrowUndef = UndefValue::get(Trunc.getType());
- Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
- return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
- }
-
- return nullptr;
-}
-
-/// Try to narrow the width of an insert element. This could be generalized for
-/// any vector constant, but we limit the transform to insertion into undef to
-/// avoid potential backend problems from unsupported insertion widths. This
-/// could also be extended to handle the case of inserting a scalar constant
-/// into a vector variable.
-static Instruction *shrinkInsertElt(CastInst &Trunc,
- InstCombiner::BuilderTy &Builder) {
- Instruction::CastOps Opcode = Trunc.getOpcode();
- assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
- "Unexpected instruction for shrinking");
-
- auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
- if (!InsElt || !InsElt->hasOneUse())
- return nullptr;
-
- Type *DestTy = Trunc.getType();
- Type *DestScalarTy = DestTy->getScalarType();
- Value *VecOp = InsElt->getOperand(0);
- Value *ScalarOp = InsElt->getOperand(1);
- Value *Index = InsElt->getOperand(2);
-
- if (isa<UndefValue>(VecOp)) {
- // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
- // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
- UndefValue *NarrowUndef = UndefValue::get(DestTy);
- Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
- return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
- }
-
- return nullptr;
-}
-
+ return NarrowOr;
+
+ return nullptr;
+}
+
+/// Try to narrow the width of a splat shuffle. This could be generalized to any
+/// shuffle with a constant operand, but we limit the transform to avoid
+/// creating a shuffle type that targets may not be able to lower effectively.
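+/// E.g. (illustrative): truncating a splat of <4 x i32> %x to <4 x i8> can
+/// instead splat 'trunc <4 x i32> %x to <4 x i8>'.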
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+ if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
+ is_splat(Shuf->getShuffleMask()) &&
+ Shuf->getType() == Shuf->getOperand(0)->getType()) {
+ // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+ Constant *NarrowUndef = UndefValue::get(Trunc.getType());
+ Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
+ }
+
+ return nullptr;
+}
+
+/// Try to narrow the width of an insert element. This could be generalized for
+/// any vector constant, but we limit the transform to insertion into undef to
+/// avoid potential backend problems from unsupported insertion widths. This
+/// could also be extended to handle the case of inserting a scalar constant
+/// into a vector variable.
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::CastOps Opcode = Trunc.getOpcode();
+ assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+ "Unexpected instruction for shrinking");
+
+ auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+ if (!InsElt || !InsElt->hasOneUse())
+ return nullptr;
+
+ Type *DestTy = Trunc.getType();
+ Type *DestScalarTy = DestTy->getScalarType();
+ Value *VecOp = InsElt->getOperand(0);
+ Value *ScalarOp = InsElt->getOperand(1);
+ Value *Index = InsElt->getOperand(2);
+
+ if (isa<UndefValue>(VecOp)) {
+ // trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+ // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+ UndefValue *NarrowUndef = UndefValue::get(DestTy);
+ Value *NarrowOp = Builder.CreateCast(Opcode, ScalarOp, DestScalarTy);
+ return InsertElementInst::Create(NarrowUndef, NarrowOp, Index);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
- if (Instruction *Result = commonCastTransforms(Trunc))
- return Result;
-
- Value *Src = Trunc.getOperand(0);
- Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
- unsigned DestWidth = DestTy->getScalarSizeInBits();
- unsigned SrcWidth = SrcTy->getScalarSizeInBits();
-
- // Attempt to truncate the entire input expression tree to the destination
- // type. Only do this if the dest type is a simple type, don't convert the
- // expression tree to something weird like i93 unless the source is also
- // strange.
- if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
- canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
-
-    // If this cast is a truncate, evaluating in a different type always
- // eliminates the cast, so it is always a win.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid cast: "
- << Trunc << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, false);
- assert(Res->getType() == DestTy);
- return replaceInstUsesWith(Trunc, Res);
- }
-
- // For integer types, check if we can shorten the entire input expression to
- // DestWidth * 2, which won't allow removing the truncate, but reducing the
- // width may enable further optimizations, e.g. allowing for larger
- // vectorization factors.
- if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
- if (DestWidth * 2 < SrcWidth) {
- auto *NewDestTy = DestITy->getExtendedType();
- if (shouldChangeType(SrcTy, NewDestTy) &&
- canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to reduce the width of operand of"
- << Trunc << '\n');
- Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
- return new TruncInst(Res, DestTy);
- }
- }
- }
-
- // Test if the trunc is the user of a select which is part of a
- // minimum or maximum operation. If so, don't do any more simplification.
- // Even simplifying demanded bits can break the canonical form of a
- // min/max.
- Value *LHS, *RHS;
- if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
- if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
- return nullptr;
-
- // See if we can simplify any instructions used by the input whose sole
- // purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(Trunc))
- return &Trunc;
-
- if (DestWidth == 1) {
- Value *Zero = Constant::getNullValue(SrcTy);
- if (DestTy->isIntegerTy()) {
- // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
- // TODO: We canonicalize to more instructions here because we are probably
- // lacking equivalent analysis for trunc relative to icmp. There may also
- // be codegen concerns. If those trunc limitations were removed, we could
- // remove this transform.
- Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
-
- // For vectors, we do not canonicalize all truncs to icmp, so optimize
- // patterns that would be covered within visitICmpInst.
- Value *X;
- Constant *C;
- if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
- // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
- Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
- Constant *MaskC = ConstantExpr::getShl(One, C);
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
- if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
- m_Deferred(X))))) {
- // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
- Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
- Constant *MaskC = ConstantExpr::getShl(One, C);
- MaskC = ConstantExpr::getOr(MaskC, One);
- Value *And = Builder.CreateAnd(X, MaskC);
- return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
- }
- }
-
+ if (Instruction *Result = commonCastTransforms(Trunc))
+ return Result;
+
+ Value *Src = Trunc.getOperand(0);
+ Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
+ unsigned DestWidth = DestTy->getScalarSizeInBits();
+ unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+
+ // Attempt to truncate the entire input expression tree to the destination
+ // type. Only do this if the dest type is a simple type, don't convert the
+ // expression tree to something weird like i93 unless the source is also
+ // strange.
+ if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
+ canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
+
+    // If this cast is a truncate, evaluating in a different type always
+ // eliminates the cast, so it is always a win.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid cast: "
+ << Trunc << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+ return replaceInstUsesWith(Trunc, Res);
+ }
+
+ // For integer types, check if we can shorten the entire input expression to
+ // DestWidth * 2, which won't allow removing the truncate, but reducing the
+ // width may enable further optimizations, e.g. allowing for larger
+ // vectorization factors.
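+  // E.g. (sketch): for 'trunc i64 %v to i8' we try to evaluate the feeding
+  // expression in i16; the trunc remains, but the arithmetic gets narrower.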
+ if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
+ if (DestWidth * 2 < SrcWidth) {
+ auto *NewDestTy = DestITy->getExtendedType();
+ if (shouldChangeType(SrcTy, NewDestTy) &&
+ canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to reduce the width of operand of"
+ << Trunc << '\n');
+ Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
+ return new TruncInst(Res, DestTy);
+ }
+ }
+ }
+
+ // Test if the trunc is the user of a select which is part of a
+ // minimum or maximum operation. If so, don't do any more simplification.
+ // Even simplifying demanded bits can break the canonical form of a
+ // min/max.
+ Value *LHS, *RHS;
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
+ if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
+ return nullptr;
+
+ // See if we can simplify any instructions used by the input whose sole
+ // purpose is to compute bits we don't care about.
+ if (SimplifyDemandedInstructionBits(Trunc))
+ return &Trunc;
+
+ if (DestWidth == 1) {
+ Value *Zero = Constant::getNullValue(SrcTy);
+ if (DestTy->isIntegerTy()) {
+ // Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
+ // TODO: We canonicalize to more instructions here because we are probably
+ // lacking equivalent analysis for trunc relative to icmp. There may also
+ // be codegen concerns. If those trunc limitations were removed, we could
+ // remove this transform.
+ Value *And = Builder.CreateAnd(Src, ConstantInt::get(SrcTy, 1));
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+
+ // For vectors, we do not canonicalize all truncs to icmp, so optimize
+ // patterns that would be covered within visitICmpInst.
+ Value *X;
+ Constant *C;
+ if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
+ // trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+ if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
+ m_Deferred(X))))) {
+ // trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ MaskC = ConstantExpr::getOr(MaskC, One);
+ Value *And = Builder.CreateAnd(X, MaskC);
+ return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
+ }
+ }
+
Value *A;
Constant *C;
if (match(Src, m_LShr(m_SExt(m_Value(A)), m_Constant(C)))) {
- unsigned AWidth = A->getType()->getScalarSizeInBits();
- unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
+ unsigned AWidth = A->getType()->getScalarSizeInBits();
+ unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
auto *OldSh = cast<Instruction>(Src);
bool IsExact = OldSh->isExact();
-
- // If the shift is small enough, all zero bits created by the shift are
- // removed by the trunc.
+
+ // If the shift is small enough, all zero bits created by the shift are
+ // removed by the trunc.
if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULE,
APInt(SrcWidth, MaxShiftAmt)))) {
- // trunc (lshr (sext A), C) --> ashr A, C
- if (A->getType() == DestTy) {
+ // trunc (lshr (sext A), C) --> ashr A, C
+ if (A->getType() == DestTy) {
Constant *MaxAmt = ConstantInt::get(SrcTy, DestWidth - 1, false);
Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
ShAmt = Constant::mergeUndefsWith(ShAmt, C);
return IsExact ? BinaryOperator::CreateExactAShr(A, ShAmt)
: BinaryOperator::CreateAShr(A, ShAmt);
- }
- // The types are mismatched, so create a cast after shifting:
- // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
- if (Src->hasOneUse()) {
+ }
+ // The types are mismatched, so create a cast after shifting:
+ // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
+ if (Src->hasOneUse()) {
Constant *MaxAmt = ConstantInt::get(SrcTy, AWidth - 1, false);
Constant *ShAmt = ConstantExpr::getUMin(C, MaxAmt);
ShAmt = ConstantExpr::getTrunc(ShAmt, A->getType());
Value *Shift = Builder.CreateAShr(A, ShAmt, "", IsExact);
- return CastInst::CreateIntegerCast(Shift, DestTy, true);
- }
- }
- // TODO: Mask high bits with 'and'.
- }
-
+ return CastInst::CreateIntegerCast(Shift, DestTy, true);
+ }
+ }
+ // TODO: Mask high bits with 'and'.
+ }
+
// trunc (*shr (trunc A), C) --> trunc(*shr A, C)
if (match(Src, m_OneUse(m_Shr(m_Trunc(m_Value(A)), m_Constant(C))))) {
unsigned MaxShiftAmt = SrcWidth - DestWidth;
@@ -867,661 +867,661 @@ Instruction *InstCombinerImpl::visitTrunc(TruncInst &Trunc) {
}
}
- if (Instruction *I = narrowBinOp(Trunc))
- return I;
-
- if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
- return I;
-
- if (Instruction *I = shrinkInsertElt(Trunc, Builder))
- return I;
-
+ if (Instruction *I = narrowBinOp(Trunc))
+ return I;
+
+ if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
+ return I;
+
+ if (Instruction *I = shrinkInsertElt(Trunc, Builder))
+ return I;
+
if (Src->hasOneUse() &&
(isa<VectorType>(SrcTy) || shouldChangeType(SrcTy, DestTy))) {
- // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
- // dest type is native and cst < dest size.
+ // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
+ // dest type is native and cst < dest size.
if (match(Src, m_Shl(m_Value(A), m_Constant(C))) &&
- !match(A, m_Shr(m_Value(), m_Constant()))) {
-      // Skip a shift of a shift by a constant: narrowing would undo a combine
-      // in FoldShiftByConstant, and shl-of-shr is the extend-in-register
-      // pattern.
+ !match(A, m_Shr(m_Value(), m_Constant()))) {
+      // Skip a shift of a shift by a constant: narrowing would undo a combine
+      // in FoldShiftByConstant, and shl-of-shr is the extend-in-register
+      // pattern.
APInt Threshold = APInt(C->getType()->getScalarSizeInBits(), DestWidth);
if (match(C, m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold))) {
- Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
+ Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
return BinaryOperator::Create(Instruction::Shl, NewTrunc,
ConstantExpr::getTrunc(C, DestTy));
- }
- }
- }
-
- if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
- return I;
-
- // Whenever an element is extracted from a vector, and then truncated,
- // canonicalize by converting it to a bitcast followed by an
- // extractelement.
- //
- // Example (little endian):
- // trunc (extractelement <4 x i64> %X, 0) to i32
- // --->
- // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
- Value *VecOp;
+ }
+ }
+ }
+
+ if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
+ return I;
+
+ // Whenever an element is extracted from a vector, and then truncated,
+ // canonicalize by converting it to a bitcast followed by an
+ // extractelement.
+ //
+ // Example (little endian):
+ // trunc (extractelement <4 x i64> %X, 0) to i32
+ // --->
+ // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
+ Value *VecOp;
ConstantInt *Cst;
- if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
- auto *VecOpTy = cast<VectorType>(VecOp->getType());
+ if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
+ auto *VecOpTy = cast<VectorType>(VecOp->getType());
auto VecElts = VecOpTy->getElementCount();
-
-    // A destination size that does not evenly divide the source width would
-    // result in an invalid cast.
- if (SrcWidth % DestWidth == 0) {
- uint64_t TruncRatio = SrcWidth / DestWidth;
+
+    // A destination size that does not evenly divide the source width would
+    // result in an invalid cast.
+ if (SrcWidth % DestWidth == 0) {
+ uint64_t TruncRatio = SrcWidth / DestWidth;
uint64_t BitCastNumElts = VecElts.getKnownMinValue() * TruncRatio;
- uint64_t VecOpIdx = Cst->getZExtValue();
- uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
- : VecOpIdx * TruncRatio;
- assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
- "overflow 32-bits");
-
+ uint64_t VecOpIdx = Cst->getZExtValue();
+ uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
+ : VecOpIdx * TruncRatio;
+ assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
+ "overflow 32-bits");
+
auto *BitCastTo =
VectorType::get(DestTy, BitCastNumElts, VecElts.isScalable());
- Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
- return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
- }
- }
-
- return nullptr;
-}
-
+ Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
+ return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::transformZExtICmp(ICmpInst *Cmp, ZExtInst &Zext,
bool DoTransform) {
-  // If we are just checking for an icmp eq of a single bit and zext'ing it
- // to an integer, then shift the bit to the appropriate place and then
- // cast to integer to avoid the comparison.
- const APInt *Op1CV;
- if (match(Cmp->getOperand(1), m_APInt(Op1CV))) {
-
- // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
- // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
- if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) ||
- (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) {
- if (!DoTransform) return Cmp;
-
- Value *In = Cmp->getOperand(0);
- Value *Sh = ConstantInt::get(In->getType(),
- In->getType()->getScalarSizeInBits() - 1);
- In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
- if (In->getType() != Zext.getType())
- In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/);
-
- if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) {
- Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder.CreateXor(In, One, In->getName() + ".not");
- }
-
- return replaceInstUsesWith(Zext, In);
- }
-
- // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
- // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- // zext (X == 1) to i32 --> X iff X has only the low bit set.
- // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
- // zext (X != 0) to i32 --> X iff X has only the low bit set.
- // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
- // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
- // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
- if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) &&
- // This only works for EQ and NE
- Cmp->isEquality()) {
-      // If Op1C is some other power of two, convert:
- KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext);
-
- APInt KnownZeroMask(~Known.Zero);
- if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
- if (!DoTransform) return Cmp;
-
- bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE;
- if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) {
- // (X&4) == 2 --> false
- // (X&4) != 2 --> true
- Constant *Res = ConstantInt::get(Zext.getType(), isNE);
- return replaceInstUsesWith(Zext, Res);
- }
-
- uint32_t ShAmt = KnownZeroMask.logBase2();
- Value *In = Cmp->getOperand(0);
- if (ShAmt) {
- // Perform a logical shr by shiftamt.
- // Insert the shift to put the result in the low bit.
- In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
- In->getName() + ".lobit");
- }
-
- if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit.
- Constant *One = ConstantInt::get(In->getType(), 1);
- In = Builder.CreateXor(In, One);
- }
-
- if (Zext.getType() == In->getType())
- return replaceInstUsesWith(Zext, In);
-
- Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false);
- return replaceInstUsesWith(Zext, IntCast);
- }
- }
- }
-
-  // icmp ne A, B is equivalent to xor A, B when A and B have only a single
-  // bit that can differ.
- // It is also profitable to transform icmp eq into not(xor(A, B)) because that
- // may lead to additional simplifications.
- if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) {
- if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
-
- KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext);
- KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext);
-
- if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
- APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
- APInt UnknownBit = ~KnownBits;
- if (UnknownBit.countPopulation() == 1) {
- if (!DoTransform) return Cmp;
-
- Value *Result = Builder.CreateXor(LHS, RHS);
-
- // Mask off any bits that are set and won't be shifted away.
- if (KnownLHS.One.uge(UnknownBit))
- Result = Builder.CreateAnd(Result,
- ConstantInt::get(ITy, UnknownBit));
-
- // Shift the bit we're testing down to the lsb.
- Result = Builder.CreateLShr(
- Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
-
- if (Cmp->getPredicate() == ICmpInst::ICMP_EQ)
- Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
- Result->takeName(Cmp);
- return replaceInstUsesWith(Zext, Result);
- }
- }
- }
- }
-
- return nullptr;
-}
-
-/// Determine if the specified value can be computed in the specified wider type
-/// and produce the same low bits. If not, return false.
-///
-/// If this function returns true, it can also return a non-zero number of bits
-/// (in BitsToClear) which indicates that the value it computes is correct for
-/// the zero extend, but that the additional BitsToClear bits need to be zeroed
-/// out. For example, to promote something like:
-///
-/// %B = trunc i64 %A to i32
-/// %C = lshr i32 %B, 8
-/// %E = zext i32 %C to i64
-///
-/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
-/// set to 8 to indicate that the promoted value needs to have bits 24-31
-/// cleared in addition to bits 32-63. Since an 'and' will be generated to
-/// clear the top bits anyway, doing this has no extra cost.
-///
-/// This function works on both vectors and scalars.
-static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
+  // If we are just checking for an icmp eq of a single bit and zext'ing it
+ // to an integer, then shift the bit to the appropriate place and then
+ // cast to integer to avoid the comparison.
+ const APInt *Op1CV;
+ if (match(Cmp->getOperand(1), m_APInt(Op1CV))) {
+
+ // zext (x <s 0) to i32 --> x>>u31 true if signbit set.
+ // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear.
+ if ((Cmp->getPredicate() == ICmpInst::ICMP_SLT && Op1CV->isNullValue()) ||
+ (Cmp->getPredicate() == ICmpInst::ICMP_SGT && Op1CV->isAllOnesValue())) {
+ if (!DoTransform) return Cmp;
+
+ Value *In = Cmp->getOperand(0);
+ Value *Sh = ConstantInt::get(In->getType(),
+ In->getType()->getScalarSizeInBits() - 1);
+ In = Builder.CreateLShr(In, Sh, In->getName() + ".lobit");
+ if (In->getType() != Zext.getType())
+ In = Builder.CreateIntCast(In, Zext.getType(), false /*ZExt*/);
+
+ if (Cmp->getPredicate() == ICmpInst::ICMP_SGT) {
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One, In->getName() + ".not");
+ }
+
+ return replaceInstUsesWith(Zext, In);
+ }
+
+ // zext (X == 0) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ // zext (X == 1) to i32 --> X iff X has only the low bit set.
+ // zext (X == 2) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 0) to i32 --> X iff X has only the low bit set.
+ // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set.
+ // zext (X != 1) to i32 --> X^1 iff X has only the low bit set.
+ // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
+ if ((Op1CV->isNullValue() || Op1CV->isPowerOf2()) &&
+ // This only works for EQ and NE
+ Cmp->isEquality()) {
+      // If Op1C is some other power of two, convert:
+ KnownBits Known = computeKnownBits(Cmp->getOperand(0), 0, &Zext);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
+ if (!DoTransform) return Cmp;
+
+ bool isNE = Cmp->getPredicate() == ICmpInst::ICMP_NE;
+ if (!Op1CV->isNullValue() && (*Op1CV != KnownZeroMask)) {
+ // (X&4) == 2 --> false
+ // (X&4) != 2 --> true
+ Constant *Res = ConstantInt::get(Zext.getType(), isNE);
+ return replaceInstUsesWith(Zext, Res);
+ }
+
+ uint32_t ShAmt = KnownZeroMask.logBase2();
+ Value *In = Cmp->getOperand(0);
+ if (ShAmt) {
+ // Perform a logical shr by shiftamt.
+ // Insert the shift to put the result in the low bit.
+ In = Builder.CreateLShr(In, ConstantInt::get(In->getType(), ShAmt),
+ In->getName() + ".lobit");
+ }
+
+ if (!Op1CV->isNullValue() == isNE) { // Toggle the low bit.
+ Constant *One = ConstantInt::get(In->getType(), 1);
+ In = Builder.CreateXor(In, One);
+ }
+
+ if (Zext.getType() == In->getType())
+ return replaceInstUsesWith(Zext, In);
+
+ Value *IntCast = Builder.CreateIntCast(In, Zext.getType(), false);
+ return replaceInstUsesWith(Zext, IntCast);
+ }
+ }
+ }
+
+  // icmp ne A, B is equivalent to xor A, B when A and B have only a single
+  // bit that can differ.
+ // It is also profitable to transform icmp eq into not(xor(A, B)) because that
+ // may lead to additional simplifications.
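+  // E.g. (illustrative): if bit 3 is the only bit where A and B can disagree,
+  // 'zext (icmp ne A, B)' is just that bit of 'xor A, B' shifted down to bit 0.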
+ if (Cmp->isEquality() && Zext.getType() == Cmp->getOperand(0)->getType()) {
+ if (IntegerType *ITy = dyn_cast<IntegerType>(Zext.getType())) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+
+ KnownBits KnownLHS = computeKnownBits(LHS, 0, &Zext);
+ KnownBits KnownRHS = computeKnownBits(RHS, 0, &Zext);
+
+ if (KnownLHS.Zero == KnownRHS.Zero && KnownLHS.One == KnownRHS.One) {
+ APInt KnownBits = KnownLHS.Zero | KnownLHS.One;
+ APInt UnknownBit = ~KnownBits;
+ if (UnknownBit.countPopulation() == 1) {
+ if (!DoTransform) return Cmp;
+
+ Value *Result = Builder.CreateXor(LHS, RHS);
+
+ // Mask off any bits that are set and won't be shifted away.
+ if (KnownLHS.One.uge(UnknownBit))
+ Result = Builder.CreateAnd(Result,
+ ConstantInt::get(ITy, UnknownBit));
+
+ // Shift the bit we're testing down to the lsb.
+ Result = Builder.CreateLShr(
+ Result, ConstantInt::get(ITy, UnknownBit.countTrailingZeros()));
+
+ if (Cmp->getPredicate() == ICmpInst::ICMP_EQ)
+ Result = Builder.CreateXor(Result, ConstantInt::get(ITy, 1));
+ Result->takeName(Cmp);
+ return replaceInstUsesWith(Zext, Result);
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
+
+/// Determine if the specified value can be computed in the specified wider type
+/// and produce the same low bits. If not, return false.
+///
+/// If this function returns true, it can also return a non-zero number of bits
+/// (in BitsToClear) which indicates that the value it computes is correct for
+/// the zero extend, but that the additional BitsToClear bits need to be zeroed
+/// out. For example, to promote something like:
+///
+/// %B = trunc i64 %A to i32
+/// %C = lshr i32 %B, 8
+/// %E = zext i32 %C to i64
+///
+/// CanEvaluateZExtd for the 'lshr' will return true, and BitsToClear will be
+/// set to 8 to indicate that the promoted value needs to have bits 24-31
+/// cleared in addition to bits 32-63. Since an 'and' will be generated to
+/// clear the top bits anyway, doing this has no extra cost.
+///
+/// This function works on both vectors and scalars.
+static bool canEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear,
InstCombinerImpl &IC, Instruction *CxtI) {
- BitsToClear = 0;
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- unsigned Tmp;
- switch (I->getOpcode()) {
- case Instruction::ZExt: // zext(zext(x)) -> zext(x).
- case Instruction::SExt: // zext(sext(x)) -> sext(x).
- case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
- return true;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
- !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
- return false;
- // These can all be promoted if neither operand has 'bits to clear'.
- if (BitsToClear == 0 && Tmp == 0)
- return true;
-
- // If the operation is an AND/OR/XOR and the bits to clear are zero in the
- // other side, BitsToClear is ok.
- if (Tmp == 0 && I->isBitwiseLogicOp()) {
- // We use MaskedValueIsZero here for generality, but the case we care
- // about the most is constant RHS.
- unsigned VSize = V->getType()->getScalarSizeInBits();
- if (IC.MaskedValueIsZero(I->getOperand(1),
- APInt::getHighBitsSet(VSize, BitsToClear),
- 0, CxtI)) {
- // If this is an And instruction and all of the BitsToClear are
- // known to be zero we can reset BitsToClear.
- if (I->getOpcode() == Instruction::And)
- BitsToClear = 0;
- return true;
- }
- }
-
- // Otherwise, we don't know how to analyze this BitsToClear case yet.
- return false;
-
- case Instruction::Shl: {
- // We can promote shl(x, cst) if we can promote x. Since shl overwrites the
- // upper bits we can reduce BitsToClear by the shift amount.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
- return false;
- uint64_t ShiftAmt = Amt->getZExtValue();
- BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
- return true;
- }
- return false;
- }
- case Instruction::LShr: {
-    // We can promote lshr(x, cst) if we can promote x. This requires the
-    // ultimate 'and' to also clear out the high bits that the wide shift
-    // brings in, though.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
- return false;
- BitsToClear += Amt->getZExtValue();
- if (BitsToClear > V->getType()->getScalarSizeInBits())
- BitsToClear = V->getType()->getScalarSizeInBits();
- return true;
- }
- // Cannot promote variable LSHR.
- return false;
- }
- case Instruction::Select:
- if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
- !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
- // TODO: If important, we could handle the case when the BitsToClear are
- // known zero in the disagreeing side.
- Tmp != BitsToClear)
- return false;
- return true;
-
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
- return false;
- for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
- // TODO: If important, we could handle the case when the BitsToClear
- // are known zero in the disagreeing input.
- Tmp != BitsToClear)
- return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- return false;
- }
-}
-
+ BitsToClear = 0;
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ unsigned Tmp;
+ switch (I->getOpcode()) {
+ case Instruction::ZExt: // zext(zext(x)) -> zext(x).
+ case Instruction::SExt: // zext(sext(x)) -> sext(x).
+ case Instruction::Trunc: // zext(trunc(x)) -> trunc(x) or zext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI))
+ return false;
+ // These can all be promoted if neither operand has 'bits to clear'.
+ if (BitsToClear == 0 && Tmp == 0)
+ return true;
+
+ // If the operation is an AND/OR/XOR and the bits to clear are zero in the
+ // other side, BitsToClear is ok.
+ if (Tmp == 0 && I->isBitwiseLogicOp()) {
+ // We use MaskedValueIsZero here for generality, but the case we care
+ // about the most is constant RHS.
+ unsigned VSize = V->getType()->getScalarSizeInBits();
+ if (IC.MaskedValueIsZero(I->getOperand(1),
+ APInt::getHighBitsSet(VSize, BitsToClear),
+ 0, CxtI)) {
+ // If this is an And instruction and all of the BitsToClear are
+ // known to be zero we can reset BitsToClear.
+ if (I->getOpcode() == Instruction::And)
+ BitsToClear = 0;
+ return true;
+ }
+ }
+
+ // Otherwise, we don't know how to analyze this BitsToClear case yet.
+ return false;
+
+ case Instruction::Shl: {
+ // We can promote shl(x, cst) if we can promote x. Since shl overwrites the
+ // upper bits we can reduce BitsToClear by the shift amount.
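+    // E.g. (sketch): if x needs its top 8 bits cleared and we shift left by 3,
+    // only 5 of those bits remain to clear; a shift by 8 or more clears all.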
+ const APInt *Amt;
+ if (match(I->getOperand(1), m_APInt(Amt))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ uint64_t ShiftAmt = Amt->getZExtValue();
+ BitsToClear = ShiftAmt < BitsToClear ? BitsToClear - ShiftAmt : 0;
+ return true;
+ }
+ return false;
+ }
+ case Instruction::LShr: {
+    // We can promote lshr(x, cst) if we can promote x. This requires the
+    // ultimate 'and' to also clear out the high bits that the wide shift
+    // brings in, though.
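+    // E.g. (sketch): for 'lshr i32 %x, 8', bits 24..31 of the narrow result
+    // are zero, but after promotion they may not be; BitsToClear grows by 8.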
+ const APInt *Amt;
+ if (match(I->getOperand(1), m_APInt(Amt))) {
+ if (!canEvaluateZExtd(I->getOperand(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ BitsToClear += Amt->getZExtValue();
+ if (BitsToClear > V->getType()->getScalarSizeInBits())
+ BitsToClear = V->getType()->getScalarSizeInBits();
+ return true;
+ }
+ // Cannot promote variable LSHR.
+ return false;
+ }
+ case Instruction::Select:
+ if (!canEvaluateZExtd(I->getOperand(1), Ty, Tmp, IC, CxtI) ||
+ !canEvaluateZExtd(I->getOperand(2), Ty, BitsToClear, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear are
+ // known zero in the disagreeing side.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ if (!canEvaluateZExtd(PN->getIncomingValue(0), Ty, BitsToClear, IC, CxtI))
+ return false;
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!canEvaluateZExtd(PN->getIncomingValue(i), Ty, Tmp, IC, CxtI) ||
+ // TODO: If important, we could handle the case when the BitsToClear
+ // are known zero in the disagreeing input.
+ Tmp != BitsToClear)
+ return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ return false;
+ }
+}
+
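For illustration only, a minimal standalone C++ sketch of the lshr case above (hypothetical function names, not from this file): the shift amount is added to BitsToClear, and visitZExt below removes those bits with a single mask once the whole tree is evaluated in the wide type.

    #include <cstdint>

    // The pattern the analysis walks: trunc, lshr in the narrow type, zext.
    uint32_t narrow(uint32_t x) {
      uint16_t t = (uint16_t)x;          // trunc i32 -> i16
      uint16_t s = (uint16_t)(t >> 8);   // lshr i16, 8: BitsToClear becomes 8
      return s;                          // zext i16 -> i32
    }

    // After promotion the whole tree runs in i32; the single AND emitted by
    // visitZExt keeps the low 16 - 8 = 8 source bits and clears the rest.
    uint32_t wide(uint32_t x) {
      return (x >> 8) & 0xffu;           // same value as narrow(x) for every x
    }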
Instruction *InstCombinerImpl::visitZExt(ZExtInst &CI) {
- // If this zero extend is only used by a truncate, let the truncate be
- // eliminated before we try to optimize this zext.
- if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
- return nullptr;
-
-  // If one of the common conversions will work, do it.
- if (Instruction *Result = commonCastTransforms(CI))
- return Result;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(), *DestTy = CI.getType();
-
- // Try to extend the entire expression tree to the wide destination type.
- unsigned BitsToClear;
- if (shouldChangeType(SrcTy, DestTy) &&
- canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
- assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
- "Can't clear more bits than in SrcTy");
-
- // Okay, we can transform this! Insert the new expression now.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid zero extend: "
- << CI << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, false);
- assert(Res->getType() == DestTy);
-
- // Preserve debug values referring to Src if the zext is its last use.
- if (auto *SrcOp = dyn_cast<Instruction>(Src))
- if (SrcOp->hasOneUse())
- replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
-
- uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // If the high bits are already filled with zeros, just replace this
- // cast with the result.
- if (MaskedValueIsZero(Res,
- APInt::getHighBitsSet(DestBitSize,
- DestBitSize-SrcBitsKept),
- 0, &CI))
- return replaceInstUsesWith(CI, Res);
-
- // We need to emit an AND to clear the high bits.
- Constant *C = ConstantInt::get(Res->getType(),
- APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
- return BinaryOperator::CreateAnd(Res, C);
- }
-
- // If this is a TRUNC followed by a ZEXT then we are dealing with integral
- // types and if the sizes are just right we can convert this into a logical
- // 'and' which will be much cheaper than the pair of casts.
- if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
- // TODO: Subsume this into EvaluateInDifferentType.
-
- // Get the sizes of the types involved. We know that the intermediate type
- // will be smaller than A or C, but don't know the relation between A and C.
- Value *A = CSrc->getOperand(0);
- unsigned SrcSize = A->getType()->getScalarSizeInBits();
- unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
- unsigned DstSize = CI.getType()->getScalarSizeInBits();
- // If we're actually extending zero bits, then if
- // SrcSize < DstSize: zext(a & mask)
- // SrcSize == DstSize: a & mask
- // SrcSize > DstSize: trunc(a) & mask
- if (SrcSize < DstSize) {
- APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
- Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
- Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
- return new ZExtInst(And, CI.getType());
- }
-
- if (SrcSize == DstSize) {
- APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
- return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
- AndValue));
- }
- if (SrcSize > DstSize) {
- Value *Trunc = Builder.CreateTrunc(A, CI.getType());
- APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
- return BinaryOperator::CreateAnd(Trunc,
- ConstantInt::get(Trunc->getType(),
- AndValue));
- }
- }
-
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src))
- return transformZExtICmp(Cmp, CI);
-
- BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
- if (SrcI && SrcI->getOpcode() == Instruction::Or) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
- // of the (zext icmp) can be eliminated. If so, immediately perform the
- // according elimination.
- ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
- ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
- if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
+ // If this zero extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this zext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+  // If one of the common conversions will work, do it.
+ if (Instruction *Result = commonCastTransforms(CI))
+ return Result;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // Try to extend the entire expression tree to the wide destination type.
+ unsigned BitsToClear;
+ if (shouldChangeType(SrcTy, DestTy) &&
+ canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
+ assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+ "Can't clear more bits than in SrcTy");
+
+ // Okay, we can transform this! Insert the new expression now.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid zero extend: "
+ << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, false);
+ assert(Res->getType() == DestTy);
+
+ // Preserve debug values referring to Src if the zext is its last use.
+ if (auto *SrcOp = dyn_cast<Instruction>(Src))
+ if (SrcOp->hasOneUse())
+ replaceAllDbgUsesWith(*SrcOp, *Res, CI, DT);
+
+ uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear;
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with zeros, just replace this
+ // cast with the result.
+ if (MaskedValueIsZero(Res,
+ APInt::getHighBitsSet(DestBitSize,
+ DestBitSize-SrcBitsKept),
+ 0, &CI))
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit an AND to clear the high bits.
+ Constant *C = ConstantInt::get(Res->getType(),
+ APInt::getLowBitsSet(DestBitSize, SrcBitsKept));
+ return BinaryOperator::CreateAnd(Res, C);
+ }
+
+ // If this is a TRUNC followed by a ZEXT then we are dealing with integral
+ // types and if the sizes are just right we can convert this into a logical
+ // 'and' which will be much cheaper than the pair of casts.
+ if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast
+ // TODO: Subsume this into EvaluateInDifferentType.
+
+ // Get the sizes of the types involved. We know that the intermediate type
+ // will be smaller than A or C, but don't know the relation between A and C.
+ Value *A = CSrc->getOperand(0);
+ unsigned SrcSize = A->getType()->getScalarSizeInBits();
+ unsigned MidSize = CSrc->getType()->getScalarSizeInBits();
+ unsigned DstSize = CI.getType()->getScalarSizeInBits();
+ // If we're actually extending zero bits, then if
+ // SrcSize < DstSize: zext(a & mask)
+ // SrcSize == DstSize: a & mask
+ // SrcSize > DstSize: trunc(a) & mask
+ if (SrcSize < DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ Constant *AndConst = ConstantInt::get(A->getType(), AndValue);
+ Value *And = Builder.CreateAnd(A, AndConst, CSrc->getName() + ".mask");
+ return new ZExtInst(And, CI.getType());
+ }
+
+ if (SrcSize == DstSize) {
+ APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
+ return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(),
+ AndValue));
+ }
+ if (SrcSize > DstSize) {
+ Value *Trunc = Builder.CreateTrunc(A, CI.getType());
+ APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize));
+ return BinaryOperator::CreateAnd(Trunc,
+ ConstantInt::get(Trunc->getType(),
+ AndValue));
+ }
+ }
+
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(Src))
+ return transformZExtICmp(Cmp, CI);
+
+ BinaryOperator *SrcI = dyn_cast<BinaryOperator>(Src);
+ if (SrcI && SrcI->getOpcode() == Instruction::Or) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp) if at least one
+ // of the (zext icmp) can be eliminated. If so, immediately perform the
+ // according elimination.
+ ICmpInst *LHS = dyn_cast<ICmpInst>(SrcI->getOperand(0));
+ ICmpInst *RHS = dyn_cast<ICmpInst>(SrcI->getOperand(1));
+ if (LHS && RHS && LHS->hasOneUse() && RHS->hasOneUse() &&
LHS->getOperand(0)->getType() == RHS->getOperand(0)->getType() &&
- (transformZExtICmp(LHS, CI, false) ||
- transformZExtICmp(RHS, CI, false))) {
- // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
- Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
- Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
- Value *Or = Builder.CreateOr(LCast, RCast, CI.getName());
- if (auto *OrInst = dyn_cast<Instruction>(Or))
- Builder.SetInsertPoint(OrInst);
-
- // Perform the elimination.
- if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
- transformZExtICmp(LHS, *LZExt);
- if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
- transformZExtICmp(RHS, *RZExt);
-
- return replaceInstUsesWith(CI, Or);
- }
- }
-
- // zext(trunc(X) & C) -> (X & zext(C)).
- Constant *C;
- Value *X;
- if (SrcI &&
- match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
- X->getType() == CI.getType())
- return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
-
- // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
- Value *And;
- if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
- match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
- X->getType() == CI.getType()) {
- Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
- return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
- }
-
- return nullptr;
-}
-
-/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
+ (transformZExtICmp(LHS, CI, false) ||
+ transformZExtICmp(RHS, CI, false))) {
+ // zext (or icmp, icmp) -> or (zext icmp), (zext icmp)
+ Value *LCast = Builder.CreateZExt(LHS, CI.getType(), LHS->getName());
+ Value *RCast = Builder.CreateZExt(RHS, CI.getType(), RHS->getName());
+ Value *Or = Builder.CreateOr(LCast, RCast, CI.getName());
+ if (auto *OrInst = dyn_cast<Instruction>(Or))
+ Builder.SetInsertPoint(OrInst);
+
+ // Perform the elimination.
+ if (auto *LZExt = dyn_cast<ZExtInst>(LCast))
+ transformZExtICmp(LHS, *LZExt);
+ if (auto *RZExt = dyn_cast<ZExtInst>(RCast))
+ transformZExtICmp(RHS, *RZExt);
+
+ return replaceInstUsesWith(CI, Or);
+ }
+ }
+
+ // zext(trunc(X) & C) -> (X & zext(C)).
+ Constant *C;
+ Value *X;
+ if (SrcI &&
+ match(SrcI, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Constant(C)))) &&
+ X->getType() == CI.getType())
+ return BinaryOperator::CreateAnd(X, ConstantExpr::getZExt(C, CI.getType()));
+
+ // zext((trunc(X) & C) ^ C) -> ((X & zext(C)) ^ zext(C)).
+ Value *And;
+ if (SrcI && match(SrcI, m_OneUse(m_Xor(m_Value(And), m_Constant(C)))) &&
+ match(And, m_OneUse(m_And(m_Trunc(m_Value(X)), m_Specific(C)))) &&
+ X->getType() == CI.getType()) {
+ Constant *ZC = ConstantExpr::getZExt(C, CI.getType());
+ return BinaryOperator::CreateXor(Builder.CreateAnd(X, ZC), ZC);
+ }
+
+ return nullptr;
+}
+
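For illustration, the trunc-then-zext case above with SrcSize == DstSize, written as a standalone C++ sketch (hypothetical name): the pair of casts collapses into one mask of the low MidSize bits.

    #include <cstdint>

    uint32_t zext_of_trunc(uint32_t a) {
      uint8_t t = (uint8_t)a;   // trunc i32 -> i8 (MidSize = 8)
      return (uint32_t)t;       // zext i8 -> i32 (SrcSize == DstSize == 32)
    }
    // ... which the SrcSize == DstSize branch rewrites as:  return a & 0xffu;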
+/// Transform (sext icmp) to bitwise / integer operations to eliminate the icmp.
Instruction *InstCombinerImpl::transformSExtICmp(ICmpInst *ICI,
Instruction &CI) {
- Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
- ICmpInst::Predicate Pred = ICI->getPredicate();
-
- // Don't bother if Op1 isn't of vector or integer type.
- if (!Op1->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
- (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
- // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
- // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
- Value *Sh = ConstantInt::get(Op0->getType(),
- Op0->getType()->getScalarSizeInBits() - 1);
- Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
- if (In->getType() != CI.getType())
- In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
-
- if (Pred == ICmpInst::ICMP_SGT)
- In = Builder.CreateNot(In, In->getName() + ".not");
- return replaceInstUsesWith(CI, In);
- }
-
- if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
- // If we know that only one bit of the LHS of the icmp can be set and we
- // have an equality comparison with zero or a power of 2, we can transform
- // the icmp and sext into bitwise/integer operations.
- if (ICI->hasOneUse() &&
- ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
- KnownBits Known = computeKnownBits(Op0, 0, &CI);
-
- APInt KnownZeroMask(~Known.Zero);
- if (KnownZeroMask.isPowerOf2()) {
- Value *In = ICI->getOperand(0);
-
- // If the icmp tests for a known zero bit we can constant fold it.
- if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
- Value *V = Pred == ICmpInst::ICMP_NE ?
- ConstantInt::getAllOnesValue(CI.getType()) :
- ConstantInt::getNullValue(CI.getType());
- return replaceInstUsesWith(CI, V);
- }
-
- if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
- // sext ((x & 2^n) == 0) -> (x >> n) - 1
- // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
- unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
- // Perform a right shift to place the desired bit in the LSB.
- if (ShiftAmt)
- In = Builder.CreateLShr(In,
- ConstantInt::get(In->getType(), ShiftAmt));
-
- // At this point "In" is either 1 or 0. Subtract 1 to turn
- // {1, 0} -> {0, -1}.
- In = Builder.CreateAdd(In,
- ConstantInt::getAllOnesValue(In->getType()),
- "sext");
- } else {
- // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
- // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
- unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
- // Perform a left shift to place the desired bit in the MSB.
- if (ShiftAmt)
- In = Builder.CreateShl(In,
- ConstantInt::get(In->getType(), ShiftAmt));
-
- // Distribute the bit over the whole bit width.
- In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
- KnownZeroMask.getBitWidth() - 1), "sext");
- }
-
- if (CI.getType() == In->getType())
- return replaceInstUsesWith(CI, In);
- return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
- }
- }
- }
-
- return nullptr;
-}
-
-/// Return true if we can take the specified value and return it as type Ty
-/// without inserting any new casts and without changing the value of the common
-/// low bits. This is used by code that tries to promote integer operations to
-/// a wider type, which will allow us to eliminate the extension.
-///
-/// This function works on both vectors and scalars.
-///
-static bool canEvaluateSExtd(Value *V, Type *Ty) {
- assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
- "Can't sign extend type to a smaller type");
- if (canAlwaysEvaluateInType(V, Ty))
- return true;
- if (canNotEvaluateInType(V, Ty))
- return false;
-
- auto *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::SExt: // sext(sext(x)) -> sext(x)
- case Instruction::ZExt: // sext(zext(x)) -> zext(x)
- case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
- return true;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- // These operators can all arbitrarily be extended if their inputs can.
- return canEvaluateSExtd(I->getOperand(0), Ty) &&
- canEvaluateSExtd(I->getOperand(1), Ty);
-
- //case Instruction::Shl: TODO
- //case Instruction::LShr: TODO
-
- case Instruction::Select:
- return canEvaluateSExtd(I->getOperand(1), Ty) &&
- canEvaluateSExtd(I->getOperand(2), Ty);
-
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateSExtd(IncValue, Ty)) return false;
- return true;
- }
- default:
- // TODO: Can handle more cases here.
- break;
- }
-
- return false;
-}
-
+ Value *Op0 = ICI->getOperand(0), *Op1 = ICI->getOperand(1);
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+
+ // Don't bother if Op1 isn't of vector or integer type.
+ if (!Op1->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ if ((Pred == ICmpInst::ICMP_SLT && match(Op1, m_ZeroInt())) ||
+ (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))) {
+ // (x <s 0) ? -1 : 0 -> ashr x, 31 -> all ones if negative
+ // (x >s -1) ? -1 : 0 -> not (ashr x, 31) -> all ones if positive
+ Value *Sh = ConstantInt::get(Op0->getType(),
+ Op0->getType()->getScalarSizeInBits() - 1);
+ Value *In = Builder.CreateAShr(Op0, Sh, Op0->getName() + ".lobit");
+ if (In->getType() != CI.getType())
+ In = Builder.CreateIntCast(In, CI.getType(), true /*SExt*/);
+
+ if (Pred == ICmpInst::ICMP_SGT)
+ In = Builder.CreateNot(In, In->getName() + ".not");
+ return replaceInstUsesWith(CI, In);
+ }
+
+ if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
+ // If we know that only one bit of the LHS of the icmp can be set and we
+ // have an equality comparison with zero or a power of 2, we can transform
+ // the icmp and sext into bitwise/integer operations.
+ if (ICI->hasOneUse() &&
+ ICI->isEquality() && (Op1C->isZero() || Op1C->getValue().isPowerOf2())){
+ KnownBits Known = computeKnownBits(Op0, 0, &CI);
+
+ APInt KnownZeroMask(~Known.Zero);
+ if (KnownZeroMask.isPowerOf2()) {
+ Value *In = ICI->getOperand(0);
+
+ // If the icmp tests for a known zero bit we can constant fold it.
+ if (!Op1C->isZero() && Op1C->getValue() != KnownZeroMask) {
+ Value *V = Pred == ICmpInst::ICMP_NE ?
+ ConstantInt::getAllOnesValue(CI.getType()) :
+ ConstantInt::getNullValue(CI.getType());
+ return replaceInstUsesWith(CI, V);
+ }
+
+ if (!Op1C->isZero() == (Pred == ICmpInst::ICMP_NE)) {
+ // sext ((x & 2^n) == 0) -> (x >> n) - 1
+ // sext ((x & 2^n) != 2^n) -> (x >> n) - 1
+ unsigned ShiftAmt = KnownZeroMask.countTrailingZeros();
+ // Perform a right shift to place the desired bit in the LSB.
+ if (ShiftAmt)
+ In = Builder.CreateLShr(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // At this point "In" is either 1 or 0. Subtract 1 to turn
+ // {1, 0} -> {0, -1}.
+ In = Builder.CreateAdd(In,
+ ConstantInt::getAllOnesValue(In->getType()),
+ "sext");
+ } else {
+ // sext ((x & 2^n) != 0) -> (x << bitwidth-n) a>> bitwidth-1
+ // sext ((x & 2^n) == 2^n) -> (x << bitwidth-n) a>> bitwidth-1
+ unsigned ShiftAmt = KnownZeroMask.countLeadingZeros();
+ // Perform a left shift to place the desired bit in the MSB.
+ if (ShiftAmt)
+ In = Builder.CreateShl(In,
+ ConstantInt::get(In->getType(), ShiftAmt));
+
+ // Distribute the bit over the whole bit width.
+ In = Builder.CreateAShr(In, ConstantInt::get(In->getType(),
+ KnownZeroMask.getBitWidth() - 1), "sext");
+ }
+
+ if (CI.getType() == In->getType())
+ return replaceInstUsesWith(CI, In);
+ return CastInst::CreateIntegerCast(In, CI.getType(), true/*SExt*/);
+ }
+ }
+ }
+
+ return nullptr;
+}
+
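For illustration, the first two rewrites above expressed as standalone C++ (hypothetical names; 32-bit int and two's-complement arithmetic shift assumed):

    #include <cstdint>

    // sext (icmp slt X, 0): all ones iff X is negative, i.e. ashr by bitwidth-1.
    int32_t neg_splat(int32_t x)    { return (x < 0)  ? -1 : 0; }  // -> x >> 31
    // sext (icmp sgt X, -1): the complemented shift.
    int32_t nonneg_splat(int32_t x) { return (x > -1) ? -1 : 0; }  // -> ~(x >> 31)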
+/// Return true if we can take the specified value and return it as type Ty
+/// without inserting any new casts and without changing the value of the common
+/// low bits. This is used by code that tries to promote integer operations to
+/// a wider type, which will allow us to eliminate the extension.
+///
+/// This function works on both vectors and scalars.
+///
+static bool canEvaluateSExtd(Value *V, Type *Ty) {
+ assert(V->getType()->getScalarSizeInBits() < Ty->getScalarSizeInBits() &&
+ "Can't sign extend type to a smaller type");
+ if (canAlwaysEvaluateInType(V, Ty))
+ return true;
+ if (canNotEvaluateInType(V, Ty))
+ return false;
+
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::SExt: // sext(sext(x)) -> sext(x)
+ case Instruction::ZExt: // sext(zext(x)) -> zext(x)
+ case Instruction::Trunc: // sext(trunc(x)) -> trunc(x) or sext(x)
+ return true;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ // These operators can all arbitrarily be extended if their inputs can.
+ return canEvaluateSExtd(I->getOperand(0), Ty) &&
+ canEvaluateSExtd(I->getOperand(1), Ty);
+
+ //case Instruction::Shl: TODO
+ //case Instruction::LShr: TODO
+
+ case Instruction::Select:
+ return canEvaluateSExtd(I->getOperand(1), Ty) &&
+ canEvaluateSExtd(I->getOperand(2), Ty);
+
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateSExtd(IncValue, Ty)) return false;
+ return true;
+ }
+ default:
+ // TODO: Can handle more cases here.
+ break;
+ }
+
+ return false;
+}
+
Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
- // If this sign extend is only used by a truncate, let the truncate be
- // eliminated before we try to optimize this sext.
- if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
- return nullptr;
-
- if (Instruction *I = commonCastTransforms(CI))
- return I;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(), *DestTy = CI.getType();
-
- // If we know that the value being extended is positive, we can use a zext
- // instead.
- KnownBits Known = computeKnownBits(Src, 0, &CI);
- if (Known.isNonNegative())
- return CastInst::Create(Instruction::ZExt, Src, DestTy);
-
- // Try to extend the entire expression tree to the wide destination type.
- if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
- // Okay, we can transform this! Insert the new expression now.
- LLVM_DEBUG(
- dbgs() << "ICE: EvaluateInDifferentType converting expression type"
- " to avoid sign extend: "
- << CI << '\n');
- Value *Res = EvaluateInDifferentType(Src, DestTy, true);
- assert(Res->getType() == DestTy);
-
- uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
- uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
- // If the high bits are already filled with sign bit, just replace this
- // cast with the result.
- if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
- return replaceInstUsesWith(CI, Res);
-
- // We need to emit a shl + ashr to do the sign extend.
- Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
- return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
- ShAmt);
- }
-
- // If the input is a trunc from the destination type, then turn sext(trunc(x))
- // into shifts.
- Value *X;
- if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
- // sext(trunc(X)) --> ashr(shl(X, C), C)
- unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
- unsigned DestBitSize = DestTy->getScalarSizeInBits();
- Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
- return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
- }
-
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
- return transformSExtICmp(ICI, CI);
-
- // If the input is a shl/ashr pair of a same constant, then this is a sign
- // extension from a smaller value. If we could trust arbitrary bitwidth
- // integers, we could turn this into a truncate to the smaller bit and then
- // use a sext for the whole extension. Since we don't, look deeper and check
- // for a truncate. If the source and dest are the same type, eliminate the
- // trunc and extend and just do shifts. For example, turn:
- // %a = trunc i32 %i to i8
+ // If this sign extend is only used by a truncate, let the truncate be
+ // eliminated before we try to optimize this sext.
+ if (CI.hasOneUse() && isa<TruncInst>(CI.user_back()))
+ return nullptr;
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(), *DestTy = CI.getType();
+
+ // If we know that the value being extended is positive, we can use a zext
+ // instead.
+ KnownBits Known = computeKnownBits(Src, 0, &CI);
+ if (Known.isNonNegative())
+ return CastInst::Create(Instruction::ZExt, Src, DestTy);
+
+ // Try to extend the entire expression tree to the wide destination type.
+ if (shouldChangeType(SrcTy, DestTy) && canEvaluateSExtd(Src, DestTy)) {
+ // Okay, we can transform this! Insert the new expression now.
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to avoid sign extend: "
+ << CI << '\n');
+ Value *Res = EvaluateInDifferentType(Src, DestTy, true);
+ assert(Res->getType() == DestTy);
+
+ uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
+ uint32_t DestBitSize = DestTy->getScalarSizeInBits();
+
+ // If the high bits are already filled with sign bit, just replace this
+ // cast with the result.
+ if (ComputeNumSignBits(Res, 0, &CI) > DestBitSize - SrcBitSize)
+ return replaceInstUsesWith(CI, Res);
+
+ // We need to emit a shl + ashr to do the sign extend.
+ Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(Res, ShAmt, "sext"),
+ ShAmt);
+ }
+
+ // If the input is a trunc from the destination type, then turn sext(trunc(x))
+ // into shifts.
+ Value *X;
+ if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+ // sext(trunc(X)) --> ashr(shl(X, C), C)
+ unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DestBitSize = DestTy->getScalarSizeInBits();
+ Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+ return BinaryOperator::CreateAShr(Builder.CreateShl(X, ShAmt), ShAmt);
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
+ return transformSExtICmp(ICI, CI);
+
+ // If the input is a shl/ashr pair of a same constant, then this is a sign
+ // extension from a smaller value. If we could trust arbitrary bitwidth
+ // integers, we could turn this into a truncate to the smaller bit and then
+ // use a sext for the whole extension. Since we don't, look deeper and check
+ // for a truncate. If the source and dest are the same type, eliminate the
+ // trunc and extend and just do shifts. For example, turn:
+ // %a = trunc i32 %i to i8
// %b = shl i8 %a, C
// %c = ashr i8 %b, C
- // %d = sext i8 %c to i32
- // into:
+ // %d = sext i8 %c to i32
+ // into:
// %a = shl i32 %i, 32-(8-C)
// %d = ashr i32 %a, 32-(8-C)
- Value *A = nullptr;
- // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
- Constant *BA = nullptr, *CA = nullptr;
- if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
- m_Constant(CA))) &&
+ Value *A = nullptr;
+ // TODO: Eventually this could be subsumed by EvaluateInDifferentType.
+ Constant *BA = nullptr, *CA = nullptr;
+ if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
+ m_Constant(CA))) &&
BA->isElementWiseEqual(CA) && A->getType() == DestTy) {
Constant *WideCurrShAmt = ConstantExpr::getSExt(CA, DestTy);
Constant *NumLowbitsLeft = ConstantExpr::getSub(
@@ -1533,445 +1533,445 @@ Instruction *InstCombinerImpl::visitSExt(SExtInst &CI) {
Constant::mergeUndefsWith(Constant::mergeUndefsWith(NewShAmt, BA), CA);
A = Builder.CreateShl(A, NewShAmt, CI.getName());
return BinaryOperator::CreateAShr(A, NewShAmt);
- }
-
- return nullptr;
-}
-
-/// Return true if the specified floating-point constant fits in the specified
-/// FP type without changing its value.
-static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
- bool losesInfo;
- APFloat F = CFP->getValueAPF();
- (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
- return !losesInfo;
-}
-
-static Type *shrinkFPConstant(ConstantFP *CFP) {
- if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
- return nullptr; // No constant folding of this.
- // See if the value can be truncated to half and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEhalf()))
- return Type::getHalfTy(CFP->getContext());
- // See if the value can be truncated to float and then reextended.
- if (fitsInFPType(CFP, APFloat::IEEEsingle()))
- return Type::getFloatTy(CFP->getContext());
- if (CFP->getType()->isDoubleTy())
- return nullptr; // Won't shrink.
- if (fitsInFPType(CFP, APFloat::IEEEdouble()))
- return Type::getDoubleTy(CFP->getContext());
- // Don't try to shrink to various long double types.
- return nullptr;
-}
-
-// Determine if this is a vector of ConstantFPs and if so, return the minimal
-// type we can safely truncate all elements to.
-// TODO: Make these support undef elements.
-static Type *shrinkFPConstantVector(Value *V) {
- auto *CV = dyn_cast<Constant>(V);
- auto *CVVTy = dyn_cast<VectorType>(V->getType());
- if (!CV || !CVVTy)
- return nullptr;
-
- Type *MinType = nullptr;
-
+ }
+
+ return nullptr;
+}
+
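For illustration, the trunc-feeding-sext case above as a standalone C++ sketch (hypothetical name; two's complement assumed): when source and destination types match, the cast pair becomes the shl/ashr pair from the comment.

    #include <cstdint>

    int32_t sext_of_trunc(int32_t i) {
      int8_t a = (int8_t)i;   // trunc i32 -> i8
      return (int32_t)a;      // sext i8 -> i32
    }
    // ... which the fold rewrites in IR as:
    //   %s = shl i32 %i, 24
    //   %d = ashr i32 %s, 24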
+/// Return true if the specified floating-point constant fits in the specified
+/// FP type without changing its value.
+static bool fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
+ bool losesInfo;
+ APFloat F = CFP->getValueAPF();
+ (void)F.convert(Sem, APFloat::rmNearestTiesToEven, &losesInfo);
+ return !losesInfo;
+}
+
+static Type *shrinkFPConstant(ConstantFP *CFP) {
+ if (CFP->getType() == Type::getPPC_FP128Ty(CFP->getContext()))
+ return nullptr; // No constant folding of this.
+ // See if the value can be truncated to half and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEhalf()))
+ return Type::getHalfTy(CFP->getContext());
+ // See if the value can be truncated to float and then reextended.
+ if (fitsInFPType(CFP, APFloat::IEEEsingle()))
+ return Type::getFloatTy(CFP->getContext());
+ if (CFP->getType()->isDoubleTy())
+ return nullptr; // Won't shrink.
+ if (fitsInFPType(CFP, APFloat::IEEEdouble()))
+ return Type::getDoubleTy(CFP->getContext());
+ // Don't try to shrink to various long double types.
+ return nullptr;
+}
+
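As a plain C++ analogy for the round-trip test these helpers perform (not the APFloat code itself; NaN payloads ignored):

    bool fits_in_float(double d) {
      // Truncate to float and re-extend; the value "fits" if it is unchanged.
      return (double)(float)d == d;
    }
    // fits_in_float(2.0) -> true   (2.0 can be shrunk to 2.0f)
    // fits_in_float(0.1) -> false  (0.1 has no exact float representation)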
+// Determine if this is a vector of ConstantFPs and if so, return the minimal
+// type we can safely truncate all elements to.
+// TODO: Make these support undef elements.
+static Type *shrinkFPConstantVector(Value *V) {
+ auto *CV = dyn_cast<Constant>(V);
+ auto *CVVTy = dyn_cast<VectorType>(V->getType());
+ if (!CV || !CVVTy)
+ return nullptr;
+
+ Type *MinType = nullptr;
+
unsigned NumElts = cast<FixedVectorType>(CVVTy)->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
- if (!CFP)
- return nullptr;
-
- Type *T = shrinkFPConstant(CFP);
- if (!T)
- return nullptr;
-
- // If we haven't found a type yet or this type has a larger mantissa than
- // our previous type, this is our new minimal type.
- if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
- MinType = T;
- }
-
- // Make a vector type from the minimal type.
- return FixedVectorType::get(MinType, NumElts);
-}
-
-/// Find the minimum FP type we can safely truncate to.
-static Type *getMinimumFPType(Value *V) {
- if (auto *FPExt = dyn_cast<FPExtInst>(V))
- return FPExt->getOperand(0)->getType();
-
- // If this value is a constant, return the constant in the smallest FP type
- // that can accurately represent it. This allows us to turn
- // (float)((double)X+2.0) into x+2.0f.
- if (auto *CFP = dyn_cast<ConstantFP>(V))
- if (Type *T = shrinkFPConstant(CFP))
- return T;
-
- // Try to shrink a vector of FP constants.
- if (Type *T = shrinkFPConstantVector(V))
- return T;
-
- return V->getType();
-}
-
-/// Return true if the cast from integer to FP can be proven to be exact for all
-/// possible inputs (the conversion does not lose any precision).
-static bool isKnownExactCastIntToFP(CastInst &I) {
- CastInst::CastOps Opcode = I.getOpcode();
- assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
- "Unexpected cast");
- Value *Src = I.getOperand(0);
- Type *SrcTy = Src->getType();
- Type *FPTy = I.getType();
- bool IsSigned = Opcode == Instruction::SIToFP;
- int SrcSize = (int)SrcTy->getScalarSizeInBits() - IsSigned;
-
- // Easy case - if the source integer type has less bits than the FP mantissa,
- // then the cast must be exact.
- int DestNumSigBits = FPTy->getFPMantissaWidth();
- if (SrcSize <= DestNumSigBits)
- return true;
-
- // Cast from FP to integer and back to FP is independent of the intermediate
- // integer width because of poison on overflow.
- Value *F;
- if (match(Src, m_FPToSI(m_Value(F))) || match(Src, m_FPToUI(m_Value(F)))) {
- // If this is uitofp (fptosi F), the source needs an extra bit to avoid
- // potential rounding of negative FP input values.
- int SrcNumSigBits = F->getType()->getFPMantissaWidth();
- if (!IsSigned && match(Src, m_FPToSI(m_Value())))
- SrcNumSigBits++;
-
- // [su]itofp (fpto[su]i F) --> exact if the source type has less or equal
- // significant bits than the destination (and make sure neither type is
- // weird -- ppc_fp128).
- if (SrcNumSigBits > 0 && DestNumSigBits > 0 &&
- SrcNumSigBits <= DestNumSigBits)
- return true;
- }
-
- // TODO:
- // Try harder to find if the source integer type has less significant bits.
- // For example, compute number of sign bits or compute low bit mask.
- return false;
-}
-
+ for (unsigned i = 0; i != NumElts; ++i) {
+ auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
+ if (!CFP)
+ return nullptr;
+
+ Type *T = shrinkFPConstant(CFP);
+ if (!T)
+ return nullptr;
+
+ // If we haven't found a type yet or this type has a larger mantissa than
+ // our previous type, this is our new minimal type.
+ if (!MinType || T->getFPMantissaWidth() > MinType->getFPMantissaWidth())
+ MinType = T;
+ }
+
+ // Make a vector type from the minimal type.
+ return FixedVectorType::get(MinType, NumElts);
+}
+
+/// Find the minimum FP type we can safely truncate to.
+static Type *getMinimumFPType(Value *V) {
+ if (auto *FPExt = dyn_cast<FPExtInst>(V))
+ return FPExt->getOperand(0)->getType();
+
+ // If this value is a constant, return the constant in the smallest FP type
+ // that can accurately represent it. This allows us to turn
+ // (float)((double)X+2.0) into x+2.0f.
+ if (auto *CFP = dyn_cast<ConstantFP>(V))
+ if (Type *T = shrinkFPConstant(CFP))
+ return T;
+
+ // Try to shrink a vector of FP constants.
+ if (Type *T = shrinkFPConstantVector(V))
+ return T;
+
+ return V->getType();
+}
+
+/// Return true if the cast from integer to FP can be proven to be exact for all
+/// possible inputs (the conversion does not lose any precision).
+static bool isKnownExactCastIntToFP(CastInst &I) {
+ CastInst::CastOps Opcode = I.getOpcode();
+ assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
+ "Unexpected cast");
+ Value *Src = I.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *FPTy = I.getType();
+ bool IsSigned = Opcode == Instruction::SIToFP;
+ int SrcSize = (int)SrcTy->getScalarSizeInBits() - IsSigned;
+
+ // Easy case - if the source integer type has less bits than the FP mantissa,
+ // then the cast must be exact.
+ int DestNumSigBits = FPTy->getFPMantissaWidth();
+ if (SrcSize <= DestNumSigBits)
+ return true;
+
+ // Cast from FP to integer and back to FP is independent of the intermediate
+ // integer width because of poison on overflow.
+ Value *F;
+ if (match(Src, m_FPToSI(m_Value(F))) || match(Src, m_FPToUI(m_Value(F)))) {
+ // If this is uitofp (fptosi F), the source needs an extra bit to avoid
+ // potential rounding of negative FP input values.
+ int SrcNumSigBits = F->getType()->getFPMantissaWidth();
+ if (!IsSigned && match(Src, m_FPToSI(m_Value())))
+ SrcNumSigBits++;
+
+ // [su]itofp (fpto[su]i F) --> exact if the source type has less or equal
+ // significant bits than the destination (and make sure neither type is
+ // weird -- ppc_fp128).
+ if (SrcNumSigBits > 0 && DestNumSigBits > 0 &&
+ SrcNumSigBits <= DestNumSigBits)
+ return true;
+ }
+
+ // TODO:
+ // Try harder to find if the source integer type has less significant bits.
+ // For example, compute number of sign bits or compute low bit mask.
+ return false;
+}
+
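For illustration, the "easy case" bound above in standalone C++ (hypothetical names): float has a 24-bit mantissa, so i16 sources convert exactly while i32 sources may round.

    #include <cstdint>

    float always_exact(int16_t x) { return (float)x; }  // 15 value bits <= 24: exact
    float may_round(int32_t x)    { return (float)x; }  // e.g. 16777217 -> 16777216.0f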
Instruction *InstCombinerImpl::visitFPTrunc(FPTruncInst &FPT) {
- if (Instruction *I = commonCastTransforms(FPT))
- return I;
-
- // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
- // simplify this expression to avoid one or more of the trunc/extend
- // operations if we can do so without changing the numerical results.
- //
- // The exact manner in which the widths of the operands interact to limit
- // what we can and cannot do safely varies from operation to operation, and
- // is explained below in the various case statements.
- Type *Ty = FPT.getType();
- auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
- if (BO && BO->hasOneUse()) {
- Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
- Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
- unsigned OpWidth = BO->getType()->getFPMantissaWidth();
- unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
- unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
- unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
- unsigned DstWidth = Ty->getFPMantissaWidth();
- switch (BO->getOpcode()) {
- default: break;
- case Instruction::FAdd:
- case Instruction::FSub:
- // For addition and subtraction, the infinitely precise result can
- // essentially be arbitrarily wide; proving that double rounding
- // will not occur because the result of OpI is exact (as we will for
- // FMul, for example) is hopeless. However, we *can* nonetheless
- // frequently know that double rounding cannot occur (or that it is
- // innocuous) by taking advantage of the specific structure of
- // infinitely-precise results that admit double rounding.
- //
-      // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient
- // to represent both sources, we can guarantee that the double
- // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis,
- // "A Rigorous Framework for Fully Supporting the IEEE Standard ..."
- // for proof of this fact).
- //
- // Note: Figueroa does not consider the case where DstFormat !=
- // SrcFormat. It's possible (likely even!) that this analysis
- // could be tightened for those cases, but they are rare (the main
- // case of interest here is (float)((double)float + float)).
- if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS);
- RI->copyFastMathFlags(BO);
- return RI;
- }
- break;
- case Instruction::FMul:
- // For multiplication, the infinitely precise result has at most
- // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
- // that such a value can be exactly represented, then no double
- // rounding can possibly occur; we can safely perform the operation
- // in the destination format if it can represent both sources.
- if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- return BinaryOperator::CreateFMulFMF(LHS, RHS, BO);
- }
- break;
- case Instruction::FDiv:
-      // For division, we again use the bound from Figueroa's
- // dissertation. I am entirely certain that this bound can be
- // tightened in the unbalanced operand case by an analysis based on
- // the diophantine rational approximation bound, but the well-known
- // condition used here is a good conservative first pass.
- // TODO: Tighten bound via rigorous analysis of the unbalanced case.
- if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
- Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
- Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
- return BinaryOperator::CreateFDivFMF(LHS, RHS, BO);
- }
- break;
- case Instruction::FRem: {
- // Remainder is straightforward. Remainder is always exact, so the
- // type of OpI doesn't enter into things at all. We simply evaluate
- // in whichever source type is larger, then convert to the
- // destination type.
- if (SrcWidth == OpWidth)
- break;
- Value *LHS, *RHS;
- if (LHSWidth == SrcWidth) {
- LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType);
- RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType);
- } else {
- LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType);
- RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType);
- }
-
- Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO);
- return CastInst::CreateFPCast(ExactResult, Ty);
- }
- }
- }
-
- // (fptrunc (fneg x)) -> (fneg (fptrunc x))
- Value *X;
- Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
- if (Op && Op->hasOneUse()) {
- // FIXME: The FMF should propagate from the fptrunc, not the source op.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- if (isa<FPMathOperator>(Op))
- Builder.setFastMathFlags(Op->getFastMathFlags());
-
- if (match(Op, m_FNeg(m_Value(X)))) {
- Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
-
- return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
- }
-
- // If we are truncating a select that has an extended operand, we can
- // narrow the other operand and do the select as a narrow op.
- Value *Cond, *X, *Y;
- if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) &&
- X->getType() == Ty) {
-      // fptrunc (select Cond, (fpext X), Y) --> select Cond, X, (fptrunc Y)
- Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
- Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op);
- return replaceInstUsesWith(FPT, Sel);
- }
- if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) &&
- X->getType() == Ty) {
-      // fptrunc (select Cond, Y, (fpext X)) --> select Cond, (fptrunc Y), X
- Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
- Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op);
- return replaceInstUsesWith(FPT, Sel);
- }
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
- switch (II->getIntrinsicID()) {
- default: break;
- case Intrinsic::ceil:
- case Intrinsic::fabs:
- case Intrinsic::floor:
- case Intrinsic::nearbyint:
- case Intrinsic::rint:
- case Intrinsic::round:
- case Intrinsic::roundeven:
- case Intrinsic::trunc: {
- Value *Src = II->getArgOperand(0);
- if (!Src->hasOneUse())
- break;
-
- // Except for fabs, this transformation requires the input of the unary FP
- // operation to be itself an fpext from the type to which we're
- // truncating.
- if (II->getIntrinsicID() != Intrinsic::fabs) {
- FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
- if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
- break;
- }
-
- // Do unary FP operation on smaller type.
- // (fptrunc (fabs x)) -> (fabs (fptrunc x))
- Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
- Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
- II->getIntrinsicID(), Ty);
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- CallInst *NewCI =
- CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
- NewCI->copyFastMathFlags(II);
- return NewCI;
- }
- }
- }
-
- if (Instruction *I = shrinkInsertElt(FPT, Builder))
- return I;
-
- Value *Src = FPT.getOperand(0);
- if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
- auto *FPCast = cast<CastInst>(Src);
- if (isKnownExactCastIntToFP(*FPCast))
- return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
- }
-
- return nullptr;
-}
-
+ if (Instruction *I = commonCastTransforms(FPT))
+ return I;
+
+ // If we have fptrunc(OpI (fpextend x), (fpextend y)), we would like to
+ // simplify this expression to avoid one or more of the trunc/extend
+ // operations if we can do so without changing the numerical results.
+ //
+ // The exact manner in which the widths of the operands interact to limit
+ // what we can and cannot do safely varies from operation to operation, and
+ // is explained below in the various case statements.
+ Type *Ty = FPT.getType();
+ auto *BO = dyn_cast<BinaryOperator>(FPT.getOperand(0));
+ if (BO && BO->hasOneUse()) {
+ Type *LHSMinType = getMinimumFPType(BO->getOperand(0));
+ Type *RHSMinType = getMinimumFPType(BO->getOperand(1));
+ unsigned OpWidth = BO->getType()->getFPMantissaWidth();
+ unsigned LHSWidth = LHSMinType->getFPMantissaWidth();
+ unsigned RHSWidth = RHSMinType->getFPMantissaWidth();
+ unsigned SrcWidth = std::max(LHSWidth, RHSWidth);
+ unsigned DstWidth = Ty->getFPMantissaWidth();
+ switch (BO->getOpcode()) {
+ default: break;
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ // For addition and subtraction, the infinitely precise result can
+ // essentially be arbitrarily wide; proving that double rounding
+ // will not occur because the result of OpI is exact (as we will for
+ // FMul, for example) is hopeless. However, we *can* nonetheless
+ // frequently know that double rounding cannot occur (or that it is
+ // innocuous) by taking advantage of the specific structure of
+ // infinitely-precise results that admit double rounding.
+ //
+      // Specifically, if OpWidth >= 2*DstWidth+1 and DstWidth is sufficient
+ // to represent both sources, we can guarantee that the double
+ // rounding is innocuous (See p50 of Figueroa's 2000 PhD thesis,
+ // "A Rigorous Framework for Fully Supporting the IEEE Standard ..."
+ // for proof of this fact).
+ //
+ // Note: Figueroa does not consider the case where DstFormat !=
+ // SrcFormat. It's possible (likely even!) that this analysis
+ // could be tightened for those cases, but they are rare (the main
+ // case of interest here is (float)((double)float + float)).
+ if (OpWidth >= 2*DstWidth+1 && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ Instruction *RI = BinaryOperator::Create(BO->getOpcode(), LHS, RHS);
+ RI->copyFastMathFlags(BO);
+ return RI;
+ }
+ break;
+ case Instruction::FMul:
+ // For multiplication, the infinitely precise result has at most
+ // LHSWidth + RHSWidth significant bits; if OpWidth is sufficient
+ // that such a value can be exactly represented, then no double
+ // rounding can possibly occur; we can safely perform the operation
+ // in the destination format if it can represent both sources.
+ if (OpWidth >= LHSWidth + RHSWidth && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ return BinaryOperator::CreateFMulFMF(LHS, RHS, BO);
+ }
+ break;
+ case Instruction::FDiv:
+      // For division, we again use the bound from Figueroa's
+ // dissertation. I am entirely certain that this bound can be
+ // tightened in the unbalanced operand case by an analysis based on
+ // the diophantine rational approximation bound, but the well-known
+ // condition used here is a good conservative first pass.
+ // TODO: Tighten bound via rigorous analysis of the unbalanced case.
+ if (OpWidth >= 2*DstWidth && DstWidth >= SrcWidth) {
+ Value *LHS = Builder.CreateFPTrunc(BO->getOperand(0), Ty);
+ Value *RHS = Builder.CreateFPTrunc(BO->getOperand(1), Ty);
+ return BinaryOperator::CreateFDivFMF(LHS, RHS, BO);
+ }
+ break;
+ case Instruction::FRem: {
+ // Remainder is straightforward. Remainder is always exact, so the
+ // type of OpI doesn't enter into things at all. We simply evaluate
+ // in whichever source type is larger, then convert to the
+ // destination type.
+ if (SrcWidth == OpWidth)
+ break;
+ Value *LHS, *RHS;
+ if (LHSWidth == SrcWidth) {
+ LHS = Builder.CreateFPTrunc(BO->getOperand(0), LHSMinType);
+ RHS = Builder.CreateFPTrunc(BO->getOperand(1), LHSMinType);
+ } else {
+ LHS = Builder.CreateFPTrunc(BO->getOperand(0), RHSMinType);
+ RHS = Builder.CreateFPTrunc(BO->getOperand(1), RHSMinType);
+ }
+
+ Value *ExactResult = Builder.CreateFRemFMF(LHS, RHS, BO);
+ return CastInst::CreateFPCast(ExactResult, Ty);
+ }
+ }
+ }
+
+ // (fptrunc (fneg x)) -> (fneg (fptrunc x))
+ Value *X;
+ Instruction *Op = dyn_cast<Instruction>(FPT.getOperand(0));
+ if (Op && Op->hasOneUse()) {
+ // FIXME: The FMF should propagate from the fptrunc, not the source op.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ if (isa<FPMathOperator>(Op))
+ Builder.setFastMathFlags(Op->getFastMathFlags());
+
+ if (match(Op, m_FNeg(m_Value(X)))) {
+ Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
+
+ return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
+ }
+
+ // If we are truncating a select that has an extended operand, we can
+ // narrow the other operand and do the select as a narrow op.
+ Value *Cond, *X, *Y;
+ if (match(Op, m_Select(m_Value(Cond), m_FPExt(m_Value(X)), m_Value(Y))) &&
+ X->getType() == Ty) {
+      // fptrunc (select Cond, (fpext X), Y) --> select Cond, X, (fptrunc Y)
+ Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
+ Value *Sel = Builder.CreateSelect(Cond, X, NarrowY, "narrow.sel", Op);
+ return replaceInstUsesWith(FPT, Sel);
+ }
+ if (match(Op, m_Select(m_Value(Cond), m_Value(Y), m_FPExt(m_Value(X)))) &&
+ X->getType() == Ty) {
+      // fptrunc (select Cond, Y, (fpext X)) --> select Cond, (fptrunc Y), X
+ Value *NarrowY = Builder.CreateFPTrunc(Y, Ty);
+ Value *Sel = Builder.CreateSelect(Cond, NarrowY, X, "narrow.sel", Op);
+ return replaceInstUsesWith(FPT, Sel);
+ }
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(FPT.getOperand(0))) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::ceil:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::trunc: {
+ Value *Src = II->getArgOperand(0);
+ if (!Src->hasOneUse())
+ break;
+
+ // Except for fabs, this transformation requires the input of the unary FP
+ // operation to be itself an fpext from the type to which we're
+ // truncating.
+ if (II->getIntrinsicID() != Intrinsic::fabs) {
+ FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+ if (!FPExtSrc || FPExtSrc->getSrcTy() != Ty)
+ break;
+ }
+
+ // Do unary FP operation on smaller type.
+ // (fptrunc (fabs x)) -> (fabs (fptrunc x))
+ Value *InnerTrunc = Builder.CreateFPTrunc(Src, Ty);
+ Function *Overload = Intrinsic::getDeclaration(FPT.getModule(),
+ II->getIntrinsicID(), Ty);
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ CallInst *NewCI =
+ CallInst::Create(Overload, {InnerTrunc}, OpBundles, II->getName());
+ NewCI->copyFastMathFlags(II);
+ return NewCI;
+ }
+ }
+ }
+
+ if (Instruction *I = shrinkInsertElt(FPT, Builder))
+ return I;
+
+ Value *Src = FPT.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
+ return nullptr;
+}
+
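For illustration, the FAdd case above in standalone C++ (hypothetical name): OpWidth for double is 53, which satisfies 2*24+1 = 49, and float can represent both sources, so the double rounding is harmless and the whole computation narrows.

    float narrow_add(float a, float b) {
      return (float)((double)a + (double)b);   // folds to:  return a + b;
    }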
Instruction *InstCombinerImpl::visitFPExt(CastInst &FPExt) {
- // If the source operand is a cast from integer to FP and known exact, then
- // cast the integer operand directly to the destination type.
- Type *Ty = FPExt.getType();
- Value *Src = FPExt.getOperand(0);
- if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
- auto *FPCast = cast<CastInst>(Src);
- if (isKnownExactCastIntToFP(*FPCast))
- return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
- }
-
- return commonCastTransforms(FPExt);
-}
-
-/// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
-/// This is safe if the intermediate type has enough bits in its mantissa to
-/// accurately represent all values of X. For example, this won't work with
-/// i64 -> float -> i64.
+ // If the source operand is a cast from integer to FP and known exact, then
+ // cast the integer operand directly to the destination type.
+ Type *Ty = FPExt.getType();
+ Value *Src = FPExt.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
+ return commonCastTransforms(FPExt);
+}
+
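For illustration (hypothetical name): when the inner int-to-FP cast is known exact, the fpext collapses and the integer converts straight to the wide type.

    #include <cstdint>

    double widen(int16_t x) {
      float f = (float)x;   // sitofp i16 -> float, exact (15 value bits <= 24)
      return (double)f;     // fpext; folds to a direct (double)x
    }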
+/// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
+/// This is safe if the intermediate type has enough bits in its mantissa to
+/// accurately represent all values of X. For example, this won't work with
+/// i64 -> float -> i64.
Instruction *InstCombinerImpl::foldItoFPtoI(CastInst &FI) {
- if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
- return nullptr;
-
- auto *OpI = cast<CastInst>(FI.getOperand(0));
- Value *X = OpI->getOperand(0);
- Type *XType = X->getType();
- Type *DestType = FI.getType();
- bool IsOutputSigned = isa<FPToSIInst>(FI);
-
- // Since we can assume the conversion won't overflow, our decision as to
- // whether the input will fit in the float should depend on the minimum
- // of the input range and output range.
-
- // This means this is also safe for a signed input and unsigned output, since
- // a negative input would lead to undefined behavior.
- if (!isKnownExactCastIntToFP(*OpI)) {
- // The first cast may not round exactly based on the source integer width
- // and FP width, but the overflow UB rules can still allow this to fold.
- // If the destination type is narrow, that means the intermediate FP value
- // must be large enough to hold the source value exactly.
-    // For example, (uint8_t)((float)(uint32_t)16777217) is undefined behavior.
- int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
- if (OutputSize > OpI->getType()->getFPMantissaWidth())
- return nullptr;
- }
-
- if (DestType->getScalarSizeInBits() > XType->getScalarSizeInBits()) {
- bool IsInputSigned = isa<SIToFPInst>(OpI);
- if (IsInputSigned && IsOutputSigned)
- return new SExtInst(X, DestType);
- return new ZExtInst(X, DestType);
- }
- if (DestType->getScalarSizeInBits() < XType->getScalarSizeInBits())
- return new TruncInst(X, DestType);
-
- assert(XType == DestType && "Unexpected types for int to FP to int casts");
- return replaceInstUsesWith(FI, X);
-}
-
+ if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
+ return nullptr;
+
+ auto *OpI = cast<CastInst>(FI.getOperand(0));
+ Value *X = OpI->getOperand(0);
+ Type *XType = X->getType();
+ Type *DestType = FI.getType();
+ bool IsOutputSigned = isa<FPToSIInst>(FI);
+
+ // Since we can assume the conversion won't overflow, our decision as to
+ // whether the input will fit in the float should depend on the minimum
+ // of the input range and output range.
+
+ // This means this is also safe for a signed input and unsigned output, since
+ // a negative input would lead to undefined behavior.
+ if (!isKnownExactCastIntToFP(*OpI)) {
+ // The first cast may not round exactly based on the source integer width
+ // and FP width, but the overflow UB rules can still allow this to fold.
+ // If the destination type is narrow, that means the intermediate FP value
+ // must be large enough to hold the source value exactly.
+    // For example, (uint8_t)((float)(uint32_t)16777217) is undefined behavior.
+ int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
+ if (OutputSize > OpI->getType()->getFPMantissaWidth())
+ return nullptr;
+ }
+
+ if (DestType->getScalarSizeInBits() > XType->getScalarSizeInBits()) {
+ bool IsInputSigned = isa<SIToFPInst>(OpI);
+ if (IsInputSigned && IsOutputSigned)
+ return new SExtInst(X, DestType);
+ return new ZExtInst(X, DestType);
+ }
+ if (DestType->getScalarSizeInBits() < XType->getScalarSizeInBits())
+ return new TruncInst(X, DestType);
+
+ assert(XType == DestType && "Unexpected types for int to FP to int casts");
+ return replaceInstUsesWith(FI, X);
+}
+
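For illustration, the exact-intermediate case in standalone C++ (hypothetical name): double holds every int16_t exactly, so the int -> FP -> int round trip reduces to a plain sign extension.

    #include <cstdint>

    int32_t roundtrip(int16_t x) {
      double d = (double)x;   // sitofp i16 -> double, always exact
      return (int32_t)d;      // fptosi; folds to:  return (int32_t)x;  (a sext)
    }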
Instruction *InstCombinerImpl::visitFPToUI(FPToUIInst &FI) {
- if (Instruction *I = foldItoFPtoI(FI))
- return I;
-
- return commonCastTransforms(FI);
-}
-
+ if (Instruction *I = foldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
Instruction *InstCombinerImpl::visitFPToSI(FPToSIInst &FI) {
- if (Instruction *I = foldItoFPtoI(FI))
- return I;
-
- return commonCastTransforms(FI);
-}
-
+ if (Instruction *I = foldItoFPtoI(FI))
+ return I;
+
+ return commonCastTransforms(FI);
+}
+
Instruction *InstCombinerImpl::visitUIToFP(CastInst &CI) {
- return commonCastTransforms(CI);
-}
-
+ return commonCastTransforms(CI);
+}
+
Instruction *InstCombinerImpl::visitSIToFP(CastInst &CI) {
- return commonCastTransforms(CI);
-}
-
+ return commonCastTransforms(CI);
+}
+
Instruction *InstCombinerImpl::visitIntToPtr(IntToPtrInst &CI) {
- // If the source integer type is not the intptr_t type for this target, do a
- // trunc or zext to the intptr_t type, then inttoptr of it. This allows the
- // cast to be exposed to other transforms.
- unsigned AS = CI.getAddressSpace();
- if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
- DL.getPointerSizeInBits(AS)) {
- Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
- // Handle vectors of pointers.
- if (auto *CIVTy = dyn_cast<VectorType>(CI.getType()))
- Ty = VectorType::get(Ty, CIVTy->getElementCount());
-
- Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
- return new IntToPtrInst(P, CI.getType());
- }
-
- if (Instruction *I = commonCastTransforms(CI))
- return I;
-
- return nullptr;
-}
-
-/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
+ // If the source integer type is not the intptr_t type for this target, do a
+ // trunc or zext to the intptr_t type, then inttoptr of it. This allows the
+ // cast to be exposed to other transforms.
+ unsigned AS = CI.getAddressSpace();
+ if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
+ DL.getPointerSizeInBits(AS)) {
+ Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
+ // Handle vectors of pointers.
+ if (auto *CIVTy = dyn_cast<VectorType>(CI.getType()))
+ Ty = VectorType::get(Ty, CIVTy->getElementCount());
+
+ Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
+ return new IntToPtrInst(P, CI.getType());
+ }
+
+ if (Instruction *I = commonCastTransforms(CI))
+ return I;
+
+ return nullptr;
+}
+
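A hypothetical example of the intptr_t widening performed just above, assuming a target with 64-bit pointers:

  ; sketch only, assumes 64-bit pointers in the datalayout
  define i8* @int2ptr(i32 %x) {
    %p = inttoptr i32 %x to i8*
    ret i8* %p
    ; becomes: %w = zext i32 %x to i64
    ;          %p = inttoptr i64 %w to i8*
  }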
+/// Implement the transforms for cast of pointer (bitcast/ptrtoint)
Instruction *InstCombinerImpl::commonPointerCastTransforms(CastInst &CI) {
- Value *Src = CI.getOperand(0);
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
- // If casting the result of a getelementptr instruction with no offset, turn
- // this into a cast of the original pointer!
- if (GEP->hasAllZeroIndices() &&
-        // If CI is an addrspacecast and GEP changes the pointer type, merging
- // GEP into CI would undo canonicalizing addrspacecast with different
- // pointer types, causing infinite loops.
- (!isa<AddrSpaceCastInst>(CI) ||
- GEP->getType() == GEP->getPointerOperandType())) {
- // Changing the cast operand is usually not a good idea but it is safe
- // here because the pointer operand is being replaced with another
- // pointer operand so the opcode doesn't need to change.
- return replaceOperand(CI, 0, GEP->getOperand(0));
- }
- }
-
- return commonCastTransforms(CI);
-}
-
+ Value *Src = CI.getOperand(0);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
+ // If casting the result of a getelementptr instruction with no offset, turn
+ // this into a cast of the original pointer!
+ if (GEP->hasAllZeroIndices() &&
+        // If CI is an addrspacecast and GEP changes the pointer type, merging
+ // GEP into CI would undo canonicalizing addrspacecast with different
+ // pointer types, causing infinite loops.
+ (!isa<AddrSpaceCastInst>(CI) ||
+ GEP->getType() == GEP->getPointerOperandType())) {
+ // Changing the cast operand is usually not a good idea but it is safe
+ // here because the pointer operand is being replaced with another
+ // pointer operand so the opcode doesn't need to change.
+ return replaceOperand(CI, 0, GEP->getOperand(0));
+ }
+ }
+
+ return commonCastTransforms(CI);
+}
+
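A small illustration (hypothetical IR, assuming 64-bit pointers) of looking through an all-zero-index GEP, here under a ptrtoint:

  ; sketch only
  define i64 @p2i_gep([4 x i32]* %p) {
    %g = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 0
    %i = ptrtoint i32* %g to i64
    ret i64 %i
    ; becomes: %i = ptrtoint [4 x i32]* %p to i64
  }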
Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
- // If the destination integer type is not the intptr_t type for this target,
- // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
- // to be exposed to other transforms.
+ // If the destination integer type is not the intptr_t type for this target,
+ // do a ptrtoint to intptr_t then do a trunc or zext. This allows the cast
+ // to be exposed to other transforms.
Value *SrcOp = CI.getPointerOperand();
- Type *Ty = CI.getType();
- unsigned AS = CI.getPointerAddressSpace();
+ Type *Ty = CI.getType();
+ unsigned AS = CI.getPointerAddressSpace();
unsigned TySize = Ty->getScalarSizeInBits();
unsigned PtrSize = DL.getPointerSizeInBits(AS);
if (TySize != PtrSize) {
@@ -1979,11 +1979,11 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
// Handle vectors of pointers.
if (auto *VecTy = dyn_cast<VectorType>(Ty))
IntPtrTy = VectorType::get(IntPtrTy, VecTy->getElementCount());
-
+
Value *P = Builder.CreatePtrToInt(SrcOp, IntPtrTy);
return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
}
-
+
Value *Vec, *Scalar, *Index;
if (match(SrcOp, m_OneUse(m_InsertElt(m_IntToPtr(m_Value(Vec)),
m_Value(Scalar), m_Value(Index)))) &&
@@ -1993,745 +1993,745 @@ Instruction *InstCombinerImpl::visitPtrToInt(PtrToIntInst &CI) {
    // p2i (ins (i2p Vec), Scalar, Index) --> ins Vec, (p2i Scalar), Index
Value *NewCast = Builder.CreatePtrToInt(Scalar, Ty->getScalarType());
return InsertElementInst::Create(Vec, NewCast, Index);
- }
-
+ }
+
return commonPointerCastTransforms(CI);
-}
-
-/// This input value (which is known to have vector type) is being zero extended
-/// or truncated to the specified vector type. Since the zext/trunc is done
-/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, so
-/// endianness will impact which end of the vector is extended or
-/// truncated.
-///
-/// A vector is always stored with index 0 at the lowest address, which
-/// corresponds to the most significant bits for a big endian stored integer and
-/// the least significant bits for little endian. A trunc/zext of an integer
-/// impacts the big end of the integer. Thus, we need to add/remove elements at
-/// the front of the vector for big endian targets, and the back of the vector
-/// for little endian targets.
-///
-/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
-///
-/// The source and destination vector types may have different element types.
+}
+
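A hypothetical example of the width adjustment in visitPtrToInt above, again assuming 64-bit pointers:

  ; sketch only, assumes 64-bit pointers in the datalayout
  define i32 @p2i_narrow(i8* %p) {
    %i = ptrtoint i8* %p to i32
    ret i32 %i
    ; becomes: %w = ptrtoint i8* %p to i64
    ;          %i = trunc i64 %w to i32
  }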
+/// This input value (which is known to have vector type) is being zero extended
+/// or truncated to the specified vector type. Since the zext/trunc is done
+/// using an integer type, we have a (bitcast(cast(bitcast))) pattern, so
+/// endianness will impact which end of the vector is extended or
+/// truncated.
+///
+/// A vector is always stored with index 0 at the lowest address, which
+/// corresponds to the most significant bits for a big endian stored integer and
+/// the least significant bits for little endian. A trunc/zext of an integer
+/// impacts the big end of the integer. Thus, we need to add/remove elements at
+/// the front of the vector for big endian targets, and the back of the vector
+/// for little endian targets.
+///
+/// Try to replace it with a shuffle (and vector/vector bitcast) if possible.
+///
+/// The source and destination vector types may have different element types.
static Instruction *
optimizeVectorResizeWithIntegerBitCasts(Value *InVal, VectorType *DestTy,
InstCombinerImpl &IC) {
- // We can only do this optimization if the output is a multiple of the input
- // element size, or the input is a multiple of the output element size.
- // Convert the input type to have the same element type as the output.
- VectorType *SrcTy = cast<VectorType>(InVal->getType());
-
- if (SrcTy->getElementType() != DestTy->getElementType()) {
- // The input types don't need to be identical, but for now they must be the
- // same size. There is no specific reason we couldn't handle things like
- // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
- // there yet.
- if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
- DestTy->getElementType()->getPrimitiveSizeInBits())
- return nullptr;
-
- SrcTy =
+ // We can only do this optimization if the output is a multiple of the input
+ // element size, or the input is a multiple of the output element size.
+ // Convert the input type to have the same element type as the output.
+ VectorType *SrcTy = cast<VectorType>(InVal->getType());
+
+ if (SrcTy->getElementType() != DestTy->getElementType()) {
+ // The input types don't need to be identical, but for now they must be the
+ // same size. There is no specific reason we couldn't handle things like
+ // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten
+ // there yet.
+ if (SrcTy->getElementType()->getPrimitiveSizeInBits() !=
+ DestTy->getElementType()->getPrimitiveSizeInBits())
+ return nullptr;
+
+ SrcTy =
FixedVectorType::get(DestTy->getElementType(),
cast<FixedVectorType>(SrcTy)->getNumElements());
- InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
- }
-
- bool IsBigEndian = IC.getDataLayout().isBigEndian();
+ InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
+ }
+
+ bool IsBigEndian = IC.getDataLayout().isBigEndian();
unsigned SrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
unsigned DestElts = cast<FixedVectorType>(DestTy)->getNumElements();
-
- assert(SrcElts != DestElts && "Element counts should be different.");
-
- // Now that the element types match, get the shuffle mask and RHS of the
- // shuffle to use, which depends on whether we're increasing or decreasing the
- // size of the input.
- SmallVector<int, 16> ShuffleMaskStorage;
- ArrayRef<int> ShuffleMask;
- Value *V2;
-
-  // Produce an identity shuffle mask for the src vector.
- ShuffleMaskStorage.resize(SrcElts);
- std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0);
-
- if (SrcElts > DestElts) {
- // If we're shrinking the number of elements (rewriting an integer
- // truncate), just shuffle in the elements corresponding to the least
- // significant bits from the input and use undef as the second shuffle
- // input.
- V2 = UndefValue::get(SrcTy);
- // Make sure the shuffle mask selects the "least significant bits" by
- // keeping elements from back of the src vector for big endian, and from the
- // front for little endian.
- ShuffleMask = ShuffleMaskStorage;
- if (IsBigEndian)
- ShuffleMask = ShuffleMask.take_back(DestElts);
- else
- ShuffleMask = ShuffleMask.take_front(DestElts);
- } else {
- // If we're increasing the number of elements (rewriting an integer zext),
- // shuffle in all of the elements from InVal. Fill the rest of the result
- // elements with zeros from a constant zero.
- V2 = Constant::getNullValue(SrcTy);
- // Use first elt from V2 when indicating zero in the shuffle mask.
- uint32_t NullElt = SrcElts;
- // Extend with null values in the "most significant bits" by adding elements
- // in front of the src vector for big endian, and at the back for little
- // endian.
- unsigned DeltaElts = DestElts - SrcElts;
- if (IsBigEndian)
- ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt);
- else
- ShuffleMaskStorage.append(DeltaElts, NullElt);
- ShuffleMask = ShuffleMaskStorage;
- }
-
- return new ShuffleVectorInst(InVal, V2, ShuffleMask);
-}
-
-static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
- return Value % Ty->getPrimitiveSizeInBits() == 0;
-}
-
-static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
- return Value / Ty->getPrimitiveSizeInBits();
-}
-
-/// V is a value which is inserted into a vector of VecEltTy.
-/// Look through the value to see if we can decompose it into
-/// insertions into the vector. See the example in the comment for
-/// OptimizeIntegerToVectorInsertions for the pattern this handles.
-/// The type of V is always a non-zero multiple of VecEltTy's size.
-/// Shift is the number of bits between the lsb of V and the lsb of
-/// the vector.
-///
-/// This returns false if the pattern can't be matched or true if it can,
-/// filling in Elements with the elements found here.
-static bool collectInsertionElements(Value *V, unsigned Shift,
- SmallVectorImpl<Value *> &Elements,
- Type *VecEltTy, bool isBigEndian) {
- assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
- "Shift should be a multiple of the element type size");
-
- // Undef values never contribute useful bits to the result.
- if (isa<UndefValue>(V)) return true;
-
-  // If we got down to a value of the right type, we win; try inserting it
-  // into the right element.
- if (V->getType() == VecEltTy) {
- // Inserting null doesn't actually insert any elements.
- if (Constant *C = dyn_cast<Constant>(V))
- if (C->isNullValue())
- return true;
-
- unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
- if (isBigEndian)
- ElementIndex = Elements.size() - ElementIndex - 1;
-
- // Fail if multiple elements are inserted into this slot.
- if (Elements[ElementIndex])
- return false;
-
- Elements[ElementIndex] = V;
- return true;
- }
-
- if (Constant *C = dyn_cast<Constant>(V)) {
- // Figure out the # elements this provides, and bitcast it or slice it up
- // as required.
- unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
- VecEltTy);
- // If the constant is the size of a vector element, we just need to bitcast
- // it to the right type so it gets properly inserted.
- if (NumElts == 1)
- return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
- Shift, Elements, VecEltTy, isBigEndian);
-
- // Okay, this is a constant that covers multiple elements. Slice it up into
- // pieces and insert each element-sized piece into the vector.
- if (!isa<IntegerType>(C->getType()))
- C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
- C->getType()->getPrimitiveSizeInBits()));
- unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
- Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
-
- for (unsigned i = 0; i != NumElts; ++i) {
- unsigned ShiftI = Shift+i*ElementSize;
- Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
- ShiftI));
- Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
- if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
- isBigEndian))
- return false;
- }
- return true;
- }
-
- if (!V->hasOneUse()) return false;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
- switch (I->getOpcode()) {
- default: return false; // Unhandled case.
- case Instruction::BitCast:
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::ZExt:
- if (!isMultipleOfTypeSize(
- I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
- VecEltTy))
- return false;
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::Or:
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian) &&
- collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
- isBigEndian);
- case Instruction::Shl: {
- // Must be shifting by a constant that is a multiple of the element size.
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
- if (!CI) return false;
- Shift += CI->getZExtValue();
- if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
- return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
- isBigEndian);
- }
-
- }
-}
-
-
-/// If the input is an 'or' instruction, we may be doing shifts and ors to
-/// assemble the elements of the vector manually.
-/// Try to rip the code out and replace it with insertelements. This is to
-/// optimize code like this:
-///
-/// %tmp37 = bitcast float %inc to i32
-/// %tmp38 = zext i32 %tmp37 to i64
-/// %tmp31 = bitcast float %inc5 to i32
-/// %tmp32 = zext i32 %tmp31 to i64
-/// %tmp33 = shl i64 %tmp32, 32
-/// %ins35 = or i64 %tmp33, %tmp38
-/// %tmp43 = bitcast i64 %ins35 to <2 x float>
-///
-/// Into two insertelements that do "buildvector{%inc, %inc5}".
-static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
+
+ assert(SrcElts != DestElts && "Element counts should be different.");
+
+ // Now that the element types match, get the shuffle mask and RHS of the
+ // shuffle to use, which depends on whether we're increasing or decreasing the
+ // size of the input.
+ SmallVector<int, 16> ShuffleMaskStorage;
+ ArrayRef<int> ShuffleMask;
+ Value *V2;
+
+  // Produce an identity shuffle mask for the src vector.
+ ShuffleMaskStorage.resize(SrcElts);
+ std::iota(ShuffleMaskStorage.begin(), ShuffleMaskStorage.end(), 0);
+
+ if (SrcElts > DestElts) {
+ // If we're shrinking the number of elements (rewriting an integer
+ // truncate), just shuffle in the elements corresponding to the least
+ // significant bits from the input and use undef as the second shuffle
+ // input.
+ V2 = UndefValue::get(SrcTy);
+ // Make sure the shuffle mask selects the "least significant bits" by
+ // keeping elements from back of the src vector for big endian, and from the
+ // front for little endian.
+ ShuffleMask = ShuffleMaskStorage;
+ if (IsBigEndian)
+ ShuffleMask = ShuffleMask.take_back(DestElts);
+ else
+ ShuffleMask = ShuffleMask.take_front(DestElts);
+ } else {
+ // If we're increasing the number of elements (rewriting an integer zext),
+ // shuffle in all of the elements from InVal. Fill the rest of the result
+ // elements with zeros from a constant zero.
+ V2 = Constant::getNullValue(SrcTy);
+ // Use first elt from V2 when indicating zero in the shuffle mask.
+ uint32_t NullElt = SrcElts;
+ // Extend with null values in the "most significant bits" by adding elements
+ // in front of the src vector for big endian, and at the back for little
+ // endian.
+ unsigned DeltaElts = DestElts - SrcElts;
+ if (IsBigEndian)
+ ShuffleMaskStorage.insert(ShuffleMaskStorage.begin(), DeltaElts, NullElt);
+ else
+ ShuffleMaskStorage.append(DeltaElts, NullElt);
+ ShuffleMask = ShuffleMaskStorage;
+ }
+
+ return new ShuffleVectorInst(InVal, V2, ShuffleMask);
+}
+
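An illustrative sketch of the resize-as-shuffle rewrite, assuming a little-endian target (a big-endian target would keep the high elements instead):

  ; sketch only, little-endian layout assumed
  define <2 x i32> @resize(<4 x i32> %v) {
    %b = bitcast <4 x i32> %v to i128
    %t = trunc i128 %b to i64
    %r = bitcast i64 %t to <2 x i32>
    ret <2 x i32> %r
    ; becomes: %r = shufflevector <4 x i32> %v, <4 x i32> undef,
    ;                             <2 x i32> <i32 0, i32 1>
  }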
+static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
+ return Value % Ty->getPrimitiveSizeInBits() == 0;
+}
+
+static unsigned getTypeSizeIndex(unsigned Value, Type *Ty) {
+ return Value / Ty->getPrimitiveSizeInBits();
+}
+
+/// V is a value which is inserted into a vector of VecEltTy.
+/// Look through the value to see if we can decompose it into
+/// insertions into the vector. See the example in the comment for
+/// OptimizeIntegerToVectorInsertions for the pattern this handles.
+/// The type of V is always a non-zero multiple of VecEltTy's size.
+/// Shift is the number of bits between the lsb of V and the lsb of
+/// the vector.
+///
+/// This returns false if the pattern can't be matched or true if it can,
+/// filling in Elements with the elements found here.
+static bool collectInsertionElements(Value *V, unsigned Shift,
+ SmallVectorImpl<Value *> &Elements,
+ Type *VecEltTy, bool isBigEndian) {
+ assert(isMultipleOfTypeSize(Shift, VecEltTy) &&
+ "Shift should be a multiple of the element type size");
+
+ // Undef values never contribute useful bits to the result.
+ if (isa<UndefValue>(V)) return true;
+
+  // If we got down to a value of the right type, we win; try inserting it
+  // into the right element.
+ if (V->getType() == VecEltTy) {
+ // Inserting null doesn't actually insert any elements.
+ if (Constant *C = dyn_cast<Constant>(V))
+ if (C->isNullValue())
+ return true;
+
+ unsigned ElementIndex = getTypeSizeIndex(Shift, VecEltTy);
+ if (isBigEndian)
+ ElementIndex = Elements.size() - ElementIndex - 1;
+
+ // Fail if multiple elements are inserted into this slot.
+ if (Elements[ElementIndex])
+ return false;
+
+ Elements[ElementIndex] = V;
+ return true;
+ }
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ // Figure out the # elements this provides, and bitcast it or slice it up
+ // as required.
+ unsigned NumElts = getTypeSizeIndex(C->getType()->getPrimitiveSizeInBits(),
+ VecEltTy);
+ // If the constant is the size of a vector element, we just need to bitcast
+ // it to the right type so it gets properly inserted.
+ if (NumElts == 1)
+ return collectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy),
+ Shift, Elements, VecEltTy, isBigEndian);
+
+ // Okay, this is a constant that covers multiple elements. Slice it up into
+ // pieces and insert each element-sized piece into the vector.
+ if (!isa<IntegerType>(C->getType()))
+ C = ConstantExpr::getBitCast(C, IntegerType::get(V->getContext(),
+ C->getType()->getPrimitiveSizeInBits()));
+ unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits();
+ Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize);
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ unsigned ShiftI = Shift+i*ElementSize;
+ Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(),
+ ShiftI));
+ Piece = ConstantExpr::getTrunc(Piece, ElementIntTy);
+ if (!collectInsertionElements(Piece, ShiftI, Elements, VecEltTy,
+ isBigEndian))
+ return false;
+ }
+ return true;
+ }
+
+ if (!V->hasOneUse()) return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+ switch (I->getOpcode()) {
+ default: return false; // Unhandled case.
+ case Instruction::BitCast:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::ZExt:
+ if (!isMultipleOfTypeSize(
+ I->getOperand(0)->getType()->getPrimitiveSizeInBits(),
+ VecEltTy))
+ return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Or:
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian) &&
+ collectInsertionElements(I->getOperand(1), Shift, Elements, VecEltTy,
+ isBigEndian);
+ case Instruction::Shl: {
+ // Must be shifting by a constant that is a multiple of the element size.
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!CI) return false;
+ Shift += CI->getZExtValue();
+ if (!isMultipleOfTypeSize(Shift, VecEltTy)) return false;
+ return collectInsertionElements(I->getOperand(0), Shift, Elements, VecEltTy,
+ isBigEndian);
+ }
+
+ }
+}
+
+
+/// If the input is an 'or' instruction, we may be doing shifts and ors to
+/// assemble the elements of the vector manually.
+/// Try to rip the code out and replace it with insertelements. This is to
+/// optimize code like this:
+///
+/// %tmp37 = bitcast float %inc to i32
+/// %tmp38 = zext i32 %tmp37 to i64
+/// %tmp31 = bitcast float %inc5 to i32
+/// %tmp32 = zext i32 %tmp31 to i64
+/// %tmp33 = shl i64 %tmp32, 32
+/// %ins35 = or i64 %tmp33, %tmp38
+/// %tmp43 = bitcast i64 %ins35 to <2 x float>
+///
+/// Into two insertelements that do "buildvector{%inc, %inc5}".
+static Value *optimizeIntegerToVectorInsertions(BitCastInst &CI,
InstCombinerImpl &IC) {
auto *DestVecTy = cast<FixedVectorType>(CI.getType());
- Value *IntInput = CI.getOperand(0);
-
- SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
- if (!collectInsertionElements(IntInput, 0, Elements,
- DestVecTy->getElementType(),
- IC.getDataLayout().isBigEndian()))
- return nullptr;
-
-  // If we succeeded, we know that all of the elements are specified by Elements
- // or are zero if Elements has a null entry. Recast this as a set of
- // insertions.
- Value *Result = Constant::getNullValue(CI.getType());
- for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
- if (!Elements[i]) continue; // Unset element.
-
- Result = IC.Builder.CreateInsertElement(Result, Elements[i],
- IC.Builder.getInt32(i));
- }
-
- return Result;
-}
-
-/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
-/// vector followed by extract element. The backend tends to handle bitcasts of
-/// vectors better than bitcasts of scalars because vector registers are
-/// usually not type-specific like scalar integer or scalar floating-point.
-static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
+ Value *IntInput = CI.getOperand(0);
+
+ SmallVector<Value*, 8> Elements(DestVecTy->getNumElements());
+ if (!collectInsertionElements(IntInput, 0, Elements,
+ DestVecTy->getElementType(),
+ IC.getDataLayout().isBigEndian()))
+ return nullptr;
+
+  // If we succeeded, we know that all of the elements are specified by Elements
+ // or are zero if Elements has a null entry. Recast this as a set of
+ // insertions.
+ Value *Result = Constant::getNullValue(CI.getType());
+ for (unsigned i = 0, e = Elements.size(); i != e; ++i) {
+ if (!Elements[i]) continue; // Unset element.
+
+ Result = IC.Builder.CreateInsertElement(Result, Elements[i],
+ IC.Builder.getInt32(i));
+ }
+
+ return Result;
+}
+
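For the i64-assembled-from-two-floats pattern shown in the function's doc comment above, the end result on a little-endian target would look roughly like this (sketch only, using the comment's %inc and %inc5 names):

  ; zeroinitializer seeds the result; unset slots stay zero
  %r0 = insertelement <2 x float> zeroinitializer, float %inc, i32 0
  %r  = insertelement <2 x float> %r0, float %inc5, i32 1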
+/// Canonicalize scalar bitcasts of extracted elements into a bitcast of the
+/// vector followed by extract element. The backend tends to handle bitcasts of
+/// vectors better than bitcasts of scalars because vector registers are
+/// usually not type-specific like scalar integer or scalar floating-point.
+static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
InstCombinerImpl &IC) {
- // TODO: Create and use a pattern matcher for ExtractElementInst.
- auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
- if (!ExtElt || !ExtElt->hasOneUse())
- return nullptr;
-
- // The bitcast must be to a vectorizable type, otherwise we can't make a new
- // type to extract from.
- Type *DestType = BitCast.getType();
- if (!VectorType::isValidElementType(DestType))
- return nullptr;
-
+ // TODO: Create and use a pattern matcher for ExtractElementInst.
+ auto *ExtElt = dyn_cast<ExtractElementInst>(BitCast.getOperand(0));
+ if (!ExtElt || !ExtElt->hasOneUse())
+ return nullptr;
+
+ // The bitcast must be to a vectorizable type, otherwise we can't make a new
+ // type to extract from.
+ Type *DestType = BitCast.getType();
+ if (!VectorType::isValidElementType(DestType))
+ return nullptr;
+
auto *NewVecType = VectorType::get(DestType, ExtElt->getVectorOperandType());
- auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
- NewVecType, "bc");
- return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
-}
-
-/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
-static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
- InstCombiner::BuilderTy &Builder) {
- Type *DestTy = BitCast.getType();
- BinaryOperator *BO;
- if (!DestTy->isIntOrIntVectorTy() ||
- !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
- !BO->isBitwiseLogicOp())
- return nullptr;
-
- // FIXME: This transform is restricted to vector types to avoid backend
- // problems caused by creating potentially illegal operations. If a fix-up is
- // added to handle that situation, we can remove this check.
- if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
- return nullptr;
-
- Value *X;
- if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
- X->getType() == DestTy && !isa<Constant>(X)) {
- // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
- Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
- return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
- }
-
- if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
- X->getType() == DestTy && !isa<Constant>(X)) {
- // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
- Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
- return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
- }
-
- // Canonicalize vector bitcasts to come before vector bitwise logic with a
- // constant. This eases recognition of special constants for later ops.
- // Example:
- // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
- Constant *C;
- if (match(BO->getOperand(1), m_Constant(C))) {
- // bitcast (logic X, C) --> logic (bitcast X, C')
- Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
- Value *CastedC = Builder.CreateBitCast(C, DestTy);
- return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
- }
-
- return nullptr;
-}
-
-/// Change the type of a select if we can eliminate a bitcast.
-static Instruction *foldBitCastSelect(BitCastInst &BitCast,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond, *TVal, *FVal;
- if (!match(BitCast.getOperand(0),
- m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
- return nullptr;
-
- // A vector select must maintain the same number of elements in its operands.
- Type *CondTy = Cond->getType();
- Type *DestTy = BitCast.getType();
+ auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
+ NewVecType, "bc");
+ return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
+}
+
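A hypothetical before/after for the canonicalization above:

  ; sketch only
  define float @bc_extelt(<4 x i32> %v) {
    %e = extractelement <4 x i32> %v, i32 1
    %f = bitcast i32 %e to float
    ret float %f
    ; becomes: %bc = bitcast <4 x i32> %v to <4 x float>
    ;          %f  = extractelement <4 x float> %bc, i32 1
  }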
+/// Change the type of a bitwise logic operation if we can eliminate a bitcast.
+static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Type *DestTy = BitCast.getType();
+ BinaryOperator *BO;
+ if (!DestTy->isIntOrIntVectorTy() ||
+ !match(BitCast.getOperand(0), m_OneUse(m_BinOp(BO))) ||
+ !BO->isBitwiseLogicOp())
+ return nullptr;
+
+ // FIXME: This transform is restricted to vector types to avoid backend
+ // problems caused by creating potentially illegal operations. If a fix-up is
+ // added to handle that situation, we can remove this check.
+ if (!DestTy->isVectorTy() || !BO->getType()->isVectorTy())
+ return nullptr;
+
+ Value *X;
+ if (match(BO->getOperand(0), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
+ Value *CastedOp1 = Builder.CreateBitCast(BO->getOperand(1), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), X, CastedOp1);
+ }
+
+ if (match(BO->getOperand(1), m_OneUse(m_BitCast(m_Value(X)))) &&
+ X->getType() == DestTy && !isa<Constant>(X)) {
+ // bitcast(logic(Y, bitcast(X))) --> logic'(bitcast(Y), X)
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, X);
+ }
+
+ // Canonicalize vector bitcasts to come before vector bitwise logic with a
+ // constant. This eases recognition of special constants for later ops.
+ // Example:
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ Constant *C;
+ if (match(BO->getOperand(1), m_Constant(C))) {
+ // bitcast (logic X, C) --> logic (bitcast X, C')
+ Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
+ Value *CastedC = Builder.CreateBitCast(C, DestTy);
+ return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
+ }
+
+ return nullptr;
+}
+
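A hypothetical instance of the bitcast-through-logic rewrite (vector types only, per the FIXME above):

  ; sketch only
  define <2 x i64> @bc_logic(<2 x i64> %x, <4 x i32> %y) {
    %bx = bitcast <2 x i64> %x to <4 x i32>
    %a  = and <4 x i32> %bx, %y
    %r  = bitcast <4 x i32> %a to <2 x i64>
    ret <2 x i64> %r
    ; becomes: %by = bitcast <4 x i32> %y to <2 x i64>
    ;          %r  = and <2 x i64> %x, %by
  }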
+/// Change the type of a select if we can eliminate a bitcast.
+static Instruction *foldBitCastSelect(BitCastInst &BitCast,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *TVal, *FVal;
+ if (!match(BitCast.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return nullptr;
+
+ // A vector select must maintain the same number of elements in its operands.
+ Type *CondTy = Cond->getType();
+ Type *DestTy = BitCast.getType();
if (auto *CondVTy = dyn_cast<VectorType>(CondTy))
if (!DestTy->isVectorTy() ||
CondVTy->getElementCount() !=
cast<VectorType>(DestTy)->getElementCount())
- return nullptr;
-
- // FIXME: This transform is restricted from changing the select between
- // scalars and vectors to avoid backend problems caused by creating
- // potentially illegal operations. If a fix-up is added to handle that
- // situation, we can remove this check.
- if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
- return nullptr;
-
- auto *Sel = cast<Instruction>(BitCast.getOperand(0));
- Value *X;
- if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
- !isa<Constant>(X)) {
- // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
- Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
- return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
- }
-
- if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
- !isa<Constant>(X)) {
- // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
- Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
- return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
- }
-
- return nullptr;
-}
-
-/// Check if all users of CI are StoreInsts.
-static bool hasStoreUsersOnly(CastInst &CI) {
- for (User *U : CI.users()) {
- if (!isa<StoreInst>(U))
- return false;
- }
- return true;
-}
-
-/// This function handles the following case
-///
-/// A -> B cast
-/// PHI
-/// B -> A cast
-///
-/// All the related PHI nodes can be replaced by new PHI nodes with type A.
-/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
+ return nullptr;
+
+ // FIXME: This transform is restricted from changing the select between
+ // scalars and vectors to avoid backend problems caused by creating
+ // potentially illegal operations. If a fix-up is added to handle that
+ // situation, we can remove this check.
+ if (DestTy->isVectorTy() != TVal->getType()->isVectorTy())
+ return nullptr;
+
+ auto *Sel = cast<Instruction>(BitCast.getOperand(0));
+ Value *X;
+ if (match(TVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, bitcast(X), Y)) --> select'(Cond, X, bitcast(Y))
+ Value *CastedVal = Builder.CreateBitCast(FVal, DestTy);
+ return SelectInst::Create(Cond, X, CastedVal, "", nullptr, Sel);
+ }
+
+ if (match(FVal, m_OneUse(m_BitCast(m_Value(X)))) && X->getType() == DestTy &&
+ !isa<Constant>(X)) {
+ // bitcast(select(Cond, Y, bitcast(X))) --> select'(Cond, bitcast(Y), X)
+ Value *CastedVal = Builder.CreateBitCast(TVal, DestTy);
+ return SelectInst::Create(Cond, CastedVal, X, "", nullptr, Sel);
+ }
+
+ return nullptr;
+}
+
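A hypothetical instance of the select retyping above:

  ; sketch only
  define <2 x i64> @bc_select(i1 %c, <2 x i64> %x, <4 x i32> %y) {
    %bx = bitcast <2 x i64> %x to <4 x i32>
    %s  = select i1 %c, <4 x i32> %bx, <4 x i32> %y
    %r  = bitcast <4 x i32> %s to <2 x i64>
    ret <2 x i64> %r
    ; becomes: %by = bitcast <4 x i32> %y to <2 x i64>
    ;          %r  = select i1 %c, <2 x i64> %x, <2 x i64> %by
  }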
+/// Check if all users of CI are StoreInsts.
+static bool hasStoreUsersOnly(CastInst &CI) {
+ for (User *U : CI.users()) {
+ if (!isa<StoreInst>(U))
+ return false;
+ }
+ return true;
+}
+
+/// This function handles the following case
+///
+/// A -> B cast
+/// PHI
+/// B -> A cast
+///
+/// All the related PHI nodes can be replaced by new PHI nodes with type A.
+/// The uses of \p CI can be changed to the new PHI node corresponding to \p PN.
Instruction *InstCombinerImpl::optimizeBitCastFromPhi(CastInst &CI,
PHINode *PN) {
- // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
- if (hasStoreUsersOnly(CI))
- return nullptr;
-
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType(); // Type B
- Type *DestTy = CI.getType(); // Type A
-
- SmallVector<PHINode *, 4> PhiWorklist;
- SmallSetVector<PHINode *, 4> OldPhiNodes;
-
- // Find all of the A->B casts and PHI nodes.
- // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
- // OldPhiNodes is used to track all known PHI nodes, before adding a new
- // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
- PhiWorklist.push_back(PN);
- OldPhiNodes.insert(PN);
- while (!PhiWorklist.empty()) {
- auto *OldPN = PhiWorklist.pop_back_val();
- for (Value *IncValue : OldPN->incoming_values()) {
- if (isa<Constant>(IncValue))
- continue;
-
- if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
-        // If there is a sequence of one or more load instructions, where each
-        // loaded value is used as the address of a later load, a bitcast is
-        // necessary to change the value type, so don't optimize it. For
-        // simplicity we give up if the load address comes from another load.
- Value *Addr = LI->getOperand(0);
- if (Addr == &CI || isa<LoadInst>(Addr))
- return nullptr;
- if (LI->hasOneUse() && LI->isSimple())
- continue;
- // If a LoadInst has more than one use, changing the type of loaded
- // value may create another bitcast.
- return nullptr;
- }
-
- if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
- if (OldPhiNodes.insert(PNode))
- PhiWorklist.push_back(PNode);
- continue;
- }
-
- auto *BCI = dyn_cast<BitCastInst>(IncValue);
- // We can't handle other instructions.
- if (!BCI)
- return nullptr;
-
-      // Verify it's an A->B cast.
- Type *TyA = BCI->getOperand(0)->getType();
- Type *TyB = BCI->getType();
- if (TyA != DestTy || TyB != SrcTy)
- return nullptr;
- }
- }
-
- // Check that each user of each old PHI node is something that we can
- // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
- for (auto *OldPN : OldPhiNodes) {
- for (User *V : OldPN->users()) {
- if (auto *SI = dyn_cast<StoreInst>(V)) {
- if (!SI->isSimple() || SI->getOperand(0) != OldPN)
- return nullptr;
- } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- // Verify it's a B->A cast.
- Type *TyB = BCI->getOperand(0)->getType();
- Type *TyA = BCI->getType();
- if (TyA != DestTy || TyB != SrcTy)
- return nullptr;
- } else if (auto *PHI = dyn_cast<PHINode>(V)) {
- // As long as the user is another old PHI node, then even if we don't
- // rewrite it, the PHI web we're considering won't have any users
- // outside itself, so it'll be dead.
- if (OldPhiNodes.count(PHI) == 0)
- return nullptr;
- } else {
- return nullptr;
- }
- }
- }
-
- // For each old PHI node, create a corresponding new PHI node with a type A.
- SmallDenseMap<PHINode *, PHINode *> NewPNodes;
- for (auto *OldPN : OldPhiNodes) {
- Builder.SetInsertPoint(OldPN);
- PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
- NewPNodes[OldPN] = NewPN;
- }
-
- // Fill in the operands of new PHI nodes.
- for (auto *OldPN : OldPhiNodes) {
- PHINode *NewPN = NewPNodes[OldPN];
- for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
- Value *V = OldPN->getOperand(j);
- Value *NewV = nullptr;
- if (auto *C = dyn_cast<Constant>(V)) {
- NewV = ConstantExpr::getBitCast(C, DestTy);
- } else if (auto *LI = dyn_cast<LoadInst>(V)) {
- // Explicitly perform load combine to make sure no opposing transform
- // can remove the bitcast in the meantime and trigger an infinite loop.
- Builder.SetInsertPoint(LI);
- NewV = combineLoadToNewType(*LI, DestTy);
- // Remove the old load and its use in the old phi, which itself becomes
- // dead once the whole transform finishes.
- replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
- eraseInstFromFunction(*LI);
- } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- NewV = BCI->getOperand(0);
- } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
- NewV = NewPNodes[PrevPN];
- }
- assert(NewV);
- NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
- }
- }
-
-  // Traverse all accumulated PHI nodes and process their users,
-  // which are Stores and BitCasts. Without this processing,
- // NewPHI nodes could be replicated and could lead to extra
- // moves generated after DeSSA.
- // If there is a store with type B, change it to type A.
-
-
- // Replace users of BitCast B->A with NewPHI. These will help
-  // later to get rid of a closure formed by the OldPHI nodes.
- Instruction *RetVal = nullptr;
- for (auto *OldPN : OldPhiNodes) {
- PHINode *NewPN = NewPNodes[OldPN];
+ // BitCast used by Store can be handled in InstCombineLoadStoreAlloca.cpp.
+ if (hasStoreUsersOnly(CI))
+ return nullptr;
+
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType(); // Type B
+ Type *DestTy = CI.getType(); // Type A
+
+ SmallVector<PHINode *, 4> PhiWorklist;
+ SmallSetVector<PHINode *, 4> OldPhiNodes;
+
+ // Find all of the A->B casts and PHI nodes.
+ // We need to inspect all related PHI nodes, but PHIs can be cyclic, so
+ // OldPhiNodes is used to track all known PHI nodes, before adding a new
+ // PHI to PhiWorklist, it is checked against and added to OldPhiNodes first.
+ PhiWorklist.push_back(PN);
+ OldPhiNodes.insert(PN);
+ while (!PhiWorklist.empty()) {
+ auto *OldPN = PhiWorklist.pop_back_val();
+ for (Value *IncValue : OldPN->incoming_values()) {
+ if (isa<Constant>(IncValue))
+ continue;
+
+ if (auto *LI = dyn_cast<LoadInst>(IncValue)) {
+        // If there is a sequence of one or more load instructions, where each
+        // loaded value is used as the address of a later load, a bitcast is
+        // necessary to change the value type, so don't optimize it. For
+        // simplicity we give up if the load address comes from another load.
+ Value *Addr = LI->getOperand(0);
+ if (Addr == &CI || isa<LoadInst>(Addr))
+ return nullptr;
+ if (LI->hasOneUse() && LI->isSimple())
+ continue;
+ // If a LoadInst has more than one use, changing the type of loaded
+ // value may create another bitcast.
+ return nullptr;
+ }
+
+ if (auto *PNode = dyn_cast<PHINode>(IncValue)) {
+ if (OldPhiNodes.insert(PNode))
+ PhiWorklist.push_back(PNode);
+ continue;
+ }
+
+ auto *BCI = dyn_cast<BitCastInst>(IncValue);
+ // We can't handle other instructions.
+ if (!BCI)
+ return nullptr;
+
+      // Verify it's an A->B cast.
+ Type *TyA = BCI->getOperand(0)->getType();
+ Type *TyB = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ }
+ }
+
+ // Check that each user of each old PHI node is something that we can
+ // rewrite, so that all of the old PHI nodes can be cleaned up afterwards.
+ for (auto *OldPN : OldPhiNodes) {
+ for (User *V : OldPN->users()) {
+ if (auto *SI = dyn_cast<StoreInst>(V)) {
+ if (!SI->isSimple() || SI->getOperand(0) != OldPN)
+ return nullptr;
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ // Verify it's a B->A cast.
+ Type *TyB = BCI->getOperand(0)->getType();
+ Type *TyA = BCI->getType();
+ if (TyA != DestTy || TyB != SrcTy)
+ return nullptr;
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+ // As long as the user is another old PHI node, then even if we don't
+ // rewrite it, the PHI web we're considering won't have any users
+ // outside itself, so it'll be dead.
+ if (OldPhiNodes.count(PHI) == 0)
+ return nullptr;
+ } else {
+ return nullptr;
+ }
+ }
+ }
+
+ // For each old PHI node, create a corresponding new PHI node with a type A.
+ SmallDenseMap<PHINode *, PHINode *> NewPNodes;
+ for (auto *OldPN : OldPhiNodes) {
+ Builder.SetInsertPoint(OldPN);
+ PHINode *NewPN = Builder.CreatePHI(DestTy, OldPN->getNumOperands());
+ NewPNodes[OldPN] = NewPN;
+ }
+
+ // Fill in the operands of new PHI nodes.
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
+ for (unsigned j = 0, e = OldPN->getNumOperands(); j != e; ++j) {
+ Value *V = OldPN->getOperand(j);
+ Value *NewV = nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ NewV = ConstantExpr::getBitCast(C, DestTy);
+ } else if (auto *LI = dyn_cast<LoadInst>(V)) {
+ // Explicitly perform load combine to make sure no opposing transform
+ // can remove the bitcast in the meantime and trigger an infinite loop.
+ Builder.SetInsertPoint(LI);
+ NewV = combineLoadToNewType(*LI, DestTy);
+ // Remove the old load and its use in the old phi, which itself becomes
+ // dead once the whole transform finishes.
+ replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
+ eraseInstFromFunction(*LI);
+ } else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ NewV = BCI->getOperand(0);
+ } else if (auto *PrevPN = dyn_cast<PHINode>(V)) {
+ NewV = NewPNodes[PrevPN];
+ }
+ assert(NewV);
+ NewPN->addIncoming(NewV, OldPN->getIncomingBlock(j));
+ }
+ }
+
+  // Traverse all accumulated PHI nodes and process their users,
+  // which are Stores and BitCasts. Without this processing,
+ // NewPHI nodes could be replicated and could lead to extra
+ // moves generated after DeSSA.
+ // If there is a store with type B, change it to type A.
+
+
+ // Replace users of BitCast B->A with NewPHI. These will help
+  // later to get rid of a closure formed by the OldPHI nodes.
+ Instruction *RetVal = nullptr;
+ for (auto *OldPN : OldPhiNodes) {
+ PHINode *NewPN = NewPNodes[OldPN];
for (User *V : make_early_inc_range(OldPN->users())) {
- if (auto *SI = dyn_cast<StoreInst>(V)) {
- assert(SI->isSimple() && SI->getOperand(0) == OldPN);
- Builder.SetInsertPoint(SI);
- auto *NewBC =
- cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
- SI->setOperand(0, NewBC);
- Worklist.push(SI);
- assert(hasStoreUsersOnly(*NewBC));
- }
- else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
- Type *TyB = BCI->getOperand(0)->getType();
- Type *TyA = BCI->getType();
- assert(TyA == DestTy && TyB == SrcTy);
- (void) TyA;
- (void) TyB;
- Instruction *I = replaceInstUsesWith(*BCI, NewPN);
- if (BCI == &CI)
- RetVal = I;
- } else if (auto *PHI = dyn_cast<PHINode>(V)) {
+ if (auto *SI = dyn_cast<StoreInst>(V)) {
+ assert(SI->isSimple() && SI->getOperand(0) == OldPN);
+ Builder.SetInsertPoint(SI);
+ auto *NewBC =
+ cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
+ SI->setOperand(0, NewBC);
+ Worklist.push(SI);
+ assert(hasStoreUsersOnly(*NewBC));
+ }
+ else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
+ Type *TyB = BCI->getOperand(0)->getType();
+ Type *TyA = BCI->getType();
+ assert(TyA == DestTy && TyB == SrcTy);
+ (void) TyA;
+ (void) TyB;
+ Instruction *I = replaceInstUsesWith(*BCI, NewPN);
+ if (BCI == &CI)
+ RetVal = I;
+ } else if (auto *PHI = dyn_cast<PHINode>(V)) {
assert(OldPhiNodes.contains(PHI));
- (void) PHI;
- } else {
- llvm_unreachable("all uses should be handled");
- }
- }
- }
-
- return RetVal;
-}
-
+ (void) PHI;
+ } else {
+ llvm_unreachable("all uses should be handled");
+ }
+ }
+ }
+
+ return RetVal;
+}
+
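A small hypothetical A = float / B = i32 case of the PHI retyping above:

  ; sketch only
  define float @bc_phi(i1 %c, float %a, float %b) {
  entry:
    br i1 %c, label %bb1, label %bb2
  bb1:
    %a.i = bitcast float %a to i32
    br label %join
  bb2:
    %b.i = bitcast float %b to i32
    br label %join
  join:
    %p = phi i32 [ %a.i, %bb1 ], [ %b.i, %bb2 ]
    %r = bitcast i32 %p to float
    ret float %r
    ; becomes: %p.f = phi float [ %a, %bb1 ], [ %b, %bb2 ],
    ; with uses of %r replaced by %p.f
  }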
Instruction *InstCombinerImpl::visitBitCast(BitCastInst &CI) {
- // If the operands are integer typed then apply the integer transforms,
- // otherwise just apply the common ones.
- Value *Src = CI.getOperand(0);
- Type *SrcTy = Src->getType();
- Type *DestTy = CI.getType();
-
- // Get rid of casts from one type to the same type. These are useless and can
- // be replaced by the operand.
- if (DestTy == Src->getType())
- return replaceInstUsesWith(CI, Src);
-
- if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
- PointerType *SrcPTy = cast<PointerType>(SrcTy);
- PointerType *DstPTy = cast<PointerType>(DestTy);
- Type *DstElTy = DstPTy->getElementType();
- Type *SrcElTy = SrcPTy->getElementType();
-
- // Casting pointers between the same type, but with different address spaces
- // is an addrspace cast rather than a bitcast.
- if ((DstElTy == SrcElTy) &&
- (DstPTy->getAddressSpace() != SrcPTy->getAddressSpace()))
- return new AddrSpaceCastInst(Src, DestTy);
-
-    // If we are casting an alloca to a pointer to a type of the same
- // size, rewrite the allocation instruction to allocate the "right" type.
- // There is no need to modify malloc calls because it is their bitcast that
- // needs to be cleaned up.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
- if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
- return V;
-
- // When the type pointed to is not sized the cast cannot be
- // turned into a gep.
- Type *PointeeType =
- cast<PointerType>(Src->getType()->getScalarType())->getElementType();
- if (!PointeeType->isSized())
- return nullptr;
-
- // If the source and destination are pointers, and this cast is equivalent
- // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
- // This can enhance SROA and other transforms that want type-safe pointers.
- unsigned NumZeros = 0;
- while (SrcElTy && SrcElTy != DstElTy) {
- SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0);
- ++NumZeros;
- }
-
- // If we found a path from the src to dest, create the getelementptr now.
- if (SrcElTy == DstElTy) {
- SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
- GetElementPtrInst *GEP =
- GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
-
- // If the source pointer is dereferenceable, then assume it points to an
- // allocated object and apply "inbounds" to the GEP.
- bool CanBeNull;
- if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) {
- // In a non-default address space (not 0), a null pointer can not be
- // assumed inbounds, so ignore that case (dereferenceable_or_null).
- // The reason is that 'null' is not treated differently in these address
- // spaces, and we consequently ignore the 'gep inbounds' special case
- // for 'null' which allows 'inbounds' on 'null' if the indices are
- // zeros.
- if (SrcPTy->getAddressSpace() == 0 || !CanBeNull)
- GEP->setIsInBounds();
- }
- return GEP;
- }
- }
-
- if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
- // Beware: messing with this target-specific oddity may cause trouble.
- if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
- Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
- return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- }
-
- if (isa<IntegerType>(SrcTy)) {
- // If this is a cast from an integer to vector, check to see if the input
- // is a trunc or zext of a bitcast from vector. If so, we can replace all
- // the casts with a shuffle and (potentially) a bitcast.
- if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
- CastInst *SrcCast = cast<CastInst>(Src);
- if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
- if (isa<VectorType>(BCIn->getOperand(0)->getType()))
- if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts(
- BCIn->getOperand(0), cast<VectorType>(DestTy), *this))
- return I;
- }
-
- // If the input is an 'or' instruction, we may be doing shifts and ors to
- // assemble the elements of the vector manually. Try to rip the code out
- // and replace it with insertelements.
- if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
- return replaceInstUsesWith(CI, V);
- }
- }
-
- if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
- if (SrcVTy->getNumElements() == 1) {
- // If our destination is not a vector, then make this a straight
- // scalar-scalar cast.
- if (!DestTy->isVectorTy()) {
- Value *Elem =
- Builder.CreateExtractElement(Src,
- Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- return CastInst::Create(Instruction::BitCast, Elem, DestTy);
- }
-
- // Otherwise, see if our source is an insert. If so, then use the scalar
- // component directly:
- // bitcast (inselt <1 x elt> V, X, 0) to <n x m> --> bitcast X to <n x m>
- if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
- return new BitCastInst(InsElt->getOperand(1), DestTy);
- }
- }
-
- if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
- // Okay, we have (bitcast (shuffle ..)). Check to see if this is
- // a bitcast to a vector with the same # elts.
- Value *ShufOp0 = Shuf->getOperand(0);
- Value *ShufOp1 = Shuf->getOperand(1);
+ // If the operands are integer typed then apply the integer transforms,
+ // otherwise just apply the common ones.
+ Value *Src = CI.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *DestTy = CI.getType();
+
+ // Get rid of casts from one type to the same type. These are useless and can
+ // be replaced by the operand.
+ if (DestTy == Src->getType())
+ return replaceInstUsesWith(CI, Src);
+
+ if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
+ PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ PointerType *DstPTy = cast<PointerType>(DestTy);
+ Type *DstElTy = DstPTy->getElementType();
+ Type *SrcElTy = SrcPTy->getElementType();
+
+ // Casting pointers between the same type, but with different address spaces
+ // is an addrspace cast rather than a bitcast.
+ if ((DstElTy == SrcElTy) &&
+ (DstPTy->getAddressSpace() != SrcPTy->getAddressSpace()))
+ return new AddrSpaceCastInst(Src, DestTy);
+
+    // If we are casting an alloca to a pointer to a type of the same
+ // size, rewrite the allocation instruction to allocate the "right" type.
+ // There is no need to modify malloc calls because it is their bitcast that
+ // needs to be cleaned up.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Src))
+ if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
+ return V;
+
+ // When the type pointed to is not sized the cast cannot be
+ // turned into a gep.
+ Type *PointeeType =
+ cast<PointerType>(Src->getType()->getScalarType())->getElementType();
+ if (!PointeeType->isSized())
+ return nullptr;
+
+ // If the source and destination are pointers, and this cast is equivalent
+ // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
+ // This can enhance SROA and other transforms that want type-safe pointers.
+ unsigned NumZeros = 0;
+ while (SrcElTy && SrcElTy != DstElTy) {
+ SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0);
+ ++NumZeros;
+ }
+
+ // If we found a path from the src to dest, create the getelementptr now.
+ if (SrcElTy == DstElTy) {
+ SmallVector<Value *, 8> Idxs(NumZeros + 1, Builder.getInt32(0));
+ GetElementPtrInst *GEP =
+ GetElementPtrInst::Create(SrcPTy->getElementType(), Src, Idxs);
+
+ // If the source pointer is dereferenceable, then assume it points to an
+ // allocated object and apply "inbounds" to the GEP.
+ bool CanBeNull;
+ if (Src->getPointerDereferenceableBytes(DL, CanBeNull)) {
+ // In a non-default address space (not 0), a null pointer can not be
+ // assumed inbounds, so ignore that case (dereferenceable_or_null).
+ // The reason is that 'null' is not treated differently in these address
+ // spaces, and we consequently ignore the 'gep inbounds' special case
+ // for 'null' which allows 'inbounds' on 'null' if the indices are
+ // zeros.
+ if (SrcPTy->getAddressSpace() == 0 || !CanBeNull)
+ GEP->setIsInBounds();
+ }
+ return GEP;
+ }
+ }
+
+ if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
+ // Beware: messing with this target-specific oddity may cause trouble.
+ if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
+ Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
+ return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ }
+
+ if (isa<IntegerType>(SrcTy)) {
+ // If this is a cast from an integer to vector, check to see if the input
+ // is a trunc or zext of a bitcast from vector. If so, we can replace all
+ // the casts with a shuffle and (potentially) a bitcast.
+ if (isa<TruncInst>(Src) || isa<ZExtInst>(Src)) {
+ CastInst *SrcCast = cast<CastInst>(Src);
+ if (BitCastInst *BCIn = dyn_cast<BitCastInst>(SrcCast->getOperand(0)))
+ if (isa<VectorType>(BCIn->getOperand(0)->getType()))
+ if (Instruction *I = optimizeVectorResizeWithIntegerBitCasts(
+ BCIn->getOperand(0), cast<VectorType>(DestTy), *this))
+ return I;
+ }
+
+ // If the input is an 'or' instruction, we may be doing shifts and ors to
+ // assemble the elements of the vector manually. Try to rip the code out
+ // and replace it with insertelements.
+ if (Value *V = optimizeIntegerToVectorInsertions(CI, *this))
+ return replaceInstUsesWith(CI, V);
+ }
+ }
+
+ if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
+ if (SrcVTy->getNumElements() == 1) {
+ // If our destination is not a vector, then make this a straight
+ // scalar-scalar cast.
+ if (!DestTy->isVectorTy()) {
+ Value *Elem =
+ Builder.CreateExtractElement(Src,
+ Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
+ return CastInst::Create(Instruction::BitCast, Elem, DestTy);
+ }
+
+ // Otherwise, see if our source is an insert. If so, then use the scalar
+ // component directly:
+ // bitcast (inselt <1 x elt> V, X, 0) to <n x m> --> bitcast X to <n x m>
+ if (auto *InsElt = dyn_cast<InsertElementInst>(Src))
+ return new BitCastInst(InsElt->getOperand(1), DestTy);
+ }
+ }
+
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(Src)) {
+ // Okay, we have (bitcast (shuffle ..)). Check to see if this is
+ // a bitcast to a vector with the same # elts.
+ Value *ShufOp0 = Shuf->getOperand(0);
+ Value *ShufOp1 = Shuf->getOperand(1);
auto ShufElts = cast<VectorType>(Shuf->getType())->getElementCount();
auto SrcVecElts = cast<VectorType>(ShufOp0->getType())->getElementCount();
- if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
+ if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
cast<VectorType>(DestTy)->getElementCount() == ShufElts &&
ShufElts == SrcVecElts) {
- BitCastInst *Tmp;
- // If either of the operands is a cast from CI.getType(), then
- // evaluating the shuffle in the casted destination's type will allow
- // us to eliminate at least one cast.
- if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) &&
- Tmp->getOperand(0)->getType() == DestTy) ||
- ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) &&
- Tmp->getOperand(0)->getType() == DestTy)) {
- Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy);
- Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy);
- // Return a new shuffle vector. Use the same element ID's, as we
- // know the vector types match #elts.
- return new ShuffleVectorInst(LHS, RHS, Shuf->getShuffleMask());
- }
- }
-
- // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as
- // a byte-swap:
- // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X)
- // TODO: We should match the related pattern for bitreverse.
- if (DestTy->isIntegerTy() &&
- DL.isLegalInteger(DestTy->getScalarSizeInBits()) &&
+ BitCastInst *Tmp;
+ // If either of the operands is a cast from CI.getType(), then
+ // evaluating the shuffle in the casted destination's type will allow
+ // us to eliminate at least one cast.
+ if (((Tmp = dyn_cast<BitCastInst>(ShufOp0)) &&
+ Tmp->getOperand(0)->getType() == DestTy) ||
+ ((Tmp = dyn_cast<BitCastInst>(ShufOp1)) &&
+ Tmp->getOperand(0)->getType() == DestTy)) {
+ Value *LHS = Builder.CreateBitCast(ShufOp0, DestTy);
+ Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy);
+ // Return a new shuffle vector. Use the same element ID's, as we
+ // know the vector types match #elts.
+ return new ShuffleVectorInst(LHS, RHS, Shuf->getShuffleMask());
+ }
+ }
+
+ // A bitcasted-to-scalar and byte-reversing shuffle is better recognized as
+ // a byte-swap:
+ // bitcast <N x i8> (shuf X, undef, <N, N-1,...0>) --> bswap (bitcast X)
+ // TODO: We should match the related pattern for bitreverse.
+ if (DestTy->isIntegerTy() &&
+ DL.isLegalInteger(DestTy->getScalarSizeInBits()) &&
SrcTy->getScalarSizeInBits() == 8 &&
ShufElts.getKnownMinValue() % 2 == 0 && Shuf->hasOneUse() &&
Shuf->isReverse()) {
- assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask");
- assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op");
- Function *Bswap =
- Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy);
- Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy);
- return IntrinsicInst::Create(Bswap, { ScalarX });
- }
- }
-
- // Handle the A->B->A cast when there is an intervening PHI node.
- if (PHINode *PN = dyn_cast<PHINode>(Src))
- if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
- return I;
-
- if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
- return I;
-
- if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
- return I;
-
- if (Instruction *I = foldBitCastSelect(CI, Builder))
- return I;
-
- if (SrcTy->isPointerTy())
- return commonPointerCastTransforms(CI);
- return commonCastTransforms(CI);
-}
-
+ assert(ShufOp0->getType() == SrcTy && "Unexpected shuffle mask");
+ assert(isa<UndefValue>(ShufOp1) && "Unexpected shuffle op");
+ Function *Bswap =
+ Intrinsic::getDeclaration(CI.getModule(), Intrinsic::bswap, DestTy);
+ Value *ScalarX = Builder.CreateBitCast(ShufOp0, DestTy);
+ return IntrinsicInst::Create(Bswap, { ScalarX });
+ }
+ }
+
+ // Handle the A->B->A cast when there is an intervening PHI node.
+ if (PHINode *PN = dyn_cast<PHINode>(Src))
+ if (Instruction *I = optimizeBitCastFromPhi(CI, PN))
+ return I;
+
+ if (Instruction *I = canonicalizeBitCastExtElt(CI, *this))
+ return I;
+
+ if (Instruction *I = foldBitCastBitwiseLogic(CI, Builder))
+ return I;
+
+ if (Instruction *I = foldBitCastSelect(CI, Builder))
+ return I;
+
+ if (SrcTy->isPointerTy())
+ return commonPointerCastTransforms(CI);
+ return commonCastTransforms(CI);
+}
+
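
The byte-reversing-shuffle fold above turns a bitcast of a reversed <N x i8> shuffle into a single bswap. A minimal standalone C++ sketch of the underlying equivalence (illustrative helper names, not code from this file): reversing the bytes in memory and then reinterpreting them as an integer gives the same result as reinterpreting first and byte-swapping.

#include <cstdint>
#include <cstring>

// Portable byte swap for a 32-bit value.
static uint32_t bswap32(uint32_t V) {
  return (V >> 24) | ((V >> 8) & 0x0000FF00u) | ((V << 8) & 0x00FF0000u) |
         (V << 24);
}

// Models "bitcast <4 x i8> (shuf X, undef, <3,2,1,0>) to i32": reverse the
// bytes in memory, then reinterpret them as an integer.
static uint32_t reverseThenBitcast(const uint8_t (&X)[4]) {
  uint8_t Rev[4] = {X[3], X[2], X[1], X[0]};
  uint32_t V;
  std::memcpy(&V, Rev, sizeof(V));
  return V;
}

// Models "bswap (bitcast X to i32)": reinterpret first, then byte-swap.
static uint32_t bitcastThenBswap(const uint8_t (&X)[4]) {
  uint32_t V;
  std::memcpy(&V, X, sizeof(V));
  return bswap32(V);
}
// The two functions agree for every input, on either endianness, which is why
// the shuffle+bitcast pair can be rewritten as one bswap intrinsic call.
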
Instruction *InstCombinerImpl::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
- // If the destination pointer element type is not the same as the source's,
- // first do a bitcast to the destination type, and then the addrspacecast.
- // This allows the cast to be exposed to other transforms.
- Value *Src = CI.getOperand(0);
- PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
- PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
-
- Type *DestElemTy = DestTy->getElementType();
- if (SrcTy->getElementType() != DestElemTy) {
- Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
+ // If the destination pointer element type is not the same as the source's,
+ // first do a bitcast to the destination type, and then the addrspacecast.
+ // This allows the cast to be exposed to other transforms.
+ Value *Src = CI.getOperand(0);
+ PointerType *SrcTy = cast<PointerType>(Src->getType()->getScalarType());
+ PointerType *DestTy = cast<PointerType>(CI.getType()->getScalarType());
+
+ Type *DestElemTy = DestTy->getElementType();
+ if (SrcTy->getElementType() != DestElemTy) {
+ Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
// Handle vectors of pointers.
if (VectorType *VT = dyn_cast<VectorType>(CI.getType()))
MidTy = VectorType::get(MidTy, VT->getElementCount());
-
- Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
- return new AddrSpaceCastInst(NewBitCast, CI.getType());
- }
-
- return commonPointerCastTransforms(CI);
-}
+
+ Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
+ return new AddrSpaceCastInst(NewBitCast, CI.getType());
+ }
+
+ return commonPointerCastTransforms(CI);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
index ff56c39d78..cd9a036179 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -1,1946 +1,1946 @@
-//===- InstCombineCompares.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitICmp and visitFCmp functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
+//===- InstCombineCompares.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitICmp and visitFCmp functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-// How many times is a select replaced by one of its operands?
-STATISTIC(NumSel, "Number of select opts");
-
-
-/// Compute Result = In1+In2, returning true if the result overflowed for this
-/// type.
-static bool addWithOverflow(APInt &Result, const APInt &In1,
- const APInt &In2, bool IsSigned = false) {
- bool Overflow;
- if (IsSigned)
- Result = In1.sadd_ov(In2, Overflow);
- else
- Result = In1.uadd_ov(In2, Overflow);
-
- return Overflow;
-}
-
-/// Compute Result = In1-In2, returning true if the result overflowed for this
-/// type.
-static bool subWithOverflow(APInt &Result, const APInt &In1,
- const APInt &In2, bool IsSigned = false) {
- bool Overflow;
- if (IsSigned)
- Result = In1.ssub_ov(In2, Overflow);
- else
- Result = In1.usub_ov(In2, Overflow);
-
- return Overflow;
-}
-
-/// Given an icmp instruction, return true if any use of this comparison is a
-/// branch on sign bit comparison.
-static bool hasBranchUse(ICmpInst &I) {
- for (auto *U : I.users())
- if (isa<BranchInst>(U))
- return true;
- return false;
-}
-
-/// Returns true if the exploded icmp can be expressed as a signed comparison
-/// to zero and updates the predicate accordingly.
-/// The signedness of the comparison is preserved.
-/// TODO: Refactor with decomposeBitTestICmp()?
-static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
- if (!ICmpInst::isSigned(Pred))
- return false;
-
- if (C.isNullValue())
- return ICmpInst::isRelational(Pred);
-
- if (C.isOneValue()) {
- if (Pred == ICmpInst::ICMP_SLT) {
- Pred = ICmpInst::ICMP_SLE;
- return true;
- }
- } else if (C.isAllOnesValue()) {
- if (Pred == ICmpInst::ICMP_SGT) {
- Pred = ICmpInst::ICMP_SGE;
- return true;
- }
- }
-
- return false;
-}
-
-/// This is called when we see this pattern:
-/// cmp pred (load (gep GV, ...)), cmpcst
-/// where GV is a global variable with a constant initializer. Try to simplify
-/// this into some simple computation that does not need the load. For example
-/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
-///
-/// If AndCst is non-null, then the loaded value is masked with that constant
-/// before doing the comparison. This handles cases like "A[i]&4 == 0".
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+// How many times is a select replaced by one of its operands?
+STATISTIC(NumSel, "Number of select opts");
+
+
+/// Compute Result = In1+In2, returning true if the result overflowed for this
+/// type.
+static bool addWithOverflow(APInt &Result, const APInt &In1,
+ const APInt &In2, bool IsSigned = false) {
+ bool Overflow;
+ if (IsSigned)
+ Result = In1.sadd_ov(In2, Overflow);
+ else
+ Result = In1.uadd_ov(In2, Overflow);
+
+ return Overflow;
+}
+
+/// Compute Result = In1-In2, returning true if the result overflowed for this
+/// type.
+static bool subWithOverflow(APInt &Result, const APInt &In1,
+ const APInt &In2, bool IsSigned = false) {
+ bool Overflow;
+ if (IsSigned)
+ Result = In1.ssub_ov(In2, Overflow);
+ else
+ Result = In1.usub_ov(In2, Overflow);
+
+ return Overflow;
+}
+
+/// Given an icmp instruction, return true if any use of this comparison is a
+/// branch on sign bit comparison.
+static bool hasBranchUse(ICmpInst &I) {
+ for (auto *U : I.users())
+ if (isa<BranchInst>(U))
+ return true;
+ return false;
+}
+
+/// Returns true if the exploded icmp can be expressed as a signed comparison
+/// to zero and updates the predicate accordingly.
+/// The signedness of the comparison is preserved.
+/// TODO: Refactor with decomposeBitTestICmp()?
+static bool isSignTest(ICmpInst::Predicate &Pred, const APInt &C) {
+ if (!ICmpInst::isSigned(Pred))
+ return false;
+
+ if (C.isNullValue())
+ return ICmpInst::isRelational(Pred);
+
+ if (C.isOneValue()) {
+ if (Pred == ICmpInst::ICMP_SLT) {
+ Pred = ICmpInst::ICMP_SLE;
+ return true;
+ }
+ } else if (C.isAllOnesValue()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Pred = ICmpInst::ICMP_SGE;
+ return true;
+ }
+ }
+
+ return false;
+}
+
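
A quick sketch of the predicate rewrite performed by isSignTest (illustrative functions, not part of the patched file): for signed integers, the strict compare against an adjacent constant is the same test as the non-strict compare against zero.

#include <cstdint>

static bool sltOne(int32_t X) { return X < 1; }       // icmp slt X, 1
static bool sleZero(int32_t X) { return X <= 0; }     // icmp sle X, 0

static bool sgtMinusOne(int32_t X) { return X > -1; } // icmp sgt X, -1
static bool sgeZero(int32_t X) { return X >= 0; }     // icmp sge X, 0
// sltOne/sleZero agree on every int32_t value, as do sgtMinusOne/sgeZero, so
// both comparisons can be expressed as a sign test against zero.
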
+/// This is called when we see this pattern:
+/// cmp pred (load (gep GV, ...)), cmpcst
+/// where GV is a global variable with a constant initializer. Try to simplify
+/// this into some simple computation that does not need the load. For example
+/// we can optimize "icmp eq (load (gep "foo", 0, i)), 0" into "icmp eq i, 3".
+///
+/// If AndCst is non-null, then the loaded value is masked with that constant
+/// before doing the comparison. This handles cases like "A[i]&4 == 0".
Instruction *
InstCombinerImpl::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
GlobalVariable *GV, CmpInst &ICI,
ConstantInt *AndCst) {
- Constant *Init = GV->getInitializer();
- if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
- return nullptr;
-
- uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
- // Don't blow up on huge arrays.
- if (ArrayElementCount > MaxArraySizeForCombine)
- return nullptr;
-
- // There are many forms of this optimization we can handle, for now, just do
- // the simple index into a single-dimensional array.
- //
- // Require: GEP GV, 0, i {{, constant indices}}
- if (GEP->getNumOperands() < 3 ||
- !isa<ConstantInt>(GEP->getOperand(1)) ||
- !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
- isa<Constant>(GEP->getOperand(2)))
- return nullptr;
-
- // Check that indices after the variable are constants and in-range for the
- // type they index. Collect the indices. This is typically for arrays of
- // structs.
- SmallVector<unsigned, 4> LaterIndices;
-
- Type *EltTy = Init->getType()->getArrayElementType();
- for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
- ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!Idx) return nullptr; // Variable index.
-
- uint64_t IdxVal = Idx->getZExtValue();
- if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.
-
- if (StructType *STy = dyn_cast<StructType>(EltTy))
- EltTy = STy->getElementType(IdxVal);
- else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
- if (IdxVal >= ATy->getNumElements()) return nullptr;
- EltTy = ATy->getElementType();
- } else {
- return nullptr; // Unknown type.
- }
-
- LaterIndices.push_back(IdxVal);
- }
-
- enum { Overdefined = -3, Undefined = -2 };
-
- // Variables for our state machines.
-
- // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
- // "i == 47 | i == 87", where 47 is the first index the condition is true for,
- // and 87 is the second (and last) index. FirstTrueElement is -2 when
- // undefined, otherwise set to the first true element. SecondTrueElement is
- // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
- int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
-
- // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
- // form "i != 47 & i != 87". Same state transitions as for true elements.
- int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
-
- /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
- /// define a state machine that triggers for ranges of values that the index
- /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
- /// This is -2 when undefined, -3 when overdefined, and otherwise the last
- /// index in the range (inclusive). We use -2 for undefined here because we
- /// use relative comparisons and don't want 0-1 to match -1.
- int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
-
- // MagicBitvector - This is a magic bitvector where we set a bit if the
- // comparison is true for element 'i'. If there are 64 elements or less in
- // the array, this will fully represent all the comparison results.
- uint64_t MagicBitvector = 0;
-
- // Scan the array and see if one of our patterns matches.
- Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
- for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
- Constant *Elt = Init->getAggregateElement(i);
- if (!Elt) return nullptr;
-
- // If this is indexing an array of structures, get the structure element.
- if (!LaterIndices.empty())
- Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
-
- // If the element is masked, handle it.
- if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
-
- // Find out if the comparison would be true or false for the i'th element.
- Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
- CompareRHS, DL, &TLI);
- // If the result is undef for this element, ignore it.
- if (isa<UndefValue>(C)) {
- // Extend range state machines to cover this element in case there is an
- // undef in the middle of the range.
- if (TrueRangeEnd == (int)i-1)
- TrueRangeEnd = i;
- if (FalseRangeEnd == (int)i-1)
- FalseRangeEnd = i;
- continue;
- }
-
- // If we can't compute the result for any of the elements, we have to give
- // up evaluating the entire conditional.
- if (!isa<ConstantInt>(C)) return nullptr;
-
- // Otherwise, we know if the comparison is true or false for this element,
- // update our state machines.
- bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
-
- // State machine for single/double/range index comparison.
- if (IsTrueForElt) {
- // Update the TrueElement state machine.
- if (FirstTrueElement == Undefined)
- FirstTrueElement = TrueRangeEnd = i; // First true element.
- else {
- // Update double-compare state machine.
- if (SecondTrueElement == Undefined)
- SecondTrueElement = i;
- else
- SecondTrueElement = Overdefined;
-
- // Update range state machine.
- if (TrueRangeEnd == (int)i-1)
- TrueRangeEnd = i;
- else
- TrueRangeEnd = Overdefined;
- }
- } else {
- // Update the FalseElement state machine.
- if (FirstFalseElement == Undefined)
- FirstFalseElement = FalseRangeEnd = i; // First false element.
- else {
- // Update double-compare state machine.
- if (SecondFalseElement == Undefined)
- SecondFalseElement = i;
- else
- SecondFalseElement = Overdefined;
-
- // Update range state machine.
- if (FalseRangeEnd == (int)i-1)
- FalseRangeEnd = i;
- else
- FalseRangeEnd = Overdefined;
- }
- }
-
- // If this element is in range, update our magic bitvector.
- if (i < 64 && IsTrueForElt)
- MagicBitvector |= 1ULL << i;
-
- // If all of our states become overdefined, bail out early. Since the
- // predicate is expensive, only check it every 8 elements. This is only
- // really useful for really huge arrays.
- if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
- SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
- FalseRangeEnd == Overdefined)
- return nullptr;
- }
-
- // Now that we've scanned the entire array, emit our new comparison(s). We
- // order the state machines in complexity of the generated code.
- Value *Idx = GEP->getOperand(2);
-
- // If the index is larger than the pointer size of the target, truncate the
- // index down like the GEP would do implicitly. We don't have to do this for
- // an inbounds GEP because the index can't be out of range.
- if (!GEP->isInBounds()) {
- Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
- unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
+ Constant *Init = GV->getInitializer();
+ if (!isa<ConstantArray>(Init) && !isa<ConstantDataArray>(Init))
+ return nullptr;
+
+ uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+ // Don't blow up on huge arrays.
+ if (ArrayElementCount > MaxArraySizeForCombine)
+ return nullptr;
+
+ // There are many forms of this optimization we can handle, for now, just do
+ // the simple index into a single-dimensional array.
+ //
+ // Require: GEP GV, 0, i {{, constant indices}}
+ if (GEP->getNumOperands() < 3 ||
+ !isa<ConstantInt>(GEP->getOperand(1)) ||
+ !cast<ConstantInt>(GEP->getOperand(1))->isZero() ||
+ isa<Constant>(GEP->getOperand(2)))
+ return nullptr;
+
+ // Check that indices after the variable are constants and in-range for the
+ // type they index. Collect the indices. This is typically for arrays of
+ // structs.
+ SmallVector<unsigned, 4> LaterIndices;
+
+ Type *EltTy = Init->getType()->getArrayElementType();
+ for (unsigned i = 3, e = GEP->getNumOperands(); i != e; ++i) {
+ ConstantInt *Idx = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!Idx) return nullptr; // Variable index.
+
+ uint64_t IdxVal = Idx->getZExtValue();
+ if ((unsigned)IdxVal != IdxVal) return nullptr; // Too large array index.
+
+ if (StructType *STy = dyn_cast<StructType>(EltTy))
+ EltTy = STy->getElementType(IdxVal);
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(EltTy)) {
+ if (IdxVal >= ATy->getNumElements()) return nullptr;
+ EltTy = ATy->getElementType();
+ } else {
+ return nullptr; // Unknown type.
+ }
+
+ LaterIndices.push_back(IdxVal);
+ }
+
+ enum { Overdefined = -3, Undefined = -2 };
+
+ // Variables for our state machines.
+
+ // FirstTrueElement/SecondTrueElement - Used to emit a comparison of the form
+ // "i == 47 | i == 87", where 47 is the first index the condition is true for,
+ // and 87 is the second (and last) index. FirstTrueElement is -2 when
+ // undefined, otherwise set to the first true element. SecondTrueElement is
+ // -2 when undefined, -3 when overdefined and >= 0 when that index is true.
+ int FirstTrueElement = Undefined, SecondTrueElement = Undefined;
+
+ // FirstFalseElement/SecondFalseElement - Used to emit a comparison of the
+ // form "i != 47 & i != 87". Same state transitions as for true elements.
+ int FirstFalseElement = Undefined, SecondFalseElement = Undefined;
+
+ /// TrueRangeEnd/FalseRangeEnd - In conjunction with First*Element, these
+ /// define a state machine that triggers for ranges of values that the index
+ /// is true or false for. This triggers on things like "abbbbc"[i] == 'b'.
+ /// This is -2 when undefined, -3 when overdefined, and otherwise the last
+ /// index in the range (inclusive). We use -2 for undefined here because we
+ /// use relative comparisons and don't want 0-1 to match -1.
+ int TrueRangeEnd = Undefined, FalseRangeEnd = Undefined;
+
+ // MagicBitvector - This is a magic bitvector where we set a bit if the
+ // comparison is true for element 'i'. If there are 64 elements or less in
+ // the array, this will fully represent all the comparison results.
+ uint64_t MagicBitvector = 0;
+
+ // Scan the array and see if one of our patterns matches.
+ Constant *CompareRHS = cast<Constant>(ICI.getOperand(1));
+ for (unsigned i = 0, e = ArrayElementCount; i != e; ++i) {
+ Constant *Elt = Init->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
+ // If this is indexing an array of structures, get the structure element.
+ if (!LaterIndices.empty())
+ Elt = ConstantExpr::getExtractValue(Elt, LaterIndices);
+
+ // If the element is masked, handle it.
+ if (AndCst) Elt = ConstantExpr::getAnd(Elt, AndCst);
+
+ // Find out if the comparison would be true or false for the i'th element.
+ Constant *C = ConstantFoldCompareInstOperands(ICI.getPredicate(), Elt,
+ CompareRHS, DL, &TLI);
+ // If the result is undef for this element, ignore it.
+ if (isa<UndefValue>(C)) {
+ // Extend range state machines to cover this element in case there is an
+ // undef in the middle of the range.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ continue;
+ }
+
+ // If we can't compute the result for any of the elements, we have to give
+ // up evaluating the entire conditional.
+ if (!isa<ConstantInt>(C)) return nullptr;
+
+ // Otherwise, we know if the comparison is true or false for this element,
+ // update our state machines.
+ bool IsTrueForElt = !cast<ConstantInt>(C)->isZero();
+
+ // State machine for single/double/range index comparison.
+ if (IsTrueForElt) {
+ // Update the TrueElement state machine.
+ if (FirstTrueElement == Undefined)
+ FirstTrueElement = TrueRangeEnd = i; // First true element.
+ else {
+ // Update double-compare state machine.
+ if (SecondTrueElement == Undefined)
+ SecondTrueElement = i;
+ else
+ SecondTrueElement = Overdefined;
+
+ // Update range state machine.
+ if (TrueRangeEnd == (int)i-1)
+ TrueRangeEnd = i;
+ else
+ TrueRangeEnd = Overdefined;
+ }
+ } else {
+ // Update the FalseElement state machine.
+ if (FirstFalseElement == Undefined)
+ FirstFalseElement = FalseRangeEnd = i; // First false element.
+ else {
+ // Update double-compare state machine.
+ if (SecondFalseElement == Undefined)
+ SecondFalseElement = i;
+ else
+ SecondFalseElement = Overdefined;
+
+ // Update range state machine.
+ if (FalseRangeEnd == (int)i-1)
+ FalseRangeEnd = i;
+ else
+ FalseRangeEnd = Overdefined;
+ }
+ }
+
+ // If this element is in range, update our magic bitvector.
+ if (i < 64 && IsTrueForElt)
+ MagicBitvector |= 1ULL << i;
+
+ // If all of our states become overdefined, bail out early. Since the
+ // predicate is expensive, only check it every 8 elements. This is only
+ // really useful for really huge arrays.
+ if ((i & 8) == 0 && i >= 64 && SecondTrueElement == Overdefined &&
+ SecondFalseElement == Overdefined && TrueRangeEnd == Overdefined &&
+ FalseRangeEnd == Overdefined)
+ return nullptr;
+ }
+
+ // Now that we've scanned the entire array, emit our new comparison(s). We
+ // order the state machines in complexity of the generated code.
+ Value *Idx = GEP->getOperand(2);
+
+ // If the index is larger than the pointer size of the target, truncate the
+ // index down like the GEP would do implicitly. We don't have to do this for
+ // an inbounds GEP because the index can't be out of range.
+ if (!GEP->isInBounds()) {
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getType());
+ unsigned PtrSize = IntPtrTy->getIntegerBitWidth();
if (Idx->getType()->getPrimitiveSizeInBits().getFixedSize() > PtrSize)
- Idx = Builder.CreateTrunc(Idx, IntPtrTy);
- }
-
- // If the comparison is only true for one or two elements, emit direct
- // comparisons.
- if (SecondTrueElement != Overdefined) {
- // None true -> false.
- if (FirstTrueElement == Undefined)
- return replaceInstUsesWith(ICI, Builder.getFalse());
-
- Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
-
- // True for one element -> 'i == 47'.
- if (SecondTrueElement == Undefined)
- return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
-
- // True for two elements -> 'i == 47 | i == 72'.
- Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
- Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
- Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
- return BinaryOperator::CreateOr(C1, C2);
- }
-
- // If the comparison is only false for one or two elements, emit direct
- // comparisons.
- if (SecondFalseElement != Overdefined) {
- // None false -> true.
- if (FirstFalseElement == Undefined)
- return replaceInstUsesWith(ICI, Builder.getTrue());
-
- Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
-
- // False for one element -> 'i != 47'.
- if (SecondFalseElement == Undefined)
- return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
-
- // False for two elements -> 'i != 47 & i != 72'.
- Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
- Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
- Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
- return BinaryOperator::CreateAnd(C1, C2);
- }
-
- // If the comparison can be replaced with a range comparison for the elements
- // where it is true, emit the range check.
- if (TrueRangeEnd != Overdefined) {
- assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
-
- // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
- if (FirstTrueElement) {
- Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
- Idx = Builder.CreateAdd(Idx, Offs);
- }
-
- Value *End = ConstantInt::get(Idx->getType(),
- TrueRangeEnd-FirstTrueElement+1);
- return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
- }
-
- // False range check.
- if (FalseRangeEnd != Overdefined) {
- assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
- // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
- if (FirstFalseElement) {
- Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
- Idx = Builder.CreateAdd(Idx, Offs);
- }
-
- Value *End = ConstantInt::get(Idx->getType(),
- FalseRangeEnd-FirstFalseElement);
- return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
- }
-
- // If a magic bitvector captures the entire comparison state
- // of this load, replace it with computation that does:
- // ((magic_cst >> i) & 1) != 0
- {
- Type *Ty = nullptr;
-
- // Look for an appropriate type:
- // - The type of Idx if the magic fits
- // - The smallest fitting legal type
- if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
- Ty = Idx->getType();
- else
- Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
-
- if (Ty) {
- Value *V = Builder.CreateIntCast(Idx, Ty, false);
- V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
- V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
- return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
- }
- }
-
- return nullptr;
-}
-
-/// Return a value that can be used to compare the *offset* implied by a GEP to
-/// zero. For example, if we have &A[i], we want to return 'i' for
-/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
-/// are involved. The above expression would also be legal to codegen as
-/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
-/// This latter form is less amenable to optimization though, and we are allowed
-/// to generate the first by knowing that pointer arithmetic doesn't overflow.
-///
-/// If we can't emit an optimized form for this expression, this returns null.
-///
+ Idx = Builder.CreateTrunc(Idx, IntPtrTy);
+ }
+
+ // If the comparison is only true for one or two elements, emit direct
+ // comparisons.
+ if (SecondTrueElement != Overdefined) {
+ // None true -> false.
+ if (FirstTrueElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getFalse());
+
+ Value *FirstTrueIdx = ConstantInt::get(Idx->getType(), FirstTrueElement);
+
+ // True for one element -> 'i == 47'.
+ if (SecondTrueElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Idx, FirstTrueIdx);
+
+ // True for two elements -> 'i == 47 | i == 72'.
+ Value *C1 = Builder.CreateICmpEQ(Idx, FirstTrueIdx);
+ Value *SecondTrueIdx = ConstantInt::get(Idx->getType(), SecondTrueElement);
+ Value *C2 = Builder.CreateICmpEQ(Idx, SecondTrueIdx);
+ return BinaryOperator::CreateOr(C1, C2);
+ }
+
+ // If the comparison is only false for one or two elements, emit direct
+ // comparisons.
+ if (SecondFalseElement != Overdefined) {
+ // None false -> true.
+ if (FirstFalseElement == Undefined)
+ return replaceInstUsesWith(ICI, Builder.getTrue());
+
+ Value *FirstFalseIdx = ConstantInt::get(Idx->getType(), FirstFalseElement);
+
+ // False for one element -> 'i != 47'.
+ if (SecondFalseElement == Undefined)
+ return new ICmpInst(ICmpInst::ICMP_NE, Idx, FirstFalseIdx);
+
+ // False for two elements -> 'i != 47 & i != 72'.
+ Value *C1 = Builder.CreateICmpNE(Idx, FirstFalseIdx);
+ Value *SecondFalseIdx = ConstantInt::get(Idx->getType(),SecondFalseElement);
+ Value *C2 = Builder.CreateICmpNE(Idx, SecondFalseIdx);
+ return BinaryOperator::CreateAnd(C1, C2);
+ }
+
+ // If the comparison can be replaced with a range comparison for the elements
+ // where it is true, emit the range check.
+ if (TrueRangeEnd != Overdefined) {
+ assert(TrueRangeEnd != FirstTrueElement && "Should emit single compare");
+
+ // Generate (i-FirstTrue) <u (TrueRangeEnd-FirstTrue+1).
+ if (FirstTrueElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstTrueElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ TrueRangeEnd-FirstTrueElement+1);
+ return new ICmpInst(ICmpInst::ICMP_ULT, Idx, End);
+ }
+
+ // False range check.
+ if (FalseRangeEnd != Overdefined) {
+ assert(FalseRangeEnd != FirstFalseElement && "Should emit single compare");
+ // Generate (i-FirstFalse) >u (FalseRangeEnd-FirstFalse).
+ if (FirstFalseElement) {
+ Value *Offs = ConstantInt::get(Idx->getType(), -FirstFalseElement);
+ Idx = Builder.CreateAdd(Idx, Offs);
+ }
+
+ Value *End = ConstantInt::get(Idx->getType(),
+ FalseRangeEnd-FirstFalseElement);
+ return new ICmpInst(ICmpInst::ICMP_UGT, Idx, End);
+ }
+
+ // If a magic bitvector captures the entire comparison state
+ // of this load, replace it with computation that does:
+ // ((magic_cst >> i) & 1) != 0
+ {
+ Type *Ty = nullptr;
+
+ // Look for an appropriate type:
+ // - The type of Idx if the magic fits
+ // - The smallest fitting legal type
+ if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth())
+ Ty = Idx->getType();
+ else
+ Ty = DL.getSmallestLegalIntType(Init->getContext(), ArrayElementCount);
+
+ if (Ty) {
+ Value *V = Builder.CreateIntCast(Idx, Ty, false);
+ V = Builder.CreateLShr(ConstantInt::get(Ty, MagicBitvector), V);
+ V = Builder.CreateAnd(ConstantInt::get(Ty, 1), V);
+ return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0));
+ }
+ }
+
+ return nullptr;
+}
+
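
The closing "magic bitvector" case above replaces the load+compare with ((magic_cst >> i) & 1) != 0. A small standalone C++ sketch of that encoding (helper names are illustrative, and it assumes at most 64 array elements, as the code above does):

#include <cstddef>
#include <cstdint>

// Precompute one bit per element: bit I is set iff "Init[I] == CmpRHS".
static uint64_t buildMagicBitvector(const int *Init, size_t N, int CmpRHS) {
  uint64_t Magic = 0;
  for (size_t I = 0; I < N && I < 64; ++I)
    if (Init[I] == CmpRHS)
      Magic |= uint64_t(1) << I;
  return Magic;
}

// Answers "Init[Idx] == CmpRHS" as "((magic_cst >> Idx) & 1) != 0".
static bool foldedCompare(uint64_t Magic, uint64_t Idx) {
  return ((Magic >> Idx) & 1) != 0;
}
// e.g. for Init = {1, 9, 1, 1} and CmpRHS = 1 the mask is 0b1101, so
// foldedCompare(0b1101, 2) is true while foldedCompare(0b1101, 1) is false.
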
+/// Return a value that can be used to compare the *offset* implied by a GEP to
+/// zero. For example, if we have &A[i], we want to return 'i' for
+/// "icmp ne i, 0". Note that, in general, indices can be complex, and scales
+/// are involved. The above expression would also be legal to codegen as
+/// "icmp ne (i*4), 0" (assuming A is a pointer to i32).
+/// This latter form is less amenable to optimization though, and we are allowed
+/// to generate the first by knowing that pointer arithmetic doesn't overflow.
+///
+/// If we can't emit an optimized form for this expression, this returns null.
+///
static Value *evaluateGEPOffsetExpression(User *GEP, InstCombinerImpl &IC,
- const DataLayout &DL) {
- gep_type_iterator GTI = gep_type_begin(GEP);
-
- // Check to see if this gep only has a single variable index. If so, and if
- // any constant indices are a multiple of its scale, then we can compute this
- // in terms of the scale of the variable index. For example, if the GEP
- // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
- // because the expression will cross zero at the same point.
- unsigned i, e = GEP->getNumOperands();
- int64_t Offset = 0;
- for (i = 1; i != e; ++i, ++GTI) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
- // Compute the aggregate offset of constant indices.
- if (CI->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
- } else {
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size*CI->getSExtValue();
- }
- } else {
- // Found our variable index.
- break;
- }
- }
-
- // If there are no variable indices, we must have a constant offset, just
- // evaluate it the general way.
- if (i == e) return nullptr;
-
- Value *VariableIdx = GEP->getOperand(i);
- // Determine the scale factor of the variable element. For example, this is
- // 4 if the variable index is into an array of i32.
- uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());
-
- // Verify that there are no other variable indices. If there are, emit the hard way.
- for (++i, ++GTI; i != e; ++i, ++GTI) {
- ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
- if (!CI) return nullptr;
-
- // Compute the aggregate offset of constant indices.
- if (CI->isZero()) continue;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
- } else {
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size*CI->getSExtValue();
- }
- }
-
- // Okay, we know we have a single variable index, which must be a
- // pointer/array/vector index. If there is no offset, life is simple, return
- // the index.
- Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
- unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
- if (Offset == 0) {
- // Cast to intptrty in case a truncation occurs. If an extension is needed,
- // we don't need to bother extending: the extension won't affect where the
- // computation crosses zero.
+ const DataLayout &DL) {
+ gep_type_iterator GTI = gep_type_begin(GEP);
+
+ // Check to see if this gep only has a single variable index. If so, and if
+ // any constant indices are a multiple of its scale, then we can compute this
+ // in terms of the scale of the variable index. For example, if the GEP
+ // implies an offset of "12 + i*4", then we can codegen this as "3 + i",
+ // because the expression will cross zero at the same point.
+ unsigned i, e = GEP->getNumOperands();
+ int64_t Offset = 0;
+ for (i = 1; i != e; ++i, ++GTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i))) {
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ } else {
+ // Found our variable index.
+ break;
+ }
+ }
+
+ // If there are no variable indices, we must have a constant offset, just
+ // evaluate it the general way.
+ if (i == e) return nullptr;
+
+ Value *VariableIdx = GEP->getOperand(i);
+ // Determine the scale factor of the variable element. For example, this is
+ // 4 if the variable index is into an array of i32.
+ uint64_t VariableScale = DL.getTypeAllocSize(GTI.getIndexedType());
+
+ // Verify that there are no other variable indices. If there are, emit the hard way.
+ for (++i, ++GTI; i != e; ++i, ++GTI) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!CI) return nullptr;
+
+ // Compute the aggregate offset of constant indices.
+ if (CI->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ Offset += DL.getStructLayout(STy)->getElementOffset(CI->getZExtValue());
+ } else {
+ uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ Offset += Size*CI->getSExtValue();
+ }
+ }
+
+ // Okay, we know we have a single variable index, which must be a
+ // pointer/array/vector index. If there is no offset, life is simple, return
+ // the index.
+ Type *IntPtrTy = DL.getIntPtrType(GEP->getOperand(0)->getType());
+ unsigned IntPtrWidth = IntPtrTy->getIntegerBitWidth();
+ if (Offset == 0) {
+ // Cast to intptrty in case a truncation occurs. If an extension is needed,
+ // we don't need to bother extending: the extension won't affect where the
+ // computation crosses zero.
if (VariableIdx->getType()->getPrimitiveSizeInBits().getFixedSize() >
IntPtrWidth) {
- VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
- }
- return VariableIdx;
- }
-
- // Otherwise, there is an index. The computation we will do will be modulo
- // the pointer size.
- Offset = SignExtend64(Offset, IntPtrWidth);
- VariableScale = SignExtend64(VariableScale, IntPtrWidth);
-
- // To do this transformation, any constant index must be a multiple of the
- // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
- // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
- // multiple of the variable scale.
- int64_t NewOffs = Offset / (int64_t)VariableScale;
- if (Offset != NewOffs*(int64_t)VariableScale)
- return nullptr;
-
- // Okay, we can do this evaluation. Start by converting the index to intptr.
- if (VariableIdx->getType() != IntPtrTy)
- VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
- true /*Signed*/);
- Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
- return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
-}
-
-/// Returns true if we can rewrite Start as a GEP with pointer Base
-/// and some integer offset. The nodes that need to be re-written
-/// for this transformation will be added to Explored.
-static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
- const DataLayout &DL,
- SetVector<Value *> &Explored) {
- SmallVector<Value *, 16> WorkList(1, Start);
- Explored.insert(Base);
-
- // The following traversal gives us an order which can be used
- // when doing the final transformation. Since in the final
- // transformation we create the PHI replacement instructions first,
- // we don't have to get them in any particular order.
- //
- // However, for other instructions we will have to traverse the
- // operands of an instruction first, which means that we have to
- // do a post-order traversal.
- while (!WorkList.empty()) {
- SetVector<PHINode *> PHIs;
-
- while (!WorkList.empty()) {
- if (Explored.size() >= 100)
- return false;
-
- Value *V = WorkList.back();
-
+ VariableIdx = IC.Builder.CreateTrunc(VariableIdx, IntPtrTy);
+ }
+ return VariableIdx;
+ }
+
+ // Otherwise, there is an index. The computation we will do will be modulo
+ // the pointer size.
+ Offset = SignExtend64(Offset, IntPtrWidth);
+ VariableScale = SignExtend64(VariableScale, IntPtrWidth);
+
+ // To do this transformation, any constant index must be a multiple of the
+ // variable scale factor. For example, we can evaluate "12 + 4*i" as "3 + i",
+ // but we can't evaluate "10 + 3*i" in terms of i. Check that the offset is a
+ // multiple of the variable scale.
+ int64_t NewOffs = Offset / (int64_t)VariableScale;
+ if (Offset != NewOffs*(int64_t)VariableScale)
+ return nullptr;
+
+ // Okay, we can do this evaluation. Start by converting the index to intptr.
+ if (VariableIdx->getType() != IntPtrTy)
+ VariableIdx = IC.Builder.CreateIntCast(VariableIdx, IntPtrTy,
+ true /*Signed*/);
+ Constant *OffsetVal = ConstantInt::get(IntPtrTy, NewOffs);
+ return IC.Builder.CreateAdd(VariableIdx, OffsetVal, "offset");
+}
+
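
The scale argument used by evaluateGEPOffsetExpression can be checked with a tiny sketch (plain-integer stand-ins, illustrative names): an offset of the form Offset + Scale*i crosses zero exactly where Offset/Scale + i does, provided Offset is a multiple of Scale.

#include <cassert>
#include <cstdint>

static bool gepOffsetIsZero(int64_t Offset, int64_t Scale, int64_t I) {
  return Offset + Scale * I == 0;       // e.g. "12 + i*4"
}

static bool reducedOffsetIsZero(int64_t Offset, int64_t Scale, int64_t I) {
  assert(Scale != 0 && Offset % Scale == 0 && "offset must divide evenly");
  return Offset / Scale + I == 0;       // e.g. "3 + i"
}
// gepOffsetIsZero(12, 4, -3) and reducedOffsetIsZero(12, 4, -3) are both
// true; no such reduction exists for "10 + 3*i", which is why the code above
// checks divisibility before folding.
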
+/// Returns true if we can rewrite Start as a GEP with pointer Base
+/// and some integer offset. The nodes that need to be re-written
+/// for this transformation will be added to Explored.
+static bool canRewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ SmallVector<Value *, 16> WorkList(1, Start);
+ Explored.insert(Base);
+
+ // The following traversal gives us an order which can be used
+ // when doing the final transformation. Since in the final
+ // transformation we create the PHI replacement instructions first,
+ // we don't have to get them in any particular order.
+ //
+ // However, for other instructions we will have to traverse the
+ // operands of an instruction first, which means that we have to
+ // do a post-order traversal.
+ while (!WorkList.empty()) {
+ SetVector<PHINode *> PHIs;
+
+ while (!WorkList.empty()) {
+ if (Explored.size() >= 100)
+ return false;
+
+ Value *V = WorkList.back();
+
if (Explored.contains(V)) {
- WorkList.pop_back();
- continue;
- }
-
- if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
- !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
- // We've found some value that we can't explore which is different from
- // the base. Therefore we can't do this transformation.
- return false;
-
- if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
+ WorkList.pop_back();
+ continue;
+ }
+
+ if (!isa<IntToPtrInst>(V) && !isa<PtrToIntInst>(V) &&
+ !isa<GetElementPtrInst>(V) && !isa<PHINode>(V))
+ // We've found some value that we can't explore which is different from
+ // the base. Therefore we can't do this transformation.
+ return false;
+
+ if (isa<IntToPtrInst>(V) || isa<PtrToIntInst>(V)) {
auto *CI = cast<CastInst>(V);
- if (!CI->isNoopCast(DL))
- return false;
-
- if (Explored.count(CI->getOperand(0)) == 0)
- WorkList.push_back(CI->getOperand(0));
- }
-
- if (auto *GEP = dyn_cast<GEPOperator>(V)) {
- // We're limiting the GEP to having one index. This will preserve
- // the original pointer type. We could handle more cases in the
- // future.
- if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
- GEP->getType() != Start->getType())
- return false;
-
- if (Explored.count(GEP->getOperand(0)) == 0)
- WorkList.push_back(GEP->getOperand(0));
- }
-
- if (WorkList.back() == V) {
- WorkList.pop_back();
- // We've finished visiting this node, mark it as such.
- Explored.insert(V);
- }
-
- if (auto *PN = dyn_cast<PHINode>(V)) {
- // We cannot transform PHIs on unsplittable basic blocks.
- if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
- return false;
- Explored.insert(PN);
- PHIs.insert(PN);
- }
- }
-
- // Explore the PHI nodes further.
- for (auto *PN : PHIs)
- for (Value *Op : PN->incoming_values())
- if (Explored.count(Op) == 0)
- WorkList.push_back(Op);
- }
-
- // Make sure that we can do this. Since we can't insert GEPs in a basic
- // block before a PHI node, we can't easily do this transformation if
- // we have PHI node users of transformed instructions.
- for (Value *Val : Explored) {
- for (Value *Use : Val->uses()) {
-
- auto *PHI = dyn_cast<PHINode>(Use);
- auto *Inst = dyn_cast<Instruction>(Val);
-
- if (Inst == Base || Inst == PHI || !Inst || !PHI ||
- Explored.count(PHI) == 0)
- continue;
-
- if (PHI->getParent() == Inst->getParent())
- return false;
- }
- }
- return true;
-}
-
-// Sets the appropriate insert point on Builder where we can add
-// a replacement Instruction for V (if that is possible).
-static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
- bool Before = true) {
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
- return;
- }
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (!Before)
- I = &*std::next(I->getIterator());
- Builder.SetInsertPoint(I);
- return;
- }
- if (auto *A = dyn_cast<Argument>(V)) {
- // Set the insertion point in the entry block.
- BasicBlock &Entry = A->getParent()->getEntryBlock();
- Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
- return;
- }
- // Otherwise, this is a constant and we don't need to set a new
- // insertion point.
- assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
-}
-
-/// Returns a re-written value of Start as an indexed GEP using Base as a
-/// pointer.
-static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
- const DataLayout &DL,
- SetVector<Value *> &Explored) {
- // Perform all the substitutions. This is a bit tricky because we can
- // have cycles in our use-def chains.
- // 1. Create the PHI nodes without any incoming values.
- // 2. Create all the other values.
- // 3. Add the edges for the PHI nodes.
- // 4. Emit GEPs to get the original pointers.
- // 5. Remove the original instructions.
- Type *IndexType = IntegerType::get(
- Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
-
- DenseMap<Value *, Value *> NewInsts;
- NewInsts[Base] = ConstantInt::getNullValue(IndexType);
-
- // Create the new PHI nodes, without adding any incoming values.
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
- // Create empty phi nodes. This avoids cyclic dependencies when creating
- // the remaining instructions.
- if (auto *PHI = dyn_cast<PHINode>(Val))
- NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
- PHI->getName() + ".idx", PHI);
- }
- IRBuilder<> Builder(Base->getContext());
-
- // Create all the other instructions.
- for (Value *Val : Explored) {
-
- if (NewInsts.find(Val) != NewInsts.end())
- continue;
-
- if (auto *CI = dyn_cast<CastInst>(Val)) {
- // Don't get rid of the intermediate variable here; the store can grow
- // the map which will invalidate the reference to the input value.
- Value *V = NewInsts[CI->getOperand(0)];
- NewInsts[CI] = V;
- continue;
- }
- if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
- Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
- : GEP->getOperand(1);
- setInsertionPoint(Builder, GEP);
- // Indices might need to be sign extended. GEPs will magically do
- // this, but we need to do it ourselves here.
- if (Index->getType()->getScalarSizeInBits() !=
- NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
- Index = Builder.CreateSExtOrTrunc(
- Index, NewInsts[GEP->getOperand(0)]->getType(),
- GEP->getOperand(0)->getName() + ".sext");
- }
-
- auto *Op = NewInsts[GEP->getOperand(0)];
- if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
- NewInsts[GEP] = Index;
- else
- NewInsts[GEP] = Builder.CreateNSWAdd(
- Op, Index, GEP->getOperand(0)->getName() + ".add");
- continue;
- }
- if (isa<PHINode>(Val))
- continue;
-
- llvm_unreachable("Unexpected instruction type");
- }
-
- // Add the incoming values to the PHI nodes.
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
- // All the instructions have been created, we can now add edges to the
- // phi nodes.
- if (auto *PHI = dyn_cast<PHINode>(Val)) {
- PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
- for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
- Value *NewIncoming = PHI->getIncomingValue(I);
-
- if (NewInsts.find(NewIncoming) != NewInsts.end())
- NewIncoming = NewInsts[NewIncoming];
-
- NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
- }
- }
- }
-
- for (Value *Val : Explored) {
- if (Val == Base)
- continue;
-
- // Depending on the type, for external users we have to emit
- // a GEP or a GEP + ptrtoint.
- setInsertionPoint(Builder, Val, false);
-
- // If required, create an inttoptr instruction for Base.
- Value *NewBase = Base;
- if (!Base->getType()->isPointerTy())
- NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
- Start->getName() + "to.ptr");
-
- Value *GEP = Builder.CreateInBoundsGEP(
- Start->getType()->getPointerElementType(), NewBase,
- makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
-
- if (!Val->getType()->isPointerTy()) {
- Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
- Val->getName() + ".conv");
- GEP = Cast;
- }
- Val->replaceAllUsesWith(GEP);
- }
-
- return NewInsts[Start];
-}
-
-/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
-/// the input Value as a constant indexed GEP. Returns a pair containing
- /// the GEP's Pointer and Index.
-static std::pair<Value *, Value *>
-getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
- Type *IndexType = IntegerType::get(V->getContext(),
- DL.getIndexTypeSizeInBits(V->getType()));
-
- Constant *Index = ConstantInt::getNullValue(IndexType);
- while (true) {
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
- // We accept only inbounds GEPs here to exclude the possibility of
- // overflow.
- if (!GEP->isInBounds())
- break;
- if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
- GEP->getType() == V->getType()) {
- V = GEP->getOperand(0);
- Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
- Index = ConstantExpr::getAdd(
- Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
- continue;
- }
- break;
- }
- if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
- if (!CI->isNoopCast(DL))
- break;
- V = CI->getOperand(0);
- continue;
- }
- if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
- if (!CI->isNoopCast(DL))
- break;
- V = CI->getOperand(0);
- continue;
- }
- break;
- }
- return {V, Index};
-}
-
-/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
-/// We can look through PHIs, GEPs and casts in order to determine a common base
-/// between GEPLHS and RHS.
-static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
- ICmpInst::Predicate Cond,
- const DataLayout &DL) {
- // FIXME: Support vector of pointers.
- if (GEPLHS->getType()->isVectorTy())
- return nullptr;
-
- if (!GEPLHS->hasAllConstantIndices())
- return nullptr;
-
- // Make sure the pointers have the same type.
- if (GEPLHS->getType() != RHS->getType())
- return nullptr;
-
- Value *PtrBase, *Index;
- std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
-
- // The set of nodes that will take part in this transformation.
- SetVector<Value *> Nodes;
-
- if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
- return nullptr;
-
- // We know we can re-write this as
- // (gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)
- // Since we've only looked through inbounds GEPs we know that we
- // can't have overflow on either side. We can therefore re-write
- // this as:
- // OFFSET1 cmp OFFSET2
- Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
-
- // RewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
- // GEP having PtrBase as the pointer base, and has returned in NewRHS the
- // offset. Since Index is the offset of LHS to the base pointer, we will now
- // compare the offsets instead of comparing the pointers.
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
-}
-
-/// Fold comparisons between a GEP instruction and something else. At this point
-/// we know that the GEP is on the LHS of the comparison.
+ if (!CI->isNoopCast(DL))
+ return false;
+
+ if (Explored.count(CI->getOperand(0)) == 0)
+ WorkList.push_back(CI->getOperand(0));
+ }
+
+ if (auto *GEP = dyn_cast<GEPOperator>(V)) {
+ // We're limiting the GEP to having one index. This will preserve
+ // the original pointer type. We could handle more cases in the
+ // future.
+ if (GEP->getNumIndices() != 1 || !GEP->isInBounds() ||
+ GEP->getType() != Start->getType())
+ return false;
+
+ if (Explored.count(GEP->getOperand(0)) == 0)
+ WorkList.push_back(GEP->getOperand(0));
+ }
+
+ if (WorkList.back() == V) {
+ WorkList.pop_back();
+ // We've finished visiting this node, mark it as such.
+ Explored.insert(V);
+ }
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // We cannot transform PHIs on unsplittable basic blocks.
+ if (isa<CatchSwitchInst>(PN->getParent()->getTerminator()))
+ return false;
+ Explored.insert(PN);
+ PHIs.insert(PN);
+ }
+ }
+
+ // Explore the PHI nodes further.
+ for (auto *PN : PHIs)
+ for (Value *Op : PN->incoming_values())
+ if (Explored.count(Op) == 0)
+ WorkList.push_back(Op);
+ }
+
+ // Make sure that we can do this. Since we can't insert GEPs in a basic
+ // block before a PHI node, we can't easily do this transformation if
+ // we have PHI node users of transformed instructions.
+ for (Value *Val : Explored) {
+ for (Value *Use : Val->uses()) {
+
+ auto *PHI = dyn_cast<PHINode>(Use);
+ auto *Inst = dyn_cast<Instruction>(Val);
+
+ if (Inst == Base || Inst == PHI || !Inst || !PHI ||
+ Explored.count(PHI) == 0)
+ continue;
+
+ if (PHI->getParent() == Inst->getParent())
+ return false;
+ }
+ }
+ return true;
+}
+
+// Sets the appropriate insert point on Builder where we can add
+// a replacement Instruction for V (if that is possible).
+static void setInsertionPoint(IRBuilder<> &Builder, Value *V,
+ bool Before = true) {
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ Builder.SetInsertPoint(&*PHI->getParent()->getFirstInsertionPt());
+ return;
+ }
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!Before)
+ I = &*std::next(I->getIterator());
+ Builder.SetInsertPoint(I);
+ return;
+ }
+ if (auto *A = dyn_cast<Argument>(V)) {
+ // Set the insertion point in the entry block.
+ BasicBlock &Entry = A->getParent()->getEntryBlock();
+ Builder.SetInsertPoint(&*Entry.getFirstInsertionPt());
+ return;
+ }
+ // Otherwise, this is a constant and we don't need to set a new
+ // insertion point.
+ assert(isa<Constant>(V) && "Setting insertion point for unknown value!");
+}
+
+/// Returns a re-written value of Start as an indexed GEP using Base as a
+/// pointer.
+static Value *rewriteGEPAsOffset(Value *Start, Value *Base,
+ const DataLayout &DL,
+ SetVector<Value *> &Explored) {
+ // Perform all the substitutions. This is a bit tricky because we can
+ // have cycles in our use-def chains.
+ // 1. Create the PHI nodes without any incoming values.
+ // 2. Create all the other values.
+ // 3. Add the edges for the PHI nodes.
+ // 4. Emit GEPs to get the original pointers.
+ // 5. Remove the original instructions.
+ Type *IndexType = IntegerType::get(
+ Base->getContext(), DL.getIndexTypeSizeInBits(Start->getType()));
+
+ DenseMap<Value *, Value *> NewInsts;
+ NewInsts[Base] = ConstantInt::getNullValue(IndexType);
+
+ // Create the new PHI nodes, without adding any incoming values.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // Create empty phi nodes. This avoids cyclic dependencies when creating
+ // the remaining instructions.
+ if (auto *PHI = dyn_cast<PHINode>(Val))
+ NewInsts[PHI] = PHINode::Create(IndexType, PHI->getNumIncomingValues(),
+ PHI->getName() + ".idx", PHI);
+ }
+ IRBuilder<> Builder(Base->getContext());
+
+ // Create all the other instructions.
+ for (Value *Val : Explored) {
+
+ if (NewInsts.find(Val) != NewInsts.end())
+ continue;
+
+ if (auto *CI = dyn_cast<CastInst>(Val)) {
+ // Don't get rid of the intermediate variable here; the store can grow
+ // the map which will invalidate the reference to the input value.
+ Value *V = NewInsts[CI->getOperand(0)];
+ NewInsts[CI] = V;
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GEPOperator>(Val)) {
+ Value *Index = NewInsts[GEP->getOperand(1)] ? NewInsts[GEP->getOperand(1)]
+ : GEP->getOperand(1);
+ setInsertionPoint(Builder, GEP);
+ // Indices might need to be sign extended. GEPs will magically do
+ // this, but we need to do it ourselves here.
+ if (Index->getType()->getScalarSizeInBits() !=
+ NewInsts[GEP->getOperand(0)]->getType()->getScalarSizeInBits()) {
+ Index = Builder.CreateSExtOrTrunc(
+ Index, NewInsts[GEP->getOperand(0)]->getType(),
+ GEP->getOperand(0)->getName() + ".sext");
+ }
+
+ auto *Op = NewInsts[GEP->getOperand(0)];
+ if (isa<ConstantInt>(Op) && cast<ConstantInt>(Op)->isZero())
+ NewInsts[GEP] = Index;
+ else
+ NewInsts[GEP] = Builder.CreateNSWAdd(
+ Op, Index, GEP->getOperand(0)->getName() + ".add");
+ continue;
+ }
+ if (isa<PHINode>(Val))
+ continue;
+
+ llvm_unreachable("Unexpected instruction type");
+ }
+
+ // Add the incoming values to the PHI nodes.
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+ // All the instructions have been created, we can now add edges to the
+ // phi nodes.
+ if (auto *PHI = dyn_cast<PHINode>(Val)) {
+ PHINode *NewPhi = static_cast<PHINode *>(NewInsts[PHI]);
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I < E; ++I) {
+ Value *NewIncoming = PHI->getIncomingValue(I);
+
+ if (NewInsts.find(NewIncoming) != NewInsts.end())
+ NewIncoming = NewInsts[NewIncoming];
+
+ NewPhi->addIncoming(NewIncoming, PHI->getIncomingBlock(I));
+ }
+ }
+ }
+
+ for (Value *Val : Explored) {
+ if (Val == Base)
+ continue;
+
+ // Depending on the type, for external users we have to emit
+ // a GEP or a GEP + ptrtoint.
+ setInsertionPoint(Builder, Val, false);
+
+ // If required, create an inttoptr instruction for Base.
+ Value *NewBase = Base;
+ if (!Base->getType()->isPointerTy())
+ NewBase = Builder.CreateBitOrPointerCast(Base, Start->getType(),
+ Start->getName() + "to.ptr");
+
+ Value *GEP = Builder.CreateInBoundsGEP(
+ Start->getType()->getPointerElementType(), NewBase,
+ makeArrayRef(NewInsts[Val]), Val->getName() + ".ptr");
+
+ if (!Val->getType()->isPointerTy()) {
+ Value *Cast = Builder.CreatePointerCast(GEP, Val->getType(),
+ Val->getName() + ".conv");
+ GEP = Cast;
+ }
+ Val->replaceAllUsesWith(GEP);
+ }
+
+ return NewInsts[Start];
+}
+
+/// Looks through GEPs, IntToPtrInsts and PtrToIntInsts in order to express
+/// the input Value as a constant indexed GEP. Returns a pair containing
+/// the GEP's Pointer and Index.

+static std::pair<Value *, Value *>
+getAsConstantIndexedAddress(Value *V, const DataLayout &DL) {
+ Type *IndexType = IntegerType::get(V->getContext(),
+ DL.getIndexTypeSizeInBits(V->getType()));
+
+ Constant *Index = ConstantInt::getNullValue(IndexType);
+ while (true) {
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+ // We accept only inbounds GEPs here to exclude the possibility of
+ // overflow.
+ if (!GEP->isInBounds())
+ break;
+ if (GEP->hasAllConstantIndices() && GEP->getNumIndices() == 1 &&
+ GEP->getType() == V->getType()) {
+ V = GEP->getOperand(0);
+ Constant *GEPIndex = static_cast<Constant *>(GEP->getOperand(1));
+ Index = ConstantExpr::getAdd(
+ Index, ConstantExpr::getSExtOrBitCast(GEPIndex, IndexType));
+ continue;
+ }
+ break;
+ }
+ if (auto *CI = dyn_cast<IntToPtrInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ if (auto *CI = dyn_cast<PtrToIntInst>(V)) {
+ if (!CI->isNoopCast(DL))
+ break;
+ V = CI->getOperand(0);
+ continue;
+ }
+ break;
+ }
+ return {V, Index};
+}
+
+/// Converts (CMP GEPLHS, RHS) if this change would make RHS a constant.
+/// We can look through PHIs, GEPs and casts in order to determine a common base
+/// between GEPLHS and RHS.
+static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond,
+ const DataLayout &DL) {
+ // FIXME: Support vector of pointers.
+ if (GEPLHS->getType()->isVectorTy())
+ return nullptr;
+
+ if (!GEPLHS->hasAllConstantIndices())
+ return nullptr;
+
+ // Make sure the pointers have the same type.
+ if (GEPLHS->getType() != RHS->getType())
+ return nullptr;
+
+ Value *PtrBase, *Index;
+ std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
+
+ // The set of nodes that will take part in this transformation.
+ SetVector<Value *> Nodes;
+
+ if (!canRewriteGEPAsOffset(RHS, PtrBase, DL, Nodes))
+ return nullptr;
+
+ // We know we can re-write this as
+ // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2))
+ // Since we've only looked through inbounds GEPs we know that we
+ // can't have overflow on either side. We can therefore re-write
+ // this as:
+ // OFFSET1 cmp OFFSET2
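+ // Hypothetical example: if RHS is reachable from the same base %p as GEPLHS,
+ //   icmp ult (gep inbounds i8, i8* %p, i64 8), %rhs
+ // becomes a compare of the two offsets,
+ //   icmp slt i64 8, %rhs.idx
+ // where %rhs.idx stands for the rewritten offset of %rhs produced above.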
+ Value *NewRHS = rewriteGEPAsOffset(RHS, PtrBase, DL, Nodes);
+
+ // rewriteGEPAsOffset has replaced RHS and all of its uses with a re-written
+ // GEP having PtrBase as the pointer base, and has returned in NewRHS the
+ // offset. Since Index is the offset of LHS to the base pointer, we will now
+ // compare the offsets instead of comparing the pointers.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Index, NewRHS);
+}
+
+/// Fold comparisons between a GEP instruction and something else. At this point
+/// we know that the GEP is on the LHS of the comparison.
Instruction *InstCombinerImpl::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
ICmpInst::Predicate Cond,
Instruction &I) {
- // Don't transform signed compares of GEPs into index compares. Even if the
- // GEP is inbounds, the final add of the base pointer can have signed overflow
- // and would change the result of the icmp.
- // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
- // the maximum signed value for the pointer type.
- if (ICmpInst::isSigned(Cond))
- return nullptr;
-
- // Look through bitcasts and addrspacecasts. We do not however want to remove
- // 0 GEPs.
- if (!isa<GetElementPtrInst>(RHS))
- RHS = RHS->stripPointerCasts();
-
- Value *PtrBase = GEPLHS->getOperand(0);
- // FIXME: Support vector pointer GEPs.
- if (PtrBase == RHS && GEPLHS->isInBounds() &&
- !GEPLHS->getType()->isVectorTy()) {
- // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
- // This transformation (ignoring the base and scales) is valid because we
- // know pointers can't overflow since the gep is inbounds. See if we can
- // output an optimized form.
- Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
-
- // If not, synthesize the offset the hard way.
- if (!Offset)
- Offset = EmitGEPOffset(GEPLHS);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
- Constant::getNullValue(Offset->getType()));
- }
-
- if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) &&
- isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() &&
- !NullPointerIsDefined(I.getFunction(),
- RHS->getType()->getPointerAddressSpace())) {
- // For most address spaces, an allocation can't be placed at null, but null
- // itself is treated as a 0 size allocation in the in bounds rules. Thus,
- // the only valid inbounds address derived from null is null itself.
- // Thus, we have four cases to consider:
- // 1) Base == nullptr, Offset == 0 -> inbounds, null
- // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds
- // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations)
- // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison)
- //
- // (Note if we're indexing a type of size 0, that simply collapses into one
- // of the buckets above.)
- //
- // In general, we're allowed to make values less poison (i.e. remove
- // sources of full UB), so in this case, we just select between the two
- // non-poison cases (1 and 4 above).
- //
- // For vectors, we apply the same reasoning on a per-lane basis.
- auto *Base = GEPLHS->getPointerOperand();
- if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) {
+ // Don't transform signed compares of GEPs into index compares. Even if the
+ // GEP is inbounds, the final add of the base pointer can have signed overflow
+ // and would change the result of the icmp.
+ // e.g. "&foo[0] <s &foo[1]" can't be folded to "true" because "foo" could be
+ // the maximum signed value for the pointer type.
+ if (ICmpInst::isSigned(Cond))
+ return nullptr;
+
+ // Look through bitcasts and addrspacecasts. We do not however want to remove
+ // 0 GEPs.
+ if (!isa<GetElementPtrInst>(RHS))
+ RHS = RHS->stripPointerCasts();
+
+ Value *PtrBase = GEPLHS->getOperand(0);
+ // FIXME: Support vector pointer GEPs.
+ if (PtrBase == RHS && GEPLHS->isInBounds() &&
+ !GEPLHS->getType()->isVectorTy()) {
+ // ((gep Ptr, OFFSET) cmp Ptr) ---> (OFFSET cmp 0).
+ // This transformation (ignoring the base and scales) is valid because we
+ // know pointers can't overflow since the gep is inbounds. See if we can
+ // output an optimized form.
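+ // Hypothetical example:
+ //   icmp ugt (gep inbounds i8, i8* %p, i64 %n), %p  -->  icmp sgt i64 %n, 0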
+ Value *Offset = evaluateGEPOffsetExpression(GEPLHS, *this, DL);
+
+ // If not, synthesize the offset the hard way.
+ if (!Offset)
+ Offset = EmitGEPOffset(GEPLHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+ Constant::getNullValue(Offset->getType()));
+ }
+
+ if (GEPLHS->isInBounds() && ICmpInst::isEquality(Cond) &&
+ isa<Constant>(RHS) && cast<Constant>(RHS)->isNullValue() &&
+ !NullPointerIsDefined(I.getFunction(),
+ RHS->getType()->getPointerAddressSpace())) {
+ // For most address spaces, an allocation can't be placed at null, but null
+ // itself is treated as a 0 size allocation in the in bounds rules. Thus,
+ // the only valid inbounds address derived from null is null itself.
+ // Thus, we have four cases to consider:
+ // 1) Base == nullptr, Offset == 0 -> inbounds, null
+ // 2) Base == nullptr, Offset != 0 -> poison as the result is out of bounds
+ // 3) Base != nullptr, Offset == (-base) -> poison (crossing allocations)
+ // 4) Base != nullptr, Offset != (-base) -> nonnull (and possibly poison)
+ //
+ // (Note if we're indexing a type of size 0, that simply collapses into one
+ // of the buckets above.)
+ //
+ // In general, we're allowed to make values less poison (i.e. remove
+ // sources of full UB), so in this case, we just select between the two
+ // non-poison cases (1 and 4 above).
+ //
+ // For vectors, we apply the same reasoning on a per-lane basis.
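+ // Hypothetical example (default address space):
+ //   icmp eq (gep inbounds i8, i8* %base, i64 %n), null
+ //     -->  icmp eq i8* %base, null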
+ auto *Base = GEPLHS->getPointerOperand();
+ if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) {
auto EC = cast<VectorType>(GEPLHS->getType())->getElementCount();
Base = Builder.CreateVectorSplat(EC, Base);
- }
- return new ICmpInst(Cond, Base,
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(
- cast<Constant>(RHS), Base->getType()));
- } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
- // If the base pointers are different, but the indices are the same, just
- // compare the base pointer.
- if (PtrBase != GEPRHS->getOperand(0)) {
- bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
- IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
- GEPRHS->getOperand(0)->getType();
- if (IndicesTheSame)
- for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
- if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
- IndicesTheSame = false;
- break;
- }
-
- // If all indices are the same, just compare the base pointers.
- Type *BaseType = GEPLHS->getOperand(0)->getType();
- if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
- return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
-
- // If we're comparing GEPs with two base pointers that only differ in type
- // and both GEPs have only constant indices or just one use, then fold
- // the compare with the adjusted indices.
- // FIXME: Support vector of pointers.
- if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
- (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
- (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
- PtrBase->stripPointerCasts() ==
- GEPRHS->getOperand(0)->stripPointerCasts() &&
- !GEPLHS->getType()->isVectorTy()) {
- Value *LOffset = EmitGEPOffset(GEPLHS);
- Value *ROffset = EmitGEPOffset(GEPRHS);
-
- // If we looked through an addrspacecast between different sized address
- // spaces, the LHS and RHS pointers are different sized
- // integers. Truncate to the smaller one.
- Type *LHSIndexTy = LOffset->getType();
- Type *RHSIndexTy = ROffset->getType();
- if (LHSIndexTy != RHSIndexTy) {
+ }
+ return new ICmpInst(Cond, Base,
+ ConstantExpr::getPointerBitCastOrAddrSpaceCast(
+ cast<Constant>(RHS), Base->getType()));
+ } else if (GEPOperator *GEPRHS = dyn_cast<GEPOperator>(RHS)) {
+ // If the base pointers are different, but the indices are the same, just
+ // compare the base pointer.
+ if (PtrBase != GEPRHS->getOperand(0)) {
+ bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+ IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+ GEPRHS->getOperand(0)->getType();
+ if (IndicesTheSame)
+ for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ IndicesTheSame = false;
+ break;
+ }
+
+ // If all indices are the same, just compare the base pointers.
+ Type *BaseType = GEPLHS->getOperand(0)->getType();
+ if (IndicesTheSame && CmpInst::makeCmpResultType(BaseType) == I.getType())
+ return new ICmpInst(Cond, GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+ // If we're comparing GEPs with two base pointers that only differ in type
+ // and both GEPs have only constant indices or just one use, then fold
+ // the compare with the adjusted indices.
+ // FIXME: Support vector of pointers.
+ if (GEPLHS->isInBounds() && GEPRHS->isInBounds() &&
+ (GEPLHS->hasAllConstantIndices() || GEPLHS->hasOneUse()) &&
+ (GEPRHS->hasAllConstantIndices() || GEPRHS->hasOneUse()) &&
+ PtrBase->stripPointerCasts() ==
+ GEPRHS->getOperand(0)->stripPointerCasts() &&
+ !GEPLHS->getType()->isVectorTy()) {
+ Value *LOffset = EmitGEPOffset(GEPLHS);
+ Value *ROffset = EmitGEPOffset(GEPRHS);
+
+ // If we looked through an addrspacecast between different sized address
+ // spaces, the LHS and RHS pointers are different sized
+ // integers. Truncate to the smaller one.
+ Type *LHSIndexTy = LOffset->getType();
+ Type *RHSIndexTy = ROffset->getType();
+ if (LHSIndexTy != RHSIndexTy) {
if (LHSIndexTy->getPrimitiveSizeInBits().getFixedSize() <
RHSIndexTy->getPrimitiveSizeInBits().getFixedSize()) {
- ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
- } else
- LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
- }
-
- Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
- LOffset, ROffset);
- return replaceInstUsesWith(I, Cmp);
- }
-
- // Otherwise, the base pointers are different and the indices are
- // different. Try to convert this to an indexed compare by looking through
- // PHIs/casts.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
- }
-
- // If one of the GEPs has all zero indices, recurse.
- // FIXME: Handle vector of pointers.
- if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices())
- return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
- ICmpInst::getSwappedPredicate(Cond), I);
-
- // If the other GEP has all zero indices, recurse.
- // FIXME: Handle vector of pointers.
- if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices())
- return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
-
- bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
- if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
- // If the GEPs only differ by one index, compare it.
- unsigned NumDifferences = 0; // Keep track of # differences.
- unsigned DiffOperand = 0; // The operand that differs.
- for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
- if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
- Type *LHSType = GEPLHS->getOperand(i)->getType();
- Type *RHSType = GEPRHS->getOperand(i)->getType();
- // FIXME: Better support for vector of pointers.
- if (LHSType->getPrimitiveSizeInBits() !=
- RHSType->getPrimitiveSizeInBits() ||
- (GEPLHS->getType()->isVectorTy() &&
- (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) {
- // Irreconcilable differences.
- NumDifferences = 2;
- break;
- }
-
- if (NumDifferences++) break;
- DiffOperand = i;
- }
-
- if (NumDifferences == 0) // SAME GEP?
- return replaceInstUsesWith(I, // No comparison is needed here.
- ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
-
- else if (NumDifferences == 1 && GEPsInBounds) {
- Value *LHSV = GEPLHS->getOperand(DiffOperand);
- Value *RHSV = GEPRHS->getOperand(DiffOperand);
- // Make sure we do a signed comparison here.
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
- }
- }
-
- // Only lower this if the icmp is the only user of the GEP or if we expect
- // the result to fold to a constant!
- if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
- (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
- // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
- Value *L = EmitGEPOffset(GEPLHS);
- Value *R = EmitGEPOffset(GEPRHS);
- return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
- }
- }
-
- // Try to convert this to an indexed compare by looking through PHIs/casts as a
- // last resort.
- return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
-}
-
+ ROffset = Builder.CreateTrunc(ROffset, LHSIndexTy);
+ } else
+ LOffset = Builder.CreateTrunc(LOffset, RHSIndexTy);
+ }
+
+ Value *Cmp = Builder.CreateICmp(ICmpInst::getSignedPredicate(Cond),
+ LOffset, ROffset);
+ return replaceInstUsesWith(I, Cmp);
+ }
+
+ // Otherwise, the base pointers are different and the indices are
+ // different. Try to convert this to an indexed compare by looking through
+ // PHIs/casts.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+ }
+
+ // If one of the GEPs has all zero indices, recurse.
+ // FIXME: Handle vector of pointers.
+ if (!GEPLHS->getType()->isVectorTy() && GEPLHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+ ICmpInst::getSwappedPredicate(Cond), I);
+
+ // If the other GEP has all zero indices, recurse.
+ // FIXME: Handle vector of pointers.
+ if (!GEPRHS->getType()->isVectorTy() && GEPRHS->hasAllZeroIndices())
+ return foldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+ bool GEPsInBounds = GEPLHS->isInBounds() && GEPRHS->isInBounds();
+ if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+ // If the GEPs only differ by one index, compare it.
+ unsigned NumDifferences = 0; // Keep track of # differences.
+ unsigned DiffOperand = 0; // The operand that differs.
+ for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+ if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+ Type *LHSType = GEPLHS->getOperand(i)->getType();
+ Type *RHSType = GEPRHS->getOperand(i)->getType();
+ // FIXME: Better support for vector of pointers.
+ if (LHSType->getPrimitiveSizeInBits() !=
+ RHSType->getPrimitiveSizeInBits() ||
+ (GEPLHS->getType()->isVectorTy() &&
+ (!LHSType->isVectorTy() || !RHSType->isVectorTy()))) {
+ // Irreconcilable differences.
+ NumDifferences = 2;
+ break;
+ }
+
+ if (NumDifferences++) break;
+ DiffOperand = i;
+ }
+
+ if (NumDifferences == 0) // SAME GEP?
+ return replaceInstUsesWith(I, // No comparison is needed here.
+ ConstantInt::get(I.getType(), ICmpInst::isTrueWhenEqual(Cond)));
+
+ else if (NumDifferences == 1 && GEPsInBounds) {
+ Value *LHSV = GEPLHS->getOperand(DiffOperand);
+ Value *RHSV = GEPRHS->getOperand(DiffOperand);
+ // Make sure we do a signed comparison here.
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+ }
+ }
+
+ // Only lower this if the icmp is the only user of the GEP or if we expect
+ // the result to fold to a constant!
+ if (GEPsInBounds && (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+ (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+ // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)) ---> (OFFSET1 cmp OFFSET2)
+ Value *L = EmitGEPOffset(GEPLHS);
+ Value *R = EmitGEPOffset(GEPRHS);
+ return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+ }
+ }
+
+ // Try to convert this to an indexed compare by looking through PHIs/casts as a
+ // last resort.
+ return transformToIndexedCompare(GEPLHS, RHS, Cond, DL);
+}
+
Instruction *InstCombinerImpl::foldAllocaCmp(ICmpInst &ICI,
const AllocaInst *Alloca,
const Value *Other) {
- assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
-
- // It would be tempting to fold away comparisons between allocas and any
- // pointer not based on that alloca (e.g. an argument). However, even
- // though such pointers cannot alias, they can still compare equal.
- //
- // But LLVM doesn't specify where allocas get their memory, so if the alloca
- // doesn't escape we can argue that it's impossible to guess its value, and we
- // can therefore act as if any such guesses are wrong.
- //
- // The code below checks that the alloca doesn't escape, and that it's only
- // used in a comparison once (the current instruction). The
- // single-comparison-use condition ensures that we're trivially folding all
- // comparisons against the alloca consistently, and avoids the risk of
- // erroneously folding a comparison of the pointer with itself.
-
- unsigned MaxIter = 32; // Break cycles and bound to constant-time.
-
- SmallVector<const Use *, 32> Worklist;
- for (const Use &U : Alloca->uses()) {
- if (Worklist.size() >= MaxIter)
- return nullptr;
- Worklist.push_back(&U);
- }
-
- unsigned NumCmps = 0;
- while (!Worklist.empty()) {
- assert(Worklist.size() <= MaxIter);
- const Use *U = Worklist.pop_back_val();
- const Value *V = U->getUser();
- --MaxIter;
-
- if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
- isa<SelectInst>(V)) {
- // Track the uses.
- } else if (isa<LoadInst>(V)) {
- // Loading from the pointer doesn't escape it.
- continue;
- } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
- // Storing *to* the pointer is fine, but storing the pointer escapes it.
- if (SI->getValueOperand() == U->get())
- return nullptr;
- continue;
- } else if (isa<ICmpInst>(V)) {
- if (NumCmps++)
- return nullptr; // Found more than one cmp.
- continue;
- } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
- switch (Intrin->getIntrinsicID()) {
- // These intrinsics don't escape or compare the pointer. Memset is safe
- // because we don't allow ptrtoint. Memcpy and memmove are safe because
- // we don't allow stores, so src cannot point to V.
- case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
- case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
- continue;
- default:
- return nullptr;
- }
- } else {
- return nullptr;
- }
- for (const Use &U : V->uses()) {
- if (Worklist.size() >= MaxIter)
- return nullptr;
- Worklist.push_back(&U);
- }
- }
-
- Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
- return replaceInstUsesWith(
- ICI,
- ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
-}
-
-/// Fold "icmp pred (X+C), X".
+ assert(ICI.isEquality() && "Cannot fold non-equality comparison.");
+
+ // It would be tempting to fold away comparisons between allocas and any
+ // pointer not based on that alloca (e.g. an argument). However, even
+ // though such pointers cannot alias, they can still compare equal.
+ //
+ // But LLVM doesn't specify where allocas get their memory, so if the alloca
+ // doesn't escape we can argue that it's impossible to guess its value, and we
+ // can therefore act as if any such guesses are wrong.
+ //
+ // The code below checks that the alloca doesn't escape, and that it's only
+ // used in a comparison once (the current instruction). The
+ // single-comparison-use condition ensures that we're trivially folding all
+ // comparisons against the alloca consistently, and avoids the risk of
+ // erroneously folding a comparison of the pointer with itself.
+
+ unsigned MaxIter = 32; // Break cycles and bound to constant-time.
+
+ SmallVector<const Use *, 32> Worklist;
+ for (const Use &U : Alloca->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+
+ unsigned NumCmps = 0;
+ while (!Worklist.empty()) {
+ assert(Worklist.size() <= MaxIter);
+ const Use *U = Worklist.pop_back_val();
+ const Value *V = U->getUser();
+ --MaxIter;
+
+ if (isa<BitCastInst>(V) || isa<GetElementPtrInst>(V) || isa<PHINode>(V) ||
+ isa<SelectInst>(V)) {
+ // Track the uses.
+ } else if (isa<LoadInst>(V)) {
+ // Loading from the pointer doesn't escape it.
+ continue;
+ } else if (const auto *SI = dyn_cast<StoreInst>(V)) {
+ // Storing *to* the pointer is fine, but storing the pointer escapes it.
+ if (SI->getValueOperand() == U->get())
+ return nullptr;
+ continue;
+ } else if (isa<ICmpInst>(V)) {
+ if (NumCmps++)
+ return nullptr; // Found more than one cmp.
+ continue;
+ } else if (const auto *Intrin = dyn_cast<IntrinsicInst>(V)) {
+ switch (Intrin->getIntrinsicID()) {
+ // These intrinsics don't escape or compare the pointer. Memset is safe
+ // because we don't allow ptrtoint. Memcpy and memmove are safe because
+ // we don't allow stores, so src cannot point to V.
+ case Intrinsic::lifetime_start: case Intrinsic::lifetime_end:
+ case Intrinsic::memcpy: case Intrinsic::memmove: case Intrinsic::memset:
+ continue;
+ default:
+ return nullptr;
+ }
+ } else {
+ return nullptr;
+ }
+ for (const Use &U : V->uses()) {
+ if (Worklist.size() >= MaxIter)
+ return nullptr;
+ Worklist.push_back(&U);
+ }
+ }
+
+ Type *CmpTy = CmpInst::makeCmpResultType(Other->getType());
+ return replaceInstUsesWith(
+ ICI,
+ ConstantInt::get(CmpTy, !CmpInst::isTrueWhenEqual(ICI.getPredicate())));
+}
+
+/// Fold "icmp pred (X+C), X".
Instruction *InstCombinerImpl::foldICmpAddOpConst(Value *X, const APInt &C,
ICmpInst::Predicate Pred) {
- // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
- // so the values can never be equal. Similarly for all other "or equals"
- // operators.
- assert(!!C && "C should not be zero!");
-
- // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
- // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
- // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
- if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
- Constant *R = ConstantInt::get(X->getType(),
- APInt::getMaxValue(C.getBitWidth()) - C);
- return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
- }
-
- // (X+1) >u X --> X <u (0-1) --> X != 255
- // (X+2) >u X --> X <u (0-2) --> X <u 254
- // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
- return new ICmpInst(ICmpInst::ICMP_ULT, X,
- ConstantInt::get(X->getType(), -C));
-
- APInt SMax = APInt::getSignedMaxValue(C.getBitWidth());
-
- // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
- // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
- // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
- // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
- // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126
- // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
- if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_SGT, X,
- ConstantInt::get(X->getType(), SMax - C));
-
- // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
- // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
- // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
- // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
- // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
- // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
-
- assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
- return new ICmpInst(ICmpInst::ICMP_SLT, X,
- ConstantInt::get(X->getType(), SMax - (C - 1)));
-}
-
-/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
-/// (icmp eq/ne A, Log2(AP2/AP1)) ->
-/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
+ // From this point on, we know that (X+C <= X) --> (X+C < X) because C != 0,
+ // so the values can never be equal. Similarly for all other "or equals"
+ // operators.
+ assert(!!C && "C should not be zero!");
+
+ // (X+1) <u X --> X >u (MAXUINT-1) --> X == 255
+ // (X+2) <u X --> X >u (MAXUINT-2) --> X > 253
+ // (X+MAXUINT) <u X --> X >u (MAXUINT-MAXUINT) --> X != 0
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE) {
+ Constant *R = ConstantInt::get(X->getType(),
+ APInt::getMaxValue(C.getBitWidth()) - C);
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, R);
+ }
+
+ // (X+1) >u X --> X <u (0-1) --> X != 255
+ // (X+2) >u X --> X <u (0-2) --> X <u 254
+ // (X+MAXUINT) >u X --> X <u (0-MAXUINT) --> X <u 1 --> X == 0
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(X->getType(), -C));
+
+ APInt SMax = APInt::getSignedMaxValue(C.getBitWidth());
+
+ // (X+ 1) <s X --> X >s (MAXSINT-1) --> X == 127
+ // (X+ 2) <s X --> X >s (MAXSINT-2) --> X >s 125
+ // (X+MAXSINT) <s X --> X >s (MAXSINT-MAXSINT) --> X >s 0
+ // (X+MINSINT) <s X --> X >s (MAXSINT-MINSINT) --> X >s -1
+ // (X+ -2) <s X --> X >s (MAXSINT- -2) --> X >s 126
+ // (X+ -1) <s X --> X >s (MAXSINT- -1) --> X != 127
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X,
+ ConstantInt::get(X->getType(), SMax - C));
+
+ // (X+ 1) >s X --> X <s (MAXSINT-(1-1)) --> X != 127
+ // (X+ 2) >s X --> X <s (MAXSINT-(2-1)) --> X <s 126
+ // (X+MAXSINT) >s X --> X <s (MAXSINT-(MAXSINT-1)) --> X <s 1
+ // (X+MINSINT) >s X --> X <s (MAXSINT-(MINSINT-1)) --> X <s -2
+ // (X+ -2) >s X --> X <s (MAXSINT-(-2-1)) --> X <s -126
+ // (X+ -1) >s X --> X <s (MAXSINT-(-1-1)) --> X == -128
+
+ assert(Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE);
+ return new ICmpInst(ICmpInst::ICMP_SLT, X,
+ ConstantInt::get(X->getType(), SMax - (C - 1)));
+}
+
+/// Handle "(icmp eq/ne (ashr/lshr AP2, A), AP1)" ->
+/// (icmp eq/ne A, Log2(AP2/AP1)) ->
+/// (icmp eq/ne A, Log2(AP2) - Log2(AP1)).
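+/// A hypothetical instance: icmp eq (lshr i32 64, %a), 8 --> icmp eq i32 %a, 3,
+/// since 64 >> 3 == 8.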
Instruction *InstCombinerImpl::foldICmpShrConstConst(ICmpInst &I, Value *A,
const APInt &AP1,
const APInt &AP2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
-
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
-
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
- return nullptr;
-
- bool IsAShr = isa<AShrOperator>(I.getOperand(0));
- if (IsAShr) {
- if (AP2.isAllOnesValue())
- return nullptr;
- if (AP2.isNegative() != AP1.isNegative())
- return nullptr;
- if (AP2.sgt(AP1))
- return nullptr;
- }
-
- if (!AP1)
- // 'A' must be large enough to shift out the highest set bit.
- return getICmp(I.ICMP_UGT, A,
- ConstantInt::get(A->getType(), AP2.logBase2()));
-
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
-
- int Shift;
- if (IsAShr && AP1.isNegative())
- Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
- else
- Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
-
- if (Shift > 0) {
- if (IsAShr && AP1 == AP2.ashr(Shift)) {
- // There are multiple solutions if we are comparing against -1 and the LHS
- // of the ashr is not a power of two.
- if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
- return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
- } else if (AP1 == AP2.lshr(Shift)) {
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
- }
- }
-
- // Shifting const2 will never be equal to const1.
- // FIXME: This should always be handled by InstSimplify?
- auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
- return replaceInstUsesWith(I, TorF);
-}
-
-/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
-/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ bool IsAShr = isa<AShrOperator>(I.getOperand(0));
+ if (IsAShr) {
+ if (AP2.isAllOnesValue())
+ return nullptr;
+ if (AP2.isNegative() != AP1.isNegative())
+ return nullptr;
+ if (AP2.sgt(AP1))
+ return nullptr;
+ }
+
+ if (!AP1)
+ // 'A' must be large enough to shift out the highest set bit.
+ return getICmp(I.ICMP_UGT, A,
+ ConstantInt::get(A->getType(), AP2.logBase2()));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ int Shift;
+ if (IsAShr && AP1.isNegative())
+ Shift = AP1.countLeadingOnes() - AP2.countLeadingOnes();
+ else
+ Shift = AP1.countLeadingZeros() - AP2.countLeadingZeros();
+
+ if (Shift > 0) {
+ if (IsAShr && AP1 == AP2.ashr(Shift)) {
+ // There are multiple solutions if we are comparing against -1 and the LHS
+ // of the ashr is not a power of two.
+ if (AP1.isAllOnesValue() && !AP2.isPowerOf2())
+ return getICmp(I.ICMP_UGE, A, ConstantInt::get(A->getType(), Shift));
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ } else if (AP1 == AP2.lshr(Shift)) {
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+ }
+ }
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// Handle "(icmp eq/ne (shl AP2, A), AP1)" ->
+/// (icmp eq/ne A, TrailingZeros(AP1) - TrailingZeros(AP2)).
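+/// A hypothetical instance: icmp eq (shl i32 4, %a), 32 --> icmp eq i32 %a, 3,
+/// since 4 << 3 == 32.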
Instruction *InstCombinerImpl::foldICmpShlConstConst(ICmpInst &I, Value *A,
const APInt &AP1,
const APInt &AP2) {
- assert(I.isEquality() && "Cannot fold icmp gt/lt");
-
- auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
- if (I.getPredicate() == I.ICMP_NE)
- Pred = CmpInst::getInversePredicate(Pred);
- return new ICmpInst(Pred, LHS, RHS);
- };
-
- // Don't bother doing any work for cases which InstSimplify handles.
- if (AP2.isNullValue())
- return nullptr;
-
- unsigned AP2TrailingZeros = AP2.countTrailingZeros();
-
- if (!AP1 && AP2TrailingZeros != 0)
- return getICmp(
- I.ICMP_UGE, A,
- ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
-
- if (AP1 == AP2)
- return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
-
- // Get the distance between the lowest bits that are set.
- int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
-
- if (Shift > 0 && AP2.shl(Shift) == AP1)
- return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
-
- // Shifting const2 will never be equal to const1.
- // FIXME: This should always be handled by InstSimplify?
- auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
- return replaceInstUsesWith(I, TorF);
-}
-
-/// The caller has matched a pattern of the form:
-/// I = icmp ugt (add (add A, B), CI2), CI1
-/// If this is of the form:
-/// sum = a + b
-/// if (sum+128 >u 255)
-/// Then replace it with llvm.sadd.with.overflow.i8.
-///
-static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
- ConstantInt *CI2, ConstantInt *CI1,
+ assert(I.isEquality() && "Cannot fold icmp gt/lt");
+
+ auto getICmp = [&I](CmpInst::Predicate Pred, Value *LHS, Value *RHS) {
+ if (I.getPredicate() == I.ICMP_NE)
+ Pred = CmpInst::getInversePredicate(Pred);
+ return new ICmpInst(Pred, LHS, RHS);
+ };
+
+ // Don't bother doing any work for cases which InstSimplify handles.
+ if (AP2.isNullValue())
+ return nullptr;
+
+ unsigned AP2TrailingZeros = AP2.countTrailingZeros();
+
+ if (!AP1 && AP2TrailingZeros != 0)
+ return getICmp(
+ I.ICMP_UGE, A,
+ ConstantInt::get(A->getType(), AP2.getBitWidth() - AP2TrailingZeros));
+
+ if (AP1 == AP2)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::getNullValue(A->getType()));
+
+ // Get the distance between the lowest bits that are set.
+ int Shift = AP1.countTrailingZeros() - AP2TrailingZeros;
+
+ if (Shift > 0 && AP2.shl(Shift) == AP1)
+ return getICmp(I.ICMP_EQ, A, ConstantInt::get(A->getType(), Shift));
+
+ // Shifting const2 will never be equal to const1.
+ // FIXME: This should always be handled by InstSimplify?
+ auto *TorF = ConstantInt::get(I.getType(), I.getPredicate() == I.ICMP_NE);
+ return replaceInstUsesWith(I, TorF);
+}
+
+/// The caller has matched a pattern of the form:
+/// I = icmp ugt (add (add A, B), CI2), CI1
+/// If this is of the form:
+/// sum = a + b
+/// if (sum+128 >u 255)
+/// Then replace it with llvm.sadd.with.overflow.i8.
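+/// A hypothetical IR sketch (%a and %b assumed to be i32 values sign-extended
+/// from i8):
+///   %sum = add i32 %a, %b
+///   %off = add i32 %sum, 128
+///   %cmp = icmp ugt i32 %off, 255
+/// becomes, roughly,
+///   %sadd = call { i8, i1 } @llvm.sadd.with.overflow.i8(i8 %a.trunc, i8 %b.trunc)
+///   %cmp  = extractvalue { i8, i1 } %sadd, 1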
+///
+static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
+ ConstantInt *CI2, ConstantInt *CI1,
InstCombinerImpl &IC) {
- // The goal here is to transform this pattern into an
- // llvm.sadd.with.overflow. To do this, we have to replace the original add
- // with a narrower add, and discard the add-with-constant that is part of the
- // range check (if we can't eliminate it, this isn't profitable).
-
- // In order to eliminate the add-with-constant, the compare must be its only
- // use.
- Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
- if (!AddWithCst->hasOneUse())
- return nullptr;
-
- // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
- if (!CI2->getValue().isPowerOf2())
- return nullptr;
- unsigned NewWidth = CI2->getValue().countTrailingZeros();
- if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
- return nullptr;
-
- // The width of the new add formed is 1 more than the bias.
- ++NewWidth;
-
- // Check to see that CI1 is an all-ones value with NewWidth bits.
- if (CI1->getBitWidth() == NewWidth ||
- CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
- return nullptr;
-
- // This is only really a signed overflow check if the inputs have been
- // sign-extended; check for that condition. For example, if CI2 is 2^31 and
- // the operands of the add are 64 bits wide, we need at least 33 sign bits.
- unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
- if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
- IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
- return nullptr;
-
- // In order to replace the original add with a narrower
- // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
- // and truncates that discard the high bits of the add. Verify that this is
- // the case.
- Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
- for (User *U : OrigAdd->users()) {
- if (U == AddWithCst)
- continue;
-
- // Only accept truncates for now. We would really like a nice recursive
- // predicate like SimplifyDemandedBits, but one that goes down the use-def
- // chain to see which bits of a value are actually demanded. If the
- // original add had another add which was then immediately truncated, we
- // could still do the transformation.
- TruncInst *TI = dyn_cast<TruncInst>(U);
- if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
- return nullptr;
- }
-
- // If the pattern matches, truncate the inputs to the narrower type and
- // use the sadd_with_overflow intrinsic to efficiently compute both the
- // result and the overflow bit.
- Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::sadd_with_overflow, NewType);
-
- InstCombiner::BuilderTy &Builder = IC.Builder;
-
- // Put the new code above the original add, in case there are any uses of the
- // add between the add and the compare.
- Builder.SetInsertPoint(OrigAdd);
-
- Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
- Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
- CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
- Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
- Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
-
- // The inner add was the result of the narrow add, zero extended to the
- // wider type. Replace it with the result computed by the intrinsic.
- IC.replaceInstUsesWith(*OrigAdd, ZExt);
- IC.eraseInstFromFunction(*OrigAdd);
-
- // The original icmp gets replaced with the overflow value.
- return ExtractValueInst::Create(Call, 1, "sadd.overflow");
-}
-
-/// If we have:
-/// icmp eq/ne (urem/srem %x, %y), 0
-/// iff %y is a power-of-two, we can replace this with a bit test:
-/// icmp eq/ne (and %x, (add %y, -1)), 0
+ // The goal here is to transform this pattern into an
+ // llvm.sadd.with.overflow. To do this, we have to replace the original add
+ // with a narrower add, and discard the add-with-constant that is part of the
+ // range check (if we can't eliminate it, this isn't profitable).
+
+ // In order to eliminate the add-with-constant, the compare must be its only
+ // use.
+ Instruction *AddWithCst = cast<Instruction>(I.getOperand(0));
+ if (!AddWithCst->hasOneUse())
+ return nullptr;
+
+ // If CI2 is 2^7, 2^15, 2^31, then it might be an sadd.with.overflow.
+ if (!CI2->getValue().isPowerOf2())
+ return nullptr;
+ unsigned NewWidth = CI2->getValue().countTrailingZeros();
+ if (NewWidth != 7 && NewWidth != 15 && NewWidth != 31)
+ return nullptr;
+
+ // The width of the new add formed is 1 more than the bias.
+ ++NewWidth;
+
+ // Check to see that CI1 is an all-ones value with NewWidth bits.
+ if (CI1->getBitWidth() == NewWidth ||
+ CI1->getValue() != APInt::getLowBitsSet(CI1->getBitWidth(), NewWidth))
+ return nullptr;
+
+ // This is only really a signed overflow check if the inputs have been
+ // sign-extended; check for that condition. For example, if CI2 is 2^31 and
+ // the operands of the add are 64 bits wide, we need at least 33 sign bits.
+ unsigned NeededSignBits = CI1->getBitWidth() - NewWidth + 1;
+ if (IC.ComputeNumSignBits(A, 0, &I) < NeededSignBits ||
+ IC.ComputeNumSignBits(B, 0, &I) < NeededSignBits)
+ return nullptr;
+
+ // In order to replace the original add with a narrower
+ // llvm.sadd.with.overflow, the only uses allowed are the add-with-constant
+ // and truncates that discard the high bits of the add. Verify that this is
+ // the case.
+ Instruction *OrigAdd = cast<Instruction>(AddWithCst->getOperand(0));
+ for (User *U : OrigAdd->users()) {
+ if (U == AddWithCst)
+ continue;
+
+ // Only accept truncates for now. We would really like a nice recursive
+ // predicate like SimplifyDemandedBits, but one that goes down the use-def
+ // chain to see which bits of a value are actually demanded. If the
+ // original add had another add which was then immediately truncated, we
+ // could still do the transformation.
+ TruncInst *TI = dyn_cast<TruncInst>(U);
+ if (!TI || TI->getType()->getPrimitiveSizeInBits() > NewWidth)
+ return nullptr;
+ }
+
+ // If the pattern matches, truncate the inputs to the narrower type and
+ // use the sadd_with_overflow intrinsic to efficiently compute both the
+ // result and the overflow bit.
+ Type *NewType = IntegerType::get(OrigAdd->getContext(), NewWidth);
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::sadd_with_overflow, NewType);
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+
+ // Put the new code above the original add, in case there are any uses of the
+ // add between the add and the compare.
+ Builder.SetInsertPoint(OrigAdd);
+
+ Value *TruncA = Builder.CreateTrunc(A, NewType, A->getName() + ".trunc");
+ Value *TruncB = Builder.CreateTrunc(B, NewType, B->getName() + ".trunc");
+ CallInst *Call = Builder.CreateCall(F, {TruncA, TruncB}, "sadd");
+ Value *Add = Builder.CreateExtractValue(Call, 0, "sadd.result");
+ Value *ZExt = Builder.CreateZExt(Add, OrigAdd->getType());
+
+ // The inner add was the result of the narrow add, zero extended to the
+ // wider type. Replace it with the result computed by the intrinsic.
+ IC.replaceInstUsesWith(*OrigAdd, ZExt);
+ IC.eraseInstFromFunction(*OrigAdd);
+
+ // The original icmp gets replaced with the overflow value.
+ return ExtractValueInst::Create(Call, 1, "sadd.overflow");
+}
+
+/// If we have:
+/// icmp eq/ne (urem/srem %x, %y), 0
+/// iff %y is a power-of-two, we can replace this with a bit test:
+/// icmp eq/ne (and %x, (add %y, -1)), 0
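+/// E.g. (hypothetical): icmp eq (urem i32 %x, 8), 0 --> icmp eq (and i32 %x, 7), 0.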
Instruction *InstCombinerImpl::foldIRemByPowerOfTwoToBitTest(ICmpInst &I) {
- // This fold is only valid for equality predicates.
- if (!I.isEquality())
- return nullptr;
- ICmpInst::Predicate Pred;
- Value *X, *Y, *Zero;
- if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
- m_CombineAnd(m_Zero(), m_Value(Zero)))))
- return nullptr;
- if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I))
- return nullptr;
- // This may increase the instruction count; we don't require Y to be a constant.
- Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType()));
- Value *Masked = Builder.CreateAnd(X, Mask);
- return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero);
-}
-
-/// Fold equality-comparison between zero and any (maybe truncated) right-shift
-/// by one-less-than-bitwidth into a sign test on the original value.
+ // This fold is only valid for equality predicates.
+ if (!I.isEquality())
+ return nullptr;
+ ICmpInst::Predicate Pred;
+ Value *X, *Y, *Zero;
+ if (!match(&I, m_ICmp(Pred, m_OneUse(m_IRem(m_Value(X), m_Value(Y))),
+ m_CombineAnd(m_Zero(), m_Value(Zero)))))
+ return nullptr;
+ if (!isKnownToBeAPowerOfTwo(Y, /*OrZero*/ true, 0, &I))
+ return nullptr;
+ // This may increase the instruction count; we don't require Y to be a constant.
+ Value *Mask = Builder.CreateAdd(Y, Constant::getAllOnesValue(Y->getType()));
+ Value *Masked = Builder.CreateAnd(X, Mask);
+ return ICmpInst::Create(Instruction::ICmp, Pred, Masked, Zero);
+}
+
+/// Fold equality-comparison between zero and any (maybe truncated) right-shift
+/// by one-less-than-bitwidth into a sign test on the original value.
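+/// E.g. (hypothetical): icmp eq (lshr i32 %x, 31), 0 --> icmp sge i32 %x, 0,
+/// and icmp ne (lshr i32 %x, 31), 0 --> icmp slt i32 %x, 0.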
Instruction *InstCombinerImpl::foldSignBitTest(ICmpInst &I) {
- Instruction *Val;
- ICmpInst::Predicate Pred;
- if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
- return nullptr;
-
- Value *X;
- Type *XTy;
-
- Constant *C;
- if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) {
- XTy = X->getType();
- unsigned XBitWidth = XTy->getScalarSizeInBits();
- if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(XBitWidth, XBitWidth - 1))))
- return nullptr;
- } else if (isa<BinaryOperator>(Val) &&
- (X = reassociateShiftAmtsOfTwoSameDirectionShifts(
- cast<BinaryOperator>(Val), SQ.getWithInstruction(Val),
- /*AnalyzeForSignBitExtraction=*/true))) {
- XTy = X->getType();
- } else
- return nullptr;
-
- return ICmpInst::Create(Instruction::ICmp,
- Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE
- : ICmpInst::ICMP_SLT,
- X, ConstantInt::getNullValue(XTy));
-}
-
-// Handle icmp pred X, 0
+ Instruction *Val;
+ ICmpInst::Predicate Pred;
+ if (!I.isEquality() || !match(&I, m_ICmp(Pred, m_Instruction(Val), m_Zero())))
+ return nullptr;
+
+ Value *X;
+ Type *XTy;
+
+ Constant *C;
+ if (match(Val, m_TruncOrSelf(m_Shr(m_Value(X), m_Constant(C))))) {
+ XTy = X->getType();
+ unsigned XBitWidth = XTy->getScalarSizeInBits();
+ if (!match(C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(XBitWidth, XBitWidth - 1))))
+ return nullptr;
+ } else if (isa<BinaryOperator>(Val) &&
+ (X = reassociateShiftAmtsOfTwoSameDirectionShifts(
+ cast<BinaryOperator>(Val), SQ.getWithInstruction(Val),
+ /*AnalyzeForSignBitExtraction=*/true))) {
+ XTy = X->getType();
+ } else
+ return nullptr;
+
+ return ICmpInst::Create(Instruction::ICmp,
+ Pred == ICmpInst::ICMP_EQ ? ICmpInst::ICMP_SGE
+ : ICmpInst::ICMP_SLT,
+ X, ConstantInt::getNullValue(XTy));
+}
+
+// Handle icmp pred X, 0
Instruction *InstCombinerImpl::foldICmpWithZero(ICmpInst &Cmp) {
- CmpInst::Predicate Pred = Cmp.getPredicate();
- if (!match(Cmp.getOperand(1), m_Zero()))
- return nullptr;
-
- // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
- if (Pred == ICmpInst::ICMP_SGT) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
- if (SPR.Flavor == SPF_SMIN) {
- if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
- return new ICmpInst(Pred, B, Cmp.getOperand(1));
- if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
- return new ICmpInst(Pred, A, Cmp.getOperand(1));
- }
- }
-
- if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp))
- return New;
-
- // Given:
- // icmp eq/ne (urem %x, %y), 0
- // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
- // icmp eq/ne %x, 0
- Value *X, *Y;
- if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) &&
- ICmpInst::isEquality(Pred)) {
- KnownBits XKnown = computeKnownBits(X, 0, &Cmp);
- KnownBits YKnown = computeKnownBits(Y, 0, &Cmp);
- if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
- return new ICmpInst(Pred, X, Cmp.getOperand(1));
- }
-
- return nullptr;
-}
-
-/// Fold icmp Pred X, C.
-/// TODO: This code structure does not make sense. The saturating add fold
-/// should be moved to some other helper and extended as noted below (it is also
-/// possible that code has been made unnecessary - do we canonicalize IR to
-/// overflow/saturating intrinsics or not?).
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (!match(Cmp.getOperand(1), m_Zero()))
+ return nullptr;
+
+ // (icmp sgt smin(PosA, B) 0) -> (icmp sgt B 0)
+ if (Pred == ICmpInst::ICMP_SGT) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(Cmp.getOperand(0), A, B);
+ if (SPR.Flavor == SPF_SMIN) {
+ if (isKnownPositive(A, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, B, Cmp.getOperand(1));
+ if (isKnownPositive(B, DL, 0, &AC, &Cmp, &DT))
+ return new ICmpInst(Pred, A, Cmp.getOperand(1));
+ }
+ }
+
+ if (Instruction *New = foldIRemByPowerOfTwoToBitTest(Cmp))
+ return New;
+
+ // Given:
+ // icmp eq/ne (urem %x, %y), 0
+ // Iff %x has 0 or 1 bits set, and %y has at least 2 bits set, omit 'urem':
+ // icmp eq/ne %x, 0
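+ // (Sketch of why this holds: a %y with two or more bits set is not a power of
+ // two, so it cannot evenly divide the power-of-two value %x; the remainder is
+ // therefore zero only when %x itself is zero.)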
+ Value *X, *Y;
+ if (match(Cmp.getOperand(0), m_URem(m_Value(X), m_Value(Y))) &&
+ ICmpInst::isEquality(Pred)) {
+ KnownBits XKnown = computeKnownBits(X, 0, &Cmp);
+ KnownBits YKnown = computeKnownBits(Y, 0, &Cmp);
+ if (XKnown.countMaxPopulation() == 1 && YKnown.countMinPopulation() >= 2)
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+ }
+
+ return nullptr;
+}
+
+/// Fold icmp Pred X, C.
+/// TODO: This code structure does not make sense. The saturating add fold
+/// should be moved to some other helper and extended as noted below (it is also
+/// possible that code has been made unnecessary - do we canonicalize IR to
+/// overflow/saturating intrinsics or not?).
Instruction *InstCombinerImpl::foldICmpWithConstant(ICmpInst &Cmp) {
- // Match the following pattern, which is a common idiom when writing
- // overflow-safe integer arithmetic functions. The source performs an addition
- // in a wider type and explicitly checks for overflow using comparisons against
- // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
- //
- // TODO: This could probably be generalized to handle other overflow-safe
- // operations if we worked out the formulas to compute the appropriate magic
- // constants.
- //
- // sum = a + b
- // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
- CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
- Value *A, *B;
- ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
- if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
- match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
- if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
- return Res;
-
- // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...).
- Constant *C = dyn_cast<Constant>(Op1);
- if (!C)
- return nullptr;
-
- if (auto *Phi = dyn_cast<PHINode>(Op0))
- if (all_of(Phi->operands(), [](Value *V) { return isa<Constant>(V); })) {
- Type *Ty = Cmp.getType();
- Builder.SetInsertPoint(Phi);
- PHINode *NewPhi =
- Builder.CreatePHI(Ty, Phi->getNumOperands());
- for (BasicBlock *Predecessor : predecessors(Phi->getParent())) {
- auto *Input =
- cast<Constant>(Phi->getIncomingValueForBlock(Predecessor));
- auto *BoolInput = ConstantExpr::getCompare(Pred, Input, C);
- NewPhi->addIncoming(BoolInput, Predecessor);
- }
- NewPhi->takeName(&Cmp);
- return replaceInstUsesWith(Cmp, NewPhi);
- }
-
- return nullptr;
-}
-
-/// Canonicalize icmp instructions based on dominating conditions.
+ // Match the following pattern, which is a common idiom when writing
+ // overflow-safe integer arithmetic functions. The source performs an addition
+ // in a wider type and explicitly checks for overflow using comparisons against
+ // INT_MIN and INT_MAX. Simplify by using the sadd_with_overflow intrinsic.
+ //
+ // TODO: This could probably be generalized to handle other overflow-safe
+ // operations if we worked out the formulas to compute the appropriate magic
+ // constants.
+ //
+ // sum = a + b
+ // if (sum+128 >u 255) ... -> llvm.sadd.with.overflow.i8
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0), *Op1 = Cmp.getOperand(1);
+ Value *A, *B;
+ ConstantInt *CI, *CI2; // I = icmp ugt (add (add A, B), CI2), CI
+ if (Pred == ICmpInst::ICMP_UGT && match(Op1, m_ConstantInt(CI)) &&
+ match(Op0, m_Add(m_Add(m_Value(A), m_Value(B)), m_ConstantInt(CI2))))
+ if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
+ return Res;
+
+ // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...).
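+ // E.g. (hypothetical): icmp ult (phi i32 [ 2, %a ], [ 7, %b ]), 5
+ //   -->  phi i1 [ true, %a ], [ false, %b ]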
+ Constant *C = dyn_cast<Constant>(Op1);
+ if (!C)
+ return nullptr;
+
+ if (auto *Phi = dyn_cast<PHINode>(Op0))
+ if (all_of(Phi->operands(), [](Value *V) { return isa<Constant>(V); })) {
+ Type *Ty = Cmp.getType();
+ Builder.SetInsertPoint(Phi);
+ PHINode *NewPhi =
+ Builder.CreatePHI(Ty, Phi->getNumOperands());
+ for (BasicBlock *Predecessor : predecessors(Phi->getParent())) {
+ auto *Input =
+ cast<Constant>(Phi->getIncomingValueForBlock(Predecessor));
+ auto *BoolInput = ConstantExpr::getCompare(Pred, Input, C);
+ NewPhi->addIncoming(BoolInput, Predecessor);
+ }
+ NewPhi->takeName(&Cmp);
+ return replaceInstUsesWith(Cmp, NewPhi);
+ }
+
+ return nullptr;
+}
+
+/// Canonicalize icmp instructions based on dominating conditions.
Instruction *InstCombinerImpl::foldICmpWithDominatingICmp(ICmpInst &Cmp) {
- // This is a cheap/incomplete check for dominance - just match a single
- // predecessor with a conditional branch.
- BasicBlock *CmpBB = Cmp.getParent();
- BasicBlock *DomBB = CmpBB->getSinglePredecessor();
- if (!DomBB)
- return nullptr;
-
- Value *DomCond;
- BasicBlock *TrueBB, *FalseBB;
- if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
- return nullptr;
-
- assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
- "Predecessor block does not point to successor?");
-
- // The branch should get simplified. Don't bother simplifying this condition.
- if (TrueBB == FalseBB)
- return nullptr;
-
- // Try to simplify this compare to T/F based on the dominating condition.
- Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
- if (Imp)
- return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));
-
- CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
- ICmpInst::Predicate DomPred;
- const APInt *C, *DomC;
- if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
- match(Y, m_APInt(C))) {
- // We have 2 compares of a variable with constants. Calculate the constant
- // ranges of those compares to see if we can transform the 2nd compare:
- // DomBB:
- // DomCond = icmp DomPred X, DomC
- // br DomCond, CmpBB, FalseBB
- // CmpBB:
- // Cmp = icmp Pred X, C
- ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
- ConstantRange DominatingCR =
- (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
- : ConstantRange::makeExactICmpRegion(
- CmpInst::getInversePredicate(DomPred), *DomC);
- ConstantRange Intersection = DominatingCR.intersectWith(CR);
- ConstantRange Difference = DominatingCR.difference(CR);
- if (Intersection.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (Difference.isEmptySet())
- return replaceInstUsesWith(Cmp, Builder.getTrue());
-
- // Canonicalizing a sign bit comparison that gets used in a branch
- // pessimizes codegen by generating a branch-on-zero instruction instead
- // of a test and branch. So we avoid canonicalizing in such situations
- // because a test-and-branch instruction has better branch displacement
- // than a compare-and-branch instruction.
- bool UnusedBit;
- bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
- if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
- return nullptr;
-
- if (const APInt *EqC = Intersection.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
- if (const APInt *NeC = Difference.getSingleElement())
- return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (trunc X, Y), C.
+ // This is a cheap/incomplete check for dominance - just match a single
+ // predecessor with a conditional branch.
+ BasicBlock *CmpBB = Cmp.getParent();
+ BasicBlock *DomBB = CmpBB->getSinglePredecessor();
+ if (!DomBB)
+ return nullptr;
+
+ Value *DomCond;
+ BasicBlock *TrueBB, *FalseBB;
+ if (!match(DomBB->getTerminator(), m_Br(m_Value(DomCond), TrueBB, FalseBB)))
+ return nullptr;
+
+ assert((TrueBB == CmpBB || FalseBB == CmpBB) &&
+ "Predecessor block does not point to successor?");
+
+ // The branch should get simplified. Don't bother simplifying this condition.
+ if (TrueBB == FalseBB)
+ return nullptr;
+
+ // Try to simplify this compare to T/F based on the dominating condition.
+ Optional<bool> Imp = isImpliedCondition(DomCond, &Cmp, DL, TrueBB == CmpBB);
+ if (Imp)
+ return replaceInstUsesWith(Cmp, ConstantInt::get(Cmp.getType(), *Imp));
+
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Cmp.getOperand(0), *Y = Cmp.getOperand(1);
+ ICmpInst::Predicate DomPred;
+ const APInt *C, *DomC;
+ if (match(DomCond, m_ICmp(DomPred, m_Specific(X), m_APInt(DomC))) &&
+ match(Y, m_APInt(C))) {
+ // We have 2 compares of a variable with constants. Calculate the constant
+ // ranges of those compares to see if we can transform the 2nd compare:
+ // DomBB:
+ // DomCond = icmp DomPred X, DomC
+ // br DomCond, CmpBB, FalseBB
+ // CmpBB:
+ // Cmp = icmp Pred X, C
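+ // Hypothetical instance: if DomCond is (icmp ult X, 2) and CmpBB is its true
+ // successor, then (icmp ugt X, 0) here can only be true when X == 1, so it is
+ // canonicalized to (icmp eq X, 1).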
+ ConstantRange CR = ConstantRange::makeAllowedICmpRegion(Pred, *C);
+ ConstantRange DominatingCR =
+ (CmpBB == TrueBB) ? ConstantRange::makeExactICmpRegion(DomPred, *DomC)
+ : ConstantRange::makeExactICmpRegion(
+ CmpInst::getInversePredicate(DomPred), *DomC);
+ ConstantRange Intersection = DominatingCR.intersectWith(CR);
+ ConstantRange Difference = DominatingCR.difference(CR);
+ if (Intersection.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (Difference.isEmptySet())
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+
+    // Canonicalizing a sign-bit comparison that gets used in a branch
+    // pessimizes codegen by generating a branch-on-zero instruction instead
+    // of a test-and-branch. So we avoid canonicalizing in such situations,
+    // because a test-and-branch instruction has better branch displacement
+    // than a compare-and-branch instruction.
+ bool UnusedBit;
+ bool IsSignBit = isSignBitCheck(Pred, *C, UnusedBit);
+ if (Cmp.isEquality() || (IsSignBit && hasBranchUse(Cmp)))
+ return nullptr;
+
+ if (const APInt *EqC = Intersection.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_EQ, X, Builder.getInt(*EqC));
+ if (const APInt *NeC = Difference.getSingleElement())
+ return new ICmpInst(ICmpInst::ICMP_NE, X, Builder.getInt(*NeC));
+ }
+
+ return nullptr;
+}
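// Illustrative sketch (not part of LLVM): a brute-force check of the range
// reasoning above, using uint8_t in place of APInt. If the dominating branch
// guarantees X u< 2 on the taken edge, then inside that block "X u> 0" holds
// exactly when X == 1, the single element of the intersection of the two
// constant ranges, so the compare folds to an equality.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    if (X < 2)                     // dominating condition: X u< 2
      assert((X > 0) == (X == 1)); // dominated compare folds to icmp eq X, 1
  }
  return 0;
}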
+
+/// Fold icmp (trunc X, Y), C.
Instruction *InstCombinerImpl::foldICmpTruncConstant(ICmpInst &Cmp,
TruncInst *Trunc,
const APInt &C) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Trunc->getOperand(0);
- if (C.isOneValue() && C.getBitWidth() > 1) {
- // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
- }
-
- if (Cmp.isEquality() && Trunc->hasOneUse()) {
- // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
- // of the high bits truncated out of x are known.
- unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
- SrcBits = X->getType()->getScalarSizeInBits();
- KnownBits Known = computeKnownBits(X, 0, &Cmp);
-
- // If all the high bits are known, we can do this xform.
- if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
- // Pull in the high bits from known-ones set.
- APInt NewRHS = C.zext(SrcBits);
- NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (xor X, Y), C.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Trunc->getOperand(0);
+ if (C.isOneValue() && C.getBitWidth() > 1) {
+ // icmp slt trunc(signum(V)) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(X, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ if (Cmp.isEquality() && Trunc->hasOneUse()) {
+ // Simplify icmp eq (trunc x to i8), 42 -> icmp eq x, 42|highbits if all
+ // of the high bits truncated out of x are known.
+ unsigned DstBits = Trunc->getType()->getScalarSizeInBits(),
+ SrcBits = X->getType()->getScalarSizeInBits();
+ KnownBits Known = computeKnownBits(X, 0, &Cmp);
+
+ // If all the high bits are known, we can do this xform.
+ if ((Known.Zero | Known.One).countLeadingOnes() >= SrcBits - DstBits) {
+ // Pull in the high bits from known-ones set.
+ APInt NewRHS = C.zext(SrcBits);
+ NewRHS |= Known.One & APInt::getHighBitsSet(SrcBits, SrcBits - DstBits);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), NewRHS));
+ }
+ }
+
+ return nullptr;
+}
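// Illustrative sketch (not part of LLVM): the trunc+icmp-eq fold above with
// concrete numbers chosen for illustration. Assume a 16-bit X whose
// truncated-out high byte is known to be 0x12; then comparing the low 8 bits
// against 42 is the same as comparing the full value against 0x122A (42 with
// the known high bits pulled in).
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Low = 0; Low <= 255; ++Low) {
    uint16_t X = static_cast<uint16_t>(0x1200u | Low); // high bits known
    bool Narrow = static_cast<uint8_t>(X) == 42;       // icmp eq (trunc X to i8), 42
    bool Wide = X == 0x122A;                           // icmp eq X, 42|highbits
    assert(Narrow == Wide);
  }
  return 0;
}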
+
+/// Fold icmp (xor X, Y), C.
Instruction *InstCombinerImpl::foldICmpXorConstant(ICmpInst &Cmp,
BinaryOperator *Xor,
const APInt &C) {
- Value *X = Xor->getOperand(0);
- Value *Y = Xor->getOperand(1);
- const APInt *XorC;
- if (!match(Y, m_APInt(XorC)))
- return nullptr;
-
- // If this is a comparison that tests the signbit (X < 0) or (x > -1),
- // fold the xor.
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- bool TrueIfSigned = false;
- if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) {
-
- // If the sign bit of the XorCst is not set, there is no change to
- // the operation, just stop using the Xor.
- if (!XorC->isNegative())
- return replaceOperand(Cmp, 0, X);
-
- // Emit the opposite comparison.
- if (TrueIfSigned)
- return new ICmpInst(ICmpInst::ICMP_SGT, X,
- ConstantInt::getAllOnesValue(X->getType()));
- else
- return new ICmpInst(ICmpInst::ICMP_SLT, X,
- ConstantInt::getNullValue(X->getType()));
- }
-
- if (Xor->hasOneUse()) {
- // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
- if (!Cmp.isEquality() && XorC->isSignMask()) {
+ Value *X = Xor->getOperand(0);
+ Value *Y = Xor->getOperand(1);
+ const APInt *XorC;
+ if (!match(Y, m_APInt(XorC)))
+ return nullptr;
+
+ // If this is a comparison that tests the signbit (X < 0) or (x > -1),
+ // fold the xor.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool TrueIfSigned = false;
+ if (isSignBitCheck(Cmp.getPredicate(), C, TrueIfSigned)) {
+
+ // If the sign bit of the XorCst is not set, there is no change to
+ // the operation, just stop using the Xor.
+ if (!XorC->isNegative())
+ return replaceOperand(Cmp, 0, X);
+
+ // Emit the opposite comparison.
+ if (TrueIfSigned)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X,
+ ConstantInt::getAllOnesValue(X->getType()));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SLT, X,
+ ConstantInt::getNullValue(X->getType()));
+ }
+
+ if (Xor->hasOneUse()) {
+ // (icmp u/s (xor X SignMask), C) -> (icmp s/u X, (xor C SignMask))
+ if (!Cmp.isEquality() && XorC->isSignMask()) {
Pred = Cmp.getFlippedSignednessPredicate();
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
- }
-
- // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
- if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
+ }
+
+ // (icmp u/s (xor X ~SignMask), C) -> (icmp s/u X, (xor C ~SignMask))
+ if (!Cmp.isEquality() && XorC->isMaxSignedValue()) {
Pred = Cmp.getFlippedSignednessPredicate();
- Pred = Cmp.getSwappedPredicate(Pred);
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
- }
- }
-
- // Mask constant magic can eliminate an 'xor' with unsigned compares.
- if (Pred == ICmpInst::ICMP_UGT) {
- // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
- if (*XorC == ~C && (C + 1).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
- // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
- if (*XorC == C && (C + 1).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
- }
- if (Pred == ICmpInst::ICMP_ULT) {
- // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
- if (*XorC == -C && C.isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), ~C));
- // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
- if (*XorC == C && (-C).isPowerOf2())
- return new ICmpInst(ICmpInst::ICMP_UGT, X,
- ConstantInt::get(X->getType(), ~C));
- }
- return nullptr;
-}
-
-/// Fold icmp (and (sh X, Y), C2), C1.
+ Pred = Cmp.getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), C ^ *XorC));
+ }
+ }
+
+ // Mask constant magic can eliminate an 'xor' with unsigned compares.
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // (xor X, ~C) >u C --> X <u ~C (when C+1 is a power of 2)
+ if (*XorC == ~C && (C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+ // (xor X, C) >u C --> X >u C (when C+1 is a power of 2)
+ if (*XorC == C && (C + 1).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // (xor X, -C) <u C --> X >u ~C (when C is a power of 2)
+ if (*XorC == -C && C.isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), ~C));
+ // (xor X, C) <u C --> X >u ~C (when -C is a power of 2)
+ if (*XorC == C && (-C).isPowerOf2())
+ return new ICmpInst(ICmpInst::ICMP_UGT, X,
+ ConstantInt::get(X->getType(), ~C));
+ }
+ return nullptr;
+}
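// Illustrative sketch (not part of LLVM): the sign-mask xor fold above on
// 8-bit values. XOR-ing with the sign mask translates between unsigned and
// signed ordering, so the xor can be removed by flipping the signedness of
// the predicate and adjusting the constant.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XV = 0; XV <= 255; ++XV) {
    for (unsigned CV = 0; CV <= 255; ++CV) {
      uint8_t X = static_cast<uint8_t>(XV), C = static_cast<uint8_t>(CV);
      bool Before = static_cast<uint8_t>(X ^ 0x80) < C; // (xor X, SignMask) u< C
      bool After = static_cast<int8_t>(X) <
                   static_cast<int8_t>(static_cast<uint8_t>(C ^ 0x80)); // X s< (C ^ SignMask)
      assert(Before == After);
    }
  }
  return 0;
}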
+
+/// Fold icmp (and (sh X, Y), C2), C1.
Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1,
const APInt &C2) {
- BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
- if (!Shift || !Shift->isShift())
- return nullptr;
-
- // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
- // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
- // code produced by the clang front-end, for bitfield access.
- // This seemingly simple opportunity to fold away a shift turns out to be
- // rather complicated. See PR17827 for details.
- unsigned ShiftOpcode = Shift->getOpcode();
- bool IsShl = ShiftOpcode == Instruction::Shl;
- const APInt *C3;
- if (match(Shift->getOperand(1), m_APInt(C3))) {
- APInt NewAndCst, NewCmpCst;
- bool AnyCmpCstBitsShiftedOut;
- if (ShiftOpcode == Instruction::Shl) {
- // For a left shift, we can fold if the comparison is not signed. We can
- // also fold a signed comparison if the mask value and comparison value
- // are not negative. These constraints may not be obvious, but we can
- // prove that they are correct using an SMT solver.
- if (Cmp.isSigned() && (C2.isNegative() || C1.isNegative()))
- return nullptr;
-
- NewCmpCst = C1.lshr(*C3);
- NewAndCst = C2.lshr(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.shl(*C3) != C1;
- } else if (ShiftOpcode == Instruction::LShr) {
- // For a logical right shift, we can fold if the comparison is not signed.
- // We can also fold a signed comparison if the shifted mask value and the
- // shifted comparison value are not negative. These constraints may not be
- // obvious, but we can prove that they are correct using an SMT solver.
- NewCmpCst = C1.shl(*C3);
- NewAndCst = C2.shl(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.lshr(*C3) != C1;
- if (Cmp.isSigned() && (NewAndCst.isNegative() || NewCmpCst.isNegative()))
- return nullptr;
- } else {
- // For an arithmetic shift, check that both constants don't use (in a
- // signed sense) the top bits being shifted out.
- assert(ShiftOpcode == Instruction::AShr && "Unknown shift opcode");
- NewCmpCst = C1.shl(*C3);
- NewAndCst = C2.shl(*C3);
- AnyCmpCstBitsShiftedOut = NewCmpCst.ashr(*C3) != C1;
- if (NewAndCst.ashr(*C3) != C2)
- return nullptr;
- }
-
- if (AnyCmpCstBitsShiftedOut) {
- // If we shifted bits out, the fold is not going to work out. As a
- // special case, check to see if this means that the result is always
- // true or false now.
- if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
- return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
- if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
- return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
- } else {
- Value *NewAnd = Builder.CreateAnd(
- Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst));
- return new ICmpInst(Cmp.getPredicate(),
- NewAnd, ConstantInt::get(And->getType(), NewCmpCst));
- }
- }
-
- // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
- // preferable because it allows the C2 << Y expression to be hoisted out of a
- // loop if Y is invariant and X is not.
- if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
- !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
- // Compute C2 << Y.
- Value *NewShift =
- IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
- : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
-
- // Compute X & (C2 << Y).
- Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
- return replaceOperand(Cmp, 0, NewAnd);
- }
-
- return nullptr;
-}
-
-/// Fold icmp (and X, C2), C1.
+ BinaryOperator *Shift = dyn_cast<BinaryOperator>(And->getOperand(0));
+ if (!Shift || !Shift->isShift())
+ return nullptr;
+
+ // If this is: (X >> C3) & C2 != C1 (where any shift and any compare could
+ // exist), turn it into (X & (C2 << C3)) != (C1 << C3). This happens a LOT in
+ // code produced by the clang front-end, for bitfield access.
+ // This seemingly simple opportunity to fold away a shift turns out to be
+ // rather complicated. See PR17827 for details.
+ unsigned ShiftOpcode = Shift->getOpcode();
+ bool IsShl = ShiftOpcode == Instruction::Shl;
+ const APInt *C3;
+ if (match(Shift->getOperand(1), m_APInt(C3))) {
+ APInt NewAndCst, NewCmpCst;
+ bool AnyCmpCstBitsShiftedOut;
+ if (ShiftOpcode == Instruction::Shl) {
+ // For a left shift, we can fold if the comparison is not signed. We can
+ // also fold a signed comparison if the mask value and comparison value
+ // are not negative. These constraints may not be obvious, but we can
+ // prove that they are correct using an SMT solver.
+ if (Cmp.isSigned() && (C2.isNegative() || C1.isNegative()))
+ return nullptr;
+
+ NewCmpCst = C1.lshr(*C3);
+ NewAndCst = C2.lshr(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.shl(*C3) != C1;
+ } else if (ShiftOpcode == Instruction::LShr) {
+ // For a logical right shift, we can fold if the comparison is not signed.
+ // We can also fold a signed comparison if the shifted mask value and the
+ // shifted comparison value are not negative. These constraints may not be
+ // obvious, but we can prove that they are correct using an SMT solver.
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.lshr(*C3) != C1;
+ if (Cmp.isSigned() && (NewAndCst.isNegative() || NewCmpCst.isNegative()))
+ return nullptr;
+ } else {
+ // For an arithmetic shift, check that both constants don't use (in a
+ // signed sense) the top bits being shifted out.
+ assert(ShiftOpcode == Instruction::AShr && "Unknown shift opcode");
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.ashr(*C3) != C1;
+ if (NewAndCst.ashr(*C3) != C2)
+ return nullptr;
+ }
+
+ if (AnyCmpCstBitsShiftedOut) {
+ // If we shifted bits out, the fold is not going to work out. As a
+ // special case, check to see if this means that the result is always
+ // true or false now.
+ if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
+ return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
+ if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
+ return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
+ } else {
+ Value *NewAnd = Builder.CreateAnd(
+ Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst));
+ return new ICmpInst(Cmp.getPredicate(),
+ NewAnd, ConstantInt::get(And->getType(), NewCmpCst));
+ }
+ }
+
+ // Turn ((X >> Y) & C2) == 0 into (X & (C2 << Y)) == 0. The latter is
+ // preferable because it allows the C2 << Y expression to be hoisted out of a
+ // loop if Y is invariant and X is not.
+ if (Shift->hasOneUse() && C1.isNullValue() && Cmp.isEquality() &&
+ !Shift->isArithmeticShift() && !isa<Constant>(Shift->getOperand(0))) {
+ // Compute C2 << Y.
+ Value *NewShift =
+ IsShl ? Builder.CreateLShr(And->getOperand(1), Shift->getOperand(1))
+ : Builder.CreateShl(And->getOperand(1), Shift->getOperand(1));
+
+ // Compute X & (C2 << Y).
+ Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
+ return replaceOperand(Cmp, 0, NewAnd);
+ }
+
+ return nullptr;
+}
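// Illustrative sketch (not part of LLVM): the shift-under-mask fold above
// with concrete constants (a typical bitfield access). With C3 = 2, C2 = 0x3
// and C1 = 0x1, no bits of C1 are lost by the shift, so the shift can be
// folded into the mask and the comparison constant.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = ((X >> 2) & 0x3) == 0x1;       // icmp eq (and (lshr X, 2), 3), 1
    bool After = (X & (0x3 << 2)) == (0x1 << 2); // icmp eq (and X, 12), 4
    assert(Before == After);
  }
  return 0;
}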
+
+/// Fold icmp (and X, C2), C1.
Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C1) {
- bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
-
- // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
- // TODO: We canonicalize to the longer form for scalars because we have
- // better analysis/folds for icmp, and codegen may be better with icmp.
- if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
- match(And->getOperand(1), m_One()))
- return new TruncInst(And->getOperand(0), Cmp.getType());
-
- const APInt *C2;
- Value *X;
- if (!match(And, m_And(m_Value(X), m_APInt(C2))))
- return nullptr;
-
- // Don't perform the following transforms if the AND has multiple uses
- if (!And->hasOneUse())
- return nullptr;
-
- if (Cmp.isEquality() && C1.isNullValue()) {
- // Restrict this fold to single-use 'and' (PR10267).
- // Replace (and X, (1 << size(X)-1) != 0) with X s< 0
- if (C2->isSignMask()) {
- Constant *Zero = Constant::getNullValue(X->getType());
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
- return new ICmpInst(NewPred, X, Zero);
- }
-
-    // Restrict this fold to a single-use 'and' (PR10267).
- // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
- if ((~(*C2) + 1).isPowerOf2()) {
- Constant *NegBOC =
- ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
- return new ICmpInst(NewPred, X, NegBOC);
- }
- }
-
- // If the LHS is an 'and' of a truncate and we can widen the and/compare to
- // the input width without changing the value produced, eliminate the cast:
- //
- // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
- //
- // We can do this transformation if the constants do not have their sign bits
- // set or if it is an equality comparison. Extending a relational comparison
- // when we're checking the sign bit would not work.
- Value *W;
- if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
- (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
- // TODO: Is this a good transform for vectors? Wider types may reduce
- // throughput. Should this transform be limited (even for scalars) by using
- // shouldChangeType()?
- if (!Cmp.getType()->isVectorTy()) {
- Type *WideType = W->getType();
- unsigned WideScalarBits = WideType->getScalarSizeInBits();
- Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
- Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
- Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
- return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
- }
- }
-
- if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
- return I;
-
- // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
-  //      (icmp pred (and A, (or (shl 1, B), 1)), 0)
- //
- // iff pred isn't signed
- if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
- match(And->getOperand(1), m_One())) {
- Constant *One = cast<Constant>(And->getOperand(1));
- Value *Or = And->getOperand(0);
- Value *A, *B, *LShr;
- if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
- match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
- unsigned UsesRemoved = 0;
- if (And->hasOneUse())
- ++UsesRemoved;
- if (Or->hasOneUse())
- ++UsesRemoved;
- if (LShr->hasOneUse())
- ++UsesRemoved;
-
- // Compute A & ((1 << B) | 1)
- Value *NewOr = nullptr;
- if (auto *C = dyn_cast<Constant>(B)) {
- if (UsesRemoved >= 1)
- NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
- } else {
- if (UsesRemoved >= 3)
- NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
- /*HasNUW=*/true),
- One, Or->getName());
- }
- if (NewOr) {
- Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
- return replaceOperand(Cmp, 0, NewAnd);
- }
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (and X, Y), C.
+ bool isICMP_NE = Cmp.getPredicate() == ICmpInst::ICMP_NE;
+
+ // For vectors: icmp ne (and X, 1), 0 --> trunc X to N x i1
+ // TODO: We canonicalize to the longer form for scalars because we have
+ // better analysis/folds for icmp, and codegen may be better with icmp.
+ if (isICMP_NE && Cmp.getType()->isVectorTy() && C1.isNullValue() &&
+ match(And->getOperand(1), m_One()))
+ return new TruncInst(And->getOperand(0), Cmp.getType());
+
+ const APInt *C2;
+ Value *X;
+ if (!match(And, m_And(m_Value(X), m_APInt(C2))))
+ return nullptr;
+
+ // Don't perform the following transforms if the AND has multiple uses
+ if (!And->hasOneUse())
+ return nullptr;
+
+ if (Cmp.isEquality() && C1.isNullValue()) {
+ // Restrict this fold to single-use 'and' (PR10267).
+ // Replace (and X, (1 << size(X)-1) != 0) with X s< 0
+ if (C2->isSignMask()) {
+ Constant *Zero = Constant::getNullValue(X->getType());
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+ return new ICmpInst(NewPred, X, Zero);
+ }
+
+    // Restrict this fold to a single-use 'and' (PR10267).
+ // ((%x & C) == 0) --> %x u< (-C) iff (-C) is power of two.
+ if ((~(*C2) + 1).isPowerOf2()) {
+ Constant *NegBOC =
+ ConstantExpr::getNeg(cast<Constant>(And->getOperand(1)));
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, NegBOC);
+ }
+ }
+
+ // If the LHS is an 'and' of a truncate and we can widen the and/compare to
+ // the input width without changing the value produced, eliminate the cast:
+ //
+ // icmp (and (trunc W), C2), C1 -> icmp (and W, C2'), C1'
+ //
+ // We can do this transformation if the constants do not have their sign bits
+ // set or if it is an equality comparison. Extending a relational comparison
+ // when we're checking the sign bit would not work.
+ Value *W;
+ if (match(And->getOperand(0), m_OneUse(m_Trunc(m_Value(W)))) &&
+ (Cmp.isEquality() || (!C1.isNegative() && !C2->isNegative()))) {
+ // TODO: Is this a good transform for vectors? Wider types may reduce
+ // throughput. Should this transform be limited (even for scalars) by using
+ // shouldChangeType()?
+ if (!Cmp.getType()->isVectorTy()) {
+ Type *WideType = W->getType();
+ unsigned WideScalarBits = WideType->getScalarSizeInBits();
+ Constant *ZextC1 = ConstantInt::get(WideType, C1.zext(WideScalarBits));
+ Constant *ZextC2 = ConstantInt::get(WideType, C2->zext(WideScalarBits));
+ Value *NewAnd = Builder.CreateAnd(W, ZextC2, And->getName());
+ return new ICmpInst(Cmp.getPredicate(), NewAnd, ZextC1);
+ }
+ }
+
+ if (Instruction *I = foldICmpAndShift(Cmp, And, C1, *C2))
+ return I;
+
+ // (icmp pred (and (or (lshr A, B), A), 1), 0) -->
+  //      (icmp pred (and A, (or (shl 1, B), 1)), 0)
+ //
+ // iff pred isn't signed
+ if (!Cmp.isSigned() && C1.isNullValue() && And->getOperand(0)->hasOneUse() &&
+ match(And->getOperand(1), m_One())) {
+ Constant *One = cast<Constant>(And->getOperand(1));
+ Value *Or = And->getOperand(0);
+ Value *A, *B, *LShr;
+ if (match(Or, m_Or(m_Value(LShr), m_Value(A))) &&
+ match(LShr, m_LShr(m_Specific(A), m_Value(B)))) {
+ unsigned UsesRemoved = 0;
+ if (And->hasOneUse())
+ ++UsesRemoved;
+ if (Or->hasOneUse())
+ ++UsesRemoved;
+ if (LShr->hasOneUse())
+ ++UsesRemoved;
+
+ // Compute A & ((1 << B) | 1)
+ Value *NewOr = nullptr;
+ if (auto *C = dyn_cast<Constant>(B)) {
+ if (UsesRemoved >= 1)
+ NewOr = ConstantExpr::getOr(ConstantExpr::getNUWShl(One, C), One);
+ } else {
+ if (UsesRemoved >= 3)
+ NewOr = Builder.CreateOr(Builder.CreateShl(One, B, LShr->getName(),
+ /*HasNUW=*/true),
+ One, Or->getName());
+ }
+ if (NewOr) {
+ Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
+ return replaceOperand(Cmp, 0, NewAnd);
+ }
+ }
+ }
+
+ return nullptr;
+}
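// Illustrative sketch (not part of LLVM): the "mask of high bits" fold above
// on 8-bit values. With C2 = 0xF0, the i8 value -C2 is the power of two 0x10,
// so testing (X & 0xF0) == 0 is the same as the unsigned range check X u< 0x10.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = (X & 0xF0) == 0; // icmp eq (and X, 0xF0), 0
    bool After = X < 0x10;         // icmp ult X, -0xF0 (== 0x10 in i8)
    assert(Before == After);
  }
  return 0;
}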
+
+/// Fold icmp (and X, Y), C.
Instruction *InstCombinerImpl::foldICmpAndConstant(ICmpInst &Cmp,
BinaryOperator *And,
const APInt &C) {
- if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
- return I;
-
- // TODO: These all require that Y is constant too, so refactor with the above.
-
- // Try to optimize things like "A[i] & 42 == 0" to index computations.
- Value *X = And->getOperand(0);
- Value *Y = And->getOperand(1);
- if (auto *LI = dyn_cast<LoadInst>(X))
- if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !LI->isVolatile() && isa<ConstantInt>(Y)) {
- ConstantInt *C2 = cast<ConstantInt>(Y);
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
- return Res;
- }
-
- if (!Cmp.isEquality())
- return nullptr;
-
-  // X & -C == -C -> X >u ~C
-  // X & -C != -C -> X <=u ~C
- // iff C is a power of 2
- if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
- auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
- : CmpInst::ICMP_ULE;
- return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
- }
-
- // (X & C2) == 0 -> (trunc X) >= 0
- // (X & C2) != 0 -> (trunc X) < 0
- // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
- const APInt *C2;
- if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) {
- int32_t ExactLogBase2 = C2->exactLogBase2();
- if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
- Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
- if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
+ if (Instruction *I = foldICmpAndConstConst(Cmp, And, C))
+ return I;
+
+ // TODO: These all require that Y is constant too, so refactor with the above.
+
+ // Try to optimize things like "A[i] & 42 == 0" to index computations.
+ Value *X = And->getOperand(0);
+ Value *Y = And->getOperand(1);
+ if (auto *LI = dyn_cast<LoadInst>(X))
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !LI->isVolatile() && isa<ConstantInt>(Y)) {
+ ConstantInt *C2 = cast<ConstantInt>(Y);
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, Cmp, C2))
+ return Res;
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+  // X & -C == -C -> X >u ~C
+  // X & -C != -C -> X <=u ~C
+ // iff C is a power of 2
+ if (Cmp.getOperand(1) == Y && (-C).isPowerOf2()) {
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGT
+ : CmpInst::ICMP_ULE;
+ return new ICmpInst(NewPred, X, SubOne(cast<Constant>(Cmp.getOperand(1))));
+ }
+
+ // (X & C2) == 0 -> (trunc X) >= 0
+ // (X & C2) != 0 -> (trunc X) < 0
+ // iff C2 is a power of 2 and it masks the sign bit of a legal integer type.
+ const APInt *C2;
+ if (And->hasOneUse() && C.isNullValue() && match(Y, m_APInt(C2))) {
+ int32_t ExactLogBase2 = C2->exactLogBase2();
+ if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
+ Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
+ if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
NTy = VectorType::get(NTy, AndVTy->getElementCount());
- Value *Trunc = Builder.CreateTrunc(X, NTy);
- auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
- : CmpInst::ICMP_SLT;
- return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
- }
- }
-
- return nullptr;
-}
-
-/// Fold icmp (or X, Y), C.
+ Value *Trunc = Builder.CreateTrunc(X, NTy);
+ auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
+ : CmpInst::ICMP_SLT;
+ return new ICmpInst(NewPred, Trunc, Constant::getNullValue(NTy));
+ }
+ }
+
+ return nullptr;
+}
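// Illustrative sketch (not part of LLVM): the sign-bit-mask fold above,
// assuming i8 is a legal integer type for the target. With a 16-bit X and
// C2 = 0x80 (the sign bit of i8), testing (X & 0x80) == 0 is the same as
// truncating X to 8 bits and asking whether the result is non-negative as a
// signed value.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 0xFFFF; ++V) {
    uint16_t X = static_cast<uint16_t>(V);
    bool Before = (X & 0x80) == 0;                                   // icmp eq (and X, 128), 0
    bool After = static_cast<int8_t>(static_cast<uint8_t>(X)) >= 0;  // icmp sge (trunc X to i8), 0
    assert(Before == After);
  }
  return 0;
}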
+
+/// Fold icmp (or X, Y), C.
Instruction *InstCombinerImpl::foldICmpOrConstant(ICmpInst &Cmp,
BinaryOperator *Or,
const APInt &C) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (C.isOneValue()) {
- // icmp slt signum(V) 1 --> icmp slt V, 1
- Value *V = nullptr;
- if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
- return new ICmpInst(ICmpInst::ICMP_SLT, V,
- ConstantInt::get(V->getType(), 1));
- }
-
- Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
- const APInt *MaskC;
- if (match(OrOp1, m_APInt(MaskC)) && Cmp.isEquality()) {
- if (*MaskC == C && (C + 1).isPowerOf2()) {
- // X | C == C --> X <=u C
- // X | C != C --> X >u C
- // iff C+1 is a power of 2 (C is a bitmask of the low bits)
- Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
- return new ICmpInst(Pred, OrOp0, OrOp1);
- }
-
- // More general: canonicalize 'equality with set bits mask' to
- // 'equality with clear bits mask'.
- // (X | MaskC) == C --> (X & ~MaskC) == C ^ MaskC
- // (X | MaskC) != C --> (X & ~MaskC) != C ^ MaskC
- if (Or->hasOneUse()) {
- Value *And = Builder.CreateAnd(OrOp0, ~(*MaskC));
- Constant *NewC = ConstantInt::get(Or->getType(), C ^ (*MaskC));
- return new ICmpInst(Pred, And, NewC);
- }
- }
-
- if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
- return nullptr;
-
- Value *P, *Q;
- if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
- // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
- // -> and (icmp eq P, null), (icmp eq Q, null).
- Value *CmpP =
- Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
- Value *CmpQ =
- Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
- auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
- return BinaryOperator::Create(BOpc, CmpP, CmpQ);
- }
-
- // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
- // a shorter form that has more potential to be folded even further.
- Value *X1, *X2, *X3, *X4;
- if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
- match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
- // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
- // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
- Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
- Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
- auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
- return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
- }
-
- return nullptr;
-}
-
-/// Fold icmp (mul X, Y), C.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (C.isOneValue()) {
+ // icmp slt signum(V) 1 --> icmp slt V, 1
+ Value *V = nullptr;
+ if (Pred == ICmpInst::ICMP_SLT && match(Or, m_Signum(m_Value(V))))
+ return new ICmpInst(ICmpInst::ICMP_SLT, V,
+ ConstantInt::get(V->getType(), 1));
+ }
+
+ Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
+ const APInt *MaskC;
+ if (match(OrOp1, m_APInt(MaskC)) && Cmp.isEquality()) {
+ if (*MaskC == C && (C + 1).isPowerOf2()) {
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
+ Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ return new ICmpInst(Pred, OrOp0, OrOp1);
+ }
+
+ // More general: canonicalize 'equality with set bits mask' to
+ // 'equality with clear bits mask'.
+ // (X | MaskC) == C --> (X & ~MaskC) == C ^ MaskC
+ // (X | MaskC) != C --> (X & ~MaskC) != C ^ MaskC
+ if (Or->hasOneUse()) {
+ Value *And = Builder.CreateAnd(OrOp0, ~(*MaskC));
+ Constant *NewC = ConstantInt::get(Or->getType(), C ^ (*MaskC));
+ return new ICmpInst(Pred, And, NewC);
+ }
+ }
+
+ if (!Cmp.isEquality() || !C.isNullValue() || !Or->hasOneUse())
+ return nullptr;
+
+ Value *P, *Q;
+ if (match(Or, m_Or(m_PtrToInt(m_Value(P)), m_PtrToInt(m_Value(Q))))) {
+ // Simplify icmp eq (or (ptrtoint P), (ptrtoint Q)), 0
+ // -> and (icmp eq P, null), (icmp eq Q, null).
+ Value *CmpP =
+ Builder.CreateICmp(Pred, P, ConstantInt::getNullValue(P->getType()));
+ Value *CmpQ =
+ Builder.CreateICmp(Pred, Q, ConstantInt::getNullValue(Q->getType()));
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, CmpP, CmpQ);
+ }
+
+ // Are we using xors to bitwise check for a pair of (in)equalities? Convert to
+ // a shorter form that has more potential to be folded even further.
+ Value *X1, *X2, *X3, *X4;
+ if (match(OrOp0, m_OneUse(m_Xor(m_Value(X1), m_Value(X2)))) &&
+ match(OrOp1, m_OneUse(m_Xor(m_Value(X3), m_Value(X4))))) {
+ // ((X1 ^ X2) || (X3 ^ X4)) == 0 --> (X1 == X2) && (X3 == X4)
+ // ((X1 ^ X2) || (X3 ^ X4)) != 0 --> (X1 != X2) || (X3 != X4)
+ Value *Cmp12 = Builder.CreateICmp(Pred, X1, X2);
+ Value *Cmp34 = Builder.CreateICmp(Pred, X3, X4);
+ auto BOpc = Pred == CmpInst::ICMP_EQ ? Instruction::And : Instruction::Or;
+ return BinaryOperator::Create(BOpc, Cmp12, Cmp34);
+ }
+
+ return nullptr;
+}
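// Illustrative sketch (not part of LLVM): the low-bit-mask 'or' fold above.
// With C = 7 (so C + 1 is a power of two), OR-ing X with C can only leave the
// result equal to C when X has no bits above the mask, i.e. X u<= 7.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned V = 0; V <= 255; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    bool Before = (X | 7) == 7; // icmp eq (or X, 7), 7
    bool After = X <= 7;        // icmp ule X, 7
    assert(Before == After);
  }
  return 0;
}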
+
+/// Fold icmp (mul X, Y), C.
Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
BinaryOperator *Mul,
const APInt &C) {
- const APInt *MulC;
- if (!match(Mul->getOperand(1), m_APInt(MulC)))
- return nullptr;
-
- // If this is a test of the sign bit and the multiply is sign-preserving with
- // a constant operand, use the multiply LHS operand instead.
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (isSignTest(Pred, C) && Mul->hasNoSignedWrap()) {
- if (MulC->isNegative())
- Pred = ICmpInst::getSwappedPredicate(Pred);
- return new ICmpInst(Pred, Mul->getOperand(0),
- Constant::getNullValue(Mul->getType()));
- }
-
+ const APInt *MulC;
+ if (!match(Mul->getOperand(1), m_APInt(MulC)))
+ return nullptr;
+
+ // If this is a test of the sign bit and the multiply is sign-preserving with
+ // a constant operand, use the multiply LHS operand instead.
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (isSignTest(Pred, C) && Mul->hasNoSignedWrap()) {
+ if (MulC->isNegative())
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ return new ICmpInst(Pred, Mul->getOperand(0),
+ Constant::getNullValue(Mul->getType()));
+ }
+
// If the multiply does not wrap, try to divide the compare constant by the
// multiplication factor.
if (Cmp.isEquality() && !MulC->isNullValue()) {
@@ -1956,260 +1956,260 @@ Instruction *InstCombinerImpl::foldICmpMulConstant(ICmpInst &Cmp,
}
}
- return nullptr;
-}
-
-/// Fold icmp (shl 1, Y), C.
-static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
- const APInt &C) {
- Value *Y;
- if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
- return nullptr;
-
- Type *ShiftType = Shl->getType();
- unsigned TypeBits = C.getBitWidth();
- bool CIsPowerOf2 = C.isPowerOf2();
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isUnsigned()) {
- // (1 << Y) pred C -> Y pred Log2(C)
- if (!CIsPowerOf2) {
- // (1 << Y) < 30 -> Y <= 4
- // (1 << Y) <= 30 -> Y <= 4
- // (1 << Y) >= 30 -> Y > 4
- // (1 << Y) > 30 -> Y > 4
- if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_ULE;
- else if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_UGT;
- }
-
- // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
- // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
- unsigned CLog2 = C.logBase2();
- if (CLog2 == TypeBits - 1) {
- if (Pred == ICmpInst::ICMP_UGE)
- Pred = ICmpInst::ICMP_EQ;
- else if (Pred == ICmpInst::ICMP_ULT)
- Pred = ICmpInst::ICMP_NE;
- }
- return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
- } else if (Cmp.isSigned()) {
- Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
- if (C.isAllOnesValue()) {
- // (1 << Y) <= -1 -> Y == 31
- if (Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
-
- // (1 << Y) > -1 -> Y != 31
- if (Pred == ICmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
- } else if (!C) {
- // (1 << Y) < 0 -> Y == 31
- // (1 << Y) <= 0 -> Y == 31
- if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
-
- // (1 << Y) >= 0 -> Y != 31
- // (1 << Y) > 0 -> Y != 31
- if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
- return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
- }
- } else if (Cmp.isEquality() && CIsPowerOf2) {
- return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (shl X, Y), C.
+ return nullptr;
+}
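// Illustrative sketch (not part of LLVM): the sign-test fold above for a
// non-wrapping multiply. Restricting X to values where X * 5 stays inside
// int8_t models the 'nsw' flag; in that case the product is negative exactly
// when X is negative, so the compare can look at X directly.
#include <cassert>
#include <cstdint>

int main() {
  for (int X = -25; X <= 25; ++X) { // range where (mul nsw X, 5) cannot overflow i8
    int8_t Prod = static_cast<int8_t>(X * 5);
    bool Before = Prod < 0; // icmp slt (mul nsw X, 5), 0
    bool After = X < 0;     // icmp slt X, 0
    assert(Before == After);
  }
  return 0;
}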
+
+/// Fold icmp (shl 1, Y), C.
+static Instruction *foldICmpShlOne(ICmpInst &Cmp, Instruction *Shl,
+ const APInt &C) {
+ Value *Y;
+ if (!match(Shl, m_Shl(m_One(), m_Value(Y))))
+ return nullptr;
+
+ Type *ShiftType = Shl->getType();
+ unsigned TypeBits = C.getBitWidth();
+ bool CIsPowerOf2 = C.isPowerOf2();
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isUnsigned()) {
+ // (1 << Y) pred C -> Y pred Log2(C)
+ if (!CIsPowerOf2) {
+ // (1 << Y) < 30 -> Y <= 4
+ // (1 << Y) <= 30 -> Y <= 4
+ // (1 << Y) >= 30 -> Y > 4
+ // (1 << Y) > 30 -> Y > 4
+ if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_ULE;
+ else if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_UGT;
+ }
+
+ // (1 << Y) >= 2147483648 -> Y >= 31 -> Y == 31
+ // (1 << Y) < 2147483648 -> Y < 31 -> Y != 31
+ unsigned CLog2 = C.logBase2();
+ if (CLog2 == TypeBits - 1) {
+ if (Pred == ICmpInst::ICMP_UGE)
+ Pred = ICmpInst::ICMP_EQ;
+ else if (Pred == ICmpInst::ICMP_ULT)
+ Pred = ICmpInst::ICMP_NE;
+ }
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, CLog2));
+ } else if (Cmp.isSigned()) {
+ Constant *BitWidthMinusOne = ConstantInt::get(ShiftType, TypeBits - 1);
+ if (C.isAllOnesValue()) {
+ // (1 << Y) <= -1 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) > -1 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ } else if (!C) {
+ // (1 << Y) < 0 -> Y == 31
+ // (1 << Y) <= 0 -> Y == 31
+ if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Y, BitWidthMinusOne);
+
+ // (1 << Y) >= 0 -> Y != 31
+ // (1 << Y) > 0 -> Y != 31
+ if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_NE, Y, BitWidthMinusOne);
+ }
+ } else if (Cmp.isEquality() && CIsPowerOf2) {
+ return new ICmpInst(Pred, Y, ConstantInt::get(ShiftType, C.logBase2()));
+ }
+
+ return nullptr;
+}
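// Illustrative sketch (not part of LLVM): the (1 << Y) fold above for an
// unsigned predicate and a non-power-of-two constant. With C = 30 on i32,
// comparing the shifted one against C reduces to comparing Y against
// Log2(C) = 4, with the predicate relaxed from ult to ule.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t Y = 0; Y < 32; ++Y) {      // in-range shift amounts only
    bool Before = (UINT32_C(1) << Y) < 30; // icmp ult (shl 1, Y), 30
    bool After = Y <= 4;                   // icmp ule Y, 4
    assert(Before == After);
  }
  return 0;
}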
+
+/// Fold icmp (shl X, Y), C.
Instruction *InstCombinerImpl::foldICmpShlConstant(ICmpInst &Cmp,
BinaryOperator *Shl,
const APInt &C) {
- const APInt *ShiftVal;
- if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
- return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
-
- const APInt *ShiftAmt;
- if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
- return foldICmpShlOne(Cmp, Shl, C);
-
- // Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited, it will be simplified.
- unsigned TypeBits = C.getBitWidth();
- if (ShiftAmt->uge(TypeBits))
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *X = Shl->getOperand(0);
- Type *ShType = Shl->getType();
-
- // NSW guarantees that we are only shifting out sign bits from the high bits,
- // so we can ASHR the compare constant without needing a mask and eliminate
- // the shift.
- if (Shl->hasNoSignedWrap()) {
- if (Pred == ICmpInst::ICMP_SGT) {
- // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
- APInt ShiftedC = C.ashr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
- C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
- APInt ShiftedC = C.ashr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if (Pred == ICmpInst::ICMP_SLT) {
- // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
- // (X << S) <=s C is equiv to X <=s (C >> S) for all C
- // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
- // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
- assert(!C.isMinSignedValue() && "Unexpected icmp slt");
- APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- // If this is a signed comparison to 0 and the shift is sign preserving,
- // use the shift LHS operand instead; isSignTest may change 'Pred', so only
- // do that if we're sure to not continue on in this function.
- if (isSignTest(Pred, C))
- return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
- }
-
- // NUW guarantees that we are only shifting out zero bits from the high bits,
- // so we can LSHR the compare constant without needing a mask and eliminate
- // the shift.
- if (Shl->hasNoUnsignedWrap()) {
- if (Pred == ICmpInst::ICMP_UGT) {
- // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
- APInt ShiftedC = C.lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
- C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
- APInt ShiftedC = C.lshr(*ShiftAmt);
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- if (Pred == ICmpInst::ICMP_ULT) {
- // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
- // (X << S) <=u C is equiv to X <=u (C >> S) for all C
- // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
- // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
- assert(C.ugt(0) && "ult 0 should have been eliminated");
- APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
- return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
- }
- }
-
- if (Cmp.isEquality() && Shl->hasOneUse()) {
- // Strength-reduce the shift into an 'and'.
- Constant *Mask = ConstantInt::get(
- ShType,
- APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
- Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
- Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
- return new ICmpInst(Pred, And, LShrC);
- }
-
- // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
- bool TrueIfSigned = false;
- if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
- // (X << 31) <s 0 --> (X & 1) != 0
- Constant *Mask = ConstantInt::get(
- ShType,
- APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
- Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
- return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
- And, Constant::getNullValue(ShType));
- }
-
- // Simplify 'shl' inequality test into 'and' equality test.
- if (Cmp.isUnsigned() && Shl->hasOneUse()) {
- // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
- if ((C + 1).isPowerOf2() &&
- (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
- Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
- return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
- : ICmpInst::ICMP_NE,
- And, Constant::getNullValue(ShType));
- }
- // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
- if (C.isPowerOf2() &&
- (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
- Value *And =
- Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
- return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
- : ICmpInst::ICMP_NE,
- And, Constant::getNullValue(ShType));
- }
- }
-
- // Transform (icmp pred iM (shl iM %v, N), C)
-  // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)))
-  // Transform the shl to a trunc if (trunc (C>>N)) loses no bits and M-N is
-  // a legal integer width.
- // This enables us to get rid of the shift in favor of a trunc that may be
- // free on the target. It has the additional benefit of comparing to a
- // smaller constant that may be more target-friendly.
- unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
- if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
- DL.isLegalInteger(TypeBits - Amt)) {
- Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
- if (auto *ShVTy = dyn_cast<VectorType>(ShType))
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shl->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShlConstConst(Cmp, Shl->getOperand(1), C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shl->getOperand(1), m_APInt(ShiftAmt)))
+ return foldICmpShlOne(Cmp, Shl, C);
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited, it will be simplified.
+ unsigned TypeBits = C.getBitWidth();
+ if (ShiftAmt->uge(TypeBits))
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *X = Shl->getOperand(0);
+ Type *ShType = Shl->getType();
+
+ // NSW guarantees that we are only shifting out sign bits from the high bits,
+ // so we can ASHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoSignedWrap()) {
+ if (Pred == ICmpInst::ICMP_SGT) {
+ // icmp Pred (shl nsw X, ShiftAmt), C --> icmp Pred X, (C >>s ShiftAmt)
+ APInt ShiftedC = C.ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.ashr(*ShiftAmt).shl(*ShiftAmt) == C) {
+ APInt ShiftedC = C.ashr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_SLT) {
+ // SLE is the same as above, but SLE is canonicalized to SLT, so convert:
+ // (X << S) <=s C is equiv to X <=s (C >> S) for all C
+ // (X << S) <s (C + 1) is equiv to X <s (C >> S) + 1 if C <s SMAX
+ // (X << S) <s C is equiv to X <s ((C - 1) >> S) + 1 if C >s SMIN
+ assert(!C.isMinSignedValue() && "Unexpected icmp slt");
+ APInt ShiftedC = (C - 1).ashr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ // If this is a signed comparison to 0 and the shift is sign preserving,
+ // use the shift LHS operand instead; isSignTest may change 'Pred', so only
+ // do that if we're sure to not continue on in this function.
+ if (isSignTest(Pred, C))
+ return new ICmpInst(Pred, X, Constant::getNullValue(ShType));
+ }
+
+ // NUW guarantees that we are only shifting out zero bits from the high bits,
+ // so we can LSHR the compare constant without needing a mask and eliminate
+ // the shift.
+ if (Shl->hasNoUnsignedWrap()) {
+ if (Pred == ICmpInst::ICMP_UGT) {
+ // icmp Pred (shl nuw X, ShiftAmt), C --> icmp Pred X, (C >>u ShiftAmt)
+ APInt ShiftedC = C.lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE) &&
+ C.lshr(*ShiftAmt).shl(*ShiftAmt) == C) {
+ APInt ShiftedC = C.lshr(*ShiftAmt);
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ if (Pred == ICmpInst::ICMP_ULT) {
+ // ULE is the same as above, but ULE is canonicalized to ULT, so convert:
+ // (X << S) <=u C is equiv to X <=u (C >> S) for all C
+ // (X << S) <u (C + 1) is equiv to X <u (C >> S) + 1 if C <u ~0u
+ // (X << S) <u C is equiv to X <u ((C - 1) >> S) + 1 if C >u 0
+ assert(C.ugt(0) && "ult 0 should have been eliminated");
+ APInt ShiftedC = (C - 1).lshr(*ShiftAmt) + 1;
+ return new ICmpInst(Pred, X, ConstantInt::get(ShType, ShiftedC));
+ }
+ }
+
+ if (Cmp.isEquality() && Shl->hasOneUse()) {
+ // Strength-reduce the shift into an 'and'.
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt->getZExtValue()));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ Constant *LShrC = ConstantInt::get(ShType, C.lshr(*ShiftAmt));
+ return new ICmpInst(Pred, And, LShrC);
+ }
+
+ // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+ bool TrueIfSigned = false;
+ if (Shl->hasOneUse() && isSignBitCheck(Pred, C, TrueIfSigned)) {
+ // (X << 31) <s 0 --> (X & 1) != 0
+ Constant *Mask = ConstantInt::get(
+ ShType,
+ APInt::getOneBitSet(TypeBits, TypeBits - ShiftAmt->getZExtValue() - 1));
+ Value *And = Builder.CreateAnd(X, Mask, Shl->getName() + ".mask");
+ return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+ And, Constant::getNullValue(ShType));
+ }
+
+ // Simplify 'shl' inequality test into 'and' equality test.
+ if (Cmp.isUnsigned() && Shl->hasOneUse()) {
+ // (X l<< C2) u<=/u> C1 iff C1+1 is power of two -> X & (~C1 l>> C2) ==/!= 0
+ if ((C + 1).isPowerOf2() &&
+ (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT)) {
+ Value *And = Builder.CreateAnd(X, (~C).lshr(ShiftAmt->getZExtValue()));
+ return new ICmpInst(Pred == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_NE,
+ And, Constant::getNullValue(ShType));
+ }
+ // (X l<< C2) u</u>= C1 iff C1 is power of two -> X & (-C1 l>> C2) ==/!= 0
+ if (C.isPowerOf2() &&
+ (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE)) {
+ Value *And =
+ Builder.CreateAnd(X, (~(C - 1)).lshr(ShiftAmt->getZExtValue()));
+ return new ICmpInst(Pred == ICmpInst::ICMP_ULT ? ICmpInst::ICMP_EQ
+ : ICmpInst::ICMP_NE,
+ And, Constant::getNullValue(ShType));
+ }
+ }
+
+ // Transform (icmp pred iM (shl iM %v, N), C)
+  // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (C>>N)))
+  // Transform the shl to a trunc if (trunc (C>>N)) loses no bits and M-N is
+  // a legal integer width.
+ // This enables us to get rid of the shift in favor of a trunc that may be
+ // free on the target. It has the additional benefit of comparing to a
+ // smaller constant that may be more target-friendly.
+ unsigned Amt = ShiftAmt->getLimitedValue(TypeBits - 1);
+ if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
+ DL.isLegalInteger(TypeBits - Amt)) {
+ Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
+ if (auto *ShVTy = dyn_cast<VectorType>(ShType))
TruncTy = VectorType::get(TruncTy, ShVTy->getElementCount());
- Constant *NewC =
- ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
- return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
- }
-
- return nullptr;
-}
-
-/// Fold icmp ({al}shr X, Y), C.
+ Constant *NewC =
+ ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
+ return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
+ }
+
+ return nullptr;
+}
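// Illustrative sketch (not part of LLVM): the nuw left-shift fold above. The
// bound X <= 63 models 'shl nuw X, 2' on i8 (no bits are shifted out), and in
// that case the compare constant can simply be logically shifted right by the
// same amount.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned XV = 0; XV <= 63; ++XV) { // values where the nuw shift by 2 holds on i8
    for (unsigned CV = 0; CV <= 255; ++CV) {
      uint8_t X = static_cast<uint8_t>(XV), C = static_cast<uint8_t>(CV);
      bool Before = static_cast<uint8_t>(X << 2) > C; // icmp ugt (shl nuw X, 2), C
      bool After = X > (C >> 2);                      // icmp ugt X, (C >>u 2)
      assert(Before == After);
    }
  }
  return 0;
}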
+
+/// Fold icmp ({al}shr X, Y), C.
Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
BinaryOperator *Shr,
const APInt &C) {
- // An exact shr only shifts out zero bits, so:
- // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
- Value *X = Shr->getOperand(0);
- CmpInst::Predicate Pred = Cmp.getPredicate();
- if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
- C.isNullValue())
- return new ICmpInst(Pred, X, Cmp.getOperand(1));
-
- const APInt *ShiftVal;
- if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
- return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
-
- const APInt *ShiftAmt;
- if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
- return nullptr;
-
- // Check that the shift amount is in range. If not, don't perform undefined
- // shifts. When the shift is visited it will be simplified.
- unsigned TypeBits = C.getBitWidth();
- unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
- if (ShAmtVal >= TypeBits || ShAmtVal == 0)
- return nullptr;
-
- bool IsAShr = Shr->getOpcode() == Instruction::AShr;
- bool IsExact = Shr->isExact();
- Type *ShrTy = Shr->getType();
- // TODO: If we could guarantee that InstSimplify would handle all of the
- // constant-value-based preconditions in the folds below, then we could assert
- // those conditions rather than checking them. This is difficult because of
- // undef/poison (PR34838).
- if (IsAShr) {
- if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
- // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
- // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
- APInt ShiftedC = C.shl(ShAmtVal);
- if (ShiftedC.ashr(ShAmtVal) == C)
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- if (Pred == CmpInst::ICMP_SGT) {
- // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
- APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
- if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
- (ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
+ // An exact shr only shifts out zero bits, so:
+ // icmp eq/ne (shr X, Y), 0 --> icmp eq/ne X, 0
+ Value *X = Shr->getOperand(0);
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Cmp.isEquality() && Shr->isExact() && Shr->hasOneUse() &&
+ C.isNullValue())
+ return new ICmpInst(Pred, X, Cmp.getOperand(1));
+
+ const APInt *ShiftVal;
+ if (Cmp.isEquality() && match(Shr->getOperand(0), m_APInt(ShiftVal)))
+ return foldICmpShrConstConst(Cmp, Shr->getOperand(1), C, *ShiftVal);
+
+ const APInt *ShiftAmt;
+ if (!match(Shr->getOperand(1), m_APInt(ShiftAmt)))
+ return nullptr;
+
+ // Check that the shift amount is in range. If not, don't perform undefined
+ // shifts. When the shift is visited it will be simplified.
+ unsigned TypeBits = C.getBitWidth();
+ unsigned ShAmtVal = ShiftAmt->getLimitedValue(TypeBits);
+ if (ShAmtVal >= TypeBits || ShAmtVal == 0)
+ return nullptr;
+
+ bool IsAShr = Shr->getOpcode() == Instruction::AShr;
+ bool IsExact = Shr->isExact();
+ Type *ShrTy = Shr->getType();
+ // TODO: If we could guarantee that InstSimplify would handle all of the
+ // constant-value-based preconditions in the folds below, then we could assert
+ // those conditions rather than checking them. This is difficult because of
+ // undef/poison (PR34838).
+ if (IsAShr) {
+ if (Pred == CmpInst::ICMP_SLT || (Pred == CmpInst::ICMP_SGT && IsExact)) {
+ // icmp slt (ashr X, ShAmtC), C --> icmp slt X, (C << ShAmtC)
+ // icmp sgt (ashr exact X, ShAmtC), C --> icmp sgt X, (C << ShAmtC)
+ APInt ShiftedC = C.shl(ShAmtVal);
+ if (ShiftedC.ashr(ShAmtVal) == C)
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ if (Pred == CmpInst::ICMP_SGT) {
+ // icmp sgt (ashr X, ShAmtC), C --> icmp sgt X, ((C + 1) << ShAmtC) - 1
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if (!C.isMaxSignedValue() && !(C + 1).shl(ShAmtVal).isMinSignedValue() &&
+ (ShiftedC + 1).ashr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
// If the compare constant has significant bits above the lowest sign-bit,
// then convert an unsigned cmp to a test of the sign-bit:
@@ -2225,841 +2225,841 @@ Instruction *InstCombinerImpl::foldICmpShrConstant(ICmpInst &Cmp,
ConstantInt::getAllOnesValue(ShrTy));
}
}
- } else {
- if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
- // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
- // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
- APInt ShiftedC = C.shl(ShAmtVal);
- if (ShiftedC.lshr(ShAmtVal) == C)
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- if (Pred == CmpInst::ICMP_UGT) {
- // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
- APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
- if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
- }
- }
-
- if (!Cmp.isEquality())
- return nullptr;
-
- // Handle equality comparisons of shift-by-constant.
-
- // If the comparison constant changes with the shift, the comparison cannot
- // succeed (bits of the comparison constant cannot match the shifted value).
- // This should be known by InstSimplify and already be folded to true/false.
- assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
- (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
- "Expected icmp+shr simplify did not occur.");
-
- // If the bits shifted out are known zero, compare the unshifted value:
- // (X & 4) >> 1 == 2 --> (X & 4) == 4.
- if (Shr->isExact())
- return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
-
- if (Shr->hasOneUse()) {
- // Canonicalize the shift into an 'and':
- // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
- APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
- Constant *Mask = ConstantInt::get(ShrTy, Val);
- Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
- return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
- }
-
- return nullptr;
-}
-
+ } else {
+ if (Pred == CmpInst::ICMP_ULT || (Pred == CmpInst::ICMP_UGT && IsExact)) {
+ // icmp ult (lshr X, ShAmtC), C --> icmp ult X, (C << ShAmtC)
+ // icmp ugt (lshr exact X, ShAmtC), C --> icmp ugt X, (C << ShAmtC)
+ APInt ShiftedC = C.shl(ShAmtVal);
+ if (ShiftedC.lshr(ShAmtVal) == C)
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ if (Pred == CmpInst::ICMP_UGT) {
+ // icmp ugt (lshr X, ShAmtC), C --> icmp ugt X, ((C + 1) << ShAmtC) - 1
+ APInt ShiftedC = (C + 1).shl(ShAmtVal) - 1;
+ if ((ShiftedC + 1).lshr(ShAmtVal) == (C + 1))
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, ShiftedC));
+ }
+ }
+
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ // Handle equality comparisons of shift-by-constant.
+
+ // If the comparison constant changes with the shift, the comparison cannot
+ // succeed (bits of the comparison constant cannot match the shifted value).
+ // This should be known by InstSimplify and already be folded to true/false.
+ assert(((IsAShr && C.shl(ShAmtVal).ashr(ShAmtVal) == C) ||
+ (!IsAShr && C.shl(ShAmtVal).lshr(ShAmtVal) == C)) &&
+ "Expected icmp+shr simplify did not occur.");
+
+ // If the bits shifted out are known zero, compare the unshifted value:
+ // (X & 4) >> 1 == 2 --> (X & 4) == 4.
+ if (Shr->isExact())
+ return new ICmpInst(Pred, X, ConstantInt::get(ShrTy, C << ShAmtVal));
+
+ if (Shr->hasOneUse()) {
+ // Canonicalize the shift into an 'and':
+ // icmp eq/ne (shr X, ShAmt), C --> icmp eq/ne (and X, HiMask), (C << ShAmt)
+ APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+ Constant *Mask = ConstantInt::get(ShrTy, Val);
+ Value *And = Builder.CreateAnd(X, Mask, Shr->getName() + ".mask");
+ return new ICmpInst(Pred, And, ConstantInt::get(ShrTy, C << ShAmtVal));
+ }
+
+ return nullptr;
+}
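
The unsigned branch above rewrites icmp ult (lshr X, ShAmtC), C into icmp ult X, (C << ShAmtC), and the strict ugt form into a compare against ((C + 1) << ShAmtC) - 1, in both cases only when the shifted constant survives the round-trip guard. A minimal standalone sketch of those two identities at a width of 8 bits; plain C++ arithmetic standing in for the APInt math, not LLVM code, and the names UltRHS/UgtRHS are local to the example:

// Exhaustive 8-bit check of the two unsigned lshr rewrites above (illustrative).
#include <cassert>

int main() {
  for (unsigned Sh = 0; Sh < 8; ++Sh) {
    for (unsigned C = 0; C < 256; ++C) {
      unsigned UltRHS = (C << Sh) & 0xff;             // C << ShAmtC (8-bit wrap)
      bool UltOK = (UltRHS >> Sh) == C;               // ShiftedC.lshr(ShAmt) == C
      unsigned UgtRHS = (((C + 1) << Sh) - 1) & 0xff; // ((C + 1) << ShAmtC) - 1
      bool UgtOK = (((UgtRHS + 1) & 0xff) >> Sh) == ((C + 1) & 0xff);
      for (unsigned X = 0; X < 256; ++X) {
        if (UltOK)
          assert(((X >> Sh) < C) == (X < UltRHS));    // ult form
        if (UgtOK)
          assert(((X >> Sh) > C) == (X > UgtRHS));    // ugt form
      }
    }
  }
  return 0;
}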
+
Instruction *InstCombinerImpl::foldICmpSRemConstant(ICmpInst &Cmp,
BinaryOperator *SRem,
const APInt &C) {
- // Match an 'is positive' or 'is negative' comparison of remainder by a
- // constant power-of-2 value:
- // (X % pow2C) sgt/slt 0
- const ICmpInst::Predicate Pred = Cmp.getPredicate();
- if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT)
- return nullptr;
-
- // TODO: The one-use check is standard because we do not typically want to
- // create longer instruction sequences, but this might be a special-case
- // because srem is not good for analysis or codegen.
- if (!SRem->hasOneUse())
- return nullptr;
-
- const APInt *DivisorC;
- if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
- return nullptr;
-
- // Mask off the sign bit and the modulo bits (low-bits).
- Type *Ty = SRem->getType();
- APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits());
- Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1));
- Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC);
-
- // For 'is positive?' check that the sign-bit is clear and at least 1 masked
- // bit is set. Example:
- // (i8 X % 32) s> 0 --> (X & 159) s> 0
- if (Pred == ICmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty));
-
- // For 'is negative?' check that the sign-bit is set and at least 1 masked
- // bit is set. Example:
- // (i16 X % 4) s< 0 --> (X & 32771) u> 32768
- return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask));
-}
-
-/// Fold icmp (udiv X, Y), C.
+ // Match an 'is positive' or 'is negative' comparison of remainder by a
+ // constant power-of-2 value:
+ // (X % pow2C) sgt/slt 0
+ const ICmpInst::Predicate Pred = Cmp.getPredicate();
+ if (Pred != ICmpInst::ICMP_SGT && Pred != ICmpInst::ICMP_SLT)
+ return nullptr;
+
+ // TODO: The one-use check is standard because we do not typically want to
+ // create longer instruction sequences, but this might be a special-case
+ // because srem is not good for analysis or codegen.
+ if (!SRem->hasOneUse())
+ return nullptr;
+
+ const APInt *DivisorC;
+ if (!C.isNullValue() || !match(SRem->getOperand(1), m_Power2(DivisorC)))
+ return nullptr;
+
+ // Mask off the sign bit and the modulo bits (low-bits).
+ Type *Ty = SRem->getType();
+ APInt SignMask = APInt::getSignMask(Ty->getScalarSizeInBits());
+ Constant *MaskC = ConstantInt::get(Ty, SignMask | (*DivisorC - 1));
+ Value *And = Builder.CreateAnd(SRem->getOperand(0), MaskC);
+
+ // For 'is positive?' check that the sign-bit is clear and at least 1 masked
+ // bit is set. Example:
+ // (i8 X % 32) s> 0 --> (X & 159) s> 0
+ if (Pred == ICmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, And, ConstantInt::getNullValue(Ty));
+
+ // For 'is negative?' check that the sign-bit is set and at least 1 masked
+ // bit is set. Example:
+ // (i16 X % 4) s< 0 --> (X & 32771) u> 32768
+ return new ICmpInst(ICmpInst::ICMP_UGT, And, ConstantInt::get(Ty, SignMask));
+}
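
The srem fold above replaces a sign test of X % pow2C with a single mask-and-compare against SignMask | (pow2C - 1). A small exhaustive sketch of the two stated equivalences at i8; plain C++ (its signed % has the same semantics as srem for these operands), not LLVM code:

// Exhaustive i8 check of the srem sign-test rewrites above (illustrative).
#include <cassert>
#include <cstdint>

int main() {
  for (int D = 2; D <= 64; D <<= 1) {                 // power-of-2 divisors > 1
    unsigned Mask = 0x80u | unsigned(D - 1);          // SignMask | (DivisorC - 1)
    for (int X = -128; X <= 127; ++X) {
      unsigned And = unsigned(uint8_t(X)) & Mask;     // X & MaskC, as an 8-bit value
      // (X % pow2C) s> 0  <=>  (X & MaskC) s> 0
      assert((X % D > 0) == (int8_t(And) > 0));
      // (X % pow2C) s< 0  <=>  (X & MaskC) u> SignMask
      assert((X % D < 0) == (And > 0x80u));
    }
  }
  return 0;
}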
+
+/// Fold icmp (udiv X, Y), C.
Instruction *InstCombinerImpl::foldICmpUDivConstant(ICmpInst &Cmp,
BinaryOperator *UDiv,
const APInt &C) {
- const APInt *C2;
- if (!match(UDiv->getOperand(0), m_APInt(C2)))
- return nullptr;
-
- assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
-
- // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
- Value *Y = UDiv->getOperand(1);
- if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
- assert(!C.isMaxValue() &&
- "icmp ugt X, UINT_MAX should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_ULE, Y,
- ConstantInt::get(Y->getType(), C2->udiv(C + 1)));
- }
-
- // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
- if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
- assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
- return new ICmpInst(ICmpInst::ICMP_UGT, Y,
- ConstantInt::get(Y->getType(), C2->udiv(C)));
- }
-
- return nullptr;
-}
-
-/// Fold icmp ({su}div X, Y), C.
+ const APInt *C2;
+ if (!match(UDiv->getOperand(0), m_APInt(C2)))
+ return nullptr;
+
+ assert(*C2 != 0 && "udiv 0, X should have been simplified already.");
+
+ // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1))
+ Value *Y = UDiv->getOperand(1);
+ if (Cmp.getPredicate() == ICmpInst::ICMP_UGT) {
+ assert(!C.isMaxValue() &&
+ "icmp ugt X, UINT_MAX should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_ULE, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(C + 1)));
+ }
+
+ // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C)
+ if (Cmp.getPredicate() == ICmpInst::ICMP_ULT) {
+ assert(C != 0 && "icmp ult X, 0 should have been simplified already.");
+ return new ICmpInst(ICmpInst::ICMP_UGT, Y,
+ ConstantInt::get(Y->getType(), C2->udiv(C)));
+ }
+
+ return nullptr;
+}
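
With a constant numerator, udiv C2, Y is non-increasing in Y, which is what lets the compare move onto Y as above. A brute-force sketch of the two rewrites over 8-bit operands; plain C++, not LLVM code, with Y == 0 skipped just as the original udiv excludes it and the two guards mirroring the asserts in the function:

// Exhaustive 8-bit check of the udiv-by-variable rewrites above (illustrative).
#include <cassert>

int main() {
  for (unsigned C2 = 1; C2 < 256; ++C2) {     // constant numerator, non-zero
    for (unsigned Y = 1; Y < 256; ++Y) {      // variable divisor, non-zero
      for (unsigned C = 0; C < 256; ++C) {
        // (icmp ugt (udiv C2, Y), C) -> (icmp ule Y, C2/(C+1)), C != UINT_MAX
        if (C != 255)
          assert((C2 / Y > C) == (Y <= C2 / (C + 1)));
        // (icmp ult (udiv C2, Y), C) -> (icmp ugt Y, C2/C), C != 0
        if (C != 0)
          assert((C2 / Y < C) == (Y > C2 / C));
      }
    }
  }
  return 0;
}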
+
+/// Fold icmp ({su}div X, Y), C.
Instruction *InstCombinerImpl::foldICmpDivConstant(ICmpInst &Cmp,
BinaryOperator *Div,
const APInt &C) {
- // Fold: icmp pred ([us]div X, C2), C -> range test
- // Fold this div into the comparison, producing a range check.
- // Determine, based on the divide type, what the range is being
- // checked. If there is an overflow on the low or high side, remember
- // it, otherwise compute the range [low, hi) bounding the new value.
- // See: InsertRangeTest above for the kinds of replacements possible.
- const APInt *C2;
- if (!match(Div->getOperand(1), m_APInt(C2)))
- return nullptr;
-
- // FIXME: If the operand types don't match the type of the divide
- // then don't attempt this transform. The code below doesn't have the
- // logic to deal with a signed divide and an unsigned compare (and
- // vice versa). This is because (x /s C2) <s C produces different
- // results than (x /s C2) <u C or (x /u C2) <s C or even
- // (x /u C2) <u C. Simply casting the operands and result won't
- // work. :( The if statement below tests that condition and bails
- // if it finds it.
- bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
- if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
- return nullptr;
-
- // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
- // INT_MIN will also fail if the divisor is 1. Although folds of all these
- // division-by-constant cases should be present, we cannot assert that they
- // have happened before we reach this icmp instruction.
- if (C2->isNullValue() || C2->isOneValue() ||
- (DivIsSigned && C2->isAllOnesValue()))
- return nullptr;
-
- // Compute Prod = C * C2. We are essentially solving an equation of
- // form X / C2 = C. We solve for X by multiplying C2 and C.
- // By solving for X, we can turn this into a range check instead of computing
- // a divide.
- APInt Prod = C * *C2;
-
- // Determine if the product overflows by seeing if the product is not equal to
- // the divide. Make sure we do the same kind of divide as in the LHS
- // instruction that we're folding.
- bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
-
- // If the division is known to be exact, then there is no remainder from the
- // divide, so the covered range size is unit, otherwise it is the divisor.
- APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
-
- // Figure out the interval that is being checked. For example, a comparison
- // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
- // Compute this interval based on the constants involved and the signedness of
- // the compare/divide. This computes a half-open interval, keeping track of
- // whether either value in the interval overflows. After analysis, each
- // overflow variable is set to 0 if its corresponding bound variable is valid,
- // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
- int LoOverflow = 0, HiOverflow = 0;
- APInt LoBound, HiBound;
-
- if (!DivIsSigned) { // udiv
- // e.g. X/5 op 3 --> [15, 20)
- LoBound = Prod;
- HiOverflow = LoOverflow = ProdOV;
- if (!HiOverflow) {
- // If this is not an exact divide, then many values in the range collapse
- // to the same result value.
- HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
- }
- } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
- if (C.isNullValue()) { // (X / pos) op 0
- // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
- LoBound = -(RangeSize - 1);
- HiBound = RangeSize;
- } else if (C.isStrictlyPositive()) { // (X / pos) op pos
- LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
- HiOverflow = LoOverflow = ProdOV;
- if (!HiOverflow)
- HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
- } else { // (X / pos) op neg
- // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
- HiBound = Prod + 1;
- LoOverflow = HiOverflow = ProdOV ? -1 : 0;
- if (!LoOverflow) {
- APInt DivNeg = -RangeSize;
- LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
- }
- }
- } else if (C2->isNegative()) { // Divisor is < 0.
- if (Div->isExact())
- RangeSize.negate();
- if (C.isNullValue()) { // (X / neg) op 0
- // e.g. X/-5 op 0 --> [-4, 5)
- LoBound = RangeSize + 1;
- HiBound = -RangeSize;
- if (HiBound == *C2) { // -INTMIN = INTMIN
- HiOverflow = 1; // [INTMIN+1, overflow)
- HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
- }
- } else if (C.isStrictlyPositive()) { // (X / neg) op pos
- // e.g. X/-5 op 3 --> [-19, -14)
- HiBound = Prod + 1;
- HiOverflow = LoOverflow = ProdOV ? -1 : 0;
- if (!LoOverflow)
- LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
- } else { // (X / neg) op neg
- LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
- LoOverflow = HiOverflow = ProdOV;
- if (!HiOverflow)
- HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
- }
-
- // Dividing by a negative swaps the condition. LT <-> GT
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- Value *X = Div->getOperand(0);
- switch (Pred) {
- default: llvm_unreachable("Unhandled icmp opcode!");
- case ICmpInst::ICMP_EQ:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), LoBound));
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X,
- ConstantInt::get(Div->getType(), HiBound));
- return replaceInstUsesWith(
- Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
- case ICmpInst::ICMP_NE:
- if (LoOverflow && HiOverflow)
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (HiOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
- ICmpInst::ICMP_ULT, X,
- ConstantInt::get(Div->getType(), LoBound));
- if (LoOverflow)
- return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
- ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- return replaceInstUsesWith(Cmp,
- insertRangeTest(X, LoBound, HiBound,
- DivIsSigned, false));
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_SLT:
- if (LoOverflow == +1) // Low bound is greater than input range.
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (LoOverflow == -1) // Low bound is less than input range.
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_SGT:
- if (HiOverflow == +1) // High bound greater than input range.
- return replaceInstUsesWith(Cmp, Builder.getFalse());
- if (HiOverflow == -1) // High bound less than input range.
- return replaceInstUsesWith(Cmp, Builder.getTrue());
- if (Pred == ICmpInst::ICMP_UGT)
- return new ICmpInst(ICmpInst::ICMP_UGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- return new ICmpInst(ICmpInst::ICMP_SGE, X,
- ConstantInt::get(Div->getType(), HiBound));
- }
-
- return nullptr;
-}
-
-/// Fold icmp (sub X, Y), C.
+ // Fold: icmp pred ([us]div X, C2), C -> range test
+ // Fold this div into the comparison, producing a range check.
+ // Determine, based on the divide type, what the range is being
+ // checked. If there is an overflow on the low or high side, remember
+ // it, otherwise compute the range [low, hi) bounding the new value.
+ // See: InsertRangeTest above for the kinds of replacements possible.
+ const APInt *C2;
+ if (!match(Div->getOperand(1), m_APInt(C2)))
+ return nullptr;
+
+ // FIXME: If the operand types don't match the type of the divide
+ // then don't attempt this transform. The code below doesn't have the
+ // logic to deal with a signed divide and an unsigned compare (and
+ // vice versa). This is because (x /s C2) <s C produces different
+ // results than (x /s C2) <u C or (x /u C2) <s C or even
+ // (x /u C2) <u C. Simply casting the operands and result won't
+ // work. :( The if statement below tests that condition and bails
+ // if it finds it.
+ bool DivIsSigned = Div->getOpcode() == Instruction::SDiv;
+ if (!Cmp.isEquality() && DivIsSigned != Cmp.isSigned())
+ return nullptr;
+
+ // The ProdOV computation fails on divide by 0 and divide by -1. Cases with
+ // INT_MIN will also fail if the divisor is 1. Although folds of all these
+ // division-by-constant cases should be present, we cannot assert that they
+ // have happened before we reach this icmp instruction.
+ if (C2->isNullValue() || C2->isOneValue() ||
+ (DivIsSigned && C2->isAllOnesValue()))
+ return nullptr;
+
+ // Compute Prod = C * C2. We are essentially solving an equation of
+ // form X / C2 = C. We solve for X by multiplying C2 and C.
+ // By solving for X, we can turn this into a range check instead of computing
+ // a divide.
+ APInt Prod = C * *C2;
+
+ // Determine if the product overflows by seeing if the product is not equal to
+ // the divide. Make sure we do the same kind of divide as in the LHS
+ // instruction that we're folding.
+ bool ProdOV = (DivIsSigned ? Prod.sdiv(*C2) : Prod.udiv(*C2)) != C;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the division is known to be exact, then there is no remainder from the
+ // divide, so the covered range size is unit, otherwise it is the divisor.
+ APInt RangeSize = Div->isExact() ? APInt(C2->getBitWidth(), 1) : *C2;
+
+ // Figure out the interval that is being checked. For example, a comparison
+ // like "X /u 5 == 0" is really checking that X is in the interval [0, 5).
+ // Compute this interval based on the constants involved and the signedness of
+ // the compare/divide. This computes a half-open interval, keeping track of
+ // whether either value in the interval overflows. After analysis, each
+ // overflow variable is set to 0 if its corresponding bound variable is valid,
+ // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
+ int LoOverflow = 0, HiOverflow = 0;
+ APInt LoBound, HiBound;
+
+ if (!DivIsSigned) { // udiv
+ // e.g. X/5 op 3 --> [15, 20)
+ LoBound = Prod;
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow) {
+ // If this is not an exact divide, then many values in the range collapse
+ // to the same result value.
+ HiOverflow = addWithOverflow(HiBound, LoBound, RangeSize, false);
+ }
+ } else if (C2->isStrictlyPositive()) { // Divisor is > 0.
+ if (C.isNullValue()) { // (X / pos) op 0
+ // Can't overflow. e.g. X/2 op 0 --> [-1, 2)
+ LoBound = -(RangeSize - 1);
+ HiBound = RangeSize;
+ } else if (C.isStrictlyPositive()) { // (X / pos) op pos
+ LoBound = Prod; // e.g. X/5 op 3 --> [15, 20)
+ HiOverflow = LoOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = addWithOverflow(HiBound, Prod, RangeSize, true);
+ } else { // (X / pos) op neg
+ // e.g. X/5 op -3 --> [-15-4, -15+1) --> [-19, -14)
+ HiBound = Prod + 1;
+ LoOverflow = HiOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow) {
+ APInt DivNeg = -RangeSize;
+ LoOverflow = addWithOverflow(LoBound, HiBound, DivNeg, true) ? -1 : 0;
+ }
+ }
+ } else if (C2->isNegative()) { // Divisor is < 0.
+ if (Div->isExact())
+ RangeSize.negate();
+ if (C.isNullValue()) { // (X / neg) op 0
+ // e.g. X/-5 op 0 --> [-4, 5)
+ LoBound = RangeSize + 1;
+ HiBound = -RangeSize;
+ if (HiBound == *C2) { // -INTMIN = INTMIN
+ HiOverflow = 1; // [INTMIN+1, overflow)
+ HiBound = APInt(); // e.g. X/INTMIN = 0 --> X > INTMIN
+ }
+ } else if (C.isStrictlyPositive()) { // (X / neg) op pos
+ // e.g. X/-5 op 3 --> [-19, -14)
+ HiBound = Prod + 1;
+ HiOverflow = LoOverflow = ProdOV ? -1 : 0;
+ if (!LoOverflow)
+ LoOverflow = addWithOverflow(LoBound, HiBound, RangeSize, true) ? -1:0;
+ } else { // (X / neg) op neg
+ LoBound = Prod; // e.g. X/-5 op -3 --> [15, 20)
+ LoOverflow = HiOverflow = ProdOV;
+ if (!HiOverflow)
+ HiOverflow = subWithOverflow(HiBound, Prod, RangeSize, true);
+ }
+
+ // Dividing by a negative swaps the condition. LT <-> GT
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ Value *X = Div->getOperand(0);
+ switch (Pred) {
+ default: llvm_unreachable("Unhandled icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), LoBound));
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return replaceInstUsesWith(
+ Cmp, insertRangeTest(X, LoBound, HiBound, DivIsSigned, true));
+ case ICmpInst::ICMP_NE:
+ if (LoOverflow && HiOverflow)
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (HiOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT :
+ ICmpInst::ICMP_ULT, X,
+ ConstantInt::get(Div->getType(), LoBound));
+ if (LoOverflow)
+ return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE :
+ ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return replaceInstUsesWith(Cmp,
+ insertRangeTest(X, LoBound, HiBound,
+ DivIsSigned, false));
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_SLT:
+ if (LoOverflow == +1) // Low bound is greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (LoOverflow == -1) // Low bound is less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ return new ICmpInst(Pred, X, ConstantInt::get(Div->getType(), LoBound));
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_SGT:
+ if (HiOverflow == +1) // High bound greater than input range.
+ return replaceInstUsesWith(Cmp, Builder.getFalse());
+ if (HiOverflow == -1) // High bound less than input range.
+ return replaceInstUsesWith(Cmp, Builder.getTrue());
+ if (Pred == ICmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ return new ICmpInst(ICmpInst::ICMP_SGE, X,
+ ConstantInt::get(Div->getType(), HiBound));
+ }
+
+ return nullptr;
+}
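
All of the case analysis above boils down to replacing a compare of X /[us] C2 against C with a range test on X around Prod = C * C2, with LoOverflow/HiOverflow recording when a bound falls off the end of the type. The unsigned shape from the comment (X /u 5 == 0 iff X in [0, 5)) is easy to confirm exhaustively; a plain C++ sketch, not LLVM code, with the bounds computed in a wider type so an out-of-range interval simply becomes empty:

// Exhaustive 8-bit check of the udiv-to-range-test rewrite (illustrative).
#include <cassert>

int main() {
  for (unsigned C2 = 2; C2 < 256; ++C2) {     // divisor; 0 and 1 bail out earlier
    for (unsigned C = 0; C < 256; ++C) {
      unsigned Lo = C * C2;                   // Prod
      unsigned Hi = Lo + C2;                  // Prod + RangeSize
      for (unsigned X = 0; X < 256; ++X)
        // X /u C2 == C  <=>  X in [Lo, Hi)
        assert((X / C2 == C) == (X >= Lo && X < Hi));
    }
  }
  return 0;
}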
+
+/// Fold icmp (sub X, Y), C.
Instruction *InstCombinerImpl::foldICmpSubConstant(ICmpInst &Cmp,
BinaryOperator *Sub,
const APInt &C) {
- Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- const APInt *C2;
- APInt SubResult;
-
- // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0
- if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality())
- return new ICmpInst(Cmp.getPredicate(), Y,
- ConstantInt::get(Y->getType(), 0));
-
- // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
- if (match(X, m_APInt(C2)) &&
- ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
- (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
- !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
- return new ICmpInst(Cmp.getSwappedPredicate(), Y,
- ConstantInt::get(Y->getType(), SubResult));
-
- // The following transforms are only worth it if the only user of the subtract
- // is the icmp.
- if (!Sub->hasOneUse())
- return nullptr;
-
- if (Sub->hasNoSignedWrap()) {
- // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
- return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
-
- // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
- if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
- return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
-
- // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
- return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
-
- // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
- if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
- return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
- }
-
- if (!match(X, m_APInt(C2)))
- return nullptr;
-
- // C2 - Y <u C -> (Y | (C - 1)) == C2
- // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
- if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() &&
- (*C2 & (C - 1)) == (C - 1))
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X);
-
- // C2 - Y >u C -> (Y | C) != C2
- // iff C2 & C == C and C + 1 is a power of 2
- if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);
-
- return nullptr;
-}
-
-/// Fold icmp (add X, Y), C.
+ Value *X = Sub->getOperand(0), *Y = Sub->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ const APInt *C2;
+ APInt SubResult;
+
+ // icmp eq/ne (sub C, Y), C -> icmp eq/ne Y, 0
+ if (match(X, m_APInt(C2)) && *C2 == C && Cmp.isEquality())
+ return new ICmpInst(Cmp.getPredicate(), Y,
+ ConstantInt::get(Y->getType(), 0));
+
+ // (icmp P (sub nuw|nsw C2, Y), C) -> (icmp swap(P) Y, C2-C)
+ if (match(X, m_APInt(C2)) &&
+ ((Cmp.isUnsigned() && Sub->hasNoUnsignedWrap()) ||
+ (Cmp.isSigned() && Sub->hasNoSignedWrap())) &&
+ !subWithOverflow(SubResult, *C2, C, Cmp.isSigned()))
+ return new ICmpInst(Cmp.getSwappedPredicate(), Y,
+ ConstantInt::get(Y->getType(), SubResult));
+
+ // The following transforms are only worth it if the only user of the subtract
+ // is the icmp.
+ if (!Sub->hasOneUse())
+ return nullptr;
+
+ if (Sub->hasNoSignedWrap()) {
+ // (icmp sgt (sub nsw X, Y), -1) -> (icmp sge X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C.isAllOnesValue())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // (icmp sgt (sub nsw X, Y), 0) -> (icmp sgt X, Y)
+ if (Pred == ICmpInst::ICMP_SGT && C.isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 0) -> (icmp slt X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C.isNullValue())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // (icmp slt (sub nsw X, Y), 1) -> (icmp sle X, Y)
+ if (Pred == ICmpInst::ICMP_SLT && C.isOneValue())
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+ }
+
+ if (!match(X, m_APInt(C2)))
+ return nullptr;
+
+ // C2 - Y <u C -> (Y | (C - 1)) == C2
+ // iff (C2 & (C - 1)) == C - 1 and C is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() &&
+ (*C2 & (C - 1)) == (C - 1))
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateOr(Y, C - 1), X);
+
+ // C2 - Y >u C -> (Y | C) != C2
+ // iff C2 & C == C and C + 1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == C)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateOr(Y, C), X);
+
+ return nullptr;
+}
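
The last two folds above turn a compare of the wrapping difference C2 - Y into a bit test on Y alone. An exhaustive 8-bit sketch of both, including their side conditions; plain C++ with the subtraction done modulo 256 to match the wrapping sub, not LLVM code:

// Exhaustive 8-bit check of the two (sub C2, Y) bit-test rewrites (illustrative).
#include <cassert>

static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

int main() {
  for (unsigned C2 = 0; C2 < 256; ++C2) {
    for (unsigned C = 0; C < 256; ++C) {
      for (unsigned Y = 0; Y < 256; ++Y) {
        unsigned Sub = (C2 - Y) & 0xff;       // wrapping C2 - Y
        // C2 - Y <u C -> (Y | (C-1)) == C2, iff (C2 & (C-1)) == C-1, C a power of 2
        if (isPow2(C) && (C2 & (C - 1)) == (C - 1))
          assert((Sub < C) == ((Y | (C - 1)) == C2));
        // C2 - Y >u C -> (Y | C) != C2, iff (C2 & C) == C, C+1 a power of 2
        if (isPow2((C + 1) & 0xff) && (C2 & C) == C)
          assert((Sub > C) == ((Y | C) != C2));
      }
    }
  }
  return 0;
}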
+
+/// Fold icmp (add X, Y), C.
Instruction *InstCombinerImpl::foldICmpAddConstant(ICmpInst &Cmp,
BinaryOperator *Add,
const APInt &C) {
- Value *Y = Add->getOperand(1);
- const APInt *C2;
- if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
- return nullptr;
-
- // Fold icmp pred (add X, C2), C.
- Value *X = Add->getOperand(0);
- Type *Ty = Add->getType();
- CmpInst::Predicate Pred = Cmp.getPredicate();
-
- // If the add does not wrap, we can always adjust the compare by subtracting
- // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
- // are canonicalized to SGT/SLT/UGT/ULT.
- if ((Add->hasNoSignedWrap() &&
- (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
- (Add->hasNoUnsignedWrap() &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
- bool Overflow;
- APInt NewC =
- Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
- // If there is overflow, the result must be true or false.
- // TODO: Can we assert there is no overflow because InstSimplify always
- // handles those cases?
- if (!Overflow)
- // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
- return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
- }
-
- auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
- const APInt &Upper = CR.getUpper();
- const APInt &Lower = CR.getLower();
- if (Cmp.isSigned()) {
- if (Lower.isSignMask())
- return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
- if (Upper.isSignMask())
- return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
- } else {
- if (Lower.isMinValue())
- return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
- if (Upper.isMinValue())
- return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
- }
-
- if (!Add->hasOneUse())
- return nullptr;
-
- // X+C <u C2 -> (X & -C2) == -C
- // iff C & (C2-1) == 0
- // C2 is a power of 2
- if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0)
- return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C),
- ConstantExpr::getNeg(cast<Constant>(Y)));
-
- // X+C >u C2 -> (X & ~C2) != -C
- // iff C & C2 == 0
- // C2+1 is a power of 2
- if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0)
- return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
- ConstantExpr::getNeg(cast<Constant>(Y)));
-
- return nullptr;
-}
-
+ Value *Y = Add->getOperand(1);
+ const APInt *C2;
+ if (Cmp.isEquality() || !match(Y, m_APInt(C2)))
+ return nullptr;
+
+ // Fold icmp pred (add X, C2), C.
+ Value *X = Add->getOperand(0);
+ Type *Ty = Add->getType();
+ CmpInst::Predicate Pred = Cmp.getPredicate();
+
+ // If the add does not wrap, we can always adjust the compare by subtracting
+ // the constants. Equality comparisons are handled elsewhere. SGE/SLE/UGE/ULE
+ // are canonicalized to SGT/SLT/UGT/ULT.
+ if ((Add->hasNoSignedWrap() &&
+ (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) ||
+ (Add->hasNoUnsignedWrap() &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULT))) {
+ bool Overflow;
+ APInt NewC =
+ Cmp.isSigned() ? C.ssub_ov(*C2, Overflow) : C.usub_ov(*C2, Overflow);
+ // If there is overflow, the result must be true or false.
+ // TODO: Can we assert there is no overflow because InstSimplify always
+ // handles those cases?
+ if (!Overflow)
+ // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+ return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+ }
+
+ auto CR = ConstantRange::makeExactICmpRegion(Pred, C).subtract(*C2);
+ const APInt &Upper = CR.getUpper();
+ const APInt &Lower = CR.getLower();
+ if (Cmp.isSigned()) {
+ if (Lower.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isSignMask())
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, ConstantInt::get(Ty, Lower));
+ } else {
+ if (Lower.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, ConstantInt::get(Ty, Upper));
+ if (Upper.isMinValue())
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, ConstantInt::get(Ty, Lower));
+ }
+
+ if (!Add->hasOneUse())
+ return nullptr;
+
+ // X+C <u C2 -> (X & -C2) == -C
+ // iff C & (C2-1) == 0
+ // C2 is a power of 2
+ if (Pred == ICmpInst::ICMP_ULT && C.isPowerOf2() && (*C2 & (C - 1)) == 0)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Builder.CreateAnd(X, -C),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ // X+C >u C2 -> (X & ~C2) != -C
+ // iff C & C2 == 0
+ // C2+1 is a power of 2
+ if (Pred == ICmpInst::ICMP_UGT && (C + 1).isPowerOf2() && (*C2 & C) == 0)
+ return new ICmpInst(ICmpInst::ICMP_NE, Builder.CreateAnd(X, ~C),
+ ConstantExpr::getNeg(cast<Constant>(Y)));
+
+ return nullptr;
+}
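
Two pieces above are worth seeing concretely. The no-wrap case simply moves the add constant across the compare; the two power-of-2 cases at the end replace the range test with a mask compare. A sketch of the masked forms at 8 bits, written with the function's own variable names (C is the compare constant, C2 the add operand) and with all arithmetic modulo 256 to match the wrapping add; plain C++, not LLVM code:

// Exhaustive 8-bit check of the two masked (add X, C2) rewrites above (illustrative).
#include <cassert>

static bool isPow2(unsigned V) { return V && (V & (V - 1)) == 0; }

int main() {
  for (unsigned C2 = 0; C2 < 256; ++C2) {
    unsigned NegC2 = (256 - C2) & 0xff;                 // -C2 (mod 256)
    for (unsigned C = 0; C < 256; ++C) {
      for (unsigned X = 0; X < 256; ++X) {
        unsigned Add = (X + C2) & 0xff;                 // wrapping X + C2
        // X + C2 <u C  ->  (X & -C) == -C2, iff C is a power of 2, (C2 & (C-1)) == 0
        if (isPow2(C) && (C2 & (C - 1)) == 0)
          assert((Add < C) == ((X & ((256 - C) & 0xff)) == NegC2));
        // X + C2 >u C  ->  (X & ~C) != -C2, iff C+1 is a power of 2, (C2 & C) == 0
        if (isPow2((C + 1) & 0xff) && (C2 & C) == 0)
          assert((Add > C) == ((X & (~C & 0xffu)) != NegC2));
      }
    }
  }
  return 0;
}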
+
bool InstCombinerImpl::matchThreeWayIntCompare(SelectInst *SI, Value *&LHS,
Value *&RHS, ConstantInt *&Less,
ConstantInt *&Equal,
ConstantInt *&Greater) {
- // TODO: Generalize this to work with other comparison idioms or ensure
- // they get canonicalized into this form.
-
- // select i1 (a == b),
- // i32 Equal,
- // i32 (select i1 (a < b), i32 Less, i32 Greater)
- // where Equal, Less and Greater are placeholders for any three constants.
- ICmpInst::Predicate PredA;
- if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) ||
- !ICmpInst::isEquality(PredA))
- return false;
- Value *EqualVal = SI->getTrueValue();
- Value *UnequalVal = SI->getFalseValue();
- // We still can get non-canonical predicate here, so canonicalize.
- if (PredA == ICmpInst::ICMP_NE)
- std::swap(EqualVal, UnequalVal);
- if (!match(EqualVal, m_ConstantInt(Equal)))
- return false;
- ICmpInst::Predicate PredB;
- Value *LHS2, *RHS2;
- if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)),
- m_ConstantInt(Less), m_ConstantInt(Greater))))
- return false;
- // We can get predicate mismatch here, so canonicalize if possible:
- // First, ensure that the 'LHS' operands match.
- if (LHS2 != LHS) {
- // x sgt y <--> y slt x
- std::swap(LHS2, RHS2);
- PredB = ICmpInst::getSwappedPredicate(PredB);
- }
- if (LHS2 != LHS)
- return false;
- // We also need to canonicalize 'RHS'.
- if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
- // x sgt C-1 <--> x sge C <--> not(x slt C)
- auto FlippedStrictness =
+ // TODO: Generalize this to work with other comparison idioms or ensure
+ // they get canonicalized into this form.
+
+ // select i1 (a == b),
+ // i32 Equal,
+ // i32 (select i1 (a < b), i32 Less, i32 Greater)
+ // where Equal, Less and Greater are placeholders for any three constants.
+ ICmpInst::Predicate PredA;
+ if (!match(SI->getCondition(), m_ICmp(PredA, m_Value(LHS), m_Value(RHS))) ||
+ !ICmpInst::isEquality(PredA))
+ return false;
+ Value *EqualVal = SI->getTrueValue();
+ Value *UnequalVal = SI->getFalseValue();
+ // We still can get non-canonical predicate here, so canonicalize.
+ if (PredA == ICmpInst::ICMP_NE)
+ std::swap(EqualVal, UnequalVal);
+ if (!match(EqualVal, m_ConstantInt(Equal)))
+ return false;
+ ICmpInst::Predicate PredB;
+ Value *LHS2, *RHS2;
+ if (!match(UnequalVal, m_Select(m_ICmp(PredB, m_Value(LHS2), m_Value(RHS2)),
+ m_ConstantInt(Less), m_ConstantInt(Greater))))
+ return false;
+ // We can get predicate mismatch here, so canonicalize if possible:
+ // First, ensure that the 'LHS' operands match.
+ if (LHS2 != LHS) {
+ // x sgt y <--> y slt x
+ std::swap(LHS2, RHS2);
+ PredB = ICmpInst::getSwappedPredicate(PredB);
+ }
+ if (LHS2 != LHS)
+ return false;
+ // We also need to canonicalize 'RHS'.
+ if (PredB == ICmpInst::ICMP_SGT && isa<Constant>(RHS2)) {
+ // x sgt C-1 <--> x sge C <--> not(x slt C)
+ auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(
PredB, cast<Constant>(RHS2));
- if (!FlippedStrictness)
- return false;
- assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
- RHS2 = FlippedStrictness->second;
- // And kind-of perform the result swap.
- std::swap(Less, Greater);
- PredB = ICmpInst::ICMP_SLT;
- }
- return PredB == ICmpInst::ICMP_SLT && RHS == RHS2;
-}
-
+ if (!FlippedStrictness)
+ return false;
+ assert(FlippedStrictness->first == ICmpInst::ICMP_SGE && "Sanity check");
+ RHS2 = FlippedStrictness->second;
+ // And kind-of perform the result swap.
+ std::swap(Less, Greater);
+ PredB = ICmpInst::ICMP_SLT;
+ }
+ return PredB == ICmpInst::ICMP_SLT && RHS == RHS2;
+}
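
For reference, the select-of-select shape matched above is the canonical lowering of the ordinary three-way compare idiom; a sketch of that source-level idiom (the exact IR still depends on the frontend and on earlier canonicalization passes):

// The idiom whose canonical IR matchThreeWayIntCompare recognizes:
// select(a == b, Equal, select(a slt b, Less, Greater)).
int threeWayCompare(int A, int B, int Less, int Equal, int Greater) {
  return A == B ? Equal : (A < B ? Less : Greater);
}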
+
Instruction *InstCombinerImpl::foldICmpSelectConstant(ICmpInst &Cmp,
SelectInst *Select,
ConstantInt *C) {
-
- assert(C && "Cmp RHS should be a constant int!");
- // If we're testing a constant value against the result of a three way
- // comparison, the result can be expressed directly in terms of the
- // original values being compared. Note: We could possibly be more
- // aggressive here and remove the hasOneUse test. The original select is
- // really likely to simplify or sink when we remove a test of the result.
- Value *OrigLHS, *OrigRHS;
- ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
- if (Cmp.hasOneUse() &&
- matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
- C3GreaterThan)) {
- assert(C1LessThan && C2Equal && C3GreaterThan);
-
- bool TrueWhenLessThan =
- ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
- ->isAllOnesValue();
- bool TrueWhenEqual =
- ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
- ->isAllOnesValue();
- bool TrueWhenGreaterThan =
- ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
- ->isAllOnesValue();
-
- // This generates the new instruction that will replace the original Cmp
- // Instruction. Instead of enumerating the various combinations when
- // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
- // false, we rely on chaining of ORs and future passes of InstCombine to
- // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
-
- // When none of the three constants satisfy the predicate for the RHS (C),
- // the entire original Cmp can be simplified to a false.
- Value *Cond = Builder.getFalse();
- if (TrueWhenLessThan)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT,
- OrigLHS, OrigRHS));
- if (TrueWhenEqual)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ,
- OrigLHS, OrigRHS));
- if (TrueWhenGreaterThan)
- Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT,
- OrigLHS, OrigRHS));
-
- return replaceInstUsesWith(Cmp, Cond);
- }
- return nullptr;
-}
-
-static Instruction *foldICmpBitCast(ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
- if (!Bitcast)
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op1 = Cmp.getOperand(1);
- Value *BCSrcOp = Bitcast->getOperand(0);
-
- // Make sure the bitcast doesn't change the number of vector elements.
- if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
- Bitcast->getDestTy()->getScalarSizeInBits()) {
- // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
- Value *X;
- if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
- // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
- // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
- // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
- // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
- if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
- Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
- match(Op1, m_Zero()))
- return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
-
- // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
- if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
- return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
-
- // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
- if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
- return new ICmpInst(Pred, X,
- ConstantInt::getAllOnesValue(X->getType()));
- }
-
- // Zero-equality checks are preserved through unsigned floating-point casts:
- // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
- // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
- if (match(BCSrcOp, m_UIToFP(m_Value(X))))
- if (Cmp.isEquality() && match(Op1, m_Zero()))
- return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
-
- // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
- // the FP extend/truncate because that cast does not change the sign-bit.
- // This is true for all standard IEEE-754 types and the X86 80-bit type.
- // The sign-bit is always the most significant bit in those types.
- const APInt *C;
- bool TrueIfSigned;
- if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
+
+ assert(C && "Cmp RHS should be a constant int!");
+ // If we're testing a constant value against the result of a three way
+ // comparison, the result can be expressed directly in terms of the
+ // original values being compared. Note: We could possibly be more
+ // aggressive here and remove the hasOneUse test. The original select is
+ // really likely to simplify or sink when we remove a test of the result.
+ Value *OrigLHS, *OrigRHS;
+ ConstantInt *C1LessThan, *C2Equal, *C3GreaterThan;
+ if (Cmp.hasOneUse() &&
+ matchThreeWayIntCompare(Select, OrigLHS, OrigRHS, C1LessThan, C2Equal,
+ C3GreaterThan)) {
+ assert(C1LessThan && C2Equal && C3GreaterThan);
+
+ bool TrueWhenLessThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C1LessThan, C)
+ ->isAllOnesValue();
+ bool TrueWhenEqual =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C2Equal, C)
+ ->isAllOnesValue();
+ bool TrueWhenGreaterThan =
+ ConstantExpr::getCompare(Cmp.getPredicate(), C3GreaterThan, C)
+ ->isAllOnesValue();
+
+ // This generates the new instruction that will replace the original Cmp
+ // Instruction. Instead of enumerating the various combinations when
+ // TrueWhenLessThan, TrueWhenEqual and TrueWhenGreaterThan are true versus
+ // false, we rely on chaining of ORs and future passes of InstCombine to
+ // simplify the OR further (i.e. a s< b || a == b becomes a s<= b).
+
+ // When none of the three constants satisfy the predicate for the RHS (C),
+ // the entire original Cmp can be simplified to a false.
+ Value *Cond = Builder.getFalse();
+ if (TrueWhenLessThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SLT,
+ OrigLHS, OrigRHS));
+ if (TrueWhenEqual)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_EQ,
+ OrigLHS, OrigRHS));
+ if (TrueWhenGreaterThan)
+ Cond = Builder.CreateOr(Cond, Builder.CreateICmp(ICmpInst::ICMP_SGT,
+ OrigLHS, OrigRHS));
+
+ return replaceInstUsesWith(Cmp, Cond);
+ }
+ return nullptr;
+}
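
The fold above evaluates the compare predicate against each of the three constants and then ORs together the corresponding slt/eq/sgt tests on the original operands. A small exhaustive sketch that this recombination is exact, using -1/0/+1 as the Less/Equal/Greater constants and signed less-than as the predicate; plain C++, not LLVM code, and other predicates and constants work the same way:

// Exhaustive i8 check of rebuilding icmp slt (three-way cmp), C as an OR chain.
#include <cassert>

int main() {
  for (int C = -2; C <= 2; ++C) {               // the compare constant
    bool TrueWhenLessThan = -1 < C;
    bool TrueWhenEqual = 0 < C;
    bool TrueWhenGreaterThan = 1 < C;
    for (int A = -128; A <= 127; ++A) {
      for (int B = -128; B <= 127; ++B) {
        int ThreeWay = A == B ? 0 : (A < B ? -1 : 1);
        bool Orig = ThreeWay < C;
        bool Folded = (TrueWhenLessThan && A < B) || (TrueWhenEqual && A == B) ||
                      (TrueWhenGreaterThan && A > B);
        assert(Orig == Folded);
      }
    }
  }
  return 0;
}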
+
+static Instruction *foldICmpBitCast(ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ auto *Bitcast = dyn_cast<BitCastInst>(Cmp.getOperand(0));
+ if (!Bitcast)
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op1 = Cmp.getOperand(1);
+ Value *BCSrcOp = Bitcast->getOperand(0);
+
+ // Make sure the bitcast doesn't change the number of vector elements.
+ if (Bitcast->getSrcTy()->getScalarSizeInBits() ==
+ Bitcast->getDestTy()->getScalarSizeInBits()) {
+ // Zero-equality and sign-bit checks are preserved through sitofp + bitcast.
+ Value *X;
+ if (match(BCSrcOp, m_SIToFP(m_Value(X)))) {
+ // icmp eq (bitcast (sitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (sitofp X)), 0 --> icmp ne X, 0
+ // icmp slt (bitcast (sitofp X)), 0 --> icmp slt X, 0
+ // icmp sgt (bitcast (sitofp X)), 0 --> icmp sgt X, 0
+ if ((Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT) &&
+ match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // icmp slt (bitcast (sitofp X)), 1 --> icmp slt X, 1
+ if (Pred == ICmpInst::ICMP_SLT && match(Op1, m_One()))
+ return new ICmpInst(Pred, X, ConstantInt::get(X->getType(), 1));
+
+ // icmp sgt (bitcast (sitofp X)), -1 --> icmp sgt X, -1
+ if (Pred == ICmpInst::ICMP_SGT && match(Op1, m_AllOnes()))
+ return new ICmpInst(Pred, X,
+ ConstantInt::getAllOnesValue(X->getType()));
+ }
+
+ // Zero-equality checks are preserved through unsigned floating-point casts:
+ // icmp eq (bitcast (uitofp X)), 0 --> icmp eq X, 0
+ // icmp ne (bitcast (uitofp X)), 0 --> icmp ne X, 0
+ if (match(BCSrcOp, m_UIToFP(m_Value(X))))
+ if (Cmp.isEquality() && match(Op1, m_Zero()))
+ return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
+ // the FP extend/truncate because that cast does not change the sign-bit.
+ // This is true for all standard IEEE-754 types and the X86 80-bit type.
+ // The sign-bit is always the most significant bit in those types.
+ const APInt *C;
+ bool TrueIfSigned;
+ if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
InstCombiner::isSignBitCheck(Pred, *C, TrueIfSigned)) {
- if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
- match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
- // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
- // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
- Type *XType = X->getType();
-
- // We can't currently handle Power style floating point operations here.
- if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
-
- Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
- if (auto *XVTy = dyn_cast<VectorType>(XType))
+ if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
+ match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
+ // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
+ // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
+ Type *XType = X->getType();
+
+ // We can't currently handle Power style floating point operations here.
+ if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
+
+ Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
+ if (auto *XVTy = dyn_cast<VectorType>(XType))
NewType = VectorType::get(NewType, XVTy->getElementCount());
- Value *NewBitcast = Builder.CreateBitCast(X, NewType);
- if (TrueIfSigned)
- return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
- ConstantInt::getNullValue(NewType));
- else
- return new ICmpInst(ICmpInst::ICMP_SGT, NewBitcast,
- ConstantInt::getAllOnesValue(NewType));
- }
- }
- }
- }
-
- // Test to see if the operands of the icmp are casted versions of other
- // values. If the ptr->ptr cast can be stripped off both arguments, do so.
- if (Bitcast->getType()->isPointerTy() &&
- (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
- // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
- // so eliminate it as well.
- if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
- Op1 = BC2->getOperand(0);
-
- Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
- return new ICmpInst(Pred, BCSrcOp, Op1);
- }
-
- // Folding: icmp <pred> iN X, C
- // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
- // and C is a splat of a K-bit pattern
- // and SC is a constant vector = <C', C', C', ..., C'>
- // Into:
- // %E = extractelement <M x iK> %vec, i32 C'
- // icmp <pred> iK %E, trunc(C)
- const APInt *C;
- if (!match(Cmp.getOperand(1), m_APInt(C)) ||
- !Bitcast->getType()->isIntegerTy() ||
- !Bitcast->getSrcTy()->isIntOrIntVectorTy())
- return nullptr;
-
- Value *Vec;
- ArrayRef<int> Mask;
- if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
- // Check whether every element of Mask is the same constant
- if (is_splat(Mask)) {
- auto *VecTy = cast<VectorType>(BCSrcOp->getType());
- auto *EltTy = cast<IntegerType>(VecTy->getElementType());
- if (C->isSplat(EltTy->getBitWidth())) {
- // Fold the icmp based on the value of C
- // If C is M copies of an iK sized bit pattern,
- // then:
- // => %E = extractelement <N x iK> %vec, i32 Elem
- // icmp <pred> iK %SplatVal, <pattern>
- Value *Elem = Builder.getInt32(Mask[0]);
- Value *Extract = Builder.CreateExtractElement(Vec, Elem);
- Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
- return new ICmpInst(Pred, Extract, NewC);
- }
- }
- }
- return nullptr;
-}
-
-/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
-/// where X is some kind of instruction.
+ Value *NewBitcast = Builder.CreateBitCast(X, NewType);
+ if (TrueIfSigned)
+ return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
+ ConstantInt::getNullValue(NewType));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SGT, NewBitcast,
+ ConstantInt::getAllOnesValue(NewType));
+ }
+ }
+ }
+ }
+
+ // Test to see if the operands of the icmp are casted versions of other
+ // values. If the ptr->ptr cast can be stripped off both arguments, do so.
+ if (Bitcast->getType()->isPointerTy() &&
+ (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) {
+ // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+ // so eliminate it as well.
+ if (auto *BC2 = dyn_cast<BitCastInst>(Op1))
+ Op1 = BC2->getOperand(0);
+
+ Op1 = Builder.CreateBitCast(Op1, BCSrcOp->getType());
+ return new ICmpInst(Pred, BCSrcOp, Op1);
+ }
+
+ // Folding: icmp <pred> iN X, C
+ // where X = bitcast <M x iK> (shufflevector <M x iK> %vec, undef, SC)) to iN
+ // and C is a splat of a K-bit pattern
+ // and SC is a constant vector = <C', C', C', ..., C'>
+ // Into:
+ // %E = extractelement <M x iK> %vec, i32 C'
+ // icmp <pred> iK %E, trunc(C)
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)) ||
+ !Bitcast->getType()->isIntegerTy() ||
+ !Bitcast->getSrcTy()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *Vec;
+ ArrayRef<int> Mask;
+ if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
+ // Check whether every element of Mask is the same constant
+ if (is_splat(Mask)) {
+ auto *VecTy = cast<VectorType>(BCSrcOp->getType());
+ auto *EltTy = cast<IntegerType>(VecTy->getElementType());
+ if (C->isSplat(EltTy->getBitWidth())) {
+ // Fold the icmp based on the value of C
+ // If C is M copies of an iK sized bit pattern,
+ // then:
+ // => %E = extractelement <N x iK> %vec, i32 Elem
+ // icmp <pred> iK %SplatVal, <pattern>
+ Value *Elem = Builder.getInt32(Mask[0]);
+ Value *Extract = Builder.CreateExtractElement(Vec, Elem);
+ Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
+ return new ICmpInst(Pred, Extract, NewC);
+ }
+ }
+ }
+ return nullptr;
+}
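
The first group of folds above relies on two bit-pattern facts: sitofp/uitofp map zero, and only zero, to an all-zero bit pattern, and sitofp carries the integer sign into the IEEE sign bit. A spot-check sketch over a range of inputs; plain C++, not LLVM code, assuming the usual 32-bit IEEE-754 float layout and using memcpy as the stand-in for the bitcast:

// Spot-check of the sitofp/uitofp bit-pattern facts used above (illustrative).
#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t bitsOf(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));         // the bitcast analogue
  return Bits;
}

int main() {
  for (int32_t X = -100000; X <= 100000; ++X) {
    uint32_t SBits = bitsOf(float(X));          // bitcast (sitofp X)
    assert((SBits == 0) == (X == 0));           // eq/ne 0 is preserved
    assert(((SBits >> 31) != 0) == (X < 0));    // sign-bit test is preserved

    uint32_t U = uint32_t(X + 100000);          // a non-negative input
    uint32_t UBits = bitsOf(float(U));          // bitcast (uitofp U)
    assert((UBits == 0) == (U == 0));           // eq/ne 0 is preserved
  }
  return 0;
}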
+
+/// Try to fold integer comparisons with a constant operand: icmp Pred X, C
+/// where X is some kind of instruction.
Instruction *InstCombinerImpl::foldICmpInstWithConstant(ICmpInst &Cmp) {
- const APInt *C;
- if (!match(Cmp.getOperand(1), m_APInt(C)))
- return nullptr;
-
- if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
- switch (BO->getOpcode()) {
- case Instruction::Xor:
- if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::And:
- if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Or:
- if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Mul:
- if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Shl:
- if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::LShr:
- case Instruction::AShr:
- if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::SRem:
- if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::UDiv:
- if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
- return I;
- LLVM_FALLTHROUGH;
- case Instruction::SDiv:
- if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Sub:
- if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
- return I;
- break;
- case Instruction::Add:
- if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
- return I;
- break;
- default:
- break;
- }
- // TODO: These folds could be refactored to be part of the above calls.
- if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
- return I;
- }
-
- // Match against CmpInst LHS being instructions other than binary operators.
-
- if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
- // For now, we only support constant integers while folding the
- // ICMP(SELECT)) pattern. We can extend this to support vector of integers
- // similar to the cases handled by binary ops above.
- if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
- if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
- return I;
- }
-
- if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
- if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
- return I;
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
- if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
- return I;
-
- return nullptr;
-}
-
-/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
-/// icmp eq/ne BO, C.
+ const APInt *C;
+ if (!match(Cmp.getOperand(1), m_APInt(C)))
+ return nullptr;
+
+ if (auto *BO = dyn_cast<BinaryOperator>(Cmp.getOperand(0))) {
+ switch (BO->getOpcode()) {
+ case Instruction::Xor:
+ if (Instruction *I = foldICmpXorConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::And:
+ if (Instruction *I = foldICmpAndConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Or:
+ if (Instruction *I = foldICmpOrConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Mul:
+ if (Instruction *I = foldICmpMulConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Shl:
+ if (Instruction *I = foldICmpShlConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::LShr:
+ case Instruction::AShr:
+ if (Instruction *I = foldICmpShrConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::SRem:
+ if (Instruction *I = foldICmpSRemConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::UDiv:
+ if (Instruction *I = foldICmpUDivConstant(Cmp, BO, *C))
+ return I;
+ LLVM_FALLTHROUGH;
+ case Instruction::SDiv:
+ if (Instruction *I = foldICmpDivConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Sub:
+ if (Instruction *I = foldICmpSubConstant(Cmp, BO, *C))
+ return I;
+ break;
+ case Instruction::Add:
+ if (Instruction *I = foldICmpAddConstant(Cmp, BO, *C))
+ return I;
+ break;
+ default:
+ break;
+ }
+ // TODO: These folds could be refactored to be part of the above calls.
+ if (Instruction *I = foldICmpBinOpEqualityWithConstant(Cmp, BO, *C))
+ return I;
+ }
+
+ // Match against CmpInst LHS being instructions other than binary operators.
+
+ if (auto *SI = dyn_cast<SelectInst>(Cmp.getOperand(0))) {
+ // For now, we only support constant integers while folding the
+ // ICMP(SELECT)) pattern. We can extend this to support vector of integers
+ // similar to the cases handled by binary ops above.
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(Cmp.getOperand(1)))
+ if (Instruction *I = foldICmpSelectConstant(Cmp, SI, ConstRHS))
+ return I;
+ }
+
+ if (auto *TI = dyn_cast<TruncInst>(Cmp.getOperand(0))) {
+ if (Instruction *I = foldICmpTruncConstant(Cmp, TI, *C))
+ return I;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(Cmp.getOperand(0)))
+ if (Instruction *I = foldICmpIntrinsicWithConstant(Cmp, II, *C))
+ return I;
+
+ return nullptr;
+}
+
+/// Fold an icmp equality instruction with binary operator LHS and constant RHS:
+/// icmp eq/ne BO, C.
Instruction *InstCombinerImpl::foldICmpBinOpEqualityWithConstant(
ICmpInst &Cmp, BinaryOperator *BO, const APInt &C) {
- // TODO: Some of these folds could work with arbitrary constants, but this
- // function is limited to scalar and vector splat constants.
- if (!Cmp.isEquality())
- return nullptr;
-
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
- Constant *RHS = cast<Constant>(Cmp.getOperand(1));
- Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
-
- switch (BO->getOpcode()) {
- case Instruction::SRem:
- // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
- if (C.isNullValue() && BO->hasOneUse()) {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
- Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
- return new ICmpInst(Pred, NewRem,
- Constant::getNullValue(BO->getType()));
- }
- }
- break;
- case Instruction::Add: {
- // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
- if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
- if (BO->hasOneUse())
- return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
- } else if (C.isNullValue()) {
- // Replace ((add A, B) != 0) with (A != -B) if A or B is
- // efficiently invertible, or if the add has just this one use.
- if (Value *NegVal = dyn_castNegVal(BOp1))
- return new ICmpInst(Pred, BOp0, NegVal);
- if (Value *NegVal = dyn_castNegVal(BOp0))
- return new ICmpInst(Pred, NegVal, BOp1);
- if (BO->hasOneUse()) {
- Value *Neg = Builder.CreateNeg(BOp1);
- Neg->takeName(BO);
- return new ICmpInst(Pred, BOp0, Neg);
- }
- }
- break;
- }
- case Instruction::Xor:
- if (BO->hasOneUse()) {
- if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
- // For the xor case, we can xor two constants together, eliminating
- // the explicit xor.
- return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
- } else if (C.isNullValue()) {
- // Replace ((xor A, B) != 0) with (A != B)
- return new ICmpInst(Pred, BOp0, BOp1);
- }
- }
- break;
- case Instruction::Sub:
- if (BO->hasOneUse()) {
- // Only check for constant LHS here, as constant RHS will be canonicalized
- // to add and use the fold above.
- if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
- // Replace ((sub BOC, B) != C) with (B != BOC-C).
- return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
- } else if (C.isNullValue()) {
- // Replace ((sub A, B) != 0) with (A != B).
- return new ICmpInst(Pred, BOp0, BOp1);
- }
- }
- break;
- case Instruction::Or: {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
- // Comparing if all bits outside of a constant mask are set?
- // Replace (X | C) == -1 with (X & ~C) == ~C.
- // This removes the -1 constant.
- Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
- Value *And = Builder.CreateAnd(BOp0, NotBOC);
- return new ICmpInst(Pred, And, NotBOC);
- }
- break;
- }
- case Instruction::And: {
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC))) {
- // If we have ((X & C) == C), turn it into ((X & C) != 0).
- if (C == *BOC && C.isPowerOf2())
- return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- BO, Constant::getNullValue(RHS->getType()));
- }
- break;
- }
- case Instruction::UDiv:
- if (C.isNullValue()) {
- // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule i32 B, A)
- auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
- return new ICmpInst(NewPred, BOp1, BOp0);
- }
- break;
- default:
- break;
- }
- return nullptr;
-}
-
-/// Fold an equality icmp with LLVM intrinsic and constant operand.
+ // TODO: Some of these folds could work with arbitrary constants, but this
+ // function is limited to scalar and vector splat constants.
+ if (!Cmp.isEquality())
+ return nullptr;
+
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ bool isICMP_NE = Pred == ICmpInst::ICMP_NE;
+ Constant *RHS = cast<Constant>(Cmp.getOperand(1));
+ Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+
+ switch (BO->getOpcode()) {
+ case Instruction::SRem:
+ // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+ if (C.isNullValue() && BO->hasOneUse()) {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BOC->sgt(1) && BOC->isPowerOf2()) {
+ Value *NewRem = Builder.CreateURem(BOp0, BOp1, BO->getName());
+ return new ICmpInst(Pred, NewRem,
+ Constant::getNullValue(BO->getType()));
+ }
+ }
+ break;
+ case Instruction::Add: {
+ // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ if (BO->hasOneUse())
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
+ } else if (C.isNullValue()) {
+ // Replace ((add A, B) != 0) with (A != -B) if A or B is
+ // efficiently invertible, or if the add has just this one use.
+ if (Value *NegVal = dyn_castNegVal(BOp1))
+ return new ICmpInst(Pred, BOp0, NegVal);
+ if (Value *NegVal = dyn_castNegVal(BOp0))
+ return new ICmpInst(Pred, NegVal, BOp1);
+ if (BO->hasOneUse()) {
+ Value *Neg = Builder.CreateNeg(BOp1);
+ Neg->takeName(BO);
+ return new ICmpInst(Pred, BOp0, Neg);
+ }
+ }
+ break;
+ }
+ case Instruction::Xor:
+ if (BO->hasOneUse()) {
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ // For the xor case, we can xor two constants together, eliminating
+ // the explicit xor.
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getXor(RHS, BOC));
+ } else if (C.isNullValue()) {
+ // Replace ((xor A, B) != 0) with (A != B)
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Sub:
+ if (BO->hasOneUse()) {
+ // Only check for constant LHS here, as constant RHS will be canonicalized
+ // to add and use the fold above.
+ if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
+ // Replace ((sub BOC, B) != C) with (B != BOC-C).
+ return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
+ } else if (C.isNullValue()) {
+ // Replace ((sub A, B) != 0) with (A != B).
+ return new ICmpInst(Pred, BOp0, BOp1);
+ }
+ }
+ break;
+ case Instruction::Or: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC)) && BO->hasOneUse() && RHS->isAllOnesValue()) {
+ // Comparing if all bits outside of a constant mask are set?
+ // Replace (X | C) == -1 with (X & ~C) == ~C.
+ // This removes the -1 constant.
+ Constant *NotBOC = ConstantExpr::getNot(cast<Constant>(BOp1));
+ Value *And = Builder.CreateAnd(BOp0, NotBOC);
+ return new ICmpInst(Pred, And, NotBOC);
+ }
+ break;
+ }
+ case Instruction::And: {
+ const APInt *BOC;
+ if (match(BOp1, m_APInt(BOC))) {
+ // If we have ((X & C) == C), turn it into ((X & C) != 0).
+ if (C == *BOC && C.isPowerOf2())
+ return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BO, Constant::getNullValue(RHS->getType()));
+ }
+ break;
+ }
+ case Instruction::UDiv:
+ if (C.isNullValue()) {
+      // (icmp eq/ne (udiv A, B), 0) -> (icmp ugt/ule B, A)
+ auto NewPred = isICMP_NE ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, BOp1, BOp0);
+ }
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
+
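The equality folds above are plain bit-level identities on the underlying integers. As an illustration only (a standalone sketch, not part of this patch; the constants are chosen arbitrarily), the `and` and `xor` cases can be checked exhaustively for 8-bit values:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t PowerOfTwoC = 0x10;       // arbitrary power-of-two constant C
  const uint8_t XorC = 0x5a, RhsC = 0x3c; // arbitrary constants for the xor fold
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // ((X & C) == C)  <=>  ((X & C) != 0), because C has a single bit set.
    assert(((x & PowerOfTwoC) == PowerOfTwoC) == ((x & PowerOfTwoC) != 0));
    // ((X ^ C1) == C2)  <=>  (X == (C1 ^ C2)), so the explicit xor disappears.
    assert(((x ^ XorC) == RhsC) == (x == (XorC ^ RhsC)));
  }
  return 0;
}
```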
+/// Fold an equality icmp with LLVM intrinsic and constant operand.
Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
ICmpInst &Cmp, IntrinsicInst *II, const APInt &C) {
- Type *Ty = II->getType();
- unsigned BitWidth = C.getBitWidth();
- switch (II->getIntrinsicID()) {
+ Type *Ty = II->getType();
+ unsigned BitWidth = C.getBitWidth();
+ switch (II->getIntrinsicID()) {
case Intrinsic::abs:
// abs(A) == 0 -> A == 0
// abs(A) == INT_MIN -> A == INT_MIN
@@ -3068,83 +3068,83 @@ Instruction *InstCombinerImpl::foldICmpEqIntrinsicWithConstant(
ConstantInt::get(Ty, C));
break;
- case Intrinsic::bswap:
- // bswap(A) == C -> A == bswap(C)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- ConstantInt::get(Ty, C.byteSwap()));
-
- case Intrinsic::ctlz:
- case Intrinsic::cttz: {
- // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
- if (C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- ConstantInt::getNullValue(Ty));
-
- // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
-    // and Mask1 has bits 0..C set. Similar for ctlz, but for high bits.
- // Limit to one use to ensure we don't increase instruction count.
- unsigned Num = C.getLimitedValue(BitWidth);
- if (Num != BitWidth && II->hasOneUse()) {
- bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
- APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
- : APInt::getHighBitsSet(BitWidth, Num + 1);
- APInt Mask2 = IsTrailing
- ? APInt::getOneBitSet(BitWidth, Num)
- : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- return new ICmpInst(Cmp.getPredicate(),
- Builder.CreateAnd(II->getArgOperand(0), Mask1),
- ConstantInt::get(Ty, Mask2));
- }
- break;
- }
-
- case Intrinsic::ctpop: {
- // popcount(A) == 0 -> A == 0 and likewise for !=
- // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
- bool IsZero = C.isNullValue();
- if (IsZero || C == BitWidth)
- return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
- IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
-
- break;
- }
-
- case Intrinsic::uadd_sat: {
- // uadd.sat(a, b) == 0 -> (a | b) == 0
- if (C.isNullValue()) {
- Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
- return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
- }
- break;
- }
-
- case Intrinsic::usub_sat: {
- // usub.sat(a, b) == 0 -> a <= b
- if (C.isNullValue()) {
- ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
- ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
- return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
- }
- break;
- }
- default:
- break;
- }
-
- return nullptr;
-}
-
-/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
+ case Intrinsic::bswap:
+ // bswap(A) == C -> A == bswap(C)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::get(Ty, C.byteSwap()));
+
+ case Intrinsic::ctlz:
+ case Intrinsic::cttz: {
+ // ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
+ if (C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::getNullValue(Ty));
+
+ // ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
+    // and Mask1 has bits 0..C set. Similar for ctlz, but for high bits.
+ // Limit to one use to ensure we don't increase instruction count.
+ unsigned Num = C.getLimitedValue(BitWidth);
+ if (Num != BitWidth && II->hasOneUse()) {
+ bool IsTrailing = II->getIntrinsicID() == Intrinsic::cttz;
+ APInt Mask1 = IsTrailing ? APInt::getLowBitsSet(BitWidth, Num + 1)
+ : APInt::getHighBitsSet(BitWidth, Num + 1);
+ APInt Mask2 = IsTrailing
+ ? APInt::getOneBitSet(BitWidth, Num)
+ : APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
+ return new ICmpInst(Cmp.getPredicate(),
+ Builder.CreateAnd(II->getArgOperand(0), Mask1),
+ ConstantInt::get(Ty, Mask2));
+ }
+ break;
+ }
+
+ case Intrinsic::ctpop: {
+ // popcount(A) == 0 -> A == 0 and likewise for !=
+ // popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
+ bool IsZero = C.isNullValue();
+ if (IsZero || C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
+
+ break;
+ }
+
+ case Intrinsic::uadd_sat: {
+ // uadd.sat(a, b) == 0 -> (a | b) == 0
+ if (C.isNullValue()) {
+ Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
+ return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
+ }
+ break;
+ }
+
+ case Intrinsic::usub_sat: {
+ // usub.sat(a, b) == 0 -> a <= b
+ if (C.isNullValue()) {
+ ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
+ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
+ return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
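The saturating-arithmetic folds follow from simple range reasoning: a saturating unsigned add is zero only when both inputs are zero, and a saturating unsigned subtract is zero exactly when the first operand is not greater than the second. A small standalone check (the helpers below are hand-rolled stand-ins for llvm.uadd.sat/llvm.usub.sat, not the intrinsics themselves) covers all 8-bit input pairs:

```cpp
#include <cassert>
#include <cstdint>

// Hand-rolled i8 stand-ins for llvm.uadd.sat / llvm.usub.sat.
static uint8_t uadd_sat8(uint8_t a, uint8_t b) {
  unsigned s = unsigned(a) + unsigned(b);
  return s > 0xff ? 0xff : static_cast<uint8_t>(s);
}
static uint8_t usub_sat8(uint8_t a, uint8_t b) {
  return a > b ? static_cast<uint8_t>(a - b) : 0;
}

int main() {
  for (unsigned A = 0; A <= 0xff; ++A)
    for (unsigned B = 0; B <= 0xff; ++B) {
      uint8_t a = static_cast<uint8_t>(A), b = static_cast<uint8_t>(B);
      // uadd.sat(a, b) == 0  <=>  (a | b) == 0
      assert((uadd_sat8(a, b) == 0) == ((a | b) == 0));
      // usub.sat(a, b) == 0  <=>  a u<= b
      assert((usub_sat8(a, b) == 0) == (a <= b));
    }
  return 0;
}
```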
+/// Fold an icmp with LLVM intrinsic and constant operand: icmp Pred II, C.
Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
IntrinsicInst *II,
const APInt &C) {
- if (Cmp.isEquality())
- return foldICmpEqIntrinsicWithConstant(Cmp, II, C);
-
- Type *Ty = II->getType();
- unsigned BitWidth = C.getBitWidth();
+ if (Cmp.isEquality())
+ return foldICmpEqIntrinsicWithConstant(Cmp, II, C);
+
+ Type *Ty = II->getType();
+ unsigned BitWidth = C.getBitWidth();
ICmpInst::Predicate Pred = Cmp.getPredicate();
- switch (II->getIntrinsicID()) {
+ switch (II->getIntrinsicID()) {
case Intrinsic::ctpop: {
// (ctpop X > BitWidth - 1) --> X == -1
Value *X = II->getArgOperand(0);
@@ -3157,562 +3157,562 @@ Instruction *InstCombinerImpl::foldICmpIntrinsicWithConstant(ICmpInst &Cmp,
ConstantInt::getAllOnesValue(Ty));
break;
}
- case Intrinsic::ctlz: {
- // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
+ case Intrinsic::ctlz: {
+ // ctlz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX < 0b00010000
if (Pred == ICmpInst::ICMP_UGT && C.ult(BitWidth)) {
- unsigned Num = C.getLimitedValue();
- APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT,
- II->getArgOperand(0), ConstantInt::get(Ty, Limit));
- }
-
- // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
+ unsigned Num = C.getLimitedValue();
+ APInt Limit = APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_ULT,
+ II->getArgOperand(0), ConstantInt::get(Ty, Limit));
+ }
+
+ // ctlz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX > 0b00011111
if (Pred == ICmpInst::ICMP_ULT && C.uge(1) && C.ule(BitWidth)) {
- unsigned Num = C.getLimitedValue();
- APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT,
- II->getArgOperand(0), ConstantInt::get(Ty, Limit));
- }
- break;
- }
- case Intrinsic::cttz: {
- // Limit to one use to ensure we don't increase instruction count.
- if (!II->hasOneUse())
- return nullptr;
-
- // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
+ unsigned Num = C.getLimitedValue();
+ APInt Limit = APInt::getLowBitsSet(BitWidth, BitWidth - Num);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_UGT,
+ II->getArgOperand(0), ConstantInt::get(Ty, Limit));
+ }
+ break;
+ }
+ case Intrinsic::cttz: {
+ // Limit to one use to ensure we don't increase instruction count.
+ if (!II->hasOneUse())
+ return nullptr;
+
+ // cttz(0bXXXXXXXX) > 3 -> 0bXXXXXXXX & 0b00001111 == 0
if (Pred == ICmpInst::ICMP_UGT && C.ult(BitWidth)) {
- APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ,
- Builder.CreateAnd(II->getArgOperand(0), Mask),
- ConstantInt::getNullValue(Ty));
- }
-
- // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
+ APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue() + 1);
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_EQ,
+ Builder.CreateAnd(II->getArgOperand(0), Mask),
+ ConstantInt::getNullValue(Ty));
+ }
+
+ // cttz(0bXXXXXXXX) < 3 -> 0bXXXXXXXX & 0b00000111 != 0
if (Pred == ICmpInst::ICMP_ULT && C.uge(1) && C.ule(BitWidth)) {
- APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
- return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE,
- Builder.CreateAnd(II->getArgOperand(0), Mask),
- ConstantInt::getNullValue(Ty));
- }
- break;
- }
- default:
- break;
- }
-
- return nullptr;
-}
-
-/// Handle icmp with constant (but not simple integer constant) RHS.
+ APInt Mask = APInt::getLowBitsSet(BitWidth, C.getLimitedValue());
+ return CmpInst::Create(Instruction::ICmp, ICmpInst::ICMP_NE,
+ Builder.CreateAnd(II->getArgOperand(0), Mask),
+ ConstantInt::getNullValue(Ty));
+ }
+ break;
+ }
+ default:
+ break;
+ }
+
+ return nullptr;
+}
+
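The ctlz/cttz cases turn a comparison on the count into a direct range or mask test on the argument. The sketch below verifies the two `ugt` rewrites from the comments for i8 with the arbitrary constant 3; the counters are hand-written with count(0) == 8, matching the zero-is-defined form of the intrinsics:

```cpp
#include <cassert>
#include <cstdint>

// 8-bit leading/trailing zero counts with clz8(0) == ctz8(0) == 8.
static unsigned clz8(uint8_t x) {
  unsigned n = 0;
  for (int bit = 7; bit >= 0 && !((x >> bit) & 1); --bit)
    ++n;
  return n;
}
static unsigned ctz8(uint8_t x) {
  unsigned n = 0;
  for (int bit = 0; bit <= 7 && !((x >> bit) & 1); ++bit)
    ++n;
  return n;
}

int main() {
  const unsigned C = 3; // arbitrary constant from the comparison
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // ctlz(x) u> 3  <=>  x u< 0b00010000
    assert((clz8(x) > C) == (x < 0x10));
    // cttz(x) u> 3  <=>  (x & 0b00001111) == 0
    assert((ctz8(x) > C) == ((x & 0x0f) == 0));
  }
  return 0;
}
```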
+/// Handle icmp with constant (but not simple integer constant) RHS.
Instruction *InstCombinerImpl::foldICmpInstWithConstantNotInt(ICmpInst &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Constant *RHSC = dyn_cast<Constant>(Op1);
- Instruction *LHSI = dyn_cast<Instruction>(Op0);
- if (!RHSC || !LHSI)
- return nullptr;
-
- switch (LHSI->getOpcode()) {
- case Instruction::GetElementPtr:
- // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
- if (RHSC->isNullValue() &&
- cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
- return new ICmpInst(
- I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
- case Instruction::PHI:
- // Only fold icmp into the PHI if the phi and icmp are in the same
- // block. If in the same block, we're encouraging jump threading. If
- // not, we are just pessimizing the code by making an i1 phi.
- if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
- return NV;
- break;
- case Instruction::Select: {
- // If either operand of the select is a constant, we can fold the
- // comparison into the select arms, which will cause one to be
- // constant folded and the select turned into a bitwise or.
- Value *Op1 = nullptr, *Op2 = nullptr;
- ConstantInt *CI = nullptr;
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
- Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op1);
- }
- if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
- Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
- CI = dyn_cast<ConstantInt>(Op2);
- }
-
- // We only want to perform this transformation if it will not lead to
- // additional code. This is true if either both sides of the select
- // fold to a constant (in which case the icmp is replaced with a select
- // which will usually simplify) or this is the only user of the
- // select (in which case we are trading a select+icmp for a simpler
- // select+icmp) or all uses of the select can be replaced based on
- // dominance information ("Global cases").
- bool Transform = false;
- if (Op1 && Op2)
- Transform = true;
- else if (Op1 || Op2) {
- // Local case
- if (LHSI->hasOneUse())
- Transform = true;
- // Global cases
- else if (CI && !CI->isZero())
- // When Op1 is constant try replacing select with second operand.
- // Otherwise Op2 is constant and try replacing select with first
- // operand.
- Transform =
- replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
- }
- if (Transform) {
- if (!Op1)
- Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
- I.getName());
- if (!Op2)
- Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
- I.getName());
- return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
- }
- break;
- }
- case Instruction::IntToPtr:
- // icmp pred inttoptr(X), null -> icmp pred X, 0
- if (RHSC->isNullValue() &&
- DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
- return new ICmpInst(
- I.getPredicate(), LHSI->getOperand(0),
- Constant::getNullValue(LHSI->getOperand(0)->getType()));
- break;
-
- case Instruction::Load:
- // Try to optimize things like "A[i] > 4" to index computations.
- if (GetElementPtrInst *GEP =
- dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
- return Res;
- }
- break;
- }
-
- return nullptr;
-}
-
-/// Some comparisons can be simplified.
-/// In this case, we are looking for comparisons that look like
-/// a check for a lossy truncation.
-/// Folds:
-/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask
-/// Where Mask is some pattern that produces all-ones in low bits:
-/// (-1 >> y)
-/// ((-1 << y) >> y) <- non-canonical, has extra uses
-/// ~(-1 << y)
-/// ((1 << y) + (-1)) <- non-canonical, has extra uses
-/// The Mask can be a constant, too.
-/// For some predicates, the operands are commutative.
-/// For others, x can only be on a specific side.
-static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate SrcPred;
- Value *X, *M, *Y;
- auto m_VariableMask = m_CombineOr(
- m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
- m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
- m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
- m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
- auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
- if (!match(&I, m_c_ICmp(SrcPred,
- m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
- m_Deferred(X))))
- return nullptr;
-
- ICmpInst::Predicate DstPred;
- switch (SrcPred) {
- case ICmpInst::Predicate::ICMP_EQ:
- // x & (-1 >> y) == x -> x u<= (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_ULE;
- break;
- case ICmpInst::Predicate::ICMP_NE:
- // x & (-1 >> y) != x -> x u> (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_UGT;
- break;
- case ICmpInst::Predicate::ICMP_ULT:
- // x & (-1 >> y) u< x -> x u> (-1 >> y)
- // x u> x & (-1 >> y) -> x u> (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_UGT;
- break;
- case ICmpInst::Predicate::ICMP_UGE:
- // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
- // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
- DstPred = ICmpInst::Predicate::ICMP_ULE;
- break;
- case ICmpInst::Predicate::ICMP_SLT:
- // x & (-1 >> y) s< x -> x s> (-1 >> y)
- // x s> x & (-1 >> y) -> x s> (-1 >> y)
-    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SGT;
- break;
- case ICmpInst::Predicate::ICMP_SGE:
- // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
- // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
-    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SLE;
- break;
- case ICmpInst::Predicate::ICMP_SGT:
- case ICmpInst::Predicate::ICMP_SLE:
- return nullptr;
- case ICmpInst::Predicate::ICMP_UGT:
- case ICmpInst::Predicate::ICMP_ULE:
- llvm_unreachable("Instsimplify took care of commut. variant");
- break;
- default:
- llvm_unreachable("All possible folds are handled.");
- }
-
- // The mask value may be a vector constant that has undefined elements. But it
- // may not be safe to propagate those undefs into the new compare, so replace
- // those elements by copying an existing, defined, and safe scalar constant.
- Type *OpTy = M->getType();
- auto *VecC = dyn_cast<Constant>(M);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Constant *RHSC = dyn_cast<Constant>(Op1);
+ Instruction *LHSI = dyn_cast<Instruction>(Op0);
+ if (!RHSC || !LHSI)
+ return nullptr;
+
+ switch (LHSI->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+ if (RHSC->isNullValue() &&
+ cast<GetElementPtrInst>(LHSI)->hasAllZeroIndices())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+ case Instruction::PHI:
+ // Only fold icmp into the PHI if the phi and icmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::Select: {
+ // If either operand of the select is a constant, we can fold the
+ // comparison into the select arms, which will cause one to be
+ // constant folded and the select turned into a bitwise or.
+ Value *Op1 = nullptr, *Op2 = nullptr;
+ ConstantInt *CI = nullptr;
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+ Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op1);
+ }
+ if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+ Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+ CI = dyn_cast<ConstantInt>(Op2);
+ }
+
+ // We only want to perform this transformation if it will not lead to
+ // additional code. This is true if either both sides of the select
+ // fold to a constant (in which case the icmp is replaced with a select
+ // which will usually simplify) or this is the only user of the
+ // select (in which case we are trading a select+icmp for a simpler
+ // select+icmp) or all uses of the select can be replaced based on
+ // dominance information ("Global cases").
+ bool Transform = false;
+ if (Op1 && Op2)
+ Transform = true;
+ else if (Op1 || Op2) {
+ // Local case
+ if (LHSI->hasOneUse())
+ Transform = true;
+ // Global cases
+ else if (CI && !CI->isZero())
+ // When Op1 is constant try replacing select with second operand.
+ // Otherwise Op2 is constant and try replacing select with first
+ // operand.
+ Transform =
+ replacedSelectWithOperand(cast<SelectInst>(LHSI), &I, Op1 ? 2 : 1);
+ }
+ if (Transform) {
+ if (!Op1)
+ Op1 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(1), RHSC,
+ I.getName());
+ if (!Op2)
+ Op2 = Builder.CreateICmp(I.getPredicate(), LHSI->getOperand(2), RHSC,
+ I.getName());
+ return SelectInst::Create(LHSI->getOperand(0), Op1, Op2);
+ }
+ break;
+ }
+ case Instruction::IntToPtr:
+ // icmp pred inttoptr(X), null -> icmp pred X, 0
+ if (RHSC->isNullValue() &&
+ DL.getIntPtrType(RHSC->getType()) == LHSI->getOperand(0)->getType())
+ return new ICmpInst(
+ I.getPredicate(), LHSI->getOperand(0),
+ Constant::getNullValue(LHSI->getOperand(0)->getType()));
+ break;
+
+ case Instruction::Load:
+ // Try to optimize things like "A[i] > 4" to index computations.
+ if (GetElementPtrInst *GEP =
+ dyn_cast<GetElementPtrInst>(LHSI->getOperand(0))) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ }
+ break;
+ }
+
+ return nullptr;
+}
+
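The Select case rests on the compare distributing over the select arms: `(cond ? C1 : C2) pred RHSC` is the same as `cond ? (C1 pred RHSC) : (C2 pred RHSC)`. A minimal illustration with arbitrarily chosen constants:

```cpp
#include <cassert>

int main() {
  const int C1 = 7, C2 = -3, RHSC = 2; // arbitrary constants
  for (int cond = 0; cond <= 1; ++cond) {
    bool original = (cond ? C1 : C2) < RHSC;
    bool distributed = cond ? (C1 < RHSC) : (C2 < RHSC);
    assert(original == distributed);
  }
  return 0;
}
```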
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy truncation.
+/// Folds:
+/// icmp SrcPred (x & Mask), x to icmp DstPred x, Mask
+/// Where Mask is some pattern that produces all-ones in low bits:
+/// (-1 >> y)
+/// ((-1 << y) >> y) <- non-canonical, has extra uses
+/// ~(-1 << y)
+/// ((1 << y) + (-1)) <- non-canonical, has extra uses
+/// The Mask can be a constant, too.
+/// For some predicates, the operands are commutative.
+/// For others, x can only be on a specific side.
+static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X, *M, *Y;
+ auto m_VariableMask = m_CombineOr(
+ m_CombineOr(m_Not(m_Shl(m_AllOnes(), m_Value())),
+ m_Add(m_Shl(m_One(), m_Value()), m_AllOnes())),
+ m_CombineOr(m_LShr(m_AllOnes(), m_Value()),
+ m_LShr(m_Shl(m_AllOnes(), m_Value(Y)), m_Deferred(Y))));
+ auto m_Mask = m_CombineOr(m_VariableMask, m_LowBitMask());
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_c_And(m_CombineAnd(m_Mask, m_Value(M)), m_Value(X)),
+ m_Deferred(X))))
+ return nullptr;
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // x & (-1 >> y) == x -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // x & (-1 >> y) != x -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_ULT:
+ // x & (-1 >> y) u< x -> x u> (-1 >> y)
+ // x u> x & (-1 >> y) -> x u> (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_UGT;
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // x & (-1 >> y) u>= x -> x u<= (-1 >> y)
+ // x u<= x & (-1 >> y) -> x u<= (-1 >> y)
+ DstPred = ICmpInst::Predicate::ICMP_ULE;
+ break;
+ case ICmpInst::Predicate::ICMP_SLT:
+ // x & (-1 >> y) s< x -> x s> (-1 >> y)
+ // x s> x & (-1 >> y) -> x s> (-1 >> y)
+    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
+ return nullptr;
+ if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+ return nullptr;
+ DstPred = ICmpInst::Predicate::ICMP_SGT;
+ break;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // x & (-1 >> y) s>= x -> x s<= (-1 >> y)
+ // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
+    if (!match(M, m_Constant())) // Cannot do this fold with a non-constant.
+ return nullptr;
+ if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
+ return nullptr;
+ DstPred = ICmpInst::Predicate::ICMP_SLE;
+ break;
+ case ICmpInst::Predicate::ICMP_SGT:
+ case ICmpInst::Predicate::ICMP_SLE:
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_UGT:
+ case ICmpInst::Predicate::ICMP_ULE:
+ llvm_unreachable("Instsimplify took care of commut. variant");
+ break;
+ default:
+ llvm_unreachable("All possible folds are handled.");
+ }
+
+ // The mask value may be a vector constant that has undefined elements. But it
+ // may not be safe to propagate those undefs into the new compare, so replace
+ // those elements by copying an existing, defined, and safe scalar constant.
+ Type *OpTy = M->getType();
+ auto *VecC = dyn_cast<Constant>(M);
auto *OpVTy = dyn_cast<FixedVectorType>(OpTy);
if (OpVTy && VecC && VecC->containsUndefOrPoisonElement()) {
- Constant *SafeReplacementConstant = nullptr;
- for (unsigned i = 0, e = OpVTy->getNumElements(); i != e; ++i) {
- if (!isa<UndefValue>(VecC->getAggregateElement(i))) {
- SafeReplacementConstant = VecC->getAggregateElement(i);
- break;
- }
- }
- assert(SafeReplacementConstant && "Failed to find undef replacement");
- M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
- }
-
- return Builder.CreateICmp(DstPred, X, M);
-}
-
-/// Some comparisons can be simplified.
-/// In this case, we are looking for comparisons that look like
-/// a check for a lossy signed truncation.
-/// Folds: (MaskedBits is a constant.)
-/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
-/// Into:
-/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
-/// Where KeptBits = bitwidth(%x) - MaskedBits
-static Value *
-foldICmpWithTruncSignExtendedVal(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate SrcPred;
- Value *X;
- const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
- // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
- if (!match(&I, m_c_ICmp(SrcPred,
- m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
- m_APInt(C1))),
- m_Deferred(X))))
- return nullptr;
-
- // Potential handling of non-splats: for each element:
- // * if both are undef, replace with constant 0.
- // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
-  // * if both are not undef, and are different, bail out.
- // * else, only one is undef, then pick the non-undef one.
-
- // The shift amount must be equal.
- if (*C0 != *C1)
- return nullptr;
- const APInt &MaskedBits = *C0;
- assert(MaskedBits != 0 && "shift by zero should be folded away already.");
-
- ICmpInst::Predicate DstPred;
- switch (SrcPred) {
- case ICmpInst::Predicate::ICMP_EQ:
- // ((%x << MaskedBits) a>> MaskedBits) == %x
- // =>
- // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
- DstPred = ICmpInst::Predicate::ICMP_ULT;
- break;
- case ICmpInst::Predicate::ICMP_NE:
- // ((%x << MaskedBits) a>> MaskedBits) != %x
- // =>
- // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
- DstPred = ICmpInst::Predicate::ICMP_UGE;
- break;
- // FIXME: are more folds possible?
- default:
- return nullptr;
- }
-
- auto *XType = X->getType();
- const unsigned XBitWidth = XType->getScalarSizeInBits();
- const APInt BitWidth = APInt(XBitWidth, XBitWidth);
- assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
-
- // KeptBits = bitwidth(%x) - MaskedBits
- const APInt KeptBits = BitWidth - MaskedBits;
- assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
- // ICmpCst = (1 << KeptBits)
- const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
- assert(ICmpCst.isPowerOf2());
- // AddCst = (1 << (KeptBits-1))
- const APInt AddCst = ICmpCst.lshr(1);
- assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
-
- // T0 = add %x, AddCst
- Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
- // T1 = T0 DstPred ICmpCst
- Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
-
- return T1;
-}
-
-// Given pattern:
-// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
-// we should move shifts to the same hand of 'and', i.e. rewrite as
-// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
-// We are only interested in opposite logical shifts here.
-// One of the shifts can be truncated.
-// If we can, we want to end up creating 'lshr' shift.
-static Value *
-foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
- InstCombiner::BuilderTy &Builder) {
- if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) ||
- !I.getOperand(0)->hasOneUse())
- return nullptr;
-
- auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
-
- // Look for an 'and' of two logical shifts, one of which may be truncated.
- // We use m_TruncOrSelf() on the RHS to correctly handle commutative case.
- Instruction *XShift, *MaybeTruncation, *YShift;
- if (!match(
- I.getOperand(0),
- m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
- m_CombineAnd(m_TruncOrSelf(m_CombineAnd(
- m_AnyLogicalShift, m_Instruction(YShift))),
- m_Instruction(MaybeTruncation)))))
- return nullptr;
-
- // We potentially looked past 'trunc', but only when matching YShift,
- // therefore YShift must have the widest type.
- Instruction *WidestShift = YShift;
- // Therefore XShift must have the shallowest type.
- // Or they both have identical types if there was no truncation.
- Instruction *NarrowestShift = XShift;
-
- Type *WidestTy = WidestShift->getType();
- Type *NarrowestTy = NarrowestShift->getType();
- assert(NarrowestTy == I.getOperand(0)->getType() &&
- "We did not look past any shifts while matching XShift though.");
- bool HadTrunc = WidestTy != I.getOperand(0)->getType();
-
- // If YShift is a 'lshr', swap the shifts around.
- if (match(YShift, m_LShr(m_Value(), m_Value())))
- std::swap(XShift, YShift);
-
- // The shifts must be in opposite directions.
- auto XShiftOpcode = XShift->getOpcode();
- if (XShiftOpcode == YShift->getOpcode())
- return nullptr; // Do not care about same-direction shifts here.
-
- Value *X, *XShAmt, *Y, *YShAmt;
- match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt))));
- match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt))));
-
- // If one of the values being shifted is a constant, then we will end with
- // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not,
- // however, we will need to ensure that we won't increase instruction count.
- if (!isa<Constant>(X) && !isa<Constant>(Y)) {
- // At least one of the hands of the 'and' should be one-use shift.
- if (!match(I.getOperand(0),
- m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
- return nullptr;
- if (HadTrunc) {
- // Due to the 'trunc', we will need to widen X. For that either the old
- // 'trunc' or the shift amt in the non-truncated shift should be one-use.
- if (!MaybeTruncation->hasOneUse() &&
- !NarrowestShift->getOperand(1)->hasOneUse())
- return nullptr;
- }
- }
-
- // We have two shift amounts from two different shifts. The types of those
- // shift amounts may not match. If that's the case let's bailout now.
-  // shift amounts may not match. If that's the case, let's bail out now.
- return nullptr;
-
- // As input, we have the following pattern:
- // icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
- // We want to rewrite that as:
- // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
- // While we know that originally (Q+K) would not overflow
- // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
-  // shift amounts, so it may now overflow in the smaller bitwidth.
- // To ensure that does not happen, we need to ensure that the total maximal
- // shift amount is still representable in that smaller bit width.
- unsigned MaximalPossibleTotalShiftAmount =
- (WidestTy->getScalarSizeInBits() - 1) +
- (NarrowestTy->getScalarSizeInBits() - 1);
- APInt MaximalRepresentableShiftAmount =
- APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
- if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
- return nullptr;
-
- // Can we fold (XShAmt+YShAmt) ?
- auto *NewShAmt = dyn_cast_or_null<Constant>(
- SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
- /*isNUW=*/false, SQ.getWithInstruction(&I)));
- if (!NewShAmt)
- return nullptr;
- NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy);
- unsigned WidestBitWidth = WidestTy->getScalarSizeInBits();
-
- // Is the new shift amount smaller than the bit width?
- // FIXME: could also rely on ConstantRange.
- if (!match(NewShAmt,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
- APInt(WidestBitWidth, WidestBitWidth))))
- return nullptr;
-
- // An extra legality check is needed if we had trunc-of-lshr.
- if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) {
- auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ,
- WidestShift]() {
- // It isn't obvious whether it's worth it to analyze non-constants here.
- // Also, let's basically give up on non-splat cases, pessimizing vectors.
- // If *any* of these preconditions matches we can perform the fold.
- Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy()
- ? NewShAmt->getSplatValue()
- : NewShAmt;
- // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold.
- if (NewShAmtSplat &&
- (NewShAmtSplat->isNullValue() ||
- NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1))
- return true;
- // We consider *min* leading zeros so a single outlier
- // blocks the transform as opposed to allowing it.
- if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) {
- KnownBits Known = computeKnownBits(C, SQ.DL);
- unsigned MinLeadZero = Known.countMinLeadingZeros();
- // If the value being shifted has at most lowest bit set we can fold.
- unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
- if (MaxActiveBits <= 1)
- return true;
- // Precondition: NewShAmt u<= countLeadingZeros(C)
- if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero))
- return true;
- }
- if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) {
- KnownBits Known = computeKnownBits(C, SQ.DL);
- unsigned MinLeadZero = Known.countMinLeadingZeros();
- // If the value being shifted has at most lowest bit set we can fold.
- unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
- if (MaxActiveBits <= 1)
- return true;
- // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C)
- if (NewShAmtSplat) {
- APInt AdjNewShAmt =
- (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger();
- if (AdjNewShAmt.ule(MinLeadZero))
- return true;
- }
- }
- return false; // Can't tell if it's ok.
- };
- if (!CanFold())
- return nullptr;
- }
-
- // All good, we can do this fold.
- X = Builder.CreateZExt(X, WidestTy);
- Y = Builder.CreateZExt(Y, WidestTy);
- // The shift is the same that was for X.
- Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
- ? Builder.CreateLShr(X, NewShAmt)
- : Builder.CreateShl(X, NewShAmt);
- Value *T1 = Builder.CreateAnd(T0, Y);
- return Builder.CreateICmp(I.getPredicate(), T1,
- Constant::getNullValue(WidestTy));
-}
-
-/// Fold
-/// (-1 u/ x) u< y
-/// ((x * y) u/ x) != y
-/// to
-/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
-/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
-/// will mean that we are looking for the opposite answer.
+ Constant *SafeReplacementConstant = nullptr;
+ for (unsigned i = 0, e = OpVTy->getNumElements(); i != e; ++i) {
+ if (!isa<UndefValue>(VecC->getAggregateElement(i))) {
+ SafeReplacementConstant = VecC->getAggregateElement(i);
+ break;
+ }
+ }
+ assert(SafeReplacementConstant && "Failed to find undef replacement");
+ M = Constant::replaceUndefsWith(VecC, SafeReplacementConstant);
+ }
+
+ return Builder.CreateICmp(DstPred, X, M);
+}
+
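Intuitively, AND-ing with a mask that is all-ones in the low bits can only keep the value or clear its high bits, so the result equals x exactly when x already fits under the mask, and is strictly smaller exactly when it does not. A standalone exhaustive check of the eq and ult cases on i8, with masks generated as (-1 >> y):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Y = 0; Y < 8; ++Y) {
    uint8_t m = static_cast<uint8_t>(0xffu >> Y); // all-ones in the low bits
    for (unsigned X = 0; X <= 0xff; ++X) {
      uint8_t x = static_cast<uint8_t>(X);
      // x & (-1 >> y) == x   <=>   x u<= (-1 >> y)
      assert(((x & m) == x) == (x <= m));
      // x & (-1 >> y) u< x   <=>   x u> (-1 >> y)
      assert(((x & m) < x) == (x > m));
    }
  }
  return 0;
}
```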
+/// Some comparisons can be simplified.
+/// In this case, we are looking for comparisons that look like
+/// a check for a lossy signed truncation.
+/// Folds: (MaskedBits is a constant.)
+/// ((%x << MaskedBits) a>> MaskedBits) SrcPred %x
+/// Into:
+/// (add %x, (1 << (KeptBits-1))) DstPred (1 << KeptBits)
+/// Where KeptBits = bitwidth(%x) - MaskedBits
+static Value *
+foldICmpWithTruncSignExtendedVal(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate SrcPred;
+ Value *X;
+ const APInt *C0, *C1; // FIXME: non-splats, potentially with undef.
+ // We are ok with 'shl' having multiple uses, but 'ashr' must be one-use.
+ if (!match(&I, m_c_ICmp(SrcPred,
+ m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C0)),
+ m_APInt(C1))),
+ m_Deferred(X))))
+ return nullptr;
+
+ // Potential handling of non-splats: for each element:
+ // * if both are undef, replace with constant 0.
+ // Because (1<<0) is OK and is 1, and ((1<<0)>>1) is also OK and is 0.
+  // * if both are not undef, and are different, bail out.
+ // * else, only one is undef, then pick the non-undef one.
+
+ // The shift amount must be equal.
+ if (*C0 != *C1)
+ return nullptr;
+ const APInt &MaskedBits = *C0;
+ assert(MaskedBits != 0 && "shift by zero should be folded away already.");
+
+ ICmpInst::Predicate DstPred;
+ switch (SrcPred) {
+ case ICmpInst::Predicate::ICMP_EQ:
+ // ((%x << MaskedBits) a>> MaskedBits) == %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u< (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_ULT;
+ break;
+ case ICmpInst::Predicate::ICMP_NE:
+ // ((%x << MaskedBits) a>> MaskedBits) != %x
+ // =>
+ // (add %x, (1 << (KeptBits-1))) u>= (1 << KeptBits)
+ DstPred = ICmpInst::Predicate::ICMP_UGE;
+ break;
+ // FIXME: are more folds possible?
+ default:
+ return nullptr;
+ }
+
+ auto *XType = X->getType();
+ const unsigned XBitWidth = XType->getScalarSizeInBits();
+ const APInt BitWidth = APInt(XBitWidth, XBitWidth);
+ assert(BitWidth.ugt(MaskedBits) && "shifts should leave some bits untouched");
+
+ // KeptBits = bitwidth(%x) - MaskedBits
+ const APInt KeptBits = BitWidth - MaskedBits;
+ assert(KeptBits.ugt(0) && KeptBits.ult(BitWidth) && "unreachable");
+ // ICmpCst = (1 << KeptBits)
+ const APInt ICmpCst = APInt(XBitWidth, 1).shl(KeptBits);
+ assert(ICmpCst.isPowerOf2());
+ // AddCst = (1 << (KeptBits-1))
+ const APInt AddCst = ICmpCst.lshr(1);
+ assert(AddCst.ult(ICmpCst) && AddCst.isPowerOf2());
+
+ // T0 = add %x, AddCst
+ Value *T0 = Builder.CreateAdd(X, ConstantInt::get(XType, AddCst));
+ // T1 = T0 DstPred ICmpCst
+ Value *T1 = Builder.CreateICmp(DstPred, T0, ConstantInt::get(XType, ICmpCst));
+
+ return T1;
+}
+
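The rewrite checks whether x survives the narrowing round-trip by shifting its range: x sign-extends back to itself exactly when it lies in [-2^(KeptBits-1), 2^(KeptBits-1)), and adding 2^(KeptBits-1) maps that interval onto [0, 2^KeptBits), which a single unsigned compare can test. A standalone i8 sketch with MaskedBits = 3 (an arbitrary choice), emulating the shl/ashr pair by keeping the low KeptBits and sign-extending:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  const unsigned MaskedBits = 3, KeptBits = 8 - MaskedBits; // KeptBits == 5
  for (unsigned X = 0; X <= 0xff; ++X) {
    uint8_t x = static_cast<uint8_t>(X);
    // Emulate ((x << 3) a>> 3) on i8: keep the low KeptBits, then sign-extend.
    uint8_t low = x & ((1u << KeptBits) - 1);
    uint8_t sext = (low & (1u << (KeptBits - 1)))
                       ? static_cast<uint8_t>(low | ~((1u << KeptBits) - 1))
                       : low;
    bool original = (sext == x);
    // (add x, (1 << (KeptBits-1))) u< (1 << KeptBits)
    bool folded =
        static_cast<uint8_t>(x + (1u << (KeptBits - 1))) < (1u << KeptBits);
    assert(original == folded);
  }
  return 0;
}
```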
+// Given pattern:
+// icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
+// we should move shifts to the same hand of 'and', i.e. rewrite as
+// icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
+// We are only interested in opposite logical shifts here.
+// One of the shifts can be truncated.
+// If we can, we want to end up creating 'lshr' shift.
+static Value *
+foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
+ InstCombiner::BuilderTy &Builder) {
+ if (!I.isEquality() || !match(I.getOperand(1), m_Zero()) ||
+ !I.getOperand(0)->hasOneUse())
+ return nullptr;
+
+ auto m_AnyLogicalShift = m_LogicalShift(m_Value(), m_Value());
+
+ // Look for an 'and' of two logical shifts, one of which may be truncated.
+ // We use m_TruncOrSelf() on the RHS to correctly handle commutative case.
+ Instruction *XShift, *MaybeTruncation, *YShift;
+ if (!match(
+ I.getOperand(0),
+ m_c_And(m_CombineAnd(m_AnyLogicalShift, m_Instruction(XShift)),
+ m_CombineAnd(m_TruncOrSelf(m_CombineAnd(
+ m_AnyLogicalShift, m_Instruction(YShift))),
+ m_Instruction(MaybeTruncation)))))
+ return nullptr;
+
+ // We potentially looked past 'trunc', but only when matching YShift,
+ // therefore YShift must have the widest type.
+ Instruction *WidestShift = YShift;
+ // Therefore XShift must have the shallowest type.
+ // Or they both have identical types if there was no truncation.
+ Instruction *NarrowestShift = XShift;
+
+ Type *WidestTy = WidestShift->getType();
+ Type *NarrowestTy = NarrowestShift->getType();
+ assert(NarrowestTy == I.getOperand(0)->getType() &&
+ "We did not look past any shifts while matching XShift though.");
+ bool HadTrunc = WidestTy != I.getOperand(0)->getType();
+
+ // If YShift is a 'lshr', swap the shifts around.
+ if (match(YShift, m_LShr(m_Value(), m_Value())))
+ std::swap(XShift, YShift);
+
+ // The shifts must be in opposite directions.
+ auto XShiftOpcode = XShift->getOpcode();
+ if (XShiftOpcode == YShift->getOpcode())
+ return nullptr; // Do not care about same-direction shifts here.
+
+ Value *X, *XShAmt, *Y, *YShAmt;
+ match(XShift, m_BinOp(m_Value(X), m_ZExtOrSelf(m_Value(XShAmt))));
+ match(YShift, m_BinOp(m_Value(Y), m_ZExtOrSelf(m_Value(YShAmt))));
+
+ // If one of the values being shifted is a constant, then we will end with
+ // and+icmp, and [zext+]shift instrs will be constant-folded. If they are not,
+ // however, we will need to ensure that we won't increase instruction count.
+ if (!isa<Constant>(X) && !isa<Constant>(Y)) {
+ // At least one of the hands of the 'and' should be one-use shift.
+ if (!match(I.getOperand(0),
+ m_c_And(m_OneUse(m_AnyLogicalShift), m_Value())))
+ return nullptr;
+ if (HadTrunc) {
+ // Due to the 'trunc', we will need to widen X. For that either the old
+ // 'trunc' or the shift amt in the non-truncated shift should be one-use.
+ if (!MaybeTruncation->hasOneUse() &&
+ !NarrowestShift->getOperand(1)->hasOneUse())
+ return nullptr;
+ }
+ }
+
+ // We have two shift amounts from two different shifts. The types of those
+ // shift amounts may not match. If that's the case let's bailout now.
+  // shift amounts may not match. If that's the case, let's bail out now.
+ return nullptr;
+
+ // As input, we have the following pattern:
+ // icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
+ // We want to rewrite that as:
+ // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
+ // While we know that originally (Q+K) would not overflow
+ // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
+  // shift amounts, so it may now overflow in the smaller bitwidth.
+ // To ensure that does not happen, we need to ensure that the total maximal
+ // shift amount is still representable in that smaller bit width.
+ unsigned MaximalPossibleTotalShiftAmount =
+ (WidestTy->getScalarSizeInBits() - 1) +
+ (NarrowestTy->getScalarSizeInBits() - 1);
+ APInt MaximalRepresentableShiftAmount =
+ APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
+ if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
+ return nullptr;
+
+ // Can we fold (XShAmt+YShAmt) ?
+ auto *NewShAmt = dyn_cast_or_null<Constant>(
+ SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
+ /*isNUW=*/false, SQ.getWithInstruction(&I)));
+ if (!NewShAmt)
+ return nullptr;
+ NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, WidestTy);
+ unsigned WidestBitWidth = WidestTy->getScalarSizeInBits();
+
+ // Is the new shift amount smaller than the bit width?
+ // FIXME: could also rely on ConstantRange.
+ if (!match(NewShAmt,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
+ APInt(WidestBitWidth, WidestBitWidth))))
+ return nullptr;
+
+ // An extra legality check is needed if we had trunc-of-lshr.
+ if (HadTrunc && match(WidestShift, m_LShr(m_Value(), m_Value()))) {
+ auto CanFold = [NewShAmt, WidestBitWidth, NarrowestShift, SQ,
+ WidestShift]() {
+ // It isn't obvious whether it's worth it to analyze non-constants here.
+ // Also, let's basically give up on non-splat cases, pessimizing vectors.
+ // If *any* of these preconditions matches we can perform the fold.
+ Constant *NewShAmtSplat = NewShAmt->getType()->isVectorTy()
+ ? NewShAmt->getSplatValue()
+ : NewShAmt;
+ // If it's edge-case shift (by 0 or by WidestBitWidth-1) we can fold.
+ if (NewShAmtSplat &&
+ (NewShAmtSplat->isNullValue() ||
+ NewShAmtSplat->getUniqueInteger() == WidestBitWidth - 1))
+ return true;
+ // We consider *min* leading zeros so a single outlier
+ // blocks the transform as opposed to allowing it.
+ if (auto *C = dyn_cast<Constant>(NarrowestShift->getOperand(0))) {
+ KnownBits Known = computeKnownBits(C, SQ.DL);
+ unsigned MinLeadZero = Known.countMinLeadingZeros();
+ // If the value being shifted has at most lowest bit set we can fold.
+ unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
+ if (MaxActiveBits <= 1)
+ return true;
+ // Precondition: NewShAmt u<= countLeadingZeros(C)
+ if (NewShAmtSplat && NewShAmtSplat->getUniqueInteger().ule(MinLeadZero))
+ return true;
+ }
+ if (auto *C = dyn_cast<Constant>(WidestShift->getOperand(0))) {
+ KnownBits Known = computeKnownBits(C, SQ.DL);
+ unsigned MinLeadZero = Known.countMinLeadingZeros();
+ // If the value being shifted has at most lowest bit set we can fold.
+ unsigned MaxActiveBits = Known.getBitWidth() - MinLeadZero;
+ if (MaxActiveBits <= 1)
+ return true;
+ // Precondition: ((WidestBitWidth-1)-NewShAmt) u<= countLeadingZeros(C)
+ if (NewShAmtSplat) {
+ APInt AdjNewShAmt =
+ (WidestBitWidth - 1) - NewShAmtSplat->getUniqueInteger();
+ if (AdjNewShAmt.ule(MinLeadZero))
+ return true;
+ }
+ }
+ return false; // Can't tell if it's ok.
+ };
+ if (!CanFold())
+ return nullptr;
+ }
+
+ // All good, we can do this fold.
+ X = Builder.CreateZExt(X, WidestTy);
+ Y = Builder.CreateZExt(Y, WidestTy);
+ // The shift is the same that was for X.
+ Value *T0 = XShiftOpcode == Instruction::BinaryOps::LShr
+ ? Builder.CreateLShr(X, NewShAmt)
+ : Builder.CreateShl(X, NewShAmt);
+ Value *T1 = Builder.CreateAnd(T0, Y);
+ return Builder.CreateICmp(I.getPredicate(), T1,
+ Constant::getNullValue(WidestTy));
+}
+
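The core identity is that the and is nonzero exactly when some bit j of y lines up with bit j+Q+K of x, so the pair of opposite shifts can be collapsed onto one operand as long as Q+K still fits below the bit width. A brute-force standalone check on i8 for all shift amounts with Q+K u< 8 (shl is emulated by truncating back to 8 bits):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Q = 0; Q < 8; ++Q)
    for (unsigned K = 0; Q + K < 8; ++K)
      for (unsigned X = 0; X <= 0xff; ++X)
        for (unsigned Y = 0; Y <= 0xff; ++Y) {
          uint8_t x = static_cast<uint8_t>(X), y = static_cast<uint8_t>(Y);
          // ((x u>> Q) & (y << K)) == 0  <=>  ((x u>> (Q+K)) & y) == 0
          bool original = (static_cast<uint8_t>(x >> Q) &
                           static_cast<uint8_t>(y << K)) == 0;
          bool folded = (static_cast<uint8_t>(x >> (Q + K)) & y) == 0;
          assert(original == folded);
        }
  return 0;
}
```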
+/// Fold
+/// (-1 u/ x) u< y
+/// ((x * y) u/ x) != y
+/// to
+/// @llvm.umul.with.overflow(x, y) plus extraction of overflow bit
+/// Note that the comparison is commutative, while inverted (u>=, ==) predicate
+/// will mean that we are looking for the opposite answer.
Value *InstCombinerImpl::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
- ICmpInst::Predicate Pred;
- Value *X, *Y;
- Instruction *Mul;
- bool NeedNegation;
- // Look for: (-1 u/ x) u</u>= y
- if (!I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
- m_Value(Y)))) {
- Mul = nullptr;
-
- // Are we checking that overflow does not happen, or does happen?
- switch (Pred) {
- case ICmpInst::Predicate::ICMP_ULT:
- NeedNegation = false;
- break; // OK
- case ICmpInst::Predicate::ICMP_UGE:
- NeedNegation = true;
- break; // OK
- default:
- return nullptr; // Wrong predicate.
- }
- } else // Look for: ((x * y) u/ x) !=/== y
- if (I.isEquality() &&
- match(&I, m_c_ICmp(Pred, m_Value(Y),
- m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
- m_Value(X)),
- m_Instruction(Mul)),
- m_Deferred(X)))))) {
- NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ;
- } else
- return nullptr;
-
- BuilderTy::InsertPointGuard Guard(Builder);
- // If the pattern included (x * y), we'll want to insert new instructions
- // right before that original multiplication so that we can replace it.
- bool MulHadOtherUses = Mul && !Mul->hasOneUse();
- if (MulHadOtherUses)
- Builder.SetInsertPoint(Mul);
-
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::umul_with_overflow, X->getType());
- CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul");
-
- // If the multiplication was used elsewhere, to ensure that we don't leave
- // "duplicate" instructions, replace uses of that original multiplication
- // with the multiplication result from the with.overflow intrinsic.
- if (MulHadOtherUses)
- replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val"));
-
- Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov");
- if (NeedNegation) // This technically increases instruction count.
- Res = Builder.CreateNot(Res, "umul.not.ov");
-
- // If we replaced the mul, erase it. Do this after all uses of Builder,
- // as the mul is used as insertion point.
- if (MulHadOtherUses)
- eraseInstFromFunction(*Mul);
-
- return Res;
-}
-
+ ICmpInst::Predicate Pred;
+ Value *X, *Y;
+ Instruction *Mul;
+ bool NeedNegation;
+ // Look for: (-1 u/ x) u</u>= y
+ if (!I.isEquality() &&
+ match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
+ m_Value(Y)))) {
+ Mul = nullptr;
+
+ // Are we checking that overflow does not happen, or does happen?
+ switch (Pred) {
+ case ICmpInst::Predicate::ICMP_ULT:
+ NeedNegation = false;
+ break; // OK
+ case ICmpInst::Predicate::ICMP_UGE:
+ NeedNegation = true;
+ break; // OK
+ default:
+ return nullptr; // Wrong predicate.
+ }
+ } else // Look for: ((x * y) u/ x) !=/== y
+ if (I.isEquality() &&
+ match(&I, m_c_ICmp(Pred, m_Value(Y),
+ m_OneUse(m_UDiv(m_CombineAnd(m_c_Mul(m_Deferred(Y),
+ m_Value(X)),
+ m_Instruction(Mul)),
+ m_Deferred(X)))))) {
+ NeedNegation = Pred == ICmpInst::Predicate::ICMP_EQ;
+ } else
+ return nullptr;
+
+ BuilderTy::InsertPointGuard Guard(Builder);
+ // If the pattern included (x * y), we'll want to insert new instructions
+ // right before that original multiplication so that we can replace it.
+ bool MulHadOtherUses = Mul && !Mul->hasOneUse();
+ if (MulHadOtherUses)
+ Builder.SetInsertPoint(Mul);
+
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::umul_with_overflow, X->getType());
+ CallInst *Call = Builder.CreateCall(F, {X, Y}, "umul");
+
+ // If the multiplication was used elsewhere, to ensure that we don't leave
+ // "duplicate" instructions, replace uses of that original multiplication
+ // with the multiplication result from the with.overflow intrinsic.
+ if (MulHadOtherUses)
+ replaceInstUsesWith(*Mul, Builder.CreateExtractValue(Call, 0, "umul.val"));
+
+ Value *Res = Builder.CreateExtractValue(Call, 1, "umul.ov");
+ if (NeedNegation) // This technically increases instruction count.
+ Res = Builder.CreateNot(Res, "umul.not.ov");
+
+ // If we replaced the mul, erase it. Do this after all uses of Builder,
+ // as the mul is used as insertion point.
+ if (MulHadOtherUses)
+ eraseInstFromFunction(*Mul);
+
+ return Res;
+}
+
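Both source patterns are just different spellings of "x * y overflows unsigned", which is exactly what the extracted overflow bit of umul.with.overflow reports. A standalone exhaustive i8 check (x == 0 is excluded, since both original patterns already divide by x):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 1; x <= 0xff; ++x)
    for (unsigned y = 0; y <= 0xff; ++y) {
      bool overflows = x * y > 0xff; // would i8 x * y wrap?
      // (-1 u/ x) u< y  <=>  overflow
      assert(((0xffu / x) < y) == overflows);
      // ((x * y) u/ x) != y  <=>  overflow  (the udiv sees the wrapped product)
      assert((static_cast<uint8_t>(x * y) / x != y) == overflows);
    }
  return 0;
}
```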
static Instruction *foldICmpXNegX(ICmpInst &I) {
CmpInst::Predicate Pred;
Value *X;
@@ -3729,244 +3729,244 @@ static Instruction *foldICmpXNegX(ICmpInst &I) {
Constant::getNullValue(X->getType()), I.getName());
}
-/// Try to fold icmp (binop), X or icmp X, (binop).
-/// TODO: A large part of this logic is duplicated in InstSimplify's
-/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
-/// duplication.
+/// Try to fold icmp (binop), X or icmp X, (binop).
+/// TODO: A large part of this logic is duplicated in InstSimplify's
+/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code
+/// duplication.
Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
const SimplifyQuery &SQ) {
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // Special logic for binary operators.
- BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
- BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
- if (!BO0 && !BO1)
- return nullptr;
-
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // Special logic for binary operators.
+ BinaryOperator *BO0 = dyn_cast<BinaryOperator>(Op0);
+ BinaryOperator *BO1 = dyn_cast<BinaryOperator>(Op1);
+ if (!BO0 && !BO1)
+ return nullptr;
+
if (Instruction *NewICmp = foldICmpXNegX(I))
return NewICmp;
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *X;
-
- // Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
- // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X
- if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
- (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
- return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
- // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0
- if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
- (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
- return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
-
- bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
- if (BO0 && isa<OverflowingBinaryOperator>(BO0))
- NoOp0WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
- if (BO1 && isa<OverflowingBinaryOperator>(BO1))
- NoOp1WrapProblem =
- ICmpInst::isEquality(Pred) ||
- (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
- (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
-
- // Analyze the case when either Op0 or Op1 is an add instruction.
- // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
- Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Add) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Add) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow.
- // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow.
- if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
- return new ICmpInst(Pred, A == Op1 ? B : A,
- Constant::getNullValue(Op1->getType()));
-
- // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow.
- // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow.
- if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
- C == Op0 ? D : C);
-
- // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow.
- if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
- NoOp1WrapProblem) {
- // Determine Y and Z in the form icmp (X+Y), (X+Z).
- Value *Y, *Z;
- if (A == C) {
- // C + B == C + D -> B == D
- Y = B;
- Z = D;
- } else if (A == D) {
- // D + B == C + D -> B == C
- Y = B;
- Z = C;
- } else if (B == C) {
- // A + C == C + D -> A == D
- Y = A;
- Z = D;
- } else {
- assert(B == D);
- // A + D == C + D -> A == C
- Y = A;
- Z = C;
- }
- return new ICmpInst(Pred, Y, Z);
- }
-
- // icmp slt (A + -1), Op1 -> icmp sle A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
-
- // icmp sge (A + -1), Op1 -> icmp sgt A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
- match(B, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
-
- // icmp sle (A + 1), Op1 -> icmp slt A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
-
- // icmp sgt (A + 1), Op1 -> icmp sge A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
-
- // icmp sgt Op0, (C + -1) -> icmp sge Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
-
- // icmp sle Op0, (C + -1) -> icmp slt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
- match(D, m_AllOnes()))
- return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
-
- // icmp sge Op0, (C + 1) -> icmp sgt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
-
- // icmp slt Op0, (C + 1) -> icmp sle Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
-
- // TODO: The subtraction-related identities shown below also hold, but
- // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
- // wouldn't happen even if they were implemented.
- //
- // icmp ult (A - 1), Op1 -> icmp ule A, Op1
- // icmp uge (A - 1), Op1 -> icmp ugt A, Op1
- // icmp ugt Op0, (C - 1) -> icmp uge Op0, C
- // icmp ule Op0, (C - 1) -> icmp ult Op0, C
-
-  // icmp ule (A + 1), Op1 -> icmp ult A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
-
-  // icmp ugt (A + 1), Op1 -> icmp uge A, Op1
- if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
- return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
-
- // icmp uge Op0, (C + 1) -> icmp ugt Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
-
- // icmp ult Op0, (C + 1) -> icmp ule Op0, C
- if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
- return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
-
- // if C1 has greater magnitude than C2:
- // icmp (A + C1), (C + C2) -> icmp (A + C3), C
- // s.t. C3 = C1 - C2
- //
- // if C2 has greater magnitude than C1:
- // icmp (A + C1), (C + C2) -> icmp A, (C + C3)
- // s.t. C3 = C2 - C1
- if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
- (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
- if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
- if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
- const APInt &AP1 = C1->getValue();
- const APInt &AP2 = C2->getValue();
- if (AP1.isNegative() == AP2.isNegative()) {
- APInt AP1Abs = C1->getValue().abs();
- APInt AP2Abs = C2->getValue().abs();
- if (AP1Abs.uge(AP2Abs)) {
- ConstantInt *C3 = Builder.getInt(AP1 - AP2);
- Value *NewAdd = Builder.CreateNSWAdd(A, C3);
- return new ICmpInst(Pred, NewAdd, C);
- } else {
- ConstantInt *C3 = Builder.getInt(AP2 - AP1);
- Value *NewAdd = Builder.CreateNSWAdd(C, C3);
- return new ICmpInst(Pred, A, NewAdd);
- }
- }
- }
-
- // Analyze the case when either Op0 or Op1 is a sub instruction.
- // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
- A = nullptr;
- B = nullptr;
- C = nullptr;
- D = nullptr;
- if (BO0 && BO0->getOpcode() == Instruction::Sub) {
- A = BO0->getOperand(0);
- B = BO0->getOperand(1);
- }
- if (BO1 && BO1->getOpcode() == Instruction::Sub) {
- C = BO1->getOperand(0);
- D = BO1->getOperand(1);
- }
-
- // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow.
- if (A == Op1 && NoOp0WrapProblem)
- return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
- // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow.
- if (C == Op0 && NoOp1WrapProblem)
- return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
-
- // Convert sub-with-unsigned-overflow comparisons into a comparison of args.
- // (A - B) u>/u<= A --> B u>/u<= A
- if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
- return new ICmpInst(Pred, B, A);
- // C u</u>= (C - D) --> C u</u>= D
- if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
- return new ICmpInst(Pred, C, D);
- // (A - B) u>=/u< A --> B u>/u<= A iff B != 0
- if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) &&
- isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
- return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A);
- // C u<=/u> (C - D) --> C u</u>= D iff D != 0
- if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) &&
- isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
- return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D);
-
- // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow.
- if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem)
- return new ICmpInst(Pred, A, C);
-
- // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow.
- if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem)
- return new ICmpInst(Pred, D, B);
-
- // icmp (0-X) < cst --> X > -cst
- if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
- Value *X;
- if (match(BO0, m_Neg(m_Value(X))))
- if (Constant *RHSC = dyn_cast<Constant>(Op1))
- if (RHSC->isNotMinSignedValue())
- return new ICmpInst(I.getSwappedPredicate(), X,
- ConstantExpr::getNeg(RHSC));
- }
-
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *X;
+
+ // Convert add-with-unsigned-overflow comparisons into a 'not' with compare.
+ // (Op1 + X) u</u>= Op1 --> ~Op1 u</u>= X
+ if (match(Op0, m_OneUse(m_c_Add(m_Specific(Op1), m_Value(X)))) &&
+ (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
+ return new ICmpInst(Pred, Builder.CreateNot(Op1), X);
+ // Op0 u>/u<= (Op0 + X) --> X u>/u<= ~Op0
+ if (match(Op1, m_OneUse(m_c_Add(m_Specific(Op0), m_Value(X)))) &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
+ return new ICmpInst(Pred, X, Builder.CreateNot(Op0));
+
+ bool NoOp0WrapProblem = false, NoOp1WrapProblem = false;
+ if (BO0 && isa<OverflowingBinaryOperator>(BO0))
+ NoOp0WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO0->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO0->hasNoSignedWrap());
+ if (BO1 && isa<OverflowingBinaryOperator>(BO1))
+ NoOp1WrapProblem =
+ ICmpInst::isEquality(Pred) ||
+ (CmpInst::isUnsigned(Pred) && BO1->hasNoUnsignedWrap()) ||
+ (CmpInst::isSigned(Pred) && BO1->hasNoSignedWrap());
+
+ // Analyze the case when either Op0 or Op1 is an add instruction.
+ // Op0 = A + B (or A and B are null); Op1 = C + D (or C and D are null).
+ Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Add) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Add) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (A+B), A -> icmp B, 0 for equalities or if there is no overflow.
+ // icmp (A+B), B -> icmp A, 0 for equalities or if there is no overflow.
+ if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
+ return new ICmpInst(Pred, A == Op1 ? B : A,
+ Constant::getNullValue(Op1->getType()));
+
+ // icmp C, (C+D) -> icmp 0, D for equalities or if there is no overflow.
+ // icmp D, (C+D) -> icmp 0, C for equalities or if there is no overflow.
+ if ((C == Op0 || D == Op0) && NoOp1WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op0->getType()),
+ C == Op0 ? D : C);
+
+ // icmp (A+B), (A+D) -> icmp B, D for equalities or if there is no overflow.
+ if (A && C && (A == C || A == D || B == C || B == D) && NoOp0WrapProblem &&
+ NoOp1WrapProblem) {
+ // Determine Y and Z in the form icmp (X+Y), (X+Z).
+ Value *Y, *Z;
+ if (A == C) {
+ // C + B == C + D -> B == D
+ Y = B;
+ Z = D;
+ } else if (A == D) {
+ // D + B == C + D -> B == C
+ Y = B;
+ Z = C;
+ } else if (B == C) {
+ // A + C == C + D -> A == D
+ Y = A;
+ Z = D;
+ } else {
+ assert(B == D);
+ // A + D == C + D -> A == C
+ Y = A;
+ Z = C;
+ }
+ return new ICmpInst(Pred, Y, Z);
+ }
+
+ // icmp slt (A + -1), Op1 -> icmp sle A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLE, A, Op1);
+
+ // icmp sge (A + -1), Op1 -> icmp sgt A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE &&
+ match(B, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGT, A, Op1);
+
+ // icmp sle (A + 1), Op1 -> icmp slt A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLT, A, Op1);
+
+ // icmp sgt (A + 1), Op1 -> icmp sge A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGE, A, Op1);
+
+ // icmp sgt Op0, (C + -1) -> icmp sge Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGT &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SGE, Op0, C);
+
+ // icmp sle Op0, (C + -1) -> icmp slt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLE &&
+ match(D, m_AllOnes()))
+ return new ICmpInst(CmpInst::ICMP_SLT, Op0, C);
+
+ // icmp sge Op0, (C + 1) -> icmp sgt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SGT, Op0, C);
+
+ // icmp slt Op0, (C + 1) -> icmp sle Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_SLT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_SLE, Op0, C);
+
+ // TODO: The subtraction-related identities shown below also hold, but
+ // canonicalization from (X -nuw 1) to (X + -1) means that the combinations
+ // wouldn't happen even if they were implemented.
+ //
+ // icmp ult (A - 1), Op1 -> icmp ule A, Op1
+ // icmp uge (A - 1), Op1 -> icmp ugt A, Op1
+ // icmp ugt Op0, (C - 1) -> icmp uge Op0, C
+ // icmp ule Op0, (C - 1) -> icmp ult Op0, C
+
+ // icmp ule (A + 1), Op1 -> icmp ult A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_ULE && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULT, A, Op1);
+
+ // icmp ugt (A + 1), Op1 -> icmp uge A, Op1
+ if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_UGT && match(B, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGE, A, Op1);
+
+ // icmp uge Op0, (C + 1) -> icmp ugt Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_UGE && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_UGT, Op0, C);
+
+ // icmp ult Op0, (C + 1) -> icmp ule Op0, C
+ if (C && NoOp1WrapProblem && Pred == CmpInst::ICMP_ULT && match(D, m_One()))
+ return new ICmpInst(CmpInst::ICMP_ULE, Op0, C);
+
+ // if C1 has greater magnitude than C2:
+ // icmp (A + C1), (C + C2) -> icmp (A + C3), C
+ // s.t. C3 = C1 - C2
+ //
+ // if C2 has greater magnitude than C1:
+ // icmp (A + C1), (C + C2) -> icmp A, (C + C3)
+ // s.t. C3 = C2 - C1
+ if (A && C && NoOp0WrapProblem && NoOp1WrapProblem &&
+ (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned())
+ if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+ if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) {
+ const APInt &AP1 = C1->getValue();
+ const APInt &AP2 = C2->getValue();
+ if (AP1.isNegative() == AP2.isNegative()) {
+ APInt AP1Abs = C1->getValue().abs();
+ APInt AP2Abs = C2->getValue().abs();
+ if (AP1Abs.uge(AP2Abs)) {
+ ConstantInt *C3 = Builder.getInt(AP1 - AP2);
+ Value *NewAdd = Builder.CreateNSWAdd(A, C3);
+ return new ICmpInst(Pred, NewAdd, C);
+ } else {
+ ConstantInt *C3 = Builder.getInt(AP2 - AP1);
+ Value *NewAdd = Builder.CreateNSWAdd(C, C3);
+ return new ICmpInst(Pred, A, NewAdd);
+ }
+ }
+ }
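+
+ // Illustrative sketch of the constant-magnitude fold above (hypothetical
+ // IR, not taken from this patch; assumes at least one of the adds has a
+ // single use):
+ //   %l = add nsw i32 %a, 5
+ //   %r = add nsw i32 %c, 2
+ //   %cmp = icmp slt i32 %l, %r
+ // may become
+ //   %l2 = add nsw i32 %a, 3
+ //   %cmp = icmp slt i32 %l2, %c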
+
+ // Analyze the case when either Op0 or Op1 is a sub instruction.
+ // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null).
+ A = nullptr;
+ B = nullptr;
+ C = nullptr;
+ D = nullptr;
+ if (BO0 && BO0->getOpcode() == Instruction::Sub) {
+ A = BO0->getOperand(0);
+ B = BO0->getOperand(1);
+ }
+ if (BO1 && BO1->getOpcode() == Instruction::Sub) {
+ C = BO1->getOperand(0);
+ D = BO1->getOperand(1);
+ }
+
+ // icmp (A-B), A -> icmp 0, B for equalities or if there is no overflow.
+ if (A == Op1 && NoOp0WrapProblem)
+ return new ICmpInst(Pred, Constant::getNullValue(Op1->getType()), B);
+ // icmp C, (C-D) -> icmp D, 0 for equalities or if there is no overflow.
+ if (C == Op0 && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, Constant::getNullValue(Op0->getType()));
+
+ // Convert sub-with-unsigned-overflow comparisons into a comparison of args.
+ // (A - B) u>/u<= A --> B u>/u<= A
+ if (A == Op1 && (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_ULE))
+ return new ICmpInst(Pred, B, A);
+ // C u</u>= (C - D) --> C u</u>= D
+ if (C == Op0 && (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_UGE))
+ return new ICmpInst(Pred, C, D);
+ // (A - B) u>=/u< A --> B u>/u<= A iff B != 0
+ if (A == Op1 && (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_ULT) &&
+ isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
+ return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), B, A);
+ // C u<=/u> (C - D) --> C u</u>= D iff D != 0
+ if (C == Op0 && (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_UGT) &&
+ isKnownNonZero(D, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
+ return new ICmpInst(CmpInst::getFlippedStrictnessPredicate(Pred), C, D);
+
+ // icmp (A-B), (C-B) -> icmp A, C for equalities or if there is no overflow.
+ if (B && D && B == D && NoOp0WrapProblem && NoOp1WrapProblem)
+ return new ICmpInst(Pred, A, C);
+
+ // icmp (A-B), (A-D) -> icmp D, B for equalities or if there is no overflow.
+ if (A && C && A == C && NoOp0WrapProblem && NoOp1WrapProblem)
+ return new ICmpInst(Pred, D, B);
+
+ // icmp (0-X) < cst --> X > -cst
+ if (NoOp0WrapProblem && ICmpInst::isSigned(Pred)) {
+ Value *X;
+ if (match(BO0, m_Neg(m_Value(X))))
+ if (Constant *RHSC = dyn_cast<Constant>(Op1))
+ if (RHSC->isNotMinSignedValue())
+ return new ICmpInst(I.getSwappedPredicate(), X,
+ ConstantExpr::getNeg(RHSC));
+ }
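+
+ // Illustrative sketch of the negation fold above (hypothetical IR, not
+ // taken from this patch):
+ //   %n = sub nsw i32 0, %x
+ //   %cmp = icmp slt i32 %n, 7
+ // may become
+ //   %cmp = icmp sgt i32 %x, -7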
+
{
// Try to remove shared constant multiplier from equality comparison:
// X * C == Y * C (with no overflowing/aliasing) --> X == Y
@@ -3980,2296 +3980,2296 @@ Instruction *InstCombinerImpl::foldICmpBinOp(ICmpInst &I,
return new ICmpInst(Pred, X, Y);
}
- BinaryOperator *SRem = nullptr;
- // icmp (srem X, Y), Y
- if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
- SRem = BO0;
- // icmp Y, (srem X, Y)
- else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
- Op0 == BO1->getOperand(1))
- SRem = BO1;
- if (SRem) {
- // We don't check hasOneUse to avoid increasing register pressure because
- // the value we use is the same value this instruction was already using.
- switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
- default:
- break;
- case ICmpInst::ICMP_EQ:
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- case ICmpInst::ICMP_NE:
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- case ICmpInst::ICMP_SGT:
- case ICmpInst::ICMP_SGE:
- return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
- Constant::getAllOnesValue(SRem->getType()));
- case ICmpInst::ICMP_SLT:
- case ICmpInst::ICMP_SLE:
- return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
- Constant::getNullValue(SRem->getType()));
- }
- }
-
- if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
- BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
- switch (BO0->getOpcode()) {
- default:
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor: {
- if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- const APInt *C;
- if (match(BO0->getOperand(1), m_APInt(C))) {
- // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
- if (C->isSignMask()) {
+ BinaryOperator *SRem = nullptr;
+ // icmp (srem X, Y), Y
+ if (BO0 && BO0->getOpcode() == Instruction::SRem && Op1 == BO0->getOperand(1))
+ SRem = BO0;
+ // icmp Y, (srem X, Y)
+ else if (BO1 && BO1->getOpcode() == Instruction::SRem &&
+ Op0 == BO1->getOperand(1))
+ SRem = BO1;
+ if (SRem) {
+ // We don't check hasOneUse to avoid increasing register pressure because
+ // the value we use is the same value this instruction was already using.
+ switch (SRem == BO0 ? ICmpInst::getSwappedPredicate(Pred) : Pred) {
+ default:
+ break;
+ case ICmpInst::ICMP_EQ:
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ case ICmpInst::ICMP_NE:
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ case ICmpInst::ICMP_SGT:
+ case ICmpInst::ICMP_SGE:
+ return new ICmpInst(ICmpInst::ICMP_SGT, SRem->getOperand(1),
+ Constant::getAllOnesValue(SRem->getType()));
+ case ICmpInst::ICMP_SLT:
+ case ICmpInst::ICMP_SLE:
+ return new ICmpInst(ICmpInst::ICMP_SLT, SRem->getOperand(1),
+ Constant::getNullValue(SRem->getType()));
+ }
+ }
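+
+ // Illustrative sketch (hypothetical IR, not taken from this patch): the
+ // remainder's magnitude is strictly smaller than |Y| (and Y == 0 is UB),
+ // so, for example,
+ //   %r = srem i32 %x, %y
+ //   %cmp = icmp eq i32 %r, %y
+ // folds to false.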
+
+ if (BO0 && BO1 && BO0->getOpcode() == BO1->getOpcode() && BO0->hasOneUse() &&
+ BO1->hasOneUse() && BO0->getOperand(1) == BO1->getOperand(1)) {
+ switch (BO0->getOpcode()) {
+ default:
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Xor: {
+ if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C))) {
+ // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b
+ if (C->isSignMask()) {
ICmpInst::Predicate NewPred = I.getFlippedSignednessPredicate();
- return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
- }
-
- // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
- if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+
+ // icmp u/s (a ^ maxsignval), (b ^ maxsignval) --> icmp s/u' a, b
+ if (BO0->getOpcode() == Instruction::Xor && C->isMaxSignedValue()) {
ICmpInst::Predicate NewPred = I.getFlippedSignednessPredicate();
- NewPred = I.getSwappedPredicate(NewPred);
- return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
- }
- }
- break;
- }
- case Instruction::Mul: {
- if (!I.isEquality())
- break;
-
- const APInt *C;
- if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
- !C->isOneValue()) {
- // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
- // Mask = -1 >> count-trailing-zeros(C).
- if (unsigned TZs = C->countTrailingZeros()) {
- Constant *Mask = ConstantInt::get(
- BO0->getType(),
- APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
- Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
- Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
- return new ICmpInst(Pred, And1, And2);
- }
- }
- break;
- }
- case Instruction::UDiv:
- case Instruction::LShr:
- if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::SDiv:
- if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::AShr:
- if (!BO0->isExact() || !BO1->isExact())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
-
- case Instruction::Shl: {
- bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
- bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
- if (!NUW && !NSW)
- break;
- if (!NSW && I.isSigned())
- break;
- return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
- }
- }
- }
-
- if (BO0) {
- // Transform A & (L - 1) `ult` L --> L != 0
- auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
- auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
-
- if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
- auto *Zero = Constant::getNullValue(BO0->getType());
- return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
- }
- }
-
- if (Value *V = foldUnsignedMultiplicationOverflowCheck(I))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
- return replaceInstUsesWith(I, V);
-
- if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
- return replaceInstUsesWith(I, V);
-
- return nullptr;
-}
-
-/// Fold icmp Pred min|max(X, Y), X.
-static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *Op0 = Cmp.getOperand(0);
- Value *X = Cmp.getOperand(1);
-
- // Canonicalize minimum or maximum operand to LHS of the icmp.
- if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
- match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
- match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
- match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
- std::swap(Op0, X);
- Pred = Cmp.getSwappedPredicate();
- }
-
- Value *Y;
- if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
- // smin(X, Y) == X --> X s<= Y
- // smin(X, Y) s>= X --> X s<= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
- return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
-
- // smin(X, Y) != X --> X s> Y
- // smin(X, Y) s< X --> X s> Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
- return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // smin(X, Y) s<= X --> true
- // smin(X, Y) s> X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
- // smax(X, Y) == X --> X s>= Y
- // smax(X, Y) s<= X --> X s>= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
- return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
-
- // smax(X, Y) != X --> X s< Y
- // smax(X, Y) s> X --> X s< Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
- return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // smax(X, Y) s>= X --> true
- // smax(X, Y) s< X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
- // umin(X, Y) == X --> X u<= Y
- // umin(X, Y) u>= X --> X u<= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
- return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
-
- // umin(X, Y) != X --> X u> Y
- // umin(X, Y) u< X --> X u> Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
- return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // umin(X, Y) u<= X --> true
- // umin(X, Y) u> X --> false
- return nullptr;
- }
-
- if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
- // umax(X, Y) == X --> X u>= Y
- // umax(X, Y) u<= X --> X u>= Y
- if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
- return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
-
- // umax(X, Y) != X --> X u< Y
- // umax(X, Y) u> X --> X u< Y
- if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
- return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
-
- // These cases should be handled in InstSimplify:
- // umax(X, Y) u>= X --> true
- // umax(X, Y) u< X --> false
- return nullptr;
- }
-
- return nullptr;
-}
-
+ NewPred = I.getSwappedPredicate(NewPred);
+ return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ break;
+ }
+ case Instruction::Mul: {
+ if (!I.isEquality())
+ break;
+
+ const APInt *C;
+ if (match(BO0->getOperand(1), m_APInt(C)) && !C->isNullValue() &&
+ !C->isOneValue()) {
+ // icmp eq/ne (X * C), (Y * C) --> icmp (X & Mask), (Y & Mask)
+ // Mask = -1 >> count-trailing-zeros(C).
+ if (unsigned TZs = C->countTrailingZeros()) {
+ Constant *Mask = ConstantInt::get(
+ BO0->getType(),
+ APInt::getLowBitsSet(C->getBitWidth(), C->getBitWidth() - TZs));
+ Value *And1 = Builder.CreateAnd(BO0->getOperand(0), Mask);
+ Value *And2 = Builder.CreateAnd(BO1->getOperand(0), Mask);
+ return new ICmpInst(Pred, And1, And2);
+ }
+ }
+ break;
+ }
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ if (I.isSigned() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::SDiv:
+ if (!I.isEquality() || !BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::AShr:
+ if (!BO0->isExact() || !BO1->isExact())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+
+ case Instruction::Shl: {
+ bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap();
+ bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap();
+ if (!NUW && !NSW)
+ break;
+ if (!NSW && I.isSigned())
+ break;
+ return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0));
+ }
+ }
+ }
+
+ if (BO0) {
+ // Transform A & (L - 1) `ult` L --> L != 0
+ auto LSubOne = m_Add(m_Specific(Op1), m_AllOnes());
+ auto BitwiseAnd = m_c_And(m_Value(), LSubOne);
+
+ if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) {
+ auto *Zero = Constant::getNullValue(BO0->getType());
+ return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero);
+ }
+ }
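+
+ // Illustrative sketch (hypothetical IR, not taken from this patch):
+ //   %m = add i32 %l, -1
+ //   %a = and i32 %x, %m
+ //   %cmp = icmp ult i32 %a, %l
+ // may become
+ //   %cmp = icmp ne i32 %l, 0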
+
+ if (Value *V = foldUnsignedMultiplicationOverflowCheck(I))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithLowBitMaskedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldICmpWithTruncSignExtendedVal(I, Builder))
+ return replaceInstUsesWith(I, V);
+
+ if (Value *V = foldShiftIntoShiftInAnotherHandOfAndInICmp(I, SQ, Builder))
+ return replaceInstUsesWith(I, V);
+
+ return nullptr;
+}
+
+/// Fold icmp Pred min|max(X, Y), X.
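+///
+/// Illustrative sketch (hypothetical IR, not taken from this patch): with the
+/// select form of smin,
+///   %c = icmp slt i32 %x, %y
+///   %m = select i1 %c, i32 %x, i32 %y
+///   %cmp = icmp eq i32 %m, %x
+/// may fold to
+///   %cmp = icmp sle i32 %x, %y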
+static Instruction *foldICmpWithMinMax(ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *Op0 = Cmp.getOperand(0);
+ Value *X = Cmp.getOperand(1);
+
+ // Canonicalize minimum or maximum operand to LHS of the icmp.
+ if (match(X, m_c_SMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_SMax(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMin(m_Specific(Op0), m_Value())) ||
+ match(X, m_c_UMax(m_Specific(Op0), m_Value()))) {
+ std::swap(Op0, X);
+ Pred = Cmp.getSwappedPredicate();
+ }
+
+ Value *Y;
+ if (match(Op0, m_c_SMin(m_Specific(X), m_Value(Y)))) {
+ // smin(X, Y) == X --> X s<= Y
+ // smin(X, Y) s>= X --> X s<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SGE)
+ return new ICmpInst(ICmpInst::ICMP_SLE, X, Y);
+
+ // smin(X, Y) != X --> X s> Y
+ // smin(X, Y) s< X --> X s> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SLT)
+ return new ICmpInst(ICmpInst::ICMP_SGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smin(X, Y) s<= X --> true
+ // smin(X, Y) s> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_SMax(m_Specific(X), m_Value(Y)))) {
+ // smax(X, Y) == X --> X s>= Y
+ // smax(X, Y) s<= X --> X s>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_SLE)
+ return new ICmpInst(ICmpInst::ICMP_SGE, X, Y);
+
+ // smax(X, Y) != X --> X s< Y
+ // smax(X, Y) s> X --> X s< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_SGT)
+ return new ICmpInst(ICmpInst::ICMP_SLT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // smax(X, Y) s>= X --> true
+ // smax(X, Y) s< X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMin(m_Specific(X), m_Value(Y)))) {
+ // umin(X, Y) == X --> X u<= Y
+ // umin(X, Y) u>= X --> X u<= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_UGE)
+ return new ICmpInst(ICmpInst::ICMP_ULE, X, Y);
+
+ // umin(X, Y) != X --> X u> Y
+ // umin(X, Y) u< X --> X u> Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_ULT)
+ return new ICmpInst(ICmpInst::ICMP_UGT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umin(X, Y) u<= X --> true
+ // umin(X, Y) u> X --> false
+ return nullptr;
+ }
+
+ if (match(Op0, m_c_UMax(m_Specific(X), m_Value(Y)))) {
+ // umax(X, Y) == X --> X u>= Y
+ // umax(X, Y) u<= X --> X u>= Y
+ if (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_ULE)
+ return new ICmpInst(ICmpInst::ICMP_UGE, X, Y);
+
+ // umax(X, Y) != X --> X u< Y
+ // umax(X, Y) u> X --> X u< Y
+ if (Pred == CmpInst::ICMP_NE || Pred == CmpInst::ICMP_UGT)
+ return new ICmpInst(ICmpInst::ICMP_ULT, X, Y);
+
+ // These cases should be handled in InstSimplify:
+ // umax(X, Y) u>= X --> true
+ // umax(X, Y) u< X --> false
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::foldICmpEquality(ICmpInst &I) {
- if (!I.isEquality())
- return nullptr;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *A, *B, *C, *D;
- if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
- if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
- Value *OtherVal = A == Op1 ? B : A;
- return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
- }
-
- if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
- // A^c1 == C^c2 --> A == C^(c1^c2)
- ConstantInt *C1, *C2;
- if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
- Op1->hasOneUse()) {
- Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
- Value *Xor = Builder.CreateXor(C, NC);
- return new ICmpInst(Pred, A, Xor);
- }
-
- // A^B == A^D -> B == D
- if (A == C)
- return new ICmpInst(Pred, B, D);
- if (A == D)
- return new ICmpInst(Pred, B, C);
- if (B == C)
- return new ICmpInst(Pred, A, D);
- if (B == D)
- return new ICmpInst(Pred, A, C);
- }
- }
-
- if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
- // A == (A^B) -> B == 0
- Value *OtherVal = A == Op0 ? B : A;
- return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
- }
-
- // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
- if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
- match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
-
- if (A == C) {
- X = B;
- Y = D;
- Z = A;
- } else if (A == D) {
- X = B;
- Y = C;
- Z = A;
- } else if (B == C) {
- X = A;
- Y = D;
- Z = B;
- } else if (B == D) {
- X = A;
- Y = C;
- Z = B;
- }
-
- if (X) { // Build (X^Y) & Z
- Op1 = Builder.CreateXor(X, Y);
- Op1 = Builder.CreateAnd(Op1, Z);
- return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
- }
- }
-
- // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
- // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
- ConstantInt *Cst1;
- if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
- match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
- (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
- match(Op1, m_ZExt(m_Value(A))))) {
- APInt Pow2 = Cst1->getValue() + 1;
- if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
- Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
- return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
- }
-
- // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
- // For lshr and ashr pairs.
- if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
- (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- ICmpInst::Predicate NewPred =
- Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
- Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
- APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
- return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
- }
- }
-
- // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
- if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
- match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
- unsigned TypeBits = Cst1->getBitWidth();
- unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
- if (ShAmt < TypeBits && ShAmt != 0) {
- Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
- APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
- Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
- I.getName() + ".mask");
- return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
- }
- }
-
- // Transform "icmp eq (trunc (lshr(X, cst1))), cst" to
- // "icmp (and X, mask), cst"
- uint64_t ShAmt = 0;
- if (Op0->hasOneUse() &&
- match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
- match(Op1, m_ConstantInt(Cst1)) &&
- // Only do this when A has multiple uses. This is most important to do
- // when it exposes other optimizations.
- !A->hasOneUse()) {
- unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
-
- if (ShAmt < ASize) {
- APInt MaskV =
- APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
- MaskV <<= ShAmt;
-
- APInt CmpV = Cst1->getValue().zext(ASize);
- CmpV <<= ShAmt;
-
- Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
- return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
- }
- }
-
- // If both operands are byte-swapped or bit-reversed, just compare the
- // original values.
- // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
- // and handle more intrinsics.
- if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
- (match(Op0, m_BitReverse(m_Value(A))) &&
- match(Op1, m_BitReverse(m_Value(B)))))
- return new ICmpInst(Pred, A, B);
-
- // Canonicalize checking for a power-of-2-or-zero value:
- // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
- // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
- if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
- m_Deferred(A)))) ||
- !match(Op1, m_ZeroInt()))
- A = nullptr;
-
- // (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
- // (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
- if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
- A = Op1;
- else if (match(Op1,
- m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
- A = Op0;
-
- if (A) {
- Type *Ty = A->getType();
- CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
- return Pred == ICmpInst::ICMP_EQ
- ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
- : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
- }
-
- return nullptr;
-}
-
-static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
- InstCombiner::BuilderTy &Builder) {
- assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0");
- auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0));
- Value *X;
- if (!match(CastOp0, m_ZExtOrSExt(m_Value(X))))
- return nullptr;
-
- bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt;
- bool IsSignedCmp = ICmp.isSigned();
- if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) {
- // If the signedness of the two casts doesn't agree (i.e. one is a sext
- // and the other is a zext), then we can't handle this.
- // TODO: This is too strict. We can handle some predicates (equality?).
- if (CastOp0->getOpcode() != CastOp1->getOpcode())
- return nullptr;
-
- // Not an extension from the same type?
- Value *Y = CastOp1->getOperand(0);
- Type *XTy = X->getType(), *YTy = Y->getType();
- if (XTy != YTy) {
- // One of the casts must have one use because we are creating a new cast.
- if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse())
- return nullptr;
- // Extend the narrower operand to the type of the wider operand.
- if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits())
- X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy);
- else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits())
- Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy);
- else
- return nullptr;
- }
-
- // (zext X) == (zext Y) --> X == Y
- // (sext X) == (sext Y) --> X == Y
- if (ICmp.isEquality())
- return new ICmpInst(ICmp.getPredicate(), X, Y);
-
- // A signed comparison of sign extended values simplifies into a
- // signed comparison.
- if (IsSignedCmp && IsSignedExt)
- return new ICmpInst(ICmp.getPredicate(), X, Y);
-
- // The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y);
- }
-
- // Below here, we are only folding a compare with constant.
- auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
- if (!C)
- return nullptr;
-
- // Compute the constant that would happen if we truncated to SrcTy then
- // re-extended to DestTy.
- Type *SrcTy = CastOp0->getSrcTy();
- Type *DestTy = CastOp0->getDestTy();
- Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
- Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy);
-
- // If the re-extended constant didn't change...
- if (Res2 == C) {
- if (ICmp.isEquality())
- return new ICmpInst(ICmp.getPredicate(), X, Res1);
-
- // A signed comparison of sign extended values simplifies into a
- // signed comparison.
- if (IsSignedExt && IsSignedCmp)
- return new ICmpInst(ICmp.getPredicate(), X, Res1);
-
- // The other three cases all fold into an unsigned comparison.
- return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1);
- }
-
- // The re-extended constant changed, partly changed (in the case of a vector),
- // or could not be determined to be equal (in the case of a constant
- // expression), so the constant cannot be represented in the shorter type.
- // All the cases that fold to true or false will have already been handled
- // by SimplifyICmpInst, so only deal with the tricky case.
- if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C))
- return nullptr;
-
- // Is source op positive?
- // icmp ult (sext X), C --> icmp sgt X, -1
- if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
- return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy));
-
- // Is source op negative?
- // icmp ugt (sext X), C --> icmp slt X, 0
- assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
- return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy));
-}
-
-/// Handle icmp (cast x), (cast or constant).
+ if (!I.isEquality())
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *A, *B, *C, *D;
+ if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+ if (A == Op1 || B == Op1) { // (A^B) == A -> B == 0
+ Value *OtherVal = A == Op1 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+ // A^c1 == C^c2 --> A == C^(c1^c2)
+ ConstantInt *C1, *C2;
+ if (match(B, m_ConstantInt(C1)) && match(D, m_ConstantInt(C2)) &&
+ Op1->hasOneUse()) {
+ Constant *NC = Builder.getInt(C1->getValue() ^ C2->getValue());
+ Value *Xor = Builder.CreateXor(C, NC);
+ return new ICmpInst(Pred, A, Xor);
+ }
+
+ // A^B == A^D -> B == D
+ if (A == C)
+ return new ICmpInst(Pred, B, D);
+ if (A == D)
+ return new ICmpInst(Pred, B, C);
+ if (B == C)
+ return new ICmpInst(Pred, A, D);
+ if (B == D)
+ return new ICmpInst(Pred, A, C);
+ }
+ }
+
+ if (match(Op1, m_Xor(m_Value(A), m_Value(B))) && (A == Op0 || B == Op0)) {
+ // A == (A^B) -> B == 0
+ Value *OtherVal = A == Op0 ? B : A;
+ return new ICmpInst(Pred, OtherVal, Constant::getNullValue(A->getType()));
+ }
+
+ // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+ if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B)))) &&
+ match(Op1, m_OneUse(m_And(m_Value(C), m_Value(D))))) {
+ Value *X = nullptr, *Y = nullptr, *Z = nullptr;
+
+ if (A == C) {
+ X = B;
+ Y = D;
+ Z = A;
+ } else if (A == D) {
+ X = B;
+ Y = C;
+ Z = A;
+ } else if (B == C) {
+ X = A;
+ Y = D;
+ Z = B;
+ } else if (B == D) {
+ X = A;
+ Y = C;
+ Z = B;
+ }
+
+ if (X) { // Build (X^Y) & Z
+ Op1 = Builder.CreateXor(X, Y);
+ Op1 = Builder.CreateAnd(Op1, Z);
+ return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
+ }
+ }
+
+ // Transform (zext A) == (B & (1<<X)-1) --> A == (trunc B)
+ // and (B & (1<<X)-1) == (zext A) --> A == (trunc B)
+ ConstantInt *Cst1;
+ if ((Op0->hasOneUse() && match(Op0, m_ZExt(m_Value(A))) &&
+ match(Op1, m_And(m_Value(B), m_ConstantInt(Cst1)))) ||
+ (Op1->hasOneUse() && match(Op0, m_And(m_Value(B), m_ConstantInt(Cst1))) &&
+ match(Op1, m_ZExt(m_Value(A))))) {
+ APInt Pow2 = Cst1->getValue() + 1;
+ if (Pow2.isPowerOf2() && isa<IntegerType>(A->getType()) &&
+ Pow2.logBase2() == cast<IntegerType>(A->getType())->getBitWidth())
+ return new ICmpInst(Pred, A, Builder.CreateTrunc(B, A->getType()));
+ }
+
+ // (A >> C) == (B >> C) --> (A^B) u< (1 << C)
+ // For lshr and ashr pairs.
+ if ((match(Op0, m_OneUse(m_LShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_LShr(m_Value(B), m_Specific(Cst1))))) ||
+ (match(Op0, m_OneUse(m_AShr(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_AShr(m_Value(B), m_Specific(Cst1)))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ ICmpInst::Predicate NewPred =
+ Pred == ICmpInst::ICMP_NE ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt CmpVal = APInt::getOneBitSet(TypeBits, ShAmt);
+ return new ICmpInst(NewPred, Xor, Builder.getInt(CmpVal));
+ }
+ }
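+
+ // Illustrative sketch (hypothetical IR, not taken from this patch; assumes
+ // both shifts have a single use):
+ //   %sa = lshr i32 %a, 4
+ //   %sb = lshr i32 %b, 4
+ //   %cmp = icmp eq i32 %sa, %sb
+ // may become
+ //   %x = xor i32 %a, %b
+ //   %cmp = icmp ult i32 %x, 16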
+
+ // (A << C) == (B << C) --> ((A^B) & (~0U >> C)) == 0
+ if (match(Op0, m_OneUse(m_Shl(m_Value(A), m_ConstantInt(Cst1)))) &&
+ match(Op1, m_OneUse(m_Shl(m_Value(B), m_Specific(Cst1))))) {
+ unsigned TypeBits = Cst1->getBitWidth();
+ unsigned ShAmt = (unsigned)Cst1->getLimitedValue(TypeBits);
+ if (ShAmt < TypeBits && ShAmt != 0) {
+ Value *Xor = Builder.CreateXor(A, B, I.getName() + ".unshifted");
+ APInt AndVal = APInt::getLowBitsSet(TypeBits, TypeBits - ShAmt);
+ Value *And = Builder.CreateAnd(Xor, Builder.getInt(AndVal),
+ I.getName() + ".mask");
+ return new ICmpInst(Pred, And, Constant::getNullValue(Cst1->getType()));
+ }
+ }
+
+ // Transform "icmp eq (trunc (lshr(X, cst1))), cst" to
+ // "icmp (and X, mask), cst"
+ uint64_t ShAmt = 0;
+ if (Op0->hasOneUse() &&
+ match(Op0, m_Trunc(m_OneUse(m_LShr(m_Value(A), m_ConstantInt(ShAmt))))) &&
+ match(Op1, m_ConstantInt(Cst1)) &&
+ // Only do this when A has multiple uses. This is most important to do
+ // when it exposes other optimizations.
+ !A->hasOneUse()) {
+ unsigned ASize = cast<IntegerType>(A->getType())->getPrimitiveSizeInBits();
+
+ if (ShAmt < ASize) {
+ APInt MaskV =
+ APInt::getLowBitsSet(ASize, Op0->getType()->getPrimitiveSizeInBits());
+ MaskV <<= ShAmt;
+
+ APInt CmpV = Cst1->getValue().zext(ASize);
+ CmpV <<= ShAmt;
+
+ Value *Mask = Builder.CreateAnd(A, Builder.getInt(MaskV));
+ return new ICmpInst(Pred, Mask, Builder.getInt(CmpV));
+ }
+ }
+
+ // If both operands are byte-swapped or bit-reversed, just compare the
+ // original values.
+ // TODO: Move this to a function similar to foldICmpIntrinsicWithConstant()
+ // and handle more intrinsics.
+ if ((match(Op0, m_BSwap(m_Value(A))) && match(Op1, m_BSwap(m_Value(B)))) ||
+ (match(Op0, m_BitReverse(m_Value(A))) &&
+ match(Op1, m_BitReverse(m_Value(B)))))
+ return new ICmpInst(Pred, A, B);
+
+ // Canonicalize checking for a power-of-2-or-zero value:
+ // (A & (A-1)) == 0 --> ctpop(A) < 2 (two commuted variants)
+ // ((A-1) & A) != 0 --> ctpop(A) > 1 (two commuted variants)
+ if (!match(Op0, m_OneUse(m_c_And(m_Add(m_Value(A), m_AllOnes()),
+ m_Deferred(A)))) ||
+ !match(Op1, m_ZeroInt()))
+ A = nullptr;
+
+ // (A & -A) == A --> ctpop(A) < 2 (four commuted variants)
+ // (-A & A) != A --> ctpop(A) > 1 (four commuted variants)
+ if (match(Op0, m_OneUse(m_c_And(m_Neg(m_Specific(Op1)), m_Specific(Op1)))))
+ A = Op1;
+ else if (match(Op1,
+ m_OneUse(m_c_And(m_Neg(m_Specific(Op0)), m_Specific(Op0)))))
+ A = Op0;
+
+ if (A) {
+ Type *Ty = A->getType();
+ CallInst *CtPop = Builder.CreateUnaryIntrinsic(Intrinsic::ctpop, A);
+ return Pred == ICmpInst::ICMP_EQ
+ ? new ICmpInst(ICmpInst::ICMP_ULT, CtPop, ConstantInt::get(Ty, 2))
+ : new ICmpInst(ICmpInst::ICMP_UGT, CtPop, ConstantInt::get(Ty, 1));
+ }
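+
+ // Illustrative sketch (hypothetical IR, not taken from this patch):
+ //   %m = add i32 %a, -1
+ //   %t = and i32 %a, %m
+ //   %cmp = icmp eq i32 %t, 0
+ // may become
+ //   %p = call i32 @llvm.ctpop.i32(i32 %a)
+ //   %cmp = icmp ult i32 %p, 2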
+
+ return nullptr;
+}
+
+static Instruction *foldICmpWithZextOrSext(ICmpInst &ICmp,
+ InstCombiner::BuilderTy &Builder) {
+ assert(isa<CastInst>(ICmp.getOperand(0)) && "Expected cast for operand 0");
+ auto *CastOp0 = cast<CastInst>(ICmp.getOperand(0));
+ Value *X;
+ if (!match(CastOp0, m_ZExtOrSExt(m_Value(X))))
+ return nullptr;
+
+ bool IsSignedExt = CastOp0->getOpcode() == Instruction::SExt;
+ bool IsSignedCmp = ICmp.isSigned();
+ if (auto *CastOp1 = dyn_cast<CastInst>(ICmp.getOperand(1))) {
+ // If the signedness of the two casts doesn't agree (i.e. one is a sext
+ // and the other is a zext), then we can't handle this.
+ // TODO: This is too strict. We can handle some predicates (equality?).
+ if (CastOp0->getOpcode() != CastOp1->getOpcode())
+ return nullptr;
+
+ // Not an extension from the same type?
+ Value *Y = CastOp1->getOperand(0);
+ Type *XTy = X->getType(), *YTy = Y->getType();
+ if (XTy != YTy) {
+ // One of the casts must have one use because we are creating a new cast.
+ if (!CastOp0->hasOneUse() && !CastOp1->hasOneUse())
+ return nullptr;
+ // Extend the narrower operand to the type of the wider operand.
+ if (XTy->getScalarSizeInBits() < YTy->getScalarSizeInBits())
+ X = Builder.CreateCast(CastOp0->getOpcode(), X, YTy);
+ else if (YTy->getScalarSizeInBits() < XTy->getScalarSizeInBits())
+ Y = Builder.CreateCast(CastOp0->getOpcode(), Y, XTy);
+ else
+ return nullptr;
+ }
+
+ // (zext X) == (zext Y) --> X == Y
+ // (sext X) == (sext Y) --> X == Y
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), X, Y);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (IsSignedCmp && IsSignedExt)
+ return new ICmpInst(ICmp.getPredicate(), X, Y);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), X, Y);
+ }
+
+ // Below here, we are only folding a compare with constant.
+ auto *C = dyn_cast<Constant>(ICmp.getOperand(1));
+ if (!C)
+ return nullptr;
+
+ // Compute the constant that would happen if we truncated to SrcTy then
+ // re-extended to DestTy.
+ Type *SrcTy = CastOp0->getSrcTy();
+ Type *DestTy = CastOp0->getDestTy();
+ Constant *Res1 = ConstantExpr::getTrunc(C, SrcTy);
+ Constant *Res2 = ConstantExpr::getCast(CastOp0->getOpcode(), Res1, DestTy);
+
+ // If the re-extended constant didn't change...
+ if (Res2 == C) {
+ if (ICmp.isEquality())
+ return new ICmpInst(ICmp.getPredicate(), X, Res1);
+
+ // A signed comparison of sign extended values simplifies into a
+ // signed comparison.
+ if (IsSignedExt && IsSignedCmp)
+ return new ICmpInst(ICmp.getPredicate(), X, Res1);
+
+ // The other three cases all fold into an unsigned comparison.
+ return new ICmpInst(ICmp.getUnsignedPredicate(), X, Res1);
+ }
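+
+ // Illustrative sketch (hypothetical IR, not taken from this patch): 100
+ // survives the trunc-to-i8 / sext-back round trip, so
+ //   %e = sext i8 %x to i32
+ //   %cmp = icmp slt i32 %e, 100
+ // may become
+ //   %cmp = icmp slt i8 %x, 100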
+
+ // The re-extended constant changed, partly changed (in the case of a vector),
+ // or could not be determined to be equal (in the case of a constant
+ // expression), so the constant cannot be represented in the shorter type.
+ // All the cases that fold to true or false will have already been handled
+ // by SimplifyICmpInst, so only deal with the tricky case.
+ if (IsSignedCmp || !IsSignedExt || !isa<ConstantInt>(C))
+ return nullptr;
+
+ // Is source op positive?
+ // icmp ult (sext X), C --> icmp sgt X, -1
+ if (ICmp.getPredicate() == ICmpInst::ICMP_ULT)
+ return new ICmpInst(CmpInst::ICMP_SGT, X, Constant::getAllOnesValue(SrcTy));
+
+ // Is source op negative?
+ // icmp ugt (sext X), C --> icmp slt X, 0
+ assert(ICmp.getPredicate() == ICmpInst::ICMP_UGT && "ICmp should be folded!");
+ return new ICmpInst(CmpInst::ICMP_SLT, X, Constant::getNullValue(SrcTy));
+}
+
+/// Handle icmp (cast x), (cast or constant).
Instruction *InstCombinerImpl::foldICmpWithCastOp(ICmpInst &ICmp) {
- auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0));
- if (!CastOp0)
- return nullptr;
- if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1)))
- return nullptr;
-
- Value *Op0Src = CastOp0->getOperand(0);
- Type *SrcTy = CastOp0->getSrcTy();
- Type *DestTy = CastOp0->getDestTy();
-
- // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
- // integer type is the same size as the pointer type.
- auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) {
- if (isa<VectorType>(SrcTy)) {
- SrcTy = cast<VectorType>(SrcTy)->getElementType();
- DestTy = cast<VectorType>(DestTy)->getElementType();
- }
- return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
- };
- if (CastOp0->getOpcode() == Instruction::PtrToInt &&
- CompatibleSizes(SrcTy, DestTy)) {
- Value *NewOp1 = nullptr;
- if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
- Value *PtrSrc = PtrToIntOp1->getOperand(0);
- if (PtrSrc->getType()->getPointerAddressSpace() ==
- Op0Src->getType()->getPointerAddressSpace()) {
- NewOp1 = PtrToIntOp1->getOperand(0);
- // If the pointer types don't match, insert a bitcast.
- if (Op0Src->getType() != NewOp1->getType())
- NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType());
- }
- } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
- NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy);
- }
-
- if (NewOp1)
- return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
- }
-
- return foldICmpWithZextOrSext(ICmp, Builder);
-}
-
-static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
- switch (BinaryOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- case Instruction::Sub:
- return match(RHS, m_Zero());
- case Instruction::Mul:
- return match(RHS, m_One());
- }
-}
-
+ auto *CastOp0 = dyn_cast<CastInst>(ICmp.getOperand(0));
+ if (!CastOp0)
+ return nullptr;
+ if (!isa<Constant>(ICmp.getOperand(1)) && !isa<CastInst>(ICmp.getOperand(1)))
+ return nullptr;
+
+ Value *Op0Src = CastOp0->getOperand(0);
+ Type *SrcTy = CastOp0->getSrcTy();
+ Type *DestTy = CastOp0->getDestTy();
+
+ // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the
+ // integer type is the same size as the pointer type.
+ auto CompatibleSizes = [&](Type *SrcTy, Type *DestTy) {
+ if (isa<VectorType>(SrcTy)) {
+ SrcTy = cast<VectorType>(SrcTy)->getElementType();
+ DestTy = cast<VectorType>(DestTy)->getElementType();
+ }
+ return DL.getPointerTypeSizeInBits(SrcTy) == DestTy->getIntegerBitWidth();
+ };
+ if (CastOp0->getOpcode() == Instruction::PtrToInt &&
+ CompatibleSizes(SrcTy, DestTy)) {
+ Value *NewOp1 = nullptr;
+ if (auto *PtrToIntOp1 = dyn_cast<PtrToIntOperator>(ICmp.getOperand(1))) {
+ Value *PtrSrc = PtrToIntOp1->getOperand(0);
+ if (PtrSrc->getType()->getPointerAddressSpace() ==
+ Op0Src->getType()->getPointerAddressSpace()) {
+ NewOp1 = PtrToIntOp1->getOperand(0);
+ // If the pointer types don't match, insert a bitcast.
+ if (Op0Src->getType() != NewOp1->getType())
+ NewOp1 = Builder.CreateBitCast(NewOp1, Op0Src->getType());
+ }
+ } else if (auto *RHSC = dyn_cast<Constant>(ICmp.getOperand(1))) {
+ NewOp1 = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+ }
+
+ if (NewOp1)
+ return new ICmpInst(ICmp.getPredicate(), Op0Src, NewOp1);
+ }
+
+ return foldICmpWithZextOrSext(ICmp, Builder);
+}
+
+static bool isNeutralValue(Instruction::BinaryOps BinaryOp, Value *RHS) {
+ switch (BinaryOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ case Instruction::Sub:
+ return match(RHS, m_Zero());
+ case Instruction::Mul:
+ return match(RHS, m_One());
+ }
+}
+
OverflowResult
InstCombinerImpl::computeOverflow(Instruction::BinaryOps BinaryOp,
bool IsSigned, Value *LHS, Value *RHS,
Instruction *CxtI) const {
- switch (BinaryOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- if (IsSigned)
- return computeOverflowForSignedAdd(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
- case Instruction::Sub:
- if (IsSigned)
- return computeOverflowForSignedSub(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedSub(LHS, RHS, CxtI);
- case Instruction::Mul:
- if (IsSigned)
- return computeOverflowForSignedMul(LHS, RHS, CxtI);
- else
- return computeOverflowForUnsignedMul(LHS, RHS, CxtI);
- }
-}
-
+ switch (BinaryOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ if (IsSigned)
+ return computeOverflowForSignedAdd(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedAdd(LHS, RHS, CxtI);
+ case Instruction::Sub:
+ if (IsSigned)
+ return computeOverflowForSignedSub(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedSub(LHS, RHS, CxtI);
+ case Instruction::Mul:
+ if (IsSigned)
+ return computeOverflowForSignedMul(LHS, RHS, CxtI);
+ else
+ return computeOverflowForUnsignedMul(LHS, RHS, CxtI);
+ }
+}
+
bool InstCombinerImpl::OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp,
bool IsSigned, Value *LHS,
Value *RHS, Instruction &OrigI,
Value *&Result,
Constant *&Overflow) {
- if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
- std::swap(LHS, RHS);
-
- // If the overflow check was an add followed by a compare, the insertion point
- // may be pointing to the compare. We want to insert the new instructions
- // before the add in case there are uses of the add between the add and the
- // compare.
- Builder.SetInsertPoint(&OrigI);
-
+ if (OrigI.isCommutative() && isa<Constant>(LHS) && !isa<Constant>(RHS))
+ std::swap(LHS, RHS);
+
+ // If the overflow check was an add followed by a compare, the insertion point
+ // may be pointing to the compare. We want to insert the new instructions
+ // before the add in case there are uses of the add between the add and the
+ // compare.
+ Builder.SetInsertPoint(&OrigI);
+
Type *OverflowTy = Type::getInt1Ty(LHS->getContext());
if (auto *LHSTy = dyn_cast<VectorType>(LHS->getType()))
OverflowTy = VectorType::get(OverflowTy, LHSTy->getElementCount());
- if (isNeutralValue(BinaryOp, RHS)) {
- Result = LHS;
+ if (isNeutralValue(BinaryOp, RHS)) {
+ Result = LHS;
Overflow = ConstantInt::getFalse(OverflowTy);
- return true;
- }
-
- switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
- case OverflowResult::MayOverflow:
- return false;
- case OverflowResult::AlwaysOverflowsLow:
- case OverflowResult::AlwaysOverflowsHigh:
- Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
- Result->takeName(&OrigI);
+ return true;
+ }
+
+ switch (computeOverflow(BinaryOp, IsSigned, LHS, RHS, &OrigI)) {
+ case OverflowResult::MayOverflow:
+ return false;
+ case OverflowResult::AlwaysOverflowsLow:
+ case OverflowResult::AlwaysOverflowsHigh:
+ Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
+ Result->takeName(&OrigI);
Overflow = ConstantInt::getTrue(OverflowTy);
- return true;
- case OverflowResult::NeverOverflows:
- Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
- Result->takeName(&OrigI);
+ return true;
+ case OverflowResult::NeverOverflows:
+ Result = Builder.CreateBinOp(BinaryOp, LHS, RHS);
+ Result->takeName(&OrigI);
Overflow = ConstantInt::getFalse(OverflowTy);
- if (auto *Inst = dyn_cast<Instruction>(Result)) {
- if (IsSigned)
- Inst->setHasNoSignedWrap();
- else
- Inst->setHasNoUnsignedWrap();
- }
- return true;
- }
-
- llvm_unreachable("Unexpected overflow result");
-}
-
-/// Recognize and process idiom involving test for multiplication
-/// overflow.
-///
-/// The caller has matched a pattern of the form:
-/// I = cmp u (mul(zext A, zext B)), V
-/// The function checks if this is a test for overflow and if so replaces
-/// multiplication with call to 'mul.with.overflow' intrinsic.
-///
-/// \param I Compare instruction.
-/// \param MulVal Result of 'mult' instruction. It is one of the arguments of
-/// the compare instruction. Must be of integer type.
-/// \param OtherVal The other argument of compare instruction.
-/// \returns Instruction which must replace the compare instruction, NULL if no
-/// replacement required.
-static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
+ if (auto *Inst = dyn_cast<Instruction>(Result)) {
+ if (IsSigned)
+ Inst->setHasNoSignedWrap();
+ else
+ Inst->setHasNoUnsignedWrap();
+ }
+ return true;
+ }
+
+ llvm_unreachable("Unexpected overflow result");
+}
+
+/// Recognize and process idiom involving test for multiplication
+/// overflow.
+///
+/// The caller has matched a pattern of the form:
+/// I = cmp u (mul(zext A, zext B)), V
+/// The function checks if this is a test for overflow and if so replaces
+/// multiplication with call to 'mul.with.overflow' intrinsic.
+///
+/// \param I Compare instruction.
+/// \param MulVal Result of 'mult' instruction. It is one of the arguments of
+/// the compare instruction. Must be of integer type.
+/// \param OtherVal The other argument of compare instruction.
+/// \returns Instruction which must replace the compare instruction, NULL if no
+/// replacement required.
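+///
+/// Illustrative sketch (hypothetical IR, not taken from this patch; assumes
+/// any other users of the product only read its low 32 bits):
+///   %za = zext i32 %a to i64
+///   %zb = zext i32 %b to i64
+///   %m  = mul i64 %za, %zb
+///   %ov = icmp ugt i64 %m, 4294967295
+/// may be rewritten around a call to llvm.umul.with.overflow.i32(%a, %b).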
+static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Value *OtherVal,
InstCombinerImpl &IC) {
- // Don't bother doing this transformation for pointers, don't do it for
- // vectors.
- if (!isa<IntegerType>(MulVal->getType()))
- return nullptr;
-
- assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
- assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
- auto *MulInstr = dyn_cast<Instruction>(MulVal);
- if (!MulInstr)
- return nullptr;
- assert(MulInstr->getOpcode() == Instruction::Mul);
-
- auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
- *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
- assert(LHS->getOpcode() == Instruction::ZExt);
- assert(RHS->getOpcode() == Instruction::ZExt);
- Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
-
- // Calculate type and width of the result produced by mul.with.overflow.
- Type *TyA = A->getType(), *TyB = B->getType();
- unsigned WidthA = TyA->getPrimitiveSizeInBits(),
- WidthB = TyB->getPrimitiveSizeInBits();
- unsigned MulWidth;
- Type *MulType;
- if (WidthB > WidthA) {
- MulWidth = WidthB;
- MulType = TyB;
- } else {
- MulWidth = WidthA;
- MulType = TyA;
- }
-
- // In order to replace the original mul with a narrower mul.with.overflow,
- // all uses must ignore upper bits of the product. The number of used low
- // bits must not be greater than the width of mul.with.overflow.
- if (MulVal->hasNUsesOrMore(2))
- for (User *U : MulVal->users()) {
- if (U == &I)
- continue;
- if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
- // Check if truncation ignores bits above MulWidth.
- unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
- if (TruncWidth > MulWidth)
- return nullptr;
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
- // Check if AND ignores bits above MulWidth.
- if (BO->getOpcode() != Instruction::And)
- return nullptr;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- const APInt &CVal = CI->getValue();
- if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
- return nullptr;
- } else {
- // In this case we could have the operand of the binary operation
- // being defined in another block, and performing the replacement
- // could break the dominance relation.
- return nullptr;
- }
- } else {
- // Other uses prohibit this transformation.
- return nullptr;
- }
- }
-
- // Recognize patterns
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_EQ:
- case ICmpInst::ICMP_NE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
- ConstantInt *CI;
- Value *ValToMask;
- if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
- if (ValToMask != MulVal)
- return nullptr;
- const APInt &CVal = CI->getValue() + 1;
- if (CVal.isPowerOf2()) {
- unsigned MaskWidth = CVal.logBase2();
- if (MaskWidth == MulWidth)
- break; // Recognized
- }
- }
- return nullptr;
-
- case ICmpInst::ICMP_UGT:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ugt mulval, max
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getMaxValue(MulWidth);
- MaxVal = MaxVal.zext(CI->getBitWidth());
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_UGE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp uge mulval, max+1
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_ULE:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ule mulval, max
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getMaxValue(MulWidth);
- MaxVal = MaxVal.zext(CI->getBitWidth());
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- case ICmpInst::ICMP_ULT:
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
- // cmp ult mulval, max + 1
- if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
- APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
- if (MaxVal.eq(CI->getValue()))
- break; // Recognized
- }
- return nullptr;
-
- default:
- return nullptr;
- }
-
- InstCombiner::BuilderTy &Builder = IC.Builder;
- Builder.SetInsertPoint(MulInstr);
-
- // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
- Value *MulA = A, *MulB = B;
- if (WidthA < MulWidth)
- MulA = Builder.CreateZExt(A, MulType);
- if (WidthB < MulWidth)
- MulB = Builder.CreateZExt(B, MulType);
- Function *F = Intrinsic::getDeclaration(
- I.getModule(), Intrinsic::umul_with_overflow, MulType);
- CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
+  // Don't bother doing this transformation for pointers; don't do it for
+  // vectors either.
+ if (!isa<IntegerType>(MulVal->getType()))
+ return nullptr;
+
+ assert(I.getOperand(0) == MulVal || I.getOperand(1) == MulVal);
+ assert(I.getOperand(0) == OtherVal || I.getOperand(1) == OtherVal);
+ auto *MulInstr = dyn_cast<Instruction>(MulVal);
+ if (!MulInstr)
+ return nullptr;
+ assert(MulInstr->getOpcode() == Instruction::Mul);
+
+ auto *LHS = cast<ZExtOperator>(MulInstr->getOperand(0)),
+ *RHS = cast<ZExtOperator>(MulInstr->getOperand(1));
+ assert(LHS->getOpcode() == Instruction::ZExt);
+ assert(RHS->getOpcode() == Instruction::ZExt);
+ Value *A = LHS->getOperand(0), *B = RHS->getOperand(0);
+
+ // Calculate type and width of the result produced by mul.with.overflow.
+ Type *TyA = A->getType(), *TyB = B->getType();
+ unsigned WidthA = TyA->getPrimitiveSizeInBits(),
+ WidthB = TyB->getPrimitiveSizeInBits();
+ unsigned MulWidth;
+ Type *MulType;
+ if (WidthB > WidthA) {
+ MulWidth = WidthB;
+ MulType = TyB;
+ } else {
+ MulWidth = WidthA;
+ MulType = TyA;
+ }
+
+ // In order to replace the original mul with a narrower mul.with.overflow,
+ // all uses must ignore upper bits of the product. The number of used low
+  // bits must not be greater than the width of mul.with.overflow.
+ if (MulVal->hasNUsesOrMore(2))
+ for (User *U : MulVal->users()) {
+ if (U == &I)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ // Check if truncation ignores bits above MulWidth.
+ unsigned TruncWidth = TI->getType()->getPrimitiveSizeInBits();
+ if (TruncWidth > MulWidth)
+ return nullptr;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ // Check if AND ignores bits above MulWidth.
+ if (BO->getOpcode() != Instruction::And)
+ return nullptr;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getBitWidth() - CVal.countLeadingZeros() > MulWidth)
+ return nullptr;
+ } else {
+          // The operand of the binary operation could be defined in another
+          // block, in which case performing the replacement could break the
+          // dominance relation.
+ return nullptr;
+ }
+ } else {
+ // Other uses prohibit this transformation.
+ return nullptr;
+ }
+ }
+
+ // Recognize patterns
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+    //   cmp eq/ne mulval, and(mulval, mask),
+    //   where mask selects the low MulWidth bits.
+ ConstantInt *CI;
+ Value *ValToMask;
+ if (match(OtherVal, m_And(m_Value(ValToMask), m_ConstantInt(CI)))) {
+ if (ValToMask != MulVal)
+ return nullptr;
+ const APInt &CVal = CI->getValue() + 1;
+ if (CVal.isPowerOf2()) {
+ unsigned MaskWidth = CVal.logBase2();
+ if (MaskWidth == MulWidth)
+ break; // Recognized
+ }
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ugt mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_UGE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp uge mulval, max+1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULE:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+ // cmp ule mulval, max
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getMaxValue(MulWidth);
+ MaxVal = MaxVal.zext(CI->getBitWidth());
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ case ICmpInst::ICMP_ULT:
+ // Recognize pattern:
+ // mulval = mul(zext A, zext B)
+    //   cmp ult mulval, max + 1
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(OtherVal)) {
+ APInt MaxVal = APInt::getOneBitSet(CI->getBitWidth(), MulWidth);
+ if (MaxVal.eq(CI->getValue()))
+ break; // Recognized
+ }
+ return nullptr;
+
+ default:
+ return nullptr;
+ }
+
+ InstCombiner::BuilderTy &Builder = IC.Builder;
+ Builder.SetInsertPoint(MulInstr);
+
+ // Replace: mul(zext A, zext B) --> mul.with.overflow(A, B)
+ Value *MulA = A, *MulB = B;
+ if (WidthA < MulWidth)
+ MulA = Builder.CreateZExt(A, MulType);
+ if (WidthB < MulWidth)
+ MulB = Builder.CreateZExt(B, MulType);
+ Function *F = Intrinsic::getDeclaration(
+ I.getModule(), Intrinsic::umul_with_overflow, MulType);
+ CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
IC.addToWorklist(MulInstr);
-
- // If there are uses of mul result other than the comparison, we know that
- // they are truncation or binary AND. Change them to use result of
- // mul.with.overflow and adjust properly mask/size.
- if (MulVal->hasNUsesOrMore(2)) {
- Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
+
+  // If there are uses of the mul result other than the comparison, we know
+  // that they are truncation or binary AND. Change them to use the result of
+  // mul.with.overflow and adjust the mask/size accordingly.
+ if (MulVal->hasNUsesOrMore(2)) {
+ Value *Mul = Builder.CreateExtractValue(Call, 0, "umul.value");
for (User *U : make_early_inc_range(MulVal->users())) {
- if (U == &I || U == OtherVal)
- continue;
- if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
- if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
- IC.replaceInstUsesWith(*TI, Mul);
- else
- TI->setOperand(0, Mul);
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
- assert(BO->getOpcode() == Instruction::And);
- // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
- ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
- APInt ShortMask = CI->getValue().trunc(MulWidth);
- Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
- Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
- IC.replaceInstUsesWith(*BO, Zext);
- } else {
- llvm_unreachable("Unexpected Binary operation");
- }
+ if (U == &I || U == OtherVal)
+ continue;
+ if (TruncInst *TI = dyn_cast<TruncInst>(U)) {
+ if (TI->getType()->getPrimitiveSizeInBits() == MulWidth)
+ IC.replaceInstUsesWith(*TI, Mul);
+ else
+ TI->setOperand(0, Mul);
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(U)) {
+ assert(BO->getOpcode() == Instruction::And);
+ // Replace (mul & mask) --> zext (mul.with.overflow & short_mask)
+ ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
+ APInt ShortMask = CI->getValue().trunc(MulWidth);
+ Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
+ Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
+ IC.replaceInstUsesWith(*BO, Zext);
+ } else {
+ llvm_unreachable("Unexpected Binary operation");
+ }
IC.addToWorklist(cast<Instruction>(U));
- }
- }
- if (isa<Instruction>(OtherVal))
+ }
+ }
+ if (isa<Instruction>(OtherVal))
IC.addToWorklist(cast<Instruction>(OtherVal));
-
- // The original icmp gets replaced with the overflow value, maybe inverted
- // depending on predicate.
- bool Inverse = false;
- switch (I.getPredicate()) {
- case ICmpInst::ICMP_NE:
- break;
- case ICmpInst::ICMP_EQ:
- Inverse = true;
- break;
- case ICmpInst::ICMP_UGT:
- case ICmpInst::ICMP_UGE:
- if (I.getOperand(0) == MulVal)
- break;
- Inverse = true;
- break;
- case ICmpInst::ICMP_ULT:
- case ICmpInst::ICMP_ULE:
- if (I.getOperand(1) == MulVal)
- break;
- Inverse = true;
- break;
- default:
- llvm_unreachable("Unexpected predicate");
- }
- if (Inverse) {
- Value *Res = Builder.CreateExtractValue(Call, 1);
- return BinaryOperator::CreateNot(Res);
- }
-
- return ExtractValueInst::Create(Call, 1);
-}
-
-/// When performing a comparison against a constant, it is possible that not all
-/// the bits in the LHS are demanded. This helper method computes the mask that
-/// IS demanded.
-static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
- const APInt *RHS;
- if (!match(I.getOperand(1), m_APInt(RHS)))
- return APInt::getAllOnesValue(BitWidth);
-
- // If this is a normal comparison, it demands all bits. If it is a sign bit
- // comparison, it only demands the sign bit.
- bool UnusedBit;
+
+ // The original icmp gets replaced with the overflow value, maybe inverted
+ // depending on predicate.
+ bool Inverse = false;
+ switch (I.getPredicate()) {
+ case ICmpInst::ICMP_NE:
+ break;
+ case ICmpInst::ICMP_EQ:
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_UGT:
+ case ICmpInst::ICMP_UGE:
+ if (I.getOperand(0) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ case ICmpInst::ICMP_ULT:
+ case ICmpInst::ICMP_ULE:
+ if (I.getOperand(1) == MulVal)
+ break;
+ Inverse = true;
+ break;
+ default:
+ llvm_unreachable("Unexpected predicate");
+ }
+ if (Inverse) {
+ Value *Res = Builder.CreateExtractValue(Call, 1);
+ return BinaryOperator::CreateNot(Res);
+ }
+
+ return ExtractValueInst::Create(Call, 1);
+}
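As a rough illustration of the idiom this fold targets (a minimal sketch; the function name is invented for the example): a widening multiply whose result is compared against the narrow type's maximum lowers to mul(zext A, zext B) plus an unsigned compare, which the code above can rewrite into a single llvm.umul.with.overflow call.

    #include <cstdint>

    // The 64-bit product of two zero-extended 32-bit values, compared against
    // UINT32_MAX, matches the ICMP_UGT pattern recognized above.
    bool mul_overflows_u32(uint32_t a, uint32_t b) {
      return (uint64_t)a * (uint64_t)b > 0xFFFFFFFFull;
    }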
+
+/// When performing a comparison against a constant, it is possible that not all
+/// the bits in the LHS are demanded. This helper method computes the mask that
+/// IS demanded.
+static APInt getDemandedBitsLHSMask(ICmpInst &I, unsigned BitWidth) {
+ const APInt *RHS;
+ if (!match(I.getOperand(1), m_APInt(RHS)))
+ return APInt::getAllOnesValue(BitWidth);
+
+ // If this is a normal comparison, it demands all bits. If it is a sign bit
+ // comparison, it only demands the sign bit.
+ bool UnusedBit;
if (InstCombiner::isSignBitCheck(I.getPredicate(), *RHS, UnusedBit))
- return APInt::getSignMask(BitWidth);
-
- switch (I.getPredicate()) {
- // For a UGT comparison, we don't care about any bits that
- // correspond to the trailing ones of the comparand. The value of these
- // bits doesn't impact the outcome of the comparison, because any value
- // greater than the RHS must differ in a bit higher than these due to carry.
- case ICmpInst::ICMP_UGT:
- return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingOnes());
-
- // Similarly, for a ULT comparison, we don't care about the trailing zeros.
- // Any value less than the RHS must differ in a higher bit because of carries.
- case ICmpInst::ICMP_ULT:
- return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros());
-
- default:
- return APInt::getAllOnesValue(BitWidth);
- }
-}
-
-/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
-/// should be swapped.
-/// The decision is based on how many times these two operands are reused
-/// as subtract operands and their positions in those instructions.
-/// The rationale is that several architectures use the same instruction for
-/// both subtract and cmp. Thus, it is better if the order of those operands
-/// match.
-/// \return true if Op0 and Op1 should be swapped.
-static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
- // Filter out pointer values as those cannot appear directly in subtract.
- // FIXME: we may want to go through inttoptrs or bitcasts.
- if (Op0->getType()->isPointerTy())
- return false;
- // If a subtract already has the same operands as a compare, swapping would be
- // bad. If a subtract has the same operands as a compare but in reverse order,
- // then swapping is good.
- int GoodToSwap = 0;
- for (const User *U : Op0->users()) {
- if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
- GoodToSwap++;
- else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
- GoodToSwap--;
- }
- return GoodToSwap > 0;
-}
-
-/// Check that one use is in the same block as the definition and all
-/// other uses are in blocks dominated by a given block.
-///
-/// \param DI Definition
-/// \param UI Use
-/// \param DB Block that must dominate all uses of \p DI outside
-/// the parent block
-/// \return true when \p UI is the only use of \p DI in the parent block
-/// and all other uses of \p DI are in blocks dominated by \p DB.
-///
+ return APInt::getSignMask(BitWidth);
+
+ switch (I.getPredicate()) {
+ // For a UGT comparison, we don't care about any bits that
+ // correspond to the trailing ones of the comparand. The value of these
+ // bits doesn't impact the outcome of the comparison, because any value
+ // greater than the RHS must differ in a bit higher than these due to carry.
+ case ICmpInst::ICMP_UGT:
+ return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingOnes());
+
+ // Similarly, for a ULT comparison, we don't care about the trailing zeros.
+ // Any value less than the RHS must differ in a higher bit because of carries.
+ case ICmpInst::ICMP_ULT:
+ return APInt::getBitsSetFrom(BitWidth, RHS->countTrailingZeros());
+
+ default:
+ return APInt::getAllOnesValue(BitWidth);
+ }
+}
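A small worked example of the UGT rule above (function name invented): comparing against 7, which has three trailing ones, means only bits 3 and up of the LHS can affect the result, which is exactly the mask getDemandedBitsLHSMask returns.

    #include <cstdint>

    // x u> 7 holds exactly when some bit at position 3 or higher is set, so
    // the three low bits of x are not demanded by this compare.
    bool above_seven(uint32_t x) {
      return x > 7;   // same result as (x & ~7u) != 0
    }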
+
+/// Check if the order of \p Op0 and \p Op1 as operands in an ICmpInst
+/// should be swapped.
+/// The decision is based on how many times these two operands are reused
+/// as subtract operands and their positions in those instructions.
+/// The rationale is that several architectures use the same instruction for
+/// both subtract and cmp. Thus, it is better if the order of those operands
+/// matches.
+/// \return true if Op0 and Op1 should be swapped.
+static bool swapMayExposeCSEOpportunities(const Value *Op0, const Value *Op1) {
+ // Filter out pointer values as those cannot appear directly in subtract.
+ // FIXME: we may want to go through inttoptrs or bitcasts.
+ if (Op0->getType()->isPointerTy())
+ return false;
+ // If a subtract already has the same operands as a compare, swapping would be
+ // bad. If a subtract has the same operands as a compare but in reverse order,
+ // then swapping is good.
+ int GoodToSwap = 0;
+ for (const User *U : Op0->users()) {
+ if (match(U, m_Sub(m_Specific(Op1), m_Specific(Op0))))
+ GoodToSwap++;
+ else if (match(U, m_Sub(m_Specific(Op0), m_Specific(Op1))))
+ GoodToSwap--;
+ }
+ return GoodToSwap > 0;
+}
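A sketch of the situation this heuristic cares about (names invented): when a subtract and the compare use their operands in the same order, targets that implement sub and cmp with one flags-setting instruction can reuse it, so the compare operands are left in that order.

    #include <cstdint>

    // The compare (a, b) and the subtract (a, b) share operand order, so
    // swapMayExposeCSEOpportunities sees no benefit in swapping them.
    uint32_t clamped_diff(uint32_t a, uint32_t b) {
      return a > b ? a - b : 0;
    }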
+
+/// Check that one use is in the same block as the definition and all
+/// other uses are in blocks dominated by a given block.
+///
+/// \param DI Definition
+/// \param UI Use
+/// \param DB Block that must dominate all uses of \p DI outside
+/// the parent block
+/// \return true when \p UI is the only use of \p DI in the parent block
+/// and all other uses of \p DI are in blocks dominated by \p DB.
+///
bool InstCombinerImpl::dominatesAllUses(const Instruction *DI,
const Instruction *UI,
const BasicBlock *DB) const {
- assert(DI && UI && "Instruction not defined\n");
- // Ignore incomplete definitions.
- if (!DI->getParent())
- return false;
- // DI and UI must be in the same block.
- if (DI->getParent() != UI->getParent())
- return false;
- // Protect from self-referencing blocks.
- if (DI->getParent() == DB)
- return false;
- for (const User *U : DI->users()) {
- auto *Usr = cast<Instruction>(U);
- if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
- return false;
- }
- return true;
-}
-
-/// Return true when the instruction sequence within a block is select-cmp-br.
-static bool isChainSelectCmpBranch(const SelectInst *SI) {
- const BasicBlock *BB = SI->getParent();
- if (!BB)
- return false;
- auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
- if (!BI || BI->getNumSuccessors() != 2)
- return false;
- auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
- if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
- return false;
- return true;
-}
-
-/// True when a select result is replaced by one of its operands
-/// in select-icmp sequence. This will eventually result in the elimination
-/// of the select.
-///
-/// \param SI Select instruction
-/// \param Icmp Compare instruction
-/// \param SIOpd Operand that replaces the select
-///
-/// Notes:
-/// - The replacement is global and requires dominator information
-/// - The caller is responsible for the actual replacement
-///
-/// Example:
-///
-/// entry:
-/// %4 = select i1 %3, %C* %0, %C* null
-/// %5 = icmp eq %C* %4, null
-/// br i1 %5, label %9, label %7
-/// ...
-/// ; <label>:7 ; preds = %entry
-/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0
-/// ...
-///
-/// can be transformed to
-///
-/// %5 = icmp eq %C* %0, null
-/// %6 = select i1 %3, i1 %5, i1 true
-/// br i1 %6, label %9, label %7
-/// ...
-/// ; <label>:7 ; preds = %entry
-/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0!
-///
-/// Similar when the first operand of the select is a constant or/and
-/// the compare is for not equal rather than equal.
-///
-/// NOTE: The function is only called when the select and compare constants
-/// are equal, the optimization can work only for EQ predicates. This is not a
-/// major restriction since a NE compare should be 'normalized' to an equal
-/// compare, which usually happens in the combiner and test case
-/// select-cmp-br.ll checks for it.
+ assert(DI && UI && "Instruction not defined\n");
+ // Ignore incomplete definitions.
+ if (!DI->getParent())
+ return false;
+ // DI and UI must be in the same block.
+ if (DI->getParent() != UI->getParent())
+ return false;
+ // Protect from self-referencing blocks.
+ if (DI->getParent() == DB)
+ return false;
+ for (const User *U : DI->users()) {
+ auto *Usr = cast<Instruction>(U);
+ if (Usr != UI && !DT.dominates(DB, Usr->getParent()))
+ return false;
+ }
+ return true;
+}
+
+/// Return true when the instruction sequence within a block is select-cmp-br.
+static bool isChainSelectCmpBranch(const SelectInst *SI) {
+ const BasicBlock *BB = SI->getParent();
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast_or_null<BranchInst>(BB->getTerminator());
+ if (!BI || BI->getNumSuccessors() != 2)
+ return false;
+ auto *IC = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!IC || (IC->getOperand(0) != SI && IC->getOperand(1) != SI))
+ return false;
+ return true;
+}
+
+/// True when a select result is replaced by one of its operands
+/// in select-icmp sequence. This will eventually result in the elimination
+/// of the select.
+///
+/// \param SI Select instruction
+/// \param Icmp Compare instruction
+/// \param SIOpd Operand that replaces the select
+///
+/// Notes:
+/// - The replacement is global and requires dominator information
+/// - The caller is responsible for the actual replacement
+///
+/// Example:
+///
+/// entry:
+/// %4 = select i1 %3, %C* %0, %C* null
+/// %5 = icmp eq %C* %4, null
+/// br i1 %5, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %4, i64 0, i32 0
+/// ...
+///
+/// can be transformed to
+///
+/// %5 = icmp eq %C* %0, null
+/// %6 = select i1 %3, i1 %5, i1 true
+/// br i1 %6, label %9, label %7
+/// ...
+/// ; <label>:7 ; preds = %entry
+/// %8 = getelementptr inbounds %C* %0, i64 0, i32 0 // replace by %0!
+///
+/// The same applies when the first operand of the select is a constant and/or
+/// the compare is for not-equal rather than equal.
+///
+/// NOTE: The function is only called when the select and compare constants
+/// are equal, so the optimization can work only for EQ predicates. This is
+/// not a major restriction since a NE compare should be 'normalized' to an
+/// equal compare, which usually happens in the combiner; the test case
+/// select-cmp-br.ll checks for it.
bool InstCombinerImpl::replacedSelectWithOperand(SelectInst *SI,
const ICmpInst *Icmp,
const unsigned SIOpd) {
- assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
- if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
- BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
- // The check for the single predecessor is not the best that can be
- // done. But it protects efficiently against cases like when SI's
- // home block has two successors, Succ and Succ1, and Succ1 predecessor
- // of Succ. Then SI can't be replaced by SIOpd because the use that gets
- // replaced can be reached on either path. So the uniqueness check
- // guarantees that the path all uses of SI (outside SI's parent) are on
- // is disjoint from all other paths out of SI. But that information
- // is more expensive to compute, and the trade-off here is in favor
- // of compile-time. It should also be noticed that we check for a single
- // predecessor and not only uniqueness. This to handle the situation when
- // Succ and Succ1 points to the same basic block.
- if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
- NumSel++;
- SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
- return true;
- }
- }
- return false;
-}
-
-/// Try to fold the comparison based on range information we can get by checking
-/// whether bits are known to be zero or one in the inputs.
+ assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
+ if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
+ BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
+ // The check for the single predecessor is not the best that can be
+ // done. But it protects efficiently against cases like when SI's
+    // home block has two successors, Succ and Succ1, and Succ1 is a
+    // predecessor of Succ. Then SI can't be replaced by SIOpd because the use
+    // that gets
+ // replaced can be reached on either path. So the uniqueness check
+ // guarantees that the path all uses of SI (outside SI's parent) are on
+ // is disjoint from all other paths out of SI. But that information
+ // is more expensive to compute, and the trade-off here is in favor
+ // of compile-time. It should also be noticed that we check for a single
+    // predecessor and not only uniqueness. This is to handle the situation
+    // when Succ and Succ1 point to the same basic block.
+ if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+ NumSel++;
+ SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Try to fold the comparison based on range information we can get by checking
+/// whether bits are known to be zero or one in the inputs.
Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = Op0->getType();
- ICmpInst::Predicate Pred = I.getPredicate();
-
- // Get scalar or pointer size.
- unsigned BitWidth = Ty->isIntOrIntVectorTy()
- ? Ty->getScalarSizeInBits()
- : DL.getPointerTypeSizeInBits(Ty->getScalarType());
-
- if (!BitWidth)
- return nullptr;
-
- KnownBits Op0Known(BitWidth);
- KnownBits Op1Known(BitWidth);
-
- if (SimplifyDemandedBits(&I, 0,
- getDemandedBitsLHSMask(I, BitWidth),
- Op0Known, 0))
- return &I;
-
- if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
- Op1Known, 0))
- return &I;
-
- // Given the known and unknown bits, compute a range that the LHS could be
- // in. Compute the Min, Max and RHS values based on the known bits. For the
- // EQ and NE we use unsigned values.
- APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
- APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
- if (I.isSigned()) {
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = Op0->getType();
+ ICmpInst::Predicate Pred = I.getPredicate();
+
+ // Get scalar or pointer size.
+ unsigned BitWidth = Ty->isIntOrIntVectorTy()
+ ? Ty->getScalarSizeInBits()
+ : DL.getPointerTypeSizeInBits(Ty->getScalarType());
+
+ if (!BitWidth)
+ return nullptr;
+
+ KnownBits Op0Known(BitWidth);
+ KnownBits Op1Known(BitWidth);
+
+ if (SimplifyDemandedBits(&I, 0,
+ getDemandedBitsLHSMask(I, BitWidth),
+ Op0Known, 0))
+ return &I;
+
+ if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
+ Op1Known, 0))
+ return &I;
+
+ // Given the known and unknown bits, compute a range that the LHS could be
+ // in. Compute the Min, Max and RHS values based on the known bits. For the
+ // EQ and NE we use unsigned values.
+ APInt Op0Min(BitWidth, 0), Op0Max(BitWidth, 0);
+ APInt Op1Min(BitWidth, 0), Op1Max(BitWidth, 0);
+ if (I.isSigned()) {
Op0Min = Op0Known.getSignedMinValue();
Op0Max = Op0Known.getSignedMaxValue();
Op1Min = Op1Known.getSignedMinValue();
Op1Max = Op1Known.getSignedMaxValue();
- } else {
+ } else {
Op0Min = Op0Known.getMinValue();
Op0Max = Op0Known.getMaxValue();
Op1Min = Op1Known.getMinValue();
Op1Max = Op1Known.getMaxValue();
- }
-
- // If Min and Max are known to be the same, then SimplifyDemandedBits figured
- // out that the LHS or RHS is a constant. Constant fold this now, so that
- // code below can assume that Min != Max.
- if (!isa<Constant>(Op0) && Op0Min == Op0Max)
- return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
- if (!isa<Constant>(Op1) && Op1Min == Op1Max)
- return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
-
- // Based on the range information we know about the LHS, see if we can
- // simplify this comparison. For example, (x&4) < 8 is always true.
- switch (Pred) {
- default:
- llvm_unreachable("Unknown icmp opcode!");
- case ICmpInst::ICMP_EQ:
- case ICmpInst::ICMP_NE: {
+ }
+
+ // If Min and Max are known to be the same, then SimplifyDemandedBits figured
+ // out that the LHS or RHS is a constant. Constant fold this now, so that
+ // code below can assume that Min != Max.
+ if (!isa<Constant>(Op0) && Op0Min == Op0Max)
+ return new ICmpInst(Pred, ConstantExpr::getIntegerValue(Ty, Op0Min), Op1);
+ if (!isa<Constant>(Op1) && Op1Min == Op1Max)
+ return new ICmpInst(Pred, Op0, ConstantExpr::getIntegerValue(Ty, Op1Min));
+
+ // Based on the range information we know about the LHS, see if we can
+ // simplify this comparison. For example, (x&4) < 8 is always true.
+ switch (Pred) {
+ default:
+ llvm_unreachable("Unknown icmp opcode!");
+ case ICmpInst::ICMP_EQ:
+ case ICmpInst::ICMP_NE: {
if (Op0Max.ult(Op1Min) || Op0Min.ugt(Op1Max))
return replaceInstUsesWith(
I, ConstantInt::getBool(I.getType(), Pred == CmpInst::ICMP_NE));
-
- // If all bits are known zero except for one, then we know at most one bit
- // is set. If the comparison is against zero, then this is a check to see if
- // *that* bit is set.
- APInt Op0KnownZeroInverted = ~Op0Known.Zero;
- if (Op1Known.isZero()) {
- // If the LHS is an AND with the same constant, look through it.
- Value *LHS = nullptr;
- const APInt *LHSC;
- if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
- *LHSC != Op0KnownZeroInverted)
- LHS = Op0;
-
- Value *X;
- if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
- APInt ValToCheck = Op0KnownZeroInverted;
- Type *XTy = X->getType();
- if (ValToCheck.isPowerOf2()) {
- // ((1 << X) & 8) == 0 -> X != 3
- // ((1 << X) & 8) != 0 -> X == 3
- auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
- auto NewPred = ICmpInst::getInversePredicate(Pred);
- return new ICmpInst(NewPred, X, CmpC);
- } else if ((++ValToCheck).isPowerOf2()) {
- // ((1 << X) & 7) == 0 -> X >= 3
- // ((1 << X) & 7) != 0 -> X < 3
- auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
- auto NewPred =
- Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
- return new ICmpInst(NewPred, X, CmpC);
- }
- }
-
- // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
- const APInt *CI;
- if (Op0KnownZeroInverted.isOneValue() &&
- match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
- // ((8 >>u X) & 1) == 0 -> X != 3
- // ((8 >>u X) & 1) != 0 -> X == 3
- unsigned CmpVal = CI->countTrailingZeros();
- auto NewPred = ICmpInst::getInversePredicate(Pred);
- return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
- }
- }
- break;
- }
- case ICmpInst::ICMP_ULT: {
- if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A <u C -> A == C-1 if min(A)+1 == C
- if (*CmpC == Op0Min + 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- // X <u C --> X == 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- Constant::getNullValue(Op1->getType()));
- }
- break;
- }
- case ICmpInst::ICMP_UGT: {
- if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
-
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- // A >u C -> A == C+1 if max(a)-1 == C
- if (*CmpC == Op0Max - 1)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- // X >u C --> X != 0, if the number of zero bits in the bottom of X
- // exceeds the log2 of C.
- if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
- return new ICmpInst(ICmpInst::ICMP_NE, Op0,
- Constant::getNullValue(Op1->getType()));
- }
- break;
- }
- case ICmpInst::ICMP_SLT: {
- if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(C)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(C)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC - 1));
- }
- break;
- }
- case ICmpInst::ICMP_SGT: {
- if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
- const APInt *CmpC;
- if (match(Op1, m_APInt(CmpC))) {
- if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
- ConstantInt::get(Op1->getType(), *CmpC + 1));
- }
- break;
- }
- case ICmpInst::ICMP_SGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
- if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_SLE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
- if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_UGE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
- if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- case ICmpInst::ICMP_ULE:
- assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
- if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
- return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
- if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
- return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
- if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
- return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
- break;
- }
-
- // Turn a signed comparison into an unsigned one if both operands are known to
- // have the same sign.
- if (I.isSigned() &&
- ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
- (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
- return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
-
- return nullptr;
-}
-
-llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
+
+ // If all bits are known zero except for one, then we know at most one bit
+ // is set. If the comparison is against zero, then this is a check to see if
+ // *that* bit is set.
+ APInt Op0KnownZeroInverted = ~Op0Known.Zero;
+ if (Op1Known.isZero()) {
+ // If the LHS is an AND with the same constant, look through it.
+ Value *LHS = nullptr;
+ const APInt *LHSC;
+ if (!match(Op0, m_And(m_Value(LHS), m_APInt(LHSC))) ||
+ *LHSC != Op0KnownZeroInverted)
+ LHS = Op0;
+
+ Value *X;
+ if (match(LHS, m_Shl(m_One(), m_Value(X)))) {
+ APInt ValToCheck = Op0KnownZeroInverted;
+ Type *XTy = X->getType();
+ if (ValToCheck.isPowerOf2()) {
+ // ((1 << X) & 8) == 0 -> X != 3
+ // ((1 << X) & 8) != 0 -> X == 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, CmpC);
+ } else if ((++ValToCheck).isPowerOf2()) {
+ // ((1 << X) & 7) == 0 -> X >= 3
+ // ((1 << X) & 7) != 0 -> X < 3
+ auto *CmpC = ConstantInt::get(XTy, ValToCheck.countTrailingZeros());
+ auto NewPred =
+ Pred == CmpInst::ICMP_EQ ? CmpInst::ICMP_UGE : CmpInst::ICMP_ULT;
+ return new ICmpInst(NewPred, X, CmpC);
+ }
+ }
+
+ // Check if the LHS is 8 >>u x and the result is a power of 2 like 1.
+ const APInt *CI;
+ if (Op0KnownZeroInverted.isOneValue() &&
+ match(LHS, m_LShr(m_Power2(CI), m_Value(X)))) {
+ // ((8 >>u X) & 1) == 0 -> X != 3
+ // ((8 >>u X) & 1) != 0 -> X == 3
+ unsigned CmpVal = CI->countTrailingZeros();
+ auto NewPred = ICmpInst::getInversePredicate(Pred);
+ return new ICmpInst(NewPred, X, ConstantInt::get(X->getType(), CmpVal));
+ }
+ }
+ break;
+ }
+ case ICmpInst::ICMP_ULT: {
+ if (Op0Max.ult(Op1Min)) // A <u B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.uge(Op1Max)) // A <u B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <u B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ // A <u C -> A == C-1 if min(A)+1 == C
+ if (*CmpC == Op0Min + 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ // X <u C --> X == 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->ceilLogBase2())
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_UGT: {
+ if (Op0Min.ugt(Op1Max)) // A >u B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+    if (Op0Max.ule(Op1Min)) // A >u B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A >u B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+      // A >u C -> A == C+1 if max(A)-1 == C
+ if (*CmpC == Op0Max - 1)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ // X >u C --> X != 0, if the number of zero bits in the bottom of X
+ // exceeds the log2 of C.
+ if (Op0Known.countMinTrailingZeros() >= CmpC->getActiveBits())
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0,
+ Constant::getNullValue(Op1->getType()));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SLT: {
+    if (Op0Max.slt(Op1Min)) // A <s B -> true if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+    if (Op0Min.sge(Op1Max)) // A <s B -> false if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A <s B -> A != B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Min + 1) // A <s C -> A == C-1 if min(A)+1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC - 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SGT: {
+ if (Op0Min.sgt(Op1Max)) // A >s B -> true if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.sle(Op1Min)) // A >s B -> false if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A >s B -> A != B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+ const APInt *CmpC;
+ if (match(Op1, m_APInt(CmpC))) {
+ if (*CmpC == Op0Max - 1) // A >s C -> A == C+1 if max(A)-1 == C
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0,
+ ConstantInt::get(Op1->getType(), *CmpC + 1));
+ }
+ break;
+ }
+ case ICmpInst::ICMP_SGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SGE with ConstantInt not folded!");
+ if (Op0Min.sge(Op1Max)) // A >=s B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.slt(Op1Min)) // A >=s B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A >=s B -> A == B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_SLE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_SLE with ConstantInt not folded!");
+ if (Op0Max.sle(Op1Min)) // A <=s B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.sgt(Op1Max)) // A <=s B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A <=s B -> A == B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_UGE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_UGE with ConstantInt not folded!");
+ if (Op0Min.uge(Op1Max)) // A >=u B -> true if min(A) >= max(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Max.ult(Op1Min)) // A >=u B -> false if max(A) < min(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Min == Op0Max) // A >=u B -> A == B if max(A) == min(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ case ICmpInst::ICMP_ULE:
+ assert(!isa<ConstantInt>(Op1) && "ICMP_ULE with ConstantInt not folded!");
+ if (Op0Max.ule(Op1Min)) // A <=u B -> true if max(A) <= min(B)
+ return replaceInstUsesWith(I, ConstantInt::getTrue(I.getType()));
+ if (Op0Min.ugt(Op1Max)) // A <=u B -> false if min(A) > max(B)
+ return replaceInstUsesWith(I, ConstantInt::getFalse(I.getType()));
+ if (Op1Max == Op0Min) // A <=u B -> A == B if min(A) == max(B)
+ return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+ break;
+ }
+
+ // Turn a signed comparison into an unsigned one if both operands are known to
+ // have the same sign.
+ if (I.isSigned() &&
+ ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) ||
+ (Op0Known.One.isNegative() && Op1Known.One.isNegative())))
+ return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1);
+
+ return nullptr;
+}
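A concrete instance of the range reasoning above, matching the "(x&4) < 8 is always true" comment (function name invented, sketch only):

    #include <cstdint>

    // Known bits of (x & 4): every bit except bit 2 is known zero, so the
    // unsigned maximum of the LHS is 4. Since max(LHS) < min(RHS), the
    // ICMP_ULT case folds the whole compare to 'true'.
    bool always_true(uint32_t x) {
      return (x & 4) < 8;
    }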
+
+llvm::Optional<std::pair<CmpInst::Predicate, Constant *>>
InstCombiner::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
Constant *C) {
- assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
- "Only for relational integer predicates.");
-
- Type *Type = C->getType();
- bool IsSigned = ICmpInst::isSigned(Pred);
-
- CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
- bool WillIncrement =
- UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
-
- // Check if the constant operand can be safely incremented/decremented
- // without overflowing/underflowing.
- auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
- return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
- };
-
- Constant *SafeReplacementConstant = nullptr;
- if (auto *CI = dyn_cast<ConstantInt>(C)) {
- // Bail out if the constant can't be safely incremented/decremented.
- if (!ConstantIsOk(CI))
- return llvm::None;
+ assert(ICmpInst::isRelational(Pred) && ICmpInst::isIntPredicate(Pred) &&
+ "Only for relational integer predicates.");
+
+ Type *Type = C->getType();
+ bool IsSigned = ICmpInst::isSigned(Pred);
+
+ CmpInst::Predicate UnsignedPred = ICmpInst::getUnsignedPredicate(Pred);
+ bool WillIncrement =
+ UnsignedPred == ICmpInst::ICMP_ULE || UnsignedPred == ICmpInst::ICMP_UGT;
+
+ // Check if the constant operand can be safely incremented/decremented
+ // without overflowing/underflowing.
+ auto ConstantIsOk = [WillIncrement, IsSigned](ConstantInt *C) {
+ return WillIncrement ? !C->isMaxValue(IsSigned) : !C->isMinValue(IsSigned);
+ };
+
+ Constant *SafeReplacementConstant = nullptr;
+ if (auto *CI = dyn_cast<ConstantInt>(C)) {
+ // Bail out if the constant can't be safely incremented/decremented.
+ if (!ConstantIsOk(CI))
+ return llvm::None;
} else if (auto *FVTy = dyn_cast<FixedVectorType>(Type)) {
unsigned NumElts = FVTy->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt)
- return llvm::None;
-
- if (isa<UndefValue>(Elt))
- continue;
-
- // Bail out if we can't determine if this constant is min/max or if we
- // know that this constant is min/max.
- auto *CI = dyn_cast<ConstantInt>(Elt);
- if (!CI || !ConstantIsOk(CI))
- return llvm::None;
-
- if (!SafeReplacementConstant)
- SafeReplacementConstant = CI;
- }
- } else {
- // ConstantExpr?
- return llvm::None;
- }
-
- // It may not be safe to change a compare predicate in the presence of
- // undefined elements, so replace those elements with the first safe constant
- // that we found.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt)
+ return llvm::None;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ // Bail out if we can't determine if this constant is min/max or if we
+ // know that this constant is min/max.
+ auto *CI = dyn_cast<ConstantInt>(Elt);
+ if (!CI || !ConstantIsOk(CI))
+ return llvm::None;
+
+ if (!SafeReplacementConstant)
+ SafeReplacementConstant = CI;
+ }
+ } else {
+ // ConstantExpr?
+ return llvm::None;
+ }
+
+ // It may not be safe to change a compare predicate in the presence of
+ // undefined elements, so replace those elements with the first safe constant
+ // that we found.
// TODO: in case of poison, it is safe; let's replace undefs only.
if (C->containsUndefOrPoisonElement()) {
- assert(SafeReplacementConstant && "Replacement constant not set");
- C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
- }
-
- CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
-
- // Increment or decrement the constant.
- Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
- Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
-
- return std::make_pair(NewPred, NewC);
-}
-
-/// If we have an icmp le or icmp ge instruction with a constant operand, turn
-/// it into the appropriate icmp lt or icmp gt instruction. This transform
-/// allows them to be folded in visitICmpInst.
-static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
- ICmpInst::Predicate Pred = I.getPredicate();
- if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) ||
+ assert(SafeReplacementConstant && "Replacement constant not set");
+ C = Constant::replaceUndefsWith(C, SafeReplacementConstant);
+ }
+
+ CmpInst::Predicate NewPred = CmpInst::getFlippedStrictnessPredicate(Pred);
+
+ // Increment or decrement the constant.
+ Constant *OneOrNegOne = ConstantInt::get(Type, WillIncrement ? 1 : -1, true);
+ Constant *NewC = ConstantExpr::getAdd(C, OneOrNegOne);
+
+ return std::make_pair(NewPred, NewC);
+}
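A tiny example of the strictness flip, assuming the source-level compare below (names invented): a non-strict unsigned compare against a constant that can be incremented without wrapping becomes the strict form against the adjusted constant.

    #include <cstdint>

    // 'x <= 7' (icmp ule) can be rewritten as 'x < 8' (icmp ult); 7 is not
    // the unsigned maximum, so adding one to it cannot overflow.
    bool at_most_seven(uint32_t x) {
      return x <= 7;   // canonical form: x < 8
    }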
+
+/// If we have an icmp le or icmp ge instruction with a constant operand, turn
+/// it into the appropriate icmp lt or icmp gt instruction. This transform
+/// allows them to be folded in visitICmpInst.
+static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
+ ICmpInst::Predicate Pred = I.getPredicate();
+ if (ICmpInst::isEquality(Pred) || !ICmpInst::isIntPredicate(Pred) ||
InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- Value *Op0 = I.getOperand(0);
- Value *Op1 = I.getOperand(1);
- auto *Op1C = dyn_cast<Constant>(Op1);
- if (!Op1C)
- return nullptr;
-
+ return nullptr;
+
+ Value *Op0 = I.getOperand(0);
+ Value *Op1 = I.getOperand(1);
+ auto *Op1C = dyn_cast<Constant>(Op1);
+ if (!Op1C)
+ return nullptr;
+
auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, Op1C);
- if (!FlippedStrictness)
- return nullptr;
-
- return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second);
-}
-
-/// If we have a comparison with a non-canonical predicate, if we can update
-/// all the users, invert the predicate and adjust all the users.
+ if (!FlippedStrictness)
+ return nullptr;
+
+ return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second);
+}
+
+/// If we have a comparison with a non-canonical predicate, if we can update
+/// all the users, invert the predicate and adjust all the users.
CmpInst *InstCombinerImpl::canonicalizeICmpPredicate(CmpInst &I) {
- // Is the predicate already canonical?
- CmpInst::Predicate Pred = I.getPredicate();
+ // Is the predicate already canonical?
+ CmpInst::Predicate Pred = I.getPredicate();
if (InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- // Can all users be adjusted to predicate inversion?
+ return nullptr;
+
+ // Can all users be adjusted to predicate inversion?
if (!InstCombiner::canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr))
- return nullptr;
-
- // Ok, we can canonicalize comparison!
- // Let's first invert the comparison's predicate.
- I.setPredicate(CmpInst::getInversePredicate(Pred));
- I.setName(I.getName() + ".not");
-
+ return nullptr;
+
+  // OK, we can canonicalize the comparison!
+ // Let's first invert the comparison's predicate.
+ I.setPredicate(CmpInst::getInversePredicate(Pred));
+ I.setName(I.getName() + ".not");
+
// And, adapt users.
freelyInvertAllUsersOf(&I);
-
- return &I;
-}
-
-/// Integer compare with boolean values can always be turned into bitwise ops.
-static Instruction *canonicalizeICmpBool(ICmpInst &I,
- InstCombiner::BuilderTy &Builder) {
- Value *A = I.getOperand(0), *B = I.getOperand(1);
- assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
-
- // A boolean compared to true/false can be simplified to Op0/true/false in
- // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
- // Cases not handled by InstSimplify are always 'not' of Op0.
- if (match(B, m_Zero())) {
- switch (I.getPredicate()) {
- case CmpInst::ICMP_EQ: // A == 0 -> !A
- case CmpInst::ICMP_ULE: // A <=u 0 -> !A
- case CmpInst::ICMP_SGE: // A >=s 0 -> !A
- return BinaryOperator::CreateNot(A);
- default:
- llvm_unreachable("ICmp i1 X, C not simplified as expected.");
- }
- } else if (match(B, m_One())) {
- switch (I.getPredicate()) {
- case CmpInst::ICMP_NE: // A != 1 -> !A
- case CmpInst::ICMP_ULT: // A <u 1 -> !A
- case CmpInst::ICMP_SGT: // A >s -1 -> !A
- return BinaryOperator::CreateNot(A);
- default:
- llvm_unreachable("ICmp i1 X, C not simplified as expected.");
- }
- }
-
- switch (I.getPredicate()) {
- default:
- llvm_unreachable("Invalid icmp instruction!");
- case ICmpInst::ICMP_EQ:
- // icmp eq i1 A, B -> ~(A ^ B)
- return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
-
- case ICmpInst::ICMP_NE:
- // icmp ne i1 A, B -> A ^ B
- return BinaryOperator::CreateXor(A, B);
-
- case ICmpInst::ICMP_UGT:
- // icmp ugt -> icmp ult
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULT:
- // icmp ult i1 A, B -> ~A & B
- return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
-
- case ICmpInst::ICMP_SGT:
- // icmp sgt -> icmp slt
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLT:
- // icmp slt i1 A, B -> A & ~B
- return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
-
- case ICmpInst::ICMP_UGE:
- // icmp uge -> icmp ule
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_ULE:
- // icmp ule i1 A, B -> ~A | B
- return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
-
- case ICmpInst::ICMP_SGE:
- // icmp sge -> icmp sle
- std::swap(A, B);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SLE:
- // icmp sle i1 A, B -> A | ~B
- return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
- }
-}
-
-// Transform pattern like:
-// (1 << Y) u<= X or ~(-1 << Y) u< X or ((1 << Y)+(-1)) u< X
-// (1 << Y) u> X or ~(-1 << Y) u>= X or ((1 << Y)+(-1)) u>= X
-// Into:
-// (X l>> Y) != 0
-// (X l>> Y) == 0
-static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred, NewPred;
- Value *X, *Y;
- if (match(&Cmp,
- m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
- switch (Pred) {
- case ICmpInst::ICMP_ULE:
- NewPred = ICmpInst::ICMP_NE;
- break;
- case ICmpInst::ICMP_UGT:
- NewPred = ICmpInst::ICMP_EQ;
- break;
- default:
- return nullptr;
- }
- } else if (match(&Cmp, m_c_ICmp(Pred,
- m_OneUse(m_CombineOr(
- m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
- m_Add(m_Shl(m_One(), m_Value(Y)),
- m_AllOnes()))),
- m_Value(X)))) {
- // The variant with 'add' is not canonical, (the variant with 'not' is)
- // we only get it because it has extra uses, and can't be canonicalized,
-
- switch (Pred) {
- case ICmpInst::ICMP_ULT:
- NewPred = ICmpInst::ICMP_NE;
- break;
- case ICmpInst::ICMP_UGE:
- NewPred = ICmpInst::ICMP_EQ;
- break;
- default:
- return nullptr;
- }
- } else
- return nullptr;
-
- Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
- Constant *Zero = Constant::getNullValue(NewX->getType());
- return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero);
-}
-
-static Instruction *foldVectorCmp(CmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
- const CmpInst::Predicate Pred = Cmp.getPredicate();
- Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
- Value *V1, *V2;
- ArrayRef<int> M;
- if (!match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(M))))
- return nullptr;
-
- // If both arguments of the cmp are shuffles that use the same mask and
- // shuffle within a single vector, move the shuffle after the cmp:
- // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
- Type *V1Ty = V1->getType();
- if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
- V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
- Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
- }
-
- // Try to canonicalize compare with splatted operand and splat constant.
- // TODO: We could generalize this for more than splats. See/use the code in
- // InstCombiner::foldVectorBinop().
- Constant *C;
- if (!LHS->hasOneUse() || !match(RHS, m_Constant(C)))
- return nullptr;
-
- // Length-changing splats are ok, so adjust the constants as needed:
- // cmp (shuffle V1, M), C --> shuffle (cmp V1, C'), M
- Constant *ScalarC = C->getSplatValue(/* AllowUndefs */ true);
- int MaskSplatIndex;
- if (ScalarC && match(M, m_SplatOrUndefMask(MaskSplatIndex))) {
- // We allow undefs in matching, but this transform removes those for safety.
- // Demanded elements analysis should be able to recover some/all of that.
- C = ConstantVector::getSplat(cast<VectorType>(V1Ty)->getElementCount(),
- ScalarC);
- SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
- Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
- return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
- NewM);
- }
-
- return nullptr;
-}
-
-// extract(uadd.with.overflow(A, B), 0) ult A
-// -> extract(uadd.with.overflow(A, B), 1)
-static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
- CmpInst::Predicate Pred = I.getPredicate();
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- Value *UAddOv;
- Value *A, *B;
- auto UAddOvResultPat = m_ExtractValue<0>(
- m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B)));
- if (match(Op0, UAddOvResultPat) &&
- ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) ||
- (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) &&
- (match(A, m_One()) || match(B, m_One()))) ||
- (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) &&
- (match(A, m_AllOnes()) || match(B, m_AllOnes())))))
- // extract(uadd.with.overflow(A, B), 0) < A
- // extract(uadd.with.overflow(A, 1), 0) == 0
- // extract(uadd.with.overflow(A, -1), 0) != -1
- UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand();
- else if (match(Op1, UAddOvResultPat) &&
- Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B))
- // A > extract(uadd.with.overflow(A, B), 0)
- UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand();
- else
- return nullptr;
-
- return ExtractValueInst::Create(UAddOv, 1);
-}
-
+
+ return &I;
+}
+
+/// Integer compare with boolean values can always be turned into bitwise ops.
+static Instruction *canonicalizeICmpBool(ICmpInst &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *A = I.getOperand(0), *B = I.getOperand(1);
+ assert(A->getType()->isIntOrIntVectorTy(1) && "Bools only");
+
+ // A boolean compared to true/false can be simplified to Op0/true/false in
+ // 14 out of the 20 (10 predicates * 2 constants) possible combinations.
+ // Cases not handled by InstSimplify are always 'not' of Op0.
+ if (match(B, m_Zero())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_EQ: // A == 0 -> !A
+ case CmpInst::ICMP_ULE: // A <=u 0 -> !A
+ case CmpInst::ICMP_SGE: // A >=s 0 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ } else if (match(B, m_One())) {
+ switch (I.getPredicate()) {
+ case CmpInst::ICMP_NE: // A != 1 -> !A
+ case CmpInst::ICMP_ULT: // A <u 1 -> !A
+ case CmpInst::ICMP_SGT: // A >s -1 -> !A
+ return BinaryOperator::CreateNot(A);
+ default:
+ llvm_unreachable("ICmp i1 X, C not simplified as expected.");
+ }
+ }
+
+ switch (I.getPredicate()) {
+ default:
+ llvm_unreachable("Invalid icmp instruction!");
+ case ICmpInst::ICMP_EQ:
+ // icmp eq i1 A, B -> ~(A ^ B)
+ return BinaryOperator::CreateNot(Builder.CreateXor(A, B));
+
+ case ICmpInst::ICMP_NE:
+ // icmp ne i1 A, B -> A ^ B
+ return BinaryOperator::CreateXor(A, B);
+
+ case ICmpInst::ICMP_UGT:
+ // icmp ugt -> icmp ult
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULT:
+ // icmp ult i1 A, B -> ~A & B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGT:
+ // icmp sgt -> icmp slt
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLT:
+ // icmp slt i1 A, B -> A & ~B
+ return BinaryOperator::CreateAnd(Builder.CreateNot(B), A);
+
+ case ICmpInst::ICMP_UGE:
+ // icmp uge -> icmp ule
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_ULE:
+ // icmp ule i1 A, B -> ~A | B
+ return BinaryOperator::CreateOr(Builder.CreateNot(A), B);
+
+ case ICmpInst::ICMP_SGE:
+ // icmp sge -> icmp sle
+ std::swap(A, B);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SLE:
+ // icmp sle i1 A, B -> A | ~B
+ return BinaryOperator::CreateOr(Builder.CreateNot(B), A);
+ }
+}
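A source-level counterpart of the i1 rewrites above (names invented, minimal sketch): equality of two booleans becomes pure bit logic.

    // 'a == b' on i1 values is ~(a ^ b); 'a != b' is simply a ^ b.
    bool bools_equal(bool a, bool b) {
      return !(a ^ b);   // what an equality compare of two bools becomes
    }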
+
+// Transform pattern like:
+// (1 << Y) u<= X or ~(-1 << Y) u< X or ((1 << Y)+(-1)) u< X
+// (1 << Y) u> X or ~(-1 << Y) u>= X or ((1 << Y)+(-1)) u>= X
+// Into:
+// (X l>> Y) != 0
+// (X l>> Y) == 0
+static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred, NewPred;
+ Value *X, *Y;
+ if (match(&Cmp,
+ m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
+ switch (Pred) {
+ case ICmpInst::ICMP_ULE:
+ NewPred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ NewPred = ICmpInst::ICMP_EQ;
+ break;
+ default:
+ return nullptr;
+ }
+ } else if (match(&Cmp, m_c_ICmp(Pred,
+ m_OneUse(m_CombineOr(
+ m_Not(m_Shl(m_AllOnes(), m_Value(Y))),
+ m_Add(m_Shl(m_One(), m_Value(Y)),
+ m_AllOnes()))),
+ m_Value(X)))) {
+    // The variant with 'add' is not canonical (the variant with 'not' is);
+    // we only get it here because it has extra uses and can't be canonicalized.
+
+ switch (Pred) {
+ case ICmpInst::ICMP_ULT:
+ NewPred = ICmpInst::ICMP_NE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ NewPred = ICmpInst::ICMP_EQ;
+ break;
+ default:
+ return nullptr;
+ }
+ } else
+ return nullptr;
+
+ Value *NewX = Builder.CreateLShr(X, Y, X->getName() + ".highbits");
+ Constant *Zero = Constant::getNullValue(NewX->getType());
+ return CmpInst::Create(Instruction::ICmp, NewPred, NewX, Zero);
+}
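+// A hedged example of the transform above (IR names are invented):
+//   %m = shl i32 1, %y
+//   %c = icmp ule i32 %m, %x          ; (1 << Y) u<= X
+// becomes
+//   %x.highbits = lshr i32 %x, %y
+//   %c = icmp ne i32 %x.highbits, 0   ; X u>= (1 << Y)  <=>  (X l>> Y) != 0
+// because X u>= 2^Y holds exactly when some bit of X at position >= Y is set.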
+
+static Instruction *foldVectorCmp(CmpInst &Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ const CmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
+ Value *V1, *V2;
+ ArrayRef<int> M;
+ if (!match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(M))))
+ return nullptr;
+
+ // If both arguments of the cmp are shuffles that use the same mask and
+ // shuffle within a single vector, move the shuffle after the cmp:
+ // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
+ Type *V1Ty = V1->getType();
+ if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
+ V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
+ return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
+ }
+
+ // Try to canonicalize compare with splatted operand and splat constant.
+ // TODO: We could generalize this for more than splats. See/use the code in
+ // InstCombiner::foldVectorBinop().
+ Constant *C;
+ if (!LHS->hasOneUse() || !match(RHS, m_Constant(C)))
+ return nullptr;
+
+ // Length-changing splats are ok, so adjust the constants as needed:
+ // cmp (shuffle V1, M), C --> shuffle (cmp V1, C'), M
+ Constant *ScalarC = C->getSplatValue(/* AllowUndefs */ true);
+ int MaskSplatIndex;
+ if (ScalarC && match(M, m_SplatOrUndefMask(MaskSplatIndex))) {
+ // We allow undefs in matching, but this transform removes those for safety.
+ // Demanded elements analysis should be able to recover some/all of that.
+ C = ConstantVector::getSplat(cast<VectorType>(V1Ty)->getElementCount(),
+ ScalarC);
+ SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
+ return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
+ NewM);
+ }
+
+ return nullptr;
+}
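+// Rough IR sketch of the first fold above (illustrative only):
+//   %s1 = shufflevector <4 x i32> %v1, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+//   %s2 = shufflevector <4 x i32> %v2, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+//   %c  = icmp eq <4 x i32> %s1, %s2
+// is rewritten to compare the unshuffled operands and shuffle the result:
+//   %c0 = icmp eq <4 x i32> %v1, %v2
+//   %c  = shufflevector <4 x i1> %c0, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+// This is valid because the identical mask selects the same lanes on both sides.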
+
+// extract(uadd.with.overflow(A, B), 0) ult A
+// -> extract(uadd.with.overflow(A, B), 1)
+static Instruction *foldICmpOfUAddOv(ICmpInst &I) {
+ CmpInst::Predicate Pred = I.getPredicate();
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ Value *UAddOv;
+ Value *A, *B;
+ auto UAddOvResultPat = m_ExtractValue<0>(
+ m_Intrinsic<Intrinsic::uadd_with_overflow>(m_Value(A), m_Value(B)));
+ if (match(Op0, UAddOvResultPat) &&
+ ((Pred == ICmpInst::ICMP_ULT && (Op1 == A || Op1 == B)) ||
+ (Pred == ICmpInst::ICMP_EQ && match(Op1, m_ZeroInt()) &&
+ (match(A, m_One()) || match(B, m_One()))) ||
+ (Pred == ICmpInst::ICMP_NE && match(Op1, m_AllOnes()) &&
+ (match(A, m_AllOnes()) || match(B, m_AllOnes())))))
+ // extract(uadd.with.overflow(A, B), 0) < A
+ // extract(uadd.with.overflow(A, 1), 0) == 0
+ // extract(uadd.with.overflow(A, -1), 0) != -1
+ UAddOv = cast<ExtractValueInst>(Op0)->getAggregateOperand();
+ else if (match(Op1, UAddOvResultPat) &&
+ Pred == ICmpInst::ICMP_UGT && (Op0 == A || Op0 == B))
+ // A > extract(uadd.with.overflow(A, B), 0)
+ UAddOv = cast<ExtractValueInst>(Op1)->getAggregateOperand();
+ else
+ return nullptr;
+
+ return ExtractValueInst::Create(UAddOv, 1);
+}
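+// Sketch of the first pattern above (not part of the original file):
+//   %agg = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+//   %sum = extractvalue { i32, i1 } %agg, 0
+//   %cmp = icmp ult i32 %sum, %a      ; a wrapped sum is u< either operand
+// -->
+//   %cmp = extractvalue { i32, i1 } %agg, 1
+// because an unsigned add overflows exactly when the truncated sum is u< A.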
+
Instruction *InstCombinerImpl::visitICmpInst(ICmpInst &I) {
- bool Changed = false;
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- unsigned Op0Cplxity = getComplexity(Op0);
- unsigned Op1Cplxity = getComplexity(Op1);
-
- /// Orders the operands of the compare so that they are listed from most
- /// complex to least complex. This puts constants before unary operators,
- /// before binary operators.
- if (Op0Cplxity < Op1Cplxity ||
- (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
- I.swapOperands();
- std::swap(Op0, Op1);
- Changed = true;
- }
-
- if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
- return replaceInstUsesWith(I, V);
-
- // Comparing -val or val with non-zero is the same as just comparing val
- // ie, abs(val) != 0 -> val != 0
- if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
- Value *Cond, *SelectTrue, *SelectFalse;
- if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
- m_Value(SelectFalse)))) {
- if (Value *V = dyn_castNegVal(SelectTrue)) {
- if (V == SelectFalse)
- return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
- }
- else if (Value *V = dyn_castNegVal(SelectFalse)) {
- if (V == SelectTrue)
- return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
- }
- }
- }
-
- if (Op0->getType()->isIntOrIntVectorTy(1))
- if (Instruction *Res = canonicalizeICmpBool(I, Builder))
- return Res;
-
- if (Instruction *Res = canonicalizeCmpWithConstant(I))
- return Res;
-
- if (Instruction *Res = canonicalizeICmpPredicate(I))
- return Res;
-
- if (Instruction *Res = foldICmpWithConstant(I))
- return Res;
-
- if (Instruction *Res = foldICmpWithDominatingICmp(I))
- return Res;
-
- if (Instruction *Res = foldICmpBinOp(I, Q))
- return Res;
-
- if (Instruction *Res = foldICmpUsingKnownBits(I))
- return Res;
-
- // Test if the ICmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- //
- // Do the same for the other patterns recognized by matchSelectPattern.
- if (I.hasOneUse())
- if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(SI, A, B);
- if (SPR.Flavor != SPF_UNKNOWN)
- return nullptr;
- }
-
- // Do this after checking for min/max to prevent infinite looping.
- if (Instruction *Res = foldICmpWithZero(I))
- return Res;
-
- // FIXME: We only do this after checking for min/max to prevent infinite
- // looping caused by a reverse canonicalization of these patterns for min/max.
- // FIXME: The organization of folds is a mess. These would naturally go into
- // canonicalizeCmpWithConstant(), but we can't move all of the above folds
- // down here after the min/max restriction.
- ICmpInst::Predicate Pred = I.getPredicate();
- const APInt *C;
- if (match(Op1, m_APInt(C))) {
- // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
- if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
- Constant *Zero = Constant::getNullValue(Op0->getType());
- return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
- }
-
- // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
- if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
- Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
- return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
- }
- }
-
- if (Instruction *Res = foldICmpInstWithConstant(I))
- return Res;
-
- // Try to match comparison as a sign bit test. Intentionally do this after
- // foldICmpInstWithConstant() to potentially let other folds to happen first.
- if (Instruction *New = foldSignBitTest(I))
- return New;
-
- if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
- return Res;
-
- // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
- if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
- return NI;
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
- if (Instruction *NI = foldGEPICmp(GEP, Op0,
- ICmpInst::getSwappedPredicate(I.getPredicate()), I))
- return NI;
-
- // Try to optimize equality comparisons against alloca-based pointers.
- if (Op0->getType()->isPointerTy() && I.isEquality()) {
- assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
+ bool Changed = false;
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ unsigned Op0Cplxity = getComplexity(Op0);
+ unsigned Op1Cplxity = getComplexity(Op1);
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+ /// before binary operators.
+ if (Op0Cplxity < Op1Cplxity ||
+ (Op0Cplxity == Op1Cplxity && swapMayExposeCSEOpportunities(Op0, Op1))) {
+ I.swapOperands();
+ std::swap(Op0, Op1);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyICmpInst(I.getPredicate(), Op0, Op1, Q))
+ return replaceInstUsesWith(I, V);
+
+  // Comparing -val or val with non-zero is the same as just comparing val,
+  // i.e., abs(val) != 0 -> val != 0
+ if (I.getPredicate() == ICmpInst::ICMP_NE && match(Op1, m_Zero())) {
+ Value *Cond, *SelectTrue, *SelectFalse;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(SelectTrue),
+ m_Value(SelectFalse)))) {
+ if (Value *V = dyn_castNegVal(SelectTrue)) {
+ if (V == SelectFalse)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ else if (Value *V = dyn_castNegVal(SelectFalse)) {
+ if (V == SelectTrue)
+ return CmpInst::Create(Instruction::ICmp, I.getPredicate(), V, Op1);
+ }
+ }
+ }
+
+ if (Op0->getType()->isIntOrIntVectorTy(1))
+ if (Instruction *Res = canonicalizeICmpBool(I, Builder))
+ return Res;
+
+ if (Instruction *Res = canonicalizeCmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = canonicalizeICmpPredicate(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpWithDominatingICmp(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpBinOp(I, Q))
+ return Res;
+
+ if (Instruction *Res = foldICmpUsingKnownBits(I))
+ return Res;
+
+ // Test if the ICmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ //
+ // Do the same for the other patterns recognized by matchSelectPattern.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(SI, A, B);
+ if (SPR.Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+
+ // Do this after checking for min/max to prevent infinite looping.
+ if (Instruction *Res = foldICmpWithZero(I))
+ return Res;
+
+ // FIXME: We only do this after checking for min/max to prevent infinite
+ // looping caused by a reverse canonicalization of these patterns for min/max.
+ // FIXME: The organization of folds is a mess. These would naturally go into
+ // canonicalizeCmpWithConstant(), but we can't move all of the above folds
+ // down here after the min/max restriction.
+ ICmpInst::Predicate Pred = I.getPredicate();
+ const APInt *C;
+ if (match(Op1, m_APInt(C))) {
+ // For i32: x >u 2147483647 -> x <s 0 -> true if sign bit set
+ if (Pred == ICmpInst::ICMP_UGT && C->isMaxSignedValue()) {
+ Constant *Zero = Constant::getNullValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SLT, Op0, Zero);
+ }
+
+ // For i32: x <u 2147483648 -> x >s -1 -> true if sign bit clear
+ if (Pred == ICmpInst::ICMP_ULT && C->isMinSignedValue()) {
+ Constant *AllOnes = Constant::getAllOnesValue(Op0->getType());
+ return new ICmpInst(ICmpInst::ICMP_SGT, Op0, AllOnes);
+ }
+ }
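+  // (Illustration, not from the upstream source: with i8 operands the fold
+  // above turns x u> 127 into x s< 0, since the values 128..255 are exactly
+  // those with the sign bit set; x u< 128 likewise becomes x s> -1.)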
+
+ if (Instruction *Res = foldICmpInstWithConstant(I))
+ return Res;
+
+ // Try to match comparison as a sign bit test. Intentionally do this after
+  // foldICmpInstWithConstant() to potentially let other folds happen first.
+ if (Instruction *New = foldSignBitTest(I))
+ return New;
+
+ if (Instruction *Res = foldICmpInstWithConstantNotInt(I))
+ return Res;
+
+ // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op0))
+ if (Instruction *NI = foldGEPICmp(GEP, Op1, I.getPredicate(), I))
+ return NI;
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Op1))
+ if (Instruction *NI = foldGEPICmp(GEP, Op0,
+ ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+ return NI;
+
+ // Try to optimize equality comparisons against alloca-based pointers.
+ if (Op0->getType()->isPointerTy() && I.isEquality()) {
+ assert(Op1->getType()->isPointerTy() && "Comparing pointer with non-pointer?");
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op0)))
- if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
- return New;
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op1))
+ return New;
if (auto *Alloca = dyn_cast<AllocaInst>(getUnderlyingObject(Op1)))
- if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
- return New;
- }
-
- if (Instruction *Res = foldICmpBitCast(I, Builder))
- return Res;
-
- // TODO: Hoist this above the min/max bailout.
- if (Instruction *R = foldICmpWithCastOp(I))
- return R;
-
- if (Instruction *Res = foldICmpWithMinMax(I))
- return Res;
-
- {
- Value *A, *B;
- // Transform (A & ~B) == 0 --> (A & B) != 0
- // and (A & ~B) != 0 --> (A & B) == 0
- // if A is a power of 2.
- if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
- match(Op1, m_Zero()) &&
- isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
- return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
- Op1);
-
- // ~X < ~Y --> Y < X
- // ~X < C --> X > ~C
- if (match(Op0, m_Not(m_Value(A)))) {
- if (match(Op1, m_Not(m_Value(B))))
- return new ICmpInst(I.getPredicate(), B, A);
-
- const APInt *C;
- if (match(Op1, m_APInt(C)))
- return new ICmpInst(I.getSwappedPredicate(), A,
- ConstantInt::get(Op1->getType(), ~(*C)));
- }
-
- Instruction *AddI = nullptr;
- if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
- m_Instruction(AddI))) &&
- isa<IntegerType>(A->getType())) {
- Value *Result;
- Constant *Overflow;
- // m_UAddWithOverflow can match patterns that do not include an explicit
- // "add" instruction, so check the opcode of the matched op.
- if (AddI->getOpcode() == Instruction::Add &&
- OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
- Result, Overflow)) {
- replaceInstUsesWith(*AddI, Result);
- eraseInstFromFunction(*AddI);
- return replaceInstUsesWith(I, Overflow);
- }
- }
-
- // (zext a) * (zext b) --> llvm.umul.with.overflow.
- if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
- return R;
- }
- if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
- if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
- return R;
- }
- }
-
- if (Instruction *Res = foldICmpEquality(I))
- return Res;
-
- if (Instruction *Res = foldICmpOfUAddOv(I))
- return Res;
-
- // The 'cmpxchg' instruction returns an aggregate containing the old value and
- // an i1 which indicates whether or not we successfully did the swap.
- //
- // Replace comparisons between the old value and the expected value with the
- // indicator that 'cmpxchg' returns.
- //
- // N.B. This transform is only valid when the 'cmpxchg' is not permitted to
- // spuriously fail. In those cases, the old value may equal the expected
- // value but it is possible for the swap to not occur.
- if (I.getPredicate() == ICmpInst::ICMP_EQ)
- if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
- if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
- if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
- !ACXI->isWeak())
- return ExtractValueInst::Create(ACXI, 1);
-
- {
- Value *X;
- const APInt *C;
- // icmp X+Cst, X
- if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
- return foldICmpAddOpConst(X, *C, I.getPredicate());
-
- // icmp X, X+Cst
- if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
- return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
- }
-
- if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
- return Res;
-
- if (I.getType()->isVectorTy())
- if (Instruction *Res = foldVectorCmp(I, Builder))
- return Res;
-
- return Changed ? &I : nullptr;
-}
-
-/// Fold fcmp ([us]itofp x, cst) if possible.
+ if (Instruction *New = foldAllocaCmp(I, Alloca, Op0))
+ return New;
+ }
+
+ if (Instruction *Res = foldICmpBitCast(I, Builder))
+ return Res;
+
+ // TODO: Hoist this above the min/max bailout.
+ if (Instruction *R = foldICmpWithCastOp(I))
+ return R;
+
+ if (Instruction *Res = foldICmpWithMinMax(I))
+ return Res;
+
+ {
+ Value *A, *B;
+ // Transform (A & ~B) == 0 --> (A & B) != 0
+ // and (A & ~B) != 0 --> (A & B) == 0
+ // if A is a power of 2.
+ if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) &&
+ match(Op1, m_Zero()) &&
+ isKnownToBeAPowerOfTwo(A, false, 0, &I) && I.isEquality())
+ return new ICmpInst(I.getInversePredicate(), Builder.CreateAnd(A, B),
+ Op1);
+
+ // ~X < ~Y --> Y < X
+ // ~X < C --> X > ~C
+ if (match(Op0, m_Not(m_Value(A)))) {
+ if (match(Op1, m_Not(m_Value(B))))
+ return new ICmpInst(I.getPredicate(), B, A);
+
+ const APInt *C;
+ if (match(Op1, m_APInt(C)))
+ return new ICmpInst(I.getSwappedPredicate(), A,
+ ConstantInt::get(Op1->getType(), ~(*C)));
+ }
+
+ Instruction *AddI = nullptr;
+ if (match(&I, m_UAddWithOverflow(m_Value(A), m_Value(B),
+ m_Instruction(AddI))) &&
+ isa<IntegerType>(A->getType())) {
+ Value *Result;
+ Constant *Overflow;
+ // m_UAddWithOverflow can match patterns that do not include an explicit
+ // "add" instruction, so check the opcode of the matched op.
+ if (AddI->getOpcode() == Instruction::Add &&
+ OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
+ Result, Overflow)) {
+ replaceInstUsesWith(*AddI, Result);
+ eraseInstFromFunction(*AddI);
+ return replaceInstUsesWith(I, Overflow);
+ }
+ }
+
+ // (zext a) * (zext b) --> llvm.umul.with.overflow.
+ if (match(Op0, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op0, Op1, *this))
+ return R;
+ }
+ if (match(Op1, m_Mul(m_ZExt(m_Value(A)), m_ZExt(m_Value(B))))) {
+ if (Instruction *R = processUMulZExtIdiom(I, Op1, Op0, *this))
+ return R;
+ }
+ }
+
+ if (Instruction *Res = foldICmpEquality(I))
+ return Res;
+
+ if (Instruction *Res = foldICmpOfUAddOv(I))
+ return Res;
+
+ // The 'cmpxchg' instruction returns an aggregate containing the old value and
+ // an i1 which indicates whether or not we successfully did the swap.
+ //
+ // Replace comparisons between the old value and the expected value with the
+ // indicator that 'cmpxchg' returns.
+ //
+ // N.B. This transform is only valid when the 'cmpxchg' is not permitted to
+ // spuriously fail. In those cases, the old value may equal the expected
+ // value but it is possible for the swap to not occur.
+ if (I.getPredicate() == ICmpInst::ICMP_EQ)
+ if (auto *EVI = dyn_cast<ExtractValueInst>(Op0))
+ if (auto *ACXI = dyn_cast<AtomicCmpXchgInst>(EVI->getAggregateOperand()))
+ if (EVI->getIndices()[0] == 0 && ACXI->getCompareOperand() == Op1 &&
+ !ACXI->isWeak())
+ return ExtractValueInst::Create(ACXI, 1);
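+  // An IR sketch of that cmpxchg fold (hypothetical values; strong cmpxchg only):
+  //   %pair = cmpxchg i32* %p, i32 %expected, i32 %new seq_cst seq_cst
+  //   %old  = extractvalue { i32, i1 } %pair, 0
+  //   %eq   = icmp eq i32 %old, %expected
+  // -->
+  //   %eq   = extractvalue { i32, i1 } %pair, 1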
+
+ {
+ Value *X;
+ const APInt *C;
+ // icmp X+Cst, X
+ if (match(Op0, m_Add(m_Value(X), m_APInt(C))) && Op1 == X)
+ return foldICmpAddOpConst(X, *C, I.getPredicate());
+
+ // icmp X, X+Cst
+ if (match(Op1, m_Add(m_Value(X), m_APInt(C))) && Op0 == X)
+ return foldICmpAddOpConst(X, *C, I.getSwappedPredicate());
+ }
+
+ if (Instruction *Res = foldICmpWithHighBitMask(I, Builder))
+ return Res;
+
+ if (I.getType()->isVectorTy())
+ if (Instruction *Res = foldVectorCmp(I, Builder))
+ return Res;
+
+ return Changed ? &I : nullptr;
+}
+
+/// Fold fcmp ([us]itofp x, cst) if possible.
Instruction *InstCombinerImpl::foldFCmpIntToFPConst(FCmpInst &I,
Instruction *LHSI,
Constant *RHSC) {
- if (!isa<ConstantFP>(RHSC)) return nullptr;
- const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
-
- // Get the width of the mantissa. We don't want to hack on conversions that
- // might lose information from the integer, e.g. "i64 -> float"
- int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
- if (MantissaWidth == -1) return nullptr; // Unknown.
-
- IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
-
- bool LHSUnsigned = isa<UIToFPInst>(LHSI);
-
- if (I.isEquality()) {
- FCmpInst::Predicate P = I.getPredicate();
- bool IsExact = false;
- APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
- RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
-
- // If the floating point constant isn't an integer value, we know if we will
- // ever compare equal / not equal to it.
- if (!IsExact) {
- // TODO: Can never be -0.0 and other non-representable values
- APFloat RHSRoundInt(RHS);
- RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (RHS != RHSRoundInt) {
- if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
- return replaceInstUsesWith(I, Builder.getFalse());
-
- assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
- return replaceInstUsesWith(I, Builder.getTrue());
- }
- }
-
- // TODO: If the constant is exactly representable, is it always OK to do
- // equality compares as integer?
- }
-
- // Check to see that the input is converted from an integer type that is small
- // enough that preserves all bits. TODO: check here for "known" sign bits.
- // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64 f.e.
- unsigned InputSize = IntTy->getScalarSizeInBits();
-
- // Following test does NOT adjust InputSize downwards for signed inputs,
- // because the most negative value still requires all the mantissa bits
- // to distinguish it from one less than that value.
- if ((int)InputSize > MantissaWidth) {
- // Conversion would lose accuracy. Check if loss can impact comparison.
- int Exp = ilogb(RHS);
- if (Exp == APFloat::IEK_Inf) {
- int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
- if (MaxExponent < (int)InputSize - !LHSUnsigned)
- // Conversion could create infinity.
- return nullptr;
- } else {
- // Note that if RHS is zero or NaN, then Exp is negative
- // and first condition is trivially false.
- if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
- // Conversion could affect comparison.
- return nullptr;
- }
- }
-
- // Otherwise, we can potentially simplify the comparison. We know that it
- // will always come through as an integer value and we know the constant is
- // not a NAN (it would have been previously simplified).
- assert(!RHS.isNaN() && "NaN comparison not already folded!");
-
- ICmpInst::Predicate Pred;
- switch (I.getPredicate()) {
- default: llvm_unreachable("Unexpected predicate!");
- case FCmpInst::FCMP_UEQ:
- case FCmpInst::FCMP_OEQ:
- Pred = ICmpInst::ICMP_EQ;
- break;
- case FCmpInst::FCMP_UGT:
- case FCmpInst::FCMP_OGT:
- Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
- break;
- case FCmpInst::FCMP_UGE:
- case FCmpInst::FCMP_OGE:
- Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
- break;
- case FCmpInst::FCMP_ULT:
- case FCmpInst::FCMP_OLT:
- Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
- break;
- case FCmpInst::FCMP_ULE:
- case FCmpInst::FCMP_OLE:
- Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
- break;
- case FCmpInst::FCMP_UNE:
- case FCmpInst::FCMP_ONE:
- Pred = ICmpInst::ICMP_NE;
- break;
- case FCmpInst::FCMP_ORD:
- return replaceInstUsesWith(I, Builder.getTrue());
- case FCmpInst::FCMP_UNO:
- return replaceInstUsesWith(I, Builder.getFalse());
- }
-
- // Now we know that the APFloat is a normal number, zero or inf.
-
- // See if the FP constant is too large for the integer. For example,
- // comparing an i8 to 300.0.
- unsigned IntWidth = IntTy->getScalarSizeInBits();
-
- if (!LHSUnsigned) {
- // If the RHS value is > SignedMax, fold the comparison. This handles +INF
- // and large values.
- APFloat SMax(RHS.getSemantics());
- SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
- APFloat::rmNearestTiesToEven);
- if (SMax < RHS) { // smax < 13123.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
- Pred == ICmpInst::ICMP_SLE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- } else {
- // If the RHS value is > UnsignedMax, fold the comparison. This handles
- // +INF and large values.
- APFloat UMax(RHS.getSemantics());
- UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
- APFloat::rmNearestTiesToEven);
- if (UMax < RHS) { // umax < 13123.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
- Pred == ICmpInst::ICMP_ULE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- }
-
- if (!LHSUnsigned) {
- // See if the RHS value is < SignedMin.
- APFloat SMin(RHS.getSemantics());
- SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
- APFloat::rmNearestTiesToEven);
- if (SMin > RHS) { // smin > 12312.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
- Pred == ICmpInst::ICMP_SGE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- } else {
- // See if the RHS value is < UnsignedMin.
- APFloat UMin(RHS.getSemantics());
- UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
- APFloat::rmNearestTiesToEven);
- if (UMin > RHS) { // umin > 12312.0
- if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
- Pred == ICmpInst::ICMP_UGE)
- return replaceInstUsesWith(I, Builder.getTrue());
- return replaceInstUsesWith(I, Builder.getFalse());
- }
- }
-
- // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
- // [0, UMAX], but it may still be fractional. See if it is fractional by
- // casting the FP value to the integer value and back, checking for equality.
- // Don't do this for zero, because -0.0 is not fractional.
- Constant *RHSInt = LHSUnsigned
- ? ConstantExpr::getFPToUI(RHSC, IntTy)
- : ConstantExpr::getFPToSI(RHSC, IntTy);
- if (!RHS.isZero()) {
- bool Equal = LHSUnsigned
- ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
- : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
- if (!Equal) {
- // If we had a comparison against a fractional value, we have to adjust
- // the compare predicate and sometimes the value. RHSC is rounded towards
- // zero at this point.
- switch (Pred) {
- default: llvm_unreachable("Unexpected integer comparison!");
- case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
- return replaceInstUsesWith(I, Builder.getTrue());
- case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
- return replaceInstUsesWith(I, Builder.getFalse());
- case ICmpInst::ICMP_ULE:
- // (float)int <= 4.4 --> int <= 4
- // (float)int <= -4.4 --> false
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getFalse());
- break;
- case ICmpInst::ICMP_SLE:
- // (float)int <= 4.4 --> int <= 4
- // (float)int <= -4.4 --> int < -4
- if (RHS.isNegative())
- Pred = ICmpInst::ICMP_SLT;
- break;
- case ICmpInst::ICMP_ULT:
- // (float)int < -4.4 --> false
- // (float)int < 4.4 --> int <= 4
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getFalse());
- Pred = ICmpInst::ICMP_ULE;
- break;
- case ICmpInst::ICMP_SLT:
- // (float)int < -4.4 --> int < -4
- // (float)int < 4.4 --> int <= 4
- if (!RHS.isNegative())
- Pred = ICmpInst::ICMP_SLE;
- break;
- case ICmpInst::ICMP_UGT:
- // (float)int > 4.4 --> int > 4
- // (float)int > -4.4 --> true
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getTrue());
- break;
- case ICmpInst::ICMP_SGT:
- // (float)int > 4.4 --> int > 4
- // (float)int > -4.4 --> int >= -4
- if (RHS.isNegative())
- Pred = ICmpInst::ICMP_SGE;
- break;
- case ICmpInst::ICMP_UGE:
- // (float)int >= -4.4 --> true
- // (float)int >= 4.4 --> int > 4
- if (RHS.isNegative())
- return replaceInstUsesWith(I, Builder.getTrue());
- Pred = ICmpInst::ICMP_UGT;
- break;
- case ICmpInst::ICMP_SGE:
- // (float)int >= -4.4 --> int >= -4
- // (float)int >= 4.4 --> int > 4
- if (!RHS.isNegative())
- Pred = ICmpInst::ICMP_SGT;
- break;
- }
- }
- }
-
- // Lower this FP comparison into an appropriate integer version of the
- // comparison.
- return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
-}
-
-/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
-static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
- Constant *RHSC) {
- // When C is not 0.0 and infinities are not allowed:
- // (C / X) < 0.0 is a sign-bit test of X
- // (C / X) < 0.0 --> X < 0.0 (if C is positive)
- // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate)
- //
- // Proof:
- // Multiply (C / X) < 0.0 by X * X / C.
- // - X is non zero, if it is the flag 'ninf' is violated.
- // - C defines the sign of X * X * C. Thus it also defines whether to swap
- // the predicate. C is also non zero by definition.
- //
- // Thus X * X / C is non zero and the transformation is valid. [qed]
-
- FCmpInst::Predicate Pred = I.getPredicate();
-
- // Check that predicates are valid.
- if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) &&
- (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE))
- return nullptr;
-
- // Check that RHS operand is zero.
- if (!match(RHSC, m_AnyZeroFP()))
- return nullptr;
-
- // Check fastmath flags ('ninf').
- if (!LHSI->hasNoInfs() || !I.hasNoInfs())
- return nullptr;
-
- // Check the properties of the dividend. It must not be zero to avoid a
- // division by zero (see Proof).
- const APFloat *C;
- if (!match(LHSI->getOperand(0), m_APFloat(C)))
- return nullptr;
-
- if (C->isZero())
- return nullptr;
-
- // Get swapped predicate if necessary.
- if (C->isNegative())
- Pred = I.getSwappedPredicate();
-
- return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
-}
-
-/// Optimize fabs(X) compared with zero.
+ if (!isa<ConstantFP>(RHSC)) return nullptr;
+ const APFloat &RHS = cast<ConstantFP>(RHSC)->getValueAPF();
+
+ // Get the width of the mantissa. We don't want to hack on conversions that
+ // might lose information from the integer, e.g. "i64 -> float"
+ int MantissaWidth = LHSI->getType()->getFPMantissaWidth();
+ if (MantissaWidth == -1) return nullptr; // Unknown.
+
+ IntegerType *IntTy = cast<IntegerType>(LHSI->getOperand(0)->getType());
+
+ bool LHSUnsigned = isa<UIToFPInst>(LHSI);
+
+ if (I.isEquality()) {
+ FCmpInst::Predicate P = I.getPredicate();
+ bool IsExact = false;
+ APSInt RHSCvt(IntTy->getBitWidth(), LHSUnsigned);
+ RHS.convertToInteger(RHSCvt, APFloat::rmNearestTiesToEven, &IsExact);
+
+    // If the floating point constant isn't an integer value, we already know
+    // the result of any equality / inequality comparison against it.
+ if (!IsExact) {
+ // TODO: Can never be -0.0 and other non-representable values
+ APFloat RHSRoundInt(RHS);
+ RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (RHS != RHSRoundInt) {
+ if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
+ return replaceInstUsesWith(I, Builder.getFalse());
+
+ assert(P == FCmpInst::FCMP_ONE || P == FCmpInst::FCMP_UNE);
+ return replaceInstUsesWith(I, Builder.getTrue());
+ }
+ }
+
+ // TODO: If the constant is exactly representable, is it always OK to do
+ // equality compares as integer?
+ }
+
+  // Check that the input is converted from an integer type small enough that
+  // the conversion preserves all bits. TODO: check here for "known" sign bits.
+  // This would allow us to handle (fptosi (x >>s 62) to float) if x is i64,
+  // for example.
+ unsigned InputSize = IntTy->getScalarSizeInBits();
+
+ // Following test does NOT adjust InputSize downwards for signed inputs,
+ // because the most negative value still requires all the mantissa bits
+ // to distinguish it from one less than that value.
+ if ((int)InputSize > MantissaWidth) {
+ // Conversion would lose accuracy. Check if loss can impact comparison.
+ int Exp = ilogb(RHS);
+ if (Exp == APFloat::IEK_Inf) {
+ int MaxExponent = ilogb(APFloat::getLargest(RHS.getSemantics()));
+ if (MaxExponent < (int)InputSize - !LHSUnsigned)
+ // Conversion could create infinity.
+ return nullptr;
+ } else {
+      // Note that if RHS is zero or NaN, then Exp is negative
+      // and the first condition is trivially false.
+ if (MantissaWidth <= Exp && Exp <= (int)InputSize - !LHSUnsigned)
+ // Conversion could affect comparison.
+ return nullptr;
+ }
+ }
+
+ // Otherwise, we can potentially simplify the comparison. We know that it
+ // will always come through as an integer value and we know the constant is
+ // not a NAN (it would have been previously simplified).
+ assert(!RHS.isNaN() && "NaN comparison not already folded!");
+
+ ICmpInst::Predicate Pred;
+ switch (I.getPredicate()) {
+ default: llvm_unreachable("Unexpected predicate!");
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_OEQ:
+ Pred = ICmpInst::ICMP_EQ;
+ break;
+ case FCmpInst::FCMP_UGT:
+ case FCmpInst::FCMP_OGT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGT : ICmpInst::ICMP_SGT;
+ break;
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OGE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_SGE;
+ break;
+ case FCmpInst::FCMP_ULT:
+ case FCmpInst::FCMP_OLT:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULT : ICmpInst::ICMP_SLT;
+ break;
+ case FCmpInst::FCMP_ULE:
+ case FCmpInst::FCMP_OLE:
+ Pred = LHSUnsigned ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_SLE;
+ break;
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ONE:
+ Pred = ICmpInst::ICMP_NE;
+ break;
+ case FCmpInst::FCMP_ORD:
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case FCmpInst::FCMP_UNO:
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+
+ // Now we know that the APFloat is a normal number, zero or inf.
+
+ // See if the FP constant is too large for the integer. For example,
+ // comparing an i8 to 300.0.
+ unsigned IntWidth = IntTy->getScalarSizeInBits();
+
+ if (!LHSUnsigned) {
+ // If the RHS value is > SignedMax, fold the comparison. This handles +INF
+ // and large values.
+ APFloat SMax(RHS.getSemantics());
+ SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMax < RHS) { // smax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
+ Pred == ICmpInst::ICMP_SLE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // If the RHS value is > UnsignedMax, fold the comparison. This handles
+ // +INF and large values.
+ APFloat UMax(RHS.getSemantics());
+ UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMax < RHS) { // umax < 13123.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
+ Pred == ICmpInst::ICMP_ULE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ if (!LHSUnsigned) {
+ // See if the RHS value is < SignedMin.
+ APFloat SMin(RHS.getSemantics());
+ SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
+ APFloat::rmNearestTiesToEven);
+ if (SMin > RHS) { // smin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
+ Pred == ICmpInst::ICMP_SGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ } else {
+ // See if the RHS value is < UnsignedMin.
+ APFloat UMin(RHS.getSemantics());
+ UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
+ APFloat::rmNearestTiesToEven);
+ if (UMin > RHS) { // umin > 12312.0
+ if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
+ Pred == ICmpInst::ICMP_UGE)
+ return replaceInstUsesWith(I, Builder.getTrue());
+ return replaceInstUsesWith(I, Builder.getFalse());
+ }
+ }
+
+ // Okay, now we know that the FP constant fits in the range [SMIN, SMAX] or
+ // [0, UMAX], but it may still be fractional. See if it is fractional by
+ // casting the FP value to the integer value and back, checking for equality.
+ // Don't do this for zero, because -0.0 is not fractional.
+ Constant *RHSInt = LHSUnsigned
+ ? ConstantExpr::getFPToUI(RHSC, IntTy)
+ : ConstantExpr::getFPToSI(RHSC, IntTy);
+ if (!RHS.isZero()) {
+ bool Equal = LHSUnsigned
+ ? ConstantExpr::getUIToFP(RHSInt, RHSC->getType()) == RHSC
+ : ConstantExpr::getSIToFP(RHSInt, RHSC->getType()) == RHSC;
+ if (!Equal) {
+ // If we had a comparison against a fractional value, we have to adjust
+ // the compare predicate and sometimes the value. RHSC is rounded towards
+ // zero at this point.
+ switch (Pred) {
+ default: llvm_unreachable("Unexpected integer comparison!");
+ case ICmpInst::ICMP_NE: // (float)int != 4.4 --> true
+ return replaceInstUsesWith(I, Builder.getTrue());
+ case ICmpInst::ICMP_EQ: // (float)int == 4.4 --> false
+ return replaceInstUsesWith(I, Builder.getFalse());
+ case ICmpInst::ICMP_ULE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> false
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ break;
+ case ICmpInst::ICMP_SLE:
+ // (float)int <= 4.4 --> int <= 4
+ // (float)int <= -4.4 --> int < -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLT;
+ break;
+ case ICmpInst::ICMP_ULT:
+ // (float)int < -4.4 --> false
+ // (float)int < 4.4 --> int <= 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getFalse());
+ Pred = ICmpInst::ICMP_ULE;
+ break;
+ case ICmpInst::ICMP_SLT:
+ // (float)int < -4.4 --> int < -4
+ // (float)int < 4.4 --> int <= 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SLE;
+ break;
+ case ICmpInst::ICMP_UGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> true
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ break;
+ case ICmpInst::ICMP_SGT:
+ // (float)int > 4.4 --> int > 4
+ // (float)int > -4.4 --> int >= -4
+ if (RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGE;
+ break;
+ case ICmpInst::ICMP_UGE:
+ // (float)int >= -4.4 --> true
+ // (float)int >= 4.4 --> int > 4
+ if (RHS.isNegative())
+ return replaceInstUsesWith(I, Builder.getTrue());
+ Pred = ICmpInst::ICMP_UGT;
+ break;
+ case ICmpInst::ICMP_SGE:
+ // (float)int >= -4.4 --> int >= -4
+ // (float)int >= 4.4 --> int > 4
+ if (!RHS.isNegative())
+ Pred = ICmpInst::ICMP_SGT;
+ break;
+ }
+ }
+ }
+
+ // Lower this FP comparison into an appropriate integer version of the
+ // comparison.
+ return new ICmpInst(Pred, LHSI->getOperand(0), RHSInt);
+}
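+// A worked example of the folds above (illustrative, not from the upstream
+// source): for `fcmp olt (sitofp i8 %x to float), 3.0e2` the signed i8 range
+// is [-128, 127], so SMax (127.0) < 300.0 and the compare folds to `true`;
+// a fractional constant instead becomes an integer compare, e.g.
+// `fcmp olt (sitofp i8 %x to float), 4.4` --> `icmp sle i8 %x, 4`.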
+
+/// Fold (C / X) < 0.0 --> X < 0.0 if possible. Swap predicate if necessary.
+static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC) {
+ // When C is not 0.0 and infinities are not allowed:
+ // (C / X) < 0.0 is a sign-bit test of X
+ // (C / X) < 0.0 --> X < 0.0 (if C is positive)
+ // (C / X) < 0.0 --> X > 0.0 (if C is negative, swap the predicate)
+ //
+ // Proof:
+ // Multiply (C / X) < 0.0 by X * X / C.
+  //  - X is non-zero; if it were zero, the 'ninf' flag would be violated.
+ // - C defines the sign of X * X * C. Thus it also defines whether to swap
+ // the predicate. C is also non zero by definition.
+ //
+ // Thus X * X / C is non zero and the transformation is valid. [qed]
+
+ FCmpInst::Predicate Pred = I.getPredicate();
+
+ // Check that predicates are valid.
+ if ((Pred != FCmpInst::FCMP_OGT) && (Pred != FCmpInst::FCMP_OLT) &&
+ (Pred != FCmpInst::FCMP_OGE) && (Pred != FCmpInst::FCMP_OLE))
+ return nullptr;
+
+ // Check that RHS operand is zero.
+ if (!match(RHSC, m_AnyZeroFP()))
+ return nullptr;
+
+ // Check fastmath flags ('ninf').
+ if (!LHSI->hasNoInfs() || !I.hasNoInfs())
+ return nullptr;
+
+ // Check the properties of the dividend. It must not be zero to avoid a
+ // division by zero (see Proof).
+ const APFloat *C;
+ if (!match(LHSI->getOperand(0), m_APFloat(C)))
+ return nullptr;
+
+ if (C->isZero())
+ return nullptr;
+
+ // Get swapped predicate if necessary.
+ if (C->isNegative())
+ Pred = I.getSwappedPredicate();
+
+ return new FCmpInst(Pred, LHSI->getOperand(1), RHSC, "", &I);
+}
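+// Sketch (assumes 'ninf' on both the fdiv and the fcmp; values invented):
+//   %d = fdiv ninf float 2.0, %x
+//   %c = fcmp ninf olt float %d, 0.0   ; (2.0 / X) < 0.0
+// --> %c = fcmp olt float %x, 0.0      ; same sign test, directly on X
+// With a negative dividend such as -2.0 the predicate is swapped to ogt.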
+
+/// Optimize fabs(X) compared with zero.
static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombinerImpl &IC) {
- Value *X;
+ Value *X;
if (!match(I.getOperand(0), m_FAbs(m_Value(X))) ||
- !match(I.getOperand(1), m_PosZeroFP()))
- return nullptr;
-
- auto replacePredAndOp0 = [&IC](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
- I->setPredicate(P);
- return IC.replaceOperand(*I, 0, X);
- };
-
- switch (I.getPredicate()) {
- case FCmpInst::FCMP_UGE:
- case FCmpInst::FCMP_OLT:
- // fabs(X) >= 0.0 --> true
- // fabs(X) < 0.0 --> false
- llvm_unreachable("fcmp should have simplified");
-
- case FCmpInst::FCMP_OGT:
- // fabs(X) > 0.0 --> X != 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
-
- case FCmpInst::FCMP_UGT:
- // fabs(X) u> 0.0 --> X u!= 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
-
- case FCmpInst::FCMP_OLE:
- // fabs(X) <= 0.0 --> X == 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
-
- case FCmpInst::FCMP_ULE:
- // fabs(X) u<= 0.0 --> X u== 0.0
- return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
-
- case FCmpInst::FCMP_OGE:
- // fabs(X) >= 0.0 --> !isnan(X)
- assert(!I.hasNoNaNs() && "fcmp should have simplified");
- return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
-
- case FCmpInst::FCMP_ULT:
- // fabs(X) u< 0.0 --> isnan(X)
- assert(!I.hasNoNaNs() && "fcmp should have simplified");
- return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
-
- case FCmpInst::FCMP_OEQ:
- case FCmpInst::FCMP_UEQ:
- case FCmpInst::FCMP_ONE:
- case FCmpInst::FCMP_UNE:
- case FCmpInst::FCMP_ORD:
- case FCmpInst::FCMP_UNO:
- // Look through the fabs() because it doesn't change anything but the sign.
- // fabs(X) == 0.0 --> X == 0.0,
- // fabs(X) != 0.0 --> X != 0.0
- // isnan(fabs(X)) --> isnan(X)
- // !isnan(fabs(X) --> !isnan(X)
- return replacePredAndOp0(&I, I.getPredicate(), X);
-
- default:
- return nullptr;
- }
-}
-
+ !match(I.getOperand(1), m_PosZeroFP()))
+ return nullptr;
+
+ auto replacePredAndOp0 = [&IC](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
+ I->setPredicate(P);
+ return IC.replaceOperand(*I, 0, X);
+ };
+
+ switch (I.getPredicate()) {
+ case FCmpInst::FCMP_UGE:
+ case FCmpInst::FCMP_OLT:
+ // fabs(X) >= 0.0 --> true
+ // fabs(X) < 0.0 --> false
+ llvm_unreachable("fcmp should have simplified");
+
+ case FCmpInst::FCMP_OGT:
+ // fabs(X) > 0.0 --> X != 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_ONE, X);
+
+ case FCmpInst::FCMP_UGT:
+ // fabs(X) u> 0.0 --> X u!= 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UNE, X);
+
+ case FCmpInst::FCMP_OLE:
+ // fabs(X) <= 0.0 --> X == 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_OEQ, X);
+
+ case FCmpInst::FCMP_ULE:
+ // fabs(X) u<= 0.0 --> X u== 0.0
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UEQ, X);
+
+ case FCmpInst::FCMP_OGE:
+ // fabs(X) >= 0.0 --> !isnan(X)
+ assert(!I.hasNoNaNs() && "fcmp should have simplified");
+ return replacePredAndOp0(&I, FCmpInst::FCMP_ORD, X);
+
+ case FCmpInst::FCMP_ULT:
+ // fabs(X) u< 0.0 --> isnan(X)
+ assert(!I.hasNoNaNs() && "fcmp should have simplified");
+ return replacePredAndOp0(&I, FCmpInst::FCMP_UNO, X);
+
+ case FCmpInst::FCMP_OEQ:
+ case FCmpInst::FCMP_UEQ:
+ case FCmpInst::FCMP_ONE:
+ case FCmpInst::FCMP_UNE:
+ case FCmpInst::FCMP_ORD:
+ case FCmpInst::FCMP_UNO:
+ // Look through the fabs() because it doesn't change anything but the sign.
+ // fabs(X) == 0.0 --> X == 0.0,
+ // fabs(X) != 0.0 --> X != 0.0
+ // isnan(fabs(X)) --> isnan(X)
+ // !isnan(fabs(X) --> !isnan(X)
+    // !isnan(fabs(X)) --> !isnan(X)
+
+ default:
+ return nullptr;
+ }
+}
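+// Example of the fabs folds above (sketch only):
+//   %a = call float @llvm.fabs.f32(float %x)
+//   %c = fcmp ogt float %a, 0.0        ; fabs(X) > +0.0
+// -->
+//   %c = fcmp one float %x, 0.0        ; X != 0.0, ordered
+// fabs only clears the sign bit, so only the zero/NaN distinction remains.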
+
Instruction *InstCombinerImpl::visitFCmpInst(FCmpInst &I) {
- bool Changed = false;
-
- /// Orders the operands of the compare so that they are listed from most
- /// complex to least complex. This puts constants before unary operators,
- /// before binary operators.
- if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
- I.swapOperands();
- Changed = true;
- }
-
- const CmpInst::Predicate Pred = I.getPredicate();
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- // Simplify 'fcmp pred X, X'
- Type *OpType = Op0->getType();
- assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
- if (Op0 == Op1) {
- switch (Pred) {
- default: break;
- case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
- case FCmpInst::FCMP_ULT: // True if unordered or less than
- case FCmpInst::FCMP_UGT: // True if unordered or greater than
- case FCmpInst::FCMP_UNE: // True if unordered or not equal
- // Canonicalize these to be 'fcmp uno %X, 0.0'.
- I.setPredicate(FCmpInst::FCMP_UNO);
- I.setOperand(1, Constant::getNullValue(OpType));
- return &I;
-
- case FCmpInst::FCMP_ORD: // True if ordered (no nans)
- case FCmpInst::FCMP_OEQ: // True if ordered and equal
- case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
- case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
- // Canonicalize these to be 'fcmp ord %X, 0.0'.
- I.setPredicate(FCmpInst::FCMP_ORD);
- I.setOperand(1, Constant::getNullValue(OpType));
- return &I;
- }
- }
-
- // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
- // then canonicalize the operand to 0.0.
- if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
- if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI))
- return replaceOperand(I, 0, ConstantFP::getNullValue(OpType));
-
- if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI))
- return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
- }
-
- // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
- Value *X, *Y;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
- return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
-
- // Test if the FCmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- if (I.hasOneUse())
- if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
- Value *A, *B;
- SelectPatternResult SPR = matchSelectPattern(SI, A, B);
- if (SPR.Flavor != SPF_UNKNOWN)
- return nullptr;
- }
-
- // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
- // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
- if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP()))
- return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
-
- // Handle fcmp with instruction LHS and constant RHS.
- Instruction *LHSI;
- Constant *RHSC;
- if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
- switch (LHSI->getOpcode()) {
- case Instruction::PHI:
- // Only fold fcmp into the PHI if the phi and fcmp are in the same
- // block. If in the same block, we're encouraging jump threading. If
- // not, we are just pessimizing the code by making an i1 phi.
- if (LHSI->getParent() == I.getParent())
- if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
- return NV;
- break;
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
- return NV;
- break;
- case Instruction::FDiv:
- if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
- return NV;
- break;
- case Instruction::Load:
- if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
- if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
- if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
- !cast<LoadInst>(LHSI)->isVolatile())
- if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
- return Res;
- break;
- }
- }
-
- if (Instruction *R = foldFabsWithFcmpZero(I, *this))
- return R;
-
- if (match(Op0, m_FNeg(m_Value(X)))) {
- // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
- Constant *C;
- if (match(Op1, m_Constant(C))) {
- Constant *NegC = ConstantExpr::getFNeg(C);
- return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
- }
- }
-
- if (match(Op0, m_FPExt(m_Value(X)))) {
- // fcmp (fpext X), (fpext Y) -> fcmp X, Y
- if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
- return new FCmpInst(Pred, X, Y, "", &I);
-
- // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
- const APFloat *C;
- if (match(Op1, m_APFloat(C))) {
- const fltSemantics &FPSem =
- X->getType()->getScalarType()->getFltSemantics();
- bool Lossy;
- APFloat TruncC = *C;
- TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
-
- // Avoid lossy conversions and denormals.
- // Zero is a special case that's OK to convert.
- APFloat Fabs = TruncC;
- Fabs.clearSign();
- if (!Lossy &&
- (!(Fabs < APFloat::getSmallestNormalized(FPSem)) || Fabs.isZero())) {
- Constant *NewC = ConstantFP::get(X->getType(), TruncC);
- return new FCmpInst(Pred, X, NewC, "", &I);
- }
- }
- }
-
- if (I.getType()->isVectorTy())
- if (Instruction *Res = foldVectorCmp(I, Builder))
- return Res;
-
- return Changed ? &I : nullptr;
-}
+ bool Changed = false;
+
+ /// Orders the operands of the compare so that they are listed from most
+ /// complex to least complex. This puts constants before unary operators,
+ /// before binary operators.
+ if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1))) {
+ I.swapOperands();
+ Changed = true;
+ }
+
+ const CmpInst::Predicate Pred = I.getPredicate();
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifyFCmpInst(Pred, Op0, Op1, I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ // Simplify 'fcmp pred X, X'
+ Type *OpType = Op0->getType();
+ assert(OpType == Op1->getType() && "fcmp with different-typed operands?");
+ if (Op0 == Op1) {
+ switch (Pred) {
+ default: break;
+ case FCmpInst::FCMP_UNO: // True if unordered: isnan(X) | isnan(Y)
+ case FCmpInst::FCMP_ULT: // True if unordered or less than
+ case FCmpInst::FCMP_UGT: // True if unordered or greater than
+ case FCmpInst::FCMP_UNE: // True if unordered or not equal
+ // Canonicalize these to be 'fcmp uno %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_UNO);
+ I.setOperand(1, Constant::getNullValue(OpType));
+ return &I;
+
+ case FCmpInst::FCMP_ORD: // True if ordered (no nans)
+ case FCmpInst::FCMP_OEQ: // True if ordered and equal
+ case FCmpInst::FCMP_OGE: // True if ordered and greater than or equal
+ case FCmpInst::FCMP_OLE: // True if ordered and less than or equal
+ // Canonicalize these to be 'fcmp ord %X, 0.0'.
+ I.setPredicate(FCmpInst::FCMP_ORD);
+ I.setOperand(1, Constant::getNullValue(OpType));
+ return &I;
+ }
+ }
+
+ // If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
+ // then canonicalize the operand to 0.0.
+ if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
+ if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI))
+ return replaceOperand(I, 0, ConstantFP::getNullValue(OpType));
+
+ if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
+ }
+
+ // fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
+ Value *X, *Y;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return new FCmpInst(I.getSwappedPredicate(), X, Y, "", &I);
+
+ // Test if the FCmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (I.hasOneUse())
+ if (SelectInst *SI = dyn_cast<SelectInst>(I.user_back())) {
+ Value *A, *B;
+ SelectPatternResult SPR = matchSelectPattern(SI, A, B);
+ if (SPR.Flavor != SPF_UNKNOWN)
+ return nullptr;
+ }
+
+ // The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
+ // fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
+ if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP()))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
+
+ // Handle fcmp with instruction LHS and constant RHS.
+ Instruction *LHSI;
+ Constant *RHSC;
+ if (match(Op0, m_Instruction(LHSI)) && match(Op1, m_Constant(RHSC))) {
+ switch (LHSI->getOpcode()) {
+ case Instruction::PHI:
+ // Only fold fcmp into the PHI if the phi and fcmp are in the same
+ // block. If in the same block, we're encouraging jump threading. If
+ // not, we are just pessimizing the code by making an i1 phi.
+ if (LHSI->getParent() == I.getParent())
+ if (Instruction *NV = foldOpIntoPhi(I, cast<PHINode>(LHSI)))
+ return NV;
+ break;
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ if (Instruction *NV = foldFCmpIntToFPConst(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::FDiv:
+ if (Instruction *NV = foldFCmpReciprocalAndZero(I, LHSI, RHSC))
+ return NV;
+ break;
+ case Instruction::Load:
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(LHSI->getOperand(0)))
+ if (auto *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0)))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer() &&
+ !cast<LoadInst>(LHSI)->isVolatile())
+ if (Instruction *Res = foldCmpLoadFromIndexedGlobal(GEP, GV, I))
+ return Res;
+ break;
+ }
+ }
+
+ if (Instruction *R = foldFabsWithFcmpZero(I, *this))
+ return R;
+
+ if (match(Op0, m_FNeg(m_Value(X)))) {
+ // fcmp pred (fneg X), C --> fcmp swap(pred) X, -C
+ Constant *C;
+ if (match(Op1, m_Constant(C))) {
+ Constant *NegC = ConstantExpr::getFNeg(C);
+ return new FCmpInst(I.getSwappedPredicate(), X, NegC, "", &I);
+ }
+ }
+
+ if (match(Op0, m_FPExt(m_Value(X)))) {
+ // fcmp (fpext X), (fpext Y) -> fcmp X, Y
+ if (match(Op1, m_FPExt(m_Value(Y))) && X->getType() == Y->getType())
+ return new FCmpInst(Pred, X, Y, "", &I);
+
+ // fcmp (fpext X), C -> fcmp X, (fptrunc C) if fptrunc is lossless
+ const APFloat *C;
+ if (match(Op1, m_APFloat(C))) {
+ const fltSemantics &FPSem =
+ X->getType()->getScalarType()->getFltSemantics();
+ bool Lossy;
+ APFloat TruncC = *C;
+ TruncC.convert(FPSem, APFloat::rmNearestTiesToEven, &Lossy);
+
+ // Avoid lossy conversions and denormals.
+ // Zero is a special case that's OK to convert.
+ APFloat Fabs = TruncC;
+ Fabs.clearSign();
+ if (!Lossy &&
+ (!(Fabs < APFloat::getSmallestNormalized(FPSem)) || Fabs.isZero())) {
+ Constant *NewC = ConstantFP::get(X->getType(), TruncC);
+ return new FCmpInst(Pred, X, NewC, "", &I);
+ }
+ }
+ }
+
+ if (I.getType()->isVectorTy())
+ if (Instruction *Res = foldVectorCmp(I, Builder))
+ return Res;
+
+ return Changed ? &I : nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
index 68d36a72db..79e9d5c46c 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -1,38 +1,38 @@
-//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file provides internal interfaces used to implement the InstCombine.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
-#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
-
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineInternal.h - InstCombine pass internals -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file provides internal interfaces used to implement the InstCombine.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-
-#define DEBUG_TYPE "instcombine"
-
-using namespace llvm::PatternMatch;
-
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+
+#define DEBUG_TYPE "instcombine"
+
+using namespace llvm::PatternMatch;
+
// As a default, let's assume that we want to be aggressive,
// and attempt to traverse with no limits in attempt to sink negation.
static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
@@ -41,26 +41,26 @@ static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
// fairly small number of new instructions.
static constexpr unsigned NegatorMaxNodesSSO = 16;
-namespace llvm {
-
-class AAResults;
-class APInt;
-class AssumptionCache;
-class BlockFrequencyInfo;
-class DataLayout;
-class DominatorTree;
-class GEPOperator;
-class GlobalVariable;
-class LoopInfo;
-class OptimizationRemarkEmitter;
-class ProfileSummaryInfo;
-class TargetLibraryInfo;
-class User;
-
+namespace llvm {
+
+class AAResults;
+class APInt;
+class AssumptionCache;
+class BlockFrequencyInfo;
+class DataLayout;
+class DominatorTree;
+class GEPOperator;
+class GlobalVariable;
+class LoopInfo;
+class OptimizationRemarkEmitter;
+class ProfileSummaryInfo;
+class TargetLibraryInfo;
+class User;
+
class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final
: public InstCombiner,
public InstVisitor<InstCombinerImpl, Instruction *> {
-public:
+public:
InstCombinerImpl(InstCombineWorklist &Worklist, BuilderTy &Builder,
bool MinimizeSize, AAResults *AA, AssumptionCache &AC,
TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
@@ -69,551 +69,551 @@ public:
const DataLayout &DL, LoopInfo *LI)
: InstCombiner(Worklist, Builder, MinimizeSize, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, DL, LI) {}
-
+
virtual ~InstCombinerImpl() {}
-
- /// Run the combiner over the entire worklist until it is empty.
- ///
- /// \returns true if the IR is changed.
- bool run();
-
- // Visitation implementation - Implement instruction combining for different
- // instruction types. The semantics are as follows:
- // Return Value:
- // null - No change was made
- // I - Change was made, I is still valid, I may be dead though
- // otherwise - Change was made, replace I with returned instruction
- //
- Instruction *visitFNeg(UnaryOperator &I);
- Instruction *visitAdd(BinaryOperator &I);
- Instruction *visitFAdd(BinaryOperator &I);
- Value *OptimizePointerDifference(
- Value *LHS, Value *RHS, Type *Ty, bool isNUW);
- Instruction *visitSub(BinaryOperator &I);
- Instruction *visitFSub(BinaryOperator &I);
- Instruction *visitMul(BinaryOperator &I);
- Instruction *visitFMul(BinaryOperator &I);
- Instruction *visitURem(BinaryOperator &I);
- Instruction *visitSRem(BinaryOperator &I);
- Instruction *visitFRem(BinaryOperator &I);
- bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I);
- Instruction *commonIRemTransforms(BinaryOperator &I);
- Instruction *commonIDivTransforms(BinaryOperator &I);
- Instruction *visitUDiv(BinaryOperator &I);
- Instruction *visitSDiv(BinaryOperator &I);
- Instruction *visitFDiv(BinaryOperator &I);
- Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
- Instruction *visitAnd(BinaryOperator &I);
- Instruction *visitOr(BinaryOperator &I);
+
+ /// Run the combiner over the entire worklist until it is empty.
+ ///
+ /// \returns true if the IR is changed.
+ bool run();
+
+ // Visitation implementation - Implement instruction combining for different
+ // instruction types. The semantics are as follows:
+ // Return Value:
+ // null - No change was made
+ // I - Change was made, I is still valid, I may be dead though
+ // otherwise - Change was made, replace I with returned instruction
+ //
+ Instruction *visitFNeg(UnaryOperator &I);
+ Instruction *visitAdd(BinaryOperator &I);
+ Instruction *visitFAdd(BinaryOperator &I);
+ Value *OptimizePointerDifference(
+ Value *LHS, Value *RHS, Type *Ty, bool isNUW);
+ Instruction *visitSub(BinaryOperator &I);
+ Instruction *visitFSub(BinaryOperator &I);
+ Instruction *visitMul(BinaryOperator &I);
+ Instruction *visitFMul(BinaryOperator &I);
+ Instruction *visitURem(BinaryOperator &I);
+ Instruction *visitSRem(BinaryOperator &I);
+ Instruction *visitFRem(BinaryOperator &I);
+ bool simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I);
+ Instruction *commonIRemTransforms(BinaryOperator &I);
+ Instruction *commonIDivTransforms(BinaryOperator &I);
+ Instruction *visitUDiv(BinaryOperator &I);
+ Instruction *visitSDiv(BinaryOperator &I);
+ Instruction *visitFDiv(BinaryOperator &I);
+ Value *simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, bool Inverted);
+ Instruction *visitAnd(BinaryOperator &I);
+ Instruction *visitOr(BinaryOperator &I);
bool sinkNotIntoOtherHandOfAndOrOr(BinaryOperator &I);
- Instruction *visitXor(BinaryOperator &I);
- Instruction *visitShl(BinaryOperator &I);
- Value *reassociateShiftAmtsOfTwoSameDirectionShifts(
- BinaryOperator *Sh0, const SimplifyQuery &SQ,
- bool AnalyzeForSignBitExtraction = false);
- Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
- BinaryOperator &I);
- Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
- BinaryOperator &OldAShr);
- Instruction *visitAShr(BinaryOperator &I);
- Instruction *visitLShr(BinaryOperator &I);
- Instruction *commonShiftTransforms(BinaryOperator &I);
- Instruction *visitFCmpInst(FCmpInst &I);
+ Instruction *visitXor(BinaryOperator &I);
+ Instruction *visitShl(BinaryOperator &I);
+ Value *reassociateShiftAmtsOfTwoSameDirectionShifts(
+ BinaryOperator *Sh0, const SimplifyQuery &SQ,
+ bool AnalyzeForSignBitExtraction = false);
+ Instruction *canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(
+ BinaryOperator &I);
+ Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract(
+ BinaryOperator &OldAShr);
+ Instruction *visitAShr(BinaryOperator &I);
+ Instruction *visitLShr(BinaryOperator &I);
+ Instruction *commonShiftTransforms(BinaryOperator &I);
+ Instruction *visitFCmpInst(FCmpInst &I);
CmpInst *canonicalizeICmpPredicate(CmpInst &I);
- Instruction *visitICmpInst(ICmpInst &I);
- Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
- BinaryOperator &I);
- Instruction *commonCastTransforms(CastInst &CI);
- Instruction *commonPointerCastTransforms(CastInst &CI);
- Instruction *visitTrunc(TruncInst &CI);
- Instruction *visitZExt(ZExtInst &CI);
- Instruction *visitSExt(SExtInst &CI);
- Instruction *visitFPTrunc(FPTruncInst &CI);
- Instruction *visitFPExt(CastInst &CI);
- Instruction *visitFPToUI(FPToUIInst &FI);
- Instruction *visitFPToSI(FPToSIInst &FI);
- Instruction *visitUIToFP(CastInst &CI);
- Instruction *visitSIToFP(CastInst &CI);
- Instruction *visitPtrToInt(PtrToIntInst &CI);
- Instruction *visitIntToPtr(IntToPtrInst &CI);
- Instruction *visitBitCast(BitCastInst &CI);
- Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
- Instruction *foldItoFPtoI(CastInst &FI);
- Instruction *visitSelectInst(SelectInst &SI);
- Instruction *visitCallInst(CallInst &CI);
- Instruction *visitInvokeInst(InvokeInst &II);
- Instruction *visitCallBrInst(CallBrInst &CBI);
-
- Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
- Instruction *visitPHINode(PHINode &PN);
- Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
- Instruction *visitAllocaInst(AllocaInst &AI);
- Instruction *visitAllocSite(Instruction &FI);
- Instruction *visitFree(CallInst &FI);
- Instruction *visitLoadInst(LoadInst &LI);
- Instruction *visitStoreInst(StoreInst &SI);
- Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
- Instruction *visitUnconditionalBranchInst(BranchInst &BI);
- Instruction *visitBranchInst(BranchInst &BI);
- Instruction *visitFenceInst(FenceInst &FI);
- Instruction *visitSwitchInst(SwitchInst &SI);
- Instruction *visitReturnInst(ReturnInst &RI);
+ Instruction *visitICmpInst(ICmpInst &I);
+ Instruction *FoldShiftByConstant(Value *Op0, Constant *Op1,
+ BinaryOperator &I);
+ Instruction *commonCastTransforms(CastInst &CI);
+ Instruction *commonPointerCastTransforms(CastInst &CI);
+ Instruction *visitTrunc(TruncInst &CI);
+ Instruction *visitZExt(ZExtInst &CI);
+ Instruction *visitSExt(SExtInst &CI);
+ Instruction *visitFPTrunc(FPTruncInst &CI);
+ Instruction *visitFPExt(CastInst &CI);
+ Instruction *visitFPToUI(FPToUIInst &FI);
+ Instruction *visitFPToSI(FPToSIInst &FI);
+ Instruction *visitUIToFP(CastInst &CI);
+ Instruction *visitSIToFP(CastInst &CI);
+ Instruction *visitPtrToInt(PtrToIntInst &CI);
+ Instruction *visitIntToPtr(IntToPtrInst &CI);
+ Instruction *visitBitCast(BitCastInst &CI);
+ Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
+ Instruction *foldItoFPtoI(CastInst &FI);
+ Instruction *visitSelectInst(SelectInst &SI);
+ Instruction *visitCallInst(CallInst &CI);
+ Instruction *visitInvokeInst(InvokeInst &II);
+ Instruction *visitCallBrInst(CallBrInst &CBI);
+
+ Instruction *SliceUpIllegalIntegerPHI(PHINode &PN);
+ Instruction *visitPHINode(PHINode &PN);
+ Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+ Instruction *visitAllocaInst(AllocaInst &AI);
+ Instruction *visitAllocSite(Instruction &FI);
+ Instruction *visitFree(CallInst &FI);
+ Instruction *visitLoadInst(LoadInst &LI);
+ Instruction *visitStoreInst(StoreInst &SI);
+ Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
+ Instruction *visitUnconditionalBranchInst(BranchInst &BI);
+ Instruction *visitBranchInst(BranchInst &BI);
+ Instruction *visitFenceInst(FenceInst &FI);
+ Instruction *visitSwitchInst(SwitchInst &SI);
+ Instruction *visitReturnInst(ReturnInst &RI);
Instruction *visitUnreachableInst(UnreachableInst &I);
Instruction *
foldAggregateConstructionIntoAggregateReuse(InsertValueInst &OrigIVI);
- Instruction *visitInsertValueInst(InsertValueInst &IV);
- Instruction *visitInsertElementInst(InsertElementInst &IE);
- Instruction *visitExtractElementInst(ExtractElementInst &EI);
- Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
- Instruction *visitExtractValueInst(ExtractValueInst &EV);
- Instruction *visitLandingPadInst(LandingPadInst &LI);
- Instruction *visitVAEndInst(VAEndInst &I);
- Instruction *visitFreeze(FreezeInst &I);
-
- /// Specify what to return for unhandled instructions.
- Instruction *visitInstruction(Instruction &I) { return nullptr; }
-
- /// True when DB dominates all uses of DI except UI.
- /// UI must be in the same block as DI.
- /// The routine checks that the DI parent and DB are different.
- bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
- const BasicBlock *DB) const;
-
- /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
- bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
- const unsigned SIOpd);
-
- LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
- const Twine &Suffix = "");
-
-private:
- bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
- bool shouldChangeType(Type *From, Type *To) const;
- Value *dyn_castNegVal(Value *V) const;
- Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
- SmallVectorImpl<Value *> &NewIndices);
-
- /// Classify whether a cast is worth optimizing.
- ///
- /// This is a helper to decide whether the simplification of
- /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
- ///
- /// \param CI The cast we are interested in.
- ///
- /// \return true if this cast actually results in any code being generated and
- /// if it cannot already be eliminated by some other transformation.
- bool shouldOptimizeCast(CastInst *CI);
-
- /// Try to optimize a sequence of instructions checking if an operation
- /// on LHS and RHS overflows.
- ///
- /// If this overflow check is done via one of the overflow check intrinsics,
- /// then CtxI has to be the call instruction calling that intrinsic. If this
- /// overflow check is done by arithmetic followed by a compare, then CtxI has
- /// to be the arithmetic instruction.
- ///
- /// If a simplification is possible, stores the simplified result of the
- /// operation in OperationResult and the result of the overflow check in
- /// OverflowResult, and returns true. If no simplification is possible,
- /// returns false.
- bool OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, bool IsSigned,
- Value *LHS, Value *RHS,
- Instruction &CtxI, Value *&OperationResult,
- Constant *&OverflowResult);
-
- Instruction *visitCallBase(CallBase &Call);
- Instruction *tryOptimizeCall(CallInst *CI);
- bool transformConstExprCastCall(CallBase &Call);
- Instruction *transformCallThroughTrampoline(CallBase &Call,
- IntrinsicInst &Tramp);
-
- Value *simplifyMaskedLoad(IntrinsicInst &II);
- Instruction *simplifyMaskedStore(IntrinsicInst &II);
- Instruction *simplifyMaskedGather(IntrinsicInst &II);
- Instruction *simplifyMaskedScatter(IntrinsicInst &II);
-
- /// Transform (zext icmp) to bitwise / integer operations in order to
- /// eliminate it.
- ///
- /// \param ICI The icmp of the (zext icmp) pair we are interested in.
- /// \param CI The zext of the (zext icmp) pair we are interested in.
- /// \param DoTransform Pass false to just test whether the given (zext icmp)
- /// would be transformed. Pass true to actually perform the transformation.
- ///
- /// \return null if the transformation cannot be performed. If the
- /// transformation can be performed the new instruction that replaces the
- /// (zext icmp) pair will be returned (if \p DoTransform is false the
- /// unmodified \p ICI will be returned in this case).
- Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
- bool DoTransform = true);
-
- Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
-
- bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowAdd(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedAdd(LHS, RHS, CxtI)
- : willNotOverflowUnsignedAdd(LHS, RHS, CxtI);
- }
-
- bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowSub(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedSub(LHS, RHS, CxtI)
- : willNotOverflowUnsignedSub(LHS, RHS, CxtI);
- }
-
- bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI) const {
- return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
- OverflowResult::NeverOverflows;
- }
-
- bool willNotOverflowMul(const Value *LHS, const Value *RHS,
- const Instruction &CxtI, bool IsSigned) const {
- return IsSigned ? willNotOverflowSignedMul(LHS, RHS, CxtI)
- : willNotOverflowUnsignedMul(LHS, RHS, CxtI);
- }
-
- bool willNotOverflow(BinaryOperator::BinaryOps Opcode, const Value *LHS,
- const Value *RHS, const Instruction &CxtI,
- bool IsSigned) const {
- switch (Opcode) {
- case Instruction::Add: return willNotOverflowAdd(LHS, RHS, CxtI, IsSigned);
- case Instruction::Sub: return willNotOverflowSub(LHS, RHS, CxtI, IsSigned);
- case Instruction::Mul: return willNotOverflowMul(LHS, RHS, CxtI, IsSigned);
- default: llvm_unreachable("Unexpected opcode for overflow query");
- }
- }
-
- Value *EmitGEPOffset(User *GEP);
- Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
- Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
- Instruction *narrowBinOp(TruncInst &Trunc);
- Instruction *narrowMaskedBinOp(BinaryOperator &And);
- Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
+ Instruction *visitInsertValueInst(InsertValueInst &IV);
+ Instruction *visitInsertElementInst(InsertElementInst &IE);
+ Instruction *visitExtractElementInst(ExtractElementInst &EI);
+ Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ Instruction *visitExtractValueInst(ExtractValueInst &EV);
+ Instruction *visitLandingPadInst(LandingPadInst &LI);
+ Instruction *visitVAEndInst(VAEndInst &I);
+ Instruction *visitFreeze(FreezeInst &I);
+
+ /// Specify what to return for unhandled instructions.
+ Instruction *visitInstruction(Instruction &I) { return nullptr; }
+
+ /// True when DB dominates all uses of DI except UI.
+ /// UI must be in the same block as DI.
+ /// The routine checks that the DI parent and DB are different.
+ bool dominatesAllUses(const Instruction *DI, const Instruction *UI,
+ const BasicBlock *DB) const;
+
+ /// Try to replace select with select operand SIOpd in SI-ICmp sequence.
+ bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
+ const unsigned SIOpd);
+
+ LoadInst *combineLoadToNewType(LoadInst &LI, Type *NewTy,
+ const Twine &Suffix = "");
+
+private:
+ bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+ bool shouldChangeType(Type *From, Type *To) const;
+ Value *dyn_castNegVal(Value *V) const;
+ Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
+ SmallVectorImpl<Value *> &NewIndices);
+
+ /// Classify whether a cast is worth optimizing.
+ ///
+ /// This is a helper to decide whether the simplification of
+ /// logic(cast(A), cast(B)) to cast(logic(A, B)) should be performed.
+ ///
+ /// \param CI The cast we are interested in.
+ ///
+ /// \return true if this cast actually results in any code being generated and
+ /// if it cannot already be eliminated by some other transformation.
+ bool shouldOptimizeCast(CastInst *CI);
+
+ /// Try to optimize a sequence of instructions checking if an operation
+ /// on LHS and RHS overflows.
+ ///
+ /// If this overflow check is done via one of the overflow check intrinsics,
+ /// then CtxI has to be the call instruction calling that intrinsic. If this
+ /// overflow check is done by arithmetic followed by a compare, then CtxI has
+ /// to be the arithmetic instruction.
+ ///
+ /// If a simplification is possible, stores the simplified result of the
+ /// operation in OperationResult and the result of the overflow check in
+ /// OverflowResult, and returns true. If no simplification is possible,
+ /// returns false.
+ bool OptimizeOverflowCheck(Instruction::BinaryOps BinaryOp, bool IsSigned,
+ Value *LHS, Value *RHS,
+ Instruction &CtxI, Value *&OperationResult,
+ Constant *&OverflowResult);
+
+ Instruction *visitCallBase(CallBase &Call);
+ Instruction *tryOptimizeCall(CallInst *CI);
+ bool transformConstExprCastCall(CallBase &Call);
+ Instruction *transformCallThroughTrampoline(CallBase &Call,
+ IntrinsicInst &Tramp);
+
+ Value *simplifyMaskedLoad(IntrinsicInst &II);
+ Instruction *simplifyMaskedStore(IntrinsicInst &II);
+ Instruction *simplifyMaskedGather(IntrinsicInst &II);
+ Instruction *simplifyMaskedScatter(IntrinsicInst &II);
+
+ /// Transform (zext icmp) to bitwise / integer operations in order to
+ /// eliminate it.
+ ///
+ /// \param ICI The icmp of the (zext icmp) pair we are interested in.
+ /// \param CI The zext of the (zext icmp) pair we are interested in.
+ /// \param DoTransform Pass false to just test whether the given (zext icmp)
+ /// would be transformed. Pass true to actually perform the transformation.
+ ///
+ /// \return null if the transformation cannot be performed. If the
+ /// transformation can be performed the new instruction that replaces the
+ /// (zext icmp) pair will be returned (if \p DoTransform is false the
+ /// unmodified \p ICI will be returned in this case).
+ Instruction *transformZExtICmp(ICmpInst *ICI, ZExtInst &CI,
+ bool DoTransform = true);
+
+ Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI);
+
+ bool willNotOverflowSignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowAdd(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedAdd(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedAdd(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflowSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedSub(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowSub(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedSub(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedSub(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflowSignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForSignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowUnsignedMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI) const {
+ return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) ==
+ OverflowResult::NeverOverflows;
+ }
+
+ bool willNotOverflowMul(const Value *LHS, const Value *RHS,
+ const Instruction &CxtI, bool IsSigned) const {
+ return IsSigned ? willNotOverflowSignedMul(LHS, RHS, CxtI)
+ : willNotOverflowUnsignedMul(LHS, RHS, CxtI);
+ }
+
+ bool willNotOverflow(BinaryOperator::BinaryOps Opcode, const Value *LHS,
+ const Value *RHS, const Instruction &CxtI,
+ bool IsSigned) const {
+ switch (Opcode) {
+ case Instruction::Add: return willNotOverflowAdd(LHS, RHS, CxtI, IsSigned);
+ case Instruction::Sub: return willNotOverflowSub(LHS, RHS, CxtI, IsSigned);
+ case Instruction::Mul: return willNotOverflowMul(LHS, RHS, CxtI, IsSigned);
+ default: llvm_unreachable("Unexpected opcode for overflow query");
+ }
+ }
+
+ Value *EmitGEPOffset(User *GEP);
+ Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN);
+ Instruction *foldCastedBitwiseLogic(BinaryOperator &I);
+ Instruction *narrowBinOp(TruncInst &Trunc);
+ Instruction *narrowMaskedBinOp(BinaryOperator &And);
+ Instruction *narrowMathIfNoOverflow(BinaryOperator &I);
Instruction *narrowFunnelShift(TruncInst &Trunc);
- Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
- Instruction *matchSAddSubSat(SelectInst &MinMax1);
-
+ Instruction *optimizeBitCastFromPhi(CastInst &CI, PHINode *PN);
+ Instruction *matchSAddSubSat(SelectInst &MinMax1);
+
void freelyInvertAllUsersOf(Value *V);
- /// Determine if a pair of casts can be replaced by a single cast.
- ///
- /// \param CI1 The first of a pair of casts.
- /// \param CI2 The second of a pair of casts.
- ///
- /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
- /// Instruction::CastOps value for a cast that can replace the pair, casting
- /// CI1->getSrcTy() to CI2->getDstTy().
- ///
- /// \see CastInst::isEliminableCastPair
- Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
- const CastInst *CI2);
-
- Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And);
- Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
- Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
-
- /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
- /// NOTE: Unlike most of instcombine, this returns a Value which should
- /// already be inserted into the function.
- Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd);
-
- Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
- BinaryOperator &Logic);
- Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
- Value *getSelectCondition(Value *A, Value *B);
-
- Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
- Instruction *foldFPSignBitOps(BinaryOperator &I);
-
-public:
- /// Inserts an instruction \p New before instruction \p Old
- ///
- /// Also adds the new instruction to the worklist and returns \p New so that
- /// it is suitable for use as the return from the visitation patterns.
- Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
- assert(New && !New->getParent() &&
- "New instruction already inserted into a basic block!");
- BasicBlock *BB = Old.getParent();
- BB->getInstList().insert(Old.getIterator(), New); // Insert inst
- Worklist.add(New);
- return New;
- }
-
- /// Same as InsertNewInstBefore, but also sets the debug loc.
- Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
- New->setDebugLoc(Old.getDebugLoc());
- return InsertNewInstBefore(New, Old);
- }
-
- /// A combiner-aware RAUW-like routine.
- ///
- /// This method is to be used when an instruction is found to be dead,
- /// replaceable with another preexisting expression. Here we add all uses of
- /// I to the worklist, replace all uses of I with the new value, then return
- /// I, so that the inst combiner will know that I was modified.
- Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
- // If there are no uses to replace, then we return nullptr to indicate that
- // no changes were made to the program.
- if (I.use_empty()) return nullptr;
-
- Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
-
- // If we are replacing the instruction with itself, this must be in a
- // segment of unreachable code, so just clobber the instruction.
- if (&I == V)
- V = UndefValue::get(I.getType());
-
- LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
- << " with " << *V << '\n');
-
- I.replaceAllUsesWith(V);
+ /// Determine if a pair of casts can be replaced by a single cast.
+ ///
+ /// \param CI1 The first of a pair of casts.
+ /// \param CI2 The second of a pair of casts.
+ ///
+ /// \return 0 if the cast pair cannot be eliminated, otherwise returns an
+ /// Instruction::CastOps value for a cast that can replace the pair, casting
+ /// CI1->getSrcTy() to CI2->getDstTy().
+ ///
+ /// \see CastInst::isEliminableCastPair
+ Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
+ const CastInst *CI2);
+
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
+
+ /// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
+ /// NOTE: Unlike most of instcombine, this returns a Value which should
+ /// already be inserted into the function.
+ Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd);
+
+ Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
+ BinaryOperator &Logic);
+ Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
+ Value *getSelectCondition(Value *A, Value *B);
+
+ Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
+ Instruction *foldFPSignBitOps(BinaryOperator &I);
+
+public:
+ /// Inserts an instruction \p New before instruction \p Old
+ ///
+ /// Also adds the new instruction to the worklist and returns \p New so that
+ /// it is suitable for use as the return from the visitation patterns.
+ Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+ assert(New && !New->getParent() &&
+ "New instruction already inserted into a basic block!");
+ BasicBlock *BB = Old.getParent();
+ BB->getInstList().insert(Old.getIterator(), New); // Insert inst
+ Worklist.add(New);
+ return New;
+ }
+
+ /// Same as InsertNewInstBefore, but also sets the debug loc.
+ Instruction *InsertNewInstWith(Instruction *New, Instruction &Old) {
+ New->setDebugLoc(Old.getDebugLoc());
+ return InsertNewInstBefore(New, Old);
+ }
+
+ /// A combiner-aware RAUW-like routine.
+ ///
+ /// This method is to be used when an instruction is found to be dead,
+ /// replaceable with another preexisting expression. Here we add all uses of
+ /// I to the worklist, replace all uses of I with the new value, then return
+ /// I, so that the inst combiner will know that I was modified.
+ Instruction *replaceInstUsesWith(Instruction &I, Value *V) {
+ // If there are no uses to replace, then we return nullptr to indicate that
+ // no changes were made to the program.
+ if (I.use_empty()) return nullptr;
+
+ Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
+
+ // If we are replacing the instruction with itself, this must be in a
+ // segment of unreachable code, so just clobber the instruction.
+ if (&I == V)
+ V = UndefValue::get(I.getType());
+
+ LLVM_DEBUG(dbgs() << "IC: Replacing " << I << "\n"
+ << " with " << *V << '\n');
+
+ I.replaceAllUsesWith(V);
MadeIRChange = true;
- return &I;
- }
-
- /// Replace operand of instruction and add old operand to the worklist.
- Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
- Worklist.addValue(I.getOperand(OpNum));
- I.setOperand(OpNum, V);
- return &I;
- }
-
- /// Replace use and add the previously used value to the worklist.
- void replaceUse(Use &U, Value *NewValue) {
- Worklist.addValue(U);
- U = NewValue;
- }
-
- /// Creates a result tuple for an overflow intrinsic \p II with a given
- /// \p Result and a constant \p Overflow value.
- Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
- Constant *Overflow) {
- Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
- StructType *ST = cast<StructType>(II->getType());
- Constant *Struct = ConstantStruct::get(ST, V);
- return InsertValueInst::Create(Struct, Result, 0);
- }
-
- /// Create and insert the idiom we use to indicate a block is unreachable
- /// without having to rewrite the CFG from within InstCombine.
- void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
- auto &Ctx = InsertAt->getContext();
- new StoreInst(ConstantInt::getTrue(Ctx),
- UndefValue::get(Type::getInt1PtrTy(Ctx)),
- InsertAt);
- }
-
-
- /// Combiner aware instruction erasure.
- ///
- /// When dealing with an instruction that has side effects or produces a void
- /// value, we can't rely on DCE to delete the instruction. Instead, visit
- /// methods should return the value returned by this function.
+ return &I;
+ }
+
+ /// Replace operand of instruction and add old operand to the worklist.
+ Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
+ Worklist.addValue(I.getOperand(OpNum));
+ I.setOperand(OpNum, V);
+ return &I;
+ }
+
+ /// Replace use and add the previously used value to the worklist.
+ void replaceUse(Use &U, Value *NewValue) {
+ Worklist.addValue(U);
+ U = NewValue;
+ }
+
+ /// Creates a result tuple for an overflow intrinsic \p II with a given
+ /// \p Result and a constant \p Overflow value.
+ Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
+ Constant *Overflow) {
+ Constant *V[] = {UndefValue::get(Result->getType()), Overflow};
+ StructType *ST = cast<StructType>(II->getType());
+ Constant *Struct = ConstantStruct::get(ST, V);
+ return InsertValueInst::Create(Struct, Result, 0);
+ }
+
+ /// Create and insert the idiom we use to indicate a block is unreachable
+ /// without having to rewrite the CFG from within InstCombine.
+ void CreateNonTerminatorUnreachable(Instruction *InsertAt) {
+ auto &Ctx = InsertAt->getContext();
+ new StoreInst(ConstantInt::getTrue(Ctx),
+ UndefValue::get(Type::getInt1PtrTy(Ctx)),
+ InsertAt);
+ }
+
+
+ /// Combiner aware instruction erasure.
+ ///
+ /// When dealing with an instruction that has side effects or produces a void
+ /// value, we can't rely on DCE to delete the instruction. Instead, visit
+ /// methods should return the value returned by this function.
Instruction *eraseInstFromFunction(Instruction &I) override {
- LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
- assert(I.use_empty() && "Cannot erase instruction that is used!");
- salvageDebugInfo(I);
-
- // Make sure that we reprocess all operands now that we reduced their
- // use counts.
- for (Use &Operand : I.operands())
- if (auto *Inst = dyn_cast<Instruction>(Operand))
- Worklist.add(Inst);
-
- Worklist.remove(&I);
- I.eraseFromParent();
- MadeIRChange = true;
- return nullptr; // Don't do anything with FI
- }
-
- void computeKnownBits(const Value *V, KnownBits &Known,
- unsigned Depth, const Instruction *CxtI) const {
- llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
- }
-
- KnownBits computeKnownBits(const Value *V, unsigned Depth,
- const Instruction *CxtI) const {
- return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
- }
-
- bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
- unsigned Depth = 0,
- const Instruction *CxtI = nullptr) {
- return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
- }
-
- bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
- const Instruction *CxtI = nullptr) const {
- return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
- }
-
- unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
- const Instruction *CxtI = nullptr) const {
- return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedMul(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedAdd(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
- const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
- const Instruction *CxtI) const {
- return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
- }
-
- OverflowResult computeOverflow(
- Instruction::BinaryOps BinaryOp, bool IsSigned,
- Value *LHS, Value *RHS, Instruction *CxtI) const;
-
- /// Performs a few simplifications for operators which are associative
- /// or commutative.
- bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
-
- /// Tries to simplify binary operations which some other binary
- /// operation distributes over.
- ///
- /// It does this by either factorizing out common terms (eg "(A*B)+(A*C)"
- /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
- /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
- /// value, or null if it didn't simplify.
- Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
-
- /// Tries to simplify add operations using the definition of remainder.
- ///
- /// The definition of remainder is X % C = X - (X / C) * C. The add
- /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
- /// X % (C0 * C1).
- Value *SimplifyAddWithRemainder(BinaryOperator &I);
-
- // Binary Op helper for select operations where the expression can be
- // efficiently reorganized.
- Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
- Value *RHS);
-
- /// This tries to simplify binary operations by factorizing out common terms
- /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
- Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
- Value *, Value *, Value *);
-
- /// Match a select chain which produces one of three values based on whether
- /// the LHS is less than, equal to, or greater than RHS respectively.
- /// Return true if we matched a three way compare idiom. The LHS, RHS, Less,
- /// Equal and Greater values are saved in the matching process and returned to
- /// the caller.
- bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
- ConstantInt *&Less, ConstantInt *&Equal,
- ConstantInt *&Greater);
-
- /// Attempts to replace V with a simpler value based on the demanded
- /// bits.
- Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
- unsigned Depth, Instruction *CxtI);
- bool SimplifyDemandedBits(Instruction *I, unsigned Op,
- const APInt &DemandedMask, KnownBits &Known,
+ LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
+ assert(I.use_empty() && "Cannot erase instruction that is used!");
+ salvageDebugInfo(I);
+
+ // Make sure that we reprocess all operands now that we reduced their
+ // use counts.
+ for (Use &Operand : I.operands())
+ if (auto *Inst = dyn_cast<Instruction>(Operand))
+ Worklist.add(Inst);
+
+ Worklist.remove(&I);
+ I.eraseFromParent();
+ MadeIRChange = true;
+ return nullptr; // Don't do anything with FI
+ }
+
+ void computeKnownBits(const Value *V, KnownBits &Known,
+ unsigned Depth, const Instruction *CxtI) const {
+ llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ KnownBits computeKnownBits(const Value *V, unsigned Depth,
+ const Instruction *CxtI) const {
+ return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero = false,
+ unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) {
+ return llvm::isKnownToBeAPowerOfTwo(V, DL, OrZero, Depth, &AC, CxtI, &DT);
+ }
+
+ bool MaskedValueIsZero(const Value *V, const APInt &Mask, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::MaskedValueIsZero(V, Mask, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ unsigned ComputeNumSignBits(const Value *Op, unsigned Depth = 0,
+ const Instruction *CxtI = nullptr) const {
+ return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedMul(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedMul(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedAdd(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForUnsignedSub(const Value *LHS,
+ const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForUnsignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflowForSignedSub(const Value *LHS, const Value *RHS,
+ const Instruction *CxtI) const {
+ return llvm::computeOverflowForSignedSub(LHS, RHS, DL, &AC, CxtI, &DT);
+ }
+
+ OverflowResult computeOverflow(
+ Instruction::BinaryOps BinaryOp, bool IsSigned,
+ Value *LHS, Value *RHS, Instruction *CxtI) const;
+
+ /// Performs a few simplifications for operators which are associative
+ /// or commutative.
+ bool SimplifyAssociativeOrCommutative(BinaryOperator &I);
+
+ /// Tries to simplify binary operations which some other binary
+ /// operation distributes over.
+ ///
+ /// It does this by either factorizing out common terms (eg "(A*B)+(A*C)"
+ /// -> "A*(B+C)") or expanding out if this results in simplifications (eg: "A
+ /// & (B | C) -> (A&B) | (A&C)" if this is a win). Returns the simplified
+ /// value, or null if it didn't simplify.
+ Value *SimplifyUsingDistributiveLaws(BinaryOperator &I);
+
+ /// Tries to simplify add operations using the definition of remainder.
+ ///
+ /// The definition of remainder is X % C = X - (X / C) * C. The add
+ /// expression X % C0 + ((X / C0) % C1) * C0 can be simplified to
+ /// X % (C0 * C1).
+ Value *SimplifyAddWithRemainder(BinaryOperator &I);
+
+ // Binary Op helper for select operations where the expression can be
+ // efficiently reorganized.
+ Value *SimplifySelectsFeedingBinaryOp(BinaryOperator &I, Value *LHS,
+ Value *RHS);
+
+ /// This tries to simplify binary operations by factorizing out common terms
+ /// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+ Value *tryFactorization(BinaryOperator &, Instruction::BinaryOps, Value *,
+ Value *, Value *, Value *);
+
+ /// Match a select chain which produces one of three values based on whether
+ /// the LHS is less than, equal to, or greater than RHS respectively.
+ /// Return true if we matched a three way compare idiom. The LHS, RHS, Less,
+ /// Equal and Greater values are saved in the matching process and returned to
+ /// the caller.
+ bool matchThreeWayIntCompare(SelectInst *SI, Value *&LHS, Value *&RHS,
+ ConstantInt *&Less, ConstantInt *&Equal,
+ ConstantInt *&Greater);
+
+ /// Attempts to replace V with a simpler value based on the demanded
+ /// bits.
+ Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+ bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+ const APInt &DemandedMask, KnownBits &Known,
unsigned Depth = 0) override;
-
- /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
- /// bits. It also tries to handle simplifications that can be done based on
- /// DemandedMask, but without modifying the Instruction.
- Value *SimplifyMultipleUseDemandedBits(Instruction *I,
- const APInt &DemandedMask,
- KnownBits &Known,
- unsigned Depth, Instruction *CxtI);
-
- /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
- /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
- Value *simplifyShrShlDemandedBits(
- Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
- const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
-
- /// Tries to simplify operands to an integer instruction based on its
- /// demanded bits.
- bool SimplifyDemandedInstructionBits(Instruction &Inst);
-
+
+ /// Helper routine of SimplifyDemandedUseBits. It computes KnownZero/KnownOne
+ /// bits. It also tries to handle simplifications that can be done based on
+ /// DemandedMask, but without modifying the Instruction.
+ Value *SimplifyMultipleUseDemandedBits(Instruction *I,
+ const APInt &DemandedMask,
+ KnownBits &Known,
+ unsigned Depth, Instruction *CxtI);
+
+ /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+ /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+ Value *simplifyShrShlDemandedBits(
+ Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
+ const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known);
+
+ /// Tries to simplify operands to an integer instruction based on its
+ /// demanded bits.
+ bool SimplifyDemandedInstructionBits(Instruction &Inst);
+
virtual Value *
SimplifyDemandedVectorElts(Value *V, APInt DemandedElts, APInt &UndefElts,
unsigned Depth = 0,
bool AllowMultipleUsers = false) override;
-
- /// Canonicalize the position of binops relative to shufflevector.
- Instruction *foldVectorBinop(BinaryOperator &Inst);
- Instruction *foldVectorSelect(SelectInst &Sel);
-
- /// Given a binary operator, cast instruction, or select which has a PHI node
- /// as operand #0, see if we can fold the instruction into the PHI (which is
- /// only possible if all operands to the PHI are constants).
- Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
-
- /// Given an instruction with a select as one operand and a constant as the
- /// other operand, try to fold the binary operator into the select arguments.
- /// This also works for Cast instructions, which obviously do not have a
- /// second operand.
- Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
-
- /// This is a convenience wrapper function for the above two functions.
- Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
-
- Instruction *foldAddWithConstant(BinaryOperator &Add);
-
- /// Try to rotate an operation below a PHI node, using PHI nodes for
- /// its operands.
+
+ /// Canonicalize the position of binops relative to shufflevector.
+ Instruction *foldVectorBinop(BinaryOperator &Inst);
+ Instruction *foldVectorSelect(SelectInst &Sel);
+
+ /// Given a binary operator, cast instruction, or select which has a PHI node
+ /// as operand #0, see if we can fold the instruction into the PHI (which is
+ /// only possible if all operands to the PHI are constants).
+ Instruction *foldOpIntoPhi(Instruction &I, PHINode *PN);
+
+ /// Given an instruction with a select as one operand and a constant as the
+ /// other operand, try to fold the binary operator into the select arguments.
+ /// This also works for Cast instructions, which obviously do not have a
+ /// second operand.
+ Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
+
+ /// This is a convenience wrapper function for the above two functions.
+ Instruction *foldBinOpIntoSelectOrPhi(BinaryOperator &I);
+
+ Instruction *foldAddWithConstant(BinaryOperator &Add);
+
+ /// Try to rotate an operation below a PHI node, using PHI nodes for
+ /// its operands.
Instruction *foldPHIArgOpIntoPHI(PHINode &PN);
Instruction *foldPHIArgBinOpIntoPHI(PHINode &PN);
Instruction *foldPHIArgInsertValueInstructionIntoPHI(PHINode &PN);
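
SimplifyAddWithRemainder, declared in the hunk above, depends on an arithmetic identity that is easy to verify in isolation: for unsigned values, X % C0 + ((X / C0) % C1) * C0 equals X % (C0 * C1). A brute-force standalone check over small ranges (plain C++, illustrative bounds only):

// Exhaustive small-range verification of the identity used by
// SimplifyAddWithRemainder, in unsigned arithmetic:
//   X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1)
#include <cassert>

int main() {
  for (unsigned X = 0; X < 2000; ++X)
    for (unsigned C0 = 1; C0 <= 16; ++C0)
      for (unsigned C1 = 1; C1 <= 16; ++C1)
        assert(X % C0 + ((X / C0) % C1) * C0 == X % (C0 * C1));
  return 0;
}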
@@ -621,167 +621,167 @@ public:
Instruction *foldPHIArgGEPIntoPHI(PHINode &PN);
Instruction *foldPHIArgLoadIntoPHI(PHINode &PN);
Instruction *foldPHIArgZextsIntoPHI(PHINode &PN);
-
- /// If an integer typed PHI has only one use which is an IntToPtr operation,
- /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
- /// insert a new pointer typed PHI and replace the original one.
+
+ /// If an integer typed PHI has only one use which is an IntToPtr operation,
+ /// replace the PHI with an existing pointer typed PHI if it exists. Otherwise
+ /// insert a new pointer typed PHI and replace the original one.
Instruction *foldIntegerTypedPHI(PHINode &PN);
-
- /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the
- /// folded operation.
- void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
-
- Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
- ICmpInst::Predicate Cond, Instruction &I);
- Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
- const Value *Other);
- Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
- GlobalVariable *GV, CmpInst &ICI,
- ConstantInt *AndCst = nullptr);
- Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
- Constant *RHSC);
- Instruction *foldICmpAddOpConst(Value *X, const APInt &C,
- ICmpInst::Predicate Pred);
- Instruction *foldICmpWithCastOp(ICmpInst &ICI);
-
- Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
- Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
- Instruction *foldICmpWithConstant(ICmpInst &Cmp);
- Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
- Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
- Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
- Instruction *foldICmpEquality(ICmpInst &Cmp);
- Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
- Instruction *foldSignBitTest(ICmpInst &I);
- Instruction *foldICmpWithZero(ICmpInst &Cmp);
-
- Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp);
-
- Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
- ConstantInt *C);
- Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
- const APInt &C);
- Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C);
- Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
- const APInt &C);
- Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
- const APInt &C);
- Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
- const APInt &C);
- Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
- const APInt &C);
- Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
- const APInt &C);
- Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
- const APInt &C);
- Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
- const APInt &C);
- Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
- const APInt &C);
- Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
- const APInt &C);
- Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
- const APInt &C);
- Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C1);
- Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
- const APInt &C1, const APInt &C2);
- Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
- const APInt &C2);
- Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
- const APInt &C2);
-
- Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
- BinaryOperator *BO,
- const APInt &C);
- Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
- const APInt &C);
- Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
- const APInt &C);
-
- // Helpers of visitSelectInst().
- Instruction *foldSelectExtConst(SelectInst &Sel);
- Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
- Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
- Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
- Value *A, Value *B, Instruction &Outer,
- SelectPatternFlavor SPF2, Value *C);
- Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
+
+ /// Helper function for FoldPHIArgXIntoPHI() to set debug location for the
+ /// folded operation.
+ void PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN);
+
+ Instruction *foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
+ ICmpInst::Predicate Cond, Instruction &I);
+ Instruction *foldAllocaCmp(ICmpInst &ICI, const AllocaInst *Alloca,
+ const Value *Other);
+ Instruction *foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
+ GlobalVariable *GV, CmpInst &ICI,
+ ConstantInt *AndCst = nullptr);
+ Instruction *foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
+ Constant *RHSC);
+ Instruction *foldICmpAddOpConst(Value *X, const APInt &C,
+ ICmpInst::Predicate Pred);
+ Instruction *foldICmpWithCastOp(ICmpInst &ICI);
+
+ Instruction *foldICmpUsingKnownBits(ICmpInst &Cmp);
+ Instruction *foldICmpWithDominatingICmp(ICmpInst &Cmp);
+ Instruction *foldICmpWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstant(ICmpInst &Cmp);
+ Instruction *foldICmpInstWithConstantNotInt(ICmpInst &Cmp);
+ Instruction *foldICmpBinOp(ICmpInst &Cmp, const SimplifyQuery &SQ);
+ Instruction *foldICmpEquality(ICmpInst &Cmp);
+ Instruction *foldIRemByPowerOfTwoToBitTest(ICmpInst &I);
+ Instruction *foldSignBitTest(ICmpInst &I);
+ Instruction *foldICmpWithZero(ICmpInst &Cmp);
+
+ Value *foldUnsignedMultiplicationOverflowCheck(ICmpInst &Cmp);
+
+ Instruction *foldICmpSelectConstant(ICmpInst &Cmp, SelectInst *Select,
+ ConstantInt *C);
+ Instruction *foldICmpTruncConstant(ICmpInst &Cmp, TruncInst *Trunc,
+ const APInt &C);
+ Instruction *foldICmpAndConstant(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C);
+ Instruction *foldICmpXorConstant(ICmpInst &Cmp, BinaryOperator *Xor,
+ const APInt &C);
+ Instruction *foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
+ const APInt &C);
+ Instruction *foldICmpMulConstant(ICmpInst &Cmp, BinaryOperator *Mul,
+ const APInt &C);
+ Instruction *foldICmpShlConstant(ICmpInst &Cmp, BinaryOperator *Shl,
+ const APInt &C);
+ Instruction *foldICmpShrConstant(ICmpInst &Cmp, BinaryOperator *Shr,
+ const APInt &C);
+ Instruction *foldICmpSRemConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt &C);
+ Instruction *foldICmpUDivConstant(ICmpInst &Cmp, BinaryOperator *UDiv,
+ const APInt &C);
+ Instruction *foldICmpDivConstant(ICmpInst &Cmp, BinaryOperator *Div,
+ const APInt &C);
+ Instruction *foldICmpSubConstant(ICmpInst &Cmp, BinaryOperator *Sub,
+ const APInt &C);
+ Instruction *foldICmpAddConstant(ICmpInst &Cmp, BinaryOperator *Add,
+ const APInt &C);
+ Instruction *foldICmpAndConstConst(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C1);
+ Instruction *foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
+ const APInt &C1, const APInt &C2);
+ Instruction *foldICmpShrConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+ Instruction *foldICmpShlConstConst(ICmpInst &I, Value *ShAmt, const APInt &C1,
+ const APInt &C2);
+
+ Instruction *foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
+ BinaryOperator *BO,
+ const APInt &C);
+ Instruction *foldICmpIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
+ const APInt &C);
+ Instruction *foldICmpEqIntrinsicWithConstant(ICmpInst &ICI, IntrinsicInst *II,
+ const APInt &C);
+
+ // Helpers of visitSelectInst().
+ Instruction *foldSelectExtConst(SelectInst &Sel);
+ Instruction *foldSelectOpOp(SelectInst &SI, Instruction *TI, Instruction *FI);
+ Instruction *foldSelectIntoOp(SelectInst &SI, Value *, Value *);
+ Instruction *foldSPFofSPF(Instruction *Inner, SelectPatternFlavor SPF1,
+ Value *A, Value *B, Instruction &Outer,
+ SelectPatternFlavor SPF2, Value *C);
+ Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
Instruction *foldSelectValueEquivalence(SelectInst &SI, ICmpInst &ICI);
-
- Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
- bool isSigned, bool Inside);
- Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
- bool mergeStoreIntoSuccessor(StoreInst &SI);
-
+
+ Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
+ bool isSigned, bool Inside);
+ Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
+ bool mergeStoreIntoSuccessor(StoreInst &SI);
+
/// Given an 'or' instruction, check to see if it is part of a
/// bswap/bitreverse idiom. If so, return the equivalent bswap/bitreverse
/// intrinsic.
Instruction *matchBSwapOrBitReverse(BinaryOperator &Or, bool MatchBSwaps,
bool MatchBitReversals);
-
- Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
- Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
-
- Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
-
- /// Returns a value X such that Val = X * Scale, or null if none.
- ///
- /// If the multiplication is known not to overflow then NoSignedWrap is set.
- Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
-};
-
-class Negator final {
- /// Top-to-bottom, def-to-use negated instruction tree we produced.
- SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
-
- using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
- BuilderTy Builder;
-
- const DataLayout &DL;
- AssumptionCache &AC;
- const DominatorTree &DT;
-
- const bool IsTrulyNegation;
-
- SmallDenseMap<Value *, Value *> NegationsCache;
-
- Negator(LLVMContext &C, const DataLayout &DL, AssumptionCache &AC,
- const DominatorTree &DT, bool IsTrulyNegation);
-
-#if LLVM_ENABLE_STATS
- unsigned NumValuesVisitedInThisNegator = 0;
- ~Negator();
-#endif
-
- using Result = std::pair<ArrayRef<Instruction *> /*NewInstructions*/,
- Value * /*NegatedRoot*/>;
-
+
+ Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
+ Instruction *SimplifyAnyMemSet(AnyMemSetInst *MI);
+
+ Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
+
+ /// Returns a value X such that Val = X * Scale, or null if none.
+ ///
+ /// If the multiplication is known not to overflow then NoSignedWrap is set.
+ Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
+};
+
+class Negator final {
+ /// Top-to-bottom, def-to-use negated instruction tree we produced.
+ SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
+
+ using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
+ BuilderTy Builder;
+
+ const DataLayout &DL;
+ AssumptionCache &AC;
+ const DominatorTree &DT;
+
+ const bool IsTrulyNegation;
+
+ SmallDenseMap<Value *, Value *> NegationsCache;
+
+ Negator(LLVMContext &C, const DataLayout &DL, AssumptionCache &AC,
+ const DominatorTree &DT, bool IsTrulyNegation);
+
+#if LLVM_ENABLE_STATS
+ unsigned NumValuesVisitedInThisNegator = 0;
+ ~Negator();
+#endif
+
+ using Result = std::pair<ArrayRef<Instruction *> /*NewInstructions*/,
+ Value * /*NegatedRoot*/>;
+
std::array<Value *, 2> getSortedOperandsOfBinOp(Instruction *I);
- LLVM_NODISCARD Value *visitImpl(Value *V, unsigned Depth);
-
- LLVM_NODISCARD Value *negate(Value *V, unsigned Depth);
-
- /// Recurse depth-first and attempt to sink the negation.
- /// FIXME: use worklist?
- LLVM_NODISCARD Optional<Result> run(Value *Root);
-
- Negator(const Negator &) = delete;
- Negator(Negator &&) = delete;
- Negator &operator=(const Negator &) = delete;
- Negator &operator=(Negator &&) = delete;
-
-public:
- /// Attempt to negate \p Root. Returns nullptr if negation can't be performed,
- /// otherwise returns negated value.
- LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
+ LLVM_NODISCARD Value *visitImpl(Value *V, unsigned Depth);
+
+ LLVM_NODISCARD Value *negate(Value *V, unsigned Depth);
+
+ /// Recurse depth-first and attempt to sink the negation.
+ /// FIXME: use worklist?
+ LLVM_NODISCARD Optional<Result> run(Value *Root);
+
+ Negator(const Negator &) = delete;
+ Negator(Negator &&) = delete;
+ Negator &operator=(const Negator &) = delete;
+ Negator &operator=(Negator &&) = delete;
+
+public:
+ /// Attempt to negate \p Root. Returns nullptr if negation can't be performed,
+ /// otherwise returns negated value.
+ LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
InstCombinerImpl &IC);
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE
-
-#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
+};
+
+} // end namespace llvm
+
+#undef DEBUG_TYPE
+
+#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
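
For readers of the header above: callers are expected to reach the Negator through its static Negate() entry point, since the constructors and run() are private. Below is a minimal hypothetical sketch of such a call site, folding `sub X, Y` into `X + (-Y)`. The method name visitSubExample and the operand positions are illustrative assumptions, not part of this patch; only the Negate() signature declared above and standard llvm::BinaryOperator APIs are relied on.

  // Hypothetical call site (sketch only, not from this diff).
  Instruction *InstCombinerImpl::visitSubExample(BinaryOperator &I) {
    // Ask the Negator to sink a negation into the RHS; on success, rewrite
    // `sub X, Y` as `add X, NegY`, where NegY is the freshly built negation.
    if (Value *NegY = Negator::Negate(/*LHSIsZero=*/false, I.getOperand(1), *this))
      return BinaryOperator::CreateAdd(I.getOperand(0), NegY);
    return nullptr; // Negation was not possible or not profitable; keep the sub.
  }
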
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index b5a97a4e26..c7b5f6f780 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1,315 +1,315 @@
-//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for load, store and alloca.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineLoadStoreAlloca.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for load, store and alloca.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NumDeadStore, "Number of dead stores eliminated");
-STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
-
-/// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
-/// pointer to an alloca. Ignore any reads of the pointer, return false if we
-/// see any stores or other unknown uses. If we see pointer arithmetic, keep
-/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
-/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
-/// the alloca, and if the source pointer is a pointer to a constant global, we
-/// can optimize this.
-static bool
-isOnlyCopiedFromConstantMemory(AAResults *AA,
- Value *V, MemTransferInst *&TheCopy,
- SmallVectorImpl<Instruction *> &ToDelete) {
- // We track lifetime intrinsics as we encounter them. If we decide to go
- // ahead and replace the value with the global, this lets the caller quickly
- // eliminate the markers.
-
- SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
- ValuesToInspect.emplace_back(V, false);
- while (!ValuesToInspect.empty()) {
- auto ValuePair = ValuesToInspect.pop_back_val();
- const bool IsOffset = ValuePair.second;
- for (auto &U : ValuePair.first->uses()) {
- auto *I = cast<Instruction>(U.getUser());
-
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- // Ignore non-volatile loads, they are always ok.
- if (!LI->isSimple()) return false;
- continue;
- }
-
- if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
- // If uses of the bitcast are ok, we are ok.
- ValuesToInspect.emplace_back(I, IsOffset);
- continue;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- // If the GEP has all zero indices, it doesn't offset the pointer. If it
- // doesn't, it does.
- ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
- continue;
- }
-
- if (auto *Call = dyn_cast<CallBase>(I)) {
- // If this is the function being called then we treat it like a load and
- // ignore it.
- if (Call->isCallee(&U))
- continue;
-
- unsigned DataOpNo = Call->getDataOperandNo(&U);
- bool IsArgOperand = Call->isArgOperand(&U);
-
- // Inalloca arguments are clobbered by the call.
- if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
- return false;
-
- // If this is a readonly/readnone call site, then we know it is just a
- // load (but one that potentially returns the value itself), so we can
- // ignore it if we know that the value isn't captured.
- if (Call->onlyReadsMemory() &&
- (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
- continue;
-
- // If this is being passed as a byval argument, the caller is making a
- // copy, so it is only a read of the alloca.
- if (IsArgOperand && Call->isByValArgument(DataOpNo))
- continue;
- }
-
- // Lifetime intrinsics can be handled by the caller.
- if (I->isLifetimeStartOrEnd()) {
- assert(I->use_empty() && "Lifetime markers have no result to use!");
- ToDelete.push_back(I);
- continue;
- }
-
- // If this isn't our memcpy/memmove, reject it as something we can't
- // handle.
- MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
- if (!MI)
- return false;
-
- // If the transfer is using the alloca as a source of the transfer, then
- // ignore it since it is a load (unless the transfer is volatile).
- if (U.getOperandNo() == 1) {
- if (MI->isVolatile()) return false;
- continue;
- }
-
- // If we already have seen a copy, reject the second one.
- if (TheCopy) return false;
-
- // If the pointer has been offset from the start of the alloca, we can't
- // safely handle this.
- if (IsOffset) return false;
-
- // If the memintrinsic isn't using the alloca as the dest, reject it.
- if (U.getOperandNo() != 0) return false;
-
- // If the source of the memcpy/move is not a constant global, reject it.
- if (!AA->pointsToConstantMemory(MI->getSource()))
- return false;
-
- // Otherwise, the transform is safe. Remember the copy instruction.
- TheCopy = MI;
- }
- }
- return true;
-}
-
-/// isOnlyCopiedFromConstantMemory - Return the memcpy/memmove instruction if the
-/// specified alloca is only modified by a copy from a constant global, or null
-/// otherwise. If we can prove this, we can replace any uses of the alloca with
-/// uses of the global directly.
-static MemTransferInst *
-isOnlyCopiedFromConstantMemory(AAResults *AA,
- AllocaInst *AI,
- SmallVectorImpl<Instruction *> &ToDelete) {
- MemTransferInst *TheCopy = nullptr;
- if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
- return TheCopy;
- return nullptr;
-}
-
-/// Returns true if V is dereferenceable for size of alloca.
-static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
- const DataLayout &DL) {
- if (AI->isArrayAllocation())
- return false;
- uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
- if (!AllocaSize)
- return false;
- return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
- APInt(64, AllocaSize), DL);
-}
-
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
+
+/// isOnlyCopiedFromConstantMemory - Recursively walk the uses of a (derived)
+/// pointer to an alloca. Ignore any reads of the pointer, return false if we
+/// see any stores or other unknown uses. If we see pointer arithmetic, keep
+/// track of whether it moves the pointer (with IsOffset) but otherwise traverse
+/// the uses. If we see a memcpy/memmove that targets an unoffseted pointer to
+/// the alloca, and if the source pointer is a pointer to a constant global, we
+/// can optimize this.
+static bool
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ Value *V, MemTransferInst *&TheCopy,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ // We track lifetime intrinsics as we encounter them. If we decide to go
+ // ahead and replace the value with the global, this lets the caller quickly
+ // eliminate the markers.
+
+ SmallVector<std::pair<Value *, bool>, 35> ValuesToInspect;
+ ValuesToInspect.emplace_back(V, false);
+ while (!ValuesToInspect.empty()) {
+ auto ValuePair = ValuesToInspect.pop_back_val();
+ const bool IsOffset = ValuePair.second;
+ for (auto &U : ValuePair.first->uses()) {
+ auto *I = cast<Instruction>(U.getUser());
+
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Ignore non-volatile loads, they are always ok.
+ if (!LI->isSimple()) return false;
+ continue;
+ }
+
+ if (isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I)) {
+ // If uses of the bitcast are ok, we are ok.
+ ValuesToInspect.emplace_back(I, IsOffset);
+ continue;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ // If the GEP has all zero indices, it doesn't offset the pointer. If it
+ // doesn't, it does.
+ ValuesToInspect.emplace_back(I, IsOffset || !GEP->hasAllZeroIndices());
+ continue;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ // If this is the function being called then we treat it like a load and
+ // ignore it.
+ if (Call->isCallee(&U))
+ continue;
+
+ unsigned DataOpNo = Call->getDataOperandNo(&U);
+ bool IsArgOperand = Call->isArgOperand(&U);
+
+ // Inalloca arguments are clobbered by the call.
+ if (IsArgOperand && Call->isInAllocaArgument(DataOpNo))
+ return false;
+
+ // If this is a readonly/readnone call site, then we know it is just a
+ // load (but one that potentially returns the value itself), so we can
+ // ignore it if we know that the value isn't captured.
+ if (Call->onlyReadsMemory() &&
+ (Call->use_empty() || Call->doesNotCapture(DataOpNo)))
+ continue;
+
+ // If this is being passed as a byval argument, the caller is making a
+ // copy, so it is only a read of the alloca.
+ if (IsArgOperand && Call->isByValArgument(DataOpNo))
+ continue;
+ }
+
+ // Lifetime intrinsics can be handled by the caller.
+ if (I->isLifetimeStartOrEnd()) {
+ assert(I->use_empty() && "Lifetime markers have no result to use!");
+ ToDelete.push_back(I);
+ continue;
+ }
+
+ // If this isn't our memcpy/memmove, reject it as something we can't
+ // handle.
+ MemTransferInst *MI = dyn_cast<MemTransferInst>(I);
+ if (!MI)
+ return false;
+
+ // If the transfer is using the alloca as a source of the transfer, then
+ // ignore it since it is a load (unless the transfer is volatile).
+ if (U.getOperandNo() == 1) {
+ if (MI->isVolatile()) return false;
+ continue;
+ }
+
+ // If we already have seen a copy, reject the second one.
+ if (TheCopy) return false;
+
+ // If the pointer has been offset from the start of the alloca, we can't
+ // safely handle this.
+ if (IsOffset) return false;
+
+ // If the memintrinsic isn't using the alloca as the dest, reject it.
+ if (U.getOperandNo() != 0) return false;
+
+ // If the source of the memcpy/move is not a constant global, reject it.
+ if (!AA->pointsToConstantMemory(MI->getSource()))
+ return false;
+
+ // Otherwise, the transform is safe. Remember the copy instruction.
+ TheCopy = MI;
+ }
+ }
+ return true;
+}
+
+/// isOnlyCopiedFromConstantMemory - Return the memcpy/memmove instruction if the
+/// specified alloca is only modified by a copy from a constant global, or null
+/// otherwise. If we can prove this, we can replace any uses of the alloca with
+/// uses of the global directly.
+static MemTransferInst *
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ AllocaInst *AI,
+ SmallVectorImpl<Instruction *> &ToDelete) {
+ MemTransferInst *TheCopy = nullptr;
+ if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
+ return TheCopy;
+ return nullptr;
+}
+
+/// Returns true if V is dereferenceable for size of alloca.
+static bool isDereferenceableForAllocaSize(const Value *V, const AllocaInst *AI,
+ const DataLayout &DL) {
+ if (AI->isArrayAllocation())
+ return false;
+ uint64_t AllocaSize = DL.getTypeStoreSize(AI->getAllocatedType());
+ if (!AllocaSize)
+ return false;
+ return isDereferenceableAndAlignedPointer(V, Align(AI->getAlignment()),
+ APInt(64, AllocaSize), DL);
+}
+
static Instruction *simplifyAllocaArraySize(InstCombinerImpl &IC,
AllocaInst &AI) {
- // Check for array size of 1 (scalar allocation).
- if (!AI.isArrayAllocation()) {
- // i32 1 is the canonical array size for scalar allocations.
- if (AI.getArraySize()->getType()->isIntegerTy(32))
- return nullptr;
-
- // Canonicalize it.
- return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
- }
-
- // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
- if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
- if (C->getValue().getActiveBits() <= 64) {
- Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
- AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
- New->setAlignment(AI.getAlign());
-
- // Scan to the end of the allocation instructions, to skip over a block of
- // allocas if possible...also skip interleaved debug info
- //
- BasicBlock::iterator It(New);
- while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
- ++It;
-
- // Now that I is pointing to the first non-allocation-inst in the block,
- // insert our getelementptr instruction...
- //
- Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
- Value *NullIdx = Constant::getNullValue(IdxTy);
- Value *Idx[2] = {NullIdx, NullIdx};
- Instruction *GEP = GetElementPtrInst::CreateInBounds(
- NewTy, New, Idx, New->getName() + ".sub");
- IC.InsertNewInstBefore(GEP, *It);
-
- // Now make everything use the getelementptr instead of the original
- // allocation.
- return IC.replaceInstUsesWith(AI, GEP);
- }
- }
-
- if (isa<UndefValue>(AI.getArraySize()))
- return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
-
- // Ensure that the alloca array size argument has type intptr_t, so that
- // any casting is exposed early.
- Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
- if (AI.getArraySize()->getType() != IntPtrTy) {
- Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
- return IC.replaceOperand(AI, 0, V);
- }
-
- return nullptr;
-}
-
-namespace {
-// If I and V are pointers in different address spaces, it is not allowed to
-// use replaceAllUsesWith since I and V have different types. A
-// non-target-specific transformation should not use addrspacecast on V since
-// the two address spaces may be disjoint depending on the target.
-//
-// This class chases down uses of the old pointer until reaching the load
-// instructions, then replaces the old pointer in the load instructions with
-// the new pointer. If during the chasing it sees bitcast or GEP, it will
-// create new bitcast or GEP with the new pointer and use them in the load
-// instruction.
-class PointerReplacer {
-public:
+ // Check for array size of 1 (scalar allocation).
+ if (!AI.isArrayAllocation()) {
+ // i32 1 is the canonical array size for scalar allocations.
+ if (AI.getArraySize()->getType()->isIntegerTy(32))
+ return nullptr;
+
+ // Canonicalize it.
+ return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
+ }
+
+ // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
+ if (C->getValue().getActiveBits() <= 64) {
+ Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
+ AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
+ New->setAlignment(AI.getAlign());
+
+ // Scan to the end of the allocation instructions, to skip over a block of
+ // allocas if possible...also skip interleaved debug info
+ //
+ BasicBlock::iterator It(New);
+ while (isa<AllocaInst>(*It) || isa<DbgInfoIntrinsic>(*It))
+ ++It;
+
+ // Now that I is pointing to the first non-allocation-inst in the block,
+ // insert our getelementptr instruction...
+ //
+ Type *IdxTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ Value *NullIdx = Constant::getNullValue(IdxTy);
+ Value *Idx[2] = {NullIdx, NullIdx};
+ Instruction *GEP = GetElementPtrInst::CreateInBounds(
+ NewTy, New, Idx, New->getName() + ".sub");
+ IC.InsertNewInstBefore(GEP, *It);
+
+ // Now make everything use the getelementptr instead of the original
+ // allocation.
+ return IC.replaceInstUsesWith(AI, GEP);
+ }
+ }
+
+ if (isa<UndefValue>(AI.getArraySize()))
+ return IC.replaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
+
+ // Ensure that the alloca array size argument has type intptr_t, so that
+ // any casting is exposed early.
+ Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
+ if (AI.getArraySize()->getType() != IntPtrTy) {
+ Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
+ return IC.replaceOperand(AI, 0, V);
+ }
+
+ return nullptr;
+}
+
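
To make the canonicalization in simplifyAllocaArraySize() above concrete, here is the rough shape of the rewrite for a constant array size, shown as an illustrative comment block. The value names and the element count 4 are assumptions, not taken from this patch.

  // Sketch of the IR-level effect (illustrative only):
  //
  //   before:  %buf  = alloca i32, i32 4, align 4
  //   after:   %buf1 = alloca [4 x i32], align 4
  //            %buf1.sub = getelementptr inbounds [4 x i32], [4 x i32]* %buf1, i64 0, i64 0
  //
  // Every former user of %buf is pointed at %buf1.sub instead, which still has
  // type i32*, so the rewrite is transparent to the rest of the function.
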
+namespace {
+// If I and V are pointers in different address spaces, it is not allowed to
+// use replaceAllUsesWith since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V since
+// the two address spaces may be disjoint depending on the target.
+//
+// This class chases down uses of the old pointer until reaching the load
+// instructions, then replaces the old pointer in the load instructions with
+// the new pointer. If during the chasing it sees bitcast or GEP, it will
+// create new bitcast or GEP with the new pointer and use them in the load
+// instruction.
+class PointerReplacer {
+public:
PointerReplacer(InstCombinerImpl &IC) : IC(IC) {}
bool collectUsers(Instruction &I);
- void replacePointer(Instruction &I, Value *V);
-
-private:
- void replace(Instruction *I);
- Value *getReplacement(Value *I);
-
+ void replacePointer(Instruction &I, Value *V);
+
+private:
+ void replace(Instruction *I);
+ Value *getReplacement(Value *I);
+
SmallSetVector<Instruction *, 4> Worklist;
- MapVector<Value *, Value *> WorkMap;
+ MapVector<Value *, Value *> WorkMap;
InstCombinerImpl &IC;
-};
-} // end anonymous namespace
-
+};
+} // end anonymous namespace
+
bool PointerReplacer::collectUsers(Instruction &I) {
- for (auto U : I.users()) {
+ for (auto U : I.users()) {
Instruction *Inst = cast<Instruction>(&*U);
if (LoadInst *Load = dyn_cast<LoadInst>(Inst)) {
if (Load->isVolatile())
return false;
Worklist.insert(Load);
- } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
+ } else if (isa<GetElementPtrInst>(Inst) || isa<BitCastInst>(Inst)) {
Worklist.insert(Inst);
if (!collectUsers(*Inst))
return false;
} else if (isa<MemTransferInst>(Inst)) {
Worklist.insert(Inst);
- } else {
+ } else {
LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n');
return false;
- }
- }
-
+ }
+ }
+
return true;
-}
-
+}
+
Value *PointerReplacer::getReplacement(Value *V) { return WorkMap.lookup(V); }
-void PointerReplacer::replace(Instruction *I) {
- if (getReplacement(I))
- return;
-
- if (auto *LT = dyn_cast<LoadInst>(I)) {
- auto *V = getReplacement(LT->getPointerOperand());
- assert(V && "Operand not replaced");
+void PointerReplacer::replace(Instruction *I) {
+ if (getReplacement(I))
+ return;
+
+ if (auto *LT = dyn_cast<LoadInst>(I)) {
+ auto *V = getReplacement(LT->getPointerOperand());
+ assert(V && "Operand not replaced");
auto *NewI = new LoadInst(LT->getType(), V, "", LT->isVolatile(),
LT->getAlign(), LT->getOrdering(),
LT->getSyncScopeID());
- NewI->takeName(LT);
+ NewI->takeName(LT);
copyMetadataForLoad(*NewI, *LT);
- IC.InsertNewInstWith(NewI, *LT);
- IC.replaceInstUsesWith(*LT, NewI);
- WorkMap[LT] = NewI;
- } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- auto *V = getReplacement(GEP->getPointerOperand());
- assert(V && "Operand not replaced");
- SmallVector<Value *, 8> Indices;
- Indices.append(GEP->idx_begin(), GEP->idx_end());
- auto *NewI = GetElementPtrInst::Create(
- V->getType()->getPointerElementType(), V, Indices);
- IC.InsertNewInstWith(NewI, *GEP);
- NewI->takeName(GEP);
- WorkMap[GEP] = NewI;
- } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
- auto *V = getReplacement(BC->getOperand(0));
- assert(V && "Operand not replaced");
- auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
- V->getType()->getPointerAddressSpace());
- auto *NewI = new BitCastInst(V, NewT);
- IC.InsertNewInstWith(NewI, *BC);
- NewI->takeName(BC);
- WorkMap[BC] = NewI;
+ IC.InsertNewInstWith(NewI, *LT);
+ IC.replaceInstUsesWith(*LT, NewI);
+ WorkMap[LT] = NewI;
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ auto *V = getReplacement(GEP->getPointerOperand());
+ assert(V && "Operand not replaced");
+ SmallVector<Value *, 8> Indices;
+ Indices.append(GEP->idx_begin(), GEP->idx_end());
+ auto *NewI = GetElementPtrInst::Create(
+ V->getType()->getPointerElementType(), V, Indices);
+ IC.InsertNewInstWith(NewI, *GEP);
+ NewI->takeName(GEP);
+ WorkMap[GEP] = NewI;
+ } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+ auto *V = getReplacement(BC->getOperand(0));
+ assert(V && "Operand not replaced");
+ auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+ V->getType()->getPointerAddressSpace());
+ auto *NewI = new BitCastInst(V, NewT);
+ IC.InsertNewInstWith(NewI, *BC);
+ NewI->takeName(BC);
+ WorkMap[BC] = NewI;
} else if (auto *MemCpy = dyn_cast<MemTransferInst>(I)) {
auto *SrcV = getReplacement(MemCpy->getRawSource());
// The pointer may appear in the destination of a copy, but we don't want to
@@ -332,83 +332,83 @@ void PointerReplacer::replace(Instruction *I) {
IC.eraseInstFromFunction(*MemCpy);
WorkMap[MemCpy] = NewI;
- } else {
- llvm_unreachable("should never reach here");
- }
-}
-
-void PointerReplacer::replacePointer(Instruction &I, Value *V) {
-#ifndef NDEBUG
- auto *PT = cast<PointerType>(I.getType());
- auto *NT = cast<PointerType>(V->getType());
- assert(PT != NT && PT->getElementType() == NT->getElementType() &&
- "Invalid usage");
-#endif
- WorkMap[&I] = V;
+ } else {
+ llvm_unreachable("should never reach here");
+ }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+ auto *PT = cast<PointerType>(I.getType());
+ auto *NT = cast<PointerType>(V->getType());
+ assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+ "Invalid usage");
+#endif
+ WorkMap[&I] = V;
for (Instruction *Workitem : Worklist)
replace(Workitem);
-}
-
+}
+
Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
- if (auto *I = simplifyAllocaArraySize(*this, AI))
- return I;
-
- if (AI.getAllocatedType()->isSized()) {
- // Move all alloca's of zero byte objects to the entry block and merge them
- // together. Note that we only do this for alloca's, because malloc should
- // allocate and return a unique pointer, even for a zero byte allocation.
- if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
- // For a zero sized alloca there is no point in doing an array allocation.
- // This is helpful if the array size is a complicated expression not used
- // elsewhere.
- if (AI.isArrayAllocation())
- return replaceOperand(AI, 0,
- ConstantInt::get(AI.getArraySize()->getType(), 1));
-
- // Get the first instruction in the entry block.
- BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
- Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
- if (FirstInst != &AI) {
- // If the entry block doesn't start with a zero-size alloca then move
- // this one to the start of the entry block. There is no problem with
- // dominance as the array size was forced to a constant earlier already.
- AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
- if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
- DL.getTypeAllocSize(EntryAI->getAllocatedType())
- .getKnownMinSize() != 0) {
- AI.moveBefore(FirstInst);
- return &AI;
- }
-
- // Replace this zero-sized alloca with the one at the start of the entry
- // block after ensuring that the address will be aligned enough for both
- // types.
- const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
- EntryAI->setAlignment(MaxAlign);
- if (AI.getType() != EntryAI->getType())
- return new BitCastInst(EntryAI, AI.getType());
- return replaceInstUsesWith(AI, EntryAI);
- }
- }
- }
-
- // Check to see if this allocation is only modified by a memcpy/memmove from
- // a constant whose alignment is equal to or exceeds that of the allocation.
- // If this is the case, we can change all users to use the constant global
- // instead. This is commonly produced by the CFE by constructs like "void
- // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
- // read.
- SmallVector<Instruction *, 4> ToDelete;
- if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
+ if (auto *I = simplifyAllocaArraySize(*this, AI))
+ return I;
+
+ if (AI.getAllocatedType()->isSized()) {
+ // Move all alloca's of zero byte objects to the entry block and merge them
+ // together. Note that we only do this for alloca's, because malloc should
+ // allocate and return a unique pointer, even for a zero byte allocation.
+ if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
+ // For a zero sized alloca there is no point in doing an array allocation.
+ // This is helpful if the array size is a complicated expression not used
+ // elsewhere.
+ if (AI.isArrayAllocation())
+ return replaceOperand(AI, 0,
+ ConstantInt::get(AI.getArraySize()->getType(), 1));
+
+ // Get the first instruction in the entry block.
+ BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
+ Instruction *FirstInst = EntryBlock.getFirstNonPHIOrDbg();
+ if (FirstInst != &AI) {
+ // If the entry block doesn't start with a zero-size alloca then move
+ // this one to the start of the entry block. There is no problem with
+ // dominance as the array size was forced to a constant earlier already.
+ AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
+ if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
+ DL.getTypeAllocSize(EntryAI->getAllocatedType())
+ .getKnownMinSize() != 0) {
+ AI.moveBefore(FirstInst);
+ return &AI;
+ }
+
+ // Replace this zero-sized alloca with the one at the start of the entry
+ // block after ensuring that the address will be aligned enough for both
+ // types.
+ const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
+ EntryAI->setAlignment(MaxAlign);
+ if (AI.getType() != EntryAI->getType())
+ return new BitCastInst(EntryAI, AI.getType());
+ return replaceInstUsesWith(AI, EntryAI);
+ }
+ }
+ }
+
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant whose alignment is equal to or exceeds that of the allocation.
+ // If this is the case, we can change all users to use the constant global
+ // instead. This is commonly produced by the CFE by constructs like "void
+ // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
+ // read.
+ SmallVector<Instruction *, 4> ToDelete;
+ if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
Value *TheSrc = Copy->getSource();
- Align AllocaAlign = AI.getAlign();
- Align SourceAlign = getOrEnforceKnownAlignment(
+ Align AllocaAlign = AI.getAlign();
+ Align SourceAlign = getOrEnforceKnownAlignment(
TheSrc, AllocaAlign, DL, &AI, &AC, &DT);
- if (AllocaAlign <= SourceAlign &&
+ if (AllocaAlign <= SourceAlign &&
isDereferenceableForAllocaSize(TheSrc, &AI, DL)) {
- LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
- LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace();
auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace);
if (AI.getType()->getAddressSpace() == SrcAddrSpace) {
@@ -416,13 +416,13 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
eraseInstFromFunction(*Delete);
Value *Cast = Builder.CreateBitCast(TheSrc, DestTy);
- Instruction *NewI = replaceInstUsesWith(AI, Cast);
- eraseInstFromFunction(*Copy);
- ++NumGlobalCopies;
- return NewI;
- }
-
- PointerReplacer PtrReplacer(*this);
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
+ }
+
+ PointerReplacer PtrReplacer(*this);
if (PtrReplacer.collectUsers(AI)) {
for (Instruction *Delete : ToDelete)
eraseInstFromFunction(*Delete);
@@ -431,161 +431,161 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) {
PtrReplacer.replacePointer(AI, Cast);
++NumGlobalCopies;
}
- }
- }
-
- // At last, use the generic allocation site handler to aggressively remove
- // unused allocas.
- return visitAllocSite(AI);
-}
-
-// Are we allowed to form an atomic load or store of this type?
-static bool isSupportedAtomicType(Type *Ty) {
- return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
-}
-
-/// Helper to combine a load to a new type.
-///
-/// This just does the work of combining a load to a new type. It handles
-/// metadata, etc., and returns the new instruction. The \c NewTy should be the
-/// loaded *value* type. This will convert it to a pointer, cast the operand to
-/// that pointer type, load it, etc.
-///
-/// Note that this will create all of the instructions with whatever insert
+ }
+ }
+
+ // At last, use the generic allocation site handler to aggressively remove
+ // unused allocas.
+ return visitAllocSite(AI);
+}
+
+// Are we allowed to form an atomic load or store of this type?
+static bool isSupportedAtomicType(Type *Ty) {
+ return Ty->isIntOrPtrTy() || Ty->isFloatingPointTy();
+}
+
+/// Helper to combine a load to a new type.
+///
+/// This just does the work of combining a load to a new type. It handles
+/// metadata, etc., and returns the new instruction. The \c NewTy should be the
+/// loaded *value* type. This will convert it to a pointer, cast the operand to
+/// that pointer type, load it, etc.
+///
+/// Note that this will create all of the instructions with whatever insert
/// point the \c InstCombinerImpl currently is using.
LoadInst *InstCombinerImpl::combineLoadToNewType(LoadInst &LI, Type *NewTy,
const Twine &Suffix) {
- assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
- "can't fold an atomic load to requested type");
-
- Value *Ptr = LI.getPointerOperand();
- unsigned AS = LI.getPointerAddressSpace();
- Value *NewPtr = nullptr;
- if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
- NewPtr->getType()->getPointerElementType() == NewTy &&
- NewPtr->getType()->getPointerAddressSpace() == AS))
- NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
-
- LoadInst *NewLoad = Builder.CreateAlignedLoad(
- NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
- NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- copyMetadataForLoad(*NewLoad, LI);
- return NewLoad;
-}
-
-/// Combine a store to a new type.
-///
-/// Returns the newly created store instruction.
+ assert((!LI.isAtomic() || isSupportedAtomicType(NewTy)) &&
+ "can't fold an atomic load to requested type");
+
+ Value *Ptr = LI.getPointerOperand();
+ unsigned AS = LI.getPointerAddressSpace();
+ Value *NewPtr = nullptr;
+ if (!(match(Ptr, m_BitCast(m_Value(NewPtr))) &&
+ NewPtr->getType()->getPointerElementType() == NewTy &&
+ NewPtr->getType()->getPointerAddressSpace() == AS))
+ NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
+
+ LoadInst *NewLoad = Builder.CreateAlignedLoad(
+ NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
+ NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ copyMetadataForLoad(*NewLoad, LI);
+ return NewLoad;
+}
+
+/// Combine a store to a new type.
+///
+/// Returns the newly created store instruction.
static StoreInst *combineStoreToNewValue(InstCombinerImpl &IC, StoreInst &SI,
Value *V) {
- assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
- "can't fold an atomic store of requested type");
-
- Value *Ptr = SI.getPointerOperand();
- unsigned AS = SI.getPointerAddressSpace();
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- SI.getAllMetadata(MD);
-
- StoreInst *NewStore = IC.Builder.CreateAlignedStore(
- V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
- SI.getAlign(), SI.isVolatile());
- NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
- for (const auto &MDPair : MD) {
- unsigned ID = MDPair.first;
- MDNode *N = MDPair.second;
- // Note, essentially every kind of metadata should be preserved here! This
- // routine is supposed to clone a store instruction changing *only its
- // type*. The only metadata it makes sense to drop is metadata which is
- // invalidated when the pointer type changes. This should essentially
- // never be the case in LLVM, but we explicitly switch over only known
- // metadata to be conservatively correct. If you are adding metadata to
- // LLVM which pertains to stores, you almost certainly want to add it
- // here.
- switch (ID) {
- case LLVMContext::MD_dbg:
- case LLVMContext::MD_tbaa:
- case LLVMContext::MD_prof:
- case LLVMContext::MD_fpmath:
- case LLVMContext::MD_tbaa_struct:
- case LLVMContext::MD_alias_scope:
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_nontemporal:
- case LLVMContext::MD_mem_parallel_loop_access:
- case LLVMContext::MD_access_group:
- // All of these directly apply.
- NewStore->setMetadata(ID, N);
- break;
- case LLVMContext::MD_invariant_load:
- case LLVMContext::MD_nonnull:
+ assert((!SI.isAtomic() || isSupportedAtomicType(V->getType())) &&
+ "can't fold an atomic store of requested type");
+
+ Value *Ptr = SI.getPointerOperand();
+ unsigned AS = SI.getPointerAddressSpace();
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ SI.getAllMetadata(MD);
+
+ StoreInst *NewStore = IC.Builder.CreateAlignedStore(
+ V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
+ SI.getAlign(), SI.isVolatile());
+ NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a store instruction changing *only its
+ // type*. The only metadata it makes sense to drop is metadata which is
+ // invalidated when the pointer type changes. This should essentially
+ // never be the case in LLVM, but we explicitly switch over only known
+ // metadata to be conservatively correct. If you are adding metadata to
+ // LLVM which pertains to stores, you almost certainly want to add it
+ // here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ case LLVMContext::MD_access_group:
+ // All of these directly apply.
+ NewStore->setMetadata(ID, N);
+ break;
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_nonnull:
case LLVMContext::MD_noundef:
- case LLVMContext::MD_range:
- case LLVMContext::MD_align:
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- // These don't apply for stores.
- break;
- }
- }
-
- return NewStore;
-}
-
-/// Returns true if the instruction represents a minmax pattern like:
-/// select ((cmp load V1, load V2), V1, V2).
-static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
- assert(V->getType()->isPointerTy() && "Expected pointer type.");
- // Ignore possible ty* to ixx* bitcast.
+ case LLVMContext::MD_range:
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These don't apply for stores.
+ break;
+ }
+ }
+
+ return NewStore;
+}
+
+/// Returns true if the instruction represents a minmax pattern like:
+/// select ((cmp load V1, load V2), V1, V2).
+static bool isMinMaxWithLoads(Value *V, Type *&LoadTy) {
+ assert(V->getType()->isPointerTy() && "Expected pointer type.");
+ // Ignore possible ty* to ixx* bitcast.
V = InstCombiner::peekThroughBitcast(V);
- // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
- // pattern.
- CmpInst::Predicate Pred;
- Instruction *L1;
- Instruction *L2;
- Value *LHS;
- Value *RHS;
- if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
- m_Value(LHS), m_Value(RHS))))
- return false;
- LoadTy = L1->getType();
- return (match(L1, m_Load(m_Specific(LHS))) &&
- match(L2, m_Load(m_Specific(RHS)))) ||
- (match(L1, m_Load(m_Specific(RHS))) &&
- match(L2, m_Load(m_Specific(LHS))));
-}
-
-/// Combine loads to match the type of their uses' value after looking
-/// through intervening bitcasts.
-///
-/// The core idea here is that if the result of a load is used in an operation,
-/// we should load the type most conducive to that operation. For example, when
-/// loading an integer and converting that immediately to a pointer, we should
-/// instead directly load a pointer.
-///
-/// However, this routine must never change the width of a load or the number of
-/// loads as that would introduce a semantic change. This combine is expected to
-/// be a semantic no-op which just allows loads to more closely model the types
-/// of their consuming operations.
-///
-/// Currently, we also refuse to change the precise type used for an atomic load
-/// or a volatile load. This is debatable, and might be reasonable to change
-/// later. However, it is risky in case some backend or other part of LLVM is
-/// relying on the exact type loaded to select appropriate atomic operations.
+ // Check that select is select ((cmp load V1, load V2), V1, V2) - minmax
+ // pattern.
+ CmpInst::Predicate Pred;
+ Instruction *L1;
+ Instruction *L2;
+ Value *LHS;
+ Value *RHS;
+ if (!match(V, m_Select(m_Cmp(Pred, m_Instruction(L1), m_Instruction(L2)),
+ m_Value(LHS), m_Value(RHS))))
+ return false;
+ LoadTy = L1->getType();
+ return (match(L1, m_Load(m_Specific(LHS))) &&
+ match(L2, m_Load(m_Specific(RHS)))) ||
+ (match(L1, m_Load(m_Specific(RHS))) &&
+ match(L2, m_Load(m_Specific(LHS))));
+}
+
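
As a reading aid for isMinMaxWithLoads() above, the IR shape it accepts looks roughly like the following; the value names are assumptions, not taken from this patch.

  // Illustrative match for isMinMaxWithLoads() (sketch only):
  //
  //   %a   = load i32, i32* %p
  //   %b   = load i32, i32* %q
  //   %cmp = icmp slt i32 %a, %b
  //   %min = select i1 %cmp, i32* %p, i32* %q
  //
  // The select chooses between the two pointers based on a comparison of the
  // values loaded through them; LoadTy is reported as i32 in this example.
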
+/// Combine loads to match the type of their uses' value after looking
+/// through intervening bitcasts.
+///
+/// The core idea here is that if the result of a load is used in an operation,
+/// we should load the type most conducive to that operation. For example, when
+/// loading an integer and converting that immediately to a pointer, we should
+/// instead directly load a pointer.
+///
+/// However, this routine must never change the width of a load or the number of
+/// loads as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows loads to more closely model the types
+/// of their consuming operations.
+///
+/// Currently, we also refuse to change the precise type used for an atomic load
+/// or a volatile load. This is debatable, and might be reasonable to change
+/// later. However, it is risky in case some backend or other part of LLVM is
+/// relying on the exact type loaded to select appropriate atomic operations.
static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
LoadInst &LI) {
- // FIXME: We could probably with some care handle both volatile and ordered
- // atomic loads here but it isn't clear that this is important.
- if (!LI.isUnordered())
- return nullptr;
-
- if (LI.use_empty())
- return nullptr;
-
- // swifterror values can't be bitcasted.
- if (LI.getPointerOperand()->isSwiftError())
- return nullptr;
-
- const DataLayout &DL = IC.getDataLayout();
-
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic loads here but it isn't clear that this is important.
+ if (!LI.isUnordered())
+ return nullptr;
+
+ if (LI.use_empty())
+ return nullptr;
+
+ // swifterror values can't be bitcasted.
+ if (LI.getPointerOperand()->isSwiftError())
+ return nullptr;
+
+ const DataLayout &DL = IC.getDataLayout();
+
// Fold away bit casts of the loaded value by loading the desired type.
// Note that we should not do this for pointer<->integer casts,
// because that would result in type punning.
@@ -597,253 +597,253 @@ static Instruction *combineLoadToOperationType(InstCombinerImpl &IC,
"load from x86_amx* should not happen!");
if (BC->getType()->isX86_AMXTy())
return nullptr;
- }
-
- if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
+ }
+
+ if (auto* CI = dyn_cast<CastInst>(LI.user_back()))
if (CI->isNoopCast(DL) && LI.getType()->isPtrOrPtrVectorTy() ==
CI->getDestTy()->isPtrOrPtrVectorTy())
- if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
- CI->replaceAllUsesWith(NewLoad);
- IC.eraseInstFromFunction(*CI);
- return &LI;
- }
+ if (!LI.isAtomic() || isSupportedAtomicType(CI->getDestTy())) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, CI->getDestTy());
+ CI->replaceAllUsesWith(NewLoad);
+ IC.eraseInstFromFunction(*CI);
+ return &LI;
+ }
}
-
- // FIXME: We should also canonicalize loads of vectors when their elements are
- // cast to other types.
- return nullptr;
-}
-
+
+ // FIXME: We should also canonicalize loads of vectors when their elements are
+ // cast to other types.
+ return nullptr;
+}
+
static Instruction *unpackLoadToAggregate(InstCombinerImpl &IC, LoadInst &LI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // loads here but it isn't clear that this is important.
- if (!LI.isSimple())
- return nullptr;
-
- Type *T = LI.getType();
- if (!T->isAggregateType())
- return nullptr;
-
- StringRef Name = LI.getName();
- assert(LI.getAlignment() && "Alignment must be set at this point");
-
- if (auto *ST = dyn_cast<StructType>(T)) {
- // If the struct only has one element, we unpack.
- auto NumElements = ST->getNumElements();
- if (NumElements == 1) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
- ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
- return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, Name));
- }
-
- // We don't want to break loads with padding here as we'd lose
- // the knowledge that padding exists for the rest of the pipeline.
- const DataLayout &DL = IC.getDataLayout();
- auto *SL = DL.getStructLayout(ST);
- if (SL->hasPadding())
- return nullptr;
-
- const auto Align = LI.getAlign();
- auto *Addr = LI.getPointerOperand();
- auto *IdxType = Type::getInt32Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- Value *V = UndefValue::get(T);
- for (unsigned i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- Name + ".elt");
- auto *L = IC.Builder.CreateAlignedLoad(
- ST->getElementType(i), Ptr,
- commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
- // Propagate AA metadata. It'll still be valid on the narrowed load.
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
- V = IC.Builder.CreateInsertValue(V, L, i);
- }
-
- V->setName(Name);
- return IC.replaceInstUsesWith(LI, V);
- }
-
- if (auto *AT = dyn_cast<ArrayType>(T)) {
- auto *ET = AT->getElementType();
- auto NumElements = AT->getNumElements();
- if (NumElements == 1) {
- LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- NewLoad->setAAMetadata(AAMD);
- return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
- UndefValue::get(T), NewLoad, 0, Name));
- }
-
- // Bail out if the array is too large. Ideally we would like to optimize
- // arrays of arbitrary size but this has a terrible impact on compile time.
- // The threshold here is chosen arbitrarily, maybe needs a little bit of
- // tuning.
- if (NumElements > IC.MaxArraySizeForCombine)
- return nullptr;
-
- const DataLayout &DL = IC.getDataLayout();
- auto EltSize = DL.getTypeAllocSize(ET);
- const auto Align = LI.getAlign();
-
- auto *Addr = LI.getPointerOperand();
- auto *IdxType = Type::getInt64Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- Value *V = UndefValue::get(T);
- uint64_t Offset = 0;
- for (uint64_t i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- Name + ".elt");
- auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
- commonAlignment(Align, Offset),
- Name + ".unpack");
- AAMDNodes AAMD;
- LI.getAAMetadata(AAMD);
- L->setAAMetadata(AAMD);
- V = IC.Builder.CreateInsertValue(V, L, i);
- Offset += EltSize;
- }
-
- V->setName(Name);
- return IC.replaceInstUsesWith(LI, V);
- }
-
- return nullptr;
-}
-
-// If we can determine that all possible objects pointed to by the provided
-// pointer value are not only dereferenceable but also definitively less than
-// or equal to the provided maximum size, then return true. Otherwise, return
-// false (constant global values and allocas fall into this category).
-//
-// FIXME: This should probably live in ValueTracking (or similar).
-static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
- const DataLayout &DL) {
- SmallPtrSet<Value *, 4> Visited;
- SmallVector<Value *, 4> Worklist(1, V);
-
- do {
- Value *P = Worklist.pop_back_val();
- P = P->stripPointerCasts();
-
- if (!Visited.insert(P).second)
- continue;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
- Worklist.push_back(SI->getTrueValue());
- Worklist.push_back(SI->getFalseValue());
- continue;
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(P)) {
+ // FIXME: We could probably with some care handle both volatile and atomic
+ // loads here but it isn't clear that this is important.
+ if (!LI.isSimple())
+ return nullptr;
+
+ Type *T = LI.getType();
+ if (!T->isAggregateType())
+ return nullptr;
+
+ StringRef Name = LI.getName();
+ assert(LI.getAlignment() && "Alignment must be set at this point");
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+ // If the struct only has one element, we unpack.
+ auto NumElements = ST->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, ST->getTypeAtIndex(0U),
+ ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // We don't want to break loads with padding here as we'd lose
+ // the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return nullptr;
+
+ const auto Align = LI.getAlign();
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt32Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ for (unsigned i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(
+ ST->getElementType(i), Ptr,
+ commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
+ // Propagate AA metadata. It'll still be valid on the narrowed load.
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+ auto *ET = AT->getElementType();
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ LoadInst *NewLoad = IC.combineLoadToNewType(LI, ET, ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ NewLoad->setAAMetadata(AAMD);
+ return IC.replaceInstUsesWith(LI, IC.Builder.CreateInsertValue(
+ UndefValue::get(T), NewLoad, 0, Name));
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return nullptr;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(ET);
+ const auto Align = LI.getAlign();
+
+ auto *Addr = LI.getPointerOperand();
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ Value *V = UndefValue::get(T);
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ Name + ".elt");
+ auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
+ commonAlignment(Align, Offset),
+ Name + ".unpack");
+ AAMDNodes AAMD;
+ LI.getAAMetadata(AAMD);
+ L->setAAMetadata(AAMD);
+ V = IC.Builder.CreateInsertValue(V, L, i);
+ Offset += EltSize;
+ }
+
+ V->setName(Name);
+ return IC.replaceInstUsesWith(LI, V);
+ }
+
+ return nullptr;
+}
+
+// If we can determine that all possible objects pointed to by the provided
+// pointer value are not only dereferenceable but also definitively less than
+// or equal to the provided maximum size, then return true. Otherwise, return
+// false (constant global values and allocas fall into this category).
+//
+// FIXME: This should probably live in ValueTracking (or similar).
+static bool isObjectSizeLessThanOrEq(Value *V, uint64_t MaxSize,
+ const DataLayout &DL) {
+ SmallPtrSet<Value *, 4> Visited;
+ SmallVector<Value *, 4> Worklist(1, V);
+
+ do {
+ Value *P = Worklist.pop_back_val();
+ P = P->stripPointerCasts();
+
+ if (!Visited.insert(P).second)
+ continue;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(P)) {
+ Worklist.push_back(SI->getTrueValue());
+ Worklist.push_back(SI->getFalseValue());
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(P)) {
append_range(Worklist, PN->incoming_values());
- continue;
- }
-
- if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
- if (GA->isInterposable())
- return false;
- Worklist.push_back(GA->getAliasee());
- continue;
- }
-
- // If we know how big this object is, and it is less than MaxSize, continue
- // searching. Otherwise, return false.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
- if (!AI->getAllocatedType()->isSized())
- return false;
-
- ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
- if (!CS)
- return false;
-
- uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
- // Make sure that, even if the multiplication below would wrap as a
- // uint64_t, we still do the right thing.
- if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
- return false;
- continue;
- }
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
- return false;
-
- uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
- if (InitSize > MaxSize)
- return false;
- continue;
- }
-
- return false;
- } while (!Worklist.empty());
-
- return true;
-}
-
-// If we're indexing into an object of a known size, and the outer index is
-// not a constant, but having any value but zero would lead to undefined
-// behavior, replace it with zero.
-//
-// For example, if we have:
-// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
-// ...
-// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
-// ... = load i32* %arrayidx, align 4
-// Then we know that we can replace %x in the GEP with i64 0.
-//
-// FIXME: We could fold any GEP index to zero that would cause UB if it were
-// not zero. Currently, we only handle the first such index. Also, we could
-// also search through non-zero constant indices if we kept track of the
-// offsets those indices implied.
+ continue;
+ }
+
+ if (GlobalAlias *GA = dyn_cast<GlobalAlias>(P)) {
+ if (GA->isInterposable())
+ return false;
+ Worklist.push_back(GA->getAliasee());
+ continue;
+ }
+
+ // If we know how big this object is, and it is less than MaxSize, continue
+ // searching. Otherwise, return false.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(P)) {
+ if (!AI->getAllocatedType()->isSized())
+ return false;
+
+ ConstantInt *CS = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (!CS)
+ return false;
+
+ uint64_t TypeSize = DL.getTypeAllocSize(AI->getAllocatedType());
+ // Make sure that, even if the multiplication below would wrap as a
+ // uint64_t, we still do the right thing.
+ if ((CS->getValue().zextOrSelf(128)*APInt(128, TypeSize)).ugt(MaxSize))
+ return false;
+ continue;
+ }
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (!GV->hasDefinitiveInitializer() || !GV->isConstant())
+ return false;
+
+ uint64_t InitSize = DL.getTypeAllocSize(GV->getValueType());
+ if (InitSize > MaxSize)
+ return false;
+ continue;
+ }
+
+ return false;
+ } while (!Worklist.empty());
+
+ return true;
+}
+
+// If we're indexing into an object of a known size, and the outer index is
+// not a constant, but having any value but zero would lead to undefined
+// behavior, replace it with zero.
+//
+// For example, if we have:
+// @f.a = private unnamed_addr constant [1 x i32] [i32 12], align 4
+// ...
+// %arrayidx = getelementptr inbounds [1 x i32]* @f.a, i64 0, i64 %x
+// ... = load i32* %arrayidx, align 4
+// Then we know that we can replace %x in the GEP with i64 0.
+//
+// FIXME: We could fold any GEP index to zero that would cause UB if it were
+// not zero. Currently, we only handle the first such index. We could also
+// search through non-zero constant indices if we kept track of the
+// offsets those indices implied.
static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
GetElementPtrInst *GEPI, Instruction *MemI,
unsigned &Idx) {
- if (GEPI->getNumOperands() < 2)
- return false;
-
- // Find the first non-zero index of a GEP. If all indices are zero, return
- // one past the last index.
- auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
- unsigned I = 1;
- for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
- Value *V = GEPI->getOperand(I);
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
- if (CI->isZero())
- continue;
-
- break;
- }
-
- return I;
- };
-
- // Skip through initial 'zero' indices, and find the corresponding pointer
- // type. See if the next index is not a constant.
- Idx = FirstNZIdx(GEPI);
- if (Idx == GEPI->getNumOperands())
- return false;
- if (isa<Constant>(GEPI->getOperand(Idx)))
- return false;
-
- SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
+ if (GEPI->getNumOperands() < 2)
+ return false;
+
+ // Find the first non-zero index of a GEP. If all indices are zero, return
+ // one past the last index.
+ auto FirstNZIdx = [](const GetElementPtrInst *GEPI) {
+ unsigned I = 1;
+ for (unsigned IE = GEPI->getNumOperands(); I != IE; ++I) {
+ Value *V = GEPI->getOperand(I);
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
+ if (CI->isZero())
+ continue;
+
+ break;
+ }
+
+ return I;
+ };
+
+ // Skip through initial 'zero' indices, and find the corresponding pointer
+ // type. See if the next index is not a constant.
+ Idx = FirstNZIdx(GEPI);
+ if (Idx == GEPI->getNumOperands())
+ return false;
+ if (isa<Constant>(GEPI->getOperand(Idx)))
+ return false;
+
+ SmallVector<Value *, 4> Ops(GEPI->idx_begin(), GEPI->idx_begin() + Idx);
Type *SourceElementType = GEPI->getSourceElementType();
// Size information about scalable vectors is not available, so we cannot
// deduce whether indexing at n is undefined behaviour or not. Bail out.
@@ -851,720 +851,720 @@ static bool canReplaceGEPIdxWithZero(InstCombinerImpl &IC,
return false;
Type *AllocTy = GetElementPtrInst::getIndexedType(SourceElementType, Ops);
- if (!AllocTy || !AllocTy->isSized())
- return false;
- const DataLayout &DL = IC.getDataLayout();
+ if (!AllocTy || !AllocTy->isSized())
+ return false;
+ const DataLayout &DL = IC.getDataLayout();
uint64_t TyAllocSize = DL.getTypeAllocSize(AllocTy).getFixedSize();
-
- // If there are more indices after the one we might replace with a zero, make
- // sure they're all non-negative. If any of them are negative, the overall
- // address being computed might be before the base address determined by the
- // first non-zero index.
- auto IsAllNonNegative = [&]() {
- for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
- KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
- if (Known.isNonNegative())
- continue;
- return false;
- }
-
- return true;
- };
-
- // FIXME: If the GEP is not inbounds, and there are extra indices after the
- // one we'll replace, those could cause the address computation to wrap
- // (rendering the IsAllNonNegative() check below insufficient). We can do
- // better, ignoring zero indices (and other indices we can prove small
- // enough not to wrap).
- if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
- return false;
-
- // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
- // also known to be dereferenceable.
- return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
- IsAllNonNegative();
-}
-
-// If we're indexing into an object with a variable index for the memory
-// access, but the object has only one element, we can assume that the index
-// will always be zero. If we replace the GEP, return it.
-template <typename T>
+
+ // If there are more indices after the one we might replace with a zero, make
+ // sure they're all non-negative. If any of them are negative, the overall
+ // address being computed might be before the base address determined by the
+ // first non-zero index.
+ auto IsAllNonNegative = [&]() {
+ for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) {
+ KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI);
+ if (Known.isNonNegative())
+ continue;
+ return false;
+ }
+
+ return true;
+ };
+
+ // FIXME: If the GEP is not inbounds, and there are extra indices after the
+ // one we'll replace, those could cause the address computation to wrap
+ // (rendering the IsAllNonNegative() check below insufficient). We can do
+ // better, ignoring zero indices (and other indices we can prove small
+ // enough not to wrap).
+ if (Idx+1 != GEPI->getNumOperands() && !GEPI->isInBounds())
+ return false;
+
+ // Note that isObjectSizeLessThanOrEq will return true only if the pointer is
+ // also known to be dereferenceable.
+ return isObjectSizeLessThanOrEq(GEPI->getOperand(0), TyAllocSize, DL) &&
+ IsAllNonNegative();
+}
+
+// If we're indexing into an object with a variable index for the memory
+// access, but the object has only one element, we can assume that the index
+// will always be zero. If we replace the GEP, return it.
+template <typename T>
static Instruction *replaceGEPIdxWithZero(InstCombinerImpl &IC, Value *Ptr,
- T &MemI) {
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
- unsigned Idx;
- if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
- Instruction *NewGEPI = GEPI->clone();
- NewGEPI->setOperand(Idx,
- ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
- NewGEPI->insertBefore(GEPI);
- MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
- return NewGEPI;
- }
- }
-
- return nullptr;
-}
-
-static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
- if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
- return false;
-
- auto *Ptr = SI.getPointerOperand();
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
- Ptr = GEPI->getOperand(0);
- return (isa<ConstantPointerNull>(Ptr) &&
- !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
-}
-
-static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
- if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
- const Value *GEPI0 = GEPI->getOperand(0);
- if (isa<ConstantPointerNull>(GEPI0) &&
- !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
- return true;
- }
- if (isa<UndefValue>(Op) ||
- (isa<ConstantPointerNull>(Op) &&
- !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
- return true;
- return false;
-}
-
+ T &MemI) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr)) {
+ unsigned Idx;
+ if (canReplaceGEPIdxWithZero(IC, GEPI, &MemI, Idx)) {
+ Instruction *NewGEPI = GEPI->clone();
+ NewGEPI->setOperand(Idx,
+ ConstantInt::get(GEPI->getOperand(Idx)->getType(), 0));
+ NewGEPI->insertBefore(GEPI);
+ MemI.setOperand(MemI.getPointerOperandIndex(), NewGEPI);
+ return NewGEPI;
+ }
+ }
+
+ return nullptr;
+}
+
+static bool canSimplifyNullStoreOrGEP(StoreInst &SI) {
+ if (NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()))
+ return false;
+
+ auto *Ptr = SI.getPointerOperand();
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Ptr))
+ Ptr = GEPI->getOperand(0);
+ return (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI.getFunction(), SI.getPointerAddressSpace()));
+}
+
+static bool canSimplifyNullLoadOrGEP(LoadInst &LI, Value *Op) {
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+ const Value *GEPI0 = GEPI->getOperand(0);
+ if (isa<ConstantPointerNull>(GEPI0) &&
+ !NullPointerIsDefined(LI.getFunction(), GEPI->getPointerAddressSpace()))
+ return true;
+ }
+ if (isa<UndefValue>(Op) ||
+ (isa<ConstantPointerNull>(Op) &&
+ !NullPointerIsDefined(LI.getFunction(), LI.getPointerAddressSpace())))
+ return true;
+ return false;
+}
+
Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) {
- Value *Op = LI.getOperand(0);
-
- // Try to canonicalize the loaded type.
- if (Instruction *Res = combineLoadToOperationType(*this, LI))
- return Res;
-
- // Attempt to improve the alignment.
- Align KnownAlign = getOrEnforceKnownAlignment(
- Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
- if (KnownAlign > LI.getAlign())
- LI.setAlignment(KnownAlign);
-
- // Replace GEP indices if possible.
- if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
- Worklist.push(NewGEPI);
- return &LI;
- }
-
- if (Instruction *Res = unpackLoadToAggregate(*this, LI))
- return Res;
-
- // Do really simple store-to-load forwarding and load CSE, to catch cases
- // where there are several consecutive memory accesses to the same location,
- // separated by a few arithmetic operations.
- BasicBlock::iterator BBI(LI);
- bool IsLoadCSE = false;
- if (Value *AvailableVal = FindAvailableLoadedValue(
- &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
- if (IsLoadCSE)
- combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
-
- return replaceInstUsesWith(
- LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
- LI.getName() + ".cast"));
- }
-
- // None of the following transforms are legal for volatile/ordered atomic
- // loads. Most of them do apply for unordered atomics.
- if (!LI.isUnordered()) return nullptr;
-
- // load(gep null, ...) -> unreachable
- // load null/undef -> unreachable
-  // TODO: Consider a target hook for valid address spaces for these transforms.
- if (canSimplifyNullLoadOrGEP(LI, Op)) {
- // Insert a new store to null instruction before the load to indicate
- // that this code is not reachable. We do this instead of inserting
- // an unreachable instruction directly because we cannot modify the
- // CFG.
- StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
- Constant::getNullValue(Op->getType()), &LI);
- SI->setDebugLoc(LI.getDebugLoc());
- return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
- }
-
- if (Op->hasOneUse()) {
- // Change select and PHI nodes to select values instead of addresses: this
-    // helps alias analysis out a lot, allows many other simplifications, and
- // exposes redundancy in the code.
- //
- // Note that we cannot do the transformation unless we know that the
- // introduced loads cannot trap! Something like this is valid as long as
- // the condition is always false: load (select bool %C, int* null, int* %G),
- // but it would not be valid if we transformed it to load from null
- // unconditionally.
- //
- if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
- // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
- Align Alignment = LI.getAlign();
- if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
- Alignment, DL, SI) &&
- isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
- Alignment, DL, SI)) {
- LoadInst *V1 =
- Builder.CreateLoad(LI.getType(), SI->getOperand(1),
- SI->getOperand(1)->getName() + ".val");
- LoadInst *V2 =
- Builder.CreateLoad(LI.getType(), SI->getOperand(2),
- SI->getOperand(2)->getName() + ".val");
- assert(LI.isUnordered() && "implied by above");
- V1->setAlignment(Alignment);
- V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- V2->setAlignment(Alignment);
- V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- return SelectInst::Create(SI->getCondition(), V1, V2);
- }
-
- // load (select (cond, null, P)) -> load P
- if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
- !NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace()))
- return replaceOperand(LI, 0, SI->getOperand(2));
-
- // load (select (cond, P, null)) -> load P
- if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
- !NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace()))
- return replaceOperand(LI, 0, SI->getOperand(1));
- }
- }
- return nullptr;
-}
-
-/// Look for extractelement/insertvalue sequence that acts like a bitcast.
-///
-/// \returns underlying value that was "cast", or nullptr otherwise.
-///
-/// For example, if we have:
-///
-/// %E0 = extractelement <2 x double> %U, i32 0
-/// %V0 = insertvalue [2 x double] undef, double %E0, 0
-/// %E1 = extractelement <2 x double> %U, i32 1
-/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
-///
-/// and the layout of a <2 x double> is isomorphic to a [2 x double],
-/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
-/// Note that %U may contain non-undef values where %V1 has undef.
+ Value *Op = LI.getOperand(0);
+
+ // Try to canonicalize the loaded type.
+ if (Instruction *Res = combineLoadToOperationType(*this, LI))
+ return Res;
+
+ // Attempt to improve the alignment.
+ Align KnownAlign = getOrEnforceKnownAlignment(
+ Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
+ if (KnownAlign > LI.getAlign())
+ LI.setAlignment(KnownAlign);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
+ Worklist.push(NewGEPI);
+ return &LI;
+ }
+
+ if (Instruction *Res = unpackLoadToAggregate(*this, LI))
+ return Res;
+
+ // Do really simple store-to-load forwarding and load CSE, to catch cases
+ // where there are several consecutive memory accesses to the same location,
+ // separated by a few arithmetic operations.
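+  //
+  // A minimal sketch of the kind of pattern caught here (illustrative only):
+  //   store i32 %x, i32* %p
+  //   %t = add i32 %y, 1        ; unrelated arithmetic, no memory access
+  //   %v = load i32, i32* %p    ; %v can be replaced by %x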
+ BasicBlock::iterator BBI(LI);
+ bool IsLoadCSE = false;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ &LI, LI.getParent(), BBI, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ if (IsLoadCSE)
+ combineMetadataForCSE(cast<LoadInst>(AvailableVal), &LI, false);
+
+ return replaceInstUsesWith(
+ LI, Builder.CreateBitOrPointerCast(AvailableVal, LI.getType(),
+ LI.getName() + ".cast"));
+ }
+
+ // None of the following transforms are legal for volatile/ordered atomic
+ // loads. Most of them do apply for unordered atomics.
+ if (!LI.isUnordered()) return nullptr;
+
+ // load(gep null, ...) -> unreachable
+ // load null/undef -> unreachable
+  // TODO: Consider a target hook for valid address spaces for these transforms.
+ if (canSimplifyNullLoadOrGEP(LI, Op)) {
+ // Insert a new store to null instruction before the load to indicate
+ // that this code is not reachable. We do this instead of inserting
+ // an unreachable instruction directly because we cannot modify the
+ // CFG.
+ StoreInst *SI = new StoreInst(UndefValue::get(LI.getType()),
+ Constant::getNullValue(Op->getType()), &LI);
+ SI->setDebugLoc(LI.getDebugLoc());
+ return replaceInstUsesWith(LI, UndefValue::get(LI.getType()));
+ }
+
+ if (Op->hasOneUse()) {
+ // Change select and PHI nodes to select values instead of addresses: this
+    // helps alias analysis out a lot, allows many other simplifications, and
+ // exposes redundancy in the code.
+ //
+ // Note that we cannot do the transformation unless we know that the
+ // introduced loads cannot trap! Something like this is valid as long as
+ // the condition is always false: load (select bool %C, int* null, int* %G),
+ // but it would not be valid if we transformed it to load from null
+ // unconditionally.
+ //
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
+ // load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
+ Align Alignment = LI.getAlign();
+ if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
+ Alignment, DL, SI) &&
+ isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
+ Alignment, DL, SI)) {
+ LoadInst *V1 =
+ Builder.CreateLoad(LI.getType(), SI->getOperand(1),
+ SI->getOperand(1)->getName() + ".val");
+ LoadInst *V2 =
+ Builder.CreateLoad(LI.getType(), SI->getOperand(2),
+ SI->getOperand(2)->getName() + ".val");
+ assert(LI.isUnordered() && "implied by above");
+ V1->setAlignment(Alignment);
+ V1->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ V2->setAlignment(Alignment);
+ V2->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ return SelectInst::Create(SI->getCondition(), V1, V2);
+ }
+
+ // load (select (cond, null, P)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(2));
+
+ // load (select (cond, P, null)) -> load P
+ if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(1));
+ }
+ }
+ return nullptr;
+}
+
+/// Look for extractelement/insertvalue sequence that acts like a bitcast.
+///
+/// \returns underlying value that was "cast", or nullptr otherwise.
+///
+/// For example, if we have:
+///
+/// %E0 = extractelement <2 x double> %U, i32 0
+/// %V0 = insertvalue [2 x double] undef, double %E0, 0
+/// %E1 = extractelement <2 x double> %U, i32 1
+/// %V1 = insertvalue [2 x double] %V0, double %E1, 1
+///
+/// and the layout of a <2 x double> is isomorphic to a [2 x double],
+/// then %V1 can be safely approximated by a conceptual "bitcast" of %U.
+/// Note that %U may contain non-undef values where %V1 has undef.
static Value *likeBitCastFromVector(InstCombinerImpl &IC, Value *V) {
- Value *U = nullptr;
- while (auto *IV = dyn_cast<InsertValueInst>(V)) {
- auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
- if (!E)
- return nullptr;
- auto *W = E->getVectorOperand();
- if (!U)
- U = W;
- else if (U != W)
- return nullptr;
- auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
- if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
- return nullptr;
- V = IV->getAggregateOperand();
- }
- if (!isa<UndefValue>(V) ||!U)
- return nullptr;
-
- auto *UT = cast<VectorType>(U->getType());
- auto *VT = V->getType();
- // Check that types UT and VT are bitwise isomorphic.
- const auto &DL = IC.getDataLayout();
- if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
- return nullptr;
- }
- if (auto *AT = dyn_cast<ArrayType>(VT)) {
+ Value *U = nullptr;
+ while (auto *IV = dyn_cast<InsertValueInst>(V)) {
+ auto *E = dyn_cast<ExtractElementInst>(IV->getInsertedValueOperand());
+ if (!E)
+ return nullptr;
+ auto *W = E->getVectorOperand();
+ if (!U)
+ U = W;
+ else if (U != W)
+ return nullptr;
+ auto *CI = dyn_cast<ConstantInt>(E->getIndexOperand());
+ if (!CI || IV->getNumIndices() != 1 || CI->getZExtValue() != *IV->idx_begin())
+ return nullptr;
+ V = IV->getAggregateOperand();
+ }
+ if (!isa<UndefValue>(V) ||!U)
+ return nullptr;
+
+ auto *UT = cast<VectorType>(U->getType());
+ auto *VT = V->getType();
+ // Check that types UT and VT are bitwise isomorphic.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(UT) != DL.getTypeStoreSizeInBits(VT)) {
+ return nullptr;
+ }
+ if (auto *AT = dyn_cast<ArrayType>(VT)) {
if (AT->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
- return nullptr;
- } else {
- auto *ST = cast<StructType>(VT);
+ return nullptr;
+ } else {
+ auto *ST = cast<StructType>(VT);
if (ST->getNumElements() != cast<FixedVectorType>(UT)->getNumElements())
- return nullptr;
- for (const auto *EltT : ST->elements()) {
- if (EltT != UT->getElementType())
- return nullptr;
- }
- }
- return U;
-}
-
-/// Combine stores to match the type of value being stored.
-///
-/// The core idea here is that the memory does not have any intrinsic type and
-/// where we can we should match the type of a store to the type of value being
-/// stored.
-///
-/// However, this routine must never change the width of a store or the number of
-/// stores as that would introduce a semantic change. This combine is expected to
-/// be a semantic no-op which just allows stores to more closely model the types
-/// of their incoming values.
-///
-/// Currently, we also refuse to change the precise type used for an atomic or
-/// volatile store. This is debatable, and might be reasonable to change later.
-/// However, it is risky in case some backend or other part of LLVM is relying
-/// on the exact type stored to select appropriate atomic operations.
-///
-/// \returns true if the store was successfully combined away. This indicates
-/// the caller must erase the store instruction. We have to let the caller erase
-/// the store instruction as otherwise there is no way to signal whether it was
-/// combined or not: IC.EraseInstFromFunction returns a null pointer.
+ return nullptr;
+ for (const auto *EltT : ST->elements()) {
+ if (EltT != UT->getElementType())
+ return nullptr;
+ }
+ }
+ return U;
+}
+
+/// Combine stores to match the type of value being stored.
+///
+/// The core idea here is that the memory does not have any intrinsic type and
+/// where we can we should match the type of a store to the type of value being
+/// stored.
+///
+/// However, this routine must never change the width of a store or the number of
+/// stores as that would introduce a semantic change. This combine is expected to
+/// be a semantic no-op which just allows stores to more closely model the types
+/// of their incoming values.
+///
+/// Currently, we also refuse to change the precise type used for an atomic or
+/// volatile store. This is debatable, and might be reasonable to change later.
+/// However, it is risky in case some backend or other part of LLVM is relying
+/// on the exact type stored to select appropriate atomic operations.
+///
+/// \returns true if the store was successfully combined away. This indicates
+/// the caller must erase the store instruction. We have to let the caller erase
+/// the store instruction as otherwise there is no way to signal whether it was
+/// combined or not: IC.EraseInstFromFunction returns a null pointer.
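+///
+/// A minimal sketch (illustrative only): a store of "bitcast i32 %x to float"
+/// through a float* pointer is rewritten to store the original i32 %x through
+/// a correspondingly cast pointer, leaving width and store count unchanged.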
static bool combineStoreToValueType(InstCombinerImpl &IC, StoreInst &SI) {
- // FIXME: We could probably with some care handle both volatile and ordered
- // atomic stores here but it isn't clear that this is important.
- if (!SI.isUnordered())
- return false;
-
- // swifterror values can't be bitcasted.
- if (SI.getPointerOperand()->isSwiftError())
- return false;
-
- Value *V = SI.getValueOperand();
-
- // Fold away bit casts of the stored value by storing the original type.
- if (auto *BC = dyn_cast<BitCastInst>(V)) {
+ // FIXME: We could probably with some care handle both volatile and ordered
+ // atomic stores here but it isn't clear that this is important.
+ if (!SI.isUnordered())
+ return false;
+
+ // swifterror values can't be bitcasted.
+ if (SI.getPointerOperand()->isSwiftError())
+ return false;
+
+ Value *V = SI.getValueOperand();
+
+ // Fold away bit casts of the stored value by storing the original type.
+ if (auto *BC = dyn_cast<BitCastInst>(V)) {
assert(!BC->getType()->isX86_AMXTy() &&
"store to x86_amx* should not happen!");
- V = BC->getOperand(0);
+ V = BC->getOperand(0);
// Don't transform when the type is x86_amx, it makes the pass that lower
// x86_amx type happy.
if (V->getType()->isX86_AMXTy())
return false;
- if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
- }
-
- if (Value *U = likeBitCastFromVector(IC, V))
- if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
- combineStoreToNewValue(IC, SI, U);
- return true;
- }
-
- // FIXME: We should also canonicalize stores of vectors when their elements
- // are cast to other types.
- return false;
-}
-
+ if (!SI.isAtomic() || isSupportedAtomicType(V->getType())) {
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+ }
+
+ if (Value *U = likeBitCastFromVector(IC, V))
+ if (!SI.isAtomic() || isSupportedAtomicType(U->getType())) {
+ combineStoreToNewValue(IC, SI, U);
+ return true;
+ }
+
+ // FIXME: We should also canonicalize stores of vectors when their elements
+ // are cast to other types.
+ return false;
+}
+
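+/// Break a store of an aggregate value into per-element stores. This handles
+/// single-element structs and arrays directly, padding-free structs, and
+/// arrays of at most MaxArraySizeForCombine elements; anything else is left
+/// unchanged.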
static bool unpackStoreToAggregate(InstCombinerImpl &IC, StoreInst &SI) {
- // FIXME: We could probably with some care handle both volatile and atomic
- // stores here but it isn't clear that this is important.
- if (!SI.isSimple())
- return false;
-
- Value *V = SI.getValueOperand();
- Type *T = V->getType();
-
- if (!T->isAggregateType())
- return false;
-
- if (auto *ST = dyn_cast<StructType>(T)) {
-    // If the struct has only one element, we unpack.
- unsigned Count = ST->getNumElements();
- if (Count == 1) {
- V = IC.Builder.CreateExtractValue(V, 0);
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
-
-    // We don't want to break up stores with padding here, as we'd lose
- // the knowledge that padding exists for the rest of the pipeline.
- const DataLayout &DL = IC.getDataLayout();
- auto *SL = DL.getStructLayout(ST);
- if (SL->hasPadding())
- return false;
-
- const auto Align = SI.getAlign();
-
- SmallString<16> EltName = V->getName();
- EltName += ".elt";
- auto *Addr = SI.getPointerOperand();
- SmallString<16> AddrName = Addr->getName();
- AddrName += ".repack";
-
- auto *IdxType = Type::getInt32Ty(ST->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
- for (unsigned i = 0; i < Count; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
- llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
- }
-
- return true;
- }
-
- if (auto *AT = dyn_cast<ArrayType>(T)) {
-    // If the array has only one element, we unpack.
- auto NumElements = AT->getNumElements();
- if (NumElements == 1) {
- V = IC.Builder.CreateExtractValue(V, 0);
- combineStoreToNewValue(IC, SI, V);
- return true;
- }
-
- // Bail out if the array is too large. Ideally we would like to optimize
- // arrays of arbitrary size but this has a terrible impact on compile time.
- // The threshold here is chosen arbitrarily, maybe needs a little bit of
- // tuning.
- if (NumElements > IC.MaxArraySizeForCombine)
- return false;
-
- const DataLayout &DL = IC.getDataLayout();
- auto EltSize = DL.getTypeAllocSize(AT->getElementType());
- const auto Align = SI.getAlign();
-
- SmallString<16> EltName = V->getName();
- EltName += ".elt";
- auto *Addr = SI.getPointerOperand();
- SmallString<16> AddrName = Addr->getName();
- AddrName += ".repack";
-
- auto *IdxType = Type::getInt64Ty(T->getContext());
- auto *Zero = ConstantInt::get(IdxType, 0);
-
- uint64_t Offset = 0;
- for (uint64_t i = 0; i < NumElements; i++) {
- Value *Indices[2] = {
- Zero,
- ConstantInt::get(IdxType, i),
- };
- auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
- AddrName);
- auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = commonAlignment(Align, Offset);
- Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
- AAMDNodes AAMD;
- SI.getAAMetadata(AAMD);
- NS->setAAMetadata(AAMD);
- Offset += EltSize;
- }
-
- return true;
- }
-
- return false;
-}
-
-/// equivalentAddressValues - Test if A and B will obviously have the same
-/// value. This includes recognizing that %t0 and %t1 will have the same
-/// value in code like this:
-/// %t0 = getelementptr \@a, 0, 3
-/// store i32 0, i32* %t0
-/// %t1 = getelementptr \@a, 0, 3
-/// %t2 = load i32* %t1
-///
-static bool equivalentAddressValues(Value *A, Value *B) {
- // Test if the values are trivially equivalent.
- if (A == B) return true;
-
-  // Test if the values come from identical arithmetic instructions.
-  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
-  // it's only used to compare two uses within the same basic block, which
- // means that they'll always either have the same value or one of them
- // will have an undefined value.
- if (isa<BinaryOperator>(A) ||
- isa<CastInst>(A) ||
- isa<PHINode>(A) ||
- isa<GetElementPtrInst>(A))
- if (Instruction *BI = dyn_cast<Instruction>(B))
- if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
- return true;
-
- // Otherwise they may not be equivalent.
- return false;
-}
-
-/// Converts store (bitcast (load (bitcast (select ...)))) to
-/// store (load (select ...)), where select is minmax:
-/// select ((cmp load V1, load V2), V1, V2).
+ // FIXME: We could probably with some care handle both volatile and atomic
+ // stores here but it isn't clear that this is important.
+ if (!SI.isSimple())
+ return false;
+
+ Value *V = SI.getValueOperand();
+ Type *T = V->getType();
+
+ if (!T->isAggregateType())
+ return false;
+
+ if (auto *ST = dyn_cast<StructType>(T)) {
+    // If the struct has only one element, we unpack.
+ unsigned Count = ST->getNumElements();
+ if (Count == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+    // We don't want to break up stores with padding here, as we'd lose
+ // the knowledge that padding exists for the rest of the pipeline.
+ const DataLayout &DL = IC.getDataLayout();
+ auto *SL = DL.getStructLayout(ST);
+ if (SL->hasPadding())
+ return false;
+
+ const auto Align = SI.getAlign();
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt32Ty(ST->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+ for (unsigned i = 0; i < Count; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
+ llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ }
+
+ return true;
+ }
+
+ if (auto *AT = dyn_cast<ArrayType>(T)) {
+    // If the array has only one element, we unpack.
+ auto NumElements = AT->getNumElements();
+ if (NumElements == 1) {
+ V = IC.Builder.CreateExtractValue(V, 0);
+ combineStoreToNewValue(IC, SI, V);
+ return true;
+ }
+
+ // Bail out if the array is too large. Ideally we would like to optimize
+ // arrays of arbitrary size but this has a terrible impact on compile time.
+ // The threshold here is chosen arbitrarily, maybe needs a little bit of
+ // tuning.
+ if (NumElements > IC.MaxArraySizeForCombine)
+ return false;
+
+ const DataLayout &DL = IC.getDataLayout();
+ auto EltSize = DL.getTypeAllocSize(AT->getElementType());
+ const auto Align = SI.getAlign();
+
+ SmallString<16> EltName = V->getName();
+ EltName += ".elt";
+ auto *Addr = SI.getPointerOperand();
+ SmallString<16> AddrName = Addr->getName();
+ AddrName += ".repack";
+
+ auto *IdxType = Type::getInt64Ty(T->getContext());
+ auto *Zero = ConstantInt::get(IdxType, 0);
+
+ uint64_t Offset = 0;
+ for (uint64_t i = 0; i < NumElements; i++) {
+ Value *Indices[2] = {
+ Zero,
+ ConstantInt::get(IdxType, i),
+ };
+ auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
+ AddrName);
+ auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
+ auto EltAlign = commonAlignment(Align, Offset);
+ Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
+ AAMDNodes AAMD;
+ SI.getAAMetadata(AAMD);
+ NS->setAAMetadata(AAMD);
+ Offset += EltSize;
+ }
+
+ return true;
+ }
+
+ return false;
+}
+
+/// equivalentAddressValues - Test if A and B will obviously have the same
+/// value. This includes recognizing that %t0 and %t1 will have the same
+/// value in code like this:
+/// %t0 = getelementptr \@a, 0, 3
+/// store i32 0, i32* %t0
+/// %t1 = getelementptr \@a, 0, 3
+/// %t2 = load i32* %t1
+///
+static bool equivalentAddressValues(Value *A, Value *B) {
+ // Test if the values are trivially equivalent.
+ if (A == B) return true;
+
+  // Test if the values come from identical arithmetic instructions.
+  // This uses isIdenticalToWhenDefined instead of isIdenticalTo because
+  // it's only used to compare two uses within the same basic block, which
+ // means that they'll always either have the same value or one of them
+ // will have an undefined value.
+ if (isa<BinaryOperator>(A) ||
+ isa<CastInst>(A) ||
+ isa<PHINode>(A) ||
+ isa<GetElementPtrInst>(A))
+ if (Instruction *BI = dyn_cast<Instruction>(B))
+ if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI))
+ return true;
+
+ // Otherwise they may not be equivalent.
+ return false;
+}
+
+/// Converts store (bitcast (load (bitcast (select ...)))) to
+/// store (load (select ...)), where select is minmax:
+/// select ((cmp load V1, load V2), V1, V2).
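+///
+/// A rough sketch (illustrative only): a copy performed through i64-typed
+/// bitcasts of a pointer that actually feeds a float min/max is rewritten to
+/// load and store the float type directly, dropping the intermediate bitcasts.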
static bool removeBitcastsFromLoadStoreOnMinMax(InstCombinerImpl &IC,
- StoreInst &SI) {
- // bitcast?
- if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
- return false;
- // load? integer?
- Value *LoadAddr;
- if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
- return false;
- auto *LI = cast<LoadInst>(SI.getValueOperand());
- if (!LI->getType()->isIntegerTy())
- return false;
- Type *CmpLoadTy;
- if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
- return false;
-
- // Make sure the type would actually change.
- // This condition can be hit with chains of bitcasts.
- if (LI->getType() == CmpLoadTy)
- return false;
-
- // Make sure we're not changing the size of the load/store.
- const auto &DL = IC.getDataLayout();
- if (DL.getTypeStoreSizeInBits(LI->getType()) !=
- DL.getTypeStoreSizeInBits(CmpLoadTy))
- return false;
-
- if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
- auto *SI = dyn_cast<StoreInst>(U);
- return SI && SI->getPointerOperand() != LI &&
+ StoreInst &SI) {
+ // bitcast?
+ if (!match(SI.getPointerOperand(), m_BitCast(m_Value())))
+ return false;
+ // load? integer?
+ Value *LoadAddr;
+ if (!match(SI.getValueOperand(), m_Load(m_BitCast(m_Value(LoadAddr)))))
+ return false;
+ auto *LI = cast<LoadInst>(SI.getValueOperand());
+ if (!LI->getType()->isIntegerTy())
+ return false;
+ Type *CmpLoadTy;
+ if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
+ return false;
+
+ // Make sure the type would actually change.
+ // This condition can be hit with chains of bitcasts.
+ if (LI->getType() == CmpLoadTy)
+ return false;
+
+ // Make sure we're not changing the size of the load/store.
+ const auto &DL = IC.getDataLayout();
+ if (DL.getTypeStoreSizeInBits(LI->getType()) !=
+ DL.getTypeStoreSizeInBits(CmpLoadTy))
+ return false;
+
+ if (!all_of(LI->users(), [LI, LoadAddr](User *U) {
+ auto *SI = dyn_cast<StoreInst>(U);
+ return SI && SI->getPointerOperand() != LI &&
InstCombiner::peekThroughBitcast(SI->getPointerOperand()) !=
LoadAddr &&
- !SI->getPointerOperand()->isSwiftError();
- }))
- return false;
-
- IC.Builder.SetInsertPoint(LI);
- LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
- // Replace all the stores with stores of the newly loaded value.
- for (auto *UI : LI->users()) {
- auto *USI = cast<StoreInst>(UI);
- IC.Builder.SetInsertPoint(USI);
- combineStoreToNewValue(IC, *USI, NewLI);
- }
- IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
- IC.eraseInstFromFunction(*LI);
- return true;
-}
-
+ !SI->getPointerOperand()->isSwiftError();
+ }))
+ return false;
+
+ IC.Builder.SetInsertPoint(LI);
+ LoadInst *NewLI = IC.combineLoadToNewType(*LI, CmpLoadTy);
+ // Replace all the stores with stores of the newly loaded value.
+ for (auto *UI : LI->users()) {
+ auto *USI = cast<StoreInst>(UI);
+ IC.Builder.SetInsertPoint(USI);
+ combineStoreToNewValue(IC, *USI, NewLI);
+ }
+ IC.replaceInstUsesWith(*LI, UndefValue::get(LI->getType()));
+ IC.eraseInstFromFunction(*LI);
+ return true;
+}
+
Instruction *InstCombinerImpl::visitStoreInst(StoreInst &SI) {
- Value *Val = SI.getOperand(0);
- Value *Ptr = SI.getOperand(1);
-
- // Try to canonicalize the stored type.
- if (combineStoreToValueType(*this, SI))
- return eraseInstFromFunction(SI);
-
- // Attempt to improve the alignment.
- const Align KnownAlign = getOrEnforceKnownAlignment(
- Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
- if (KnownAlign > SI.getAlign())
- SI.setAlignment(KnownAlign);
-
- // Try to canonicalize the stored type.
- if (unpackStoreToAggregate(*this, SI))
- return eraseInstFromFunction(SI);
-
- if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
- return eraseInstFromFunction(SI);
-
- // Replace GEP indices if possible.
- if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
- Worklist.push(NewGEPI);
- return &SI;
- }
-
- // Don't hack volatile/ordered stores.
- // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
- if (!SI.isUnordered()) return nullptr;
-
- // If the RHS is an alloca with a single use, zapify the store, making the
- // alloca dead.
- if (Ptr->hasOneUse()) {
- if (isa<AllocaInst>(Ptr))
- return eraseInstFromFunction(SI);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (isa<AllocaInst>(GEP->getOperand(0))) {
- if (GEP->getOperand(0)->hasOneUse())
- return eraseInstFromFunction(SI);
- }
- }
- }
-
- // If we have a store to a location which is known constant, we can conclude
- // that the store must be storing the constant value (else the memory
- // wouldn't be constant), and this must be a noop.
- if (AA->pointsToConstantMemory(Ptr))
- return eraseInstFromFunction(SI);
-
- // Do really simple DSE, to catch cases where there are several consecutive
- // stores to the same location, separated by a few arithmetic operations. This
- // situation often occurs with bitfield accesses.
- BasicBlock::iterator BBI(SI);
- for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
- --ScanInsts) {
- --BBI;
- // Don't count debug info directives, lest they affect codegen,
- // and we skip pointer-to-pointer bitcasts, which are NOPs.
- if (isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
- ScanInsts++;
- continue;
- }
-
- if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
- // Prev store isn't volatile, and stores to the same location?
- if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
- SI.getOperand(1))) {
- ++NumDeadStore;
- // Manually add back the original store to the worklist now, so it will
- // be processed after the operands of the removed store, as this may
- // expose additional DSE opportunities.
- Worklist.push(&SI);
- eraseInstFromFunction(*PrevSI);
- return nullptr;
- }
- break;
- }
-
- // If this is a load, we have to stop. However, if the loaded value is from
- // the pointer we're loading and is producing the pointer we're storing,
- // then *this* store is dead (X = load P; store X -> P).
- if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
- if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
- assert(SI.isUnordered() && "can't eliminate ordering operation");
- return eraseInstFromFunction(SI);
- }
-
- // Otherwise, this is a load from some other location. Stores before it
- // may not be dead.
- break;
- }
-
- // Don't skip over loads, throws or things that can modify memory.
- if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
- break;
- }
-
- // store X, null -> turns into 'unreachable' in SimplifyCFG
- // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
- if (canSimplifyNullStoreOrGEP(SI)) {
- if (!isa<UndefValue>(Val))
- return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
- return nullptr; // Do not modify these!
- }
-
- // store undef, Ptr -> noop
- if (isa<UndefValue>(Val))
- return eraseInstFromFunction(SI);
-
- return nullptr;
-}
-
-/// Try to transform:
-/// if () { *P = v1; } else { *P = v2 }
-/// or:
-/// *P = v1; if () { *P = v2; }
-/// into a phi node with a store in the successor.
+ Value *Val = SI.getOperand(0);
+ Value *Ptr = SI.getOperand(1);
+
+ // Try to canonicalize the stored type.
+ if (combineStoreToValueType(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Attempt to improve the alignment.
+ const Align KnownAlign = getOrEnforceKnownAlignment(
+ Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
+ if (KnownAlign > SI.getAlign())
+ SI.setAlignment(KnownAlign);
+
+ // Try to canonicalize the stored type.
+ if (unpackStoreToAggregate(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ if (removeBitcastsFromLoadStoreOnMinMax(*this, SI))
+ return eraseInstFromFunction(SI);
+
+ // Replace GEP indices if possible.
+ if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
+ Worklist.push(NewGEPI);
+ return &SI;
+ }
+
+ // Don't hack volatile/ordered stores.
+ // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
+ if (!SI.isUnordered()) return nullptr;
+
+ // If the RHS is an alloca with a single use, zapify the store, making the
+ // alloca dead.
+ if (Ptr->hasOneUse()) {
+ if (isa<AllocaInst>(Ptr))
+ return eraseInstFromFunction(SI);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (isa<AllocaInst>(GEP->getOperand(0))) {
+ if (GEP->getOperand(0)->hasOneUse())
+ return eraseInstFromFunction(SI);
+ }
+ }
+ }
+
+ // If we have a store to a location which is known constant, we can conclude
+ // that the store must be storing the constant value (else the memory
+ // wouldn't be constant), and this must be a noop.
+ if (AA->pointsToConstantMemory(Ptr))
+ return eraseInstFromFunction(SI);
+
+ // Do really simple DSE, to catch cases where there are several consecutive
+ // stores to the same location, separated by a few arithmetic operations. This
+ // situation often occurs with bitfield accesses.
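+  //
+  // A minimal sketch (illustrative only):
+  //   store i32 %a, i32* %p
+  //   %b = or i32 %a, 255       ; bitfield-style update, no memory access
+  //   store i32 %b, i32* %p     ; the earlier store is dead and gets erased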
+ BasicBlock::iterator BBI(SI);
+ for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
+ --ScanInsts) {
+ --BBI;
+ // Don't count debug info directives, lest they affect codegen,
+ // and we skip pointer-to-pointer bitcasts, which are NOPs.
+ if (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ ScanInsts++;
+ continue;
+ }
+
+ if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
+ // Prev store isn't volatile, and stores to the same location?
+ if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
+ SI.getOperand(1))) {
+ ++NumDeadStore;
+ // Manually add back the original store to the worklist now, so it will
+ // be processed after the operands of the removed store, as this may
+ // expose additional DSE opportunities.
+ Worklist.push(&SI);
+ eraseInstFromFunction(*PrevSI);
+ return nullptr;
+ }
+ break;
+ }
+
+ // If this is a load, we have to stop. However, if the loaded value is from
+ // the pointer we're loading and is producing the pointer we're storing,
+ // then *this* store is dead (X = load P; store X -> P).
+ if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr)) {
+ assert(SI.isUnordered() && "can't eliminate ordering operation");
+ return eraseInstFromFunction(SI);
+ }
+
+ // Otherwise, this is a load from some other location. Stores before it
+ // may not be dead.
+ break;
+ }
+
+ // Don't skip over loads, throws or things that can modify memory.
+ if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory() || BBI->mayThrow())
+ break;
+ }
+
+ // store X, null -> turns into 'unreachable' in SimplifyCFG
+ // store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
+ if (canSimplifyNullStoreOrGEP(SI)) {
+ if (!isa<UndefValue>(Val))
+ return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
+ return nullptr; // Do not modify these!
+ }
+
+ // store undef, Ptr -> noop
+ if (isa<UndefValue>(Val))
+ return eraseInstFromFunction(SI);
+
+ return nullptr;
+}
+
+/// Try to transform:
+/// if () { *P = v1; } else { *P = v2 }
+/// or:
+/// *P = v1; if () { *P = v2; }
+/// into a phi node with a store in the successor.
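+/// The merged result looks roughly like (sketch, not taken from real output):
+///   %storemerge = phi i32 [ %v1, %ThenBB ], [ %v2, %ElseBB ]
+///   store i32 %storemerge, i32* %P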
bool InstCombinerImpl::mergeStoreIntoSuccessor(StoreInst &SI) {
- if (!SI.isUnordered())
- return false; // This code has not been audited for volatile/ordered case.
-
- // Check if the successor block has exactly 2 incoming edges.
- BasicBlock *StoreBB = SI.getParent();
- BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
- if (!DestBB->hasNPredecessors(2))
- return false;
-
- // Capture the other block (the block that doesn't contain our store).
- pred_iterator PredIter = pred_begin(DestBB);
- if (*PredIter == StoreBB)
- ++PredIter;
- BasicBlock *OtherBB = *PredIter;
-
-  // Bail out if the relevant blocks aren't all distinct. This can happen,
- // for example, if SI is in an infinite loop.
- if (StoreBB == DestBB || OtherBB == DestBB)
- return false;
-
- // Verify that the other block ends in a branch and is not otherwise empty.
- BasicBlock::iterator BBI(OtherBB->getTerminator());
- BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
- if (!OtherBr || BBI == OtherBB->begin())
- return false;
-
- // If the other block ends in an unconditional branch, check for the 'if then
- // else' case. There is an instruction before the branch.
- StoreInst *OtherStore = nullptr;
- if (OtherBr->isUnconditional()) {
- --BBI;
- // Skip over debugging info.
- while (isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
- if (BBI==OtherBB->begin())
- return false;
- --BBI;
- }
- // If this isn't a store, isn't a store to the same location, or is not the
- // right kind of store, bail out.
- OtherStore = dyn_cast<StoreInst>(BBI);
- if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
- !SI.isSameOperationAs(OtherStore))
- return false;
- } else {
- // Otherwise, the other block ended with a conditional branch. If one of the
- // destinations is StoreBB, then we have the if/then case.
- if (OtherBr->getSuccessor(0) != StoreBB &&
- OtherBr->getSuccessor(1) != StoreBB)
- return false;
-
- // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
- // if/then triangle. See if there is a store to the same ptr as SI that
- // lives in OtherBB.
- for (;; --BBI) {
- // Check to see if we find the matching store.
- if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
- if (OtherStore->getOperand(1) != SI.getOperand(1) ||
- !SI.isSameOperationAs(OtherStore))
- return false;
- break;
- }
- // If we find something that may be using or overwriting the stored
- // value, or if we run out of instructions, we can't do the transform.
- if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
- BBI->mayWriteToMemory() || BBI == OtherBB->begin())
- return false;
- }
-
- // In order to eliminate the store in OtherBr, we have to make sure nothing
- // reads or overwrites the stored value in StoreBB.
- for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
- // FIXME: This should really be AA driven.
- if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
- return false;
- }
- }
-
- // Insert a PHI node now if we need it.
- Value *MergedVal = OtherStore->getOperand(0);
- // The debug locations of the original instructions might differ. Merge them.
- DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
- OtherStore->getDebugLoc());
- if (MergedVal != SI.getOperand(0)) {
- PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
- PN->addIncoming(SI.getOperand(0), SI.getParent());
- PN->addIncoming(OtherStore->getOperand(0), OtherBB);
- MergedVal = InsertNewInstBefore(PN, DestBB->front());
- PN->setDebugLoc(MergedLoc);
- }
-
- // Advance to a place where it is safe to insert the new store and insert it.
- BBI = DestBB->getFirstInsertionPt();
- StoreInst *NewSI =
- new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
- SI.getOrdering(), SI.getSyncScopeID());
- InsertNewInstBefore(NewSI, *BBI);
- NewSI->setDebugLoc(MergedLoc);
-
- // If the two stores had AA tags, merge them.
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- if (AATags) {
- OtherStore->getAAMetadata(AATags, /* Merge = */ true);
- NewSI->setAAMetadata(AATags);
- }
-
- // Nuke the old stores.
- eraseInstFromFunction(SI);
- eraseInstFromFunction(*OtherStore);
- return true;
-}
+ if (!SI.isUnordered())
+ return false; // This code has not been audited for volatile/ordered case.
+
+ // Check if the successor block has exactly 2 incoming edges.
+ BasicBlock *StoreBB = SI.getParent();
+ BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
+ if (!DestBB->hasNPredecessors(2))
+ return false;
+
+ // Capture the other block (the block that doesn't contain our store).
+ pred_iterator PredIter = pred_begin(DestBB);
+ if (*PredIter == StoreBB)
+ ++PredIter;
+ BasicBlock *OtherBB = *PredIter;
+
+  // Bail out if the relevant blocks aren't all distinct. This can happen,
+ // for example, if SI is in an infinite loop.
+ if (StoreBB == DestBB || OtherBB == DestBB)
+ return false;
+
+ // Verify that the other block ends in a branch and is not otherwise empty.
+ BasicBlock::iterator BBI(OtherBB->getTerminator());
+ BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
+ if (!OtherBr || BBI == OtherBB->begin())
+ return false;
+
+ // If the other block ends in an unconditional branch, check for the 'if then
+ // else' case. There is an instruction before the branch.
+ StoreInst *OtherStore = nullptr;
+ if (OtherBr->isUnconditional()) {
+ --BBI;
+ // Skip over debugging info.
+ while (isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) {
+ if (BBI==OtherBB->begin())
+ return false;
+ --BBI;
+ }
+ // If this isn't a store, isn't a store to the same location, or is not the
+ // right kind of store, bail out.
+ OtherStore = dyn_cast<StoreInst>(BBI);
+ if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ } else {
+ // Otherwise, the other block ended with a conditional branch. If one of the
+ // destinations is StoreBB, then we have the if/then case.
+ if (OtherBr->getSuccessor(0) != StoreBB &&
+ OtherBr->getSuccessor(1) != StoreBB)
+ return false;
+
+ // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
+ // if/then triangle. See if there is a store to the same ptr as SI that
+ // lives in OtherBB.
+ for (;; --BBI) {
+ // Check to see if we find the matching store.
+ if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
+ if (OtherStore->getOperand(1) != SI.getOperand(1) ||
+ !SI.isSameOperationAs(OtherStore))
+ return false;
+ break;
+ }
+ // If we find something that may be using or overwriting the stored
+ // value, or if we run out of instructions, we can't do the transform.
+ if (BBI->mayReadFromMemory() || BBI->mayThrow() ||
+ BBI->mayWriteToMemory() || BBI == OtherBB->begin())
+ return false;
+ }
+
+ // In order to eliminate the store in OtherBr, we have to make sure nothing
+ // reads or overwrites the stored value in StoreBB.
+ for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
+ // FIXME: This should really be AA driven.
+ if (I->mayReadFromMemory() || I->mayThrow() || I->mayWriteToMemory())
+ return false;
+ }
+ }
+
+ // Insert a PHI node now if we need it.
+ Value *MergedVal = OtherStore->getOperand(0);
+ // The debug locations of the original instructions might differ. Merge them.
+ DebugLoc MergedLoc = DILocation::getMergedLocation(SI.getDebugLoc(),
+ OtherStore->getDebugLoc());
+ if (MergedVal != SI.getOperand(0)) {
+ PHINode *PN = PHINode::Create(MergedVal->getType(), 2, "storemerge");
+ PN->addIncoming(SI.getOperand(0), SI.getParent());
+ PN->addIncoming(OtherStore->getOperand(0), OtherBB);
+ MergedVal = InsertNewInstBefore(PN, DestBB->front());
+ PN->setDebugLoc(MergedLoc);
+ }
+
+ // Advance to a place where it is safe to insert the new store and insert it.
+ BBI = DestBB->getFirstInsertionPt();
+ StoreInst *NewSI =
+ new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
+ SI.getOrdering(), SI.getSyncScopeID());
+ InsertNewInstBefore(NewSI, *BBI);
+ NewSI->setDebugLoc(MergedLoc);
+
+ // If the two stores had AA tags, merge them.
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ if (AATags) {
+ OtherStore->getAAMetadata(AATags, /* Merge = */ true);
+ NewSI->setAAMetadata(AATags);
+ }
+
+ // Nuke the old stores.
+ eraseInstFromFunction(SI);
+ eraseInstFromFunction(*OtherStore);
+ return true;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 7987d53b03..4b485a0ad8 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -1,367 +1,367 @@
-//===- InstCombineMulDivRem.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
-// srem, urem, frem.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineMulDivRem.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visit functions for mul, fmul, sdiv, udiv, fdiv,
+// srem, urem, frem.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// The specific integer value is used in a context where it is known to be
-/// non-zero. If this allows us to simplify the computation, do so and return
-/// the new operand, otherwise return null.
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// The specific integer value is used in a context where it is known to be
+/// non-zero. If this allows us to simplify the computation, do so and return
+/// the new operand, otherwise return null.
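+///
+/// For instance (illustrative only), a divisor of the form ((1 << A) >>u B)
+/// that is known to be non-zero implies B < A, so it can be rebuilt as
+/// (1 << (A - B)).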
static Value *simplifyValueKnownNonZero(Value *V, InstCombinerImpl &IC,
- Instruction &CxtI) {
- // If V has multiple uses, then we would have to do more analysis to determine
- // if this is safe. For example, the use could be in dynamically unreached
- // code.
- if (!V->hasOneUse()) return nullptr;
-
- bool MadeChange = false;
-
- // ((1 << A) >>u B) --> (1 << (A-B))
- // Because V cannot be zero, we know that B is less than A.
- Value *A = nullptr, *B = nullptr, *One = nullptr;
- if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
- match(One, m_One())) {
- A = IC.Builder.CreateSub(A, B);
- return IC.Builder.CreateShl(One, A);
- }
-
- // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
- // inexact. Similarly for <<.
- BinaryOperator *I = dyn_cast<BinaryOperator>(V);
- if (I && I->isLogicalShift() &&
- IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
- // We know that this is an exact/nuw shift and that the input is a
- // non-zero context as well.
- if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
- IC.replaceOperand(*I, 0, V2);
- MadeChange = true;
- }
-
- if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
- I->setIsExact();
- MadeChange = true;
- }
-
- if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
- I->setHasNoUnsignedWrap();
- MadeChange = true;
- }
- }
-
- // TODO: Lots more we could do here:
- // If V is a phi node, we can call this on each of its operands.
- // "select cond, X, 0" can simplify to "X".
-
- return MadeChange ? V : nullptr;
-}
-
-// TODO: This is a specific form of a much more general pattern.
-// We could detect a select with any binop identity constant, or we
-// could use SimplifyBinOp to see if either arm of the select reduces.
-// But that needs to be done carefully and/or while removing potential
-// reverse canonicalizations as in InstCombiner::foldSelectIntoOp().
-static Value *foldMulSelectToNegate(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond, *OtherOp;
-
- // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
- // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
- if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));
-
- // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
- // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
- if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
- m_Value(OtherOp))))
- return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);
-
- // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
- // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
- if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
- m_SpecificFP(-1.0))),
- m_Value(OtherOp)))) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
- }
-
- // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
- // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
- if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
- m_SpecificFP(1.0))),
- m_Value(OtherOp)))) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
- }
-
- return nullptr;
-}
-
+ Instruction &CxtI) {
+ // If V has multiple uses, then we would have to do more analysis to determine
+ // if this is safe. For example, the use could be in dynamically unreached
+ // code.
+ if (!V->hasOneUse()) return nullptr;
+
+ bool MadeChange = false;
+
+ // ((1 << A) >>u B) --> (1 << (A-B))
+ // Because V cannot be zero, we know that B is less than A.
+ Value *A = nullptr, *B = nullptr, *One = nullptr;
+ if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(One), m_Value(A))), m_Value(B))) &&
+ match(One, m_One())) {
+ A = IC.Builder.CreateSub(A, B);
+ return IC.Builder.CreateShl(One, A);
+ }
+
+ // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it
+ // inexact. Similarly for <<.
+ BinaryOperator *I = dyn_cast<BinaryOperator>(V);
+ if (I && I->isLogicalShift() &&
+ IC.isKnownToBeAPowerOfTwo(I->getOperand(0), false, 0, &CxtI)) {
+ // We know that this is an exact/nuw shift and that the input is a
+ // non-zero context as well.
+ if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
+ IC.replaceOperand(*I, 0, V2);
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::LShr && !I->isExact()) {
+ I->setIsExact();
+ MadeChange = true;
+ }
+
+ if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) {
+ I->setHasNoUnsignedWrap();
+ MadeChange = true;
+ }
+ }
+
+ // TODO: Lots more we could do here:
+ // If V is a phi node, we can call this on each of its operands.
+ // "select cond, X, 0" can simplify to "X".
+
+ return MadeChange ? V : nullptr;
+}
+
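A minimal standalone sketch of the arithmetic behind the ((1 << A) >>u B) --> (1 << (A-B)) fold above, assuming 32-bit unsigned values (this is a plain check of the identity, not LLVM code): whenever the shifted value is non-zero, B cannot exceed A, and the result is exactly the single bit at position A - B.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A = 0; A < 32; ++A) {
    for (uint32_t B = 0; B < 32; ++B) {
      uint32_t V = (uint32_t{1} << A) >> B;
      // The fold only fires when V is known non-zero; in that case B <= A
      // and the value is the single bit at position A - B.
      if (V != 0)
        assert(V == uint32_t{1} << (A - B));
    }
  }
  return 0;
}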
+// TODO: This is a specific form of a much more general pattern.
+// We could detect a select with any binop identity constant, or we
+// could use SimplifyBinOp to see if either arm of the select reduces.
+// But that needs to be done carefully and/or while removing potential
+// reverse canonicalizations as in InstCombiner::foldSelectIntoOp().
+static Value *foldMulSelectToNegate(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond, *OtherOp;
+
+ // mul (select Cond, 1, -1), OtherOp --> select Cond, OtherOp, -OtherOp
+ // mul OtherOp, (select Cond, 1, -1) --> select Cond, OtherOp, -OtherOp
+ if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_One(), m_AllOnes())),
+ m_Value(OtherOp))))
+ return Builder.CreateSelect(Cond, OtherOp, Builder.CreateNeg(OtherOp));
+
+ // mul (select Cond, -1, 1), OtherOp --> select Cond, -OtherOp, OtherOp
+ // mul OtherOp, (select Cond, -1, 1) --> select Cond, -OtherOp, OtherOp
+ if (match(&I, m_c_Mul(m_OneUse(m_Select(m_Value(Cond), m_AllOnes(), m_One())),
+ m_Value(OtherOp))))
+ return Builder.CreateSelect(Cond, Builder.CreateNeg(OtherOp), OtherOp);
+
+ // fmul (select Cond, 1.0, -1.0), OtherOp --> select Cond, OtherOp, -OtherOp
+ // fmul OtherOp, (select Cond, 1.0, -1.0) --> select Cond, OtherOp, -OtherOp
+ if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(1.0),
+ m_SpecificFP(-1.0))),
+ m_Value(OtherOp)))) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ return Builder.CreateSelect(Cond, OtherOp, Builder.CreateFNeg(OtherOp));
+ }
+
+ // fmul (select Cond, -1.0, 1.0), OtherOp --> select Cond, -OtherOp, OtherOp
+ // fmul OtherOp, (select Cond, -1.0, 1.0) --> select Cond, -OtherOp, OtherOp
+ if (match(&I, m_c_FMul(m_OneUse(m_Select(m_Value(Cond), m_SpecificFP(-1.0),
+ m_SpecificFP(1.0))),
+ m_Value(OtherOp)))) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ return Builder.CreateSelect(Cond, Builder.CreateFNeg(OtherOp), OtherOp);
+ }
+
+ return nullptr;
+}
+
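foldMulSelectToNegate rests on a simple identity: multiplying by a select of 1/-1 is the same as conditionally negating the other operand. A minimal standalone check of the integer form, on values small enough that negation cannot overflow:

#include <cassert>

int main() {
  for (int X = -100; X <= 100; ++X) {
    for (int Cond = 0; Cond <= 1; ++Cond) {
      // mul (select Cond, 1, -1), X --> select Cond, X, -X
      assert((Cond ? 1 : -1) * X == (Cond ? X : -X));
      // mul (select Cond, -1, 1), X --> select Cond, -X, X
      assert((Cond ? -1 : 1) * X == (Cond ? -X : X));
    }
  }
  return 0;
}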
Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
- if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
-
+ if (Value *V = SimplifyMulInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
unsigned BitWidth = I.getType()->getScalarSizeInBits();
- // X * -1 == 0 - X
- if (match(Op1, m_AllOnes())) {
- BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
- if (I.hasNoSignedWrap())
- BO->setHasNoSignedWrap();
- return BO;
- }
-
- // Also allow combining multiply instructions on vectors.
- {
- Value *NewOp;
- Constant *C1, *C2;
- const APInt *IVal;
- if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
- m_Constant(C1))) &&
- match(C1, m_APInt(IVal))) {
- // ((X << C2)*C1) == (X * (C1 << C2))
- Constant *Shl = ConstantExpr::getShl(C1, C2);
- BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
- BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
- if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
- Shl->isNotMinSignedValue())
- BO->setHasNoSignedWrap();
- return BO;
- }
-
- if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
- // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
+ // X * -1 == 0 - X
+ if (match(Op1, m_AllOnes())) {
+ BinaryOperator *BO = BinaryOperator::CreateNeg(Op0, I.getName());
+ if (I.hasNoSignedWrap())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ // Also allow combining multiply instructions on vectors.
+ {
+ Value *NewOp;
+ Constant *C1, *C2;
+ const APInt *IVal;
+ if (match(&I, m_Mul(m_Shl(m_Value(NewOp), m_Constant(C2)),
+ m_Constant(C1))) &&
+ match(C1, m_APInt(IVal))) {
+ // ((X << C2)*C1) == (X * (C1 << C2))
+ Constant *Shl = ConstantExpr::getShl(C1, C2);
+ BinaryOperator *Mul = cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *BO = BinaryOperator::CreateMul(NewOp, Shl);
+ if (I.hasNoUnsignedWrap() && Mul->hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && Mul->hasNoSignedWrap() &&
+ Shl->isNotMinSignedValue())
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+
+ if (match(&I, m_Mul(m_Value(NewOp), m_Constant(C1)))) {
+ // Replace X*(2^C) with X << C, where C is either a scalar or a vector.
if (Constant *NewCst = ConstantExpr::getExactLogBase2(C1)) {
- BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
-
- if (I.hasNoUnsignedWrap())
- Shl->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap()) {
- const APInt *V;
- if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1)
- Shl->setHasNoSignedWrap();
- }
-
- return Shl;
- }
- }
- }
-
+ BinaryOperator *Shl = BinaryOperator::CreateShl(NewOp, NewCst);
+
+ if (I.hasNoUnsignedWrap())
+ Shl->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap()) {
+ const APInt *V;
+ if (match(NewCst, m_APInt(V)) && *V != V->getBitWidth() - 1)
+ Shl->setHasNoSignedWrap();
+ }
+
+ return Shl;
+ }
+ }
+ }
+
if (Op0->hasOneUse() && match(Op1, m_NegatedPower2())) {
// Interpret X * (-1<<C) as (-X) * (1<<C) and try to sink the negation.
// The "* (1<<C)" thus becomes a potential shifting opportunity.
if (Value *NegOp0 = Negator::Negate(/*IsNegation*/ true, Op0, *this))
return BinaryOperator::CreateMul(
NegOp0, ConstantExpr::getNeg(cast<Constant>(Op1)), I.getName());
- }
-
- if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
- return FoldedMul;
-
- if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
- return replaceInstUsesWith(I, FoldedMul);
-
- // Simplify mul instructions with a constant RHS.
- if (isa<Constant>(Op1)) {
- // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
- Value *X;
- Constant *C1;
- if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
- Value *Mul = Builder.CreateMul(C1, Op1);
- // Only go forward with the transform if C1*CI simplifies to a tidier
- // constant.
- if (!match(Mul, m_Mul(m_Value(), m_Value())))
- return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
- }
- }
-
- // abs(X) * abs(X) -> X * X
- // nabs(X) * nabs(X) -> X * X
- if (Op0 == Op1) {
- Value *X, *Y;
- SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS)
- return BinaryOperator::CreateMul(X, X);
+ }
+
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
+ if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
+ return replaceInstUsesWith(I, FoldedMul);
+
+ // Simplify mul instructions with a constant RHS.
+ if (isa<Constant>(Op1)) {
+ // Canonicalize (X+C1)*CI -> X*CI+C1*CI.
+ Value *X;
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_Add(m_Value(X), m_Constant(C1))))) {
+ Value *Mul = Builder.CreateMul(C1, Op1);
+ // Only go forward with the transform if C1*CI simplifies to a tidier
+ // constant.
+ if (!match(Mul, m_Mul(m_Value(), m_Value())))
+ return BinaryOperator::CreateAdd(Builder.CreateMul(X, Op1), Mul);
+ }
+ }
+
+ // abs(X) * abs(X) -> X * X
+ // nabs(X) * nabs(X) -> X * X
+ if (Op0 == Op1) {
+ Value *X, *Y;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return BinaryOperator::CreateMul(X, X);
if (match(Op0, m_Intrinsic<Intrinsic::abs>(m_Value(X))))
return BinaryOperator::CreateMul(X, X);
- }
-
- // -X * C --> X * -C
- Value *X, *Y;
- Constant *Op1C;
- if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
- return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
-
- // -X * -Y --> X * Y
- if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
- auto *NewMul = BinaryOperator::CreateMul(X, Y);
- if (I.hasNoSignedWrap() &&
- cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
- cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
- NewMul->setHasNoSignedWrap();
- return NewMul;
- }
-
- // -X * Y --> -(X * Y)
- // X * -Y --> -(X * Y)
- if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
- return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));
-
- // (X / Y) * Y = X - (X % Y)
- // (X / Y) * -Y = (X % Y) - X
- {
- Value *Y = Op1;
- BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
- if (!Div || (Div->getOpcode() != Instruction::UDiv &&
- Div->getOpcode() != Instruction::SDiv)) {
- Y = Op0;
- Div = dyn_cast<BinaryOperator>(Op1);
- }
- Value *Neg = dyn_castNegVal(Y);
- if (Div && Div->hasOneUse() &&
- (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
- (Div->getOpcode() == Instruction::UDiv ||
- Div->getOpcode() == Instruction::SDiv)) {
- Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
-
- // If the division is exact, X % Y is zero, so we end up with X or -X.
- if (Div->isExact()) {
- if (DivOp1 == Y)
- return replaceInstUsesWith(I, X);
- return BinaryOperator::CreateNeg(X);
- }
-
- auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
- : Instruction::SRem;
- Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
- if (DivOp1 == Y)
- return BinaryOperator::CreateSub(X, Rem);
- return BinaryOperator::CreateSub(Rem, X);
- }
- }
-
- /// i1 mul -> i1 and.
- if (I.getType()->isIntOrIntVectorTy(1))
- return BinaryOperator::CreateAnd(Op0, Op1);
-
- // X*(1 << Y) --> X << Y
- // (1 << Y)*X --> X << Y
- {
- Value *Y;
- BinaryOperator *BO = nullptr;
- bool ShlNSW = false;
- if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
- BO = BinaryOperator::CreateShl(Op1, Y);
- ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
- } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
- BO = BinaryOperator::CreateShl(Op0, Y);
- ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
- }
- if (BO) {
- if (I.hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap();
- if (I.hasNoSignedWrap() && ShlNSW)
- BO->setHasNoSignedWrap();
- return BO;
- }
- }
-
- // (zext bool X) * (zext bool Y) --> zext (and X, Y)
- // (sext bool X) * (sext bool Y) --> zext (and X, Y)
- // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
- if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
- (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *And = Builder.CreateAnd(X, Y, "mulbool");
- return CastInst::Create(Instruction::ZExt, And, I.getType());
- }
- // (sext bool X) * (zext bool Y) --> sext (and X, Y)
- // (zext bool X) * (sext bool Y) --> sext (and X, Y)
- // Note: -1 * 1 == 1 * -1 == -1
- if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
- (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *And = Builder.CreateAnd(X, Y, "mulbool");
- return CastInst::Create(Instruction::SExt, And, I.getType());
- }
-
- // (bool X) * Y --> X ? Y : 0
- // Y * (bool X) --> X ? Y : 0
- if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
- if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
- return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
-
- // (lshr X, 31) * Y --> (ashr X, 31) & Y
- // Y * (lshr X, 31) --> (ashr X, 31) & Y
- // TODO: We are not checking one-use because the elimination of the multiply
- // is better for analysis?
- // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
- // more similar to what we're doing above.
- const APInt *C;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
- return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
- if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
- return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
-
+ }
+
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *Op1C;
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Constant(Op1C)))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getNeg(Op1C));
+
+ // -X * -Y --> X * Y
+ if (match(Op0, m_Neg(m_Value(X))) && match(Op1, m_Neg(m_Value(Y)))) {
+ auto *NewMul = BinaryOperator::CreateMul(X, Y);
+ if (I.hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op0)->hasNoSignedWrap() &&
+ cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap())
+ NewMul->setHasNoSignedWrap();
+ return NewMul;
+ }
+
+ // -X * Y --> -(X * Y)
+ // X * -Y --> -(X * Y)
+ if (match(&I, m_c_Mul(m_OneUse(m_Neg(m_Value(X))), m_Value(Y))))
+ return BinaryOperator::CreateNeg(Builder.CreateMul(X, Y));
+
+ // (X / Y) * Y = X - (X % Y)
+ // (X / Y) * -Y = (X % Y) - X
+ {
+ Value *Y = Op1;
+ BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+ if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+ Div->getOpcode() != Instruction::SDiv)) {
+ Y = Op0;
+ Div = dyn_cast<BinaryOperator>(Op1);
+ }
+ Value *Neg = dyn_castNegVal(Y);
+ if (Div && Div->hasOneUse() &&
+ (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+ (Div->getOpcode() == Instruction::UDiv ||
+ Div->getOpcode() == Instruction::SDiv)) {
+ Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
+
+ // If the division is exact, X % Y is zero, so we end up with X or -X.
+ if (Div->isExact()) {
+ if (DivOp1 == Y)
+ return replaceInstUsesWith(I, X);
+ return BinaryOperator::CreateNeg(X);
+ }
+
+ auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+ : Instruction::SRem;
+ Value *Rem = Builder.CreateBinOp(RemOpc, X, DivOp1);
+ if (DivOp1 == Y)
+ return BinaryOperator::CreateSub(X, Rem);
+ return BinaryOperator::CreateSub(Rem, X);
+ }
+ }
+
+ /// i1 mul -> i1 and.
+ if (I.getType()->isIntOrIntVectorTy(1))
+ return BinaryOperator::CreateAnd(Op0, Op1);
+
+ // X*(1 << Y) --> X << Y
+ // (1 << Y)*X --> X << Y
+ {
+ Value *Y;
+ BinaryOperator *BO = nullptr;
+ bool ShlNSW = false;
+ if (match(Op0, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op1, Y);
+ ShlNSW = cast<ShlOperator>(Op0)->hasNoSignedWrap();
+ } else if (match(Op1, m_Shl(m_One(), m_Value(Y)))) {
+ BO = BinaryOperator::CreateShl(Op0, Y);
+ ShlNSW = cast<ShlOperator>(Op1)->hasNoSignedWrap();
+ }
+ if (BO) {
+ if (I.hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap();
+ if (I.hasNoSignedWrap() && ShlNSW)
+ BO->setHasNoSignedWrap();
+ return BO;
+ }
+ }
+
+ // (zext bool X) * (zext bool Y) --> zext (and X, Y)
+ // (sext bool X) * (sext bool Y) --> zext (and X, Y)
+ // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
+ if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::ZExt, And, I.getType());
+ }
+ // (sext bool X) * (zext bool Y) --> sext (and X, Y)
+ // (zext bool X) * (sext bool Y) --> sext (and X, Y)
+ // Note: -1 * 1 == 1 * -1 == -1
+ if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::SExt, And, I.getType());
+ }
+
+ // (bool X) * Y --> X ? Y : 0
+ // Y * (bool X) --> X ? Y : 0
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op1, ConstantInt::get(I.getType(), 0));
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
+ return SelectInst::Create(X, Op0, ConstantInt::get(I.getType(), 0));
+
+ // (lshr X, 31) * Y --> (ashr X, 31) & Y
+ // Y * (lshr X, 31) --> (ashr X, 31) & Y
+ // TODO: We are not checking one-use because the elimination of the multiply
+ // is better for analysis?
+ // TODO: Should we canonicalize to '(X < 0) ? Y : 0' instead? That would be
+ // more similar to what we're doing above.
+ const APInt *C;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op1);
+ if (match(Op1, m_LShr(m_Value(X), m_APInt(C))) && *C == C->getBitWidth() - 1)
+ return BinaryOperator::CreateAnd(Builder.CreateAShr(X, *C), Op0);
+
// ((ashr X, 31) | 1) * X --> abs(X)
// X * ((ashr X, 31) | 1) --> abs(X)
if (match(&I, m_c_BinOp(m_Or(m_AShr(m_Value(X),
@@ -375,152 +375,152 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) {
return replaceInstUsesWith(I, Abs);
}
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
-
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- return Changed ? &I : nullptr;
-}
-
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+}
+
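Two of the integer identities visitMul uses, checked directly in a standalone sketch (unsigned values, so there are no overflow concerns): the div/mul cancellation (X / Y) * Y == X - (X % Y), and the fact that an i1 multiply behaves like a logical AND.

#include <cassert>
#include <cstdint>

int main() {
  // (X / Y) * Y == X - (X % Y) for unsigned division.
  for (uint32_t X = 0; X < 64; ++X)
    for (uint32_t Y = 1; Y < 64; ++Y)
      assert((X / Y) * Y == X - (X % Y));

  // On 1-bit values, multiplication and AND coincide.
  for (unsigned A = 0; A <= 1; ++A)
    for (unsigned B = 0; B <= 1; ++B)
      assert((A * B) == (A & B));
  return 0;
}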
Instruction *InstCombinerImpl::foldFPSignBitOps(BinaryOperator &I) {
- BinaryOperator::BinaryOps Opcode = I.getOpcode();
- assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
- "Expected fmul or fdiv");
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X, *Y;
-
- // -X * -Y --> X * Y
- // -X / -Y --> X / Y
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
- return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);
-
- // fabs(X) * fabs(X) -> X * X
- // fabs(X) / fabs(X) -> X / X
+ BinaryOperator::BinaryOps Opcode = I.getOpcode();
+ assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
+ "Expected fmul or fdiv");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X, *Y;
+
+ // -X * -Y --> X * Y
+ // -X / -Y --> X / Y
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);
+
+ // fabs(X) * fabs(X) -> X * X
+ // fabs(X) / fabs(X) -> X / X
if (Op0 == Op1 && match(Op0, m_FAbs(m_Value(X))))
- return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);
-
- // fabs(X) * fabs(Y) --> fabs(X * Y)
- // fabs(X) / fabs(Y) --> fabs(X / Y)
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);
+
+ // fabs(X) * fabs(Y) --> fabs(X * Y)
+ // fabs(X) / fabs(Y) --> fabs(X / Y)
if (match(Op0, m_FAbs(m_Value(X))) && match(Op1, m_FAbs(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- Builder.setFastMathFlags(I.getFastMathFlags());
- Value *XY = Builder.CreateBinOp(Opcode, X, Y);
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
- Fabs->takeName(&I);
- return replaceInstUsesWith(I, Fabs);
- }
-
- return nullptr;
-}
-
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ Value *XY = Builder.CreateBinOp(Opcode, X, Y);
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
+ Fabs->takeName(&I);
+ return replaceInstUsesWith(I, Fabs);
+ }
+
+ return nullptr;
+}
+
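The folds in foldFPSignBitOps only touch sign bits, so they are exact in IEEE-754 even without fast-math. A minimal standalone check on a handful of finite doubles:

#include <cassert>
#include <cmath>

int main() {
  const double Vals[] = {-3.5, -1.0, -0.0, 0.0, 0.25, 2.0, 1e10};
  for (double X : Vals) {
    for (double Y : Vals) {
      // -X * -Y --> X * Y
      assert(-X * -Y == X * Y);
      // fabs(X) * fabs(Y) --> fabs(X * Y)
      assert(std::fabs(X) * std::fabs(Y) == std::fabs(X * Y));
    }
  }
  return 0;
}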
Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
- if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (SimplifyAssociativeOrCommutative(I))
- return &I;
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
- return FoldedMul;
-
- if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
- return replaceInstUsesWith(I, FoldedMul);
-
- if (Instruction *R = foldFPSignBitOps(I))
- return R;
-
- // X * -1.0 --> -X
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (match(Op1, m_SpecificFP(-1.0)))
- return UnaryOperator::CreateFNegFMF(Op0, &I);
-
- // -X * C --> X * -C
- Value *X, *Y;
- Constant *C;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
- return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
-
- // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
- if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
- return replaceInstUsesWith(I, V);
-
- if (I.hasAllowReassoc()) {
- // Reassociate constant RHS with another constant to form constant
- // expression.
- if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
- Constant *C1;
- if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
- // (C1 / X) * C --> (C * C1) / X
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- if (CC1->isNormalFP())
- return BinaryOperator::CreateFDivFMF(CC1, X, &I);
- }
- if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
- // (X / C1) * C --> X * (C / C1)
- Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
- if (CDivC1->isNormalFP())
- return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
-
- // If the constant was a denormal, try reassociating differently.
- // (X / C1) * C --> X / (C1 / C)
- Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
- if (Op0->hasOneUse() && C1DivC->isNormalFP())
- return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
- }
-
- // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
- // canonicalized to 'fadd X, C'. Distributing the multiply may allow
- // further folds and (X * C) + C2 is 'fma'.
- if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
- // (X + C1) * C --> (X * C) + (C * C1)
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- Value *XC = Builder.CreateFMulFMF(X, C, &I);
- return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
- }
- if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
- // (C1 - X) * C --> (C * C1) - (X * C)
- Constant *CC1 = ConstantExpr::getFMul(C, C1);
- Value *XC = Builder.CreateFMulFMF(X, C, &I);
- return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
- }
- }
-
- Value *Z;
- if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))),
- m_Value(Z)))) {
- // Sink division: (X / Y) * Z --> (X * Z) / Y
- Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I);
- return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I);
- }
-
- // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
- // nnan disallows the possibility of returning a number if both operands are
- // negative (in that case, we should return NaN).
- if (I.hasNoNaNs() &&
- match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
- match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
- Value *XY = Builder.CreateFMulFMF(X, Y, &I);
- Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I);
- return replaceInstUsesWith(I, Sqrt);
- }
-
+ if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (SimplifyAssociativeOrCommutative(I))
+ return &I;
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
+ return FoldedMul;
+
+ if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
+ return replaceInstUsesWith(I, FoldedMul);
+
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
+ // X * -1.0 --> -X
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (match(Op1, m_SpecificFP(-1.0)))
+ return UnaryOperator::CreateFNegFMF(Op0, &I);
+
+ // -X * C --> X * -C
+ Value *X, *Y;
+ Constant *C;
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
+ return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
+
+ // (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
+ if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
+ return replaceInstUsesWith(I, V);
+
+ if (I.hasAllowReassoc()) {
+ // Reassociate constant RHS with another constant to form constant
+ // expression.
+ if (match(Op1, m_Constant(C)) && C->isFiniteNonZeroFP()) {
+ Constant *C1;
+ if (match(Op0, m_OneUse(m_FDiv(m_Constant(C1), m_Value(X))))) {
+ // (C1 / X) * C --> (C * C1) / X
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ if (CC1->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(CC1, X, &I);
+ }
+ if (match(Op0, m_FDiv(m_Value(X), m_Constant(C1)))) {
+ // (X / C1) * C --> X * (C / C1)
+ Constant *CDivC1 = ConstantExpr::getFDiv(C, C1);
+ if (CDivC1->isNormalFP())
+ return BinaryOperator::CreateFMulFMF(X, CDivC1, &I);
+
+ // If the constant was a denormal, try reassociating differently.
+ // (X / C1) * C --> X / (C1 / C)
+ Constant *C1DivC = ConstantExpr::getFDiv(C1, C);
+ if (Op0->hasOneUse() && C1DivC->isNormalFP())
+ return BinaryOperator::CreateFDivFMF(X, C1DivC, &I);
+ }
+
+ // We do not need to match 'fadd C, X' and 'fsub X, C' because they are
+ // canonicalized to 'fadd X, C'. Distributing the multiply may allow
+ // further folds and (X * C) + C2 is 'fma'.
+ if (match(Op0, m_OneUse(m_FAdd(m_Value(X), m_Constant(C1))))) {
+ // (X + C1) * C --> (X * C) + (C * C1)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFAddFMF(XC, CC1, &I);
+ }
+ if (match(Op0, m_OneUse(m_FSub(m_Constant(C1), m_Value(X))))) {
+ // (C1 - X) * C --> (C * C1) - (X * C)
+ Constant *CC1 = ConstantExpr::getFMul(C, C1);
+ Value *XC = Builder.CreateFMulFMF(X, C, &I);
+ return BinaryOperator::CreateFSubFMF(CC1, XC, &I);
+ }
+ }
+
+ Value *Z;
+ if (match(&I, m_c_FMul(m_OneUse(m_FDiv(m_Value(X), m_Value(Y))),
+ m_Value(Z)))) {
+ // Sink division: (X / Y) * Z --> (X * Z) / Y
+ Value *NewFMul = Builder.CreateFMulFMF(X, Z, &I);
+ return BinaryOperator::CreateFDivFMF(NewFMul, Y, &I);
+ }
+
+ // sqrt(X) * sqrt(Y) -> sqrt(X * Y)
+ // nnan disallows the possibility of returning a number if both operands are
+ // negative (in that case, we should return NaN).
+ if (I.hasNoNaNs() &&
+ match(Op0, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(X)))) &&
+ match(Op1, m_OneUse(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XY = Builder.CreateFMulFMF(X, Y, &I);
+ Value *Sqrt = Builder.CreateUnaryIntrinsic(Intrinsic::sqrt, XY, &I);
+ return replaceInstUsesWith(I, Sqrt);
+ }
+
// The following transforms are done irrespective of the number of uses
// for the expression "1.0/sqrt(X)".
// 1) 1.0/sqrt(X) * X -> X/sqrt(X)
@@ -536,588 +536,588 @@ Instruction *InstCombinerImpl::visitFMul(BinaryOperator &I) {
match(Y, m_Intrinsic<Intrinsic::sqrt>(m_Value(X))) && Op0 == X)
return BinaryOperator::CreateFDivFMF(X, Y, &I);
- // Like the similar transform in instsimplify, this requires 'nsz' because
- // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0.
- if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 &&
- Op0->hasNUses(2)) {
- // Peek through fdiv to find squaring of square root:
- // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
- if (match(Op0, m_FDiv(m_Value(X),
- m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
- Value *XX = Builder.CreateFMulFMF(X, X, &I);
- return BinaryOperator::CreateFDivFMF(XX, Y, &I);
- }
- // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X)
- if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)),
- m_Value(X)))) {
- Value *XX = Builder.CreateFMulFMF(X, X, &I);
- return BinaryOperator::CreateFDivFMF(Y, XX, &I);
- }
- }
-
- // exp(X) * exp(Y) -> exp(X + Y)
- // Match as long as at least one of exp has only one use.
- if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *XY = Builder.CreateFAddFMF(X, Y, &I);
- Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I);
- return replaceInstUsesWith(I, Exp);
- }
-
- // exp2(X) * exp2(Y) -> exp2(X + Y)
- // Match as long as at least one of exp2 has only one use.
- if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) &&
- (Op0->hasOneUse() || Op1->hasOneUse())) {
- Value *XY = Builder.CreateFAddFMF(X, Y, &I);
- Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I);
- return replaceInstUsesWith(I, Exp2);
- }
-
- // (X*Y) * X => (X*X) * Y where Y != X
- // The purpose is two-fold:
- // 1) to form a power expression (of X).
- // 2) potentially shorten the critical path: After transformation, the
- // latency of the instruction Y is amortized by the expression of X*X,
- // and therefore Y is in a "less critical" position compared to what it
- // was before the transformation.
- if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
- Op1 != Y) {
- Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
- return BinaryOperator::CreateFMulFMF(XX, Y, &I);
- }
- if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
- Op0 != Y) {
- Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
- return BinaryOperator::CreateFMulFMF(XX, Y, &I);
- }
- }
-
- // log2(X * 0.5) * Y = log2(X) * Y - Y
- if (I.isFast()) {
- IntrinsicInst *Log2 = nullptr;
- if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
- m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
- Log2 = cast<IntrinsicInst>(Op0);
- Y = Op1;
- }
- if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
- m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
- Log2 = cast<IntrinsicInst>(Op1);
- Y = Op0;
- }
- if (Log2) {
- Value *Log2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
- Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
- return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
- }
- }
-
- return nullptr;
-}
-
-/// Fold a divide or remainder with a select instruction divisor when one of the
-/// select operands is zero. In that case, we can use the other select operand
-/// because div/rem by zero is undefined.
+ // Like the similar transform in instsimplify, this requires 'nsz' because
+ // sqrt(-0.0) = -0.0, and -0.0 * -0.0 does not simplify to -0.0.
+ if (I.hasNoNaNs() && I.hasNoSignedZeros() && Op0 == Op1 &&
+ Op0->hasNUses(2)) {
+ // Peek through fdiv to find squaring of square root:
+ // (X / sqrt(Y)) * (X / sqrt(Y)) --> (X * X) / Y
+ if (match(Op0, m_FDiv(m_Value(X),
+ m_Intrinsic<Intrinsic::sqrt>(m_Value(Y))))) {
+ Value *XX = Builder.CreateFMulFMF(X, X, &I);
+ return BinaryOperator::CreateFDivFMF(XX, Y, &I);
+ }
+ // (sqrt(Y) / X) * (sqrt(Y) / X) --> Y / (X * X)
+ if (match(Op0, m_FDiv(m_Intrinsic<Intrinsic::sqrt>(m_Value(Y)),
+ m_Value(X)))) {
+ Value *XX = Builder.CreateFMulFMF(X, X, &I);
+ return BinaryOperator::CreateFDivFMF(Y, XX, &I);
+ }
+ }
+
+ // exp(X) * exp(Y) -> exp(X + Y)
+ // Match as long as at least one of exp has only one use.
+ if (match(Op0, m_Intrinsic<Intrinsic::exp>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::exp>(m_Value(Y))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *XY = Builder.CreateFAddFMF(X, Y, &I);
+ Value *Exp = Builder.CreateUnaryIntrinsic(Intrinsic::exp, XY, &I);
+ return replaceInstUsesWith(I, Exp);
+ }
+
+ // exp2(X) * exp2(Y) -> exp2(X + Y)
+ // Match as long as at least one of exp2 has only one use.
+ if (match(Op0, m_Intrinsic<Intrinsic::exp2>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::exp2>(m_Value(Y))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *XY = Builder.CreateFAddFMF(X, Y, &I);
+ Value *Exp2 = Builder.CreateUnaryIntrinsic(Intrinsic::exp2, XY, &I);
+ return replaceInstUsesWith(I, Exp2);
+ }
+
+ // (X*Y) * X => (X*X) * Y where Y != X
+ // The purpose is two-fold:
+ // 1) to form a power expression (of X).
+ // 2) potentially shorten the critical path: After transformation, the
+ // latency of the instruction Y is amortized by the expression of X*X,
+ // and therefore Y is in a "less critical" position compared to what it
+ // was before the transformation.
+ if (match(Op0, m_OneUse(m_c_FMul(m_Specific(Op1), m_Value(Y)))) &&
+ Op1 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op1, Op1, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ if (match(Op1, m_OneUse(m_c_FMul(m_Specific(Op0), m_Value(Y)))) &&
+ Op0 != Y) {
+ Value *XX = Builder.CreateFMulFMF(Op0, Op0, &I);
+ return BinaryOperator::CreateFMulFMF(XX, Y, &I);
+ }
+ }
+
+ // log2(X * 0.5) * Y = log2(X) * Y - Y
+ if (I.isFast()) {
+ IntrinsicInst *Log2 = nullptr;
+ if (match(Op0, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op0);
+ Y = Op1;
+ }
+ if (match(Op1, m_OneUse(m_Intrinsic<Intrinsic::log2>(
+ m_OneUse(m_FMul(m_Value(X), m_SpecificFP(0.5))))))) {
+ Log2 = cast<IntrinsicInst>(Op1);
+ Y = Op0;
+ }
+ if (Log2) {
+ Value *Log2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
+ Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
+ return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
+ }
+ }
+
+ return nullptr;
+}
+
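The exp/exp2 and sqrt folds in visitFMul are gated on fast-math flags precisely because they can change rounding, so a standalone check has to compare approximately rather than bit-for-bit. A sketch with an ad-hoc relative tolerance (the helper name and tolerance are ours, chosen only for illustration):

#include <cassert>
#include <cmath>

static bool roughlyEqual(double A, double B) {
  return std::fabs(A - B) <= 1e-12 * std::fmax(std::fabs(A), std::fabs(B));
}

int main() {
  const double Vals[] = {0.5, 1.0, 2.25, 3.0, 10.0};
  for (double X : Vals) {
    for (double Y : Vals) {
      // exp(X) * exp(Y) ~= exp(X + Y)
      assert(roughlyEqual(std::exp(X) * std::exp(Y), std::exp(X + Y)));
      // sqrt(X) * sqrt(Y) ~= sqrt(X * Y) for non-negative operands
      assert(roughlyEqual(std::sqrt(X) * std::sqrt(Y), std::sqrt(X * Y)));
    }
  }
  return 0;
}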
+/// Fold a divide or remainder with a select instruction divisor when one of the
+/// select operands is zero. In that case, we can use the other select operand
+/// because div/rem by zero is undefined.
bool InstCombinerImpl::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
- SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
- if (!SI)
- return false;
-
- int NonNullOperand;
- if (match(SI->getTrueValue(), m_Zero()))
- // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
- NonNullOperand = 2;
- else if (match(SI->getFalseValue(), m_Zero()))
- // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
- NonNullOperand = 1;
- else
- return false;
-
- // Change the div/rem to use 'Y' instead of the select.
- replaceOperand(I, 1, SI->getOperand(NonNullOperand));
-
- // Okay, we know we replace the operand of the div/rem with 'Y' with no
- // problem. However, the select, or the condition of the select may have
- // multiple uses. Based on our knowledge that the operand must be non-zero,
- // propagate the known value for the select into other uses of it, and
- // propagate a known value of the condition into its other users.
-
-  // If the select and condition only have a single use, don't bother with
-  // this; exit early.
- Value *SelectCond = SI->getCondition();
- if (SI->use_empty() && SelectCond->hasOneUse())
- return true;
-
- // Scan the current block backward, looking for other uses of SI.
- BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
- Type *CondTy = SelectCond->getType();
- while (BBI != BBFront) {
- --BBI;
-    // If we find an instruction that we can't assume will return, then
-    // information from below it cannot be propagated above it.
- if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
- break;
-
- // Replace uses of the select or its condition with the known values.
- for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
- I != E; ++I) {
- if (*I == SI) {
- replaceUse(*I, SI->getOperand(NonNullOperand));
- Worklist.push(&*BBI);
- } else if (*I == SelectCond) {
- replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
- : ConstantInt::getFalse(CondTy));
- Worklist.push(&*BBI);
- }
- }
-
-    // If we are past the instruction, quit looking for it.
- if (&*BBI == SI)
- SI = nullptr;
- if (&*BBI == SelectCond)
- SelectCond = nullptr;
-
- // If we ran out of things to eliminate, break out of the loop.
- if (!SelectCond && !SI)
- break;
-
- }
- return true;
-}
-
-/// True if the multiply cannot be expressed in an integer of this size.
-static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
- bool IsSigned) {
- bool Overflow;
- Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
- return Overflow;
-}
-
-/// True if C1 is a multiple of C2. Quotient contains C1/C2.
-static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
- bool IsSigned) {
- assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
-
- // Bail if we will divide by zero.
- if (C2.isNullValue())
- return false;
-
- // Bail if we would divide INT_MIN by -1.
- if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
- return false;
-
- APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
- if (IsSigned)
- APInt::sdivrem(C1, C2, Quotient, Remainder);
- else
- APInt::udivrem(C1, C2, Quotient, Remainder);
-
- return Remainder.isMinValue();
-}
-
-/// This function implements the transforms common to both integer division
-/// instructions (udiv and sdiv). It is called by the visitors to those
-/// integer division instructions.
+ SelectInst *SI = dyn_cast<SelectInst>(I.getOperand(1));
+ if (!SI)
+ return false;
+
+ int NonNullOperand;
+ if (match(SI->getTrueValue(), m_Zero()))
+ // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
+ NonNullOperand = 2;
+ else if (match(SI->getFalseValue(), m_Zero()))
+ // div/rem X, (Cond ? Y : 0) -> div/rem X, Y
+ NonNullOperand = 1;
+ else
+ return false;
+
+ // Change the div/rem to use 'Y' instead of the select.
+ replaceOperand(I, 1, SI->getOperand(NonNullOperand));
+
+ // Okay, we know we replace the operand of the div/rem with 'Y' with no
+ // problem. However, the select, or the condition of the select may have
+ // multiple uses. Based on our knowledge that the operand must be non-zero,
+ // propagate the known value for the select into other uses of it, and
+ // propagate a known value of the condition into its other users.
+
+  // If the select and condition only have a single use, don't bother with
+  // this; exit early.
+ Value *SelectCond = SI->getCondition();
+ if (SI->use_empty() && SelectCond->hasOneUse())
+ return true;
+
+ // Scan the current block backward, looking for other uses of SI.
+ BasicBlock::iterator BBI = I.getIterator(), BBFront = I.getParent()->begin();
+ Type *CondTy = SelectCond->getType();
+ while (BBI != BBFront) {
+ --BBI;
+    // If we find an instruction that we can't assume will return, then
+    // information from below it cannot be propagated above it.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*BBI))
+ break;
+
+ // Replace uses of the select or its condition with the known values.
+ for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
+ I != E; ++I) {
+ if (*I == SI) {
+ replaceUse(*I, SI->getOperand(NonNullOperand));
+ Worklist.push(&*BBI);
+ } else if (*I == SelectCond) {
+ replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
+ : ConstantInt::getFalse(CondTy));
+ Worklist.push(&*BBI);
+ }
+ }
+
+    // If we are past the instruction, quit looking for it.
+ if (&*BBI == SI)
+ SI = nullptr;
+ if (&*BBI == SelectCond)
+ SelectCond = nullptr;
+
+ // If we ran out of things to eliminate, break out of the loop.
+ if (!SelectCond && !SI)
+ break;
+
+ }
+ return true;
+}
+
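The reasoning above is that a divisor of the form (Cond ? 0 : Y) can only feed a well-defined division when Cond is false, so the div/rem may assume the divisor is Y. A minimal standalone illustration (the values are arbitrary):

#include <cassert>

int main() {
  int X = 42, Y = 5;
  // Cond must be false here, or the divisions below would divide by zero.
  bool Cond = false;
  // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y
  assert(X / (Cond ? 0 : Y) == X / Y);
  assert(X % (Cond ? 0 : Y) == X % Y);
  return 0;
}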
+/// True if the multiply cannot be expressed in an integer of this size.
+static bool multiplyOverflows(const APInt &C1, const APInt &C2, APInt &Product,
+ bool IsSigned) {
+ bool Overflow;
+ Product = IsSigned ? C1.smul_ov(C2, Overflow) : C1.umul_ov(C2, Overflow);
+ return Overflow;
+}
+
+/// True if C1 is a multiple of C2. Quotient contains C1/C2.
+static bool isMultiple(const APInt &C1, const APInt &C2, APInt &Quotient,
+ bool IsSigned) {
+ assert(C1.getBitWidth() == C2.getBitWidth() && "Constant widths not equal");
+
+ // Bail if we will divide by zero.
+ if (C2.isNullValue())
+ return false;
+
+ // Bail if we would divide INT_MIN by -1.
+ if (IsSigned && C1.isMinSignedValue() && C2.isAllOnesValue())
+ return false;
+
+ APInt Remainder(C1.getBitWidth(), /*val=*/0ULL, IsSigned);
+ if (IsSigned)
+ APInt::sdivrem(C1, C2, Quotient, Remainder);
+ else
+ APInt::udivrem(C1, C2, Quotient, Remainder);
+
+ return Remainder.isMinValue();
+}
+
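Plain 32-bit analogues of the two helpers above, written without APInt so they can be compiled and tested standalone (the names and the widening trick are ours):

#include <cassert>
#include <cstdint>

// Returns true if C1 * C2 does not fit in 32 bits; Product gets the low bits.
static bool mulOverflows32(uint32_t C1, uint32_t C2, uint32_t &Product) {
  uint64_t Wide = uint64_t(C1) * uint64_t(C2);
  Product = uint32_t(Wide);
  return Wide > UINT32_MAX;
}

// Returns true if C1 is an exact multiple of C2; Quotient gets C1 / C2.
static bool isMultiple32(uint32_t C1, uint32_t C2, uint32_t &Quotient) {
  if (C2 == 0)
    return false;
  Quotient = C1 / C2;
  return C1 % C2 == 0;
}

int main() {
  uint32_t P, Q;
  assert(!mulOverflows32(1000, 1000, P) && P == 1000000);
  assert(mulOverflows32(1u << 31, 2, P)); // 2^32 does not fit in 32 bits
  assert(isMultiple32(12, 4, Q) && Q == 3);
  assert(!isMultiple32(10, 4, Q));
  return 0;
}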
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those
+/// integer division instructions.
Instruction *InstCombinerImpl::commonIDivTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- bool IsSigned = I.getOpcode() == Instruction::SDiv;
- Type *Ty = I.getType();
-
- // The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
- return replaceOperand(I, 1, V);
-
- // Handle cases involving: [su]div X, (select Cond, Y, Z)
- // This does not apply for fdiv.
- if (simplifyDivRemOfSelectWithZeroOp(I))
- return &I;
-
- const APInt *C2;
- if (match(Op1, m_APInt(C2))) {
- Value *X;
- const APInt *C1;
-
- // (X / C1) / C2 -> X / (C1*C2)
- if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
- APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
- if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
- return BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Product));
- }
-
- if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
- (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
-
- // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
- if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
- auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Quotient));
- NewDiv->setIsExact(I.isExact());
- return NewDiv;
- }
-
- // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
- if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
- auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
- ConstantInt::get(Ty, Quotient));
- auto *OBO = cast<OverflowingBinaryOperator>(Op0);
- Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
- Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
- return Mul;
- }
- }
-
- if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
- *C1 != C1->getBitWidth() - 1) ||
- (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
- APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
- APInt C1Shifted = APInt::getOneBitSet(
- C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
-
- // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
- if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
- auto *BO = BinaryOperator::Create(I.getOpcode(), X,
- ConstantInt::get(Ty, Quotient));
- BO->setIsExact(I.isExact());
- return BO;
- }
-
- // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
- if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
- auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
- ConstantInt::get(Ty, Quotient));
- auto *OBO = cast<OverflowingBinaryOperator>(Op0);
- Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
- Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
- return Mul;
- }
- }
-
- if (!C2->isNullValue()) // avoid X udiv 0
- if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
- return FoldedDiv;
- }
-
- if (match(Op0, m_One())) {
- assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
- if (IsSigned) {
- // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
- // result is one, if Op1 is -1 then the result is minus one, otherwise
- // it's zero.
- Value *Inc = Builder.CreateAdd(Op1, Op0);
- Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
- return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
- } else {
- // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
- // result is one, otherwise it's zero.
- return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
- }
- }
-
- // See if we can fold away this div instruction.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
- Value *X, *Z;
- if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
- if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
- (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
- return BinaryOperator::Create(I.getOpcode(), X, Op1);
-
- // (X << Y) / X -> 1 << Y
- Value *Y;
- if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
- if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
- return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
-
- // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
- if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
- bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
- bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
- if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
- replaceOperand(I, 0, ConstantInt::get(Ty, 1));
- replaceOperand(I, 1, Y);
- return &I;
- }
- }
-
- return nullptr;
-}
-
-static const unsigned MaxDepth = 6;
-
-namespace {
-
-using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
- const BinaryOperator &I,
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ bool IsSigned = I.getOpcode() == Instruction::SDiv;
+ Type *Ty = I.getType();
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
+
+ // Handle cases involving: [su]div X, (select Cond, Y, Z)
+ // This does not apply for fdiv.
+ if (simplifyDivRemOfSelectWithZeroOp(I))
+ return &I;
+
+ const APInt *C2;
+ if (match(Op1, m_APInt(C2))) {
+ Value *X;
+ const APInt *C1;
+
+ // (X / C1) / C2 -> X / (C1*C2)
+ if ((IsSigned && match(Op0, m_SDiv(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_UDiv(m_Value(X), m_APInt(C1))))) {
+ APInt Product(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+ if (!multiplyOverflows(*C1, *C2, Product, IsSigned))
+ return BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Product));
+ }
+
+ if ((IsSigned && match(Op0, m_NSWMul(m_Value(X), m_APInt(C1)))) ||
+ (!IsSigned && match(Op0, m_NUWMul(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+
+ // (X * C1) / C2 -> X / (C2 / C1) if C2 is a multiple of C1.
+ if (isMultiple(*C2, *C1, Quotient, IsSigned)) {
+ auto *NewDiv = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ NewDiv->setIsExact(I.isExact());
+ return NewDiv;
+ }
+
+ // (X * C1) / C2 -> X * (C1 / C2) if C1 is a multiple of C2.
+ if (isMultiple(*C1, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
+
+ if ((IsSigned && match(Op0, m_NSWShl(m_Value(X), m_APInt(C1))) &&
+ *C1 != C1->getBitWidth() - 1) ||
+ (!IsSigned && match(Op0, m_NUWShl(m_Value(X), m_APInt(C1))))) {
+ APInt Quotient(C1->getBitWidth(), /*val=*/0ULL, IsSigned);
+ APInt C1Shifted = APInt::getOneBitSet(
+ C1->getBitWidth(), static_cast<unsigned>(C1->getLimitedValue()));
+
+ // (X << C1) / C2 -> X / (C2 >> C1) if C2 is a multiple of 1 << C1.
+ if (isMultiple(*C2, C1Shifted, Quotient, IsSigned)) {
+ auto *BO = BinaryOperator::Create(I.getOpcode(), X,
+ ConstantInt::get(Ty, Quotient));
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
+ // (X << C1) / C2 -> X * ((1 << C1) / C2) if 1 << C1 is a multiple of C2.
+ if (isMultiple(C1Shifted, *C2, Quotient, IsSigned)) {
+ auto *Mul = BinaryOperator::Create(Instruction::Mul, X,
+ ConstantInt::get(Ty, Quotient));
+ auto *OBO = cast<OverflowingBinaryOperator>(Op0);
+ Mul->setHasNoUnsignedWrap(!IsSigned && OBO->hasNoUnsignedWrap());
+ Mul->setHasNoSignedWrap(OBO->hasNoSignedWrap());
+ return Mul;
+ }
+ }
+
+ if (!C2->isNullValue()) // avoid X udiv 0
+ if (Instruction *FoldedDiv = foldBinOpIntoSelectOrPhi(I))
+ return FoldedDiv;
+ }
+
+ if (match(Op0, m_One())) {
+ assert(!Ty->isIntOrIntVectorTy(1) && "i1 divide not removed?");
+ if (IsSigned) {
+ // If Op1 is 0 then it's undefined behaviour, if Op1 is 1 then the
+ // result is one, if Op1 is -1 then the result is minus one, otherwise
+ // it's zero.
+ Value *Inc = Builder.CreateAdd(Op1, Op0);
+ Value *Cmp = Builder.CreateICmpULT(Inc, ConstantInt::get(Ty, 3));
+ return SelectInst::Create(Cmp, Op1, ConstantInt::get(Ty, 0));
+ } else {
+ // If Op1 is 0 then it's undefined behaviour. If Op1 is 1 then the
+ // result is one, otherwise it's zero.
+ return new ZExtInst(Builder.CreateICmpEQ(Op1, Op0), Ty);
+ }
+ }
+
+ // See if we can fold away this div instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // (X - (X rem Y)) / Y -> X / Y; usually originates as ((X / Y) * Y) / Y
+ Value *X, *Z;
+ if (match(Op0, m_Sub(m_Value(X), m_Value(Z)))) // (X - Z) / Y; Y = Op1
+ if ((IsSigned && match(Z, m_SRem(m_Specific(X), m_Specific(Op1)))) ||
+ (!IsSigned && match(Z, m_URem(m_Specific(X), m_Specific(Op1)))))
+ return BinaryOperator::Create(I.getOpcode(), X, Op1);
+
+ // (X << Y) / X -> 1 << Y
+ Value *Y;
+ if (IsSigned && match(Op0, m_NSWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNSWShl(ConstantInt::get(Ty, 1), Y);
+ if (!IsSigned && match(Op0, m_NUWShl(m_Specific(Op1), m_Value(Y))))
+ return BinaryOperator::CreateNUWShl(ConstantInt::get(Ty, 1), Y);
+
+ // X / (X * Y) -> 1 / Y if the multiplication does not overflow.
+ if (match(Op1, m_c_Mul(m_Specific(Op0), m_Value(Y)))) {
+ bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
+ bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
+ if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
+ replaceOperand(I, 0, ConstantInt::get(Ty, 1));
+ replaceOperand(I, 1, Y);
+ return &I;
+ }
+ }
+
+ return nullptr;
+}
+
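Two of the constant-folding identities from commonIDivTransforms, checked in a standalone sketch on unsigned values small enough that no intermediate product wraps:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X = 0; X < 1000; ++X) {
    // (X / C1) / C2 --> X / (C1 * C2), here with C1 = 3 and C2 = 5.
    assert((X / 3u) / 5u == X / 15u);
    // (X * C1) / C2 --> X * (C1 / C2) when C1 is a multiple of C2,
    // here with C1 = 12 and C2 = 4.
    assert((X * 12u) / 4u == X * 3u);
  }
  return 0;
}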
+static const unsigned MaxDepth = 6;
+
+namespace {
+
+using FoldUDivOperandCb = Instruction *(*)(Value *Op0, Value *Op1,
+ const BinaryOperator &I,
InstCombinerImpl &IC);
-
-/// Used to maintain state for visitUDivOperand().
-struct UDivFoldAction {
- /// Informs visitUDiv() how to fold this operand. This can be zero if this
- /// action joins two actions together.
- FoldUDivOperandCb FoldAction;
-
- /// Which operand to fold.
- Value *OperandToFold;
-
- union {
- /// The instruction returned when FoldAction is invoked.
- Instruction *FoldResult;
-
- /// Stores the LHS action index if this action joins two actions together.
- size_t SelectLHSIdx;
- };
-
- UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
- : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
- UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
- : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
-};
-
-} // end anonymous namespace
-
-// X udiv 2^C -> X >> C
-static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
+
+/// Used to maintain state for visitUDivOperand().
+struct UDivFoldAction {
+ /// Informs visitUDiv() how to fold this operand. This can be zero if this
+ /// action joins two actions together.
+ FoldUDivOperandCb FoldAction;
+
+ /// Which operand to fold.
+ Value *OperandToFold;
+
+ union {
+ /// The instruction returned when FoldAction is invoked.
+ Instruction *FoldResult;
+
+ /// Stores the LHS action index if this action joins two actions together.
+ size_t SelectLHSIdx;
+ };
+
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand)
+ : FoldAction(FA), OperandToFold(InputOperand), FoldResult(nullptr) {}
+ UDivFoldAction(FoldUDivOperandCb FA, Value *InputOperand, size_t SLHS)
+ : FoldAction(FA), OperandToFold(InputOperand), SelectLHSIdx(SLHS) {}
+};
+
+} // end anonymous namespace
+
+// X udiv 2^C -> X >> C
+static Instruction *foldUDivPow2Cst(Value *Op0, Value *Op1,
const BinaryOperator &I,
InstCombinerImpl &IC) {
Constant *C1 = ConstantExpr::getExactLogBase2(cast<Constant>(Op1));
- if (!C1)
- llvm_unreachable("Failed to constant fold udiv -> logbase2");
- BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
- if (I.isExact())
- LShr->setIsExact();
- return LShr;
-}
-
-// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
-// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
-static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
+ if (!C1)
+ llvm_unreachable("Failed to constant fold udiv -> logbase2");
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, C1);
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
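A small standalone check (assumed example, arbitrary test values) of the arithmetic behind foldUDivPow2Cst and foldUDivShl above: an unsigned divide by 2^c, or by a power of two shifted further left by n, is a right shift by c or by n + c.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 5u, 1000u, 0xFFFFFFFFu}) {
    assert(x / 16u == (x >> 4));               // X udiv 2^4  -->  X >> 4
    for (uint32_t n = 0; n <= 7; ++n)
      assert(x / (8u << n) == (x >> (n + 3))); // X udiv (8 << N)  -->  X >> (N + 3)
  }
  return 0;
}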
+// X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+// X udiv (zext (C1 << N)), where C1 is "1<<C2" --> X >> (N+C2)
+static Instruction *foldUDivShl(Value *Op0, Value *Op1, const BinaryOperator &I,
InstCombinerImpl &IC) {
- Value *ShiftLeft;
- if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
- ShiftLeft = Op1;
-
- Constant *CI;
- Value *N;
- if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
- llvm_unreachable("match should never fail here!");
+ Value *ShiftLeft;
+ if (!match(Op1, m_ZExt(m_Value(ShiftLeft))))
+ ShiftLeft = Op1;
+
+ Constant *CI;
+ Value *N;
+ if (!match(ShiftLeft, m_Shl(m_Constant(CI), m_Value(N))))
+ llvm_unreachable("match should never fail here!");
Constant *Log2Base = ConstantExpr::getExactLogBase2(CI);
- if (!Log2Base)
- llvm_unreachable("getLogBase2 should never fail here!");
- N = IC.Builder.CreateAdd(N, Log2Base);
- if (Op1 != ShiftLeft)
- N = IC.Builder.CreateZExt(N, Op1->getType());
- BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
- if (I.isExact())
- LShr->setIsExact();
- return LShr;
-}
-
-// Recursively visits the possible right hand operands of a udiv
-// instruction, seeing through select instructions, to determine if we can
-// replace the udiv with something simpler. If we find that an operand is not
-// able to simplify the udiv, we abort the entire transformation.
-static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
- SmallVectorImpl<UDivFoldAction> &Actions,
- unsigned Depth = 0) {
+ if (!Log2Base)
+ llvm_unreachable("getLogBase2 should never fail here!");
+ N = IC.Builder.CreateAdd(N, Log2Base);
+ if (Op1 != ShiftLeft)
+ N = IC.Builder.CreateZExt(N, Op1->getType());
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(Op0, N);
+ if (I.isExact())
+ LShr->setIsExact();
+ return LShr;
+}
+
+// Recursively visits the possible right hand operands of a udiv
+// instruction, seeing through select instructions, to determine if we can
+// replace the udiv with something simpler. If we find that an operand is not
+// able to simplify the udiv, we abort the entire transformation.
+static size_t visitUDivOperand(Value *Op0, Value *Op1, const BinaryOperator &I,
+ SmallVectorImpl<UDivFoldAction> &Actions,
+ unsigned Depth = 0) {
// FIXME: assert that Op1 isn't/doesn't contain undef.
- // Check to see if this is an unsigned division with an exact power of 2,
- // if so, convert to a right shift.
- if (match(Op1, m_Power2())) {
- Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
- return Actions.size();
- }
-
- // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
- if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
- match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
- Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
- return Actions.size();
- }
-
- // The remaining tests are all recursive, so bail out if we hit the limit.
- if (Depth++ == MaxDepth)
- return 0;
-
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ // Check to see if this is an unsigned division with an exact power of 2,
+ // if so, convert to a right shift.
+ if (match(Op1, m_Power2())) {
+ Actions.push_back(UDivFoldAction(foldUDivPow2Cst, Op1));
+ return Actions.size();
+ }
+
+ // X udiv (C1 << N), where C1 is "1<<C2" --> X >> (N+C2)
+ if (match(Op1, m_Shl(m_Power2(), m_Value())) ||
+ match(Op1, m_ZExt(m_Shl(m_Power2(), m_Value())))) {
+ Actions.push_back(UDivFoldAction(foldUDivShl, Op1));
+ return Actions.size();
+ }
+
+ // The remaining tests are all recursive, so bail out if we hit the limit.
+ if (Depth++ == MaxDepth)
+ return 0;
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
// FIXME: missed optimization: if one of the hands of select is/contains
// undef, just directly pick the other one.
// FIXME: can both hands contain undef?
- if (size_t LHSIdx =
- visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
- if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
- Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
- return Actions.size();
- }
-
- return 0;
-}
-
-/// If we have zero-extended operands of an unsigned div or rem, we may be able
-/// to narrow the operation (sink the zext below the math).
-static Instruction *narrowUDivURem(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- Instruction::BinaryOps Opcode = I.getOpcode();
- Value *N = I.getOperand(0);
- Value *D = I.getOperand(1);
- Type *Ty = I.getType();
- Value *X, *Y;
- if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) &&
- X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) {
- // udiv (zext X), (zext Y) --> zext (udiv X, Y)
- // urem (zext X), (zext Y) --> zext (urem X, Y)
- Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y);
- return new ZExtInst(NarrowOp, Ty);
- }
-
- Constant *C;
- if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) ||
- (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) {
- // If the constant is the same in the smaller type, use the narrow version.
- Constant *TruncC = ConstantExpr::getTrunc(C, X->getType());
- if (ConstantExpr::getZExt(TruncC, Ty) != C)
- return nullptr;
-
- // udiv (zext X), C --> zext (udiv X, C')
- // urem (zext X), C --> zext (urem X, C')
- // udiv C, (zext X) --> zext (udiv C', X)
- // urem C, (zext X) --> zext (urem C', X)
- Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC)
- : Builder.CreateBinOp(Opcode, TruncC, X);
- return new ZExtInst(NarrowOp, Ty);
- }
-
- return nullptr;
-}
-
+ if (size_t LHSIdx =
+ visitUDivOperand(Op0, SI->getOperand(1), I, Actions, Depth))
+ if (visitUDivOperand(Op0, SI->getOperand(2), I, Actions, Depth)) {
+ Actions.push_back(UDivFoldAction(nullptr, Op1, LHSIdx - 1));
+ return Actions.size();
+ }
+
+ return 0;
+}
+
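For illustration (an assumed sketch; the helper names are made up), the select threading that visitUDivOperand enables: when the divisor is a select of two powers of two, each arm folds to a shift and the arms are rejoined with a select, mirroring the joining UDivFoldAction.

#include <cassert>
#include <cstdint>

static uint32_t divBySelect(uint32_t x, bool c) { return x / (c ? 8u : 16u); }

static uint32_t shiftBySelect(uint32_t x, bool c) {
  uint32_t viaEight = x >> 3;       // fold action for the true arm
  uint32_t viaSixteen = x >> 4;     // fold action for the false arm
  return c ? viaEight : viaSixteen; // the joining SelectInst
}

int main() {
  for (uint32_t x : {0u, 7u, 8u, 123u, 0xFFFFFFFFu})
    for (bool c : {false, true})
      assert(divBySelect(x, c) == shiftBySelect(x, c));
  return 0;
}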
+/// If we have zero-extended operands of an unsigned div or rem, we may be able
+/// to narrow the operation (sink the zext below the math).
+static Instruction *narrowUDivURem(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ Value *N = I.getOperand(0);
+ Value *D = I.getOperand(1);
+ Type *Ty = I.getType();
+ Value *X, *Y;
+ if (match(N, m_ZExt(m_Value(X))) && match(D, m_ZExt(m_Value(Y))) &&
+ X->getType() == Y->getType() && (N->hasOneUse() || D->hasOneUse())) {
+ // udiv (zext X), (zext Y) --> zext (udiv X, Y)
+ // urem (zext X), (zext Y) --> zext (urem X, Y)
+ Value *NarrowOp = Builder.CreateBinOp(Opcode, X, Y);
+ return new ZExtInst(NarrowOp, Ty);
+ }
+
+ Constant *C;
+ if ((match(N, m_OneUse(m_ZExt(m_Value(X)))) && match(D, m_Constant(C))) ||
+ (match(D, m_OneUse(m_ZExt(m_Value(X)))) && match(N, m_Constant(C)))) {
+ // If the constant is the same in the smaller type, use the narrow version.
+ Constant *TruncC = ConstantExpr::getTrunc(C, X->getType());
+ if (ConstantExpr::getZExt(TruncC, Ty) != C)
+ return nullptr;
+
+ // udiv (zext X), C --> zext (udiv X, C')
+ // urem (zext X), C --> zext (urem X, C')
+ // udiv C, (zext X) --> zext (udiv C', X)
+ // urem C, (zext X) --> zext (urem C', X)
+ Value *NarrowOp = isa<Constant>(D) ? Builder.CreateBinOp(Opcode, X, TruncC)
+ : Builder.CreateBinOp(Opcode, TruncC, X);
+ return new ZExtInst(NarrowOp, Ty);
+ }
+
+ return nullptr;
+}
+
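An exhaustive 8-bit check (assumed example, not from the commit) of the narrowing rule in narrowUDivURem: when both operands are zero-extended from the same narrow type, the udiv or urem can be done in the narrow type and the result zero-extended afterwards.

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t xi = 0; xi < 256; ++xi)
    for (uint32_t yi = 1; yi < 256; ++yi) {
      uint8_t x = static_cast<uint8_t>(xi), y = static_cast<uint8_t>(yi);
      // udiv (zext X), (zext Y) --> zext (udiv X, Y), and likewise for urem.
      assert(uint32_t(x) / uint32_t(y) == uint32_t(uint8_t(x / y)));
      assert(uint32_t(x) % uint32_t(y) == uint32_t(uint8_t(x % y)));
    }
  return 0;
}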
Instruction *InstCombinerImpl::visitUDiv(BinaryOperator &I) {
- if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer div common cases
- if (Instruction *Common = commonIDivTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Value *X;
- const APInt *C1, *C2;
- if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
- // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
- bool Overflow;
- APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
- if (!Overflow) {
- bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
- BinaryOperator *BO = BinaryOperator::CreateUDiv(
- X, ConstantInt::get(X->getType(), C2ShlC1));
- if (IsExact)
- BO->setIsExact();
- return BO;
- }
- }
-
- // Op0 / C where C is large (negative) --> zext (Op0 >= C)
- // TODO: Could use isKnownNegative() to handle non-constant values.
- Type *Ty = I.getType();
- if (match(Op1, m_Negative())) {
- Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
- // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
-
- if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
- return NarrowDiv;
-
- // If the udiv operands are non-overflowing multiplies with a common operand,
- // then eliminate the common factor:
- // (A * B) / (A * X) --> B / X (and commuted variants)
- // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
- // TODO: If -reassociation handled this generally, we could remove this.
- Value *A, *B;
- if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
- if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
- match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
- return BinaryOperator::CreateUDiv(B, X);
- if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
- match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
- return BinaryOperator::CreateUDiv(A, X);
- }
-
- // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
- SmallVector<UDivFoldAction, 6> UDivActions;
- if (visitUDivOperand(Op0, Op1, I, UDivActions))
- for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
- FoldUDivOperandCb Action = UDivActions[i].FoldAction;
- Value *ActionOp1 = UDivActions[i].OperandToFold;
- Instruction *Inst;
- if (Action)
- Inst = Action(Op0, ActionOp1, I, *this);
- else {
- // This action joins two actions together. The RHS of this action is
- // simply the last action we processed, we saved the LHS action index in
- // the joining action.
- size_t SelectRHSIdx = i - 1;
- Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
- size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
- Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
- Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
- SelectLHS, SelectRHS);
- }
-
- // If this is the last action to process, return it to the InstCombiner.
- // Otherwise, we insert it before the UDiv and record it so that we may
- // use it as part of a joining action (i.e., a SelectInst).
- if (e - i != 1) {
- Inst->insertBefore(&I);
- UDivActions[i].FoldResult = Inst;
- } else
- return Inst;
- }
-
- return nullptr;
-}
-
+ if (Value *V = SimplifyUDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X;
+ const APInt *C1, *C2;
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(C1))) && match(Op1, m_APInt(C2))) {
+ // (X lshr C1) udiv C2 --> X udiv (C2 << C1)
+ bool Overflow;
+ APInt C2ShlC1 = C2->ushl_ov(*C1, Overflow);
+ if (!Overflow) {
+ bool IsExact = I.isExact() && match(Op0, m_Exact(m_Value()));
+ BinaryOperator *BO = BinaryOperator::CreateUDiv(
+ X, ConstantInt::get(X->getType(), C2ShlC1));
+ if (IsExact)
+ BO->setIsExact();
+ return BO;
+ }
+ }
+
+ // Op0 / C where C is large (negative) --> zext (Op0 >= C)
+ // TODO: Could use isKnownNegative() to handle non-constant values.
+ Type *Ty = I.getType();
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpUGE(Op0, Op1);
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+ // Op0 / (sext i1 X) --> zext (Op0 == -1) (if X is 0, the div is undefined)
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
+ if (Instruction *NarrowDiv = narrowUDivURem(I, Builder))
+ return NarrowDiv;
+
+ // If the udiv operands are non-overflowing multiplies with a common operand,
+ // then eliminate the common factor:
+ // (A * B) / (A * X) --> B / X (and commuted variants)
+ // TODO: The code would be reduced if we had m_c_NUWMul pattern matching.
+ // TODO: If -reassociation handled this generally, we could remove this.
+ Value *A, *B;
+ if (match(Op0, m_NUWMul(m_Value(A), m_Value(B)))) {
+ if (match(Op1, m_NUWMul(m_Specific(A), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(A))))
+ return BinaryOperator::CreateUDiv(B, X);
+ if (match(Op1, m_NUWMul(m_Specific(B), m_Value(X))) ||
+ match(Op1, m_NUWMul(m_Value(X), m_Specific(B))))
+ return BinaryOperator::CreateUDiv(A, X);
+ }
+
+ // (LHS udiv (select (select (...)))) -> (LHS >> (select (select (...))))
+ SmallVector<UDivFoldAction, 6> UDivActions;
+ if (visitUDivOperand(Op0, Op1, I, UDivActions))
+ for (unsigned i = 0, e = UDivActions.size(); i != e; ++i) {
+ FoldUDivOperandCb Action = UDivActions[i].FoldAction;
+ Value *ActionOp1 = UDivActions[i].OperandToFold;
+ Instruction *Inst;
+ if (Action)
+ Inst = Action(Op0, ActionOp1, I, *this);
+ else {
+ // This action joins two actions together. The RHS of this action is
+ // simply the last action we processed, we saved the LHS action index in
+ // the joining action.
+ size_t SelectRHSIdx = i - 1;
+ Value *SelectRHS = UDivActions[SelectRHSIdx].FoldResult;
+ size_t SelectLHSIdx = UDivActions[i].SelectLHSIdx;
+ Value *SelectLHS = UDivActions[SelectLHSIdx].FoldResult;
+ Inst = SelectInst::Create(cast<SelectInst>(ActionOp1)->getCondition(),
+ SelectLHS, SelectRHS);
+ }
+
+ // If this is the last action to process, return it to the InstCombiner.
+ // Otherwise, we insert it before the UDiv and record it so that we may
+ // use it as part of a joining action (i.e., a SelectInst).
+ if (e - i != 1) {
+ Inst->insertBefore(&I);
+ UDivActions[i].FoldResult = Inst;
+ } else
+ return Inst;
+ }
+
+ return nullptr;
+}
+
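Two of the visitUDiv folds above, restated as an assumed C++ check (the constants are arbitrary): a right-shifted dividend folds into the divisor constant, and dividing by a constant with the sign bit set can only yield 0 or 1, so it becomes a compare.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t c = 0x80000005u; // divisor with the sign bit set
  for (uint32_t x : {0u, 1u, 0x80000004u, 0x80000005u, 0xFFFFFFFFu}) {
    assert((x >> 2) / 5u == x / 20u);    // (X lshr 2) udiv 5  -->  X udiv 20
    assert(x / c == (x >= c ? 1u : 0u)); // X udiv C  -->  zext (X uge C)
  }
  return 0;
}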
Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
- if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer div common cases
- if (Instruction *Common = commonIDivTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (Value *V = SimplifySDivInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer div common cases
+ if (Instruction *Common = commonIDivTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
Type *Ty = I.getType();
- Value *X;
- // sdiv Op0, -1 --> -Op0
- // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
- if (match(Op1, m_AllOnes()) ||
- (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
- return BinaryOperator::CreateNeg(Op0);
-
- // X / INT_MIN --> X == INT_MIN
- if (match(Op1, m_SignMask()))
+ Value *X;
+ // sdiv Op0, -1 --> -Op0
+ // sdiv Op0, (sext i1 X) --> -Op0 (because if X is 0, the op is undefined)
+ if (match(Op1, m_AllOnes()) ||
+ (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)))
+ return BinaryOperator::CreateNeg(Op0);
+
+ // X / INT_MIN --> X == INT_MIN
+ if (match(Op1, m_SignMask()))
return new ZExtInst(Builder.CreateICmpEQ(Op0, Op1), Ty);
-
+
// sdiv exact X, 1<<C --> ashr exact X, C iff 1<<C is non-negative
// sdiv exact X, -1<<C --> -(ashr exact X, C)
if (I.isExact() && ((match(Op1, m_Power2()) && match(Op1, m_NonNegative())) ||
@@ -1134,43 +1134,43 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
return BinaryOperator::CreateNeg(AShr, I.getName());
}
- const APInt *Op1C;
- if (match(Op1, m_APInt(Op1C))) {
- // If the dividend is sign-extended and the constant divisor is small enough
- // to fit in the source type, shrink the division to the narrower type:
- // (sext X) sdiv C --> sext (X sdiv C)
- Value *Op0Src;
- if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
- Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
-
- // In the general case, we need to make sure that the dividend is not the
- // minimum signed value because dividing that by -1 is UB. But here, we
- // know that the -1 divisor case is already handled above.
-
- Constant *NarrowDivisor =
- ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
- Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
+ const APInt *Op1C;
+ if (match(Op1, m_APInt(Op1C))) {
+ // If the dividend is sign-extended and the constant divisor is small enough
+ // to fit in the source type, shrink the division to the narrower type:
+ // (sext X) sdiv C --> sext (X sdiv C)
+ Value *Op0Src;
+ if (match(Op0, m_OneUse(m_SExt(m_Value(Op0Src)))) &&
+ Op0Src->getType()->getScalarSizeInBits() >= Op1C->getMinSignedBits()) {
+
+ // In the general case, we need to make sure that the dividend is not the
+ // minimum signed value because dividing that by -1 is UB. But here, we
+ // know that the -1 divisor case is already handled above.
+
+ Constant *NarrowDivisor =
+ ConstantExpr::getTrunc(cast<Constant>(Op1), Op0Src->getType());
+ Value *NarrowOp = Builder.CreateSDiv(Op0Src, NarrowDivisor);
return new SExtInst(NarrowOp, Ty);
- }
-
- // -X / C --> X / -C (if the negation doesn't overflow).
- // TODO: This could be enhanced to handle arbitrary vector constants by
- // checking if all elements are not the min-signed-val.
- if (!Op1C->isMinSignedValue() &&
- match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
+ }
+
+ // -X / C --> X / -C (if the negation doesn't overflow).
+ // TODO: This could be enhanced to handle arbitrary vector constants by
+ // checking if all elements are not the min-signed-val.
+ if (!Op1C->isMinSignedValue() &&
+ match(Op0, m_NSWSub(m_Zero(), m_Value(X)))) {
Constant *NegC = ConstantInt::get(Ty, -(*Op1C));
- Instruction *BO = BinaryOperator::CreateSDiv(X, NegC);
- BO->setIsExact(I.isExact());
- return BO;
- }
- }
-
- // -X / Y --> -(X / Y)
- Value *Y;
- if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
- return BinaryOperator::CreateNSWNeg(
- Builder.CreateSDiv(X, Y, I.getName(), I.isExact()));
-
+ Instruction *BO = BinaryOperator::CreateSDiv(X, NegC);
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ // -X / Y --> -(X / Y)
+ Value *Y;
+ if (match(&I, m_SDiv(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
+ return BinaryOperator::CreateNSWNeg(
+ Builder.CreateSDiv(X, Y, I.getName(), I.isExact()));
+
// abs(X) / X --> X > -1 ? 1 : -1
// X / abs(X) --> X > -1 ? 1 : -1
if (match(&I, m_c_BinOp(
@@ -1181,17 +1181,17 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
return SelectInst::Create(Cond, ConstantInt::get(Ty, 1), NegOne);
}
- // If the sign bits of both operands are zero (i.e. we can prove they are
- // unsigned inputs), turn this into a udiv.
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a udiv.
APInt Mask(APInt::getSignMask(Ty->getScalarSizeInBits()));
- if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
- if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
- // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
-
+ if (MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ if (MaskedValueIsZero(Op1, Mask, 0, &I)) {
+ // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+
if (match(Op1, m_NegatedPower2())) {
// X sdiv (-(1 << C)) -> -(X sdiv (1 << C)) ->
// -> -(X udiv (1 << C)) -> -(X u>> C)
@@ -1199,356 +1199,356 @@ Instruction *InstCombinerImpl::visitSDiv(BinaryOperator &I) {
Op0, ConstantExpr::getNeg(cast<Constant>(Op1)), I, *this)));
}
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
- // Safe because the only negative value (1 << Y) can take on is
- // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
- // the sign bit set.
- auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
- BO->setIsExact(I.isExact());
- return BO;
- }
- }
-
- return nullptr;
-}
-
-/// Remove negation and try to convert division into multiplication.
-static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
- Constant *C;
- if (!match(I.getOperand(1), m_Constant(C)))
- return nullptr;
-
- // -X / C --> X / -C
- Value *X;
- if (match(I.getOperand(0), m_FNeg(m_Value(X))))
- return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
-
- // If the constant divisor has an exact inverse, this is always safe. If not,
- // then we can still create a reciprocal if fast-math-flags allow it and the
- // constant is a regular number (not zero, infinite, or denormal).
- if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
- return nullptr;
-
- // Disallow denormal constants because we don't know what would happen
- // on all targets.
- // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
- // denorms are flushed?
- auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
- if (!RecipC->isNormalFP())
- return nullptr;
-
- // X / C --> X * (1 / C)
- return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
-}
-
-/// Remove negation and try to reassociate constant math.
-static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
- Constant *C;
- if (!match(I.getOperand(0), m_Constant(C)))
- return nullptr;
-
- // C / -X --> -C / X
- Value *X;
- if (match(I.getOperand(1), m_FNeg(m_Value(X))))
- return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
-
- if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
- return nullptr;
-
- // Try to reassociate C / X expressions where X includes another constant.
- Constant *C2, *NewC = nullptr;
- if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
- // C / (X * C2) --> (C / C2) / X
- NewC = ConstantExpr::getFDiv(C, C2);
- } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
- // C / (X / C2) --> (C * C2) / X
- NewC = ConstantExpr::getFMul(C, C2);
- }
- // Disallow denormal constants because we don't know what would happen
- // on all targets.
- // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
- // denorms are flushed?
- if (!NewC || !NewC->isNormalFP())
- return nullptr;
-
- return BinaryOperator::CreateFDivFMF(NewC, X, &I);
-}
-
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y)
+ // Safe because the only negative value (1 << Y) can take on is
+ // INT_MIN, and X sdiv INT_MIN == X udiv INT_MIN == 0 if X doesn't have
+ // the sign bit set.
+ auto *BO = BinaryOperator::CreateUDiv(Op0, Op1, I.getName());
+ BO->setIsExact(I.isExact());
+ return BO;
+ }
+ }
+
+ return nullptr;
+}
+
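An assumed C++ check of two visitSDiv cases above: dividing by INT_MIN can only produce 0 or 1, and once both sign bits are known to be zero, sdiv and udiv agree.

#include <cassert>
#include <cstdint>

int main() {
  // X / INT_MIN --> zext (X == INT_MIN); only INT_MIN itself reaches 1.
  for (int32_t x : {0, 1, -1, 12345, INT32_MAX, INT32_MIN})
    assert(x / INT32_MIN == (x == INT32_MIN ? 1 : 0));

  // X sdiv Y --> X udiv Y when neither operand has its sign bit set.
  for (int32_t x : {0, 7, 1000, INT32_MAX})
    for (int32_t y : {1, 3, INT32_MAX})
      assert(x / y == int32_t(uint32_t(x) / uint32_t(y)));
  return 0;
}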
+/// Remove negation and try to convert division into multiplication.
+static Instruction *foldFDivConstantDivisor(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(1), m_Constant(C)))
+ return nullptr;
+
+ // -X / C --> X / -C
+ Value *X;
+ if (match(I.getOperand(0), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(X, ConstantExpr::getFNeg(C), &I);
+
+ // If the constant divisor has an exact inverse, this is always safe. If not,
+ // then we can still create a reciprocal if fast-math-flags allow it and the
+ // constant is a regular number (not zero, infinite, or denormal).
+ if (!(C->hasExactInverseFP() || (I.hasAllowReciprocal() && C->isNormalFP())))
+ return nullptr;
+
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ auto *RecipC = ConstantExpr::getFDiv(ConstantFP::get(I.getType(), 1.0), C);
+ if (!RecipC->isNormalFP())
+ return nullptr;
+
+ // X / C --> X * (1 / C)
+ return BinaryOperator::CreateFMulFMF(I.getOperand(0), RecipC, &I);
+}
+
+/// Remove negation and try to reassociate constant math.
+static Instruction *foldFDivConstantDividend(BinaryOperator &I) {
+ Constant *C;
+ if (!match(I.getOperand(0), m_Constant(C)))
+ return nullptr;
+
+ // C / -X --> -C / X
+ Value *X;
+ if (match(I.getOperand(1), m_FNeg(m_Value(X))))
+ return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+
+ if (!I.hasAllowReassoc() || !I.hasAllowReciprocal())
+ return nullptr;
+
+ // Try to reassociate C / X expressions where X includes another constant.
+ Constant *C2, *NewC = nullptr;
+ if (match(I.getOperand(1), m_FMul(m_Value(X), m_Constant(C2)))) {
+ // C / (X * C2) --> (C / C2) / X
+ NewC = ConstantExpr::getFDiv(C, C2);
+ } else if (match(I.getOperand(1), m_FDiv(m_Value(X), m_Constant(C2)))) {
+ // C / (X / C2) --> (C * C2) / X
+ NewC = ConstantExpr::getFMul(C, C2);
+ }
+ // Disallow denormal constants because we don't know what would happen
+ // on all targets.
+ // TODO: Use Intrinsic::canonicalize or let function attributes tell us that
+ // denorms are flushed?
+ if (!NewC || !NewC->isNormalFP())
+ return nullptr;
+
+ return BinaryOperator::CreateFDivFMF(NewC, X, &I);
+}
+
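A numeric check (assumed example) of the two constant fdiv folds above, restricted to cases where the rewrite is exact: multiplying by an exact reciprocal such as 0.25, and moving an fneg across the division, which only touches the sign.

#include <cassert>

int main() {
  for (double x : {0.0, 1.5, -3.25, 1e300, -1e-300}) {
    assert(x / 4.0 == x * 0.25);  // X / C --> X * (1/C), C with an exact inverse
    assert(-x / 4.0 == x / -4.0); // -X / C --> X / -C (sign handling is exact)
  }
  return 0;
}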
Instruction *InstCombinerImpl::visitFDiv(BinaryOperator &I) {
- if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = foldFDivConstantDivisor(I))
- return R;
-
- if (Instruction *R = foldFDivConstantDividend(I))
- return R;
-
- if (Instruction *R = foldFPSignBitOps(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (isa<Constant>(Op1))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
- Value *X, *Y;
- if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
- (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
- // (X / Y) / Z => X / (Y * Z)
- Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
- return BinaryOperator::CreateFDivFMF(X, YZ, &I);
- }
- if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
- (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
- // Z / (X / Y) => (Y * Z) / X
- Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
- return BinaryOperator::CreateFDivFMF(YZ, X, &I);
- }
- // Z / (1.0 / Y) => (Y * Z)
- //
- // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The
- // m_OneUse check is avoided because even in the case of the multiple uses
- // for 1.0/Y, the number of instructions remain the same and a division is
- // replaced by a multiplication.
- if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y))))
- return BinaryOperator::CreateFMulFMF(Y, Op0, &I);
- }
-
- if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
- // sin(X) / cos(X) -> tan(X)
- // cos(X) / sin(X) -> 1/tan(X) (cotangent)
- Value *X;
- bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
- bool IsCot =
- !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
- match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
-
- if ((IsTan || IsCot) &&
- hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
- IRBuilder<> B(&I);
- IRBuilder<>::FastMathFlagGuard FMFGuard(B);
- B.setFastMathFlags(I.getFastMathFlags());
- AttributeList Attrs =
- cast<CallBase>(Op0)->getCalledFunction()->getAttributes();
- Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
- LibFunc_tanl, B, Attrs);
- if (IsCot)
- Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
- return replaceInstUsesWith(I, Res);
- }
- }
-
- // X / (X * Y) --> 1.0 / Y
- // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
- // We can ignore the possibility that X is infinity because INF/INF is NaN.
- Value *X, *Y;
- if (I.hasNoNaNs() && I.hasAllowReassoc() &&
- match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
- replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
- replaceOperand(I, 1, Y);
- return &I;
- }
-
- // X / fabs(X) -> copysign(1.0, X)
- // fabs(X) / X -> copysign(1.0, X)
- if (I.hasNoNaNs() && I.hasNoInfs() &&
+ if (Value *V = SimplifyFDivInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = foldFDivConstantDivisor(I))
+ return R;
+
+ if (Instruction *R = foldFDivConstantDividend(I))
+ return R;
+
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (isa<Constant>(Op1))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (I.hasAllowReassoc() && I.hasAllowReciprocal()) {
+ Value *X, *Y;
+ if (match(Op0, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op1))) {
+ // (X / Y) / Z => X / (Y * Z)
+ Value *YZ = Builder.CreateFMulFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFDivFMF(X, YZ, &I);
+ }
+ if (match(Op1, m_OneUse(m_FDiv(m_Value(X), m_Value(Y)))) &&
+ (!isa<Constant>(Y) || !isa<Constant>(Op0))) {
+ // Z / (X / Y) => (Y * Z) / X
+ Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I);
+ return BinaryOperator::CreateFDivFMF(YZ, X, &I);
+ }
+ // Z / (1.0 / Y) => (Y * Z)
+ //
+ // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The
+ // m_OneUse check is avoided because even in the case of the multiple uses
+ // for 1.0/Y, the number of instructions remain the same and a division is
+ // replaced by a multiplication.
+ if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y))))
+ return BinaryOperator::CreateFMulFMF(Y, Op0, &I);
+ }
+
+ if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) {
+ // sin(X) / cos(X) -> tan(X)
+ // cos(X) / sin(X) -> 1/tan(X) (cotangent)
+ Value *X;
+ bool IsTan = match(Op0, m_Intrinsic<Intrinsic::sin>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::cos>(m_Specific(X)));
+ bool IsCot =
+ !IsTan && match(Op0, m_Intrinsic<Intrinsic::cos>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::sin>(m_Specific(X)));
+
+ if ((IsTan || IsCot) &&
+ hasFloatFn(&TLI, I.getType(), LibFunc_tan, LibFunc_tanf, LibFunc_tanl)) {
+ IRBuilder<> B(&I);
+ IRBuilder<>::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(I.getFastMathFlags());
+ AttributeList Attrs =
+ cast<CallBase>(Op0)->getCalledFunction()->getAttributes();
+ Value *Res = emitUnaryFloatFnCall(X, &TLI, LibFunc_tan, LibFunc_tanf,
+ LibFunc_tanl, B, Attrs);
+ if (IsCot)
+ Res = B.CreateFDiv(ConstantFP::get(I.getType(), 1.0), Res);
+ return replaceInstUsesWith(I, Res);
+ }
+ }
+
+ // X / (X * Y) --> 1.0 / Y
+ // Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
+ // We can ignore the possibility that X is infinity because INF/INF is NaN.
+ Value *X, *Y;
+ if (I.hasNoNaNs() && I.hasAllowReassoc() &&
+ match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
+ replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
+ replaceOperand(I, 1, Y);
+ return &I;
+ }
+
+ // X / fabs(X) -> copysign(1.0, X)
+ // fabs(X) / X -> copysign(1.0, X)
+ if (I.hasNoNaNs() && I.hasNoInfs() &&
(match(&I, m_FDiv(m_Value(X), m_FAbs(m_Deferred(X)))) ||
match(&I, m_FDiv(m_FAbs(m_Value(X)), m_Deferred(X))))) {
- Value *V = Builder.CreateBinaryIntrinsic(
- Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I);
- return replaceInstUsesWith(I, V);
- }
- return nullptr;
-}
-
-/// This function implements the transforms common to both integer remainder
-/// instructions (urem and srem). It is called by the visitors to those integer
-/// remainder instructions.
-/// Common integer remainder transforms
+ Value *V = Builder.CreateBinaryIntrinsic(
+ Intrinsic::copysign, ConstantFP::get(I.getType(), 1.0), X, &I);
+ return replaceInstUsesWith(I, V);
+ }
+ return nullptr;
+}
+
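An assumed sketch of the X / fabs(X) fold above; the values are finite and nonzero, matching the no-NaNs/no-infs preconditions. The sin/cos to tan and X / (X * Y) rewrites above are only done under reassociation flags because they are not exact in general.

#include <cassert>
#include <cmath>

int main() {
  for (double x : {2.5, -2.5, 1e-10, -1e300}) {
    // X / fabs(X) and fabs(X) / X --> copysign(1.0, X)
    assert(x / std::fabs(x) == std::copysign(1.0, x));
    assert(std::fabs(x) / x == std::copysign(1.0, x));
  }
  return 0;
}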
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// Common integer remainder transforms
Instruction *InstCombinerImpl::commonIRemTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-
- // The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
- return replaceOperand(I, 1, V);
-
- // Handle cases involving: rem X, (select Cond, Y, Z)
- if (simplifyDivRemOfSelectWithZeroOp(I))
- return &I;
-
- if (isa<Constant>(Op1)) {
- if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
- } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
- const APInt *Op1Int;
- if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
- (I.getOpcode() == Instruction::URem ||
- !Op1Int->isMinSignedValue())) {
- // foldOpIntoPhi will speculate instructions to the end of the PHI's
- // predecessor blocks, so do this only if we know the srem or urem
- // will not fault.
- if (Instruction *NV = foldOpIntoPhi(I, PN))
- return NV;
- }
- }
-
- // See if we can fold away this rem instruction.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
- }
- }
-
- return nullptr;
-}
-
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+ // The RHS is known non-zero.
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
+
+ // Handle cases involving: rem X, (select Cond, Y, Z)
+ if (simplifyDivRemOfSelectWithZeroOp(I))
+ return &I;
+
+ if (isa<Constant>(Op1)) {
+ if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+ } else if (auto *PN = dyn_cast<PHINode>(Op0I)) {
+ const APInt *Op1Int;
+ if (match(Op1, m_APInt(Op1Int)) && !Op1Int->isMinValue() &&
+ (I.getOpcode() == Instruction::URem ||
+ !Op1Int->isMinSignedValue())) {
+ // foldOpIntoPhi will speculate instructions to the end of the PHI's
+ // predecessor blocks, so do this only if we know the srem or urem
+ // will not fault.
+ if (Instruction *NV = foldOpIntoPhi(I, PN))
+ return NV;
+ }
+ }
+
+ // See if we can fold away this rem instruction.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+ }
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitURem(BinaryOperator &I) {
- if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *common = commonIRemTransforms(I))
- return common;
-
- if (Instruction *NarrowRem = narrowUDivURem(I, Builder))
- return NarrowRem;
-
- // X urem Y -> X and Y-1, where Y is a power of 2,
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
- // This may increase instruction count, we don't enforce that Y is a
- // constant.
- Constant *N1 = Constant::getAllOnesValue(Ty);
- Value *Add = Builder.CreateAdd(Op1, N1);
- return BinaryOperator::CreateAnd(Op0, Add);
- }
-
- // 1 urem X -> zext(X != 1)
- if (match(Op0, m_One())) {
- Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1));
- return CastInst::CreateZExtOrBitCast(Cmp, Ty);
- }
-
- // X urem C -> X < C ? X : X - C, where C >= signbit.
- if (match(Op1, m_Negative())) {
- Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
- Value *Sub = Builder.CreateSub(Op0, Op1);
- return SelectInst::Create(Cmp, Op0, Sub);
- }
-
- // If the divisor is a sext of a boolean, then the divisor must be max
- // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
- // max unsigned value. In that case, the remainder is 0:
- // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
- Value *X;
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
- return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
- }
-
- return nullptr;
-}
-
+ if (Value *V = SimplifyURemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *common = commonIRemTransforms(I))
+ return common;
+
+ if (Instruction *NarrowRem = narrowUDivURem(I, Builder))
+ return NarrowRem;
+
+ // X urem Y -> X and Y-1, where Y is a power of 2,
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/ true, 0, &I)) {
+ // This may increase instruction count, we don't enforce that Y is a
+ // constant.
+ Constant *N1 = Constant::getAllOnesValue(Ty);
+ Value *Add = Builder.CreateAdd(Op1, N1);
+ return BinaryOperator::CreateAnd(Op0, Add);
+ }
+
+ // 1 urem X -> zext(X != 1)
+ if (match(Op0, m_One())) {
+ Value *Cmp = Builder.CreateICmpNE(Op1, ConstantInt::get(Ty, 1));
+ return CastInst::CreateZExtOrBitCast(Cmp, Ty);
+ }
+
+ // X urem C -> X < C ? X : X - C, where C >= signbit.
+ if (match(Op1, m_Negative())) {
+ Value *Cmp = Builder.CreateICmpULT(Op0, Op1);
+ Value *Sub = Builder.CreateSub(Op0, Op1);
+ return SelectInst::Create(Cmp, Op0, Sub);
+ }
+
+ // If the divisor is a sext of a boolean, then the divisor must be max
+ // unsigned value (-1). Therefore, the remainder is Op0 unless Op0 is also
+ // max unsigned value. In that case, the remainder is 0:
+ // urem Op0, (sext i1 X) --> (Op0 == -1) ? 0 : Op0
+ Value *X;
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ Value *Cmp = Builder.CreateICmpEQ(Op0, ConstantInt::getAllOnesValue(Ty));
+ return SelectInst::Create(Cmp, ConstantInt::getNullValue(Ty), Op0);
+ }
+
+ return nullptr;
+}
+
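An assumed, standalone check of the visitURem folds above: masking for power-of-two divisors, the 1 urem X compare, and the compare-and-subtract form for divisors with the sign bit set (the quotient there is at most 1).

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t big = 0x80000003u; // divisor with the sign bit set
  for (uint32_t x : {0u, 1u, 7u, 8u, 1000u, 0x80000002u, 0xFFFFFFFFu}) {
    assert(x % 8u == (x & 7u));                 // X urem 2^k --> X & (2^k - 1)
    assert(x % big == (x < big ? x : x - big)); // X urem C, C >= signbit
    if (x != 0)
      assert(1u % x == (x != 1u ? 1u : 0u));    // 1 urem X --> zext(X != 1)
  }
  return 0;
}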
Instruction *InstCombinerImpl::visitSRem(BinaryOperator &I) {
- if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- // Handle the integer rem common cases
- if (Instruction *Common = commonIRemTransforms(I))
- return Common;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- {
- const APInt *Y;
- // X % -Y -> X % Y
- if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
- return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
- }
-
- // -X srem Y --> -(X srem Y)
- Value *X, *Y;
- if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
+ if (Value *V = SimplifySRemInst(I.getOperand(0), I.getOperand(1),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ // Handle the integer rem common cases
+ if (Instruction *Common = commonIRemTransforms(I))
+ return Common;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ {
+ const APInt *Y;
+ // X % -Y -> X % Y
+ if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
+ return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
+ }
+
+ // -X srem Y --> -(X srem Y)
+ Value *X, *Y;
+ if (match(&I, m_SRem(m_OneUse(m_NSWSub(m_Zero(), m_Value(X))), m_Value(Y))))
return BinaryOperator::CreateNSWNeg(Builder.CreateSRem(X, Y));
-
- // If the sign bits of both operands are zero (i.e. we can prove they are
- // unsigned inputs), turn this into a urem.
- APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
- if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
- MaskedValueIsZero(Op0, Mask, 0, &I)) {
- // X srem Y -> X urem Y, iff X and Y don't have sign bit set
- return BinaryOperator::CreateURem(Op0, Op1, I.getName());
- }
-
- // If it's a constant vector, flip any negative values positive.
- if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
- Constant *C = cast<Constant>(Op1);
+
+ // If the sign bits of both operands are zero (i.e. we can prove they are
+ // unsigned inputs), turn this into a urem.
+ APInt Mask(APInt::getSignMask(I.getType()->getScalarSizeInBits()));
+ if (MaskedValueIsZero(Op1, Mask, 0, &I) &&
+ MaskedValueIsZero(Op0, Mask, 0, &I)) {
+ // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+ return BinaryOperator::CreateURem(Op0, Op1, I.getName());
+ }
+
+ // If it's a constant vector, flip any negative values positive.
+ if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
+ Constant *C = cast<Constant>(Op1);
unsigned VWidth = cast<FixedVectorType>(C->getType())->getNumElements();
-
- bool hasNegative = false;
- bool hasMissing = false;
- for (unsigned i = 0; i != VWidth; ++i) {
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt) {
- hasMissing = true;
- break;
- }
-
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
- if (RHS->isNegative())
- hasNegative = true;
- }
-
- if (hasNegative && !hasMissing) {
- SmallVector<Constant *, 16> Elts(VWidth);
- for (unsigned i = 0; i != VWidth; ++i) {
- Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
- if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
- if (RHS->isNegative())
- Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
- }
- }
-
- Constant *NewRHSV = ConstantVector::get(Elts);
- if (NewRHSV != C) // Don't loop on -MININT
- return replaceOperand(I, 1, NewRHSV);
- }
- }
-
- return nullptr;
-}
-
+
+ bool hasNegative = false;
+ bool hasMissing = false;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) {
+ hasMissing = true;
+ break;
+ }
+
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elt))
+ if (RHS->isNegative())
+ hasNegative = true;
+ }
+
+ if (hasNegative && !hasMissing) {
+ SmallVector<Constant *, 16> Elts(VWidth);
+ for (unsigned i = 0; i != VWidth; ++i) {
+ Elts[i] = C->getAggregateElement(i); // Handle undef, etc.
+ if (ConstantInt *RHS = dyn_cast<ConstantInt>(Elts[i])) {
+ if (RHS->isNegative())
+ Elts[i] = cast<ConstantInt>(ConstantExpr::getNeg(RHS));
+ }
+ }
+
+ Constant *NewRHSV = ConstantVector::get(Elts);
+ if (NewRHSV != C) // Don't loop on -MININT
+ return replaceOperand(I, 1, NewRHSV);
+ }
+ }
+
+ return nullptr;
+}
+
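A short assumed check of the X % -Y --> X % Y rewrite above: truncating signed remainder takes its sign from the dividend only, so negating the divisor never changes the result (the INT_MIN divisor is excluded, as in the code).

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t x : {7, -7, 0, 100, -100})
    for (int32_t y : {3, 5, 16})
      assert(x % -y == x % y);
  return 0;
}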
Instruction *InstCombinerImpl::visitFRem(BinaryOperator &I) {
- if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
- I.getFastMathFlags(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- return nullptr;
-}
+ if (Value *V = SimplifyFRemInst(I.getOperand(0), I.getOperand(1),
+ I.getFastMathFlags(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
index c6d3604de8..7718c8b0ee 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -1,120 +1,120 @@
-//===- InstCombineNegator.cpp -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements sinking of negation into expression trees,
-// as long as that can be done without increasing instruction count.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+//===- InstCombineNegator.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sinking of negation into expression trees,
+// as long as that can be done without increasing instruction count.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <cassert>
#include <cstdint>
-#include <functional>
-#include <tuple>
-#include <type_traits>
-#include <utility>
-
-namespace llvm {
-class AssumptionCache;
-class DataLayout;
-class DominatorTree;
-class LLVMContext;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instcombine"
-
-STATISTIC(NegatorTotalNegationsAttempted,
- "Negator: Number of negations attempted to be sinked");
-STATISTIC(NegatorNumTreesNegated,
- "Negator: Number of negations successfully sinked");
-STATISTIC(NegatorMaxDepthVisited, "Negator: Maximal traversal depth ever "
- "reached while attempting to sink negation");
-STATISTIC(NegatorTimesDepthLimitReached,
- "Negator: How many times did the traversal depth limit was reached "
- "during sinking");
-STATISTIC(
- NegatorNumValuesVisited,
- "Negator: Total number of values visited during attempts to sink negation");
-STATISTIC(NegatorNumNegationsFoundInCache,
- "Negator: How many negations did we retrieve/reuse from cache");
-STATISTIC(NegatorMaxTotalValuesVisited,
- "Negator: Maximal number of values ever visited while attempting to "
- "sink negation");
-STATISTIC(NegatorNumInstructionsCreatedTotal,
- "Negator: Number of new negated instructions created, total");
-STATISTIC(NegatorMaxInstructionsCreated,
- "Negator: Maximal number of new instructions created during negation "
- "attempt");
-STATISTIC(NegatorNumInstructionsNegatedSuccess,
- "Negator: Number of new negated instructions created in successful "
- "negation sinking attempts");
-
-DEBUG_COUNTER(NegatorCounter, "instcombine-negator",
- "Controls Negator transformations in InstCombine pass");
-
-static cl::opt<bool>
- NegatorEnabled("instcombine-negator-enabled", cl::init(true),
- cl::desc("Should we attempt to sink negations?"));
-
-static cl::opt<unsigned>
- NegatorMaxDepth("instcombine-negator-max-depth",
- cl::init(NegatorDefaultMaxDepth),
- cl::desc("What is the maximal lookup depth when trying to "
- "check for viability of negation sinking."));
-
-Negator::Negator(LLVMContext &C, const DataLayout &DL_, AssumptionCache &AC_,
- const DominatorTree &DT_, bool IsTrulyNegation_)
- : Builder(C, TargetFolder(DL_),
- IRBuilderCallbackInserter([&](Instruction *I) {
- ++NegatorNumInstructionsCreatedTotal;
- NewInstructions.push_back(I);
- })),
- DL(DL_), AC(AC_), DT(DT_), IsTrulyNegation(IsTrulyNegation_) {}
-
-#if LLVM_ENABLE_STATS
-Negator::~Negator() {
- NegatorMaxTotalValuesVisited.updateMax(NumValuesVisitedInThisNegator);
-}
-#endif
-
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace llvm {
+class AssumptionCache;
+class DataLayout;
+class DominatorTree;
+class LLVMContext;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NegatorTotalNegationsAttempted,
+ "Negator: Number of negations attempted to be sinked");
+STATISTIC(NegatorNumTreesNegated,
+ "Negator: Number of negations successfully sinked");
+STATISTIC(NegatorMaxDepthVisited, "Negator: Maximal traversal depth ever "
+ "reached while attempting to sink negation");
+STATISTIC(NegatorTimesDepthLimitReached,
+ "Negator: How many times did the traversal depth limit was reached "
+ "during sinking");
+STATISTIC(
+ NegatorNumValuesVisited,
+ "Negator: Total number of values visited during attempts to sink negation");
+STATISTIC(NegatorNumNegationsFoundInCache,
+ "Negator: How many negations did we retrieve/reuse from cache");
+STATISTIC(NegatorMaxTotalValuesVisited,
+ "Negator: Maximal number of values ever visited while attempting to "
+ "sink negation");
+STATISTIC(NegatorNumInstructionsCreatedTotal,
+ "Negator: Number of new negated instructions created, total");
+STATISTIC(NegatorMaxInstructionsCreated,
+ "Negator: Maximal number of new instructions created during negation "
+ "attempt");
+STATISTIC(NegatorNumInstructionsNegatedSuccess,
+ "Negator: Number of new negated instructions created in successful "
+ "negation sinking attempts");
+
+DEBUG_COUNTER(NegatorCounter, "instcombine-negator",
+ "Controls Negator transformations in InstCombine pass");
+
+static cl::opt<bool>
+ NegatorEnabled("instcombine-negator-enabled", cl::init(true),
+ cl::desc("Should we attempt to sink negations?"));
+
+static cl::opt<unsigned>
+ NegatorMaxDepth("instcombine-negator-max-depth",
+ cl::init(NegatorDefaultMaxDepth),
+ cl::desc("What is the maximal lookup depth when trying to "
+ "check for viability of negation sinking."));
+
+Negator::Negator(LLVMContext &C, const DataLayout &DL_, AssumptionCache &AC_,
+ const DominatorTree &DT_, bool IsTrulyNegation_)
+ : Builder(C, TargetFolder(DL_),
+ IRBuilderCallbackInserter([&](Instruction *I) {
+ ++NegatorNumInstructionsCreatedTotal;
+ NewInstructions.push_back(I);
+ })),
+ DL(DL_), AC(AC_), DT(DT_), IsTrulyNegation(IsTrulyNegation_) {}
+
+#if LLVM_ENABLE_STATS
+Negator::~Negator() {
+ NegatorMaxTotalValuesVisited.updateMax(NumValuesVisitedInThisNegator);
+}
+#endif
+
// Due to the InstCombine's worklist management, there are no guarantees that
// each instruction we'll encounter has been visited by InstCombine already.
// In particular, most importantly for us, that means we have to canonicalize
@@ -128,97 +128,97 @@ std::array<Value *, 2> Negator::getSortedOperandsOfBinOp(Instruction *I) {
return Ops;
}
-// FIXME: can this be reworked into a worklist-based algorithm while preserving
-// the depth-first, early bailout traversal?
-LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
- // -(undef) -> undef.
- if (match(V, m_Undef()))
- return V;
-
- // In i1, negation can simply be ignored.
- if (V->getType()->isIntOrIntVectorTy(1))
- return V;
-
- Value *X;
-
- // -(-(X)) -> X.
- if (match(V, m_Neg(m_Value(X))))
- return X;
-
- // Integral constants can be freely negated.
- if (match(V, m_AnyIntegralConstant()))
- return ConstantExpr::getNeg(cast<Constant>(V), /*HasNUW=*/false,
- /*HasNSW=*/false);
-
- // If we have a non-instruction, then give up.
- if (!isa<Instruction>(V))
- return nullptr;
-
- // If we have started with a true negation (i.e. `sub 0, %y`), then if we've
- // got instruction that does not require recursive reasoning, we can still
- // negate it even if it has other uses, without increasing instruction count.
- if (!V->hasOneUse() && !IsTrulyNegation)
- return nullptr;
-
- auto *I = cast<Instruction>(V);
- unsigned BitWidth = I->getType()->getScalarSizeInBits();
-
- // We must preserve the insertion point and debug info that is set in the
- // builder at the time this function is called.
- InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
- // And since we are trying to negate instruction I, that tells us about the
- // insertion point and the debug info that we need to keep.
- Builder.SetInsertPoint(I);
-
- // In some cases we can give the answer without further recursion.
- switch (I->getOpcode()) {
+// FIXME: can this be reworked into a worklist-based algorithm while preserving
+// the depth-first, early bailout traversal?
+LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
+ // -(undef) -> undef.
+ if (match(V, m_Undef()))
+ return V;
+
+ // In i1, negation can simply be ignored.
+ if (V->getType()->isIntOrIntVectorTy(1))
+ return V;
+
+ Value *X;
+
+ // -(-(X)) -> X.
+ if (match(V, m_Neg(m_Value(X))))
+ return X;
+
+ // Integral constants can be freely negated.
+ if (match(V, m_AnyIntegralConstant()))
+ return ConstantExpr::getNeg(cast<Constant>(V), /*HasNUW=*/false,
+ /*HasNSW=*/false);
+
+ // If we have a non-instruction, then give up.
+ if (!isa<Instruction>(V))
+ return nullptr;
+
+ // If we have started with a true negation (i.e. `sub 0, %y`), then if we've
+ // got instruction that does not require recursive reasoning, we can still
+ // negate it even if it has other uses, without increasing instruction count.
+ if (!V->hasOneUse() && !IsTrulyNegation)
+ return nullptr;
+
+ auto *I = cast<Instruction>(V);
+ unsigned BitWidth = I->getType()->getScalarSizeInBits();
+
+ // We must preserve the insertion point and debug info that is set in the
+ // builder at the time this function is called.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
+ // And since we are trying to negate instruction I, that tells us about the
+ // insertion point and the debug info that we need to keep.
+ Builder.SetInsertPoint(I);
+
+ // In some cases we can give the answer without further recursion.
+ switch (I->getOpcode()) {
case Instruction::Add: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `inc` is always negatible.
+ // `inc` is always negatible.
if (match(Ops[1], m_One()))
return Builder.CreateNot(Ops[0], I->getName() + ".neg");
- break;
+ break;
}
- case Instruction::Xor:
- // `not` is always negatible.
- if (match(I, m_Not(m_Value(X))))
- return Builder.CreateAdd(X, ConstantInt::get(X->getType(), 1),
- I->getName() + ".neg");
- break;
- case Instruction::AShr:
- case Instruction::LShr: {
- // Right-shift sign bit smear is negatible.
- const APInt *Op1Val;
- if (match(I->getOperand(1), m_APInt(Op1Val)) && *Op1Val == BitWidth - 1) {
- Value *BO = I->getOpcode() == Instruction::AShr
- ? Builder.CreateLShr(I->getOperand(0), I->getOperand(1))
- : Builder.CreateAShr(I->getOperand(0), I->getOperand(1));
- if (auto *NewInstr = dyn_cast<Instruction>(BO)) {
- NewInstr->copyIRFlags(I);
- NewInstr->setName(I->getName() + ".neg");
- }
- return BO;
- }
+ case Instruction::Xor:
+ // `not` is always negatible.
+ if (match(I, m_Not(m_Value(X))))
+ return Builder.CreateAdd(X, ConstantInt::get(X->getType(), 1),
+ I->getName() + ".neg");
+ break;
+ case Instruction::AShr:
+ case Instruction::LShr: {
+ // Right-shift sign bit smear is negatible.
+ const APInt *Op1Val;
+ if (match(I->getOperand(1), m_APInt(Op1Val)) && *Op1Val == BitWidth - 1) {
+ Value *BO = I->getOpcode() == Instruction::AShr
+ ? Builder.CreateLShr(I->getOperand(0), I->getOperand(1))
+ : Builder.CreateAShr(I->getOperand(0), I->getOperand(1));
+ if (auto *NewInstr = dyn_cast<Instruction>(BO)) {
+ NewInstr->copyIRFlags(I);
+ NewInstr->setName(I->getName() + ".neg");
+ }
+ return BO;
+ }
// While we could negate exact arithmetic shift:
// ashr exact %x, C --> sdiv exact i8 %x, -1<<C
// iff C != 0 and C u< bitwidth(%x), we don't want to,
// because division is *THAT* much worse than a shift.
- break;
- }
- case Instruction::SExt:
- case Instruction::ZExt:
- // `*ext` of i1 is always negatible
- if (I->getOperand(0)->getType()->isIntOrIntVectorTy(1))
- return I->getOpcode() == Instruction::SExt
- ? Builder.CreateZExt(I->getOperand(0), I->getType(),
- I->getName() + ".neg")
- : Builder.CreateSExt(I->getOperand(0), I->getType(),
- I->getName() + ".neg");
- break;
- default:
- break; // Other instructions require recursive reasoning.
- }
-
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ // `*ext` of i1 is always negatible
+ if (I->getOperand(0)->getType()->isIntOrIntVectorTy(1))
+ return I->getOpcode() == Instruction::SExt
+ ? Builder.CreateZExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg")
+ : Builder.CreateSExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg");
+ break;
+ default:
+ break; // Other instructions require recursive reasoning.
+ }
+
if (I->getOpcode() == Instruction::Sub &&
(I->hasOneUse() || match(I->getOperand(0), m_ImmConstant()))) {
// `sub` is always negatible.
@@ -228,39 +228,39 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
I->getName() + ".neg");
}
- // Some other cases, while still not requiring recursion,
- // are restricted to the one-use case.
- if (!V->hasOneUse())
- return nullptr;
-
- switch (I->getOpcode()) {
- case Instruction::SDiv:
- // `sdiv` is negatible if divisor is not undef/INT_MIN/1.
- // While this is normally not behind a use-check,
- // let's consider division to be special since it's costly.
- if (auto *Op1C = dyn_cast<Constant>(I->getOperand(1))) {
+ // Some other cases, while still not requiring recursion,
+ // are restricted to the one-use case.
+ if (!V->hasOneUse())
+ return nullptr;
+
+ switch (I->getOpcode()) {
+ case Instruction::SDiv:
+ // `sdiv` is negatible if divisor is not undef/INT_MIN/1.
+ // While this is normally not behind a use-check,
+ // let's consider division to be special since it's costly.
+ if (auto *Op1C = dyn_cast<Constant>(I->getOperand(1))) {
if (!Op1C->containsUndefOrPoisonElement() &&
Op1C->isNotMinSignedValue() && Op1C->isNotOneValue()) {
- Value *BO =
- Builder.CreateSDiv(I->getOperand(0), ConstantExpr::getNeg(Op1C),
- I->getName() + ".neg");
- if (auto *NewInstr = dyn_cast<Instruction>(BO))
- NewInstr->setIsExact(I->isExact());
- return BO;
- }
- }
- break;
- }
-
- // Rest of the logic is recursive, so if it's time to give up then it's time.
- if (Depth > NegatorMaxDepth) {
- LLVM_DEBUG(dbgs() << "Negator: reached maximal allowed traversal depth in "
- << *V << ". Giving up.\n");
- ++NegatorTimesDepthLimitReached;
- return nullptr;
- }
-
- switch (I->getOpcode()) {
+ Value *BO =
+ Builder.CreateSDiv(I->getOperand(0), ConstantExpr::getNeg(Op1C),
+ I->getName() + ".neg");
+ if (auto *NewInstr = dyn_cast<Instruction>(BO))
+ NewInstr->setIsExact(I->isExact());
+ return BO;
+ }
+ }
+ break;
+ }
+
+ // Rest of the logic is recursive, so if it's time to give up then it's time.
+ if (Depth > NegatorMaxDepth) {
+ LLVM_DEBUG(dbgs() << "Negator: reached maximal allowed traversal depth in "
+ << *V << ". Giving up.\n");
+ ++NegatorTimesDepthLimitReached;
+ return nullptr;
+ }
+
+ switch (I->getOpcode()) {
case Instruction::Freeze: {
// `freeze` is negatible if its operand is negatible.
Value *NegOp = negate(I->getOperand(0), Depth + 1);
@@ -268,23 +268,23 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
return nullptr;
return Builder.CreateFreeze(NegOp, I->getName() + ".neg");
}
- case Instruction::PHI: {
- // `phi` is negatible if all the incoming values are negatible.
- auto *PHI = cast<PHINode>(I);
- SmallVector<Value *, 4> NegatedIncomingValues(PHI->getNumOperands());
- for (auto I : zip(PHI->incoming_values(), NegatedIncomingValues)) {
- if (!(std::get<1>(I) =
- negate(std::get<0>(I), Depth + 1))) // Early return.
- return nullptr;
- }
- // All incoming values are indeed negatible. Create negated PHI node.
- PHINode *NegatedPHI = Builder.CreatePHI(
- PHI->getType(), PHI->getNumOperands(), PHI->getName() + ".neg");
- for (auto I : zip(NegatedIncomingValues, PHI->blocks()))
- NegatedPHI->addIncoming(std::get<0>(I), std::get<1>(I));
- return NegatedPHI;
- }
- case Instruction::Select: {
+ case Instruction::PHI: {
+ // `phi` is negatible if all the incoming values are negatible.
+ auto *PHI = cast<PHINode>(I);
+ SmallVector<Value *, 4> NegatedIncomingValues(PHI->getNumOperands());
+ for (auto I : zip(PHI->incoming_values(), NegatedIncomingValues)) {
+ if (!(std::get<1>(I) =
+ negate(std::get<0>(I), Depth + 1))) // Early return.
+ return nullptr;
+ }
+ // All incoming values are indeed negatible. Create negated PHI node.
+ PHINode *NegatedPHI = Builder.CreatePHI(
+ PHI->getType(), PHI->getNumOperands(), PHI->getName() + ".neg");
+ for (auto I : zip(NegatedIncomingValues, PHI->blocks()))
+ NegatedPHI->addIncoming(std::get<0>(I), std::get<1>(I));
+ return NegatedPHI;
+ }
+ case Instruction::Select: {
if (isKnownNegation(I->getOperand(1), I->getOperand(2))) {
    // If one hand of the select is known to be the negation of the other hand,
// just swap the hands around.
@@ -295,86 +295,86 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
NewSelect->setName(I->getName() + ".neg");
Builder.Insert(NewSelect);
return NewSelect;
- }
- // `select` is negatible if both hands of `select` are negatible.
- Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
- if (!NegOp1) // Early return.
- return nullptr;
- Value *NegOp2 = negate(I->getOperand(2), Depth + 1);
- if (!NegOp2)
- return nullptr;
- // Do preserve the metadata!
- return Builder.CreateSelect(I->getOperand(0), NegOp1, NegOp2,
- I->getName() + ".neg", /*MDFrom=*/I);
- }
- case Instruction::ShuffleVector: {
- // `shufflevector` is negatible if both operands are negatible.
- auto *Shuf = cast<ShuffleVectorInst>(I);
- Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
- if (!NegOp0) // Early return.
- return nullptr;
- Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
- if (!NegOp1)
- return nullptr;
- return Builder.CreateShuffleVector(NegOp0, NegOp1, Shuf->getShuffleMask(),
- I->getName() + ".neg");
- }
- case Instruction::ExtractElement: {
- // `extractelement` is negatible if source operand is negatible.
- auto *EEI = cast<ExtractElementInst>(I);
- Value *NegVector = negate(EEI->getVectorOperand(), Depth + 1);
- if (!NegVector) // Early return.
- return nullptr;
- return Builder.CreateExtractElement(NegVector, EEI->getIndexOperand(),
- I->getName() + ".neg");
- }
- case Instruction::InsertElement: {
- // `insertelement` is negatible if both the source vector and
- // element-to-be-inserted are negatible.
- auto *IEI = cast<InsertElementInst>(I);
- Value *NegVector = negate(IEI->getOperand(0), Depth + 1);
- if (!NegVector) // Early return.
- return nullptr;
- Value *NegNewElt = negate(IEI->getOperand(1), Depth + 1);
- if (!NegNewElt) // Early return.
- return nullptr;
- return Builder.CreateInsertElement(NegVector, NegNewElt, IEI->getOperand(2),
- I->getName() + ".neg");
- }
- case Instruction::Trunc: {
- // `trunc` is negatible if its operand is negatible.
- Value *NegOp = negate(I->getOperand(0), Depth + 1);
- if (!NegOp) // Early return.
- return nullptr;
- return Builder.CreateTrunc(NegOp, I->getType(), I->getName() + ".neg");
- }
- case Instruction::Shl: {
- // `shl` is negatible if the first operand is negatible.
+ }
+ // `select` is negatible if both hands of `select` are negatible.
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1) // Early return.
+ return nullptr;
+ Value *NegOp2 = negate(I->getOperand(2), Depth + 1);
+ if (!NegOp2)
+ return nullptr;
+ // Do preserve the metadata!
+ return Builder.CreateSelect(I->getOperand(0), NegOp1, NegOp2,
+ I->getName() + ".neg", /*MDFrom=*/I);
+ }
+ case Instruction::ShuffleVector: {
+ // `shufflevector` is negatible if both operands are negatible.
+ auto *Shuf = cast<ShuffleVectorInst>(I);
+ Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp0) // Early return.
+ return nullptr;
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1)
+ return nullptr;
+ return Builder.CreateShuffleVector(NegOp0, NegOp1, Shuf->getShuffleMask(),
+ I->getName() + ".neg");
+ }
+ case Instruction::ExtractElement: {
+ // `extractelement` is negatible if source operand is negatible.
+ auto *EEI = cast<ExtractElementInst>(I);
+ Value *NegVector = negate(EEI->getVectorOperand(), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ return Builder.CreateExtractElement(NegVector, EEI->getIndexOperand(),
+ I->getName() + ".neg");
+ }
+ case Instruction::InsertElement: {
+ // `insertelement` is negatible if both the source vector and
+ // element-to-be-inserted are negatible.
+ auto *IEI = cast<InsertElementInst>(I);
+ Value *NegVector = negate(IEI->getOperand(0), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ Value *NegNewElt = negate(IEI->getOperand(1), Depth + 1);
+ if (!NegNewElt) // Early return.
+ return nullptr;
+ return Builder.CreateInsertElement(NegVector, NegNewElt, IEI->getOperand(2),
+ I->getName() + ".neg");
+ }
+ case Instruction::Trunc: {
+ // `trunc` is negatible if its operand is negatible.
+ Value *NegOp = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp) // Early return.
+ return nullptr;
+ return Builder.CreateTrunc(NegOp, I->getType(), I->getName() + ".neg");
+ }
+ case Instruction::Shl: {
+ // `shl` is negatible if the first operand is negatible.
if (Value *NegOp0 = negate(I->getOperand(0), Depth + 1))
return Builder.CreateShl(NegOp0, I->getOperand(1), I->getName() + ".neg");
// Otherwise, `shl %x, C` can be interpreted as `mul %x, 1<<C`.
auto *Op1C = dyn_cast<Constant>(I->getOperand(1));
if (!Op1C) // Early return.
- return nullptr;
+ return nullptr;
return Builder.CreateMul(
I->getOperand(0),
ConstantExpr::getShl(Constant::getAllOnesValue(Op1C->getType()), Op1C),
I->getName() + ".neg");
- }
+ }
case Instruction::Or: {
- if (!haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL, &AC, I,
- &DT))
- return nullptr; // Don't know how to handle `or` in general.
+ if (!haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL, &AC, I,
+ &DT))
+ return nullptr; // Don't know how to handle `or` in general.
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `or`/`add` are interchangeable when operands have no common bits set.
- // `inc` is always negatible.
+ // `or`/`add` are interchangeable when operands have no common bits set.
+ // `inc` is always negatible.
if (match(Ops[1], m_One()))
return Builder.CreateNot(Ops[0], I->getName() + ".neg");
- // Else, just defer to Instruction::Add handling.
- LLVM_FALLTHROUGH;
+ // Else, just defer to Instruction::Add handling.
+ LLVM_FALLTHROUGH;
}
- case Instruction::Add: {
- // `add` is negatible if both of its operands are negatible.
+ case Instruction::Add: {
+ // `add` is negatible if both of its operands are negatible.
SmallVector<Value *, 2> NegatedOps, NonNegatedOps;
for (Value *Op : I->operands()) {
// Can we sink the negation into this operand?
@@ -397,135 +397,135 @@ LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
assert(IsTrulyNegation && "We should have early-exited then.");
// Completely failed to sink negation?
if (NonNegatedOps.size() == 2)
- return nullptr;
+ return nullptr;
// 0-(a+b) --> (-a)-b
return Builder.CreateSub(NegatedOps[0], NonNegatedOps[0],
I->getName() + ".neg");
- }
+ }
case Instruction::Xor: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `xor` is negatible if one of its operands is invertible.
- // FIXME: InstCombineInverter? But how to connect Inverter and Negator?
+ // `xor` is negatible if one of its operands is invertible.
+ // FIXME: InstCombineInverter? But how to connect Inverter and Negator?
if (auto *C = dyn_cast<Constant>(Ops[1])) {
Value *Xor = Builder.CreateXor(Ops[0], ConstantExpr::getNot(C));
- return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1),
- I->getName() + ".neg");
- }
- return nullptr;
+ return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1),
+ I->getName() + ".neg");
+ }
+ return nullptr;
}
- case Instruction::Mul: {
+ case Instruction::Mul: {
std::array<Value *, 2> Ops = getSortedOperandsOfBinOp(I);
- // `mul` is negatible if one of its operands is negatible.
- Value *NegatedOp, *OtherOp;
- // First try the second operand: in case it's a constant, it will be best to
- // just invert it instead of sinking the `neg` deeper.
+ // `mul` is negatible if one of its operands is negatible.
+ Value *NegatedOp, *OtherOp;
+ // First try the second operand: in case it's a constant, it will be best to
+ // just invert it instead of sinking the `neg` deeper.
if (Value *NegOp1 = negate(Ops[1], Depth + 1)) {
- NegatedOp = NegOp1;
+ NegatedOp = NegOp1;
OtherOp = Ops[0];
} else if (Value *NegOp0 = negate(Ops[0], Depth + 1)) {
- NegatedOp = NegOp0;
+ NegatedOp = NegOp0;
OtherOp = Ops[1];
- } else
- // Can't negate either of them.
- return nullptr;
- return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg");
- }
- default:
- return nullptr; // Don't know, likely not negatible for free.
- }
-
- llvm_unreachable("Can't get here. We always return from switch.");
-}
-
-LLVM_NODISCARD Value *Negator::negate(Value *V, unsigned Depth) {
- NegatorMaxDepthVisited.updateMax(Depth);
- ++NegatorNumValuesVisited;
-
-#if LLVM_ENABLE_STATS
- ++NumValuesVisitedInThisNegator;
-#endif
-
-#ifndef NDEBUG
- // We can't ever have a Value with such an address.
- Value *Placeholder = reinterpret_cast<Value *>(static_cast<uintptr_t>(-1));
-#endif
-
- // Did we already try to negate this value?
- auto NegationsCacheIterator = NegationsCache.find(V);
- if (NegationsCacheIterator != NegationsCache.end()) {
- ++NegatorNumNegationsFoundInCache;
- Value *NegatedV = NegationsCacheIterator->second;
- assert(NegatedV != Placeholder && "Encountered a cycle during negation.");
- return NegatedV;
- }
-
-#ifndef NDEBUG
- // We did not find a cached result for negation of V. While there,
- // let's temporarily cache a placeholder value, with the idea that if later
- // during negation we fetch it from cache, we'll know we're in a cycle.
- NegationsCache[V] = Placeholder;
-#endif
-
- // No luck. Try negating it for real.
- Value *NegatedV = visitImpl(V, Depth);
- // And cache the (real) result for the future.
- NegationsCache[V] = NegatedV;
-
- return NegatedV;
-}
-
-LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
- Value *Negated = negate(Root, /*Depth=*/0);
- if (!Negated) {
- // We must clean up newly-inserted instructions, to avoid any potential
- // endless combine looping.
- llvm::for_each(llvm::reverse(NewInstructions),
- [&](Instruction *I) { I->eraseFromParent(); });
- return llvm::None;
- }
- return std::make_pair(ArrayRef<Instruction *>(NewInstructions), Negated);
-}
-
-LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
+ } else
+ // Can't negate either of them.
+ return nullptr;
+ return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg");
+ }
+ default:
+ return nullptr; // Don't know, likely not negatible for free.
+ }
+
+ llvm_unreachable("Can't get here. We always return from switch.");
+}
+
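The case analysis above rests on a handful of two's-complement identities. As a minimal standalone sketch (plain C++ on int32_t values standing in for IR values; not part of LLVM), they can be checked directly:

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Vals[] = {0, 1, -1, 42, -42, 123456789, -123456789};
  const int32_t C = 0x55;
  for (int32_t x : Vals) {
    assert(-(~x) == x + 1);                       // `not` is always negatible.
    assert(-(x ^ C) == (x ^ ~C) + 1);             // xor with a constant.
    // Right-shift sign-bit smear: negate by flipping ashr <-> lshr. (Arithmetic
    // shift of a negative signed value is only guaranteed since C++20.)
    assert(-(x >> 31) == (int32_t)((uint32_t)x >> 31));
    for (int32_t y : Vals) {
      assert(-(x - y) == y - x);                  // `sub` negates by swapping.
      assert(0 - (x + y) == (-x) - y);            // 0-(a+b) --> (-a)-b.
      if ((x & y) == 0)
        assert((x | y) == x + y);                 // `or` acts as `add` here.
    }
  }
  return 0;
}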
+LLVM_NODISCARD Value *Negator::negate(Value *V, unsigned Depth) {
+ NegatorMaxDepthVisited.updateMax(Depth);
+ ++NegatorNumValuesVisited;
+
+#if LLVM_ENABLE_STATS
+ ++NumValuesVisitedInThisNegator;
+#endif
+
+#ifndef NDEBUG
+ // We can't ever have a Value with such an address.
+ Value *Placeholder = reinterpret_cast<Value *>(static_cast<uintptr_t>(-1));
+#endif
+
+ // Did we already try to negate this value?
+ auto NegationsCacheIterator = NegationsCache.find(V);
+ if (NegationsCacheIterator != NegationsCache.end()) {
+ ++NegatorNumNegationsFoundInCache;
+ Value *NegatedV = NegationsCacheIterator->second;
+ assert(NegatedV != Placeholder && "Encountered a cycle during negation.");
+ return NegatedV;
+ }
+
+#ifndef NDEBUG
+ // We did not find a cached result for negation of V. While there,
+ // let's temporarily cache a placeholder value, with the idea that if later
+ // during negation we fetch it from cache, we'll know we're in a cycle.
+ NegationsCache[V] = Placeholder;
+#endif
+
+ // No luck. Try negating it for real.
+ Value *NegatedV = visitImpl(V, Depth);
+ // And cache the (real) result for the future.
+ NegationsCache[V] = NegatedV;
+
+ return NegatedV;
+}
+
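The caching in negate() above (memoize both successful negations and failures, and park a sentinel in the cache while a value is being processed so that cycles are detected) can be sketched standalone; Expr is a stand-in for llvm::Value, and the sentinel check is debug-only in the LLVM code:

#include <cassert>
#include <cstdint>
#include <unordered_map>

struct Expr { int id; };                           // stand-in for llvm::Value
using Cache = std::unordered_map<const Expr *, const Expr *>;

// Placeholder for the real traversal; here it always fails (nullptr), which is
// itself worth caching so the same subtree is never re-walked.
static const Expr *negateImpl(const Expr *, unsigned) { return nullptr; }

static const Expr *negateCached(Cache &C, const Expr *E, unsigned Depth) {
  static const Expr *const Sentinel =
      reinterpret_cast<const Expr *>(static_cast<std::uintptr_t>(-1));
  auto It = C.find(E);
  if (It != C.end()) {
    assert(It->second != Sentinel && "cycle while negating");
    return It->second;                             // cached success or failure
  }
  C[E] = Sentinel;                                 // mark "in progress"
  const Expr *Neg = negateImpl(E, Depth);
  C[E] = Neg;                                      // cache the real result
  return Neg;
}

int main() {
  Expr A{0};
  Cache C;
  negateCached(C, &A, 0);
  assert(C.count(&A) == 1 && C[&A] == nullptr);    // the failure was memoized
  return 0;
}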
+LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
+ Value *Negated = negate(Root, /*Depth=*/0);
+ if (!Negated) {
+ // We must clean up newly-inserted instructions, to avoid any potential
+ // endless combine looping.
+ llvm::for_each(llvm::reverse(NewInstructions),
+ [&](Instruction *I) { I->eraseFromParent(); });
+ return llvm::None;
+ }
+ return std::make_pair(ArrayRef<Instruction *>(NewInstructions), Negated);
+}
+
+LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
InstCombinerImpl &IC) {
- ++NegatorTotalNegationsAttempted;
- LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
- << "\n");
-
- if (!NegatorEnabled || !DebugCounter::shouldExecute(NegatorCounter))
- return nullptr;
-
- Negator N(Root->getContext(), IC.getDataLayout(), IC.getAssumptionCache(),
- IC.getDominatorTree(), LHSIsZero);
- Optional<Result> Res = N.run(Root);
- if (!Res) { // Negation failed.
- LLVM_DEBUG(dbgs() << "Negator: failed to sink negation into " << *Root
- << "\n");
- return nullptr;
- }
-
- LLVM_DEBUG(dbgs() << "Negator: successfully sunk negation into " << *Root
- << "\n NEW: " << *Res->second << "\n");
- ++NegatorNumTreesNegated;
-
- // We must temporarily unset the 'current' insertion point and DebugLoc of the
- // InstCombine's IRBuilder so that it won't interfere with the ones we have
- // already specified when producing negated instructions.
- InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
- IC.Builder.ClearInsertionPoint();
- IC.Builder.SetCurrentDebugLocation(DebugLoc());
-
- // And finally, we must add newly-created instructions into the InstCombine's
- // worklist (in a proper order!) so it can attempt to combine them.
- LLVM_DEBUG(dbgs() << "Negator: Propagating " << Res->first.size()
- << " instrs to InstCombine\n");
- NegatorMaxInstructionsCreated.updateMax(Res->first.size());
- NegatorNumInstructionsNegatedSuccess += Res->first.size();
-
- // They are in def-use order, so nothing fancy, just insert them in order.
- llvm::for_each(Res->first,
- [&](Instruction *I) { IC.Builder.Insert(I, I->getName()); });
-
- // And return the new root.
- return Res->second;
-}
+ ++NegatorTotalNegationsAttempted;
+ LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
+ << "\n");
+
+ if (!NegatorEnabled || !DebugCounter::shouldExecute(NegatorCounter))
+ return nullptr;
+
+ Negator N(Root->getContext(), IC.getDataLayout(), IC.getAssumptionCache(),
+ IC.getDominatorTree(), LHSIsZero);
+ Optional<Result> Res = N.run(Root);
+ if (!Res) { // Negation failed.
+ LLVM_DEBUG(dbgs() << "Negator: failed to sink negation into " << *Root
+ << "\n");
+ return nullptr;
+ }
+
+ LLVM_DEBUG(dbgs() << "Negator: successfully sunk negation into " << *Root
+ << "\n NEW: " << *Res->second << "\n");
+ ++NegatorNumTreesNegated;
+
+ // We must temporarily unset the 'current' insertion point and DebugLoc of the
+ // InstCombine's IRBuilder so that it won't interfere with the ones we have
+ // already specified when producing negated instructions.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.ClearInsertionPoint();
+ IC.Builder.SetCurrentDebugLocation(DebugLoc());
+
+ // And finally, we must add newly-created instructions into the InstCombine's
+ // worklist (in a proper order!) so it can attempt to combine them.
+ LLVM_DEBUG(dbgs() << "Negator: Propagating " << Res->first.size()
+ << " instrs to InstCombine\n");
+ NegatorMaxInstructionsCreated.updateMax(Res->first.size());
+ NegatorNumInstructionsNegatedSuccess += Res->first.size();
+
+ // They are in def-use order, so nothing fancy, just insert them in order.
+ llvm::for_each(Res->first,
+ [&](Instruction *I) { IC.Builder.Insert(I, I->getName()); });
+
+ // And return the new root.
+ return Res->second;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
index e4ba78e459..b211b08136 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -1,304 +1,304 @@
-//===- InstCombinePHI.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitPHINode function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+//===- InstCombinePHI.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitPHINode function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-static cl::opt<unsigned>
-MaxNumPhis("instcombine-max-num-phis", cl::init(512),
- cl::desc("Maximum number phis to handle in intptr/ptrint folding"));
-
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+static cl::opt<unsigned>
+MaxNumPhis("instcombine-max-num-phis", cl::init(512),
+ cl::desc("Maximum number phis to handle in intptr/ptrint folding"));
+
STATISTIC(NumPHIsOfInsertValues,
"Number of phi-of-insertvalue turned into insertvalue-of-phis");
STATISTIC(NumPHIsOfExtractValues,
"Number of phi-of-extractvalue turned into extractvalue-of-phi");
STATISTIC(NumPHICSEs, "Number of PHI's that got CSE'd");
-/// The PHI arguments will be folded into a single operation with a PHI node
-/// as input. The debug location of the single operation will be the merged
-/// locations of the original PHI node arguments.
+/// The PHI arguments will be folded into a single operation with a PHI node
+/// as input. The debug location of the single operation will be the merged
+/// locations of the original PHI node arguments.
void InstCombinerImpl::PHIArgMergedDebugLoc(Instruction *Inst, PHINode &PN) {
- auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
- Inst->setDebugLoc(FirstInst->getDebugLoc());
- // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc
- // will be inefficient.
- assert(!isa<CallInst>(Inst));
-
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
- auto *I = cast<Instruction>(PN.getIncomingValue(i));
- Inst->applyMergedLocation(Inst->getDebugLoc(), I->getDebugLoc());
- }
-}
-
-// Replace Integer typed PHI PN if the PHI's value is used as a pointer value.
-// If there is an existing pointer typed PHI that produces the same value as PN,
-// replace PN and the IntToPtr operation with it. Otherwise, synthesize a new
-// PHI node:
-//
-// Case-1:
-// bb1:
-// int_init = PtrToInt(ptr_init)
-// br label %bb2
-// bb2:
-// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ptr_val2 = IntToPtr(int_val)
-// ...
-// use(ptr_val2)
-// ptr_val_inc = ...
-// inc_val_inc = PtrToInt(ptr_val_inc)
-//
-// ==>
-// bb1:
-// br label %bb2
-// bb2:
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ...
-// use(ptr_val)
-// ptr_val_inc = ...
-//
-// Case-2:
-// bb1:
-// int_ptr = BitCast(ptr_ptr)
-// int_init = Load(int_ptr)
-// br label %bb2
-// bb2:
-// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
-// ptr_val2 = IntToPtr(int_val)
-// ...
-// use(ptr_val2)
-// ptr_val_inc = ...
-// inc_val_inc = PtrToInt(ptr_val_inc)
-// ==>
-// bb1:
-// ptr_init = Load(ptr_ptr)
-// br label %bb2
-// bb2:
-// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
-// ...
-// use(ptr_val)
-// ptr_val_inc = ...
-// ...
-//
+ auto *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ Inst->setDebugLoc(FirstInst->getDebugLoc());
+ // We do not expect a CallInst here, otherwise, N-way merging of DebugLoc
+ // will be inefficient.
+ assert(!isa<CallInst>(Inst));
+
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ auto *I = cast<Instruction>(PN.getIncomingValue(i));
+ Inst->applyMergedLocation(Inst->getDebugLoc(), I->getDebugLoc());
+ }
+}
+
+// Replace Integer typed PHI PN if the PHI's value is used as a pointer value.
+// If there is an existing pointer typed PHI that produces the same value as PN,
+// replace PN and the IntToPtr operation with it. Otherwise, synthesize a new
+// PHI node:
+//
+// Case-1:
+// bb1:
+// int_init = PtrToInt(ptr_init)
+// br label %bb2
+// bb2:
+// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ptr_val2 = IntToPtr(int_val)
+// ...
+// use(ptr_val2)
+// ptr_val_inc = ...
+// inc_val_inc = PtrToInt(ptr_val_inc)
+//
+// ==>
+// bb1:
+// br label %bb2
+// bb2:
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ...
+// use(ptr_val)
+// ptr_val_inc = ...
+//
+// Case-2:
+// bb1:
+// int_ptr = BitCast(ptr_ptr)
+// int_init = Load(int_ptr)
+// br label %bb2
+// bb2:
+// int_val = PHI([int_init, %bb1], [int_val_inc, %bb2]
+// ptr_val2 = IntToPtr(int_val)
+// ...
+// use(ptr_val2)
+// ptr_val_inc = ...
+// inc_val_inc = PtrToInt(ptr_val_inc)
+// ==>
+// bb1:
+// ptr_init = Load(ptr_ptr)
+// br label %bb2
+// bb2:
+// ptr_val = PHI([ptr_init, %bb1], [ptr_val_inc, %bb2]
+// ...
+// use(ptr_val)
+// ptr_val_inc = ...
+// ...
+//
Instruction *InstCombinerImpl::foldIntegerTypedPHI(PHINode &PN) {
- if (!PN.getType()->isIntegerTy())
- return nullptr;
- if (!PN.hasOneUse())
- return nullptr;
-
- auto *IntToPtr = dyn_cast<IntToPtrInst>(PN.user_back());
- if (!IntToPtr)
- return nullptr;
-
- // Check if the pointer is actually used as pointer:
- auto HasPointerUse = [](Instruction *IIP) {
- for (User *U : IIP->users()) {
- Value *Ptr = nullptr;
- if (LoadInst *LoadI = dyn_cast<LoadInst>(U)) {
- Ptr = LoadI->getPointerOperand();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- Ptr = SI->getPointerOperand();
- } else if (GetElementPtrInst *GI = dyn_cast<GetElementPtrInst>(U)) {
- Ptr = GI->getPointerOperand();
- }
-
- if (Ptr && Ptr == IIP)
- return true;
- }
- return false;
- };
-
- if (!HasPointerUse(IntToPtr))
- return nullptr;
-
- if (DL.getPointerSizeInBits(IntToPtr->getAddressSpace()) !=
- DL.getTypeSizeInBits(IntToPtr->getOperand(0)->getType()))
- return nullptr;
-
- SmallVector<Value *, 4> AvailablePtrVals;
- for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
- Value *Arg = PN.getIncomingValue(i);
-
- // First look backward:
- if (auto *PI = dyn_cast<PtrToIntInst>(Arg)) {
- AvailablePtrVals.emplace_back(PI->getOperand(0));
- continue;
- }
-
- // Next look forward:
- Value *ArgIntToPtr = nullptr;
- for (User *U : Arg->users()) {
- if (isa<IntToPtrInst>(U) && U->getType() == IntToPtr->getType() &&
- (DT.dominates(cast<Instruction>(U), PN.getIncomingBlock(i)) ||
- cast<Instruction>(U)->getParent() == PN.getIncomingBlock(i))) {
- ArgIntToPtr = U;
- break;
- }
- }
-
- if (ArgIntToPtr) {
- AvailablePtrVals.emplace_back(ArgIntToPtr);
- continue;
- }
-
- // If Arg is defined by a PHI, allow it. This will also create
- // more opportunities iteratively.
- if (isa<PHINode>(Arg)) {
- AvailablePtrVals.emplace_back(Arg);
- continue;
- }
-
- // For a single use integer load:
- auto *LoadI = dyn_cast<LoadInst>(Arg);
- if (!LoadI)
- return nullptr;
-
- if (!LoadI->hasOneUse())
- return nullptr;
-
- // Push the integer typed Load instruction into the available
- // value set, and fix it up later when the pointer typed PHI
- // is synthesized.
- AvailablePtrVals.emplace_back(LoadI);
- }
-
- // Now search for a matching PHI
- auto *BB = PN.getParent();
- assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
- "Not enough available ptr typed incoming values");
- PHINode *MatchingPtrPHI = nullptr;
- unsigned NumPhis = 0;
- for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) {
- // FIXME: consider handling this in AggressiveInstCombine
- PHINode *PtrPHI = dyn_cast<PHINode>(II);
- if (!PtrPHI)
- break;
- if (NumPhis > MaxNumPhis)
- return nullptr;
- if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
- continue;
- MatchingPtrPHI = PtrPHI;
- for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) {
- if (AvailablePtrVals[i] !=
- PtrPHI->getIncomingValueForBlock(PN.getIncomingBlock(i))) {
- MatchingPtrPHI = nullptr;
- break;
- }
- }
-
- if (MatchingPtrPHI)
- break;
- }
-
- if (MatchingPtrPHI) {
- assert(MatchingPtrPHI->getType() == IntToPtr->getType() &&
- "Phi's Type does not match with IntToPtr");
- // The PtrToCast + IntToPtr will be simplified later
- return CastInst::CreateBitOrPointerCast(MatchingPtrPHI,
- IntToPtr->getOperand(0)->getType());
- }
-
- // If it requires a conversion for every PHI operand, do not do it.
- if (all_of(AvailablePtrVals, [&](Value *V) {
- return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
- }))
- return nullptr;
-
- // If any of the operands that require casting is a terminator
- // instruction, do not do it. Similarly, do not do the transform if the value
- // is a PHI in a block with no insertion point, for example, a catchswitch
- // block, since we will not be able to insert a cast after the PHI.
- if (any_of(AvailablePtrVals, [&](Value *V) {
- if (V->getType() == IntToPtr->getType())
- return false;
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- return false;
- if (Inst->isTerminator())
- return true;
- auto *BB = Inst->getParent();
- if (isa<PHINode>(Inst) && BB->getFirstInsertionPt() == BB->end())
- return true;
- return false;
- }))
- return nullptr;
-
- PHINode *NewPtrPHI = PHINode::Create(
- IntToPtr->getType(), PN.getNumIncomingValues(), PN.getName() + ".ptr");
-
- InsertNewInstBefore(NewPtrPHI, PN);
- SmallDenseMap<Value *, Instruction *> Casts;
- for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
- auto *IncomingBB = PN.getIncomingBlock(i);
- auto *IncomingVal = AvailablePtrVals[i];
-
- if (IncomingVal->getType() == IntToPtr->getType()) {
- NewPtrPHI->addIncoming(IncomingVal, IncomingBB);
- continue;
- }
-
-#ifndef NDEBUG
- LoadInst *LoadI = dyn_cast<LoadInst>(IncomingVal);
- assert((isa<PHINode>(IncomingVal) ||
- IncomingVal->getType()->isPointerTy() ||
- (LoadI && LoadI->hasOneUse())) &&
- "Can not replace LoadInst with multiple uses");
-#endif
- // Need to insert a BitCast.
- // For an integer Load instruction with a single use, the load + IntToPtr
- // cast will be simplified into a pointer load:
- // %v = load i64, i64* %a.ip, align 8
- // %v.cast = inttoptr i64 %v to float **
- // ==>
- // %v.ptrp = bitcast i64 * %a.ip to float **
- // %v.cast = load float *, float ** %v.ptrp, align 8
- Instruction *&CI = Casts[IncomingVal];
- if (!CI) {
- CI = CastInst::CreateBitOrPointerCast(IncomingVal, IntToPtr->getType(),
- IncomingVal->getName() + ".ptr");
- if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) {
- BasicBlock::iterator InsertPos(IncomingI);
- InsertPos++;
- BasicBlock *BB = IncomingI->getParent();
- if (isa<PHINode>(IncomingI))
- InsertPos = BB->getFirstInsertionPt();
- assert(InsertPos != BB->end() && "should have checked above");
- InsertNewInstBefore(CI, *InsertPos);
- } else {
- auto *InsertBB = &IncomingBB->getParent()->getEntryBlock();
- InsertNewInstBefore(CI, *InsertBB->getFirstInsertionPt());
- }
- }
- NewPtrPHI->addIncoming(CI, IncomingBB);
- }
-
- // The PtrToCast + IntToPtr will be simplified later
- return CastInst::CreateBitOrPointerCast(NewPtrPHI,
- IntToPtr->getOperand(0)->getType());
-}
-
+ if (!PN.getType()->isIntegerTy())
+ return nullptr;
+ if (!PN.hasOneUse())
+ return nullptr;
+
+ auto *IntToPtr = dyn_cast<IntToPtrInst>(PN.user_back());
+ if (!IntToPtr)
+ return nullptr;
+
+ // Check if the pointer is actually used as pointer:
+ auto HasPointerUse = [](Instruction *IIP) {
+ for (User *U : IIP->users()) {
+ Value *Ptr = nullptr;
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(U)) {
+ Ptr = LoadI->getPointerOperand();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ Ptr = SI->getPointerOperand();
+ } else if (GetElementPtrInst *GI = dyn_cast<GetElementPtrInst>(U)) {
+ Ptr = GI->getPointerOperand();
+ }
+
+ if (Ptr && Ptr == IIP)
+ return true;
+ }
+ return false;
+ };
+
+ if (!HasPointerUse(IntToPtr))
+ return nullptr;
+
+ if (DL.getPointerSizeInBits(IntToPtr->getAddressSpace()) !=
+ DL.getTypeSizeInBits(IntToPtr->getOperand(0)->getType()))
+ return nullptr;
+
+ SmallVector<Value *, 4> AvailablePtrVals;
+ for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
+ Value *Arg = PN.getIncomingValue(i);
+
+ // First look backward:
+ if (auto *PI = dyn_cast<PtrToIntInst>(Arg)) {
+ AvailablePtrVals.emplace_back(PI->getOperand(0));
+ continue;
+ }
+
+ // Next look forward:
+ Value *ArgIntToPtr = nullptr;
+ for (User *U : Arg->users()) {
+ if (isa<IntToPtrInst>(U) && U->getType() == IntToPtr->getType() &&
+ (DT.dominates(cast<Instruction>(U), PN.getIncomingBlock(i)) ||
+ cast<Instruction>(U)->getParent() == PN.getIncomingBlock(i))) {
+ ArgIntToPtr = U;
+ break;
+ }
+ }
+
+ if (ArgIntToPtr) {
+ AvailablePtrVals.emplace_back(ArgIntToPtr);
+ continue;
+ }
+
+ // If Arg is defined by a PHI, allow it. This will also create
+ // more opportunities iteratively.
+ if (isa<PHINode>(Arg)) {
+ AvailablePtrVals.emplace_back(Arg);
+ continue;
+ }
+
+ // For a single use integer load:
+ auto *LoadI = dyn_cast<LoadInst>(Arg);
+ if (!LoadI)
+ return nullptr;
+
+ if (!LoadI->hasOneUse())
+ return nullptr;
+
+ // Push the integer typed Load instruction into the available
+ // value set, and fix it up later when the pointer typed PHI
+ // is synthesized.
+ AvailablePtrVals.emplace_back(LoadI);
+ }
+
+ // Now search for a matching PHI
+ auto *BB = PN.getParent();
+ assert(AvailablePtrVals.size() == PN.getNumIncomingValues() &&
+ "Not enough available ptr typed incoming values");
+ PHINode *MatchingPtrPHI = nullptr;
+ unsigned NumPhis = 0;
+ for (auto II = BB->begin(); II != BB->end(); II++, NumPhis++) {
+ // FIXME: consider handling this in AggressiveInstCombine
+ PHINode *PtrPHI = dyn_cast<PHINode>(II);
+ if (!PtrPHI)
+ break;
+ if (NumPhis > MaxNumPhis)
+ return nullptr;
+ if (PtrPHI == &PN || PtrPHI->getType() != IntToPtr->getType())
+ continue;
+ MatchingPtrPHI = PtrPHI;
+ for (unsigned i = 0; i != PtrPHI->getNumIncomingValues(); ++i) {
+ if (AvailablePtrVals[i] !=
+ PtrPHI->getIncomingValueForBlock(PN.getIncomingBlock(i))) {
+ MatchingPtrPHI = nullptr;
+ break;
+ }
+ }
+
+ if (MatchingPtrPHI)
+ break;
+ }
+
+ if (MatchingPtrPHI) {
+ assert(MatchingPtrPHI->getType() == IntToPtr->getType() &&
+ "Phi's Type does not match with IntToPtr");
+ // The PtrToCast + IntToPtr will be simplified later
+ return CastInst::CreateBitOrPointerCast(MatchingPtrPHI,
+ IntToPtr->getOperand(0)->getType());
+ }
+
+ // If it requires a conversion for every PHI operand, do not do it.
+ if (all_of(AvailablePtrVals, [&](Value *V) {
+ return (V->getType() != IntToPtr->getType()) || isa<IntToPtrInst>(V);
+ }))
+ return nullptr;
+
+ // If any of the operands that require casting is a terminator
+ // instruction, do not do it. Similarly, do not do the transform if the value
+ // is a PHI in a block with no insertion point, for example, a catchswitch
+ // block, since we will not be able to insert a cast after the PHI.
+ if (any_of(AvailablePtrVals, [&](Value *V) {
+ if (V->getType() == IntToPtr->getType())
+ return false;
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return false;
+ if (Inst->isTerminator())
+ return true;
+ auto *BB = Inst->getParent();
+ if (isa<PHINode>(Inst) && BB->getFirstInsertionPt() == BB->end())
+ return true;
+ return false;
+ }))
+ return nullptr;
+
+ PHINode *NewPtrPHI = PHINode::Create(
+ IntToPtr->getType(), PN.getNumIncomingValues(), PN.getName() + ".ptr");
+
+ InsertNewInstBefore(NewPtrPHI, PN);
+ SmallDenseMap<Value *, Instruction *> Casts;
+ for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
+ auto *IncomingBB = PN.getIncomingBlock(i);
+ auto *IncomingVal = AvailablePtrVals[i];
+
+ if (IncomingVal->getType() == IntToPtr->getType()) {
+ NewPtrPHI->addIncoming(IncomingVal, IncomingBB);
+ continue;
+ }
+
+#ifndef NDEBUG
+ LoadInst *LoadI = dyn_cast<LoadInst>(IncomingVal);
+ assert((isa<PHINode>(IncomingVal) ||
+ IncomingVal->getType()->isPointerTy() ||
+ (LoadI && LoadI->hasOneUse())) &&
+ "Can not replace LoadInst with multiple uses");
+#endif
+ // Need to insert a BitCast.
+ // For an integer Load instruction with a single use, the load + IntToPtr
+ // cast will be simplified into a pointer load:
+ // %v = load i64, i64* %a.ip, align 8
+ // %v.cast = inttoptr i64 %v to float **
+ // ==>
+ // %v.ptrp = bitcast i64 * %a.ip to float **
+ // %v.cast = load float *, float ** %v.ptrp, align 8
+ Instruction *&CI = Casts[IncomingVal];
+ if (!CI) {
+ CI = CastInst::CreateBitOrPointerCast(IncomingVal, IntToPtr->getType(),
+ IncomingVal->getName() + ".ptr");
+ if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) {
+ BasicBlock::iterator InsertPos(IncomingI);
+ InsertPos++;
+ BasicBlock *BB = IncomingI->getParent();
+ if (isa<PHINode>(IncomingI))
+ InsertPos = BB->getFirstInsertionPt();
+ assert(InsertPos != BB->end() && "should have checked above");
+ InsertNewInstBefore(CI, *InsertPos);
+ } else {
+ auto *InsertBB = &IncomingBB->getParent()->getEntryBlock();
+ InsertNewInstBefore(CI, *InsertBB->getFirstInsertionPt());
+ }
+ }
+ NewPtrPHI->addIncoming(CI, IncomingBB);
+ }
+
+ // The PtrToCast + IntToPtr will be simplified later
+ return CastInst::CreateBitOrPointerCast(NewPtrPHI,
+ IntToPtr->getOperand(0)->getType());
+}
+
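As a standalone illustration of the assumption behind this fold (not LLVM code): a pointer carried as an integer of at least pointer width and converted back compares equal to the original, which is why the integer-typed PHI can be replaced by a pointer-typed PHI once the width check against the DataLayout above passes:

#include <cassert>
#include <cstdint>

int main() {
  int X = 0;
  int *P = &X;
  // Same width requirement that foldIntegerTypedPHI checks via the DataLayout.
  static_assert(sizeof(std::uintptr_t) >= sizeof(int *), "width check");
  std::uintptr_t I = reinterpret_cast<std::uintptr_t>(P);  // ptrtoint
  int *Q = reinterpret_cast<int *>(I);                     // inttoptr
  assert(Q == P && *Q == 0);
  return 0;
}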
/// If we have something like phi [insertvalue(a,b,0), insertvalue(c,d,0)],
/// turn this into a phi[a,c] and phi[b,d] and a single insertvalue.
Instruction *
@@ -376,855 +376,855 @@ InstCombinerImpl::foldPHIArgExtractValueInstructionIntoPHI(PHINode &PN) {
return NewEVI;
}
-/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
+/// If we have something like phi [add (a,b), add(a,c)] and if a/b/c and the
/// adds all have a single user, turn this into a phi and a single binop.
Instruction *InstCombinerImpl::foldPHIArgBinOpIntoPHI(PHINode &PN) {
- Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
- assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
- unsigned Opc = FirstInst->getOpcode();
- Value *LHSVal = FirstInst->getOperand(0);
- Value *RHSVal = FirstInst->getOperand(1);
-
- Type *LHSType = LHSVal->getType();
- Type *RHSType = RHSVal->getType();
-
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+ assert(isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst));
+ unsigned Opc = FirstInst->getOpcode();
+ Value *LHSVal = FirstInst->getOperand(0);
+ Value *RHSVal = FirstInst->getOperand(1);
+
+ Type *LHSType = LHSVal->getType();
+ Type *RHSType = RHSVal->getType();
+
// Scan to see if all operands are the same opcode, and all have one user.
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
- Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
if (!I || I->getOpcode() != Opc || !I->hasOneUser() ||
- // Verify type of the LHS matches so we don't fold cmp's of different
- // types.
- I->getOperand(0)->getType() != LHSType ||
- I->getOperand(1)->getType() != RHSType)
- return nullptr;
-
- // If they are CmpInst instructions, check their predicates
- if (CmpInst *CI = dyn_cast<CmpInst>(I))
- if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
- return nullptr;
-
- // Keep track of which operand needs a phi node.
- if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
- if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
- }
-
- // If both LHS and RHS would need a PHI, don't do this transformation,
- // because it would increase the number of PHIs entering the block,
- // which leads to higher register pressure. This is especially
- // bad when the PHIs are in the header of a loop.
- if (!LHSVal && !RHSVal)
- return nullptr;
-
- // Otherwise, this is safe to transform!
-
- Value *InLHS = FirstInst->getOperand(0);
- Value *InRHS = FirstInst->getOperand(1);
- PHINode *NewLHS = nullptr, *NewRHS = nullptr;
- if (!LHSVal) {
- NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
- FirstInst->getOperand(0)->getName() + ".pn");
- NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
- InsertNewInstBefore(NewLHS, PN);
- LHSVal = NewLHS;
- }
-
- if (!RHSVal) {
- NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
- FirstInst->getOperand(1)->getName() + ".pn");
- NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
- InsertNewInstBefore(NewRHS, PN);
- RHSVal = NewRHS;
- }
-
- // Add all operands to the new PHIs.
- if (NewLHS || NewRHS) {
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
- if (NewLHS) {
- Value *NewInLHS = InInst->getOperand(0);
- NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
- }
- if (NewRHS) {
- Value *NewInRHS = InInst->getOperand(1);
- NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
- }
- }
- }
-
- if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
- CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
- LHSVal, RHSVal);
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
- }
-
- BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
- BinaryOperator *NewBinOp =
- BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
-
- NewBinOp->copyIRFlags(PN.getIncomingValue(0));
-
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
- NewBinOp->andIRFlags(PN.getIncomingValue(i));
-
- PHIArgMergedDebugLoc(NewBinOp, PN);
- return NewBinOp;
-}
-
+ // Verify type of the LHS matches so we don't fold cmp's of different
+ // types.
+ I->getOperand(0)->getType() != LHSType ||
+ I->getOperand(1)->getType() != RHSType)
+ return nullptr;
+
+ // If they are CmpInst instructions, check their predicates
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate())
+ return nullptr;
+
+ // Keep track of which operand needs a phi node.
+ if (I->getOperand(0) != LHSVal) LHSVal = nullptr;
+ if (I->getOperand(1) != RHSVal) RHSVal = nullptr;
+ }
+
+ // If both LHS and RHS would need a PHI, don't do this transformation,
+ // because it would increase the number of PHIs entering the block,
+ // which leads to higher register pressure. This is especially
+ // bad when the PHIs are in the header of a loop.
+ if (!LHSVal && !RHSVal)
+ return nullptr;
+
+ // Otherwise, this is safe to transform!
+
+ Value *InLHS = FirstInst->getOperand(0);
+ Value *InRHS = FirstInst->getOperand(1);
+ PHINode *NewLHS = nullptr, *NewRHS = nullptr;
+ if (!LHSVal) {
+ NewLHS = PHINode::Create(LHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(0)->getName() + ".pn");
+ NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewLHS, PN);
+ LHSVal = NewLHS;
+ }
+
+ if (!RHSVal) {
+ NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(),
+ FirstInst->getOperand(1)->getName() + ".pn");
+ NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
+ InsertNewInstBefore(NewRHS, PN);
+ RHSVal = NewRHS;
+ }
+
+ // Add all operands to the new PHIs.
+ if (NewLHS || NewRHS) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *InInst = cast<Instruction>(PN.getIncomingValue(i));
+ if (NewLHS) {
+ Value *NewInLHS = InInst->getOperand(0);
+ NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
+ }
+ if (NewRHS) {
+ Value *NewInRHS = InInst->getOperand(1);
+ NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
+ }
+ }
+ }
+
+ if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) {
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ LHSVal, RHSVal);
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+ }
+
+ BinaryOperator *BinOp = cast<BinaryOperator>(FirstInst);
+ BinaryOperator *NewBinOp =
+ BinaryOperator::Create(BinOp->getOpcode(), LHSVal, RHSVal);
+
+ NewBinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ NewBinOp->andIRFlags(PN.getIncomingValue(i));
+
+ PHIArgMergedDebugLoc(NewBinOp, PN);
+ return NewBinOp;
+}
+
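The fold is sound because a binary operation with a shared operand distributes over a two-way choice, which is exactly the phi [add(a,b), add(a,c)] --> add(a, phi[b,c]) rewrite described above. A standalone sketch on plain values (not LLVM code):

#include <cassert>

int main() {
  int a = 7, b = 3, c = -5;
  for (bool p : {false, true}) {
    // phi of two adds with a shared operand == one add of a phi.
    assert((p ? a + b : a + c) == a + (p ? b : c));
    // The same reasoning covers the cmp case handled above.
    assert((p ? (a < b) : (a < c)) == (a < (p ? b : c)));
  }
  return 0;
}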
Instruction *InstCombinerImpl::foldPHIArgGEPIntoPHI(PHINode &PN) {
- GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
-
- SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
- FirstInst->op_end());
- // This is true if all GEP bases are allocas and if all indices into them are
- // constants.
- bool AllBasePointersAreAllocas = true;
-
- // We don't want to replace this phi if the replacement would require
- // more than one phi, which leads to higher register pressure. This is
- // especially bad when the PHIs are in the header of a loop.
- bool NeededPhi = false;
-
- bool AllInBounds = true;
-
+ GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0));
+
+ SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(),
+ FirstInst->op_end());
+ // This is true if all GEP bases are allocas and if all indices into them are
+ // constants.
+ bool AllBasePointersAreAllocas = true;
+
+ // We don't want to replace this phi if the replacement would require
+ // more than one phi, which leads to higher register pressure. This is
+ // especially bad when the PHIs are in the header of a loop.
+ bool NeededPhi = false;
+
+ bool AllInBounds = true;
+
// Scan to see if all operands are the same opcode, and all have one user.
- for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
+ for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) {
GetElementPtrInst *GEP =
dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i));
if (!GEP || !GEP->hasOneUser() || GEP->getType() != FirstInst->getType() ||
GEP->getNumOperands() != FirstInst->getNumOperands())
- return nullptr;
-
- AllInBounds &= GEP->isInBounds();
-
- // Keep track of whether or not all GEPs are of alloca pointers.
- if (AllBasePointersAreAllocas &&
- (!isa<AllocaInst>(GEP->getOperand(0)) ||
- !GEP->hasAllConstantIndices()))
- AllBasePointersAreAllocas = false;
-
- // Compare the operand lists.
- for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
- if (FirstInst->getOperand(op) == GEP->getOperand(op))
- continue;
-
- // Don't merge two GEPs when two operands differ (introducing phi nodes)
- // if one of the PHIs has a constant for the index. The index may be
- // substantially cheaper to compute for the constants, so making it a
- // variable index could pessimize the path. This also handles the case
- // for struct indices, which must always be constant.
- if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
- isa<ConstantInt>(GEP->getOperand(op)))
- return nullptr;
-
- if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
- return nullptr;
-
- // If we already needed a PHI for an earlier operand, and another operand
- // also requires a PHI, we'd be introducing more PHIs than we're
- // eliminating, which increases register pressure on entry to the PHI's
- // block.
- if (NeededPhi)
- return nullptr;
-
- FixedOperands[op] = nullptr; // Needs a PHI.
- NeededPhi = true;
- }
- }
-
- // If all of the base pointers of the PHI'd GEPs are from allocas, don't
- // bother doing this transformation. At best, this will just save a bit of
- // offset calculation, but all the predecessors will have to materialize the
- // stack address into a register anyway. We'd actually rather *clone* the
- // load up into the predecessors so that we have a load of a gep of an alloca,
- // which can usually all be folded into the load.
- if (AllBasePointersAreAllocas)
- return nullptr;
-
- // Otherwise, this is safe to transform. Insert PHI nodes for each operand
- // that is variable.
- SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
-
- bool HasAnyPHIs = false;
- for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
- if (FixedOperands[i]) continue; // operand doesn't need a phi.
- Value *FirstOp = FirstInst->getOperand(i);
- PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
- FirstOp->getName()+".pn");
- InsertNewInstBefore(NewPN, PN);
-
- NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
- OperandPhis[i] = NewPN;
- FixedOperands[i] = NewPN;
- HasAnyPHIs = true;
- }
-
-
- // Add all operands to the new PHIs.
- if (HasAnyPHIs) {
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
- BasicBlock *InBB = PN.getIncomingBlock(i);
-
- for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
- if (PHINode *OpPhi = OperandPhis[op])
- OpPhi->addIncoming(InGEP->getOperand(op), InBB);
- }
- }
-
- Value *Base = FixedOperands[0];
- GetElementPtrInst *NewGEP =
- GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
- makeArrayRef(FixedOperands).slice(1));
- if (AllInBounds) NewGEP->setIsInBounds();
- PHIArgMergedDebugLoc(NewGEP, PN);
- return NewGEP;
-}
-
-/// Return true if we know that it is safe to sink the load out of the block
-/// that defines it. This means that it must be obvious the value of the load is
-/// not changed from the point of the load to the end of the block it is in.
-///
-/// Finally, it is safe, but not profitable, to sink a load targeting a
-/// non-address-taken alloca. Doing so will cause us to not promote the alloca
-/// to a register.
-static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
- BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
-
- for (++BBI; BBI != E; ++BBI)
+ return nullptr;
+
+ AllInBounds &= GEP->isInBounds();
+
+ // Keep track of whether or not all GEPs are of alloca pointers.
+ if (AllBasePointersAreAllocas &&
+ (!isa<AllocaInst>(GEP->getOperand(0)) ||
+ !GEP->hasAllConstantIndices()))
+ AllBasePointersAreAllocas = false;
+
+ // Compare the operand lists.
+ for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) {
+ if (FirstInst->getOperand(op) == GEP->getOperand(op))
+ continue;
+
+ // Don't merge two GEPs when two operands differ (introducing phi nodes)
+ // if one of the PHIs has a constant for the index. The index may be
+ // substantially cheaper to compute for the constants, so making it a
+ // variable index could pessimize the path. This also handles the case
+ // for struct indices, which must always be constant.
+ if (isa<ConstantInt>(FirstInst->getOperand(op)) ||
+ isa<ConstantInt>(GEP->getOperand(op)))
+ return nullptr;
+
+ if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType())
+ return nullptr;
+
+ // If we already needed a PHI for an earlier operand, and another operand
+ // also requires a PHI, we'd be introducing more PHIs than we're
+ // eliminating, which increases register pressure on entry to the PHI's
+ // block.
+ if (NeededPhi)
+ return nullptr;
+
+ FixedOperands[op] = nullptr; // Needs a PHI.
+ NeededPhi = true;
+ }
+ }
+
+ // If all of the base pointers of the PHI'd GEPs are from allocas, don't
+ // bother doing this transformation. At best, this will just save a bit of
+ // offset calculation, but all the predecessors will have to materialize the
+ // stack address into a register anyway. We'd actually rather *clone* the
+ // load up into the predecessors so that we have a load of a gep of an alloca,
+ // which can usually all be folded into the load.
+ if (AllBasePointersAreAllocas)
+ return nullptr;
+
+ // Otherwise, this is safe to transform. Insert PHI nodes for each operand
+ // that is variable.
+ SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size());
+
+ bool HasAnyPHIs = false;
+ for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) {
+ if (FixedOperands[i]) continue; // operand doesn't need a phi.
+ Value *FirstOp = FirstInst->getOperand(i);
+ PHINode *NewPN = PHINode::Create(FirstOp->getType(), e,
+ FirstOp->getName()+".pn");
+ InsertNewInstBefore(NewPN, PN);
+
+ NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0));
+ OperandPhis[i] = NewPN;
+ FixedOperands[i] = NewPN;
+ HasAnyPHIs = true;
+ }
+
+
+ // Add all operands to the new PHIs.
+ if (HasAnyPHIs) {
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i));
+ BasicBlock *InBB = PN.getIncomingBlock(i);
+
+ for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op)
+ if (PHINode *OpPhi = OperandPhis[op])
+ OpPhi->addIncoming(InGEP->getOperand(op), InBB);
+ }
+ }
+
+ Value *Base = FixedOperands[0];
+ GetElementPtrInst *NewGEP =
+ GetElementPtrInst::Create(FirstInst->getSourceElementType(), Base,
+ makeArrayRef(FixedOperands).slice(1));
+ if (AllInBounds) NewGEP->setIsInBounds();
+ PHIArgMergedDebugLoc(NewGEP, PN);
+ return NewGEP;
+}
+
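The same reasoning applies to GEPs off a common base: choosing between two element addresses is the same as computing one address from the chosen index. A standalone C++ analogue (ordinary array indexing standing in for GEP; not LLVM code):

#include <cassert>

int main() {
  int arr[8] = {0};
  int i = 2, j = 5;
  for (bool p : {false, true})
    // phi of two GEPs off the same base == one GEP whose index is a phi.
    assert((p ? &arr[i] : &arr[j]) == &arr[p ? i : j]);
  return 0;
}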
+/// Return true if we know that it is safe to sink the load out of the block
+/// that defines it. This means that it must be obvious the value of the load is
+/// not changed from the point of the load to the end of the block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targeting a
+/// non-address-taken alloca. Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
+ BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();
+
+ for (++BBI; BBI != E; ++BBI)
if (BBI->mayWriteToMemory()) {
// Calls that only access inaccessible memory do not block sinking the
// load.
if (auto *CB = dyn_cast<CallBase>(BBI))
if (CB->onlyAccessesInaccessibleMemory())
continue;
- return false;
+ return false;
}
-
- // Check for non-address taken alloca. If not address-taken already, it isn't
- // profitable to do this xform.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
- bool isAddressTaken = false;
- for (User *U : AI->users()) {
- if (isa<LoadInst>(U)) continue;
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- // If storing TO the alloca, then the address isn't taken.
- if (SI->getOperand(1) == AI) continue;
- }
- isAddressTaken = true;
- break;
- }
-
- if (!isAddressTaken && AI->isStaticAlloca())
- return false;
- }
-
- // If this load is a load from a GEP with a constant offset from an alloca,
- // then we don't want to sink it. In its present form, it will be
- // load [constant stack offset]. Sinking it will cause us to have to
- // materialize the stack addresses in each predecessor in a register only to
- // do a shared load from register in the successor.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
- if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
- if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
- return false;
-
- return true;
-}
-
+
+ // Check for non-address taken alloca. If not address-taken already, it isn't
+ // profitable to do this xform.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+ bool isAddressTaken = false;
+ for (User *U : AI->users()) {
+ if (isa<LoadInst>(U)) continue;
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ // If storing TO the alloca, then the address isn't taken.
+ if (SI->getOperand(1) == AI) continue;
+ }
+ isAddressTaken = true;
+ break;
+ }
+
+ if (!isAddressTaken && AI->isStaticAlloca())
+ return false;
+ }
+
+ // If this load is a load from a GEP with a constant offset from an alloca,
+ // then we don't want to sink it. In its present form, it will be
+ // load [constant stack offset]. Sinking it will cause us to have to
+ // materialize the stack addresses in each predecessor in a register only to
+ // do a shared load from register in the successor.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(L->getOperand(0)))
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0)))
+ if (AI->isStaticAlloca() && GEP->hasAllConstantIndices())
+ return false;
+
+ return true;
+}
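A minimal sketch of the safety half of this check (hypothetical code; may_write is an invented external): any later instruction in the load's block that may write memory defeats sinking.

// may_write could store to *p between the load and the end of the block, so
// the loaded value is not obviously unchanged and the helper returns false.
extern void may_write(int *p);
int sample(int *p) {
  int v = *p;    // candidate load
  may_write(p);  // BBI->mayWriteToMemory() is true here
  return v;
}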
+
Instruction *InstCombinerImpl::foldPHIArgLoadIntoPHI(PHINode &PN) {
- LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
-
- // FIXME: This is overconservative; this transform is allowed in some cases
- // for atomic operations.
- if (FirstLI->isAtomic())
- return nullptr;
-
- // When processing loads, we need to propagate two bits of information to the
- // sunk load: whether it is volatile, and what its alignment is. We currently
- // don't sink loads when some have their alignment specified and some don't.
- // visitLoadInst will propagate an alignment onto the load when TD is around,
- // and if TD isn't around, we can't handle the mixed case.
- bool isVolatile = FirstLI->isVolatile();
- Align LoadAlignment = FirstLI->getAlign();
- unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
-
- // We can't sink the load if the loaded value could be modified between the
- // load and the PHI.
- if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
- !isSafeAndProfitableToSinkLoad(FirstLI))
- return nullptr;
-
- // If the PHI is of volatile loads and the load block has multiple
- // successors, sinking it would remove a load of the volatile value from
- // the path through the other successor.
- if (isVolatile &&
- FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
- return nullptr;
-
- // Check to see if all arguments are the same operation.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
+ LoadInst *FirstLI = cast<LoadInst>(PN.getIncomingValue(0));
+
+ // FIXME: This is overconservative; this transform is allowed in some cases
+ // for atomic operations.
+ if (FirstLI->isAtomic())
+ return nullptr;
+
+ // When processing loads, we need to propagate two bits of information to the
+ // sunk load: whether it is volatile, and what its alignment is. We currently
+ // don't sink loads when some have their alignment specified and some don't.
+ // visitLoadInst will propagate an alignment onto the load when TD is around,
+ // and if TD isn't around, we can't handle the mixed case.
+ bool isVolatile = FirstLI->isVolatile();
+ Align LoadAlignment = FirstLI->getAlign();
+ unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
+
+ // We can't sink the load if the loaded value could be modified between the
+ // load and the PHI.
+ if (FirstLI->getParent() != PN.getIncomingBlock(0) ||
+ !isSafeAndProfitableToSinkLoad(FirstLI))
+ return nullptr;
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i));
if (!LI || !LI->hasOneUser())
- return nullptr;
-
- // We can't sink the load if the loaded value could be modified between
- // the load and the PHI.
- if (LI->isVolatile() != isVolatile ||
- LI->getParent() != PN.getIncomingBlock(i) ||
- LI->getPointerAddressSpace() != LoadAddrSpace ||
- !isSafeAndProfitableToSinkLoad(LI))
- return nullptr;
-
- LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
-
- // If the PHI is of volatile loads and the load block has multiple
- // successors, sinking it would remove a load of the volatile value from
- // the path through the other successor.
- if (isVolatile &&
- LI->getParent()->getTerminator()->getNumSuccessors() != 1)
- return nullptr;
- }
-
- // Okay, they are all the same operation. Create a new PHI node of the
- // correct type, and PHI together all of the LHS's of the instructions.
- PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
- PN.getNumIncomingValues(),
- PN.getName()+".in");
-
- Value *InVal = FirstLI->getOperand(0);
- NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
- LoadInst *NewLI =
- new LoadInst(FirstLI->getType(), NewPN, "", isVolatile, LoadAlignment);
-
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_access_group,
- };
-
- for (unsigned ID : KnownIDs)
- NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
-
- // Add all operands to the new PHI and combine TBAA metadata.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i));
- combineMetadata(NewLI, LI, KnownIDs, true);
- Value *NewInVal = LI->getOperand(0);
- if (NewInVal != InVal)
- InVal = nullptr;
- NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
- }
-
- if (InVal) {
- // The new PHI unions all of the same values together. This is really
- // common, so we handle it intelligently here for compile-time speed.
- NewLI->setOperand(0, InVal);
- delete NewPN;
- } else {
- InsertNewInstBefore(NewPN, PN);
- }
-
- // If this was a volatile load that we are merging, make sure to loop through
- // and mark all the input loads as non-volatile. If we don't do this, we will
- // insert a new volatile load and the old ones will not be deletable.
- if (isVolatile)
- for (Value *IncValue : PN.incoming_values())
- cast<LoadInst>(IncValue)->setVolatile(false);
-
- PHIArgMergedDebugLoc(NewLI, PN);
- return NewLI;
-}
-
-/// TODO: This function could handle other cast types, but then it might
-/// require special-casing a cast from the 'i1' type. See the comment in
-/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
+ return nullptr;
+
+ // We can't sink the load if the loaded value could be modified between
+ // the load and the PHI.
+ if (LI->isVolatile() != isVolatile ||
+ LI->getParent() != PN.getIncomingBlock(i) ||
+ LI->getPointerAddressSpace() != LoadAddrSpace ||
+ !isSafeAndProfitableToSinkLoad(LI))
+ return nullptr;
+
+ LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
+
+ // If the PHI is of volatile loads and the load block has multiple
+ // successors, sinking it would remove a load of the volatile value from
+ // the path through the other successor.
+ if (isVolatile &&
+ LI->getParent()->getTerminator()->getNumSuccessors() != 1)
+ return nullptr;
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstLI->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+ LoadInst *NewLI =
+ new LoadInst(FirstLI->getType(), NewPN, "", isVolatile, LoadAlignment);
+
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_access_group,
+ };
+
+ for (unsigned ID : KnownIDs)
+ NewLI->setMetadata(ID, FirstLI->getMetadata(ID));
+
+ // Add all operands to the new PHI and combine TBAA metadata.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ LoadInst *LI = cast<LoadInst>(PN.getIncomingValue(i));
+ combineMetadata(NewLI, LI, KnownIDs, true);
+ Value *NewInVal = LI->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ NewLI->setOperand(0, InVal);
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ }
+
+ // If this was a volatile load that we are merging, make sure to loop through
+ // and mark all the input loads as non-volatile. If we don't do this, we will
+ // insert a new volatile load and the old ones will not be deletable.
+ if (isVolatile)
+ for (Value *IncValue : PN.incoming_values())
+ cast<LoadInst>(IncValue)->setVolatile(false);
+
+ PHIArgMergedDebugLoc(NewLI, PN);
+ return NewLI;
+}
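Hypothetical source (names invented) showing the shape this fold rewrites: each predecessor loads through its own pointer and the PHI merges the loaded values; after the fold the pointers are PHI'd and one load, using the minimum of the incoming alignments, follows the PHI.

int choose(bool c, int *a, int *b) {
  int v;
  if (c)
    v = *a;   // load in predecessor 1
  else
    v = *b;   // load in predecessor 2
  return v;   // PHI of the two loaded values -> may become a load of phi(a, b)
}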
+
+/// TODO: This function could handle other cast types, but then it might
+/// require special-casing a cast from the 'i1' type. See the comment in
+/// FoldPHIArgOpIntoPHI() about pessimizing illegal integer types.
Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) {
- // We cannot create a new instruction after the PHI if the terminator is an
- // EHPad because there is no valid insertion point.
- if (Instruction *TI = Phi.getParent()->getTerminator())
- if (TI->isEHPad())
- return nullptr;
-
- // Early exit for the common case of a phi with two operands. These are
- // handled elsewhere. See the comment below where we check the count of zexts
- // and constants for more details.
- unsigned NumIncomingValues = Phi.getNumIncomingValues();
- if (NumIncomingValues < 3)
- return nullptr;
-
- // Find the narrower type specified by the first zext.
- Type *NarrowType = nullptr;
- for (Value *V : Phi.incoming_values()) {
- if (auto *Zext = dyn_cast<ZExtInst>(V)) {
- NarrowType = Zext->getSrcTy();
- break;
- }
- }
- if (!NarrowType)
- return nullptr;
-
- // Walk the phi operands checking that we only have zexts or constants that
- // we can shrink for free. Store the new operands for the new phi.
- SmallVector<Value *, 4> NewIncoming;
- unsigned NumZexts = 0;
- unsigned NumConsts = 0;
- for (Value *V : Phi.incoming_values()) {
- if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (Instruction *TI = Phi.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ // Early exit for the common case of a phi with two operands. These are
+ // handled elsewhere. See the comment below where we check the count of zexts
+ // and constants for more details.
+ unsigned NumIncomingValues = Phi.getNumIncomingValues();
+ if (NumIncomingValues < 3)
+ return nullptr;
+
+ // Find the narrower type specified by the first zext.
+ Type *NarrowType = nullptr;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
+ NarrowType = Zext->getSrcTy();
+ break;
+ }
+ }
+ if (!NarrowType)
+ return nullptr;
+
+ // Walk the phi operands checking that we only have zexts or constants that
+ // we can shrink for free. Store the new operands for the new phi.
+ SmallVector<Value *, 4> NewIncoming;
+ unsigned NumZexts = 0;
+ unsigned NumConsts = 0;
+ for (Value *V : Phi.incoming_values()) {
+ if (auto *Zext = dyn_cast<ZExtInst>(V)) {
// All zexts must be identical and have one user.
if (Zext->getSrcTy() != NarrowType || !Zext->hasOneUser())
- return nullptr;
- NewIncoming.push_back(Zext->getOperand(0));
- NumZexts++;
- } else if (auto *C = dyn_cast<Constant>(V)) {
- // Make sure that constants can fit in the new type.
- Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType);
- if (ConstantExpr::getZExt(Trunc, C->getType()) != C)
- return nullptr;
- NewIncoming.push_back(Trunc);
- NumConsts++;
- } else {
- // If it's not a cast or a constant, bail out.
- return nullptr;
- }
- }
-
- // The more common cases of a phi with no constant operands or just one
- // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
- // respectively. foldOpIntoPhi() wants to do the opposite transform that is
- // performed here. It tries to replicate a cast in the phi operand's basic
- // block to expose other folding opportunities. Thus, InstCombine will
- // infinite loop without this check.
- if (NumConsts == 0 || NumZexts < 2)
- return nullptr;
-
- // All incoming values are zexts or constants that are safe to truncate.
- // Create a new phi node of the narrow type, phi together all of the new
- // operands, and zext the result back to the original type.
- PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues,
- Phi.getName() + ".shrunk");
- for (unsigned i = 0; i != NumIncomingValues; ++i)
- NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i));
-
- InsertNewInstBefore(NewPhi, Phi);
- return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
-}
-
-/// If all operands to a PHI node are the same "unary" operator and they all are
-/// only used by the PHI, PHI together their inputs, and do the operation once,
-/// to the result of the PHI.
+ return nullptr;
+ NewIncoming.push_back(Zext->getOperand(0));
+ NumZexts++;
+ } else if (auto *C = dyn_cast<Constant>(V)) {
+ // Make sure that constants can fit in the new type.
+ Constant *Trunc = ConstantExpr::getTrunc(C, NarrowType);
+ if (ConstantExpr::getZExt(Trunc, C->getType()) != C)
+ return nullptr;
+ NewIncoming.push_back(Trunc);
+ NumConsts++;
+ } else {
+ // If it's not a cast or a constant, bail out.
+ return nullptr;
+ }
+ }
+
+ // The more common cases of a phi with no constant operands or just one
+ // variable operand are handled by FoldPHIArgOpIntoPHI() and foldOpIntoPhi()
+ // respectively. foldOpIntoPhi() wants to do the opposite transform that is
+ // performed here. It tries to replicate a cast in the phi operand's basic
+ // block to expose other folding opportunities. Thus, InstCombine will
+ // infinite loop without this check.
+ if (NumConsts == 0 || NumZexts < 2)
+ return nullptr;
+
+ // All incoming values are zexts or constants that are safe to truncate.
+ // Create a new phi node of the narrow type, phi together all of the new
+ // operands, and zext the result back to the original type.
+ PHINode *NewPhi = PHINode::Create(NarrowType, NumIncomingValues,
+ Phi.getName() + ".shrunk");
+ for (unsigned i = 0; i != NumIncomingValues; ++i)
+ NewPhi->addIncoming(NewIncoming[i], Phi.getIncomingBlock(i));
+
+ InsertNewInstBefore(NewPhi, Phi);
+ return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType());
+}
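A sketch of the case this fold handles (hypothetical function; three incoming values, so the early exit above does not fire): the zexts share a narrow source type and the constant survives the trunc/zext round trip, so a narrow PHI plus a single zext can replace the wide PHI.

unsigned widen(int sel, unsigned char a, unsigned char b) {
  unsigned x;
  if (sel == 0)
    x = a;    // zext i8 -> i32
  else if (sel == 1)
    x = b;    // zext i8 -> i32
  else
    x = 7;    // constant representable in i8
  return x;   // phi(i32) may become zext(phi(i8))
}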
+
+/// If all operands to a PHI node are the same "unary" operator and they all are
+/// only used by the PHI, PHI together their inputs, and do the operation once,
+/// to the result of the PHI.
Instruction *InstCombinerImpl::foldPHIArgOpIntoPHI(PHINode &PN) {
- // We cannot create a new instruction after the PHI if the terminator is an
- // EHPad because there is no valid insertion point.
- if (Instruction *TI = PN.getParent()->getTerminator())
- if (TI->isEHPad())
- return nullptr;
-
- Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
-
- if (isa<GetElementPtrInst>(FirstInst))
+ // We cannot create a new instruction after the PHI if the terminator is an
+ // EHPad because there is no valid insertion point.
+ if (Instruction *TI = PN.getParent()->getTerminator())
+ if (TI->isEHPad())
+ return nullptr;
+
+ Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
+
+ if (isa<GetElementPtrInst>(FirstInst))
return foldPHIArgGEPIntoPHI(PN);
- if (isa<LoadInst>(FirstInst))
+ if (isa<LoadInst>(FirstInst))
return foldPHIArgLoadIntoPHI(PN);
if (isa<InsertValueInst>(FirstInst))
return foldPHIArgInsertValueInstructionIntoPHI(PN);
if (isa<ExtractValueInst>(FirstInst))
return foldPHIArgExtractValueInstructionIntoPHI(PN);
-
- // Scan the instruction, looking for input operations that can be folded away.
- // If all input operands to the phi are the same instruction (e.g. a cast from
- // the same type or "+42") we can pull the operation through the PHI, reducing
- // code size and simplifying code.
- Constant *ConstantOp = nullptr;
- Type *CastSrcTy = nullptr;
-
- if (isa<CastInst>(FirstInst)) {
- CastSrcTy = FirstInst->getOperand(0)->getType();
-
- // Be careful about transforming integer PHIs. We don't want to pessimize
- // the code by turning an i32 into an i1293.
- if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
- if (!shouldChangeType(PN.getType(), CastSrcTy))
- return nullptr;
- }
- } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
- // Can fold binop, compare or shift here if the RHS is a constant,
- // otherwise call FoldPHIArgBinOpIntoPHI.
- ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
- if (!ConstantOp)
+
+ // Scan the instruction, looking for input operations that can be folded away.
+ // If all input operands to the phi are the same instruction (e.g. a cast from
+ // the same type or "+42") we can pull the operation through the PHI, reducing
+ // code size and simplifying code.
+ Constant *ConstantOp = nullptr;
+ Type *CastSrcTy = nullptr;
+
+ if (isa<CastInst>(FirstInst)) {
+ CastSrcTy = FirstInst->getOperand(0)->getType();
+
+ // Be careful about transforming integer PHIs. We don't want to pessimize
+ // the code by turning an i32 into an i1293.
+ if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
+ if (!shouldChangeType(PN.getType(), CastSrcTy))
+ return nullptr;
+ }
+ } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
+ // Can fold binop, compare or shift here if the RHS is a constant,
+ // otherwise call FoldPHIArgBinOpIntoPHI.
+ ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
+ if (!ConstantOp)
return foldPHIArgBinOpIntoPHI(PN);
- } else {
- return nullptr; // Cannot fold this operation.
- }
-
- // Check to see if all arguments are the same operation.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
+ } else {
+ return nullptr; // Cannot fold this operation.
+ }
+
+ // Check to see if all arguments are the same operation.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
if (!I || !I->hasOneUser() || !I->isSameOperationAs(FirstInst))
- return nullptr;
- if (CastSrcTy) {
- if (I->getOperand(0)->getType() != CastSrcTy)
- return nullptr; // Cast operation must match.
- } else if (I->getOperand(1) != ConstantOp) {
- return nullptr;
- }
- }
-
- // Okay, they are all the same operation. Create a new PHI node of the
- // correct type, and PHI together all of the LHS's of the instructions.
- PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
- PN.getNumIncomingValues(),
- PN.getName()+".in");
-
- Value *InVal = FirstInst->getOperand(0);
- NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
-
- // Add all operands to the new PHI.
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
- Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
- if (NewInVal != InVal)
- InVal = nullptr;
- NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
- }
-
- Value *PhiVal;
- if (InVal) {
- // The new PHI unions all of the same values together. This is really
- // common, so we handle it intelligently here for compile-time speed.
- PhiVal = InVal;
- delete NewPN;
- } else {
- InsertNewInstBefore(NewPN, PN);
- PhiVal = NewPN;
- }
-
- // Insert and return the new operation.
- if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
- CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
- PN.getType());
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
- }
-
- if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
- BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
- BinOp->copyIRFlags(PN.getIncomingValue(0));
-
- for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
- BinOp->andIRFlags(PN.getIncomingValue(i));
-
- PHIArgMergedDebugLoc(BinOp, PN);
- return BinOp;
- }
-
- CmpInst *CIOp = cast<CmpInst>(FirstInst);
- CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
- PhiVal, ConstantOp);
- PHIArgMergedDebugLoc(NewCI, PN);
- return NewCI;
-}
-
-/// Return true if this PHI node is only used by a PHI node cycle that is dead.
-static bool DeadPHICycle(PHINode *PN,
- SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
- if (PN->use_empty()) return true;
- if (!PN->hasOneUse()) return false;
-
- // Remember this node, and if we find the cycle, return.
- if (!PotentiallyDeadPHIs.insert(PN).second)
- return true;
-
- // Don't scan crazily complex things.
- if (PotentiallyDeadPHIs.size() == 16)
- return false;
-
- if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
- return DeadPHICycle(PU, PotentiallyDeadPHIs);
-
- return false;
-}
-
-/// Return true if this phi node is always equal to NonPhiInVal.
-/// This happens with mutually cyclic phi nodes like:
-/// z = some value; x = phi (y, z); y = phi (x, z)
-static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
- SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
- // See if we already saw this PHI node.
- if (!ValueEqualPHIs.insert(PN).second)
- return true;
-
- // Don't scan crazily complex things.
- if (ValueEqualPHIs.size() == 16)
- return false;
-
- // Scan the operands to see if they are either phi nodes or are equal to
- // the value.
- for (Value *Op : PN->incoming_values()) {
- if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
- if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
- return false;
- } else if (Op != NonPhiInVal)
- return false;
- }
-
- return true;
-}
-
-/// Return an existing non-zero constant if this phi node has one, otherwise
-/// return constant 1.
-static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
- assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
- for (Value *V : PN.operands())
- if (auto *ConstVA = dyn_cast<ConstantInt>(V))
- if (!ConstVA->isZero())
- return ConstVA;
- return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
-}
-
-namespace {
-struct PHIUsageRecord {
- unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
- unsigned Shift; // The amount shifted.
- Instruction *Inst; // The trunc instruction.
-
- PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
- : PHIId(pn), Shift(Sh), Inst(User) {}
-
- bool operator<(const PHIUsageRecord &RHS) const {
- if (PHIId < RHS.PHIId) return true;
- if (PHIId > RHS.PHIId) return false;
- if (Shift < RHS.Shift) return true;
- if (Shift > RHS.Shift) return false;
- return Inst->getType()->getPrimitiveSizeInBits() <
- RHS.Inst->getType()->getPrimitiveSizeInBits();
- }
-};
-
-struct LoweredPHIRecord {
- PHINode *PN; // The PHI that was lowered.
- unsigned Shift; // The amount shifted.
- unsigned Width; // The width extracted.
-
- LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
- : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
-
- // Ctor form used by DenseMap.
- LoweredPHIRecord(PHINode *pn, unsigned Sh)
- : PN(pn), Shift(Sh), Width(0) {}
-};
+ return nullptr;
+ if (CastSrcTy) {
+ if (I->getOperand(0)->getType() != CastSrcTy)
+ return nullptr; // Cast operation must match.
+ } else if (I->getOperand(1) != ConstantOp) {
+ return nullptr;
+ }
+ }
+
+ // Okay, they are all the same operation. Create a new PHI node of the
+ // correct type, and PHI together all of the LHS's of the instructions.
+ PHINode *NewPN = PHINode::Create(FirstInst->getOperand(0)->getType(),
+ PN.getNumIncomingValues(),
+ PN.getName()+".in");
+
+ Value *InVal = FirstInst->getOperand(0);
+ NewPN->addIncoming(InVal, PN.getIncomingBlock(0));
+
+ // Add all operands to the new PHI.
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
+ if (NewInVal != InVal)
+ InVal = nullptr;
+ NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
+ }
+
+ Value *PhiVal;
+ if (InVal) {
+ // The new PHI unions all of the same values together. This is really
+ // common, so we handle it intelligently here for compile-time speed.
+ PhiVal = InVal;
+ delete NewPN;
+ } else {
+ InsertNewInstBefore(NewPN, PN);
+ PhiVal = NewPN;
+ }
+
+ // Insert and return the new operation.
+ if (CastInst *FirstCI = dyn_cast<CastInst>(FirstInst)) {
+ CastInst *NewCI = CastInst::Create(FirstCI->getOpcode(), PhiVal,
+ PN.getType());
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+ }
+
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) {
+ BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp);
+ BinOp->copyIRFlags(PN.getIncomingValue(0));
+
+ for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i)
+ BinOp->andIRFlags(PN.getIncomingValue(i));
+
+ PHIArgMergedDebugLoc(BinOp, PN);
+ return BinOp;
+ }
+
+ CmpInst *CIOp = cast<CmpInst>(FirstInst);
+ CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(),
+ PhiVal, ConstantOp);
+ PHIArgMergedDebugLoc(NewCI, PN);
+ return NewCI;
+}
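Hypothetical example (names invented) of the binop-with-constant case: every incoming value is the same opcode with the same constant RHS, so the operation is done once on a PHI of the left-hand sides and the IR flags of the originals are intersected.

int addBoth(bool c, int x, int y) {
  int r;
  if (c)
    r = x + 42;   // same opcode, same constant RHS
  else
    r = y + 42;
  return r;       // PHI of the two adds -> may become phi(x, y) + 42
}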
+
+/// Return true if this PHI node is only used by a PHI node cycle that is dead.
+static bool DeadPHICycle(PHINode *PN,
+ SmallPtrSetImpl<PHINode*> &PotentiallyDeadPHIs) {
+ if (PN->use_empty()) return true;
+ if (!PN->hasOneUse()) return false;
+
+ // Remember this node, and if we find the cycle, return.
+ if (!PotentiallyDeadPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PotentiallyDeadPHIs.size() == 16)
+ return false;
+
+ if (PHINode *PU = dyn_cast<PHINode>(PN->user_back()))
+ return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+ return false;
+}
+
+/// Return true if this phi node is always equal to NonPhiInVal.
+/// This happens with mutually cyclic phi nodes like:
+/// z = some value; x = phi (y, z); y = phi (x, z)
+static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal,
+ SmallPtrSetImpl<PHINode*> &ValueEqualPHIs) {
+ // See if we already saw this PHI node.
+ if (!ValueEqualPHIs.insert(PN).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (ValueEqualPHIs.size() == 16)
+ return false;
+
+ // Scan the operands to see if they are either phi nodes or are equal to
+ // the value.
+ for (Value *Op : PN->incoming_values()) {
+ if (PHINode *OpPN = dyn_cast<PHINode>(Op)) {
+ if (!PHIsEqualValue(OpPN, NonPhiInVal, ValueEqualPHIs))
+ return false;
+ } else if (Op != NonPhiInVal)
+ return false;
+ }
+
+ return true;
+}
+
+/// Return an existing non-zero constant if this phi node has one, otherwise
+/// return constant 1.
+static ConstantInt *GetAnyNonZeroConstInt(PHINode &PN) {
+ assert(isa<IntegerType>(PN.getType()) && "Expect only integer type phi");
+ for (Value *V : PN.operands())
+ if (auto *ConstVA = dyn_cast<ConstantInt>(V))
+ if (!ConstVA->isZero())
+ return ConstVA;
+ return ConstantInt::get(cast<IntegerType>(PN.getType()), 1);
+}
+
+namespace {
+struct PHIUsageRecord {
+ unsigned PHIId; // The ID # of the PHI (something deterministic to sort on)
+ unsigned Shift; // The amount shifted.
+ Instruction *Inst; // The trunc instruction.
+
+ PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User)
+ : PHIId(pn), Shift(Sh), Inst(User) {}
+
+ bool operator<(const PHIUsageRecord &RHS) const {
+ if (PHIId < RHS.PHIId) return true;
+ if (PHIId > RHS.PHIId) return false;
+ if (Shift < RHS.Shift) return true;
+ if (Shift > RHS.Shift) return false;
+ return Inst->getType()->getPrimitiveSizeInBits() <
+ RHS.Inst->getType()->getPrimitiveSizeInBits();
+ }
+};
+
+struct LoweredPHIRecord {
+ PHINode *PN; // The PHI that was lowered.
+ unsigned Shift; // The amount shifted.
+ unsigned Width; // The width extracted.
+
+ LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty)
+ : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {}
+
+ // Ctor form used by DenseMap.
+ LoweredPHIRecord(PHINode *pn, unsigned Sh)
+ : PN(pn), Shift(Sh), Width(0) {}
+};
} // namespace
-
-namespace llvm {
- template<>
- struct DenseMapInfo<LoweredPHIRecord> {
- static inline LoweredPHIRecord getEmptyKey() {
- return LoweredPHIRecord(nullptr, 0);
- }
- static inline LoweredPHIRecord getTombstoneKey() {
- return LoweredPHIRecord(nullptr, 1);
- }
- static unsigned getHashValue(const LoweredPHIRecord &Val) {
- return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
- (Val.Width>>3);
- }
- static bool isEqual(const LoweredPHIRecord &LHS,
- const LoweredPHIRecord &RHS) {
- return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
- LHS.Width == RHS.Width;
- }
- };
+
+namespace llvm {
+ template<>
+ struct DenseMapInfo<LoweredPHIRecord> {
+ static inline LoweredPHIRecord getEmptyKey() {
+ return LoweredPHIRecord(nullptr, 0);
+ }
+ static inline LoweredPHIRecord getTombstoneKey() {
+ return LoweredPHIRecord(nullptr, 1);
+ }
+ static unsigned getHashValue(const LoweredPHIRecord &Val) {
+ return DenseMapInfo<PHINode*>::getHashValue(Val.PN) ^ (Val.Shift>>3) ^
+ (Val.Width>>3);
+ }
+ static bool isEqual(const LoweredPHIRecord &LHS,
+ const LoweredPHIRecord &RHS) {
+ return LHS.PN == RHS.PN && LHS.Shift == RHS.Shift &&
+ LHS.Width == RHS.Width;
+ }
+ };
} // namespace llvm
-
-
-/// This is an integer PHI and we know that it has an illegal type: see if it is
-/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
-/// the various pieces being extracted. This sort of thing is introduced when
-/// SROA promotes an aggregate to large integer values.
-///
-/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
-/// inttoptr. We should produce new PHIs in the right type.
-///
+
+
+/// This is an integer PHI and we know that it has an illegal type: see if it is
+/// only used by trunc or trunc(lshr) operations. If so, we split the PHI into
+/// the various pieces being extracted. This sort of thing is introduced when
+/// SROA promotes an aggregate to large integer values.
+///
+/// TODO: The user of the trunc may be an bitcast to float/double/vector or an
+/// inttoptr. We should produce new PHIs in the right type.
+///
Instruction *InstCombinerImpl::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) {
- // PHIUsers - Keep track of all of the truncated values extracted from a set
- // of PHIs, along with their offset. These are the things we want to rewrite.
- SmallVector<PHIUsageRecord, 16> PHIUsers;
-
- // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
- // nodes which are extracted from. PHIsInspected is the set we use to avoid
- // revisiting PHIs; PHIsToSlice is an ordered list of PHIs whose uses we still
- // need to check (to ensure they are all extracts).
- SmallVector<PHINode*, 8> PHIsToSlice;
- SmallPtrSet<PHINode*, 8> PHIsInspected;
-
- PHIsToSlice.push_back(&FirstPhi);
- PHIsInspected.insert(&FirstPhi);
-
- for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
- PHINode *PN = PHIsToSlice[PHIId];
-
- // Scan the input list of the PHI. If any input is an invoke, and if the
- // input is defined in the predecessor, then we can't split the critical
- // edge which is required to insert a truncate. Because of this, we have to
- // bail out.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
- if (!II) continue;
- if (II->getParent() != PN->getIncomingBlock(i))
- continue;
-
- // If the value comes from an invoke directly in the predecessor, then we have
- // a critical edge where we need to put the truncate. Since we can't
- // split the edge in instcombine, we have to bail out.
- return nullptr;
- }
-
- for (User *U : PN->users()) {
- Instruction *UserI = cast<Instruction>(U);
-
- // If the user is a PHI, inspect its uses recursively.
- if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
- if (PHIsInspected.insert(UserPN).second)
- PHIsToSlice.push_back(UserPN);
- continue;
- }
-
- // Truncates are always ok.
- if (isa<TruncInst>(UserI)) {
- PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI));
- continue;
- }
-
- // Otherwise it must be a lshr which can only be used by one trunc.
- if (UserI->getOpcode() != Instruction::LShr ||
- !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) ||
- !isa<ConstantInt>(UserI->getOperand(1)))
- return nullptr;
-
- // Bail on out of range shifts.
- unsigned SizeInBits = UserI->getType()->getScalarSizeInBits();
- if (cast<ConstantInt>(UserI->getOperand(1))->getValue().uge(SizeInBits))
- return nullptr;
-
- unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue();
- PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back()));
- }
- }
-
- // If we have no users, they must be all self uses, just nuke the PHI.
- if (PHIUsers.empty())
- return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
-
- // If this phi node is transformable, create new PHIs for all the pieces
- // extracted out of it. First, sort the users by their offset and size.
- array_pod_sort(PHIUsers.begin(), PHIUsers.end());
-
- LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
- << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
-
- // PredValues - This is a temporary used when rewriting PHI nodes. It is
- // hoisted out here to avoid construction/destruction thrashing.
- DenseMap<BasicBlock*, Value*> PredValues;
-
- // ExtractedVals - Each new PHI we introduce is saved here so we don't
- // introduce redundant PHIs.
- DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
-
- for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
- unsigned PHIId = PHIUsers[UserI].PHIId;
- PHINode *PN = PHIsToSlice[PHIId];
- unsigned Offset = PHIUsers[UserI].Shift;
- Type *Ty = PHIUsers[UserI].Inst->getType();
-
- PHINode *EltPHI;
-
- // If we've already lowered a user like this, reuse the previously lowered
- // value.
- if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) {
-
- // Otherwise, create the new PHI node for this user.
- EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
- PN->getName()+".off"+Twine(Offset), PN);
- assert(EltPHI->getType() != PN->getType() &&
- "Truncate didn't shrink phi?");
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *Pred = PN->getIncomingBlock(i);
- Value *&PredVal = PredValues[Pred];
-
- // If we already have a value for this predecessor, reuse it.
- if (PredVal) {
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
-
- // Handle the PHI self-reuse case.
- Value *InVal = PN->getIncomingValue(i);
- if (InVal == PN) {
- PredVal = EltPHI;
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
-
- if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
- // If the incoming value was a PHI, and if it was one of the PHIs we
- // already rewrote, just use the lowered value.
- if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
- PredVal = Res;
- EltPHI->addIncoming(PredVal, Pred);
- continue;
- }
- }
-
- // Otherwise, do an extract in the predecessor.
- Builder.SetInsertPoint(Pred->getTerminator());
- Value *Res = InVal;
- if (Offset)
- Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
- Offset), "extract");
- Res = Builder.CreateTrunc(Res, Ty, "extract.t");
- PredVal = Res;
- EltPHI->addIncoming(Res, Pred);
-
- // If the incoming value was a PHI, and if it was one of the PHIs we are
- // rewriting, we will ultimately delete the code we inserted. This
- // means we need to revisit that PHI to make sure we extract out the
- // needed piece.
- if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
- if (PHIsInspected.count(OldInVal)) {
- unsigned RefPHIId =
- find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
- PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
- cast<Instruction>(Res)));
- ++UserE;
- }
- }
- PredValues.clear();
-
- LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
- << *EltPHI << '\n');
- ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
- }
-
- // Replace the use of this piece with the PHI node.
- replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
- }
-
- // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
- // with undefs.
- Value *Undef = UndefValue::get(FirstPhi.getType());
- for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
- replaceInstUsesWith(*PHIsToSlice[i], Undef);
- return replaceInstUsesWith(FirstPhi, Undef);
-}
-
+ // PHIUsers - Keep track of all of the truncated values extracted from a set
+ // of PHIs, along with their offset. These are the things we want to rewrite.
+ SmallVector<PHIUsageRecord, 16> PHIUsers;
+
+ // PHIs are often mutually cyclic, so we keep track of a whole set of PHI
+ // nodes which are extracted from. PHIsInspected is the set we use to avoid
+ // revisiting PHIs; PHIsToSlice is an ordered list of PHIs whose uses we still
+ // need to check (to ensure they are all extracts).
+ SmallVector<PHINode*, 8> PHIsToSlice;
+ SmallPtrSet<PHINode*, 8> PHIsInspected;
+
+ PHIsToSlice.push_back(&FirstPhi);
+ PHIsInspected.insert(&FirstPhi);
+
+ for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) {
+ PHINode *PN = PHIsToSlice[PHIId];
+
+ // Scan the input list of the PHI. If any input is an invoke, and if the
+ // input is defined in the predecessor, then we can't split the critical
+ // edge which is required to insert a truncate. Because of this, we have to
+ // bail out.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ InvokeInst *II = dyn_cast<InvokeInst>(PN->getIncomingValue(i));
+ if (!II) continue;
+ if (II->getParent() != PN->getIncomingBlock(i))
+ continue;
+
+ // If the value comes from an invoke directly in the predecessor, then we have
+ // a critical edge where we need to put the truncate. Since we can't
+ // split the edge in instcombine, we have to bail out.
+ return nullptr;
+ }
+
+ for (User *U : PN->users()) {
+ Instruction *UserI = cast<Instruction>(U);
+
+ // If the user is a PHI, inspect its uses recursively.
+ if (PHINode *UserPN = dyn_cast<PHINode>(UserI)) {
+ if (PHIsInspected.insert(UserPN).second)
+ PHIsToSlice.push_back(UserPN);
+ continue;
+ }
+
+ // Truncates are always ok.
+ if (isa<TruncInst>(UserI)) {
+ PHIUsers.push_back(PHIUsageRecord(PHIId, 0, UserI));
+ continue;
+ }
+
+ // Otherwise it must be a lshr which can only be used by one trunc.
+ if (UserI->getOpcode() != Instruction::LShr ||
+ !UserI->hasOneUse() || !isa<TruncInst>(UserI->user_back()) ||
+ !isa<ConstantInt>(UserI->getOperand(1)))
+ return nullptr;
+
+ // Bail on out of range shifts.
+ unsigned SizeInBits = UserI->getType()->getScalarSizeInBits();
+ if (cast<ConstantInt>(UserI->getOperand(1))->getValue().uge(SizeInBits))
+ return nullptr;
+
+ unsigned Shift = cast<ConstantInt>(UserI->getOperand(1))->getZExtValue();
+ PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, UserI->user_back()));
+ }
+ }
+
+ // If we have no users, they must be all self uses, just nuke the PHI.
+ if (PHIUsers.empty())
+ return replaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType()));
+
+ // If this phi node is transformable, create new PHIs for all the pieces
+ // extracted out of it. First, sort the users by their offset and size.
+ array_pod_sort(PHIUsers.begin(), PHIUsers.end());
+
+ LLVM_DEBUG(dbgs() << "SLICING UP PHI: " << FirstPhi << '\n';
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) dbgs()
+ << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] << '\n';);
+
+ // PredValues - This is a temporary used when rewriting PHI nodes. It is
+ // hoisted out here to avoid construction/destruction thrashing.
+ DenseMap<BasicBlock*, Value*> PredValues;
+
+ // ExtractedVals - Each new PHI we introduce is saved here so we don't
+ // introduce redundant PHIs.
+ DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals;
+
+ for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) {
+ unsigned PHIId = PHIUsers[UserI].PHIId;
+ PHINode *PN = PHIsToSlice[PHIId];
+ unsigned Offset = PHIUsers[UserI].Shift;
+ Type *Ty = PHIUsers[UserI].Inst->getType();
+
+ PHINode *EltPHI;
+
+ // If we've already lowered a user like this, reuse the previously lowered
+ // value.
+ if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == nullptr) {
+
+ // Otherwise, create the new PHI node for this user.
+ EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(),
+ PN->getName()+".off"+Twine(Offset), PN);
+ assert(EltPHI->getType() != PN->getType() &&
+ "Truncate didn't shrink phi?");
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ Value *&PredVal = PredValues[Pred];
+
+ // If we already have a value for this predecessor, reuse it.
+ if (PredVal) {
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ // Handle the PHI self-reuse case.
+ Value *InVal = PN->getIncomingValue(i);
+ if (InVal == PN) {
+ PredVal = EltPHI;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+
+ if (PHINode *InPHI = dyn_cast<PHINode>(PN)) {
+ // If the incoming value was a PHI, and if it was one of the PHIs we
+ // already rewrote, just use the lowered value.
+ if (Value *Res = ExtractedVals[LoweredPHIRecord(InPHI, Offset, Ty)]) {
+ PredVal = Res;
+ EltPHI->addIncoming(PredVal, Pred);
+ continue;
+ }
+ }
+
+ // Otherwise, do an extract in the predecessor.
+ Builder.SetInsertPoint(Pred->getTerminator());
+ Value *Res = InVal;
+ if (Offset)
+ Res = Builder.CreateLShr(Res, ConstantInt::get(InVal->getType(),
+ Offset), "extract");
+ Res = Builder.CreateTrunc(Res, Ty, "extract.t");
+ PredVal = Res;
+ EltPHI->addIncoming(Res, Pred);
+
+ // If the incoming value was a PHI, and if it was one of the PHIs we are
+ // rewriting, we will ultimately delete the code we inserted. This
+ // means we need to revisit that PHI to make sure we extract out the
+ // needed piece.
+ if (PHINode *OldInVal = dyn_cast<PHINode>(PN->getIncomingValue(i)))
+ if (PHIsInspected.count(OldInVal)) {
+ unsigned RefPHIId =
+ find(PHIsToSlice, OldInVal) - PHIsToSlice.begin();
+ PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset,
+ cast<Instruction>(Res)));
+ ++UserE;
+ }
+ }
+ PredValues.clear();
+
+ LLVM_DEBUG(dbgs() << " Made element PHI for offset " << Offset << ": "
+ << *EltPHI << '\n');
+ ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI;
+ }
+
+ // Replace the use of this piece with the PHI node.
+ replaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI);
+ }
+
+ // Replace all the remaining uses of the PHI nodes (self uses and the lshrs)
+ // with undefs.
+ Value *Undef = UndefValue::get(FirstPhi.getType());
+ for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i)
+ replaceInstUsesWith(*PHIsToSlice[i], Undef);
+ return replaceInstUsesWith(FirstPhi, Undef);
+}
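A rough sketch of how such a PHI tends to appear (hypothetical struct; assumes i128 is not a legal integer type for the target, which is typical): SROA may promote the aggregate to one wide integer, leaving a PHI that is only read back through trunc and lshr+trunc, which is exactly what this function slices into per-piece PHIs.

struct Pair { long long lo; long long hi; };
long long pickLo(bool c, Pair a, Pair b) {
  Pair p = c ? a : b;   // may become a PHI of an i128 after SROA
  return p.lo ^ p.hi;   // read back as trunc and (lshr 64)+trunc of that i128
}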
+
static Value *SimplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,
const DominatorTree &DT) {
// Simplify the following patterns:
@@ -1297,142 +1297,142 @@ static Value *SimplifyUsingControlFlow(InstCombiner &Self, PHINode &PN,
return nullptr;
}
-// PHINode simplification
-//
+// PHINode simplification
+//
Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
- if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
- return replaceInstUsesWith(PN, V);
-
+ if (Value *V = SimplifyInstruction(&PN, SQ.getWithInstruction(&PN)))
+ return replaceInstUsesWith(PN, V);
+
if (Instruction *Result = foldPHIArgZextsIntoPHI(PN))
- return Result;
-
- // If all PHI operands are the same operation, pull them through the PHI,
- // reducing code size.
- if (isa<Instruction>(PN.getIncomingValue(0)) &&
- isa<Instruction>(PN.getIncomingValue(1)) &&
- cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
+ return Result;
+
+ // If all PHI operands are the same operation, pull them through the PHI,
+ // reducing code size.
+ if (isa<Instruction>(PN.getIncomingValue(0)) &&
+ isa<Instruction>(PN.getIncomingValue(1)) &&
+ cast<Instruction>(PN.getIncomingValue(0))->getOpcode() ==
cast<Instruction>(PN.getIncomingValue(1))->getOpcode() &&
PN.getIncomingValue(0)->hasOneUser())
if (Instruction *Result = foldPHIArgOpIntoPHI(PN))
- return Result;
-
- // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
- // this PHI only has a single use (a PHI), and if that PHI only has one use (a
- // PHI)... break the cycle.
- if (PN.hasOneUse()) {
+ return Result;
+
+ // If this is a trivial cycle in the PHI node graph, remove it. Basically, if
+ // this PHI only has a single use (a PHI), and if that PHI only has one use (a
+ // PHI)... break the cycle.
+ if (PN.hasOneUse()) {
if (Instruction *Result = foldIntegerTypedPHI(PN))
- return Result;
-
- Instruction *PHIUser = cast<Instruction>(PN.user_back());
- if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
- SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
- PotentiallyDeadPHIs.insert(&PN);
- if (DeadPHICycle(PU, PotentiallyDeadPHIs))
- return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
- }
-
- // If this phi has a single use, and if that use just computes a value for
- // the next iteration of a loop, delete the phi. This occurs with unused
- // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
- // common case here is good because the only other things that catch this
- // are induction variable analysis (sometimes) and ADCE, which is only run
- // late.
- if (PHIUser->hasOneUse() &&
- (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
- PHIUser->user_back() == &PN) {
- return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
- }
- // When a PHI is used only to be compared with zero, it is safe to replace
- // an incoming value proved as known nonzero with any non-zero constant.
- // For example, in the code below, the incoming value %v can be replaced
- // with any non-zero constant based on the fact that the PHI is only used to
- // be compared with zero and %v is a known non-zero value:
- // %v = select %cond, 1, 2
- // %p = phi [%v, BB] ...
- // icmp eq, %p, 0
- auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
- // FIXME: To be simple, handle only integer type for now.
- if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
- match(CmpInst->getOperand(1), m_Zero())) {
- ConstantInt *NonZeroConst = nullptr;
- bool MadeChange = false;
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
- Value *VA = PN.getIncomingValue(i);
- if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
- if (!NonZeroConst)
- NonZeroConst = GetAnyNonZeroConstInt(PN);
-
- if (NonZeroConst != VA) {
- replaceOperand(PN, i, NonZeroConst);
- MadeChange = true;
- }
- }
- }
- if (MadeChange)
- return &PN;
- }
- }
-
- // We sometimes end up with phi cycles that non-obviously end up being the
- // same value, for example:
- // z = some value; x = phi (y, z); y = phi (x, z)
- // where the phi nodes don't necessarily need to be in the same block. Do a
- // quick check to see if the PHI node only contains a single non-phi value, if
- // so, scan to see if the phi cycle is actually equal to that value.
- {
- unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
- // Scan for the first non-phi operand.
- while (InValNo != NumIncomingVals &&
- isa<PHINode>(PN.getIncomingValue(InValNo)))
- ++InValNo;
-
- if (InValNo != NumIncomingVals) {
- Value *NonPhiInVal = PN.getIncomingValue(InValNo);
-
- // Scan the rest of the operands to see if there are any conflicts, if so
- // there is no need to recursively scan other phis.
- for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
- Value *OpVal = PN.getIncomingValue(InValNo);
- if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
- break;
- }
-
- // If we scanned over all operands, then we have one unique value plus
- // phi values. Scan PHI nodes to see if they all merge in each other or
- // the value.
- if (InValNo == NumIncomingVals) {
- SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
- if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
- return replaceInstUsesWith(PN, NonPhiInVal);
- }
- }
- }
-
- // If there are multiple PHIs, sort their operands so that they all list
- // the blocks in the same order. This will help identical PHIs be eliminated
- // by other passes. Other passes shouldn't depend on this for correctness
- // however.
- PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
- if (&PN != FirstPN)
- for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *BBA = PN.getIncomingBlock(i);
- BasicBlock *BBB = FirstPN->getIncomingBlock(i);
- if (BBA != BBB) {
- Value *VA = PN.getIncomingValue(i);
- unsigned j = PN.getBasicBlockIndex(BBB);
- Value *VB = PN.getIncomingValue(j);
- PN.setIncomingBlock(i, BBB);
- PN.setIncomingValue(i, VB);
- PN.setIncomingBlock(j, BBA);
- PN.setIncomingValue(j, VA);
- // NOTE: Instcombine normally would want us to "return &PN" if we
- // modified any of the operands of an instruction. However, since we
- // aren't adding or removing uses (just rearranging them) we don't do
- // this in this case.
- }
- }
-
+ return Result;
+
+ Instruction *PHIUser = cast<Instruction>(PN.user_back());
+ if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
+ SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
+ PotentiallyDeadPHIs.insert(&PN);
+ if (DeadPHICycle(PU, PotentiallyDeadPHIs))
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+
+ // If this phi has a single use, and if that use just computes a value for
+ // the next iteration of a loop, delete the phi. This occurs with unused
+ // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this
+ // common case here is good because the only other things that catch this
+ // are induction variable analysis (sometimes) and ADCE, which is only run
+ // late.
+ if (PHIUser->hasOneUse() &&
+ (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
+ PHIUser->user_back() == &PN) {
+ return replaceInstUsesWith(PN, UndefValue::get(PN.getType()));
+ }
+ // When a PHI is used only to be compared with zero, it is safe to replace
+ // an incoming value proved as known nonzero with any non-zero constant.
+ // For example, in the code below, the incoming value %v can be replaced
+ // with any non-zero constant based on the fact that the PHI is only used to
+ // be compared with zero and %v is a known non-zero value:
+ // %v = select %cond, 1, 2
+ // %p = phi [%v, BB] ...
+ // icmp eq, %p, 0
+ auto *CmpInst = dyn_cast<ICmpInst>(PHIUser);
+ // FIXME: To be simple, handle only integer type for now.
+ if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
+ match(CmpInst->getOperand(1), m_Zero())) {
+ ConstantInt *NonZeroConst = nullptr;
+ bool MadeChange = false;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
+ Value *VA = PN.getIncomingValue(i);
+ if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
+ if (!NonZeroConst)
+ NonZeroConst = GetAnyNonZeroConstInt(PN);
+
+ if (NonZeroConst != VA) {
+ replaceOperand(PN, i, NonZeroConst);
+ MadeChange = true;
+ }
+ }
+ }
+ if (MadeChange)
+ return &PN;
+ }
+ }
+
+ // We sometimes end up with phi cycles that non-obviously end up being the
+ // same value, for example:
+ // z = some value; x = phi (y, z); y = phi (x, z)
+ // where the phi nodes don't necessarily need to be in the same block. Do a
+ // quick check to see if the PHI node only contains a single non-phi value, if
+ // so, scan to see if the phi cycle is actually equal to that value.
+ {
+ unsigned InValNo = 0, NumIncomingVals = PN.getNumIncomingValues();
+ // Scan for the first non-phi operand.
+ while (InValNo != NumIncomingVals &&
+ isa<PHINode>(PN.getIncomingValue(InValNo)))
+ ++InValNo;
+
+ if (InValNo != NumIncomingVals) {
+ Value *NonPhiInVal = PN.getIncomingValue(InValNo);
+
+ // Scan the rest of the operands to see if there are any conflicts, if so
+ // there is no need to recursively scan other phis.
+ for (++InValNo; InValNo != NumIncomingVals; ++InValNo) {
+ Value *OpVal = PN.getIncomingValue(InValNo);
+ if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal))
+ break;
+ }
+
+ // If we scanned over all operands, then we have one unique value plus
+ // phi values. Scan PHI nodes to see if they all merge in each other or
+ // the value.
+ if (InValNo == NumIncomingVals) {
+ SmallPtrSet<PHINode*, 16> ValueEqualPHIs;
+ if (PHIsEqualValue(&PN, NonPhiInVal, ValueEqualPHIs))
+ return replaceInstUsesWith(PN, NonPhiInVal);
+ }
+ }
+ }
+
+ // If there are multiple PHIs, sort their operands so that they all list
+ // the blocks in the same order. This will help identical PHIs be eliminated
+ // by other passes. Other passes shouldn't depend on this for correctness
+ // however.
+ PHINode *FirstPN = cast<PHINode>(PN.getParent()->begin());
+ if (&PN != FirstPN)
+ for (unsigned i = 0, e = FirstPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BBA = PN.getIncomingBlock(i);
+ BasicBlock *BBB = FirstPN->getIncomingBlock(i);
+ if (BBA != BBB) {
+ Value *VA = PN.getIncomingValue(i);
+ unsigned j = PN.getBasicBlockIndex(BBB);
+ Value *VB = PN.getIncomingValue(j);
+ PN.setIncomingBlock(i, BBB);
+ PN.setIncomingValue(i, VB);
+ PN.setIncomingBlock(j, BBA);
+ PN.setIncomingValue(j, VA);
+ // NOTE: Instcombine normally would want us to "return &PN" if we
+ // modified any of the operands of an instruction. However, since we
+ // aren't adding or removing uses (just rearranging them) we don't do
+ // this in this case.
+ }
+ }
+
// Is there an identical PHI node in this basic block?
for (PHINode &IdenticalPN : PN.getParent()->phis()) {
// Ignore the PHI node itself.
@@ -1448,18 +1448,18 @@ Instruction *InstCombinerImpl::visitPHINode(PHINode &PN) {
return replaceInstUsesWith(PN, &IdenticalPN);
}
- // If this is an integer PHI and we know that it has an illegal type, see if
- // it is only used by trunc or trunc(lshr) operations. If so, we split the
- // PHI into the various pieces being extracted. This sort of thing is
- // introduced when SROA promotes an aggregate to a single large integer type.
- if (PN.getType()->isIntegerTy() &&
- !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
- if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
- return Res;
-
+ // If this is an integer PHI and we know that it has an illegal type, see if
+ // it is only used by trunc or trunc(lshr) operations. If so, we split the
+ // PHI into the various pieces being extracted. This sort of thing is
+ // introduced when SROA promotes an aggregate to a single large integer type.
+ if (PN.getType()->isIntegerTy() &&
+ !DL.isLegalInteger(PN.getType()->getPrimitiveSizeInBits()))
+ if (Instruction *Res = SliceUpIllegalIntegerPHI(PN))
+ return Res;
+
// Ultimately, try to replace this Phi with a dominating condition.
if (auto *V = SimplifyUsingControlFlow(*this, PN, DT))
return replaceInstUsesWith(PN, V);
- return nullptr;
-}
+ return nullptr;
+}
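Hypothetical example (names invented) of the dead phi cycle / unused induction variable case handled above: the PHI's only user is the increment, and the increment's only user is the PHI, so both can be removed.

void spin(int n) {
  for (int j = 0; n > 0; --n)
    ++j;   // j is never read elsewhere; the phi/add cycle is dead
}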
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 4197c03672..5f174aae09 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -1,1068 +1,1068 @@
-//===- InstCombineSelect.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitSelect function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+//===- InstCombineSelect.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitSelect function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CmpInstAnalysis.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
/// FIXME: Enabled by default until the pattern is supported well.
static cl::opt<bool> EnableUnsafeSelectTransform(
"instcombine-unsafe-select-transform", cl::init(true),
cl::desc("Enable poison-unsafe select to and/or transform"));
-static Value *createMinMax(InstCombiner::BuilderTy &Builder,
- SelectPatternFlavor SPF, Value *A, Value *B) {
- CmpInst::Predicate Pred = getMinMaxPred(SPF);
- assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
- return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
-}
-
-/// Replace a select operand based on an equality comparison with the identity
-/// constant of a binop.
-static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
- const TargetLibraryInfo &TLI,
+static Value *createMinMax(InstCombiner::BuilderTy &Builder,
+ SelectPatternFlavor SPF, Value *A, Value *B) {
+ CmpInst::Predicate Pred = getMinMaxPred(SPF);
+ assert(CmpInst::isIntPredicate(Pred) && "Expected integer predicate");
+ return Builder.CreateSelect(Builder.CreateICmp(Pred, A, B), A, B);
+}
+
+/// Replace a select operand based on an equality comparison with the identity
+/// constant of a binop.
+static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
+ const TargetLibraryInfo &TLI,
InstCombinerImpl &IC) {
- // The select condition must be an equality compare with a constant operand.
- Value *X;
- Constant *C;
- CmpInst::Predicate Pred;
- if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C))))
- return nullptr;
-
- bool IsEq;
- if (ICmpInst::isEquality(Pred))
- IsEq = Pred == ICmpInst::ICMP_EQ;
- else if (Pred == FCmpInst::FCMP_OEQ)
- IsEq = true;
- else if (Pred == FCmpInst::FCMP_UNE)
- IsEq = false;
- else
- return nullptr;
-
- // A select operand must be a binop.
- BinaryOperator *BO;
- if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
- return nullptr;
-
- // The compare constant must be the identity constant for that binop.
-  // If this is a floating-point compare with 0.0, any zero constant will do.
- Type *Ty = BO->getType();
- Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
- if (IdC != C) {
- if (!IdC || !CmpInst::isFPPredicate(Pred))
- return nullptr;
- if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
- return nullptr;
- }
-
- // Last, match the compare variable operand with a binop operand.
- Value *Y;
- if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
- return nullptr;
- if (!match(BO, m_c_BinOp(m_Value(Y), m_Specific(X))))
- return nullptr;
-
- // +0.0 compares equal to -0.0, and so it does not behave as required for this
- // transform. Bail out if we can not exclude that possibility.
- if (isa<FPMathOperator>(BO))
- if (!BO->hasNoSignedZeros() && !CannotBeNegativeZero(Y, &TLI))
- return nullptr;
-
- // BO = binop Y, X
- // S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO }
- // =>
- // S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y }
- return IC.replaceOperand(Sel, IsEq ? 1 : 2, Y);
-}
-
-/// This folds:
-/// select (icmp eq (and X, C1)), TC, FC
-/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
-/// To something like:
-/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
-/// Or:
-/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
-/// With some variations depending on whether FC is larger than TC, the shift
-/// isn't needed, or the bit widths don't match.
-static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
- InstCombiner::BuilderTy &Builder) {
- const APInt *SelTC, *SelFC;
- if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
- !match(Sel.getFalseValue(), m_APInt(SelFC)))
- return nullptr;
-
- // If this is a vector select, we need a vector compare.
- Type *SelType = Sel.getType();
- if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
- return nullptr;
-
- Value *V;
- APInt AndMask;
- bool CreateAnd = false;
- ICmpInst::Predicate Pred = Cmp->getPredicate();
- if (ICmpInst::isEquality(Pred)) {
- if (!match(Cmp->getOperand(1), m_Zero()))
- return nullptr;
-
- V = Cmp->getOperand(0);
- const APInt *AndRHS;
- if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
- return nullptr;
-
- AndMask = *AndRHS;
- } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
- Pred, V, AndMask)) {
- assert(ICmpInst::isEquality(Pred) && "Not equality test?");
- if (!AndMask.isPowerOf2())
- return nullptr;
-
- CreateAnd = true;
- } else {
- return nullptr;
- }
-
- // In general, when both constants are non-zero, we would need an offset to
- // replace the select. This would require more instructions than we started
- // with. But there's one special-case that we handle here because it can
- // simplify/reduce the instructions.
- APInt TC = *SelTC;
- APInt FC = *SelFC;
- if (!TC.isNullValue() && !FC.isNullValue()) {
- // If the select constants differ by exactly one bit and that's the same
- // bit that is masked and checked by the select condition, the select can
- // be replaced by bitwise logic to set/clear one bit of the constant result.
- if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
- return nullptr;
- if (CreateAnd) {
- // If we have to create an 'and', then we must kill the cmp to not
- // increase the instruction count.
- if (!Cmp->hasOneUse())
- return nullptr;
- V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
- }
- bool ExtraBitInTC = TC.ugt(FC);
- if (Pred == ICmpInst::ICMP_EQ) {
- // If the masked bit in V is clear, clear or set the bit in the result:
- // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
- // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
- Constant *C = ConstantInt::get(SelType, TC);
- return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
- }
- if (Pred == ICmpInst::ICMP_NE) {
- // If the masked bit in V is set, set or clear the bit in the result:
- // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
- // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
- Constant *C = ConstantInt::get(SelType, FC);
- return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
- }
- llvm_unreachable("Only expecting equality predicates");
- }
-
- // Make sure one of the select arms is a power-of-2.
- if (!TC.isPowerOf2() && !FC.isPowerOf2())
- return nullptr;
-
- // Determine which shift is needed to transform result of the 'and' into the
- // desired result.
- const APInt &ValC = !TC.isNullValue() ? TC : FC;
- unsigned ValZeros = ValC.logBase2();
- unsigned AndZeros = AndMask.logBase2();
-
- // Insert the 'and' instruction on the input to the truncate.
- if (CreateAnd)
- V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
-
- // If types don't match, we can still convert the select by introducing a zext
- // or a trunc of the 'and'.
- if (ValZeros > AndZeros) {
- V = Builder.CreateZExtOrTrunc(V, SelType);
- V = Builder.CreateShl(V, ValZeros - AndZeros);
- } else if (ValZeros < AndZeros) {
- V = Builder.CreateLShr(V, AndZeros - ValZeros);
- V = Builder.CreateZExtOrTrunc(V, SelType);
- } else {
- V = Builder.CreateZExtOrTrunc(V, SelType);
- }
-
-  // Okay, now we know that everything is set up; we just don't know whether we
-  // have an icmp_ne or icmp_eq and whether the true or false val is the zero.
- bool ShouldNotVal = !TC.isNullValue();
- ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
- if (ShouldNotVal)
- V = Builder.CreateXor(V, ValC);
-
- return V;
-}
-
-/// We want to turn code that looks like this:
-/// %C = or %A, %B
-/// %D = select %cond, %C, %A
-/// into:
-/// %C = select %cond, %B, 0
-/// %D = or %A, %C
-///
-/// Assuming that the specified instruction is an operand to the select, return
-/// a bitmask indicating which operands of this instruction are foldable if they
-/// equal the other incoming value of the select.
-static unsigned getSelectFoldableOperands(BinaryOperator *I) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- return 3; // Can fold through either operand.
- case Instruction::Sub: // Can only fold on the amount subtracted.
- case Instruction::Shl: // Can only fold on the shift amount.
- case Instruction::LShr:
- case Instruction::AShr:
- return 1;
- default:
- return 0; // Cannot fold
- }
-}
-
-/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
+ // The select condition must be an equality compare with a constant operand.
+ Value *X;
+ Constant *C;
+ CmpInst::Predicate Pred;
+ if (!match(Sel.getCondition(), m_Cmp(Pred, m_Value(X), m_Constant(C))))
+ return nullptr;
+
+ bool IsEq;
+ if (ICmpInst::isEquality(Pred))
+ IsEq = Pred == ICmpInst::ICMP_EQ;
+ else if (Pred == FCmpInst::FCMP_OEQ)
+ IsEq = true;
+ else if (Pred == FCmpInst::FCMP_UNE)
+ IsEq = false;
+ else
+ return nullptr;
+
+ // A select operand must be a binop.
+ BinaryOperator *BO;
+ if (!match(Sel.getOperand(IsEq ? 1 : 2), m_BinOp(BO)))
+ return nullptr;
+
+ // The compare constant must be the identity constant for that binop.
+  // If this is a floating-point compare with 0.0, any zero constant will do.
+ Type *Ty = BO->getType();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BO->getOpcode(), Ty, true);
+ if (IdC != C) {
+ if (!IdC || !CmpInst::isFPPredicate(Pred))
+ return nullptr;
+ if (!match(IdC, m_AnyZeroFP()) || !match(C, m_AnyZeroFP()))
+ return nullptr;
+ }
+
+ // Last, match the compare variable operand with a binop operand.
+ Value *Y;
+ if (!BO->isCommutative() && !match(BO, m_BinOp(m_Value(Y), m_Specific(X))))
+ return nullptr;
+ if (!match(BO, m_c_BinOp(m_Value(Y), m_Specific(X))))
+ return nullptr;
+
+ // +0.0 compares equal to -0.0, and so it does not behave as required for this
+ // transform. Bail out if we can not exclude that possibility.
+ if (isa<FPMathOperator>(BO))
+ if (!BO->hasNoSignedZeros() && !CannotBeNegativeZero(Y, &TLI))
+ return nullptr;
+
+ // BO = binop Y, X
+ // S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO }
+ // =>
+ // S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y }
+ return IC.replaceOperand(Sel, IsEq ? 1 : 2, Y);
+}
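The identity behind the fold above: once the compare pins X to the binop's identity constant, 'binop Y, X' in the selected arm is just Y. A small standalone check for integer add (identity 0); the names are illustrative only:

#include <cassert>
#include <cstdint>

// select (x == 0), (y + x), z   ==   select (x == 0), y, z
int main() {
  for (int32_t x = -4; x <= 4; ++x)
    for (int32_t y = -4; y <= 4; ++y)
      for (int32_t z = -4; z <= 4; ++z)
        assert((x == 0 ? y + x : z) == (x == 0 ? y : z));
  return 0;
}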
+
+/// This folds:
+/// select (icmp eq (and X, C1)), TC, FC
+/// iff C1 is a power of 2 and the difference between TC and FC is a power of 2.
+/// To something like:
+/// (shr (and (X, C1)), (log2(C1) - log2(TC-FC))) + FC
+/// Or:
+/// (shl (and (X, C1)), (log2(TC-FC) - log2(C1))) + FC
+/// With some variations depending on whether FC is larger than TC, the shift
+/// isn't needed, or the bit widths don't match.
+static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp,
+ InstCombiner::BuilderTy &Builder) {
+ const APInt *SelTC, *SelFC;
+ if (!match(Sel.getTrueValue(), m_APInt(SelTC)) ||
+ !match(Sel.getFalseValue(), m_APInt(SelFC)))
+ return nullptr;
+
+ // If this is a vector select, we need a vector compare.
+ Type *SelType = Sel.getType();
+ if (SelType->isVectorTy() != Cmp->getType()->isVectorTy())
+ return nullptr;
+
+ Value *V;
+ APInt AndMask;
+ bool CreateAnd = false;
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ if (ICmpInst::isEquality(Pred)) {
+ if (!match(Cmp->getOperand(1), m_Zero()))
+ return nullptr;
+
+ V = Cmp->getOperand(0);
+ const APInt *AndRHS;
+ if (!match(V, m_And(m_Value(), m_Power2(AndRHS))))
+ return nullptr;
+
+ AndMask = *AndRHS;
+ } else if (decomposeBitTestICmp(Cmp->getOperand(0), Cmp->getOperand(1),
+ Pred, V, AndMask)) {
+ assert(ICmpInst::isEquality(Pred) && "Not equality test?");
+ if (!AndMask.isPowerOf2())
+ return nullptr;
+
+ CreateAnd = true;
+ } else {
+ return nullptr;
+ }
+
+ // In general, when both constants are non-zero, we would need an offset to
+ // replace the select. This would require more instructions than we started
+ // with. But there's one special-case that we handle here because it can
+ // simplify/reduce the instructions.
+ APInt TC = *SelTC;
+ APInt FC = *SelFC;
+ if (!TC.isNullValue() && !FC.isNullValue()) {
+ // If the select constants differ by exactly one bit and that's the same
+ // bit that is masked and checked by the select condition, the select can
+ // be replaced by bitwise logic to set/clear one bit of the constant result.
+ if (TC.getBitWidth() != AndMask.getBitWidth() || (TC ^ FC) != AndMask)
+ return nullptr;
+ if (CreateAnd) {
+ // If we have to create an 'and', then we must kill the cmp to not
+ // increase the instruction count.
+ if (!Cmp->hasOneUse())
+ return nullptr;
+ V = Builder.CreateAnd(V, ConstantInt::get(SelType, AndMask));
+ }
+ bool ExtraBitInTC = TC.ugt(FC);
+ if (Pred == ICmpInst::ICMP_EQ) {
+ // If the masked bit in V is clear, clear or set the bit in the result:
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) ^ TC
+ // (V & AndMaskC) == 0 ? TC : FC --> (V & AndMaskC) | TC
+ Constant *C = ConstantInt::get(SelType, TC);
+ return ExtraBitInTC ? Builder.CreateXor(V, C) : Builder.CreateOr(V, C);
+ }
+ if (Pred == ICmpInst::ICMP_NE) {
+ // If the masked bit in V is set, set or clear the bit in the result:
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) | FC
+ // (V & AndMaskC) != 0 ? TC : FC --> (V & AndMaskC) ^ FC
+ Constant *C = ConstantInt::get(SelType, FC);
+ return ExtraBitInTC ? Builder.CreateOr(V, C) : Builder.CreateXor(V, C);
+ }
+ llvm_unreachable("Only expecting equality predicates");
+ }
+
+ // Make sure one of the select arms is a power-of-2.
+ if (!TC.isPowerOf2() && !FC.isPowerOf2())
+ return nullptr;
+
+ // Determine which shift is needed to transform result of the 'and' into the
+ // desired result.
+ const APInt &ValC = !TC.isNullValue() ? TC : FC;
+ unsigned ValZeros = ValC.logBase2();
+ unsigned AndZeros = AndMask.logBase2();
+
+ // Insert the 'and' instruction on the input to the truncate.
+ if (CreateAnd)
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), AndMask));
+
+ // If types don't match, we can still convert the select by introducing a zext
+ // or a trunc of the 'and'.
+ if (ValZeros > AndZeros) {
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ V = Builder.CreateShl(V, ValZeros - AndZeros);
+ } else if (ValZeros < AndZeros) {
+ V = Builder.CreateLShr(V, AndZeros - ValZeros);
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ } else {
+ V = Builder.CreateZExtOrTrunc(V, SelType);
+ }
+
+  // Okay, now we know that everything is set up; we just don't know whether we
+  // have an icmp_ne or icmp_eq and whether the true or false val is the zero.
+ bool ShouldNotVal = !TC.isNullValue();
+ ShouldNotVal ^= Pred == ICmpInst::ICMP_NE;
+ if (ShouldNotVal)
+ V = Builder.CreateXor(V, ValC);
+
+ return V;
+}
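Two concrete instances of the fold above, checked exhaustively over a small range: the shift form with C1 = 4, TC = 0, FC = 8, and the one-bit-difference form with C1 = 4, TC = 12, FC = 8. This is only an illustration of the arithmetic, not a test of the pass itself:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 64; ++x) {
    // Power-of-2 arm case: (x & 4) == 0 ? 0 : 8   -->   (x & 4) << 1
    assert(((x & 4u) == 0 ? 0u : 8u) == ((x & 4u) << 1));
    // Constants differing by the masked bit: (x & 4) == 0 ? 12 : 8   -->   (x & 4) ^ 12
    assert(((x & 4u) == 0 ? 12u : 8u) == ((x & 4u) ^ 12u));
  }
  return 0;
}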
+
+/// We want to turn code that looks like this:
+/// %C = or %A, %B
+/// %D = select %cond, %C, %A
+/// into:
+/// %C = select %cond, %B, 0
+/// %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.
+static unsigned getSelectFoldableOperands(BinaryOperator *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return 3; // Can fold through either operand.
+ case Instruction::Sub: // Can only fold on the amount subtracted.
+ case Instruction::Shl: // Can only fold on the shift amount.
+ case Instruction::LShr:
+ case Instruction::AShr:
+ return 1;
+ default:
+ return 0; // Cannot fold
+ }
+}
+
+/// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
Instruction *InstCombinerImpl::foldSelectOpOp(SelectInst &SI, Instruction *TI,
Instruction *FI) {
- // Don't break up min/max patterns. The hasOneUse checks below prevent that
- // for most cases, but vector min/max with bitcasts can be transformed. If the
- // one-use restrictions are eased for other patterns, we still don't want to
- // obfuscate min/max.
- if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
- match(&SI, m_SMax(m_Value(), m_Value())) ||
- match(&SI, m_UMin(m_Value(), m_Value())) ||
- match(&SI, m_UMax(m_Value(), m_Value()))))
- return nullptr;
-
- // If this is a cast from the same type, merge.
- Value *Cond = SI.getCondition();
- Type *CondTy = Cond->getType();
- if (TI->getNumOperands() == 1 && TI->isCast()) {
- Type *FIOpndTy = FI->getOperand(0)->getType();
- if (TI->getOperand(0)->getType() != FIOpndTy)
- return nullptr;
-
- // The select condition may be a vector. We may only change the operand
- // type if the vector width remains the same (and matches the condition).
- if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
+ // Don't break up min/max patterns. The hasOneUse checks below prevent that
+ // for most cases, but vector min/max with bitcasts can be transformed. If the
+ // one-use restrictions are eased for other patterns, we still don't want to
+ // obfuscate min/max.
+ if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+ match(&SI, m_SMax(m_Value(), m_Value())) ||
+ match(&SI, m_UMin(m_Value(), m_Value())) ||
+ match(&SI, m_UMax(m_Value(), m_Value()))))
+ return nullptr;
+
+ // If this is a cast from the same type, merge.
+ Value *Cond = SI.getCondition();
+ Type *CondTy = Cond->getType();
+ if (TI->getNumOperands() == 1 && TI->isCast()) {
+ Type *FIOpndTy = FI->getOperand(0)->getType();
+ if (TI->getOperand(0)->getType() != FIOpndTy)
+ return nullptr;
+
+ // The select condition may be a vector. We may only change the operand
+ // type if the vector width remains the same (and matches the condition).
+ if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
if (!FIOpndTy->isVectorTy() ||
CondVTy->getElementCount() !=
cast<VectorType>(FIOpndTy)->getElementCount())
- return nullptr;
-
- // TODO: If the backend knew how to deal with casts better, we could
- // remove this limitation. For now, there's too much potential to create
- // worse codegen by promoting the select ahead of size-altering casts
- // (PR28160).
- //
- // Note that ValueTracking's matchSelectPattern() looks through casts
- // without checking 'hasOneUse' when it matches min/max patterns, so this
- // transform may end up happening anyway.
- if (TI->getOpcode() != Instruction::BitCast &&
- (!TI->hasOneUse() || !FI->hasOneUse()))
- return nullptr;
- } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
- // TODO: The one-use restrictions for a scalar select could be eased if
- // the fold of a select in visitLoadInst() was enhanced to match a pattern
- // that includes a cast.
- return nullptr;
- }
-
- // Fold this by inserting a select from the input values.
- Value *NewSI =
- Builder.CreateSelect(Cond, TI->getOperand(0), FI->getOperand(0),
- SI.getName() + ".v", &SI);
- return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
- TI->getType());
- }
-
- // Cond ? -X : -Y --> -(Cond ? X : Y)
- Value *X, *Y;
- if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
- (TI->hasOneUse() || FI->hasOneUse())) {
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
- return UnaryOperator::CreateFNegFMF(NewSel, TI);
- }
-
- // Only handle binary operators (including two-operand getelementptr) with
- // one-use here. As with the cast case above, it may be possible to relax the
-  // one-use constraint, but that needs to be examined carefully since it may not
- // reduce the total number of instructions.
- if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
- (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
- !TI->hasOneUse() || !FI->hasOneUse())
- return nullptr;
-
- // Figure out if the operations have any operands in common.
- Value *MatchOp, *OtherOpT, *OtherOpF;
- bool MatchIsOpZero;
- if (TI->getOperand(0) == FI->getOperand(0)) {
- MatchOp = TI->getOperand(0);
- OtherOpT = TI->getOperand(1);
- OtherOpF = FI->getOperand(1);
- MatchIsOpZero = true;
- } else if (TI->getOperand(1) == FI->getOperand(1)) {
- MatchOp = TI->getOperand(1);
- OtherOpT = TI->getOperand(0);
- OtherOpF = FI->getOperand(0);
- MatchIsOpZero = false;
- } else if (!TI->isCommutative()) {
- return nullptr;
- } else if (TI->getOperand(0) == FI->getOperand(1)) {
- MatchOp = TI->getOperand(0);
- OtherOpT = TI->getOperand(1);
- OtherOpF = FI->getOperand(0);
- MatchIsOpZero = true;
- } else if (TI->getOperand(1) == FI->getOperand(0)) {
- MatchOp = TI->getOperand(1);
- OtherOpT = TI->getOperand(0);
- OtherOpF = FI->getOperand(1);
- MatchIsOpZero = true;
- } else {
- return nullptr;
- }
-
- // If the select condition is a vector, the operands of the original select's
- // operands also must be vectors. This may not be the case for getelementptr
- // for example.
- if (CondTy->isVectorTy() && (!OtherOpT->getType()->isVectorTy() ||
- !OtherOpF->getType()->isVectorTy()))
- return nullptr;
-
- // If we reach here, they do have operations in common.
- Value *NewSI = Builder.CreateSelect(Cond, OtherOpT, OtherOpF,
- SI.getName() + ".v", &SI);
- Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
- Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
- if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
- BinaryOperator *NewBO = BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
- NewBO->copyIRFlags(TI);
- NewBO->andIRFlags(FI);
- return NewBO;
- }
- if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
- auto *FGEP = cast<GetElementPtrInst>(FI);
- Type *ElementType = TGEP->getResultElementType();
- return TGEP->isInBounds() && FGEP->isInBounds()
- ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
- : GetElementPtrInst::Create(ElementType, Op0, {Op1});
- }
- llvm_unreachable("Expected BinaryOperator or GEP");
- return nullptr;
-}
-
-static bool isSelect01(const APInt &C1I, const APInt &C2I) {
- if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero.
- return false;
- return C1I.isOneValue() || C1I.isAllOnesValue() ||
- C2I.isOneValue() || C2I.isAllOnesValue();
-}
-
-/// Try to fold the select into one of the operands to allow further
-/// optimization.
+ return nullptr;
+
+ // TODO: If the backend knew how to deal with casts better, we could
+ // remove this limitation. For now, there's too much potential to create
+ // worse codegen by promoting the select ahead of size-altering casts
+ // (PR28160).
+ //
+ // Note that ValueTracking's matchSelectPattern() looks through casts
+ // without checking 'hasOneUse' when it matches min/max patterns, so this
+ // transform may end up happening anyway.
+ if (TI->getOpcode() != Instruction::BitCast &&
+ (!TI->hasOneUse() || !FI->hasOneUse()))
+ return nullptr;
+ } else if (!TI->hasOneUse() || !FI->hasOneUse()) {
+ // TODO: The one-use restrictions for a scalar select could be eased if
+ // the fold of a select in visitLoadInst() was enhanced to match a pattern
+ // that includes a cast.
+ return nullptr;
+ }
+
+ // Fold this by inserting a select from the input values.
+ Value *NewSI =
+ Builder.CreateSelect(Cond, TI->getOperand(0), FI->getOperand(0),
+ SI.getName() + ".v", &SI);
+ return CastInst::Create(Instruction::CastOps(TI->getOpcode()), NewSI,
+ TI->getType());
+ }
+
+ // Cond ? -X : -Y --> -(Cond ? X : Y)
+ Value *X, *Y;
+ if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
+ (TI->hasOneUse() || FI->hasOneUse())) {
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
+ return UnaryOperator::CreateFNegFMF(NewSel, TI);
+ }
+
+ // Only handle binary operators (including two-operand getelementptr) with
+ // one-use here. As with the cast case above, it may be possible to relax the
+  // one-use constraint, but that needs to be examined carefully since it may not
+ // reduce the total number of instructions.
+ if (TI->getNumOperands() != 2 || FI->getNumOperands() != 2 ||
+ (!isa<BinaryOperator>(TI) && !isa<GetElementPtrInst>(TI)) ||
+ !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ // Figure out if the operations have any operands in common.
+ Value *MatchOp, *OtherOpT, *OtherOpF;
+ bool MatchIsOpZero;
+ if (TI->getOperand(0) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = false;
+ } else if (!TI->isCommutative()) {
+ return nullptr;
+ } else if (TI->getOperand(0) == FI->getOperand(1)) {
+ MatchOp = TI->getOperand(0);
+ OtherOpT = TI->getOperand(1);
+ OtherOpF = FI->getOperand(0);
+ MatchIsOpZero = true;
+ } else if (TI->getOperand(1) == FI->getOperand(0)) {
+ MatchOp = TI->getOperand(1);
+ OtherOpT = TI->getOperand(0);
+ OtherOpF = FI->getOperand(1);
+ MatchIsOpZero = true;
+ } else {
+ return nullptr;
+ }
+
+ // If the select condition is a vector, the operands of the original select's
+ // operands also must be vectors. This may not be the case for getelementptr
+ // for example.
+ if (CondTy->isVectorTy() && (!OtherOpT->getType()->isVectorTy() ||
+ !OtherOpF->getType()->isVectorTy()))
+ return nullptr;
+
+ // If we reach here, they do have operations in common.
+ Value *NewSI = Builder.CreateSelect(Cond, OtherOpT, OtherOpF,
+ SI.getName() + ".v", &SI);
+ Value *Op0 = MatchIsOpZero ? MatchOp : NewSI;
+ Value *Op1 = MatchIsOpZero ? NewSI : MatchOp;
+ if (auto *BO = dyn_cast<BinaryOperator>(TI)) {
+ BinaryOperator *NewBO = BinaryOperator::Create(BO->getOpcode(), Op0, Op1);
+ NewBO->copyIRFlags(TI);
+ NewBO->andIRFlags(FI);
+ return NewBO;
+ }
+ if (auto *TGEP = dyn_cast<GetElementPtrInst>(TI)) {
+ auto *FGEP = cast<GetElementPtrInst>(FI);
+ Type *ElementType = TGEP->getResultElementType();
+ return TGEP->isInBounds() && FGEP->isInBounds()
+ ? GetElementPtrInst::CreateInBounds(ElementType, Op0, {Op1})
+ : GetElementPtrInst::Create(ElementType, Op0, {Op1});
+ }
+ llvm_unreachable("Expected BinaryOperator or GEP");
+ return nullptr;
+}
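The common-operand case above is just distribution of the select over the shared operand. A standalone check for add (names illustrative):

#include <cassert>
#include <cstdint>

// select c, (x + a), (x + b)   ==   x + (select c, a, b)
int main() {
  for (int c = 0; c <= 1; ++c)
    for (int32_t x = -3; x <= 3; ++x)
      for (int32_t a = -3; a <= 3; ++a)
        for (int32_t b = -3; b <= 3; ++b)
          assert((c ? x + a : x + b) == x + (c ? a : b));
  return 0;
}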
+
+static bool isSelect01(const APInt &C1I, const APInt &C2I) {
+ if (!C1I.isNullValue() && !C2I.isNullValue()) // One side must be zero.
+ return false;
+ return C1I.isOneValue() || C1I.isAllOnesValue() ||
+ C2I.isOneValue() || C2I.isAllOnesValue();
+}
+
+/// Try to fold the select into one of the operands to allow further
+/// optimization.
Instruction *InstCombinerImpl::foldSelectIntoOp(SelectInst &SI, Value *TrueVal,
Value *FalseVal) {
- // See the comment above GetSelectFoldableOperands for a description of the
- // transformation we are doing here.
- if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) {
- if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) {
- if (unsigned SFO = getSelectFoldableOperands(TVI)) {
- unsigned OpToFold = 0;
- if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
- OpToFold = 1;
- } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
- OpToFold = 2;
- }
-
- if (OpToFold) {
+ // See the comment above GetSelectFoldableOperands for a description of the
+ // transformation we are doing here.
+ if (auto *TVI = dyn_cast<BinaryOperator>(TrueVal)) {
+ if (TVI->hasOneUse() && !isa<Constant>(FalseVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(TVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
Constant *C = ConstantExpr::getBinOpIdentity(TVI->getOpcode(),
TVI->getType(), true);
- Value *OOp = TVI->getOperand(2-OpToFold);
- // Avoid creating select between 2 constants unless it's selecting
- // between 0, 1 and -1.
- const APInt *OOpC;
- bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
+ Value *OOp = TVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ const APInt *OOpC;
+ bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
if (!isa<Constant>(OOp) ||
(OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
- Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
- NewSel->takeName(TVI);
- BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(),
- FalseVal, NewSel);
- BO->copyIRFlags(TVI);
- return BO;
- }
- }
- }
- }
- }
-
- if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) {
- if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) {
- if (unsigned SFO = getSelectFoldableOperands(FVI)) {
- unsigned OpToFold = 0;
- if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
- OpToFold = 1;
- } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
- OpToFold = 2;
- }
-
- if (OpToFold) {
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), OOp, C);
+ NewSel->takeName(TVI);
+ BinaryOperator *BO = BinaryOperator::Create(TVI->getOpcode(),
+ FalseVal, NewSel);
+ BO->copyIRFlags(TVI);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ if (auto *FVI = dyn_cast<BinaryOperator>(FalseVal)) {
+ if (FVI->hasOneUse() && !isa<Constant>(TrueVal)) {
+ if (unsigned SFO = getSelectFoldableOperands(FVI)) {
+ unsigned OpToFold = 0;
+ if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+ OpToFold = 1;
+ } else if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+ OpToFold = 2;
+ }
+
+ if (OpToFold) {
Constant *C = ConstantExpr::getBinOpIdentity(FVI->getOpcode(),
FVI->getType(), true);
- Value *OOp = FVI->getOperand(2-OpToFold);
- // Avoid creating select between 2 constants unless it's selecting
- // between 0, 1 and -1.
- const APInt *OOpC;
- bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
+ Value *OOp = FVI->getOperand(2-OpToFold);
+ // Avoid creating select between 2 constants unless it's selecting
+ // between 0, 1 and -1.
+ const APInt *OOpC;
+ bool OOpIsAPInt = match(OOp, m_APInt(OOpC));
if (!isa<Constant>(OOp) ||
(OOpIsAPInt && isSelect01(C->getUniqueInteger(), *OOpC))) {
- Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
- NewSel->takeName(FVI);
- BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(),
- TrueVal, NewSel);
- BO->copyIRFlags(FVI);
- return BO;
- }
- }
- }
- }
- }
-
- return nullptr;
-}
-
-/// We want to turn:
-/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
-/// into:
-/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
-/// Note:
-/// Z may be 0 if lshr is missing.
-/// The worst-case scenario is that we will replace 5 instructions with 5
-/// different instructions, but we get rid of the select.
-static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
- Value *TVal, Value *FVal,
- InstCombiner::BuilderTy &Builder) {
- if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
- Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
- match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
- return nullptr;
-
-  // The TrueVal has the general form: and %B, 1
- Value *B;
- if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
- return nullptr;
-
- // Where %B may be optionally shifted: lshr %X, %Z.
- Value *X, *Z;
- const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
- if (!HasShift)
- X = B;
-
- Value *Y;
- if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
- return nullptr;
-
- // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
- // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
- Constant *One = ConstantInt::get(SelType, 1);
- Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
- Value *FullMask = Builder.CreateOr(Y, MaskB);
- Value *MaskedX = Builder.CreateAnd(X, FullMask);
- Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
- return new ZExtInst(ICmpNeZero, SelType);
-}
-
-/// We want to turn:
-/// (select (icmp sgt x, C), lshr (X, Y), ashr (X, Y)); iff C s>= -1
-/// (select (icmp slt x, C), ashr (X, Y), lshr (X, Y)); iff C s>= 0
-/// into:
-/// ashr (X, Y)
-static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = IC->getPredicate();
- Value *CmpLHS = IC->getOperand(0);
- Value *CmpRHS = IC->getOperand(1);
- if (!CmpRHS->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- Value *X, *Y;
- unsigned Bitwidth = CmpRHS->getType()->getScalarSizeInBits();
- if ((Pred != ICmpInst::ICMP_SGT ||
- !match(CmpRHS,
- m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, -1)))) &&
- (Pred != ICmpInst::ICMP_SLT ||
- !match(CmpRHS,
- m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, 0)))))
- return nullptr;
-
- // Canonicalize so that ashr is in FalseVal.
- if (Pred == ICmpInst::ICMP_SLT)
- std::swap(TrueVal, FalseVal);
-
- if (match(TrueVal, m_LShr(m_Value(X), m_Value(Y))) &&
- match(FalseVal, m_AShr(m_Specific(X), m_Specific(Y))) &&
- match(CmpLHS, m_Specific(X))) {
- const auto *Ashr = cast<Instruction>(FalseVal);
- // if lshr is not exact and ashr is, this new ashr must not be exact.
- bool IsExact = Ashr->isExact() && cast<Instruction>(TrueVal)->isExact();
- return Builder.CreateAShr(X, Y, IC->getName(), IsExact);
- }
-
- return nullptr;
-}
-
-/// We want to turn:
-/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
-/// into:
-/// (or (shl (and X, C1), C3), Y)
-/// iff:
-/// C1 and C2 are both powers of 2
-/// where:
-/// C3 = Log(C2) - Log(C1)
-///
-/// This transform handles cases where:
-/// 1. The icmp predicate is inverted
-/// 2. The select operands are reversed
-/// 3. The magnitude of C2 and C1 are flipped
-static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- // Only handle integer compares. Also, if this is a vector select, we need a
- // vector compare.
- if (!TrueVal->getType()->isIntOrIntVectorTy() ||
- TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy())
- return nullptr;
-
- Value *CmpLHS = IC->getOperand(0);
- Value *CmpRHS = IC->getOperand(1);
-
- Value *V;
- unsigned C1Log;
- bool IsEqualZero;
- bool NeedAnd = false;
- if (IC->isEquality()) {
- if (!match(CmpRHS, m_Zero()))
- return nullptr;
-
- const APInt *C1;
- if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
- return nullptr;
-
- V = CmpLHS;
- C1Log = C1->logBase2();
- IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
- } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
- IC->getPredicate() == ICmpInst::ICMP_SGT) {
- // We also need to recognize (icmp slt (trunc (X)), 0) and
- // (icmp sgt (trunc (X)), -1).
- IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
- if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
- (!IsEqualZero && !match(CmpRHS, m_Zero())))
- return nullptr;
-
- if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
- return nullptr;
-
- C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
- NeedAnd = true;
- } else {
- return nullptr;
- }
-
- const APInt *C2;
- bool OrOnTrueVal = false;
- bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
- if (!OrOnFalseVal)
- OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
-
- if (!OrOnFalseVal && !OrOnTrueVal)
- return nullptr;
-
- Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
-
- unsigned C2Log = C2->logBase2();
-
- bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
- bool NeedShift = C1Log != C2Log;
- bool NeedZExtTrunc = Y->getType()->getScalarSizeInBits() !=
- V->getType()->getScalarSizeInBits();
-
- // Make sure we don't create more instructions than we save.
- Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
- if ((NeedShift + NeedXor + NeedZExtTrunc) >
- (IC->hasOneUse() + Or->hasOneUse()))
- return nullptr;
-
- if (NeedAnd) {
- // Insert the AND instruction on the input to the truncate.
- APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
- V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
- }
-
- if (C2Log > C1Log) {
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
- V = Builder.CreateShl(V, C2Log - C1Log);
- } else if (C1Log > C2Log) {
- V = Builder.CreateLShr(V, C1Log - C2Log);
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
- } else
- V = Builder.CreateZExtOrTrunc(V, Y->getType());
-
- if (NeedXor)
- V = Builder.CreateXor(V, *C2);
-
- return Builder.CreateOr(V, Y);
-}
-
-/// Canonicalize a set or clear of a masked set of constant bits to
-/// select-of-constants form.
-static Instruction *foldSetClearBits(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *T = Sel.getTrueValue();
- Value *F = Sel.getFalseValue();
- Type *Ty = Sel.getType();
- Value *X;
- const APInt *NotC, *C;
-
- // Cond ? (X & ~C) : (X | C) --> (X & ~C) | (Cond ? 0 : C)
- if (match(T, m_And(m_Value(X), m_APInt(NotC))) &&
- match(F, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
- Constant *Zero = ConstantInt::getNullValue(Ty);
- Constant *OrC = ConstantInt::get(Ty, *C);
- Value *NewSel = Builder.CreateSelect(Cond, Zero, OrC, "masksel", &Sel);
- return BinaryOperator::CreateOr(T, NewSel);
- }
-
- // Cond ? (X | C) : (X & ~C) --> (X & ~C) | (Cond ? C : 0)
- if (match(F, m_And(m_Value(X), m_APInt(NotC))) &&
- match(T, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
- Constant *Zero = ConstantInt::getNullValue(Ty);
- Constant *OrC = ConstantInt::get(Ty, *C);
- Value *NewSel = Builder.CreateSelect(Cond, OrC, Zero, "masksel", &Sel);
- return BinaryOperator::CreateOr(F, NewSel);
- }
-
- return nullptr;
-}
-
-/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
-/// There are 8 commuted/swapped variants of this pattern.
-/// TODO: Also support a - UMIN(a,b) patterns.
-static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
- const Value *TrueVal,
- const Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- if (!ICmpInst::isUnsigned(Pred))
- return nullptr;
-
- // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
- if (match(TrueVal, m_Zero())) {
- Pred = ICmpInst::getInversePredicate(Pred);
- std::swap(TrueVal, FalseVal);
- }
- if (!match(FalseVal, m_Zero()))
- return nullptr;
-
- Value *A = ICI->getOperand(0);
- Value *B = ICI->getOperand(1);
- if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
- // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
- std::swap(A, B);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
- "Unexpected isUnsigned predicate!");
-
- // Ensure the sub is of the form:
- // (a > b) ? a - b : 0 -> usub.sat(a, b)
- // (a > b) ? b - a : 0 -> -usub.sat(a, b)
- // Checking for both a-b and a+(-b) as a constant.
- bool IsNegative = false;
- const APInt *C;
- if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) ||
- (match(A, m_APInt(C)) &&
- match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C)))))
- IsNegative = true;
- else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) &&
- !(match(B, m_APInt(C)) &&
- match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C)))))
- return nullptr;
-
- // If we are adding a negate and the sub and icmp are used anywhere else, we
- // would end up with more instructions.
- if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse())
- return nullptr;
-
- // (a > b) ? a - b : 0 -> usub.sat(a, b)
- // (a > b) ? b - a : 0 -> -usub.sat(a, b)
- Value *Result = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, B);
- if (IsNegative)
- Result = Builder.CreateNeg(Result);
- return Result;
-}
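The pattern recognized above is exactly unsigned saturating subtraction. As a sanity check, the select form agrees with a wrap-detecting formulation of usub.sat on 32-bit values (illustrative sketch only):

#include <cassert>
#include <cstdint>

static uint32_t selForm(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }
static uint32_t satForm(uint32_t a, uint32_t b) {
  uint32_t d = a - b;          // wraps on underflow
  return d > a ? 0 : d;        // a wrapped result is detected and clamped to 0
}

int main() {
  const uint32_t vals[] = {0u, 1u, 2u, 3u, 0x7fffffffu, 0xfffffffeu, 0xffffffffu};
  for (uint32_t a : vals)
    for (uint32_t b : vals)
      assert(selForm(a, b) == satForm(a, b));
  return 0;
}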
-
-static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal,
- InstCombiner::BuilderTy &Builder) {
- if (!Cmp->hasOneUse())
- return nullptr;
-
- // Match unsigned saturated add with constant.
- Value *Cmp0 = Cmp->getOperand(0);
- Value *Cmp1 = Cmp->getOperand(1);
- ICmpInst::Predicate Pred = Cmp->getPredicate();
- Value *X;
- const APInt *C, *CmpC;
- if (Pred == ICmpInst::ICMP_ULT &&
- match(TVal, m_Add(m_Value(X), m_APInt(C))) && X == Cmp0 &&
- match(FVal, m_AllOnes()) && match(Cmp1, m_APInt(CmpC)) && *CmpC == ~*C) {
- // (X u< ~C) ? (X + C) : -1 --> uadd.sat(X, C)
- return Builder.CreateBinaryIntrinsic(
- Intrinsic::uadd_sat, X, ConstantInt::get(X->getType(), *C));
- }
-
- // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
- // There are 8 commuted variants.
- // Canonicalize -1 (saturated result) to true value of the select.
- if (match(FVal, m_AllOnes())) {
- std::swap(TVal, FVal);
- Pred = CmpInst::getInversePredicate(Pred);
- }
- if (!match(TVal, m_AllOnes()))
- return nullptr;
-
- // Canonicalize predicate to less-than or less-or-equal-than.
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) {
- std::swap(Cmp0, Cmp1);
- Pred = CmpInst::getSwappedPredicate(Pred);
- }
- if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_ULE)
- return nullptr;
-
- // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
- // Strictness of the comparison is irrelevant.
- Value *Y;
- if (match(Cmp0, m_Not(m_Value(X))) &&
- match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) {
- // (~X u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
- // (~X u< Y) ? -1 : (Y + X) --> uadd.sat(X, Y)
- return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y);
- }
- // The 'not' op may be included in the sum but not the compare.
- // Strictness of the comparison is irrelevant.
- X = Cmp0;
- Y = Cmp1;
- if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) {
- // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y)
- // (X u< Y) ? -1 : (Y + ~X) --> uadd.sat(Y, ~X)
- BinaryOperator *BO = cast<BinaryOperator>(FVal);
- return Builder.CreateBinaryIntrinsic(
- Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1));
- }
- // The overflow may be detected via the add wrapping round.
- // This is only valid for strict comparison!
- if (Pred == ICmpInst::ICMP_ULT &&
- match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) &&
- match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) {
- // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y)
- // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
- return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y);
- }
-
- return nullptr;
-}
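For the constant case above, 'X u< ~C' is precisely the condition under which X + C does not wrap, so the select is a saturating add. A small check against a wrap-detecting uadd.sat with C = 100 (names and constants are illustrative):

#include <cassert>
#include <cstdint>

// uadd.sat via overflow detection on 32-bit values.
static uint32_t uaddSat(uint32_t x, uint32_t c) {
  uint32_t s = x + c;          // wraps on overflow
  return s < x ? 0xffffffffu : s;
}

int main() {
  const uint32_t C = 100;
  const uint32_t vals[] = {0u, 1u, 0xffffff00u, 0xffffff9bu, 0xffffff9cu, 0xffffffffu};
  for (uint32_t x : vals)
    assert((x < ~C ? x + C : 0xffffffffu) == uaddSat(x, C));
  return 0;
}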
-
-/// Fold the following code sequence:
-/// \code
-/// int a = ctlz(x & -x);
-/// x ? 31 - a : a;
-/// \endcode
-///
-/// into:
-/// cttz(x)
-static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
- Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits();
- if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero()))
- return nullptr;
-
- if (ICI->getPredicate() == ICmpInst::ICMP_NE)
- std::swap(TrueVal, FalseVal);
-
- if (!match(FalseVal,
- m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1))))
- return nullptr;
-
- if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>()))
- return nullptr;
-
- Value *X = ICI->getOperand(0);
- auto *II = cast<IntrinsicInst>(TrueVal);
- if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X)))))
- return nullptr;
-
- Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz,
- II->getType());
- return CallInst::Create(F, {X, II->getArgOperand(1)});
-}
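The fold above rests on the identity that, for nonzero x, counting the leading zeros of the isolated lowest set bit and subtracting from 31 gives the trailing-zero count. A standalone check using the C++20 <bit> helpers (an assumption of this sketch; the pass itself works on the LLVM intrinsics):

#include <bit>       // std::countl_zero / std::countr_zero, C++20
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {1u, 2u, 3u, 8u, 0x00010000u, 0x80000000u, 0xdeadbeefu};
  for (uint32_t x : vals) {
    uint32_t lowbit = x & (0u - x);                    // isolate lowest set bit
    unsigned a = (unsigned)std::countl_zero(lowbit);   // ctlz(x & -x)
    assert(31u - a == (unsigned)std::countr_zero(x));  // == cttz(x)
  }
  return 0;
}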
-
-/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
-/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
-///
-/// For example, we can fold the following code sequence:
-/// \code
-/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
-/// %1 = icmp ne i32 %x, 0
-/// %2 = select i1 %1, i32 %0, i32 32
-/// \endcode
-///
-/// into:
-/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
-static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
- InstCombiner::BuilderTy &Builder) {
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *CmpLHS = ICI->getOperand(0);
- Value *CmpRHS = ICI->getOperand(1);
-
- // Check if the condition value compares a value for equality against zero.
- if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
- return nullptr;
-
- Value *SelectArg = FalseVal;
- Value *ValueOnZero = TrueVal;
- if (Pred == ICmpInst::ICMP_NE)
- std::swap(SelectArg, ValueOnZero);
-
- // Skip zero extend/truncate.
- Value *Count = nullptr;
- if (!match(SelectArg, m_ZExt(m_Value(Count))) &&
- !match(SelectArg, m_Trunc(m_Value(Count))))
- Count = SelectArg;
-
- // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
- // input to the cttz/ctlz is used as LHS for the compare instruction.
- if (!match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) &&
- !match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS))))
- return nullptr;
-
- IntrinsicInst *II = cast<IntrinsicInst>(Count);
-
- // Check if the value propagated on zero is a constant number equal to the
- // sizeof in bits of 'Count'.
- unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
- if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
- // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from
- // true to false on this flag, so we can replace it for all users.
- II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
- return SelectArg;
- }
-
- // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
- // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
- // not be used if the input is zero. Relax to 'undef_on_zero' for that case.
- if (II->hasOneUse() && SelectArg->hasOneUse() &&
- !match(II->getArgOperand(1), m_One()))
- II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
-
- return nullptr;
-}
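The same shape can be seen with C++20's std::countr_zero, which is already defined to return the bit width for a zero input: the explicit zero guard is redundant once the zero case is well defined, which is what clearing the flag achieves. Illustrative sketch only:

#include <bit>       // std::countr_zero, C++20: returns 32 for a zero uint32_t
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t vals[] = {0u, 1u, 6u, 0x40u, 0x80000000u};
  for (uint32_t x : vals) {
    unsigned guarded = (x != 0) ? (unsigned)std::countr_zero(x) : 32u;
    assert(guarded == (unsigned)std::countr_zero(x));
  }
  return 0;
}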
-
-/// Return true if we find and adjust an icmp+select pattern where the compare
-/// is with a constant that can be incremented or decremented to match the
-/// minimum or maximum idiom.
-static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
- ICmpInst::Predicate Pred = Cmp.getPredicate();
- Value *CmpLHS = Cmp.getOperand(0);
- Value *CmpRHS = Cmp.getOperand(1);
- Value *TrueVal = Sel.getTrueValue();
- Value *FalseVal = Sel.getFalseValue();
-
- // We may move or edit the compare, so make sure the select is the only user.
- const APInt *CmpC;
- if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
- return false;
-
- // These transforms only work for selects of integers or vector selects of
- // integer vectors.
- Type *SelTy = Sel.getType();
- auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
- if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
- return false;
-
- Constant *AdjustedRHS;
- if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
- AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
- else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
- AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
- else
- return false;
-
- // X > C ? X : C+1 --> X < C+1 ? C+1 : X
- // X < C ? X : C-1 --> X > C-1 ? C-1 : X
- if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
- (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
- ; // Nothing to do here. Values match without any sign/zero extension.
- }
- // Types do not match. Instead of calculating this with mixed types, promote
- // all to the larger type. This enables scalar evolution to analyze this
- // expression.
- else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
- Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
-
- // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
- // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
- // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
- // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
- if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = SextRHS;
- } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
- SextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = SextRHS;
- } else if (Cmp.isUnsigned()) {
- Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
- // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
- // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
- // zext + signed compare cannot be changed:
- // 0xff <s 0x00, but 0x00ff >s 0x0000
- if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
- CmpLHS = TrueVal;
- AdjustedRHS = ZextRHS;
- } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
- ZextRHS == TrueVal) {
- CmpLHS = FalseVal;
- AdjustedRHS = ZextRHS;
- } else {
- return false;
- }
- } else {
- return false;
- }
- } else {
- return false;
- }
-
- Pred = ICmpInst::getSwappedPredicate(Pred);
- CmpRHS = AdjustedRHS;
- std::swap(FalseVal, TrueVal);
- Cmp.setPredicate(Pred);
- Cmp.setOperand(0, CmpLHS);
- Cmp.setOperand(1, CmpRHS);
- Sel.setOperand(1, TrueVal);
- Sel.setOperand(2, FalseVal);
- Sel.swapProfMetadata();
-
- // Move the compare instruction right before the select instruction. Otherwise
- // the sext/zext value may be defined after the compare instruction uses it.
- Cmp.moveBefore(&Sel);
-
- return true;
-}
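The adjustment above relies on both forms computing the same max/min; for example, as long as C + 1 does not wrap, 'X > C ? X : C+1' and 'X < C+1 ? C+1 : X' are both max(X, C+1). A quick standalone check over a small signed range (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (int32_t c = -4; c <= 4; ++c)
    for (int32_t x = -8; x <= 8; ++x) {
      // X > C ? X : C+1   <-->   X < C+1 ? C+1 : X
      int32_t lhs = x > c ? x : c + 1;
      int32_t rhs = x < c + 1 ? c + 1 : x;
      assert(lhs == rhs);
    }
  return 0;
}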
-
-/// If this is an integer min/max (icmp + select) with a constant operand,
-/// create the canonical icmp for the min/max operation and canonicalize the
-/// constant to the 'false' operand of the select:
-/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
-/// Note: if C1 != C2, this will change the icmp constant to the existing
-/// constant operand of the select.
+ Value *NewSel = Builder.CreateSelect(SI.getCondition(), C, OOp);
+ NewSel->takeName(FVI);
+ BinaryOperator *BO = BinaryOperator::Create(FVI->getOpcode(),
+ TrueVal, NewSel);
+ BO->copyIRFlags(FVI);
+ return BO;
+ }
+ }
+ }
+ }
+ }
+
+ return nullptr;
+}
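The fold above substitutes the binop's identity constant on the arm where the binop disappears; for add that is 0. A small standalone check (illustrative names):

#include <cassert>
#include <cstdint>

// select c, (x + y), x   ==   x + (select c, y, 0)
int main() {
  for (int c = 0; c <= 1; ++c)
    for (int32_t x = -4; x <= 4; ++x)
      for (int32_t y = -4; y <= 4; ++y)
        assert((c ? x + y : x) == x + (c ? y : 0));
  return 0;
}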
+
+/// We want to turn:
+/// (select (icmp eq (and X, Y), 0), (and (lshr X, Z), 1), 1)
+/// into:
+/// zext (icmp ne i32 (and X, (or Y, (shl 1, Z))), 0)
+/// Note:
+/// Z may be 0 if lshr is missing.
+/// The worst-case scenario is that we will replace 5 instructions with 5
+/// different instructions, but we get rid of the select.
+static Instruction *foldSelectICmpAndAnd(Type *SelType, const ICmpInst *Cmp,
+ Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!(Cmp->hasOneUse() && Cmp->getOperand(0)->hasOneUse() &&
+ Cmp->getPredicate() == ICmpInst::ICMP_EQ &&
+ match(Cmp->getOperand(1), m_Zero()) && match(FVal, m_One())))
+ return nullptr;
+
+  // The TrueVal has the general form: and %B, 1
+ Value *B;
+ if (!match(TVal, m_OneUse(m_And(m_Value(B), m_One()))))
+ return nullptr;
+
+ // Where %B may be optionally shifted: lshr %X, %Z.
+ Value *X, *Z;
+ const bool HasShift = match(B, m_OneUse(m_LShr(m_Value(X), m_Value(Z))));
+ if (!HasShift)
+ X = B;
+
+ Value *Y;
+ if (!match(Cmp->getOperand(0), m_c_And(m_Specific(X), m_Value(Y))))
+ return nullptr;
+
+ // ((X & Y) == 0) ? ((X >> Z) & 1) : 1 --> (X & (Y | (1 << Z))) != 0
+ // ((X & Y) == 0) ? (X & 1) : 1 --> (X & (Y | 1)) != 0
+ Constant *One = ConstantInt::get(SelType, 1);
+ Value *MaskB = HasShift ? Builder.CreateShl(One, Z) : One;
+ Value *FullMask = Builder.CreateOr(Y, MaskB);
+ Value *MaskedX = Builder.CreateAnd(X, FullMask);
+ Value *ICmpNeZero = Builder.CreateIsNotNull(MaskedX);
+ return new ZExtInst(ICmpNeZero, SelType);
+}
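The replacement above merges the two masks into one test; a brute-force check of the stated equivalence over small operands (illustrative only):

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x = 0; x < 16; ++x)
    for (uint32_t y = 0; y < 16; ++y)
      for (uint32_t z = 0; z < 4; ++z) {
        uint32_t sel = ((x & y) == 0) ? ((x >> z) & 1u) : 1u;
        uint32_t folded = ((x & (y | (1u << z))) != 0) ? 1u : 0u;
        assert(sel == folded);
      }
  return 0;
}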
+
+/// We want to turn:
+/// (select (icmp sgt x, C), lshr (X, Y), ashr (X, Y)); iff C s>= -1
+/// (select (icmp slt x, C), ashr (X, Y), lshr (X, Y)); iff C s>= 0
+/// into:
+/// ashr (X, Y)
+static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = IC->getPredicate();
+ Value *CmpLHS = IC->getOperand(0);
+ Value *CmpRHS = IC->getOperand(1);
+ if (!CmpRHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ Value *X, *Y;
+ unsigned Bitwidth = CmpRHS->getType()->getScalarSizeInBits();
+ if ((Pred != ICmpInst::ICMP_SGT ||
+ !match(CmpRHS,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, -1)))) &&
+ (Pred != ICmpInst::ICMP_SLT ||
+ !match(CmpRHS,
+ m_SpecificInt_ICMP(ICmpInst::ICMP_SGE, APInt(Bitwidth, 0)))))
+ return nullptr;
+
+ // Canonicalize so that ashr is in FalseVal.
+ if (Pred == ICmpInst::ICMP_SLT)
+ std::swap(TrueVal, FalseVal);
+
+ if (match(TrueVal, m_LShr(m_Value(X), m_Value(Y))) &&
+ match(FalseVal, m_AShr(m_Specific(X), m_Specific(Y))) &&
+ match(CmpLHS, m_Specific(X))) {
+ const auto *Ashr = cast<Instruction>(FalseVal);
+ // if lshr is not exact and ashr is, this new ashr must not be exact.
+ bool IsExact = Ashr->isExact() && cast<Instruction>(TrueVal)->isExact();
+ return Builder.CreateAShr(X, Y, IC->getName(), IsExact);
+ }
+
+ return nullptr;
+}
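The fold is valid because a logical and an arithmetic right shift agree whenever the shifted value is non-negative, so the select always produces the ashr result. A standalone check (note: signed >> is guaranteed arithmetic only from C++20; earlier standards leave it implementation-defined, so treat this as a sketch):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t vals[] = {-7, -1, 0, 1, 5, 1 << 30};
  for (int32_t x : vals)
    for (unsigned y = 0; y < 8; ++y) {
      int32_t lshr = (int32_t)((uint32_t)x >> y);
      int32_t ashr = x >> y;
      int32_t sel = (x > -1) ? lshr : ashr;
      assert(sel == ashr);
    }
  return 0;
}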
+
+/// We want to turn:
+/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2))
+/// into:
+/// (or (shl (and X, C1), C3), Y)
+/// iff:
+/// C1 and C2 are both powers of 2
+/// where:
+/// C3 = Log(C2) - Log(C1)
+///
+/// This transform handles cases where:
+/// 1. The icmp predicate is inverted
+/// 2. The select operands are reversed
+/// 3. The magnitude of C2 and C1 are flipped
+static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ // Only handle integer compares. Also, if this is a vector select, we need a
+ // vector compare.
+ if (!TrueVal->getType()->isIntOrIntVectorTy() ||
+ TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy())
+ return nullptr;
+
+ Value *CmpLHS = IC->getOperand(0);
+ Value *CmpRHS = IC->getOperand(1);
+
+ Value *V;
+ unsigned C1Log;
+ bool IsEqualZero;
+ bool NeedAnd = false;
+ if (IC->isEquality()) {
+ if (!match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ const APInt *C1;
+ if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1))))
+ return nullptr;
+
+ V = CmpLHS;
+ C1Log = C1->logBase2();
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_EQ;
+ } else if (IC->getPredicate() == ICmpInst::ICMP_SLT ||
+ IC->getPredicate() == ICmpInst::ICMP_SGT) {
+ // We also need to recognize (icmp slt (trunc (X)), 0) and
+ // (icmp sgt (trunc (X)), -1).
+ IsEqualZero = IC->getPredicate() == ICmpInst::ICMP_SGT;
+ if ((IsEqualZero && !match(CmpRHS, m_AllOnes())) ||
+ (!IsEqualZero && !match(CmpRHS, m_Zero())))
+ return nullptr;
+
+ if (!match(CmpLHS, m_OneUse(m_Trunc(m_Value(V)))))
+ return nullptr;
+
+ C1Log = CmpLHS->getType()->getScalarSizeInBits() - 1;
+ NeedAnd = true;
+ } else {
+ return nullptr;
+ }
+
+ const APInt *C2;
+ bool OrOnTrueVal = false;
+ bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2)));
+ if (!OrOnFalseVal)
+ OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2)));
+
+ if (!OrOnFalseVal && !OrOnTrueVal)
+ return nullptr;
+
+ Value *Y = OrOnFalseVal ? TrueVal : FalseVal;
+
+ unsigned C2Log = C2->logBase2();
+
+ bool NeedXor = (!IsEqualZero && OrOnFalseVal) || (IsEqualZero && OrOnTrueVal);
+ bool NeedShift = C1Log != C2Log;
+ bool NeedZExtTrunc = Y->getType()->getScalarSizeInBits() !=
+ V->getType()->getScalarSizeInBits();
+
+ // Make sure we don't create more instructions than we save.
+ Value *Or = OrOnFalseVal ? FalseVal : TrueVal;
+ if ((NeedShift + NeedXor + NeedZExtTrunc) >
+ (IC->hasOneUse() + Or->hasOneUse()))
+ return nullptr;
+
+ if (NeedAnd) {
+ // Insert the AND instruction on the input to the truncate.
+ APInt C1 = APInt::getOneBitSet(V->getType()->getScalarSizeInBits(), C1Log);
+ V = Builder.CreateAnd(V, ConstantInt::get(V->getType(), C1));
+ }
+
+ if (C2Log > C1Log) {
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ V = Builder.CreateShl(V, C2Log - C1Log);
+ } else if (C1Log > C2Log) {
+ V = Builder.CreateLShr(V, C1Log - C2Log);
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+ } else
+ V = Builder.CreateZExtOrTrunc(V, Y->getType());
+
+ if (NeedXor)
+ V = Builder.CreateXor(V, *C2);
+
+ return Builder.CreateOr(V, Y);
+}
+
+/// Canonicalize a set or clear of a masked set of constant bits to
+/// select-of-constants form.
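+///
+/// For example, with C = 8 (one possible instantiation of the folds below):
+///   Cond ? (X & ~8) : (X | 8) --> (X & ~8) | (Cond ? 0 : 8)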
+static Instruction *foldSetClearBits(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *T = Sel.getTrueValue();
+ Value *F = Sel.getFalseValue();
+ Type *Ty = Sel.getType();
+ Value *X;
+ const APInt *NotC, *C;
+
+ // Cond ? (X & ~C) : (X | C) --> (X & ~C) | (Cond ? 0 : C)
+ if (match(T, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(F, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, Zero, OrC, "masksel", &Sel);
+ return BinaryOperator::CreateOr(T, NewSel);
+ }
+
+ // Cond ? (X | C) : (X & ~C) --> (X & ~C) | (Cond ? C : 0)
+ if (match(F, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(T, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, OrC, Zero, "masksel", &Sel);
+ return BinaryOperator::CreateOr(F, NewSel);
+ }
+
+ return nullptr;
+}
+
+/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
+/// There are 8 commuted/swapped variants of this pattern.
+/// TODO: Also support a - UMIN(a,b) patterns.
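+///
+/// A minimal sketch of the basic case (%a and %b are placeholder operands):
+///   %cmp = icmp ugt i32 %a, %b
+///   %sub = sub i32 %a, %b
+///   %sel = select i1 %cmp, i32 %sub, i32 0
+/// becomes %sel = call i32 @llvm.usub.sat.i32(i32 %a, i32 %b).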
+static Value *canonicalizeSaturatedSubtract(const ICmpInst *ICI,
+ const Value *TrueVal,
+ const Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (!ICmpInst::isUnsigned(Pred))
+ return nullptr;
+
+ // (b > a) ? 0 : a - b -> (b <= a) ? a - b : 0
+ if (match(TrueVal, m_Zero())) {
+ Pred = ICmpInst::getInversePredicate(Pred);
+ std::swap(TrueVal, FalseVal);
+ }
+ if (!match(FalseVal, m_Zero()))
+ return nullptr;
+
+ Value *A = ICI->getOperand(0);
+ Value *B = ICI->getOperand(1);
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_ULT) {
+ // (b < a) ? a - b : 0 -> (a > b) ? a - b : 0
+ std::swap(A, B);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ assert((Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_UGT) &&
+ "Unexpected isUnsigned predicate!");
+
+ // Ensure the sub is of the form:
+ // (a > b) ? a - b : 0 -> usub.sat(a, b)
+ // (a > b) ? b - a : 0 -> -usub.sat(a, b)
+ // Check for both the a-b form and the a+(-b) form (when b is a constant).
+ bool IsNegative = false;
+ const APInt *C;
+ if (match(TrueVal, m_Sub(m_Specific(B), m_Specific(A))) ||
+ (match(A, m_APInt(C)) &&
+ match(TrueVal, m_Add(m_Specific(B), m_SpecificInt(-*C)))))
+ IsNegative = true;
+ else if (!match(TrueVal, m_Sub(m_Specific(A), m_Specific(B))) &&
+ !(match(B, m_APInt(C)) &&
+ match(TrueVal, m_Add(m_Specific(A), m_SpecificInt(-*C)))))
+ return nullptr;
+
+ // If we are adding a negate and the sub and icmp are used anywhere else, we
+ // would end up with more instructions.
+ if (IsNegative && !TrueVal->hasOneUse() && !ICI->hasOneUse())
+ return nullptr;
+
+ // (a > b) ? a - b : 0 -> usub.sat(a, b)
+ // (a > b) ? b - a : 0 -> -usub.sat(a, b)
+ Value *Result = Builder.CreateBinaryIntrinsic(Intrinsic::usub_sat, A, B);
+ if (IsNegative)
+ Result = Builder.CreateNeg(Result);
+ return Result;
+}
+
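+/// Try to match one of the unsigned saturated-add idioms handled below, e.g.
+/// (an illustrative i8 instance, since ~240 == 15):
+///   (X u< 15) ? (X + 240) : -1 --> uadd.sat(X, 240)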
+static Value *canonicalizeSaturatedAdd(ICmpInst *Cmp, Value *TVal, Value *FVal,
+ InstCombiner::BuilderTy &Builder) {
+ if (!Cmp->hasOneUse())
+ return nullptr;
+
+ // Match unsigned saturated add with constant.
+ Value *Cmp0 = Cmp->getOperand(0);
+ Value *Cmp1 = Cmp->getOperand(1);
+ ICmpInst::Predicate Pred = Cmp->getPredicate();
+ Value *X;
+ const APInt *C, *CmpC;
+ if (Pred == ICmpInst::ICMP_ULT &&
+ match(TVal, m_Add(m_Value(X), m_APInt(C))) && X == Cmp0 &&
+ match(FVal, m_AllOnes()) && match(Cmp1, m_APInt(CmpC)) && *CmpC == ~*C) {
+ // (X u< ~C) ? (X + C) : -1 --> uadd.sat(X, C)
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::uadd_sat, X, ConstantInt::get(X->getType(), *C));
+ }
+
+ // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
+ // There are 8 commuted variants.
+ // Canonicalize -1 (saturated result) to true value of the select.
+ if (match(FVal, m_AllOnes())) {
+ std::swap(TVal, FVal);
+ Pred = CmpInst::getInversePredicate(Pred);
+ }
+ if (!match(TVal, m_AllOnes()))
+ return nullptr;
+
+ // Canonicalize the predicate to less-than or less-than-or-equal.
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE) {
+ std::swap(Cmp0, Cmp1);
+ Pred = CmpInst::getSwappedPredicate(Pred);
+ }
+ if (Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_ULE)
+ return nullptr;
+
+ // Match unsigned saturated add of 2 variables with an unnecessary 'not'.
+ // Strictness of the comparison is irrelevant.
+ Value *Y;
+ if (match(Cmp0, m_Not(m_Value(X))) &&
+ match(FVal, m_c_Add(m_Specific(X), m_Value(Y))) && Y == Cmp1) {
+ // (~X u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ // (~X u< Y) ? -1 : (Y + X) --> uadd.sat(X, Y)
+ return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, X, Y);
+ }
+ // The 'not' op may be included in the sum but not the compare.
+ // Strictness of the comparison is irrelevant.
+ X = Cmp0;
+ Y = Cmp1;
+ if (match(FVal, m_c_Add(m_Not(m_Specific(X)), m_Specific(Y)))) {
+ // (X u< Y) ? -1 : (~X + Y) --> uadd.sat(~X, Y)
+ // (X u< Y) ? -1 : (Y + ~X) --> uadd.sat(Y, ~X)
+ BinaryOperator *BO = cast<BinaryOperator>(FVal);
+ return Builder.CreateBinaryIntrinsic(
+ Intrinsic::uadd_sat, BO->getOperand(0), BO->getOperand(1));
+ }
+ // The overflow may be detected via the add wrapping round.
+ // This is only valid for strict comparison!
+ if (Pred == ICmpInst::ICMP_ULT &&
+ match(Cmp0, m_c_Add(m_Specific(Cmp1), m_Value(Y))) &&
+ match(FVal, m_c_Add(m_Specific(Cmp1), m_Specific(Y)))) {
+ // ((X + Y) u< X) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ // ((X + Y) u< Y) ? -1 : (X + Y) --> uadd.sat(X, Y)
+ return Builder.CreateBinaryIntrinsic(Intrinsic::uadd_sat, Cmp1, Y);
+ }
+
+ return nullptr;
+}
+
+/// Fold the following code sequence:
+/// \code
+/// int a = ctlz(x & -x);
+/// x ? 31 - a : a;
+/// \endcode
+///
+/// into:
+/// cttz(x)
+static Instruction *foldSelectCtlzToCttz(ICmpInst *ICI, Value *TrueVal,
+ Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ unsigned BitWidth = TrueVal->getType()->getScalarSizeInBits();
+ if (!ICI->isEquality() || !match(ICI->getOperand(1), m_Zero()))
+ return nullptr;
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_NE)
+ std::swap(TrueVal, FalseVal);
+
+ if (!match(FalseVal,
+ m_Xor(m_Deferred(TrueVal), m_SpecificInt(BitWidth - 1))))
+ return nullptr;
+
+ if (!match(TrueVal, m_Intrinsic<Intrinsic::ctlz>()))
+ return nullptr;
+
+ Value *X = ICI->getOperand(0);
+ auto *II = cast<IntrinsicInst>(TrueVal);
+ if (!match(II->getOperand(0), m_c_And(m_Specific(X), m_Neg(m_Specific(X)))))
+ return nullptr;
+
+ Function *F = Intrinsic::getDeclaration(II->getModule(), Intrinsic::cttz,
+ II->getType());
+ return CallInst::Create(F, {X, II->getArgOperand(1)});
+}
+
+/// Attempt to fold a cttz/ctlz followed by a icmp plus select into a single
+/// call to cttz/ctlz with flag 'is_zero_undef' cleared.
+///
+/// For example, we can fold the following code sequence:
+/// \code
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true)
+/// %1 = icmp ne i32 %x, 0
+/// %2 = select i1 %1, i32 %0, i32 32
+/// \endcode
+///
+/// into:
+/// %0 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false)
+static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
+ InstCombiner::BuilderTy &Builder) {
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+
+ // Check if the condition value compares a value for equality against zero.
+ if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
+ return nullptr;
+
+ Value *SelectArg = FalseVal;
+ Value *ValueOnZero = TrueVal;
+ if (Pred == ICmpInst::ICMP_NE)
+ std::swap(SelectArg, ValueOnZero);
+
+ // Skip zero extend/truncate.
+ Value *Count = nullptr;
+ if (!match(SelectArg, m_ZExt(m_Value(Count))) &&
+ !match(SelectArg, m_Trunc(m_Value(Count))))
+ Count = SelectArg;
+
+ // Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
+ // input to the cttz/ctlz is used as LHS for the compare instruction.
+ if (!match(Count, m_Intrinsic<Intrinsic::cttz>(m_Specific(CmpLHS))) &&
+ !match(Count, m_Intrinsic<Intrinsic::ctlz>(m_Specific(CmpLHS))))
+ return nullptr;
+
+ IntrinsicInst *II = cast<IntrinsicInst>(Count);
+
+ // Check if the value propagated on zero is a constant equal to the size in
+ // bits of 'Count'.
+ unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
+ if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
+ // Explicitly clear the 'is_zero_undef' flag. It's always valid to go from
+ // true to false on this flag, so we can replace it for all users.
+ II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
+ return SelectArg;
+ }
+
+ // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
+ // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
+ // not be used if the input is zero. Relax to 'is_zero_undef' for that case.
+ if (II->hasOneUse() && SelectArg->hasOneUse() &&
+ !match(II->getArgOperand(1), m_One()))
+ II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
+
+ return nullptr;
+}
+
+/// Return true if we find and adjust an icmp+select pattern where the compare
+/// is with a constant that can be incremented or decremented to match the
+/// minimum or maximum idiom.
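+///
+/// For example (a sketch with a concrete constant):
+///   (X u> 5) ? X : 6 is rewritten in place to (X u< 6) ? 6 : X
+/// so the compare constant matches the select arm and the min/max idiom
+/// becomes explicit.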
+static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
+ ICmpInst::Predicate Pred = Cmp.getPredicate();
+ Value *CmpLHS = Cmp.getOperand(0);
+ Value *CmpRHS = Cmp.getOperand(1);
+ Value *TrueVal = Sel.getTrueValue();
+ Value *FalseVal = Sel.getFalseValue();
+
+ // We may move or edit the compare, so make sure the select is the only user.
+ const APInt *CmpC;
+ if (!Cmp.hasOneUse() || !match(CmpRHS, m_APInt(CmpC)))
+ return false;
+
+ // These transforms only work for selects of integers or vector selects of
+ // integer vectors.
+ Type *SelTy = Sel.getType();
+ auto *SelEltTy = dyn_cast<IntegerType>(SelTy->getScalarType());
+ if (!SelEltTy || SelTy->isVectorTy() != Cmp.getType()->isVectorTy())
+ return false;
+
+ Constant *AdjustedRHS;
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SGT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC + 1);
+ else if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SLT)
+ AdjustedRHS = ConstantInt::get(CmpRHS->getType(), *CmpC - 1);
+ else
+ return false;
+
+ // X > C ? X : C+1 --> X < C+1 ? C+1 : X
+ // X < C ? X : C-1 --> X > C-1 ? C-1 : X
+ if ((CmpLHS == TrueVal && AdjustedRHS == FalseVal) ||
+ (CmpLHS == FalseVal && AdjustedRHS == TrueVal)) {
+ ; // Nothing to do here. Values match without any sign/zero extension.
+ }
+ // Types do not match. Instead of calculating this with mixed types, promote
+ // all to the larger type. This enables scalar evolution to analyze this
+ // expression.
+ else if (CmpRHS->getType()->getScalarSizeInBits() < SelEltTy->getBitWidth()) {
+ Constant *SextRHS = ConstantExpr::getSExt(AdjustedRHS, SelTy);
+
+ // X = sext x; x >s c ? X : C+1 --> X = sext x; X <s C+1 ? C+1 : X
+ // X = sext x; x <s c ? X : C-1 --> X = sext x; X >s C-1 ? C-1 : X
+ // X = sext x; x >u c ? X : C+1 --> X = sext x; X <u C+1 ? C+1 : X
+ // X = sext x; x <u c ? X : C-1 --> X = sext x; X >u C-1 ? C-1 : X
+ if (match(TrueVal, m_SExt(m_Specific(CmpLHS))) && SextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = SextRHS;
+ } else if (match(FalseVal, m_SExt(m_Specific(CmpLHS))) &&
+ SextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = SextRHS;
+ } else if (Cmp.isUnsigned()) {
+ Constant *ZextRHS = ConstantExpr::getZExt(AdjustedRHS, SelTy);
+ // X = zext x; x >u c ? X : C+1 --> X = zext x; X <u C+1 ? C+1 : X
+ // X = zext x; x <u c ? X : C-1 --> X = zext x; X >u C-1 ? C-1 : X
+ // zext + signed compare cannot be changed:
+ // 0xff <s 0x00, but 0x00ff >s 0x0000
+ if (match(TrueVal, m_ZExt(m_Specific(CmpLHS))) && ZextRHS == FalseVal) {
+ CmpLHS = TrueVal;
+ AdjustedRHS = ZextRHS;
+ } else if (match(FalseVal, m_ZExt(m_Specific(CmpLHS))) &&
+ ZextRHS == TrueVal) {
+ CmpLHS = FalseVal;
+ AdjustedRHS = ZextRHS;
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else {
+ return false;
+ }
+
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ CmpRHS = AdjustedRHS;
+ std::swap(FalseVal, TrueVal);
+ Cmp.setPredicate(Pred);
+ Cmp.setOperand(0, CmpLHS);
+ Cmp.setOperand(1, CmpRHS);
+ Sel.setOperand(1, TrueVal);
+ Sel.setOperand(2, FalseVal);
+ Sel.swapProfMetadata();
+
+ // Move the compare instruction right before the select instruction. Otherwise
+ // the sext/zext value may be defined after the compare instruction uses it.
+ Cmp.moveBefore(&Sel);
+
+ return true;
+}
+
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
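+///
+/// For example, smax(X, 42) written as
+///   select (icmp slt X, 42), 42, X
+/// is canonicalized to
+///   select (icmp sgt X, 42), X, 42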
static Instruction *canonicalizeMinMaxWithConstant(SelectInst &Sel,
ICmpInst &Cmp,
InstCombinerImpl &IC) {
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
- return nullptr;
-
- // Canonicalize the compare predicate based on whether we have min or max.
- Value *LHS, *RHS;
- SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
- if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
- return nullptr;
-
- // Is this already canonical?
- ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
- if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
- Cmp.getPredicate() == CanonicalPred)
- return nullptr;
-
- // Bail out on unsimplified X-0 operand (due to some worklist management bug),
- // as this may cause an infinite combine loop. Let the sub be folded first.
- if (match(LHS, m_Sub(m_Value(), m_Zero())) ||
- match(RHS, m_Sub(m_Value(), m_Zero())))
- return nullptr;
-
- // Create the canonical compare and plug it into the select.
- IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS));
-
- // If the select operands did not change, we're done.
- if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
- return &Sel;
-
- // If we are swapping the select operands, swap the metadata too.
- assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
- "Unexpected results from matchSelectPattern");
- Sel.swapValues();
- Sel.swapProfMetadata();
- return &Sel;
-}
-
-static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ // Canonicalize the compare predicate based on whether we have min or max.
+ Value *LHS, *RHS;
+ SelectPatternResult SPR = matchSelectPattern(&Sel, LHS, RHS);
+ if (!SelectPatternResult::isMinOrMax(SPR.Flavor))
+ return nullptr;
+
+ // Is this already canonical?
+ ICmpInst::Predicate CanonicalPred = getMinMaxPred(SPR.Flavor);
+ if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+ Cmp.getPredicate() == CanonicalPred)
+ return nullptr;
+
+ // Bail out on unsimplified X-0 operand (due to some worklist management bug),
+ // as this may cause an infinite combine loop. Let the sub be folded first.
+ if (match(LHS, m_Sub(m_Value(), m_Zero())) ||
+ match(RHS, m_Sub(m_Value(), m_Zero())))
+ return nullptr;
+
+ // Create the canonical compare and plug it into the select.
+ IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS));
+
+ // If the select operands did not change, we're done.
+ if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+ return &Sel;
+
+ // If we are swapping the select operands, swap the metadata too.
+ assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+ "Unexpected results from matchSelectPattern");
+ Sel.swapValues();
+ Sel.swapProfMetadata();
+ return &Sel;
+}
+
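+/// Canonicalize abs/nabs select patterns to the llvm.abs intrinsic. As an
+/// illustrative sketch of the plain abs case (without nsw on the negation):
+///   (X s< 0) ? (0 - X) : X --> call @llvm.abs(X, i1 false)
+/// The negated-abs form additionally gets an explicit negation of the abs.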
+static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
InstCombinerImpl &IC) {
- if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
- return nullptr;
-
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
- if (SPF != SelectPatternFlavor::SPF_ABS &&
- SPF != SelectPatternFlavor::SPF_NABS)
- return nullptr;
-
+ if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
+ return nullptr;
+
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(&Sel, LHS, RHS).Flavor;
+ if (SPF != SelectPatternFlavor::SPF_ABS &&
+ SPF != SelectPatternFlavor::SPF_NABS)
+ return nullptr;
+
// Note that NSW flag can only be propagated for normal, non-negated abs!
bool IntMinIsPoison = SPF == SelectPatternFlavor::SPF_ABS &&
match(RHS, m_NSWNeg(m_Specific(LHS)));
@@ -1070,45 +1070,45 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
ConstantInt::get(Type::getInt1Ty(Sel.getContext()), IntMinIsPoison);
Instruction *Abs =
IC.Builder.CreateBinaryIntrinsic(Intrinsic::abs, LHS, IntMinIsPoisonC);
-
+
if (SPF == SelectPatternFlavor::SPF_NABS)
return BinaryOperator::CreateNeg(Abs); // Always without NSW flag!
-
+
return IC.replaceInstUsesWith(Sel, Abs);
-}
-
-/// If we have a select with an equality comparison, then we know the value in
-/// one of the arms of the select. See if substituting this value into an arm
-/// and simplifying the result yields the same value as the other arm.
-///
-/// To make this transform safe, we must drop poison-generating flags
-/// (nsw, etc) if we simplified to a binop because the select may be guarding
-/// that poison from propagating. If the existing binop already had no
-/// poison-generating flags, then this transform can be done by instsimplify.
-///
-/// Consider:
-/// %cmp = icmp eq i32 %x, 2147483647
-/// %add = add nsw i32 %x, 1
-/// %sel = select i1 %cmp, i32 -2147483648, i32 %add
-///
-/// We can't replace %sel with %add unless we strip away the flags.
-/// TODO: Wrapping flags could be preserved in some cases with better analysis.
+}
+
+/// If we have a select with an equality comparison, then we know the value in
+/// one of the arms of the select. See if substituting this value into an arm
+/// and simplifying the result yields the same value as the other arm.
+///
+/// To make this transform safe, we must drop poison-generating flags
+/// (nsw, etc) if we simplified to a binop because the select may be guarding
+/// that poison from propagating. If the existing binop already had no
+/// poison-generating flags, then this transform can be done by instsimplify.
+///
+/// Consider:
+/// %cmp = icmp eq i32 %x, 2147483647
+/// %add = add nsw i32 %x, 1
+/// %sel = select i1 %cmp, i32 -2147483648, i32 %add
+///
+/// We can't replace %sel with %add unless we strip away the flags.
+/// TODO: Wrapping flags could be preserved in some cases with better analysis.
Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
ICmpInst &Cmp) {
// Value equivalence substitution requires an all-or-nothing replacement.
// It does not make sense for a vector compare where each lane is chosen
// independently.
if (!Cmp.isEquality() || Cmp.getType()->isVectorTy())
- return nullptr;
-
- // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
- Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
+ return nullptr;
+
+ // Canonicalize the pattern to ICMP_EQ by swapping the select operands.
+ Value *TrueVal = Sel.getTrueValue(), *FalseVal = Sel.getFalseValue();
bool Swapped = false;
if (Cmp.getPredicate() == ICmpInst::ICMP_NE) {
- std::swap(TrueVal, FalseVal);
+ std::swap(TrueVal, FalseVal);
Swapped = true;
}
-
+
// In X == Y ? f(X) : Z, try to evaluate f(Y) and replace the operand.
// Make sure Y cannot be undef though, as we might pick different values for
// undef in the icmp and in f(Y). Additionally, take care to avoid replacing
@@ -1143,1145 +1143,1145 @@ Instruction *InstCombinerImpl::foldSelectValueEquivalence(SelectInst &Sel,
/* AllowRefinement */ true))
return replaceOperand(Sel, Swapped ? 2 : 1, V);
- auto *FalseInst = dyn_cast<Instruction>(FalseVal);
- if (!FalseInst)
- return nullptr;
-
- // InstSimplify already performed this fold if it was possible subject to
- // current poison-generating flags. Try the transform again with
- // poison-generating flags temporarily dropped.
+ auto *FalseInst = dyn_cast<Instruction>(FalseVal);
+ if (!FalseInst)
+ return nullptr;
+
+ // InstSimplify already performed this fold if it was possible subject to
+ // current poison-generating flags. Try the transform again with
+ // poison-generating flags temporarily dropped.
bool WasNUW = false, WasNSW = false, WasExact = false, WasInBounds = false;
- if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(FalseVal)) {
- WasNUW = OBO->hasNoUnsignedWrap();
- WasNSW = OBO->hasNoSignedWrap();
- FalseInst->setHasNoUnsignedWrap(false);
- FalseInst->setHasNoSignedWrap(false);
- }
- if (auto *PEO = dyn_cast<PossiblyExactOperator>(FalseVal)) {
- WasExact = PEO->isExact();
- FalseInst->setIsExact(false);
- }
+ if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(FalseVal)) {
+ WasNUW = OBO->hasNoUnsignedWrap();
+ WasNSW = OBO->hasNoSignedWrap();
+ FalseInst->setHasNoUnsignedWrap(false);
+ FalseInst->setHasNoSignedWrap(false);
+ }
+ if (auto *PEO = dyn_cast<PossiblyExactOperator>(FalseVal)) {
+ WasExact = PEO->isExact();
+ FalseInst->setIsExact(false);
+ }
if (auto *GEP = dyn_cast<GetElementPtrInst>(FalseVal)) {
WasInBounds = GEP->isInBounds();
GEP->setIsInBounds(false);
}
-
- // Try each equivalence substitution possibility.
- // We have an 'EQ' comparison, so the select's false value will propagate.
- // Example:
- // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1
+
+ // Try each equivalence substitution possibility.
+ // We have an 'EQ' comparison, so the select's false value will propagate.
+ // Example:
+ // (X == 42) ? 43 : (X + 1) --> (X == 42) ? (X + 1) : (X + 1) --> X + 1
if (SimplifyWithOpReplaced(FalseVal, CmpLHS, CmpRHS, SQ,
- /* AllowRefinement */ false) == TrueVal ||
+ /* AllowRefinement */ false) == TrueVal ||
SimplifyWithOpReplaced(FalseVal, CmpRHS, CmpLHS, SQ,
- /* AllowRefinement */ false) == TrueVal) {
+ /* AllowRefinement */ false) == TrueVal) {
return replaceInstUsesWith(Sel, FalseVal);
- }
-
- // Restore poison-generating flags if the transform did not apply.
- if (WasNUW)
- FalseInst->setHasNoUnsignedWrap();
- if (WasNSW)
- FalseInst->setHasNoSignedWrap();
- if (WasExact)
- FalseInst->setIsExact();
+ }
+
+ // Restore poison-generating flags if the transform did not apply.
+ if (WasNUW)
+ FalseInst->setHasNoUnsignedWrap();
+ if (WasNSW)
+ FalseInst->setHasNoSignedWrap();
+ if (WasExact)
+ FalseInst->setIsExact();
if (WasInBounds)
cast<GetElementPtrInst>(FalseInst)->setIsInBounds();
-
- return nullptr;
-}
-
-// See if this is a pattern like:
-// %old_cmp1 = icmp slt i32 %x, C2
-// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high
-// %old_x_offseted = add i32 %x, C1
-// %old_cmp0 = icmp ult i32 %old_x_offseted, C0
-// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement
-// This can be rewritten as more canonical pattern:
-// %new_cmp1 = icmp slt i32 %x, -C1
-// %new_cmp2 = icmp sge i32 %x, C0-C1
-// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x
-// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low
-// Iff -C1 s<= C2 s<= C0-C1
-// Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
-// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
-static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
- InstCombiner::BuilderTy &Builder) {
- Value *X = Sel0.getTrueValue();
- Value *Sel1 = Sel0.getFalseValue();
-
- // First match the condition of the outermost select.
- // Said condition must be one-use.
- if (!Cmp0.hasOneUse())
- return nullptr;
- Value *Cmp00 = Cmp0.getOperand(0);
- Constant *C0;
- if (!match(Cmp0.getOperand(1),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))
- return nullptr;
- // Canonicalize Cmp0 into the form we expect.
- // FIXME: we shouldn't care about lanes that are 'undef' in the end?
- switch (Cmp0.getPredicate()) {
- case ICmpInst::Predicate::ICMP_ULT:
- break; // Great!
- case ICmpInst::Predicate::ICMP_ULE:
- // We'd have to increment C0 by one, and for that it must not have all-ones
- // element, but then it would have been canonicalized to 'ult' before
- // we get here. So we can't do anything useful with 'ule'.
- return nullptr;
- case ICmpInst::Predicate::ICMP_UGT:
- // We want to canonicalize it to 'ult', so we'll need to increment C0,
- // which again means it must not have any all-ones elements.
- if (!match(C0,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
- APInt::getAllOnesValue(
- C0->getType()->getScalarSizeInBits()))))
- return nullptr; // Can't do, have all-ones element[s].
+
+ return nullptr;
+}
+
+// See if this is a pattern like:
+// %old_cmp1 = icmp slt i32 %x, C2
+// %old_replacement = select i1 %old_cmp1, i32 %target_low, i32 %target_high
+// %old_x_offseted = add i32 %x, C1
+// %old_cmp0 = icmp ult i32 %old_x_offseted, C0
+// %r = select i1 %old_cmp0, i32 %x, i32 %old_replacement
+// This can be rewritten as more canonical pattern:
+// %new_cmp1 = icmp slt i32 %x, -C1
+// %new_cmp2 = icmp sge i32 %x, C0-C1
+// %new_clamped_low = select i1 %new_cmp1, i32 %target_low, i32 %x
+// %r = select i1 %new_cmp2, i32 %target_high, i32 %new_clamped_low
+// Iff -C1 s<= C2 s<= C0-C1
+// Also ULT predicate can also be UGT iff C0 != -1 (+invert result)
+// SLT predicate can also be SGT iff C2 != INT_MAX (+invert res.)
+static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
+ InstCombiner::BuilderTy &Builder) {
+ Value *X = Sel0.getTrueValue();
+ Value *Sel1 = Sel0.getFalseValue();
+
+ // First match the condition of the outermost select.
+ // Said condition must be one-use.
+ if (!Cmp0.hasOneUse())
+ return nullptr;
+ Value *Cmp00 = Cmp0.getOperand(0);
+ Constant *C0;
+ if (!match(Cmp0.getOperand(1),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))
+ return nullptr;
+ // Canonicalize Cmp0 into the form we expect.
+ // FIXME: we shouldn't care about lanes that are 'undef' in the end?
+ switch (Cmp0.getPredicate()) {
+ case ICmpInst::Predicate::ICMP_ULT:
+ break; // Great!
+ case ICmpInst::Predicate::ICMP_ULE:
+ // We'd have to increment C0 by one, and for that it must not have all-ones
+ // element, but then it would have been canonicalized to 'ult' before
+ // we get here. So we can't do anything useful with 'ule'.
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_UGT:
+ // We want to canonicalize it to 'ult', so we'll need to increment C0,
+ // which again means it must not have any all-ones elements.
+ if (!match(C0,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
+ APInt::getAllOnesValue(
+ C0->getType()->getScalarSizeInBits()))))
+ return nullptr; // Can't do, have all-ones element[s].
C0 = InstCombiner::AddOne(C0);
- std::swap(X, Sel1);
- break;
- case ICmpInst::Predicate::ICMP_UGE:
- // The only way we'd get this predicate if this `icmp` has extra uses,
- // but then we won't be able to do this fold.
- return nullptr;
- default:
- return nullptr; // Unknown predicate.
- }
-
- // Now that we've canonicalized the ICmp, we know the X we expect;
- // the select in other hand should be one-use.
- if (!Sel1->hasOneUse())
- return nullptr;
-
- // We now can finish matching the condition of the outermost select:
- // it should either be the X itself, or an addition of some constant to X.
- Constant *C1;
- if (Cmp00 == X)
- C1 = ConstantInt::getNullValue(Sel0.getType());
- else if (!match(Cmp00,
- m_Add(m_Specific(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1)))))
- return nullptr;
-
- Value *Cmp1;
- ICmpInst::Predicate Pred1;
- Constant *C2;
- Value *ReplacementLow, *ReplacementHigh;
- if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow),
- m_Value(ReplacementHigh))) ||
- !match(Cmp1,
- m_ICmp(Pred1, m_Specific(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2)))))
- return nullptr;
-
- if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse()))
- return nullptr; // Not enough one-use instructions for the fold.
- // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of
- // two comparisons we'll need to build.
-
- // Canonicalize Cmp1 into the form we expect.
- // FIXME: we shouldn't care about lanes that are 'undef' in the end?
- switch (Pred1) {
- case ICmpInst::Predicate::ICMP_SLT:
- break;
- case ICmpInst::Predicate::ICMP_SLE:
- // We'd have to increment C2 by one, and for that it must not have signed
- // max element, but then it would have been canonicalized to 'slt' before
- // we get here. So we can't do anything useful with 'sle'.
- return nullptr;
- case ICmpInst::Predicate::ICMP_SGT:
- // We want to canonicalize it to 'slt', so we'll need to increment C2,
- // which again means it must not have any signed max elements.
- if (!match(C2,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
- APInt::getSignedMaxValue(
- C2->getType()->getScalarSizeInBits()))))
- return nullptr; // Can't do, have signed max element[s].
+ std::swap(X, Sel1);
+ break;
+ case ICmpInst::Predicate::ICMP_UGE:
+ // The only way we'd get this predicate is if this `icmp` has extra uses,
+ // but then we won't be able to do this fold.
+ return nullptr;
+ default:
+ return nullptr; // Unknown predicate.
+ }
+
+ // Now that we've canonicalized the ICmp, we know the X we expect;
+ // the select, on the other hand, should be one-use.
+ if (!Sel1->hasOneUse())
+ return nullptr;
+
+ // We now can finish matching the condition of the outermost select:
+ // it should either be the X itself, or an addition of some constant to X.
+ Constant *C1;
+ if (Cmp00 == X)
+ C1 = ConstantInt::getNullValue(Sel0.getType());
+ else if (!match(Cmp00,
+ m_Add(m_Specific(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C1)))))
+ return nullptr;
+
+ Value *Cmp1;
+ ICmpInst::Predicate Pred1;
+ Constant *C2;
+ Value *ReplacementLow, *ReplacementHigh;
+ if (!match(Sel1, m_Select(m_Value(Cmp1), m_Value(ReplacementLow),
+ m_Value(ReplacementHigh))) ||
+ !match(Cmp1,
+ m_ICmp(Pred1, m_Specific(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C2)))))
+ return nullptr;
+
+ if (!Cmp1->hasOneUse() && (Cmp00 == X || !Cmp00->hasOneUse()))
+ return nullptr; // Not enough one-use instructions for the fold.
+ // FIXME: this restriction could be relaxed if Cmp1 can be reused as one of
+ // two comparisons we'll need to build.
+
+ // Canonicalize Cmp1 into the form we expect.
+ // FIXME: we shouldn't care about lanes that are 'undef' in the end?
+ switch (Pred1) {
+ case ICmpInst::Predicate::ICMP_SLT:
+ break;
+ case ICmpInst::Predicate::ICMP_SLE:
+ // We'd have to increment C2 by one, and for that it must not have signed
+ // max element, but then it would have been canonicalized to 'slt' before
+ // we get here. So we can't do anything useful with 'sle'.
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_SGT:
+ // We want to canonicalize it to 'slt', so we'll need to increment C2,
+ // which again means it must not have any signed max elements.
+ if (!match(C2,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_NE,
+ APInt::getSignedMaxValue(
+ C2->getType()->getScalarSizeInBits()))))
+ return nullptr; // Can't do, have signed max element[s].
C2 = InstCombiner::AddOne(C2);
- LLVM_FALLTHROUGH;
- case ICmpInst::Predicate::ICMP_SGE:
- // Also non-canonical, but here we don't need to change C2,
- // so we don't have any restrictions on C2, so we can just handle it.
- std::swap(ReplacementLow, ReplacementHigh);
- break;
- default:
- return nullptr; // Unknown predicate.
- }
-
- // The thresholds of this clamp-like pattern.
- auto *ThresholdLowIncl = ConstantExpr::getNeg(C1);
- auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1);
-
- // The fold has a precondition 1: C2 s>= ThresholdLow
- auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
- ThresholdLowIncl);
- if (!match(Precond1, m_One()))
- return nullptr;
- // The fold has a precondition 2: C2 s<= ThresholdHigh
- auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2,
- ThresholdHighExcl);
- if (!match(Precond2, m_One()))
- return nullptr;
-
- // All good, finally emit the new pattern.
- Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl);
- Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl);
- Value *MaybeReplacedLow =
- Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X);
- Instruction *MaybeReplacedHigh =
- SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
-
- return MaybeReplacedHigh;
-}
-
-// If we have
-// %cmp = icmp [canonical predicate] i32 %x, C0
-// %r = select i1 %cmp, i32 %y, i32 C1
-// Where C0 != C1 and %x may be different from %y, see if the constant that we
-// will have if we flip the strictness of the predicate (i.e. without changing
-// the result) is identical to the C1 in select. If it matches we can change
-// original comparison to one with swapped predicate, reuse the constant,
-// and swap the hands of select.
-static Instruction *
-tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
+ LLVM_FALLTHROUGH;
+ case ICmpInst::Predicate::ICMP_SGE:
+ // Also non-canonical, but here we don't need to change C2,
+ // so we don't have any restrictions on C2, so we can just handle it.
+ std::swap(ReplacementLow, ReplacementHigh);
+ break;
+ default:
+ return nullptr; // Unknown predicate.
+ }
+
+ // The thresholds of this clamp-like pattern.
+ auto *ThresholdLowIncl = ConstantExpr::getNeg(C1);
+ auto *ThresholdHighExcl = ConstantExpr::getSub(C0, C1);
+
+ // The fold has a precondition 1: C2 s>= ThresholdLow
+ auto *Precond1 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SGE, C2,
+ ThresholdLowIncl);
+ if (!match(Precond1, m_One()))
+ return nullptr;
+ // The fold has a precondition 2: C2 s<= ThresholdHigh
+ auto *Precond2 = ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_SLE, C2,
+ ThresholdHighExcl);
+ if (!match(Precond2, m_One()))
+ return nullptr;
+
+ // All good, finally emit the new pattern.
+ Value *ShouldReplaceLow = Builder.CreateICmpSLT(X, ThresholdLowIncl);
+ Value *ShouldReplaceHigh = Builder.CreateICmpSGE(X, ThresholdHighExcl);
+ Value *MaybeReplacedLow =
+ Builder.CreateSelect(ShouldReplaceLow, ReplacementLow, X);
+ Instruction *MaybeReplacedHigh =
+ SelectInst::Create(ShouldReplaceHigh, ReplacementHigh, MaybeReplacedLow);
+
+ return MaybeReplacedHigh;
+}
+
+// If we have
+// %cmp = icmp [canonical predicate] i32 %x, C0
+// %r = select i1 %cmp, i32 %y, i32 C1
+// where C0 != C1 and %x may be different from %y, see if the constant that we
+// would have if we flipped the strictness of the predicate (i.e. without
+// changing the result) is identical to C1 in the select. If it matches, we can
+// change the original comparison to one with the swapped predicate, reuse the
+// constant, and swap the hands of the select.
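+//
+// A concrete sketch (names and constants are illustrative):
+//   %cmp = icmp ult i32 %x, 8
+//   %r = select i1 %cmp, i32 %y, i32 7
+// can become
+//   %cmp.inv = icmp ugt i32 %x, 7
+//   %r = select i1 %cmp.inv, i32 7, i32 %y
+// because 'ult 8' with flipped strictness is 'ule 7', whose constant 7 already
+// appears in the select.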
+static Instruction *
+tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
InstCombinerImpl &IC) {
- ICmpInst::Predicate Pred;
- Value *X;
- Constant *C0;
- if (!match(&Cmp, m_OneUse(m_ICmp(
- Pred, m_Value(X),
- m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))))
- return nullptr;
-
- // If comparison predicate is non-relational, we won't be able to do anything.
- if (ICmpInst::isEquality(Pred))
- return nullptr;
-
- // If comparison predicate is non-canonical, then we certainly won't be able
- // to make it canonical; canonicalizeCmpWithConstant() already tried.
+ ICmpInst::Predicate Pred;
+ Value *X;
+ Constant *C0;
+ if (!match(&Cmp, m_OneUse(m_ICmp(
+ Pred, m_Value(X),
+ m_CombineAnd(m_AnyIntegralConstant(), m_Constant(C0))))))
+ return nullptr;
+
+ // If the comparison predicate is non-relational, we can't do anything here.
+ if (ICmpInst::isEquality(Pred))
+ return nullptr;
+
+ // If the comparison predicate is non-canonical, then we certainly won't be
+ // able to make it canonical; canonicalizeCmpWithConstant() already tried.
if (!InstCombiner::isCanonicalPredicate(Pred))
- return nullptr;
-
- // If the [input] type of comparison and select type are different, lets abort
- // for now. We could try to compare constants with trunc/[zs]ext though.
- if (C0->getType() != Sel.getType())
- return nullptr;
-
- // FIXME: are there any magic icmp predicate+constant pairs we must not touch?
-
- Value *SelVal0, *SelVal1; // We do not care which one is from where.
- match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1)));
- // At least one of these values we are selecting between must be a constant
- // else we'll never succeed.
- if (!match(SelVal0, m_AnyIntegralConstant()) &&
- !match(SelVal1, m_AnyIntegralConstant()))
- return nullptr;
-
- // Does this constant C match any of the `select` values?
- auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) {
- return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1);
- };
-
- // If C0 *already* matches true/false value of select, we are done.
- if (MatchesSelectValue(C0))
- return nullptr;
-
- // Check the constant we'd have with flipped-strictness predicate.
+ return nullptr;
+
+ // If the [input] type of the comparison and the select type differ, let's abort
+ // for now. We could try to compare constants with trunc/[zs]ext though.
+ if (C0->getType() != Sel.getType())
+ return nullptr;
+
+ // FIXME: are there any magic icmp predicate+constant pairs we must not touch?
+
+ Value *SelVal0, *SelVal1; // We do not care which one is from where.
+ match(&Sel, m_Select(m_Value(), m_Value(SelVal0), m_Value(SelVal1)));
+ // At least one of these values we are selecting between must be a constant
+ // else we'll never succeed.
+ if (!match(SelVal0, m_AnyIntegralConstant()) &&
+ !match(SelVal1, m_AnyIntegralConstant()))
+ return nullptr;
+
+ // Does this constant C match any of the `select` values?
+ auto MatchesSelectValue = [SelVal0, SelVal1](Constant *C) {
+ return C->isElementWiseEqual(SelVal0) || C->isElementWiseEqual(SelVal1);
+ };
+
+ // If C0 *already* matches true/false value of select, we are done.
+ if (MatchesSelectValue(C0))
+ return nullptr;
+
+ // Check the constant we'd have with flipped-strictness predicate.
auto FlippedStrictness =
InstCombiner::getFlippedStrictnessPredicateAndConstant(Pred, C0);
- if (!FlippedStrictness)
- return nullptr;
-
- // If said constant doesn't match either, then there is no hope,
- if (!MatchesSelectValue(FlippedStrictness->second))
- return nullptr;
-
- // It matched! Lets insert the new comparison just before select.
- InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
- IC.Builder.SetInsertPoint(&Sel);
-
- Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped.
- Value *NewCmp = IC.Builder.CreateICmp(Pred, X, FlippedStrictness->second,
- Cmp.getName() + ".inv");
- IC.replaceOperand(Sel, 0, NewCmp);
- Sel.swapValues();
- Sel.swapProfMetadata();
-
- return &Sel;
-}
-
-/// Visit a SelectInst that has an ICmpInst as its first operand.
+ if (!FlippedStrictness)
+ return nullptr;
+
+ // If said constant doesn't match either, then there is no hope.
+ if (!MatchesSelectValue(FlippedStrictness->second))
+ return nullptr;
+
+ // It matched! Let's insert the new comparison just before the select.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&Sel);
+
+ Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped.
+ Value *NewCmp = IC.Builder.CreateICmp(Pred, X, FlippedStrictness->second,
+ Cmp.getName() + ".inv");
+ IC.replaceOperand(Sel, 0, NewCmp);
+ Sel.swapValues();
+ Sel.swapProfMetadata();
+
+ return &Sel;
+}
+
+/// Visit a SelectInst that has an ICmpInst as its first operand.
Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI,
ICmpInst *ICI) {
if (Instruction *NewSel = foldSelectValueEquivalence(SI, *ICI))
return NewSel;
-
- if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
- return NewSel;
-
- if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
- return NewAbs;
-
- if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
- return NewAbs;
-
- if (Instruction *NewSel =
- tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
- return NewSel;
-
- bool Changed = adjustMinMax(SI, *ICI);
-
- if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
- return replaceInstUsesWith(SI, V);
-
- // NOTE: if we wanted to, this is where to detect integer MIN/MAX
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *CmpLHS = ICI->getOperand(0);
- Value *CmpRHS = ICI->getOperand(1);
- if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
- if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
- // Transform (X == C) ? X : Y -> (X == C) ? C : Y
- SI.setOperand(1, CmpRHS);
- Changed = true;
- } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
- // Transform (X != C) ? Y : X -> (X != C) ? Y : C
- SI.setOperand(2, CmpRHS);
- Changed = true;
- }
- }
-
- // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
- // decomposeBitTestICmp() might help.
- {
- unsigned BitWidth =
- DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
- APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
- Value *X;
- const APInt *Y, *C;
- bool TrueWhenUnset;
- bool IsBitTest = false;
- if (ICmpInst::isEquality(Pred) &&
- match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
- match(CmpRHS, m_Zero())) {
- IsBitTest = true;
- TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
- } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = false;
- } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
- X = CmpLHS;
- Y = &MinSignedValue;
- IsBitTest = true;
- TrueWhenUnset = true;
- }
- if (IsBitTest) {
- Value *V = nullptr;
- // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
- if (TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
- else if (!TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateAnd(X, ~(*Y));
- // (X & Y) == 0 ? X ^ Y : X --> X | Y
- else if (TrueWhenUnset && FalseVal == X &&
- match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
- // (X & Y) != 0 ? X : X ^ Y --> X | Y
- else if (!TrueWhenUnset && TrueVal == X &&
- match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
- V = Builder.CreateOr(X, *Y);
-
- if (V)
- return replaceInstUsesWith(SI, V);
- }
- }
-
- if (Instruction *V =
- foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
- return V;
-
- if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder))
- return V;
-
- if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- if (Value *V = canonicalizeSaturatedAdd(ICI, TrueVal, FalseVal, Builder))
- return replaceInstUsesWith(SI, V);
-
- return Changed ? &SI : nullptr;
-}
-
-/// SI is a select whose condition is a PHI node (but the two may be in
-/// different blocks). See if the true/false values (V) are live in all of the
-/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
-///
-/// X = phi [ C1, BB1], [C2, BB2]
-/// Y = add
-/// Z = select X, Y, 0
-///
-/// because Y is not live in BB1/BB2.
-static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
- const SelectInst &SI) {
- // If the value is a non-instruction value like a constant or argument, it
- // can always be mapped.
- const Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return true;
-
- // If V is a PHI node defined in the same block as the condition PHI, we can
- // map the arguments.
- const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
-
- if (const PHINode *VP = dyn_cast<PHINode>(I))
- if (VP->getParent() == CondPHI->getParent())
- return true;
-
- // Otherwise, if the PHI and select are defined in the same block and if V is
- // defined in a different block, then we can transform it.
- if (SI.getParent() == CondPHI->getParent() &&
- I->getParent() != CondPHI->getParent())
- return true;
-
- // Otherwise we have a 'hard' case and we can't tell without doing more
- // detailed dominator based analysis, punt.
- return false;
-}
-
-/// We have an SPF (e.g. a min or max) of an SPF of the form:
-/// SPF2(SPF1(A, B), C)
+
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
+ return NewSel;
+
+ if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
+ return NewAbs;
+
+ if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
+ return NewAbs;
+
+ if (Instruction *NewSel =
+ tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
+ return NewSel;
+
+ bool Changed = adjustMinMax(SI, *ICI);
+
+ if (Value *V = foldSelectICmpAnd(SI, ICI, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ // NOTE: if we wanted to, this is where to detect integer MIN/MAX
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *CmpLHS = ICI->getOperand(0);
+ Value *CmpRHS = ICI->getOperand(1);
+ if (CmpRHS != CmpLHS && isa<Constant>(CmpRHS)) {
+ if (CmpLHS == TrueVal && Pred == ICmpInst::ICMP_EQ) {
+ // Transform (X == C) ? X : Y -> (X == C) ? C : Y
+ SI.setOperand(1, CmpRHS);
+ Changed = true;
+ } else if (CmpLHS == FalseVal && Pred == ICmpInst::ICMP_NE) {
+ // Transform (X != C) ? Y : X -> (X != C) ? Y : C
+ SI.setOperand(2, CmpRHS);
+ Changed = true;
+ }
+ }
+
+ // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring
+ // decomposeBitTestICmp() might help.
+ {
+ unsigned BitWidth =
+ DL.getTypeSizeInBits(TrueVal->getType()->getScalarType());
+ APInt MinSignedValue = APInt::getSignedMinValue(BitWidth);
+ Value *X;
+ const APInt *Y, *C;
+ bool TrueWhenUnset;
+ bool IsBitTest = false;
+ if (ICmpInst::isEquality(Pred) &&
+ match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) &&
+ match(CmpRHS, m_Zero())) {
+ IsBitTest = true;
+ TrueWhenUnset = Pred == ICmpInst::ICMP_EQ;
+ } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = false;
+ } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) {
+ X = CmpLHS;
+ Y = &MinSignedValue;
+ IsBitTest = true;
+ TrueWhenUnset = true;
+ }
+ if (IsBitTest) {
+ Value *V = nullptr;
+ // (X & Y) == 0 ? X : X ^ Y --> X & ~Y
+ if (TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) != 0 ? X ^ Y : X --> X & ~Y
+ else if (!TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateAnd(X, ~(*Y));
+ // (X & Y) == 0 ? X ^ Y : X --> X | Y
+ else if (TrueWhenUnset && FalseVal == X &&
+ match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+ // (X & Y) != 0 ? X : X ^ Y --> X | Y
+ else if (!TrueWhenUnset && TrueVal == X &&
+ match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C)
+ V = Builder.CreateOr(X, *Y);
+
+ if (V)
+ return replaceInstUsesWith(SI, V);
+ }
+ }
+
+ if (Instruction *V =
+ foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder))
+ return V;
+
+ if (Instruction *V = foldSelectCtlzToCttz(ICI, TrueVal, FalseVal, Builder))
+ return V;
+
+ if (Value *V = foldSelectICmpAndOr(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = foldSelectCttzCtlz(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = canonicalizeSaturatedSubtract(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ if (Value *V = canonicalizeSaturatedAdd(ICI, TrueVal, FalseVal, Builder))
+ return replaceInstUsesWith(SI, V);
+
+ return Changed ? &SI : nullptr;
+}
+
+/// SI is a select whose condition is a PHI node (but the two may be in
+/// different blocks). See if the true/false values (V) are live in all of the
+/// predecessor blocks of the PHI. For example, cases like this can't be mapped:
+///
+/// X = phi [ C1, BB1], [C2, BB2]
+/// Y = add
+/// Z = select X, Y, 0
+///
+/// because Y is not live in BB1/BB2.
+static bool canSelectOperandBeMappingIntoPredBlock(const Value *V,
+ const SelectInst &SI) {
+ // If the value is a non-instruction value like a constant or argument, it
+ // can always be mapped.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return true;
+
+ // If V is a PHI node defined in the same block as the condition PHI, we can
+ // map the arguments.
+ const PHINode *CondPHI = cast<PHINode>(SI.getCondition());
+
+ if (const PHINode *VP = dyn_cast<PHINode>(I))
+ if (VP->getParent() == CondPHI->getParent())
+ return true;
+
+ // Otherwise, if the PHI and select are defined in the same block and if V is
+ // defined in a different block, then we can transform it.
+ if (SI.getParent() == CondPHI->getParent() &&
+ I->getParent() != CondPHI->getParent())
+ return true;
+
+ // Otherwise we have a 'hard' case and we can't tell without doing more
+ // detailed dominator-based analysis; punt.
+ return false;
+}
+
+/// We have an SPF (e.g. a min or max) of an SPF of the form:
+/// SPF2(SPF1(A, B), C)
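+///
+/// For example (illustrative instances of the folds below):
+///   smax(smax(A, B), B) --> smax(A, B)
+///   smin(smax(A, B), A) --> A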
Instruction *InstCombinerImpl::foldSPFofSPF(Instruction *Inner,
SelectPatternFlavor SPF1, Value *A,
Value *B, Instruction &Outer,
SelectPatternFlavor SPF2,
Value *C) {
- if (Outer.getType() != Inner->getType())
- return nullptr;
-
- if (C == A || C == B) {
- // MAX(MAX(A, B), B) -> MAX(A, B)
- // MIN(MIN(a, b), a) -> MIN(a, b)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
- return replaceInstUsesWith(Outer, Inner);
-
- // MAX(MIN(a, b), a) -> a
- // MIN(MAX(a, b), a) -> a
- // TODO: This could be done in instsimplify.
- if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
- (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
- (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
- (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
- return replaceInstUsesWith(Outer, C);
- }
-
- if (SPF1 == SPF2) {
- const APInt *CB, *CC;
- if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
- // MIN(MIN(A, 23), 97) -> MIN(A, 23)
- // MAX(MAX(A, 97), 23) -> MAX(A, 97)
- // TODO: This could be done in instsimplify.
- if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
- (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
- (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
- (SPF1 == SPF_SMAX && CB->sge(*CC)))
- return replaceInstUsesWith(Outer, Inner);
-
- // MIN(MIN(A, 97), 23) -> MIN(A, 23)
- // MAX(MAX(A, 23), 97) -> MAX(A, 97)
- if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
- (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
- (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
- (SPF1 == SPF_SMAX && CB->slt(*CC))) {
- Outer.replaceUsesOfWith(Inner, A);
- return &Outer;
- }
- }
- }
-
- // max(max(A, B), min(A, B)) --> max(A, B)
- // min(min(A, B), max(A, B)) --> min(A, B)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 &&
- ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) ||
- (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B))))))
- return replaceInstUsesWith(Outer, Inner);
-
- // ABS(ABS(X)) -> ABS(X)
- // NABS(NABS(X)) -> NABS(X)
- // TODO: This could be done in instsimplify.
- if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
- return replaceInstUsesWith(Outer, Inner);
- }
-
- // ABS(NABS(X)) -> ABS(X)
- // NABS(ABS(X)) -> NABS(X)
- if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
- (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
- SelectInst *SI = cast<SelectInst>(Inner);
- Value *NewSI =
- Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
- SI->getTrueValue(), SI->getName(), SI);
- return replaceInstUsesWith(Outer, NewSI);
- }
-
- auto IsFreeOrProfitableToInvert =
- [&](Value *V, Value *&NotV, bool &ElidesXor) {
- if (match(V, m_Not(m_Value(NotV)))) {
- // If V has at most 2 uses then we can get rid of the xor operation
- // entirely.
- ElidesXor |= !V->hasNUsesOrMore(3);
- return true;
- }
-
- if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) {
- NotV = nullptr;
- return true;
- }
-
- return false;
- };
-
- Value *NotA, *NotB, *NotC;
- bool ElidesXor = false;
-
- // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
- // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C)
- // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C)
- // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C)
- //
- // This transform is performance neutral if we can elide at least one xor from
- // the set of three operands, since we'll be tacking on an xor at the very
- // end.
- if (SelectPatternResult::isMinOrMax(SPF1) &&
- SelectPatternResult::isMinOrMax(SPF2) &&
- IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
- IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
- IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
- if (!NotA)
- NotA = Builder.CreateNot(A);
- if (!NotB)
- NotB = Builder.CreateNot(B);
- if (!NotC)
- NotC = Builder.CreateNot(C);
-
- Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
- NotB);
- Value *NewOuter = Builder.CreateNot(
- createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
- return replaceInstUsesWith(Outer, NewOuter);
- }
-
- return nullptr;
-}
-
-/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
-/// This is even legal for FP.
-static Instruction *foldAddSubSelect(SelectInst &SI,
- InstCombiner::BuilderTy &Builder) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- auto *TI = dyn_cast<Instruction>(TrueVal);
- auto *FI = dyn_cast<Instruction>(FalseVal);
- if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
- return nullptr;
-
- Instruction *AddOp = nullptr, *SubOp = nullptr;
- if ((TI->getOpcode() == Instruction::Sub &&
- FI->getOpcode() == Instruction::Add) ||
- (TI->getOpcode() == Instruction::FSub &&
- FI->getOpcode() == Instruction::FAdd)) {
- AddOp = FI;
- SubOp = TI;
- } else if ((FI->getOpcode() == Instruction::Sub &&
- TI->getOpcode() == Instruction::Add) ||
- (FI->getOpcode() == Instruction::FSub &&
- TI->getOpcode() == Instruction::FAdd)) {
- AddOp = TI;
- SubOp = FI;
- }
-
- if (AddOp) {
- Value *OtherAddOp = nullptr;
- if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
- OtherAddOp = AddOp->getOperand(1);
- } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
- OtherAddOp = AddOp->getOperand(0);
- }
-
- if (OtherAddOp) {
- // So at this point we know we have (Y -> OtherAddOp):
- // select C, (add X, Y), (sub X, Z)
- Value *NegVal; // Compute -Z
- if (SI.getType()->isFPOrFPVectorTy()) {
- NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
- if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- NegInst->setFastMathFlags(Flags);
- }
- } else {
- NegVal = Builder.CreateNeg(SubOp->getOperand(1));
- }
-
- Value *NewTrueOp = OtherAddOp;
- Value *NewFalseOp = NegVal;
- if (AddOp != TI)
- std::swap(NewTrueOp, NewFalseOp);
- Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
- SI.getName() + ".p", &SI);
-
- if (SI.getType()->isFPOrFPVectorTy()) {
- Instruction *RI =
- BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
-
- FastMathFlags Flags = AddOp->getFastMathFlags();
- Flags &= SubOp->getFastMathFlags();
- RI->setFastMathFlags(Flags);
- return RI;
- } else
- return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
- }
- }
- return nullptr;
-}
-
-/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
-/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y
-/// Along with a number of patterns similar to:
-/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
-/// X - Y overflows ? (X > 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
-static Instruction *
-foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
-
- WithOverflowInst *II;
- if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) ||
- !match(FalseVal, m_ExtractValue<0>(m_Specific(II))))
- return nullptr;
-
- Value *X = II->getLHS();
- Value *Y = II->getRHS();
-
- auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) {
- Type *Ty = Limit->getType();
-
- ICmpInst::Predicate Pred;
- Value *TrueVal, *FalseVal, *Op;
- const APInt *C;
- if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)),
- m_Value(TrueVal), m_Value(FalseVal))))
- return false;
-
- auto IsZeroOrOne = [](const APInt &C) {
- return C.isNullValue() || C.isOneValue();
- };
- auto IsMinMax = [&](Value *Min, Value *Max) {
- APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
- APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits());
- return match(Min, m_SpecificInt(MinVal)) &&
- match(Max, m_SpecificInt(MaxVal));
- };
-
- if (Op != X && Op != Y)
- return false;
-
- if (IsAdd) {
- // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- } else {
- // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
- IsMinMax(FalseVal, TrueVal))
- return true;
- // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
- IsMinMax(TrueVal, FalseVal))
- return true;
- }
-
- return false;
- };
-
- Intrinsic::ID NewIntrinsicID;
- if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow &&
- match(TrueVal, m_AllOnes()))
- // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
- NewIntrinsicID = Intrinsic::uadd_sat;
- else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow &&
- match(TrueVal, m_Zero()))
- // X - Y overflows ? 0 : X - Y -> usub_sat X, Y
- NewIntrinsicID = Intrinsic::usub_sat;
- else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow &&
- IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true))
- // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
- NewIntrinsicID = Intrinsic::sadd_sat;
- else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow &&
- IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false))
- // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
- NewIntrinsicID = Intrinsic::ssub_sat;
- else
- return nullptr;
-
- Function *F =
- Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType());
- return CallInst::Create(F, {X, Y});
-}
-
+ if (Outer.getType() != Inner->getType())
+ return nullptr;
+
+ if (C == A || C == B) {
+ // MAX(MAX(A, B), B) -> MAX(A, B)
+ // MIN(MIN(a, b), a) -> MIN(a, b)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 && SelectPatternResult::isMinOrMax(SPF1))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MAX(MIN(a, b), a) -> a
+ // MIN(MAX(a, b), a) -> a
+ // TODO: This could be done in instsimplify.
+ if ((SPF1 == SPF_SMIN && SPF2 == SPF_SMAX) ||
+ (SPF1 == SPF_SMAX && SPF2 == SPF_SMIN) ||
+ (SPF1 == SPF_UMIN && SPF2 == SPF_UMAX) ||
+ (SPF1 == SPF_UMAX && SPF2 == SPF_UMIN))
+ return replaceInstUsesWith(Outer, C);
+ }
+
+ if (SPF1 == SPF2) {
+ const APInt *CB, *CC;
+ if (match(B, m_APInt(CB)) && match(C, m_APInt(CC))) {
+ // MIN(MIN(A, 23), 97) -> MIN(A, 23)
+ // MAX(MAX(A, 97), 23) -> MAX(A, 97)
+ // TODO: This could be done in instsimplify.
+ if ((SPF1 == SPF_UMIN && CB->ule(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sle(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->uge(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->sge(*CC)))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // MIN(MIN(A, 97), 23) -> MIN(A, 23)
+ // MAX(MAX(A, 23), 97) -> MAX(A, 97)
+ if ((SPF1 == SPF_UMIN && CB->ugt(*CC)) ||
+ (SPF1 == SPF_SMIN && CB->sgt(*CC)) ||
+ (SPF1 == SPF_UMAX && CB->ult(*CC)) ||
+ (SPF1 == SPF_SMAX && CB->slt(*CC))) {
+ Outer.replaceUsesOfWith(Inner, A);
+ return &Outer;
+ }
+ }
+ }
+
+ // max(max(A, B), min(A, B)) --> max(A, B)
+ // min(min(A, B), max(A, B)) --> min(A, B)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 &&
+ ((SPF1 == SPF_UMIN && match(C, m_c_UMax(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_SMIN && match(C, m_c_SMax(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_UMAX && match(C, m_c_UMin(m_Specific(A), m_Specific(B)))) ||
+ (SPF1 == SPF_SMAX && match(C, m_c_SMin(m_Specific(A), m_Specific(B))))))
+ return replaceInstUsesWith(Outer, Inner);
+
+ // ABS(ABS(X)) -> ABS(X)
+ // NABS(NABS(X)) -> NABS(X)
+ // TODO: This could be done in instsimplify.
+ if (SPF1 == SPF2 && (SPF1 == SPF_ABS || SPF1 == SPF_NABS)) {
+ return replaceInstUsesWith(Outer, Inner);
+ }
+
+ // ABS(NABS(X)) -> ABS(X)
+ // NABS(ABS(X)) -> NABS(X)
+ if ((SPF1 == SPF_ABS && SPF2 == SPF_NABS) ||
+ (SPF1 == SPF_NABS && SPF2 == SPF_ABS)) {
+ SelectInst *SI = cast<SelectInst>(Inner);
+ Value *NewSI =
+ Builder.CreateSelect(SI->getCondition(), SI->getFalseValue(),
+ SI->getTrueValue(), SI->getName(), SI);
+ return replaceInstUsesWith(Outer, NewSI);
+ }
+
+ auto IsFreeOrProfitableToInvert =
+ [&](Value *V, Value *&NotV, bool &ElidesXor) {
+ if (match(V, m_Not(m_Value(NotV)))) {
+ // If V has at most 2 uses then we can get rid of the xor operation
+ // entirely.
+ ElidesXor |= !V->hasNUsesOrMore(3);
+ return true;
+ }
+
+ if (isFreeToInvert(V, !V->hasNUsesOrMore(3))) {
+ NotV = nullptr;
+ return true;
+ }
+
+ return false;
+ };
+
+ Value *NotA, *NotB, *NotC;
+ bool ElidesXor = false;
+
+ // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
+ // MIN(MAX(~A, ~B), ~C) == ~MAX(MIN(A, B), C)
+ // MAX(MIN(~A, ~B), ~C) == ~MIN(MAX(A, B), C)
+ // MAX(MAX(~A, ~B), ~C) == ~MIN(MIN(A, B), C)
+ //
+ // This transform is performance neutral if we can elide at least one xor from
+ // the set of three operands, since we'll be tacking on an xor at the very
+ // end.
+ if (SelectPatternResult::isMinOrMax(SPF1) &&
+ SelectPatternResult::isMinOrMax(SPF2) &&
+ IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+ IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
+ IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
+ if (!NotA)
+ NotA = Builder.CreateNot(A);
+ if (!NotB)
+ NotB = Builder.CreateNot(B);
+ if (!NotC)
+ NotC = Builder.CreateNot(C);
+
+ Value *NewInner = createMinMax(Builder, getInverseMinMaxFlavor(SPF1), NotA,
+ NotB);
+ Value *NewOuter = Builder.CreateNot(
+ createMinMax(Builder, getInverseMinMaxFlavor(SPF2), NewInner, NotC));
+ return replaceInstUsesWith(Outer, NewOuter);
+ }
+
+ return nullptr;
+}
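+
+// A minimal standalone sketch (illustration only, not code from this file) of
+// the inversion identity used by the transform above, checked for unsigned
+// 8-bit values. The includes and the 'main' harness are assumptions added just
+// for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+int main() {
+  for (unsigned a = 0; a < 256; ++a)
+    for (unsigned b = 0; b < 256; b += 3)
+      for (unsigned c = 0; c < 256; c += 7) {
+        // MIN(MIN(~A, ~B), ~C) == ~MAX(MAX(A, B), C)
+        uint8_t L = std::min(std::min(uint8_t(~a), uint8_t(~b)), uint8_t(~c));
+        uint8_t R = uint8_t(~std::max(std::max(uint8_t(a), uint8_t(b)), uint8_t(c)));
+        assert(L == R);
+      }
+  return 0;
+}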
+
+/// Turn select C, (X + Y), (X - Y) --> (X + (select C, Y, (-Y))).
+/// This is even legal for FP.
+static Instruction *foldAddSubSelect(SelectInst &SI,
+ InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (!TI || !FI || !TI->hasOneUse() || !FI->hasOneUse())
+ return nullptr;
+
+ Instruction *AddOp = nullptr, *SubOp = nullptr;
+ if ((TI->getOpcode() == Instruction::Sub &&
+ FI->getOpcode() == Instruction::Add) ||
+ (TI->getOpcode() == Instruction::FSub &&
+ FI->getOpcode() == Instruction::FAdd)) {
+ AddOp = FI;
+ SubOp = TI;
+ } else if ((FI->getOpcode() == Instruction::Sub &&
+ TI->getOpcode() == Instruction::Add) ||
+ (FI->getOpcode() == Instruction::FSub &&
+ TI->getOpcode() == Instruction::FAdd)) {
+ AddOp = TI;
+ SubOp = FI;
+ }
+
+ if (AddOp) {
+ Value *OtherAddOp = nullptr;
+ if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+ OtherAddOp = AddOp->getOperand(1);
+ } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+ OtherAddOp = AddOp->getOperand(0);
+ }
+
+ if (OtherAddOp) {
+ // So at this point we know we have (Y -> OtherAddOp):
+ // select C, (add X, Y), (sub X, Z)
+ Value *NegVal; // Compute -Z
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ NegVal = Builder.CreateFNeg(SubOp->getOperand(1));
+ if (Instruction *NegInst = dyn_cast<Instruction>(NegVal)) {
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ NegInst->setFastMathFlags(Flags);
+ }
+ } else {
+ NegVal = Builder.CreateNeg(SubOp->getOperand(1));
+ }
+
+ Value *NewTrueOp = OtherAddOp;
+ Value *NewFalseOp = NegVal;
+ if (AddOp != TI)
+ std::swap(NewTrueOp, NewFalseOp);
+ Value *NewSel = Builder.CreateSelect(CondVal, NewTrueOp, NewFalseOp,
+ SI.getName() + ".p", &SI);
+
+ if (SI.getType()->isFPOrFPVectorTy()) {
+ Instruction *RI =
+ BinaryOperator::CreateFAdd(SubOp->getOperand(0), NewSel);
+
+ FastMathFlags Flags = AddOp->getFastMathFlags();
+ Flags &= SubOp->getFastMathFlags();
+ RI->setFastMathFlags(Flags);
+ return RI;
+ } else
+ return BinaryOperator::CreateAdd(SubOp->getOperand(0), NewSel);
+ }
+ }
+ return nullptr;
+}
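+
+// A small standalone sketch (illustration only, not code from this file) of the
+// integer identity behind foldAddSubSelect: select C, (X + Y), (X - Y) computes
+// the same value as X + (select C, Y, -Y). The helper names and the harness are
+// assumptions added for the example.
+#include <cassert>
+
+static int selectForm(bool c, int x, int y) { return c ? x + y : x - y; }
+static int foldedForm(bool c, int x, int y) { return x + (c ? y : -y); }
+
+int main() {
+  for (int x = -4; x <= 4; ++x)
+    for (int y = -4; y <= 4; ++y)
+      for (bool c : {false, true})
+        assert(selectForm(c, x, y) == foldedForm(c, x, y));
+  return 0;
+}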
+
+/// Turn X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
+/// And X - Y overflows ? 0 : X - Y -> usub_sat X, Y
+/// Along with a number of patterns similar to:
+/// X + Y overflows ? (X < 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+/// X - Y overflows ? (X > 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+static Instruction *
+foldOverflowingAddSubSelect(SelectInst &SI, InstCombiner::BuilderTy &Builder) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+
+ WithOverflowInst *II;
+ if (!match(CondVal, m_ExtractValue<1>(m_WithOverflowInst(II))) ||
+ !match(FalseVal, m_ExtractValue<0>(m_Specific(II))))
+ return nullptr;
+
+ Value *X = II->getLHS();
+ Value *Y = II->getRHS();
+
+ auto IsSignedSaturateLimit = [&](Value *Limit, bool IsAdd) {
+ Type *Ty = Limit->getType();
+
+ ICmpInst::Predicate Pred;
+ Value *TrueVal, *FalseVal, *Op;
+ const APInt *C;
+ if (!match(Limit, m_Select(m_ICmp(Pred, m_Value(Op), m_APInt(C)),
+ m_Value(TrueVal), m_Value(FalseVal))))
+ return false;
+
+ auto IsZeroOrOne = [](const APInt &C) {
+ return C.isNullValue() || C.isOneValue();
+ };
+ auto IsMinMax = [&](Value *Min, Value *Max) {
+ APInt MinVal = APInt::getSignedMinValue(Ty->getScalarSizeInBits());
+ APInt MaxVal = APInt::getSignedMaxValue(Ty->getScalarSizeInBits());
+ return match(Min, m_SpecificInt(MinVal)) &&
+ match(Max, m_SpecificInt(MaxVal));
+ };
+
+ if (Op != X && Op != Y)
+ return false;
+
+ if (IsAdd) {
+ // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ if (Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ if (Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ } else {
+ // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ if (Op == X && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ if (Op == X && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 2) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ if (Op == Y && Pred == ICmpInst::ICMP_SLT && IsZeroOrOne(*C) &&
+ IsMinMax(FalseVal, TrueVal))
+ return true;
+ // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ if (Op == Y && Pred == ICmpInst::ICMP_SGT && IsZeroOrOne(*C + 1) &&
+ IsMinMax(TrueVal, FalseVal))
+ return true;
+ }
+
+ return false;
+ };
+
+ Intrinsic::ID NewIntrinsicID;
+ if (II->getIntrinsicID() == Intrinsic::uadd_with_overflow &&
+ match(TrueVal, m_AllOnes()))
+ // X + Y overflows ? -1 : X + Y -> uadd_sat X, Y
+ NewIntrinsicID = Intrinsic::uadd_sat;
+ else if (II->getIntrinsicID() == Intrinsic::usub_with_overflow &&
+ match(TrueVal, m_Zero()))
+ // X - Y overflows ? 0 : X - Y -> usub_sat X, Y
+ NewIntrinsicID = Intrinsic::usub_sat;
+ else if (II->getIntrinsicID() == Intrinsic::sadd_with_overflow &&
+ IsSignedSaturateLimit(TrueVal, /*IsAdd=*/true))
+ // X + Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 0 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y <s 1 ? INTMIN : INTMAX) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s 0 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ // X + Y overflows ? (Y >s -1 ? INTMAX : INTMIN) : X + Y --> sadd_sat X, Y
+ NewIntrinsicID = Intrinsic::sadd_sat;
+ else if (II->getIntrinsicID() == Intrinsic::ssub_with_overflow &&
+ IsSignedSaturateLimit(TrueVal, /*IsAdd=*/false))
+ // X - Y overflows ? (X <s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X <s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (X >s -2 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 0 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y <s 1 ? INTMAX : INTMIN) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s 0 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ // X - Y overflows ? (Y >s -1 ? INTMIN : INTMAX) : X - Y --> ssub_sat X, Y
+ NewIntrinsicID = Intrinsic::ssub_sat;
+ else
+ return nullptr;
+
+ Function *F =
+ Intrinsic::getDeclaration(SI.getModule(), NewIntrinsicID, SI.getType());
+ return CallInst::Create(F, {X, Y});
+}
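+
+// A standalone sketch (illustration only) of the unsigned pattern recognized by
+// foldOverflowingAddSubSelect, using the GCC/Clang __builtin_add_overflow
+// builtin as a stand-in for llvm.uadd.with.overflow; the 8-bit helpers and the
+// harness are assumptions added for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint8_t selectForm(uint8_t x, uint8_t y) {
+  uint8_t Sum;
+  bool Ov = __builtin_add_overflow(x, y, &Sum);
+  return Ov ? uint8_t(255) : Sum; // X + Y overflows ? -1 : X + Y
+}
+
+static uint8_t uaddSat(uint8_t x, uint8_t y) {
+  unsigned Wide = unsigned(x) + unsigned(y);
+  return Wide > 255 ? uint8_t(255) : uint8_t(Wide); // uadd_sat X, Y
+}
+
+int main() {
+  for (unsigned x = 0; x < 256; ++x)
+    for (unsigned y = 0; y < 256; ++y)
+      assert(selectForm(uint8_t(x), uint8_t(y)) == uaddSat(uint8_t(x), uint8_t(y)));
+  return 0;
+}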
+
Instruction *InstCombinerImpl::foldSelectExtConst(SelectInst &Sel) {
- Constant *C;
- if (!match(Sel.getTrueValue(), m_Constant(C)) &&
- !match(Sel.getFalseValue(), m_Constant(C)))
- return nullptr;
-
- Instruction *ExtInst;
- if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
- !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
- return nullptr;
-
- auto ExtOpcode = ExtInst->getOpcode();
- if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
- return nullptr;
-
- // If we are extending from a boolean type or if we can create a select that
- // has the same size operands as its condition, try to narrow the select.
- Value *X = ExtInst->getOperand(0);
- Type *SmallType = X->getType();
- Value *Cond = Sel.getCondition();
- auto *Cmp = dyn_cast<CmpInst>(Cond);
- if (!SmallType->isIntOrIntVectorTy(1) &&
- (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
- return nullptr;
-
- // If the constant is the same after truncation to the smaller type and
- // extension to the original type, we can narrow the select.
- Type *SelType = Sel.getType();
- Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
- Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
- if (ExtC == C && ExtInst->hasOneUse()) {
- Value *TruncCVal = cast<Value>(TruncC);
- if (ExtInst == Sel.getFalseValue())
- std::swap(X, TruncCVal);
-
- // select Cond, (ext X), C --> ext(select Cond, X, C')
- // select Cond, C, (ext X) --> ext(select Cond, C', X)
- Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
- return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
- }
-
- // If one arm of the select is the extend of the condition, replace that arm
- // with the extension of the appropriate known bool value.
- if (Cond == X) {
- if (ExtInst == Sel.getTrueValue()) {
- // select X, (sext X), C --> select X, -1, C
- // select X, (zext X), C --> select X, 1, C
- Constant *One = ConstantInt::getTrue(SmallType);
- Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
- return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
- } else {
- // select X, C, (sext X) --> select X, C, 0
- // select X, C, (zext X) --> select X, C, 0
- Constant *Zero = ConstantInt::getNullValue(SelType);
- return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
- }
- }
-
- return nullptr;
-}
-
-/// Try to transform a vector select with a constant condition vector into a
-/// shuffle for easier combining with other shuffles and insert/extract.
-static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
- Value *CondVal = SI.getCondition();
- Constant *CondC;
+ Constant *C;
+ if (!match(Sel.getTrueValue(), m_Constant(C)) &&
+ !match(Sel.getFalseValue(), m_Constant(C)))
+ return nullptr;
+
+ Instruction *ExtInst;
+ if (!match(Sel.getTrueValue(), m_Instruction(ExtInst)) &&
+ !match(Sel.getFalseValue(), m_Instruction(ExtInst)))
+ return nullptr;
+
+ auto ExtOpcode = ExtInst->getOpcode();
+ if (ExtOpcode != Instruction::ZExt && ExtOpcode != Instruction::SExt)
+ return nullptr;
+
+ // If we are extending from a boolean type or if we can create a select that
+ // has the same size operands as its condition, try to narrow the select.
+ Value *X = ExtInst->getOperand(0);
+ Type *SmallType = X->getType();
+ Value *Cond = Sel.getCondition();
+ auto *Cmp = dyn_cast<CmpInst>(Cond);
+ if (!SmallType->isIntOrIntVectorTy(1) &&
+ (!Cmp || Cmp->getOperand(0)->getType() != SmallType))
+ return nullptr;
+
+ // If the constant is the same after truncation to the smaller type and
+ // extension to the original type, we can narrow the select.
+ Type *SelType = Sel.getType();
+ Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
+ Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
+ if (ExtC == C && ExtInst->hasOneUse()) {
+ Value *TruncCVal = cast<Value>(TruncC);
+ if (ExtInst == Sel.getFalseValue())
+ std::swap(X, TruncCVal);
+
+ // select Cond, (ext X), C --> ext(select Cond, X, C')
+ // select Cond, C, (ext X) --> ext(select Cond, C', X)
+ Value *NewSel = Builder.CreateSelect(Cond, X, TruncCVal, "narrow", &Sel);
+ return CastInst::Create(Instruction::CastOps(ExtOpcode), NewSel, SelType);
+ }
+
+ // If one arm of the select is the extend of the condition, replace that arm
+ // with the extension of the appropriate known bool value.
+ if (Cond == X) {
+ if (ExtInst == Sel.getTrueValue()) {
+ // select X, (sext X), C --> select X, -1, C
+ // select X, (zext X), C --> select X, 1, C
+ Constant *One = ConstantInt::getTrue(SmallType);
+ Constant *AllOnesOrOne = ConstantExpr::getCast(ExtOpcode, One, SelType);
+ return SelectInst::Create(Cond, AllOnesOrOne, C, "", nullptr, &Sel);
+ } else {
+ // select X, C, (sext X) --> select X, C, 0
+ // select X, C, (zext X) --> select X, C, 0
+ Constant *Zero = ConstantInt::getNullValue(SelType);
+ return SelectInst::Create(Cond, C, Zero, "", nullptr, &Sel);
+ }
+ }
+
+ return nullptr;
+}
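+
+// A standalone sketch (illustration only) of why the narrowing above is safe:
+// when the constant survives the trunc-then-extend round trip, selecting in the
+// narrow type and extending once gives the same result. The constant 200 and
+// the harness are assumptions chosen for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint32_t wideForm(bool Cond, uint8_t X) {
+  return Cond ? uint32_t(X) : uint32_t(200); // select Cond, (zext X), 200
+}
+
+static uint32_t narrowForm(bool Cond, uint8_t X) {
+  uint8_t TruncC = 200;               // zext(trunc(200)) == 200
+  return uint32_t(Cond ? X : TruncC); // zext(select Cond, X, C')
+}
+
+int main() {
+  for (unsigned X = 0; X < 256; ++X)
+    for (bool Cond : {false, true})
+      assert(wideForm(Cond, uint8_t(X)) == narrowForm(Cond, uint8_t(X)));
+  return 0;
+}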
+
+/// Try to transform a vector select with a constant condition vector into a
+/// shuffle for easier combining with other shuffles and insert/extract.
+static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
+ Value *CondVal = SI.getCondition();
+ Constant *CondC;
auto *CondValTy = dyn_cast<FixedVectorType>(CondVal->getType());
if (!CondValTy || !match(CondVal, m_Constant(CondC)))
- return nullptr;
-
+ return nullptr;
+
unsigned NumElts = CondValTy->getNumElements();
- SmallVector<int, 16> Mask;
- Mask.reserve(NumElts);
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = CondC->getAggregateElement(i);
- if (!Elt)
- return nullptr;
-
- if (Elt->isOneValue()) {
- // If the select condition element is true, choose from the 1st vector.
- Mask.push_back(i);
- } else if (Elt->isNullValue()) {
- // If the select condition element is false, choose from the 2nd vector.
- Mask.push_back(i + NumElts);
- } else if (isa<UndefValue>(Elt)) {
- // Undef in a select condition (choose one of the operands) does not mean
- // the same thing as undef in a shuffle mask (any value is acceptable), so
- // give up.
- return nullptr;
- } else {
- // Bail out on a constant expression.
- return nullptr;
- }
- }
-
- return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), Mask);
-}
-
-/// If we have a select of vectors with a scalar condition, try to convert that
-/// to a vector select by splatting the condition. A splat may get folded with
-/// other operations in IR and having all operands of a select be vector types
-/// is likely better for vector codegen.
+ SmallVector<int, 16> Mask;
+ Mask.reserve(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = CondC->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (Elt->isOneValue()) {
+ // If the select condition element is true, choose from the 1st vector.
+ Mask.push_back(i);
+ } else if (Elt->isNullValue()) {
+ // If the select condition element is false, choose from the 2nd vector.
+ Mask.push_back(i + NumElts);
+ } else if (isa<UndefValue>(Elt)) {
+ // Undef in a select condition (choose one of the operands) does not mean
+ // the same thing as undef in a shuffle mask (any value is acceptable), so
+ // give up.
+ return nullptr;
+ } else {
+ // Bail out on a constant expression.
+ return nullptr;
+ }
+ }
+
+ return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), Mask);
+}
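+
+// A standalone sketch (illustration only) of the equivalence above: a vector
+// select with a constant condition is a per-lane blend, i.e. a shuffle whose
+// mask picks lane i from the first operand or lane i + NumElts from the second.
+// The 4-lane arrays and the harness are assumptions added for the example.
+#include <array>
+#include <cassert>
+
+int main() {
+  std::array<int, 4> T{10, 11, 12, 13}, F{20, 21, 22, 23};
+  std::array<bool, 4> Cond{true, false, false, true}; // constant condition
+  int Mask[4] = {0, 5, 6, 3};                         // i, or i + NumElts for F
+  for (int i = 0; i < 4; ++i) {
+    int Sel = Cond[i] ? T[i] : F[i];
+    int Shuf = Mask[i] < 4 ? T[Mask[i]] : F[Mask[i] - 4];
+    assert(Sel == Shuf);
+  }
+  return 0;
+}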
+
+/// If we have a select of vectors with a scalar condition, try to convert that
+/// to a vector select by splatting the condition. A splat may get folded with
+/// other operations in IR and having all operands of a select be vector types
+/// is likely better for vector codegen.
static Instruction *canonicalizeScalarSelectOfVecs(SelectInst &Sel,
InstCombinerImpl &IC) {
- auto *Ty = dyn_cast<VectorType>(Sel.getType());
- if (!Ty)
- return nullptr;
-
- // We can replace a single-use extract with constant index.
- Value *Cond = Sel.getCondition();
- if (!match(Cond, m_OneUse(m_ExtractElt(m_Value(), m_ConstantInt()))))
- return nullptr;
-
- // select (extelt V, Index), T, F --> select (splat V, Index), T, F
- // Splatting the extracted condition reduces code (we could directly create a
- // splat shuffle of the source vector to eliminate the intermediate step).
+ auto *Ty = dyn_cast<VectorType>(Sel.getType());
+ if (!Ty)
+ return nullptr;
+
+ // We can replace a single-use extract with constant index.
+ Value *Cond = Sel.getCondition();
+ if (!match(Cond, m_OneUse(m_ExtractElt(m_Value(), m_ConstantInt()))))
+ return nullptr;
+
+ // select (extelt V, Index), T, F --> select (splat V, Index), T, F
+ // Splatting the extracted condition reduces code (we could directly create a
+ // splat shuffle of the source vector to eliminate the intermediate step).
return IC.replaceOperand(
Sel, 0, IC.Builder.CreateVectorSplat(Ty->getElementCount(), Cond));
-}
-
-/// Reuse bitcasted operands between a compare and select:
-/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
-/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
-static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
-
- CmpInst::Predicate Pred;
- Value *A, *B;
- if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
- return nullptr;
-
- // The select condition is a compare instruction. If the select's true/false
- // values are already the same as the compare operands, there's nothing to do.
- if (TVal == A || TVal == B || FVal == A || FVal == B)
- return nullptr;
-
- Value *C, *D;
- if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
- return nullptr;
-
- // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
- Value *TSrc, *FSrc;
- if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
- !match(FVal, m_BitCast(m_Value(FSrc))))
- return nullptr;
-
- // If the select true/false values are *different bitcasts* of the same source
- // operands, make the select operands the same as the compare operands and
- // cast the result. This is the canonical select form for min/max.
- Value *NewSel;
- if (TSrc == C && FSrc == D) {
- // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
- // bitcast (select (cmp A, B), A, B)
- NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
- } else if (TSrc == D && FSrc == C) {
- // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
- // bitcast (select (cmp A, B), B, A)
- NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
- } else {
- return nullptr;
- }
- return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
-}
-
-/// Try to eliminate select instructions that test the returned flag of cmpxchg
-/// instructions.
-///
-/// If a select instruction tests the returned flag of a cmpxchg instruction and
-/// selects between the returned value of the cmpxchg instruction its compare
-/// operand, the result of the select will always be equal to its false value.
-/// For example:
-///
-/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
-/// %1 = extractvalue { i64, i1 } %0, 1
-/// %2 = extractvalue { i64, i1 } %0, 0
-/// %3 = select i1 %1, i64 %compare, i64 %2
-/// ret i64 %3
-///
-/// The returned value of the cmpxchg instruction (%2) is the original value
-/// located at %ptr prior to any update. If the cmpxchg operation succeeds, %2
-/// must have been equal to %compare. Thus, the result of the select is always
-/// equal to %2, and the code can be simplified to:
-///
-/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
-/// %1 = extractvalue { i64, i1 } %0, 0
-/// ret i64 %1
-///
-static Value *foldSelectCmpXchg(SelectInst &SI) {
- // A helper that determines if V is an extractvalue instruction whose
- // aggregate operand is a cmpxchg instruction and whose single index is equal
- // to I. If such conditions are true, the helper returns the cmpxchg
- // instruction; otherwise, a nullptr is returned.
- auto isExtractFromCmpXchg = [](Value *V, unsigned I) -> AtomicCmpXchgInst * {
- auto *Extract = dyn_cast<ExtractValueInst>(V);
- if (!Extract)
- return nullptr;
- if (Extract->getIndices()[0] != I)
- return nullptr;
- return dyn_cast<AtomicCmpXchgInst>(Extract->getAggregateOperand());
- };
-
- // If the select has a single user, and this user is a select instruction that
- // we can simplify, skip the cmpxchg simplification for now.
- if (SI.hasOneUse())
- if (auto *Select = dyn_cast<SelectInst>(SI.user_back()))
- if (Select->getCondition() == SI.getCondition())
- if (Select->getFalseValue() == SI.getTrueValue() ||
- Select->getTrueValue() == SI.getFalseValue())
- return nullptr;
-
- // Ensure the select condition is the returned flag of a cmpxchg instruction.
- auto *CmpXchg = isExtractFromCmpXchg(SI.getCondition(), 1);
- if (!CmpXchg)
- return nullptr;
-
- // Check the true value case: The true value of the select is the returned
- // value of the same cmpxchg used by the condition, and the false value is the
- // cmpxchg instruction's compare operand.
- if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue())
- return SI.getFalseValue();
-
- // Check the false value case: The false value of the select is the returned
- // value of the same cmpxchg used by the condition, and the true value is the
- // cmpxchg instruction's compare operand.
- if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue())
- return SI.getFalseValue();
-
- return nullptr;
-}
-
-static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
- Value *Y,
- InstCombiner::BuilderTy &Builder) {
- assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern");
- bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN ||
- SPF == SelectPatternFlavor::SPF_UMAX;
- // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change
- // the constant value check to an assert.
- Value *A;
- const APInt *C1, *C2;
- if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) &&
- match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) {
- // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1
- // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1
- Value *NewMinMax = createMinMax(Builder, SPF, A,
- ConstantInt::get(X->getType(), *C2 - *C1));
- return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax,
- ConstantInt::get(X->getType(), *C1));
- }
-
- if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) &&
- match(Y, m_APInt(C2)) && X->hasNUses(2)) {
- bool Overflow;
- APInt Diff = C2->ssub_ov(*C1, Overflow);
- if (!Overflow) {
- // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1
- // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1
- Value *NewMinMax = createMinMax(Builder, SPF, A,
- ConstantInt::get(X->getType(), Diff));
- return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax,
- ConstantInt::get(X->getType(), *C1));
- }
- }
-
- return nullptr;
-}
-
-/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
+}
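+
+// A standalone sketch (illustration only): splatting a scalar condition across
+// all lanes does not change which operand each lane of the select takes, which
+// is why the condition above can be replaced by a vector splat. The harness is
+// an assumption added for the example.
+#include <array>
+#include <cassert>
+
+int main() {
+  std::array<int, 4> T{1, 2, 3, 4}, F{5, 6, 7, 8};
+  for (bool Cond : {false, true}) {
+    std::array<bool, 4> Splat;
+    Splat.fill(Cond); // select (splat V, Index), T, F
+    for (int i = 0; i < 4; ++i)
+      assert((Cond ? T[i] : F[i]) == (Splat[i] ? T[i] : F[i]));
+  }
+  return 0;
+}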
+
+/// Reuse bitcasted operands between a compare and select:
+/// select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+/// bitcast (select (cmp (bitcast C), (bitcast D)), (bitcast C), (bitcast D))
+static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+
+ CmpInst::Predicate Pred;
+ Value *A, *B;
+ if (!match(Cond, m_Cmp(Pred, m_Value(A), m_Value(B))))
+ return nullptr;
+
+ // The select condition is a compare instruction. If the select's true/false
+ // values are already the same as the compare operands, there's nothing to do.
+ if (TVal == A || TVal == B || FVal == A || FVal == B)
+ return nullptr;
+
+ Value *C, *D;
+ if (!match(A, m_BitCast(m_Value(C))) || !match(B, m_BitCast(m_Value(D))))
+ return nullptr;
+
+ // select (cmp (bitcast C), (bitcast D)), (bitcast TSrc), (bitcast FSrc)
+ Value *TSrc, *FSrc;
+ if (!match(TVal, m_BitCast(m_Value(TSrc))) ||
+ !match(FVal, m_BitCast(m_Value(FSrc))))
+ return nullptr;
+
+ // If the select true/false values are *different bitcasts* of the same source
+ // operands, make the select operands the same as the compare operands and
+ // cast the result. This is the canonical select form for min/max.
+ Value *NewSel;
+ if (TSrc == C && FSrc == D) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' C), (bitcast' D) -->
+ // bitcast (select (cmp A, B), A, B)
+ NewSel = Builder.CreateSelect(Cond, A, B, "", &Sel);
+ } else if (TSrc == D && FSrc == C) {
+ // select (cmp (bitcast C), (bitcast D)), (bitcast' D), (bitcast' C) -->
+ // bitcast (select (cmp A, B), B, A)
+ NewSel = Builder.CreateSelect(Cond, B, A, "", &Sel);
+ } else {
+ return nullptr;
+ }
+ return CastInst::CreateBitOrPointerCast(NewSel, Sel.getType());
+}
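+
+// A standalone sketch (illustration only, assumes C++20 std::bit_cast) of the
+// shape of the rewrite above: when both select arms are casts of the values
+// being compared, it is equivalent to select the original values and cast the
+// result once. The sample bit patterns and the harness are assumptions.
+#include <bit>
+#include <cassert>
+#include <cstdint>
+
+static float selectOfCasts(uint32_t C, uint32_t D) {
+  return C < D ? std::bit_cast<float>(C) : std::bit_cast<float>(D);
+}
+
+static float castOfSelect(uint32_t C, uint32_t D) {
+  return std::bit_cast<float>(C < D ? C : D);
+}
+
+int main() {
+  uint32_t Vals[] = {0x3f800000u, 0x40490fdbu, 0x00000001u}; // 1.0f, ~pi, denormal
+  for (uint32_t C : Vals)
+    for (uint32_t D : Vals)
+      assert(selectOfCasts(C, D) == castOfSelect(C, D));
+  return 0;
+}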
+
+/// Try to eliminate select instructions that test the returned flag of cmpxchg
+/// instructions.
+///
+/// If a select instruction tests the returned flag of a cmpxchg instruction and
+/// selects between the returned value of the cmpxchg instruction its compare
+/// operand, the result of the select will always be equal to its false value.
+/// For example:
+///
+/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
+/// %1 = extractvalue { i64, i1 } %0, 1
+/// %2 = extractvalue { i64, i1 } %0, 0
+/// %3 = select i1 %1, i64 %compare, i64 %2
+/// ret i64 %3
+///
+/// The returned value of the cmpxchg instruction (%2) is the original value
+/// located at %ptr prior to any update. If the cmpxchg operation succeeds, %2
+/// must have been equal to %compare. Thus, the result of the select is always
+/// equal to %2, and the code can be simplified to:
+///
+/// %0 = cmpxchg i64* %ptr, i64 %compare, i64 %new_value seq_cst seq_cst
+/// %1 = extractvalue { i64, i1 } %0, 0
+/// ret i64 %1
+///
+static Value *foldSelectCmpXchg(SelectInst &SI) {
+ // A helper that determines if V is an extractvalue instruction whose
+ // aggregate operand is a cmpxchg instruction and whose single index is equal
+ // to I. If such conditions are true, the helper returns the cmpxchg
+ // instruction; otherwise, a nullptr is returned.
+ auto isExtractFromCmpXchg = [](Value *V, unsigned I) -> AtomicCmpXchgInst * {
+ auto *Extract = dyn_cast<ExtractValueInst>(V);
+ if (!Extract)
+ return nullptr;
+ if (Extract->getIndices()[0] != I)
+ return nullptr;
+ return dyn_cast<AtomicCmpXchgInst>(Extract->getAggregateOperand());
+ };
+
+ // If the select has a single user, and this user is a select instruction that
+ // we can simplify, skip the cmpxchg simplification for now.
+ if (SI.hasOneUse())
+ if (auto *Select = dyn_cast<SelectInst>(SI.user_back()))
+ if (Select->getCondition() == SI.getCondition())
+ if (Select->getFalseValue() == SI.getTrueValue() ||
+ Select->getTrueValue() == SI.getFalseValue())
+ return nullptr;
+
+ // Ensure the select condition is the returned flag of a cmpxchg instruction.
+ auto *CmpXchg = isExtractFromCmpXchg(SI.getCondition(), 1);
+ if (!CmpXchg)
+ return nullptr;
+
+ // Check the true value case: The true value of the select is the returned
+ // value of the same cmpxchg used by the condition, and the false value is the
+ // cmpxchg instruction's compare operand.
+ if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0))
+ if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue())
+ return SI.getFalseValue();
+
+ // Check the false value case: The false value of the select is the returned
+ // value of the same cmpxchg used by the condition, and the true value is the
+ // cmpxchg instruction's compare operand.
+ if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0))
+ if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue())
+ return SI.getFalseValue();
+
+ return nullptr;
+}
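+
+// A standalone sketch (illustration only) of the reasoning above, written with
+// std::atomic: when the exchange succeeds the loaded value equals the compare
+// operand, so the select always yields its false arm. The helper and harness
+// are assumptions added for the example.
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+
+static int64_t cmpxchgSelect(std::atomic<int64_t> &Ptr, int64_t Cmp, int64_t New) {
+  int64_t Loaded = Cmp; // updated to the actual value on failure
+  bool Success = Ptr.compare_exchange_strong(Loaded, New);
+  int64_t Sel = Success ? Cmp : Loaded; // select i1 %1, i64 %compare, i64 %2
+  assert(Sel == Loaded);                // always equal to the false value
+  return Sel;
+}
+
+int main() {
+  std::atomic<int64_t> V{42};
+  assert(cmpxchgSelect(V, 42, 7) == 42); // success: old value returned
+  assert(cmpxchgSelect(V, 42, 9) == 7);  // failure: current value returned
+  return 0;
+}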
+
+static Instruction *moveAddAfterMinMax(SelectPatternFlavor SPF, Value *X,
+ Value *Y,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected min/max pattern");
+ bool IsUnsigned = SPF == SelectPatternFlavor::SPF_UMIN ||
+ SPF == SelectPatternFlavor::SPF_UMAX;
+ // TODO: If InstSimplify could fold all cases where C2 <= C1, we could change
+ // the constant value check to an assert.
+ Value *A;
+ const APInt *C1, *C2;
+ if (IsUnsigned && match(X, m_NUWAdd(m_Value(A), m_APInt(C1))) &&
+ match(Y, m_APInt(C2)) && C2->uge(*C1) && X->hasNUses(2)) {
+ // umin (add nuw A, C1), C2 --> add nuw (umin A, C2 - C1), C1
+ // umax (add nuw A, C1), C2 --> add nuw (umax A, C2 - C1), C1
+ Value *NewMinMax = createMinMax(Builder, SPF, A,
+ ConstantInt::get(X->getType(), *C2 - *C1));
+ return BinaryOperator::CreateNUW(BinaryOperator::Add, NewMinMax,
+ ConstantInt::get(X->getType(), *C1));
+ }
+
+ if (!IsUnsigned && match(X, m_NSWAdd(m_Value(A), m_APInt(C1))) &&
+ match(Y, m_APInt(C2)) && X->hasNUses(2)) {
+ bool Overflow;
+ APInt Diff = C2->ssub_ov(*C1, Overflow);
+ if (!Overflow) {
+ // smin (add nsw A, C1), C2 --> add nsw (smin A, C2 - C1), C1
+ // smax (add nsw A, C1), C2 --> add nsw (smax A, C2 - C1), C1
+ Value *NewMinMax = createMinMax(Builder, SPF, A,
+ ConstantInt::get(X->getType(), Diff));
+ return BinaryOperator::CreateNSW(BinaryOperator::Add, NewMinMax,
+ ConstantInt::get(X->getType(), *C1));
+ }
+ }
+
+ return nullptr;
+}
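+
+// A standalone sketch (illustration only) of the unsigned reassociation above,
+// under the preconditions the code checks: the add cannot wrap and C2 >= C1.
+// The 8-bit constants and the harness are assumptions added for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+int main() {
+  const uint8_t C1 = 10, C2 = 100;          // C2 >= C1
+  for (unsigned A = 0; A + C1 < 256; ++A) { // 'add nuw' precondition
+    uint8_t Before = std::min<uint8_t>(uint8_t(A + C1), C2);                    // umin (A + C1), C2
+    uint8_t After = uint8_t(std::min<uint8_t>(uint8_t(A), uint8_t(C2 - C1)) + C1); // (umin A, C2 - C1) + C1
+    assert(Before == After);
+  }
+  return 0;
+}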
+
+/// Match a sadd_sat or ssub_sat which is using min/max to clamp the value.
Instruction *InstCombinerImpl::matchSAddSubSat(SelectInst &MinMax1) {
- Type *Ty = MinMax1.getType();
-
- // We are looking for a tree of:
- // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B))))
- // Where the min and max could be reversed
- Instruction *MinMax2;
- BinaryOperator *AddSub;
- const APInt *MinValue, *MaxValue;
- if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) {
- if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue))))
- return nullptr;
- } else if (match(&MinMax1,
- m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) {
- if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue))))
- return nullptr;
- } else
- return nullptr;
-
- // Check that the constants clamp a saturate, and that the new type would be
- // sensible to convert to.
- if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1)
- return nullptr;
-  // In what bitwidth can this be treated as saturating arithmetic?
- unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1;
- // FIXME: This isn't quite right for vectors, but using the scalar type is a
- // good first approximation for what should be done there.
- if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
- return nullptr;
-
-  // Also make sure that the number of uses is as expected. The "3"s are for
-  // the two items of min/max (the compare and the select).
- if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
- return nullptr;
-
- // Create the new type (which can be a vector type)
- Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
- // Match the two extends from the add/sub
- Value *A, *B;
- if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
- return nullptr;
- // And check the incoming values are of a type smaller than or equal to the
- // size of the saturation. Otherwise the higher bits can cause different
- // results.
- if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
- B->getType()->getScalarSizeInBits() > NewBitWidth)
- return nullptr;
-
- Intrinsic::ID IntrinsicID;
- if (AddSub->getOpcode() == Instruction::Add)
- IntrinsicID = Intrinsic::sadd_sat;
- else if (AddSub->getOpcode() == Instruction::Sub)
- IntrinsicID = Intrinsic::ssub_sat;
- else
- return nullptr;
-
- // Finally create and return the sat intrinsic, truncated to the new type
- Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
- Value *AT = Builder.CreateSExt(A, NewTy);
- Value *BT = Builder.CreateSExt(B, NewTy);
- Value *Sat = Builder.CreateCall(F, {AT, BT});
- return CastInst::Create(Instruction::SExt, Sat, Ty);
-}
-
-/// Reduce a sequence of min/max with a common operand.
-static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
- Value *RHS,
- InstCombiner::BuilderTy &Builder) {
- assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
- // TODO: Allow FP min/max with nnan/nsz.
- if (!LHS->getType()->isIntOrIntVectorTy())
- return nullptr;
-
- // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
- Value *A, *B, *C, *D;
- SelectPatternResult L = matchSelectPattern(LHS, A, B);
- SelectPatternResult R = matchSelectPattern(RHS, C, D);
- if (SPF != L.Flavor || L.Flavor != R.Flavor)
- return nullptr;
-
- // Look for a common operand. The use checks are different than usual because
- // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
- // the select.
- Value *MinMaxOp = nullptr;
- Value *ThirdOp = nullptr;
- if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
- // If the LHS is only used in this chain and the RHS is used outside of it,
- // reuse the RHS min/max because that will eliminate the LHS.
- if (D == A || C == A) {
- // min(min(a, b), min(c, a)) --> min(min(c, a), b)
- // min(min(a, b), min(a, d)) --> min(min(a, d), b)
- MinMaxOp = RHS;
- ThirdOp = B;
- } else if (D == B || C == B) {
- // min(min(a, b), min(c, b)) --> min(min(c, b), a)
- // min(min(a, b), min(b, d)) --> min(min(b, d), a)
- MinMaxOp = RHS;
- ThirdOp = A;
- }
- } else if (!RHS->hasNUsesOrMore(3)) {
- // Reuse the LHS. This will eliminate the RHS.
- if (D == A || D == B) {
- // min(min(a, b), min(c, a)) --> min(min(a, b), c)
- // min(min(a, b), min(c, b)) --> min(min(a, b), c)
- MinMaxOp = LHS;
- ThirdOp = C;
- } else if (C == A || C == B) {
- // min(min(a, b), min(b, d)) --> min(min(a, b), d)
-      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
- MinMaxOp = LHS;
- ThirdOp = D;
- }
- }
- if (!MinMaxOp || !ThirdOp)
- return nullptr;
-
- CmpInst::Predicate P = getMinMaxPred(SPF);
- Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
- return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
-}
-
+ Type *Ty = MinMax1.getType();
+
+ // We are looking for a tree of:
+ // max(INT_MIN, min(INT_MAX, add(sext(A), sext(B))))
+ // Where the min and max could be reversed
+ Instruction *MinMax2;
+ BinaryOperator *AddSub;
+ const APInt *MinValue, *MaxValue;
+ if (match(&MinMax1, m_SMin(m_Instruction(MinMax2), m_APInt(MaxValue)))) {
+ if (!match(MinMax2, m_SMax(m_BinOp(AddSub), m_APInt(MinValue))))
+ return nullptr;
+ } else if (match(&MinMax1,
+ m_SMax(m_Instruction(MinMax2), m_APInt(MinValue)))) {
+ if (!match(MinMax2, m_SMin(m_BinOp(AddSub), m_APInt(MaxValue))))
+ return nullptr;
+ } else
+ return nullptr;
+
+ // Check that the constants clamp a saturate, and that the new type would be
+ // sensible to convert to.
+ if (!(*MaxValue + 1).isPowerOf2() || -*MinValue != *MaxValue + 1)
+ return nullptr;
+  // In what bitwidth can this be treated as saturating arithmetic?
+ unsigned NewBitWidth = (*MaxValue + 1).logBase2() + 1;
+ // FIXME: This isn't quite right for vectors, but using the scalar type is a
+ // good first approximation for what should be done there.
+ if (!shouldChangeType(Ty->getScalarType()->getIntegerBitWidth(), NewBitWidth))
+ return nullptr;
+
+  // Also make sure that the number of uses is as expected. The "3"s are for
+  // the two items of min/max (the compare and the select).
+ if (MinMax2->hasNUsesOrMore(3) || AddSub->hasNUsesOrMore(3))
+ return nullptr;
+
+ // Create the new type (which can be a vector type)
+ Type *NewTy = Ty->getWithNewBitWidth(NewBitWidth);
+ // Match the two extends from the add/sub
+ Value *A, *B;
+ if(!match(AddSub, m_BinOp(m_SExt(m_Value(A)), m_SExt(m_Value(B)))))
+ return nullptr;
+ // And check the incoming values are of a type smaller than or equal to the
+ // size of the saturation. Otherwise the higher bits can cause different
+ // results.
+ if (A->getType()->getScalarSizeInBits() > NewBitWidth ||
+ B->getType()->getScalarSizeInBits() > NewBitWidth)
+ return nullptr;
+
+ Intrinsic::ID IntrinsicID;
+ if (AddSub->getOpcode() == Instruction::Add)
+ IntrinsicID = Intrinsic::sadd_sat;
+ else if (AddSub->getOpcode() == Instruction::Sub)
+ IntrinsicID = Intrinsic::ssub_sat;
+ else
+ return nullptr;
+
+ // Finally create and return the sat intrinsic, truncated to the new type
+ Function *F = Intrinsic::getDeclaration(MinMax1.getModule(), IntrinsicID, NewTy);
+ Value *AT = Builder.CreateSExt(A, NewTy);
+ Value *BT = Builder.CreateSExt(B, NewTy);
+ Value *Sat = Builder.CreateCall(F, {AT, BT});
+ return CastInst::Create(Instruction::SExt, Sat, Ty);
+}
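+
+// A standalone sketch (illustration only) of the clamp-to-saturate equivalence
+// matched above, scaled down to 8 bits: clamping the widened sum to [-128, 127]
+// is an 8-bit saturating add. The helpers and the harness are assumptions added
+// for the example.
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
+static int8_t clampForm(int8_t A, int8_t B) {
+  int Wide = int(A) + int(B);                         // add(sext(A), sext(B))
+  return int8_t(std::max(-128, std::min(127, Wide))); // max(INT_MIN, min(INT_MAX, ...))
+}
+
+static int8_t satForm(int8_t A, int8_t B) {
+  int Wide = int(A) + int(B);
+  return Wide > 127 ? int8_t(127) : Wide < -128 ? int8_t(-128) : int8_t(Wide);
+}
+
+int main() {
+  for (int A = -128; A <= 127; ++A)
+    for (int B = -128; B <= 127; ++B)
+      assert(clampForm(int8_t(A), int8_t(B)) == satForm(int8_t(A), int8_t(B)));
+  return 0;
+}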
+
+/// Reduce a sequence of min/max with a common operand.
+static Instruction *factorizeMinMaxTree(SelectPatternFlavor SPF, Value *LHS,
+ Value *RHS,
+ InstCombiner::BuilderTy &Builder) {
+ assert(SelectPatternResult::isMinOrMax(SPF) && "Expected a min/max");
+ // TODO: Allow FP min/max with nnan/nsz.
+ if (!LHS->getType()->isIntOrIntVectorTy())
+ return nullptr;
+
+ // Match 3 of the same min/max ops. Example: umin(umin(), umin()).
+ Value *A, *B, *C, *D;
+ SelectPatternResult L = matchSelectPattern(LHS, A, B);
+ SelectPatternResult R = matchSelectPattern(RHS, C, D);
+ if (SPF != L.Flavor || L.Flavor != R.Flavor)
+ return nullptr;
+
+ // Look for a common operand. The use checks are different than usual because
+ // a min/max pattern typically has 2 uses of each op: 1 by the cmp and 1 by
+ // the select.
+ Value *MinMaxOp = nullptr;
+ Value *ThirdOp = nullptr;
+ if (!LHS->hasNUsesOrMore(3) && RHS->hasNUsesOrMore(3)) {
+ // If the LHS is only used in this chain and the RHS is used outside of it,
+ // reuse the RHS min/max because that will eliminate the LHS.
+ if (D == A || C == A) {
+ // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+ // min(min(a, b), min(a, d)) --> min(min(a, d), b)
+ MinMaxOp = RHS;
+ ThirdOp = B;
+ } else if (D == B || C == B) {
+ // min(min(a, b), min(c, b)) --> min(min(c, b), a)
+ // min(min(a, b), min(b, d)) --> min(min(b, d), a)
+ MinMaxOp = RHS;
+ ThirdOp = A;
+ }
+ } else if (!RHS->hasNUsesOrMore(3)) {
+ // Reuse the LHS. This will eliminate the RHS.
+ if (D == A || D == B) {
+ // min(min(a, b), min(c, a)) --> min(min(a, b), c)
+ // min(min(a, b), min(c, b)) --> min(min(a, b), c)
+ MinMaxOp = LHS;
+ ThirdOp = C;
+ } else if (C == A || C == B) {
+ // min(min(a, b), min(b, d)) --> min(min(a, b), d)
+      // min(min(a, b), min(a, d)) --> min(min(a, b), d)
+ MinMaxOp = LHS;
+ ThirdOp = D;
+ }
+ }
+ if (!MinMaxOp || !ThirdOp)
+ return nullptr;
+
+ CmpInst::Predicate P = getMinMaxPred(SPF);
+ Value *CmpABC = Builder.CreateICmp(P, MinMaxOp, ThirdOp);
+ return SelectInst::Create(CmpABC, MinMaxOp, ThirdOp);
+}
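+
+// A standalone sketch (illustration only): because min is commutative and
+// associative, a min tree with a repeated operand can be re-rooted around the
+// reused min, which is all the factorization above does. The harness is an
+// assumption added for the example.
+#include <algorithm>
+#include <cassert>
+
+int main() {
+  for (int A = 0; A < 8; ++A)
+    for (int B = 0; B < 8; ++B)
+      for (int C = 0; C < 8; ++C) {
+        int Before = std::min(std::min(A, B), std::min(C, A));
+        int After = std::min(std::min(C, A), B); // min(min(a, b), min(c, a)) --> min(min(c, a), b)
+        assert(Before == After);
+      }
+  return 0;
+}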
+
/// Try to reduce a funnel/rotate pattern that includes a compare and select
/// into a funnel shift intrinsic. Example:
-/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
-/// --> call llvm.fshl.i32(a, a, b)
+/// rotl32(a, b) --> (b == 0 ? a : ((a >> (32 - b)) | (a << b)))
+/// --> call llvm.fshl.i32(a, a, b)
/// fshl32(a, b, c) --> (c == 0 ? a : ((b >> (32 - c)) | (a << c)))
/// --> call llvm.fshl.i32(a, b, c)
/// fshr32(a, b, c) --> (c == 0 ? b : ((a << (32 - c)) | (b >> c)))
@@ -2291,20 +2291,20 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
// This must be a power-of-2 type for a bitmasking transform to be valid.
unsigned Width = Sel.getType()->getScalarSizeInBits();
if (!isPowerOf2_32(Width))
- return nullptr;
-
+ return nullptr;
+
BinaryOperator *Or0, *Or1;
if (!match(Sel.getFalseValue(), m_OneUse(m_Or(m_BinOp(Or0), m_BinOp(Or1)))))
- return nullptr;
-
+ return nullptr;
+
Value *SV0, *SV1, *SA0, *SA1;
if (!match(Or0, m_OneUse(m_LogicalShift(m_Value(SV0),
m_ZExtOrSelf(m_Value(SA0))))) ||
!match(Or1, m_OneUse(m_LogicalShift(m_Value(SV1),
m_ZExtOrSelf(m_Value(SA1))))) ||
Or0->getOpcode() == Or1->getOpcode())
- return nullptr;
-
+ return nullptr;
+
// Canonicalize to or(shl(SV0, SA0), lshr(SV1, SA1)).
if (Or0->getOpcode() == BinaryOperator::LShr) {
std::swap(Or0, Or1);
@@ -2314,16 +2314,16 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
assert(Or0->getOpcode() == BinaryOperator::Shl &&
Or1->getOpcode() == BinaryOperator::LShr &&
"Illegal or(shift,shift) pair");
-
- // Check the shift amounts to see if they are an opposite pair.
- Value *ShAmt;
- if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
- ShAmt = SA0;
- else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
- ShAmt = SA1;
- else
- return nullptr;
-
+
+ // Check the shift amounts to see if they are an opposite pair.
+ Value *ShAmt;
+ if (match(SA1, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA0)))))
+ ShAmt = SA0;
+ else if (match(SA0, m_OneUse(m_Sub(m_SpecificInt(Width), m_Specific(SA1)))))
+ ShAmt = SA1;
+ else
+ return nullptr;
+
// We should now have this pattern:
// select ?, TVal, (or (shl SV0, SA0), (lshr SV1, SA1))
// The false value of the select must be a funnel-shift of the true value:
@@ -2333,13 +2333,13 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
if ((IsFshl && TVal != SV0) || (!IsFshl && TVal != SV1))
return nullptr;
- // Finally, see if the select is filtering out a shift-by-zero.
- Value *Cond = Sel.getCondition();
- ICmpInst::Predicate Pred;
- if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
- Pred != ICmpInst::ICMP_EQ)
- return nullptr;
-
+ // Finally, see if the select is filtering out a shift-by-zero.
+ Value *Cond = Sel.getCondition();
+ ICmpInst::Predicate Pred;
+ if (!match(Cond, m_OneUse(m_ICmp(Pred, m_Specific(ShAmt), m_ZeroInt()))) ||
+ Pred != ICmpInst::ICMP_EQ)
+ return nullptr;
+
// If this is not a rotate then the select was blocking poison from the
// 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
if (SV0 != SV1) {
@@ -2350,186 +2350,186 @@ static Instruction *foldSelectFunnelShift(SelectInst &Sel,
}
// This is a funnel/rotate that avoids shift-by-bitwidth UB in a suboptimal way.
- // Convert to funnel shift intrinsic.
- Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
- Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType());
+ // Convert to funnel shift intrinsic.
+ Intrinsic::ID IID = IsFshl ? Intrinsic::fshl : Intrinsic::fshr;
+ Function *F = Intrinsic::getDeclaration(Sel.getModule(), IID, Sel.getType());
ShAmt = Builder.CreateZExt(ShAmt, Sel.getType());
return IntrinsicInst::Create(F, { SV0, SV1, ShAmt });
-}
-
-static Instruction *foldSelectToCopysign(SelectInst &Sel,
- InstCombiner::BuilderTy &Builder) {
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
- Type *SelType = Sel.getType();
-
- // Match select ?, TC, FC where the constants are equal but negated.
- // TODO: Generalize to handle a negated variable operand?
- const APFloat *TC, *FC;
- if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) ||
- !abs(*TC).bitwiseIsEqual(abs(*FC)))
- return nullptr;
-
- assert(TC != FC && "Expected equal select arms to simplify");
-
- Value *X;
- const APInt *C;
- bool IsTrueIfSignSet;
- ICmpInst::Predicate Pred;
- if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
+}
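+
+// A standalone sketch (illustration only) of the rotate case handled above: the
+// guarded form needs the b == 0 select because (a >> (32 - b)) would shift by
+// the bit width, while a rotate by b mod 32 is defined for every b. The helpers
+// and the harness are assumptions added for the example.
+#include <cassert>
+#include <cstdint>
+
+static uint32_t guardedRotl(uint32_t A, uint32_t B) {
+  return B == 0 ? A : ((A >> (32 - B)) | (A << B)); // select filters B == 0
+}
+
+static uint32_t rotlModWidth(uint32_t A, uint32_t B) {
+  // Models fshl(A, A, B): rotate left by B mod 32, well defined for B == 0.
+  return (A << (B & 31)) | (A >> ((32 - B) & 31));
+}
+
+int main() {
+  for (uint32_t B = 0; B < 32; ++B)
+    assert(guardedRotl(0x12345678u, B) == rotlModWidth(0x12345678u, B));
+  return 0;
+}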
+
+static Instruction *foldSelectToCopysign(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Type *SelType = Sel.getType();
+
+ // Match select ?, TC, FC where the constants are equal but negated.
+ // TODO: Generalize to handle a negated variable operand?
+ const APFloat *TC, *FC;
+ if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) ||
+ !abs(*TC).bitwiseIsEqual(abs(*FC)))
+ return nullptr;
+
+ assert(TC != FC && "Expected equal select arms to simplify");
+
+ Value *X;
+ const APInt *C;
+ bool IsTrueIfSignSet;
+ ICmpInst::Predicate Pred;
+ if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
!InstCombiner::isSignBitCheck(Pred, *C, IsTrueIfSignSet) ||
X->getType() != SelType)
- return nullptr;
-
- // If needed, negate the value that will be the sign argument of the copysign:
- // (bitcast X) < 0 ? -TC : TC --> copysign(TC, X)
- // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X)
- // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X)
- // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X)
- if (IsTrueIfSignSet ^ TC->isNegative())
- X = Builder.CreateFNegFMF(X, &Sel);
-
- // Canonicalize the magnitude argument as the positive constant since we do
- // not care about its sign.
- Value *MagArg = TC->isNegative() ? FVal : TVal;
- Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign,
- Sel.getType());
- Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X });
- CopySign->setFastMathFlags(Sel.getFastMathFlags());
- return CopySign;
-}
-
+ return nullptr;
+
+ // If needed, negate the value that will be the sign argument of the copysign:
+ // (bitcast X) < 0 ? -TC : TC --> copysign(TC, X)
+ // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X)
+ if (IsTrueIfSignSet ^ TC->isNegative())
+ X = Builder.CreateFNegFMF(X, &Sel);
+
+ // Canonicalize the magnitude argument as the positive constant since we do
+ // not care about its sign.
+ Value *MagArg = TC->isNegative() ? FVal : TVal;
+ Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign,
+ Sel.getType());
+ Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X });
+ CopySign->setFastMathFlags(Sel.getFastMathFlags());
+ return CopySign;
+}
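+
+// A standalone sketch (illustration only) of the fold above using <cmath>:
+// choosing between +C and -C on the sign bit of X is exactly copysign(C, X).
+// The constant 2.5f and the harness are assumptions added for the example.
+#include <cassert>
+#include <cmath>
+
+static float signSelect(float X) {
+  return std::signbit(X) ? -2.5f : 2.5f; // (bitcast X) < 0 ? -TC : TC
+}
+
+static float copysignForm(float X) { return std::copysign(2.5f, X); }
+
+int main() {
+  for (float X : {-3.0f, -0.0f, 0.0f, 1.5f})
+    assert(signSelect(X) == copysignForm(X));
+  return 0;
+}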
+
Instruction *InstCombinerImpl::foldVectorSelect(SelectInst &Sel) {
- auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
- if (!VecTy)
- return nullptr;
-
- unsigned NumElts = VecTy->getNumElements();
- APInt UndefElts(NumElts, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
- if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
- if (V != &Sel)
- return replaceInstUsesWith(Sel, V);
- return &Sel;
- }
-
- // A select of a "select shuffle" with a common operand can be rearranged
- // to select followed by "select shuffle". Because of poison, this only works
- // in the case of a shuffle with no undefined mask elements.
- Value *Cond = Sel.getCondition();
- Value *TVal = Sel.getTrueValue();
- Value *FVal = Sel.getFalseValue();
- Value *X, *Y;
- ArrayRef<int> Mask;
- if (match(TVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
- !is_contained(Mask, UndefMaskElem) &&
- cast<ShuffleVectorInst>(TVal)->isSelect()) {
- if (X == FVal) {
- // select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X)
- Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
- return new ShuffleVectorInst(X, NewSel, Mask);
- }
- if (Y == FVal) {
- // select Cond, (shuf_sel X, Y), Y --> shuf_sel (select Cond, X, Y), Y
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
- return new ShuffleVectorInst(NewSel, Y, Mask);
- }
- }
- if (match(FVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
- !is_contained(Mask, UndefMaskElem) &&
- cast<ShuffleVectorInst>(FVal)->isSelect()) {
- if (X == TVal) {
- // select Cond, X, (shuf_sel X, Y) --> shuf_sel X, (select Cond, X, Y)
- Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
- return new ShuffleVectorInst(X, NewSel, Mask);
- }
- if (Y == TVal) {
- // select Cond, Y, (shuf_sel X, Y) --> shuf_sel (select Cond, Y, X), Y
- Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
- return new ShuffleVectorInst(NewSel, Y, Mask);
- }
- }
-
- return nullptr;
-}
-
-static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB,
- const DominatorTree &DT,
- InstCombiner::BuilderTy &Builder) {
- // Find the block's immediate dominator that ends with a conditional branch
- // that matches select's condition (maybe inverted).
- auto *IDomNode = DT[BB]->getIDom();
- if (!IDomNode)
- return nullptr;
- BasicBlock *IDom = IDomNode->getBlock();
-
- Value *Cond = Sel.getCondition();
- Value *IfTrue, *IfFalse;
- BasicBlock *TrueSucc, *FalseSucc;
- if (match(IDom->getTerminator(),
- m_Br(m_Specific(Cond), m_BasicBlock(TrueSucc),
- m_BasicBlock(FalseSucc)))) {
- IfTrue = Sel.getTrueValue();
- IfFalse = Sel.getFalseValue();
- } else if (match(IDom->getTerminator(),
- m_Br(m_Not(m_Specific(Cond)), m_BasicBlock(TrueSucc),
- m_BasicBlock(FalseSucc)))) {
- IfTrue = Sel.getFalseValue();
- IfFalse = Sel.getTrueValue();
- } else
- return nullptr;
-
- // Make sure the branches are actually different.
- if (TrueSucc == FalseSucc)
- return nullptr;
-
- // We want to replace select %cond, %a, %b with a phi that takes value %a
- // for all incoming edges that are dominated by condition `%cond == true`,
- // and value %b for edges dominated by condition `%cond == false`. If %a
- // or %b are also phis from the same basic block, we can go further and take
- // their incoming values from the corresponding blocks.
- BasicBlockEdge TrueEdge(IDom, TrueSucc);
- BasicBlockEdge FalseEdge(IDom, FalseSucc);
- DenseMap<BasicBlock *, Value *> Inputs;
- for (auto *Pred : predecessors(BB)) {
- // Check implication.
- BasicBlockEdge Incoming(Pred, BB);
- if (DT.dominates(TrueEdge, Incoming))
- Inputs[Pred] = IfTrue->DoPHITranslation(BB, Pred);
- else if (DT.dominates(FalseEdge, Incoming))
- Inputs[Pred] = IfFalse->DoPHITranslation(BB, Pred);
- else
- return nullptr;
- // Check availability.
- if (auto *Insn = dyn_cast<Instruction>(Inputs[Pred]))
- if (!DT.dominates(Insn, Pred->getTerminator()))
- return nullptr;
- }
-
- Builder.SetInsertPoint(&*BB->begin());
- auto *PN = Builder.CreatePHI(Sel.getType(), Inputs.size());
- for (auto *Pred : predecessors(BB))
- PN->addIncoming(Inputs[Pred], Pred);
- PN->takeName(&Sel);
- return PN;
-}
-
-static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
- InstCombiner::BuilderTy &Builder) {
- // Try to replace this select with Phi in one of these blocks.
- SmallSetVector<BasicBlock *, 4> CandidateBlocks;
- CandidateBlocks.insert(Sel.getParent());
- for (Value *V : Sel.operands())
- if (auto *I = dyn_cast<Instruction>(V))
- CandidateBlocks.insert(I->getParent());
-
- for (BasicBlock *BB : CandidateBlocks)
- if (auto *PN = foldSelectToPhiImpl(Sel, BB, DT, Builder))
- return PN;
- return nullptr;
-}
-
+ auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
+ if (!VecTy)
+ return nullptr;
+
+ unsigned NumElts = VecTy->getNumElements();
+ APInt UndefElts(NumElts, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
+ if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
+ if (V != &Sel)
+ return replaceInstUsesWith(Sel, V);
+ return &Sel;
+ }
+
+ // A select of a "select shuffle" with a common operand can be rearranged
+ // to select followed by "select shuffle". Because of poison, this only works
+ // in the case of a shuffle with no undefined mask elements.
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Value *X, *Y;
+ ArrayRef<int> Mask;
+ if (match(TVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(TVal)->isSelect()) {
+ if (X == FVal) {
+ // select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X)
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == FVal) {
+ // select Cond, (shuf_sel X, Y), Y --> shuf_sel (select Cond, X, Y), Y
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+ if (match(FVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(FVal)->isSelect()) {
+ if (X == TVal) {
+ // select Cond, X, (shuf_sel X, Y) --> shuf_sel X, (select Cond, X, Y)
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == TVal) {
+ // select Cond, Y, (shuf_sel X, Y) --> shuf_sel (select Cond, Y, X), Y
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
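Not part of the LLVM source: a minimal standalone C++ sketch, modeling 4-lane vectors as arrays, of the rearrangement select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X) performed in foldVectorSelect, where "shuf_sel" picks each lane from X or Y according to a fixed mask.

#include <array>
#include <cassert>

using Vec4 = std::array<int, 4>;
using Lanes = std::array<bool, 4>;

// A "select shuffle": lane I comes from X if M[I] is true, else from Y.
static Vec4 shufSel(const Vec4 &X, const Vec4 &Y, const Lanes &M) {
  Vec4 R;
  for (int I = 0; I != 4; ++I)
    R[I] = M[I] ? X[I] : Y[I];
  return R;
}

// A per-lane select on a vector condition.
static Vec4 sel(const Lanes &C, const Vec4 &A, const Vec4 &B) {
  Vec4 R;
  for (int I = 0; I != 4; ++I)
    R[I] = C[I] ? A[I] : B[I];
  return R;
}

int main() {
  Vec4 X = {1, 2, 3, 4}, Y = {10, 20, 30, 40};
  Lanes ShufMask = {true, false, true, false};
  Lanes Cond = {false, true, true, false};

  Vec4 Before = sel(Cond, shufSel(X, Y, ShufMask), X);
  Vec4 After = shufSel(X, sel(Cond, Y, X), ShufMask);
  assert(Before == After); // identical lane by lane
  return 0;
}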
+static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB,
+ const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Find the block's immediate dominator that ends with a conditional branch
+ // that matches select's condition (maybe inverted).
+ auto *IDomNode = DT[BB]->getIDom();
+ if (!IDomNode)
+ return nullptr;
+ BasicBlock *IDom = IDomNode->getBlock();
+
+ Value *Cond = Sel.getCondition();
+ Value *IfTrue, *IfFalse;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(IDom->getTerminator(),
+ m_Br(m_Specific(Cond), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getTrueValue();
+ IfFalse = Sel.getFalseValue();
+ } else if (match(IDom->getTerminator(),
+ m_Br(m_Not(m_Specific(Cond)), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getFalseValue();
+ IfFalse = Sel.getTrueValue();
+ } else
+ return nullptr;
+
+ // Make sure the branches are actually different.
+ if (TrueSucc == FalseSucc)
+ return nullptr;
+
+ // We want to replace select %cond, %a, %b with a phi that takes value %a
+ // for all incoming edges that are dominated by condition `%cond == true`,
+ // and value %b for edges dominated by condition `%cond == false`. If %a
+ // or %b are also phis from the same basic block, we can go further and take
+ // their incoming values from the corresponding blocks.
+ BasicBlockEdge TrueEdge(IDom, TrueSucc);
+ BasicBlockEdge FalseEdge(IDom, FalseSucc);
+ DenseMap<BasicBlock *, Value *> Inputs;
+ for (auto *Pred : predecessors(BB)) {
+ // Check implication.
+ BasicBlockEdge Incoming(Pred, BB);
+ if (DT.dominates(TrueEdge, Incoming))
+ Inputs[Pred] = IfTrue->DoPHITranslation(BB, Pred);
+ else if (DT.dominates(FalseEdge, Incoming))
+ Inputs[Pred] = IfFalse->DoPHITranslation(BB, Pred);
+ else
+ return nullptr;
+ // Check availability.
+ if (auto *Insn = dyn_cast<Instruction>(Inputs[Pred]))
+ if (!DT.dominates(Insn, Pred->getTerminator()))
+ return nullptr;
+ }
+
+ Builder.SetInsertPoint(&*BB->begin());
+ auto *PN = Builder.CreatePHI(Sel.getType(), Inputs.size());
+ for (auto *Pred : predecessors(BB))
+ PN->addIncoming(Inputs[Pred], Pred);
+ PN->takeName(&Sel);
+ return PN;
+}
+
+static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Try to replace this select with Phi in one of these blocks.
+ SmallSetVector<BasicBlock *, 4> CandidateBlocks;
+ CandidateBlocks.insert(Sel.getParent());
+ for (Value *V : Sel.operands())
+ if (auto *I = dyn_cast<Instruction>(V))
+ CandidateBlocks.insert(I->getParent());
+
+ for (BasicBlock *BB : CandidateBlocks)
+ if (auto *PN = foldSelectToPhiImpl(Sel, BB, DT, Builder))
+ return PN;
+ return nullptr;
+}
+
static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy &Builder) {
FreezeInst *FI = dyn_cast<FreezeInst>(Sel.getCondition());
if (!FI)
@@ -2557,46 +2557,46 @@ static Value *foldSelectWithFrozenICmp(SelectInst &Sel, InstCombiner::BuilderTy
}
Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
- Value *CondVal = SI.getCondition();
- Value *TrueVal = SI.getTrueValue();
- Value *FalseVal = SI.getFalseValue();
- Type *SelType = SI.getType();
-
- // FIXME: Remove this workaround when freeze related patches are done.
- // For select with undef operand which feeds into an equality comparison,
- // don't simplify it so loop unswitch can know the equality comparison
- // may have an undef operand. This is a workaround for PR31652 caused by
-  // discrepancy about branch on undef between LoopUnswitch and GVN.
- if (isa<UndefValue>(TrueVal) || isa<UndefValue>(FalseVal)) {
- if (llvm::any_of(SI.users(), [&](User *U) {
- ICmpInst *CI = dyn_cast<ICmpInst>(U);
- if (CI && CI->isEquality())
- return true;
- return false;
- })) {
- return nullptr;
- }
- }
-
- if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
- SQ.getWithInstruction(&SI)))
- return replaceInstUsesWith(SI, V);
-
- if (Instruction *I = canonicalizeSelectToShuffle(SI))
- return I;
-
- if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
- return I;
-
- CmpInst::Predicate Pred;
-
- if (SelType->isIntOrIntVectorTy(1) &&
- TrueVal->getType() == CondVal->getType()) {
+ Value *CondVal = SI.getCondition();
+ Value *TrueVal = SI.getTrueValue();
+ Value *FalseVal = SI.getFalseValue();
+ Type *SelType = SI.getType();
+
+ // FIXME: Remove this workaround when freeze related patches are done.
+ // For select with undef operand which feeds into an equality comparison,
+ // don't simplify it so loop unswitch can know the equality comparison
+ // may have an undef operand. This is a workaround for PR31652 caused by
+  // discrepancy about branch on undef between LoopUnswitch and GVN.
+ if (isa<UndefValue>(TrueVal) || isa<UndefValue>(FalseVal)) {
+ if (llvm::any_of(SI.users(), [&](User *U) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(U);
+ if (CI && CI->isEquality())
+ return true;
+ return false;
+ })) {
+ return nullptr;
+ }
+ }
+
+ if (Value *V = SimplifySelectInst(CondVal, TrueVal, FalseVal,
+ SQ.getWithInstruction(&SI)))
+ return replaceInstUsesWith(SI, V);
+
+ if (Instruction *I = canonicalizeSelectToShuffle(SI))
+ return I;
+
+ if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
+ return I;
+
+ CmpInst::Predicate Pred;
+
+ if (SelType->isIntOrIntVectorTy(1) &&
+ TrueVal->getType() == CondVal->getType()) {
if (match(TrueVal, m_One()) &&
(EnableUnsafeSelectTransform || impliesPoison(FalseVal, CondVal))) {
- // Change: A = select B, true, C --> A = or B, C
- return BinaryOperator::CreateOr(CondVal, FalseVal);
- }
+ // Change: A = select B, true, C --> A = or B, C
+ return BinaryOperator::CreateOr(CondVal, FalseVal);
+ }
if (match(FalseVal, m_Zero()) &&
(EnableUnsafeSelectTransform || impliesPoison(TrueVal, CondVal))) {
// Change: A = select B, C, false --> A = and B, C
@@ -2604,422 +2604,422 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
}
// select a, false, b -> select !a, b, false
- if (match(TrueVal, m_Zero())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ if (match(TrueVal, m_Zero())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return SelectInst::Create(NotCond, FalseVal,
ConstantInt::getFalse(SelType));
- }
+ }
// select a, b, true -> select !a, true, b
- if (match(FalseVal, m_One())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ if (match(FalseVal, m_One())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
return SelectInst::Create(NotCond, ConstantInt::getTrue(SelType),
TrueVal);
- }
-
+ }
+
// select a, a, b -> select a, true, b
- if (CondVal == TrueVal)
+ if (CondVal == TrueVal)
return replaceOperand(SI, 1, ConstantInt::getTrue(SelType));
// select a, b, a -> select a, b, false
- if (CondVal == FalseVal)
+ if (CondVal == FalseVal)
return replaceOperand(SI, 2, ConstantInt::getFalse(SelType));
-
+
// select a, !a, b -> select !a, b, false
- if (match(TrueVal, m_Not(m_Specific(CondVal))))
+ if (match(TrueVal, m_Not(m_Specific(CondVal))))
return SelectInst::Create(TrueVal, FalseVal,
ConstantInt::getFalse(SelType));
// select a, b, !a -> select !a, true, b
- if (match(FalseVal, m_Not(m_Specific(CondVal))))
+ if (match(FalseVal, m_Not(m_Specific(CondVal))))
return SelectInst::Create(FalseVal, ConstantInt::getTrue(SelType),
TrueVal);
- }
-
- // Selecting between two integer or vector splat integer constants?
- //
- // Note that we don't handle a scalar select of vectors:
- // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
- // because that may need 3 instructions to splat the condition value:
- // extend, insertelement, shufflevector.
+ }
+
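Not part of the LLVM source: a minimal standalone C++ sketch of the i1 select identities used above, with bools standing in for i1 values (the poison caveat checked via impliesPoison has no C++ analogue here).

#include <cassert>

int main() {
  for (bool B : {false, true})
    for (bool C : {false, true}) {
      assert((B ? true : C) == (B || C));   // select B, true, C --> or B, C
      assert((B ? C : false) == (B && C));  // select B, C, false --> and B, C
      assert((B ? false : C) == (!B && C)); // select a, false, b --> select !a, b, false
      assert((B ? C : true) == (!B || C));  // select a, b, true --> select !a, true, b
    }
  return 0;
}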
+ // Selecting between two integer or vector splat integer constants?
+ //
+ // Note that we don't handle a scalar select of vectors:
+ // select i1 %c, <2 x i8> <1, 1>, <2 x i8> <0, 0>
+ // because that may need 3 instructions to splat the condition value:
+ // extend, insertelement, shufflevector.
//
  // Do not handle i1 TrueVal and FalseVal; otherwise this would result in
// zext/sext i1 to i1.
if (SelType->isIntOrIntVectorTy() && !SelType->isIntOrIntVectorTy(1) &&
- CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
- // select C, 1, 0 -> zext C to int
- if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
- return new ZExtInst(CondVal, SelType);
-
- // select C, -1, 0 -> sext C to int
- if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
- return new SExtInst(CondVal, SelType);
-
- // select C, 0, 1 -> zext !C to int
- if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return new ZExtInst(NotCond, SelType);
- }
-
- // select C, 0, -1 -> sext !C to int
- if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
- Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
- return new SExtInst(NotCond, SelType);
- }
- }
-
- // See if we are selecting two values based on a comparison of the two values.
- if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
- Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1);
- if ((Cmp0 == TrueVal && Cmp1 == FalseVal) ||
- (Cmp0 == FalseVal && Cmp1 == TrueVal)) {
- // Canonicalize to use ordered comparisons by swapping the select
- // operands.
- //
- // e.g.
- // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
- if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
- FCmpInst::Predicate InvPred = FCI->getInversePredicate();
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- // FIXME: The FMF should propagate from the select, not the fcmp.
- Builder.setFastMathFlags(FCI->getFastMathFlags());
- Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
- FCI->getName() + ".inv");
- Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
- return replaceInstUsesWith(SI, NewSel);
- }
-
- // NOTE: if we wanted to, this is where to detect MIN/MAX
- }
- }
-
- // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
- // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
- // also require nnan because we do not want to unintentionally change the
- // sign of a NaN value.
- // FIXME: These folds should test/propagate FMF from the select, not the
- // fsub or fneg.
- // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
- Instruction *FSub;
- if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
- match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
- (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FSub);
- return replaceInstUsesWith(SI, Fabs);
- }
- // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
- if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
- match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
- (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FSub);
- return replaceInstUsesWith(SI, Fabs);
- }
- // With nnan and nsz:
- // (X < +/-0.0) ? -X : X --> fabs(X)
- // (X <= +/-0.0) ? -X : X --> fabs(X)
- Instruction *FNeg;
- if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
- match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
- match(TrueVal, m_Instruction(FNeg)) &&
- FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
- (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
- Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FNeg);
- return replaceInstUsesWith(SI, Fabs);
- }
- // With nnan and nsz:
- // (X > +/-0.0) ? X : -X --> fabs(X)
- // (X >= +/-0.0) ? X : -X --> fabs(X)
- if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
- match(FalseVal, m_FNeg(m_Specific(TrueVal))) &&
- match(FalseVal, m_Instruction(FNeg)) &&
- FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
- (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
- Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) {
- Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FNeg);
- return replaceInstUsesWith(SI, Fabs);
- }
-
- // See if we are selecting two values based on a comparison of the two values.
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
- if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
- return Result;
-
- if (Instruction *Add = foldAddSubSelect(SI, Builder))
- return Add;
- if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
- return Add;
- if (Instruction *Or = foldSetClearBits(SI, Builder))
- return Or;
-
- // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
- auto *TI = dyn_cast<Instruction>(TrueVal);
- auto *FI = dyn_cast<Instruction>(FalseVal);
- if (TI && FI && TI->getOpcode() == FI->getOpcode())
- if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
- return IV;
-
- if (Instruction *I = foldSelectExtConst(SI))
- return I;
-
- // See if we can fold the select into one of our operands.
- if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
- if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
- return FoldI;
-
- Value *LHS, *RHS;
- Instruction::CastOps CastOp;
- SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
- auto SPF = SPR.Flavor;
- if (SPF) {
- Value *LHS2, *RHS2;
- if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
- if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS), SPF2, LHS2,
- RHS2, SI, SPF, RHS))
- return R;
- if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
- if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS), SPF2, LHS2,
- RHS2, SI, SPF, LHS))
- return R;
- // TODO.
- // ABS(-X) -> ABS(X)
- }
-
- if (SelectPatternResult::isMinOrMax(SPF)) {
- // Canonicalize so that
- // - type casts are outside select patterns.
- // - float clamp is transformed to min/max pattern
-
- bool IsCastNeeded = LHS->getType() != SelType;
- Value *CmpLHS = cast<CmpInst>(CondVal)->getOperand(0);
- Value *CmpRHS = cast<CmpInst>(CondVal)->getOperand(1);
- if (IsCastNeeded ||
- (LHS->getType()->isFPOrFPVectorTy() &&
- ((CmpLHS != LHS && CmpLHS != RHS) ||
- (CmpRHS != LHS && CmpRHS != RHS)))) {
- CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered);
-
- Value *Cmp;
- if (CmpInst::isIntPredicate(MinMaxPred)) {
- Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS);
- } else {
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- auto FMF =
- cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
- Builder.setFastMathFlags(FMF);
- Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS);
- }
-
- Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI);
- if (!IsCastNeeded)
- return replaceInstUsesWith(SI, NewSI);
-
- Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
- return replaceInstUsesWith(SI, NewCast);
- }
-
- // MAX(~a, ~b) -> ~MIN(a, b)
- // MAX(~a, C) -> ~MIN(a, ~C)
- // MIN(~a, ~b) -> ~MAX(a, b)
- // MIN(~a, C) -> ~MAX(a, ~C)
- auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
- Value *A;
- if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) &&
- !isFreeToInvert(A, A->hasOneUse()) &&
- // Passing false to only consider m_Not and constants.
- isFreeToInvert(Y, false)) {
- Value *B = Builder.CreateNot(Y);
- Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF),
- A, B);
- // Copy the profile metadata.
- if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) {
- cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD);
- // Swap the metadata if the operands are swapped.
- if (X == SI.getFalseValue() && Y == SI.getTrueValue())
- cast<SelectInst>(NewMinMax)->swapProfMetadata();
- }
-
- return BinaryOperator::CreateNot(NewMinMax);
- }
-
- return nullptr;
- };
-
- if (Instruction *I = moveNotAfterMinMax(LHS, RHS))
- return I;
- if (Instruction *I = moveNotAfterMinMax(RHS, LHS))
- return I;
-
- if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder))
- return I;
-
- if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
- return I;
- if (Instruction *I = matchSAddSubSat(SI))
- return I;
- }
- }
-
- // Canonicalize select of FP values where NaN and -0.0 are not valid as
- // minnum/maxnum intrinsics.
- if (isa<FPMathOperator>(SI) && SI.hasNoNaNs() && SI.hasNoSignedZeros()) {
- Value *X, *Y;
- if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y))))
- return replaceInstUsesWith(
- SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));
-
- if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y))))
- return replaceInstUsesWith(
- SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
- }
-
- // See if we can fold the select into a phi node if the condition is a select.
- if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
- // The true/false values have to be live in the PHI predecessor's blocks.
- if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
- canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
- if (Instruction *NV = foldOpIntoPhi(SI, PN))
- return NV;
-
- if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
- if (TrueSI->getCondition()->getType() == CondVal->getType()) {
- // select(C, select(C, a, b), c) -> select(C, a, c)
- if (TrueSI->getCondition() == CondVal) {
- if (SI.getTrueValue() == TrueSI->getTrueValue())
- return nullptr;
- return replaceOperand(SI, 1, TrueSI->getTrueValue());
- }
- // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
+ CondVal->getType()->isVectorTy() == SelType->isVectorTy()) {
+ // select C, 1, 0 -> zext C to int
+ if (match(TrueVal, m_One()) && match(FalseVal, m_Zero()))
+ return new ZExtInst(CondVal, SelType);
+
+ // select C, -1, 0 -> sext C to int
+ if (match(TrueVal, m_AllOnes()) && match(FalseVal, m_Zero()))
+ return new SExtInst(CondVal, SelType);
+
+ // select C, 0, 1 -> zext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_One())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new ZExtInst(NotCond, SelType);
+ }
+
+ // select C, 0, -1 -> sext !C to int
+ if (match(TrueVal, m_Zero()) && match(FalseVal, m_AllOnes())) {
+ Value *NotCond = Builder.CreateNot(CondVal, "not." + CondVal->getName());
+ return new SExtInst(NotCond, SelType);
+ }
+ }
+
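Not part of the LLVM source: a minimal standalone C++ sketch of the constant-select folds above: select C, 1, 0 is a zero extension of the condition and select C, -1, 0 is a sign extension of it.

#include <cassert>
#include <cstdint>

int main() {
  for (bool C : {false, true}) {
    int32_t ZExt = static_cast<int32_t>(C);  // select C, 1, 0  -> zext C
    int32_t SExt = -static_cast<int32_t>(C); // select C, -1, 0 -> sext C
    assert((C ? 1 : 0) == ZExt);
    assert((C ? -1 : 0) == SExt);
    assert((C ? 0 : 1) == static_cast<int32_t>(!C)); // select C, 0, 1 -> zext !C
  }
  return 0;
}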
+ // See if we are selecting two values based on a comparison of the two values.
+ if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+ Value *Cmp0 = FCI->getOperand(0), *Cmp1 = FCI->getOperand(1);
+ if ((Cmp0 == TrueVal && Cmp1 == FalseVal) ||
+ (Cmp0 == FalseVal && Cmp1 == TrueVal)) {
+ // Canonicalize to use ordered comparisons by swapping the select
+ // operands.
+ //
+ // e.g.
+ // (X ugt Y) ? X : Y -> (X ole Y) ? Y : X
+ if (FCI->hasOneUse() && FCmpInst::isUnordered(FCI->getPredicate())) {
+ FCmpInst::Predicate InvPred = FCI->getInversePredicate();
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ // FIXME: The FMF should propagate from the select, not the fcmp.
+ Builder.setFastMathFlags(FCI->getFastMathFlags());
+ Value *NewCond = Builder.CreateFCmp(InvPred, Cmp0, Cmp1,
+ FCI->getName() + ".inv");
+ Value *NewSel = Builder.CreateSelect(NewCond, FalseVal, TrueVal);
+ return replaceInstUsesWith(SI, NewSel);
+ }
+
+ // NOTE: if we wanted to, this is where to detect MIN/MAX
+ }
+ }
+
+ // Canonicalize select with fcmp to fabs(). -0.0 makes this tricky. We need
+ // fast-math-flags (nsz) or fsub with +0.0 (not fneg) for this to work. We
+ // also require nnan because we do not want to unintentionally change the
+ // sign of a NaN value.
+ // FIXME: These folds should test/propagate FMF from the select, not the
+ // fsub or fneg.
+ // (X <= +/-0.0) ? (0.0 - X) : X --> fabs(X)
+ Instruction *FSub;
+ if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+ match(TrueVal, m_FSub(m_PosZeroFP(), m_Specific(FalseVal))) &&
+ match(TrueVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+ (Pred == FCmpInst::FCMP_OLE || Pred == FCmpInst::FCMP_ULE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FSub);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // (X > +/-0.0) ? X : (0.0 - X) --> fabs(X)
+ if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
+ match(FalseVal, m_FSub(m_PosZeroFP(), m_Specific(TrueVal))) &&
+ match(FalseVal, m_Instruction(FSub)) && FSub->hasNoNaNs() &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_UGT)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FSub);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nnan and nsz:
+ // (X < +/-0.0) ? -X : X --> fabs(X)
+ // (X <= +/-0.0) ? -X : X --> fabs(X)
+ Instruction *FNeg;
+ if (match(CondVal, m_FCmp(Pred, m_Specific(FalseVal), m_AnyZeroFP())) &&
+ match(TrueVal, m_FNeg(m_Specific(FalseVal))) &&
+ match(TrueVal, m_Instruction(FNeg)) &&
+ FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
+ (Pred == FCmpInst::FCMP_OLT || Pred == FCmpInst::FCMP_OLE ||
+ Pred == FCmpInst::FCMP_ULT || Pred == FCmpInst::FCMP_ULE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FalseVal, FNeg);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+ // With nnan and nsz:
+ // (X > +/-0.0) ? X : -X --> fabs(X)
+ // (X >= +/-0.0) ? X : -X --> fabs(X)
+ if (match(CondVal, m_FCmp(Pred, m_Specific(TrueVal), m_AnyZeroFP())) &&
+ match(FalseVal, m_FNeg(m_Specific(TrueVal))) &&
+ match(FalseVal, m_Instruction(FNeg)) &&
+ FNeg->hasNoNaNs() && FNeg->hasNoSignedZeros() &&
+ (Pred == FCmpInst::FCMP_OGT || Pred == FCmpInst::FCMP_OGE ||
+ Pred == FCmpInst::FCMP_UGT || Pred == FCmpInst::FCMP_UGE)) {
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, TrueVal, FNeg);
+ return replaceInstUsesWith(SI, Fabs);
+ }
+
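Not part of the LLVM source: a minimal standalone C++ sketch of the fabs canonicalization above. With no NaNs in play and ignoring the sign of zero (the nnan/nsz requirements), (X <= 0.0) ? -X : X computes fabs(X).

#include <cassert>
#include <cmath>

int main() {
  for (double X : {-3.5, -0.0, 0.0, 2.25}) {
    double SelectForm = (X <= 0.0) ? -X : X;
    // Equal as values; for X == -0.0 the results may differ only in the sign
    // of zero, which is exactly what the nsz flag allows the fold to ignore.
    assert(SelectForm == std::fabs(X));
  }
  return 0;
}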
+ // See if we are selecting two values based on a comparison of the two values.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal))
+ if (Instruction *Result = foldSelectInstWithICmp(SI, ICI))
+ return Result;
+
+ if (Instruction *Add = foldAddSubSelect(SI, Builder))
+ return Add;
+ if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
+ return Add;
+ if (Instruction *Or = foldSetClearBits(SI, Builder))
+ return Or;
+
+ // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+ auto *TI = dyn_cast<Instruction>(TrueVal);
+ auto *FI = dyn_cast<Instruction>(FalseVal);
+ if (TI && FI && TI->getOpcode() == FI->getOpcode())
+ if (Instruction *IV = foldSelectOpOp(SI, TI, FI))
+ return IV;
+
+ if (Instruction *I = foldSelectExtConst(SI))
+ return I;
+
+ // See if we can fold the select into one of our operands.
+ if (SelType->isIntOrIntVectorTy() || SelType->isFPOrFPVectorTy()) {
+ if (Instruction *FoldI = foldSelectIntoOp(SI, TrueVal, FalseVal))
+ return FoldI;
+
+ Value *LHS, *RHS;
+ Instruction::CastOps CastOp;
+ SelectPatternResult SPR = matchSelectPattern(&SI, LHS, RHS, &CastOp);
+ auto SPF = SPR.Flavor;
+ if (SPF) {
+ Value *LHS2, *RHS2;
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(LHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(LHS), SPF2, LHS2,
+ RHS2, SI, SPF, RHS))
+ return R;
+ if (SelectPatternFlavor SPF2 = matchSelectPattern(RHS, LHS2, RHS2).Flavor)
+ if (Instruction *R = foldSPFofSPF(cast<Instruction>(RHS), SPF2, LHS2,
+ RHS2, SI, SPF, LHS))
+ return R;
+ // TODO.
+ // ABS(-X) -> ABS(X)
+ }
+
+ if (SelectPatternResult::isMinOrMax(SPF)) {
+ // Canonicalize so that
+ // - type casts are outside select patterns.
+ // - float clamp is transformed to min/max pattern
+
+ bool IsCastNeeded = LHS->getType() != SelType;
+ Value *CmpLHS = cast<CmpInst>(CondVal)->getOperand(0);
+ Value *CmpRHS = cast<CmpInst>(CondVal)->getOperand(1);
+ if (IsCastNeeded ||
+ (LHS->getType()->isFPOrFPVectorTy() &&
+ ((CmpLHS != LHS && CmpLHS != RHS) ||
+ (CmpRHS != LHS && CmpRHS != RHS)))) {
+ CmpInst::Predicate MinMaxPred = getMinMaxPred(SPF, SPR.Ordered);
+
+ Value *Cmp;
+ if (CmpInst::isIntPredicate(MinMaxPred)) {
+ Cmp = Builder.CreateICmp(MinMaxPred, LHS, RHS);
+ } else {
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ auto FMF =
+ cast<FPMathOperator>(SI.getCondition())->getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+ Cmp = Builder.CreateFCmp(MinMaxPred, LHS, RHS);
+ }
+
+ Value *NewSI = Builder.CreateSelect(Cmp, LHS, RHS, SI.getName(), &SI);
+ if (!IsCastNeeded)
+ return replaceInstUsesWith(SI, NewSI);
+
+ Value *NewCast = Builder.CreateCast(CastOp, NewSI, SelType);
+ return replaceInstUsesWith(SI, NewCast);
+ }
+
+ // MAX(~a, ~b) -> ~MIN(a, b)
+ // MAX(~a, C) -> ~MIN(a, ~C)
+ // MIN(~a, ~b) -> ~MAX(a, b)
+ // MIN(~a, C) -> ~MAX(a, ~C)
+ auto moveNotAfterMinMax = [&](Value *X, Value *Y) -> Instruction * {
+ Value *A;
+ if (match(X, m_Not(m_Value(A))) && !X->hasNUsesOrMore(3) &&
+ !isFreeToInvert(A, A->hasOneUse()) &&
+ // Passing false to only consider m_Not and constants.
+ isFreeToInvert(Y, false)) {
+ Value *B = Builder.CreateNot(Y);
+ Value *NewMinMax = createMinMax(Builder, getInverseMinMaxFlavor(SPF),
+ A, B);
+ // Copy the profile metadata.
+ if (MDNode *MD = SI.getMetadata(LLVMContext::MD_prof)) {
+ cast<SelectInst>(NewMinMax)->setMetadata(LLVMContext::MD_prof, MD);
+ // Swap the metadata if the operands are swapped.
+ if (X == SI.getFalseValue() && Y == SI.getTrueValue())
+ cast<SelectInst>(NewMinMax)->swapProfMetadata();
+ }
+
+ return BinaryOperator::CreateNot(NewMinMax);
+ }
+
+ return nullptr;
+ };
+
+ if (Instruction *I = moveNotAfterMinMax(LHS, RHS))
+ return I;
+ if (Instruction *I = moveNotAfterMinMax(RHS, LHS))
+ return I;
+
+ if (Instruction *I = moveAddAfterMinMax(SPF, LHS, RHS, Builder))
+ return I;
+
+ if (Instruction *I = factorizeMinMaxTree(SPF, LHS, RHS, Builder))
+ return I;
+ if (Instruction *I = matchSAddSubSat(SI))
+ return I;
+ }
+ }
+
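Not part of the LLVM source: a minimal standalone C++ sketch of the De Morgan-style identities behind moveNotAfterMinMax above, MAX(~a, ~b) == ~MIN(a, b) and MIN(~a, ~b) == ~MAX(a, b), shown here for unsigned values (bitwise not reverses the unsigned order).

#include <algorithm>
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t A : {0u, 17u, 0xFFFFFFFFu})
    for (uint32_t B : {5u, 42u}) {
      assert(std::max(~A, ~B) == ~std::min(A, B)); // MAX(~a, ~b) -> ~MIN(a, b)
      assert(std::min(~A, ~B) == ~std::max(A, B)); // MIN(~a, ~b) -> ~MAX(a, b)
    }
  return 0;
}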
+ // Canonicalize select of FP values where NaN and -0.0 are not valid as
+ // minnum/maxnum intrinsics.
+ if (isa<FPMathOperator>(SI) && SI.hasNoNaNs() && SI.hasNoSignedZeros()) {
+ Value *X, *Y;
+ if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y))))
+ return replaceInstUsesWith(
+ SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));
+
+ if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y))))
+ return replaceInstUsesWith(
+ SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
+ }
+
+ // See if we can fold the select into a phi node if the condition is a select.
+ if (auto *PN = dyn_cast<PHINode>(SI.getCondition()))
+ // The true/false values have to be live in the PHI predecessor's blocks.
+ if (canSelectOperandBeMappingIntoPredBlock(TrueVal, SI) &&
+ canSelectOperandBeMappingIntoPredBlock(FalseVal, SI))
+ if (Instruction *NV = foldOpIntoPhi(SI, PN))
+ return NV;
+
+ if (SelectInst *TrueSI = dyn_cast<SelectInst>(TrueVal)) {
+ if (TrueSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, select(C, a, b), c) -> select(C, a, c)
+ if (TrueSI->getCondition() == CondVal) {
+ if (SI.getTrueValue() == TrueSI->getTrueValue())
+ return nullptr;
+ return replaceOperand(SI, 1, TrueSI->getTrueValue());
+ }
+ // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
// We choose this as normal form to enable folding on the And and
// shortening paths for the values (this helps getUnderlyingObjects() for
// example).
- if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
- Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
- replaceOperand(SI, 0, And);
- replaceOperand(SI, 1, TrueSI->getTrueValue());
- return &SI;
- }
- }
- }
- if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
- if (FalseSI->getCondition()->getType() == CondVal->getType()) {
- // select(C, a, select(C, b, c)) -> select(C, a, c)
- if (FalseSI->getCondition() == CondVal) {
- if (SI.getFalseValue() == FalseSI->getFalseValue())
- return nullptr;
- return replaceOperand(SI, 2, FalseSI->getFalseValue());
- }
- // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
- if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
- Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
- replaceOperand(SI, 0, Or);
- replaceOperand(SI, 2, FalseSI->getFalseValue());
- return &SI;
- }
- }
- }
-
- auto canMergeSelectThroughBinop = [](BinaryOperator *BO) {
- // The select might be preventing a division by 0.
- switch (BO->getOpcode()) {
- default:
- return true;
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::SDiv:
- case Instruction::UDiv:
- return false;
- }
- };
-
- // Try to simplify a binop sandwiched between 2 selects with the same
- // condition.
- // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
- BinaryOperator *TrueBO;
- if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
- canMergeSelectThroughBinop(TrueBO)) {
- if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
- if (TrueBOSI->getCondition() == CondVal) {
- replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue());
- Worklist.push(TrueBO);
- return &SI;
- }
- }
- if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) {
- if (TrueBOSI->getCondition() == CondVal) {
- replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue());
- Worklist.push(TrueBO);
- return &SI;
- }
- }
- }
-
- // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
- BinaryOperator *FalseBO;
- if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
- canMergeSelectThroughBinop(FalseBO)) {
- if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
- if (FalseBOSI->getCondition() == CondVal) {
- replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue());
- Worklist.push(FalseBO);
- return &SI;
- }
- }
- if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) {
- if (FalseBOSI->getCondition() == CondVal) {
- replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue());
- Worklist.push(FalseBO);
- return &SI;
- }
- }
- }
-
- Value *NotCond;
+ if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
+ Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
+ replaceOperand(SI, 0, And);
+ replaceOperand(SI, 1, TrueSI->getTrueValue());
+ return &SI;
+ }
+ }
+ }
+ if (SelectInst *FalseSI = dyn_cast<SelectInst>(FalseVal)) {
+ if (FalseSI->getCondition()->getType() == CondVal->getType()) {
+ // select(C, a, select(C, b, c)) -> select(C, a, c)
+ if (FalseSI->getCondition() == CondVal) {
+ if (SI.getFalseValue() == FalseSI->getFalseValue())
+ return nullptr;
+ return replaceOperand(SI, 2, FalseSI->getFalseValue());
+ }
+ // select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
+ if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
+ Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
+ replaceOperand(SI, 0, Or);
+ replaceOperand(SI, 2, FalseSI->getFalseValue());
+ return &SI;
+ }
+ }
+ }
+
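Not part of the LLVM source: a minimal standalone C++ sketch of the nested-select folds above, written with ternaries: select(C, select(C, a, b), c) == select(C, a, c), and select(C0, select(C1, a, b), b) == select(C0&C1, a, b).

#include <cassert>

int main() {
  int A = 1, B = 2, D = 3;
  // select(C, select(C, a, b), c) -> select(C, a, c)
  for (bool C : {false, true})
    assert((C ? (C ? A : B) : D) == (C ? A : D));
  // select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
  for (bool C0 : {false, true})
    for (bool C1 : {false, true})
      assert((C0 ? (C1 ? A : B) : B) == ((C0 && C1) ? A : B));
  return 0;
}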
+ auto canMergeSelectThroughBinop = [](BinaryOperator *BO) {
+ // The select might be preventing a division by 0.
+ switch (BO->getOpcode()) {
+ default:
+ return true;
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ return false;
+ }
+ };
+
+ // Try to simplify a binop sandwiched between 2 selects with the same
+ // condition.
+ // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
+ BinaryOperator *TrueBO;
+ if (match(TrueVal, m_OneUse(m_BinOp(TrueBO))) &&
+ canMergeSelectThroughBinop(TrueBO)) {
+ if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
+ if (TrueBOSI->getCondition() == CondVal) {
+ replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
+ return &SI;
+ }
+ }
+ if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) {
+ if (TrueBOSI->getCondition() == CondVal) {
+ replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
+ return &SI;
+ }
+ }
+ }
+
+ // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
+ BinaryOperator *FalseBO;
+ if (match(FalseVal, m_OneUse(m_BinOp(FalseBO))) &&
+ canMergeSelectThroughBinop(FalseBO)) {
+ if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
+ if (FalseBOSI->getCondition() == CondVal) {
+ replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
+ return &SI;
+ }
+ }
+ if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) {
+ if (FalseBOSI->getCondition() == CondVal) {
+ replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
+ return &SI;
+ }
+ }
+ }
+
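Not part of the LLVM source: a minimal standalone C++ sketch of the select-through-binop fold above, select(C, binop(select(C, X, Y), W), Z) == select(C, binop(X, W), Z). The opcode filter in canMergeSelectThroughBinop keeps the transform away from division-like binops, where the outer select may be guarding a division by zero.

#include <cassert>

int main() {
  int X = 5, Y = 7, W = 11, Z = 13;
  // select(C, binop(select(C, X, Y), W), Z) -> select(C, binop(X, W), Z)
  for (bool C : {false, true})
    assert((C ? (C ? X : Y) + W : Z) == (C ? X + W : Z));
  // select(C, Z, binop(select(C, X, Y), W)) -> select(C, Z, binop(Y, W))
  for (bool C : {false, true})
    assert((C ? Z : (C ? X : Y) + W) == (C ? Z : Y + W));
  return 0;
}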
+ Value *NotCond;
if (match(CondVal, m_Not(m_Value(NotCond))) &&
!InstCombiner::shouldAvoidAbsorbingNotIntoSelect(SI)) {
- replaceOperand(SI, 0, NotCond);
- SI.swapValues();
- SI.swapProfMetadata();
- return &SI;
- }
-
- if (Instruction *I = foldVectorSelect(SI))
- return I;
-
- // If we can compute the condition, there's no need for a select.
- // Like the above fold, we are attempting to reduce compile-time cost by
- // putting this fold here with limitations rather than in InstSimplify.
- // The motivation for this call into value tracking is to take advantage of
- // the assumption cache, so make sure that is populated.
- if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
- KnownBits Known(1);
- computeKnownBits(CondVal, Known, 0, &SI);
- if (Known.One.isOneValue())
- return replaceInstUsesWith(SI, TrueVal);
- if (Known.Zero.isOneValue())
- return replaceInstUsesWith(SI, FalseVal);
- }
-
- if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
- return BitCastSel;
-
- // Simplify selects that test the returned flag of cmpxchg instructions.
- if (Value *V = foldSelectCmpXchg(SI))
- return replaceInstUsesWith(SI, V);
-
- if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI, *this))
- return Select;
-
+ replaceOperand(SI, 0, NotCond);
+ SI.swapValues();
+ SI.swapProfMetadata();
+ return &SI;
+ }
+
+ if (Instruction *I = foldVectorSelect(SI))
+ return I;
+
+ // If we can compute the condition, there's no need for a select.
+ // Like the above fold, we are attempting to reduce compile-time cost by
+ // putting this fold here with limitations rather than in InstSimplify.
+ // The motivation for this call into value tracking is to take advantage of
+ // the assumption cache, so make sure that is populated.
+ if (!CondVal->getType()->isVectorTy() && !AC.assumptions().empty()) {
+ KnownBits Known(1);
+ computeKnownBits(CondVal, Known, 0, &SI);
+ if (Known.One.isOneValue())
+ return replaceInstUsesWith(SI, TrueVal);
+ if (Known.Zero.isOneValue())
+ return replaceInstUsesWith(SI, FalseVal);
+ }
+
+ if (Instruction *BitCastSel = foldSelectCmpBitcasts(SI, Builder))
+ return BitCastSel;
+
+ // Simplify selects that test the returned flag of cmpxchg instructions.
+ if (Value *V = foldSelectCmpXchg(SI))
+ return replaceInstUsesWith(SI, V);
+
+ if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI, *this))
+ return Select;
+
if (Instruction *Funnel = foldSelectFunnelShift(SI, Builder))
return Funnel;
-
- if (Instruction *Copysign = foldSelectToCopysign(SI, Builder))
- return Copysign;
-
- if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
- return replaceInstUsesWith(SI, PN);
-
+
+ if (Instruction *Copysign = foldSelectToCopysign(SI, Builder))
+ return Copysign;
+
+ if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
+ return replaceInstUsesWith(SI, PN);
+
if (Value *Fr = foldSelectWithFrozenICmp(SI, Builder))
return replaceInstUsesWith(SI, Fr);
- return nullptr;
-}
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 194c67f595..127bf80809 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -1,26 +1,26 @@
-//===- InstCombineShifts.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the visitShl, visitLShr, and visitAShr functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
+//===- InstCombineShifts.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the visitShl, visitLShr, and visitAShr functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
Value *ShAmt1) {
// We have two shift amounts from two different shifts. The types of those
@@ -45,911 +45,911 @@ bool canTryToConstantAddTwoShiftAmounts(Value *Sh0, Value *ShAmt0, Value *Sh1,
return MaximalRepresentableShiftAmount.uge(MaximalPossibleTotalShiftAmount);
}
-// Given pattern:
-// (x shiftopcode Q) shiftopcode K
-// we should rewrite it as
-// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) and
-//
-// This is valid for any shift, but they must be identical, and we must be
-// careful in case we have (zext(Q)+zext(K)) and look past extensions,
-// (Q+K) must not overflow or else (Q+K) u< bitwidth(x) is bogus.
-//
-// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
-// pattern has any 2 right-shifts that sum to 1 less than original bit width.
+// Given pattern:
+// (x shiftopcode Q) shiftopcode K
+// we should rewrite it as
+// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x) and
+//
+// This is valid for any shift, but they must be identical, and we must be
+// careful in case we have (zext(Q)+zext(K)) and look past extensions,
+// (Q+K) must not overflow or else (Q+K) u< bitwidth(x) is bogus.
+//
+// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
+// pattern has any 2 right-shifts that sum to 1 less than original bit width.
Value *InstCombinerImpl::reassociateShiftAmtsOfTwoSameDirectionShifts(
- BinaryOperator *Sh0, const SimplifyQuery &SQ,
- bool AnalyzeForSignBitExtraction) {
- // Look for a shift of some instruction, ignore zext of shift amount if any.
- Instruction *Sh0Op0;
- Value *ShAmt0;
- if (!match(Sh0,
- m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0)))))
- return nullptr;
-
- // If there is a truncation between the two shifts, we must make note of it
- // and look through it. The truncation imposes additional constraints on the
- // transform.
- Instruction *Sh1;
- Value *Trunc = nullptr;
- match(Sh0Op0,
- m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)),
- m_Instruction(Sh1)));
-
- // Inner shift: (x shiftopcode ShAmt1)
- // Like with other shift, ignore zext of shift amount if any.
- Value *X, *ShAmt1;
- if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
- return nullptr;
-
+ BinaryOperator *Sh0, const SimplifyQuery &SQ,
+ bool AnalyzeForSignBitExtraction) {
+ // Look for a shift of some instruction, ignore zext of shift amount if any.
+ Instruction *Sh0Op0;
+ Value *ShAmt0;
+ if (!match(Sh0,
+ m_Shift(m_Instruction(Sh0Op0), m_ZExtOrSelf(m_Value(ShAmt0)))))
+ return nullptr;
+
+ // If there is a truncation between the two shifts, we must make note of it
+ // and look through it. The truncation imposes additional constraints on the
+ // transform.
+ Instruction *Sh1;
+ Value *Trunc = nullptr;
+ match(Sh0Op0,
+ m_CombineOr(m_CombineAnd(m_Trunc(m_Instruction(Sh1)), m_Value(Trunc)),
+ m_Instruction(Sh1)));
+
+ // Inner shift: (x shiftopcode ShAmt1)
+ // Like with other shift, ignore zext of shift amount if any.
+ Value *X, *ShAmt1;
+ if (!match(Sh1, m_Shift(m_Value(X), m_ZExtOrSelf(m_Value(ShAmt1)))))
+ return nullptr;
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(Sh0, ShAmt0, Sh1, ShAmt1))
- return nullptr;
-
- // We are only looking for signbit extraction if we have two right shifts.
- bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) &&
- match(Sh1, m_Shr(m_Value(), m_Value()));
- // ... and if it's not two right-shifts, we know the answer already.
- if (AnalyzeForSignBitExtraction && !HadTwoRightShifts)
- return nullptr;
-
- // The shift opcodes must be identical, unless we are just checking whether
- // this pattern can be interpreted as a sign-bit-extraction.
- Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
- bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode();
- if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction)
- return nullptr;
-
-  // If we saw truncation, we'll need to produce an extra instruction,
- // and for that one of the operands of the shift must be one-use,
- // unless of course we don't actually plan to produce any instructions here.
- if (Trunc && !AnalyzeForSignBitExtraction &&
- !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Can we fold (ShAmt0+ShAmt1) ?
- auto *NewShAmt = dyn_cast_or_null<Constant>(
- SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
- SQ.getWithInstruction(Sh0)));
- if (!NewShAmt)
- return nullptr; // Did not simplify.
- unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits();
- unsigned XBitWidth = X->getType()->getScalarSizeInBits();
- // Is the new shift amount smaller than the bit width of inner/new shift?
- if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
- APInt(NewShAmtBitWidth, XBitWidth))))
- return nullptr; // FIXME: could perform constant-folding.
-
- // If there was a truncation, and we have a right-shift, we can only fold if
- // we are left with the original sign bit. Likewise, if we were just checking
-  // that this is a sign bit extraction, this is the place to check it.
- // FIXME: zero shift amount is also legal here, but we can't *easily* check
- // more than one predicate so it's not really worth it.
- if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) {
- // If it's not a sign bit extraction, then we're done.
- if (!match(NewShAmt,
- m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(NewShAmtBitWidth, XBitWidth - 1))))
- return nullptr;
- // If it is, and that was the question, return the base value.
- if (AnalyzeForSignBitExtraction)
- return X;
- }
-
- assert(IdenticalShOpcodes && "Should not get here with different shifts.");
-
- // All good, we can do this fold.
- NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
-
- BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
-
- // The flags can only be propagated if there wasn't a trunc.
- if (!Trunc) {
- // If the pattern did not involve trunc, and both of the original shifts
- // had the same flag set, preserve the flag.
- if (ShiftOpcode == Instruction::BinaryOps::Shl) {
- NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
- Sh1->hasNoUnsignedWrap());
- NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
- Sh1->hasNoSignedWrap());
- } else {
- NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
- }
- }
-
- Instruction *Ret = NewShift;
- if (Trunc) {
- Builder.Insert(NewShift);
- Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
- }
-
- return Ret;
-}
-
-// If we have some pattern that leaves only some low bits set, and then performs
-// left-shift of those bits, if none of the bits that are left after the final
-// shift are modified by the mask, we can omit the mask.
-//
-// There are many variants to this pattern:
-// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
-// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
-// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
-// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
-// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
-// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
-// All these patterns can be simplified to just:
-// x << ShiftShAmt
-// iff:
-// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x)
-// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt)
-static Instruction *
-dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
- const SimplifyQuery &Q,
- InstCombiner::BuilderTy &Builder) {
- assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl &&
- "The input must be 'shl'!");
-
- Value *Masked, *ShiftShAmt;
- match(OuterShift,
- m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt))));
-
- // *If* there is a truncation between an outer shift and a possibly-mask,
- // then said truncation *must* be one-use, else we can't perform the fold.
- Value *Trunc;
- if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) &&
- !Trunc->hasOneUse())
- return nullptr;
-
- Type *NarrowestTy = OuterShift->getType();
- Type *WidestTy = Masked->getType();
- bool HadTrunc = WidestTy != NarrowestTy;
-
- // The mask must be computed in a type twice as wide to ensure
- // that no bits are lost if the sum-of-shifts is wider than the base type.
- Type *ExtendedTy = WidestTy->getExtendedType();
-
- Value *MaskShAmt;
-
- // ((1 << MaskShAmt) - 1)
- auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
-  // (~(-1 << MaskShAmt))
- auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
- // (-1 >> MaskShAmt)
- auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
- // ((-1 << MaskShAmt) >> MaskShAmt)
- auto MaskD =
- m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
-
- Value *X;
- Constant *NewMask;
-
- if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
- // Peek through an optional zext of the shift amount.
- match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
-
+ return nullptr;
+
+ // We are only looking for signbit extraction if we have two right shifts.
+ bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) &&
+ match(Sh1, m_Shr(m_Value(), m_Value()));
+ // ... and if it's not two right-shifts, we know the answer already.
+ if (AnalyzeForSignBitExtraction && !HadTwoRightShifts)
+ return nullptr;
+
+ // The shift opcodes must be identical, unless we are just checking whether
+ // this pattern can be interpreted as a sign-bit-extraction.
+ Instruction::BinaryOps ShiftOpcode = Sh0->getOpcode();
+ bool IdenticalShOpcodes = Sh0->getOpcode() == Sh1->getOpcode();
+ if (!IdenticalShOpcodes && !AnalyzeForSignBitExtraction)
+ return nullptr;
+
+  // If we saw truncation, we'll need to produce an extra instruction,
+ // and for that one of the operands of the shift must be one-use,
+ // unless of course we don't actually plan to produce any instructions here.
+ if (Trunc && !AnalyzeForSignBitExtraction &&
+ !match(Sh0, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Can we fold (ShAmt0+ShAmt1) ?
+ auto *NewShAmt = dyn_cast_or_null<Constant>(
+ SimplifyAddInst(ShAmt0, ShAmt1, /*isNSW=*/false, /*isNUW=*/false,
+ SQ.getWithInstruction(Sh0)));
+ if (!NewShAmt)
+ return nullptr; // Did not simplify.
+ unsigned NewShAmtBitWidth = NewShAmt->getType()->getScalarSizeInBits();
+ unsigned XBitWidth = X->getType()->getScalarSizeInBits();
+ // Is the new shift amount smaller than the bit width of inner/new shift?
+ if (!match(NewShAmt, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_ULT,
+ APInt(NewShAmtBitWidth, XBitWidth))))
+ return nullptr; // FIXME: could perform constant-folding.
+
+ // If there was a truncation, and we have a right-shift, we can only fold if
+ // we are left with the original sign bit. Likewise, if we were just checking
+  // that this is a sign bit extraction, this is the place to check it.
+ // FIXME: zero shift amount is also legal here, but we can't *easily* check
+ // more than one predicate so it's not really worth it.
+ if (HadTwoRightShifts && (Trunc || AnalyzeForSignBitExtraction)) {
+ // If it's not a sign bit extraction, then we're done.
+ if (!match(NewShAmt,
+ m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(NewShAmtBitWidth, XBitWidth - 1))))
+ return nullptr;
+ // If it is, and that was the question, return the base value.
+ if (AnalyzeForSignBitExtraction)
+ return X;
+ }
+
+ assert(IdenticalShOpcodes && "Should not get here with different shifts.");
+
+ // All good, we can do this fold.
+ NewShAmt = ConstantExpr::getZExtOrBitCast(NewShAmt, X->getType());
+
+ BinaryOperator *NewShift = BinaryOperator::Create(ShiftOpcode, X, NewShAmt);
+
+ // The flags can only be propagated if there wasn't a trunc.
+ if (!Trunc) {
+ // If the pattern did not involve trunc, and both of the original shifts
+ // had the same flag set, preserve the flag.
+ if (ShiftOpcode == Instruction::BinaryOps::Shl) {
+ NewShift->setHasNoUnsignedWrap(Sh0->hasNoUnsignedWrap() &&
+ Sh1->hasNoUnsignedWrap());
+ NewShift->setHasNoSignedWrap(Sh0->hasNoSignedWrap() &&
+ Sh1->hasNoSignedWrap());
+ } else {
+ NewShift->setIsExact(Sh0->isExact() && Sh1->isExact());
+ }
+ }
+
+ Instruction *Ret = NewShift;
+ if (Trunc) {
+ Builder.Insert(NewShift);
+ Ret = CastInst::Create(Instruction::Trunc, NewShift, Sh0->getType());
+ }
+
+ return Ret;
+}
+
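Not part of the LLVM source: a minimal standalone C++ sketch of the reassociation above, (x shift q) shift k == x shift (q + k), valid as long as q + k stays below the bit width (otherwise the combined shift would be undefined in C++, which is why the fold checks the summed amount).

#include <cassert>
#include <cstdint>

int main() {
  uint32_t X = 0xDEADBEEFu;
  for (unsigned Q = 0; Q < 16; ++Q)
    for (unsigned K = 0; K < 16; ++K) {
      // Q + K <= 30 here, so the combined shift amount stays in range.
      assert(((X >> Q) >> K) == (X >> (Q + K)));
      assert(((X << Q) << K) == (X << (Q + K)));
    }
  return 0;
}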
+// If we have some pattern that leaves only some low bits set, and then performs
+// left-shift of those bits, if none of the bits that are left after the final
+// shift are modified by the mask, we can omit the mask.
+//
+// There are many variants to this pattern:
+// a) (x & ((1 << MaskShAmt) - 1)) << ShiftShAmt
+// b) (x & (~(-1 << MaskShAmt))) << ShiftShAmt
+// c) (x & (-1 >> MaskShAmt)) << ShiftShAmt
+// d) (x & ((-1 << MaskShAmt) >> MaskShAmt)) << ShiftShAmt
+// e) ((x << MaskShAmt) l>> MaskShAmt) << ShiftShAmt
+// f) ((x << MaskShAmt) a>> MaskShAmt) << ShiftShAmt
+// All these patterns can be simplified to just:
+// x << ShiftShAmt
+// iff:
+// a,b) (MaskShAmt+ShiftShAmt) u>= bitwidth(x)
+// c,d,e,f) (ShiftShAmt-MaskShAmt) s>= 0 (i.e. ShiftShAmt u>= MaskShAmt)
+static Instruction *
+dropRedundantMaskingOfLeftShiftInput(BinaryOperator *OuterShift,
+ const SimplifyQuery &Q,
+ InstCombiner::BuilderTy &Builder) {
+ assert(OuterShift->getOpcode() == Instruction::BinaryOps::Shl &&
+ "The input must be 'shl'!");
+
+ Value *Masked, *ShiftShAmt;
+ match(OuterShift,
+ m_Shift(m_Value(Masked), m_ZExtOrSelf(m_Value(ShiftShAmt))));
+
+ // *If* there is a truncation between an outer shift and a possible mask,
+ // then said truncation *must* be one-use, else we can't perform the fold.
+ Value *Trunc;
+ if (match(Masked, m_CombineAnd(m_Trunc(m_Value(Masked)), m_Value(Trunc))) &&
+ !Trunc->hasOneUse())
+ return nullptr;
+
+ Type *NarrowestTy = OuterShift->getType();
+ Type *WidestTy = Masked->getType();
+ bool HadTrunc = WidestTy != NarrowestTy;
+
+ // The mask must be computed in a type twice as wide to ensure
+ // that no bits are lost if the sum-of-shifts is wider than the base type.
+ Type *ExtendedTy = WidestTy->getExtendedType();
+
+ Value *MaskShAmt;
+
+ // ((1 << MaskShAmt) - 1)
+ auto MaskA = m_Add(m_Shl(m_One(), m_Value(MaskShAmt)), m_AllOnes());
+ // (~(-1 << maskNbits))
+ auto MaskB = m_Xor(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_AllOnes());
+ // (-1 >> MaskShAmt)
+ auto MaskC = m_Shr(m_AllOnes(), m_Value(MaskShAmt));
+ // ((-1 << MaskShAmt) >> MaskShAmt)
+ auto MaskD =
+ m_Shr(m_Shl(m_AllOnes(), m_Value(MaskShAmt)), m_Deferred(MaskShAmt));
+
+ Value *X;
+ Constant *NewMask;
+
+ if (match(Masked, m_c_And(m_CombineOr(MaskA, MaskB), m_Value(X)))) {
+ // Peek through an optional zext of the shift amount.
+ match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
MaskShAmt))
- return nullptr;
-
- // Can we simplify (MaskShAmt+ShiftShAmt) ?
- auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
- MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
- if (!SumOfShAmts)
- return nullptr; // Did not simplify.
- // In this pattern SumOfShAmts correlates with the number of low bits
- // that shall remain in the root value (OuterShift).
-
- // An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the `undef` shift amounts with final
- // shift bitwidth to ensure that the value remains undef when creating the
- // subsequent shift op.
- SumOfShAmts = Constant::replaceUndefsWith(
- SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
- ExtendedTy->getScalarSizeInBits()));
- auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
- // And compute the mask as usual: ~(-1 << (SumOfShAmts))
- auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
- auto *ExtendedInvertedMask =
- ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts);
- NewMask = ConstantExpr::getNot(ExtendedInvertedMask);
- } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
- match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
- m_Deferred(MaskShAmt)))) {
- // Peek through an optional zext of the shift amount.
- match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
-
+ return nullptr;
+
+ // Can we simplify (MaskShAmt+ShiftShAmt) ?
+ auto *SumOfShAmts = dyn_cast_or_null<Constant>(SimplifyAddInst(
+ MaskShAmt, ShiftShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+ if (!SumOfShAmts)
+ return nullptr; // Did not simplify.
+ // In this pattern SumOfShAmts correlates with the number of low bits
+ // that shall remain in the root value (OuterShift).
+
+ // An extend of an undef value becomes zero because the high bits are never
+ // completely unknown. Replace the `undef` shift amounts with final
+ // shift bitwidth to ensure that the value remains undef when creating the
+ // subsequent shift op.
+ SumOfShAmts = Constant::replaceUndefsWith(
+ SumOfShAmts, ConstantInt::get(SumOfShAmts->getType()->getScalarType(),
+ ExtendedTy->getScalarSizeInBits()));
+ auto *ExtendedSumOfShAmts = ConstantExpr::getZExt(SumOfShAmts, ExtendedTy);
+ // And compute the mask as usual: ~(-1 << (SumOfShAmts))
+ auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
+ auto *ExtendedInvertedMask =
+ ConstantExpr::getShl(ExtendedAllOnes, ExtendedSumOfShAmts);
+ NewMask = ConstantExpr::getNot(ExtendedInvertedMask);
+ } else if (match(Masked, m_c_And(m_CombineOr(MaskC, MaskD), m_Value(X))) ||
+ match(Masked, m_Shr(m_Shl(m_Value(X), m_Value(MaskShAmt)),
+ m_Deferred(MaskShAmt)))) {
+ // Peek through an optional zext of the shift amount.
+ match(MaskShAmt, m_ZExtOrSelf(m_Value(MaskShAmt)));
+
// Verify that it would be safe to try to add those two shift amounts.
if (!canTryToConstantAddTwoShiftAmounts(OuterShift, ShiftShAmt, Masked,
MaskShAmt))
- return nullptr;
-
- // Can we simplify (ShiftShAmt-MaskShAmt) ?
- auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
- ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
- if (!ShAmtsDiff)
- return nullptr; // Did not simplify.
- // In this pattern ShAmtsDiff correlates with the number of high bits that
- // shall be unset in the root value (OuterShift).
-
- // An extend of an undef value becomes zero because the high bits are never
- // completely unknown. Replace the `undef` shift amounts with negated
- // bitwidth of innermost shift to ensure that the value remains undef when
- // creating the subsequent shift op.
- unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
- ShAmtsDiff = Constant::replaceUndefsWith(
- ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(),
- -WidestTyBitWidth));
- auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt(
- ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(),
- WidestTyBitWidth,
- /*isSigned=*/false),
- ShAmtsDiff),
- ExtendedTy);
- // And compute the mask as usual: (-1 l>> (NumHighBitsToClear))
- auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
- NewMask =
- ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear);
- } else
- return nullptr; // Don't know anything about this pattern.
-
- NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy);
-
- // Does this mask have any unset bits? If not, we can simply skip applying it.
- bool NeedMask = !match(NewMask, m_AllOnes());
-
- // If we need to apply a mask, there are several more restrictions we have.
- if (NeedMask) {
- // The old masking instruction must go away.
- if (!Masked->hasOneUse())
- return nullptr;
- // The original "masking" instruction must not have been `ashr`.
- if (match(Masked, m_AShr(m_Value(), m_Value())))
- return nullptr;
- }
-
- // If we need to apply truncation, let's do it first, since we can.
- // We have already ensured that the old truncation will go away.
- if (HadTrunc)
- X = Builder.CreateTrunc(X, NarrowestTy);
-
- // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits.
- // We didn't change the Type of this outermost shift, so we can just do it.
- auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X,
- OuterShift->getOperand(1));
- if (!NeedMask)
- return NewShift;
-
- Builder.Insert(NewShift);
- return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
-}
-
-/// If we have a shift-by-constant of a bitwise logic op that itself has a
-/// shift-by-constant operand with identical opcode, we may be able to convert
-/// that into 2 independent shifts followed by the logic op. This eliminates
-/// a use of an intermediate value (reduces dependency chain).
-static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
- InstCombiner::BuilderTy &Builder) {
- assert(I.isShift() && "Expected a shift as input");
- auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
- if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
- return nullptr;
-
+ return nullptr;
+
+ // Can we simplify (ShiftShAmt-MaskShAmt) ?
+ auto *ShAmtsDiff = dyn_cast_or_null<Constant>(SimplifySubInst(
+ ShiftShAmt, MaskShAmt, /*IsNSW=*/false, /*IsNUW=*/false, Q));
+ if (!ShAmtsDiff)
+ return nullptr; // Did not simplify.
+ // In this pattern ShAmtsDiff correlates with the number of high bits that
+ // shall be unset in the root value (OuterShift).
+
+ // An extend of an undef value becomes zero because the high bits are never
+ // completely unknown. Replace the `undef` shift amounts with negated
+ // bitwidth of innermost shift to ensure that the value remains undef when
+ // creating the subsequent shift op.
+ unsigned WidestTyBitWidth = WidestTy->getScalarSizeInBits();
+ ShAmtsDiff = Constant::replaceUndefsWith(
+ ShAmtsDiff, ConstantInt::get(ShAmtsDiff->getType()->getScalarType(),
+ -WidestTyBitWidth));
+ auto *ExtendedNumHighBitsToClear = ConstantExpr::getZExt(
+ ConstantExpr::getSub(ConstantInt::get(ShAmtsDiff->getType(),
+ WidestTyBitWidth,
+ /*isSigned=*/false),
+ ShAmtsDiff),
+ ExtendedTy);
+ // And compute the mask as usual: (-1 l>> (NumHighBitsToClear))
+ auto *ExtendedAllOnes = ConstantExpr::getAllOnesValue(ExtendedTy);
+ NewMask =
+ ConstantExpr::getLShr(ExtendedAllOnes, ExtendedNumHighBitsToClear);
+ } else
+ return nullptr; // Don't know anything about this pattern.
+
+ NewMask = ConstantExpr::getTrunc(NewMask, NarrowestTy);
+
+ // Does this mask have any unset bits? If not, we can simply skip applying it.
+ bool NeedMask = !match(NewMask, m_AllOnes());
+
+ // If we need to apply a mask, there are several more restrictions we have.
+ if (NeedMask) {
+ // The old masking instruction must go away.
+ if (!Masked->hasOneUse())
+ return nullptr;
+ // The original "masking" instruction must not have been `ashr`.
+ if (match(Masked, m_AShr(m_Value(), m_Value())))
+ return nullptr;
+ }
+
+ // If we need to apply truncation, let's do it first, since we can.
+ // We have already ensured that the old truncation will go away.
+ if (HadTrunc)
+ X = Builder.CreateTrunc(X, NarrowestTy);
+
+ // No 'NUW'/'NSW'! We no longer know that we won't shift-out non-0 bits.
+ // We didn't change the Type of this outermost shift, so we can just do it.
+ auto *NewShift = BinaryOperator::Create(OuterShift->getOpcode(), X,
+ OuterShift->getOperand(1));
+ if (!NeedMask)
+ return NewShift;
+
+ Builder.Insert(NewShift);
+ return BinaryOperator::Create(Instruction::And, NewShift, NewMask);
+}
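
A minimal standalone check of two of the mask variants listed above (not LLVM code; the mask and shift amounts are illustrative), confirming that the mask is redundant under the stated conditions:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
        // Variant (a): (x & ((1 << m) - 1)) << s == x << s  iff m + s >= 32.
        const unsigned m1 = 24, s1 = 8;                        // 24 + 8 == 32
        assert(((x & ((1u << m1) - 1u)) << s1) == (x << s1));
        // Variant (c): (x & (-1 u>> m)) << s == x << s  iff s >= m.
        const unsigned m2 = 3, s2 = 5;                         // 5 >= 3
        assert(((x & (~0u >> m2)) << s2) == (x << s2));
      }
      return 0;
    }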
+
+/// If we have a shift-by-constant of a bitwise logic op that itself has a
+/// shift-by-constant operand with identical opcode, we may be able to convert
+/// that into 2 independent shifts followed by the logic op. This eliminates
+/// a use of an intermediate value (reduces dependency chain).
+static Instruction *foldShiftOfShiftedLogic(BinaryOperator &I,
+ InstCombiner::BuilderTy &Builder) {
+ assert(I.isShift() && "Expected a shift as input");
+ auto *LogicInst = dyn_cast<BinaryOperator>(I.getOperand(0));
+ if (!LogicInst || !LogicInst->isBitwiseLogicOp() || !LogicInst->hasOneUse())
+ return nullptr;
+
Constant *C0, *C1;
if (!match(I.getOperand(1), m_Constant(C1)))
- return nullptr;
-
- Instruction::BinaryOps ShiftOpcode = I.getOpcode();
- Type *Ty = I.getType();
-
- // Find a matching one-use shift by constant. The fold is not valid if the sum
- // of the shift values equals or exceeds bitwidth.
- // TODO: Remove the one-use check if the other logic operand (Y) is constant.
- Value *X, *Y;
- auto matchFirstShift = [&](Value *V) {
+ return nullptr;
+
+ Instruction::BinaryOps ShiftOpcode = I.getOpcode();
+ Type *Ty = I.getType();
+
+ // Find a matching one-use shift by constant. The fold is not valid if the sum
+ // of the shift values equals or exceeds bitwidth.
+ // TODO: Remove the one-use check if the other logic operand (Y) is constant.
+ Value *X, *Y;
+ auto matchFirstShift = [&](Value *V) {
BinaryOperator *BO;
APInt Threshold(Ty->getScalarSizeInBits(), Ty->getScalarSizeInBits());
return match(V, m_BinOp(BO)) && BO->getOpcode() == ShiftOpcode &&
match(V, m_OneUse(m_Shift(m_Value(X), m_Constant(C0)))) &&
match(ConstantExpr::getAdd(C0, C1),
m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold));
- };
-
- // Logic ops are commutative, so check each operand for a match.
- if (matchFirstShift(LogicInst->getOperand(0)))
- Y = LogicInst->getOperand(1);
- else if (matchFirstShift(LogicInst->getOperand(1)))
- Y = LogicInst->getOperand(0);
- else
- return nullptr;
-
- // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
+ };
+
+ // Logic ops are commutative, so check each operand for a match.
+ if (matchFirstShift(LogicInst->getOperand(0)))
+ Y = LogicInst->getOperand(1);
+ else if (matchFirstShift(LogicInst->getOperand(1)))
+ Y = LogicInst->getOperand(0);
+ else
+ return nullptr;
+
+ // shift (logic (shift X, C0), Y), C1 -> logic (shift X, C0+C1), (shift Y, C1)
Constant *ShiftSumC = ConstantExpr::getAdd(C0, C1);
- Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
- Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
- return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
-}
-
+ Value *NewShift1 = Builder.CreateBinOp(ShiftOpcode, X, ShiftSumC);
+ Value *NewShift2 = Builder.CreateBinOp(ShiftOpcode, Y, I.getOperand(1));
+ return BinaryOperator::Create(LogicInst->getOpcode(), NewShift1, NewShift2);
+}
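
The fold works because shifts distribute over the bitwise logic ops. A standalone sketch under illustrative constants (C0 = 3, C1 = 4, so C0 + C1 stays below the bit width):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t x : {0u, 0x12345678u, 0xFFFFFFFFu})
        for (uint32_t y : {0u, 0x0F0F0F0Fu, 0xCAFEBABEu}) {
          // shift (logic (shift X, C0), Y), C1
          //   --> logic (shift X, C0 + C1), (shift Y, C1)
          assert((((x << 3) | y) << 4) == ((x << 7) | (y << 4))); // shl over or
          assert((((x >> 3) & y) >> 4) == ((x >> 7) & (y >> 4))); // lshr over and
        }
      return 0;
    }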
+
Instruction *InstCombinerImpl::commonShiftTransforms(BinaryOperator &I) {
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- assert(Op0->getType() == Op1->getType());
-
- // If the shift amount is a one-use `sext`, we can demote it to `zext`.
- Value *Y;
- if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) {
- Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName());
- return BinaryOperator::Create(I.getOpcode(), Op0, NewExt);
- }
-
- // See if we can fold away this shift.
- if (SimplifyDemandedInstructionBits(I))
- return &I;
-
- // Try to fold constant and into select arguments.
- if (isa<Constant>(Op0))
- if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
- if (Instruction *R = FoldOpIntoSelect(I, SI))
- return R;
-
- if (Constant *CUI = dyn_cast<Constant>(Op1))
- if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
- return Res;
-
- if (auto *NewShift = cast_or_null<Instruction>(
- reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)))
- return NewShift;
-
- // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
- // iff A and C2 are both positive.
- Value *A;
- Constant *C;
- if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
- if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
- isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
- return BinaryOperator::Create(
- I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
-
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ assert(Op0->getType() == Op1->getType());
+
+ // If the shift amount is a one-use `sext`, we can demote it to `zext`.
+ Value *Y;
+ if (match(Op1, m_OneUse(m_SExt(m_Value(Y))))) {
+ Value *NewExt = Builder.CreateZExt(Y, I.getType(), Op1->getName());
+ return BinaryOperator::Create(I.getOpcode(), Op0, NewExt);
+ }
+
+ // See if we can fold away this shift.
+ if (SimplifyDemandedInstructionBits(I))
+ return &I;
+
+ // Try to fold constant and into select arguments.
+ if (isa<Constant>(Op0))
+ if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+ if (Instruction *R = FoldOpIntoSelect(I, SI))
+ return R;
+
+ if (Constant *CUI = dyn_cast<Constant>(Op1))
+ if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+ return Res;
+
+ if (auto *NewShift = cast_or_null<Instruction>(
+ reassociateShiftAmtsOfTwoSameDirectionShifts(&I, SQ)))
+ return NewShift;
+
+ // (C1 shift (A add C2)) -> ((C1 shift C2) shift A)
+ // iff A and C2 are both positive.
+ Value *A;
+ Constant *C;
+ if (match(Op0, m_Constant()) && match(Op1, m_Add(m_Value(A), m_Constant(C))))
+ if (isKnownNonNegative(A, DL, 0, &AC, &I, &DT) &&
+ isKnownNonNegative(C, DL, 0, &AC, &I, &DT))
+ return BinaryOperator::Create(
+ I.getOpcode(), Builder.CreateBinOp(I.getOpcode(), Op0, C), A);
+
// X shift (A srem C) -> X shift (A and (C - 1)) iff C is a power of 2.
- // Because shifts by negative values (which could occur if A were negative)
- // are undefined.
+ // Because shifts by negative values (which could occur if A were negative)
+ // are undefined.
if (Op1->hasOneUse() && match(Op1, m_SRem(m_Value(A), m_Constant(C))) &&
match(C, m_Power2())) {
- // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
- // demand the sign bit (and many others) here??
+ // FIXME: Should this get moved into SimplifyDemandedBits by saying we don't
+ // demand the sign bit (and many others) here??
Constant *Mask = ConstantExpr::getSub(C, ConstantInt::get(I.getType(), 1));
Value *Rem = Builder.CreateAnd(A, Mask, Op1->getName());
- return replaceOperand(I, 1, Rem);
- }
-
- if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
- return Logic;
-
- return nullptr;
-}
-
-/// Return true if we can simplify two logical (either left or right) shifts
-/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
-static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
+ return replaceOperand(I, 1, Rem);
+ }
+
+ if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
+ return Logic;
+
+ return nullptr;
+}
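
The srem-to-and rewrite in the function above is the usual power-of-two remainder trick; for a non-negative dividend the two forms agree. A small standalone sketch with an assumed divisor of 8:

    #include <cassert>
    #include <cstdint>

    int main() {
      const int32_t C = 8;                       // power of two
      for (int32_t a : {0, 1, 7, 8, 9, 12345, 0x7FFFFFFF}) {
        // a srem C == a & (C - 1) for a >= 0, so the shift amount is unchanged.
        assert((a % C) == (a & (C - 1)));
        assert((1u << (a % C)) == (1u << (a & (C - 1))));
      }
      return 0;
    }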
+
+/// Return true if we can simplify two logical (either left or right) shifts
+/// that have constant shift amounts: OuterShift (InnerShift X, C1), C2.
+static bool canEvaluateShiftedShift(unsigned OuterShAmt, bool IsOuterShl,
Instruction *InnerShift,
InstCombinerImpl &IC, Instruction *CxtI) {
- assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
-
- // We need constant scalar or constant splat shifts.
- const APInt *InnerShiftConst;
- if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
- return false;
-
- // Two logical shifts in the same direction:
- // shl (shl X, C1), C2 --> shl X, C1 + C2
- // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
- bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
- if (IsInnerShl == IsOuterShl)
- return true;
-
- // Equal shift amounts in opposite directions become bitwise 'and':
- // lshr (shl X, C), C --> and X, C'
- // shl (lshr X, C), C --> and X, C'
- if (*InnerShiftConst == OuterShAmt)
- return true;
-
- // If the 2nd shift is bigger than the 1st, we can fold:
- // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
- // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
- // but it isn't profitable unless we know the and'd out bits are already zero.
- // Also, check that the inner shift is valid (less than the type width) or
- // we'll crash trying to produce the bit mask for the 'and'.
- unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
- if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
- unsigned InnerShAmt = InnerShiftConst->getZExtValue();
- unsigned MaskShift =
- IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
- APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
- if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
- return true;
- }
-
- return false;
-}
-
-/// See if we can compute the specified value, but shifted logically to the left
-/// or right by some number of bits. This should return true if the expression
-/// can be computed for the same cost as the current expression tree. This is
-/// used to eliminate extraneous shifting from things like:
-/// %C = shl i128 %A, 64
-/// %D = shl i128 %B, 96
-/// %E = or i128 %C, %D
-/// %F = lshr i128 %E, 64
-/// where the client will ask if E can be computed shifted right by 64-bits. If
-/// this succeeds, getShiftedValue() will be called to produce the value.
-static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
+ assert(InnerShift->isLogicalShift() && "Unexpected instruction type");
+
+ // We need constant scalar or constant splat shifts.
+ const APInt *InnerShiftConst;
+ if (!match(InnerShift->getOperand(1), m_APInt(InnerShiftConst)))
+ return false;
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ if (IsInnerShl == IsOuterShl)
+ return true;
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (*InnerShiftConst == OuterShAmt)
+ return true;
+
+ // If the 2nd shift is bigger than the 1st, we can fold:
+ // lshr (shl X, C1), C2 --> and (shl X, C1 - C2), C3
+ // shl (lshr X, C1), C2 --> and (lshr X, C1 - C2), C3
+ // but it isn't profitable unless we know the and'd out bits are already zero.
+ // Also, check that the inner shift is valid (less than the type width) or
+ // we'll crash trying to produce the bit mask for the 'and'.
+ unsigned TypeWidth = InnerShift->getType()->getScalarSizeInBits();
+ if (InnerShiftConst->ugt(OuterShAmt) && InnerShiftConst->ult(TypeWidth)) {
+ unsigned InnerShAmt = InnerShiftConst->getZExtValue();
+ unsigned MaskShift =
+ IsInnerShl ? TypeWidth - InnerShAmt : InnerShAmt - OuterShAmt;
+ APInt Mask = APInt::getLowBitsSet(TypeWidth, OuterShAmt) << MaskShift;
+ if (IC.MaskedValueIsZero(InnerShift->getOperand(0), Mask, 0, CxtI))
+ return true;
+ }
+
+ return false;
+}
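
The "equal shift amounts in opposite directions become a mask" case above corresponds to a plain arithmetic identity; a quick standalone check with an illustrative amount of 5:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c = 5;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        assert(((x << c) >> c) == (x & (~0u >> c)));  // lshr (shl X, C), C --> and
        assert(((x >> c) << c) == (x & (~0u << c)));  // shl (lshr X, C), C --> and
      }
      return 0;
    }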
+
+/// See if we can compute the specified value, but shifted logically to the left
+/// or right by some number of bits. This should return true if the expression
+/// can be computed for the same cost as the current expression tree. This is
+/// used to eliminate extraneous shifting from things like:
+/// %C = shl i128 %A, 64
+/// %D = shl i128 %B, 96
+/// %E = or i128 %C, %D
+/// %F = lshr i128 %E, 64
+/// where the client will ask if E can be computed shifted right by 64-bits. If
+/// this succeeds, getShiftedValue() will be called to produce the value.
+static bool canEvaluateShifted(Value *V, unsigned NumBits, bool IsLeftShift,
InstCombinerImpl &IC, Instruction *CxtI) {
- // We can always evaluate constants shifted.
- if (isa<Constant>(V))
- return true;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // We can't mutate something that has multiple uses: doing so would
- // require duplicating the instruction in general, which isn't profitable.
- if (!I->hasOneUse()) return false;
-
- switch (I->getOpcode()) {
- default: return false;
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // Bitwise operators can all be arbitrarily evaluated shifted.
- return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
- canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
-
- case Instruction::Shl:
- case Instruction::LShr:
- return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
-
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- Value *TrueVal = SI->getTrueValue();
- Value *FalseVal = SI->getFalseValue();
- return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
- canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
- }
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
- return false;
- return true;
- }
- }
-}
-
-/// Fold OuterShift (InnerShift X, C1), C2.
-/// See canEvaluateShiftedShift() for the constraints on these instructions.
-static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
- bool IsOuterShl,
- InstCombiner::BuilderTy &Builder) {
- bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
- Type *ShType = InnerShift->getType();
- unsigned TypeWidth = ShType->getScalarSizeInBits();
-
- // We only accept shifts-by-a-constant in canEvaluateShifted().
- const APInt *C1;
- match(InnerShift->getOperand(1), m_APInt(C1));
- unsigned InnerShAmt = C1->getZExtValue();
-
- // Change the shift amount and clear the appropriate IR flags.
- auto NewInnerShift = [&](unsigned ShAmt) {
- InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
- if (IsInnerShl) {
- InnerShift->setHasNoUnsignedWrap(false);
- InnerShift->setHasNoSignedWrap(false);
- } else {
- InnerShift->setIsExact(false);
- }
- return InnerShift;
- };
-
- // Two logical shifts in the same direction:
- // shl (shl X, C1), C2 --> shl X, C1 + C2
- // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
- if (IsInnerShl == IsOuterShl) {
- // If this is an oversized composite shift, then unsigned shifts get 0.
- if (InnerShAmt + OuterShAmt >= TypeWidth)
- return Constant::getNullValue(ShType);
-
- return NewInnerShift(InnerShAmt + OuterShAmt);
- }
-
- // Equal shift amounts in opposite directions become bitwise 'and':
- // lshr (shl X, C), C --> and X, C'
- // shl (lshr X, C), C --> and X, C'
- if (InnerShAmt == OuterShAmt) {
- APInt Mask = IsInnerShl
- ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
- : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
- Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
- ConstantInt::get(ShType, Mask));
- if (auto *AndI = dyn_cast<Instruction>(And)) {
- AndI->moveBefore(InnerShift);
- AndI->takeName(InnerShift);
- }
- return And;
- }
-
- assert(InnerShAmt > OuterShAmt &&
- "Unexpected opposite direction logical shift pair");
-
- // In general, we would need an 'and' for this transform, but
- // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
- // lshr (shl X, C1), C2 --> shl X, C1 - C2
- // shl (lshr X, C1), C2 --> lshr X, C1 - C2
- return NewInnerShift(InnerShAmt - OuterShAmt);
-}
-
-/// When canEvaluateShifted() returns true for an expression, this function
-/// inserts the new computation that produces the shifted value.
-static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
+ // We can always evaluate constants shifted.
+ if (isa<Constant>(V))
+ return true;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // We can't mutate something that has multiple uses: doing so would
+ // require duplicating the instruction in general, which isn't profitable.
+ if (!I->hasOneUse()) return false;
+
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can all be arbitrarily evaluated shifted.
+ return canEvaluateShifted(I->getOperand(0), NumBits, IsLeftShift, IC, I) &&
+ canEvaluateShifted(I->getOperand(1), NumBits, IsLeftShift, IC, I);
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return canEvaluateShiftedShift(NumBits, IsLeftShift, I, IC, CxtI);
+
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Value *TrueVal = SI->getTrueValue();
+ Value *FalseVal = SI->getFalseValue();
+ return canEvaluateShifted(TrueVal, NumBits, IsLeftShift, IC, SI) &&
+ canEvaluateShifted(FalseVal, NumBits, IsLeftShift, IC, SI);
+ }
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!canEvaluateShifted(IncValue, NumBits, IsLeftShift, IC, PN))
+ return false;
+ return true;
+ }
+ }
+}
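
A 64-bit analogue of the i128 example in the comment above (not LLVM code; widths and shift amounts are scaled down for illustration): the outer lshr can be pushed into both operands of the 'or', so the intermediate shifted value never needs to exist.

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint64_t a : {0ull, 0x0123456789ABCDEFull, ~0ull})
        for (uint64_t b : {0ull, 0xFEDCBA9876543210ull, ~0ull}) {
          uint64_t e = (a << 32) | (b << 48);
          // Shifting the 'or' equals or-ing the individually shifted operands...
          assert((e >> 32) == (((a << 32) >> 32) | ((b << 48) >> 32)));
          // ...which simplifies to masks and a smaller shift, with no outer lshr.
          assert((e >> 32) == ((a & 0xFFFFFFFFull) | ((b & 0xFFFFull) << 16)));
        }
      return 0;
    }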
+
+/// Fold OuterShift (InnerShift X, C1), C2.
+/// See canEvaluateShiftedShift() for the constraints on these instructions.
+static Value *foldShiftedShift(BinaryOperator *InnerShift, unsigned OuterShAmt,
+ bool IsOuterShl,
+ InstCombiner::BuilderTy &Builder) {
+ bool IsInnerShl = InnerShift->getOpcode() == Instruction::Shl;
+ Type *ShType = InnerShift->getType();
+ unsigned TypeWidth = ShType->getScalarSizeInBits();
+
+ // We only accept shifts-by-a-constant in canEvaluateShifted().
+ const APInt *C1;
+ match(InnerShift->getOperand(1), m_APInt(C1));
+ unsigned InnerShAmt = C1->getZExtValue();
+
+ // Change the shift amount and clear the appropriate IR flags.
+ auto NewInnerShift = [&](unsigned ShAmt) {
+ InnerShift->setOperand(1, ConstantInt::get(ShType, ShAmt));
+ if (IsInnerShl) {
+ InnerShift->setHasNoUnsignedWrap(false);
+ InnerShift->setHasNoSignedWrap(false);
+ } else {
+ InnerShift->setIsExact(false);
+ }
+ return InnerShift;
+ };
+
+ // Two logical shifts in the same direction:
+ // shl (shl X, C1), C2 --> shl X, C1 + C2
+ // lshr (lshr X, C1), C2 --> lshr X, C1 + C2
+ if (IsInnerShl == IsOuterShl) {
+ // If this is an oversized composite shift, then unsigned shifts get 0.
+ if (InnerShAmt + OuterShAmt >= TypeWidth)
+ return Constant::getNullValue(ShType);
+
+ return NewInnerShift(InnerShAmt + OuterShAmt);
+ }
+
+ // Equal shift amounts in opposite directions become bitwise 'and':
+ // lshr (shl X, C), C --> and X, C'
+ // shl (lshr X, C), C --> and X, C'
+ if (InnerShAmt == OuterShAmt) {
+ APInt Mask = IsInnerShl
+ ? APInt::getLowBitsSet(TypeWidth, TypeWidth - OuterShAmt)
+ : APInt::getHighBitsSet(TypeWidth, TypeWidth - OuterShAmt);
+ Value *And = Builder.CreateAnd(InnerShift->getOperand(0),
+ ConstantInt::get(ShType, Mask));
+ if (auto *AndI = dyn_cast<Instruction>(And)) {
+ AndI->moveBefore(InnerShift);
+ AndI->takeName(InnerShift);
+ }
+ return And;
+ }
+
+ assert(InnerShAmt > OuterShAmt &&
+ "Unexpected opposite direction logical shift pair");
+
+ // In general, we would need an 'and' for this transform, but
+ // canEvaluateShiftedShift() guarantees that the masked-off bits are not used.
+ // lshr (shl X, C1), C2 --> shl X, C1 - C2
+ // shl (lshr X, C1), C2 --> lshr X, C1 - C2
+ return NewInnerShift(InnerShAmt - OuterShAmt);
+}
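
For the remaining opposite-direction case (inner amount larger than outer), the general form of the fold keeps a mask; a standalone check with assumed amounts C1 = 8 and C2 = 3:

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c1 = 8, c2 = 3;                            // c1 > c2
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        // lshr (shl X, C1), C2 == (X << (C1 - C2)) & (-1 u>> C2)
        assert(((x << c1) >> c2) == ((x << (c1 - c2)) & (~0u >> c2)));
        // shl (lshr X, C1), C2 == (X u>> (C1 - C2)) & (-1 << C2)
        assert(((x >> c1) << c2) == ((x >> (c1 - c2)) & (~0u << c2)));
      }
      return 0;
    }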
+
+/// When canEvaluateShifted() returns true for an expression, this function
+/// inserts the new computation that produces the shifted value.
+static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
InstCombinerImpl &IC, const DataLayout &DL) {
- // We can always evaluate constants shifted.
- if (Constant *C = dyn_cast<Constant>(V)) {
- if (isLeftShift)
- return IC.Builder.CreateShl(C, NumBits);
- else
- return IC.Builder.CreateLShr(C, NumBits);
- }
-
- Instruction *I = cast<Instruction>(V);
+ // We can always evaluate constants shifted.
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ if (isLeftShift)
+ return IC.Builder.CreateShl(C, NumBits);
+ else
+ return IC.Builder.CreateLShr(C, NumBits);
+ }
+
+ Instruction *I = cast<Instruction>(V);
IC.addToWorklist(I);
-
- switch (I->getOpcode()) {
- default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- // Bitwise operators can all be arbitrarily evaluated shifted.
- I->setOperand(
- 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
- I->setOperand(
- 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
- return I;
-
- case Instruction::Shl:
- case Instruction::LShr:
- return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
- IC.Builder);
-
- case Instruction::Select:
- I->setOperand(
- 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
- I->setOperand(
- 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
- return I;
- case Instruction::PHI: {
- // We can change a phi if we can change all operands. Note that we never
- // get into trouble with cyclic PHIs here because we only consider
- // instructions with a single use.
- PHINode *PN = cast<PHINode>(I);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
- isLeftShift, IC, DL));
- return PN;
- }
- }
-}
-
-// If this is a bitwise operator or add with a constant RHS we might be able
-// to pull it through a shift.
-static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
- BinaryOperator *BO) {
- switch (BO->getOpcode()) {
- default:
- return false; // Do not perform transform!
- case Instruction::Add:
- return Shift.getOpcode() == Instruction::Shl;
- case Instruction::Or:
- case Instruction::And:
- return true;
+
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ // Bitwise operators can all be arbitrarily evaluated shifted.
+ I->setOperand(
+ 0, getShiftedValue(I->getOperand(0), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ return I;
+
+ case Instruction::Shl:
+ case Instruction::LShr:
+ return foldShiftedShift(cast<BinaryOperator>(I), NumBits, isLeftShift,
+ IC.Builder);
+
+ case Instruction::Select:
+ I->setOperand(
+ 1, getShiftedValue(I->getOperand(1), NumBits, isLeftShift, IC, DL));
+ I->setOperand(
+ 2, getShiftedValue(I->getOperand(2), NumBits, isLeftShift, IC, DL));
+ return I;
+ case Instruction::PHI: {
+ // We can change a phi if we can change all operands. Note that we never
+ // get into trouble with cyclic PHIs here because we only consider
+ // instructions with a single use.
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ PN->setIncomingValue(i, getShiftedValue(PN->getIncomingValue(i), NumBits,
+ isLeftShift, IC, DL));
+ return PN;
+ }
+ }
+}
+
+// If this is a bitwise operator or add with a constant RHS we might be able
+// to pull it through a shift.
+static bool canShiftBinOpWithConstantRHS(BinaryOperator &Shift,
+ BinaryOperator *BO) {
+ switch (BO->getOpcode()) {
+ default:
+ return false; // Do not perform transform!
+ case Instruction::Add:
+ return Shift.getOpcode() == Instruction::Shl;
+ case Instruction::Or:
+ case Instruction::And:
+ return true;
case Instruction::Xor:
// Do not change a 'not' of logical shift because that would create a normal
// 'xor'. The 'not' is likely better for analysis, SCEV, and codegen.
return !(Shift.isLogicalShift() && match(BO, m_Not(m_Value())));
- }
-}
-
+ }
+}
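
The legality rules above boil down to which operations distribute over which shift direction. A short standalone sketch with an assumed constant and shift amount:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t C = 0x00F0F0F3u;
      const unsigned s = 4;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
        assert(((x + C) << s) == ((x << s) + (C << s)));  // add only through shl
        assert(((x | C) >> s) == ((x >> s) | (C >> s)));  // or through either shift
        assert(((x & C) >> s) == ((x >> s) & (C >> s)));  // and through either shift
      }
      return 0;
    }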
+
Instruction *InstCombinerImpl::FoldShiftByConstant(Value *Op0, Constant *Op1,
BinaryOperator &I) {
- bool isLeftShift = I.getOpcode() == Instruction::Shl;
-
- const APInt *Op1C;
- if (!match(Op1, m_APInt(Op1C)))
- return nullptr;
-
- // See if we can propagate this shift into the input; this covers the trivial
- // case of lshr(shl(x,c1),c2) as well as other more complex cases.
- if (I.getOpcode() != Instruction::AShr &&
- canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
- LLVM_DEBUG(
- dbgs() << "ICE: GetShiftedValue propagating shift through expression"
- " to eliminate shift:\n IN: "
- << *Op0 << "\n SH: " << I << "\n");
-
- return replaceInstUsesWith(
- I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
- }
-
- // See if we can simplify any instructions used by the instruction whose sole
- // purpose is to compute bits we don't care about.
+ bool isLeftShift = I.getOpcode() == Instruction::Shl;
+
+ const APInt *Op1C;
+ if (!match(Op1, m_APInt(Op1C)))
+ return nullptr;
+
+ // See if we can propagate this shift into the input; this covers the trivial
+ // case of lshr(shl(x,c1),c2) as well as other more complex cases.
+ if (I.getOpcode() != Instruction::AShr &&
+ canEvaluateShifted(Op0, Op1C->getZExtValue(), isLeftShift, *this, &I)) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: GetShiftedValue propagating shift through expression"
+ " to eliminate shift:\n IN: "
+ << *Op0 << "\n SH: " << I << "\n");
+
+ return replaceInstUsesWith(
+ I, getShiftedValue(Op0, Op1C->getZExtValue(), isLeftShift, *this, DL));
+ }
+
+ // See if we can simplify any instructions used by the instruction whose sole
+ // purpose is to compute bits we don't care about.
Type *Ty = I.getType();
unsigned TypeBits = Ty->getScalarSizeInBits();
- assert(!Op1C->uge(TypeBits) &&
- "Shift over the type width should have been removed already");
-
- if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
- return FoldedShift;
-
- // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
+ assert(!Op1C->uge(TypeBits) &&
+ "Shift over the type width should have been removed already");
+
+ if (Instruction *FoldedShift = foldBinOpIntoSelectOrPhi(I))
+ return FoldedShift;
+
+ // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
if (auto *TI = dyn_cast<TruncInst>(Op0)) {
- // If 'shift2' is an ashr, we would have to get the sign bit into a funny
- // place. Don't try to do this transformation in this case. Also, we
- // require that the input operand is a shift-by-constant so that we have
- // confidence that the shifts will get folded together. We could do this
- // xform in more cases, but it is unlikely to be profitable.
+ // If 'shift2' is an ashr, we would have to get the sign bit into a funny
+ // place. Don't try to do this transformation in this case. Also, we
+ // require that the input operand is a shift-by-constant so that we have
+ // confidence that the shifts will get folded together. We could do this
+ // xform in more cases, but it is unlikely to be profitable.
const APInt *TrShiftAmt;
if (I.isLogicalShift() &&
match(TI->getOperand(0), m_Shift(m_Value(), m_APInt(TrShiftAmt)))) {
auto *TrOp = cast<Instruction>(TI->getOperand(0));
Type *SrcTy = TrOp->getType();
- // Okay, we'll do this xform. Make the shift of shift.
+ // Okay, we'll do this xform. Make the shift of shift.
Constant *ShAmt = ConstantExpr::getZExt(Op1, SrcTy);
- // (shift2 (shift1 & 0x00FF), c2)
- Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
-
- // For logical shifts, the truncation has the effect of making the high
- // part of the register be zeros. Emulate this by inserting an AND to
- // clear the top bits as needed. This 'and' will usually be zapped by
- // other xforms later if dead.
+ // (shift2 (shift1 & 0x00FF), c2)
+ Value *NSh = Builder.CreateBinOp(I.getOpcode(), TrOp, ShAmt, I.getName());
+
+ // For logical shifts, the truncation has the effect of making the high
+ // part of the register be zeros. Emulate this by inserting an AND to
+ // clear the top bits as needed. This 'and' will usually be zapped by
+ // other xforms later if dead.
unsigned SrcSize = SrcTy->getScalarSizeInBits();
Constant *MaskV =
ConstantInt::get(SrcTy, APInt::getLowBitsSet(SrcSize, TypeBits));
-
- // The mask we constructed says what the trunc would do if occurring
- // between the shifts. We want to know the effect *after* the second
- // shift. We know that it is a logical shift by a constant, so adjust the
- // mask as appropriate.
+
+ // The mask we constructed says what the trunc would do if occurring
+ // between the shifts. We want to know the effect *after* the second
+ // shift. We know that it is a logical shift by a constant, so adjust the
+ // mask as appropriate.
MaskV = ConstantExpr::get(I.getOpcode(), MaskV, ShAmt);
- // shift1 & 0x00FF
+ // shift1 & 0x00FF
Value *And = Builder.CreateAnd(NSh, MaskV, TI->getName());
- // Return the value truncated to the interesting size.
+ // Return the value truncated to the interesting size.
return new TruncInst(And, Ty);
- }
- }
-
- if (Op0->hasOneUse()) {
- if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ }
+ }
+
+ if (Op0->hasOneUse()) {
+ if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
Value *V1;
const APInt *CC;
- switch (Op0BO->getOpcode()) {
- default: break;
- case Instruction::Add:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // These operators commute.
- // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
- match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
- Op0BO->getOperand(1)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ switch (Op0BO->getOpcode()) {
+ default: break;
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // These operators commute.
+ // Turn (Y + (X >> C)) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+ match(Op0BO->getOperand(1), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), YS, V1,
+ Op0BO->getOperand(1)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
- Value *Op0BOOp1 = Op0BO->getOperand(1);
- if (isLeftShift && Op0BOOp1->hasOneUse() &&
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (Y + ((X >> C) & CC)) << C -> ((X & (CC << C)) + (Y << C))
+ Value *Op0BOOp1 = Op0BO->getOperand(1);
+ if (isLeftShift && Op0BOOp1->hasOneUse() &&
match(Op0BOOp1, m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
m_APInt(CC)))) {
Value *YS = // (Y << C)
Builder.CreateShl(Op0BO->getOperand(0), Op1, Op0BO->getName());
- // X & (CC << C)
+ // X & (CC << C)
Value *XM = Builder.CreateAnd(
V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
- }
- LLVM_FALLTHROUGH;
- }
-
- case Instruction::Sub: {
- // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
- m_Specific(Op1)))) {
- Value *YS = // (Y << C)
- Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // (X + (Y << C))
- Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
- Op0BO->getOperand(0)->getName());
- unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
- APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
+ return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
+ }
+ LLVM_FALLTHROUGH;
+ }
+
+ case Instruction::Sub: {
+ // Turn ((X >> C) + Y) << C -> (X + (Y << C)) & (~0 << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0), m_Shr(m_Value(V1),
+ m_Specific(Op1)))) {
+ Value *YS = // (Y << C)
+ Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
+ // (X + (Y << C))
+ Value *X = Builder.CreateBinOp(Op0BO->getOpcode(), V1, YS,
+ Op0BO->getOperand(0)->getName());
+ unsigned Op1Val = Op1C->getLimitedValue(TypeBits);
+ APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(Ty, Bits);
- return BinaryOperator::CreateAnd(X, Mask);
- }
-
- // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
- if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
- match(Op0BO->getOperand(0),
+ return BinaryOperator::CreateAnd(X, Mask);
+ }
+
+ // Turn (((X >> C)&CC) + Y) << C -> (X + (Y << C)) & (CC << C)
+ if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+ match(Op0BO->getOperand(0),
m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
m_APInt(CC)))) {
- Value *YS = // (Y << C)
+ Value *YS = // (Y << C)
Builder.CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
- // X & (CC << C)
+ // X & (CC << C)
Value *XM = Builder.CreateAnd(
V1, ConstantExpr::getShl(ConstantInt::get(Ty, *CC), Op1),
V1->getName() + ".mask");
- return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
- }
-
- break;
- }
- }
-
- // If the operand is a bitwise operator with a constant RHS, and the
- // shift is the only use, we can pull it out of the shift.
- const APInt *Op0C;
- if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
- if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
- NewShift->takeName(Op0BO);
-
- return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
- NewRHS);
- }
- }
-
- // If the operand is a subtract with a constant LHS, and the shift
- // is the only use, we can pull it out of the shift.
- // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2))
- if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub &&
- match(Op0BO->getOperand(0), m_APInt(Op0C))) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(Op0BO->getOperand(0)), Op1);
-
- Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1);
- NewShift->takeName(Op0BO);
-
- return BinaryOperator::CreateSub(NewRHS, NewShift);
- }
- }
-
- // If we have a select that conditionally executes some binary operator,
- // see if we can pull the select and the operator through the shift.
- //
- // For example, turning:
- // shl (select C, (add X, C1), X), C2
- // Into:
- // Y = shl X, C2
- // select C, (add Y, C1 << C2), Y
- Value *Cond;
- BinaryOperator *TBO;
- Value *FalseVal;
- if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
- m_Value(FalseVal)))) {
- const APInt *C;
- if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
- match(TBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, TBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(TBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
- Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewOp, NewShift);
- }
- }
-
- BinaryOperator *FBO;
- Value *TrueVal;
- if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
- m_OneUse(m_BinOp(FBO))))) {
- const APInt *C;
- if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
- match(FBO->getOperand(1), m_APInt(C)) &&
- canShiftBinOpWithConstantRHS(I, FBO)) {
- Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
- cast<Constant>(FBO->getOperand(1)), Op1);
-
- Value *NewShift =
- Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
- Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
- NewRHS);
- return SelectInst::Create(Cond, NewShift, NewOp);
- }
- }
- }
-
- return nullptr;
-}
-
+ return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
+ }
+
+ break;
+ }
+ }
+
+ // If the operand is a bitwise operator with a constant RHS, and the
+ // shift is the only use, we can pull it out of the shift.
+ const APInt *Op0C;
+ if (match(Op0BO->getOperand(1), m_APInt(Op0C))) {
+ if (canShiftBinOpWithConstantRHS(I, Op0BO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(Op0BO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
+ NewRHS);
+ }
+ }
+
+ // If the operand is a subtract with a constant LHS, and the shift
+ // is the only use, we can pull it out of the shift.
+ // This folds (shl (sub C1, X), C2) -> (sub (C1 << C2), (shl X, C2))
+ if (isLeftShift && Op0BO->getOpcode() == Instruction::Sub &&
+ match(Op0BO->getOperand(0), m_APInt(Op0C))) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(Op0BO->getOperand(0)), Op1);
+
+ Value *NewShift = Builder.CreateShl(Op0BO->getOperand(1), Op1);
+ NewShift->takeName(Op0BO);
+
+ return BinaryOperator::CreateSub(NewRHS, NewShift);
+ }
+ }
+
+ // If we have a select that conditionally executes some binary operator,
+ // see if we can pull the select and the operator through the shift.
+ //
+ // For example, turning:
+ // shl (select C, (add X, C1), X), C2
+ // Into:
+ // Y = shl X, C2
+ // select C, (add Y, C1 << C2), Y
+ Value *Cond;
+ BinaryOperator *TBO;
+ Value *FalseVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_OneUse(m_BinOp(TBO)),
+ m_Value(FalseVal)))) {
+ const APInt *C;
+ if (!isa<Constant>(FalseVal) && TBO->getOperand(0) == FalseVal &&
+ match(TBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, TBO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(TBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), FalseVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(TBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewOp, NewShift);
+ }
+ }
+
+ BinaryOperator *FBO;
+ Value *TrueVal;
+ if (match(Op0, m_Select(m_Value(Cond), m_Value(TrueVal),
+ m_OneUse(m_BinOp(FBO))))) {
+ const APInt *C;
+ if (!isa<Constant>(TrueVal) && FBO->getOperand(0) == TrueVal &&
+ match(FBO->getOperand(1), m_APInt(C)) &&
+ canShiftBinOpWithConstantRHS(I, FBO)) {
+ Constant *NewRHS = ConstantExpr::get(I.getOpcode(),
+ cast<Constant>(FBO->getOperand(1)), Op1);
+
+ Value *NewShift =
+ Builder.CreateBinOp(I.getOpcode(), TrueVal, Op1);
+ Value *NewOp = Builder.CreateBinOp(FBO->getOpcode(), NewShift,
+ NewRHS);
+ return SelectInst::Create(Cond, NewShift, NewOp);
+ }
+ }
+ }
+
+ return nullptr;
+}
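
Two of the rewrites performed above, restated as plain unsigned identities and checked in a standalone sketch (shift amounts and constants are illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned c = 5;
      const uint32_t c1 = 1000;
      for (uint32_t x : {0u, 1u, 0x12345678u, 0xFFFFFFFFu})
        for (uint32_t y : {0u, 7u, 0xCAFEBABEu}) {
          // ((X >> C) + Y) << C == (X + (Y << C)) & (~0 << C)
          assert((((x >> c) + y) << c) == ((x + (y << c)) & (~0u << c)));
          // (C1 - X) << C2 == (C1 << C2) - (X << C2)   (sub with constant LHS)
          assert(((c1 - x) << 3) == ((c1 << 3) - (x << 3)));
        }
      return 0;
    }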
+
Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
- const SimplifyQuery Q = SQ.getWithInstruction(&I);
-
- if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
- I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *V = commonShiftTransforms(I))
- return V;
-
- if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder))
- return V;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
-
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
-
- // shl (zext X), ShAmt --> zext (shl X, ShAmt)
- // This is only valid if X would have zeros shifted out.
- Value *X;
- if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) {
- unsigned SrcWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt < SrcWidth &&
- MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
- return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
- }
-
- // (X >> C) << C --> X & (-1 << C)
- if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) {
- APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
- }
-
- const APInt *ShOp1;
+ const SimplifyQuery Q = SQ.getWithInstruction(&I);
+
+ if (Value *V = SimplifyShlInst(I.getOperand(0), I.getOperand(1),
+ I.hasNoSignedWrap(), I.hasNoUnsignedWrap(), Q))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *V = commonShiftTransforms(I))
+ return V;
+
+ if (Instruction *V = dropRedundantMaskingOfLeftShiftInput(&I, Q, Builder))
+ return V;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+
+ // shl (zext X), ShAmt --> zext (shl X, ShAmt)
+ // This is only valid if X would have zeros shifted out.
+ Value *X;
+ if (match(Op0, m_OneUse(m_ZExt(m_Value(X))))) {
+ unsigned SrcWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt < SrcWidth &&
+ MaskedValueIsZero(X, APInt::getHighBitsSet(SrcWidth, ShAmt), 0, &I))
+ return new ZExtInst(Builder.CreateShl(X, ShAmt), Ty);
+ }
+
+ // (X >> C) << C --> X & (-1 << C)
+ if (match(Op0, m_Shr(m_Value(X), m_Specific(Op1)))) {
+ APInt Mask(APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ const APInt *ShOp1;
if (match(Op0, m_Exact(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
ShOp1->ult(BitWidth)) {
- unsigned ShrAmt = ShOp1->getZExtValue();
- if (ShrAmt < ShAmt) {
- // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
- auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
- NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
- NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
- return NewShl;
- }
- if (ShrAmt > ShAmt) {
- // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
- auto *NewShr = BinaryOperator::Create(
- cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
- NewShr->setIsExact(true);
- return NewShr;
- }
- }
-
+ unsigned ShrAmt = ShOp1->getZExtValue();
+ if (ShrAmt < ShAmt) {
+ // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
+ NewShl->setHasNoSignedWrap(I.hasNoSignedWrap());
+ return NewShl;
+ }
+ if (ShrAmt > ShAmt) {
+ // If C1 > C2: (X >>?exact C1) << C2 --> X >>?exact (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShrAmt - ShAmt);
+ auto *NewShr = BinaryOperator::Create(
+ cast<BinaryOperator>(Op0)->getOpcode(), X, ShiftDiff);
+ NewShr->setIsExact(true);
+ return NewShr;
+ }
+ }
+
if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_APInt(ShOp1)))) &&
ShOp1->ult(BitWidth)) {
unsigned ShrAmt = ShOp1->getZExtValue();
@@ -977,354 +977,354 @@ Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
}
if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized shifts are simplified to zero in InstSimplify.
- if (AmtSum < BitWidth)
- // (X << C1) << C2 --> X << (C1 + C2)
- return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
- }
-
- // If the shifted-out value is known-zero, then this is a NUW shift.
- if (!I.hasNoUnsignedWrap() &&
- MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setHasNoUnsignedWrap();
- return &I;
- }
-
- // If the shifted-out value is all signbits, then this is a NSW shift.
- if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
- I.setHasNoSignedWrap();
- return &I;
- }
- }
-
- // Transform (x >> y) << y to x & (-1 << y)
- // Valid for any type of right-shift.
- Value *X;
- if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
- Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
- Value *Mask = Builder.CreateShl(AllOnes, Op1);
- return BinaryOperator::CreateAnd(Mask, X);
- }
-
- Constant *C1;
- if (match(Op1, m_Constant(C1))) {
- Constant *C2;
- Value *X;
- // (C2 << X) << C1 --> (C2 << C1) << X
- if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
- return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
-
- // (X * C2) << C1 --> X * (C2 << C1)
- if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
- return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
-
- // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
- if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1);
- return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
- }
- }
-
- // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1
- if (match(Op0, m_One()) &&
- match(Op1, m_Sub(m_SpecificInt(BitWidth - 1), m_Value(X))))
- return BinaryOperator::CreateLShr(
- ConstantInt::get(Ty, APInt::getSignMask(BitWidth)), X);
-
- return nullptr;
-}
-
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X << C1) << C2 --> X << (C1 + C2)
+ return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is a NUW shift.
+ if (!I.hasNoUnsignedWrap() &&
+ MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setHasNoUnsignedWrap();
+ return &I;
+ }
+
+ // If the shifted-out value is all signbits, then this is a NSW shift.
+ if (!I.hasNoSignedWrap() && ComputeNumSignBits(Op0, 0, &I) > ShAmt) {
+ I.setHasNoSignedWrap();
+ return &I;
+ }
+ }
+
+ // Transform (x >> y) << y to x & (-1 << y)
+ // Valid for any type of right-shift.
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shr(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateShl(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
+ Constant *C1;
+ if (match(Op1, m_Constant(C1))) {
+ Constant *C2;
+ Value *X;
+ // (C2 << X) << C1 --> (C2 << C1) << X
+ if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+ return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+ // (X * C2) << C1 --> X * (C2 << C1)
+ if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+ return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+
+ // shl (zext i1 X), C1 --> select (X, 1 << C1, 0)
+ if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
+ auto *NewC = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C1);
+ return SelectInst::Create(X, NewC, ConstantInt::getNullValue(Ty));
+ }
+ }
+
+ // (1 << (C - x)) -> ((1 << C) >> x) if C is bitwidth - 1
+ if (match(Op0, m_One()) &&
+ match(Op1, m_Sub(m_SpecificInt(BitWidth - 1), m_Value(X))))
+ return BinaryOperator::CreateLShr(
+ ConstantInt::get(Ty, APInt::getSignMask(BitWidth)), X);
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitLShr(BinaryOperator &I) {
- if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = commonShiftTransforms(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt))) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- auto *II = dyn_cast<IntrinsicInst>(Op0);
- if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
- (II->getIntrinsicID() == Intrinsic::ctlz ||
- II->getIntrinsicID() == Intrinsic::cttz ||
- II->getIntrinsicID() == Intrinsic::ctpop)) {
- // ctlz.i32(x)>>5 --> zext(x == 0)
- // cttz.i32(x)>>5 --> zext(x == 0)
- // ctpop.i32(x)>>5 --> zext(x == -1)
- bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
- Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
- Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
- return new ZExtInst(Cmp, Ty);
- }
-
- Value *X;
- const APInt *ShOp1;
- if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
- if (ShOp1->ult(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
- if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
- auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
- NewLShr->setIsExact(I.isExact());
- return NewLShr;
- }
- // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
- Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
- }
- if (ShOp1->ugt(ShAmt)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
- if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
- // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
- auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
- NewShl->setHasNoUnsignedWrap(true);
- return NewShl;
- }
- // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2)
- Value *NewShl = Builder.CreateShl(X, ShiftDiff);
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
- }
- assert(*ShOp1 == ShAmt);
- // (X << C) >>u C --> X & (-1 >>u C)
- APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
- return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
- }
-
- if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- assert(ShAmt < X->getType()->getScalarSizeInBits() &&
- "Big shift not simplified to zero?");
- // lshr (zext iM X to iN), C --> zext (lshr X, C) to iN
- Value *NewLShr = Builder.CreateLShr(X, ShAmt);
- return new ZExtInst(NewLShr, Ty);
- }
-
- if (match(Op0, m_SExt(m_Value(X))) &&
- (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
- // Are we moving the sign bit to the low bit and widening with high zeros?
- unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
- if (ShAmt == BitWidth - 1) {
- // lshr (sext i1 X to iN), N-1 --> zext X to iN
- if (SrcTyBitWidth == 1)
- return new ZExtInst(X, Ty);
-
- // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
- if (Op0->hasOneUse()) {
- Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
- return new ZExtInst(NewLShr, Ty);
- }
- }
-
- // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
- if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
- // The new shift amount can't be more than the narrow source type.
- unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
- Value *AShr = Builder.CreateAShr(X, NewShAmt);
- return new ZExtInst(AShr, Ty);
- }
- }
-
+ if (Value *V = SimplifyLShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt))) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ auto *II = dyn_cast<IntrinsicInst>(Op0);
+ if (II && isPowerOf2_32(BitWidth) && Log2_32(BitWidth) == ShAmt &&
+ (II->getIntrinsicID() == Intrinsic::ctlz ||
+ II->getIntrinsicID() == Intrinsic::cttz ||
+ II->getIntrinsicID() == Intrinsic::ctpop)) {
+ // ctlz.i32(x)>>5 --> zext(x == 0)
+ // cttz.i32(x)>>5 --> zext(x == 0)
+ // ctpop.i32(x)>>5 --> zext(x == -1)
+ bool IsPop = II->getIntrinsicID() == Intrinsic::ctpop;
+ Constant *RHS = ConstantInt::getSigned(Ty, IsPop ? -1 : 0);
+ Value *Cmp = Builder.CreateICmpEQ(II->getArgOperand(0), RHS);
+ return new ZExtInst(Cmp, Ty);
+ }
+
+ Value *X;
+ const APInt *ShOp1;
+ if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1))) && ShOp1->ult(BitWidth)) {
+ if (ShOp1->ult(ShAmt)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X >>u (C2 - C1)
+ auto *NewLShr = BinaryOperator::CreateLShr(X, ShiftDiff);
+ NewLShr->setIsExact(I.isExact());
+ return NewLShr;
+ }
+ // (X << C1) >>u C2 --> (X >>u (C2 - C1)) & (-1 >> C2)
+ Value *NewLShr = Builder.CreateLShr(X, ShiftDiff, "", I.isExact());
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewLShr, ConstantInt::get(Ty, Mask));
+ }
+ if (ShOp1->ugt(ShAmt)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
+ // (X <<nuw C1) >>u C2 --> X <<nuw (C1 - C2)
+ auto *NewShl = BinaryOperator::CreateShl(X, ShiftDiff);
+ NewShl->setHasNoUnsignedWrap(true);
+ return NewShl;
+ }
+ // (X << C1) >>u C2 --> X << (C1 - C2) & (-1 >> C2)
+ Value *NewShl = Builder.CreateShl(X, ShiftDiff);
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(NewShl, ConstantInt::get(Ty, Mask));
+ }
+ assert(*ShOp1 == ShAmt);
+ // (X << C) >>u C --> X & (-1 >>u C)
+ APInt Mask(APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt));
+ return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
+ }
+
+ if (match(Op0, m_OneUse(m_ZExt(m_Value(X)))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ assert(ShAmt < X->getType()->getScalarSizeInBits() &&
+ "Big shift not simplified to zero?");
+ // lshr (zext iM X to iN), C --> zext (lshr X, C) to iN
+ Value *NewLShr = Builder.CreateLShr(X, ShAmt);
+ return new ZExtInst(NewLShr, Ty);
+ }
+
+ if (match(Op0, m_SExt(m_Value(X))) &&
+ (!Ty->isIntegerTy() || shouldChangeType(Ty, X->getType()))) {
+ // Are we moving the sign bit to the low bit and widening with high zeros?
+ unsigned SrcTyBitWidth = X->getType()->getScalarSizeInBits();
+ if (ShAmt == BitWidth - 1) {
+ // lshr (sext i1 X to iN), N-1 --> zext X to iN
+ if (SrcTyBitWidth == 1)
+ return new ZExtInst(X, Ty);
+
+ // lshr (sext iM X to iN), N-1 --> zext (lshr X, M-1) to iN
+ if (Op0->hasOneUse()) {
+ Value *NewLShr = Builder.CreateLShr(X, SrcTyBitWidth - 1);
+ return new ZExtInst(NewLShr, Ty);
+ }
+ }
+
+ // lshr (sext iM X to iN), N-M --> zext (ashr X, min(N-M, M-1)) to iN
+ if (ShAmt == BitWidth - SrcTyBitWidth && Op0->hasOneUse()) {
+ // The new shift amount can't be more than the narrow source type.
+ unsigned NewShAmt = std::min(ShAmt, SrcTyBitWidth - 1);
+ Value *AShr = Builder.CreateAShr(X, NewShAmt);
+ return new ZExtInst(AShr, Ty);
+ }
+ }
+
// lshr i32 (X -nsw Y), 31 --> zext (X < Y)
Value *Y;
if (ShAmt == BitWidth - 1 &&
match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
return new ZExtInst(Builder.CreateICmpSLT(X, Y), Ty);
- if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized shifts are simplified to zero in InstSimplify.
- if (AmtSum < BitWidth)
- // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
- return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
- }
-
- // If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setIsExact();
- return &I;
- }
- }
-
- // Transform (x << y) >> y to x & (-1 >> y)
- Value *X;
- if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
- Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
- Value *Mask = Builder.CreateLShr(AllOnes, Op1);
- return BinaryOperator::CreateAnd(Mask, X);
- }
-
- return nullptr;
-}
-
-Instruction *
+ if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized shifts are simplified to zero in InstSimplify.
+ if (AmtSum < BitWidth)
+ // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+ return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+
+ // Transform (x << y) >> y to x & (-1 >> y)
+ Value *X;
+ if (match(Op0, m_OneUse(m_Shl(m_Value(X), m_Specific(Op1))))) {
+ Constant *AllOnes = ConstantInt::getAllOnesValue(Ty);
+ Value *Mask = Builder.CreateLShr(AllOnes, Op1);
+ return BinaryOperator::CreateAnd(Mask, X);
+ }
+
+ return nullptr;
+}
+
+Instruction *
InstCombinerImpl::foldVariableSignZeroExtensionOfVariableHighBitExtract(
- BinaryOperator &OldAShr) {
- assert(OldAShr.getOpcode() == Instruction::AShr &&
- "Must be called with arithmetic right-shift instruction only.");
-
- // Check that constant C is a splat of the element-wise bitwidth of V.
- auto BitWidthSplat = [](Constant *C, Value *V) {
- return match(
- C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
- APInt(C->getType()->getScalarSizeInBits(),
- V->getType()->getScalarSizeInBits())));
- };
-
- // It should look like variable-length sign-extension on the outside:
- // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits)
- Value *NBits;
- Instruction *MaybeTrunc;
- Constant *C1, *C2;
- if (!match(&OldAShr,
- m_AShr(m_Shl(m_Instruction(MaybeTrunc),
- m_ZExtOrSelf(m_Sub(m_Constant(C1),
- m_ZExtOrSelf(m_Value(NBits))))),
- m_ZExtOrSelf(m_Sub(m_Constant(C2),
- m_ZExtOrSelf(m_Deferred(NBits)))))) ||
- !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr))
- return nullptr;
-
- // There may or may not be a truncation after outer two shifts.
- Instruction *HighBitExtract;
- match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract)));
- bool HadTrunc = MaybeTrunc != HighBitExtract;
-
- // And finally, the innermost part of the pattern must be a right-shift.
- Value *X, *NumLowBitsToSkip;
- if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip))))
- return nullptr;
-
- // Said right-shift must extract high NBits bits - C0 must be its bitwidth.
- Constant *C0;
- if (!match(NumLowBitsToSkip,
- m_ZExtOrSelf(
- m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) ||
- !BitWidthSplat(C0, HighBitExtract))
- return nullptr;
-
- // Since the NBits is identical for all shifts, if the outermost and
- // innermost shifts are identical, then outermost shifts are redundant.
- // If we had truncation, do keep it though.
- if (HighBitExtract->getOpcode() == OldAShr.getOpcode())
- return replaceInstUsesWith(OldAShr, MaybeTrunc);
-
- // Else, if there was a truncation, then we need to ensure that one
- // instruction will go away.
- if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
- return nullptr;
-
- // Finally, bypass two innermost shifts, and perform the outermost shift on
- // the operands of the innermost shift.
- Instruction *NewAShr =
- BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip);
- NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness.
- if (!HadTrunc)
- return NewAShr;
-
- Builder.Insert(NewAShr);
- return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType());
-}
-
+ BinaryOperator &OldAShr) {
+ assert(OldAShr.getOpcode() == Instruction::AShr &&
+ "Must be called with arithmetic right-shift instruction only.");
+
+ // Check that constant C is a splat of the element-wise bitwidth of V.
+ auto BitWidthSplat = [](Constant *C, Value *V) {
+ return match(
+ C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ,
+ APInt(C->getType()->getScalarSizeInBits(),
+ V->getType()->getScalarSizeInBits())));
+ };
+
+ // It should look like variable-length sign-extension on the outside:
+ // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits)
+ Value *NBits;
+ Instruction *MaybeTrunc;
+ Constant *C1, *C2;
+ if (!match(&OldAShr,
+ m_AShr(m_Shl(m_Instruction(MaybeTrunc),
+ m_ZExtOrSelf(m_Sub(m_Constant(C1),
+ m_ZExtOrSelf(m_Value(NBits))))),
+ m_ZExtOrSelf(m_Sub(m_Constant(C2),
+ m_ZExtOrSelf(m_Deferred(NBits)))))) ||
+ !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr))
+ return nullptr;
+
+ // There may or may not be a truncation after outer two shifts.
+ Instruction *HighBitExtract;
+ match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract)));
+ bool HadTrunc = MaybeTrunc != HighBitExtract;
+
+ // And finally, the innermost part of the pattern must be a right-shift.
+ Value *X, *NumLowBitsToSkip;
+ if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip))))
+ return nullptr;
+
+ // Said right-shift must extract high NBits bits - C0 must be its bitwidth.
+ Constant *C0;
+ if (!match(NumLowBitsToSkip,
+ m_ZExtOrSelf(
+ m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) ||
+ !BitWidthSplat(C0, HighBitExtract))
+ return nullptr;
+
+ // Since the NBits is identical for all shifts, if the outermost and
+ // innermost shifts are identical, then outermost shifts are redundant.
+ // If we had truncation, do keep it though.
+ if (HighBitExtract->getOpcode() == OldAShr.getOpcode())
+ return replaceInstUsesWith(OldAShr, MaybeTrunc);
+
+ // Else, if there was a truncation, then we need to ensure that one
+ // instruction will go away.
+ if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value())))
+ return nullptr;
+
+ // Finally, bypass two innermost shifts, and perform the outermost shift on
+ // the operands of the innermost shift.
+ Instruction *NewAShr =
+ BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip);
+ NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness.
+ if (!HadTrunc)
+ return NewAShr;
+
+ Builder.Insert(NewAShr);
+ return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType());
+}
+
Instruction *InstCombinerImpl::visitAShr(BinaryOperator &I) {
- if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
- SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
- if (Instruction *X = foldVectorBinop(I))
- return X;
-
- if (Instruction *R = commonShiftTransforms(I))
- return R;
-
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- Type *Ty = I.getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- const APInt *ShAmtAPInt;
- if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
- unsigned ShAmt = ShAmtAPInt->getZExtValue();
-
- // If the shift amount equals the difference in width of the destination
- // and source scalar types:
- // ashr (shl (zext X), C), C --> sext X
- Value *X;
- if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
- ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
- return new SExtInst(X, Ty);
-
- // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
- // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
- const APInt *ShOp1;
- if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
- ShOp1->ult(BitWidth)) {
- unsigned ShlAmt = ShOp1->getZExtValue();
- if (ShlAmt < ShAmt) {
- // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
- auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
- NewAShr->setIsExact(I.isExact());
- return NewAShr;
- }
- if (ShlAmt > ShAmt) {
- // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
- Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
- auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
- NewShl->setHasNoSignedWrap(true);
- return NewShl;
- }
- }
-
- if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
- ShOp1->ult(BitWidth)) {
- unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
- // Oversized arithmetic shifts replicate the sign bit.
- AmtSum = std::min(AmtSum, BitWidth - 1);
- // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
- return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
- }
-
- if (match(Op0, m_OneUse(m_SExt(m_Value(X)))) &&
- (Ty->isVectorTy() || shouldChangeType(Ty, X->getType()))) {
- // ashr (sext X), C --> sext (ashr X, C')
- Type *SrcTy = X->getType();
- ShAmt = std::min(ShAmt, SrcTy->getScalarSizeInBits() - 1);
- Value *NewSh = Builder.CreateAShr(X, ConstantInt::get(SrcTy, ShAmt));
- return new SExtInst(NewSh, Ty);
- }
-
+ if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(),
+ SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
+ if (Instruction *X = foldVectorBinop(I))
+ return X;
+
+ if (Instruction *R = commonShiftTransforms(I))
+ return R;
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Type *Ty = I.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ const APInt *ShAmtAPInt;
+ if (match(Op1, m_APInt(ShAmtAPInt)) && ShAmtAPInt->ult(BitWidth)) {
+ unsigned ShAmt = ShAmtAPInt->getZExtValue();
+
+ // If the shift amount equals the difference in width of the destination
+ // and source scalar types:
+ // ashr (shl (zext X), C), C --> sext X
+ Value *X;
+ if (match(Op0, m_Shl(m_ZExt(m_Value(X)), m_Specific(Op1))) &&
+ ShAmt == BitWidth - X->getType()->getScalarSizeInBits())
+ return new SExtInst(X, Ty);
+
+ // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
+ // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
+ const APInt *ShOp1;
+ if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
+ unsigned ShlAmt = ShOp1->getZExtValue();
+ if (ShlAmt < ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
+ auto *NewAShr = BinaryOperator::CreateAShr(X, ShiftDiff);
+ NewAShr->setIsExact(I.isExact());
+ return NewAShr;
+ }
+ if (ShlAmt > ShAmt) {
+ // (X <<nsw C1) >>s C2 --> X <<nsw (C1 - C2)
+ Constant *ShiftDiff = ConstantInt::get(Ty, ShlAmt - ShAmt);
+ auto *NewShl = BinaryOperator::Create(Instruction::Shl, X, ShiftDiff);
+ NewShl->setHasNoSignedWrap(true);
+ return NewShl;
+ }
+ }
+
+ if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1))) &&
+ ShOp1->ult(BitWidth)) {
+ unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+ // Oversized arithmetic shifts replicate the sign bit.
+ AmtSum = std::min(AmtSum, BitWidth - 1);
+ // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
+ return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+ }
+
+ if (match(Op0, m_OneUse(m_SExt(m_Value(X)))) &&
+ (Ty->isVectorTy() || shouldChangeType(Ty, X->getType()))) {
+ // ashr (sext X), C --> sext (ashr X, C')
+ Type *SrcTy = X->getType();
+ ShAmt = std::min(ShAmt, SrcTy->getScalarSizeInBits() - 1);
+ Value *NewSh = Builder.CreateAShr(X, ConstantInt::get(SrcTy, ShAmt));
+ return new SExtInst(NewSh, Ty);
+ }
+
// ashr i32 (X -nsw Y), 31 --> sext (X < Y)
Value *Y;
if (ShAmt == BitWidth - 1 &&
match(Op0, m_OneUse(m_NSWSub(m_Value(X), m_Value(Y)))))
return new SExtInst(Builder.CreateICmpSLT(X, Y), Ty);
- // If the shifted-out value is known-zero, then this is an exact shift.
- if (!I.isExact() &&
- MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
- I.setIsExact();
- return &I;
- }
- }
-
- if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
- return R;
-
- // See if we can turn a signed shr into an unsigned shr.
- if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
- return BinaryOperator::CreateLShr(Op0, Op1);
-
- return nullptr;
-}
+ // If the shifted-out value is known-zero, then this is an exact shift.
+ if (!I.isExact() &&
+ MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
+ I.setIsExact();
+ return &I;
+ }
+ }
+
+ if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I))
+ return R;
+
+ // See if we can turn a signed shr into an unsigned shr.
+ if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I))
+ return BinaryOperator::CreateLShr(Op0, Op1);
+
+ return nullptr;
+}
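Two of the right-shift identities used in visitLShr above, checked with plain
integers. This is a standalone illustration under the same assumptions the
comments state (shift amounts below the bit width, and a subtraction that does
not wrap), not code from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    // (X >>u C1) >>u C2  ==  X >>u (C1 + C2)   while C1 + C2 < bit width.
    uint32_t U = 0xdeadbeefu;
    assert(((U >> 5) >> 7) == (U >> 12));

    // lshr (X -nsw Y), 31  -->  zext(X < Y): for a 32-bit subtraction that
    // does not overflow, the sign bit of X - Y is exactly the result of X < Y.
    int32_t X = 17, Y = 42;
    assert(((uint32_t)(X - Y) >> 31) == (uint32_t)(X < Y));
    return 0;
  }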
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 7aaa36f730..16efe86377 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1,264 +1,264 @@
-//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains logic for simplifying instructions based on information
-// about how they are used.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
+//===- InstCombineSimplifyDemanded.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains logic for simplifying instructions based on information
+// about how they are used.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/KnownBits.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
-/// Check to see if the specified operand of the specified instruction is a
-/// constant integer. If so, check to see if there are any bits set in the
-/// constant that are not demanded. If so, shrink the constant and return true.
-static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
- const APInt &Demanded) {
- assert(I && "No instruction?");
- assert(OpNo < I->getNumOperands() && "Operand index too large");
-
- // The operand must be a constant integer or splat integer.
- Value *Op = I->getOperand(OpNo);
- const APInt *C;
- if (!match(Op, m_APInt(C)))
- return false;
-
- // If there are no bits set that aren't demanded, nothing to do.
- if (C->isSubsetOf(Demanded))
- return false;
-
- // This instruction is producing bits that are not demanded. Shrink the RHS.
- I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
-
- return true;
-}
-
-
-
-/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
-/// the instruction has any properties that allow us to simplify its operands.
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
+/// Check to see if the specified operand of the specified instruction is a
+/// constant integer. If so, check to see if there are any bits set in the
+/// constant that are not demanded. If so, shrink the constant and return true.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
+ const APInt &Demanded) {
+ assert(I && "No instruction?");
+ assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+ // The operand must be a constant integer or splat integer.
+ Value *Op = I->getOperand(OpNo);
+ const APInt *C;
+ if (!match(Op, m_APInt(C)))
+ return false;
+
+ // If there are no bits set that aren't demanded, nothing to do.
+ if (C->isSubsetOf(Demanded))
+ return false;
+
+ // This instruction is producing bits that are not demanded. Shrink the RHS.
+ I->setOperand(OpNo, ConstantInt::get(Op->getType(), *C & Demanded));
+
+ return true;
+}
+
+
+
+/// Inst is an integer instruction that SimplifyDemandedBits knows about. See if
+/// the instruction has any properties that allow us to simplify its operands.
bool InstCombinerImpl::SimplifyDemandedInstructionBits(Instruction &Inst) {
- unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
- KnownBits Known(BitWidth);
- APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
-
- Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
- 0, &Inst);
- if (!V) return false;
- if (V == &Inst) return true;
- replaceInstUsesWith(Inst, V);
- return true;
-}
-
-/// This form of SimplifyDemandedBits simplifies the specified instruction
-/// operand if possible, updating it in place. It returns true if it made any
-/// change and false otherwise.
+ unsigned BitWidth = Inst.getType()->getScalarSizeInBits();
+ KnownBits Known(BitWidth);
+ APInt DemandedMask(APInt::getAllOnesValue(BitWidth));
+
+ Value *V = SimplifyDemandedUseBits(&Inst, DemandedMask, Known,
+ 0, &Inst);
+ if (!V) return false;
+ if (V == &Inst) return true;
+ replaceInstUsesWith(Inst, V);
+ return true;
+}
+
+/// This form of SimplifyDemandedBits simplifies the specified instruction
+/// operand if possible, updating it in place. It returns true if it made any
+/// change and false otherwise.
bool InstCombinerImpl::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
const APInt &DemandedMask,
KnownBits &Known, unsigned Depth) {
- Use &U = I->getOperandUse(OpNo);
- Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
- Depth, I);
- if (!NewVal) return false;
- if (Instruction* OpInst = dyn_cast<Instruction>(U))
- salvageDebugInfo(*OpInst);
-
- replaceUse(U, NewVal);
- return true;
-}
-
-/// This function attempts to replace V with a simpler value based on the
-/// demanded bits. When this function is called, it is known that only the bits
-/// set in DemandedMask of the result of V are ever used downstream.
-/// Consequently, depending on the mask and V, it may be possible to replace V
-/// with a constant or one of its operands. In such cases, this function does
-/// the replacement and returns true. In all other cases, it returns false after
-/// analyzing the expression and setting KnownOne and known to be one in the
-/// expression. Known.Zero contains all the bits that are known to be zero in
-/// the expression. These are provided to potentially allow the caller (which
-/// might recursively be SimplifyDemandedBits itself) to simplify the
-/// expression.
-/// Known.One and Known.Zero always follow the invariant that:
-/// Known.One & Known.Zero == 0.
-/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
-/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
-/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
-/// be the same.
-///
-/// This returns null if it did not change anything and it permits no
-/// simplification. This returns V itself if it did some simplification of V's
-/// operands based on the information about what bits are demanded. This returns
-/// some other non-null value if it found out that V is equal to another value
-/// in the context where the specified bits are demanded, but not for all users.
+ Use &U = I->getOperandUse(OpNo);
+ Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
+ Depth, I);
+ if (!NewVal) return false;
+ if (Instruction* OpInst = dyn_cast<Instruction>(U))
+ salvageDebugInfo(*OpInst);
+
+ replaceUse(U, NewVal);
+ return true;
+}
+
+/// This function attempts to replace V with a simpler value based on the
+/// demanded bits. When this function is called, it is known that only the bits
+/// set in DemandedMask of the result of V are ever used downstream.
+/// Consequently, depending on the mask and V, it may be possible to replace V
+/// with a constant or one of its operands. In such cases, this function does
+/// the replacement and returns true. In all other cases, it returns false after
+/// analyzing the expression and setting KnownOne and known to be one in the
+/// expression. Known.Zero contains all the bits that are known to be zero in
+/// the expression. These are provided to potentially allow the caller (which
+/// might recursively be SimplifyDemandedBits itself) to simplify the
+/// expression.
+/// Known.One and Known.Zero always follow the invariant that:
+/// Known.One & Known.Zero == 0.
+/// That is, a bit can't be both 1 and 0. Note that the bits in Known.One and
+/// Known.Zero may only be accurate for those bits set in DemandedMask. Note
+/// also that the bitwidth of V, DemandedMask, Known.Zero and Known.One must all
+/// be the same.
+///
+/// This returns null if it did not change anything and it permits no
+/// simplification. This returns V itself if it did some simplification of V's
+/// operands based on the information about what bits are demanded. This returns
+/// some other non-null value if it found out that V is equal to another value
+/// in the context where the specified bits are demanded, but not for all users.
Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
KnownBits &Known,
unsigned Depth,
Instruction *CxtI) {
- assert(V != nullptr && "Null pointer of Value???");
+ assert(V != nullptr && "Null pointer of Value???");
assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
- uint32_t BitWidth = DemandedMask.getBitWidth();
- Type *VTy = V->getType();
- assert(
- (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
- Known.getBitWidth() == BitWidth &&
- "Value *V, DemandedMask and Known must have same BitWidth");
-
- if (isa<Constant>(V)) {
- computeKnownBits(V, Known, Depth, CxtI);
- return nullptr;
- }
-
- Known.resetAll();
- if (DemandedMask.isNullValue()) // Not demanding any bits from V.
- return UndefValue::get(VTy);
-
+ uint32_t BitWidth = DemandedMask.getBitWidth();
+ Type *VTy = V->getType();
+ assert(
+ (!VTy->isIntOrIntVectorTy() || VTy->getScalarSizeInBits() == BitWidth) &&
+ Known.getBitWidth() == BitWidth &&
+ "Value *V, DemandedMask and Known must have same BitWidth");
+
+ if (isa<Constant>(V)) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr;
+ }
+
+ Known.resetAll();
+ if (DemandedMask.isNullValue()) // Not demanding any bits from V.
+ return UndefValue::get(VTy);
+
if (Depth == MaxAnalysisRecursionDepth)
- return nullptr;
-
+ return nullptr;
+
if (isa<ScalableVectorType>(VTy))
return nullptr;
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- computeKnownBits(V, Known, Depth, CxtI);
- return nullptr; // Only analyze instructions.
- }
-
- // If there are multiple uses of this value and we aren't at the root, then
- // we can't do any simplifications of the operands, because DemandedMask
- // only reflects the bits demanded by *one* of the users.
- if (Depth != 0 && !I->hasOneUse())
- return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
-
- KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
-
- // If this is the root being simplified, allow it to have multiple uses,
- // just set the DemandedMask to all bits so that we can try to simplify the
- // operands. This allows visitTruncInst (for example) to simplify the
- // operand of a trunc without duplicating all the logic below.
- if (Depth == 0 && !V->hasOneUse())
- DemandedMask.setAllBits();
-
- switch (I->getOpcode()) {
- default:
- computeKnownBits(I, Known, Depth, CxtI);
- break;
- case Instruction::And: {
- // If either the LHS or the RHS are Zero, the result is zero.
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
- Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown & RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and'.
- if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
- return I->getOperand(1);
-
- // If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
- return I;
-
- break;
- }
- case Instruction::Or: {
- // If either the LHS or the RHS are One, the result is One.
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
- Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown | RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the other.
- // These bits cannot contribute to the result of the 'or'.
- if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
- return I->getOperand(1);
-
- // If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(I, 1, DemandedMask))
- return I;
-
- break;
- }
- case Instruction::Xor: {
- if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- Known = LHSKnown ^ RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the other.
- // These bits cannot contribute to the result of the 'xor'.
- if (DemandedMask.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- // If all of the demanded bits are known to be zero on one side or the
- // other, turn this into an *inclusive* or.
- // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
- Instruction *Or =
- BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
- I->getName());
- return InsertNewInstWith(Or, *I);
- }
-
- // If all of the demanded bits on one side are known, and all of the set
- // bits on that side are also known to be set on the other side, turn this
- // into an AND, as we know the bits will be cleared.
- // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
- if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
- RHSKnown.One.isSubsetOf(LHSKnown.One)) {
- Constant *AndC = Constant::getIntegerValue(VTy,
- ~RHSKnown.One & DemandedMask);
- Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
- return InsertNewInstWith(And, *I);
- }
-
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ computeKnownBits(V, Known, Depth, CxtI);
+ return nullptr; // Only analyze instructions.
+ }
+
+ // If there are multiple uses of this value and we aren't at the root, then
+ // we can't do any simplifications of the operands, because DemandedMask
+ // only reflects the bits demanded by *one* of the users.
+ if (Depth != 0 && !I->hasOneUse())
+ return SimplifyMultipleUseDemandedBits(I, DemandedMask, Known, Depth, CxtI);
+
+ KnownBits LHSKnown(BitWidth), RHSKnown(BitWidth);
+
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the DemandedMask to all bits so that we can try to simplify the
+ // operands. This allows visitTruncInst (for example) to simplify the
+ // operand of a trunc without duplicating all the logic below.
+ if (Depth == 0 && !V->hasOneUse())
+ DemandedMask.setAllBits();
+
+ switch (I->getOpcode()) {
+ default:
+ computeKnownBits(I, Known, Depth, CxtI);
+ break;
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.Zero, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown & RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
+ return I;
+
+ break;
+ }
+ case Instruction::Or: {
+ // If either the LHS or the RHS are One, the result is One.
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnown.One, LHSKnown,
+ Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown | RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If the RHS is a constant, see if we can simplify it.
+ if (ShrinkDemandedConstant(I, 1, DemandedMask))
+ return I;
+
+ break;
+ }
+ case Instruction::Xor: {
+ if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 0, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ Known = LHSKnown ^ RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // If all of the demanded bits are known to be zero on one side or the
+ // other, turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) {
+ Instruction *Or =
+ BinaryOperator::CreateOr(I->getOperand(0), I->getOperand(1),
+ I->getName());
+ return InsertNewInstWith(Or, *I);
+ }
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero|RHSKnown.One) &&
+ RHSKnown.One.isSubsetOf(LHSKnown.One)) {
+ Constant *AndC = Constant::getIntegerValue(VTy,
+ ~RHSKnown.One & DemandedMask);
+ Instruction *And = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ return InsertNewInstWith(And, *I);
+ }
+
// If the RHS is a constant, see if we can change it. Don't alter a -1
// constant because that's a canonical 'not' op, and that is better for
// combining, SCEV, and codegen.
@@ -273,636 +273,636 @@ Value *InstCombinerImpl::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
}
-
- // If our LHS is an 'and' and if it has one use, and if any of the bits we
- // are flipping are known to be set, then the xor is just resetting those
- // bits to zero. We can just knock out bits from the 'and' and the 'xor',
- // simplifying both of them.
+
+ // If our LHS is an 'and' and if it has one use, and if any of the bits we
+ // are flipping are known to be set, then the xor is just resetting those
+ // bits to zero. We can just knock out bits from the 'and' and the 'xor',
+ // simplifying both of them.
if (Instruction *LHSInst = dyn_cast<Instruction>(I->getOperand(0))) {
ConstantInt *AndRHS, *XorRHS;
- if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
+ if (LHSInst->getOpcode() == Instruction::And && LHSInst->hasOneUse() &&
match(I->getOperand(1), m_ConstantInt(XorRHS)) &&
match(LHSInst->getOperand(1), m_ConstantInt(AndRHS)) &&
- (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
- APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
-
- Constant *AndC =
+ (LHSKnown.One & RHSKnown.One & DemandedMask) != 0) {
+ APInt NewMask = ~(LHSKnown.One & RHSKnown.One & DemandedMask);
+
+ Constant *AndC =
ConstantInt::get(I->getType(), NewMask & AndRHS->getValue());
- Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
- InsertNewInstWith(NewAnd, *I);
-
- Constant *XorC =
+ Instruction *NewAnd = BinaryOperator::CreateAnd(I->getOperand(0), AndC);
+ InsertNewInstWith(NewAnd, *I);
+
+ Constant *XorC =
ConstantInt::get(I->getType(), NewMask & XorRHS->getValue());
- Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
- return InsertNewInstWith(NewXor, *I);
- }
+ Instruction *NewXor = BinaryOperator::CreateXor(NewAnd, XorC);
+ return InsertNewInstWith(NewXor, *I);
+ }
}
- break;
- }
- case Instruction::Select: {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
- if (SPF == SPF_UMAX) {
- // UMax(A, C) == A if ...
- // The lowest non-zero bit of DemandMask is higher than the highest
- // non-zero bit of C.
- const APInt *C;
- unsigned CTZ = DemandedMask.countTrailingZeros();
- if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
- return LHS;
- } else if (SPF == SPF_UMIN) {
- // UMin(A, C) == A if ...
- // The lowest non-zero bit of DemandMask is higher than the highest
- // non-one bit of C.
- // This comes from using DeMorgans on the above umax example.
- const APInt *C;
- unsigned CTZ = DemandedMask.countTrailingZeros();
- if (match(RHS, m_APInt(C)) &&
- CTZ >= C->getBitWidth() - C->countLeadingOnes())
- return LHS;
- }
-
- // If this is a select as part of any other min/max pattern, don't simplify
- // any further in case we break the structure.
- if (SPF != SPF_UNKNOWN)
- return nullptr;
-
- if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
- return I;
- assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
- assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
-
- // If the operands are constants, see if we can simplify them.
- // This is similar to ShrinkDemandedConstant, but for a select we want to
- // try to keep the selected constants the same as icmp value constants, if
- // we can. This helps not break apart (or helps put back together)
- // canonical patterns like min and max.
- auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
+ break;
+ }
+ case Instruction::Select: {
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF = matchSelectPattern(I, LHS, RHS).Flavor;
+ if (SPF == SPF_UMAX) {
+ // UMax(A, C) == A if ...
+ // The lowest non-zero bit of DemandMask is higher than the highest
+ // non-zero bit of C.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(RHS, m_APInt(C)) && CTZ >= C->getActiveBits())
+ return LHS;
+ } else if (SPF == SPF_UMIN) {
+ // UMin(A, C) == A if ...
+ // The lowest non-zero bit of DemandMask is higher than the highest
+ // non-one bit of C.
+ // This comes from using DeMorgans on the above umax example.
+ const APInt *C;
+ unsigned CTZ = DemandedMask.countTrailingZeros();
+ if (match(RHS, m_APInt(C)) &&
+ CTZ >= C->getBitWidth() - C->countLeadingOnes())
+ return LHS;
+ }
+
+ // If this is a select as part of any other min/max pattern, don't simplify
+ // any further in case we break the structure.
+ if (SPF != SPF_UNKNOWN)
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMask, LHSKnown, Depth + 1))
+ return I;
+ assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
+ assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ // This is similar to ShrinkDemandedConstant, but for a select we want to
+ // try to keep the selected constants the same as icmp value constants, if
+ // we can. This helps not break apart (or helps put back together)
+ // canonical patterns like min and max.
+ auto CanonicalizeSelectConstant = [](Instruction *I, unsigned OpNo,
const APInt &DemandedMask) {
- const APInt *SelC;
- if (!match(I->getOperand(OpNo), m_APInt(SelC)))
- return false;
-
- // Get the constant out of the ICmp, if there is one.
+ const APInt *SelC;
+ if (!match(I->getOperand(OpNo), m_APInt(SelC)))
+ return false;
+
+ // Get the constant out of the ICmp, if there is one.
// Only try this when exactly 1 operand is a constant (if both operands
// are constant, the icmp should eventually simplify). Otherwise, we may
// invert the transform that reduces set bits and infinite-loop.
Value *X;
- const APInt *CmpC;
- ICmpInst::Predicate Pred;
+ const APInt *CmpC;
+ ICmpInst::Predicate Pred;
if (!match(I->getOperand(0), m_ICmp(Pred, m_Value(X), m_APInt(CmpC))) ||
isa<Constant>(X) || CmpC->getBitWidth() != SelC->getBitWidth())
- return ShrinkDemandedConstant(I, OpNo, DemandedMask);
-
- // If the constant is already the same as the ICmp, leave it as-is.
- if (*CmpC == *SelC)
- return false;
- // If the constants are not already the same, but can be with the demand
- // mask, use the constant value from the ICmp.
- if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
- I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
- return true;
- }
- return ShrinkDemandedConstant(I, OpNo, DemandedMask);
- };
- if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
- CanonicalizeSelectConstant(I, 2, DemandedMask))
- return I;
-
- // Only known if known in both the LHS and RHS.
+ return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+
+ // If the constant is already the same as the ICmp, leave it as-is.
+ if (*CmpC == *SelC)
+ return false;
+ // If the constants are not already the same, but can be with the demand
+ // mask, use the constant value from the ICmp.
+ if ((*CmpC & DemandedMask) == (*SelC & DemandedMask)) {
+ I->setOperand(OpNo, ConstantInt::get(I->getType(), *CmpC));
+ return true;
+ }
+ return ShrinkDemandedConstant(I, OpNo, DemandedMask);
+ };
+ if (CanonicalizeSelectConstant(I, 1, DemandedMask) ||
+ CanonicalizeSelectConstant(I, 2, DemandedMask))
+ return I;
+
+ // Only known if known in both the LHS and RHS.
Known = KnownBits::commonBits(LHSKnown, RHSKnown);
- break;
- }
- case Instruction::ZExt:
- case Instruction::Trunc: {
- unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
-
- APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
- KnownBits InputKnown(SrcBitWidth);
- if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
- return I;
- assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
- Known = InputKnown.zextOrTrunc(BitWidth);
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- case Instruction::BitCast:
- if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
- return nullptr; // vector->int or fp->int?
-
- if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
- if (VectorType *SrcVTy =
- dyn_cast<VectorType>(I->getOperand(0)->getType())) {
+ break;
+ }
+ case Instruction::ZExt:
+ case Instruction::Trunc: {
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedMask = DemandedMask.zextOrTrunc(SrcBitWidth);
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
+ return I;
+ assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
+ Known = InputKnown.zextOrTrunc(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ case Instruction::BitCast:
+ if (!I->getOperand(0)->getType()->isIntOrIntVectorTy())
+ return nullptr; // vector->int or fp->int?
+
+ if (VectorType *DstVTy = dyn_cast<VectorType>(I->getType())) {
+ if (VectorType *SrcVTy =
+ dyn_cast<VectorType>(I->getOperand(0)->getType())) {
if (cast<FixedVectorType>(DstVTy)->getNumElements() !=
cast<FixedVectorType>(SrcVTy)->getNumElements())
- // Don't touch a bitcast between vectors of different element counts.
- return nullptr;
- } else
- // Don't touch a scalar-to-vector bitcast.
- return nullptr;
- } else if (I->getOperand(0)->getType()->isVectorTy())
- // Don't touch a vector-to-scalar bitcast.
- return nullptr;
-
- if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- case Instruction::SExt: {
- // Compute the bits in the result that are not present in the input.
- unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
-
- APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
-
- // If any of the sign extended bits are demanded, we know that the sign
- // bit is demanded.
- if (DemandedMask.getActiveBits() > SrcBitWidth)
- InputDemandedBits.setBit(SrcBitWidth-1);
-
- KnownBits InputKnown(SrcBitWidth);
- if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
- return I;
-
- // If the input sign bit is known zero, or if the NewBits are not demanded
- // convert this into a zero extension.
- if (InputKnown.isNonNegative() ||
- DemandedMask.getActiveBits() <= SrcBitWidth) {
- // Convert to ZExt cast.
- CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
- return InsertNewInstWith(NewCast, *I);
- }
-
- // If the sign bit of the input is known set or clear, then we know the
- // top bits of the result.
- Known = InputKnown.sext(BitWidth);
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- case Instruction::Add:
- if ((DemandedMask & 1) == 0) {
- // If we do not need the low bit, try to convert bool math to logic:
- // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
- Value *X, *Y;
- if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
- m_OneUse(m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
- // Truth table for inputs and output signbits:
- // X:0 | X:1
- // ----------
- // Y:0 | 0 | 0 |
- // Y:1 | -1 | 0 |
- // ----------
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(I);
- Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
- return Builder.CreateSExt(AndNot, VTy);
- }
-
- // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
- // TODO: Relax the one-use checks because we are removing an instruction?
- if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
- m_OneUse(m_SExt(m_Value(Y))))) &&
- X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
- // Truth table for inputs and output signbits:
- // X:0 | X:1
- // -----------
- // Y:0 | -1 | -1 |
- // Y:1 | -1 | 0 |
- // -----------
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(I);
- Value *Or = Builder.CreateOr(X, Y);
- return Builder.CreateSExt(Or, VTy);
- }
- }
- LLVM_FALLTHROUGH;
- case Instruction::Sub: {
- /// If the high-bits of an ADD/SUB are not demanded, then we do not care
- /// about the high bits of the operands.
- unsigned NLZ = DemandedMask.countLeadingZeros();
- // Right fill the mask of bits for this ADD/SUB to demand the most
- // significant bit and all those below it.
- APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
- if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
- SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
- ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
- SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
- if (NLZ > 0) {
- // Disable the nsw and nuw flags here: We can no longer guarantee that
- // we won't wrap after simplification. Removing the nsw/nuw flags is
- // legal here because the top bit is not demanded.
- BinaryOperator &BinOP = *cast<BinaryOperator>(I);
- BinOP.setHasNoSignedWrap(false);
- BinOP.setHasNoUnsignedWrap(false);
- }
- return I;
- }
-
- // If we are known to be adding/subtracting zeros to every bit below
- // the highest demanded bit, we just return the other side.
- if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- // We can't do this with the LHS for subtraction, unless we are only
- // demanding the LSB.
- if ((I->getOpcode() == Instruction::Add ||
- DemandedFromOps.isOneValue()) &&
- DemandedFromOps.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- // Otherwise just compute the known bits of the result.
- bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
- Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
- NSW, LHSKnown, RHSKnown);
- break;
- }
- case Instruction::Shl: {
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- const APInt *ShrAmt;
- if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
- if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
- if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
- DemandedMask, Known))
- return R;
-
- uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
- APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
-
- // If the shift is NUW/NSW, then it does demand the high bits.
- ShlOperator *IOp = cast<ShlOperator>(I);
- if (IOp->hasNoSignedWrap())
- DemandedMaskIn.setHighBits(ShiftAmt+1);
- else if (IOp->hasNoUnsignedWrap())
- DemandedMaskIn.setHighBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
-
- bool SignBitZero = Known.Zero.isSignBitSet();
- bool SignBitOne = Known.One.isSignBitSet();
- Known.Zero <<= ShiftAmt;
- Known.One <<= ShiftAmt;
- // low bits known zero.
- if (ShiftAmt)
- Known.Zero.setLowBits(ShiftAmt);
-
-      // If this shift has the "nsw" keyword, then the result is either a poison
- // value or has the same sign bit as the first operand.
- if (IOp->hasNoSignedWrap()) {
- if (SignBitZero)
- Known.Zero.setSignBit();
- else if (SignBitOne)
- Known.One.setSignBit();
- if (Known.hasConflict())
- return UndefValue::get(I->getType());
- }
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::LShr: {
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
- // Unsigned shift right.
- APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
-
- // If the shift is exact, then it does demand the low bits (and knows that
- // they are zero).
- if (cast<LShrOperator>(I)->isExact())
- DemandedMaskIn.setLowBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShiftAmt);
- Known.One.lshrInPlace(ShiftAmt);
- if (ShiftAmt)
- Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::AShr: {
- // If this is an arithmetic shift right and only the low-bit is set, we can
- // always convert this into a logical shr, even if the shift amount is
- // variable. The low bit of the shift cannot be an input sign bit unless
- // the shift amount is >= the size of the datatype, which is undefined.
- if (DemandedMask.isOneValue()) {
- // Perform the logical shift right.
- Instruction *NewVal = BinaryOperator::CreateLShr(
- I->getOperand(0), I->getOperand(1), I->getName());
- return InsertNewInstWith(NewVal, *I);
- }
-
- // If the sign bit is the only bit demanded by this ashr, then there is no
- // need to do it, the shift doesn't change the high bit.
- if (DemandedMask.isSignMask())
- return I->getOperand(0);
-
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
- uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
-
- // Signed shift right.
- APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
- // If any of the high bits are demanded, we should set the sign bit as
- // demanded.
- if (DemandedMask.countLeadingZeros() <= ShiftAmt)
- DemandedMaskIn.setSignBit();
-
- // If the shift is exact, then it does demand the low bits (and knows that
- // they are zero).
- if (cast<AShrOperator>(I)->isExact())
- DemandedMaskIn.setLowBits(ShiftAmt);
-
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
- return I;
-
- unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
-
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- // Compute the new bits that are at the top now plus sign bits.
- APInt HighBits(APInt::getHighBitsSet(
- BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
- Known.Zero.lshrInPlace(ShiftAmt);
- Known.One.lshrInPlace(ShiftAmt);
-
- // If the input sign bit is known to be zero, or if none of the top bits
- // are demanded, turn this into an unsigned shift right.
- assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
- if (Known.Zero[BitWidth-ShiftAmt-1] ||
- !DemandedMask.intersects(HighBits)) {
- BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
- I->getOperand(1));
- LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
- return InsertNewInstWith(LShr, *I);
- } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
- Known.One |= HighBits;
- }
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
- case Instruction::UDiv: {
- // UDiv doesn't demand low bits that are zero in the divisor.
- const APInt *SA;
- if (match(I->getOperand(1), m_APInt(SA))) {
-      // If the division is exact, then it does demand the low bits.
- if (cast<UDivOperator>(I)->isExact())
- break;
-
- // FIXME: Take the demanded mask of the result into account.
- unsigned RHSTrailingZeros = SA->countTrailingZeros();
- APInt DemandedMaskIn =
- APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
- if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
- return I;
-
- // Propagate zero bits from the input.
- Known.Zero.setHighBits(std::min(
- BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
- } else {
- computeKnownBits(I, Known, Depth, CxtI);
- }
- break;
- }
+ // Don't touch a bitcast between vectors of different element counts.
+ return nullptr;
+ } else
+ // Don't touch a scalar-to-vector bitcast.
+ return nullptr;
+ } else if (I->getOperand(0)->getType()->isVectorTy())
+ // Don't touch a vector-to-scalar bitcast.
+ return nullptr;
+
+ if (SimplifyDemandedBits(I, 0, DemandedMask, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ case Instruction::SExt: {
+ // Compute the bits in the result that are not present in the input.
+ unsigned SrcBitWidth = I->getOperand(0)->getType()->getScalarSizeInBits();
+
+ APInt InputDemandedBits = DemandedMask.trunc(SrcBitWidth);
+
+ // If any of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ if (DemandedMask.getActiveBits() > SrcBitWidth)
+ InputDemandedBits.setBit(SrcBitWidth-1);
+
+ KnownBits InputKnown(SrcBitWidth);
+ if (SimplifyDemandedBits(I, 0, InputDemandedBits, InputKnown, Depth + 1))
+ return I;
+
+ // If the input sign bit is known zero, or if the NewBits are not demanded
+ // convert this into a zero extension.
+ if (InputKnown.isNonNegative() ||
+ DemandedMask.getActiveBits() <= SrcBitWidth) {
+ // Convert to ZExt cast.
+ CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName());
+ return InsertNewInstWith(NewCast, *I);
+ }
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ Known = InputKnown.sext(BitWidth);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
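For illustration, a minimal standalone C++ check (separate from the LLVM sources above) of why the sext can be rewritten as a zext once the input sign bit is known zero: sign extension and zero extension agree on every value whose sign bit is clear.

#include <cassert>
#include <cstdint>

int main() {
  for (int v = 0; v < 128; ++v) {            // every i8 value with a clear sign bit
    int8_t X = static_cast<int8_t>(v);
    int32_t SExt = X;                                 // sign extension to i32
    uint32_t ZExt = static_cast<uint8_t>(X);          // zero extension to i32
    assert(static_cast<uint32_t>(SExt) == ZExt);      // the two extensions agree
  }
  return 0;
}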
+ case Instruction::Add:
+ if ((DemandedMask & 1) == 0) {
+ // If we do not need the low bit, try to convert bool math to logic:
+ // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
+ Value *X, *Y;
+ if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // ----------
+ // Y:0 | 0 | 0 |
+ // Y:1 | -1 | 0 |
+ // ----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
+ return Builder.CreateSExt(AndNot, VTy);
+ }
+
+ // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
+ // TODO: Relax the one-use checks because we are removing an instruction?
+ if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // -----------
+ // Y:0 | -1 | -1 |
+ // Y:1 | -1 | 0 |
+ // -----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *Or = Builder.CreateOr(X, Y);
+ return Builder.CreateSExt(Or, VTy);
+ }
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Sub: {
+ /// If the high-bits of an ADD/SUB are not demanded, then we do not care
+ /// about the high bits of the operands.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ // Right fill the mask of bits for this ADD/SUB to demand the most
+ // significant bit and all those below it.
+ APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+ if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnown, Depth + 1) ||
+ ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
+ SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnown, Depth + 1)) {
+ if (NLZ > 0) {
+ // Disable the nsw and nuw flags here: We can no longer guarantee that
+ // we won't wrap after simplification. Removing the nsw/nuw flags is
+ // legal here because the top bit is not demanded.
+ BinaryOperator &BinOP = *cast<BinaryOperator>(I);
+ BinOP.setHasNoSignedWrap(false);
+ BinOP.setHasNoUnsignedWrap(false);
+ }
+ return I;
+ }
+
+ // If we are known to be adding/subtracting zeros to every bit below
+ // the highest demanded bit, we just return the other side.
+ if (DemandedFromOps.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ // We can't do this with the LHS for subtraction, unless we are only
+ // demanding the LSB.
+ if ((I->getOpcode() == Instruction::Add ||
+ DemandedFromOps.isOneValue()) &&
+ DemandedFromOps.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ // Otherwise just compute the known bits of the result.
+ bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
+ Known = KnownBits::computeForAddSub(I->getOpcode() == Instruction::Add,
+ NSW, LHSKnown, RHSKnown);
+ break;
+ }
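A small standalone self-check of the bool-math fold in the Add case above, written in plain C++ rather than IR: when the low bit is not demanded, add (zext i1 X), (sext i1 Y) and sext (~X & Y) agree on all remaining bits, exactly as the truth table claims.

#include <cassert>
#include <cstdint>

int main() {
  for (int X = 0; X <= 1; ++X) {
    for (int Y = 0; Y <= 1; ++Y) {
      uint8_t ZextX = static_cast<uint8_t>(X);          // zext i1 X to i8
      uint8_t SextY = Y ? 0xFF : 0x00;                  // sext i1 Y to i8
      uint8_t Add   = static_cast<uint8_t>(ZextX + SextY);
      uint8_t Fold  = (!X && Y) ? 0xFF : 0x00;          // sext (~X & Y) to i8
      // All bits except the (undemanded) low bit must match.
      assert((Add & 0xFE) == (Fold & 0xFE));
    }
  }
  return 0;
}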
+ case Instruction::Shl: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ const APInt *ShrAmt;
+ if (match(I->getOperand(0), m_Shr(m_Value(), m_APInt(ShrAmt))))
+ if (Instruction *Shr = dyn_cast<Instruction>(I->getOperand(0)))
+ if (Value *R = simplifyShrShlDemandedBits(Shr, *ShrAmt, I, *SA,
+ DemandedMask, Known))
+ return R;
+
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+
+ // If the shift is NUW/NSW, then it does demand the high bits.
+ ShlOperator *IOp = cast<ShlOperator>(I);
+ if (IOp->hasNoSignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt+1);
+ else if (IOp->hasNoUnsignedWrap())
+ DemandedMaskIn.setHighBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ bool SignBitZero = Known.Zero.isSignBitSet();
+ bool SignBitOne = Known.One.isSignBitSet();
+ Known.Zero <<= ShiftAmt;
+ Known.One <<= ShiftAmt;
+ // low bits known zero.
+ if (ShiftAmt)
+ Known.Zero.setLowBits(ShiftAmt);
+
+      // If this shift has the "nsw" keyword, then the result is either a poison
+ // value or has the same sign bit as the first operand.
+ if (IOp->hasNoSignedWrap()) {
+ if (SignBitZero)
+ Known.Zero.setSignBit();
+ else if (SignBitOne)
+ Known.One.setSignBit();
+ if (Known.hasConflict())
+ return UndefValue::get(I->getType());
+ }
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
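The DemandedMaskIn = DemandedMask.lshr(ShiftAmt) step above can be sanity-checked with a small standalone C++ snippet: if only bits 4..7 of x << 2 are demanded, then only bits 2..5 of x can influence the result.

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t DemandedMask = 0xF0;                    // only bits 4..7 of the shl
  const unsigned ShiftAmt = 2;
  const uint8_t DemandedIn = DemandedMask >> ShiftAmt;  // bits 2..5 of the operand
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t Clean = static_cast<uint8_t>(x) & DemandedIn;       // drop non-demanded bits
    uint8_t A = static_cast<uint8_t>(x << ShiftAmt) & DemandedMask;
    uint8_t B = static_cast<uint8_t>(Clean << ShiftAmt) & DemandedMask;
    assert(A == B);                                     // demanded result bits unchanged
  }
  return 0;
}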
+ case Instruction::LShr: {
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Unsigned shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<LShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+ if (ShiftAmt)
+ Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
+ case Instruction::AShr: {
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (DemandedMask.isOneValue()) {
+ // Perform the logical shift right.
+ Instruction *NewVal = BinaryOperator::CreateLShr(
+ I->getOperand(0), I->getOperand(1), I->getName());
+ return InsertNewInstWith(NewVal, *I);
+ }
+
+ // If the sign bit is the only bit demanded by this ashr, then there is no
+ // need to do it, the shift doesn't change the high bit.
+ if (DemandedMask.isSignMask())
+ return I->getOperand(0);
+
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+ uint32_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
+
+ // Signed shift right.
+ APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+ // If any of the high bits are demanded, we should set the sign bit as
+ // demanded.
+ if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+ DemandedMaskIn.setSignBit();
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<AShrOperator>(I)->isExact())
+ DemandedMaskIn.setLowBits(ShiftAmt);
+
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
+ return I;
+
+ unsigned SignBits = ComputeNumSignBits(I->getOperand(0), Depth + 1, CxtI);
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ // Compute the new bits that are at the top now plus sign bits.
+ APInt HighBits(APInt::getHighBitsSet(
+ BitWidth, std::min(SignBits + ShiftAmt - 1, BitWidth)));
+ Known.Zero.lshrInPlace(ShiftAmt);
+ Known.One.lshrInPlace(ShiftAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ assert(BitWidth > ShiftAmt && "Shift amount not saturated?");
+ if (Known.Zero[BitWidth-ShiftAmt-1] ||
+ !DemandedMask.intersects(HighBits)) {
+ BinaryOperator *LShr = BinaryOperator::CreateLShr(I->getOperand(0),
+ I->getOperand(1));
+ LShr->setIsExact(cast<BinaryOperator>(I)->isExact());
+ return InsertNewInstWith(LShr, *I);
+ } else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
+ Known.One |= HighBits;
+ }
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
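A standalone C++ check of the first AShr rewrite above (only the low bit demanded implies ashr and lshr are interchangeable), assuming the usual two's-complement arithmetic right shift for signed values.

#include <cassert>
#include <cstdint>

int main() {
  for (int v = -128; v < 128; ++v) {
    for (unsigned s = 0; s < 8; ++s) {
      int8_t  X  = static_cast<int8_t>(v);
      uint8_t UX = static_cast<uint8_t>(v);
      uint8_t AShr = static_cast<uint8_t>(X >> s);      // arithmetic shift (sign-filling)
      uint8_t LShr = static_cast<uint8_t>(UX >> s);     // logical shift (zero-filling)
      assert((AShr & 1) == (LShr & 1));                 // the low bit always agrees
    }
  }
  return 0;
}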
+ case Instruction::UDiv: {
+ // UDiv doesn't demand low bits that are zero in the divisor.
+ const APInt *SA;
+ if (match(I->getOperand(1), m_APInt(SA))) {
+      // If the division is exact, then it does demand the low bits.
+ if (cast<UDivOperator>(I)->isExact())
+ break;
+
+ // FIXME: Take the demanded mask of the result into account.
+ unsigned RHSTrailingZeros = SA->countTrailingZeros();
+ APInt DemandedMaskIn =
+ APInt::getHighBitsSet(BitWidth, BitWidth - RHSTrailingZeros);
+ if (SimplifyDemandedBits(I, 0, DemandedMaskIn, LHSKnown, Depth + 1))
+ return I;
+
+ // Propagate zero bits from the input.
+ Known.Zero.setHighBits(std::min(
+ BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
+ }
+ break;
+ }
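The UDiv rule above ("low bits that are zero in the divisor are not demanded") can be exercised with a small standalone C++ loop; with a divisor of 12, which has two trailing zero bits, the low two bits of the dividend never change the quotient.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Divisor = 12;                 // two trailing zero bits
  const uint32_t LowMask = 0x3;                // the bits udiv does not demand
  for (uint32_t x = 0; x < 4096; ++x)
    assert(x / Divisor == (x & ~LowMask) / Divisor);
  return 0;
}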
case Instruction::SRem: {
ConstantInt *Rem;
if (match(I->getOperand(1), m_ConstantInt(Rem))) {
- // X % -1 demands all the bits because we don't want to introduce
- // INT_MIN % -1 (== undef) by accident.
- if (Rem->isMinusOne())
- break;
- APInt RA = Rem->getValue().abs();
- if (RA.isPowerOf2()) {
- if (DemandedMask.ult(RA)) // srem won't affect demanded bits
- return I->getOperand(0);
-
- APInt LowBits = RA - 1;
- APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
- if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
- return I;
-
- // The low bits of LHS are unchanged by the srem.
- Known.Zero = LHSKnown.Zero & LowBits;
- Known.One = LHSKnown.One & LowBits;
-
- // If LHS is non-negative or has all low bits zero, then the upper bits
- // are all zero.
- if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
- Known.Zero |= ~LowBits;
-
- // If LHS is negative and not all low bits are zero, then the upper bits
- // are all one.
- if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
- Known.One |= ~LowBits;
-
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- break;
- }
- }
-
- // The sign bit is the LHS's sign bit, except when the result of the
- // remainder is zero.
- if (DemandedMask.isSignBitSet()) {
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
- // If it's known zero, our sign bit is also zero.
- if (LHSKnown.isNonNegative())
- Known.makeNonNegative();
- }
- break;
+ // X % -1 demands all the bits because we don't want to introduce
+ // INT_MIN % -1 (== undef) by accident.
+ if (Rem->isMinusOne())
+ break;
+ APInt RA = Rem->getValue().abs();
+ if (RA.isPowerOf2()) {
+ if (DemandedMask.ult(RA)) // srem won't affect demanded bits
+ return I->getOperand(0);
+
+ APInt LowBits = RA - 1;
+ APInt Mask2 = LowBits | APInt::getSignMask(BitWidth);
+ if (SimplifyDemandedBits(I, 0, Mask2, LHSKnown, Depth + 1))
+ return I;
+
+ // The low bits of LHS are unchanged by the srem.
+ Known.Zero = LHSKnown.Zero & LowBits;
+ Known.One = LHSKnown.One & LowBits;
+
+ // If LHS is non-negative or has all low bits zero, then the upper bits
+ // are all zero.
+ if (LHSKnown.isNonNegative() || LowBits.isSubsetOf(LHSKnown.Zero))
+ Known.Zero |= ~LowBits;
+
+ // If LHS is negative and not all low bits are zero, then the upper bits
+ // are all one.
+ if (LHSKnown.isNegative() && LowBits.intersects(LHSKnown.One))
+ Known.One |= ~LowBits;
+
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ break;
+ }
+ }
+
+ // The sign bit is the LHS's sign bit, except when the result of the
+ // remainder is zero.
+ if (DemandedMask.isSignBitSet()) {
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1, CxtI);
+ // If it's known zero, our sign bit is also zero.
+ if (LHSKnown.isNonNegative())
+ Known.makeNonNegative();
+ }
+ break;
}
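A standalone C++ check of the power-of-two SRem case above: for |divisor| == 4 the low two bits of the LHS pass through the srem unchanged, which is what lets the code copy LHSKnown into Known for those bits.

#include <cassert>
#include <cstdint>

int main() {
  const int32_t Rem = 4;                       // |divisor| is a power of two
  const uint32_t LowBits = Rem - 1;            // the bits srem leaves untouched
  for (int32_t x = -100; x <= 100; ++x) {
    uint32_t R = static_cast<uint32_t>(x % Rem);   // C++ % truncates like srem
    uint32_t L = static_cast<uint32_t>(x);
    assert((R & LowBits) == (L & LowBits));    // low bits pass through unchanged
  }
  return 0;
}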
- case Instruction::URem: {
- KnownBits Known2(BitWidth);
- APInt AllOnes = APInt::getAllOnesValue(BitWidth);
- if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
- SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
- return I;
-
- unsigned Leaders = Known2.countMinLeadingZeros();
- Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
- break;
- }
- case Instruction::Call: {
- bool KnownBitsComputed = false;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::bswap: {
- // If the only bits demanded come from one byte of the bswap result,
- // just shift the input byte into position to eliminate the bswap.
- unsigned NLZ = DemandedMask.countLeadingZeros();
- unsigned NTZ = DemandedMask.countTrailingZeros();
-
- // Round NTZ down to the next byte. If we have 11 trailing zeros, then
- // we need all the bits down to bit 8. Likewise, round NLZ. If we
- // have 14 leading zeros, round to 8.
- NLZ &= ~7;
- NTZ &= ~7;
- // If we need exactly one byte, we can do this transformation.
- if (BitWidth-NLZ-NTZ == 8) {
- unsigned ResultBit = NTZ;
- unsigned InputBit = BitWidth-NTZ-8;
-
- // Replace this with either a left or right shift to get the byte into
- // the right place.
- Instruction *NewVal;
- if (InputBit > ResultBit)
- NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
- ConstantInt::get(I->getType(), InputBit-ResultBit));
- else
- NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
- ConstantInt::get(I->getType(), ResultBit-InputBit));
- NewVal->takeName(I);
- return InsertNewInstWith(NewVal, *I);
- }
- break;
- }
- case Intrinsic::fshr:
- case Intrinsic::fshl: {
- const APInt *SA;
- if (!match(I->getOperand(2), m_APInt(SA)))
- break;
-
- // Normalize to funnel shift left. APInt shifts of BitWidth are well-
- // defined, so no need to special-case zero shifts here.
- uint64_t ShiftAmt = SA->urem(BitWidth);
- if (II->getIntrinsicID() == Intrinsic::fshr)
- ShiftAmt = BitWidth - ShiftAmt;
-
- APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
- APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
- if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
- SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
- return I;
-
- Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
- RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
- Known.One = LHSKnown.One.shl(ShiftAmt) |
- RHSKnown.One.lshr(BitWidth - ShiftAmt);
- KnownBitsComputed = true;
- break;
- }
+ case Instruction::URem: {
+ KnownBits Known2(BitWidth);
+ APInt AllOnes = APInt::getAllOnesValue(BitWidth);
+ if (SimplifyDemandedBits(I, 0, AllOnes, Known2, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1))
+ return I;
+
+ unsigned Leaders = Known2.countMinLeadingZeros();
+ Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
+ break;
+ }
+ case Instruction::Call: {
+ bool KnownBitsComputed = false;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::bswap: {
+ // If the only bits demanded come from one byte of the bswap result,
+ // just shift the input byte into position to eliminate the bswap.
+ unsigned NLZ = DemandedMask.countLeadingZeros();
+ unsigned NTZ = DemandedMask.countTrailingZeros();
+
+ // Round NTZ down to the next byte. If we have 11 trailing zeros, then
+ // we need all the bits down to bit 8. Likewise, round NLZ. If we
+ // have 14 leading zeros, round to 8.
+ NLZ &= ~7;
+ NTZ &= ~7;
+ // If we need exactly one byte, we can do this transformation.
+ if (BitWidth-NLZ-NTZ == 8) {
+ unsigned ResultBit = NTZ;
+ unsigned InputBit = BitWidth-NTZ-8;
+
+ // Replace this with either a left or right shift to get the byte into
+ // the right place.
+ Instruction *NewVal;
+ if (InputBit > ResultBit)
+ NewVal = BinaryOperator::CreateLShr(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), InputBit-ResultBit));
+ else
+ NewVal = BinaryOperator::CreateShl(II->getArgOperand(0),
+ ConstantInt::get(I->getType(), ResultBit-InputBit));
+ NewVal->takeName(I);
+ return InsertNewInstWith(NewVal, *I);
+ }
+ break;
+ }
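The bswap fold above can be illustrated with a small standalone C++ check, assuming a compiler that provides __builtin_bswap32 (GCC/Clang): if only the low byte of the bswap result is demanded, that byte is just the top byte of the input shifted down by 24.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t DemandedMask = 0xFF;          // only the low byte of the bswap
  uint32_t Samples[] = {0x12345678u, 0xDEADBEEFu, 0u, 0xFFFFFFFFu};
  for (uint32_t x : Samples) {
    uint32_t ViaBswap = __builtin_bswap32(x) & DemandedMask;
    uint32_t ViaShift = (x >> 24) & DemandedMask;   // the input byte shifted into place
    assert(ViaBswap == ViaShift);
  }
  return 0;
}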
+ case Intrinsic::fshr:
+ case Intrinsic::fshl: {
+ const APInt *SA;
+ if (!match(I->getOperand(2), m_APInt(SA)))
+ break;
+
+ // Normalize to funnel shift left. APInt shifts of BitWidth are well-
+ // defined, so no need to special-case zero shifts here.
+ uint64_t ShiftAmt = SA->urem(BitWidth);
+ if (II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmt = BitWidth - ShiftAmt;
+
+ APInt DemandedMaskLHS(DemandedMask.lshr(ShiftAmt));
+ APInt DemandedMaskRHS(DemandedMask.shl(BitWidth - ShiftAmt));
+ if (SimplifyDemandedBits(I, 0, DemandedMaskLHS, LHSKnown, Depth + 1) ||
+ SimplifyDemandedBits(I, 1, DemandedMaskRHS, RHSKnown, Depth + 1))
+ return I;
+
+ Known.Zero = LHSKnown.Zero.shl(ShiftAmt) |
+ RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
+ Known.One = LHSKnown.One.shl(ShiftAmt) |
+ RHSKnown.One.lshr(BitWidth - ShiftAmt);
+ KnownBitsComputed = true;
+ break;
+ }
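A standalone C++ model of an i8 funnel shift left (a sketch of the semantics only, not the LLVM intrinsic itself) showing where the DemandedMaskLHS / DemandedMaskRHS split comes from: with a shift of 3, result bits 3..7 depend only on the first operand and bits 0..2 only on the second.

#include <cassert>
#include <cstdint>

// Plain-C++ model of an i8 funnel shift left.
static uint8_t fshl8(uint8_t A, uint8_t B, unsigned S) {
  S %= 8;
  return S ? static_cast<uint8_t>((A << S) | (B >> (8 - S))) : A;
}

int main() {
  const unsigned S = 3;
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t R = fshl8(static_cast<uint8_t>(a), static_cast<uint8_t>(b), S);
      // Result bits [3..7] come only from A, bits [0..2] only from B,
      // mirroring the DemandedMaskLHS / DemandedMaskRHS split above.
      assert((R >> S) == (a & 0x1F));
      assert((R & 0x07) == (b >> 5));
    }
  return 0;
}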
default: {
// Handle target specific intrinsics
Optional<Value *> V = targetSimplifyDemandedUseBitsIntrinsic(
*II, DemandedMask, Known, KnownBitsComputed);
if (V.hasValue())
return V.getValue();
- break;
- }
- }
- }
-
- if (!KnownBitsComputed)
- computeKnownBits(V, Known, Depth, CxtI);
- break;
- }
- }
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
- return Constant::getIntegerValue(VTy, Known.One);
- return nullptr;
-}
-
-/// Helper routine of SimplifyDemandedUseBits. It computes Known
-/// bits. It also tries to handle simplifications that can be done based on
-/// DemandedMask, but without modifying the Instruction.
+ break;
+ }
+ }
+ }
+
+ if (!KnownBitsComputed)
+ computeKnownBits(V, Known, Depth, CxtI);
+ break;
+ }
+ }
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It computes Known
+/// bits. It also tries to handle simplifications that can be done based on
+/// DemandedMask, but without modifying the Instruction.
Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
Instruction *I, const APInt &DemandedMask, KnownBits &Known, unsigned Depth,
Instruction *CxtI) {
- unsigned BitWidth = DemandedMask.getBitWidth();
- Type *ITy = I->getType();
-
- KnownBits LHSKnown(BitWidth);
- KnownBits RHSKnown(BitWidth);
-
-  // Despite the fact that we can't simplify this instruction in every user's
-  // context, we can at least compute the known bits, and we can
- // do simplifications that apply to *just* the one user if we know that
- // this instruction has a simpler value in that context.
- switch (I->getOpcode()) {
- case Instruction::And: {
- // If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown & RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known 1 on one side, return the other.
- // These bits cannot contribute to the result of the 'and' in this
- // context.
- if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
- return I->getOperand(1);
-
- break;
- }
- case Instruction::Or: {
- // We can simplify (X|Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- // If either the LHS or the RHS are One, the result is One.
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown | RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the
- // other. These bits cannot contribute to the result of the 'or' in this
- // context.
- if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
- return I->getOperand(1);
-
- break;
- }
- case Instruction::Xor: {
- // We can simplify (X^Y) -> X or Y in the user's context if we know that
- // only bits from X or Y are demanded.
-
- computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
- computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
- CxtI);
-
- Known = LHSKnown ^ RHSKnown;
-
- // If the client is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- // If all of the demanded bits are known zero on one side, return the
- // other.
- if (DemandedMask.isSubsetOf(RHSKnown.Zero))
- return I->getOperand(0);
- if (DemandedMask.isSubsetOf(LHSKnown.Zero))
- return I->getOperand(1);
-
- break;
- }
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ Type *ITy = I->getType();
+
+ KnownBits LHSKnown(BitWidth);
+ KnownBits RHSKnown(BitWidth);
+
+  // Despite the fact that we can't simplify this instruction in every user's
+  // context, we can at least compute the known bits, and we can
+ // do simplifications that apply to *just* the one user if we know that
+ // this instruction has a simpler value in that context.
+ switch (I->getOpcode()) {
+ case Instruction::And: {
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown & RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known 1 on one side, return the other.
+ // These bits cannot contribute to the result of the 'and' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero | RHSKnown.One))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
+ return I->getOperand(1);
+
+ break;
+ }
+ case Instruction::Or: {
+ // We can simplify (X|Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ // If either the LHS or the RHS are One, the result is One.
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown | RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other. These bits cannot contribute to the result of the 'or' in this
+ // context.
+ if (DemandedMask.isSubsetOf(LHSKnown.One | RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
+ return I->getOperand(1);
+
+ break;
+ }
+ case Instruction::Xor: {
+ // We can simplify (X^Y) -> X or Y in the user's context if we know that
+ // only bits from X or Y are demanded.
+
+ computeKnownBits(I->getOperand(1), RHSKnown, Depth + 1, CxtI);
+ computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
+ CxtI);
+
+ Known = LHSKnown ^ RHSKnown;
+
+ // If the client is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ // If all of the demanded bits are known zero on one side, return the
+ // other.
+ if (DemandedMask.isSubsetOf(RHSKnown.Zero))
+ return I->getOperand(0);
+ if (DemandedMask.isSubsetOf(LHSKnown.Zero))
+ return I->getOperand(1);
+
+ break;
+ }
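The "known zero on one side" shortcuts above boil down to ordinary bit arithmetic; here is a standalone C++ check for the Xor case: if every demanded bit of y is known zero, then x ^ y equals x on the demanded bits, so the user can read x directly.

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t DemandedMask = 0xFF;                  // the user only reads the low byte
  for (uint32_t x = 0; x < 1024; ++x) {
    uint32_t y = (x * 2654435761u) & ~DemandedMask;    // low byte of y is known zero
    assert(((x ^ y) & DemandedMask) == (x & DemandedMask));  // the xor can be dropped
  }
  return 0;
}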
case Instruction::AShr: {
// Compute the Known bits to simplify things downstream.
computeKnownBits(I, Known, Depth, CxtI);
@@ -930,260 +930,260 @@ Value *InstCombinerImpl::SimplifyMultipleUseDemandedBits(
break;
}
- default:
- // Compute the Known bits to simplify things downstream.
- computeKnownBits(I, Known, Depth, CxtI);
-
- // If this user is only demanding bits that we know, return the known
- // constant.
- if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
- return Constant::getIntegerValue(ITy, Known.One);
-
- break;
- }
-
- return nullptr;
-}
-
-/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
-/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
-/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
-/// of "C2-C1".
-///
-/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
-/// ..., bn}, without considering the specific value X is holding.
-/// This transformation is legal iff one of the following conditions holds:
-/// 1) All the bits in S are 0, in which case E1 == E2.
-/// 2) We don't care about those bits in S, per the input DemandedMask.
-/// 3) Combination of 1) and 2): some bits in S are 0, and we don't care about
-/// the rest of the bits.
-///
-/// Currently we only test condition 2).
-///
-/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
-/// not successful.
+ default:
+ // Compute the Known bits to simplify things downstream.
+ computeKnownBits(I, Known, Depth, CxtI);
+
+ // If this user is only demanding bits that we know, return the known
+ // constant.
+ if (DemandedMask.isSubsetOf(Known.Zero|Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
+
+ break;
+ }
+
+ return nullptr;
+}
+
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
+/// ..., bn}, without considering the specific value X is holding.
+/// This transformation is legal iff one of the following conditions holds:
+/// 1) All the bits in S are 0, in which case E1 == E2.
+/// 2) We don't care about those bits in S, per the input DemandedMask.
+/// 3) Combination of 1) and 2): some bits in S are 0, and we don't care about
+/// the rest of the bits.
+///
+/// Currently we only test condition 2).
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
Value *InstCombinerImpl::simplifyShrShlDemandedBits(
Instruction *Shr, const APInt &ShrOp1, Instruction *Shl,
const APInt &ShlOp1, const APInt &DemandedMask, KnownBits &Known) {
- if (!ShlOp1 || !ShrOp1)
- return nullptr; // No-op.
-
- Value *VarX = Shr->getOperand(0);
- Type *Ty = VarX->getType();
- unsigned BitWidth = Ty->getScalarSizeInBits();
- if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
- return nullptr; // Undef.
-
- unsigned ShlAmt = ShlOp1.getZExtValue();
- unsigned ShrAmt = ShrOp1.getZExtValue();
-
- Known.One.clearAllBits();
- Known.Zero.setLowBits(ShlAmt - 1);
- Known.Zero &= DemandedMask;
-
- APInt BitMask1(APInt::getAllOnesValue(BitWidth));
- APInt BitMask2(APInt::getAllOnesValue(BitWidth));
-
- bool isLshr = (Shr->getOpcode() == Instruction::LShr);
- BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
- (BitMask1.ashr(ShrAmt) << ShlAmt);
-
- if (ShrAmt <= ShlAmt) {
- BitMask2 <<= (ShlAmt - ShrAmt);
- } else {
- BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
- BitMask2.ashr(ShrAmt - ShlAmt);
- }
-
-  // Check if condition-2 (see the comment to this function) is satisfied.
- if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
- if (ShrAmt == ShlAmt)
- return VarX;
-
- if (!Shr->hasOneUse())
- return nullptr;
-
- BinaryOperator *New;
- if (ShrAmt < ShlAmt) {
- Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
- New = BinaryOperator::CreateShl(VarX, Amt);
- BinaryOperator *Orig = cast<BinaryOperator>(Shl);
- New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
- New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
- } else {
- Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
- New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
- BinaryOperator::CreateAShr(VarX, Amt);
- if (cast<BinaryOperator>(Shr)->isExact())
- New->setIsExact(true);
- }
-
- return InsertNewInstWith(New, *Shl);
- }
-
- return nullptr;
-}
-
-/// The specified value produces a vector with any number of elements.
+ if (!ShlOp1 || !ShrOp1)
+ return nullptr; // No-op.
+
+ Value *VarX = Shr->getOperand(0);
+ Type *Ty = VarX->getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
+ if (ShlOp1.uge(BitWidth) || ShrOp1.uge(BitWidth))
+ return nullptr; // Undef.
+
+ unsigned ShlAmt = ShlOp1.getZExtValue();
+ unsigned ShrAmt = ShrOp1.getZExtValue();
+
+ Known.One.clearAllBits();
+ Known.Zero.setLowBits(ShlAmt - 1);
+ Known.Zero &= DemandedMask;
+
+ APInt BitMask1(APInt::getAllOnesValue(BitWidth));
+ APInt BitMask2(APInt::getAllOnesValue(BitWidth));
+
+ bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+ BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+ (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+ if (ShrAmt <= ShlAmt) {
+ BitMask2 <<= (ShlAmt - ShrAmt);
+ } else {
+ BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+ BitMask2.ashr(ShrAmt - ShlAmt);
+ }
+
+  // Check if condition-2 (see the comment to this function) is satisfied.
+ if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+ if (ShrAmt == ShlAmt)
+ return VarX;
+
+ if (!Shr->hasOneUse())
+ return nullptr;
+
+ BinaryOperator *New;
+ if (ShrAmt < ShlAmt) {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+ New = BinaryOperator::CreateShl(VarX, Amt);
+ BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+ New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+ New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+ } else {
+ Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+ New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+ BinaryOperator::CreateAShr(VarX, Amt);
+ if (cast<BinaryOperator>(Shr)->isExact())
+ New->setIsExact(true);
+ }
+
+ return InsertNewInstWith(New, *Shl);
+ }
+
+ return nullptr;
+}
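A standalone C++ check of condition 2) from the comment above, for the concrete case C1 = 2, C2 = 4 on i8: (X lshr 2) shl 4 and X shl 2 differ only in bits 2..3, so the two expressions are interchangeable whenever those bits are not demanded.

#include <cassert>
#include <cstdint>

int main() {
  const unsigned C1 = 2, C2 = 4;               // (X lshr 2) shl 4  vs  X shl (4 - 2)
  const uint8_t DemandedMask = 0xF0;           // bits 2..3, where they differ, are unused
  for (unsigned x = 0; x < 256; ++x) {
    uint8_t E1 = static_cast<uint8_t>((static_cast<uint8_t>(x) >> C1) << C2);
    uint8_t E2 = static_cast<uint8_t>(x << (C2 - C1));
    assert((E1 & DemandedMask) == (E2 & DemandedMask));
  }
  return 0;
}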
+
+/// The specified value produces a vector with any number of elements.
/// This method analyzes which elements of the operand are undef or poison and
/// returns that information in UndefElts.
-///
-/// DemandedElts contains the set of elements that are actually used by the
-/// caller, and by default (AllowMultipleUsers equals false) the value is
-/// simplified only if it has a single caller. If AllowMultipleUsers is set
-/// to true, DemandedElts refers to the union of sets of elements that are
-/// used by all callers.
-///
-/// If the information about demanded elements can be used to simplify the
-/// operation, the operation is simplified, then the resultant value is
-/// returned. This returns null if no change was made.
+///
+/// DemandedElts contains the set of elements that are actually used by the
+/// caller, and by default (AllowMultipleUsers equals false) the value is
+/// simplified only if it has a single caller. If AllowMultipleUsers is set
+/// to true, DemandedElts refers to the union of sets of elements that are
+/// used by all callers.
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned. This returns null if no change was made.
Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
APInt DemandedElts,
APInt &UndefElts,
unsigned Depth,
bool AllowMultipleUsers) {
- // Cannot analyze scalable type. The number of vector elements is not a
- // compile-time constant.
- if (isa<ScalableVectorType>(V->getType()))
- return nullptr;
-
- unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
- APInt EltMask(APInt::getAllOnesValue(VWidth));
- assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
-
- if (isa<UndefValue>(V)) {
+ // Cannot analyze scalable type. The number of vector elements is not a
+ // compile-time constant.
+ if (isa<ScalableVectorType>(V->getType()))
+ return nullptr;
+
+ unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
+ APInt EltMask(APInt::getAllOnesValue(VWidth));
+ assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
+
+ if (isa<UndefValue>(V)) {
// If the entire vector is undef or poison, just return this info.
- UndefElts = EltMask;
- return nullptr;
- }
-
+ UndefElts = EltMask;
+ return nullptr;
+ }
+
if (DemandedElts.isNullValue()) { // If nothing is demanded, provide poison.
- UndefElts = EltMask;
+ UndefElts = EltMask;
return PoisonValue::get(V->getType());
- }
-
- UndefElts = 0;
-
- if (auto *C = dyn_cast<Constant>(V)) {
- // Check if this is identity. If so, return 0 since we are not simplifying
- // anything.
- if (DemandedElts.isAllOnesValue())
- return nullptr;
-
- Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+ }
+
+ UndefElts = 0;
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ // Check if this is identity. If so, return 0 since we are not simplifying
+ // anything.
+ if (DemandedElts.isAllOnesValue())
+ return nullptr;
+
+ Type *EltTy = cast<VectorType>(V->getType())->getElementType();
Constant *Poison = PoisonValue::get(EltTy);
- SmallVector<Constant*, 16> Elts;
- for (unsigned i = 0; i != VWidth; ++i) {
+ SmallVector<Constant*, 16> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
if (!DemandedElts[i]) { // If not demanded, set to poison.
Elts.push_back(Poison);
- UndefElts.setBit(i);
- continue;
- }
-
- Constant *Elt = C->getAggregateElement(i);
- if (!Elt) return nullptr;
-
+ UndefElts.setBit(i);
+ continue;
+ }
+
+ Constant *Elt = C->getAggregateElement(i);
+ if (!Elt) return nullptr;
+
Elts.push_back(Elt);
if (isa<UndefValue>(Elt)) // Already undef or poison.
- UndefElts.setBit(i);
- }
-
- // If we changed the constant, return it.
- Constant *NewCV = ConstantVector::get(Elts);
- return NewCV != C ? NewCV : nullptr;
- }
-
- // Limit search depth.
- if (Depth == 10)
- return nullptr;
-
- if (!AllowMultipleUsers) {
- // If multiple users are using the root value, proceed with
- // simplification conservatively assuming that all elements
- // are needed.
- if (!V->hasOneUse()) {
- // Quit if we find multiple users of a non-root value though.
- // They'll be handled when it's their turn to be visited by
- // the main instcombine process.
- if (Depth != 0)
- // TODO: Just compute the UndefElts information recursively.
- return nullptr;
-
- // Conservatively assume that all elements are needed.
- DemandedElts = EltMask;
- }
- }
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return nullptr; // Only analyze instructions.
-
- bool MadeChange = false;
- auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
- APInt Demanded, APInt &Undef) {
- auto *II = dyn_cast<IntrinsicInst>(Inst);
- Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
- if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
- replaceOperand(*Inst, OpNum, V);
- MadeChange = true;
- }
- };
-
- APInt UndefElts2(VWidth, 0);
- APInt UndefElts3(VWidth, 0);
- switch (I->getOpcode()) {
- default: break;
-
- case Instruction::GetElementPtr: {
- // The LangRef requires that struct geps have all constant indices. As
- // such, we can't convert any operand to partial undef.
- auto mayIndexStructType = [](GetElementPtrInst &GEP) {
- for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
- I != E; I++)
- if (I.isStruct())
-          return true;
- return false;
- };
- if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
- break;
-
- // Conservatively track the demanded elements back through any vector
- // operands we may have. We know there must be at least one, or we
- // wouldn't have a vector result to get here. Note that we intentionally
- // merge the undef bits here since gepping with either an undef base or
+ UndefElts.setBit(i);
+ }
+
+ // If we changed the constant, return it.
+ Constant *NewCV = ConstantVector::get(Elts);
+ return NewCV != C ? NewCV : nullptr;
+ }
+
+ // Limit search depth.
+ if (Depth == 10)
+ return nullptr;
+
+ if (!AllowMultipleUsers) {
+ // If multiple users are using the root value, proceed with
+ // simplification conservatively assuming that all elements
+ // are needed.
+ if (!V->hasOneUse()) {
+ // Quit if we find multiple users of a non-root value though.
+ // They'll be handled when it's their turn to be visited by
+ // the main instcombine process.
+ if (Depth != 0)
+ // TODO: Just compute the UndefElts information recursively.
+ return nullptr;
+
+ // Conservatively assume that all elements are needed.
+ DemandedElts = EltMask;
+ }
+ }
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return nullptr; // Only analyze instructions.
+
+ bool MadeChange = false;
+ auto simplifyAndSetOp = [&](Instruction *Inst, unsigned OpNum,
+ APInt Demanded, APInt &Undef) {
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
+ if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
+ replaceOperand(*Inst, OpNum, V);
+ MadeChange = true;
+ }
+ };
+
+ APInt UndefElts2(VWidth, 0);
+ APInt UndefElts3(VWidth, 0);
+ switch (I->getOpcode()) {
+ default: break;
+
+ case Instruction::GetElementPtr: {
+ // The LangRef requires that struct geps have all constant indices. As
+ // such, we can't convert any operand to partial undef.
+ auto mayIndexStructType = [](GetElementPtrInst &GEP) {
+ for (auto I = gep_type_begin(GEP), E = gep_type_end(GEP);
+ I != E; I++)
+ if (I.isStruct())
+          return true;
+ return false;
+ };
+ if (mayIndexStructType(cast<GetElementPtrInst>(*I)))
+ break;
+
+ // Conservatively track the demanded elements back through any vector
+ // operands we may have. We know there must be at least one, or we
+ // wouldn't have a vector result to get here. Note that we intentionally
+ // merge the undef bits here since gepping with either an undef base or
// index results in undef.
- for (unsigned i = 0; i < I->getNumOperands(); i++) {
- if (isa<UndefValue>(I->getOperand(i))) {
- // If the entire vector is undefined, just return this info.
- UndefElts = EltMask;
- return nullptr;
- }
- if (I->getOperand(i)->getType()->isVectorTy()) {
- APInt UndefEltsOp(VWidth, 0);
- simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
- UndefElts |= UndefEltsOp;
- }
- }
-
- break;
- }
- case Instruction::InsertElement: {
-    // If this is a variable index, we don't know which element it overwrites,
-    // so demand exactly the same input as we produce.
- ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
- if (!Idx) {
- // Note that we can't propagate undef elt info, because we don't know
- // which elt is getting updated.
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
- break;
- }
-
- // The element inserted overwrites whatever was there, so the input demanded
- // set is simpler than the output set.
- unsigned IdxNo = Idx->getZExtValue();
- APInt PreInsertDemandedElts = DemandedElts;
- if (IdxNo < VWidth)
- PreInsertDemandedElts.clearBit(IdxNo);
-
+ for (unsigned i = 0; i < I->getNumOperands(); i++) {
+ if (isa<UndefValue>(I->getOperand(i))) {
+ // If the entire vector is undefined, just return this info.
+ UndefElts = EltMask;
+ return nullptr;
+ }
+ if (I->getOperand(i)->getType()->isVectorTy()) {
+ APInt UndefEltsOp(VWidth, 0);
+ simplifyAndSetOp(I, i, DemandedElts, UndefEltsOp);
+ UndefElts |= UndefEltsOp;
+ }
+ }
+
+ break;
+ }
+ case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites,
+    // so demand exactly the same input as we produce.
+ ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!Idx) {
+ // Note that we can't propagate undef elt info, because we don't know
+ // which elt is getting updated.
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts2);
+ break;
+ }
+
+ // The element inserted overwrites whatever was there, so the input demanded
+ // set is simpler than the output set.
+ unsigned IdxNo = Idx->getZExtValue();
+ APInt PreInsertDemandedElts = DemandedElts;
+ if (IdxNo < VWidth)
+ PreInsertDemandedElts.clearBit(IdxNo);
+
// If we only demand the element that is being inserted and that element
// was extracted from the same index in another vector with the same type,
// replace this insert with that other vector.
@@ -1197,339 +1197,339 @@ Value *InstCombinerImpl::SimplifyDemandedVectorElts(Value *V,
return Vec;
}
- simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
-
- // If this is inserting an element that isn't demanded, remove this
- // insertelement.
- if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
- Worklist.push(I);
- return I->getOperand(0);
- }
-
- // The inserted element is defined.
- UndefElts.clearBit(IdxNo);
- break;
- }
- case Instruction::ShuffleVector: {
- auto *Shuffle = cast<ShuffleVectorInst>(I);
- assert(Shuffle->getOperand(0)->getType() ==
- Shuffle->getOperand(1)->getType() &&
- "Expected shuffle operands to have same type");
+ simplifyAndSetOp(I, 0, PreInsertDemandedElts, UndefElts);
+
+ // If this is inserting an element that isn't demanded, remove this
+ // insertelement.
+ if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
+ Worklist.push(I);
+ return I->getOperand(0);
+ }
+
+ // The inserted element is defined.
+ UndefElts.clearBit(IdxNo);
+ break;
+ }
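The "not demanded" shortcut for insertelement can be emulated with plain arrays (a sketch of the semantics only, using std::array in place of an IR vector): if the written lane is never read, dropping the insert and using the original vector gives the same demanded lanes.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int32_t, 4> Vec = {1, 2, 3, 4};
  const unsigned IdxNo = 2;                                   // lane written by the insert
  std::array<bool, 4> Demanded = {true, true, false, true};   // lane 2 is never read

  std::array<int32_t, 4> Inserted = Vec;
  Inserted[IdxNo] = 42;                                       // the insertelement

  // Every demanded lane is identical with or without the insert, so the
  // insert can be dropped and the original vector used instead.
  for (unsigned i = 0; i < 4; ++i)
    if (Demanded[i])
      assert(Inserted[i] == Vec[i]);
  return 0;
}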
+ case Instruction::ShuffleVector: {
+ auto *Shuffle = cast<ShuffleVectorInst>(I);
+ assert(Shuffle->getOperand(0)->getType() ==
+ Shuffle->getOperand(1)->getType() &&
+ "Expected shuffle operands to have same type");
unsigned OpWidth = cast<FixedVectorType>(Shuffle->getOperand(0)->getType())
->getNumElements();
- // Handle trivial case of a splat. Only check the first element of LHS
- // operand.
- if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
- DemandedElts.isAllOnesValue()) {
- if (!isa<UndefValue>(I->getOperand(1))) {
- I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
- MadeChange = true;
- }
- APInt LeftDemanded(OpWidth, 1);
- APInt LHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
- if (LHSUndefElts[0])
- UndefElts = EltMask;
- else
- UndefElts.clearAllBits();
- break;
- }
-
- APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
- for (unsigned i = 0; i < VWidth; i++) {
- if (DemandedElts[i]) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal != -1u) {
- assert(MaskVal < OpWidth * 2 &&
- "shufflevector mask index out of range!");
- if (MaskVal < OpWidth)
- LeftDemanded.setBit(MaskVal);
- else
- RightDemanded.setBit(MaskVal - OpWidth);
- }
- }
- }
-
- APInt LHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
-
- APInt RHSUndefElts(OpWidth, 0);
- simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
-
- // If this shuffle does not change the vector length and the elements
- // demanded by this shuffle are an identity mask, then this shuffle is
- // unnecessary.
- //
- // We are assuming canonical form for the mask, so the source vector is
- // operand 0 and operand 1 is not used.
- //
- // Note that if an element is demanded and this shuffle mask is undefined
- // for that element, then the shuffle is not considered an identity
- // operation. The shuffle prevents poison from the operand vector from
- // leaking to the result by replacing poison with an undefined value.
- if (VWidth == OpWidth) {
- bool IsIdentityShuffle = true;
- for (unsigned i = 0; i < VWidth; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (DemandedElts[i] && i != MaskVal) {
- IsIdentityShuffle = false;
- break;
- }
- }
- if (IsIdentityShuffle)
- return Shuffle->getOperand(0);
- }
-
- bool NewUndefElts = false;
- unsigned LHSIdx = -1u, LHSValIdx = -1u;
- unsigned RHSIdx = -1u, RHSValIdx = -1u;
- bool LHSUniform = true;
- bool RHSUniform = true;
- for (unsigned i = 0; i < VWidth; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal == -1u) {
- UndefElts.setBit(i);
- } else if (!DemandedElts[i]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else if (MaskVal < OpWidth) {
- if (LHSUndefElts[MaskVal]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else {
- LHSIdx = LHSIdx == -1u ? i : OpWidth;
- LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
- LHSUniform = LHSUniform && (MaskVal == i);
- }
- } else {
- if (RHSUndefElts[MaskVal - OpWidth]) {
- NewUndefElts = true;
- UndefElts.setBit(i);
- } else {
- RHSIdx = RHSIdx == -1u ? i : OpWidth;
- RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
- RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
- }
- }
- }
-
- // Try to transform shuffle with constant vector and single element from
- // this constant vector to single insertelement instruction.
- // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
- // insertelement V, C[ci], ci-n
+ // Handle trivial case of a splat. Only check the first element of LHS
+ // operand.
+ if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
+ DemandedElts.isAllOnesValue()) {
+ if (!isa<UndefValue>(I->getOperand(1))) {
+ I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
+ MadeChange = true;
+ }
+ APInt LeftDemanded(OpWidth, 1);
+ APInt LHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
+ if (LHSUndefElts[0])
+ UndefElts = EltMask;
+ else
+ UndefElts.clearAllBits();
+ break;
+ }
+
+ APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
+ for (unsigned i = 0; i < VWidth; i++) {
+ if (DemandedElts[i]) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal != -1u) {
+ assert(MaskVal < OpWidth * 2 &&
+ "shufflevector mask index out of range!");
+ if (MaskVal < OpWidth)
+ LeftDemanded.setBit(MaskVal);
+ else
+ RightDemanded.setBit(MaskVal - OpWidth);
+ }
+ }
+ }
+
+ APInt LHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
+
+ APInt RHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 1, RightDemanded, RHSUndefElts);
+
+ // If this shuffle does not change the vector length and the elements
+ // demanded by this shuffle are an identity mask, then this shuffle is
+ // unnecessary.
+ //
+ // We are assuming canonical form for the mask, so the source vector is
+ // operand 0 and operand 1 is not used.
+ //
+ // Note that if an element is demanded and this shuffle mask is undefined
+ // for that element, then the shuffle is not considered an identity
+ // operation. The shuffle prevents poison from the operand vector from
+ // leaking to the result by replacing poison with an undefined value.
+ if (VWidth == OpWidth) {
+ bool IsIdentityShuffle = true;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (DemandedElts[i] && i != MaskVal) {
+ IsIdentityShuffle = false;
+ break;
+ }
+ }
+ if (IsIdentityShuffle)
+ return Shuffle->getOperand(0);
+ }
+
+ bool NewUndefElts = false;
+ unsigned LHSIdx = -1u, LHSValIdx = -1u;
+ unsigned RHSIdx = -1u, RHSValIdx = -1u;
+ bool LHSUniform = true;
+ bool RHSUniform = true;
+ for (unsigned i = 0; i < VWidth; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u) {
+ UndefElts.setBit(i);
+ } else if (!DemandedElts[i]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else if (MaskVal < OpWidth) {
+ if (LHSUndefElts[MaskVal]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ LHSIdx = LHSIdx == -1u ? i : OpWidth;
+ LHSValIdx = LHSValIdx == -1u ? MaskVal : OpWidth;
+ LHSUniform = LHSUniform && (MaskVal == i);
+ }
+ } else {
+ if (RHSUndefElts[MaskVal - OpWidth]) {
+ NewUndefElts = true;
+ UndefElts.setBit(i);
+ } else {
+ RHSIdx = RHSIdx == -1u ? i : OpWidth;
+ RHSValIdx = RHSValIdx == -1u ? MaskVal - OpWidth : OpWidth;
+ RHSUniform = RHSUniform && (MaskVal - OpWidth == i);
+ }
+ }
+ }
+
+ // Try to transform shuffle with constant vector and single element from
+ // this constant vector to single insertelement instruction.
+ // shufflevector V, C, <v1, v2, .., ci, .., vm> ->
+ // insertelement V, C[ci], ci-n
if (OpWidth ==
cast<FixedVectorType>(Shuffle->getType())->getNumElements()) {
- Value *Op = nullptr;
- Constant *Value = nullptr;
- unsigned Idx = -1u;
-
- // Find constant vector with the single element in shuffle (LHS or RHS).
- if (LHSIdx < OpWidth && RHSUniform) {
- if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
- Op = Shuffle->getOperand(1);
- Value = CV->getOperand(LHSValIdx);
- Idx = LHSIdx;
- }
- }
- if (RHSIdx < OpWidth && LHSUniform) {
- if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
- Op = Shuffle->getOperand(0);
- Value = CV->getOperand(RHSValIdx);
- Idx = RHSIdx;
- }
- }
- // Found constant vector with single element - convert to insertelement.
- if (Op && Value) {
- Instruction *New = InsertElementInst::Create(
- Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
- Shuffle->getName());
- InsertNewInstWith(New, *Shuffle);
- return New;
- }
- }
- if (NewUndefElts) {
- // Add additional discovered undefs.
- SmallVector<int, 16> Elts;
- for (unsigned i = 0; i < VWidth; ++i) {
- if (UndefElts[i])
- Elts.push_back(UndefMaskElem);
- else
- Elts.push_back(Shuffle->getMaskValue(i));
- }
- Shuffle->setShuffleMask(Elts);
- MadeChange = true;
- }
- break;
- }
- case Instruction::Select: {
- // If this is a vector select, try to transform the select condition based
- // on the current demanded elements.
- SelectInst *Sel = cast<SelectInst>(I);
- if (Sel->getCondition()->getType()->isVectorTy()) {
- // TODO: We are not doing anything with UndefElts based on this call.
- // It is overwritten below based on the other select operands. If an
- // element of the select condition is known undef, then we are free to
- // choose the output value from either arm of the select. If we know that
- // one of those values is undef, then the output can be undef.
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- }
-
- // Next, see if we can transform the arms of the select.
- APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
- if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
- for (unsigned i = 0; i < VWidth; i++) {
- // isNullValue() always returns false when called on a ConstantExpr.
- // Skip constant expressions to avoid propagating incorrect information.
- Constant *CElt = CV->getAggregateElement(i);
- if (isa<ConstantExpr>(CElt))
- continue;
- // TODO: If a select condition element is undef, we can demand from
- // either side. If one side is known undef, choosing that side would
- // propagate undef.
- if (CElt->isNullValue())
- DemandedLHS.clearBit(i);
- else
- DemandedRHS.clearBit(i);
- }
- }
-
- simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
- simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
-
- // Output elements are undefined if the element from each arm is undefined.
- // TODO: This can be improved. See comment in select condition handling.
- UndefElts = UndefElts2 & UndefElts3;
- break;
- }
- case Instruction::BitCast: {
- // Vector->vector casts only.
- VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
- if (!VTy) break;
+ Value *Op = nullptr;
+ Constant *Value = nullptr;
+ unsigned Idx = -1u;
+
+ // Find constant vector with the single element in shuffle (LHS or RHS).
+ if (LHSIdx < OpWidth && RHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(0))) {
+ Op = Shuffle->getOperand(1);
+ Value = CV->getOperand(LHSValIdx);
+ Idx = LHSIdx;
+ }
+ }
+ if (RHSIdx < OpWidth && LHSUniform) {
+ if (auto *CV = dyn_cast<ConstantVector>(Shuffle->getOperand(1))) {
+ Op = Shuffle->getOperand(0);
+ Value = CV->getOperand(RHSValIdx);
+ Idx = RHSIdx;
+ }
+ }
+ // Found constant vector with single element - convert to insertelement.
+ if (Op && Value) {
+ Instruction *New = InsertElementInst::Create(
+ Op, Value, ConstantInt::get(Type::getInt32Ty(I->getContext()), Idx),
+ Shuffle->getName());
+ InsertNewInstWith(New, *Shuffle);
+ return New;
+ }
+ }
+ if (NewUndefElts) {
+ // Add additional discovered undefs.
+ SmallVector<int, 16> Elts;
+ for (unsigned i = 0; i < VWidth; ++i) {
+ if (UndefElts[i])
+ Elts.push_back(UndefMaskElem);
+ else
+ Elts.push_back(Shuffle->getMaskValue(i));
+ }
+ Shuffle->setShuffleMask(Elts);
+ MadeChange = true;
+ }
+ break;
+ }
+ case Instruction::Select: {
+ // If this is a vector select, try to transform the select condition based
+ // on the current demanded elements.
+ SelectInst *Sel = cast<SelectInst>(I);
+ if (Sel->getCondition()->getType()->isVectorTy()) {
+ // TODO: We are not doing anything with UndefElts based on this call.
+ // It is overwritten below based on the other select operands. If an
+ // element of the select condition is known undef, then we are free to
+ // choose the output value from either arm of the select. If we know that
+ // one of those values is undef, then the output can be undef.
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ }
+
+ // Next, see if we can transform the arms of the select.
+ APInt DemandedLHS(DemandedElts), DemandedRHS(DemandedElts);
+ if (auto *CV = dyn_cast<ConstantVector>(Sel->getCondition())) {
+ for (unsigned i = 0; i < VWidth; i++) {
+ // isNullValue() always returns false when called on a ConstantExpr.
+ // Skip constant expressions to avoid propagating incorrect information.
+ Constant *CElt = CV->getAggregateElement(i);
+ if (isa<ConstantExpr>(CElt))
+ continue;
+ // TODO: If a select condition element is undef, we can demand from
+ // either side. If one side is known undef, choosing that side would
+ // propagate undef.
+ if (CElt->isNullValue())
+ DemandedLHS.clearBit(i);
+ else
+ DemandedRHS.clearBit(i);
+ }
+ }
+
+ simplifyAndSetOp(I, 1, DemandedLHS, UndefElts2);
+ simplifyAndSetOp(I, 2, DemandedRHS, UndefElts3);
+
+ // Output elements are undefined if the element from each arm is undefined.
+ // TODO: This can be improved. See comment in select condition handling.
+ UndefElts = UndefElts2 & UndefElts3;
+ break;
+ }
+ case Instruction::BitCast: {
+ // Vector->vector casts only.
+ VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+ if (!VTy) break;
unsigned InVWidth = cast<FixedVectorType>(VTy)->getNumElements();
- APInt InputDemandedElts(InVWidth, 0);
- UndefElts2 = APInt(InVWidth, 0);
- unsigned Ratio;
-
- if (VWidth == InVWidth) {
- // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
- // elements as are demanded of us.
- Ratio = 1;
- InputDemandedElts = DemandedElts;
- } else if ((VWidth % InVWidth) == 0) {
- // If the number of elements in the output is a multiple of the number of
- // elements in the input then an input element is live if any of the
- // corresponding output elements are live.
- Ratio = VWidth / InVWidth;
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
- if (DemandedElts[OutIdx])
- InputDemandedElts.setBit(OutIdx / Ratio);
- } else if ((InVWidth % VWidth) == 0) {
- // If the number of elements in the input is a multiple of the number of
- // elements in the output then an input element is live if the
- // corresponding output element is live.
- Ratio = InVWidth / VWidth;
- for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
- if (DemandedElts[InIdx / Ratio])
- InputDemandedElts.setBit(InIdx);
- } else {
- // Unsupported so far.
- break;
- }
-
- simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
-
- if (VWidth == InVWidth) {
- UndefElts = UndefElts2;
- } else if ((VWidth % InVWidth) == 0) {
- // If the number of elements in the output is a multiple of the number of
- // elements in the input then an output element is undef if the
- // corresponding input element is undef.
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
- if (UndefElts2[OutIdx / Ratio])
- UndefElts.setBit(OutIdx);
- } else if ((InVWidth % VWidth) == 0) {
- // If the number of elements in the input is a multiple of the number of
- // elements in the output then an output element is undef if all of the
- // corresponding input elements are undef.
- for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
- APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
- if (SubUndef.countPopulation() == Ratio)
- UndefElts.setBit(OutIdx);
- }
- } else {
- llvm_unreachable("Unimp");
- }
- break;
- }
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- break;
-
- case Instruction::Call: {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- if (!II) break;
- switch (II->getIntrinsicID()) {
- case Intrinsic::masked_gather: // fallthrough
- case Intrinsic::masked_load: {
- // Subtlety: If we load from a pointer, the pointer must be valid
- // regardless of whether the element is demanded. Doing otherwise risks
- // segfaults which didn't exist in the original program.
- APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
- DemandedPassThrough(DemandedElts);
- if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
- for (unsigned i = 0; i < VWidth; i++) {
- Constant *CElt = CV->getAggregateElement(i);
- if (CElt->isNullValue())
- DemandedPtrs.clearBit(i);
- else if (CElt->isAllOnesValue())
- DemandedPassThrough.clearBit(i);
- }
- if (II->getIntrinsicID() == Intrinsic::masked_gather)
- simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
- simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
-
- // Output elements are undefined if the elements from both sources are.
- // TODO: can strengthen via mask as well.
- UndefElts = UndefElts2 & UndefElts3;
- break;
- }
- default: {
+ APInt InputDemandedElts(InVWidth, 0);
+ UndefElts2 = APInt(InVWidth, 0);
+ unsigned Ratio;
+
+ if (VWidth == InVWidth) {
+ // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+ // elements as are demanded of us.
+ Ratio = 1;
+ InputDemandedElts = DemandedElts;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an input element is live if any of the
+ // corresponding output elements are live.
+ Ratio = VWidth / InVWidth;
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (DemandedElts[OutIdx])
+ InputDemandedElts.setBit(OutIdx / Ratio);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an input element is live if the
+ // corresponding output element is live.
+ Ratio = InVWidth / VWidth;
+ for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+ if (DemandedElts[InIdx / Ratio])
+ InputDemandedElts.setBit(InIdx);
+ } else {
+ // Unsupported so far.
+ break;
+ }
+
+ simplifyAndSetOp(I, 0, InputDemandedElts, UndefElts2);
+
+ if (VWidth == InVWidth) {
+ UndefElts = UndefElts2;
+ } else if ((VWidth % InVWidth) == 0) {
+ // If the number of elements in the output is a multiple of the number of
+ // elements in the input then an output element is undef if the
+ // corresponding input element is undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+ if (UndefElts2[OutIdx / Ratio])
+ UndefElts.setBit(OutIdx);
+ } else if ((InVWidth % VWidth) == 0) {
+ // If the number of elements in the input is a multiple of the number of
+ // elements in the output then an output element is undef if all of the
+ // corresponding input elements are undef.
+ for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+ APInt SubUndef = UndefElts2.lshr(OutIdx * Ratio).zextOrTrunc(Ratio);
+ if (SubUndef.countPopulation() == Ratio)
+ UndefElts.setBit(OutIdx);
+ }
+ } else {
+ llvm_unreachable("Unimp");
+ }
+ break;
+ }
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ break;
+
+ case Instruction::Call: {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ if (!II) break;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::masked_gather: // fallthrough
+ case Intrinsic::masked_load: {
+ // Subtlety: If we load from a pointer, the pointer must be valid
+ // regardless of whether the element is demanded. Doing otherwise risks
+ // segfaults which didn't exist in the original program.
+ APInt DemandedPtrs(APInt::getAllOnesValue(VWidth)),
+ DemandedPassThrough(DemandedElts);
+ if (auto *CV = dyn_cast<ConstantVector>(II->getOperand(2)))
+ for (unsigned i = 0; i < VWidth; i++) {
+ Constant *CElt = CV->getAggregateElement(i);
+ if (CElt->isNullValue())
+ DemandedPtrs.clearBit(i);
+ else if (CElt->isAllOnesValue())
+ DemandedPassThrough.clearBit(i);
+ }
+ if (II->getIntrinsicID() == Intrinsic::masked_gather)
+ simplifyAndSetOp(II, 0, DemandedPtrs, UndefElts2);
+ simplifyAndSetOp(II, 3, DemandedPassThrough, UndefElts3);
+
+ // Output elements are undefined if the elements from both sources are.
+ // TODO: can strengthen via mask as well.
+ UndefElts = UndefElts2 & UndefElts3;
+ break;
+ }
+ default: {
// Handle target specific intrinsics
Optional<Value *> V = targetSimplifyDemandedVectorEltsIntrinsic(
*II, DemandedElts, UndefElts, UndefElts2, UndefElts3,
simplifyAndSetOp);
if (V.hasValue())
return V.getValue();
- break;
- }
- } // switch on IntrinsicID
- break;
- } // case Call
- } // switch on Opcode
-
- // TODO: We bail completely on integer div/rem and shifts because they have
- // UB/poison potential, but that should be refined.
- BinaryOperator *BO;
- if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
- simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
- simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
-
- // Any change to an instruction with potential poison must clear those flags
- // because we can not guarantee those constraints now. Other analysis may
- // determine that it is safe to re-apply the flags.
- if (MadeChange)
- BO->dropPoisonGeneratingFlags();
-
- // Output elements are undefined if both are undefined. Consider things
- // like undef & 0. The result is known zero, not undef.
- UndefElts &= UndefElts2;
- }
-
- // If we've proven all of the lanes undef, return an undef value.
- // TODO: Intersect w/demanded lanes
- if (UndefElts.isAllOnesValue())
- return UndefValue::get(I->getType());
-
- return MadeChange ? I : nullptr;
-}
+ break;
+ }
+ } // switch on IntrinsicID
+ break;
+ } // case Call
+ } // switch on Opcode
+
+ // TODO: We bail completely on integer div/rem and shifts because they have
+ // UB/poison potential, but that should be refined.
+ BinaryOperator *BO;
+ if (match(I, m_BinOp(BO)) && !BO->isIntDivRem() && !BO->isShift()) {
+ simplifyAndSetOp(I, 0, DemandedElts, UndefElts);
+ simplifyAndSetOp(I, 1, DemandedElts, UndefElts2);
+
+ // Any change to an instruction with potential poison must clear those flags
+ // because we can not guarantee those constraints now. Other analysis may
+ // determine that it is safe to re-apply the flags.
+ if (MadeChange)
+ BO->dropPoisonGeneratingFlags();
+
+ // Output elements are undefined if both are undefined. Consider things
+ // like undef & 0. The result is known zero, not undef.
+ UndefElts &= UndefElts2;
+ }
+
+ // If we've proven all of the lanes undef, return an undef value.
+ // TODO: Intersect w/demanded lanes
+ if (UndefElts.isAllOnesValue())
+ return UndefValue::get(I->getType());
+
+ return MadeChange ? I : nullptr;
+}
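A hypothetical textual-IR sketch of the shufflevector-to-insertelement fold implemented above; the names %v and %r are invented for illustration and the constant operand is a simple splat chosen for brevity:

  ; before: lane 2 is the only lane taken from the constant operand,
  ; every other lane keeps its position in %v
  %r = shufflevector <4 x i32> %v, <4 x i32> <i32 7, i32 7, i32 7, i32 7>,
                     <4 x i32> <i32 0, i32 1, i32 6, i32 3>
  ; after: the single used constant element C[2] (= 7) is inserted at index 2
  %r = insertelement <4 x i32> %v, i32 7, i32 2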
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index 42c981566c..06f22cdfb6 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1,710 +1,710 @@
-//===- InstCombineVectorOps.cpp -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements instcombine for ExtractElement, InsertElement and
-// ShuffleVector.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallVector.h"
+//===- InstCombineVectorOps.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements instcombine for ExtractElement, InsertElement and
+// ShuffleVector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
STATISTIC(NumAggregateReconstructionsSimplified,
"Number of aggregate reconstructions turned into reuse of the "
"original aggregate");
-/// Return true if the value is cheaper to scalarize than it is to leave as a
-/// vector operation. IsConstantExtractIndex indicates whether we are extracting
-/// one known element from a vector constant.
-///
-/// FIXME: It's possible to create more instructions than previously existed.
-static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
- // If we can pick a scalar constant value out of a vector, that is free.
- if (auto *C = dyn_cast<Constant>(V))
- return IsConstantExtractIndex || C->getSplatValue();
-
- // An insertelement to the same constant index as our extract will simplify
- // to the scalar inserted element. An insertelement to a different constant
- // index is irrelevant to our extract.
- if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
- return IsConstantExtractIndex;
-
- if (match(V, m_OneUse(m_Load(m_Value()))))
- return true;
-
- if (match(V, m_OneUse(m_UnOp())))
- return true;
-
- Value *V0, *V1;
- if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
- return true;
-
- CmpInst::Predicate UnusedPred;
- if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
- if (cheapToScalarize(V0, IsConstantExtractIndex) ||
- cheapToScalarize(V1, IsConstantExtractIndex))
- return true;
-
- return false;
-}
-
-// If we have a PHI node with a vector type that is only used to feed
-// itself and be an operand of extractelement at a constant location,
-// try to replace the PHI of the vector type with a PHI of a scalar type.
+/// Return true if the value is cheaper to scalarize than it is to leave as a
+/// vector operation. IsConstantExtractIndex indicates whether we are extracting
+/// one known element from a vector constant.
+///
+/// FIXME: It's possible to create more instructions than previously existed.
+static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
+ // If we can pick a scalar constant value out of a vector, that is free.
+ if (auto *C = dyn_cast<Constant>(V))
+ return IsConstantExtractIndex || C->getSplatValue();
+
+ // An insertelement to the same constant index as our extract will simplify
+ // to the scalar inserted element. An insertelement to a different constant
+ // index is irrelevant to our extract.
+ if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
+ return IsConstantExtractIndex;
+
+ if (match(V, m_OneUse(m_Load(m_Value()))))
+ return true;
+
+ if (match(V, m_OneUse(m_UnOp())))
+ return true;
+
+ Value *V0, *V1;
+ if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
+ if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+ cheapToScalarize(V1, IsConstantExtractIndex))
+ return true;
+
+ CmpInst::Predicate UnusedPred;
+ if (match(V, m_OneUse(m_Cmp(UnusedPred, m_Value(V0), m_Value(V1)))))
+ if (cheapToScalarize(V0, IsConstantExtractIndex) ||
+ cheapToScalarize(V1, IsConstantExtractIndex))
+ return true;
+
+ return false;
+}
+
+// If we have a PHI node with a vector type that is only used to feed
+// itself and be an operand of extractelement at a constant location,
+// try to replace the PHI of the vector type with a PHI of a scalar type.
Instruction *InstCombinerImpl::scalarizePHI(ExtractElementInst &EI,
PHINode *PN) {
- SmallVector<Instruction *, 2> Extracts;
- // The users we want the PHI to have are:
- // 1) The EI ExtractElement (we already know this)
- // 2) Possibly more ExtractElements with the same index.
- // 3) Another operand, which will feed back into the PHI.
- Instruction *PHIUser = nullptr;
- for (auto U : PN->users()) {
- if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
- if (EI.getIndexOperand() == EU->getIndexOperand())
- Extracts.push_back(EU);
- else
- return nullptr;
- } else if (!PHIUser) {
- PHIUser = cast<Instruction>(U);
- } else {
- return nullptr;
- }
- }
-
- if (!PHIUser)
- return nullptr;
-
- // Verify that this PHI user has one use, which is the PHI itself,
- // and that it is a binary operation which is cheap to scalarize.
- // Otherwise, return nullptr.
- if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
- !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
- return nullptr;
-
- // Create a scalar PHI node that will replace the vector PHI node
- // just before the current PHI node.
- PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
- PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
- // Scalarize each PHI operand.
- for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
- Value *PHIInVal = PN->getIncomingValue(i);
- BasicBlock *inBB = PN->getIncomingBlock(i);
- Value *Elt = EI.getIndexOperand();
- // If the operand is the PHI induction variable:
- if (PHIInVal == PHIUser) {
- // Scalarize the binary operation. Its first operand is the
- // scalar PHI, and the second operand is extracted from the other
- // vector operand.
- BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
- unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
- Value *Op = InsertNewInstWith(
- ExtractElementInst::Create(B0->getOperand(opId), Elt,
- B0->getOperand(opId)->getName() + ".Elt"),
- *B0);
- Value *newPHIUser = InsertNewInstWith(
- BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
- scalarPHI, Op, B0), *B0);
- scalarPHI->addIncoming(newPHIUser, inBB);
- } else {
- // Scalarize PHI input:
- Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
- // Insert the new instruction into the predecessor basic block.
- Instruction *pos = dyn_cast<Instruction>(PHIInVal);
- BasicBlock::iterator InsertPos;
- if (pos && !isa<PHINode>(pos)) {
- InsertPos = ++pos->getIterator();
- } else {
- InsertPos = inBB->getFirstInsertionPt();
- }
-
- InsertNewInstWith(newEI, *InsertPos);
-
- scalarPHI->addIncoming(newEI, inBB);
- }
- }
-
- for (auto E : Extracts)
- replaceInstUsesWith(*E, scalarPHI);
-
- return &EI;
-}
-
-static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
- InstCombiner::BuilderTy &Builder,
- bool IsBigEndian) {
- Value *X;
- uint64_t ExtIndexC;
- if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
- !X->getType()->isVectorTy() ||
- !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
- return nullptr;
-
- // If this extractelement is using a bitcast from a vector of the same number
- // of elements, see if we can find the source element from the source vector:
- // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
- auto *SrcTy = cast<VectorType>(X->getType());
- Type *DestTy = Ext.getType();
+ SmallVector<Instruction *, 2> Extracts;
+ // The users we want the PHI to have are:
+ // 1) The EI ExtractElement (we already know this)
+ // 2) Possibly more ExtractElements with the same index.
+ // 3) Another operand, which will feed back into the PHI.
+ Instruction *PHIUser = nullptr;
+ for (auto U : PN->users()) {
+ if (ExtractElementInst *EU = dyn_cast<ExtractElementInst>(U)) {
+ if (EI.getIndexOperand() == EU->getIndexOperand())
+ Extracts.push_back(EU);
+ else
+ return nullptr;
+ } else if (!PHIUser) {
+ PHIUser = cast<Instruction>(U);
+ } else {
+ return nullptr;
+ }
+ }
+
+ if (!PHIUser)
+ return nullptr;
+
+ // Verify that this PHI user has one use, which is the PHI itself,
+ // and that it is a binary operation which is cheap to scalarize.
+ // Otherwise, return nullptr.
+ if (!PHIUser->hasOneUse() || !(PHIUser->user_back() == PN) ||
+ !(isa<BinaryOperator>(PHIUser)) || !cheapToScalarize(PHIUser, true))
+ return nullptr;
+
+ // Create a scalar PHI node that will replace the vector PHI node
+ // just before the current PHI node.
+ PHINode *scalarPHI = cast<PHINode>(InsertNewInstWith(
+ PHINode::Create(EI.getType(), PN->getNumIncomingValues(), ""), *PN));
+ // Scalarize each PHI operand.
+ for (unsigned i = 0; i < PN->getNumIncomingValues(); i++) {
+ Value *PHIInVal = PN->getIncomingValue(i);
+ BasicBlock *inBB = PN->getIncomingBlock(i);
+ Value *Elt = EI.getIndexOperand();
+ // If the operand is the PHI induction variable:
+ if (PHIInVal == PHIUser) {
+ // Scalarize the binary operation. Its first operand is the
+ // scalar PHI, and the second operand is extracted from the other
+ // vector operand.
+ BinaryOperator *B0 = cast<BinaryOperator>(PHIUser);
+ unsigned opId = (B0->getOperand(0) == PN) ? 1 : 0;
+ Value *Op = InsertNewInstWith(
+ ExtractElementInst::Create(B0->getOperand(opId), Elt,
+ B0->getOperand(opId)->getName() + ".Elt"),
+ *B0);
+ Value *newPHIUser = InsertNewInstWith(
+ BinaryOperator::CreateWithCopiedFlags(B0->getOpcode(),
+ scalarPHI, Op, B0), *B0);
+ scalarPHI->addIncoming(newPHIUser, inBB);
+ } else {
+ // Scalarize PHI input:
+ Instruction *newEI = ExtractElementInst::Create(PHIInVal, Elt, "");
+ // Insert the new instruction into the predecessor basic block.
+ Instruction *pos = dyn_cast<Instruction>(PHIInVal);
+ BasicBlock::iterator InsertPos;
+ if (pos && !isa<PHINode>(pos)) {
+ InsertPos = ++pos->getIterator();
+ } else {
+ InsertPos = inBB->getFirstInsertionPt();
+ }
+
+ InsertNewInstWith(newEI, *InsertPos);
+
+ scalarPHI->addIncoming(newEI, inBB);
+ }
+ }
+
+ for (auto E : Extracts)
+ replaceInstUsesWith(*E, scalarPHI);
+
+ return &EI;
+}
+
+static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
+ InstCombiner::BuilderTy &Builder,
+ bool IsBigEndian) {
+ Value *X;
+ uint64_t ExtIndexC;
+ if (!match(Ext.getVectorOperand(), m_BitCast(m_Value(X))) ||
+ !X->getType()->isVectorTy() ||
+ !match(Ext.getIndexOperand(), m_ConstantInt(ExtIndexC)))
+ return nullptr;
+
+ // If this extractelement is using a bitcast from a vector of the same number
+ // of elements, see if we can find the source element from the source vector:
+ // extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
+ auto *SrcTy = cast<VectorType>(X->getType());
+ Type *DestTy = Ext.getType();
ElementCount NumSrcElts = SrcTy->getElementCount();
ElementCount NumElts =
cast<VectorType>(Ext.getVectorOperandType())->getElementCount();
- if (NumSrcElts == NumElts)
- if (Value *Elt = findScalarElement(X, ExtIndexC))
- return new BitCastInst(Elt, DestTy);
-
+ if (NumSrcElts == NumElts)
+ if (Value *Elt = findScalarElement(X, ExtIndexC))
+ return new BitCastInst(Elt, DestTy);
+
assert(NumSrcElts.isScalable() == NumElts.isScalable() &&
"Src and Dst must be the same sort of vector type");
- // If the source elements are wider than the destination, try to shift and
- // truncate a subset of scalar bits of an insert op.
+ // If the source elements are wider than the destination, try to shift and
+ // truncate a subset of scalar bits of an insert op.
if (NumSrcElts.getKnownMinValue() < NumElts.getKnownMinValue()) {
- Value *Scalar;
- uint64_t InsIndexC;
- if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
- m_ConstantInt(InsIndexC))))
- return nullptr;
-
- // The extract must be from the subset of vector elements that we inserted
- // into. Example: if we inserted element 1 of a <2 x i64> and we are
- // extracting an i16 (narrowing ratio = 4), then this extract must be from 1
- // of elements 4-7 of the bitcasted vector.
+ Value *Scalar;
+ uint64_t InsIndexC;
+ if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(InsIndexC))))
+ return nullptr;
+
+ // The extract must be from the subset of vector elements that we inserted
+ // into. Example: if we inserted element 1 of a <2 x i64> and we are
+ // extracting an i16 (narrowing ratio = 4), then this extract must be from 1
+ // of elements 4-7 of the bitcasted vector.
unsigned NarrowingRatio =
NumElts.getKnownMinValue() / NumSrcElts.getKnownMinValue();
- if (ExtIndexC / NarrowingRatio != InsIndexC)
- return nullptr;
-
- // We are extracting part of the original scalar. How that scalar is
- // inserted into the vector depends on the endian-ness. Example:
- // Vector Byte Elt Index: 0 1 2 3 4 5 6 7
- // +--+--+--+--+--+--+--+--+
- // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3|
- // extelt <4 x i16> V', 3: | |S2|S3|
- // +--+--+--+--+--+--+--+--+
- // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
- // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
- // In this example, we must right-shift little-endian. Big-endian is just a
- // truncate.
- unsigned Chunk = ExtIndexC % NarrowingRatio;
- if (IsBigEndian)
- Chunk = NarrowingRatio - 1 - Chunk;
-
- // Bail out if this is an FP vector to FP vector sequence. That would take
- // more instructions than we started with unless there is no shift, and it
- // may not be handled as well in the backend.
- bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
- bool NeedDestBitcast = DestTy->isFloatingPointTy();
- if (NeedSrcBitcast && NeedDestBitcast)
- return nullptr;
-
- unsigned SrcWidth = SrcTy->getScalarSizeInBits();
- unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
- unsigned ShAmt = Chunk * DestWidth;
-
- // TODO: This limitation is more strict than necessary. We could sum the
- // number of new instructions and subtract the number eliminated to know if
- // we can proceed.
- if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
- if (NeedSrcBitcast || NeedDestBitcast)
- return nullptr;
-
- if (NeedSrcBitcast) {
- Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
- Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
- }
-
- if (ShAmt) {
- // Bail out if we could end with more instructions than we started with.
- if (!Ext.getVectorOperand()->hasOneUse())
- return nullptr;
- Scalar = Builder.CreateLShr(Scalar, ShAmt);
- }
-
- if (NeedDestBitcast) {
- Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
- return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
- }
- return new TruncInst(Scalar, DestTy);
- }
-
- return nullptr;
-}
-
-/// Find elements of V demanded by UserInstr.
-static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
+ if (ExtIndexC / NarrowingRatio != InsIndexC)
+ return nullptr;
+
+ // We are extracting part of the original scalar. How that scalar is
+ // inserted into the vector depends on the endian-ness. Example:
+ // Vector Byte Elt Index: 0 1 2 3 4 5 6 7
+ // +--+--+--+--+--+--+--+--+
+ // inselt <2 x i32> V, <i32> S, 1: |V0|V1|V2|V3|S0|S1|S2|S3|
+ // extelt <4 x i16> V', 3: | |S2|S3|
+ // +--+--+--+--+--+--+--+--+
+ // If this is little-endian, S2|S3 are the MSB of the 32-bit 'S' value.
+ // If this is big-endian, S2|S3 are the LSB of the 32-bit 'S' value.
+ // In this example, we must right-shift little-endian. Big-endian is just a
+ // truncate.
+ unsigned Chunk = ExtIndexC % NarrowingRatio;
+ if (IsBigEndian)
+ Chunk = NarrowingRatio - 1 - Chunk;
+
+ // Bail out if this is an FP vector to FP vector sequence. That would take
+ // more instructions than we started with unless there is no shift, and it
+ // may not be handled as well in the backend.
+ bool NeedSrcBitcast = SrcTy->getScalarType()->isFloatingPointTy();
+ bool NeedDestBitcast = DestTy->isFloatingPointTy();
+ if (NeedSrcBitcast && NeedDestBitcast)
+ return nullptr;
+
+ unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+ unsigned DestWidth = DestTy->getPrimitiveSizeInBits();
+ unsigned ShAmt = Chunk * DestWidth;
+
+ // TODO: This limitation is more strict than necessary. We could sum the
+ // number of new instructions and subtract the number eliminated to know if
+ // we can proceed.
+ if (!X->hasOneUse() || !Ext.getVectorOperand()->hasOneUse())
+ if (NeedSrcBitcast || NeedDestBitcast)
+ return nullptr;
+
+ if (NeedSrcBitcast) {
+ Type *SrcIntTy = IntegerType::getIntNTy(Scalar->getContext(), SrcWidth);
+ Scalar = Builder.CreateBitCast(Scalar, SrcIntTy);
+ }
+
+ if (ShAmt) {
+ // Bail out if we could end with more instructions than we started with.
+ if (!Ext.getVectorOperand()->hasOneUse())
+ return nullptr;
+ Scalar = Builder.CreateLShr(Scalar, ShAmt);
+ }
+
+ if (NeedDestBitcast) {
+ Type *DestIntTy = IntegerType::getIntNTy(Scalar->getContext(), DestWidth);
+ return new BitCastInst(Builder.CreateTrunc(Scalar, DestIntTy), DestTy);
+ }
+ return new TruncInst(Scalar, DestTy);
+ }
+
+ return nullptr;
+}
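A hypothetical little-endian sketch of the insert/bitcast/extract narrowing handled above (value names invented for illustration). Extracting i16 lane 3 of the bitcast reads the high half of the freshly inserted i32, so the extract reduces to a shift plus truncate of that scalar:

  ; before
  %x = insertelement <2 x i32> %v, i32 %s, i32 1
  %b = bitcast <2 x i32> %x to <4 x i16>
  %e = extractelement <4 x i16> %b, i32 3
  ; after (little-endian: ExtIndexC / NarrowingRatio == InsIndexC, Chunk == 1)
  %hi = lshr i32 %s, 16
  %e  = trunc i32 %hi to i16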
+
+/// Find elements of V demanded by UserInstr.
+static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
-
- // Conservatively assume that all elements are needed.
- APInt UsedElts(APInt::getAllOnesValue(VWidth));
-
- switch (UserInstr->getOpcode()) {
- case Instruction::ExtractElement: {
- ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
- assert(EEI->getVectorOperand() == V);
- ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
- if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
- UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
- }
- break;
- }
- case Instruction::ShuffleVector: {
- ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
- unsigned MaskNumElts =
+
+ // Conservatively assume that all elements are needed.
+ APInt UsedElts(APInt::getAllOnesValue(VWidth));
+
+ switch (UserInstr->getOpcode()) {
+ case Instruction::ExtractElement: {
+ ExtractElementInst *EEI = cast<ExtractElementInst>(UserInstr);
+ assert(EEI->getVectorOperand() == V);
+ ConstantInt *EEIIndexC = dyn_cast<ConstantInt>(EEI->getIndexOperand());
+ if (EEIIndexC && EEIIndexC->getValue().ult(VWidth)) {
+ UsedElts = APInt::getOneBitSet(VWidth, EEIIndexC->getZExtValue());
+ }
+ break;
+ }
+ case Instruction::ShuffleVector: {
+ ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
+ unsigned MaskNumElts =
cast<FixedVectorType>(UserInstr->getType())->getNumElements();
-
- UsedElts = APInt(VWidth, 0);
- for (unsigned i = 0; i < MaskNumElts; i++) {
- unsigned MaskVal = Shuffle->getMaskValue(i);
- if (MaskVal == -1u || MaskVal >= 2 * VWidth)
- continue;
- if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
- UsedElts.setBit(MaskVal);
- if (Shuffle->getOperand(1) == V &&
- ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
- UsedElts.setBit(MaskVal - VWidth);
- }
- break;
- }
- default:
- break;
- }
- return UsedElts;
-}
-
-/// Find union of elements of V demanded by all its users.
-/// If it is known by querying findDemandedEltsBySingleUser that
-/// no user demands an element of V, then the corresponding bit
-/// remains unset in the returned value.
-static APInt findDemandedEltsByAllUsers(Value *V) {
+
+ UsedElts = APInt(VWidth, 0);
+ for (unsigned i = 0; i < MaskNumElts; i++) {
+ unsigned MaskVal = Shuffle->getMaskValue(i);
+ if (MaskVal == -1u || MaskVal >= 2 * VWidth)
+ continue;
+ if (Shuffle->getOperand(0) == V && (MaskVal < VWidth))
+ UsedElts.setBit(MaskVal);
+ if (Shuffle->getOperand(1) == V &&
+ ((MaskVal >= VWidth) && (MaskVal < 2 * VWidth)))
+ UsedElts.setBit(MaskVal - VWidth);
+ }
+ break;
+ }
+ default:
+ break;
+ }
+ return UsedElts;
+}
+
+/// Find union of elements of V demanded by all its users.
+/// If it is known by querying findDemandedEltsBySingleUser that
+/// no user demands an element of V, then the corresponding bit
+/// remains unset in the returned value.
+static APInt findDemandedEltsByAllUsers(Value *V) {
unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
-
- APInt UnionUsedElts(VWidth, 0);
- for (const Use &U : V->uses()) {
- if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
- UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
- } else {
- UnionUsedElts = APInt::getAllOnesValue(VWidth);
- break;
- }
-
- if (UnionUsedElts.isAllOnesValue())
- break;
- }
-
- return UnionUsedElts;
-}
-
+
+ APInt UnionUsedElts(VWidth, 0);
+ for (const Use &U : V->uses()) {
+ if (Instruction *I = dyn_cast<Instruction>(U.getUser())) {
+ UnionUsedElts |= findDemandedEltsBySingleUser(V, I);
+ } else {
+ UnionUsedElts = APInt::getAllOnesValue(VWidth);
+ break;
+ }
+
+ if (UnionUsedElts.isAllOnesValue())
+ break;
+ }
+
+ return UnionUsedElts;
+}
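A hypothetical sketch of how the demanded-element union above is used (names invented). If every user of %v extracts only lane 0 or lane 2, the union is {0, 2}, so a trailing insert into lane 3 is not demanded and would be expected to be removed via SimplifyDemandedVectorElts:

  %v0 = insertelement <4 x i32> undef, i32 %a, i32 0
  %v1 = insertelement <4 x i32> %v0, i32 %b, i32 2
  %v  = insertelement <4 x i32> %v1, i32 %c, i32 3   ; lane 3 is never read
  %e0 = extractelement <4 x i32> %v, i32 0
  %e2 = extractelement <4 x i32> %v, i32 2
  ; expected: uses of %v are rewritten to %v1 and the lane-3 insert becomes dead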
+
Instruction *InstCombinerImpl::visitExtractElementInst(ExtractElementInst &EI) {
- Value *SrcVec = EI.getVectorOperand();
- Value *Index = EI.getIndexOperand();
- if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
- SQ.getWithInstruction(&EI)))
- return replaceInstUsesWith(EI, V);
-
- // If extracting a specified index from the vector, see if we can recursively
- // find a previously computed scalar that was inserted into the vector.
- auto *IndexC = dyn_cast<ConstantInt>(Index);
- if (IndexC) {
- ElementCount EC = EI.getVectorOperandType()->getElementCount();
+ Value *SrcVec = EI.getVectorOperand();
+ Value *Index = EI.getIndexOperand();
+ if (Value *V = SimplifyExtractElementInst(SrcVec, Index,
+ SQ.getWithInstruction(&EI)))
+ return replaceInstUsesWith(EI, V);
+
+ // If extracting a specified index from the vector, see if we can recursively
+ // find a previously computed scalar that was inserted into the vector.
+ auto *IndexC = dyn_cast<ConstantInt>(Index);
+ if (IndexC) {
+ ElementCount EC = EI.getVectorOperandType()->getElementCount();
unsigned NumElts = EC.getKnownMinValue();
-
- // InstSimplify should handle cases where the index is invalid.
- // For a fixed-length vector, it's invalid to extract an out-of-range element.
+
+ // InstSimplify should handle cases where the index is invalid.
+ // For a fixed-length vector, it's invalid to extract an out-of-range element.
if (!EC.isScalable() && IndexC->getValue().uge(NumElts))
- return nullptr;
-
- // This instruction only demands the single element from the input vector.
- // Skip scalable types, where the number of elements is unknown at
- // compile time.
+ return nullptr;
+
+ // This instruction only demands the single element from the input vector.
+ // Skip scalable types, where the number of elements is unknown at
+ // compile time.
if (!EC.isScalable() && NumElts != 1) {
- // If the input vector has a single use, simplify it based on this use
- // property.
- if (SrcVec->hasOneUse()) {
- APInt UndefElts(NumElts, 0);
- APInt DemandedElts(NumElts, 0);
- DemandedElts.setBit(IndexC->getZExtValue());
- if (Value *V =
- SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
- return replaceOperand(EI, 0, V);
- } else {
- // If the input vector has multiple uses, simplify it based on a union
- // of all elements used.
- APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
- if (!DemandedElts.isAllOnesValue()) {
- APInt UndefElts(NumElts, 0);
- if (Value *V = SimplifyDemandedVectorElts(
- SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
- true /* AllowMultipleUsers */)) {
- if (V != SrcVec) {
- SrcVec->replaceAllUsesWith(V);
- return &EI;
- }
- }
- }
- }
- }
- if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
- return I;
-
- // If there's a vector PHI feeding a scalar use through this extractelement
- // instruction, try to scalarize the PHI.
- if (auto *Phi = dyn_cast<PHINode>(SrcVec))
- if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
- return ScalarPHI;
- }
-
- // TODO: come up with an n-ary matcher that subsumes both unary and
- // binary matchers.
- UnaryOperator *UO;
- if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
- // extelt (unop X), Index --> unop (extelt X, Index)
- Value *X = UO->getOperand(0);
- Value *E = Builder.CreateExtractElement(X, Index);
- return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
- }
-
- BinaryOperator *BO;
- if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
- // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
- Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
- Value *E0 = Builder.CreateExtractElement(X, Index);
- Value *E1 = Builder.CreateExtractElement(Y, Index);
- return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
- }
-
- Value *X, *Y;
- CmpInst::Predicate Pred;
- if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
- cheapToScalarize(SrcVec, IndexC)) {
- // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
- Value *E0 = Builder.CreateExtractElement(X, Index);
- Value *E1 = Builder.CreateExtractElement(Y, Index);
- return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
- }
-
- if (auto *I = dyn_cast<Instruction>(SrcVec)) {
- if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- // Extracting the inserted element?
- if (IE->getOperand(2) == Index)
- return replaceInstUsesWith(EI, IE->getOperand(1));
- // If the inserted and extracted elements are constants, they must not
- // be the same value; extract from the pre-inserted value instead.
- if (isa<Constant>(IE->getOperand(2)) && IndexC)
- return replaceOperand(EI, 0, IE->getOperand(0));
- } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
- // If this is extracting an element from a shufflevector, figure out where
- // it came from and extract from the appropriate input element instead.
- // Restrict the following transformation to fixed-length vector.
- if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
- int SrcIdx =
- SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
- Value *Src;
- unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
- ->getNumElements();
-
- if (SrcIdx < 0)
- return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
- if (SrcIdx < (int)LHSWidth)
- Src = SVI->getOperand(0);
- else {
- SrcIdx -= LHSWidth;
- Src = SVI->getOperand(1);
- }
- Type *Int32Ty = Type::getInt32Ty(EI.getContext());
- return ExtractElementInst::Create(
- Src, ConstantInt::get(Int32Ty, SrcIdx, false));
- }
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- // Canonicalize extractelement(cast) -> cast(extractelement).
- // Bitcasts can change the number of vector elements, and they cost
- // nothing.
- if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
- Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
- return CastInst::Create(CI->getOpcode(), EE, EI.getType());
- }
- }
- }
- return nullptr;
-}
-
-/// If V is a shuffle of values that ONLY returns elements from either LHS or
-/// RHS, return the shuffle mask and true. Otherwise, return false.
-static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
- SmallVectorImpl<int> &Mask) {
- assert(LHS->getType() == RHS->getType() &&
- "Invalid CollectSingleShuffleElements");
+ // If the input vector has a single use, simplify it based on this use
+ // property.
+ if (SrcVec->hasOneUse()) {
+ APInt UndefElts(NumElts, 0);
+ APInt DemandedElts(NumElts, 0);
+ DemandedElts.setBit(IndexC->getZExtValue());
+ if (Value *V =
+ SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
+ return replaceOperand(EI, 0, V);
+ } else {
+ // If the input vector has multiple uses, simplify it based on a union
+ // of all elements used.
+ APInt DemandedElts = findDemandedEltsByAllUsers(SrcVec);
+ if (!DemandedElts.isAllOnesValue()) {
+ APInt UndefElts(NumElts, 0);
+ if (Value *V = SimplifyDemandedVectorElts(
+ SrcVec, DemandedElts, UndefElts, 0 /* Depth */,
+ true /* AllowMultipleUsers */)) {
+ if (V != SrcVec) {
+ SrcVec->replaceAllUsesWith(V);
+ return &EI;
+ }
+ }
+ }
+ }
+ }
+ if (Instruction *I = foldBitcastExtElt(EI, Builder, DL.isBigEndian()))
+ return I;
+
+ // If there's a vector PHI feeding a scalar use through this extractelement
+ // instruction, try to scalarize the PHI.
+ if (auto *Phi = dyn_cast<PHINode>(SrcVec))
+ if (Instruction *ScalarPHI = scalarizePHI(EI, Phi))
+ return ScalarPHI;
+ }
+
+ // TODO: come up with an n-ary matcher that subsumes both unary and
+ // binary matchers.
+ UnaryOperator *UO;
+ if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (unop X), Index --> unop (extelt X, Index)
+ Value *X = UO->getOperand(0);
+ Value *E = Builder.CreateExtractElement(X, Index);
+ return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
+ }
+
+ BinaryOperator *BO;
+ if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
+ Value *X = BO->getOperand(0), *Y = BO->getOperand(1);
+ Value *E0 = Builder.CreateExtractElement(X, Index);
+ Value *E1 = Builder.CreateExtractElement(Y, Index);
+ return BinaryOperator::CreateWithCopiedFlags(BO->getOpcode(), E0, E1, BO);
+ }
+
+ Value *X, *Y;
+ CmpInst::Predicate Pred;
+ if (match(SrcVec, m_Cmp(Pred, m_Value(X), m_Value(Y))) &&
+ cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (cmp X, Y), Index --> cmp (extelt X, Index), (extelt Y, Index)
+ Value *E0 = Builder.CreateExtractElement(X, Index);
+ Value *E1 = Builder.CreateExtractElement(Y, Index);
+ return CmpInst::Create(cast<CmpInst>(SrcVec)->getOpcode(), Pred, E0, E1);
+ }
+
+ if (auto *I = dyn_cast<Instruction>(SrcVec)) {
+ if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ // Extracting the inserted element?
+ if (IE->getOperand(2) == Index)
+ return replaceInstUsesWith(EI, IE->getOperand(1));
+ // If the inserted and extracted elements are constants, they must not
+ // be the same value; extract from the pre-inserted value instead.
+ if (isa<Constant>(IE->getOperand(2)) && IndexC)
+ return replaceOperand(EI, 0, IE->getOperand(0));
+ } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ // If this is extracting an element from a shufflevector, figure out where
+ // it came from and extract from the appropriate input element instead.
+ // Restrict the following transformation to fixed-length vector.
+ if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
+ int SrcIdx =
+ SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
+ Value *Src;
+ unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
+ ->getNumElements();
+
+ if (SrcIdx < 0)
+ return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+ if (SrcIdx < (int)LHSWidth)
+ Src = SVI->getOperand(0);
+ else {
+ SrcIdx -= LHSWidth;
+ Src = SVI->getOperand(1);
+ }
+ Type *Int32Ty = Type::getInt32Ty(EI.getContext());
+ return ExtractElementInst::Create(
+ Src, ConstantInt::get(Int32Ty, SrcIdx, false));
+ }
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ // Canonicalize extractelement(cast) -> cast(extractelement).
+ // Bitcasts can change the number of vector elements, and they cost
+ // nothing.
+ if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
+ Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
+ return CastInst::Create(CI->getOpcode(), EE, EI.getType());
+ }
+ }
+ }
+ return nullptr;
+}
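A hypothetical sketch of the extract-of-binop scalarization performed by the visitor above (names invented; one operand is a constant splat so that cheapToScalarize accepts the binary operator):

  ; before
  %add = add <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
  %e   = extractelement <4 x i32> %add, i32 0
  ; after (the extract of the constant operand folds to 1)
  %x0 = extractelement <4 x i32> %x, i32 0
  %e  = add i32 %x0, 1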
+
+/// If V is a shuffle of values that ONLY returns elements from either LHS or
+/// RHS, return the shuffle mask and true. Otherwise, return false.
+static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+ SmallVectorImpl<int> &Mask) {
+ assert(LHS->getType() == RHS->getType() &&
+ "Invalid CollectSingleShuffleElements");
unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
-
- if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, -1);
- return true;
- }
-
- if (V == LHS) {
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i);
- return true;
- }
-
- if (V == RHS) {
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i + NumElts);
- return true;
- }
-
- if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert of an extract from some other vector, include it.
- Value *VecOp = IEI->getOperand(0);
- Value *ScalarOp = IEI->getOperand(1);
- Value *IdxOp = IEI->getOperand(2);
-
- if (!isa<ConstantInt>(IdxOp))
- return false;
- unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
- if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
- // We can handle this if the vector we are inserting into is
- // transitively ok.
- if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
- // If so, update the mask to reflect the inserted undef.
- Mask[InsertedIdx] = -1;
- return true;
- }
- } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
- if (isa<ConstantInt>(EI->getOperand(1))) {
- unsigned ExtractedIdx =
- cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
- unsigned NumLHSElts =
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, -1);
+ return true;
+ }
+
+ if (V == LHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i);
+ return true;
+ }
+
+ if (V == RHS) {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i + NumElts);
+ return true;
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (!isa<ConstantInt>(IdxOp))
+ return false;
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ if (isa<UndefValue>(ScalarOp)) { // inserting undef into vector.
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted undef.
+ Mask[InsertedIdx] = -1;
+ return true;
+ }
+ } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+ if (isa<ConstantInt>(EI->getOperand(1))) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned NumLHSElts =
cast<FixedVectorType>(LHS->getType())->getNumElements();
-
- // This must be extracting from either LHS or RHS.
- if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
- // We can handle this if the vector we are inserting into is
- // transitively ok.
- if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
- // If so, update the mask to reflect the inserted value.
- if (EI->getOperand(0) == LHS) {
- Mask[InsertedIdx % NumElts] = ExtractedIdx;
- } else {
- assert(EI->getOperand(0) == RHS);
- Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
- }
- return true;
- }
- }
- }
- }
- }
-
- return false;
-}
-
-/// If we have insertion into a vector that is wider than the vector that we
-/// are extracting from, try to widen the source vector to allow a single
-/// shufflevector to replace one or more insert/extract pairs.
-static void replaceExtractElements(InsertElementInst *InsElt,
- ExtractElementInst *ExtElt,
+
+ // This must be extracting from either LHS or RHS.
+ if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+ // We can handle this if the vector we are inserting into is
+ // transitively ok.
+ if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+ // If so, update the mask to reflect the inserted value.
+ if (EI->getOperand(0) == LHS) {
+ Mask[InsertedIdx % NumElts] = ExtractedIdx;
+ } else {
+ assert(EI->getOperand(0) == RHS);
+ Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
+ }
+ return true;
+ }
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+/// If we have insertion into a vector that is wider than the vector that we
+/// are extracting from, try to widen the source vector to allow a single
+/// shufflevector to replace one or more insert/extract pairs.
+static void replaceExtractElements(InsertElementInst *InsElt,
+ ExtractElementInst *ExtElt,
InstCombinerImpl &IC) {
auto *InsVecType = cast<FixedVectorType>(InsElt->getType());
auto *ExtVecType = cast<FixedVectorType>(ExtElt->getVectorOperandType());
- unsigned NumInsElts = InsVecType->getNumElements();
- unsigned NumExtElts = ExtVecType->getNumElements();
-
- // The inserted-to vector must be wider than the extracted-from vector.
- if (InsVecType->getElementType() != ExtVecType->getElementType() ||
- NumExtElts >= NumInsElts)
- return;
-
- // Create a shuffle mask to widen the extracted-from vector using undefined
- // values. The mask selects all of the values of the original vector followed
- // by as many undefined values as needed to create a vector of the same length
- // as the inserted-to vector.
- SmallVector<int, 16> ExtendMask;
- for (unsigned i = 0; i < NumExtElts; ++i)
- ExtendMask.push_back(i);
- for (unsigned i = NumExtElts; i < NumInsElts; ++i)
- ExtendMask.push_back(-1);
-
- Value *ExtVecOp = ExtElt->getVectorOperand();
- auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
- BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
- ? ExtVecOpInst->getParent()
- : ExtElt->getParent();
-
- // TODO: This restriction matches the basic block check below when creating
- // new extractelement instructions. If that limitation is removed, this one
- // could also be removed. But for now, we just bail out to ensure that we
- // will replace the extractelement instruction that is feeding our
- // insertelement instruction. This allows the insertelement to then be
- // replaced by a shufflevector. If the insertelement is not replaced, we can
- // induce infinite looping because there's an optimization for extractelement
- // that will delete our widening shuffle. This would trigger another attempt
- // here to create that shuffle, and we spin forever.
- if (InsertionBlock != InsElt->getParent())
- return;
-
- // TODO: This restriction matches the check in visitInsertElementInst() and
- // prevents an infinite loop caused by not turning the extract/insert pair
- // into a shuffle. We really should not need either check, but we're lacking
- // folds for shufflevectors because we're afraid to generate shuffle masks
- // that the backend can't handle.
- if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
- return;
-
- auto *WideVec =
- new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);
-
- // Insert the new shuffle after the vector operand of the extract is defined
- // (as long as it's not a PHI) or at the start of the basic block of the
- // extract, so any subsequent extracts in the same basic block can use it.
- // TODO: Insert before the earliest ExtractElementInst that is replaced.
- if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
- WideVec->insertAfter(ExtVecOpInst);
- else
- IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
-
- // Replace extracts from the original narrow vector with extracts from the new
- // wide vector.
- for (User *U : ExtVecOp->users()) {
- ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
- if (!OldExt || OldExt->getParent() != WideVec->getParent())
- continue;
- auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
- NewExt->insertAfter(OldExt);
- IC.replaceInstUsesWith(*OldExt, NewExt);
- }
-}
-
-/// We are building a shuffle to create V, which is a sequence of insertelement,
-/// extractelement pairs. If PermittedRHS is set, then we must either use it or
-/// not rely on the second vector source. Return a std::pair containing the
-/// left and right vectors of the proposed shuffle (or 0), and set the Mask
-/// parameter as required.
-///
-/// Note: we intentionally don't try to fold earlier shuffles since they have
-/// often been chosen carefully to be efficiently implementable on the target.
-using ShuffleOps = std::pair<Value *, Value *>;
-
-static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
- Value *PermittedRHS,
+ unsigned NumInsElts = InsVecType->getNumElements();
+ unsigned NumExtElts = ExtVecType->getNumElements();
+
+ // The inserted-to vector must be wider than the extracted-from vector.
+ if (InsVecType->getElementType() != ExtVecType->getElementType() ||
+ NumExtElts >= NumInsElts)
+ return;
+
+ // Create a shuffle mask to widen the extracted-from vector using undefined
+ // values. The mask selects all of the values of the original vector followed
+ // by as many undefined values as needed to create a vector of the same length
+ // as the inserted-to vector.
+ SmallVector<int, 16> ExtendMask;
+ for (unsigned i = 0; i < NumExtElts; ++i)
+ ExtendMask.push_back(i);
+ for (unsigned i = NumExtElts; i < NumInsElts; ++i)
+ ExtendMask.push_back(-1);
+
+ Value *ExtVecOp = ExtElt->getVectorOperand();
+ auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
+ BasicBlock *InsertionBlock = (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ ? ExtVecOpInst->getParent()
+ : ExtElt->getParent();
+
+ // TODO: This restriction matches the basic block check below when creating
+ // new extractelement instructions. If that limitation is removed, this one
+ // could also be removed. But for now, we just bail out to ensure that we
+ // will replace the extractelement instruction that is feeding our
+ // insertelement instruction. This allows the insertelement to then be
+ // replaced by a shufflevector. If the insertelement is not replaced, we can
+ // induce infinite looping because there's an optimization for extractelement
+ // that will delete our widening shuffle. This would trigger another attempt
+ // here to create that shuffle, and we spin forever.
+ if (InsertionBlock != InsElt->getParent())
+ return;
+
+ // TODO: This restriction matches the check in visitInsertElementInst() and
+ // prevents an infinite loop caused by not turning the extract/insert pair
+ // into a shuffle. We really should not need either check, but we're lacking
+ // folds for shufflevectors because we're afraid to generate shuffle masks
+ // that the backend can't handle.
+ if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
+ return;
+
+ auto *WideVec =
+ new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);
+
+ // Insert the new shuffle after the vector operand of the extract is defined
+ // (as long as it's not a PHI) or at the start of the basic block of the
+ // extract, so any subsequent extracts in the same basic block can use it.
+ // TODO: Insert before the earliest ExtractElementInst that is replaced.
+ if (ExtVecOpInst && !isa<PHINode>(ExtVecOpInst))
+ WideVec->insertAfter(ExtVecOpInst);
+ else
+ IC.InsertNewInstWith(WideVec, *ExtElt->getParent()->getFirstInsertionPt());
+
+ // Replace extracts from the original narrow vector with extracts from the new
+ // wide vector.
+ for (User *U : ExtVecOp->users()) {
+ ExtractElementInst *OldExt = dyn_cast<ExtractElementInst>(U);
+ if (!OldExt || OldExt->getParent() != WideVec->getParent())
+ continue;
+ auto *NewExt = ExtractElementInst::Create(WideVec, OldExt->getOperand(1));
+ NewExt->insertAfter(OldExt);
+ IC.replaceInstUsesWith(*OldExt, NewExt);
+ }
+}
+
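For reference, the extend mask built in replaceExtractElements above is just an identity prefix padded with undef (-1) lanes. A minimal standalone sketch of that computation in plain C++ (illustrative names, no LLVM dependency):

#include <cassert>
#include <vector>

static std::vector<int> makeExtendMask(unsigned NumExtElts, unsigned NumInsElts) {
  assert(NumExtElts < NumInsElts && "only widening is handled");
  std::vector<int> Mask;
  for (unsigned i = 0; i < NumExtElts; ++i)
    Mask.push_back(i);   // keep the original narrow lanes in place
  for (unsigned i = NumExtElts; i < NumInsElts; ++i)
    Mask.push_back(-1);  // pad with undef lanes up to the wide length
  return Mask;
}

int main() {
  // Widening a <4 x T> source to match an <8 x T> destination.
  std::vector<int> M = makeExtendMask(4, 8); // {0,1,2,3,-1,-1,-1,-1}
  return M.size() == 8 ? 0 : 1;
}
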
+/// We are building a shuffle to create V, which is a sequence of insertelement,
+/// extractelement pairs. If PermittedRHS is set, then we must either use it or
+/// not rely on the second vector source. Return a std::pair containing the
+/// left and right vectors of the proposed shuffle (or 0), and set the Mask
+/// parameter as required.
+///
+/// Note: we intentionally don't try to fold earlier shuffles since they have
+/// often been chosen carefully to be efficiently implementable on the target.
+using ShuffleOps = std::pair<Value *, Value *>;
+
+static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
+ Value *PermittedRHS,
InstCombinerImpl &IC) {
- assert(V->getType()->isVectorTy() && "Invalid shuffle!");
- unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
-
- if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, -1);
- return std::make_pair(
- PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
- }
-
- if (isa<ConstantAggregateZero>(V)) {
- Mask.assign(NumElts, 0);
- return std::make_pair(V, nullptr);
- }
-
- if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
- // If this is an insert of an extract from some other vector, include it.
- Value *VecOp = IEI->getOperand(0);
- Value *ScalarOp = IEI->getOperand(1);
- Value *IdxOp = IEI->getOperand(2);
-
- if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
- if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
- unsigned ExtractedIdx =
- cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
- unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
-
- // Either the extracted from or inserted into vector must be RHSVec,
- // otherwise we'd end up with a shuffle of three inputs.
- if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
- Value *RHS = EI->getOperand(0);
- ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
- assert(LR.second == nullptr || LR.second == RHS);
-
- if (LR.first->getType() != RHS->getType()) {
- // Although we are giving up for now, see if we can create extracts
- // that match the inserts for another round of combining.
- replaceExtractElements(IEI, EI, IC);
-
- // We tried our best, but we can't find anything compatible with RHS
- // further up the chain. Return a trivial shuffle.
- for (unsigned i = 0; i < NumElts; ++i)
- Mask[i] = i;
- return std::make_pair(V, nullptr);
- }
-
- unsigned NumLHSElts =
+ assert(V->getType()->isVectorTy() && "Invalid shuffle!");
+ unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
+
+ if (isa<UndefValue>(V)) {
+ Mask.assign(NumElts, -1);
+ return std::make_pair(
+ PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
+ }
+
+ if (isa<ConstantAggregateZero>(V)) {
+ Mask.assign(NumElts, 0);
+ return std::make_pair(V, nullptr);
+ }
+
+ if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+ // If this is an insert of an extract from some other vector, include it.
+ Value *VecOp = IEI->getOperand(0);
+ Value *ScalarOp = IEI->getOperand(1);
+ Value *IdxOp = IEI->getOperand(2);
+
+ if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+ if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp)) {
+ unsigned ExtractedIdx =
+ cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+ unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+
+ // Either the extracted from or inserted into vector must be RHSVec,
+ // otherwise we'd end up with a shuffle of three inputs.
+ if (EI->getOperand(0) == PermittedRHS || PermittedRHS == nullptr) {
+ Value *RHS = EI->getOperand(0);
+ ShuffleOps LR = collectShuffleElements(VecOp, Mask, RHS, IC);
+ assert(LR.second == nullptr || LR.second == RHS);
+
+ if (LR.first->getType() != RHS->getType()) {
+ // Although we are giving up for now, see if we can create extracts
+ // that match the inserts for another round of combining.
+ replaceExtractElements(IEI, EI, IC);
+
+ // We tried our best, but we can't find anything compatible with RHS
+ // further up the chain. Return a trivial shuffle.
+ for (unsigned i = 0; i < NumElts; ++i)
+ Mask[i] = i;
+ return std::make_pair(V, nullptr);
+ }
+
+ unsigned NumLHSElts =
cast<FixedVectorType>(RHS->getType())->getNumElements();
- Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
- return std::make_pair(LR.first, RHS);
- }
-
- if (VecOp == PermittedRHS) {
- // We've gone as far as we can: anything on the other side of the
- // extractelement will already have been converted into a shuffle.
- unsigned NumLHSElts =
+ Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
+ return std::make_pair(LR.first, RHS);
+ }
+
+ if (VecOp == PermittedRHS) {
+ // We've gone as far as we can: anything on the other side of the
+ // extractelement will already have been converted into a shuffle.
+ unsigned NumLHSElts =
cast<FixedVectorType>(EI->getOperand(0)->getType())
->getNumElements();
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
- return std::make_pair(EI->getOperand(0), PermittedRHS);
- }
-
- // If this insertelement is a chain that comes from exactly these two
- // vectors, return the vector and the effective shuffle.
- if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
- collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
- Mask))
- return std::make_pair(EI->getOperand(0), PermittedRHS);
- }
- }
- }
-
- // Otherwise, we can't do anything fancy. Return an identity vector.
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i);
- return std::make_pair(V, nullptr);
-}
-
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+
+ // If this insertelement is a chain that comes from exactly these two
+ // vectors, return the vector and the effective shuffle.
+ if (EI->getOperand(0)->getType() == PermittedRHS->getType() &&
+ collectSingleShuffleElements(IEI, EI->getOperand(0), PermittedRHS,
+ Mask))
+ return std::make_pair(EI->getOperand(0), PermittedRHS);
+ }
+ }
+ }
+
+ // Otherwise, we can't do anything fancy. Return an identity vector.
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i);
+ return std::make_pair(V, nullptr);
+}
+
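For a single insert-of-extract step, the mask bookkeeping in collectShuffleElements above reduces to one lane update. A compact plain-C++ restatement (illustrative names, not the LLVM data structures):

#include <vector>

static void recordInsertOfExtract(std::vector<int> &Mask, unsigned InsertedIdx,
                                  unsigned ExtractedIdx, unsigned NumLHSElts) {
  // Lane InsertedIdx of the result reads element ExtractedIdx of the RHS,
  // which lives at offset NumLHSElts in the combined shuffle index space.
  Mask[InsertedIdx % Mask.size()] = NumLHSElts + ExtractedIdx;
}

int main() {
  std::vector<int> Mask = {0, 1, 2, 3};  // identity over the LHS so far
  recordInsertOfExtract(Mask, 2, 1, 4);  // insert (extract RHS, 1) at lane 2
  return Mask[2] == 5 ? 0 : 1;           // lane 2 now selects RHS element 1 (4 + 1)
}
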
/// Look for chain of insertvalue's that fully define an aggregate, and trace
/// back the values inserted, see if they all were extractvalue'd from
/// the same source aggregate from the exact same element indexes.
@@ -984,1661 +984,1661 @@ Instruction *InstCombinerImpl::foldAggregateConstructionIntoAggregateReuse(
return replaceInstUsesWith(OrigIVI, PHI);
}
-/// Try to find redundant insertvalue instructions, like the following ones:
-/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0
-/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0
-/// Here the second instruction inserts values at the same indices as the
-/// first one, making the first one redundant.
-/// It should be transformed to:
-/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
+/// Try to find redundant insertvalue instructions, like the following ones:
+/// %0 = insertvalue { i8, i32 } undef, i8 %x, 0
+/// %1 = insertvalue { i8, i32 } %0, i8 %y, 0
+/// Here the second instruction inserts values at the same indices as the
+/// first one, making the first one redundant.
+/// It should be transformed to:
+/// %0 = insertvalue { i8, i32 } undef, i8 %y, 0
Instruction *InstCombinerImpl::visitInsertValueInst(InsertValueInst &I) {
- bool IsRedundant = false;
- ArrayRef<unsigned int> FirstIndices = I.getIndices();
-
- // If there is a chain of insertvalue instructions (each of them except the
- // last one has only one use and it's another insertvalue insn from this
- // chain), check if any of the 'children' uses the same indices as the first
- // instruction. In this case, the first one is redundant.
- Value *V = &I;
- unsigned Depth = 0;
- while (V->hasOneUse() && Depth < 10) {
- User *U = V->user_back();
- auto UserInsInst = dyn_cast<InsertValueInst>(U);
- if (!UserInsInst || U->getOperand(0) != V)
- break;
- if (UserInsInst->getIndices() == FirstIndices) {
- IsRedundant = true;
- break;
- }
- V = UserInsInst;
- Depth++;
- }
-
- if (IsRedundant)
- return replaceInstUsesWith(I, I.getOperand(0));
+ bool IsRedundant = false;
+ ArrayRef<unsigned int> FirstIndices = I.getIndices();
+
+ // If there is a chain of insertvalue instructions (each of them except the
+ // last one has only one use and it's another insertvalue insn from this
+ // chain), check if any of the 'children' uses the same indices as the first
+ // instruction. In this case, the first one is redundant.
+ Value *V = &I;
+ unsigned Depth = 0;
+ while (V->hasOneUse() && Depth < 10) {
+ User *U = V->user_back();
+ auto UserInsInst = dyn_cast<InsertValueInst>(U);
+ if (!UserInsInst || U->getOperand(0) != V)
+ break;
+ if (UserInsInst->getIndices() == FirstIndices) {
+ IsRedundant = true;
+ break;
+ }
+ V = UserInsInst;
+ Depth++;
+ }
+
+ if (IsRedundant)
+ return replaceInstUsesWith(I, I.getOperand(0));
if (Instruction *NewI = foldAggregateConstructionIntoAggregateReuse(I))
return NewI;
- return nullptr;
-}
-
-static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
- // Can not analyze scalable type, the number of elements is not a compile-time
- // constant.
- if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
- return false;
-
- int MaskSize = Shuf.getShuffleMask().size();
- int VecSize =
- cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
-
- // A vector select does not change the size of the operands.
- if (MaskSize != VecSize)
- return false;
-
- // Each mask element must be undefined or choose a vector element from one of
- // the source operands without crossing vector lanes.
- for (int i = 0; i != MaskSize; ++i) {
- int Elt = Shuf.getMaskValue(i);
- if (Elt != -1 && Elt != i && Elt != i + VecSize)
- return false;
- }
-
- return true;
-}
-
-/// Turn a chain of inserts that splats a value into an insert + shuffle:
-/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
-/// shufflevector(insertelt(X, %k, 0), undef, zero)
-static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
- // We are interested in the last insert in a chain. So if this insert has a
- // single user and that user is an insert, bail.
- if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
- return nullptr;
-
- VectorType *VecTy = InsElt.getType();
- // Can not handle scalable type, the number of elements is not a compile-time
- // constant.
- if (isa<ScalableVectorType>(VecTy))
- return nullptr;
- unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
-
- // Do not try to do this for a one-element vector, since that's a nop,
- // and will cause an inf-loop.
- if (NumElements == 1)
- return nullptr;
-
- Value *SplatVal = InsElt.getOperand(1);
- InsertElementInst *CurrIE = &InsElt;
- SmallBitVector ElementPresent(NumElements, false);
- InsertElementInst *FirstIE = nullptr;
-
- // Walk the chain backwards, keeping track of which indices we inserted into,
- // until we hit something that isn't an insert of the splatted value.
- while (CurrIE) {
- auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
- if (!Idx || CurrIE->getOperand(1) != SplatVal)
- return nullptr;
-
- auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
- // Check none of the intermediate steps have any additional uses, except
- // for the root insertelement instruction, which can be re-used, if it
- // inserts at position 0.
- if (CurrIE != &InsElt &&
- (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
- return nullptr;
-
- ElementPresent[Idx->getZExtValue()] = true;
- FirstIE = CurrIE;
- CurrIE = NextIE;
- }
-
- // If this is just a single insertelement (not a sequence), we are done.
- if (FirstIE == &InsElt)
- return nullptr;
-
- // If we are not inserting into an undef vector, make sure we've seen an
- // insert into every element.
- // TODO: If the base vector is not undef, it might be better to create a splat
- // and then a select-shuffle (blend) with the base vector.
- if (!isa<UndefValue>(FirstIE->getOperand(0)))
- if (!ElementPresent.all())
- return nullptr;
-
- // Create the insert + shuffle.
- Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
- UndefValue *UndefVec = UndefValue::get(VecTy);
- Constant *Zero = ConstantInt::get(Int32Ty, 0);
- if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
- FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);
-
- // Splat from element 0, but replace absent elements with undef in the mask.
- SmallVector<int, 16> Mask(NumElements, 0);
- for (unsigned i = 0; i != NumElements; ++i)
- if (!ElementPresent[i])
- Mask[i] = -1;
-
- return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
-}
-
-/// Try to fold an insert element into an existing splat shuffle by changing
-/// the shuffle's mask to include the index of this insert element.
-static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
- // Check if the vector operand of this insert is a canonical splat shuffle.
- auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
- if (!Shuf || !Shuf->isZeroEltSplat())
- return nullptr;
-
- // Bail out early if shuffle is scalable type. The number of elements in
- // shuffle mask is unknown at compile-time.
- if (isa<ScalableVectorType>(Shuf->getType()))
- return nullptr;
-
- // Check for a constant insertion index.
- uint64_t IdxC;
- if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
- return nullptr;
-
- // Check if the splat shuffle's input is the same as this insert's scalar op.
- Value *X = InsElt.getOperand(1);
- Value *Op0 = Shuf->getOperand(0);
- if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
- return nullptr;
-
- // Replace the shuffle mask element at the index of this insert with a zero.
- // For example:
- // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
- // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
+ return nullptr;
+}
+
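The chain walk in visitInsertValueInst above amounts to asking whether any later single-use insertvalue repeats the first one's exact index list. A small plain-C++ sketch of that test over bare index lists rather than LLVM IR (illustrative only):

#include <vector>

// Each entry of Chain is the index list of one insertvalue in a single-use
// chain, oldest first. If a later entry repeats the first one, the first
// insertvalue is dead.
static bool firstInsertIsShadowed(const std::vector<std::vector<unsigned>> &Chain) {
  if (Chain.size() < 2)
    return false;
  for (size_t i = 1; i < Chain.size() && i <= 10; ++i) // mirrors the depth cap
    if (Chain[i] == Chain.front())
      return true;
  return false;
}

int main() {
  // %0 = insertvalue {i8,i32} undef, i8 %x, 0
  // %1 = insertvalue {i8,i32} %0,    i8 %y, 0  ; same index list -> %0 is redundant
  return firstInsertIsShadowed({{0}, {0}}) ? 0 : 1;
}
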
+static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
+ // Can not analyze scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
+ return false;
+
+ int MaskSize = Shuf.getShuffleMask().size();
+ int VecSize =
+ cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
+
+ // A vector select does not change the size of the operands.
+ if (MaskSize != VecSize)
+ return false;
+
+ // Each mask element must be undefined or choose a vector element from one of
+ // the source operands without crossing vector lanes.
+ for (int i = 0; i != MaskSize; ++i) {
+ int Elt = Shuf.getMaskValue(i);
+ if (Elt != -1 && Elt != i && Elt != i + VecSize)
+ return false;
+ }
+
+ return true;
+}
+
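The mask test in isShuffleEquivalentToSelect above can be stated on its own: every lane must be undef or pick its own position from one of the two sources. A standalone plain-C++ version (illustrative only):

#include <vector>

static bool maskIsSelectLike(const std::vector<int> &Mask, int VecSize) {
  if ((int)Mask.size() != VecSize)
    return false; // a vector select never changes the operand width
  for (int i = 0; i != (int)Mask.size(); ++i) {
    int Elt = Mask[i];
    if (Elt != -1 && Elt != i && Elt != i + VecSize)
      return false; // lane crosses positions: not expressible as a select
  }
  return true;
}

int main() {
  return maskIsSelectLike({0, 5, 2, 7}, 4) ? 0 : 1; // <0,5,2,7> picks per-lane
}
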
+/// Turn a chain of inserts that splats a value into an insert + shuffle:
+/// insertelt(insertelt(insertelt(insertelt X, %k, 0), %k, 1), %k, 2) ... ->
+/// shufflevector(insertelt(X, %k, 0), undef, zero)
+static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
+ // We are interested in the last insert in a chain. So if this insert has a
+ // single user and that user is an insert, bail.
+ if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
+ return nullptr;
+
+ VectorType *VecTy = InsElt.getType();
+ // Can not handle scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(VecTy))
+ return nullptr;
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+
+ // Do not try to do this for a one-element vector, since that's a nop,
+ // and will cause an inf-loop.
+ if (NumElements == 1)
+ return nullptr;
+
+ Value *SplatVal = InsElt.getOperand(1);
+ InsertElementInst *CurrIE = &InsElt;
+ SmallBitVector ElementPresent(NumElements, false);
+ InsertElementInst *FirstIE = nullptr;
+
+ // Walk the chain backwards, keeping track of which indices we inserted into,
+ // until we hit something that isn't an insert of the splatted value.
+ while (CurrIE) {
+ auto *Idx = dyn_cast<ConstantInt>(CurrIE->getOperand(2));
+ if (!Idx || CurrIE->getOperand(1) != SplatVal)
+ return nullptr;
+
+ auto *NextIE = dyn_cast<InsertElementInst>(CurrIE->getOperand(0));
+ // Check none of the intermediate steps have any additional uses, except
+ // for the root insertelement instruction, which can be re-used, if it
+ // inserts at position 0.
+ if (CurrIE != &InsElt &&
+ (!CurrIE->hasOneUse() && (NextIE != nullptr || !Idx->isZero())))
+ return nullptr;
+
+ ElementPresent[Idx->getZExtValue()] = true;
+ FirstIE = CurrIE;
+ CurrIE = NextIE;
+ }
+
+ // If this is just a single insertelement (not a sequence), we are done.
+ if (FirstIE == &InsElt)
+ return nullptr;
+
+ // If we are not inserting into an undef vector, make sure we've seen an
+ // insert into every element.
+ // TODO: If the base vector is not undef, it might be better to create a splat
+ // and then a select-shuffle (blend) with the base vector.
+ if (!isa<UndefValue>(FirstIE->getOperand(0)))
+ if (!ElementPresent.all())
+ return nullptr;
+
+ // Create the insert + shuffle.
+ Type *Int32Ty = Type::getInt32Ty(InsElt.getContext());
+ UndefValue *UndefVec = UndefValue::get(VecTy);
+ Constant *Zero = ConstantInt::get(Int32Ty, 0);
+ if (!cast<ConstantInt>(FirstIE->getOperand(2))->isZero())
+ FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);
+
+ // Splat from element 0, but replace absent elements with undef in the mask.
+ SmallVector<int, 16> Mask(NumElements, 0);
+ for (unsigned i = 0; i != NumElements; ++i)
+ if (!ElementPresent[i])
+ Mask[i] = -1;
+
+ return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
+}
+
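The splat mask produced by foldInsSequenceIntoSplat above broadcasts lane 0 and marks never-written lanes as undef. A plain-C++ sketch of just that mask construction (illustrative names, no LLVM types):

#include <vector>

static std::vector<int> makeSplatMask(const std::vector<bool> &ElementPresent) {
  std::vector<int> Mask(ElementPresent.size(), 0); // splat from element 0
  for (size_t i = 0; i != ElementPresent.size(); ++i)
    if (!ElementPresent[i])
      Mask[i] = -1; // lane was never inserted into: leave it undef
  return Mask;
}

int main() {
  std::vector<int> M = makeSplatMask({true, true, false, true}); // {0,0,-1,0}
  return M[2] == -1 ? 0 : 1;
}
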
+/// Try to fold an insert element into an existing splat shuffle by changing
+/// the shuffle's mask to include the index of this insert element.
+static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
+ // Check if the vector operand of this insert is a canonical splat shuffle.
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
+ if (!Shuf || !Shuf->isZeroEltSplat())
+ return nullptr;
+
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
+ // Check for a constant insertion index.
+ uint64_t IdxC;
+ if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
+ return nullptr;
+
+ // Check if the splat shuffle's input is the same as this insert's scalar op.
+ Value *X = InsElt.getOperand(1);
+ Value *Op0 = Shuf->getOperand(0);
+ if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
+ return nullptr;
+
+ // Replace the shuffle mask element at the index of this insert with a zero.
+ // For example:
+ // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
+ // --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf->getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
-
- return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
-}
-
-/// Try to fold an extract+insert element into an existing identity shuffle by
-/// changing the shuffle's mask to include the index of this insert element.
-static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
- // Check if the vector operand of this insert is an identity shuffle.
- auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
- if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) ||
- !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
- return nullptr;
-
- // Bail out early if shuffle is scalable type. The number of elements in
- // shuffle mask is unknown at compile-time.
- if (isa<ScalableVectorType>(Shuf->getType()))
- return nullptr;
-
- // Check for a constant insertion index.
- uint64_t IdxC;
- if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
- return nullptr;
-
- // Check if this insert's scalar op is extracted from the identity shuffle's
- // input vector.
- Value *Scalar = InsElt.getOperand(1);
- Value *X = Shuf->getOperand(0);
- if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
- return nullptr;
-
- // Replace the shuffle mask element at the index of this extract+insert with
- // that same index value.
- // For example:
- // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
+ SmallVector<int, 16> NewMask(NumMaskElts);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
+
+ return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
+}
+
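The rewrite in foldInsEltIntoSplat above touches a single mask lane. A plain-C++ restatement of that update (illustrative, no LLVM types):

#include <vector>

static std::vector<int> widenSplatMask(const std::vector<int> &OldMask, size_t IdxC) {
  std::vector<int> NewMask(OldMask.size());
  for (size_t i = 0; i != OldMask.size(); ++i)
    NewMask[i] = (i == IdxC) ? 0 : OldMask[i]; // lane IdxC now also reads the splatted element 0
  return NewMask;
}

int main() {
  // inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
  std::vector<int> M = widenSplatMask({0, -1, 0, -1}, 1); // --> <0,0,0,undef>
  return M[1] == 0 ? 0 : 1;
}
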
+/// Try to fold an extract+insert element into an existing identity shuffle by
+/// changing the shuffle's mask to include the index of this insert element.
+static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
+ // Check if the vector operand of this insert is an identity shuffle.
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0));
+ if (!Shuf || !isa<UndefValue>(Shuf->getOperand(1)) ||
+ !(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
+ return nullptr;
+
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
+ // Check for a constant insertion index.
+ uint64_t IdxC;
+ if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
+ return nullptr;
+
+ // Check if this insert's scalar op is extracted from the identity shuffle's
+ // input vector.
+ Value *Scalar = InsElt.getOperand(1);
+ Value *X = Shuf->getOperand(0);
+ if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
+ return nullptr;
+
+ // Replace the shuffle mask element at the index of this extract+insert with
+ // that same index value.
+ // For example:
+ // inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf->getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts);
- ArrayRef<int> OldMask = Shuf->getShuffleMask();
- for (unsigned i = 0; i != NumMaskElts; ++i) {
- if (i != IdxC) {
- // All mask elements besides the inserted element remain the same.
- NewMask[i] = OldMask[i];
- } else if (OldMask[i] == (int)IdxC) {
- // If the mask element was already set, there's nothing to do
- // (demanded elements analysis may unset it later).
- return nullptr;
- } else {
- assert(OldMask[i] == UndefMaskElem &&
- "Unexpected shuffle mask element for identity shuffle");
- NewMask[i] = IdxC;
- }
- }
-
- return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
-}
-
-/// If we have an insertelement instruction feeding into another insertelement
-/// and the 2nd is inserting a constant into the vector, canonicalize that
-/// constant insertion before the insertion of a variable:
-///
-/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
-/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
-///
-/// This has the potential of eliminating the 2nd insertelement instruction
-/// via constant folding of the scalar constant into a vector constant.
-static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
- InstCombiner::BuilderTy &Builder) {
- auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
- if (!InsElt1 || !InsElt1->hasOneUse())
- return nullptr;
-
- Value *X, *Y;
- Constant *ScalarC;
- ConstantInt *IdxC1, *IdxC2;
- if (match(InsElt1->getOperand(0), m_Value(X)) &&
- match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
- match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
- match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
- match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
- Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
- return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
- }
-
- return nullptr;
-}
-
-/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
-/// --> shufflevector X, CVec', Mask'
-static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
- auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
- // Bail out if the parent has more than one use. In that case, we'd be
- // replacing the insertelt with a shuffle, and that's not a clear win.
- if (!Inst || !Inst->hasOneUse())
- return nullptr;
- if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
- // The shuffle must have a constant vector operand. The insertelt must have
- // a constant scalar being inserted at a constant position in the vector.
- Constant *ShufConstVec, *InsEltScalar;
- uint64_t InsEltIndex;
- if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
- !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
- !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
- return nullptr;
-
- // Adding an element to an arbitrary shuffle could be expensive, but a
- // shuffle that selects elements from vectors without crossing lanes is
- // assumed cheap.
- // If we're just adding a constant into that shuffle, it will still be
- // cheap.
- if (!isShuffleEquivalentToSelect(*Shuf))
- return nullptr;
-
- // From the above 'select' check, we know that the mask has the same number
- // of elements as the vector input operands. We also know that each constant
- // input element is used in its lane and can not be used more than once by
- // the shuffle. Therefore, replace the constant in the shuffle's constant
- // vector with the insertelt constant. Replace the constant in the shuffle's
- // mask vector with the insertelt index plus the length of the vector
- // (because the constant vector operand of a shuffle is always the 2nd
- // operand).
- ArrayRef<int> Mask = Shuf->getShuffleMask();
- unsigned NumElts = Mask.size();
- SmallVector<Constant *, 16> NewShufElts(NumElts);
- SmallVector<int, 16> NewMaskElts(NumElts);
- for (unsigned I = 0; I != NumElts; ++I) {
- if (I == InsEltIndex) {
- NewShufElts[I] = InsEltScalar;
- NewMaskElts[I] = InsEltIndex + NumElts;
- } else {
- // Copy over the existing values.
- NewShufElts[I] = ShufConstVec->getAggregateElement(I);
- NewMaskElts[I] = Mask[I];
- }
- }
-
- // Create new operands for a shuffle that includes the constant of the
- // original insertelt. The old shuffle will be dead now.
- return new ShuffleVectorInst(Shuf->getOperand(0),
- ConstantVector::get(NewShufElts), NewMaskElts);
- } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
-    // Transform sequences of insertelement ops with constant data/indexes into
- // a single shuffle op.
- // Can not handle scalable type, the number of elements needed to create
- // shuffle mask is not a compile-time constant.
- if (isa<ScalableVectorType>(InsElt.getType()))
- return nullptr;
- unsigned NumElts =
- cast<FixedVectorType>(InsElt.getType())->getNumElements();
-
- uint64_t InsertIdx[2];
- Constant *Val[2];
- if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
- !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
- !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
- !match(IEI->getOperand(1), m_Constant(Val[1])))
- return nullptr;
- SmallVector<Constant *, 16> Values(NumElts);
- SmallVector<int, 16> Mask(NumElts);
- auto ValI = std::begin(Val);
- // Generate new constant vector and mask.
- // We have 2 values/masks from the insertelements instructions. Insert them
- // into new value/mask vectors.
- for (uint64_t I : InsertIdx) {
- if (!Values[I]) {
- Values[I] = *ValI;
- Mask[I] = NumElts + I;
- }
- ++ValI;
- }
- // Remaining values are filled with 'undef' values.
- for (unsigned I = 0; I < NumElts; ++I) {
- if (!Values[I]) {
- Values[I] = UndefValue::get(InsElt.getType()->getElementType());
- Mask[I] = I;
- }
- }
- // Create new operands for a shuffle that includes the constant of the
- // original insertelt.
- return new ShuffleVectorInst(IEI->getOperand(0),
- ConstantVector::get(Values), Mask);
- }
- return nullptr;
-}
-
+ SmallVector<int, 16> NewMask(NumMaskElts);
+ ArrayRef<int> OldMask = Shuf->getShuffleMask();
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ if (i != IdxC) {
+ // All mask elements besides the inserted element remain the same.
+ NewMask[i] = OldMask[i];
+ } else if (OldMask[i] == (int)IdxC) {
+ // If the mask element was already set, there's nothing to do
+ // (demanded elements analysis may unset it later).
+ return nullptr;
+ } else {
+ assert(OldMask[i] == UndefMaskElem &&
+ "Unexpected shuffle mask element for identity shuffle");
+ NewMask[i] = IdxC;
+ }
+ }
+
+ return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
+}
+
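foldInsEltIntoIdentityShuffle above differs from the splat case only in what it writes: the lane gets its own index back, and an already-set lane means there is nothing to do. A plain-C++ sketch (C++17 for std::optional, illustrative only):

#include <optional>
#include <vector>

static std::optional<std::vector<int>> fillIdentityLane(std::vector<int> OldMask, size_t IdxC) {
  if (OldMask[IdxC] == (int)IdxC)
    return std::nullopt;     // lane already reads X[IdxC]; nothing to do
  OldMask[IdxC] = (int)IdxC; // previously-undef lane now reads X[IdxC]
  return OldMask;
}

int main() {
  auto R = fillIdentityLane({0, -1, 2, -1}, 1); // identity-with-padding mask
  return (R && (*R)[1] == 1) ? 0 : 1;
}
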
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+///
+/// This has the potential of eliminating the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+ InstCombiner::BuilderTy &Builder) {
+ auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+ if (!InsElt1 || !InsElt1->hasOneUse())
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *ScalarC;
+ ConstantInt *IdxC1, *IdxC2;
+ if (match(InsElt1->getOperand(0), m_Value(X)) &&
+ match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+ match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+ match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+ match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) && IdxC1 != IdxC2) {
+ Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+ return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+ }
+
+ return nullptr;
+}
+
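The canonicalization in hoistInsEltConst above is only legal because the two inserts hit different lanes (IdxC1 != IdxC2). A tiny value-level check of that commutation in plain C++ (a model, not LLVM IR):

#include <array>
#include <cstddef>

template <std::size_t N>
static std::array<int, N> insertAt(std::array<int, N> V, int Val, std::size_t Idx) {
  V[Idx] = Val; // value-level model of insertelement
  return V;
}

int main() {
  std::array<int, 4> X = {9, 9, 9, 9};
  int Y = 5;                                  // variable scalar
  int C = 7;                                  // constant scalar
  auto A = insertAt(insertAt(X, Y, 1), C, 2); // original order
  auto B = insertAt(insertAt(X, C, 2), Y, 1); // canonicalized order
  return A == B ? 0 : 1;                      // equal because the indices differ
}
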
+/// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
+/// --> shufflevector X, CVec', Mask'
+static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
+ auto *Inst = dyn_cast<Instruction>(InsElt.getOperand(0));
+ // Bail out if the parent has more than one use. In that case, we'd be
+ // replacing the insertelt with a shuffle, and that's not a clear win.
+ if (!Inst || !Inst->hasOneUse())
+ return nullptr;
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(InsElt.getOperand(0))) {
+ // The shuffle must have a constant vector operand. The insertelt must have
+ // a constant scalar being inserted at a constant position in the vector.
+ Constant *ShufConstVec, *InsEltScalar;
+ uint64_t InsEltIndex;
+ if (!match(Shuf->getOperand(1), m_Constant(ShufConstVec)) ||
+ !match(InsElt.getOperand(1), m_Constant(InsEltScalar)) ||
+ !match(InsElt.getOperand(2), m_ConstantInt(InsEltIndex)))
+ return nullptr;
+
+ // Adding an element to an arbitrary shuffle could be expensive, but a
+ // shuffle that selects elements from vectors without crossing lanes is
+ // assumed cheap.
+ // If we're just adding a constant into that shuffle, it will still be
+ // cheap.
+ if (!isShuffleEquivalentToSelect(*Shuf))
+ return nullptr;
+
+ // From the above 'select' check, we know that the mask has the same number
+ // of elements as the vector input operands. We also know that each constant
+ // input element is used in its lane and can not be used more than once by
+ // the shuffle. Therefore, replace the constant in the shuffle's constant
+ // vector with the insertelt constant. Replace the constant in the shuffle's
+ // mask vector with the insertelt index plus the length of the vector
+ // (because the constant vector operand of a shuffle is always the 2nd
+ // operand).
+ ArrayRef<int> Mask = Shuf->getShuffleMask();
+ unsigned NumElts = Mask.size();
+ SmallVector<Constant *, 16> NewShufElts(NumElts);
+ SmallVector<int, 16> NewMaskElts(NumElts);
+ for (unsigned I = 0; I != NumElts; ++I) {
+ if (I == InsEltIndex) {
+ NewShufElts[I] = InsEltScalar;
+ NewMaskElts[I] = InsEltIndex + NumElts;
+ } else {
+ // Copy over the existing values.
+ NewShufElts[I] = ShufConstVec->getAggregateElement(I);
+ NewMaskElts[I] = Mask[I];
+ }
+ }
+
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt. The old shuffle will be dead now.
+ return new ShuffleVectorInst(Shuf->getOperand(0),
+ ConstantVector::get(NewShufElts), NewMaskElts);
+ } else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
+    // Transform sequences of insertelement ops with constant data/indexes into
+ // a single shuffle op.
+ // Can not handle scalable type, the number of elements needed to create
+ // shuffle mask is not a compile-time constant.
+ if (isa<ScalableVectorType>(InsElt.getType()))
+ return nullptr;
+ unsigned NumElts =
+ cast<FixedVectorType>(InsElt.getType())->getNumElements();
+
+ uint64_t InsertIdx[2];
+ Constant *Val[2];
+ if (!match(InsElt.getOperand(2), m_ConstantInt(InsertIdx[0])) ||
+ !match(InsElt.getOperand(1), m_Constant(Val[0])) ||
+ !match(IEI->getOperand(2), m_ConstantInt(InsertIdx[1])) ||
+ !match(IEI->getOperand(1), m_Constant(Val[1])))
+ return nullptr;
+ SmallVector<Constant *, 16> Values(NumElts);
+ SmallVector<int, 16> Mask(NumElts);
+ auto ValI = std::begin(Val);
+ // Generate new constant vector and mask.
+ // We have 2 values/masks from the insertelements instructions. Insert them
+ // into new value/mask vectors.
+ for (uint64_t I : InsertIdx) {
+ if (!Values[I]) {
+ Values[I] = *ValI;
+ Mask[I] = NumElts + I;
+ }
+ ++ValI;
+ }
+ // Remaining values are filled with 'undef' values.
+ for (unsigned I = 0; I < NumElts; ++I) {
+ if (!Values[I]) {
+ Values[I] = UndefValue::get(InsElt.getType()->getElementType());
+ Mask[I] = I;
+ }
+ }
+ // Create new operands for a shuffle that includes the constant of the
+ // original insertelt.
+ return new ShuffleVectorInst(IEI->getOperand(0),
+ ConstantVector::get(Values), Mask);
+ }
+ return nullptr;
+}
+
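The merge performed by foldConstantInsEltIntoShuffle above splices the inserted constant into the shuffle's constant operand and redirects the mask lane to it. A plain-C++ sketch of that bookkeeping (illustrative struct, not LLVM types):

#include <vector>

struct ShufParts {
  std::vector<int> ConstElts; // stand-in for the shuffle's constant vector operand
  std::vector<int> Mask;
};

static ShufParts foldConstantInsert(ShufParts S, int InsC, size_t InsIdx) {
  size_t NumElts = S.Mask.size();
  S.ConstElts[InsIdx] = InsC;               // splice the constant into operand 1
  S.Mask[InsIdx] = (int)(InsIdx + NumElts); // operand-1 lanes start at offset NumElts
  return S;
}

int main() {
  ShufParts S{{10, 20, 30, 40}, {0, 5, 2, 7}}; // select-like shuffle of two v4 inputs
  ShufParts R = foldConstantInsert(S, 99, 0);  // insertelt ..., 99, 0
  return (R.ConstElts[0] == 99 && R.Mask[0] == 4) ? 0 : 1;
}
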
Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
- Value *VecOp = IE.getOperand(0);
- Value *ScalarOp = IE.getOperand(1);
- Value *IdxOp = IE.getOperand(2);
-
- if (auto *V = SimplifyInsertElementInst(
- VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
- return replaceInstUsesWith(IE, V);
-
- // If the scalar is bitcast and inserted into undef, do the insert in the
- // source type followed by bitcast.
- // TODO: Generalize for insert into any constant, not just undef?
- Value *ScalarSrc;
- if (match(VecOp, m_Undef()) &&
- match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
- (ScalarSrc->getType()->isIntegerTy() ||
- ScalarSrc->getType()->isFloatingPointTy())) {
- // inselt undef, (bitcast ScalarSrc), IdxOp -->
- // bitcast (inselt undef, ScalarSrc, IdxOp)
- Type *ScalarTy = ScalarSrc->getType();
- Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
- UndefValue *NewUndef = UndefValue::get(VecTy);
- Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
- return new BitCastInst(NewInsElt, IE.getType());
- }
-
- // If the vector and scalar are both bitcast from the same element type, do
- // the insert in that source type followed by bitcast.
- Value *VecSrc;
- if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
- match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
- (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
- VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
- cast<VectorType>(VecSrc->getType())->getElementType() ==
- ScalarSrc->getType()) {
- // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
- // bitcast (inselt VecSrc, ScalarSrc, IdxOp)
- Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
- return new BitCastInst(NewInsElt, IE.getType());
- }
-
- // If the inserted element was extracted from some other fixed-length vector
- // and both indexes are valid constants, try to turn this into a shuffle.
- // Can not handle scalable vector type, the number of elements needed to
- // create shuffle mask is not a compile-time constant.
- uint64_t InsertedIdx, ExtractedIdx;
- Value *ExtVecOp;
- if (isa<FixedVectorType>(IE.getType()) &&
- match(IdxOp, m_ConstantInt(InsertedIdx)) &&
- match(ScalarOp,
- m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
- isa<FixedVectorType>(ExtVecOp->getType()) &&
- ExtractedIdx <
- cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
- // TODO: Looking at the user(s) to determine if this insert is a
- // fold-to-shuffle opportunity does not match the usual instcombine
- // constraints. We should decide if the transform is worthy based only
- // on this instruction and its operands, but that may not work currently.
- //
- // Here, we are trying to avoid creating shuffles before reaching
- // the end of a chain of extract-insert pairs. This is complicated because
- // we do not generally form arbitrary shuffle masks in instcombine
- // (because those may codegen poorly), but collectShuffleElements() does
- // exactly that.
- //
- // The rules for determining what is an acceptable target-independent
- // shuffle mask are fuzzy because they evolve based on the backend's
- // capabilities and real-world impact.
- auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
- if (!Insert.hasOneUse())
- return true;
- auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
- if (!InsertUser)
- return true;
- return false;
- };
-
- // Try to form a shuffle from a chain of extract-insert ops.
- if (isShuffleRootCandidate(IE)) {
- SmallVector<int, 16> Mask;
- ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
-
- // The proposed shuffle may be trivial, in which case we shouldn't
- // perform the combine.
- if (LR.first != &IE && LR.second != &IE) {
- // We now have a shuffle of LHS, RHS, Mask.
- if (LR.second == nullptr)
- LR.second = UndefValue::get(LR.first->getType());
- return new ShuffleVectorInst(LR.first, LR.second, Mask);
- }
- }
- }
-
- if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
- unsigned VWidth = VecTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
- if (V != &IE)
- return replaceInstUsesWith(IE, V);
- return &IE;
- }
- }
-
- if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
- return Shuf;
-
- if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
- return NewInsElt;
-
- if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
- return Broadcast;
-
- if (Instruction *Splat = foldInsEltIntoSplat(IE))
- return Splat;
-
- if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
- return IdentityShuf;
-
- return nullptr;
-}
-
-/// Return true if we can evaluate the specified expression tree if the vector
-/// elements were shuffled in a different order.
-static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
- unsigned Depth = 5) {
- // We can always reorder the elements of a constant.
- if (isa<Constant>(V))
- return true;
-
- // We won't reorder vector arguments. No IPO here.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) return false;
-
- // Two users may expect different orders of the elements. Don't try it.
- if (!I->hasOneUse())
- return false;
-
- if (Depth == 0) return false;
-
- switch (I->getOpcode()) {
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- // Propagating an undefined shuffle mask element to integer div/rem is not
- // allowed because those opcodes can create immediate undefined behavior
- // from an undefined element in an operand.
+ Value *VecOp = IE.getOperand(0);
+ Value *ScalarOp = IE.getOperand(1);
+ Value *IdxOp = IE.getOperand(2);
+
+ if (auto *V = SimplifyInsertElementInst(
+ VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
+ return replaceInstUsesWith(IE, V);
+
+ // If the scalar is bitcast and inserted into undef, do the insert in the
+ // source type followed by bitcast.
+ // TODO: Generalize for insert into any constant, not just undef?
+ Value *ScalarSrc;
+ if (match(VecOp, m_Undef()) &&
+ match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
+ (ScalarSrc->getType()->isIntegerTy() ||
+ ScalarSrc->getType()->isFloatingPointTy())) {
+ // inselt undef, (bitcast ScalarSrc), IdxOp -->
+ // bitcast (inselt undef, ScalarSrc, IdxOp)
+ Type *ScalarTy = ScalarSrc->getType();
+ Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
+ UndefValue *NewUndef = UndefValue::get(VecTy);
+ Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
+ return new BitCastInst(NewInsElt, IE.getType());
+ }
+
+ // If the vector and scalar are both bitcast from the same element type, do
+ // the insert in that source type followed by bitcast.
+ Value *VecSrc;
+ if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
+ match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
+ (VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
+ VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
+ cast<VectorType>(VecSrc->getType())->getElementType() ==
+ ScalarSrc->getType()) {
+ // inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
+ // bitcast (inselt VecSrc, ScalarSrc, IdxOp)
+ Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
+ return new BitCastInst(NewInsElt, IE.getType());
+ }
+
+ // If the inserted element was extracted from some other fixed-length vector
+ // and both indexes are valid constants, try to turn this into a shuffle.
+ // Can not handle scalable vector type, the number of elements needed to
+ // create shuffle mask is not a compile-time constant.
+ uint64_t InsertedIdx, ExtractedIdx;
+ Value *ExtVecOp;
+ if (isa<FixedVectorType>(IE.getType()) &&
+ match(IdxOp, m_ConstantInt(InsertedIdx)) &&
+ match(ScalarOp,
+ m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
+ isa<FixedVectorType>(ExtVecOp->getType()) &&
+ ExtractedIdx <
+ cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
+ // TODO: Looking at the user(s) to determine if this insert is a
+ // fold-to-shuffle opportunity does not match the usual instcombine
+ // constraints. We should decide if the transform is worthy based only
+ // on this instruction and its operands, but that may not work currently.
+ //
+ // Here, we are trying to avoid creating shuffles before reaching
+ // the end of a chain of extract-insert pairs. This is complicated because
+ // we do not generally form arbitrary shuffle masks in instcombine
+ // (because those may codegen poorly), but collectShuffleElements() does
+ // exactly that.
+ //
+ // The rules for determining what is an acceptable target-independent
+ // shuffle mask are fuzzy because they evolve based on the backend's
+ // capabilities and real-world impact.
+ auto isShuffleRootCandidate = [](InsertElementInst &Insert) {
+ if (!Insert.hasOneUse())
+ return true;
+ auto *InsertUser = dyn_cast<InsertElementInst>(Insert.user_back());
+ if (!InsertUser)
+ return true;
+ return false;
+ };
+
+ // Try to form a shuffle from a chain of extract-insert ops.
+ if (isShuffleRootCandidate(IE)) {
+ SmallVector<int, 16> Mask;
+ ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
+
+ // The proposed shuffle may be trivial, in which case we shouldn't
+ // perform the combine.
+ if (LR.first != &IE && LR.second != &IE) {
+ // We now have a shuffle of LHS, RHS, Mask.
+ if (LR.second == nullptr)
+ LR.second = UndefValue::get(LR.first->getType());
+ return new ShuffleVectorInst(LR.first, LR.second, Mask);
+ }
+ }
+ }
+
+ if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
+ unsigned VWidth = VecTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
+ if (V != &IE)
+ return replaceInstUsesWith(IE, V);
+ return &IE;
+ }
+ }
+
+ if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
+ return Shuf;
+
+ if (Instruction *NewInsElt = hoistInsEltConst(IE, Builder))
+ return NewInsElt;
+
+ if (Instruction *Broadcast = foldInsSequenceIntoSplat(IE))
+ return Broadcast;
+
+ if (Instruction *Splat = foldInsEltIntoSplat(IE))
+ return Splat;
+
+ if (Instruction *IdentityShuf = foldInsEltIntoIdentityShuffle(IE))
+ return IdentityShuf;
+
+ return nullptr;
+}
+
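The bitcast hoist near the top of visitInsertElementInst above relies on insert-then-bitcast and bitcast-then-insert producing the same bits lane by lane. A plain-C++ spot check of that equivalence, modeling the undef base vector as zeroed storage (illustrative only):

#include <array>
#include <cstdint>
#include <cstring>

static std::uint32_t bitsOf(float F) {
  std::uint32_t U;
  std::memcpy(&U, &F, sizeof U); // reinterpret the float's bits, like a bitcast
  return U;
}

int main() {
  float Scalar = 3.5f;
  // Path 1: bitcast the scalar first, then insert into a zeroed <4 x i32>.
  std::array<std::uint32_t, 4> A{};
  A[2] = bitsOf(Scalar);
  // Path 2: insert into a zeroed <4 x float>, then bitcast the whole vector.
  std::array<float, 4> VF{};
  VF[2] = Scalar;
  std::array<std::uint32_t, 4> B{};
  std::memcpy(B.data(), VF.data(), sizeof B);
  return A == B ? 0 : 1; // both paths produce the same bit pattern
}
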
+/// Return true if we can evaluate the specified expression tree if the vector
+/// elements were shuffled in a different order.
+static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
+ unsigned Depth = 5) {
+ // We can always reorder the elements of a constant.
+ if (isa<Constant>(V))
+ return true;
+
+ // We won't reorder vector arguments. No IPO here.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) return false;
+
+ // Two users may expect different orders of the elements. Don't try it.
+ if (!I->hasOneUse())
+ return false;
+
+ if (Depth == 0) return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // Propagating an undefined shuffle mask element to integer div/rem is not
+ // allowed because those opcodes can create immediate undefined behavior
+ // from an undefined element in an operand.
if (llvm::is_contained(Mask, -1))
- return false;
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::GetElementPtr: {
- // Bail out if we would create longer vector ops. We could allow creating
- // longer vector ops, but that may result in more expensive codegen.
- Type *ITy = I->getType();
- if (ITy->isVectorTy() &&
+ return false;
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::GetElementPtr: {
+ // Bail out if we would create longer vector ops. We could allow creating
+ // longer vector ops, but that may result in more expensive codegen.
+ Type *ITy = I->getType();
+ if (ITy->isVectorTy() &&
Mask.size() > cast<FixedVectorType>(ITy)->getNumElements())
- return false;
- for (Value *Operand : I->operands()) {
- if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
- return false;
- }
- return true;
- }
- case Instruction::InsertElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
- if (!CI) return false;
- int ElementNumber = CI->getLimitedValue();
-
- // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
- // can't put an element into multiple indices.
- bool SeenOnce = false;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == ElementNumber) {
- if (SeenOnce)
- return false;
- SeenOnce = true;
- }
- }
- return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
- }
- }
- return false;
-}
-
-/// Rebuild a new instruction just like 'I' but with the new operands given.
-/// In the event of a type mismatch, the rebuilt instruction follows the operands' types.
-static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
- // We don't want to use the IRBuilder here because we want the replacement
- // instructions to appear next to 'I', not the builder's insertion point.
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- BinaryOperator *BO = cast<BinaryOperator>(I);
- assert(NewOps.size() == 2 && "binary operator with #ops != 2");
- BinaryOperator *New =
- BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
- NewOps[0], NewOps[1], "", BO);
- if (isa<OverflowingBinaryOperator>(BO)) {
- New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
- New->setHasNoSignedWrap(BO->hasNoSignedWrap());
- }
- if (isa<PossiblyExactOperator>(BO)) {
- New->setIsExact(BO->isExact());
- }
- if (isa<FPMathOperator>(BO))
- New->copyFastMathFlags(I);
- return New;
- }
- case Instruction::ICmp:
- assert(NewOps.size() == 2 && "icmp with #ops != 2");
- return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
- NewOps[0], NewOps[1]);
- case Instruction::FCmp:
- assert(NewOps.size() == 2 && "fcmp with #ops != 2");
- return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
- NewOps[0], NewOps[1]);
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt: {
- // It's possible that the mask has a different number of elements from
- // the original cast. We recompute the destination type to match the mask.
- Type *DestTy = VectorType::get(
- I->getType()->getScalarType(),
- cast<VectorType>(NewOps[0]->getType())->getElementCount());
- assert(NewOps.size() == 1 && "cast with #ops != 1");
- return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
- "", I);
- }
- case Instruction::GetElementPtr: {
- Value *Ptr = NewOps[0];
- ArrayRef<Value*> Idx = NewOps.slice(1);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
- GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
- return GEP;
- }
- }
- llvm_unreachable("failed to rebuild vector instructions");
-}
-
-static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
- // Mask.size() does not need to be equal to the number of vector elements.
-
- assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
- Type *EltTy = V->getType()->getScalarType();
- Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
- if (isa<UndefValue>(V))
- return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
-
- if (isa<ConstantAggregateZero>(V))
- return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
-
- if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
- Mask);
-
- Instruction *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::Select:
- case Instruction::GetElementPtr: {
- SmallVector<Value*, 8> NewOps;
- bool NeedsRebuild =
+ return false;
+ for (Value *Operand : I->operands()) {
+ if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
+ return false;
+ }
+ return true;
+ }
+ case Instruction::InsertElement: {
+ ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
+ if (!CI) return false;
+ int ElementNumber = CI->getLimitedValue();
+
+ // Verify that 'CI' does not occur twice in Mask. A single 'insertelement'
+ // can't put an element into multiple indices.
+ bool SeenOnce = false;
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == ElementNumber) {
+ if (SeenOnce)
+ return false;
+ SeenOnce = true;
+ }
+ }
+ return canEvaluateShuffled(I->getOperand(0), Mask, Depth - 1);
+ }
+ }
+ return false;
+}
+
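The InsertElement case of canEvaluateShuffled above rejects masks that request the inserted lane more than once. A standalone plain-C++ version of that check (illustrative only):

#include <vector>

static bool insertedLaneUsedAtMostOnce(const std::vector<int> &Mask, int ElementNumber) {
  bool SeenOnce = false;
  for (int M : Mask) {
    if (M == ElementNumber) {
      if (SeenOnce)
        return false; // one insertelement cannot feed two result lanes
      SeenOnce = true;
    }
  }
  return true;
}

int main() {
  // Lane 2 is requested twice by the mask, so the insert cannot be reordered.
  return insertedLaneUsedAtMostOnce({2, 0, 1, 2}, 2) ? 1 : 0;
}
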
+/// Rebuild a new instruction just like 'I' but with the new operands given.
+/// In the event of a type mismatch, the rebuilt instruction follows the operands' types.
+static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
+ // We don't want to use the IRBuilder here because we want the replacement
+ // instructions to appear next to 'I', not the builder's insertion point.
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+ assert(NewOps.size() == 2 && "binary operator with #ops != 2");
+ BinaryOperator *New =
+ BinaryOperator::Create(cast<BinaryOperator>(I)->getOpcode(),
+ NewOps[0], NewOps[1], "", BO);
+ if (isa<OverflowingBinaryOperator>(BO)) {
+ New->setHasNoUnsignedWrap(BO->hasNoUnsignedWrap());
+ New->setHasNoSignedWrap(BO->hasNoSignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(BO)) {
+ New->setIsExact(BO->isExact());
+ }
+ if (isa<FPMathOperator>(BO))
+ New->copyFastMathFlags(I);
+ return New;
+ }
+ case Instruction::ICmp:
+ assert(NewOps.size() == 2 && "icmp with #ops != 2");
+ return new ICmpInst(I, cast<ICmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::FCmp:
+ assert(NewOps.size() == 2 && "fcmp with #ops != 2");
+ return new FCmpInst(I, cast<FCmpInst>(I)->getPredicate(),
+ NewOps[0], NewOps[1]);
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt: {
+ // It's possible that the mask has a different number of elements from
+ // the original cast. We recompute the destination type to match the mask.
+ Type *DestTy = VectorType::get(
+ I->getType()->getScalarType(),
+ cast<VectorType>(NewOps[0]->getType())->getElementCount());
+ assert(NewOps.size() == 1 && "cast with #ops != 1");
+ return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
+ "", I);
+ }
+ case Instruction::GetElementPtr: {
+ Value *Ptr = NewOps[0];
+ ArrayRef<Value*> Idx = NewOps.slice(1);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ cast<GetElementPtrInst>(I)->getSourceElementType(), Ptr, Idx, "", I);
+ GEP->setIsInBounds(cast<GetElementPtrInst>(I)->isInBounds());
+ return GEP;
+ }
+ }
+ llvm_unreachable("failed to rebuild vector instructions");
+}
+
+static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
+ // Mask.size() does not need to be equal to the number of vector elements.
+
+ assert(V->getType()->isVectorTy() && "can't reorder non-vector elements");
+ Type *EltTy = V->getType()->getScalarType();
+ Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
+ if (isa<UndefValue>(V))
+ return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
+
+ if (isa<ConstantAggregateZero>(V))
+ return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
+
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
+ Mask);
+
+ Instruction *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::Select:
+ case Instruction::GetElementPtr: {
+ SmallVector<Value*, 8> NewOps;
+ bool NeedsRebuild =
(Mask.size() !=
cast<FixedVectorType>(I->getType())->getNumElements());
- for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *V;
- // Recursively call evaluateInDifferentElementOrder on vector arguments
- // as well. E.g. GetElementPtr may have scalar operands even if the
- // return value is a vector, so we need to examine the operand type.
- if (I->getOperand(i)->getType()->isVectorTy())
- V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
- else
- V = I->getOperand(i);
- NewOps.push_back(V);
- NeedsRebuild |= (V != I->getOperand(i));
- }
- if (NeedsRebuild) {
- return buildNew(I, NewOps);
- }
- return I;
- }
- case Instruction::InsertElement: {
- int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
-
- // The insertelement was inserting at Element. Figure out which element
- // that becomes after shuffling. The answer is guaranteed to be unique
- // by CanEvaluateShuffled.
- bool Found = false;
- int Index = 0;
- for (int e = Mask.size(); Index != e; ++Index) {
- if (Mask[Index] == Element) {
- Found = true;
- break;
- }
- }
-
-    // If the element is not in Mask, operand 1 (the element to be inserted)
-    // can be ignored. Just evaluate values in operand 0 according to Mask.
- if (!Found)
- return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
-
- Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
- return InsertElementInst::Create(V, I->getOperand(1),
- ConstantInt::get(I32Ty, Index), "", I);
- }
- }
- llvm_unreachable("failed to reorder elements of vector instruction!");
-}
-
-// Returns true if the shuffle is extracting a contiguous range of values from
-// LHS, for example:
-// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
-// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
-// Shuffles to: |EE|FF|GG|HH|
-// +--+--+--+--+
-static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
- ArrayRef<int> Mask) {
- unsigned LHSElems =
+ for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *V;
+ // Recursively call evaluateInDifferentElementOrder on vector arguments
+ // as well. E.g. GetElementPtr may have scalar operands even if the
+ // return value is a vector, so we need to examine the operand type.
+ if (I->getOperand(i)->getType()->isVectorTy())
+ V = evaluateInDifferentElementOrder(I->getOperand(i), Mask);
+ else
+ V = I->getOperand(i);
+ NewOps.push_back(V);
+ NeedsRebuild |= (V != I->getOperand(i));
+ }
+ if (NeedsRebuild) {
+ return buildNew(I, NewOps);
+ }
+ return I;
+ }
+ case Instruction::InsertElement: {
+ int Element = cast<ConstantInt>(I->getOperand(2))->getLimitedValue();
+
+ // The insertelement was inserting at Element. Figure out which element
+ // that becomes after shuffling. The answer is guaranteed to be unique
+ // by CanEvaluateShuffled.
+ bool Found = false;
+ int Index = 0;
+ for (int e = Mask.size(); Index != e; ++Index) {
+ if (Mask[Index] == Element) {
+ Found = true;
+ break;
+ }
+ }
+
+    // If the element is not in Mask, operand 1 (the element to be inserted)
+    // can be ignored. Just evaluate values in operand 0 according to Mask.
+ if (!Found)
+ return evaluateInDifferentElementOrder(I->getOperand(0), Mask);
+
+ Value *V = evaluateInDifferentElementOrder(I->getOperand(0), Mask);
+ return InsertElementInst::Create(V, I->getOperand(1),
+ ConstantInt::get(I32Ty, Index), "", I);
+ }
+ }
+ llvm_unreachable("failed to reorder elements of vector instruction!");
+}
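A minimal standalone sketch of the InsertElement case above: the fold reduces to one scan of the mask for the original insert index. Plain std::vector<int> masks and illustrative names are used here instead of the LLVM types.

    #include <cassert>
    #include <vector>

    // Returns the lane an element originally inserted at OldIndex lands in
    // after applying Mask, or -1 if the mask never reads that lane (in which
    // case the insertelement is irrelevant and only operand 0 matters).
    int remapInsertIndex(const std::vector<int> &Mask, int OldIndex) {
      for (int i = 0, e = (int)Mask.size(); i != e; ++i)
        if (Mask[i] == OldIndex)
          return i; // canEvaluateShuffled guarantees at most one match.
      return -1;
    }

    int main() {
      // An element inserted at lane 2, shuffled by mask <3,2,1,0>, lands in lane 1.
      assert(remapInsertIndex({3, 2, 1, 0}, 2) == 1);
      // Mask <0,1,3,undef(-1)> never reads lane 2, so the insert can be skipped.
      assert(remapInsertIndex({0, 1, 3, -1}, 2) == -1);
      return 0;
    }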
+
+// Returns true if the shuffle is extracting a contiguous range of values from
+// LHS, for example:
+// +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+// Input: |AA|BB|CC|DD|EE|FF|GG|HH|II|JJ|KK|LL|MM|NN|OO|PP|
+// Shuffles to: |EE|FF|GG|HH|
+// +--+--+--+--+
+static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
+ ArrayRef<int> Mask) {
+ unsigned LHSElems =
cast<FixedVectorType>(SVI.getOperand(0)->getType())->getNumElements();
- unsigned MaskElems = Mask.size();
- unsigned BegIdx = Mask.front();
- unsigned EndIdx = Mask.back();
- if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
- return false;
- for (unsigned I = 0; I != MaskElems; ++I)
- if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
- return false;
- return true;
-}
-
-/// These are the ingredients in an alternate form binary operator as described
-/// below.
-struct BinopElts {
- BinaryOperator::BinaryOps Opcode;
- Value *Op0;
- Value *Op1;
- BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
- Value *V0 = nullptr, Value *V1 = nullptr) :
- Opcode(Opc), Op0(V0), Op1(V1) {}
- operator bool() const { return Opcode != 0; }
-};
-
-/// Binops may be transformed into binops with different opcodes and operands.
-/// Reverse the usual canonicalization to enable folds with the non-canonical
-/// form of the binop. If a transform is possible, return the elements of the
-/// new binop. If not, return invalid elements.
-static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
- Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
- Type *Ty = BO->getType();
- switch (BO->getOpcode()) {
- case Instruction::Shl: {
- // shl X, C --> mul X, (1 << C)
- Constant *C;
- if (match(BO1, m_Constant(C))) {
- Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
- return { Instruction::Mul, BO0, ShlOne };
- }
- break;
- }
- case Instruction::Or: {
- // or X, C --> add X, C (when X and C have no common bits set)
- const APInt *C;
- if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
- return { Instruction::Add, BO0, BO1 };
- break;
- }
- default:
- break;
- }
- return {};
-}
-
-static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
- assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
-
- // Are we shuffling together some value and that same value after it has been
- // modified by a binop with a constant?
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- Constant *C;
- bool Op0IsBinop;
- if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
- Op0IsBinop = true;
- else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
- Op0IsBinop = false;
- else
- return nullptr;
-
- // The identity constant for a binop leaves a variable operand unchanged. For
- // a vector, this is a splat of something like 0, -1, or 1.
- // If there's no identity constant for this binop, we're done.
- auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
- BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
- Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
- if (!IdC)
- return nullptr;
-
- // Shuffle identity constants into the lanes that return the original value.
- // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
- // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
- // The existing binop constant vector remains in the same operand position.
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
- ConstantExpr::getShuffleVector(IdC, C, Mask);
-
- bool MightCreatePoisonOrUB =
- is_contained(Mask, UndefMaskElem) &&
- (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
- if (MightCreatePoisonOrUB)
+ unsigned MaskElems = Mask.size();
+ unsigned BegIdx = Mask.front();
+ unsigned EndIdx = Mask.back();
+ if (BegIdx > EndIdx || EndIdx >= LHSElems || EndIdx - BegIdx != MaskElems - 1)
+ return false;
+ for (unsigned I = 0; I != MaskElems; ++I)
+ if (static_cast<unsigned>(Mask[I]) != BegIdx + I)
+ return false;
+ return true;
+}
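The contiguity test above can be exercised in isolation. A small self-contained sketch over plain integer masks (illustrative names, not the LLVM helpers):

    #include <cassert>
    #include <vector>

    // True if Mask picks the contiguous run BegIdx..BegIdx+Mask.size()-1
    // entirely from a LHS vector of LHSElems elements.
    bool extractsContiguousRange(const std::vector<int> &Mask,
                                 unsigned LHSElems) {
      if (Mask.empty())
        return false;
      unsigned BegIdx = Mask.front();
      unsigned EndIdx = Mask.back();
      if (BegIdx > EndIdx || EndIdx >= LHSElems ||
          EndIdx - BegIdx != Mask.size() - 1)
        return false;
      for (unsigned I = 0; I != Mask.size(); ++I)
        if ((unsigned)Mask[I] != BegIdx + I)
          return false;
      return true;
    }

    int main() {
      // Extracting elements 4..7 of a 16-wide vector is contiguous.
      assert(extractsContiguousRange({4, 5, 6, 7}, 16));
      // A permuted or gapped mask is not.
      assert(!extractsContiguousRange({4, 6, 5, 7}, 16));
      return 0;
    }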
+
+/// These are the ingredients in an alternate form binary operator as described
+/// below.
+struct BinopElts {
+ BinaryOperator::BinaryOps Opcode;
+ Value *Op0;
+ Value *Op1;
+ BinopElts(BinaryOperator::BinaryOps Opc = (BinaryOperator::BinaryOps)0,
+ Value *V0 = nullptr, Value *V1 = nullptr) :
+ Opcode(Opc), Op0(V0), Op1(V1) {}
+ operator bool() const { return Opcode != 0; }
+};
+
+/// Binops may be transformed into binops with different opcodes and operands.
+/// Reverse the usual canonicalization to enable folds with the non-canonical
+/// form of the binop. If a transform is possible, return the elements of the
+/// new binop. If not, return invalid elements.
+static BinopElts getAlternateBinop(BinaryOperator *BO, const DataLayout &DL) {
+ Value *BO0 = BO->getOperand(0), *BO1 = BO->getOperand(1);
+ Type *Ty = BO->getType();
+ switch (BO->getOpcode()) {
+ case Instruction::Shl: {
+ // shl X, C --> mul X, (1 << C)
+ Constant *C;
+ if (match(BO1, m_Constant(C))) {
+ Constant *ShlOne = ConstantExpr::getShl(ConstantInt::get(Ty, 1), C);
+ return { Instruction::Mul, BO0, ShlOne };
+ }
+ break;
+ }
+ case Instruction::Or: {
+ // or X, C --> add X, C (when X and C have no common bits set)
+ const APInt *C;
+ if (match(BO1, m_APInt(C)) && MaskedValueIsZero(BO0, *C, DL))
+ return { Instruction::Add, BO0, BO1 };
+ break;
+ }
+ default:
+ break;
+ }
+ return {};
+}
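Both rewrites recognized here rest on simple integer identities; a short standalone check with ordinary unsigned arithmetic (illustrative values) is:

    #include <cassert>
    #include <cstdint>

    int main() {
      // shl X, C  ==  mul X, (1 << C): both scale X by the same power of two.
      uint32_t X = 0x1234u, C = 3;
      assert((X << C) == X * (1u << C));

      // or Y, K  ==  add Y, K when Y and K share no set bits: there is no
      // carry, so each result bit comes from exactly one operand.
      uint32_t Y = 0xFF0Fu, K = 0x00F0u;
      assert((Y & K) == 0u);
      assert((Y | K) == Y + K);
      return 0;
    }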
+
+static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
+ assert(Shuf.isSelect() && "Must have select-equivalent shuffle");
+
+ // Are we shuffling together some value and that same value after it has been
+ // modified by a binop with a constant?
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ Constant *C;
+ bool Op0IsBinop;
+ if (match(Op0, m_BinOp(m_Specific(Op1), m_Constant(C))))
+ Op0IsBinop = true;
+ else if (match(Op1, m_BinOp(m_Specific(Op0), m_Constant(C))))
+ Op0IsBinop = false;
+ else
+ return nullptr;
+
+ // The identity constant for a binop leaves a variable operand unchanged. For
+ // a vector, this is a splat of something like 0, -1, or 1.
+ // If there's no identity constant for this binop, we're done.
+ auto *BO = cast<BinaryOperator>(Op0IsBinop ? Op0 : Op1);
+ BinaryOperator::BinaryOps BOpcode = BO->getOpcode();
+ Constant *IdC = ConstantExpr::getBinOpIdentity(BOpcode, Shuf.getType(), true);
+ if (!IdC)
+ return nullptr;
+
+ // Shuffle identity constants into the lanes that return the original value.
+ // Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
+ // Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
+ // The existing binop constant vector remains in the same operand position.
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
+ ConstantExpr::getShuffleVector(IdC, C, Mask);
+
+ bool MightCreatePoisonOrUB =
+ is_contained(Mask, UndefMaskElem) &&
+ (Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
+ if (MightCreatePoisonOrUB)
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpcode, NewC, true);
-
- // shuf (bop X, C), X, M --> bop X, C'
- // shuf X, (bop X, C), M --> bop X, C'
- Value *X = Op0IsBinop ? Op1 : Op0;
- Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
- NewBO->copyIRFlags(BO);
-
- // An undef shuffle mask element may propagate as an undef constant element in
- // the new binop. That would produce poison where the original code might not.
- // If we already made a safe constant, then there's no danger.
- if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
- NewBO->dropPoisonGeneratingFlags();
- return NewBO;
-}
-
-/// If we have an insert of a scalar to a non-zero element of an undefined
-/// vector and then shuffle that value, that's the same as inserting to the zero
-/// element and shuffling. Splatting from the zero element is recognized as the
-/// canonical form of splat.
-static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder) {
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Value *X;
- uint64_t IndexC;
-
- // Match a shuffle that is a splat to a non-zero element.
- if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
- m_ConstantInt(IndexC)))) ||
- !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
- return nullptr;
-
- // Insert into element 0 of an undef vector.
- UndefValue *UndefVec = UndefValue::get(Shuf.getType());
- Constant *Zero = Builder.getInt32(0);
- Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
-
- // Splat from element 0. Any mask element that is undefined remains undefined.
- // For example:
- // shuf (inselt undef, X, 2), undef, <2,2,undef>
- // --> shuf (inselt undef, X, 0), undef, <0,0,undef>
+
+ // shuf (bop X, C), X, M --> bop X, C'
+ // shuf X, (bop X, C), M --> bop X, C'
+ Value *X = Op0IsBinop ? Op1 : Op0;
+ Instruction *NewBO = BinaryOperator::Create(BOpcode, X, NewC);
+ NewBO->copyIRFlags(BO);
+
+ // An undef shuffle mask element may propagate as an undef constant element in
+ // the new binop. That would produce poison where the original code might not.
+ // If we already made a safe constant, then there's no danger.
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
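The second example in the comment above can be verified lane by lane with a plain integer model. This is only a sketch of the reasoning, not the ConstantExpr machinery; the helper names and values are illustrative.

    #include <cassert>
    #include <vector>

    // Lane model of: shuffle A, B, Mask. Index i < A.size() reads A[i];
    // larger indices read B[i - A.size()] (no undef lanes used here).
    std::vector<int> shuffleLanes(const std::vector<int> &A,
                                  const std::vector<int> &B,
                                  const std::vector<int> &Mask) {
      std::vector<int> R;
      for (int M : Mask)
        R.push_back(M < (int)A.size() ? A[M] : B[M - A.size()]);
      return R;
    }

    std::vector<int> addLanes(const std::vector<int> &A,
                              const std::vector<int> &B) {
      std::vector<int> R;
      for (size_t i = 0; i != A.size(); ++i)
        R.push_back(A[i] + B[i]);
      return R;
    }

    int main() {
      // From the comment: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7}
      // equals add X, {0,0,-3,-4} (0 is the additive identity).
      std::vector<int> X = {10, 20, 30, 40};
      std::vector<int> C = {-1, -2, -3, -4};
      std::vector<int> Mask = {0, 1, 6, 7};
      std::vector<int> Lhs = shuffleLanes(X, addLanes(X, C), Mask);
      std::vector<int> Rhs = addLanes(X, {0, 0, -3, -4});
      assert(Lhs == Rhs);
      return 0;
    }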
+
+/// If we have an insert of a scalar to a non-zero element of an undefined
+/// vector and then shuffle that value, that's the same as inserting to the zero
+/// element and shuffling. Splatting from the zero element is recognized as the
+/// canonical form of splat.
+static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Value *X;
+ uint64_t IndexC;
+
+ // Match a shuffle that is a splat to a non-zero element.
+ if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
+ m_ConstantInt(IndexC)))) ||
+ !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
+ return nullptr;
+
+ // Insert into element 0 of an undef vector.
+ UndefValue *UndefVec = UndefValue::get(Shuf.getType());
+ Constant *Zero = Builder.getInt32(0);
+ Value *NewIns = Builder.CreateInsertElement(UndefVec, X, Zero);
+
+ // Splat from element 0. Any mask element that is undefined remains undefined.
+ // For example:
+ // shuf (inselt undef, X, 2), undef, <2,2,undef>
+ // --> shuf (inselt undef, X, 0), undef, <0,0,undef>
unsigned NumMaskElts =
cast<FixedVectorType>(Shuf.getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumMaskElts, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (Mask[i] == UndefMaskElem)
- NewMask[i] = Mask[i];
-
- return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
-}
-
-/// Try to fold shuffles that are the equivalent of a vector select.
-static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder,
- const DataLayout &DL) {
- if (!Shuf.isSelect())
- return nullptr;
-
- // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
- // Commuting undef to operand 0 conflicts with another canonicalization.
+ SmallVector<int, 16> NewMask(NumMaskElts, 0);
+ for (unsigned i = 0; i != NumMaskElts; ++i)
+ if (Mask[i] == UndefMaskElem)
+ NewMask[i] = Mask[i];
+
+ return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
+}
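The mask rewrite performed here is small enough to model directly: every lane becomes 0 except that undef (-1) lanes are preserved. A standalone sketch with illustrative names:

    #include <cassert>
    #include <vector>

    // A splat that read lane IndexC becomes a splat from lane 0; undef lanes
    // (-1) stay undef.
    std::vector<int> canonicalSplatMask(const std::vector<int> &Mask) {
      std::vector<int> NewMask(Mask.size(), 0);
      for (size_t i = 0; i != Mask.size(); ++i)
        if (Mask[i] == -1)
          NewMask[i] = -1;
      return NewMask;
    }

    int main() {
      // shuf (inselt undef, X, 2), undef, <2,2,undef>
      //   --> shuf (inselt undef, X, 0), undef, <0,0,undef>
      std::vector<int> NewMask = canonicalSplatMask({2, 2, -1});
      assert((NewMask == std::vector<int>{0, 0, -1}));
      return 0;
    }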
+
+/// Try to fold shuffles that are the equivalent of a vector select.
+static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder,
+ const DataLayout &DL) {
+ if (!Shuf.isSelect())
+ return nullptr;
+
+ // Canonicalize to choose from operand 0 first unless operand 1 is undefined.
+ // Commuting undef to operand 0 conflicts with another canonicalization.
unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
- if (!isa<UndefValue>(Shuf.getOperand(1)) &&
- Shuf.getMaskValue(0) >= (int)NumElts) {
- // TODO: Can we assert that both operands of a shuffle-select are not undef
-    // (otherwise, it would have been folded by instsimplify)?
- Shuf.commute();
- return &Shuf;
- }
-
- if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
- return I;
-
- BinaryOperator *B0, *B1;
- if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
- !match(Shuf.getOperand(1), m_BinOp(B1)))
- return nullptr;
-
- Value *X, *Y;
- Constant *C0, *C1;
- bool ConstantsAreOp1;
- if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
- match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
- ConstantsAreOp1 = true;
- else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
- match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
- ConstantsAreOp1 = false;
- else
- return nullptr;
-
- // We need matching binops to fold the lanes together.
- BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
- BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
- bool DropNSW = false;
- if (ConstantsAreOp1 && Opc0 != Opc1) {
- // TODO: We drop "nsw" if shift is converted into multiply because it may
- // not be correct when the shift amount is BitWidth - 1. We could examine
- // each vector element to determine if it is safe to keep that flag.
- if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
- DropNSW = true;
- if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
- assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
- Opc0 = AltB0.Opcode;
- C0 = cast<Constant>(AltB0.Op1);
- } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
- assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
- Opc1 = AltB1.Opcode;
- C1 = cast<Constant>(AltB1.Op1);
- }
- }
-
- if (Opc0 != Opc1)
- return nullptr;
-
- // The opcodes must be the same. Use a new name to make that clear.
- BinaryOperator::BinaryOps BOpc = Opc0;
-
- // Select the constant elements needed for the single binop.
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
-
- // We are moving a binop after a shuffle. When a shuffle has an undefined
- // mask element, the result is undefined, but it is not poison or undefined
- // behavior. That is not necessarily true for div/rem/shift.
- bool MightCreatePoisonOrUB =
- is_contained(Mask, UndefMaskElem) &&
- (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
- if (MightCreatePoisonOrUB)
+ if (!isa<UndefValue>(Shuf.getOperand(1)) &&
+ Shuf.getMaskValue(0) >= (int)NumElts) {
+ // TODO: Can we assert that both operands of a shuffle-select are not undef
+    // (otherwise, it would have been folded by instsimplify)?
+ Shuf.commute();
+ return &Shuf;
+ }
+
+ if (Instruction *I = foldSelectShuffleWith1Binop(Shuf))
+ return I;
+
+ BinaryOperator *B0, *B1;
+ if (!match(Shuf.getOperand(0), m_BinOp(B0)) ||
+ !match(Shuf.getOperand(1), m_BinOp(B1)))
+ return nullptr;
+
+ Value *X, *Y;
+ Constant *C0, *C1;
+ bool ConstantsAreOp1;
+ if (match(B0, m_BinOp(m_Value(X), m_Constant(C0))) &&
+ match(B1, m_BinOp(m_Value(Y), m_Constant(C1))))
+ ConstantsAreOp1 = true;
+ else if (match(B0, m_BinOp(m_Constant(C0), m_Value(X))) &&
+ match(B1, m_BinOp(m_Constant(C1), m_Value(Y))))
+ ConstantsAreOp1 = false;
+ else
+ return nullptr;
+
+ // We need matching binops to fold the lanes together.
+ BinaryOperator::BinaryOps Opc0 = B0->getOpcode();
+ BinaryOperator::BinaryOps Opc1 = B1->getOpcode();
+ bool DropNSW = false;
+ if (ConstantsAreOp1 && Opc0 != Opc1) {
+ // TODO: We drop "nsw" if shift is converted into multiply because it may
+ // not be correct when the shift amount is BitWidth - 1. We could examine
+ // each vector element to determine if it is safe to keep that flag.
+ if (Opc0 == Instruction::Shl || Opc1 == Instruction::Shl)
+ DropNSW = true;
+ if (BinopElts AltB0 = getAlternateBinop(B0, DL)) {
+ assert(isa<Constant>(AltB0.Op1) && "Expecting constant with alt binop");
+ Opc0 = AltB0.Opcode;
+ C0 = cast<Constant>(AltB0.Op1);
+ } else if (BinopElts AltB1 = getAlternateBinop(B1, DL)) {
+ assert(isa<Constant>(AltB1.Op1) && "Expecting constant with alt binop");
+ Opc1 = AltB1.Opcode;
+ C1 = cast<Constant>(AltB1.Op1);
+ }
+ }
+
+ if (Opc0 != Opc1)
+ return nullptr;
+
+ // The opcodes must be the same. Use a new name to make that clear.
+ BinaryOperator::BinaryOps BOpc = Opc0;
+
+ // Select the constant elements needed for the single binop.
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
+
+ // We are moving a binop after a shuffle. When a shuffle has an undefined
+ // mask element, the result is undefined, but it is not poison or undefined
+ // behavior. That is not necessarily true for div/rem/shift.
+ bool MightCreatePoisonOrUB =
+ is_contained(Mask, UndefMaskElem) &&
+ (Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
+ if (MightCreatePoisonOrUB)
NewC = InstCombiner::getSafeVectorConstantForBinop(BOpc, NewC,
ConstantsAreOp1);
-
- Value *V;
- if (X == Y) {
- // Remove a binop and the shuffle by rearranging the constant:
- // shuffle (op V, C0), (op V, C1), M --> op V, C'
- // shuffle (op C0, V), (op C1, V), M --> op C', V
- V = X;
- } else {
- // If there are 2 different variable operands, we must create a new shuffle
- // (select) first, so check uses to ensure that we don't end up with more
- // instructions than we started with.
- if (!B0->hasOneUse() && !B1->hasOneUse())
- return nullptr;
-
- // If we use the original shuffle mask and op1 is *variable*, we would be
- // putting an undef into operand 1 of div/rem/shift. This is either UB or
- // poison. We do not have to guard against UB when *constants* are op1
- // because safe constants guarantee that we do not overflow sdiv/srem (and
- // there's no danger for other opcodes).
- // TODO: To allow this case, create a new shuffle mask with no undefs.
- if (MightCreatePoisonOrUB && !ConstantsAreOp1)
- return nullptr;
-
- // Note: In general, we do not create new shuffles in InstCombine because we
- // do not know if a target can lower an arbitrary shuffle optimally. In this
- // case, the shuffle uses the existing mask, so there is no additional risk.
-
- // Select the variable vectors first, then perform the binop:
- // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
- // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
- V = Builder.CreateShuffleVector(X, Y, Mask);
- }
-
- Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
- BinaryOperator::Create(BOpc, NewC, V);
-
- // Flags are intersected from the 2 source binops. But there are 2 exceptions:
- // 1. If we changed an opcode, poison conditions might have changed.
- // 2. If the shuffle had undef mask elements, the new binop might have undefs
- // where the original code did not. But if we already made a safe constant,
- // then there's no danger.
- NewBO->copyIRFlags(B0);
- NewBO->andIRFlags(B1);
- if (DropNSW)
- NewBO->setHasNoSignedWrap(false);
- if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
- NewBO->dropPoisonGeneratingFlags();
- return NewBO;
-}
-
-/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
-/// Example (little endian):
-/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
-static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
- bool IsBigEndian) {
- // This must be a bitcasted shuffle of 1 vector integer operand.
- Type *DestType = Shuf.getType();
- Value *X;
- if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
- !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
- return nullptr;
-
- // The source type must have the same number of elements as the shuffle,
- // and the source element type must be larger than the shuffle element type.
- Type *SrcType = X->getType();
- if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
+
+ Value *V;
+ if (X == Y) {
+ // Remove a binop and the shuffle by rearranging the constant:
+ // shuffle (op V, C0), (op V, C1), M --> op V, C'
+ // shuffle (op C0, V), (op C1, V), M --> op C', V
+ V = X;
+ } else {
+ // If there are 2 different variable operands, we must create a new shuffle
+ // (select) first, so check uses to ensure that we don't end up with more
+ // instructions than we started with.
+ if (!B0->hasOneUse() && !B1->hasOneUse())
+ return nullptr;
+
+ // If we use the original shuffle mask and op1 is *variable*, we would be
+ // putting an undef into operand 1 of div/rem/shift. This is either UB or
+ // poison. We do not have to guard against UB when *constants* are op1
+ // because safe constants guarantee that we do not overflow sdiv/srem (and
+ // there's no danger for other opcodes).
+ // TODO: To allow this case, create a new shuffle mask with no undefs.
+ if (MightCreatePoisonOrUB && !ConstantsAreOp1)
+ return nullptr;
+
+ // Note: In general, we do not create new shuffles in InstCombine because we
+ // do not know if a target can lower an arbitrary shuffle optimally. In this
+ // case, the shuffle uses the existing mask, so there is no additional risk.
+
+ // Select the variable vectors first, then perform the binop:
+ // shuffle (op X, C0), (op Y, C1), M --> op (shuffle X, Y, M), C'
+ // shuffle (op C0, X), (op C1, Y), M --> op C', (shuffle X, Y, M)
+ V = Builder.CreateShuffleVector(X, Y, Mask);
+ }
+
+ Instruction *NewBO = ConstantsAreOp1 ? BinaryOperator::Create(BOpc, V, NewC) :
+ BinaryOperator::Create(BOpc, NewC, V);
+
+ // Flags are intersected from the 2 source binops. But there are 2 exceptions:
+ // 1. If we changed an opcode, poison conditions might have changed.
+ // 2. If the shuffle had undef mask elements, the new binop might have undefs
+ // where the original code did not. But if we already made a safe constant,
+ // then there's no danger.
+ NewBO->copyIRFlags(B0);
+ NewBO->andIRFlags(B1);
+ if (DropNSW)
+ NewBO->setHasNoSignedWrap(false);
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
+ NewBO->dropPoisonGeneratingFlags();
+ return NewBO;
+}
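The core claim of this fold is that the binop commutes with the lane selection. A lane-wise sanity check for add with an arbitrary select mask (plain C++, illustrative values):

    #include <cassert>
    #include <vector>

    // shuffle (add X, C0), (add Y, C1), M  ==
    //   add (shuffle X, Y, M), (shuffle C0, C1, M), checked per lane.
    int main() {
      std::vector<int> X  = {1, 2, 3, 4},     Y  = {5, 6, 7, 8};
      std::vector<int> C0 = {10, 20, 30, 40}, C1 = {50, 60, 70, 80};
      std::vector<int> M  = {0, 5, 2, 7};  // each lane selects from op0 or op1
      const int N = 4;
      for (int i = 0; i != N; ++i) {
        int m = M[i];
        int lhs    = m < N ? X[m] + C0[m] : Y[m - N] + C1[m - N];
        int shufXY = m < N ? X[m]  : Y[m - N];
        int shufC  = m < N ? C0[m] : C1[m - N];
        assert(lhs == shufXY + shufC);
      }
      return 0;
    }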
+
+/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
+/// Example (little endian):
+/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
+static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
+ bool IsBigEndian) {
+ // This must be a bitcasted shuffle of 1 vector integer operand.
+ Type *DestType = Shuf.getType();
+ Value *X;
+ if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
+ !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
+ return nullptr;
+
+ // The source type must have the same number of elements as the shuffle,
+ // and the source element type must be larger than the shuffle element type.
+ Type *SrcType = X->getType();
+ if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
cast<FixedVectorType>(SrcType)->getNumElements() !=
cast<FixedVectorType>(DestType)->getNumElements() ||
- SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
- return nullptr;
-
- assert(Shuf.changesLength() && !Shuf.increasesLength() &&
- "Expected a shuffle that decreases length");
-
- // Last, check that the mask chooses the correct low bits for each narrow
- // element in the result.
- uint64_t TruncRatio =
- SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == UndefMaskElem)
- continue;
- uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
+ SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
+ return nullptr;
+
+ assert(Shuf.changesLength() && !Shuf.increasesLength() &&
+ "Expected a shuffle that decreases length");
+
+ // Last, check that the mask chooses the correct low bits for each narrow
+ // element in the result.
+ uint64_t TruncRatio =
+ SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == UndefMaskElem)
+ continue;
+ uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
assert(LSBIndex <= INT32_MAX && "Overflowed 32-bits");
- if (Mask[i] != (int)LSBIndex)
- return nullptr;
- }
-
- return new TruncInst(X, DestType);
-}
-
-/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
-/// narrowing (concatenating with undef and extracting back to the original
-/// length). This allows replacing the wide select with a narrow select.
-static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
- InstCombiner::BuilderTy &Builder) {
- // This must be a narrowing identity shuffle. It extracts the 1st N elements
- // of the 1st vector operand of a shuffle.
- if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
- return nullptr;
-
- // The vector being shuffled must be a vector select that we can eliminate.
- // TODO: The one-use requirement could be eased if X and/or Y are constants.
- Value *Cond, *X, *Y;
- if (!match(Shuf.getOperand(0),
- m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
- return nullptr;
-
- // We need a narrow condition value. It must be extended with undef elements
- // and have the same number of elements as this shuffle.
+ if (Mask[i] != (int)LSBIndex)
+ return nullptr;
+ }
+
+ return new TruncInst(X, DestType);
+}
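The endian-dependent index check above matches the <4 x i16> to <8 x i8> example from the function comment. A standalone sketch that recomputes the expected mask for both byte orders (illustrative names):

    #include <cassert>
    #include <vector>

    // Which narrow (i8) element holds the low byte of each wide (i16) element
    // after a bitcast. TruncRatio = 16 / 8 = 2.
    std::vector<int> lowBitsMask(unsigned NumWideElts, unsigned TruncRatio,
                                 bool IsBigEndian) {
      std::vector<int> Mask;
      for (unsigned i = 0; i != NumWideElts; ++i)
        Mask.push_back(IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio);
      return Mask;
    }

    int main() {
      // Little endian: the low byte of each i16 is the first byte: 0,2,4,6.
      assert((lowBitsMask(4, 2, false) == std::vector<int>{0, 2, 4, 6}));
      // Big endian: the low byte of each i16 is the second byte: 1,3,5,7.
      assert((lowBitsMask(4, 2, true) == std::vector<int>{1, 3, 5, 7}));
      return 0;
    }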
+
+/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
+/// narrowing (concatenating with undef and extracting back to the original
+/// length). This allows replacing the wide select with a narrow select.
+static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
+ InstCombiner::BuilderTy &Builder) {
+ // This must be a narrowing identity shuffle. It extracts the 1st N elements
+ // of the 1st vector operand of a shuffle.
+ if (!match(Shuf.getOperand(1), m_Undef()) || !Shuf.isIdentityWithExtract())
+ return nullptr;
+
+ // The vector being shuffled must be a vector select that we can eliminate.
+ // TODO: The one-use requirement could be eased if X and/or Y are constants.
+ Value *Cond, *X, *Y;
+ if (!match(Shuf.getOperand(0),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(X), m_Value(Y)))))
+ return nullptr;
+
+ // We need a narrow condition value. It must be extended with undef elements
+ // and have the same number of elements as this shuffle.
unsigned NarrowNumElts =
cast<FixedVectorType>(Shuf.getType())->getNumElements();
- Value *NarrowCond;
- if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
+ Value *NarrowCond;
+ if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
cast<FixedVectorType>(NarrowCond->getType())->getNumElements() !=
- NarrowNumElts ||
- !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
- return nullptr;
-
-  // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask -->
- // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
+ NarrowNumElts ||
+ !cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
+ return nullptr;
+
+  // shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask -->
+ // sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
Value *NarrowX = Builder.CreateShuffleVector(X, Shuf.getShuffleMask());
Value *NarrowY = Builder.CreateShuffleVector(Y, Shuf.getShuffleMask());
- return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
-}
-
-/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
-static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
- Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
- return nullptr;
-
- Value *X, *Y;
- ArrayRef<int> Mask;
- if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
- return nullptr;
-
- // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
- // then combining may result in worse codegen.
- if (!Op0->hasOneUse())
- return nullptr;
-
- // We are extracting a subvector from a shuffle. Remove excess elements from
- // the 1st shuffle mask to eliminate the extract.
- //
- // This transform is conservatively limited to identity extracts because we do
- // not allow arbitrary shuffle mask creation as a target-independent transform
- // (because we can't guarantee that will lower efficiently).
- //
- // If the extracting shuffle has an undef mask element, it transfers to the
- // new shuffle mask. Otherwise, copy the original mask element. Example:
- // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
- // shuf X, Y, <C0, undef, C2, undef>
+ return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
+}
+
+/// Try to combine 2 shuffles into 1 shuffle by concatenating a shuffle mask.
+static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
+ Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
+ if (!Shuf.isIdentityWithExtract() || !isa<UndefValue>(Op1))
+ return nullptr;
+
+ Value *X, *Y;
+ ArrayRef<int> Mask;
+ if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
+ return nullptr;
+
+ // Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
+ // then combining may result in worse codegen.
+ if (!Op0->hasOneUse())
+ return nullptr;
+
+ // We are extracting a subvector from a shuffle. Remove excess elements from
+ // the 1st shuffle mask to eliminate the extract.
+ //
+ // This transform is conservatively limited to identity extracts because we do
+ // not allow arbitrary shuffle mask creation as a target-independent transform
+ // (because we can't guarantee that will lower efficiently).
+ //
+ // If the extracting shuffle has an undef mask element, it transfers to the
+ // new shuffle mask. Otherwise, copy the original mask element. Example:
+ // shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
+ // shuf X, Y, <C0, undef, C2, undef>
unsigned NumElts = cast<FixedVectorType>(Shuf.getType())->getNumElements();
- SmallVector<int, 16> NewMask(NumElts);
- assert(NumElts < Mask.size() &&
- "Identity with extract must have less elements than its inputs");
-
- for (unsigned i = 0; i != NumElts; ++i) {
- int ExtractMaskElt = Shuf.getMaskValue(i);
- int MaskElt = Mask[i];
- NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
- }
- return new ShuffleVectorInst(X, Y, NewMask);
-}
-
-/// Try to replace a shuffle with an insertelement or try to replace a shuffle
-/// operand with the operand of an insertelement.
-static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
+ SmallVector<int, 16> NewMask(NumElts);
+ assert(NumElts < Mask.size() &&
+ "Identity with extract must have less elements than its inputs");
+
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int ExtractMaskElt = Shuf.getMaskValue(i);
+ int MaskElt = Mask[i];
+ NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
+ }
+ return new ShuffleVectorInst(X, Y, NewMask);
+}
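The mask merge above only has two cases per lane: keep the inner mask element, or keep the outer undef. A minimal model of that merge, mirroring the comment's example with concrete indices (illustrative names):

    #include <cassert>
    #include <vector>

    // The outer (extracting) shuffle is an identity-with-extract, so lane i of
    // the result is inner-mask element i, except that an undef (-1) in the
    // outer mask stays undef.
    std::vector<int> mergeExtractMask(const std::vector<int> &OuterMask,
                                      const std::vector<int> &InnerMask) {
      std::vector<int> NewMask(OuterMask.size());
      for (size_t i = 0; i != OuterMask.size(); ++i)
        NewMask[i] = OuterMask[i] == -1 ? -1 : InnerMask[i];
      return NewMask;
    }

    int main() {
      // shuf (shuf X, Y, <6, 5, 4, -1, 2>), undef, <0, -1, 2, 3>
      //   --> shuf X, Y, <6, -1, 4, -1>
      std::vector<int> NewMask =
          mergeExtractMask({0, -1, 2, 3}, {6, 5, 4, -1, 2});
      assert((NewMask == std::vector<int>{6, -1, 4, -1}));
      return 0;
    }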
+
+/// Try to replace a shuffle with an insertelement or try to replace a shuffle
+/// operand with the operand of an insertelement.
+static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
InstCombinerImpl &IC) {
- Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
- SmallVector<int, 16> Mask;
- Shuf.getShuffleMask(Mask);
-
- // The shuffle must not change vector sizes.
- // TODO: This restriction could be removed if the insert has only one use
- // (because the transform would require a new length-changing shuffle).
- int NumElts = Mask.size();
+ Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
+ SmallVector<int, 16> Mask;
+ Shuf.getShuffleMask(Mask);
+
+ // The shuffle must not change vector sizes.
+ // TODO: This restriction could be removed if the insert has only one use
+ // (because the transform would require a new length-changing shuffle).
+ int NumElts = Mask.size();
if (NumElts != (int)(cast<FixedVectorType>(V0->getType())->getNumElements()))
- return nullptr;
-
- // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
- // not be able to handle it there if the insertelement has >1 use.
- // If the shuffle has an insertelement operand but does not choose the
- // inserted scalar element from that value, then we can replace that shuffle
- // operand with the source vector of the insertelement.
- Value *X;
- uint64_t IdxC;
- if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
- // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
+ return nullptr;
+
+ // This is a specialization of a fold in SimplifyDemandedVectorElts. We may
+ // not be able to handle it there if the insertelement has >1 use.
+ // If the shuffle has an insertelement operand but does not choose the
+ // inserted scalar element from that value, then we can replace that shuffle
+ // operand with the source vector of the insertelement.
+ Value *X;
+ uint64_t IdxC;
+ if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ // shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
if (!is_contained(Mask, (int)IdxC))
- return IC.replaceOperand(Shuf, 0, X);
- }
- if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
- // Offset the index constant by the vector width because we are checking for
- // accesses to the 2nd vector input of the shuffle.
- IdxC += NumElts;
- // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
+ return IC.replaceOperand(Shuf, 0, X);
+ }
+ if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ // Offset the index constant by the vector width because we are checking for
+ // accesses to the 2nd vector input of the shuffle.
+ IdxC += NumElts;
+ // shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
if (!is_contained(Mask, (int)IdxC))
- return IC.replaceOperand(Shuf, 1, X);
- }
-
- // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
- auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
- // We need an insertelement with a constant index.
- if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
- m_ConstantInt(IndexC))))
- return false;
-
- // Test the shuffle mask to see if it splices the inserted scalar into the
- // operand 1 vector of the shuffle.
- int NewInsIndex = -1;
- for (int i = 0; i != NumElts; ++i) {
- // Ignore undef mask elements.
- if (Mask[i] == -1)
- continue;
-
- // The shuffle takes elements of operand 1 without lane changes.
- if (Mask[i] == NumElts + i)
- continue;
-
- // The shuffle must choose the inserted scalar exactly once.
- if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
- return false;
-
- // The shuffle is placing the inserted scalar into element i.
- NewInsIndex = i;
- }
-
- assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
-
- // Index is updated to the potentially translated insertion lane.
- IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
- return true;
- };
-
- // If the shuffle is unnecessary, insert the scalar operand directly into
- // operand 1 of the shuffle. Example:
- // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
- Value *Scalar;
- ConstantInt *IndexC;
- if (isShufflingScalarIntoOp1(Scalar, IndexC))
- return InsertElementInst::Create(V1, Scalar, IndexC);
-
- // Try again after commuting shuffle. Example:
- // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
- // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
- std::swap(V0, V1);
- ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
- if (isShufflingScalarIntoOp1(Scalar, IndexC))
- return InsertElementInst::Create(V1, Scalar, IndexC);
-
- return nullptr;
-}
-
-static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
- // Match the operands as identity with padding (also known as concatenation
- // with undef) shuffles of the same source type. The backend is expected to
- // recreate these concatenations from a shuffle of narrow operands.
- auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
- auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
- if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
- !Shuffle1 || !Shuffle1->isIdentityWithPadding())
- return nullptr;
-
- // We limit this transform to power-of-2 types because we expect that the
- // backend can convert the simplified IR patterns to identical nodes as the
- // original IR.
- // TODO: If we can verify the same behavior for arbitrary types, the
- // power-of-2 checks can be removed.
- Value *X = Shuffle0->getOperand(0);
- Value *Y = Shuffle1->getOperand(0);
- if (X->getType() != Y->getType() ||
+ return IC.replaceOperand(Shuf, 1, X);
+ }
+
+ // shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
+ auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
+ // We need an insertelement with a constant index.
+ if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(IndexC))))
+ return false;
+
+ // Test the shuffle mask to see if it splices the inserted scalar into the
+ // operand 1 vector of the shuffle.
+ int NewInsIndex = -1;
+ for (int i = 0; i != NumElts; ++i) {
+ // Ignore undef mask elements.
+ if (Mask[i] == -1)
+ continue;
+
+ // The shuffle takes elements of operand 1 without lane changes.
+ if (Mask[i] == NumElts + i)
+ continue;
+
+ // The shuffle must choose the inserted scalar exactly once.
+ if (NewInsIndex != -1 || Mask[i] != IndexC->getSExtValue())
+ return false;
+
+ // The shuffle is placing the inserted scalar into element i.
+ NewInsIndex = i;
+ }
+
+ assert(NewInsIndex != -1 && "Did not fold shuffle with unused operand?");
+
+ // Index is updated to the potentially translated insertion lane.
+ IndexC = ConstantInt::get(IndexC->getType(), NewInsIndex);
+ return true;
+ };
+
+ // If the shuffle is unnecessary, insert the scalar operand directly into
+ // operand 1 of the shuffle. Example:
+ // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
+ Value *Scalar;
+ ConstantInt *IndexC;
+ if (isShufflingScalarIntoOp1(Scalar, IndexC))
+ return InsertElementInst::Create(V1, Scalar, IndexC);
+
+ // Try again after commuting shuffle. Example:
+ // shuffle V0, (insert ?, S, 0), <0, 1, 2, 4> -->
+ // shuffle (insert ?, S, 0), V0, <4, 5, 6, 0> --> insert V0, S, 3
+ std::swap(V0, V1);
+ ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
+ if (isShufflingScalarIntoOp1(Scalar, IndexC))
+ return InsertElementInst::Create(V1, Scalar, IndexC);
+
+ return nullptr;
+}
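The isShufflingScalarIntoOp1 check reduces to a pure mask scan. A standalone model of that scan over plain integer masks (illustrative names), reproducing the example from the comment above:

    #include <cassert>
    #include <vector>

    // The mask must take operand-1 lanes unchanged (NumElts + i) or be undef
    // (-1), except for exactly one lane that reads the scalar inserted at
    // InsertIdx in operand 0. Returns that destination lane, or -1 if the
    // pattern does not match.
    int spliceLane(const std::vector<int> &Mask, int NumElts, int InsertIdx) {
      int NewInsIndex = -1;
      for (int i = 0; i != NumElts; ++i) {
        if (Mask[i] == -1 || Mask[i] == NumElts + i)
          continue;                  // undef, or identity from operand 1
        if (NewInsIndex != -1 || Mask[i] != InsertIdx)
          return -1;                 // a second special lane, or wrong index
        NewInsIndex = i;             // the inserted scalar lands here
      }
      return NewInsIndex;
    }

    int main() {
      // shuffle (insert ?, S, 1), V1, <1, 5, 6, 7> --> insert V1, S, 0
      assert(spliceLane({1, 5, 6, 7}, 4, 1) == 0);
      // Mask <1, 5, 1, 7> would need the scalar in two lanes: no single insert.
      assert(spliceLane({1, 5, 1, 7}, 4, 1) == -1);
      return 0;
    }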
+
+static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
+ // Match the operands as identity with padding (also known as concatenation
+ // with undef) shuffles of the same source type. The backend is expected to
+ // recreate these concatenations from a shuffle of narrow operands.
+ auto *Shuffle0 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(0));
+ auto *Shuffle1 = dyn_cast<ShuffleVectorInst>(Shuf.getOperand(1));
+ if (!Shuffle0 || !Shuffle0->isIdentityWithPadding() ||
+ !Shuffle1 || !Shuffle1->isIdentityWithPadding())
+ return nullptr;
+
+ // We limit this transform to power-of-2 types because we expect that the
+ // backend can convert the simplified IR patterns to identical nodes as the
+ // original IR.
+ // TODO: If we can verify the same behavior for arbitrary types, the
+ // power-of-2 checks can be removed.
+ Value *X = Shuffle0->getOperand(0);
+ Value *Y = Shuffle1->getOperand(0);
+ if (X->getType() != Y->getType() ||
!isPowerOf2_32(cast<FixedVectorType>(Shuf.getType())->getNumElements()) ||
!isPowerOf2_32(
cast<FixedVectorType>(Shuffle0->getType())->getNumElements()) ||
!isPowerOf2_32(cast<FixedVectorType>(X->getType())->getNumElements()) ||
- isa<UndefValue>(X) || isa<UndefValue>(Y))
- return nullptr;
- assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
- isa<UndefValue>(Shuffle1->getOperand(1)) &&
- "Unexpected operand for identity shuffle");
-
- // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
- // operands directly by adjusting the shuffle mask to account for the narrower
- // types:
- // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
+ isa<UndefValue>(X) || isa<UndefValue>(Y))
+ return nullptr;
+ assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
+ isa<UndefValue>(Shuffle1->getOperand(1)) &&
+ "Unexpected operand for identity shuffle");
+
+ // This is a shuffle of 2 widening shuffles. We can shuffle the narrow source
+ // operands directly by adjusting the shuffle mask to account for the narrower
+ // types:
+ // shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
int NarrowElts = cast<FixedVectorType>(X->getType())->getNumElements();
int WideElts = cast<FixedVectorType>(Shuffle0->getType())->getNumElements();
- assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
-
- ArrayRef<int> Mask = Shuf.getShuffleMask();
- SmallVector<int, 16> NewMask(Mask.size(), -1);
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == -1)
- continue;
-
- // If this shuffle is choosing an undef element from 1 of the sources, that
- // element is undef.
- if (Mask[i] < WideElts) {
- if (Shuffle0->getMaskValue(Mask[i]) == -1)
- continue;
- } else {
- if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
- continue;
- }
-
- // If this shuffle is choosing from the 1st narrow op, the mask element is
- // the same. If this shuffle is choosing from the 2nd narrow op, the mask
- // element is offset down to adjust for the narrow vector widths.
- if (Mask[i] < WideElts) {
- assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
- NewMask[i] = Mask[i];
- } else {
- assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
- NewMask[i] = Mask[i] - (WideElts - NarrowElts);
- }
- }
- return new ShuffleVectorInst(X, Y, NewMask);
-}
-
+ assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
+
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ SmallVector<int, 16> NewMask(Mask.size(), -1);
+ for (int i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == -1)
+ continue;
+
+ // If this shuffle is choosing an undef element from 1 of the sources, that
+ // element is undef.
+ if (Mask[i] < WideElts) {
+ if (Shuffle0->getMaskValue(Mask[i]) == -1)
+ continue;
+ } else {
+ if (Shuffle1->getMaskValue(Mask[i] - WideElts) == -1)
+ continue;
+ }
+
+ // If this shuffle is choosing from the 1st narrow op, the mask element is
+ // the same. If this shuffle is choosing from the 2nd narrow op, the mask
+ // element is offset down to adjust for the narrow vector widths.
+ if (Mask[i] < WideElts) {
+ assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
+ NewMask[i] = Mask[i];
+ } else {
+ assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
+ NewMask[i] = Mask[i] - (WideElts - NarrowElts);
+ }
+ }
+ return new ShuffleVectorInst(X, Y, NewMask);
+}
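The index remapping above can be modeled without any IR: wide indices either stay as-is, shift down by the padding amount, or turn into undef when they read padding. A standalone sketch with illustrative names:

    #include <cassert>
    #include <vector>

    // The two operands are narrow vectors of NarrowElts elements widened
    // (padded with undef) to WideElts. Wide indices are remapped onto the
    // concatenation of the two narrow operands; lanes that read padding (or
    // are already undef) become undef (-1).
    std::vector<int> remapToNarrow(const std::vector<int> &Mask,
                                   int NarrowElts, int WideElts) {
      std::vector<int> NewMask(Mask.size(), -1);
      for (size_t i = 0; i != Mask.size(); ++i) {
        int M = Mask[i];
        if (M == -1)
          continue;
        if (M < WideElts)                               // from widened X
          NewMask[i] = M < NarrowElts ? M : -1;
        else                                            // from widened Y
          NewMask[i] = (M - WideElts) < NarrowElts
                           ? M - (WideElts - NarrowElts)
                           : -1;
      }
      return NewMask;
    }

    int main() {
      // X, Y are 2-wide, widened to 4-wide; wide mask <0, 4, 1, 6> reads
      // X[0], Y[0], X[1], and a padded (undef) lane of Y.
      std::vector<int> NewMask = remapToNarrow({0, 4, 1, 6}, 2, 4);
      assert((NewMask == std::vector<int>{0, 2, 1, -1}));
      return 0;
    }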
+
Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
- Value *LHS = SVI.getOperand(0);
- Value *RHS = SVI.getOperand(1);
- SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
- if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
- SVI.getType(), ShufQuery))
- return replaceInstUsesWith(SVI, V);
-
+ Value *LHS = SVI.getOperand(0);
+ Value *RHS = SVI.getOperand(1);
+ SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
+ if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
+ SVI.getType(), ShufQuery))
+ return replaceInstUsesWith(SVI, V);
+
// Bail out for scalable vectors
if (isa<ScalableVectorType>(LHS->getType()))
return nullptr;
- // shuffle x, x, mask --> shuffle x, undef, mask'
+ // shuffle x, x, mask --> shuffle x, undef, mask'
unsigned VWidth = cast<FixedVectorType>(SVI.getType())->getNumElements();
unsigned LHSWidth = cast<FixedVectorType>(LHS->getType())->getNumElements();
- ArrayRef<int> Mask = SVI.getShuffleMask();
- Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
-
- // Peek through a bitcasted shuffle operand by scaling the mask. If the
- // simulated shuffle can simplify, then this shuffle is unnecessary:
- // shuf (bitcast X), undef, Mask --> bitcast X'
- // TODO: This could be extended to allow length-changing shuffles.
- // The transform might also be obsoleted if we allowed canonicalization
- // of bitcasted shuffles.
- Value *X;
- if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
- X->getType()->isVectorTy() && VWidth == LHSWidth) {
- // Try to create a scaled mask constant.
+ ArrayRef<int> Mask = SVI.getShuffleMask();
+ Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+
+ // Peek through a bitcasted shuffle operand by scaling the mask. If the
+ // simulated shuffle can simplify, then this shuffle is unnecessary:
+ // shuf (bitcast X), undef, Mask --> bitcast X'
+ // TODO: This could be extended to allow length-changing shuffles.
+ // The transform might also be obsoleted if we allowed canonicalization
+ // of bitcasted shuffles.
+ Value *X;
+ if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
+ X->getType()->isVectorTy() && VWidth == LHSWidth) {
+ // Try to create a scaled mask constant.
auto *XType = cast<FixedVectorType>(X->getType());
- unsigned XNumElts = XType->getNumElements();
- SmallVector<int, 16> ScaledMask;
- if (XNumElts >= VWidth) {
- assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
- narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
- } else {
- assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
- if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
- ScaledMask.clear();
- }
- if (!ScaledMask.empty()) {
- // If the shuffled source vector simplifies, cast that value to this
- // shuffle's type.
- if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
- ScaledMask, XType, ShufQuery))
- return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
- }
- }
-
- if (LHS == RHS) {
- assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
- // Remap any references to RHS to use LHS.
- SmallVector<int, 16> Elts;
- for (unsigned i = 0; i != VWidth; ++i) {
- // Propagate undef elements or force mask to LHS.
- if (Mask[i] < 0)
- Elts.push_back(UndefMaskElem);
- else
- Elts.push_back(Mask[i] % LHSWidth);
- }
- return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
- }
-
- // shuffle undef, x, mask --> shuffle x, undef, mask'
- if (isa<UndefValue>(LHS)) {
- SVI.commute();
- return &SVI;
- }
-
- if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
- return I;
-
- if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
- return I;
-
- if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
- return I;
-
- if (Instruction *I = narrowVectorSelect(SVI, Builder))
- return I;
-
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
- if (V != &SVI)
- return replaceInstUsesWith(SVI, V);
- return &SVI;
- }
-
- if (Instruction *I = foldIdentityExtractShuffle(SVI))
- return I;
-
- // These transforms have the potential to lose undef knowledge, so they are
- // intentionally placed after SimplifyDemandedVectorElts().
- if (Instruction *I = foldShuffleWithInsert(SVI, *this))
- return I;
- if (Instruction *I = foldIdentityPaddedShuffles(SVI))
- return I;
-
- if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) {
- Value *V = evaluateInDifferentElementOrder(LHS, Mask);
- return replaceInstUsesWith(SVI, V);
- }
-
- // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
- // a non-vector type. We can instead bitcast the original vector followed by
- // an extract of the desired element:
- //
- // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
- // <4 x i32> <i32 0, i32 1, i32 2, i32 3>
- // %1 = bitcast <4 x i8> %sroa to i32
- // Becomes:
- // %bc = bitcast <16 x i8> %in to <4 x i32>
- // %ext = extractelement <4 x i32> %bc, i32 0
- //
- // If the shuffle is extracting a contiguous range of values from the input
- // vector then each use which is a bitcast of the extracted size can be
- // replaced. This will work if the vector types are compatible, and the begin
- // index is aligned to a value in the casted vector type. If the begin index
- // isn't aligned then we can shuffle the original vector (keeping the same
- // vector type) before extracting.
- //
- // This code will bail out if the target type is fundamentally incompatible
- // with vectors of the source type.
- //
- // Example of <16 x i8>, target type i32:
- // Index range [4,8): v-----------v Will work.
- // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
- // <16 x i8>: | | | | | | | | | | | | | | | | |
- // <4 x i32>: | | | | |
- // +-----------+-----------+-----------+-----------+
- // Index range [6,10): ^-----------^ Needs an extra shuffle.
- // Target type i40: ^--------------^ Won't work, bail.
- bool MadeChange = false;
- if (isShuffleExtractingFromLHS(SVI, Mask)) {
- Value *V = LHS;
- unsigned MaskElems = Mask.size();
+ unsigned XNumElts = XType->getNumElements();
+ SmallVector<int, 16> ScaledMask;
+ if (XNumElts >= VWidth) {
+ assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
+ narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
+ } else {
+ assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
+ if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
+ ScaledMask.clear();
+ }
+ if (!ScaledMask.empty()) {
+ // If the shuffled source vector simplifies, cast that value to this
+ // shuffle's type.
+ if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
+ ScaledMask, XType, ShufQuery))
+ return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
+ }
+ }
+
+ if (LHS == RHS) {
+ assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
+ // Remap any references to RHS to use LHS.
+ SmallVector<int, 16> Elts;
+ for (unsigned i = 0; i != VWidth; ++i) {
+ // Propagate undef elements or force mask to LHS.
+ if (Mask[i] < 0)
+ Elts.push_back(UndefMaskElem);
+ else
+ Elts.push_back(Mask[i] % LHSWidth);
+ }
+ return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
+ }
+
+ // shuffle undef, x, mask --> shuffle x, undef, mask'
+ if (isa<UndefValue>(LHS)) {
+ SVI.commute();
+ return &SVI;
+ }
+
+ if (Instruction *I = canonicalizeInsertSplat(SVI, Builder))
+ return I;
+
+ if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
+ return I;
+
+ if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
+ return I;
+
+ if (Instruction *I = narrowVectorSelect(SVI, Builder))
+ return I;
+
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&SVI, AllOnesEltMask, UndefElts)) {
+ if (V != &SVI)
+ return replaceInstUsesWith(SVI, V);
+ return &SVI;
+ }
+
+ if (Instruction *I = foldIdentityExtractShuffle(SVI))
+ return I;
+
+ // These transforms have the potential to lose undef knowledge, so they are
+ // intentionally placed after SimplifyDemandedVectorElts().
+ if (Instruction *I = foldShuffleWithInsert(SVI, *this))
+ return I;
+ if (Instruction *I = foldIdentityPaddedShuffles(SVI))
+ return I;
+
+ if (isa<UndefValue>(RHS) && canEvaluateShuffled(LHS, Mask)) {
+ Value *V = evaluateInDifferentElementOrder(LHS, Mask);
+ return replaceInstUsesWith(SVI, V);
+ }
+
+ // SROA generates shuffle+bitcast when the extracted sub-vector is bitcast to
+ // a non-vector type. We can instead bitcast the original vector followed by
+ // an extract of the desired element:
+ //
+ // %sroa = shufflevector <16 x i8> %in, <16 x i8> undef,
+ // <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ // %1 = bitcast <4 x i8> %sroa to i32
+ // Becomes:
+ // %bc = bitcast <16 x i8> %in to <4 x i32>
+ // %ext = extractelement <4 x i32> %bc, i32 0
+ //
+ // If the shuffle is extracting a contiguous range of values from the input
+ // vector then each use which is a bitcast of the extracted size can be
+ // replaced. This will work if the vector types are compatible, and the begin
+ // index is aligned to a value in the casted vector type. If the begin index
+ // isn't aligned then we can shuffle the original vector (keeping the same
+ // vector type) before extracting.
+ //
+ // This code will bail out if the target type is fundamentally incompatible
+ // with vectors of the source type.
+ //
+ // Example of <16 x i8>, target type i32:
+ // Index range [4,8): v-----------v Will work.
+ // +--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+--+
+ // <16 x i8>: | | | | | | | | | | | | | | | | |
+ // <4 x i32>: | | | | |
+ // +-----------+-----------+-----------+-----------+
+ // Index range [6,10): ^-----------^ Needs an extra shuffle.
+ // Target type i40: ^--------------^ Won't work, bail.
+ bool MadeChange = false;
+ if (isShuffleExtractingFromLHS(SVI, Mask)) {
+ Value *V = LHS;
+ unsigned MaskElems = Mask.size();
auto *SrcTy = cast<FixedVectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
- unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
- assert(SrcElemBitWidth && "vector elements must have a bitwidth");
- unsigned SrcNumElems = SrcTy->getNumElements();
- SmallVector<BitCastInst *, 8> BCs;
- DenseMap<Type *, Value *> NewBCs;
- for (User *U : SVI.users())
- if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
- if (!BC->use_empty())
- // Only visit bitcasts that weren't previously handled.
- BCs.push_back(BC);
- for (BitCastInst *BC : BCs) {
- unsigned BegIdx = Mask.front();
- Type *TgtTy = BC->getDestTy();
- unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
- if (!TgtElemBitWidth)
- continue;
- unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
- bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
- bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
- if (!VecBitWidthsEqual)
- continue;
- if (!VectorType::isValidElementType(TgtTy))
- continue;
- auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
- if (!BegIsAligned) {
- // Shuffle the input so [0,NumElements) contains the output, and
- // [NumElems,SrcNumElems) is undef.
- SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
- for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
- ShuffleMask[I] = Idx;
+ unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
+ unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
+ assert(SrcElemBitWidth && "vector elements must have a bitwidth");
+ unsigned SrcNumElems = SrcTy->getNumElements();
+ SmallVector<BitCastInst *, 8> BCs;
+ DenseMap<Type *, Value *> NewBCs;
+ for (User *U : SVI.users())
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
+ if (!BC->use_empty())
+ // Only visit bitcasts that weren't previously handled.
+ BCs.push_back(BC);
+ for (BitCastInst *BC : BCs) {
+ unsigned BegIdx = Mask.front();
+ Type *TgtTy = BC->getDestTy();
+ unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
+ if (!TgtElemBitWidth)
+ continue;
+ unsigned TgtNumElems = VecBitWidth / TgtElemBitWidth;
+ bool VecBitWidthsEqual = VecBitWidth == TgtNumElems * TgtElemBitWidth;
+ bool BegIsAligned = 0 == ((SrcElemBitWidth * BegIdx) % TgtElemBitWidth);
+ if (!VecBitWidthsEqual)
+ continue;
+ if (!VectorType::isValidElementType(TgtTy))
+ continue;
+ auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
+ if (!BegIsAligned) {
+ // Shuffle the input so [0,NumElements) contains the output, and
+ // [NumElems,SrcNumElems) is undef.
+ SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
+ for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
+ ShuffleMask[I] = Idx;
V = Builder.CreateShuffleVector(V, ShuffleMask,
- SVI.getName() + ".extract");
- BegIdx = 0;
- }
- unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
- assert(SrcElemsPerTgtElem);
- BegIdx /= SrcElemsPerTgtElem;
- bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
- auto *NewBC =
- BCAlreadyExists
- ? NewBCs[CastSrcTy]
- : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
- if (!BCAlreadyExists)
- NewBCs[CastSrcTy] = NewBC;
- auto *Ext = Builder.CreateExtractElement(
- NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
- // The shufflevector isn't being replaced: the bitcast that used it
- // is. InstCombine will visit the newly-created instructions.
- replaceInstUsesWith(*BC, Ext);
- MadeChange = true;
- }
- }
-
- // If the LHS is a shufflevector itself, see if we can combine it with this
- // one without producing an unusual shuffle.
- // Cases that might be simplified:
- // 1.
- // x1=shuffle(v1,v2,mask1)
- // x=shuffle(x1,undef,mask)
- // ==>
- // x=shuffle(v1,undef,newMask)
- // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
- // 2.
- // x1=shuffle(v1,undef,mask1)
- // x=shuffle(x1,x2,mask)
- // where v1.size() == mask1.size()
- // ==>
- // x=shuffle(v1,x2,newMask)
- // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
- // 3.
- // x2=shuffle(v2,undef,mask2)
- // x=shuffle(x1,x2,mask)
- // where v2.size() == mask2.size()
- // ==>
- // x=shuffle(x1,v2,newMask)
- // newMask[i] = (mask[i] < x1.size())
- // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
- // 4.
- // x1=shuffle(v1,undef,mask1)
- // x2=shuffle(v2,undef,mask2)
- // x=shuffle(x1,x2,mask)
- // where v1.size() == v2.size()
- // ==>
- // x=shuffle(v1,v2,newMask)
- // newMask[i] = (mask[i] < x1.size())
- // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
- //
- // Here we are really conservative:
- // we are absolutely afraid of producing a shuffle mask not in the input
- // program, because the code gen may not be smart enough to turn a merged
- // shuffle into two specific shuffles: it may produce worse code. As such,
- // we only merge two shuffles if the result is either a splat or one of the
- // input shuffle masks. In this case, merging the shuffles just removes
- // one instruction, which we know is safe. This is good for things like
- // turning: (splat(splat)) -> splat, or
- // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
- ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
- ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
- if (LHSShuffle)
- if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
- LHSShuffle = nullptr;
- if (RHSShuffle)
- if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
- RHSShuffle = nullptr;
- if (!LHSShuffle && !RHSShuffle)
- return MadeChange ? &SVI : nullptr;
-
- Value* LHSOp0 = nullptr;
- Value* LHSOp1 = nullptr;
- Value* RHSOp0 = nullptr;
- unsigned LHSOp0Width = 0;
- unsigned RHSOp0Width = 0;
- if (LHSShuffle) {
- LHSOp0 = LHSShuffle->getOperand(0);
- LHSOp1 = LHSShuffle->getOperand(1);
+ SVI.getName() + ".extract");
+ BegIdx = 0;
+ }
+ unsigned SrcElemsPerTgtElem = TgtElemBitWidth / SrcElemBitWidth;
+ assert(SrcElemsPerTgtElem);
+ BegIdx /= SrcElemsPerTgtElem;
+ bool BCAlreadyExists = NewBCs.find(CastSrcTy) != NewBCs.end();
+ auto *NewBC =
+ BCAlreadyExists
+ ? NewBCs[CastSrcTy]
+ : Builder.CreateBitCast(V, CastSrcTy, SVI.getName() + ".bc");
+ if (!BCAlreadyExists)
+ NewBCs[CastSrcTy] = NewBC;
+ auto *Ext = Builder.CreateExtractElement(
+ NewBC, ConstantInt::get(Int32Ty, BegIdx), SVI.getName() + ".extract");
+ // The shufflevector isn't being replaced: the bitcast that used it
+ // is. InstCombine will visit the newly-created instructions.
+ replaceInstUsesWith(*BC, Ext);
+ MadeChange = true;
+ }
+ }
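// The index arithmetic behind the shuffle+bitcast -> bitcast+extractelement
// rewrite above, reduced to plain integers and matching the <16 x i8> / i32
// picture in the comment. ExtractPlan and planExtract are illustrative names
// only, not part of the LLVM code.
#include <cassert>

struct ExtractPlan {
  bool Feasible;       // target type tiles the source vector exactly
  bool NeedsShuffle;   // begin index is not aligned to a target element
  unsigned ExtractIdx; // element index into the bitcast vector
};

ExtractPlan planExtract(unsigned VecBits, unsigned SrcElemBits,
                        unsigned TgtElemBits, unsigned BegIdx) {
  ExtractPlan P{false, false, 0};
  unsigned TgtNumElems = VecBits / TgtElemBits;
  if (TgtNumElems * TgtElemBits != VecBits) // e.g. i40 over 128 bits: bail
    return P;
  P.Feasible = true;
  P.NeedsShuffle = (SrcElemBits * BegIdx) % TgtElemBits != 0;
  if (P.NeedsShuffle)
    BegIdx = 0; // after re-shuffling the payload down to element 0
  P.ExtractIdx = BegIdx / (TgtElemBits / SrcElemBits);
  return P;
}

int main() {
  ExtractPlan A = planExtract(128, 8, 32, 4); // bytes [4,8) as i32
  assert(A.Feasible && !A.NeedsShuffle && A.ExtractIdx == 1);
  ExtractPlan B = planExtract(128, 8, 32, 6); // bytes [6,10): extra shuffle
  assert(B.Feasible && B.NeedsShuffle && B.ExtractIdx == 0);
  ExtractPlan C = planExtract(128, 8, 40, 4); // i40 target: incompatible
  assert(!C.Feasible);
}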
+
+ // If the LHS is a shufflevector itself, see if we can combine it with this
+ // one without producing an unusual shuffle.
+ // Cases that might be simplified:
+ // 1.
+ // x1=shuffle(v1,v2,mask1)
+ // x=shuffle(x1,undef,mask)
+ // ==>
+ // x=shuffle(v1,undef,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : -1
+ // 2.
+ // x1=shuffle(v1,undef,mask1)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == mask1.size()
+ // ==>
+ // x=shuffle(v1,x2,newMask)
+ // newMask[i] = (mask[i] < x1.size()) ? mask1[mask[i]] : mask[i]
+ // 3.
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v2.size() == mask2.size()
+ // ==>
+ // x=shuffle(x1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask[i] : mask2[mask[i]-x1.size()]+x1.size()
+ // 4.
+ // x1=shuffle(v1,undef,mask1)
+ // x2=shuffle(v2,undef,mask2)
+ // x=shuffle(x1,x2,mask)
+ // where v1.size() == v2.size()
+ // ==>
+ // x=shuffle(v1,v2,newMask)
+ // newMask[i] = (mask[i] < x1.size())
+ // ? mask1[mask[i]] : mask2[mask[i]-x1.size()]+v1.size()
+ //
+ // Here we are really conservative:
+ // we are absolutely afraid of producing a shuffle mask not in the input
+ // program, because the code gen may not be smart enough to turn a merged
+ // shuffle into two specific shuffles: it may produce worse code. As such,
+ // we only merge two shuffles if the result is either a splat or one of the
+ // input shuffle masks. In this case, merging the shuffles just removes
+ // one instruction, which we know is safe. This is good for things like
+ // turning: (splat(splat)) -> splat, or
+ // merge(V[0..n], V[n+1..2n]) -> V[0..2n]
+ ShuffleVectorInst* LHSShuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ ShuffleVectorInst* RHSShuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ if (LHSShuffle)
+ if (!isa<UndefValue>(LHSShuffle->getOperand(1)) && !isa<UndefValue>(RHS))
+ LHSShuffle = nullptr;
+ if (RHSShuffle)
+ if (!isa<UndefValue>(RHSShuffle->getOperand(1)))
+ RHSShuffle = nullptr;
+ if (!LHSShuffle && !RHSShuffle)
+ return MadeChange ? &SVI : nullptr;
+
+ Value* LHSOp0 = nullptr;
+ Value* LHSOp1 = nullptr;
+ Value* RHSOp0 = nullptr;
+ unsigned LHSOp0Width = 0;
+ unsigned RHSOp0Width = 0;
+ if (LHSShuffle) {
+ LHSOp0 = LHSShuffle->getOperand(0);
+ LHSOp1 = LHSShuffle->getOperand(1);
LHSOp0Width = cast<FixedVectorType>(LHSOp0->getType())->getNumElements();
- }
- if (RHSShuffle) {
- RHSOp0 = RHSShuffle->getOperand(0);
+ }
+ if (RHSShuffle) {
+ RHSOp0 = RHSShuffle->getOperand(0);
RHSOp0Width = cast<FixedVectorType>(RHSOp0->getType())->getNumElements();
- }
- Value* newLHS = LHS;
- Value* newRHS = RHS;
- if (LHSShuffle) {
- // case 1
- if (isa<UndefValue>(RHS)) {
- newLHS = LHSOp0;
- newRHS = LHSOp1;
- }
- // case 2 or 4
- else if (LHSOp0Width == LHSWidth) {
- newLHS = LHSOp0;
- }
- }
- // case 3 or 4
- if (RHSShuffle && RHSOp0Width == LHSWidth) {
- newRHS = RHSOp0;
- }
- // case 4
- if (LHSOp0 == RHSOp0) {
- newLHS = LHSOp0;
- newRHS = nullptr;
- }
-
- if (newLHS == LHS && newRHS == RHS)
- return MadeChange ? &SVI : nullptr;
-
- ArrayRef<int> LHSMask;
- ArrayRef<int> RHSMask;
- if (newLHS != LHS)
- LHSMask = LHSShuffle->getShuffleMask();
- if (RHSShuffle && newRHS != RHS)
- RHSMask = RHSShuffle->getShuffleMask();
-
- unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
- SmallVector<int, 16> newMask;
- bool isSplat = true;
- int SplatElt = -1;
- // Create a new mask for the new ShuffleVectorInst so that the new
- // ShuffleVectorInst is equivalent to the original one.
- for (unsigned i = 0; i < VWidth; ++i) {
- int eltMask;
- if (Mask[i] < 0) {
- // This element is an undef value.
- eltMask = -1;
- } else if (Mask[i] < (int)LHSWidth) {
- // This element is from left hand side vector operand.
- //
- // If LHS is going to be replaced (case 1, 2, or 4), calculate the
- // new mask value for the element.
- if (newLHS != LHS) {
- eltMask = LHSMask[Mask[i]];
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value.
- if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
- eltMask = -1;
- } else
- eltMask = Mask[i];
- } else {
- // This element is from right hand side vector operand
- //
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value. (case 1)
- if (isa<UndefValue>(RHS))
- eltMask = -1;
- // If RHS is going to be replaced (case 3 or 4), calculate the
- // new mask value for the element.
- else if (newRHS != RHS) {
- eltMask = RHSMask[Mask[i]-LHSWidth];
- // If the value selected is an undef value, explicitly specify it
- // with a -1 mask value.
- if (eltMask >= (int)RHSOp0Width) {
- assert(isa<UndefValue>(RHSShuffle->getOperand(1))
-                 && "should have been checked above");
- eltMask = -1;
- }
- } else
- eltMask = Mask[i]-LHSWidth;
-
- // If LHS's width is changed, shift the mask value accordingly.
- // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
- // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
- // If newRHS == newLHS, we want to remap any references from newRHS to
- // newLHS so that we can properly identify splats that may occur due to
- // obfuscation across the two vectors.
- if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
- eltMask += newLHSWidth;
- }
-
- // Check if this could still be a splat.
- if (eltMask >= 0) {
- if (SplatElt >= 0 && SplatElt != eltMask)
- isSplat = false;
- SplatElt = eltMask;
- }
-
- newMask.push_back(eltMask);
- }
-
- // If the result mask is equal to one of the original shuffle masks,
- // or is a splat, do the replacement.
- if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
- if (!newRHS)
- newRHS = UndefValue::get(newLHS->getType());
+ }
+ Value* newLHS = LHS;
+ Value* newRHS = RHS;
+ if (LHSShuffle) {
+ // case 1
+ if (isa<UndefValue>(RHS)) {
+ newLHS = LHSOp0;
+ newRHS = LHSOp1;
+ }
+ // case 2 or 4
+ else if (LHSOp0Width == LHSWidth) {
+ newLHS = LHSOp0;
+ }
+ }
+ // case 3 or 4
+ if (RHSShuffle && RHSOp0Width == LHSWidth) {
+ newRHS = RHSOp0;
+ }
+ // case 4
+ if (LHSOp0 == RHSOp0) {
+ newLHS = LHSOp0;
+ newRHS = nullptr;
+ }
+
+ if (newLHS == LHS && newRHS == RHS)
+ return MadeChange ? &SVI : nullptr;
+
+ ArrayRef<int> LHSMask;
+ ArrayRef<int> RHSMask;
+ if (newLHS != LHS)
+ LHSMask = LHSShuffle->getShuffleMask();
+ if (RHSShuffle && newRHS != RHS)
+ RHSMask = RHSShuffle->getShuffleMask();
+
+ unsigned newLHSWidth = (newLHS != LHS) ? LHSOp0Width : LHSWidth;
+ SmallVector<int, 16> newMask;
+ bool isSplat = true;
+ int SplatElt = -1;
+ // Create a new mask for the new ShuffleVectorInst so that the new
+ // ShuffleVectorInst is equivalent to the original one.
+ for (unsigned i = 0; i < VWidth; ++i) {
+ int eltMask;
+ if (Mask[i] < 0) {
+ // This element is an undef value.
+ eltMask = -1;
+ } else if (Mask[i] < (int)LHSWidth) {
+ // This element is from left hand side vector operand.
+ //
+ // If LHS is going to be replaced (case 1, 2, or 4), calculate the
+ // new mask value for the element.
+ if (newLHS != LHS) {
+ eltMask = LHSMask[Mask[i]];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)LHSOp0Width && isa<UndefValue>(LHSOp1))
+ eltMask = -1;
+ } else
+ eltMask = Mask[i];
+ } else {
+ // This element is from right hand side vector operand
+ //
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value. (case 1)
+ if (isa<UndefValue>(RHS))
+ eltMask = -1;
+ // If RHS is going to be replaced (case 3 or 4), calculate the
+ // new mask value for the element.
+ else if (newRHS != RHS) {
+ eltMask = RHSMask[Mask[i]-LHSWidth];
+ // If the value selected is an undef value, explicitly specify it
+ // with a -1 mask value.
+ if (eltMask >= (int)RHSOp0Width) {
+ assert(isa<UndefValue>(RHSShuffle->getOperand(1))
+                 && "should have been checked above");
+ eltMask = -1;
+ }
+ } else
+ eltMask = Mask[i]-LHSWidth;
+
+ // If LHS's width is changed, shift the mask value accordingly.
+ // If newRHS == nullptr, i.e. LHSOp0 == RHSOp0, we want to remap any
+ // references from RHSOp0 to LHSOp0, so we don't need to shift the mask.
+ // If newRHS == newLHS, we want to remap any references from newRHS to
+ // newLHS so that we can properly identify splats that may occur due to
+ // obfuscation across the two vectors.
+ if (eltMask >= 0 && newRHS != nullptr && newLHS != newRHS)
+ eltMask += newLHSWidth;
+ }
+
+ // Check if this could still be a splat.
+ if (eltMask >= 0) {
+ if (SplatElt >= 0 && SplatElt != eltMask)
+ isSplat = false;
+ SplatElt = eltMask;
+ }
+
+ newMask.push_back(eltMask);
+ }
+
+ // If the result mask is equal to one of the original shuffle masks,
+ // or is a splat, do the replacement.
+ if (isSplat || newMask == LHSMask || newMask == RHSMask || newMask == Mask) {
+ if (!newRHS)
+ newRHS = UndefValue::get(newLHS->getType());
return new ShuffleVectorInst(newLHS, newRHS, newMask);
- }
-
- return MadeChange ? &SVI : nullptr;
-}
+ }
+
+ return MadeChange ? &SVI : nullptr;
+}
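// Illustrative mask folding for the first case handled above: with
// x1 = shuffle(v1, v2, Mask1) and x = shuffle(x1, undef, Mask), the two
// shuffles can merge into one shuffle of v1/v2 with a remapped mask. The
// guard at the end mirrors the conservatism note: only fold when the merged
// mask is a splat or equals one of the existing masks. Sketch only; the
// function below is not the LLVM implementation.
#include <vector>

static bool isSplatMask(const std::vector<int> &M) {
  int Elt = -1;
  for (int V : M) {
    if (V < 0)
      continue;
    if (Elt >= 0 && Elt != V)
      return false;
    Elt = V;
  }
  return true;
}

// Returns true (and fills NewMask) when the merge is considered profitable.
bool mergeShuffleOfShuffle(const std::vector<int> &Mask1,
                           const std::vector<int> &Mask,
                           std::vector<int> &NewMask) {
  for (int M : Mask) {
    // Elements picking undef (negative) or the undef RHS map to -1.
    if (M < 0 || M >= (int)Mask1.size())
      NewMask.push_back(-1);
    else
      NewMask.push_back(Mask1[M]);
  }
  return isSplatMask(NewMask) || NewMask == Mask1 || NewMask == Mask;
}

int main() {
  // splat(splat): shuffle(shuffle(v, undef, {1,1,1,1}), undef, {2,0,3,1})
  // merges to a single splat of element 1, so the fold is allowed.
  std::vector<int> New;
  return mergeShuffleOfShuffle({1, 1, 1, 1}, {2, 0, 3, 1}, New) ? 0 : 1;
}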
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
index 98006215ef..828fd49524 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1,169 +1,169 @@
-//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// InstructionCombining - Combine instructions to form fewer, simple
-// instructions. This pass does not modify the CFG. This pass is where
-// algebraic simplification happens.
-//
-// This pass combines things like:
-// %Y = add i32 %X, 1
-// %Z = add i32 %Y, 1
-// into:
-// %Z = add i32 %X, 2
-//
-// This is a simple worklist driven algorithm.
-//
-// This pass guarantees that the following canonicalizations are performed on
-// the program:
-// 1. If a binary operator has a constant operand, it is moved to the RHS
-// 2. Bitwise operators with constant operands are always grouped so that
-// shifts are performed first, then or's, then and's, then xor's.
-// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
-// 4. All cmp instructions on boolean values are replaced with logical ops
-// 5. add X, X is represented as (X*2) => (X << 1)
-// 6. Multiplies with a power-of-two constant argument are transformed into
-// shifts.
-// ... etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "InstCombineInternal.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/InstCombine.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions. This pass does not modify the CFG. This pass is where
+// algebraic simplification happens.
+//
+// This pass combines things like:
+// %Y = add i32 %X, 1
+// %Z = add i32 %Y, 1
+// into:
+// %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+// 1. If a binary operator has a constant operand, it is moved to the RHS
+// 2. Bitwise operators with constant operands are always grouped so that
+// shifts are performed first, then or's, then and's, then xor's.
+// 3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+// 4. All cmp instructions on boolean values are replaced with logical ops
+// 5. add X, X is represented as (X*2) => (X << 1)
+// 6. Multiplies with a power-of-two constant argument are transformed into
+// shifts.
+// ... etc.
+//
+//===----------------------------------------------------------------------===//
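// Quick numeric check of canonicalizations #5 and #6 from the list above:
// "add X, X" is the same value as "X << 1", and a multiply by a power of two
// is the same as a shift. Unsigned arithmetic keeps wrap-around well defined.
#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
    assert(X + X == (X << 1));
    assert(X * 8u == (X << 3));
  }
}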
+
+#include "InstCombineInternal.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/InstCombine.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CBindingWrapping.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "instcombine"
-
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "instcombine"
+
STATISTIC(NumWorklistIterations,
"Number of instruction combining iterations performed");
-STATISTIC(NumCombined , "Number of insts combined");
-STATISTIC(NumConstProp, "Number of constant folds");
-STATISTIC(NumDeadInst , "Number of dead inst eliminated");
-STATISTIC(NumSunkInst , "Number of instructions sunk");
-STATISTIC(NumExpand, "Number of expansions");
-STATISTIC(NumFactor , "Number of factorizations");
-STATISTIC(NumReassoc , "Number of reassociations");
-DEBUG_COUNTER(VisitCounter, "instcombine-visit",
- "Controls which instructions are visited");
-
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+STATISTIC(NumExpand, "Number of expansions");
+STATISTIC(NumFactor , "Number of factorizations");
+STATISTIC(NumReassoc , "Number of reassociations");
+DEBUG_COUNTER(VisitCounter, "instcombine-visit",
+ "Controls which instructions are visited");
+
// FIXME: these limits eventually should be as low as 2.
-static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
+static constexpr unsigned InstCombineDefaultMaxIterations = 1000;
#ifndef NDEBUG
static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 100;
#else
-static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
+static constexpr unsigned InstCombineDefaultInfiniteLoopThreshold = 1000;
#endif
-
-static cl::opt<bool>
-EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
- cl::init(true));
-
-static cl::opt<unsigned> LimitMaxIterations(
- "instcombine-max-iterations",
- cl::desc("Limit the maximum number of instruction combining iterations"),
- cl::init(InstCombineDefaultMaxIterations));
-
-static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
- "instcombine-infinite-loop-threshold",
- cl::desc("Number of instruction combining iterations considered an "
- "infinite loop"),
- cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
-
-static cl::opt<unsigned>
-MaxArraySize("instcombine-maxarray-size", cl::init(1024),
- cl::desc("Maximum array size considered when doing a combine"));
-
-// FIXME: Remove this flag when it is no longer necessary to convert
-// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
-// increases variable availability at the cost of accuracy. Variables that
-// cannot be promoted by mem2reg or SROA will be described as living in memory
-// for their entire lifetime. However, passes like DSE and instcombine can
-// delete stores to the alloca, leading to misleading and inaccurate debug
-// information. This flag can be removed when those passes are fixed.
-static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
- cl::Hidden, cl::init(true));
-
+
+static cl::opt<bool>
+EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
+ cl::init(true));
+
+static cl::opt<unsigned> LimitMaxIterations(
+ "instcombine-max-iterations",
+ cl::desc("Limit the maximum number of instruction combining iterations"),
+ cl::init(InstCombineDefaultMaxIterations));
+
+static cl::opt<unsigned> InfiniteLoopDetectionThreshold(
+ "instcombine-infinite-loop-threshold",
+ cl::desc("Number of instruction combining iterations considered an "
+ "infinite loop"),
+ cl::init(InstCombineDefaultInfiniteLoopThreshold), cl::Hidden);
+
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+ cl::desc("Maximum array size considered when doing a combine"));
+
+// FIXME: Remove this flag when it is no longer necessary to convert
+// llvm.dbg.declare to avoid inaccurate debug info. Setting this to false
+// increases variable availability at the cost of accuracy. Variables that
+// cannot be promoted by mem2reg or SROA will be described as living in memory
+// for their entire lifetime. However, passes like DSE and instcombine can
+// delete stores to the alloca, leading to misleading and inaccurate debug
+// information. This flag can be removed when those passes are fixed.
+static cl::opt<unsigned> ShouldLowerDbgDeclare("instcombine-lower-dbg-declare",
+ cl::Hidden, cl::init(true));
+
Optional<Instruction *>
InstCombiner::targetInstCombineIntrinsic(IntrinsicInst &II) {
// Handle target specific intrinsics
@@ -199,677 +199,677 @@ Optional<Value *> InstCombiner::targetSimplifyDemandedVectorEltsIntrinsic(
}
Value *InstCombinerImpl::EmitGEPOffset(User *GEP) {
- return llvm::EmitGEPOffset(&Builder, DL, GEP);
-}
-
-/// Return true if it is desirable to convert an integer computation from a
-/// given bit width to a new bit width.
-/// We don't want to convert from a legal to an illegal type or from a smaller
-/// to a larger illegal type. A width of '1' is always treated as a legal type
-/// because i1 is a fundamental type in IR, and there are many specialized
-/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
-/// legal to convert to, in order to open up more combining opportunities.
-/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
-/// from frontend languages.
+ return llvm::EmitGEPOffset(&Builder, DL, GEP);
+}
+
+/// Return true if it is desirable to convert an integer computation from a
+/// given bit width to a new bit width.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types. Widths of 8, 16 or 32 are equally treated as
+/// legal to convert to, in order to open up more combining opportunities.
+/// NOTE: this treats i8, i16 and i32 specially, due to them being so common
+/// from frontend languages.
bool InstCombinerImpl::shouldChangeType(unsigned FromWidth,
unsigned ToWidth) const {
- bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
- bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
-
- // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
- // shrink types, to prevent infinite loops.
- if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
- return true;
-
- // If this is a legal integer from type, and the result would be an illegal
- // type, don't do the transformation.
- if (FromLegal && !ToLegal)
- return false;
-
- // Otherwise, if both are illegal, do not increase the size of the result. We
- // do allow things like i160 -> i64, but not i64 -> i160.
- if (!FromLegal && !ToLegal && ToWidth > FromWidth)
- return false;
-
- return true;
-}
-
-/// Return true if it is desirable to convert a computation from 'From' to 'To'.
-/// We don't want to convert from a legal to an illegal type or from a smaller
-/// to a larger illegal type. i1 is always treated as a legal type because it is
-/// a fundamental type in IR, and there are many specialized optimizations for
-/// i1 types.
+ bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+ bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
+
+ // Convert to widths of 8, 16 or 32 even if they are not legal types. Only
+ // shrink types, to prevent infinite loops.
+ if (ToWidth < FromWidth && (ToWidth == 8 || ToWidth == 16 || ToWidth == 32))
+ return true;
+
+ // If this is a legal integer from type, and the result would be an illegal
+ // type, don't do the transformation.
+ if (FromLegal && !ToLegal)
+ return false;
+
+ // Otherwise, if both are illegal, do not increase the size of the result. We
+ // do allow things like i160 -> i64, but not i64 -> i160.
+ if (!FromLegal && !ToLegal && ToWidth > FromWidth)
+ return false;
+
+ return true;
+}
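// The width rules above restated as a standalone predicate. isLegal() stands
// in for DataLayout::isLegalInteger() and assumes a typical 64-bit target's
// legal widths; it is an illustration, not the real query.
#include <cassert>

static bool isLegal(unsigned W) {
  return W == 8 || W == 16 || W == 32 || W == 64;
}

bool shouldChangeWidth(unsigned FromW, unsigned ToW) {
  bool FromLegal = FromW == 1 || isLegal(FromW);
  bool ToLegal = ToW == 1 || isLegal(ToW);
  // Always allow shrinking to the common widths 8/16/32.
  if (ToW < FromW && (ToW == 8 || ToW == 16 || ToW == 32))
    return true;
  // Never turn a legal type into an illegal one.
  if (FromLegal && !ToLegal)
    return false;
  // Between two illegal types, never grow.
  if (!FromLegal && !ToLegal && ToW > FromW)
    return false;
  return true;
}

int main() {
  assert(shouldChangeWidth(33, 32));   // shrink an odd width to a common one
  assert(!shouldChangeWidth(32, 33));  // legal -> illegal: refuse
  assert(shouldChangeWidth(160, 64));  // i160 -> i64: shrinking, allowed
  assert(!shouldChangeWidth(64, 160)); // i64 -> i160: refuse
}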
+
+/// Return true if it is desirable to convert a computation from 'From' to 'To'.
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
bool InstCombinerImpl::shouldChangeType(Type *From, Type *To) const {
- // TODO: This could be extended to allow vectors. Datalayout changes might be
- // needed to properly support that.
- if (!From->isIntegerTy() || !To->isIntegerTy())
- return false;
-
- unsigned FromWidth = From->getPrimitiveSizeInBits();
- unsigned ToWidth = To->getPrimitiveSizeInBits();
- return shouldChangeType(FromWidth, ToWidth);
-}
-
-// Return true if No Signed Wrap should be maintained for I.
-// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
-// where both B and C should be ConstantInts, results in a constant that does
-// not overflow. This function only handles the Add and Sub opcodes. For
-// all other opcodes, the function conservatively returns false.
-static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- if (!OBO || !OBO->hasNoSignedWrap())
- return false;
-
-  // We reason about Add and Sub only.
- Instruction::BinaryOps Opcode = I.getOpcode();
- if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
- return false;
-
- const APInt *BVal, *CVal;
- if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
- return false;
-
- bool Overflow = false;
- if (Opcode == Instruction::Add)
- (void)BVal->sadd_ov(*CVal, Overflow);
- else
- (void)BVal->ssub_ov(*CVal, Overflow);
-
- return !Overflow;
-}
-
-static bool hasNoUnsignedWrap(BinaryOperator &I) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- return OBO && OBO->hasNoUnsignedWrap();
-}
-
-static bool hasNoSignedWrap(BinaryOperator &I) {
- auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
- return OBO && OBO->hasNoSignedWrap();
-}
-
-/// Conservatively clears subclassOptionalData after a reassociation or
-/// commutation. We preserve fast-math flags when applicable as they can be
-/// preserved.
-static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
- FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
- if (!FPMO) {
- I.clearSubclassOptionalData();
- return;
- }
-
- FastMathFlags FMF = I.getFastMathFlags();
- I.clearSubclassOptionalData();
- I.setFastMathFlags(FMF);
-}
-
-/// Combine constant operands of associative operations either before or after a
-/// cast to eliminate one of the associative operations:
-/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
-/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
+ // TODO: This could be extended to allow vectors. Datalayout changes might be
+ // needed to properly support that.
+ if (!From->isIntegerTy() || !To->isIntegerTy())
+ return false;
+
+ unsigned FromWidth = From->getPrimitiveSizeInBits();
+ unsigned ToWidth = To->getPrimitiveSizeInBits();
+ return shouldChangeType(FromWidth, ToWidth);
+}
+
+// Return true if No Signed Wrap should be maintained for I.
+// The No Signed Wrap flag can be kept if the operation "B (I.getOpcode) C",
+// where both B and C should be ConstantInts, results in a constant that does
+// not overflow. This function only handles the Add and Sub opcodes. For
+// all other opcodes, the function conservatively returns false.
+static bool maintainNoSignedWrap(BinaryOperator &I, Value *B, Value *C) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ if (!OBO || !OBO->hasNoSignedWrap())
+ return false;
+
+  // We reason about Add and Sub only.
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ if (Opcode != Instruction::Add && Opcode != Instruction::Sub)
+ return false;
+
+ const APInt *BVal, *CVal;
+ if (!match(B, m_APInt(BVal)) || !match(C, m_APInt(CVal)))
+ return false;
+
+ bool Overflow = false;
+ if (Opcode == Instruction::Add)
+ (void)BVal->sadd_ov(*CVal, Overflow);
+ else
+ (void)BVal->ssub_ov(*CVal, Overflow);
+
+ return !Overflow;
+}
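// A reduced model of the nsw-preservation test above: the flag survives only
// if folding the two constants cannot itself overflow. int64_t stands in for
// APInt here; the Sub case would use the analogous checked subtraction
// (APInt::ssub_ov in the real code).
#include <cassert>
#include <cstdint>
#include <limits>

bool keepNSWOnAdd(int64_t B, int64_t C) {
  // Equivalent to APInt::sadd_ov reporting "no overflow".
  if (C > 0 && B > std::numeric_limits<int64_t>::max() - C)
    return false;
  if (C < 0 && B < std::numeric_limits<int64_t>::min() - C)
    return false;
  return true;
}

int main() {
  assert(keepNSWOnAdd(40, 2));                                   // fine
  assert(!keepNSWOnAdd(std::numeric_limits<int64_t>::max(), 1)); // would wrap
}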
+
+static bool hasNoUnsignedWrap(BinaryOperator &I) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ return OBO && OBO->hasNoUnsignedWrap();
+}
+
+static bool hasNoSignedWrap(BinaryOperator &I) {
+ auto *OBO = dyn_cast<OverflowingBinaryOperator>(&I);
+ return OBO && OBO->hasNoSignedWrap();
+}
+
+/// Conservatively clears subclassOptionalData after a reassociation or
+/// commutation. We preserve fast-math flags when applicable as they can be
+/// preserved.
+static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
+ FPMathOperator *FPMO = dyn_cast<FPMathOperator>(&I);
+ if (!FPMO) {
+ I.clearSubclassOptionalData();
+ return;
+ }
+
+ FastMathFlags FMF = I.getFastMathFlags();
+ I.clearSubclassOptionalData();
+ I.setFastMathFlags(FMF);
+}
+
+/// Combine constant operands of associative operations either before or after a
+/// cast to eliminate one of the associative operations:
+/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
+/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1,
InstCombinerImpl &IC) {
- auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
- if (!Cast || !Cast->hasOneUse())
- return false;
-
- // TODO: Enhance logic for other casts and remove this check.
- auto CastOpcode = Cast->getOpcode();
- if (CastOpcode != Instruction::ZExt)
- return false;
-
- // TODO: Enhance logic for other BinOps and remove this check.
- if (!BinOp1->isBitwiseLogicOp())
- return false;
-
- auto AssocOpcode = BinOp1->getOpcode();
- auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
- if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
- return false;
-
- Constant *C1, *C2;
- if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
- !match(BinOp2->getOperand(1), m_Constant(C2)))
- return false;
-
- // TODO: This assumes a zext cast.
- // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
- // to the destination type might lose bits.
-
- // Fold the constants together in the destination type:
- // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
- Type *DestTy = C1->getType();
- Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
- Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
- IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
- IC.replaceOperand(*BinOp1, 1, FoldedC);
- return true;
-}
-
-/// This performs a few simplifications for operators that are associative or
-/// commutative:
-///
-/// Commutative operators:
-///
-/// 1. Order operands such that they are listed from right (least complex) to
-/// left (most complex). This puts constants before unary operators before
-/// binary operators.
-///
-/// Associative operators:
-///
-/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
-/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
-///
-/// Associative and commutative operators:
-///
-/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
-/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
-/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
-/// if C1 and C2 are constants.
+ auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
+ if (!Cast || !Cast->hasOneUse())
+ return false;
+
+ // TODO: Enhance logic for other casts and remove this check.
+ auto CastOpcode = Cast->getOpcode();
+ if (CastOpcode != Instruction::ZExt)
+ return false;
+
+ // TODO: Enhance logic for other BinOps and remove this check.
+ if (!BinOp1->isBitwiseLogicOp())
+ return false;
+
+ auto AssocOpcode = BinOp1->getOpcode();
+ auto *BinOp2 = dyn_cast<BinaryOperator>(Cast->getOperand(0));
+ if (!BinOp2 || !BinOp2->hasOneUse() || BinOp2->getOpcode() != AssocOpcode)
+ return false;
+
+ Constant *C1, *C2;
+ if (!match(BinOp1->getOperand(1), m_Constant(C1)) ||
+ !match(BinOp2->getOperand(1), m_Constant(C2)))
+ return false;
+
+ // TODO: This assumes a zext cast.
+ // Eg, if it was a trunc, we'd cast C1 to the source type because casting C2
+ // to the destination type might lose bits.
+
+ // Fold the constants together in the destination type:
+ // (op (cast (op X, C2)), C1) --> (op (cast X), FoldedC)
+ Type *DestTy = C1->getType();
+ Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
+ Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
+ IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
+ IC.replaceOperand(*BinOp1, 1, FoldedC);
+ return true;
+}
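// Scalar illustration of the fold above for the zext + 'and' case:
// (zext(X & C2)) & C1 equals (zext X) & (C1 & zext C2), so the two constants
// can be merged in the wider type and one 'and' disappears. Plain integer
// widths model zext here; Or and Xor behave the same way under zext.
#include <cassert>
#include <cstdint>

int main() {
  uint8_t X = 0xAB;
  uint8_t C2 = 0x0F;            // inner constant, narrow type
  uint32_t C1 = 0xFF00FF0F;     // outer constant, wide type
  uint32_t Before = (uint32_t)(X & C2) & C1;  // (and (zext (and X, C2)), C1)
  uint32_t FoldedC = C1 & (uint32_t)C2;       // constants folded in the wide type
  uint32_t After = (uint32_t)X & FoldedC;     // (and (zext X), FoldedC)
  assert(Before == After);
}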
+
+/// This performs a few simplifications for operators that are associative or
+/// commutative:
+///
+/// Commutative operators:
+///
+/// 1. Order operands such that they are listed from right (least complex) to
+/// left (most complex). This puts constants before unary operators before
+/// binary operators.
+///
+/// Associative operators:
+///
+/// 2. Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+/// 3. Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+///
+/// Associative and commutative operators:
+///
+/// 4. Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+/// 5. Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+/// 6. Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+/// if C1 and C2 are constants.
bool InstCombinerImpl::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
- Instruction::BinaryOps Opcode = I.getOpcode();
- bool Changed = false;
-
- do {
- // Order operands such that they are listed from right (least complex) to
- // left (most complex). This puts constants before unary operators before
- // binary operators.
- if (I.isCommutative() && getComplexity(I.getOperand(0)) <
- getComplexity(I.getOperand(1)))
- Changed = !I.swapOperands();
-
- BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
- BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
-
- if (I.isAssociative()) {
- // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
- if (Op0 && Op0->getOpcode() == Opcode) {
- Value *A = Op0->getOperand(0);
- Value *B = Op0->getOperand(1);
- Value *C = I.getOperand(1);
-
- // Does "B op C" simplify?
- if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "A op V".
- replaceOperand(I, 0, A);
- replaceOperand(I, 1, V);
- bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
- bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
-
- // Conservatively clear all optional flags since they may not be
- // preserved by the reassociation. Reset nsw/nuw based on the above
- // analysis.
- ClearSubclassDataAfterReassociation(I);
-
- // Note: this is only valid because SimplifyBinOp doesn't look at
- // the operands to Op0.
- if (IsNUW)
- I.setHasNoUnsignedWrap(true);
-
- if (IsNSW)
- I.setHasNoSignedWrap(true);
-
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
- if (Op1 && Op1->getOpcode() == Opcode) {
- Value *A = I.getOperand(0);
- Value *B = Op1->getOperand(0);
- Value *C = Op1->getOperand(1);
-
- // Does "A op B" simplify?
- if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "V op C".
- replaceOperand(I, 0, V);
- replaceOperand(I, 1, C);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
- }
-
- if (I.isAssociative() && I.isCommutative()) {
- if (simplifyAssocCastAssoc(&I, *this)) {
- Changed = true;
- ++NumReassoc;
- continue;
- }
-
- // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
- if (Op0 && Op0->getOpcode() == Opcode) {
- Value *A = Op0->getOperand(0);
- Value *B = Op0->getOperand(1);
- Value *C = I.getOperand(1);
-
- // Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "V op B".
- replaceOperand(I, 0, V);
- replaceOperand(I, 1, B);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
- if (Op1 && Op1->getOpcode() == Opcode) {
- Value *A = I.getOperand(0);
- Value *B = Op1->getOperand(0);
- Value *C = Op1->getOperand(1);
-
- // Does "C op A" simplify?
- if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
- // It simplifies to V. Form "B op V".
- replaceOperand(I, 0, B);
- replaceOperand(I, 1, V);
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- Changed = true;
- ++NumReassoc;
- continue;
- }
- }
-
- // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
- // if C1 and C2 are constants.
- Value *A, *B;
- Constant *C1, *C2;
- if (Op0 && Op1 &&
- Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
- match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
- match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
- bool IsNUW = hasNoUnsignedWrap(I) &&
- hasNoUnsignedWrap(*Op0) &&
- hasNoUnsignedWrap(*Op1);
- BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
- BinaryOperator::CreateNUW(Opcode, A, B) :
- BinaryOperator::Create(Opcode, A, B);
-
- if (isa<FPMathOperator>(NewBO)) {
- FastMathFlags Flags = I.getFastMathFlags();
- Flags &= Op0->getFastMathFlags();
- Flags &= Op1->getFastMathFlags();
- NewBO->setFastMathFlags(Flags);
- }
- InsertNewInstWith(NewBO, I);
- NewBO->takeName(Op1);
- replaceOperand(I, 0, NewBO);
- replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
- // Conservatively clear the optional flags, since they may not be
- // preserved by the reassociation.
- ClearSubclassDataAfterReassociation(I);
- if (IsNUW)
- I.setHasNoUnsignedWrap(true);
-
- Changed = true;
- continue;
- }
- }
-
- // No further simplifications.
- return Changed;
- } while (true);
-}
-
-/// Return whether "X LOp (Y ROp Z)" is always equal to
-/// "(X LOp Y) ROp (X LOp Z)".
-static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
- Instruction::BinaryOps ROp) {
- // X & (Y | Z) <--> (X & Y) | (X & Z)
- // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
- if (LOp == Instruction::And)
- return ROp == Instruction::Or || ROp == Instruction::Xor;
-
- // X | (Y & Z) <--> (X | Y) & (X | Z)
- if (LOp == Instruction::Or)
- return ROp == Instruction::And;
-
- // X * (Y + Z) <--> (X * Y) + (X * Z)
- // X * (Y - Z) <--> (X * Y) - (X * Z)
- if (LOp == Instruction::Mul)
- return ROp == Instruction::Add || ROp == Instruction::Sub;
-
- return false;
-}
-
-/// Return whether "(X LOp Y) ROp Z" is always equal to
-/// "(X ROp Z) LOp (Y ROp Z)".
-static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
- Instruction::BinaryOps ROp) {
- if (Instruction::isCommutative(ROp))
- return leftDistributesOverRight(ROp, LOp);
-
- // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
- return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
-
- // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
- // but this requires knowing that the addition does not overflow and other
- // such subtleties.
-}
-
-/// Return the identity value for the given opcode, which can be used to
-/// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
-static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
- if (isa<Constant>(V))
- return nullptr;
-
- return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
-}
-
-/// This function predicates factorization using distributive laws. By default,
-/// it just returns the 'Op' inputs. But for special-cases like
-/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
-/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
-/// allow more factorization opportunities.
-static Instruction::BinaryOps
-getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
- Value *&LHS, Value *&RHS) {
- assert(Op && "Expected a binary operator");
- LHS = Op->getOperand(0);
- RHS = Op->getOperand(1);
- if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
- Constant *C;
- if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
- // X << C --> X * (1 << C)
- RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
- return Instruction::Mul;
- }
- // TODO: We can add other conversions e.g. shr => div etc.
- }
- return Op->getOpcode();
-}
-
-/// This tries to simplify binary operations by factorizing out common terms
-/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ bool Changed = false;
+
+ do {
+ // Order operands such that they are listed from right (least complex) to
+ // left (most complex). This puts constants before unary operators before
+ // binary operators.
+ if (I.isCommutative() && getComplexity(I.getOperand(0)) <
+ getComplexity(I.getOperand(1)))
+ Changed = !I.swapOperands();
+
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(I.getOperand(0));
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(I.getOperand(1));
+
+ if (I.isAssociative()) {
+ // Transform: "(A op B) op C" ==> "A op (B op C)" if "B op C" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "B op C" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "A op V".
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, V);
+ bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
+ bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
+
+ // Conservatively clear all optional flags since they may not be
+ // preserved by the reassociation. Reset nsw/nuw based on the above
+ // analysis.
+ ClearSubclassDataAfterReassociation(I);
+
+ // Note: this is only valid because SimplifyBinOp doesn't look at
+ // the operands to Op0.
+ if (IsNUW)
+ I.setHasNoUnsignedWrap(true);
+
+ if (IsNSW)
+ I.setHasNoSignedWrap(true);
+
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "(A op B) op C" if "A op B" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "A op B" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op C".
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, C);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+ }
+
+ if (I.isAssociative() && I.isCommutative()) {
+ if (simplifyAssocCastAssoc(&I, *this)) {
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+
+ // Transform: "(A op B) op C" ==> "(C op A) op B" if "C op A" simplifies.
+ if (Op0 && Op0->getOpcode() == Opcode) {
+ Value *A = Op0->getOperand(0);
+ Value *B = Op0->getOperand(1);
+ Value *C = I.getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "V op B".
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, B);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "A op (B op C)" ==> "B op (C op A)" if "C op A" simplifies.
+ if (Op1 && Op1->getOpcode() == Opcode) {
+ Value *A = I.getOperand(0);
+ Value *B = Op1->getOperand(0);
+ Value *C = Op1->getOperand(1);
+
+ // Does "C op A" simplify?
+ if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
+ // It simplifies to V. Form "B op V".
+ replaceOperand(I, 0, B);
+ replaceOperand(I, 1, V);
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ Changed = true;
+ ++NumReassoc;
+ continue;
+ }
+ }
+
+ // Transform: "(A op C1) op (B op C2)" ==> "(A op B) op (C1 op C2)"
+ // if C1 and C2 are constants.
+ Value *A, *B;
+ Constant *C1, *C2;
+ if (Op0 && Op1 &&
+ Op0->getOpcode() == Opcode && Op1->getOpcode() == Opcode &&
+ match(Op0, m_OneUse(m_BinOp(m_Value(A), m_Constant(C1)))) &&
+ match(Op1, m_OneUse(m_BinOp(m_Value(B), m_Constant(C2))))) {
+ bool IsNUW = hasNoUnsignedWrap(I) &&
+ hasNoUnsignedWrap(*Op0) &&
+ hasNoUnsignedWrap(*Op1);
+ BinaryOperator *NewBO = (IsNUW && Opcode == Instruction::Add) ?
+ BinaryOperator::CreateNUW(Opcode, A, B) :
+ BinaryOperator::Create(Opcode, A, B);
+
+ if (isa<FPMathOperator>(NewBO)) {
+ FastMathFlags Flags = I.getFastMathFlags();
+ Flags &= Op0->getFastMathFlags();
+ Flags &= Op1->getFastMathFlags();
+ NewBO->setFastMathFlags(Flags);
+ }
+ InsertNewInstWith(NewBO, I);
+ NewBO->takeName(Op1);
+ replaceOperand(I, 0, NewBO);
+ replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
+ // Conservatively clear the optional flags, since they may not be
+ // preserved by the reassociation.
+ ClearSubclassDataAfterReassociation(I);
+ if (IsNUW)
+ I.setHasNoUnsignedWrap(true);
+
+ Changed = true;
+ continue;
+ }
+ }
+
+ // No further simplifications.
+ return Changed;
+ } while (true);
+}
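// Worked instance of transform #6 above, "(A op C1) op (B op C2)" ==>
// "(A op B) op (C1 op C2)", with op = and and ignoring the one-use and
// wrap-flag bookkeeping: folding the constants first lets the whole
// expression collapse to zero here.
#include <cassert>

unsigned before(unsigned A, unsigned B) { return (A & 0xF0u) & (B & 0x0Fu); }
unsigned after(unsigned A, unsigned B)  { return (A & B) & (0xF0u & 0x0Fu); }

int main() {
  for (unsigned A : {0u, 0xABu, 0xFFu})
    for (unsigned B : {0u, 0xCDu, 0xFFu})
      assert(before(A, B) == after(A, B) && after(A, B) == 0u);
}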
+
+/// Return whether "X LOp (Y ROp Z)" is always equal to
+/// "(X LOp Y) ROp (X LOp Z)".
+static bool leftDistributesOverRight(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ // X & (Y | Z) <--> (X & Y) | (X & Z)
+ // X & (Y ^ Z) <--> (X & Y) ^ (X & Z)
+ if (LOp == Instruction::And)
+ return ROp == Instruction::Or || ROp == Instruction::Xor;
+
+ // X | (Y & Z) <--> (X | Y) & (X | Z)
+ if (LOp == Instruction::Or)
+ return ROp == Instruction::And;
+
+ // X * (Y + Z) <--> (X * Y) + (X * Z)
+ // X * (Y - Z) <--> (X * Y) - (X * Z)
+ if (LOp == Instruction::Mul)
+ return ROp == Instruction::Add || ROp == Instruction::Sub;
+
+ return false;
+}
+
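// A standalone spot-check (plain C++, not LLVM code) of the left-distribution
// identities encoded above, on arbitrary sample bit patterns.
static_assert((12u & (10u | 6u)) == ((12u & 10u) | (12u & 6u)),
              "and distributes over or");
static_assert((12u & (10u ^ 6u)) == ((12u & 10u) ^ (12u & 6u)),
              "and distributes over xor");
static_assert((12u | (10u & 6u)) == ((12u | 10u) & (12u | 6u)),
              "or distributes over and");
static_assert(12u * (10u + 6u) == 12u * 10u + 12u * 6u,
              "mul distributes over add");
static_assert(12u * (10u - 6u) == 12u * 10u - 12u * 6u,
              "mul distributes over sub");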
+/// Return whether "(X LOp Y) ROp Z" is always equal to
+/// "(X ROp Z) LOp (Y ROp Z)".
+static bool rightDistributesOverLeft(Instruction::BinaryOps LOp,
+ Instruction::BinaryOps ROp) {
+ if (Instruction::isCommutative(ROp))
+ return leftDistributesOverRight(ROp, LOp);
+
+ // (X {&|^} Y) >> Z <--> (X >> Z) {&|^} (Y >> Z) for all shifts.
+ return Instruction::isBitwiseLogicOp(LOp) && Instruction::isShift(ROp);
+
+ // TODO: It would be nice to handle division, aka "(X + Y)/Z = X/Z + Y/Z",
+ // but this requires knowing that the addition does not overflow and other
+ // such subtleties.
+}
+
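// A standalone spot-check (plain C++, not LLVM code) of the right-distribution
// case handled above: bitwise logic distributes over a common shift amount.
static_assert(((0xF0u & 0xCCu) >> 2) == ((0xF0u >> 2) & (0xCCu >> 2)),
              "and distributes over lshr");
static_assert(((0xF0u | 0xCCu) >> 2) == ((0xF0u >> 2) | (0xCCu >> 2)),
              "or distributes over lshr");
static_assert(((0xF0u ^ 0xCCu) >> 2) == ((0xF0u >> 2) ^ (0xCCu >> 2)),
              "xor distributes over lshr");
// The TODO above is real: division does not distribute without extra knowledge.
static_assert((1u + 1u) / 2u == 1u && (1u / 2u + 1u / 2u) == 0u,
              "(X + Y)/Z and X/Z + Y/Z can differ");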
+/// This function returns the identity value for the given opcode, which can be
+/// used to factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
+ if (isa<Constant>(V))
+ return nullptr;
+
+ return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
+}
+
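// A standalone spot-check (plain C++, not LLVM code) of the identity values
// this helper hands back, and of the factoring pattern named in the comment.
static_assert((0x1234u + 0u) == 0x1234u && (0x1234u * 1u) == 0x1234u,
              "0 is the identity of add, 1 the identity of mul");
static_assert((0x1234u | 0u) == 0x1234u && (0x1234u ^ 0u) == 0x1234u,
              "0 is the identity of or and xor");
static_assert((0x1234u & ~0u) == 0x1234u, "all-ones is the identity of and");
static_assert((7u * 2u) + 7u == 7u * (2u + 1u),
              "(X * 2) + X factors as X * (2 + 1) once X is viewed as X * 1");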
+/// This function predicates factorization using distributive laws. By default,
+/// it just returns the 'Op' inputs. But for special cases like
+/// 'add(shl(X, 5), ...)', this function will have TopOpcode == Instruction::Add
+/// and Op = shl(X, 5). The 'shl' is treated as the more general 'mul X, 32' to
+/// allow more factorization opportunities.
+static Instruction::BinaryOps
+getBinOpsForFactorization(Instruction::BinaryOps TopOpcode, BinaryOperator *Op,
+ Value *&LHS, Value *&RHS) {
+ assert(Op && "Expected a binary operator");
+ LHS = Op->getOperand(0);
+ RHS = Op->getOperand(1);
+ if (TopOpcode == Instruction::Add || TopOpcode == Instruction::Sub) {
+ Constant *C;
+ if (match(Op, m_Shl(m_Value(), m_Constant(C)))) {
+ // X << C --> X * (1 << C)
+ RHS = ConstantExpr::getShl(ConstantInt::get(Op->getType(), 1), C);
+ return Instruction::Mul;
+ }
+ // TODO: We can add other conversions e.g. shr => div etc.
+ }
+ return Op->getOpcode();
+}
+
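// A standalone spot-check (plain C++, not LLVM code) of the shl-to-mul view
// taken above: under add/sub, "X << C" is treated as "X * (1 << C)".
static_assert((3u << 5) == 3u * (1u << 5), "X << 5 equals X * 32");
static_assert((3u << 5) + 3u == 3u * ((1u << 5) + 1u),
              "so add(shl(X, 5), X) can be factored as X * 33");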
+/// This tries to simplify binary operations by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)").
Value *InstCombinerImpl::tryFactorization(BinaryOperator &I,
Instruction::BinaryOps InnerOpcode,
Value *A, Value *B, Value *C,
Value *D) {
- assert(A && B && C && D && "All values must be provided");
-
- Value *V = nullptr;
- Value *SimplifiedInst = nullptr;
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
-
- // Does "X op' Y" always equal "Y op' X"?
- bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
-
- // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
- if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
- // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
- // commutative case, "(A op' B) op (C op' A)"?
- if (A == C || (InnerCommutative && A == D)) {
- if (A != C)
- std::swap(C, D);
- // Consider forming "A op' (B op D)".
- // If "B op D" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
- // If "B op D" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
- if (V) {
- SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
- }
- }
-
- // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
- if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
- // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
- // commutative case, "(A op' B) op (B op' D)"?
- if (B == D || (InnerCommutative && B == C)) {
- if (B != D)
- std::swap(C, D);
- // Consider forming "(A op C) op' B".
- // If "A op C" simplifies then it can be formed with no cost.
- V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
-
- // If "A op C" doesn't simplify then only go on if both of the existing
- // operations "A op' B" and "C op' D" will be zapped as no longer used.
- if (!V && LHS->hasOneUse() && RHS->hasOneUse())
- V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
- if (V) {
- SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
- }
- }
-
- if (SimplifiedInst) {
- ++NumFactor;
- SimplifiedInst->takeName(&I);
-
- // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
- if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
- bool HasNSW = false;
- bool HasNUW = false;
- if (isa<OverflowingBinaryOperator>(&I)) {
- HasNSW = I.hasNoSignedWrap();
- HasNUW = I.hasNoUnsignedWrap();
- }
-
- if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
- HasNSW &= LOBO->hasNoSignedWrap();
- HasNUW &= LOBO->hasNoUnsignedWrap();
- }
-
- if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
- HasNSW &= ROBO->hasNoSignedWrap();
- HasNUW &= ROBO->hasNoUnsignedWrap();
- }
-
- if (TopLevelOpcode == Instruction::Add &&
- InnerOpcode == Instruction::Mul) {
- // We can propagate 'nsw' if we know that
- // %Y = mul nsw i16 %X, C
- // %Z = add nsw i16 %Y, %X
- // =>
- // %Z = mul nsw i16 %X, C+1
- //
- // iff C+1 isn't INT_MIN
- const APInt *CInt;
- if (match(V, m_APInt(CInt))) {
- if (!CInt->isMinSignedValue())
- BO->setHasNoSignedWrap(HasNSW);
- }
-
- // nuw can be propagated with any constant or nuw value.
- BO->setHasNoUnsignedWrap(HasNUW);
- }
- }
- }
- }
- return SimplifiedInst;
-}
-
-/// This tries to simplify binary operations which some other binary operation
-/// distributes over either by factorizing out common terms
-/// (eg "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
-/// simplifications (eg: "A & (B | C) -> (A&B) | (A&C)" if this is a win).
-/// Returns the simplified value, or null if it didn't simplify.
+ assert(A && B && C && D && "All values must be provided");
+
+ Value *V = nullptr;
+ Value *SimplifiedInst = nullptr;
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ // Does "X op' Y" always equal "Y op' X"?
+ bool InnerCommutative = Instruction::isCommutative(InnerOpcode);
+
+ // Does "X op' (Y op Z)" always equal "(X op' Y) op (X op' Z)"?
+ if (leftDistributesOverRight(InnerOpcode, TopLevelOpcode))
+ // Does the instruction have the form "(A op' B) op (A op' D)" or, in the
+ // commutative case, "(A op' B) op (C op' A)"?
+ if (A == C || (InnerCommutative && A == D)) {
+ if (A != C)
+ std::swap(C, D);
+ // Consider forming "A op' (B op D)".
+ // If "B op D" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, B, D, SQ.getWithInstruction(&I));
+ // If "B op D" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, B, D, RHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, A, V);
+ }
+ }
+
+ // Does "(X op Y) op' Z" always equal "(X op' Z) op (Y op' Z)"?
+ if (!SimplifiedInst && rightDistributesOverLeft(TopLevelOpcode, InnerOpcode))
+ // Does the instruction have the form "(A op' B) op (C op' B)" or, in the
+ // commutative case, "(A op' B) op (B op' D)"?
+ if (B == D || (InnerCommutative && B == C)) {
+ if (B != D)
+ std::swap(C, D);
+ // Consider forming "(A op C) op' B".
+ // If "A op C" simplifies then it can be formed with no cost.
+ V = SimplifyBinOp(TopLevelOpcode, A, C, SQ.getWithInstruction(&I));
+
+ // If "A op C" doesn't simplify then only go on if both of the existing
+ // operations "A op' B" and "C op' D" will be zapped as no longer used.
+ if (!V && LHS->hasOneUse() && RHS->hasOneUse())
+ V = Builder.CreateBinOp(TopLevelOpcode, A, C, LHS->getName());
+ if (V) {
+ SimplifiedInst = Builder.CreateBinOp(InnerOpcode, V, B);
+ }
+ }
+
+ if (SimplifiedInst) {
+ ++NumFactor;
+ SimplifiedInst->takeName(&I);
+
+ // Check if we can add NSW/NUW flags to SimplifiedInst. If so, set them.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(SimplifiedInst)) {
+ if (isa<OverflowingBinaryOperator>(SimplifiedInst)) {
+ bool HasNSW = false;
+ bool HasNUW = false;
+ if (isa<OverflowingBinaryOperator>(&I)) {
+ HasNSW = I.hasNoSignedWrap();
+ HasNUW = I.hasNoUnsignedWrap();
+ }
+
+ if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS)) {
+ HasNSW &= LOBO->hasNoSignedWrap();
+ HasNUW &= LOBO->hasNoUnsignedWrap();
+ }
+
+ if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS)) {
+ HasNSW &= ROBO->hasNoSignedWrap();
+ HasNUW &= ROBO->hasNoUnsignedWrap();
+ }
+
+ if (TopLevelOpcode == Instruction::Add &&
+ InnerOpcode == Instruction::Mul) {
+ // We can propagate 'nsw' if we know that
+ // %Y = mul nsw i16 %X, C
+ // %Z = add nsw i16 %Y, %X
+ // =>
+ // %Z = mul nsw i16 %X, C+1
+ //
+ // iff C+1 isn't INT_MIN
+ const APInt *CInt;
+ if (match(V, m_APInt(CInt))) {
+ if (!CInt->isMinSignedValue())
+ BO->setHasNoSignedWrap(HasNSW);
+ }
+
+ // nuw can be propagated with any constant or nuw value.
+ BO->setHasNoUnsignedWrap(HasNUW);
+ }
+ }
+ }
+ }
+ return SimplifiedInst;
+}
+
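// A standalone spot-check (plain C++, not LLVM code) of the factorizations
// tryFactorization aims for, on arbitrary sample constants; the nsw/nuw flag
// handling above has no counterpart in plain wrapping arithmetic.
static_assert(6u * 5u + 6u * 9u == 6u * (5u + 9u),
              "common left factor: (A*B)+(A*C) == A*(B+C)");
static_assert(5u * 6u + 9u * 6u == (5u + 9u) * 6u,
              "common right factor: (A*B)+(C*B) == (A+C)*B");
static_assert(4u * 5u + 4u == 4u * (5u + 1u),
              "a lone A joins in via the mul identity: (A*C)+A == A*(C+1)");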
+/// This tries to simplify binary operations which some other binary operation
+/// distributes over either by factorizing out common terms
+/// (e.g. "(A*B)+(A*C)" -> "A*(B+C)") or expanding out if this results in
+/// simplifications (e.g. "A & (B | C) -> (A&B) | (A&C)" if this is a win).
+/// Returns the simplified value, or null if it didn't simplify.
Value *InstCombinerImpl::SimplifyUsingDistributiveLaws(BinaryOperator &I) {
- Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
- BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
- BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
- Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
-
- {
- // Factorization.
- Value *A, *B, *C, *D;
- Instruction::BinaryOps LHSOpcode, RHSOpcode;
- if (Op0)
- LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
- if (Op1)
- RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
-
- // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
- // a common term.
- if (Op0 && Op1 && LHSOpcode == RHSOpcode)
- if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
- return V;
-
- // The instruction has the form "(A op' B) op (C)". Try to factorize common
- // term.
- if (Op0)
- if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
- if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
- return V;
-
- // The instruction has the form "(B) op (C op' D)". Try to factorize common
- // term.
- if (Op1)
- if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
- if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
- return V;
- }
-
- // Expansion.
- if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
- // The instruction has the form "(A op' B) op C". See if expanding it out
- // to "(A op C) op' (B op C)" results in simplifications.
- Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
- Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
-
+ Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+ BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS);
+ BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS);
+ Instruction::BinaryOps TopLevelOpcode = I.getOpcode();
+
+ {
+ // Factorization.
+ Value *A, *B, *C, *D;
+ Instruction::BinaryOps LHSOpcode, RHSOpcode;
+ if (Op0)
+ LHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op0, A, B);
+ if (Op1)
+ RHSOpcode = getBinOpsForFactorization(TopLevelOpcode, Op1, C, D);
+
+ // The instruction has the form "(A op' B) op (C op' D)". Try to factorize
+ // a common term.
+ if (Op0 && Op1 && LHSOpcode == RHSOpcode)
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, C, D))
+ return V;
+
+ // The instruction has the form "(A op' B) op (C)". Try to factorize common
+ // term.
+ if (Op0)
+ if (Value *Ident = getIdentityValue(LHSOpcode, RHS))
+ if (Value *V = tryFactorization(I, LHSOpcode, A, B, RHS, Ident))
+ return V;
+
+ // The instruction has the form "(B) op (C op' D)". Try to factorize common
+ // term.
+ if (Op1)
+ if (Value *Ident = getIdentityValue(RHSOpcode, LHS))
+ if (Value *V = tryFactorization(I, RHSOpcode, LHS, Ident, C, D))
+ return V;
+ }
+
+ // Expansion.
+ if (Op0 && rightDistributesOverLeft(Op0->getOpcode(), TopLevelOpcode)) {
+ // The instruction has the form "(A op' B) op C". See if expanding it out
+ // to "(A op C) op' (B op C)" results in simplifications.
+ Value *A = Op0->getOperand(0), *B = Op0->getOperand(1), *C = RHS;
+ Instruction::BinaryOps InnerOpcode = Op0->getOpcode(); // op'
+
// Disable the use of undef because it's not safe to distribute undef.
auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef();
Value *L = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive);
Value *R = SimplifyBinOp(TopLevelOpcode, B, C, SQDistributive);
-
- // Do "A op C" and "B op C" both simplify?
- if (L && R) {
- // They do! Return "L op' R".
- ++NumExpand;
- C = Builder.CreateBinOp(InnerOpcode, L, R);
- C->takeName(&I);
- return C;
- }
-
- // Does "A op C" simplify to the identity value for the inner opcode?
- if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
- // They do! Return "B op C".
- ++NumExpand;
- C = Builder.CreateBinOp(TopLevelOpcode, B, C);
- C->takeName(&I);
- return C;
- }
-
- // Does "B op C" simplify to the identity value for the inner opcode?
- if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
- // They do! Return "A op C".
- ++NumExpand;
- C = Builder.CreateBinOp(TopLevelOpcode, A, C);
- C->takeName(&I);
- return C;
- }
- }
-
- if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
- // The instruction has the form "A op (B op' C)". See if expanding it out
- // to "(A op B) op' (A op C)" results in simplifications.
- Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
- Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
-
+
+ // Do "A op C" and "B op C" both simplify?
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ C = Builder.CreateBinOp(InnerOpcode, L, R);
+ C->takeName(&I);
+ return C;
+ }
+
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "B op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, B, C);
+ C->takeName(&I);
+ return C;
+ }
+
+ // Does "B op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ C = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ C->takeName(&I);
+ return C;
+ }
+ }
+
+ if (Op1 && leftDistributesOverRight(TopLevelOpcode, Op1->getOpcode())) {
+ // The instruction has the form "A op (B op' C)". See if expanding it out
+ // to "(A op B) op' (A op C)" results in simplifications.
+ Value *A = LHS, *B = Op1->getOperand(0), *C = Op1->getOperand(1);
+ Instruction::BinaryOps InnerOpcode = Op1->getOpcode(); // op'
+
// Disable the use of undef because it's not safe to distribute undef.
auto SQDistributive = SQ.getWithInstruction(&I).getWithoutUndef();
Value *L = SimplifyBinOp(TopLevelOpcode, A, B, SQDistributive);
Value *R = SimplifyBinOp(TopLevelOpcode, A, C, SQDistributive);
-
- // Do "A op B" and "A op C" both simplify?
- if (L && R) {
- // They do! Return "L op' R".
- ++NumExpand;
- A = Builder.CreateBinOp(InnerOpcode, L, R);
- A->takeName(&I);
- return A;
- }
-
- // Does "A op B" simplify to the identity value for the inner opcode?
- if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
- // They do! Return "A op C".
- ++NumExpand;
- A = Builder.CreateBinOp(TopLevelOpcode, A, C);
- A->takeName(&I);
- return A;
- }
-
- // Does "A op C" simplify to the identity value for the inner opcode?
- if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
- // They do! Return "A op B".
- ++NumExpand;
- A = Builder.CreateBinOp(TopLevelOpcode, A, B);
- A->takeName(&I);
- return A;
- }
- }
-
- return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
-}
-
+
+ // Do "A op B" and "A op C" both simplify?
+ if (L && R) {
+ // They do! Return "L op' R".
+ ++NumExpand;
+ A = Builder.CreateBinOp(InnerOpcode, L, R);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op B" simplify to the identity value for the inner opcode?
+ if (L && L == ConstantExpr::getBinOpIdentity(InnerOpcode, L->getType())) {
+ // They do! Return "A op C".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, C);
+ A->takeName(&I);
+ return A;
+ }
+
+ // Does "A op C" simplify to the identity value for the inner opcode?
+ if (R && R == ConstantExpr::getBinOpIdentity(InnerOpcode, R->getType())) {
+ // They do! Return "A op B".
+ ++NumExpand;
+ A = Builder.CreateBinOp(TopLevelOpcode, A, B);
+ A->takeName(&I);
+ return A;
+ }
+ }
+
+ return SimplifySelectsFeedingBinaryOp(I, LHS, RHS);
+}
+
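// A standalone spot-check (plain C++, not LLVM code) of the expansion cases
// above, on arbitrary sample bit patterns.
static_assert((0xA5u & (0xA5u | 0x0Fu)) == 0xA5u,
              "A & (A | C): both expanded pieces simplify, leaving just A");
static_assert((0xF0u & (0x0Fu | 0x3Cu)) == (0xF0u & 0x3Cu),
              "when A & B folds to 0 (the identity of or), only A & C remains");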
Value *InstCombinerImpl::SimplifySelectsFeedingBinaryOp(BinaryOperator &I,
Value *LHS,
Value *RHS) {
- Value *A, *B, *C, *D, *E, *F;
- bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
- bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
- if (!LHSIsSelect && !RHSIsSelect)
- return nullptr;
-
- FastMathFlags FMF;
- BuilderTy::FastMathFlagGuard Guard(Builder);
- if (isa<FPMathOperator>(&I)) {
- FMF = I.getFastMathFlags();
- Builder.setFastMathFlags(FMF);
- }
-
- Instruction::BinaryOps Opcode = I.getOpcode();
- SimplifyQuery Q = SQ.getWithInstruction(&I);
-
- Value *Cond, *True = nullptr, *False = nullptr;
- if (LHSIsSelect && RHSIsSelect && A == D) {
- // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
- Cond = A;
- True = SimplifyBinOp(Opcode, B, E, FMF, Q);
- False = SimplifyBinOp(Opcode, C, F, FMF, Q);
-
- if (LHS->hasOneUse() && RHS->hasOneUse()) {
- if (False && !True)
- True = Builder.CreateBinOp(Opcode, B, E);
- else if (True && !False)
- False = Builder.CreateBinOp(Opcode, C, F);
- }
- } else if (LHSIsSelect && LHS->hasOneUse()) {
- // (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
- Cond = A;
- True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
- False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
- } else if (RHSIsSelect && RHS->hasOneUse()) {
- // X op (D ? E : F) -> D ? (X op E) : (X op F)
- Cond = D;
- True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
- False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
- }
-
- if (!True || !False)
- return nullptr;
-
- Value *SI = Builder.CreateSelect(Cond, True, False);
- SI->takeName(&I);
- return SI;
-}
-
+ Value *A, *B, *C, *D, *E, *F;
+ bool LHSIsSelect = match(LHS, m_Select(m_Value(A), m_Value(B), m_Value(C)));
+ bool RHSIsSelect = match(RHS, m_Select(m_Value(D), m_Value(E), m_Value(F)));
+ if (!LHSIsSelect && !RHSIsSelect)
+ return nullptr;
+
+ FastMathFlags FMF;
+ BuilderTy::FastMathFlagGuard Guard(Builder);
+ if (isa<FPMathOperator>(&I)) {
+ FMF = I.getFastMathFlags();
+ Builder.setFastMathFlags(FMF);
+ }
+
+ Instruction::BinaryOps Opcode = I.getOpcode();
+ SimplifyQuery Q = SQ.getWithInstruction(&I);
+
+ Value *Cond, *True = nullptr, *False = nullptr;
+ if (LHSIsSelect && RHSIsSelect && A == D) {
+ // (A ? B : C) op (A ? E : F) -> A ? (B op E) : (C op F)
+ Cond = A;
+ True = SimplifyBinOp(Opcode, B, E, FMF, Q);
+ False = SimplifyBinOp(Opcode, C, F, FMF, Q);
+
+ if (LHS->hasOneUse() && RHS->hasOneUse()) {
+ if (False && !True)
+ True = Builder.CreateBinOp(Opcode, B, E);
+ else if (True && !False)
+ False = Builder.CreateBinOp(Opcode, C, F);
+ }
+ } else if (LHSIsSelect && LHS->hasOneUse()) {
+ // (A ? B : C) op Y -> A ? (B op Y) : (C op Y)
+ Cond = A;
+ True = SimplifyBinOp(Opcode, B, RHS, FMF, Q);
+ False = SimplifyBinOp(Opcode, C, RHS, FMF, Q);
+ } else if (RHSIsSelect && RHS->hasOneUse()) {
+ // X op (D ? E : F) -> D ? (X op E) : (X op F)
+ Cond = D;
+ True = SimplifyBinOp(Opcode, LHS, E, FMF, Q);
+ False = SimplifyBinOp(Opcode, LHS, F, FMF, Q);
+ }
+
+ if (!True || !False)
+ return nullptr;
+
+ Value *SI = Builder.CreateSelect(Cond, True, False);
+ SI->takeName(&I);
+ return SI;
+}
+
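// A standalone sketch (plain C++, not LLVM code) of the select-feeding-binop
// rewrite: the op is pushed into both arms of the select. The function names
// and sample values below are made up for illustration only.
constexpr unsigned selThenAdd(bool Cond, unsigned B, unsigned C, unsigned Y) {
  return (Cond ? B : C) + Y;            // "(A ? B : C) op Y"
}
constexpr unsigned addIntoSel(bool Cond, unsigned B, unsigned C, unsigned Y) {
  return Cond ? (B + Y) : (C + Y);      // "A ? (B op Y) : (C op Y)"
}
static_assert(selThenAdd(true, 7u, 9u, 5u) == addIntoSel(true, 7u, 9u, 5u),
              "both forms agree when the condition is true");
static_assert(selThenAdd(false, 7u, 9u, 5u) == addIntoSel(false, 7u, 9u, 5u),
              "and when it is false");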
/// Freely adapt every user of V as-if V was changed to !V.
/// WARNING: only if canFreelyInvertAllUsersOf() said this can be done.
void InstCombinerImpl::freelyInvertAllUsersOf(Value *I) {
@@ -894,288 +894,288 @@ void InstCombinerImpl::freelyInvertAllUsersOf(Value *I) {
}
}
-/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
-/// constant zero (which is the 'negate' form).
+/// Given a 'sub' instruction, return the RHS of the instruction if the LHS is a
+/// constant zero (which is the 'negate' form).
Value *InstCombinerImpl::dyn_castNegVal(Value *V) const {
- Value *NegV;
- if (match(V, m_Neg(m_Value(NegV))))
- return NegV;
-
- // Constants can be considered to be negated values if they can be folded.
- if (ConstantInt *C = dyn_cast<ConstantInt>(V))
- return ConstantExpr::getNeg(C);
-
- if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
- if (C->getType()->getElementType()->isIntegerTy())
- return ConstantExpr::getNeg(C);
-
- if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
- for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
- Constant *Elt = CV->getAggregateElement(i);
- if (!Elt)
- return nullptr;
-
- if (isa<UndefValue>(Elt))
- continue;
-
- if (!isa<ConstantInt>(Elt))
- return nullptr;
- }
- return ConstantExpr::getNeg(CV);
- }
-
- return nullptr;
-}
-
-static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
- InstCombiner::BuilderTy &Builder) {
- if (auto *Cast = dyn_cast<CastInst>(&I))
- return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
-
- assert(I.isBinaryOp() && "Unexpected opcode for select folding");
-
- // Figure out if the constant is the left or the right argument.
- bool ConstIsRHS = isa<Constant>(I.getOperand(1));
- Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
-
- if (auto *SOC = dyn_cast<Constant>(SO)) {
- if (ConstIsRHS)
- return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
- return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
- }
-
- Value *Op0 = SO, *Op1 = ConstOperand;
- if (!ConstIsRHS)
- std::swap(Op0, Op1);
-
- auto *BO = cast<BinaryOperator>(&I);
- Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
- SO->getName() + ".op");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(BO);
- return RI;
-}
-
+ Value *NegV;
+ if (match(V, m_Neg(m_Value(NegV))))
+ return NegV;
+
+ // Constants can be considered to be negated values if they can be folded.
+ if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantDataVector *C = dyn_cast<ConstantDataVector>(V))
+ if (C->getType()->getElementType()->isIntegerTy())
+ return ConstantExpr::getNeg(C);
+
+ if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+ for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ Constant *Elt = CV->getAggregateElement(i);
+ if (!Elt)
+ return nullptr;
+
+ if (isa<UndefValue>(Elt))
+ continue;
+
+ if (!isa<ConstantInt>(Elt))
+ return nullptr;
+ }
+ return ConstantExpr::getNeg(CV);
+ }
+
+ return nullptr;
+}
+
+static Value *foldOperationIntoSelectOperand(Instruction &I, Value *SO,
+ InstCombiner::BuilderTy &Builder) {
+ if (auto *Cast = dyn_cast<CastInst>(&I))
+ return Builder.CreateCast(Cast->getOpcode(), SO, I.getType());
+
+ assert(I.isBinaryOp() && "Unexpected opcode for select folding");
+
+ // Figure out if the constant is the left or the right argument.
+ bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+ Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+ if (auto *SOC = dyn_cast<Constant>(SO)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I.getOpcode(), SOC, ConstOperand);
+ return ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+ }
+
+ Value *Op0 = SO, *Op1 = ConstOperand;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ auto *BO = cast<BinaryOperator>(&I);
+ Value *RI = Builder.CreateBinOp(BO->getOpcode(), Op0, Op1,
+ SO->getName() + ".op");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(BO);
+ return RI;
+}
+
Instruction *InstCombinerImpl::FoldOpIntoSelect(Instruction &Op,
SelectInst *SI) {
- // Don't modify shared select instructions.
- if (!SI->hasOneUse())
- return nullptr;
-
- Value *TV = SI->getTrueValue();
- Value *FV = SI->getFalseValue();
- if (!(isa<Constant>(TV) || isa<Constant>(FV)))
- return nullptr;
-
- // Bool selects with constant operands can be folded to logical ops.
- if (SI->getType()->isIntOrIntVectorTy(1))
- return nullptr;
-
- // If it's a bitcast involving vectors, make sure it has the same number of
- // elements on both sides.
- if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
- VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
- VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
-
- // Verify that either both or neither are vectors.
- if ((SrcTy == nullptr) != (DestTy == nullptr))
- return nullptr;
-
- // If vectors, verify that they have the same number of elements.
+ // Don't modify shared select instructions.
+ if (!SI->hasOneUse())
+ return nullptr;
+
+ Value *TV = SI->getTrueValue();
+ Value *FV = SI->getFalseValue();
+ if (!(isa<Constant>(TV) || isa<Constant>(FV)))
+ return nullptr;
+
+ // Bool selects with constant operands can be folded to logical ops.
+ if (SI->getType()->isIntOrIntVectorTy(1))
+ return nullptr;
+
+ // If it's a bitcast involving vectors, make sure it has the same number of
+ // elements on both sides.
+ if (auto *BC = dyn_cast<BitCastInst>(&Op)) {
+ VectorType *DestTy = dyn_cast<VectorType>(BC->getDestTy());
+ VectorType *SrcTy = dyn_cast<VectorType>(BC->getSrcTy());
+
+ // Verify that either both or neither are vectors.
+ if ((SrcTy == nullptr) != (DestTy == nullptr))
+ return nullptr;
+
+ // If vectors, verify that they have the same number of elements.
if (SrcTy && SrcTy->getElementCount() != DestTy->getElementCount())
- return nullptr;
- }
-
- // Test if a CmpInst instruction is used exclusively by a select as
- // part of a minimum or maximum operation. If so, refrain from doing
- // any other folding. This helps out other analyses which understand
- // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
- // and CodeGen. And in this case, at least one of the comparison
- // operands has at least one user besides the compare (the select),
- // which would often largely negate the benefit of folding anyway.
- if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
- if (CI->hasOneUse()) {
- Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
-
- // FIXME: This is a hack to avoid infinite looping with min/max patterns.
- // We have to ensure that vector constants that only differ with
- // undef elements are treated as equivalent.
- auto areLooselyEqual = [](Value *A, Value *B) {
- if (A == B)
- return true;
-
- // Test for vector constants.
- Constant *ConstA, *ConstB;
- if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
- return false;
-
- // TODO: Deal with FP constants?
- if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
- return false;
-
- // Compare for equality including undefs as equal.
- auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
- const APInt *C;
- return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
- };
-
- if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
- (areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
- return nullptr;
- }
- }
-
- Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
- Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
- return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
-}
-
-static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
- InstCombiner::BuilderTy &Builder) {
- bool ConstIsRHS = isa<Constant>(I->getOperand(1));
- Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
-
- if (auto *InC = dyn_cast<Constant>(InV)) {
- if (ConstIsRHS)
- return ConstantExpr::get(I->getOpcode(), InC, C);
- return ConstantExpr::get(I->getOpcode(), C, InC);
- }
-
- Value *Op0 = InV, *Op1 = C;
- if (!ConstIsRHS)
- std::swap(Op0, Op1);
-
- Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
- auto *FPInst = dyn_cast<Instruction>(RI);
- if (FPInst && isa<FPMathOperator>(FPInst))
- FPInst->copyFastMathFlags(I);
- return RI;
-}
-
+ return nullptr;
+ }
+
+ // Test if a CmpInst instruction is used exclusively by a select as
+ // part of a minimum or maximum operation. If so, refrain from doing
+ // any other folding. This helps out other analyses which understand
+ // non-obfuscated minimum and maximum idioms, such as ScalarEvolution
+ // and CodeGen. And in this case, at least one of the comparison
+ // operands has at least one user besides the compare (the select),
+ // which would often largely negate the benefit of folding anyway.
+ if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
+ if (CI->hasOneUse()) {
+ Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+
+ // FIXME: This is a hack to avoid infinite looping with min/max patterns.
+      // We have to ensure that vector constants that differ only in
+      // undef elements are treated as equivalent.
+ auto areLooselyEqual = [](Value *A, Value *B) {
+ if (A == B)
+ return true;
+
+ // Test for vector constants.
+ Constant *ConstA, *ConstB;
+ if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
+ return false;
+
+ // TODO: Deal with FP constants?
+ if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
+ return false;
+
+ // Compare for equality including undefs as equal.
+ auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
+ const APInt *C;
+ return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
+ };
+
+ if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
+ (areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
+ return nullptr;
+ }
+ }
+
+ Value *NewTV = foldOperationIntoSelectOperand(Op, TV, Builder);
+ Value *NewFV = foldOperationIntoSelectOperand(Op, FV, Builder);
+ return SelectInst::Create(SI->getCondition(), NewTV, NewFV, "", nullptr, SI);
+}
+
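// A standalone sketch (plain C++, not LLVM code) of the payoff FoldOpIntoSelect
// is after: when a select arm is a constant, the folded arm becomes a constant
// too. Names and sample values are made up for illustration only.
constexpr unsigned mulAfterSel(bool Cond, unsigned X) {
  return (Cond ? 7u : X) * 3u;          // op applied to the whole select
}
constexpr unsigned mulIntoSel(bool Cond, unsigned X) {
  return Cond ? 21u : X * 3u;           // constant arm folds to 7 * 3 == 21
}
static_assert(mulAfterSel(true, 11u) == mulIntoSel(true, 11u),
              "constant arm: both forms give 21");
static_assert(mulAfterSel(false, 11u) == mulIntoSel(false, 11u),
              "non-constant arm: both forms give X * 3");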
+static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
+ InstCombiner::BuilderTy &Builder) {
+ bool ConstIsRHS = isa<Constant>(I->getOperand(1));
+ Constant *C = cast<Constant>(I->getOperand(ConstIsRHS));
+
+ if (auto *InC = dyn_cast<Constant>(InV)) {
+ if (ConstIsRHS)
+ return ConstantExpr::get(I->getOpcode(), InC, C);
+ return ConstantExpr::get(I->getOpcode(), C, InC);
+ }
+
+ Value *Op0 = InV, *Op1 = C;
+ if (!ConstIsRHS)
+ std::swap(Op0, Op1);
+
+ Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
+ auto *FPInst = dyn_cast<Instruction>(RI);
+ if (FPInst && isa<FPMathOperator>(FPInst))
+ FPInst->copyFastMathFlags(I);
+ return RI;
+}
+
Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
- unsigned NumPHIValues = PN->getNumIncomingValues();
- if (NumPHIValues == 0)
- return nullptr;
-
- // We normally only transform phis with a single use. However, if a PHI has
- // multiple uses and they are all the same operation, we can fold *all* of the
- // uses into the PHI.
- if (!PN->hasOneUse()) {
- // Walk the use list for the instruction, comparing them to I.
- for (User *U : PN->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (UI != &I && !I.isIdenticalTo(UI))
- return nullptr;
- }
- // Otherwise, we can replace *all* users with the new PHI we form.
- }
-
- // Check to see if all of the operands of the PHI are simple constants
- // (constantint/constantfp/undef). If there is one non-constant value,
- // remember the BB it is in. If there is more than one or if *it* is a PHI,
- // bail out. We don't do arbitrary constant expressions here because moving
- // their computation can be expensive without a cost model.
- BasicBlock *NonConstBB = nullptr;
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InVal = PN->getIncomingValue(i);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ if (NumPHIValues == 0)
+ return nullptr;
+
+ // We normally only transform phis with a single use. However, if a PHI has
+ // multiple uses and they are all the same operation, we can fold *all* of the
+ // uses into the PHI.
+ if (!PN->hasOneUse()) {
+ // Walk the use list for the instruction, comparing them to I.
+ for (User *U : PN->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI != &I && !I.isIdenticalTo(UI))
+ return nullptr;
+ }
+ // Otherwise, we can replace *all* users with the new PHI we form.
+ }
+
+ // Check to see if all of the operands of the PHI are simple constants
+ // (constantint/constantfp/undef). If there is one non-constant value,
+ // remember the BB it is in. If there is more than one or if *it* is a PHI,
+ // bail out. We don't do arbitrary constant expressions here because moving
+ // their computation can be expensive without a cost model.
+ BasicBlock *NonConstBB = nullptr;
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
// If I is a freeze instruction, count undef as a non-constant.
if (match(InVal, m_ImmConstant()) &&
(!isa<FreezeInst>(I) || isGuaranteedNotToBeUndefOrPoison(InVal)))
- continue;
-
- if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
- if (NonConstBB) return nullptr; // More than one non-const value.
-
- NonConstBB = PN->getIncomingBlock(i);
-
- // If the InVal is an invoke at the end of the pred block, then we can't
- // insert a computation after it without breaking the edge.
- if (isa<InvokeInst>(InVal))
- if (cast<Instruction>(InVal)->getParent() == NonConstBB)
- return nullptr;
-
- // If the incoming non-constant value is in I's block, we will remove one
- // instruction, but insert another equivalent one, leading to infinite
- // instcombine.
- if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
- return nullptr;
- }
-
- // If there is exactly one non-constant value, we can insert a copy of the
- // operation in that block. However, if this is a critical edge, we would be
- // inserting the computation on some other paths (e.g. inside a loop). Only
- // do this if the pred block is unconditionally branching into the phi block.
+ continue;
+
+ if (isa<PHINode>(InVal)) return nullptr; // Itself a phi.
+ if (NonConstBB) return nullptr; // More than one non-const value.
+
+ NonConstBB = PN->getIncomingBlock(i);
+
+ // If the InVal is an invoke at the end of the pred block, then we can't
+ // insert a computation after it without breaking the edge.
+ if (isa<InvokeInst>(InVal))
+ if (cast<Instruction>(InVal)->getParent() == NonConstBB)
+ return nullptr;
+
+ // If the incoming non-constant value is in I's block, we will remove one
+ // instruction, but insert another equivalent one, leading to infinite
+ // instcombine.
+ if (isPotentiallyReachable(I.getParent(), NonConstBB, &DT, LI))
+ return nullptr;
+ }
+
+ // If there is exactly one non-constant value, we can insert a copy of the
+ // operation in that block. However, if this is a critical edge, we would be
+ // inserting the computation on some other paths (e.g. inside a loop). Only
+ // do this if the pred block is unconditionally branching into the phi block.
// Also, make sure that the pred block is not dead code.
- if (NonConstBB != nullptr) {
- BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+ if (NonConstBB != nullptr) {
+ BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
if (!BI || !BI->isUnconditional() || !DT.isReachableFromEntry(NonConstBB))
return nullptr;
- }
-
- // Okay, we can do the transformation: create the new PHI node.
- PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
- InsertNewInstBefore(NewPN, *PN);
- NewPN->takeName(PN);
-
- // If we are going to have to insert a new computation, do so right before the
- // predecessor's terminator.
- if (NonConstBB)
- Builder.SetInsertPoint(NonConstBB->getTerminator());
-
- // Next, add all of the operands to the PHI.
- if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
- // We only currently try to fold the condition of a select when it is a phi,
- // not the true/false values.
- Value *TrueV = SI->getTrueValue();
- Value *FalseV = SI->getFalseValue();
- BasicBlock *PhiTransBB = PN->getParent();
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- BasicBlock *ThisBB = PN->getIncomingBlock(i);
- Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
- Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
- Value *InV = nullptr;
- // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
- // even if currently isNullValue gives false.
- Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
- // For vector constants, we cannot use isNullValue to fold into
- // FalseVInPred versus TrueVInPred. When we have individual nonzero
- // elements in the vector, we will incorrectly fold InC to
- // `TrueVInPred`.
+ }
+
+ // Okay, we can do the transformation: create the new PHI node.
+ PHINode *NewPN = PHINode::Create(I.getType(), PN->getNumIncomingValues());
+ InsertNewInstBefore(NewPN, *PN);
+ NewPN->takeName(PN);
+
+ // If we are going to have to insert a new computation, do so right before the
+ // predecessor's terminator.
+ if (NonConstBB)
+ Builder.SetInsertPoint(NonConstBB->getTerminator());
+
+ // Next, add all of the operands to the PHI.
+ if (SelectInst *SI = dyn_cast<SelectInst>(&I)) {
+ // We only currently try to fold the condition of a select when it is a phi,
+ // not the true/false values.
+ Value *TrueV = SI->getTrueValue();
+ Value *FalseV = SI->getFalseValue();
+ BasicBlock *PhiTransBB = PN->getParent();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ BasicBlock *ThisBB = PN->getIncomingBlock(i);
+ Value *TrueVInPred = TrueV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *FalseVInPred = FalseV->DoPHITranslation(PhiTransBB, ThisBB);
+ Value *InV = nullptr;
+ // Beware of ConstantExpr: it may eventually evaluate to getNullValue,
+ // even if currently isNullValue gives false.
+ Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
+ // For vector constants, we cannot use isNullValue to fold into
+ // FalseVInPred versus TrueVInPred. When we have individual nonzero
+ // elements in the vector, we will incorrectly fold InC to
+ // `TrueVInPred`.
if (InC && isa<ConstantInt>(InC))
- InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
- else {
- // Generate the select in the same block as PN's current incoming block.
- // Note: ThisBB need not be the NonConstBB because vector constants
- // which are constants by definition are handled here.
- // FIXME: This can lead to an increase in IR generation because we might
- // generate selects for vector constant phi operand, that could not be
- // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
- // non-vector phis, this transformation was always profitable because
- // the select would be generated exactly once in the NonConstBB.
- Builder.SetInsertPoint(ThisBB->getTerminator());
- InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
- FalseVInPred, "phi.sel");
- }
- NewPN->addIncoming(InV, ThisBB);
- }
- } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
- Constant *C = cast<Constant>(I.getOperand(1));
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = nullptr;
- if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
- else
- InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phi.cmp");
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
- } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
- Builder);
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
+ InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
+ else {
+ // Generate the select in the same block as PN's current incoming block.
+ // Note: ThisBB need not be the NonConstBB because vector constants
+ // which are constants by definition are handled here.
+ // FIXME: This can lead to an increase in IR generation because we might
+        // generate selects for vector constant phi operands, which could not be
+ // folded to TrueVInPred or FalseVInPred as done for ConstantInt. For
+ // non-vector phis, this transformation was always profitable because
+ // the select would be generated exactly once in the NonConstBB.
+ Builder.SetInsertPoint(ThisBB->getTerminator());
+ InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
+ FalseVInPred, "phi.sel");
+ }
+ NewPN->addIncoming(InV, ThisBB);
+ }
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(&I)) {
+ Constant *C = cast<Constant>(I.getOperand(1));
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = nullptr;
+ if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+ else
+ InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phi.cmp");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ } else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV = foldOperationIntoPhiValue(BO, PN->getIncomingValue(i),
+ Builder);
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
} else if (isa<FreezeInst>(&I)) {
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV;
@@ -1185,1493 +1185,1493 @@ Instruction *InstCombinerImpl::foldOpIntoPhi(Instruction &I, PHINode *PN) {
InV = PN->getIncomingValue(i);
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
- } else {
- CastInst *CI = cast<CastInst>(&I);
- Type *RetTy = CI->getType();
- for (unsigned i = 0; i != NumPHIValues; ++i) {
- Value *InV;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
- InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
- else
- InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
- I.getType(), "phi.cast");
- NewPN->addIncoming(InV, PN->getIncomingBlock(i));
- }
- }
-
+ } else {
+ CastInst *CI = cast<CastInst>(&I);
+ Type *RetTy = CI->getType();
+ for (unsigned i = 0; i != NumPHIValues; ++i) {
+ Value *InV;
+ if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+ else
+ InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
+ I.getType(), "phi.cast");
+ NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+ }
+ }
+
for (User *U : make_early_inc_range(PN->users())) {
Instruction *User = cast<Instruction>(U);
- if (User == &I) continue;
- replaceInstUsesWith(*User, NewPN);
- eraseInstFromFunction(*User);
- }
- return replaceInstUsesWith(I, NewPN);
-}
-
+ if (User == &I) continue;
+ replaceInstUsesWith(*User, NewPN);
+ eraseInstFromFunction(*User);
+ }
+ return replaceInstUsesWith(I, NewPN);
+}
+
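// A standalone sketch (plain C++, not LLVM code) of folding an operation into a
// phi: the op is recomputed per incoming value, so constant incoming values
// fold away. The two "predecessors" are emulated with a boolean; names and
// sample values are made up for illustration only.
constexpr int addAfterPhi(bool FromLeft, int A) {
  return (FromLeft ? A : 5) + 2;        // add applied after the merge point
}
constexpr int addIntoPhi(bool FromLeft, int A) {
  return FromLeft ? A + 2 : 7;          // add folded into each incoming value
}
static_assert(addAfterPhi(true, 40) == addIntoPhi(true, 40),
              "non-constant incoming value: both forms give A + 2");
static_assert(addAfterPhi(false, 40) == addIntoPhi(false, 40),
              "constant incoming value: 5 + 2 folds to 7 in the phi");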
Instruction *InstCombinerImpl::foldBinOpIntoSelectOrPhi(BinaryOperator &I) {
- if (!isa<Constant>(I.getOperand(1)))
- return nullptr;
-
- if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
- if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
- return NewSel;
- } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
- if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
- return NewPhi;
- }
- return nullptr;
-}
-
-/// Given a pointer type and a constant offset, determine whether or not there
-/// is a sequence of GEP indices into the pointed type that will land us at the
-/// specified offset. If so, fill them into NewIndices and return the resultant
-/// element type, otherwise return null.
+ if (!isa<Constant>(I.getOperand(1)))
+ return nullptr;
+
+ if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
+ if (Instruction *NewSel = FoldOpIntoSelect(I, Sel))
+ return NewSel;
+ } else if (auto *PN = dyn_cast<PHINode>(I.getOperand(0))) {
+ if (Instruction *NewPhi = foldOpIntoPhi(I, PN))
+ return NewPhi;
+ }
+ return nullptr;
+}
+
+/// Given a pointer type and a constant offset, determine whether or not there
+/// is a sequence of GEP indices into the pointed type that will land us at the
+/// specified offset. If so, fill them into NewIndices and return the resultant
+/// element type, otherwise return null.
Type *
InstCombinerImpl::FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
SmallVectorImpl<Value *> &NewIndices) {
- Type *Ty = PtrTy->getElementType();
- if (!Ty->isSized())
- return nullptr;
-
- // Start with the index over the outer type. Note that the type size
- // might be zero (even if the offset isn't zero) if the indexed type
- // is something like [0 x {int, int}]
- Type *IndexTy = DL.getIndexType(PtrTy);
- int64_t FirstIdx = 0;
- if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
- FirstIdx = Offset/TySize;
- Offset -= FirstIdx*TySize;
-
- // Handle hosts where % returns negative instead of values [0..TySize).
- if (Offset < 0) {
- --FirstIdx;
- Offset += TySize;
- assert(Offset >= 0);
- }
- assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
- }
-
- NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
-
- // Index into the types. If we fail, set OrigBase to null.
- while (Offset) {
- // Indexing into tail padding between struct/array elements.
- if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
- return nullptr;
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- assert(Offset < (int64_t)SL->getSizeInBytes() &&
- "Offset must stay within the indexed type");
-
- unsigned Elt = SL->getElementContainingOffset(Offset);
- NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
- Elt));
-
- Offset -= SL->getElementOffset(Elt);
- Ty = STy->getElementType(Elt);
- } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
- uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
- assert(EltSize && "Cannot index into a zero-sized array");
- NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
- Offset %= EltSize;
- Ty = AT->getElementType();
- } else {
- // Otherwise, we can't index into the middle of this atomic type, bail.
- return nullptr;
- }
- }
-
- return Ty;
-}
-
-static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
- // If this GEP has only 0 indices, it is the same pointer as
- // Src. If Src is not a trivial GEP too, don't combine
- // the indices.
- if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
- !Src.hasOneUse())
- return false;
- return true;
-}
-
-/// Return a value X such that Val = X * Scale, or null if none.
-/// If the multiplication is known not to overflow, then NoSignedWrap is set.
+ Type *Ty = PtrTy->getElementType();
+ if (!Ty->isSized())
+ return nullptr;
+
+ // Start with the index over the outer type. Note that the type size
+ // might be zero (even if the offset isn't zero) if the indexed type
+ // is something like [0 x {int, int}]
+ Type *IndexTy = DL.getIndexType(PtrTy);
+ int64_t FirstIdx = 0;
+ if (int64_t TySize = DL.getTypeAllocSize(Ty)) {
+ FirstIdx = Offset/TySize;
+ Offset -= FirstIdx*TySize;
+
+ // Handle hosts where % returns negative instead of values [0..TySize).
+ if (Offset < 0) {
+ --FirstIdx;
+ Offset += TySize;
+ assert(Offset >= 0);
+ }
+ assert((uint64_t)Offset < (uint64_t)TySize && "Out of range offset");
+ }
+
+ NewIndices.push_back(ConstantInt::get(IndexTy, FirstIdx));
+
+ // Index into the types. If we fail, set OrigBase to null.
+ while (Offset) {
+ // Indexing into tail padding between struct/array elements.
+ if (uint64_t(Offset * 8) >= DL.getTypeSizeInBits(Ty))
+ return nullptr;
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ assert(Offset < (int64_t)SL->getSizeInBytes() &&
+ "Offset must stay within the indexed type");
+
+ unsigned Elt = SL->getElementContainingOffset(Offset);
+ NewIndices.push_back(ConstantInt::get(Type::getInt32Ty(Ty->getContext()),
+ Elt));
+
+ Offset -= SL->getElementOffset(Elt);
+ Ty = STy->getElementType(Elt);
+ } else if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ uint64_t EltSize = DL.getTypeAllocSize(AT->getElementType());
+ assert(EltSize && "Cannot index into a zero-sized array");
+ NewIndices.push_back(ConstantInt::get(IndexTy,Offset/EltSize));
+ Offset %= EltSize;
+ Ty = AT->getElementType();
+ } else {
+ // Otherwise, we can't index into the middle of this atomic type, bail.
+ return nullptr;
+ }
+ }
+
+ return Ty;
+}
+
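// A standalone sketch (plain C++, not LLVM code) of the index walk performed by
// FindElementAtOffset: byte offset 12 into the struct below corresponds to the
// GEP-style index list {0, 1, 2} (outer index, field Tail, element 2). The
// struct is made up for illustration and assumed to have no padding, which the
// checks themselves verify.
#include <cstddef>
#include <cstdint>

struct ExampleTy {
  int32_t Head;      // bytes [0, 4)
  int32_t Tail[4];   // bytes [4, 20), one element every 4 bytes
};

static_assert(sizeof(ExampleTy) == 20, "no padding assumed");
static_assert(12 / sizeof(ExampleTy) == 0, "outer index (FirstIdx) is 0");
static_assert(offsetof(ExampleTy, Tail) == 4, "offset 12 falls inside Tail");
static_assert(offsetof(ExampleTy, Tail) + 2 * sizeof(int32_t) == 12,
              "remaining 8 bytes select Tail[2]");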
+static bool shouldMergeGEPs(GEPOperator &GEP, GEPOperator &Src) {
+ // If this GEP has only 0 indices, it is the same pointer as
+ // Src. If Src is not a trivial GEP too, don't combine
+ // the indices.
+ if (GEP.hasAllZeroIndices() && !Src.hasAllZeroIndices() &&
+ !Src.hasOneUse())
+ return false;
+ return true;
+}
+
+/// Return a value X such that Val = X * Scale, or null if none.
+/// If the multiplication is known not to overflow, then NoSignedWrap is set.
Value *InstCombinerImpl::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
- assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
- assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
- Scale.getBitWidth() && "Scale not compatible with value!");
-
- // If Val is zero or Scale is one then Val = Val * Scale.
- if (match(Val, m_Zero()) || Scale == 1) {
- NoSignedWrap = true;
- return Val;
- }
-
- // If Scale is zero then it does not divide Val.
- if (Scale.isMinValue())
- return nullptr;
-
- // Look through chains of multiplications, searching for a constant that is
- // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
- // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
- // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
- // down from Val:
- //
- // Val = M1 * X || Analysis starts here and works down
- // M1 = M2 * Y || Doesn't descend into terms with more
- // M2 = Z * 4 \/ than one use
- //
- // Then to modify a term at the bottom:
- //
- // Val = M1 * X
- // M1 = Z * Y || Replaced M2 with Z
- //
- // Then to work back up correcting nsw flags.
-
- // Op - the term we are currently analyzing. Starts at Val then drills down.
- // Replaced with its descaled value before exiting from the drill down loop.
- Value *Op = Val;
-
- // Parent - initially null, but after drilling down notes where Op came from.
- // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
- // 0'th operand of Val.
- std::pair<Instruction *, unsigned> Parent;
-
- // Set if the transform requires a descaling at deeper levels that doesn't
- // overflow.
- bool RequireNoSignedWrap = false;
-
- // Log base 2 of the scale. Negative if not a power of 2.
- int32_t logScale = Scale.exactLogBase2();
-
- for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
- // If Op is a constant divisible by Scale then descale to the quotient.
- APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
- APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
- if (!Remainder.isMinValue())
- // Not divisible by Scale.
- return nullptr;
- // Replace with the quotient in the parent.
- Op = ConstantInt::get(CI->getType(), Quotient);
- NoSignedWrap = true;
- break;
- }
-
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
- if (BO->getOpcode() == Instruction::Mul) {
- // Multiplication.
- NoSignedWrap = BO->hasNoSignedWrap();
- if (RequireNoSignedWrap && !NoSignedWrap)
- return nullptr;
-
- // There are three cases for multiplication: multiplication by exactly
- // the scale, multiplication by a constant different to the scale, and
- // multiplication by something else.
- Value *LHS = BO->getOperand(0);
- Value *RHS = BO->getOperand(1);
-
- if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
- // Multiplication by a constant.
- if (CI->getValue() == Scale) {
- // Multiplication by exactly the scale, replace the multiplication
- // by its left-hand side in the parent.
- Op = LHS;
- break;
- }
-
- // Otherwise drill down into the constant.
- if (!Op->hasOneUse())
- return nullptr;
-
- Parent = std::make_pair(BO, 1);
- continue;
- }
-
- // Multiplication by something else. Drill down into the left-hand side
- // since that's where the reassociate pass puts the good stuff.
- if (!Op->hasOneUse())
- return nullptr;
-
- Parent = std::make_pair(BO, 0);
- continue;
- }
-
- if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
- isa<ConstantInt>(BO->getOperand(1))) {
- // Multiplication by a power of 2.
- NoSignedWrap = BO->hasNoSignedWrap();
- if (RequireNoSignedWrap && !NoSignedWrap)
- return nullptr;
-
- Value *LHS = BO->getOperand(0);
- int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
- getLimitedValue(Scale.getBitWidth());
- // Op = LHS << Amt.
-
- if (Amt == logScale) {
- // Multiplication by exactly the scale, replace the multiplication
- // by its left-hand side in the parent.
- Op = LHS;
- break;
- }
- if (Amt < logScale || !Op->hasOneUse())
- return nullptr;
-
- // Multiplication by more than the scale. Reduce the multiplying amount
- // by the scale in the parent.
- Parent = std::make_pair(BO, 1);
- Op = ConstantInt::get(BO->getType(), Amt - logScale);
- break;
- }
- }
-
- if (!Op->hasOneUse())
- return nullptr;
-
- if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
- if (Cast->getOpcode() == Instruction::SExt) {
- // Op is sign-extended from a smaller type, descale in the smaller type.
- unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
- APInt SmallScale = Scale.trunc(SmallSize);
- // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
- // descale Op as (sext Y) * Scale. In order to have
- // sext (Y * SmallScale) = (sext Y) * Scale
- // some conditions need to hold however: SmallScale must sign-extend to
- // Scale and the multiplication Y * SmallScale should not overflow.
- if (SmallScale.sext(Scale.getBitWidth()) != Scale)
- // SmallScale does not sign-extend to Scale.
- return nullptr;
- assert(SmallScale.exactLogBase2() == logScale);
- // Require that Y * SmallScale must not overflow.
- RequireNoSignedWrap = true;
-
- // Drill down through the cast.
- Parent = std::make_pair(Cast, 0);
- Scale = SmallScale;
- continue;
- }
-
- if (Cast->getOpcode() == Instruction::Trunc) {
- // Op is truncated from a larger type, descale in the larger type.
- // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
- // trunc (Y * sext Scale) = (trunc Y) * Scale
- // always holds. However (trunc Y) * Scale may overflow even if
- // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
- // from this point up in the expression (see later).
- if (RequireNoSignedWrap)
- return nullptr;
-
- // Drill down through the cast.
- unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
- Parent = std::make_pair(Cast, 0);
- Scale = Scale.sext(LargeSize);
- if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
- logScale = -1;
- assert(Scale.exactLogBase2() == logScale);
- continue;
- }
- }
-
- // Unsupported expression, bail out.
- return nullptr;
- }
-
- // If Op is zero then Val = Op * Scale.
- if (match(Op, m_Zero())) {
- NoSignedWrap = true;
- return Op;
- }
-
- // We know that we can successfully descale, so from here on we can safely
- // modify the IR. Op holds the descaled version of the deepest term in the
- // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
- // not to overflow.
-
- if (!Parent.first)
- // The expression only had one term.
- return Op;
-
- // Rewrite the parent using the descaled version of its operand.
- assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
- assert(Op != Parent.first->getOperand(Parent.second) &&
- "Descaling was a no-op?");
- replaceOperand(*Parent.first, Parent.second, Op);
- Worklist.push(Parent.first);
-
- // Now work back up the expression correcting nsw flags. The logic is based
- // on the following observation: if X * Y is known not to overflow as a signed
- // multiplication, and Y is replaced by a value Z with smaller absolute value,
- // then X * Z will not overflow as a signed multiplication either. As we work
- // our way up, having NoSignedWrap 'true' means that the descaled value at the
- // current level has strictly smaller absolute value than the original.
- Instruction *Ancestor = Parent.first;
- do {
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
- // If the multiplication wasn't nsw then we can't say anything about the
- // value of the descaled multiplication, and we have to clear nsw flags
- // from this point on up.
- bool OpNoSignedWrap = BO->hasNoSignedWrap();
- NoSignedWrap &= OpNoSignedWrap;
- if (NoSignedWrap != OpNoSignedWrap) {
- BO->setHasNoSignedWrap(NoSignedWrap);
- Worklist.push(Ancestor);
- }
- } else if (Ancestor->getOpcode() == Instruction::Trunc) {
- // The fact that the descaled input to the trunc has smaller absolute
- // value than the original input doesn't tell us anything useful about
- // the absolute values of the truncations.
- NoSignedWrap = false;
- }
- assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
- "Failed to keep proper track of nsw flags while drilling down?");
-
- if (Ancestor == Val)
- // Got to the top, all done!
- return Val;
-
- // Move up one level in the expression.
- assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
- Ancestor = Ancestor->user_back();
- } while (true);
-}
-
+ assert(isa<IntegerType>(Val->getType()) && "Can only descale integers!");
+ assert(cast<IntegerType>(Val->getType())->getBitWidth() ==
+ Scale.getBitWidth() && "Scale not compatible with value!");
+
+ // If Val is zero or Scale is one then Val = Val * Scale.
+ if (match(Val, m_Zero()) || Scale == 1) {
+ NoSignedWrap = true;
+ return Val;
+ }
+
+ // If Scale is zero then it does not divide Val.
+ if (Scale.isMinValue())
+ return nullptr;
+
+ // Look through chains of multiplications, searching for a constant that is
+ // divisible by Scale. For example, descaling X*(Y*(Z*4)) by a factor of 4
+ // will find the constant factor 4 and produce X*(Y*Z). Descaling X*(Y*8) by
+ // a factor of 4 will produce X*(Y*2). The principle of operation is to bore
+ // down from Val:
+ //
+ // Val = M1 * X || Analysis starts here and works down
+ // M1 = M2 * Y || Doesn't descend into terms with more
+ // M2 = Z * 4 \/ than one use
+ //
+ // Then to modify a term at the bottom:
+ //
+ // Val = M1 * X
+ // M1 = Z * Y || Replaced M2 with Z
+ //
+ // Then to work back up correcting nsw flags.
+
+ // Op - the term we are currently analyzing. Starts at Val then drills down.
+ // Replaced with its descaled value before exiting from the drill down loop.
+ Value *Op = Val;
+
+ // Parent - initially null, but after drilling down notes where Op came from.
+ // In the example above, Parent is (Val, 0) when Op is M1, because M1 is the
+ // 0'th operand of Val.
+ std::pair<Instruction *, unsigned> Parent;
+
+ // Set if the transform requires a descaling at deeper levels that doesn't
+ // overflow.
+ bool RequireNoSignedWrap = false;
+
+ // Log base 2 of the scale. Negative if not a power of 2.
+ int32_t logScale = Scale.exactLogBase2();
+
+ for (;; Op = Parent.first->getOperand(Parent.second)) { // Drill down
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+ // If Op is a constant divisible by Scale then descale to the quotient.
+ APInt Quotient(Scale), Remainder(Scale); // Init ensures right bitwidth.
+ APInt::sdivrem(CI->getValue(), Scale, Quotient, Remainder);
+ if (!Remainder.isMinValue())
+ // Not divisible by Scale.
+ return nullptr;
+ // Replace with the quotient in the parent.
+ Op = ConstantInt::get(CI->getType(), Quotient);
+ NoSignedWrap = true;
+ break;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op)) {
+ if (BO->getOpcode() == Instruction::Mul) {
+ // Multiplication.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ // There are three cases for multiplication: multiplication by exactly
+ // the scale, multiplication by a constant different to the scale, and
+ // multiplication by something else.
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
+ // Multiplication by a constant.
+ if (CI->getValue() == Scale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+
+ // Otherwise drill down into the constant.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 1);
+ continue;
+ }
+
+ // Multiplication by something else. Drill down into the left-hand side
+ // since that's where the reassociate pass puts the good stuff.
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ Parent = std::make_pair(BO, 0);
+ continue;
+ }
+
+ if (logScale > 0 && BO->getOpcode() == Instruction::Shl &&
+ isa<ConstantInt>(BO->getOperand(1))) {
+ // Multiplication by a power of 2.
+ NoSignedWrap = BO->hasNoSignedWrap();
+ if (RequireNoSignedWrap && !NoSignedWrap)
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ int32_t Amt = cast<ConstantInt>(BO->getOperand(1))->
+ getLimitedValue(Scale.getBitWidth());
+ // Op = LHS << Amt.
+
+ if (Amt == logScale) {
+ // Multiplication by exactly the scale, replace the multiplication
+ // by its left-hand side in the parent.
+ Op = LHS;
+ break;
+ }
+ if (Amt < logScale || !Op->hasOneUse())
+ return nullptr;
+
+ // Multiplication by more than the scale. Reduce the multiplying amount
+ // by the scale in the parent.
+ Parent = std::make_pair(BO, 1);
+ Op = ConstantInt::get(BO->getType(), Amt - logScale);
+ break;
+ }
+ }
+
+ if (!Op->hasOneUse())
+ return nullptr;
+
+ if (CastInst *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getOpcode() == Instruction::SExt) {
+ // Op is sign-extended from a smaller type, descale in the smaller type.
+ unsigned SmallSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ APInt SmallScale = Scale.trunc(SmallSize);
+ // Suppose Op = sext X, and we descale X as Y * SmallScale. We want to
+ // descale Op as (sext Y) * Scale. In order to have
+ // sext (Y * SmallScale) = (sext Y) * Scale
+ // some conditions need to hold however: SmallScale must sign-extend to
+ // Scale and the multiplication Y * SmallScale should not overflow.
+ if (SmallScale.sext(Scale.getBitWidth()) != Scale)
+ // SmallScale does not sign-extend to Scale.
+ return nullptr;
+ assert(SmallScale.exactLogBase2() == logScale);
+ // Require that Y * SmallScale must not overflow.
+ RequireNoSignedWrap = true;
+
+ // Drill down through the cast.
+ Parent = std::make_pair(Cast, 0);
+ Scale = SmallScale;
+ continue;
+ }
+
+ if (Cast->getOpcode() == Instruction::Trunc) {
+ // Op is truncated from a larger type, descale in the larger type.
+ // Suppose Op = trunc X, and we descale X as Y * sext Scale. Then
+ // trunc (Y * sext Scale) = (trunc Y) * Scale
+ // always holds. However (trunc Y) * Scale may overflow even if
+ // trunc (Y * sext Scale) does not, so nsw flags need to be cleared
+ // from this point up in the expression (see later).
+ if (RequireNoSignedWrap)
+ return nullptr;
+
+ // Drill down through the cast.
+ unsigned LargeSize = Cast->getSrcTy()->getPrimitiveSizeInBits();
+ Parent = std::make_pair(Cast, 0);
+ Scale = Scale.sext(LargeSize);
+ if (logScale + 1 == (int32_t)Cast->getType()->getPrimitiveSizeInBits())
+ logScale = -1;
+ assert(Scale.exactLogBase2() == logScale);
+ continue;
+ }
+ }
+
+ // Unsupported expression, bail out.
+ return nullptr;
+ }
+
+ // If Op is zero then Val = Op * Scale.
+ if (match(Op, m_Zero())) {
+ NoSignedWrap = true;
+ return Op;
+ }
+
+ // We know that we can successfully descale, so from here on we can safely
+ // modify the IR. Op holds the descaled version of the deepest term in the
+ // expression. NoSignedWrap is 'true' if multiplying Op by Scale is known
+ // not to overflow.
+
+ if (!Parent.first)
+ // The expression only had one term.
+ return Op;
+
+ // Rewrite the parent using the descaled version of its operand.
+ assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
+ assert(Op != Parent.first->getOperand(Parent.second) &&
+ "Descaling was a no-op?");
+ replaceOperand(*Parent.first, Parent.second, Op);
+ Worklist.push(Parent.first);
+
+ // Now work back up the expression correcting nsw flags. The logic is based
+ // on the following observation: if X * Y is known not to overflow as a signed
+ // multiplication, and Y is replaced by a value Z with smaller absolute value,
+ // then X * Z will not overflow as a signed multiplication either. As we work
+ // our way up, having NoSignedWrap 'true' means that the descaled value at the
+ // current level has strictly smaller absolute value than the original.
+ Instruction *Ancestor = Parent.first;
+ do {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Ancestor)) {
+ // If the multiplication wasn't nsw then we can't say anything about the
+ // value of the descaled multiplication, and we have to clear nsw flags
+ // from this point on up.
+ bool OpNoSignedWrap = BO->hasNoSignedWrap();
+ NoSignedWrap &= OpNoSignedWrap;
+ if (NoSignedWrap != OpNoSignedWrap) {
+ BO->setHasNoSignedWrap(NoSignedWrap);
+ Worklist.push(Ancestor);
+ }
+ } else if (Ancestor->getOpcode() == Instruction::Trunc) {
+ // The fact that the descaled input to the trunc has smaller absolute
+ // value than the original input doesn't tell us anything useful about
+ // the absolute values of the truncations.
+ NoSignedWrap = false;
+ }
+ assert((Ancestor->getOpcode() != Instruction::SExt || NoSignedWrap) &&
+ "Failed to keep proper track of nsw flags while drilling down?");
+
+ if (Ancestor == Val)
+ // Got to the top, all done!
+ return Val;
+
+ // Move up one level in the expression.
+ assert(Ancestor->hasOneUse() && "Drilled down when more than one use!");
+ Ancestor = Ancestor->user_back();
+ } while (true);
+}
+
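The drill-down above bottoms out either at a constant the scale divides exactly or at a multiplication/shift by the scale itself. Below is a minimal standalone C++ sketch of that constant case only; it is illustrative, not the InstCombine API, and descaleConstant is a made-up helper name: a constant term can be descaled precisely when the signed remainder is zero, and the descaled term is then the quotient.

// Standalone sketch (not LLVM code) of the constant-divisibility check.
#include <cstdint>
#include <iostream>
#include <optional>

std::optional<int64_t> descaleConstant(int64_t C, int64_t Scale) {
  if (Scale == 0)        // a zero scale divides nothing
    return std::nullopt;
  if (C % Scale != 0)    // not divisible: give up, as the pass does
    return std::nullopt;
  return C / Scale;      // divisible: the descaled term is the quotient
}

int main() {
  // Mirrors the example in the comments above: descaling X*(Y*8) by a
  // factor of 4 rewrites the inner constant 8 to 2.
  if (auto Q = descaleConstant(8, 4))
    std::cout << "descaled constant: " << *Q << '\n'; // prints 2
  return 0;
}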
Instruction *InstCombinerImpl::foldVectorBinop(BinaryOperator &Inst) {
if (!isa<VectorType>(Inst.getType()))
- return nullptr;
-
- BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
- Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
- assert(cast<VectorType>(LHS->getType())->getElementCount() ==
- cast<VectorType>(Inst.getType())->getElementCount());
- assert(cast<VectorType>(RHS->getType())->getElementCount() ==
- cast<VectorType>(Inst.getType())->getElementCount());
-
- // If both operands of the binop are vector concatenations, then perform the
- // narrow binop on each pair of the source operands followed by concatenation
- // of the results.
- Value *L0, *L1, *R0, *R1;
- ArrayRef<int> Mask;
- if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
- match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
- LHS->hasOneUse() && RHS->hasOneUse() &&
- cast<ShuffleVectorInst>(LHS)->isConcat() &&
- cast<ShuffleVectorInst>(RHS)->isConcat()) {
- // This transform does not have the speculative execution constraint as
- // below because the shuffle is a concatenation. The new binops are
- // operating on exactly the same elements as the existing binop.
- // TODO: We could ease the mask requirement to allow different undef lanes,
- // but that requires an analysis of the binop-with-undef output value.
- Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
- if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
- BO->copyIRFlags(&Inst);
- Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
- if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
- BO->copyIRFlags(&Inst);
- return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
- }
-
- // It may not be safe to reorder shuffles and things like div, urem, etc.
- // because we may trap when executing those ops on unknown vector elements.
- // See PR20059.
- if (!isSafeToSpeculativelyExecute(&Inst))
- return nullptr;
-
- auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
- Value *XY = Builder.CreateBinOp(Opcode, X, Y);
- if (auto *BO = dyn_cast<BinaryOperator>(XY))
- BO->copyIRFlags(&Inst);
- return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
- };
-
- // If both arguments of the binary operation are shuffles that use the same
- // mask and shuffle within a single vector, move the shuffle after the binop.
- Value *V1, *V2;
- if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
- match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
- V1->getType() == V2->getType() &&
- (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
- // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
- return createBinOpShuffle(V1, V2, Mask);
- }
-
- // If both arguments of a commutative binop are select-shuffles that use the
- // same mask with commuted operands, the shuffles are unnecessary.
- if (Inst.isCommutative() &&
- match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
- match(RHS,
- m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
- auto *LShuf = cast<ShuffleVectorInst>(LHS);
- auto *RShuf = cast<ShuffleVectorInst>(RHS);
- // TODO: Allow shuffles that contain undefs in the mask?
- // That is legal, but it reduces undef knowledge.
- // TODO: Allow arbitrary shuffles by shuffling after binop?
- // That might be legal, but we have to deal with poison.
- if (LShuf->isSelect() &&
- !is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
- RShuf->isSelect() &&
- !is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
- // Example:
- // LHS = shuffle V1, V2, <0, 5, 6, 3>
- // RHS = shuffle V2, V1, <0, 5, 6, 3>
- // LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
- Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
- NewBO->copyIRFlags(&Inst);
- return NewBO;
- }
- }
-
- // If one argument is a shuffle within one vector and the other is a constant,
- // try moving the shuffle after the binary operation. This canonicalization
- // intends to move shuffles closer to other shuffles and binops closer to
- // other binops, so they can be folded. It may also enable demanded elements
- // transforms.
- Constant *C;
+ return nullptr;
+
+ BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
+ Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
+ assert(cast<VectorType>(LHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
+ assert(cast<VectorType>(RHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
+
+ // If both operands of the binop are vector concatenations, then perform the
+ // narrow binop on each pair of the source operands followed by concatenation
+ // of the results.
+ Value *L0, *L1, *R0, *R1;
+ ArrayRef<int> Mask;
+ if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
+ LHS->hasOneUse() && RHS->hasOneUse() &&
+ cast<ShuffleVectorInst>(LHS)->isConcat() &&
+ cast<ShuffleVectorInst>(RHS)->isConcat()) {
+ // This transform does not have the speculative execution constraint as
+ // below because the shuffle is a concatenation. The new binops are
+ // operating on exactly the same elements as the existing binop.
+ // TODO: We could ease the mask requirement to allow different undef lanes,
+ // but that requires an analysis of the binop-with-undef output value.
+ Value *NewBO0 = Builder.CreateBinOp(Opcode, L0, R0);
+ if (auto *BO = dyn_cast<BinaryOperator>(NewBO0))
+ BO->copyIRFlags(&Inst);
+ Value *NewBO1 = Builder.CreateBinOp(Opcode, L1, R1);
+ if (auto *BO = dyn_cast<BinaryOperator>(NewBO1))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(NewBO0, NewBO1, Mask);
+ }
+
+ // It may not be safe to reorder shuffles and things like div, urem, etc.
+ // because we may trap when executing those ops on unknown vector elements.
+ // See PR20059.
+ if (!isSafeToSpeculativelyExecute(&Inst))
+ return nullptr;
+
+ auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
+ Value *XY = Builder.CreateBinOp(Opcode, X, Y);
+ if (auto *BO = dyn_cast<BinaryOperator>(XY))
+ BO->copyIRFlags(&Inst);
+ return new ShuffleVectorInst(XY, UndefValue::get(XY->getType()), M);
+ };
+
+ // If both arguments of the binary operation are shuffles that use the same
+ // mask and shuffle within a single vector, move the shuffle after the binop.
+ Value *V1, *V2;
+ if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
+ V1->getType() == V2->getType() &&
+ (LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
+ // Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
+ return createBinOpShuffle(V1, V2, Mask);
+ }
+
+ // If both arguments of a commutative binop are select-shuffles that use the
+ // same mask with commuted operands, the shuffles are unnecessary.
+ if (Inst.isCommutative() &&
+ match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
+ match(RHS,
+ m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
+ auto *LShuf = cast<ShuffleVectorInst>(LHS);
+ auto *RShuf = cast<ShuffleVectorInst>(RHS);
+ // TODO: Allow shuffles that contain undefs in the mask?
+ // That is legal, but it reduces undef knowledge.
+ // TODO: Allow arbitrary shuffles by shuffling after binop?
+ // That might be legal, but we have to deal with poison.
+ if (LShuf->isSelect() &&
+ !is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
+ RShuf->isSelect() &&
+ !is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
+ // Example:
+ // LHS = shuffle V1, V2, <0, 5, 6, 3>
+ // RHS = shuffle V2, V1, <0, 5, 6, 3>
+ // LHS + RHS --> (V10+V20, V21+V11, V22+V12, V13+V23) --> V1 + V2
+ Instruction *NewBO = BinaryOperator::Create(Opcode, V1, V2);
+ NewBO->copyIRFlags(&Inst);
+ return NewBO;
+ }
+ }
+
+ // If one argument is a shuffle within one vector and the other is a constant,
+ // try moving the shuffle after the binary operation. This canonicalization
+ // intends to move shuffles closer to other shuffles and binops closer to
+ // other binops, so they can be folded. It may also enable demanded elements
+ // transforms.
+ Constant *C;
auto *InstVTy = dyn_cast<FixedVectorType>(Inst.getType());
if (InstVTy &&
match(&Inst,
- m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
+ m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
m_ImmConstant(C))) &&
cast<FixedVectorType>(V1->getType())->getNumElements() <=
InstVTy->getNumElements()) {
assert(InstVTy->getScalarType() == V1->getType()->getScalarType() &&
- "Shuffle should not change scalar type");
-
- // Find constant NewC that has property:
- // shuffle(NewC, ShMask) = C
-    // If such a constant does not exist (example: ShMask=<0,0> and C=<1,2>),
-    // the reorder is not possible. A 1-to-1 mapping is not required. Example:
- // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
- bool ConstOp1 = isa<Constant>(RHS);
- ArrayRef<int> ShMask = Mask;
- unsigned SrcVecNumElts =
- cast<FixedVectorType>(V1->getType())->getNumElements();
- UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
- SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
- bool MayChange = true;
+ "Shuffle should not change scalar type");
+
+ // Find constant NewC that has property:
+ // shuffle(NewC, ShMask) = C
+    // If such a constant does not exist (example: ShMask=<0,0> and C=<1,2>),
+    // the reorder is not possible. A 1-to-1 mapping is not required. Example:
+ // ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
+ bool ConstOp1 = isa<Constant>(RHS);
+ ArrayRef<int> ShMask = Mask;
+ unsigned SrcVecNumElts =
+ cast<FixedVectorType>(V1->getType())->getNumElements();
+ UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
+ SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
+ bool MayChange = true;
unsigned NumElts = InstVTy->getNumElements();
- for (unsigned I = 0; I < NumElts; ++I) {
- Constant *CElt = C->getAggregateElement(I);
- if (ShMask[I] >= 0) {
- assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
- Constant *NewCElt = NewVecC[ShMask[I]];
- // Bail out if:
- // 1. The constant vector contains a constant expression.
- // 2. The shuffle needs an element of the constant vector that can't
- // be mapped to a new constant vector.
- // 3. This is a widening shuffle that copies elements of V1 into the
- // extended elements (extending with undef is allowed).
- if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
- I >= SrcVecNumElts) {
- MayChange = false;
- break;
- }
- NewVecC[ShMask[I]] = CElt;
- }
- // If this is a widening shuffle, we must be able to extend with undef
- // elements. If the original binop does not produce an undef in the high
- // lanes, then this transform is not safe.
- // Similarly for undef lanes due to the shuffle mask, we can only
- // transform binops that preserve undef.
- // TODO: We could shuffle those non-undef constant values into the
- // result by using a constant vector (rather than an undef vector)
- // as operand 1 of the new binop, but that might be too aggressive
- // for target-independent shuffle creation.
- if (I >= SrcVecNumElts || ShMask[I] < 0) {
- Constant *MaybeUndef =
- ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
- : ConstantExpr::get(Opcode, CElt, UndefScalar);
- if (!isa<UndefValue>(MaybeUndef)) {
- MayChange = false;
- break;
- }
- }
- }
- if (MayChange) {
- Constant *NewC = ConstantVector::get(NewVecC);
- // It may not be safe to execute a binop on a vector with undef elements
- // because the entire instruction can be folded to undef or create poison
- // that did not exist in the original code.
- if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
- NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
-
- // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
- // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
- Value *NewLHS = ConstOp1 ? V1 : NewC;
- Value *NewRHS = ConstOp1 ? NewC : V1;
- return createBinOpShuffle(NewLHS, NewRHS, Mask);
- }
- }
-
- // Try to reassociate to sink a splat shuffle after a binary operation.
- if (Inst.isAssociative() && Inst.isCommutative()) {
- // Canonicalize shuffle operand as LHS.
- if (isa<ShuffleVectorInst>(RHS))
- std::swap(LHS, RHS);
-
- Value *X;
- ArrayRef<int> MaskC;
- int SplatIndex;
- BinaryOperator *BO;
- if (!match(LHS,
- m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
- !match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
- X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
- BO->getOpcode() != Opcode)
- return nullptr;
-
- // FIXME: This may not be safe if the analysis allows undef elements. By
- // moving 'Y' before the splat shuffle, we are implicitly assuming
- // that it is not undef/poison at the splat index.
- Value *Y, *OtherOp;
- if (isSplatValue(BO->getOperand(0), SplatIndex)) {
- Y = BO->getOperand(0);
- OtherOp = BO->getOperand(1);
- } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
- Y = BO->getOperand(1);
- OtherOp = BO->getOperand(0);
- } else {
- return nullptr;
- }
-
- // X and Y are splatted values, so perform the binary operation on those
- // values followed by a splat followed by the 2nd binary operation:
- // bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
- Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
- SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
+ for (unsigned I = 0; I < NumElts; ++I) {
+ Constant *CElt = C->getAggregateElement(I);
+ if (ShMask[I] >= 0) {
+ assert(ShMask[I] < (int)NumElts && "Not expecting narrowing shuffle");
+ Constant *NewCElt = NewVecC[ShMask[I]];
+ // Bail out if:
+ // 1. The constant vector contains a constant expression.
+ // 2. The shuffle needs an element of the constant vector that can't
+ // be mapped to a new constant vector.
+ // 3. This is a widening shuffle that copies elements of V1 into the
+ // extended elements (extending with undef is allowed).
+ if (!CElt || (!isa<UndefValue>(NewCElt) && NewCElt != CElt) ||
+ I >= SrcVecNumElts) {
+ MayChange = false;
+ break;
+ }
+ NewVecC[ShMask[I]] = CElt;
+ }
+ // If this is a widening shuffle, we must be able to extend with undef
+ // elements. If the original binop does not produce an undef in the high
+ // lanes, then this transform is not safe.
+ // Similarly for undef lanes due to the shuffle mask, we can only
+ // transform binops that preserve undef.
+ // TODO: We could shuffle those non-undef constant values into the
+ // result by using a constant vector (rather than an undef vector)
+ // as operand 1 of the new binop, but that might be too aggressive
+ // for target-independent shuffle creation.
+ if (I >= SrcVecNumElts || ShMask[I] < 0) {
+ Constant *MaybeUndef =
+ ConstOp1 ? ConstantExpr::get(Opcode, UndefScalar, CElt)
+ : ConstantExpr::get(Opcode, CElt, UndefScalar);
+ if (!isa<UndefValue>(MaybeUndef)) {
+ MayChange = false;
+ break;
+ }
+ }
+ }
+ if (MayChange) {
+ Constant *NewC = ConstantVector::get(NewVecC);
+ // It may not be safe to execute a binop on a vector with undef elements
+ // because the entire instruction can be folded to undef or create poison
+ // that did not exist in the original code.
+ if (Inst.isIntDivRem() || (Inst.isShift() && ConstOp1))
+ NewC = getSafeVectorConstantForBinop(Opcode, NewC, ConstOp1);
+
+ // Op(shuffle(V1, Mask), C) -> shuffle(Op(V1, NewC), Mask)
+ // Op(C, shuffle(V1, Mask)) -> shuffle(Op(NewC, V1), Mask)
+ Value *NewLHS = ConstOp1 ? V1 : NewC;
+ Value *NewRHS = ConstOp1 ? NewC : V1;
+ return createBinOpShuffle(NewLHS, NewRHS, Mask);
+ }
+ }
+
+ // Try to reassociate to sink a splat shuffle after a binary operation.
+ if (Inst.isAssociative() && Inst.isCommutative()) {
+ // Canonicalize shuffle operand as LHS.
+ if (isa<ShuffleVectorInst>(RHS))
+ std::swap(LHS, RHS);
+
+ Value *X;
+ ArrayRef<int> MaskC;
+ int SplatIndex;
+ BinaryOperator *BO;
+ if (!match(LHS,
+ m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
+ !match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
+ X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
+ BO->getOpcode() != Opcode)
+ return nullptr;
+
+ // FIXME: This may not be safe if the analysis allows undef elements. By
+ // moving 'Y' before the splat shuffle, we are implicitly assuming
+ // that it is not undef/poison at the splat index.
+ Value *Y, *OtherOp;
+ if (isSplatValue(BO->getOperand(0), SplatIndex)) {
+ Y = BO->getOperand(0);
+ OtherOp = BO->getOperand(1);
+ } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
+ Y = BO->getOperand(1);
+ OtherOp = BO->getOperand(0);
+ } else {
+ return nullptr;
+ }
+
+ // X and Y are splatted values, so perform the binary operation on those
+ // values followed by a splat followed by the 2nd binary operation:
+ // bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
+ Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
+ SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
Value *NewSplat = Builder.CreateShuffleVector(NewBO, NewMask);
- Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
-
- // Intersect FMF on both new binops. Other (poison-generating) flags are
- // dropped to be safe.
- if (isa<FPMathOperator>(R)) {
- R->copyFastMathFlags(&Inst);
- R->andIRFlags(BO);
- }
- if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
- NewInstBO->copyIRFlags(R);
- return R;
- }
-
- return nullptr;
-}
-
-/// Try to narrow the width of a binop if at least 1 operand is an extend of
-/// a value. This requires a potentially expensive known bits check to make
-/// sure the narrow op does not overflow.
+ Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
+
+ // Intersect FMF on both new binops. Other (poison-generating) flags are
+ // dropped to be safe.
+ if (isa<FPMathOperator>(R)) {
+ R->copyFastMathFlags(&Inst);
+ R->andIRFlags(BO);
+ }
+ if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
+ NewInstBO->copyIRFlags(R);
+ return R;
+ }
+
+ return nullptr;
+}
+
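The first shuffle folds above rest on the fact that a lane permutation commutes with a lane-wise binary operation when both operands are shuffled by the same mask. The following is a small standalone C++ model of that identity using plain arrays rather than LLVM IR; all names and values are illustrative only.

// Standalone model: add(shuf(V1,M), shuf(V2,M)) == shuf(add(V1,V2), M).
#include <array>
#include <cassert>
#include <cstddef>

int main() {
  const std::array<int, 4> V1{1, 2, 3, 4}, V2{10, 20, 30, 40};
  const std::array<int, 4> Mask{3, 1, 2, 0}; // single-vector lane permutation

  std::array<int, 4> ShuffleThenAdd{}, AddThenShuffle{}, Sum{};
  for (std::size_t I = 0; I < 4; ++I) {
    ShuffleThenAdd[I] = V1[Mask[I]] + V2[Mask[I]]; // shuffle both, then add
    Sum[I] = V1[I] + V2[I];                        // add first
  }
  for (std::size_t I = 0; I < 4; ++I)
    AddThenShuffle[I] = Sum[Mask[I]];              // then shuffle the result

  assert(ShuffleThenAdd == AddThenShuffle); // the shuffle can be sunk below the binop
  return 0;
}

The extra isSafeToSpeculativelyExecute check in the pass exists because, per the PR20059 note above, the reordered op would execute on lanes the shuffle might otherwise discard, which is unsafe for potentially trapping operations such as division.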
+/// Try to narrow the width of a binop if at least 1 operand is an extend of
+/// a value. This requires a potentially expensive known bits check to make
+/// sure the narrow op does not overflow.
Instruction *InstCombinerImpl::narrowMathIfNoOverflow(BinaryOperator &BO) {
- // We need at least one extended operand.
- Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
-
- // If this is a sub, we swap the operands since we always want an extension
- // on the RHS. The LHS can be an extension or a constant.
- if (BO.getOpcode() == Instruction::Sub)
- std::swap(Op0, Op1);
-
- Value *X;
- bool IsSext = match(Op0, m_SExt(m_Value(X)));
- if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
- return nullptr;
-
- // If both operands are the same extension from the same source type and we
- // can eliminate at least one (hasOneUse), this might work.
- CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
- Value *Y;
- if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
- cast<Operator>(Op1)->getOpcode() == CastOpc &&
- (Op0->hasOneUse() || Op1->hasOneUse()))) {
- // If that did not match, see if we have a suitable constant operand.
- // Truncating and extending must produce the same constant.
- Constant *WideC;
- if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
- return nullptr;
- Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
- if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
- return nullptr;
- Y = NarrowC;
- }
-
- // Swap back now that we found our operands.
- if (BO.getOpcode() == Instruction::Sub)
- std::swap(X, Y);
-
- // Both operands have narrow versions. Last step: the math must not overflow
- // in the narrow width.
- if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
- return nullptr;
-
- // bo (ext X), (ext Y) --> ext (bo X, Y)
- // bo (ext X), C --> ext (bo X, C')
- Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
- if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
- if (IsSext)
- NewBinOp->setHasNoSignedWrap();
- else
- NewBinOp->setHasNoUnsignedWrap();
- }
- return CastInst::Create(CastOpc, NarrowBO, BO.getType());
-}
-
-static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
- // At least one GEP must be inbounds.
- if (!GEP1.isInBounds() && !GEP2.isInBounds())
- return false;
-
- return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) &&
- (GEP2.isInBounds() || GEP2.hasAllZeroIndices());
-}
-
-/// Thread a GEP operation with constant indices through the constant true/false
-/// arms of a select.
-static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
- InstCombiner::BuilderTy &Builder) {
- if (!GEP.hasAllConstantIndices())
- return nullptr;
-
- Instruction *Sel;
- Value *Cond;
- Constant *TrueC, *FalseC;
- if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
- !match(Sel,
- m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
- return nullptr;
-
- // gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
- // Propagate 'inbounds' and metadata from existing instructions.
- // Note: using IRBuilder to create the constants for efficiency.
+ // We need at least one extended operand.
+ Value *Op0 = BO.getOperand(0), *Op1 = BO.getOperand(1);
+
+ // If this is a sub, we swap the operands since we always want an extension
+ // on the RHS. The LHS can be an extension or a constant.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(Op0, Op1);
+
+ Value *X;
+ bool IsSext = match(Op0, m_SExt(m_Value(X)));
+ if (!IsSext && !match(Op0, m_ZExt(m_Value(X))))
+ return nullptr;
+
+ // If both operands are the same extension from the same source type and we
+ // can eliminate at least one (hasOneUse), this might work.
+ CastInst::CastOps CastOpc = IsSext ? Instruction::SExt : Instruction::ZExt;
+ Value *Y;
+ if (!(match(Op1, m_ZExtOrSExt(m_Value(Y))) && X->getType() == Y->getType() &&
+ cast<Operator>(Op1)->getOpcode() == CastOpc &&
+ (Op0->hasOneUse() || Op1->hasOneUse()))) {
+ // If that did not match, see if we have a suitable constant operand.
+ // Truncating and extending must produce the same constant.
+ Constant *WideC;
+ if (!Op0->hasOneUse() || !match(Op1, m_Constant(WideC)))
+ return nullptr;
+ Constant *NarrowC = ConstantExpr::getTrunc(WideC, X->getType());
+ if (ConstantExpr::getCast(CastOpc, NarrowC, BO.getType()) != WideC)
+ return nullptr;
+ Y = NarrowC;
+ }
+
+ // Swap back now that we found our operands.
+ if (BO.getOpcode() == Instruction::Sub)
+ std::swap(X, Y);
+
+ // Both operands have narrow versions. Last step: the math must not overflow
+ // in the narrow width.
+ if (!willNotOverflow(BO.getOpcode(), X, Y, BO, IsSext))
+ return nullptr;
+
+ // bo (ext X), (ext Y) --> ext (bo X, Y)
+ // bo (ext X), C --> ext (bo X, C')
+ Value *NarrowBO = Builder.CreateBinOp(BO.getOpcode(), X, Y, "narrow");
+ if (auto *NewBinOp = dyn_cast<BinaryOperator>(NarrowBO)) {
+ if (IsSext)
+ NewBinOp->setHasNoSignedWrap();
+ else
+ NewBinOp->setHasNoUnsignedWrap();
+ }
+ return CastInst::Create(CastOpc, NarrowBO, BO.getType());
+}
+
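narrowMathIfNoOverflow hinges on one arithmetic fact: if the operation provably cannot wrap in the narrow type, extending before or after the math gives the same result. A tiny standalone C++ illustration follows, using plain integers rather than the InstCombine API; the particular values are arbitrary.

// Standalone model of the narrowing rule for a zero-extended add.
#include <cassert>
#include <cstdint>

int main() {
  const uint8_t X = 100, Y = 27;

  // bo (zext X), (zext Y): the wide computation.
  const uint32_t Wide = uint32_t(X) + uint32_t(Y);

  // 100 + 27 = 127 fits in 8 bits, so the narrow add does not wrap and
  // zext (bo X, Y) produces the same value.
  const uint8_t Narrow = uint8_t(X + Y);
  assert(uint32_t(Narrow) == Wide);
  return 0;
}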
+static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
+ // At least one GEP must be inbounds.
+ if (!GEP1.isInBounds() && !GEP2.isInBounds())
+ return false;
+
+ return (GEP1.isInBounds() || GEP1.hasAllZeroIndices()) &&
+ (GEP2.isInBounds() || GEP2.hasAllZeroIndices());
+}
+
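isMergedGEPInBounds can be read as a plain boolean predicate over the two inputs: at least one GEP must be inbounds, and each GEP must be either inbounds or a no-op with all-zero indices. A standalone restatement is sketched below; the helper name and flags are hypothetical and only restate the condition above.

// Standalone restatement of the merged-inbounds condition.
#include <cassert>

bool mergedInBounds(bool InBounds1, bool AllZero1, bool InBounds2, bool AllZero2) {
  if (!InBounds1 && !InBounds2) // at least one GEP must be inbounds
    return false;
  return (InBounds1 || AllZero1) && (InBounds2 || AllZero2);
}

int main() {
  // An inbounds GEP merged with a zero-offset non-inbounds GEP stays inbounds.
  assert(mergedInBounds(/*InBounds1=*/true, /*AllZero1=*/false,
                        /*InBounds2=*/false, /*AllZero2=*/true));
  // Two non-inbounds GEPs never produce an inbounds result.
  assert(!mergedInBounds(false, true, false, true));
  return 0;
}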
+/// Thread a GEP operation with constant indices through the constant true/false
+/// arms of a select.
+static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
+ InstCombiner::BuilderTy &Builder) {
+ if (!GEP.hasAllConstantIndices())
+ return nullptr;
+
+ Instruction *Sel;
+ Value *Cond;
+ Constant *TrueC, *FalseC;
+ if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
+ !match(Sel,
+ m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
+ return nullptr;
+
+ // gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
+ // Propagate 'inbounds' and metadata from existing instructions.
+ // Note: using IRBuilder to create the constants for efficiency.
SmallVector<Value *, 4> IndexC(GEP.indices());
- bool IsInBounds = GEP.isInBounds();
- Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
- : Builder.CreateGEP(TrueC, IndexC);
- Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
- : Builder.CreateGEP(FalseC, IndexC);
- return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
-}
-
+ bool IsInBounds = GEP.isInBounds();
+ Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
+ : Builder.CreateGEP(TrueC, IndexC);
+ Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
+ : Builder.CreateGEP(FalseC, IndexC);
+ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
+}
+
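foldSelectGEP pushes a constant-index GEP into both arms of a select over constant pointers. The pointer identity it relies on can be checked with ordinary C++ pointer arithmetic; the sketch below is standalone and illustrative, with made-up arrays standing in for the constant arms.

// Standalone model: gep(select(C, A, B), I) == select(C, gep(A, I), gep(B, I)).
#include <cassert>

int main() {
  static const int TrueC[4]  = {1, 2, 3, 4};
  static const int FalseC[4] = {5, 6, 7, 8};
  const bool Cond = true;
  const int IndexC = 2;

  // gep (select Cond, TrueC, FalseC), IndexC
  const int *Original = (Cond ? TrueC : FalseC) + IndexC;
  // select Cond, (gep TrueC, IndexC), (gep FalseC, IndexC)
  const int *Folded = Cond ? TrueC + IndexC : FalseC + IndexC;

  assert(Original == Folded);
  return 0;
}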
Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value *, 8> Ops(GEP.operands());
- Type *GEPType = GEP.getType();
- Type *GEPEltType = GEP.getSourceElementType();
- bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
- if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
- return replaceInstUsesWith(GEP, V);
-
- // For vector geps, use the generic demanded vector support.
- // Skip if GEP return type is scalable. The number of elements is unknown at
- // compile-time.
- if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
- auto VWidth = GEPFVTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
- UndefElts)) {
- if (V != &GEP)
- return replaceInstUsesWith(GEP, V);
- return &GEP;
- }
-
- // TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
- // possible (decide on canonical form for pointer broadcast), 3) exploit
- // undef elements to decrease demanded bits
- }
-
- Value *PtrOp = GEP.getOperand(0);
-
- // Eliminate unneeded casts for indices, and replace indices which displace
- // by multiples of a zero size type with zero.
- bool MadeChange = false;
-
- // Index width may not be the same width as pointer width.
- // Data layout chooses the right type based on supported integer types.
- Type *NewScalarIndexTy =
- DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
- ++I, ++GTI) {
- // Skip indices into struct types.
- if (GTI.isStruct())
- continue;
-
- Type *IndexTy = (*I)->getType();
- Type *NewIndexType =
- IndexTy->isVectorTy()
- ? VectorType::get(NewScalarIndexTy,
- cast<VectorType>(IndexTy)->getElementCount())
- : NewScalarIndexTy;
-
- // If the element type has zero size then any index over it is equivalent
- // to an index of zero, so replace it with zero if it is not zero already.
- Type *EltTy = GTI.getIndexedType();
- if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
- if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
- *I = Constant::getNullValue(NewIndexType);
- MadeChange = true;
- }
-
- if (IndexTy != NewIndexType) {
- // If we are using a wider index than needed for this platform, shrink
- // it to what we need. If narrower, sign-extend it to what we need.
- // This explicit cast can make subsequent optimizations more obvious.
- *I = Builder.CreateIntCast(*I, NewIndexType, true);
- MadeChange = true;
- }
- }
- if (MadeChange)
- return &GEP;
-
- // Check to see if the inputs to the PHI node are getelementptr instructions.
- if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
- auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
- if (!Op1)
- return nullptr;
-
- // Don't fold a GEP into itself through a PHI node. This can only happen
- // through the back-edge of a loop. Folding a GEP into itself means that
- // the value of the previous iteration needs to be stored in the meantime,
- // thus requiring an additional register variable to be live, but not
- // actually achieving anything (the GEP still needs to be executed once per
- // loop iteration).
- if (Op1 == &GEP)
- return nullptr;
-
- int DI = -1;
-
- for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
- auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
- if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
- return nullptr;
-
- // As for Op1 above, don't try to fold a GEP into itself.
- if (Op2 == &GEP)
- return nullptr;
-
- // Keep track of the type as we walk the GEP.
- Type *CurTy = nullptr;
-
- for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
- if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
- return nullptr;
-
- if (Op1->getOperand(J) != Op2->getOperand(J)) {
- if (DI == -1) {
-            // We have not seen any differences in the GEPs feeding the
- // PHI yet, so we record this one if it is allowed to be a
- // variable.
-
- // The first two arguments can vary for any GEP, the rest have to be
- // static for struct slots
- if (J > 1) {
- assert(CurTy && "No current type?");
- if (CurTy->isStructTy())
- return nullptr;
- }
-
- DI = J;
- } else {
- // The GEP is different by more than one input. While this could be
- // extended to support GEPs that vary by more than one variable it
- // doesn't make sense since it greatly increases the complexity and
- // would result in an R+R+R addressing mode which no backend
- // directly supports and would need to be broken into several
- // simpler instructions anyway.
- return nullptr;
- }
- }
-
- // Sink down a layer of the type for the next iteration.
- if (J > 0) {
- if (J == 1) {
- CurTy = Op1->getSourceElementType();
- } else {
- CurTy =
- GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
- }
- }
- }
- }
-
- // If not all GEPs are identical we'll have to create a new PHI node.
- // Check that the old PHI node has only one use so that it will get
- // removed.
- if (DI != -1 && !PN->hasOneUse())
- return nullptr;
-
- auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
- if (DI == -1) {
- // All the GEPs feeding the PHI are identical. Clone one down into our
- // BB so that it can be merged with the current GEP.
- } else {
- // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
- // into the current block so it can be merged, and create a new PHI to
- // set that index.
- PHINode *NewPN;
- {
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(PN);
- NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
- PN->getNumOperands());
- }
-
- for (auto &I : PN->operands())
- NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
- PN->getIncomingBlock(I));
-
- NewGEP->setOperand(DI, NewPN);
- }
-
- GEP.getParent()->getInstList().insert(
- GEP.getParent()->getFirstInsertionPt(), NewGEP);
- replaceOperand(GEP, 0, NewGEP);
- PtrOp = NewGEP;
- }
-
- // Combine Indices - If the source pointer to this getelementptr instruction
- // is a getelementptr instruction, combine the indices of the two
- // getelementptr instructions into a single instruction.
- if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
- if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
- return nullptr;
-
- // Try to reassociate loop invariant GEP chains to enable LICM.
- if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
- Src->hasOneUse()) {
- if (Loop *L = LI->getLoopFor(GEP.getParent())) {
- Value *GO1 = GEP.getOperand(1);
- Value *SO1 = Src->getOperand(1);
- // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
- // invariant: this breaks the dependence between GEPs and allows LICM
- // to hoist the invariant part out of the loop.
- if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
- // We have to be careful here.
- // We have something like:
- // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
- // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
-          // If we just swap idx & idx2 then we could inadvertently
- // change %src from a vector to a scalar, or vice versa.
- // Cases:
- // 1) %base a scalar & idx a scalar & idx2 a vector
- // => Swapping idx & idx2 turns %src into a vector type.
- // 2) %base a scalar & idx a vector & idx2 a scalar
-          //     => Swapping idx & idx2 turns %src into a scalar type.
- // 3) %base, %idx, and %idx2 are scalars
- // => %src & %gep are scalars
- // => swapping idx & idx2 is safe
- // 4) %base a vector
- // => %src is a vector
- // => swapping idx & idx2 is safe.
- auto *SO0 = Src->getOperand(0);
- auto *SO0Ty = SO0->getType();
- if (!isa<VectorType>(GEPType) || // case 3
- isa<VectorType>(SO0Ty)) { // case 4
- Src->setOperand(1, GO1);
- GEP.setOperand(1, SO1);
- return &GEP;
- } else {
- // Case 1 or 2
- // -- have to recreate %src & %gep
- // put NewSrc at same location as %src
- Builder.SetInsertPoint(cast<Instruction>(PtrOp));
- auto *NewSrc = cast<GetElementPtrInst>(
- Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
- NewSrc->setIsInBounds(Src->isInBounds());
- auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
- NewGEP->setIsInBounds(GEP.isInBounds());
- return NewGEP;
- }
- }
- }
- }
-
- // Note that if our source is a gep chain itself then we wait for that
- // chain to be resolved before we perform this transformation. This
-    // avoids creating a TON of code in some cases.
- if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
- if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
- return nullptr; // Wait until our source is folded to completion.
-
- SmallVector<Value*, 8> Indices;
-
- // Find out whether the last index in the source GEP is a sequential idx.
- bool EndsWithSequential = false;
- for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
- I != E; ++I)
- EndsWithSequential = I.isSequential();
-
- // Can we combine the two pointer arithmetics offsets?
- if (EndsWithSequential) {
- // Replace: gep (gep %P, long B), long A, ...
- // With: T = long A+B; gep %P, T, ...
- Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
- Value *GO1 = GEP.getOperand(1);
-
- // If they aren't the same type, then the input hasn't been processed
- // by the loop above yet (which canonicalizes sequential index types to
- // intptr_t). Just avoid transforming this until the input has been
- // normalized.
- if (SO1->getType() != GO1->getType())
- return nullptr;
-
- Value *Sum =
- SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
- // Only do the combine when we are sure the cost after the
- // merge is never more than that before the merge.
- if (Sum == nullptr)
- return nullptr;
-
- // Update the GEP in place if possible.
- if (Src->getNumOperands() == 2) {
- GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
- replaceOperand(GEP, 0, Src->getOperand(0));
- replaceOperand(GEP, 1, Sum);
- return &GEP;
- }
- Indices.append(Src->op_begin()+1, Src->op_end()-1);
- Indices.push_back(Sum);
- Indices.append(GEP.op_begin()+2, GEP.op_end());
- } else if (isa<Constant>(*GEP.idx_begin()) &&
- cast<Constant>(*GEP.idx_begin())->isNullValue() &&
- Src->getNumOperands() != 1) {
- // Otherwise we can do the fold if the first index of the GEP is a zero
- Indices.append(Src->op_begin()+1, Src->op_end());
- Indices.append(GEP.idx_begin()+1, GEP.idx_end());
- }
-
- if (!Indices.empty())
- return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
- ? GetElementPtrInst::CreateInBounds(
- Src->getSourceElementType(), Src->getOperand(0), Indices,
- GEP.getName())
- : GetElementPtrInst::Create(Src->getSourceElementType(),
- Src->getOperand(0), Indices,
- GEP.getName());
- }
-
- // Skip if GEP source element type is scalable. The type alloc size is unknown
- // at compile-time.
- if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
- unsigned AS = GEP.getPointerAddressSpace();
- if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
- DL.getIndexSizeInBits(AS)) {
- uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
-
- bool Matched = false;
- uint64_t C;
- Value *V = nullptr;
- if (TyAllocSize == 1) {
- V = GEP.getOperand(1);
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_AShr(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == 1ULL << C)
- Matched = true;
- } else if (match(GEP.getOperand(1),
- m_SDiv(m_Value(V), m_ConstantInt(C)))) {
- if (TyAllocSize == C)
- Matched = true;
- }
-
- if (Matched) {
- // Canonicalize (gep i8* X, -(ptrtoint Y))
- // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
- // The GEP pattern is emitted by the SCEV expander for certain kinds of
- // pointer arithmetic.
- if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
- Operator *Index = cast<Operator>(V);
- Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
- Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
- return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
- }
- // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
- // to (bitcast Y)
- Value *Y;
- if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
- m_PtrToInt(m_Specific(GEP.getOperand(0))))))
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
- }
- }
- }
-
- // We do not handle pointer-vector geps here.
- if (GEPType->isVectorTy())
- return nullptr;
-
- // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
- Value *StrippedPtr = PtrOp->stripPointerCasts();
- PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
-
- if (StrippedPtr != PtrOp) {
- bool HasZeroPointerIndex = false;
- Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
-
- if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
- HasZeroPointerIndex = C->isZero();
-
- // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
- // into : GEP [10 x i8]* X, i32 0, ...
- //
- // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
- // into : GEP i8* X, ...
- //
- // This occurs when the program declares an array extern like "int X[];"
- if (HasZeroPointerIndex) {
- if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
- // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
- if (CATy->getElementType() == StrippedPtrEltTy) {
- // -> GEP i8* X, ...
+ Type *GEPType = GEP.getType();
+ Type *GEPEltType = GEP.getSourceElementType();
+ bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
+ if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
+ return replaceInstUsesWith(GEP, V);
+
+ // For vector geps, use the generic demanded vector support.
+ // Skip if GEP return type is scalable. The number of elements is unknown at
+ // compile-time.
+ if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
+ auto VWidth = GEPFVTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
+ UndefElts)) {
+ if (V != &GEP)
+ return replaceInstUsesWith(GEP, V);
+ return &GEP;
+ }
+
+ // TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
+ // possible (decide on canonical form for pointer broadcast), 3) exploit
+ // undef elements to decrease demanded bits
+ }
+
+ Value *PtrOp = GEP.getOperand(0);
+
+ // Eliminate unneeded casts for indices, and replace indices which displace
+ // by multiples of a zero size type with zero.
+ bool MadeChange = false;
+
+ // Index width may not be the same width as pointer width.
+ // Data layout chooses the right type based on supported integer types.
+ Type *NewScalarIndexTy =
+ DL.getIndexType(GEP.getPointerOperandType()->getScalarType());
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); I != E;
+ ++I, ++GTI) {
+ // Skip indices into struct types.
+ if (GTI.isStruct())
+ continue;
+
+ Type *IndexTy = (*I)->getType();
+ Type *NewIndexType =
+ IndexTy->isVectorTy()
+ ? VectorType::get(NewScalarIndexTy,
+ cast<VectorType>(IndexTy)->getElementCount())
+ : NewScalarIndexTy;
+
+ // If the element type has zero size then any index over it is equivalent
+ // to an index of zero, so replace it with zero if it is not zero already.
+ Type *EltTy = GTI.getIndexedType();
+ if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
+ if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
+ *I = Constant::getNullValue(NewIndexType);
+ MadeChange = true;
+ }
+
+ if (IndexTy != NewIndexType) {
+ // If we are using a wider index than needed for this platform, shrink
+ // it to what we need. If narrower, sign-extend it to what we need.
+ // This explicit cast can make subsequent optimizations more obvious.
+ *I = Builder.CreateIntCast(*I, NewIndexType, true);
+ MadeChange = true;
+ }
+ }
+ if (MadeChange)
+ return &GEP;
+
+ // Check to see if the inputs to the PHI node are getelementptr instructions.
+ if (auto *PN = dyn_cast<PHINode>(PtrOp)) {
+ auto *Op1 = dyn_cast<GetElementPtrInst>(PN->getOperand(0));
+ if (!Op1)
+ return nullptr;
+
+ // Don't fold a GEP into itself through a PHI node. This can only happen
+ // through the back-edge of a loop. Folding a GEP into itself means that
+ // the value of the previous iteration needs to be stored in the meantime,
+ // thus requiring an additional register variable to be live, but not
+ // actually achieving anything (the GEP still needs to be executed once per
+ // loop iteration).
+ if (Op1 == &GEP)
+ return nullptr;
+
+ int DI = -1;
+
+ for (auto I = PN->op_begin()+1, E = PN->op_end(); I !=E; ++I) {
+ auto *Op2 = dyn_cast<GetElementPtrInst>(*I);
+ if (!Op2 || Op1->getNumOperands() != Op2->getNumOperands())
+ return nullptr;
+
+ // As for Op1 above, don't try to fold a GEP into itself.
+ if (Op2 == &GEP)
+ return nullptr;
+
+ // Keep track of the type as we walk the GEP.
+ Type *CurTy = nullptr;
+
+ for (unsigned J = 0, F = Op1->getNumOperands(); J != F; ++J) {
+ if (Op1->getOperand(J)->getType() != Op2->getOperand(J)->getType())
+ return nullptr;
+
+ if (Op1->getOperand(J) != Op2->getOperand(J)) {
+ if (DI == -1) {
+            // We have not seen any differences in the GEPs feeding the
+ // PHI yet, so we record this one if it is allowed to be a
+ // variable.
+
+ // The first two arguments can vary for any GEP, the rest have to be
+ // static for struct slots
+ if (J > 1) {
+ assert(CurTy && "No current type?");
+ if (CurTy->isStructTy())
+ return nullptr;
+ }
+
+ DI = J;
+ } else {
+ // The GEP is different by more than one input. While this could be
+ // extended to support GEPs that vary by more than one variable it
+ // doesn't make sense since it greatly increases the complexity and
+ // would result in an R+R+R addressing mode which no backend
+ // directly supports and would need to be broken into several
+ // simpler instructions anyway.
+ return nullptr;
+ }
+ }
+
+ // Sink down a layer of the type for the next iteration.
+ if (J > 0) {
+ if (J == 1) {
+ CurTy = Op1->getSourceElementType();
+ } else {
+ CurTy =
+ GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
+ }
+ }
+ }
+ }
+
+ // If not all GEPs are identical we'll have to create a new PHI node.
+ // Check that the old PHI node has only one use so that it will get
+ // removed.
+ if (DI != -1 && !PN->hasOneUse())
+ return nullptr;
+
+ auto *NewGEP = cast<GetElementPtrInst>(Op1->clone());
+ if (DI == -1) {
+ // All the GEPs feeding the PHI are identical. Clone one down into our
+ // BB so that it can be merged with the current GEP.
+ } else {
+ // All the GEPs feeding the PHI differ at a single offset. Clone a GEP
+ // into the current block so it can be merged, and create a new PHI to
+ // set that index.
+ PHINode *NewPN;
+ {
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(PN);
+ NewPN = Builder.CreatePHI(Op1->getOperand(DI)->getType(),
+ PN->getNumOperands());
+ }
+
+ for (auto &I : PN->operands())
+ NewPN->addIncoming(cast<GEPOperator>(I)->getOperand(DI),
+ PN->getIncomingBlock(I));
+
+ NewGEP->setOperand(DI, NewPN);
+ }
+
+ GEP.getParent()->getInstList().insert(
+ GEP.getParent()->getFirstInsertionPt(), NewGEP);
+ replaceOperand(GEP, 0, NewGEP);
+ PtrOp = NewGEP;
+ }
+
+ // Combine Indices - If the source pointer to this getelementptr instruction
+ // is a getelementptr instruction, combine the indices of the two
+ // getelementptr instructions into a single instruction.
+ if (auto *Src = dyn_cast<GEPOperator>(PtrOp)) {
+ if (!shouldMergeGEPs(*cast<GEPOperator>(&GEP), *Src))
+ return nullptr;
+
+ // Try to reassociate loop invariant GEP chains to enable LICM.
+ if (LI && Src->getNumOperands() == 2 && GEP.getNumOperands() == 2 &&
+ Src->hasOneUse()) {
+ if (Loop *L = LI->getLoopFor(GEP.getParent())) {
+ Value *GO1 = GEP.getOperand(1);
+ Value *SO1 = Src->getOperand(1);
+ // Reassociate the two GEPs if SO1 is variant in the loop and GO1 is
+ // invariant: this breaks the dependence between GEPs and allows LICM
+ // to hoist the invariant part out of the loop.
+ if (L->isLoopInvariant(GO1) && !L->isLoopInvariant(SO1)) {
+ // We have to be careful here.
+ // We have something like:
+ // %src = getelementptr <ty>, <ty>* %base, <ty> %idx
+ // %gep = getelementptr <ty>, <ty>* %src, <ty> %idx2
+          // If we just swap idx & idx2 then we could inadvertently
+ // change %src from a vector to a scalar, or vice versa.
+ // Cases:
+ // 1) %base a scalar & idx a scalar & idx2 a vector
+ // => Swapping idx & idx2 turns %src into a vector type.
+ // 2) %base a scalar & idx a vector & idx2 a scalar
+          //     => Swapping idx & idx2 turns %src into a scalar type.
+ // 3) %base, %idx, and %idx2 are scalars
+ // => %src & %gep are scalars
+ // => swapping idx & idx2 is safe
+ // 4) %base a vector
+ // => %src is a vector
+ // => swapping idx & idx2 is safe.
+ auto *SO0 = Src->getOperand(0);
+ auto *SO0Ty = SO0->getType();
+ if (!isa<VectorType>(GEPType) || // case 3
+ isa<VectorType>(SO0Ty)) { // case 4
+ Src->setOperand(1, GO1);
+ GEP.setOperand(1, SO1);
+ return &GEP;
+ } else {
+ // Case 1 or 2
+ // -- have to recreate %src & %gep
+ // put NewSrc at same location as %src
+ Builder.SetInsertPoint(cast<Instruction>(PtrOp));
+ auto *NewSrc = cast<GetElementPtrInst>(
+ Builder.CreateGEP(GEPEltType, SO0, GO1, Src->getName()));
+ NewSrc->setIsInBounds(Src->isInBounds());
+ auto *NewGEP = GetElementPtrInst::Create(GEPEltType, NewSrc, {SO1});
+ NewGEP->setIsInBounds(GEP.isInBounds());
+ return NewGEP;
+ }
+ }
+ }
+ }
+
+ // Note that if our source is a gep chain itself then we wait for that
+ // chain to be resolved before we perform this transformation. This
+    // avoids creating a TON of code in some cases.
+ if (auto *SrcGEP = dyn_cast<GEPOperator>(Src->getOperand(0)))
+ if (SrcGEP->getNumOperands() == 2 && shouldMergeGEPs(*Src, *SrcGEP))
+ return nullptr; // Wait until our source is folded to completion.
+
+ SmallVector<Value*, 8> Indices;
+
+ // Find out whether the last index in the source GEP is a sequential idx.
+ bool EndsWithSequential = false;
+ for (gep_type_iterator I = gep_type_begin(*Src), E = gep_type_end(*Src);
+ I != E; ++I)
+ EndsWithSequential = I.isSequential();
+
+ // Can we combine the two pointer arithmetics offsets?
+ if (EndsWithSequential) {
+ // Replace: gep (gep %P, long B), long A, ...
+ // With: T = long A+B; gep %P, T, ...
+ Value *SO1 = Src->getOperand(Src->getNumOperands()-1);
+ Value *GO1 = GEP.getOperand(1);
+
+ // If they aren't the same type, then the input hasn't been processed
+ // by the loop above yet (which canonicalizes sequential index types to
+ // intptr_t). Just avoid transforming this until the input has been
+ // normalized.
+ if (SO1->getType() != GO1->getType())
+ return nullptr;
+
+ Value *Sum =
+ SimplifyAddInst(GO1, SO1, false, false, SQ.getWithInstruction(&GEP));
+ // Only do the combine when we are sure the cost after the
+ // merge is never more than that before the merge.
+ if (Sum == nullptr)
+ return nullptr;
+
+ // Update the GEP in place if possible.
+ if (Src->getNumOperands() == 2) {
+ GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
+ replaceOperand(GEP, 0, Src->getOperand(0));
+ replaceOperand(GEP, 1, Sum);
+ return &GEP;
+ }
+ Indices.append(Src->op_begin()+1, Src->op_end()-1);
+ Indices.push_back(Sum);
+ Indices.append(GEP.op_begin()+2, GEP.op_end());
+ } else if (isa<Constant>(*GEP.idx_begin()) &&
+ cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+ Src->getNumOperands() != 1) {
+ // Otherwise we can do the fold if the first index of the GEP is a zero
+ Indices.append(Src->op_begin()+1, Src->op_end());
+ Indices.append(GEP.idx_begin()+1, GEP.idx_end());
+ }
+
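+ // Illustrative sketch (hypothetical IR) of the ends-with-sequential case:
+ //   %s = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 1
+ //   %g = getelementptr i32, i32* %s, i64 2
+ // merges into a single
+ //   %m = getelementptr [4 x i32], [4 x i32]* %p, i64 0, i64 3
+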
+ if (!Indices.empty())
+ return isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP))
+ ? GetElementPtrInst::CreateInBounds(
+ Src->getSourceElementType(), Src->getOperand(0), Indices,
+ GEP.getName())
+ : GetElementPtrInst::Create(Src->getSourceElementType(),
+ Src->getOperand(0), Indices,
+ GEP.getName());
+ }
+
+ // Skip if GEP source element type is scalable. The type alloc size is unknown
+ // at compile-time.
+ if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
+ unsigned AS = GEP.getPointerAddressSpace();
+ if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
+ DL.getIndexSizeInBits(AS)) {
+ uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+
+ bool Matched = false;
+ uint64_t C;
+ Value *V = nullptr;
+ if (TyAllocSize == 1) {
+ V = GEP.getOperand(1);
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_AShr(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == 1ULL << C)
+ Matched = true;
+ } else if (match(GEP.getOperand(1),
+ m_SDiv(m_Value(V), m_ConstantInt(C)))) {
+ if (TyAllocSize == C)
+ Matched = true;
+ }
+
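+ // Illustrative sketch (hypothetical IR, assuming a 64-bit index type): for an
+ // i32 element (TyAllocSize == 4),
+ //   %iX  = ptrtoint i32* %X to i64
+ //   %iY  = ptrtoint i32* %Y to i64
+ //   %d   = sub i64 %iY, %iX
+ //   %idx = ashr exact i64 %d, 2
+ //   %g   = getelementptr i32, i32* %X, i64 %idx
+ // matches with V == %d and C == 2 (4 == 1 << 2), and the sub-of-ptrtoints
+ // pattern below then lets %g be rewritten as a bitcast of %Y.
+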
+ if (Matched) {
+ // Canonicalize (gep i8* X, -(ptrtoint Y))
+ // to (inttoptr (sub (ptrtoint X), (ptrtoint Y)))
+ // The GEP pattern is emitted by the SCEV expander for certain kinds of
+ // pointer arithmetic.
+ if (match(V, m_Neg(m_PtrToInt(m_Value())))) {
+ Operator *Index = cast<Operator>(V);
+ Value *PtrToInt = Builder.CreatePtrToInt(PtrOp, Index->getType());
+ Value *NewSub = Builder.CreateSub(PtrToInt, Index->getOperand(1));
+ return CastInst::Create(Instruction::IntToPtr, NewSub, GEPType);
+ }
+ // Canonicalize (gep i8* X, (ptrtoint Y)-(ptrtoint X))
+ // to (bitcast Y)
+ Value *Y;
+ if (match(V, m_Sub(m_PtrToInt(m_Value(Y)),
+ m_PtrToInt(m_Specific(GEP.getOperand(0))))))
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(Y, GEPType);
+ }
+ }
+ }
+
+ // We do not handle pointer-vector geps here.
+ if (GEPType->isVectorTy())
+ return nullptr;
+
+ // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+ Value *StrippedPtr = PtrOp->stripPointerCasts();
+ PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
+ if (StrippedPtr != PtrOp) {
+ bool HasZeroPointerIndex = false;
+ Type *StrippedPtrEltTy = StrippedPtrTy->getElementType();
+
+ if (auto *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
+ HasZeroPointerIndex = C->isZero();
+
+ // Transform: GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ...
+ // into : GEP [10 x i8]* X, i32 0, ...
+ //
+ // Likewise, transform: GEP (bitcast i8* X to [0 x i8]*), i32 0, ...
+ // into : GEP i8* X, ...
+ //
+ // This occurs when the program declares an array extern like "int X[];"
+ if (HasZeroPointerIndex) {
+ if (auto *CATy = dyn_cast<ArrayType>(GEPEltType)) {
+ // GEP (bitcast i8* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == StrippedPtrEltTy) {
+ // -> GEP i8* X, ...
SmallVector<Value *, 8> Idx(drop_begin(GEP.indices()));
- GetElementPtrInst *Res = GetElementPtrInst::Create(
- StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
- Res->setIsInBounds(GEP.isInBounds());
- if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
- return Res;
- // Insert Res, and create an addrspacecast.
- // e.g.,
- // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
- // ->
- // %0 = GEP i8 addrspace(1)* X, ...
- // addrspacecast i8 addrspace(1)* %0 to i8*
- return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
- }
-
- if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
- // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
- if (CATy->getElementType() == XATy->getElementType()) {
- // -> GEP [10 x i8]* X, i32 0, ...
- // At this point, we know that the cast source type is a pointer
- // to an array of the same type as the destination pointer
- // array. Because the array type is never stepped over (there
- // is a leading zero) we can fold the cast into this GEP.
- if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
- GEP.setSourceElementType(XATy);
- return replaceOperand(GEP, 0, StrippedPtr);
- }
- // Cannot replace the base pointer directly because StrippedPtr's
- // address space is different. Instead, create a new GEP followed by
- // an addrspacecast.
- // e.g.,
- // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
- // i32 0, ...
- // ->
- // %0 = GEP [10 x i8] addrspace(1)* X, ...
- // addrspacecast i8 addrspace(1)* %0 to i8*
+ GetElementPtrInst *Res = GetElementPtrInst::Create(
+ StrippedPtrEltTy, StrippedPtr, Idx, GEP.getName());
+ Res->setIsInBounds(GEP.isInBounds());
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace())
+ return Res;
+ // Insert Res, and create an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast i8 addrspace(1)* X to [0 x i8]*), i32 0, ...
+ // ->
+ // %0 = GEP i8 addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
+ return new AddrSpaceCastInst(Builder.Insert(Res), GEPType);
+ }
+
+ if (auto *XATy = dyn_cast<ArrayType>(StrippedPtrEltTy)) {
+ // GEP (bitcast [10 x i8]* X to [0 x i8]*), i32 0, ... ?
+ if (CATy->getElementType() == XATy->getElementType()) {
+ // -> GEP [10 x i8]* X, i32 0, ...
+ // At this point, we know that the cast source type is a pointer
+ // to an array of the same type as the destination pointer
+ // array. Because the array type is never stepped over (there
+ // is a leading zero) we can fold the cast into this GEP.
+ if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
+ GEP.setSourceElementType(XATy);
+ return replaceOperand(GEP, 0, StrippedPtr);
+ }
+ // Cannot replace the base pointer directly because StrippedPtr's
+ // address space is different. Instead, create a new GEP followed by
+ // an addrspacecast.
+ // e.g.,
+ // GEP (addrspacecast [10 x i8] addrspace(1)* X to [0 x i8]*),
+ // i32 0, ...
+ // ->
+ // %0 = GEP [10 x i8] addrspace(1)* X, ...
+ // addrspacecast i8 addrspace(1)* %0 to i8*
SmallVector<Value *, 8> Idx(GEP.indices());
- Value *NewGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- Idx, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName());
- return new AddrSpaceCastInst(NewGEP, GEPType);
- }
- }
- }
- } else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
- // Skip if GEP source element type is scalable. The type alloc size is
- // unknown at compile-time.
- // Transform things like: %t = getelementptr i32*
- // bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
- // x i32]* %str, i32 0, i32 %V; bitcast
- if (StrippedPtrEltTy->isArrayTy() &&
- DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
- DL.getTypeAllocSize(GEPEltType)) {
- Type *IdxType = DL.getIndexType(GEPType);
- Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
- Value *NewGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
- GEP.getName());
-
- // V and GEP are both pointer types --> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
- }
-
- // Transform things like:
- // %V = mul i64 %N, 4
- // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
- // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
- if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
- // Check that changing the type amounts to dividing the index by a scale
- // factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
- uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
- if (ResSize && SrcSize % ResSize == 0) {
- Value *Idx = GEP.getOperand(1);
- unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
- uint64_t Scale = SrcSize / ResSize;
-
- // Earlier transforms ensure that the index has the right type
- // according to Data Layout, which considerably simplifies the
- // logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIndexType(GEPType) &&
- "Index type does not match the Data Layout preferences");
-
- bool NSW;
- if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
- // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
- // If the multiplication NewIdx * Scale may overflow then the new
- // GEP may not be "inbounds".
- Value *NewGEP =
- GEP.isInBounds() && NSW
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- NewIdx, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
- GEP.getName());
-
- // The NewGEP must be pointer typed, so must the old one -> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEPType);
- }
- }
- }
-
- // Similarly, transform things like:
- // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
- // (where tmp = 8*tmp2) into:
- // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
- if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
- StrippedPtrEltTy->isArrayTy()) {
- // Check that changing to the array element type amounts to dividing the
- // index by a scale factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
- uint64_t ArrayEltSize =
- DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
- .getFixedSize();
- if (ResSize && ArrayEltSize % ResSize == 0) {
- Value *Idx = GEP.getOperand(1);
- unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
- uint64_t Scale = ArrayEltSize / ResSize;
-
- // Earlier transforms ensure that the index has the right type
- // according to the Data Layout, which considerably simplifies
- // the logic by eliminating implicit casts.
- assert(Idx->getType() == DL.getIndexType(GEPType) &&
- "Index type does not match the Data Layout preferences");
-
- bool NSW;
- if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
- // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
- // If the multiplication NewIdx * Scale may overflow then the new
- // GEP may not be "inbounds".
- Type *IndTy = DL.getIndexType(GEPType);
- Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
-
- Value *NewGEP =
- GEP.isInBounds() && NSW
- ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
- Off, GEP.getName())
- : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
- GEP.getName());
- // The NewGEP must be pointer typed, so must the old one -> BitCast
- return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
- GEPType);
- }
- }
- }
- }
- }
-
- // addrspacecast between types is canonicalized as a bitcast, then an
- // addrspacecast. To take advantage of the below bitcast + struct GEP, look
- // through the addrspacecast.
- Value *ASCStrippedPtrOp = PtrOp;
- if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
- // X = bitcast A addrspace(1)* to B addrspace(1)*
- // Y = addrspacecast A addrspace(1)* to B addrspace(2)*
- // Z = gep Y, <...constant indices...>
- // Into an addrspacecasted GEP of the struct.
- if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
- ASCStrippedPtrOp = BC;
- }
-
- if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
- Value *SrcOp = BCI->getOperand(0);
- PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
- Type *SrcEltType = SrcType->getElementType();
-
- // GEP directly using the source operand if this GEP is accessing an element
- // of a bitcasted pointer to vector or array of the same dimensions:
- // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
- // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
- auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
- const DataLayout &DL) {
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ Idx, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName());
+ return new AddrSpaceCastInst(NewGEP, GEPType);
+ }
+ }
+ }
+ } else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
+ // Skip if GEP source element type is scalable. The type alloc size is
+ // unknown at compile-time.
+ // Transform things like: %t = getelementptr i32*
+ // bitcast ([2 x i32]* %str to i32*), i32 %V into: %t1 = getelementptr [2
+ // x i32]* %str, i32 0, i32 %V; bitcast
+ if (StrippedPtrEltTy->isArrayTy() &&
+ DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
+ DL.getTypeAllocSize(GEPEltType)) {
+ Type *IdxType = DL.getIndexType(GEPType);
+ Value *Idx[2] = { Constant::getNullValue(IdxType), GEP.getOperand(1) };
+ Value *NewGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Idx,
+ GEP.getName());
+
+ // V and GEP are both pointer types --> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP, GEPType);
+ }
+
+ // Transform things like:
+ // %V = mul i64 %N, 4
+ // %t = getelementptr i8* bitcast (i32* %arr to i8*), i32 %V
+ // into: %t1 = getelementptr i32* %arr, i32 %N; bitcast
+ if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
+ // Check that changing the type amounts to dividing the index by a scale
+ // factor.
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+ uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
+ if (ResSize && SrcSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = SrcSize / ResSize;
+
+ // Earlier transforms ensure that the index has the right type
+ // according to Data Layout, which considerably simplifies the
+ // logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ NewIdx, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, NewIdx,
+ GEP.getName());
+
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEPType);
+ }
+ }
+ }
+
+ // Similarly, transform things like:
+ // getelementptr i8* bitcast ([100 x double]* X to i8*), i32 %tmp
+ // (where tmp = 8*tmp2) into:
+ // getelementptr [100 x double]* %arr, i32 0, i32 %tmp2; bitcast
+ if (GEPEltType->isSized() && StrippedPtrEltTy->isSized() &&
+ StrippedPtrEltTy->isArrayTy()) {
+ // Check that changing to the array element type amounts to dividing the
+ // index by a scale factor.
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+ uint64_t ArrayEltSize =
+ DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
+ .getFixedSize();
+ if (ResSize && ArrayEltSize % ResSize == 0) {
+ Value *Idx = GEP.getOperand(1);
+ unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
+ uint64_t Scale = ArrayEltSize / ResSize;
+
+ // Earlier transforms ensure that the index has the right type
+ // according to the Data Layout, which considerably simplifies
+ // the logic by eliminating implicit casts.
+ assert(Idx->getType() == DL.getIndexType(GEPType) &&
+ "Index type does not match the Data Layout preferences");
+
+ bool NSW;
+ if (Value *NewIdx = Descale(Idx, APInt(BitWidth, Scale), NSW)) {
+ // Successfully decomposed Idx as NewIdx * Scale, form a new GEP.
+ // If the multiplication NewIdx * Scale may overflow then the new
+ // GEP may not be "inbounds".
+ Type *IndTy = DL.getIndexType(GEPType);
+ Value *Off[2] = {Constant::getNullValue(IndTy), NewIdx};
+
+ Value *NewGEP =
+ GEP.isInBounds() && NSW
+ ? Builder.CreateInBoundsGEP(StrippedPtrEltTy, StrippedPtr,
+ Off, GEP.getName())
+ : Builder.CreateGEP(StrippedPtrEltTy, StrippedPtr, Off,
+ GEP.getName());
+ // The NewGEP must be pointer typed, so must the old one -> BitCast
+ return CastInst::CreatePointerBitCastOrAddrSpaceCast(NewGEP,
+ GEPType);
+ }
+ }
+ }
+ }
+ }
+
+ // addrspacecast between types is canonicalized as a bitcast, then an
+ // addrspacecast. To take advantage of the below bitcast + struct GEP, look
+ // through the addrspacecast.
+ Value *ASCStrippedPtrOp = PtrOp;
+ if (auto *ASC = dyn_cast<AddrSpaceCastInst>(PtrOp)) {
+ // X = bitcast A addrspace(1)* to B addrspace(1)*
+ // Y = addrspacecast A addrspace(1)* to B addrspace(2)*
+ // Z = gep Y, <...constant indices...>
+ // Into an addrspacecasted GEP of the struct.
+ if (auto *BC = dyn_cast<BitCastInst>(ASC->getOperand(0)))
+ ASCStrippedPtrOp = BC;
+ }
+
+ if (auto *BCI = dyn_cast<BitCastInst>(ASCStrippedPtrOp)) {
+ Value *SrcOp = BCI->getOperand(0);
+ PointerType *SrcType = cast<PointerType>(BCI->getSrcTy());
+ Type *SrcEltType = SrcType->getElementType();
+
+ // GEP directly using the source operand if this GEP is accessing an element
+ // of a bitcasted pointer to vector or array of the same dimensions:
+ // gep (bitcast <c x ty>* X to [c x ty]*), Y, Z --> gep X, Y, Z
+ // gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
+ auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
+ const DataLayout &DL) {
auto *VecVTy = cast<FixedVectorType>(VecTy);
- return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
- ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
- DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
- };
- if (GEP.getNumOperands() == 3 &&
+ return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
+ ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
+ DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
+ };
+ if (GEP.getNumOperands() == 3 &&
((GEPEltType->isArrayTy() && isa<FixedVectorType>(SrcEltType) &&
- areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
+ areMatchingArrayAndVecTypes(GEPEltType, SrcEltType, DL)) ||
(isa<FixedVectorType>(GEPEltType) && SrcEltType->isArrayTy() &&
- areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
-
- // Create a new GEP here, as using `setOperand()` followed by
- // `setSourceElementType()` won't actually update the type of the
- // existing GEP Value, causing issues if this Value is accessed when
- // constructing an AddrSpaceCastInst.
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
- : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
- NGEP->takeName(&GEP);
-
- // Preserve GEP address space to satisfy users
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
-
- return replaceInstUsesWith(GEP, NGEP);
- }
-
- // See if we can simplify:
- // X = bitcast A* to B*
- // Y = gep X, <...constant indices...>
- // into a gep of the original struct. This is important for SROA and alias
- // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
- unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
- APInt Offset(OffsetBits, 0);
- if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
- // If this GEP instruction doesn't move the pointer, just replace the GEP
- // with a bitcast of the real input to the dest type.
- if (!Offset) {
- // If the bitcast is of an allocation, and the allocation will be
- // converted to match the type of the cast, don't touch this.
- if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
- // See if the bitcast simplifies, if so, don't nuke this GEP yet.
- if (Instruction *I = visitBitCast(*BCI)) {
- if (I != BCI) {
- I->takeName(BCI);
- BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
- replaceInstUsesWith(*BCI, I);
- }
- return &GEP;
- }
- }
-
- if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(SrcOp, GEPType);
- return new BitCastInst(SrcOp, GEPType);
- }
-
- // Otherwise, if the offset is non-zero, we need to find out if there is a
- // field at Offset in 'A's type. If so, we can pull the cast through the
- // GEP.
- SmallVector<Value*, 8> NewIndices;
- if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
- Value *NGEP =
- GEP.isInBounds()
- ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
- : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
-
- if (NGEP->getType() == GEPType)
- return replaceInstUsesWith(GEP, NGEP);
- NGEP->takeName(&GEP);
-
- if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
- return new AddrSpaceCastInst(NGEP, GEPType);
- return new BitCastInst(NGEP, GEPType);
- }
- }
- }
-
- if (!GEP.isInBounds()) {
- unsigned IdxWidth =
- DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
- APInt BasePtrOffset(IdxWidth, 0);
- Value *UnderlyingPtrOp =
- PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
- BasePtrOffset);
- if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
- if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
- BasePtrOffset.isNonNegative()) {
- APInt AllocSize(
- IdxWidth,
- DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
- if (BasePtrOffset.ule(AllocSize)) {
- return GetElementPtrInst::CreateInBounds(
- GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
- GEP.getName());
- }
- }
- }
- }
-
- if (Instruction *R = foldSelectGEP(GEP, Builder))
- return R;
-
- return nullptr;
-}
-
-static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
- Instruction *AI) {
- if (isa<ConstantPointerNull>(V))
- return true;
- if (auto *LI = dyn_cast<LoadInst>(V))
- return isa<GlobalVariable>(LI->getPointerOperand());
- // Two distinct allocations will never be equal.
- // We rely on LookThroughBitCast in isAllocLikeFn being false: if it looked
- // through bitcasts of V, the return below could be true even when AI and V
- // (e.g., i8* -> i32* -> i8* of AI) are the same allocation.
- return isAllocLikeFn(V, TLI) && V != AI;
-}
-
-static bool isAllocSiteRemovable(Instruction *AI,
- SmallVectorImpl<WeakTrackingVH> &Users,
- const TargetLibraryInfo *TLI) {
- SmallVector<Instruction*, 4> Worklist;
- Worklist.push_back(AI);
-
- do {
- Instruction *PI = Worklist.pop_back_val();
- for (User *U : PI->users()) {
- Instruction *I = cast<Instruction>(U);
- switch (I->getOpcode()) {
- default:
- // Give up the moment we see something we can't handle.
- return false;
-
- case Instruction::AddrSpaceCast:
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- Users.emplace_back(I);
- Worklist.push_back(I);
- continue;
-
- case Instruction::ICmp: {
- ICmpInst *ICI = cast<ICmpInst>(I);
- // We can fold eq/ne comparisons with null to false/true, respectively.
- // We also fold comparisons in some conditions provided the alloc has
- // not escaped (see isNeverEqualToUnescapedAlloc).
- if (!ICI->isEquality())
- return false;
- unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
- if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
- return false;
- Users.emplace_back(I);
- continue;
- }
-
- case Instruction::Call:
- // Ignore no-op and store intrinsics.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- return false;
-
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- case Intrinsic::memset: {
- MemIntrinsic *MI = cast<MemIntrinsic>(II);
- if (MI->isVolatile() || MI->getRawDest() != PI)
- return false;
- LLVM_FALLTHROUGH;
- }
- case Intrinsic::assume:
- case Intrinsic::invariant_start:
- case Intrinsic::invariant_end:
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::objectsize:
- Users.emplace_back(I);
- continue;
- }
- }
-
- if (isFreeCall(I, TLI)) {
- Users.emplace_back(I);
- continue;
- }
- return false;
-
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(I);
- if (SI->isVolatile() || SI->getPointerOperand() != PI)
- return false;
- Users.emplace_back(I);
- continue;
- }
- }
- llvm_unreachable("missing a return?");
- }
- } while (!Worklist.empty());
- return true;
-}
-
+ areMatchingArrayAndVecTypes(SrcEltType, GEPEltType, DL)))) {
+
+ // Create a new GEP here, as using `setOperand()` followed by
+ // `setSourceElementType()` won't actually update the type of the
+ // existing GEP Value, causing issues if this Value is accessed when
+ // constructing an AddrSpaceCastInst.
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]})
+ : Builder.CreateGEP(SrcEltType, SrcOp, {Ops[1], Ops[2]});
+ NGEP->takeName(&GEP);
+
+ // Preserve GEP address space to satisfy users
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEPType);
+
+ return replaceInstUsesWith(GEP, NGEP);
+ }
+
+ // See if we can simplify:
+ // X = bitcast A* to B*
+ // Y = gep X, <...constant indices...>
+ // into a gep of the original struct. This is important for SROA and alias
+ // analysis of unions. If "A" is also a bitcast, wait for A/X to be merged.
+ unsigned OffsetBits = DL.getIndexTypeSizeInBits(GEPType);
+ APInt Offset(OffsetBits, 0);
+ if (!isa<BitCastInst>(SrcOp) && GEP.accumulateConstantOffset(DL, Offset)) {
+ // If this GEP instruction doesn't move the pointer, just replace the GEP
+ // with a bitcast of the real input to the dest type.
+ if (!Offset) {
+ // If the bitcast is of an allocation, and the allocation will be
+ // converted to match the type of the cast, don't touch this.
+ if (isa<AllocaInst>(SrcOp) || isAllocationFn(SrcOp, &TLI)) {
+ // See if the bitcast simplifies, if so, don't nuke this GEP yet.
+ if (Instruction *I = visitBitCast(*BCI)) {
+ if (I != BCI) {
+ I->takeName(BCI);
+ BCI->getParent()->getInstList().insert(BCI->getIterator(), I);
+ replaceInstUsesWith(*BCI, I);
+ }
+ return &GEP;
+ }
+ }
+
+ if (SrcType->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(SrcOp, GEPType);
+ return new BitCastInst(SrcOp, GEPType);
+ }
+
+ // Otherwise, if the offset is non-zero, we need to find out if there is a
+ // field at Offset in 'A's type. If so, we can pull the cast through the
+ // GEP.
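+ // Illustrative sketch (hypothetical IR, assuming a typical data layout) with
+ // %struct.S = type { i32, i32 }:
+ //   %b = bitcast %struct.S* %a to i8*
+ //   %g = getelementptr i8, i8* %b, i64 4
+ // can become
+ //   %f = getelementptr %struct.S, %struct.S* %a, i64 0, i32 1
+ //   %g = bitcast i32* %f to i8*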
+ SmallVector<Value*, 8> NewIndices;
+ if (FindElementAtOffset(SrcType, Offset.getSExtValue(), NewIndices)) {
+ Value *NGEP =
+ GEP.isInBounds()
+ ? Builder.CreateInBoundsGEP(SrcEltType, SrcOp, NewIndices)
+ : Builder.CreateGEP(SrcEltType, SrcOp, NewIndices);
+
+ if (NGEP->getType() == GEPType)
+ return replaceInstUsesWith(GEP, NGEP);
+ NGEP->takeName(&GEP);
+
+ if (NGEP->getType()->getPointerAddressSpace() != GEP.getAddressSpace())
+ return new AddrSpaceCastInst(NGEP, GEPType);
+ return new BitCastInst(NGEP, GEPType);
+ }
+ }
+ }
+
+ if (!GEP.isInBounds()) {
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(PtrOp->getType()->getPointerAddressSpace());
+ APInt BasePtrOffset(IdxWidth, 0);
+ Value *UnderlyingPtrOp =
+ PtrOp->stripAndAccumulateInBoundsConstantOffsets(DL,
+ BasePtrOffset);
+ if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
+ if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
+ BasePtrOffset.isNonNegative()) {
+ APInt AllocSize(
+ IdxWidth,
+ DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
+ if (BasePtrOffset.ule(AllocSize)) {
+ return GetElementPtrInst::CreateInBounds(
+ GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
+ GEP.getName());
+ }
+ }
+ }
+ }
+
+ if (Instruction *R = foldSelectGEP(GEP, Builder))
+ return R;
+
+ return nullptr;
+}
+
+static bool isNeverEqualToUnescapedAlloc(Value *V, const TargetLibraryInfo *TLI,
+ Instruction *AI) {
+ if (isa<ConstantPointerNull>(V))
+ return true;
+ if (auto *LI = dyn_cast<LoadInst>(V))
+ return isa<GlobalVariable>(LI->getPointerOperand());
+ // Two distinct allocations will never be equal.
+ // We rely on LookThroughBitCast in isAllocLikeFn being false: if it looked
+ // through bitcasts of V, the return below could be true even when AI and V
+ // (e.g., i8* -> i32* -> i8* of AI) are the same allocation.
+ return isAllocLikeFn(V, TLI) && V != AI;
+}
+
+static bool isAllocSiteRemovable(Instruction *AI,
+ SmallVectorImpl<WeakTrackingVH> &Users,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Instruction*, 4> Worklist;
+ Worklist.push_back(AI);
+
+ do {
+ Instruction *PI = Worklist.pop_back_val();
+ for (User *U : PI->users()) {
+ Instruction *I = cast<Instruction>(U);
+ switch (I->getOpcode()) {
+ default:
+ // Give up the moment we see something we can't handle.
+ return false;
+
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ Users.emplace_back(I);
+ Worklist.push_back(I);
+ continue;
+
+ case Instruction::ICmp: {
+ ICmpInst *ICI = cast<ICmpInst>(I);
+ // We can fold eq/ne comparisons with null to false/true, respectively.
+ // We also fold comparisons in some conditions provided the alloc has
+ // not escaped (see isNeverEqualToUnescapedAlloc).
+ if (!ICI->isEquality())
+ return false;
+ unsigned OtherIndex = (ICI->getOperand(0) == PI) ? 1 : 0;
+ if (!isNeverEqualToUnescapedAlloc(ICI->getOperand(OtherIndex), TLI, AI))
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+
+ case Instruction::Call:
+ // Ignore no-op and store intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ case Intrinsic::memset: {
+ MemIntrinsic *MI = cast<MemIntrinsic>(II);
+ if (MI->isVolatile() || MI->getRawDest() != PI)
+ return false;
+ LLVM_FALLTHROUGH;
+ }
+ case Intrinsic::assume:
+ case Intrinsic::invariant_start:
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::objectsize:
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+
+ if (isFreeCall(I, TLI)) {
+ Users.emplace_back(I);
+ continue;
+ }
+ return false;
+
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ if (SI->isVolatile() || SI->getPointerOperand() != PI)
+ return false;
+ Users.emplace_back(I);
+ continue;
+ }
+ }
+ llvm_unreachable("missing a return?");
+ }
+ } while (!Worklist.empty());
+ return true;
+}
+
Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
- // If we have a malloc call that is used only in comparisons to null and in
- // free calls, delete the calls and replace the comparisons with true or
- // false as appropriate.
-
- // This is based on the principle that we can substitute our own allocation
- // function (which will never return null) rather than knowledge of the
- // specific function being called. In some sense this can change the permitted
- // outputs of a program (when we convert a malloc to an alloca, the fact that
- // the allocation is now on the stack is potentially visible, for example),
- // but we believe it does so in a permissible manner.
- SmallVector<WeakTrackingVH, 64> Users;
-
- // If we are removing an alloca with a dbg.declare, insert dbg.value calls
- // before each store.
+ // If we have a malloc call that is used only in comparisons to null and in
+ // free calls, delete the calls and replace the comparisons with true or
+ // false as appropriate.
+
+ // This is based on the principle that we can substitute our own allocation
+ // function (which will never return null) rather than knowledge of the
+ // specific function being called. In some sense this can change the permitted
+ // outputs of a program (when we convert a malloc to an alloca, the fact that
+ // the allocation is now on the stack is potentially visible, for example),
+ // but we believe it does so in a permissible manner.
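+ //
+ // Illustrative sketch (hypothetical IR) of a removable allocation site:
+ //   %p = call i8* @malloc(i64 16)
+ //   %c = icmp eq i8* %p, null
+ //   call void @free(i8* %p)
+ // Both calls are erased and %c is replaced by 'false', per the substitution
+ // argument above.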
+ SmallVector<WeakTrackingVH, 64> Users;
+
+ // If we are removing an alloca with a dbg.declare, insert dbg.value calls
+ // before each store.
SmallVector<DbgVariableIntrinsic *, 8> DVIs;
- std::unique_ptr<DIBuilder> DIB;
- if (isa<AllocaInst>(MI)) {
+ std::unique_ptr<DIBuilder> DIB;
+ if (isa<AllocaInst>(MI)) {
findDbgUsers(DVIs, &MI);
- DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
- }
-
- if (isAllocSiteRemovable(&MI, Users, &TLI)) {
- for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- // Lowering all @llvm.objectsize calls first because they may
- // use a bitcast/GEP of the alloca we are removing.
- if (!Users[i])
- continue;
-
- Instruction *I = cast<Instruction>(&*Users[i]);
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == Intrinsic::objectsize) {
- Value *Result =
- lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
- replaceInstUsesWith(*I, Result);
- eraseInstFromFunction(*I);
- Users[i] = nullptr; // Skip examining in the next loop.
- }
- }
- }
- for (unsigned i = 0, e = Users.size(); i != e; ++i) {
- if (!Users[i])
- continue;
-
- Instruction *I = cast<Instruction>(&*Users[i]);
-
- if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
- replaceInstUsesWith(*C,
- ConstantInt::get(Type::getInt1Ty(C->getContext()),
- C->isFalseWhenEqual()));
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ DIB.reset(new DIBuilder(*MI.getModule(), /*AllowUnresolved=*/false));
+ }
+
+ if (isAllocSiteRemovable(&MI, Users, &TLI)) {
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ // Lowering all @llvm.objectsize calls first because they may
+ // use a bitcast/GEP of the alloca we are removing.
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::objectsize) {
+ Value *Result =
+ lowerObjectSizeCall(II, DL, &TLI, /*MustSucceed=*/true);
+ replaceInstUsesWith(*I, Result);
+ eraseInstFromFunction(*I);
+ Users[i] = nullptr; // Skip examining in the next loop.
+ }
+ }
+ }
+ for (unsigned i = 0, e = Users.size(); i != e; ++i) {
+ if (!Users[i])
+ continue;
+
+ Instruction *I = cast<Instruction>(&*Users[i]);
+
+ if (ICmpInst *C = dyn_cast<ICmpInst>(I)) {
+ replaceInstUsesWith(*C,
+ ConstantInt::get(Type::getInt1Ty(C->getContext()),
+ C->isFalseWhenEqual()));
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
for (auto *DVI : DVIs)
if (DVI->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DVI, SI, *DIB);
- } else {
- // Casts, GEP, or anything else: we're about to delete this instruction,
- // so it can not have any valid uses.
- replaceInstUsesWith(*I, UndefValue::get(I->getType()));
- }
- eraseInstFromFunction(*I);
- }
-
- if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
- // Replace invoke with a NOP intrinsic to maintain the original CFG
- Module *M = II->getModule();
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
- InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
- None, "", II->getParent());
- }
-
+ } else {
+ // Casts, GEP, or anything else: we're about to delete this instruction,
+ // so it can not have any valid uses.
+ replaceInstUsesWith(*I, UndefValue::get(I->getType()));
+ }
+ eraseInstFromFunction(*I);
+ }
+
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&MI)) {
+ // Replace invoke with a NOP intrinsic to maintain the original CFG
+ Module *M = II->getModule();
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
+ InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
+ None, "", II->getParent());
+ }
+
// Remove debug intrinsics which describe the value contained within the
// alloca. In addition to removing dbg.{declare,addr} which simply point to
// the alloca, remove dbg.value(<alloca>, ..., DW_OP_deref)'s as well, e.g.:
@@ -2697,157 +2697,157 @@ Instruction *InstCombinerImpl::visitAllocSite(Instruction &MI) {
for (auto *DVI : DVIs)
if (DVI->isAddressOfVariable() || DVI->getExpression()->startsWithDeref())
DVI->eraseFromParent();
-
- return eraseInstFromFunction(MI);
- }
- return nullptr;
-}
-
-/// Move the call to free before a NULL test.
-///
-/// Check whether this free is reached only after its argument has been tested
-/// against NULL (property 0).
-/// If so, it is legal to move this call into its predecessor block.
-///
-/// The move is performed only if the block containing the call to free
-/// will be removed, i.e.:
-/// 1. it has only one predecessor P, and P has two successors
-/// 2. it contains the call, noops, and an unconditional branch
-/// 3. its successor is the same as its predecessor's successor
-///
-/// Profitability is not a concern here; this function should be called only
-/// if the caller knows this transformation would be profitable (e.g., for
-/// code size).
-static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
- const DataLayout &DL) {
- Value *Op = FI.getArgOperand(0);
- BasicBlock *FreeInstrBB = FI.getParent();
- BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
-
- // Validate part of constraint #1: Only one predecessor
- // FIXME: We can extend the number of predecessors, but in that case, we
- // would duplicate the call to free in each predecessor and it may
- // not be profitable even for code size.
- if (!PredBB)
- return nullptr;
-
- // Validate constraint #2: Does this block contain only the call to
- // free, noops, and an unconditional branch?
- BasicBlock *SuccBB;
- Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
- if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
- return nullptr;
-
- // If there are only 2 instructions in the block, then at this point they
- // are the call to free and the unconditional branch.
- // If there are more than 2 instructions, check that they are noops
- // i.e., they won't hurt the performance of the generated code.
- if (FreeInstrBB->size() != 2) {
- for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
- if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
- continue;
- auto *Cast = dyn_cast<CastInst>(&Inst);
- if (!Cast || !Cast->isNoopCast(DL))
- return nullptr;
- }
- }
- // Validate the rest of constraint #1 by matching on the pred branch.
- Instruction *TI = PredBB->getTerminator();
- BasicBlock *TrueBB, *FalseBB;
- ICmpInst::Predicate Pred;
- if (!match(TI, m_Br(m_ICmp(Pred,
- m_CombineOr(m_Specific(Op),
- m_Specific(Op->stripPointerCasts())),
- m_Zero()),
- TrueBB, FalseBB)))
- return nullptr;
- if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
- return nullptr;
-
- // Validate constraint #3: Ensure the null case just falls through.
- if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
- return nullptr;
- assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
- "Broken CFG: missing edge from predecessor to successor");
-
- // At this point, we know that everything in FreeInstrBB can be moved
- // before TI.
- for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
- It != End;) {
- Instruction &Instr = *It++;
- if (&Instr == FreeInstrBBTerminator)
- break;
- Instr.moveBefore(TI);
- }
- assert(FreeInstrBB->size() == 1 &&
- "Only the branch instruction should remain");
- return &FI;
-}
-
+
+ return eraseInstFromFunction(MI);
+ }
+ return nullptr;
+}
+
+/// Move the call to free before a NULL test.
+///
+/// Check whether this free is reached only after its argument has been tested
+/// against NULL (property 0).
+/// If so, it is legal to move this call into its predecessor block.
+///
+/// The move is performed only if the block containing the call to free
+/// will be removed, i.e.:
+/// 1. it has only one predecessor P, and P has two successors
+/// 2. it contains the call, noops, and an unconditional branch
+/// 3. its successor is the same as its predecessor's successor
+///
+/// Profitability is not a concern here; this function should be called only
+/// if the caller knows this transformation would be profitable (e.g., for
+/// code size).
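+///
+/// A hypothetical shape that qualifies (illustrative only):
+///   pred:    %c = icmp eq i8* %p, null
+///            br i1 %c, label %cont, label %free.bb
+///   free.bb: call void @free(i8* %p)
+///            br label %cont
+/// After the move, the call to free lives in 'pred' and 'free.bb' becomes
+/// trivially removable.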
+static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
+ const DataLayout &DL) {
+ Value *Op = FI.getArgOperand(0);
+ BasicBlock *FreeInstrBB = FI.getParent();
+ BasicBlock *PredBB = FreeInstrBB->getSinglePredecessor();
+
+ // Validate part of constraint #1: Only one predecessor
+ // FIXME: We can extend the number of predecessors, but in that case, we
+ // would duplicate the call to free in each predecessor and it may
+ // not be profitable even for code size.
+ if (!PredBB)
+ return nullptr;
+
+ // Validate constraint #2: Does this block contain only the call to
+ // free, noops, and an unconditional branch?
+ BasicBlock *SuccBB;
+ Instruction *FreeInstrBBTerminator = FreeInstrBB->getTerminator();
+ if (!match(FreeInstrBBTerminator, m_UnconditionalBr(SuccBB)))
+ return nullptr;
+
+ // If there are only 2 instructions in the block, then at this point they
+ // are the call to free and the unconditional branch.
+ // If there are more than 2 instructions, check that they are noops
+ // i.e., they won't hurt the performance of the generated code.
+ if (FreeInstrBB->size() != 2) {
+ for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
+ if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
+ continue;
+ auto *Cast = dyn_cast<CastInst>(&Inst);
+ if (!Cast || !Cast->isNoopCast(DL))
+ return nullptr;
+ }
+ }
+ // Validate the rest of constraint #1 by matching on the pred branch.
+ Instruction *TI = PredBB->getTerminator();
+ BasicBlock *TrueBB, *FalseBB;
+ ICmpInst::Predicate Pred;
+ if (!match(TI, m_Br(m_ICmp(Pred,
+ m_CombineOr(m_Specific(Op),
+ m_Specific(Op->stripPointerCasts())),
+ m_Zero()),
+ TrueBB, FalseBB)))
+ return nullptr;
+ if (Pred != ICmpInst::ICMP_EQ && Pred != ICmpInst::ICMP_NE)
+ return nullptr;
+
+ // Validate constraint #3: Ensure the null case just falls through.
+ if (SuccBB != (Pred == ICmpInst::ICMP_EQ ? TrueBB : FalseBB))
+ return nullptr;
+ assert(FreeInstrBB == (Pred == ICmpInst::ICMP_EQ ? FalseBB : TrueBB) &&
+ "Broken CFG: missing edge from predecessor to successor");
+
+ // At this point, we know that everything in FreeInstrBB can be moved
+ // before TI.
+ for (BasicBlock::iterator It = FreeInstrBB->begin(), End = FreeInstrBB->end();
+ It != End;) {
+ Instruction &Instr = *It++;
+ if (&Instr == FreeInstrBBTerminator)
+ break;
+ Instr.moveBefore(TI);
+ }
+ assert(FreeInstrBB->size() == 1 &&
+ "Only the branch instruction should remain");
+ return &FI;
+}
+
Instruction *InstCombinerImpl::visitFree(CallInst &FI) {
- Value *Op = FI.getArgOperand(0);
-
- // free undef -> unreachable.
- if (isa<UndefValue>(Op)) {
- // Leave a marker since we can't modify the CFG here.
- CreateNonTerminatorUnreachable(&FI);
- return eraseInstFromFunction(FI);
- }
-
- // If we have 'free null' delete the instruction. This can happen in stl code
- // when lots of inlining happens.
- if (isa<ConstantPointerNull>(Op))
- return eraseInstFromFunction(FI);
-
- // If we optimize for code size, try to move the call to free before the null
- // test so that SimplifyCFG can remove the empty block and dead code
- // elimination can remove the branch. I.e., this helps to turn something like:
- // if (foo) free(foo);
- // into
- // free(foo);
- //
- // Note that we can only do this for 'free' and not for any flavor of
- // 'operator delete'; there is no 'operator delete' symbol for which we are
- // permitted to invent a call, even if we're passing in a null pointer.
- if (MinimizeSize) {
- LibFunc Func;
- if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
- if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
- return I;
- }
-
- return nullptr;
-}
-
-static bool isMustTailCall(Value *V) {
- if (auto *CI = dyn_cast<CallInst>(V))
- return CI->isMustTailCall();
- return false;
-}
-
+ Value *Op = FI.getArgOperand(0);
+
+ // free undef -> unreachable.
+ if (isa<UndefValue>(Op)) {
+ // Leave a marker since we can't modify the CFG here.
+ CreateNonTerminatorUnreachable(&FI);
+ return eraseInstFromFunction(FI);
+ }
+
+ // If we have 'free null' delete the instruction. This can happen in stl code
+ // when lots of inlining happens.
+ if (isa<ConstantPointerNull>(Op))
+ return eraseInstFromFunction(FI);
+
+ // If we optimize for code size, try to move the call to free before the null
+ // test so that SimplifyCFG can remove the empty block and dead code
+ // elimination can remove the branch. I.e., this helps to turn something like:
+ // if (foo) free(foo);
+ // into
+ // free(foo);
+ //
+ // Note that we can only do this for 'free' and not for any flavor of
+ // 'operator delete'; there is no 'operator delete' symbol for which we are
+ // permitted to invent a call, even if we're passing in a null pointer.
+ if (MinimizeSize) {
+ LibFunc Func;
+ if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
+ return I;
+ }
+
+ return nullptr;
+}
+
+static bool isMustTailCall(Value *V) {
+ if (auto *CI = dyn_cast<CallInst>(V))
+ return CI->isMustTailCall();
+ return false;
+}
+
Instruction *InstCombinerImpl::visitReturnInst(ReturnInst &RI) {
- if (RI.getNumOperands() == 0) // ret void
- return nullptr;
-
- Value *ResultOp = RI.getOperand(0);
- Type *VTy = ResultOp->getType();
- if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
- return nullptr;
-
- // Don't replace result of musttail calls.
- if (isMustTailCall(ResultOp))
- return nullptr;
-
- // There might be assume intrinsics dominating this return that completely
- // determine the value. If so, constant fold it.
- KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
- if (Known.isConstant())
- return replaceOperand(RI, 0,
- Constant::getIntegerValue(VTy, Known.getConstant()));
-
- return nullptr;
-}
-
+ if (RI.getNumOperands() == 0) // ret void
+ return nullptr;
+
+ Value *ResultOp = RI.getOperand(0);
+ Type *VTy = ResultOp->getType();
+ if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
+ return nullptr;
+
+ // Don't replace result of musttail calls.
+ if (isMustTailCall(ResultOp))
+ return nullptr;
+
+ // There might be assume intrinsics dominating this return that completely
+ // determine the value. If so, constant fold it.
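+ // Illustrative sketch (hypothetical IR): given a dominating
+ //   %c = icmp eq i32 %x, 42
+ //   call void @llvm.assume(i1 %c)
+ // computeKnownBits can prove %x == 42, so 'ret i32 %x' becomes 'ret i32 42'.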
+ KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
+ if (Known.isConstant())
+ return replaceOperand(RI, 0,
+ Constant::getIntegerValue(VTy, Known.getConstant()));
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitUnreachableInst(UnreachableInst &I) {
// Try to remove the previous instruction if it must lead to unreachable.
// This includes instructions like stores and "llvm.assume" that may not get
@@ -2873,597 +2873,597 @@ Instruction *InstCombinerImpl::visitUnreachableInst(UnreachableInst &I) {
}
Instruction *InstCombinerImpl::visitUnconditionalBranchInst(BranchInst &BI) {
- assert(BI.isUnconditional() && "Only for unconditional branches.");
-
- // If this store is the second-to-last instruction in the basic block
- // (excluding debug info and bitcasts of pointers) and if the block ends with
- // an unconditional branch, try to move the store to the successor block.
-
- auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
- auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
- return isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
- };
-
- BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
- do {
- if (BBI != FirstInstr)
- --BBI;
- } while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
-
- return dyn_cast<StoreInst>(BBI);
- };
-
- if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
- if (mergeStoreIntoSuccessor(*SI))
- return &BI;
-
- return nullptr;
-}
-
+ assert(BI.isUnconditional() && "Only for unconditional branches.");
+
+ // If this store is the second-to-last instruction in the basic block
+ // (excluding debug info and bitcasts of pointers) and if the block ends with
+ // an unconditional branch, try to move the store to the successor block.
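+ //
+ // Illustrative sketch (hypothetical IR): with
+ //   bb1: store i32 1, i32* %p
+ //        br label %merge
+ //   bb2: store i32 2, i32* %p
+ //        br label %merge
+ // mergeStoreIntoSuccessor can replace both stores with a phi of the stored
+ // values and a single store in %merge, when its conditions hold.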
+
+ auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
+ auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
+ return isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
+ };
+
+ BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
+ do {
+ if (BBI != FirstInstr)
+ --BBI;
+ } while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
+
+ return dyn_cast<StoreInst>(BBI);
+ };
+
+ if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
+ if (mergeStoreIntoSuccessor(*SI))
+ return &BI;
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitBranchInst(BranchInst &BI) {
- if (BI.isUnconditional())
- return visitUnconditionalBranchInst(BI);
-
- // Change br (not X), label True, label False to: br X, label False, True
- Value *X = nullptr;
- if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
- !isa<Constant>(X)) {
- // Swap Destinations and condition...
- BI.swapSuccessors();
- return replaceOperand(BI, 0, X);
- }
-
- // If the condition is irrelevant, remove the use so that other
- // transforms on the condition become more effective.
- if (!isa<ConstantInt>(BI.getCondition()) &&
- BI.getSuccessor(0) == BI.getSuccessor(1))
- return replaceOperand(
- BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
-
- // Canonicalize, for example, fcmp_one -> fcmp_oeq.
- CmpInst::Predicate Pred;
- if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
- m_BasicBlock(), m_BasicBlock())) &&
- !isCanonicalPredicate(Pred)) {
- // Swap destinations and condition.
- CmpInst *Cond = cast<CmpInst>(BI.getCondition());
- Cond->setPredicate(CmpInst::getInversePredicate(Pred));
- BI.swapSuccessors();
- Worklist.push(Cond);
- return &BI;
- }
-
- return nullptr;
-}
-
+ if (BI.isUnconditional())
+ return visitUnconditionalBranchInst(BI);
+
+ // Change br (not X), label True, label False to: br X, label False, True
+ Value *X = nullptr;
+ if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
+ !isa<Constant>(X)) {
+ // Swap Destinations and condition...
+ BI.swapSuccessors();
+ return replaceOperand(BI, 0, X);
+ }
+
+ // If the condition is irrelevant, remove the use so that other
+ // transforms on the condition become more effective.
+ if (!isa<ConstantInt>(BI.getCondition()) &&
+ BI.getSuccessor(0) == BI.getSuccessor(1))
+ return replaceOperand(
+ BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
+
+ // Canonicalize, for example, fcmp_one -> fcmp_oeq.
+ CmpInst::Predicate Pred;
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
+ m_BasicBlock(), m_BasicBlock())) &&
+ !isCanonicalPredicate(Pred)) {
+ // Swap destinations and condition.
+ CmpInst *Cond = cast<CmpInst>(BI.getCondition());
+ Cond->setPredicate(CmpInst::getInversePredicate(Pred));
+ BI.swapSuccessors();
+ Worklist.push(Cond);
+ return &BI;
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitSwitchInst(SwitchInst &SI) {
- Value *Cond = SI.getCondition();
- Value *Op0;
- ConstantInt *AddRHS;
- if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
- // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
- for (auto Case : SI.cases()) {
- Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
- assert(isa<ConstantInt>(NewCase) &&
- "Result of expression should be constant");
- Case.setValue(cast<ConstantInt>(NewCase));
- }
- return replaceOperand(SI, 0, Op0);
- }
-
- KnownBits Known = computeKnownBits(Cond, 0, &SI);
- unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
- unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
-
- // Compute the number of leading bits we can ignore.
- // TODO: A better way to determine this would use ComputeNumSignBits().
- for (auto &C : SI.cases()) {
- LeadingKnownZeros = std::min(
- LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
- LeadingKnownOnes = std::min(
- LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
- }
-
- unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
-
- // Shrink the condition operand if the new type is smaller than the old type.
+ Value *Cond = SI.getCondition();
+ Value *Op0;
+ ConstantInt *AddRHS;
+ if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
+ // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
+ for (auto Case : SI.cases()) {
+ Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
+ assert(isa<ConstantInt>(NewCase) &&
+ "Result of expression should be constant");
+ Case.setValue(cast<ConstantInt>(NewCase));
+ }
+ return replaceOperand(SI, 0, Op0);
+ }
+
+ KnownBits Known = computeKnownBits(Cond, 0, &SI);
+ unsigned LeadingKnownZeros = Known.countMinLeadingZeros();
+ unsigned LeadingKnownOnes = Known.countMinLeadingOnes();
+
+ // Compute the number of leading bits we can ignore.
+ // TODO: A better way to determine this would use ComputeNumSignBits().
+ for (auto &C : SI.cases()) {
+ LeadingKnownZeros = std::min(
+ LeadingKnownZeros, C.getCaseValue()->getValue().countLeadingZeros());
+ LeadingKnownOnes = std::min(
+ LeadingKnownOnes, C.getCaseValue()->getValue().countLeadingOnes());
+ }
+
+ unsigned NewWidth = Known.getBitWidth() - std::max(LeadingKnownZeros, LeadingKnownOnes);
+
+ // Shrink the condition operand if the new type is smaller than the old type.
// But do not shrink to a non-standard type, because backend can't generate
- // good code for that yet.
- // TODO: We can make it aggressive again after fixing PR39569.
- if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
- shouldChangeType(Known.getBitWidth(), NewWidth)) {
- IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
- Builder.SetInsertPoint(&SI);
- Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
-
- for (auto Case : SI.cases()) {
- APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
- Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
- }
- return replaceOperand(SI, 0, NewCond);
- }
-
- return nullptr;
-}
-
+ // good code for that yet.
+ // TODO: We can make it aggressive again after fixing PR39569.
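+ // Illustrative sketch (hypothetical IR): for a switch on i64 %x where the
+ // known bits and all case values leave the top 32 bits zero, the condition
+ // is truncated:
+ //   %t = trunc i64 %x to i32
+ //   switch i32 %t, ...   ; each case value truncated to i32 as well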
+ if (NewWidth > 0 && NewWidth < Known.getBitWidth() &&
+ shouldChangeType(Known.getBitWidth(), NewWidth)) {
+ IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
+ Builder.SetInsertPoint(&SI);
+ Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
+
+ for (auto Case : SI.cases()) {
+ APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+ Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+ }
+ return replaceOperand(SI, 0, NewCond);
+ }
+
+ return nullptr;
+}
+
Instruction *InstCombinerImpl::visitExtractValueInst(ExtractValueInst &EV) {
- Value *Agg = EV.getAggregateOperand();
-
- if (!EV.hasIndices())
- return replaceInstUsesWith(EV, Agg);
-
- if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
- SQ.getWithInstruction(&EV)))
- return replaceInstUsesWith(EV, V);
-
- if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
- // We're extracting from an insertvalue instruction, compare the indices
- const unsigned *exti, *exte, *insi, *inse;
- for (exti = EV.idx_begin(), insi = IV->idx_begin(),
- exte = EV.idx_end(), inse = IV->idx_end();
- exti != exte && insi != inse;
- ++exti, ++insi) {
- if (*insi != *exti)
- // The insert and extract both reference distinctly different elements.
- // This means the extract is not influenced by the insert, and we can
- // replace the aggregate operand of the extract with the aggregate
- // operand of the insert. i.e., replace
- // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
- // %E = extractvalue { i32, { i32 } } %I, 0
- // with
- // %E = extractvalue { i32, { i32 } } %A, 0
- return ExtractValueInst::Create(IV->getAggregateOperand(),
- EV.getIndices());
- }
- if (exti == exte && insi == inse)
- // Both iterators are at the end: Index lists are identical. Replace
- // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
- // %C = extractvalue { i32, { i32 } } %B, 1, 0
- // with "i32 42"
- return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
- if (exti == exte) {
- // The extract list is a prefix of the insert list. i.e. replace
- // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
- // %E = extractvalue { i32, { i32 } } %I, 1
- // with
- // %X = extractvalue { i32, { i32 } } %A, 1
- // %E = insertvalue { i32 } %X, i32 42, 0
- // by switching the order of the insert and extract (though the
- // insertvalue should be left in, since it may have other uses).
- Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
- EV.getIndices());
- return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
- makeArrayRef(insi, inse));
- }
- if (insi == inse)
- // The insert list is a prefix of the extract list
- // We can simply remove the common indices from the extract and make it
- // operate on the inserted value instead of the insertvalue result.
- // i.e., replace
- // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
- // %E = extractvalue { i32, { i32 } } %I, 1, 0
- // with
-      //                         %E = extractvalue { i32 } { i32 42 }, 0
- return ExtractValueInst::Create(IV->getInsertedValueOperand(),
- makeArrayRef(exti, exte));
- }
- if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
- // We're extracting from an overflow intrinsic, see if we're the only user,
- // which allows us to simplify multiple result intrinsics to simpler
- // things that just get one value.
- if (WO->hasOneUse()) {
- // Check if we're grabbing only the result of a 'with overflow' intrinsic
- // and replace it with a traditional binary instruction.
- if (*EV.idx_begin() == 0) {
- Instruction::BinaryOps BinOp = WO->getBinaryOp();
- Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
- replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
- eraseInstFromFunction(*WO);
- return BinaryOperator::Create(BinOp, LHS, RHS);
- }
-
- // If the normal result of the add is dead, and the RHS is a constant,
- // we can transform this into a range comparison.
- // overflow = uadd a, -4 --> overflow = icmp ugt a, 3
- if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
- return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
- ConstantExpr::getNot(CI));
- }
- }
- if (LoadInst *L = dyn_cast<LoadInst>(Agg))
- // If the (non-volatile) load only has one use, we can rewrite this to a
- // load from a GEP. This reduces the size of the load. If a load is used
- // only by extractvalue instructions then this either must have been
- // optimized before, or it is a struct with padding, in which case we
- // don't want to do the transformation as it loses padding knowledge.
- if (L->isSimple() && L->hasOneUse()) {
- // extractvalue has integer indices, getelementptr has Value*s. Convert.
- SmallVector<Value*, 4> Indices;
- // Prefix an i32 0 since we need the first element.
- Indices.push_back(Builder.getInt32(0));
- for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
- I != E; ++I)
- Indices.push_back(Builder.getInt32(*I));
-
- // We need to insert these at the location of the old load, not at that of
- // the extractvalue.
- Builder.SetInsertPoint(L);
- Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
- L->getPointerOperand(), Indices);
- Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
-      // Whatever aliasing information we had for the original load must also
- // hold for the smaller load, so propagate the annotations.
- AAMDNodes Nodes;
- L->getAAMetadata(Nodes);
- NL->setAAMetadata(Nodes);
- // Returning the load directly will cause the main loop to insert it in
- // the wrong spot, so use replaceInstUsesWith().
- return replaceInstUsesWith(EV, NL);
- }
- // We could simplify extracts from other values. Note that nested extracts may
- // already be simplified implicitly by the above: extract (extract (insert) )
- // will be translated into extract ( insert ( extract ) ) first and then just
- // the value inserted, if appropriate. Similarly for extracts from single-use
- // loads: extract (extract (load)) will be translated to extract (load (gep))
- // and if again single-use then via load (gep (gep)) to load (gep).
- // However, double extracts from e.g. function arguments or return values
- // aren't handled yet.
- return nullptr;
-}
-
-/// Return 'true' if the given typeinfo will match anything.
-static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
- switch (Personality) {
- case EHPersonality::GNU_C:
- case EHPersonality::GNU_C_SjLj:
- case EHPersonality::Rust:
-    // The GCC C EH and Rust personalities only exist to support cleanups, so
- // it's not clear what the semantics of catch clauses are.
- return false;
- case EHPersonality::Unknown:
- return false;
- case EHPersonality::GNU_Ada:
- // While __gnat_all_others_value will match any Ada exception, it doesn't
- // match foreign exceptions (or didn't, before gcc-4.7).
- return false;
- case EHPersonality::GNU_CXX:
- case EHPersonality::GNU_CXX_SjLj:
- case EHPersonality::GNU_ObjC:
- case EHPersonality::MSVC_X86SEH:
+ Value *Agg = EV.getAggregateOperand();
+
+ if (!EV.hasIndices())
+ return replaceInstUsesWith(EV, Agg);
+
+ if (Value *V = SimplifyExtractValueInst(Agg, EV.getIndices(),
+ SQ.getWithInstruction(&EV)))
+ return replaceInstUsesWith(EV, V);
+
+ if (InsertValueInst *IV = dyn_cast<InsertValueInst>(Agg)) {
+ // We're extracting from an insertvalue instruction, compare the indices
+ const unsigned *exti, *exte, *insi, *inse;
+ for (exti = EV.idx_begin(), insi = IV->idx_begin(),
+ exte = EV.idx_end(), inse = IV->idx_end();
+ exti != exte && insi != inse;
+ ++exti, ++insi) {
+ if (*insi != *exti)
+ // The insert and extract both reference distinctly different elements.
+ // This means the extract is not influenced by the insert, and we can
+ // replace the aggregate operand of the extract with the aggregate
+ // operand of the insert. i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 0
+ // with
+ // %E = extractvalue { i32, { i32 } } %A, 0
+ return ExtractValueInst::Create(IV->getAggregateOperand(),
+ EV.getIndices());
+ }
+ if (exti == exte && insi == inse)
+ // Both iterators are at the end: Index lists are identical. Replace
+ // %B = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %C = extractvalue { i32, { i32 } } %B, 1, 0
+ // with "i32 42"
+ return replaceInstUsesWith(EV, IV->getInsertedValueOperand());
+ if (exti == exte) {
+ // The extract list is a prefix of the insert list. i.e. replace
+ // %I = insertvalue { i32, { i32 } } %A, i32 42, 1, 0
+ // %E = extractvalue { i32, { i32 } } %I, 1
+ // with
+ // %X = extractvalue { i32, { i32 } } %A, 1
+ // %E = insertvalue { i32 } %X, i32 42, 0
+ // by switching the order of the insert and extract (though the
+ // insertvalue should be left in, since it may have other uses).
+ Value *NewEV = Builder.CreateExtractValue(IV->getAggregateOperand(),
+ EV.getIndices());
+ return InsertValueInst::Create(NewEV, IV->getInsertedValueOperand(),
+ makeArrayRef(insi, inse));
+ }
+ if (insi == inse)
+ // The insert list is a prefix of the extract list
+ // We can simply remove the common indices from the extract and make it
+ // operate on the inserted value instead of the insertvalue result.
+ // i.e., replace
+ // %I = insertvalue { i32, { i32 } } %A, { i32 } { i32 42 }, 1
+ // %E = extractvalue { i32, { i32 } } %I, 1, 0
+ // with
+      //                         %E = extractvalue { i32 } { i32 42 }, 0
+ return ExtractValueInst::Create(IV->getInsertedValueOperand(),
+ makeArrayRef(exti, exte));
+ }
+ if (WithOverflowInst *WO = dyn_cast<WithOverflowInst>(Agg)) {
+ // We're extracting from an overflow intrinsic, see if we're the only user,
+ // which allows us to simplify multiple result intrinsics to simpler
+ // things that just get one value.
+ if (WO->hasOneUse()) {
+ // Check if we're grabbing only the result of a 'with overflow' intrinsic
+ // and replace it with a traditional binary instruction.
+ if (*EV.idx_begin() == 0) {
+ Instruction::BinaryOps BinOp = WO->getBinaryOp();
+ Value *LHS = WO->getLHS(), *RHS = WO->getRHS();
+ replaceInstUsesWith(*WO, UndefValue::get(WO->getType()));
+ eraseInstFromFunction(*WO);
+ return BinaryOperator::Create(BinOp, LHS, RHS);
+ }
+
+ // If the normal result of the add is dead, and the RHS is a constant,
+ // we can transform this into a range comparison.
+ // overflow = uadd a, -4 --> overflow = icmp ugt a, 3
+ if (WO->getIntrinsicID() == Intrinsic::uadd_with_overflow)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(WO->getRHS()))
+ return new ICmpInst(ICmpInst::ICMP_UGT, WO->getLHS(),
+ ConstantExpr::getNot(CI));
+ }
+ }
+ if (LoadInst *L = dyn_cast<LoadInst>(Agg))
+ // If the (non-volatile) load only has one use, we can rewrite this to a
+ // load from a GEP. This reduces the size of the load. If a load is used
+ // only by extractvalue instructions then this either must have been
+ // optimized before, or it is a struct with padding, in which case we
+ // don't want to do the transformation as it loses padding knowledge.
+ if (L->isSimple() && L->hasOneUse()) {
+ // extractvalue has integer indices, getelementptr has Value*s. Convert.
+ SmallVector<Value*, 4> Indices;
+ // Prefix an i32 0 since we need the first element.
+ Indices.push_back(Builder.getInt32(0));
+ for (ExtractValueInst::idx_iterator I = EV.idx_begin(), E = EV.idx_end();
+ I != E; ++I)
+ Indices.push_back(Builder.getInt32(*I));
+
+ // We need to insert these at the location of the old load, not at that of
+ // the extractvalue.
+ Builder.SetInsertPoint(L);
+ Value *GEP = Builder.CreateInBoundsGEP(L->getType(),
+ L->getPointerOperand(), Indices);
+ Instruction *NL = Builder.CreateLoad(EV.getType(), GEP);
+      // Whatever aliasing information we had for the original load must also
+ // hold for the smaller load, so propagate the annotations.
+ AAMDNodes Nodes;
+ L->getAAMetadata(Nodes);
+ NL->setAAMetadata(Nodes);
+ // Returning the load directly will cause the main loop to insert it in
+ // the wrong spot, so use replaceInstUsesWith().
+ return replaceInstUsesWith(EV, NL);
+ }
+ // We could simplify extracts from other values. Note that nested extracts may
+ // already be simplified implicitly by the above: extract (extract (insert) )
+ // will be translated into extract ( insert ( extract ) ) first and then just
+ // the value inserted, if appropriate. Similarly for extracts from single-use
+ // loads: extract (extract (load)) will be translated to extract (load (gep))
+ // and if again single-use then via load (gep (gep)) to load (gep).
+ // However, double extracts from e.g. function arguments or return values
+ // aren't handled yet.
+ return nullptr;
+}
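The load-narrowing case above converts extractvalue indices into GEP indices by prefixing a zero index. A minimal plain-C++ sketch of that conversion (hypothetical helper; the pass builds i32 ConstantInts for the GEP instead of raw integers):

    #include <cstdint>
    #include <vector>

    // Sketch: an extractvalue's integer indices become GEP indices by
    // prefixing a 0 that selects the pointed-to aggregate itself.
    std::vector<uint64_t>
    extractValueIndicesToGEPIndices(const std::vector<uint64_t> &EVIndices) {
      std::vector<uint64_t> GEPIndices;
      GEPIndices.reserve(EVIndices.size() + 1);
      GEPIndices.push_back(0); // step through the pointer first
      GEPIndices.insert(GEPIndices.end(), EVIndices.begin(), EVIndices.end());
      return GEPIndices;
    }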
+
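The uadd.with.overflow fold above (overflow = uadd a, C --> icmp ugt a, ~C) rests on a simple unsigned identity: a + C wraps exactly when a > ~C, i.e. a exceeds the largest value that still fits. A small self-contained check of that identity at 8 bits (plain C++, not LLVM code):

    #include <cassert>

    // Exhaustively verify: for 8-bit unsigned A and C, (A + C) wraps modulo
    // 256 exactly when A > ~C (that is, A > 255 - C). This is the identity
    // behind replacing the overflow bit of uadd.with.overflow(a, C) with
    // icmp ugt a, ~C.
    int main() {
      for (unsigned A = 0; A <= 255; ++A)
        for (unsigned C = 0; C <= 255; ++C) {
          bool Wraps = ((A + C) & 0xFF) != A + C;
          bool Predicted = A > (0xFF ^ C); // A > ~C in 8 bits
          assert(Wraps == Predicted);
        }
      return 0;
    }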
+/// Return 'true' if the given typeinfo will match anything.
+static bool isCatchAll(EHPersonality Personality, Constant *TypeInfo) {
+ switch (Personality) {
+ case EHPersonality::GNU_C:
+ case EHPersonality::GNU_C_SjLj:
+ case EHPersonality::Rust:
+    // The GCC C EH and Rust personalities only exist to support cleanups, so
+ // it's not clear what the semantics of catch clauses are.
+ return false;
+ case EHPersonality::Unknown:
+ return false;
+ case EHPersonality::GNU_Ada:
+ // While __gnat_all_others_value will match any Ada exception, it doesn't
+ // match foreign exceptions (or didn't, before gcc-4.7).
+ return false;
+ case EHPersonality::GNU_CXX:
+ case EHPersonality::GNU_CXX_SjLj:
+ case EHPersonality::GNU_ObjC:
+ case EHPersonality::MSVC_X86SEH:
case EHPersonality::MSVC_TableSEH:
- case EHPersonality::MSVC_CXX:
- case EHPersonality::CoreCLR:
- case EHPersonality::Wasm_CXX:
+ case EHPersonality::MSVC_CXX:
+ case EHPersonality::CoreCLR:
+ case EHPersonality::Wasm_CXX:
case EHPersonality::XL_CXX:
- return TypeInfo->isNullValue();
- }
- llvm_unreachable("invalid enum");
-}
-
-static bool shorter_filter(const Value *LHS, const Value *RHS) {
- return
- cast<ArrayType>(LHS->getType())->getNumElements()
- <
- cast<ArrayType>(RHS->getType())->getNumElements();
-}
-
+ return TypeInfo->isNullValue();
+ }
+ llvm_unreachable("invalid enum");
+}
+
+static bool shorter_filter(const Value *LHS, const Value *RHS) {
+ return
+ cast<ArrayType>(LHS->getType())->getNumElements()
+ <
+ cast<ArrayType>(RHS->getType())->getNumElements();
+}
+
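shorter_filter above is only a length comparator; visitLandingPadInst below stable-sorts each consecutive run of filter clauses with it so shorter, more-likely-to-match filters come first. A minimal plain-vector sketch of that step (hypothetical types; the real code compares ArrayType element counts):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using Filter = std::vector<int>; // stand-in for a filter clause

    // Stable-sort the run of filters in [Begin, End) by element count, keeping
    // the relative order of equally sized filters to avoid pointless churn.
    void sortFilterRun(std::vector<Filter> &Clauses, std::size_t Begin,
                       std::size_t End) {
      std::stable_sort(Clauses.begin() + Begin, Clauses.begin() + End,
                       [](const Filter &L, const Filter &R) {
                         return L.size() < R.size();
                       });
    }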
Instruction *InstCombinerImpl::visitLandingPadInst(LandingPadInst &LI) {
- // The logic here should be correct for any real-world personality function.
- // However if that turns out not to be true, the offending logic can always
- // be conditioned on the personality function, like the catch-all logic is.
- EHPersonality Personality =
- classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
-
-  // Simplify the list of clauses, e.g. by removing repeated catch clauses
- // (these are often created by inlining).
- bool MakeNewInstruction = false; // If true, recreate using the following:
- SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
- bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
-
- SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
- for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
- bool isLastClause = i + 1 == e;
- if (LI.isCatch(i)) {
- // A catch clause.
- Constant *CatchClause = LI.getClause(i);
- Constant *TypeInfo = CatchClause->stripPointerCasts();
-
- // If we already saw this clause, there is no point in having a second
- // copy of it.
- if (AlreadyCaught.insert(TypeInfo).second) {
- // This catch clause was not already seen.
- NewClauses.push_back(CatchClause);
- } else {
- // Repeated catch clause - drop the redundant copy.
- MakeNewInstruction = true;
- }
-
- // If this is a catch-all then there is no point in keeping any following
- // clauses or marking the landingpad as having a cleanup.
- if (isCatchAll(Personality, TypeInfo)) {
- if (!isLastClause)
- MakeNewInstruction = true;
- CleanupFlag = false;
- break;
- }
- } else {
- // A filter clause. If any of the filter elements were already caught
- // then they can be dropped from the filter. It is tempting to try to
- // exploit the filter further by saying that any typeinfo that does not
- // occur in the filter can't be caught later (and thus can be dropped).
- // However this would be wrong, since typeinfos can match without being
- // equal (for example if one represents a C++ class, and the other some
- // class derived from it).
- assert(LI.isFilter(i) && "Unsupported landingpad clause!");
- Constant *FilterClause = LI.getClause(i);
- ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
- unsigned NumTypeInfos = FilterType->getNumElements();
-
- // An empty filter catches everything, so there is no point in keeping any
- // following clauses or marking the landingpad as having a cleanup. By
- // dealing with this case here the following code is made a bit simpler.
- if (!NumTypeInfos) {
- NewClauses.push_back(FilterClause);
- if (!isLastClause)
- MakeNewInstruction = true;
- CleanupFlag = false;
- break;
- }
-
- bool MakeNewFilter = false; // If true, make a new filter.
- SmallVector<Constant *, 16> NewFilterElts; // New elements.
- if (isa<ConstantAggregateZero>(FilterClause)) {
- // Not an empty filter - it contains at least one null typeinfo.
- assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
- Constant *TypeInfo =
- Constant::getNullValue(FilterType->getElementType());
- // If this typeinfo is a catch-all then the filter can never match.
- if (isCatchAll(Personality, TypeInfo)) {
- // Throw the filter away.
- MakeNewInstruction = true;
- continue;
- }
-
- // There is no point in having multiple copies of this typeinfo, so
- // discard all but the first copy if there is more than one.
- NewFilterElts.push_back(TypeInfo);
- if (NumTypeInfos > 1)
- MakeNewFilter = true;
- } else {
- ConstantArray *Filter = cast<ConstantArray>(FilterClause);
- SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
- NewFilterElts.reserve(NumTypeInfos);
-
- // Remove any filter elements that were already caught or that already
- // occurred in the filter. While there, see if any of the elements are
- // catch-alls. If so, the filter can be discarded.
- bool SawCatchAll = false;
- for (unsigned j = 0; j != NumTypeInfos; ++j) {
- Constant *Elt = Filter->getOperand(j);
- Constant *TypeInfo = Elt->stripPointerCasts();
- if (isCatchAll(Personality, TypeInfo)) {
- // This element is a catch-all. Bail out, noting this fact.
- SawCatchAll = true;
- break;
- }
-
- // Even if we've seen a type in a catch clause, we don't want to
- // remove it from the filter. An unexpected type handler may be
- // set up for a call site which throws an exception of the same
- // type caught. In order for the exception thrown by the unexpected
- // handler to propagate correctly, the filter must be correctly
- // described for the call site.
- //
- // Example:
- //
- // void unexpected() { throw 1;}
- // void foo() throw (int) {
- // std::set_unexpected(unexpected);
- // try {
- // throw 2.0;
- // } catch (int i) {}
- // }
-
- // There is no point in having multiple copies of the same typeinfo in
- // a filter, so only add it if we didn't already.
- if (SeenInFilter.insert(TypeInfo).second)
- NewFilterElts.push_back(cast<Constant>(Elt));
- }
- // A filter containing a catch-all cannot match anything by definition.
- if (SawCatchAll) {
- // Throw the filter away.
- MakeNewInstruction = true;
- continue;
- }
-
- // If we dropped something from the filter, make a new one.
- if (NewFilterElts.size() < NumTypeInfos)
- MakeNewFilter = true;
- }
- if (MakeNewFilter) {
- FilterType = ArrayType::get(FilterType->getElementType(),
- NewFilterElts.size());
- FilterClause = ConstantArray::get(FilterType, NewFilterElts);
- MakeNewInstruction = true;
- }
-
- NewClauses.push_back(FilterClause);
-
- // If the new filter is empty then it will catch everything so there is
- // no point in keeping any following clauses or marking the landingpad
- // as having a cleanup. The case of the original filter being empty was
- // already handled above.
- if (MakeNewFilter && !NewFilterElts.size()) {
- assert(MakeNewInstruction && "New filter but not a new instruction!");
- CleanupFlag = false;
- break;
- }
- }
- }
-
- // If several filters occur in a row then reorder them so that the shortest
- // filters come first (those with the smallest number of elements). This is
- // advantageous because shorter filters are more likely to match, speeding up
- // unwinding, but mostly because it increases the effectiveness of the other
- // filter optimizations below.
- for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
- unsigned j;
- // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
- for (j = i; j != e; ++j)
- if (!isa<ArrayType>(NewClauses[j]->getType()))
- break;
-
- // Check whether the filters are already sorted by length. We need to know
- // if sorting them is actually going to do anything so that we only make a
- // new landingpad instruction if it does.
- for (unsigned k = i; k + 1 < j; ++k)
- if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
- // Not sorted, so sort the filters now. Doing an unstable sort would be
- // correct too but reordering filters pointlessly might confuse users.
- std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
- shorter_filter);
- MakeNewInstruction = true;
- break;
- }
-
- // Look for the next batch of filters.
- i = j + 1;
- }
-
- // If typeinfos matched if and only if equal, then the elements of a filter L
- // that occurs later than a filter F could be replaced by the intersection of
- // the elements of F and L. In reality two typeinfos can match without being
- // equal (for example if one represents a C++ class, and the other some class
- // derived from it) so it would be wrong to perform this transform in general.
- // However the transform is correct and useful if F is a subset of L. In that
- // case L can be replaced by F, and thus removed altogether since repeating a
- // filter is pointless. So here we look at all pairs of filters F and L where
- // L follows F in the list of clauses, and remove L if every element of F is
- // an element of L. This can occur when inlining C++ functions with exception
- // specifications.
- for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
- // Examine each filter in turn.
- Value *Filter = NewClauses[i];
- ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
- if (!FTy)
- // Not a filter - skip it.
- continue;
- unsigned FElts = FTy->getNumElements();
- // Examine each filter following this one. Doing this backwards means that
- // we don't have to worry about filters disappearing under us when removed.
- for (unsigned j = NewClauses.size() - 1; j != i; --j) {
- Value *LFilter = NewClauses[j];
- ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
- if (!LTy)
- // Not a filter - skip it.
- continue;
- // If Filter is a subset of LFilter, i.e. every element of Filter is also
- // an element of LFilter, then discard LFilter.
- SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
- // If Filter is empty then it is a subset of LFilter.
- if (!FElts) {
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- // Move on to the next filter.
- continue;
- }
- unsigned LElts = LTy->getNumElements();
- // If Filter is longer than LFilter then it cannot be a subset of it.
- if (FElts > LElts)
- // Move on to the next filter.
- continue;
- // At this point we know that LFilter has at least one element.
- if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
- // Filter is a subset of LFilter iff Filter contains only zeros (as we
- // already know that Filter is not longer than LFilter).
- if (isa<ConstantAggregateZero>(Filter)) {
- assert(FElts <= LElts && "Should have handled this case earlier!");
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- }
- // Move on to the next filter.
- continue;
- }
- ConstantArray *LArray = cast<ConstantArray>(LFilter);
- if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
- // Since Filter is non-empty and contains only zeros, it is a subset of
- // LFilter iff LFilter contains a zero.
- assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
- for (unsigned l = 0; l != LElts; ++l)
- if (LArray->getOperand(l)->isNullValue()) {
- // LFilter contains a zero - discard it.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- break;
- }
- // Move on to the next filter.
- continue;
- }
- // At this point we know that both filters are ConstantArrays. Loop over
- // operands to see whether every element of Filter is also an element of
- // LFilter. Since filters tend to be short this is probably faster than
- // using a method that scales nicely.
- ConstantArray *FArray = cast<ConstantArray>(Filter);
- bool AllFound = true;
- for (unsigned f = 0; f != FElts; ++f) {
- Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
- AllFound = false;
- for (unsigned l = 0; l != LElts; ++l) {
- Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
- if (LTypeInfo == FTypeInfo) {
- AllFound = true;
- break;
- }
- }
- if (!AllFound)
- break;
- }
- if (AllFound) {
- // Discard LFilter.
- NewClauses.erase(J);
- MakeNewInstruction = true;
- }
- // Move on to the next filter.
- }
- }
-
- // If we changed any of the clauses, replace the old landingpad instruction
- // with a new one.
- if (MakeNewInstruction) {
- LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
- NewClauses.size());
- for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
- NLI->addClause(NewClauses[i]);
- // A landing pad with no clauses must have the cleanup flag set. It is
- // theoretically possible, though highly unlikely, that we eliminated all
- // clauses. If so, force the cleanup flag to true.
- if (NewClauses.empty())
- CleanupFlag = true;
- NLI->setCleanup(CleanupFlag);
- return NLI;
- }
-
- // Even if none of the clauses changed, we may nonetheless have understood
- // that the cleanup flag is pointless. Clear it if so.
- if (LI.isCleanup() != CleanupFlag) {
- assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
- LI.setCleanup(CleanupFlag);
- return &LI;
- }
-
- return nullptr;
-}
-
+ // The logic here should be correct for any real-world personality function.
+ // However if that turns out not to be true, the offending logic can always
+ // be conditioned on the personality function, like the catch-all logic is.
+ EHPersonality Personality =
+ classifyEHPersonality(LI.getParent()->getParent()->getPersonalityFn());
+
+  // Simplify the list of clauses, e.g. by removing repeated catch clauses
+ // (these are often created by inlining).
+ bool MakeNewInstruction = false; // If true, recreate using the following:
+ SmallVector<Constant *, 16> NewClauses; // - Clauses for the new instruction;
+ bool CleanupFlag = LI.isCleanup(); // - The new instruction is a cleanup.
+
+ SmallPtrSet<Value *, 16> AlreadyCaught; // Typeinfos known caught already.
+ for (unsigned i = 0, e = LI.getNumClauses(); i != e; ++i) {
+ bool isLastClause = i + 1 == e;
+ if (LI.isCatch(i)) {
+ // A catch clause.
+ Constant *CatchClause = LI.getClause(i);
+ Constant *TypeInfo = CatchClause->stripPointerCasts();
+
+ // If we already saw this clause, there is no point in having a second
+ // copy of it.
+ if (AlreadyCaught.insert(TypeInfo).second) {
+ // This catch clause was not already seen.
+ NewClauses.push_back(CatchClause);
+ } else {
+ // Repeated catch clause - drop the redundant copy.
+ MakeNewInstruction = true;
+ }
+
+ // If this is a catch-all then there is no point in keeping any following
+ // clauses or marking the landingpad as having a cleanup.
+ if (isCatchAll(Personality, TypeInfo)) {
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+ } else {
+ // A filter clause. If any of the filter elements were already caught
+ // then they can be dropped from the filter. It is tempting to try to
+ // exploit the filter further by saying that any typeinfo that does not
+ // occur in the filter can't be caught later (and thus can be dropped).
+ // However this would be wrong, since typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some
+ // class derived from it).
+ assert(LI.isFilter(i) && "Unsupported landingpad clause!");
+ Constant *FilterClause = LI.getClause(i);
+ ArrayType *FilterType = cast<ArrayType>(FilterClause->getType());
+ unsigned NumTypeInfos = FilterType->getNumElements();
+
+ // An empty filter catches everything, so there is no point in keeping any
+ // following clauses or marking the landingpad as having a cleanup. By
+ // dealing with this case here the following code is made a bit simpler.
+ if (!NumTypeInfos) {
+ NewClauses.push_back(FilterClause);
+ if (!isLastClause)
+ MakeNewInstruction = true;
+ CleanupFlag = false;
+ break;
+ }
+
+ bool MakeNewFilter = false; // If true, make a new filter.
+ SmallVector<Constant *, 16> NewFilterElts; // New elements.
+ if (isa<ConstantAggregateZero>(FilterClause)) {
+ // Not an empty filter - it contains at least one null typeinfo.
+ assert(NumTypeInfos > 0 && "Should have handled empty filter already!");
+ Constant *TypeInfo =
+ Constant::getNullValue(FilterType->getElementType());
+ // If this typeinfo is a catch-all then the filter can never match.
+ if (isCatchAll(Personality, TypeInfo)) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // There is no point in having multiple copies of this typeinfo, so
+ // discard all but the first copy if there is more than one.
+ NewFilterElts.push_back(TypeInfo);
+ if (NumTypeInfos > 1)
+ MakeNewFilter = true;
+ } else {
+ ConstantArray *Filter = cast<ConstantArray>(FilterClause);
+ SmallPtrSet<Value *, 16> SeenInFilter; // For uniquing the elements.
+ NewFilterElts.reserve(NumTypeInfos);
+
+ // Remove any filter elements that were already caught or that already
+ // occurred in the filter. While there, see if any of the elements are
+ // catch-alls. If so, the filter can be discarded.
+ bool SawCatchAll = false;
+ for (unsigned j = 0; j != NumTypeInfos; ++j) {
+ Constant *Elt = Filter->getOperand(j);
+ Constant *TypeInfo = Elt->stripPointerCasts();
+ if (isCatchAll(Personality, TypeInfo)) {
+ // This element is a catch-all. Bail out, noting this fact.
+ SawCatchAll = true;
+ break;
+ }
+
+ // Even if we've seen a type in a catch clause, we don't want to
+ // remove it from the filter. An unexpected type handler may be
+ // set up for a call site which throws an exception of the same
+ // type caught. In order for the exception thrown by the unexpected
+ // handler to propagate correctly, the filter must be correctly
+ // described for the call site.
+ //
+ // Example:
+ //
+ // void unexpected() { throw 1;}
+ // void foo() throw (int) {
+ // std::set_unexpected(unexpected);
+ // try {
+ // throw 2.0;
+ // } catch (int i) {}
+ // }
+
+ // There is no point in having multiple copies of the same typeinfo in
+ // a filter, so only add it if we didn't already.
+ if (SeenInFilter.insert(TypeInfo).second)
+ NewFilterElts.push_back(cast<Constant>(Elt));
+ }
+ // A filter containing a catch-all cannot match anything by definition.
+ if (SawCatchAll) {
+ // Throw the filter away.
+ MakeNewInstruction = true;
+ continue;
+ }
+
+ // If we dropped something from the filter, make a new one.
+ if (NewFilterElts.size() < NumTypeInfos)
+ MakeNewFilter = true;
+ }
+ if (MakeNewFilter) {
+ FilterType = ArrayType::get(FilterType->getElementType(),
+ NewFilterElts.size());
+ FilterClause = ConstantArray::get(FilterType, NewFilterElts);
+ MakeNewInstruction = true;
+ }
+
+ NewClauses.push_back(FilterClause);
+
+ // If the new filter is empty then it will catch everything so there is
+ // no point in keeping any following clauses or marking the landingpad
+ // as having a cleanup. The case of the original filter being empty was
+ // already handled above.
+ if (MakeNewFilter && !NewFilterElts.size()) {
+ assert(MakeNewInstruction && "New filter but not a new instruction!");
+ CleanupFlag = false;
+ break;
+ }
+ }
+ }
+
+ // If several filters occur in a row then reorder them so that the shortest
+ // filters come first (those with the smallest number of elements). This is
+ // advantageous because shorter filters are more likely to match, speeding up
+ // unwinding, but mostly because it increases the effectiveness of the other
+ // filter optimizations below.
+ for (unsigned i = 0, e = NewClauses.size(); i + 1 < e; ) {
+ unsigned j;
+ // Find the maximal 'j' s.t. the range [i, j) consists entirely of filters.
+ for (j = i; j != e; ++j)
+ if (!isa<ArrayType>(NewClauses[j]->getType()))
+ break;
+
+ // Check whether the filters are already sorted by length. We need to know
+ // if sorting them is actually going to do anything so that we only make a
+ // new landingpad instruction if it does.
+ for (unsigned k = i; k + 1 < j; ++k)
+ if (shorter_filter(NewClauses[k+1], NewClauses[k])) {
+ // Not sorted, so sort the filters now. Doing an unstable sort would be
+ // correct too but reordering filters pointlessly might confuse users.
+ std::stable_sort(NewClauses.begin() + i, NewClauses.begin() + j,
+ shorter_filter);
+ MakeNewInstruction = true;
+ break;
+ }
+
+ // Look for the next batch of filters.
+ i = j + 1;
+ }
+
+ // If typeinfos matched if and only if equal, then the elements of a filter L
+ // that occurs later than a filter F could be replaced by the intersection of
+ // the elements of F and L. In reality two typeinfos can match without being
+ // equal (for example if one represents a C++ class, and the other some class
+ // derived from it) so it would be wrong to perform this transform in general.
+ // However the transform is correct and useful if F is a subset of L. In that
+ // case L can be replaced by F, and thus removed altogether since repeating a
+ // filter is pointless. So here we look at all pairs of filters F and L where
+ // L follows F in the list of clauses, and remove L if every element of F is
+ // an element of L. This can occur when inlining C++ functions with exception
+ // specifications.
+ for (unsigned i = 0; i + 1 < NewClauses.size(); ++i) {
+ // Examine each filter in turn.
+ Value *Filter = NewClauses[i];
+ ArrayType *FTy = dyn_cast<ArrayType>(Filter->getType());
+ if (!FTy)
+ // Not a filter - skip it.
+ continue;
+ unsigned FElts = FTy->getNumElements();
+ // Examine each filter following this one. Doing this backwards means that
+ // we don't have to worry about filters disappearing under us when removed.
+ for (unsigned j = NewClauses.size() - 1; j != i; --j) {
+ Value *LFilter = NewClauses[j];
+ ArrayType *LTy = dyn_cast<ArrayType>(LFilter->getType());
+ if (!LTy)
+ // Not a filter - skip it.
+ continue;
+ // If Filter is a subset of LFilter, i.e. every element of Filter is also
+ // an element of LFilter, then discard LFilter.
+ SmallVectorImpl<Constant *>::iterator J = NewClauses.begin() + j;
+ // If Filter is empty then it is a subset of LFilter.
+ if (!FElts) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ // Move on to the next filter.
+ continue;
+ }
+ unsigned LElts = LTy->getNumElements();
+ // If Filter is longer than LFilter then it cannot be a subset of it.
+ if (FElts > LElts)
+ // Move on to the next filter.
+ continue;
+ // At this point we know that LFilter has at least one element.
+ if (isa<ConstantAggregateZero>(LFilter)) { // LFilter only contains zeros.
+ // Filter is a subset of LFilter iff Filter contains only zeros (as we
+ // already know that Filter is not longer than LFilter).
+ if (isa<ConstantAggregateZero>(Filter)) {
+ assert(FElts <= LElts && "Should have handled this case earlier!");
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ ConstantArray *LArray = cast<ConstantArray>(LFilter);
+ if (isa<ConstantAggregateZero>(Filter)) { // Filter only contains zeros.
+ // Since Filter is non-empty and contains only zeros, it is a subset of
+ // LFilter iff LFilter contains a zero.
+ assert(FElts > 0 && "Should have eliminated the empty filter earlier!");
+ for (unsigned l = 0; l != LElts; ++l)
+ if (LArray->getOperand(l)->isNullValue()) {
+ // LFilter contains a zero - discard it.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ break;
+ }
+ // Move on to the next filter.
+ continue;
+ }
+ // At this point we know that both filters are ConstantArrays. Loop over
+ // operands to see whether every element of Filter is also an element of
+ // LFilter. Since filters tend to be short this is probably faster than
+ // using a method that scales nicely.
+ ConstantArray *FArray = cast<ConstantArray>(Filter);
+ bool AllFound = true;
+ for (unsigned f = 0; f != FElts; ++f) {
+ Value *FTypeInfo = FArray->getOperand(f)->stripPointerCasts();
+ AllFound = false;
+ for (unsigned l = 0; l != LElts; ++l) {
+ Value *LTypeInfo = LArray->getOperand(l)->stripPointerCasts();
+ if (LTypeInfo == FTypeInfo) {
+ AllFound = true;
+ break;
+ }
+ }
+ if (!AllFound)
+ break;
+ }
+ if (AllFound) {
+ // Discard LFilter.
+ NewClauses.erase(J);
+ MakeNewInstruction = true;
+ }
+ // Move on to the next filter.
+ }
+ }
+
+ // If we changed any of the clauses, replace the old landingpad instruction
+ // with a new one.
+ if (MakeNewInstruction) {
+ LandingPadInst *NLI = LandingPadInst::Create(LI.getType(),
+ NewClauses.size());
+ for (unsigned i = 0, e = NewClauses.size(); i != e; ++i)
+ NLI->addClause(NewClauses[i]);
+ // A landing pad with no clauses must have the cleanup flag set. It is
+ // theoretically possible, though highly unlikely, that we eliminated all
+ // clauses. If so, force the cleanup flag to true.
+ if (NewClauses.empty())
+ CleanupFlag = true;
+ NLI->setCleanup(CleanupFlag);
+ return NLI;
+ }
+
+ // Even if none of the clauses changed, we may nonetheless have understood
+ // that the cleanup flag is pointless. Clear it if so.
+ if (LI.isCleanup() != CleanupFlag) {
+ assert(!CleanupFlag && "Adding a cleanup, not removing one?!");
+ LI.setCleanup(CleanupFlag);
+ return &LI;
+ }
+
+ return nullptr;
+}
+
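The pairwise filter cleanup in visitLandingPadInst above drops a later filter L whenever an earlier filter F is a subset of it, since landing on F already covers anything L could add. A minimal plain-C++ sketch of that rule (hypothetical types; the real code walks ConstantArray operands and treats the all-zero filters specially):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    using TypeInfoFilter = std::vector<int>; // stand-in for stripped typeinfos

    static bool isSubsetOf(const TypeInfoFilter &F, const TypeInfoFilter &L) {
      if (F.size() > L.size())
        return false; // mirrors the FElts > LElts quick reject
      return std::all_of(F.begin(), F.end(), [&](int TI) {
        return std::find(L.begin(), L.end(), TI) != L.end();
      });
    }

    // Walk later filters backwards (as the pass does) so that erasing an entry
    // does not disturb the indices still to be visited.
    void dropRedundantFilters(std::vector<TypeInfoFilter> &Filters) {
      for (std::size_t I = 0; I + 1 < Filters.size(); ++I)
        for (std::size_t J = Filters.size() - 1; J > I; --J)
          if (isSubsetOf(Filters[I], Filters[J]))
            Filters.erase(Filters.begin() + J);
    }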
Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
- Value *Op0 = I.getOperand(0);
-
- if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
- return replaceInstUsesWith(I, V);
-
+ Value *Op0 = I.getOperand(0);
+
+ if (Value *V = SimplifyFreezeInst(Op0, SQ.getWithInstruction(&I)))
+ return replaceInstUsesWith(I, V);
+
// freeze (phi const, x) --> phi const, (freeze x)
if (auto *PN = dyn_cast<PHINode>(Op0)) {
if (Instruction *NV = foldOpIntoPhi(I, PN))
@@ -3498,237 +3498,237 @@ Instruction *InstCombinerImpl::visitFreeze(FreezeInst &I) {
return replaceInstUsesWith(I, BestValue);
}
- return nullptr;
-}
-
-/// Try to move the specified instruction from its current block into the
-/// beginning of DestBlock, which can only happen if it's safe to move the
-/// instruction past all of the instructions between it and the end of its
-/// block.
-static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
- assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
- BasicBlock *SrcBlock = I->getParent();
-
-  // Cannot move control-flow-involving instructions, volatile loads, vaarg, etc.
- if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
- I->isTerminator())
- return false;
-
- // Do not sink static or dynamic alloca instructions. Static allocas must
- // remain in the entry block, and dynamic allocas must not be sunk in between
- // a stacksave / stackrestore pair, which would incorrectly shorten its
- // lifetime.
- if (isa<AllocaInst>(I))
- return false;
-
- // Do not sink into catchswitch blocks.
- if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
- return false;
-
- // Do not sink convergent call instructions.
- if (auto *CI = dyn_cast<CallInst>(I)) {
- if (CI->isConvergent())
- return false;
- }
- // We can only sink load instructions if there is nothing between the load and
- // the end of block that could change the value.
- if (I->mayReadFromMemory()) {
- // We don't want to do any sophisticated alias analysis, so we only check
- // the instructions after I in I's parent block if we try to sink to its
- // successor block.
- if (DestBlock->getUniquePredecessor() != I->getParent())
- return false;
- for (BasicBlock::iterator Scan = I->getIterator(),
- E = I->getParent()->end();
- Scan != E; ++Scan)
- if (Scan->mayWriteToMemory())
- return false;
- }
-
- I->dropDroppableUses([DestBlock](const Use *U) {
- if (auto *I = dyn_cast<Instruction>(U->getUser()))
- return I->getParent() != DestBlock;
- return true;
- });
- /// FIXME: We could remove droppable uses that are not dominated by
- /// the new position.
-
- BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
- I->moveBefore(&*InsertPos);
- ++NumSunkInst;
-
-  // Also sink all related debug uses from the source basic block. Otherwise we
-  // get a debug use before the def. Attempt to salvage debug uses first, to
-  // maximise the range over which variables have a location. If we cannot
-  // salvage a use, mark its location undef: we know it was supposed to receive
-  // a new location here, but that computation has been sunk.
- SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
- findDbgUsers(DbgUsers, I);
-
- // Update the arguments of a dbg.declare instruction, so that it
- // does not point into a sunk instruction.
- auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
- if (!isa<DbgDeclareInst>(DII))
- return false;
-
- if (isa<CastInst>(I))
- DII->setOperand(
- 0, MetadataAsValue::get(I->getContext(),
- ValueAsMetadata::get(I->getOperand(0))));
- return true;
- };
-
- SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
- for (auto User : DbgUsers) {
- // A dbg.declare instruction should not be cloned, since there can only be
- // one per variable fragment. It should be left in the original place
- // because the sunk instruction is not an alloca (otherwise we could not be
- // here).
- if (User->getParent() != SrcBlock || updateDbgDeclare(User))
- continue;
-
- DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
- LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
- }
-
- // Perform salvaging without the clones, then sink the clones.
- if (!DIIClones.empty()) {
- salvageDebugInfoForDbgValues(*I, DbgUsers);
- for (auto &DIIClone : DIIClones) {
- DIIClone->insertBefore(&*InsertPos);
- LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
- }
- }
-
- return true;
-}
-
+ return nullptr;
+}
+
+/// Try to move the specified instruction from its current block into the
+/// beginning of DestBlock, which can only happen if it's safe to move the
+/// instruction past all of the instructions between it and the end of its
+/// block.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+ assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
+ BasicBlock *SrcBlock = I->getParent();
+
+  // Cannot move control-flow-involving instructions, volatile loads, vaarg, etc.
+ if (isa<PHINode>(I) || I->isEHPad() || I->mayHaveSideEffects() ||
+ I->isTerminator())
+ return false;
+
+ // Do not sink static or dynamic alloca instructions. Static allocas must
+ // remain in the entry block, and dynamic allocas must not be sunk in between
+ // a stacksave / stackrestore pair, which would incorrectly shorten its
+ // lifetime.
+ if (isa<AllocaInst>(I))
+ return false;
+
+ // Do not sink into catchswitch blocks.
+ if (isa<CatchSwitchInst>(DestBlock->getTerminator()))
+ return false;
+
+ // Do not sink convergent call instructions.
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ if (CI->isConvergent())
+ return false;
+ }
+ // We can only sink load instructions if there is nothing between the load and
+ // the end of block that could change the value.
+ if (I->mayReadFromMemory()) {
+ // We don't want to do any sophisticated alias analysis, so we only check
+ // the instructions after I in I's parent block if we try to sink to its
+ // successor block.
+ if (DestBlock->getUniquePredecessor() != I->getParent())
+ return false;
+ for (BasicBlock::iterator Scan = I->getIterator(),
+ E = I->getParent()->end();
+ Scan != E; ++Scan)
+ if (Scan->mayWriteToMemory())
+ return false;
+ }
+
+ I->dropDroppableUses([DestBlock](const Use *U) {
+ if (auto *I = dyn_cast<Instruction>(U->getUser()))
+ return I->getParent() != DestBlock;
+ return true;
+ });
+ /// FIXME: We could remove droppable uses that are not dominated by
+ /// the new position.
+
+ BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
+ I->moveBefore(&*InsertPos);
+ ++NumSunkInst;
+
+  // Also sink all related debug uses from the source basic block. Otherwise we
+  // get a debug use before the def. Attempt to salvage debug uses first, to
+  // maximise the range over which variables have a location. If we cannot
+  // salvage a use, mark its location undef: we know it was supposed to receive
+  // a new location here, but that computation has been sunk.
+ SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+
+ // Update the arguments of a dbg.declare instruction, so that it
+ // does not point into a sunk instruction.
+ auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
+ if (!isa<DbgDeclareInst>(DII))
+ return false;
+
+ if (isa<CastInst>(I))
+ DII->setOperand(
+ 0, MetadataAsValue::get(I->getContext(),
+ ValueAsMetadata::get(I->getOperand(0))));
+ return true;
+ };
+
+ SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
+ for (auto User : DbgUsers) {
+ // A dbg.declare instruction should not be cloned, since there can only be
+ // one per variable fragment. It should be left in the original place
+ // because the sunk instruction is not an alloca (otherwise we could not be
+ // here).
+ if (User->getParent() != SrcBlock || updateDbgDeclare(User))
+ continue;
+
+ DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
+ LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
+ }
+
+ // Perform salvaging without the clones, then sink the clones.
+ if (!DIIClones.empty()) {
+ salvageDebugInfoForDbgValues(*I, DbgUsers);
+ for (auto &DIIClone : DIIClones) {
+ DIIClone->insertBefore(&*InsertPos);
+ LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
+ }
+ }
+
+ return true;
+}
+
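The load case in TryToSinkInstruction above only sinks when the destination's unique predecessor is the load's own block and nothing after the load in that block may write memory. A minimal sketch of that test over a toy instruction model (all names hypothetical, not LLVM's API):

    #include <cstddef>
    #include <vector>

    struct MiniInst {
      bool MayWriteToMemory = false; // toy stand-in for mayWriteToMemory()
    };

    // Returns true if a load at Block[LoadIdx] may be sunk into a successor
    // whose unique predecessor is this block: no later instruction in the block
    // may write memory, so the loaded value cannot change before the branch.
    bool canSinkLoad(const std::vector<MiniInst> &Block, std::size_t LoadIdx,
                     bool DestUniquePredIsThisBlock) {
      if (!DestUniquePredIsThisBlock)
        return false;
      for (std::size_t I = LoadIdx; I < Block.size(); ++I)
        if (Block[I].MayWriteToMemory)
          return false;
      return true;
    }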
bool InstCombinerImpl::run() {
- while (!Worklist.isEmpty()) {
- // Walk deferred instructions in reverse order, and push them to the
- // worklist, which means they'll end up popped from the worklist in-order.
- while (Instruction *I = Worklist.popDeferred()) {
- // Check to see if we can DCE the instruction. We do this already here to
- // reduce the number of uses and thus allow other folds to trigger.
- // Note that eraseInstFromFunction() may push additional instructions on
- // the deferred worklist, so this will DCE whole instruction chains.
- if (isInstructionTriviallyDead(I, &TLI)) {
- eraseInstFromFunction(*I);
- ++NumDeadInst;
- continue;
- }
-
- Worklist.push(I);
- }
-
- Instruction *I = Worklist.removeOne();
- if (I == nullptr) continue; // skip null values.
-
- // Check to see if we can DCE the instruction.
- if (isInstructionTriviallyDead(I, &TLI)) {
- eraseInstFromFunction(*I);
- ++NumDeadInst;
- continue;
- }
-
- if (!DebugCounter::shouldExecute(VisitCounter))
- continue;
-
- // Instruction isn't dead, see if we can constant propagate it.
- if (!I->use_empty() &&
- (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
- if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
- << '\n');
-
- // Add operands to the worklist.
- replaceInstUsesWith(*I, C);
- ++NumConstProp;
- if (isInstructionTriviallyDead(I, &TLI))
- eraseInstFromFunction(*I);
- MadeIRChange = true;
- continue;
- }
- }
-
- // See if we can trivially sink this instruction to its user if we can
- // prove that the successor is not executed more frequently than our block.
- if (EnableCodeSinking)
- if (Use *SingleUse = I->getSingleUndroppableUse()) {
- BasicBlock *BB = I->getParent();
- Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
- BasicBlock *UserParent;
-
- // Get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst))
- UserParent = PN->getIncomingBlock(*SingleUse);
- else
- UserParent = UserInst->getParent();
-
+ while (!Worklist.isEmpty()) {
+ // Walk deferred instructions in reverse order, and push them to the
+ // worklist, which means they'll end up popped from the worklist in-order.
+ while (Instruction *I = Worklist.popDeferred()) {
+ // Check to see if we can DCE the instruction. We do this already here to
+ // reduce the number of uses and thus allow other folds to trigger.
+ // Note that eraseInstFromFunction() may push additional instructions on
+ // the deferred worklist, so this will DCE whole instruction chains.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ continue;
+ }
+
+ Worklist.push(I);
+ }
+
+ Instruction *I = Worklist.removeOne();
+ if (I == nullptr) continue; // skip null values.
+
+ // Check to see if we can DCE the instruction.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ continue;
+ }
+
+ if (!DebugCounter::shouldExecute(VisitCounter))
+ continue;
+
+ // Instruction isn't dead, see if we can constant propagate it.
+ if (!I->use_empty() &&
+ (I->getNumOperands() == 0 || isa<Constant>(I->getOperand(0)))) {
+ if (Constant *C = ConstantFoldInstruction(I, DL, &TLI)) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *I
+ << '\n');
+
+ // Add operands to the worklist.
+ replaceInstUsesWith(*I, C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(I, &TLI))
+ eraseInstFromFunction(*I);
+ MadeIRChange = true;
+ continue;
+ }
+ }
+
+ // See if we can trivially sink this instruction to its user if we can
+ // prove that the successor is not executed more frequently than our block.
+ if (EnableCodeSinking)
+ if (Use *SingleUse = I->getSingleUndroppableUse()) {
+ BasicBlock *BB = I->getParent();
+ Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
+ BasicBlock *UserParent;
+
+ // Get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserParent = PN->getIncomingBlock(*SingleUse);
+ else
+ UserParent = UserInst->getParent();
+
// Try sinking to another block. If that block is unreachable, then do
// not bother. SimplifyCFG should handle it.
if (UserParent != BB && DT.isReachableFromEntry(UserParent)) {
- // See if the user is one of our successors that has only one
- // predecessor, so that we don't have to split the critical edge.
- bool ShouldSink = UserParent->getUniquePredecessor() == BB;
- // Another option where we can sink is a block that ends with a
-          // terminator that does not pass control to another block (such as
- // return or unreachable). In this case:
- // - I dominates the User (by SSA form);
- // - the User will be executed at most once.
- // So sinking I down to User is always profitable or neutral.
- if (!ShouldSink) {
- auto *Term = UserParent->getTerminator();
- ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
- }
- if (ShouldSink) {
- assert(DT.dominates(BB, UserParent) &&
- "Dominance relation broken?");
- // Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
- LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
- MadeIRChange = true;
-              // We'll add uses of the sunk instruction below, but since sinking
-              // can expose opportunities for its *operands*, add them to the
-              // worklist.
- for (Use &U : I->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
- Worklist.push(OpI);
- }
- }
- }
- }
-
- // Now that we have an instruction, try combining it to simplify it.
- Builder.SetInsertPoint(I);
+ // See if the user is one of our successors that has only one
+ // predecessor, so that we don't have to split the critical edge.
+ bool ShouldSink = UserParent->getUniquePredecessor() == BB;
+ // Another option where we can sink is a block that ends with a
+          // terminator that does not pass control to another block (such as
+ // return or unreachable). In this case:
+ // - I dominates the User (by SSA form);
+ // - the User will be executed at most once.
+ // So sinking I down to User is always profitable or neutral.
+ if (!ShouldSink) {
+ auto *Term = UserParent->getTerminator();
+ ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
+ }
+ if (ShouldSink) {
+ assert(DT.dominates(BB, UserParent) &&
+ "Dominance relation broken?");
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ if (TryToSinkInstruction(I, UserParent)) {
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ MadeIRChange = true;
+ // We'll add uses of the sunk instruction below, but since sinking
+              // We'll add uses of the sunk instruction below, but since sinking
+              // can expose opportunities for its *operands*, add them to the
+              // worklist.
+ if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+ Worklist.push(OpI);
+ }
+ }
+ }
+ }
+
+ // Now that we have an instruction, try combining it to simplify it.
+ Builder.SetInsertPoint(I);
Builder.CollectMetadataToCopy(
I, {LLVMContext::MD_dbg, LLVMContext::MD_annotation});
-
-#ifndef NDEBUG
- std::string OrigI;
-#endif
- LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
- LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
-
- if (Instruction *Result = visit(*I)) {
- ++NumCombined;
- // Should we replace the old instruction with a new one?
- if (Result != I) {
- LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
- << " New = " << *Result << '\n');
-
+
+#ifndef NDEBUG
+ std::string OrigI;
+#endif
+ LLVM_DEBUG(raw_string_ostream SS(OrigI); I->print(SS); OrigI = SS.str(););
+ LLVM_DEBUG(dbgs() << "IC: Visiting: " << OrigI << '\n');
+
+ if (Instruction *Result = visit(*I)) {
+ ++NumCombined;
+ // Should we replace the old instruction with a new one?
+ if (Result != I) {
+ LLVM_DEBUG(dbgs() << "IC: Old = " << *I << '\n'
+ << " New = " << *Result << '\n');
+
Result->copyMetadata(*I,
{LLVMContext::MD_dbg, LLVMContext::MD_annotation});
- // Everything uses the new instruction now.
- I->replaceAllUsesWith(Result);
-
- // Move the name to the new instruction first.
- Result->takeName(I);
-
- // Insert the new instruction into the basic block...
- BasicBlock *InstParent = I->getParent();
- BasicBlock::iterator InsertPos = I->getIterator();
-
+ // Everything uses the new instruction now.
+ I->replaceAllUsesWith(Result);
+
+ // Move the name to the new instruction first.
+ Result->takeName(I);
+
+ // Insert the new instruction into the basic block...
+ BasicBlock *InstParent = I->getParent();
+ BasicBlock::iterator InsertPos = I->getIterator();
+
         // Are we replacing a PHI with something that isn't a PHI, or vice versa?
if (isa<PHINode>(Result) != isa<PHINode>(I)) {
// We need to fix up the insertion point.
@@ -3737,35 +3737,35 @@ bool InstCombinerImpl::run() {
else // Non-PHI -> PHI
InsertPos = InstParent->getFirstNonPHI()->getIterator();
}
-
- InstParent->getInstList().insert(InsertPos, Result);
-
- // Push the new instruction and any users onto the worklist.
- Worklist.pushUsersToWorkList(*Result);
- Worklist.push(Result);
-
- eraseInstFromFunction(*I);
- } else {
- LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
- << " New = " << *I << '\n');
-
- // If the instruction was modified, it's possible that it is now dead.
- // if so, remove it.
-        // If so, remove it.
- eraseInstFromFunction(*I);
- } else {
- Worklist.pushUsersToWorkList(*I);
- Worklist.push(I);
- }
- }
- MadeIRChange = true;
- }
- }
-
- Worklist.zap();
- return MadeIRChange;
-}
-
+
+ InstParent->getInstList().insert(InsertPos, Result);
+
+ // Push the new instruction and any users onto the worklist.
+ Worklist.pushUsersToWorkList(*Result);
+ Worklist.push(Result);
+
+ eraseInstFromFunction(*I);
+ } else {
+ LLVM_DEBUG(dbgs() << "IC: Mod = " << OrigI << '\n'
+ << " New = " << *I << '\n');
+
+ // If the instruction was modified, it's possible that it is now dead.
+ // if so, remove it.
+        // If so, remove it.
+ eraseInstFromFunction(*I);
+ } else {
+ Worklist.pushUsersToWorkList(*I);
+ Worklist.push(I);
+ }
+ }
+ MadeIRChange = true;
+ }
+ }
+
+ Worklist.zap();
+ return MadeIRChange;
+}
+
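Stripped of LLVM types, run() above follows a standard worklist discipline: pop an item, drop it if dead, try to fold it to a constant, otherwise visit it, pushing any rewrite and its users back for another look. A generic sketch of that loop (hypothetical Item type and callbacks, not the real driver):

    #include <functional>
    #include <vector>

    // Generic worklist loop: dead items are dropped, foldable items are
    // rewritten, and Visit may push replacements and their users back onto the
    // worklist until a fixpoint is reached.
    template <typename Item>
    void runToFixpoint(std::vector<Item> &Worklist,
                       const std::function<bool(Item &)> &IsTriviallyDead,
                       const std::function<bool(Item &)> &TryConstantFold,
                       const std::function<void(Item &, std::vector<Item> &)> &Visit) {
      while (!Worklist.empty()) {
        Item I = Worklist.back();
        Worklist.pop_back();
        if (IsTriviallyDead(I))
          continue; // the real pass also erases the instruction here
        if (TryConstantFold(I))
          continue; // users of the folded value go back on the worklist
        Visit(I, Worklist);
      }
    }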
// Track the scopes used by !alias.scope and !noalias. In a function, a
// @llvm.experimental.noalias.scope.decl is only useful if that scope is used
// by both sets. If not, the declaration of the scope can be safely omitted.
@@ -3815,321 +3815,321 @@ public:
}
};
-/// Populate the IC worklist from a function, by walking it in depth-first
-/// order and adding all reachable code to the worklist.
-///
-/// This has a couple of tricks to make the code faster and more powerful. In
-/// particular, we constant fold and DCE instructions as we go, to avoid adding
-/// them to the worklist (this significantly speeds up instcombine on code where
-/// many instructions are dead or constant). Additionally, if we find a branch
-/// whose condition is a known constant, we only visit the reachable successors.
-static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- InstCombineWorklist &ICWorklist) {
- bool MadeIRChange = false;
- SmallPtrSet<BasicBlock *, 32> Visited;
- SmallVector<BasicBlock*, 256> Worklist;
- Worklist.push_back(&F.front());
-
- SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
- DenseMap<Constant *, Constant *> FoldedConstants;
+/// Populate the IC worklist from a function, by walking it in depth-first
+/// order and adding all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful. In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant). Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstCombineWorklist &ICWorklist) {
+ bool MadeIRChange = false;
+ SmallPtrSet<BasicBlock *, 32> Visited;
+ SmallVector<BasicBlock*, 256> Worklist;
+ Worklist.push_back(&F.front());
+
+ SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
+ DenseMap<Constant *, Constant *> FoldedConstants;
AliasScopeTracker SeenAliasScopes;
-
- do {
- BasicBlock *BB = Worklist.pop_back_val();
-
- // We have now visited this block! If we've already been here, ignore it.
- if (!Visited.insert(BB).second)
- continue;
-
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *Inst = &*BBI++;
-
- // ConstantProp instruction if trivially constant.
- if (!Inst->use_empty() &&
- (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
- if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
- << '\n');
- Inst->replaceAllUsesWith(C);
- ++NumConstProp;
- if (isInstructionTriviallyDead(Inst, TLI))
- Inst->eraseFromParent();
- MadeIRChange = true;
- continue;
- }
-
- // See if we can constant fold its operands.
- for (Use &U : Inst->operands()) {
- if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
- continue;
-
- auto *C = cast<Constant>(U);
- Constant *&FoldRes = FoldedConstants[C];
- if (!FoldRes)
- FoldRes = ConstantFoldConstant(C, DL, TLI);
-
- if (FoldRes != C) {
- LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
- << "\n Old = " << *C
- << "\n New = " << *FoldRes << '\n');
- U = FoldRes;
- MadeIRChange = true;
- }
- }
-
+
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // We have now visited this block! If we've already been here, ignore it.
+ if (!Visited.insert(BB).second)
+ continue;
+
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *Inst = &*BBI++;
+
+ // ConstantProp instruction if trivially constant.
+ if (!Inst->use_empty() &&
+ (Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
+ if (Constant *C = ConstantFoldInstruction(Inst, DL, TLI)) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold to: " << *C << " from: " << *Inst
+ << '\n');
+ Inst->replaceAllUsesWith(C);
+ ++NumConstProp;
+ if (isInstructionTriviallyDead(Inst, TLI))
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ // See if we can constant fold its operands.
+ for (Use &U : Inst->operands()) {
+ if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
+ continue;
+
+ auto *C = cast<Constant>(U);
+ Constant *&FoldRes = FoldedConstants[C];
+ if (!FoldRes)
+ FoldRes = ConstantFoldConstant(C, DL, TLI);
+
+ if (FoldRes != C) {
+ LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+ << "\n Old = " << *C
+ << "\n New = " << *FoldRes << '\n');
+ U = FoldRes;
+ MadeIRChange = true;
+ }
+ }
+
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
// these call instructions consumes non-trivial amount of time and
// provides no value for the optimization.
if (!Inst->isDebugOrPseudoInst()) {
- InstrsForInstCombineWorklist.push_back(Inst);
+ InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}
- }
-
- // Recursively visit successors. If this is a branch or switch on a
- // constant, only visit the reachable successor.
- Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
- bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
- BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
- Worklist.push_back(ReachableBB);
- continue;
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
- Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
- continue;
- }
- }
-
+ }
+
+ // Recursively visit successors. If this is a branch or switch on a
+ // constant, only visit the reachable successor.
+ Instruction *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+ bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+ BasicBlock *ReachableBB = BI->getSuccessor(!CondVal);
+ Worklist.push_back(ReachableBB);
+ continue;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+ Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
+ continue;
+ }
+ }
+
append_range(Worklist, successors(TI));
- } while (!Worklist.empty());
-
- // Remove instructions inside unreachable blocks. This prevents the
- // instcombine code from having to deal with some bad special cases, and
- // reduces use counts of instructions.
- for (BasicBlock &BB : F) {
- if (Visited.count(&BB))
- continue;
-
+ } while (!Worklist.empty());
+
+ // Remove instructions inside unreachable blocks. This prevents the
+ // instcombine code from having to deal with some bad special cases, and
+ // reduces use counts of instructions.
+ for (BasicBlock &BB : F) {
+ if (Visited.count(&BB))
+ continue;
+
unsigned NumDeadInstInBB;
unsigned NumDeadDbgInstInBB;
std::tie(NumDeadInstInBB, NumDeadDbgInstInBB) =
removeAllNonTerminatorAndEHPadInstructions(&BB);
MadeIRChange |= NumDeadInstInBB + NumDeadDbgInstInBB > 0;
- NumDeadInst += NumDeadInstInBB;
- }
-
- // Once we've found all of the instructions to add to instcombine's worklist,
- // add them in reverse order. This way instcombine will visit from the top
- // of the function down. This jives well with the way that it adds all uses
- // of instructions to the worklist after doing a transformation, thus avoiding
- // some N^2 behavior in pathological cases.
- ICWorklist.reserve(InstrsForInstCombineWorklist.size());
- for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
- // DCE instruction if trivially dead. As we iterate in reverse program
- // order here, we will clean up whole chains of dead instructions.
+ NumDeadInst += NumDeadInstInBB;
+ }
+
+ // Once we've found all of the instructions to add to instcombine's worklist,
+ // add them in reverse order. This way instcombine will visit from the top
+ // of the function down. This jives well with the way that it adds all uses
+ // of instructions to the worklist after doing a transformation, thus avoiding
+ // some N^2 behavior in pathological cases.
+ ICWorklist.reserve(InstrsForInstCombineWorklist.size());
+ for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
+ // DCE instruction if trivially dead. As we iterate in reverse program
+ // order here, we will clean up whole chains of dead instructions.
if (isInstructionTriviallyDead(Inst, TLI) ||
SeenAliasScopes.isNoAliasScopeDeclDead(Inst)) {
- ++NumDeadInst;
- LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
- salvageDebugInfo(*Inst);
- Inst->eraseFromParent();
- MadeIRChange = true;
- continue;
- }
-
- ICWorklist.push(Inst);
- }
-
- return MadeIRChange;
-}
-
-static bool combineInstructionsOverFunction(
- Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
+ ++NumDeadInst;
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ salvageDebugInfo(*Inst);
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ ICWorklist.push(Inst);
+ }
+
+ return MadeIRChange;
+}
+
+static bool combineInstructionsOverFunction(
+ Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, TargetTransformInfo &TTI,
DominatorTree &DT, OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
- auto &DL = F.getParent()->getDataLayout();
- MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
-
- /// Builder - This is an IRBuilder that automatically inserts new
- /// instructions into the worklist when they are created.
- IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
- F.getContext(), TargetFolder(DL),
- IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
- Worklist.add(I);
- if (match(I, m_Intrinsic<Intrinsic::assume>()))
- AC.registerAssumption(cast<CallInst>(I));
- }));
-
- // Lower dbg.declare intrinsics, otherwise their value may be clobbered
- // by instcombiner.
- bool MadeIRChange = false;
- if (ShouldLowerDbgDeclare)
- MadeIRChange = LowerDbgDeclare(F);
-
- // Iterate while there is work to do.
- unsigned Iteration = 0;
- while (true) {
+ ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
+ auto &DL = F.getParent()->getDataLayout();
+ MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
+
+ /// Builder - This is an IRBuilder that automatically inserts new
+ /// instructions into the worklist when they are created.
+ IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
+ F.getContext(), TargetFolder(DL),
+ IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
+ Worklist.add(I);
+ if (match(I, m_Intrinsic<Intrinsic::assume>()))
+ AC.registerAssumption(cast<CallInst>(I));
+ }));
+
+ // Lower dbg.declare intrinsics, otherwise their value may be clobbered
+ // by instcombiner.
+ bool MadeIRChange = false;
+ if (ShouldLowerDbgDeclare)
+ MadeIRChange = LowerDbgDeclare(F);
+
+ // Iterate while there is work to do.
+ unsigned Iteration = 0;
+ while (true) {
++NumWorklistIterations;
- ++Iteration;
-
- if (Iteration > InfiniteLoopDetectionThreshold) {
- report_fatal_error(
- "Instruction Combining seems stuck in an infinite loop after " +
- Twine(InfiniteLoopDetectionThreshold) + " iterations.");
- }
-
- if (Iteration > MaxIterations) {
- LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
- << " on " << F.getName()
- << " reached; stopping before reaching a fixpoint\n");
- break;
- }
-
- LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
- << F.getName() << "\n");
-
- MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
-
+ ++Iteration;
+
+ if (Iteration > InfiniteLoopDetectionThreshold) {
+ report_fatal_error(
+ "Instruction Combining seems stuck in an infinite loop after " +
+ Twine(InfiniteLoopDetectionThreshold) + " iterations.");
+ }
+
+ if (Iteration > MaxIterations) {
+ LLVM_DEBUG(dbgs() << "\n\n[IC] Iteration limit #" << MaxIterations
+ << " on " << F.getName()
+ << " reached; stopping before reaching a fixpoint\n");
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+ << F.getName() << "\n");
+
+ MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
+
InstCombinerImpl IC(Worklist, Builder, F.hasMinSize(), AA, AC, TLI, TTI, DT,
ORE, BFI, PSI, DL, LI);
- IC.MaxArraySizeForCombine = MaxArraySize;
-
- if (!IC.run())
- break;
-
- MadeIRChange = true;
- }
-
- return MadeIRChange;
-}
-
-InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
-
-InstCombinePass::InstCombinePass(unsigned MaxIterations)
- : MaxIterations(MaxIterations) {}
-
-PreservedAnalyses InstCombinePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ IC.MaxArraySizeForCombine = MaxArraySize;
+
+ if (!IC.run())
+ break;
+
+ MadeIRChange = true;
+ }
+
+ return MadeIRChange;
+}
+
+InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
+
+InstCombinePass::InstCombinePass(unsigned MaxIterations)
+ : MaxIterations(MaxIterations) {}
+
+PreservedAnalyses InstCombinePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
-
- auto *AA = &AM.getResult<AAManager>(F);
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
-
+
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI))
- // No changes, all analyses are preserved.
- return PreservedAnalyses::all();
-
- // Mark all the analyses that instcombine updates as preserved.
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // No changes, all analyses are preserved.
+ return PreservedAnalyses::all();
+
+ // Mark all the analyses that instcombine updates as preserved.
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
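As a rough illustration of how the InstCombinePass::run entry point above gets invoked, the following standalone sketch drives the pass through the new pass manager. The analysis-registration boilerplate and header paths are assumptions for LLVM 12 and are not part of this diff.

  #include "llvm/Analysis/CGSCCPassManager.h"
  #include "llvm/Analysis/LoopAnalysisManager.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/InstCombine/InstCombine.h"

  // Sketch: run instcombine on a single function with the new pass manager.
  void runInstCombineOnce(llvm::Function &F) {
    llvm::PassBuilder PB;
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;
    // InstCombinePass::run pulls AssumptionAnalysis, DominatorTreeAnalysis,
    // TargetLibraryAnalysis, TargetIRAnalysis, AAManager and the ORE analysis
    // out of the FunctionAnalysisManager, so all of them must be registered.
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::FunctionPassManager FPM;
    FPM.addPass(llvm::InstCombinePass());     // default LimitMaxIterations cap
    // FPM.addPass(llvm::InstCombinePass(2)); // or cap the fixpoint loop explicitly
    FPM.run(F, FAM);
  }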
+void InstructionCombiningPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
-}
-
-bool InstructionCombiningPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- // Required analyses.
- auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+}
+
+bool InstructionCombiningPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ // Required analyses.
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- // Optional analyses.
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- BlockFrequencyInfo *BFI =
- (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
-
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ // Optional analyses.
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ BlockFrequencyInfo *BFI =
+ (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, TTI, DT, ORE,
BFI, PSI, MaxIterations, LI);
-}
-
-char InstructionCombiningPass::ID = 0;
-
-InstructionCombiningPass::InstructionCombiningPass()
- : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
- initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
-}
-
-InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
- : FunctionPass(ID), MaxIterations(MaxIterations) {
- initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
-}
-
-INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
- "Combine redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+}
+
+char InstructionCombiningPass::ID = 0;
+
+InstructionCombiningPass::InstructionCombiningPass()
+ : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
+ initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
+}
+
+InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
+ : FunctionPass(ID), MaxIterations(MaxIterations) {
+ initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
+}
+
+INITIALIZE_PASS_BEGIN(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
- "Combine redundant instructions", false, false)
-
-// Initialization Routines
-void llvm::initializeInstCombine(PassRegistry &Registry) {
- initializeInstructionCombiningPassPass(Registry);
-}
-
-void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
- initializeInstructionCombiningPassPass(*unwrap(R));
-}
-
-FunctionPass *llvm::createInstructionCombiningPass() {
- return new InstructionCombiningPass();
-}
-
-FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
- return new InstructionCombiningPass(MaxIterations);
-}
-
-void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createInstructionCombiningPass());
-}
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(InstructionCombiningPass, "instcombine",
+ "Combine redundant instructions", false, false)
+
+// Initialization Routines
+void llvm::initializeInstCombine(PassRegistry &Registry) {
+ initializeInstructionCombiningPassPass(Registry);
+}
+
+void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
+ initializeInstructionCombiningPassPass(*unwrap(R));
+}
+
+FunctionPass *llvm::createInstructionCombiningPass() {
+ return new InstructionCombiningPass();
+}
+
+FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
+ return new InstructionCombiningPass(MaxIterations);
+}
+
+void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createInstructionCombiningPass());
+}
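The legacy-pass-manager and C-API entry points restored in this hunk can be exercised with a similarly small driver. This is only a sketch; the include paths and setup are assumptions rather than something this diff establishes.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/InstCombine/InstCombine.h"
  #include "llvm-c/Core.h"
  #include "llvm-c/Transforms/InstCombine.h"

  // C++ side: createInstructionCombiningPass() feeds the legacy PassManager.
  void runLegacyInstCombine(llvm::Module &M) {
    llvm::legacy::PassManager PM;
    PM.add(llvm::createInstructionCombiningPass());
    PM.run(M);
  }

  // C side: LLVMAddInstructionCombiningPass() is the binding defined above.
  void runLegacyInstCombineC(LLVMModuleRef M) {
    LLVMPassManagerRef PM = LLVMCreatePassManager();
    LLVMAddInstructionCombiningPass(PM);
    LLVMRunPassManager(PM, M);
    LLVMDisposePassManager(PM);
  }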
diff --git a/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make b/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
index 69d2077a71..3f74e68d16 100644
--- a/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/InstCombine/ya.make
@@ -1,49 +1,49 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/InstCombine
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- InstCombineAddSub.cpp
- InstCombineAndOrXor.cpp
- InstCombineAtomicRMW.cpp
- InstCombineCalls.cpp
- InstCombineCasts.cpp
- InstCombineCompares.cpp
- InstCombineLoadStoreAlloca.cpp
- InstCombineMulDivRem.cpp
- InstCombineNegator.cpp
- InstCombinePHI.cpp
- InstCombineSelect.cpp
- InstCombineShifts.cpp
- InstCombineSimplifyDemanded.cpp
- InstCombineVectorOps.cpp
- InstructionCombining.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ InstCombineAddSub.cpp
+ InstCombineAndOrXor.cpp
+ InstCombineAtomicRMW.cpp
+ InstCombineCalls.cpp
+ InstCombineCasts.cpp
+ InstCombineCompares.cpp
+ InstCombineLoadStoreAlloca.cpp
+ InstCombineMulDivRem.cpp
+ InstCombineNegator.cpp
+ InstCombinePHI.cpp
+ InstCombineSelect.cpp
+ InstCombineShifts.cpp
+ InstCombineSimplifyDemanded.cpp
+ InstCombineVectorOps.cpp
+ InstructionCombining.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 7212096f1b..f4e471706d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1,140 +1,140 @@
-//===- AddressSanitizer.cpp - memory error detector -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of AddressSanitizer, an address sanity checker.
-// Details of the algorithm:
-// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
-//
-// FIXME: This sanitizer does not yet handle scalable vectors
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/BinaryFormat/MachO.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
-#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iomanip>
-#include <limits>
-#include <memory>
-#include <sstream>
-#include <string>
-#include <tuple>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "asan"
-
-static const uint64_t kDefaultShadowScale = 3;
-static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
-static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
-static const uint64_t kDynamicShadowSentinel =
- std::numeric_limits<uint64_t>::max();
-static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF; // < 2G.
-static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL;
-static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
-static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44;
-static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
-static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
-static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
-static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
+//===- AddressSanitizer.cpp - memory error detector -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of AddressSanitizer, an address sanity checker.
+// Details of the algorithm:
+// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
+//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iomanip>
+#include <limits>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "asan"
+
+static const uint64_t kDefaultShadowScale = 3;
+static const uint64_t kDefaultShadowOffset32 = 1ULL << 29;
+static const uint64_t kDefaultShadowOffset64 = 1ULL << 44;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
+static const uint64_t kSmallX86_64ShadowOffsetBase = 0x7FFFFFFF; // < 2G.
+static const uint64_t kSmallX86_64ShadowOffsetAlignMask = ~0xFFFULL;
+static const uint64_t kLinuxKasan_ShadowOffset64 = 0xdffffc0000000000;
+static const uint64_t kPPC64_ShadowOffset64 = 1ULL << 44;
+static const uint64_t kSystemZ_ShadowOffset64 = 1ULL << 52;
+static const uint64_t kMIPS32_ShadowOffset32 = 0x0aaa0000;
+static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
+static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
static const uint64_t kRISCV64_ShadowOffset64 = 0x20000000;
-static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
-static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
-static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
-static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
-static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000;
-static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
-static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
-static const uint64_t kEmscriptenShadowOffset = 0;
-
-static const uint64_t kMyriadShadowScale = 5;
-static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
-static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
-static const uint64_t kMyriadTagShift = 29;
-static const uint64_t kMyriadDDRTag = 4;
-static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
-
-// The shadow memory space is dynamically allocated.
-static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
-
-static const size_t kMinStackMallocSize = 1 << 6; // 64B
-static const size_t kMaxStackMallocSize = 1 << 16; // 64K
-static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
-static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
-
+static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSD_ShadowOffset32 = 1ULL << 30;
+static const uint64_t kNetBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kNetBSDKasan_ShadowOffset64 = 0xdfff900000000000;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
+static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
+static const uint64_t kEmscriptenShadowOffset = 0;
+
+static const uint64_t kMyriadShadowScale = 5;
+static const uint64_t kMyriadMemoryOffset32 = 0x80000000ULL;
+static const uint64_t kMyriadMemorySize32 = 0x20000000ULL;
+static const uint64_t kMyriadTagShift = 29;
+static const uint64_t kMyriadDDRTag = 4;
+static const uint64_t kMyriadCacheBitMask32 = 0x40000000ULL;
+
+// The shadow memory space is dynamically allocated.
+static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
+
+static const size_t kMinStackMallocSize = 1 << 6; // 64B
+static const size_t kMaxStackMallocSize = 1 << 16; // 64K
+static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3;
+static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E;
+
const char kAsanModuleCtorName[] = "asan.module_ctor";
const char kAsanModuleDtorName[] = "asan.module_dtor";
-static const uint64_t kAsanCtorAndDtorPriority = 1;
-// On Emscripten, the system needs more than one priority for constructors.
-static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50;
+static const uint64_t kAsanCtorAndDtorPriority = 1;
+// On Emscripten, the system needs more than one priority for constructors.
+static const uint64_t kAsanEmscriptenCtorAndDtorPriority = 50;
const char kAsanReportErrorTemplate[] = "__asan_report_";
const char kAsanRegisterGlobalsName[] = "__asan_register_globals";
const char kAsanUnregisterGlobalsName[] = "__asan_unregister_globals";
@@ -150,7 +150,7 @@ const char kAsanVersionCheckNamePrefix[] = "__asan_version_mismatch_check_v";
const char kAsanPtrCmp[] = "__sanitizer_ptr_cmp";
const char kAsanPtrSub[] = "__sanitizer_ptr_sub";
const char kAsanHandleNoReturnName[] = "__asan_handle_no_return";
-static const int kMaxAsanStackMallocSizeClass = 10;
+static const int kMaxAsanStackMallocSizeClass = 10;
const char kAsanStackMallocNameTemplate[] = "__asan_stack_malloc_";
const char kAsanStackFreeNameTemplate[] = "__asan_stack_free_";
const char kAsanGenPrefix[] = "___asan_gen_";
@@ -159,808 +159,808 @@ const char kSanCovGenPrefix[] = "__sancov_gen_";
const char kAsanSetShadowPrefix[] = "__asan_set_shadow_";
const char kAsanPoisonStackMemoryName[] = "__asan_poison_stack_memory";
const char kAsanUnpoisonStackMemoryName[] = "__asan_unpoison_stack_memory";
-
-// ASan version script has __asan_* wildcard. Triple underscore prevents a
-// linker (gold) warning about attempting to export a local symbol.
+
+// ASan version script has __asan_* wildcard. Triple underscore prevents a
+// linker (gold) warning about attempting to export a local symbol.
const char kAsanGlobalsRegisteredFlagName[] = "___asan_globals_registered";
-
+
const char kAsanOptionDetectUseAfterReturn[] =
- "__asan_option_detect_stack_use_after_return";
-
+ "__asan_option_detect_stack_use_after_return";
+
const char kAsanShadowMemoryDynamicAddress[] =
- "__asan_shadow_memory_dynamic_address";
-
+ "__asan_shadow_memory_dynamic_address";
+
const char kAsanAllocaPoison[] = "__asan_alloca_poison";
const char kAsanAllocasUnpoison[] = "__asan_allocas_unpoison";
-
-// Access sizes are powers of two: 1, 2, 4, 8, 16.
-static const size_t kNumberOfAccessSizes = 5;
-
-static const unsigned kAllocaRzSize = 32;
-
-// Command-line flags.
-
-static cl::opt<bool> ClEnableKasan(
- "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClRecover(
- "asan-recover",
- cl::desc("Enable recovery mode (continue-after-error)."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInsertVersionCheck(
- "asan-guard-against-version-mismatch",
- cl::desc("Guard against compiler/runtime version mismatch."),
- cl::Hidden, cl::init(true));
-
-// This flag may need to be replaced with -f[no-]asan-reads.
-static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
- cl::desc("instrument read instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentWrites(
- "asan-instrument-writes", cl::desc("instrument write instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentAtomics(
- "asan-instrument-atomics",
- cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool>
- ClInstrumentByval("asan-instrument-byval",
- cl::desc("instrument byval call arguments"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClAlwaysSlowPath(
- "asan-always-slow-path",
- cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClForceDynamicShadow(
- "asan-force-dynamic-shadow",
- cl::desc("Load shadow address into a local variable for each function"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClWithIfunc("asan-with-ifunc",
- cl::desc("Access dynamic shadow through an ifunc global on "
- "platforms that support this"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClWithIfuncSuppressRemat(
- "asan-with-ifunc-suppress-remat",
- cl::desc("Suppress rematerialization of dynamic shadow address by passing "
- "it through inline asm in prologue."),
- cl::Hidden, cl::init(true));
-
-// This flag limits the number of instructions to be instrumented
-// in any given BB. Normally, this should be set to unlimited (INT_MAX),
-// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
-// set it to 10000.
-static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
- "asan-max-ins-per-bb", cl::init(10000),
- cl::desc("maximal number of instructions to instrument in any given BB"),
- cl::Hidden);
-
-// This flag may need to be replaced with -f[no]asan-stack.
-static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
- cl::Hidden, cl::init(true));
-static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
- "asan-max-inline-poisoning-size",
- cl::desc(
- "Inline shadow poisoning for blocks up to the given size in bytes."),
- cl::Hidden, cl::init(64));
-
-static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
- cl::desc("Check stack-use-after-return"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
- cl::desc("Create redzones for byval "
- "arguments (extra copy "
- "required)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
- cl::desc("Check stack-use-after-scope"),
- cl::Hidden, cl::init(false));
-
-// This flag may need to be replaced with -f[no]asan-globals.
-static cl::opt<bool> ClGlobals("asan-globals",
- cl::desc("Handle global objects"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClInitializers("asan-initialization-order",
- cl::desc("Handle C++ initializer order"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInvalidPointerPairs(
- "asan-detect-invalid-pointer-pair",
- cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClInvalidPointerCmp(
- "asan-detect-invalid-pointer-cmp",
- cl::desc("Instrument <, <=, >, >= with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClInvalidPointerSub(
- "asan-detect-invalid-pointer-sub",
- cl::desc("Instrument - operations with pointer operands"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<unsigned> ClRealignStack(
- "asan-realign-stack",
- cl::desc("Realign stack to the value of this flag (power of two)"),
- cl::Hidden, cl::init(32));
-
-static cl::opt<int> ClInstrumentationWithCallsThreshold(
- "asan-instrumentation-with-call-threshold",
- cl::desc(
- "If the function being instrumented contains more than "
- "this number of memory accesses, use callbacks instead of "
- "inline checks (-1 means never use callbacks)."),
- cl::Hidden, cl::init(7000));
-
-static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
- "asan-memory-access-callback-prefix",
- cl::desc("Prefix for memory access callbacks"), cl::Hidden,
- cl::init("__asan_"));
-
-static cl::opt<bool>
- ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
- cl::desc("instrument dynamic allocas"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClSkipPromotableAllocas(
- "asan-skip-promotable-allocas",
- cl::desc("Do not instrument promotable allocas"), cl::Hidden,
- cl::init(true));
-
-// These flags allow changing the shadow mapping.
-// The shadow mapping looks like
-// Shadow = (Mem >> scale) + offset
-
-static cl::opt<int> ClMappingScale("asan-mapping-scale",
- cl::desc("scale of asan shadow mapping"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t>
- ClMappingOffset("asan-mapping-offset",
- cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"),
- cl::Hidden, cl::init(0));
-
-// Optimization flags. Not user visible, used mostly for testing
-// and benchmarking the tool.
-
-static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptSameTemp(
- "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptGlobals("asan-opt-globals",
- cl::desc("Don't instrument scalar globals"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClOptStack(
- "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDynamicAllocaStack(
- "asan-stack-dynamic-alloca",
- cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<uint32_t> ClForceExperiment(
- "asan-force-experiment",
- cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
- cl::init(0));
-
-static cl::opt<bool>
- ClUsePrivateAlias("asan-use-private-alias",
- cl::desc("Use private aliases for global variables"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClUseOdrIndicator("asan-use-odr-indicator",
- cl::desc("Use odr indicators to improve ODR reporting"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClUseGlobalsGC("asan-globals-live-support",
- cl::desc("Use linker features to support dead "
- "code stripping of globals"),
- cl::Hidden, cl::init(true));
-
-// This is on by default even though there is a bug in gold:
-// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
-static cl::opt<bool>
- ClWithComdat("asan-with-comdat",
- cl::desc("Place ASan constructors in comdat sections"),
- cl::Hidden, cl::init(true));
-
-// Debug flags.
-
-static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
- cl::init(0));
-
-static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
- cl::desc("Debug func"));
-
-static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
- cl::Hidden, cl::init(-1));
-
-static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
- cl::Hidden, cl::init(-1));
-
-STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
-STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOptimizedAccessesToGlobalVar,
- "Number of optimized accesses to global vars");
-STATISTIC(NumOptimizedAccessesToStackVar,
- "Number of optimized accesses to stack vars");
-
-namespace {
-
-/// This struct defines the shadow mapping using the rule:
-/// shadow = (mem >> Scale) ADD-or-OR Offset.
-/// If InGlobal is true, then
-/// extern char __asan_shadow[];
-/// shadow = (mem >> Scale) + &__asan_shadow
-struct ShadowMapping {
- int Scale;
- uint64_t Offset;
- bool OrShadowOffset;
- bool InGlobal;
-};
-
-} // end anonymous namespace
-
-static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
- bool IsKasan) {
- bool IsAndroid = TargetTriple.isAndroid();
- bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const unsigned kAllocaRzSize = 32;
+
+// Command-line flags.
+
+static cl::opt<bool> ClEnableKasan(
+ "asan-kernel", cl::desc("Enable KernelAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClRecover(
+ "asan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInsertVersionCheck(
+ "asan-guard-against-version-mismatch",
+ cl::desc("Guard against compiler/runtime version mismatch."),
+ cl::Hidden, cl::init(true));
+
+// This flag may need to be replaced with -f[no-]asan-reads.
+static cl::opt<bool> ClInstrumentReads("asan-instrument-reads",
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentWrites(
+ "asan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+ "asan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool>
+ ClInstrumentByval("asan-instrument-byval",
+ cl::desc("instrument byval call arguments"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClAlwaysSlowPath(
+ "asan-always-slow-path",
+ cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClForceDynamicShadow(
+ "asan-force-dynamic-shadow",
+ cl::desc("Load shadow address into a local variable for each function"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClWithIfunc("asan-with-ifunc",
+ cl::desc("Access dynamic shadow through an ifunc global on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClWithIfuncSuppressRemat(
+ "asan-with-ifunc-suppress-remat",
+ cl::desc("Suppress rematerialization of dynamic shadow address by passing "
+ "it through inline asm in prologue."),
+ cl::Hidden, cl::init(true));
+
+// This flag limits the number of instructions to be instrumented
+// in any given BB. Normally, this should be set to unlimited (INT_MAX),
+// but due to http://llvm.org/bugs/show_bug.cgi?id=12652 we temporarily
+// set it to 10000.
+static cl::opt<int> ClMaxInsnsToInstrumentPerBB(
+ "asan-max-ins-per-bb", cl::init(10000),
+ cl::desc("maximal number of instructions to instrument in any given BB"),
+ cl::Hidden);
+
+// This flag may need to be replaced with -f[no]asan-stack.
+static cl::opt<bool> ClStack("asan-stack", cl::desc("Handle stack memory"),
+ cl::Hidden, cl::init(true));
+static cl::opt<uint32_t> ClMaxInlinePoisoningSize(
+ "asan-max-inline-poisoning-size",
+ cl::desc(
+ "Inline shadow poisoning for blocks up to the given size in bytes."),
+ cl::Hidden, cl::init(64));
+
+static cl::opt<bool> ClUseAfterReturn("asan-use-after-return",
+ cl::desc("Check stack-use-after-return"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClRedzoneByvalArgs("asan-redzone-byval-args",
+ cl::desc("Create redzones for byval "
+ "arguments (extra copy "
+ "required)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClUseAfterScope("asan-use-after-scope",
+ cl::desc("Check stack-use-after-scope"),
+ cl::Hidden, cl::init(false));
+
+// This flag may need to be replaced with -f[no]asan-globals.
+static cl::opt<bool> ClGlobals("asan-globals",
+ cl::desc("Handle global objects"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClInitializers("asan-initialization-order",
+ cl::desc("Handle C++ initializer order"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInvalidPointerPairs(
+ "asan-detect-invalid-pointer-pair",
+ cl::desc("Instrument <, <=, >, >=, - with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClInvalidPointerCmp(
+ "asan-detect-invalid-pointer-cmp",
+ cl::desc("Instrument <, <=, >, >= with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClInvalidPointerSub(
+ "asan-detect-invalid-pointer-sub",
+ cl::desc("Instrument - operations with pointer operands"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<unsigned> ClRealignStack(
+ "asan-realign-stack",
+ cl::desc("Realign stack to the value of this flag (power of two)"),
+ cl::Hidden, cl::init(32));
+
+static cl::opt<int> ClInstrumentationWithCallsThreshold(
+ "asan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented contains more than "
+ "this number of memory accesses, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(7000));
+
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+ "asan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__asan_"));
+
+static cl::opt<bool>
+ ClInstrumentDynamicAllocas("asan-instrument-dynamic-allocas",
+ cl::desc("instrument dynamic allocas"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClSkipPromotableAllocas(
+ "asan-skip-promotable-allocas",
+ cl::desc("Do not instrument promotable allocas"), cl::Hidden,
+ cl::init(true));
+
+// These flags allow changing the shadow mapping.
+// The shadow mapping looks like
+// Shadow = (Mem >> scale) + offset
+
+static cl::opt<int> ClMappingScale("asan-mapping-scale",
+ cl::desc("scale of asan shadow mapping"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t>
+ ClMappingOffset("asan-mapping-offset",
+ cl::desc("offset of asan shadow mapping [EXPERIMENTAL]"),
+ cl::Hidden, cl::init(0));
+
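For a concrete feel of the mapping these two flags override, here is a standalone sketch of the formula quoted in the comment above (Shadow = (Mem >> scale) + offset). The scale and offset values used below are only the Linux x86-64 defaults implied by kDefaultShadowScale and kSmallX86_64ShadowOffsetBase further up, not values asan hard-codes for every target.

  #include <cstdint>
  #include <cstdio>

  // Shadow = (Mem >> Scale) + Offset; some targets OR the offset in instead of
  // adding it (see Mapping.OrShadowOffset below).
  static uint64_t shadowAddress(uint64_t Mem, int Scale, uint64_t Offset) {
    return (Mem >> Scale) + Offset;
  }

  int main() {
    const int Scale = 3;                // kDefaultShadowScale: 1 shadow byte per 8 bytes
    const uint64_t Offset = 0x7fff8000; // kSmallX86_64ShadowOffsetBase & (AlignMask << 3)
    std::printf("shadow of 0x602000000010 = 0x%llx\n",
                (unsigned long long)shadowAddress(0x602000000010ULL, Scale, Offset));
    return 0;
  }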
+// Optimization flags. Not user visible, used mostly for testing
+// and benchmarking the tool.
+
+static cl::opt<bool> ClOpt("asan-opt", cl::desc("Optimize instrumentation"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptSameTemp(
+ "asan-opt-same-temp", cl::desc("Instrument the same temp just once"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptGlobals("asan-opt-globals",
+ cl::desc("Don't instrument scalar globals"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClOptStack(
+ "asan-opt-stack", cl::desc("Don't instrument scalar stack variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDynamicAllocaStack(
+ "asan-stack-dynamic-alloca",
+ cl::desc("Use dynamic alloca to represent stack variables"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<uint32_t> ClForceExperiment(
+ "asan-force-experiment",
+ cl::desc("Force optimization experiment (for testing)"), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<bool>
+ ClUsePrivateAlias("asan-use-private-alias",
+ cl::desc("Use private aliases for global variables"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseOdrIndicator("asan-use-odr-indicator",
+ cl::desc("Use odr indicators to improve ODR reporting"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClUseGlobalsGC("asan-globals-live-support",
+ cl::desc("Use linker features to support dead "
+ "code stripping of globals"),
+ cl::Hidden, cl::init(true));
+
+// This is on by default even though there is a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool>
+ ClWithComdat("asan-with-comdat",
+ cl::desc("Place ASan constructors in comdat sections"),
+ cl::Hidden, cl::init(true));
+
+// Debug flags.
+
+static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
+ cl::init(0));
+
+static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<std::string> ClDebugFunc("asan-debug-func", cl::Hidden,
+ cl::desc("Debug func"));
+
+static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"),
+ cl::Hidden, cl::init(-1));
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOptimizedAccessesToGlobalVar,
+ "Number of optimized accesses to global vars");
+STATISTIC(NumOptimizedAccessesToStackVar,
+ "Number of optimized accesses to stack vars");
+
+namespace {
+
+/// This struct defines the shadow mapping using the rule:
+/// shadow = (mem >> Scale) ADD-or-OR Offset.
+/// If InGlobal is true, then
+/// extern char __asan_shadow[];
+/// shadow = (mem >> Scale) + &__asan_shadow
+struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool OrShadowOffset;
+ bool InGlobal;
+};
+
+} // end anonymous namespace
+
+static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
+ bool IsKasan) {
+ bool IsAndroid = TargetTriple.isAndroid();
+ bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
bool IsMacOS = TargetTriple.isMacOSX();
- bool IsFreeBSD = TargetTriple.isOSFreeBSD();
- bool IsNetBSD = TargetTriple.isOSNetBSD();
- bool IsPS4CPU = TargetTriple.isPS4CPU();
- bool IsLinux = TargetTriple.isOSLinux();
- bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 ||
- TargetTriple.getArch() == Triple::ppc64le;
- bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
- bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS32 = TargetTriple.isMIPS32();
- bool IsMIPS64 = TargetTriple.isMIPS64();
- bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
- bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
+ bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+ bool IsNetBSD = TargetTriple.isOSNetBSD();
+ bool IsPS4CPU = TargetTriple.isPS4CPU();
+ bool IsLinux = TargetTriple.isOSLinux();
+ bool IsPPC64 = TargetTriple.getArch() == Triple::ppc64 ||
+ TargetTriple.getArch() == Triple::ppc64le;
+ bool IsSystemZ = TargetTriple.getArch() == Triple::systemz;
+ bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
+ bool IsMIPS32 = TargetTriple.isMIPS32();
+ bool IsMIPS64 = TargetTriple.isMIPS64();
+ bool IsArmOrThumb = TargetTriple.isARM() || TargetTriple.isThumb();
+ bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64;
bool IsRISCV64 = TargetTriple.getArch() == Triple::riscv64;
- bool IsWindows = TargetTriple.isOSWindows();
- bool IsFuchsia = TargetTriple.isOSFuchsia();
- bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
- bool IsEmscripten = TargetTriple.isOSEmscripten();
-
- ShadowMapping Mapping;
-
- Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
- if (ClMappingScale.getNumOccurrences() > 0) {
- Mapping.Scale = ClMappingScale;
- }
-
- if (LongSize == 32) {
- if (IsAndroid)
- Mapping.Offset = kDynamicShadowSentinel;
- else if (IsMIPS32)
- Mapping.Offset = kMIPS32_ShadowOffset32;
- else if (IsFreeBSD)
- Mapping.Offset = kFreeBSD_ShadowOffset32;
- else if (IsNetBSD)
- Mapping.Offset = kNetBSD_ShadowOffset32;
- else if (IsIOS)
- Mapping.Offset = kDynamicShadowSentinel;
- else if (IsWindows)
- Mapping.Offset = kWindowsShadowOffset32;
- else if (IsEmscripten)
- Mapping.Offset = kEmscriptenShadowOffset;
- else if (IsMyriad) {
- uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
- (kMyriadMemorySize32 >> Mapping.Scale));
- Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
- }
- else
- Mapping.Offset = kDefaultShadowOffset32;
- } else { // LongSize == 64
- // Fuchsia is always PIE, which means that the beginning of the address
- // space is always available.
- if (IsFuchsia)
- Mapping.Offset = 0;
- else if (IsPPC64)
- Mapping.Offset = kPPC64_ShadowOffset64;
- else if (IsSystemZ)
- Mapping.Offset = kSystemZ_ShadowOffset64;
- else if (IsFreeBSD && !IsMIPS64)
- Mapping.Offset = kFreeBSD_ShadowOffset64;
- else if (IsNetBSD) {
- if (IsKasan)
- Mapping.Offset = kNetBSDKasan_ShadowOffset64;
- else
- Mapping.Offset = kNetBSD_ShadowOffset64;
- } else if (IsPS4CPU)
- Mapping.Offset = kPS4CPU_ShadowOffset64;
- else if (IsLinux && IsX86_64) {
- if (IsKasan)
- Mapping.Offset = kLinuxKasan_ShadowOffset64;
- else
- Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
- (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
- } else if (IsWindows && IsX86_64) {
- Mapping.Offset = kWindowsShadowOffset64;
- } else if (IsMIPS64)
- Mapping.Offset = kMIPS64_ShadowOffset64;
- else if (IsIOS)
- Mapping.Offset = kDynamicShadowSentinel;
+ bool IsWindows = TargetTriple.isOSWindows();
+ bool IsFuchsia = TargetTriple.isOSFuchsia();
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+ bool IsEmscripten = TargetTriple.isOSEmscripten();
+
+ ShadowMapping Mapping;
+
+ Mapping.Scale = IsMyriad ? kMyriadShadowScale : kDefaultShadowScale;
+ if (ClMappingScale.getNumOccurrences() > 0) {
+ Mapping.Scale = ClMappingScale;
+ }
+
+ if (LongSize == 32) {
+ if (IsAndroid)
+ Mapping.Offset = kDynamicShadowSentinel;
+ else if (IsMIPS32)
+ Mapping.Offset = kMIPS32_ShadowOffset32;
+ else if (IsFreeBSD)
+ Mapping.Offset = kFreeBSD_ShadowOffset32;
+ else if (IsNetBSD)
+ Mapping.Offset = kNetBSD_ShadowOffset32;
+ else if (IsIOS)
+ Mapping.Offset = kDynamicShadowSentinel;
+ else if (IsWindows)
+ Mapping.Offset = kWindowsShadowOffset32;
+ else if (IsEmscripten)
+ Mapping.Offset = kEmscriptenShadowOffset;
+ else if (IsMyriad) {
+ uint64_t ShadowOffset = (kMyriadMemoryOffset32 + kMyriadMemorySize32 -
+ (kMyriadMemorySize32 >> Mapping.Scale));
+ Mapping.Offset = ShadowOffset - (kMyriadMemoryOffset32 >> Mapping.Scale);
+ }
+ else
+ Mapping.Offset = kDefaultShadowOffset32;
+ } else { // LongSize == 64
+ // Fuchsia is always PIE, which means that the beginning of the address
+ // space is always available.
+ if (IsFuchsia)
+ Mapping.Offset = 0;
+ else if (IsPPC64)
+ Mapping.Offset = kPPC64_ShadowOffset64;
+ else if (IsSystemZ)
+ Mapping.Offset = kSystemZ_ShadowOffset64;
+ else if (IsFreeBSD && !IsMIPS64)
+ Mapping.Offset = kFreeBSD_ShadowOffset64;
+ else if (IsNetBSD) {
+ if (IsKasan)
+ Mapping.Offset = kNetBSDKasan_ShadowOffset64;
+ else
+ Mapping.Offset = kNetBSD_ShadowOffset64;
+ } else if (IsPS4CPU)
+ Mapping.Offset = kPS4CPU_ShadowOffset64;
+ else if (IsLinux && IsX86_64) {
+ if (IsKasan)
+ Mapping.Offset = kLinuxKasan_ShadowOffset64;
+ else
+ Mapping.Offset = (kSmallX86_64ShadowOffsetBase &
+ (kSmallX86_64ShadowOffsetAlignMask << Mapping.Scale));
+ } else if (IsWindows && IsX86_64) {
+ Mapping.Offset = kWindowsShadowOffset64;
+ } else if (IsMIPS64)
+ Mapping.Offset = kMIPS64_ShadowOffset64;
+ else if (IsIOS)
+ Mapping.Offset = kDynamicShadowSentinel;
else if (IsMacOS && IsAArch64)
Mapping.Offset = kDynamicShadowSentinel;
- else if (IsAArch64)
- Mapping.Offset = kAArch64_ShadowOffset64;
+ else if (IsAArch64)
+ Mapping.Offset = kAArch64_ShadowOffset64;
else if (IsRISCV64)
Mapping.Offset = kRISCV64_ShadowOffset64;
- else
- Mapping.Offset = kDefaultShadowOffset64;
- }
-
- if (ClForceDynamicShadow) {
- Mapping.Offset = kDynamicShadowSentinel;
- }
-
- if (ClMappingOffset.getNumOccurrences() > 0) {
- Mapping.Offset = ClMappingOffset;
- }
-
- // OR-ing the shadow offset is more efficient (at least on x86) if the
- // offset is a power of two, but on ppc64 we have to use add since the
- // shadow offset is not necessarily 1/8-th of the address space. On SystemZ,
- // we could OR the constant in a single instruction, but it's more
- // efficient to load it once and use indexed addressing.
- Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+ else
+ Mapping.Offset = kDefaultShadowOffset64;
+ }
+
+ if (ClForceDynamicShadow) {
+ Mapping.Offset = kDynamicShadowSentinel;
+ }
+
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ Mapping.Offset = ClMappingOffset;
+ }
+
+ // OR-ing the shadow offset is more efficient (at least on x86) if the
+ // offset is a power of two, but on ppc64 we have to use add since the
+ // shadow offset is not necessarily 1/8-th of the address space. On SystemZ,
+ // we could OR the constant in a single instruction, but it's more
+ // efficient to load it once and use indexed addressing.
+ Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
!IsRISCV64 &&
- !(Mapping.Offset & (Mapping.Offset - 1)) &&
- Mapping.Offset != kDynamicShadowSentinel;
- bool IsAndroidWithIfuncSupport =
- IsAndroid && !TargetTriple.isAndroidVersionLT(21);
- Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb;
-
- return Mapping;
-}
-
-static uint64_t getRedzoneSizeForScale(int MappingScale) {
- // Redzone used for stack and globals is at least 32 bytes.
- // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
- return std::max(32U, 1U << MappingScale);
-}
-
-static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
- if (TargetTriple.isOSEmscripten()) {
- return kAsanEmscriptenCtorAndDtorPriority;
- } else {
- return kAsanCtorAndDtorPriority;
- }
-}
-
-namespace {
-
-/// Module analysis for getting various metadata about the module.
-class ASanGlobalsMetadataWrapperPass : public ModulePass {
-public:
- static char ID;
-
- ASanGlobalsMetadataWrapperPass() : ModulePass(ID) {
- initializeASanGlobalsMetadataWrapperPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- GlobalsMD = GlobalsMetadata(M);
- return false;
- }
-
- StringRef getPassName() const override {
- return "ASanGlobalsMetadataWrapperPass";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- GlobalsMetadata &getGlobalsMD() { return GlobalsMD; }
-
-private:
- GlobalsMetadata GlobalsMD;
-};
-
-char ASanGlobalsMetadataWrapperPass::ID = 0;
-
-/// AddressSanitizer: instrument the code in module to find memory bugs.
-struct AddressSanitizer {
- AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
- bool CompileKernel = false, bool Recover = false,
- bool UseAfterScope = false)
- : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
- : CompileKernel),
- Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
- UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
- C = &(M.getContext());
- LongSize = M.getDataLayout().getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- TargetTriple = Triple(M.getTargetTriple());
-
- Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
- }
-
- uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
- uint64_t ArraySize = 1;
- if (AI.isArrayAllocation()) {
- const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
- assert(CI && "non-constant array size");
- ArraySize = CI->getZExtValue();
- }
- Type *Ty = AI.getAllocatedType();
- uint64_t SizeInBytes =
- AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
- return SizeInBytes * ArraySize;
- }
-
- /// Check if we want (and can) handle this alloca.
- bool isInterestingAlloca(const AllocaInst &AI);
-
- bool ignoreAccess(Value *Ptr);
- void getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
-
- void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
- InterestingMemoryOperand &O, bool UseCalls,
- const DataLayout &DL);
- void instrumentPointerComparisonOrSubtraction(Instruction *I);
- void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
- Value *Addr, uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls, uint32_t Exp);
- void instrumentUnusualSizeOrAlignment(Instruction *I,
- Instruction *InsertBefore, Value *Addr,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp);
- Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
- Value *ShadowValue, uint32_t TypeSize);
- Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
- bool IsWrite, size_t AccessSizeIndex,
- Value *SizeArgument, uint32_t Exp);
- void instrumentMemIntrinsic(MemIntrinsic *MI);
- Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
- bool suppressInstrumentationSiteForDebug(int &Instrumented);
- bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI);
- bool maybeInsertAsanInitAtFunctionEntry(Function &F);
- bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
- void markEscapedLocalAllocas(Function &F);
-
-private:
- friend struct FunctionStackPoisoner;
-
- void initializeCallbacks(Module &M);
-
- bool LooksLikeCodeInBug11395(Instruction *I);
- bool GlobalIsLinkerInitialized(GlobalVariable *G);
- bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
- uint64_t TypeSize) const;
-
- /// Helper to clean up per-function state.
- struct FunctionStateRAII {
- AddressSanitizer *Pass;
-
- FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
- assert(Pass->ProcessedAllocas.empty() &&
- "last pass forgot to clear cache");
- assert(!Pass->LocalDynamicShadow);
- }
-
- ~FunctionStateRAII() {
- Pass->LocalDynamicShadow = nullptr;
- Pass->ProcessedAllocas.clear();
- }
- };
-
- LLVMContext *C;
- Triple TargetTriple;
- int LongSize;
- bool CompileKernel;
- bool Recover;
- bool UseAfterScope;
- Type *IntptrTy;
- ShadowMapping Mapping;
- FunctionCallee AsanHandleNoReturnFunc;
- FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
- Constant *AsanShadowGlobal;
-
- // These arrays are indexed by AccessIsWrite, Experiment and log2(AccessSize).
- FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes];
- FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
-
- // These arrays are indexed by AccessIsWrite and Experiment.
- FunctionCallee AsanErrorCallbackSized[2][2];
- FunctionCallee AsanMemoryAccessCallbackSized[2][2];
-
- FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
- Value *LocalDynamicShadow = nullptr;
- const GlobalsMetadata &GlobalsMD;
- DenseMap<const AllocaInst *, bool> ProcessedAllocas;
-};
-
-class AddressSanitizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false,
- bool UseAfterScope = false)
- : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
- UseAfterScope(UseAfterScope) {
- initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "AddressSanitizerFunctionPass";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ASanGlobalsMetadataWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- GlobalsMetadata &GlobalsMD =
- getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover,
- UseAfterScope);
- return ASan.instrumentFunction(F, TLI);
- }
-
-private:
- bool CompileKernel;
- bool Recover;
- bool UseAfterScope;
-};
-
-class ModuleAddressSanitizer {
-public:
- ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
- bool CompileKernel = false, bool Recover = false,
- bool UseGlobalsGC = true, bool UseOdrIndicator = false)
- : GlobalsMD(*GlobalsMD),
- CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
- : CompileKernel),
- Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
- UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel),
- // Enable aliases as they should have no downside with ODR indicators.
- UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
- UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
- // Not a typo: ClWithComdat is almost completely pointless without
- // ClUseGlobalsGC (because then it only works on modules without
- // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
- // and both suffer from gold PR19002 for which UseGlobalsGC constructor
- // argument is designed as a workaround. Therefore, disable both
- // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
- // do globals-gc.
- UseCtorComdat(UseGlobalsGC && ClWithComdat && !this->CompileKernel) {
- C = &(M.getContext());
- int LongSize = M.getDataLayout().getPointerSizeInBits();
- IntptrTy = Type::getIntNTy(*C, LongSize);
- TargetTriple = Triple(M.getTargetTriple());
- Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
- }
-
- bool instrumentModule(Module &);
-
-private:
- void initializeCallbacks(Module &M);
-
- bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
- void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
- void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers,
- const std::string &UniqueModuleId);
- void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
- void
- InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
- ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers);
-
- GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
- StringRef OriginalName);
- void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
- StringRef InternalSuffix);
- Instruction *CreateAsanModuleDtor(Module &M);
-
- const GlobalVariable *getExcludedAliasedGlobal(const GlobalAlias &GA) const;
- bool shouldInstrumentGlobal(GlobalVariable *G) const;
- bool ShouldUseMachOGlobalsSection() const;
- StringRef getGlobalMetadataSection() const;
- void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
- void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
- uint64_t getMinRedzoneSizeForGlobal() const {
- return getRedzoneSizeForScale(Mapping.Scale);
- }
- uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const;
- int GetAsanVersion(const Module &M) const;
-
- const GlobalsMetadata &GlobalsMD;
- bool CompileKernel;
- bool Recover;
- bool UseGlobalsGC;
- bool UsePrivateAlias;
- bool UseOdrIndicator;
- bool UseCtorComdat;
- Type *IntptrTy;
- LLVMContext *C;
- Triple TargetTriple;
- ShadowMapping Mapping;
- FunctionCallee AsanPoisonGlobals;
- FunctionCallee AsanUnpoisonGlobals;
- FunctionCallee AsanRegisterGlobals;
- FunctionCallee AsanUnregisterGlobals;
- FunctionCallee AsanRegisterImageGlobals;
- FunctionCallee AsanUnregisterImageGlobals;
- FunctionCallee AsanRegisterElfGlobals;
- FunctionCallee AsanUnregisterElfGlobals;
-
- Function *AsanCtorFunction = nullptr;
- Function *AsanDtorFunction = nullptr;
-};
-
-class ModuleAddressSanitizerLegacyPass : public ModulePass {
-public:
- static char ID;
-
- explicit ModuleAddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false,
- bool UseGlobalGC = true,
- bool UseOdrIndicator = false)
- : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
- UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator) {
- initializeModuleAddressSanitizerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "ModuleAddressSanitizer"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ASanGlobalsMetadataWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- GlobalsMetadata &GlobalsMD =
- getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
- ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover,
- UseGlobalGC, UseOdrIndicator);
- return ASanModule.instrumentModule(M);
- }
-
-private:
- bool CompileKernel;
- bool Recover;
- bool UseGlobalGC;
- bool UseOdrIndicator;
-};
-
-// Stack poisoning does not play well with exception handling.
-// When an exception is thrown, we essentially bypass the code
- // that unpoisons the stack. This is why the run-time library has
- // to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
- // stack in the interceptor. This, however, does not work inside the
- // function that catches the exception, most likely because the
- // compiler hoists the load of the shadow value somewhere too high.
- // This causes asan to report a non-existent bug on 453.povray.
-// It sounds like an LLVM bug.
-struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
- Function &F;
- AddressSanitizer &ASan;
- DIBuilder DIB;
- LLVMContext *C;
- Type *IntptrTy;
- Type *IntptrPtrTy;
- ShadowMapping Mapping;
-
- SmallVector<AllocaInst *, 16> AllocaVec;
- SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
- SmallVector<Instruction *, 8> RetVec;
- unsigned StackAlignment;
-
- FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
- AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
- FunctionCallee AsanSetShadowFunc[0x100] = {};
- FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc;
- FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc;
-
- // Stores the location and arguments of a poisoning/unpoisoning call for an alloca.
- struct AllocaPoisonCall {
- IntrinsicInst *InsBefore;
- AllocaInst *AI;
- uint64_t Size;
- bool DoPoison;
- };
- SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
- SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
- bool HasUntracedLifetimeIntrinsic = false;
-
- SmallVector<AllocaInst *, 1> DynamicAllocaVec;
- SmallVector<IntrinsicInst *, 1> StackRestoreVec;
- AllocaInst *DynamicAllocaLayout = nullptr;
- IntrinsicInst *LocalEscapeCall = nullptr;
-
- bool HasInlineAsm = false;
- bool HasReturnsTwiceCall = false;
-
- FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
- : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
- C(ASan.C), IntptrTy(ASan.IntptrTy),
- IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
- StackAlignment(1 << Mapping.Scale) {}
-
- bool runOnFunction() {
- if (!ClStack) return false;
-
- if (ClRedzoneByvalArgs)
- copyArgsPassedByValToAllocas();
-
- // Collect alloca, ret, lifetime instructions etc.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
-
- if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
-
- initializeCallbacks(*F.getParent());
-
- if (HasUntracedLifetimeIntrinsic) {
- // If there are lifetime intrinsics which couldn't be traced back to an
- // alloca, we may not know exactly when a variable enters scope, and
- // therefore should "fail safe" by not poisoning them.
- StaticAllocaPoisonCallVec.clear();
- DynamicAllocaPoisonCallVec.clear();
- }
-
- processDynamicAllocas();
- processStaticAllocas();
-
- if (ClDebugStack) {
- LLVM_DEBUG(dbgs() << F);
- }
- return true;
- }
-
- // Arguments marked with the "byval" attribute are implicitly copied without
- // using an alloca instruction. To produce redzones for those arguments, we
- // copy them a second time into memory allocated with an alloca instruction.
- void copyArgsPassedByValToAllocas();
-
- // Finds all Alloca instructions and puts
- // poisoned red zones around all of them.
- // Then unpoison everything back before the function returns.
- void processStaticAllocas();
- void processDynamicAllocas();
-
- void createDynamicAllocasInitStorage();
-
- // ----------------------- Visitors.
+ !(Mapping.Offset & (Mapping.Offset - 1)) &&
+ Mapping.Offset != kDynamicShadowSentinel;
+ bool IsAndroidWithIfuncSupport =
+ IsAndroid && !TargetTriple.isAndroidVersionLT(21);
+ Mapping.InGlobal = ClWithIfunc && IsAndroidWithIfuncSupport && IsArmOrThumb;
+
+ return Mapping;
+}
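As an illustrative aside (not code from this file): the mapping computed above is what memToShadow later turns into IR, shifting the application address right by Mapping.Scale and then adding or OR-ing Mapping.Offset. A minimal standalone sketch of that arithmetic, assuming the default userspace x86-64 Linux values (Scale = 3, Offset = 0x7fff8000):

#include <cstdint>
#include <cstdio>

// shadow = (addr >> Scale) op Offset, where op is OR when the offset is a
// power of two (cheaper on x86) and ADD otherwise.
static uint64_t applyShadowMapping(uint64_t Addr, int Scale, uint64_t Offset,
                                   bool OrShadowOffset) {
  uint64_t Shifted = Addr >> Scale;
  return OrShadowOffset ? (Shifted | Offset) : (Shifted + Offset);
}

int main() {
  const int Scale = 3;                      // assumed default shadow scale
  const uint64_t Offset = 0x7fff8000ULL;    // assumed x86-64 Linux userspace offset
  const uint64_t Addr = 0x602000000010ULL;  // hypothetical heap address
  std::printf("shadow byte at 0x%llx\n",
              (unsigned long long)applyShadowMapping(Addr, Scale, Offset,
                                                     /*OrShadowOffset=*/false));
  return 0;
}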
+
+static uint64_t getRedzoneSizeForScale(int MappingScale) {
+ // Redzone used for stack and globals is at least 32 bytes.
+ // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
+ return std::max(32U, 1U << MappingScale);
+}
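A quick sanity check of the formula above (illustrative only, not from this file): 1 << 3 is 8, so the default scale still gets the 32-byte minimum, while scales 6 and 7 cross it and yield 64 and 128 bytes, as the comment states.

#include <algorithm>
#include <cassert>

// Stand-in for getRedzoneSizeForScale, using the same formula as above.
static unsigned redzoneForScale(int MappingScale) {
  return std::max(32U, 1U << MappingScale);
}

int main() {
  assert(redzoneForScale(3) == 32);  // default scale: the 32-byte minimum wins
  assert(redzoneForScale(6) == 64);
  assert(redzoneForScale(7) == 128);
  return 0;
}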
+
+static uint64_t GetCtorAndDtorPriority(Triple &TargetTriple) {
+ if (TargetTriple.isOSEmscripten()) {
+ return kAsanEmscriptenCtorAndDtorPriority;
+ } else {
+ return kAsanCtorAndDtorPriority;
+ }
+}
+
+namespace {
+
+/// Module analysis for getting various metadata about the module.
+class ASanGlobalsMetadataWrapperPass : public ModulePass {
+public:
+ static char ID;
+
+ ASanGlobalsMetadataWrapperPass() : ModulePass(ID) {
+ initializeASanGlobalsMetadataWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ GlobalsMD = GlobalsMetadata(M);
+ return false;
+ }
+
+ StringRef getPassName() const override {
+ return "ASanGlobalsMetadataWrapperPass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ GlobalsMetadata &getGlobalsMD() { return GlobalsMD; }
+
+private:
+ GlobalsMetadata GlobalsMD;
+};
+
+char ASanGlobalsMetadataWrapperPass::ID = 0;
+
+/// AddressSanitizer: instrument the code in module to find memory bugs.
+struct AddressSanitizer {
+ AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
+ bool CompileKernel = false, bool Recover = false,
+ bool UseAfterScope = false)
+ : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
+ C = &(M.getContext());
+ LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
+ }
+
+ uint64_t getAllocaSizeInBytes(const AllocaInst &AI) const {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes =
+ AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+ }
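Illustrative aside (not from this file): the value above is the alloc size of the allocated type times the constant array operand; for a hypothetical alloca of [4 x i32] with array size 2 that is 16 * 2 = 32 bytes.

#include <cassert>
#include <cstdint>

// Mirrors getAllocaSizeInBytes for a constant array size operand.
static uint64_t allocaSizeInBytes(uint64_t TypeAllocSize, uint64_t ArraySize) {
  return TypeAllocSize * ArraySize;
}

int main() {
  assert(allocaSizeInBytes(16, 2) == 32);  // [4 x i32], array size 2
  return 0;
}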
+
+ /// Check if we want (and can) handle this alloca.
+ bool isInterestingAlloca(const AllocaInst &AI);
+
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+ void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ InterestingMemoryOperand &O, bool UseCalls,
+ const DataLayout &DL);
+ void instrumentPointerComparisonOrSubtraction(Instruction *I);
+ void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
+ Value *Addr, uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls, uint32_t Exp);
+ void instrumentUnusualSizeOrAlignment(Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp);
+ Value *createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue, uint32_t TypeSize);
+ Instruction *generateCrashCode(Instruction *InsertBefore, Value *Addr,
+ bool IsWrite, size_t AccessSizeIndex,
+ Value *SizeArgument, uint32_t Exp);
+ void instrumentMemIntrinsic(MemIntrinsic *MI);
+ Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool suppressInstrumentationSiteForDebug(int &Instrumented);
+ bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI);
+ bool maybeInsertAsanInitAtFunctionEntry(Function &F);
+ bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+ void markEscapedLocalAllocas(Function &F);
+
+private:
+ friend struct FunctionStackPoisoner;
+
+ void initializeCallbacks(Module &M);
+
+ bool LooksLikeCodeInBug11395(Instruction *I);
+ bool GlobalIsLinkerInitialized(GlobalVariable *G);
+ bool isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis, Value *Addr,
+ uint64_t TypeSize) const;
+
+ /// Helper to clean up per-function state.
+ struct FunctionStateRAII {
+ AddressSanitizer *Pass;
+
+ FunctionStateRAII(AddressSanitizer *Pass) : Pass(Pass) {
+ assert(Pass->ProcessedAllocas.empty() &&
+ "last pass forgot to clear cache");
+ assert(!Pass->LocalDynamicShadow);
+ }
+
+ ~FunctionStateRAII() {
+ Pass->LocalDynamicShadow = nullptr;
+ Pass->ProcessedAllocas.clear();
+ }
+ };
+
+ LLVMContext *C;
+ Triple TargetTriple;
+ int LongSize;
+ bool CompileKernel;
+ bool Recover;
+ bool UseAfterScope;
+ Type *IntptrTy;
+ ShadowMapping Mapping;
+ FunctionCallee AsanHandleNoReturnFunc;
+ FunctionCallee AsanPtrCmpFunction, AsanPtrSubFunction;
+ Constant *AsanShadowGlobal;
+
+ // These arrays are indexed by AccessIsWrite, Experiment and log2(AccessSize).
+ FunctionCallee AsanErrorCallback[2][2][kNumberOfAccessSizes];
+ FunctionCallee AsanMemoryAccessCallback[2][2][kNumberOfAccessSizes];
+
+ // These arrays are indexed by AccessIsWrite and Experiment.
+ FunctionCallee AsanErrorCallbackSized[2][2];
+ FunctionCallee AsanMemoryAccessCallbackSized[2][2];
+
+ FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
+ Value *LocalDynamicShadow = nullptr;
+ const GlobalsMetadata &GlobalsMD;
+ DenseMap<const AllocaInst *, bool> ProcessedAllocas;
+};
+
+class AddressSanitizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ explicit AddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false,
+ bool UseAfterScope = false)
+ : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover),
+ UseAfterScope(UseAfterScope) {
+ initializeAddressSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "AddressSanitizerFunctionPass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ASanGlobalsMetadataWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ GlobalsMetadata &GlobalsMD =
+ getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AddressSanitizer ASan(*F.getParent(), &GlobalsMD, CompileKernel, Recover,
+ UseAfterScope);
+ return ASan.instrumentFunction(F, TLI);
+ }
+
+private:
+ bool CompileKernel;
+ bool Recover;
+ bool UseAfterScope;
+};
+
+class ModuleAddressSanitizer {
+public:
+ ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
+ bool CompileKernel = false, bool Recover = false,
+ bool UseGlobalsGC = true, bool UseOdrIndicator = false)
+ : GlobalsMD(*GlobalsMD),
+ CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel),
+ // Enable aliases as they should have no downside with ODR indicators.
+ UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
+ UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
+ // Not a typo: ClWithComdat is almost completely pointless without
+ // ClUseGlobalsGC (because then it only works on modules without
+ // globals, which are rare); it is a prerequisite for ClUseGlobalsGC;
+ // and both suffer from gold PR19002 for which UseGlobalsGC constructor
+ // argument is designed as a workaround. Therefore, disable both
+ // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
+ // do globals-gc.
+ UseCtorComdat(UseGlobalsGC && ClWithComdat && !this->CompileKernel) {
+ C = &(M.getContext());
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ IntptrTy = Type::getIntNTy(*C, LongSize);
+ TargetTriple = Triple(M.getTargetTriple());
+ Mapping = getShadowMapping(TargetTriple, LongSize, this->CompileKernel);
+ }
+
+ bool instrumentModule(Module &);
+
+private:
+ void initializeCallbacks(Module &M);
+
+ bool InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool *CtorComdat);
+ void InstrumentGlobalsCOFF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void InstrumentGlobalsELF(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId);
+ void InstrumentGlobalsMachO(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+ void
+ InstrumentGlobalsWithMetadataArray(IRBuilder<> &IRB, Module &M,
+ ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers);
+
+ GlobalVariable *CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName);
+ void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
+ StringRef InternalSuffix);
+ Instruction *CreateAsanModuleDtor(Module &M);
+
+ const GlobalVariable *getExcludedAliasedGlobal(const GlobalAlias &GA) const;
+ bool shouldInstrumentGlobal(GlobalVariable *G) const;
+ bool ShouldUseMachOGlobalsSection() const;
+ StringRef getGlobalMetadataSection() const;
+ void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
+ void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
+ uint64_t getMinRedzoneSizeForGlobal() const {
+ return getRedzoneSizeForScale(Mapping.Scale);
+ }
+ uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const;
+ int GetAsanVersion(const Module &M) const;
+
+ const GlobalsMetadata &GlobalsMD;
+ bool CompileKernel;
+ bool Recover;
+ bool UseGlobalsGC;
+ bool UsePrivateAlias;
+ bool UseOdrIndicator;
+ bool UseCtorComdat;
+ Type *IntptrTy;
+ LLVMContext *C;
+ Triple TargetTriple;
+ ShadowMapping Mapping;
+ FunctionCallee AsanPoisonGlobals;
+ FunctionCallee AsanUnpoisonGlobals;
+ FunctionCallee AsanRegisterGlobals;
+ FunctionCallee AsanUnregisterGlobals;
+ FunctionCallee AsanRegisterImageGlobals;
+ FunctionCallee AsanUnregisterImageGlobals;
+ FunctionCallee AsanRegisterElfGlobals;
+ FunctionCallee AsanUnregisterElfGlobals;
+
+ Function *AsanCtorFunction = nullptr;
+ Function *AsanDtorFunction = nullptr;
+};
+
+class ModuleAddressSanitizerLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ explicit ModuleAddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false,
+ bool UseGlobalGC = true,
+ bool UseOdrIndicator = false)
+ : ModulePass(ID), CompileKernel(CompileKernel), Recover(Recover),
+ UseGlobalGC(UseGlobalGC), UseOdrIndicator(UseOdrIndicator) {
+ initializeModuleAddressSanitizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "ModuleAddressSanitizer"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ASanGlobalsMetadataWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ GlobalsMetadata &GlobalsMD =
+ getAnalysis<ASanGlobalsMetadataWrapperPass>().getGlobalsMD();
+ ModuleAddressSanitizer ASanModule(M, &GlobalsMD, CompileKernel, Recover,
+ UseGlobalGC, UseOdrIndicator);
+ return ASanModule.instrumentModule(M);
+ }
+
+private:
+ bool CompileKernel;
+ bool Recover;
+ bool UseGlobalGC;
+ bool UseOdrIndicator;
+};
+
+// Stack poisoning does not play well with exception handling.
+// When an exception is thrown, we essentially bypass the code
+ // that unpoisons the stack. This is why the run-time library has
+ // to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire
+ // stack in the interceptor. This, however, does not work inside the
+ // function that catches the exception, most likely because the
+ // compiler hoists the load of the shadow value somewhere too high.
+ // This causes asan to report a non-existent bug on 453.povray.
+// It sounds like an LLVM bug.
+struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
+ Function &F;
+ AddressSanitizer &ASan;
+ DIBuilder DIB;
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *IntptrPtrTy;
+ ShadowMapping Mapping;
+
+ SmallVector<AllocaInst *, 16> AllocaVec;
+ SmallVector<AllocaInst *, 16> StaticAllocasToMoveUp;
+ SmallVector<Instruction *, 8> RetVec;
+ unsigned StackAlignment;
+
+ FunctionCallee AsanStackMallocFunc[kMaxAsanStackMallocSizeClass + 1],
+ AsanStackFreeFunc[kMaxAsanStackMallocSizeClass + 1];
+ FunctionCallee AsanSetShadowFunc[0x100] = {};
+ FunctionCallee AsanPoisonStackMemoryFunc, AsanUnpoisonStackMemoryFunc;
+ FunctionCallee AsanAllocaPoisonFunc, AsanAllocasUnpoisonFunc;
+
+ // Stores the location and arguments of a poisoning/unpoisoning call for an alloca.
+ struct AllocaPoisonCall {
+ IntrinsicInst *InsBefore;
+ AllocaInst *AI;
+ uint64_t Size;
+ bool DoPoison;
+ };
+ SmallVector<AllocaPoisonCall, 8> DynamicAllocaPoisonCallVec;
+ SmallVector<AllocaPoisonCall, 8> StaticAllocaPoisonCallVec;
+ bool HasUntracedLifetimeIntrinsic = false;
+
+ SmallVector<AllocaInst *, 1> DynamicAllocaVec;
+ SmallVector<IntrinsicInst *, 1> StackRestoreVec;
+ AllocaInst *DynamicAllocaLayout = nullptr;
+ IntrinsicInst *LocalEscapeCall = nullptr;
+
+ bool HasInlineAsm = false;
+ bool HasReturnsTwiceCall = false;
+
+ FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
+ : F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
+ C(ASan.C), IntptrTy(ASan.IntptrTy),
+ IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
+ StackAlignment(1 << Mapping.Scale) {}
+
+ bool runOnFunction() {
+ if (!ClStack) return false;
+
+ if (ClRedzoneByvalArgs)
+ copyArgsPassedByValToAllocas();
+
+ // Collect alloca, ret, lifetime instructions etc.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) visit(*BB);
+
+ if (AllocaVec.empty() && DynamicAllocaVec.empty()) return false;
+
+ initializeCallbacks(*F.getParent());
+
+ if (HasUntracedLifetimeIntrinsic) {
+ // If there are lifetime intrinsics which couldn't be traced back to an
+ // alloca, we may not know exactly when a variable enters scope, and
+ // therefore should "fail safe" by not poisoning them.
+ StaticAllocaPoisonCallVec.clear();
+ DynamicAllocaPoisonCallVec.clear();
+ }
+
+ processDynamicAllocas();
+ processStaticAllocas();
+
+ if (ClDebugStack) {
+ LLVM_DEBUG(dbgs() << F);
+ }
+ return true;
+ }
+
+ // Arguments marked with the "byval" attribute are implicitly copied without
+ // using an alloca instruction. To produce redzones for those arguments, we
+ // copy them a second time into memory allocated with an alloca instruction.
+ void copyArgsPassedByValToAllocas();
+
+ // Finds all Alloca instructions and puts
+ // poisoned red zones around all of them.
+ // Then unpoison everything back before the function returns.
+ void processStaticAllocas();
+ void processDynamicAllocas();
+
+ void createDynamicAllocasInitStorage();
+
+ // ----------------------- Visitors.
/// Collect all Ret instructions, or the musttail call instruction if it
/// precedes the return instruction.
void visitReturnInst(ReturnInst &RI) {
@@ -969,910 +969,910 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
else
RetVec.push_back(&RI);
}
-
- /// Collect all Resume instructions.
- void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
-
- /// Collect all CleanupReturnInst instructions.
- void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
-
- void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
- Value *SavedStack) {
- IRBuilder<> IRB(InstBefore);
- Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
- // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
- // need to adjust extracted SP to compute the address of the most recent
- // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
- // this purpose.
- if (!isa<ReturnInst>(InstBefore)) {
- Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
- InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
- {IntptrTy});
-
- Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
-
- DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
- DynamicAreaOffset);
- }
-
- IRB.CreateCall(
- AsanAllocasUnpoisonFunc,
- {IRB.CreateLoad(IntptrTy, DynamicAllocaLayout), DynamicAreaPtr});
- }
-
- // Unpoison dynamic allocas redzones.
- void unpoisonDynamicAllocas() {
+
+ /// Collect all Resume instructions.
+ void visitResumeInst(ResumeInst &RI) { RetVec.push_back(&RI); }
+
+ /// Collect all CleanupReturnInst instructions.
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) { RetVec.push_back(&CRI); }
+
+ void unpoisonDynamicAllocasBeforeInst(Instruction *InstBefore,
+ Value *SavedStack) {
+ IRBuilder<> IRB(InstBefore);
+ Value *DynamicAreaPtr = IRB.CreatePtrToInt(SavedStack, IntptrTy);
+ // When we insert _asan_allocas_unpoison before @llvm.stackrestore, we
+ // need to adjust extracted SP to compute the address of the most recent
+ // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
+ // this purpose.
+ if (!isa<ReturnInst>(InstBefore)) {
+ Function *DynamicAreaOffsetFunc = Intrinsic::getDeclaration(
+ InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
+ {IntptrTy});
+
+ Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
+
+ DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
+ DynamicAreaOffset);
+ }
+
+ IRB.CreateCall(
+ AsanAllocasUnpoisonFunc,
+ {IRB.CreateLoad(IntptrTy, DynamicAllocaLayout), DynamicAreaPtr});
+ }
+
+ // Unpoison dynamic allocas redzones.
+ void unpoisonDynamicAllocas() {
for (Instruction *Ret : RetVec)
- unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);
-
+ unpoisonDynamicAllocasBeforeInst(Ret, DynamicAllocaLayout);
+
for (Instruction *StackRestoreInst : StackRestoreVec)
- unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
- StackRestoreInst->getOperand(0));
- }
-
- // Deploy and poison redzones around dynamic alloca call. To do this, we
- // should replace this call with another one with changed parameters and
- // replace all its uses with new address, so
- // addr = alloca type, old_size, align
- // is replaced by
- // new_size = (old_size + additional_size) * sizeof(type)
- // tmp = alloca i8, new_size, max(align, 32)
- // addr = tmp + 32 (first 32 bytes are for the left redzone).
- // Additional_size is added so that the new allocation contains not only the
- // requested memory, but also the left, partial and right redzones.
- void handleDynamicAllocaCall(AllocaInst *AI);
-
- /// Collect Alloca instructions we want (and can) handle.
- void visitAllocaInst(AllocaInst &AI) {
- if (!ASan.isInterestingAlloca(AI)) {
- if (AI.isStaticAlloca()) {
- // Skip over allocas that are present *before* the first instrumented
- // alloca, we don't want to move those around.
- if (AllocaVec.empty())
- return;
-
- StaticAllocasToMoveUp.push_back(&AI);
- }
- return;
- }
-
- StackAlignment = std::max(StackAlignment, AI.getAlignment());
- if (!AI.isStaticAlloca())
- DynamicAllocaVec.push_back(&AI);
- else
- AllocaVec.push_back(&AI);
- }
-
- /// Collect lifetime intrinsic calls to check for use-after-scope
- /// errors.
- void visitIntrinsicInst(IntrinsicInst &II) {
- Intrinsic::ID ID = II.getIntrinsicID();
- if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
- if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
- if (!ASan.UseAfterScope)
- return;
- if (!II.isLifetimeStartOrEnd())
- return;
- // Found lifetime intrinsic, add ASan instrumentation if necessary.
- auto *Size = cast<ConstantInt>(II.getArgOperand(0));
- // If size argument is undefined, don't do anything.
- if (Size->isMinusOne()) return;
- // Check that size doesn't saturate uint64_t and can
- // be stored in IntptrTy.
- const uint64_t SizeValue = Size->getValue().getLimitedValue();
- if (SizeValue == ~0ULL ||
- !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
- return;
- // Find alloca instruction that corresponds to llvm.lifetime argument.
+ unpoisonDynamicAllocasBeforeInst(StackRestoreInst,
+ StackRestoreInst->getOperand(0));
+ }
+
+ // Deploy and poison redzones around dynamic alloca call. To do this, we
+ // should replace this call with another one with changed parameters and
+ // replace all its uses with new address, so
+ // addr = alloca type, old_size, align
+ // is replaced by
+ // new_size = (old_size + additional_size) * sizeof(type)
+ // tmp = alloca i8, new_size, max(align, 32)
+ // addr = tmp + 32 (first 32 bytes are for the left redzone).
+ // Additional_size is added so that the new allocation contains not only the
+ // requested memory, but also the left, partial and right redzones.
+ void handleDynamicAllocaCall(AllocaInst *AI);
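Illustrative aside (not from this file): in plain numbers, the rewrite described above reserves a 32-byte left redzone in front of the user data and a right redzone behind it, then hands out the raw pointer plus 32. The sketch below only models that layout; the real handleDynamicAllocaCall builds the equivalent computation as LLVM IR and also deals with alignment and partial-redzone rounding, which are omitted here.

#include <cstdint>

struct DynAllocaLayout {
  uint64_t TotalSize;   // size given to the replacement "alloca i8"
  uint64_t UserOffset;  // user pointer = raw allocation + UserOffset
};

// Assumed layout: [ 32-byte left redzone | payload | right redzone ].
static DynAllocaLayout sketchDynAllocaLayout(uint64_t PayloadBytes,
                                             uint64_t RightRedzoneBytes) {
  const uint64_t LeftRedzoneBytes = 32;  // "first 32 bytes are for the left redzone"
  return {LeftRedzoneBytes + PayloadBytes + RightRedzoneBytes, LeftRedzoneBytes};
}

int main() {
  DynAllocaLayout L = sketchDynAllocaLayout(/*PayloadBytes=*/100,
                                            /*RightRedzoneBytes=*/32);
  return (L.TotalSize == 164 && L.UserOffset == 32) ? 0 : 1;
}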
+
+ /// Collect Alloca instructions we want (and can) handle.
+ void visitAllocaInst(AllocaInst &AI) {
+ if (!ASan.isInterestingAlloca(AI)) {
+ if (AI.isStaticAlloca()) {
+ // Skip over allocas that are present *before* the first instrumented
+ // alloca, we don't want to move those around.
+ if (AllocaVec.empty())
+ return;
+
+ StaticAllocasToMoveUp.push_back(&AI);
+ }
+ return;
+ }
+
+ StackAlignment = std::max(StackAlignment, AI.getAlignment());
+ if (!AI.isStaticAlloca())
+ DynamicAllocaVec.push_back(&AI);
+ else
+ AllocaVec.push_back(&AI);
+ }
+
+ /// Collect lifetime intrinsic calls to check for use-after-scope
+ /// errors.
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ Intrinsic::ID ID = II.getIntrinsicID();
+ if (ID == Intrinsic::stackrestore) StackRestoreVec.push_back(&II);
+ if (ID == Intrinsic::localescape) LocalEscapeCall = &II;
+ if (!ASan.UseAfterScope)
+ return;
+ if (!II.isLifetimeStartOrEnd())
+ return;
+ // Found lifetime intrinsic, add ASan instrumentation if necessary.
+ auto *Size = cast<ConstantInt>(II.getArgOperand(0));
+ // If size argument is undefined, don't do anything.
+ if (Size->isMinusOne()) return;
+ // Check that size doesn't saturate uint64_t and can
+ // be stored in IntptrTy.
+ const uint64_t SizeValue = Size->getValue().getLimitedValue();
+ if (SizeValue == ~0ULL ||
+ !ConstantInt::isValueValidForType(IntptrTy, SizeValue))
+ return;
+ // Find alloca instruction that corresponds to llvm.lifetime argument.
// Currently we can only handle lifetime markers pointing to the
// beginning of the alloca.
AllocaInst *AI = findAllocaForValue(II.getArgOperand(1), true);
- if (!AI) {
- HasUntracedLifetimeIntrinsic = true;
- return;
- }
- // We're interested only in allocas we can handle.
- if (!ASan.isInterestingAlloca(*AI))
- return;
- bool DoPoison = (ID == Intrinsic::lifetime_end);
- AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
- if (AI->isStaticAlloca())
- StaticAllocaPoisonCallVec.push_back(APC);
- else if (ClInstrumentDynamicAllocas)
- DynamicAllocaPoisonCallVec.push_back(APC);
- }
-
- void visitCallBase(CallBase &CB) {
- if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
- HasInlineAsm |= CI->isInlineAsm() && &CB != ASan.LocalDynamicShadow;
- HasReturnsTwiceCall |= CI->canReturnTwice();
- }
- }
-
- // ---------------------- Helpers.
- void initializeCallbacks(Module &M);
-
- // Copies bytes from ShadowBytes into shadow memory for indexes where
- // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
- // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
- void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase);
- void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End, IRBuilder<> &IRB,
- Value *ShadowBase);
- void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes, size_t Begin,
- size_t End, IRBuilder<> &IRB, Value *ShadowBase);
-
- void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
-
- Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
- bool Dynamic);
- PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
- Instruction *ThenTerm, Value *ValueIfFalse);
-};
-
-} // end anonymous namespace
-
-void LocationMetadata::parse(MDNode *MDN) {
- assert(MDN->getNumOperands() == 3);
- MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
- Filename = DIFilename->getString();
- LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
- ColumnNo =
- mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
-}
-
-// FIXME: It would be cleaner to attach the relevant metadata to the globals we
-// want to sanitize and read it on each pass over a function, instead of
-// reading module-level metadata up front.
-GlobalsMetadata::GlobalsMetadata(Module &M) {
- NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
- if (!Globals)
- return;
- for (auto MDN : Globals->operands()) {
- // Metadata node contains the global and the fields of "Entry".
- assert(MDN->getNumOperands() == 5);
- auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0));
- // The optimizer may optimize away a global entirely.
- if (!V)
- continue;
- auto *StrippedV = V->stripPointerCasts();
- auto *GV = dyn_cast<GlobalVariable>(StrippedV);
- if (!GV)
- continue;
- // We can already have an entry for GV if it was merged with another
- // global.
- Entry &E = Entries[GV];
- if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
- E.SourceLoc.parse(Loc);
- if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
- E.Name = Name->getString();
- ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3));
- E.IsDynInit |= IsDynInit->isOne();
- ConstantInt *IsExcluded =
- mdconst::extract<ConstantInt>(MDN->getOperand(4));
- E.IsExcluded |= IsExcluded->isOne();
- }
-}
-
-AnalysisKey ASanGlobalsMetadataAnalysis::Key;
-
-GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M,
- ModuleAnalysisManager &AM) {
- return GlobalsMetadata(M);
-}
-
-AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
- bool UseAfterScope)
- : CompileKernel(CompileKernel), Recover(Recover),
- UseAfterScope(UseAfterScope) {}
-
-PreservedAnalyses AddressSanitizerPass::run(Function &F,
- AnalysisManager<Function> &AM) {
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- Module &M = *F.getParent();
- if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
- const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
- AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope);
- if (Sanitizer.instrumentFunction(F, TLI))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
- }
-
- report_fatal_error(
- "The ASanGlobalsMetadataAnalysis is required to run before "
- "AddressSanitizer can run");
- return PreservedAnalyses::all();
-}
-
-ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel,
- bool Recover,
- bool UseGlobalGC,
- bool UseOdrIndicator)
- : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC),
- UseOdrIndicator(UseOdrIndicator) {}
-
-PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M,
- AnalysisManager<Module> &AM) {
- GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M);
- ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover,
- UseGlobalGC, UseOdrIndicator);
- if (Sanitizer.instrumentModule(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md",
- "Read metadata to mark which globals should be instrumented "
- "when running ASan.",
- false, true)
-
-char AddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
- AddressSanitizerLegacyPass, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- AddressSanitizerLegacyPass, "asan",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
- false)
-
-FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
- bool Recover,
- bool UseAfterScope) {
- assert(!CompileKernel || Recover);
- return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
-}
-
-char ModuleAddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS(
- ModuleAddressSanitizerLegacyPass, "asan-module",
- "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
- "ModulePass",
- false, false)
-
-ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass(
- bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator) {
- assert(!CompileKernel || Recover);
- return new ModuleAddressSanitizerLegacyPass(CompileKernel, Recover,
- UseGlobalsGC, UseOdrIndicator);
-}
-
-static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
- size_t Res = countTrailingZeros(TypeSize / 8);
- assert(Res < kNumberOfAccessSizes);
- return Res;
-}
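Illustrative aside (not from this file): the index above is log2 of the access size in bytes, which selects a slot in the callback arrays indexed by log2(AccessSize) (1-byte accesses map to slot 0, 16-byte accesses to slot 4). A portable stand-in:

#include <cassert>
#include <cstddef>
#include <cstdint>

// Count trailing zero bits of the access size in bytes (all interesting sizes
// are powers of two), mirroring countTrailingZeros(TypeSize / 8).
static size_t typeSizeToSizeIndex(uint32_t TypeSizeInBits) {
  uint32_t Bytes = TypeSizeInBits / 8;
  size_t Index = 0;
  while ((Bytes & 1) == 0) {
    Bytes >>= 1;
    ++Index;
  }
  return Index;
}

int main() {
  assert(typeSizeToSizeIndex(8) == 0);    // 1-byte access
  assert(typeSizeToSizeIndex(32) == 2);   // 4-byte access
  assert(typeSizeToSizeIndex(128) == 4);  // 16-byte access
  return 0;
}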
-
-/// Create a global describing a source location.
-static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
- LocationMetadata MD) {
- Constant *LocData[] = {
- createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
- ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
- ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
- };
- auto LocStruct = ConstantStruct::getAnon(LocData);
- auto GV = new GlobalVariable(M, LocStruct->getType(), true,
- GlobalValue::PrivateLinkage, LocStruct,
- kAsanGenPrefix);
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- return GV;
-}
-
-/// Check if \p G has been created by a trusted compiler pass.
-static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
- // Do not instrument @llvm.global_ctors, @llvm.used, etc.
- if (G->getName().startswith("llvm."))
- return true;
-
- // Do not instrument asan globals.
- if (G->getName().startswith(kAsanGenPrefix) ||
- G->getName().startswith(kSanCovGenPrefix) ||
- G->getName().startswith(kODRGenPrefix))
- return true;
-
- // Do not instrument gcov counter arrays.
- if (G->getName() == "__llvm_gcov_ctr")
- return true;
-
- return false;
-}
-
-Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
- // Shadow >> scale
- Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
- if (Mapping.Offset == 0) return Shadow;
- // (Shadow >> scale) | offset
- Value *ShadowBase;
- if (LocalDynamicShadow)
- ShadowBase = LocalDynamicShadow;
- else
- ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
- if (Mapping.OrShadowOffset)
- return IRB.CreateOr(Shadow, ShadowBase);
- else
- return IRB.CreateAdd(Shadow, ShadowBase);
-}
-
-// Instrument memset/memmove/memcpy
-void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
- IRBuilder<> IRB(MI);
- if (isa<MemTransferInst>(MI)) {
- IRB.CreateCall(
- isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- } else if (isa<MemSetInst>(MI)) {
- IRB.CreateCall(
- AsanMemset,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- }
- MI->eraseFromParent();
-}
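Illustrative aside (not from this file): conceptually the rewrite above swaps the plain intrinsics for their sanitizer-runtime counterparts, which check the whole accessed range against shadow memory before doing the work, so memcpy(dst, src, n) becomes __asan_memcpy(dst, src, n), and likewise for memmove and memset. The declarations below only show the rough shape of that runtime interface; the exact parameter types are an assumption here, not something this file defines.

#include <cstddef>

// Runtime entry points targeted by the instrumented calls (types assumed).
extern "C" void *__asan_memcpy(void *dst, const void *src, size_t size);
extern "C" void *__asan_memmove(void *dst, const void *src, size_t size);
extern "C" void *__asan_memset(void *dst, int value, size_t size);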
-
-/// Check if we want (and can) handle this alloca.
-bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
- auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
-
- if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
- return PreviouslySeenAllocaInfo->getSecond();
-
- bool IsInteresting =
- (AI.getAllocatedType()->isSized() &&
- // alloca() may be called with 0 size, ignore it.
- ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
- // We are only interested in allocas not promotable to registers.
- // Promotable allocas are common under -O0.
- (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
- // inalloca allocas are not treated as static, and we don't want
- // dynamic alloca instrumentation for them as well.
- !AI.isUsedWithInAlloca() &&
- // swifterror allocas are register promoted by ISel
- !AI.isSwiftError());
-
- ProcessedAllocas[&AI] = IsInteresting;
- return IsInteresting;
-}
-
-bool AddressSanitizer::ignoreAccess(Value *Ptr) {
- // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return true;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (Ptr->isSwiftError())
- return true;
-
- // Treat memory accesses to promotable allocas as non-interesting since they
- // will not cause memory violations. This greatly speeds up the instrumented
- // executable at -O0.
- if (auto AI = dyn_cast_or_null<AllocaInst>(Ptr))
- if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
- return true;
-
- return false;
-}
-
-void AddressSanitizer::getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
- // Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize"))
- return;
-
- // Do not instrument the load fetching the dynamic shadow address.
- if (LocalDynamicShadow == I)
- return;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
- LI->getType(), LI->getAlign());
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
- SI->getValueOperand()->getType(), SI->getAlign());
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
- return;
- Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
- RMW->getValOperand()->getType(), None);
- } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
- return;
- Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
- XCHG->getCompareOperand()->getType(), None);
- } else if (auto CI = dyn_cast<CallInst>(I)) {
- auto *F = CI->getCalledFunction();
- if (F && (F->getName().startswith("llvm.masked.load.") ||
- F->getName().startswith("llvm.masked.store."))) {
- bool IsWrite = F->getName().startswith("llvm.masked.store.");
- // Masked store has an initial operand for the value.
- unsigned OpOffset = IsWrite ? 1 : 0;
- if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
- return;
-
- auto BasePtr = CI->getOperand(OpOffset);
- if (ignoreAccess(BasePtr))
- return;
- auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
- MaybeAlign Alignment = Align(1);
- // Otherwise no alignment guarantees. We probably got Undef.
- if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- Alignment = Op->getMaybeAlignValue();
- Value *Mask = CI->getOperand(2 + OpOffset);
- Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
- } else {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
- if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
- continue;
- Type *Ty = CI->getParamByValType(ArgNo);
- Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
- }
- }
- }
-}
-
-static bool isPointerOperand(Value *V) {
- return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
-}
-
-// This is a rough heuristic; it may cause both false positives and
-// false negatives. The proper implementation requires cooperation with
-// the frontend.
-static bool isInterestingPointerComparison(Instruction *I) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
- if (!Cmp->isRelational())
- return false;
- } else {
- return false;
- }
- return isPointerOperand(I->getOperand(0)) &&
- isPointerOperand(I->getOperand(1));
-}
-
-// This is a rough heuristic; it may cause both false positives and
-// false negatives. The proper implementation requires cooperation with
-// the frontend.
-static bool isInterestingPointerSubtraction(Instruction *I) {
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() != Instruction::Sub)
- return false;
- } else {
- return false;
- }
- return isPointerOperand(I->getOperand(0)) &&
- isPointerOperand(I->getOperand(1));
-}
-
-bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
- // If a global variable does not have dynamic initialization we don't
- // have to instrument it. However, if a global does not have an initializer
- // at all, we assume it has a dynamic initializer (in another TU).
- //
- // FIXME: Metadata should be attached directly to the global instead
- // of being added to llvm.asan.globals.
- return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
-}
-
-void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
- Instruction *I) {
- IRBuilder<> IRB(I);
- FunctionCallee F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
- Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
- for (Value *&i : Param) {
- if (i->getType()->isPointerTy())
- i = IRB.CreatePointerCast(i, IntptrTy);
- }
- IRB.CreateCall(F, Param);
-}
-
-static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
- Instruction *InsertBefore, Value *Addr,
- MaybeAlign Alignment, unsigned Granularity,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp) {
- // Instrument a 1-, 2-, 4-, 8-, or 16- byte access with one check
- // if the data is properly aligned.
- if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
- TypeSize == 128) &&
- (!Alignment || *Alignment >= Granularity || *Alignment >= TypeSize / 8))
- return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
- nullptr, UseCalls, Exp);
- Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
- IsWrite, nullptr, UseCalls, Exp);
-}
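Illustrative aside (not from this file): the dispatch above, restated as a standalone predicate, assuming Granularity is the shadow granularity (1 << Mapping.Scale): only 1-, 2-, 4-, 8- or 16-byte accesses whose alignment is unspecified, at least the granularity, or at least the access size take the single-shadow-byte fast path; everything else goes through instrumentUnusualSizeOrAlignment.

#include <cstdint>

// Standalone restatement of the fast-path condition above; an alignment of 0
// means "no explicit alignment was recorded".
static bool usesSingleShadowCheck(uint32_t TypeSizeInBits,
                                  uint64_t AlignmentOrZero,
                                  unsigned Granularity) {
  bool PowerOfTwoSize = TypeSizeInBits == 8 || TypeSizeInBits == 16 ||
                        TypeSizeInBits == 32 || TypeSizeInBits == 64 ||
                        TypeSizeInBits == 128;
  bool AlignedEnough = AlignmentOrZero == 0 ||
                       AlignmentOrZero >= Granularity ||
                       AlignmentOrZero >= TypeSizeInBits / 8;
  return PowerOfTwoSize && AlignedEnough;
}

int main() {
  bool Fast = usesSingleShadowCheck(64, 8, 8);  // naturally aligned 8-byte load
  bool Slow = usesSingleShadowCheck(24, 1, 8);  // odd-sized access
  return (Fast && !Slow) ? 0 : 1;
}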
-
-static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
- const DataLayout &DL, Type *IntptrTy,
- Value *Mask, Instruction *I,
- Value *Addr, MaybeAlign Alignment,
- unsigned Granularity, uint32_t TypeSize,
- bool IsWrite, Value *SizeArgument,
- bool UseCalls, uint32_t Exp) {
- auto *VTy = cast<FixedVectorType>(
- cast<PointerType>(Addr->getType())->getElementType());
- uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
- unsigned Num = VTy->getNumElements();
- auto Zero = ConstantInt::get(IntptrTy, 0);
- for (unsigned Idx = 0; Idx < Num; ++Idx) {
- Value *InstrumentedAddress = nullptr;
- Instruction *InsertBefore = I;
- if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
- // dyn_cast as we might get UndefValue
- if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
- if (Masked->isZero())
- // Mask is constant false, so no instrumentation needed.
- continue;
- // If we have a true or undef value, fall through to doInstrumentAddress
- // with InsertBefore == I
- }
- } else {
- IRBuilder<> IRB(I);
- Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
- Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
- InsertBefore = ThenTerm;
- }
-
- IRBuilder<> IRB(InsertBefore);
- InstrumentedAddress =
- IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
- doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
- Granularity, ElemTypeSize, IsWrite, SizeArgument,
- UseCalls, Exp);
- }
-}
-
-void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
- InterestingMemoryOperand &O, bool UseCalls,
- const DataLayout &DL) {
- Value *Addr = O.getPtr();
-
- // Optimization experiments.
- // The experiments can be used to evaluate potential optimizations that remove
- // instrumentation (assess false negatives). Instead of completely removing
- // some instrumentation, you set Exp to a non-zero value (mask of optimization
- // experiments that want to remove instrumentation of this instruction).
- // If Exp is non-zero, this pass will emit special calls into runtime
- // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
- // make the runtime terminate the program in a special way (with a different
- // exit status). Then you run the new compiler on a buggy corpus, collect
- // the special terminations (ideally, you don't see them at all -- no false
- // negatives) and make the decision on the optimization.
- uint32_t Exp = ClForceExperiment;
-
- if (ClOpt && ClOptGlobals) {
- // If initialization order checking is disabled, a simple access to a
- // dynamically initialized global is always valid.
+ if (!AI) {
+ HasUntracedLifetimeIntrinsic = true;
+ return;
+ }
+ // We're interested only in allocas we can handle.
+ if (!ASan.isInterestingAlloca(*AI))
+ return;
+ bool DoPoison = (ID == Intrinsic::lifetime_end);
+ AllocaPoisonCall APC = {&II, AI, SizeValue, DoPoison};
+ if (AI->isStaticAlloca())
+ StaticAllocaPoisonCallVec.push_back(APC);
+ else if (ClInstrumentDynamicAllocas)
+ DynamicAllocaPoisonCallVec.push_back(APC);
+ }
+
+ void visitCallBase(CallBase &CB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
+ HasInlineAsm |= CI->isInlineAsm() && &CB != ASan.LocalDynamicShadow;
+ HasReturnsTwiceCall |= CI->canReturnTwice();
+ }
+ }
+
+ // ---------------------- Helpers.
+ void initializeCallbacks(Module &M);
+
+ // Copies bytes from ShadowBytes into shadow memory for indexes where
+ // ShadowMask is not zero. If ShadowMask[i] is zero, we assume that
+ // ShadowBytes[i] is constantly zero and doesn't need to be overwritten.
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase);
+ void copyToShadow(ArrayRef<uint8_t> ShadowMask, ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End, IRBuilder<> &IRB,
+ Value *ShadowBase);
+ void copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes, size_t Begin,
+ size_t End, IRBuilder<> &IRB, Value *ShadowBase);
+
+ void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> &IRB, bool DoPoison);
+
+ Value *createAllocaForLayout(IRBuilder<> &IRB, const ASanStackFrameLayout &L,
+ bool Dynamic);
+ PHINode *createPHI(IRBuilder<> &IRB, Value *Cond, Value *ValueIfTrue,
+ Instruction *ThenTerm, Value *ValueIfFalse);
+};
+
+} // end anonymous namespace
+
+void LocationMetadata::parse(MDNode *MDN) {
+ assert(MDN->getNumOperands() == 3);
+ MDString *DIFilename = cast<MDString>(MDN->getOperand(0));
+ Filename = DIFilename->getString();
+ LineNo = mdconst::extract<ConstantInt>(MDN->getOperand(1))->getLimitedValue();
+ ColumnNo =
+ mdconst::extract<ConstantInt>(MDN->getOperand(2))->getLimitedValue();
+}
+
+// FIXME: It would be cleaner to attach the relevant metadata directly to the
+// globals we want to sanitize and read it on each pass over a function,
+// instead of reading module-level metadata up front.
+GlobalsMetadata::GlobalsMetadata(Module &M) {
+ NamedMDNode *Globals = M.getNamedMetadata("llvm.asan.globals");
+ if (!Globals)
+ return;
+ for (auto MDN : Globals->operands()) {
+ // Metadata node contains the global and the fields of "Entry".
+ assert(MDN->getNumOperands() == 5);
+ auto *V = mdconst::extract_or_null<Constant>(MDN->getOperand(0));
+ // The optimizer may optimize away a global entirely.
+ if (!V)
+ continue;
+ auto *StrippedV = V->stripPointerCasts();
+ auto *GV = dyn_cast<GlobalVariable>(StrippedV);
+ if (!GV)
+ continue;
+ // We can already have an entry for GV if it was merged with another
+ // global.
+ Entry &E = Entries[GV];
+ if (auto *Loc = cast_or_null<MDNode>(MDN->getOperand(1)))
+ E.SourceLoc.parse(Loc);
+ if (auto *Name = cast_or_null<MDString>(MDN->getOperand(2)))
+ E.Name = Name->getString();
+ ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3));
+ E.IsDynInit |= IsDynInit->isOne();
+ ConstantInt *IsExcluded =
+ mdconst::extract<ConstantInt>(MDN->getOperand(4));
+ E.IsExcluded |= IsExcluded->isOne();
+ }
+}
+
+AnalysisKey ASanGlobalsMetadataAnalysis::Key;
+
+GlobalsMetadata ASanGlobalsMetadataAnalysis::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ return GlobalsMetadata(M);
+}
+
+AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
+ bool UseAfterScope)
+ : CompileKernel(CompileKernel), Recover(Recover),
+ UseAfterScope(UseAfterScope) {}
+
+PreservedAnalyses AddressSanitizerPass::run(Function &F,
+ AnalysisManager<Function> &AM) {
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ Module &M = *F.getParent();
+ if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
+ const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope);
+ if (Sanitizer.instrumentFunction(F, TLI))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+ }
+
+ report_fatal_error(
+ "The ASanGlobalsMetadataAnalysis is required to run before "
+ "AddressSanitizer can run");
+ return PreservedAnalyses::all();
+}
+
+ModuleAddressSanitizerPass::ModuleAddressSanitizerPass(bool CompileKernel,
+ bool Recover,
+ bool UseGlobalGC,
+ bool UseOdrIndicator)
+ : CompileKernel(CompileKernel), Recover(Recover), UseGlobalGC(UseGlobalGC),
+ UseOdrIndicator(UseOdrIndicator) {}
+
+PreservedAnalyses ModuleAddressSanitizerPass::run(Module &M,
+ AnalysisManager<Module> &AM) {
+ GlobalsMetadata &GlobalsMD = AM.getResult<ASanGlobalsMetadataAnalysis>(M);
+ ModuleAddressSanitizer Sanitizer(M, &GlobalsMD, CompileKernel, Recover,
+ UseGlobalGC, UseOdrIndicator);
+ if (Sanitizer.instrumentModule(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+INITIALIZE_PASS(ASanGlobalsMetadataWrapperPass, "asan-globals-md",
+ "Read metadata to mark which globals should be instrumented "
+ "when running ASan.",
+ false, true)
+
+char AddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ AddressSanitizerLegacyPass, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(ASanGlobalsMetadataWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ AddressSanitizerLegacyPass, "asan",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", false,
+ false)
+
+FunctionPass *llvm::createAddressSanitizerFunctionPass(bool CompileKernel,
+ bool Recover,
+ bool UseAfterScope) {
+ assert(!CompileKernel || Recover);
+ return new AddressSanitizerLegacyPass(CompileKernel, Recover, UseAfterScope);
+}
+
+char ModuleAddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS(
+ ModuleAddressSanitizerLegacyPass, "asan-module",
+ "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+ "ModulePass",
+ false, false)
+
+ModulePass *llvm::createModuleAddressSanitizerLegacyPassPass(
+ bool CompileKernel, bool Recover, bool UseGlobalsGC, bool UseOdrIndicator) {
+ assert(!CompileKernel || Recover);
+ return new ModuleAddressSanitizerLegacyPass(CompileKernel, Recover,
+ UseGlobalsGC, UseOdrIndicator);
+}
+
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = countTrailingZeros(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
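
TypeSizeToSizeIndex maps an access size in bits to the index used to select among the size-specific ASan callbacks (1-, 2-, 4-, 8-, and 16-byte variants). A minimal standalone sketch of that mapping, assuming the usual 1- through 16-byte access sizes and using __builtin_ctz as a stand-in for llvm::countTrailingZeros:

// Illustrative sketch only (not part of the pass).
#include <cassert>
#include <cstdint>

static size_t sizeIndexForBits(uint32_t TypeSizeInBits) {
  return __builtin_ctz(TypeSizeInBits / 8); // bytes 1,2,4,8,16 -> 0,1,2,3,4
}

int main() {
  assert(sizeIndexForBits(8) == 0);   // 1-byte access -> *1 callback
  assert(sizeIndexForBits(32) == 2);  // 4-byte access -> *4 callback
  assert(sizeIndexForBits(128) == 4); // 16-byte access -> *16 callback
  return 0;
}
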
+/// Create a global describing a source location.
+static GlobalVariable *createPrivateGlobalForSourceLoc(Module &M,
+ LocationMetadata MD) {
+ Constant *LocData[] = {
+ createPrivateGlobalForString(M, MD.Filename, true, kAsanGenPrefix),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.LineNo),
+ ConstantInt::get(Type::getInt32Ty(M.getContext()), MD.ColumnNo),
+ };
+ auto LocStruct = ConstantStruct::getAnon(LocData);
+ auto GV = new GlobalVariable(M, LocStruct->getType(), true,
+ GlobalValue::PrivateLinkage, LocStruct,
+ kAsanGenPrefix);
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ return GV;
+}
+
+/// Check if \p G has been created by a trusted compiler pass.
+static bool GlobalWasGeneratedByCompiler(GlobalVariable *G) {
+ // Do not instrument @llvm.global_ctors, @llvm.used, etc.
+ if (G->getName().startswith("llvm."))
+ return true;
+
+ // Do not instrument asan globals.
+ if (G->getName().startswith(kAsanGenPrefix) ||
+ G->getName().startswith(kSanCovGenPrefix) ||
+ G->getName().startswith(kODRGenPrefix))
+ return true;
+
+ // Do not instrument gcov counter arrays.
+ if (G->getName() == "__llvm_gcov_ctr")
+ return true;
+
+ return false;
+}
+
+Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) {
+ // Shadow >> scale
+ Shadow = IRB.CreateLShr(Shadow, Mapping.Scale);
+ if (Mapping.Offset == 0) return Shadow;
+ // (Shadow >> scale) | offset
+ Value *ShadowBase;
+ if (LocalDynamicShadow)
+ ShadowBase = LocalDynamicShadow;
+ else
+ ShadowBase = ConstantInt::get(IntptrTy, Mapping.Offset);
+ if (Mapping.OrShadowOffset)
+ return IRB.CreateOr(Shadow, ShadowBase);
+ else
+ return IRB.CreateAdd(Shadow, ShadowBase);
+}
+
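
memToShadow computes the classic ASan shadow address: shift the application address right by Mapping.Scale, then add (or OR in) Mapping.Offset, or the dynamically loaded shadow base when one is used. A minimal numeric sketch, assuming the common x86-64 Linux defaults of Scale = 3 and Offset = 0x7fff8000 (assumed values, not read from this file):

// Minimal model of the mapping emitted above; the pass may emit OR instead
// of ADD when Mapping.OrShadowOffset is set.
#include <cstdint>
#include <cstdio>

static uint64_t memToShadowModel(uint64_t Addr, unsigned Scale, uint64_t Offset) {
  return (Addr >> Scale) + Offset;
}

int main() {
  uint64_t Addr = 0x602000000010ULL;
  printf("shadow(0x%llx) = 0x%llx\n", (unsigned long long)Addr,
         (unsigned long long)memToShadowModel(Addr, 3, 0x7fff8000ULL));
  return 0;
}
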
+// Instrument memset/memmove/memcpy
+void AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemMoveInst>(MI) ? AsanMemmove : AsanMemcpy,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ } else if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ AsanMemset,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ }
+ MI->eraseFromParent();
+}
+
+/// Check if we want (and can) handle this alloca.
+bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ auto PreviouslySeenAllocaInfo = ProcessedAllocas.find(&AI);
+
+ if (PreviouslySeenAllocaInfo != ProcessedAllocas.end())
+ return PreviouslySeenAllocaInfo->getSecond();
+
+ bool IsInteresting =
+ (AI.getAllocatedType()->isSized() &&
+ // alloca() may be called with 0 size, ignore it.
+ ((!AI.isStaticAlloca()) || getAllocaSizeInBytes(AI) > 0) &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+
+ ProcessedAllocas[&AI] = IsInteresting;
+ return IsInteresting;
+}
+
+bool AddressSanitizer::ignoreAccess(Value *Ptr) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ // Treat memory accesses to promotable allocas as non-interesting since they
+ // will not cause memory violations. This greatly speeds up the instrumented
+ // executable at -O0.
+ if (auto AI = dyn_cast_or_null<AllocaInst>(Ptr))
+ if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
+ return true;
+
+ return false;
+}
+
+void AddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (I->hasMetadata("nosanitize"))
+ return;
+
+ // Do not instrument the load fetching the dynamic shadow address.
+ if (LocalDynamicShadow == I)
+ return;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ auto *F = CI->getCalledFunction();
+ if (F && (F->getName().startswith("llvm.masked.load.") ||
+ F->getName().startswith("llvm.masked.store."))) {
+ bool IsWrite = F->getName().startswith("llvm.masked.store.");
+ // Masked store has an initial operand for the value.
+ unsigned OpOffset = IsWrite ? 1 : 0;
+ if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
+ return;
+
+ auto BasePtr = CI->getOperand(OpOffset);
+ if (ignoreAccess(BasePtr))
+ return;
+ auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
+ MaybeAlign Alignment = Align(1);
+ // Otherwise no alignment guarantees. We probably got Undef.
+ if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+ Alignment = Op->getMaybeAlignValue();
+ Value *Mask = CI->getOperand(2 + OpOffset);
+ Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
+ } else {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
+ }
+ }
+}
+
+static bool isPointerOperand(Value *V) {
+ return V->getType()->isPointerTy() || isa<PtrToIntInst>(V);
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
+static bool isInterestingPointerComparison(Instruction *I) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(I)) {
+ if (!Cmp->isRelational())
+ return false;
+ } else {
+ return false;
+ }
+ return isPointerOperand(I->getOperand(0)) &&
+ isPointerOperand(I->getOperand(1));
+}
+
+// This is a rough heuristic; it may cause both false positives and
+// false negatives. The proper implementation requires cooperation with
+// the frontend.
+static bool isInterestingPointerSubtraction(Instruction *I) {
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() != Instruction::Sub)
+ return false;
+ } else {
+ return false;
+ }
+ return isPointerOperand(I->getOperand(0)) &&
+ isPointerOperand(I->getOperand(1));
+}
+
+bool AddressSanitizer::GlobalIsLinkerInitialized(GlobalVariable *G) {
+ // If a global variable does not have dynamic initialization we don't
+ // have to instrument it. However, if a global does not have an initializer
+ // at all, we assume it has a dynamic initializer (in another TU).
+ //
+ // FIXME: Metadata should be attached directly to the global instead
+ // of being added to llvm.asan.globals.
+ return G->hasInitializer() && !GlobalsMD.get(G).IsDynInit;
+}
+
+void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
+ Instruction *I) {
+ IRBuilder<> IRB(I);
+ FunctionCallee F = isa<ICmpInst>(I) ? AsanPtrCmpFunction : AsanPtrSubFunction;
+ Value *Param[2] = {I->getOperand(0), I->getOperand(1)};
+ for (Value *&i : Param) {
+ if (i->getType()->isPointerTy())
+ i = IRB.CreatePointerCast(i, IntptrTy);
+ }
+ IRB.CreateCall(F, Param);
+}
+
+static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
+ Instruction *InsertBefore, Value *Addr,
+ MaybeAlign Alignment, unsigned Granularity,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ // Instrument a 1-, 2-, 4-, 8-, or 16-byte access with one check
+ // if the data is properly aligned.
+ if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
+ TypeSize == 128) &&
+ (!Alignment || *Alignment >= Granularity || *Alignment >= TypeSize / 8))
+ return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
+ nullptr, UseCalls, Exp);
+ Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
+ IsWrite, nullptr, UseCalls, Exp);
+}
+
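
doInstrumentAddress takes the single-check fast path only for 1-, 2-, 4-, 8-, or 16-byte accesses whose known alignment rules out straddling a shadow granule; anything else goes through instrumentUnusualSizeOrAlignment. A hedged sketch of that predicate, with Alignment == 0 standing in for an unknown MaybeAlign:

// Sketch of the condition used above, not the pass itself.
#include <cassert>
#include <cstdint>

static bool singleCheckSuffices(uint32_t TypeSizeBits, uint64_t Alignment,
                                unsigned Granularity) {
  bool SupportedSize = TypeSizeBits == 8 || TypeSizeBits == 16 ||
                       TypeSizeBits == 32 || TypeSizeBits == 64 ||
                       TypeSizeBits == 128;
  return SupportedSize && (Alignment == 0 || Alignment >= Granularity ||
                           Alignment >= TypeSizeBits / 8);
}

int main() {
  assert(singleCheckSuffices(32, 4, 8));  // aligned 4-byte access: one check
  assert(!singleCheckSuffices(80, 8, 8)); // 10-byte access: unusual-size path
  return 0;
}
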
+static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
+ const DataLayout &DL, Type *IntptrTy,
+ Value *Mask, Instruction *I,
+ Value *Addr, MaybeAlign Alignment,
+ unsigned Granularity, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument,
+ bool UseCalls, uint32_t Exp) {
+ auto *VTy = cast<FixedVectorType>(
+ cast<PointerType>(Addr->getType())->getElementType());
+ uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
+ unsigned Num = VTy->getNumElements();
+ auto Zero = ConstantInt::get(IntptrTy, 0);
+ for (unsigned Idx = 0; Idx < Num; ++Idx) {
+ Value *InstrumentedAddress = nullptr;
+ Instruction *InsertBefore = I;
+ if (auto *Vector = dyn_cast<ConstantVector>(Mask)) {
+ // dyn_cast as we might get UndefValue
+ if (auto *Masked = dyn_cast<ConstantInt>(Vector->getOperand(Idx))) {
+ if (Masked->isZero())
+ // Mask is constant false, so no instrumentation needed.
+ continue;
+ // If we have a true or undef value, fall through to doInstrumentAddress
+ // with InsertBefore == I
+ }
+ } else {
+ IRBuilder<> IRB(I);
+ Value *MaskElem = IRB.CreateExtractElement(Mask, Idx);
+ Instruction *ThenTerm = SplitBlockAndInsertIfThen(MaskElem, I, false);
+ InsertBefore = ThenTerm;
+ }
+
+ IRBuilder<> IRB(InsertBefore);
+ InstrumentedAddress =
+ IRB.CreateGEP(VTy, Addr, {Zero, ConstantInt::get(IntptrTy, Idx)});
+ doInstrumentAddress(Pass, I, InsertBefore, InstrumentedAddress, Alignment,
+ Granularity, ElemTypeSize, IsWrite, SizeArgument,
+ UseCalls, Exp);
+ }
+}
+
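
instrumentMaskedLoadOrStore checks each vector lane on its own: lanes whose mask bit is constant false are skipped, and for non-constant masks the check is placed behind a branch on the extracted mask element. A scalar sketch of that per-lane logic, where checkLane is a hypothetical stand-in for the shadow check emitted via doInstrumentAddress:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for the per-element shadow check; here it only logs.
static void checkLane(const void *Addr, size_t Bytes) {
  printf("check %zu byte(s) at 0x%llx\n", Bytes,
         (unsigned long long)(uintptr_t)Addr);
}

template <size_t N>
static void checkMaskedAccess(const bool (&Mask)[N], const int (&Vec)[N]) {
  for (size_t i = 0; i < N; ++i)
    if (Mask[i])                       // constant-false lanes are never checked
      checkLane(&Vec[i], sizeof(int));
}

int main() {
  bool Mask[4] = {true, false, true, true};
  int Vec[4] = {1, 2, 3, 4};
  checkMaskedAccess(Mask, Vec); // lane 1 is skipped, lanes 0, 2, 3 are checked
  return 0;
}
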
+void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ InterestingMemoryOperand &O, bool UseCalls,
+ const DataLayout &DL) {
+ Value *Addr = O.getPtr();
+
+ // Optimization experiments.
+ // The experiments can be used to evaluate potential optimizations that remove
+ // instrumentation (assess false negatives). Instead of completely removing
+ // some instrumentation, you set Exp to a non-zero value (mask of optimization
+ // experiments that want to remove instrumentation of this instruction).
+ // If Exp is non-zero, this pass will emit special calls into runtime
+ // (e.g. __asan_report_exp_load1 instead of __asan_report_load1). These calls
+ // make the runtime terminate the program in a special way (with a different
+ // exit status). Then you run the new compiler on a buggy corpus, collect
+ // the special terminations (ideally, you don't see them at all -- no false
+ // negatives) and make the decision on the optimization.
+ uint32_t Exp = ClForceExperiment;
+
+ if (ClOpt && ClOptGlobals) {
+ // If initialization order checking is disabled, a simple access to a
+ // dynamically initialized global is always valid.
GlobalVariable *G = dyn_cast<GlobalVariable>(getUnderlyingObject(Addr));
- if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
- isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
- NumOptimizedAccessesToGlobalVar++;
- return;
- }
- }
-
- if (ClOpt && ClOptStack) {
- // A direct inbounds access to a stack variable is always valid.
+ if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
+ NumOptimizedAccessesToGlobalVar++;
+ return;
+ }
+ }
+
+ if (ClOpt && ClOptStack) {
+ // A direct inbounds access to a stack variable is always valid.
if (isa<AllocaInst>(getUnderlyingObject(Addr)) &&
- isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
- NumOptimizedAccessesToStackVar++;
- return;
- }
- }
-
- if (O.IsWrite)
- NumInstrumentedWrites++;
- else
- NumInstrumentedReads++;
-
- unsigned Granularity = 1 << Mapping.Scale;
- if (O.MaybeMask) {
- instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
- Addr, O.Alignment, Granularity, O.TypeSize,
- O.IsWrite, nullptr, UseCalls, Exp);
- } else {
- doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
- Granularity, O.TypeSize, O.IsWrite, nullptr, UseCalls,
- Exp);
- }
-}
-
-Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
- Value *Addr, bool IsWrite,
- size_t AccessSizeIndex,
- Value *SizeArgument,
- uint32_t Exp) {
- IRBuilder<> IRB(InsertBefore);
- Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
- CallInst *Call = nullptr;
- if (SizeArgument) {
- if (Exp == 0)
- Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
- {Addr, SizeArgument});
- else
- Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
- {Addr, SizeArgument, ExpVal});
- } else {
- if (Exp == 0)
- Call =
- IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
- else
- Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
- {Addr, ExpVal});
- }
-
- Call->setCannotMerge();
- return Call;
-}
-
-Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
- Value *ShadowValue,
- uint32_t TypeSize) {
- size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
- // Addr & (Granularity - 1)
- Value *LastAccessedByte =
- IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
- // (Addr & (Granularity - 1)) + size - 1
- if (TypeSize / 8 > 1)
- LastAccessedByte = IRB.CreateAdd(
- LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
- // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
- LastAccessedByte =
- IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
- // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
- return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
-}
-
-void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
- Instruction *InsertBefore, Value *Addr,
- uint32_t TypeSize, bool IsWrite,
- Value *SizeArgument, bool UseCalls,
- uint32_t Exp) {
- bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
-
- IRBuilder<> IRB(InsertBefore);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
-
- if (UseCalls) {
- if (Exp == 0)
- IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
- AddrLong);
- else
- IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
- {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
- return;
- }
-
- if (IsMyriad) {
- // Strip the cache bit and do range check.
- // AddrLong &= ~kMyriadCacheBitMask32
- AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
- // Tag = AddrLong >> kMyriadTagShift
- Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
- // Tag == kMyriadDDRTag
- Value *TagCheck =
- IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
-
- Instruction *TagCheckTerm =
- SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
- MDBuilder(*C).createBranchWeights(1, 100000));
- assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
- IRB.SetInsertPoint(TagCheckTerm);
- InsertBefore = TagCheckTerm;
- }
-
- Type *ShadowTy =
- IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
- Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
- Value *ShadowPtr = memToShadow(AddrLong, IRB);
- Value *CmpVal = Constant::getNullValue(ShadowTy);
- Value *ShadowValue =
- IRB.CreateLoad(ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
-
- Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
- size_t Granularity = 1ULL << Mapping.Scale;
- Instruction *CrashTerm = nullptr;
-
- if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
- // We use branch weights for the slow path check, to indicate that the slow
- // path is rarely taken. This seems to be the case for SPEC benchmarks.
- Instruction *CheckTerm = SplitBlockAndInsertIfThen(
- Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
- assert(cast<BranchInst>(CheckTerm)->isUnconditional());
- BasicBlock *NextBB = CheckTerm->getSuccessor(0);
- IRB.SetInsertPoint(CheckTerm);
- Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
- if (Recover) {
- CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
- } else {
- BasicBlock *CrashBlock =
- BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
- CrashTerm = new UnreachableInst(*C, CrashBlock);
- BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
- ReplaceInstWithInst(CheckTerm, NewTerm);
- }
- } else {
- CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
- }
-
- Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
- AccessSizeIndex, SizeArgument, Exp);
- Crash->setDebugLoc(OrigIns->getDebugLoc());
-}
-
-// Instrument unusual size or unusual alignment.
-// We cannot do it with a single check, so we do a 1-byte check for the first
-// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
-// to report the actual access size.
-void AddressSanitizer::instrumentUnusualSizeOrAlignment(
- Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
- bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
- IRBuilder<> IRB(InsertBefore);
- Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- if (UseCalls) {
- if (Exp == 0)
- IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
- {AddrLong, Size});
- else
- IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
- {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
- } else {
- Value *LastByte = IRB.CreateIntToPtr(
- IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
- Addr->getType());
- instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
- instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
- }
-}
-
-void ModuleAddressSanitizer::poisonOneInitializer(Function &GlobalInit,
- GlobalValue *ModuleName) {
- // Set up the arguments to our poison/unpoison functions.
- IRBuilder<> IRB(&GlobalInit.front(),
- GlobalInit.front().getFirstInsertionPt());
-
- // Add a call to poison all external globals before the given function starts.
- Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
- IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
-
- // Add calls to unpoison all globals before each return instruction.
- for (auto &BB : GlobalInit.getBasicBlockList())
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
- CallInst::Create(AsanUnpoisonGlobals, "", RI);
-}
-
-void ModuleAddressSanitizer::createInitializerPoisonCalls(
- Module &M, GlobalValue *ModuleName) {
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return;
-
- ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
- if (!CA)
- return;
-
- for (Use &OP : CA->operands()) {
- if (isa<ConstantAggregateZero>(OP)) continue;
- ConstantStruct *CS = cast<ConstantStruct>(OP);
-
- // Must have a function or null ptr.
- if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
- if (F->getName() == kAsanModuleCtorName) continue;
- auto *Priority = cast<ConstantInt>(CS->getOperand(0));
- // Don't instrument CTORs that will run before asan.module_ctor.
- if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple))
- continue;
- poisonOneInitializer(*F, ModuleName);
- }
- }
-}
-
-const GlobalVariable *
-ModuleAddressSanitizer::getExcludedAliasedGlobal(const GlobalAlias &GA) const {
- // In case this function should be expanded to include rules that do not just
- // apply when CompileKernel is true, either guard all existing rules with an
- // 'if (CompileKernel) { ... }' or be absolutely sure that all these rules
- // should also apply to user space.
- assert(CompileKernel && "Only expecting to be called when compiling kernel");
-
- const Constant *C = GA.getAliasee();
-
- // When compiling the kernel, globals that are aliased by symbols prefixed
- // by "__" are special and cannot be padded with a redzone.
- if (GA.getName().startswith("__"))
- return dyn_cast<GlobalVariable>(C->stripPointerCastsAndAliases());
-
- return nullptr;
-}
-
-bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
- Type *Ty = G->getValueType();
- LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
-
- // FIXME: Metadata should be attached directly to the global instead
- // of being added to llvm.asan.globals.
- if (GlobalsMD.get(G).IsExcluded) return false;
- if (!Ty->isSized()) return false;
- if (!G->hasInitializer()) return false;
- // Only instrument globals of default address spaces
- if (G->getAddressSpace()) return false;
- if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
- // Two problems with thread-locals:
- // - The address of the main thread's copy can't be computed at link-time.
- // - Need to poison all copies, not just the main thread's one.
- if (G->isThreadLocal()) return false;
- // For now, just ignore this Global if the alignment is large.
- if (G->getAlignment() > getMinRedzoneSizeForGlobal()) return false;
-
- // For non-COFF targets, only instrument globals known to be defined by this
- // TU.
- // FIXME: We can instrument comdat globals on ELF if we are using the
- // GC-friendly metadata scheme.
- if (!TargetTriple.isOSBinFormatCOFF()) {
- if (!G->hasExactDefinition() || G->hasComdat())
- return false;
- } else {
- // On COFF, don't instrument non-ODR linkages.
- if (G->isInterposable())
- return false;
- }
-
- // If a comdat is present, it must have a selection kind that implies ODR
- // semantics: no duplicates, any, or exact match.
- if (Comdat *C = G->getComdat()) {
- switch (C->getSelectionKind()) {
- case Comdat::Any:
- case Comdat::ExactMatch:
- case Comdat::NoDuplicates:
- break;
- case Comdat::Largest:
- case Comdat::SameSize:
- return false;
- }
- }
-
- if (G->hasSection()) {
- // The kernel uses explicit sections mostly for special global variables
- // that we should not instrument. E.g. the kernel may rely on their layout
- // without redzones, or remove them at link time ("discard.*"), etc.
- if (CompileKernel)
- return false;
-
- StringRef Section = G->getSection();
-
- // Globals from llvm.metadata aren't emitted, do not instrument them.
- if (Section == "llvm.metadata") return false;
- // Do not instrument globals from special LLVM sections.
- if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
-
- // Do not instrument function pointers to initialization and termination
- // routines: dynamic linker will not properly handle redzones.
- if (Section.startswith(".preinit_array") ||
- Section.startswith(".init_array") ||
- Section.startswith(".fini_array")) {
- return false;
- }
-
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
+ NumOptimizedAccessesToStackVar++;
+ return;
+ }
+ }
+
+ if (O.IsWrite)
+ NumInstrumentedWrites++;
+ else
+ NumInstrumentedReads++;
+
+ unsigned Granularity = 1 << Mapping.Scale;
+ if (O.MaybeMask) {
+ instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
+ Addr, O.Alignment, Granularity, O.TypeSize,
+ O.IsWrite, nullptr, UseCalls, Exp);
+ } else {
+ doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
+ Granularity, O.TypeSize, O.IsWrite, nullptr, UseCalls,
+ Exp);
+ }
+}
+
+Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
+ Value *Addr, bool IsWrite,
+ size_t AccessSizeIndex,
+ Value *SizeArgument,
+ uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *ExpVal = Exp == 0 ? nullptr : ConstantInt::get(IRB.getInt32Ty(), Exp);
+ CallInst *Call = nullptr;
+ if (SizeArgument) {
+ if (Exp == 0)
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][0],
+ {Addr, SizeArgument});
+ else
+ Call = IRB.CreateCall(AsanErrorCallbackSized[IsWrite][1],
+ {Addr, SizeArgument, ExpVal});
+ } else {
+ if (Exp == 0)
+ Call =
+ IRB.CreateCall(AsanErrorCallback[IsWrite][0][AccessSizeIndex], Addr);
+ else
+ Call = IRB.CreateCall(AsanErrorCallback[IsWrite][1][AccessSizeIndex],
+ {Addr, ExpVal});
+ }
+
+ Call->setCannotMerge();
+ return Call;
+}
+
+Value *AddressSanitizer::createSlowPathCmp(IRBuilder<> &IRB, Value *AddrLong,
+ Value *ShadowValue,
+ uint32_t TypeSize) {
+ size_t Granularity = static_cast<size_t>(1) << Mapping.Scale;
+ // Addr & (Granularity - 1)
+ Value *LastAccessedByte =
+ IRB.CreateAnd(AddrLong, ConstantInt::get(IntptrTy, Granularity - 1));
+ // (Addr & (Granularity - 1)) + size - 1
+ if (TypeSize / 8 > 1)
+ LastAccessedByte = IRB.CreateAdd(
+ LastAccessedByte, ConstantInt::get(IntptrTy, TypeSize / 8 - 1));
+ // (uint8_t) ((Addr & (Granularity-1)) + size - 1)
+ LastAccessedByte =
+ IRB.CreateIntCast(LastAccessedByte, ShadowValue->getType(), false);
+ // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue
+ return IRB.CreateICmpSGE(LastAccessedByte, ShadowValue);
+}
+
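
In the slow path, a shadow byte k in 1..Granularity-1 means only the first k bytes of that granule are addressable, so the access is reported when the in-granule offset of its last byte is greater than or equal to the shadow value. A scalar model of the comparison built above, assuming the default 8-byte shadow granularity:

// Scalar model of the IR comparison above, assuming Granularity == 8.
#include <cassert>
#include <cstdint>

static bool slowPathFaults(uint64_t Addr, uint32_t TypeSizeBits,
                           int8_t ShadowValue) {
  uint64_t LastAccessedByte = (Addr & 7) + TypeSizeBits / 8 - 1;
  return (int8_t)LastAccessedByte >= ShadowValue; // true -> report the access
}

int main() {
  // Shadow value 4: only the first 4 bytes of the granule are addressable.
  assert(!slowPathFaults(0x1000, 16, 4)); // 2-byte access at offset 0: fine
  assert(slowPathFaults(0x1003, 16, 4));  // 2-byte access reaching offset 4: bad
  return 0;
}
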
+void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
+ Instruction *InsertBefore, Value *Addr,
+ uint32_t TypeSize, bool IsWrite,
+ Value *SizeArgument, bool UseCalls,
+ uint32_t Exp) {
+ bool IsMyriad = TargetTriple.getVendor() == llvm::Triple::Myriad;
+
+ IRBuilder<> IRB(InsertBefore);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][0][AccessSizeIndex],
+ AddrLong);
+ else
+ IRB.CreateCall(AsanMemoryAccessCallback[IsWrite][1][AccessSizeIndex],
+ {AddrLong, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ return;
+ }
+
+ if (IsMyriad) {
+ // Strip the cache bit and do range check.
+ // AddrLong &= ~kMyriadCacheBitMask32
+ AddrLong = IRB.CreateAnd(AddrLong, ~kMyriadCacheBitMask32);
+ // Tag = AddrLong >> kMyriadTagShift
+ Value *Tag = IRB.CreateLShr(AddrLong, kMyriadTagShift);
+ // Tag == kMyriadDDRTag
+ Value *TagCheck =
+ IRB.CreateICmpEQ(Tag, ConstantInt::get(IntptrTy, kMyriadDDRTag));
+
+ Instruction *TagCheckTerm =
+ SplitBlockAndInsertIfThen(TagCheck, InsertBefore, false,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(TagCheckTerm)->isUnconditional());
+ IRB.SetInsertPoint(TagCheckTerm);
+ InsertBefore = TagCheckTerm;
+ }
+
+ Type *ShadowTy =
+ IntegerType::get(*C, std::max(8U, TypeSize >> Mapping.Scale));
+ Type *ShadowPtrTy = PointerType::get(ShadowTy, 0);
+ Value *ShadowPtr = memToShadow(AddrLong, IRB);
+ Value *CmpVal = Constant::getNullValue(ShadowTy);
+ Value *ShadowValue =
+ IRB.CreateLoad(ShadowTy, IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy));
+
+ Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal);
+ size_t Granularity = 1ULL << Mapping.Scale;
+ Instruction *CrashTerm = nullptr;
+
+ if (ClAlwaysSlowPath || (TypeSize < 8 * Granularity)) {
+ // We use branch weights for the slow path check, to indicate that the slow
+ // path is rarely taken. This seems to be the case for SPEC benchmarks.
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, InsertBefore, false, MDBuilder(*C).createBranchWeights(1, 100000));
+ assert(cast<BranchInst>(CheckTerm)->isUnconditional());
+ BasicBlock *NextBB = CheckTerm->getSuccessor(0);
+ IRB.SetInsertPoint(CheckTerm);
+ Value *Cmp2 = createSlowPathCmp(IRB, AddrLong, ShadowValue, TypeSize);
+ if (Recover) {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp2, CheckTerm, false);
+ } else {
+ BasicBlock *CrashBlock =
+ BasicBlock::Create(*C, "", NextBB->getParent(), NextBB);
+ CrashTerm = new UnreachableInst(*C, CrashBlock);
+ BranchInst *NewTerm = BranchInst::Create(CrashBlock, NextBB, Cmp2);
+ ReplaceInstWithInst(CheckTerm, NewTerm);
+ }
+ } else {
+ CrashTerm = SplitBlockAndInsertIfThen(Cmp, InsertBefore, !Recover);
+ }
+
+ Instruction *Crash = generateCrashCode(CrashTerm, AddrLong, IsWrite,
+ AccessSizeIndex, SizeArgument, Exp);
+ Crash->setDebugLoc(OrigIns->getDebugLoc());
+}
+
+// Instrument unusual size or unusual alignment.
+// We cannot do it with a single check, so we do a 1-byte check for the first
+// and the last bytes. We call __asan_report_*_n(addr, real_size) to be able
+// to report the actual access size.
+void AddressSanitizer::instrumentUnusualSizeOrAlignment(
+ Instruction *I, Instruction *InsertBefore, Value *Addr, uint32_t TypeSize,
+ bool IsWrite, Value *SizeArgument, bool UseCalls, uint32_t Exp) {
+ IRBuilder<> IRB(InsertBefore);
+ Value *Size = ConstantInt::get(IntptrTy, TypeSize / 8);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ if (UseCalls) {
+ if (Exp == 0)
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][0],
+ {AddrLong, Size});
+ else
+ IRB.CreateCall(AsanMemoryAccessCallbackSized[IsWrite][1],
+ {AddrLong, Size, ConstantInt::get(IRB.getInt32Ty(), Exp)});
+ } else {
+ Value *LastByte = IRB.CreateIntToPtr(
+ IRB.CreateAdd(AddrLong, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)),
+ Addr->getType());
+ instrumentAddress(I, InsertBefore, Addr, 8, IsWrite, Size, false, Exp);
+ instrumentAddress(I, InsertBefore, LastByte, 8, IsWrite, Size, false, Exp);
+ }
+}
+
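
For sizes the fast path cannot express, only the first and the last byte of the access are checked, and the real size is passed along so the __asan_report_*_n callbacks can report it. A minimal sketch of that strategy, with checkByte as a hypothetical stand-in for the 1-byte instrumentAddress call:

#include <cstdint>
#include <cstdio>

// Hypothetical 1-byte shadow check, standing in for instrumentAddress(..., 8, ...).
static void checkByte(uint64_t Addr) {
  printf("check byte at 0x%llx\n", (unsigned long long)Addr);
}

// Two 1-byte checks bracket an access of unusual size, mirroring the code above.
static void checkUnusualAccess(uint64_t Addr, uint64_t SizeInBytes) {
  checkByte(Addr);                   // first byte
  checkByte(Addr + SizeInBytes - 1); // last byte
}

int main() {
  checkUnusualAccess(0x1000, 10); // e.g. an 80-bit (10-byte) access
  return 0;
}
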
+void ModuleAddressSanitizer::poisonOneInitializer(Function &GlobalInit,
+ GlobalValue *ModuleName) {
+ // Set up the arguments to our poison/unpoison functions.
+ IRBuilder<> IRB(&GlobalInit.front(),
+ GlobalInit.front().getFirstInsertionPt());
+
+ // Add a call to poison all external globals before the given function starts.
+ Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy);
+ IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr);
+
+ // Add calls to unpoison all globals before each return instruction.
+ for (auto &BB : GlobalInit.getBasicBlockList())
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ CallInst::Create(AsanUnpoisonGlobals, "", RI);
+}
+
+void ModuleAddressSanitizer::createInitializerPoisonCalls(
+ Module &M, GlobalValue *ModuleName) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return;
+
+ ConstantArray *CA = dyn_cast<ConstantArray>(GV->getInitializer());
+ if (!CA)
+ return;
+
+ for (Use &OP : CA->operands()) {
+ if (isa<ConstantAggregateZero>(OP)) continue;
+ ConstantStruct *CS = cast<ConstantStruct>(OP);
+
+ // Must have a function or null ptr.
+ if (Function *F = dyn_cast<Function>(CS->getOperand(1))) {
+ if (F->getName() == kAsanModuleCtorName) continue;
+ auto *Priority = cast<ConstantInt>(CS->getOperand(0));
+ // Don't instrument CTORs that will run before asan.module_ctor.
+ if (Priority->getLimitedValue() <= GetCtorAndDtorPriority(TargetTriple))
+ continue;
+ poisonOneInitializer(*F, ModuleName);
+ }
+ }
+}
+
+const GlobalVariable *
+ModuleAddressSanitizer::getExcludedAliasedGlobal(const GlobalAlias &GA) const {
+ // In case this function should be expanded to include rules that do not just
+ // apply when CompileKernel is true, either guard all existing rules with an
+ // 'if (CompileKernel) { ... }' or be absolutely sure that all these rules
+ // should also apply to user space.
+ assert(CompileKernel && "Only expecting to be called when compiling kernel");
+
+ const Constant *C = GA.getAliasee();
+
+ // When compiling the kernel, globals that are aliased by symbols prefixed
+ // by "__" are special and cannot be padded with a redzone.
+ if (GA.getName().startswith("__"))
+ return dyn_cast<GlobalVariable>(C->stripPointerCastsAndAliases());
+
+ return nullptr;
+}
+
+bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
+ Type *Ty = G->getValueType();
+ LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
+
+ // FIXME: Metadata should be attached directly to the global instead
+ // of being added to llvm.asan.globals.
+ if (GlobalsMD.get(G).IsExcluded) return false;
+ if (!Ty->isSized()) return false;
+ if (!G->hasInitializer()) return false;
+ // Only instrument globals of default address spaces
+ if (G->getAddressSpace()) return false;
+ if (GlobalWasGeneratedByCompiler(G)) return false; // Our own globals.
+ // Two problems with thread-locals:
+ // - The address of the main thread's copy can't be computed at link-time.
+ // - Need to poison all copies, not just the main thread's one.
+ if (G->isThreadLocal()) return false;
+ // For now, just ignore this Global if the alignment is large.
+ if (G->getAlignment() > getMinRedzoneSizeForGlobal()) return false;
+
+ // For non-COFF targets, only instrument globals known to be defined by this
+ // TU.
+ // FIXME: We can instrument comdat globals on ELF if we are using the
+ // GC-friendly metadata scheme.
+ if (!TargetTriple.isOSBinFormatCOFF()) {
+ if (!G->hasExactDefinition() || G->hasComdat())
+ return false;
+ } else {
+ // On COFF, don't instrument non-ODR linkages.
+ if (G->isInterposable())
+ return false;
+ }
+
+ // If a comdat is present, it must have a selection kind that implies ODR
+ // semantics: no duplicates, any, or exact match.
+ if (Comdat *C = G->getComdat()) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ case Comdat::ExactMatch:
+ case Comdat::NoDuplicates:
+ break;
+ case Comdat::Largest:
+ case Comdat::SameSize:
+ return false;
+ }
+ }
+
+ if (G->hasSection()) {
+ // The kernel uses explicit sections mostly for special global variables
+ // that we should not instrument. E.g. the kernel may rely on their layout
+ // without redzones, or remove them at link time ("discard.*"), etc.
+ if (CompileKernel)
+ return false;
+
+ StringRef Section = G->getSection();
+
+ // Globals from llvm.metadata aren't emitted, do not instrument them.
+ if (Section == "llvm.metadata") return false;
+ // Do not instrument globals from special LLVM sections.
+ if (Section.find("__llvm") != StringRef::npos || Section.find("__LLVM") != StringRef::npos) return false;
+
+ // Do not instrument function pointers to initialization and termination
+ // routines: dynamic linker will not properly handle redzones.
+ if (Section.startswith(".preinit_array") ||
+ Section.startswith(".init_array") ||
+ Section.startswith(".fini_array")) {
+ return false;
+ }
+
// Do not instrument user-defined sections (with names resembling
// valid C identifiers)
if (TargetTriple.isOSBinFormatELF()) {
@@ -1881,258 +1881,258 @@ bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
return false;
}
- // On COFF, if the section name contains '$', it is highly likely that the
- // user is using section sorting to create an array of globals similar to
- // the way initialization callbacks are registered in .init_array and
- // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
- // to such globals is counterproductive, because the intent is that they
- // will form an array, and out-of-bounds accesses are expected.
- // See https://github.com/google/sanitizers/issues/305
- // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
- if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
- LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
- << *G << "\n");
- return false;
- }
-
- if (TargetTriple.isOSBinFormatMachO()) {
- StringRef ParsedSegment, ParsedSection;
- unsigned TAA = 0, StubSize = 0;
- bool TAAParsed;
- std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
- Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
- assert(ErrorCode.empty() && "Invalid section specifier.");
-
- // Ignore the globals from the __OBJC section. The ObjC runtime assumes
- // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
- // them.
- if (ParsedSegment == "__OBJC" ||
- (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
- LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
- return false;
- }
- // See https://github.com/google/sanitizers/issues/32
- // Constant CFString instances are compiled in the following way:
- // -- the string buffer is emitted into
- // __TEXT,__cstring,cstring_literals
- // -- the constant NSConstantString structure referencing that buffer
- // is placed into __DATA,__cfstring
- // Therefore there's no point in placing redzones into __DATA,__cfstring.
- // Moreover, it causes the linker to crash on OS X 10.7
- if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
- LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
- return false;
- }
- // The linker merges the contents of cstring_literals and removes the
- // trailing zeroes.
- if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
- LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
- return false;
- }
- }
- }
-
- if (CompileKernel) {
- // Globals that are prefixed by "__" are special and cannot be padded with a
- // redzone.
- if (G->getName().startswith("__"))
- return false;
- }
-
- return true;
-}
-
-// On Mach-O platforms, we emit global metadata in a separate section of the
-// binary in order to allow the linker to properly dead strip. This is only
-// supported on recent versions of ld64.
-bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const {
- if (!TargetTriple.isOSBinFormatMachO())
- return false;
-
- if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
- return true;
- if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
- return true;
- if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
- return true;
-
- return false;
-}
-
-StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const {
- switch (TargetTriple.getObjectFormat()) {
- case Triple::COFF: return ".ASAN$GL";
- case Triple::ELF: return "asan_globals";
- case Triple::MachO: return "__DATA,__asan_globals,regular";
- case Triple::Wasm:
+ // On COFF, if the section name contains '$', it is highly likely that the
+ // user is using section sorting to create an array of globals similar to
+ // the way initialization callbacks are registered in .init_array and
+ // .CRT$XCU. The ATL also registers things in .ATL$__[azm]. Adding redzones
+ // to such globals is counterproductive, because the intent is that they
+ // will form an array, and out-of-bounds accesses are expected.
+ // See https://github.com/google/sanitizers/issues/305
+ // and http://msdn.microsoft.com/en-US/en-en/library/bb918180(v=vs.120).aspx
+ if (TargetTriple.isOSBinFormatCOFF() && Section.contains('$')) {
+ LLVM_DEBUG(dbgs() << "Ignoring global in sorted section (contains '$'): "
+ << *G << "\n");
+ return false;
+ }
+
+ if (TargetTriple.isOSBinFormatMachO()) {
+ StringRef ParsedSegment, ParsedSection;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+ std::string ErrorCode = MCSectionMachO::ParseSectionSpecifier(
+ Section, ParsedSegment, ParsedSection, TAA, TAAParsed, StubSize);
+ assert(ErrorCode.empty() && "Invalid section specifier.");
+
+ // Ignore the globals from the __OBJC section. The ObjC runtime assumes
+ // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to
+ // them.
+ if (ParsedSegment == "__OBJC" ||
+ (ParsedSegment == "__DATA" && ParsedSection.startswith("__objc_"))) {
+ LLVM_DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G << "\n");
+ return false;
+ }
+ // See https://github.com/google/sanitizers/issues/32
+ // Constant CFString instances are compiled in the following way:
+ // -- the string buffer is emitted into
+ // __TEXT,__cstring,cstring_literals
+ // -- the constant NSConstantString structure referencing that buffer
+ // is placed into __DATA,__cfstring
+ // Therefore there's no point in placing redzones into __DATA,__cfstring.
+ // Moreover, it causes the linker to crash on OS X 10.7
+ if (ParsedSegment == "__DATA" && ParsedSection == "__cfstring") {
+ LLVM_DEBUG(dbgs() << "Ignoring CFString: " << *G << "\n");
+ return false;
+ }
+ // The linker merges the contents of cstring_literals and removes the
+ // trailing zeroes.
+ if (ParsedSegment == "__TEXT" && (TAA & MachO::S_CSTRING_LITERALS)) {
+ LLVM_DEBUG(dbgs() << "Ignoring a cstring literal: " << *G << "\n");
+ return false;
+ }
+ }
+ }
+
+ if (CompileKernel) {
+ // Globals that are prefixed by "__" are special and cannot be padded with a
+ // redzone.
+ if (G->getName().startswith("__"))
+ return false;
+ }
+
+ return true;
+}
+
+// On Mach-O platforms, we emit global metadata in a separate section of the
+// binary in order to allow the linker to properly dead strip. This is only
+// supported on recent versions of ld64.
+bool ModuleAddressSanitizer::ShouldUseMachOGlobalsSection() const {
+ if (!TargetTriple.isOSBinFormatMachO())
+ return false;
+
+ if (TargetTriple.isMacOSX() && !TargetTriple.isMacOSXVersionLT(10, 11))
+ return true;
+ if (TargetTriple.isiOS() /* or tvOS */ && !TargetTriple.isOSVersionLT(9))
+ return true;
+ if (TargetTriple.isWatchOS() && !TargetTriple.isOSVersionLT(2))
+ return true;
+
+ return false;
+}
+
+StringRef ModuleAddressSanitizer::getGlobalMetadataSection() const {
+ switch (TargetTriple.getObjectFormat()) {
+ case Triple::COFF: return ".ASAN$GL";
+ case Triple::ELF: return "asan_globals";
+ case Triple::MachO: return "__DATA,__asan_globals,regular";
+ case Triple::Wasm:
case Triple::GOFF:
- case Triple::XCOFF:
- report_fatal_error(
+ case Triple::XCOFF:
+ report_fatal_error(
"ModuleAddressSanitizer not implemented for object file format");
- case Triple::UnknownObjectFormat:
- break;
- }
- llvm_unreachable("unsupported object format");
-}
-
-void ModuleAddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
-
- // Declare our poisoning and unpoisoning functions.
- AsanPoisonGlobals =
- M.getOrInsertFunction(kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy);
- AsanUnpoisonGlobals =
- M.getOrInsertFunction(kAsanUnpoisonGlobalsName, IRB.getVoidTy());
-
- // Declare functions that register/unregister globals.
- AsanRegisterGlobals = M.getOrInsertFunction(
- kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanUnregisterGlobals = M.getOrInsertFunction(
- kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
-
- // Declare the functions that find globals in a shared object and then invoke
- // the (un)register function on them.
- AsanRegisterImageGlobals = M.getOrInsertFunction(
- kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
- AsanUnregisterImageGlobals = M.getOrInsertFunction(
- kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
-
- AsanRegisterElfGlobals =
- M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy);
- AsanUnregisterElfGlobals =
- M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
- IntptrTy, IntptrTy, IntptrTy);
-}
-
-// Put the metadata and the instrumented global in the same group. This ensures
-// that the metadata is discarded if the instrumented global is discarded.
-void ModuleAddressSanitizer::SetComdatForGlobalMetadata(
- GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
- Module &M = *G->getParent();
- Comdat *C = G->getComdat();
- if (!C) {
- if (!G->hasName()) {
- // If G is unnamed, it must be internal. Give it an artificial name
- // so we can put it in a comdat.
- assert(G->hasLocalLinkage());
- G->setName(Twine(kAsanGenPrefix) + "_anon_global");
- }
-
- if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
- std::string Name = std::string(G->getName());
- Name += InternalSuffix;
- C = M.getOrInsertComdat(Name);
- } else {
- C = M.getOrInsertComdat(G->getName());
- }
-
- // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF. Also upgrade private
- // linkage to internal linkage so that a symbol table entry is emitted. This
- // is necessary in order to create the comdat group.
- if (TargetTriple.isOSBinFormatCOFF()) {
- C->setSelectionKind(Comdat::NoDuplicates);
- if (G->hasPrivateLinkage())
- G->setLinkage(GlobalValue::InternalLinkage);
- }
- G->setComdat(C);
- }
-
- assert(G->hasComdat());
- Metadata->setComdat(G->getComdat());
-}
-
-// Create a separate metadata global and put it in the appropriate ASan
-// global registration section.
-GlobalVariable *
-ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer,
- StringRef OriginalName) {
- auto Linkage = TargetTriple.isOSBinFormatMachO()
- ? GlobalVariable::InternalLinkage
- : GlobalVariable::PrivateLinkage;
- GlobalVariable *Metadata = new GlobalVariable(
- M, Initializer->getType(), false, Linkage, Initializer,
- Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
- Metadata->setSection(getGlobalMetadataSection());
- return Metadata;
-}
-
-Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
- AsanDtorFunction =
- Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
- GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
- BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
-
- return ReturnInst::Create(*C, AsanDtorBB);
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
- auto &DL = M.getDataLayout();
-
- SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- Constant *Initializer = MetadataInitializers[i];
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, Initializer, G->getName());
- MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
- Metadata->setMetadata(LLVMContext::MD_associated, MD);
- MetadataGlobals[i] = Metadata;
-
- // The MSVC linker always inserts padding when linking incrementally. We
- // cope with that by aligning each struct to its size, which must be a power
- // of two.
- unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
- assert(isPowerOf2_32(SizeOfGlobalStruct) &&
- "global metadata will not be padded appropriately");
- Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct));
-
- SetComdatForGlobalMetadata(G, Metadata, "");
- }
-
- // Update llvm.compiler.used, adding the new metadata globals. This is
- // needed so that during LTO these variables stay alive.
- if (!MetadataGlobals.empty())
- appendToCompilerUsed(M, MetadataGlobals);
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsELF(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers,
- const std::string &UniqueModuleId) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
-
- SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
- MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
- Metadata->setMetadata(LLVMContext::MD_associated, MD);
- MetadataGlobals[i] = Metadata;
-
- SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
- }
-
- // Update llvm.compiler.used, adding the new metadata globals. This is
- // needed so that during LTO these variables stay alive.
+ case Triple::UnknownObjectFormat:
+ break;
+ }
+ llvm_unreachable("unsupported object format");
+}
+
+void ModuleAddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // Declare our poisoning and unpoisoning functions.
+ AsanPoisonGlobals =
+ M.getOrInsertFunction(kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy);
+ AsanUnpoisonGlobals =
+ M.getOrInsertFunction(kAsanUnpoisonGlobalsName, IRB.getVoidTy());
+
+ // Declare functions that register/unregister globals.
+ AsanRegisterGlobals = M.getOrInsertFunction(
+ kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanUnregisterGlobals = M.getOrInsertFunction(
+ kAsanUnregisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+
+ // Declare the functions that find globals in a shared object and then invoke
+ // the (un)register function on them.
+ AsanRegisterImageGlobals = M.getOrInsertFunction(
+ kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
+ AsanUnregisterImageGlobals = M.getOrInsertFunction(
+ kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy);
+
+ AsanRegisterElfGlobals =
+ M.getOrInsertFunction(kAsanRegisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy);
+ AsanUnregisterElfGlobals =
+ M.getOrInsertFunction(kAsanUnregisterElfGlobalsName, IRB.getVoidTy(),
+ IntptrTy, IntptrTy, IntptrTy);
+}
+
+// Put the metadata and the instrumented global in the same group. This ensures
+// that the metadata is discarded if the instrumented global is discarded.
+void ModuleAddressSanitizer::SetComdatForGlobalMetadata(
+ GlobalVariable *G, GlobalVariable *Metadata, StringRef InternalSuffix) {
+ Module &M = *G->getParent();
+ Comdat *C = G->getComdat();
+ if (!C) {
+ if (!G->hasName()) {
+ // If G is unnamed, it must be internal. Give it an artificial name
+ // so we can put it in a comdat.
+ assert(G->hasLocalLinkage());
+ G->setName(Twine(kAsanGenPrefix) + "_anon_global");
+ }
+
+ if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
+ std::string Name = std::string(G->getName());
+ Name += InternalSuffix;
+ C = M.getOrInsertComdat(Name);
+ } else {
+ C = M.getOrInsertComdat(G->getName());
+ }
+
+ // Make this IMAGE_COMDAT_SELECT_NODUPLICATES on COFF. Also upgrade private
+ // linkage to internal linkage so that a symbol table entry is emitted. This
+ // is necessary in order to create the comdat group.
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ C->setSelectionKind(Comdat::NoDuplicates);
+ if (G->hasPrivateLinkage())
+ G->setLinkage(GlobalValue::InternalLinkage);
+ }
+ G->setComdat(C);
+ }
+
+ assert(G->hasComdat());
+ Metadata->setComdat(G->getComdat());
+}
+
+// Create a separate metadata global and put it in the appropriate ASan
+// global registration section.
+GlobalVariable *
+ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer,
+ StringRef OriginalName) {
+ auto Linkage = TargetTriple.isOSBinFormatMachO()
+ ? GlobalVariable::InternalLinkage
+ : GlobalVariable::PrivateLinkage;
+ GlobalVariable *Metadata = new GlobalVariable(
+ M, Initializer->getType(), false, Linkage, Initializer,
+ Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName));
+ Metadata->setSection(getGlobalMetadataSection());
+ return Metadata;
+}
+
+Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
+ AsanDtorFunction =
+ Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
+ GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
+ BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
+
+ return ReturnInst::Create(*C, AsanDtorBB);
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ auto &DL = M.getDataLayout();
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ // The MSVC linker always inserts padding when linking incrementally. We
+ // cope with that by aligning each struct to its size, which must be a power
+ // of two.
+ unsigned SizeOfGlobalStruct = DL.getTypeAllocSize(Initializer->getType());
+ assert(isPowerOf2_32(SizeOfGlobalStruct) &&
+ "global metadata will not be padded appropriately");
+ Metadata->setAlignment(assumeAligned(SizeOfGlobalStruct));
+
+ SetComdatForGlobalMetadata(G, Metadata, "");
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
if (!MetadataGlobals.empty())
appendToCompilerUsed(M, MetadataGlobals);
-
- // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
- // to look up the loaded image that contains it. Second, we can store in it
- // whether registration has already occurred, to prevent duplicate
- // registration.
- //
- // Common linkage ensures that there is only one global per shared library.
- GlobalVariable *RegisteredFlag = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::CommonLinkage,
- ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
- RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
-
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsELF(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers,
+ const std::string &UniqueModuleId) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, MetadataInitializers[i], G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
+
+ SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
+ }
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
+ if (!MetadataGlobals.empty())
+ appendToCompilerUsed(M, MetadataGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // Common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
// Create start and stop symbols.
GlobalVariable *StartELFMetadata = new GlobalVariable(
M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
@@ -2142,1326 +2142,1326 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
"__stop_" + getGlobalMetadataSection());
StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
-
- // Create a call to register the globals with the runtime.
- IRB.CreateCall(AsanRegisterElfGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
- IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
- IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
- IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
- IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsMachO(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
-
- // On recent Mach-O platforms, use a structure which binds the liveness of
- // the global variable to the metadata struct. Keep the list of "Liveness" GV
- // created to be added to llvm.compiler.used
- StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
- SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
-
- for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
- Constant *Initializer = MetadataInitializers[i];
- GlobalVariable *G = ExtendedGlobals[i];
- GlobalVariable *Metadata =
- CreateMetadataGlobal(M, Initializer, G->getName());
-
- // On recent Mach-O platforms, we emit the global metadata in a way that
- // allows the linker to properly strip dead globals.
- auto LivenessBinder =
- ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
- ConstantExpr::getPointerCast(Metadata, IntptrTy));
- GlobalVariable *Liveness = new GlobalVariable(
- M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
- Twine("__asan_binder_") + G->getName());
- Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
- LivenessGlobals[i] = Liveness;
- }
-
- // Update llvm.compiler.used, adding the new liveness globals. This is
- // needed so that during LTO these variables stay alive. The alternative
-  // would be to have the linker handle the LTO symbols, but libLTO's
-  // current API does not expose access to the section for each symbol.
- if (!LivenessGlobals.empty())
- appendToCompilerUsed(M, LivenessGlobals);
-
- // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
- // to look up the loaded image that contains it. Second, we can store in it
- // whether registration has already occurred, to prevent duplicate
- // registration.
- //
- // common linkage ensures that there is only one global per shared library.
- GlobalVariable *RegisteredFlag = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::CommonLinkage,
- ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
- RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
-
- IRB.CreateCall(AsanRegisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
- {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
-}
-
-void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
- IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
- ArrayRef<Constant *> MetadataInitializers) {
- assert(ExtendedGlobals.size() == MetadataInitializers.size());
- unsigned N = ExtendedGlobals.size();
- assert(N > 0);
-
- // On platforms that don't have a custom metadata section, we emit an array
- // of global metadata structures.
- ArrayType *ArrayOfGlobalStructTy =
- ArrayType::get(MetadataInitializers[0]->getType(), N);
- auto AllGlobals = new GlobalVariable(
- M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
- if (Mapping.Scale > 3)
- AllGlobals->setAlignment(Align(1ULL << Mapping.Scale));
-
- IRB.CreateCall(AsanRegisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, N)});
-
- // We also need to unregister globals at the end, e.g., when a shared library
- // gets closed.
- IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
- IRB_Dtor.CreateCall(AsanUnregisterGlobals,
- {IRB.CreatePointerCast(AllGlobals, IntptrTy),
- ConstantInt::get(IntptrTy, N)});
-}
-
-// This function replaces all global variables with new variables that have
-// trailing redzones. It also creates a function that poisons
-// redzones and inserts this function into llvm.global_ctors.
-// Sets *CtorComdat to true if the global registration code emitted into the
-// asan constructor is comdat-compatible.
-bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
- bool *CtorComdat) {
- *CtorComdat = false;
-
- // Build set of globals that are aliased by some GA, where
- // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
- SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
- if (CompileKernel) {
- for (auto &GA : M.aliases()) {
- if (const GlobalVariable *GV = getExcludedAliasedGlobal(GA))
- AliasedGlobalExclusions.insert(GV);
- }
- }
-
- SmallVector<GlobalVariable *, 16> GlobalsToChange;
- for (auto &G : M.globals()) {
- if (!AliasedGlobalExclusions.count(&G) && shouldInstrumentGlobal(&G))
- GlobalsToChange.push_back(&G);
- }
-
- size_t n = GlobalsToChange.size();
- if (n == 0) {
- *CtorComdat = true;
- return false;
- }
-
- auto &DL = M.getDataLayout();
-
- // A global is described by a structure
- // size_t beg;
- // size_t size;
- // size_t size_with_redzone;
- // const char *name;
- // const char *module_name;
- // size_t has_dynamic_init;
- // void *source_location;
- // size_t odr_indicator;
- // We initialize an array of such structures and pass it to a run-time call.
- StructType *GlobalStructTy =
- StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
- IntptrTy, IntptrTy, IntptrTy);
- SmallVector<GlobalVariable *, 16> NewGlobals(n);
- SmallVector<Constant *, 16> Initializers(n);
-
- bool HasDynamicallyInitializedGlobals = false;
-
-  // We shouldn't merge same module names, as this string serves as the
-  // unique module ID at runtime.
- GlobalVariable *ModuleName = createPrivateGlobalForString(
- M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
-
- for (size_t i = 0; i < n; i++) {
- GlobalVariable *G = GlobalsToChange[i];
-
-    // FIXME: Metadata should be attached directly to the global instead of
-    // being added to llvm.asan.globals.
- auto MD = GlobalsMD.get(G);
- StringRef NameForGlobal = G->getName();
- // Create string holding the global name (use global name from metadata
- // if it's available, otherwise just write the name of global variable).
- GlobalVariable *Name = createPrivateGlobalForString(
- M, MD.Name.empty() ? NameForGlobal : MD.Name,
- /*AllowMerging*/ true, kAsanGenPrefix);
-
- Type *Ty = G->getValueType();
- const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
- const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
- Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
-
- StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
- Constant *NewInitializer = ConstantStruct::get(
- NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
-
- // Create a new global variable with enough space for a redzone.
- GlobalValue::LinkageTypes Linkage = G->getLinkage();
- if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
- Linkage = GlobalValue::InternalLinkage;
- GlobalVariable *NewGlobal =
- new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
- "", G, G->getThreadLocalMode());
- NewGlobal->copyAttributesFrom(G);
- NewGlobal->setComdat(G->getComdat());
- NewGlobal->setAlignment(MaybeAlign(getMinRedzoneSizeForGlobal()));
-    // Don't fold globals with redzones. The ODR violation detector and redzone
-    // poisoning implicitly create a dependence on the global's address, so it
-    // is no longer valid for it to be marked unnamed_addr.
- NewGlobal->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
-
- // Move null-terminated C strings to "__asan_cstring" section on Darwin.
- if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
- G->isConstant()) {
- auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
- if (Seq && Seq->isCString())
- NewGlobal->setSection("__TEXT,__asan_cstring,regular");
- }
-
+
+ // Create a call to register the globals with the runtime.
+ IRB.CreateCall(AsanRegisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
+ IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
+ IRB.CreatePointerCast(StopELFMetadata, IntptrTy)});
+}
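
The registration above relies on a standard ELF linker feature: for a section whose name is a valid C identifier, the linker synthesizes __start_<section> and __stop_<section> symbols bounding it, and the AsanRegisterElfGlobals call receives exactly such a [start, stop) pair together with RegisteredFlag. Below is a minimal standalone sketch of that mechanism, not LLVM code; the section name "asan_demo" and the entry layout are purely illustrative, and an ELF target with GCC or Clang is assumed.

#include <cstdio>

struct Entry { long Value; };

// Two entries placed in a custom, identifier-named section.
__attribute__((used, section("asan_demo"))) static Entry A = {1};
__attribute__((used, section("asan_demo"))) static Entry B = {2};

// The linker defines these bounds automatically for the "asan_demo" section.
extern Entry __start_asan_demo[];
extern Entry __stop_asan_demo[];

int main() {
  for (Entry *E = __start_asan_demo; E != __stop_asan_demo; ++E)
    printf("entry: %ld\n", E->Value);
  return 0;
}
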
+
+void ModuleAddressSanitizer::InstrumentGlobalsMachO(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+
+ // On recent Mach-O platforms, use a structure which binds the liveness of
+ // the global variable to the metadata struct. Keep the list of "Liveness" GV
+ // created to be added to llvm.compiler.used
+ StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy);
+ SmallVector<GlobalValue *, 16> LivenessGlobals(ExtendedGlobals.size());
+
+ for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
+ Constant *Initializer = MetadataInitializers[i];
+ GlobalVariable *G = ExtendedGlobals[i];
+ GlobalVariable *Metadata =
+ CreateMetadataGlobal(M, Initializer, G->getName());
+
+ // On recent Mach-O platforms, we emit the global metadata in a way that
+ // allows the linker to properly strip dead globals.
+ auto LivenessBinder =
+ ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u),
+ ConstantExpr::getPointerCast(Metadata, IntptrTy));
+ GlobalVariable *Liveness = new GlobalVariable(
+ M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder,
+ Twine("__asan_binder_") + G->getName());
+ Liveness->setSection("__DATA,__asan_liveness,regular,live_support");
+ LivenessGlobals[i] = Liveness;
+ }
+
+ // Update llvm.compiler.used, adding the new liveness globals. This is
+ // needed so that during LTO these variables stay alive. The alternative
+  // would be to have the linker handle the LTO symbols, but libLTO's
+  // current API does not expose access to the section for each symbol.
+ if (!LivenessGlobals.empty())
+ appendToCompilerUsed(M, LivenessGlobals);
+
+ // RegisteredFlag serves two purposes. First, we can pass it to dladdr()
+ // to look up the loaded image that contains it. Second, we can store in it
+ // whether registration has already occurred, to prevent duplicate
+ // registration.
+ //
+ // common linkage ensures that there is only one global per shared library.
+ GlobalVariable *RegisteredFlag = new GlobalVariable(
+ M, IntptrTy, false, GlobalVariable::CommonLinkage,
+ ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
+ RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
+
+ IRB.CreateCall(AsanRegisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
+ {IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
+}
+
+void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
+ IRBuilder<> &IRB, Module &M, ArrayRef<GlobalVariable *> ExtendedGlobals,
+ ArrayRef<Constant *> MetadataInitializers) {
+ assert(ExtendedGlobals.size() == MetadataInitializers.size());
+ unsigned N = ExtendedGlobals.size();
+ assert(N > 0);
+
+ // On platforms that don't have a custom metadata section, we emit an array
+ // of global metadata structures.
+ ArrayType *ArrayOfGlobalStructTy =
+ ArrayType::get(MetadataInitializers[0]->getType(), N);
+ auto AllGlobals = new GlobalVariable(
+ M, ArrayOfGlobalStructTy, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfGlobalStructTy, MetadataInitializers), "");
+ if (Mapping.Scale > 3)
+ AllGlobals->setAlignment(Align(1ULL << Mapping.Scale));
+
+ IRB.CreateCall(AsanRegisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+
+ // We also need to unregister globals at the end, e.g., when a shared library
+ // gets closed.
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
+ IRB_Dtor.CreateCall(AsanUnregisterGlobals,
+ {IRB.CreatePointerCast(AllGlobals, IntptrTy),
+ ConstantInt::get(IntptrTy, N)});
+}
+
+// This function replaces all global variables with new variables that have
+// trailing redzones. It also creates a function that poisons
+// redzones and inserts this function into llvm.global_ctors.
+// Sets *CtorComdat to true if the global registration code emitted into the
+// asan constructor is comdat-compatible.
+bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
+ bool *CtorComdat) {
+ *CtorComdat = false;
+
+ // Build set of globals that are aliased by some GA, where
+ // getExcludedAliasedGlobal(GA) returns the relevant GlobalVariable.
+ SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
+ if (CompileKernel) {
+ for (auto &GA : M.aliases()) {
+ if (const GlobalVariable *GV = getExcludedAliasedGlobal(GA))
+ AliasedGlobalExclusions.insert(GV);
+ }
+ }
+
+ SmallVector<GlobalVariable *, 16> GlobalsToChange;
+ for (auto &G : M.globals()) {
+ if (!AliasedGlobalExclusions.count(&G) && shouldInstrumentGlobal(&G))
+ GlobalsToChange.push_back(&G);
+ }
+
+ size_t n = GlobalsToChange.size();
+ if (n == 0) {
+ *CtorComdat = true;
+ return false;
+ }
+
+ auto &DL = M.getDataLayout();
+
+ // A global is described by a structure
+ // size_t beg;
+ // size_t size;
+ // size_t size_with_redzone;
+ // const char *name;
+ // const char *module_name;
+ // size_t has_dynamic_init;
+ // void *source_location;
+ // size_t odr_indicator;
+ // We initialize an array of such structures and pass it to a run-time call.
+ StructType *GlobalStructTy =
+ StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy,
+ IntptrTy, IntptrTy, IntptrTy);
+ SmallVector<GlobalVariable *, 16> NewGlobals(n);
+ SmallVector<Constant *, 16> Initializers(n);
+
+ bool HasDynamicallyInitializedGlobals = false;
+
+  // We shouldn't merge same module names, as this string serves as the
+  // unique module ID at runtime.
+ GlobalVariable *ModuleName = createPrivateGlobalForString(
+ M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
+
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = GlobalsToChange[i];
+
+    // FIXME: Metadata should be attached directly to the global instead of
+    // being added to llvm.asan.globals.
+ auto MD = GlobalsMD.get(G);
+ StringRef NameForGlobal = G->getName();
+ // Create string holding the global name (use global name from metadata
+ // if it's available, otherwise just write the name of global variable).
+ GlobalVariable *Name = createPrivateGlobalForString(
+ M, MD.Name.empty() ? NameForGlobal : MD.Name,
+ /*AllowMerging*/ true, kAsanGenPrefix);
+
+ Type *Ty = G->getValueType();
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
+ Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
+
+ StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
+ Constant *NewInitializer = ConstantStruct::get(
+ NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy));
+
+ // Create a new global variable with enough space for a redzone.
+ GlobalValue::LinkageTypes Linkage = G->getLinkage();
+ if (G->isConstant() && Linkage == GlobalValue::PrivateLinkage)
+ Linkage = GlobalValue::InternalLinkage;
+ GlobalVariable *NewGlobal =
+ new GlobalVariable(M, NewTy, G->isConstant(), Linkage, NewInitializer,
+ "", G, G->getThreadLocalMode());
+ NewGlobal->copyAttributesFrom(G);
+ NewGlobal->setComdat(G->getComdat());
+ NewGlobal->setAlignment(MaybeAlign(getMinRedzoneSizeForGlobal()));
+    // Don't fold globals with redzones. The ODR violation detector and redzone
+    // poisoning implicitly create a dependence on the global's address, so it
+    // is no longer valid for it to be marked unnamed_addr.
+ NewGlobal->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+ // Move null-terminated C strings to "__asan_cstring" section on Darwin.
+ if (TargetTriple.isOSBinFormatMachO() && !G->hasSection() &&
+ G->isConstant()) {
+ auto Seq = dyn_cast<ConstantDataSequential>(G->getInitializer());
+ if (Seq && Seq->isCString())
+ NewGlobal->setSection("__TEXT,__asan_cstring,regular");
+ }
+
// Transfer the debug info and type metadata. The payload starts at offset
// zero so we can copy the metadata over as is.
NewGlobal->copyMetadata(G, 0);
-
- Value *Indices2[2];
- Indices2[0] = IRB.getInt32(0);
- Indices2[1] = IRB.getInt32(0);
-
- G->replaceAllUsesWith(
- ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
- NewGlobal->takeName(G);
- G->eraseFromParent();
- NewGlobals[i] = NewGlobal;
-
- Constant *SourceLoc;
- if (!MD.SourceLoc.empty()) {
- auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
- SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
- } else {
- SourceLoc = ConstantInt::get(IntptrTy, 0);
- }
-
- Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
- GlobalValue *InstrumentedGlobal = NewGlobal;
-
- bool CanUsePrivateAliases =
- TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
- TargetTriple.isOSBinFormatWasm();
- if (CanUsePrivateAliases && UsePrivateAlias) {
- // Create local alias for NewGlobal to avoid crash on ODR between
- // instrumented and non-instrumented libraries.
- InstrumentedGlobal =
- GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
- }
-
- // ODR should not happen for local linkage.
- if (NewGlobal->hasLocalLinkage()) {
- ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
- IRB.getInt8PtrTy());
- } else if (UseOdrIndicator) {
- // With local aliases, we need to provide another externally visible
- // symbol __odr_asan_XXX to detect ODR violation.
- auto *ODRIndicatorSym =
- new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
- Constant::getNullValue(IRB.getInt8Ty()),
- kODRGenPrefix + NameForGlobal, nullptr,
- NewGlobal->getThreadLocalMode());
-
- // Set meaningful attributes for indicator symbol.
- ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
- ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
- ODRIndicatorSym->setAlignment(Align(1));
- ODRIndicator = ODRIndicatorSym;
- }
-
- Constant *Initializer = ConstantStruct::get(
- GlobalStructTy,
- ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
- ConstantInt::get(IntptrTy, SizeInBytes),
- ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
- ConstantExpr::getPointerCast(Name, IntptrTy),
- ConstantExpr::getPointerCast(ModuleName, IntptrTy),
- ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
- ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
-
- if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
-
- LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
-
- Initializers[i] = Initializer;
- }
-
-  // Add instrumented globals to the llvm.compiler.used list to prevent LTO
-  // from ConstantMerge'ing them.
- SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
- for (size_t i = 0; i < n; i++) {
- GlobalVariable *G = NewGlobals[i];
- if (G->getName().empty()) continue;
- GlobalsToAddToUsedList.push_back(G);
- }
- appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
-
- std::string ELFUniqueModuleId =
- (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
- : "";
-
- if (!ELFUniqueModuleId.empty()) {
- InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
- *CtorComdat = true;
- } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
- InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
- } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
- InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
- } else {
- InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
- }
-
- // Create calls for poisoning before initializers run and unpoisoning after.
- if (HasDynamicallyInitializedGlobals)
- createInitializerPoisonCalls(M, ModuleName);
-
- LLVM_DEBUG(dbgs() << M);
- return true;
-}
-
-uint64_t
-ModuleAddressSanitizer::getRedzoneSizeForGlobal(uint64_t SizeInBytes) const {
- constexpr uint64_t kMaxRZ = 1 << 18;
- const uint64_t MinRZ = getMinRedzoneSizeForGlobal();
-
- // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
- uint64_t RZ =
- std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
-
- // Round up to multiple of MinRZ.
- if (SizeInBytes % MinRZ)
- RZ += MinRZ - (SizeInBytes % MinRZ);
- assert((RZ + SizeInBytes) % MinRZ == 0);
-
- return RZ;
-}
-
-int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
- int LongSize = M.getDataLayout().getPointerSizeInBits();
- bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
- int Version = 8;
- // 32-bit Android is one version ahead because of the switch to dynamic
- // shadow.
- Version += (LongSize == 32 && isAndroid);
- return Version;
-}
-
-bool ModuleAddressSanitizer::instrumentModule(Module &M) {
- initializeCallbacks(M);
-
- // Create a module constructor. A destructor is created lazily because not all
-  // platforms and not all modules need it.
- if (CompileKernel) {
- // The kernel always builds with its own runtime, and therefore does not
- // need the init and version check calls.
- AsanCtorFunction = createSanitizerCtor(M, kAsanModuleCtorName);
- } else {
- std::string AsanVersion = std::to_string(GetAsanVersion(M));
- std::string VersionCheckName =
- ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
- std::tie(AsanCtorFunction, std::ignore) =
- createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName,
- kAsanInitName, /*InitArgTypes=*/{},
- /*InitArgs=*/{}, VersionCheckName);
- }
-
- bool CtorComdat = true;
- if (ClGlobals) {
- IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
- InstrumentGlobals(IRB, M, &CtorComdat);
- }
-
- const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
-
- // Put the constructor and destructor in comdat if both
- // (1) global instrumentation is not TU-specific
- // (2) target is ELF.
- if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
- AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
- appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction);
- if (AsanDtorFunction) {
- AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
- appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction);
- }
- } else {
- appendToGlobalCtors(M, AsanCtorFunction, Priority);
- if (AsanDtorFunction)
- appendToGlobalDtors(M, AsanDtorFunction, Priority);
- }
-
- return true;
-}
-
-void AddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- // Create __asan_report* callbacks.
- // IsWrite, TypeSize and Exp are encoded in the function name.
- for (int Exp = 0; Exp < 2; Exp++) {
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
- const std::string TypeStr = AccessIsWrite ? "store" : "load";
- const std::string ExpStr = Exp ? "exp_" : "";
- const std::string EndingStr = Recover ? "_noabort" : "";
-
- SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
- SmallVector<Type *, 2> Args1{1, IntptrTy};
- if (Exp) {
- Type *ExpType = Type::getInt32Ty(*C);
- Args2.push_back(ExpType);
- Args1.push_back(ExpType);
- }
- AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args2, false));
-
- AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args2, false));
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
- AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- M.getOrInsertFunction(
- kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args1, false));
-
- AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
- M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
- FunctionType::get(IRB.getVoidTy(), Args1, false));
- }
- }
- }
-
- const std::string MemIntrinCallbackPrefix =
- CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
- AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- AsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- AsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy);
-
- AsanHandleNoReturnFunc =
- M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy());
-
- AsanPtrCmpFunction =
- M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanPtrSubFunction =
- M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy);
- if (Mapping.InGlobal)
- AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
- ArrayType::get(IRB.getInt8Ty(), 0));
-}
-
-bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
- // For each NSObject descendant having a +load method, this method is invoked
- // by the ObjC runtime before any of the static constructors is called.
- // Therefore we need to instrument such methods with a call to __asan_init
- // at the beginning in order to initialize our runtime before any access to
- // the shadow memory.
- // We cannot just ignore these methods, because they may call other
- // instrumented functions.
- if (F.getName().find(" load]") != std::string::npos) {
- FunctionCallee AsanInitFunction =
- declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
- IRBuilder<> IRB(&F.front(), F.front().begin());
- IRB.CreateCall(AsanInitFunction, {});
- return true;
- }
- return false;
-}
-
-bool AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
- // Generate code only when dynamic addressing is needed.
- if (Mapping.Offset != kDynamicShadowSentinel)
- return false;
-
- IRBuilder<> IRB(&F.front().front());
- if (Mapping.InGlobal) {
- if (ClWithIfuncSuppressRemat) {
- // An empty inline asm with input reg == output reg.
- // An opaque pointer-to-int cast, basically.
- InlineAsm *Asm = InlineAsm::get(
- FunctionType::get(IntptrTy, {AsanShadowGlobal->getType()}, false),
- StringRef(""), StringRef("=r,0"),
- /*hasSideEffects=*/false);
- LocalDynamicShadow =
- IRB.CreateCall(Asm, {AsanShadowGlobal}, ".asan.shadow");
- } else {
- LocalDynamicShadow =
- IRB.CreatePointerCast(AsanShadowGlobal, IntptrTy, ".asan.shadow");
- }
- } else {
- Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
- kAsanShadowMemoryDynamicAddress, IntptrTy);
- LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
- }
- return true;
-}
-
-void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
- // Find the one possible call to llvm.localescape and pre-mark allocas passed
- // to it as uninteresting. This assumes we haven't started processing allocas
- // yet. This check is done up front because iterating the use list in
- // isInterestingAlloca would be algorithmically slower.
- assert(ProcessedAllocas.empty() && "must process localescape before allocas");
-
- // Try to get the declaration of llvm.localescape. If it's not in the module,
- // we can exit early.
- if (!F.getParent()->getFunction("llvm.localescape")) return;
-
-  // Look for a call to llvm.localescape in the entry block. It can't be in
- // any other block.
- for (Instruction &I : F.getEntryBlock()) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::localescape) {
- // We found a call. Mark all the allocas passed in as uninteresting.
- for (Value *Arg : II->arg_operands()) {
- AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
- assert(AI && AI->isStaticAlloca() &&
- "non-static alloca arg to localescape");
- ProcessedAllocas[AI] = false;
- }
- break;
- }
- }
-}
-
-bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
- bool ShouldInstrument =
- ClDebugMin < 0 || ClDebugMax < 0 ||
- (Instrumented >= ClDebugMin && Instrumented <= ClDebugMax);
- Instrumented++;
- return !ShouldInstrument;
-}
-
-bool AddressSanitizer::instrumentFunction(Function &F,
- const TargetLibraryInfo *TLI) {
- if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
- if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
- if (F.getName().startswith("__asan_")) return false;
-
- bool FunctionModified = false;
-
- // If needed, insert __asan_init before checking for SanitizeAddress attr.
- // This function needs to be called even if the function body is not
- // instrumented.
- if (maybeInsertAsanInitAtFunctionEntry(F))
- FunctionModified = true;
-
- // Leave if the function doesn't need instrumentation.
- if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
-
- LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
-
- initializeCallbacks(*F.getParent());
-
- FunctionStateRAII CleanupObj(this);
-
- FunctionModified |= maybeInsertDynamicShadowAtFunctionEntry(F);
-
- // We can't instrument allocas used with llvm.localescape. Only static allocas
- // can be passed to that intrinsic.
- markEscapedLocalAllocas(F);
-
- // We want to instrument every address only once per basic block (unless there
- // are calls between uses).
- SmallPtrSet<Value *, 16> TempsToInstrument;
- SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
- SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
- SmallVector<Instruction *, 8> NoReturnCalls;
- SmallVector<BasicBlock *, 16> AllBlocks;
- SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
- int NumAllocas = 0;
-
- // Fill the set of memory operations to instrument.
- for (auto &BB : F) {
- AllBlocks.push_back(&BB);
- TempsToInstrument.clear();
- int NumInsnsPerBB = 0;
- for (auto &Inst : BB) {
- if (LooksLikeCodeInBug11395(&Inst)) return false;
- SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
- getInterestingMemoryOperands(&Inst, InterestingOperands);
-
- if (!InterestingOperands.empty()) {
- for (auto &Operand : InterestingOperands) {
- if (ClOpt && ClOptSameTemp) {
- Value *Ptr = Operand.getPtr();
- // If we have a mask, skip instrumentation if we've already
- // instrumented the full object. But don't add to TempsToInstrument
- // because we might get another load/store with a different mask.
- if (Operand.MaybeMask) {
- if (TempsToInstrument.count(Ptr))
- continue; // We've seen this (whole) temp in the current BB.
- } else {
- if (!TempsToInstrument.insert(Ptr).second)
- continue; // We've seen this temp in the current BB.
- }
- }
- OperandsToInstrument.push_back(Operand);
- NumInsnsPerBB++;
- }
- } else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
- isInterestingPointerComparison(&Inst)) ||
- ((ClInvalidPointerPairs || ClInvalidPointerSub) &&
- isInterestingPointerSubtraction(&Inst))) {
- PointerComparisonsOrSubtracts.push_back(&Inst);
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
- // ok, take it.
- IntrinToInstrument.push_back(MI);
- NumInsnsPerBB++;
- } else {
- if (isa<AllocaInst>(Inst)) NumAllocas++;
- if (auto *CB = dyn_cast<CallBase>(&Inst)) {
- // A call inside BB.
- TempsToInstrument.clear();
- if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize"))
- NoReturnCalls.push_back(CB);
- }
- if (CallInst *CI = dyn_cast<CallInst>(&Inst))
- maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
- }
- if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
- }
- }
-
- bool UseCalls = (ClInstrumentationWithCallsThreshold >= 0 &&
- OperandsToInstrument.size() + IntrinToInstrument.size() >
- (unsigned)ClInstrumentationWithCallsThreshold);
- const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOpts ObjSizeOpts;
- ObjSizeOpts.RoundToAlign = true;
- ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
-
- // Instrument.
- int NumInstrumented = 0;
- for (auto &Operand : OperandsToInstrument) {
- if (!suppressInstrumentationSiteForDebug(NumInstrumented))
- instrumentMop(ObjSizeVis, Operand, UseCalls,
- F.getParent()->getDataLayout());
- FunctionModified = true;
- }
- for (auto Inst : IntrinToInstrument) {
- if (!suppressInstrumentationSiteForDebug(NumInstrumented))
- instrumentMemIntrinsic(Inst);
- FunctionModified = true;
- }
-
- FunctionStackPoisoner FSP(F, *this);
- bool ChangedStack = FSP.runOnFunction();
-
- // We must unpoison the stack before NoReturn calls (throw, _exit, etc).
- // See e.g. https://github.com/google/sanitizers/issues/37
- for (auto CI : NoReturnCalls) {
- IRBuilder<> IRB(CI);
- IRB.CreateCall(AsanHandleNoReturnFunc, {});
- }
-
- for (auto Inst : PointerComparisonsOrSubtracts) {
- instrumentPointerComparisonOrSubtraction(Inst);
- FunctionModified = true;
- }
-
- if (ChangedStack || !NoReturnCalls.empty())
- FunctionModified = true;
-
- LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
- << F << "\n");
-
- return FunctionModified;
-}
-
-// Workaround for bug 11395: we don't want to instrument stack in functions
-// with large assembly blobs (32-bit only), otherwise reg alloc may crash.
-// FIXME: remove once the bug 11395 is fixed.
-bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
- if (LongSize != 32) return false;
- CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI || !CI->isInlineAsm()) return false;
- if (CI->getNumArgOperands() <= 5) return false;
- // We have inline assembly with quite a few arguments.
- return true;
-}
-
-void FunctionStackPoisoner::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
- std::string Suffix = itostr(i);
- AsanStackMallocFunc[i] = M.getOrInsertFunction(
- kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy);
- AsanStackFreeFunc[i] =
- M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
- IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
- if (ASan.UseAfterScope) {
- AsanPoisonStackMemoryFunc = M.getOrInsertFunction(
- kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanUnpoisonStackMemoryFunc = M.getOrInsertFunction(
- kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
-
- for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
- std::ostringstream Name;
- Name << kAsanSetShadowPrefix;
- Name << std::setw(2) << std::setfill('0') << std::hex << Val;
- AsanSetShadowFunc[Val] =
- M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy);
- }
-
- AsanAllocaPoisonFunc = M.getOrInsertFunction(
- kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
- AsanAllocasUnpoisonFunc = M.getOrInsertFunction(
- kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
-}
-
-void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End,
- IRBuilder<> &IRB,
- Value *ShadowBase) {
- if (Begin >= End)
- return;
-
- const size_t LargestStoreSizeInBytes =
- std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
-
- const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
-
-  // Poison the given range in shadow using the largest store size without
-  // leading and trailing zeros in ShadowMask. Zeros never change, so they need
-  // neither poisoning nor up-poisoning. Still, we don't mind if some of them
-  // get into the middle of a store.
- for (size_t i = Begin; i < End;) {
- if (!ShadowMask[i]) {
- assert(!ShadowBytes[i]);
- ++i;
- continue;
- }
-
- size_t StoreSizeInBytes = LargestStoreSizeInBytes;
- // Fit store size into the range.
- while (StoreSizeInBytes > End - i)
- StoreSizeInBytes /= 2;
-
- // Minimize store size by trimming trailing zeros.
- for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
- while (j <= StoreSizeInBytes / 2)
- StoreSizeInBytes /= 2;
- }
-
- uint64_t Val = 0;
- for (size_t j = 0; j < StoreSizeInBytes; j++) {
- if (IsLittleEndian)
- Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
- else
- Val = (Val << 8) | ShadowBytes[i + j];
- }
-
- Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
- Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
- IRB.CreateAlignedStore(
- Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()),
- Align(1));
-
- i += StoreSizeInBytes;
- }
-}
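
A minimal standalone sketch of the byte-packing step above, not LLVM code and assuming a little-endian host (the common branch): consecutive shadow bytes are folded into one integer so a single wide store poisons several shadow cells at once. The byte values 0xf1 and 0xf3 are merely illustrative shadow contents.

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Fold N shadow bytes into one little-endian value, as the inline path does
// before emitting a single aligned store to the shadow.
static uint64_t packShadowLE(const uint8_t *Bytes, size_t N) {
  uint64_t Val = 0;
  for (size_t J = 0; J < N; ++J)
    Val |= (uint64_t)Bytes[J] << (8 * J);
  return Val;
}

int main() {
  const uint8_t Shadow[4] = {0xf1, 0xf1, 0x00, 0xf3};
  printf("0x%08llx\n", (unsigned long long)packShadowLE(Shadow, 4)); // 0xf300f1f1
  return 0;
}
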
-
-void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- IRBuilder<> &IRB, Value *ShadowBase) {
- copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
-}
-
-void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
- ArrayRef<uint8_t> ShadowBytes,
- size_t Begin, size_t End,
- IRBuilder<> &IRB, Value *ShadowBase) {
- assert(ShadowMask.size() == ShadowBytes.size());
- size_t Done = Begin;
- for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
- if (!ShadowMask[i]) {
- assert(!ShadowBytes[i]);
- continue;
- }
- uint8_t Val = ShadowBytes[i];
- if (!AsanSetShadowFunc[Val])
- continue;
-
- // Skip same values.
- for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
- }
-
- if (j - i >= ClMaxInlinePoisoningSize) {
- copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
- IRB.CreateCall(AsanSetShadowFunc[Val],
- {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
- ConstantInt::get(IntptrTy, j - i)});
- Done = j;
- }
- }
-
- copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
-}
-
-// Fake stack allocator (asan_fake_stack.h) has 11 size classes
-// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
-static int StackMallocSizeClass(uint64_t LocalStackSize) {
- assert(LocalStackSize <= kMaxStackMallocSize);
- uint64_t MaxSize = kMinStackMallocSize;
- for (int i = 0;; i++, MaxSize *= 2)
- if (LocalStackSize <= MaxSize) return i;
- llvm_unreachable("impossible LocalStackSize");
-}
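
As a quick illustration of the size-class search above, a standalone sketch rather than LLVM code; the minimum size of 64 bytes is an assumption, not the actual kMinStackMallocSize constant:

#include <cstdint>
#include <cstdio>

// Mirror of StackMallocSizeClass with the minimum class size made explicit.
static int sizeClassFor(uint64_t LocalStackSize, uint64_t MinSize) {
  uint64_t MaxSize = MinSize;
  for (int i = 0;; i++, MaxSize *= 2)
    if (LocalStackSize <= MaxSize)
      return i;
}

int main() {
  printf("%d\n", sizeClassFor(64, 64));   // class 0
  printf("%d\n", sizeClassFor(1000, 64)); // class 4: 64, 128, 256, 512, 1024
  return 0;
}
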
-
-void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
- Instruction *CopyInsertPoint = &F.front().front();
- if (CopyInsertPoint == ASan.LocalDynamicShadow) {
- // Insert after the dynamic shadow location is determined
- CopyInsertPoint = CopyInsertPoint->getNextNode();
- assert(CopyInsertPoint);
- }
- IRBuilder<> IRB(CopyInsertPoint);
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (Argument &Arg : F.args()) {
- if (Arg.hasByValAttr()) {
- Type *Ty = Arg.getParamByValType();
- const Align Alignment =
- DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty);
-
- AllocaInst *AI = IRB.CreateAlloca(
- Ty, nullptr,
- (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) +
- ".byval");
- AI->setAlignment(Alignment);
- Arg.replaceAllUsesWith(AI);
-
- uint64_t AllocSize = DL.getTypeAllocSize(Ty);
- IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize);
- }
- }
-}
-
-PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
- Value *ValueIfTrue,
- Instruction *ThenTerm,
- Value *ValueIfFalse) {
- PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
- BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
- PHI->addIncoming(ValueIfFalse, CondBlock);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- PHI->addIncoming(ValueIfTrue, ThenBlock);
- return PHI;
-}
-
-Value *FunctionStackPoisoner::createAllocaForLayout(
- IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
- AllocaInst *Alloca;
- if (Dynamic) {
- Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
- ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
- "MyAlloca");
- } else {
- Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
- nullptr, "MyAlloca");
- assert(Alloca->isStaticAlloca());
- }
- assert((ClRealignStack & (ClRealignStack - 1)) == 0);
- size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
- Alloca->setAlignment(Align(FrameAlignment));
- return IRB.CreatePointerCast(Alloca, IntptrTy);
-}
-
-void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
- BasicBlock &FirstBB = *F.begin();
- IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
- DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
- IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
- DynamicAllocaLayout->setAlignment(Align(32));
-}
-
-void FunctionStackPoisoner::processDynamicAllocas() {
- if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
- assert(DynamicAllocaPoisonCallVec.empty());
- return;
- }
-
- // Insert poison calls for lifetime intrinsics for dynamic allocas.
- for (const auto &APC : DynamicAllocaPoisonCallVec) {
- assert(APC.InsBefore);
- assert(APC.AI);
- assert(ASan.isInterestingAlloca(*APC.AI));
- assert(!APC.AI->isStaticAlloca());
-
- IRBuilder<> IRB(APC.InsBefore);
- poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
- // Dynamic allocas will be unpoisoned unconditionally below in
- // unpoisonDynamicAllocas.
-    // Flag that we need to unpoison static allocas.
- }
-
- // Handle dynamic allocas.
- createDynamicAllocasInitStorage();
- for (auto &AI : DynamicAllocaVec)
- handleDynamicAllocaCall(AI);
- unpoisonDynamicAllocas();
-}
-
-/// Collect instructions in the entry block after \p InsBefore which initialize
-/// permanent storage for a function argument. These instructions must remain in
-/// the entry block so that uninitialized values do not appear in backtraces. An
-/// added benefit is that this conserves spill slots. This does not move stores
-/// before instrumented / "interesting" allocas.
-static void findStoresToUninstrumentedArgAllocas(
- AddressSanitizer &ASan, Instruction &InsBefore,
- SmallVectorImpl<Instruction *> &InitInsts) {
- Instruction *Start = InsBefore.getNextNonDebugInstruction();
- for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) {
- // Argument initialization looks like:
- // 1) store <Argument>, <Alloca> OR
- // 2) <CastArgument> = cast <Argument> to ...
- // store <CastArgument> to <Alloca>
- // Do not consider any other kind of instruction.
- //
- // Note: This covers all known cases, but may not be exhaustive. An
- // alternative to pattern-matching stores is to DFS over all Argument uses:
- // this might be more general, but is probably much more complicated.
- if (isa<AllocaInst>(It) || isa<CastInst>(It))
- continue;
- if (auto *Store = dyn_cast<StoreInst>(It)) {
- // The store destination must be an alloca that isn't interesting for
- // ASan to instrument. These are moved up before InsBefore, and they're
- // not interesting because allocas for arguments can be mem2reg'd.
- auto *Alloca = dyn_cast<AllocaInst>(Store->getPointerOperand());
- if (!Alloca || ASan.isInterestingAlloca(*Alloca))
- continue;
-
- Value *Val = Store->getValueOperand();
- bool IsDirectArgInit = isa<Argument>(Val);
- bool IsArgInitViaCast =
- isa<CastInst>(Val) &&
- isa<Argument>(cast<CastInst>(Val)->getOperand(0)) &&
- // Check that the cast appears directly before the store. Otherwise
- // moving the cast before InsBefore may break the IR.
- Val == It->getPrevNonDebugInstruction();
- bool IsArgInit = IsDirectArgInit || IsArgInitViaCast;
- if (!IsArgInit)
- continue;
-
- if (IsArgInitViaCast)
- InitInsts.push_back(cast<Instruction>(Val));
- InitInsts.push_back(Store);
- continue;
- }
-
- // Do not reorder past unknown instructions: argument initialization should
- // only involve casts and stores.
- return;
- }
-}
-
-void FunctionStackPoisoner::processStaticAllocas() {
- if (AllocaVec.empty()) {
- assert(StaticAllocaPoisonCallVec.empty());
- return;
- }
-
- int StackMallocIdx = -1;
- DebugLoc EntryDebugLocation;
- if (auto SP = F.getSubprogram())
+
+ Value *Indices2[2];
+ Indices2[0] = IRB.getInt32(0);
+ Indices2[1] = IRB.getInt32(0);
+
+ G->replaceAllUsesWith(
+ ConstantExpr::getGetElementPtr(NewTy, NewGlobal, Indices2, true));
+ NewGlobal->takeName(G);
+ G->eraseFromParent();
+ NewGlobals[i] = NewGlobal;
+
+ Constant *SourceLoc;
+ if (!MD.SourceLoc.empty()) {
+ auto SourceLocGlobal = createPrivateGlobalForSourceLoc(M, MD.SourceLoc);
+ SourceLoc = ConstantExpr::getPointerCast(SourceLocGlobal, IntptrTy);
+ } else {
+ SourceLoc = ConstantInt::get(IntptrTy, 0);
+ }
+
+ Constant *ODRIndicator = ConstantExpr::getNullValue(IRB.getInt8PtrTy());
+ GlobalValue *InstrumentedGlobal = NewGlobal;
+
+ bool CanUsePrivateAliases =
+ TargetTriple.isOSBinFormatELF() || TargetTriple.isOSBinFormatMachO() ||
+ TargetTriple.isOSBinFormatWasm();
+ if (CanUsePrivateAliases && UsePrivateAlias) {
+ // Create local alias for NewGlobal to avoid crash on ODR between
+ // instrumented and non-instrumented libraries.
+ InstrumentedGlobal =
+ GlobalAlias::create(GlobalValue::PrivateLinkage, "", NewGlobal);
+ }
+
+ // ODR should not happen for local linkage.
+ if (NewGlobal->hasLocalLinkage()) {
+ ODRIndicator = ConstantExpr::getIntToPtr(ConstantInt::get(IntptrTy, -1),
+ IRB.getInt8PtrTy());
+ } else if (UseOdrIndicator) {
+ // With local aliases, we need to provide another externally visible
+ // symbol __odr_asan_XXX to detect ODR violation.
+ auto *ODRIndicatorSym =
+ new GlobalVariable(M, IRB.getInt8Ty(), false, Linkage,
+ Constant::getNullValue(IRB.getInt8Ty()),
+ kODRGenPrefix + NameForGlobal, nullptr,
+ NewGlobal->getThreadLocalMode());
+
+ // Set meaningful attributes for indicator symbol.
+ ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
+ ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
+ ODRIndicatorSym->setAlignment(Align(1));
+ ODRIndicator = ODRIndicatorSym;
+ }
+
+ Constant *Initializer = ConstantStruct::get(
+ GlobalStructTy,
+ ConstantExpr::getPointerCast(InstrumentedGlobal, IntptrTy),
+ ConstantInt::get(IntptrTy, SizeInBytes),
+ ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize),
+ ConstantExpr::getPointerCast(Name, IntptrTy),
+ ConstantExpr::getPointerCast(ModuleName, IntptrTy),
+ ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc,
+ ConstantExpr::getPointerCast(ODRIndicator, IntptrTy));
+
+ if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true;
+
+ LLVM_DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n");
+
+ Initializers[i] = Initializer;
+ }
+
+  // Add instrumented globals to the llvm.compiler.used list to prevent LTO
+  // from ConstantMerge'ing them.
+ SmallVector<GlobalValue *, 16> GlobalsToAddToUsedList;
+ for (size_t i = 0; i < n; i++) {
+ GlobalVariable *G = NewGlobals[i];
+ if (G->getName().empty()) continue;
+ GlobalsToAddToUsedList.push_back(G);
+ }
+ appendToCompilerUsed(M, ArrayRef<GlobalValue *>(GlobalsToAddToUsedList));
+
+ std::string ELFUniqueModuleId =
+ (UseGlobalsGC && TargetTriple.isOSBinFormatELF()) ? getUniqueModuleId(&M)
+ : "";
+
+ if (!ELFUniqueModuleId.empty()) {
+ InstrumentGlobalsELF(IRB, M, NewGlobals, Initializers, ELFUniqueModuleId);
+ *CtorComdat = true;
+ } else if (UseGlobalsGC && TargetTriple.isOSBinFormatCOFF()) {
+ InstrumentGlobalsCOFF(IRB, M, NewGlobals, Initializers);
+ } else if (UseGlobalsGC && ShouldUseMachOGlobalsSection()) {
+ InstrumentGlobalsMachO(IRB, M, NewGlobals, Initializers);
+ } else {
+ InstrumentGlobalsWithMetadataArray(IRB, M, NewGlobals, Initializers);
+ }
+
+ // Create calls for poisoning before initializers run and unpoisoning after.
+ if (HasDynamicallyInitializedGlobals)
+ createInitializerPoisonCalls(M, ModuleName);
+
+ LLVM_DEBUG(dbgs() << M);
+ return true;
+}
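
For reference, the eight pointer-sized fields that each Initializer packs into GlobalStructTy correspond to the layout sketched below. This is an illustrative C++ rendering of the descriptor comment in InstrumentGlobals, not a definition taken from the runtime.

#include <cstddef>

struct AsanGlobalDescriptor {
  size_t beg;               // address of the instrumented (redzoned) global
  size_t size;              // size of the original payload in bytes
  size_t size_with_redzone; // payload size plus the trailing redzone
  const char *name;         // name string created for the global
  const char *module_name;  // unique module identifier string
  size_t has_dynamic_init;  // nonzero for dynamically initialized globals
  void *source_location;    // optional source-location descriptor, or null
  size_t odr_indicator;     // ODR indicator address, or 0 / -1 for local linkage
};

static_assert(sizeof(AsanGlobalDescriptor) == 8 * sizeof(size_t),
              "all fields are pointer-sized, matching GlobalStructTy");
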
+
+uint64_t
+ModuleAddressSanitizer::getRedzoneSizeForGlobal(uint64_t SizeInBytes) const {
+ constexpr uint64_t kMaxRZ = 1 << 18;
+ const uint64_t MinRZ = getMinRedzoneSizeForGlobal();
+
+ // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+ uint64_t RZ =
+ std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
+
+ // Round up to multiple of MinRZ.
+ if (SizeInBytes % MinRZ)
+ RZ += MinRZ - (SizeInBytes % MinRZ);
+ assert((RZ + SizeInBytes) % MinRZ == 0);
+
+ return RZ;
+}
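
Worked numbers for the sizing rule above, as a standalone sketch rather than LLVM code; MinRZ is fixed at 32 here purely for illustration, whereas getMinRedzoneSizeForGlobal depends on configuration:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t redzoneFor(uint64_t SizeInBytes, uint64_t MinRZ) {
  constexpr uint64_t kMaxRZ = 1 << 18;
  // Roughly SizeInBytes / 4, clamped to [MinRZ, kMaxRZ] in steps of MinRZ.
  uint64_t RZ =
      std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
  // Pad so that payload plus redzone is a multiple of MinRZ.
  if (SizeInBytes % MinRZ)
    RZ += MinRZ - (SizeInBytes % MinRZ);
  assert((RZ + SizeInBytes) % MinRZ == 0);
  return RZ;
}

int main() {
  printf("%llu\n", (unsigned long long)redzoneFor(100, 32));  // 60: 100 + 60 = 160
  printf("%llu\n", (unsigned long long)redzoneFor(4096, 32)); // 1024, about a quarter of the size
  return 0;
}
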
+
+int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
+ int LongSize = M.getDataLayout().getPointerSizeInBits();
+ bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
+ int Version = 8;
+ // 32-bit Android is one version ahead because of the switch to dynamic
+ // shadow.
+ Version += (LongSize == 32 && isAndroid);
+ return Version;
+}
+
+bool ModuleAddressSanitizer::instrumentModule(Module &M) {
+ initializeCallbacks(M);
+
+ // Create a module constructor. A destructor is created lazily because not all
+  // platforms and not all modules need it.
+ if (CompileKernel) {
+ // The kernel always builds with its own runtime, and therefore does not
+ // need the init and version check calls.
+ AsanCtorFunction = createSanitizerCtor(M, kAsanModuleCtorName);
+ } else {
+ std::string AsanVersion = std::to_string(GetAsanVersion(M));
+ std::string VersionCheckName =
+ ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
+ std::tie(AsanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName,
+ kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, VersionCheckName);
+ }
+
+ bool CtorComdat = true;
+ if (ClGlobals) {
+ IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
+ InstrumentGlobals(IRB, M, &CtorComdat);
+ }
+
+ const uint64_t Priority = GetCtorAndDtorPriority(TargetTriple);
+
+ // Put the constructor and destructor in comdat if both
+ // (1) global instrumentation is not TU-specific and
+ // (2) the target is ELF.
+ if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) {
+ AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName));
+ appendToGlobalCtors(M, AsanCtorFunction, Priority, AsanCtorFunction);
+ if (AsanDtorFunction) {
+ AsanDtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleDtorName));
+ appendToGlobalDtors(M, AsanDtorFunction, Priority, AsanDtorFunction);
+ }
+ } else {
+ appendToGlobalCtors(M, AsanCtorFunction, Priority);
+ if (AsanDtorFunction)
+ appendToGlobalDtors(M, AsanDtorFunction, Priority);
+ }
+
+ return true;
+}
+
+void AddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ // Create __asan_report* callbacks.
+ // IsWrite, TypeSize and Exp are encoded in the function name.
+ for (int Exp = 0; Exp < 2; Exp++) {
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string ExpStr = Exp ? "exp_" : "";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+
+ SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+ SmallVector<Type *, 2> Args1{1, IntptrTy};
+ if (Exp) {
+ Type *ExpType = Type::getInt32Ty(*C);
+ Args2.push_back(ExpType);
+ Args1.push_back(ExpType);
+ }
+ AsanErrorCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + TypeStr + "_n" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+ AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] = M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args2, false));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+ AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false));
+
+ AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), Args1, false));
+ }
+ }
+ }
+
+ const std::string MemIntrinCallbackPrefix =
+ CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
+ AsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ AsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ AsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy);
+
+ AsanHandleNoReturnFunc =
+ M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy());
+
+ AsanPtrCmpFunction =
+ M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanPtrSubFunction =
+ M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ if (Mapping.InGlobal)
+ AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+}
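+
+// Editorial note: assuming the usual "__asan_report_" error template and the
+// default "__asan_" memory-access callback prefix, the loops above produce
+// names such as
+//   __asan_report_load4, __asan_report_exp_store8, __asan_report_load_n_noabort
+// for the reporting callbacks and
+//   __asan_load1 ... __asan_store16, __asan_loadN / __asan_storeN
+// for the non-reporting access callbacks.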
+
+bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
+ // For each NSObject descendant having a +load method, this method is invoked
+ // by the ObjC runtime before any of the static constructors is called.
+ // Therefore we need to instrument such methods with a call to __asan_init
+ // at the beginning in order to initialize our runtime before any access to
+ // the shadow memory.
+ // We cannot just ignore these methods, because they may call other
+ // instrumented functions.
+ if (F.getName().find(" load]") != std::string::npos) {
+ FunctionCallee AsanInitFunction =
+ declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
+ IRBuilder<> IRB(&F.front(), F.front().begin());
+ IRB.CreateCall(AsanInitFunction, {});
+ return true;
+ }
+ return false;
+}
+
+bool AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+ // Generate code only when dynamic addressing is needed.
+ if (Mapping.Offset != kDynamicShadowSentinel)
+ return false;
+
+ IRBuilder<> IRB(&F.front().front());
+ if (Mapping.InGlobal) {
+ if (ClWithIfuncSuppressRemat) {
+ // An empty inline asm with input reg == output reg.
+ // An opaque pointer-to-int cast, basically.
+ InlineAsm *Asm = InlineAsm::get(
+ FunctionType::get(IntptrTy, {AsanShadowGlobal->getType()}, false),
+ StringRef(""), StringRef("=r,0"),
+ /*hasSideEffects=*/false);
+ LocalDynamicShadow =
+ IRB.CreateCall(Asm, {AsanShadowGlobal}, ".asan.shadow");
+ } else {
+ LocalDynamicShadow =
+ IRB.CreatePointerCast(AsanShadowGlobal, IntptrTy, ".asan.shadow");
+ }
+ } else {
+ Value *GlobalDynamicAddress = F.getParent()->getOrInsertGlobal(
+ kAsanShadowMemoryDynamicAddress, IntptrTy);
+ LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
+ }
+ return true;
+}
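+
+// Rough shape of the emitted IR (editorial sketch, 64-bit target, names
+// illustrative): in the non-InGlobal case the shadow base is a plain load
+//   %shadow.base = load i64, i64* @<kAsanShadowMemoryDynamicAddress>
+// while in the Mapping.InGlobal case with ClWithIfuncSuppressRemat it is an
+// "=r,0" inline-asm pass-through of @__asan_shadow, which keeps the cast
+// opaque so the value is not rematerialized.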
+
+void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
+ // Find the one possible call to llvm.localescape and pre-mark allocas passed
+ // to it as uninteresting. This assumes we haven't started processing allocas
+ // yet. This check is done up front because iterating the use list in
+ // isInterestingAlloca would be algorithmically slower.
+ assert(ProcessedAllocas.empty() && "must process localescape before allocas");
+
+ // Try to get the declaration of llvm.localescape. If it's not in the module,
+ // we can exit early.
+ if (!F.getParent()->getFunction("llvm.localescape")) return;
+
+ // Look for a call to llvm.localescape in the entry block. It can't be in
+ // any other block.
+ for (Instruction &I : F.getEntryBlock()) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (II && II->getIntrinsicID() == Intrinsic::localescape) {
+ // We found a call. Mark all the allocas passed in as uninteresting.
+ for (Value *Arg : II->arg_operands()) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(Arg->stripPointerCasts());
+ assert(AI && AI->isStaticAlloca() &&
+ "non-static alloca arg to localescape");
+ ProcessedAllocas[AI] = false;
+ }
+ break;
+ }
+ }
+}
+
+bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
+ bool ShouldInstrument =
+ ClDebugMin < 0 || ClDebugMax < 0 ||
+ (Instrumented >= ClDebugMin && Instrumented <= ClDebugMax);
+ Instrumented++;
+ return !ShouldInstrument;
+}
+
+bool AddressSanitizer::instrumentFunction(Function &F,
+ const TargetLibraryInfo *TLI) {
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
+ if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
+ if (F.getName().startswith("__asan_")) return false;
+
+ bool FunctionModified = false;
+
+ // If needed, insert __asan_init before checking for SanitizeAddress attr.
+ // This function needs to be called even if the function body is not
+ // instrumented.
+ if (maybeInsertAsanInitAtFunctionEntry(F))
+ FunctionModified = true;
+
+ // Leave if the function doesn't need instrumentation.
+ if (!F.hasFnAttribute(Attribute::SanitizeAddress)) return FunctionModified;
+
+ LLVM_DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+
+ initializeCallbacks(*F.getParent());
+
+ FunctionStateRAII CleanupObj(this);
+
+ FunctionModified |= maybeInsertDynamicShadowAtFunctionEntry(F);
+
+ // We can't instrument allocas used with llvm.localescape. Only static allocas
+ // can be passed to that intrinsic.
+ markEscapedLocalAllocas(F);
+
+ // We want to instrument every address only once per basic block (unless there
+ // are calls between uses).
+ SmallPtrSet<Value *, 16> TempsToInstrument;
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
+ SmallVector<Instruction *, 8> NoReturnCalls;
+ SmallVector<BasicBlock *, 16> AllBlocks;
+ SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
+ int NumAllocas = 0;
+
+ // Fill the set of memory operations to instrument.
+ for (auto &BB : F) {
+ AllBlocks.push_back(&BB);
+ TempsToInstrument.clear();
+ int NumInsnsPerBB = 0;
+ for (auto &Inst : BB) {
+ if (LooksLikeCodeInBug11395(&Inst)) return false;
+ SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
+ getInterestingMemoryOperands(&Inst, InterestingOperands);
+
+ if (!InterestingOperands.empty()) {
+ for (auto &Operand : InterestingOperands) {
+ if (ClOpt && ClOptSameTemp) {
+ Value *Ptr = Operand.getPtr();
+ // If we have a mask, skip instrumentation if we've already
+ // instrumented the full object. But don't add to TempsToInstrument
+ // because we might get another load/store with a different mask.
+ if (Operand.MaybeMask) {
+ if (TempsToInstrument.count(Ptr))
+ continue; // We've seen this (whole) temp in the current BB.
+ } else {
+ if (!TempsToInstrument.insert(Ptr).second)
+ continue; // We've seen this temp in the current BB.
+ }
+ }
+ OperandsToInstrument.push_back(Operand);
+ NumInsnsPerBB++;
+ }
+ } else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
+ isInterestingPointerComparison(&Inst)) ||
+ ((ClInvalidPointerPairs || ClInvalidPointerSub) &&
+ isInterestingPointerSubtraction(&Inst))) {
+ PointerComparisonsOrSubtracts.push_back(&Inst);
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
+ // ok, take it.
+ IntrinToInstrument.push_back(MI);
+ NumInsnsPerBB++;
+ } else {
+ if (isa<AllocaInst>(Inst)) NumAllocas++;
+ if (auto *CB = dyn_cast<CallBase>(&Inst)) {
+ // A call inside BB.
+ TempsToInstrument.clear();
+ if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize"))
+ NoReturnCalls.push_back(CB);
+ }
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
+ }
+ if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
+ }
+ }
+
+ bool UseCalls = (ClInstrumentationWithCallsThreshold >= 0 &&
+ OperandsToInstrument.size() + IntrinToInstrument.size() >
+ (unsigned)ClInstrumentationWithCallsThreshold);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOpts ObjSizeOpts;
+ ObjSizeOpts.RoundToAlign = true;
+ ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
+
+ // Instrument.
+ int NumInstrumented = 0;
+ for (auto &Operand : OperandsToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMop(ObjSizeVis, Operand, UseCalls,
+ F.getParent()->getDataLayout());
+ FunctionModified = true;
+ }
+ for (auto Inst : IntrinToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMemIntrinsic(Inst);
+ FunctionModified = true;
+ }
+
+ FunctionStackPoisoner FSP(F, *this);
+ bool ChangedStack = FSP.runOnFunction();
+
+ // We must unpoison the stack before NoReturn calls (throw, _exit, etc).
+ // See e.g. https://github.com/google/sanitizers/issues/37
+ for (auto CI : NoReturnCalls) {
+ IRBuilder<> IRB(CI);
+ IRB.CreateCall(AsanHandleNoReturnFunc, {});
+ }
+
+ for (auto Inst : PointerComparisonsOrSubtracts) {
+ instrumentPointerComparisonOrSubtraction(Inst);
+ FunctionModified = true;
+ }
+
+ if (ChangedStack || !NoReturnCalls.empty())
+ FunctionModified = true;
+
+ LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
+ << F << "\n");
+
+ return FunctionModified;
+}
+
+// Workaround for bug 11395: we don't want to instrument the stack in functions
+// with large assembly blobs (32-bit only); otherwise the register allocator may
+// crash. FIXME: remove once bug 11395 is fixed.
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
+ if (LongSize != 32) return false;
+ CallInst *CI = dyn_cast<CallInst>(I);
+ if (!CI || !CI->isInlineAsm()) return false;
+ if (CI->getNumArgOperands() <= 5) return false;
+ // We have inline assembly with quite a few arguments.
+ return true;
+}
+
+void FunctionStackPoisoner::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ for (int i = 0; i <= kMaxAsanStackMallocSizeClass; i++) {
+ std::string Suffix = itostr(i);
+ AsanStackMallocFunc[i] = M.getOrInsertFunction(
+ kAsanStackMallocNameTemplate + Suffix, IntptrTy, IntptrTy);
+ AsanStackFreeFunc[i] =
+ M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
+ IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+ if (ASan.UseAfterScope) {
+ AsanPoisonStackMemoryFunc = M.getOrInsertFunction(
+ kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanUnpoisonStackMemoryFunc = M.getOrInsertFunction(
+ kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+
+ for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
+ std::ostringstream Name;
+ Name << kAsanSetShadowPrefix;
+ Name << std::setw(2) << std::setfill('0') << std::hex << Val;
+ AsanSetShadowFunc[Val] =
+ M.getOrInsertFunction(Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy);
+ }
+
+ AsanAllocaPoisonFunc = M.getOrInsertFunction(
+ kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+ AsanAllocasUnpoisonFunc = M.getOrInsertFunction(
+ kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy);
+}
+
+void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB,
+ Value *ShadowBase) {
+ if (Begin >= End)
+ return;
+
+ const size_t LargestStoreSizeInBytes =
+ std::min<size_t>(sizeof(uint64_t), ASan.LongSize / 8);
+
+ const bool IsLittleEndian = F.getParent()->getDataLayout().isLittleEndian();
+
+ // Poison the given range in shadow using the largest store size, skipping the
+ // leading and trailing zeros in ShadowMask. Zeros never change, so they need
+ // neither poisoning nor unpoisoning. Still, we don't mind if some of them end
+ // up in the middle of a store.
+ for (size_t i = Begin; i < End;) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ ++i;
+ continue;
+ }
+
+ size_t StoreSizeInBytes = LargestStoreSizeInBytes;
+ // Fit store size into the range.
+ while (StoreSizeInBytes > End - i)
+ StoreSizeInBytes /= 2;
+
+ // Minimize store size by trimming trailing zeros.
+ for (size_t j = StoreSizeInBytes - 1; j && !ShadowMask[i + j]; --j) {
+ while (j <= StoreSizeInBytes / 2)
+ StoreSizeInBytes /= 2;
+ }
+
+ uint64_t Val = 0;
+ for (size_t j = 0; j < StoreSizeInBytes; j++) {
+ if (IsLittleEndian)
+ Val |= (uint64_t)ShadowBytes[i + j] << (8 * j);
+ else
+ Val = (Val << 8) | ShadowBytes[i + j];
+ }
+
+ Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
+ Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
+ IRB.CreateAlignedStore(
+ Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()),
+ Align(1));
+
+ i += StoreSizeInBytes;
+ }
+}
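+
+// Worked example (editorial sketch): with ShadowMask = {1,1,0,1} and
+// ShadowBytes = {0xf1, 0xf1, 0x00, 0xf3} over the range [0, 4) on a
+// little-endian 64-bit target, the loop emits a single unaligned 4-byte store
+// of 0xf300f1f1 at ShadowBase + 0; the zero byte in the middle is harmlessly
+// rewritten with the same value.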
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ copyToShadow(ShadowMask, ShadowBytes, 0, ShadowMask.size(), IRB, ShadowBase);
+}
+
+void FunctionStackPoisoner::copyToShadow(ArrayRef<uint8_t> ShadowMask,
+ ArrayRef<uint8_t> ShadowBytes,
+ size_t Begin, size_t End,
+ IRBuilder<> &IRB, Value *ShadowBase) {
+ assert(ShadowMask.size() == ShadowBytes.size());
+ size_t Done = Begin;
+ for (size_t i = Begin, j = Begin + 1; i < End; i = j++) {
+ if (!ShadowMask[i]) {
+ assert(!ShadowBytes[i]);
+ continue;
+ }
+ uint8_t Val = ShadowBytes[i];
+ if (!AsanSetShadowFunc[Val])
+ continue;
+
+ // Skip same values.
+ for (; j < End && ShadowMask[j] && Val == ShadowBytes[j]; ++j) {
+ }
+
+ if (j - i >= ClMaxInlinePoisoningSize) {
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, i, IRB, ShadowBase);
+ IRB.CreateCall(AsanSetShadowFunc[Val],
+ {IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i)),
+ ConstantInt::get(IntptrTy, j - i)});
+ Done = j;
+ }
+ }
+
+ copyToShadowInline(ShadowMask, ShadowBytes, Done, End, IRB, ShadowBase);
+}
+
+// Fake stack allocator (asan_fake_stack.h) has 11 size classes
+// for every power of 2 from kMinStackMallocSize to kMaxAsanStackMallocSizeClass
+static int StackMallocSizeClass(uint64_t LocalStackSize) {
+ assert(LocalStackSize <= kMaxStackMallocSize);
+ uint64_t MaxSize = kMinStackMallocSize;
+ for (int i = 0;; i++, MaxSize *= 2)
+ if (LocalStackSize <= MaxSize) return i;
+ llvm_unreachable("impossible LocalStackSize");
+}
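+
+// Worked example (editorial sketch, assuming kMinStackMallocSize is 64 bytes):
+//   LocalStackSize =   64 -> class 0  (MaxSize   64)
+//   LocalStackSize =   96 -> class 1  (MaxSize  128)
+//   LocalStackSize = 4096 -> class 6  (MaxSize 4096)
+// i.e. the class index is the number of doublings of the minimum size needed
+// to cover the requested frame.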
+
+void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
+ Instruction *CopyInsertPoint = &F.front().front();
+ if (CopyInsertPoint == ASan.LocalDynamicShadow) {
+ // Insert after the dynamic shadow location is determined
+ CopyInsertPoint = CopyInsertPoint->getNextNode();
+ assert(CopyInsertPoint);
+ }
+ IRBuilder<> IRB(CopyInsertPoint);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr()) {
+ Type *Ty = Arg.getParamByValType();
+ const Align Alignment =
+ DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty);
+
+ AllocaInst *AI = IRB.CreateAlloca(
+ Ty, nullptr,
+ (Arg.hasName() ? Arg.getName() : "Arg" + Twine(Arg.getArgNo())) +
+ ".byval");
+ AI->setAlignment(Alignment);
+ Arg.replaceAllUsesWith(AI);
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty);
+ IRB.CreateMemCpy(AI, Alignment, &Arg, Alignment, AllocSize);
+ }
+ }
+}
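+
+// Rough before/after of the rewrite above (editorial sketch; %p.byval is an
+// illustrative name):
+//   before:  define void @f(%struct.S* byval(%struct.S) align 8 %p) - uses %p
+//   after:   %p.byval = alloca %struct.S, align 8
+//            memcpy(%p.byval, %p, sizeof(%struct.S))   ; via IRB.CreateMemCpy
+//            all former uses of %p now refer to %p.byval
+// so the argument's bytes live in an alloca that the stack poisoner can wrap
+// in redzones.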
+
+PHINode *FunctionStackPoisoner::createPHI(IRBuilder<> &IRB, Value *Cond,
+ Value *ValueIfTrue,
+ Instruction *ThenTerm,
+ Value *ValueIfFalse) {
+ PHINode *PHI = IRB.CreatePHI(IntptrTy, 2);
+ BasicBlock *CondBlock = cast<Instruction>(Cond)->getParent();
+ PHI->addIncoming(ValueIfFalse, CondBlock);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ PHI->addIncoming(ValueIfTrue, ThenBlock);
+ return PHI;
+}
+
+Value *FunctionStackPoisoner::createAllocaForLayout(
+ IRBuilder<> &IRB, const ASanStackFrameLayout &L, bool Dynamic) {
+ AllocaInst *Alloca;
+ if (Dynamic) {
+ Alloca = IRB.CreateAlloca(IRB.getInt8Ty(),
+ ConstantInt::get(IRB.getInt64Ty(), L.FrameSize),
+ "MyAlloca");
+ } else {
+ Alloca = IRB.CreateAlloca(ArrayType::get(IRB.getInt8Ty(), L.FrameSize),
+ nullptr, "MyAlloca");
+ assert(Alloca->isStaticAlloca());
+ }
+ assert((ClRealignStack & (ClRealignStack - 1)) == 0);
+ size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
+ Alloca->setAlignment(Align(FrameAlignment));
+ return IRB.CreatePointerCast(Alloca, IntptrTy);
+}
+
+void FunctionStackPoisoner::createDynamicAllocasInitStorage() {
+ BasicBlock &FirstBB = *F.begin();
+ IRBuilder<> IRB(dyn_cast<Instruction>(FirstBB.begin()));
+ DynamicAllocaLayout = IRB.CreateAlloca(IntptrTy, nullptr);
+ IRB.CreateStore(Constant::getNullValue(IntptrTy), DynamicAllocaLayout);
+ DynamicAllocaLayout->setAlignment(Align(32));
+}
+
+void FunctionStackPoisoner::processDynamicAllocas() {
+ if (!ClInstrumentDynamicAllocas || DynamicAllocaVec.empty()) {
+ assert(DynamicAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ // Insert poison calls for lifetime intrinsics for dynamic allocas.
+ for (const auto &APC : DynamicAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(!APC.AI->isStaticAlloca());
+
+ IRBuilder<> IRB(APC.InsBefore);
+ poisonAlloca(APC.AI, APC.Size, IRB, APC.DoPoison);
+ // Dynamic allocas will be unpoisoned unconditionally below in
+ // unpoisonDynamicAllocas.
+ // Flag that we need to unpoison static allocas.
+ }
+
+ // Handle dynamic allocas.
+ createDynamicAllocasInitStorage();
+ for (auto &AI : DynamicAllocaVec)
+ handleDynamicAllocaCall(AI);
+ unpoisonDynamicAllocas();
+}
+
+/// Collect instructions in the entry block after \p InsBefore which initialize
+/// permanent storage for a function argument. These instructions must remain in
+/// the entry block so that uninitialized values do not appear in backtraces. An
+/// added benefit is that this conserves spill slots. This does not move stores
+/// before instrumented / "interesting" allocas.
+static void findStoresToUninstrumentedArgAllocas(
+ AddressSanitizer &ASan, Instruction &InsBefore,
+ SmallVectorImpl<Instruction *> &InitInsts) {
+ Instruction *Start = InsBefore.getNextNonDebugInstruction();
+ for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) {
+ // Argument initialization looks like:
+ // 1) store <Argument>, <Alloca> OR
+ // 2) <CastArgument> = cast <Argument> to ...
+ // store <CastArgument> to <Alloca>
+ // Do not consider any other kind of instruction.
+ //
+ // Note: This covers all known cases, but may not be exhaustive. An
+ // alternative to pattern-matching stores is to DFS over all Argument uses:
+ // this might be more general, but is probably much more complicated.
+ if (isa<AllocaInst>(It) || isa<CastInst>(It))
+ continue;
+ if (auto *Store = dyn_cast<StoreInst>(It)) {
+ // The store destination must be an alloca that isn't interesting for
+ // ASan to instrument. These are moved up before InsBefore, and they're
+ // not interesting because allocas for arguments can be mem2reg'd.
+ auto *Alloca = dyn_cast<AllocaInst>(Store->getPointerOperand());
+ if (!Alloca || ASan.isInterestingAlloca(*Alloca))
+ continue;
+
+ Value *Val = Store->getValueOperand();
+ bool IsDirectArgInit = isa<Argument>(Val);
+ bool IsArgInitViaCast =
+ isa<CastInst>(Val) &&
+ isa<Argument>(cast<CastInst>(Val)->getOperand(0)) &&
+ // Check that the cast appears directly before the store. Otherwise
+ // moving the cast before InsBefore may break the IR.
+ Val == It->getPrevNonDebugInstruction();
+ bool IsArgInit = IsDirectArgInit || IsArgInitViaCast;
+ if (!IsArgInit)
+ continue;
+
+ if (IsArgInitViaCast)
+ InitInsts.push_back(cast<Instruction>(Val));
+ InitInsts.push_back(Store);
+ continue;
+ }
+
+ // Do not reorder past unknown instructions: argument initialization should
+ // only involve casts and stores.
+ return;
+ }
+}
+
+void FunctionStackPoisoner::processStaticAllocas() {
+ if (AllocaVec.empty()) {
+ assert(StaticAllocaPoisonCallVec.empty());
+ return;
+ }
+
+ int StackMallocIdx = -1;
+ DebugLoc EntryDebugLocation;
+ if (auto SP = F.getSubprogram())
EntryDebugLocation =
DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
-
- Instruction *InsBefore = AllocaVec[0];
- IRBuilder<> IRB(InsBefore);
-
- // Make sure non-instrumented allocas stay in the entry block. Otherwise,
- // debug info is broken, because only entry-block allocas are treated as
- // regular stack slots.
- auto InsBeforeB = InsBefore->getParent();
- assert(InsBeforeB == &F.getEntryBlock());
- for (auto *AI : StaticAllocasToMoveUp)
- if (AI->getParent() == InsBeforeB)
- AI->moveBefore(InsBefore);
-
- // Move stores of arguments into entry-block allocas as well. This prevents
- // extra stack slots from being generated (to house the argument values until
- // they can be stored into the allocas). This also prevents uninitialized
- // values from being shown in backtraces.
- SmallVector<Instruction *, 8> ArgInitInsts;
- findStoresToUninstrumentedArgAllocas(ASan, *InsBefore, ArgInitInsts);
- for (Instruction *ArgInitInst : ArgInitInsts)
- ArgInitInst->moveBefore(InsBefore);
-
- // If we have a call to llvm.localescape, keep it in the entry block.
- if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
-
- SmallVector<ASanStackVariableDescription, 16> SVD;
- SVD.reserve(AllocaVec.size());
- for (AllocaInst *AI : AllocaVec) {
- ASanStackVariableDescription D = {AI->getName().data(),
- ASan.getAllocaSizeInBytes(*AI),
- 0,
- AI->getAlignment(),
- AI,
- 0,
- 0};
- SVD.push_back(D);
- }
-
- // Minimal header size (left redzone) is 4 pointers,
- // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
- size_t Granularity = 1ULL << Mapping.Scale;
- size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
- const ASanStackFrameLayout &L =
- ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
-
- // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
- DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
- for (auto &Desc : SVD)
- AllocaToSVDMap[Desc.AI] = &Desc;
-
- // Update SVD with information from lifetime intrinsics.
- for (const auto &APC : StaticAllocaPoisonCallVec) {
- assert(APC.InsBefore);
- assert(APC.AI);
- assert(ASan.isInterestingAlloca(*APC.AI));
- assert(APC.AI->isStaticAlloca());
-
- ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
- Desc.LifetimeSize = Desc.Size;
- if (const DILocation *FnLoc = EntryDebugLocation.get()) {
- if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
- if (LifetimeLoc->getFile() == FnLoc->getFile())
- if (unsigned Line = LifetimeLoc->getLine())
- Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
- }
- }
- }
-
- auto DescriptionString = ComputeASanStackFrameDescription(SVD);
- LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
- uint64_t LocalStackSize = L.FrameSize;
- bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
- LocalStackSize <= kMaxStackMallocSize;
- bool DoDynamicAlloca = ClDynamicAllocaStack;
- // Don't do dynamic alloca or stack malloc if:
- // 1) There is inline asm: too often it makes assumptions on which registers
- // are available.
- // 2) There is a returns_twice call (typically setjmp), which is
- // optimization-hostile, and doesn't play well with introduced indirect
- // register-relative calculation of local variable addresses.
- DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
- DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
-
- Value *StaticAlloca =
- DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
-
- Value *FakeStack;
- Value *LocalStackBase;
- Value *LocalStackBaseAlloca;
- uint8_t DIExprFlags = DIExpression::ApplyOffset;
-
- if (DoStackMalloc) {
- LocalStackBaseAlloca =
- IRB.CreateAlloca(IntptrTy, nullptr, "asan_local_stack_base");
- // void *FakeStack = __asan_option_detect_stack_use_after_return
- // ? __asan_stack_malloc_N(LocalStackSize)
- // : nullptr;
- // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
- Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
- kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
- Value *UseAfterReturnIsEnabled = IRB.CreateICmpNE(
- IRB.CreateLoad(IRB.getInt32Ty(), OptionDetectUseAfterReturn),
- Constant::getNullValue(IRB.getInt32Ty()));
- Instruction *Term =
- SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
- IRBuilder<> IRBIf(Term);
- StackMallocIdx = StackMallocSizeClass(LocalStackSize);
- assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
- Value *FakeStackValue =
- IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
- ConstantInt::get(IntptrTy, LocalStackSize));
- IRB.SetInsertPoint(InsBefore);
- FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
- ConstantInt::get(IntptrTy, 0));
-
- Value *NoFakeStack =
- IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
- Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
- IRBIf.SetInsertPoint(Term);
- Value *AllocaValue =
- DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
-
- IRB.SetInsertPoint(InsBefore);
- LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
- IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
- DIExprFlags |= DIExpression::DerefBefore;
- } else {
- // void *FakeStack = nullptr;
- // void *LocalStackBase = alloca(LocalStackSize);
- FakeStack = ConstantInt::get(IntptrTy, 0);
- LocalStackBase =
- DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
- LocalStackBaseAlloca = LocalStackBase;
- }
-
- // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
- // dbg.declare address operand, but passing a `ptrtoint` seems to confuse
- // later passes and can result in dropped variable coverage in debug info.
- Value *LocalStackBaseAllocaPtr =
- isa<PtrToIntInst>(LocalStackBaseAlloca)
- ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
- : LocalStackBaseAlloca;
- assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
- "Variable descriptions relative to ASan stack base will be dropped");
-
- // Replace Alloca instructions with base+offset.
- for (const auto &Desc : SVD) {
- AllocaInst *AI = Desc.AI;
- replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
- Desc.Offset);
- Value *NewAllocaPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
- AI->getType());
- AI->replaceAllUsesWith(NewAllocaPtr);
- }
-
- // The left-most redzone has enough space for at least 4 pointers.
- // Write the Magic value to redzone[0].
- Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
- IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
- BasePlus0);
- // Write the frame description constant to redzone[1].
- Value *BasePlus1 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
- IntptrPtrTy);
- GlobalVariable *StackDescriptionGlobal =
- createPrivateGlobalForString(*F.getParent(), DescriptionString,
- /*AllowMerging*/ true, kAsanGenPrefix);
- Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
- IRB.CreateStore(Description, BasePlus1);
- // Write the PC to redzone[2].
- Value *BasePlus2 = IRB.CreateIntToPtr(
- IRB.CreateAdd(LocalStackBase,
- ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
- IntptrPtrTy);
- IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
-
- const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
-
- // Poison the stack red zones at the entry.
- Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
- // As the mask we must use the most-poisoned case: red zones plus after-scope
- // poisoning. As the bytes we can use either the same or just the red zones.
- copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
-
- if (!StaticAllocaPoisonCallVec.empty()) {
- const auto &ShadowInScope = GetShadowBytes(SVD, L);
-
- // Poison static allocas near lifetime intrinsics.
- for (const auto &APC : StaticAllocaPoisonCallVec) {
- const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
- assert(Desc.Offset % L.Granularity == 0);
- size_t Begin = Desc.Offset / L.Granularity;
- size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
-
- IRBuilder<> IRB(APC.InsBefore);
- copyToShadow(ShadowAfterScope,
- APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
- IRB, ShadowBase);
- }
- }
-
- SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
- SmallVector<uint8_t, 64> ShadowAfterReturn;
-
- // (Un)poison the stack before all ret instructions.
+
+ Instruction *InsBefore = AllocaVec[0];
+ IRBuilder<> IRB(InsBefore);
+
+ // Make sure non-instrumented allocas stay in the entry block. Otherwise,
+ // debug info is broken, because only entry-block allocas are treated as
+ // regular stack slots.
+ auto InsBeforeB = InsBefore->getParent();
+ assert(InsBeforeB == &F.getEntryBlock());
+ for (auto *AI : StaticAllocasToMoveUp)
+ if (AI->getParent() == InsBeforeB)
+ AI->moveBefore(InsBefore);
+
+ // Move stores of arguments into entry-block allocas as well. This prevents
+ // extra stack slots from being generated (to house the argument values until
+ // they can be stored into the allocas). This also prevents uninitialized
+ // values from being shown in backtraces.
+ SmallVector<Instruction *, 8> ArgInitInsts;
+ findStoresToUninstrumentedArgAllocas(ASan, *InsBefore, ArgInitInsts);
+ for (Instruction *ArgInitInst : ArgInitInsts)
+ ArgInitInst->moveBefore(InsBefore);
+
+ // If we have a call to llvm.localescape, keep it in the entry block.
+ if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
+
+ SmallVector<ASanStackVariableDescription, 16> SVD;
+ SVD.reserve(AllocaVec.size());
+ for (AllocaInst *AI : AllocaVec) {
+ ASanStackVariableDescription D = {AI->getName().data(),
+ ASan.getAllocaSizeInBytes(*AI),
+ 0,
+ AI->getAlignment(),
+ AI,
+ 0,
+ 0};
+ SVD.push_back(D);
+ }
+
+ // Minimal header size (left redzone) is 4 pointers,
+ // i.e. 32 bytes on 64-bit platforms and 16 bytes on 32-bit platforms.
+ size_t Granularity = 1ULL << Mapping.Scale;
+ size_t MinHeaderSize = std::max((size_t)ASan.LongSize / 2, Granularity);
+ const ASanStackFrameLayout &L =
+ ComputeASanStackFrameLayout(SVD, Granularity, MinHeaderSize);
+
+ // Build AllocaToSVDMap for ASanStackVariableDescription lookup.
+ DenseMap<const AllocaInst *, ASanStackVariableDescription *> AllocaToSVDMap;
+ for (auto &Desc : SVD)
+ AllocaToSVDMap[Desc.AI] = &Desc;
+
+ // Update SVD with information from lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ assert(APC.InsBefore);
+ assert(APC.AI);
+ assert(ASan.isInterestingAlloca(*APC.AI));
+ assert(APC.AI->isStaticAlloca());
+
+ ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ Desc.LifetimeSize = Desc.Size;
+ if (const DILocation *FnLoc = EntryDebugLocation.get()) {
+ if (const DILocation *LifetimeLoc = APC.InsBefore->getDebugLoc().get()) {
+ if (LifetimeLoc->getFile() == FnLoc->getFile())
+ if (unsigned Line = LifetimeLoc->getLine())
+ Desc.Line = std::min(Desc.Line ? Desc.Line : Line, Line);
+ }
+ }
+ }
+
+ auto DescriptionString = ComputeASanStackFrameDescription(SVD);
+ LLVM_DEBUG(dbgs() << DescriptionString << " --- " << L.FrameSize << "\n");
+ uint64_t LocalStackSize = L.FrameSize;
+ bool DoStackMalloc = ClUseAfterReturn && !ASan.CompileKernel &&
+ LocalStackSize <= kMaxStackMallocSize;
+ bool DoDynamicAlloca = ClDynamicAllocaStack;
+ // Don't do dynamic alloca or stack malloc if:
+ // 1) There is inline asm: too often it makes assumptions on which registers
+ // are available.
+ // 2) There is a returns_twice call (typically setjmp), which is
+ // optimization-hostile, and doesn't play well with introduced indirect
+ // register-relative calculation of local variable addresses.
+ DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
+ DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
+
+ Value *StaticAlloca =
+ DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
+
+ Value *FakeStack;
+ Value *LocalStackBase;
+ Value *LocalStackBaseAlloca;
+ uint8_t DIExprFlags = DIExpression::ApplyOffset;
+
+ if (DoStackMalloc) {
+ LocalStackBaseAlloca =
+ IRB.CreateAlloca(IntptrTy, nullptr, "asan_local_stack_base");
+ // void *FakeStack = __asan_option_detect_stack_use_after_return
+ // ? __asan_stack_malloc_N(LocalStackSize)
+ // : nullptr;
+ // void *LocalStackBase = (FakeStack) ? FakeStack : alloca(LocalStackSize);
+ Constant *OptionDetectUseAfterReturn = F.getParent()->getOrInsertGlobal(
+ kAsanOptionDetectUseAfterReturn, IRB.getInt32Ty());
+ Value *UseAfterReturnIsEnabled = IRB.CreateICmpNE(
+ IRB.CreateLoad(IRB.getInt32Ty(), OptionDetectUseAfterReturn),
+ Constant::getNullValue(IRB.getInt32Ty()));
+ Instruction *Term =
+ SplitBlockAndInsertIfThen(UseAfterReturnIsEnabled, InsBefore, false);
+ IRBuilder<> IRBIf(Term);
+ StackMallocIdx = StackMallocSizeClass(LocalStackSize);
+ assert(StackMallocIdx <= kMaxAsanStackMallocSizeClass);
+ Value *FakeStackValue =
+ IRBIf.CreateCall(AsanStackMallocFunc[StackMallocIdx],
+ ConstantInt::get(IntptrTy, LocalStackSize));
+ IRB.SetInsertPoint(InsBefore);
+ FakeStack = createPHI(IRB, UseAfterReturnIsEnabled, FakeStackValue, Term,
+ ConstantInt::get(IntptrTy, 0));
+
+ Value *NoFakeStack =
+ IRB.CreateICmpEQ(FakeStack, Constant::getNullValue(IntptrTy));
+ Term = SplitBlockAndInsertIfThen(NoFakeStack, InsBefore, false);
+ IRBIf.SetInsertPoint(Term);
+ Value *AllocaValue =
+ DoDynamicAlloca ? createAllocaForLayout(IRBIf, L, true) : StaticAlloca;
+
+ IRB.SetInsertPoint(InsBefore);
+ LocalStackBase = createPHI(IRB, NoFakeStack, AllocaValue, Term, FakeStack);
+ IRB.CreateStore(LocalStackBase, LocalStackBaseAlloca);
+ DIExprFlags |= DIExpression::DerefBefore;
+ } else {
+ // void *FakeStack = nullptr;
+ // void *LocalStackBase = alloca(LocalStackSize);
+ FakeStack = ConstantInt::get(IntptrTy, 0);
+ LocalStackBase =
+ DoDynamicAlloca ? createAllocaForLayout(IRB, L, true) : StaticAlloca;
+ LocalStackBaseAlloca = LocalStackBase;
+ }
+
+ // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
+ // dbg.declare address operand, but passing a `ptrtoint` seems to confuse
+ // later passes and can result in dropped variable coverage in debug info.
+ Value *LocalStackBaseAllocaPtr =
+ isa<PtrToIntInst>(LocalStackBaseAlloca)
+ ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
+ : LocalStackBaseAlloca;
+ assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
+ "Variable descriptions relative to ASan stack base will be dropped");
+
+ // Replace Alloca instructions with base+offset.
+ for (const auto &Desc : SVD) {
+ AllocaInst *AI = Desc.AI;
+ replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
+ Desc.Offset);
+ Value *NewAllocaPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
+ AI->getType());
+ AI->replaceAllUsesWith(NewAllocaPtr);
+ }
+
+ // The left-most redzone has enough space for at least 4 pointers.
+ // Write the Magic value to redzone[0].
+ Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy);
+ IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic),
+ BasePlus0);
+ // Write the frame description constant to redzone[1].
+ Value *BasePlus1 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, ASan.LongSize / 8)),
+ IntptrPtrTy);
+ GlobalVariable *StackDescriptionGlobal =
+ createPrivateGlobalForString(*F.getParent(), DescriptionString,
+ /*AllowMerging*/ true, kAsanGenPrefix);
+ Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy);
+ IRB.CreateStore(Description, BasePlus1);
+ // Write the PC to redzone[2].
+ Value *BasePlus2 = IRB.CreateIntToPtr(
+ IRB.CreateAdd(LocalStackBase,
+ ConstantInt::get(IntptrTy, 2 * ASan.LongSize / 8)),
+ IntptrPtrTy);
+ IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2);
+
+ const auto &ShadowAfterScope = GetShadowBytesAfterScope(SVD, L);
+
+ // Poison the stack red zones at the entry.
+ Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB);
+ // As the mask we must use the most-poisoned case: red zones plus after-scope
+ // poisoning. As the bytes we can use either the same or just the red zones.
+ copyToShadow(ShadowAfterScope, ShadowAfterScope, IRB, ShadowBase);
+
+ if (!StaticAllocaPoisonCallVec.empty()) {
+ const auto &ShadowInScope = GetShadowBytes(SVD, L);
+
+ // Poison static allocas near lifetime intrinsics.
+ for (const auto &APC : StaticAllocaPoisonCallVec) {
+ const ASanStackVariableDescription &Desc = *AllocaToSVDMap[APC.AI];
+ assert(Desc.Offset % L.Granularity == 0);
+ size_t Begin = Desc.Offset / L.Granularity;
+ size_t End = Begin + (APC.Size + L.Granularity - 1) / L.Granularity;
+
+ IRBuilder<> IRB(APC.InsBefore);
+ copyToShadow(ShadowAfterScope,
+ APC.DoPoison ? ShadowAfterScope : ShadowInScope, Begin, End,
+ IRB, ShadowBase);
+ }
+ }
+
+ SmallVector<uint8_t, 64> ShadowClean(ShadowAfterScope.size(), 0);
+ SmallVector<uint8_t, 64> ShadowAfterReturn;
+
+ // (Un)poison the stack before all ret instructions.
for (Instruction *Ret : RetVec) {
- IRBuilder<> IRBRet(Ret);
- // Mark the current frame as retired.
- IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
- BasePlus0);
- if (DoStackMalloc) {
- assert(StackMallocIdx >= 0);
- // if FakeStack != 0 // LocalStackBase == FakeStack
- // // In use-after-return mode, poison the whole stack frame.
- // if StackMallocIdx <= 4
- // // For small sizes inline the whole thing:
- // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
- // **SavedFlagPtr(FakeStack) = 0
- // else
- // __asan_stack_free_N(FakeStack, LocalStackSize)
- // else
- // <This is not a fake stack; unpoison the redzones>
- Value *Cmp =
- IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
- Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
-
- IRBuilder<> IRBPoison(ThenTerm);
- if (StackMallocIdx <= 4) {
- int ClassSize = kMinStackMallocSize << StackMallocIdx;
- ShadowAfterReturn.resize(ClassSize / L.Granularity,
- kAsanStackUseAfterReturnMagic);
- copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
- ShadowBase);
- Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
- FakeStack,
- ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
- Value *SavedFlagPtr = IRBPoison.CreateLoad(
- IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
- IRBPoison.CreateStore(
- Constant::getNullValue(IRBPoison.getInt8Ty()),
- IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
- } else {
- // For larger frames call __asan_stack_free_*.
- IRBPoison.CreateCall(
- AsanStackFreeFunc[StackMallocIdx],
- {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
- }
-
- IRBuilder<> IRBElse(ElseTerm);
- copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
- } else {
- copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
- }
- }
-
- // We are done. Remove the old unused alloca instructions.
- for (auto AI : AllocaVec) AI->eraseFromParent();
-}
-
-void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
- IRBuilder<> &IRB, bool DoPoison) {
- // For now just insert the call to ASan runtime.
- Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
- Value *SizeArg = ConstantInt::get(IntptrTy, Size);
- IRB.CreateCall(
- DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
- {AddrArg, SizeArg});
-}
-
-// Handling llvm.lifetime intrinsics for a given %alloca:
-// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
-// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
-// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
-// could be poisoned by previous llvm.lifetime.end instruction, as the
-// variable may go in and out of scope several times, e.g. in loops).
-// (3) if we poisoned at least one %alloca in a function,
-// unpoison the whole stack frame at function exit.
-void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
- IRBuilder<> IRB(AI);
-
- const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
- const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
-
- Value *Zero = Constant::getNullValue(IntptrTy);
- Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
- Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
-
- // Since we need to extend the alloca with additional memory to place the
- // redzones, and OldSize is the number of allocated elements of ElementSize
- // bytes each, compute the allocated memory size in bytes as
- // OldSize * ElementSize.
- const unsigned ElementSize =
- F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
- Value *OldSize =
- IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
- ConstantInt::get(IntptrTy, ElementSize));
-
- // PartialSize = OldSize % 32
- Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
-
- // Misalign = kAllocaRzSize - PartialSize;
- Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
-
- // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
- Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
- Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
-
- // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
- // Alignment is added for the left redzone, PartialPadding for a possible
- // partial redzone, and kAllocaRzSize for the right redzone.
- Value *AdditionalChunkSize = IRB.CreateAdd(
- ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
-
- Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
-
- // Insert new alloca with new NewSize and Alignment params.
- AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
- NewAlloca->setAlignment(Align(Alignment));
-
- // NewAddress = Address + Alignment
- Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
- ConstantInt::get(IntptrTy, Alignment));
-
- // Insert an __asan_alloca_poison call for the newly created alloca.
- IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
-
- // Store the last alloca's address to DynamicAllocaLayout. We'll need it
- // later to unpoison the dynamic allocas.
- IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout);
-
- Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
-
- // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
- AI->replaceAllUsesWith(NewAddressPtr);
-
- // We are done. Erase old alloca from parent.
- AI->eraseFromParent();
-}
-
-// isSafeAccess returns true if Addr is always inbounds with respect to its
-// base object. For example, it is a field access or an array access with
-// constant inbounds index.
-bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
- Value *Addr, uint64_t TypeSize) const {
- SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
- if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
- uint64_t Size = SizeOffset.first.getZExtValue();
- int64_t Offset = SizeOffset.second.getSExtValue();
- // Three checks are required to ensure safety:
- // . Offset >= 0 (since the offset is given from the base ptr)
- // . Size >= Offset (unsigned)
- // . Size - Offset >= NeededSize (unsigned)
- return Offset >= 0 && Size >= uint64_t(Offset) &&
- Size - uint64_t(Offset) >= TypeSize / 8;
-}
+ IRBuilder<> IRBRet(Ret);
+ // Mark the current frame as retired.
+ IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic),
+ BasePlus0);
+ if (DoStackMalloc) {
+ assert(StackMallocIdx >= 0);
+ // if FakeStack != 0 // LocalStackBase == FakeStack
+ // // In use-after-return mode, poison the whole stack frame.
+ // if StackMallocIdx <= 4
+ // // For small sizes inline the whole thing:
+ // memset(ShadowBase, kAsanStackAfterReturnMagic, ShadowSize);
+ // **SavedFlagPtr(FakeStack) = 0
+ // else
+ // __asan_stack_free_N(FakeStack, LocalStackSize)
+ // else
+ // <This is not a fake stack; unpoison the redzones>
+ Value *Cmp =
+ IRBRet.CreateICmpNE(FakeStack, Constant::getNullValue(IntptrTy));
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(Cmp, Ret, &ThenTerm, &ElseTerm);
+
+ IRBuilder<> IRBPoison(ThenTerm);
+ if (StackMallocIdx <= 4) {
+ int ClassSize = kMinStackMallocSize << StackMallocIdx;
+ ShadowAfterReturn.resize(ClassSize / L.Granularity,
+ kAsanStackUseAfterReturnMagic);
+ copyToShadow(ShadowAfterReturn, ShadowAfterReturn, IRBPoison,
+ ShadowBase);
+ Value *SavedFlagPtrPtr = IRBPoison.CreateAdd(
+ FakeStack,
+ ConstantInt::get(IntptrTy, ClassSize - ASan.LongSize / 8));
+ Value *SavedFlagPtr = IRBPoison.CreateLoad(
+ IntptrTy, IRBPoison.CreateIntToPtr(SavedFlagPtrPtr, IntptrPtrTy));
+ IRBPoison.CreateStore(
+ Constant::getNullValue(IRBPoison.getInt8Ty()),
+ IRBPoison.CreateIntToPtr(SavedFlagPtr, IRBPoison.getInt8PtrTy()));
+ } else {
+ // For larger frames call __asan_stack_free_*.
+ IRBPoison.CreateCall(
+ AsanStackFreeFunc[StackMallocIdx],
+ {FakeStack, ConstantInt::get(IntptrTy, LocalStackSize)});
+ }
+
+ IRBuilder<> IRBElse(ElseTerm);
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBElse, ShadowBase);
+ } else {
+ copyToShadow(ShadowAfterScope, ShadowClean, IRBRet, ShadowBase);
+ }
+ }
+
+ // We are done. Remove the old unused alloca instructions.
+ for (auto AI : AllocaVec) AI->eraseFromParent();
+}
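+
+// Resulting frame layout (editorial sketch): starting at LocalStackBase the
+// left redzone holds at least four pointer-sized slots,
+//   slot 0: kCurrentStackFrameMagic (rewritten to kRetiredStackFrameMagic on
+//           every return)
+//   slot 1: pointer to the frame description string
+//   slot 2: address of the enclosing function
+// followed by the redzone-separated locals at the offsets recorded in SVD.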
+
+void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
+ IRBuilder<> &IRB, bool DoPoison) {
+ // For now just insert the call to ASan runtime.
+ Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+ Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+ IRB.CreateCall(
+ DoPoison ? AsanPoisonStackMemoryFunc : AsanUnpoisonStackMemoryFunc,
+ {AddrArg, SizeArg});
+}
+
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+// invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+// could be poisoned by previous llvm.lifetime.end instruction, as the
+// variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+// unpoison the whole stack frame at function exit.
+void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
+ IRBuilder<> IRB(AI);
+
+ const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
+ const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
+
+ Value *Zero = Constant::getNullValue(IntptrTy);
+ Value *AllocaRzSize = ConstantInt::get(IntptrTy, kAllocaRzSize);
+ Value *AllocaRzMask = ConstantInt::get(IntptrTy, AllocaRedzoneMask);
+
+ // Since we need to extend the alloca with additional memory to place the
+ // redzones, and OldSize is the number of allocated elements of ElementSize
+ // bytes each, compute the allocated memory size in bytes as
+ // OldSize * ElementSize.
+ const unsigned ElementSize =
+ F.getParent()->getDataLayout().getTypeAllocSize(AI->getAllocatedType());
+ Value *OldSize =
+ IRB.CreateMul(IRB.CreateIntCast(AI->getArraySize(), IntptrTy, false),
+ ConstantInt::get(IntptrTy, ElementSize));
+
+ // PartialSize = OldSize % 32
+ Value *PartialSize = IRB.CreateAnd(OldSize, AllocaRzMask);
+
+ // Misalign = kAllocaRzSize - PartialSize;
+ Value *Misalign = IRB.CreateSub(AllocaRzSize, PartialSize);
+
+ // PartialPadding = Misalign != kAllocaRzSize ? Misalign : 0;
+ Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
+ Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
+
+ // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
+ // Alignment is added for the left redzone, PartialPadding for a possible
+ // partial redzone, and kAllocaRzSize for the right redzone.
+ Value *AdditionalChunkSize = IRB.CreateAdd(
+ ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
+
+ Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
+
+ // Insert new alloca with new NewSize and Alignment params.
+ AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
+ NewAlloca->setAlignment(Align(Alignment));
+
+ // NewAddress = Address + Alignment
+ Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
+ ConstantInt::get(IntptrTy, Alignment));
+
+ // Insert an __asan_alloca_poison call for the newly created alloca.
+ IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
+
+ // Store the last alloca's address to DynamicAllocaLayout. We'll need it
+ // later to unpoison the dynamic allocas.
+ IRB.CreateStore(IRB.CreatePtrToInt(NewAlloca, IntptrTy), DynamicAllocaLayout);
+
+ Value *NewAddressPtr = IRB.CreateIntToPtr(NewAddress, AI->getType());
+
+ // Replace all uses of AddressReturnedByAlloca with NewAddressPtr.
+ AI->replaceAllUsesWith(NewAddressPtr);
+
+ // We are done. Erase old alloca from parent.
+ AI->eraseFromParent();
+}
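+
+// Worked example (editorial sketch, assuming kAllocaRzSize is 32): for a
+// dynamic alloca of 40 bytes with Alignment = 32,
+//   PartialSize = 40 & 31 = 8, Misalign = 32 - 8 = 24, PartialPadding = 24,
+//   AdditionalChunkSize = 32 + 32 + 24 = 88, NewSize = 128,
+// and the returned address is NewAlloca + 32, leaving a 32-byte left redzone,
+// a 24-byte partial redzone and a 32-byte right redzone around the 40 bytes.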
+
+// isSafeAccess returns true if Addr is always inbounds with respect to its
+// base object. For example, it is a field access or an array access with
+// constant inbounds index.
+bool AddressSanitizer::isSafeAccess(ObjectSizeOffsetVisitor &ObjSizeVis,
+ Value *Addr, uint64_t TypeSize) const {
+ SizeOffsetType SizeOffset = ObjSizeVis.compute(Addr);
+ if (!ObjSizeVis.bothKnown(SizeOffset)) return false;
+ uint64_t Size = SizeOffset.first.getZExtValue();
+ int64_t Offset = SizeOffset.second.getSExtValue();
+ // Three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ return Offset >= 0 && Size >= uint64_t(Offset) &&
+ Size - uint64_t(Offset) >= TypeSize / 8;
+}
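+
+// Worked example (editorial sketch): for a field access where the object size
+// is known to be 16 bytes, the constant offset is 4 and TypeSize is 64 bits
+// (so NeededSize = 8), the three checks become 4 >= 0, 16 >= 4 and
+// 16 - 4 >= 8, all true, so the access is statically in bounds and needs no
+// runtime check.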
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
index c2d9964ecc..efb11b68a1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -1,254 +1,254 @@
-//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetFolder.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "bounds-checking"
-
-static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
- cl::desc("Use one trap block per function"));
-
-STATISTIC(ChecksAdded, "Bounds checks added");
-STATISTIC(ChecksSkipped, "Bounds checks skipped");
-STATISTIC(ChecksUnable, "Bounds checks unable to add");
-
-using BuilderTy = IRBuilder<TargetFolder>;
-
-/// Gets the conditions under which memory accessing instructions will overflow.
-///
-/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
-/// the result from the load or the value being stored. It is used to determine
-/// the size of memory block that is touched.
-///
-/// Returns the condition under which the access will overflow.
-static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
- const DataLayout &DL, TargetLibraryInfo &TLI,
- ObjectSizeOffsetEvaluator &ObjSizeEval,
- BuilderTy &IRB, ScalarEvolution &SE) {
- uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
- LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
- << " bytes\n");
-
- SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
-
- if (!ObjSizeEval.bothKnown(SizeOffset)) {
- ++ChecksUnable;
- return nullptr;
- }
-
- Value *Size = SizeOffset.first;
- Value *Offset = SizeOffset.second;
- ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
-
- Type *IntTy = DL.getIntPtrType(Ptr->getType());
- Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
-
- auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
- auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
- auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
-
- // three checks are required to ensure safety:
- // . Offset >= 0 (since the offset is given from the base ptr)
- // . Size >= Offset (unsigned)
- // . Size - Offset >= NeededSize (unsigned)
- //
- // optimization: if Size >= 0 (signed), skip 1st check
- // FIXME: add NSW/NUW here? -- we dont care if the subtraction overflows
- Value *ObjSize = IRB.CreateSub(Size, Offset);
- Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
- ? ConstantInt::getFalse(Ptr->getContext())
- : IRB.CreateICmpULT(Size, Offset);
- Value *Cmp3 = SizeRange.sub(OffsetRange)
- .getUnsignedMin()
- .uge(NeededSizeRange.getUnsignedMax())
- ? ConstantInt::getFalse(Ptr->getContext())
- : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
- Value *Or = IRB.CreateOr(Cmp2, Cmp3);
- if ((!SizeCI || SizeCI->getValue().slt(0)) &&
- !SizeRange.getSignedMin().isNonNegative()) {
- Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
- Or = IRB.CreateOr(Cmp1, Or);
- }
-
- return Or;
-}
-
-/// Adds run-time bounds checks to memory accessing instructions.
-///
-/// \p Or is the condition that should guard the trap.
-///
-/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
-template <typename GetTrapBBT>
-static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
- // check if the comparison is always false
- ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
- if (C) {
- ++ChecksSkipped;
- // If the condition folds to constant false (zero), the check can never fire; nothing to do.
- if (!C->getZExtValue())
- return;
- }
- ++ChecksAdded;
-
- BasicBlock::iterator SplitI = IRB.GetInsertPoint();
- BasicBlock *OldBB = SplitI->getParent();
- BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
- OldBB->getTerminator()->eraseFromParent();
-
- if (C) {
- // If we have a constant zero, unconditionally branch.
- // FIXME: We should really handle this differently to bypass splitting
- // the block.
- BranchInst::Create(GetTrapBB(IRB), OldBB);
- return;
- }
-
- // Create the conditional branch.
- BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
-}
-
-static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
- ScalarEvolution &SE) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- ObjectSizeOpts EvalOpts;
- EvalOpts.RoundToAlign = true;
- ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(), EvalOpts);
-
- // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
- // touching instructions
- SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo;
- for (Instruction &I : instructions(F)) {
- Value *Or = nullptr;
- BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isVolatile())
- Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
- ObjSizeEval, IRB, SE);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isVolatile())
- Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
- if (!AI->isVolatile())
- Or =
- getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
- if (!AI->isVolatile())
- Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
- }
- if (Or)
- TrapInfo.push_back(std::make_pair(&I, Or));
- }
-
- // Create a trapping basic block on demand using a callback. Depending on
- // flags, this will either create a single block for the entire function or
- // will create a fresh block every time it is called.
- BasicBlock *TrapBB = nullptr;
- auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
- if (TrapBB && SingleTrapBB)
- return TrapBB;
-
- Function *Fn = IRB.GetInsertBlock()->getParent();
- // FIXME: This debug location doesn't make a lot of sense in the
- // `SingleTrapBB` case.
- auto DebugLoc = IRB.getCurrentDebugLocation();
- IRBuilder<>::InsertPointGuard Guard(IRB);
- TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
- IRB.SetInsertPoint(TrapBB);
-
- auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
- CallInst *TrapCall = IRB.CreateCall(F, {});
- TrapCall->setDoesNotReturn();
- TrapCall->setDoesNotThrow();
- TrapCall->setDebugLoc(DebugLoc);
- IRB.CreateUnreachable();
-
- return TrapBB;
- };
-
- // Add the checks.
- for (const auto &Entry : TrapInfo) {
- Instruction *Inst = Entry.first;
- BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
- insertBoundsCheck(Entry.second, IRB, GetTrapBB);
- }
-
- return !TrapInfo.empty();
-}
-
-PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
-
- if (!addBoundsChecking(F, TLI, SE))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-namespace {
-struct BoundsCheckingLegacyPass : public FunctionPass {
- static char ID;
-
- BoundsCheckingLegacyPass() : FunctionPass(ID) {
- initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- return addBoundsChecking(F, TLI, SE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-};
-} // namespace
-
-char BoundsCheckingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
- "Run-time bounds checking", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
- "Run-time bounds checking", false, false)
-
-FunctionPass *llvm::createBoundsCheckingLegacyPass() {
- return new BoundsCheckingLegacyPass();
-}
+//===- BoundsChecking.cpp - Instrumentation for run-time bounds checking --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/BoundsChecking.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bounds-checking"
+
+static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
+ cl::desc("Use one trap block per function"));
+
+STATISTIC(ChecksAdded, "Bounds checks added");
+STATISTIC(ChecksSkipped, "Bounds checks skipped");
+STATISTIC(ChecksUnable, "Bounds checks unable to add");
+
+using BuilderTy = IRBuilder<TargetFolder>;
+
+/// Gets the conditions under which memory accessing instructions will overflow.
+///
+/// \p Ptr is the pointer that will be read/written, and \p InstVal is either
+/// the result from the load or the value being stored. It is used to determine
+/// the size of memory block that is touched.
+///
+/// Returns the condition under which the access will overflow.
+static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
+ const DataLayout &DL, TargetLibraryInfo &TLI,
+ ObjectSizeOffsetEvaluator &ObjSizeEval,
+ BuilderTy &IRB, ScalarEvolution &SE) {
+ uint64_t NeededSize = DL.getTypeStoreSize(InstVal->getType());
+ LLVM_DEBUG(dbgs() << "Instrument " << *Ptr << " for " << Twine(NeededSize)
+ << " bytes\n");
+
+ SizeOffsetEvalType SizeOffset = ObjSizeEval.compute(Ptr);
+
+ if (!ObjSizeEval.bothKnown(SizeOffset)) {
+ ++ChecksUnable;
+ return nullptr;
+ }
+
+ Value *Size = SizeOffset.first;
+ Value *Offset = SizeOffset.second;
+ ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size);
+
+ Type *IntTy = DL.getIntPtrType(Ptr->getType());
+ Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize);
+
+ auto SizeRange = SE.getUnsignedRange(SE.getSCEV(Size));
+ auto OffsetRange = SE.getUnsignedRange(SE.getSCEV(Offset));
+ auto NeededSizeRange = SE.getUnsignedRange(SE.getSCEV(NeededSizeVal));
+
+ // three checks are required to ensure safety:
+ // . Offset >= 0 (since the offset is given from the base ptr)
+ // . Size >= Offset (unsigned)
+ // . Size - Offset >= NeededSize (unsigned)
+ //
+ // optimization: if Size >= 0 (signed), skip 1st check
+ // FIXME: add NSW/NUW here? -- we don't care if the subtraction overflows
+ Value *ObjSize = IRB.CreateSub(Size, Offset);
+ Value *Cmp2 = SizeRange.getUnsignedMin().uge(OffsetRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(Size, Offset);
+ Value *Cmp3 = SizeRange.sub(OffsetRange)
+ .getUnsignedMin()
+ .uge(NeededSizeRange.getUnsignedMax())
+ ? ConstantInt::getFalse(Ptr->getContext())
+ : IRB.CreateICmpULT(ObjSize, NeededSizeVal);
+ Value *Or = IRB.CreateOr(Cmp2, Cmp3);
+ if ((!SizeCI || SizeCI->getValue().slt(0)) &&
+ !SizeRange.getSignedMin().isNonNegative()) {
+ Value *Cmp1 = IRB.CreateICmpSLT(Offset, ConstantInt::get(IntTy, 0));
+ Or = IRB.CreateOr(Cmp1, Or);
+ }
+
+ return Or;
+}
+
+/// Adds run-time bounds checks to memory accessing instructions.
+///
+/// \p Or is the condition that should guard the trap.
+///
+/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
+template <typename GetTrapBBT>
+static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
+ // check if the comparison is always false
+ ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
+ if (C) {
+ ++ChecksSkipped;
+ // If the condition folds to constant false (zero), the check can never fire; nothing to do.
+ if (!C->getZExtValue())
+ return;
+ }
+ ++ChecksAdded;
+
+ BasicBlock::iterator SplitI = IRB.GetInsertPoint();
+ BasicBlock *OldBB = SplitI->getParent();
+ BasicBlock *Cont = OldBB->splitBasicBlock(SplitI);
+ OldBB->getTerminator()->eraseFromParent();
+
+ if (C) {
+ // If we have a constant zero, unconditionally branch.
+ // FIXME: We should really handle this differently to bypass splitting
+ // the block.
+ BranchInst::Create(GetTrapBB(IRB), OldBB);
+ return;
+ }
+
+ // Create the conditional branch.
+ BranchInst::Create(GetTrapBB(IRB), Cont, Or, OldBB);
+}
+
+static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
+ ScalarEvolution &SE) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ ObjectSizeOpts EvalOpts;
+ EvalOpts.RoundToAlign = true;
+ ObjectSizeOffsetEvaluator ObjSizeEval(DL, &TLI, F.getContext(), EvalOpts);
+
+ // check HANDLE_MEMORY_INST in include/llvm/Instruction.def for memory
+ // touching instructions
+ SmallVector<std::pair<Instruction *, Value *>, 4> TrapInfo;
+ for (Instruction &I : instructions(F)) {
+ Value *Or = nullptr;
+ BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isVolatile())
+ Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
+ ObjSizeEval, IRB, SE);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isVolatile())
+ Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ } else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+ if (!AI->isVolatile())
+ Or =
+ getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ } else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
+ if (!AI->isVolatile())
+ Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
+ }
+ if (Or)
+ TrapInfo.push_back(std::make_pair(&I, Or));
+ }
+
+ // Create a trapping basic block on demand using a callback. Depending on
+ // flags, this will either create a single block for the entire function or
+ // will create a fresh block every time it is called.
+ BasicBlock *TrapBB = nullptr;
+ auto GetTrapBB = [&TrapBB](BuilderTy &IRB) {
+ if (TrapBB && SingleTrapBB)
+ return TrapBB;
+
+ Function *Fn = IRB.GetInsertBlock()->getParent();
+ // FIXME: This debug location doesn't make a lot of sense in the
+ // `SingleTrapBB` case.
+ auto DebugLoc = IRB.getCurrentDebugLocation();
+ IRBuilder<>::InsertPointGuard Guard(IRB);
+ TrapBB = BasicBlock::Create(Fn->getContext(), "trap", Fn);
+ IRB.SetInsertPoint(TrapBB);
+
+ auto *F = Intrinsic::getDeclaration(Fn->getParent(), Intrinsic::trap);
+ CallInst *TrapCall = IRB.CreateCall(F, {});
+ TrapCall->setDoesNotReturn();
+ TrapCall->setDoesNotThrow();
+ TrapCall->setDebugLoc(DebugLoc);
+ IRB.CreateUnreachable();
+
+ return TrapBB;
+ };
+
+ // Add the checks.
+ for (const auto &Entry : TrapInfo) {
+ Instruction *Inst = Entry.first;
+ BuilderTy IRB(Inst->getParent(), BasicBlock::iterator(Inst), TargetFolder(DL));
+ insertBoundsCheck(Entry.second, IRB, GetTrapBB);
+ }
+
+ return !TrapInfo.empty();
+}
+
+PreservedAnalyses BoundsCheckingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+
+ if (!addBoundsChecking(F, TLI, SE))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+namespace {
+struct BoundsCheckingLegacyPass : public FunctionPass {
+ static char ID;
+
+ BoundsCheckingLegacyPass() : FunctionPass(ID) {
+ initializeBoundsCheckingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ return addBoundsChecking(F, TLI, SE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+};
+} // namespace
+
+char BoundsCheckingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BoundsCheckingLegacyPass, "bounds-checking",
+ "Run-time bounds checking", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(BoundsCheckingLegacyPass, "bounds-checking",
+ "Run-time bounds checking", false, false)
+
+FunctionPass *llvm::createBoundsCheckingLegacyPass() {
+ return new BoundsCheckingLegacyPass();
+}
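The overflow condition assembled in getBoundsCheckCond above is easiest to read as plain integer arithmetic. The standalone C++ sketch below is illustrative only: it is not part of the pass, the function and variable names are invented, and it simply mirrors the three comparisons the pass emits as IR (the pass additionally uses ScalarEvolution ranges to fold comparisons it can prove are always false).

#include <cstdint>
#include <iostream>

// True when an access of NeededSize bytes at byte offset Offset into an
// object of Size bytes is out of bounds -- the condition guarded by a branch
// to the trap block.
static bool wouldTrap(int64_t Size, int64_t Offset, uint64_t NeededSize) {
  bool Cmp1 = Offset < 0;                                 // violates Offset >= 0
  bool Cmp2 = static_cast<uint64_t>(Size) <
              static_cast<uint64_t>(Offset);              // violates Size >= Offset (unsigned)
  bool Cmp3 = static_cast<uint64_t>(Size) -
              static_cast<uint64_t>(Offset) < NeededSize; // violates Size - Offset >= NeededSize
  return Cmp1 || Cmp2 || Cmp3;
}

int main() {
  std::cout << wouldTrap(16, 8, 8) << '\n';  // 0: an 8-byte access at offset 8 of 16 bytes fits
  std::cout << wouldTrap(16, 12, 8) << '\n'; // 1: 12 + 8 > 16, the pass would branch to "trap"
  std::cout << wouldTrap(16, -4, 4) << '\n'; // 1: negative offset from the base pointer
}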
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
index 9de6edaadf..6580b6d7d7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CFGMST.h
@@ -1,303 +1,303 @@
-//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a Union-find algorithm to compute Minimum Spanning Tree
-// for a given CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
-#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "cfgmst"
-
-using namespace llvm;
-
-namespace llvm {
-
- /// A union-find based Minimum Spanning Tree for CFG
-///
-/// Implements a Union-find algorithm to compute Minimum Spanning Tree
-/// for a given CFG.
-template <class Edge, class BBInfo> class CFGMST {
-public:
- Function &F;
-
- // Store all the edges in CFG. It may contain some stale edges
- // when Removed is set.
- std::vector<std::unique_ptr<Edge>> AllEdges;
-
- // This map records the auxiliary information for each BB.
- DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
-
- // Whether the function has an exit block with no successors.
- // (For a function with an infinite loop, this block may be absent.)
- bool ExitBlockFound = false;
-
- // Find the root group of G and compress the path from G to the root.
- BBInfo *findAndCompressGroup(BBInfo *G) {
- if (G->Group != G)
- G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
- return static_cast<BBInfo *>(G->Group);
- }
-
- // Union BB1 and BB2 into the same group and return true.
- // Returns false if BB1 and BB2 are already in the same group.
- bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
- BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
- BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
-
- if (BB1G == BB2G)
- return false;
-
- // Make the smaller-rank tree a direct child of the root of the higher-rank tree.
- if (BB1G->Rank < BB2G->Rank)
- BB1G->Group = BB2G;
- else {
- BB2G->Group = BB1G;
- // If the ranks are the same, increment the rank of the surviving root by one.
- if (BB1G->Rank == BB2G->Rank)
- BB1G->Rank++;
- }
- return true;
- }
-
- // Given a BB, return the auxiliary information.
- BBInfo &getBBInfo(const BasicBlock *BB) const {
- auto It = BBInfos.find(BB);
- assert(It->second.get() != nullptr);
- return *It->second.get();
- }
-
- // Given a BB, return the auxiliary information if it's available.
- BBInfo *findBBInfo(const BasicBlock *BB) const {
- auto It = BBInfos.find(BB);
- if (It == BBInfos.end())
- return nullptr;
- return It->second.get();
- }
-
- // Traverse the CFG using a stack. Find all the edges and assign weights.
- // Edges with large weights will be put into the MST first so they are less
- // likely to be instrumented.
- void buildEdges() {
- LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
-
- const BasicBlock *Entry = &(F.getEntryBlock());
- uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
- // If we want to instrument the entry count, lower the weight to 0.
+//===-- CFGMST.h - Minimum Spanning Tree for CFG ----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Union-find algorithm to compute Minimum Spanning Tree
+// for a given CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+#define LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "cfgmst"
+
+using namespace llvm;
+
+namespace llvm {
+
+ /// A union-find based Minimum Spanning Tree for CFG
+///
+/// Implements a Union-find algorithm to compute Minimum Spanning Tree
+/// for a given CFG.
+template <class Edge, class BBInfo> class CFGMST {
+public:
+ Function &F;
+
+ // Store all the edges in CFG. It may contain some stale edges
+ // when Removed is set.
+ std::vector<std::unique_ptr<Edge>> AllEdges;
+
+ // This map records the auxiliary information for each BB.
+ DenseMap<const BasicBlock *, std::unique_ptr<BBInfo>> BBInfos;
+
+ // Whether the function has an exit block with no successors.
+ // (For a function with an infinite loop, this block may be absent.)
+ bool ExitBlockFound = false;
+
+ // Find the root group of G and compress the path from G to the root.
+ BBInfo *findAndCompressGroup(BBInfo *G) {
+ if (G->Group != G)
+ G->Group = findAndCompressGroup(static_cast<BBInfo *>(G->Group));
+ return static_cast<BBInfo *>(G->Group);
+ }
+
+ // Union BB1 and BB2 into the same group and return true.
+ // Returns false if BB1 and BB2 are already in the same group.
+ bool unionGroups(const BasicBlock *BB1, const BasicBlock *BB2) {
+ BBInfo *BB1G = findAndCompressGroup(&getBBInfo(BB1));
+ BBInfo *BB2G = findAndCompressGroup(&getBBInfo(BB2));
+
+ if (BB1G == BB2G)
+ return false;
+
+ // Make the smaller-rank tree a direct child of the root of the higher-rank tree.
+ if (BB1G->Rank < BB2G->Rank)
+ BB1G->Group = BB2G;
+ else {
+ BB2G->Group = BB1G;
+ // If the ranks are the same, increment the rank of the surviving root by one.
+ if (BB1G->Rank == BB2G->Rank)
+ BB1G->Rank++;
+ }
+ return true;
+ }
+
+ // Given a BB, return the auxiliary information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ assert(It->second.get() != nullptr);
+ return *It->second.get();
+ }
+
+ // Given a BB, return the auxiliary information if it's available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const {
+ auto It = BBInfos.find(BB);
+ if (It == BBInfos.end())
+ return nullptr;
+ return It->second.get();
+ }
+
+ // Traverse the CFG using a stack. Find all the edges and assign weights.
+ // Edges with large weights will be put into the MST first so they are less
+ // likely to be instrumented.
+ void buildEdges() {
+ LLVM_DEBUG(dbgs() << "Build Edge on " << F.getName() << "\n");
+
+ const BasicBlock *Entry = &(F.getEntryBlock());
+ uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+ // If we want to instrument the entry count, lower the weight to 0.
if (InstrumentFuncEntry)
- EntryWeight = 0;
- Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
- *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
- uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
-
- // Add a fake edge to the entry.
- EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
- LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
- << " w = " << EntryWeight << "\n");
-
- // Special handling for single BB functions.
- if (succ_empty(Entry)) {
- addEdge(Entry, nullptr, EntryWeight);
- return;
- }
-
- static const uint32_t CriticalEdgeMultiplier = 1000;
-
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- Instruction *TI = BB->getTerminator();
- uint64_t BBWeight =
- (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
- uint64_t Weight = 2;
- if (int successors = TI->getNumSuccessors()) {
- for (int i = 0; i != successors; ++i) {
- BasicBlock *TargetBB = TI->getSuccessor(i);
- bool Critical = isCriticalEdge(TI, i);
- uint64_t scaleFactor = BBWeight;
- if (Critical) {
- if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
- scaleFactor *= CriticalEdgeMultiplier;
- else
- scaleFactor = UINT64_MAX;
- }
- if (BPI != nullptr)
- Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
- if (Weight == 0)
- Weight++;
- auto *E = &addEdge(&*BB, TargetBB, Weight);
- E->IsCritical = Critical;
- LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
- << TargetBB->getName() << " w=" << Weight << "\n");
-
- // Keep track of entry/exit edges:
- if (&*BB == Entry) {
- if (Weight > MaxEntryOutWeight) {
- MaxEntryOutWeight = Weight;
- EntryOutgoing = E;
- }
- }
-
- auto *TargetTI = TargetBB->getTerminator();
- if (TargetTI && !TargetTI->getNumSuccessors()) {
- if (Weight > MaxExitInWeight) {
- MaxExitInWeight = Weight;
- ExitIncoming = E;
- }
- }
- }
- } else {
- ExitBlockFound = true;
- Edge *ExitO = &addEdge(&*BB, nullptr, BBWeight);
- if (BBWeight > MaxExitOutWeight) {
- MaxExitOutWeight = BBWeight;
- ExitOutgoing = ExitO;
- }
- LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
- << " w = " << BBWeight << "\n");
- }
- }
-
- // Entry/exit edge adjustment heuristic:
- // prefer instrumenting the entry edge over the exit edge if possible.
- // Those exit edges may never get a chance to execute (for instance when
- // the program is an event-handling loop) before the profile is
- // asynchronously dumped.
- //
- // If EntryIncoming and ExitOutgoing have similar weights, make sure
- // ExitOutgoing is selected as the min-edge. Similarly, if EntryOutgoing
- // and ExitIncoming have similar weights, make sure ExitIncoming becomes
- // the min-edge.
- uint64_t EntryInWeight = EntryWeight;
-
- if (EntryInWeight >= MaxExitOutWeight &&
- EntryInWeight * 2 < MaxExitOutWeight * 3) {
- EntryIncoming->Weight = MaxExitOutWeight;
- ExitOutgoing->Weight = EntryInWeight + 1;
- }
-
- if (MaxEntryOutWeight >= MaxExitInWeight &&
- MaxEntryOutWeight * 2 < MaxExitInWeight * 3) {
- EntryOutgoing->Weight = MaxExitInWeight;
- ExitIncoming->Weight = MaxEntryOutWeight + 1;
- }
- }
-
- // Sort CFG edges by weight.
- void sortEdgesByWeight() {
- llvm::stable_sort(AllEdges, [](const std::unique_ptr<Edge> &Edge1,
- const std::unique_ptr<Edge> &Edge2) {
- return Edge1->Weight > Edge2->Weight;
- });
- }
-
- // Traverse all the edges and compute the Minimum Weight Spanning Tree
- // using the union-find algorithm.
- void computeMinimumSpanningTree() {
- // First, put all critical edges whose destination is a landing pad into
- // the MST. This works around the insufficient support for splitting
- // critical edges when the destination BB is a landing pad.
- for (auto &Ei : AllEdges) {
- if (Ei->Removed)
- continue;
- if (Ei->IsCritical) {
- if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
- if (unionGroups(Ei->SrcBB, Ei->DestBB))
- Ei->InMST = true;
- }
- }
- }
-
- for (auto &Ei : AllEdges) {
- if (Ei->Removed)
- continue;
- // If we detect infinite loops, force
- // instrumenting the entry edge:
- if (!ExitBlockFound && Ei->SrcBB == nullptr)
- continue;
- if (unionGroups(Ei->SrcBB, Ei->DestBB))
- Ei->InMST = true;
- }
- }
-
- // Dump the Debug information about the instrumentation.
- void dumpEdges(raw_ostream &OS, const Twine &Message) const {
- if (!Message.str().empty())
- OS << Message << "\n";
- OS << " Number of Basic Blocks: " << BBInfos.size() << "\n";
- for (auto &BI : BBInfos) {
- const BasicBlock *BB = BI.first;
- OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " "
- << BI.second->infoString() << "\n";
- }
-
- OS << " Number of Edges: " << AllEdges.size()
- << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
- uint32_t Count = 0;
- for (auto &EI : AllEdges)
- OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->"
- << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n";
- }
-
- // Add an edge to AllEdges with weight W.
- Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) {
- uint32_t Index = BBInfos.size();
- auto Iter = BBInfos.end();
- bool Inserted;
- std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr));
- if (Inserted) {
- // Newly inserted, update the real info.
- Iter->second = std::move(std::make_unique<BBInfo>(Index));
- Index++;
- }
- std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr));
- if (Inserted)
- // Newly inserted, update the real info.
- Iter->second = std::move(std::make_unique<BBInfo>(Index));
- AllEdges.emplace_back(new Edge(Src, Dest, W));
- return *AllEdges.back();
- }
-
- BranchProbabilityInfo *BPI;
- BlockFrequencyInfo *BFI;
-
+ EntryWeight = 0;
+ Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
+ *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
+ uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
+
+ // Add a fake edge to the entry.
+ EntryIncoming = &addEdge(nullptr, Entry, EntryWeight);
+ LLVM_DEBUG(dbgs() << " Edge: from fake node to " << Entry->getName()
+ << " w = " << EntryWeight << "\n");
+
+ // Special handling for single BB functions.
+ if (succ_empty(Entry)) {
+ addEdge(Entry, nullptr, EntryWeight);
+ return;
+ }
+
+ static const uint32_t CriticalEdgeMultiplier = 1000;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ Instruction *TI = BB->getTerminator();
+ uint64_t BBWeight =
+ (BFI != nullptr ? BFI->getBlockFreq(&*BB).getFrequency() : 2);
+ uint64_t Weight = 2;
+ if (int successors = TI->getNumSuccessors()) {
+ for (int i = 0; i != successors; ++i) {
+ BasicBlock *TargetBB = TI->getSuccessor(i);
+ bool Critical = isCriticalEdge(TI, i);
+ uint64_t scaleFactor = BBWeight;
+ if (Critical) {
+ if (scaleFactor < UINT64_MAX / CriticalEdgeMultiplier)
+ scaleFactor *= CriticalEdgeMultiplier;
+ else
+ scaleFactor = UINT64_MAX;
+ }
+ if (BPI != nullptr)
+ Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+ if (Weight == 0)
+ Weight++;
+ auto *E = &addEdge(&*BB, TargetBB, Weight);
+ E->IsCritical = Critical;
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
+ << TargetBB->getName() << " w=" << Weight << "\n");
+
+ // Keep track of entry/exit edges:
+ if (&*BB == Entry) {
+ if (Weight > MaxEntryOutWeight) {
+ MaxEntryOutWeight = Weight;
+ EntryOutgoing = E;
+ }
+ }
+
+ auto *TargetTI = TargetBB->getTerminator();
+ if (TargetTI && !TargetTI->getNumSuccessors()) {
+ if (Weight > MaxExitInWeight) {
+ MaxExitInWeight = Weight;
+ ExitIncoming = E;
+ }
+ }
+ }
+ } else {
+ ExitBlockFound = true;
+ Edge *ExitO = &addEdge(&*BB, nullptr, BBWeight);
+ if (BBWeight > MaxExitOutWeight) {
+ MaxExitOutWeight = BBWeight;
+ ExitOutgoing = ExitO;
+ }
+ LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to fake exit"
+ << " w = " << BBWeight << "\n");
+ }
+ }
+
+ // Entry/exit edge adjustment heuristic:
+ // prefer instrumenting the entry edge over the exit edge if possible.
+ // Those exit edges may never get a chance to execute (for instance when
+ // the program is an event-handling loop) before the profile is
+ // asynchronously dumped.
+ //
+ // If EntryIncoming and ExitOutgoing have similar weights, make sure
+ // ExitOutgoing is selected as the min-edge. Similarly, if EntryOutgoing
+ // and ExitIncoming have similar weights, make sure ExitIncoming becomes
+ // the min-edge.
+ uint64_t EntryInWeight = EntryWeight;
+
+ if (EntryInWeight >= MaxExitOutWeight &&
+ EntryInWeight * 2 < MaxExitOutWeight * 3) {
+ EntryIncoming->Weight = MaxExitOutWeight;
+ ExitOutgoing->Weight = EntryInWeight + 1;
+ }
+
+ if (MaxEntryOutWeight >= MaxExitInWeight &&
+ MaxEntryOutWeight * 2 < MaxExitInWeight * 3) {
+ EntryOutgoing->Weight = MaxExitInWeight;
+ ExitIncoming->Weight = MaxEntryOutWeight + 1;
+ }
+ }
+
+ // Sort CFG edges by weight.
+ void sortEdgesByWeight() {
+ llvm::stable_sort(AllEdges, [](const std::unique_ptr<Edge> &Edge1,
+ const std::unique_ptr<Edge> &Edge2) {
+ return Edge1->Weight > Edge2->Weight;
+ });
+ }
+
+ // Traverse all the edges and compute the Minimum Weight Spanning Tree
+ // using the union-find algorithm.
+ void computeMinimumSpanningTree() {
+ // First, put all critical edges whose destination is a landing pad into
+ // the MST. This works around the insufficient support for splitting
+ // critical edges when the destination BB is a landing pad.
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ if (Ei->IsCritical) {
+ if (Ei->DestBB && Ei->DestBB->isLandingPad()) {
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+ }
+
+ for (auto &Ei : AllEdges) {
+ if (Ei->Removed)
+ continue;
+ // If we detect infinite loops, force
+ // instrumenting the entry edge:
+ if (!ExitBlockFound && Ei->SrcBB == nullptr)
+ continue;
+ if (unionGroups(Ei->SrcBB, Ei->DestBB))
+ Ei->InMST = true;
+ }
+ }
+
+ // Dump the Debug information about the instrumentation.
+ void dumpEdges(raw_ostream &OS, const Twine &Message) const {
+ if (!Message.str().empty())
+ OS << Message << "\n";
+ OS << " Number of Basic Blocks: " << BBInfos.size() << "\n";
+ for (auto &BI : BBInfos) {
+ const BasicBlock *BB = BI.first;
+ OS << " BB: " << (BB == nullptr ? "FakeNode" : BB->getName()) << " "
+ << BI.second->infoString() << "\n";
+ }
+
+ OS << " Number of Edges: " << AllEdges.size()
+ << " (*: Instrument, C: CriticalEdge, -: Removed)\n";
+ uint32_t Count = 0;
+ for (auto &EI : AllEdges)
+ OS << " Edge " << Count++ << ": " << getBBInfo(EI->SrcBB).Index << "-->"
+ << getBBInfo(EI->DestBB).Index << EI->infoString() << "\n";
+ }
+
+ // Add an edge to AllEdges with weight W.
+ Edge &addEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W) {
+ uint32_t Index = BBInfos.size();
+ auto Iter = BBInfos.end();
+ bool Inserted;
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Src, nullptr));
+ if (Inserted) {
+ // Newly inserted, update the real info.
+ Iter->second = std::move(std::make_unique<BBInfo>(Index));
+ Index++;
+ }
+ std::tie(Iter, Inserted) = BBInfos.insert(std::make_pair(Dest, nullptr));
+ if (Inserted)
+ // Newly inserted, update the real info.
+ Iter->second = std::move(std::make_unique<BBInfo>(Index));
+ AllEdges.emplace_back(new Edge(Src, Dest, W));
+ return *AllEdges.back();
+ }
+
+ BranchProbabilityInfo *BPI;
+ BlockFrequencyInfo *BFI;
+
// If function entry will be always instrumented.
bool InstrumentFuncEntry;
-public:
+public:
CFGMST(Function &Func, bool InstrumentFuncEntry_,
BranchProbabilityInfo *BPI_ = nullptr,
- BlockFrequencyInfo *BFI_ = nullptr)
+ BlockFrequencyInfo *BFI_ = nullptr)
: F(Func), BPI(BPI_), BFI(BFI_),
InstrumentFuncEntry(InstrumentFuncEntry_) {
- buildEdges();
- sortEdgesByWeight();
- computeMinimumSpanningTree();
+ buildEdges();
+ sortEdgesByWeight();
+ computeMinimumSpanningTree();
if (AllEdges.size() > 1 && InstrumentFuncEntry)
- std::iter_swap(std::move(AllEdges.begin()),
- std::move(AllEdges.begin() + AllEdges.size() - 1));
- }
-};
-
-} // end namespace llvm
-
-#undef DEBUG_TYPE // "cfgmst"
-
-#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
+ std::iter_swap(std::move(AllEdges.begin()),
+ std::move(AllEdges.begin() + AllEdges.size() - 1));
+ }
+};
+
+} // end namespace llvm
+
+#undef DEBUG_TYPE // "cfgmst"
+
+#endif // LLVM_LIB_TRANSFORMS_INSTRUMENTATION_CFGMST_H
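findAndCompressGroup and unionGroups above are a textbook union-find with path compression and union by rank, stored in the per-block BBInfo records, and computeMinimumSpanningTree uses it Kruskal-style: an edge joins the spanning tree only when it connects two previously separate groups. A minimal standalone sketch under those assumptions (illustrative names, not the CFGMST types):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <utility>
#include <vector>

struct UnionFind {
  std::vector<uint32_t> Parent, Rank;
  explicit UnionFind(uint32_t N) : Parent(N), Rank(N, 0) {
    std::iota(Parent.begin(), Parent.end(), 0u); // each node starts as its own root
  }
  // Find the root of X and compress the path from X to the root.
  uint32_t find(uint32_t X) {
    if (Parent[X] != X)
      Parent[X] = find(Parent[X]);
    return Parent[X];
  }
  // Union the groups of A and B; returns false if they are already one group.
  bool unite(uint32_t A, uint32_t B) {
    uint32_t RA = find(A), RB = find(B);
    if (RA == RB)
      return false;
    if (Rank[RA] < Rank[RB])
      std::swap(RA, RB); // keep the higher-rank root as the surviving root
    Parent[RB] = RA;
    if (Rank[RA] == Rank[RB])
      ++Rank[RA];        // equal ranks: the surviving root's rank grows by one
    return true;
  }
};

int main() {
  UnionFind UF(4);
  // Visiting edges in decreasing weight order (as sortEdgesByWeight does)
  // keeps heavy edges inside the tree, so the edges left outside the tree --
  // the ones that get instrumented -- are the cheap ones.
  std::cout << UF.unite(0, 1) << UF.unite(1, 2) << UF.unite(0, 2) << '\n'; // prints 110
}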
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
index 7f658fa68f..9acd82c005 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -1,153 +1,153 @@
-//===-- CGProfile.cpp -----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/CGProfile.h"
-
-#include "llvm/ADT/MapVector.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Transforms/Instrumentation.h"
-
-#include <array>
-
-using namespace llvm;
-
-static bool
-addModuleFlags(Module &M,
- MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) {
- if (Counts.empty())
- return false;
-
- LLVMContext &Context = M.getContext();
- MDBuilder MDB(Context);
- std::vector<Metadata *> Nodes;
-
- for (auto E : Counts) {
- Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
- ValueAsMetadata::get(E.first.second),
- MDB.createConstant(ConstantInt::get(
- Type::getInt64Ty(Context), E.second))};
- Nodes.push_back(MDNode::get(Context, Vals));
- }
-
- M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
- return true;
-}
-
-static bool runCGProfilePass(
- Module &M, function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
- function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LazyBFI) {
- MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
- InstrProfSymtab Symtab;
- auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
- Function *CalledF, uint64_t NewCount) {
+//===-- CGProfile.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/CGProfile.h"
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Transforms/Instrumentation.h"
+
+#include <array>
+
+using namespace llvm;
+
+static bool
+addModuleFlags(Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) {
+ if (Counts.empty())
+ return false;
+
+ LLVMContext &Context = M.getContext();
+ MDBuilder MDB(Context);
+ std::vector<Metadata *> Nodes;
+
+ for (auto E : Counts) {
+ Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
+ ValueAsMetadata::get(E.first.second),
+ MDB.createConstant(ConstantInt::get(
+ Type::getInt64Ty(Context), E.second))};
+ Nodes.push_back(MDNode::get(Context, Vals));
+ }
+
+ M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+ return true;
+}
+
+static bool runCGProfilePass(
+ Module &M, function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LazyBFI) {
+ MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
+ InstrProfSymtab Symtab;
+ auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
+ Function *CalledF, uint64_t NewCount) {
if (!CalledF || !TTI.isLoweredToCall(CalledF) ||
CalledF->hasDLLImportStorageClass())
- return;
- uint64_t &Count = Counts[std::make_pair(F, CalledF)];
- Count = SaturatingAdd(Count, NewCount);
- };
- // Ignore error here. Indirect calls are ignored if this fails.
- (void)(bool) Symtab.create(M);
- for (auto &F : M) {
- // Avoid the extra cost of running passes for BFI when the function doesn't
- // have an entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check
- // if using LazyBlockFrequencyInfoPass.
- // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM.
- if (F.isDeclaration() || (LazyBFI && !F.getEntryCount()))
- continue;
- auto &BFI = GetBFI(F);
- if (BFI.getEntryFreq() == 0)
- continue;
- TargetTransformInfo &TTI = GetTTI(F);
- for (auto &BB : F) {
- Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
- if (!BBCount)
- continue;
- for (auto &I : BB) {
- CallBase *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
- if (CB->isIndirectCall()) {
- InstrProfValueData ValueData[8];
- uint32_t ActualNumValueData;
- uint64_t TotalC;
- if (!getValueProfDataFromInst(*CB, IPVK_IndirectCallTarget, 8,
- ValueData, ActualNumValueData, TotalC))
- continue;
- for (const auto &VD :
- ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
- UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
- }
- continue;
- }
- UpdateCounts(TTI, &F, CB->getCalledFunction(), *BBCount);
- }
- }
- }
-
- return addModuleFlags(M, Counts);
-}
-
-namespace {
-struct CGProfileLegacyPass final : public ModulePass {
- static char ID;
- CGProfileLegacyPass() : ModulePass(ID) {
- initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<LazyBlockFrequencyInfoPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnModule(Module &M) override {
- auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
- return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
- };
- auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
- return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- };
-
- return runCGProfilePass(M, GetBFI, GetTTI, true);
- }
-};
-
-} // namespace
-
-char CGProfileLegacyPass::ID = 0;
-
-INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
- false)
-
-ModulePass *llvm::createCGProfileLegacyPass() {
- return new CGProfileLegacyPass();
-}
-
-PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
- auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
- return FAM.getResult<TargetIRAnalysis>(F);
- };
-
- runCGProfilePass(M, GetBFI, GetTTI, false);
-
- return PreservedAnalyses::all();
-}
+ return;
+ uint64_t &Count = Counts[std::make_pair(F, CalledF)];
+ Count = SaturatingAdd(Count, NewCount);
+ };
+ // Ignore error here. Indirect calls are ignored if this fails.
+ (void)(bool) Symtab.create(M);
+ for (auto &F : M) {
+ // Avoid the extra cost of running passes for BFI when the function doesn't
+ // have an entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check
+ // if using LazyBlockFrequencyInfoPass.
+ // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM.
+ if (F.isDeclaration() || (LazyBFI && !F.getEntryCount()))
+ continue;
+ auto &BFI = GetBFI(F);
+ if (BFI.getEntryFreq() == 0)
+ continue;
+ TargetTransformInfo &TTI = GetTTI(F);
+ for (auto &BB : F) {
+ Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
+ if (!BBCount)
+ continue;
+ for (auto &I : BB) {
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (CB->isIndirectCall()) {
+ InstrProfValueData ValueData[8];
+ uint32_t ActualNumValueData;
+ uint64_t TotalC;
+ if (!getValueProfDataFromInst(*CB, IPVK_IndirectCallTarget, 8,
+ ValueData, ActualNumValueData, TotalC))
+ continue;
+ for (const auto &VD :
+ ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
+ UpdateCounts(TTI, &F, Symtab.getFunction(VD.Value), VD.Count);
+ }
+ continue;
+ }
+ UpdateCounts(TTI, &F, CB->getCalledFunction(), *BBCount);
+ }
+ }
+ }
+
+ return addModuleFlags(M, Counts);
+}
+
+namespace {
+struct CGProfileLegacyPass final : public ModulePass {
+ static char ID;
+ CGProfileLegacyPass() : ModulePass(ID) {
+ initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LazyBlockFrequencyInfoPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnModule(Module &M) override {
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
+ };
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ return runCGProfilePass(M, GetBFI, GetTTI, true);
+ }
+};
+
+} // namespace
+
+char CGProfileLegacyPass::ID = 0;
+
+INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
+ false)
+
+ModulePass *llvm::createCGProfileLegacyPass() {
+ return new CGProfileLegacyPass();
+}
+
+PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ runCGProfilePass(M, GetBFI, GetTTI, false);
+
+ return PreservedAnalyses::all();
+}
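The core of runCGProfilePass above is accumulating a (caller, callee) -> estimated call count map with a saturating add and then serializing it as "CG Profile" module metadata. A standalone sketch of just that accumulation step, with made-up key types and helper names (the real pass keys on Function pointers and uses llvm::SaturatingAdd):

#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <string>
#include <utility>

using EdgeKey = std::pair<std::string, std::string>; // (caller, callee)

// Add without wrapping past UINT64_MAX, in the spirit of llvm::SaturatingAdd.
static uint64_t saturatingAdd(uint64_t A, uint64_t B) {
  return A > std::numeric_limits<uint64_t>::max() - B
             ? std::numeric_limits<uint64_t>::max()
             : A + B;
}

int main() {
  std::map<EdgeKey, uint64_t> Counts;
  auto Update = [&](const std::string &Caller, const std::string &Callee,
                    uint64_t NewCount) {
    uint64_t &Count = Counts[{Caller, Callee}];
    Count = saturatingAdd(Count, NewCount);
  };

  // Direct call sites contribute their block's profile count; indirect call
  // sites contribute one entry per value-profiled target (up to 8 above).
  Update("main", "foo", 100);
  Update("main", "foo", 50);
  Update("main", "bar", std::numeric_limits<uint64_t>::max());
  Update("main", "bar", 1); // saturates instead of wrapping to zero

  for (const auto &E : Counts)
    std::cout << E.first.first << " -> " << E.first.second << ": "
              << E.second << '\n';
}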
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 6fdeb88658..927c34180d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1,2103 +1,2103 @@
-//===-- ControlHeightReduction.cpp - Control Height Reduction -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass merges conditional blocks of code and reduces the number of
-// conditional branches in the hot paths based on profiles.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-
-#include <set>
-#include <sstream>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "chr"
-
-#define CHR_DEBUG(X) LLVM_DEBUG(X)
-
-static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
- cl::desc("Apply CHR for all functions"));
-
-static cl::opt<double> CHRBiasThreshold(
- "chr-bias-threshold", cl::init(0.99), cl::Hidden,
- cl::desc("CHR considers a branch bias greater than this ratio as biased"));
-
-static cl::opt<unsigned> CHRMergeThreshold(
- "chr-merge-threshold", cl::init(2), cl::Hidden,
- cl::desc("CHR merges a group of N branches/selects where N >= this value"));
-
-static cl::opt<std::string> CHRModuleList(
- "chr-module-list", cl::init(""), cl::Hidden,
- cl::desc("Specify file to retrieve the list of modules to apply CHR to"));
-
-static cl::opt<std::string> CHRFunctionList(
- "chr-function-list", cl::init(""), cl::Hidden,
- cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
-
-static StringSet<> CHRModules;
-static StringSet<> CHRFunctions;
-
-static void parseCHRFilterFiles() {
- if (!CHRModuleList.empty()) {
- auto FileOrErr = MemoryBuffer::getFile(CHRModuleList);
- if (!FileOrErr) {
- errs() << "Error: Couldn't read the chr-module-list file " << CHRModuleList << "\n";
- std::exit(1);
- }
- StringRef Buf = FileOrErr->get()->getBuffer();
- SmallVector<StringRef, 0> Lines;
- Buf.split(Lines, '\n');
- for (StringRef Line : Lines) {
- Line = Line.trim();
- if (!Line.empty())
- CHRModules.insert(Line);
- }
- }
- if (!CHRFunctionList.empty()) {
- auto FileOrErr = MemoryBuffer::getFile(CHRFunctionList);
- if (!FileOrErr) {
- errs() << "Error: Couldn't read the chr-function-list file " << CHRFunctionList << "\n";
- std::exit(1);
- }
- StringRef Buf = FileOrErr->get()->getBuffer();
- SmallVector<StringRef, 0> Lines;
- Buf.split(Lines, '\n');
- for (StringRef Line : Lines) {
- Line = Line.trim();
- if (!Line.empty())
- CHRFunctions.insert(Line);
- }
- }
-}
-
-namespace {
-class ControlHeightReductionLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- ControlHeightReductionLegacyPass() : FunctionPass(ID) {
- initializeControlHeightReductionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- parseCHRFilterFiles();
- }
-
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<RegionInfoPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-} // end anonymous namespace
-
-char ControlHeightReductionLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
-INITIALIZE_PASS_END(ControlHeightReductionLegacyPass,
- "chr",
- "Reduce control height in the hot paths",
- false, false)
-
-FunctionPass *llvm::createControlHeightReductionLegacyPass() {
- return new ControlHeightReductionLegacyPass();
-}
-
-namespace {
-
-struct CHRStats {
- CHRStats() : NumBranches(0), NumBranchesDelta(0),
- WeightedNumBranchesDelta(0) {}
- void print(raw_ostream &OS) const {
- OS << "CHRStats: NumBranches " << NumBranches
- << " NumBranchesDelta " << NumBranchesDelta
- << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta;
- }
- uint64_t NumBranches; // The original number of conditional branches /
- // selects
- uint64_t NumBranchesDelta; // The decrease of the number of conditional
- // branches / selects in the hot paths due to CHR.
- uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile
- // count at the scope entry.
-};
-
-// RegInfo - some properties of a Region.
-struct RegInfo {
- RegInfo() : R(nullptr), HasBranch(false) {}
- RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {}
- Region *R;
- bool HasBranch;
- SmallVector<SelectInst *, 8> Selects;
-};
-
-typedef DenseMap<Region *, DenseSet<Instruction *>> HoistStopMapTy;
-
-// CHRScope - a sequence of regions to CHR together. It corresponds to a
-// sequence of conditional blocks. It can have subscopes which correspond to
-// nested conditional blocks. Nested CHRScopes form a tree.
-class CHRScope {
- public:
- CHRScope(RegInfo RI) : BranchInsertPoint(nullptr) {
- assert(RI.R && "Null RegionIn");
- RegInfos.push_back(RI);
- }
-
- Region *getParentRegion() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- Region *Parent = RegInfos[0].R->getParent();
- assert(Parent && "Unexpected to call this on the top-level region");
- return Parent;
- }
-
- BasicBlock *getEntryBlock() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- return RegInfos.front().R->getEntry();
- }
-
- BasicBlock *getExitBlock() {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- return RegInfos.back().R->getExit();
- }
-
- bool appendable(CHRScope *Next) {
- // The next scope is appendable only if this scope is directly connected to
- // it (which implies it post-dominates this scope) and this scope dominates
- // it (no edge to the next scope outside this scope).
- BasicBlock *NextEntry = Next->getEntryBlock();
- if (getExitBlock() != NextEntry)
- // Not directly connected.
- return false;
- Region *LastRegion = RegInfos.back().R;
- for (BasicBlock *Pred : predecessors(NextEntry))
- if (!LastRegion->contains(Pred))
- // There's an edge going into the entry of the next scope from outside
- // of this scope.
- return false;
- return true;
- }
-
- void append(CHRScope *Next) {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- assert(Next->RegInfos.size() > 0 && "Empty CHRScope");
- assert(getParentRegion() == Next->getParentRegion() &&
- "Must be siblings");
- assert(getExitBlock() == Next->getEntryBlock() &&
- "Must be adjacent");
- RegInfos.append(Next->RegInfos.begin(), Next->RegInfos.end());
- Subs.append(Next->Subs.begin(), Next->Subs.end());
- }
-
- void addSub(CHRScope *SubIn) {
-#ifndef NDEBUG
- bool IsChild = false;
- for (RegInfo &RI : RegInfos)
- if (RI.R == SubIn->getParentRegion()) {
- IsChild = true;
- break;
- }
- assert(IsChild && "Must be a child");
-#endif
- Subs.push_back(SubIn);
- }
-
- // Split this scope at the boundary region into two scopes; the regions from
- // the boundary onward form the tail, which is returned.
- CHRScope *split(Region *Boundary) {
- assert(Boundary && "Boundary null");
- assert(RegInfos.begin()->R != Boundary &&
- "Can't be split at beginning");
- auto BoundaryIt = llvm::find_if(
- RegInfos, [&Boundary](const RegInfo &RI) { return Boundary == RI.R; });
- if (BoundaryIt == RegInfos.end())
- return nullptr;
- ArrayRef<RegInfo> TailRegInfos(BoundaryIt, RegInfos.end());
- DenseSet<Region *> TailRegionSet;
- for (const RegInfo &RI : TailRegInfos)
- TailRegionSet.insert(RI.R);
-
- auto TailIt =
- std::stable_partition(Subs.begin(), Subs.end(), [&](CHRScope *Sub) {
- assert(Sub && "null Sub");
- Region *Parent = Sub->getParentRegion();
- if (TailRegionSet.count(Parent))
- return false;
-
+//===-- ControlHeightReduction.cpp - Control Height Reduction -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges conditional blocks of code and reduces the number of
+// conditional branches in the hot paths based on profiles.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/ControlHeightReduction.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+
+#include <set>
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "chr"
+
+#define CHR_DEBUG(X) LLVM_DEBUG(X)
+
+static cl::opt<bool> ForceCHR("force-chr", cl::init(false), cl::Hidden,
+ cl::desc("Apply CHR for all functions"));
+
+static cl::opt<double> CHRBiasThreshold(
+ "chr-bias-threshold", cl::init(0.99), cl::Hidden,
+ cl::desc("CHR considers a branch bias greater than this ratio as biased"));
+
+static cl::opt<unsigned> CHRMergeThreshold(
+ "chr-merge-threshold", cl::init(2), cl::Hidden,
+ cl::desc("CHR merges a group of N branches/selects where N >= this value"));
+
+static cl::opt<std::string> CHRModuleList(
+ "chr-module-list", cl::init(""), cl::Hidden,
+ cl::desc("Specify file to retrieve the list of modules to apply CHR to"));
+
+static cl::opt<std::string> CHRFunctionList(
+ "chr-function-list", cl::init(""), cl::Hidden,
+ cl::desc("Specify file to retrieve the list of functions to apply CHR to"));
+
+static StringSet<> CHRModules;
+static StringSet<> CHRFunctions;
+
+static void parseCHRFilterFiles() {
+ if (!CHRModuleList.empty()) {
+ auto FileOrErr = MemoryBuffer::getFile(CHRModuleList);
+ if (!FileOrErr) {
+ errs() << "Error: Couldn't read the chr-module-list file " << CHRModuleList << "\n";
+ std::exit(1);
+ }
+ StringRef Buf = FileOrErr->get()->getBuffer();
+ SmallVector<StringRef, 0> Lines;
+ Buf.split(Lines, '\n');
+ for (StringRef Line : Lines) {
+ Line = Line.trim();
+ if (!Line.empty())
+ CHRModules.insert(Line);
+ }
+ }
+ if (!CHRFunctionList.empty()) {
+ auto FileOrErr = MemoryBuffer::getFile(CHRFunctionList);
+ if (!FileOrErr) {
+ errs() << "Error: Couldn't read the chr-function-list file " << CHRFunctionList << "\n";
+ std::exit(1);
+ }
+ StringRef Buf = FileOrErr->get()->getBuffer();
+ SmallVector<StringRef, 0> Lines;
+ Buf.split(Lines, '\n');
+ for (StringRef Line : Lines) {
+ Line = Line.trim();
+ if (!Line.empty())
+ CHRFunctions.insert(Line);
+ }
+ }
+}
+
+namespace {
+class ControlHeightReductionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ControlHeightReductionLegacyPass() : FunctionPass(ID) {
+ initializeControlHeightReductionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ parseCHRFilterFiles();
+ }
+
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<RegionInfoPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char ControlHeightReductionLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ControlHeightReductionLegacyPass,
+ "chr",
+ "Reduce control height in the hot paths",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_END(ControlHeightReductionLegacyPass,
+ "chr",
+ "Reduce control height in the hot paths",
+ false, false)
+
+FunctionPass *llvm::createControlHeightReductionLegacyPass() {
+ return new ControlHeightReductionLegacyPass();
+}
+
+namespace {
+
+struct CHRStats {
+ CHRStats() : NumBranches(0), NumBranchesDelta(0),
+ WeightedNumBranchesDelta(0) {}
+ void print(raw_ostream &OS) const {
+ OS << "CHRStats: NumBranches " << NumBranches
+ << " NumBranchesDelta " << NumBranchesDelta
+ << " WeightedNumBranchesDelta " << WeightedNumBranchesDelta;
+ }
+ uint64_t NumBranches; // The original number of conditional branches /
+ // selects
+ uint64_t NumBranchesDelta; // The decrease of the number of conditional
+ // branches / selects in the hot paths due to CHR.
+ uint64_t WeightedNumBranchesDelta; // NumBranchesDelta weighted by the profile
+ // count at the scope entry.
+};
+
+// RegInfo - some properties of a Region.
+struct RegInfo {
+ RegInfo() : R(nullptr), HasBranch(false) {}
+ RegInfo(Region *RegionIn) : R(RegionIn), HasBranch(false) {}
+ Region *R;
+ bool HasBranch;
+ SmallVector<SelectInst *, 8> Selects;
+};
+
+typedef DenseMap<Region *, DenseSet<Instruction *>> HoistStopMapTy;
+
+// CHRScope - a sequence of regions to CHR together. It corresponds to a
+// sequence of conditional blocks. It can have subscopes which correspond to
+// nested conditional blocks. Nested CHRScopes form a tree.
+class CHRScope {
+ public:
+ CHRScope(RegInfo RI) : BranchInsertPoint(nullptr) {
+ assert(RI.R && "Null RegionIn");
+ RegInfos.push_back(RI);
+ }
+
+ Region *getParentRegion() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ Region *Parent = RegInfos[0].R->getParent();
+ assert(Parent && "Unexpected to call this on the top-level region");
+ return Parent;
+ }
+
+ BasicBlock *getEntryBlock() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ return RegInfos.front().R->getEntry();
+ }
+
+ BasicBlock *getExitBlock() {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ return RegInfos.back().R->getExit();
+ }
+
+ bool appendable(CHRScope *Next) {
+ // The next scope is appendable only if this scope is directly connected to
+ // it (which implies it post-dominates this scope) and this scope dominates
+    // it (no edge into the next scope from outside this scope).
+ BasicBlock *NextEntry = Next->getEntryBlock();
+ if (getExitBlock() != NextEntry)
+ // Not directly connected.
+ return false;
+ Region *LastRegion = RegInfos.back().R;
+ for (BasicBlock *Pred : predecessors(NextEntry))
+ if (!LastRegion->contains(Pred))
+ // There's an edge going into the entry of the next scope from outside
+ // of this scope.
+ return false;
+ return true;
+ }
+
+ void append(CHRScope *Next) {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ assert(Next->RegInfos.size() > 0 && "Empty CHRScope");
+ assert(getParentRegion() == Next->getParentRegion() &&
+ "Must be siblings");
+ assert(getExitBlock() == Next->getEntryBlock() &&
+ "Must be adjacent");
+ RegInfos.append(Next->RegInfos.begin(), Next->RegInfos.end());
+ Subs.append(Next->Subs.begin(), Next->Subs.end());
+ }
+
+ void addSub(CHRScope *SubIn) {
+#ifndef NDEBUG
+ bool IsChild = false;
+ for (RegInfo &RI : RegInfos)
+ if (RI.R == SubIn->getParentRegion()) {
+ IsChild = true;
+ break;
+ }
+ assert(IsChild && "Must be a child");
+#endif
+ Subs.push_back(SubIn);
+ }
+
+  // Split this scope at the boundary region into a head and a tail; the
+  // boundary region and everything after it form the tail, which is returned.
+ CHRScope *split(Region *Boundary) {
+ assert(Boundary && "Boundary null");
+ assert(RegInfos.begin()->R != Boundary &&
+ "Can't be split at beginning");
+ auto BoundaryIt = llvm::find_if(
+ RegInfos, [&Boundary](const RegInfo &RI) { return Boundary == RI.R; });
+ if (BoundaryIt == RegInfos.end())
+ return nullptr;
+ ArrayRef<RegInfo> TailRegInfos(BoundaryIt, RegInfos.end());
+ DenseSet<Region *> TailRegionSet;
+ for (const RegInfo &RI : TailRegInfos)
+ TailRegionSet.insert(RI.R);
+
+ auto TailIt =
+ std::stable_partition(Subs.begin(), Subs.end(), [&](CHRScope *Sub) {
+ assert(Sub && "null Sub");
+ Region *Parent = Sub->getParentRegion();
+ if (TailRegionSet.count(Parent))
+ return false;
+
assert(llvm::any_of(
RegInfos,
[&Parent](const RegInfo &RI) { return Parent == RI.R; }) &&
- "Must be in head");
- return true;
- });
- ArrayRef<CHRScope *> TailSubs(TailIt, Subs.end());
-
- assert(HoistStopMap.empty() && "MapHoistStops must be empty");
- auto *Scope = new CHRScope(TailRegInfos, TailSubs);
- RegInfos.erase(BoundaryIt, RegInfos.end());
- Subs.erase(TailIt, Subs.end());
- return Scope;
- }
-
- bool contains(Instruction *I) const {
- BasicBlock *Parent = I->getParent();
- for (const RegInfo &RI : RegInfos)
- if (RI.R->contains(Parent))
- return true;
- return false;
- }
-
- void print(raw_ostream &OS) const;
-
- SmallVector<RegInfo, 8> RegInfos; // Regions that belong to this scope
- SmallVector<CHRScope *, 8> Subs; // Subscopes.
-
- // The instruction at which to insert the CHR conditional branch (and hoist
- // the dependent condition values).
- Instruction *BranchInsertPoint;
-
- // True-biased and false-biased regions (conditional blocks),
- // respectively. Used only for the outermost scope and includes regions in
- // subscopes. The rest are unbiased.
- DenseSet<Region *> TrueBiasedRegions;
- DenseSet<Region *> FalseBiasedRegions;
- // Among the biased regions, the regions that get CHRed.
- SmallVector<RegInfo, 8> CHRRegions;
-
- // True-biased and false-biased selects, respectively. Used only for the
- // outermost scope and includes ones in subscopes.
- DenseSet<SelectInst *> TrueBiasedSelects;
- DenseSet<SelectInst *> FalseBiasedSelects;
-
- // Map from one of the above regions to the instructions to stop
- // hoisting instructions at through use-def chains.
- HoistStopMapTy HoistStopMap;
-
- private:
- CHRScope(ArrayRef<RegInfo> RegInfosIn, ArrayRef<CHRScope *> SubsIn)
- : RegInfos(RegInfosIn.begin(), RegInfosIn.end()),
- Subs(SubsIn.begin(), SubsIn.end()), BranchInsertPoint(nullptr) {}
-};
-
-class CHR {
- public:
- CHR(Function &Fin, BlockFrequencyInfo &BFIin, DominatorTree &DTin,
- ProfileSummaryInfo &PSIin, RegionInfo &RIin,
- OptimizationRemarkEmitter &OREin)
- : F(Fin), BFI(BFIin), DT(DTin), PSI(PSIin), RI(RIin), ORE(OREin) {}
-
- ~CHR() {
- for (CHRScope *Scope : Scopes) {
- delete Scope;
- }
- }
-
- bool run();
-
- private:
- // See the comments in CHR::run() for the high level flow of the algorithm and
- // what the following functions do.
-
- void findScopes(SmallVectorImpl<CHRScope *> &Output) {
- Region *R = RI.getTopLevelRegion();
- if (CHRScope *Scope = findScopes(R, nullptr, nullptr, Output)) {
- Output.push_back(Scope);
- }
- }
- CHRScope *findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
- SmallVectorImpl<CHRScope *> &Scopes);
- CHRScope *findScope(Region *R);
- void checkScopeHoistable(CHRScope *Scope);
-
- void splitScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
- SmallVector<CHRScope *, 8> splitScope(CHRScope *Scope,
- CHRScope *Outer,
- DenseSet<Value *> *OuterConditionValues,
- Instruction *OuterInsertPoint,
- SmallVectorImpl<CHRScope *> &Output,
- DenseSet<Instruction *> &Unhoistables);
-
- void classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes);
- void classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope);
-
- void filterScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
-
- void setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
- void setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope);
-
- void sortScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output);
-
- void transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes);
- void transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs);
- void cloneScopeBlocks(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BasicBlock *ExitBlock,
- Region *LastRegion,
- ValueToValueMapTy &VMap);
- BranchInst *createMergedBranch(BasicBlock *PreEntryBlock,
- BasicBlock *EntryBlock,
- BasicBlock *NewEntryBlock,
- ValueToValueMapTy &VMap);
- void fixupBranchesAndSelects(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BranchInst *MergedBR,
- uint64_t ProfileCount);
- void fixupBranch(Region *R,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition, BranchProbability &CHRBranchBias);
- void fixupSelect(SelectInst* SI,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition, BranchProbability &CHRBranchBias);
- void addToMergedCondition(bool IsTrueBiased, Value *Cond,
- Instruction *BranchOrSelect,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition);
-
- Function &F;
- BlockFrequencyInfo &BFI;
- DominatorTree &DT;
- ProfileSummaryInfo &PSI;
- RegionInfo &RI;
- OptimizationRemarkEmitter &ORE;
- CHRStats Stats;
-
- // All the true-biased regions in the function
- DenseSet<Region *> TrueBiasedRegionsGlobal;
- // All the false-biased regions in the function
- DenseSet<Region *> FalseBiasedRegionsGlobal;
- // All the true-biased selects in the function
- DenseSet<SelectInst *> TrueBiasedSelectsGlobal;
- // All the false-biased selects in the function
- DenseSet<SelectInst *> FalseBiasedSelectsGlobal;
- // A map from biased regions to their branch bias
- DenseMap<Region *, BranchProbability> BranchBiasMap;
- // A map from biased selects to their branch bias
- DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
- // All the scopes.
- DenseSet<CHRScope *> Scopes;
-};
-
-} // end anonymous namespace
-
-static inline
-raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS,
- const CHRStats &Stats) {
- Stats.print(OS);
- return OS;
-}
-
-static inline
-raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
- Scope.print(OS);
- return OS;
-}
-
-static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
- if (ForceCHR)
- return true;
-
- if (!CHRModuleList.empty() || !CHRFunctionList.empty()) {
- if (CHRModules.count(F.getParent()->getName()))
- return true;
- return CHRFunctions.count(F.getName());
- }
-
- assert(PSI.hasProfileSummary() && "Empty PSI?");
- return PSI.isFunctionEntryHot(&F);
-}
-
-static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label,
- CHRStats *Stats) {
- StringRef FuncName = F.getName();
- StringRef ModuleName = F.getParent()->getName();
- (void)(FuncName); // Unused in release build.
- (void)(ModuleName); // Unused in release build.
- CHR_DEBUG(dbgs() << "CHR IR dump " << Label << " " << ModuleName << " "
- << FuncName);
- if (Stats)
- CHR_DEBUG(dbgs() << " " << *Stats);
- CHR_DEBUG(dbgs() << "\n");
- CHR_DEBUG(F.dump());
-}
-
-void CHRScope::print(raw_ostream &OS) const {
- assert(RegInfos.size() > 0 && "Empty CHRScope");
- OS << "CHRScope[";
- OS << RegInfos.size() << ", Regions[";
- for (const RegInfo &RI : RegInfos) {
- OS << RI.R->getNameStr();
- if (RI.HasBranch)
- OS << " B";
- if (RI.Selects.size() > 0)
- OS << " S" << RI.Selects.size();
- OS << ", ";
- }
- if (RegInfos[0].R->getParent()) {
- OS << "], Parent " << RegInfos[0].R->getParent()->getNameStr();
- } else {
- // top level region
- OS << "]";
- }
- OS << ", Subs[";
- for (CHRScope *Sub : Subs) {
- OS << *Sub << ", ";
- }
- OS << "]]";
-}
-
-// Return true if the given instruction type can be hoisted by CHR.
-static bool isHoistableInstructionType(Instruction *I) {
- return isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
- isa<InsertValueInst>(I);
-}
-
-// Return true if the given instruction can be hoisted by CHR.
-static bool isHoistable(Instruction *I, DominatorTree &DT) {
- if (!isHoistableInstructionType(I))
- return false;
- return isSafeToSpeculativelyExecute(I, nullptr, &DT);
-}
-
-// Recursively traverse the use-def chains of the given value and return a set
-// of the unhoistable base values defined within the scope (excluding the
-// first-region entry block) or the (hoistable or unhoistable) base values that
-// are defined outside (including the first-region entry block) of the
-// scope. The returned set doesn't include constants.
-static const std::set<Value *> &
-getBaseValues(Value *V, DominatorTree &DT,
- DenseMap<Value *, std::set<Value *>> &Visited) {
- auto It = Visited.find(V);
- if (It != Visited.end()) {
- return It->second;
- }
- std::set<Value *> Result;
- if (auto *I = dyn_cast<Instruction>(V)) {
- // We don't stop at a block that's not in the Scope because we would miss
- // some instructions that are based on the same base values if we stop
- // there.
- if (!isHoistable(I, DT)) {
- Result.insert(I);
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
- }
- // I is hoistable above the Scope.
- for (Value *Op : I->operands()) {
- const std::set<Value *> &OpResult = getBaseValues(Op, DT, Visited);
- Result.insert(OpResult.begin(), OpResult.end());
- }
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
- }
- if (isa<Argument>(V)) {
- Result.insert(V);
- }
-  // We don't include other values, such as constants, because they won't lead
-  // to any chance of folding conditions (e.g. two bit checks merged into one
-  // check) after CHR.
- return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
-}
-
-// Return true if V is already hoisted or can be hoisted (along with its
-// operands) above the insert point. When it returns true and HoistStops is
-// non-null, the instructions to stop hoisting at through the use-def chains are
-// inserted into HoistStops.
-static bool
-checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
- DenseSet<Instruction *> &Unhoistables,
- DenseSet<Instruction *> *HoistStops,
- DenseMap<Instruction *, bool> &Visited) {
- assert(InsertPoint && "Null InsertPoint");
- if (auto *I = dyn_cast<Instruction>(V)) {
- auto It = Visited.find(I);
- if (It != Visited.end()) {
- return It->second;
- }
- assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
- assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
- if (Unhoistables.count(I)) {
- // Don't hoist if they are not to be hoisted.
- Visited[I] = false;
- return false;
- }
- if (DT.dominates(I, InsertPoint)) {
- // We are already above the insert point. Stop here.
- if (HoistStops)
- HoistStops->insert(I);
- Visited[I] = true;
- return true;
- }
-    // We aren't above the insert point yet; check if we can hoist it above the
-    // insert point.
- if (isHoistable(I, DT)) {
- // Check operands first.
- DenseSet<Instruction *> OpsHoistStops;
- bool AllOpsHoisted = true;
- for (Value *Op : I->operands()) {
- if (!checkHoistValue(Op, InsertPoint, DT, Unhoistables, &OpsHoistStops,
- Visited)) {
- AllOpsHoisted = false;
- break;
- }
- }
- if (AllOpsHoisted) {
- CHR_DEBUG(dbgs() << "checkHoistValue " << *I << "\n");
- if (HoistStops)
- HoistStops->insert(OpsHoistStops.begin(), OpsHoistStops.end());
- Visited[I] = true;
- return true;
- }
- }
- Visited[I] = false;
- return false;
- }
- // Non-instructions are considered hoistable.
- return true;
-}
-
-// Returns true and sets the true probability and false probability of an
-// MD_prof metadata if it's well-formed.
-static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb,
- BranchProbability &FalseProb) {
- if (!MD) return false;
- MDString *MDName = cast<MDString>(MD->getOperand(0));
- if (MDName->getString() != "branch_weights" ||
- MD->getNumOperands() != 3)
- return false;
- ConstantInt *TrueWeight = mdconst::extract<ConstantInt>(MD->getOperand(1));
- ConstantInt *FalseWeight = mdconst::extract<ConstantInt>(MD->getOperand(2));
- if (!TrueWeight || !FalseWeight)
- return false;
- uint64_t TrueWt = TrueWeight->getValue().getZExtValue();
- uint64_t FalseWt = FalseWeight->getValue().getZExtValue();
- uint64_t SumWt = TrueWt + FalseWt;
-
- assert(SumWt >= TrueWt && SumWt >= FalseWt &&
- "Overflow calculating branch probabilities.");
-
- // Guard against 0-to-0 branch weights to avoid a division-by-zero crash.
- if (SumWt == 0)
- return false;
-
- TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt);
- FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt);
- return true;
-}
-
-static BranchProbability getCHRBiasThreshold() {
- return BranchProbability::getBranchProbability(
- static_cast<uint64_t>(CHRBiasThreshold * 1000000), 1000000);
-}
-
-// A helper for checkBiasedBranch and checkBiasedSelect. If TrueProb >=
-// CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
-// CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
-// false.
-template <typename K, typename S, typename M>
-static bool checkBias(K *Key, BranchProbability TrueProb,
- BranchProbability FalseProb, S &TrueSet, S &FalseSet,
- M &BiasMap) {
- BranchProbability Threshold = getCHRBiasThreshold();
- if (TrueProb >= Threshold) {
- TrueSet.insert(Key);
- BiasMap[Key] = TrueProb;
- return true;
- } else if (FalseProb >= Threshold) {
- FalseSet.insert(Key);
- BiasMap[Key] = FalseProb;
- return true;
- }
- return false;
-}
-
-// Returns true and inserts the region into the right biased set and the map
-// if the branch of the region is biased.
-static bool checkBiasedBranch(BranchInst *BI, Region *R,
- DenseSet<Region *> &TrueBiasedRegionsGlobal,
- DenseSet<Region *> &FalseBiasedRegionsGlobal,
- DenseMap<Region *, BranchProbability> &BranchBiasMap) {
- if (!BI->isConditional())
- return false;
- BranchProbability ThenProb, ElseProb;
- if (!checkMDProf(BI->getMetadata(LLVMContext::MD_prof),
- ThenProb, ElseProb))
- return false;
- BasicBlock *IfThen = BI->getSuccessor(0);
- BasicBlock *IfElse = BI->getSuccessor(1);
- assert((IfThen == R->getExit() || IfElse == R->getExit()) &&
- IfThen != IfElse &&
- "Invariant from findScopes");
- if (IfThen == R->getExit()) {
- // Swap them so that IfThen/ThenProb means going into the conditional code
- // and IfElse/ElseProb means skipping it.
- std::swap(IfThen, IfElse);
- std::swap(ThenProb, ElseProb);
- }
- CHR_DEBUG(dbgs() << "BI " << *BI << " ");
- CHR_DEBUG(dbgs() << "ThenProb " << ThenProb << " ");
- CHR_DEBUG(dbgs() << "ElseProb " << ElseProb << "\n");
- return checkBias(R, ThenProb, ElseProb,
- TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
- BranchBiasMap);
-}
-
-// Returns true and inserts the select into the right biased set and the map
-// if the select is biased.
-static bool checkBiasedSelect(
- SelectInst *SI, Region *R,
- DenseSet<SelectInst *> &TrueBiasedSelectsGlobal,
- DenseSet<SelectInst *> &FalseBiasedSelectsGlobal,
- DenseMap<SelectInst *, BranchProbability> &SelectBiasMap) {
- BranchProbability TrueProb, FalseProb;
- if (!checkMDProf(SI->getMetadata(LLVMContext::MD_prof),
- TrueProb, FalseProb))
- return false;
- CHR_DEBUG(dbgs() << "SI " << *SI << " ");
- CHR_DEBUG(dbgs() << "TrueProb " << TrueProb << " ");
- CHR_DEBUG(dbgs() << "FalseProb " << FalseProb << "\n");
- return checkBias(SI, TrueProb, FalseProb,
- TrueBiasedSelectsGlobal, FalseBiasedSelectsGlobal,
- SelectBiasMap);
-}
-
-// Returns the instruction at which to hoist the dependent condition values and
-// insert the CHR branch for a region. This is the terminator branch in the
-// entry block or the first select in the entry block, if any.
-static Instruction* getBranchInsertPoint(RegInfo &RI) {
- Region *R = RI.R;
- BasicBlock *EntryBB = R->getEntry();
- // The hoist point is by default the terminator of the entry block, which is
- // the same as the branch instruction if RI.HasBranch is true.
- Instruction *HoistPoint = EntryBB->getTerminator();
- for (SelectInst *SI : RI.Selects) {
- if (SI->getParent() == EntryBB) {
- // Pick the first select in Selects in the entry block. Note Selects is
- // sorted in the instruction order within a block (asserted below).
- HoistPoint = SI;
- break;
- }
- }
- assert(HoistPoint && "Null HoistPoint");
-#ifndef NDEBUG
- // Check that HoistPoint is the first one in Selects in the entry block,
- // if any.
- DenseSet<Instruction *> EntryBlockSelectSet;
- for (SelectInst *SI : RI.Selects) {
- if (SI->getParent() == EntryBB) {
- EntryBlockSelectSet.insert(SI);
- }
- }
- for (Instruction &I : *EntryBB) {
+ "Must be in head");
+ return true;
+ });
+ ArrayRef<CHRScope *> TailSubs(TailIt, Subs.end());
+
+ assert(HoistStopMap.empty() && "MapHoistStops must be empty");
+ auto *Scope = new CHRScope(TailRegInfos, TailSubs);
+ RegInfos.erase(BoundaryIt, RegInfos.end());
+ Subs.erase(TailIt, Subs.end());
+ return Scope;
+ }
+
+ bool contains(Instruction *I) const {
+ BasicBlock *Parent = I->getParent();
+ for (const RegInfo &RI : RegInfos)
+ if (RI.R->contains(Parent))
+ return true;
+ return false;
+ }
+
+ void print(raw_ostream &OS) const;
+
+ SmallVector<RegInfo, 8> RegInfos; // Regions that belong to this scope
+ SmallVector<CHRScope *, 8> Subs; // Subscopes.
+
+ // The instruction at which to insert the CHR conditional branch (and hoist
+ // the dependent condition values).
+ Instruction *BranchInsertPoint;
+
+ // True-biased and false-biased regions (conditional blocks),
+ // respectively. Used only for the outermost scope and includes regions in
+ // subscopes. The rest are unbiased.
+ DenseSet<Region *> TrueBiasedRegions;
+ DenseSet<Region *> FalseBiasedRegions;
+ // Among the biased regions, the regions that get CHRed.
+ SmallVector<RegInfo, 8> CHRRegions;
+
+ // True-biased and false-biased selects, respectively. Used only for the
+ // outermost scope and includes ones in subscopes.
+ DenseSet<SelectInst *> TrueBiasedSelects;
+ DenseSet<SelectInst *> FalseBiasedSelects;
+
+ // Map from one of the above regions to the instructions to stop
+ // hoisting instructions at through use-def chains.
+ HoistStopMapTy HoistStopMap;
+
+ private:
+ CHRScope(ArrayRef<RegInfo> RegInfosIn, ArrayRef<CHRScope *> SubsIn)
+ : RegInfos(RegInfosIn.begin(), RegInfosIn.end()),
+ Subs(SubsIn.begin(), SubsIn.end()), BranchInsertPoint(nullptr) {}
+};
+
+class CHR {
+ public:
+ CHR(Function &Fin, BlockFrequencyInfo &BFIin, DominatorTree &DTin,
+ ProfileSummaryInfo &PSIin, RegionInfo &RIin,
+ OptimizationRemarkEmitter &OREin)
+ : F(Fin), BFI(BFIin), DT(DTin), PSI(PSIin), RI(RIin), ORE(OREin) {}
+
+ ~CHR() {
+ for (CHRScope *Scope : Scopes) {
+ delete Scope;
+ }
+ }
+
+ bool run();
+
+ private:
+ // See the comments in CHR::run() for the high level flow of the algorithm and
+ // what the following functions do.
+
+ void findScopes(SmallVectorImpl<CHRScope *> &Output) {
+ Region *R = RI.getTopLevelRegion();
+ if (CHRScope *Scope = findScopes(R, nullptr, nullptr, Output)) {
+ Output.push_back(Scope);
+ }
+ }
+ CHRScope *findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
+ SmallVectorImpl<CHRScope *> &Scopes);
+ CHRScope *findScope(Region *R);
+ void checkScopeHoistable(CHRScope *Scope);
+
+ void splitScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+ SmallVector<CHRScope *, 8> splitScope(CHRScope *Scope,
+ CHRScope *Outer,
+ DenseSet<Value *> *OuterConditionValues,
+ Instruction *OuterInsertPoint,
+ SmallVectorImpl<CHRScope *> &Output,
+ DenseSet<Instruction *> &Unhoistables);
+
+ void classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes);
+ void classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope);
+
+ void filterScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+
+ void setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+ void setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope);
+
+ void sortScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output);
+
+ void transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes);
+ void transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs);
+ void cloneScopeBlocks(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BasicBlock *ExitBlock,
+ Region *LastRegion,
+ ValueToValueMapTy &VMap);
+ BranchInst *createMergedBranch(BasicBlock *PreEntryBlock,
+ BasicBlock *EntryBlock,
+ BasicBlock *NewEntryBlock,
+ ValueToValueMapTy &VMap);
+ void fixupBranchesAndSelects(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BranchInst *MergedBR,
+ uint64_t ProfileCount);
+ void fixupBranch(Region *R,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition, BranchProbability &CHRBranchBias);
+ void fixupSelect(SelectInst* SI,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition, BranchProbability &CHRBranchBias);
+ void addToMergedCondition(bool IsTrueBiased, Value *Cond,
+ Instruction *BranchOrSelect,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition);
+
+ Function &F;
+ BlockFrequencyInfo &BFI;
+ DominatorTree &DT;
+ ProfileSummaryInfo &PSI;
+ RegionInfo &RI;
+ OptimizationRemarkEmitter &ORE;
+ CHRStats Stats;
+
+ // All the true-biased regions in the function
+ DenseSet<Region *> TrueBiasedRegionsGlobal;
+ // All the false-biased regions in the function
+ DenseSet<Region *> FalseBiasedRegionsGlobal;
+ // All the true-biased selects in the function
+ DenseSet<SelectInst *> TrueBiasedSelectsGlobal;
+ // All the false-biased selects in the function
+ DenseSet<SelectInst *> FalseBiasedSelectsGlobal;
+ // A map from biased regions to their branch bias
+ DenseMap<Region *, BranchProbability> BranchBiasMap;
+ // A map from biased selects to their branch bias
+ DenseMap<SelectInst *, BranchProbability> SelectBiasMap;
+ // All the scopes.
+ DenseSet<CHRScope *> Scopes;
+};
+
+} // end anonymous namespace
+
+static inline
+raw_ostream LLVM_ATTRIBUTE_UNUSED &operator<<(raw_ostream &OS,
+ const CHRStats &Stats) {
+ Stats.print(OS);
+ return OS;
+}
+
+static inline
+raw_ostream &operator<<(raw_ostream &OS, const CHRScope &Scope) {
+ Scope.print(OS);
+ return OS;
+}
+
+static bool shouldApply(Function &F, ProfileSummaryInfo& PSI) {
+ if (ForceCHR)
+ return true;
+
+ if (!CHRModuleList.empty() || !CHRFunctionList.empty()) {
+ if (CHRModules.count(F.getParent()->getName()))
+ return true;
+ return CHRFunctions.count(F.getName());
+ }
+
+ assert(PSI.hasProfileSummary() && "Empty PSI?");
+ return PSI.isFunctionEntryHot(&F);
+}
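+
+// Usage sketch (hypothetical invocation): by default CHR only applies to
+// functions whose entry is hot per the profile summary; something like
+//
+//   opt -passes=chr -force-chr input.ll -S -o out.ll
+//
+// bypasses that check via -force-chr, while -chr-module-list and
+// -chr-function-list restrict CHR to the listed names instead.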
+
+static void LLVM_ATTRIBUTE_UNUSED dumpIR(Function &F, const char *Label,
+ CHRStats *Stats) {
+ StringRef FuncName = F.getName();
+ StringRef ModuleName = F.getParent()->getName();
+ (void)(FuncName); // Unused in release build.
+ (void)(ModuleName); // Unused in release build.
+ CHR_DEBUG(dbgs() << "CHR IR dump " << Label << " " << ModuleName << " "
+ << FuncName);
+ if (Stats)
+ CHR_DEBUG(dbgs() << " " << *Stats);
+ CHR_DEBUG(dbgs() << "\n");
+ CHR_DEBUG(F.dump());
+}
+
+void CHRScope::print(raw_ostream &OS) const {
+ assert(RegInfos.size() > 0 && "Empty CHRScope");
+ OS << "CHRScope[";
+ OS << RegInfos.size() << ", Regions[";
+ for (const RegInfo &RI : RegInfos) {
+ OS << RI.R->getNameStr();
+ if (RI.HasBranch)
+ OS << " B";
+ if (RI.Selects.size() > 0)
+ OS << " S" << RI.Selects.size();
+ OS << ", ";
+ }
+ if (RegInfos[0].R->getParent()) {
+ OS << "], Parent " << RegInfos[0].R->getParent()->getNameStr();
+ } else {
+ // top level region
+ OS << "]";
+ }
+ OS << ", Subs[";
+ for (CHRScope *Sub : Subs) {
+ OS << *Sub << ", ";
+ }
+ OS << "]]";
+}
+
+// Return true if the given instruction type can be hoisted by CHR.
+static bool isHoistableInstructionType(Instruction *I) {
+ return isa<BinaryOperator>(I) || isa<CastInst>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
+ isa<InsertValueInst>(I);
+}
+
+// Return true if the given instruction can be hoisted by CHR.
+static bool isHoistable(Instruction *I, DominatorTree &DT) {
+ if (!isHoistableInstructionType(I))
+ return false;
+ return isSafeToSpeculativelyExecute(I, nullptr, &DT);
+}
+
+// Recursively traverse the use-def chains of the given value and return a set
+// of the unhoistable base values defined within the scope (excluding the
+// first-region entry block) or the (hoistable or unhoistable) base values that
+// are defined outside (including the first-region entry block) of the
+// scope. The returned set doesn't include constants.
+static const std::set<Value *> &
+getBaseValues(Value *V, DominatorTree &DT,
+ DenseMap<Value *, std::set<Value *>> &Visited) {
+ auto It = Visited.find(V);
+ if (It != Visited.end()) {
+ return It->second;
+ }
+ std::set<Value *> Result;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // We don't stop at a block that's not in the Scope because we would miss
+ // some instructions that are based on the same base values if we stop
+ // there.
+ if (!isHoistable(I, DT)) {
+ Result.insert(I);
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+ }
+ // I is hoistable above the Scope.
+ for (Value *Op : I->operands()) {
+ const std::set<Value *> &OpResult = getBaseValues(Op, DT, Visited);
+ Result.insert(OpResult.begin(), OpResult.end());
+ }
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+ }
+ if (isa<Argument>(V)) {
+ Result.insert(V);
+ }
+  // We don't include other values, such as constants, because they won't lead
+  // to any chance of folding conditions (e.g. two bit checks merged into one
+  // check) after CHR.
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
+}
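+
+// Illustrative sketch with hypothetical IR: for a condition such as
+//
+//   %t = and i32 %a, 2        ; %a is a function argument
+//   %c = icmp ne i32 %t, 0
+//
+// both the 'and' and the 'icmp' are hoistable, so the recursion bottoms out at
+// the argument and the returned base-value set is { %a }. Two conditions that
+// test different bits of %a thus share a base value, which is what
+// shouldSplit() later checks when intersecting base-value sets.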
+
+// Return true if V is already hoisted or can be hoisted (along with its
+// operands) above the insert point. When it returns true and HoistStops is
+// non-null, the instructions to stop hoisting at through the use-def chains are
+// inserted into HoistStops.
+static bool
+checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
+ DenseSet<Instruction *> &Unhoistables,
+ DenseSet<Instruction *> *HoistStops,
+ DenseMap<Instruction *, bool> &Visited) {
+ assert(InsertPoint && "Null InsertPoint");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto It = Visited.find(I);
+ if (It != Visited.end()) {
+ return It->second;
+ }
+ assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
+ assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
+ if (Unhoistables.count(I)) {
+ // Don't hoist if they are not to be hoisted.
+ Visited[I] = false;
+ return false;
+ }
+ if (DT.dominates(I, InsertPoint)) {
+ // We are already above the insert point. Stop here.
+ if (HoistStops)
+ HoistStops->insert(I);
+ Visited[I] = true;
+ return true;
+ }
+    // We aren't above the insert point yet; check if we can hoist it above the
+    // insert point.
+ if (isHoistable(I, DT)) {
+ // Check operands first.
+ DenseSet<Instruction *> OpsHoistStops;
+ bool AllOpsHoisted = true;
+ for (Value *Op : I->operands()) {
+ if (!checkHoistValue(Op, InsertPoint, DT, Unhoistables, &OpsHoistStops,
+ Visited)) {
+ AllOpsHoisted = false;
+ break;
+ }
+ }
+ if (AllOpsHoisted) {
+ CHR_DEBUG(dbgs() << "checkHoistValue " << *I << "\n");
+ if (HoistStops)
+ HoistStops->insert(OpsHoistStops.begin(), OpsHoistStops.end());
+ Visited[I] = true;
+ return true;
+ }
+ }
+ Visited[I] = false;
+ return false;
+ }
+ // Non-instructions are considered hoistable.
+ return true;
+}
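+
+// Illustrative walk-through with hypothetical IR:
+//
+//   %x = load i32, i32* %p     ; defined above the insert point
+//   ...                        ; <- insert point
+//   %c = icmp eq i32 %x, 0     ; the condition being checked
+//
+// checkHoistValue(%c, ...) sees a hoistable icmp, recurses into %x, finds that
+// %x already dominates the insert point, records %x as a hoist stop, and
+// returns true. Had %c been in Unhoistables, it would have returned false.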
+
+// Returns true and sets the true probability and false probability of an
+// MD_prof metadata if it's well-formed.
+static bool checkMDProf(MDNode *MD, BranchProbability &TrueProb,
+ BranchProbability &FalseProb) {
+ if (!MD) return false;
+ MDString *MDName = cast<MDString>(MD->getOperand(0));
+ if (MDName->getString() != "branch_weights" ||
+ MD->getNumOperands() != 3)
+ return false;
+ ConstantInt *TrueWeight = mdconst::extract<ConstantInt>(MD->getOperand(1));
+ ConstantInt *FalseWeight = mdconst::extract<ConstantInt>(MD->getOperand(2));
+ if (!TrueWeight || !FalseWeight)
+ return false;
+ uint64_t TrueWt = TrueWeight->getValue().getZExtValue();
+ uint64_t FalseWt = FalseWeight->getValue().getZExtValue();
+ uint64_t SumWt = TrueWt + FalseWt;
+
+ assert(SumWt >= TrueWt && SumWt >= FalseWt &&
+ "Overflow calculating branch probabilities.");
+
+ // Guard against 0-to-0 branch weights to avoid a division-by-zero crash.
+ if (SumWt == 0)
+ return false;
+
+ TrueProb = BranchProbability::getBranchProbability(TrueWt, SumWt);
+ FalseProb = BranchProbability::getBranchProbability(FalseWt, SumWt);
+ return true;
+}
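+
+// For reference, the MD_prof metadata read here is the standard branch_weights
+// form, e.g. (hypothetical IR):
+//
+//   br i1 %cond, label %then, label %else, !prof !0
+//   !0 = !{!"branch_weights", i32 99, i32 1}
+//
+// which yields TrueProb = 99/100 and FalseProb = 1/100.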
+
+static BranchProbability getCHRBiasThreshold() {
+ return BranchProbability::getBranchProbability(
+ static_cast<uint64_t>(CHRBiasThreshold * 1000000), 1000000);
+}
+
+// A helper for checkBiasedBranch and checkBiasedSelect. If TrueProb >=
+// CHRBiasThreshold, put Key into TrueSet and return true. If FalseProb >=
+// CHRBiasThreshold, put Key into FalseSet and return true. Otherwise, return
+// false.
+template <typename K, typename S, typename M>
+static bool checkBias(K *Key, BranchProbability TrueProb,
+ BranchProbability FalseProb, S &TrueSet, S &FalseSet,
+ M &BiasMap) {
+ BranchProbability Threshold = getCHRBiasThreshold();
+ if (TrueProb >= Threshold) {
+ TrueSet.insert(Key);
+ BiasMap[Key] = TrueProb;
+ return true;
+ } else if (FalseProb >= Threshold) {
+ FalseSet.insert(Key);
+ BiasMap[Key] = FalseProb;
+ return true;
+ }
+ return false;
+}
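+
+// Worked example (illustrative numbers): with the default -chr-bias-threshold
+// of 0.99, branch weights of 99:1 give TrueProb = 0.99 >= threshold, so the
+// key goes into TrueSet and checkBias returns true; weights of 90:10 give
+// probabilities of 0.9 and 0.1, neither reaches the threshold, and checkBias
+// returns false.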
+
+// Returns true and inserts the region into the right biased set and the map
+// if the branch of the region is biased.
+static bool checkBiasedBranch(BranchInst *BI, Region *R,
+ DenseSet<Region *> &TrueBiasedRegionsGlobal,
+ DenseSet<Region *> &FalseBiasedRegionsGlobal,
+ DenseMap<Region *, BranchProbability> &BranchBiasMap) {
+ if (!BI->isConditional())
+ return false;
+ BranchProbability ThenProb, ElseProb;
+ if (!checkMDProf(BI->getMetadata(LLVMContext::MD_prof),
+ ThenProb, ElseProb))
+ return false;
+ BasicBlock *IfThen = BI->getSuccessor(0);
+ BasicBlock *IfElse = BI->getSuccessor(1);
+ assert((IfThen == R->getExit() || IfElse == R->getExit()) &&
+ IfThen != IfElse &&
+ "Invariant from findScopes");
+ if (IfThen == R->getExit()) {
+ // Swap them so that IfThen/ThenProb means going into the conditional code
+ // and IfElse/ElseProb means skipping it.
+ std::swap(IfThen, IfElse);
+ std::swap(ThenProb, ElseProb);
+ }
+ CHR_DEBUG(dbgs() << "BI " << *BI << " ");
+ CHR_DEBUG(dbgs() << "ThenProb " << ThenProb << " ");
+ CHR_DEBUG(dbgs() << "ElseProb " << ElseProb << "\n");
+ return checkBias(R, ThenProb, ElseProb,
+ TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
+ BranchBiasMap);
+}
+
+// Returns true and inserts the select into the right biased set and the map
+// if the select is biased.
+static bool checkBiasedSelect(
+ SelectInst *SI, Region *R,
+ DenseSet<SelectInst *> &TrueBiasedSelectsGlobal,
+ DenseSet<SelectInst *> &FalseBiasedSelectsGlobal,
+ DenseMap<SelectInst *, BranchProbability> &SelectBiasMap) {
+ BranchProbability TrueProb, FalseProb;
+ if (!checkMDProf(SI->getMetadata(LLVMContext::MD_prof),
+ TrueProb, FalseProb))
+ return false;
+ CHR_DEBUG(dbgs() << "SI " << *SI << " ");
+ CHR_DEBUG(dbgs() << "TrueProb " << TrueProb << " ");
+ CHR_DEBUG(dbgs() << "FalseProb " << FalseProb << "\n");
+ return checkBias(SI, TrueProb, FalseProb,
+ TrueBiasedSelectsGlobal, FalseBiasedSelectsGlobal,
+ SelectBiasMap);
+}
+
+// Returns the instruction at which to hoist the dependent condition values and
+// insert the CHR branch for a region. This is the terminator branch in the
+// entry block or the first select in the entry block, if any.
+static Instruction* getBranchInsertPoint(RegInfo &RI) {
+ Region *R = RI.R;
+ BasicBlock *EntryBB = R->getEntry();
+ // The hoist point is by default the terminator of the entry block, which is
+ // the same as the branch instruction if RI.HasBranch is true.
+ Instruction *HoistPoint = EntryBB->getTerminator();
+ for (SelectInst *SI : RI.Selects) {
+ if (SI->getParent() == EntryBB) {
+ // Pick the first select in Selects in the entry block. Note Selects is
+ // sorted in the instruction order within a block (asserted below).
+ HoistPoint = SI;
+ break;
+ }
+ }
+ assert(HoistPoint && "Null HoistPoint");
+#ifndef NDEBUG
+ // Check that HoistPoint is the first one in Selects in the entry block,
+ // if any.
+ DenseSet<Instruction *> EntryBlockSelectSet;
+ for (SelectInst *SI : RI.Selects) {
+ if (SI->getParent() == EntryBB) {
+ EntryBlockSelectSet.insert(SI);
+ }
+ }
+ for (Instruction &I : *EntryBB) {
if (EntryBlockSelectSet.contains(&I)) {
- assert(&I == HoistPoint &&
- "HoistPoint must be the first one in Selects");
- break;
- }
- }
-#endif
- return HoistPoint;
-}
-
-// Find a CHR scope in the given region.
-CHRScope * CHR::findScope(Region *R) {
- CHRScope *Result = nullptr;
- BasicBlock *Entry = R->getEntry();
- BasicBlock *Exit = R->getExit(); // null if top level.
- assert(Entry && "Entry must not be null");
- assert((Exit == nullptr) == (R->isTopLevelRegion()) &&
- "Only top level region has a null exit");
- if (Entry)
- CHR_DEBUG(dbgs() << "Entry " << Entry->getName() << "\n");
- else
- CHR_DEBUG(dbgs() << "Entry null\n");
- if (Exit)
- CHR_DEBUG(dbgs() << "Exit " << Exit->getName() << "\n");
- else
- CHR_DEBUG(dbgs() << "Exit null\n");
- // Exclude cases where Entry is part of a subregion (hence it doesn't belong
- // to this region).
- bool EntryInSubregion = RI.getRegionFor(Entry) != R;
- if (EntryInSubregion)
- return nullptr;
- // Exclude loops
- for (BasicBlock *Pred : predecessors(Entry))
- if (R->contains(Pred))
- return nullptr;
- if (Exit) {
- // Try to find an if-then block (check if R is an if-then).
- // if (cond) {
- // ...
- // }
- auto *BI = dyn_cast<BranchInst>(Entry->getTerminator());
- if (BI)
- CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n");
- else
- CHR_DEBUG(dbgs() << "BI null\n");
- if (BI && BI->isConditional()) {
- BasicBlock *S0 = BI->getSuccessor(0);
- BasicBlock *S1 = BI->getSuccessor(1);
- CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n");
- CHR_DEBUG(dbgs() << "S1 " << S1->getName() << "\n");
- if (S0 != S1 && (S0 == Exit || S1 == Exit)) {
- RegInfo RI(R);
- RI.HasBranch = checkBiasedBranch(
- BI, R, TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
- BranchBiasMap);
- Result = new CHRScope(RI);
- Scopes.insert(Result);
- CHR_DEBUG(dbgs() << "Found a region with a branch\n");
- ++Stats.NumBranches;
- if (!RI.HasBranch) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "BranchNotBiased", BI)
- << "Branch not biased";
- });
- }
- }
- }
- }
- {
- // Try to look for selects in the direct child blocks (as opposed to in
- // subregions) of R.
- // ...
- // if (..) { // Some subregion
- // ...
- // }
- // if (..) { // Some subregion
- // ...
- // }
- // ...
- // a = cond ? b : c;
- // ...
- SmallVector<SelectInst *, 8> Selects;
- for (RegionNode *E : R->elements()) {
- if (E->isSubRegion())
- continue;
- // This returns the basic block of E if E is a direct child of R (not a
-      // subregion).
- BasicBlock *BB = E->getEntry();
-      // Push them in the order they appear in the block to make it easier to
-      // find the first Select later.
- for (Instruction &I : *BB) {
- if (auto *SI = dyn_cast<SelectInst>(&I)) {
- Selects.push_back(SI);
- ++Stats.NumBranches;
- }
- }
- }
- if (Selects.size() > 0) {
- auto AddSelects = [&](RegInfo &RI) {
- for (auto *SI : Selects)
- if (checkBiasedSelect(SI, RI.R,
- TrueBiasedSelectsGlobal,
- FalseBiasedSelectsGlobal,
- SelectBiasMap))
- RI.Selects.push_back(SI);
- else
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "SelectNotBiased", SI)
- << "Select not biased";
- });
- };
- if (!Result) {
- CHR_DEBUG(dbgs() << "Found a select-only region\n");
- RegInfo RI(R);
- AddSelects(RI);
- Result = new CHRScope(RI);
- Scopes.insert(Result);
- } else {
- CHR_DEBUG(dbgs() << "Found select(s) in a region with a branch\n");
- AddSelects(Result->RegInfos[0]);
- }
- }
- }
-
- if (Result) {
- checkScopeHoistable(Result);
- }
- return Result;
-}
-
-// Check whether the branch and the selects in the region can be hoisted above
-// the CHR branch insert point (the most dominating of them, either the branch
-// at the end of the first block or the first select in the first block). If
-// the branch can't be hoisted, drop the selects in the first block.
-//
-// For example, for the following scope/region with selects, we want to insert
-// the merged branch right before the first select in the first/entry block by
-// hoisting c1, c2, c3, and c4.
-//
-// // Branch insert point here.
-// a = c1 ? b : c; // Select 1
-// d = c2 ? e : f; // Select 2
-// if (c3) { // Branch
-// ...
-// c4 = foo() // A call.
-// g = c4 ? h : i; // Select 3
-// }
-//
-// But suppose we can't hoist c4 because it's dependent on the preceding
-// call. Then, we drop Select 3. Furthermore, if we can't hoist c2, we also drop
-// Select 2. If we can't hoist c3, we drop Selects 1 & 2.
-void CHR::checkScopeHoistable(CHRScope *Scope) {
- RegInfo &RI = Scope->RegInfos[0];
- Region *R = RI.R;
- BasicBlock *EntryBB = R->getEntry();
- auto *Branch = RI.HasBranch ?
- cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
- SmallVector<SelectInst *, 8> &Selects = RI.Selects;
- if (RI.HasBranch || !Selects.empty()) {
- Instruction *InsertPoint = getBranchInsertPoint(RI);
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
- // Avoid a data dependence from a select or a branch to a(nother)
-    // select. Note that no instruction can data-depend on a branch (a branch
- // instruction doesn't produce a value).
- DenseSet<Instruction *> Unhoistables;
- // Initialize Unhoistables with the selects.
- for (SelectInst *SI : Selects) {
- Unhoistables.insert(SI);
- }
- // Remove Selects that can't be hoisted.
- for (auto it = Selects.begin(); it != Selects.end(); ) {
- SelectInst *SI = *it;
- if (SI == InsertPoint) {
- ++it;
- continue;
- }
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited);
- if (!IsHoistable) {
- CHR_DEBUG(dbgs() << "Dropping select " << *SI << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DropUnhoistableSelect", SI)
- << "Dropped unhoistable select";
- });
- it = Selects.erase(it);
- // Since we are dropping the select here, we also drop it from
- // Unhoistables.
- Unhoistables.erase(SI);
- } else
- ++it;
- }
- // Update InsertPoint after potentially removing selects.
- InsertPoint = getBranchInsertPoint(RI);
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
- if (RI.HasBranch && InsertPoint != Branch) {
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(Branch->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited);
- if (!IsHoistable) {
- // If the branch isn't hoistable, drop the selects in the entry
- // block, preferring the branch, which makes the branch the hoist
- // point.
- assert(InsertPoint != Branch && "Branch must not be the hoist point");
- CHR_DEBUG(dbgs() << "Dropping selects in entry block \n");
- CHR_DEBUG(
- for (SelectInst *SI : Selects) {
- dbgs() << "SI " << *SI << "\n";
- });
- for (SelectInst *SI : Selects) {
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DropSelectUnhoistableBranch", SI)
- << "Dropped select due to unhoistable branch";
- });
- }
+ assert(&I == HoistPoint &&
+ "HoistPoint must be the first one in Selects");
+ break;
+ }
+ }
+#endif
+ return HoistPoint;
+}
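+
+// Illustrative example (hypothetical entry block):
+//
+//   entry:
+//     %v = select i1 %c1, i32 %a, i32 %b, !prof !1
+//     br i1 %c2, label %then, label %exit, !prof !2
+//
+// Here the branch insert / hoist point is the select, since it is the first
+// select in the entry block; with no selects in the entry block it would be
+// the terminating branch.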
+
+// Find a CHR scope in the given region.
+CHRScope * CHR::findScope(Region *R) {
+ CHRScope *Result = nullptr;
+ BasicBlock *Entry = R->getEntry();
+ BasicBlock *Exit = R->getExit(); // null if top level.
+ assert(Entry && "Entry must not be null");
+ assert((Exit == nullptr) == (R->isTopLevelRegion()) &&
+ "Only top level region has a null exit");
+ if (Entry)
+ CHR_DEBUG(dbgs() << "Entry " << Entry->getName() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "Entry null\n");
+ if (Exit)
+ CHR_DEBUG(dbgs() << "Exit " << Exit->getName() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "Exit null\n");
+ // Exclude cases where Entry is part of a subregion (hence it doesn't belong
+ // to this region).
+ bool EntryInSubregion = RI.getRegionFor(Entry) != R;
+ if (EntryInSubregion)
+ return nullptr;
+ // Exclude loops
+ for (BasicBlock *Pred : predecessors(Entry))
+ if (R->contains(Pred))
+ return nullptr;
+ if (Exit) {
+ // Try to find an if-then block (check if R is an if-then).
+ // if (cond) {
+ // ...
+ // }
+ auto *BI = dyn_cast<BranchInst>(Entry->getTerminator());
+ if (BI)
+ CHR_DEBUG(dbgs() << "BI.isConditional " << BI->isConditional() << "\n");
+ else
+ CHR_DEBUG(dbgs() << "BI null\n");
+ if (BI && BI->isConditional()) {
+ BasicBlock *S0 = BI->getSuccessor(0);
+ BasicBlock *S1 = BI->getSuccessor(1);
+ CHR_DEBUG(dbgs() << "S0 " << S0->getName() << "\n");
+ CHR_DEBUG(dbgs() << "S1 " << S1->getName() << "\n");
+ if (S0 != S1 && (S0 == Exit || S1 == Exit)) {
+ RegInfo RI(R);
+ RI.HasBranch = checkBiasedBranch(
+ BI, R, TrueBiasedRegionsGlobal, FalseBiasedRegionsGlobal,
+ BranchBiasMap);
+ Result = new CHRScope(RI);
+ Scopes.insert(Result);
+ CHR_DEBUG(dbgs() << "Found a region with a branch\n");
+ ++Stats.NumBranches;
+ if (!RI.HasBranch) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "BranchNotBiased", BI)
+ << "Branch not biased";
+ });
+ }
+ }
+ }
+ }
+ {
+ // Try to look for selects in the direct child blocks (as opposed to in
+ // subregions) of R.
+ // ...
+ // if (..) { // Some subregion
+ // ...
+ // }
+ // if (..) { // Some subregion
+ // ...
+ // }
+ // ...
+ // a = cond ? b : c;
+ // ...
+ SmallVector<SelectInst *, 8> Selects;
+ for (RegionNode *E : R->elements()) {
+ if (E->isSubRegion())
+ continue;
+ // This returns the basic block of E if E is a direct child of R (not a
+      // subregion).
+ BasicBlock *BB = E->getEntry();
+      // Push them in the order they appear in the block to make it easier to
+      // find the first Select later.
+ for (Instruction &I : *BB) {
+ if (auto *SI = dyn_cast<SelectInst>(&I)) {
+ Selects.push_back(SI);
+ ++Stats.NumBranches;
+ }
+ }
+ }
+ if (Selects.size() > 0) {
+ auto AddSelects = [&](RegInfo &RI) {
+ for (auto *SI : Selects)
+ if (checkBiasedSelect(SI, RI.R,
+ TrueBiasedSelectsGlobal,
+ FalseBiasedSelectsGlobal,
+ SelectBiasMap))
+ RI.Selects.push_back(SI);
+ else
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "SelectNotBiased", SI)
+ << "Select not biased";
+ });
+ };
+ if (!Result) {
+ CHR_DEBUG(dbgs() << "Found a select-only region\n");
+ RegInfo RI(R);
+ AddSelects(RI);
+ Result = new CHRScope(RI);
+ Scopes.insert(Result);
+ } else {
+ CHR_DEBUG(dbgs() << "Found select(s) in a region with a branch\n");
+ AddSelects(Result->RegInfos[0]);
+ }
+ }
+ }
+
+ if (Result) {
+ checkScopeHoistable(Result);
+ }
+ return Result;
+}
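+
+// Shape of a region findScope accepts as an if-then (illustrative sketch):
+//
+//   Entry:                                  ; region entry
+//     br i1 %cond, label %Then, label %Exit
+//   Then:                                   ; conditional body
+//     br label %Exit
+//   Exit:                                   ; region exit
+//
+// i.e. the entry's conditional branch has the region exit as one successor.
+// Biased selects in the region's direct child blocks are collected as well,
+// with or without such a branch.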
+
+// Check whether the branch and the selects in the region can be hoisted above
+// the CHR branch insert point (the most dominating of them, either the branch
+// at the end of the first block or the first select in the first block). If
+// the branch can't be hoisted, drop the selects in the first block.
+//
+// For example, for the following scope/region with selects, we want to insert
+// the merged branch right before the first select in the first/entry block by
+// hoisting c1, c2, c3, and c4.
+//
+// // Branch insert point here.
+// a = c1 ? b : c; // Select 1
+// d = c2 ? e : f; // Select 2
+// if (c3) { // Branch
+// ...
+// c4 = foo() // A call.
+// g = c4 ? h : i; // Select 3
+// }
+//
+// But suppose we can't hoist c4 because it's dependent on the preceding
+// call. Then, we drop Select 3. Furthermore, if we can't hoist c2, we also drop
+// Select 2. If we can't hoist c3, we drop Selects 1 & 2.
+void CHR::checkScopeHoistable(CHRScope *Scope) {
+ RegInfo &RI = Scope->RegInfos[0];
+ Region *R = RI.R;
+ BasicBlock *EntryBB = R->getEntry();
+ auto *Branch = RI.HasBranch ?
+ cast<BranchInst>(EntryBB->getTerminator()) : nullptr;
+ SmallVector<SelectInst *, 8> &Selects = RI.Selects;
+ if (RI.HasBranch || !Selects.empty()) {
+ Instruction *InsertPoint = getBranchInsertPoint(RI);
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+ // Avoid a data dependence from a select or a branch to a(nother)
+    // select. Note that no instruction can data-depend on a branch (a branch
+ // instruction doesn't produce a value).
+ DenseSet<Instruction *> Unhoistables;
+ // Initialize Unhoistables with the selects.
+ for (SelectInst *SI : Selects) {
+ Unhoistables.insert(SI);
+ }
+ // Remove Selects that can't be hoisted.
+ for (auto it = Selects.begin(); it != Selects.end(); ) {
+ SelectInst *SI = *it;
+ if (SI == InsertPoint) {
+ ++it;
+ continue;
+ }
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited);
+ if (!IsHoistable) {
+ CHR_DEBUG(dbgs() << "Dropping select " << *SI << "\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DropUnhoistableSelect", SI)
+ << "Dropped unhoistable select";
+ });
+ it = Selects.erase(it);
+ // Since we are dropping the select here, we also drop it from
+ // Unhoistables.
+ Unhoistables.erase(SI);
+ } else
+ ++it;
+ }
+ // Update InsertPoint after potentially removing selects.
+ InsertPoint = getBranchInsertPoint(RI);
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+ if (RI.HasBranch && InsertPoint != Branch) {
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(Branch->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited);
+ if (!IsHoistable) {
+ // If the branch isn't hoistable, drop the selects in the entry
+ // block, preferring the branch, which makes the branch the hoist
+ // point.
+ assert(InsertPoint != Branch && "Branch must not be the hoist point");
+ CHR_DEBUG(dbgs() << "Dropping selects in entry block \n");
+ CHR_DEBUG(
+ for (SelectInst *SI : Selects) {
+ dbgs() << "SI " << *SI << "\n";
+ });
+ for (SelectInst *SI : Selects) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DropSelectUnhoistableBranch", SI)
+ << "Dropped select due to unhoistable branch";
+ });
+ }
llvm::erase_if(Selects, [EntryBB](SelectInst *SI) {
return SI->getParent() == EntryBB;
});
- Unhoistables.clear();
- InsertPoint = Branch;
- }
- }
- CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
-#ifndef NDEBUG
- if (RI.HasBranch) {
- assert(!DT.dominates(Branch, InsertPoint) &&
- "Branch can't be already above the hoist point");
- DenseMap<Instruction *, bool> Visited;
- assert(checkHoistValue(Branch->getCondition(), InsertPoint,
- DT, Unhoistables, nullptr, Visited) &&
- "checkHoistValue for branch");
- }
- for (auto *SI : Selects) {
- assert(!DT.dominates(SI, InsertPoint) &&
- "SI can't be already above the hoist point");
- DenseMap<Instruction *, bool> Visited;
- assert(checkHoistValue(SI->getCondition(), InsertPoint, DT,
- Unhoistables, nullptr, Visited) &&
- "checkHoistValue for selects");
- }
- CHR_DEBUG(dbgs() << "Result\n");
- if (RI.HasBranch) {
- CHR_DEBUG(dbgs() << "BI " << *Branch << "\n");
- }
- for (auto *SI : Selects) {
- CHR_DEBUG(dbgs() << "SI " << *SI << "\n");
- }
-#endif
- }
-}
-
-// Traverse the region tree, find all nested scopes and merge them if possible.
-CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
- SmallVectorImpl<CHRScope *> &Scopes) {
- CHR_DEBUG(dbgs() << "findScopes " << R->getNameStr() << "\n");
- CHRScope *Result = findScope(R);
- // Visit subscopes.
- CHRScope *ConsecutiveSubscope = nullptr;
- SmallVector<CHRScope *, 8> Subscopes;
- for (auto It = R->begin(); It != R->end(); ++It) {
- const std::unique_ptr<Region> &SubR = *It;
- auto NextIt = std::next(It);
- Region *NextSubR = NextIt != R->end() ? NextIt->get() : nullptr;
- CHR_DEBUG(dbgs() << "Looking at subregion " << SubR.get()->getNameStr()
- << "\n");
- CHRScope *SubCHRScope = findScopes(SubR.get(), NextSubR, R, Scopes);
- if (SubCHRScope) {
- CHR_DEBUG(dbgs() << "Subregion Scope " << *SubCHRScope << "\n");
- } else {
- CHR_DEBUG(dbgs() << "Subregion Scope null\n");
- }
- if (SubCHRScope) {
- if (!ConsecutiveSubscope)
- ConsecutiveSubscope = SubCHRScope;
- else if (!ConsecutiveSubscope->appendable(SubCHRScope)) {
- Subscopes.push_back(ConsecutiveSubscope);
- ConsecutiveSubscope = SubCHRScope;
- } else
- ConsecutiveSubscope->append(SubCHRScope);
- } else {
- if (ConsecutiveSubscope) {
- Subscopes.push_back(ConsecutiveSubscope);
- }
- ConsecutiveSubscope = nullptr;
- }
- }
- if (ConsecutiveSubscope) {
- Subscopes.push_back(ConsecutiveSubscope);
- }
- for (CHRScope *Sub : Subscopes) {
- if (Result) {
- // Combine it with the parent.
- Result->addSub(Sub);
- } else {
- // Push Subscopes as they won't be combined with the parent.
- Scopes.push_back(Sub);
- }
- }
- return Result;
-}
-
-static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
- DenseSet<Value *> ConditionValues;
- if (RI.HasBranch) {
- auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
- ConditionValues.insert(BI->getCondition());
- }
- for (SelectInst *SI : RI.Selects) {
- ConditionValues.insert(SI->getCondition());
- }
- return ConditionValues;
-}
-
-// Determine whether to split a scope depending on the sets of the branch
-// condition values of the previous region and the current region. We split
-// it (return true) if 1) the condition values of the inner/lower scope can't
-// be hoisted up to the outer/upper scope, or 2) the two sets of the condition
-// values have an empty intersection (because the combined branch conditions
-// probably won't lead to a simpler combined condition).
-static bool shouldSplit(Instruction *InsertPoint,
- DenseSet<Value *> &PrevConditionValues,
- DenseSet<Value *> &ConditionValues,
- DominatorTree &DT,
- DenseSet<Instruction *> &Unhoistables) {
- assert(InsertPoint && "Null InsertPoint");
- CHR_DEBUG(
- dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues ";
- for (Value *V : PrevConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << " ConditionValues ";
- for (Value *V : ConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- // If any of Bases isn't hoistable to the hoist point, split.
- for (Value *V : ConditionValues) {
- DenseMap<Instruction *, bool> Visited;
- if (!checkHoistValue(V, InsertPoint, DT, Unhoistables, nullptr, Visited)) {
- CHR_DEBUG(dbgs() << "Split. checkHoistValue false " << *V << "\n");
- return true; // Not hoistable, split.
- }
- }
- // If PrevConditionValues or ConditionValues is empty, don't split to avoid
- // unnecessary splits at scopes with no branch/selects. If
- // PrevConditionValues and ConditionValues don't intersect at all, split.
- if (!PrevConditionValues.empty() && !ConditionValues.empty()) {
- // Use std::set as DenseSet doesn't work with set_intersection.
- std::set<Value *> PrevBases, Bases;
- DenseMap<Value *, std::set<Value *>> Visited;
- for (Value *V : PrevConditionValues) {
- const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
- PrevBases.insert(BaseValues.begin(), BaseValues.end());
- }
- for (Value *V : ConditionValues) {
- const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
- Bases.insert(BaseValues.begin(), BaseValues.end());
- }
- CHR_DEBUG(
- dbgs() << "PrevBases ";
- for (Value *V : PrevBases) {
- dbgs() << *V << ", ";
- }
- dbgs() << " Bases ";
- for (Value *V : Bases) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- std::vector<Value *> Intersection;
- std::set_intersection(PrevBases.begin(), PrevBases.end(), Bases.begin(),
- Bases.end(), std::back_inserter(Intersection));
- if (Intersection.empty()) {
- // Empty intersection, split.
- CHR_DEBUG(dbgs() << "Split. Intersection empty\n");
- return true;
- }
- }
- CHR_DEBUG(dbgs() << "No split\n");
- return false; // Don't split.
-}
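-
-// Illustrative example (hypothetical values): if the previous region's
-// condition is (%a & 1) != 0 and the current region's condition is
-// (%a & 2) != 0, both base-value sets reduce to { %a }, the intersection is
-// non-empty, and the scopes stay merged. If the current condition instead
-// depended only on a different value %b, the intersection would be empty and
-// shouldSplit would return true.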
-
-static void getSelectsInScope(CHRScope *Scope,
- DenseSet<Instruction *> &Output) {
- for (RegInfo &RI : Scope->RegInfos)
- for (SelectInst *SI : RI.Selects)
- Output.insert(SI);
- for (CHRScope *Sub : Scope->Subs)
- getSelectsInScope(Sub, Output);
-}
-
-void CHR::splitScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- assert(!Scope->BranchInsertPoint &&
- "BranchInsertPoint must not be set");
- DenseSet<Instruction *> Unhoistables;
- getSelectsInScope(Scope, Unhoistables);
- splitScope(Scope, nullptr, nullptr, nullptr, Output, Unhoistables);
- }
-#ifndef NDEBUG
- for (CHRScope *Scope : Output) {
- assert(Scope->BranchInsertPoint && "BranchInsertPoint must be set");
- }
-#endif
-}
-
-SmallVector<CHRScope *, 8> CHR::splitScope(
- CHRScope *Scope,
- CHRScope *Outer,
- DenseSet<Value *> *OuterConditionValues,
- Instruction *OuterInsertPoint,
- SmallVectorImpl<CHRScope *> &Output,
- DenseSet<Instruction *> &Unhoistables) {
- if (Outer) {
- assert(OuterConditionValues && "Null OuterConditionValues");
- assert(OuterInsertPoint && "Null OuterInsertPoint");
- }
- bool PrevSplitFromOuter = true;
- DenseSet<Value *> PrevConditionValues;
- Instruction *PrevInsertPoint = nullptr;
- SmallVector<CHRScope *, 8> Splits;
- SmallVector<bool, 8> SplitsSplitFromOuter;
- SmallVector<DenseSet<Value *>, 8> SplitsConditionValues;
- SmallVector<Instruction *, 8> SplitsInsertPoints;
- SmallVector<RegInfo, 8> RegInfos(Scope->RegInfos); // Copy
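- // Walk the regions in order; whenever shouldSplit() fires, cut the scope at
- // the current region and start accumulating a new split.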
- for (RegInfo &RI : RegInfos) {
- Instruction *InsertPoint = getBranchInsertPoint(RI);
- DenseSet<Value *> ConditionValues = getCHRConditionValuesForRegion(RI);
- CHR_DEBUG(
- dbgs() << "ConditionValues ";
- for (Value *V : ConditionValues) {
- dbgs() << *V << ", ";
- }
- dbgs() << "\n");
- if (RI.R == RegInfos[0].R) {
- // First iteration. Check to see if we should split from the outer.
- if (Outer) {
- CHR_DEBUG(dbgs() << "Outer " << *Outer << "\n");
- CHR_DEBUG(dbgs() << "Should split from outer at "
- << RI.R->getNameStr() << "\n");
- if (shouldSplit(OuterInsertPoint, *OuterConditionValues,
- ConditionValues, DT, Unhoistables)) {
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "SplitScopeFromOuter",
- RI.R->getEntry()->getTerminator())
- << "Split scope from outer due to unhoistable branch/select "
- << "and/or lack of common condition values";
- });
- } else {
- // Not splitting from the outer. Use the outer bases and insert
- // point. Union the bases.
- PrevSplitFromOuter = false;
- PrevConditionValues = *OuterConditionValues;
- PrevConditionValues.insert(ConditionValues.begin(),
- ConditionValues.end());
- PrevInsertPoint = OuterInsertPoint;
- }
- } else {
- CHR_DEBUG(dbgs() << "Outer null\n");
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- }
- } else {
- CHR_DEBUG(dbgs() << "Should split from prev at "
- << RI.R->getNameStr() << "\n");
- if (shouldSplit(PrevInsertPoint, PrevConditionValues, ConditionValues,
- DT, Unhoistables)) {
- CHRScope *Tail = Scope->split(RI.R);
- Scopes.insert(Tail);
- Splits.push_back(Scope);
- SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
- SplitsConditionValues.push_back(PrevConditionValues);
- SplitsInsertPoints.push_back(PrevInsertPoint);
- Scope = Tail;
- PrevConditionValues = ConditionValues;
- PrevInsertPoint = InsertPoint;
- PrevSplitFromOuter = true;
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "SplitScopeFromPrev",
- RI.R->getEntry()->getTerminator())
- << "Split scope from previous due to unhoistable branch/select "
- << "and/or lack of common condition values";
- });
- } else {
- // Not splitting. Union the bases. Keep the hoist point.
- PrevConditionValues.insert(ConditionValues.begin(), ConditionValues.end());
- }
- }
- }
- Splits.push_back(Scope);
- SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
- SplitsConditionValues.push_back(PrevConditionValues);
- assert(PrevInsertPoint && "Null PrevInsertPoint");
- SplitsInsertPoints.push_back(PrevInsertPoint);
- assert(Splits.size() == SplitsConditionValues.size() &&
- Splits.size() == SplitsSplitFromOuter.size() &&
- Splits.size() == SplitsInsertPoints.size() && "Mismatching sizes");
- for (size_t I = 0; I < Splits.size(); ++I) {
- CHRScope *Split = Splits[I];
- DenseSet<Value *> &SplitConditionValues = SplitsConditionValues[I];
- Instruction *SplitInsertPoint = SplitsInsertPoints[I];
- SmallVector<CHRScope *, 8> NewSubs;
- DenseSet<Instruction *> SplitUnhoistables;
- getSelectsInScope(Split, SplitUnhoistables);
- for (CHRScope *Sub : Split->Subs) {
- SmallVector<CHRScope *, 8> SubSplits = splitScope(
- Sub, Split, &SplitConditionValues, SplitInsertPoint, Output,
- SplitUnhoistables);
+ Unhoistables.clear();
+ InsertPoint = Branch;
+ }
+ }
+ CHR_DEBUG(dbgs() << "InsertPoint " << *InsertPoint << "\n");
+#ifndef NDEBUG
+ if (RI.HasBranch) {
+ assert(!DT.dominates(Branch, InsertPoint) &&
+ "Branch can't be already above the hoist point");
+ DenseMap<Instruction *, bool> Visited;
+ assert(checkHoistValue(Branch->getCondition(), InsertPoint,
+ DT, Unhoistables, nullptr, Visited) &&
+ "checkHoistValue for branch");
+ }
+ for (auto *SI : Selects) {
+ assert(!DT.dominates(SI, InsertPoint) &&
+ "SI can't be already above the hoist point");
+ DenseMap<Instruction *, bool> Visited;
+ assert(checkHoistValue(SI->getCondition(), InsertPoint, DT,
+ Unhoistables, nullptr, Visited) &&
+ "checkHoistValue for selects");
+ }
+ CHR_DEBUG(dbgs() << "Result\n");
+ if (RI.HasBranch) {
+ CHR_DEBUG(dbgs() << "BI " << *Branch << "\n");
+ }
+ for (auto *SI : Selects) {
+ CHR_DEBUG(dbgs() << "SI " << *SI << "\n");
+ }
+#endif
+ }
+}
+
+// Traverse the region tree, find all nested scopes and merge them if possible.
+CHRScope * CHR::findScopes(Region *R, Region *NextRegion, Region *ParentRegion,
+ SmallVectorImpl<CHRScope *> &Scopes) {
+ CHR_DEBUG(dbgs() << "findScopes " << R->getNameStr() << "\n");
+ CHRScope *Result = findScope(R);
+ // Visit subscopes.
+ CHRScope *ConsecutiveSubscope = nullptr;
+ SmallVector<CHRScope *, 8> Subscopes;
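+ // Greedily merge runs of consecutive subscopes: keep appending to
+ // ConsecutiveSubscope while appendable() allows it; a gap or an unappendable
+ // subscope flushes the current run into Subscopes and starts a new one.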
+ for (auto It = R->begin(); It != R->end(); ++It) {
+ const std::unique_ptr<Region> &SubR = *It;
+ auto NextIt = std::next(It);
+ Region *NextSubR = NextIt != R->end() ? NextIt->get() : nullptr;
+ CHR_DEBUG(dbgs() << "Looking at subregion " << SubR.get()->getNameStr()
+ << "\n");
+ CHRScope *SubCHRScope = findScopes(SubR.get(), NextSubR, R, Scopes);
+ if (SubCHRScope) {
+ CHR_DEBUG(dbgs() << "Subregion Scope " << *SubCHRScope << "\n");
+ } else {
+ CHR_DEBUG(dbgs() << "Subregion Scope null\n");
+ }
+ if (SubCHRScope) {
+ if (!ConsecutiveSubscope)
+ ConsecutiveSubscope = SubCHRScope;
+ else if (!ConsecutiveSubscope->appendable(SubCHRScope)) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ ConsecutiveSubscope = SubCHRScope;
+ } else
+ ConsecutiveSubscope->append(SubCHRScope);
+ } else {
+ if (ConsecutiveSubscope) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ }
+ ConsecutiveSubscope = nullptr;
+ }
+ }
+ if (ConsecutiveSubscope) {
+ Subscopes.push_back(ConsecutiveSubscope);
+ }
+ for (CHRScope *Sub : Subscopes) {
+ if (Result) {
+ // Combine it with the parent.
+ Result->addSub(Sub);
+ } else {
+ // Push Subscopes as they won't be combined with the parent.
+ Scopes.push_back(Sub);
+ }
+ }
+ return Result;
+}
+
+static DenseSet<Value *> getCHRConditionValuesForRegion(RegInfo &RI) {
+ DenseSet<Value *> ConditionValues;
+ if (RI.HasBranch) {
+ auto *BI = cast<BranchInst>(RI.R->getEntry()->getTerminator());
+ ConditionValues.insert(BI->getCondition());
+ }
+ for (SelectInst *SI : RI.Selects) {
+ ConditionValues.insert(SI->getCondition());
+ }
+ return ConditionValues;
+}
+
+
+// Determine whether to split a scope depending on the sets of the branch
+// condition values of the previous region and the current region. We split
+// (return true) if 1) the condition values of the inner/lower scope can't be
+// hoisted up to the outer/upper scope, or 2) the two sets of the condition
+// values have an empty intersection (because the combined branch conditions
+// probably won't lead to a simpler combined condition).
+static bool shouldSplit(Instruction *InsertPoint,
+ DenseSet<Value *> &PrevConditionValues,
+ DenseSet<Value *> &ConditionValues,
+ DominatorTree &DT,
+ DenseSet<Instruction *> &Unhoistables) {
+ assert(InsertPoint && "Null InsertPoint");
+ CHR_DEBUG(
+ dbgs() << "shouldSplit " << *InsertPoint << " PrevConditionValues ";
+ for (Value *V : PrevConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << " ConditionValues ";
+ for (Value *V : ConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ // If any of Bases isn't hoistable to the hoist point, split.
+ for (Value *V : ConditionValues) {
+ DenseMap<Instruction *, bool> Visited;
+ if (!checkHoistValue(V, InsertPoint, DT, Unhoistables, nullptr, Visited)) {
+ CHR_DEBUG(dbgs() << "Split. checkHoistValue false " << *V << "\n");
+ return true; // Not hoistable, split.
+ }
+ }
+ // If PrevConditionValues or ConditionValues is empty, don't split to avoid
+ // unnecessary splits at scopes with no branch/selects. If
+ // PrevConditionValues and ConditionValues don't intersect at all, split.
+ if (!PrevConditionValues.empty() && !ConditionValues.empty()) {
+ // Use std::set as DenseSet doesn't work with set_intersection.
+ std::set<Value *> PrevBases, Bases;
+ DenseMap<Value *, std::set<Value *>> Visited;
+ for (Value *V : PrevConditionValues) {
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
+ PrevBases.insert(BaseValues.begin(), BaseValues.end());
+ }
+ for (Value *V : ConditionValues) {
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
+ Bases.insert(BaseValues.begin(), BaseValues.end());
+ }
+ CHR_DEBUG(
+ dbgs() << "PrevBases ";
+ for (Value *V : PrevBases) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << " Bases ";
+ for (Value *V : Bases) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ std::vector<Value *> Intersection;
+ std::set_intersection(PrevBases.begin(), PrevBases.end(), Bases.begin(),
+ Bases.end(), std::back_inserter(Intersection));
+ if (Intersection.empty()) {
+ // Empty intersection, split.
+ CHR_DEBUG(dbgs() << "Split. Intersection empty\n");
+ return true;
+ }
+ }
+ CHR_DEBUG(dbgs() << "No split\n");
+ return false; // Don't split.
+}
+
+static void getSelectsInScope(CHRScope *Scope,
+ DenseSet<Instruction *> &Output) {
+ for (RegInfo &RI : Scope->RegInfos)
+ for (SelectInst *SI : RI.Selects)
+ Output.insert(SI);
+ for (CHRScope *Sub : Scope->Subs)
+ getSelectsInScope(Sub, Output);
+}
+
+void CHR::splitScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ assert(!Scope->BranchInsertPoint &&
+ "BranchInsertPoint must not be set");
+ DenseSet<Instruction *> Unhoistables;
+ getSelectsInScope(Scope, Unhoistables);
+ splitScope(Scope, nullptr, nullptr, nullptr, Output, Unhoistables);
+ }
+#ifndef NDEBUG
+ for (CHRScope *Scope : Output) {
+ assert(Scope->BranchInsertPoint && "BranchInsertPoint must be set");
+ }
+#endif
+}
+
+SmallVector<CHRScope *, 8> CHR::splitScope(
+ CHRScope *Scope,
+ CHRScope *Outer,
+ DenseSet<Value *> *OuterConditionValues,
+ Instruction *OuterInsertPoint,
+ SmallVectorImpl<CHRScope *> &Output,
+ DenseSet<Instruction *> &Unhoistables) {
+ if (Outer) {
+ assert(OuterConditionValues && "Null OuterConditionValues");
+ assert(OuterInsertPoint && "Null OuterInsertPoint");
+ }
+ bool PrevSplitFromOuter = true;
+ DenseSet<Value *> PrevConditionValues;
+ Instruction *PrevInsertPoint = nullptr;
+ SmallVector<CHRScope *, 8> Splits;
+ SmallVector<bool, 8> SplitsSplitFromOuter;
+ SmallVector<DenseSet<Value *>, 8> SplitsConditionValues;
+ SmallVector<Instruction *, 8> SplitsInsertPoints;
+ SmallVector<RegInfo, 8> RegInfos(Scope->RegInfos); // Copy
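+ // Walk the regions in order; whenever shouldSplit() fires, cut the scope at
+ // the current region and start accumulating a new split.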
+ for (RegInfo &RI : RegInfos) {
+ Instruction *InsertPoint = getBranchInsertPoint(RI);
+ DenseSet<Value *> ConditionValues = getCHRConditionValuesForRegion(RI);
+ CHR_DEBUG(
+ dbgs() << "ConditionValues ";
+ for (Value *V : ConditionValues) {
+ dbgs() << *V << ", ";
+ }
+ dbgs() << "\n");
+ if (RI.R == RegInfos[0].R) {
+ // First iteration. Check to see if we should split from the outer.
+ if (Outer) {
+ CHR_DEBUG(dbgs() << "Outer " << *Outer << "\n");
+ CHR_DEBUG(dbgs() << "Should split from outer at "
+ << RI.R->getNameStr() << "\n");
+ if (shouldSplit(OuterInsertPoint, *OuterConditionValues,
+ ConditionValues, DT, Unhoistables)) {
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "SplitScopeFromOuter",
+ RI.R->getEntry()->getTerminator())
+ << "Split scope from outer due to unhoistable branch/select "
+ << "and/or lack of common condition values";
+ });
+ } else {
+ // Not splitting from the outer. Use the outer bases and insert
+ // point. Union the bases.
+ PrevSplitFromOuter = false;
+ PrevConditionValues = *OuterConditionValues;
+ PrevConditionValues.insert(ConditionValues.begin(),
+ ConditionValues.end());
+ PrevInsertPoint = OuterInsertPoint;
+ }
+ } else {
+ CHR_DEBUG(dbgs() << "Outer null\n");
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ }
+ } else {
+ CHR_DEBUG(dbgs() << "Should split from prev at "
+ << RI.R->getNameStr() << "\n");
+ if (shouldSplit(PrevInsertPoint, PrevConditionValues, ConditionValues,
+ DT, Unhoistables)) {
+ CHRScope *Tail = Scope->split(RI.R);
+ Scopes.insert(Tail);
+ Splits.push_back(Scope);
+ SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
+ SplitsConditionValues.push_back(PrevConditionValues);
+ SplitsInsertPoints.push_back(PrevInsertPoint);
+ Scope = Tail;
+ PrevConditionValues = ConditionValues;
+ PrevInsertPoint = InsertPoint;
+ PrevSplitFromOuter = true;
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "SplitScopeFromPrev",
+ RI.R->getEntry()->getTerminator())
+ << "Split scope from previous due to unhoistable branch/select "
+ << "and/or lack of common condition values";
+ });
+ } else {
+ // Not splitting. Union the bases. Keep the hoist point.
+ PrevConditionValues.insert(ConditionValues.begin(), ConditionValues.end());
+ }
+ }
+ }
+ Splits.push_back(Scope);
+ SplitsSplitFromOuter.push_back(PrevSplitFromOuter);
+ SplitsConditionValues.push_back(PrevConditionValues);
+ assert(PrevInsertPoint && "Null PrevInsertPoint");
+ SplitsInsertPoints.push_back(PrevInsertPoint);
+ assert(Splits.size() == SplitsConditionValues.size() &&
+ Splits.size() == SplitsSplitFromOuter.size() &&
+ Splits.size() == SplitsInsertPoints.size() && "Mismatching sizes");
+ for (size_t I = 0; I < Splits.size(); ++I) {
+ CHRScope *Split = Splits[I];
+ DenseSet<Value *> &SplitConditionValues = SplitsConditionValues[I];
+ Instruction *SplitInsertPoint = SplitsInsertPoints[I];
+ SmallVector<CHRScope *, 8> NewSubs;
+ DenseSet<Instruction *> SplitUnhoistables;
+ getSelectsInScope(Split, SplitUnhoistables);
+ for (CHRScope *Sub : Split->Subs) {
+ SmallVector<CHRScope *, 8> SubSplits = splitScope(
+ Sub, Split, &SplitConditionValues, SplitInsertPoint, Output,
+ SplitUnhoistables);
llvm::append_range(NewSubs, SubSplits);
- }
- Split->Subs = NewSubs;
- }
- SmallVector<CHRScope *, 8> Result;
- for (size_t I = 0; I < Splits.size(); ++I) {
- CHRScope *Split = Splits[I];
- if (SplitsSplitFromOuter[I]) {
- // Split from the outer.
- Output.push_back(Split);
- Split->BranchInsertPoint = SplitsInsertPoints[I];
- CHR_DEBUG(dbgs() << "BranchInsertPoint " << *SplitsInsertPoints[I]
- << "\n");
- } else {
- // Connected to the outer.
- Result.push_back(Split);
- }
- }
- if (!Outer)
- assert(Result.empty() &&
- "If no outer (top-level), must return no nested ones");
- return Result;
-}
-
-void CHR::classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes) {
- for (CHRScope *Scope : Scopes) {
- assert(Scope->TrueBiasedRegions.empty() && Scope->FalseBiasedRegions.empty() && "Empty");
- classifyBiasedScopes(Scope, Scope);
- CHR_DEBUG(
- dbgs() << "classifyBiasedScopes " << *Scope << "\n";
- dbgs() << "TrueBiasedRegions ";
- for (Region *R : Scope->TrueBiasedRegions) {
- dbgs() << R->getNameStr() << ", ";
- }
- dbgs() << "\n";
- dbgs() << "FalseBiasedRegions ";
- for (Region *R : Scope->FalseBiasedRegions) {
- dbgs() << R->getNameStr() << ", ";
- }
- dbgs() << "\n";
- dbgs() << "TrueBiasedSelects ";
- for (SelectInst *SI : Scope->TrueBiasedSelects) {
- dbgs() << *SI << ", ";
- }
- dbgs() << "\n";
- dbgs() << "FalseBiasedSelects ";
- for (SelectInst *SI : Scope->FalseBiasedSelects) {
- dbgs() << *SI << ", ";
- }
- dbgs() << "\n";);
- }
-}
-
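-// Record the biased regions and selects of this scope and of its subscopes on
-// the outermost scope so that the scope tree is classified as a whole.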
-void CHR::classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope) {
- for (RegInfo &RI : Scope->RegInfos) {
- if (RI.HasBranch) {
- Region *R = RI.R;
+ }
+ Split->Subs = NewSubs;
+ }
+ SmallVector<CHRScope *, 8> Result;
+ for (size_t I = 0; I < Splits.size(); ++I) {
+ CHRScope *Split = Splits[I];
+ if (SplitsSplitFromOuter[I]) {
+ // Split from the outer.
+ Output.push_back(Split);
+ Split->BranchInsertPoint = SplitsInsertPoints[I];
+ CHR_DEBUG(dbgs() << "BranchInsertPoint " << *SplitsInsertPoints[I]
+ << "\n");
+ } else {
+ // Connected to the outer.
+ Result.push_back(Split);
+ }
+ }
+ if (!Outer)
+ assert(Result.empty() &&
+ "If no outer (top-level), must return no nested ones");
+ return Result;
+}
+
+void CHR::classifyBiasedScopes(SmallVectorImpl<CHRScope *> &Scopes) {
+ for (CHRScope *Scope : Scopes) {
+ assert(Scope->TrueBiasedRegions.empty() && Scope->FalseBiasedRegions.empty() && "Empty");
+ classifyBiasedScopes(Scope, Scope);
+ CHR_DEBUG(
+ dbgs() << "classifyBiasedScopes " << *Scope << "\n";
+ dbgs() << "TrueBiasedRegions ";
+ for (Region *R : Scope->TrueBiasedRegions) {
+ dbgs() << R->getNameStr() << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "FalseBiasedRegions ";
+ for (Region *R : Scope->FalseBiasedRegions) {
+ dbgs() << R->getNameStr() << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "TrueBiasedSelects ";
+ for (SelectInst *SI : Scope->TrueBiasedSelects) {
+ dbgs() << *SI << ", ";
+ }
+ dbgs() << "\n";
+ dbgs() << "FalseBiasedSelects ";
+ for (SelectInst *SI : Scope->FalseBiasedSelects) {
+ dbgs() << *SI << ", ";
+ }
+ dbgs() << "\n";);
+ }
+}
+
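+// Record the biased regions and selects of this scope and of its subscopes on
+// the outermost scope so that the scope tree is classified as a whole.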
+void CHR::classifyBiasedScopes(CHRScope *Scope, CHRScope *OutermostScope) {
+ for (RegInfo &RI : Scope->RegInfos) {
+ if (RI.HasBranch) {
+ Region *R = RI.R;
if (TrueBiasedRegionsGlobal.contains(R))
- OutermostScope->TrueBiasedRegions.insert(R);
+ OutermostScope->TrueBiasedRegions.insert(R);
else if (FalseBiasedRegionsGlobal.contains(R))
- OutermostScope->FalseBiasedRegions.insert(R);
- else
- llvm_unreachable("Must be biased");
- }
- for (SelectInst *SI : RI.Selects) {
+ OutermostScope->FalseBiasedRegions.insert(R);
+ else
+ llvm_unreachable("Must be biased");
+ }
+ for (SelectInst *SI : RI.Selects) {
if (TrueBiasedSelectsGlobal.contains(SI))
- OutermostScope->TrueBiasedSelects.insert(SI);
+ OutermostScope->TrueBiasedSelects.insert(SI);
else if (FalseBiasedSelectsGlobal.contains(SI))
- OutermostScope->FalseBiasedSelects.insert(SI);
- else
- llvm_unreachable("Must be biased");
- }
- }
- for (CHRScope *Sub : Scope->Subs) {
- classifyBiasedScopes(Sub, OutermostScope);
- }
-}
-
-static bool hasAtLeastTwoBiasedBranches(CHRScope *Scope) {
- unsigned NumBiased = Scope->TrueBiasedRegions.size() +
- Scope->FalseBiasedRegions.size() +
- Scope->TrueBiasedSelects.size() +
- Scope->FalseBiasedSelects.size();
- return NumBiased >= CHRMergeThreshold;
-}
-
-void CHR::filterScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- // Filter out the scopes with fewer than CHRMergeThreshold biased
- // branches or selects.
- if (!hasAtLeastTwoBiasedBranches(Scope)) {
- CHR_DEBUG(dbgs() << "Filtered out by biased branches truthy-regions "
- << Scope->TrueBiasedRegions.size()
- << " falsy-regions " << Scope->FalseBiasedRegions.size()
- << " true-selects " << Scope->TrueBiasedSelects.size()
- << " false-selects " << Scope->FalseBiasedSelects.size() << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE,
- "DropScopeWithOneBranchOrSelect",
- Scope->RegInfos[0].R->getEntry()->getTerminator())
- << "Drop scope with < "
- << ore::NV("CHRMergeThreshold", CHRMergeThreshold)
- << " biased branch(es) or select(s)";
- });
- continue;
- }
- Output.push_back(Scope);
- }
-}
-
-void CHR::setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- for (CHRScope *Scope : Input) {
- assert(Scope->HoistStopMap.empty() && Scope->CHRRegions.empty() &&
- "Empty");
- setCHRRegions(Scope, Scope);
- Output.push_back(Scope);
- CHR_DEBUG(
- dbgs() << "setCHRRegions HoistStopMap " << *Scope << "\n";
- for (auto pair : Scope->HoistStopMap) {
- Region *R = pair.first;
- dbgs() << "Region " << R->getNameStr() << "\n";
- for (Instruction *I : pair.second) {
- dbgs() << "HoistStop " << *I << "\n";
- }
- }
- dbgs() << "CHRRegions" << "\n";
- for (RegInfo &RI : Scope->CHRRegions) {
- dbgs() << RI.R->getNameStr() << "\n";
- });
- }
-}
-
-void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
- DenseSet<Instruction *> Unhoistables;
- // Put the biased selects in Unhoistables because they should stay where they
- // are and be constant-folded after CHR (in case one biased select or a branch
- // can depend on another biased select).
- for (RegInfo &RI : Scope->RegInfos) {
- for (SelectInst *SI : RI.Selects) {
- Unhoistables.insert(SI);
- }
- }
- Instruction *InsertPoint = OutermostScope->BranchInsertPoint;
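- // For each region, checkHoistValue() records in HoistStops the instructions at
- // which hoisting of its condition must stop; the region and its stops are then
- // registered on the outermost scope.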
- for (RegInfo &RI : Scope->RegInfos) {
- Region *R = RI.R;
- DenseSet<Instruction *> HoistStops;
- bool IsHoisted = false;
- if (RI.HasBranch) {
+ OutermostScope->FalseBiasedSelects.insert(SI);
+ else
+ llvm_unreachable("Must be biased");
+ }
+ }
+ for (CHRScope *Sub : Scope->Subs) {
+ classifyBiasedScopes(Sub, OutermostScope);
+ }
+}
+
+static bool hasAtLeastTwoBiasedBranches(CHRScope *Scope) {
+ unsigned NumBiased = Scope->TrueBiasedRegions.size() +
+ Scope->FalseBiasedRegions.size() +
+ Scope->TrueBiasedSelects.size() +
+ Scope->FalseBiasedSelects.size();
+ return NumBiased >= CHRMergeThreshold;
+}
+
+void CHR::filterScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ // Filter out the scopes with fewer than CHRMergeThreshold biased
+ // branches or selects.
+ if (!hasAtLeastTwoBiasedBranches(Scope)) {
+ CHR_DEBUG(dbgs() << "Filtered out by biased branches truthy-regions "
+ << Scope->TrueBiasedRegions.size()
+ << " falsy-regions " << Scope->FalseBiasedRegions.size()
+ << " true-selects " << Scope->TrueBiasedSelects.size()
+ << " false-selects " << Scope->FalseBiasedSelects.size() << "\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE,
+ "DropScopeWithOneBranchOrSelect",
+ Scope->RegInfos[0].R->getEntry()->getTerminator())
+ << "Drop scope with < "
+ << ore::NV("CHRMergeThreshold", CHRMergeThreshold)
+ << " biased branch(es) or select(s)";
+ });
+ continue;
+ }
+ Output.push_back(Scope);
+ }
+}
+
+void CHR::setCHRRegions(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ for (CHRScope *Scope : Input) {
+ assert(Scope->HoistStopMap.empty() && Scope->CHRRegions.empty() &&
+ "Empty");
+ setCHRRegions(Scope, Scope);
+ Output.push_back(Scope);
+ CHR_DEBUG(
+ dbgs() << "setCHRRegions HoistStopMap " << *Scope << "\n";
+ for (auto pair : Scope->HoistStopMap) {
+ Region *R = pair.first;
+ dbgs() << "Region " << R->getNameStr() << "\n";
+ for (Instruction *I : pair.second) {
+ dbgs() << "HoistStop " << *I << "\n";
+ }
+ }
+ dbgs() << "CHRRegions" << "\n";
+ for (RegInfo &RI : Scope->CHRRegions) {
+ dbgs() << RI.R->getNameStr() << "\n";
+ });
+ }
+}
+
+void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
+ DenseSet<Instruction *> Unhoistables;
+ // Put the biased selects in Unhoistables because they should stay where they
+ // are and be constant-folded after CHR (in case one biased select or a branch
+ // can depend on another biased select).
+ for (RegInfo &RI : Scope->RegInfos) {
+ for (SelectInst *SI : RI.Selects) {
+ Unhoistables.insert(SI);
+ }
+ }
+ Instruction *InsertPoint = OutermostScope->BranchInsertPoint;
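+ // For each region, checkHoistValue() records in HoistStops the instructions at
+ // which hoisting of its condition must stop; the region and its stops are then
+ // registered on the outermost scope.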
+ for (RegInfo &RI : Scope->RegInfos) {
+ Region *R = RI.R;
+ DenseSet<Instruction *> HoistStops;
+ bool IsHoisted = false;
+ if (RI.HasBranch) {
assert((OutermostScope->TrueBiasedRegions.contains(R) ||
OutermostScope->FalseBiasedRegions.contains(R)) &&
- "Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- // Note checkHoistValue fills in HoistStops.
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
- Unhoistables, &HoistStops, Visited);
- assert(IsHoistable && "Must be hoistable");
- (void)(IsHoistable); // Unused in release build
- IsHoisted = true;
- }
- for (SelectInst *SI : RI.Selects) {
+ "Must be truthy or falsy");
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ // Note checkHoistValue fills in HoistStops.
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(BI->getCondition(), InsertPoint, DT,
+ Unhoistables, &HoistStops, Visited);
+ assert(IsHoistable && "Must be hoistable");
+ (void)(IsHoistable); // Unused in release build
+ IsHoisted = true;
+ }
+ for (SelectInst *SI : RI.Selects) {
assert((OutermostScope->TrueBiasedSelects.contains(SI) ||
OutermostScope->FalseBiasedSelects.contains(SI)) &&
- "Must be true or false biased");
- // Note checkHoistValue fills in HoistStops.
- DenseMap<Instruction *, bool> Visited;
- bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint, DT,
- Unhoistables, &HoistStops, Visited);
- assert(IsHoistable && "Must be hoistable");
- (void)(IsHoistable); // Unused in release build
- IsHoisted = true;
- }
- if (IsHoisted) {
- OutermostScope->CHRRegions.push_back(RI);
- OutermostScope->HoistStopMap[R] = HoistStops;
- }
- }
- for (CHRScope *Sub : Scope->Subs)
- setCHRRegions(Sub, OutermostScope);
-}
-
-static bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
- return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
-}
-
-void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
- SmallVectorImpl<CHRScope *> &Output) {
- Output.resize(Input.size());
- llvm::copy(Input, Output.begin());
- llvm::stable_sort(Output, CHRScopeSorter);
-}
-
-// Hoist V (along with its operands) to the insert point, unless it has already
-// been hoisted, is a hoist stop, or already dominates the insert point.
-static void hoistValue(Value *V, Instruction *HoistPoint, Region *R,
- HoistStopMapTy &HoistStopMap,
- DenseSet<Instruction *> &HoistedSet,
- DenseSet<PHINode *> &TrivialPHIs,
- DominatorTree &DT) {
- auto IT = HoistStopMap.find(R);
- assert(IT != HoistStopMap.end() && "Region must be in hoist stop map");
- DenseSet<Instruction *> &HoistStops = IT->second;
- if (auto *I = dyn_cast<Instruction>(V)) {
- if (I == HoistPoint)
- return;
- if (HoistStops.count(I))
- return;
- if (auto *PN = dyn_cast<PHINode>(I))
- if (TrivialPHIs.count(PN))
- // The trivial phi inserted by the previous CHR scope could replace a
- // non-phi in HoistStops. Note that since this phi is at the exit of a
- // previous CHR scope, which dominates this scope, it's safe to stop
- // hoisting there.
- return;
- if (HoistedSet.count(I))
- // Already hoisted, return.
- return;
- assert(isHoistableInstructionType(I) && "Unhoistable instruction type");
- assert(DT.getNode(I->getParent()) && "DT must contain I's block");
- assert(DT.getNode(HoistPoint->getParent()) &&
- "DT must contain HoistPoint block");
- if (DT.dominates(I, HoistPoint))
- // We are already above the hoist point. Stop here. This may be necessary
- // when multiple scopes would independently hoist the same
- // instruction. Since an outer (dominating) scope hoists it to its entry
- // before an inner (dominated) scope would hoist it to its own entry, the
- // inner scope may see the instruction already hoisted. In that case it is
- // potentially wrong for the inner scope to hoist it again (it could create a
- // non-dominating def and thus bad IR), but it is safe to simply skip the
- // hoist because the instruction is already in a block that dominates the
- // inner scope.
- return;
- for (Value *Op : I->operands()) {
- hoistValue(Op, HoistPoint, R, HoistStopMap, HoistedSet, TrivialPHIs, DT);
- }
- I->moveBefore(HoistPoint);
- HoistedSet.insert(I);
- CHR_DEBUG(dbgs() << "hoistValue " << *I << "\n");
- }
-}
-
-// Hoist the dependent condition values of the branches and the selects in the
-// scope to the insert point.
-static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
- DenseSet<PHINode *> &TrivialPHIs,
- DominatorTree &DT) {
- DenseSet<Instruction *> HoistedSet;
- for (const RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
- if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
- HoistedSet, TrivialPHIs, DT);
- }
- for (SelectInst *SI : RI.Selects) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
- if (!(IsTrueBiased || IsFalseBiased))
- continue;
- hoistValue(SI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
- HoistedSet, TrivialPHIs, DT);
- }
- }
-}
-
-// Negate the predicate of an ICmp if it's used only by branches or selects, by
-// swapping the operands of the branches or the selects. Returns true on success.
-static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
- Instruction *ExcludedUser,
- CHRScope *Scope) {
- for (User *U : ICmp->users()) {
- if (U == ExcludedUser)
- continue;
- if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
- continue;
- if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
- continue;
- return false;
- }
- for (User *U : ICmp->users()) {
- if (U == ExcludedUser)
- continue;
- if (auto *BI = dyn_cast<BranchInst>(U)) {
- assert(BI->isConditional() && "Must be conditional");
- BI->swapSuccessors();
- // We don't need to swap this in terms of
- // TrueBiasedRegions/FalseBiasedRegions because true-biased/false-biased
- // mean whether the branch is likely to go into the if-then rather than
- // successor0/successor1, and because we can tell which edge is the then or
- // the else one by comparing the destination to the region exit block.
- continue;
- }
- if (auto *SI = dyn_cast<SelectInst>(U)) {
- // Swap operands
- SI->swapValues();
- SI->swapProfMetadata();
- if (Scope->TrueBiasedSelects.count(SI)) {
- assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
- "Must not be already in");
- Scope->FalseBiasedSelects.insert(SI);
- } else if (Scope->FalseBiasedSelects.count(SI)) {
- assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
- "Must not be already in");
- Scope->TrueBiasedSelects.insert(SI);
- }
- continue;
- }
- llvm_unreachable("Must be a branch or a select");
- }
- ICmp->setPredicate(CmpInst::getInversePredicate(ICmp->getPredicate()));
- return true;
-}
-
-// A helper for transformScopes. Insert a trivial phi at the scope exit block
-// for a value that's defined in the scope but used outside it (meaning it's
-// alive at the exit block).
-static void insertTrivialPHIs(CHRScope *Scope,
- BasicBlock *EntryBlock, BasicBlock *ExitBlock,
- DenseSet<PHINode *> &TrivialPHIs) {
- SmallSetVector<BasicBlock *, 8> BlocksInScope;
- for (RegInfo &RI : Scope->RegInfos) {
- for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
- // sub-Scopes.
- BlocksInScope.insert(BB);
- }
- }
- CHR_DEBUG({
- dbgs() << "Inserting redundant phis\n";
- for (BasicBlock *BB : BlocksInScope)
- dbgs() << "BlockInScope " << BB->getName() << "\n";
- });
- for (BasicBlock *BB : BlocksInScope) {
- for (Instruction &I : *BB) {
- SmallVector<Instruction *, 8> Users;
- for (User *U : I.users()) {
- if (auto *UI = dyn_cast<Instruction>(U)) {
- if (BlocksInScope.count(UI->getParent()) == 0 &&
- // Unless there's already a phi for I at the exit block.
- !(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
- CHR_DEBUG(dbgs() << "V " << I << "\n");
- CHR_DEBUG(dbgs() << "Used outside scope by user " << *UI << "\n");
- Users.push_back(UI);
- } else if (UI->getParent() == EntryBlock && isa<PHINode>(UI)) {
- // There's a loop backedge from a block that's dominated by this
- // scope to the entry block.
- CHR_DEBUG(dbgs() << "V " << I << "\n");
- CHR_DEBUG(dbgs()
- << "Used at entry block (for a back edge) by a phi user "
- << *UI << "\n");
- Users.push_back(UI);
- }
- }
- }
- if (Users.size() > 0) {
- // Insert a trivial phi for I (phi [&I, P0], [&I, P1], ...) at
- // ExitBlock. Replace I with the new phi in UI unless UI is another
- // phi at ExitBlock.
+ "Must be true or false biased");
+ // Note checkHoistValue fills in HoistStops.
+ DenseMap<Instruction *, bool> Visited;
+ bool IsHoistable = checkHoistValue(SI->getCondition(), InsertPoint, DT,
+ Unhoistables, &HoistStops, Visited);
+ assert(IsHoistable && "Must be hoistable");
+ (void)(IsHoistable); // Unused in release build
+ IsHoisted = true;
+ }
+ if (IsHoisted) {
+ OutermostScope->CHRRegions.push_back(RI);
+ OutermostScope->HoistStopMap[R] = HoistStops;
+ }
+ }
+ for (CHRScope *Sub : Scope->Subs)
+ setCHRRegions(Sub, OutermostScope);
+}
+
+static bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
+ return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
+}
+
+void CHR::sortScopes(SmallVectorImpl<CHRScope *> &Input,
+ SmallVectorImpl<CHRScope *> &Output) {
+ Output.resize(Input.size());
+ llvm::copy(Input, Output.begin());
+ llvm::stable_sort(Output, CHRScopeSorter);
+}
+
+// Hoist V (along with its operands) to the insert point, unless it has already
+// been hoisted, is a hoist stop, or already dominates the insert point.
+static void hoistValue(Value *V, Instruction *HoistPoint, Region *R,
+ HoistStopMapTy &HoistStopMap,
+ DenseSet<Instruction *> &HoistedSet,
+ DenseSet<PHINode *> &TrivialPHIs,
+ DominatorTree &DT) {
+ auto IT = HoistStopMap.find(R);
+ assert(IT != HoistStopMap.end() && "Region must be in hoist stop map");
+ DenseSet<Instruction *> &HoistStops = IT->second;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (I == HoistPoint)
+ return;
+ if (HoistStops.count(I))
+ return;
+ if (auto *PN = dyn_cast<PHINode>(I))
+ if (TrivialPHIs.count(PN))
+ // The trivial phi inserted by the previous CHR scope could replace a
+ // non-phi in HoistStops. Note that since this phi is at the exit of a
+ // previous CHR scope, which dominates this scope, it's safe to stop
+ // hoisting there.
+ return;
+ if (HoistedSet.count(I))
+ // Already hoisted, return.
+ return;
+ assert(isHoistableInstructionType(I) && "Unhoistable instruction type");
+ assert(DT.getNode(I->getParent()) && "DT must contain I's block");
+ assert(DT.getNode(HoistPoint->getParent()) &&
+ "DT must contain HoistPoint block");
+ if (DT.dominates(I, HoistPoint))
+ // We are already above the hoist point. Stop here. This may be necessary
+ // when multiple scopes would independently hoist the same
+ // instruction. Since an outer (dominating) scope hoists it to its entry
+ // before an inner (dominated) scope would hoist it to its own entry, the
+ // inner scope may see the instruction already hoisted. In that case it is
+ // potentially wrong for the inner scope to hoist it again (it could create a
+ // non-dominating def and thus bad IR), but it is safe to simply skip the
+ // hoist because the instruction is already in a block that dominates the
+ // inner scope.
+ return;
+ for (Value *Op : I->operands()) {
+ hoistValue(Op, HoistPoint, R, HoistStopMap, HoistedSet, TrivialPHIs, DT);
+ }
+ I->moveBefore(HoistPoint);
+ HoistedSet.insert(I);
+ CHR_DEBUG(dbgs() << "hoistValue " << *I << "\n");
+ }
+}
+
+// Hoist the dependent condition values of the branches and the selects in the
+// scope to the insert point.
+static void hoistScopeConditions(CHRScope *Scope, Instruction *HoistPoint,
+ DenseSet<PHINode *> &TrivialPHIs,
+ DominatorTree &DT) {
+ DenseSet<Instruction *> HoistedSet;
+ for (const RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+ if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ hoistValue(BI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+ HoistedSet, TrivialPHIs, DT);
+ }
+ for (SelectInst *SI : RI.Selects) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+ if (!(IsTrueBiased || IsFalseBiased))
+ continue;
+ hoistValue(SI->getCondition(), HoistPoint, R, Scope->HoistStopMap,
+ HoistedSet, TrivialPHIs, DT);
+ }
+ }
+}
+
+// Negate the predicate of an ICmp if it's used only by branches or selects, by
+// swapping the operands of the branches or the selects. Returns true on success.
+static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
+ Instruction *ExcludedUser,
+ CHRScope *Scope) {
+ for (User *U : ICmp->users()) {
+ if (U == ExcludedUser)
+ continue;
+ if (isa<BranchInst>(U) && cast<BranchInst>(U)->isConditional())
+ continue;
+ if (isa<SelectInst>(U) && cast<SelectInst>(U)->getCondition() == ICmp)
+ continue;
+ return false;
+ }
+ for (User *U : ICmp->users()) {
+ if (U == ExcludedUser)
+ continue;
+ if (auto *BI = dyn_cast<BranchInst>(U)) {
+ assert(BI->isConditional() && "Must be conditional");
+ BI->swapSuccessors();
+ // We don't need to swap this in terms of
+ // TrueBiasedRegions/FalseBiasedRegions because true-biased/false-biased
+ // mean whether the branch is likely to go into the if-then rather than
+ // successor0/successor1, and because we can tell which edge is the then or
+ // the else one by comparing the destination to the region exit block.
+ continue;
+ }
+ if (auto *SI = dyn_cast<SelectInst>(U)) {
+ // Swap operands
+ SI->swapValues();
+ SI->swapProfMetadata();
+ if (Scope->TrueBiasedSelects.count(SI)) {
+ assert(Scope->FalseBiasedSelects.count(SI) == 0 &&
+ "Must not be already in");
+ Scope->FalseBiasedSelects.insert(SI);
+ } else if (Scope->FalseBiasedSelects.count(SI)) {
+ assert(Scope->TrueBiasedSelects.count(SI) == 0 &&
+ "Must not be already in");
+ Scope->TrueBiasedSelects.insert(SI);
+ }
+ continue;
+ }
+ llvm_unreachable("Must be a branch or a select");
+ }
+ ICmp->setPredicate(CmpInst::getInversePredicate(ICmp->getPredicate()));
+ return true;
+}
+
+// A helper for transformScopes. Insert a trivial phi at the scope exit block
+// for a value that's defined in the scope but used outside it (meaning it's
+// alive at the exit block).
+static void insertTrivialPHIs(CHRScope *Scope,
+ BasicBlock *EntryBlock, BasicBlock *ExitBlock,
+ DenseSet<PHINode *> &TrivialPHIs) {
+ SmallSetVector<BasicBlock *, 8> BlocksInScope;
+ for (RegInfo &RI : Scope->RegInfos) {
+ for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+ // sub-Scopes.
+ BlocksInScope.insert(BB);
+ }
+ }
+ CHR_DEBUG({
+ dbgs() << "Inserting redundant phis\n";
+ for (BasicBlock *BB : BlocksInScope)
+ dbgs() << "BlockInScope " << BB->getName() << "\n";
+ });
+ for (BasicBlock *BB : BlocksInScope) {
+ for (Instruction &I : *BB) {
+ SmallVector<Instruction *, 8> Users;
+ for (User *U : I.users()) {
+ if (auto *UI = dyn_cast<Instruction>(U)) {
+ if (BlocksInScope.count(UI->getParent()) == 0 &&
+ // Unless there's already a phi for I at the exit block.
+ !(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
+ CHR_DEBUG(dbgs() << "V " << I << "\n");
+ CHR_DEBUG(dbgs() << "Used outside scope by user " << *UI << "\n");
+ Users.push_back(UI);
+ } else if (UI->getParent() == EntryBlock && isa<PHINode>(UI)) {
+ // There's a loop backedge from a block that's dominated by this
+ // scope to the entry block.
+ CHR_DEBUG(dbgs() << "V " << I << "\n");
+ CHR_DEBUG(dbgs()
+ << "Used at entry block (for a back edge) by a phi user "
+ << *UI << "\n");
+ Users.push_back(UI);
+ }
+ }
+ }
+ if (Users.size() > 0) {
+ // Insert a trivial phi for I (phi [&I, P0], [&I, P1], ...) at
+ // ExitBlock. Replace I with the new phi in UI unless UI is another
+ // phi at ExitBlock.
PHINode *PN = PHINode::Create(I.getType(), pred_size(ExitBlock), "",
- &ExitBlock->front());
- for (BasicBlock *Pred : predecessors(ExitBlock)) {
- PN->addIncoming(&I, Pred);
- }
- TrivialPHIs.insert(PN);
- CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n");
- for (Instruction *UI : Users) {
- for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps; ++J) {
- if (UI->getOperand(J) == &I) {
- UI->setOperand(J, PN);
- }
- }
- CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n");
- }
- }
- }
- }
-}
-
-// Assert that all the CHR regions of the scope have a biased branch or select.
-static void LLVM_ATTRIBUTE_UNUSED
-assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
-#ifndef NDEBUG
- auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
- if (Scope->TrueBiasedRegions.count(RI.R) ||
- Scope->FalseBiasedRegions.count(RI.R))
- return true;
- for (SelectInst *SI : RI.Selects)
- if (Scope->TrueBiasedSelects.count(SI) ||
- Scope->FalseBiasedSelects.count(SI))
- return true;
- return false;
- };
- for (RegInfo &RI : Scope->CHRRegions) {
- assert(HasBiasedBranchOrSelect(RI, Scope) &&
- "Must have biased branch or select");
- }
-#endif
-}
-
-// Assert that all the condition values of the biased branches and selects have
-// been hoisted to the pre-entry block or outside of the scope.
-static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
- CHRScope *Scope, BasicBlock *PreEntryBlock) {
- CHR_DEBUG(dbgs() << "Biased regions condition values \n");
- for (RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
- if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- Value *V = BI->getCondition();
- CHR_DEBUG(dbgs() << *V << "\n");
- if (auto *I = dyn_cast<Instruction>(V)) {
- (void)(I); // Unused in release build.
- assert((I->getParent() == PreEntryBlock ||
- !Scope->contains(I)) &&
- "Must have been hoisted to PreEntryBlock or outside the scope");
- }
- }
- for (SelectInst *SI : RI.Selects) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
- if (!(IsTrueBiased || IsFalseBiased))
- continue;
- Value *V = SI->getCondition();
- CHR_DEBUG(dbgs() << *V << "\n");
- if (auto *I = dyn_cast<Instruction>(V)) {
- (void)(I); // Unused in release build.
- assert((I->getParent() == PreEntryBlock ||
- !Scope->contains(I)) &&
- "Must have been hoisted to PreEntryBlock or outside the scope");
- }
- }
- }
-}
-
-void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
- CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
-
- assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
- Region *FirstRegion = Scope->RegInfos[0].R;
- BasicBlock *EntryBlock = FirstRegion->getEntry();
- Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
- BasicBlock *ExitBlock = LastRegion->getExit();
- Optional<uint64_t> ProfileCount = BFI.getBlockProfileCount(EntryBlock);
-
- if (ExitBlock) {
- // Insert a trivial phi at the exit block (where the CHR hot path and the
- // cold path merges) for a value that's defined in the scope but used
- // outside it (meaning it's alive at the exit block). We will add the
- // incoming values for the CHR cold paths to it below. Without this, we'd
- // miss updating phi's for such values unless there happens to already be a
- // phi for that value there.
- insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs);
- }
-
- // Split the entry block of the first region. The new block becomes the new
- // entry block of the first region. The old entry block becomes the block to
- // insert the CHR branch into. Note that DT gets updated through the split.
- // Because we update the entry of the first region after the split, and because
- // Region only points to the entry and the exit blocks rather than keeping
- // everything in a list or set, the block membership and the entry/exit blocks
- // of the region are still valid after the split.
- CHR_DEBUG(dbgs() << "Splitting entry block " << EntryBlock->getName()
- << " at " << *Scope->BranchInsertPoint << "\n");
- BasicBlock *NewEntryBlock =
- SplitBlock(EntryBlock, Scope->BranchInsertPoint, &DT);
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- FirstRegion->replaceEntryRecursive(NewEntryBlock);
- BasicBlock *PreEntryBlock = EntryBlock;
-
- ValueToValueMapTy VMap;
- // Clone the blocks in the scope (excluding the PreEntryBlock) to split into a
- // hot path (originals) and a cold path (clones) and update the PHIs at the
- // exit block.
- cloneScopeBlocks(Scope, PreEntryBlock, ExitBlock, LastRegion, VMap);
-
- // Replace the old (placeholder) branch with the new (merged) conditional
- // branch.
- BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
- NewEntryBlock, VMap);
-
-#ifndef NDEBUG
- assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
-#endif
-
- // Hoist the conditional values of the branches/selects.
- hoistScopeConditions(Scope, PreEntryBlock->getTerminator(), TrivialPHIs, DT);
-
-#ifndef NDEBUG
- assertBranchOrSelectConditionHoisted(Scope, PreEntryBlock);
-#endif
-
- // Create the combined branch condition and constant-fold the branches/selects
- // in the hot path.
- fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
- ProfileCount ? ProfileCount.getValue() : 0);
-}
-
-// A helper for transformScopes. Clone the blocks in the scope (excluding the
-// PreEntryBlock) to split into a hot path and a cold path and update the PHIs
-// at the exit block.
-void CHR::cloneScopeBlocks(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BasicBlock *ExitBlock,
- Region *LastRegion,
- ValueToValueMapTy &VMap) {
- // Clone all the blocks. The original blocks will be the hot-path
- // CHR-optimized code and the cloned blocks will be the original unoptimized
- // code. This is so that the block pointers from the
- // CHRScope/Region/RegionInfo stay valid, pointing to the hot-path code to
- // which CHR should apply.
- SmallVector<BasicBlock*, 8> NewBlocks;
- for (RegInfo &RI : Scope->RegInfos)
- for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
- // sub-Scopes.
- assert(BB != PreEntryBlock && "Don't copy the pre-entry block");
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F);
- NewBlocks.push_back(NewBB);
- VMap[BB] = NewBB;
- }
-
- // Place the cloned blocks right after the original blocks (right before the
- // exit block).
- if (ExitBlock)
- F.getBasicBlockList().splice(ExitBlock->getIterator(),
- F.getBasicBlockList(),
- NewBlocks[0]->getIterator(), F.end());
-
- // Update the cloned blocks/instructions to refer to themselves.
- for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
- for (Instruction &I : *NewBlocks[i])
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Add the cloned blocks to the PHIs of the exit blocks. ExitBlock is null for
- // the top-level region but we don't need to add PHIs. The trivial PHIs
- // inserted above will be updated here.
- if (ExitBlock)
- for (PHINode &PN : ExitBlock->phis())
- for (unsigned I = 0, NumOps = PN.getNumIncomingValues(); I < NumOps;
- ++I) {
- BasicBlock *Pred = PN.getIncomingBlock(I);
- if (LastRegion->contains(Pred)) {
- Value *V = PN.getIncomingValue(I);
- auto It = VMap.find(V);
- if (It != VMap.end()) V = It->second;
- assert(VMap.find(Pred) != VMap.end() && "Pred must have been cloned");
- PN.addIncoming(V, cast<BasicBlock>(VMap[Pred]));
- }
- }
-}
-
-// A helper for transformScope. Replace the old (placeholder) branch with the
-// new (merged) conditional branch.
-BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
- BasicBlock *EntryBlock,
- BasicBlock *NewEntryBlock,
- ValueToValueMapTy &VMap) {
- BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
- assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
- "SplitBlock did not work correctly!");
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- assert(VMap.find(NewEntryBlock) != VMap.end() &&
- "NewEntryBlock must have been copied");
- OldBR->dropAllReferences();
- OldBR->eraseFromParent();
- // The true predicate is a placeholder. It will be replaced later in
- // fixupBranchesAndSelects().
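- // Successor 0 is the hot (original) path and successor 1 is the cloned cold
- // path.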
- BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
- cast<BasicBlock>(VMap[NewEntryBlock]),
- ConstantInt::getTrue(F.getContext()));
- PreEntryBlock->getInstList().push_back(NewBR);
- assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
- "NewEntryBlock's only pred must be EntryBlock");
- return NewBR;
-}
-
-// A helper for transformScopes. Create the combined branch condition and
-// constant-fold the branches/selects in the hot path.
-void CHR::fixupBranchesAndSelects(CHRScope *Scope,
- BasicBlock *PreEntryBlock,
- BranchInst *MergedBR,
- uint64_t ProfileCount) {
- Value *MergedCondition = ConstantInt::getTrue(F.getContext());
- BranchProbability CHRBranchBias(1, 1);
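- // CHRBranchBias starts at 100% and fixupBranch/fixupSelect lower it to the
- // minimum bias among the merged branches/selects; that minimum becomes the
- // weight of the merged CHR branch.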
- uint64_t NumCHRedBranches = 0;
- IRBuilder<> IRB(PreEntryBlock->getTerminator());
- for (RegInfo &RI : Scope->CHRRegions) {
- Region *R = RI.R;
- if (RI.HasBranch) {
- fixupBranch(R, Scope, IRB, MergedCondition, CHRBranchBias);
- ++NumCHRedBranches;
- }
- for (SelectInst *SI : RI.Selects) {
- fixupSelect(SI, Scope, IRB, MergedCondition, CHRBranchBias);
- ++NumCHRedBranches;
- }
- }
- Stats.NumBranchesDelta += NumCHRedBranches - 1;
- Stats.WeightedNumBranchesDelta += (NumCHRedBranches - 1) * ProfileCount;
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE,
- "CHR",
- // Refer to the hot (original) path
- MergedBR->getSuccessor(0)->getTerminator())
- << "Merged " << ore::NV("NumCHRedBranches", NumCHRedBranches)
- << " branches or selects";
- });
- MergedBR->setCondition(MergedCondition);
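- // Scale the combined bias to weights out of 1000 for the branch_weights
- // metadata on the merged branch.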
- uint32_t Weights[] = {
- static_cast<uint32_t>(CHRBranchBias.scale(1000)),
- static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
- };
- MDBuilder MDB(F.getContext());
- MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
- CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
- << "\n");
-}
-
-// A helper for fixupBranchesAndSelects. Add to the combined branch condition
-// and constant-fold a branch in the hot path.
-void CHR::fixupBranch(Region *R, CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition,
- BranchProbability &CHRBranchBias) {
- bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
- assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) &&
- "Must be truthy or falsy");
- auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
- assert(BranchBiasMap.find(R) != BranchBiasMap.end() &&
- "Must be in the bias map");
- BranchProbability Bias = BranchBiasMap[R];
- assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
- // Take the min.
- if (CHRBranchBias > Bias)
- CHRBranchBias = Bias;
- BasicBlock *IfThen = BI->getSuccessor(1);
- BasicBlock *IfElse = BI->getSuccessor(0);
- BasicBlock *RegionExitBlock = R->getExit();
- assert(RegionExitBlock && "Null ExitBlock");
- assert((IfThen == RegionExitBlock || IfElse == RegionExitBlock) &&
- IfThen != IfElse && "Invariant from findScopes");
- if (IfThen == RegionExitBlock) {
- // Swap them so that IfThen means going into it and IfElse means skipping
- // it.
- std::swap(IfThen, IfElse);
- }
- CHR_DEBUG(dbgs() << "IfThen " << IfThen->getName()
- << " IfElse " << IfElse->getName() << "\n");
- Value *Cond = BI->getCondition();
- BasicBlock *HotTarget = IsTrueBiased ? IfThen : IfElse;
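- // ConditionTrue records whether reaching the hot target corresponds to the
- // original branch condition being true; it decides how Cond enters the merged
- // condition and which constant the branch is folded to.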
- bool ConditionTrue = HotTarget == BI->getSuccessor(0);
- addToMergedCondition(ConditionTrue, Cond, BI, Scope, IRB,
- MergedCondition);
- // Constant-fold the branch at ClonedEntryBlock.
- assert(ConditionTrue == (HotTarget == BI->getSuccessor(0)) &&
- "The successor shouldn't change");
- Value *NewCondition = ConditionTrue ?
- ConstantInt::getTrue(F.getContext()) :
- ConstantInt::getFalse(F.getContext());
- BI->setCondition(NewCondition);
-}
-
-// A helper for fixupBranchesAndSelects. Add to the combined branch condition
-// and constant-fold a select in the hot path.
-void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition,
- BranchProbability &CHRBranchBias) {
- bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
- assert((IsTrueBiased ||
- Scope->FalseBiasedSelects.count(SI)) && "Must be biased");
- assert(SelectBiasMap.find(SI) != SelectBiasMap.end() &&
- "Must be in the bias map");
- BranchProbability Bias = SelectBiasMap[SI];
- assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
- // Take the min.
- if (CHRBranchBias > Bias)
- CHRBranchBias = Bias;
- Value *Cond = SI->getCondition();
- addToMergedCondition(IsTrueBiased, Cond, SI, Scope, IRB,
- MergedCondition);
- Value *NewCondition = IsTrueBiased ?
- ConstantInt::getTrue(F.getContext()) :
- ConstantInt::getFalse(F.getContext());
- SI->setCondition(NewCondition);
-}
-
-// A helper for fixupBranch/fixupSelect. Add a branch condition to the merged
-// condition.
-void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
- Instruction *BranchOrSelect,
- CHRScope *Scope,
- IRBuilder<> &IRB,
- Value *&MergedCondition) {
- if (IsTrueBiased) {
- MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
- } else {
- // If Cond is an icmp and all of its users except for BranchOrSelect are
- // branches or selects, negate the icmp predicate and swap the branch/select
- // targets, which avoids inserting an xor to negate Cond.
- bool Done = false;
- if (auto *ICmp = dyn_cast<ICmpInst>(Cond))
- if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) {
- MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
- Done = true;
- }
- if (!Done) {
- Value *Negate = IRB.CreateXor(
- ConstantInt::getTrue(F.getContext()), Cond);
- MergedCondition = IRB.CreateAnd(MergedCondition, Negate);
- }
- }
-}
-
-void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
- unsigned I = 0;
- DenseSet<PHINode *> TrivialPHIs;
- for (CHRScope *Scope : CHRScopes) {
- transformScopes(Scope, TrivialPHIs);
- CHR_DEBUG(
- std::ostringstream oss;
- oss << " after transformScopes " << I++;
- dumpIR(F, oss.str().c_str(), nullptr));
- (void)I;
- }
-}
-
-static void LLVM_ATTRIBUTE_UNUSED
-dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
- dbgs() << Label << " " << Scopes.size() << "\n";
- for (CHRScope *Scope : Scopes) {
- dbgs() << *Scope << "\n";
- }
-}
-
-bool CHR::run() {
- if (!shouldApply(F, PSI))
- return false;
-
- CHR_DEBUG(dumpIR(F, "before", nullptr));
-
- bool Changed = false;
- {
- CHR_DEBUG(
- dbgs() << "RegionInfo:\n";
- RI.print(dbgs()));
-
- // Recursively traverse the region tree and find regions that have biased
- // branches and/or selects and create scopes.
- SmallVector<CHRScope *, 8> AllScopes;
- findScopes(AllScopes);
- CHR_DEBUG(dumpScopes(AllScopes, "All scopes"));
-
- // Split the scopes if 1) the condition values of the biased
- // branches/selects of the inner/lower scope can't be hoisted up to the
- // outermost/uppermost scope entry, or 2) the condition values of the biased
- // branches/selects in a scope (including subscopes) don't share at least
- // one common value.
- SmallVector<CHRScope *, 8> SplitScopes;
- splitScopes(AllScopes, SplitScopes);
- CHR_DEBUG(dumpScopes(SplitScopes, "Split scopes"));
-
- // After splitting, set the biased regions and selects of a scope (a tree
- // root) that include those of the subscopes.
- classifyBiasedScopes(SplitScopes);
- CHR_DEBUG(dbgs() << "Set per-scope bias " << SplitScopes.size() << "\n");
-
- // Filter out the scopes that have only one biased region or select (CHR
- // isn't useful in such a case).
- SmallVector<CHRScope *, 8> FilteredScopes;
- filterScopes(SplitScopes, FilteredScopes);
- CHR_DEBUG(dumpScopes(FilteredScopes, "Filtered scopes"));
-
- // Set the regions to be CHR'ed and their hoist stops for each scope.
- SmallVector<CHRScope *, 8> SetScopes;
- setCHRRegions(FilteredScopes, SetScopes);
- CHR_DEBUG(dumpScopes(SetScopes, "Set CHR regions"));
-
- // Sort CHRScopes by depth so that outer CHRScopes come before inner
- // ones. We need to apply CHR from outer to inner so that we apply CHR only
- // to the hot path, rather than both hot and cold paths.
- SmallVector<CHRScope *, 8> SortedScopes;
- sortScopes(SetScopes, SortedScopes);
- CHR_DEBUG(dumpScopes(SortedScopes, "Sorted scopes"));
-
- CHR_DEBUG(
- dbgs() << "RegionInfo:\n";
- RI.print(dbgs()));
-
- // Apply the CHR transformation.
- if (!SortedScopes.empty()) {
- transformScopes(SortedScopes);
- Changed = true;
- }
- }
-
- if (Changed) {
- CHR_DEBUG(dumpIR(F, "after", &Stats));
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Stats", &F)
- << ore::NV("Function", &F) << " "
- << "Reduced the number of branches in hot paths by "
- << ore::NV("NumBranchesDelta", Stats.NumBranchesDelta)
- << " (static) and "
- << ore::NV("WeightedNumBranchesDelta", Stats.WeightedNumBranchesDelta)
- << " (weighted by PGO count)";
- });
- }
-
- return Changed;
-}
-
-bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ProfileSummaryInfo &PSI =
- getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
- std::make_unique<OptimizationRemarkEmitter>(&F);
- return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run();
-}
-
-namespace llvm {
-
-ControlHeightReductionPass::ControlHeightReductionPass() {
- parseCHRFilterFiles();
-}
-
-PreservedAnalyses ControlHeightReductionPass::run(
- Function &F,
- FunctionAnalysisManager &FAM) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto &PSI = *MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
- if (!Changed)
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-} // namespace llvm
+ &ExitBlock->front());
+ for (BasicBlock *Pred : predecessors(ExitBlock)) {
+ PN->addIncoming(&I, Pred);
+ }
+ TrivialPHIs.insert(PN);
+ CHR_DEBUG(dbgs() << "Insert phi " << *PN << "\n");
+ for (Instruction *UI : Users) {
+ for (unsigned J = 0, NumOps = UI->getNumOperands(); J < NumOps; ++J) {
+ if (UI->getOperand(J) == &I) {
+ UI->setOperand(J, PN);
+ }
+ }
+ CHR_DEBUG(dbgs() << "Updated user " << *UI << "\n");
+ }
+ }
+ }
+ }
+}
+
+// Assert that all the CHR regions of the scope have a biased branch or select.
+static void LLVM_ATTRIBUTE_UNUSED
+assertCHRRegionsHaveBiasedBranchOrSelect(CHRScope *Scope) {
+#ifndef NDEBUG
+ auto HasBiasedBranchOrSelect = [](RegInfo &RI, CHRScope *Scope) {
+ if (Scope->TrueBiasedRegions.count(RI.R) ||
+ Scope->FalseBiasedRegions.count(RI.R))
+ return true;
+ for (SelectInst *SI : RI.Selects)
+ if (Scope->TrueBiasedSelects.count(SI) ||
+ Scope->FalseBiasedSelects.count(SI))
+ return true;
+ return false;
+ };
+ for (RegInfo &RI : Scope->CHRRegions) {
+ assert(HasBiasedBranchOrSelect(RI, Scope) &&
+ "Must have biased branch or select");
+ }
+#endif
+}
+
+// Assert that all the condition values of the biased branches and selects have
+// been hoisted to the pre-entry block or outside of the scope.
+static void LLVM_ATTRIBUTE_UNUSED assertBranchOrSelectConditionHoisted(
+ CHRScope *Scope, BasicBlock *PreEntryBlock) {
+ CHR_DEBUG(dbgs() << "Biased regions condition values \n");
+ for (RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ bool IsFalseBiased = Scope->FalseBiasedRegions.count(R);
+ if (RI.HasBranch && (IsTrueBiased || IsFalseBiased)) {
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ Value *V = BI->getCondition();
+ CHR_DEBUG(dbgs() << *V << "\n");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ (void)(I); // Unused in release build.
+ assert((I->getParent() == PreEntryBlock ||
+ !Scope->contains(I)) &&
+ "Must have been hoisted to PreEntryBlock or outside the scope");
+ }
+ }
+ for (SelectInst *SI : RI.Selects) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ bool IsFalseBiased = Scope->FalseBiasedSelects.count(SI);
+ if (!(IsTrueBiased || IsFalseBiased))
+ continue;
+ Value *V = SI->getCondition();
+ CHR_DEBUG(dbgs() << *V << "\n");
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ (void)(I); // Unused in release build.
+ assert((I->getParent() == PreEntryBlock ||
+ !Scope->contains(I)) &&
+ "Must have been hoisted to PreEntryBlock or outside the scope");
+ }
+ }
+ }
+}
+
+void CHR::transformScopes(CHRScope *Scope, DenseSet<PHINode *> &TrivialPHIs) {
+ CHR_DEBUG(dbgs() << "transformScopes " << *Scope << "\n");
+
+ assert(Scope->RegInfos.size() >= 1 && "Should have at least one Region");
+ Region *FirstRegion = Scope->RegInfos[0].R;
+ BasicBlock *EntryBlock = FirstRegion->getEntry();
+ Region *LastRegion = Scope->RegInfos[Scope->RegInfos.size() - 1].R;
+ BasicBlock *ExitBlock = LastRegion->getExit();
+ Optional<uint64_t> ProfileCount = BFI.getBlockProfileCount(EntryBlock);
+
+ if (ExitBlock) {
+    // Insert a trivial phi at the exit block (where the CHR hot path and the
+    // cold path merge) for each value that is defined in the scope but used
+    // outside of it (i.e. live at the exit block). The incoming values for the
+    // CHR cold paths are added to it below. Without this, we would miss
+    // updating the PHIs for such values unless a phi for that value already
+    // happens to be there.
+ insertTrivialPHIs(Scope, EntryBlock, ExitBlock, TrivialPHIs);
+ }
+
+  // Split the entry block of the first region. The new block becomes the new
+  // entry block of the first region, and the old entry block becomes the block
+  // into which the CHR branch is inserted. Note that DT is updated by the
+  // split and that we update the entry of the first region right after the
+  // split. Because a Region only points to its entry and exit blocks, rather
+  // than keeping every block in a list or set, the block membership and the
+  // entry/exit blocks of the region remain valid after the split.
+ CHR_DEBUG(dbgs() << "Splitting entry block " << EntryBlock->getName()
+ << " at " << *Scope->BranchInsertPoint << "\n");
+ BasicBlock *NewEntryBlock =
+ SplitBlock(EntryBlock, Scope->BranchInsertPoint, &DT);
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ FirstRegion->replaceEntryRecursive(NewEntryBlock);
+ BasicBlock *PreEntryBlock = EntryBlock;
+
+ ValueToValueMapTy VMap;
+ // Clone the blocks in the scope (excluding the PreEntryBlock) to split into a
+ // hot path (originals) and a cold path (clones) and update the PHIs at the
+ // exit block.
+ cloneScopeBlocks(Scope, PreEntryBlock, ExitBlock, LastRegion, VMap);
+
+ // Replace the old (placeholder) branch with the new (merged) conditional
+ // branch.
+ BranchInst *MergedBr = createMergedBranch(PreEntryBlock, EntryBlock,
+ NewEntryBlock, VMap);
+
+#ifndef NDEBUG
+ assertCHRRegionsHaveBiasedBranchOrSelect(Scope);
+#endif
+
+ // Hoist the conditional values of the branches/selects.
+ hoistScopeConditions(Scope, PreEntryBlock->getTerminator(), TrivialPHIs, DT);
+
+#ifndef NDEBUG
+ assertBranchOrSelectConditionHoisted(Scope, PreEntryBlock);
+#endif
+
+ // Create the combined branch condition and constant-fold the branches/selects
+ // in the hot path.
+ fixupBranchesAndSelects(Scope, PreEntryBlock, MergedBr,
+ ProfileCount ? ProfileCount.getValue() : 0);
+}
+
+// A helper for transformScopes. Clone the blocks in the scope (excluding the
+// PreEntryBlock) to split into a hot path and a cold path and update the PHIs
+// at the exit block.
+void CHR::cloneScopeBlocks(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BasicBlock *ExitBlock,
+ Region *LastRegion,
+ ValueToValueMapTy &VMap) {
+ // Clone all the blocks. The original blocks will be the hot-path
+ // CHR-optimized code and the cloned blocks will be the original unoptimized
+  // code. This is so that the block pointers held by the
+  // CHRScope/Region/RegionInfo stay valid and keep pointing at the hot-path
+  // code to which CHR applies.
+ SmallVector<BasicBlock*, 8> NewBlocks;
+ for (RegInfo &RI : Scope->RegInfos)
+ for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
+ // sub-Scopes.
+      assert(BB != PreEntryBlock && "Don't copy the pre-entry block");
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, ".nonchr", &F);
+ NewBlocks.push_back(NewBB);
+ VMap[BB] = NewBB;
+ }
+
+ // Place the cloned blocks right after the original blocks (right before the
+  // exit block).
+ if (ExitBlock)
+ F.getBasicBlockList().splice(ExitBlock->getIterator(),
+ F.getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F.end());
+
+ // Update the cloned blocks/instructions to refer to themselves.
+ for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+ for (Instruction &I : *NewBlocks[i])
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+  // Add the cloned blocks as incoming blocks to the PHIs of the exit block.
+  // ExitBlock is null only for the top-level region, in which case no PHIs need
+  // updating. The trivial PHIs inserted above are updated here as well.
+ if (ExitBlock)
+ for (PHINode &PN : ExitBlock->phis())
+ for (unsigned I = 0, NumOps = PN.getNumIncomingValues(); I < NumOps;
+ ++I) {
+ BasicBlock *Pred = PN.getIncomingBlock(I);
+ if (LastRegion->contains(Pred)) {
+ Value *V = PN.getIncomingValue(I);
+ auto It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ assert(VMap.find(Pred) != VMap.end() && "Pred must have been cloned");
+ PN.addIncoming(V, cast<BasicBlock>(VMap[Pred]));
+ }
+ }
+}
+
+// A helper for transformScope. Replace the old (placeholder) branch with the
+// new (merged) conditional branch.
+BranchInst *CHR::createMergedBranch(BasicBlock *PreEntryBlock,
+ BasicBlock *EntryBlock,
+ BasicBlock *NewEntryBlock,
+ ValueToValueMapTy &VMap) {
+ BranchInst *OldBR = cast<BranchInst>(PreEntryBlock->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == NewEntryBlock &&
+ "SplitBlock did not work correctly!");
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ assert(VMap.find(NewEntryBlock) != VMap.end() &&
+ "NewEntryBlock must have been copied");
+ OldBR->dropAllReferences();
+ OldBR->eraseFromParent();
+ // The true predicate is a placeholder. It will be replaced later in
+ // fixupBranchesAndSelects().
+ BranchInst *NewBR = BranchInst::Create(NewEntryBlock,
+ cast<BasicBlock>(VMap[NewEntryBlock]),
+ ConstantInt::getTrue(F.getContext()));
+ PreEntryBlock->getInstList().push_back(NewBR);
+ assert(NewEntryBlock->getSinglePredecessor() == EntryBlock &&
+ "NewEntryBlock's only pred must be EntryBlock");
+ return NewBR;
+}
+
+// A helper for transformScopes. Create the combined branch condition and
+// constant-fold the branches/selects in the hot path.
+void CHR::fixupBranchesAndSelects(CHRScope *Scope,
+ BasicBlock *PreEntryBlock,
+ BranchInst *MergedBR,
+ uint64_t ProfileCount) {
+ Value *MergedCondition = ConstantInt::getTrue(F.getContext());
+ BranchProbability CHRBranchBias(1, 1);
+ uint64_t NumCHRedBranches = 0;
+ IRBuilder<> IRB(PreEntryBlock->getTerminator());
+ for (RegInfo &RI : Scope->CHRRegions) {
+ Region *R = RI.R;
+ if (RI.HasBranch) {
+ fixupBranch(R, Scope, IRB, MergedCondition, CHRBranchBias);
+ ++NumCHRedBranches;
+ }
+ for (SelectInst *SI : RI.Selects) {
+ fixupSelect(SI, Scope, IRB, MergedCondition, CHRBranchBias);
+ ++NumCHRedBranches;
+ }
+ }
+ Stats.NumBranchesDelta += NumCHRedBranches - 1;
+ Stats.WeightedNumBranchesDelta += (NumCHRedBranches - 1) * ProfileCount;
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE,
+ "CHR",
+ // Refer to the hot (original) path
+ MergedBR->getSuccessor(0)->getTerminator())
+ << "Merged " << ore::NV("NumCHRedBranches", NumCHRedBranches)
+ << " branches or selects";
+ });
+ MergedBR->setCondition(MergedCondition);
+ uint32_t Weights[] = {
+ static_cast<uint32_t>(CHRBranchBias.scale(1000)),
+ static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
+ };
+ MDBuilder MDB(F.getContext());
+ MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
+ << "\n");
+}
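
As a side note on the !prof weights attached just above: they are the merged
branch bias and its complement, each scaled into a 0-1000 range. The standalone
sketch below mirrors that computation with a hand-rolled stand-in for
llvm::BranchProbability; the SimpleProb type and the 99/100 bias are
illustrative assumptions, not LLVM API.

    #include <cstdint>
    #include <cstdio>

    // Minimal stand-in for llvm::BranchProbability, only to show how the
    // branch weights are derived from CHRBranchBias.
    struct SimpleProb {
      uint64_t Num, Den;
      uint64_t scale(uint64_t N) const { return N * Num / Den; }
      SimpleProb getCompl() const { return {Den - Num, Den}; }
    };

    int main() {
      SimpleProb CHRBranchBias{99, 100}; // e.g. a 99%-biased merged branch
      uint32_t Weights[] = {
          static_cast<uint32_t>(CHRBranchBias.scale(1000)),            // 990
          static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)), // 10
      };
      std::printf("CHR branch bias %u:%u\n", Weights[0], Weights[1]);
      return 0;
    }
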
+
+// A helper for fixupBranchesAndSelects. Add to the combined branch condition
+// and constant-fold a branch in the hot path.
+void CHR::fixupBranch(Region *R, CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition,
+ BranchProbability &CHRBranchBias) {
+ bool IsTrueBiased = Scope->TrueBiasedRegions.count(R);
+ assert((IsTrueBiased || Scope->FalseBiasedRegions.count(R)) &&
+ "Must be truthy or falsy");
+ auto *BI = cast<BranchInst>(R->getEntry()->getTerminator());
+ assert(BranchBiasMap.find(R) != BranchBiasMap.end() &&
+ "Must be in the bias map");
+ BranchProbability Bias = BranchBiasMap[R];
+ assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
+ // Take the min.
+ if (CHRBranchBias > Bias)
+ CHRBranchBias = Bias;
+ BasicBlock *IfThen = BI->getSuccessor(1);
+ BasicBlock *IfElse = BI->getSuccessor(0);
+ BasicBlock *RegionExitBlock = R->getExit();
+ assert(RegionExitBlock && "Null ExitBlock");
+ assert((IfThen == RegionExitBlock || IfElse == RegionExitBlock) &&
+ IfThen != IfElse && "Invariant from findScopes");
+ if (IfThen == RegionExitBlock) {
+ // Swap them so that IfThen means going into it and IfElse means skipping
+ // it.
+ std::swap(IfThen, IfElse);
+ }
+ CHR_DEBUG(dbgs() << "IfThen " << IfThen->getName()
+ << " IfElse " << IfElse->getName() << "\n");
+ Value *Cond = BI->getCondition();
+ BasicBlock *HotTarget = IsTrueBiased ? IfThen : IfElse;
+ bool ConditionTrue = HotTarget == BI->getSuccessor(0);
+ addToMergedCondition(ConditionTrue, Cond, BI, Scope, IRB,
+ MergedCondition);
+ // Constant-fold the branch at ClonedEntryBlock.
+ assert(ConditionTrue == (HotTarget == BI->getSuccessor(0)) &&
+ "The successor shouldn't change");
+ Value *NewCondition = ConditionTrue ?
+ ConstantInt::getTrue(F.getContext()) :
+ ConstantInt::getFalse(F.getContext());
+ BI->setCondition(NewCondition);
+}
+
+// A helper for fixupBranchesAndSelects. Add to the combined branch condition
+// and constant-fold a select in the hot path.
+void CHR::fixupSelect(SelectInst *SI, CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition,
+ BranchProbability &CHRBranchBias) {
+ bool IsTrueBiased = Scope->TrueBiasedSelects.count(SI);
+ assert((IsTrueBiased ||
+ Scope->FalseBiasedSelects.count(SI)) && "Must be biased");
+ assert(SelectBiasMap.find(SI) != SelectBiasMap.end() &&
+ "Must be in the bias map");
+ BranchProbability Bias = SelectBiasMap[SI];
+ assert(Bias >= getCHRBiasThreshold() && "Must be highly biased");
+ // Take the min.
+ if (CHRBranchBias > Bias)
+ CHRBranchBias = Bias;
+ Value *Cond = SI->getCondition();
+ addToMergedCondition(IsTrueBiased, Cond, SI, Scope, IRB,
+ MergedCondition);
+ Value *NewCondition = IsTrueBiased ?
+ ConstantInt::getTrue(F.getContext()) :
+ ConstantInt::getFalse(F.getContext());
+ SI->setCondition(NewCondition);
+}
+
+// A helper for fixupBranch/fixupSelect. Add a branch condition to the merged
+// condition.
+void CHR::addToMergedCondition(bool IsTrueBiased, Value *Cond,
+ Instruction *BranchOrSelect,
+ CHRScope *Scope,
+ IRBuilder<> &IRB,
+ Value *&MergedCondition) {
+ if (IsTrueBiased) {
+ MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+ } else {
+    // If Cond is an icmp and all of its users other than BranchOrSelect are
+    // branches, negate the icmp predicate and swap the branch targets, which
+    // avoids inserting an xor to negate Cond.
+ bool Done = false;
+ if (auto *ICmp = dyn_cast<ICmpInst>(Cond))
+ if (negateICmpIfUsedByBranchOrSelectOnly(ICmp, BranchOrSelect, Scope)) {
+ MergedCondition = IRB.CreateAnd(MergedCondition, Cond);
+ Done = true;
+ }
+ if (!Done) {
+ Value *Negate = IRB.CreateXor(
+ ConstantInt::getTrue(F.getContext()), Cond);
+ MergedCondition = IRB.CreateAnd(MergedCondition, Negate);
+ }
+ }
+}
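
To make the merging rule concrete: a true-biased condition is ANDed into the
running value directly, while a false-biased condition that cannot be negated
in place is first inverted with an xor against true. The self-contained sketch
below evaluates the same boolean algebra on plain bools; it only illustrates
the semantics, not the IR that the IRBuilder calls emit.

    #include <cassert>

    // Plain bools stand in for the i1 values built by the IRBuilder calls above.
    static bool mergedCondition(bool TrueBiasedCond, bool FalseBiasedCond) {
      bool Merged = true;                   // ConstantInt::getTrue placeholder
      Merged = Merged && TrueBiasedCond;    // IRB.CreateAnd(Merged, Cond)
      bool Negate = true ^ FalseBiasedCond; // IRB.CreateXor(true, Cond)
      return Merged && Negate;              // IRB.CreateAnd(Merged, Negate)
    }

    int main() {
      // The hot path is taken only when the true-biased condition holds and
      // the false-biased condition does not.
      assert(mergedCondition(true, false));
      assert(!mergedCondition(true, true));
      assert(!mergedCondition(false, false));
      return 0;
    }
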
+
+void CHR::transformScopes(SmallVectorImpl<CHRScope *> &CHRScopes) {
+ unsigned I = 0;
+ DenseSet<PHINode *> TrivialPHIs;
+ for (CHRScope *Scope : CHRScopes) {
+ transformScopes(Scope, TrivialPHIs);
+ CHR_DEBUG(
+ std::ostringstream oss;
+ oss << " after transformScopes " << I++;
+ dumpIR(F, oss.str().c_str(), nullptr));
+ (void)I;
+ }
+}
+
+static void LLVM_ATTRIBUTE_UNUSED
+dumpScopes(SmallVectorImpl<CHRScope *> &Scopes, const char *Label) {
+ dbgs() << Label << " " << Scopes.size() << "\n";
+ for (CHRScope *Scope : Scopes) {
+ dbgs() << *Scope << "\n";
+ }
+}
+
+bool CHR::run() {
+ if (!shouldApply(F, PSI))
+ return false;
+
+ CHR_DEBUG(dumpIR(F, "before", nullptr));
+
+ bool Changed = false;
+ {
+ CHR_DEBUG(
+ dbgs() << "RegionInfo:\n";
+ RI.print(dbgs()));
+
+ // Recursively traverse the region tree and find regions that have biased
+ // branches and/or selects and create scopes.
+ SmallVector<CHRScope *, 8> AllScopes;
+ findScopes(AllScopes);
+ CHR_DEBUG(dumpScopes(AllScopes, "All scopes"));
+
+    // Split the scopes if 1) the condition values of the biased
+ // branches/selects of the inner/lower scope can't be hoisted up to the
+ // outermost/uppermost scope entry, or 2) the condition values of the biased
+ // branches/selects in a scope (including subscopes) don't share at least
+ // one common value.
+ SmallVector<CHRScope *, 8> SplitScopes;
+ splitScopes(AllScopes, SplitScopes);
+ CHR_DEBUG(dumpScopes(SplitScopes, "Split scopes"));
+
+    // After splitting, for each scope (a tree root), set its biased regions and
+    // selects to include those of its subscopes.
+ classifyBiasedScopes(SplitScopes);
+ CHR_DEBUG(dbgs() << "Set per-scope bias " << SplitScopes.size() << "\n");
+
+    // Filter out the scopes that have only one biased region or select (CHR
+ // isn't useful in such a case).
+ SmallVector<CHRScope *, 8> FilteredScopes;
+ filterScopes(SplitScopes, FilteredScopes);
+ CHR_DEBUG(dumpScopes(FilteredScopes, "Filtered scopes"));
+
+ // Set the regions to be CHR'ed and their hoist stops for each scope.
+ SmallVector<CHRScope *, 8> SetScopes;
+ setCHRRegions(FilteredScopes, SetScopes);
+ CHR_DEBUG(dumpScopes(SetScopes, "Set CHR regions"));
+
+    // Sort CHRScopes by depth so that outer CHRScopes come before inner
+ // ones. We need to apply CHR from outer to inner so that we apply CHR only
+ // to the hot path, rather than both hot and cold paths.
+ SmallVector<CHRScope *, 8> SortedScopes;
+ sortScopes(SetScopes, SortedScopes);
+ CHR_DEBUG(dumpScopes(SortedScopes, "Sorted scopes"));
+
+ CHR_DEBUG(
+ dbgs() << "RegionInfo:\n";
+ RI.print(dbgs()));
+
+ // Apply the CHR transformation.
+ if (!SortedScopes.empty()) {
+ transformScopes(SortedScopes);
+ Changed = true;
+ }
+ }
+
+ if (Changed) {
+ CHR_DEBUG(dumpIR(F, "after", &Stats));
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Stats", &F)
+ << ore::NV("Function", &F) << " "
+ << "Reduced the number of branches in hot paths by "
+ << ore::NV("NumBranchesDelta", Stats.NumBranchesDelta)
+ << " (static) and "
+ << ore::NV("WeightedNumBranchesDelta", Stats.WeightedNumBranchesDelta)
+ << " (weighted by PGO count)";
+ });
+ }
+
+ return Changed;
+}
+
+bool ControlHeightReductionLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ProfileSummaryInfo &PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ RegionInfo &RI = getAnalysis<RegionInfoPass>().getRegionInfo();
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE =
+ std::make_unique<OptimizationRemarkEmitter>(&F);
+ return CHR(F, BFI, DT, PSI, RI, *OwnedORE.get()).run();
+}
+
+namespace llvm {
+
+ControlHeightReductionPass::ControlHeightReductionPass() {
+ parseCHRFilterFiles();
+}
+
+PreservedAnalyses ControlHeightReductionPass::run(
+ Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto &PSI = *MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index ebd7a997dd..1b14b8d569 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -1,111 +1,111 @@
-//===- DataFlowSanitizer.cpp - dynamic data flow analysis -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
-/// analysis.
-///
-/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
-/// class of bugs on its own. Instead, it provides a generic dynamic data flow
-/// analysis framework to be used by clients to help detect application-specific
-/// issues within their own code.
-///
-/// The analysis is based on automatic propagation of data flow labels (also
-/// known as taint labels) through a program as it performs computation. Each
-/// byte of application memory is backed by two bytes of shadow memory which
-/// hold the label. On Linux/x86_64, memory is laid out as follows:
-///
-/// +--------------------+ 0x800000000000 (top of memory)
-/// | application memory |
-/// +--------------------+ 0x700000008000 (kAppAddr)
-/// | |
-/// | unused |
-/// | |
-/// +--------------------+ 0x200200000000 (kUnusedAddr)
-/// | union table |
-/// +--------------------+ 0x200000000000 (kUnionTableAddr)
-/// | shadow memory |
-/// +--------------------+ 0x000000010000 (kShadowAddr)
-/// | reserved by kernel |
-/// +--------------------+ 0x000000000000
-///
-/// To derive a shadow memory address from an application memory address,
-/// bits 44-46 are cleared to bring the address into the range
-/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
-/// account for the double byte representation of shadow labels and move the
-/// address into the shadow memory range. See the function
-/// DataFlowSanitizer::getShadowAddress below.
-///
-/// For more information, please refer to the design document:
-/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
-//
-//===----------------------------------------------------------------------===//
-
+//===- DataFlowSanitizer.cpp - dynamic data flow analysis -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of DataFlowSanitizer, a generalised dynamic data flow
+/// analysis.
+///
+/// Unlike other Sanitizer tools, this tool is not designed to detect a specific
+/// class of bugs on its own. Instead, it provides a generic dynamic data flow
+/// analysis framework to be used by clients to help detect application-specific
+/// issues within their own code.
+///
+/// The analysis is based on automatic propagation of data flow labels (also
+/// known as taint labels) through a program as it performs computation. Each
+/// byte of application memory is backed by two bytes of shadow memory which
+/// hold the label. On Linux/x86_64, memory is laid out as follows:
+///
+/// +--------------------+ 0x800000000000 (top of memory)
+/// | application memory |
+/// +--------------------+ 0x700000008000 (kAppAddr)
+/// | |
+/// | unused |
+/// | |
+/// +--------------------+ 0x200200000000 (kUnusedAddr)
+/// | union table |
+/// +--------------------+ 0x200000000000 (kUnionTableAddr)
+/// | shadow memory |
+/// +--------------------+ 0x000000010000 (kShadowAddr)
+/// | reserved by kernel |
+/// +--------------------+ 0x000000000000
+///
+/// To derive a shadow memory address from an application memory address,
+/// bits 44-46 are cleared to bring the address into the range
+/// [0x000000008000,0x100000000000). Then the address is shifted left by 1 to
+/// account for the double byte representation of shadow labels and move the
+/// address into the shadow memory range. See the function
+/// DataFlowSanitizer::getShadowAddress below.
+///
+/// For more information, please refer to the design document:
+/// http://clang.llvm.org/docs/DataFlowSanitizerDesign.html
+//
+//===----------------------------------------------------------------------===//
+
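
As a concrete illustration of the address arithmetic described in the comment
above (clear bits 44-46, then shift left by one for the two-byte labels), here
is a self-contained sketch using plain integer operations. The function name
and the check in main are made up for this example; the pass itself emits this
arithmetic as IR in DataFlowSanitizer::getShadowAddress using the ShadowPtrMask
and ShadowPtrMul fields defined further down.

    #include <cassert>
    #include <cstdint>

    // Mirrors the Linux/x86_64 mapping from the layout comment: mask off bits
    // 44-46, then multiply by two shadow bytes per application byte.
    static uint64_t exampleShadowAddress(uint64_t AppAddr) {
      const uint64_t Mask = ~0x700000000000ULL; // clears bits 44-46
      return (AppAddr & Mask) << 1;             // ShadowWidthBytes == 2
    }

    int main() {
      // kAppAddr (0x700000008000) maps to kShadowAddr (0x000000010000).
      assert(exampleShadowAddress(0x700000008000ULL) == 0x000000010000ULL);
      return 0;
    }
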
#include "llvm/Transforms/Instrumentation/DataFlowSanitizer.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/SpecialCaseList.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
// This must be consistent with ShadowWidthBits.
static const Align kShadowTLSAlignment = Align(2);
@@ -114,78 +114,78 @@ static const Align kShadowTLSAlignment = Align(2);
static const unsigned kArgTLSSize = 800;
static const unsigned kRetvalTLSSize = 800;
-// External symbol to be used when generating the shadow address for
-// architectures with multiple VMAs. Instead of using a constant integer
-// the runtime will set the external mask based on the VMA range.
+// External symbol to be used when generating the shadow address for
+// architectures with multiple VMAs. Instead of using a constant integer
+// the runtime will set the external mask based on the VMA range.
const char kDFSanExternShadowPtrMask[] = "__dfsan_shadow_ptr_mask";
-
-// The -dfsan-preserve-alignment flag controls whether this pass assumes that
-// alignment requirements provided by the input IR are correct. For example,
-// if the input IR contains a load with alignment 8, this flag will cause
-// the shadow load to have alignment 16. This flag is disabled by default as
-// we have unfortunately encountered too much code (including Clang itself;
-// see PR14291) which performs misaligned access.
-static cl::opt<bool> ClPreserveAlignment(
- "dfsan-preserve-alignment",
- cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
- cl::init(false));
-
-// The ABI list files control how shadow parameters are passed. The pass treats
-// every function labelled "uninstrumented" in the ABI list file as conforming
-// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
-// additional annotations for those functions, a call to one of those functions
-// will produce a warning message, as the labelling behaviour of the function is
-// unknown. The other supported annotations are "functional" and "discard",
-// which are described below under DataFlowSanitizer::WrapperKind.
-static cl::list<std::string> ClABIListFiles(
- "dfsan-abilist",
- cl::desc("File listing native ABI functions and how the pass treats them"),
- cl::Hidden);
-
-// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
-// functions (see DataFlowSanitizer::InstrumentedABI below).
-static cl::opt<bool> ClArgsABI(
- "dfsan-args-abi",
- cl::desc("Use the argument ABI rather than the TLS ABI"),
- cl::Hidden);
-
-// Controls whether the pass includes or ignores the labels of pointers in load
-// instructions.
-static cl::opt<bool> ClCombinePointerLabelsOnLoad(
- "dfsan-combine-pointer-labels-on-load",
- cl::desc("Combine the label of the pointer with the label of the data when "
- "loading from memory."),
- cl::Hidden, cl::init(true));
-
-// Controls whether the pass includes or ignores the labels of pointers in
-// store instructions.
-static cl::opt<bool> ClCombinePointerLabelsOnStore(
- "dfsan-combine-pointer-labels-on-store",
- cl::desc("Combine the label of the pointer with the label of the data when "
- "storing in memory."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDebugNonzeroLabels(
- "dfsan-debug-nonzero-labels",
- cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
- "load or return with a nonzero label"),
- cl::Hidden);
-
-// Experimental feature that inserts callbacks for certain data events.
-// Currently callbacks are only inserted for loads, stores, memory transfers
-// (i.e. memcpy and memmove), and comparisons.
-//
-// If this flag is set to true, the user must provide definitions for the
-// following callback functions:
+
+// The -dfsan-preserve-alignment flag controls whether this pass assumes that
+// alignment requirements provided by the input IR are correct. For example,
+// if the input IR contains a load with alignment 8, this flag will cause
+// the shadow load to have alignment 16. This flag is disabled by default as
+// we have unfortunately encountered too much code (including Clang itself;
+// see PR14291) which performs misaligned access.
+static cl::opt<bool> ClPreserveAlignment(
+ "dfsan-preserve-alignment",
+ cl::desc("respect alignment requirements provided by input IR"), cl::Hidden,
+ cl::init(false));
+
+// The ABI list files control how shadow parameters are passed. The pass treats
+// every function labelled "uninstrumented" in the ABI list file as conforming
+// to the "native" (i.e. unsanitized) ABI. Unless the ABI list contains
+// additional annotations for those functions, a call to one of those functions
+// will produce a warning message, as the labelling behaviour of the function is
+// unknown. The other supported annotations are "functional" and "discard",
+// which are described below under DataFlowSanitizer::WrapperKind.
+static cl::list<std::string> ClABIListFiles(
+ "dfsan-abilist",
+ cl::desc("File listing native ABI functions and how the pass treats them"),
+ cl::Hidden);
+
+// Controls whether the pass uses IA_Args or IA_TLS as the ABI for instrumented
+// functions (see DataFlowSanitizer::InstrumentedABI below).
+static cl::opt<bool> ClArgsABI(
+ "dfsan-args-abi",
+ cl::desc("Use the argument ABI rather than the TLS ABI"),
+ cl::Hidden);
+
+// Controls whether the pass includes or ignores the labels of pointers in load
+// instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnLoad(
+ "dfsan-combine-pointer-labels-on-load",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "loading from memory."),
+ cl::Hidden, cl::init(true));
+
+// Controls whether the pass includes or ignores the labels of pointers in
+// store instructions.
+static cl::opt<bool> ClCombinePointerLabelsOnStore(
+ "dfsan-combine-pointer-labels-on-store",
+ cl::desc("Combine the label of the pointer with the label of the data when "
+ "storing in memory."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDebugNonzeroLabels(
+ "dfsan-debug-nonzero-labels",
+ cl::desc("Insert calls to __dfsan_nonzero_label on observing a parameter, "
+ "load or return with a nonzero label"),
+ cl::Hidden);
+
+// Experimental feature that inserts callbacks for certain data events.
+// Currently callbacks are only inserted for loads, stores, memory transfers
+// (i.e. memcpy and memmove), and comparisons.
+//
+// If this flag is set to true, the user must provide definitions for the
+// following callback functions:
// void __dfsan_load_callback(dfsan_label Label, void* addr);
// void __dfsan_store_callback(dfsan_label Label, void* addr);
-// void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len);
-// void __dfsan_cmp_callback(dfsan_label CombinedLabel);
-static cl::opt<bool> ClEventCallbacks(
- "dfsan-event-callbacks",
- cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
- cl::Hidden, cl::init(false));
-
+// void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len);
+// void __dfsan_cmp_callback(dfsan_label CombinedLabel);
+static cl::opt<bool> ClEventCallbacks(
+ "dfsan-event-callbacks",
+ cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
+ cl::Hidden, cl::init(false));
+
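
For readers wiring up -dfsan-event-callbacks, a minimal sketch of user-provided
callbacks matching the signatures listed in the comment above could look like
the following. Treating dfsan_label as a 16-bit integer is an assumption based
on ShadowWidthBits being 16 in this file; the authoritative typedef lives in
the sanitizer runtime headers.

    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    // Assumed 16-bit label type (matches ShadowWidthBits = 16 in this version).
    typedef uint16_t dfsan_label;

    extern "C" void __dfsan_load_callback(dfsan_label Label, void *Addr) {
      if (Label)
        std::fprintf(stderr, "tainted load at %p (label %u)\n", Addr,
                     static_cast<unsigned>(Label));
    }

    extern "C" void __dfsan_store_callback(dfsan_label Label, void *Addr) {
      if (Label)
        std::fprintf(stderr, "tainted store at %p (label %u)\n", Addr,
                     static_cast<unsigned>(Label));
    }

    extern "C" void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len) {
      (void)Start; // shadow range being transferred; no-op in this sketch
      (void)Len;
    }

    extern "C" void __dfsan_cmp_callback(dfsan_label CombinedLabel) {
      (void)CombinedLabel; // combined label of the compared operands; no-op here
    }
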
// Use a distinct bit for each base label, enabling faster unions with less
// instrumentation. Limits the max number of base labels to 16.
static cl::opt<bool> ClFast16Labels(
@@ -201,220 +201,220 @@ static cl::opt<bool> ClTrackSelectControlFlow(
"to results."),
cl::Hidden, cl::init(true));
-static StringRef GetGlobalTypeString(const GlobalValue &G) {
- // Types of GlobalVariables are always pointer types.
- Type *GType = G.getValueType();
- // For now we support excluding struct types only.
- if (StructType *SGType = dyn_cast<StructType>(GType)) {
- if (!SGType->isLiteral())
- return SGType->getName();
- }
- return "<unknown type>";
-}
-
-namespace {
-
-class DFSanABIList {
- std::unique_ptr<SpecialCaseList> SCL;
-
- public:
- DFSanABIList() = default;
-
- void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
-
- /// Returns whether either this function or its source file are listed in the
- /// given category.
- bool isIn(const Function &F, StringRef Category) const {
- return isIn(*F.getParent(), Category) ||
- SCL->inSection("dataflow", "fun", F.getName(), Category);
- }
-
- /// Returns whether this global alias is listed in the given category.
- ///
- /// If GA aliases a function, the alias's name is matched as a function name
- /// would be. Similarly, aliases of globals are matched like globals.
- bool isIn(const GlobalAlias &GA, StringRef Category) const {
- if (isIn(*GA.getParent(), Category))
- return true;
-
- if (isa<FunctionType>(GA.getValueType()))
- return SCL->inSection("dataflow", "fun", GA.getName(), Category);
-
- return SCL->inSection("dataflow", "global", GA.getName(), Category) ||
- SCL->inSection("dataflow", "type", GetGlobalTypeString(GA),
- Category);
- }
-
- /// Returns whether this module is listed in the given category.
- bool isIn(const Module &M, StringRef Category) const {
- return SCL->inSection("dataflow", "src", M.getModuleIdentifier(), Category);
- }
-};
-
-/// TransformedFunction is used to express the result of transforming one
-/// function type into another. This struct is immutable. It holds metadata
-/// useful for updating calls of the old function to the new type.
-struct TransformedFunction {
- TransformedFunction(FunctionType* OriginalType,
- FunctionType* TransformedType,
- std::vector<unsigned> ArgumentIndexMapping)
- : OriginalType(OriginalType),
- TransformedType(TransformedType),
- ArgumentIndexMapping(ArgumentIndexMapping) {}
-
- // Disallow copies.
- TransformedFunction(const TransformedFunction&) = delete;
- TransformedFunction& operator=(const TransformedFunction&) = delete;
-
- // Allow moves.
- TransformedFunction(TransformedFunction&&) = default;
- TransformedFunction& operator=(TransformedFunction&&) = default;
-
- /// Type of the function before the transformation.
- FunctionType *OriginalType;
-
- /// Type of the function after the transformation.
- FunctionType *TransformedType;
-
- /// Transforming a function may change the position of arguments. This
- /// member records the mapping from each argument's old position to its new
- /// position. Argument positions are zero-indexed. If the transformation
- /// from F to F' made the first argument of F into the third argument of F',
- /// then ArgumentIndexMapping[0] will equal 2.
- std::vector<unsigned> ArgumentIndexMapping;
-};
-
-/// Given function attributes from a call site for the original function,
-/// return function attributes appropriate for a call to the transformed
-/// function.
-AttributeList TransformFunctionAttributes(
- const TransformedFunction& TransformedFunction,
- LLVMContext& Ctx, AttributeList CallSiteAttrs) {
-
- // Construct a vector of AttributeSet for each function argument.
- std::vector<llvm::AttributeSet> ArgumentAttributes(
- TransformedFunction.TransformedType->getNumParams());
-
- // Copy attributes from the parameter of the original function to the
- // transformed version. 'ArgumentIndexMapping' holds the mapping from
- // old argument position to new.
- for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
- i < ie; ++i) {
- unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
- ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
- }
-
- // Copy annotations on varargs arguments.
- for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
- ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
- ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
- }
-
- return AttributeList::get(
- Ctx,
- CallSiteAttrs.getFnAttributes(),
- CallSiteAttrs.getRetAttributes(),
- llvm::makeArrayRef(ArgumentAttributes));
-}
-
+static StringRef GetGlobalTypeString(const GlobalValue &G) {
+ // Types of GlobalVariables are always pointer types.
+ Type *GType = G.getValueType();
+ // For now we support excluding struct types only.
+ if (StructType *SGType = dyn_cast<StructType>(GType)) {
+ if (!SGType->isLiteral())
+ return SGType->getName();
+ }
+ return "<unknown type>";
+}
+
+namespace {
+
+class DFSanABIList {
+ std::unique_ptr<SpecialCaseList> SCL;
+
+ public:
+ DFSanABIList() = default;
+
+ void set(std::unique_ptr<SpecialCaseList> List) { SCL = std::move(List); }
+
+ /// Returns whether either this function or its source file are listed in the
+ /// given category.
+ bool isIn(const Function &F, StringRef Category) const {
+ return isIn(*F.getParent(), Category) ||
+ SCL->inSection("dataflow", "fun", F.getName(), Category);
+ }
+
+ /// Returns whether this global alias is listed in the given category.
+ ///
+ /// If GA aliases a function, the alias's name is matched as a function name
+ /// would be. Similarly, aliases of globals are matched like globals.
+ bool isIn(const GlobalAlias &GA, StringRef Category) const {
+ if (isIn(*GA.getParent(), Category))
+ return true;
+
+ if (isa<FunctionType>(GA.getValueType()))
+ return SCL->inSection("dataflow", "fun", GA.getName(), Category);
+
+ return SCL->inSection("dataflow", "global", GA.getName(), Category) ||
+ SCL->inSection("dataflow", "type", GetGlobalTypeString(GA),
+ Category);
+ }
+
+ /// Returns whether this module is listed in the given category.
+ bool isIn(const Module &M, StringRef Category) const {
+ return SCL->inSection("dataflow", "src", M.getModuleIdentifier(), Category);
+ }
+};
+
+/// TransformedFunction is used to express the result of transforming one
+/// function type into another. This struct is immutable. It holds metadata
+/// useful for updating calls of the old function to the new type.
+struct TransformedFunction {
+ TransformedFunction(FunctionType* OriginalType,
+ FunctionType* TransformedType,
+ std::vector<unsigned> ArgumentIndexMapping)
+ : OriginalType(OriginalType),
+ TransformedType(TransformedType),
+ ArgumentIndexMapping(ArgumentIndexMapping) {}
+
+ // Disallow copies.
+ TransformedFunction(const TransformedFunction&) = delete;
+ TransformedFunction& operator=(const TransformedFunction&) = delete;
+
+ // Allow moves.
+ TransformedFunction(TransformedFunction&&) = default;
+ TransformedFunction& operator=(TransformedFunction&&) = default;
+
+ /// Type of the function before the transformation.
+ FunctionType *OriginalType;
+
+ /// Type of the function after the transformation.
+ FunctionType *TransformedType;
+
+ /// Transforming a function may change the position of arguments. This
+ /// member records the mapping from each argument's old position to its new
+ /// position. Argument positions are zero-indexed. If the transformation
+ /// from F to F' made the first argument of F into the third argument of F',
+ /// then ArgumentIndexMapping[0] will equal 2.
+ std::vector<unsigned> ArgumentIndexMapping;
+};
+
+/// Given function attributes from a call site for the original function,
+/// return function attributes appropriate for a call to the transformed
+/// function.
+AttributeList TransformFunctionAttributes(
+ const TransformedFunction& TransformedFunction,
+ LLVMContext& Ctx, AttributeList CallSiteAttrs) {
+
+ // Construct a vector of AttributeSet for each function argument.
+ std::vector<llvm::AttributeSet> ArgumentAttributes(
+ TransformedFunction.TransformedType->getNumParams());
+
+ // Copy attributes from the parameter of the original function to the
+ // transformed version. 'ArgumentIndexMapping' holds the mapping from
+ // old argument position to new.
+ for (unsigned i=0, ie = TransformedFunction.ArgumentIndexMapping.size();
+ i < ie; ++i) {
+ unsigned TransformedIndex = TransformedFunction.ArgumentIndexMapping[i];
+ ArgumentAttributes[TransformedIndex] = CallSiteAttrs.getParamAttributes(i);
+ }
+
+ // Copy annotations on varargs arguments.
+ for (unsigned i = TransformedFunction.OriginalType->getNumParams(),
+ ie = CallSiteAttrs.getNumAttrSets(); i<ie; ++i) {
+ ArgumentAttributes.push_back(CallSiteAttrs.getParamAttributes(i));
+ }
+
+ return AttributeList::get(
+ Ctx,
+ CallSiteAttrs.getFnAttributes(),
+ CallSiteAttrs.getRetAttributes(),
+ llvm::makeArrayRef(ArgumentAttributes));
+}
+
class DataFlowSanitizer {
- friend struct DFSanFunction;
- friend class DFSanVisitor;
-
- enum { ShadowWidthBits = 16, ShadowWidthBytes = ShadowWidthBits / 8 };
-
- /// Which ABI should be used for instrumented functions?
- enum InstrumentedABI {
- /// Argument and return value labels are passed through additional
- /// arguments and by modifying the return type.
- IA_Args,
-
- /// Argument and return value labels are passed through TLS variables
- /// __dfsan_arg_tls and __dfsan_retval_tls.
- IA_TLS
- };
-
- /// How should calls to uninstrumented functions be handled?
- enum WrapperKind {
- /// This function is present in an uninstrumented form but we don't know
- /// how it should be handled. Print a warning and call the function anyway.
- /// Don't label the return value.
- WK_Warning,
-
- /// This function does not write to (user-accessible) memory, and its return
- /// value is unlabelled.
- WK_Discard,
-
- /// This function does not write to (user-accessible) memory, and the label
- /// of its return value is the union of the label of its arguments.
- WK_Functional,
-
- /// Instead of calling the function, a custom wrapper __dfsw_F is called,
- /// where F is the name of the function. This function may wrap the
- /// original function or provide its own implementation. This is similar to
- /// the IA_Args ABI, except that IA_Args uses a struct return type to
- /// pass the return value shadow in a register, while WK_Custom uses an
- /// extra pointer argument to return the shadow. This allows the wrapped
- /// form of the function type to be expressed in C.
- WK_Custom
- };
-
- Module *Mod;
- LLVMContext *Ctx;
+ friend struct DFSanFunction;
+ friend class DFSanVisitor;
+
+ enum { ShadowWidthBits = 16, ShadowWidthBytes = ShadowWidthBits / 8 };
+
+ /// Which ABI should be used for instrumented functions?
+ enum InstrumentedABI {
+ /// Argument and return value labels are passed through additional
+ /// arguments and by modifying the return type.
+ IA_Args,
+
+ /// Argument and return value labels are passed through TLS variables
+ /// __dfsan_arg_tls and __dfsan_retval_tls.
+ IA_TLS
+ };
+
+ /// How should calls to uninstrumented functions be handled?
+ enum WrapperKind {
+ /// This function is present in an uninstrumented form but we don't know
+ /// how it should be handled. Print a warning and call the function anyway.
+ /// Don't label the return value.
+ WK_Warning,
+
+ /// This function does not write to (user-accessible) memory, and its return
+ /// value is unlabelled.
+ WK_Discard,
+
+ /// This function does not write to (user-accessible) memory, and the label
+ /// of its return value is the union of the label of its arguments.
+ WK_Functional,
+
+ /// Instead of calling the function, a custom wrapper __dfsw_F is called,
+ /// where F is the name of the function. This function may wrap the
+ /// original function or provide its own implementation. This is similar to
+ /// the IA_Args ABI, except that IA_Args uses a struct return type to
+ /// pass the return value shadow in a register, while WK_Custom uses an
+ /// extra pointer argument to return the shadow. This allows the wrapped
+ /// form of the function type to be expressed in C.
+ WK_Custom
+ };
+
+ Module *Mod;
+ LLVMContext *Ctx;
Type *Int8Ptr;
/// The shadow type for all primitive types and vector types.
IntegerType *PrimitiveShadowTy;
PointerType *PrimitiveShadowPtrTy;
- IntegerType *IntptrTy;
+ IntegerType *IntptrTy;
ConstantInt *ZeroPrimitiveShadow;
- ConstantInt *ShadowPtrMask;
- ConstantInt *ShadowPtrMul;
- Constant *ArgTLS;
- Constant *RetvalTLS;
- Constant *ExternalShadowMask;
- FunctionType *DFSanUnionFnTy;
- FunctionType *DFSanUnionLoadFnTy;
- FunctionType *DFSanUnimplementedFnTy;
- FunctionType *DFSanSetLabelFnTy;
- FunctionType *DFSanNonzeroLabelFnTy;
- FunctionType *DFSanVarargWrapperFnTy;
+ ConstantInt *ShadowPtrMask;
+ ConstantInt *ShadowPtrMul;
+ Constant *ArgTLS;
+ Constant *RetvalTLS;
+ Constant *ExternalShadowMask;
+ FunctionType *DFSanUnionFnTy;
+ FunctionType *DFSanUnionLoadFnTy;
+ FunctionType *DFSanUnimplementedFnTy;
+ FunctionType *DFSanSetLabelFnTy;
+ FunctionType *DFSanNonzeroLabelFnTy;
+ FunctionType *DFSanVarargWrapperFnTy;
FunctionType *DFSanCmpCallbackFnTy;
FunctionType *DFSanLoadStoreCallbackFnTy;
- FunctionType *DFSanMemTransferCallbackFnTy;
- FunctionCallee DFSanUnionFn;
- FunctionCallee DFSanCheckedUnionFn;
- FunctionCallee DFSanUnionLoadFn;
+ FunctionType *DFSanMemTransferCallbackFnTy;
+ FunctionCallee DFSanUnionFn;
+ FunctionCallee DFSanCheckedUnionFn;
+ FunctionCallee DFSanUnionLoadFn;
FunctionCallee DFSanUnionLoadFast16LabelsFn;
- FunctionCallee DFSanUnimplementedFn;
- FunctionCallee DFSanSetLabelFn;
- FunctionCallee DFSanNonzeroLabelFn;
- FunctionCallee DFSanVarargWrapperFn;
- FunctionCallee DFSanLoadCallbackFn;
- FunctionCallee DFSanStoreCallbackFn;
- FunctionCallee DFSanMemTransferCallbackFn;
- FunctionCallee DFSanCmpCallbackFn;
- MDNode *ColdCallWeights;
- DFSanABIList ABIList;
- DenseMap<Value *, Function *> UnwrappedFnMap;
- AttrBuilder ReadOnlyNoneAttrs;
- bool DFSanRuntimeShadowMask = false;
-
- Value *getShadowAddress(Value *Addr, Instruction *Pos);
- bool isInstrumented(const Function *F);
- bool isInstrumented(const GlobalAlias *GA);
- FunctionType *getArgsFunctionType(FunctionType *T);
- FunctionType *getTrampolineFunctionType(FunctionType *T);
- TransformedFunction getCustomFunctionType(FunctionType *T);
- InstrumentedABI getInstrumentedABI();
- WrapperKind getWrapperKind(Function *F);
- void addGlobalNamePrefix(GlobalValue *GV);
- Function *buildWrapperFunction(Function *F, StringRef NewFName,
- GlobalValue::LinkageTypes NewFLink,
- FunctionType *NewFT);
- Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
- void initializeCallbackFunctions(Module &M);
- void initializeRuntimeFunctions(Module &M);
-
+ FunctionCallee DFSanUnimplementedFn;
+ FunctionCallee DFSanSetLabelFn;
+ FunctionCallee DFSanNonzeroLabelFn;
+ FunctionCallee DFSanVarargWrapperFn;
+ FunctionCallee DFSanLoadCallbackFn;
+ FunctionCallee DFSanStoreCallbackFn;
+ FunctionCallee DFSanMemTransferCallbackFn;
+ FunctionCallee DFSanCmpCallbackFn;
+ MDNode *ColdCallWeights;
+ DFSanABIList ABIList;
+ DenseMap<Value *, Function *> UnwrappedFnMap;
+ AttrBuilder ReadOnlyNoneAttrs;
+ bool DFSanRuntimeShadowMask = false;
+
+ Value *getShadowAddress(Value *Addr, Instruction *Pos);
+ bool isInstrumented(const Function *F);
+ bool isInstrumented(const GlobalAlias *GA);
+ FunctionType *getArgsFunctionType(FunctionType *T);
+ FunctionType *getTrampolineFunctionType(FunctionType *T);
+ TransformedFunction getCustomFunctionType(FunctionType *T);
+ InstrumentedABI getInstrumentedABI();
+ WrapperKind getWrapperKind(Function *F);
+ void addGlobalNamePrefix(GlobalValue *GV);
+ Function *buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT);
+ Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+ void initializeCallbackFunctions(Module &M);
+ void initializeRuntimeFunctions(Module &M);
+
bool init(Module &M);
/// Returns whether the pass tracks labels for struct fields and array
@@ -448,30 +448,30 @@ class DataFlowSanitizer {
   /// Returns the shadow type of V's type.
Type *getShadowTy(Value *V);
-public:
+public:
DataFlowSanitizer(const std::vector<std::string> &ABIListFiles);
-
+
bool runImpl(Module &M);
-};
-
-struct DFSanFunction {
- DataFlowSanitizer &DFS;
- Function *F;
- DominatorTree DT;
- DataFlowSanitizer::InstrumentedABI IA;
- bool IsNativeABI;
- AllocaInst *LabelReturnAlloca = nullptr;
- DenseMap<Value *, Value *> ValShadowMap;
- DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
- std::vector<std::pair<PHINode *, PHINode *>> PHIFixups;
- DenseSet<Instruction *> SkipInsts;
- std::vector<Value *> NonZeroChecks;
- bool AvoidNewBlocks;
-
+};
+
+struct DFSanFunction {
+ DataFlowSanitizer &DFS;
+ Function *F;
+ DominatorTree DT;
+ DataFlowSanitizer::InstrumentedABI IA;
+ bool IsNativeABI;
+ AllocaInst *LabelReturnAlloca = nullptr;
+ DenseMap<Value *, Value *> ValShadowMap;
+ DenseMap<AllocaInst *, AllocaInst *> AllocaShadowMap;
+ std::vector<std::pair<PHINode *, PHINode *>> PHIFixups;
+ DenseSet<Instruction *> SkipInsts;
+ std::vector<Value *> NonZeroChecks;
+ bool AvoidNewBlocks;
+
struct CachedShadow {
BasicBlock *Block; // The block where Shadow is defined.
- Value *Shadow;
- };
+ Value *Shadow;
+ };
/// Maps a value to its latest shadow value in terms of domination tree.
DenseMap<std::pair<Value *, Value *>, CachedShadow> CachedShadows;
/// Maps a value to its latest collapsed shadow value it was converted to in
@@ -479,16 +479,16 @@ struct DFSanFunction {
/// used at a post process where CFG blocks are split. So it does not cache
/// BasicBlock like CachedShadows, but uses domination between values.
DenseMap<Value *, Value *> CachedCollapsedShadows;
- DenseMap<Value *, std::set<Value *>> ShadowElements;
-
- DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
- : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) {
- DT.recalculate(*F);
- // FIXME: Need to track down the register allocator issue which causes poor
- // performance in pathological cases with large numbers of basic blocks.
- AvoidNewBlocks = F->size() > 1000;
- }
-
+ DenseMap<Value *, std::set<Value *>> ShadowElements;
+
+ DFSanFunction(DataFlowSanitizer &DFS, Function *F, bool IsNativeABI)
+ : DFS(DFS), F(F), IA(DFS.getInstrumentedABI()), IsNativeABI(IsNativeABI) {
+ DT.recalculate(*F);
+ // FIXME: Need to track down the register allocator issue which causes poor
+ // performance in pathological cases with large numbers of basic blocks.
+ AvoidNewBlocks = F->size() > 1000;
+ }
+
/// Computes the shadow address for a given function argument.
///
/// Shadow = ArgTLS+ArgOffset.
@@ -497,18 +497,18 @@ struct DFSanFunction {
/// Computes the shadow address for a retval.
Value *getRetvalTLS(Type *T, IRBuilder<> &IRB);
- Value *getShadow(Value *V);
- void setShadow(Instruction *I, Value *Shadow);
+ Value *getShadow(Value *V);
+ void setShadow(Instruction *I, Value *Shadow);
/// Generates IR to compute the union of the two given shadows, inserting it
/// before Pos. The combined value is with primitive type.
- Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
+ Value *combineShadows(Value *V1, Value *V2, Instruction *Pos);
/// Combines the shadow values of V1 and V2, then converts the combined value
/// with primitive type into a shadow value with the original type T.
Value *combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
Instruction *Pos);
- Value *combineOperandShadows(Instruction *Inst);
- Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
- Instruction *Pos);
+ Value *combineOperandShadows(Instruction *Inst);
+ Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
+ Instruction *Pos);
void storePrimitiveShadow(Value *Addr, uint64_t Size, Align Alignment,
Value *PrimitiveShadow, Instruction *Pos);
/// Applies PrimitiveShadow to all primitive subtypes of T, returning
@@ -539,110 +539,110 @@ private:
/// Returns the shadow value of an argument A.
Value *getShadowForTLSArgument(Argument *A);
-};
-
-class DFSanVisitor : public InstVisitor<DFSanVisitor> {
-public:
- DFSanFunction &DFSF;
-
- DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
-
- const DataLayout &getDataLayout() const {
- return DFSF.F->getParent()->getDataLayout();
- }
-
- // Combines shadow values for all of I's operands. Returns the combined shadow
- // value.
- Value *visitOperandShadowInst(Instruction &I);
-
- void visitUnaryOperator(UnaryOperator &UO);
- void visitBinaryOperator(BinaryOperator &BO);
- void visitCastInst(CastInst &CI);
- void visitCmpInst(CmpInst &CI);
- void visitGetElementPtrInst(GetElementPtrInst &GEPI);
- void visitLoadInst(LoadInst &LI);
- void visitStoreInst(StoreInst &SI);
- void visitReturnInst(ReturnInst &RI);
- void visitCallBase(CallBase &CB);
- void visitPHINode(PHINode &PN);
- void visitExtractElementInst(ExtractElementInst &I);
- void visitInsertElementInst(InsertElementInst &I);
- void visitShuffleVectorInst(ShuffleVectorInst &I);
- void visitExtractValueInst(ExtractValueInst &I);
- void visitInsertValueInst(InsertValueInst &I);
- void visitAllocaInst(AllocaInst &I);
- void visitSelectInst(SelectInst &I);
- void visitMemSetInst(MemSetInst &I);
- void visitMemTransferInst(MemTransferInst &I);
-};
-
-} // end anonymous namespace
-
-DataFlowSanitizer::DataFlowSanitizer(
+};
+
+class DFSanVisitor : public InstVisitor<DFSanVisitor> {
+public:
+ DFSanFunction &DFSF;
+
+ DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
+
+ const DataLayout &getDataLayout() const {
+ return DFSF.F->getParent()->getDataLayout();
+ }
+
+ // Combines shadow values for all of I's operands. Returns the combined shadow
+ // value.
+ Value *visitOperandShadowInst(Instruction &I);
+
+ void visitUnaryOperator(UnaryOperator &UO);
+ void visitBinaryOperator(BinaryOperator &BO);
+ void visitCastInst(CastInst &CI);
+ void visitCmpInst(CmpInst &CI);
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ void visitLoadInst(LoadInst &LI);
+ void visitStoreInst(StoreInst &SI);
+ void visitReturnInst(ReturnInst &RI);
+ void visitCallBase(CallBase &CB);
+ void visitPHINode(PHINode &PN);
+ void visitExtractElementInst(ExtractElementInst &I);
+ void visitInsertElementInst(InsertElementInst &I);
+ void visitShuffleVectorInst(ShuffleVectorInst &I);
+ void visitExtractValueInst(ExtractValueInst &I);
+ void visitInsertValueInst(InsertValueInst &I);
+ void visitAllocaInst(AllocaInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitMemSetInst(MemSetInst &I);
+ void visitMemTransferInst(MemTransferInst &I);
+};
+
+} // end anonymous namespace
+
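DFSanVisitor derives from InstVisitor, LLVM's CRTP visitor, so each visitFoo declared above is dispatched to the handler matching the visited instruction's kind. A minimal plain-C++ sketch of that dispatch pattern, using made-up node types rather than LLVM's classes:

#include <iostream>

struct Load {};
struct Store {};

template <typename Derived> struct VisitorBase {
  void visit(const Load &L)  { static_cast<Derived *>(this)->visitLoad(L); }
  void visit(const Store &S) { static_cast<Derived *>(this)->visitStore(S); }
  // Default handlers; a derived visitor overrides the ones it cares about.
  void visitLoad(const Load &)   {}
  void visitStore(const Store &) {}
};

struct ShadowVisitor : VisitorBase<ShadowVisitor> {
  void visitLoad(const Load &)   { std::cout << "propagate shadow of load\n"; }
  void visitStore(const Store &) { std::cout << "store shadow for store\n"; }
};

int main() {
  ShadowVisitor V;
  V.visit(Load{});
  V.visit(Store{});
  return 0;
}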
+DataFlowSanitizer::DataFlowSanitizer(
const std::vector<std::string> &ABIListFiles) {
- std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
+ std::vector<std::string> AllABIListFiles(std::move(ABIListFiles));
llvm::append_range(AllABIListFiles, ClABIListFiles);
- // FIXME: should we propagate vfs::FileSystem to this constructor?
- ABIList.set(
- SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem()));
-}
-
-FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
- SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
+ // FIXME: should we propagate vfs::FileSystem to this constructor?
+ ABIList.set(
+ SpecialCaseList::createOrDie(AllABIListFiles, *vfs::getRealFileSystem()));
+}
+
+FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) {
+ SmallVector<Type *, 4> ArgTypes(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), PrimitiveShadowTy);
- if (T->isVarArg())
+ if (T->isVarArg())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
RetType = StructType::get(RetType, PrimitiveShadowTy);
- return FunctionType::get(RetType, ArgTypes, T->isVarArg());
-}
-
-FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
- assert(!T->isVarArg());
- SmallVector<Type *, 4> ArgTypes;
- ArgTypes.push_back(T->getPointerTo());
- ArgTypes.append(T->param_begin(), T->param_end());
+ return FunctionType::get(RetType, ArgTypes, T->isVarArg());
+}
+
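For a concrete picture of the IA_Args rewrite performed by getArgsFunctionType: every original parameter is followed by one shadow parameter, and a non-void return is widened to a pair of the original value and its shadow (the instrumented symbol additionally gets the dfs$ prefix, as runImpl shows further down). The C++ declarations below only illustrate that shape; dfsan_label stands in for PrimitiveShadowTy and its 16-bit width is an assumption.

using dfsan_label = unsigned short;         // assumed shadow width

int original(int a, float b);               // signature before instrumentation

struct IntWithLabel { int Value; dfsan_label Label; };
IntWithLabel instrumented(int a, float b,   // original parameters first,
                          dfsan_label a_label,
                          dfsan_label b_label); // then one shadow per parameter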
+FunctionType *DataFlowSanitizer::getTrampolineFunctionType(FunctionType *T) {
+ assert(!T->isVarArg());
+ SmallVector<Type *, 4> ArgTypes;
+ ArgTypes.push_back(T->getPointerTo());
+ ArgTypes.append(T->param_begin(), T->param_end());
ArgTypes.append(T->getNumParams(), PrimitiveShadowTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- return FunctionType::get(T->getReturnType(), ArgTypes, false);
-}
-
-TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
- SmallVector<Type *, 4> ArgTypes;
-
- // Some parameters of the custom function being constructed are
- // parameters of T. Record the mapping from parameters of T to
- // parameters of the custom function, so that parameter attributes
- // at call sites can be updated.
- std::vector<unsigned> ArgumentIndexMapping;
- for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
- Type* param_type = T->getParamType(i);
- FunctionType *FT;
- if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
- cast<PointerType>(param_type)->getElementType()))) {
- ArgumentIndexMapping.push_back(ArgTypes.size());
- ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
- ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
- } else {
- ArgumentIndexMapping.push_back(ArgTypes.size());
- ArgTypes.push_back(param_type);
- }
- }
- for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
+ return FunctionType::get(T->getReturnType(), ArgTypes, false);
+}
+
+TransformedFunction DataFlowSanitizer::getCustomFunctionType(FunctionType *T) {
+ SmallVector<Type *, 4> ArgTypes;
+
+ // Some parameters of the custom function being constructed are
+ // parameters of T. Record the mapping from parameters of T to
+ // parameters of the custom function, so that parameter attributes
+ // at call sites can be updated.
+ std::vector<unsigned> ArgumentIndexMapping;
+ for (unsigned i = 0, ie = T->getNumParams(); i != ie; ++i) {
+ Type* param_type = T->getParamType(i);
+ FunctionType *FT;
+ if (isa<PointerType>(param_type) && (FT = dyn_cast<FunctionType>(
+ cast<PointerType>(param_type)->getElementType()))) {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(getTrampolineFunctionType(FT)->getPointerTo());
+ ArgTypes.push_back(Type::getInt8PtrTy(*Ctx));
+ } else {
+ ArgumentIndexMapping.push_back(ArgTypes.size());
+ ArgTypes.push_back(param_type);
+ }
+ }
+ for (unsigned i = 0, e = T->getNumParams(); i != e; ++i)
ArgTypes.push_back(PrimitiveShadowTy);
- if (T->isVarArg())
+ if (T->isVarArg())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- Type *RetType = T->getReturnType();
- if (!RetType->isVoidTy())
+ Type *RetType = T->getReturnType();
+ if (!RetType->isVoidTy())
ArgTypes.push_back(PrimitiveShadowPtrTy);
- return TransformedFunction(
- T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
- ArgumentIndexMapping);
-}
-
+ return TransformedFunction(
+ T, FunctionType::get(T->getReturnType(), ArgTypes, T->isVarArg()),
+ ArgumentIndexMapping);
+}
+
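The custom ABI built by getCustomFunctionType keeps the original return type and parameters, appends one label per original parameter, and adds a trailing label pointer for a non-void return (plus a label-array pointer for varargs and trampoline/environment pairs for function-pointer parameters). As a hedged illustration, a custom wrapper for int myfn(int, char *) would look roughly like the declaration below; the __dfsw_ naming follows DFSan's documented convention for custom wrappers, and the 16-bit label width is an assumption.

using dfsan_label = unsigned short;

int __dfsw_myfn(int a, char *p,
                dfsan_label a_label, dfsan_label p_label,
                dfsan_label *ret_label);   // receives the return value's label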
bool DataFlowSanitizer::isZeroShadow(Value *V) {
if (!shouldTrackFieldsAndIndices())
return ZeroPrimitiveShadow == V;
@@ -800,48 +800,48 @@ Type *DataFlowSanitizer::getShadowTy(Value *V) {
}
bool DataFlowSanitizer::init(Module &M) {
- Triple TargetTriple(M.getTargetTriple());
- bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
- bool IsMIPS64 = TargetTriple.isMIPS64();
- bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
- TargetTriple.getArch() == Triple::aarch64_be;
-
- const DataLayout &DL = M.getDataLayout();
-
- Mod = &M;
- Ctx = &M.getContext();
+ Triple TargetTriple(M.getTargetTriple());
+ bool IsX86_64 = TargetTriple.getArch() == Triple::x86_64;
+ bool IsMIPS64 = TargetTriple.isMIPS64();
+ bool IsAArch64 = TargetTriple.getArch() == Triple::aarch64 ||
+ TargetTriple.getArch() == Triple::aarch64_be;
+
+ const DataLayout &DL = M.getDataLayout();
+
+ Mod = &M;
+ Ctx = &M.getContext();
Int8Ptr = Type::getInt8PtrTy(*Ctx);
PrimitiveShadowTy = IntegerType::get(*Ctx, ShadowWidthBits);
PrimitiveShadowPtrTy = PointerType::getUnqual(PrimitiveShadowTy);
- IntptrTy = DL.getIntPtrType(*Ctx);
+ IntptrTy = DL.getIntPtrType(*Ctx);
ZeroPrimitiveShadow = ConstantInt::getSigned(PrimitiveShadowTy, 0);
- ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidthBytes);
- if (IsX86_64)
- ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
- else if (IsMIPS64)
- ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
- // AArch64 supports multiple VMAs and the shadow mask is set at runtime.
- else if (IsAArch64)
- DFSanRuntimeShadowMask = true;
- else
- report_fatal_error("unsupported triple");
-
+ ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidthBytes);
+ if (IsX86_64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
+ else if (IsMIPS64)
+ ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0xF000000000LL);
+ // AArch64 supports multiple VMAs and the shadow mask is set at runtime.
+ else if (IsAArch64)
+ DFSanRuntimeShadowMask = true;
+ else
+ report_fatal_error("unsupported triple");
+
Type *DFSanUnionArgs[2] = {PrimitiveShadowTy, PrimitiveShadowTy};
- DFSanUnionFnTy =
+ DFSanUnionFnTy =
FunctionType::get(PrimitiveShadowTy, DFSanUnionArgs, /*isVarArg=*/false);
Type *DFSanUnionLoadArgs[2] = {PrimitiveShadowPtrTy, IntptrTy};
DFSanUnionLoadFnTy = FunctionType::get(PrimitiveShadowTy, DFSanUnionLoadArgs,
/*isVarArg=*/false);
- DFSanUnimplementedFnTy = FunctionType::get(
- Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanUnimplementedFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
Type *DFSanSetLabelArgs[3] = {PrimitiveShadowTy, Type::getInt8PtrTy(*Ctx),
IntptrTy};
- DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
- DFSanSetLabelArgs, /*isVarArg=*/false);
+ DFSanSetLabelFnTy = FunctionType::get(Type::getVoidTy(*Ctx),
+ DFSanSetLabelArgs, /*isVarArg=*/false);
DFSanNonzeroLabelFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
- DFSanVarargWrapperFnTy = FunctionType::get(
- Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanVarargWrapperFnTy = FunctionType::get(
+ Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
DFSanCmpCallbackFnTy =
FunctionType::get(Type::getVoidTy(*Ctx), PrimitiveShadowTy,
/*isVarArg=*/false);
@@ -850,169 +850,169 @@ bool DataFlowSanitizer::init(Module &M) {
FunctionType::get(Type::getVoidTy(*Ctx), DFSanLoadStoreCallbackArgs,
/*isVarArg=*/false);
Type *DFSanMemTransferCallbackArgs[2] = {PrimitiveShadowPtrTy, IntptrTy};
- DFSanMemTransferCallbackFnTy =
- FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
- /*isVarArg=*/false);
-
- ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
- return true;
-}
-
-bool DataFlowSanitizer::isInstrumented(const Function *F) {
- return !ABIList.isIn(*F, "uninstrumented");
-}
-
-bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
- return !ABIList.isIn(*GA, "uninstrumented");
-}
-
-DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
- return ClArgsABI ? IA_Args : IA_TLS;
-}
-
-DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
- if (ABIList.isIn(*F, "functional"))
- return WK_Functional;
- if (ABIList.isIn(*F, "discard"))
- return WK_Discard;
- if (ABIList.isIn(*F, "custom"))
- return WK_Custom;
-
- return WK_Warning;
-}
-
-void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
- std::string GVName = std::string(GV->getName()), Prefix = "dfs$";
- GV->setName(Prefix + GVName);
-
- // Try to change the name of the function in module inline asm. We only do
- // this for specific asm directives, currently only ".symver", to try to avoid
- // corrupting asm which happens to contain the symbol name as a substring.
- // Note that the substitution for .symver assumes that the versioned symbol
- // also has an instrumented name.
- std::string Asm = GV->getParent()->getModuleInlineAsm();
- std::string SearchStr = ".symver " + GVName + ",";
- size_t Pos = Asm.find(SearchStr);
- if (Pos != std::string::npos) {
- Asm.replace(Pos, SearchStr.size(),
- ".symver " + Prefix + GVName + "," + Prefix);
- GV->getParent()->setModuleInlineAsm(Asm);
- }
-}
-
-Function *
-DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
- GlobalValue::LinkageTypes NewFLink,
- FunctionType *NewFT) {
- FunctionType *FT = F->getFunctionType();
- Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(),
- NewFName, F->getParent());
- NewF->copyAttributesFrom(F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
-
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
- if (F->isVarArg()) {
- NewF->removeAttributes(AttributeList::FunctionIndex,
- AttrBuilder().addAttribute("split-stack"));
- CallInst::Create(DFSanVarargWrapperFn,
- IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
- BB);
- new UnreachableInst(*Ctx, BB);
- } else {
- std::vector<Value *> Args;
- unsigned n = FT->getNumParams();
- for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
- Args.push_back(&*ai);
- CallInst *CI = CallInst::Create(F, Args, "", BB);
- if (FT->getReturnType()->isVoidTy())
- ReturnInst::Create(*Ctx, BB);
- else
- ReturnInst::Create(*Ctx, CI, BB);
- }
-
- return NewF;
-}
-
-Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
- StringRef FName) {
- FunctionType *FTT = getTrampolineFunctionType(FT);
- FunctionCallee C = Mod->getOrInsertFunction(FName, FTT);
- Function *F = dyn_cast<Function>(C.getCallee());
- if (F && F->isDeclaration()) {
- F->setLinkage(GlobalValue::LinkOnceODRLinkage);
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
- std::vector<Value *> Args;
- Function::arg_iterator AI = F->arg_begin(); ++AI;
- for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
- Args.push_back(&*AI);
- CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB);
- ReturnInst *RI;
- if (FT->getReturnType()->isVoidTy())
- RI = ReturnInst::Create(*Ctx, BB);
- else
- RI = ReturnInst::Create(*Ctx, CI, BB);
-
+ DFSanMemTransferCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
+ /*isVarArg=*/false);
+
+ ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000);
+ return true;
+}
+
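With the constants chosen in init() for x86-64, getShadowAddress (further down) maps an application address to its shadow address by masking off the bits in 0x700000000000 and scaling by the shadow width in bytes. A self-contained sketch of that arithmetic, assuming 2-byte (16-bit) labels:

#include <cstdint>

constexpr uint64_t kShadowPtrMask    = ~0x700000000000ULL; // x86-64 mask from init()
constexpr uint64_t kShadowWidthBytes = 2;                  // assumed label width

uint64_t shadowAddressFor(uint64_t AppAddr) {
  return (AppAddr & kShadowPtrMask) * kShadowWidthBytes;
}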
+bool DataFlowSanitizer::isInstrumented(const Function *F) {
+ return !ABIList.isIn(*F, "uninstrumented");
+}
+
+bool DataFlowSanitizer::isInstrumented(const GlobalAlias *GA) {
+ return !ABIList.isIn(*GA, "uninstrumented");
+}
+
+DataFlowSanitizer::InstrumentedABI DataFlowSanitizer::getInstrumentedABI() {
+ return ClArgsABI ? IA_Args : IA_TLS;
+}
+
+DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
+ if (ABIList.isIn(*F, "functional"))
+ return WK_Functional;
+ if (ABIList.isIn(*F, "discard"))
+ return WK_Discard;
+ if (ABIList.isIn(*F, "custom"))
+ return WK_Custom;
+
+ return WK_Warning;
+}
+
+void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
+ std::string GVName = std::string(GV->getName()), Prefix = "dfs$";
+ GV->setName(Prefix + GVName);
+
+ // Try to change the name of the function in module inline asm. We only do
+ // this for specific asm directives, currently only ".symver", to try to avoid
+ // corrupting asm which happens to contain the symbol name as a substring.
+ // Note that the substitution for .symver assumes that the versioned symbol
+ // also has an instrumented name.
+ std::string Asm = GV->getParent()->getModuleInlineAsm();
+ std::string SearchStr = ".symver " + GVName + ",";
+ size_t Pos = Asm.find(SearchStr);
+ if (Pos != std::string::npos) {
+ Asm.replace(Pos, SearchStr.size(),
+ ".symver " + Prefix + GVName + "," + Prefix);
+ GV->getParent()->setModuleInlineAsm(Asm);
+ }
+}
+
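The .symver rewrite in addGlobalNamePrefix is a plain substring replacement on the module inline asm. A standalone reproduction with a hypothetical symbol and version string, showing the before/after text:

#include <iostream>
#include <string>

int main() {
  std::string GVName = "my_func", Prefix = "dfs$";
  std::string Asm = ".symver my_func,my_func@@LIB_1.0\n";
  std::string SearchStr = ".symver " + GVName + ",";
  size_t Pos = Asm.find(SearchStr);
  if (Pos != std::string::npos)
    Asm.replace(Pos, SearchStr.size(),
                ".symver " + Prefix + GVName + "," + Prefix);
  std::cout << Asm;  // ".symver dfs$my_func,dfs$my_func@@LIB_1.0"
  return 0;
}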
+Function *
+DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
+ GlobalValue::LinkageTypes NewFLink,
+ FunctionType *NewFT) {
+ FunctionType *FT = F->getFunctionType();
+ Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(),
+ NewFName, F->getParent());
+ NewF->copyAttributesFrom(F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
+ if (F->isVarArg()) {
+ NewF->removeAttributes(AttributeList::FunctionIndex,
+ AttrBuilder().addAttribute("split-stack"));
+ CallInst::Create(DFSanVarargWrapperFn,
+ IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
+ BB);
+ new UnreachableInst(*Ctx, BB);
+ } else {
+ std::vector<Value *> Args;
+ unsigned n = FT->getNumParams();
+ for (Function::arg_iterator ai = NewF->arg_begin(); n != 0; ++ai, --n)
+ Args.push_back(&*ai);
+ CallInst *CI = CallInst::Create(F, Args, "", BB);
+ if (FT->getReturnType()->isVoidTy())
+ ReturnInst::Create(*Ctx, BB);
+ else
+ ReturnInst::Create(*Ctx, CI, BB);
+ }
+
+ return NewF;
+}
+
+Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
+ StringRef FName) {
+ FunctionType *FTT = getTrampolineFunctionType(FT);
+ FunctionCallee C = Mod->getOrInsertFunction(FName, FTT);
+ Function *F = dyn_cast<Function>(C.getCallee());
+ if (F && F->isDeclaration()) {
+ F->setLinkage(GlobalValue::LinkOnceODRLinkage);
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
+ std::vector<Value *> Args;
+ Function::arg_iterator AI = F->arg_begin(); ++AI;
+ for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
+ Args.push_back(&*AI);
+ CallInst *CI = CallInst::Create(FT, &*F->arg_begin(), Args, "", BB);
+ ReturnInst *RI;
+ if (FT->getReturnType()->isVoidTy())
+ RI = ReturnInst::Create(*Ctx, BB);
+ else
+ RI = ReturnInst::Create(*Ctx, CI, BB);
+
// F is called by a wrapped custom function with primitive shadows. So
// its arguments and return value need conversion.
- DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
- Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
+ DFSanFunction DFSF(*this, F, /*IsNativeABI=*/true);
+ Function::arg_iterator ValAI = F->arg_begin(), ShadowAI = AI; ++ValAI;
for (unsigned N = FT->getNumParams(); N != 0; ++ValAI, ++ShadowAI, --N) {
Value *Shadow =
DFSF.expandFromPrimitiveShadow(ValAI->getType(), &*ShadowAI, CI);
DFSF.ValShadowMap[&*ValAI] = Shadow;
}
- DFSanVisitor(DFSF).visitCallInst(*CI);
+ DFSanVisitor(DFSF).visitCallInst(*CI);
if (!FT->getReturnType()->isVoidTy()) {
Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow(
DFSF.getShadow(RI->getReturnValue()), RI);
new StoreInst(PrimitiveShadow, &*std::prev(F->arg_end()), RI);
}
- }
-
- return cast<Constant>(C.getCallee());
-}
-
-// Initialize DataFlowSanitizer runtime functions and declare them in the module
-void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadNone);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
- DFSanUnionFn =
- Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL);
- }
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadNone);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
- DFSanCheckedUnionFn =
- Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy, AL);
- }
- {
- AttributeList AL;
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::ReadOnly);
- AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
- Attribute::ZExt);
- DFSanUnionLoadFn =
- Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL);
- }
+ }
+
+ return cast<Constant>(C.getCallee());
+}
+
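Per getTrampolineFunctionType and the body generated above, a trampoline receives the target function pointer, the original arguments, one label per argument, and, for a non-void return, a pointer through which the return label is stored; it forwards the call and, being instrumented itself, propagates the callee's return shadow. A hedged C++ sketch for a target of type int(int, int); the placeholder stores label 0 where the generated code would store the real return shadow.

using dfsan_label = unsigned short;   // assumed shadow width

int trampoline_int_int(int (*Fn)(int, int), int A, int B,
                       dfsan_label ALabel, dfsan_label BLabel,
                       dfsan_label *RetLabel) {
  int R = Fn(A, B);   // forward the call to the real target
  *RetLabel = 0;      // placeholder: the generated trampoline stores the callee's shadow here
  return R;
}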
+// Initialize DataFlowSanitizer runtime functions and declare them in the module
+void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadNone);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
+ DFSanUnionFn =
+ Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL);
+ }
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadNone);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ AL = AL.addParamAttribute(M.getContext(), 1, Attribute::ZExt);
+ DFSanCheckedUnionFn =
+ Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy, AL);
+ }
+ {
+ AttributeList AL;
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::ReadOnly);
+ AL = AL.addAttribute(M.getContext(), AttributeList::ReturnIndex,
+ Attribute::ZExt);
+ DFSanUnionLoadFn =
+ Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy, AL);
+ }
{
AttributeList AL;
AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
@@ -1024,285 +1024,285 @@ void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
DFSanUnionLoadFast16LabelsFn = Mod->getOrInsertFunction(
"__dfsan_union_load_fast16labels", DFSanUnionLoadFnTy, AL);
}
- DFSanUnimplementedFn =
- Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
- {
- AttributeList AL;
- AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
- DFSanSetLabelFn =
- Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy, AL);
- }
- DFSanNonzeroLabelFn =
- Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
- DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
- DFSanVarargWrapperFnTy);
-}
-
-// Initializes event callback functions and declares them in the module
-void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
- DFSanLoadCallbackFn = Mod->getOrInsertFunction("__dfsan_load_callback",
+ DFSanUnimplementedFn =
+ Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
+ {
+ AttributeList AL;
+ AL = AL.addParamAttribute(M.getContext(), 0, Attribute::ZExt);
+ DFSanSetLabelFn =
+ Mod->getOrInsertFunction("__dfsan_set_label", DFSanSetLabelFnTy, AL);
+ }
+ DFSanNonzeroLabelFn =
+ Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
+ DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
+ DFSanVarargWrapperFnTy);
+}
+
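Read as C declarations, the runtime entry points registered above mirror the FunctionTypes built in init(). The rendering below is only a sketch: the 16-bit label width and the use of char */void */uintptr_t for the IR's i8* and IntptrTy are assumptions.

#include <cstdint>
using dfsan_label = unsigned short;

extern "C" {
dfsan_label __dfsan_union(dfsan_label L1, dfsan_label L2);
dfsan_label dfsan_union(dfsan_label L1, dfsan_label L2);   // checked variant
dfsan_label __dfsan_union_load(const dfsan_label *Shadow, uintptr_t N);
dfsan_label __dfsan_union_load_fast16labels(const dfsan_label *Shadow, uintptr_t N);
void __dfsan_unimplemented(char *FnName);
void __dfsan_set_label(dfsan_label Label, void *Addr, uintptr_t Size);
void __dfsan_nonzero_label(void);
void __dfsan_vararg_wrapper(char *FnName);
}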
+// Initializes event callback functions and declares them in the module
+void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
+ DFSanLoadCallbackFn = Mod->getOrInsertFunction("__dfsan_load_callback",
DFSanLoadStoreCallbackFnTy);
DFSanStoreCallbackFn = Mod->getOrInsertFunction("__dfsan_store_callback",
DFSanLoadStoreCallbackFnTy);
- DFSanMemTransferCallbackFn = Mod->getOrInsertFunction(
- "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
+ DFSanMemTransferCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
DFSanCmpCallbackFn =
Mod->getOrInsertFunction("__dfsan_cmp_callback", DFSanCmpCallbackFnTy);
-}
-
+}
+
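The two callback types fully visible in this hunk translate, under the same assumptions as above, to roughly the C declarations below; the load/store callback argument list is defined outside this hunk and is omitted here.

#include <cstdint>
using dfsan_label = unsigned short;

extern "C" {
void __dfsan_cmp_callback(dfsan_label CombinedLabel);
void __dfsan_mem_transfer_callback(dfsan_label *Start, uintptr_t Len);
}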
bool DataFlowSanitizer::runImpl(Module &M) {
init(M);
- if (ABIList.isIn(M, "skip"))
- return false;
-
- const unsigned InitialGlobalSize = M.global_size();
- const unsigned InitialModuleSize = M.size();
-
- bool Changed = false;
-
+ if (ABIList.isIn(M, "skip"))
+ return false;
+
+ const unsigned InitialGlobalSize = M.global_size();
+ const unsigned InitialModuleSize = M.size();
+
+ bool Changed = false;
+
Type *ArgTLSTy = ArrayType::get(Type::getInt64Ty(*Ctx), kArgTLSSize / 8);
ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) {
Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
+ }
Type *RetvalTLSTy =
ArrayType::get(Type::getInt64Ty(*Ctx), kRetvalTLSSize / 8);
RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", RetvalTLSTy);
if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) {
Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
-
- ExternalShadowMask =
- Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
-
- initializeCallbackFunctions(M);
- initializeRuntimeFunctions(M);
-
- std::vector<Function *> FnsToInstrument;
- SmallPtrSet<Function *, 2> FnsWithNativeABI;
- for (Function &i : M) {
- if (!i.isIntrinsic() &&
- &i != DFSanUnionFn.getCallee()->stripPointerCasts() &&
- &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() &&
- &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() &&
+ }
+
+ ExternalShadowMask =
+ Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
+
+ initializeCallbackFunctions(M);
+ initializeRuntimeFunctions(M);
+
+ std::vector<Function *> FnsToInstrument;
+ SmallPtrSet<Function *, 2> FnsWithNativeABI;
+ for (Function &i : M) {
+ if (!i.isIntrinsic() &&
+ &i != DFSanUnionFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanCheckedUnionFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanUnionLoadFn.getCallee()->stripPointerCasts() &&
&i != DFSanUnionLoadFast16LabelsFn.getCallee()->stripPointerCasts() &&
- &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() &&
- &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() &&
- &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() &&
- &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts() &&
- &i != DFSanLoadCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanStoreCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts() &&
- &i != DFSanCmpCallbackFn.getCallee()->stripPointerCasts())
- FnsToInstrument.push_back(&i);
- }
-
- // Give function aliases prefixes when necessary, and build wrappers where the
- // instrumentedness is inconsistent.
- for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
- GlobalAlias *GA = &*i;
- ++i;
- // Don't stop on weak. We assume people aren't playing games with the
- // instrumentedness of overridden weak aliases.
- if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
- bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
- if (GAInst && FInst) {
- addGlobalNamePrefix(GA);
- } else if (GAInst != FInst) {
- // Non-instrumented alias of an instrumented function, or vice versa.
- // Replace the alias with a native-ABI wrapper of the aliasee. The pass
- // below will take care of instrumenting it.
- Function *NewF =
- buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
- GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
- NewF->takeName(GA);
- GA->eraseFromParent();
- FnsToInstrument.push_back(NewF);
- }
- }
- }
-
- ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
- .addAttribute(Attribute::ReadNone);
-
- // First, change the ABI of every function in the module. ABI-listed
- // functions keep their original ABI and get a wrapper function.
- for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
- e = FnsToInstrument.end();
- i != e; ++i) {
- Function &F = **i;
- FunctionType *FT = F.getFunctionType();
-
- bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
- FT->getReturnType()->isVoidTy());
-
- if (isInstrumented(&F)) {
- // Instrumented functions get a 'dfs$' prefix. This allows us to more
- // easily identify cases of mismatching ABIs.
- if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
- FunctionType *NewFT = getArgsFunctionType(FT);
- Function *NewF = Function::Create(NewFT, F.getLinkage(),
- F.getAddressSpace(), "", &M);
- NewF->copyAttributesFrom(&F);
- NewF->removeAttributes(
- AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
- for (Function::arg_iterator FArg = F.arg_begin(),
- NewFArg = NewF->arg_begin(),
- FArgEnd = F.arg_end();
- FArg != FArgEnd; ++FArg, ++NewFArg) {
- FArg->replaceAllUsesWith(&*NewFArg);
- }
- NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
-
- for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
- UI != UE;) {
- BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
- ++UI;
- if (BA) {
- BA->replaceAllUsesWith(
- BlockAddress::get(NewF, BA->getBasicBlock()));
- delete BA;
- }
- }
- F.replaceAllUsesWith(
- ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
- NewF->takeName(&F);
- F.eraseFromParent();
- *i = NewF;
- addGlobalNamePrefix(NewF);
- } else {
- addGlobalNamePrefix(&F);
- }
- } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
- // Build a wrapper function for F. The wrapper simply calls F, and is
- // added to FnsToInstrument so that any instrumentation according to its
- // WrapperKind is done in the second pass below.
- FunctionType *NewFT = getInstrumentedABI() == IA_Args
- ? getArgsFunctionType(FT)
- : FT;
-
- // If the function being wrapped has local linkage, then preserve the
- // function's linkage in the wrapper function.
- GlobalValue::LinkageTypes wrapperLinkage =
- F.hasLocalLinkage()
- ? F.getLinkage()
- : GlobalValue::LinkOnceODRLinkage;
-
- Function *NewF = buildWrapperFunction(
- &F, std::string("dfsw$") + std::string(F.getName()),
- wrapperLinkage, NewFT);
- if (getInstrumentedABI() == IA_TLS)
- NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
-
- Value *WrappedFnCst =
- ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
- F.replaceAllUsesWith(WrappedFnCst);
-
- UnwrappedFnMap[WrappedFnCst] = &F;
- *i = NewF;
-
- if (!F.isDeclaration()) {
- // This function is probably defining an interposition of an
- // uninstrumented function and hence needs to keep the original ABI.
- // But any functions it may call need to use the instrumented ABI, so
- // we instrument it in a mode which preserves the original ABI.
- FnsWithNativeABI.insert(&F);
-
- // This code needs to rebuild the iterators, as they may be invalidated
- // by the push_back, taking care that the new range does not include
- // any functions added by this code.
- size_t N = i - FnsToInstrument.begin(),
- Count = e - FnsToInstrument.begin();
- FnsToInstrument.push_back(&F);
- i = FnsToInstrument.begin() + N;
- e = FnsToInstrument.begin() + Count;
- }
- // Hopefully, nobody will try to indirectly call a vararg
- // function... yet.
- } else if (FT->isVarArg()) {
- UnwrappedFnMap[&F] = &F;
- *i = nullptr;
- }
- }
-
- for (Function *i : FnsToInstrument) {
- if (!i || i->isDeclaration())
- continue;
-
- removeUnreachableBlocks(*i);
-
- DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
-
- // DFSanVisitor may create new basic blocks, which confuses df_iterator.
- // Build a copy of the list before iterating over it.
- SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
-
- for (BasicBlock *i : BBList) {
- Instruction *Inst = &i->front();
- while (true) {
- // DFSanVisitor may split the current basic block, changing the current
- // instruction's next pointer and moving the next instruction to the
- // tail block from which we should continue.
- Instruction *Next = Inst->getNextNode();
- // DFSanVisitor may delete Inst, so keep track of whether it was a
- // terminator.
- bool IsTerminator = Inst->isTerminator();
- if (!DFSF.SkipInsts.count(Inst))
- DFSanVisitor(DFSF).visit(Inst);
- if (IsTerminator)
- break;
- Inst = Next;
- }
- }
-
- // We will not necessarily be able to compute the shadow for every phi node
- // until we have visited every block. Therefore, the code that handles phi
- // nodes adds them to the PHIFixups list so that they can be properly
- // handled here.
- for (std::vector<std::pair<PHINode *, PHINode *>>::iterator
- i = DFSF.PHIFixups.begin(),
- e = DFSF.PHIFixups.end();
- i != e; ++i) {
- for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
- ++val) {
- i->second->setIncomingValue(
- val, DFSF.getShadow(i->first->getIncomingValue(val)));
- }
- }
-
- // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
- // places (i.e. instructions in basic blocks we haven't even begun visiting
- // yet). To make our life easier, do this work in a pass after the main
- // instrumentation.
- if (ClDebugNonzeroLabels) {
- for (Value *V : DFSF.NonZeroChecks) {
- Instruction *Pos;
- if (Instruction *I = dyn_cast<Instruction>(V))
- Pos = I->getNextNode();
- else
- Pos = &DFSF.F->getEntryBlock().front();
- while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
- Pos = Pos->getNextNode();
- IRBuilder<> IRB(Pos);
+ &i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanSetLabelFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanLoadCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanStoreCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanCmpCallbackFn.getCallee()->stripPointerCasts())
+ FnsToInstrument.push_back(&i);
+ }
+
+ // Give function aliases prefixes when necessary, and build wrappers where the
+ // instrumentedness is inconsistent.
+ for (Module::alias_iterator i = M.alias_begin(), e = M.alias_end(); i != e;) {
+ GlobalAlias *GA = &*i;
+ ++i;
+ // Don't stop on weak. We assume people aren't playing games with the
+ // instrumentedness of overridden weak aliases.
+ if (auto F = dyn_cast<Function>(GA->getBaseObject())) {
+ bool GAInst = isInstrumented(GA), FInst = isInstrumented(F);
+ if (GAInst && FInst) {
+ addGlobalNamePrefix(GA);
+ } else if (GAInst != FInst) {
+ // Non-instrumented alias of an instrumented function, or vice versa.
+ // Replace the alias with a native-ABI wrapper of the aliasee. The pass
+ // below will take care of instrumenting it.
+ Function *NewF =
+ buildWrapperFunction(F, "", GA->getLinkage(), F->getFunctionType());
+ GA->replaceAllUsesWith(ConstantExpr::getBitCast(NewF, GA->getType()));
+ NewF->takeName(GA);
+ GA->eraseFromParent();
+ FnsToInstrument.push_back(NewF);
+ }
+ }
+ }
+
+ ReadOnlyNoneAttrs.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone);
+
+ // First, change the ABI of every function in the module. ABI-listed
+ // functions keep their original ABI and get a wrapper function.
+ for (std::vector<Function *>::iterator i = FnsToInstrument.begin(),
+ e = FnsToInstrument.end();
+ i != e; ++i) {
+ Function &F = **i;
+ FunctionType *FT = F.getFunctionType();
+
+ bool IsZeroArgsVoidRet = (FT->getNumParams() == 0 && !FT->isVarArg() &&
+ FT->getReturnType()->isVoidTy());
+
+ if (isInstrumented(&F)) {
+ // Instrumented functions get a 'dfs$' prefix. This allows us to more
+ // easily identify cases of mismatching ABIs.
+ if (getInstrumentedABI() == IA_Args && !IsZeroArgsVoidRet) {
+ FunctionType *NewFT = getArgsFunctionType(FT);
+ Function *NewF = Function::Create(NewFT, F.getLinkage(),
+ F.getAddressSpace(), "", &M);
+ NewF->copyAttributesFrom(&F);
+ NewF->removeAttributes(
+ AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewFT->getReturnType()));
+ for (Function::arg_iterator FArg = F.arg_begin(),
+ NewFArg = NewF->arg_begin(),
+ FArgEnd = F.arg_end();
+ FArg != FArgEnd; ++FArg, ++NewFArg) {
+ FArg->replaceAllUsesWith(&*NewFArg);
+ }
+ NewF->getBasicBlockList().splice(NewF->begin(), F.getBasicBlockList());
+
+ for (Function::user_iterator UI = F.user_begin(), UE = F.user_end();
+ UI != UE;) {
+ BlockAddress *BA = dyn_cast<BlockAddress>(*UI);
+ ++UI;
+ if (BA) {
+ BA->replaceAllUsesWith(
+ BlockAddress::get(NewF, BA->getBasicBlock()));
+ delete BA;
+ }
+ }
+ F.replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT)));
+ NewF->takeName(&F);
+ F.eraseFromParent();
+ *i = NewF;
+ addGlobalNamePrefix(NewF);
+ } else {
+ addGlobalNamePrefix(&F);
+ }
+ } else if (!IsZeroArgsVoidRet || getWrapperKind(&F) == WK_Custom) {
+ // Build a wrapper function for F. The wrapper simply calls F, and is
+ // added to FnsToInstrument so that any instrumentation according to its
+ // WrapperKind is done in the second pass below.
+ FunctionType *NewFT = getInstrumentedABI() == IA_Args
+ ? getArgsFunctionType(FT)
+ : FT;
+
+ // If the function being wrapped has local linkage, then preserve the
+ // function's linkage in the wrapper function.
+ GlobalValue::LinkageTypes wrapperLinkage =
+ F.hasLocalLinkage()
+ ? F.getLinkage()
+ : GlobalValue::LinkOnceODRLinkage;
+
+ Function *NewF = buildWrapperFunction(
+ &F, std::string("dfsw$") + std::string(F.getName()),
+ wrapperLinkage, NewFT);
+ if (getInstrumentedABI() == IA_TLS)
+ NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
+
+ Value *WrappedFnCst =
+ ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
+ F.replaceAllUsesWith(WrappedFnCst);
+
+ UnwrappedFnMap[WrappedFnCst] = &F;
+ *i = NewF;
+
+ if (!F.isDeclaration()) {
+ // This function is probably defining an interposition of an
+ // uninstrumented function and hence needs to keep the original ABI.
+ // But any functions it may call need to use the instrumented ABI, so
+ // we instrument it in a mode which preserves the original ABI.
+ FnsWithNativeABI.insert(&F);
+
+ // This code needs to rebuild the iterators, as they may be invalidated
+ // by the push_back, taking care that the new range does not include
+ // any functions added by this code.
+ size_t N = i - FnsToInstrument.begin(),
+ Count = e - FnsToInstrument.begin();
+ FnsToInstrument.push_back(&F);
+ i = FnsToInstrument.begin() + N;
+ e = FnsToInstrument.begin() + Count;
+ }
+ // Hopefully, nobody will try to indirectly call a vararg
+ // function... yet.
+ } else if (FT->isVarArg()) {
+ UnwrappedFnMap[&F] = &F;
+ *i = nullptr;
+ }
+ }
+
+ for (Function *i : FnsToInstrument) {
+ if (!i || i->isDeclaration())
+ continue;
+
+ removeUnreachableBlocks(*i);
+
+ DFSanFunction DFSF(*this, i, FnsWithNativeABI.count(i));
+
+ // DFSanVisitor may create new basic blocks, which confuses df_iterator.
+ // Build a copy of the list before iterating over it.
+ SmallVector<BasicBlock *, 4> BBList(depth_first(&i->getEntryBlock()));
+
+ for (BasicBlock *i : BBList) {
+ Instruction *Inst = &i->front();
+ while (true) {
+ // DFSanVisitor may split the current basic block, changing the current
+ // instruction's next pointer and moving the next instruction to the
+ // tail block from which we should continue.
+ Instruction *Next = Inst->getNextNode();
+ // DFSanVisitor may delete Inst, so keep track of whether it was a
+ // terminator.
+ bool IsTerminator = Inst->isTerminator();
+ if (!DFSF.SkipInsts.count(Inst))
+ DFSanVisitor(DFSF).visit(Inst);
+ if (IsTerminator)
+ break;
+ Inst = Next;
+ }
+ }
+
+ // We will not necessarily be able to compute the shadow for every phi node
+ // until we have visited every block. Therefore, the code that handles phi
+ // nodes adds them to the PHIFixups list so that they can be properly
+ // handled here.
+ for (std::vector<std::pair<PHINode *, PHINode *>>::iterator
+ i = DFSF.PHIFixups.begin(),
+ e = DFSF.PHIFixups.end();
+ i != e; ++i) {
+ for (unsigned val = 0, n = i->first->getNumIncomingValues(); val != n;
+ ++val) {
+ i->second->setIncomingValue(
+ val, DFSF.getShadow(i->first->getIncomingValue(val)));
+ }
+ }
+
+ // -dfsan-debug-nonzero-labels will split the CFG in all kinds of crazy
+ // places (i.e. instructions in basic blocks we haven't even begun visiting
+ // yet). To make our life easier, do this work in a pass after the main
+ // instrumentation.
+ if (ClDebugNonzeroLabels) {
+ for (Value *V : DFSF.NonZeroChecks) {
+ Instruction *Pos;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Pos = I->getNextNode();
+ else
+ Pos = &DFSF.F->getEntryBlock().front();
+ while (isa<PHINode>(Pos) || isa<AllocaInst>(Pos))
+ Pos = Pos->getNextNode();
+ IRBuilder<> IRB(Pos);
Value *PrimitiveShadow = DFSF.collapseToPrimitiveShadow(V, Pos);
Value *Ne =
IRB.CreateICmpNE(PrimitiveShadow, DFSF.DFS.ZeroPrimitiveShadow);
- BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
- Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
- IRBuilder<> ThenIRB(BI);
- ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
- }
- }
- }
-
- return Changed || !FnsToInstrument.empty() ||
- M.global_size() != InitialGlobalSize || M.size() != InitialModuleSize;
-}
-
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, ColdCallWeights));
+ IRBuilder<> ThenIRB(BI);
+ ThenIRB.CreateCall(DFSF.DFS.DFSanNonzeroLabelFn, {});
+ }
+ }
+ }
+
+ return Changed || !FnsToInstrument.empty() ||
+ M.global_size() != InitialGlobalSize || M.size() != InitialModuleSize;
+}
+
Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
Value *Base = IRB.CreatePointerCast(DFS.ArgTLS, DFS.IntptrTy);
if (ArgOffset)
@@ -1310,12 +1310,12 @@ Value *DFSanFunction::getArgTLS(Type *T, unsigned ArgOffset, IRBuilder<> &IRB) {
return IRB.CreateIntToPtr(Base, PointerType::get(DFS.getShadowTy(T), 0),
"_dfsarg");
}
-
+
Value *DFSanFunction::getRetvalTLS(Type *T, IRBuilder<> &IRB) {
return IRB.CreatePointerCast(
DFS.RetvalTLS, PointerType::get(DFS.getShadowTy(T), 0), "_dfsret");
-}
-
+}
+
Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
unsigned ArgOffset = 0;
const DataLayout &DL = F->getParent()->getDataLayout();
@@ -1325,7 +1325,7 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
break;
continue;
}
-
+
unsigned Size = DL.getTypeAllocSize(DFS.getShadowTy(&FArg));
if (A != &FArg) {
ArgOffset += alignTo(Size, kShadowTLSAlignment);
@@ -1333,7 +1333,7 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
break; // ArgTLS overflows, uses a zero shadow.
continue;
}
-
+
if (ArgOffset + Size > kArgTLSSize)
break; // ArgTLS overflows, uses a zero shadow.
@@ -1345,224 +1345,224 @@ Value *DFSanFunction::getShadowForTLSArgument(Argument *A) {
}
return DFS.getZeroShadow(A);
-}
-
-Value *DFSanFunction::getShadow(Value *V) {
- if (!isa<Argument>(V) && !isa<Instruction>(V))
+}
+
+Value *DFSanFunction::getShadow(Value *V) {
+ if (!isa<Argument>(V) && !isa<Instruction>(V))
return DFS.getZeroShadow(V);
- Value *&Shadow = ValShadowMap[V];
- if (!Shadow) {
- if (Argument *A = dyn_cast<Argument>(V)) {
- if (IsNativeABI)
+ Value *&Shadow = ValShadowMap[V];
+ if (!Shadow) {
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ if (IsNativeABI)
return DFS.getZeroShadow(V);
- switch (IA) {
- case DataFlowSanitizer::IA_TLS: {
+ switch (IA) {
+ case DataFlowSanitizer::IA_TLS: {
Shadow = getShadowForTLSArgument(A);
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
- Function::arg_iterator i = F->arg_begin();
- while (ArgIdx--)
- ++i;
- Shadow = &*i;
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
+ Function::arg_iterator i = F->arg_begin();
+ while (ArgIdx--)
+ ++i;
+ Shadow = &*i;
assert(Shadow->getType() == DFS.PrimitiveShadowTy);
- break;
- }
- }
- NonZeroChecks.push_back(Shadow);
- } else {
+ break;
+ }
+ }
+ NonZeroChecks.push_back(Shadow);
+ } else {
Shadow = DFS.getZeroShadow(V);
- }
- }
- return Shadow;
-}
-
-void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
- assert(!ValShadowMap.count(I));
+ }
+ }
+ return Shadow;
+}
+
+void DFSanFunction::setShadow(Instruction *I, Value *Shadow) {
+ assert(!ValShadowMap.count(I));
assert(DFS.shouldTrackFieldsAndIndices() ||
Shadow->getType() == DFS.PrimitiveShadowTy);
- ValShadowMap[I] = Shadow;
-}
-
-Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
- assert(Addr != RetvalTLS && "Reinstrumenting?");
- IRBuilder<> IRB(Pos);
- Value *ShadowPtrMaskValue;
- if (DFSanRuntimeShadowMask)
- ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
- else
- ShadowPtrMaskValue = ShadowPtrMask;
- return IRB.CreateIntToPtr(
- IRB.CreateMul(
- IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
- IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
- ShadowPtrMul),
+ ValShadowMap[I] = Shadow;
+}
+
+Value *DataFlowSanitizer::getShadowAddress(Value *Addr, Instruction *Pos) {
+ assert(Addr != RetvalTLS && "Reinstrumenting?");
+ IRBuilder<> IRB(Pos);
+ Value *ShadowPtrMaskValue;
+ if (DFSanRuntimeShadowMask)
+ ShadowPtrMaskValue = IRB.CreateLoad(IntptrTy, ExternalShadowMask);
+ else
+ ShadowPtrMaskValue = ShadowPtrMask;
+ return IRB.CreateIntToPtr(
+ IRB.CreateMul(
+ IRB.CreateAnd(IRB.CreatePtrToInt(Addr, IntptrTy),
+ IRB.CreatePtrToInt(ShadowPtrMaskValue, IntptrTy)),
+ ShadowPtrMul),
PrimitiveShadowPtrTy);
-}
-
+}
+
Value *DFSanFunction::combineShadowsThenConvert(Type *T, Value *V1, Value *V2,
Instruction *Pos) {
Value *PrimitiveValue = combineShadows(V1, V2, Pos);
return expandFromPrimitiveShadow(T, PrimitiveValue, Pos);
}
-// Generates IR to compute the union of the two given shadows, inserting it
+// Generates IR to compute the union of the two given shadows, inserting it
 // before Pos. The combined value has primitive type.
-Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
+Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
if (DFS.isZeroShadow(V1))
return collapseToPrimitiveShadow(V2, Pos);
if (DFS.isZeroShadow(V2))
return collapseToPrimitiveShadow(V1, Pos);
- if (V1 == V2)
+ if (V1 == V2)
return collapseToPrimitiveShadow(V1, Pos);
-
- auto V1Elems = ShadowElements.find(V1);
- auto V2Elems = ShadowElements.find(V2);
- if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
- if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
- V2Elems->second.begin(), V2Elems->second.end())) {
+
+ auto V1Elems = ShadowElements.find(V1);
+ auto V2Elems = ShadowElements.find(V2);
+ if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) {
+ if (std::includes(V1Elems->second.begin(), V1Elems->second.end(),
+ V2Elems->second.begin(), V2Elems->second.end())) {
return collapseToPrimitiveShadow(V1, Pos);
- } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
- V1Elems->second.begin(), V1Elems->second.end())) {
+ } else if (std::includes(V2Elems->second.begin(), V2Elems->second.end(),
+ V1Elems->second.begin(), V1Elems->second.end())) {
return collapseToPrimitiveShadow(V2, Pos);
- }
- } else if (V1Elems != ShadowElements.end()) {
- if (V1Elems->second.count(V2))
+ }
+ } else if (V1Elems != ShadowElements.end()) {
+ if (V1Elems->second.count(V2))
return collapseToPrimitiveShadow(V1, Pos);
- } else if (V2Elems != ShadowElements.end()) {
- if (V2Elems->second.count(V1))
+ } else if (V2Elems != ShadowElements.end()) {
+ if (V2Elems->second.count(V1))
return collapseToPrimitiveShadow(V2, Pos);
- }
-
- auto Key = std::make_pair(V1, V2);
- if (V1 > V2)
- std::swap(Key.first, Key.second);
+ }
+
+ auto Key = std::make_pair(V1, V2);
+ if (V1 > V2)
+ std::swap(Key.first, Key.second);
CachedShadow &CCS = CachedShadows[Key];
- if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
- return CCS.Shadow;
-
+ if (CCS.Block && DT.dominates(CCS.Block, Pos->getParent()))
+ return CCS.Shadow;
+
  // Converts input shadows to shadows with primitive types.
Value *PV1 = collapseToPrimitiveShadow(V1, Pos);
Value *PV2 = collapseToPrimitiveShadow(V2, Pos);
- IRBuilder<> IRB(Pos);
+ IRBuilder<> IRB(Pos);
if (ClFast16Labels) {
CCS.Block = Pos->getParent();
CCS.Shadow = IRB.CreateOr(PV1, PV2);
} else if (AvoidNewBlocks) {
CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {PV1, PV2});
- Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- Call->addParamAttr(0, Attribute::ZExt);
- Call->addParamAttr(1, Attribute::ZExt);
-
- CCS.Block = Pos->getParent();
- CCS.Shadow = Call;
- } else {
- BasicBlock *Head = Pos->getParent();
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ CCS.Block = Pos->getParent();
+ CCS.Shadow = Call;
+ } else {
+ BasicBlock *Head = Pos->getParent();
Value *Ne = IRB.CreateICmpNE(PV1, PV2);
- BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
- Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
- IRBuilder<> ThenIRB(BI);
+ BranchInst *BI = cast<BranchInst>(SplitBlockAndInsertIfThen(
+ Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
+ IRBuilder<> ThenIRB(BI);
CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {PV1, PV2});
- Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- Call->addParamAttr(0, Attribute::ZExt);
- Call->addParamAttr(1, Attribute::ZExt);
-
- BasicBlock *Tail = BI->getSuccessor(0);
+ Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ Call->addParamAttr(0, Attribute::ZExt);
+ Call->addParamAttr(1, Attribute::ZExt);
+
+ BasicBlock *Tail = BI->getSuccessor(0);
PHINode *Phi =
PHINode::Create(DFS.PrimitiveShadowTy, 2, "", &Tail->front());
- Phi->addIncoming(Call, Call->getParent());
+ Phi->addIncoming(Call, Call->getParent());
Phi->addIncoming(PV1, Head);
-
- CCS.Block = Tail;
- CCS.Shadow = Phi;
- }
-
- std::set<Value *> UnionElems;
- if (V1Elems != ShadowElements.end()) {
- UnionElems = V1Elems->second;
- } else {
- UnionElems.insert(V1);
- }
- if (V2Elems != ShadowElements.end()) {
- UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
- } else {
- UnionElems.insert(V2);
- }
- ShadowElements[CCS.Shadow] = std::move(UnionElems);
-
- return CCS.Shadow;
-}
-
-// A convenience function which folds the shadows of each of the operands
-// of the provided instruction Inst, inserting the IR before Inst. Returns
-// the computed union Value.
-Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
- if (Inst->getNumOperands() == 0)
+
+ CCS.Block = Tail;
+ CCS.Shadow = Phi;
+ }
+
+ std::set<Value *> UnionElems;
+ if (V1Elems != ShadowElements.end()) {
+ UnionElems = V1Elems->second;
+ } else {
+ UnionElems.insert(V1);
+ }
+ if (V2Elems != ShadowElements.end()) {
+ UnionElems.insert(V2Elems->second.begin(), V2Elems->second.end());
+ } else {
+ UnionElems.insert(V2);
+ }
+ ShadowElements[CCS.Shadow] = std::move(UnionElems);
+
+ return CCS.Shadow;
+}
+
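Besides the dominator-tree cache, combineShadows keeps ShadowElements, a map from each computed union to the set of original shadows it already covers, so that a redundant union (one operand's set containing the other's) can return an existing value instead of emitting new IR. A small standalone sketch of that subset test, with illustrative container and element types:

#include <algorithm>
#include <set>
#include <string>

using ShadowSet = std::set<std::string>;

// True when every element of B is already covered by A; in that case
// "A union B" can simply reuse the shadow that produced A.
bool unionIsRedundant(const ShadowSet &A, const ShadowSet &B) {
  return std::includes(A.begin(), A.end(), B.begin(), B.end());
}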
+// A convenience function which folds the shadows of each of the operands
+// of the provided instruction Inst, inserting the IR before Inst. Returns
+// the computed union Value.
+Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
+ if (Inst->getNumOperands() == 0)
return DFS.getZeroShadow(Inst);
-
- Value *Shadow = getShadow(Inst->getOperand(0));
- for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
- Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
- }
+
+ Value *Shadow = getShadow(Inst->getOperand(0));
+ for (unsigned i = 1, n = Inst->getNumOperands(); i != n; ++i) {
+ Shadow = combineShadows(Shadow, getShadow(Inst->getOperand(i)), Inst);
+ }
return expandFromPrimitiveShadow(Inst->getType(), Shadow, Inst);
-}
-
-Value *DFSanVisitor::visitOperandShadowInst(Instruction &I) {
- Value *CombinedShadow = DFSF.combineOperandShadows(&I);
- DFSF.setShadow(&I, CombinedShadow);
- return CombinedShadow;
-}
-
-// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
+}
+
+Value *DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+ Value *CombinedShadow = DFSF.combineOperandShadows(&I);
+ DFSF.setShadow(&I, CombinedShadow);
+ return CombinedShadow;
+}
+
+// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
// Addr has alignment Align, and take the union of each of those shadows. The
// returned shadow always has primitive type.
-Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
- Instruction *Pos) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
- const auto i = AllocaShadowMap.find(AI);
- if (i != AllocaShadowMap.end()) {
- IRBuilder<> IRB(Pos);
+Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
+ Instruction *Pos) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ const auto i = AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
return IRB.CreateLoad(DFS.PrimitiveShadowTy, i->second);
- }
- }
-
- const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
- SmallVector<const Value *, 2> Objs;
+ }
+ }
+
+ const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
+ SmallVector<const Value *, 2> Objs;
getUnderlyingObjects(Addr, Objs);
- bool AllConstants = true;
- for (const Value *Obj : Objs) {
- if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
- continue;
- if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
- continue;
-
- AllConstants = false;
- break;
- }
- if (AllConstants)
+ bool AllConstants = true;
+ for (const Value *Obj : Objs) {
+ if (isa<Function>(Obj) || isa<BlockAddress>(Obj))
+ continue;
+ if (isa<GlobalVariable>(Obj) && cast<GlobalVariable>(Obj)->isConstant())
+ continue;
+
+ AllConstants = false;
+ break;
+ }
+ if (AllConstants)
return DFS.ZeroPrimitiveShadow;
-
- Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
- switch (Size) {
- case 0:
+
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ switch (Size) {
+ case 0:
return DFS.ZeroPrimitiveShadow;
- case 1: {
+ case 1: {
LoadInst *LI = new LoadInst(DFS.PrimitiveShadowTy, ShadowAddr, "", Pos);
- LI->setAlignment(ShadowAlign);
- return LI;
- }
- case 2: {
- IRBuilder<> IRB(Pos);
+ LI->setAlignment(ShadowAlign);
+ return LI;
+ }
+ case 2: {
+ IRBuilder<> IRB(Pos);
Value *ShadowAddr1 = IRB.CreateGEP(DFS.PrimitiveShadowTy, ShadowAddr,
- ConstantInt::get(DFS.IntptrTy, 1));
- return combineShadows(
+ ConstantInt::get(DFS.IntptrTy, 1));
+ return combineShadows(
IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr, ShadowAlign),
IRB.CreateAlignedLoad(DFS.PrimitiveShadowTy, ShadowAddr1, ShadowAlign),
Pos);
- }
- }
+ }
+ }
if (ClFast16Labels && Size % (64 / DFS.ShadowWidthBits) == 0) {
// First OR all the WideShadows, then OR individual shadows within the
@@ -1587,226 +1587,226 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
return IRB.CreateTrunc(CombinedWideShadow, DFS.PrimitiveShadowTy);
}
- if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) {
- // Fast path for the common case where each byte has identical shadow: load
- // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
- // shadow is non-equal.
- BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
- IRBuilder<> FallbackIRB(FallbackBB);
- CallInst *FallbackCall = FallbackIRB.CreateCall(
- DFS.DFSanUnionLoadFn,
- {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
-
- // Compare each of the shadows stored in the loaded 64 bits to each other,
- // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
- IRBuilder<> IRB(Pos);
- Value *WideAddr =
- IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
- Value *WideShadow =
- IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
+ if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) {
+ // Fast path for the common case where each byte has identical shadow: load
+ // shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
+ // shadow is non-equal.
+ BasicBlock *FallbackBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ IRBuilder<> FallbackIRB(FallbackBB);
+ CallInst *FallbackCall = FallbackIRB.CreateCall(
+ DFS.DFSanUnionLoadFn,
+ {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+
+ // Compare each of the shadows stored in the loaded 64 bits to each other,
+ // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
+ IRBuilder<> IRB(Pos);
+ Value *WideAddr =
+ IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
+ Value *WideShadow =
+ IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.PrimitiveShadowTy);
- Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
- Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
- Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
- Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
-
- BasicBlock *Head = Pos->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
-
- if (DomTreeNode *OldNode = DT.getNode(Head)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
- for (auto Child : Children)
- DT.changeImmediateDominator(Child, NewNode);
- }
-
- // In the following code LastBr will refer to the previous basic block's
- // conditional branch instruction, whose true successor is fixed up to point
- // to the next block during the loop below or to the tail after the final
- // iteration.
- BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
- ReplaceInstWithInst(Head->getTerminator(), LastBr);
- DT.addNewBlock(FallbackBB, Head);
-
- for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
- Ofs += 64 / DFS.ShadowWidthBits) {
- BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
- DT.addNewBlock(NextBB, LastBr->getParent());
- IRBuilder<> NextIRB(NextBB);
- WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
- ConstantInt::get(DFS.IntptrTy, 1));
- Value *NextWideShadow = NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(),
- WideAddr, ShadowAlign);
- ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
- LastBr->setSuccessor(0, NextBB);
- LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
- }
-
- LastBr->setSuccessor(0, Tail);
- FallbackIRB.CreateBr(Tail);
+ Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
+ Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
+ Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
+ Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
+
+ BasicBlock *Head = Pos->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(Pos->getIterator());
+
+ if (DomTreeNode *OldNode = DT.getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT.addNewBlock(Tail, Head);
+ for (auto Child : Children)
+ DT.changeImmediateDominator(Child, NewNode);
+ }
+
+ // In the following code LastBr will refer to the previous basic block's
+ // conditional branch instruction, whose true successor is fixed up to point
+ // to the next block during the loop below or to the tail after the final
+ // iteration.
+ BranchInst *LastBr = BranchInst::Create(FallbackBB, FallbackBB, ShadowsEq);
+ ReplaceInstWithInst(Head->getTerminator(), LastBr);
+ DT.addNewBlock(FallbackBB, Head);
+
+ for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
+ Ofs += 64 / DFS.ShadowWidthBits) {
+ BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
+ DT.addNewBlock(NextBB, LastBr->getParent());
+ IRBuilder<> NextIRB(NextBB);
+ WideAddr = NextIRB.CreateGEP(Type::getInt64Ty(*DFS.Ctx), WideAddr,
+ ConstantInt::get(DFS.IntptrTy, 1));
+ Value *NextWideShadow = NextIRB.CreateAlignedLoad(NextIRB.getInt64Ty(),
+ WideAddr, ShadowAlign);
+ ShadowsEq = NextIRB.CreateICmpEQ(WideShadow, NextWideShadow);
+ LastBr->setSuccessor(0, NextBB);
+ LastBr = NextIRB.CreateCondBr(ShadowsEq, FallbackBB, FallbackBB);
+ }
+
+ LastBr->setSuccessor(0, Tail);
+ FallbackIRB.CreateBr(Tail);
PHINode *Shadow =
PHINode::Create(DFS.PrimitiveShadowTy, 2, "", &Tail->front());
- Shadow->addIncoming(FallbackCall, FallbackBB);
- Shadow->addIncoming(TruncShadow, LastBr->getParent());
- return Shadow;
- }
-
- IRBuilder<> IRB(Pos);
+ Shadow->addIncoming(FallbackCall, FallbackBB);
+ Shadow->addIncoming(TruncShadow, LastBr->getParent());
+ return Shadow;
+ }
+
+ IRBuilder<> IRB(Pos);
FunctionCallee &UnionLoadFn =
ClFast16Labels ? DFS.DFSanUnionLoadFast16LabelsFn : DFS.DFSanUnionLoadFn;
- CallInst *FallbackCall = IRB.CreateCall(
+ CallInst *FallbackCall = IRB.CreateCall(
UnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
- FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
- return FallbackCall;
-}
-
-void DFSanVisitor::visitLoadInst(LoadInst &LI) {
- auto &DL = LI.getModule()->getDataLayout();
- uint64_t Size = DL.getTypeStoreSize(LI.getType());
- if (Size == 0) {
+ FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
+ return FallbackCall;
+}
+
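A standalone sketch (not part of the pass) of the lane-equality trick used in the fast path above: all ShadowWidthBits-wide shadows packed into a 64-bit word are identical exactly when the word equals its rotation by the lane width, mirroring the (WideShadow rotl ShadowWidthBits) == WideShadow comparison. The 16-bit width is an assumption for illustration.

    #include <cassert>
    #include <cstdint>

    // Valid for 0 < ShadowWidthBits < 64.
    static bool allLanesEqual(uint64_t WideShadow, unsigned ShadowWidthBits) {
      uint64_t Rot = (WideShadow << ShadowWidthBits) |
                     (WideShadow >> (64 - ShadowWidthBits));
      return Rot == WideShadow;
    }

    int main() {
      assert(allLanesEqual(0x0001000100010001ULL, 16));   // four equal 16-bit labels
      assert(!allLanesEqual(0x0001000100010002ULL, 16));  // one lane differs
      return 0;
    }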
+void DFSanVisitor::visitLoadInst(LoadInst &LI) {
+ auto &DL = LI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ if (Size == 0) {
DFSF.setShadow(&LI, DFSF.DFS.getZeroShadow(&LI));
- return;
- }
-
- Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
+ return;
+ }
+
+ Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
Value *PrimitiveShadow =
- DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), &LI);
- if (ClCombinePointerLabelsOnLoad) {
- Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
+ DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), &LI);
+ if (ClCombinePointerLabelsOnLoad) {
+ Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
PrimitiveShadow = DFSF.combineShadows(PrimitiveShadow, PtrShadow, &LI);
- }
+ }
if (!DFSF.DFS.isZeroShadow(PrimitiveShadow))
DFSF.NonZeroChecks.push_back(PrimitiveShadow);
-
+
Value *Shadow =
DFSF.expandFromPrimitiveShadow(LI.getType(), PrimitiveShadow, &LI);
- DFSF.setShadow(&LI, Shadow);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&LI);
+ DFSF.setShadow(&LI, Shadow);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&LI);
Value *Addr8 = IRB.CreateBitCast(LI.getPointerOperand(), DFSF.DFS.Int8Ptr);
IRB.CreateCall(DFSF.DFS.DFSanLoadCallbackFn, {PrimitiveShadow, Addr8});
- }
-}
-
+ }
+}
+
void DFSanFunction::storePrimitiveShadow(Value *Addr, uint64_t Size,
Align Alignment,
Value *PrimitiveShadow,
Instruction *Pos) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
- const auto i = AllocaShadowMap.find(AI);
- if (i != AllocaShadowMap.end()) {
- IRBuilder<> IRB(Pos);
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
+ const auto i = AllocaShadowMap.find(AI);
+ if (i != AllocaShadowMap.end()) {
+ IRBuilder<> IRB(Pos);
IRB.CreateStore(PrimitiveShadow, i->second);
- return;
- }
- }
-
- const Align ShadowAlign(Alignment.value() * DFS.ShadowWidthBytes);
- IRBuilder<> IRB(Pos);
- Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
+ return;
+ }
+ }
+
+ const Align ShadowAlign(Alignment.value() * DFS.ShadowWidthBytes);
+ IRBuilder<> IRB(Pos);
+ Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
if (DFS.isZeroShadow(PrimitiveShadow)) {
- IntegerType *ShadowTy =
- IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidthBits);
- Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
- Value *ExtShadowAddr =
- IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
- IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
- return;
- }
-
- const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
- uint64_t Offset = 0;
- if (Size >= ShadowVecSize) {
+ IntegerType *ShadowTy =
+ IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidthBits);
+ Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
+ Value *ExtShadowAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
+ IRB.CreateAlignedStore(ExtZeroShadow, ExtShadowAddr, ShadowAlign);
+ return;
+ }
+
+ const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
+ uint64_t Offset = 0;
+ if (Size >= ShadowVecSize) {
auto *ShadowVecTy =
FixedVectorType::get(DFS.PrimitiveShadowTy, ShadowVecSize);
- Value *ShadowVec = UndefValue::get(ShadowVecTy);
- for (unsigned i = 0; i != ShadowVecSize; ++i) {
- ShadowVec = IRB.CreateInsertElement(
+ Value *ShadowVec = UndefValue::get(ShadowVecTy);
+ for (unsigned i = 0; i != ShadowVecSize; ++i) {
+ ShadowVec = IRB.CreateInsertElement(
ShadowVec, PrimitiveShadow,
ConstantInt::get(Type::getInt32Ty(*DFS.Ctx), i));
- }
- Value *ShadowVecAddr =
- IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
- do {
- Value *CurShadowVecAddr =
- IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
- IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
- Size -= ShadowVecSize;
- ++Offset;
- } while (Size >= ShadowVecSize);
- Offset *= ShadowVecSize;
- }
- while (Size > 0) {
- Value *CurShadowAddr =
+ }
+ Value *ShadowVecAddr =
+ IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowVecTy));
+ do {
+ Value *CurShadowVecAddr =
+ IRB.CreateConstGEP1_32(ShadowVecTy, ShadowVecAddr, Offset);
+ IRB.CreateAlignedStore(ShadowVec, CurShadowVecAddr, ShadowAlign);
+ Size -= ShadowVecSize;
+ ++Offset;
+ } while (Size >= ShadowVecSize);
+ Offset *= ShadowVecSize;
+ }
+ while (Size > 0) {
+ Value *CurShadowAddr =
IRB.CreateConstGEP1_32(DFS.PrimitiveShadowTy, ShadowAddr, Offset);
IRB.CreateAlignedStore(PrimitiveShadow, CurShadowAddr, ShadowAlign);
- --Size;
- ++Offset;
- }
-}
-
-void DFSanVisitor::visitStoreInst(StoreInst &SI) {
- auto &DL = SI.getModule()->getDataLayout();
- uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
- if (Size == 0)
- return;
-
- const Align Alignment = ClPreserveAlignment ? SI.getAlign() : Align(1);
-
- Value* Shadow = DFSF.getShadow(SI.getValueOperand());
+ --Size;
+ ++Offset;
+ }
+}
+
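A standalone sketch of the chunking arithmetic in storePrimitiveShadow above: with an assumed 16-bit shadow width, ShadowVecSize is 128 / 16 == 8, so Size bytes of application memory are covered by Size / 8 vector stores followed by Size % 8 scalar stores of the same primitive shadow.

    #include <cassert>
    #include <cstdint>

    int main() {
      const unsigned ShadowWidthBits = 16;                   // assumed label width
      const unsigned ShadowVecSize = 128 / ShadowWidthBits;  // 8 shadows per vector store
      uint64_t Size = 19, Offset = 0, VecStores = 0, ScalarStores = 0;
      while (Size >= ShadowVecSize) {  // vectorized part (the do/while above)
        ++VecStores;
        Size -= ShadowVecSize;
        ++Offset;
      }
      Offset *= ShadowVecSize;         // convert chunk count to an element index
      while (Size > 0) {               // scalar tail
        ++ScalarStores;
        --Size;
        ++Offset;
      }
      assert(VecStores == 2 && ScalarStores == 3 && Offset == 19);
      return 0;
    }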
+void DFSanVisitor::visitStoreInst(StoreInst &SI) {
+ auto &DL = SI.getModule()->getDataLayout();
+ uint64_t Size = DL.getTypeStoreSize(SI.getValueOperand()->getType());
+ if (Size == 0)
+ return;
+
+ const Align Alignment = ClPreserveAlignment ? SI.getAlign() : Align(1);
+
+ Value* Shadow = DFSF.getShadow(SI.getValueOperand());
Value *PrimitiveShadow;
- if (ClCombinePointerLabelsOnStore) {
- Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
+ if (ClCombinePointerLabelsOnStore) {
+ Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
PrimitiveShadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
} else {
PrimitiveShadow = DFSF.collapseToPrimitiveShadow(Shadow, &SI);
- }
+ }
DFSF.storePrimitiveShadow(SI.getPointerOperand(), Size, Alignment,
PrimitiveShadow, &SI);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&SI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&SI);
Value *Addr8 = IRB.CreateBitCast(SI.getPointerOperand(), DFSF.DFS.Int8Ptr);
IRB.CreateCall(DFSF.DFS.DFSanStoreCallbackFn, {PrimitiveShadow, Addr8});
- }
-}
-
-void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) {
- visitOperandShadowInst(UO);
-}
-
-void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
- visitOperandShadowInst(BO);
-}
-
-void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
-
-void DFSanVisitor::visitCmpInst(CmpInst &CI) {
- Value *CombinedShadow = visitOperandShadowInst(CI);
- if (ClEventCallbacks) {
- IRBuilder<> IRB(&CI);
- IRB.CreateCall(DFSF.DFS.DFSanCmpCallbackFn, CombinedShadow);
- }
-}
-
-void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- visitOperandShadowInst(GEPI);
-}
-
-void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
- visitOperandShadowInst(I);
-}
-
-void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
+ }
+}
+
+void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) {
+ visitOperandShadowInst(UO);
+}
+
+void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ visitOperandShadowInst(BO);
+}
+
+void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
+
+void DFSanVisitor::visitCmpInst(CmpInst &CI) {
+ Value *CombinedShadow = visitOperandShadowInst(CI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&CI);
+ IRB.CreateCall(DFSF.DFS.DFSanCmpCallbackFn, CombinedShadow);
+ }
+}
+
+void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ visitOperandShadowInst(GEPI);
+}
+
+void DFSanVisitor::visitExtractElementInst(ExtractElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitInsertElementInst(InsertElementInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitShuffleVectorInst(ShuffleVectorInst &I) {
+ visitOperandShadowInst(I);
+}
+
+void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
visitOperandShadowInst(I);
return;
@@ -1817,9 +1817,9 @@ void DFSanVisitor::visitExtractValueInst(ExtractValueInst &I) {
Value *AggShadow = DFSF.getShadow(Agg);
Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
DFSF.setShadow(&I, ResShadow);
-}
-
-void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
+}
+
+void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
if (!DFSF.DFS.shouldTrackFieldsAndIndices()) {
visitOperandShadowInst(I);
return;
@@ -1830,93 +1830,93 @@ void DFSanVisitor::visitInsertValueInst(InsertValueInst &I) {
Value *InsShadow = DFSF.getShadow(I.getInsertedValueOperand());
Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
DFSF.setShadow(&I, Res);
-}
-
-void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
- bool AllLoadsStores = true;
- for (User *U : I.users()) {
- if (isa<LoadInst>(U))
- continue;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getPointerOperand() == &I)
- continue;
- }
-
- AllLoadsStores = false;
- break;
- }
- if (AllLoadsStores) {
- IRBuilder<> IRB(&I);
+}
+
+void DFSanVisitor::visitAllocaInst(AllocaInst &I) {
+ bool AllLoadsStores = true;
+ for (User *U : I.users()) {
+ if (isa<LoadInst>(U))
+ continue;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getPointerOperand() == &I)
+ continue;
+ }
+
+ AllLoadsStores = false;
+ break;
+ }
+ if (AllLoadsStores) {
+ IRBuilder<> IRB(&I);
DFSF.AllocaShadowMap[&I] = IRB.CreateAlloca(DFSF.DFS.PrimitiveShadowTy);
- }
+ }
DFSF.setShadow(&I, DFSF.DFS.ZeroPrimitiveShadow);
-}
-
-void DFSanVisitor::visitSelectInst(SelectInst &I) {
- Value *CondShadow = DFSF.getShadow(I.getCondition());
- Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
- Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
+}
+
+void DFSanVisitor::visitSelectInst(SelectInst &I) {
+ Value *CondShadow = DFSF.getShadow(I.getCondition());
+ Value *TrueShadow = DFSF.getShadow(I.getTrueValue());
+ Value *FalseShadow = DFSF.getShadow(I.getFalseValue());
Value *ShadowSel = nullptr;
-
- if (isa<VectorType>(I.getCondition()->getType())) {
+
+ if (isa<VectorType>(I.getCondition()->getType())) {
ShadowSel = DFSF.combineShadowsThenConvert(I.getType(), TrueShadow,
FalseShadow, &I);
- } else {
- if (TrueShadow == FalseShadow) {
- ShadowSel = TrueShadow;
- } else {
- ShadowSel =
- SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
- }
- }
+ } else {
+ if (TrueShadow == FalseShadow) {
+ ShadowSel = TrueShadow;
+ } else {
+ ShadowSel =
+ SelectInst::Create(I.getCondition(), TrueShadow, FalseShadow, "", &I);
+ }
+ }
DFSF.setShadow(&I, ClTrackSelectControlFlow
? DFSF.combineShadowsThenConvert(
I.getType(), CondShadow, ShadowSel, &I)
: ShadowSel);
-}
-
-void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
- IRBuilder<> IRB(&I);
- Value *ValShadow = DFSF.getShadow(I.getValue());
- IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
- {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
- *DFSF.DFS.Ctx)),
- IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
-}
-
-void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
- IRBuilder<> IRB(&I);
- Value *RawDestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
- Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
- Value *LenShadow =
- IRB.CreateMul(I.getLength(), ConstantInt::get(I.getLength()->getType(),
- DFSF.DFS.ShadowWidthBytes));
- Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
- Value *DestShadow = IRB.CreateBitCast(RawDestShadow, Int8Ptr);
- SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
- auto *MTI = cast<MemTransferInst>(
- IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
- {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
- if (ClPreserveAlignment) {
- MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes);
- MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes);
- } else {
- MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes));
- MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes));
- }
- if (ClEventCallbacks) {
- IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn,
- {RawDestShadow, I.getLength()});
- }
-}
-
-void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
- if (!DFSF.IsNativeABI && RI.getReturnValue()) {
- switch (DFSF.IA) {
- case DataFlowSanitizer::IA_TLS: {
- Value *S = DFSF.getShadow(RI.getReturnValue());
- IRBuilder<> IRB(&RI);
+}
+
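A toy model of the select-shadow policy in visitSelectInst above, assuming for illustration that labels are small bitmasks whose union is bitwise OR (as in the ClFast16Labels mode referenced earlier); the generic mode unions labels through the runtime instead. The select's shadow is the chosen arm's shadow, unioned with the condition's shadow only when ClTrackSelectControlFlow is enabled.

    #include <cassert>
    #include <cstdint>

    static uint16_t selectShadow(bool Cond, uint16_t CondShadow, uint16_t TrueShadow,
                                 uint16_t FalseShadow, bool TrackSelectControlFlow) {
      uint16_t Sel = (TrueShadow == FalseShadow) ? TrueShadow
                                                 : (Cond ? TrueShadow : FalseShadow);
      return TrackSelectControlFlow ? uint16_t(CondShadow | Sel) : Sel;
    }

    int main() {
      assert(selectShadow(true, 0x4, 0x1, 0x2, true) == 0x5);   // condition label folded in
      assert(selectShadow(true, 0x4, 0x1, 0x2, false) == 0x1);  // condition label ignored
      return 0;
    }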
+void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *ValShadow = DFSF.getShadow(I.getValue());
+ IRB.CreateCall(DFSF.DFS.DFSanSetLabelFn,
+ {ValShadow, IRB.CreateBitCast(I.getDest(), Type::getInt8PtrTy(
+ *DFSF.DFS.Ctx)),
+ IRB.CreateZExtOrTrunc(I.getLength(), DFSF.DFS.IntptrTy)});
+}
+
+void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *RawDestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+ Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
+ Value *LenShadow =
+ IRB.CreateMul(I.getLength(), ConstantInt::get(I.getLength()->getType(),
+ DFSF.DFS.ShadowWidthBytes));
+ Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
+ Value *DestShadow = IRB.CreateBitCast(RawDestShadow, Int8Ptr);
+ SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
+ auto *MTI = cast<MemTransferInst>(
+ IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
+ {DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
+ if (ClPreserveAlignment) {
+ MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes);
+ MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes);
+ } else {
+ MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ }
+ if (ClEventCallbacks) {
+ IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn,
+ {RawDestShadow, I.getLength()});
+ }
+}
+
+void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
+ if (!DFSF.IsNativeABI && RI.getReturnValue()) {
+ switch (DFSF.IA) {
+ case DataFlowSanitizer::IA_TLS: {
+ Value *S = DFSF.getShadow(RI.getReturnValue());
+ IRBuilder<> IRB(&RI);
Type *RT = DFSF.F->getFunctionType()->getReturnType();
unsigned Size =
getDataLayout().getTypeAllocSize(DFSF.DFS.getShadowTy(RT));
@@ -1926,166 +1926,166 @@ void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
IRB.CreateAlignedStore(S, DFSF.getRetvalTLS(RT, IRB),
kShadowTLSAlignment);
}
- break;
- }
- case DataFlowSanitizer::IA_Args: {
- IRBuilder<> IRB(&RI);
- Type *RT = DFSF.F->getFunctionType()->getReturnType();
- Value *InsVal =
- IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
- Value *InsShadow =
- IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
- RI.setOperand(0, InsShadow);
- break;
- }
- }
- }
-}
-
-void DFSanVisitor::visitCallBase(CallBase &CB) {
- Function *F = CB.getCalledFunction();
- if ((F && F->isIntrinsic()) || CB.isInlineAsm()) {
- visitOperandShadowInst(CB);
- return;
- }
-
- // Calls to this function are synthesized in wrappers, and we shouldn't
- // instrument them.
- if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
- return;
-
- IRBuilder<> IRB(&CB);
-
- DenseMap<Value *, Function *>::iterator i =
- DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand());
- if (i != DFSF.DFS.UnwrappedFnMap.end()) {
- Function *F = i->second;
- switch (DFSF.DFS.getWrapperKind(F)) {
- case DataFlowSanitizer::WK_Warning:
- CB.setCalledFunction(F);
- IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
- IRB.CreateGlobalStringPtr(F->getName()));
+ break;
+ }
+ case DataFlowSanitizer::IA_Args: {
+ IRBuilder<> IRB(&RI);
+ Type *RT = DFSF.F->getFunctionType()->getReturnType();
+ Value *InsVal =
+ IRB.CreateInsertValue(UndefValue::get(RT), RI.getReturnValue(), 0);
+ Value *InsShadow =
+ IRB.CreateInsertValue(InsVal, DFSF.getShadow(RI.getReturnValue()), 1);
+ RI.setOperand(0, InsShadow);
+ break;
+ }
+ }
+ }
+}
+
+void DFSanVisitor::visitCallBase(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ if ((F && F->isIntrinsic()) || CB.isInlineAsm()) {
+ visitOperandShadowInst(CB);
+ return;
+ }
+
+ // Calls to this function are synthesized in wrappers, and we shouldn't
+ // instrument them.
+ if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
+ return;
+
+ IRBuilder<> IRB(&CB);
+
+ DenseMap<Value *, Function *>::iterator i =
+ DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand());
+ if (i != DFSF.DFS.UnwrappedFnMap.end()) {
+ Function *F = i->second;
+ switch (DFSF.DFS.getWrapperKind(F)) {
+ case DataFlowSanitizer::WK_Warning:
+ CB.setCalledFunction(F);
+ IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
+ IRB.CreateGlobalStringPtr(F->getName()));
DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
- return;
- case DataFlowSanitizer::WK_Discard:
- CB.setCalledFunction(F);
+ return;
+ case DataFlowSanitizer::WK_Discard:
+ CB.setCalledFunction(F);
DFSF.setShadow(&CB, DFSF.DFS.getZeroShadow(&CB));
- return;
- case DataFlowSanitizer::WK_Functional:
- CB.setCalledFunction(F);
- visitOperandShadowInst(CB);
- return;
- case DataFlowSanitizer::WK_Custom:
- // Don't try to handle invokes of custom functions, it's too complicated.
- // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
- // wrapper.
- if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
- FunctionType *FT = F->getFunctionType();
- TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
- std::string CustomFName = "__dfsw_";
- CustomFName += F->getName();
- FunctionCallee CustomF = DFSF.DFS.Mod->getOrInsertFunction(
- CustomFName, CustomFn.TransformedType);
- if (Function *CustomFn = dyn_cast<Function>(CustomF.getCallee())) {
- CustomFn->copyAttributesFrom(F);
-
- // Custom functions returning non-void will write to the return label.
- if (!FT->getReturnType()->isVoidTy()) {
- CustomFn->removeAttributes(AttributeList::FunctionIndex,
- DFSF.DFS.ReadOnlyNoneAttrs);
- }
- }
-
- std::vector<Value *> Args;
-
- auto i = CB.arg_begin();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
- Type *T = (*i)->getType();
- FunctionType *ParamFT;
- if (isa<PointerType>(T) &&
- (ParamFT = dyn_cast<FunctionType>(
- cast<PointerType>(T)->getElementType()))) {
- std::string TName = "dfst";
- TName += utostr(FT->getNumParams() - n);
- TName += "$";
- TName += F->getName();
- Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
- Args.push_back(T);
- Args.push_back(
- IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
- } else {
- Args.push_back(*i);
- }
- }
-
- i = CB.arg_begin();
- const unsigned ShadowArgStart = Args.size();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ return;
+ case DataFlowSanitizer::WK_Functional:
+ CB.setCalledFunction(F);
+ visitOperandShadowInst(CB);
+ return;
+ case DataFlowSanitizer::WK_Custom:
+ // Don't try to handle invokes of custom functions, it's too complicated.
+ // Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
+ // wrapper.
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
+ FunctionType *FT = F->getFunctionType();
+ TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
+ std::string CustomFName = "__dfsw_";
+ CustomFName += F->getName();
+ FunctionCallee CustomF = DFSF.DFS.Mod->getOrInsertFunction(
+ CustomFName, CustomFn.TransformedType);
+ if (Function *CustomFn = dyn_cast<Function>(CustomF.getCallee())) {
+ CustomFn->copyAttributesFrom(F);
+
+ // Custom functions returning non-void will write to the return label.
+ if (!FT->getReturnType()->isVoidTy()) {
+ CustomFn->removeAttributes(AttributeList::FunctionIndex,
+ DFSF.DFS.ReadOnlyNoneAttrs);
+ }
+ }
+
+ std::vector<Value *> Args;
+
+ auto i = CB.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
+ Type *T = (*i)->getType();
+ FunctionType *ParamFT;
+ if (isa<PointerType>(T) &&
+ (ParamFT = dyn_cast<FunctionType>(
+ cast<PointerType>(T)->getElementType()))) {
+ std::string TName = "dfst";
+ TName += utostr(FT->getNumParams() - n);
+ TName += "$";
+ TName += F->getName();
+ Constant *T = DFSF.DFS.getOrBuildTrampolineFunction(ParamFT, TName);
+ Args.push_back(T);
+ Args.push_back(
+ IRB.CreateBitCast(*i, Type::getInt8PtrTy(*DFSF.DFS.Ctx)));
+ } else {
+ Args.push_back(*i);
+ }
+ }
+
+ i = CB.arg_begin();
+ const unsigned ShadowArgStart = Args.size();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(
DFSF.collapseToPrimitiveShadow(DFSF.getShadow(*i), &CB));
-
- if (FT->isVarArg()) {
+
+ if (FT->isVarArg()) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.PrimitiveShadowTy,
- CB.arg_size() - FT->getNumParams());
- auto *LabelVAAlloca = new AllocaInst(
- LabelVATy, getDataLayout().getAllocaAddrSpace(),
- "labelva", &DFSF.F->getEntryBlock().front());
-
- for (unsigned n = 0; i != CB.arg_end(); ++i, ++n) {
- auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
+ CB.arg_size() - FT->getNumParams());
+ auto *LabelVAAlloca = new AllocaInst(
+ LabelVATy, getDataLayout().getAllocaAddrSpace(),
+ "labelva", &DFSF.F->getEntryBlock().front());
+
+ for (unsigned n = 0; i != CB.arg_end(); ++i, ++n) {
+ auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
IRB.CreateStore(
DFSF.collapseToPrimitiveShadow(DFSF.getShadow(*i), &CB),
LabelVAPtr);
- }
-
- Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
- }
-
- if (!FT->getReturnType()->isVoidTy()) {
- if (!DFSF.LabelReturnAlloca) {
- DFSF.LabelReturnAlloca =
+ }
+
+ Args.push_back(IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, 0));
+ }
+
+ if (!FT->getReturnType()->isVoidTy()) {
+ if (!DFSF.LabelReturnAlloca) {
+ DFSF.LabelReturnAlloca =
new AllocaInst(DFSF.DFS.PrimitiveShadowTy,
getDataLayout().getAllocaAddrSpace(),
"labelreturn", &DFSF.F->getEntryBlock().front());
- }
- Args.push_back(DFSF.LabelReturnAlloca);
- }
-
- for (i = CB.arg_begin() + FT->getNumParams(); i != CB.arg_end(); ++i)
- Args.push_back(*i);
-
- CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
- CustomCI->setCallingConv(CI->getCallingConv());
- CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
- CI->getContext(), CI->getAttributes()));
-
- // Update the parameter attributes of the custom call instruction to
- // zero extend the shadow parameters. This is required for targets
+ }
+ Args.push_back(DFSF.LabelReturnAlloca);
+ }
+
+ for (i = CB.arg_begin() + FT->getNumParams(); i != CB.arg_end(); ++i)
+ Args.push_back(*i);
+
+ CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
+ CustomCI->setCallingConv(CI->getCallingConv());
+ CustomCI->setAttributes(TransformFunctionAttributes(CustomFn,
+ CI->getContext(), CI->getAttributes()));
+
+ // Update the parameter attributes of the custom call instruction to
+ // zero extend the shadow parameters. This is required for targets
// which consider PrimitiveShadowTy an illegal type.
- for (unsigned n = 0; n < FT->getNumParams(); n++) {
- const unsigned ArgNo = ShadowArgStart + n;
+ for (unsigned n = 0; n < FT->getNumParams(); n++) {
+ const unsigned ArgNo = ShadowArgStart + n;
if (CustomCI->getArgOperand(ArgNo)->getType() ==
DFSF.DFS.PrimitiveShadowTy)
- CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
- }
-
- if (!FT->getReturnType()->isVoidTy()) {
+ CustomCI->addParamAttr(ArgNo, Attribute::ZExt);
+ }
+
+ if (!FT->getReturnType()->isVoidTy()) {
LoadInst *LabelLoad = IRB.CreateLoad(DFSF.DFS.PrimitiveShadowTy,
DFSF.LabelReturnAlloca);
DFSF.setShadow(CustomCI, DFSF.expandFromPrimitiveShadow(
FT->getReturnType(), LabelLoad, &CB));
- }
-
- CI->replaceAllUsesWith(CustomCI);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- }
-
- FunctionType *FT = CB.getFunctionType();
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ }
+
+ CI->replaceAllUsesWith(CustomCI);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ }
+
+ FunctionType *FT = CB.getFunctionType();
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
unsigned ArgOffset = 0;
const DataLayout &DL = getDataLayout();
for (unsigned I = 0, N = FT->getNumParams(); I != N; ++I) {
@@ -2100,26 +2100,26 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
DFSF.getArgTLS(FT->getParamType(I), ArgOffset, IRB),
kShadowTLSAlignment);
ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- }
-
- Instruction *Next = nullptr;
- if (!CB.getType()->isVoidTy()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- if (II->getNormalDest()->getSinglePredecessor()) {
- Next = &II->getNormalDest()->front();
- } else {
- BasicBlock *NewBB =
- SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
- Next = &NewBB->front();
- }
- } else {
- assert(CB.getIterator() != CB.getParent()->end());
- Next = CB.getNextNode();
- }
-
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
- IRBuilder<> NextIRB(Next);
+ }
+ }
+
+ Instruction *Next = nullptr;
+ if (!CB.getType()->isVoidTy()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ if (II->getNormalDest()->getSinglePredecessor()) {
+ Next = &II->getNormalDest()->front();
+ } else {
+ BasicBlock *NewBB =
+ SplitEdge(II->getParent(), II->getNormalDest(), &DFSF.DT);
+ Next = &NewBB->front();
+ }
+ } else {
+ assert(CB.getIterator() != CB.getParent()->end());
+ Next = CB.getNextNode();
+ }
+
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
+ IRBuilder<> NextIRB(Next);
const DataLayout &DL = getDataLayout();
unsigned Size = DL.getTypeAllocSize(DFSF.DFS.getShadowTy(&CB));
if (Size > kRetvalTLSSize) {
@@ -2133,83 +2133,83 @@ void DFSanVisitor::visitCallBase(CallBase &CB) {
DFSF.setShadow(&CB, LI);
DFSF.NonZeroChecks.push_back(LI);
}
- }
- }
-
- // Do all instrumentation for IA_Args down here to defer tampering with the
- // CFG in a way that SplitEdge may be able to detect.
- if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
- FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
- Value *Func =
- IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
- std::vector<Value *> Args;
-
- auto i = CB.arg_begin(), E = CB.arg_end();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
- Args.push_back(*i);
-
- i = CB.arg_begin();
- for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
- Args.push_back(DFSF.getShadow(*i));
-
- if (FT->isVarArg()) {
- unsigned VarArgSize = CB.arg_size() - FT->getNumParams();
+ }
+ }
+
+ // Do all instrumentation for IA_Args down here to defer tampering with the
+ // CFG in a way that SplitEdge may be able to detect.
+ if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
+ FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
+ Value *Func =
+ IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
+ std::vector<Value *> Args;
+
+ auto i = CB.arg_begin(), E = CB.arg_end();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(*i);
+
+ i = CB.arg_begin();
+ for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
+ Args.push_back(DFSF.getShadow(*i));
+
+ if (FT->isVarArg()) {
+ unsigned VarArgSize = CB.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy =
ArrayType::get(DFSF.DFS.PrimitiveShadowTy, VarArgSize);
- AllocaInst *VarArgShadow =
- new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
- "", &DFSF.F->getEntryBlock().front());
- Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
- for (unsigned n = 0; i != E; ++i, ++n) {
- IRB.CreateStore(
- DFSF.getShadow(*i),
- IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
- Args.push_back(*i);
- }
- }
-
- CallBase *NewCB;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
- II->getUnwindDest(), Args);
- } else {
- NewCB = IRB.CreateCall(NewFT, Func, Args);
- }
- NewCB->setCallingConv(CB.getCallingConv());
- NewCB->setAttributes(CB.getAttributes().removeAttributes(
- *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCB->getType())));
-
- if (Next) {
- ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
- DFSF.SkipInsts.insert(ExVal);
- ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
- DFSF.SkipInsts.insert(ExShadow);
- DFSF.setShadow(ExVal, ExShadow);
- DFSF.NonZeroChecks.push_back(ExShadow);
-
- CB.replaceAllUsesWith(ExVal);
- }
-
- CB.eraseFromParent();
- }
-}
-
-void DFSanVisitor::visitPHINode(PHINode &PN) {
+ AllocaInst *VarArgShadow =
+ new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+ "", &DFSF.F->getEntryBlock().front());
+ Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
+ for (unsigned n = 0; i != E; ++i, ++n) {
+ IRB.CreateStore(
+ DFSF.getShadow(*i),
+ IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
+ Args.push_back(*i);
+ }
+ }
+
+ CallBase *NewCB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
+ II->getUnwindDest(), Args);
+ } else {
+ NewCB = IRB.CreateCall(NewFT, Func, Args);
+ }
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(CB.getAttributes().removeAttributes(
+ *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
+ AttributeFuncs::typeIncompatible(NewCB->getType())));
+
+ if (Next) {
+ ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
+ DFSF.SkipInsts.insert(ExVal);
+ ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
+ DFSF.SkipInsts.insert(ExShadow);
+ DFSF.setShadow(ExVal, ExShadow);
+ DFSF.NonZeroChecks.push_back(ExShadow);
+
+ CB.replaceAllUsesWith(ExVal);
+ }
+
+ CB.eraseFromParent();
+ }
+}
+
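The IA_TLS path inside visitCallBase above advances ArgOffset by alignTo(Size, kShadowTLSAlignment) for every argument shadow it spills to the argument TLS buffer. Below is a standalone sketch of that offset arithmetic; the alignment and shadow sizes are assumptions for illustration, not the pass's actual constants.

    #include <cassert>
    #include <cstdint>

    static uint64_t alignTo(uint64_t Value, uint64_t Align) {
      return (Value + Align - 1) / Align * Align;  // round up to a multiple of Align
    }

    int main() {
      const uint64_t ShadowTLSAlign = 2;         // assumed kShadowTLSAlignment
      const uint64_t ShadowSizes[] = {2, 3, 6};  // hypothetical per-argument shadow sizes
      uint64_t ArgOffset = 0;
      for (uint64_t Size : ShadowSizes)
        ArgOffset += alignTo(Size, ShadowTLSAlign);
      assert(ArgOffset == 2 + 4 + 6);            // the 3-byte shadow is padded to 4
      return 0;
    }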
+void DFSanVisitor::visitPHINode(PHINode &PN) {
Type *ShadowTy = DFSF.DFS.getShadowTy(&PN);
- PHINode *ShadowPN =
+ PHINode *ShadowPN =
PHINode::Create(ShadowTy, PN.getNumIncomingValues(), "", &PN);
-
- // Give the shadow phi node valid predecessors to fool SplitEdge into working.
+
+ // Give the shadow phi node valid predecessors to fool SplitEdge into working.
Value *UndefShadow = UndefValue::get(ShadowTy);
- for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
- ++i) {
- ShadowPN->addIncoming(UndefShadow, *i);
- }
-
- DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
- DFSF.setShadow(&PN, ShadowPN);
-}
+ for (PHINode::block_iterator i = PN.block_begin(), e = PN.block_end(); i != e;
+ ++i) {
+ ShadowPN->addIncoming(UndefShadow, *i);
+ }
+
+ DFSF.PHIFixups.push_back(std::make_pair(&PN, ShadowPN));
+ DFSF.setShadow(&PN, ShadowPN);
+}
namespace {
class DataFlowSanitizerLegacyPass : public ModulePass {
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 8d53a5d27f..527644a69d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -1,185 +1,185 @@
-//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements GCOV-style profiling. When this pass is run it emits
-// "gcno" files next to the existing source, and instruments the code that runs
-// to record the edges between blocks that run and emit a complementary "gcda"
-// file on exit.
-//
-//===----------------------------------------------------------------------===//
-
+//===- GCOVProfiling.cpp - Insert edge counters for gcov profiling --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements GCOV-style profiling. When this pass is run it emits
+// "gcno" files next to the existing source, and instruments the code that runs
+// to record the edges between blocks that run and emit a complementary "gcda"
+// file on exit.
+//
+//===----------------------------------------------------------------------===//
+
#include "CFGMST.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CRC.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <memory>
-#include <string>
-#include <utility>
-
-using namespace llvm;
-namespace endian = llvm::support::endian;
-
-#define DEBUG_TYPE "insert-gcov-profiling"
-
-enum : uint32_t {
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/GCOVProfiler.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <memory>
+#include <string>
+#include <utility>
+
+using namespace llvm;
+namespace endian = llvm::support::endian;
+
+#define DEBUG_TYPE "insert-gcov-profiling"
+
+enum : uint32_t {
GCOV_ARC_ON_TREE = 1 << 0,
- GCOV_TAG_FUNCTION = 0x01000000,
- GCOV_TAG_BLOCKS = 0x01410000,
- GCOV_TAG_ARCS = 0x01430000,
- GCOV_TAG_LINES = 0x01450000,
-};
-
-static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
- cl::init("408*"), cl::Hidden,
- cl::ValueRequired);
-
+ GCOV_TAG_FUNCTION = 0x01000000,
+ GCOV_TAG_BLOCKS = 0x01410000,
+ GCOV_TAG_ARCS = 0x01430000,
+ GCOV_TAG_LINES = 0x01450000,
+};
+
+static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
+ cl::init("408*"), cl::Hidden,
+ cl::ValueRequired);
+
static cl::opt<bool> AtomicCounter("gcov-atomic-counter", cl::Hidden,
cl::desc("Make counter updates atomic"));
-// Returns the number of words which will be used to represent this string.
-static unsigned wordsOfString(StringRef s) {
- // Length + NUL-terminated string + 0~3 padding NULs.
- return (s.size() / 4) + 2;
-}
-
-GCOVOptions GCOVOptions::getDefault() {
- GCOVOptions Options;
- Options.EmitNotes = true;
- Options.EmitData = true;
- Options.NoRedZone = false;
+// Returns the number of words which will be used to represent this string.
+static unsigned wordsOfString(StringRef s) {
+ // Length + NUL-terminated string + 0~3 padding NULs.
+ return (s.size() / 4) + 2;
+}
+
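A worked example for wordsOfString above: the encoding is a 4-byte length word followed by the NUL-terminated string padded with NULs to a 4-byte boundary, so "abc" occupies 2 words (length + "abc\0") and "abcd" occupies 3 (length + "abcd" + four padding NULs, matching write_zeros(4 - s.size() % 4) in writeString).

    #include <cassert>
    #include <string>

    static unsigned wordsOfString(const std::string &s) {
      return (s.size() / 4) + 2;  // same formula as above
    }

    int main() {
      assert(wordsOfString("abc") == 2);
      assert(wordsOfString("abcd") == 3);
      assert(wordsOfString("") == 2);  // even an empty string needs length + one NUL word
      return 0;
    }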
+GCOVOptions GCOVOptions::getDefault() {
+ GCOVOptions Options;
+ Options.EmitNotes = true;
+ Options.EmitData = true;
+ Options.NoRedZone = false;
Options.Atomic = AtomicCounter;
-
- if (DefaultGCOVVersion.size() != 4) {
- llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
- DefaultGCOVVersion);
- }
- memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
- return Options;
-}
-
-namespace {
-class GCOVFunction;
-
-class GCOVProfiler {
-public:
- GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
- bool
+
+ if (DefaultGCOVVersion.size() != 4) {
+ llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
+ DefaultGCOVVersion);
+ }
+ memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4);
+ return Options;
+}
+
+namespace {
+class GCOVFunction;
+
+class GCOVProfiler {
+public:
+ GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
+ GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
+ bool
runOnModule(Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
- std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
-
- void write(uint32_t i) {
- char Bytes[4];
- endian::write32(Bytes, i, Endian);
- os->write(Bytes, 4);
- }
- void writeString(StringRef s) {
- write(wordsOfString(s) - 1);
- os->write(s.data(), s.size());
- os->write_zeros(4 - s.size() % 4);
- }
- void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
-
-private:
- // Create the .gcno files for the Module based on DebugInfo.
+ std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
+
+ void write(uint32_t i) {
+ char Bytes[4];
+ endian::write32(Bytes, i, Endian);
+ os->write(Bytes, 4);
+ }
+ void writeString(StringRef s) {
+ write(wordsOfString(s) - 1);
+ os->write(s.data(), s.size());
+ os->write_zeros(4 - s.size() % 4);
+ }
+ void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+
+private:
+ // Create the .gcno files for the Module based on DebugInfo.
bool
emitProfileNotes(NamedMDNode *CUNode, bool HasExecOrFork,
function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
function_ref<const TargetLibraryInfo &(Function &F)> GetTLI);
-
+
void emitGlobalConstructor(
SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP);
-
- bool isFunctionInstrumented(const Function &F);
- std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
- static bool doesFilenameMatchARegex(StringRef Filename,
- std::vector<Regex> &Regexes);
-
- // Get pointers to the functions in the runtime library.
- FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI);
- FunctionCallee getSummaryInfoFunc();
- FunctionCallee getEndFileFunc();
-
- // Add the function to write out all our counters to the global destructor
- // list.
- Function *
- insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
- Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
-
- bool AddFlushBeforeForkAndExec();
-
- enum class GCovFileType { GCNO, GCDA };
- std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
-
- GCOVOptions Options;
- support::endianness Endian;
- raw_ostream *os;
-
- // Checksum, produced by hash of EdgeDestinations
- SmallVector<uint32_t, 4> FileChecksums;
-
- Module *M = nullptr;
- std::function<const TargetLibraryInfo &(Function &F)> GetTLI;
- LLVMContext *Ctx = nullptr;
- SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
- std::vector<Regex> FilterRe;
- std::vector<Regex> ExcludeRe;
+
+ bool isFunctionInstrumented(const Function &F);
+ std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
+ static bool doesFilenameMatchARegex(StringRef Filename,
+ std::vector<Regex> &Regexes);
+
+ // Get pointers to the functions in the runtime library.
+ FunctionCallee getStartFileFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getEmitFunctionFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getEmitArcsFunc(const TargetLibraryInfo *TLI);
+ FunctionCallee getSummaryInfoFunc();
+ FunctionCallee getEndFileFunc();
+
+ // Add the function to write out all our counters to the global destructor
+ // list.
+ Function *
+ insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+
+ bool AddFlushBeforeForkAndExec();
+
+ enum class GCovFileType { GCNO, GCDA };
+ std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
+
+ GCOVOptions Options;
+ support::endianness Endian;
+ raw_ostream *os;
+
+ // Checksum, produced by hash of EdgeDestinations
+ SmallVector<uint32_t, 4> FileChecksums;
+
+ Module *M = nullptr;
+ std::function<const TargetLibraryInfo &(Function &F)> GetTLI;
+ LLVMContext *Ctx = nullptr;
+ SmallVector<std::unique_ptr<GCOVFunction>, 16> Funcs;
+ std::vector<Regex> FilterRe;
+ std::vector<Regex> ExcludeRe;
DenseSet<const BasicBlock *> ExecBlocks;
- StringMap<bool> InstrumentedFiles;
-};
-
-class GCOVProfilerLegacyPass : public ModulePass {
-public:
- static char ID;
- GCOVProfilerLegacyPass()
- : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
- GCOVProfilerLegacyPass(const GCOVOptions &Opts)
- : ModulePass(ID), Profiler(Opts) {
- initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override { return "GCOV Profiler"; }
-
- bool runOnModule(Module &M) override {
+ StringMap<bool> InstrumentedFiles;
+};
+
+class GCOVProfilerLegacyPass : public ModulePass {
+public:
+ static char ID;
+ GCOVProfilerLegacyPass()
+ : GCOVProfilerLegacyPass(GCOVOptions::getDefault()) {}
+ GCOVProfilerLegacyPass(const GCOVOptions &Opts)
+ : ModulePass(ID), Profiler(Opts) {
+ initializeGCOVProfilerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "GCOV Profiler"; }
+
+ bool runOnModule(Module &M) override {
auto GetBFI = [this](Function &F) {
return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
};
@@ -190,16 +190,16 @@ public:
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
};
return Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
-private:
- GCOVProfiler Profiler;
-};
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+private:
+ GCOVProfiler Profiler;
+};
struct BBInfo {
BBInfo *Group;
@@ -234,225 +234,225 @@ struct Edge {
.str();
}
};
-}
-
-char GCOVProfilerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(
- GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
+}
+
+char GCOVProfilerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- GCOVProfilerLegacyPass, "insert-gcov-profiling",
- "Insert instrumentation for GCOV profiling", false, false)
-
-ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
- return new GCOVProfilerLegacyPass(Options);
-}
-
-static StringRef getFunctionName(const DISubprogram *SP) {
- if (!SP->getLinkageName().empty())
- return SP->getLinkageName();
- return SP->getName();
-}
-
-/// Extract a filename for a DISubprogram.
-///
-/// Prefer relative paths in the coverage notes. Clang also may split
-/// up absolute paths into a directory and filename component. When
-/// the relative path doesn't exist, reconstruct the absolute path.
-static SmallString<128> getFilename(const DISubprogram *SP) {
- SmallString<128> Path;
- StringRef RelPath = SP->getFilename();
- if (sys::fs::exists(RelPath))
- Path = RelPath;
- else
- sys::path::append(Path, SP->getDirectory(), SP->getFilename());
- return Path;
-}
-
-namespace {
- class GCOVRecord {
- protected:
- GCOVProfiler *P;
-
- GCOVRecord(GCOVProfiler *P) : P(P) {}
-
- void write(uint32_t i) { P->write(i); }
- void writeString(StringRef s) { P->writeString(s); }
- void writeBytes(const char *Bytes, int Size) { P->writeBytes(Bytes, Size); }
- };
-
- class GCOVFunction;
- class GCOVBlock;
-
- // Constructed only by requesting it from a GCOVBlock, this object stores a
- // list of line numbers and a single filename, representing lines that belong
- // to the block.
- class GCOVLines : public GCOVRecord {
- public:
- void addLine(uint32_t Line) {
- assert(Line != 0 && "Line zero is not a valid real line number.");
- Lines.push_back(Line);
- }
-
- uint32_t length() const {
- return 1 + wordsOfString(Filename) + Lines.size();
- }
-
- void writeOut() {
- write(0);
- writeString(Filename);
- for (int i = 0, e = Lines.size(); i != e; ++i)
- write(Lines[i]);
- }
-
- GCOVLines(GCOVProfiler *P, StringRef F)
- : GCOVRecord(P), Filename(std::string(F)) {}
-
- private:
- std::string Filename;
- SmallVector<uint32_t, 32> Lines;
- };
-
-
- // Represent a basic block in GCOV. Each block has a unique number in the
- // function, number of lines belonging to each block, and a set of edges to
- // other blocks.
- class GCOVBlock : public GCOVRecord {
- public:
- GCOVLines &getFile(StringRef Filename) {
- return LinesByFile.try_emplace(Filename, P, Filename).first->second;
- }
-
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ GCOVProfilerLegacyPass, "insert-gcov-profiling",
+ "Insert instrumentation for GCOV profiling", false, false)
+
+ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) {
+ return new GCOVProfilerLegacyPass(Options);
+}
+
+static StringRef getFunctionName(const DISubprogram *SP) {
+ if (!SP->getLinkageName().empty())
+ return SP->getLinkageName();
+ return SP->getName();
+}
+
+/// Extract a filename for a DISubprogram.
+///
+/// Prefer relative paths in the coverage notes. Clang also may split
+/// up absolute paths into a directory and filename component. When
+/// the relative path doesn't exist, reconstruct the absolute path.
+static SmallString<128> getFilename(const DISubprogram *SP) {
+ SmallString<128> Path;
+ StringRef RelPath = SP->getFilename();
+ if (sys::fs::exists(RelPath))
+ Path = RelPath;
+ else
+ sys::path::append(Path, SP->getDirectory(), SP->getFilename());
+ return Path;
+}
+
+namespace {
+ class GCOVRecord {
+ protected:
+ GCOVProfiler *P;
+
+ GCOVRecord(GCOVProfiler *P) : P(P) {}
+
+ void write(uint32_t i) { P->write(i); }
+ void writeString(StringRef s) { P->writeString(s); }
+ void writeBytes(const char *Bytes, int Size) { P->writeBytes(Bytes, Size); }
+ };
+
+ class GCOVFunction;
+ class GCOVBlock;
+
+ // Constructed only by requesting it from a GCOVBlock, this object stores a
+ // list of line numbers and a single filename, representing lines that belong
+ // to the block.
+ class GCOVLines : public GCOVRecord {
+ public:
+ void addLine(uint32_t Line) {
+ assert(Line != 0 && "Line zero is not a valid real line number.");
+ Lines.push_back(Line);
+ }
+
+ uint32_t length() const {
+ return 1 + wordsOfString(Filename) + Lines.size();
+ }
+
+ void writeOut() {
+ write(0);
+ writeString(Filename);
+ for (int i = 0, e = Lines.size(); i != e; ++i)
+ write(Lines[i]);
+ }
+
+ GCOVLines(GCOVProfiler *P, StringRef F)
+ : GCOVRecord(P), Filename(std::string(F)) {}
+
+ private:
+ std::string Filename;
+ SmallVector<uint32_t, 32> Lines;
+ };
+
+
+ // Represent a basic block in GCOV. Each block has a unique number in the
+ // function, number of lines belonging to each block, and a set of edges to
+ // other blocks.
+ class GCOVBlock : public GCOVRecord {
+ public:
+ GCOVLines &getFile(StringRef Filename) {
+ return LinesByFile.try_emplace(Filename, P, Filename).first->second;
+ }
+
void addEdge(GCOVBlock &Successor, uint32_t Flags) {
OutEdges.emplace_back(&Successor, Flags);
- }
-
- void writeOut() {
- uint32_t Len = 3;
- SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
- for (auto &I : LinesByFile) {
- Len += I.second.length();
- SortedLinesByFile.push_back(&I);
- }
-
- write(GCOV_TAG_LINES);
- write(Len);
- write(Number);
-
- llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS,
- StringMapEntry<GCOVLines> *RHS) {
- return LHS->getKey() < RHS->getKey();
- });
- for (auto &I : SortedLinesByFile)
- I->getValue().writeOut();
- write(0);
- write(0);
- }
-
- GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
- // Only allow copy before edges and lines have been added. After that,
- // there are inter-block pointers (eg: edges) that won't take kindly to
- // blocks being copied or moved around.
- assert(LinesByFile.empty());
- assert(OutEdges.empty());
- }
-
+ }
+
+ void writeOut() {
+ uint32_t Len = 3;
+ SmallVector<StringMapEntry<GCOVLines> *, 32> SortedLinesByFile;
+ for (auto &I : LinesByFile) {
+ Len += I.second.length();
+ SortedLinesByFile.push_back(&I);
+ }
+
+ write(GCOV_TAG_LINES);
+ write(Len);
+ write(Number);
+
+ llvm::sort(SortedLinesByFile, [](StringMapEntry<GCOVLines> *LHS,
+ StringMapEntry<GCOVLines> *RHS) {
+ return LHS->getKey() < RHS->getKey();
+ });
+ for (auto &I : SortedLinesByFile)
+ I->getValue().writeOut();
+ write(0);
+ write(0);
+ }
+
+ GCOVBlock(const GCOVBlock &RHS) : GCOVRecord(RHS), Number(RHS.Number) {
+ // Only allow copy before edges and lines have been added. After that,
+ // there are inter-block pointers (eg: edges) that won't take kindly to
+ // blocks being copied or moved around.
+ assert(LinesByFile.empty());
+ assert(OutEdges.empty());
+ }
+
uint32_t Number;
SmallVector<std::pair<GCOVBlock *, uint32_t>, 4> OutEdges;
private:
- friend class GCOVFunction;
-
- GCOVBlock(GCOVProfiler *P, uint32_t Number)
- : GCOVRecord(P), Number(Number) {}
-
- StringMap<GCOVLines> LinesByFile;
- };
-
- // A function has a unique identifier, a checksum (we leave as zero) and a
- // set of blocks and a map of edges between blocks. This is the only GCOV
- // object users can construct, the blocks and lines will be rooted here.
- class GCOVFunction : public GCOVRecord {
- public:
- GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP,
- unsigned EndLine, uint32_t Ident, int Version)
- : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
+ friend class GCOVFunction;
+
+ GCOVBlock(GCOVProfiler *P, uint32_t Number)
+ : GCOVRecord(P), Number(Number) {}
+
+ StringMap<GCOVLines> LinesByFile;
+ };
+
+ // A function has a unique identifier, a checksum (we leave as zero) and a
+ // set of blocks and a map of edges between blocks. This is the only GCOV
+ // object users can construct, the blocks and lines will be rooted here.
+ class GCOVFunction : public GCOVRecord {
+ public:
+ GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP,
+ unsigned EndLine, uint32_t Ident, int Version)
+ : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
Version(Version), EntryBlock(P, 0), ReturnBlock(P, 1) {
- LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
- bool ExitBlockBeforeBody = Version >= 48;
+ LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
+ bool ExitBlockBeforeBody = Version >= 48;
uint32_t i = ExitBlockBeforeBody ? 2 : 1;
for (BasicBlock &BB : *F)
- Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
- if (!ExitBlockBeforeBody)
- ReturnBlock.Number = i;
-
- std::string FunctionNameAndLine;
- raw_string_ostream FNLOS(FunctionNameAndLine);
- FNLOS << getFunctionName(SP) << SP->getLine();
- FNLOS.flush();
- FuncChecksum = hash_value(FunctionNameAndLine);
- }
-
+ Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
+ if (!ExitBlockBeforeBody)
+ ReturnBlock.Number = i;
+
+ std::string FunctionNameAndLine;
+ raw_string_ostream FNLOS(FunctionNameAndLine);
+ FNLOS << getFunctionName(SP) << SP->getLine();
+ FNLOS.flush();
+ FuncChecksum = hash_value(FunctionNameAndLine);
+ }
+
GCOVBlock &getBlock(const BasicBlock *BB) {
return Blocks.find(const_cast<BasicBlock *>(BB))->second;
- }
-
+ }
+
GCOVBlock &getEntryBlock() { return EntryBlock; }
- GCOVBlock &getReturnBlock() {
- return ReturnBlock;
- }
-
- uint32_t getFuncChecksum() const {
- return FuncChecksum;
- }
-
- void writeOut(uint32_t CfgChecksum) {
- write(GCOV_TAG_FUNCTION);
- SmallString<128> Filename = getFilename(SP);
- uint32_t BlockLen =
- 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
- if (Version < 80)
- BlockLen += wordsOfString(Filename) + 1;
- else
- BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
-
- write(BlockLen);
- write(Ident);
- write(FuncChecksum);
- if (Version >= 47)
- write(CfgChecksum);
- writeString(getFunctionName(SP));
- if (Version < 80) {
- writeString(Filename);
- write(SP->getLine());
- } else {
- write(SP->isArtificial()); // artificial
- writeString(Filename);
- write(SP->getLine()); // start_line
- write(0); // start_column
- // EndLine is the last line with !dbg. It is not the } line as in GCC,
- // but good enough.
- write(EndLine);
- if (Version >= 90)
- write(0); // end_column
- }
-
- // Emit count of blocks.
- write(GCOV_TAG_BLOCKS);
- if (Version < 80) {
+ GCOVBlock &getReturnBlock() {
+ return ReturnBlock;
+ }
+
+ uint32_t getFuncChecksum() const {
+ return FuncChecksum;
+ }
+
+ void writeOut(uint32_t CfgChecksum) {
+ write(GCOV_TAG_FUNCTION);
+ SmallString<128> Filename = getFilename(SP);
+ uint32_t BlockLen =
+ 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
+ if (Version < 80)
+ BlockLen += wordsOfString(Filename) + 1;
+ else
+ BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
+
+ write(BlockLen);
+ write(Ident);
+ write(FuncChecksum);
+ if (Version >= 47)
+ write(CfgChecksum);
+ writeString(getFunctionName(SP));
+ if (Version < 80) {
+ writeString(Filename);
+ write(SP->getLine());
+ } else {
+ write(SP->isArtificial()); // artificial
+ writeString(Filename);
+ write(SP->getLine()); // start_line
+ write(0); // start_column
+ // EndLine is the last line with !dbg. It is not the } line as in GCC,
+ // but good enough.
+ write(EndLine);
+ if (Version >= 90)
+ write(0); // end_column
+ }
+
+ // Emit count of blocks.
+ write(GCOV_TAG_BLOCKS);
+ if (Version < 80) {
write(Blocks.size() + 2);
for (int i = Blocks.size() + 2; i; --i)
- write(0);
- } else {
- write(1);
+ write(0);
+ } else {
+ write(1);
write(Blocks.size() + 2);
- }
- LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
-
- // Emit edges between blocks.
+ }
+ LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
+
+ // Emit edges between blocks.
const uint32_t Outgoing = EntryBlock.OutEdges.size();
if (Outgoing) {
write(GCOV_TAG_ARCS);
@@ -465,169 +465,169 @@ namespace {
}
for (auto &It : Blocks) {
const GCOVBlock &Block = It.second;
- if (Block.OutEdges.empty()) continue;
-
- write(GCOV_TAG_ARCS);
- write(Block.OutEdges.size() * 2 + 1);
- write(Block.Number);
+ if (Block.OutEdges.empty()) continue;
+
+ write(GCOV_TAG_ARCS);
+ write(Block.OutEdges.size() * 2 + 1);
+ write(Block.Number);
for (const auto &E : Block.OutEdges) {
write(E.first->Number);
write(E.second);
- }
- }
-
- // Emit lines for each block.
+ }
+ }
+
+ // Emit lines for each block.
for (auto &It : Blocks)
It.second.writeOut();
- }
-
+ }
+
public:
- const DISubprogram *SP;
- unsigned EndLine;
- uint32_t Ident;
- uint32_t FuncChecksum;
- int Version;
+ const DISubprogram *SP;
+ unsigned EndLine;
+ uint32_t Ident;
+ uint32_t FuncChecksum;
+ int Version;
MapVector<BasicBlock *, GCOVBlock> Blocks;
GCOVBlock EntryBlock;
- GCOVBlock ReturnBlock;
- };
-}
-
-// RegexesStr is a string containing different regexes separated by semicolons.
-// For example "foo\..*$;bar\..*$".
-std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
- std::vector<Regex> Regexes;
- while (!RegexesStr.empty()) {
- std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';');
- if (!HeadTail.first.empty()) {
- Regex Re(HeadTail.first);
- std::string Err;
- if (!Re.isValid(Err)) {
- Ctx->emitError(Twine("Regex ") + HeadTail.first +
- " is not valid: " + Err);
- }
- Regexes.emplace_back(std::move(Re));
- }
- RegexesStr = HeadTail.second;
- }
- return Regexes;
-}
-
-bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
- std::vector<Regex> &Regexes) {
- for (Regex &Re : Regexes)
- if (Re.match(Filename))
- return true;
- return false;
-}
-
-bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
- if (FilterRe.empty() && ExcludeRe.empty()) {
- return true;
- }
- SmallString<128> Filename = getFilename(F.getSubprogram());
- auto It = InstrumentedFiles.find(Filename);
- if (It != InstrumentedFiles.end()) {
- return It->second;
- }
-
- SmallString<256> RealPath;
- StringRef RealFilename;
-
- // Path can be
- // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
- // such a case we must get the real_path.
- if (sys::fs::real_path(Filename, RealPath)) {
- // real_path can fail with a path like "foo.c".
- RealFilename = Filename;
- } else {
- RealFilename = RealPath;
- }
-
- bool ShouldInstrument;
- if (FilterRe.empty()) {
- ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe);
- } else if (ExcludeRe.empty()) {
- ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe);
- } else {
- ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) &&
- !doesFilenameMatchARegex(RealFilename, ExcludeRe);
- }
- InstrumentedFiles[Filename] = ShouldInstrument;
- return ShouldInstrument;
-}
-
-std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
- GCovFileType OutputType) {
- bool Notes = OutputType == GCovFileType::GCNO;
-
- if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
- for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
- MDNode *N = GCov->getOperand(i);
- bool ThreeElement = N->getNumOperands() == 3;
- if (!ThreeElement && N->getNumOperands() != 2)
- continue;
- if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
- continue;
-
- if (ThreeElement) {
- // These nodes have no mangling to apply; the names are stored already
- // mangled in the bitcode.
- MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
- MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
- if (!NotesFile || !DataFile)
- continue;
- return std::string(Notes ? NotesFile->getString()
- : DataFile->getString());
- }
-
- MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
- if (!GCovFile)
- continue;
-
- SmallString<128> Filename = GCovFile->getString();
- sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
- return std::string(Filename.str());
- }
- }
-
- SmallString<128> Filename = CU->getFilename();
- sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
- StringRef FName = sys::path::filename(Filename);
- SmallString<128> CurPath;
- if (sys::fs::current_path(CurPath))
- return std::string(FName);
- sys::path::append(CurPath, FName);
- return std::string(CurPath.str());
-}
-
-bool GCOVProfiler::runOnModule(
+ GCOVBlock ReturnBlock;
+ };
+}
+
+// RegexesStr is a string containing different regexes separated by semicolons.
+// For example "foo\..*$;bar\..*$".
+std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
+ std::vector<Regex> Regexes;
+ while (!RegexesStr.empty()) {
+ std::pair<StringRef, StringRef> HeadTail = RegexesStr.split(';');
+ if (!HeadTail.first.empty()) {
+ Regex Re(HeadTail.first);
+ std::string Err;
+ if (!Re.isValid(Err)) {
+ Ctx->emitError(Twine("Regex ") + HeadTail.first +
+ " is not valid: " + Err);
+ }
+ Regexes.emplace_back(std::move(Re));
+ }
+ RegexesStr = HeadTail.second;
+ }
+ return Regexes;
+}
+
+bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
+ std::vector<Regex> &Regexes) {
+ for (Regex &Re : Regexes)
+ if (Re.match(Filename))
+ return true;
+ return false;
+}
+
+bool GCOVProfiler::isFunctionInstrumented(const Function &F) {
+ if (FilterRe.empty() && ExcludeRe.empty()) {
+ return true;
+ }
+ SmallString<128> Filename = getFilename(F.getSubprogram());
+ auto It = InstrumentedFiles.find(Filename);
+ if (It != InstrumentedFiles.end()) {
+ return It->second;
+ }
+
+ SmallString<256> RealPath;
+ StringRef RealFilename;
+
+ // Path can be
+ // /usr/lib/gcc/x86_64-linux-gnu/8/../../../../include/c++/8/bits/*.h so for
+ // such a case we must get the real_path.
+ if (sys::fs::real_path(Filename, RealPath)) {
+ // real_path can fail with a path like "foo.c".
+ RealFilename = Filename;
+ } else {
+ RealFilename = RealPath;
+ }
+
+ bool ShouldInstrument;
+ if (FilterRe.empty()) {
+ ShouldInstrument = !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+ } else if (ExcludeRe.empty()) {
+ ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe);
+ } else {
+ ShouldInstrument = doesFilenameMatchARegex(RealFilename, FilterRe) &&
+ !doesFilenameMatchARegex(RealFilename, ExcludeRe);
+ }
+ InstrumentedFiles[Filename] = ShouldInstrument;
+ return ShouldInstrument;
+}
+
+std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
+ GCovFileType OutputType) {
+ bool Notes = OutputType == GCovFileType::GCNO;
+
+ if (NamedMDNode *GCov = M->getNamedMetadata("llvm.gcov")) {
+ for (int i = 0, e = GCov->getNumOperands(); i != e; ++i) {
+ MDNode *N = GCov->getOperand(i);
+ bool ThreeElement = N->getNumOperands() == 3;
+ if (!ThreeElement && N->getNumOperands() != 2)
+ continue;
+ if (dyn_cast<MDNode>(N->getOperand(ThreeElement ? 2 : 1)) != CU)
+ continue;
+
+ if (ThreeElement) {
+ // These nodes have no mangling to apply; the names are stored already
+ // mangled in the bitcode.
+ MDString *NotesFile = dyn_cast<MDString>(N->getOperand(0));
+ MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
+ if (!NotesFile || !DataFile)
+ continue;
+ return std::string(Notes ? NotesFile->getString()
+ : DataFile->getString());
+ }
+
+ MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
+ if (!GCovFile)
+ continue;
+
+ SmallString<128> Filename = GCovFile->getString();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ return std::string(Filename.str());
+ }
+ }
+
+ SmallString<128> Filename = CU->getFilename();
+ sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
+ StringRef FName = sys::path::filename(Filename);
+ SmallString<128> CurPath;
+ if (sys::fs::current_path(CurPath))
+ return std::string(FName);
+ sys::path::append(CurPath, FName);
+ return std::string(CurPath.str());
+}
+
+bool GCOVProfiler::runOnModule(
Module &M, function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
- this->M = &M;
- this->GetTLI = std::move(GetTLI);
- Ctx = &M.getContext();
-
+ this->M = &M;
+ this->GetTLI = std::move(GetTLI);
+ Ctx = &M.getContext();
+
NamedMDNode *CUNode = M.getNamedMetadata("llvm.dbg.cu");
if (!CUNode || (!Options.EmitNotes && !Options.EmitData))
return false;
-
+
bool HasExecOrFork = AddFlushBeforeForkAndExec();
- FilterRe = createRegexesFromString(Options.Filter);
- ExcludeRe = createRegexesFromString(Options.Exclude);
+ FilterRe = createRegexesFromString(Options.Filter);
+ ExcludeRe = createRegexesFromString(Options.Exclude);
emitProfileNotes(CUNode, HasExecOrFork, GetBFI, GetBPI, this->GetTLI);
return true;
-}
-
-PreservedAnalyses GCOVProfilerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
-
- GCOVProfiler Profiler(GCOVOpts);
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
+}
+
+PreservedAnalyses GCOVProfilerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ GCOVProfiler Profiler(GCOVOpts);
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+
auto GetBFI = [&FAM](Function &F) {
return &FAM.getResult<BlockFrequencyAnalysis>(F);
};
@@ -639,124 +639,124 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M,
};
if (!Profiler.runOnModule(M, GetBFI, GetBPI, GetTLI))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-static bool functionHasLines(const Function &F, unsigned &EndLine) {
- // Check whether this function actually has any source lines. Functions
- // without them not only waste space, they can also crash gcov.
- EndLine = 0;
- for (auto &BB : F) {
- for (auto &I : BB) {
- // Debug intrinsic locations correspond to the location of the
- // declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(&I)) continue;
-
- const DebugLoc &Loc = I.getDebugLoc();
- if (!Loc)
- continue;
-
- // Artificial lines such as calls to the global constructors.
- if (Loc.getLine() == 0) continue;
- EndLine = std::max(EndLine, Loc.getLine());
-
- return true;
- }
- }
- return false;
-}
-
-static bool isUsingScopeBasedEH(Function &F) {
- if (!F.hasPersonalityFn()) return false;
-
- EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
- return isScopedEHPersonality(Personality);
-}
-
-bool GCOVProfiler::AddFlushBeforeForkAndExec() {
- SmallVector<CallInst *, 2> Forks;
- SmallVector<CallInst *, 2> Execs;
- for (auto &F : M->functions()) {
- auto *TLI = &GetTLI(F);
- for (auto &I : instructions(F)) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- if (Function *Callee = CI->getCalledFunction()) {
- LibFunc LF;
- if (TLI->getLibFunc(*Callee, LF)) {
- if (LF == LibFunc_fork) {
-#if !defined(_WIN32)
- Forks.push_back(CI);
-#endif
- } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
- LF == LibFunc_execlp || LF == LibFunc_execv ||
- LF == LibFunc_execvp || LF == LibFunc_execve ||
- LF == LibFunc_execvpe || LF == LibFunc_execvP) {
- Execs.push_back(CI);
- }
- }
- }
- }
- }
- }
-
- for (auto F : Forks) {
- IRBuilder<> Builder(F);
- BasicBlock *Parent = F->getParent();
- auto NextInst = ++F->getIterator();
-
- // We have a fork, so just reset the counters in the child process.
- FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
- FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
- F->setCalledFunction(GCOVFork);
-
- // We split just after the fork so that the lines after it get their own
- // counter. There is still a bug, though:
- //   void foo() { fork(); }
- //   void bar() { foo(); blah(); }
- // "blah();" will be executed twice but reported as executed once, because
- // "blah()" belongs to the same block as "foo();".
- Parent->splitBasicBlock(NextInst);
-
- // back() is a br instruction whose debug location equals the one from
- // NextAfterFork, so to avoid having two debug locations on two blocks,
- // just change it.
- DebugLoc Loc = F->getDebugLoc();
- Parent->back().setDebugLoc(Loc);
- }
-
- for (auto E : Execs) {
- IRBuilder<> Builder(E);
- BasicBlock *Parent = E->getParent();
- auto NextInst = ++E->getIterator();
-
- // Since the process is replaced by a new one, we need to write out the gcda
- // files. No need to reset the counters since they'll be lost after the exec**.
- FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
- FunctionCallee WriteoutF =
- M->getOrInsertFunction("llvm_writeout_files", FTy);
- Builder.CreateCall(WriteoutF);
-
- DebugLoc Loc = E->getDebugLoc();
- Builder.SetInsertPoint(&*NextInst);
- // If the exec** fails, we must reset the counters since they've already
- // been dumped.
- FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy);
- Builder.CreateCall(ResetF)->setDebugLoc(Loc);
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+static bool functionHasLines(const Function &F, unsigned &EndLine) {
+ // Check whether this function actually has any source lines. Functions
+ // without them not only waste space, they can also crash gcov.
+ EndLine = 0;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0) continue;
+ EndLine = std::max(EndLine, Loc.getLine());
+
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isUsingScopeBasedEH(Function &F) {
+ if (!F.hasPersonalityFn()) return false;
+
+ EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
+ return isScopedEHPersonality(Personality);
+}
+
+bool GCOVProfiler::AddFlushBeforeForkAndExec() {
+ SmallVector<CallInst *, 2> Forks;
+ SmallVector<CallInst *, 2> Execs;
+ for (auto &F : M->functions()) {
+ auto *TLI = &GetTLI(F);
+ for (auto &I : instructions(F)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ if (Function *Callee = CI->getCalledFunction()) {
+ LibFunc LF;
+ if (TLI->getLibFunc(*Callee, LF)) {
+ if (LF == LibFunc_fork) {
+#if !defined(_WIN32)
+ Forks.push_back(CI);
+#endif
+ } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
+ LF == LibFunc_execlp || LF == LibFunc_execv ||
+ LF == LibFunc_execvp || LF == LibFunc_execve ||
+ LF == LibFunc_execvpe || LF == LibFunc_execvP) {
+ Execs.push_back(CI);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ for (auto F : Forks) {
+ IRBuilder<> Builder(F);
+ BasicBlock *Parent = F->getParent();
+ auto NextInst = ++F->getIterator();
+
+ // We have a fork, so just reset the counters in the child process.
+ FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
+ FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
+ F->setCalledFunction(GCOVFork);
+
+ // We split just after the fork so that the lines after it get their own
+ // counter. There is still a bug, though:
+ //   void foo() { fork(); }
+ //   void bar() { foo(); blah(); }
+ // "blah();" will be executed twice but reported as executed once, because
+ // "blah()" belongs to the same block as "foo();".
+ Parent->splitBasicBlock(NextInst);
+
+ // back() is a br instruction whose debug location equals the one from
+ // NextAfterFork, so to avoid having two debug locations on two blocks,
+ // just change it.
+ DebugLoc Loc = F->getDebugLoc();
+ Parent->back().setDebugLoc(Loc);
+ }
+
+ for (auto E : Execs) {
+ IRBuilder<> Builder(E);
+ BasicBlock *Parent = E->getParent();
+ auto NextInst = ++E->getIterator();
+
+ // Since the process is replaced by a new one, we need to write out the gcda
+ // files. No need to reset the counters since they'll be lost after the exec**.
+ FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
+ FunctionCallee WriteoutF =
+ M->getOrInsertFunction("llvm_writeout_files", FTy);
+ Builder.CreateCall(WriteoutF);
+
+ DebugLoc Loc = E->getDebugLoc();
+ Builder.SetInsertPoint(&*NextInst);
+ // If the exec** fails, we must reset the counters since they've already
+ // been dumped.
+ FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy);
+ Builder.CreateCall(ResetF)->setDebugLoc(Loc);
ExecBlocks.insert(Parent);
- Parent->splitBasicBlock(NextInst);
- Parent->back().setDebugLoc(Loc);
- }
-
- return !Forks.empty() || !Execs.empty();
-}
-
+ Parent->splitBasicBlock(NextInst);
+ Parent->back().setDebugLoc(Loc);
+ }
+
+ return !Forks.empty() || !Execs.empty();
+}
+
static BasicBlock *getInstrBB(CFGMST<Edge, BBInfo> &MST, Edge &E,
const DenseSet<const BasicBlock *> &ExecBlocks) {
if (E.InMST || E.Removed)
return nullptr;
-
+
BasicBlock *SrcBB = const_cast<BasicBlock *>(E.SrcBB);
BasicBlock *DestBB = const_cast<BasicBlock *>(E.DestBB);
// For a fake edge, instrument the real BB.
@@ -813,42 +813,42 @@ bool GCOVProfiler::emitProfileNotes(
function_ref<BlockFrequencyInfo *(Function &F)> GetBFI,
function_ref<BranchProbabilityInfo *(Function &F)> GetBPI,
function_ref<const TargetLibraryInfo &(Function &F)> GetTLI) {
- int Version;
- {
- uint8_t c3 = Options.Version[0];
- uint8_t c2 = Options.Version[1];
- uint8_t c1 = Options.Version[2];
- Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
- : (c3 - '0') * 10 + c1 - '0';
- }
-
+ int Version;
+ {
+ uint8_t c3 = Options.Version[0];
+ uint8_t c2 = Options.Version[1];
+ uint8_t c1 = Options.Version[2];
+ Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
+ : (c3 - '0') * 10 + c1 - '0';
+ }
+
bool EmitGCDA = Options.EmitData;
for (unsigned i = 0, e = CUNode->getNumOperands(); i != e; ++i) {
- // Each compile unit gets its own .gcno file. This means that whether we run
- // this pass over the original .o's as they're produced, or run it after
- // LTO, we'll generate the same .gcno files.
-
+ // Each compile unit gets its own .gcno file. This means that whether we run
+ // this pass over the original .o's as they're produced, or run it after
+ // LTO, we'll generate the same .gcno files.
+
auto *CU = cast<DICompileUnit>(CUNode->getOperand(i));
-
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
-
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
std::vector<uint8_t> EdgeDestinations;
SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
-
- Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
- : support::endianness::big;
- unsigned FunctionIdent = 0;
- for (auto &F : M->functions()) {
- DISubprogram *SP = F.getSubprogram();
- unsigned EndLine;
- if (!SP) continue;
- if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
- continue;
- // TODO: Functions using scope-based EH are currently not supported.
- if (isUsingScopeBasedEH(F)) continue;
-
+
+ Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
+ : support::endianness::big;
+ unsigned FunctionIdent = 0;
+ for (auto &F : M->functions()) {
+ DISubprogram *SP = F.getSubprogram();
+ unsigned EndLine;
+ if (!SP) continue;
+ if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
+ continue;
+ // TODO: Functions using scope-based EH are currently not supported.
+ if (isUsingScopeBasedEH(F)) continue;
+
// Add the function line number to the lines of the entry block
// to have a counter for the function definition.
uint32_t Line = SP->getLine();
@@ -873,11 +873,11 @@ bool GCOVProfiler::emitProfileNotes(
E.Place = getInstrBB(MST, E, ExecBlocks);
}
// Basic blocks in F are finalized at this point.
- BasicBlock &EntryBlock = F.getEntryBlock();
- Funcs.push_back(std::make_unique<GCOVFunction>(this, &F, SP, EndLine,
- FunctionIdent++, Version));
- GCOVFunction &Func = *Funcs.back();
-
+ BasicBlock &EntryBlock = F.getEntryBlock();
+ Funcs.push_back(std::make_unique<GCOVFunction>(this, &F, SP, EndLine,
+ FunctionIdent++, Version));
+ GCOVFunction &Func = *Funcs.back();
+
// Some non-tree edges are IndirectBr which cannot be split. Ignore them
// as well.
llvm::erase_if(MST.AllEdges, [](std::unique_ptr<Edge> &E) {
@@ -903,7 +903,7 @@ bool GCOVProfiler::emitProfileNotes(
return L->SrcNumber != R->SrcNumber ? L->SrcNumber < R->SrcNumber
: L->DstNumber < R->DstNumber;
});
-
+
for (const Edge &E : make_pointee_range(MST.AllEdges)) {
GCOVBlock &Src =
E.SrcBB ? Func.getBlock(E.SrcBB) : Func.getEntryBlock();
@@ -912,10 +912,10 @@ bool GCOVProfiler::emitProfileNotes(
Src.addEdge(Dst, E.Place ? 0 : uint32_t(GCOV_ARC_ON_TREE));
}
- // Artificial functions such as global initializers
- if (!SP->isArtificial())
- Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line);
-
+ // Artificial functions such as global initializers
+ if (!SP->isArtificial())
+ Func.getBlock(&EntryBlock).getFile(Filename).addLine(Line);
+
LLVM_DEBUG(dumpEdges(MST, Func));
for (auto &GB : Func.Blocks) {
@@ -925,31 +925,31 @@ bool GCOVProfiler::emitProfileNotes(
uint32_t Idx = Succ.first->Number;
do EdgeDestinations.push_back(Idx & 255);
while ((Idx >>= 8) > 0);
- }
-
- for (auto &I : BB) {
- // Debug intrinsic locations correspond to the location of the
- // declaration, not necessarily any statements or expressions.
- if (isa<DbgInfoIntrinsic>(&I)) continue;
-
- const DebugLoc &Loc = I.getDebugLoc();
- if (!Loc)
- continue;
-
- // Artificial lines such as calls to the global constructors.
- if (Loc.getLine() == 0 || Loc.isImplicitCode())
- continue;
-
- if (Line == Loc.getLine()) continue;
- Line = Loc.getLine();
- if (SP != getDISubprogram(Loc.getScope()))
- continue;
-
- GCOVLines &Lines = Block.getFile(Filename);
- Lines.addLine(Loc.getLine());
- }
- Line = 0;
- }
+ }
+
+ for (auto &I : BB) {
+ // Debug intrinsic locations correspond to the location of the
+ // declaration, not necessarily any statements or expressions.
+ if (isa<DbgInfoIntrinsic>(&I)) continue;
+
+ const DebugLoc &Loc = I.getDebugLoc();
+ if (!Loc)
+ continue;
+
+ // Artificial lines such as calls to the global constructors.
+ if (Loc.getLine() == 0 || Loc.isImplicitCode())
+ continue;
+
+ if (Line == Loc.getLine()) continue;
+ Line = Loc.getLine();
+ if (SP != getDISubprogram(Loc.getScope()))
+ continue;
+
+ GCOVLines &Lines = Block.getFile(Filename);
+ Lines.addLine(Loc.getLine());
+ }
+ Line = 0;
+ }
if (EmitGCDA) {
DISubprogram *SP = F.getSubprogram();
ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(*Ctx), Measured);
@@ -974,14 +974,14 @@ bool GCOVProfiler::emitProfileNotes(
}
}
}
- }
-
- char Tmp[4];
+ }
+
+ char Tmp[4];
JamCRC JC;
JC.update(EdgeDestinations);
uint32_t Stamp = JC.getCRC();
- FileChecksums.push_back(Stamp);
-
+ FileChecksums.push_back(Stamp);
+
if (Options.EmitNotes) {
std::error_code EC;
raw_fd_ostream out(mangleName(CU, GCovFileType::GCNO), EC,
@@ -990,8 +990,8 @@ bool GCOVProfiler::emitProfileNotes(
Ctx->emitError(
Twine("failed to open coverage notes file for writing: ") +
EC.message());
- continue;
- }
+ continue;
+ }
os = &out;
if (Endian == support::endianness::big) {
out.write("gcno", 4);
@@ -1006,28 +1006,28 @@ bool GCOVProfiler::emitProfileNotes(
writeString(""); // unused current_working_directory
if (Version >= 80)
write(0); // unused has_unexecuted_blocks
-
+
for (auto &Func : Funcs)
Func->writeOut(Stamp);
-
+
write(0);
write(0);
out.close();
}
-
+
if (EmitGCDA) {
emitGlobalConstructor(CountersBySP);
EmitGCDA = false;
- }
+ }
}
return true;
}
-
+
void GCOVProfiler::emitGlobalConstructor(
SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP) {
Function *WriteoutF = insertCounterWriteout(CountersBySP);
Function *ResetF = insertReset(CountersBySP);
-
+
// Create a small bit of code that registers the "__llvm_gcov_writeout" to
// be executed at exit and the "__llvm_gcov_flush" function to be executed
// when "__gcov_flush" is called.
@@ -1039,355 +1039,355 @@ void GCOVProfiler::emitGlobalConstructor(
F->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
F->addFnAttr(Attribute::NoRedZone);
-
+
BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F);
IRBuilder<> Builder(BB);
-
+
FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
auto *PFTy = PointerType::get(FTy, 0);
FTy = FunctionType::get(Builder.getVoidTy(), {PFTy, PFTy}, false);
-
+
// Initialize the environment and register the local writeout, flush and
// reset functions.
FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
Builder.CreateCall(GCOVInit, {WriteoutF, ResetF});
Builder.CreateRetVoid();
-
+
appendToGlobalCtors(*M, F, 0);
-}
-
-FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt8PtrTy(*Ctx), // const char *orig_filename
- Type::getInt32Ty(*Ctx), // uint32_t version
- Type::getInt32Ty(*Ctx), // uint32_t checksum
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(*Ctx, 2, AK);
- FunctionCallee Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy, AL);
- return Res;
-}
-
-FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt32Ty(*Ctx), // uint32_t ident
- Type::getInt32Ty(*Ctx), // uint32_t func_checksum
- Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false)) {
- AL = AL.addParamAttribute(*Ctx, 0, AK);
- AL = AL.addParamAttribute(*Ctx, 1, AK);
- AL = AL.addParamAttribute(*Ctx, 2, AK);
- }
- return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
-}
-
-FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) {
- Type *Args[] = {
- Type::getInt32Ty(*Ctx), // uint32_t num_counters
- Type::getInt64PtrTy(*Ctx), // uint64_t *counters
- };
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
- AttributeList AL;
- if (auto AK = TLI->getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(*Ctx, 0, AK);
- return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy, AL);
-}
-
-FunctionCallee GCOVProfiler::getSummaryInfoFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
-}
-
-FunctionCallee GCOVProfiler::getEndFileFunc() {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
-}
-
-Function *GCOVProfiler::insertCounterWriteout(
- ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
- FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
- if (!WriteoutF)
- WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
- "__llvm_gcov_writeout", M);
- WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- WriteoutF->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- WriteoutF->addFnAttr(Attribute::NoRedZone);
-
- BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
- IRBuilder<> Builder(BB);
-
- auto *TLI = &GetTLI(*WriteoutF);
-
- FunctionCallee StartFile = getStartFileFunc(TLI);
- FunctionCallee EmitFunction = getEmitFunctionFunc(TLI);
- FunctionCallee EmitArcs = getEmitArcsFunc(TLI);
- FunctionCallee SummaryInfo = getSummaryInfoFunc();
- FunctionCallee EndFile = getEndFileFunc();
-
- NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
- if (!CUNodes) {
- Builder.CreateRetVoid();
- return WriteoutF;
- }
-
- // Collect the relevant data into a large constant data structure that we can
- // walk to write out everything.
- StructType *StartFileCallArgsTy = StructType::create(
+}
+
+FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt8PtrTy(*Ctx), // const char *orig_filename
+ Type::getInt32Ty(*Ctx), // uint32_t version
+ Type::getInt32Ty(*Ctx), // uint32_t checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(*Ctx, 2, AK);
+ FunctionCallee Res = M->getOrInsertFunction("llvm_gcda_start_file", FTy, AL);
+ return Res;
+}
+
+FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t ident
+ Type::getInt32Ty(*Ctx), // uint32_t func_checksum
+ Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ AL = AL.addParamAttribute(*Ctx, 0, AK);
+ AL = AL.addParamAttribute(*Ctx, 1, AK);
+ AL = AL.addParamAttribute(*Ctx, 2, AK);
+ }
+ return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
+}
+
+FunctionCallee GCOVProfiler::getEmitArcsFunc(const TargetLibraryInfo *TLI) {
+ Type *Args[] = {
+ Type::getInt32Ty(*Ctx), // uint32_t num_counters
+ Type::getInt64PtrTy(*Ctx), // uint64_t *counters
+ };
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
+ AttributeList AL;
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(*Ctx, 0, AK);
+ return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy, AL);
+}
+
+FunctionCallee GCOVProfiler::getSummaryInfoFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_summary_info", FTy);
+}
+
+FunctionCallee GCOVProfiler::getEndFileFunc() {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ return M->getOrInsertFunction("llvm_gcda_end_file", FTy);
+}
+
+Function *GCOVProfiler::insertCounterWriteout(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
+ FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ if (!WriteoutF)
+ WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_writeout", M);
+ WriteoutF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ WriteoutF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ WriteoutF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
+ IRBuilder<> Builder(BB);
+
+ auto *TLI = &GetTLI(*WriteoutF);
+
+ FunctionCallee StartFile = getStartFileFunc(TLI);
+ FunctionCallee EmitFunction = getEmitFunctionFunc(TLI);
+ FunctionCallee EmitArcs = getEmitArcsFunc(TLI);
+ FunctionCallee SummaryInfo = getSummaryInfoFunc();
+ FunctionCallee EndFile = getEndFileFunc();
+
+ NamedMDNode *CUNodes = M->getNamedMetadata("llvm.dbg.cu");
+ if (!CUNodes) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // Collect the relevant data into a large constant data structure that we can
+ // walk to write out everything.
+ StructType *StartFileCallArgsTy = StructType::create(
{Builder.getInt8PtrTy(), Builder.getInt32Ty(), Builder.getInt32Ty()},
"start_file_args_ty");
- StructType *EmitFunctionCallArgsTy = StructType::create(
+ StructType *EmitFunctionCallArgsTy = StructType::create(
{Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()},
"emit_function_args_ty");
- StructType *EmitArcsCallArgsTy = StructType::create(
+ StructType *EmitArcsCallArgsTy = StructType::create(
{Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()},
"emit_arcs_args_ty");
- StructType *FileInfoTy =
- StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
- EmitFunctionCallArgsTy->getPointerTo(),
+ StructType *FileInfoTy =
+ StructType::create({StartFileCallArgsTy, Builder.getInt32Ty(),
+ EmitFunctionCallArgsTy->getPointerTo(),
EmitArcsCallArgsTy->getPointerTo()},
"file_info");
-
- Constant *Zero32 = Builder.getInt32(0);
- // Build an explicit array of two zeros for use in ConstantExpr GEP building.
- Constant *TwoZero32s[] = {Zero32, Zero32};
-
- SmallVector<Constant *, 8> FileInfos;
- for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
- auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
-
- // Skip module skeleton (and module) CUs.
- if (CU->getDWOId())
- continue;
-
- std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
- uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
- auto *StartFileCallArgs = ConstantStruct::get(
- StartFileCallArgsTy,
- {Builder.CreateGlobalStringPtr(FilenameGcda),
- Builder.getInt32(endian::read32be(Options.Version)),
- Builder.getInt32(CfgChecksum)});
-
- SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
- SmallVector<Constant *, 8> EmitArcsCallArgsArray;
- for (int j : llvm::seq<int>(0, CountersBySP.size())) {
- uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
- EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
- EmitFunctionCallArgsTy,
- {Builder.getInt32(j),
- Builder.getInt32(FuncChecksum),
- Builder.getInt32(CfgChecksum)}));
-
- GlobalVariable *GV = CountersBySP[j].first;
- unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
- EmitArcsCallArgsArray.push_back(ConstantStruct::get(
- EmitArcsCallArgsTy,
- {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
- GV->getValueType(), GV, TwoZero32s)}));
- }
- // Create global arrays for the two emit calls.
- int CountersSize = CountersBySP.size();
- assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
- "Mismatched array size!");
- assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
- "Mismatched array size!");
- auto *EmitFunctionCallArgsArrayTy =
- ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
- auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
- *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
- GlobalValue::InternalLinkage,
- ConstantArray::get(EmitFunctionCallArgsArrayTy,
- EmitFunctionCallArgsArray),
- Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
- auto *EmitArcsCallArgsArrayTy =
- ArrayType::get(EmitArcsCallArgsTy, CountersSize);
- EmitFunctionCallArgsArrayGV->setUnnamedAddr(
- GlobalValue::UnnamedAddr::Global);
- auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
- *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
- GlobalValue::InternalLinkage,
- ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
- Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
- EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- FileInfos.push_back(ConstantStruct::get(
- FileInfoTy,
- {StartFileCallArgs, Builder.getInt32(CountersSize),
- ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
- EmitFunctionCallArgsArrayGV,
- TwoZero32s),
- ConstantExpr::getInBoundsGetElementPtr(
- EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
- }
-
- // If we didn't find anything to actually emit, bail on out.
- if (FileInfos.empty()) {
- Builder.CreateRetVoid();
- return WriteoutF;
- }
-
- // To simplify code, we cap the number of file infos we write out to fit
- // easily in a 32-bit signed integer. This gives consistent behavior between
- // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
- // operations on 32-bit systems. It also seems unreasonable to try to handle
- // more than 2 billion files.
- if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
- FileInfos.resize(INT_MAX);
-
- // Create a global for the entire data structure so we can walk it more
- // easily.
- auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
- auto *FileInfoArrayGV = new GlobalVariable(
- *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
- ConstantArray::get(FileInfoArrayTy, FileInfos),
- "__llvm_internal_gcov_emit_file_info");
- FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
-
- // Create the CFG for walking this data structure.
- auto *FileLoopHeader =
- BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
- auto *CounterLoopHeader =
- BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
- auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
- auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
-
- // We always have at least one file, so just branch to the header.
- Builder.CreateBr(FileLoopHeader);
-
- // The index into the files structure is our loop induction variable.
- Builder.SetInsertPoint(FileLoopHeader);
+
+ Constant *Zero32 = Builder.getInt32(0);
+ // Build an explicit array of two zeros for use in ConstantExpr GEP building.
+ Constant *TwoZero32s[] = {Zero32, Zero32};
+
+ SmallVector<Constant *, 8> FileInfos;
+ for (int i : llvm::seq<int>(0, CUNodes->getNumOperands())) {
+ auto *CU = cast<DICompileUnit>(CUNodes->getOperand(i));
+
+ // Skip module skeleton (and module) CUs.
+ if (CU->getDWOId())
+ continue;
+
+ std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
+ uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
+ auto *StartFileCallArgs = ConstantStruct::get(
+ StartFileCallArgsTy,
+ {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.getInt32(endian::read32be(Options.Version)),
+ Builder.getInt32(CfgChecksum)});
+
+ SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
+ SmallVector<Constant *, 8> EmitArcsCallArgsArray;
+ for (int j : llvm::seq<int>(0, CountersBySP.size())) {
+ uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
+ EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
+ EmitFunctionCallArgsTy,
+ {Builder.getInt32(j),
+ Builder.getInt32(FuncChecksum),
+ Builder.getInt32(CfgChecksum)}));
+
+ GlobalVariable *GV = CountersBySP[j].first;
+ unsigned Arcs = cast<ArrayType>(GV->getValueType())->getNumElements();
+ EmitArcsCallArgsArray.push_back(ConstantStruct::get(
+ EmitArcsCallArgsTy,
+ {Builder.getInt32(Arcs), ConstantExpr::getInBoundsGetElementPtr(
+ GV->getValueType(), GV, TwoZero32s)}));
+ }
+ // Create global arrays for the two emit calls.
+ int CountersSize = CountersBySP.size();
+ assert(CountersSize == (int)EmitFunctionCallArgsArray.size() &&
+ "Mismatched array size!");
+ assert(CountersSize == (int)EmitArcsCallArgsArray.size() &&
+ "Mismatched array size!");
+ auto *EmitFunctionCallArgsArrayTy =
+ ArrayType::get(EmitFunctionCallArgsTy, CountersSize);
+ auto *EmitFunctionCallArgsArrayGV = new GlobalVariable(
+ *M, EmitFunctionCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_function_args.") + Twine(i));
+ auto *EmitArcsCallArgsArrayTy =
+ ArrayType::get(EmitArcsCallArgsTy, CountersSize);
+ EmitFunctionCallArgsArrayGV->setUnnamedAddr(
+ GlobalValue::UnnamedAddr::Global);
+ auto *EmitArcsCallArgsArrayGV = new GlobalVariable(
+ *M, EmitArcsCallArgsArrayTy, /*isConstant*/ true,
+ GlobalValue::InternalLinkage,
+ ConstantArray::get(EmitArcsCallArgsArrayTy, EmitArcsCallArgsArray),
+ Twine("__llvm_internal_gcov_emit_arcs_args.") + Twine(i));
+ EmitArcsCallArgsArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ FileInfos.push_back(ConstantStruct::get(
+ FileInfoTy,
+ {StartFileCallArgs, Builder.getInt32(CountersSize),
+ ConstantExpr::getInBoundsGetElementPtr(EmitFunctionCallArgsArrayTy,
+ EmitFunctionCallArgsArrayGV,
+ TwoZero32s),
+ ConstantExpr::getInBoundsGetElementPtr(
+ EmitArcsCallArgsArrayTy, EmitArcsCallArgsArrayGV, TwoZero32s)}));
+ }
+
+ // If we didn't find anything to actually emit, bail on out.
+ if (FileInfos.empty()) {
+ Builder.CreateRetVoid();
+ return WriteoutF;
+ }
+
+ // To simplify code, we cap the number of file infos we write out to fit
+ // easily in a 32-bit signed integer. This gives consistent behavior between
+ // 32-bit and 64-bit systems without requiring (potentially very slow) 64-bit
+ // operations on 32-bit systems. It also seems unreasonable to try to handle
+ // more than 2 billion files.
+ if ((int64_t)FileInfos.size() > (int64_t)INT_MAX)
+ FileInfos.resize(INT_MAX);
+
+ // Create a global for the entire data structure so we can walk it more
+ // easily.
+ auto *FileInfoArrayTy = ArrayType::get(FileInfoTy, FileInfos.size());
+ auto *FileInfoArrayGV = new GlobalVariable(
+ *M, FileInfoArrayTy, /*isConstant*/ true, GlobalValue::InternalLinkage,
+ ConstantArray::get(FileInfoArrayTy, FileInfos),
+ "__llvm_internal_gcov_emit_file_info");
+ FileInfoArrayGV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+ // Create the CFG for walking this data structure.
+ auto *FileLoopHeader =
+ BasicBlock::Create(*Ctx, "file.loop.header", WriteoutF);
+ auto *CounterLoopHeader =
+ BasicBlock::Create(*Ctx, "counter.loop.header", WriteoutF);
+ auto *FileLoopLatch = BasicBlock::Create(*Ctx, "file.loop.latch", WriteoutF);
+ auto *ExitBB = BasicBlock::Create(*Ctx, "exit", WriteoutF);
+
+ // We always have at least one file, so just branch to the header.
+ Builder.CreateBr(FileLoopHeader);
+
+ // The index into the files structure is our loop induction variable.
+ Builder.SetInsertPoint(FileLoopHeader);
PHINode *IV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2,
"file_idx");
- IV->addIncoming(Builder.getInt32(0), BB);
- auto *FileInfoPtr = Builder.CreateInBoundsGEP(
- FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV});
- auto *StartFileCallArgsPtr =
+ IV->addIncoming(Builder.getInt32(0), BB);
+ auto *FileInfoPtr = Builder.CreateInBoundsGEP(
+ FileInfoArrayTy, FileInfoArrayGV, {Builder.getInt32(0), IV});
+ auto *StartFileCallArgsPtr =
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 0, "start_file_args");
- auto *StartFileCall = Builder.CreateCall(
- StartFile,
- {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ auto *StartFileCall = Builder.CreateCall(
+ StartFile,
+ {Builder.CreateLoad(StartFileCallArgsTy->getElementType(0),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 0),
"filename"),
- Builder.CreateLoad(StartFileCallArgsTy->getElementType(1),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ Builder.CreateLoad(StartFileCallArgsTy->getElementType(1),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 1),
"version"),
- Builder.CreateLoad(StartFileCallArgsTy->getElementType(2),
- Builder.CreateStructGEP(StartFileCallArgsTy,
+ Builder.CreateLoad(StartFileCallArgsTy->getElementType(2),
+ Builder.CreateStructGEP(StartFileCallArgsTy,
StartFileCallArgsPtr, 2),
"stamp")});
- if (auto AK = TLI->getExtAttrForI32Param(false))
- StartFileCall->addParamAttr(2, AK);
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ StartFileCall->addParamAttr(2, AK);
auto *NumCounters = Builder.CreateLoad(
FileInfoTy->getElementType(1),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 1), "num_ctrs");
- auto *EmitFunctionCallArgsArray =
- Builder.CreateLoad(FileInfoTy->getElementType(2),
+ auto *EmitFunctionCallArgsArray =
+ Builder.CreateLoad(FileInfoTy->getElementType(2),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 2),
"emit_function_args");
auto *EmitArcsCallArgsArray = Builder.CreateLoad(
FileInfoTy->getElementType(3),
Builder.CreateStructGEP(FileInfoTy, FileInfoPtr, 3), "emit_arcs_args");
- auto *EnterCounterLoopCond =
- Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
- Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
-
- Builder.SetInsertPoint(CounterLoopHeader);
+ auto *EnterCounterLoopCond =
+ Builder.CreateICmpSLT(Builder.getInt32(0), NumCounters);
+ Builder.CreateCondBr(EnterCounterLoopCond, CounterLoopHeader, FileLoopLatch);
+
+ Builder.SetInsertPoint(CounterLoopHeader);
auto *JV = Builder.CreatePHI(Builder.getInt32Ty(), /*NumReservedValues*/ 2,
"ctr_idx");
- JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
- auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP(
- EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV);
- auto *EmitFunctionCall = Builder.CreateCall(
- EmitFunction,
- {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ JV->addIncoming(Builder.getInt32(0), FileLoopHeader);
+ auto *EmitFunctionCallArgsPtr = Builder.CreateInBoundsGEP(
+ EmitFunctionCallArgsTy, EmitFunctionCallArgsArray, JV);
+ auto *EmitFunctionCall = Builder.CreateCall(
+ EmitFunction,
+ {Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(0),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 0),
"ident"),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(1),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 1),
"func_checksum"),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
+ Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2),
+ Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr, 2),
"cfg_checksum")});
- if (auto AK = TLI->getExtAttrForI32Param(false)) {
- EmitFunctionCall->addParamAttr(0, AK);
- EmitFunctionCall->addParamAttr(1, AK);
- EmitFunctionCall->addParamAttr(2, AK);
- }
- auto *EmitArcsCallArgsPtr =
- Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV);
- auto *EmitArcsCall = Builder.CreateCall(
- EmitArcs,
- {Builder.CreateLoad(
- EmitArcsCallArgsTy->getElementType(0),
+ if (auto AK = TLI->getExtAttrForI32Param(false)) {
+ EmitFunctionCall->addParamAttr(0, AK);
+ EmitFunctionCall->addParamAttr(1, AK);
+ EmitFunctionCall->addParamAttr(2, AK);
+ }
+ auto *EmitArcsCallArgsPtr =
+ Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV);
+ auto *EmitArcsCall = Builder.CreateCall(
+ EmitArcs,
+ {Builder.CreateLoad(
+ EmitArcsCallArgsTy->getElementType(0),
Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 0),
"num_counters"),
Builder.CreateLoad(
EmitArcsCallArgsTy->getElementType(1),
Builder.CreateStructGEP(EmitArcsCallArgsTy, EmitArcsCallArgsPtr, 1),
"counters")});
- if (auto AK = TLI->getExtAttrForI32Param(false))
- EmitArcsCall->addParamAttr(0, AK);
- auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
- auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
- Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
- JV->addIncoming(NextJV, CounterLoopHeader);
-
- Builder.SetInsertPoint(FileLoopLatch);
- Builder.CreateCall(SummaryInfo, {});
- Builder.CreateCall(EndFile, {});
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ EmitArcsCall->addParamAttr(0, AK);
+ auto *NextJV = Builder.CreateAdd(JV, Builder.getInt32(1));
+ auto *CounterLoopCond = Builder.CreateICmpSLT(NextJV, NumCounters);
+ Builder.CreateCondBr(CounterLoopCond, CounterLoopHeader, FileLoopLatch);
+ JV->addIncoming(NextJV, CounterLoopHeader);
+
+ Builder.SetInsertPoint(FileLoopLatch);
+ Builder.CreateCall(SummaryInfo, {});
+ Builder.CreateCall(EndFile, {});
auto *NextIV = Builder.CreateAdd(IV, Builder.getInt32(1), "next_file_idx");
- auto *FileLoopCond =
- Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
- Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
- IV->addIncoming(NextIV, FileLoopLatch);
-
- Builder.SetInsertPoint(ExitBB);
- Builder.CreateRetVoid();
-
- return WriteoutF;
-}
-
-Function *GCOVProfiler::insertReset(
- ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
- FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *ResetF = M->getFunction("__llvm_gcov_reset");
- if (!ResetF)
- ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
- "__llvm_gcov_reset", M);
- ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- ResetF->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- ResetF->addFnAttr(Attribute::NoRedZone);
-
- BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
- IRBuilder<> Builder(Entry);
-
- // Zero out the counters.
- for (const auto &I : CountersBySP) {
- GlobalVariable *GV = I.first;
- Constant *Null = Constant::getNullValue(GV->getValueType());
- Builder.CreateStore(Null, GV);
- }
-
- Type *RetTy = ResetF->getReturnType();
- if (RetTy->isVoidTy())
- Builder.CreateRetVoid();
- else if (RetTy->isIntegerTy())
- // Used if __llvm_gcov_reset was implicitly declared.
- Builder.CreateRet(ConstantInt::get(RetTy, 0));
- else
- report_fatal_error("invalid return type for __llvm_gcov_reset");
-
- return ResetF;
-}
+ auto *FileLoopCond =
+ Builder.CreateICmpSLT(NextIV, Builder.getInt32(FileInfos.size()));
+ Builder.CreateCondBr(FileLoopCond, FileLoopHeader, ExitBB);
+ IV->addIncoming(NextIV, FileLoopLatch);
+
+ Builder.SetInsertPoint(ExitBB);
+ Builder.CreateRetVoid();
+
+ return WriteoutF;
+}
+
+Function *GCOVProfiler::insertReset(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *ResetF = M->getFunction("__llvm_gcov_reset");
+ if (!ResetF)
+ ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_reset", M);
+ ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ ResetF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ ResetF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
+ IRBuilder<> Builder(Entry);
+
+ // Zero out the counters.
+ for (const auto &I : CountersBySP) {
+ GlobalVariable *GV = I.first;
+ Constant *Null = Constant::getNullValue(GV->getValueType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = ResetF->getReturnType();
+ if (RetTy->isVoidTy())
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __llvm_gcov_reset was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __llvm_gcov_reset");
+
+ return ResetF;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 1dffdacc3a..fedd9bfc97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -1,375 +1,375 @@
-//===- HWAddressSanitizer.cpp - detector of uninitialized reads -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of HWAddressSanitizer, an address sanity checker
-/// based on tagged addressing.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <sstream>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "hwasan"
-
+//===- HWAddressSanitizer.cpp - detector of uninitialized reads -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of HWAddressSanitizer, an address sanity checker
+/// based on tagged addressing.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/HWAddressSanitizer.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <sstream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hwasan"
+
const char kHwasanModuleCtorName[] = "hwasan.module_ctor";
const char kHwasanNoteName[] = "hwasan.note";
const char kHwasanInitName[] = "__hwasan_init";
const char kHwasanPersonalityThunkName[] = "__hwasan_personality_thunk";
-
+
const char kHwasanShadowMemoryDynamicAddress[] =
- "__hwasan_shadow_memory_dynamic_address";
-
-// Access sizes are powers of two: 1, 2, 4, 8, 16.
-static const size_t kNumberOfAccessSizes = 5;
-
-static const size_t kDefaultShadowScale = 4;
-static const uint64_t kDynamicShadowSentinel =
- std::numeric_limits<uint64_t>::max();
-static const unsigned kPointerTagShift = 56;
-
-static const unsigned kShadowBaseAlignment = 32;
-
-static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
- "hwasan-memory-access-callback-prefix",
- cl::desc("Prefix for memory access callbacks"), cl::Hidden,
- cl::init("__hwasan_"));
-
-static cl::opt<bool>
- ClInstrumentWithCalls("hwasan-instrument-with-calls",
- cl::desc("instrument reads and writes with callbacks"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
- cl::desc("instrument read instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentWrites(
- "hwasan-instrument-writes", cl::desc("instrument write instructions"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClInstrumentAtomics(
- "hwasan-instrument-atomics",
- cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
- cl::init(true));
-
-static cl::opt<bool> ClInstrumentByval("hwasan-instrument-byval",
- cl::desc("instrument byval arguments"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClRecover(
- "hwasan-recover",
- cl::desc("Enable recovery mode (continue-after-error)."),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
- cl::desc("instrument stack (allocas)"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClUARRetagToZero(
- "hwasan-uar-retag-to-zero",
- cl::desc("Clear alloca tags before returning from the function to allow "
- "non-instrumented and instrumented function calls mix. When set "
- "to false, allocas are retagged before returning from the "
- "function to detect use after return."),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClGenerateTagsWithCalls(
- "hwasan-generate-tags-with-calls",
- cl::desc("generate new tags with runtime library calls"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
- cl::Hidden, cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<int> ClMatchAllTag(
- "hwasan-match-all-tag",
- cl::desc("don't report bad accesses via pointers with this tag"),
- cl::Hidden, cl::init(-1));
-
-static cl::opt<bool> ClEnableKhwasan(
- "hwasan-kernel",
- cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-// These flags allow changing the shadow mapping and control how shadow memory
-// is accessed. The shadow mapping looks like:
-// Shadow = (Mem >> scale) + offset
-
-static cl::opt<uint64_t>
- ClMappingOffset("hwasan-mapping-offset",
- cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool>
- ClWithIfunc("hwasan-with-ifunc",
- cl::desc("Access dynamic shadow through an ifunc global on "
- "platforms that support this"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClWithTls(
- "hwasan-with-tls",
- cl::desc("Access dynamic shadow through an thread-local pointer on "
- "platforms that support this"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- ClRecordStackHistory("hwasan-record-stack-history",
- cl::desc("Record stack frames with tagged allocations "
- "in a thread-local ring buffer"),
- cl::Hidden, cl::init(true));
-static cl::opt<bool>
- ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
- cl::desc("instrument memory intrinsics"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- ClInstrumentLandingPads("hwasan-instrument-landing-pads",
- cl::desc("instrument landing pads"), cl::Hidden,
- cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<bool> ClUseShortGranules(
- "hwasan-use-short-granules",
- cl::desc("use short granules in allocas and outlined checks"), cl::Hidden,
- cl::init(false), cl::ZeroOrMore);
-
-static cl::opt<bool> ClInstrumentPersonalityFunctions(
- "hwasan-instrument-personality-functions",
- cl::desc("instrument personality functions"), cl::Hidden, cl::init(false),
- cl::ZeroOrMore);
-
-static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
- cl::desc("inline all checks"),
- cl::Hidden, cl::init(false));
-
-namespace {
-
-/// An instrumentation pass implementing detection of addressability bugs
-/// using tagged pointers.
-class HWAddressSanitizer {
-public:
- explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
- bool Recover = false) : M(M) {
- this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
- this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
- ClEnableKhwasan : CompileKernel;
-
- initializeModule();
- }
-
- bool sanitizeFunction(Function &F);
- void initializeModule();
+ "__hwasan_shadow_memory_dynamic_address";
+
+// Access sizes are powers of two: 1, 2, 4, 8, 16.
+static const size_t kNumberOfAccessSizes = 5;
+
+static const size_t kDefaultShadowScale = 4;
+static const uint64_t kDynamicShadowSentinel =
+ std::numeric_limits<uint64_t>::max();
+static const unsigned kPointerTagShift = 56;
+
+static const unsigned kShadowBaseAlignment = 32;
+
+static cl::opt<std::string> ClMemoryAccessCallbackPrefix(
+ "hwasan-memory-access-callback-prefix",
+ cl::desc("Prefix for memory access callbacks"), cl::Hidden,
+ cl::init("__hwasan_"));
+
+static cl::opt<bool>
+ ClInstrumentWithCalls("hwasan-instrument-with-calls",
+ cl::desc("instrument reads and writes with callbacks"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentReads("hwasan-instrument-reads",
+ cl::desc("instrument read instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentWrites(
+ "hwasan-instrument-writes", cl::desc("instrument write instructions"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClInstrumentAtomics(
+ "hwasan-instrument-atomics",
+ cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
+ cl::init(true));
+
+static cl::opt<bool> ClInstrumentByval("hwasan-instrument-byval",
+ cl::desc("instrument byval arguments"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClRecover(
+ "hwasan-recover",
+ cl::desc("Enable recovery mode (continue-after-error)."),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClInstrumentStack("hwasan-instrument-stack",
+ cl::desc("instrument stack (allocas)"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClUARRetagToZero(
+ "hwasan-uar-retag-to-zero",
+ cl::desc("Clear alloca tags before returning from the function to allow "
+ "non-instrumented and instrumented function calls mix. When set "
+ "to false, allocas are retagged before returning from the "
+ "function to detect use after return."),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClGenerateTagsWithCalls(
+ "hwasan-generate-tags-with-calls",
+ cl::desc("generate new tags with runtime library calls"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<int> ClMatchAllTag(
+ "hwasan-match-all-tag",
+ cl::desc("don't report bad accesses via pointers with this tag"),
+ cl::Hidden, cl::init(-1));
+
+static cl::opt<bool> ClEnableKhwasan(
+ "hwasan-kernel",
+ cl::desc("Enable KernelHWAddressSanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// These flags allow changing the shadow mapping and control how shadow memory
+// is accessed. The shadow mapping looks like:
+// Shadow = (Mem >> scale) + offset
+
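+// The mapping described above is plain integer arithmetic. A minimal
+// standalone sketch, assuming the default scale of 4 (one shadow byte per
+// 16-byte granule) and an offset picked purely for illustration; the pass
+// itself emits this computation as IR in memToShadow() further down.
+#include <cstdint>
+
+constexpr uint64_t sketchMemToShadow(uint64_t Mem, unsigned Scale,
+                                     uint64_t Offset) {
+  return (Mem >> Scale) + Offset; // Shadow = (Mem >> scale) + offset
+}
+
+// All 16 bytes of a granule share one shadow byte.
+static_assert(sketchMemToShadow(0x2000, 4, 0x100000000ULL) ==
+                  sketchMemToShadow(0x200f, 4, 0x100000000ULL),
+              "one shadow byte per 16-byte granule");
+static_assert(sketchMemToShadow(0x2010, 4, 0x100000000ULL) ==
+                  sketchMemToShadow(0x2000, 4, 0x100000000ULL) + 1,
+              "the next granule maps to the next shadow byte");
+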
+static cl::opt<uint64_t>
+ ClMappingOffset("hwasan-mapping-offset",
+ cl::desc("HWASan shadow mapping offset [EXPERIMENTAL]"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool>
+ ClWithIfunc("hwasan-with-ifunc",
+ cl::desc("Access dynamic shadow through an ifunc global on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClWithTls(
+ "hwasan-with-tls",
+ cl::desc("Access dynamic shadow through an thread-local pointer on "
+ "platforms that support this"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ ClRecordStackHistory("hwasan-record-stack-history",
+ cl::desc("Record stack frames with tagged allocations "
+ "in a thread-local ring buffer"),
+ cl::Hidden, cl::init(true));
+static cl::opt<bool>
+ ClInstrumentMemIntrinsics("hwasan-instrument-mem-intrinsics",
+ cl::desc("instrument memory intrinsics"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ ClInstrumentLandingPads("hwasan-instrument-landing-pads",
+ cl::desc("instrument landing pads"), cl::Hidden,
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClUseShortGranules(
+ "hwasan-use-short-granules",
+ cl::desc("use short granules in allocas and outlined checks"), cl::Hidden,
+ cl::init(false), cl::ZeroOrMore);
+
+static cl::opt<bool> ClInstrumentPersonalityFunctions(
+ "hwasan-instrument-personality-functions",
+ cl::desc("instrument personality functions"), cl::Hidden, cl::init(false),
+ cl::ZeroOrMore);
+
+static cl::opt<bool> ClInlineAllChecks("hwasan-inline-all-checks",
+ cl::desc("inline all checks"),
+ cl::Hidden, cl::init(false));
+
+namespace {
+
+/// An instrumentation pass implementing detection of addressability bugs
+/// using tagged pointers.
+class HWAddressSanitizer {
+public:
+ explicit HWAddressSanitizer(Module &M, bool CompileKernel = false,
+ bool Recover = false) : M(M) {
+ this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
+ this->CompileKernel = ClEnableKhwasan.getNumOccurrences() > 0 ?
+ ClEnableKhwasan : CompileKernel;
+
+ initializeModule();
+ }
+
+ bool sanitizeFunction(Function &F);
+ void initializeModule();
void createHwasanCtorComdat();
-
- void initializeCallbacks(Module &M);
-
+
+ void initializeCallbacks(Module &M);
+
Value *getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val);
- Value *getDynamicShadowIfunc(IRBuilder<> &IRB);
+ Value *getDynamicShadowIfunc(IRBuilder<> &IRB);
Value *getShadowNonTls(IRBuilder<> &IRB);
-
- void untagPointerOperand(Instruction *I, Value *Addr);
- Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
- void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
- unsigned AccessSizeIndex,
- Instruction *InsertBefore);
- void instrumentMemIntrinsic(MemIntrinsic *MI);
- bool instrumentMemAccess(InterestingMemoryOperand &O);
- bool ignoreAccess(Value *Ptr);
- void getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
-
- bool isInterestingAlloca(const AllocaInst &AI);
- bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
- Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
- Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
- bool instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
- Value *readRegister(IRBuilder<> &IRB, StringRef Name);
- bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
- Value *getNextTagWithCall(IRBuilder<> &IRB);
- Value *getStackBaseTag(IRBuilder<> &IRB);
- Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
- unsigned AllocaNo);
- Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
-
- Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
- void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord);
-
- void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
- void instrumentGlobals();
-
- void instrumentPersonalityFunctions();
-
-private:
- LLVMContext *C;
- Module &M;
- Triple TargetTriple;
- FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset;
- FunctionCallee HWAsanHandleVfork;
-
- /// This struct defines the shadow mapping using the rule:
- /// shadow = (mem >> Scale) + Offset.
- /// If InGlobal is true, then
- /// extern char __hwasan_shadow[];
- /// shadow = (mem >> Scale) + &__hwasan_shadow
- /// If InTls is true, then
- /// extern char *__hwasan_tls;
- /// shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
- struct ShadowMapping {
- int Scale;
- uint64_t Offset;
- bool InGlobal;
- bool InTls;
-
- void init(Triple &TargetTriple);
- unsigned getObjectAlignment() const { return 1U << Scale; }
- };
- ShadowMapping Mapping;
-
- Type *VoidTy = Type::getVoidTy(M.getContext());
- Type *IntptrTy;
- Type *Int8PtrTy;
- Type *Int8Ty;
- Type *Int32Ty;
- Type *Int64Ty = Type::getInt64Ty(M.getContext());
-
- bool CompileKernel;
- bool Recover;
+
+ void untagPointerOperand(Instruction *I, Value *Addr);
+ Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ void instrumentMemAccessInline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore);
+ void instrumentMemIntrinsic(MemIntrinsic *MI);
+ bool instrumentMemAccess(InterestingMemoryOperand &O);
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
+
+ bool isInterestingAlloca(const AllocaInst &AI);
+ bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
+ Value *tagPointer(IRBuilder<> &IRB, Type *Ty, Value *PtrLong, Value *Tag);
+ Value *untagPointer(IRBuilder<> &IRB, Value *PtrLong);
+ bool instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag);
+ Value *readRegister(IRBuilder<> &IRB, StringRef Name);
+ bool instrumentLandingPads(SmallVectorImpl<Instruction *> &RetVec);
+ Value *getNextTagWithCall(IRBuilder<> &IRB);
+ Value *getStackBaseTag(IRBuilder<> &IRB);
+ Value *getAllocaTag(IRBuilder<> &IRB, Value *StackTag, AllocaInst *AI,
+ unsigned AllocaNo);
+ Value *getUARTag(IRBuilder<> &IRB, Value *StackTag);
+
+ Value *getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty);
+ void emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord);
+
+ void instrumentGlobal(GlobalVariable *GV, uint8_t Tag);
+ void instrumentGlobals();
+
+ void instrumentPersonalityFunctions();
+
+private:
+ LLVMContext *C;
+ Module &M;
+ Triple TargetTriple;
+ FunctionCallee HWAsanMemmove, HWAsanMemcpy, HWAsanMemset;
+ FunctionCallee HWAsanHandleVfork;
+
+ /// This struct defines the shadow mapping using the rule:
+ /// shadow = (mem >> Scale) + Offset.
+ /// If InGlobal is true, then
+ /// extern char __hwasan_shadow[];
+ /// shadow = (mem >> Scale) + &__hwasan_shadow
+ /// If InTls is true, then
+ /// extern char *__hwasan_tls;
+ /// shadow = (mem>>Scale) + align_up(__hwasan_shadow, kShadowBaseAlignment)
+ struct ShadowMapping {
+ int Scale;
+ uint64_t Offset;
+ bool InGlobal;
+ bool InTls;
+
+ void init(Triple &TargetTriple);
+ unsigned getObjectAlignment() const { return 1U << Scale; }
+ };
+ ShadowMapping Mapping;
+
+ Type *VoidTy = Type::getVoidTy(M.getContext());
+ Type *IntptrTy;
+ Type *Int8PtrTy;
+ Type *Int8Ty;
+ Type *Int32Ty;
+ Type *Int64Ty = Type::getInt64Ty(M.getContext());
+
+ bool CompileKernel;
+ bool Recover;
bool OutlinedChecks;
- bool UseShortGranules;
- bool InstrumentLandingPads;
-
+ bool UseShortGranules;
+ bool InstrumentLandingPads;
+
bool HasMatchAllTag = false;
uint8_t MatchAllTag = 0;
- Function *HwasanCtorFunction;
-
- FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
- FunctionCallee HwasanMemoryAccessCallbackSized[2];
-
- FunctionCallee HwasanTagMemoryFunc;
- FunctionCallee HwasanGenerateTagFunc;
-
- Constant *ShadowGlobal;
-
+ Function *HwasanCtorFunction;
+
+ FunctionCallee HwasanMemoryAccessCallback[2][kNumberOfAccessSizes];
+ FunctionCallee HwasanMemoryAccessCallbackSized[2];
+
+ FunctionCallee HwasanTagMemoryFunc;
+ FunctionCallee HwasanGenerateTagFunc;
+
+ Constant *ShadowGlobal;
+
Value *ShadowBase = nullptr;
- Value *StackBaseTag = nullptr;
- GlobalValue *ThreadPtrGlobal = nullptr;
-};
-
-class HWAddressSanitizerLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid.
- static char ID;
-
- explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
- bool Recover = false)
- : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {
- initializeHWAddressSanitizerLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "HWAddressSanitizer"; }
-
- bool doInitialization(Module &M) override {
- HWASan = std::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover);
- return true;
- }
-
- bool runOnFunction(Function &F) override {
- return HWASan->sanitizeFunction(F);
- }
-
- bool doFinalization(Module &M) override {
- HWASan.reset();
- return false;
- }
-
-private:
- std::unique_ptr<HWAddressSanitizer> HWASan;
- bool CompileKernel;
- bool Recover;
-};
-
-} // end anonymous namespace
-
-char HWAddressSanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
- HWAddressSanitizerLegacyPass, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
- false)
-INITIALIZE_PASS_END(
- HWAddressSanitizerLegacyPass, "hwasan",
- "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
- false)
-
-FunctionPass *llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel,
- bool Recover) {
- assert(!CompileKernel || Recover);
- return new HWAddressSanitizerLegacyPass(CompileKernel, Recover);
-}
-
-HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover)
- : CompileKernel(CompileKernel), Recover(Recover) {}
-
-PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- HWAddressSanitizer HWASan(M, CompileKernel, Recover);
- bool Modified = false;
- for (Function &F : M)
- Modified |= HWASan.sanitizeFunction(F);
- if (Modified)
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
+ Value *StackBaseTag = nullptr;
+ GlobalValue *ThreadPtrGlobal = nullptr;
+};
+
+class HWAddressSanitizerLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
+ bool Recover = false)
+ : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {
+ initializeHWAddressSanitizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "HWAddressSanitizer"; }
+
+ bool doInitialization(Module &M) override {
+ HWASan = std::make_unique<HWAddressSanitizer>(M, CompileKernel, Recover);
+ return true;
+ }
+
+ bool runOnFunction(Function &F) override {
+ return HWASan->sanitizeFunction(F);
+ }
+
+ bool doFinalization(Module &M) override {
+ HWASan.reset();
+ return false;
+ }
+
+private:
+ std::unique_ptr<HWAddressSanitizer> HWASan;
+ bool CompileKernel;
+ bool Recover;
+};
+
+} // end anonymous namespace
+
+char HWAddressSanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(
+ HWAddressSanitizerLegacyPass, "hwasan",
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
+INITIALIZE_PASS_END(
+ HWAddressSanitizerLegacyPass, "hwasan",
+ "HWAddressSanitizer: detect memory bugs using tagged addressing.", false,
+ false)
+
+FunctionPass *llvm::createHWAddressSanitizerLegacyPassPass(bool CompileKernel,
+ bool Recover) {
+ assert(!CompileKernel || Recover);
+ return new HWAddressSanitizerLegacyPass(CompileKernel, Recover);
+}
+
+HWAddressSanitizerPass::HWAddressSanitizerPass(bool CompileKernel, bool Recover)
+ : CompileKernel(CompileKernel), Recover(Recover) {}
+
+PreservedAnalyses HWAddressSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ HWAddressSanitizer HWASan(M, CompileKernel, Recover);
+ bool Modified = false;
+ for (Function &F : M)
+ Modified |= HWASan.sanitizeFunction(F);
+ if (Modified)
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
void HWAddressSanitizer::createHwasanCtorComdat() {
std::tie(HwasanCtorFunction, std::ignore) =
getOrCreateSanitizerCtorAndInitFunctions(
@@ -470,38 +470,38 @@ void HWAddressSanitizer::createHwasanCtorComdat() {
appendToCompilerUsed(M, Dummy);
}
-/// Module-level initialization.
-///
-/// Inserts a call to __hwasan_init into the module's constructor list.
-void HWAddressSanitizer::initializeModule() {
- LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
- auto &DL = M.getDataLayout();
-
- TargetTriple = Triple(M.getTargetTriple());
-
- Mapping.init(TargetTriple);
-
- C = &(M.getContext());
- IRBuilder<> IRB(*C);
- IntptrTy = IRB.getIntPtrTy(DL);
- Int8PtrTy = IRB.getInt8PtrTy();
- Int8Ty = IRB.getInt8Ty();
- Int32Ty = IRB.getInt32Ty();
-
- HwasanCtorFunction = nullptr;
-
- // Older versions of Android do not have the required runtime support for
- // short granules, global or personality function instrumentation. On other
- // platforms we currently require using the latest version of the runtime.
- bool NewRuntime =
- !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
-
- UseShortGranules =
- ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
+/// Module-level initialization.
+///
+/// Inserts a call to __hwasan_init into the module's constructor list.
+void HWAddressSanitizer::initializeModule() {
+ LLVM_DEBUG(dbgs() << "Init " << M.getName() << "\n");
+ auto &DL = M.getDataLayout();
+
+ TargetTriple = Triple(M.getTargetTriple());
+
+ Mapping.init(TargetTriple);
+
+ C = &(M.getContext());
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(DL);
+ Int8PtrTy = IRB.getInt8PtrTy();
+ Int8Ty = IRB.getInt8Ty();
+ Int32Ty = IRB.getInt32Ty();
+
+ HwasanCtorFunction = nullptr;
+
+ // Older versions of Android do not have the required runtime support for
+ // short granules, global or personality function instrumentation. On other
+ // platforms we currently require using the latest version of the runtime.
+ bool NewRuntime =
+ !TargetTriple.isAndroid() || !TargetTriple.isAndroidVersionLT(30);
+
+ UseShortGranules =
+ ClUseShortGranules.getNumOccurrences() ? ClUseShortGranules : NewRuntime;
OutlinedChecks =
TargetTriple.isAArch64() && TargetTriple.isOSBinFormatELF() &&
(ClInlineAllChecks.getNumOccurrences() ? !ClInlineAllChecks : !Recover);
-
+
if (ClMatchAllTag.getNumOccurrences()) {
if (ClMatchAllTag != -1) {
HasMatchAllTag = true;
@@ -512,86 +512,86 @@ void HWAddressSanitizer::initializeModule() {
MatchAllTag = 0xFF;
}
- // If we don't have personality function support, fall back to landing pads.
- InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
- ? ClInstrumentLandingPads
- : !NewRuntime;
-
- if (!CompileKernel) {
+ // If we don't have personality function support, fall back to landing pads.
+ InstrumentLandingPads = ClInstrumentLandingPads.getNumOccurrences()
+ ? ClInstrumentLandingPads
+ : !NewRuntime;
+
+ if (!CompileKernel) {
createHwasanCtorComdat();
- bool InstrumentGlobals =
- ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
- if (InstrumentGlobals)
- instrumentGlobals();
-
- bool InstrumentPersonalityFunctions =
- ClInstrumentPersonalityFunctions.getNumOccurrences()
- ? ClInstrumentPersonalityFunctions
- : NewRuntime;
- if (InstrumentPersonalityFunctions)
- instrumentPersonalityFunctions();
- }
-
- if (!TargetTriple.isAndroid()) {
- Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] {
- auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false,
- GlobalValue::ExternalLinkage, nullptr,
- "__hwasan_tls", nullptr,
- GlobalVariable::InitialExecTLSModel);
- appendToCompilerUsed(M, GV);
- return GV;
- });
- ThreadPtrGlobal = cast<GlobalVariable>(C);
- }
-}
-
-void HWAddressSanitizer::initializeCallbacks(Module &M) {
- IRBuilder<> IRB(*C);
- for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
- const std::string TypeStr = AccessIsWrite ? "store" : "load";
- const std::string EndingStr = Recover ? "_noabort" : "";
-
- HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
- FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false));
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
- M.getOrInsertFunction(
- ClMemoryAccessCallbackPrefix + TypeStr +
- itostr(1ULL << AccessSizeIndex) + EndingStr,
- FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false));
- }
- }
-
- HwasanTagMemoryFunc = M.getOrInsertFunction(
- "__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy);
- HwasanGenerateTagFunc =
- M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty);
-
- ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
- ArrayType::get(IRB.getInt8Ty(), 0));
-
- const std::string MemIntrinCallbackPrefix =
- CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
- HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt32Ty(), IntptrTy);
-
- HWAsanHandleVfork =
- M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy);
-}
-
+ bool InstrumentGlobals =
+ ClGlobals.getNumOccurrences() ? ClGlobals : NewRuntime;
+ if (InstrumentGlobals)
+ instrumentGlobals();
+
+ bool InstrumentPersonalityFunctions =
+ ClInstrumentPersonalityFunctions.getNumOccurrences()
+ ? ClInstrumentPersonalityFunctions
+ : NewRuntime;
+ if (InstrumentPersonalityFunctions)
+ instrumentPersonalityFunctions();
+ }
+
+ if (!TargetTriple.isAndroid()) {
+ Constant *C = M.getOrInsertGlobal("__hwasan_tls", IntptrTy, [&] {
+ auto *GV = new GlobalVariable(M, IntptrTy, /*isConstant=*/false,
+ GlobalValue::ExternalLinkage, nullptr,
+ "__hwasan_tls", nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ appendToCompilerUsed(M, GV);
+ return GV;
+ });
+ ThreadPtrGlobal = cast<GlobalVariable>(C);
+ }
+}
+
+void HWAddressSanitizer::initializeCallbacks(Module &M) {
+ IRBuilder<> IRB(*C);
+ for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
+ const std::string TypeStr = AccessIsWrite ? "store" : "load";
+ const std::string EndingStr = Recover ? "_noabort" : "";
+
+ HwasanMemoryAccessCallbackSized[AccessIsWrite] = M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + TypeStr + "N" + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), {IntptrTy, IntptrTy}, false));
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ HwasanMemoryAccessCallback[AccessIsWrite][AccessSizeIndex] =
+ M.getOrInsertFunction(
+ ClMemoryAccessCallbackPrefix + TypeStr +
+ itostr(1ULL << AccessSizeIndex) + EndingStr,
+ FunctionType::get(IRB.getVoidTy(), {IntptrTy}, false));
+ }
+ }
+
+ HwasanTagMemoryFunc = M.getOrInsertFunction(
+ "__hwasan_tag_memory", IRB.getVoidTy(), Int8PtrTy, Int8Ty, IntptrTy);
+ HwasanGenerateTagFunc =
+ M.getOrInsertFunction("__hwasan_generate_tag", Int8Ty);
+
+ ShadowGlobal = M.getOrInsertGlobal("__hwasan_shadow",
+ ArrayType::get(IRB.getInt8Ty(), 0));
+
+ const std::string MemIntrinCallbackPrefix =
+ CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
+ HWAsanMemmove = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memmove",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ HWAsanMemcpy = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memcpy",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ HWAsanMemset = M.getOrInsertFunction(MemIntrinCallbackPrefix + "memset",
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty(), IntptrTy);
+
+ HWAsanHandleVfork =
+ M.getOrInsertFunction("__hwasan_handle_vfork", IRB.getVoidTy(), IntptrTy);
+}
+
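+// The loop above composes the runtime callback names from the prefix, the
+// access kind, the access size (or "N" for the sized variant) and an
+// optional "_noabort" suffix in recover mode. A standalone sketch with the
+// default "__hwasan_" prefix; sketchCallbackName is illustrative and not
+// part of the pass.
+#include <string>
+
+inline std::string sketchCallbackName(bool IsWrite, unsigned AccessSizeIndex,
+                                      bool Recover) {
+  return std::string("__hwasan_") + (IsWrite ? "store" : "load") +
+         std::to_string(1u << AccessSizeIndex) + (Recover ? "_noabort" : "");
+}
+// sketchCallbackName(true, 2, false) == "__hwasan_store4"
+// sketchCallbackName(false, 4, true) == "__hwasan_load16_noabort"
+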
Value *HWAddressSanitizer::getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val) {
- // An empty inline asm with input reg == output reg.
- // An opaque no-op cast, basically.
+ // An empty inline asm with input reg == output reg.
+ // An opaque no-op cast, basically.
// This prevents code bloat as a result of rematerializing trivial definitions
// such as constants or global addresses at every load and store.
InlineAsm *Asm =
@@ -599,128 +599,128 @@ Value *HWAddressSanitizer::getOpaqueNoopCast(IRBuilder<> &IRB, Value *Val) {
StringRef(""), StringRef("=r,0"),
/*hasSideEffects=*/false);
return IRB.CreateCall(Asm, {Val}, ".hwasan.shadow");
-}
-
+}
+
Value *HWAddressSanitizer::getDynamicShadowIfunc(IRBuilder<> &IRB) {
return getOpaqueNoopCast(IRB, ShadowGlobal);
}
Value *HWAddressSanitizer::getShadowNonTls(IRBuilder<> &IRB) {
- if (Mapping.Offset != kDynamicShadowSentinel)
+ if (Mapping.Offset != kDynamicShadowSentinel)
return getOpaqueNoopCast(
IRB, ConstantExpr::getIntToPtr(
ConstantInt::get(IntptrTy, Mapping.Offset), Int8PtrTy));
-
- if (Mapping.InGlobal) {
- return getDynamicShadowIfunc(IRB);
- } else {
- Value *GlobalDynamicAddress =
- IRB.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal(
- kHwasanShadowMemoryDynamicAddress, Int8PtrTy);
- return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress);
- }
-}
-
-bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
- // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return true;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (Ptr->isSwiftError())
- return true;
-
- return false;
-}
-
-void HWAddressSanitizer::getInterestingMemoryOperands(
- Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
- // Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize"))
- return;
-
- // Do not instrument the load fetching the dynamic shadow address.
+
+ if (Mapping.InGlobal) {
+ return getDynamicShadowIfunc(IRB);
+ } else {
+ Value *GlobalDynamicAddress =
+ IRB.GetInsertBlock()->getParent()->getParent()->getOrInsertGlobal(
+ kHwasanShadowMemoryDynamicAddress, Int8PtrTy);
+ return IRB.CreateLoad(Int8PtrTy, GlobalDynamicAddress);
+ }
+}
+
+bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ return false;
+}
+
+void HWAddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
+ // Skip memory accesses inserted by another instrumentation.
+ if (I->hasMetadata("nosanitize"))
+ return;
+
+ // Do not instrument the load fetching the dynamic shadow address.
if (ShadowBase == I)
- return;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
- LI->getType(), LI->getAlign());
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
- return;
- Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
- SI->getValueOperand()->getType(), SI->getAlign());
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
- return;
- Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
- RMW->getValOperand()->getType(), None);
- } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
- return;
- Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
- XCHG->getCompareOperand()->getType(), None);
- } else if (auto CI = dyn_cast<CallInst>(I)) {
- for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
- if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
- ignoreAccess(CI->getArgOperand(ArgNo)))
- continue;
- Type *Ty = CI->getParamByValType(ArgNo);
- Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
- }
- }
-}
-
-static unsigned getPointerOperandIndex(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->getPointerOperandIndex();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->getPointerOperandIndex();
- if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
- return RMW->getPointerOperandIndex();
- if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
- return XCHG->getPointerOperandIndex();
- report_fatal_error("Unexpected instruction");
- return -1;
-}
-
-static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
- size_t Res = countTrailingZeros(TypeSize / 8);
- assert(Res < kNumberOfAccessSizes);
- return Res;
-}
-
-void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
- if (TargetTriple.isAArch64())
- return;
-
- IRBuilder<> IRB(I);
- Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
- Value *UntaggedPtr =
- IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
- I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
-}
-
-Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
- // Mem >> Scale
- Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
- if (Mapping.Offset == 0)
- return IRB.CreateIntToPtr(Shadow, Int8PtrTy);
- // (Mem >> Scale) + Offset
+ return;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
+ } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
+ }
+}
+
+static unsigned getPointerOperandIndex(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getPointerOperandIndex();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->getPointerOperandIndex();
+ if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I))
+ return RMW->getPointerOperandIndex();
+ if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I))
+ return XCHG->getPointerOperandIndex();
+ report_fatal_error("Unexpected instruction");
+ return -1;
+}
+
+static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
+ size_t Res = countTrailingZeros(TypeSize / 8);
+ assert(Res < kNumberOfAccessSizes);
+ return Res;
+}
+
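+// TypeSizeToSizeIndex() above maps the supported access sizes of 1, 2, 4, 8
+// and 16 bytes onto callback indices 0..4 by counting trailing zero bits. A
+// standalone, recursion-based sketch of the same mapping for power-of-two
+// sizes from 8 to 128 bits (illustrative only):
+#include <cstdint>
+
+constexpr unsigned sketchSizeIndex(uint32_t TypeSizeInBits) {
+  return TypeSizeInBits == 8 ? 0 : 1 + sketchSizeIndex(TypeSizeInBits / 2);
+}
+static_assert(sketchSizeIndex(8) == 0 && sketchSizeIndex(32) == 2 &&
+                  sketchSizeIndex(128) == 4,
+              "i8 uses __hwasan_{load,store}1, i128 uses ...16");
+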
+void HWAddressSanitizer::untagPointerOperand(Instruction *I, Value *Addr) {
+ if (TargetTriple.isAArch64())
+ return;
+
+ IRBuilder<> IRB(I);
+ Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy);
+ Value *UntaggedPtr =
+ IRB.CreateIntToPtr(untagPointer(IRB, AddrLong), Addr->getType());
+ I->setOperand(getPointerOperandIndex(I), UntaggedPtr);
+}
+
+Value *HWAddressSanitizer::memToShadow(Value *Mem, IRBuilder<> &IRB) {
+ // Mem >> Scale
+ Value *Shadow = IRB.CreateLShr(Mem, Mapping.Scale);
+ if (Mapping.Offset == 0)
+ return IRB.CreateIntToPtr(Shadow, Int8PtrTy);
+ // (Mem >> Scale) + Offset
return IRB.CreateGEP(Int8Ty, ShadowBase, Shadow);
-}
-
-void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
- unsigned AccessSizeIndex,
- Instruction *InsertBefore) {
+}
+
+void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
+ unsigned AccessSizeIndex,
+ Instruction *InsertBefore) {
const int64_t AccessInfo =
(CompileKernel << HWASanAccessInfo::CompileKernelShift) +
(HasMatchAllTag << HWASanAccessInfo::HasMatchAllShift) +
@@ -728,809 +728,809 @@ void HWAddressSanitizer::instrumentMemAccessInline(Value *Ptr, bool IsWrite,
(Recover << HWASanAccessInfo::RecoverShift) +
(IsWrite << HWASanAccessInfo::IsWriteShift) +
(AccessSizeIndex << HWASanAccessInfo::AccessSizeShift);
- IRBuilder<> IRB(InsertBefore);
-
+ IRBuilder<> IRB(InsertBefore);
+
if (OutlinedChecks) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
- IRB.CreateCall(Intrinsic::getDeclaration(
- M, UseShortGranules
- ? Intrinsic::hwasan_check_memaccess_shortgranules
- : Intrinsic::hwasan_check_memaccess),
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Ptr = IRB.CreateBitCast(Ptr, Int8PtrTy);
+ IRB.CreateCall(Intrinsic::getDeclaration(
+ M, UseShortGranules
+ ? Intrinsic::hwasan_check_memaccess_shortgranules
+ : Intrinsic::hwasan_check_memaccess),
{ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)});
- return;
- }
-
- Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
- Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
- IRB.getInt8Ty());
- Value *AddrLong = untagPointer(IRB, PtrLong);
- Value *Shadow = memToShadow(AddrLong, IRB);
- Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow);
- Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
-
+ return;
+ }
+
+ Value *PtrLong = IRB.CreatePointerCast(Ptr, IntptrTy);
+ Value *PtrTag = IRB.CreateTrunc(IRB.CreateLShr(PtrLong, kPointerTagShift),
+ IRB.getInt8Ty());
+ Value *AddrLong = untagPointer(IRB, PtrLong);
+ Value *Shadow = memToShadow(AddrLong, IRB);
+ Value *MemTag = IRB.CreateLoad(Int8Ty, Shadow);
+ Value *TagMismatch = IRB.CreateICmpNE(PtrTag, MemTag);
+
if (HasMatchAllTag) {
Value *TagNotIgnored = IRB.CreateICmpNE(
PtrTag, ConstantInt::get(PtrTag->getType(), MatchAllTag));
- TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
- }
-
- Instruction *CheckTerm =
- SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
- MDBuilder(*C).createBranchWeights(1, 100000));
-
- IRB.SetInsertPoint(CheckTerm);
- Value *OutOfShortGranuleTagRange =
- IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
- Instruction *CheckFailTerm =
- SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
- MDBuilder(*C).createBranchWeights(1, 100000));
-
- IRB.SetInsertPoint(CheckTerm);
- Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
- PtrLowBits = IRB.CreateAdd(
- PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
- Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
- SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
- MDBuilder(*C).createBranchWeights(1, 100000),
+ TagMismatch = IRB.CreateAnd(TagMismatch, TagNotIgnored);
+ }
+
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(TagMismatch, InsertBefore, false,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *OutOfShortGranuleTagRange =
+ IRB.CreateICmpUGT(MemTag, ConstantInt::get(Int8Ty, 15));
+ Instruction *CheckFailTerm =
+ SplitBlockAndInsertIfThen(OutOfShortGranuleTagRange, CheckTerm, !Recover,
+ MDBuilder(*C).createBranchWeights(1, 100000));
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *PtrLowBits = IRB.CreateTrunc(IRB.CreateAnd(PtrLong, 15), Int8Ty);
+ PtrLowBits = IRB.CreateAdd(
+ PtrLowBits, ConstantInt::get(Int8Ty, (1 << AccessSizeIndex) - 1));
+ Value *PtrLowBitsOOB = IRB.CreateICmpUGE(PtrLowBits, MemTag);
+ SplitBlockAndInsertIfThen(PtrLowBitsOOB, CheckTerm, false,
+ MDBuilder(*C).createBranchWeights(1, 100000),
(DomTreeUpdater *)nullptr, nullptr,
CheckFailTerm->getParent());
-
- IRB.SetInsertPoint(CheckTerm);
- Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
- InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
- Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
- Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
- SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
- MDBuilder(*C).createBranchWeights(1, 100000),
+
+ IRB.SetInsertPoint(CheckTerm);
+ Value *InlineTagAddr = IRB.CreateOr(AddrLong, 15);
+ InlineTagAddr = IRB.CreateIntToPtr(InlineTagAddr, Int8PtrTy);
+ Value *InlineTag = IRB.CreateLoad(Int8Ty, InlineTagAddr);
+ Value *InlineTagMismatch = IRB.CreateICmpNE(PtrTag, InlineTag);
+ SplitBlockAndInsertIfThen(InlineTagMismatch, CheckTerm, false,
+ MDBuilder(*C).createBranchWeights(1, 100000),
(DomTreeUpdater *)nullptr, nullptr,
CheckFailTerm->getParent());
-
- IRB.SetInsertPoint(CheckFailTerm);
- InlineAsm *Asm;
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- // The signal handler will find the data address in rdi.
- Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+
+ IRB.SetInsertPoint(CheckFailTerm);
+ InlineAsm *Asm;
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ // The signal handler will find the data address in rdi.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
"int3\nnopl " +
itostr(0x40 + (AccessInfo & HWASanAccessInfo::RuntimeMask)) +
"(%rax)",
- "{rdi}",
- /*hasSideEffects=*/true);
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- // The signal handler will find the data address in x0.
- Asm = InlineAsm::get(
- FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
+ "{rdi}",
+ /*hasSideEffects=*/true);
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ // The signal handler will find the data address in x0.
+ Asm = InlineAsm::get(
+ FunctionType::get(IRB.getVoidTy(), {PtrLong->getType()}, false),
"brk #" +
itostr(0x900 + (AccessInfo & HWASanAccessInfo::RuntimeMask)),
- "{x0}",
- /*hasSideEffects=*/true);
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- IRB.CreateCall(Asm, PtrLong);
- if (Recover)
- cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
-}
-
-void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
- IRBuilder<> IRB(MI);
- if (isa<MemTransferInst>(MI)) {
- IRB.CreateCall(
- isa<MemMoveInst>(MI) ? HWAsanMemmove : HWAsanMemcpy,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- } else if (isa<MemSetInst>(MI)) {
- IRB.CreateCall(
- HWAsanMemset,
- {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
- }
- MI->eraseFromParent();
-}
-
-bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
- Value *Addr = O.getPtr();
-
- LLVM_DEBUG(dbgs() << "Instrumenting: " << O.getInsn() << "\n");
-
- if (O.MaybeMask)
- return false; //FIXME
-
- IRBuilder<> IRB(O.getInsn());
- if (isPowerOf2_64(O.TypeSize) &&
- (O.TypeSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) &&
- (!O.Alignment || *O.Alignment >= (1ULL << Mapping.Scale) ||
- *O.Alignment >= O.TypeSize / 8)) {
- size_t AccessSizeIndex = TypeSizeToSizeIndex(O.TypeSize);
- if (ClInstrumentWithCalls) {
- IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
- IRB.CreatePointerCast(Addr, IntptrTy));
- } else {
- instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
- }
- } else {
- IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite],
- {IRB.CreatePointerCast(Addr, IntptrTy),
- ConstantInt::get(IntptrTy, O.TypeSize / 8)});
- }
- untagPointerOperand(O.getInsn(), Addr);
-
- return true;
-}
-
-static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
- uint64_t ArraySize = 1;
- if (AI.isArrayAllocation()) {
- const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
- assert(CI && "non-constant array size");
- ArraySize = CI->getZExtValue();
- }
- Type *Ty = AI.getAllocatedType();
- uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
- return SizeInBytes * ArraySize;
-}
-
-bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
- Value *Tag, size_t Size) {
- size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- if (!UseShortGranules)
- Size = AlignedSize;
-
- Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
- if (ClInstrumentWithCalls) {
- IRB.CreateCall(HwasanTagMemoryFunc,
- {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
- ConstantInt::get(IntptrTy, AlignedSize)});
- } else {
- size_t ShadowSize = Size >> Mapping.Scale;
- Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
- // If this memset is not inlined, it will be intercepted in the hwasan
- // runtime library. That's OK, because the interceptor skips the checks if
- // the address is in the shadow region.
- // FIXME: the interceptor is not as fast as real memset. Consider lowering
- // llvm.memset right here into either a sequence of stores, or a call to
- // hwasan_tag_memory.
- if (ShadowSize)
- IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align(1));
- if (Size != AlignedSize) {
- IRB.CreateStore(
- ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
- IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
- IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
- Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
- AlignedSize - 1));
- }
- }
- return true;
-}
-
-static unsigned RetagMask(unsigned AllocaNo) {
- // A list of 8-bit numbers that have at most one run of non-zero bits.
- // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
- // masks.
- // The list does not include the value 255, which is used for UAR.
- //
- // Because we are more likely to use earlier elements of this list than later
- // ones, it is sorted in increasing order of probability of collision with a
- // mask allocated (temporally) nearby. The program that generated this list
- // can be found at:
- // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py
- static unsigned FastMasks[] = {0, 128, 64, 192, 32, 96, 224, 112, 240,
- 48, 16, 120, 248, 56, 24, 8, 124, 252,
- 60, 28, 12, 4, 126, 254, 62, 30, 14,
- 6, 2, 127, 63, 31, 15, 7, 3, 1};
- return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
-}
-
-Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
- return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
-}
-
-Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- if (StackBaseTag)
- return StackBaseTag;
- // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
- // first).
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- auto GetStackPointerFn = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- Value *StackPointer = IRB.CreateCall(
- GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
-
- // Extract some entropy from the stack pointer for the tags.
- // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
- // between functions).
- Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
- Value *StackTag =
- IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
- "hwasan.stack.base.tag");
- return StackTag;
-}
-
-Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
- AllocaInst *AI, unsigned AllocaNo) {
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- return IRB.CreateXor(StackTag,
- ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
-}
-
-Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
- if (ClUARRetagToZero)
- return ConstantInt::get(IntptrTy, 0);
- if (ClGenerateTagsWithCalls)
- return getNextTagWithCall(IRB);
- return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
-}
-
-// Add a tag to an address.
-Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
- Value *PtrLong, Value *Tag) {
- Value *TaggedPtrLong;
- if (CompileKernel) {
- // Kernel addresses have 0xFF in the most significant byte.
- Value *ShiftedTag = IRB.CreateOr(
- IRB.CreateShl(Tag, kPointerTagShift),
- ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
- TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
- } else {
- // Userspace can simply do OR (tag << 56);
- Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
- TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
- }
- return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
-}
-
-// Remove tag from an address.
-Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
- Value *UntaggedPtrLong;
- if (CompileKernel) {
- // Kernel addresses have 0xFF in the most significant byte.
- UntaggedPtrLong = IRB.CreateOr(PtrLong,
- ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
- } else {
- // Userspace addresses have 0x00.
- UntaggedPtrLong = IRB.CreateAnd(PtrLong,
- ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
- }
- return UntaggedPtrLong;
-}
-
-Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) {
- // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
- // in Bionic's libc/private/bionic_tls.h.
- Function *ThreadPointerFunc =
- Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
- Value *SlotPtr = IRB.CreatePointerCast(
- IRB.CreateConstGEP1_32(IRB.getInt8Ty(),
- IRB.CreateCall(ThreadPointerFunc), 0x30),
- Ty->getPointerTo(0));
- return SlotPtr;
- }
- if (ThreadPtrGlobal)
- return ThreadPtrGlobal;
-
- return nullptr;
-}
-
-void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
- if (!Mapping.InTls) {
+ "{x0}",
+ /*hasSideEffects=*/true);
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ IRB.CreateCall(Asm, PtrLong);
+ if (Recover)
+ cast<BranchInst>(CheckFailTerm)->setSuccessor(0, CheckTerm->getParent());
+}
+
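+// When the shadow (memory) tag is in the range 1..15, the check above treats
+// the granule as a "short granule": only the first MemTag bytes of the
+// 16-byte granule are addressable, and the real pointer tag is stored in the
+// granule's last byte. A standalone sketch of that decision on plain
+// integers (names and sample values are illustrative, not part of the pass):
+#include <cstdint>
+
+constexpr bool sketchShortGranuleAccessOK(uint64_t Ptr, uint64_t AccessSize,
+                                          uint8_t MemTag /* 1..15 */,
+                                          uint8_t PtrTag, uint8_t InlineTag) {
+  // Mirrors the PtrLowBitsOOB and InlineTagMismatch branches above.
+  return (Ptr & 15) + (AccessSize - 1) < MemTag && PtrTag == InlineTag;
+}
+// A 4-byte access at offset 2 of a granule with 8 addressable bytes is fine;
+// the same access at offset 6 runs past the addressable part.
+static_assert(sketchShortGranuleAccessOK(0x1002, 4, 8, 0x2A, 0x2A), "");
+static_assert(!sketchShortGranuleAccessOK(0x1006, 4, 8, 0x2A, 0x2A), "");
+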
+void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
+ IRBuilder<> IRB(MI);
+ if (isa<MemTransferInst>(MI)) {
+ IRB.CreateCall(
+ isa<MemMoveInst>(MI) ? HWAsanMemmove : HWAsanMemcpy,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(MI->getOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ } else if (isa<MemSetInst>(MI)) {
+ IRB.CreateCall(
+ HWAsanMemset,
+ {IRB.CreatePointerCast(MI->getOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(MI->getOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(MI->getOperand(2), IntptrTy, false)});
+ }
+ MI->eraseFromParent();
+}
+
+bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
+ Value *Addr = O.getPtr();
+
+ LLVM_DEBUG(dbgs() << "Instrumenting: " << O.getInsn() << "\n");
+
+ if (O.MaybeMask)
+ return false; //FIXME
+
+ IRBuilder<> IRB(O.getInsn());
+ if (isPowerOf2_64(O.TypeSize) &&
+ (O.TypeSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) &&
+ (!O.Alignment || *O.Alignment >= (1ULL << Mapping.Scale) ||
+ *O.Alignment >= O.TypeSize / 8)) {
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(O.TypeSize);
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
+ IRB.CreatePointerCast(Addr, IntptrTy));
+ } else {
+ instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
+ }
+ } else {
+ IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite],
+ {IRB.CreatePointerCast(Addr, IntptrTy),
+ ConstantInt::get(IntptrTy, O.TypeSize / 8)});
+ }
+ untagPointerOperand(O.getInsn(), Addr);
+
+ return true;
+}
+
+static uint64_t getAllocaSizeInBytes(const AllocaInst &AI) {
+ uint64_t ArraySize = 1;
+ if (AI.isArrayAllocation()) {
+ const ConstantInt *CI = dyn_cast<ConstantInt>(AI.getArraySize());
+ assert(CI && "non-constant array size");
+ ArraySize = CI->getZExtValue();
+ }
+ Type *Ty = AI.getAllocatedType();
+ uint64_t SizeInBytes = AI.getModule()->getDataLayout().getTypeAllocSize(Ty);
+ return SizeInBytes * ArraySize;
+}
+
+bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
+ Value *Tag, size_t Size) {
+ size_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ if (!UseShortGranules)
+ Size = AlignedSize;
+
+ Value *JustTag = IRB.CreateTrunc(Tag, IRB.getInt8Ty());
+ if (ClInstrumentWithCalls) {
+ IRB.CreateCall(HwasanTagMemoryFunc,
+ {IRB.CreatePointerCast(AI, Int8PtrTy), JustTag,
+ ConstantInt::get(IntptrTy, AlignedSize)});
+ } else {
+ size_t ShadowSize = Size >> Mapping.Scale;
+ Value *ShadowPtr = memToShadow(IRB.CreatePointerCast(AI, IntptrTy), IRB);
+ // If this memset is not inlined, it will be intercepted in the hwasan
+ // runtime library. That's OK, because the interceptor skips the checks if
+ // the address is in the shadow region.
+ // FIXME: the interceptor is not as fast as real memset. Consider lowering
+ // llvm.memset right here into either a sequence of stores, or a call to
+ // hwasan_tag_memory.
+ if (ShadowSize)
+ IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align(1));
+ if (Size != AlignedSize) {
+ IRB.CreateStore(
+ ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
+ IRB.CreateConstGEP1_32(Int8Ty, ShadowPtr, ShadowSize));
+ IRB.CreateStore(JustTag, IRB.CreateConstGEP1_32(
+ Int8Ty, IRB.CreateBitCast(AI, Int8PtrTy),
+ AlignedSize - 1));
+ }
+ }
+ return true;
+}
+
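+// For an alloca whose size is not a granule multiple, tagAlloca() above
+// writes the tag into the shadow of every full granule, stores the number of
+// addressable bytes into the shadow byte of the trailing short granule, and
+// puts the real tag into the last byte of that (padded) granule. Sketched
+// for a 20-byte alloca with 16-byte granules; values are illustrative:
+//
+//   Shadow[0] = Tag          // first granule is fully addressable
+//   Shadow[1] = 20 % 16 = 4  // short granule: 4 addressable bytes
+//   byte 31 of the padded allocation = Tag
+#include <cstdint>
+
+constexpr uint8_t sketchShortGranuleShadowByte(uint64_t Size) {
+  return Size % 16; // only written when Size is not a multiple of 16
+}
+static_assert(sketchShortGranuleShadowByte(20) == 4, "");
+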
+static unsigned RetagMask(unsigned AllocaNo) {
+ // A list of 8-bit numbers that have at most one run of non-zero bits.
+ // x = x ^ (mask << 56) can be encoded as a single armv8 instruction for these
+ // masks.
+ // The list does not include the value 255, which is used for UAR.
+ //
+ // Because we are more likely to use earlier elements of this list than later
+ // ones, it is sorted in increasing order of probability of collision with a
+ // mask allocated (temporally) nearby. The program that generated this list
+ // can be found at:
+ // https://github.com/google/sanitizers/blob/master/hwaddress-sanitizer/sort_masks.py
+ static unsigned FastMasks[] = {0, 128, 64, 192, 32, 96, 224, 112, 240,
+ 48, 16, 120, 248, 56, 24, 8, 124, 252,
+ 60, 28, 12, 4, 126, 254, 62, 30, 14,
+ 6, 2, 127, 63, 31, 15, 7, 3, 1};
+ return FastMasks[AllocaNo % (sizeof(FastMasks) / sizeof(FastMasks[0]))];
+}
+
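+// A standalone check (illustrative only) that every FastMasks entry above
+// really has at most one contiguous run of set bits, which is what lets
+// x ^ (mask << 56) be encoded as a single armv8 instruction:
+#include <cstdint>
+
+constexpr bool sketchHasAtMostOneRun(uint8_t M) {
+  // For a single run, M | (M - 1) is a low-bit mask, so adding 1 clears it.
+  return M == 0 ||
+         (((uint32_t)M | (M - 1)) & (((uint32_t)M | (M - 1)) + 1)) == 0;
+}
+static_assert(sketchHasAtMostOneRun(0) && sketchHasAtMostOneRun(192) &&
+                  sketchHasAtMostOneRun(126) && sketchHasAtMostOneRun(1),
+              "");
+static_assert(!sketchHasAtMostOneRun(0x41), "0b01000001 has two runs");
+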
+Value *HWAddressSanitizer::getNextTagWithCall(IRBuilder<> &IRB) {
+ return IRB.CreateZExt(IRB.CreateCall(HwasanGenerateTagFunc), IntptrTy);
+}
+
+Value *HWAddressSanitizer::getStackBaseTag(IRBuilder<> &IRB) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ if (StackBaseTag)
+ return StackBaseTag;
+ // FIXME: use addressofreturnaddress (but implement it in aarch64 backend
+ // first).
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ auto GetStackPointerFn = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ Value *StackPointer = IRB.CreateCall(
+ GetStackPointerFn, {Constant::getNullValue(IRB.getInt32Ty())});
+
+ // Extract some entropy from the stack pointer for the tags.
+ // Take bits 20..28 (ASLR entropy) and xor with bits 0..8 (these differ
+ // between functions).
+ Value *StackPointerLong = IRB.CreatePointerCast(StackPointer, IntptrTy);
+ Value *StackTag =
+ IRB.CreateXor(StackPointerLong, IRB.CreateLShr(StackPointerLong, 20),
+ "hwasan.stack.base.tag");
+ return StackTag;
+}
+
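+// The XOR above mixes ASLR entropy (bits 20..28 of the frame address) into
+// the low bits that later become the 8-bit base tag. A standalone sketch on
+// plain integers; the frame addresses below are made up for illustration:
+#include <cstdint>
+
+constexpr uint64_t sketchStackBaseTag(uint64_t FrameAddr) {
+  return FrameAddr ^ (FrameAddr >> 20);
+}
+// Two frames that differ only in bits 20 and above still get distinct tags
+// once truncated to 8 bits.
+static_assert((sketchStackBaseTag(0x7ff012300040ULL) & 0xFF) !=
+                  (sketchStackBaseTag(0x7ff098700040ULL) & 0xFF),
+              "");
+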
+Value *HWAddressSanitizer::getAllocaTag(IRBuilder<> &IRB, Value *StackTag,
+ AllocaInst *AI, unsigned AllocaNo) {
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag,
+ ConstantInt::get(IntptrTy, RetagMask(AllocaNo)));
+}
+
+Value *HWAddressSanitizer::getUARTag(IRBuilder<> &IRB, Value *StackTag) {
+ if (ClUARRetagToZero)
+ return ConstantInt::get(IntptrTy, 0);
+ if (ClGenerateTagsWithCalls)
+ return getNextTagWithCall(IRB);
+ return IRB.CreateXor(StackTag, ConstantInt::get(IntptrTy, 0xFFU));
+}
+
+// Add a tag to an address.
+Value *HWAddressSanitizer::tagPointer(IRBuilder<> &IRB, Type *Ty,
+ Value *PtrLong, Value *Tag) {
+ Value *TaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ Value *ShiftedTag = IRB.CreateOr(
+ IRB.CreateShl(Tag, kPointerTagShift),
+ ConstantInt::get(IntptrTy, (1ULL << kPointerTagShift) - 1));
+ TaggedPtrLong = IRB.CreateAnd(PtrLong, ShiftedTag);
+ } else {
+ // Userspace can simply do OR (tag << 56);
+ Value *ShiftedTag = IRB.CreateShl(Tag, kPointerTagShift);
+ TaggedPtrLong = IRB.CreateOr(PtrLong, ShiftedTag);
+ }
+ return IRB.CreateIntToPtr(TaggedPtrLong, Ty);
+}
+
+// Remove tag from an address.
+Value *HWAddressSanitizer::untagPointer(IRBuilder<> &IRB, Value *PtrLong) {
+ Value *UntaggedPtrLong;
+ if (CompileKernel) {
+ // Kernel addresses have 0xFF in the most significant byte.
+ UntaggedPtrLong = IRB.CreateOr(PtrLong,
+ ConstantInt::get(PtrLong->getType(), 0xFFULL << kPointerTagShift));
+ } else {
+ // Userspace addresses have 0x00.
+ UntaggedPtrLong = IRB.CreateAnd(PtrLong,
+ ConstantInt::get(PtrLong->getType(), ~(0xFFULL << kPointerTagShift)));
+ }
+ return UntaggedPtrLong;
+}
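// Editorial aside (not part of this patch): tagPointer/untagPointer reduced to
// plain integer arithmetic, with kPointerTagShift == 56 and hypothetical
// addresses, to show that untagging recovers the original pointer in both the
// userspace (top byte 0x00) and kernel (top byte 0xFF) conventions.
#include <cassert>
#include <cstdint>

constexpr unsigned kTagShift = 56;

static uint64_t tagUser(uint64_t Ptr, uint8_t Tag) {
  return Ptr | (uint64_t(Tag) << kTagShift);
}
static uint64_t tagKernel(uint64_t Ptr, uint8_t Tag) {
  return Ptr & ((uint64_t(Tag) << kTagShift) | ((1ULL << kTagShift) - 1));
}
static uint64_t untagUser(uint64_t Ptr) { return Ptr & ~(0xFFULL << kTagShift); }
static uint64_t untagKernel(uint64_t Ptr) { return Ptr | (0xFFULL << kTagShift); }

int main() {
  const uint64_t UserPtr = 0x00007ffd12345df0ULL;
  const uint64_t KernelPtr = 0xffff8000deadbee0ULL;
  assert(untagUser(tagUser(UserPtr, 0x2d)) == UserPtr);
  assert(untagKernel(tagKernel(KernelPtr, 0x2d)) == KernelPtr);
  return 0;
}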
+
+Value *HWAddressSanitizer::getHwasanThreadSlotPtr(IRBuilder<> &IRB, Type *Ty) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ if (TargetTriple.isAArch64() && TargetTriple.isAndroid()) {
+ // Android provides a fixed TLS slot for sanitizers. See TLS_SLOT_SANITIZER
+ // in Bionic's libc/private/bionic_tls.h.
+ Function *ThreadPointerFunc =
+ Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
+ Value *SlotPtr = IRB.CreatePointerCast(
+ IRB.CreateConstGEP1_32(IRB.getInt8Ty(),
+ IRB.CreateCall(ThreadPointerFunc), 0x30),
+ Ty->getPointerTo(0));
+ return SlotPtr;
+ }
+ if (ThreadPtrGlobal)
+ return ThreadPtrGlobal;
+

+ return nullptr;
+}
+
+void HWAddressSanitizer::emitPrologue(IRBuilder<> &IRB, bool WithFrameRecord) {
+ if (!Mapping.InTls) {
ShadowBase = getShadowNonTls(IRB);
- return;
- }
-
- if (!WithFrameRecord && TargetTriple.isAndroid()) {
+ return;
+ }
+
+ if (!WithFrameRecord && TargetTriple.isAndroid()) {
ShadowBase = getDynamicShadowIfunc(IRB);
- return;
- }
-
- Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
- assert(SlotPtr);
-
- Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
- // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI.
- Value *ThreadLongMaybeUntagged =
- TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong);
-
- if (WithFrameRecord) {
- Function *F = IRB.GetInsertBlock()->getParent();
- StackBaseTag = IRB.CreateAShr(ThreadLong, 3);
-
- // Prepare ring buffer data.
- Value *PC;
- if (TargetTriple.getArch() == Triple::aarch64)
- PC = readRegister(IRB, "pc");
- else
- PC = IRB.CreatePtrToInt(F, IntptrTy);
- Module *M = F->getParent();
- auto GetStackPointerFn = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- Value *SP = IRB.CreatePtrToInt(
- IRB.CreateCall(GetStackPointerFn,
- {Constant::getNullValue(IRB.getInt32Ty())}),
- IntptrTy);
- // Mix SP and PC.
- // Assumptions:
- // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero)
- // SP is 0xsssssssssssSSSS0 (4 lower bits are zero)
- // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
- // 0xSSSSPPPPPPPPPPPP
- SP = IRB.CreateShl(SP, 44);
-
- // Store data to ring buffer.
- Value *RecordPtr =
- IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0));
- IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr);
-
- // Update the ring buffer. Top byte of ThreadLong defines the size of the
- // buffer in pages, it must be a power of two, and the start of the buffer
- // must be aligned by twice that much. Therefore wrap around of the ring
- // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
- // The use of AShr instead of LShr is due to
- // https://bugs.llvm.org/show_bug.cgi?id=39030
- // Runtime library makes sure not to use the highest bit.
- Value *WrapMask = IRB.CreateXor(
- IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
- ConstantInt::get(IntptrTy, (uint64_t)-1));
- Value *ThreadLongNew = IRB.CreateAnd(
- IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask);
- IRB.CreateStore(ThreadLongNew, SlotPtr);
- }
-
- // Get shadow base address by aligning RecordPtr up.
- // Note: this is not correct if the pointer is already aligned.
- // Runtime library will make sure this never happens.
+ return;
+ }
+
+ Value *SlotPtr = getHwasanThreadSlotPtr(IRB, IntptrTy);
+ assert(SlotPtr);
+
+ Value *ThreadLong = IRB.CreateLoad(IntptrTy, SlotPtr);
+ // Extract the address field from ThreadLong. Unnecessary on AArch64 with TBI.
+ Value *ThreadLongMaybeUntagged =
+ TargetTriple.isAArch64() ? ThreadLong : untagPointer(IRB, ThreadLong);
+
+ if (WithFrameRecord) {
+ Function *F = IRB.GetInsertBlock()->getParent();
+ StackBaseTag = IRB.CreateAShr(ThreadLong, 3);
+
+ // Prepare ring buffer data.
+ Value *PC;
+ if (TargetTriple.getArch() == Triple::aarch64)
+ PC = readRegister(IRB, "pc");
+ else
+ PC = IRB.CreatePtrToInt(F, IntptrTy);
+ Module *M = F->getParent();
+ auto GetStackPointerFn = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ Value *SP = IRB.CreatePtrToInt(
+ IRB.CreateCall(GetStackPointerFn,
+ {Constant::getNullValue(IRB.getInt32Ty())}),
+ IntptrTy);
+ // Mix SP and PC.
+ // Assumptions:
+ // PC is 0x0000PPPPPPPPPPPP (48 bits are meaningful, others are zero)
+ // SP is 0xsssssssssssSSSS0 (4 lower bits are zero)
+ // We only really need ~20 lower non-zero bits (SSSS), so we mix like this:
+ // 0xSSSSPPPPPPPPPPPP
+ SP = IRB.CreateShl(SP, 44);
+
+ // Store data to ring buffer.
+ Value *RecordPtr =
+ IRB.CreateIntToPtr(ThreadLongMaybeUntagged, IntptrTy->getPointerTo(0));
+ IRB.CreateStore(IRB.CreateOr(PC, SP), RecordPtr);
+
+ // Update the ring buffer. Top byte of ThreadLong defines the size of the
+ // buffer in pages, it must be a power of two, and the start of the buffer
+ // must be aligned by twice that much. Therefore wrap around of the ring
+ // buffer is simply Addr &= ~((ThreadLong >> 56) << 12).
+ // The use of AShr instead of LShr is due to
+ // https://bugs.llvm.org/show_bug.cgi?id=39030
+ // Runtime library makes sure not to use the highest bit.
+ Value *WrapMask = IRB.CreateXor(
+ IRB.CreateShl(IRB.CreateAShr(ThreadLong, 56), 12, "", true, true),
+ ConstantInt::get(IntptrTy, (uint64_t)-1));
+ Value *ThreadLongNew = IRB.CreateAnd(
+ IRB.CreateAdd(ThreadLong, ConstantInt::get(IntptrTy, 8)), WrapMask);
+ IRB.CreateStore(ThreadLongNew, SlotPtr);
+ }
+
+ // Get shadow base address by aligning RecordPtr up.
+ // Note: this is not correct if the pointer is already aligned.
+ // Runtime library will make sure this never happens.
ShadowBase = IRB.CreateAdd(
- IRB.CreateOr(
- ThreadLongMaybeUntagged,
- ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1)),
- ConstantInt::get(IntptrTy, 1), "hwasan.shadow");
+ IRB.CreateOr(
+ ThreadLongMaybeUntagged,
+ ConstantInt::get(IntptrTy, (1ULL << kShadowBaseAlignment) - 1)),
+ ConstantInt::get(IntptrTy, 1), "hwasan.shadow");
ShadowBase = IRB.CreateIntToPtr(ShadowBase, Int8PtrTy);
-}
-
-Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
- Module *M = IRB.GetInsertBlock()->getParent()->getParent();
- Function *ReadRegister =
- Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
- MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
- Value *Args[] = {MetadataAsValue::get(*C, MD)};
- return IRB.CreateCall(ReadRegister, Args);
-}
-
-bool HWAddressSanitizer::instrumentLandingPads(
- SmallVectorImpl<Instruction *> &LandingPadVec) {
- for (auto *LP : LandingPadVec) {
- IRBuilder<> IRB(LP->getNextNode());
- IRB.CreateCall(
- HWAsanHandleVfork,
- {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
- : "sp")});
- }
- return true;
-}
-
-bool HWAddressSanitizer::instrumentStack(
- SmallVectorImpl<AllocaInst *> &Allocas,
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
- SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
- // Ideally, we want to calculate tagged stack base pointer, and rewrite all
- // alloca addresses using that. Unfortunately, offsets are not known yet
- // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
- // temp, shift-OR it into each alloca address and xor with the retag mask.
- // This generates one extra instruction per alloca use.
- for (unsigned N = 0; N < Allocas.size(); ++N) {
- auto *AI = Allocas[N];
- IRBuilder<> IRB(AI->getNextNode());
-
- // Replace uses of the alloca with tagged address.
- Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
- Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
- Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
- std::string Name =
- AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
- Replacement->setName(Name + ".hwasan");
-
- AI->replaceUsesWithIf(Replacement,
- [AILong](Use &U) { return U.getUser() != AILong; });
-
- for (auto *DDI : AllocaDbgMap.lookup(AI)) {
- // Prepend "tag_offset, N" to the dwarf expression.
- // Tag offset logically applies to the alloca pointer, and it makes sense
- // to put it at the beginning of the expression.
- SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
- RetagMask(N)};
- DDI->setArgOperand(
- 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes(
- DDI->getExpression(), NewOps)));
- }
-
- size_t Size = getAllocaSizeInBytes(*AI);
- tagAlloca(IRB, AI, Tag, Size);
-
- for (auto RI : RetVec) {
- IRB.SetInsertPoint(RI);
-
- // Re-tag alloca memory with the special UAR tag.
- Value *Tag = getUARTag(IRB, StackTag);
- tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
- }
- }
-
- return true;
-}
-
-bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
- return (AI.getAllocatedType()->isSized() &&
- // FIXME: instrument dynamic allocas, too
- AI.isStaticAlloca() &&
- // alloca() may be called with 0 size, ignore it.
- getAllocaSizeInBytes(AI) > 0 &&
- // We are only interested in allocas not promotable to registers.
- // Promotable allocas are common under -O0.
- !isAllocaPromotable(&AI) &&
- // inalloca allocas are not treated as static, and we don't want
- // dynamic alloca instrumentation for them as well.
- !AI.isUsedWithInAlloca() &&
- // swifterror allocas are register promoted by ISel
- !AI.isSwiftError());
-}
-
-bool HWAddressSanitizer::sanitizeFunction(Function &F) {
- if (&F == HwasanCtorFunction)
- return false;
-
- if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
- return false;
-
- LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
-
- SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
- SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
- SmallVector<AllocaInst*, 8> AllocasToInstrument;
- SmallVector<Instruction*, 8> RetVec;
- SmallVector<Instruction*, 8> LandingPadVec;
- DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
- for (auto &BB : F) {
- for (auto &Inst : BB) {
- if (ClInstrumentStack)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
- if (isInterestingAlloca(*AI))
- AllocasToInstrument.push_back(AI);
- continue;
- }
-
- if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
- isa<CleanupReturnInst>(Inst))
- RetVec.push_back(&Inst);
-
- if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst))
- if (auto *Alloca =
- dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation()))
- AllocaDbgMap[Alloca].push_back(DDI);
-
- if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
- LandingPadVec.push_back(&Inst);
-
- getInterestingMemoryOperands(&Inst, OperandsToInstrument);
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
- IntrinToInstrument.push_back(MI);
- }
- }
-
- initializeCallbacks(*F.getParent());
-
- bool Changed = false;
-
- if (!LandingPadVec.empty())
- Changed |= instrumentLandingPads(LandingPadVec);
-
- if (AllocasToInstrument.empty() && F.hasPersonalityFn() &&
- F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) {
- // __hwasan_personality_thunk is a no-op for functions without an
- // instrumented stack, so we can drop it.
- F.setPersonalityFn(nullptr);
- Changed = true;
- }
-
- if (AllocasToInstrument.empty() && OperandsToInstrument.empty() &&
- IntrinToInstrument.empty())
- return Changed;
-
+}
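// Editorial aside (not part of this patch): the frame-record arithmetic from
// emitPrologue on plain integers. All addresses here are hypothetical; the
// runtime owns the real ring buffer and its alignment guarantees.
#include <cassert>
#include <cstdint>

int main() {
  // Record layout 0xSSSSPPPPPPPPPPPP: PC keeps its 48 meaningful bits, the
  // interesting low bits of SP land in the top 16 bits (SP's low 4 bits are 0).
  uint64_t PC = 0x0000aaaabbbbccccULL;
  uint64_t SP = 0x00007ffd1234bee0ULL;
  uint64_t Record = PC | (SP << 44);
  assert((Record & 0x0000ffffffffffffULL) == PC);

  // Ring-buffer advance: the top byte of ThreadLong is the buffer size in
  // pages; the buffer start is aligned to twice that size, so wrap-around is a
  // single mask. Here a 4-page buffer starts at ...8000, the slot at ...bff8 is
  // the last one, and the next store wraps back to the start.
  uint64_t ThreadLong = (4ULL << 56) | 0x000070000000bff8ULL;
  uint64_t WrapMask = ~((ThreadLong >> 56) << 12);
  uint64_t Next = (ThreadLong + 8) & WrapMask;
  assert((Next & 0x00ffffffffffffffULL) == 0x0000700000008000ULL);
  return 0;
}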
+
+Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) {
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Function *ReadRegister =
+ Intrinsic::getDeclaration(M, Intrinsic::read_register, IntptrTy);
+ MDNode *MD = MDNode::get(*C, {MDString::get(*C, Name)});
+ Value *Args[] = {MetadataAsValue::get(*C, MD)};
+ return IRB.CreateCall(ReadRegister, Args);
+}
+
+bool HWAddressSanitizer::instrumentLandingPads(
+ SmallVectorImpl<Instruction *> &LandingPadVec) {
+ for (auto *LP : LandingPadVec) {
+ IRBuilder<> IRB(LP->getNextNode());
+ IRB.CreateCall(
+ HWAsanHandleVfork,
+ {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp"
+ : "sp")});
+ }
+ return true;
+}
+
+bool HWAddressSanitizer::instrumentStack(
+ SmallVectorImpl<AllocaInst *> &Allocas,
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> &AllocaDbgMap,
+ SmallVectorImpl<Instruction *> &RetVec, Value *StackTag) {
+ // Ideally, we want to calculate tagged stack base pointer, and rewrite all
+ // alloca addresses using that. Unfortunately, offsets are not known yet
+ // (unless we use ASan-style mega-alloca). Instead we keep the base tag in a
+ // temp, shift-OR it into each alloca address and xor with the retag mask.
+ // This generates one extra instruction per alloca use.
+ for (unsigned N = 0; N < Allocas.size(); ++N) {
+ auto *AI = Allocas[N];
+ IRBuilder<> IRB(AI->getNextNode());
+
+ // Replace uses of the alloca with tagged address.
+ Value *Tag = getAllocaTag(IRB, StackTag, AI, N);
+ Value *AILong = IRB.CreatePointerCast(AI, IntptrTy);
+ Value *Replacement = tagPointer(IRB, AI->getType(), AILong, Tag);
+ std::string Name =
+ AI->hasName() ? AI->getName().str() : "alloca." + itostr(N);
+ Replacement->setName(Name + ".hwasan");
+
+ AI->replaceUsesWithIf(Replacement,
+ [AILong](Use &U) { return U.getUser() != AILong; });
+
+ for (auto *DDI : AllocaDbgMap.lookup(AI)) {
+ // Prepend "tag_offset, N" to the dwarf expression.
+ // Tag offset logically applies to the alloca pointer, and it makes sense
+ // to put it at the beginning of the expression.
+ SmallVector<uint64_t, 8> NewOps = {dwarf::DW_OP_LLVM_tag_offset,
+ RetagMask(N)};
+ DDI->setArgOperand(
+ 2, MetadataAsValue::get(*C, DIExpression::prependOpcodes(
+ DDI->getExpression(), NewOps)));
+ }
+
+ size_t Size = getAllocaSizeInBytes(*AI);
+ tagAlloca(IRB, AI, Tag, Size);
+
+ for (auto RI : RetVec) {
+ IRB.SetInsertPoint(RI);
+
+ // Re-tag alloca memory with the special UAR tag.
+ Value *Tag = getUARTag(IRB, StackTag);
+ tagAlloca(IRB, AI, Tag, alignTo(Size, Mapping.getObjectAlignment()));
+ }
+ }
+
+ return true;
+}
+
+bool HWAddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
+ return (AI.getAllocatedType()->isSized() &&
+ // FIXME: instrument dynamic allocas, too
+ AI.isStaticAlloca() &&
+ // alloca() may be called with 0 size, ignore it.
+ getAllocaSizeInBytes(AI) > 0 &&
+ // We are only interested in allocas not promotable to registers.
+ // Promotable allocas are common under -O0.
+ !isAllocaPromotable(&AI) &&
+ // inalloca allocas are not treated as static, and we don't want
+ // dynamic alloca instrumentation for them as well.
+ !AI.isUsedWithInAlloca() &&
+ // swifterror allocas are register promoted by ISel
+ !AI.isSwiftError());
+}
+
+bool HWAddressSanitizer::sanitizeFunction(Function &F) {
+ if (&F == HwasanCtorFunction)
+ return false;
+
+ if (!F.hasFnAttribute(Attribute::SanitizeHWAddress))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
+
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
+ SmallVector<AllocaInst*, 8> AllocasToInstrument;
+ SmallVector<Instruction*, 8> RetVec;
+ SmallVector<Instruction*, 8> LandingPadVec;
+ DenseMap<AllocaInst *, std::vector<DbgVariableIntrinsic *>> AllocaDbgMap;
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (ClInstrumentStack)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&Inst)) {
+ if (isInterestingAlloca(*AI))
+ AllocasToInstrument.push_back(AI);
+ continue;
+ }
+
+ if (isa<ReturnInst>(Inst) || isa<ResumeInst>(Inst) ||
+ isa<CleanupReturnInst>(Inst))
+ RetVec.push_back(&Inst);
+
+ if (auto *DDI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+ if (auto *Alloca =
+ dyn_cast_or_null<AllocaInst>(DDI->getVariableLocation()))
+ AllocaDbgMap[Alloca].push_back(DDI);
+
+ if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
+ LandingPadVec.push_back(&Inst);
+
+ getInterestingMemoryOperands(&Inst, OperandsToInstrument);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
+ IntrinToInstrument.push_back(MI);
+ }
+ }
+
+ initializeCallbacks(*F.getParent());
+
+ bool Changed = false;
+
+ if (!LandingPadVec.empty())
+ Changed |= instrumentLandingPads(LandingPadVec);
+
+ if (AllocasToInstrument.empty() && F.hasPersonalityFn() &&
+ F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) {
+ // __hwasan_personality_thunk is a no-op for functions without an
+ // instrumented stack, so we can drop it.
+ F.setPersonalityFn(nullptr);
+ Changed = true;
+ }
+
+ if (AllocasToInstrument.empty() && OperandsToInstrument.empty() &&
+ IntrinToInstrument.empty())
+ return Changed;
+
assert(!ShadowBase);
-
- Instruction *InsertPt = &*F.getEntryBlock().begin();
- IRBuilder<> EntryIRB(InsertPt);
- emitPrologue(EntryIRB,
- /*WithFrameRecord*/ ClRecordStackHistory &&
- !AllocasToInstrument.empty());
-
- if (!AllocasToInstrument.empty()) {
- Value *StackTag =
- ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
- instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
- }
- // Pad and align each of the allocas that we instrumented to stop small
- // uninteresting allocas from hiding in instrumented alloca's padding and so
- // that we have enough space to store real tags for short granules.
- DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
- for (AllocaInst *AI : AllocasToInstrument) {
- uint64_t Size = getAllocaSizeInBytes(*AI);
- uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
- AI->setAlignment(
- Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
- if (Size != AlignedSize) {
- Type *AllocatedType = AI->getAllocatedType();
- if (AI->isArrayAllocation()) {
- uint64_t ArraySize =
- cast<ConstantInt>(AI->getArraySize())->getZExtValue();
- AllocatedType = ArrayType::get(AllocatedType, ArraySize);
- }
- Type *TypeWithPadding = StructType::get(
- AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
- auto *NewAI = new AllocaInst(
- TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
- NewAI->takeName(AI);
- NewAI->setAlignment(AI->getAlign());
- NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
- NewAI->setSwiftError(AI->isSwiftError());
- NewAI->copyMetadata(*AI);
- auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
- AI->replaceAllUsesWith(Bitcast);
- AllocaToPaddedAllocaMap[AI] = NewAI;
- }
- }
-
- if (!AllocaToPaddedAllocaMap.empty()) {
- for (auto &BB : F)
- for (auto &Inst : BB)
- if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
- if (auto *AI =
- dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
- if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
- DVI->setArgOperand(
- 0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
- for (auto &P : AllocaToPaddedAllocaMap)
- P.first->eraseFromParent();
- }
-
- // If we split the entry block, move any allocas that were originally in the
- // entry block back into the entry block so that they aren't treated as
- // dynamic allocas.
- if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
- InsertPt = &*F.getEntryBlock().begin();
- for (auto II = EntryIRB.GetInsertBlock()->begin(),
- IE = EntryIRB.GetInsertBlock()->end();
- II != IE;) {
- Instruction *I = &*II++;
- if (auto *AI = dyn_cast<AllocaInst>(I))
- if (isa<ConstantInt>(AI->getArraySize()))
- I->moveBefore(InsertPt);
- }
- }
-
- for (auto &Operand : OperandsToInstrument)
- instrumentMemAccess(Operand);
-
- if (ClInstrumentMemIntrinsics && !IntrinToInstrument.empty()) {
- for (auto Inst : IntrinToInstrument)
- instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
- }
-
+
+ Instruction *InsertPt = &*F.getEntryBlock().begin();
+ IRBuilder<> EntryIRB(InsertPt);
+ emitPrologue(EntryIRB,
+ /*WithFrameRecord*/ ClRecordStackHistory &&
+ !AllocasToInstrument.empty());
+
+ if (!AllocasToInstrument.empty()) {
+ Value *StackTag =
+ ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
+ instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
+ }
+ // Pad and align each of the allocas that we instrumented to stop small
+ // uninteresting allocas from hiding in instrumented alloca's padding and so
+ // that we have enough space to store real tags for short granules.
+ DenseMap<AllocaInst *, AllocaInst *> AllocaToPaddedAllocaMap;
+ for (AllocaInst *AI : AllocasToInstrument) {
+ uint64_t Size = getAllocaSizeInBytes(*AI);
+ uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
+ AI->setAlignment(
+ Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
+ if (Size != AlignedSize) {
+ Type *AllocatedType = AI->getAllocatedType();
+ if (AI->isArrayAllocation()) {
+ uint64_t ArraySize =
+ cast<ConstantInt>(AI->getArraySize())->getZExtValue();
+ AllocatedType = ArrayType::get(AllocatedType, ArraySize);
+ }
+ Type *TypeWithPadding = StructType::get(
+ AllocatedType, ArrayType::get(Int8Ty, AlignedSize - Size));
+ auto *NewAI = new AllocaInst(
+ TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
+ NewAI->takeName(AI);
+ NewAI->setAlignment(AI->getAlign());
+ NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
+ NewAI->setSwiftError(AI->isSwiftError());
+ NewAI->copyMetadata(*AI);
+ auto *Bitcast = new BitCastInst(NewAI, AI->getType(), "", AI);
+ AI->replaceAllUsesWith(Bitcast);
+ AllocaToPaddedAllocaMap[AI] = NewAI;
+ }
+ }
+
+ if (!AllocaToPaddedAllocaMap.empty()) {
+ for (auto &BB : F)
+ for (auto &Inst : BB)
+ if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&Inst))
+ if (auto *AI =
+ dyn_cast_or_null<AllocaInst>(DVI->getVariableLocation()))
+ if (auto *NewAI = AllocaToPaddedAllocaMap.lookup(AI))
+ DVI->setArgOperand(
+ 0, MetadataAsValue::get(*C, LocalAsMetadata::get(NewAI)));
+ for (auto &P : AllocaToPaddedAllocaMap)
+ P.first->eraseFromParent();
+ }
+
+ // If we split the entry block, move any allocas that were originally in the
+ // entry block back into the entry block so that they aren't treated as
+ // dynamic allocas.
+ if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) {
+ InsertPt = &*F.getEntryBlock().begin();
+ for (auto II = EntryIRB.GetInsertBlock()->begin(),
+ IE = EntryIRB.GetInsertBlock()->end();
+ II != IE;) {
+ Instruction *I = &*II++;
+ if (auto *AI = dyn_cast<AllocaInst>(I))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ I->moveBefore(InsertPt);
+ }
+ }
+
+ for (auto &Operand : OperandsToInstrument)
+ instrumentMemAccess(Operand);
+
+ if (ClInstrumentMemIntrinsics && !IntrinToInstrument.empty()) {
+ for (auto Inst : IntrinToInstrument)
+ instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
+
ShadowBase = nullptr;
- StackBaseTag = nullptr;
-
- return true;
-}
-
-void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
- Constant *Initializer = GV->getInitializer();
- uint64_t SizeInBytes =
- M.getDataLayout().getTypeAllocSize(Initializer->getType());
- uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment());
- if (SizeInBytes != NewSize) {
- // Pad the initializer out to the next multiple of 16 bytes and add the
- // required short granule tag.
- std::vector<uint8_t> Init(NewSize - SizeInBytes, 0);
- Init.back() = Tag;
- Constant *Padding = ConstantDataArray::get(*C, Init);
- Initializer = ConstantStruct::getAnon({Initializer, Padding});
- }
-
- auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(),
- GlobalValue::ExternalLinkage, Initializer,
- GV->getName() + ".hwasan");
- NewGV->copyAttributesFrom(GV);
- NewGV->setLinkage(GlobalValue::PrivateLinkage);
- NewGV->copyMetadata(GV, 0);
- NewGV->setAlignment(
- MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment())));
-
- // It is invalid to ICF two globals that have different tags. In the case
- // where the size of the global is a multiple of the tag granularity the
- // contents of the globals may be the same but the tags (i.e. symbol values)
- // may be different, and the symbols are not considered during ICF. In the
- // case where the size is not a multiple of the granularity, the short granule
- // tags would discriminate two globals with different tags, but there would
- // otherwise be nothing stopping such a global from being incorrectly ICF'd
- // with an uninstrumented (i.e. tag 0) global that happened to have the short
- // granule tag in the last byte.
- NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
-
- // Descriptor format (assuming little-endian):
- // bytes 0-3: relative address of global
- // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case
- // it isn't, we create multiple descriptors)
- // byte 7: tag
- auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty);
- const uint64_t MaxDescriptorSize = 0xfffff0;
- for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes;
- DescriptorPos += MaxDescriptorSize) {
- auto *Descriptor =
- new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage,
- nullptr, GV->getName() + ".hwasan.descriptor");
- auto *GVRelPtr = ConstantExpr::getTrunc(
- ConstantExpr::getAdd(
- ConstantExpr::getSub(
- ConstantExpr::getPtrToInt(NewGV, Int64Ty),
- ConstantExpr::getPtrToInt(Descriptor, Int64Ty)),
- ConstantInt::get(Int64Ty, DescriptorPos)),
- Int32Ty);
- uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize);
- auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24));
- Descriptor->setComdat(NewGV->getComdat());
- Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag}));
- Descriptor->setSection("hwasan_globals");
- Descriptor->setMetadata(LLVMContext::MD_associated,
- MDNode::get(*C, ValueAsMetadata::get(NewGV)));
- appendToCompilerUsed(M, Descriptor);
- }
-
- Constant *Aliasee = ConstantExpr::getIntToPtr(
- ConstantExpr::getAdd(
- ConstantExpr::getPtrToInt(NewGV, Int64Ty),
- ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)),
- GV->getType());
- auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(),
- GV->getLinkage(), "", Aliasee, &M);
- Alias->setVisibility(GV->getVisibility());
- Alias->takeName(GV);
- GV->replaceAllUsesWith(Alias);
- GV->eraseFromParent();
-}
-
-void HWAddressSanitizer::instrumentGlobals() {
- std::vector<GlobalVariable *> Globals;
- for (GlobalVariable &GV : M.globals()) {
- if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") ||
- GV.isThreadLocal())
- continue;
-
- // Common symbols can't have aliases point to them, so they can't be tagged.
- if (GV.hasCommonLinkage())
- continue;
-
- // Globals with custom sections may be used in __start_/__stop_ enumeration,
- // which would be broken both by adding tags and potentially by the extra
- // padding/alignment that we insert.
- if (GV.hasSection())
- continue;
-
- Globals.push_back(&GV);
- }
-
- MD5 Hasher;
- Hasher.update(M.getSourceFileName());
- MD5::MD5Result Hash;
- Hasher.final(Hash);
- uint8_t Tag = Hash[0];
-
- for (GlobalVariable *GV : Globals) {
- // Skip tag 0 in order to avoid collisions with untagged memory.
- if (Tag == 0)
- Tag = 1;
- instrumentGlobal(GV, Tag++);
- }
-}
-
-void HWAddressSanitizer::instrumentPersonalityFunctions() {
- // We need to untag stack frames as we unwind past them. That is the job of
- // the personality function wrapper, which either wraps an existing
- // personality function or acts as a personality function on its own. Each
- // function that has a personality function or that can be unwound past has
- // its personality function changed to a thunk that calls the personality
- // function wrapper in the runtime.
- MapVector<Constant *, std::vector<Function *>> PersonalityFns;
- for (Function &F : M) {
- if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress))
- continue;
-
- if (F.hasPersonalityFn()) {
- PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F);
- } else if (!F.hasFnAttribute(Attribute::NoUnwind)) {
- PersonalityFns[nullptr].push_back(&F);
- }
- }
-
- if (PersonalityFns.empty())
- return;
-
- FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction(
- "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty,
- Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy);
- FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy);
- FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy);
-
- for (auto &P : PersonalityFns) {
- std::string ThunkName = kHwasanPersonalityThunkName;
- if (P.first)
- ThunkName += ("." + P.first->getName()).str();
- FunctionType *ThunkFnTy = FunctionType::get(
- Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false);
- bool IsLocal = P.first && (!isa<GlobalValue>(P.first) ||
- cast<GlobalValue>(P.first)->hasLocalLinkage());
- auto *ThunkFn = Function::Create(ThunkFnTy,
- IsLocal ? GlobalValue::InternalLinkage
- : GlobalValue::LinkOnceODRLinkage,
- ThunkName, &M);
- if (!IsLocal) {
- ThunkFn->setVisibility(GlobalValue::HiddenVisibility);
- ThunkFn->setComdat(M.getOrInsertComdat(ThunkName));
- }
-
- auto *BB = BasicBlock::Create(*C, "entry", ThunkFn);
- IRBuilder<> IRB(BB);
- CallInst *WrapperCall = IRB.CreateCall(
- HwasanPersonalityWrapper,
- {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2),
- ThunkFn->getArg(3), ThunkFn->getArg(4),
- P.first ? IRB.CreateBitCast(P.first, Int8PtrTy)
- : Constant::getNullValue(Int8PtrTy),
- IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy),
- IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)});
- WrapperCall->setTailCall();
- IRB.CreateRet(WrapperCall);
-
- for (Function *F : P.second)
- F->setPersonalityFn(ThunkFn);
- }
-}
-
-void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
- Scale = kDefaultShadowScale;
- if (ClMappingOffset.getNumOccurrences() > 0) {
- InGlobal = false;
- InTls = false;
- Offset = ClMappingOffset;
- } else if (ClEnableKhwasan || ClInstrumentWithCalls) {
- InGlobal = false;
- InTls = false;
- Offset = 0;
- } else if (ClWithIfunc) {
- InGlobal = true;
- InTls = false;
- Offset = kDynamicShadowSentinel;
- } else if (ClWithTls) {
- InGlobal = false;
- InTls = true;
- Offset = kDynamicShadowSentinel;
- } else {
- InGlobal = false;
- InTls = false;
- Offset = kDynamicShadowSentinel;
- }
-}
+ StackBaseTag = nullptr;
+
+ return true;
+}
+
+void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
+ Constant *Initializer = GV->getInitializer();
+ uint64_t SizeInBytes =
+ M.getDataLayout().getTypeAllocSize(Initializer->getType());
+ uint64_t NewSize = alignTo(SizeInBytes, Mapping.getObjectAlignment());
+ if (SizeInBytes != NewSize) {
+ // Pad the initializer out to the next multiple of 16 bytes and add the
+ // required short granule tag.
+ std::vector<uint8_t> Init(NewSize - SizeInBytes, 0);
+ Init.back() = Tag;
+ Constant *Padding = ConstantDataArray::get(*C, Init);
+ Initializer = ConstantStruct::getAnon({Initializer, Padding});
+ }
+
+ auto *NewGV = new GlobalVariable(M, Initializer->getType(), GV->isConstant(),
+ GlobalValue::ExternalLinkage, Initializer,
+ GV->getName() + ".hwasan");
+ NewGV->copyAttributesFrom(GV);
+ NewGV->setLinkage(GlobalValue::PrivateLinkage);
+ NewGV->copyMetadata(GV, 0);
+ NewGV->setAlignment(
+ MaybeAlign(std::max(GV->getAlignment(), Mapping.getObjectAlignment())));
+
+ // It is invalid to ICF two globals that have different tags. In the case
+ // where the size of the global is a multiple of the tag granularity the
+ // contents of the globals may be the same but the tags (i.e. symbol values)
+ // may be different, and the symbols are not considered during ICF. In the
+ // case where the size is not a multiple of the granularity, the short granule
+ // tags would discriminate two globals with different tags, but there would
+ // otherwise be nothing stopping such a global from being incorrectly ICF'd
+ // with an uninstrumented (i.e. tag 0) global that happened to have the short
+ // granule tag in the last byte.
+ NewGV->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
+
+ // Descriptor format (assuming little-endian):
+ // bytes 0-3: relative address of global
+ // bytes 4-6: size of global (16MB ought to be enough for anyone, but in case
+ // it isn't, we create multiple descriptors)
+ // byte 7: tag
+ auto *DescriptorTy = StructType::get(Int32Ty, Int32Ty);
+ const uint64_t MaxDescriptorSize = 0xfffff0;
+ for (uint64_t DescriptorPos = 0; DescriptorPos < SizeInBytes;
+ DescriptorPos += MaxDescriptorSize) {
+ auto *Descriptor =
+ new GlobalVariable(M, DescriptorTy, true, GlobalValue::PrivateLinkage,
+ nullptr, GV->getName() + ".hwasan.descriptor");
+ auto *GVRelPtr = ConstantExpr::getTrunc(
+ ConstantExpr::getAdd(
+ ConstantExpr::getSub(
+ ConstantExpr::getPtrToInt(NewGV, Int64Ty),
+ ConstantExpr::getPtrToInt(Descriptor, Int64Ty)),
+ ConstantInt::get(Int64Ty, DescriptorPos)),
+ Int32Ty);
+ uint32_t Size = std::min(SizeInBytes - DescriptorPos, MaxDescriptorSize);
+ auto *SizeAndTag = ConstantInt::get(Int32Ty, Size | (uint32_t(Tag) << 24));
+ Descriptor->setComdat(NewGV->getComdat());
+ Descriptor->setInitializer(ConstantStruct::getAnon({GVRelPtr, SizeAndTag}));
+ Descriptor->setSection("hwasan_globals");
+ Descriptor->setMetadata(LLVMContext::MD_associated,
+ MDNode::get(*C, ValueAsMetadata::get(NewGV)));
+ appendToCompilerUsed(M, Descriptor);
+ }
+
+ Constant *Aliasee = ConstantExpr::getIntToPtr(
+ ConstantExpr::getAdd(
+ ConstantExpr::getPtrToInt(NewGV, Int64Ty),
+ ConstantInt::get(Int64Ty, uint64_t(Tag) << kPointerTagShift)),
+ GV->getType());
+ auto *Alias = GlobalAlias::create(GV->getValueType(), GV->getAddressSpace(),
+ GV->getLinkage(), "", Aliasee, &M);
+ Alias->setVisibility(GV->getVisibility());
+ Alias->takeName(GV);
+ GV->replaceAllUsesWith(Alias);
+ GV->eraseFromParent();
+}
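// Editorial aside (not part of this patch): how the second 32-bit word of a
// hwasan_globals descriptor packs the size and tag described above. The size
// value is hypothetical.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Size = 0x000123; // size of the global, at most 0xfffff0 per descriptor
  uint8_t Tag = 0x2d;
  uint32_t SizeAndTag = Size | (uint32_t(Tag) << 24);
  assert((SizeAndTag >> 24) == Tag);         // byte 7: tag
  assert((SizeAndTag & 0x00ffffff) == Size); // bytes 4-6: size
  return 0;
}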
+
+void HWAddressSanitizer::instrumentGlobals() {
+ std::vector<GlobalVariable *> Globals;
+ for (GlobalVariable &GV : M.globals()) {
+ if (GV.isDeclarationForLinker() || GV.getName().startswith("llvm.") ||
+ GV.isThreadLocal())
+ continue;
+
+ // Common symbols can't have aliases point to them, so they can't be tagged.
+ if (GV.hasCommonLinkage())
+ continue;
+
+ // Globals with custom sections may be used in __start_/__stop_ enumeration,
+ // which would be broken both by adding tags and potentially by the extra
+ // padding/alignment that we insert.
+ if (GV.hasSection())
+ continue;
+
+ Globals.push_back(&GV);
+ }
+
+ MD5 Hasher;
+ Hasher.update(M.getSourceFileName());
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ uint8_t Tag = Hash[0];
+
+ for (GlobalVariable *GV : Globals) {
+ // Skip tag 0 in order to avoid collisions with untagged memory.
+ if (Tag == 0)
+ Tag = 1;
+ instrumentGlobal(GV, Tag++);
+ }
+}
+
+void HWAddressSanitizer::instrumentPersonalityFunctions() {
+ // We need to untag stack frames as we unwind past them. That is the job of
+ // the personality function wrapper, which either wraps an existing
+ // personality function or acts as a personality function on its own. Each
+ // function that has a personality function or that can be unwound past has
+ // its personality function changed to a thunk that calls the personality
+ // function wrapper in the runtime.
+ MapVector<Constant *, std::vector<Function *>> PersonalityFns;
+ for (Function &F : M) {
+ if (F.isDeclaration() || !F.hasFnAttribute(Attribute::SanitizeHWAddress))
+ continue;
+
+ if (F.hasPersonalityFn()) {
+ PersonalityFns[F.getPersonalityFn()->stripPointerCasts()].push_back(&F);
+ } else if (!F.hasFnAttribute(Attribute::NoUnwind)) {
+ PersonalityFns[nullptr].push_back(&F);
+ }
+ }
+
+ if (PersonalityFns.empty())
+ return;
+
+ FunctionCallee HwasanPersonalityWrapper = M.getOrInsertFunction(
+ "__hwasan_personality_wrapper", Int32Ty, Int32Ty, Int32Ty, Int64Ty,
+ Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy, Int8PtrTy);
+ FunctionCallee UnwindGetGR = M.getOrInsertFunction("_Unwind_GetGR", VoidTy);
+ FunctionCallee UnwindGetCFA = M.getOrInsertFunction("_Unwind_GetCFA", VoidTy);
+
+ for (auto &P : PersonalityFns) {
+ std::string ThunkName = kHwasanPersonalityThunkName;
+ if (P.first)
+ ThunkName += ("." + P.first->getName()).str();
+ FunctionType *ThunkFnTy = FunctionType::get(
+ Int32Ty, {Int32Ty, Int32Ty, Int64Ty, Int8PtrTy, Int8PtrTy}, false);
+ bool IsLocal = P.first && (!isa<GlobalValue>(P.first) ||
+ cast<GlobalValue>(P.first)->hasLocalLinkage());
+ auto *ThunkFn = Function::Create(ThunkFnTy,
+ IsLocal ? GlobalValue::InternalLinkage
+ : GlobalValue::LinkOnceODRLinkage,
+ ThunkName, &M);
+ if (!IsLocal) {
+ ThunkFn->setVisibility(GlobalValue::HiddenVisibility);
+ ThunkFn->setComdat(M.getOrInsertComdat(ThunkName));
+ }
+
+ auto *BB = BasicBlock::Create(*C, "entry", ThunkFn);
+ IRBuilder<> IRB(BB);
+ CallInst *WrapperCall = IRB.CreateCall(
+ HwasanPersonalityWrapper,
+ {ThunkFn->getArg(0), ThunkFn->getArg(1), ThunkFn->getArg(2),
+ ThunkFn->getArg(3), ThunkFn->getArg(4),
+ P.first ? IRB.CreateBitCast(P.first, Int8PtrTy)
+ : Constant::getNullValue(Int8PtrTy),
+ IRB.CreateBitCast(UnwindGetGR.getCallee(), Int8PtrTy),
+ IRB.CreateBitCast(UnwindGetCFA.getCallee(), Int8PtrTy)});
+ WrapperCall->setTailCall();
+ IRB.CreateRet(WrapperCall);
+
+ for (Function *F : P.second)
+ F->setPersonalityFn(ThunkFn);
+ }
+}
+
+void HWAddressSanitizer::ShadowMapping::init(Triple &TargetTriple) {
+ Scale = kDefaultShadowScale;
+ if (ClMappingOffset.getNumOccurrences() > 0) {
+ InGlobal = false;
+ InTls = false;
+ Offset = ClMappingOffset;
+ } else if (ClEnableKhwasan || ClInstrumentWithCalls) {
+ InGlobal = false;
+ InTls = false;
+ Offset = 0;
+ } else if (ClWithIfunc) {
+ InGlobal = true;
+ InTls = false;
+ Offset = kDynamicShadowSentinel;
+ } else if (ClWithTls) {
+ InGlobal = false;
+ InTls = true;
+ Offset = kDynamicShadowSentinel;
+ } else {
+ InGlobal = false;
+ InTls = false;
+ Offset = kDynamicShadowSentinel;
+ }
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 6baf7e7dae..5b9557a9b3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -1,268 +1,268 @@
-//===- IndirectCallPromotion.cpp - Optimizations based on value profiling -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the transformation that promotes indirect calls to
-// conditional direct calls when the indirect-call value profile metadata is
-// available.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
-#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pgo-icall-prom"
-
-STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
-STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
-
-// Command line option to disable indirect-call promotion with the default as
-// false. This is for debug purposes.
-static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
- cl::desc("Disable indirect call promotion"));
-
-// Set the cutoff value for the promotion. If the value is other than 0, we
-// stop the transformation once the total number of promotions equals the cutoff
-// value.
-// For debug use only.
-static cl::opt<unsigned>
- ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of promotions for this compilation"));
-
-// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
-// For debug use only.
-static cl::opt<unsigned>
- ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Skip Callsite up to this number for this compilation"));
-
-// Set if the pass is called in LTO optimization. The difference for LTO mode
-// is the pass won't prefix the source module name to the internal linkage
-// symbols.
-static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion in LTO "
- "mode"));
-
-// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
-// mode is it will add prof metadata to the created direct call.
-static cl::opt<bool>
- ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion in SamplePGO mode"));
-
-// If the option is set to true, only call instructions will be considered for
-// transformation -- invoke instructions will be ignored.
-static cl::opt<bool>
- ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
- cl::desc("Run indirect-call promotion for call instructions "
- "only"));
-
-// If the option is set to true, only invoke instructions will be considered for
-// transformation -- call instructions will be ignored.
-static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
- cl::Hidden,
- cl::desc("Run indirect-call promotion for "
- "invoke instruction only"));
-
-// Dump the function level IR if the transformation happened in this
-// function. For debug use only.
-static cl::opt<bool>
- ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
- cl::desc("Dump IR after transformation happens"));
-
-namespace {
-
-class PGOIndirectCallPromotionLegacyPass : public ModulePass {
-public:
- static char ID;
-
- PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
- : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
- initializePGOIndirectCallPromotionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-
- StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
-
-private:
- bool runOnModule(Module &M) override;
-
-  // If this pass is called in LTO, we need special handling of the PGOFuncName
- // for the static variables due to LTO's internalization.
- bool InLTO;
-
-  // If this pass is called in SamplePGO, we need to add the prof metadata to
- // the promoted direct call.
- bool SamplePGO;
-};
-
-} // end anonymous namespace
-
-char PGOIndirectCallPromotionLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
- "Use PGO instrumentation profile to promote indirect "
- "calls to direct calls.",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
- "Use PGO instrumentation profile to promote indirect "
- "calls to direct calls.",
- false, false)
-
-ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
- bool SamplePGO) {
- return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
-}
-
-namespace {
-
-// The class for main data structure to promote indirect calls to conditional
-// direct calls.
-class ICallPromotionFunc {
-private:
- Function &F;
- Module *M;
-
- // Symtab that maps indirect call profile values to function names and
- // defines.
- InstrProfSymtab *Symtab;
-
- bool SamplePGO;
-
- OptimizationRemarkEmitter &ORE;
-
-  // A struct that records the direct target and its call count.
- struct PromotionCandidate {
- Function *TargetFunction;
- uint64_t Count;
-
- PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
- };
-
- // Check if the indirect-call call site should be promoted. Return the number
- // of promotions. Inst is the candidate indirect call, ValueDataRef
- // contains the array of value profile data for profiled targets,
- // TotalCount is the total profiled count of call executions, and
- // NumCandidates is the number of candidate entries in ValueDataRef.
- std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
- const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
- uint64_t TotalCount, uint32_t NumCandidates);
-
- // Promote a list of targets for one indirect-call callsite. Return
- // the number of promotions.
- uint32_t tryToPromote(CallBase &CB,
- const std::vector<PromotionCandidate> &Candidates,
- uint64_t &TotalCount);
-
-public:
- ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
- bool SamplePGO, OptimizationRemarkEmitter &ORE)
- : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {}
- ICallPromotionFunc(const ICallPromotionFunc &) = delete;
- ICallPromotionFunc &operator=(const ICallPromotionFunc &) = delete;
-
- bool processFunction(ProfileSummaryInfo *PSI);
-};
-
-} // end anonymous namespace
-
-// Indirect-call promotion heuristic. The direct targets are sorted based on
-// the count. Stop at the first target that is not promoted.
-std::vector<ICallPromotionFunc::PromotionCandidate>
-ICallPromotionFunc::getPromotionCandidatesForCallSite(
- const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
- uint64_t TotalCount, uint32_t NumCandidates) {
- std::vector<PromotionCandidate> Ret;
-
- LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
- << " Num_targets: " << ValueDataRef.size()
- << " Num_candidates: " << NumCandidates << "\n");
- NumOfPGOICallsites++;
- if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
- LLVM_DEBUG(dbgs() << " Skip: User options.\n");
- return Ret;
- }
-
- for (uint32_t I = 0; I < NumCandidates; I++) {
- uint64_t Count = ValueDataRef[I].Count;
- assert(Count <= TotalCount);
- uint64_t Target = ValueDataRef[I].Value;
- LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
- << " Target_func: " << Target << "\n");
-
- if (ICPInvokeOnly && isa<CallInst>(CB)) {
- LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
- << " Not promote: User options";
- });
- break;
- }
- if (ICPCallOnly && isa<InvokeInst>(CB)) {
- LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
- << " Not promote: User options";
- });
- break;
- }
- if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", &CB)
- << " Not promote: Cutoff reached";
- });
- break;
- }
-
+//===- IndirectCallPromotion.cpp - Optimizations based on value profiling -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that promotes indirect calls to
+// conditional direct calls when the indirect-call value profile metadata is
+// available.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-icall-prom"
+
+STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
+STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+
+// Command line option to disable indirect-call promotion with the default as
+// false. This is for debug purposes.
+static cl::opt<bool> DisableICP("disable-icp", cl::init(false), cl::Hidden,
+ cl::desc("Disable indirect call promotion"));
+
+// Set the cutoff value for the promotion. If the value is other than 0, we
+// stop the transformation once the total number of promotions equals the cutoff
+// value.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCutOff("icp-cutoff", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of promotions for this compilation"));
+
+// If ICPCSSkip is non zero, the first ICPCSSkip callsites will be skipped.
+// For debug use only.
+static cl::opt<unsigned>
+ ICPCSSkip("icp-csskip", cl::init(0), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Skip Callsite up to this number for this compilation"));
+
+// Set if the pass is called in LTO optimization. The difference for LTO mode
+// is the pass won't prefix the source module name to the internal linkage
+// symbols.
+static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in LTO "
+ "mode"));
+
+// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
+// mode is it will add prof metadata to the created direct call.
+static cl::opt<bool>
+ ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
+// If the option is set to true, only call instructions will be considered for
+// transformation -- invoke instructions will be ignored.
+static cl::opt<bool>
+ ICPCallOnly("icp-call-only", cl::init(false), cl::Hidden,
+ cl::desc("Run indirect-call promotion for call instructions "
+ "only"));
+
+// If the option is set to true, only invoke instructions will be considered for
+// transformation -- call instructions will be ignored.
+static cl::opt<bool> ICPInvokeOnly("icp-invoke-only", cl::init(false),
+ cl::Hidden,
+ cl::desc("Run indirect-call promotion for "
+ "invoke instruction only"));
+
+// Dump the function level IR if the transformation happened in this
+// function. For debug use only.
+static cl::opt<bool>
+ ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
+ cl::desc("Dump IR after transformation happens"));
+
+namespace {
+
+class PGOIndirectCallPromotionLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+ : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
+ initializePGOIndirectCallPromotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+
+ StringRef getPassName() const override { return "PGOIndirectCallPromotion"; }
+
+private:
+ bool runOnModule(Module &M) override;
+
+  // If this pass is called in LTO, we need special handling of the PGOFuncName
+ // for the static variables due to LTO's internalization.
+ bool InLTO;
+
+  // If this pass is called in SamplePGO, we need to add the prof metadata to
+ // the promoted direct call.
+ bool SamplePGO;
+};
+
+} // end anonymous namespace
+
+char PGOIndirectCallPromotionLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect "
+ "calls to direct calls.",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
+ "Use PGO instrumentation profile to promote indirect "
+ "calls to direct calls.",
+ false, false)
+
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+ bool SamplePGO) {
+ return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+namespace {
+
+// The class for main data structure to promote indirect calls to conditional
+// direct calls.
+class ICallPromotionFunc {
+private:
+ Function &F;
+ Module *M;
+
+ // Symtab that maps indirect call profile values to function names and
+ // defines.
+ InstrProfSymtab *Symtab;
+
+ bool SamplePGO;
+
+ OptimizationRemarkEmitter &ORE;
+
+  // A struct that records the direct target and its call count.
+ struct PromotionCandidate {
+ Function *TargetFunction;
+ uint64_t Count;
+
+ PromotionCandidate(Function *F, uint64_t C) : TargetFunction(F), Count(C) {}
+ };
+
+ // Check if the indirect-call call site should be promoted. Return the number
+ // of promotions. Inst is the candidate indirect call, ValueDataRef
+ // contains the array of value profile data for profiled targets,
+ // TotalCount is the total profiled count of call executions, and
+ // NumCandidates is the number of candidate entries in ValueDataRef.
+ std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates);
+
+ // Promote a list of targets for one indirect-call callsite. Return
+ // the number of promotions.
+ uint32_t tryToPromote(CallBase &CB,
+ const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount);
+
+public:
+ ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+ bool SamplePGO, OptimizationRemarkEmitter &ORE)
+ : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO), ORE(ORE) {}
+ ICallPromotionFunc(const ICallPromotionFunc &) = delete;
+ ICallPromotionFunc &operator=(const ICallPromotionFunc &) = delete;
+
+ bool processFunction(ProfileSummaryInfo *PSI);
+};
+
+} // end anonymous namespace
+
+// Indirect-call promotion heuristic. The direct targets are sorted based on
+// the count. Stop at the first target that is not promoted.
+std::vector<ICallPromotionFunc::PromotionCandidate>
+ICallPromotionFunc::getPromotionCandidatesForCallSite(
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ uint64_t TotalCount, uint32_t NumCandidates) {
+ std::vector<PromotionCandidate> Ret;
+
+ LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
+ << " Num_targets: " << ValueDataRef.size()
+ << " Num_candidates: " << NumCandidates << "\n");
+ NumOfPGOICallsites++;
+ if (ICPCSSkip != 0 && NumOfPGOICallsites <= ICPCSSkip) {
+ LLVM_DEBUG(dbgs() << " Skip: User options.\n");
+ return Ret;
+ }
+
+ for (uint32_t I = 0; I < NumCandidates; I++) {
+ uint64_t Count = ValueDataRef[I].Count;
+ assert(Count <= TotalCount);
+ uint64_t Target = ValueDataRef[I].Value;
+ LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
+ << " Target_func: " << Target << "\n");
+
+ if (ICPInvokeOnly && isa<CallInst>(CB)) {
+ LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
+ << " Not promote: User options";
+ });
+ break;
+ }
+ if (ICPCallOnly && isa<InvokeInst>(CB)) {
+ LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
+ << " Not promote: User options";
+ });
+ break;
+ }
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", &CB)
+ << " Not promote: Cutoff reached";
+ });
+ break;
+ }
+
// Don't promote if the symbol is not defined in the module. This avoids
// creating a reference to a symbol that doesn't exist in the module
// This can happen when we compile with a sample profile collected from
@@ -270,180 +270,180 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
// aren't used in the new binary. We might have a declaration initially in
// the case where the symbol is globally dead in the binary and removed by
// ThinLTO.
- Function *TargetFunction = Symtab->getFunction(Target);
+ Function *TargetFunction = Symtab->getFunction(Target);
if (TargetFunction == nullptr || TargetFunction->isDeclaration()) {
- LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
- << "Cannot promote indirect call: target with md5sum "
- << ore::NV("target md5sum", Target) << " not found";
- });
- break;
- }
-
- const char *Reason = nullptr;
- if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
- using namespace ore;
-
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
- << "Cannot promote indirect call to "
- << NV("TargetFunction", TargetFunction) << " with count of "
- << NV("Count", Count) << ": " << Reason;
- });
- break;
- }
-
- Ret.push_back(PromotionCandidate(TargetFunction, Count));
- TotalCount -= Count;
- }
- return Ret;
-}
-
-CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
- uint64_t Count, uint64_t TotalCount,
- bool AttachProfToDirectCall,
- OptimizationRemarkEmitter *ORE) {
-
- uint64_t ElseCount = TotalCount - Count;
- uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
- uint64_t Scale = calculateCountScale(MaxCount);
- MDBuilder MDB(CB.getContext());
- MDNode *BranchWeights = MDB.createBranchWeights(
- scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
-
- CallBase &NewInst =
- promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
-
- if (AttachProfToDirectCall) {
- MDBuilder MDB(NewInst.getContext());
- NewInst.setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights({static_cast<uint32_t>(Count)}));
- }
-
- using namespace ore;
-
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
- << "Promote indirect call to " << NV("DirectCallee", DirectCallee)
- << " with count " << NV("Count", Count) << " out of "
- << NV("TotalCount", TotalCount);
- });
- return NewInst;
-}
-
-// Promote indirect-call to conditional direct-call for one callsite.
-uint32_t ICallPromotionFunc::tryToPromote(
- CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
- uint64_t &TotalCount) {
- uint32_t NumPromoted = 0;
-
- for (auto &C : Candidates) {
- uint64_t Count = C.Count;
- pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
- &ORE);
- assert(TotalCount >= Count);
- TotalCount -= Count;
- NumOfPGOICallPromotion++;
- NumPromoted++;
- }
- return NumPromoted;
-}
-
-// Traverse all the indirect-call callsites and get the value profile
-// annotation to perform indirect-call promotion.
-bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) {
- bool Changed = false;
- ICallPromotionAnalysis ICallAnalysis;
- for (auto *CB : findIndirectCalls(F)) {
- uint32_t NumVals, NumCandidates;
- uint64_t TotalCount;
- auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
- CB, NumVals, TotalCount, NumCandidates);
- if (!NumCandidates ||
- (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
- continue;
- auto PromotionCandidates = getPromotionCandidatesForCallSite(
- *CB, ICallProfDataRef, TotalCount, NumCandidates);
- uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
- if (NumPromoted == 0)
- continue;
-
- Changed = true;
- // Adjust the MD.prof metadata. First delete the old one.
- CB->setMetadata(LLVMContext::MD_prof, nullptr);
- // If all promoted, we don't need the MD.prof metadata.
- if (TotalCount == 0 || NumPromoted == NumVals)
- continue;
- // Otherwise we need to write the un-promoted records back.
- annotateValueSite(*M, *CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
- IPVK_IndirectCallTarget, NumCandidates);
- }
- return Changed;
-}
-
-// A wrapper function that does the actual work.
-static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
- bool InLTO, bool SamplePGO,
- ModuleAnalysisManager *AM = nullptr) {
- if (DisableICP)
- return false;
- InstrProfSymtab Symtab;
- if (Error E = Symtab.create(M, InLTO)) {
- std::string SymtabFailure = toString(std::move(E));
- LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
- (void)SymtabFailure;
- return false;
- }
- bool Changed = false;
- for (auto &F : M) {
- if (F.isDeclaration() || F.hasOptNone())
- continue;
-
- std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
- OptimizationRemarkEmitter *ORE;
- if (AM) {
- auto &FAM =
- AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- } else {
- OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
- ORE = OwnedORE.get();
- }
-
- ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
- bool FuncChanged = ICallPromotion.processFunction(PSI);
- if (ICPDUMPAFTER && FuncChanged) {
- LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
- LLVM_DEBUG(dbgs() << "\n");
- }
- Changed |= FuncChanged;
- if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
- LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
- break;
- }
- }
- return Changed;
-}
-
-bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- // The command-line option takes priority for InLTO.
- return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
- SamplePGO | ICPSamplePGOMode);
-}
-
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
- ModuleAnalysisManager &AM) {
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (!promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
- SamplePGO | ICPSamplePGOMode, &AM))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
+ LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
+ << "Cannot promote indirect call: target with md5sum "
+ << ore::NV("target md5sum", Target) << " not found";
+ });
+ break;
+ }
+
+ const char *Reason = nullptr;
+ if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
+ << "Cannot promote indirect call to "
+ << NV("TargetFunction", TargetFunction) << " with count of "
+ << NV("Count", Count) << ": " << Reason;
+ });
+ break;
+ }
+
+ Ret.push_back(PromotionCandidate(TargetFunction, Count));
+ TotalCount -= Count;
+ }
+ return Ret;
+}
+
+CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount,
+ bool AttachProfToDirectCall,
+ OptimizationRemarkEmitter *ORE) {
+
+ uint64_t ElseCount = TotalCount - Count;
+ uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
+ uint64_t Scale = calculateCountScale(MaxCount);
+ MDBuilder MDB(CB.getContext());
+ MDNode *BranchWeights = MDB.createBranchWeights(
+ scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
+
+ CallBase &NewInst =
+ promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
+
+ if (AttachProfToDirectCall) {
+ MDBuilder MDB(NewInst.getContext());
+ NewInst.setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights({static_cast<uint32_t>(Count)}));
+ }
+
+ using namespace ore;
+
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
+ << "Promote indirect call to " << NV("DirectCallee", DirectCallee)
+ << " with count " << NV("Count", Count) << " out of "
+ << NV("TotalCount", TotalCount);
+ });
+ return NewInst;
+}
+
+// Promote indirect-call to conditional direct-call for one callsite.
+uint32_t ICallPromotionFunc::tryToPromote(
+ CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
+ uint64_t &TotalCount) {
+ uint32_t NumPromoted = 0;
+
+ for (auto &C : Candidates) {
+ uint64_t Count = C.Count;
+ pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
+ &ORE);
+ assert(TotalCount >= Count);
+ TotalCount -= Count;
+ NumOfPGOICallPromotion++;
+ NumPromoted++;
+ }
+ return NumPromoted;
+}
+
+// Traverse all the indirect-call callsites and get the value profile
+// annotation to perform indirect-call promotion.
+bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) {
+ bool Changed = false;
+ ICallPromotionAnalysis ICallAnalysis;
+ for (auto *CB : findIndirectCalls(F)) {
+ uint32_t NumVals, NumCandidates;
+ uint64_t TotalCount;
+ auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
+ CB, NumVals, TotalCount, NumCandidates);
+ if (!NumCandidates ||
+ (PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
+ continue;
+ auto PromotionCandidates = getPromotionCandidatesForCallSite(
+ *CB, ICallProfDataRef, TotalCount, NumCandidates);
+ uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
+ if (NumPromoted == 0)
+ continue;
+
+ Changed = true;
+ // Adjust the MD.prof metadata. First delete the old one.
+ CB->setMetadata(LLVMContext::MD_prof, nullptr);
+ // If all promoted, we don't need the MD.prof metadata.
+ if (TotalCount == 0 || NumPromoted == NumVals)
+ continue;
+ // Otherwise we need to write the un-promoted records back.
+ annotateValueSite(*M, *CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
+ IPVK_IndirectCallTarget, NumCandidates);
+ }
+ return Changed;
+}
+
+// A wrapper function that does the actual work.
+static bool promoteIndirectCalls(Module &M, ProfileSummaryInfo *PSI,
+ bool InLTO, bool SamplePGO,
+ ModuleAnalysisManager *AM = nullptr) {
+ if (DisableICP)
+ return false;
+ InstrProfSymtab Symtab;
+ if (Error E = Symtab.create(M, InLTO)) {
+ std::string SymtabFailure = toString(std::move(E));
+ LLVM_DEBUG(dbgs() << "Failed to create symtab: " << SymtabFailure << "\n");
+ (void)SymtabFailure;
+ return false;
+ }
+ bool Changed = false;
+ for (auto &F : M) {
+ if (F.isDeclaration() || F.hasOptNone())
+ continue;
+
+ std::unique_ptr<OptimizationRemarkEmitter> OwnedORE;
+ OptimizationRemarkEmitter *ORE;
+ if (AM) {
+ auto &FAM =
+ AM->getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ ORE = &FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ } else {
+ OwnedORE = std::make_unique<OptimizationRemarkEmitter>(&F);
+ ORE = OwnedORE.get();
+ }
+
+ ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO, *ORE);
+ bool FuncChanged = ICallPromotion.processFunction(PSI);
+ if (ICPDUMPAFTER && FuncChanged) {
+ LLVM_DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+ Changed |= FuncChanged;
+ if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
+ LLVM_DEBUG(dbgs() << " Stop: Cutoff reached.\n");
+ break;
+ }
+ }
+ return Changed;
+}
+
+bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
+ ProfileSummaryInfo *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ // The command-line option takes priority for InLTO.
+ return promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode);
+}
+
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (!promoteIndirectCalls(M, PSI, InLTO | ICPLTOMode,
+ SamplePGO | ICPSamplePGOMode, &AM))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
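
The net effect of the pass above is to rewrite each sufficiently hot indirect call into an if-then-else whose "then" arm calls the profiled target directly, with branch weights derived from the 64-bit profile counts scaled into the 32-bit range that MD_prof metadata expects. The standalone C++ sketch below is an editorial illustration, not code from this commit: countScale() and scaleCount() are simplified stand-ins for the calculateCountScale() and scaleBranchCount() helpers used in promoteIndirectCall(), and the sample counts are made up.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>

// Pick a divisor so the larger of the two counts fits into a uint32_t weight.
static uint64_t countScale(uint64_t MaxCount) {
  const uint64_t Limit = std::numeric_limits<uint32_t>::max();
  return MaxCount < Limit ? 1 : MaxCount / Limit + 1;
}

static uint32_t scaleCount(uint64_t Count, uint64_t Scale) {
  return static_cast<uint32_t>(Count / Scale);
}

int main() {
  uint64_t Count = 6000000000ULL;      // profiled hits of the promoted target
  uint64_t TotalCount = 6500000000ULL; // all profiled executions of the call site
  uint64_t ElseCount = TotalCount - Count;
  uint64_t Scale = countScale(std::max(Count, ElseCount));
  // These two values become the "then"/"else" branch weights of the guard
  // around the new direct call.
  std::cout << scaleCount(Count, Scale) << " " << scaleCount(ElseCount, Scale)
            << "\n";
  return 0;
}
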
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
index 0addfb46b2..853385fbf8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrOrderFile.cpp
@@ -1,212 +1,212 @@
-//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include <fstream>
-#include <map>
-#include <mutex>
-#include <set>
-#include <sstream>
-
-using namespace llvm;
-#define DEBUG_TYPE "instrorderfile"
-
-static cl::opt<std::string> ClOrderFileWriteMapping(
- "orderfile-write-mapping", cl::init(""),
- cl::desc(
- "Dump functions and their MD5 hash to deobfuscate profile data"),
- cl::Hidden);
-
-namespace {
-
-// We need a global bitmap to tell if a function is executed. We also
-// need a global variable to save the order of functions. We can use a
-// fixed-size buffer that saves the MD5 hash of the function. We need
-// a global variable to save the index into the buffer.
-
-std::mutex MappingMutex;
-
-struct InstrOrderFile {
-private:
- GlobalVariable *OrderFileBuffer;
- GlobalVariable *BufferIdx;
- GlobalVariable *BitMap;
- ArrayType *BufferTy;
- ArrayType *MapTy;
-
-public:
- InstrOrderFile() {}
-
- void createOrderFileData(Module &M) {
- LLVMContext &Ctx = M.getContext();
- int NumFunctions = 0;
- for (Function &F : M) {
- if (!F.isDeclaration())
- NumFunctions++;
- }
-
- BufferTy =
- ArrayType::get(Type::getInt64Ty(Ctx), INSTR_ORDER_FILE_BUFFER_SIZE);
- Type *IdxTy = Type::getInt32Ty(Ctx);
- MapTy = ArrayType::get(Type::getInt8Ty(Ctx), NumFunctions);
-
- // Create the global variables.
- std::string SymbolName = INSTR_PROF_ORDERFILE_BUFFER_NAME_STR;
- OrderFileBuffer = new GlobalVariable(M, BufferTy, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(BufferTy), SymbolName);
- Triple TT = Triple(M.getTargetTriple());
- OrderFileBuffer->setSection(
- getInstrProfSectionName(IPSK_orderfile, TT.getObjectFormat()));
-
- std::string IndexName = INSTR_PROF_ORDERFILE_BUFFER_IDX_NAME_STR;
- BufferIdx = new GlobalVariable(M, IdxTy, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(IdxTy), IndexName);
-
- std::string BitMapName = "bitmap_0";
- BitMap = new GlobalVariable(M, MapTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(MapTy), BitMapName);
- }
-
- // Generate the code sequence in the entry block of each function to
- // update the buffer.
- void generateCodeSequence(Module &M, Function &F, int FuncId) {
- if (!ClOrderFileWriteMapping.empty()) {
- std::lock_guard<std::mutex> LogLock(MappingMutex);
- std::error_code EC;
- llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC,
- llvm::sys::fs::OF_Append);
- if (EC) {
- report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping +
- " to save mapping file for order file instrumentation\n");
- } else {
- std::stringstream stream;
- stream << std::hex << MD5Hash(F.getName());
- std::string singleLine = "MD5 " + stream.str() + " " +
- std::string(F.getName()) + '\n';
- OS << singleLine;
- }
- }
-
- BasicBlock *OrigEntry = &F.getEntryBlock();
-
- LLVMContext &Ctx = M.getContext();
- IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
- IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
-
- // Create a new entry block for instrumentation. We will check the bitmap
- // in this basic block.
- BasicBlock *NewEntry =
- BasicBlock::Create(M.getContext(), "order_file_entry", &F, OrigEntry);
- IRBuilder<> entryB(NewEntry);
- // Create a basic block for updating the circular buffer.
- BasicBlock *UpdateOrderFileBB =
- BasicBlock::Create(M.getContext(), "order_file_set", &F, OrigEntry);
- IRBuilder<> updateB(UpdateOrderFileBB);
-
- // Check the bitmap; if it is already 1, do nothing.
- // Otherwise, set the bit, grab the index, update the buffer.
- Value *IdxFlags[] = {ConstantInt::get(Int32Ty, 0),
- ConstantInt::get(Int32Ty, FuncId)};
- Value *MapAddr = entryB.CreateGEP(MapTy, BitMap, IdxFlags, "");
- LoadInst *loadBitMap = entryB.CreateLoad(Int8Ty, MapAddr, "");
- entryB.CreateStore(ConstantInt::get(Int8Ty, 1), MapAddr);
- Value *IsNotExecuted =
- entryB.CreateICmpEQ(loadBitMap, ConstantInt::get(Int8Ty, 0));
- entryB.CreateCondBr(IsNotExecuted, UpdateOrderFileBB, OrigEntry);
-
- // Fill up UpdateOrderFileBB: grab the index, update the buffer!
- Value *IdxVal = updateB.CreateAtomicRMW(
- AtomicRMWInst::Add, BufferIdx, ConstantInt::get(Int32Ty, 1),
- AtomicOrdering::SequentiallyConsistent);
- // We need to wrap around the index to fit it inside the buffer.
- Value *WrappedIdx = updateB.CreateAnd(
- IdxVal, ConstantInt::get(Int32Ty, INSTR_ORDER_FILE_BUFFER_MASK));
- Value *BufferGEPIdx[] = {ConstantInt::get(Int32Ty, 0), WrappedIdx};
- Value *BufferAddr =
- updateB.CreateGEP(BufferTy, OrderFileBuffer, BufferGEPIdx, "");
- updateB.CreateStore(ConstantInt::get(Type::getInt64Ty(Ctx), MD5Hash(F.getName())),
- BufferAddr);
- updateB.CreateBr(OrigEntry);
- }
-
- bool run(Module &M) {
- createOrderFileData(M);
-
- int FuncId = 0;
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
- generateCodeSequence(M, F, FuncId);
- ++FuncId;
- }
-
- return true;
- }
-
-}; // End of InstrOrderFile struct
-
-class InstrOrderFileLegacyPass : public ModulePass {
-public:
- static char ID;
-
- InstrOrderFileLegacyPass() : ModulePass(ID) {
- initializeInstrOrderFileLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override;
-};
-
-} // End anonymous namespace
-
-bool InstrOrderFileLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- return InstrOrderFile().run(M);
-}
-
-PreservedAnalyses
-InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) {
- if (InstrOrderFile().run(M))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile",
- "Instrumentation for Order File", false, false)
-
-char InstrOrderFileLegacyPass::ID = 0;
-
-ModulePass *llvm::createInstrOrderFilePass() {
- return new InstrOrderFileLegacyPass();
-}
+//===- InstrOrderFile.cpp ---- Late IR instrumentation for order file ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include <fstream>
+#include <map>
+#include <mutex>
+#include <set>
+#include <sstream>
+
+using namespace llvm;
+#define DEBUG_TYPE "instrorderfile"
+
+static cl::opt<std::string> ClOrderFileWriteMapping(
+ "orderfile-write-mapping", cl::init(""),
+ cl::desc(
+ "Dump functions and their MD5 hash to deobfuscate profile data"),
+ cl::Hidden);
+
+namespace {
+
+// We need a global bitmap to tell if a function is executed. We also
+// need a global variable to save the order of functions. We can use a
+// fixed-size buffer that saves the MD5 hash of the function. We need
+// a global variable to save the index into the buffer.
+
+std::mutex MappingMutex;
+
+struct InstrOrderFile {
+private:
+ GlobalVariable *OrderFileBuffer;
+ GlobalVariable *BufferIdx;
+ GlobalVariable *BitMap;
+ ArrayType *BufferTy;
+ ArrayType *MapTy;
+
+public:
+ InstrOrderFile() {}
+
+ void createOrderFileData(Module &M) {
+ LLVMContext &Ctx = M.getContext();
+ int NumFunctions = 0;
+ for (Function &F : M) {
+ if (!F.isDeclaration())
+ NumFunctions++;
+ }
+
+ BufferTy =
+ ArrayType::get(Type::getInt64Ty(Ctx), INSTR_ORDER_FILE_BUFFER_SIZE);
+ Type *IdxTy = Type::getInt32Ty(Ctx);
+ MapTy = ArrayType::get(Type::getInt8Ty(Ctx), NumFunctions);
+
+ // Create the global variables.
+ std::string SymbolName = INSTR_PROF_ORDERFILE_BUFFER_NAME_STR;
+ OrderFileBuffer = new GlobalVariable(M, BufferTy, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(BufferTy), SymbolName);
+ Triple TT = Triple(M.getTargetTriple());
+ OrderFileBuffer->setSection(
+ getInstrProfSectionName(IPSK_orderfile, TT.getObjectFormat()));
+
+ std::string IndexName = INSTR_PROF_ORDERFILE_BUFFER_IDX_NAME_STR;
+ BufferIdx = new GlobalVariable(M, IdxTy, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(IdxTy), IndexName);
+
+ std::string BitMapName = "bitmap_0";
+ BitMap = new GlobalVariable(M, MapTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(MapTy), BitMapName);
+ }
+
+ // Generate the code sequence in the entry block of each function to
+ // update the buffer.
+ void generateCodeSequence(Module &M, Function &F, int FuncId) {
+ if (!ClOrderFileWriteMapping.empty()) {
+ std::lock_guard<std::mutex> LogLock(MappingMutex);
+ std::error_code EC;
+ llvm::raw_fd_ostream OS(ClOrderFileWriteMapping, EC,
+ llvm::sys::fs::OF_Append);
+ if (EC) {
+ report_fatal_error(Twine("Failed to open ") + ClOrderFileWriteMapping +
+ " to save mapping file for order file instrumentation\n");
+ } else {
+ std::stringstream stream;
+ stream << std::hex << MD5Hash(F.getName());
+ std::string singleLine = "MD5 " + stream.str() + " " +
+ std::string(F.getName()) + '\n';
+ OS << singleLine;
+ }
+ }
+
+ BasicBlock *OrigEntry = &F.getEntryBlock();
+
+ LLVMContext &Ctx = M.getContext();
+ IntegerType *Int32Ty = Type::getInt32Ty(Ctx);
+ IntegerType *Int8Ty = Type::getInt8Ty(Ctx);
+
+ // Create a new entry block for instrumentation. We will check the bitmap
+ // in this basic block.
+ BasicBlock *NewEntry =
+ BasicBlock::Create(M.getContext(), "order_file_entry", &F, OrigEntry);
+ IRBuilder<> entryB(NewEntry);
+ // Create a basic block for updating the circular buffer.
+ BasicBlock *UpdateOrderFileBB =
+ BasicBlock::Create(M.getContext(), "order_file_set", &F, OrigEntry);
+ IRBuilder<> updateB(UpdateOrderFileBB);
+
+ // Check the bitmap; if it is already 1, do nothing.
+ // Otherwise, set the bit, grab the index, update the buffer.
+ Value *IdxFlags[] = {ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, FuncId)};
+ Value *MapAddr = entryB.CreateGEP(MapTy, BitMap, IdxFlags, "");
+ LoadInst *loadBitMap = entryB.CreateLoad(Int8Ty, MapAddr, "");
+ entryB.CreateStore(ConstantInt::get(Int8Ty, 1), MapAddr);
+ Value *IsNotExecuted =
+ entryB.CreateICmpEQ(loadBitMap, ConstantInt::get(Int8Ty, 0));
+ entryB.CreateCondBr(IsNotExecuted, UpdateOrderFileBB, OrigEntry);
+
+ // Fill up UpdateOrderFileBB: grab the index, update the buffer!
+ Value *IdxVal = updateB.CreateAtomicRMW(
+ AtomicRMWInst::Add, BufferIdx, ConstantInt::get(Int32Ty, 1),
+ AtomicOrdering::SequentiallyConsistent);
+ // We need to wrap around the index to fit it inside the buffer.
+ Value *WrappedIdx = updateB.CreateAnd(
+ IdxVal, ConstantInt::get(Int32Ty, INSTR_ORDER_FILE_BUFFER_MASK));
+ Value *BufferGEPIdx[] = {ConstantInt::get(Int32Ty, 0), WrappedIdx};
+ Value *BufferAddr =
+ updateB.CreateGEP(BufferTy, OrderFileBuffer, BufferGEPIdx, "");
+ updateB.CreateStore(ConstantInt::get(Type::getInt64Ty(Ctx), MD5Hash(F.getName())),
+ BufferAddr);
+ updateB.CreateBr(OrigEntry);
+ }
+
+ bool run(Module &M) {
+ createOrderFileData(M);
+
+ int FuncId = 0;
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+ generateCodeSequence(M, F, FuncId);
+ ++FuncId;
+ }
+
+ return true;
+ }
+
+}; // End of InstrOrderFile struct
+
+class InstrOrderFileLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ InstrOrderFileLegacyPass() : ModulePass(ID) {
+ initializeInstrOrderFileLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+};
+
+} // End anonymous namespace
+
+bool InstrOrderFileLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ return InstrOrderFile().run(M);
+}
+
+PreservedAnalyses
+InstrOrderFilePass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (InstrOrderFile().run(M))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+INITIALIZE_PASS_BEGIN(InstrOrderFileLegacyPass, "instrorderfile",
+ "Instrumentation for Order File", false, false)
+INITIALIZE_PASS_END(InstrOrderFileLegacyPass, "instrorderfile",
+ "Instrumentation for Order File", false, false)
+
+char InstrOrderFileLegacyPass::ID = 0;
+
+ModulePass *llvm::createInstrOrderFilePass() {
+ return new InstrOrderFileLegacyPass();
+}
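
For orientation, the IR that generateCodeSequence() builds in each instrumented function's entry block behaves like the C++ sketch below (a standalone illustration, not code from the LLVM sources). The buffer size, mask, array sizes and the orderFileEntryHook() name are assumptions made for the example; in the pass the real values come from INSTR_ORDER_FILE_BUFFER_SIZE, INSTR_ORDER_FILE_BUFFER_MASK and the globals created in createOrderFileData().

#include <atomic>
#include <cstdint>

// Illustrative stand-ins for the per-module globals the pass creates.
constexpr uint32_t kBufferSize = 1u << 17;  // power of two so masking wraps
constexpr uint32_t kBufferMask = kBufferSize - 1;

uint64_t OrderFileBuffer[kBufferSize];      // MD5 hashes in first-execution order
std::atomic<uint32_t> BufferIdx{0};         // next slot of the circular buffer
uint8_t BitMap[1024];                       // one byte per instrumented function

// Shape of the code inserted at each function entry; FuncId and FuncMD5 are
// constants baked in at instrumentation time.
inline void orderFileEntryHook(int FuncId, uint64_t FuncMD5) {
  uint8_t AlreadySeen = BitMap[FuncId];
  BitMap[FuncId] = 1;                       // mark unconditionally, as the IR does
  if (!AlreadySeen) {
    uint32_t Idx =
        BufferIdx.fetch_add(1, std::memory_order_seq_cst) & kBufferMask;
    OrderFileBuffer[Idx] = FuncMD5;         // record the function's MD5 hash once
  }
}
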
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 925c018135..9efc7d1ac5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -1,266 +1,266 @@
-//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling.
-// It also builds the data structures and initialization code needed for
-// updating execution counts and emitting the profile at runtime.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <string>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instrprof"
-
-namespace {
-
-cl::opt<bool> DoHashBasedCounterSplit(
- "hash-based-counter-split",
- cl::desc("Rename counter variable of a comdat function based on cfg hash"),
- cl::init(true));
-
-cl::opt<bool> RuntimeCounterRelocation(
- "runtime-counter-relocation",
- cl::desc("Enable relocating counters at runtime."),
- cl::init(false));
-
-cl::opt<bool> ValueProfileStaticAlloc(
- "vp-static-alloc",
- cl::desc("Do static counter allocation for value profiler"),
- cl::init(true));
-
-cl::opt<double> NumCountersPerValueSite(
- "vp-counters-per-site",
- cl::desc("The average number of profile counters allocated "
- "per value profiling site."),
- // a very small percentage of value sites have non-zero targets, e.g., 1/30.
- // a very small percentage of value sites have non-zero targets, e.g, 1/30.
- // For those sites with non-zero profile, the average number of targets
- // is usually smaller than 2.
- cl::init(1.0));
-
-cl::opt<bool> AtomicCounterUpdateAll(
- "instrprof-atomic-counter-update-all", cl::ZeroOrMore,
- cl::desc("Make all profile counter updates atomic (for testing only)"),
- cl::init(false));
-
-cl::opt<bool> AtomicCounterUpdatePromoted(
- "atomic-counter-update-promoted", cl::ZeroOrMore,
- cl::desc("Do counter update using atomic fetch add "
- " for promoted counters only"),
- cl::init(false));
-
-cl::opt<bool> AtomicFirstCounter(
- "atomic-first-counter", cl::ZeroOrMore,
- cl::desc("Use atomic fetch add for first counter in a function (usually "
- "the entry counter)"),
- cl::init(false));
-
-// If the option is not specified, the default behavior about whether
-// counter promotion is done depends on how the instrumentation lowering
-// pipeline is set up, i.e., the default value of true for this option
-// does not mean the promotion will be done by default. Explicitly
-// setting this option can override the default behavior.
-cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
- cl::desc("Do counter register promotion"),
- cl::init(false));
-cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
- cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
- cl::desc("Max number counter promotions per loop to avoid"
- " increasing register pressure too much"));
-
-// A debug option
-cl::opt<int>
- MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
- cl::desc("Max number of allowed counter promotions"));
-
-cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
- cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
- cl::desc("The max number of exiting blocks of a loop to allow "
- " speculative counter promotion"));
-
-cl::opt<bool> SpeculativeCounterPromotionToLoop(
- cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
- cl::desc("When the option is false, if the target block is in a loop, "
- "the promotion will be disallowed unless the promoted counter "
- " update can be further/iteratively promoted into an acyclic "
- " region."));
-
-cl::opt<bool> IterativeCounterPromotion(
- cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
- cl::desc("Allow counter promotion across the whole loop nest."));
-
+//===-- InstrProfiling.cpp - Frontend instrumentation based profiling -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers instrprof_* intrinsics emitted by a frontend for profiling.
+// It also builds the data structures and initialization code needed for
+// updating execution counts and emitting the profile at runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/InstrProfiling.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instrprof"
+
+namespace {
+
+cl::opt<bool> DoHashBasedCounterSplit(
+ "hash-based-counter-split",
+ cl::desc("Rename counter variable of a comdat function based on cfg hash"),
+ cl::init(true));
+
+cl::opt<bool> RuntimeCounterRelocation(
+ "runtime-counter-relocation",
+ cl::desc("Enable relocating counters at runtime."),
+ cl::init(false));
+
+cl::opt<bool> ValueProfileStaticAlloc(
+ "vp-static-alloc",
+ cl::desc("Do static counter allocation for value profiler"),
+ cl::init(true));
+
+cl::opt<double> NumCountersPerValueSite(
+ "vp-counters-per-site",
+ cl::desc("The average number of profile counters allocated "
+ "per value profiling site."),
+ // a very small percentage of value sites have non-zero targets, e.g., 1/30.
+ // a very small percentage of value sites have non-zero targets, e.g, 1/30.
+ // For those sites with non-zero profile, the average number of targets
+ // is usually smaller than 2.
+ cl::init(1.0));
+
+cl::opt<bool> AtomicCounterUpdateAll(
+ "instrprof-atomic-counter-update-all", cl::ZeroOrMore,
+ cl::desc("Make all profile counter updates atomic (for testing only)"),
+ cl::init(false));
+
+cl::opt<bool> AtomicCounterUpdatePromoted(
+ "atomic-counter-update-promoted", cl::ZeroOrMore,
+ cl::desc("Do counter update using atomic fetch add "
+ " for promoted counters only"),
+ cl::init(false));
+
+cl::opt<bool> AtomicFirstCounter(
+ "atomic-first-counter", cl::ZeroOrMore,
+ cl::desc("Use atomic fetch add for first counter in a function (usually "
+ "the entry counter)"),
+ cl::init(false));
+
+// If the option is not specified, the default behavior about whether
+// counter promotion is done depends on how the instrumentation lowering
+// pipeline is set up, i.e., the default value of true for this option
+// does not mean the promotion will be done by default. Explicitly
+// setting this option can override the default behavior.
+cl::opt<bool> DoCounterPromotion("do-counter-promotion", cl::ZeroOrMore,
+ cl::desc("Do counter register promotion"),
+ cl::init(false));
+cl::opt<unsigned> MaxNumOfPromotionsPerLoop(
+ cl::ZeroOrMore, "max-counter-promotions-per-loop", cl::init(20),
+ cl::desc("Max number counter promotions per loop to avoid"
+ " increasing register pressure too much"));
+
+// A debug option
+cl::opt<int>
+ MaxNumOfPromotions(cl::ZeroOrMore, "max-counter-promotions", cl::init(-1),
+ cl::desc("Max number of allowed counter promotions"));
+
+cl::opt<unsigned> SpeculativeCounterPromotionMaxExiting(
+ cl::ZeroOrMore, "speculative-counter-promotion-max-exiting", cl::init(3),
+ cl::desc("The max number of exiting blocks of a loop to allow "
+ " speculative counter promotion"));
+
+cl::opt<bool> SpeculativeCounterPromotionToLoop(
+ cl::ZeroOrMore, "speculative-counter-promotion-to-loop", cl::init(false),
+ cl::desc("When the option is false, if the target block is in a loop, "
+ "the promotion will be disallowed unless the promoted counter "
+ " update can be further/iteratively promoted into an acyclic "
+ " region."));
+
+cl::opt<bool> IterativeCounterPromotion(
+ cl::ZeroOrMore, "iterative-counter-promotion", cl::init(true),
+ cl::desc("Allow counter promotion across the whole loop nest."));
+
cl::opt<bool> SkipRetExitBlock(
cl::ZeroOrMore, "skip-ret-exit-block", cl::init(true),
cl::desc("Suppress counter promotion if exit blocks contain ret."));
-class InstrProfilingLegacyPass : public ModulePass {
- InstrProfiling InstrProf;
-
-public:
- static char ID;
-
- InstrProfilingLegacyPass() : ModulePass(ID) {}
- InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
- : ModulePass(ID), InstrProf(Options, IsCS) {
- initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override {
- return "Frontend instrumentation-based coverage lowering";
- }
-
- bool runOnModule(Module &M) override {
- auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- return InstrProf.run(M, GetTLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-///
-/// A helper class to promote one counter RMW operation in the loop
-/// into a register update.
-///
-/// The RMW update for the counter will be sunk out of the loop after
-/// the transformation.
-///
-class PGOCounterPromoterHelper : public LoadAndStorePromoter {
-public:
- PGOCounterPromoterHelper(
- Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
- BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
- ArrayRef<Instruction *> InsertPts,
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
- LoopInfo &LI)
- : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
- InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
- assert(isa<LoadInst>(L));
- assert(isa<StoreInst>(S));
- SSA.AddAvailableValue(PH, Init);
- }
-
- void doExtraRewritesBeforeFinalDeletion() override {
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = ExitBlocks[i];
- Instruction *InsertPos = InsertPts[i];
- // Get LiveIn value into the ExitBlock. If there are multiple
- // predecessors, the value is defined by a PHI node in this
- // block.
- Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
- Type *Ty = LiveInValue->getType();
- IRBuilder<> Builder(InsertPos);
- if (AtomicCounterUpdatePromoted)
- // Atomic updates can currently only be promoted across the current
- // loop, not the whole loop nest.
- Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
- AtomicOrdering::SequentiallyConsistent);
- else {
- LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted");
- auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
- auto *NewStore = Builder.CreateStore(NewVal, Addr);
-
- // Now update the parent loop's candidate list:
- if (IterativeCounterPromotion) {
- auto *TargetLoop = LI.getLoopFor(ExitBlock);
- if (TargetLoop)
- LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
- }
- }
- }
- }
-
-private:
- Instruction *Store;
- ArrayRef<BasicBlock *> ExitBlocks;
- ArrayRef<Instruction *> InsertPts;
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
- LoopInfo &LI;
-};
-
-/// A helper class to do register promotion for all profile counter
-/// updates in a loop.
-///
-class PGOCounterPromoter {
-public:
- PGOCounterPromoter(
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
- Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
- : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
- LI(LI), BFI(BFI) {
-
- // Skip collection of ExitBlocks and InsertPts for loops that will not be
- // able to have counters promoted.
- SmallVector<BasicBlock *, 8> LoopExitBlocks;
- SmallPtrSet<BasicBlock *, 8> BlockSet;
-
- L.getExitBlocks(LoopExitBlocks);
- if (!isPromotionPossible(&L, LoopExitBlocks))
- return;
-
- for (BasicBlock *ExitBlock : LoopExitBlocks) {
- if (BlockSet.insert(ExitBlock).second) {
- ExitBlocks.push_back(ExitBlock);
- InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
- }
- }
- }
-
- bool run(int64_t *NumPromoted) {
- // Skip 'infinite' loops:
- if (ExitBlocks.size() == 0)
- return false;
+class InstrProfilingLegacyPass : public ModulePass {
+ InstrProfiling InstrProf;
+
+public:
+ static char ID;
+
+ InstrProfilingLegacyPass() : ModulePass(ID) {}
+ InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
+ : ModulePass(ID), InstrProf(Options, IsCS) {
+ initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Frontend instrumentation-based coverage lowering";
+ }
+
+ bool runOnModule(Module &M) override {
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ return InstrProf.run(M, GetTLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+///
+/// A helper class to promote one counter RMW operation in the loop
+/// into a register update.
+///
+/// The RMW update for the counter will be sunk out of the loop after
+/// the transformation.
+///
+class PGOCounterPromoterHelper : public LoadAndStorePromoter {
+public:
+ PGOCounterPromoterHelper(
+ Instruction *L, Instruction *S, SSAUpdater &SSA, Value *Init,
+ BasicBlock *PH, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<Instruction *> InsertPts,
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ LoopInfo &LI)
+ : LoadAndStorePromoter({L, S}, SSA), Store(S), ExitBlocks(ExitBlocks),
+ InsertPts(InsertPts), LoopToCandidates(LoopToCands), LI(LI) {
+ assert(isa<LoadInst>(L));
+ assert(isa<StoreInst>(S));
+ SSA.AddAvailableValue(PH, Init);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() override {
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = ExitBlocks[i];
+ Instruction *InsertPos = InsertPts[i];
+ // Get LiveIn value into the ExitBlock. If there are multiple
+ // predecessors, the value is defined by a PHI node in this
+ // block.
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ Value *Addr = cast<StoreInst>(Store)->getPointerOperand();
+ Type *Ty = LiveInValue->getType();
+ IRBuilder<> Builder(InsertPos);
+ if (AtomicCounterUpdatePromoted)
+ // Atomic updates can currently only be promoted across the current
+ // loop, not the whole loop nest.
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, LiveInValue,
+ AtomicOrdering::SequentiallyConsistent);
+ else {
+ LoadInst *OldVal = Builder.CreateLoad(Ty, Addr, "pgocount.promoted");
+ auto *NewVal = Builder.CreateAdd(OldVal, LiveInValue);
+ auto *NewStore = Builder.CreateStore(NewVal, Addr);
+
+ // Now update the parent loop's candidate list:
+ if (IterativeCounterPromotion) {
+ auto *TargetLoop = LI.getLoopFor(ExitBlock);
+ if (TargetLoop)
+ LoopToCandidates[TargetLoop].emplace_back(OldVal, NewStore);
+ }
+ }
+ }
+ }
+
+private:
+ Instruction *Store;
+ ArrayRef<BasicBlock *> ExitBlocks;
+ ArrayRef<Instruction *> InsertPts;
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ LoopInfo &LI;
+};
+
+/// A helper class to do register promotion for all profile counter
+/// updates in a loop.
+///
+class PGOCounterPromoter {
+public:
+ PGOCounterPromoter(
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCands,
+ Loop &CurLoop, LoopInfo &LI, BlockFrequencyInfo *BFI)
+ : LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
+ LI(LI), BFI(BFI) {
+
+ // Skip collection of ExitBlocks and InsertPts for loops that will not be
+ // able to have counters promoted.
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ SmallPtrSet<BasicBlock *, 8> BlockSet;
+
+ L.getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(&L, LoopExitBlocks))
+ return;
+
+ for (BasicBlock *ExitBlock : LoopExitBlocks) {
+ if (BlockSet.insert(ExitBlock).second) {
+ ExitBlocks.push_back(ExitBlock);
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ }
+ }
+ }
+
+ bool run(int64_t *NumPromoted) {
+ // Skip 'infinite' loops:
+ if (ExitBlocks.size() == 0)
+ return false;
// Skip if any of the ExitBlocks contains a ret instruction.
// This is to prevent dumping of incomplete profile -- if the
@@ -273,129 +273,129 @@ public:
return false;
}
- unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
- if (MaxProm == 0)
- return false;
-
- unsigned Promoted = 0;
- for (auto &Cand : LoopToCandidates[&L]) {
-
- SmallVector<PHINode *, 4> NewPHIs;
- SSAUpdater SSA(&NewPHIs);
- Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
-
- // If BFI is set, we will use it to guide the promotions.
- if (BFI) {
- auto *BB = Cand.first->getParent();
- auto InstrCount = BFI->getBlockProfileCount(BB);
- if (!InstrCount)
- continue;
- auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader());
- // If the average loop trip count is not greater than 1.5, we skip
- // promotion.
- if (PreheaderCount &&
- (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2))
- continue;
- }
-
- PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
- L.getLoopPreheader(), ExitBlocks,
- InsertPts, LoopToCandidates, LI);
- Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
- Promoted++;
- if (Promoted >= MaxProm)
- break;
-
- (*NumPromoted)++;
- if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
- break;
- }
-
- LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
- << L.getLoopDepth() << ")\n");
- return Promoted != 0;
- }
-
-private:
- bool allowSpeculativeCounterPromotion(Loop *LP) {
- SmallVector<BasicBlock *, 8> ExitingBlocks;
- L.getExitingBlocks(ExitingBlocks);
- // Not considered speculative.
- if (ExitingBlocks.size() == 1)
- return true;
- if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
- return false;
- return true;
- }
-
- // Check whether the loop satisfies the basic conditions needed to perform
- // Counter Promotions.
- bool isPromotionPossible(Loop *LP,
- const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
- // We can't insert into a catchswitch.
- if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
- return isa<CatchSwitchInst>(Exit->getTerminator());
- }))
- return false;
-
- if (!LP->hasDedicatedExits())
- return false;
-
- BasicBlock *PH = LP->getLoopPreheader();
- if (!PH)
- return false;
-
- return true;
- }
-
- // Returns the max number of Counter Promotions for LP.
- unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
- SmallVector<BasicBlock *, 8> LoopExitBlocks;
- LP->getExitBlocks(LoopExitBlocks);
- if (!isPromotionPossible(LP, LoopExitBlocks))
- return 0;
-
- SmallVector<BasicBlock *, 8> ExitingBlocks;
- LP->getExitingBlocks(ExitingBlocks);
-
- // If BFI is set, we do more aggressive promotions based on BFI.
- if (BFI)
- return (unsigned)-1;
-
- // Not considered speculative.
- if (ExitingBlocks.size() == 1)
- return MaxNumOfPromotionsPerLoop;
-
- if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
- return 0;
-
- // Whether the target block is in a loop does not matter:
- if (SpeculativeCounterPromotionToLoop)
- return MaxNumOfPromotionsPerLoop;
-
- // Now check the target block:
- unsigned MaxProm = MaxNumOfPromotionsPerLoop;
- for (auto *TargetBlock : LoopExitBlocks) {
- auto *TargetLoop = LI.getLoopFor(TargetBlock);
- if (!TargetLoop)
- continue;
- unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
- unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
- MaxProm =
- std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
- PendingCandsInTarget);
- }
- return MaxProm;
- }
-
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
- SmallVector<BasicBlock *, 8> ExitBlocks;
- SmallVector<Instruction *, 8> InsertPts;
- Loop &L;
- LoopInfo &LI;
- BlockFrequencyInfo *BFI;
-};
-
+ unsigned MaxProm = getMaxNumOfPromotionsInLoop(&L);
+ if (MaxProm == 0)
+ return false;
+
+ unsigned Promoted = 0;
+ for (auto &Cand : LoopToCandidates[&L]) {
+
+ SmallVector<PHINode *, 4> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ Value *InitVal = ConstantInt::get(Cand.first->getType(), 0);
+
+ // If BFI is set, we will use it to guide the promotions.
+ if (BFI) {
+ auto *BB = Cand.first->getParent();
+ auto InstrCount = BFI->getBlockProfileCount(BB);
+ if (!InstrCount)
+ continue;
+ auto PreheaderCount = BFI->getBlockProfileCount(L.getLoopPreheader());
+ // If the average loop trip count is not greater than 1.5, we skip
+ // promotion.
+ if (PreheaderCount &&
+ (PreheaderCount.getValue() * 3) >= (InstrCount.getValue() * 2))
+ continue;
+ }
+
+ PGOCounterPromoterHelper Promoter(Cand.first, Cand.second, SSA, InitVal,
+ L.getLoopPreheader(), ExitBlocks,
+ InsertPts, LoopToCandidates, LI);
+ Promoter.run(SmallVector<Instruction *, 2>({Cand.first, Cand.second}));
+ Promoted++;
+ if (Promoted >= MaxProm)
+ break;
+
+ (*NumPromoted)++;
+ if (MaxNumOfPromotions != -1 && *NumPromoted >= MaxNumOfPromotions)
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << Promoted << " counters promoted for loop (depth="
+ << L.getLoopDepth() << ")\n");
+ return Promoted != 0;
+ }
+
+private:
+ bool allowSpeculativeCounterPromotion(Loop *LP) {
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ L.getExitingBlocks(ExitingBlocks);
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return true;
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return false;
+ return true;
+ }
+
+ // Check whether the loop satisfies the basic conditions needed to perform
+ // Counter Promotions.
+ bool isPromotionPossible(Loop *LP,
+ const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
+ // We can't insert into a catchswitch.
+ if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ }))
+ return false;
+
+ if (!LP->hasDedicatedExits())
+ return false;
+
+ BasicBlock *PH = LP->getLoopPreheader();
+ if (!PH)
+ return false;
+
+ return true;
+ }
+
+ // Returns the max number of Counter Promotions for LP.
+ unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ LP->getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(LP, LoopExitBlocks))
+ return 0;
+
+ SmallVector<BasicBlock *, 8> ExitingBlocks;
+ LP->getExitingBlocks(ExitingBlocks);
+
+ // If BFI is set, we do more aggressive promotions based on BFI.
+ if (BFI)
+ return (unsigned)-1;
+
+ // Not considered speculative.
+ if (ExitingBlocks.size() == 1)
+ return MaxNumOfPromotionsPerLoop;
+
+ if (ExitingBlocks.size() > SpeculativeCounterPromotionMaxExiting)
+ return 0;
+
+ // Whether the target block is in a loop does not matter:
+ if (SpeculativeCounterPromotionToLoop)
+ return MaxNumOfPromotionsPerLoop;
+
+ // Now check the target block:
+ unsigned MaxProm = MaxNumOfPromotionsPerLoop;
+ for (auto *TargetBlock : LoopExitBlocks) {
+ auto *TargetLoop = LI.getLoopFor(TargetBlock);
+ if (!TargetLoop)
+ continue;
+ unsigned MaxPromForTarget = getMaxNumOfPromotionsInLoop(TargetLoop);
+ unsigned PendingCandsInTarget = LoopToCandidates[TargetLoop].size();
+ MaxProm =
+ std::min(MaxProm, std::max(MaxPromForTarget, PendingCandsInTarget) -
+ PendingCandsInTarget);
+ }
+ return MaxProm;
+ }
+
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> &LoopToCandidates;
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ SmallVector<Instruction *, 8> InsertPts;
+ Loop &L;
+ LoopInfo &LI;
+ BlockFrequencyInfo *BFI;
+};
+
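
One easy-to-miss detail in PGOCounterPromoter::run() above is the BFI-guided filter: when block frequency info is available, a counter is only promoted out of a loop whose average trip count exceeds 1.5, and the pass tests this without division as PreheaderCount * 3 >= InstrCount * 2 (skip). The short standalone program below is an editorial illustration with made-up counts, not part of the LLVM sources; it only spells out that arithmetic.

#include <cstdint>
#include <iostream>

// Mirrors the skip condition used in PGOCounterPromoter::run(): promotion is
// skipped when PreheaderCount * 3 >= BodyCount * 2, i.e. when the average
// trip count BodyCount / PreheaderCount is not greater than 1.5.
static bool skipPromotion(uint64_t PreheaderCount, uint64_t BodyCount) {
  return PreheaderCount * 3 >= BodyCount * 2;
}

int main() {
  std::cout << skipPromotion(100, 140) << "\n"; // 1: about 1.4 trips, skip
  std::cout << skipPromotion(100, 200) << "\n"; // 0: about 2.0 trips, promote
  return 0;
}
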
enum class ValueProfilingCallType {
// Individual values are tracked. Currently used for indirect call target
// profiling.
@@ -405,204 +405,204 @@ enum class ValueProfilingCallType {
MemOp
};
-} // end anonymous namespace
-
-PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- if (!run(M, GetTLI))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-char InstrProfilingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
- InstrProfilingLegacyPass, "instrprof",
- "Frontend instrumentation-based coverage lowering.", false, false)
-
-ModulePass *
-llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
- bool IsCS) {
- return new InstrProfilingLegacyPass(Options, IsCS);
-}
-
-static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
- InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
- if (Inc)
- return Inc;
- return dyn_cast<InstrProfIncrementInst>(Instr);
-}
-
-bool InstrProfiling::lowerIntrinsics(Function *F) {
- bool MadeChange = false;
- PromotionCandidates.clear();
- for (BasicBlock &BB : *F) {
- for (auto I = BB.begin(), E = BB.end(); I != E;) {
- auto Instr = I++;
- InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
- if (Inc) {
- lowerIncrement(Inc);
- MadeChange = true;
- } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
- lowerValueProfileInst(Ind);
- MadeChange = true;
- }
- }
- }
-
- if (!MadeChange)
- return false;
-
- promoteCounterLoadStores(F);
- return true;
-}
-
-bool InstrProfiling::isRuntimeCounterRelocationEnabled() const {
- if (RuntimeCounterRelocation.getNumOccurrences() > 0)
- return RuntimeCounterRelocation;
-
- return TT.isOSFuchsia();
-}
-
-bool InstrProfiling::isCounterPromotionEnabled() const {
- if (DoCounterPromotion.getNumOccurrences() > 0)
- return DoCounterPromotion;
-
- return Options.DoCounterPromotion;
-}
-
-void InstrProfiling::promoteCounterLoadStores(Function *F) {
- if (!isCounterPromotionEnabled())
- return;
-
- DominatorTree DT(*F);
- LoopInfo LI(DT);
- DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
-
- std::unique_ptr<BlockFrequencyInfo> BFI;
- if (Options.UseBFIInPromotion) {
- std::unique_ptr<BranchProbabilityInfo> BPI;
- BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F)));
- BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI));
- }
-
- for (const auto &LoadStore : PromotionCandidates) {
- auto *CounterLoad = LoadStore.first;
- auto *CounterStore = LoadStore.second;
- BasicBlock *BB = CounterLoad->getParent();
- Loop *ParentLoop = LI.getLoopFor(BB);
- if (!ParentLoop)
- continue;
- LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
- }
-
- SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
-
- // Do a post-order traversal of the loops so that counter updates can be
- // iteratively hoisted outside the loop nest.
- for (auto *Loop : llvm::reverse(Loops)) {
- PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get());
- Promoter.run(&TotalCountersPromoted);
- }
-}
-
-/// Check if the module contains uses of any profiling intrinsics.
-static bool containsProfilingIntrinsics(Module &M) {
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
- if (!F->use_empty())
- return true;
- if (auto *F = M.getFunction(
- Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
- if (!F->use_empty())
- return true;
- return false;
-}
-
-bool InstrProfiling::run(
- Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
- this->M = &M;
- this->GetTLI = std::move(GetTLI);
- NamesVar = nullptr;
- NamesSize = 0;
- ProfileDataMap.clear();
- UsedVars.clear();
- TT = Triple(M.getTargetTriple());
-
- // Emit the runtime hook even if no counters are present.
- bool MadeChange = emitRuntimeHook();
-
- // Improve compile time by avoiding linear scans when there is no work.
- GlobalVariable *CoverageNamesVar =
- M.getNamedGlobal(getCoverageUnusedNamesVarName());
- if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
- return MadeChange;
-
- // The number of value sites inside the instrumented function is not known
- // in advance. Count the instrumented target value sites here so the total
- // can be recorded as a field in the profile data variable.
- for (Function &F : M) {
- InstrProfIncrementInst *FirstProfIncInst = nullptr;
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(), E = BB.end(); I != E; I++)
- if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
- computeNumValueSiteCounts(Ind);
- else if (FirstProfIncInst == nullptr)
- FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
-
- // Value profiling intrinsic lowering requires per-function profile data
- // variable to be created first.
- if (FirstProfIncInst != nullptr)
- static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
- }
-
- for (Function &F : M)
- MadeChange |= lowerIntrinsics(&F);
-
- if (CoverageNamesVar) {
- lowerCoverageData(CoverageNamesVar);
- MadeChange = true;
- }
-
- if (!MadeChange)
- return false;
-
- emitVNodes();
- emitNameData();
- emitRegistration();
- emitUses();
- emitInitialization();
- return true;
-}
-
+} // end anonymous namespace
+
+PreservedAnalyses InstrProfiling::run(Module &M, ModuleAnalysisManager &AM) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ if (!run(M, GetTLI))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+char InstrProfilingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
+ InstrProfilingLegacyPass, "instrprof",
+ "Frontend instrumentation-based coverage lowering.", false, false)
+
+ModulePass *
+llvm::createInstrProfilingLegacyPass(const InstrProfOptions &Options,
+ bool IsCS) {
+ return new InstrProfilingLegacyPass(Options, IsCS);
+}
+
+static InstrProfIncrementInst *castToIncrementInst(Instruction *Instr) {
+ InstrProfIncrementInst *Inc = dyn_cast<InstrProfIncrementInstStep>(Instr);
+ if (Inc)
+ return Inc;
+ return dyn_cast<InstrProfIncrementInst>(Instr);
+}
+
+bool InstrProfiling::lowerIntrinsics(Function *F) {
+ bool MadeChange = false;
+ PromotionCandidates.clear();
+ for (BasicBlock &BB : *F) {
+ for (auto I = BB.begin(), E = BB.end(); I != E;) {
+ auto Instr = I++;
+ InstrProfIncrementInst *Inc = castToIncrementInst(&*Instr);
+ if (Inc) {
+ lowerIncrement(Inc);
+ MadeChange = true;
+ } else if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(Instr)) {
+ lowerValueProfileInst(Ind);
+ MadeChange = true;
+ }
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ promoteCounterLoadStores(F);
+ return true;
+}
+
+bool InstrProfiling::isRuntimeCounterRelocationEnabled() const {
+ if (RuntimeCounterRelocation.getNumOccurrences() > 0)
+ return RuntimeCounterRelocation;
+
+ return TT.isOSFuchsia();
+}
+
+bool InstrProfiling::isCounterPromotionEnabled() const {
+ if (DoCounterPromotion.getNumOccurrences() > 0)
+ return DoCounterPromotion;
+
+ return Options.DoCounterPromotion;
+}
+
+void InstrProfiling::promoteCounterLoadStores(Function *F) {
+ if (!isCounterPromotionEnabled())
+ return;
+
+ DominatorTree DT(*F);
+ LoopInfo LI(DT);
+ DenseMap<Loop *, SmallVector<LoadStorePair, 8>> LoopPromotionCandidates;
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ if (Options.UseBFIInPromotion) {
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ BPI.reset(new BranchProbabilityInfo(*F, LI, &GetTLI(*F)));
+ BFI.reset(new BlockFrequencyInfo(*F, *BPI, LI));
+ }
+
+ for (const auto &LoadStore : PromotionCandidates) {
+ auto *CounterLoad = LoadStore.first;
+ auto *CounterStore = LoadStore.second;
+ BasicBlock *BB = CounterLoad->getParent();
+ Loop *ParentLoop = LI.getLoopFor(BB);
+ if (!ParentLoop)
+ continue;
+ LoopPromotionCandidates[ParentLoop].emplace_back(CounterLoad, CounterStore);
+ }
+
+ SmallVector<Loop *, 4> Loops = LI.getLoopsInPreorder();
+
+ // Do a post-order traversal of the loops so that counter updates can be
+ // iteratively hoisted outside the loop nest.
+ for (auto *Loop : llvm::reverse(Loops)) {
+ PGOCounterPromoter Promoter(LoopPromotionCandidates, *Loop, LI, BFI.get());
+ Promoter.run(&TotalCountersPromoted);
+ }
+}
+
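For intuition, counter promotion replaces the per-iteration memory update with a register accumulation that is flushed once in the loop exit; handling loops innermost-first lets the flush itself be hoisted again by the enclosing loop. A plain C++ sketch of the effect (the function names and shapes are illustrative, not the actual generated IR):

#include <cstdint>

// Before promotion: every iteration does a load/add/store on the counter.
void hotLoopUnpromoted(uint64_t *Counter, int N) {
  for (int I = 0; I < N; ++I)
    *Counter += 1;
}

// After promotion: the update accumulates in a register and is flushed once
// in the loop exit; for nested loops the flush can be hoisted again.
void hotLoopPromoted(uint64_t *Counter, int N) {
  uint64_t Local = 0;
  for (int I = 0; I < N; ++I)
    Local += 1;
  *Counter += Local;
}
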
+/// Check if the module contains uses of any profiling intrinsics.
+static bool containsProfilingIntrinsics(Module &M) {
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_increment_step)))
+ if (!F->use_empty())
+ return true;
+ if (auto *F = M.getFunction(
+ Intrinsic::getName(llvm::Intrinsic::instrprof_value_profile)))
+ if (!F->use_empty())
+ return true;
+ return false;
+}
+
+bool InstrProfiling::run(
+ Module &M, std::function<const TargetLibraryInfo &(Function &F)> GetTLI) {
+ this->M = &M;
+ this->GetTLI = std::move(GetTLI);
+ NamesVar = nullptr;
+ NamesSize = 0;
+ ProfileDataMap.clear();
+ UsedVars.clear();
+ TT = Triple(M.getTargetTriple());
+
+ // Emit the runtime hook even if no counters are present.
+ bool MadeChange = emitRuntimeHook();
+
+ // Improve compile time by avoiding linear scans when there is no work.
+ GlobalVariable *CoverageNamesVar =
+ M.getNamedGlobal(getCoverageUnusedNamesVarName());
+ if (!containsProfilingIntrinsics(M) && !CoverageNamesVar)
+ return MadeChange;
+
+ // The number of value sites inside the instrumented function is not known
+ // in advance. Count the instrumented target value sites here so the total
+ // can be recorded as a field in the profile data variable.
+ for (Function &F : M) {
+ InstrProfIncrementInst *FirstProfIncInst = nullptr;
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(), E = BB.end(); I != E; I++)
+ if (auto *Ind = dyn_cast<InstrProfValueProfileInst>(I))
+ computeNumValueSiteCounts(Ind);
+ else if (FirstProfIncInst == nullptr)
+ FirstProfIncInst = dyn_cast<InstrProfIncrementInst>(I);
+
+ // Value profiling intrinsic lowering requires per-function profile data
+ // variable to be created first.
+ if (FirstProfIncInst != nullptr)
+ static_cast<void>(getOrCreateRegionCounters(FirstProfIncInst));
+ }
+
+ for (Function &F : M)
+ MadeChange |= lowerIntrinsics(&F);
+
+ if (CoverageNamesVar) {
+ lowerCoverageData(CoverageNamesVar);
+ MadeChange = true;
+ }
+
+ if (!MadeChange)
+ return false;
+
+ emitVNodes();
+ emitNameData();
+ emitRegistration();
+ emitUses();
+ emitInitialization();
+ return true;
+}
+
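In short, run() emits the runtime hook, sizes the per-function value-site arrays, lowers the increment and value-profile intrinsics, and then emits the name data, registration, and initialization machinery. A minimal sketch of invoking this lowering on an already-materialized module through the legacy pass manager (LLVM 12 headers assumed; the wrapper function name is made up):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

// Lower frontend-emitted llvm.instrprof.* intrinsics in M.
static void lowerProfilingIntrinsics(Module &M) {
  legacy::PassManager PM;
  InstrProfOptions Options; // default options: non-atomic counters, etc.
  PM.add(createInstrProfilingLegacyPass(Options, /*IsCS=*/false));
  PM.run(M);
}
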
static FunctionCallee getOrInsertValueProfilingCall(
Module &M, const TargetLibraryInfo &TLI,
ValueProfilingCallType CallType = ValueProfilingCallType::Default) {
- LLVMContext &Ctx = M.getContext();
- auto *ReturnTy = Type::getVoidTy(M.getContext());
-
- AttributeList AL;
- if (auto AK = TLI.getExtAttrForI32Param(false))
- AL = AL.addParamAttribute(M.getContext(), 2, AK);
-
+ LLVMContext &Ctx = M.getContext();
+ auto *ReturnTy = Type::getVoidTy(M.getContext());
+
+ AttributeList AL;
+ if (auto AK = TLI.getExtAttrForI32Param(false))
+ AL = AL.addParamAttribute(M.getContext(), 2, AK);
+
assert((CallType == ValueProfilingCallType::Default ||
CallType == ValueProfilingCallType::MemOp) &&
"Must be Default or MemOp");
Type *ParamTypes[] = {
-#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
-#include "llvm/ProfileData/InstrProfData.inc"
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
};
auto *ValueProfilingCallTy =
FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
@@ -610,501 +610,501 @@ static FunctionCallee getOrInsertValueProfilingCall(
? getInstrProfValueProfFuncName()
: getInstrProfValueProfMemOpFuncName();
return M.getOrInsertFunction(FuncName, ValueProfilingCallTy, AL);
-}
-
-void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
- GlobalVariable *Name = Ind->getName();
- uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
- uint64_t Index = Ind->getIndex()->getZExtValue();
- auto It = ProfileDataMap.find(Name);
- if (It == ProfileDataMap.end()) {
- PerFunctionProfileData PD;
- PD.NumValueSites[ValueKind] = Index + 1;
- ProfileDataMap[Name] = PD;
- } else if (It->second.NumValueSites[ValueKind] <= Index)
- It->second.NumValueSites[ValueKind] = Index + 1;
-}
-
-void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
- GlobalVariable *Name = Ind->getName();
- auto It = ProfileDataMap.find(Name);
- assert(It != ProfileDataMap.end() && It->second.DataVar &&
- "value profiling detected in function with no counter incerement");
-
- GlobalVariable *DataVar = It->second.DataVar;
- uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
- uint64_t Index = Ind->getIndex()->getZExtValue();
- for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
- Index += It->second.NumValueSites[Kind];
-
- IRBuilder<> Builder(Ind);
+}
+
+void InstrProfiling::computeNumValueSiteCounts(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ auto It = ProfileDataMap.find(Name);
+ if (It == ProfileDataMap.end()) {
+ PerFunctionProfileData PD;
+ PD.NumValueSites[ValueKind] = Index + 1;
+ ProfileDataMap[Name] = PD;
+ } else if (It->second.NumValueSites[ValueKind] <= Index)
+ It->second.NumValueSites[ValueKind] = Index + 1;
+}
+
+void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
+ GlobalVariable *Name = Ind->getName();
+ auto It = ProfileDataMap.find(Name);
+ assert(It != ProfileDataMap.end() && It->second.DataVar &&
+ "value profiling detected in function with no counter incerement");
+
+ GlobalVariable *DataVar = It->second.DataVar;
+ uint64_t ValueKind = Ind->getValueKind()->getZExtValue();
+ uint64_t Index = Ind->getIndex()->getZExtValue();
+ for (uint32_t Kind = IPVK_First; Kind < ValueKind; ++Kind)
+ Index += It->second.NumValueSites[Kind];
+
+ IRBuilder<> Builder(Ind);
bool IsMemOpSize = (Ind->getValueKind()->getZExtValue() ==
llvm::InstrProfValueKind::IPVK_MemOPSize);
- CallInst *Call = nullptr;
- auto *TLI = &GetTLI(*Ind->getFunction());
-
- // To support value profiling calls within Windows exception handlers, funclet
- // information contained within operand bundles needs to be copied over to
- // the library call. This is required for the IR to be processed by the
- // WinEHPrepare pass.
- SmallVector<OperandBundleDef, 1> OpBundles;
- Ind->getOperandBundlesAsDefs(OpBundles);
+ CallInst *Call = nullptr;
+ auto *TLI = &GetTLI(*Ind->getFunction());
+
+ // To support value profiling calls within Windows exception handlers, funclet
+ // information contained within operand bundles needs to be copied over to
+ // the library call. This is required for the IR to be processed by the
+ // WinEHPrepare pass.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Ind->getOperandBundlesAsDefs(OpBundles);
if (!IsMemOpSize) {
- Value *Args[3] = {Ind->getTargetValue(),
- Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
- Builder.getInt32(Index)};
- Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args,
- OpBundles);
- } else {
+ Value *Args[3] = {Ind->getTargetValue(),
+ Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+ Builder.getInt32(Index)};
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args,
+ OpBundles);
+ } else {
Value *Args[3] = {Ind->getTargetValue(),
Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
Builder.getInt32(Index)};
Call = Builder.CreateCall(
getOrInsertValueProfilingCall(*M, *TLI, ValueProfilingCallType::MemOp),
Args, OpBundles);
- }
- if (auto AK = TLI->getExtAttrForI32Param(false))
- Call->addParamAttr(2, AK);
- Ind->replaceAllUsesWith(Call);
- Ind->eraseFromParent();
-}
-
-void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
- GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
-
- IRBuilder<> Builder(Inc);
- uint64_t Index = Inc->getIndex()->getZExtValue();
- Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(),
- Counters, 0, Index);
-
- if (isRuntimeCounterRelocationEnabled()) {
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
- Type *Int64PtrTy = Type::getInt64PtrTy(M->getContext());
- Function *Fn = Inc->getParent()->getParent();
- Instruction &I = Fn->getEntryBlock().front();
- LoadInst *LI = dyn_cast<LoadInst>(&I);
- if (!LI) {
- IRBuilder<> Builder(&I);
- Type *Int64Ty = Type::getInt64Ty(M->getContext());
- GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
- if (!Bias) {
- Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
- Constant::getNullValue(Int64Ty),
- getInstrProfCounterBiasVarName());
- Bias->setVisibility(GlobalVariable::HiddenVisibility);
- }
- LI = Builder.CreateLoad(Int64Ty, Bias);
- }
- auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI);
- Addr = Builder.CreateIntToPtr(Add, Int64PtrTy);
- }
-
- if (Options.Atomic || AtomicCounterUpdateAll ||
- (Index == 0 && AtomicFirstCounter)) {
- Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
- AtomicOrdering::Monotonic);
- } else {
- Value *IncStep = Inc->getStep();
- Value *Load = Builder.CreateLoad(IncStep->getType(), Addr, "pgocount");
- auto *Count = Builder.CreateAdd(Load, Inc->getStep());
- auto *Store = Builder.CreateStore(Count, Addr);
- if (isCounterPromotionEnabled())
- PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
- }
- Inc->eraseFromParent();
-}
-
-void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
- ConstantArray *Names =
- cast<ConstantArray>(CoverageNamesVar->getInitializer());
- for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
- Constant *NC = Names->getOperand(I);
- Value *V = NC->stripPointerCasts();
- assert(isa<GlobalVariable>(V) && "Missing reference to function name");
- GlobalVariable *Name = cast<GlobalVariable>(V);
-
- Name->setLinkage(GlobalValue::PrivateLinkage);
- ReferencedNames.push_back(Name);
- NC->dropAllReferences();
- }
- CoverageNamesVar->eraseFromParent();
-}
-
-/// Get the name of a profiling variable for a particular function.
-static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
- StringRef NamePrefix = getInstrProfNameVarPrefix();
- StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
- Function *F = Inc->getParent()->getParent();
- Module *M = F->getParent();
- if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
- !canRenameComdatFunc(*F))
- return (Prefix + Name).str();
- uint64_t FuncHash = Inc->getHash()->getZExtValue();
- SmallVector<char, 24> HashPostfix;
- if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
- return (Prefix + Name).str();
- return (Prefix + Name + "." + Twine(FuncHash)).str();
-}
-
-static inline bool shouldRecordFunctionAddr(Function *F) {
- // Check the linkage
- bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
- if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
- !HasAvailableExternallyLinkage)
- return true;
-
- // A function marked 'alwaysinline' with available_externally linkage can't
- // have its address taken. Doing so would create an undefined external ref to
- // the function, which would fail to link.
- if (HasAvailableExternallyLinkage &&
- F->hasFnAttribute(Attribute::AlwaysInline))
- return false;
-
- // Prohibit function address recording if the function is both internal and
- // COMDAT. This avoids the profile data variable referencing internal symbols
- // in COMDAT.
- if (F->hasLocalLinkage() && F->hasComdat())
- return false;
-
- // Check for uses of this function other than direct calls or invokes to it.
- // Inline virtual functions have linkonce_odr linkage. When a key method
- // exists, the vtable will only be emitted in the TU where the key method
- // is defined. In a TU where the vtable is not available, the function won't
- // be 'address-taken'. If its address is not recorded here, profile data
- // with a missing address may be picked by the linker, leading to missing
- // indirect call target info.
- return F->hasAddressTaken() || F->hasLinkOnceLinkage();
-}
-
-static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
- // Don't do this for Darwin. compiler-rt uses linker magic.
- if (TT.isOSDarwin())
- return false;
- // Use linker script magic to get data/cnts/name start/end.
- if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
- TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
- TT.isOSWindows())
- return false;
-
- return true;
-}
-
-GlobalVariable *
-InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
- GlobalVariable *NamePtr = Inc->getName();
- auto It = ProfileDataMap.find(NamePtr);
- PerFunctionProfileData PD;
- if (It != ProfileDataMap.end()) {
- if (It->second.RegionCounters)
- return It->second.RegionCounters;
- PD = It->second;
- }
-
- // Match the linkage and visibility of the name global. COFF supports using
- // comdats with internal symbols, so do that if we can.
- Function *Fn = Inc->getParent()->getParent();
- GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
- GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
- if (TT.isOSBinFormatCOFF()) {
- Linkage = GlobalValue::InternalLinkage;
- Visibility = GlobalValue::DefaultVisibility;
- }
-
- // Move the name variable to the right section. Place the counter and data
- // variables in a COMDAT group if the associated function is a COMDAT. This
- // makes sure that only one copy of the COMDAT function's counters is emitted
- // after linking. Keep in mind that this pass may run before the inliner, so
- // we need to create a new comdat group for the counters and profiling data.
- // If we used the comdat of the parent function, that would result in
- // relocations against discarded sections.
- bool NeedComdat = needsComdatForCounter(*Fn, *M);
- if (NeedComdat) {
- if (TT.isOSBinFormatCOFF()) {
- // For COFF, put the counters, data, and values each into their own
- // comdats. We can't use a group because the Visual C++ linker will
- // report duplicate symbol errors if there are multiple external symbols
- // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE.
- Linkage = GlobalValue::LinkOnceODRLinkage;
- Visibility = GlobalValue::HiddenVisibility;
- }
- }
+ }
+ if (auto AK = TLI->getExtAttrForI32Param(false))
+ Call->addParamAttr(2, AK);
+ Ind->replaceAllUsesWith(Call);
+ Ind->eraseFromParent();
+}
+
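For reference, the calls built by getOrInsertValueProfilingCall resolve to the profile runtime's value-profiling entry points. A hedged sketch of their shape (the names and signatures follow compiler-rt's profile runtime as understood here and should be treated as assumptions, not normative declarations):

#include <cstdint>

// Records one observed value (e.g. an indirect-call target) at value site
// CounterIndex of the function described by Data (a __profd_* pointer).
extern "C" void __llvm_profile_instrument_target(uint64_t TargetValue,
                                                 void *Data,
                                                 uint32_t CounterIndex);

// Variant used for memory-intrinsic size (memop) profiling.
extern "C" void __llvm_profile_instrument_memop(uint64_t TargetValue,
                                                void *Data,
                                                uint32_t CounterIndex);
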
+void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
+ GlobalVariable *Counters = getOrCreateRegionCounters(Inc);
+
+ IRBuilder<> Builder(Inc);
+ uint64_t Index = Inc->getIndex()->getZExtValue();
+ Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(),
+ Counters, 0, Index);
+
+ if (isRuntimeCounterRelocationEnabled()) {
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Type *Int64PtrTy = Type::getInt64PtrTy(M->getContext());
+ Function *Fn = Inc->getParent()->getParent();
+ Instruction &I = Fn->getEntryBlock().front();
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI) {
+ IRBuilder<> Builder(&I);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
+ if (!Bias) {
+ Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(Int64Ty),
+ getInstrProfCounterBiasVarName());
+ Bias->setVisibility(GlobalVariable::HiddenVisibility);
+ }
+ LI = Builder.CreateLoad(Int64Ty, Bias);
+ }
+ auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI);
+ Addr = Builder.CreateIntToPtr(Add, Int64PtrTy);
+ }
+
+ if (Options.Atomic || AtomicCounterUpdateAll ||
+ (Index == 0 && AtomicFirstCounter)) {
+ Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
+ AtomicOrdering::Monotonic);
+ } else {
+ Value *IncStep = Inc->getStep();
+ Value *Load = Builder.CreateLoad(IncStep->getType(), Addr, "pgocount");
+ auto *Count = Builder.CreateAdd(Load, Inc->getStep());
+ auto *Store = Builder.CreateStore(Count, Addr);
+ if (isCounterPromotionEnabled())
+ PromotionCandidates.emplace_back(cast<Instruction>(Load), Store);
+ }
+ Inc->eraseFromParent();
+}
+
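The lowered update has two shapes, and the bias indirection only appears when runtime counter relocation is enabled. A C++ sketch of the semantics (the array, the helper, and the zero-initialized bias are illustrative; in a real build the bias is provided by the profile runtime):

#include <cstdint>

uint64_t __profc_example[4];              // per-function counter array
uint64_t __llvm_profile_counter_bias = 0; // normally defined by the runtime

void incrementCounter(uint64_t Index, uint64_t Step, bool Relocate,
                      bool Atomic) {
  uint64_t *Addr = &__profc_example[Index];
  if (Relocate) // counter address = symbol address + bias from the entry block
    Addr = reinterpret_cast<uint64_t *>(
        reinterpret_cast<uintptr_t>(Addr) + __llvm_profile_counter_bias);
  if (Atomic)   // corresponds to: atomicrmw add ... monotonic
    __atomic_fetch_add(Addr, Step, __ATOMIC_RELAXED); // Clang/GCC builtin
  else          // corresponds to: load "pgocount"; add; store (promotable)
    *Addr += Step;
}
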
+void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
+ ConstantArray *Names =
+ cast<ConstantArray>(CoverageNamesVar->getInitializer());
+ for (unsigned I = 0, E = Names->getNumOperands(); I < E; ++I) {
+ Constant *NC = Names->getOperand(I);
+ Value *V = NC->stripPointerCasts();
+ assert(isa<GlobalVariable>(V) && "Missing reference to function name");
+ GlobalVariable *Name = cast<GlobalVariable>(V);
+
+ Name->setLinkage(GlobalValue::PrivateLinkage);
+ ReferencedNames.push_back(Name);
+ NC->dropAllReferences();
+ }
+ CoverageNamesVar->eraseFromParent();
+}
+
+/// Get the name of a profiling variable for a particular function.
+static std::string getVarName(InstrProfIncrementInst *Inc, StringRef Prefix) {
+ StringRef NamePrefix = getInstrProfNameVarPrefix();
+ StringRef Name = Inc->getName()->getName().substr(NamePrefix.size());
+ Function *F = Inc->getParent()->getParent();
+ Module *M = F->getParent();
+ if (!DoHashBasedCounterSplit || !isIRPGOFlagSet(M) ||
+ !canRenameComdatFunc(*F))
+ return (Prefix + Name).str();
+ uint64_t FuncHash = Inc->getHash()->getZExtValue();
+ SmallVector<char, 24> HashPostfix;
+ if (Name.endswith((Twine(".") + Twine(FuncHash)).toStringRef(HashPostfix)))
+ return (Prefix + Name).str();
+ return (Prefix + Name + "." + Twine(FuncHash)).str();
+}
+
+static inline bool shouldRecordFunctionAddr(Function *F) {
+ // Check the linkage
+ bool HasAvailableExternallyLinkage = F->hasAvailableExternallyLinkage();
+ if (!F->hasLinkOnceLinkage() && !F->hasLocalLinkage() &&
+ !HasAvailableExternallyLinkage)
+ return true;
+
+ // A function marked 'alwaysinline' with available_externally linkage can't
+ // have its address taken. Doing so would create an undefined external ref to
+ // the function, which would fail to link.
+ if (HasAvailableExternallyLinkage &&
+ F->hasFnAttribute(Attribute::AlwaysInline))
+ return false;
+
+ // Prohibit function address recording if the function is both internal and
+ // COMDAT. This avoids the profile data variable referencing internal symbols
+ // in COMDAT.
+ if (F->hasLocalLinkage() && F->hasComdat())
+ return false;
+
+ // Check for uses of this function other than direct calls or invokes to it.
+ // Inline virtual functions have linkonce_odr linkage. When a key method
+ // exists, the vtable will only be emitted in the TU where the key method
+ // is defined. In a TU where the vtable is not available, the function won't
+ // be 'address-taken'. If its address is not recorded here, profile data
+ // with a missing address may be picked by the linker, leading to missing
+ // indirect call target info.
+ return F->hasAddressTaken() || F->hasLinkOnceLinkage();
+}
+
+static bool needsRuntimeRegistrationOfSectionRange(const Triple &TT) {
+ // Don't do this for Darwin. compiler-rt uses linker magic.
+ if (TT.isOSDarwin())
+ return false;
+ // Use linker script magic to get data/cnts/name start/end.
+ if (TT.isOSLinux() || TT.isOSFreeBSD() || TT.isOSNetBSD() ||
+ TT.isOSSolaris() || TT.isOSFuchsia() || TT.isPS4CPU() ||
+ TT.isOSWindows())
+ return false;
+
+ return true;
+}
+
+GlobalVariable *
+InstrProfiling::getOrCreateRegionCounters(InstrProfIncrementInst *Inc) {
+ GlobalVariable *NamePtr = Inc->getName();
+ auto It = ProfileDataMap.find(NamePtr);
+ PerFunctionProfileData PD;
+ if (It != ProfileDataMap.end()) {
+ if (It->second.RegionCounters)
+ return It->second.RegionCounters;
+ PD = It->second;
+ }
+
+ // Match the linkage and visibility of the name global. COFF supports using
+ // comdats with internal symbols, so do that if we can.
+ Function *Fn = Inc->getParent()->getParent();
+ GlobalValue::LinkageTypes Linkage = NamePtr->getLinkage();
+ GlobalValue::VisibilityTypes Visibility = NamePtr->getVisibility();
+ if (TT.isOSBinFormatCOFF()) {
+ Linkage = GlobalValue::InternalLinkage;
+ Visibility = GlobalValue::DefaultVisibility;
+ }
+
+ // Move the name variable to the right section. Place the counter and data
+ // variables in a COMDAT group if the associated function is a COMDAT. This
+ // makes sure that only one copy of the COMDAT function's counters is emitted
+ // after linking. Keep in mind that this pass may run before the inliner, so
+ // we need to create a new comdat group for the counters and profiling data.
+ // If we used the comdat of the parent function, that would result in
+ // relocations against discarded sections.
+ bool NeedComdat = needsComdatForCounter(*Fn, *M);
+ if (NeedComdat) {
+ if (TT.isOSBinFormatCOFF()) {
+ // For COFF, put the counters, data, and values each into their own
+ // comdats. We can't use a group because the Visual C++ linker will
+ // report duplicate symbol errors if there are multiple external symbols
+ // with the same name marked IMAGE_COMDAT_SELECT_ASSOCIATIVE.
+ Linkage = GlobalValue::LinkOnceODRLinkage;
+ Visibility = GlobalValue::HiddenVisibility;
+ }
+ }
std::string DataVarName = getVarName(Inc, getInstrProfDataVarPrefix());
- auto MaybeSetComdat = [=](GlobalVariable *GV) {
- if (NeedComdat)
+ auto MaybeSetComdat = [=](GlobalVariable *GV) {
+ if (NeedComdat)
GV->setComdat(M->getOrInsertComdat(TT.isOSBinFormatCOFF() ? GV->getName()
: DataVarName));
- };
-
- uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
- LLVMContext &Ctx = M->getContext();
- ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
-
- // Create the counters variable.
- auto *CounterPtr =
- new GlobalVariable(*M, CounterTy, false, Linkage,
- Constant::getNullValue(CounterTy),
- getVarName(Inc, getInstrProfCountersVarPrefix()));
- CounterPtr->setVisibility(Visibility);
- CounterPtr->setSection(
- getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
- CounterPtr->setAlignment(Align(8));
- MaybeSetComdat(CounterPtr);
- CounterPtr->setLinkage(Linkage);
-
- auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
- // Allocate statically the array of pointers to value profile nodes for
- // the current function.
- Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
- if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(TT)) {
- uint64_t NS = 0;
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- NS += PD.NumValueSites[Kind];
- if (NS) {
- ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
-
- auto *ValuesVar =
- new GlobalVariable(*M, ValuesTy, false, Linkage,
- Constant::getNullValue(ValuesTy),
- getVarName(Inc, getInstrProfValuesVarPrefix()));
- ValuesVar->setVisibility(Visibility);
- ValuesVar->setSection(
- getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
- ValuesVar->setAlignment(Align(8));
- MaybeSetComdat(ValuesVar);
- ValuesPtrExpr =
- ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
- }
- }
-
- // Create data variable.
- auto *Int16Ty = Type::getInt16Ty(Ctx);
- auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
- Type *DataTypes[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
-
- Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
- ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
- : ConstantPointerNull::get(Int8PtrTy);
-
- Constant *Int16ArrayVals[IPVK_Last + 1];
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
-
- Constant *DataVals[] = {
-#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
+ };
+
+ uint64_t NumCounters = Inc->getNumCounters()->getZExtValue();
+ LLVMContext &Ctx = M->getContext();
+ ArrayType *CounterTy = ArrayType::get(Type::getInt64Ty(Ctx), NumCounters);
+
+ // Create the counters variable.
+ auto *CounterPtr =
+ new GlobalVariable(*M, CounterTy, false, Linkage,
+ Constant::getNullValue(CounterTy),
+ getVarName(Inc, getInstrProfCountersVarPrefix()));
+ CounterPtr->setVisibility(Visibility);
+ CounterPtr->setSection(
+ getInstrProfSectionName(IPSK_cnts, TT.getObjectFormat()));
+ CounterPtr->setAlignment(Align(8));
+ MaybeSetComdat(CounterPtr);
+ CounterPtr->setLinkage(Linkage);
+
+ auto *Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ // Allocate statically the array of pointers to value profile nodes for
+ // the current function.
+ Constant *ValuesPtrExpr = ConstantPointerNull::get(Int8PtrTy);
+ if (ValueProfileStaticAlloc && !needsRuntimeRegistrationOfSectionRange(TT)) {
+ uint64_t NS = 0;
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ NS += PD.NumValueSites[Kind];
+ if (NS) {
+ ArrayType *ValuesTy = ArrayType::get(Type::getInt64Ty(Ctx), NS);
+
+ auto *ValuesVar =
+ new GlobalVariable(*M, ValuesTy, false, Linkage,
+ Constant::getNullValue(ValuesTy),
+ getVarName(Inc, getInstrProfValuesVarPrefix()));
+ ValuesVar->setVisibility(Visibility);
+ ValuesVar->setSection(
+ getInstrProfSectionName(IPSK_vals, TT.getObjectFormat()));
+ ValuesVar->setAlignment(Align(8));
+ MaybeSetComdat(ValuesVar);
+ ValuesPtrExpr =
+ ConstantExpr::getBitCast(ValuesVar, Type::getInt8PtrTy(Ctx));
+ }
+ }
+
+ // Create data variable.
+ auto *Int16Ty = Type::getInt16Ty(Ctx);
+ auto *Int16ArrayTy = ArrayType::get(Int16Ty, IPVK_Last + 1);
+ Type *DataTypes[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *DataTy = StructType::get(Ctx, makeArrayRef(DataTypes));
+
+ Constant *FunctionAddr = shouldRecordFunctionAddr(Fn)
+ ? ConstantExpr::getBitCast(Fn, Int8PtrTy)
+ : ConstantPointerNull::get(Int8PtrTy);
+
+ Constant *Int16ArrayVals[IPVK_Last + 1];
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ Int16ArrayVals[Kind] = ConstantInt::get(Int16Ty, PD.NumValueSites[Kind]);
+
+ Constant *DataVals[] = {
+#define INSTR_PROF_DATA(Type, LLVMType, Name, Init) Init,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
auto *Data =
new GlobalVariable(*M, DataTy, false, Linkage,
ConstantStruct::get(DataTy, DataVals), DataVarName);
- Data->setVisibility(Visibility);
- Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
- Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
- MaybeSetComdat(Data);
- Data->setLinkage(Linkage);
-
- PD.RegionCounters = CounterPtr;
- PD.DataVar = Data;
- ProfileDataMap[NamePtr] = PD;
-
- // Mark the data variable as used so that it isn't stripped out.
- UsedVars.push_back(Data);
- // Now that the linkage set by the FE has been passed to the data and counter
- // variables, reset Name variable's linkage and visibility to private so that
- // it can be removed later by the compiler.
- NamePtr->setLinkage(GlobalValue::PrivateLinkage);
- // Collect the referenced names to be used by emitNameData.
- ReferencedNames.push_back(NamePtr);
-
- return CounterPtr;
-}
-
-void InstrProfiling::emitVNodes() {
- if (!ValueProfileStaticAlloc)
- return;
-
- // For now only support this on platforms that do
- // not require runtime registration to discover
- // named section start/end.
- if (needsRuntimeRegistrationOfSectionRange(TT))
- return;
-
- size_t TotalNS = 0;
- for (auto &PD : ProfileDataMap) {
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- TotalNS += PD.second.NumValueSites[Kind];
- }
-
- if (!TotalNS)
- return;
-
- uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
-// Heuristic for small programs with very few total value sites.
-// The default value of vp-counters-per-site is chosen based on
-// the observation that large apps usually have a low percentage
-// of value sites that actually have any profile data, and thus
-// the average number of counters per site is low. For small
-// apps with very few sites, this may not be true. Bump up the
-// number of counters in this case.
-#define INSTR_PROF_MIN_VAL_COUNTS 10
- if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
- NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
-
- auto &Ctx = M->getContext();
- Type *VNodeTypes[] = {
-#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
-#include "llvm/ProfileData/InstrProfData.inc"
- };
- auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
-
- ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
- auto *VNodesVar = new GlobalVariable(
- *M, VNodesTy, false, GlobalValue::PrivateLinkage,
- Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
- VNodesVar->setSection(
- getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
- UsedVars.push_back(VNodesVar);
-}
-
-void InstrProfiling::emitNameData() {
- std::string UncompressedData;
-
- if (ReferencedNames.empty())
- return;
-
- std::string CompressedNameStr;
- if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
- DoInstrProfNameCompression)) {
- report_fatal_error(toString(std::move(E)), false);
- }
-
- auto &Ctx = M->getContext();
- auto *NamesVal = ConstantDataArray::getString(
- Ctx, StringRef(CompressedNameStr), false);
- NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
- GlobalValue::PrivateLinkage, NamesVal,
- getInstrProfNamesVarName());
- NamesSize = CompressedNameStr.size();
- NamesVar->setSection(
- getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
- // On COFF, it's important to reduce the alignment down to 1 to prevent the
- // linker from inserting padding before the start of the names section or
- // between names entries.
- NamesVar->setAlignment(Align(1));
- UsedVars.push_back(NamesVar);
-
- for (auto *NamePtr : ReferencedNames)
- NamePtr->eraseFromParent();
-}
-
-void InstrProfiling::emitRegistration() {
- if (!needsRuntimeRegistrationOfSectionRange(TT))
- return;
-
- // Construct the function.
- auto *VoidTy = Type::getVoidTy(M->getContext());
- auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
- auto *Int64Ty = Type::getInt64Ty(M->getContext());
- auto *RegisterFTy = FunctionType::get(VoidTy, false);
- auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
- getInstrProfRegFuncsName(), M);
- RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- if (Options.NoRedZone)
- RegisterF->addFnAttr(Attribute::NoRedZone);
-
- auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
- auto *RuntimeRegisterF =
- Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
- getInstrProfRegFuncName(), M);
-
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
- for (Value *Data : UsedVars)
- if (Data != NamesVar && !isa<Function>(Data))
- IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
-
- if (NamesVar) {
- Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
- auto *NamesRegisterTy =
- FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
- auto *NamesRegisterF =
- Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
- getInstrProfNamesRegFuncName(), M);
- IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
- IRB.getInt64(NamesSize)});
- }
-
- IRB.CreateRetVoid();
-}
-
-bool InstrProfiling::emitRuntimeHook() {
- // We expect the linker to be invoked with -u<hook_var> flag for Linux or
- // Fuchsia, in which case there is no need to emit the user function.
- if (TT.isOSLinux() || TT.isOSFuchsia())
- return false;
-
- // If the module's provided its own runtime, we don't need to do anything.
- if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
- return false;
-
- // Declare an external variable that will pull in the runtime initialization.
- auto *Int32Ty = Type::getInt32Ty(M->getContext());
- auto *Var =
- new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
- nullptr, getInstrProfRuntimeHookVarName());
-
- // Make a function that uses it.
- auto *User = Function::Create(FunctionType::get(Int32Ty, false),
- GlobalValue::LinkOnceODRLinkage,
- getInstrProfRuntimeHookVarUseFuncName(), M);
- User->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- User->addFnAttr(Attribute::NoRedZone);
- User->setVisibility(GlobalValue::HiddenVisibility);
- if (TT.supportsCOMDAT())
- User->setComdat(M->getOrInsertComdat(User->getName()));
-
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
- auto *Load = IRB.CreateLoad(Int32Ty, Var);
- IRB.CreateRet(Load);
-
- // Mark the user variable as used so that it isn't stripped out.
- UsedVars.push_back(User);
- return true;
-}
-
-void InstrProfiling::emitUses() {
- if (!UsedVars.empty())
- appendToUsed(*M, UsedVars);
-}
-
-void InstrProfiling::emitInitialization() {
- // Create the ProfileFileName variable. Don't do this for the
- // context-sensitive instrumentation lowering: that lowering runs after
- // LTO/ThinLTO linking, and PGOInstrumentationGenCreateVar should
- // have already created the variable before LTO/ThinLTO linking.
- if (!IsCS)
- createProfileFileNameVar(*M, Options.InstrProfileOutput);
- Function *RegisterF = M->getFunction(getInstrProfRegFuncsName());
- if (!RegisterF)
- return;
-
- // Create the initialization function.
- auto *VoidTy = Type::getVoidTy(M->getContext());
- auto *F = Function::Create(FunctionType::get(VoidTy, false),
- GlobalValue::InternalLinkage,
- getInstrProfInitFuncName(), M);
- F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- F->addFnAttr(Attribute::NoInline);
- if (Options.NoRedZone)
- F->addFnAttr(Attribute::NoRedZone);
-
- // Add the basic block and the necessary calls.
- IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
- IRB.CreateCall(RegisterF, {});
- IRB.CreateRetVoid();
-
- appendToGlobalCtors(*M, F, 0);
-}
+ Data->setVisibility(Visibility);
+ Data->setSection(getInstrProfSectionName(IPSK_data, TT.getObjectFormat()));
+ Data->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT));
+ MaybeSetComdat(Data);
+ Data->setLinkage(Linkage);
+
+ PD.RegionCounters = CounterPtr;
+ PD.DataVar = Data;
+ ProfileDataMap[NamePtr] = PD;
+
+ // Mark the data variable as used so that it isn't stripped out.
+ UsedVars.push_back(Data);
+ // Now that the linkage set by the FE has been passed to the data and counter
+ // variables, reset Name variable's linkage and visibility to private so that
+ // it can be removed later by the compiler.
+ NamePtr->setLinkage(GlobalValue::PrivateLinkage);
+ // Collect the referenced names to be used by emitNameData.
+ ReferencedNames.push_back(NamePtr);
+
+ return CounterPtr;
+}
+
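The net effect per instrumented function is a small family of globals: a __profc_* counter array, a __profd_* descriptor that points at it, plus the shared name data emitted later. A rough C++ sketch of the shapes involved (the field list paraphrases the INSTR_PROF_DATA entries of InstrProfData.inc for this LLVM version; section names and example values are assumptions for illustration):

#include <cstdint>

// Counters for a hypothetical function 'foo' with 4 counters; the real
// variable is placed in the counters section (__llvm_prf_cnts on ELF).
uint64_t __profc_foo[4];

// Mirrors the __profd_* struct assembled from InstrProfData.inc.
struct ProfileData {
  uint64_t NameRef;            // MD5 hash of the function's PGO name
  uint64_t FuncHash;           // structural hash from the increment intrinsic
  const uint64_t *CounterPtr;  // -> __profc_foo
  const void *FunctionPointer; // recorded only if shouldRecordFunctionAddr()
  const void *Values;          // value-profile node pointers, if any
  uint32_t NumCounters;
  uint16_t NumValueSites[2];   // one entry per value kind
};

// The real __profd_foo is placed in the data section; values are made up.
ProfileData __profd_foo = {0x1234, 0x9abc, __profc_foo, nullptr, nullptr,
                           4, {0, 0}};
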
+void InstrProfiling::emitVNodes() {
+ if (!ValueProfileStaticAlloc)
+ return;
+
+ // For now only support this on platforms that do
+ // not require runtime registration to discover
+ // named section start/end.
+ if (needsRuntimeRegistrationOfSectionRange(TT))
+ return;
+
+ size_t TotalNS = 0;
+ for (auto &PD : ProfileDataMap) {
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ TotalNS += PD.second.NumValueSites[Kind];
+ }
+
+ if (!TotalNS)
+ return;
+
+ uint64_t NumCounters = TotalNS * NumCountersPerValueSite;
+// Heuristic for small programs with very few total value sites.
+// The default value of vp-counters-per-site is chosen based on
+// the observation that large apps usually have a low percentage
+// of value sites that actually have any profile data, and thus
+// the average number of counters per site is low. For small
+// apps with very few sites, this may not be true. Bump up the
+// number of counters in this case.
+#define INSTR_PROF_MIN_VAL_COUNTS 10
+ if (NumCounters < INSTR_PROF_MIN_VAL_COUNTS)
+ NumCounters = std::max(INSTR_PROF_MIN_VAL_COUNTS, (int)NumCounters * 2);
+
+ auto &Ctx = M->getContext();
+ Type *VNodeTypes[] = {
+#define INSTR_PROF_VALUE_NODE(Type, LLVMType, Name, Init) LLVMType,
+#include "llvm/ProfileData/InstrProfData.inc"
+ };
+ auto *VNodeTy = StructType::get(Ctx, makeArrayRef(VNodeTypes));
+
+ ArrayType *VNodesTy = ArrayType::get(VNodeTy, NumCounters);
+ auto *VNodesVar = new GlobalVariable(
+ *M, VNodesTy, false, GlobalValue::PrivateLinkage,
+ Constant::getNullValue(VNodesTy), getInstrProfVNodesVarName());
+ VNodesVar->setSection(
+ getInstrProfSectionName(IPSK_vnodes, TT.getObjectFormat()));
+ UsedVars.push_back(VNodesVar);
+}
+
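A worked example of the sizing heuristic above, treating the counters-per-site factor as an illustrative assumption rather than the actual vp-counters-per-site default:

#include <algorithm>
#include <cstdint>

// Mirrors the bump above: very small totals get at least
// INSTR_PROF_MIN_VAL_COUNTS (10) nodes, or double the computed count.
uint64_t sizeVNodeArray(uint64_t TotalValueSites, uint64_t CountersPerSite) {
  uint64_t NumCounters = TotalValueSites * CountersPerSite;
  if (NumCounters < 10)
    NumCounters = std::max<uint64_t>(10, NumCounters * 2);
  return NumCounters;
}
// e.g. sizeVNodeArray(1, 8)  == 16   (8 < 10, so max(10, 2 * 8))
//      sizeVNodeArray(50, 8) == 400  (unchanged)
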
+void InstrProfiling::emitNameData() {
+ std::string UncompressedData;
+
+ if (ReferencedNames.empty())
+ return;
+
+ std::string CompressedNameStr;
+ if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
+ DoInstrProfNameCompression)) {
+ report_fatal_error(toString(std::move(E)), false);
+ }
+
+ auto &Ctx = M->getContext();
+ auto *NamesVal = ConstantDataArray::getString(
+ Ctx, StringRef(CompressedNameStr), false);
+ NamesVar = new GlobalVariable(*M, NamesVal->getType(), true,
+ GlobalValue::PrivateLinkage, NamesVal,
+ getInstrProfNamesVarName());
+ NamesSize = CompressedNameStr.size();
+ NamesVar->setSection(
+ getInstrProfSectionName(IPSK_name, TT.getObjectFormat()));
+ // On COFF, it's important to reduce the alignment down to 1 to prevent the
+ // linker from inserting padding before the start of the names section or
+ // between names entries.
+ NamesVar->setAlignment(Align(1));
+ UsedVars.push_back(NamesVar);
+
+ for (auto *NamePtr : ReferencedNames)
+ NamePtr->eraseFromParent();
+}
+
+void InstrProfiling::emitRegistration() {
+ if (!needsRuntimeRegistrationOfSectionRange(TT))
+ return;
+
+ // Construct the function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *VoidPtrTy = Type::getInt8PtrTy(M->getContext());
+ auto *Int64Ty = Type::getInt64Ty(M->getContext());
+ auto *RegisterFTy = FunctionType::get(VoidTy, false);
+ auto *RegisterF = Function::Create(RegisterFTy, GlobalValue::InternalLinkage,
+ getInstrProfRegFuncsName(), M);
+ RegisterF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ if (Options.NoRedZone)
+ RegisterF->addFnAttr(Attribute::NoRedZone);
+
+ auto *RuntimeRegisterTy = FunctionType::get(VoidTy, VoidPtrTy, false);
+ auto *RuntimeRegisterF =
+ Function::Create(RuntimeRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfRegFuncName(), M);
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", RegisterF));
+ for (Value *Data : UsedVars)
+ if (Data != NamesVar && !isa<Function>(Data))
+ IRB.CreateCall(RuntimeRegisterF, IRB.CreateBitCast(Data, VoidPtrTy));
+
+ if (NamesVar) {
+ Type *ParamTypes[] = {VoidPtrTy, Int64Ty};
+ auto *NamesRegisterTy =
+ FunctionType::get(VoidTy, makeArrayRef(ParamTypes), false);
+ auto *NamesRegisterF =
+ Function::Create(NamesRegisterTy, GlobalVariable::ExternalLinkage,
+ getInstrProfNamesRegFuncName(), M);
+ IRB.CreateCall(NamesRegisterF, {IRB.CreateBitCast(NamesVar, VoidPtrTy),
+ IRB.getInt64(NamesSize)});
+ }
+
+ IRB.CreateRetVoid();
+}
+
+bool InstrProfiling::emitRuntimeHook() {
+ // We expect the linker to be invoked with -u<hook_var> flag for Linux or
+ // Fuchsia, in which case there is no need to emit the user function.
+ if (TT.isOSLinux() || TT.isOSFuchsia())
+ return false;
+
+ // If the module's provided its own runtime, we don't need to do anything.
+ if (M->getGlobalVariable(getInstrProfRuntimeHookVarName()))
+ return false;
+
+ // Declare an external variable that will pull in the runtime initialization.
+ auto *Int32Ty = Type::getInt32Ty(M->getContext());
+ auto *Var =
+ new GlobalVariable(*M, Int32Ty, false, GlobalValue::ExternalLinkage,
+ nullptr, getInstrProfRuntimeHookVarName());
+
+ // Make a function that uses it.
+ auto *User = Function::Create(FunctionType::get(Int32Ty, false),
+ GlobalValue::LinkOnceODRLinkage,
+ getInstrProfRuntimeHookVarUseFuncName(), M);
+ User->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ User->addFnAttr(Attribute::NoRedZone);
+ User->setVisibility(GlobalValue::HiddenVisibility);
+ if (TT.supportsCOMDAT())
+ User->setComdat(M->getOrInsertComdat(User->getName()));
+
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", User));
+ auto *Load = IRB.CreateLoad(Int32Ty, Var);
+ IRB.CreateRet(Load);
+
+ // Mark the user variable as used so that it isn't stripped out.
+ UsedVars.push_back(User);
+ return true;
+}
+
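What the hook amounts to, sketched as source-level C++ (the two symbol names follow the __llvm_profile_runtime convention; the snippet illustrates the emitted IR rather than reproducing it):

// External variable defined by the profile runtime; referencing it forces the
// runtime and its initialization to be linked in.
extern "C" int __llvm_profile_runtime;

// Hidden linkonce_odr "user" function that keeps the reference alive on
// targets where the -u<hook_var> linker flag is not used.
extern "C" int __llvm_profile_runtime_user() { return __llvm_profile_runtime; }
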
+void InstrProfiling::emitUses() {
+ if (!UsedVars.empty())
+ appendToUsed(*M, UsedVars);
+}
+
+void InstrProfiling::emitInitialization() {
+ // Create the ProfileFileName variable. Don't do this for the
+ // context-sensitive instrumentation lowering: that lowering runs after
+ // LTO/ThinLTO linking, and PGOInstrumentationGenCreateVar should
+ // have already created the variable before LTO/ThinLTO linking.
+ if (!IsCS)
+ createProfileFileNameVar(*M, Options.InstrProfileOutput);
+ Function *RegisterF = M->getFunction(getInstrProfRegFuncsName());
+ if (!RegisterF)
+ return;
+
+ // Create the initialization function.
+ auto *VoidTy = Type::getVoidTy(M->getContext());
+ auto *F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage,
+ getInstrProfInitFuncName(), M);
+ F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ F->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ F->addFnAttr(Attribute::NoRedZone);
+
+ // Add the basic block and the necessary calls.
+ IRBuilder<> IRB(BasicBlock::Create(M->getContext(), "", F));
+ IRB.CreateCall(RegisterF, {});
+ IRB.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
index 08137cf836..cfdf3cad97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -1,131 +1,131 @@
-//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the common initialization infrastructure for the
-// Instrumentation library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
-
-using namespace llvm;
-
-/// Moves I before IP. Returns new insert point.
-static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) {
- // If I is IP, move the insert point down.
- if (I == IP) {
- ++IP;
- } else {
- // Otherwise, move I before IP and return IP.
- I->moveBefore(&*IP);
- }
- return IP;
-}
-
-/// Instrumentation passes often insert conditional checks into entry blocks.
-/// Call this function before splitting the entry block to move instructions
-/// that must remain in the entry block up before the split point. Static
-/// allocas and llvm.localescape calls, for example, must remain in the entry
-/// block.
-BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
- BasicBlock::iterator IP) {
- assert(&BB.getParent()->getEntryBlock() == &BB);
- for (auto I = IP, E = BB.end(); I != E; ++I) {
- bool KeepInEntry = false;
- if (auto *AI = dyn_cast<AllocaInst>(I)) {
- if (AI->isStaticAlloca())
- KeepInEntry = true;
- } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
- KeepInEntry = true;
- }
- if (KeepInEntry)
- IP = moveBeforeInsertPoint(I, IP);
- }
- return IP;
-}
-
-// Create a constant for Str so that we can pass it to the run-time lib.
-GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
- bool AllowMerging,
- const char *NamePrefix) {
- Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
- // We use private linkage for module-local strings. If they can be merged
- // with another one, we set the unnamed_addr attribute.
- GlobalVariable *GV =
- new GlobalVariable(M, StrConst->getType(), true,
- GlobalValue::PrivateLinkage, StrConst, NamePrefix);
- if (AllowMerging)
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(Align(1)); // Strings may not be merged w/o setting
- // alignment explicitly.
- return GV;
-}
-
-Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
- const std::string &ModuleId) {
- if (auto Comdat = F.getComdat()) return Comdat;
- assert(F.hasName());
- Module *M = F.getParent();
- std::string Name = std::string(F.getName());
-
- // Make a unique comdat name for internal linkage things on ELF. On COFF, the
- // name of the comdat group identifies the leader symbol of the comdat group.
- // The linkage of the leader symbol is considered during comdat resolution,
- // and internal symbols with the same name from different objects will not be
- // merged.
- if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
- if (ModuleId.empty())
- return nullptr;
- Name += ModuleId;
- }
-
- // Make a new comdat for the function. Use the "no duplicates" selection kind
- // for non-weak symbols if the object file format supports it.
- Comdat *C = M->getOrInsertComdat(Name);
- if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
- C->setSelectionKind(Comdat::NoDuplicates);
- F.setComdat(C);
- return C;
-}
-
-/// initializeInstrumentation - Initialize all passes in the Instrumentation
-/// library.
-void llvm::initializeInstrumentation(PassRegistry &Registry) {
- initializeAddressSanitizerLegacyPassPass(Registry);
- initializeModuleAddressSanitizerLegacyPassPass(Registry);
+//===-- Instrumentation.cpp - TransformUtils Infrastructure ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// Instrumentation library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// Moves I before IP. Returns new insert point.
+static BasicBlock::iterator moveBeforeInsertPoint(BasicBlock::iterator I, BasicBlock::iterator IP) {
+ // If I is IP, move the insert point down.
+ if (I == IP) {
+ ++IP;
+ } else {
+ // Otherwise, move I before IP and return IP.
+ I->moveBefore(&*IP);
+ }
+ return IP;
+}
+
+/// Instrumentation passes often insert conditional checks into entry blocks.
+/// Call this function before splitting the entry block to move instructions
+/// that must remain in the entry block up before the split point. Static
+/// allocas and llvm.localescape calls, for example, must remain in the entry
+/// block.
+BasicBlock::iterator llvm::PrepareToSplitEntryBlock(BasicBlock &BB,
+ BasicBlock::iterator IP) {
+ assert(&BB.getParent()->getEntryBlock() == &BB);
+ for (auto I = IP, E = BB.end(); I != E; ++I) {
+ bool KeepInEntry = false;
+ if (auto *AI = dyn_cast<AllocaInst>(I)) {
+ if (AI->isStaticAlloca())
+ KeepInEntry = true;
+ } else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == llvm::Intrinsic::localescape)
+ KeepInEntry = true;
+ }
+ if (KeepInEntry)
+ IP = moveBeforeInsertPoint(I, IP);
+ }
+ return IP;
+}
+
+// Create a constant for Str so that we can pass it to the run-time lib.
+GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
+ bool AllowMerging,
+ const char *NamePrefix) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ // We use private linkage for module-local strings. If they can be merged
+ // with another one, we set the unnamed_addr attribute.
+ GlobalVariable *GV =
+ new GlobalVariable(M, StrConst->getType(), true,
+ GlobalValue::PrivateLinkage, StrConst, NamePrefix);
+ if (AllowMerging)
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ GV->setAlignment(Align(1)); // Strings may not be merged w/o setting
+ // alignment explicitly.
+ return GV;
+}
+
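A hypothetical use of this helper from an instrumentation pass: embed a module-local string and hand an i8* to it to a runtime call. The wrapper name and prefix are made up, and the Module and IRBuilder are assumed to be set up by the caller:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Instrumentation.h"

using namespace llvm;

// Returns an i8* to a private, mergeable copy of S embedded in M.
static Value *emitStringPtr(Module &M, IRBuilder<> &IRB, StringRef S) {
  GlobalVariable *GV =
      createPrivateGlobalForString(M, S, /*AllowMerging=*/true, "__str");
  return IRB.CreatePointerCast(GV, IRB.getInt8PtrTy());
}
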
+Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
+ const std::string &ModuleId) {
+ if (auto Comdat = F.getComdat()) return Comdat;
+ assert(F.hasName());
+ Module *M = F.getParent();
+ std::string Name = std::string(F.getName());
+
+ // Make a unique comdat name for internal linkage things on ELF. On COFF, the
+ // name of the comdat group identifies the leader symbol of the comdat group.
+ // The linkage of the leader symbol is considered during comdat resolution,
+ // and internal symbols with the same name from different objects will not be
+ // merged.
+ if (T.isOSBinFormatELF() && F.hasLocalLinkage()) {
+ if (ModuleId.empty())
+ return nullptr;
+ Name += ModuleId;
+ }
+
+ // Make a new comdat for the function. Use the "no duplicates" selection kind
+ // for non-weak symbols if the object file format supports it.
+ Comdat *C = M->getOrInsertComdat(Name);
+ if (T.isOSBinFormatCOFF() && !F.isWeakForLinker())
+ C->setSelectionKind(Comdat::NoDuplicates);
+ F.setComdat(C);
+ return C;
+}
+
+/// initializeInstrumentation - Initialize all passes in the Instrumentation
+/// library.
+void llvm::initializeInstrumentation(PassRegistry &Registry) {
+ initializeAddressSanitizerLegacyPassPass(Registry);
+ initializeModuleAddressSanitizerLegacyPassPass(Registry);
initializeMemProfilerLegacyPassPass(Registry);
initializeModuleMemProfilerLegacyPassPass(Registry);
- initializeBoundsCheckingLegacyPassPass(Registry);
- initializeControlHeightReductionLegacyPassPass(Registry);
- initializeGCOVProfilerLegacyPassPass(Registry);
- initializePGOInstrumentationGenLegacyPassPass(Registry);
- initializePGOInstrumentationUseLegacyPassPass(Registry);
- initializePGOIndirectCallPromotionLegacyPassPass(Registry);
- initializePGOMemOPSizeOptLegacyPassPass(Registry);
- initializeCGProfileLegacyPassPass(Registry);
- initializeInstrOrderFileLegacyPassPass(Registry);
- initializeInstrProfilingLegacyPassPass(Registry);
- initializeMemorySanitizerLegacyPassPass(Registry);
- initializeHWAddressSanitizerLegacyPassPass(Registry);
- initializeThreadSanitizerLegacyPassPass(Registry);
- initializeModuleSanitizerCoverageLegacyPassPass(Registry);
+ initializeBoundsCheckingLegacyPassPass(Registry);
+ initializeControlHeightReductionLegacyPassPass(Registry);
+ initializeGCOVProfilerLegacyPassPass(Registry);
+ initializePGOInstrumentationGenLegacyPassPass(Registry);
+ initializePGOInstrumentationUseLegacyPassPass(Registry);
+ initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+ initializePGOMemOPSizeOptLegacyPassPass(Registry);
+ initializeCGProfileLegacyPassPass(Registry);
+ initializeInstrOrderFileLegacyPassPass(Registry);
+ initializeInstrProfilingLegacyPassPass(Registry);
+ initializeMemorySanitizerLegacyPassPass(Registry);
+ initializeHWAddressSanitizerLegacyPassPass(Registry);
+ initializeThreadSanitizerLegacyPassPass(Registry);
+ initializeModuleSanitizerCoverageLegacyPassPass(Registry);
initializeDataFlowSanitizerLegacyPassPass(Registry);
-}
-
-/// LLVMInitializeInstrumentation - C binding for
-/// initializeInstrumentation.
-void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
- initializeInstrumentation(*unwrap(R));
-}
+}
+
+/// LLVMInitializeInstrumentation - C binding for
+/// initializeInstrumentation.
+void LLVMInitializeInstrumentation(LLVMPassRegistryRef R) {
+ initializeInstrumentation(*unwrap(R));
+}
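For context, a legacy pass-manager host would call initializeInstrumentation (or the C binding above) before scheduling any of these passes. The runLegacyMsan wrapper below is a hedged sketch of such a host, not code from this library.

// Illustrative sketch only.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"

using namespace llvm;

static void runLegacyMsan(Module &M) {
  // Make the instrumentation pass IDs known to the global registry.
  initializeInstrumentation(*PassRegistry::getPassRegistry());

  legacy::PassManager PM;
  PM.add(createMemorySanitizerLegacyPassPass(MemorySanitizerOptions()));
  PM.run(M);
}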
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 4159f82db5..7a6874584d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -1,1176 +1,1176 @@
-//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file is a part of MemorySanitizer, a detector of uninitialized
-/// reads.
-///
-/// The algorithm of the tool is similar to Memcheck
-/// (http://goo.gl/QKbem). We associate a few shadow bits with every
-/// byte of the application memory, poison the shadow of the malloc-ed
-/// or alloca-ed memory, load the shadow bits on every memory read,
-/// propagate the shadow bits through some of the arithmetic
-/// instructions (including MOV), store the shadow bits on every memory
-/// write, report a bug on some other instructions (e.g. JMP) if the
-/// associated shadow is poisoned.
-///
-/// But there are differences too. The first and the major one:
-/// compiler instrumentation instead of binary instrumentation. This
-/// gives us much better register allocation, possible compiler
-/// optimizations and a fast start-up. But this brings the major issue
-/// as well: msan needs to see all program events, including system
-/// calls and reads/writes in system libraries, so we either need to
-/// compile *everything* with msan or use a binary translation
-/// component (e.g. DynamoRIO) to instrument pre-built libraries.
-/// Another difference from Memcheck is that we use 8 shadow bits per
-/// byte of application memory and use a direct shadow mapping. This
-/// greatly simplifies the instrumentation code and avoids races on
-/// shadow updates (Memcheck is single-threaded so races are not a
-/// concern there. Memcheck uses 2 shadow bits per byte with a slow
-/// path storage that uses 8 bits per byte).
-///
-/// The default value of shadow is 0, which means "clean" (not poisoned).
-///
-/// Every module initializer should call __msan_init to ensure that the
-/// shadow memory is ready. On error, __msan_warning is called. Since
-/// parameters and return values may be passed via registers, we have a
-/// specialized thread-local shadow for return values
-/// (__msan_retval_tls) and parameters (__msan_param_tls).
-///
-/// Origin tracking.
-///
-/// MemorySanitizer can track origins (allocation points) of all uninitialized
-/// values. This behavior is controlled with a flag (msan-track-origins) and is
-/// disabled by default.
-///
-/// Origins are 4-byte values created and interpreted by the runtime library.
-/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
-/// of application memory. Propagation of origins is basically a bunch of
-/// "select" instructions that pick the origin of a dirty argument, if an
-/// instruction has one.
-///
-/// Every 4 aligned, consecutive bytes of application memory have one origin
-/// value associated with them. If these bytes contain uninitialized data
-/// coming from 2 different allocations, the last store wins. Because of this,
-/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
-/// practice.
-///
-/// Origins are meaningless for fully initialized values, so MemorySanitizer
-/// avoids storing origin to memory when a fully initialized value is stored.
-/// This way it avoids needless overwriting origin of the 4-byte region on
-/// a short (i.e. 1 byte) clean store, and it is also good for performance.
-///
-/// Atomic handling.
-///
-/// Ideally, every atomic store of application value should update the
-/// corresponding shadow location in an atomic way. Unfortunately, atomic store
-/// of two disjoint locations can not be done without severe slowdown.
-///
-/// Therefore, we implement an approximation that may err on the safe side.
-/// In this implementation, every atomically accessed location in the program
-/// may only change from (partially) uninitialized to fully initialized, but
-/// not the other way around. We load the shadow _after_ the application load,
-/// and we store the shadow _before_ the app store. Also, we always store clean
-/// shadow (if the application store is atomic). This way, if the store-load
-/// pair constitutes a happens-before arc, shadow store and load are correctly
-/// ordered such that the load will get either the value that was stored, or
-/// some later value (which is always clean).
-///
-/// This does not work very well with Compare-And-Swap (CAS) and
-/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
-/// must store the new shadow before the app operation, and load the shadow
-/// after the app operation. Computers don't work this way. Current
-/// implementation ignores the load aspect of CAS/RMW, always returning a clean
-/// value. It implements the store part as a simple atomic store by storing a
-/// clean shadow.
-///
-/// Instrumenting inline assembly.
-///
-/// For inline assembly code LLVM has little idea about which memory locations
-/// become initialized depending on the arguments. It can be possible to figure
-/// out which arguments are meant to point to inputs and outputs, but the
-/// actual semantics can be only visible at runtime. In the Linux kernel it's
-/// also possible that the arguments only indicate the offset for a base taken
-/// from a segment register, so it's dangerous to treat any asm() arguments as
-/// pointers. We take a conservative approach generating calls to
-/// __msan_instrument_asm_store(ptr, size)
-/// , which defers the memory unpoisoning to the runtime library.
-/// The latter can perform more complex address checks to figure out whether
-/// it's safe to touch the shadow memory.
-/// Like with atomic operations, we call __msan_instrument_asm_store() before
-/// the assembly call, so that changes to the shadow memory will be seen by
-/// other threads together with main memory initialization.
-///
-/// KernelMemorySanitizer (KMSAN) implementation.
-///
-/// The major differences between KMSAN and MSan instrumentation are:
-/// - KMSAN always tracks the origins and implies msan-keep-going=true;
-/// - KMSAN allocates shadow and origin memory for each page separately, so
-/// there are no explicit accesses to shadow and origin in the
-/// instrumentation.
-/// Shadow and origin values for a particular X-byte memory location
-/// (X=1,2,4,8) are accessed through pointers obtained via the
-/// __msan_metadata_ptr_for_load_X(ptr)
-/// __msan_metadata_ptr_for_store_X(ptr)
-/// functions. The corresponding functions check that the X-byte accesses
-///    are possible and return the pointers to shadow and origin memory.
-/// Arbitrary sized accesses are handled with:
-/// __msan_metadata_ptr_for_load_n(ptr, size)
-/// __msan_metadata_ptr_for_store_n(ptr, size);
-/// - TLS variables are stored in a single per-task struct. A call to a
-/// function __msan_get_context_state() returning a pointer to that struct
-/// is inserted into every instrumented function before the entry block;
-/// - __msan_warning() takes a 32-bit origin parameter;
-/// - local variables are poisoned with __msan_poison_alloca() upon function
-/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the
-/// function;
-/// - the pass doesn't declare any global variables or add global constructors
-/// to the translation unit.
-///
-/// Also, KMSAN currently ignores uninitialized memory passed into inline asm
-/// calls, making sure we're on the safe side wrt. possible false positives.
-///
-/// KernelMemorySanitizer only supports X86_64 at the moment.
-///
-//
-// FIXME: This sanitizer does not yet handle scalable vectors
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- MemorySanitizer.cpp - detector of uninitialized reads --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instructions (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+///
+/// Origin tracking.
+///
+/// MemorySanitizer can track origins (allocation points) of all uninitialized
+/// values. This behavior is controlled with a flag (msan-track-origins) and is
+/// disabled by default.
+///
+/// Origins are 4-byte values created and interpreted by the runtime library.
+/// They are stored in a second shadow mapping, one 4-byte value for 4 bytes
+/// of application memory. Propagation of origins is basically a bunch of
+/// "select" instructions that pick the origin of a dirty argument, if an
+/// instruction has one.
+///
+/// Every 4 aligned, consecutive bytes of application memory have one origin
+/// value associated with them. If these bytes contain uninitialized data
+/// coming from 2 different allocations, the last store wins. Because of this,
+/// MemorySanitizer reports can show unrelated origins, but this is unlikely in
+/// practice.
+///
+/// Origins are meaningless for fully initialized values, so MemorySanitizer
+/// avoids storing origin to memory when a fully initialized value is stored.
+/// This way it avoids needless overwriting origin of the 4-byte region on
+/// a short (i.e. 1 byte) clean store, and it is also good for performance.
+///
+/// Atomic handling.
+///
+/// Ideally, every atomic store of application value should update the
+/// corresponding shadow location in an atomic way. Unfortunately, atomic store
+/// of two disjoint locations can not be done without severe slowdown.
+///
+/// Therefore, we implement an approximation that may err on the safe side.
+/// In this implementation, every atomically accessed location in the program
+/// may only change from (partially) uninitialized to fully initialized, but
+/// not the other way around. We load the shadow _after_ the application load,
+/// and we store the shadow _before_ the app store. Also, we always store clean
+/// shadow (if the application store is atomic). This way, if the store-load
+/// pair constitutes a happens-before arc, shadow store and load are correctly
+/// ordered such that the load will get either the value that was stored, or
+/// some later value (which is always clean).
+///
+/// This does not work very well with Compare-And-Swap (CAS) and
+/// Read-Modify-Write (RMW) operations. To follow the above logic, CAS and RMW
+/// must store the new shadow before the app operation, and load the shadow
+/// after the app operation. Computers don't work this way. Current
+/// implementation ignores the load aspect of CAS/RMW, always returning a clean
+/// value. It implements the store part as a simple atomic store by storing a
+/// clean shadow.
+///
+/// Instrumenting inline assembly.
+///
+/// For inline assembly code LLVM has little idea about which memory locations
+/// become initialized depending on the arguments. It can be possible to figure
+/// out which arguments are meant to point to inputs and outputs, but the
+/// actual semantics can be only visible at runtime. In the Linux kernel it's
+/// also possible that the arguments only indicate the offset for a base taken
+/// from a segment register, so it's dangerous to treat any asm() arguments as
+/// pointers. We take a conservative approach generating calls to
+/// __msan_instrument_asm_store(ptr, size)
+/// , which defers the memory unpoisoning to the runtime library.
+/// The latter can perform more complex address checks to figure out whether
+/// it's safe to touch the shadow memory.
+/// Like with atomic operations, we call __msan_instrument_asm_store() before
+/// the assembly call, so that changes to the shadow memory will be seen by
+/// other threads together with main memory initialization.
+///
+/// KernelMemorySanitizer (KMSAN) implementation.
+///
+/// The major differences between KMSAN and MSan instrumentation are:
+/// - KMSAN always tracks the origins and implies msan-keep-going=true;
+/// - KMSAN allocates shadow and origin memory for each page separately, so
+/// there are no explicit accesses to shadow and origin in the
+/// instrumentation.
+/// Shadow and origin values for a particular X-byte memory location
+/// (X=1,2,4,8) are accessed through pointers obtained via the
+/// __msan_metadata_ptr_for_load_X(ptr)
+/// __msan_metadata_ptr_for_store_X(ptr)
+/// functions. The corresponding functions check that the X-byte accesses
+///    are possible and return the pointers to shadow and origin memory.
+/// Arbitrary sized accesses are handled with:
+/// __msan_metadata_ptr_for_load_n(ptr, size)
+/// __msan_metadata_ptr_for_store_n(ptr, size);
+/// - TLS variables are stored in a single per-task struct. A call to a
+/// function __msan_get_context_state() returning a pointer to that struct
+/// is inserted into every instrumented function before the entry block;
+/// - __msan_warning() takes a 32-bit origin parameter;
+/// - local variables are poisoned with __msan_poison_alloca() upon function
+/// entry and unpoisoned with __msan_unpoison_alloca() before leaving the
+/// function;
+/// - the pass doesn't declare any global variables or add global constructors
+/// to the translation unit.
+///
+/// Also, KMSAN currently ignores uninitialized memory passed into inline asm
+/// calls, making sure we're on the safe side wrt. possible false positives.
+///
+/// KernelMemorySanitizer only supports X86_64 at the moment.
+///
+//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <memory>
-#include <string>
-#include <tuple>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "msan"
-
-static const unsigned kOriginSize = 4;
-static const Align kMinOriginAlignment = Align(4);
-static const Align kShadowTLSAlignment = Align(8);
-
-// These constants must be kept in sync with the ones in msan.h.
-static const unsigned kParamTLSSize = 800;
-static const unsigned kRetvalTLSSize = 800;
-
-// Access sizes are powers of two: 1, 2, 4, 8.
-static const size_t kNumberOfAccessSizes = 4;
-
-/// Track origins of uninitialized values.
-///
-/// Adds a section to MemorySanitizer report that points to the allocation
-/// (stack or heap) the uninitialized bits came from originally.
-static cl::opt<int> ClTrackOrigins("msan-track-origins",
- cl::desc("Track origins (allocation sites) of poisoned memory"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool> ClKeepGoing("msan-keep-going",
- cl::desc("keep going after reporting a UMR"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClPoisonStack("msan-poison-stack",
- cl::desc("poison uninitialized stack variables"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
- cl::desc("poison uninitialized stack variables with a call"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
- cl::desc("poison uninitialized stack variables with the given pattern"),
- cl::Hidden, cl::init(0xff));
-
-static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
- cl::desc("poison undef temps"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
- cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
- cl::desc("exact handling of relational integer ICmp"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClHandleLifetimeIntrinsics(
- "msan-handle-lifetime-intrinsics",
- cl::desc(
- "when possible, poison scoped variables at the beginning of the scope "
- "(slower, but more precise)"),
- cl::Hidden, cl::init(true));
-
-// When compiling the Linux kernel, we sometimes see false positives related to
-// MSan being unable to understand that inline assembly calls may initialize
-// local variables.
-// This flag makes the compiler conservatively unpoison every memory location
-// passed into an assembly call. Note that this may cause false positives.
-// Because it's impossible to figure out the array sizes, we can only unpoison
-// the first sizeof(type) bytes for each type* pointer.
-// The instrumentation is only enabled in KMSAN builds, and only if
-// -msan-handle-asm-conservative is on. This is done because we may want to
-// quickly disable assembly instrumentation when it breaks.
-static cl::opt<bool> ClHandleAsmConservative(
- "msan-handle-asm-conservative",
- cl::desc("conservative handling of inline assembly"), cl::Hidden,
- cl::init(true));
-
-// This flag controls whether we check the shadow of the address
-// operand of load or store. Such bugs are very rare, since load from
-// a garbage address typically results in SEGV, but still happen
-// (e.g. only lower bits of address are garbage, or the access happens
-// early at program startup where malloc-ed memory is more likely to
-// be zeroed). As of 2012-08-28 this flag adds 20% slowdown.
-static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
- cl::desc("report accesses through a pointer which has poisoned shadow"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClEagerChecks(
- "msan-eager-checks",
- cl::desc("check arguments and return values at function call boundaries"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
- cl::desc("print out instructions with default strict semantics"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<int> ClInstrumentationWithCallThreshold(
- "msan-instrumentation-with-call-threshold",
- cl::desc(
- "If the function being instrumented requires more than "
- "this number of checks and origin stores, use callbacks instead of "
- "inline checks (-1 means never use callbacks)."),
- cl::Hidden, cl::init(3500));
-
-static cl::opt<bool>
- ClEnableKmsan("msan-kernel",
- cl::desc("Enable KernelMemorySanitizer instrumentation"),
- cl::Hidden, cl::init(false));
-
-// This is an experiment to enable handling of cases where shadow is a non-zero
-// compile-time constant. For some unexplainable reason they were silently
-// ignored in the instrumentation.
-static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
- cl::desc("Insert checks for constant shadow values"),
- cl::Hidden, cl::init(false));
-
-// This is off by default because of a bug in gold:
-// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
-static cl::opt<bool> ClWithComdat("msan-with-comdat",
- cl::desc("Place MSan constructors in comdat sections"),
- cl::Hidden, cl::init(false));
-
-// These options allow specifying custom memory map parameters.
-// See MemoryMapParams for details.
-static cl::opt<uint64_t> ClAndMask("msan-and-mask",
- cl::desc("Define custom MSan AndMask"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
- cl::desc("Define custom MSan XorMask"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
- cl::desc("Define custom MSan ShadowBase"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
- cl::desc("Define custom MSan OriginBase"),
- cl::Hidden, cl::init(0));
-
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <tuple>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "msan"
+
+static const unsigned kOriginSize = 4;
+static const Align kMinOriginAlignment = Align(4);
+static const Align kShadowTLSAlignment = Align(8);
+
+// These constants must be kept in sync with the ones in msan.h.
+static const unsigned kParamTLSSize = 800;
+static const unsigned kRetvalTLSSize = 800;
+
+// Access sizes are powers of two: 1, 2, 4, 8.
+static const size_t kNumberOfAccessSizes = 4;
+
+/// Track origins of uninitialized values.
+///
+/// Adds a section to MemorySanitizer report that points to the allocation
+/// (stack or heap) the uninitialized bits came from originally.
+static cl::opt<int> ClTrackOrigins("msan-track-origins",
+ cl::desc("Track origins (allocation sites) of poisoned memory"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+ cl::desc("keep going after reporting a UMR"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+ cl::desc("poison uninitialized stack variables"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+ cl::desc("poison uninitialized stack variables with a call"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+ cl::desc("poison uninitialized stack variables with the given pattern"),
+ cl::Hidden, cl::init(0xff));
+
+static cl::opt<bool> ClPoisonUndef("msan-poison-undef",
+ cl::desc("poison undef temps"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+ cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClHandleICmpExact("msan-handle-icmp-exact",
+ cl::desc("exact handling of relational integer ICmp"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClHandleLifetimeIntrinsics(
+ "msan-handle-lifetime-intrinsics",
+ cl::desc(
+ "when possible, poison scoped variables at the beginning of the scope "
+ "(slower, but more precise)"),
+ cl::Hidden, cl::init(true));
+
+// When compiling the Linux kernel, we sometimes see false positives related to
+// MSan being unable to understand that inline assembly calls may initialize
+// local variables.
+// This flag makes the compiler conservatively unpoison every memory location
+// passed into an assembly call. Note that this may cause false positives.
+// Because it's impossible to figure out the array sizes, we can only unpoison
+// the first sizeof(type) bytes for each type* pointer.
+// The instrumentation is only enabled in KMSAN builds, and only if
+// -msan-handle-asm-conservative is on. This is done because we may want to
+// quickly disable assembly instrumentation when it breaks.
+static cl::opt<bool> ClHandleAsmConservative(
+ "msan-handle-asm-conservative",
+ cl::desc("conservative handling of inline assembly"), cl::Hidden,
+ cl::init(true));
+
+// This flag controls whether we check the shadow of the address
+// operand of load or store. Such bugs are very rare, since load from
+// a garbage address typically results in SEGV, but still happen
+// (e.g. only lower bits of address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed). As of 2012-08-28 this flag adds 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+ cl::desc("report accesses through a pointer which has poisoned shadow"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClEagerChecks(
+ "msan-eager-checks",
+ cl::desc("check arguments and return values at function call boundaries"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+ cl::desc("print out instructions with default strict semantics"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<int> ClInstrumentationWithCallThreshold(
+ "msan-instrumentation-with-call-threshold",
+ cl::desc(
+ "If the function being instrumented requires more than "
+ "this number of checks and origin stores, use callbacks instead of "
+ "inline checks (-1 means never use callbacks)."),
+ cl::Hidden, cl::init(3500));
+
+static cl::opt<bool>
+ ClEnableKmsan("msan-kernel",
+ cl::desc("Enable KernelMemorySanitizer instrumentation"),
+ cl::Hidden, cl::init(false));
+
+// This is an experiment to enable handling of cases where shadow is a non-zero
+// compile-time constant. For some unexplainable reason they were silently
+// ignored in the instrumentation.
+static cl::opt<bool> ClCheckConstantShadow("msan-check-constant-shadow",
+ cl::desc("Insert checks for constant shadow values"),
+ cl::Hidden, cl::init(false));
+
+// This is off by default because of a bug in gold:
+// https://sourceware.org/bugzilla/show_bug.cgi?id=19002
+static cl::opt<bool> ClWithComdat("msan-with-comdat",
+ cl::desc("Place MSan constructors in comdat sections"),
+ cl::Hidden, cl::init(false));
+
+// These options allow specifying custom memory map parameters.
+// See MemoryMapParams for details.
+static cl::opt<uint64_t> ClAndMask("msan-and-mask",
+ cl::desc("Define custom MSan AndMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClXorMask("msan-xor-mask",
+ cl::desc("Define custom MSan XorMask"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClShadowBase("msan-shadow-base",
+ cl::desc("Define custom MSan ShadowBase"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<uint64_t> ClOriginBase("msan-origin-base",
+ cl::desc("Define custom MSan OriginBase"),
+ cl::Hidden, cl::init(0));
+
const char kMsanModuleCtorName[] = "msan.module_ctor";
const char kMsanInitName[] = "__msan_init";
-
-namespace {
-
-// Memory map parameters used in application-to-shadow address calculation.
-// Offset = (Addr & ~AndMask) ^ XorMask
-// Shadow = ShadowBase + Offset
-// Origin = OriginBase + Offset
-struct MemoryMapParams {
- uint64_t AndMask;
- uint64_t XorMask;
- uint64_t ShadowBase;
- uint64_t OriginBase;
-};
-
-struct PlatformMemoryMapParams {
- const MemoryMapParams *bits32;
- const MemoryMapParams *bits64;
-};
-
-} // end anonymous namespace
-
-// i386 Linux
-static const MemoryMapParams Linux_I386_MemoryMapParams = {
- 0x000080000000, // AndMask
- 0, // XorMask (not used)
- 0, // ShadowBase (not used)
- 0x000040000000, // OriginBase
-};
-
-// x86_64 Linux
-static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
-#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
- 0x400000000000, // AndMask
- 0, // XorMask (not used)
- 0, // ShadowBase (not used)
- 0x200000000000, // OriginBase
-#else
- 0, // AndMask (not used)
- 0x500000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x100000000000, // OriginBase
-#endif
-};
-
-// mips64 Linux
-static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
- 0, // AndMask (not used)
- 0x008000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x002000000000, // OriginBase
-};
-
-// ppc64 Linux
-static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
- 0xE00000000000, // AndMask
- 0x100000000000, // XorMask
- 0x080000000000, // ShadowBase
- 0x1C0000000000, // OriginBase
-};
-
-// s390x Linux
-static const MemoryMapParams Linux_S390X_MemoryMapParams = {
- 0xC00000000000, // AndMask
- 0, // XorMask (not used)
- 0x080000000000, // ShadowBase
- 0x1C0000000000, // OriginBase
-};
-
-// aarch64 Linux
-static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
- 0, // AndMask (not used)
- 0x06000000000, // XorMask
- 0, // ShadowBase (not used)
- 0x01000000000, // OriginBase
-};
-
-// i386 FreeBSD
-static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
- 0x000180000000, // AndMask
- 0x000040000000, // XorMask
- 0x000020000000, // ShadowBase
- 0x000700000000, // OriginBase
-};
-
-// x86_64 FreeBSD
-static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
- 0xc00000000000, // AndMask
- 0x200000000000, // XorMask
- 0x100000000000, // ShadowBase
- 0x380000000000, // OriginBase
-};
-
-// x86_64 NetBSD
-static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
- 0, // AndMask
- 0x500000000000, // XorMask
- 0, // ShadowBase
- 0x100000000000, // OriginBase
-};
-
-static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
- &Linux_I386_MemoryMapParams,
- &Linux_X86_64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
- nullptr,
- &Linux_MIPS64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
- nullptr,
- &Linux_PowerPC64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
- nullptr,
- &Linux_S390X_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
- nullptr,
- &Linux_AArch64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
- &FreeBSD_I386_MemoryMapParams,
- &FreeBSD_X86_64_MemoryMapParams,
-};
-
-static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
- nullptr,
- &NetBSD_X86_64_MemoryMapParams,
-};
-
-namespace {
-
-/// Instrument functions of a module to detect uninitialized reads.
-///
-/// Instantiating MemorySanitizer inserts the msan runtime library API function
-/// declarations into the module if they don't exist already. Instantiating
-/// ensures the __msan_init function is in the list of global constructors for
-/// the module.
-class MemorySanitizer {
-public:
- MemorySanitizer(Module &M, MemorySanitizerOptions Options)
- : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
- Recover(Options.Recover) {
- initializeModule(M);
- }
-
- // MSan cannot be moved or copied because of MapParams.
- MemorySanitizer(MemorySanitizer &&) = delete;
- MemorySanitizer &operator=(MemorySanitizer &&) = delete;
- MemorySanitizer(const MemorySanitizer &) = delete;
- MemorySanitizer &operator=(const MemorySanitizer &) = delete;
-
- bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
-
-private:
- friend struct MemorySanitizerVisitor;
- friend struct VarArgAMD64Helper;
- friend struct VarArgMIPS64Helper;
- friend struct VarArgAArch64Helper;
- friend struct VarArgPowerPC64Helper;
- friend struct VarArgSystemZHelper;
-
- void initializeModule(Module &M);
- void initializeCallbacks(Module &M);
- void createKernelApi(Module &M);
- void createUserspaceApi(Module &M);
-
- /// True if we're compiling the Linux kernel.
- bool CompileKernel;
- /// Track origins (allocation points) of uninitialized values.
- int TrackOrigins;
- bool Recover;
-
- LLVMContext *C;
- Type *IntptrTy;
- Type *OriginTy;
-
- // XxxTLS variables represent the per-thread state in MSan and per-task state
- // in KMSAN.
- // For the userspace these point to thread-local globals. In the kernel land
- // they point to the members of a per-task struct obtained via a call to
- // __msan_get_context_state().
-
- /// Thread-local shadow storage for function parameters.
- Value *ParamTLS;
-
- /// Thread-local origin storage for function parameters.
- Value *ParamOriginTLS;
-
- /// Thread-local shadow storage for function return value.
- Value *RetvalTLS;
-
- /// Thread-local origin storage for function return value.
- Value *RetvalOriginTLS;
-
- /// Thread-local shadow storage for in-register va_arg function
- /// parameters (x86_64-specific).
- Value *VAArgTLS;
-
- /// Thread-local shadow storage for in-register va_arg function
- /// parameters (x86_64-specific).
- Value *VAArgOriginTLS;
-
- /// Thread-local shadow storage for va_arg overflow area
- /// (x86_64-specific).
- Value *VAArgOverflowSizeTLS;
-
- /// Are the instrumentation callbacks set up?
- bool CallbacksInitialized = false;
-
- /// The run-time callback to print a warning.
- FunctionCallee WarningFn;
-
- // These arrays are indexed by log2(AccessSize).
- FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
- FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
-
- /// Run-time helper that generates a new origin value for a stack
- /// allocation.
- FunctionCallee MsanSetAllocaOrigin4Fn;
-
- /// Run-time helper that poisons stack on function entry.
- FunctionCallee MsanPoisonStackFn;
-
- /// Run-time helper that records a store (or any event) of an
- /// uninitialized value and returns an updated origin id encoding this info.
- FunctionCallee MsanChainOriginFn;
-
+
+namespace {
+
+// Memory map parameters used in application-to-shadow address calculation.
+// Offset = (Addr & ~AndMask) ^ XorMask
+// Shadow = ShadowBase + Offset
+// Origin = OriginBase + Offset
+struct MemoryMapParams {
+ uint64_t AndMask;
+ uint64_t XorMask;
+ uint64_t ShadowBase;
+ uint64_t OriginBase;
+};
+
+struct PlatformMemoryMapParams {
+ const MemoryMapParams *bits32;
+ const MemoryMapParams *bits64;
+};
+
+} // end anonymous namespace
+
+// i386 Linux
+static const MemoryMapParams Linux_I386_MemoryMapParams = {
+ 0x000080000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x000040000000, // OriginBase
+};
+
+// x86_64 Linux
+static const MemoryMapParams Linux_X86_64_MemoryMapParams = {
+#ifdef MSAN_LINUX_X86_64_OLD_MAPPING
+ 0x400000000000, // AndMask
+ 0, // XorMask (not used)
+ 0, // ShadowBase (not used)
+ 0x200000000000, // OriginBase
+#else
+ 0, // AndMask (not used)
+ 0x500000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x100000000000, // OriginBase
+#endif
+};
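To make the mapping above concrete: "Offset = (Addr & ~AndMask) ^ XorMask; Shadow = ShadowBase + Offset; Origin = OriginBase + Offset" can be written as plain arithmetic with the default x86_64 Linux parameters. The pass emits this computation as IR; the standalone helper below exists only to illustrate the address math.

// Illustrative sketch only.
#include <cstdint>

struct ExampleMapping {
  uint64_t AndMask, XorMask, ShadowBase, OriginBase;
};

// Default (non-MSAN_LINUX_X86_64_OLD_MAPPING) x86_64 Linux parameters.
static const ExampleMapping kLinuxX8664 = {0x0, 0x500000000000, 0x0,
                                           0x100000000000};

static uint64_t shadowFor(uint64_t Addr, const ExampleMapping &P) {
  uint64_t Offset = (Addr & ~P.AndMask) ^ P.XorMask;
  return P.ShadowBase + Offset;
}

static uint64_t originFor(uint64_t Addr, const ExampleMapping &P) {
  uint64_t Offset = (Addr & ~P.AndMask) ^ P.XorMask;
  return P.OriginBase + Offset;
}

// Example: application address 0x7fff00001234 maps to shadow
// 0x2fff00001234 and origin 0x3fff00001234 with these parameters.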
+
+// mips64 Linux
+static const MemoryMapParams Linux_MIPS64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x008000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x002000000000, // OriginBase
+};
+
+// ppc64 Linux
+static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
+ 0xE00000000000, // AndMask
+ 0x100000000000, // XorMask
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
+// s390x Linux
+static const MemoryMapParams Linux_S390X_MemoryMapParams = {
+ 0xC00000000000, // AndMask
+ 0, // XorMask (not used)
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
+// aarch64 Linux
+static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
+ 0, // AndMask (not used)
+ 0x06000000000, // XorMask
+ 0, // ShadowBase (not used)
+ 0x01000000000, // OriginBase
+};
+
+// i386 FreeBSD
+static const MemoryMapParams FreeBSD_I386_MemoryMapParams = {
+ 0x000180000000, // AndMask
+ 0x000040000000, // XorMask
+ 0x000020000000, // ShadowBase
+ 0x000700000000, // OriginBase
+};
+
+// x86_64 FreeBSD
+static const MemoryMapParams FreeBSD_X86_64_MemoryMapParams = {
+ 0xc00000000000, // AndMask
+ 0x200000000000, // XorMask
+ 0x100000000000, // ShadowBase
+ 0x380000000000, // OriginBase
+};
+
+// x86_64 NetBSD
+static const MemoryMapParams NetBSD_X86_64_MemoryMapParams = {
+ 0, // AndMask
+ 0x500000000000, // XorMask
+ 0, // ShadowBase
+ 0x100000000000, // OriginBase
+};
+
+static const PlatformMemoryMapParams Linux_X86_MemoryMapParams = {
+ &Linux_I386_MemoryMapParams,
+ &Linux_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_MIPS_MemoryMapParams = {
+ nullptr,
+ &Linux_MIPS64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
+ nullptr,
+ &Linux_PowerPC64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
+ nullptr,
+ &Linux_S390X_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
+ nullptr,
+ &Linux_AArch64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams FreeBSD_X86_MemoryMapParams = {
+ &FreeBSD_I386_MemoryMapParams,
+ &FreeBSD_X86_64_MemoryMapParams,
+};
+
+static const PlatformMemoryMapParams NetBSD_X86_MemoryMapParams = {
+ nullptr,
+ &NetBSD_X86_64_MemoryMapParams,
+};
+
+namespace {
+
+/// Instrument functions of a module to detect uninitialized reads.
+///
+/// Instantiating MemorySanitizer inserts the msan runtime library API function
+/// declarations into the module if they don't exist already. Instantiating
+/// ensures the __msan_init function is in the list of global constructors for
+/// the module.
+class MemorySanitizer {
+public:
+ MemorySanitizer(Module &M, MemorySanitizerOptions Options)
+ : CompileKernel(Options.Kernel), TrackOrigins(Options.TrackOrigins),
+ Recover(Options.Recover) {
+ initializeModule(M);
+ }
+
+ // MSan cannot be moved or copied because of MapParams.
+ MemorySanitizer(MemorySanitizer &&) = delete;
+ MemorySanitizer &operator=(MemorySanitizer &&) = delete;
+ MemorySanitizer(const MemorySanitizer &) = delete;
+ MemorySanitizer &operator=(const MemorySanitizer &) = delete;
+
+ bool sanitizeFunction(Function &F, TargetLibraryInfo &TLI);
+
+private:
+ friend struct MemorySanitizerVisitor;
+ friend struct VarArgAMD64Helper;
+ friend struct VarArgMIPS64Helper;
+ friend struct VarArgAArch64Helper;
+ friend struct VarArgPowerPC64Helper;
+ friend struct VarArgSystemZHelper;
+
+ void initializeModule(Module &M);
+ void initializeCallbacks(Module &M);
+ void createKernelApi(Module &M);
+ void createUserspaceApi(Module &M);
+
+ /// True if we're compiling the Linux kernel.
+ bool CompileKernel;
+ /// Track origins (allocation points) of uninitialized values.
+ int TrackOrigins;
+ bool Recover;
+
+ LLVMContext *C;
+ Type *IntptrTy;
+ Type *OriginTy;
+
+ // XxxTLS variables represent the per-thread state in MSan and per-task state
+ // in KMSAN.
+ // For the userspace these point to thread-local globals. In the kernel land
+ // they point to the members of a per-task struct obtained via a call to
+ // __msan_get_context_state().
+
+ /// Thread-local shadow storage for function parameters.
+ Value *ParamTLS;
+
+ /// Thread-local origin storage for function parameters.
+ Value *ParamOriginTLS;
+
+ /// Thread-local shadow storage for function return value.
+ Value *RetvalTLS;
+
+ /// Thread-local origin storage for function return value.
+ Value *RetvalOriginTLS;
+
+ /// Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ Value *VAArgTLS;
+
+ /// Thread-local shadow storage for in-register va_arg function
+ /// parameters (x86_64-specific).
+ Value *VAArgOriginTLS;
+
+ /// Thread-local shadow storage for va_arg overflow area
+ /// (x86_64-specific).
+ Value *VAArgOverflowSizeTLS;
+
+ /// Are the instrumentation callbacks set up?
+ bool CallbacksInitialized = false;
+
+ /// The run-time callback to print a warning.
+ FunctionCallee WarningFn;
+
+ // These arrays are indexed by log2(AccessSize).
+ FunctionCallee MaybeWarningFn[kNumberOfAccessSizes];
+ FunctionCallee MaybeStoreOriginFn[kNumberOfAccessSizes];
+
+ /// Run-time helper that generates a new origin value for a stack
+ /// allocation.
+ FunctionCallee MsanSetAllocaOrigin4Fn;
+
+ /// Run-time helper that poisons stack on function entry.
+ FunctionCallee MsanPoisonStackFn;
+
+ /// Run-time helper that records a store (or any event) of an
+ /// uninitialized value and returns an updated origin id encoding this info.
+ FunctionCallee MsanChainOriginFn;
+
/// Run-time helper that paints an origin over a region.
FunctionCallee MsanSetOriginFn;
- /// MSan runtime replacements for memmove, memcpy and memset.
- FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
-
- /// KMSAN callback for task-local function argument shadow.
- StructType *MsanContextStateTy;
- FunctionCallee MsanGetContextStateFn;
-
- /// Functions for poisoning/unpoisoning local variables
- FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
-
- /// Each of the MsanMetadataPtrXxx functions returns a pair of shadow/origin
- /// pointers.
- FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
- FunctionCallee MsanMetadataPtrForLoad_1_8[4];
- FunctionCallee MsanMetadataPtrForStore_1_8[4];
- FunctionCallee MsanInstrumentAsmStoreFn;
-
- /// Helper to choose between different MsanMetadataPtrXxx().
- FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
-
- /// Memory map parameters used in application-to-shadow calculation.
- const MemoryMapParams *MapParams;
-
- /// Custom memory map parameters used when -msan-shadow-base or
- // -msan-origin-base is provided.
- MemoryMapParams CustomMapParams;
-
- MDNode *ColdCallWeights;
-
- /// Branch weights for origin store.
- MDNode *OriginStoreWeights;
-};
-
-void insertModuleCtor(Module &M) {
- getOrCreateSanitizerCtorAndInitFunctions(
- M, kMsanModuleCtorName, kMsanInitName,
- /*InitArgTypes=*/{},
- /*InitArgs=*/{},
- // This callback is invoked when the functions are created the first
- // time. Hook them into the global ctors list in that case:
- [&](Function *Ctor, FunctionCallee) {
- if (!ClWithComdat) {
- appendToGlobalCtors(M, Ctor, 0);
- return;
- }
- Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
- Ctor->setComdat(MsanCtorComdat);
- appendToGlobalCtors(M, Ctor, 0, Ctor);
- });
-}
-
-/// A legacy function pass for msan instrumentation.
-///
-/// Instruments functions to detect uninitialized reads.
-struct MemorySanitizerLegacyPass : public FunctionPass {
- // Pass identification, replacement for typeid.
- static char ID;
-
- MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
- : FunctionPass(ID), Options(Options) {
- initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- return MSan->sanitizeFunction(
- F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F));
- }
- bool doInitialization(Module &M) override;
-
- Optional<MemorySanitizer> MSan;
- MemorySanitizerOptions Options;
-};
-
-template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
- return (Opt.getNumOccurrences() > 0) ? Opt : Default;
-}
-
-} // end anonymous namespace
-
-MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
- : Kernel(getOptOrDefault(ClEnableKmsan, K)),
- TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
- Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
-
-PreservedAnalyses MemorySanitizerPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- MemorySanitizer Msan(*F.getParent(), Options);
- if (Msan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses MemorySanitizerPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (Options.Kernel)
- return PreservedAnalyses::all();
- insertModuleCtor(M);
- return PreservedAnalyses::none();
-}
-
-char MemorySanitizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
- "MemorySanitizer: detects uninitialized reads.", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan",
- "MemorySanitizer: detects uninitialized reads.", false,
- false)
-
-FunctionPass *
-llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) {
- return new MemorySanitizerLegacyPass(Options);
-}
-
-/// Create a non-const global initialized with the given string.
-///
-/// Creates a writable global for Str so that we can pass it to the
-/// run-time lib. Runtime uses first 4 bytes of the string to store the
-/// frame ID, so the string needs to be mutable.
-static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
- StringRef Str) {
- Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
- return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
- GlobalValue::PrivateLinkage, StrConst, "");
-}
-
-/// Create KMSAN API callbacks.
-void MemorySanitizer::createKernelApi(Module &M) {
- IRBuilder<> IRB(*C);
-
- // These will be initialized in insertKmsanPrologue().
- RetvalTLS = nullptr;
- RetvalOriginTLS = nullptr;
- ParamTLS = nullptr;
- ParamOriginTLS = nullptr;
- VAArgTLS = nullptr;
- VAArgOriginTLS = nullptr;
- VAArgOverflowSizeTLS = nullptr;
-
- WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
- IRB.getInt32Ty());
- // Requests the per-task context state (kmsan_context_state*) from the
- // runtime library.
- MsanContextStateTy = StructType::get(
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */
- IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy,
- OriginTy);
- MsanGetContextStateFn = M.getOrInsertFunction(
- "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0));
-
- Type *RetTy = StructType::get(PointerType::get(IRB.getInt8Ty(), 0),
- PointerType::get(IRB.getInt32Ty(), 0));
-
- for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) {
- std::string name_load =
- "__msan_metadata_ptr_for_load_" + std::to_string(size);
- std::string name_store =
- "__msan_metadata_ptr_for_store_" + std::to_string(size);
- MsanMetadataPtrForLoad_1_8[ind] = M.getOrInsertFunction(
- name_load, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
- MsanMetadataPtrForStore_1_8[ind] = M.getOrInsertFunction(
- name_store, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
- }
-
- MsanMetadataPtrForLoadN = M.getOrInsertFunction(
- "__msan_metadata_ptr_for_load_n", RetTy,
- PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
- MsanMetadataPtrForStoreN = M.getOrInsertFunction(
- "__msan_metadata_ptr_for_store_n", RetTy,
- PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
-
- // Functions for poisoning and unpoisoning memory.
- MsanPoisonAllocaFn =
- M.getOrInsertFunction("__msan_poison_alloca", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy());
- MsanUnpoisonAllocaFn = M.getOrInsertFunction(
- "__msan_unpoison_alloca", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy);
-}
-
-static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
- return M.getOrInsertGlobal(Name, Ty, [&] {
- return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
- nullptr, Name, nullptr,
- GlobalVariable::InitialExecTLSModel);
- });
-}
-
-/// Insert declarations for userspace-specific functions and globals.
-void MemorySanitizer::createUserspaceApi(Module &M) {
- IRBuilder<> IRB(*C);
-
- // Create the callback.
- // FIXME: this function should have "Cold" calling conv,
- // which is not yet implemented.
- StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
- : "__msan_warning_with_origin_noreturn";
- WarningFn =
- M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), IRB.getInt32Ty());
-
- // Create the global TLS variables.
- RetvalTLS =
- getOrInsertGlobal(M, "__msan_retval_tls",
- ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8));
-
- RetvalOriginTLS = getOrInsertGlobal(M, "__msan_retval_origin_tls", OriginTy);
-
- ParamTLS =
- getOrInsertGlobal(M, "__msan_param_tls",
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
-
- ParamOriginTLS =
- getOrInsertGlobal(M, "__msan_param_origin_tls",
- ArrayType::get(OriginTy, kParamTLSSize / 4));
-
- VAArgTLS =
- getOrInsertGlobal(M, "__msan_va_arg_tls",
- ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
-
- VAArgOriginTLS =
- getOrInsertGlobal(M, "__msan_va_arg_origin_tls",
- ArrayType::get(OriginTy, kParamTLSSize / 4));
-
- VAArgOverflowSizeTLS =
- getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
-
- for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
- AccessSizeIndex++) {
- unsigned AccessSize = 1 << AccessSizeIndex;
- std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
- SmallVector<std::pair<unsigned, Attribute>, 2> MaybeWarningFnAttrs;
- MaybeWarningFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
- MaybeWarningFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex + 1, Attribute::get(*C, Attribute::ZExt)));
- MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, AttributeList::get(*C, MaybeWarningFnAttrs),
- IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt32Ty());
-
- FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
- SmallVector<std::pair<unsigned, Attribute>, 2> MaybeStoreOriginFnAttrs;
- MaybeStoreOriginFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
- MaybeStoreOriginFnAttrs.push_back(std::make_pair(
- AttributeList::FirstArgIndex + 2, Attribute::get(*C, Attribute::ZExt)));
- MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, AttributeList::get(*C, MaybeStoreOriginFnAttrs),
- IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt8PtrTy(),
- IRB.getInt32Ty());
- }
-
- MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
- "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
- IRB.getInt8PtrTy(), IntptrTy);
- MsanPoisonStackFn =
- M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IntptrTy);
-}
-
-/// Insert extern declaration of runtime-provided functions and globals.
-void MemorySanitizer::initializeCallbacks(Module &M) {
- // Only do this once.
- if (CallbacksInitialized)
- return;
-
- IRBuilder<> IRB(*C);
- // Initialize callbacks that are common for kernel and userspace
- // instrumentation.
- MsanChainOriginFn = M.getOrInsertFunction(
- "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
+ /// MSan runtime replacements for memmove, memcpy and memset.
+ FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
+
+ /// KMSAN callback for task-local function argument shadow.
+ StructType *MsanContextStateTy;
+ FunctionCallee MsanGetContextStateFn;
+
+ /// Functions for poisoning/unpoisoning local variables
+ FunctionCallee MsanPoisonAllocaFn, MsanUnpoisonAllocaFn;
+
+ /// Each of the MsanMetadataPtrXxx functions returns a pair of shadow/origin
+ /// pointers.
+ FunctionCallee MsanMetadataPtrForLoadN, MsanMetadataPtrForStoreN;
+ FunctionCallee MsanMetadataPtrForLoad_1_8[4];
+ FunctionCallee MsanMetadataPtrForStore_1_8[4];
+ FunctionCallee MsanInstrumentAsmStoreFn;
+
+ /// Helper to choose between different MsanMetadataPtrXxx().
+ FunctionCallee getKmsanShadowOriginAccessFn(bool isStore, int size);
+
+ /// Memory map parameters used in application-to-shadow calculation.
+ const MemoryMapParams *MapParams;
+
+ /// Custom memory map parameters used when -msan-shadow-base or
+ // -msan-origin-base is provided.
+ MemoryMapParams CustomMapParams;
+
+ MDNode *ColdCallWeights;
+
+ /// Branch weights for origin store.
+ MDNode *OriginStoreWeights;
+};
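The comment above notes that MaybeWarningFn and MaybeStoreOriginFn are indexed by log2(AccessSize). A small sketch of that convention for the four supported access sizes (1, 2, 4, 8 bytes) follows; the helper name is made up, and the pass computes the equivalent index when it materializes checks.

// Illustrative sketch only.
#include <cassert>
#include <cstddef>

static size_t accessSizeToIndex(unsigned AccessSizeInBytes) {
  assert(AccessSizeInBytes == 1 || AccessSizeInBytes == 2 ||
         AccessSizeInBytes == 4 || AccessSizeInBytes == 8);
  size_t Index = 0;
  while ((1u << Index) < AccessSizeInBytes)
    ++Index;
  return Index; // 1 -> 0, 2 -> 1, 4 -> 2, 8 -> 3 (< kNumberOfAccessSizes)
}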
+
+void insertModuleCtor(Module &M) {
+ getOrCreateSanitizerCtorAndInitFunctions(
+ M, kMsanModuleCtorName, kMsanInitName,
+ /*InitArgTypes=*/{},
+ /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ [&](Function *Ctor, FunctionCallee) {
+ if (!ClWithComdat) {
+ appendToGlobalCtors(M, Ctor, 0);
+ return;
+ }
+ Comdat *MsanCtorComdat = M.getOrInsertComdat(kMsanModuleCtorName);
+ Ctor->setComdat(MsanCtorComdat);
+ appendToGlobalCtors(M, Ctor, 0, Ctor);
+ });
+}
+
+/// A legacy function pass for msan instrumentation.
+///
+/// Instruments functions to detect uninitialized reads.
+struct MemorySanitizerLegacyPass : public FunctionPass {
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
+ : FunctionPass(ID), Options(Options) {
+ initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ return MSan->sanitizeFunction(
+ F, getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F));
+ }
+ bool doInitialization(Module &M) override;
+
+ Optional<MemorySanitizer> MSan;
+ MemorySanitizerOptions Options;
+};
+
+template <class T> T getOptOrDefault(const cl::opt<T> &Opt, T Default) {
+ return (Opt.getNumOccurrences() > 0) ? Opt : Default;
+}
+
+} // end anonymous namespace
+
+MemorySanitizerOptions::MemorySanitizerOptions(int TO, bool R, bool K)
+ : Kernel(getOptOrDefault(ClEnableKmsan, K)),
+ TrackOrigins(getOptOrDefault(ClTrackOrigins, Kernel ? 2 : TO)),
+ Recover(getOptOrDefault(ClKeepGoing, Kernel || R)) {}
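// Illustrative sketch (hypothetical values) of how the precedence above plays
// out: an explicitly passed cl::opt always wins over the constructor argument,
// and kernel mode forces origin tracking to level 2 and recovery on.
//
//   MemorySanitizerOptions UserOpts(/*TO=*/1, /*R=*/false, /*K=*/false);
//   // UserOpts.TrackOrigins == 1 and UserOpts.Recover == false unless
//   // ClTrackOrigins / ClKeepGoing were given on the command line.
//
//   MemorySanitizerOptions KernelOpts(/*TO=*/0, /*R=*/false, /*K=*/true);
//   // KernelOpts.TrackOrigins == 2 and KernelOpts.Recover == true by default.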
+
+PreservedAnalyses MemorySanitizerPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ MemorySanitizer Msan(*F.getParent(), Options);
+ if (Msan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses MemorySanitizerPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (Options.Kernel)
+ return PreservedAnalyses::all();
+ insertModuleCtor(M);
+ return PreservedAnalyses::none();
+}
+
+char MemorySanitizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MemorySanitizerLegacyPass, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MemorySanitizerLegacyPass, "msan",
+ "MemorySanitizer: detects uninitialized reads.", false,
+ false)
+
+FunctionPass *
+llvm::createMemorySanitizerLegacyPassPass(MemorySanitizerOptions Options) {
+ return new MemorySanitizerLegacyPass(Options);
+}
+
+/// Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. The runtime uses the first 4 bytes of the string to store
+/// the frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+ StringRef Str) {
+ Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+ return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+ GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+/// Create KMSAN API callbacks.
+void MemorySanitizer::createKernelApi(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // These will be initialized in insertKmsanPrologue().
+ RetvalTLS = nullptr;
+ RetvalOriginTLS = nullptr;
+ ParamTLS = nullptr;
+ ParamOriginTLS = nullptr;
+ VAArgTLS = nullptr;
+ VAArgOriginTLS = nullptr;
+ VAArgOverflowSizeTLS = nullptr;
+
+ WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
+ IRB.getInt32Ty());
+ // Requests the per-task context state (kmsan_context_state*) from the
+ // runtime library.
+ MsanContextStateTy = StructType::get(
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8),
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8), /* va_arg_origin */
+ IRB.getInt64Ty(), ArrayType::get(OriginTy, kParamTLSSize / 4), OriginTy,
+ OriginTy);
+ MsanGetContextStateFn = M.getOrInsertFunction(
+ "__msan_get_context_state", PointerType::get(MsanContextStateTy, 0));
+
+ Type *RetTy = StructType::get(PointerType::get(IRB.getInt8Ty(), 0),
+ PointerType::get(IRB.getInt32Ty(), 0));
+
+ for (int ind = 0, size = 1; ind < 4; ind++, size <<= 1) {
+ std::string name_load =
+ "__msan_metadata_ptr_for_load_" + std::to_string(size);
+ std::string name_store =
+ "__msan_metadata_ptr_for_store_" + std::to_string(size);
+ MsanMetadataPtrForLoad_1_8[ind] = M.getOrInsertFunction(
+ name_load, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+ MsanMetadataPtrForStore_1_8[ind] = M.getOrInsertFunction(
+ name_store, RetTy, PointerType::get(IRB.getInt8Ty(), 0));
+ }
+
+ MsanMetadataPtrForLoadN = M.getOrInsertFunction(
+ "__msan_metadata_ptr_for_load_n", RetTy,
+ PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+ MsanMetadataPtrForStoreN = M.getOrInsertFunction(
+ "__msan_metadata_ptr_for_store_n", RetTy,
+ PointerType::get(IRB.getInt8Ty(), 0), IRB.getInt64Ty());
+
+ // Functions for poisoning and unpoisoning memory.
+ MsanPoisonAllocaFn =
+ M.getOrInsertFunction("__msan_poison_alloca", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy, IRB.getInt8PtrTy());
+ MsanUnpoisonAllocaFn = M.getOrInsertFunction(
+ "__msan_unpoison_alloca", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy);
+}
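// Illustrative sketch (an assumption about the runtime side, not part of this
// diff): MsanContextStateTy mirrors a per-task structure provided by the KMSAN
// runtime. Field names below follow the GEP names used in
// insertKmsanPrologue(); the exact kernel-side definition may differ.
//
//   struct kmsan_context_state {
//     uint64_t param_tls[kParamTLSSize / 8];
//     uint64_t retval_tls[kRetvalTLSSize / 8];
//     uint64_t va_arg_tls[kParamTLSSize / 8];
//     uint64_t va_arg_origin_tls[kParamTLSSize / 8];
//     uint64_t va_arg_overflow_size_tls;
//     uint32_t param_origin_tls[kParamTLSSize / 4];  // OriginTy == i32
//     uint32_t retval_origin_tls;
//     uint32_t origin_tls;
//   };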
+
+static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
+ return M.getOrInsertGlobal(Name, Ty, [&] {
+ return new GlobalVariable(M, Ty, false, GlobalVariable::ExternalLinkage,
+ nullptr, Name, nullptr,
+ GlobalVariable::InitialExecTLSModel);
+ });
+}
+
+/// Insert declarations for userspace-specific functions and globals.
+void MemorySanitizer::createUserspaceApi(Module &M) {
+ IRBuilder<> IRB(*C);
+
+ // Create the callback.
+ // FIXME: this function should have "Cold" calling conv,
+ // which is not yet implemented.
+ StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
+ : "__msan_warning_with_origin_noreturn";
+ WarningFn =
+ M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), IRB.getInt32Ty());
+
+ // Create the global TLS variables.
+ RetvalTLS =
+ getOrInsertGlobal(M, "__msan_retval_tls",
+ ArrayType::get(IRB.getInt64Ty(), kRetvalTLSSize / 8));
+
+ RetvalOriginTLS = getOrInsertGlobal(M, "__msan_retval_origin_tls", OriginTy);
+
+ ParamTLS =
+ getOrInsertGlobal(M, "__msan_param_tls",
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+ ParamOriginTLS =
+ getOrInsertGlobal(M, "__msan_param_origin_tls",
+ ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+ VAArgTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_tls",
+ ArrayType::get(IRB.getInt64Ty(), kParamTLSSize / 8));
+
+ VAArgOriginTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_origin_tls",
+ ArrayType::get(OriginTy, kParamTLSSize / 4));
+
+ VAArgOverflowSizeTLS =
+ getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
+
+ for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+ AccessSizeIndex++) {
+ unsigned AccessSize = 1 << AccessSizeIndex;
+ std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeWarningFnAttrs;
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 1, Attribute::get(*C, Attribute::ZExt)));
+ MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, AttributeList::get(*C, MaybeWarningFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt32Ty());
+
+ FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeStoreOriginFnAttrs;
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 2, Attribute::get(*C, Attribute::ZExt)));
+ MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
+ FunctionName, AttributeList::get(*C, MaybeStoreOriginFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty());
+ }
+
+ MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
+ "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+ IRB.getInt8PtrTy(), IntptrTy);
+ MsanPoisonStackFn =
+ M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+}
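// Illustrative sketch: the C prototypes implied by the declarations above,
// inferred from the getOrInsertFunction() calls rather than quoted from a
// libmsan header; parameter names are assumptions.
//
//   void __msan_warning_with_origin(uint32_t origin);
//   void __msan_warning_with_origin_noreturn(uint32_t origin);
//   void __msan_maybe_warning_1(uint8_t shadow, uint32_t origin);
//   // ...likewise _2/_4/_8 with uint16_t/uint32_t/uint64_t shadow...
//   void __msan_maybe_store_origin_1(uint8_t shadow, void *addr, uint32_t origin);
//   void __msan_set_alloca_origin4(void *addr, uintptr_t size, void *descr, uintptr_t pc);
//   void __msan_poison_stack(void *addr, uintptr_t size);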
+
+/// Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+ // Only do this once.
+ if (CallbacksInitialized)
+ return;
+
+ IRBuilder<> IRB(*C);
+ // Initialize callbacks that are common for kernel and userspace
+ // instrumentation.
+ MsanChainOriginFn = M.getOrInsertFunction(
+ "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
MsanSetOriginFn =
M.getOrInsertFunction("__msan_set_origin", IRB.getVoidTy(),
IRB.getInt8PtrTy(), IntptrTy, IRB.getInt32Ty());
- MemmoveFn = M.getOrInsertFunction(
- "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IntptrTy);
- MemcpyFn = M.getOrInsertFunction(
- "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
- IntptrTy);
- MemsetFn = M.getOrInsertFunction(
- "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
- IntptrTy);
-
- MsanInstrumentAsmStoreFn =
- M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
- PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
-
- if (CompileKernel) {
- createKernelApi(M);
- } else {
- createUserspaceApi(M);
- }
- CallbacksInitialized = true;
-}
-
-FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
- int size) {
- FunctionCallee *Fns =
- isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
- switch (size) {
- case 1:
- return Fns[0];
- case 2:
- return Fns[1];
- case 4:
- return Fns[2];
- case 8:
- return Fns[3];
- default:
- return nullptr;
- }
-}
-
-/// Module-level initialization.
-///
-/// Inserts a call to __msan_init into the module's constructor list.
-void MemorySanitizer::initializeModule(Module &M) {
- auto &DL = M.getDataLayout();
-
- bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
- bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
- // Check the overrides first
- if (ShadowPassed || OriginPassed) {
- CustomMapParams.AndMask = ClAndMask;
- CustomMapParams.XorMask = ClXorMask;
- CustomMapParams.ShadowBase = ClShadowBase;
- CustomMapParams.OriginBase = ClOriginBase;
- MapParams = &CustomMapParams;
- } else {
- Triple TargetTriple(M.getTargetTriple());
- switch (TargetTriple.getOS()) {
- case Triple::FreeBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = FreeBSD_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = FreeBSD_X86_MemoryMapParams.bits32;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::NetBSD:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = NetBSD_X86_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- case Triple::Linux:
- switch (TargetTriple.getArch()) {
- case Triple::x86_64:
- MapParams = Linux_X86_MemoryMapParams.bits64;
- break;
- case Triple::x86:
- MapParams = Linux_X86_MemoryMapParams.bits32;
- break;
- case Triple::mips64:
- case Triple::mips64el:
- MapParams = Linux_MIPS_MemoryMapParams.bits64;
- break;
- case Triple::ppc64:
- case Triple::ppc64le:
- MapParams = Linux_PowerPC_MemoryMapParams.bits64;
- break;
- case Triple::systemz:
- MapParams = Linux_S390_MemoryMapParams.bits64;
- break;
- case Triple::aarch64:
- case Triple::aarch64_be:
- MapParams = Linux_ARM_MemoryMapParams.bits64;
- break;
- default:
- report_fatal_error("unsupported architecture");
- }
- break;
- default:
- report_fatal_error("unsupported operating system");
- }
- }
-
- C = &(M.getContext());
- IRBuilder<> IRB(*C);
- IntptrTy = IRB.getIntPtrTy(DL);
- OriginTy = IRB.getInt32Ty();
-
- ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
- OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
-
- if (!CompileKernel) {
- if (TrackOrigins)
- M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] {
- return new GlobalVariable(
- M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
- IRB.getInt32(TrackOrigins), "__msan_track_origins");
- });
-
- if (Recover)
- M.getOrInsertGlobal("__msan_keep_going", IRB.getInt32Ty(), [&] {
- return new GlobalVariable(M, IRB.getInt32Ty(), true,
- GlobalValue::WeakODRLinkage,
- IRB.getInt32(Recover), "__msan_keep_going");
- });
-  }
-}
-
-bool MemorySanitizerLegacyPass::doInitialization(Module &M) {
- if (!Options.Kernel)
- insertModuleCtor(M);
- MSan.emplace(M, Options);
- return true;
-}
-
-namespace {
-
-/// A helper class that handles instrumentation of VarArg
-/// functions on a particular platform.
-///
-/// Implementations are expected to insert the instrumentation
-/// necessary to propagate argument shadow through VarArg function
-/// calls. Visit* methods are called during an InstVisitor pass over
-/// the function, and should avoid creating new basic blocks. A new
-/// instance of this class is created for each instrumented function.
-struct VarArgHelper {
- virtual ~VarArgHelper() = default;
-
- /// Visit a CallBase.
- virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = 0;
-
- /// Visit a va_start call.
- virtual void visitVAStartInst(VAStartInst &I) = 0;
-
- /// Visit a va_copy call.
- virtual void visitVACopyInst(VACopyInst &I) = 0;
-
- /// Finalize function instrumentation.
- ///
- /// This method is called after visiting all interesting (see above)
- /// instructions in a function.
- virtual void finalizeInstrumentation() = 0;
-};
-
-struct MemorySanitizerVisitor;
-
-} // end anonymous namespace
-
-static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
- MemorySanitizerVisitor &Visitor);
-
-static unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
- if (TypeSize <= 8) return 0;
- return Log2_32_Ceil((TypeSize + 7) / 8);
-}
-
-namespace {
-
-/// This class does all the work for a given function. Store and Load
-/// instructions store and load corresponding shadow and origin
-/// values. Most instructions propagate shadow from arguments to their
-/// return values. Certain instructions (most importantly, BranchInst)
-/// test their argument shadow and print reports (with a runtime call) if it's
-/// non-zero.
-struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
- Function &F;
- MemorySanitizer &MS;
- SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
- ValueMap<Value*, Value*> ShadowMap, OriginMap;
- std::unique_ptr<VarArgHelper> VAHelper;
- const TargetLibraryInfo *TLI;
+ MemmoveFn = M.getOrInsertFunction(
+ "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IntptrTy);
+ MemcpyFn = M.getOrInsertFunction(
+ "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+ IntptrTy);
+ MemsetFn = M.getOrInsertFunction(
+ "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+ IntptrTy);
+
+ MsanInstrumentAsmStoreFn =
+ M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
+ PointerType::get(IRB.getInt8Ty(), 0), IntptrTy);
+
+ if (CompileKernel) {
+ createKernelApi(M);
+ } else {
+ createUserspaceApi(M);
+ }
+ CallbacksInitialized = true;
+}
+
+FunctionCallee MemorySanitizer::getKmsanShadowOriginAccessFn(bool isStore,
+ int size) {
+ FunctionCallee *Fns =
+ isStore ? MsanMetadataPtrForStore_1_8 : MsanMetadataPtrForLoad_1_8;
+ switch (size) {
+ case 1:
+ return Fns[0];
+ case 2:
+ return Fns[1];
+ case 4:
+ return Fns[2];
+ case 8:
+ return Fns[3];
+ default:
+ return nullptr;
+ }
+}
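// Illustrative usage (hypothetical call sites):
//   FunctionCallee F4 = getKmsanShadowOriginAccessFn(/*isStore=*/true, 4);
//   // F4 is __msan_metadata_ptr_for_store_4 (slot [2] of the _1_8 array).
//   FunctionCallee F3 = getKmsanShadowOriginAccessFn(/*isStore=*/false, 3);
//   // F3 is a null callee; getShadowOriginPtrKernel() then falls back to
//   // __msan_metadata_ptr_for_load_n with an explicit size argument.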
+
+/// Module-level initialization.
+///
+/// Inserts a call to __msan_init into the module's constructor list.
+void MemorySanitizer::initializeModule(Module &M) {
+ auto &DL = M.getDataLayout();
+
+ bool ShadowPassed = ClShadowBase.getNumOccurrences() > 0;
+ bool OriginPassed = ClOriginBase.getNumOccurrences() > 0;
+ // Check the overrides first
+ if (ShadowPassed || OriginPassed) {
+ CustomMapParams.AndMask = ClAndMask;
+ CustomMapParams.XorMask = ClXorMask;
+ CustomMapParams.ShadowBase = ClShadowBase;
+ CustomMapParams.OriginBase = ClOriginBase;
+ MapParams = &CustomMapParams;
+ } else {
+ Triple TargetTriple(M.getTargetTriple());
+ switch (TargetTriple.getOS()) {
+ case Triple::FreeBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = FreeBSD_X86_MemoryMapParams.bits32;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::NetBSD:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = NetBSD_X86_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ case Triple::Linux:
+ switch (TargetTriple.getArch()) {
+ case Triple::x86_64:
+ MapParams = Linux_X86_MemoryMapParams.bits64;
+ break;
+ case Triple::x86:
+ MapParams = Linux_X86_MemoryMapParams.bits32;
+ break;
+ case Triple::mips64:
+ case Triple::mips64el:
+ MapParams = Linux_MIPS_MemoryMapParams.bits64;
+ break;
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ MapParams = Linux_PowerPC_MemoryMapParams.bits64;
+ break;
+ case Triple::systemz:
+ MapParams = Linux_S390_MemoryMapParams.bits64;
+ break;
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ MapParams = Linux_ARM_MemoryMapParams.bits64;
+ break;
+ default:
+ report_fatal_error("unsupported architecture");
+ }
+ break;
+ default:
+ report_fatal_error("unsupported operating system");
+ }
+ }
+
+ C = &(M.getContext());
+ IRBuilder<> IRB(*C);
+ IntptrTy = IRB.getIntPtrTy(DL);
+ OriginTy = IRB.getInt32Ty();
+
+ ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+ OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+ if (!CompileKernel) {
+ if (TrackOrigins)
+ M.getOrInsertGlobal("__msan_track_origins", IRB.getInt32Ty(), [&] {
+ return new GlobalVariable(
+ M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+ IRB.getInt32(TrackOrigins), "__msan_track_origins");
+ });
+
+ if (Recover)
+ M.getOrInsertGlobal("__msan_keep_going", IRB.getInt32Ty(), [&] {
+ return new GlobalVariable(M, IRB.getInt32Ty(), true,
+ GlobalValue::WeakODRLinkage,
+ IRB.getInt32(Recover), "__msan_keep_going");
+ });
+  }
+}
+
+bool MemorySanitizerLegacyPass::doInitialization(Module &M) {
+ if (!Options.Kernel)
+ insertModuleCtor(M);
+ MSan.emplace(M, Options);
+ return true;
+}
+
+namespace {
+
+/// A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+ virtual ~VarArgHelper() = default;
+
+ /// Visit a CallBase.
+ virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = 0;
+
+ /// Visit a va_start call.
+ virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+ /// Visit a va_copy call.
+ virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+ /// Finalize function instrumentation.
+ ///
+ /// This method is called after visiting all interesting (see above)
+ /// instructions in a function.
+ virtual void finalizeInstrumentation() = 0;
+};
+
+struct MemorySanitizerVisitor;
+
+} // end anonymous namespace
+
+static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor);
+
+static unsigned TypeSizeToSizeIndex(unsigned TypeSize) {
+ if (TypeSize <= 8) return 0;
+ return Log2_32_Ceil((TypeSize + 7) / 8);
+}
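// Sample values of TypeSizeToSizeIndex (bits -> index): 1..8 -> 0, 9..16 -> 1,
// 17..32 -> 2, 33..64 -> 3, 65..128 -> 4. Only indices below
// kNumberOfAccessSizes get the dedicated __msan_maybe_* callbacks declared in
// createUserspaceApi(); larger shadows are checked with an inline branch.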
+
+namespace {
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+ Function &F;
+ MemorySanitizer &MS;
+ SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+ ValueMap<Value*, Value*> ShadowMap, OriginMap;
+ std::unique_ptr<VarArgHelper> VAHelper;
+ const TargetLibraryInfo *TLI;
Instruction *FnPrologueEnd;
-
- // The following flags disable parts of MSan instrumentation based on
- // exclusion list contents and command-line options.
- bool InsertChecks;
- bool PropagateShadow;
- bool PoisonStack;
- bool PoisonUndef;
-
- struct ShadowOriginAndInsertPoint {
- Value *Shadow;
- Value *Origin;
- Instruction *OrigIns;
-
- ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
- : Shadow(S), Origin(O), OrigIns(I) {}
- };
- SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
- bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics;
- SmallSet<AllocaInst *, 16> AllocaSet;
- SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList;
- SmallVector<StoreInst *, 16> StoreList;
-
- MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
- const TargetLibraryInfo &TLI)
- : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) {
- bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
- InsertChecks = SanitizeFunction;
- PropagateShadow = SanitizeFunction;
- PoisonStack = SanitizeFunction && ClPoisonStack;
- PoisonUndef = SanitizeFunction && ClPoisonUndef;
-
+
+ // The following flags disable parts of MSan instrumentation based on
+ // exclusion list contents and command-line options.
+ bool InsertChecks;
+ bool PropagateShadow;
+ bool PoisonStack;
+ bool PoisonUndef;
+
+ struct ShadowOriginAndInsertPoint {
+ Value *Shadow;
+ Value *Origin;
+ Instruction *OrigIns;
+
+ ShadowOriginAndInsertPoint(Value *S, Value *O, Instruction *I)
+ : Shadow(S), Origin(O), OrigIns(I) {}
+ };
+ SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+ bool InstrumentLifetimeStart = ClHandleLifetimeIntrinsics;
+ SmallSet<AllocaInst *, 16> AllocaSet;
+ SmallVector<std::pair<IntrinsicInst *, AllocaInst *>, 16> LifetimeStartList;
+ SmallVector<StoreInst *, 16> StoreList;
+
+ MemorySanitizerVisitor(Function &F, MemorySanitizer &MS,
+ const TargetLibraryInfo &TLI)
+ : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)), TLI(&TLI) {
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeMemory);
+ InsertChecks = SanitizeFunction;
+ PropagateShadow = SanitizeFunction;
+ PoisonStack = SanitizeFunction && ClPoisonStack;
+ PoisonUndef = SanitizeFunction && ClPoisonUndef;
+
// In the presence of unreachable blocks, we may see Phi nodes with
// incoming nodes from such blocks. Since InstVisitor skips unreachable
// blocks, such nodes will not have any shadow value associated with them.
// It's easier to remove unreachable blocks than deal with missing shadow.
removeUnreachableBlocks(F);
- MS.initializeCallbacks(*F.getParent());
+ MS.initializeCallbacks(*F.getParent());
FnPrologueEnd = IRBuilder<>(F.getEntryBlock().getFirstNonPHI())
.CreateIntrinsic(Intrinsic::donothing, {}, {});
-
+
if (MS.CompileKernel) {
IRBuilder<> IRB(FnPrologueEnd);
insertKmsanPrologue(IRB);
}
- LLVM_DEBUG(if (!InsertChecks) dbgs()
- << "MemorySanitizer is not inserting checks into '"
- << F.getName() << "'\n");
- }
-
+ LLVM_DEBUG(if (!InsertChecks) dbgs()
+ << "MemorySanitizer is not inserting checks into '"
+ << F.getName() << "'\n");
+ }
+
bool isInPrologue(Instruction &I) {
return I.getParent() == FnPrologueEnd->getParent() &&
(&I == FnPrologueEnd || I.comesBefore(FnPrologueEnd));
}
- Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
- if (MS.TrackOrigins <= 1) return V;
- return IRB.CreateCall(MS.MsanChainOriginFn, V);
- }
-
- Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
- if (IntptrSize == kOriginSize) return Origin;
- assert(IntptrSize == kOriginSize * 2);
- Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
- return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
- }
-
- /// Fill memory range with the given origin value.
- void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
- unsigned Size, Align Alignment) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy);
- unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
- assert(IntptrAlignment >= kMinOriginAlignment);
- assert(IntptrSize >= kOriginSize);
-
- unsigned Ofs = 0;
- Align CurrentAlignment = Alignment;
- if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
- Value *IntptrOrigin = originToIntptr(IRB, Origin);
- Value *IntptrOriginPtr =
- IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
- for (unsigned i = 0; i < Size / IntptrSize; ++i) {
- Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
- : IntptrOriginPtr;
- IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
- Ofs += IntptrSize / kOriginSize;
- CurrentAlignment = IntptrAlignment;
- }
- }
-
- for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
- Value *GEP =
- i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr;
- IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
- CurrentAlignment = kMinOriginAlignment;
- }
- }
-
- void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
- Value *OriginPtr, Align Alignment, bool AsCall) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ Value *updateOrigin(Value *V, IRBuilder<> &IRB) {
+ if (MS.TrackOrigins <= 1) return V;
+ return IRB.CreateCall(MS.MsanChainOriginFn, V);
+ }
+
+ Value *originToIntptr(IRBuilder<> &IRB, Value *Origin) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ if (IntptrSize == kOriginSize) return Origin;
+ assert(IntptrSize == kOriginSize * 2);
+ Origin = IRB.CreateIntCast(Origin, MS.IntptrTy, /* isSigned */ false);
+ return IRB.CreateOr(Origin, IRB.CreateShl(Origin, kOriginSize * 8));
+ }
+
+ /// Fill memory range with the given origin value.
+ void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
+ unsigned Size, Align Alignment) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy);
+ unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
+ assert(IntptrAlignment >= kMinOriginAlignment);
+ assert(IntptrSize >= kOriginSize);
+
+ unsigned Ofs = 0;
+ Align CurrentAlignment = Alignment;
+ if (Alignment >= IntptrAlignment && IntptrSize > kOriginSize) {
+ Value *IntptrOrigin = originToIntptr(IRB, Origin);
+ Value *IntptrOriginPtr =
+ IRB.CreatePointerCast(OriginPtr, PointerType::get(MS.IntptrTy, 0));
+ for (unsigned i = 0; i < Size / IntptrSize; ++i) {
+ Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
+ : IntptrOriginPtr;
+ IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
+ Ofs += IntptrSize / kOriginSize;
+ CurrentAlignment = IntptrAlignment;
+ }
+ }
+
+ for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
+ Value *GEP =
+ i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr;
+ IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
+ CurrentAlignment = kMinOriginAlignment;
+ }
+ }
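  // Worked example (64-bit target, so IntptrSize == 8): painting a 12-byte,
  // 8-aligned origin range emits one 8-byte store of the origin value packed
  // twice (via originToIntptr) followed by one 4-byte store for the tail, so
  // all three 4-byte origin slots are covered with the minimum number of
  // stores.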
+
+ void storeOrigin(IRBuilder<> &IRB, Value *Addr, Value *Shadow, Value *Origin,
+ Value *OriginPtr, Align Alignment, bool AsCall) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB);
if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
if (ClCheckConstantShadow && !ConstantShadow->isZeroValue())
paintOrigin(IRB, updateOrigin(Origin, IRB), OriginPtr, StoreSize,
- OriginAlignment);
+ OriginAlignment);
return;
- }
+ }
unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
@@ -1189,206 +1189,206 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
paintOrigin(IRBNew, updateOrigin(Origin, IRBNew), OriginPtr, StoreSize,
OriginAlignment);
}
- }
-
- void materializeStores(bool InstrumentWithCalls) {
- for (StoreInst *SI : StoreList) {
- IRBuilder<> IRB(SI);
- Value *Val = SI->getValueOperand();
- Value *Addr = SI->getPointerOperand();
- Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
- Value *ShadowPtr, *OriginPtr;
- Type *ShadowTy = Shadow->getType();
- const Align Alignment = assumeAligned(SI->getAlignment());
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
-
- StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
- LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
- (void)NewSI;
-
- if (SI->isAtomic())
- SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
-
- if (MS.TrackOrigins && !SI->isAtomic())
- storeOrigin(IRB, Addr, Shadow, getOrigin(Val), OriginPtr,
- OriginAlignment, InstrumentWithCalls);
- }
- }
-
- /// Helper function to insert a warning at IRB's current insert point.
- void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
- if (!Origin)
- Origin = (Value *)IRB.getInt32(0);
- assert(Origin->getType()->isIntegerTy());
- IRB.CreateCall(MS.WarningFn, Origin)->setCannotMerge();
- // FIXME: Insert UnreachableInst if !MS.Recover?
- // This may invalidate some of the following checks and needs to be done
- // at the very end.
- }
-
- void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
- bool AsCall) {
- IRBuilder<> IRB(OrigIns);
- LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
+ }
+
+ void materializeStores(bool InstrumentWithCalls) {
+ for (StoreInst *SI : StoreList) {
+ IRBuilder<> IRB(SI);
+ Value *Val = SI->getValueOperand();
+ Value *Addr = SI->getPointerOperand();
+ Value *Shadow = SI->isAtomic() ? getCleanShadow(Val) : getShadow(Val);
+ Value *ShadowPtr, *OriginPtr;
+ Type *ShadowTy = Shadow->getType();
+ const Align Alignment = assumeAligned(SI->getAlignment());
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
+
+ StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
+ LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
+ (void)NewSI;
+
+ if (SI->isAtomic())
+ SI->setOrdering(addReleaseOrdering(SI->getOrdering()));
+
+ if (MS.TrackOrigins && !SI->isAtomic())
+ storeOrigin(IRB, Addr, Shadow, getOrigin(Val), OriginPtr,
+ OriginAlignment, InstrumentWithCalls);
+ }
+ }
+
+ /// Helper function to insert a warning at IRB's current insert point.
+ void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
+ if (!Origin)
+ Origin = (Value *)IRB.getInt32(0);
+ assert(Origin->getType()->isIntegerTy());
+ IRB.CreateCall(MS.WarningFn, Origin)->setCannotMerge();
+ // FIXME: Insert UnreachableInst if !MS.Recover?
+ // This may invalidate some of the following checks and needs to be done
+ // at the very end.
+ }
+
+ void materializeOneCheck(Instruction *OrigIns, Value *Shadow, Value *Origin,
+ bool AsCall) {
+ IRBuilder<> IRB(OrigIns);
+ LLVM_DEBUG(dbgs() << " SHAD0 : " << *Shadow << "\n");
Value *ConvertedShadow = convertShadowToScalar(Shadow, IRB);
- LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
-
- if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
- if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
- insertWarningFn(IRB, Origin);
- }
- return;
- }
-
- const DataLayout &DL = OrigIns->getModule()->getDataLayout();
-
- unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
- unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
- if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) {
- FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex];
- Value *ConvertedShadow2 =
- IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
- IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin
- ? Origin
- : (Value *)IRB.getInt32(0)});
- } else {
+ LLVM_DEBUG(dbgs() << " SHAD1 : " << *ConvertedShadow << "\n");
+
+ if (auto *ConstantShadow = dyn_cast<Constant>(ConvertedShadow)) {
+ if (ClCheckConstantShadow && !ConstantShadow->isZeroValue()) {
+ insertWarningFn(IRB, Origin);
+ }
+ return;
+ }
+
+ const DataLayout &DL = OrigIns->getModule()->getDataLayout();
+
+ unsigned TypeSizeInBits = DL.getTypeSizeInBits(ConvertedShadow->getType());
+ unsigned SizeIndex = TypeSizeToSizeIndex(TypeSizeInBits);
+ if (AsCall && SizeIndex < kNumberOfAccessSizes && !MS.CompileKernel) {
+ FunctionCallee Fn = MS.MaybeWarningFn[SizeIndex];
+ Value *ConvertedShadow2 =
+ IRB.CreateZExt(ConvertedShadow, IRB.getIntNTy(8 * (1 << SizeIndex)));
+ IRB.CreateCall(Fn, {ConvertedShadow2, MS.TrackOrigins && Origin
+ ? Origin
+ : (Value *)IRB.getInt32(0)});
+ } else {
Value *Cmp = convertToBool(ConvertedShadow, IRB, "_mscmp");
- Instruction *CheckTerm = SplitBlockAndInsertIfThen(
- Cmp, OrigIns,
- /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
-
- IRB.SetInsertPoint(CheckTerm);
- insertWarningFn(IRB, Origin);
- LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
- }
- }
-
- void materializeChecks(bool InstrumentWithCalls) {
- for (const auto &ShadowData : InstrumentationList) {
- Instruction *OrigIns = ShadowData.OrigIns;
- Value *Shadow = ShadowData.Shadow;
- Value *Origin = ShadowData.Origin;
- materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
- }
- LLVM_DEBUG(dbgs() << "DONE:\n" << F);
- }
-
+ Instruction *CheckTerm = SplitBlockAndInsertIfThen(
+ Cmp, OrigIns,
+ /* Unreachable */ !MS.Recover, MS.ColdCallWeights);
+
+ IRB.SetInsertPoint(CheckTerm);
+ insertWarningFn(IRB, Origin);
+ LLVM_DEBUG(dbgs() << " CHECK: " << *Cmp << "\n");
+ }
+ }
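  // Illustrative sketch of the non-call path above (value names assumed; the
  // userspace warning callee is shown, kernel builds call __msan_warning):
  //
  //   %_mscmp = icmp ne iN %converted_shadow, 0
  //   br i1 %_mscmp, label %warn, label %cont, !prof !cold_weights
  // warn:
  //   call void @__msan_warning_with_origin_noreturn(i32 %origin)
  //   unreachable            ; only when !MS.Recover, otherwise branch to %cont
  // cont:
  //   ; original instruction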
+
+ void materializeChecks(bool InstrumentWithCalls) {
+ for (const auto &ShadowData : InstrumentationList) {
+ Instruction *OrigIns = ShadowData.OrigIns;
+ Value *Shadow = ShadowData.Shadow;
+ Value *Origin = ShadowData.Origin;
+ materializeOneCheck(OrigIns, Shadow, Origin, InstrumentWithCalls);
+ }
+ LLVM_DEBUG(dbgs() << "DONE:\n" << F);
+ }
+
  // Set up the KMSAN prologue: fetch the per-task context state and point the
  // TLS members at its fields.
void insertKmsanPrologue(IRBuilder<> &IRB) {
- Value *ContextState = IRB.CreateCall(MS.MsanGetContextStateFn, {});
- Constant *Zero = IRB.getInt32(0);
- MS.ParamTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(0)}, "param_shadow");
- MS.RetvalTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(1)}, "retval_shadow");
- MS.VAArgTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(2)}, "va_arg_shadow");
- MS.VAArgOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(3)}, "va_arg_origin");
- MS.VAArgOverflowSizeTLS =
- IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(4)}, "va_arg_overflow_size");
- MS.ParamOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(5)}, "param_origin");
- MS.RetvalOriginTLS =
- IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
- {Zero, IRB.getInt32(6)}, "retval_origin");
- }
-
- /// Add MemorySanitizer instrumentation to a function.
- bool runOnFunction() {
- // Iterate all BBs in depth-first order and create shadow instructions
- // for all instructions (where applicable).
- // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+ Value *ContextState = IRB.CreateCall(MS.MsanGetContextStateFn, {});
+ Constant *Zero = IRB.getInt32(0);
+ MS.ParamTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(0)}, "param_shadow");
+ MS.RetvalTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(1)}, "retval_shadow");
+ MS.VAArgTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(2)}, "va_arg_shadow");
+ MS.VAArgOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(3)}, "va_arg_origin");
+ MS.VAArgOverflowSizeTLS =
+ IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(4)}, "va_arg_overflow_size");
+ MS.ParamOriginTLS = IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(5)}, "param_origin");
+ MS.RetvalOriginTLS =
+ IRB.CreateGEP(MS.MsanContextStateTy, ContextState,
+ {Zero, IRB.getInt32(6)}, "retval_origin");
+ }
+
+ /// Add MemorySanitizer instrumentation to a function.
+ bool runOnFunction() {
+ // Iterate all BBs in depth-first order and create shadow instructions
+ // for all instructions (where applicable).
+ // For PHI nodes we create dummy shadow PHIs which will be finalized later.
for (BasicBlock *BB : depth_first(FnPrologueEnd->getParent()))
- visit(*BB);
-
- // Finalize PHI nodes.
- for (PHINode *PN : ShadowPHINodes) {
- PHINode *PNS = cast<PHINode>(getShadow(PN));
- PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
- size_t NumValues = PN->getNumIncomingValues();
- for (size_t v = 0; v < NumValues; v++) {
- PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
- if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
- }
- }
-
- VAHelper->finalizeInstrumentation();
-
- // Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
- // instrumenting only allocas.
- if (InstrumentLifetimeStart) {
- for (auto Item : LifetimeStartList) {
- instrumentAlloca(*Item.second, Item.first);
- AllocaSet.erase(Item.second);
- }
- }
- // Poison the allocas for which we didn't instrument the corresponding
- // lifetime intrinsics.
- for (AllocaInst *AI : AllocaSet)
- instrumentAlloca(*AI);
-
- bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
- InstrumentationList.size() + StoreList.size() >
- (unsigned)ClInstrumentationWithCallThreshold;
-
- // Insert shadow value checks.
- materializeChecks(InstrumentWithCalls);
-
- // Delayed instrumentation of StoreInst.
- // This may not add new address checks.
- materializeStores(InstrumentWithCalls);
-
- return true;
- }
-
- /// Compute the shadow type that corresponds to a given Value.
- Type *getShadowTy(Value *V) {
- return getShadowTy(V->getType());
- }
-
- /// Compute the shadow type that corresponds to a given Type.
- Type *getShadowTy(Type *OrigTy) {
- if (!OrigTy->isSized()) {
- return nullptr;
- }
- // For integer type, shadow is the same as the original type.
- // This may return weird-sized types like i1.
- if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
- return IT;
- const DataLayout &DL = F.getParent()->getDataLayout();
- if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
- uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
- return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
- cast<FixedVectorType>(VT)->getNumElements());
- }
- if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
- return ArrayType::get(getShadowTy(AT->getElementType()),
- AT->getNumElements());
- }
- if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
- SmallVector<Type*, 4> Elements;
- for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
- Elements.push_back(getShadowTy(ST->getElementType(i)));
- StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
- LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
- return Res;
- }
- uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
- return IntegerType::get(*MS.C, TypeSize);
- }
-
- /// Flatten a vector type.
- Type *getShadowTyNoVec(Type *ty) {
- if (VectorType *vt = dyn_cast<VectorType>(ty))
- return IntegerType::get(*MS.C,
- vt->getPrimitiveSizeInBits().getFixedSize());
- return ty;
- }
-
+ visit(*BB);
+
+ // Finalize PHI nodes.
+ for (PHINode *PN : ShadowPHINodes) {
+ PHINode *PNS = cast<PHINode>(getShadow(PN));
+ PHINode *PNO = MS.TrackOrigins ? cast<PHINode>(getOrigin(PN)) : nullptr;
+ size_t NumValues = PN->getNumIncomingValues();
+ for (size_t v = 0; v < NumValues; v++) {
+ PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+ if (PNO) PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+ }
+ }
+
+ VAHelper->finalizeInstrumentation();
+
+ // Poison llvm.lifetime.start intrinsics, if we haven't fallen back to
+ // instrumenting only allocas.
+ if (InstrumentLifetimeStart) {
+ for (auto Item : LifetimeStartList) {
+ instrumentAlloca(*Item.second, Item.first);
+ AllocaSet.erase(Item.second);
+ }
+ }
+ // Poison the allocas for which we didn't instrument the corresponding
+ // lifetime intrinsics.
+ for (AllocaInst *AI : AllocaSet)
+ instrumentAlloca(*AI);
+
+ bool InstrumentWithCalls = ClInstrumentationWithCallThreshold >= 0 &&
+ InstrumentationList.size() + StoreList.size() >
+ (unsigned)ClInstrumentationWithCallThreshold;
+
+ // Insert shadow value checks.
+ materializeChecks(InstrumentWithCalls);
+
+ // Delayed instrumentation of StoreInst.
+ // This may not add new address checks.
+ materializeStores(InstrumentWithCalls);
+
+ return true;
+ }
+
+ /// Compute the shadow type that corresponds to a given Value.
+ Type *getShadowTy(Value *V) {
+ return getShadowTy(V->getType());
+ }
+
+ /// Compute the shadow type that corresponds to a given Type.
+ Type *getShadowTy(Type *OrigTy) {
+ if (!OrigTy->isSized()) {
+ return nullptr;
+ }
+ // For integer type, shadow is the same as the original type.
+ // This may return weird-sized types like i1.
+ if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+ return IT;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
+ uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
+ cast<FixedVectorType>(VT)->getNumElements());
+ }
+ if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
+ return ArrayType::get(getShadowTy(AT->getElementType()),
+ AT->getNumElements());
+ }
+ if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+ SmallVector<Type*, 4> Elements;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Elements.push_back(getShadowTy(ST->getElementType(i)));
+ StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+ LLVM_DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+ return Res;
+ }
+ uint32_t TypeSize = DL.getTypeSizeInBits(OrigTy);
+ return IntegerType::get(*MS.C, TypeSize);
+ }
+
+ /// Flatten a vector type.
+ Type *getShadowTyNoVec(Type *ty) {
+ if (VectorType *vt = dyn_cast<VectorType>(ty))
+ return IntegerType::get(*MS.C,
+ vt->getPrimitiveSizeInBits().getFixedSize());
+ return ty;
+ }
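  // Examples of the mapping above on a 64-bit target: i32 -> i32,
  // <4 x float> -> <4 x i32>, [2 x i64] -> [2 x i64],
  // { i8, i16* } -> { i8, i64 }, double -> i64. getShadowTyNoVec() then
  // flattens vectors for comparisons, e.g. <4 x i32> -> i128.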
+
/// Extract combined shadow of struct elements as a bool
Value *collapseStructShadow(StructType *Struct, Value *Shadow,
IRBuilder<> &IRB) {
@@ -1435,12 +1435,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return collapseStructShadow(Struct, V, IRB);
if (ArrayType *Array = dyn_cast<ArrayType>(V->getType()))
return collapseArrayShadow(Array, V, IRB);
- Type *Ty = V->getType();
- Type *NoVecTy = getShadowTyNoVec(Ty);
- if (Ty == NoVecTy) return V;
- return IRB.CreateBitCast(V, NoVecTy);
- }
-
+ Type *Ty = V->getType();
+ Type *NoVecTy = getShadowTyNoVec(Ty);
+ if (Ty == NoVecTy) return V;
+ return IRB.CreateBitCast(V, NoVecTy);
+ }
+
// Convert a scalar value to an i1 by comparing with 0
Value *convertToBool(Value *V, IRBuilder<> &IRB, const Twine &name = "") {
Type *VTy = V->getType();
@@ -1451,386 +1451,386 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateICmpNE(V, ConstantInt::get(VTy, 0), name);
}
- /// Compute the integer shadow offset that corresponds to a given
- /// application address.
- ///
- /// Offset = (Addr & ~AndMask) ^ XorMask
- Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
- Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
-
- uint64_t AndMask = MS.MapParams->AndMask;
- if (AndMask)
- OffsetLong =
- IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
-
- uint64_t XorMask = MS.MapParams->XorMask;
- if (XorMask)
- OffsetLong =
- IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
- return OffsetLong;
- }
-
- /// Compute the shadow and origin addresses corresponding to a given
- /// application address.
- ///
- /// Shadow = ShadowBase + Offset
- /// Origin = (OriginBase + Offset) & ~3ULL
- std::pair<Value *, Value *>
- getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy,
- MaybeAlign Alignment) {
- Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
- Value *ShadowLong = ShadowOffset;
- uint64_t ShadowBase = MS.MapParams->ShadowBase;
- if (ShadowBase != 0) {
- ShadowLong =
- IRB.CreateAdd(ShadowLong,
- ConstantInt::get(MS.IntptrTy, ShadowBase));
- }
- Value *ShadowPtr =
- IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
- Value *OriginPtr = nullptr;
- if (MS.TrackOrigins) {
- Value *OriginLong = ShadowOffset;
- uint64_t OriginBase = MS.MapParams->OriginBase;
- if (OriginBase != 0)
- OriginLong = IRB.CreateAdd(OriginLong,
- ConstantInt::get(MS.IntptrTy, OriginBase));
- if (!Alignment || *Alignment < kMinOriginAlignment) {
- uint64_t Mask = kMinOriginAlignment.value() - 1;
- OriginLong =
- IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask));
- }
- OriginPtr =
- IRB.CreateIntToPtr(OriginLong, PointerType::get(MS.OriginTy, 0));
- }
- return std::make_pair(ShadowPtr, OriginPtr);
- }
-
- std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr,
- IRBuilder<> &IRB,
- Type *ShadowTy,
- bool isStore) {
- Value *ShadowOriginPtrs;
- const DataLayout &DL = F.getParent()->getDataLayout();
- int Size = DL.getTypeStoreSize(ShadowTy);
-
- FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size);
- Value *AddrCast =
- IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0));
- if (Getter) {
- ShadowOriginPtrs = IRB.CreateCall(Getter, AddrCast);
- } else {
- Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
- ShadowOriginPtrs = IRB.CreateCall(isStore ? MS.MsanMetadataPtrForStoreN
- : MS.MsanMetadataPtrForLoadN,
- {AddrCast, SizeVal});
- }
- Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0);
- ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0));
- Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1);
-
- return std::make_pair(ShadowPtr, OriginPtr);
- }
-
- std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
- Type *ShadowTy,
- MaybeAlign Alignment,
- bool isStore) {
- if (MS.CompileKernel)
- return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore);
- return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
- }
-
- /// Compute the shadow address for a given function argument.
- ///
- /// Shadow = ParamTLS+ArgOffset.
- Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
- int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
- "_msarg");
- }
-
- /// Compute the origin address for a given function argument.
- Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
- int ArgOffset) {
- if (!MS.TrackOrigins)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
- if (ArgOffset)
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_o");
- }
-
- /// Compute the shadow address for a retval.
- Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
- return IRB.CreatePointerCast(MS.RetvalTLS,
- PointerType::get(getShadowTy(A), 0),
- "_msret");
- }
-
- /// Compute the origin address for a retval.
- Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
- // We keep a single origin for the entire retval. Might be too optimistic.
- return MS.RetvalOriginTLS;
- }
-
- /// Set SV to be the shadow value for V.
- void setShadow(Value *V, Value *SV) {
- assert(!ShadowMap.count(V) && "Values may only have one shadow");
- ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
- }
-
- /// Set Origin to be the origin value for V.
- void setOrigin(Value *V, Value *Origin) {
- if (!MS.TrackOrigins) return;
- assert(!OriginMap.count(V) && "Values may only have one origin");
- LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
- OriginMap[V] = Origin;
- }
-
- Constant *getCleanShadow(Type *OrigTy) {
- Type *ShadowTy = getShadowTy(OrigTy);
- if (!ShadowTy)
- return nullptr;
- return Constant::getNullValue(ShadowTy);
- }
-
- /// Create a clean shadow value for a given value.
- ///
- /// Clean shadow (all zeroes) means all bits of the value are defined
- /// (initialized).
- Constant *getCleanShadow(Value *V) {
- return getCleanShadow(V->getType());
- }
-
- /// Create a dirty shadow of a given shadow type.
- Constant *getPoisonedShadow(Type *ShadowTy) {
- assert(ShadowTy);
- if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
- return Constant::getAllOnesValue(ShadowTy);
- if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
- SmallVector<Constant *, 4> Vals(AT->getNumElements(),
- getPoisonedShadow(AT->getElementType()));
- return ConstantArray::get(AT, Vals);
- }
- if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
- SmallVector<Constant *, 4> Vals;
- for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
- Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
- return ConstantStruct::get(ST, Vals);
- }
- llvm_unreachable("Unexpected shadow type");
- }
-
- /// Create a dirty shadow for a given value.
- Constant *getPoisonedShadow(Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (!ShadowTy)
- return nullptr;
- return getPoisonedShadow(ShadowTy);
- }
-
- /// Create a clean (zero) origin.
- Value *getCleanOrigin() {
- return Constant::getNullValue(MS.OriginTy);
- }
-
- /// Get the shadow value for a given Value.
- ///
- /// This function either returns the value set earlier with setShadow,
-  /// or extracts it from ParamTLS (for function arguments).
- Value *getShadow(Value *V) {
- if (!PropagateShadow) return getCleanShadow(V);
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
- return getCleanShadow(V);
- // For instructions the shadow is already stored in the map.
- Value *Shadow = ShadowMap[V];
- if (!Shadow) {
- LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
- (void)I;
- assert(Shadow && "No shadow for a value");
- }
- return Shadow;
- }
- if (UndefValue *U = dyn_cast<UndefValue>(V)) {
- Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
- LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
- (void)U;
- return AllOnes;
- }
- if (Argument *A = dyn_cast<Argument>(V)) {
- // For arguments we compute the shadow on demand and store it in the map.
- Value **ShadowPtr = &ShadowMap[V];
- if (*ShadowPtr)
- return *ShadowPtr;
- Function *F = A->getParent();
+ /// Compute the integer shadow offset that corresponds to a given
+ /// application address.
+ ///
+ /// Offset = (Addr & ~AndMask) ^ XorMask
+ Value *getShadowPtrOffset(Value *Addr, IRBuilder<> &IRB) {
+ Value *OffsetLong = IRB.CreatePointerCast(Addr, MS.IntptrTy);
+
+ uint64_t AndMask = MS.MapParams->AndMask;
+ if (AndMask)
+ OffsetLong =
+ IRB.CreateAnd(OffsetLong, ConstantInt::get(MS.IntptrTy, ~AndMask));
+
+ uint64_t XorMask = MS.MapParams->XorMask;
+ if (XorMask)
+ OffsetLong =
+ IRB.CreateXor(OffsetLong, ConstantInt::get(MS.IntptrTy, XorMask));
+ return OffsetLong;
+ }
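  // Worked example with hypothetical map parameters (AndMask == 0,
  // XorMask == 0x500000000000; the real values come from the platform tables
  // selected in initializeModule()): an application address 0x7fff80001234
  // maps to Offset = 0x7fff80001234 ^ 0x500000000000 = 0x2fff80001234.
  // ShadowBase / OriginBase are applied on top of this in
  // getShadowOriginPtrUserspace() below.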
+
+ /// Compute the shadow and origin addresses corresponding to a given
+ /// application address.
+ ///
+ /// Shadow = ShadowBase + Offset
+ /// Origin = (OriginBase + Offset) & ~3ULL
+ std::pair<Value *, Value *>
+ getShadowOriginPtrUserspace(Value *Addr, IRBuilder<> &IRB, Type *ShadowTy,
+ MaybeAlign Alignment) {
+ Value *ShadowOffset = getShadowPtrOffset(Addr, IRB);
+ Value *ShadowLong = ShadowOffset;
+ uint64_t ShadowBase = MS.MapParams->ShadowBase;
+ if (ShadowBase != 0) {
+ ShadowLong =
+ IRB.CreateAdd(ShadowLong,
+ ConstantInt::get(MS.IntptrTy, ShadowBase));
+ }
+ Value *ShadowPtr =
+ IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+ Value *OriginPtr = nullptr;
+ if (MS.TrackOrigins) {
+ Value *OriginLong = ShadowOffset;
+ uint64_t OriginBase = MS.MapParams->OriginBase;
+ if (OriginBase != 0)
+ OriginLong = IRB.CreateAdd(OriginLong,
+ ConstantInt::get(MS.IntptrTy, OriginBase));
+ if (!Alignment || *Alignment < kMinOriginAlignment) {
+ uint64_t Mask = kMinOriginAlignment.value() - 1;
+ OriginLong =
+ IRB.CreateAnd(OriginLong, ConstantInt::get(MS.IntptrTy, ~Mask));
+ }
+ OriginPtr =
+ IRB.CreateIntToPtr(OriginLong, PointerType::get(MS.OriginTy, 0));
+ }
+ return std::make_pair(ShadowPtr, OriginPtr);
+ }
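  // Continuing the example above: with ShadowBase == 0 the shadow pointer is
  // simply inttoptr(Offset), and with a hypothetical nonzero OriginBase the
  // origin pointer for an under-aligned access is
  // inttoptr((Offset + OriginBase) & ~3ULL), preserving the 4-byte
  // (kMinOriginAlignment) origin granularity.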
+
+ std::pair<Value *, Value *> getShadowOriginPtrKernel(Value *Addr,
+ IRBuilder<> &IRB,
+ Type *ShadowTy,
+ bool isStore) {
+ Value *ShadowOriginPtrs;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ int Size = DL.getTypeStoreSize(ShadowTy);
+
+ FunctionCallee Getter = MS.getKmsanShadowOriginAccessFn(isStore, Size);
+ Value *AddrCast =
+ IRB.CreatePointerCast(Addr, PointerType::get(IRB.getInt8Ty(), 0));
+ if (Getter) {
+ ShadowOriginPtrs = IRB.CreateCall(Getter, AddrCast);
+ } else {
+ Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+ ShadowOriginPtrs = IRB.CreateCall(isStore ? MS.MsanMetadataPtrForStoreN
+ : MS.MsanMetadataPtrForLoadN,
+ {AddrCast, SizeVal});
+ }
+ Value *ShadowPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 0);
+ ShadowPtr = IRB.CreatePointerCast(ShadowPtr, PointerType::get(ShadowTy, 0));
+ Value *OriginPtr = IRB.CreateExtractValue(ShadowOriginPtrs, 1);
+
+ return std::make_pair(ShadowPtr, OriginPtr);
+ }
+
+ std::pair<Value *, Value *> getShadowOriginPtr(Value *Addr, IRBuilder<> &IRB,
+ Type *ShadowTy,
+ MaybeAlign Alignment,
+ bool isStore) {
+ if (MS.CompileKernel)
+ return getShadowOriginPtrKernel(Addr, IRB, ShadowTy, isStore);
+ return getShadowOriginPtrUserspace(Addr, IRB, ShadowTy, Alignment);
+ }
+
+ /// Compute the shadow address for a given function argument.
+ ///
+ /// Shadow = ParamTLS+ArgOffset.
+ Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+ "_msarg");
+ }
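  // Sketch of the ParamTLS layout this indexes into: argument shadows are laid
  // out back to back, each slot padded to kShadowTLSAlignment by the code that
  // computes ArgOffset. E.g. for f(i32, double) on a 64-bit target the first
  // shadow lives at __msan_param_tls + 0 and the second at + 8 (assuming the
  // usual 8-byte TLS slot alignment), and the same byte offsets index
  // __msan_param_origin_tls via getOriginPtrForArgument() below.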
+
+ /// Compute the origin address for a given function argument.
+ Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+ int ArgOffset) {
+ if (!MS.TrackOrigins)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+ if (ArgOffset)
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_o");
+ }
+
+ /// Compute the shadow address for a retval.
+ Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+ return IRB.CreatePointerCast(MS.RetvalTLS,
+ PointerType::get(getShadowTy(A), 0),
+ "_msret");
+ }
+
+ /// Compute the origin address for a retval.
+ Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+ // We keep a single origin for the entire retval. Might be too optimistic.
+ return MS.RetvalOriginTLS;
+ }
+
+ /// Set SV to be the shadow value for V.
+ void setShadow(Value *V, Value *SV) {
+ assert(!ShadowMap.count(V) && "Values may only have one shadow");
+ ShadowMap[V] = PropagateShadow ? SV : getCleanShadow(V);
+ }
+
+ /// Set Origin to be the origin value for V.
+ void setOrigin(Value *V, Value *Origin) {
+ if (!MS.TrackOrigins) return;
+ assert(!OriginMap.count(V) && "Values may only have one origin");
+ LLVM_DEBUG(dbgs() << "ORIGIN: " << *V << " ==> " << *Origin << "\n");
+ OriginMap[V] = Origin;
+ }
+
+ Constant *getCleanShadow(Type *OrigTy) {
+ Type *ShadowTy = getShadowTy(OrigTy);
+ if (!ShadowTy)
+ return nullptr;
+ return Constant::getNullValue(ShadowTy);
+ }
+
+ /// Create a clean shadow value for a given value.
+ ///
+ /// Clean shadow (all zeroes) means all bits of the value are defined
+ /// (initialized).
+ Constant *getCleanShadow(Value *V) {
+ return getCleanShadow(V->getType());
+ }
+
+ /// Create a dirty shadow of a given shadow type.
+ Constant *getPoisonedShadow(Type *ShadowTy) {
+ assert(ShadowTy);
+ if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+ return Constant::getAllOnesValue(ShadowTy);
+ if (ArrayType *AT = dyn_cast<ArrayType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals(AT->getNumElements(),
+ getPoisonedShadow(AT->getElementType()));
+ return ConstantArray::get(AT, Vals);
+ }
+ if (StructType *ST = dyn_cast<StructType>(ShadowTy)) {
+ SmallVector<Constant *, 4> Vals;
+ for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+ Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+ return ConstantStruct::get(ST, Vals);
+ }
+ llvm_unreachable("Unexpected shadow type");
+ }
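+
+ // For example, for a shadow type { i32, [2 x i8] } the recursion above
+ // produces the constant { i32 -1, [2 x i8] [i8 -1, i8 -1] }: every scalar
+ // leaf of the aggregate ends up with all bits set (fully poisoned).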
+
+ /// Create a dirty shadow for a given value.
+ Constant *getPoisonedShadow(Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (!ShadowTy)
+ return nullptr;
+ return getPoisonedShadow(ShadowTy);
+ }
+
+ /// Create a clean (zero) origin.
+ Value *getCleanOrigin() {
+ return Constant::getNullValue(MS.OriginTy);
+ }
+
+ /// Get the shadow value for a given Value.
+ ///
+ /// This function either returns the value set earlier with setShadow,
+ /// or extracts it from ParamTLS (for function arguments).
+ Value *getShadow(Value *V) {
+ if (!PropagateShadow) return getCleanShadow(V);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getMetadata("nosanitize"))
+ return getCleanShadow(V);
+ // For instructions the shadow is already stored in the map.
+ Value *Shadow = ShadowMap[V];
+ if (!Shadow) {
+ LLVM_DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+ (void)I;
+ assert(Shadow && "No shadow for a value");
+ }
+ return Shadow;
+ }
+ if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+ Value *AllOnes = PoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V);
+ LLVM_DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+ (void)U;
+ return AllOnes;
+ }
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ // For arguments we compute the shadow on demand and store it in the map.
+ Value **ShadowPtr = &ShadowMap[V];
+ if (*ShadowPtr)
+ return *ShadowPtr;
+ Function *F = A->getParent();
IRBuilder<> EntryIRB(FnPrologueEnd);
- unsigned ArgOffset = 0;
- const DataLayout &DL = F->getParent()->getDataLayout();
- for (auto &FArg : F->args()) {
- if (!FArg.getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg is not sized\n");
- continue;
- }
-
- bool FArgByVal = FArg.hasByValAttr();
- bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
- bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
- unsigned Size =
- FArg.hasByValAttr()
- ? DL.getTypeAllocSize(FArg.getParamByValType())
- : DL.getTypeAllocSize(FArg.getType());
-
- if (A == &FArg) {
- bool Overflow = ArgOffset + Size > kParamTLSSize;
- if (FArgEagerCheck) {
- *ShadowPtr = getCleanShadow(V);
- setOrigin(A, getCleanOrigin());
- continue;
- } else if (FArgByVal) {
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- // ByVal pointer itself has clean shadow. We copy the actual
- // argument shadow to the underlying memory.
- // Figure out maximal valid memcpy alignment.
- const Align ArgAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(FArg.getParamAlignment()), FArg.getParamByValType());
- Value *CpShadowPtr =
- getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
- /*isStore*/ true)
- .first;
- // TODO(glider): need to copy origins.
- if (Overflow) {
- // ParamTLS overflow.
- EntryIRB.CreateMemSet(
- CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
- Size, ArgAlign);
- } else {
- const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
- Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
- CopyAlign, Size);
- LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
- (void)Cpy;
- }
- *ShadowPtr = getCleanShadow(V);
- } else {
- // Shadow over TLS
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- if (Overflow) {
- // ParamTLS overflow.
- *ShadowPtr = getCleanShadow(V);
- } else {
- *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
- kShadowTLSAlignment);
- }
- }
- LLVM_DEBUG(dbgs()
- << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
- if (MS.TrackOrigins && !Overflow) {
- Value *OriginPtr =
- getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
- setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
- } else {
- setOrigin(A, getCleanOrigin());
- }
+ unsigned ArgOffset = 0;
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ for (auto &FArg : F->args()) {
+ if (!FArg.getType()->isSized()) {
+ LLVM_DEBUG(dbgs() << "Arg is not sized\n");
+ continue;
+ }
+
+ bool FArgByVal = FArg.hasByValAttr();
+ bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
+ bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
+ unsigned Size =
+ FArg.hasByValAttr()
+ ? DL.getTypeAllocSize(FArg.getParamByValType())
+ : DL.getTypeAllocSize(FArg.getType());
+
+ if (A == &FArg) {
+ bool Overflow = ArgOffset + Size > kParamTLSSize;
+ if (FArgEagerCheck) {
+ *ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
+ continue;
+ } else if (FArgByVal) {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ // ByVal pointer itself has clean shadow. We copy the actual
+ // argument shadow to the underlying memory.
+ // Figure out maximal valid memcpy alignment.
+ const Align ArgAlign = DL.getValueOrABITypeAlignment(
+ MaybeAlign(FArg.getParamAlignment()), FArg.getParamByValType());
+ Value *CpShadowPtr =
+ getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
+ /*isStore*/ true)
+ .first;
+ // TODO(glider): need to copy origins.
+ if (Overflow) {
+ // ParamTLS overflow.
+ EntryIRB.CreateMemSet(
+ CpShadowPtr, Constant::getNullValue(EntryIRB.getInt8Ty()),
+ Size, ArgAlign);
+ } else {
+ const Align CopyAlign = std::min(ArgAlign, kShadowTLSAlignment);
+ Value *Cpy = EntryIRB.CreateMemCpy(CpShadowPtr, CopyAlign, Base,
+ CopyAlign, Size);
+ LLVM_DEBUG(dbgs() << " ByValCpy: " << *Cpy << "\n");
+ (void)Cpy;
+ }
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ // Shadow over TLS
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ if (Overflow) {
+ // ParamTLS overflow.
+ *ShadowPtr = getCleanShadow(V);
+ } else {
+ *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
+ kShadowTLSAlignment);
+ }
+ }
+ LLVM_DEBUG(dbgs()
+ << " ARG: " << FArg << " ==> " << **ShadowPtr << "\n");
+ if (MS.TrackOrigins && !Overflow) {
+ Value *OriginPtr =
+ getOriginPtrForArgument(&FArg, EntryIRB, ArgOffset);
+ setOrigin(A, EntryIRB.CreateLoad(MS.OriginTy, OriginPtr));
+ } else {
+ setOrigin(A, getCleanOrigin());
+ }
break;
- }
-
- if (!FArgEagerCheck)
- ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- assert(*ShadowPtr && "Could not find shadow for an argument");
- return *ShadowPtr;
- }
- // For everything else the shadow is zero.
- return getCleanShadow(V);
- }
-
- /// Get the shadow for i-th argument of the instruction I.
- Value *getShadow(Instruction *I, int i) {
- return getShadow(I->getOperand(i));
- }
-
- /// Get the origin for a value.
- Value *getOrigin(Value *V) {
- if (!MS.TrackOrigins) return nullptr;
- if (!PropagateShadow) return getCleanOrigin();
- if (isa<Constant>(V)) return getCleanOrigin();
- assert((isa<Instruction>(V) || isa<Argument>(V)) &&
- "Unexpected value type in getOrigin()");
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (I->getMetadata("nosanitize"))
- return getCleanOrigin();
- }
- Value *Origin = OriginMap[V];
- assert(Origin && "Missing origin");
- return Origin;
- }
-
- /// Get the origin for i-th argument of the instruction I.
- Value *getOrigin(Instruction *I, int i) {
- return getOrigin(I->getOperand(i));
- }
-
- /// Remember the place where a shadow check should be inserted.
- ///
- /// This location will be later instrumented with a check that will print a
- /// UMR warning at runtime if the shadow value is not 0.
- void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
- assert(Shadow);
- if (!InsertChecks) return;
-#ifndef NDEBUG
- Type *ShadowTy = Shadow->getType();
+ }
+
+ if (!FArgEagerCheck)
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
+ }
+ assert(*ShadowPtr && "Could not find shadow for an argument");
+ return *ShadowPtr;
+ }
+ // For everything else the shadow is zero.
+ return getCleanShadow(V);
+ }
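+
+ // Illustration of the ParamTLS layout walked above, assuming the pass-wide
+ // kShadowTLSAlignment of 8 bytes: for a function f(i32 %a, i64 %b), the
+ // shadow of %a is loaded from ParamTLS+0 and the shadow of %b from
+ // ParamTLS+8, since every argument slot is rounded up to that alignment.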
+
+ /// Get the shadow for i-th argument of the instruction I.
+ Value *getShadow(Instruction *I, int i) {
+ return getShadow(I->getOperand(i));
+ }
+
+ /// Get the origin for a value.
+ Value *getOrigin(Value *V) {
+ if (!MS.TrackOrigins) return nullptr;
+ if (!PropagateShadow) return getCleanOrigin();
+ if (isa<Constant>(V)) return getCleanOrigin();
+ assert((isa<Instruction>(V) || isa<Argument>(V)) &&
+ "Unexpected value type in getOrigin()");
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getMetadata("nosanitize"))
+ return getCleanOrigin();
+ }
+ Value *Origin = OriginMap[V];
+ assert(Origin && "Missing origin");
+ return Origin;
+ }
+
+ /// Get the origin for i-th argument of the instruction I.
+ Value *getOrigin(Instruction *I, int i) {
+ return getOrigin(I->getOperand(i));
+ }
+
+ /// Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will be later instrumented with a check that will print a
+ /// UMR warning at runtime if the shadow value is not 0.
+ void insertShadowCheck(Value *Shadow, Value *Origin, Instruction *OrigIns) {
+ assert(Shadow);
+ if (!InsertChecks) return;
+#ifndef NDEBUG
+ Type *ShadowTy = Shadow->getType();
assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy) ||
isa<StructType>(ShadowTy) || isa<ArrayType>(ShadowTy)) &&
"Can only insert checks for integer, vector, and aggregate shadow "
"types");
-#endif
- InstrumentationList.push_back(
- ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
- }
-
- /// Remember the place where a shadow check should be inserted.
- ///
- /// This location will be later instrumented with a check that will print a
- /// UMR warning at runtime if the value is not fully defined.
- void insertShadowCheck(Value *Val, Instruction *OrigIns) {
- assert(Val);
- Value *Shadow, *Origin;
- if (ClCheckConstantShadow) {
- Shadow = getShadow(Val);
- if (!Shadow) return;
- Origin = getOrigin(Val);
- } else {
- Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
- if (!Shadow) return;
- Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
- }
- insertShadowCheck(Shadow, Origin, OrigIns);
- }
-
- AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
- switch (a) {
- case AtomicOrdering::NotAtomic:
- return AtomicOrdering::NotAtomic;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::Release:
- return AtomicOrdering::Release;
- case AtomicOrdering::Acquire:
- case AtomicOrdering::AcquireRelease:
- return AtomicOrdering::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return AtomicOrdering::SequentiallyConsistent;
- }
- llvm_unreachable("Unknown ordering");
- }
-
+#endif
+ InstrumentationList.push_back(
+ ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+ }
+
+ /// Remember the place where a shadow check should be inserted.
+ ///
+ /// This location will be later instrumented with a check that will print a
+ /// UMR warning at runtime if the value is not fully defined.
+ void insertShadowCheck(Value *Val, Instruction *OrigIns) {
+ assert(Val);
+ Value *Shadow, *Origin;
+ if (ClCheckConstantShadow) {
+ Shadow = getShadow(Val);
+ if (!Shadow) return;
+ Origin = getOrigin(Val);
+ } else {
+ Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+ if (!Shadow) return;
+ Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+ }
+ insertShadowCheck(Shadow, Origin, OrigIns);
+ }
+
+ AtomicOrdering addReleaseOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Release:
+ return AtomicOrdering::Release;
+ case AtomicOrdering::Acquire:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
+
Value *makeAddReleaseOrderingTable(IRBuilder<> &IRB) {
constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + 1;
uint32_t OrderingTable[NumOrderings] = {};
@@ -1849,23 +1849,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
makeArrayRef(OrderingTable, NumOrderings));
}
- AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
- switch (a) {
- case AtomicOrdering::NotAtomic:
- return AtomicOrdering::NotAtomic;
- case AtomicOrdering::Unordered:
- case AtomicOrdering::Monotonic:
- case AtomicOrdering::Acquire:
- return AtomicOrdering::Acquire;
- case AtomicOrdering::Release:
- case AtomicOrdering::AcquireRelease:
- return AtomicOrdering::AcquireRelease;
- case AtomicOrdering::SequentiallyConsistent:
- return AtomicOrdering::SequentiallyConsistent;
- }
- llvm_unreachable("Unknown ordering");
- }
-
+ AtomicOrdering addAcquireOrdering(AtomicOrdering a) {
+ switch (a) {
+ case AtomicOrdering::NotAtomic:
+ return AtomicOrdering::NotAtomic;
+ case AtomicOrdering::Unordered:
+ case AtomicOrdering::Monotonic:
+ case AtomicOrdering::Acquire:
+ return AtomicOrdering::Acquire;
+ case AtomicOrdering::Release:
+ case AtomicOrdering::AcquireRelease:
+ return AtomicOrdering::AcquireRelease;
+ case AtomicOrdering::SequentiallyConsistent:
+ return AtomicOrdering::SequentiallyConsistent;
+ }
+ llvm_unreachable("Unknown ordering");
+ }
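+
+ // For example, addAcquireOrdering(Monotonic) returns Acquire: atomic loads
+ // are strengthened so that the plain (non-atomic) shadow load emitted next
+ // to them is ordered after the shadow written before the matching release
+ // store. Orderings that already imply acquire are returned unchanged.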
+
Value *makeAddAcquireOrderingTable(IRBuilder<> &IRB) {
constexpr int NumOrderings = (int)AtomicOrderingCABI::seq_cst + 1;
uint32_t OrderingTable[NumOrderings] = {};
@@ -1884,1353 +1884,1353 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
makeArrayRef(OrderingTable, NumOrderings));
}
- // ------------------- Visitors.
- using InstVisitor<MemorySanitizerVisitor>::visit;
- void visit(Instruction &I) {
+ // ------------------- Visitors.
+ using InstVisitor<MemorySanitizerVisitor>::visit;
+ void visit(Instruction &I) {
if (I.getMetadata("nosanitize"))
return;
// Don't want to visit if we're in the prologue
if (isInPrologue(I))
return;
InstVisitor<MemorySanitizerVisitor>::visit(I);
- }
-
- /// Instrument LoadInst
- ///
- /// Loads the corresponding shadow and (optionally) origin.
- /// Optionally, checks that the load address is fully defined.
- void visitLoadInst(LoadInst &I) {
- assert(I.getType()->isSized() && "Load type must have size");
- assert(!I.getMetadata("nosanitize"));
- IRBuilder<> IRB(I.getNextNode());
- Type *ShadowTy = getShadowTy(&I);
- Value *Addr = I.getPointerOperand();
- Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
- const Align Alignment = assumeAligned(I.getAlignment());
- if (PropagateShadow) {
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I,
- IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress)
- insertShadowCheck(I.getPointerOperand(), &I);
-
- if (I.isAtomic())
- I.setOrdering(addAcquireOrdering(I.getOrdering()));
-
- if (MS.TrackOrigins) {
- if (PropagateShadow) {
- const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- setOrigin(
- &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment));
- } else {
- setOrigin(&I, getCleanOrigin());
- }
- }
- }
-
- /// Instrument StoreInst
- ///
- /// Stores the corresponding shadow and (optionally) origin.
- /// Optionally, checks that the store address is fully defined.
- void visitStoreInst(StoreInst &I) {
- StoreList.push_back(&I);
- if (ClCheckAccessAddress)
- insertShadowCheck(I.getPointerOperand(), &I);
- }
-
- void handleCASOrRMW(Instruction &I) {
- assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
-
- IRBuilder<> IRB(&I);
- Value *Addr = I.getOperand(0);
- Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align(1),
- /*isStore*/ true)
- .first;
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- // Only test the conditional argument of cmpxchg instruction.
- // The other argument can potentially be uninitialized, but we cannot
- // detect this situation reliably without possible false positives.
- if (isa<AtomicCmpXchgInst>(I))
- insertShadowCheck(I.getOperand(1), &I);
-
- IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
-
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitAtomicRMWInst(AtomicRMWInst &I) {
- handleCASOrRMW(I);
- I.setOrdering(addReleaseOrdering(I.getOrdering()));
- }
-
- void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
- handleCASOrRMW(I);
- I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
- }
-
- // Vector manipulation.
- void visitExtractElementInst(ExtractElementInst &I) {
- insertShadowCheck(I.getOperand(1), &I);
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
- "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitInsertElementInst(InsertElementInst &I) {
- insertShadowCheck(I.getOperand(2), &I);
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
- I.getOperand(2), "_msprop"));
- setOriginForNaryOp(I);
- }
-
- void visitShuffleVectorInst(ShuffleVectorInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
- I.getShuffleMask(), "_msprop"));
- setOriginForNaryOp(I);
- }
-
- // Casts.
- void visitSExtInst(SExtInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitZExtInst(ZExtInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitTruncInst(TruncInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitBitCastInst(BitCastInst &I) {
- // Special case: if this is the bitcast (there is exactly 1 allowed) between
- // a musttail call and a ret, don't instrument. New instructions are not
- // allowed after a musttail call.
- if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
- if (CI->isMustTailCall())
- return;
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitPtrToIntInst(PtrToIntInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
- "_msprop_ptrtoint"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitIntToPtrInst(IntToPtrInst &I) {
- IRBuilder<> IRB(&I);
- setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
- "_msprop_inttoptr"));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
- void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
- void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
- void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
- void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
- void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
-
- /// Propagate shadow for bitwise AND.
- ///
- /// This code is exact, i.e. if, for example, a bit in the left argument
- /// is defined and 0, then neither the value nor the definedness of the
- /// corresponding bit in B affects the resulting shadow.
- void visitAnd(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // "And" of 0 and a poisoned value results in unpoisoned value.
- // 1&1 => 1; 0&1 => 0; p&1 => p;
- // 1&0 => 0; 0&0 => 0; p&0 => 0;
- // 1&p => p; 0&p => 0; p&p => p;
- // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
- Value *S1S2 = IRB.CreateAnd(S1, S2);
- Value *V1S2 = IRB.CreateAnd(V1, S2);
- Value *S1V2 = IRB.CreateAnd(S1, V2);
- setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
- setOriginForNaryOp(I);
- }
-
- void visitOr(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // "Or" of 1 and a poisoned value results in unpoisoned value.
- // 1|1 => 1; 0|1 => 1; p|1 => 1;
- // 1|0 => 1; 0|0 => 0; p|0 => p;
- // 1|p => 1; 0|p => p; p|p => p;
- // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *V1 = IRB.CreateNot(I.getOperand(0));
- Value *V2 = IRB.CreateNot(I.getOperand(1));
- if (V1->getType() != S1->getType()) {
- V1 = IRB.CreateIntCast(V1, S1->getType(), false);
- V2 = IRB.CreateIntCast(V2, S2->getType(), false);
- }
- Value *S1S2 = IRB.CreateAnd(S1, S2);
- Value *V1S2 = IRB.CreateAnd(V1, S2);
- Value *S1V2 = IRB.CreateAnd(S1, V2);
- setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
- setOriginForNaryOp(I);
- }
-
- /// Default propagation of shadow and/or origin.
- ///
- /// This class implements the general case of shadow propagation, used in all
- /// cases where we don't know and/or don't care about what the operation
- /// actually does. It converts all input shadow values to a common type
- /// (extending or truncating as necessary), and bitwise OR's them.
- ///
- /// This is much cheaper than inserting checks (i.e. requiring inputs to be
- /// fully initialized), and less prone to false positives.
- ///
- /// This class also implements the general case of origin propagation. For a
- /// Nary operation, result origin is set to the origin of an argument that is
- /// not entirely initialized. If there is more than one such argument, the
- /// rightmost of them is picked. It does not matter which one is picked if all
- /// arguments are initialized.
- template <bool CombineShadow>
- class Combiner {
- Value *Shadow = nullptr;
- Value *Origin = nullptr;
- IRBuilder<> &IRB;
- MemorySanitizerVisitor *MSV;
-
- public:
- Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
- : IRB(IRB), MSV(MSV) {}
-
- /// Add a pair of shadow and origin values to the mix.
- Combiner &Add(Value *OpShadow, Value *OpOrigin) {
- if (CombineShadow) {
- assert(OpShadow);
- if (!Shadow)
- Shadow = OpShadow;
- else {
- OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
- Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
- }
- }
-
- if (MSV->MS.TrackOrigins) {
- assert(OpOrigin);
- if (!Origin) {
- Origin = OpOrigin;
- } else {
- Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
- // No point in adding something that might result in 0 origin value.
- if (!ConstOrigin || !ConstOrigin->isNullValue()) {
+ }
+
+ /// Instrument LoadInst
+ ///
+ /// Loads the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the load address is fully defined.
+ void visitLoadInst(LoadInst &I) {
+ assert(I.getType()->isSized() && "Load type must have size");
+ assert(!I.getMetadata("nosanitize"));
+ IRBuilder<> IRB(I.getNextNode());
+ Type *ShadowTy = getShadowTy(&I);
+ Value *Addr = I.getPointerOperand();
+ Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
+ const Align Alignment = assumeAligned(I.getAlignment());
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
+
+ if (I.isAtomic())
+ I.setOrdering(addAcquireOrdering(I.getOrdering()));
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
+ setOrigin(
+ &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment));
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ }
+
+ /// Instrument StoreInst
+ ///
+ /// Stores the corresponding shadow and (optionally) origin.
+ /// Optionally, checks that the store address is fully defined.
+ void visitStoreInst(StoreInst &I) {
+ StoreList.push_back(&I);
+ if (ClCheckAccessAddress)
+ insertShadowCheck(I.getPointerOperand(), &I);
+ }
+
+ void handleCASOrRMW(Instruction &I) {
+ assert(isa<AtomicRMWInst>(I) || isa<AtomicCmpXchgInst>(I));
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getOperand(0);
+ Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align(1),
+ /*isStore*/ true)
+ .first;
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // Only test the conditional argument of cmpxchg instruction.
+ // The other argument can potentially be uninitialized, but we cannot
+ // detect this situation reliably without possible false positives.
+ if (isa<AtomicCmpXchgInst>(I))
+ insertShadowCheck(I.getOperand(1), &I);
+
+ IRB.CreateStore(getCleanShadow(&I), ShadowPtr);
+
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitAtomicRMWInst(AtomicRMWInst &I) {
+ handleCASOrRMW(I);
+ I.setOrdering(addReleaseOrdering(I.getOrdering()));
+ }
+
+ void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
+ handleCASOrRMW(I);
+ I.setSuccessOrdering(addReleaseOrdering(I.getSuccessOrdering()));
+ }
+
+ // Vector manipulation.
+ void visitExtractElementInst(ExtractElementInst &I) {
+ insertShadowCheck(I.getOperand(1), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+ "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitInsertElementInst(InsertElementInst &I) {
+ insertShadowCheck(I.getOperand(2), &I);
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+ I.getOperand(2), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShuffleVectorInst(ShuffleVectorInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+ I.getShuffleMask(), "_msprop"));
+ setOriginForNaryOp(I);
+ }
+
+ // Casts.
+ void visitSExtInst(SExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitZExtInst(ZExtInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitTruncInst(TruncInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitBitCastInst(BitCastInst &I) {
+ // Special case: if this is the bitcast (there is exactly 1 allowed) between
+ // a musttail call and a ret, don't instrument. New instructions are not
+ // allowed after a musttail call.
+ if (auto *CI = dyn_cast<CallInst>(I.getOperand(0)))
+ if (CI->isMustTailCall())
+ return;
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitPtrToIntInst(PtrToIntInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_ptrtoint"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitIntToPtrInst(IntToPtrInst &I) {
+ IRBuilder<> IRB(&I);
+ setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+ "_msprop_inttoptr"));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+ void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+ void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+ /// Propagate shadow for bitwise AND.
+ ///
+ /// This code is exact, i.e. if, for example, a bit in the left argument
+ /// is defined and 0, then neither the value nor the definedness of the
+ /// corresponding bit in B affects the resulting shadow.
+ void visitAnd(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "And" of 0 and a poisoned value results in unpoisoned value.
+ // 1&1 => 1; 0&1 => 0; p&1 => p;
+ // 1&0 => 0; 0&0 => 0; p&0 => 0;
+ // 1&p => p; 0&p => 0; p&p => p;
+ // S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+ setOriginForNaryOp(I);
+ }
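+
+ // Worked single-bit example of S = (S1 & S2) | (V1 & S2) | (S1 & V2):
+ // a defined 0 (V1=0, S1=0) against a poisoned bit (S2=1) gives S = 0|0|0 = 0,
+ // matching 0&p => 0 above; a defined 1 (V1=1, S1=0) against a poisoned bit
+ // gives S = 0|1|0 = 1, matching 1&p => p.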
+
+ void visitOr(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // "Or" of 1 and a poisoned value results in unpoisoned value.
+ // 1|1 => 1; 0|1 => 1; p|1 => 1;
+ // 1|0 => 1; 0|0 => 0; p|0 => p;
+ // 1|p => 1; 0|p => p; p|p => p;
+ // S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *V1 = IRB.CreateNot(I.getOperand(0));
+ Value *V2 = IRB.CreateNot(I.getOperand(1));
+ if (V1->getType() != S1->getType()) {
+ V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+ V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+ }
+ Value *S1S2 = IRB.CreateAnd(S1, S2);
+ Value *V1S2 = IRB.CreateAnd(V1, S2);
+ Value *S1V2 = IRB.CreateAnd(S1, V2);
+ setShadow(&I, IRB.CreateOr({S1S2, V1S2, S1V2}));
+ setOriginForNaryOp(I);
+ }
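+
+ // The OR case is symmetric with the operand values inverted: a defined 1
+ // (so ~V1 = 0) against a poisoned bit yields S = 0|0|0 = 0, matching
+ // 1|p => 1, while a defined 0 keeps the poison, matching 0|p => p.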
+
+ /// Default propagation of shadow and/or origin.
+ ///
+ /// This class implements the general case of shadow propagation, used in all
+ /// cases where we don't know and/or don't care about what the operation
+ /// actually does. It converts all input shadow values to a common type
+ /// (extending or truncating as necessary), and bitwise OR's them.
+ ///
+ /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+ /// fully initialized), and less prone to false positives.
+ ///
+ /// This class also implements the general case of origin propagation. For a
+ /// Nary operation, result origin is set to the origin of an argument that is
+ /// not entirely initialized. If there is more than one such argument, the
+ /// rightmost of them is picked. It does not matter which one is picked if all
+ /// arguments are initialized.
+ template <bool CombineShadow>
+ class Combiner {
+ Value *Shadow = nullptr;
+ Value *Origin = nullptr;
+ IRBuilder<> &IRB;
+ MemorySanitizerVisitor *MSV;
+
+ public:
+ Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB)
+ : IRB(IRB), MSV(MSV) {}
+
+ /// Add a pair of shadow and origin values to the mix.
+ Combiner &Add(Value *OpShadow, Value *OpOrigin) {
+ if (CombineShadow) {
+ assert(OpShadow);
+ if (!Shadow)
+ Shadow = OpShadow;
+ else {
+ OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType());
+ Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop");
+ }
+ }
+
+ if (MSV->MS.TrackOrigins) {
+ assert(OpOrigin);
+ if (!Origin) {
+ Origin = OpOrigin;
+ } else {
+ Constant *ConstOrigin = dyn_cast<Constant>(OpOrigin);
+ // No point in adding something that might result in 0 origin value.
+ if (!ConstOrigin || !ConstOrigin->isNullValue()) {
Value *FlatShadow = MSV->convertShadowToScalar(OpShadow, IRB);
- Value *Cond =
- IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
- Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
- }
- }
- }
- return *this;
- }
-
- /// Add an application value to the mix.
- Combiner &Add(Value *V) {
- Value *OpShadow = MSV->getShadow(V);
- Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
- return Add(OpShadow, OpOrigin);
- }
-
- /// Set the current combined values as the given instruction's shadow
- /// and origin.
- void Done(Instruction *I) {
- if (CombineShadow) {
- assert(Shadow);
- Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
- MSV->setShadow(I, Shadow);
- }
- if (MSV->MS.TrackOrigins) {
- assert(Origin);
- MSV->setOrigin(I, Origin);
- }
- }
- };
-
- using ShadowAndOriginCombiner = Combiner<true>;
- using OriginCombiner = Combiner<false>;
-
- /// Propagate origin for arbitrary operation.
- void setOriginForNaryOp(Instruction &I) {
- if (!MS.TrackOrigins) return;
- IRBuilder<> IRB(&I);
- OriginCombiner OC(this, IRB);
- for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
- OC.Add(OI->get());
- OC.Done(&I);
- }
-
- size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
- assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
- "Vector of pointers is not a valid shadow type");
- return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getNumElements() *
- Ty->getScalarSizeInBits()
- : Ty->getPrimitiveSizeInBits();
- }
-
- /// Cast between two shadow types, extending or truncating as
- /// necessary.
- Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
- bool Signed = false) {
- Type *srcTy = V->getType();
- size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
- size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
- if (srcSizeInBits > 1 && dstSizeInBits == 1)
- return IRB.CreateICmpNE(V, getCleanShadow(V));
-
- if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
- return IRB.CreateIntCast(V, dstTy, Signed);
- if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
- cast<FixedVectorType>(dstTy)->getNumElements() ==
- cast<FixedVectorType>(srcTy)->getNumElements())
- return IRB.CreateIntCast(V, dstTy, Signed);
- Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
- Value *V2 =
- IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
- return IRB.CreateBitCast(V2, dstTy);
- // TODO: handle struct types.
- }
-
- /// Cast an application value to the type of its own shadow.
- Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
- Type *ShadowTy = getShadowTy(V);
- if (V->getType() == ShadowTy)
- return V;
- if (V->getType()->isPtrOrPtrVectorTy())
- return IRB.CreatePtrToInt(V, ShadowTy);
- else
- return IRB.CreateBitCast(V, ShadowTy);
- }
-
- /// Propagate shadow for arbitrary operation.
- void handleShadowOr(Instruction &I) {
- IRBuilder<> IRB(&I);
- ShadowAndOriginCombiner SC(this, IRB);
- for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
- SC.Add(OI->get());
- SC.Done(&I);
- }
-
- void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
-
- // Handle multiplication by constant.
- //
- // Handle a special case of multiplication by constant that may have one or
- // more zeros in the lower bits. This makes the corresponding number of lower bits
- // of the result zero as well. We model it by shifting the other operand
- // shadow left by the required number of bits. Effectively, we transform
- // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
- // We use multiplication by 2**N instead of shift to cover the case of
- // multiplication by 0, which may occur in some elements of a vector operand.
- void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
- Value *OtherArg) {
- Constant *ShadowMul;
- Type *Ty = ConstArg->getType();
- if (auto *VTy = dyn_cast<VectorType>(Ty)) {
- unsigned NumElements = cast<FixedVectorType>(VTy)->getNumElements();
- Type *EltTy = VTy->getElementType();
- SmallVector<Constant *, 16> Elements;
- for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
- if (ConstantInt *Elt =
- dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
- const APInt &V = Elt->getValue();
- APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
- Elements.push_back(ConstantInt::get(EltTy, V2));
- } else {
- Elements.push_back(ConstantInt::get(EltTy, 1));
- }
- }
- ShadowMul = ConstantVector::get(Elements);
- } else {
- if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
- const APInt &V = Elt->getValue();
- APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
- ShadowMul = ConstantInt::get(Ty, V2);
- } else {
- ShadowMul = ConstantInt::get(Ty, 1);
- }
- }
-
- IRBuilder<> IRB(&I);
- setShadow(&I,
- IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
- setOrigin(&I, getOrigin(OtherArg));
- }
-
- void visitMul(BinaryOperator &I) {
- Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
- Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
- if (constOp0 && !constOp1)
- handleMulByConstant(I, constOp0, I.getOperand(1));
- else if (constOp1 && !constOp0)
- handleMulByConstant(I, constOp1, I.getOperand(0));
- else
- handleShadowOr(I);
- }
-
- void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
- void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
- void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
- void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
- void visitSub(BinaryOperator &I) { handleShadowOr(I); }
- void visitXor(BinaryOperator &I) { handleShadowOr(I); }
-
- void handleIntegerDiv(Instruction &I) {
- IRBuilder<> IRB(&I);
- // Strict on the second argument.
- insertShadowCheck(I.getOperand(1), &I);
- setShadow(&I, getShadow(&I, 0));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
- void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
-
- // Floating point division is side-effect free. We cannot require that the
- // divisor is fully initialized and must propagate shadow. See PR37523.
- void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
- void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
-
- /// Instrument == and != comparisons.
- ///
- /// Sometimes the comparison result is known even if some of the bits of the
- /// arguments are not.
- void handleEqualityComparison(ICmpInst &I) {
- IRBuilder<> IRB(&I);
- Value *A = I.getOperand(0);
- Value *B = I.getOperand(1);
- Value *Sa = getShadow(A);
- Value *Sb = getShadow(B);
-
- // Get rid of pointers and vectors of pointers.
- // For ints (and vectors of ints), types of A and Sa match,
- // and this is a no-op.
- A = IRB.CreatePointerCast(A, Sa->getType());
- B = IRB.CreatePointerCast(B, Sb->getType());
-
- // A == B <==> (C = A^B) == 0
- // A != B <==> (C = A^B) != 0
- // Sc = Sa | Sb
- Value *C = IRB.CreateXor(A, B);
- Value *Sc = IRB.CreateOr(Sa, Sb);
- // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
- // Result is defined if one of the following is true
- // * there is a defined 1 bit in C
- // * C is fully defined
- // Si = !(C & ~Sc) && Sc
- Value *Zero = Constant::getNullValue(Sc->getType());
- Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
- Value *Si =
- IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
- IRB.CreateICmpEQ(
- IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
- Si->setName("_msprop_icmp");
- setShadow(&I, Si);
- setOriginForNaryOp(I);
- }
-
- /// Build the lowest possible value of V, taking into account V's
- /// uninitialized bits.
- Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
- bool isSigned) {
- if (isSigned) {
- // Split shadow into sign bit and other bits.
- Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
- Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
- // Maximise the undefined shadow bit, minimize other undefined bits.
- return
- IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
- } else {
- // Minimize undefined bits.
- return IRB.CreateAnd(A, IRB.CreateNot(Sa));
- }
- }
-
- /// Build the highest possible value of V, taking into account V's
- /// uninitialized bits.
- Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
- bool isSigned) {
- if (isSigned) {
- // Split shadow into sign bit and other bits.
- Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
- Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
- // Minimise the undefined shadow bit, maximise other undefined bits.
- return
- IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
- } else {
- // Maximize undefined bits.
- return IRB.CreateOr(A, Sa);
- }
- }
-
- /// Instrument relational comparisons.
- ///
- /// This function does exact shadow propagation for all relational
- /// comparisons of integers, pointers and vectors of those.
- /// FIXME: output seems suboptimal when one of the operands is a constant
- void handleRelationalComparisonExact(ICmpInst &I) {
- IRBuilder<> IRB(&I);
- Value *A = I.getOperand(0);
- Value *B = I.getOperand(1);
- Value *Sa = getShadow(A);
- Value *Sb = getShadow(B);
-
- // Get rid of pointers and vectors of pointers.
- // For ints (and vectors of ints), types of A and Sa match,
- // and this is a no-op.
- A = IRB.CreatePointerCast(A, Sa->getType());
- B = IRB.CreatePointerCast(B, Sb->getType());
-
- // Let [a0, a1] be the interval of possible values of A, taking into account
- // its undefined bits. Let [b0, b1] be the interval of possible values of B.
- // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
- bool IsSigned = I.isSigned();
- Value *S1 = IRB.CreateICmp(I.getPredicate(),
- getLowestPossibleValue(IRB, A, Sa, IsSigned),
- getHighestPossibleValue(IRB, B, Sb, IsSigned));
- Value *S2 = IRB.CreateICmp(I.getPredicate(),
- getHighestPossibleValue(IRB, A, Sa, IsSigned),
- getLowestPossibleValue(IRB, B, Sb, IsSigned));
- Value *Si = IRB.CreateXor(S1, S2);
- setShadow(&I, Si);
- setOriginForNaryOp(I);
- }
-
- /// Instrument signed relational comparisons.
- ///
- /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
- /// bit of the shadow. Everything else is delegated to handleShadowOr().
- void handleSignedRelationalComparison(ICmpInst &I) {
- Constant *constOp;
- Value *op = nullptr;
- CmpInst::Predicate pre;
- if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
- op = I.getOperand(0);
- pre = I.getPredicate();
- } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
- op = I.getOperand(1);
- pre = I.getSwappedPredicate();
- } else {
- handleShadowOr(I);
- return;
- }
-
- if ((constOp->isNullValue() &&
- (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
- (constOp->isAllOnesValue() &&
- (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
- IRBuilder<> IRB(&I);
- Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
- "_msprop_icmp_s");
- setShadow(&I, Shadow);
- setOrigin(&I, getOrigin(op));
- } else {
- handleShadowOr(I);
- }
- }
-
- void visitICmpInst(ICmpInst &I) {
- if (!ClHandleICmp) {
- handleShadowOr(I);
- return;
- }
- if (I.isEquality()) {
- handleEqualityComparison(I);
- return;
- }
-
- assert(I.isRelational());
- if (ClHandleICmpExact) {
- handleRelationalComparisonExact(I);
- return;
- }
- if (I.isSigned()) {
- handleSignedRelationalComparison(I);
- return;
- }
-
- assert(I.isUnsigned());
- if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
- handleRelationalComparisonExact(I);
- return;
- }
-
- handleShadowOr(I);
- }
-
- void visitFCmpInst(FCmpInst &I) {
- handleShadowOr(I);
- }
-
- void handleShift(BinaryOperator &I) {
- IRBuilder<> IRB(&I);
- // If any of the S2 bits are poisoned, the whole thing is poisoned.
- // Otherwise perform the same shift on S1.
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
- S2->getType());
- Value *V2 = I.getOperand(1);
- Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
- setShadow(&I, IRB.CreateOr(Shift, S2Conv));
- setOriginForNaryOp(I);
- }
-
- void visitShl(BinaryOperator &I) { handleShift(I); }
- void visitAShr(BinaryOperator &I) { handleShift(I); }
- void visitLShr(BinaryOperator &I) { handleShift(I); }
-
- /// Instrument llvm.memmove
- ///
- /// At this point we don't know if llvm.memmove will be inlined or not.
- /// If we don't instrument it and it gets inlined,
- /// our interceptor will not kick in and we will lose the memmove.
- /// If we instrument the call here, but it does not get inlined,
- /// we will memmove the shadow twice, which is bad in case
- /// of overlapping regions. So, we simply lower the intrinsic to a call.
- ///
- /// Similar situation exists for memcpy and memset.
- void visitMemMoveInst(MemMoveInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemmoveFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- // Similar to memmove: avoid copying shadow twice.
- // This is somewhat unfortunate as it may slow down small constant memcpys.
- // FIXME: consider doing manual inline for small constant sizes and proper
- // alignment.
- void visitMemCpyInst(MemCpyInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemcpyFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- // Same as memcpy.
- void visitMemSetInst(MemSetInst &I) {
- IRBuilder<> IRB(&I);
- IRB.CreateCall(
- MS.MemsetFn,
- {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
- I.eraseFromParent();
- }
-
- void visitVAStartInst(VAStartInst &I) {
- VAHelper->visitVAStartInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) {
- VAHelper->visitVACopyInst(I);
- }
-
- /// Handle vector store-like intrinsics.
- ///
- /// Instrument intrinsics that look like a simple SIMD store: writes memory,
- /// has 1 pointer argument and 1 vector argument, returns void.
- bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value* Addr = I.getArgOperand(0);
- Value *Shadow = getShadow(&I, 1);
- Value *ShadowPtr, *OriginPtr;
-
- // We don't know the pointer alignment (could be unaligned SSE store!).
- // Have to assume the worst case.
- std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
- Addr, IRB, Shadow->getType(), Align(1), /*isStore*/ true);
- IRB.CreateAlignedStore(Shadow, ShadowPtr, Align(1));
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- // FIXME: factor out common code from materializeStores
- if (MS.TrackOrigins) IRB.CreateStore(getOrigin(&I, 1), OriginPtr);
- return true;
- }
-
- /// Handle vector load-like intrinsics.
- ///
- /// Instrument intrinsics that look like a simple SIMD load: reads memory,
- /// has 1 pointer argument, returns a vector.
- bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
-
- Type *ShadowTy = getShadowTy(&I);
- Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
- if (PropagateShadow) {
- // We don't know the pointer alignment (could be unaligned SSE load!).
- // Have to assume the worst case.
- const Align Alignment = Align(1);
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I,
- IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- if (MS.TrackOrigins) {
- if (PropagateShadow)
- setOrigin(&I, IRB.CreateLoad(MS.OriginTy, OriginPtr));
- else
- setOrigin(&I, getCleanOrigin());
- }
- return true;
- }
-
- /// Handle (SIMD arithmetic)-like intrinsics.
- ///
- /// Instrument intrinsics with any number of arguments of the same type,
- /// equal to the return type. The type should be simple (no aggregates or
- /// pointers; vectors are fine).
- /// Caller guarantees that this intrinsic does not access memory.
- bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
- Type *RetTy = I.getType();
- if (!(RetTy->isIntOrIntVectorTy() ||
- RetTy->isFPOrFPVectorTy() ||
- RetTy->isX86_MMXTy()))
- return false;
-
- unsigned NumArgOperands = I.getNumArgOperands();
- for (unsigned i = 0; i < NumArgOperands; ++i) {
- Type *Ty = I.getArgOperand(i)->getType();
- if (Ty != RetTy)
- return false;
- }
-
- IRBuilder<> IRB(&I);
- ShadowAndOriginCombiner SC(this, IRB);
- for (unsigned i = 0; i < NumArgOperands; ++i)
- SC.Add(I.getArgOperand(i));
- SC.Done(&I);
-
- return true;
- }
-
- /// Heuristically instrument unknown intrinsics.
- ///
- /// The main purpose of this code is to do something reasonable with all
- /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
- /// We recognize several classes of intrinsics by their argument types and
- /// ModRefBehaviour and apply special instrumentation when we are reasonably
- /// sure that we know what the intrinsic does.
- ///
- /// We special-case intrinsics where this approach fails. See llvm.bswap
- /// handling as an example of that.
- bool handleUnknownIntrinsic(IntrinsicInst &I) {
- unsigned NumArgOperands = I.getNumArgOperands();
- if (NumArgOperands == 0)
- return false;
-
- if (NumArgOperands == 2 &&
- I.getArgOperand(0)->getType()->isPointerTy() &&
- I.getArgOperand(1)->getType()->isVectorTy() &&
- I.getType()->isVoidTy() &&
- !I.onlyReadsMemory()) {
- // This looks like a vector store.
- return handleVectorStoreIntrinsic(I);
- }
-
- if (NumArgOperands == 1 &&
- I.getArgOperand(0)->getType()->isPointerTy() &&
- I.getType()->isVectorTy() &&
- I.onlyReadsMemory()) {
- // This looks like a vector load.
- return handleVectorLoadIntrinsic(I);
- }
-
- if (I.doesNotAccessMemory())
- if (maybeHandleSimpleNomemIntrinsic(I))
- return true;
-
- // FIXME: detect and handle SSE maskstore/maskload
- return false;
- }
-
- void handleInvariantGroup(IntrinsicInst &I) {
- setShadow(&I, getShadow(&I, 0));
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void handleLifetimeStart(IntrinsicInst &I) {
- if (!PoisonStack)
- return;
+ Value *Cond =
+ IRB.CreateICmpNE(FlatShadow, MSV->getCleanShadow(FlatShadow));
+ Origin = IRB.CreateSelect(Cond, OpOrigin, Origin);
+ }
+ }
+ }
+ return *this;
+ }
+
+ /// Add an application value to the mix.
+ Combiner &Add(Value *V) {
+ Value *OpShadow = MSV->getShadow(V);
+ Value *OpOrigin = MSV->MS.TrackOrigins ? MSV->getOrigin(V) : nullptr;
+ return Add(OpShadow, OpOrigin);
+ }
+
+ /// Set the current combined values as the given instruction's shadow
+ /// and origin.
+ void Done(Instruction *I) {
+ if (CombineShadow) {
+ assert(Shadow);
+ Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I));
+ MSV->setShadow(I, Shadow);
+ }
+ if (MSV->MS.TrackOrigins) {
+ assert(Origin);
+ MSV->setOrigin(I, Origin);
+ }
+ }
+ };
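+
+ // Typical use (cf. handleShadowOr() below): construct a combiner, Add() each
+ // operand, then Done() to install the OR-ed shadow and, when origins are
+ // tracked, the origin of the rightmost operand whose shadow is non-zero:
+ // ShadowAndOriginCombiner SC(this, IRB);
+ // SC.Add(I.getOperand(0)).Add(I.getOperand(1));
+ // SC.Done(&I);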
+
+ using ShadowAndOriginCombiner = Combiner<true>;
+ using OriginCombiner = Combiner<false>;
+
+ /// Propagate origin for arbitrary operation.
+ void setOriginForNaryOp(Instruction &I) {
+ if (!MS.TrackOrigins) return;
+ IRBuilder<> IRB(&I);
+ OriginCombiner OC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ OC.Add(OI->get());
+ OC.Done(&I);
+ }
+
+ size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
+ assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
+ "Vector of pointers is not a valid shadow type");
+ return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getNumElements() *
+ Ty->getScalarSizeInBits()
+ : Ty->getPrimitiveSizeInBits();
+ }
+
+ /// Cast between two shadow types, extending or truncating as
+ /// necessary.
+ Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy,
+ bool Signed = false) {
+ Type *srcTy = V->getType();
+ size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy);
+ size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy);
+ if (srcSizeInBits > 1 && dstSizeInBits == 1)
+ return IRB.CreateICmpNE(V, getCleanShadow(V));
+
+ if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
+ cast<FixedVectorType>(dstTy)->getNumElements() ==
+ cast<FixedVectorType>(srcTy)->getNumElements())
+ return IRB.CreateIntCast(V, dstTy, Signed);
+ Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
+ Value *V2 =
+ IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed);
+ return IRB.CreateBitCast(V2, dstTy);
+ // TODO: handle struct types.
+ }
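+
+ // For example, casting a <4 x i32> shadow to an i64 shadow takes the last
+ // path above: bitcast to i128, integer cast (truncate) to i64, then a no-op
+ // bitcast to the destination type. Integer-to-integer casts and casts
+ // between vectors of the same length use a single IntCast instead.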
+
+ /// Cast an application value to the type of its own shadow.
+ Value *CreateAppToShadowCast(IRBuilder<> &IRB, Value *V) {
+ Type *ShadowTy = getShadowTy(V);
+ if (V->getType() == ShadowTy)
+ return V;
+ if (V->getType()->isPtrOrPtrVectorTy())
+ return IRB.CreatePtrToInt(V, ShadowTy);
+ else
+ return IRB.CreateBitCast(V, ShadowTy);
+ }
+
+ /// Propagate shadow for arbitrary operation.
+ void handleShadowOr(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI)
+ SC.Add(OI->get());
+ SC.Done(&I);
+ }
+
+ void visitFNeg(UnaryOperator &I) { handleShadowOr(I); }
+
+ // Handle multiplication by constant.
+ //
+ // Handle a special case of multiplication by constant that may have one or
+ // more zeros in the lower bits. This makes the corresponding number of lower bits
+ // of the result zero as well. We model it by shifting the other operand
+ // shadow left by the required number of bits. Effectively, we transform
+ // (X * (A * 2**B)) to ((X << B) * A) and instrument (X << B) as (Sx << B).
+ // We use multiplication by 2**N instead of shift to cover the case of
+ // multiplication by 0, which may occur in some elements of a vector operand.
+ void handleMulByConstant(BinaryOperator &I, Constant *ConstArg,
+ Value *OtherArg) {
+ Constant *ShadowMul;
+ Type *Ty = ConstArg->getType();
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ unsigned NumElements = cast<FixedVectorType>(VTy)->getNumElements();
+ Type *EltTy = VTy->getElementType();
+ SmallVector<Constant *, 16> Elements;
+ for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
+ if (ConstantInt *Elt =
+ dyn_cast<ConstantInt>(ConstArg->getAggregateElement(Idx))) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ Elements.push_back(ConstantInt::get(EltTy, V2));
+ } else {
+ Elements.push_back(ConstantInt::get(EltTy, 1));
+ }
+ }
+ ShadowMul = ConstantVector::get(Elements);
+ } else {
+ if (ConstantInt *Elt = dyn_cast<ConstantInt>(ConstArg)) {
+ const APInt &V = Elt->getValue();
+ APInt V2 = APInt(V.getBitWidth(), 1) << V.countTrailingZeros();
+ ShadowMul = ConstantInt::get(Ty, V2);
+ } else {
+ ShadowMul = ConstantInt::get(Ty, 1);
+ }
+ }
+
+ IRBuilder<> IRB(&I);
+ setShadow(&I,
+ IRB.CreateMul(getShadow(OtherArg), ShadowMul, "msprop_mul_cst"));
+ setOrigin(&I, getOrigin(OtherArg));
+ }
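+
+ // Worked example: for X * 12, the constant 12 = 3 * 2**2 has two trailing
+ // zero bits, so ShadowMul is 4 and the result shadow is Sx * 4, i.e. Sx << 2.
+ // The two low bits of X * 12 are always zero, hence defined, and the
+ // remaining shadow bits are shifted into their new positions.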
+
+ void visitMul(BinaryOperator &I) {
+ Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+ Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+ if (constOp0 && !constOp1)
+ handleMulByConstant(I, constOp0, I.getOperand(1));
+ else if (constOp1 && !constOp0)
+ handleMulByConstant(I, constOp1, I.getOperand(0));
+ else
+ handleShadowOr(I);
+ }
+
+ void visitFAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFMul(BinaryOperator &I) { handleShadowOr(I); }
+ void visitAdd(BinaryOperator &I) { handleShadowOr(I); }
+ void visitSub(BinaryOperator &I) { handleShadowOr(I); }
+ void visitXor(BinaryOperator &I) { handleShadowOr(I); }
+
+ void handleIntegerDiv(Instruction &I) {
+ IRBuilder<> IRB(&I);
+ // Strict on the second argument.
+ insertShadowCheck(I.getOperand(1), &I);
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void visitUDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSDiv(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitURem(BinaryOperator &I) { handleIntegerDiv(I); }
+ void visitSRem(BinaryOperator &I) { handleIntegerDiv(I); }
+
+ // Floating point division is side-effect free. We cannot require that the
+ // divisor is fully initialized and must propagate shadow. See PR37523.
+ void visitFDiv(BinaryOperator &I) { handleShadowOr(I); }
+ void visitFRem(BinaryOperator &I) { handleShadowOr(I); }
+
+ /// Instrument == and != comparisons.
+ ///
+ /// Sometimes the comparison result is known even if some of the bits of the
+ /// arguments are not.
+ void handleEqualityComparison(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // A == B <==> (C = A^B) == 0
+ // A != B <==> (C = A^B) != 0
+ // Sc = Sa | Sb
+ Value *C = IRB.CreateXor(A, B);
+ Value *Sc = IRB.CreateOr(Sa, Sb);
+ // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+ // Result is defined if one of the following is true
+ // * there is a defined 1 bit in C
+ // * C is fully defined
+ // Si = !(C & ~Sc) && Sc
+ Value *Zero = Constant::getNullValue(Sc->getType());
+ Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+ Value *Si =
+ IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+ IRB.CreateICmpEQ(
+ IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+ Si->setName("_msprop_icmp");
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// Build the lowest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getLowestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Maximise the undefined shadow bit, minimize other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaOtherBits)), SaSignBit);
+ } else {
+ // Minimize undefined bits.
+ return IRB.CreateAnd(A, IRB.CreateNot(Sa));
+ }
+ }
+
+ /// Build the highest possible value of V, taking into account V's
+ /// uninitialized bits.
+ Value *getHighestPossibleValue(IRBuilder<> &IRB, Value *A, Value *Sa,
+ bool isSigned) {
+ if (isSigned) {
+ // Split shadow into sign bit and other bits.
+ Value *SaOtherBits = IRB.CreateLShr(IRB.CreateShl(Sa, 1), 1);
+ Value *SaSignBit = IRB.CreateXor(Sa, SaOtherBits);
+ // Minimise the undefined shadow bit, maximise other undefined bits.
+ return
+ IRB.CreateOr(IRB.CreateAnd(A, IRB.CreateNot(SaSignBit)), SaOtherBits);
+ } else {
+ // Maximize undefined bits.
+ return IRB.CreateOr(A, Sa);
+ }
+ }
+
+ /// Instrument relational comparisons.
+ ///
+ /// This function does exact shadow propagation for all relational
+ /// comparisons of integers, pointers and vectors of those.
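+ ///
+ /// For example, for an unsigned A whose defined bits are 0x30 and whose shadow
+ /// is Sa = 0x0F, the possible values form the interval [0x30, 0x3F]:
+ /// getLowestPossibleValue clears the poisoned bits (A & ~Sa) and
+ /// getHighestPossibleValue sets them (A | Sa).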
+ /// FIXME: output seems suboptimal when one of the operands is a constant
+ void handleRelationalComparisonExact(ICmpInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *A = I.getOperand(0);
+ Value *B = I.getOperand(1);
+ Value *Sa = getShadow(A);
+ Value *Sb = getShadow(B);
+
+ // Get rid of pointers and vectors of pointers.
+ // For ints (and vectors of ints), types of A and Sa match,
+ // and this is a no-op.
+ A = IRB.CreatePointerCast(A, Sa->getType());
+ B = IRB.CreatePointerCast(B, Sb->getType());
+
+ // Let [a0, a1] be the interval of possible values of A, taking into account
+ // its undefined bits. Let [b0, b1] be the interval of possible values of B.
+ // Then (A cmp B) is defined iff (a0 cmp b1) == (a1 cmp b0).
+ bool IsSigned = I.isSigned();
+ Value *S1 = IRB.CreateICmp(I.getPredicate(),
+ getLowestPossibleValue(IRB, A, Sa, IsSigned),
+ getHighestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *S2 = IRB.CreateICmp(I.getPredicate(),
+ getHighestPossibleValue(IRB, A, Sa, IsSigned),
+ getLowestPossibleValue(IRB, B, Sb, IsSigned));
+ Value *Si = IRB.CreateXor(S1, S2);
+ setShadow(&I, Si);
+ setOriginForNaryOp(I);
+ }
+
+ /// Instrument signed relational comparisons.
+ ///
+ /// Handle sign bit tests: x<0, x>=0, x<=-1, x>-1 by propagating the highest
+ /// bit of the shadow. Everything else is delegated to handleShadowOr().
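+ ///
+ /// For x < 0 the result is exactly the sign bit of x, so its shadow is exactly
+ /// the sign bit of Sx, which is what the CreateICmpSLT(Sx, 0) below computes.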
+ void handleSignedRelationalComparison(ICmpInst &I) {
+ Constant *constOp;
+ Value *op = nullptr;
+ CmpInst::Predicate pre;
+ if ((constOp = dyn_cast<Constant>(I.getOperand(1)))) {
+ op = I.getOperand(0);
+ pre = I.getPredicate();
+ } else if ((constOp = dyn_cast<Constant>(I.getOperand(0)))) {
+ op = I.getOperand(1);
+ pre = I.getSwappedPredicate();
+ } else {
+ handleShadowOr(I);
+ return;
+ }
+
+ if ((constOp->isNullValue() &&
+ (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) ||
+ (constOp->isAllOnesValue() &&
+ (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE))) {
+ IRBuilder<> IRB(&I);
+ Value *Shadow = IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op),
+ "_msprop_icmp_s");
+ setShadow(&I, Shadow);
+ setOrigin(&I, getOrigin(op));
+ } else {
+ handleShadowOr(I);
+ }
+ }
+
+ void visitICmpInst(ICmpInst &I) {
+ if (!ClHandleICmp) {
+ handleShadowOr(I);
+ return;
+ }
+ if (I.isEquality()) {
+ handleEqualityComparison(I);
+ return;
+ }
+
+ assert(I.isRelational());
+ if (ClHandleICmpExact) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+ if (I.isSigned()) {
+ handleSignedRelationalComparison(I);
+ return;
+ }
+
+ assert(I.isUnsigned());
+ if ((isa<Constant>(I.getOperand(0)) || isa<Constant>(I.getOperand(1)))) {
+ handleRelationalComparisonExact(I);
+ return;
+ }
+
+ handleShadowOr(I);
+ }
+
+ void visitFCmpInst(FCmpInst &I) {
+ handleShadowOr(I);
+ }
+
+ void handleShift(BinaryOperator &I) {
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
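+ // E.g. for %r = shl i32 %x, %y the shadow is, roughly,
+ //   or (shl i32 %Sx, %y), (sext (icmp ne i32 %Sy, 0) to i32)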
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+ S2->getType());
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ void visitShl(BinaryOperator &I) { handleShift(I); }
+ void visitAShr(BinaryOperator &I) { handleShift(I); }
+ void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+ /// Instrument llvm.memmove
+ ///
+ /// At this point we don't know if llvm.memmove will be inlined or not.
+ /// If we don't instrument it and it gets inlined,
+ /// our interceptor will not kick in and we will lose the memmove.
+ /// If we instrument the call here, but it does not get inlined,
+ /// we will memmove the shadow twice, which is bad in the case
+ /// of overlapping regions. So, we simply lower the intrinsic to a call.
+ ///
+ /// Similar situation exists for memcpy and memset.
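+ ///
+ /// MS.MemmoveFn below is the MSan runtime's memmove wrapper, which is expected
+ /// to move the bytes together with their shadow (and origin) exactly once.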
+ void visitMemMoveInst(MemMoveInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemmoveFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Similar to memmove: avoid copying shadow twice.
+ // This is somewhat unfortunate as it may slow down small constant memcpys.
+ // FIXME: consider doing manual inline for small constant sizes and proper
+ // alignment.
+ void visitMemCpyInst(MemCpyInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemcpyFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ // Same as memcpy.
+ void visitMemSetInst(MemSetInst &I) {
+ IRBuilder<> IRB(&I);
+ IRB.CreateCall(
+ MS.MemsetFn,
+ {IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false)});
+ I.eraseFromParent();
+ }
+
+ void visitVAStartInst(VAStartInst &I) {
+ VAHelper->visitVAStartInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) {
+ VAHelper->visitVACopyInst(I);
+ }
+
+ /// Handle vector store-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD store: writes memory,
+ /// has 1 pointer argument and 1 vector argument, returns void.
+ bool handleVectorStoreIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Value *Shadow = getShadow(&I, 1);
+ Value *ShadowPtr, *OriginPtr;
+
+ // We don't know the pointer alignment (could be unaligned SSE store!).
+ // Have to assume the worst case.
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Align(1), /*isStore*/ true);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, Align(1));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ // FIXME: factor out common code from materializeStores
+ if (MS.TrackOrigins) IRB.CreateStore(getOrigin(&I, 1), OriginPtr);
+ return true;
+ }
+
+ /// Handle vector load-like intrinsics.
+ ///
+ /// Instrument intrinsics that look like a simple SIMD load: reads memory,
+ /// has 1 pointer argument, returns a vector.
+ bool handleVectorLoadIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr = nullptr, *OriginPtr = nullptr;
+ if (PropagateShadow) {
+ // We don't know the pointer alignment (could be unaligned SSE load!).
+ // Have to assume the worst case.
+ const Align Alignment = Align(1);
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow)
+ setOrigin(&I, IRB.CreateLoad(MS.OriginTy, OriginPtr));
+ else
+ setOrigin(&I, getCleanOrigin());
+ }
+ return true;
+ }
+
+ /// Handle (SIMD arithmetic)-like intrinsics.
+ ///
+ /// Instrument intrinsics with any number of arguments of the same type,
+ /// equal to the return type. The type should be simple (no aggregates or
+ /// pointers; vectors are fine).
+ /// Caller guarantees that this intrinsic does not access memory.
+ bool maybeHandleSimpleNomemIntrinsic(IntrinsicInst &I) {
+ Type *RetTy = I.getType();
+ if (!(RetTy->isIntOrIntVectorTy() ||
+ RetTy->isFPOrFPVectorTy() ||
+ RetTy->isX86_MMXTy()))
+ return false;
+
+ unsigned NumArgOperands = I.getNumArgOperands();
+ for (unsigned i = 0; i < NumArgOperands; ++i) {
+ Type *Ty = I.getArgOperand(i)->getType();
+ if (Ty != RetTy)
+ return false;
+ }
+
+ IRBuilder<> IRB(&I);
+ ShadowAndOriginCombiner SC(this, IRB);
+ for (unsigned i = 0; i < NumArgOperands; ++i)
+ SC.Add(I.getArgOperand(i));
+ SC.Done(&I);
+
+ return true;
+ }
+
+ /// Heuristically instrument unknown intrinsics.
+ ///
+ /// The main purpose of this code is to do something reasonable with all
+ /// random intrinsics we might encounter, most importantly - SIMD intrinsics.
+ /// We recognize several classes of intrinsics by their argument types and
+ /// ModRefBehaviour and apply special instrumentation when we are reasonably
+ /// sure that we know what the intrinsic does.
+ ///
+ /// We special-case intrinsics where this approach fails. See llvm.bswap
+ /// handling as an example of that.
+ bool handleUnknownIntrinsic(IntrinsicInst &I) {
+ unsigned NumArgOperands = I.getNumArgOperands();
+ if (NumArgOperands == 0)
+ return false;
+
+ if (NumArgOperands == 2 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getArgOperand(1)->getType()->isVectorTy() &&
+ I.getType()->isVoidTy() &&
+ !I.onlyReadsMemory()) {
+ // This looks like a vector store.
+ return handleVectorStoreIntrinsic(I);
+ }
+
+ if (NumArgOperands == 1 &&
+ I.getArgOperand(0)->getType()->isPointerTy() &&
+ I.getType()->isVectorTy() &&
+ I.onlyReadsMemory()) {
+ // This looks like a vector load.
+ return handleVectorLoadIntrinsic(I);
+ }
+
+ if (I.doesNotAccessMemory())
+ if (maybeHandleSimpleNomemIntrinsic(I))
+ return true;
+
+ // FIXME: detect and handle SSE maskstore/maskload
+ return false;
+ }
+
+ void handleInvariantGroup(IntrinsicInst &I) {
+ setShadow(&I, getShadow(&I, 0));
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void handleLifetimeStart(IntrinsicInst &I) {
+ if (!PoisonStack)
+ return;
AllocaInst *AI = llvm::findAllocaForValue(I.getArgOperand(1));
- if (!AI)
- InstrumentLifetimeStart = false;
- LifetimeStartList.push_back(std::make_pair(&I, AI));
- }
-
- void handleBswap(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Op = I.getArgOperand(0);
- Type *OpType = Op->getType();
- Function *BswapFunc = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
- setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
- setOrigin(&I, getOrigin(Op));
- }
-
- // Instrument vector convert intrinsic.
- //
- // This function instruments intrinsics like cvtsi2ss:
- // %Out = int_xxx_cvtyyy(%ConvertOp)
- // or
- // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
- // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
- // number of \p Out elements, and (if it has 2 arguments) copies the rest of
- // the elements from \p CopyOp.
- // In most cases the conversion involves a floating-point value, which may
- // trigger a hardware exception when not fully initialized. For this reason we
- // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
- // otherwise.
- // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
- // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
- // return a fully initialized value.
+ if (!AI)
+ InstrumentLifetimeStart = false;
+ LifetimeStartList.push_back(std::make_pair(&I, AI));
+ }
+
+ void handleBswap(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Op = I.getArgOperand(0);
+ Type *OpType = Op->getType();
+ Function *BswapFunc = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::bswap, makeArrayRef(&OpType, 1));
+ setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+ setOrigin(&I, getOrigin(Op));
+ }
+
+ // Instrument vector convert intrinsic.
+ //
+ // This function instruments intrinsics like cvtsi2ss:
+ // %Out = int_xxx_cvtyyy(%ConvertOp)
+ // or
+ // %Out = int_xxx_cvtyyy(%CopyOp, %ConvertOp)
+ // Intrinsic converts \p NumUsedElements elements of \p ConvertOp to the same
+ // number of \p Out elements, and (if it has 2 arguments) copies the rest of
+ // the elements from \p CopyOp.
+ // In most cases the conversion involves a floating-point value, which may
+ // trigger a hardware exception when not fully initialized. For this reason we
+ // require \p ConvertOp[0:NumUsedElements] to be fully initialized and trap
+ // otherwise.
+ // We copy the shadow of \p CopyOp[NumUsedElements:] to \p
+ // Out[NumUsedElements:]. This means that intrinsics without \p CopyOp always
+ // return a fully initialized value.
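+ //
+ // For example, cvtsd2si is handled with NumUsedElements == 1 and no CopyOp:
+ // element 0 of its operand must be fully initialized (a shadow check is
+ // emitted for it), and the scalar result is treated as fully initialized.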
void handleVectorConvertIntrinsic(IntrinsicInst &I, int NumUsedElements,
bool HasRoundingMode = false) {
- IRBuilder<> IRB(&I);
- Value *CopyOp, *ConvertOp;
-
+ IRBuilder<> IRB(&I);
+ Value *CopyOp, *ConvertOp;
+
assert((!HasRoundingMode ||
isa<ConstantInt>(I.getArgOperand(I.getNumArgOperands() - 1))) &&
"Invalid rounding mode");
switch (I.getNumArgOperands() - HasRoundingMode) {
- case 2:
- CopyOp = I.getArgOperand(0);
- ConvertOp = I.getArgOperand(1);
- break;
- case 1:
- ConvertOp = I.getArgOperand(0);
- CopyOp = nullptr;
- break;
- default:
- llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
- }
-
- // The first *NumUsedElements* elements of ConvertOp are converted to the
- // same number of output elements. The rest of the output is copied from
- // CopyOp, or (if not available) filled with zeroes.
- // Combine shadow for elements of ConvertOp that are used in this operation,
- // and insert a check.
- // FIXME: consider propagating shadow of ConvertOp, at least in the case of
- // int->any conversion.
- Value *ConvertShadow = getShadow(ConvertOp);
- Value *AggShadow = nullptr;
- if (ConvertOp->getType()->isVectorTy()) {
- AggShadow = IRB.CreateExtractElement(
- ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
- for (int i = 1; i < NumUsedElements; ++i) {
- Value *MoreShadow = IRB.CreateExtractElement(
- ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
- AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
- }
- } else {
- AggShadow = ConvertShadow;
- }
- assert(AggShadow->getType()->isIntegerTy());
- insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
-
- // Build result shadow by zero-filling parts of CopyOp shadow that come from
- // ConvertOp.
- if (CopyOp) {
- assert(CopyOp->getType() == I.getType());
- assert(CopyOp->getType()->isVectorTy());
- Value *ResultShadow = getShadow(CopyOp);
- Type *EltTy = cast<VectorType>(ResultShadow->getType())->getElementType();
- for (int i = 0; i < NumUsedElements; ++i) {
- ResultShadow = IRB.CreateInsertElement(
- ResultShadow, ConstantInt::getNullValue(EltTy),
- ConstantInt::get(IRB.getInt32Ty(), i));
- }
- setShadow(&I, ResultShadow);
- setOrigin(&I, getOrigin(CopyOp));
- } else {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
- }
-
- // Given a scalar or vector, extract the lower 64 bits (or fewer), and return
- // all zeroes if that value is zero, and all ones otherwise.
- Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
- if (S->getType()->isVectorTy())
- S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
- assert(S->getType()->getPrimitiveSizeInBits() <= 64);
- Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
- return CreateShadowCast(IRB, S2, T, /* Signed */ true);
- }
-
- // Given a vector, extract its first element, and return all
- // zeroes if it is zero, and all ones otherwise.
- Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
- Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
- Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
- return CreateShadowCast(IRB, S2, T, /* Signed */ true);
- }
-
- Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
- Type *T = S->getType();
- assert(T->isVectorTy());
- Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
- return IRB.CreateSExt(S2, T);
- }
-
- // Instrument vector shift intrinsic.
- //
- // This function instruments intrinsics like int_x86_avx2_psll_w.
- // Intrinsic shifts %In by %ShiftSize bits.
- // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
- // size, and the rest is ignored. Behavior is defined even if shift size is
- // greater than register (or field) width.
- void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
- assert(I.getNumArgOperands() == 2);
- IRBuilder<> IRB(&I);
- // If any of the S2 bits are poisoned, the whole thing is poisoned.
- // Otherwise perform the same shift on S1.
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
- : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
- Value *V1 = I.getOperand(0);
- Value *V2 = I.getOperand(1);
- Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
- {IRB.CreateBitCast(S1, V1->getType()), V2});
- Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
- setShadow(&I, IRB.CreateOr(Shift, S2Conv));
- setOriginForNaryOp(I);
- }
-
- // Get an X86_MMX-sized vector type.
- Type *getMMXVectorTy(unsigned EltSizeInBits) {
- const unsigned X86_MMXSizeInBits = 64;
- assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
- "Illegal MMX vector element size");
- return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
- X86_MMXSizeInBits / EltSizeInBits);
- }
-
- // Returns a signed counterpart for an (un)signed-saturate-and-pack
- // intrinsic.
- Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
- switch (id) {
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packuswb_128:
- return Intrinsic::x86_sse2_packsswb_128;
-
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse41_packusdw:
- return Intrinsic::x86_sse2_packssdw_128;
-
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packuswb:
- return Intrinsic::x86_avx2_packsswb;
-
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packusdw:
- return Intrinsic::x86_avx2_packssdw;
-
- case Intrinsic::x86_mmx_packsswb:
- case Intrinsic::x86_mmx_packuswb:
- return Intrinsic::x86_mmx_packsswb;
-
- case Intrinsic::x86_mmx_packssdw:
- return Intrinsic::x86_mmx_packssdw;
- default:
- llvm_unreachable("unexpected intrinsic id");
- }
- }
-
- // Instrument vector pack intrinsic.
- //
- // This function instruments intrinsics like x86_mmx_packsswb, which
- // pack elements of 2 input vectors into half as many bits with saturation.
- // Shadow is propagated with the signed variant of the same intrinsic applied
- // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
- // EltSizeInBits is used only for x86mmx arguments.
- void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
- assert(I.getNumArgOperands() == 2);
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- IRBuilder<> IRB(&I);
- Value *S1 = getShadow(&I, 0);
- Value *S2 = getShadow(&I, 1);
- assert(isX86_MMX || S1->getType()->isVectorTy());
-
- // SExt and ICmpNE below must apply to individual elements of input vectors.
- // In case of x86mmx arguments, cast them to appropriate vector types and
- // back.
- Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
- if (isX86_MMX) {
- S1 = IRB.CreateBitCast(S1, T);
- S2 = IRB.CreateBitCast(S2, T);
- }
- Value *S1_ext = IRB.CreateSExt(
- IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
- Value *S2_ext = IRB.CreateSExt(
- IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
- if (isX86_MMX) {
- Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
- S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
- S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
- }
-
- Function *ShadowFn = Intrinsic::getDeclaration(
- F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
-
- Value *S =
- IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
- if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument sum-of-absolute-differences intrinsic.
- void handleVectorSadIntrinsic(IntrinsicInst &I) {
- const unsigned SignificantBitsPerResultElement = 16;
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
- unsigned ZeroBitsPerResultElement =
- ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
-
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument multiply-add intrinsic.
- void handleVectorPmaddIntrinsic(IntrinsicInst &I,
- unsigned EltSizeInBits = 0) {
- bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
- Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- S = IRB.CreateBitCast(S, ResTy);
- S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
- ResTy);
- S = IRB.CreateBitCast(S, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument compare-packed intrinsic.
- // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
- // all-ones shadow.
- void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Type *ResTy = getShadowTy(&I);
- Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- Value *S = IRB.CreateSExt(
- IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument compare-scalar intrinsic.
- // This handles both cmp* intrinsics which return the result in the first
- // element of a vector, and comi* which return the result as i32.
- void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
- Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- // Instrument generic vector reduction intrinsics
- // by ORing together all their fields.
- void handleVectorReduceIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
+ case 2:
+ CopyOp = I.getArgOperand(0);
+ ConvertOp = I.getArgOperand(1);
+ break;
+ case 1:
+ ConvertOp = I.getArgOperand(0);
+ CopyOp = nullptr;
+ break;
+ default:
+ llvm_unreachable("Cvt intrinsic with unsupported number of arguments.");
+ }
+
+ // The first *NumUsedElements* elements of ConvertOp are converted to the
+ // same number of output elements. The rest of the output is copied from
+ // CopyOp, or (if not available) filled with zeroes.
+ // Combine shadow for elements of ConvertOp that are used in this operation,
+ // and insert a check.
+ // FIXME: consider propagating shadow of ConvertOp, at least in the case of
+ // int->any conversion.
+ Value *ConvertShadow = getShadow(ConvertOp);
+ Value *AggShadow = nullptr;
+ if (ConvertOp->getType()->isVectorTy()) {
+ AggShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1; i < NumUsedElements; ++i) {
+ Value *MoreShadow = IRB.CreateExtractElement(
+ ConvertShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ AggShadow = IRB.CreateOr(AggShadow, MoreShadow);
+ }
+ } else {
+ AggShadow = ConvertShadow;
+ }
+ assert(AggShadow->getType()->isIntegerTy());
+ insertShadowCheck(AggShadow, getOrigin(ConvertOp), &I);
+
+ // Build result shadow by zero-filling parts of CopyOp shadow that come from
+ // ConvertOp.
+ if (CopyOp) {
+ assert(CopyOp->getType() == I.getType());
+ assert(CopyOp->getType()->isVectorTy());
+ Value *ResultShadow = getShadow(CopyOp);
+ Type *EltTy = cast<VectorType>(ResultShadow->getType())->getElementType();
+ for (int i = 0; i < NumUsedElements; ++i) {
+ ResultShadow = IRB.CreateInsertElement(
+ ResultShadow, ConstantInt::getNullValue(EltTy),
+ ConstantInt::get(IRB.getInt32Ty(), i));
+ }
+ setShadow(&I, ResultShadow);
+ setOrigin(&I, getOrigin(CopyOp));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+
+ // Given a scalar or vector, extract the lower 64 bits (or fewer), and return
+ // all zeroes if that value is zero, and all ones otherwise.
+ Value *Lower64ShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ if (S->getType()->isVectorTy())
+ S = CreateShadowCast(IRB, S, IRB.getInt64Ty(), /* Signed */ true);
+ assert(S->getType()->getPrimitiveSizeInBits() <= 64);
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
+ // Given a vector, extract its first element, and return all
+ // zeroes if it is zero, and all ones otherwise.
+ Value *LowerElementShadowExtend(IRBuilder<> &IRB, Value *S, Type *T) {
+ Value *S1 = IRB.CreateExtractElement(S, (uint64_t)0);
+ Value *S2 = IRB.CreateICmpNE(S1, getCleanShadow(S1));
+ return CreateShadowCast(IRB, S2, T, /* Signed */ true);
+ }
+
+ Value *VariableShadowExtend(IRBuilder<> &IRB, Value *S) {
+ Type *T = S->getType();
+ assert(T->isVectorTy());
+ Value *S2 = IRB.CreateICmpNE(S, getCleanShadow(S));
+ return IRB.CreateSExt(S2, T);
+ }
+
+ // Instrument vector shift intrinsic.
+ //
+ // This function instruments intrinsics like int_x86_avx2_psll_w.
+ // Intrinsic shifts %In by %ShiftSize bits.
+ // %ShiftSize may be a vector. In that case the lower 64 bits determine shift
+ // size, and the rest is ignored. Behavior is defined even if shift size is
+ // greater than register (or field) width.
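+ //
+ // E.g. for psll.w the same intrinsic is applied to the bitcast shadow of the
+ // first operand, and the result is additionally poisoned everywhere if any bit
+ // of the shift amount's shadow is set.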
+ void handleVectorShiftIntrinsic(IntrinsicInst &I, bool Variable) {
+ assert(I.getNumArgOperands() == 2);
+ IRBuilder<> IRB(&I);
+ // If any of the S2 bits are poisoned, the whole thing is poisoned.
+ // Otherwise perform the same shift on S1.
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ Value *S2Conv = Variable ? VariableShadowExtend(IRB, S2)
+ : Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
+ Value *V1 = I.getOperand(0);
+ Value *V2 = I.getOperand(1);
+ Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
+ {IRB.CreateBitCast(S1, V1->getType()), V2});
+ Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
+ setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+ setOriginForNaryOp(I);
+ }
+
+ // Get an X86_MMX-sized vector type.
+ Type *getMMXVectorTy(unsigned EltSizeInBits) {
+ const unsigned X86_MMXSizeInBits = 64;
+ assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
+ "Illegal MMX vector element size");
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+ X86_MMXSizeInBits / EltSizeInBits);
+ }
+
+ // Returns a signed counterpart for an (un)signed-saturate-and-pack
+ // intrinsic.
+ Intrinsic::ID getSignedPackIntrinsic(Intrinsic::ID id) {
+ switch (id) {
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ return Intrinsic::x86_sse2_packsswb_128;
+
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse41_packusdw:
+ return Intrinsic::x86_sse2_packssdw_128;
+
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packuswb:
+ return Intrinsic::x86_avx2_packsswb;
+
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packusdw:
+ return Intrinsic::x86_avx2_packssdw;
+
+ case Intrinsic::x86_mmx_packsswb:
+ case Intrinsic::x86_mmx_packuswb:
+ return Intrinsic::x86_mmx_packsswb;
+
+ case Intrinsic::x86_mmx_packssdw:
+ return Intrinsic::x86_mmx_packssdw;
+ default:
+ llvm_unreachable("unexpected intrinsic id");
+ }
+ }
+
+ // Instrument vector pack intrinsic.
+ //
+ // This function instruments intrinsics like x86_mmx_packsswb, which
+ // pack elements of 2 input vectors into half as many bits with saturation.
+ // Shadow is propagated with the signed variant of the same intrinsic applied
+ // to sext(Sa != zeroinitializer), sext(Sb != zeroinitializer).
+ // EltSizeInBits is used only for x86mmx arguments.
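+ //
+ // E.g. for packsswb the shadow is packsswb(sext(Sa != 0), sext(Sb != 0)), so an
+ // output byte is poisoned iff any bit of the corresponding input word's shadow
+ // is set.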
+ void handleVectorPackIntrinsic(IntrinsicInst &I, unsigned EltSizeInBits = 0) {
+ assert(I.getNumArgOperands() == 2);
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ IRBuilder<> IRB(&I);
+ Value *S1 = getShadow(&I, 0);
+ Value *S2 = getShadow(&I, 1);
+ assert(isX86_MMX || S1->getType()->isVectorTy());
+
+ // SExt and ICmpNE below must apply to individual elements of input vectors.
+ // In case of x86mmx arguments, cast them to appropriate vector types and
+ // back.
+ Type *T = isX86_MMX ? getMMXVectorTy(EltSizeInBits) : S1->getType();
+ if (isX86_MMX) {
+ S1 = IRB.CreateBitCast(S1, T);
+ S2 = IRB.CreateBitCast(S2, T);
+ }
+ Value *S1_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S1, Constant::getNullValue(T)), T);
+ Value *S2_ext = IRB.CreateSExt(
+ IRB.CreateICmpNE(S2, Constant::getNullValue(T)), T);
+ if (isX86_MMX) {
+ Type *X86_MMXTy = Type::getX86_MMXTy(*MS.C);
+ S1_ext = IRB.CreateBitCast(S1_ext, X86_MMXTy);
+ S2_ext = IRB.CreateBitCast(S2_ext, X86_MMXTy);
+ }
+
+ Function *ShadowFn = Intrinsic::getDeclaration(
+ F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
+
+ Value *S =
+ IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+ if (isX86_MMX) S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument sum-of-absolute-differences intrinsic.
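+ // Each result element of psad.bw carries at most 16 significant bits, so the
+ // combined shadow is sign-extended per result element and then shifted right
+ // to leave the always-zero upper bits of the result unpoisoned.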
+ void handleVectorSadIntrinsic(IntrinsicInst &I) {
+ const unsigned SignificantBitsPerResultElement = 16;
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? IntegerType::get(*MS.C, 64) : I.getType();
+ unsigned ZeroBitsPerResultElement =
+ ResTy->getScalarSizeInBits() - SignificantBitsPerResultElement;
+
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateLShr(S, ZeroBitsPerResultElement);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument multiply-add intrinsic.
+ void handleVectorPmaddIntrinsic(IntrinsicInst &I,
+ unsigned EltSizeInBits = 0) {
+ bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
+ Type *ResTy = isX86_MMX ? getMMXVectorTy(EltSizeInBits * 2) : I.getType();
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ S = IRB.CreateBitCast(S, ResTy);
+ S = IRB.CreateSExt(IRB.CreateICmpNE(S, Constant::getNullValue(ResTy)),
+ ResTy);
+ S = IRB.CreateBitCast(S, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument compare-packed intrinsic.
+ // Basically, an or followed by sext(icmp ne 0) to end up with all-zeros or
+ // all-ones shadow.
+ void handleVectorComparePackedIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ResTy = getShadowTy(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = IRB.CreateSExt(
+ IRB.CreateICmpNE(S0, Constant::getNullValue(ResTy)), ResTy);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument compare-scalar intrinsic.
+ // This handles both cmp* intrinsics which return the result in the first
+ // element of a vector, and comi* which return the result as i32.
+ void handleVectorCompareScalarIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S0 = IRB.CreateOr(getShadow(&I, 0), getShadow(&I, 1));
+ Value *S = LowerElementShadowExtend(IRB, S0, getShadowTy(&I));
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ // Instrument generic vector reduction intrinsics
+ // by ORing together all their fields.
+ void handleVectorReduceIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
// Instrument vector.reduce.or intrinsic.
- // Valid (non-poisoned) set bits in the operand pull low the
- // corresponding shadow bits.
- void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *OperandShadow = getShadow(&I, 0);
- Value *OperandUnsetBits = IRB.CreateNot(I.getOperand(0));
- Value *OperandUnsetOrPoison = IRB.CreateOr(OperandUnsetBits, OperandShadow);
- // Bit N is clean if any field's bit N is 1 and unpoisoned
- Value *OutShadowMask = IRB.CreateAndReduce(OperandUnsetOrPoison);
- // Otherwise, it is clean if every field's bit N is unpoisoned
- Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
- Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
-
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
+ // Valid (non-poisoned) set bits in the operand pull low the
+ // corresponding shadow bits.
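+ // E.g. an element that is fully initialized and equal to -1 forces every bit
+ // of the OR-reduction to 1, so the result shadow is pulled down to all zeroes
+ // no matter how poisoned the other elements are.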
+ void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandUnsetBits = IRB.CreateNot(I.getOperand(0));
+ Value *OperandUnsetOrPoison = IRB.CreateOr(OperandUnsetBits, OperandShadow);
+ // Bit N is clean if any field's bit N is 1 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandUnsetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
// Instrument vector.reduce.and intrinsic.
- // Valid (non-poisoned) unset bits in the operand pull down the
- // corresponding shadow bits.
- void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *OperandShadow = getShadow(&I, 0);
- Value *OperandSetOrPoison = IRB.CreateOr(I.getOperand(0), OperandShadow);
- // Bit N is clean if any field's bit N is 0 and unpoisoned
- Value *OutShadowMask = IRB.CreateAndReduce(OperandSetOrPoison);
- // Otherwise, it is clean if every field's bit N is unpoisoned
- Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
- Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
-
- setShadow(&I, S);
- setOrigin(&I, getOrigin(&I, 0));
- }
-
- void handleStmxcsr(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value* Addr = I.getArgOperand(0);
- Type *Ty = IRB.getInt32Ty();
- Value *ShadowPtr =
- getShadowOriginPtr(Addr, IRB, Ty, Align(1), /*isStore*/ true).first;
-
- IRB.CreateStore(getCleanShadow(Ty),
- IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
- }
-
- void handleLdmxcsr(IntrinsicInst &I) {
- if (!InsertChecks) return;
-
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
- Type *Ty = IRB.getInt32Ty();
- const Align Alignment = Align(1);
- Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
-
- if (ClCheckAccessAddress)
- insertShadowCheck(Addr, &I);
-
- Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr");
- Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr)
- : getCleanOrigin();
- insertShadowCheck(Shadow, Origin, &I);
- }
-
- void handleMaskedStore(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *V = I.getArgOperand(0);
- Value *Addr = I.getArgOperand(1);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
- Value *Mask = I.getArgOperand(3);
- Value *Shadow = getShadow(V);
-
- Value *ShadowPtr;
- Value *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
- Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true);
-
- if (ClCheckAccessAddress) {
- insertShadowCheck(Addr, &I);
- // Uninitialized mask is kind of like uninitialized address, but not as
- // scary.
- insertShadowCheck(Mask, &I);
- }
-
- IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment, Mask);
-
- if (MS.TrackOrigins) {
- auto &DL = F.getParent()->getDataLayout();
- paintOrigin(IRB, getOrigin(V), OriginPtr,
- DL.getTypeStoreSize(Shadow->getType()),
- std::max(Alignment, kMinOriginAlignment));
- }
- }
-
- bool handleMaskedLoad(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *Addr = I.getArgOperand(0);
- const Align Alignment(
- cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
- Value *Mask = I.getArgOperand(2);
- Value *PassThru = I.getArgOperand(3);
-
- Type *ShadowTy = getShadowTy(&I);
- Value *ShadowPtr, *OriginPtr;
- if (PropagateShadow) {
- std::tie(ShadowPtr, OriginPtr) =
- getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Alignment, Mask,
- getShadow(PassThru), "_msmaskedld"));
- } else {
- setShadow(&I, getCleanShadow(&I));
- }
-
- if (ClCheckAccessAddress) {
- insertShadowCheck(Addr, &I);
- insertShadowCheck(Mask, &I);
- }
-
- if (MS.TrackOrigins) {
- if (PropagateShadow) {
- // Choose between PassThru's and the loaded value's origins.
- Value *MaskedPassThruShadow = IRB.CreateAnd(
- getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
-
- Value *Acc = IRB.CreateExtractElement(
- MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
- for (int i = 1, N = cast<FixedVectorType>(PassThru->getType())
- ->getNumElements();
- i < N; ++i) {
- Value *More = IRB.CreateExtractElement(
- MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
- Acc = IRB.CreateOr(Acc, More);
- }
-
- Value *Origin = IRB.CreateSelect(
- IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
- getOrigin(PassThru), IRB.CreateLoad(MS.OriginTy, OriginPtr));
-
- setOrigin(&I, Origin);
- } else {
- setOrigin(&I, getCleanOrigin());
- }
- }
- return true;
- }
-
- // Instrument BMI / BMI2 intrinsics.
- // All of these intrinsics are Z = I(X, Y)
- // where the types of all operands and the result match, and are either i32 or i64.
- // The following instrumentation happens to work for all of them:
- // Sz = I(Sx, Y) | (sext (Sy != 0))
- void handleBmiIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Type *ShadowTy = getShadowTy(&I);
-
- // If any bit of the mask operand is poisoned, then the whole thing is.
- Value *SMask = getShadow(&I, 1);
- SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
- ShadowTy);
- // Apply the same intrinsic to the shadow of the first operand.
- Value *S = IRB.CreateCall(I.getCalledFunction(),
- {getShadow(&I, 0), I.getOperand(1)});
- S = IRB.CreateOr(SMask, S);
- setShadow(&I, S);
- setOriginForNaryOp(I);
- }
-
- SmallVector<int, 8> getPclmulMask(unsigned Width, bool OddElements) {
- SmallVector<int, 8> Mask;
- for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
- Mask.append(2, X);
- }
- return Mask;
- }
-
- // Instrument pclmul intrinsics.
- // These intrinsics operate either on odd or on even elements of the input
- // vectors, depending on the constant in the 3rd argument, ignoring the rest.
- // Replace the unused elements with copies of the used ones, ex:
- // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
- // or
- // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
- // and then apply the usual shadow combining logic.
- void handlePclmulIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- unsigned Width =
- cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
- assert(isa<ConstantInt>(I.getArgOperand(2)) &&
- "pclmul 3rd operand must be a constant");
- unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ // Valid (non-poisoned) unset bits in the operand pull down the
+ // corresponding shadow bits.
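+ // E.g. an element that is fully initialized and equal to 0 forces every bit of
+ // the AND-reduction to 0, so the result shadow becomes clean even if the other
+ // elements are poisoned.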
+ void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandSetOrPoison = IRB.CreateOr(I.getOperand(0), OperandShadow);
+ // Bit N is clean if any field's bit N is 0 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandSetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ void handleStmxcsr(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value* Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ Value *ShadowPtr =
+ getShadowOriginPtr(Addr, IRB, Ty, Align(1), /*isStore*/ true).first;
+
+ IRB.CreateStore(getCleanShadow(Ty),
+ IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+ }
+
+ void handleLdmxcsr(IntrinsicInst &I) {
+ if (!InsertChecks) return;
+
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ Type *Ty = IRB.getInt32Ty();
+ const Align Alignment = Align(1);
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
+
+ if (ClCheckAccessAddress)
+ insertShadowCheck(Addr, &I);
+
+ Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr");
+ Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr)
+ : getCleanOrigin();
+ insertShadowCheck(Shadow, Origin, &I);
+ }
+
+ void handleMaskedStore(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *V = I.getArgOperand(0);
+ Value *Addr = I.getArgOperand(1);
+ const Align Alignment(
+ cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
+ Value *Mask = I.getArgOperand(3);
+ Value *Shadow = getShadow(V);
+
+ Value *ShadowPtr;
+ Value *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
+ Addr, IRB, Shadow->getType(), Alignment, /*isStore*/ true);
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ // Uninitialized mask is kind of like uninitialized address, but not as
+ // scary.
+ insertShadowCheck(Mask, &I);
+ }
+
+ IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment, Mask);
+
+ if (MS.TrackOrigins) {
+ auto &DL = F.getParent()->getDataLayout();
+ paintOrigin(IRB, getOrigin(V), OriginPtr,
+ DL.getTypeStoreSize(Shadow->getType()),
+ std::max(Alignment, kMinOriginAlignment));
+ }
+ }
+
+ bool handleMaskedLoad(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Addr = I.getArgOperand(0);
+ const Align Alignment(
+ cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
+ Value *Mask = I.getArgOperand(2);
+ Value *PassThru = I.getArgOperand(3);
+
+ Type *ShadowTy = getShadowTy(&I);
+ Value *ShadowPtr, *OriginPtr;
+ if (PropagateShadow) {
+ std::tie(ShadowPtr, OriginPtr) =
+ getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
+ setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Alignment, Mask,
+ getShadow(PassThru), "_msmaskedld"));
+ } else {
+ setShadow(&I, getCleanShadow(&I));
+ }
+
+ if (ClCheckAccessAddress) {
+ insertShadowCheck(Addr, &I);
+ insertShadowCheck(Mask, &I);
+ }
+
+ if (MS.TrackOrigins) {
+ if (PropagateShadow) {
+ // Choose between PassThru's and the loaded value's origins.
+ Value *MaskedPassThruShadow = IRB.CreateAnd(
+ getShadow(PassThru), IRB.CreateSExt(IRB.CreateNeg(Mask), ShadowTy));
+
+ Value *Acc = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
+ for (int i = 1, N = cast<FixedVectorType>(PassThru->getType())
+ ->getNumElements();
+ i < N; ++i) {
+ Value *More = IRB.CreateExtractElement(
+ MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
+ Acc = IRB.CreateOr(Acc, More);
+ }
+
+ Value *Origin = IRB.CreateSelect(
+ IRB.CreateICmpNE(Acc, Constant::getNullValue(Acc->getType())),
+ getOrigin(PassThru), IRB.CreateLoad(MS.OriginTy, OriginPtr));
+
+ setOrigin(&I, Origin);
+ } else {
+ setOrigin(&I, getCleanOrigin());
+ }
+ }
+ return true;
+ }
+
+ // Instrument BMI / BMI2 intrinsics.
+ // All of these intrinsics are Z = I(X, Y)
+ // where the types of all operands and the result match, and are either i32 or i64.
+ // The following instrumentation happens to work for all of them:
+ // Sz = I(Sx, Y) | (sext (Sy != 0))
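+ // E.g. bzhi clears the same high bits of the shadow that it clears in the
+ // result, and pdep/pext move shadow bits exactly where they move data bits, so
+ // reusing the intrinsic on Sx is a good approximation when Y is initialized.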
+ void handleBmiIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ShadowTy = getShadowTy(&I);
+
+ // If any bit of the mask operand is poisoned, then the whole thing is.
+ Value *SMask = getShadow(&I, 1);
+ SMask = IRB.CreateSExt(IRB.CreateICmpNE(SMask, getCleanShadow(ShadowTy)),
+ ShadowTy);
+ // Apply the same intrinsic to the shadow of the first operand.
+ Value *S = IRB.CreateCall(I.getCalledFunction(),
+ {getShadow(&I, 0), I.getOperand(1)});
+ S = IRB.CreateOr(SMask, S);
+ setShadow(&I, S);
+ setOriginForNaryOp(I);
+ }
+
+ SmallVector<int, 8> getPclmulMask(unsigned Width, bool OddElements) {
+ SmallVector<int, 8> Mask;
+ for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
+ Mask.append(2, X);
+ }
+ return Mask;
+ }
+
+ // Instrument pclmul intrinsics.
+ // These intrinsics operate either on odd or on even elements of the input
+ // vectors, depending on the constant in the 3rd argument, ignoring the rest.
+ // Replace the unused elements with copies of the used ones, ex:
+ // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
+ // or
+ // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
+ // and then apply the usual shadow combining logic.
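+ // E.g. with an immediate of 0x00 both inputs use their even (low) halves, so
+ // each shadow is shuffled so that every odd lane is replaced with a copy of
+ // the preceding even lane before the usual OR-combining of shadows and origins.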
+ void handlePclmulIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ unsigned Width =
+ cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+ assert(isa<ConstantInt>(I.getArgOperand(2)) &&
+ "pclmul 3rd operand must be a constant");
+ unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
Value *Shuf0 = IRB.CreateShuffleVector(getShadow(&I, 0),
getPclmulMask(Width, Imm & 0x01));
Value *Shuf1 = IRB.CreateShuffleVector(getShadow(&I, 1),
getPclmulMask(Width, Imm & 0x10));
- ShadowAndOriginCombiner SOC(this, IRB);
- SOC.Add(Shuf0, getOrigin(&I, 0));
- SOC.Add(Shuf1, getOrigin(&I, 1));
- SOC.Done(&I);
- }
-
- // Instrument _mm_*_sd intrinsics
- void handleUnarySdIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *First = getShadow(&I, 0);
- Value *Second = getShadow(&I, 1);
- // High word of first operand, low word of second
- Value *Shadow =
- IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef<int>({2, 1}));
-
- setShadow(&I, Shadow);
- setOriginForNaryOp(I);
- }
-
- void handleBinarySdIntrinsic(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *First = getShadow(&I, 0);
- Value *Second = getShadow(&I, 1);
- Value *OrShadow = IRB.CreateOr(First, Second);
- // High word of first operand, low word of both OR'd together
- Value *Shadow = IRB.CreateShuffleVector(First, OrShadow,
- llvm::makeArrayRef<int>({2, 1}));
-
- setShadow(&I, Shadow);
- setOriginForNaryOp(I);
- }
-
+ ShadowAndOriginCombiner SOC(this, IRB);
+ SOC.Add(Shuf0, getOrigin(&I, 0));
+ SOC.Add(Shuf1, getOrigin(&I, 1));
+ SOC.Done(&I);
+ }
+
+ // Instrument _mm_*_sd intrinsics
+ void handleUnarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ // High word of first operand, low word of second
+ Value *Shadow =
+ IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
+
+ void handleBinarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ Value *OrShadow = IRB.CreateOr(First, Second);
+ // High word of first operand, low word of both OR'd together
+ Value *Shadow = IRB.CreateShuffleVector(First, OrShadow,
+ llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
+
// Instrument abs intrinsic.
// handleUnknownIntrinsic can't handle it because of the last
// is_int_min_poison argument which does not match the result type.
@@ -3244,282 +3244,282 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(&I, 0));
}
- void visitIntrinsicInst(IntrinsicInst &I) {
- switch (I.getIntrinsicID()) {
+ void visitIntrinsicInst(IntrinsicInst &I) {
+ switch (I.getIntrinsicID()) {
case Intrinsic::abs:
handleAbsIntrinsic(I);
break;
- case Intrinsic::lifetime_start:
- handleLifetimeStart(I);
- break;
- case Intrinsic::launder_invariant_group:
- case Intrinsic::strip_invariant_group:
- handleInvariantGroup(I);
- break;
- case Intrinsic::bswap:
- handleBswap(I);
- break;
- case Intrinsic::masked_store:
- handleMaskedStore(I);
- break;
- case Intrinsic::masked_load:
- handleMaskedLoad(I);
- break;
+ case Intrinsic::lifetime_start:
+ handleLifetimeStart(I);
+ break;
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ handleInvariantGroup(I);
+ break;
+ case Intrinsic::bswap:
+ handleBswap(I);
+ break;
+ case Intrinsic::masked_store:
+ handleMaskedStore(I);
+ break;
+ case Intrinsic::masked_load:
+ handleMaskedLoad(I);
+ break;
case Intrinsic::vector_reduce_and:
- handleVectorReduceAndIntrinsic(I);
- break;
+ handleVectorReduceAndIntrinsic(I);
+ break;
case Intrinsic::vector_reduce_or:
- handleVectorReduceOrIntrinsic(I);
- break;
+ handleVectorReduceOrIntrinsic(I);
+ break;
case Intrinsic::vector_reduce_add:
case Intrinsic::vector_reduce_xor:
case Intrinsic::vector_reduce_mul:
- handleVectorReduceIntrinsic(I);
- break;
- case Intrinsic::x86_sse_stmxcsr:
- handleStmxcsr(I);
- break;
- case Intrinsic::x86_sse_ldmxcsr:
- handleLdmxcsr(I);
- break;
- case Intrinsic::x86_avx512_vcvtsd2usi64:
- case Intrinsic::x86_avx512_vcvtsd2usi32:
- case Intrinsic::x86_avx512_vcvtss2usi64:
- case Intrinsic::x86_avx512_vcvtss2usi32:
- case Intrinsic::x86_avx512_cvttss2usi64:
- case Intrinsic::x86_avx512_cvttss2usi:
- case Intrinsic::x86_avx512_cvttsd2usi64:
- case Intrinsic::x86_avx512_cvttsd2usi:
- case Intrinsic::x86_avx512_cvtusi2ss:
- case Intrinsic::x86_avx512_cvtusi642sd:
- case Intrinsic::x86_avx512_cvtusi642ss:
+ handleVectorReduceIntrinsic(I);
+ break;
+ case Intrinsic::x86_sse_stmxcsr:
+ handleStmxcsr(I);
+ break;
+ case Intrinsic::x86_sse_ldmxcsr:
+ handleLdmxcsr(I);
+ break;
+ case Intrinsic::x86_avx512_vcvtsd2usi64:
+ case Intrinsic::x86_avx512_vcvtsd2usi32:
+ case Intrinsic::x86_avx512_vcvtss2usi64:
+ case Intrinsic::x86_avx512_vcvtss2usi32:
+ case Intrinsic::x86_avx512_cvttss2usi64:
+ case Intrinsic::x86_avx512_cvttss2usi:
+ case Intrinsic::x86_avx512_cvttsd2usi64:
+ case Intrinsic::x86_avx512_cvttsd2usi:
+ case Intrinsic::x86_avx512_cvtusi2ss:
+ case Intrinsic::x86_avx512_cvtusi642sd:
+ case Intrinsic::x86_avx512_cvtusi642ss:
handleVectorConvertIntrinsic(I, 1, true);
break;
- case Intrinsic::x86_sse2_cvtsd2si64:
- case Intrinsic::x86_sse2_cvtsd2si:
- case Intrinsic::x86_sse2_cvtsd2ss:
- case Intrinsic::x86_sse2_cvttsd2si64:
- case Intrinsic::x86_sse2_cvttsd2si:
- case Intrinsic::x86_sse_cvtss2si64:
- case Intrinsic::x86_sse_cvtss2si:
- case Intrinsic::x86_sse_cvttss2si64:
- case Intrinsic::x86_sse_cvttss2si:
- handleVectorConvertIntrinsic(I, 1);
- break;
- case Intrinsic::x86_sse_cvtps2pi:
- case Intrinsic::x86_sse_cvttps2pi:
- handleVectorConvertIntrinsic(I, 2);
- break;
-
- case Intrinsic::x86_avx512_psll_w_512:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_pslli_w_512:
- case Intrinsic::x86_avx512_pslli_d_512:
- case Intrinsic::x86_avx512_pslli_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psrli_w_512:
- case Intrinsic::x86_avx512_psrli_d_512:
- case Intrinsic::x86_avx512_psrli_q_512:
- case Intrinsic::x86_avx512_psrai_w_512:
- case Intrinsic::x86_avx512_psrai_d_512:
- case Intrinsic::x86_avx512_psrai_q_512:
- case Intrinsic::x86_avx512_psra_q_256:
- case Intrinsic::x86_avx512_psra_q_128:
- case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx2_psll_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx2_pslli_d:
- case Intrinsic::x86_avx2_pslli_q:
- case Intrinsic::x86_avx2_psrl_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psra_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx2_psrli_d:
- case Intrinsic::x86_avx2_psrli_q:
- case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx2_psrai_d:
- case Intrinsic::x86_sse2_psll_w:
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_sse2_pslli_d:
- case Intrinsic::x86_sse2_pslli_q:
- case Intrinsic::x86_sse2_psrl_w:
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psra_w:
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_sse2_psrli_d:
- case Intrinsic::x86_sse2_psrli_q:
- case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_sse2_psrai_d:
- case Intrinsic::x86_mmx_psll_w:
- case Intrinsic::x86_mmx_psll_d:
- case Intrinsic::x86_mmx_psll_q:
- case Intrinsic::x86_mmx_pslli_w:
- case Intrinsic::x86_mmx_pslli_d:
- case Intrinsic::x86_mmx_pslli_q:
- case Intrinsic::x86_mmx_psrl_w:
- case Intrinsic::x86_mmx_psrl_d:
- case Intrinsic::x86_mmx_psrl_q:
- case Intrinsic::x86_mmx_psra_w:
- case Intrinsic::x86_mmx_psra_d:
- case Intrinsic::x86_mmx_psrli_w:
- case Intrinsic::x86_mmx_psrli_d:
- case Intrinsic::x86_mmx_psrli_q:
- case Intrinsic::x86_mmx_psrai_w:
- case Intrinsic::x86_mmx_psrai_d:
- handleVectorShiftIntrinsic(I, /* Variable */ false);
- break;
- case Intrinsic::x86_avx2_psllv_d:
- case Intrinsic::x86_avx2_psllv_d_256:
- case Intrinsic::x86_avx512_psllv_d_512:
- case Intrinsic::x86_avx2_psllv_q:
- case Intrinsic::x86_avx2_psllv_q_256:
- case Intrinsic::x86_avx512_psllv_q_512:
- case Intrinsic::x86_avx2_psrlv_d:
- case Intrinsic::x86_avx2_psrlv_d_256:
- case Intrinsic::x86_avx512_psrlv_d_512:
- case Intrinsic::x86_avx2_psrlv_q:
- case Intrinsic::x86_avx2_psrlv_q_256:
- case Intrinsic::x86_avx512_psrlv_q_512:
- case Intrinsic::x86_avx2_psrav_d:
- case Intrinsic::x86_avx2_psrav_d_256:
- case Intrinsic::x86_avx512_psrav_d_512:
- case Intrinsic::x86_avx512_psrav_q_128:
- case Intrinsic::x86_avx512_psrav_q_256:
- case Intrinsic::x86_avx512_psrav_q_512:
- handleVectorShiftIntrinsic(I, /* Variable */ true);
- break;
-
- case Intrinsic::x86_sse2_packsswb_128:
- case Intrinsic::x86_sse2_packssdw_128:
- case Intrinsic::x86_sse2_packuswb_128:
- case Intrinsic::x86_sse41_packusdw:
- case Intrinsic::x86_avx2_packsswb:
- case Intrinsic::x86_avx2_packssdw:
- case Intrinsic::x86_avx2_packuswb:
- case Intrinsic::x86_avx2_packusdw:
- handleVectorPackIntrinsic(I);
- break;
-
- case Intrinsic::x86_mmx_packsswb:
- case Intrinsic::x86_mmx_packuswb:
- handleVectorPackIntrinsic(I, 16);
- break;
-
- case Intrinsic::x86_mmx_packssdw:
- handleVectorPackIntrinsic(I, 32);
- break;
-
- case Intrinsic::x86_mmx_psad_bw:
- case Intrinsic::x86_sse2_psad_bw:
- case Intrinsic::x86_avx2_psad_bw:
- handleVectorSadIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse2_pmadd_wd:
- case Intrinsic::x86_avx2_pmadd_wd:
- case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
- case Intrinsic::x86_avx2_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I);
- break;
-
- case Intrinsic::x86_ssse3_pmadd_ub_sw:
- handleVectorPmaddIntrinsic(I, 8);
- break;
-
- case Intrinsic::x86_mmx_pmadd_wd:
- handleVectorPmaddIntrinsic(I, 16);
- break;
-
- case Intrinsic::x86_sse_cmp_ss:
- case Intrinsic::x86_sse2_cmp_sd:
- case Intrinsic::x86_sse_comieq_ss:
- case Intrinsic::x86_sse_comilt_ss:
- case Intrinsic::x86_sse_comile_ss:
- case Intrinsic::x86_sse_comigt_ss:
- case Intrinsic::x86_sse_comige_ss:
- case Intrinsic::x86_sse_comineq_ss:
- case Intrinsic::x86_sse_ucomieq_ss:
- case Intrinsic::x86_sse_ucomilt_ss:
- case Intrinsic::x86_sse_ucomile_ss:
- case Intrinsic::x86_sse_ucomigt_ss:
- case Intrinsic::x86_sse_ucomige_ss:
- case Intrinsic::x86_sse_ucomineq_ss:
- case Intrinsic::x86_sse2_comieq_sd:
- case Intrinsic::x86_sse2_comilt_sd:
- case Intrinsic::x86_sse2_comile_sd:
- case Intrinsic::x86_sse2_comigt_sd:
- case Intrinsic::x86_sse2_comige_sd:
- case Intrinsic::x86_sse2_comineq_sd:
- case Intrinsic::x86_sse2_ucomieq_sd:
- case Intrinsic::x86_sse2_ucomilt_sd:
- case Intrinsic::x86_sse2_ucomile_sd:
- case Intrinsic::x86_sse2_ucomigt_sd:
- case Intrinsic::x86_sse2_ucomige_sd:
- case Intrinsic::x86_sse2_ucomineq_sd:
- handleVectorCompareScalarIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse_cmp_ps:
- case Intrinsic::x86_sse2_cmp_pd:
- // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
- // generates reasonably looking IR that fails in the backend with "Do not
- // know how to split the result of this operator!".
- handleVectorComparePackedIntrinsic(I);
- break;
-
- case Intrinsic::x86_bmi_bextr_32:
- case Intrinsic::x86_bmi_bextr_64:
- case Intrinsic::x86_bmi_bzhi_32:
- case Intrinsic::x86_bmi_bzhi_64:
- case Intrinsic::x86_bmi_pdep_32:
- case Intrinsic::x86_bmi_pdep_64:
- case Intrinsic::x86_bmi_pext_32:
- case Intrinsic::x86_bmi_pext_64:
- handleBmiIntrinsic(I);
- break;
-
- case Intrinsic::x86_pclmulqdq:
- case Intrinsic::x86_pclmulqdq_256:
- case Intrinsic::x86_pclmulqdq_512:
- handlePclmulIntrinsic(I);
- break;
-
- case Intrinsic::x86_sse41_round_sd:
- handleUnarySdIntrinsic(I);
- break;
- case Intrinsic::x86_sse2_max_sd:
- case Intrinsic::x86_sse2_min_sd:
- handleBinarySdIntrinsic(I);
- break;
-
- case Intrinsic::is_constant:
- // The result of llvm.is.constant() is always defined.
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- break;
-
- default:
- if (!handleUnknownIntrinsic(I))
- visitInstruction(I);
- break;
- }
- }
-
+ case Intrinsic::x86_sse2_cvtsd2si64:
+ case Intrinsic::x86_sse2_cvtsd2si:
+ case Intrinsic::x86_sse2_cvtsd2ss:
+ case Intrinsic::x86_sse2_cvttsd2si64:
+ case Intrinsic::x86_sse2_cvttsd2si:
+ case Intrinsic::x86_sse_cvtss2si64:
+ case Intrinsic::x86_sse_cvtss2si:
+ case Intrinsic::x86_sse_cvttss2si64:
+ case Intrinsic::x86_sse_cvttss2si:
+ handleVectorConvertIntrinsic(I, 1);
+ break;
+ case Intrinsic::x86_sse_cvtps2pi:
+ case Intrinsic::x86_sse_cvttps2pi:
+ handleVectorConvertIntrinsic(I, 2);
+ break;
+
+ case Intrinsic::x86_avx512_psll_w_512:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_pslli_w_512:
+ case Intrinsic::x86_avx512_pslli_d_512:
+ case Intrinsic::x86_avx512_pslli_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psrli_w_512:
+ case Intrinsic::x86_avx512_psrli_d_512:
+ case Intrinsic::x86_avx512_psrli_q_512:
+ case Intrinsic::x86_avx512_psrai_w_512:
+ case Intrinsic::x86_avx512_psrai_d_512:
+ case Intrinsic::x86_avx512_psrai_q_512:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psrai_q_256:
+ case Intrinsic::x86_avx512_psrai_q_128:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_pslli_w:
+ case Intrinsic::x86_avx2_pslli_d:
+ case Intrinsic::x86_avx2_pslli_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psrli_w:
+ case Intrinsic::x86_avx2_psrli_d:
+ case Intrinsic::x86_avx2_psrli_q:
+ case Intrinsic::x86_avx2_psrai_w:
+ case Intrinsic::x86_avx2_psrai_d:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_pslli_w:
+ case Intrinsic::x86_sse2_pslli_d:
+ case Intrinsic::x86_sse2_pslli_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psrli_w:
+ case Intrinsic::x86_sse2_psrli_d:
+ case Intrinsic::x86_sse2_psrli_q:
+ case Intrinsic::x86_sse2_psrai_w:
+ case Intrinsic::x86_sse2_psrai_d:
+ case Intrinsic::x86_mmx_psll_w:
+ case Intrinsic::x86_mmx_psll_d:
+ case Intrinsic::x86_mmx_psll_q:
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrl_w:
+ case Intrinsic::x86_mmx_psrl_d:
+ case Intrinsic::x86_mmx_psrl_q:
+ case Intrinsic::x86_mmx_psra_w:
+ case Intrinsic::x86_mmx_psra_d:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d:
+ handleVectorShiftIntrinsic(I, /* Variable */ false);
+ break;
+ case Intrinsic::x86_avx2_psllv_d:
+ case Intrinsic::x86_avx2_psllv_d_256:
+ case Intrinsic::x86_avx512_psllv_d_512:
+ case Intrinsic::x86_avx2_psllv_q:
+ case Intrinsic::x86_avx2_psllv_q_256:
+ case Intrinsic::x86_avx512_psllv_q_512:
+ case Intrinsic::x86_avx2_psrlv_d:
+ case Intrinsic::x86_avx2_psrlv_d_256:
+ case Intrinsic::x86_avx512_psrlv_d_512:
+ case Intrinsic::x86_avx2_psrlv_q:
+ case Intrinsic::x86_avx2_psrlv_q_256:
+ case Intrinsic::x86_avx512_psrlv_q_512:
+ case Intrinsic::x86_avx2_psrav_d:
+ case Intrinsic::x86_avx2_psrav_d_256:
+ case Intrinsic::x86_avx512_psrav_d_512:
+ case Intrinsic::x86_avx512_psrav_q_128:
+ case Intrinsic::x86_avx512_psrav_q_256:
+ case Intrinsic::x86_avx512_psrav_q_512:
+ handleVectorShiftIntrinsic(I, /* Variable */ true);
+ break;
+
+ case Intrinsic::x86_sse2_packsswb_128:
+ case Intrinsic::x86_sse2_packssdw_128:
+ case Intrinsic::x86_sse2_packuswb_128:
+ case Intrinsic::x86_sse41_packusdw:
+ case Intrinsic::x86_avx2_packsswb:
+ case Intrinsic::x86_avx2_packssdw:
+ case Intrinsic::x86_avx2_packuswb:
+ case Intrinsic::x86_avx2_packusdw:
+ handleVectorPackIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_mmx_packsswb:
+ case Intrinsic::x86_mmx_packuswb:
+ handleVectorPackIntrinsic(I, 16);
+ break;
+
+ case Intrinsic::x86_mmx_packssdw:
+ handleVectorPackIntrinsic(I, 32);
+ break;
+
+ case Intrinsic::x86_mmx_psad_bw:
+ case Intrinsic::x86_sse2_psad_bw:
+ case Intrinsic::x86_avx2_psad_bw:
+ handleVectorSadIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse2_pmadd_wd:
+ case Intrinsic::x86_avx2_pmadd_wd:
+ case Intrinsic::x86_ssse3_pmadd_ub_sw_128:
+ case Intrinsic::x86_avx2_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_ssse3_pmadd_ub_sw:
+ handleVectorPmaddIntrinsic(I, 8);
+ break;
+
+ case Intrinsic::x86_mmx_pmadd_wd:
+ handleVectorPmaddIntrinsic(I, 16);
+ break;
+
+ case Intrinsic::x86_sse_cmp_ss:
+ case Intrinsic::x86_sse2_cmp_sd:
+ case Intrinsic::x86_sse_comieq_ss:
+ case Intrinsic::x86_sse_comilt_ss:
+ case Intrinsic::x86_sse_comile_ss:
+ case Intrinsic::x86_sse_comigt_ss:
+ case Intrinsic::x86_sse_comige_ss:
+ case Intrinsic::x86_sse_comineq_ss:
+ case Intrinsic::x86_sse_ucomieq_ss:
+ case Intrinsic::x86_sse_ucomilt_ss:
+ case Intrinsic::x86_sse_ucomile_ss:
+ case Intrinsic::x86_sse_ucomigt_ss:
+ case Intrinsic::x86_sse_ucomige_ss:
+ case Intrinsic::x86_sse_ucomineq_ss:
+ case Intrinsic::x86_sse2_comieq_sd:
+ case Intrinsic::x86_sse2_comilt_sd:
+ case Intrinsic::x86_sse2_comile_sd:
+ case Intrinsic::x86_sse2_comigt_sd:
+ case Intrinsic::x86_sse2_comige_sd:
+ case Intrinsic::x86_sse2_comineq_sd:
+ case Intrinsic::x86_sse2_ucomieq_sd:
+ case Intrinsic::x86_sse2_ucomilt_sd:
+ case Intrinsic::x86_sse2_ucomile_sd:
+ case Intrinsic::x86_sse2_ucomigt_sd:
+ case Intrinsic::x86_sse2_ucomige_sd:
+ case Intrinsic::x86_sse2_ucomineq_sd:
+ handleVectorCompareScalarIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse_cmp_ps:
+ case Intrinsic::x86_sse2_cmp_pd:
+ // FIXME: For x86_avx_cmp_pd_256 and x86_avx_cmp_ps_256 this function
+ // generates reasonably looking IR that fails in the backend with "Do not
+ // know how to split the result of this operator!".
+ handleVectorComparePackedIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_bmi_bextr_32:
+ case Intrinsic::x86_bmi_bextr_64:
+ case Intrinsic::x86_bmi_bzhi_32:
+ case Intrinsic::x86_bmi_bzhi_64:
+ case Intrinsic::x86_bmi_pdep_32:
+ case Intrinsic::x86_bmi_pdep_64:
+ case Intrinsic::x86_bmi_pext_32:
+ case Intrinsic::x86_bmi_pext_64:
+ handleBmiIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512:
+ handlePclmulIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse41_round_sd:
+ handleUnarySdIntrinsic(I);
+ break;
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ handleBinarySdIntrinsic(I);
+ break;
+
+ case Intrinsic::is_constant:
+ // The result of llvm.is.constant() is always defined.
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ break;
+
+ default:
+ if (!handleUnknownIntrinsic(I))
+ visitInstruction(I);
+ break;
+ }
+ }
+
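// Illustrative standalone sketch, not part of the pass: why the two case
// groups above pass a different /* Variable */ flag. psll/psrl/psra-style
// intrinsics apply one shift count to every lane, while psllv/psrlv/psrav-
// style intrinsics take a per-lane count. All values below are hypothetical.
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Lane[4]    = {1, 2, 3, 4};
  uint32_t Uniform    = 3;                // psll.d-style: single count for all lanes
  uint32_t PerLane[4] = {0, 1, 2, 3};     // psllv.d-style: one count per lane
  for (int i = 0; i < 4; ++i)
    std::printf("lane %d: uniform<<%u = %u, variable<<%u = %u\n", i, Uniform,
                Lane[i] << Uniform, PerLane[i], Lane[i] << PerLane[i]);
  return 0;
}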
void visitLibAtomicLoad(CallBase &CB) {
// Since we use getNextNode here, we can't have CB terminate the BB.
assert(isa<CallInst>(CB));
@@ -3577,19 +3577,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Align(1));
}
- void visitCallBase(CallBase &CB) {
- assert(!CB.getMetadata("nosanitize"));
- if (CB.isInlineAsm()) {
- // For inline asm (either a call to asm function, or callbr instruction),
- // do the usual thing: check argument shadow and mark all outputs as
- // clean. Note that any side effects of the inline asm that are not
- // immediately visible in its constraints are not handled.
- if (ClHandleAsmConservative && MS.CompileKernel)
- visitAsmInstruction(CB);
- else
- visitInstruction(CB);
- return;
- }
+ void visitCallBase(CallBase &CB) {
+ assert(!CB.getMetadata("nosanitize"));
+ if (CB.isInlineAsm()) {
+ // For inline asm (either a call to asm function, or callbr instruction),
+ // do the usual thing: check argument shadow and mark all outputs as
+ // clean. Note that any side effects of the inline asm that are not
+ // immediately visible in its constraints are not handled.
+ if (ClHandleAsmConservative && MS.CompileKernel)
+ visitAsmInstruction(CB);
+ else
+ visitInstruction(CB);
+ return;
+ }
LibFunc LF;
if (TLI->getLibFunc(CB, LF)) {
// libatomic.a functions need to have special handling because there isn't
@@ -3612,13 +3612,13 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- if (auto *Call = dyn_cast<CallInst>(&CB)) {
- assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
-
- // We are going to insert code that relies on the fact that the callee
- // will become a non-readonly function after it is instrumented by us. To
- // prevent this code from being optimized out, mark that function
- // non-readonly in advance.
+ if (auto *Call = dyn_cast<CallInst>(&CB)) {
+ assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
+
+ // We are going to insert code that relies on the fact that the callee
+ // will become a non-readonly function after it is instrumented by us. To
+ // prevent this code from being optimized out, mark that function
+ // non-readonly in advance.
AttrBuilder B;
B.addAttribute(Attribute::ReadOnly)
.addAttribute(Attribute::ReadNone)
@@ -3627,1693 +3627,1693 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
.addAttribute(Attribute::Speculatable);
Call->removeAttributes(AttributeList::FunctionIndex, B);
- if (Function *Func = Call->getCalledFunction()) {
- Func->removeAttributes(AttributeList::FunctionIndex, B);
- }
-
- maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
- }
- IRBuilder<> IRB(&CB);
+ if (Function *Func = Call->getCalledFunction()) {
+ Func->removeAttributes(AttributeList::FunctionIndex, B);
+ }
+
+ maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
+ }
+ IRBuilder<> IRB(&CB);
bool MayCheckCall = ClEagerChecks;
if (Function *Func = CB.getCalledFunction()) {
// __sanitizer_unaligned_{load,store} functions may be called by users
// and always expects shadows in the TLS. So don't check them.
MayCheckCall &= !Func->getName().startswith("__sanitizer_unaligned_");
}
-
- unsigned ArgOffset = 0;
- LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned i = ArgIt - CB.arg_begin();
- if (!A->getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
- continue;
- }
- unsigned Size = 0;
- Value *Store = nullptr;
- // Compute the Shadow for arg even if it is ByVal, because
- // in that case getShadow() will copy the actual arg shadow to
- // __msan_param_tls.
- Value *ArgShadow = getShadow(A);
- Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
- LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
- << " Shadow: " << *ArgShadow << "\n");
- bool ArgIsInitialized = false;
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
- bool NoUndef = CB.paramHasAttr(i, Attribute::NoUndef);
+
+ unsigned ArgOffset = 0;
+ LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned i = ArgIt - CB.arg_begin();
+ if (!A->getType()->isSized()) {
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
+ continue;
+ }
+ unsigned Size = 0;
+ Value *Store = nullptr;
+ // Compute the Shadow for arg even if it is ByVal, because
+ // in that case getShadow() will copy the actual arg shadow to
+ // __msan_param_tls.
+ Value *ArgShadow = getShadow(A);
+ Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+ LLVM_DEBUG(dbgs() << " Arg#" << i << ": " << *A
+ << " Shadow: " << *ArgShadow << "\n");
+ bool ArgIsInitialized = false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
+ bool NoUndef = CB.paramHasAttr(i, Attribute::NoUndef);
bool EagerCheck = MayCheckCall && !ByVal && NoUndef;
-
- if (EagerCheck) {
- insertShadowCheck(A, &CB);
- continue;
- }
- if (ByVal) {
- // ByVal requires some special handling as it's too big for a single
- // load
- assert(A->getType()->isPointerTy() &&
- "ByVal argument is not a pointer!");
- Size = DL.getTypeAllocSize(CB.getParamByValType(i));
- if (ArgOffset + Size > kParamTLSSize) break;
- const MaybeAlign ParamAlignment(CB.getParamAlign(i));
- MaybeAlign Alignment = llvm::None;
- if (ParamAlignment)
- Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
- Value *AShadowPtr =
- getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ false)
- .first;
-
- Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
- Alignment, Size);
- // TODO(glider): need to copy origins.
- } else {
- // Any other parameters mean we need bit-grained tracking of uninit data
- Size = DL.getTypeAllocSize(A->getType());
- if (ArgOffset + Size > kParamTLSSize) break;
- Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
- kShadowTLSAlignment);
- Constant *Cst = dyn_cast<Constant>(ArgShadow);
- if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
- }
- if (MS.TrackOrigins && !ArgIsInitialized)
- IRB.CreateStore(getOrigin(A),
- getOriginPtrForArgument(A, IRB, ArgOffset));
- (void)Store;
- assert(Size != 0 && Store != nullptr);
- LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
+
+ if (EagerCheck) {
+ insertShadowCheck(A, &CB);
+ continue;
+ }
+ if (ByVal) {
+ // ByVal requires some special handling as it's too big for a single
+ // load
+ assert(A->getType()->isPointerTy() &&
+ "ByVal argument is not a pointer!");
+ Size = DL.getTypeAllocSize(CB.getParamByValType(i));
+ if (ArgOffset + Size > kParamTLSSize) break;
+ const MaybeAlign ParamAlignment(CB.getParamAlign(i));
+ MaybeAlign Alignment = llvm::None;
+ if (ParamAlignment)
+ Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
+ Value *AShadowPtr =
+ getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ false)
+ .first;
+
+ Store = IRB.CreateMemCpy(ArgShadowBase, Alignment, AShadowPtr,
+ Alignment, Size);
+ // TODO(glider): need to copy origins.
+ } else {
+ // Any other parameters mean we need bit-grained tracking of uninit data
+ Size = DL.getTypeAllocSize(A->getType());
+ if (ArgOffset + Size > kParamTLSSize) break;
+ Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
+ kShadowTLSAlignment);
+ Constant *Cst = dyn_cast<Constant>(ArgShadow);
+ if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
+ }
+ if (MS.TrackOrigins && !ArgIsInitialized)
+ IRB.CreateStore(getOrigin(A),
+ getOriginPtrForArgument(A, IRB, ArgOffset));
+ (void)Store;
+ assert(Size != 0 && Store != nullptr);
+ LLVM_DEBUG(dbgs() << " Param:" << *Store << "\n");
ArgOffset += alignTo(Size, kShadowTLSAlignment);
- }
- LLVM_DEBUG(dbgs() << " done with call args\n");
-
- FunctionType *FT = CB.getFunctionType();
- if (FT->isVarArg()) {
- VAHelper->visitCallBase(CB, IRB);
- }
-
- // Now, get the shadow for the RetVal.
- if (!CB.getType()->isSized())
- return;
- // Don't emit the epilogue for musttail call returns.
- if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
- return;
-
+ }
+ LLVM_DEBUG(dbgs() << " done with call args\n");
+
+ FunctionType *FT = CB.getFunctionType();
+ if (FT->isVarArg()) {
+ VAHelper->visitCallBase(CB, IRB);
+ }
+
+ // Now, get the shadow for the RetVal.
+ if (!CB.getType()->isSized())
+ return;
+ // Don't emit the epilogue for musttail call returns.
+ if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
+ return;
+
if (MayCheckCall && CB.hasRetAttr(Attribute::NoUndef)) {
- setShadow(&CB, getCleanShadow(&CB));
- setOrigin(&CB, getCleanOrigin());
- return;
- }
-
- IRBuilder<> IRBBefore(&CB);
- // Until we have full dynamic coverage, make sure the retval shadow is 0.
- Value *Base = getShadowPtrForRetval(&CB, IRBBefore);
- IRBBefore.CreateAlignedStore(getCleanShadow(&CB), Base,
- kShadowTLSAlignment);
- BasicBlock::iterator NextInsn;
- if (isa<CallInst>(CB)) {
- NextInsn = ++CB.getIterator();
- assert(NextInsn != CB.getParent()->end());
- } else {
- BasicBlock *NormalDest = cast<InvokeInst>(CB).getNormalDest();
- if (!NormalDest->getSinglePredecessor()) {
- // FIXME: this case is tricky, so we are just conservative here.
- // Perhaps we need to split the edge between this BB and NormalDest,
- // but a naive attempt to use SplitEdge leads to a crash.
- setShadow(&CB, getCleanShadow(&CB));
- setOrigin(&CB, getCleanOrigin());
- return;
- }
- // FIXME: NextInsn is likely in a basic block that has not been visited yet.
- // Anything inserted there will be instrumented by MSan later!
- NextInsn = NormalDest->getFirstInsertionPt();
- assert(NextInsn != NormalDest->end() &&
- "Could not find insertion point for retval shadow load");
- }
- IRBuilder<> IRBAfter(&*NextInsn);
- Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
- getShadowTy(&CB), getShadowPtrForRetval(&CB, IRBAfter),
- kShadowTLSAlignment, "_msret");
- setShadow(&CB, RetvalShadow);
- if (MS.TrackOrigins)
- setOrigin(&CB, IRBAfter.CreateLoad(MS.OriginTy,
- getOriginPtrForRetval(IRBAfter)));
- }
-
- bool isAMustTailRetVal(Value *RetVal) {
- if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
- RetVal = I->getOperand(0);
- }
- if (auto *I = dyn_cast<CallInst>(RetVal)) {
- return I->isMustTailCall();
- }
- return false;
- }
-
- void visitReturnInst(ReturnInst &I) {
- IRBuilder<> IRB(&I);
- Value *RetVal = I.getReturnValue();
- if (!RetVal) return;
- // Don't emit the epilogue for musttail call returns.
- if (isAMustTailRetVal(RetVal)) return;
- Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
- bool HasNoUndef =
- F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
- bool StoreShadow = !(ClEagerChecks && HasNoUndef);
- // FIXME: Consider using SpecialCaseList to specify a list of functions that
- // must always return fully initialized values. For now, we hardcode "main".
- bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
-
- Value *Shadow = getShadow(RetVal);
- bool StoreOrigin = true;
- if (EagerCheck) {
- insertShadowCheck(RetVal, &I);
- Shadow = getCleanShadow(RetVal);
- StoreOrigin = false;
- }
-
- // The caller may still expect information passed over TLS if we pass our
- // check
- if (StoreShadow) {
- IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
- if (MS.TrackOrigins && StoreOrigin)
- IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
- }
- }
-
- void visitPHINode(PHINode &I) {
- IRBuilder<> IRB(&I);
- if (!PropagateShadow) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- return;
- }
-
- ShadowPHINodes.push_back(&I);
- setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
- "_msphi_s"));
- if (MS.TrackOrigins)
- setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
- "_msphi_o"));
- }
-
- Value *getLocalVarDescription(AllocaInst &I) {
- SmallString<2048> StackDescriptionStorage;
- raw_svector_ostream StackDescription(StackDescriptionStorage);
- // We create a string with a description of the stack allocation and
- // pass it into __msan_set_alloca_origin.
- // It will be printed by the run-time if stack-originated UMR is found.
- // The first 4 bytes of the string are set to '----' and will be replaced
- // by __msan_va_arg_overflow_size_tls at the first call.
- StackDescription << "----" << I.getName() << "@" << F.getName();
- return createPrivateNonConstGlobalForString(*F.getParent(),
- StackDescription.str());
- }
-
- void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
- if (PoisonStack && ClPoisonStackWithCall) {
- IRB.CreateCall(MS.MsanPoisonStackFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
- } else {
- Value *ShadowBase, *OriginBase;
- std::tie(ShadowBase, OriginBase) = getShadowOriginPtr(
- &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
-
- Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
- IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
- MaybeAlign(I.getAlignment()));
- }
-
- if (PoisonStack && MS.TrackOrigins) {
- Value *Descr = getLocalVarDescription(I);
- IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
- IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(&F, MS.IntptrTy)});
- }
- }
-
- void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
- Value *Descr = getLocalVarDescription(I);
- if (PoisonStack) {
- IRB.CreateCall(MS.MsanPoisonAllocaFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
- IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())});
- } else {
- IRB.CreateCall(MS.MsanUnpoisonAllocaFn,
- {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
- }
- }
-
- void instrumentAlloca(AllocaInst &I, Instruction *InsPoint = nullptr) {
- if (!InsPoint)
- InsPoint = &I;
- IRBuilder<> IRB(InsPoint->getNextNode());
- const DataLayout &DL = F.getParent()->getDataLayout();
- uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
- Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
- if (I.isArrayAllocation())
- Len = IRB.CreateMul(Len, I.getArraySize());
-
- if (MS.CompileKernel)
- poisonAllocaKmsan(I, IRB, Len);
- else
- poisonAllocaUserspace(I, IRB, Len);
- }
-
- void visitAllocaInst(AllocaInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- // We'll get to this alloca later unless it's poisoned at the corresponding
- // llvm.lifetime.start.
- AllocaSet.insert(&I);
- }
-
- void visitSelectInst(SelectInst& I) {
- IRBuilder<> IRB(&I);
- // a = select b, c, d
- Value *B = I.getCondition();
- Value *C = I.getTrueValue();
- Value *D = I.getFalseValue();
- Value *Sb = getShadow(B);
- Value *Sc = getShadow(C);
- Value *Sd = getShadow(D);
-
- // Result shadow if condition shadow is 0.
- Value *Sa0 = IRB.CreateSelect(B, Sc, Sd);
- Value *Sa1;
- if (I.getType()->isAggregateType()) {
- // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
- // an extra "select". This results in much more compact IR.
- // Sa = select Sb, poisoned, (select b, Sc, Sd)
- Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
- } else {
- // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
- // If Sb (condition is poisoned), look for bits in c and d that are equal
- // and both unpoisoned.
- // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
-
- // Cast arguments to shadow-compatible type.
- C = CreateAppToShadowCast(IRB, C);
- D = CreateAppToShadowCast(IRB, D);
-
- // Result shadow if condition shadow is 1.
- Sa1 = IRB.CreateOr({IRB.CreateXor(C, D), Sc, Sd});
- }
- Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select");
- setShadow(&I, Sa);
- if (MS.TrackOrigins) {
- // Origins are always i32, so any vector conditions must be flattened.
- // FIXME: consider tracking vector origins for app vectors?
- if (B->getType()->isVectorTy()) {
- Type *FlatTy = getShadowTyNoVec(B->getType());
- B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy),
- ConstantInt::getNullValue(FlatTy));
- Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy),
- ConstantInt::getNullValue(FlatTy));
- }
- // a = select b, c, d
- // Oa = Sb ? Ob : (b ? Oc : Od)
- setOrigin(
- &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
- IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
- getOrigin(I.getFalseValue()))));
- }
- }
-
- void visitLandingPadInst(LandingPadInst &I) {
- // Do nothing.
- // See https://github.com/google/sanitizers/issues/504
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitCatchSwitchInst(CatchSwitchInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitFuncletPadInst(FuncletPadInst &I) {
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
- void visitGetElementPtrInst(GetElementPtrInst &I) {
- handleShadowOr(I);
- }
-
- void visitExtractValueInst(ExtractValueInst &I) {
- IRBuilder<> IRB(&I);
- Value *Agg = I.getAggregateOperand();
- LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
- Value *AggShadow = getShadow(Agg);
- LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
- LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
- setShadow(&I, ResShadow);
- setOriginForNaryOp(I);
- }
-
- void visitInsertValueInst(InsertValueInst &I) {
- IRBuilder<> IRB(&I);
- LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
- Value *AggShadow = getShadow(I.getAggregateOperand());
- Value *InsShadow = getShadow(I.getInsertedValueOperand());
- LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
- LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
- Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
- LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
- setShadow(&I, Res);
- setOriginForNaryOp(I);
- }
-
- void dumpInst(Instruction &I) {
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
- } else {
- errs() << "ZZZ " << I.getOpcodeName() << "\n";
- }
- errs() << "QQQ " << I << "\n";
- }
-
- void visitResumeInst(ResumeInst &I) {
- LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
- // Nothing to do here.
- }
-
- void visitCleanupReturnInst(CleanupReturnInst &CRI) {
- LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
- // Nothing to do here.
- }
-
- void visitCatchReturnInst(CatchReturnInst &CRI) {
- LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
- // Nothing to do here.
- }
-
- void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
- const DataLayout &DL, bool isOutput) {
- // For each assembly argument, we check its value for being initialized.
- // If the argument is a pointer, we assume it points to a single element
-    // of the corresponding type (or to an 8-byte word, if the type is unsized).
- // Each such pointer is instrumented with a call to the runtime library.
- Type *OpType = Operand->getType();
- // Check the operand value itself.
- insertShadowCheck(Operand, &I);
- if (!OpType->isPointerTy() || !isOutput) {
- assert(!isOutput);
- return;
- }
- Type *ElType = OpType->getPointerElementType();
- if (!ElType->isSized())
- return;
- int Size = DL.getTypeStoreSize(ElType);
- Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
- Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
- IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal});
- }
-
- /// Get the number of output arguments returned by pointers.
- int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
- int NumRetOutputs = 0;
- int NumOutputs = 0;
- Type *RetTy = cast<Value>(CB)->getType();
- if (!RetTy->isVoidTy()) {
- // Register outputs are returned via the CallInst return value.
- auto *ST = dyn_cast<StructType>(RetTy);
- if (ST)
- NumRetOutputs = ST->getNumElements();
- else
- NumRetOutputs = 1;
- }
- InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
- for (size_t i = 0, n = Constraints.size(); i < n; i++) {
- InlineAsm::ConstraintInfo Info = Constraints[i];
- switch (Info.Type) {
- case InlineAsm::isOutput:
- NumOutputs++;
- break;
- default:
- break;
- }
- }
- return NumOutputs - NumRetOutputs;
- }
-
- void visitAsmInstruction(Instruction &I) {
- // Conservative inline assembly handling: check for poisoned shadow of
- // asm() arguments, then unpoison the result and all the memory locations
- // pointed to by those arguments.
- // An inline asm() statement in C++ contains lists of input and output
- // arguments used by the assembly code. These are mapped to operands of the
- // CallInst as follows:
- // - nR register outputs ("=r) are returned by value in a single structure
- // (SSA value of the CallInst);
-    //  - nR register outputs ("=r") are returned by value in a single structure
- // nO operands of the CallInst;
- // - nI inputs ("r", "m" and others) are passed to CallInst as the
- // remaining nI operands.
- // The total number of asm() arguments in the source is nR+nO+nI, and the
- // corresponding CallInst has nO+nI+1 operands (the last operand is the
- // function to be called).
- const DataLayout &DL = F.getParent()->getDataLayout();
- CallBase *CB = cast<CallBase>(&I);
- IRBuilder<> IRB(&I);
- InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
- int OutputArgs = getNumOutputArgs(IA, CB);
- // The last operand of a CallInst is the function itself.
- int NumOperands = CB->getNumOperands() - 1;
-
- // Check input arguments. Doing so before unpoisoning output arguments, so
- // that we won't overwrite uninit values before checking them.
- for (int i = OutputArgs; i < NumOperands; i++) {
- Value *Operand = CB->getOperand(i);
- instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
- }
- // Unpoison output arguments. This must happen before the actual InlineAsm
- // call, so that the shadow for memory published in the asm() statement
- // remains valid.
- for (int i = 0; i < OutputArgs; i++) {
- Value *Operand = CB->getOperand(i);
- instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
- }
-
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
+ return;
+ }
+
+ IRBuilder<> IRBBefore(&CB);
+ // Until we have full dynamic coverage, make sure the retval shadow is 0.
+ Value *Base = getShadowPtrForRetval(&CB, IRBBefore);
+ IRBBefore.CreateAlignedStore(getCleanShadow(&CB), Base,
+ kShadowTLSAlignment);
+ BasicBlock::iterator NextInsn;
+ if (isa<CallInst>(CB)) {
+ NextInsn = ++CB.getIterator();
+ assert(NextInsn != CB.getParent()->end());
+ } else {
+ BasicBlock *NormalDest = cast<InvokeInst>(CB).getNormalDest();
+ if (!NormalDest->getSinglePredecessor()) {
+ // FIXME: this case is tricky, so we are just conservative here.
+ // Perhaps we need to split the edge between this BB and NormalDest,
+ // but a naive attempt to use SplitEdge leads to a crash.
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
+ return;
+ }
+ // FIXME: NextInsn is likely in a basic block that has not been visited yet.
+ // Anything inserted there will be instrumented by MSan later!
+ NextInsn = NormalDest->getFirstInsertionPt();
+ assert(NextInsn != NormalDest->end() &&
+ "Could not find insertion point for retval shadow load");
+ }
+ IRBuilder<> IRBAfter(&*NextInsn);
+ Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
+ getShadowTy(&CB), getShadowPtrForRetval(&CB, IRBAfter),
+ kShadowTLSAlignment, "_msret");
+ setShadow(&CB, RetvalShadow);
+ if (MS.TrackOrigins)
+ setOrigin(&CB, IRBAfter.CreateLoad(MS.OriginTy,
+ getOriginPtrForRetval(IRBAfter)));
+ }
+
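// Illustrative standalone sketch, not part of the pass: how ArgOffset advances
// in the argument loop above. Argument shadows are stored into __msan_param_tls
// back to back, each slot rounded up to kShadowTLSAlignment (assumed to be 8
// bytes here), and the loop breaks once kParamTLSSize would be exceeded. The
// call signature below is hypothetical.
#include <cstdint>
#include <cstdio>

static uint64_t alignTo(uint64_t V, uint64_t A) { return (V + A - 1) / A * A; }

int main() {
  const uint64_t kShadowTLSAlignment = 8;
  // DL.getTypeAllocSize() for a call f(int8_t, double, __int128):
  const uint64_t Sizes[] = {1, 8, 16};
  uint64_t ArgOffset = 0;
  for (uint64_t Size : Sizes) {
    std::printf("shadow at __msan_param_tls + %llu (%llu bytes)\n",
                (unsigned long long)ArgOffset, (unsigned long long)Size);
    ArgOffset += alignTo(Size, kShadowTLSAlignment);  // 0 -> 8 -> 16 -> 32
  }
  return 0;
}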
+ bool isAMustTailRetVal(Value *RetVal) {
+ if (auto *I = dyn_cast<BitCastInst>(RetVal)) {
+ RetVal = I->getOperand(0);
+ }
+ if (auto *I = dyn_cast<CallInst>(RetVal)) {
+ return I->isMustTailCall();
+ }
+ return false;
+ }
+
+ void visitReturnInst(ReturnInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *RetVal = I.getReturnValue();
+ if (!RetVal) return;
+ // Don't emit the epilogue for musttail call returns.
+ if (isAMustTailRetVal(RetVal)) return;
+ Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+ bool HasNoUndef =
+ F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
+ bool StoreShadow = !(ClEagerChecks && HasNoUndef);
+ // FIXME: Consider using SpecialCaseList to specify a list of functions that
+ // must always return fully initialized values. For now, we hardcode "main".
+ bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
+
+ Value *Shadow = getShadow(RetVal);
+ bool StoreOrigin = true;
+ if (EagerCheck) {
+ insertShadowCheck(RetVal, &I);
+ Shadow = getCleanShadow(RetVal);
+ StoreOrigin = false;
+ }
+
+ // The caller may still expect information passed over TLS if we pass our
+ // check
+ if (StoreShadow) {
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ if (MS.TrackOrigins && StoreOrigin)
+ IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+ }
+ }
+
+ void visitPHINode(PHINode &I) {
+ IRBuilder<> IRB(&I);
+ if (!PropagateShadow) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ return;
+ }
+
+ ShadowPHINodes.push_back(&I);
+ setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+ "_msphi_s"));
+ if (MS.TrackOrigins)
+ setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+ "_msphi_o"));
+ }
+
+ Value *getLocalVarDescription(AllocaInst &I) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ // We create a string with a description of the stack allocation and
+ // pass it into __msan_set_alloca_origin.
+ // It will be printed by the run-time if stack-originated UMR is found.
+ // The first 4 bytes of the string are set to '----' and will be replaced
+ // by __msan_va_arg_overflow_size_tls at the first call.
+ StackDescription << "----" << I.getName() << "@" << F.getName();
+ return createPrivateNonConstGlobalForString(*F.getParent(),
+ StackDescription.str());
+ }
+
+ void poisonAllocaUserspace(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
+ if (PoisonStack && ClPoisonStackWithCall) {
+ IRB.CreateCall(MS.MsanPoisonStackFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
+ } else {
+ Value *ShadowBase, *OriginBase;
+ std::tie(ShadowBase, OriginBase) = getShadowOriginPtr(
+ &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
+
+ Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
+ IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
+ MaybeAlign(I.getAlignment()));
+ }
+
+ if (PoisonStack && MS.TrackOrigins) {
+ Value *Descr = getLocalVarDescription(I);
+ IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(&F, MS.IntptrTy)});
+ }
+ }
+
+ void poisonAllocaKmsan(AllocaInst &I, IRBuilder<> &IRB, Value *Len) {
+ Value *Descr = getLocalVarDescription(I);
+ if (PoisonStack) {
+ IRB.CreateCall(MS.MsanPoisonAllocaFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
+ IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy())});
+ } else {
+ IRB.CreateCall(MS.MsanUnpoisonAllocaFn,
+ {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
+ }
+ }
+
+ void instrumentAlloca(AllocaInst &I, Instruction *InsPoint = nullptr) {
+ if (!InsPoint)
+ InsPoint = &I;
+ IRBuilder<> IRB(InsPoint->getNextNode());
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+ Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+ if (I.isArrayAllocation())
+ Len = IRB.CreateMul(Len, I.getArraySize());
+
+ if (MS.CompileKernel)
+ poisonAllocaKmsan(I, IRB, Len);
+ else
+ poisonAllocaUserspace(I, IRB, Len);
+ }
+
+ void visitAllocaInst(AllocaInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ // We'll get to this alloca later unless it's poisoned at the corresponding
+ // llvm.lifetime.start.
+ AllocaSet.insert(&I);
+ }
+
+ void visitSelectInst(SelectInst& I) {
+ IRBuilder<> IRB(&I);
+ // a = select b, c, d
+ Value *B = I.getCondition();
+ Value *C = I.getTrueValue();
+ Value *D = I.getFalseValue();
+ Value *Sb = getShadow(B);
+ Value *Sc = getShadow(C);
+ Value *Sd = getShadow(D);
+
+ // Result shadow if condition shadow is 0.
+ Value *Sa0 = IRB.CreateSelect(B, Sc, Sd);
+ Value *Sa1;
+ if (I.getType()->isAggregateType()) {
+ // To avoid "sign extending" i1 to an arbitrary aggregate type, we just do
+ // an extra "select". This results in much more compact IR.
+ // Sa = select Sb, poisoned, (select b, Sc, Sd)
+ Sa1 = getPoisonedShadow(getShadowTy(I.getType()));
+ } else {
+ // Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
+ // If Sb (condition is poisoned), look for bits in c and d that are equal
+ // and both unpoisoned.
+ // If !Sb (condition is unpoisoned), simply pick one of Sc and Sd.
+
+ // Cast arguments to shadow-compatible type.
+ C = CreateAppToShadowCast(IRB, C);
+ D = CreateAppToShadowCast(IRB, D);
+
+ // Result shadow if condition shadow is 1.
+ Sa1 = IRB.CreateOr({IRB.CreateXor(C, D), Sc, Sd});
+ }
+ Value *Sa = IRB.CreateSelect(Sb, Sa1, Sa0, "_msprop_select");
+ setShadow(&I, Sa);
+ if (MS.TrackOrigins) {
+ // Origins are always i32, so any vector conditions must be flattened.
+ // FIXME: consider tracking vector origins for app vectors?
+ if (B->getType()->isVectorTy()) {
+ Type *FlatTy = getShadowTyNoVec(B->getType());
+ B = IRB.CreateICmpNE(IRB.CreateBitCast(B, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ Sb = IRB.CreateICmpNE(IRB.CreateBitCast(Sb, FlatTy),
+ ConstantInt::getNullValue(FlatTy));
+ }
+ // a = select b, c, d
+ // Oa = Sb ? Ob : (b ? Oc : Od)
+ setOrigin(
+ &I, IRB.CreateSelect(Sb, getOrigin(I.getCondition()),
+ IRB.CreateSelect(B, getOrigin(I.getTrueValue()),
+ getOrigin(I.getFalseValue()))));
+ }
+ }
+
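// Illustrative standalone sketch, not part of the pass, of the non-aggregate
// rule above,
//   Sa = select Sb, [ (c^d) | Sc | Sd ], [ b ? Sc : Sd ]
// on plain 8-bit shadow words, where a set shadow bit means "uninitialized".
// All names and values are hypothetical.
#include <cstdint>
#include <cstdio>

uint8_t selectShadow(bool B, uint8_t Sb, uint8_t C, uint8_t Sc, uint8_t D,
                     uint8_t Sd) {
  uint8_t Sa0 = B ? Sc : Sd;                   // condition shadow is clean
  uint8_t Sa1 = (uint8_t)((C ^ D) | Sc | Sd);  // condition shadow is poisoned
  return Sb ? Sa1 : Sa0;
}

int main() {
  // Clean condition: the result simply inherits the chosen operand's shadow.
  std::printf("%02x\n",
              (unsigned)selectShadow(true, 0x00, 0xAB, 0x0F, 0xCD, 0xF0)); // 0f
  // Poisoned condition: only bits that are equal and initialized in both
  // operands stay clean.
  std::printf("%02x\n",
              (unsigned)selectShadow(true, 0xFF, 0xAB, 0x00, 0xA8, 0x00)); // 03
  return 0;
}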
+ void visitLandingPadInst(LandingPadInst &I) {
+ // Do nothing.
+ // See https://github.com/google/sanitizers/issues/504
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitCatchSwitchInst(CatchSwitchInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitFuncletPadInst(FuncletPadInst &I) {
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &I) {
+ handleShadowOr(I);
+ }
+
+ void visitExtractValueInst(ExtractValueInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *Agg = I.getAggregateOperand();
+ LLVM_DEBUG(dbgs() << "ExtractValue: " << I << "\n");
+ Value *AggShadow = getShadow(Agg);
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+ LLVM_DEBUG(dbgs() << " ResShadow: " << *ResShadow << "\n");
+ setShadow(&I, ResShadow);
+ setOriginForNaryOp(I);
+ }
+
+ void visitInsertValueInst(InsertValueInst &I) {
+ IRBuilder<> IRB(&I);
+ LLVM_DEBUG(dbgs() << "InsertValue: " << I << "\n");
+ Value *AggShadow = getShadow(I.getAggregateOperand());
+ Value *InsShadow = getShadow(I.getInsertedValueOperand());
+ LLVM_DEBUG(dbgs() << " AggShadow: " << *AggShadow << "\n");
+ LLVM_DEBUG(dbgs() << " InsShadow: " << *InsShadow << "\n");
+ Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+ LLVM_DEBUG(dbgs() << " Res: " << *Res << "\n");
+ setShadow(&I, Res);
+ setOriginForNaryOp(I);
+ }
+
+ void dumpInst(Instruction &I) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+ } else {
+ errs() << "ZZZ " << I.getOpcodeName() << "\n";
+ }
+ errs() << "QQQ " << I << "\n";
+ }
+
+ void visitResumeInst(ResumeInst &I) {
+ LLVM_DEBUG(dbgs() << "Resume: " << I << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCleanupReturnInst(CleanupReturnInst &CRI) {
+ LLVM_DEBUG(dbgs() << "CleanupReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void visitCatchReturnInst(CatchReturnInst &CRI) {
+ LLVM_DEBUG(dbgs() << "CatchReturn: " << CRI << "\n");
+ // Nothing to do here.
+ }
+
+ void instrumentAsmArgument(Value *Operand, Instruction &I, IRBuilder<> &IRB,
+ const DataLayout &DL, bool isOutput) {
+ // For each assembly argument, we check its value for being initialized.
+ // If the argument is a pointer, we assume it points to a single element
+    // of the corresponding type (or to an 8-byte word, if the type is unsized).
+ // Each such pointer is instrumented with a call to the runtime library.
+ Type *OpType = Operand->getType();
+ // Check the operand value itself.
+ insertShadowCheck(Operand, &I);
+ if (!OpType->isPointerTy() || !isOutput) {
+ assert(!isOutput);
+ return;
+ }
+ Type *ElType = OpType->getPointerElementType();
+ if (!ElType->isSized())
+ return;
+ int Size = DL.getTypeStoreSize(ElType);
+ Value *Ptr = IRB.CreatePointerCast(Operand, IRB.getInt8PtrTy());
+ Value *SizeVal = ConstantInt::get(MS.IntptrTy, Size);
+ IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Ptr, SizeVal});
+ }
+
+ /// Get the number of output arguments returned by pointers.
+ int getNumOutputArgs(InlineAsm *IA, CallBase *CB) {
+ int NumRetOutputs = 0;
+ int NumOutputs = 0;
+ Type *RetTy = cast<Value>(CB)->getType();
+ if (!RetTy->isVoidTy()) {
+ // Register outputs are returned via the CallInst return value.
+ auto *ST = dyn_cast<StructType>(RetTy);
+ if (ST)
+ NumRetOutputs = ST->getNumElements();
+ else
+ NumRetOutputs = 1;
+ }
+ InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
+ for (size_t i = 0, n = Constraints.size(); i < n; i++) {
+ InlineAsm::ConstraintInfo Info = Constraints[i];
+ switch (Info.Type) {
+ case InlineAsm::isOutput:
+ NumOutputs++;
+ break;
+ default:
+ break;
+ }
+ }
+ return NumOutputs - NumRetOutputs;
+ }
+
+ void visitAsmInstruction(Instruction &I) {
+ // Conservative inline assembly handling: check for poisoned shadow of
+ // asm() arguments, then unpoison the result and all the memory locations
+ // pointed to by those arguments.
+ // An inline asm() statement in C++ contains lists of input and output
+ // arguments used by the assembly code. These are mapped to operands of the
+ // CallInst as follows:
+ // - nR register outputs ("=r) are returned by value in a single structure
+ // (SSA value of the CallInst);
+    //  - nR register outputs ("=r") are returned by value in a single structure
+ // nO operands of the CallInst;
+ // - nI inputs ("r", "m" and others) are passed to CallInst as the
+ // remaining nI operands.
+ // The total number of asm() arguments in the source is nR+nO+nI, and the
+ // corresponding CallInst has nO+nI+1 operands (the last operand is the
+ // function to be called).
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ CallBase *CB = cast<CallBase>(&I);
+ IRBuilder<> IRB(&I);
+ InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
+ int OutputArgs = getNumOutputArgs(IA, CB);
+ // The last operand of a CallInst is the function itself.
+ int NumOperands = CB->getNumOperands() - 1;
+
+ // Check input arguments. Doing so before unpoisoning output arguments, so
+ // that we won't overwrite uninit values before checking them.
+ for (int i = OutputArgs; i < NumOperands; i++) {
+ Value *Operand = CB->getOperand(i);
+ instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ false);
+ }
+ // Unpoison output arguments. This must happen before the actual InlineAsm
+ // call, so that the shadow for memory published in the asm() statement
+ // remains valid.
+ for (int i = 0; i < OutputArgs; i++) {
+ Value *Operand = CB->getOperand(i);
+ instrumentAsmArgument(Operand, I, IRB, DL, /*isOutput*/ true);
+ }
+
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+
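// Illustrative standalone sketch, not part of the pass, of the operand
// bookkeeping described in the comment above: for an asm() with nR register
// outputs, nO other outputs and nI inputs, the CallInst carries nO + nI + 1
// operands and getNumOutputArgs() yields nO. The counts below are hypothetical.
#include <cassert>

int main() {
  int nR = 1, nO = 1, nI = 2;        // e.g. "=r", "=m", and two inputs
  int NumRetOutputs = nR;            // register outputs come back by value
  int NumOutputs = nR + nO;          // every constraint of kind isOutput
  int NumCallOperands = nO + nI + 1; // the last operand is the asm callee
  assert(NumOutputs - NumRetOutputs == nO);
  assert(NumCallOperands == 4);
  return 0;
}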
void visitFreezeInst(FreezeInst &I) {
// Freeze always returns a fully defined value.
setShadow(&I, getCleanShadow(&I));
setOrigin(&I, getCleanOrigin());
}
- void visitInstruction(Instruction &I) {
- // Everything else: stop propagating and check for poisoned shadow.
- if (ClDumpStrictInstructions)
- dumpInst(I);
- LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
- for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
- Value *Operand = I.getOperand(i);
- if (Operand->getType()->isSized())
- insertShadowCheck(Operand, &I);
- }
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
- }
-};
-
-/// AMD64-specific implementation of VarArgHelper.
-struct VarArgAMD64Helper : public VarArgHelper {
- // An unfortunate workaround for asymmetric lowering of va_arg stuff.
- // See a comment in visitCallBase for more details.
- static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
- static const unsigned AMD64FpEndOffsetSSE = 176;
- // If SSE is disabled, fp_offset in va_list is zero.
- static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset;
-
- unsigned AMD64FpEndOffset;
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgTLSOriginCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
-
- VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV)
- : F(F), MS(MS), MSV(MSV) {
- AMD64FpEndOffset = AMD64FpEndOffsetSSE;
- for (const auto &Attr : F.getAttributes().getFnAttributes()) {
- if (Attr.isStringAttribute() &&
- (Attr.getKindAsString() == "target-features")) {
- if (Attr.getValueAsString().contains("-sse"))
- AMD64FpEndOffset = AMD64FpEndOffsetNoSSE;
- break;
- }
- }
- }
-
- ArgKind classifyArgument(Value* arg) {
- // A very rough approximation of X86_64 argument classification rules.
- Type *T = arg->getType();
- if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
- return AK_FloatingPoint;
- if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
- return AK_GeneralPurpose;
- if (T->isPointerTy())
- return AK_GeneralPurpose;
- return AK_Memory;
- }
-
- // For VarArg functions, store the argument shadow in an ABI-specific format
- // that corresponds to va_list layout.
- // We do this because Clang lowers va_arg in the frontend, and this pass
- // only sees the low level code that deals with va_list internals.
- // A much easier alternative (provided that Clang emits va_arg instructions)
- // would have been to associate each live instance of va_list with a copy of
- // MSanParamTLS, and extract shadow on va_arg() call in the argument list
- // order.
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned GpOffset = 0;
- unsigned FpOffset = AMD64GpEndOffset;
- unsigned OverflowOffset = AMD64FpEndOffset;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
- if (IsByVal) {
- // ByVal arguments always go to the overflow area.
- // Fixed arguments passed through the overflow area will be stepped
- // over by va_start, so don't count them towards the offset.
- if (IsFixed)
- continue;
- assert(A->getType()->isPointerTy());
- Type *RealTy = CB.getParamByValType(ArgNo);
- uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- Value *ShadowBase = getShadowPtrForVAArgument(
- RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8));
- Value *OriginBase = nullptr;
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(RealTy, IRB, OverflowOffset);
- OverflowOffset += alignTo(ArgSize, 8);
- if (!ShadowBase)
- continue;
- Value *ShadowPtr, *OriginPtr;
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
- /*isStore*/ false);
-
- IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
- kShadowTLSAlignment, ArgSize);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(OriginBase, kShadowTLSAlignment, OriginPtr,
- kShadowTLSAlignment, ArgSize);
- } else {
- ArgKind AK = classifyArgument(A);
- if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
- AK = AK_Memory;
- if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
- AK = AK_Memory;
- Value *ShadowBase, *OriginBase = nullptr;
- switch (AK) {
- case AK_GeneralPurpose:
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, GpOffset, 8);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, GpOffset);
- GpOffset += 8;
- break;
- case AK_FloatingPoint:
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, FpOffset, 16);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, FpOffset);
- FpOffset += 16;
- break;
- case AK_Memory:
- if (IsFixed)
- continue;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- ShadowBase =
- getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset, 8);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(A->getType(), IRB, OverflowOffset);
- OverflowOffset += alignTo(ArgSize, 8);
- }
- // Take fixed arguments into account for GpOffset and FpOffset,
- // but don't actually store shadows for them.
- // TODO(glider): don't call get*PtrForVAArgument() for them.
- if (IsFixed)
- continue;
- if (!ShadowBase)
- continue;
- Value *Shadow = MSV.getShadow(A);
- IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment);
- if (MS.TrackOrigins) {
- Value *Origin = MSV.getOrigin(A);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
- MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
- std::max(kShadowTLSAlignment, kMinOriginAlignment));
- }
- }
- }
- Constant *OverflowSize =
- ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg_va_s");
- }
-
- /// Compute the origin address for a given va_arg.
- Value *getOriginPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
- // getOriginPtrForVAArgument() is always called after
- // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never
- // overflow.
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_va_o");
- }
-
- void unpoisonVAListTagForInst(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
-
- // Unpoison the whole __va_list_tag.
- // FIXME: magic ABI constants.
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 24, Alignment, false);
- // We shouldn't need to zero out the origins, as they're only checked for
- // nonzero shadow.
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- if (F.getCallingConv() == CallingConv::Win64)
- return;
- VAStartInstrumentationList.push_back(&I);
- unpoisonVAListTagForInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- if (F.getCallingConv() == CallingConv::Win64) return;
- unpoisonVAListTagForInst(I);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ void visitInstruction(Instruction &I) {
+ // Everything else: stop propagating and check for poisoned shadow.
+ if (ClDumpStrictInstructions)
+ dumpInst(I);
+ LLVM_DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+ for (size_t i = 0, n = I.getNumOperands(); i < n; i++) {
+ Value *Operand = I.getOperand(i);
+ if (Operand->getType()->isSized())
+ insertShadowCheck(Operand, &I);
+ }
+ setShadow(&I, getCleanShadow(&I));
+ setOrigin(&I, getCleanOrigin());
+ }
+};
+
+/// AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+ // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+ // See a comment in visitCallBase for more details.
+ static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+ static const unsigned AMD64FpEndOffsetSSE = 176;
+ // If SSE is disabled, fp_offset in va_list is zero.
+ static const unsigned AMD64FpEndOffsetNoSSE = AMD64GpEndOffset;
+
+ unsigned AMD64FpEndOffset;
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgTLSOriginCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV) {
+ AMD64FpEndOffset = AMD64FpEndOffsetSSE;
+ for (const auto &Attr : F.getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute() &&
+ (Attr.getKindAsString() == "target-features")) {
+ if (Attr.getValueAsString().contains("-sse"))
+ AMD64FpEndOffset = AMD64FpEndOffsetNoSSE;
+ break;
+ }
+ }
+ }
+
+ ArgKind classifyArgument(Value* arg) {
+ // A very rough approximation of X86_64 argument classification rules.
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+ return AK_FloatingPoint;
+ if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ return AK_GeneralPurpose;
+ if (T->isPointerTy())
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
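// Illustrative standalone sketch, not part of the pass: the rough
// classification above applied to a few example C types. The helper mirrors
// the three checks with plain flags instead of llvm::Type queries; everything
// here is hypothetical.
#include <cstdio>

enum Kind { GeneralPurpose, FloatingPoint, Memory };

Kind classify(bool FPOrMMX, bool Integer, unsigned Bits, bool Pointer) {
  if (FPOrMMX) return FloatingPoint;
  if (Integer && Bits <= 64) return GeneralPurpose;
  if (Pointer) return GeneralPurpose;
  return Memory;
}

int main() {
  std::printf("double   -> %d\n", classify(true,  false, 64,  false)); // FloatingPoint
  std::printf("int      -> %d\n", classify(false, true,  32,  false)); // GeneralPurpose
  std::printf("void*    -> %d\n", classify(false, false, 64,  true));  // GeneralPurpose
  std::printf("__int128 -> %d\n", classify(false, true,  128, false)); // Memory
  return 0;
}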
+ // For VarArg functions, store the argument shadow in an ABI-specific format
+ // that corresponds to va_list layout.
+ // We do this because Clang lowers va_arg in the frontend, and this pass
+ // only sees the low level code that deals with va_list internals.
+ // A much easier alternative (provided that Clang emits va_arg instructions)
+ // would have been to associate each live instance of va_list with a copy of
+ // MSanParamTLS, and extract shadow on va_arg() call in the argument list
+ // order.
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned GpOffset = 0;
+ unsigned FpOffset = AMD64GpEndOffset;
+ unsigned OverflowOffset = AMD64FpEndOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ // ByVal arguments always go to the overflow area.
+ // Fixed arguments passed through the overflow area will be stepped
+ // over by va_start, so don't count them towards the offset.
+ if (IsFixed)
+ continue;
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = CB.getParamByValType(ArgNo);
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ Value *ShadowBase = getShadowPtrForVAArgument(
+ RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8));
+ Value *OriginBase = nullptr;
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(RealTy, IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ if (!ShadowBase)
+ continue;
+ Value *ShadowPtr, *OriginPtr;
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(), kShadowTLSAlignment,
+ /*isStore*/ false);
+
+ IRB.CreateMemCpy(ShadowBase, kShadowTLSAlignment, ShadowPtr,
+ kShadowTLSAlignment, ArgSize);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(OriginBase, kShadowTLSAlignment, OriginPtr,
+ kShadowTLSAlignment, ArgSize);
+ } else {
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+ AK = AK_Memory;
+ Value *ShadowBase, *OriginBase = nullptr;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, GpOffset, 8);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, GpOffset);
+ GpOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, FpOffset, 16);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, FpOffset);
+ FpOffset += 16;
+ break;
+ case AK_Memory:
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ ShadowBase =
+ getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset, 8);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(A->getType(), IRB, OverflowOffset);
+ OverflowOffset += alignTo(ArgSize, 8);
+ }
+ // Take fixed arguments into account for GpOffset and FpOffset,
+ // but don't actually store shadows for them.
+ // TODO(glider): don't call get*PtrForVAArgument() for them.
+ if (IsFixed)
+ continue;
+ if (!ShadowBase)
+ continue;
+ Value *Shadow = MSV.getShadow(A);
+ IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment);
+ if (MS.TrackOrigins) {
+ Value *Origin = MSV.getOrigin(A);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
+ std::max(kShadowTLSAlignment, kMinOriginAlignment));
+ }
+ }
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg_va_s");
+ }
+
+ /// Compute the origin address for a given va_arg.
+ Value *getOriginPtrForVAArgument(Type *Ty, IRBuilder<> &IRB, int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
+ // getOriginPtrForVAArgument() is always called after
+ // getShadowPtrForVAArgument(), so __msan_va_arg_origin_tls can never
+ // overflow.
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_va_o");
+ }
+
+ void unpoisonVAListTagForInst(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
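+    // For reference, the SysV AMD64 va_list tag is assumed to be 24 bytes:
+    //   unsigned gp_offset;        // offset 0
+    //   unsigned fp_offset;        // offset 4
+    //   void *overflow_arg_area;   // offset 8
+    //   void *reg_save_area;       // offset 16
+    // which is also where the offsets 8 and 16 used in
+    // finalizeInstrumentation() below come from.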
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 24, Alignment, false);
+ // We shouldn't need to zero out the origins, as they're only checked for
+ // nonzero shadow.
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ if (F.getCallingConv() == CallingConv::Win64)
+ return;
+ VAStartInstrumentationList.push_back(&I);
+ unpoisonVAListTagForInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ if (F.getCallingConv() == CallingConv::Win64) return;
+ unpoisonVAListTagForInst(I);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- if (MS.TrackOrigins) {
- VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
- Align(8), CopySize);
- }
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
-
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, 16)),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(16);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- AMD64FpEndOffset);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
- Alignment, AMD64FpEndOffset);
- Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, 8)),
- PointerType::get(OverflowArgAreaPtrTy, 0));
- Value *OverflowArgAreaPtr =
- IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
- Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
- std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
- MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
- AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- if (MS.TrackOrigins) {
- SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
- AMD64FpEndOffset);
- IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- }
- }
- }
-};
-
-/// MIPS64-specific implementation of VarArgHelper.
-struct VarArgMIPS64Helper : public VarArgHelper {
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned VAArgOffset = 0;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin() + CB.getFunctionType()->getNumParams(),
- End = CB.arg_end();
- ArgIt != End; ++ArgIt) {
- Triple TargetTriple(F.getParent()->getTargetTriple());
- Value *A = *ArgIt;
- Value *Base;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- if (TargetTriple.getArch() == Triple::mips64) {
-        // Adjust the shadow for arguments with size < 8 to match the
-        // placement of bits in a big-endian system.
- if (ArgSize < 8)
- VAArgOffset += (8 - ArgSize);
- }
- Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, ArgSize);
- VAArgOffset += ArgSize;
- VAArgOffset = alignTo(VAArgOffset, 8);
- if (!Base)
- continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
-
- Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
-    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
-    // new class member, i.e. it holds the total size of all varargs.
- IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ if (MS.TrackOrigins) {
+ VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
+ Align(8), CopySize);
+ }
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 16)),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(16);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ AMD64FpEndOffset);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+ Alignment, AMD64FpEndOffset);
+ Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, 8)),
+ PointerType::get(OverflowArgAreaPtrTy, 0));
+ Value *OverflowArgAreaPtr =
+ IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+ std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+ MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ if (MS.TrackOrigins) {
+ SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+ AMD64FpEndOffset);
+ IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ }
+ }
+ }
+};
+
+/// MIPS64-specific implementation of VarArgHelper.
+struct VarArgMIPS64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned VAArgOffset = 0;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin() + CB.getFunctionType()->getNumParams(),
+ End = CB.arg_end();
+ ArgIt != End; ++ArgIt) {
+ Triple TargetTriple(F.getParent()->getTargetTriple());
+ Value *A = *ArgIt;
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ if (TargetTriple.getArch() == Triple::mips64) {
+        // Adjust the shadow for arguments with size < 8 to match the
+        // placement of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VAArgOffset, ArgSize);
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ if (!Base)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
+    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
+    // new class member, i.e. it holds the total size of all varargs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
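+  // The MIPS64 va_list tag is assumed to be a single pointer into the
+  // argument save area, which is why visitVAStartInst() unpoisons only
+  // 8 bytes and finalizeInstrumentation() reads the save-area pointer at
+  // offset 0 of the tag.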
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
- VAArgSize);
-
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- CopySize);
- }
- }
-};
-
-/// AArch64-specific implementation of VarArgHelper.
-struct VarArgAArch64Helper : public VarArgHelper {
- static const unsigned kAArch64GrArgSize = 64;
- static const unsigned kAArch64VrArgSize = 128;
-
- static const unsigned AArch64GrBegOffset = 0;
- static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
- // Make VR space aligned to 16 bytes.
- static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
- static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
- + kAArch64VrArgSize;
- static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
-
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
-
- VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- ArgKind classifyArgument(Value* arg) {
- Type *T = arg->getType();
- if (T->isFPOrFPVectorTy())
- return AK_FloatingPoint;
- if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
- || (T->isPointerTy()))
- return AK_GeneralPurpose;
- return AK_Memory;
- }
-
-  // The instrumentation stores the argument shadow in a non-ABI-specific
-  // format because it does not know which arguments are named (since, as in
-  // the x86_64 case, Clang lowers va_arg in the frontend and this pass only
-  // sees the low-level code that deals with va_list internals).
-  // The general-purpose registers are saved in the first kAArch64GrArgSize
-  // bytes of the va_arg TLS array, followed by the FP/SIMD registers, and
-  // then the remaining arguments.
- // Using constant offset within the va_arg TLS array allows fast copy
- // in the finalize instrumentation.
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- unsigned GrOffset = AArch64GrBegOffset;
- unsigned VrOffset = AArch64VrBegOffset;
- unsigned OverflowOffset = AArch64VAEndOffset;
-
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- ArgKind AK = classifyArgument(A);
- if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
- AK = AK_Memory;
- if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
- AK = AK_Memory;
- Value *Base;
- switch (AK) {
- case AK_GeneralPurpose:
- Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset, 8);
- GrOffset += 8;
- break;
- case AK_FloatingPoint:
- Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset, 8);
- VrOffset += 16;
- break;
- case AK_Memory:
- // Don't count fixed arguments in the overflow area - va_start will
- // skip right over them.
- if (IsFixed)
- continue;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset,
- alignTo(ArgSize, 8));
- OverflowOffset += alignTo(ArgSize, 8);
- break;
- }
- // Count Gp/Vr fixed arguments to their respective offsets, but don't
- // bother to actually store a shadow.
- if (IsFixed)
- continue;
- if (!Base)
- continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
- Constant *OverflowSize =
- ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 32, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 32, Alignment, false);
- }
-
- // Retrieve a va_list field of 'void*' size.
- Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
- Value *SaveAreaPtrPtr =
- IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, offset)),
- Type::getInt64PtrTy(*MS.C));
- return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr);
- }
-
- // Retrieve a va_list field of 'int' size.
- Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
- Value *SaveAreaPtr =
- IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, offset)),
- Type::getInt32PtrTy(*MS.C));
- Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr);
- return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
+ }
+ }
+};
+
+/// AArch64-specific implementation of VarArgHelper.
+struct VarArgAArch64Helper : public VarArgHelper {
+ static const unsigned kAArch64GrArgSize = 64;
+ static const unsigned kAArch64VrArgSize = 128;
+
+ static const unsigned AArch64GrBegOffset = 0;
+ static const unsigned AArch64GrEndOffset = kAArch64GrArgSize;
+ // Make VR space aligned to 16 bytes.
+ static const unsigned AArch64VrBegOffset = AArch64GrEndOffset;
+ static const unsigned AArch64VrEndOffset = AArch64VrBegOffset
+ + kAArch64VrArgSize;
+ static const unsigned AArch64VAEndOffset = AArch64VrEndOffset;
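+  // These sizes are assumed to follow AAPCS64: eight 64-bit general-purpose
+  // argument registers (x0-x7) give 8 * 8 = 64 bytes, and eight 128-bit
+  // FP/SIMD argument registers (v0-v7) give 8 * 16 = 128 bytes.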
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+ VarArgAArch64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ ArgKind classifyArgument(Value* arg) {
+ Type *T = arg->getType();
+ if (T->isFPOrFPVectorTy())
+ return AK_FloatingPoint;
+ if ((T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+ || (T->isPointerTy()))
+ return AK_GeneralPurpose;
+ return AK_Memory;
+ }
+
+  // The instrumentation stores the argument shadow in a non-ABI-specific
+  // format because it does not know which arguments are named (since, as in
+  // the x86_64 case, Clang lowers va_arg in the frontend and this pass only
+  // sees the low-level code that deals with va_list internals).
+  // The general-purpose registers are saved in the first kAArch64GrArgSize
+  // bytes of the va_arg TLS array, followed by the FP/SIMD registers, and
+  // then the remaining arguments.
+ // Using constant offset within the va_arg TLS array allows fast copy
+ // in the finalize instrumentation.
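+  // The resulting __msan_va_arg_tls layout is roughly: GR shadow in
+  // [AArch64GrBegOffset, AArch64GrEndOffset), VR shadow in
+  // [AArch64VrBegOffset, AArch64VrEndOffset), and the overflow-area shadow
+  // starting at AArch64VAEndOffset.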
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ unsigned GrOffset = AArch64GrBegOffset;
+ unsigned VrOffset = AArch64VrBegOffset;
+ unsigned OverflowOffset = AArch64VAEndOffset;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ ArgKind AK = classifyArgument(A);
+ if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
+ AK = AK_Memory;
+ if (AK == AK_FloatingPoint && VrOffset >= AArch64VrEndOffset)
+ AK = AK_Memory;
+ Value *Base;
+ switch (AK) {
+ case AK_GeneralPurpose:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, GrOffset, 8);
+ GrOffset += 8;
+ break;
+ case AK_FloatingPoint:
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, VrOffset, 8);
+ VrOffset += 16;
+ break;
+ case AK_Memory:
+ // Don't count fixed arguments in the overflow area - va_start will
+ // skip right over them.
+ if (IsFixed)
+ continue;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ Base = getShadowPtrForVAArgument(A->getType(), IRB, OverflowOffset,
+ alignTo(ArgSize, 8));
+ OverflowOffset += alignTo(ArgSize, 8);
+ break;
+ }
+ // Count Gp/Vr fixed arguments to their respective offsets, but don't
+ // bother to actually store a shadow.
+ if (IsFixed)
+ continue;
+ if (!Base)
+ continue;
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ Constant *OverflowSize =
+ ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 32, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 32, Alignment, false);
+ }
+
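+  // The accessors below assume the AAPCS64 va_list layout:
+  //   void *__stack;    // offset 0
+  //   void *__gr_top;   // offset 8
+  //   void *__vr_top;   // offset 16
+  //   int   __gr_offs;  // offset 24
+  //   int   __vr_offs;  // offset 28
+  // i.e. 32 bytes in total, matching the memset size in visitVAStartInst().
+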
+ // Retrieve a va_list field of 'void*' size.
+ Value* getVAField64(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtrPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt64PtrTy(*MS.C));
+ return IRB.CreateLoad(Type::getInt64Ty(*MS.C), SaveAreaPtrPtr);
+ }
+
+ // Retrieve a va_list field of 'int' size.
+ Value* getVAField32(IRBuilder<> &IRB, Value *VAListTag, int offset) {
+ Value *SaveAreaPtr =
+ IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, offset)),
+ Type::getInt32PtrTy(*MS.C));
+ Value *SaveArea32 = IRB.CreateLoad(IRB.getInt32Ty(), SaveAreaPtr);
+ return IRB.CreateSExt(SaveArea32, MS.IntptrTy);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
- Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
-
- // Instrument va_start, copy va_list shadow from the backup copy of
- // the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
-
- Value *VAListTag = OrigInst->getArgOperand(0);
-
-      // The variadic ABI for AArch64 creates two areas for saving the
-      // incoming argument registers (one for the 64-bit general-purpose
-      // registers x0-x7 and another for the 128-bit FP/SIMD registers v0-v7).
-      // We then need to propagate the argument shadow to both regions,
-      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
-      // The remaining arguments get their shadow from 'va::stack'.
-      // One caveat is that only the unnamed arguments need to be propagated,
-      // whereas the call site instrumentation saved shadow for 'all' the
-      // arguments. So to copy the shadow values from the va_arg TLS array we
-      // need to adjust the offsets for both the GR and VR regions based on
-      // the __{gr,vr}_offs values (which are derived from the number of
-      // incoming named arguments).
-
- // Read the stack pointer from the va_list.
- Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
-
- // Read both the __gr_top and __gr_off and add them up.
- Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
- Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
-
- Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
-
- // Read both the __vr_top and __vr_off and add them up.
- Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
- Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
-
- Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
-
-      // We do not know how many named arguments were used, and at the call
-      // site shadow was saved for all the arguments. Since __gr_offs is
-      // defined as '0 - ((8 - named_gr) * 8)', the idea is to propagate only
-      // the variadic arguments by skipping the bytes of shadow that belong to
-      // the named ones.
- Value *GrRegSaveAreaShadowPtrOff =
- IRB.CreateAdd(GrArgSize, GrOffSaveArea);
-
- Value *GrRegSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(8), /*isStore*/ true)
- .first;
-
- Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- GrRegSaveAreaShadowPtrOff);
- Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
-
- IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8),
- GrCopySize);
-
- // Again, but for FP/SIMD values.
- Value *VrRegSaveAreaShadowPtrOff =
- IRB.CreateAdd(VrArgSize, VrOffSaveArea);
-
- Value *VrRegSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(8), /*isStore*/ true)
- .first;
-
- Value *VrSrcPtr = IRB.CreateInBoundsGEP(
- IRB.getInt8Ty(),
- IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- IRB.getInt32(AArch64VrBegOffset)),
- VrRegSaveAreaShadowPtrOff);
- Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
-
- IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8),
- VrCopySize);
-
- // And finally for remaining arguments.
- Value *StackSaveAreaShadowPtr =
- MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Align(16), /*isStore*/ true)
- .first;
-
- Value *StackSrcPtr =
- IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
- IRB.getInt32(AArch64VAEndOffset));
-
- IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr,
- Align(16), VAArgOverflowSize);
- }
- }
-};
-
-/// PowerPC64-specific implementation of VarArgHelper.
-struct VarArgPowerPC64Helper : public VarArgHelper {
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgSize = nullptr;
-
- SmallVector<CallInst*, 16> VAStartInstrumentationList;
-
- VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
-    // For PowerPC, we need to deal with the alignment of stack arguments:
-    // they are mostly aligned to 8 bytes, but vectors and i128 arrays are
-    // aligned to 16 bytes, and byvals can be aligned to 8 or 16 bytes.
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AArch64VAEndOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ Value *GrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64GrArgSize);
+ Value *VrArgSize = ConstantInt::get(MS.IntptrTy, kAArch64VrArgSize);
+
+ // Instrument va_start, copy va_list shadow from the backup copy of
+ // the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+
+ Value *VAListTag = OrigInst->getArgOperand(0);
+
+      // The variadic ABI for AArch64 creates two areas for saving the
+      // incoming argument registers (one for the 64-bit general-purpose
+      // registers x0-x7 and another for the 128-bit FP/SIMD registers v0-v7).
+      // We then need to propagate the argument shadow to both regions,
+      // 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
+      // The remaining arguments get their shadow from 'va::stack'.
+      // One caveat is that only the unnamed arguments need to be propagated,
+      // whereas the call site instrumentation saved shadow for 'all' the
+      // arguments. So to copy the shadow values from the va_arg TLS array we
+      // need to adjust the offsets for both the GR and VR regions based on
+      // the __{gr,vr}_offs values (which are derived from the number of
+      // incoming named arguments).
+
+ // Read the stack pointer from the va_list.
+ Value *StackSaveAreaPtr = getVAField64(IRB, VAListTag, 0);
+
+ // Read both the __gr_top and __gr_off and add them up.
+ Value *GrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 8);
+ Value *GrOffSaveArea = getVAField32(IRB, VAListTag, 24);
+
+ Value *GrRegSaveAreaPtr = IRB.CreateAdd(GrTopSaveAreaPtr, GrOffSaveArea);
+
+ // Read both the __vr_top and __vr_off and add them up.
+ Value *VrTopSaveAreaPtr = getVAField64(IRB, VAListTag, 16);
+ Value *VrOffSaveArea = getVAField32(IRB, VAListTag, 28);
+
+ Value *VrRegSaveAreaPtr = IRB.CreateAdd(VrTopSaveAreaPtr, VrOffSaveArea);
+
+      // We do not know how many named arguments were used, and at the call
+      // site shadow was saved for all the arguments. Since __gr_offs is
+      // defined as '0 - ((8 - named_gr) * 8)', the idea is to propagate only
+      // the variadic arguments by skipping the bytes of shadow that belong to
+      // the named ones.
+ Value *GrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(GrArgSize, GrOffSaveArea);
+
+ Value *GrRegSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(GrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(8), /*isStore*/ true)
+ .first;
+
+ Value *GrSrcPtr = IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ GrRegSaveAreaShadowPtrOff);
+ Value *GrCopySize = IRB.CreateSub(GrArgSize, GrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(GrRegSaveAreaShadowPtr, Align(8), GrSrcPtr, Align(8),
+ GrCopySize);
+
+ // Again, but for FP/SIMD values.
+ Value *VrRegSaveAreaShadowPtrOff =
+ IRB.CreateAdd(VrArgSize, VrOffSaveArea);
+
+ Value *VrRegSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(VrRegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(8), /*isStore*/ true)
+ .first;
+
+ Value *VrSrcPtr = IRB.CreateInBoundsGEP(
+ IRB.getInt8Ty(),
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VrBegOffset)),
+ VrRegSaveAreaShadowPtrOff);
+ Value *VrCopySize = IRB.CreateSub(VrArgSize, VrRegSaveAreaShadowPtrOff);
+
+ IRB.CreateMemCpy(VrRegSaveAreaShadowPtr, Align(8), VrSrcPtr, Align(8),
+ VrCopySize);
+
+ // And finally for remaining arguments.
+ Value *StackSaveAreaShadowPtr =
+ MSV.getShadowOriginPtr(StackSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Align(16), /*isStore*/ true)
+ .first;
+
+ Value *StackSrcPtr =
+ IRB.CreateInBoundsGEP(IRB.getInt8Ty(), VAArgTLSCopy,
+ IRB.getInt32(AArch64VAEndOffset));
+
+ IRB.CreateMemCpy(StackSaveAreaShadowPtr, Align(16), StackSrcPtr,
+ Align(16), VAArgOverflowSize);
+ }
+ }
+};
+
+/// PowerPC64-specific implementation of VarArgHelper.
+struct VarArgPowerPC64Helper : public VarArgHelper {
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgSize = nullptr;
+
+ SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+ VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+    // For PowerPC, we need to deal with the alignment of stack arguments:
+    // they are mostly aligned to 8 bytes, but vectors and i128 arrays are
+    // aligned to 16 bytes, and byvals can be aligned to 8 or 16 bytes.
// For that reason, we compute current offset from stack pointer (which is
// always properly aligned), and offset for the first vararg, then subtract
// them.
- unsigned VAArgBase;
- Triple TargetTriple(F.getParent()->getTargetTriple());
- // Parameter save area starts at 48 bytes from frame pointer for ABIv1,
- // and 32 bytes for ABIv2. This is usually determined by target
- // endianness, but in theory could be overridden by function attribute.
- if (TargetTriple.getArch() == Triple::ppc64)
- VAArgBase = 48;
- else
- VAArgBase = 32;
- unsigned VAArgOffset = VAArgBase;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
- if (IsByVal) {
- assert(A->getType()->isPointerTy());
- Type *RealTy = CB.getParamByValType(ArgNo);
- uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- MaybeAlign ArgAlign = CB.getParamAlign(ArgNo);
- if (!ArgAlign || *ArgAlign < Align(8))
- ArgAlign = Align(8);
- VAArgOffset = alignTo(VAArgOffset, ArgAlign);
- if (!IsFixed) {
- Value *Base = getShadowPtrForVAArgument(
- RealTy, IRB, VAArgOffset - VAArgBase, ArgSize);
- if (Base) {
- Value *AShadowPtr, *AOriginPtr;
- std::tie(AShadowPtr, AOriginPtr) =
- MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
- kShadowTLSAlignment, /*isStore*/ false);
-
- IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
- kShadowTLSAlignment, ArgSize);
- }
- }
- VAArgOffset += alignTo(ArgSize, 8);
- } else {
- Value *Base;
- uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
- uint64_t ArgAlign = 8;
- if (A->getType()->isArrayTy()) {
- // Arrays are aligned to element size, except for long double
- // arrays, which are aligned to 8 bytes.
- Type *ElementTy = A->getType()->getArrayElementType();
- if (!ElementTy->isPPC_FP128Ty())
- ArgAlign = DL.getTypeAllocSize(ElementTy);
- } else if (A->getType()->isVectorTy()) {
- // Vectors are naturally aligned.
- ArgAlign = DL.getTypeAllocSize(A->getType());
- }
- if (ArgAlign < 8)
- ArgAlign = 8;
- VAArgOffset = alignTo(VAArgOffset, ArgAlign);
- if (DL.isBigEndian()) {
-          // Adjust the shadow for arguments with size < 8 to match the
-          // placement of bits in a big-endian system.
- if (ArgSize < 8)
- VAArgOffset += (8 - ArgSize);
- }
- if (!IsFixed) {
- Base = getShadowPtrForVAArgument(A->getType(), IRB,
- VAArgOffset - VAArgBase, ArgSize);
- if (Base)
- IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
- }
- VAArgOffset += ArgSize;
- VAArgOffset = alignTo(VAArgOffset, 8);
- }
- if (IsFixed)
- VAArgBase = VAArgOffset;
- }
-
- Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
- VAArgOffset - VAArgBase);
-    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
-    // new class member, i.e. it holds the total size of all varargs.
- IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
- }
-
- /// Compute the shadow address for a given va_arg.
- Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
- unsigned ArgOffset, unsigned ArgSize) {
- // Make sure we don't overflow __msan_va_arg_tls.
- if (ArgOffset + ArgSize > kParamTLSSize)
- return nullptr;
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
- "_msarg");
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- IRBuilder<> IRB(&I);
- VAStartInstrumentationList.push_back(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void visitVACopyInst(VACopyInst &I) override {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
- VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
- // Unpoison the whole __va_list_tag.
- // FIXME: magic ABI constants.
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- /* size */ 8, Alignment, false);
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
+ unsigned VAArgBase;
+ Triple TargetTriple(F.getParent()->getTargetTriple());
+ // Parameter save area starts at 48 bytes from frame pointer for ABIv1,
+ // and 32 bytes for ABIv2. This is usually determined by target
+ // endianness, but in theory could be overridden by function attribute.
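+    // (In practice, big-endian ppc64 uses ELFv1, where the parameter save
+    // area starts at offset 48, and little-endian ppc64le uses ELFv2, where
+    // it starts at offset 32; the Triple check below relies on that.)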
+ if (TargetTriple.getArch() == Triple::ppc64)
+ VAArgBase = 48;
+ else
+ VAArgBase = 32;
+ unsigned VAArgOffset = VAArgBase;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
+ if (IsByVal) {
+ assert(A->getType()->isPointerTy());
+ Type *RealTy = CB.getParamByValType(ArgNo);
+ uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
+ MaybeAlign ArgAlign = CB.getParamAlign(ArgNo);
+ if (!ArgAlign || *ArgAlign < Align(8))
+ ArgAlign = Align(8);
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (!IsFixed) {
+ Value *Base = getShadowPtrForVAArgument(
+ RealTy, IRB, VAArgOffset - VAArgBase, ArgSize);
+ if (Base) {
+ Value *AShadowPtr, *AOriginPtr;
+ std::tie(AShadowPtr, AOriginPtr) =
+ MSV.getShadowOriginPtr(A, IRB, IRB.getInt8Ty(),
+ kShadowTLSAlignment, /*isStore*/ false);
+
+ IRB.CreateMemCpy(Base, kShadowTLSAlignment, AShadowPtr,
+ kShadowTLSAlignment, ArgSize);
+ }
+ }
+ VAArgOffset += alignTo(ArgSize, 8);
+ } else {
+ Value *Base;
+ uint64_t ArgSize = DL.getTypeAllocSize(A->getType());
+ uint64_t ArgAlign = 8;
+ if (A->getType()->isArrayTy()) {
+ // Arrays are aligned to element size, except for long double
+ // arrays, which are aligned to 8 bytes.
+ Type *ElementTy = A->getType()->getArrayElementType();
+ if (!ElementTy->isPPC_FP128Ty())
+ ArgAlign = DL.getTypeAllocSize(ElementTy);
+ } else if (A->getType()->isVectorTy()) {
+ // Vectors are naturally aligned.
+ ArgAlign = DL.getTypeAllocSize(A->getType());
+ }
+ if (ArgAlign < 8)
+ ArgAlign = 8;
+ VAArgOffset = alignTo(VAArgOffset, ArgAlign);
+ if (DL.isBigEndian()) {
+          // Adjust the shadow for arguments with size < 8 to match the
+          // placement of bits in a big-endian system.
+ if (ArgSize < 8)
+ VAArgOffset += (8 - ArgSize);
+ }
+ if (!IsFixed) {
+ Base = getShadowPtrForVAArgument(A->getType(), IRB,
+ VAArgOffset - VAArgBase, ArgSize);
+ if (Base)
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
+ }
+ VAArgOffset += ArgSize;
+ VAArgOffset = alignTo(VAArgOffset, 8);
+ }
+ if (IsFixed)
+ VAArgBase = VAArgOffset;
+ }
+
+ Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(),
+ VAArgOffset - VAArgBase);
+    // VAArgOverflowSizeTLS is reused as VAArgSizeTLS here to avoid creating a
+    // new class member, i.e. it holds the total size of all varargs.
+ IRB.CreateStore(TotalVAArgSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ /// Compute the shadow address for a given va_arg.
+ Value *getShadowPtrForVAArgument(Type *Ty, IRBuilder<> &IRB,
+ unsigned ArgOffset, unsigned ArgSize) {
+ // Make sure we don't overflow __msan_va_arg_tls.
+ if (ArgOffset + ArgSize > kParamTLSSize)
+ return nullptr;
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(Ty), 0),
+ "_msarg");
+ }
+
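+  // The PPC64 ELF va_list is assumed to be, like on MIPS64, a single pointer
+  // into the parameter save area; hence only 8 bytes are unpoisoned below and
+  // finalizeInstrumentation() reads the save-area pointer at offset 0.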
+ void visitVAStartInst(VAStartInst &I) override {
+ IRBuilder<> IRB(&I);
+ VAStartInstrumentationList.push_back(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) = MSV.getShadowOriginPtr(
+ VAListTag, IRB, IRB.getInt8Ty(), Alignment, /*isStore*/ true);
+ // Unpoison the whole __va_list_tag.
+ // FIXME: magic ABI constants.
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ /* size */ 8, Alignment, false);
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
- VAArgSize);
-
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
- CallInst *OrigInst = VAStartInstrumentationList[i];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr =
- IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr =
- IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- CopySize);
- }
- }
-};
-
-/// SystemZ-specific implementation of VarArgHelper.
-struct VarArgSystemZHelper : public VarArgHelper {
- static const unsigned SystemZGpOffset = 16;
- static const unsigned SystemZGpEndOffset = 56;
- static const unsigned SystemZFpOffset = 128;
- static const unsigned SystemZFpEndOffset = 160;
- static const unsigned SystemZMaxVrArgs = 8;
- static const unsigned SystemZRegSaveAreaSize = 160;
- static const unsigned SystemZOverflowOffset = 160;
- static const unsigned SystemZVAListTagSize = 32;
- static const unsigned SystemZOverflowArgAreaPtrOffset = 16;
- static const unsigned SystemZRegSaveAreaPtrOffset = 24;
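-  // These constants are assumed to match the s390x ELF ABI: the va_list tag
-  // is 32 bytes,
-  //   long __gpr;                 // offset 0
-  //   long __fpr;                 // offset 8
-  //   void *__overflow_arg_area;  // offset 16
-  //   void *__reg_save_area;      // offset 24
-  // and within the 160-byte register save area the argument GPRs (r2-r6)
-  // occupy offsets [16, 56) and the argument FPRs (f0, f2, f4, f6) occupy
-  // [128, 160).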
-
- Function &F;
- MemorySanitizer &MS;
- MemorySanitizerVisitor &MSV;
- Value *VAArgTLSCopy = nullptr;
- Value *VAArgTLSOriginCopy = nullptr;
- Value *VAArgOverflowSize = nullptr;
-
- SmallVector<CallInst *, 16> VAStartInstrumentationList;
-
- enum class ArgKind {
- GeneralPurpose,
- FloatingPoint,
- Vector,
- Memory,
- Indirect,
- };
-
- enum class ShadowExtension { None, Zero, Sign };
-
- VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV)
- : F(F), MS(MS), MSV(MSV) {}
-
- ArgKind classifyArgument(Type *T, bool IsSoftFloatABI) {
- // T is a SystemZABIInfo::classifyArgumentType() output, and there are
- // only a few possibilities of what it can be. In particular, enums, single
- // element structs and large types have already been taken care of.
-
- // Some i128 and fp128 arguments are converted to pointers only in the
- // back end.
- if (T->isIntegerTy(128) || T->isFP128Ty())
- return ArgKind::Indirect;
- if (T->isFloatingPointTy())
- return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
- if (T->isIntegerTy() || T->isPointerTy())
- return ArgKind::GeneralPurpose;
- if (T->isVectorTy())
- return ArgKind::Vector;
- return ArgKind::Memory;
- }
-
- ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
- // ABI says: "One of the simple integer types no more than 64 bits wide.
- // ... If such an argument is shorter than 64 bits, replace it by a full
- // 64-bit integer representing the same number, using sign or zero
- // extension". Shadow for an integer argument has the same type as the
- // argument itself, so it can be sign or zero extended as well.
- bool ZExt = CB.paramHasAttr(ArgNo, Attribute::ZExt);
- bool SExt = CB.paramHasAttr(ArgNo, Attribute::SExt);
- if (ZExt) {
- assert(!SExt);
- return ShadowExtension::Zero;
- }
- if (SExt) {
- assert(!ZExt);
- return ShadowExtension::Sign;
- }
- return ShadowExtension::None;
- }
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
- bool IsSoftFloatABI = CB.getCalledFunction()
- ->getFnAttribute("use-soft-float")
- .getValueAsString() == "true";
- unsigned GpOffset = SystemZGpOffset;
- unsigned FpOffset = SystemZFpOffset;
- unsigned VrIndex = 0;
- unsigned OverflowOffset = SystemZOverflowOffset;
- const DataLayout &DL = F.getParent()->getDataLayout();
- for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
- ++ArgIt) {
- Value *A = *ArgIt;
- unsigned ArgNo = CB.getArgOperandNo(ArgIt);
- bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
- // SystemZABIInfo does not produce ByVal parameters.
- assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
- Type *T = A->getType();
- ArgKind AK = classifyArgument(T, IsSoftFloatABI);
- if (AK == ArgKind::Indirect) {
- T = PointerType::get(T, 0);
- AK = ArgKind::GeneralPurpose;
- }
- if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
- AK = ArgKind::Memory;
- if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
- AK = ArgKind::Memory;
- if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs || !IsFixed))
- AK = ArgKind::Memory;
- Value *ShadowBase = nullptr;
- Value *OriginBase = nullptr;
- ShadowExtension SE = ShadowExtension::None;
- switch (AK) {
- case ArgKind::GeneralPurpose: {
- // Always keep track of GpOffset, but store shadow only for varargs.
- uint64_t ArgSize = 8;
- if (GpOffset + ArgSize <= kParamTLSSize) {
- if (!IsFixed) {
- SE = getShadowExtension(CB, ArgNo);
- uint64_t GapSize = 0;
- if (SE == ShadowExtension::None) {
- uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
- assert(ArgAllocSize <= ArgSize);
- GapSize = ArgSize - ArgAllocSize;
- }
- ShadowBase = getShadowAddrForVAArgument(IRB, GpOffset + GapSize);
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(IRB, GpOffset + GapSize);
- }
- GpOffset += ArgSize;
- } else {
- GpOffset = kParamTLSSize;
- }
- break;
- }
- case ArgKind::FloatingPoint: {
- // Always keep track of FpOffset, but store shadow only for varargs.
- uint64_t ArgSize = 8;
- if (FpOffset + ArgSize <= kParamTLSSize) {
- if (!IsFixed) {
- // PoP says: "A short floating-point datum requires only the
- // left-most 32 bit positions of a floating-point register".
- // Therefore, in contrast to AK_GeneralPurpose and AK_Memory,
- // don't extend shadow and don't mind the gap.
- ShadowBase = getShadowAddrForVAArgument(IRB, FpOffset);
- if (MS.TrackOrigins)
- OriginBase = getOriginPtrForVAArgument(IRB, FpOffset);
- }
- FpOffset += ArgSize;
- } else {
- FpOffset = kParamTLSSize;
- }
- break;
- }
- case ArgKind::Vector: {
- // Keep track of VrIndex. No need to store shadow, since vector varargs
- // go through AK_Memory.
- assert(IsFixed);
- VrIndex++;
- break;
- }
- case ArgKind::Memory: {
- // Keep track of OverflowOffset and store shadow only for varargs.
- // Ignore fixed args, since we need to copy only the vararg portion of
- // the overflow area shadow.
- if (!IsFixed) {
- uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
- uint64_t ArgSize = alignTo(ArgAllocSize, 8);
- if (OverflowOffset + ArgSize <= kParamTLSSize) {
- SE = getShadowExtension(CB, ArgNo);
- uint64_t GapSize =
- SE == ShadowExtension::None ? ArgSize - ArgAllocSize : 0;
- ShadowBase =
- getShadowAddrForVAArgument(IRB, OverflowOffset + GapSize);
- if (MS.TrackOrigins)
- OriginBase =
- getOriginPtrForVAArgument(IRB, OverflowOffset + GapSize);
- OverflowOffset += ArgSize;
- } else {
- OverflowOffset = kParamTLSSize;
- }
- }
- break;
- }
- case ArgKind::Indirect:
- llvm_unreachable("Indirect must be converted to GeneralPurpose");
- }
- if (ShadowBase == nullptr)
- continue;
- Value *Shadow = MSV.getShadow(A);
- if (SE != ShadowExtension::None)
- Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(),
- /*Signed*/ SE == ShadowExtension::Sign);
- ShadowBase = IRB.CreateIntToPtr(
- ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s");
- IRB.CreateStore(Shadow, ShadowBase);
- if (MS.TrackOrigins) {
- Value *Origin = MSV.getOrigin(A);
- unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
- MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
- kMinOriginAlignment);
- }
- }
- Constant *OverflowSize = ConstantInt::get(
- IRB.getInt64Ty(), OverflowOffset - SystemZOverflowOffset);
- IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
- }
-
- Value *getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
- return IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- }
-
- Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) {
- Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
- Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
- return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
- "_msarg_va_o");
- }
-
- void unpoisonVAListTagForInst(IntrinsicInst &I) {
- IRBuilder<> IRB(&I);
- Value *VAListTag = I.getArgOperand(0);
- Value *ShadowPtr, *OriginPtr;
- const Align Alignment = Align(8);
- std::tie(ShadowPtr, OriginPtr) =
- MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
- IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
- SystemZVAListTagSize, Alignment, false);
- }
-
- void visitVAStartInst(VAStartInst &I) override {
- VAStartInstrumentationList.push_back(&I);
- unpoisonVAListTagForInst(I);
- }
-
- void visitVACopyInst(VACopyInst &I) override { unpoisonVAListTagForInst(I); }
-
- void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
- Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(
- IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)),
- PointerType::get(RegSaveAreaPtrTy, 0));
- Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
- Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
- MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment,
- /*isStore*/ true);
- // TODO(iii): copy only fragments filled by visitCallBase()
- IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
- SystemZRegSaveAreaSize);
- if (MS.TrackOrigins)
- IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
- Alignment, SystemZRegSaveAreaSize);
- }
-
- void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
- Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
- Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(
- IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
- ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)),
- PointerType::get(OverflowArgAreaPtrTy, 0));
- Value *OverflowArgAreaPtr =
- IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
- Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
- const Align Alignment = Align(8);
- std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
- MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
- Alignment, /*isStore*/ true);
- Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
- SystemZOverflowOffset);
- IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- if (MS.TrackOrigins) {
- SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
- SystemZOverflowOffset);
- IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
- VAArgOverflowSize);
- }
- }
-
- void finalizeInstrumentation() override {
- assert(!VAArgOverflowSize && !VAArgTLSCopy &&
- "finalizeInstrumentation called twice");
- if (!VAStartInstrumentationList.empty()) {
- // If there is a va_start in this function, make a backup copy of
- // va_arg_tls somewhere in the function entry block.
+ VAArgSize = IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize = IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, 0),
+ VAArgSize);
+
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+ CallInst *OrigInst = VAStartInstrumentationList[i];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr =
+ IRB.CreateIntToPtr(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr =
+ IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ CopySize);
+ }
+ }
+};
+
+/// SystemZ-specific implementation of VarArgHelper.
+struct VarArgSystemZHelper : public VarArgHelper {
+ static const unsigned SystemZGpOffset = 16;
+ static const unsigned SystemZGpEndOffset = 56;
+ static const unsigned SystemZFpOffset = 128;
+ static const unsigned SystemZFpEndOffset = 160;
+ static const unsigned SystemZMaxVrArgs = 8;
+ static const unsigned SystemZRegSaveAreaSize = 160;
+ static const unsigned SystemZOverflowOffset = 160;
+ static const unsigned SystemZVAListTagSize = 32;
+ static const unsigned SystemZOverflowArgAreaPtrOffset = 16;
+ static const unsigned SystemZRegSaveAreaPtrOffset = 24;
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgTLSOriginCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst *, 16> VAStartInstrumentationList;
+
+ enum class ArgKind {
+ GeneralPurpose,
+ FloatingPoint,
+ Vector,
+ Memory,
+ Indirect,
+ };
+
+ enum class ShadowExtension { None, Zero, Sign };
+
+ VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV) {}
+
+ ArgKind classifyArgument(Type *T, bool IsSoftFloatABI) {
+ // T is a SystemZABIInfo::classifyArgumentType() output, and there are
+ // only a few possibilities of what it can be. In particular, enums, single
+ // element structs and large types have already been taken care of.
+
+ // Some i128 and fp128 arguments are converted to pointers only in the
+ // back end.
+ if (T->isIntegerTy(128) || T->isFP128Ty())
+ return ArgKind::Indirect;
+ if (T->isFloatingPointTy())
+ return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
+ if (T->isIntegerTy() || T->isPointerTy())
+ return ArgKind::GeneralPurpose;
+ if (T->isVectorTy())
+ return ArgKind::Vector;
+ return ArgKind::Memory;
+ }
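
As a rough illustration of the classification order above, the following standalone sketch mirrors the same rules on plain boolean flags instead of llvm::Type queries; the helper name and flag parameters are made up for this example and are not part of MemorySanitizer.

#include <cstdio>

enum class SketchKind { GeneralPurpose, FloatingPoint, Vector, Memory, Indirect };

// Mirrors classifyArgument(): i128/fp128 -> Indirect, floats -> FPR (or GPR
// under soft-float), ints/pointers -> GPR, vectors -> VR, the rest -> memory.
static SketchKind classifySketch(bool Is128Bit, bool IsFloat, bool IsIntOrPtr,
                                 bool IsVector, bool IsSoftFloatABI) {
  if (Is128Bit)
    return SketchKind::Indirect;
  if (IsFloat)
    return IsSoftFloatABI ? SketchKind::GeneralPurpose
                          : SketchKind::FloatingPoint;
  if (IsIntOrPtr)
    return SketchKind::GeneralPurpose;
  if (IsVector)
    return SketchKind::Vector;
  return SketchKind::Memory;
}

int main() {
  // A double is a FloatingPoint argument under the hard-float ABI...
  std::printf("%d\n", (int)classifySketch(false, true, false, false, false));
  // ...but becomes a GeneralPurpose argument when use-soft-float is set.
  std::printf("%d\n", (int)classifySketch(false, true, false, false, true));
}
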
+
+ ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
+ // ABI says: "One of the simple integer types no more than 64 bits wide.
+ // ... If such an argument is shorter than 64 bits, replace it by a full
+ // 64-bit integer representing the same number, using sign or zero
+ // extension". Shadow for an integer argument has the same type as the
+ // argument itself, so it can be sign or zero extended as well.
+ bool ZExt = CB.paramHasAttr(ArgNo, Attribute::ZExt);
+ bool SExt = CB.paramHasAttr(ArgNo, Attribute::SExt);
+ if (ZExt) {
+ assert(!SExt);
+ return ShadowExtension::Zero;
+ }
+ if (SExt) {
+ assert(!ZExt);
+ return ShadowExtension::Sign;
+ }
+ return ShadowExtension::None;
+ }
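
A minimal sketch of why the extension kind matters for shadow propagation: widening a 32-bit shadow word to 64 bits with sign extension keeps poison in the top bit "sticky", while zero extension marks the upper half as initialized. The shadow value below is arbitrary and only illustrates the bit-level effect.

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Shadow32 = 0x80000001u; // hypothetical shadow: top and bottom bits poisoned
  uint64_t SignExt = (uint64_t)(int64_t)(int32_t)Shadow32; // ShadowExtension::Sign
  uint64_t ZeroExt = (uint64_t)Shadow32;                   // ShadowExtension::Zero
  std::printf("sign: %016llx\nzero: %016llx\n",
              (unsigned long long)SignExt, (unsigned long long)ZeroExt);
  return 0;
}
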
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ bool IsSoftFloatABI = CB.getCalledFunction()
+ ->getFnAttribute("use-soft-float")
+ .getValueAsString() == "true";
+ unsigned GpOffset = SystemZGpOffset;
+ unsigned FpOffset = SystemZFpOffset;
+ unsigned VrIndex = 0;
+ unsigned OverflowOffset = SystemZOverflowOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ // SystemZABIInfo does not produce ByVal parameters.
+ assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
+ Type *T = A->getType();
+ ArgKind AK = classifyArgument(T, IsSoftFloatABI);
+ if (AK == ArgKind::Indirect) {
+ T = PointerType::get(T, 0);
+ AK = ArgKind::GeneralPurpose;
+ }
+ if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs || !IsFixed))
+ AK = ArgKind::Memory;
+ Value *ShadowBase = nullptr;
+ Value *OriginBase = nullptr;
+ ShadowExtension SE = ShadowExtension::None;
+ switch (AK) {
+ case ArgKind::GeneralPurpose: {
+ // Always keep track of GpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (GpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize = 0;
+ if (SE == ShadowExtension::None) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ assert(ArgAllocSize <= ArgSize);
+ GapSize = ArgSize - ArgAllocSize;
+ }
+ ShadowBase = getShadowAddrForVAArgument(IRB, GpOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, GpOffset + GapSize);
+ }
+ GpOffset += ArgSize;
+ } else {
+ GpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::FloatingPoint: {
+ // Always keep track of FpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (FpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ // PoP says: "A short floating-point datum requires only the
+ // left-most 32 bit positions of a floating-point register".
+ // Therefore, in contrast to AK_GeneralPurpose and AK_Memory,
+ // don't extend shadow and don't mind the gap.
+ ShadowBase = getShadowAddrForVAArgument(IRB, FpOffset);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, FpOffset);
+ }
+ FpOffset += ArgSize;
+ } else {
+ FpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::Vector: {
+ // Keep track of VrIndex. No need to store shadow, since vector varargs
+ // go through AK_Memory.
+ assert(IsFixed);
+ VrIndex++;
+ break;
+ }
+ case ArgKind::Memory: {
+ // Keep track of OverflowOffset and store shadow only for varargs.
+ // Ignore fixed args, since we need to copy only the vararg portion of
+ // the overflow area shadow.
+ if (!IsFixed) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ uint64_t ArgSize = alignTo(ArgAllocSize, 8);
+ if (OverflowOffset + ArgSize <= kParamTLSSize) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize =
+ SE == ShadowExtension::None ? ArgSize - ArgAllocSize : 0;
+ ShadowBase =
+ getShadowAddrForVAArgument(IRB, OverflowOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(IRB, OverflowOffset + GapSize);
+ OverflowOffset += ArgSize;
+ } else {
+ OverflowOffset = kParamTLSSize;
+ }
+ }
+ break;
+ }
+ case ArgKind::Indirect:
+ llvm_unreachable("Indirect must be converted to GeneralPurpose");
+ }
+ if (ShadowBase == nullptr)
+ continue;
+ Value *Shadow = MSV.getShadow(A);
+ if (SE != ShadowExtension::None)
+ Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(),
+ /*Signed*/ SE == ShadowExtension::Sign);
+ ShadowBase = IRB.CreateIntToPtr(
+ ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s");
+ IRB.CreateStore(Shadow, ShadowBase);
+ if (MS.TrackOrigins) {
+ Value *Origin = MSV.getOrigin(A);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
+ kMinOriginAlignment);
+ }
+ }
+ Constant *OverflowSize = ConstantInt::get(
+ IRB.getInt64Ty(), OverflowOffset - SystemZOverflowOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
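
The loop above is essentially offset bookkeeping against the s390x register save area: general-purpose slots occupy offsets 16..56, floating-point slots 128..160, and everything else spills to the overflow area starting at offset 160. The standalone sketch below reproduces just that arithmetic for a hypothetical call with three integer varargs and one double vararg; it emits no IR.

#include <cstdio>

int main() {
  unsigned GpOffset = 16, FpOffset = 128, OverflowOffset = 160;
  for (int i = 0; i < 3; ++i) {        // three 8-byte integer varargs
    if (GpOffset + 8 <= 56)            // still a free GPR slot (r2..r6)
      GpOffset += 8;
    else
      OverflowOffset += 8;             // would be reclassified as Memory
  }
  if (FpOffset + 8 <= 160)             // one double vararg, FPR slot still free
    FpOffset += 8;
  // Prints "Gp=40 Fp=136 Overflow=160": everything fit into registers, so the
  // overflow size stored to VAArgOverflowSizeTLS would be 160 - 160 = 0.
  std::printf("Gp=%u Fp=%u Overflow=%u\n", GpOffset, FpOffset, OverflowOffset);
}
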
+
+ Value *getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ return IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ }
+
+ Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_va_o");
+ }
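
Both helpers reduce to "TLS base plus per-argument byte offset". A tiny sketch of the same arithmetic over an ordinary byte buffer standing in for __msan_va_arg_tls; the buffer size matches MSan's kParamTLSSize, but the offset and shadow value are arbitrary.

#include <cstdint>
#include <cstring>

int main() {
  unsigned char VAArgTLS[800] = {};      // stand-in for the per-thread TLS array
  unsigned ArgOffset = 24;               // e.g. the second GPR vararg slot
  uint64_t Shadow = ~0ull;               // a fully poisoned 64-bit value
  std::memcpy(VAArgTLS + ArgOffset, &Shadow, sizeof(Shadow)); // store to base+offset
  return VAArgTLS[ArgOffset] == 0xff ? 0 : 1;
}
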
+
+ void unpoisonVAListTagForInst(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ SystemZVAListTagSize, Alignment, false);
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ VAStartInstrumentationList.push_back(&I);
+ unpoisonVAListTagForInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override { unpoisonVAListTagForInst(I); }
+
+ void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ // TODO(iii): copy only fragments filled by visitCallBase()
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ SystemZRegSaveAreaSize);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+ Alignment, SystemZRegSaveAreaSize);
+ }
+
+ void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)),
+ PointerType::get(OverflowArgAreaPtrTy, 0));
+ Value *OverflowArgAreaPtr =
+ IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+ MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ if (MS.TrackOrigins) {
+ SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ }
+ }
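
The two copy routines read the overflow_arg_area and reg_save_area pointers out of the va_list tag at byte offsets 16 and 24. A layout sketch inferred from those constants (written here as a plain struct for illustration, not taken from an ABI header):

#include <cstddef>

struct SketchSystemZVAList {
  long __gpr;                 // offset 0: GPR arguments already consumed
  long __fpr;                 // offset 8: FPR arguments already consumed
  void *__overflow_arg_area;  // offset 16 == SystemZOverflowArgAreaPtrOffset
  void *__reg_save_area;      // offset 24 == SystemZRegSaveAreaPtrOffset
};

static_assert(offsetof(SketchSystemZVAList, __overflow_arg_area) == 16, "layout sketch");
static_assert(offsetof(SketchSystemZVAList, __reg_save_area) == 24, "layout sketch");
static_assert(sizeof(SketchSystemZVAList) == 32, "matches SystemZVAListTagSize");
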
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
IRBuilder<> IRB(MSV.FnPrologueEnd);
- VAArgOverflowSize =
- IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
- Value *CopySize =
- IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, SystemZOverflowOffset),
- VAArgOverflowSize);
- VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
- if (MS.TrackOrigins) {
- VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
- IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
- Align(8), CopySize);
- }
- }
-
- // Instrument va_start.
- // Copy va_list shadow from the backup copy of the TLS contents.
- for (size_t VaStartNo = 0, VaStartNum = VAStartInstrumentationList.size();
- VaStartNo < VaStartNum; VaStartNo++) {
- CallInst *OrigInst = VAStartInstrumentationList[VaStartNo];
- IRBuilder<> IRB(OrigInst->getNextNode());
- Value *VAListTag = OrigInst->getArgOperand(0);
- copyRegSaveArea(IRB, VAListTag);
- copyOverflowArea(IRB, VAListTag);
- }
- }
-};
-
-/// A no-op implementation of VarArgHelper.
-struct VarArgNoOpHelper : public VarArgHelper {
- VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
- MemorySanitizerVisitor &MSV) {}
-
- void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
-
- void visitVAStartInst(VAStartInst &I) override {}
-
- void visitVACopyInst(VACopyInst &I) override {}
-
- void finalizeInstrumentation() override {}
-};
-
-} // end anonymous namespace
-
-static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
- MemorySanitizerVisitor &Visitor) {
-  // VarArg handling is implemented only for the targets listed below (AMD64,
-  // MIPS64, AArch64, PowerPC64 and SystemZ). On other platforms the no-op
-  // helper is used, so false positives are possible.
- Triple TargetTriple(Func.getParent()->getTargetTriple());
- if (TargetTriple.getArch() == Triple::x86_64)
- return new VarArgAMD64Helper(Func, Msan, Visitor);
- else if (TargetTriple.isMIPS64())
- return new VarArgMIPS64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::aarch64)
- return new VarArgAArch64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::ppc64 ||
- TargetTriple.getArch() == Triple::ppc64le)
- return new VarArgPowerPC64Helper(Func, Msan, Visitor);
- else if (TargetTriple.getArch() == Triple::systemz)
- return new VarArgSystemZHelper(Func, Msan, Visitor);
- else
- return new VarArgNoOpHelper(Func, Msan, Visitor);
-}
-
-bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
- if (!CompileKernel && F.getName() == kMsanModuleCtorName)
- return false;
-
- MemorySanitizerVisitor Visitor(F, *this, TLI);
-
- // Clear out readonly/readnone attributes.
- AttrBuilder B;
- B.addAttribute(Attribute::ReadOnly)
- .addAttribute(Attribute::ReadNone)
- .addAttribute(Attribute::WriteOnly)
- .addAttribute(Attribute::ArgMemOnly)
- .addAttribute(Attribute::Speculatable);
- F.removeAttributes(AttributeList::FunctionIndex, B);
-
- return Visitor.runOnFunction();
-}
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, SystemZOverflowOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ if (MS.TrackOrigins) {
+ VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
+ Align(8), CopySize);
+ }
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t VaStartNo = 0, VaStartNum = VAStartInstrumentationList.size();
+ VaStartNo < VaStartNum; VaStartNo++) {
+ CallInst *OrigInst = VAStartInstrumentationList[VaStartNo];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ copyRegSaveArea(IRB, VAListTag);
+ copyOverflowArea(IRB, VAListTag);
+ }
+ }
+};
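
finalizeInstrumentation() sizes the backup copy as the fixed 160-byte register save area plus whatever overflow size the instrumented call sites recorded. A small sketch of that sizing; the overflow value is a made-up placeholder.

#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t RegSaveAreaBytes = 160;   // SystemZOverflowOffset / SystemZRegSaveAreaSize
  uint64_t VAArgOverflowSize = 24;         // hypothetical: three 8-byte spilled varargs
  uint64_t CopySize = RegSaveAreaBytes + VAArgOverflowSize;
  std::printf("va_arg_tls backup copy: %llu bytes\n", (unsigned long long)CopySize);
}
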
+
+/// A no-op implementation of VarArgHelper.
+struct VarArgNoOpHelper : public VarArgHelper {
+ VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV) {}
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
+
+ void visitVAStartInst(VAStartInst &I) override {}
+
+ void visitVACopyInst(VACopyInst &I) override {}
+
+ void finalizeInstrumentation() override {}
+};
+
+} // end anonymous namespace
+
+static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+ MemorySanitizerVisitor &Visitor) {
+  // VarArg handling is implemented only for the targets listed below (AMD64,
+  // MIPS64, AArch64, PowerPC64 and SystemZ). On other platforms the no-op
+  // helper is used, so false positives are possible.
+ Triple TargetTriple(Func.getParent()->getTargetTriple());
+ if (TargetTriple.getArch() == Triple::x86_64)
+ return new VarArgAMD64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.isMIPS64())
+ return new VarArgMIPS64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::aarch64)
+ return new VarArgAArch64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::ppc64 ||
+ TargetTriple.getArch() == Triple::ppc64le)
+ return new VarArgPowerPC64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::systemz)
+ return new VarArgSystemZHelper(Func, Msan, Visitor);
+ else
+ return new VarArgNoOpHelper(Func, Msan, Visitor);
+}
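
Which helper a function gets depends only on the module's target triple. A hedged usage sketch of that dispatch using llvm::Triple directly; the triple string is an arbitrary example and the program is assumed to link against LLVMSupport.

#include "llvm/ADT/Triple.h"
#include <cstdio>

int main() {
  llvm::Triple T("s390x-ibm-linux");   // hypothetical module target triple
  const char *Helper = "VarArgNoOpHelper";
  if (T.getArch() == llvm::Triple::x86_64)
    Helper = "VarArgAMD64Helper";
  else if (T.getArch() == llvm::Triple::systemz)
    Helper = "VarArgSystemZHelper";
  std::printf("selected %s\n", Helper);
}
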
+
+bool MemorySanitizer::sanitizeFunction(Function &F, TargetLibraryInfo &TLI) {
+ if (!CompileKernel && F.getName() == kMsanModuleCtorName)
+ return false;
+
+ MemorySanitizerVisitor Visitor(F, *this, TLI);
+
+ // Clear out readonly/readnone attributes.
+ AttrBuilder B;
+ B.addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::ReadNone)
+ .addAttribute(Attribute::WriteOnly)
+ .addAttribute(Attribute::ArgMemOnly)
+ .addAttribute(Attribute::Speculatable);
+ F.removeAttributes(AttributeList::FunctionIndex, B);
+
+ return Visitor.runOnFunction();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index 002a03afad..be6c8c6310 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1,253 +1,253 @@
-//===- PGOInstrumentation.cpp - MST-based PGO Instrumentation -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements PGO instrumentation using a minimum spanning tree based
-// on the following paper:
-// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
-// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
-// Issue 3, pp 313-322
-// The idea of the algorithm is based on the fact that for each node (except for
-// the entry and exit), the sum of incoming edge counts equals the sum of
-// outgoing edge counts. The count of an edge on the spanning tree can be derived
-// from those edges not on the spanning tree. Knuth proves this method instruments
-// the minimum number of edges.
-//
-// The minimal spanning tree here is actually a maximum weight tree -- on-tree
-// edges have higher frequencies (more likely to execute). The idea is to
-// instrument those less frequently executed edges to reduce the runtime
-// overhead of instrumented binaries.
-//
-// This file contains two passes:
-// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
-// count profile, and generates the instrumentation for indirect call
-// profiling.
-// (2) Pass PGOInstrumentationUse which reads the edge count profile and
-// annotates the branch weights. It also reads the indirect call value
-// profiling records and annotates the indirect call instructions.
-//
-// To get the precise counter information, these two passes need to be invoked
-// at the same compilation point (so they see the same IR). For pass
-// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
-// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
-// the profile is opened at module level and passed to each PGOUseFunc instance.
-// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
-// in class FuncPGOInstrumentation.
-//
-// Class PGOEdge represents a CFG edge and some auxiliary information. Class
-// BBInfo contains auxiliary information for each BB. These two classes are used
-// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived
-// class of PGOEdge and BBInfo, respectively. They contain extra data structures
-// used in populating profile counters.
-// The MST implementation is in Class CFGMST (CFGMST.h).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "CFGMST.h"
-#include "ValueProfileCollector.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/ProfileSummary.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/ProfileData/InstrProfReader.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/CRC.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DOTGraphTraits.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <memory>
-#include <numeric>
-#include <string>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using ProfileCount = Function::ProfileCount;
-using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
-
-#define DEBUG_TYPE "pgo-instrumentation"
-
-STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
-STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
-STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
-STATISTIC(NumOfPGOEdge, "Number of edges.");
-STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
-STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
-STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
-STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
-STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
-STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
-STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOSelectInsts,
- "Number of select instruction instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOMemIntrinsics,
- "Number of mem intrinsics instrumented in CSPGO.");
-STATISTIC(NumOfCSPGOEdge, "Number of edges in CSPGO.");
-STATISTIC(NumOfCSPGOBB, "Number of basic-blocks in CSPGO.");
-STATISTIC(NumOfCSPGOSplit, "Number of critical edge splits in CSPGO.");
-STATISTIC(NumOfCSPGOFunc,
- "Number of functions having valid profile counts in CSPGO.");
-STATISTIC(NumOfCSPGOMismatch,
- "Number of functions having mismatch profile in CSPGO.");
-STATISTIC(NumOfCSPGOMissing, "Number of functions without profile in CSPGO.");
-
-// Command line option to specify the file to read profile from. This is
-// mainly used for testing.
-static cl::opt<std::string>
- PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
- cl::value_desc("filename"),
-                        cl::desc("Specify the path of profile data file. This is "
- "mainly for test purpose."));
-static cl::opt<std::string> PGOTestProfileRemappingFile(
- "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
- cl::value_desc("filename"),
- cl::desc("Specify the path of profile remapping file. This is mainly for "
- "test purpose."));
-
-// Command line option to disable value profiling. The default is false:
-// i.e. value profiling is enabled by default. This is for debug purpose.
-static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
- cl::Hidden,
- cl::desc("Disable Value Profiling"));
-
-// Command line option to set the maximum number of VP annotations to write to
-// the metadata for a single indirect call callsite.
-static cl::opt<unsigned> MaxNumAnnotations(
- "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of annotations for a single indirect "
- "call callsite"));
-
-// Command line option to set the maximum number of value annotations
-// to write to the metadata for a single memop intrinsic.
-static cl::opt<unsigned> MaxNumMemOPAnnotations(
- "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
- cl::desc("Max number of preicise value annotations for a single memop"
- "intrinsic"));
-
-// Command line option to control appending FunctionHash to the name of a COMDAT
-// function. This is to avoid the hash mismatch caused by the preinliner.
-static cl::opt<bool> DoComdatRenaming(
- "do-comdat-renaming", cl::init(false), cl::Hidden,
- cl::desc("Append function hash to the name of COMDAT function to avoid "
- "function hash mismatch due to the preinliner"));
-
-// Command line option to enable/disable the warning about missing profile
-// information.
-static cl::opt<bool>
- PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn on/off "
- "warnings about missing profile data for "
- "functions."));
-
-// Command line option to enable/disable the warning about a hash mismatch in
-// the profile data.
-static cl::opt<bool>
- NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
- cl::desc("Use this option to turn off/on "
- "warnings about profile cfg mismatch."));
-
-// Command line option to enable/disable the warning about a hash mismatch in
-// the profile data for Comdat functions, which often turns out to be a false
-// positive due to pre-instrumentation inlining.
-static cl::opt<bool>
- NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
- cl::Hidden,
- cl::desc("The option is used to turn on/off "
- "warnings about hash mismatch for comdat "
- "functions."));
-
-// Command line option to enable/disable select instruction instrumentation.
-static cl::opt<bool>
- PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
- cl::desc("Use this option to turn on/off SELECT "
- "instruction instrumentation. "));
-
-// Command line option to turn on CFG dot or text dump of raw profile counts
-static cl::opt<PGOViewCountsType> PGOViewRawCounts(
- "pgo-view-raw-counts", cl::Hidden,
- cl::desc("A boolean option to show CFG dag or text "
- "with raw profile counts from "
- "profile data. See also option "
- "-pgo-view-counts. To limit graph "
- "display to only one function, use "
- "filtering option -view-bfi-func-name."),
- cl::values(clEnumValN(PGOVCT_None, "none", "do not show."),
- clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
- clEnumValN(PGOVCT_Text, "text", "show in text.")));
-
-// Command line option to enable/disable memop intrinsic call.size profiling.
-static cl::opt<bool>
- PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
- cl::desc("Use this option to turn on/off "
- "memory intrinsic size profiling."));
-
-// Emit branch probability as optimization remarks.
-static cl::opt<bool>
- EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
- cl::desc("When this option is on, the annotated "
- "branch probability will be emitted as "
- "optimization remarks: -{Rpass|"
- "pass-remarks}=pgo-instrumentation"));
-
+//===- PGOInstrumentation.cpp - MST-based PGO Instrumentation -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements PGO instrumentation using a minimum spanning tree based
+// on the following paper:
+// [1] Donald E. Knuth, Francis R. Stevenson. Optimal measurement of points
+// for program frequency counts. BIT Numerical Mathematics 1973, Volume 13,
+// Issue 3, pp 313-322
+// The idea of the algorithm is based on the fact that for each node (except for
+// the entry and exit), the sum of incoming edge counts equals the sum of
+// outgoing edge counts. The count of an edge on the spanning tree can be derived
+// from those edges not on the spanning tree. Knuth proves this method instruments
+// the minimum number of edges.
+//
+// The minimal spanning tree here is actually a maximum weight tree -- on-tree
+// edges have higher frequencies (more likely to execute). The idea is to
+// instrument those less frequently executed edges to reduce the runtime
+// overhead of instrumented binaries.
+//
+// This file contains two passes:
+// (1) Pass PGOInstrumentationGen which instruments the IR to generate edge
+// count profile, and generates the instrumentation for indirect call
+// profiling.
+// (2) Pass PGOInstrumentationUse which reads the edge count profile and
+// annotates the branch weights. It also reads the indirect call value
+// profiling records and annotates the indirect call instructions.
+//
+// To get the precise counter information, these two passes need to be invoked
+// at the same compilation point (so they see the same IR). For pass
+// PGOInstrumentationGen, the real work is done in instrumentOneFunc(). For
+// pass PGOInstrumentationUse, the real work is done in class PGOUseFunc and
+// the profile is opened at module level and passed to each PGOUseFunc instance.
+// The shared code for PGOInstrumentationGen and PGOInstrumentationUse is put
+// in class FuncPGOInstrumentation.
+//
+// Class PGOEdge represents a CFG edge and some auxiliary information. Class
+// BBInfo contains auxiliary information for each BB. These two classes are used
+// in pass PGOInstrumentationGen. Class PGOUseEdge and UseBBInfo are the derived
+// class of PGOEdge and BBInfo, respectively. They contain extra data structures
+// used in populating profile counters.
+// The MST implementation is in Class CFGMST (CFGMST.h).
+//
+//===----------------------------------------------------------------------===//
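
A worked example of the flow-conservation argument behind the MST approach, for a diamond CFG entry -> {A, B} -> exit: if the edge entry->A is placed on the spanning tree and left uninstrumented, its count is still recoverable because the flow into A must equal the flow out of A. The counter values below are made up.

#include <cstdio>

int main() {
  // Instrumented (off-tree) edges of the diamond.
  unsigned EntryToB = 10;
  unsigned AToExit = 90;
  unsigned BToExit = 10;
  // Edge entry->A is on the MST, so it carries no counter; derive it instead.
  unsigned EntryToA = AToExit;              // flow in(A) == flow out(A)
  unsigned EntryExecutions = EntryToA + EntryToB;
  std::printf("entry->A = %u, entry ran %u times\n", EntryToA, EntryExecutions);
  (void)BToExit;
}
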
+
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "CFGMST.h"
+#include "ValueProfileCollector.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CRC.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using ProfileCount = Function::ProfileCount;
+using VPCandidateInfo = ValueProfileCollector::CandidateInfo;
+
+#define DEBUG_TYPE "pgo-instrumentation"
+
+STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
+STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
+STATISTIC(NumOfPGOEdge, "Number of edges.");
+STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
+STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
+STATISTIC(NumOfPGOFunc, "Number of functions having valid profile counts.");
+STATISTIC(NumOfPGOMismatch, "Number of functions having mismatch profile.");
+STATISTIC(NumOfPGOMissing, "Number of functions without profile.");
+STATISTIC(NumOfPGOICall, "Number of indirect call value instrumentations.");
+STATISTIC(NumOfCSPGOInstrument, "Number of edges instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOSelectInsts,
+ "Number of select instruction instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOMemIntrinsics,
+ "Number of mem intrinsics instrumented in CSPGO.");
+STATISTIC(NumOfCSPGOEdge, "Number of edges in CSPGO.");
+STATISTIC(NumOfCSPGOBB, "Number of basic-blocks in CSPGO.");
+STATISTIC(NumOfCSPGOSplit, "Number of critical edge splits in CSPGO.");
+STATISTIC(NumOfCSPGOFunc,
+ "Number of functions having valid profile counts in CSPGO.");
+STATISTIC(NumOfCSPGOMismatch,
+ "Number of functions having mismatch profile in CSPGO.");
+STATISTIC(NumOfCSPGOMissing, "Number of functions without profile in CSPGO.");
+
+// Command line option to specify the file to read profile from. This is
+// mainly used for testing.
+static cl::opt<std::string>
+ PGOTestProfileFile("pgo-test-profile-file", cl::init(""), cl::Hidden,
+ cl::value_desc("filename"),
+                        cl::desc("Specify the path of profile data file. This is "
+ "mainly for test purpose."));
+static cl::opt<std::string> PGOTestProfileRemappingFile(
+ "pgo-test-profile-remapping-file", cl::init(""), cl::Hidden,
+ cl::value_desc("filename"),
+ cl::desc("Specify the path of profile remapping file. This is mainly for "
+ "test purpose."));
+
+// Command line option to disable value profiling. The default is false:
+// i.e. value profiling is enabled by default. This is for debug purpose.
+static cl::opt<bool> DisableValueProfiling("disable-vp", cl::init(false),
+ cl::Hidden,
+ cl::desc("Disable Value Profiling"));
+
+// Command line option to set the maximum number of VP annotations to write to
+// the metadata for a single indirect call callsite.
+static cl::opt<unsigned> MaxNumAnnotations(
+ "icp-max-annotations", cl::init(3), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of annotations for a single indirect "
+ "call callsite"));
+
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+ "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Max number of preicise value annotations for a single memop"
+ "intrinsic"));
+
+// Command line option to control appending FunctionHash to the name of a COMDAT
+// function. This is to avoid the hash mismatch caused by the preinliner.
+static cl::opt<bool> DoComdatRenaming(
+ "do-comdat-renaming", cl::init(false), cl::Hidden,
+ cl::desc("Append function hash to the name of COMDAT function to avoid "
+ "function hash mismatch due to the preinliner"));
+
+// Command line option to enable/disable the warning about missing profile
+// information.
+static cl::opt<bool>
+ PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "warnings about missing profile data for "
+ "functions."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data.
+static cl::opt<bool>
+ NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+ cl::desc("Use this option to turn off/on "
+ "warnings about profile cfg mismatch."));
+
+// Command line option to enable/disable the warning about a hash mismatch in
+// the profile data for Comdat functions, which often turns out to be a false
+// positive due to pre-instrumentation inlining.
+static cl::opt<bool>
+ NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+ cl::Hidden,
+ cl::desc("The option is used to turn on/off "
+ "warnings about hash mismatch for comdat "
+ "functions."));
+
+// Command line option to enable/disable select instruction instrumentation.
+static cl::opt<bool>
+ PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off SELECT "
+ "instruction instrumentation. "));
+
+// Command line option to turn on CFG dot or text dump of raw profile counts
+static cl::opt<PGOViewCountsType> PGOViewRawCounts(
+ "pgo-view-raw-counts", cl::Hidden,
+ cl::desc("A boolean option to show CFG dag or text "
+ "with raw profile counts from "
+ "profile data. See also option "
+ "-pgo-view-counts. To limit graph "
+ "display to only one function, use "
+ "filtering option -view-bfi-func-name."),
+ cl::values(clEnumValN(PGOVCT_None, "none", "do not show."),
+ clEnumValN(PGOVCT_Graph, "graph", "show a graph."),
+ clEnumValN(PGOVCT_Text, "text", "show in text.")));
+
+// Command line option to enable/disable memop intrinsic call.size profiling.
+static cl::opt<bool>
+ PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+ cl::desc("Use this option to turn on/off "
+ "memory intrinsic size profiling."));
+
+// Emit branch probability as optimization remarks.
+static cl::opt<bool>
+ EmitBranchProbability("pgo-emit-branch-prob", cl::init(false), cl::Hidden,
+ cl::desc("When this option is on, the annotated "
+ "branch probability will be emitted as "
+ "optimization remarks: -{Rpass|"
+ "pass-remarks}=pgo-instrumentation"));
+
static cl::opt<bool> PGOInstrumentEntry(
"pgo-instrument-entry", cl::init(false), cl::Hidden,
cl::desc("Force to instrument function entry basicblock."));
@@ -280,394 +280,394 @@ static cl::opt<unsigned> PGOVerifyBFICutoff(
cl::desc("Set the threshold for pgo-verify-bfi -- skip the counts whose "
"profile count value is below."));
-// Command line option to turn on CFG dot dump after profile annotation.
-// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
-extern cl::opt<PGOViewCountsType> PGOViewCounts;
-
-// Command line option to specify the name of the function for CFG dump
-// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
-extern cl::opt<std::string> ViewBlockFreqFuncName;
-
+// Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp: -pgo-view-counts
+extern cl::opt<PGOViewCountsType> PGOViewCounts;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp: -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
static cl::opt<bool>
PGOOldCFGHashing("pgo-instr-old-cfg-hashing", cl::init(false), cl::Hidden,
cl::desc("Use the old CFG function hashing"));
-// Return a string describing the branch condition that can be
-// used in static branch probability heuristics:
-static std::string getBranchCondString(Instruction *TI) {
- BranchInst *BI = dyn_cast<BranchInst>(TI);
- if (!BI || !BI->isConditional())
- return std::string();
-
- Value *Cond = BI->getCondition();
- ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
- if (!CI)
- return std::string();
-
- std::string result;
- raw_string_ostream OS(result);
- OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
- CI->getOperand(0)->getType()->print(OS, true);
-
- Value *RHS = CI->getOperand(1);
- ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
- if (CV) {
- if (CV->isZero())
- OS << "_Zero";
- else if (CV->isOne())
- OS << "_One";
- else if (CV->isMinusOne())
- OS << "_MinusOne";
- else
- OS << "_Const";
- }
- OS.flush();
- return result;
-}
-
-static const char *ValueProfKindDescr[] = {
-#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr,
-#include "llvm/ProfileData/InstrProfData.inc"
-};
-
-namespace {
-
-/// The select instruction visitor plays three roles specified
-/// by the mode. In \c VM_counting mode, it simply counts the number of
-/// select instructions. In \c VM_instrument mode, it inserts code to count
-/// the number of times the TrueValue of a select is taken. In \c VM_annotate
-/// mode, it reads the profile data and annotates the select instruction with
-/// metadata.
-enum VisitMode { VM_counting, VM_instrument, VM_annotate };
-class PGOUseFunc;
-
-/// Instruction Visitor class to visit select instructions.
-struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
- Function &F;
- unsigned NSIs = 0; // Number of select instructions instrumented.
- VisitMode Mode = VM_counting; // Visiting mode.
- unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
- unsigned TotalNumCtrs = 0; // Total number of counters
- GlobalVariable *FuncNameVar = nullptr;
- uint64_t FuncHash = 0;
- PGOUseFunc *UseFunc = nullptr;
-
- SelectInstVisitor(Function &Func) : F(Func) {}
-
- void countSelects(Function &Func) {
- NSIs = 0;
- Mode = VM_counting;
- visit(Func);
- }
-
- // Visit the IR stream and instrument all select instructions. \p
- // Ind is a pointer to the counter index variable; \p TotalNC
- // is the total number of counters; \p FNV is the pointer to the
- // PGO function name var; \p FHash is the function hash.
- void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
- GlobalVariable *FNV, uint64_t FHash) {
- Mode = VM_instrument;
- CurCtrIdx = Ind;
- TotalNumCtrs = TotalNC;
- FuncHash = FHash;
- FuncNameVar = FNV;
- visit(Func);
- }
-
- // Visit the IR stream and annotate all select instructions.
- void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
- Mode = VM_annotate;
- UseFunc = UF;
- CurCtrIdx = Ind;
- visit(Func);
- }
-
- void instrumentOneSelectInst(SelectInst &SI);
- void annotateOneSelectInst(SelectInst &SI);
-
- // Visit \p SI instruction and perform tasks according to visit mode.
- void visitSelectInst(SelectInst &SI);
-
-  // Return the number of select instructions. This needs to be called after
- // countSelects().
- unsigned getNumOfSelectInsts() const { return NSIs; }
-};
-
-
-class PGOInstrumentationGenLegacyPass : public ModulePass {
-public:
- static char ID;
-
- PGOInstrumentationGenLegacyPass(bool IsCS = false)
- : ModulePass(ID), IsCS(IsCS) {
- initializePGOInstrumentationGenLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
-
-private:
-  // Whether this is context-sensitive instrumentation.
- bool IsCS;
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-class PGOInstrumentationUseLegacyPass : public ModulePass {
-public:
- static char ID;
-
- // Provide the profile filename as the parameter.
- PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false)
- : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) {
- if (!PGOTestProfileFile.empty())
- ProfileFileName = PGOTestProfileFile;
- initializePGOInstrumentationUseLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
-
-private:
- std::string ProfileFileName;
-  // Whether this is context-sensitive instrumentation use.
- bool IsCS;
-
- bool runOnModule(Module &M) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass {
-public:
- static char ID;
- StringRef getPassName() const override {
- return "PGOInstrumentationGenCreateVarPass";
- }
- PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "")
- : ModulePass(ID), InstrProfileOutput(CSInstrName) {
- initializePGOInstrumentationGenCreateVarLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
-private:
- bool runOnModule(Module &M) override {
- createProfileFileNameVar(M, InstrProfileOutput);
+// Return a string describing the branch condition that can be
+// used in static branch probability heuristics:
+static std::string getBranchCondString(Instruction *TI) {
+ BranchInst *BI = dyn_cast<BranchInst>(TI);
+ if (!BI || !BI->isConditional())
+ return std::string();
+
+ Value *Cond = BI->getCondition();
+ ICmpInst *CI = dyn_cast<ICmpInst>(Cond);
+ if (!CI)
+ return std::string();
+
+ std::string result;
+ raw_string_ostream OS(result);
+ OS << CmpInst::getPredicateName(CI->getPredicate()) << "_";
+ CI->getOperand(0)->getType()->print(OS, true);
+
+ Value *RHS = CI->getOperand(1);
+ ConstantInt *CV = dyn_cast<ConstantInt>(RHS);
+ if (CV) {
+ if (CV->isZero())
+ OS << "_Zero";
+ else if (CV->isOne())
+ OS << "_One";
+ else if (CV->isMinusOne())
+ OS << "_MinusOne";
+ else
+ OS << "_Const";
+ }
+ OS.flush();
+ return result;
+}
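
For orientation, the string this produces for a branch on `x == 0` with a 32-bit operand would be along the lines of "eq_i32_Zero"; the snippet below assembles that name by hand, since the real helper is file-local and takes IR objects rather than plain strings.

#include <cstdio>
#include <string>

int main() {
  std::string Pred = "eq";       // CmpInst::getPredicateName(ICMP_EQ)
  std::string Ty = "i32";        // printed type of the compare's LHS
  std::string Suffix = "_Zero";  // RHS is the constant 0
  std::string Name = Pred + "_" + Ty + Suffix;
  std::printf("%s\n", Name.c_str());  // eq_i32_Zero
}
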
+
+static const char *ValueProfKindDescr[] = {
+#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr,
+#include "llvm/ProfileData/InstrProfData.inc"
+};
+
+namespace {
+
+/// The select instruction visitor plays three roles specified
+/// by the mode. In \c VM_counting mode, it simply counts the number of
+/// select instructions. In \c VM_instrument mode, it inserts code to count
+/// the number of times the TrueValue of a select is taken. In \c VM_annotate
+/// mode, it reads the profile data and annotates the select instruction with
+/// metadata.
+enum VisitMode { VM_counting, VM_instrument, VM_annotate };
+class PGOUseFunc;
+
+/// Instruction Visitor class to visit select instructions.
+struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
+ Function &F;
+ unsigned NSIs = 0; // Number of select instructions instrumented.
+ VisitMode Mode = VM_counting; // Visiting mode.
+ unsigned *CurCtrIdx = nullptr; // Pointer to current counter index.
+ unsigned TotalNumCtrs = 0; // Total number of counters
+ GlobalVariable *FuncNameVar = nullptr;
+ uint64_t FuncHash = 0;
+ PGOUseFunc *UseFunc = nullptr;
+
+ SelectInstVisitor(Function &Func) : F(Func) {}
+
+ void countSelects(Function &Func) {
+ NSIs = 0;
+ Mode = VM_counting;
+ visit(Func);
+ }
+
+ // Visit the IR stream and instrument all select instructions. \p
+ // Ind is a pointer to the counter index variable; \p TotalNC
+ // is the total number of counters; \p FNV is the pointer to the
+ // PGO function name var; \p FHash is the function hash.
+ void instrumentSelects(Function &Func, unsigned *Ind, unsigned TotalNC,
+ GlobalVariable *FNV, uint64_t FHash) {
+ Mode = VM_instrument;
+ CurCtrIdx = Ind;
+ TotalNumCtrs = TotalNC;
+ FuncHash = FHash;
+ FuncNameVar = FNV;
+ visit(Func);
+ }
+
+ // Visit the IR stream and annotate all select instructions.
+ void annotateSelects(Function &Func, PGOUseFunc *UF, unsigned *Ind) {
+ Mode = VM_annotate;
+ UseFunc = UF;
+ CurCtrIdx = Ind;
+ visit(Func);
+ }
+
+ void instrumentOneSelectInst(SelectInst &SI);
+ void annotateOneSelectInst(SelectInst &SI);
+
+ // Visit \p SI instruction and perform tasks according to visit mode.
+ void visitSelectInst(SelectInst &SI);
+
+  // Return the number of select instructions. This needs to be called after
+ // countSelects().
+ unsigned getNumOfSelectInsts() const { return NSIs; }
+};
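
A sketch of how the three modes are driven in sequence by the surrounding passes. It assumes the context of this file (SelectInstVisitor is file-local and `using namespace llvm;` is in effect); the counter values are placeholders, and in the real passes this sequencing lives in FuncPGOInstrumentation and PGOUseFunc rather than in a free function.

// Hypothetical driver, placeholder counter bookkeeping only.
static void sketchDriveSelectVisitor(Function &F, GlobalVariable *FuncNameVar,
                                     uint64_t FuncHash) {
  SelectInstVisitor SIVisitor(F);
  SIVisitor.countSelects(F);                      // VM_counting pass
  unsigned NumSelects = SIVisitor.getNumOfSelectInsts();
  unsigned CurCtrIdx = 0;                         // placeholder starting index
  unsigned TotalNumCtrs = NumSelects + 1;         // placeholder counter total
  SIVisitor.instrumentSelects(F, &CurCtrIdx, TotalNumCtrs, FuncNameVar,
                              FuncHash);          // VM_instrument pass
}
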
+
+
+class PGOInstrumentationGenLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ PGOInstrumentationGenLegacyPass(bool IsCS = false)
+ : ModulePass(ID), IsCS(IsCS) {
+ initializePGOInstrumentationGenLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationGenPass"; }
+
+private:
+  // Whether this is context-sensitive instrumentation.
+ bool IsCS;
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+class PGOInstrumentationUseLegacyPass : public ModulePass {
+public:
+ static char ID;
+
+ // Provide the profile filename as the parameter.
+ PGOInstrumentationUseLegacyPass(std::string Filename = "", bool IsCS = false)
+ : ModulePass(ID), ProfileFileName(std::move(Filename)), IsCS(IsCS) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+ initializePGOInstrumentationUseLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOInstrumentationUsePass"; }
+
+private:
+ std::string ProfileFileName;
+  // Whether this is context-sensitive instrumentation use.
+ bool IsCS;
+
+ bool runOnModule(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+class PGOInstrumentationGenCreateVarLegacyPass : public ModulePass {
+public:
+ static char ID;
+ StringRef getPassName() const override {
+ return "PGOInstrumentationGenCreateVarPass";
+ }
+ PGOInstrumentationGenCreateVarLegacyPass(std::string CSInstrName = "")
+ : ModulePass(ID), InstrProfileOutput(CSInstrName) {
+ initializePGOInstrumentationGenCreateVarLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+private:
+ bool runOnModule(Module &M) override {
+ createProfileFileNameVar(M, InstrProfileOutput);
createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
- return false;
- }
- std::string InstrProfileOutput;
-};
-
-} // end anonymous namespace
-
-char PGOInstrumentationGenLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
- "PGO instrumentation.", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
- "PGO instrumentation.", false, false)
-
-ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) {
- return new PGOInstrumentationGenLegacyPass(IsCS);
-}
-
-char PGOInstrumentationUseLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
- "Read PGO instrumentation profile.", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
- "Read PGO instrumentation profile.", false, false)
-
-ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename,
- bool IsCS) {
- return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS);
-}
-
-char PGOInstrumentationGenCreateVarLegacyPass::ID = 0;
-
-INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
- "pgo-instr-gen-create-var",
- "Create PGO instrumentation version variable for CSPGO.", false,
- false)
-
-ModulePass *
-llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
- return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName));
-}
-
-namespace {
-
-/// An MST based instrumentation for PGO
-///
-/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
-/// at the function level.
-struct PGOEdge {
- // This class implements the CFG edges. Note the CFG can be a multi-graph.
- // So there might be multiple edges with the same SrcBB and DestBB.
- const BasicBlock *SrcBB;
- const BasicBlock *DestBB;
- uint64_t Weight;
- bool InMST = false;
- bool Removed = false;
- bool IsCritical = false;
-
- PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
- : SrcBB(Src), DestBB(Dest), Weight(W) {}
-
- // Return the information string of an edge.
- const std::string infoString() const {
- return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
- (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
- }
-};
-
-// This class stores the auxiliary information for each BB.
-struct BBInfo {
- BBInfo *Group;
- uint32_t Index;
- uint32_t Rank = 0;
-
- BBInfo(unsigned IX) : Group(this), Index(IX) {}
-
- // Return the information string of this object.
- const std::string infoString() const {
- return (Twine("Index=") + Twine(Index)).str();
- }
-
- // Empty function -- only applicable to UseBBInfo.
- void addOutEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
-
- // Empty function -- only applicable to UseBBInfo.
- void addInEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
-};
-
-// This class implements the MST-based instrumentation for a single function.
-template <class Edge, class BBInfo> class FuncPGOInstrumentation {
-private:
- Function &F;
-
- // Is this context-sensitive instrumentation.
- bool IsCS;
-
- // A map that stores the Comdat group in function F.
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
-
- ValueProfileCollector VPC;
-
- void computeCFGHash();
- void renameComdatFunction();
-
-public:
- std::vector<std::vector<VPCandidateInfo>> ValueSites;
- SelectInstVisitor SIVisitor;
- std::string FuncName;
- GlobalVariable *FuncNameVar;
-
- // CFG hash value for this function.
- uint64_t FunctionHash = 0;
-
- // The Minimum Spanning Tree of function CFG.
- CFGMST<Edge, BBInfo> MST;
-
- // Collect all the BBs that will be instrumented, and store them in
- // InstrumentBBs.
- void getInstrumentBBs(std::vector<BasicBlock *> &InstrumentBBs);
-
- // Given an edge, find the BB that will be instrumented.
- // Return nullptr if there is no BB to be instrumented.
- BasicBlock *getInstrBB(Edge *E);
-
- // Return the auxiliary BB information.
- BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
-
- // Return the auxiliary BB information if available.
- BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
-
- // Dump edges and BB information.
- void dumpInfo(std::string Str = "") const {
- MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
- Twine(FunctionHash) + "\t" + Str);
- }
-
- FuncPGOInstrumentation(
- Function &Func, TargetLibraryInfo &TLI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
+ return false;
+ }
+ std::string InstrProfileOutput;
+};
+
+} // end anonymous namespace
+
+char PGOInstrumentationGenLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
+ "PGO instrumentation.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationGenLegacyPass(bool IsCS) {
+ return new PGOInstrumentationGenLegacyPass(IsCS);
+}
+
+char PGOInstrumentationUseLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOInstrumentationUseLegacyPass, "pgo-instr-use",
+ "Read PGO instrumentation profile.", false, false)
+
+ModulePass *llvm::createPGOInstrumentationUseLegacyPass(StringRef Filename,
+ bool IsCS) {
+ return new PGOInstrumentationUseLegacyPass(Filename.str(), IsCS);
+}
+
+char PGOInstrumentationGenCreateVarLegacyPass::ID = 0;
+
+INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
+ "pgo-instr-gen-create-var",
+ "Create PGO instrumentation version variable for CSPGO.", false,
+ false)
+
+ModulePass *
+llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
+ return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName));
+}
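+
+// Sketch of scheduling these legacy passes by hand (illustrative only; in
+// practice PassManagerBuilder adds them, and the profile path below is a
+// placeholder):
+//   llvm::legacy::PassManager PM;
+//   PM.add(llvm::createPGOInstrumentationGenLegacyPass(/*IsCS=*/false));
+//   // or, for the use phase:
+//   PM.add(llvm::createPGOInstrumentationUseLegacyPass("default.profdata",
+//                                                      /*IsCS=*/false));
+//   PM.run(M);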
+
+namespace {
+
+/// An MST based instrumentation for PGO
+///
+/// Implements a Minimum Spanning Tree (MST) based instrumentation for PGO
+/// at the function level.
+struct PGOEdge {
+ // This class implements the CFG edges. Note the CFG can be a multi-graph.
+ // So there might be multiple edges with the same SrcBB and DestBB.
+ const BasicBlock *SrcBB;
+ const BasicBlock *DestBB;
+ uint64_t Weight;
+ bool InMST = false;
+ bool Removed = false;
+ bool IsCritical = false;
+
+ PGOEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+ : SrcBB(Src), DestBB(Dest), Weight(W) {}
+
+ // Return the information string of an edge.
+ const std::string infoString() const {
+ return (Twine(Removed ? "-" : " ") + (InMST ? " " : "*") +
+ (IsCritical ? "c" : " ") + " W=" + Twine(Weight)).str();
+ }
+};
+
+// This class stores the auxiliary information for each BB.
+struct BBInfo {
+ BBInfo *Group;
+ uint32_t Index;
+ uint32_t Rank = 0;
+
+ BBInfo(unsigned IX) : Group(this), Index(IX) {}
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ return (Twine("Index=") + Twine(Index)).str();
+ }
+
+ // Empty function -- only applicable to UseBBInfo.
+ void addOutEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+
+ // Empty function -- only applicable to UseBBInfo.
+ void addInEdge(PGOEdge *E LLVM_ATTRIBUTE_UNUSED) {}
+};
+
+// This class implements the MST-based instrumentation for a single function.
+template <class Edge, class BBInfo> class FuncPGOInstrumentation {
+private:
+ Function &F;
+
+ // Is this context-sensitive instrumentation.
+ bool IsCS;
+
+ // A map that stores the Comdat group in function F.
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
+
+ ValueProfileCollector VPC;
+
+ void computeCFGHash();
+ void renameComdatFunction();
+
+public:
+ std::vector<std::vector<VPCandidateInfo>> ValueSites;
+ SelectInstVisitor SIVisitor;
+ std::string FuncName;
+ GlobalVariable *FuncNameVar;
+
+ // CFG hash value for this function.
+ uint64_t FunctionHash = 0;
+
+ // The Minimum Spanning Tree of function CFG.
+ CFGMST<Edge, BBInfo> MST;
+
+ // Collect all the BBs that will be instrumented, and store them in
+ // InstrumentBBs.
+ void getInstrumentBBs(std::vector<BasicBlock *> &InstrumentBBs);
+
+ // Given an edge, find the BB that will be instrumented.
+ // Return nullptr if there is no BB to be instrumented.
+ BasicBlock *getInstrBB(Edge *E);
+
+ // Return the auxiliary BB information.
+ BBInfo &getBBInfo(const BasicBlock *BB) const { return MST.getBBInfo(BB); }
+
+ // Return the auxiliary BB information if available.
+ BBInfo *findBBInfo(const BasicBlock *BB) const { return MST.findBBInfo(BB); }
+
+ // Dump edges and BB information.
+ void dumpInfo(std::string Str = "") const {
+ MST.dumpEdges(dbgs(), Twine("Dump Function ") + FuncName + " Hash: " +
+ Twine(FunctionHash) + "\t" + Str);
+ }
+
+ FuncPGOInstrumentation(
+ Function &Func, TargetLibraryInfo &TLI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr, bool IsCS = false,
bool InstrumentFuncEntry = true)
- : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
+ : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
ValueSites(IPVK_Last + 1), SIVisitor(Func),
MST(F, InstrumentFuncEntry, BPI, BFI) {
- // This should be done before CFG hash computation.
- SIVisitor.countSelects(Func);
- ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize);
- if (!IsCS) {
- NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
- NumOfPGOBB += MST.BBInfos.size();
- ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
- } else {
- NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
- NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
- NumOfCSPGOBB += MST.BBInfos.size();
- }
-
- FuncName = getPGOFuncName(F);
- computeCFGHash();
- if (!ComdatMembers.empty())
- renameComdatFunction();
- LLVM_DEBUG(dumpInfo("after CFGMST"));
-
- for (auto &E : MST.AllEdges) {
- if (E->Removed)
- continue;
- IsCS ? NumOfCSPGOEdge++ : NumOfPGOEdge++;
- if (!E->InMST)
- IsCS ? NumOfCSPGOInstrument++ : NumOfPGOInstrument++;
- }
-
- if (CreateGlobalVar)
- FuncNameVar = createPGOFuncNameVar(F, FuncName);
- }
-};
-
-} // end anonymous namespace
-
-// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
+ // This should be done before CFG hash computation.
+ SIVisitor.countSelects(Func);
+ ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize);
+ if (!IsCS) {
+ NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+ NumOfPGOBB += MST.BBInfos.size();
+ ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget);
+ } else {
+ NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
+ NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size();
+ NumOfCSPGOBB += MST.BBInfos.size();
+ }
+
+ FuncName = getPGOFuncName(F);
+ computeCFGHash();
+ if (!ComdatMembers.empty())
+ renameComdatFunction();
+ LLVM_DEBUG(dumpInfo("after CFGMST"));
+
+ for (auto &E : MST.AllEdges) {
+ if (E->Removed)
+ continue;
+ IsCS ? NumOfCSPGOEdge++ : NumOfPGOEdge++;
+ if (!E->InMST)
+ IsCS ? NumOfCSPGOInstrument++ : NumOfPGOInstrument++;
+ }
+
+ if (CreateGlobalVar)
+ FuncNameVar = createPGOFuncNameVar(F, FuncName);
+ }
+};
+
+} // end anonymous namespace
+
+// Compute Hash value for the CFG: the lower 32 bits are CRC32 of the index
// value of each BB in the CFG. The higher 32 bits are the CRC32 of the numbers
// of selects, indirect calls, mem ops and edges.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
- std::vector<uint8_t> Indexes;
- JamCRC JC;
- for (auto &BB : F) {
- const Instruction *TI = BB.getTerminator();
- for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
- BasicBlock *Succ = TI->getSuccessor(I);
- auto BI = findBBInfo(Succ);
- if (BI == nullptr)
- continue;
- uint32_t Index = BI->Index;
- for (int J = 0; J < 4; J++)
- Indexes.push_back((uint8_t)(Index >> (J * 8)));
- }
- }
- JC.update(Indexes);
-
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
+ std::vector<uint8_t> Indexes;
+ JamCRC JC;
+ for (auto &BB : F) {
+ const Instruction *TI = BB.getTerminator();
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+ BasicBlock *Succ = TI->getSuccessor(I);
+ auto BI = findBBInfo(Succ);
+ if (BI == nullptr)
+ continue;
+ uint32_t Index = BI->Index;
+ for (int J = 0; J < 4; J++)
+ Indexes.push_back((uint8_t)(Index >> (J * 8)));
+ }
+ }
+ JC.update(Indexes);
+
JamCRC JCH;
if (PGOOldCFGHashing) {
// Hash format for context sensitive profile. Reserve 4 bits for other
@@ -693,956 +693,956 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
FunctionHash = (((uint64_t)JCH.getCRC()) << 28) + JC.getCRC();
}
- // Reserve bits 60-63 for other information purposes.
- FunctionHash &= 0x0FFFFFFFFFFFFFFF;
- if (IsCS)
- NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
- LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
- << " CRC = " << JC.getCRC()
- << ", Selects = " << SIVisitor.getNumOfSelectInsts()
- << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
+ // Reserve bits 60-63 for other information purposes.
+ FunctionHash &= 0x0FFFFFFFFFFFFFFF;
+ if (IsCS)
+ NamedInstrProfRecord::setCSFlagInHash(FunctionHash);
+ LLVM_DEBUG(dbgs() << "Function Hash Computation for " << F.getName() << ":\n"
+ << " CRC = " << JC.getCRC()
+ << ", Selects = " << SIVisitor.getNumOfSelectInsts()
+ << ", Edges = " << MST.AllEdges.size() << ", ICSites = "
<< ValueSites[IPVK_IndirectCallTarget].size());
if (!PGOOldCFGHashing) {
LLVM_DEBUG(dbgs() << ", Memops = " << ValueSites[IPVK_MemOPSize].size()
<< ", High32 CRC = " << JCH.getCRC());
}
LLVM_DEBUG(dbgs() << ", Hash = " << FunctionHash << "\n";);
-}
-
-// Check if we can safely rename this Comdat function.
-static bool canRenameComdat(
- Function &F,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
- if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
- return false;
-
- // FIXME: Currently we only handle Comdat groups that contain only one
+}
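+
+// Worked example (illustrative values only): with the new hashing scheme, if
+// JCH.getCRC() == 0x12345 and JC.getCRC() == 0x678, then
+//   FunctionHash = (0x12345 << 28) + 0x678 = 0x123450000678,
+// and the 0x0FFFFFFFFFFFFFFF mask leaves it unchanged because bits 60-63 are
+// already zero; for CS instrumentation the CS flag is then set on top by
+// NamedInstrProfRecord::setCSFlagInHash.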
+
+// Check if we can safely rename this Comdat function.
+static bool canRenameComdat(
+ Function &F,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming || !canRenameComdatFunc(F, true))
+ return false;
+
+ // FIXME: Currently we only handle Comdat groups that contain only one
// function.
- // (1) For a Comdat group containing multiple functions, we need to have a
- // unique postfix based on the hashes for each function. There is a
- // non-trivial code refactoring to do this efficiently.
- // (2) Variables can not be renamed, so we can not rename Comdat function in a
- // group including global vars.
- Comdat *C = F.getComdat();
- for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
+ // (1) For a Comdat group containing multiple functions, we need to have a
+ // unique postfix based on the hashes for each function. There is a
+ // non-trivial code refactoring to do this efficiently.
+ // (2) Variables can not be renamed, so we can not rename Comdat function in a
+ // group including global vars.
+ Comdat *C = F.getComdat();
+ for (auto &&CM : make_range(ComdatMembers.equal_range(C))) {
assert(!isa<GlobalAlias>(CM.second));
- Function *FM = dyn_cast<Function>(CM.second);
- if (FM != &F)
- return false;
- }
- return true;
-}
-
-// Append the CFGHash to the Comdat function name.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
- if (!canRenameComdat(F, ComdatMembers))
- return;
- std::string OrigName = F.getName().str();
- std::string NewFuncName =
- Twine(F.getName() + "." + Twine(FunctionHash)).str();
- F.setName(Twine(NewFuncName));
- GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
- FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
- Comdat *NewComdat;
- Module *M = F.getParent();
- // For AvailableExternallyLinkage functions, change the linkage to
- // LinkOnceODR and put them into comdat. This is because after renaming, there
- // is no backup external copy available for the function.
- if (!F.hasComdat()) {
- assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
- NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
- F.setLinkage(GlobalValue::LinkOnceODRLinkage);
- F.setComdat(NewComdat);
- return;
- }
-
- // This function belongs to a single function Comdat group.
- Comdat *OrigComdat = F.getComdat();
- std::string NewComdatName =
- Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
- NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
- NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
-
- for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
- // Must be a function.
+ Function *FM = dyn_cast<Function>(CM.second);
+ if (FM != &F)
+ return false;
+ }
+ return true;
+}
+
+// Append the CFGHash to the Comdat function name.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::renameComdatFunction() {
+ if (!canRenameComdat(F, ComdatMembers))
+ return;
+ std::string OrigName = F.getName().str();
+ std::string NewFuncName =
+ Twine(F.getName() + "." + Twine(FunctionHash)).str();
+ F.setName(Twine(NewFuncName));
+ GlobalAlias::create(GlobalValue::WeakAnyLinkage, OrigName, &F);
+ FuncName = Twine(FuncName + "." + Twine(FunctionHash)).str();
+ Comdat *NewComdat;
+ Module *M = F.getParent();
+ // For AvailableExternallyLinkage functions, change the linkage to
+ // LinkOnceODR and put them into comdat. This is because after renaming, there
+ // is no backup external copy available for the function.
+ if (!F.hasComdat()) {
+ assert(F.getLinkage() == GlobalValue::AvailableExternallyLinkage);
+ NewComdat = M->getOrInsertComdat(StringRef(NewFuncName));
+ F.setLinkage(GlobalValue::LinkOnceODRLinkage);
+ F.setComdat(NewComdat);
+ return;
+ }
+
+ // This function belongs to a single function Comdat group.
+ Comdat *OrigComdat = F.getComdat();
+ std::string NewComdatName =
+ Twine(OrigComdat->getName() + "." + Twine(FunctionHash)).str();
+ NewComdat = M->getOrInsertComdat(StringRef(NewComdatName));
+ NewComdat->setSelectionKind(OrigComdat->getSelectionKind());
+
+ for (auto &&CM : make_range(ComdatMembers.equal_range(OrigComdat))) {
+ // Must be a function.
cast<Function>(CM.second)->setComdat(NewComdat);
- }
-}
-
-// Collect all the BBs that will be instrumented and return them in
-// InstrumentBBs, and set up InEdges/OutEdges for UseBBInfo.
-template <class Edge, class BBInfo>
-void FuncPGOInstrumentation<Edge, BBInfo>::getInstrumentBBs(
- std::vector<BasicBlock *> &InstrumentBBs) {
- // Use a worklist as we will update the vector during the iteration.
- std::vector<Edge *> EdgeList;
- EdgeList.reserve(MST.AllEdges.size());
- for (auto &E : MST.AllEdges)
- EdgeList.push_back(E.get());
-
- for (auto &E : EdgeList) {
- BasicBlock *InstrBB = getInstrBB(E);
- if (InstrBB)
- InstrumentBBs.push_back(InstrBB);
- }
-
- // Set up InEdges/OutEdges for all BBs.
- for (auto &E : MST.AllEdges) {
- if (E->Removed)
- continue;
- const BasicBlock *SrcBB = E->SrcBB;
- const BasicBlock *DestBB = E->DestBB;
- BBInfo &SrcInfo = getBBInfo(SrcBB);
- BBInfo &DestInfo = getBBInfo(DestBB);
- SrcInfo.addOutEdge(E.get());
- DestInfo.addInEdge(E.get());
- }
-}
-
-// Given a CFG edge E to be instrumented, find the BB in which to place the
-// instrumented code. The function will split the critical edge if necessary.
-template <class Edge, class BBInfo>
-BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
- if (E->InMST || E->Removed)
- return nullptr;
-
- BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
- BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
- // For a fake edge, instrument the real BB.
- if (SrcBB == nullptr)
- return DestBB;
- if (DestBB == nullptr)
- return SrcBB;
-
- auto canInstrument = [](BasicBlock *BB) -> BasicBlock * {
- // There are basic blocks (such as catchswitch) that cannot be instrumented.
- // If the returned first insertion point is the end of BB, skip this BB.
- if (BB->getFirstInsertionPt() == BB->end())
- return nullptr;
- return BB;
- };
-
- // Instrument the SrcBB if it has a single successor,
- // otherwise, the DestBB if this is not a critical edge.
- Instruction *TI = SrcBB->getTerminator();
- if (TI->getNumSuccessors() <= 1)
- return canInstrument(SrcBB);
- if (!E->IsCritical)
- return canInstrument(DestBB);
-
+ }
+}
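+
+// Example (hypothetical name/hash): a comdat function "foo" with FunctionHash
+// 123 is renamed to "foo.123", a weak alias "foo" pointing at it is created,
+// and its single-function comdat group is renamed with the same ".123"
+// suffix, keeping the original selection kind.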
+
+// Collect all the BBs that will be instrumented and return them in
+// InstrumentBBs, and set up InEdges/OutEdges for UseBBInfo.
+template <class Edge, class BBInfo>
+void FuncPGOInstrumentation<Edge, BBInfo>::getInstrumentBBs(
+ std::vector<BasicBlock *> &InstrumentBBs) {
+ // Use a worklist as we will update the vector during the iteration.
+ std::vector<Edge *> EdgeList;
+ EdgeList.reserve(MST.AllEdges.size());
+ for (auto &E : MST.AllEdges)
+ EdgeList.push_back(E.get());
+
+ for (auto &E : EdgeList) {
+ BasicBlock *InstrBB = getInstrBB(E);
+ if (InstrBB)
+ InstrumentBBs.push_back(InstrBB);
+ }
+
+ // Set up InEdges/OutEdges for all BBs.
+ for (auto &E : MST.AllEdges) {
+ if (E->Removed)
+ continue;
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ BBInfo &SrcInfo = getBBInfo(SrcBB);
+ BBInfo &DestInfo = getBBInfo(DestBB);
+ SrcInfo.addOutEdge(E.get());
+ DestInfo.addInEdge(E.get());
+ }
+}
+
+// Given a CFG edge E to be instrumented, find the BB in which to place the
+// instrumented code. The function will split the critical edge if necessary.
+template <class Edge, class BBInfo>
+BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
+ if (E->InMST || E->Removed)
+ return nullptr;
+
+ BasicBlock *SrcBB = const_cast<BasicBlock *>(E->SrcBB);
+ BasicBlock *DestBB = const_cast<BasicBlock *>(E->DestBB);
+ // For a fake edge, instrument the real BB.
+ if (SrcBB == nullptr)
+ return DestBB;
+ if (DestBB == nullptr)
+ return SrcBB;
+
+ auto canInstrument = [](BasicBlock *BB) -> BasicBlock * {
+ // There are basic blocks (such as catchswitch) that cannot be instrumented.
+ // If the returned first insertion point is the end of BB, skip this BB.
+ if (BB->getFirstInsertionPt() == BB->end())
+ return nullptr;
+ return BB;
+ };
+
+ // Instrument the SrcBB if it has a single successor,
+ // otherwise, the DestBB if this is not a critical edge.
+ Instruction *TI = SrcBB->getTerminator();
+ if (TI->getNumSuccessors() <= 1)
+ return canInstrument(SrcBB);
+ if (!E->IsCritical)
+ return canInstrument(DestBB);
+
// Some IndirectBr critical edges cannot be split by the previous
// SplitIndirectBrCriticalEdges call. Bail out.
- unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
BasicBlock *InstrBB =
isa<IndirectBrInst>(TI) ? nullptr : SplitCriticalEdge(TI, SuccNum);
- if (!InstrBB) {
- LLVM_DEBUG(
- dbgs() << "Fail to split critical edge: not instrument this edge.\n");
- return nullptr;
- }
- // For a critical edge, we have to split. Instrument the newly
- // created BB.
- IsCS ? NumOfCSPGOSplit++ : NumOfPGOSplit++;
- LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
- << " --> " << getBBInfo(DestBB).Index << "\n");
- // Need to add two new edges. First one: Add new edge of SrcBB->InstrBB.
- MST.addEdge(SrcBB, InstrBB, 0);
- // Second one: Add new edge of InstrBB->DestBB.
- Edge &NewEdge1 = MST.addEdge(InstrBB, DestBB, 0);
- NewEdge1.InMST = true;
- E->Removed = true;
-
- return canInstrument(InstrBB);
-}
-
-// When generating value profiling calls on Windows routines that make use of
-// handler funclets for exception processing, an operand bundle needs to be attached
-// to the called function. This routine will set \p OpBundles to contain the
-// funclet information, if any is needed, that should be placed on the generated
-// value profiling call for the value profile candidate call.
-static void
-populateEHOperandBundle(VPCandidateInfo &Cand,
- DenseMap<BasicBlock *, ColorVector> &BlockColors,
- SmallVectorImpl<OperandBundleDef> &OpBundles) {
- auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
- if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
- // The instrumentation call should belong to the same funclet as a
- // non-intrinsic call, so just copy the operand bundle, if any exists.
- Optional<OperandBundleUse> ParentFunclet =
- OrigCall->getOperandBundle(LLVMContext::OB_funclet);
- if (ParentFunclet)
- OpBundles.emplace_back(OperandBundleDef(*ParentFunclet));
- } else {
- // Intrinsics or other instructions do not get funclet information from the
- // front-end. Need to use the BlockColors that was computed by the routine
- // colorEHFunclets to determine whether a funclet is needed.
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
- }
-}
-
-// Visit all edges and instrument those not in the MST, and do value profiling.
-// Critical edges will be split.
-static void instrumentOneFunc(
- Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
- BlockFrequencyInfo *BFI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- bool IsCS) {
- // Split indirectbr critical edges here before computing the MST rather than
- // later in getInstrBB() to avoid invalidating it.
- SplitIndirectBrCriticalEdges(F, BPI, BFI);
-
+ if (!InstrBB) {
+ LLVM_DEBUG(
+ dbgs() << "Fail to split critical edge: not instrument this edge.\n");
+ return nullptr;
+ }
+ // For a critical edge, we have to split. Instrument the newly
+ // created BB.
+ IsCS ? NumOfCSPGOSplit++ : NumOfPGOSplit++;
+ LLVM_DEBUG(dbgs() << "Split critical edge: " << getBBInfo(SrcBB).Index
+ << " --> " << getBBInfo(DestBB).Index << "\n");
+ // Need to add two new edges. First one: Add new edge of SrcBB->InstrBB.
+ MST.addEdge(SrcBB, InstrBB, 0);
+ // Second one: Add new edge of InstrBB->DestBB.
+ Edge &NewEdge1 = MST.addEdge(InstrBB, DestBB, 0);
+ NewEdge1.InMST = true;
+ E->Removed = true;
+
+ return canInstrument(InstrBB);
+}
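+
+// In summary, for an edge A->B: if A has a single successor, the counter goes
+// into A; otherwise, if the edge is not critical, it goes into B; otherwise
+// the edge is split and the counter goes into the newly created block (unless
+// splitting fails, in which case the edge is skipped).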
+
+// When generating value profiling calls on Windows routines that make use of
+// handler funclets for exception processing, an operand bundle needs to be attached
+// to the called function. This routine will set \p OpBundles to contain the
+// funclet information, if any is needed, that should be placed on the generated
+// value profiling call for the value profile candidate call.
+static void
+populateEHOperandBundle(VPCandidateInfo &Cand,
+ DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ SmallVectorImpl<OperandBundleDef> &OpBundles) {
+ auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
+ if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
+ // The instrumentation call should belong to the same funclet as a
+ // non-intrinsic call, so just copy the operand bundle, if any exists.
+ Optional<OperandBundleUse> ParentFunclet =
+ OrigCall->getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ OpBundles.emplace_back(OperandBundleDef(*ParentFunclet));
+ } else {
+ // Intrinsics or other instructions do not get funclet information from the
+ // front-end. Need to use the BlockColors that was computed by the routine
+ // colorEHFunclets to determine whether a funclet is needed.
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+ }
+}
+
+// Visit all edges and instrument those not in the MST, and do value profiling.
+// Critical edges will be split.
+static void instrumentOneFunc(
+ Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
+ BlockFrequencyInfo *BFI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ bool IsCS) {
+ // Split indirectbr critical edges here before computing the MST rather than
+ // later in getInstrBB() to avoid invalidating it.
+ SplitIndirectBrCriticalEdges(F, BPI, BFI);
+
FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(
F, TLI, ComdatMembers, true, BPI, BFI, IsCS, PGOInstrumentEntry);
- std::vector<BasicBlock *> InstrumentBBs;
- FuncInfo.getInstrumentBBs(InstrumentBBs);
- unsigned NumCounters =
- InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
-
- uint32_t I = 0;
- Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
- for (auto *InstrBB : InstrumentBBs) {
- IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
- assert(Builder.GetInsertPoint() != InstrBB->end() &&
- "Cannot get the Instrumentation point");
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
- {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
- Builder.getInt32(I++)});
- }
-
- // Now instrument select instructions:
- FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
- FuncInfo.FunctionHash);
- assert(I == NumCounters);
-
- if (DisableValueProfiling)
- return;
-
- NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
-
- // Intrinsic function calls do not have funclet operand bundles needed for
- // Windows exception handling attached to them. However, if value profiling is
- // inserted for one of these calls, then a funclet value will need to be set
- // on the instrumentation call based on the funclet coloring.
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- // For each VP Kind, walk the VP candidates and instrument each one.
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
- unsigned SiteIndex = 0;
- if (Kind == IPVK_MemOPSize && !PGOInstrMemOP)
- continue;
-
- for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) {
- LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind]
- << " site: CallSite Index = " << SiteIndex << "\n");
-
- IRBuilder<> Builder(Cand.InsertPt);
- assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() &&
- "Cannot get the Instrumentation point");
-
- Value *ToProfile = nullptr;
- if (Cand.V->getType()->isIntegerTy())
- ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty());
- else if (Cand.V->getType()->isPointerTy())
- ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
- assert(ToProfile && "value profiling Value is of unexpected type");
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- populateEHOperandBundle(Cand, BlockColors, OpBundles);
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
- {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
- Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
- OpBundles);
- }
- } // IPVK_First <= Kind <= IPVK_Last
-}
-
-namespace {
-
-// This class represents a CFG edge in profile use compilation.
-struct PGOUseEdge : public PGOEdge {
- bool CountValid = false;
- uint64_t CountValue = 0;
-
- PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
- : PGOEdge(Src, Dest, W) {}
-
- // Set edge count value
- void setEdgeCount(uint64_t Value) {
- CountValue = Value;
- CountValid = true;
- }
-
- // Return the information string for this object.
- const std::string infoString() const {
- if (!CountValid)
- return PGOEdge::infoString();
- return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
- .str();
- }
-};
-
-using DirectEdges = SmallVector<PGOUseEdge *, 2>;
-
-// This class stores the auxiliary information for each BB.
-struct UseBBInfo : public BBInfo {
- uint64_t CountValue = 0;
- bool CountValid;
- int32_t UnknownCountInEdge = 0;
- int32_t UnknownCountOutEdge = 0;
- DirectEdges InEdges;
- DirectEdges OutEdges;
-
- UseBBInfo(unsigned IX) : BBInfo(IX), CountValid(false) {}
-
- UseBBInfo(unsigned IX, uint64_t C)
- : BBInfo(IX), CountValue(C), CountValid(true) {}
-
- // Set the profile count value for this BB.
- void setBBInfoCount(uint64_t Value) {
- CountValue = Value;
- CountValid = true;
- }
-
- // Return the information string of this object.
- const std::string infoString() const {
- if (!CountValid)
- return BBInfo::infoString();
- return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
- }
-
- // Add an OutEdge and update the edge count.
- void addOutEdge(PGOUseEdge *E) {
- OutEdges.push_back(E);
- UnknownCountOutEdge++;
- }
-
- // Add an InEdge and update the edge count.
- void addInEdge(PGOUseEdge *E) {
- InEdges.push_back(E);
- UnknownCountInEdge++;
- }
-};
-
-} // end anonymous namespace
-
-// Sum up the count values for all the edges.
-static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
- uint64_t Total = 0;
- for (auto &E : Edges) {
- if (E->Removed)
- continue;
- Total += E->CountValue;
- }
- return Total;
-}
-
-namespace {
-
-class PGOUseFunc {
-public:
- PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
- BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
+ std::vector<BasicBlock *> InstrumentBBs;
+ FuncInfo.getInstrumentBBs(InstrumentBBs);
+ unsigned NumCounters =
+ InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+
+ uint32_t I = 0;
+ Type *I8PtrTy = Type::getInt8PtrTy(M->getContext());
+ for (auto *InstrBB : InstrumentBBs) {
+ IRBuilder<> Builder(InstrBB, InstrBB->getFirstInsertionPt());
+ assert(Builder.GetInsertPoint() != InstrBB->end() &&
+ "Cannot get the Instrumentation point");
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment),
+ {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash), Builder.getInt32(NumCounters),
+ Builder.getInt32(I++)});
+ }
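+
+ // For reference, each call created above has the documented form of the
+ // increment intrinsic (see LLVM LangRef):
+ //   call void @llvm.instrprof.increment(i8* <name>, i64 <hash>,
+ //                                       i32 <num-counters>, i32 <index>)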
+
+ // Now instrument select instructions:
+ FuncInfo.SIVisitor.instrumentSelects(F, &I, NumCounters, FuncInfo.FuncNameVar,
+ FuncInfo.FunctionHash);
+ assert(I == NumCounters);
+
+ if (DisableValueProfiling)
+ return;
+
+ NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
+
+ // Intrinsic function calls do not have funclet operand bundles needed for
+ // Windows exception handling attached to them. However, if value profiling is
+ // inserted for one of these calls, then a funclet value will need to be set
+ // on the instrumentation call based on the funclet coloring.
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ // For each VP Kind, walk the VP candidates and instrument each one.
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
+ unsigned SiteIndex = 0;
+ if (Kind == IPVK_MemOPSize && !PGOInstrMemOP)
+ continue;
+
+ for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) {
+ LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind]
+ << " site: CallSite Index = " << SiteIndex << "\n");
+
+ IRBuilder<> Builder(Cand.InsertPt);
+ assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() &&
+ "Cannot get the Instrumentation point");
+
+ Value *ToProfile = nullptr;
+ if (Cand.V->getType()->isIntegerTy())
+ ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty());
+ else if (Cand.V->getType()->isPointerTy())
+ ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
+ assert(ToProfile && "value profiling Value is of unexpected type");
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ populateEHOperandBundle(Cand, BlockColors, OpBundles);
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+ {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
+ Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
+ OpBundles);
+ }
+ } // IPVK_First <= Kind <= IPVK_Last
+}
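+
+// For reference, the value-profiling calls built above have the documented
+// form of the intrinsic (see LLVM LangRef):
+//   call void @llvm.instrprof.value.profile(i8* <name>, i64 <hash>,
+//                                           i64 <value>, i32 <value_kind>,
+//                                           i32 <index>)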
+
+namespace {
+
+// This class represents a CFG edge in profile use compilation.
+struct PGOUseEdge : public PGOEdge {
+ bool CountValid = false;
+ uint64_t CountValue = 0;
+
+ PGOUseEdge(const BasicBlock *Src, const BasicBlock *Dest, uint64_t W = 1)
+ : PGOEdge(Src, Dest, W) {}
+
+ // Set edge count value
+ void setEdgeCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string for this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return PGOEdge::infoString();
+ return (Twine(PGOEdge::infoString()) + " Count=" + Twine(CountValue))
+ .str();
+ }
+};
+
+using DirectEdges = SmallVector<PGOUseEdge *, 2>;
+
+// This class stores the auxiliary information for each BB.
+struct UseBBInfo : public BBInfo {
+ uint64_t CountValue = 0;
+ bool CountValid;
+ int32_t UnknownCountInEdge = 0;
+ int32_t UnknownCountOutEdge = 0;
+ DirectEdges InEdges;
+ DirectEdges OutEdges;
+
+ UseBBInfo(unsigned IX) : BBInfo(IX), CountValid(false) {}
+
+ UseBBInfo(unsigned IX, uint64_t C)
+ : BBInfo(IX), CountValue(C), CountValid(true) {}
+
+ // Set the profile count value for this BB.
+ void setBBInfoCount(uint64_t Value) {
+ CountValue = Value;
+ CountValid = true;
+ }
+
+ // Return the information string of this object.
+ const std::string infoString() const {
+ if (!CountValid)
+ return BBInfo::infoString();
+ return (Twine(BBInfo::infoString()) + " Count=" + Twine(CountValue)).str();
+ }
+
+ // Add an OutEdge and update the edge count.
+ void addOutEdge(PGOUseEdge *E) {
+ OutEdges.push_back(E);
+ UnknownCountOutEdge++;
+ }
+
+ // Add an InEdge and update the edge count.
+ void addInEdge(PGOUseEdge *E) {
+ InEdges.push_back(E);
+ UnknownCountInEdge++;
+ }
+};
+
+} // end anonymous namespace
+
+// Sum up the count values for all the edges.
+static uint64_t sumEdgeCount(const ArrayRef<PGOUseEdge *> Edges) {
+ uint64_t Total = 0;
+ for (auto &E : Edges) {
+ if (E->Removed)
+ continue;
+ Total += E->CountValue;
+ }
+ return Total;
+}
+
+namespace {
+
+class PGOUseFunc {
+public:
+ PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
+ BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
ProfileSummaryInfo *PSI, bool IsCS, bool InstrumentFuncEntry)
- : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
+ : F(Func), M(Modu), BFI(BFIin), PSI(PSI),
FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS,
InstrumentFuncEntry),
- FreqAttr(FFA_Normal), IsCS(IsCS) {}
-
- // Read counts for the instrumented BB from profile.
+ FreqAttr(FFA_Normal), IsCS(IsCS) {}
+
+ // Read counts for the instrumented BB from profile.
bool readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
bool &AllMinusOnes);
-
- // Populate the counts for all BBs.
- void populateCounters();
-
- // Set the branch weights based on the count values.
- void setBranchWeights();
-
- // Annotate the value profile call sites for all value kinds.
- void annotateValueSites();
-
- // Annotate the value profile call sites for one value kind.
- void annotateValueSites(uint32_t Kind);
-
- // Annotate the irreducible loop header weights.
- void annotateIrrLoopHeaderWeights();
-
- // The hotness of the function from the profile count.
- enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
-
- // Return the function hotness from the profile.
- FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
-
- // Return the function hash.
- uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
-
- // Return the profile record for this function;
- InstrProfRecord &getProfileRecord() { return ProfileRecord; }
-
- // Return the auxiliary BB information.
- UseBBInfo &getBBInfo(const BasicBlock *BB) const {
- return FuncInfo.getBBInfo(BB);
- }
-
- // Return the auxiliary BB information if available.
- UseBBInfo *findBBInfo(const BasicBlock *BB) const {
- return FuncInfo.findBBInfo(BB);
- }
-
- Function &getFunc() const { return F; }
-
- void dumpInfo(std::string Str = "") const {
- FuncInfo.dumpInfo(Str);
- }
-
- uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
-private:
- Function &F;
- Module *M;
- BlockFrequencyInfo *BFI;
- ProfileSummaryInfo *PSI;
-
- // This member stores the shared information with class PGOGenFunc.
- FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
-
- // The maximum count value in the profile. This is only used in PGO use
- // compilation.
- uint64_t ProgramMaxCount;
-
- // Position of counter that remains to be read.
- uint32_t CountPosition = 0;
-
- // Total size of the profile count for this function.
- uint32_t ProfileCountSize = 0;
-
- // ProfileRecord for this function.
- InstrProfRecord ProfileRecord;
-
- // Function hotness info derived from profile.
- FuncFreqAttr FreqAttr;
-
- // Whether to use the context-sensitive profile.
- bool IsCS;
-
- // Find the Instrumented BB and set the value. Return false on error.
- bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
-
- // Set the edge counter value for the unknown edge -- there should be only
- // one unknown edge.
- void setEdgeCount(DirectEdges &Edges, uint64_t Value);
-
- // Return FuncName string;
- const std::string getFuncName() const { return FuncInfo.FuncName; }
-
- // Set the hot/cold inline hints based on the count values.
- // FIXME: This function should be removed once the functionality in
- // the inliner is implemented.
- void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
- if (PSI->isHotCount(EntryCount))
- FreqAttr = FFA_Hot;
- else if (PSI->isColdCount(MaxCount))
- FreqAttr = FFA_Cold;
- }
-};
-
-} // end anonymous namespace
-
-// Visit all the edges and assign the count value for the instrumented
-// edges and the BB. Return false on error.
-bool PGOUseFunc::setInstrumentedCounts(
- const std::vector<uint64_t> &CountFromProfile) {
-
- std::vector<BasicBlock *> InstrumentBBs;
- FuncInfo.getInstrumentBBs(InstrumentBBs);
- unsigned NumCounters =
- InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
- // The number of counters here should match the number of counters
- // in the profile. Return false if they mismatch.
- if (NumCounters != CountFromProfile.size()) {
- return false;
- }
+
+ // Populate the counts for all BBs.
+ void populateCounters();
+
+ // Set the branch weights based on the count values.
+ void setBranchWeights();
+
+ // Annotate the value profile call sites for all value kinds.
+ void annotateValueSites();
+
+ // Annotate the value profile call sites for one value kind.
+ void annotateValueSites(uint32_t Kind);
+
+ // Annotate the irreducible loop header weights.
+ void annotateIrrLoopHeaderWeights();
+
+ // The hotness of the function from the profile count.
+ enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
+
+ // Return the function hotness from the profile.
+ FuncFreqAttr getFuncFreqAttr() const { return FreqAttr; }
+
+ // Return the function hash.
+ uint64_t getFuncHash() const { return FuncInfo.FunctionHash; }
+
+ // Return the profile record for this function;
+ InstrProfRecord &getProfileRecord() { return ProfileRecord; }
+
+ // Return the auxiliary BB information.
+ UseBBInfo &getBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.getBBInfo(BB);
+ }
+
+ // Return the auxiliary BB information if available.
+ UseBBInfo *findBBInfo(const BasicBlock *BB) const {
+ return FuncInfo.findBBInfo(BB);
+ }
+
+ Function &getFunc() const { return F; }
+
+ void dumpInfo(std::string Str = "") const {
+ FuncInfo.dumpInfo(Str);
+ }
+
+ uint64_t getProgramMaxCount() const { return ProgramMaxCount; }
+private:
+ Function &F;
+ Module *M;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+
+ // This member stores the shared information with class PGOGenFunc.
+ FuncPGOInstrumentation<PGOUseEdge, UseBBInfo> FuncInfo;
+
+ // The maximum count value in the profile. This is only used in PGO use
+ // compilation.
+ uint64_t ProgramMaxCount;
+
+ // Position of counter that remains to be read.
+ uint32_t CountPosition = 0;
+
+ // Total size of the profile count for this function.
+ uint32_t ProfileCountSize = 0;
+
+ // ProfileRecord for this function.
+ InstrProfRecord ProfileRecord;
+
+ // Function hotness info derived from profile.
+ FuncFreqAttr FreqAttr;
+
+ // Whether to use the context-sensitive profile.
+ bool IsCS;
+
+ // Find the Instrumented BB and set the value. Return false on error.
+ bool setInstrumentedCounts(const std::vector<uint64_t> &CountFromProfile);
+
+ // Set the edge counter value for the unknown edge -- there should be only
+ // one unknown edge.
+ void setEdgeCount(DirectEdges &Edges, uint64_t Value);
+
+ // Return FuncName string;
+ const std::string getFuncName() const { return FuncInfo.FuncName; }
+
+ // Set the hot/cold inline hints based on the count values.
+ // FIXME: This function should be removed once the functionality in
+ // the inliner is implemented.
+ void markFunctionAttributes(uint64_t EntryCount, uint64_t MaxCount) {
+ if (PSI->isHotCount(EntryCount))
+ FreqAttr = FFA_Hot;
+ else if (PSI->isColdCount(MaxCount))
+ FreqAttr = FFA_Cold;
+ }
+};
+
+} // end anonymous namespace
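+
+// Typical use-phase flow for PGOUseFunc (sketch; the pass driver follows
+// roughly this order, with error handling and statistics omitted, and all
+// variable names here are placeholders):
+//   PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
+//                   InstrumentFuncEntry);
+//   bool AllZeros = false, AllMinusOnes = false;
+//   if (Func.readCounters(PGOReader, AllZeros, AllMinusOnes)) {
+//     Func.populateCounters();
+//     Func.setBranchWeights();
+//     Func.annotateValueSites();
+//     Func.annotateIrrLoopHeaderWeights();
+//   }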
+
+// Visit all the edges and assign the count value for the instrumented
+// edges and the BB. Return false on error.
+bool PGOUseFunc::setInstrumentedCounts(
+ const std::vector<uint64_t> &CountFromProfile) {
+
+ std::vector<BasicBlock *> InstrumentBBs;
+ FuncInfo.getInstrumentBBs(InstrumentBBs);
+ unsigned NumCounters =
+ InstrumentBBs.size() + FuncInfo.SIVisitor.getNumOfSelectInsts();
+ // The number of counters here should match the number of counters
+ // in the profile. Return false if they mismatch.
+ if (NumCounters != CountFromProfile.size()) {
+ return false;
+ }
auto *FuncEntry = &*F.begin();
- // Set the profile count to the Instrumented BBs.
- uint32_t I = 0;
- for (BasicBlock *InstrBB : InstrumentBBs) {
- uint64_t CountValue = CountFromProfile[I++];
- UseBBInfo &Info = getBBInfo(InstrBB);
+ // Set the profile count to the Instrumented BBs.
+ uint32_t I = 0;
+ for (BasicBlock *InstrBB : InstrumentBBs) {
+ uint64_t CountValue = CountFromProfile[I++];
+ UseBBInfo &Info = getBBInfo(InstrBB);
// If we reach here, we know that we have some nonzero count
// values in this function. The entry count should not be 0.
// Fix it if necessary.
if (InstrBB == FuncEntry && CountValue == 0)
CountValue = 1;
- Info.setBBInfoCount(CountValue);
- }
- ProfileCountSize = CountFromProfile.size();
- CountPosition = I;
-
- // Set the edge count and update the count of unknown edges for BBs.
- auto setEdgeCount = [this](PGOUseEdge *E, uint64_t Value) -> void {
- E->setEdgeCount(Value);
- this->getBBInfo(E->SrcBB).UnknownCountOutEdge--;
- this->getBBInfo(E->DestBB).UnknownCountInEdge--;
- };
-
- // Set the profile count for the instrumented edges. There are BBs that are
- // not in the MST and are not instrumented. We need to set the edge count
- // values so that we can populate the profile counts later.
- for (auto &E : FuncInfo.MST.AllEdges) {
- if (E->Removed || E->InMST)
- continue;
- const BasicBlock *SrcBB = E->SrcBB;
- UseBBInfo &SrcInfo = getBBInfo(SrcBB);
-
- // If only one out-edge, the edge profile count should be the same as BB
- // profile count.
- if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
- setEdgeCount(E.get(), SrcInfo.CountValue);
- else {
- const BasicBlock *DestBB = E->DestBB;
- UseBBInfo &DestInfo = getBBInfo(DestBB);
- // If only one in-edge, the edge profile count should be the same as BB
- // profile count.
- if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
- setEdgeCount(E.get(), DestInfo.CountValue);
- }
- if (E->CountValid)
- continue;
- // E's count should have been set from profile. If not, this means E skips
- // the instrumentation. We set the count to 0.
- setEdgeCount(E.get(), 0);
- }
- return true;
-}
-
-// Set the count value for the unknown edge. There should be one and only one
-// unknown edge in Edges vector.
-void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
- for (auto &E : Edges) {
- if (E->CountValid)
- continue;
- E->setEdgeCount(Value);
-
- getBBInfo(E->SrcBB).UnknownCountOutEdge--;
- getBBInfo(E->DestBB).UnknownCountInEdge--;
- return;
- }
- llvm_unreachable("Cannot find the unknown count edge");
-}
-
-// Read the profile from ProfileFileName and assign the value to the
-// instrumented BB and the edges. This function also updates ProgramMaxCount.
-// Return true if the profile is successfully read, and false on errors.
+ Info.setBBInfoCount(CountValue);
+ }
+ ProfileCountSize = CountFromProfile.size();
+ CountPosition = I;
+
+ // Set the edge count and update the count of unknown edges for BBs.
+ auto setEdgeCount = [this](PGOUseEdge *E, uint64_t Value) -> void {
+ E->setEdgeCount(Value);
+ this->getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+ this->getBBInfo(E->DestBB).UnknownCountInEdge--;
+ };
+
+ // Set the profile count for the instrumented edges. There are BBs that are
+ // not in the MST and are not instrumented. We need to set the edge count
+ // values so that we can populate the profile counts later.
+ for (auto &E : FuncInfo.MST.AllEdges) {
+ if (E->Removed || E->InMST)
+ continue;
+ const BasicBlock *SrcBB = E->SrcBB;
+ UseBBInfo &SrcInfo = getBBInfo(SrcBB);
+
+ // If only one out-edge, the edge profile count should be the same as BB
+ // profile count.
+ if (SrcInfo.CountValid && SrcInfo.OutEdges.size() == 1)
+ setEdgeCount(E.get(), SrcInfo.CountValue);
+ else {
+ const BasicBlock *DestBB = E->DestBB;
+ UseBBInfo &DestInfo = getBBInfo(DestBB);
+ // If only one in-edge, the edge profile count should be the same as BB
+ // profile count.
+ if (DestInfo.CountValid && DestInfo.InEdges.size() == 1)
+ setEdgeCount(E.get(), DestInfo.CountValue);
+ }
+ if (E->CountValid)
+ continue;
+ // E's count should have been set from profile. If not, this means E skips
+ // the instrumentation. We set the count to 0.
+ setEdgeCount(E.get(), 0);
+ }
+ return true;
+}
+
+// Set the count value for the unknown edge. There should be one and only one
+// unknown edge in Edges vector.
+void PGOUseFunc::setEdgeCount(DirectEdges &Edges, uint64_t Value) {
+ for (auto &E : Edges) {
+ if (E->CountValid)
+ continue;
+ E->setEdgeCount(Value);
+
+ getBBInfo(E->SrcBB).UnknownCountOutEdge--;
+ getBBInfo(E->DestBB).UnknownCountInEdge--;
+ return;
+ }
+ llvm_unreachable("Cannot find the unknown count edge");
+}
+
+// Read the profile from ProfileFileName and assign the value to the
+// instrumented BB and the edges. This function also updates ProgramMaxCount.
+// Return true if the profile is successfully read, and false on errors.
bool PGOUseFunc::readCounters(IndexedInstrProfReader *PGOReader, bool &AllZeros,
bool &AllMinusOnes) {
- auto &Ctx = M->getContext();
- Expected<InstrProfRecord> Result =
- PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
- if (Error E = Result.takeError()) {
- handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
- auto Err = IPE.get();
- bool SkipWarning = false;
- LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
- << FuncInfo.FuncName << ": ");
- if (Err == instrprof_error::unknown_function) {
- IsCS ? NumOfCSPGOMissing++ : NumOfPGOMissing++;
- SkipWarning = !PGOWarnMissing;
- LLVM_DEBUG(dbgs() << "unknown function");
- } else if (Err == instrprof_error::hash_mismatch ||
- Err == instrprof_error::malformed) {
- IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
- SkipWarning =
- NoPGOWarnMismatch ||
- (NoPGOWarnMismatchComdat &&
- (F.hasComdat() ||
- F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
- LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
- }
-
- LLVM_DEBUG(dbgs() << " IsCS=" << IsCS << "\n");
- if (SkipWarning)
- return;
-
- std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
- std::string(" Hash = ") +
- std::to_string(FuncInfo.FunctionHash);
-
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
- });
- return false;
- }
- ProfileRecord = std::move(Result.get());
- std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
-
- IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++;
- LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
+ auto &Ctx = M->getContext();
+ Expected<InstrProfRecord> Result =
+ PGOReader->getInstrProfRecord(FuncInfo.FuncName, FuncInfo.FunctionHash);
+ if (Error E = Result.takeError()) {
+ handleAllErrors(std::move(E), [&](const InstrProfError &IPE) {
+ auto Err = IPE.get();
+ bool SkipWarning = false;
+ LLVM_DEBUG(dbgs() << "Error in reading profile for Func "
+ << FuncInfo.FuncName << ": ");
+ if (Err == instrprof_error::unknown_function) {
+ IsCS ? NumOfCSPGOMissing++ : NumOfPGOMissing++;
+ SkipWarning = !PGOWarnMissing;
+ LLVM_DEBUG(dbgs() << "unknown function");
+ } else if (Err == instrprof_error::hash_mismatch ||
+ Err == instrprof_error::malformed) {
+ IsCS ? NumOfCSPGOMismatch++ : NumOfPGOMismatch++;
+ SkipWarning =
+ NoPGOWarnMismatch ||
+ (NoPGOWarnMismatchComdat &&
+ (F.hasComdat() ||
+ F.getLinkage() == GlobalValue::AvailableExternallyLinkage));
+ LLVM_DEBUG(dbgs() << "hash mismatch (skip=" << SkipWarning << ")");
+ }
+
+ LLVM_DEBUG(dbgs() << " IsCS=" << IsCS << "\n");
+ if (SkipWarning)
+ return;
+
+ std::string Msg = IPE.message() + std::string(" ") + F.getName().str() +
+ std::string(" Hash = ") +
+ std::to_string(FuncInfo.FunctionHash);
+
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+ });
+ return false;
+ }
+ ProfileRecord = std::move(Result.get());
+ std::vector<uint64_t> &CountFromProfile = ProfileRecord.Counts;
+
+ IsCS ? NumOfCSPGOFunc++ : NumOfPGOFunc++;
+ LLVM_DEBUG(dbgs() << CountFromProfile.size() << " counts\n");
AllMinusOnes = (CountFromProfile.size() > 0);
- uint64_t ValueSum = 0;
- for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
- LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
- ValueSum += CountFromProfile[I];
+ uint64_t ValueSum = 0;
+ for (unsigned I = 0, S = CountFromProfile.size(); I < S; I++) {
+ LLVM_DEBUG(dbgs() << " " << I << ": " << CountFromProfile[I] << "\n");
+ ValueSum += CountFromProfile[I];
if (CountFromProfile[I] != (uint64_t)-1)
AllMinusOnes = false;
- }
- AllZeros = (ValueSum == 0);
-
- LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
-
- getBBInfo(nullptr).UnknownCountOutEdge = 2;
- getBBInfo(nullptr).UnknownCountInEdge = 2;
-
- if (!setInstrumentedCounts(CountFromProfile)) {
- LLVM_DEBUG(
- dbgs() << "Inconsistent number of counts, skipping this function");
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- M->getName().data(),
- Twine("Inconsistent number of counts in ") + F.getName().str()
- + Twine(": the profile may be stale or there is a function name collision."),
- DS_Warning));
- return false;
- }
- ProgramMaxCount = PGOReader->getMaximumFunctionCount(IsCS);
- return true;
-}
-
-// Populate the counters from instrumented BBs to all BBs.
-// At the end of this operation, all BBs should have a valid count value.
-void PGOUseFunc::populateCounters() {
- bool Changes = true;
- unsigned NumPasses = 0;
- while (Changes) {
- NumPasses++;
- Changes = false;
-
- // For efficient traversal, it's better to start from the end as most
- // of the instrumented edges are at the end.
- for (auto &BB : reverse(F)) {
- UseBBInfo *Count = findBBInfo(&BB);
- if (Count == nullptr)
- continue;
- if (!Count->CountValid) {
- if (Count->UnknownCountOutEdge == 0) {
- Count->CountValue = sumEdgeCount(Count->OutEdges);
- Count->CountValid = true;
- Changes = true;
- } else if (Count->UnknownCountInEdge == 0) {
- Count->CountValue = sumEdgeCount(Count->InEdges);
- Count->CountValid = true;
- Changes = true;
- }
- }
- if (Count->CountValid) {
- if (Count->UnknownCountOutEdge == 1) {
- uint64_t Total = 0;
- uint64_t OutSum = sumEdgeCount(Count->OutEdges);
-          // If one of the successor blocks can terminate early (no-return),
-          // we can end up with a situation where the out-edge sum count is
-          // larger, as the source BB's count is collected by a post-dominated
-          // block.
- if (Count->CountValue > OutSum)
- Total = Count->CountValue - OutSum;
- setEdgeCount(Count->OutEdges, Total);
- Changes = true;
- }
- if (Count->UnknownCountInEdge == 1) {
- uint64_t Total = 0;
- uint64_t InSum = sumEdgeCount(Count->InEdges);
- if (Count->CountValue > InSum)
- Total = Count->CountValue - InSum;
- setEdgeCount(Count->InEdges, Total);
- Changes = true;
- }
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
-#ifndef NDEBUG
- // Assert every BB has a valid counter.
- for (auto &BB : F) {
- auto BI = findBBInfo(&BB);
- if (BI == nullptr)
- continue;
- assert(BI->CountValid && "BB count is not valid");
- }
-#endif
- uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
- uint64_t FuncMaxCount = FuncEntryCount;
- for (auto &BB : F) {
- auto BI = findBBInfo(&BB);
- if (BI == nullptr)
- continue;
- FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
- }
+ }
+ AllZeros = (ValueSum == 0);
+
+ LLVM_DEBUG(dbgs() << "SUM = " << ValueSum << "\n");
+
+ getBBInfo(nullptr).UnknownCountOutEdge = 2;
+ getBBInfo(nullptr).UnknownCountInEdge = 2;
+
+ if (!setInstrumentedCounts(CountFromProfile)) {
+ LLVM_DEBUG(
+ dbgs() << "Inconsistent number of counts, skipping this function");
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of counts in ") + F.getName().str()
+ + Twine(": the profile may be stale or there is a function name collision."),
+ DS_Warning));
+ return false;
+ }
+ ProgramMaxCount = PGOReader->getMaximumFunctionCount(IsCS);
+ return true;
+}
+
+// Populate the counters from instrumented BBs to all BBs.
+// At the end of this operation, all BBs should have a valid count value.
+void PGOUseFunc::populateCounters() {
+ bool Changes = true;
+ unsigned NumPasses = 0;
+ while (Changes) {
+ NumPasses++;
+ Changes = false;
+
+ // For efficient traversal, it's better to start from the end as most
+ // of the instrumented edges are at the end.
+ for (auto &BB : reverse(F)) {
+ UseBBInfo *Count = findBBInfo(&BB);
+ if (Count == nullptr)
+ continue;
+ if (!Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->OutEdges);
+ Count->CountValid = true;
+ Changes = true;
+ } else if (Count->UnknownCountInEdge == 0) {
+ Count->CountValue = sumEdgeCount(Count->InEdges);
+ Count->CountValid = true;
+ Changes = true;
+ }
+ }
+ if (Count->CountValid) {
+ if (Count->UnknownCountOutEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t OutSum = sumEdgeCount(Count->OutEdges);
+          // If one of the successor blocks can terminate early (no-return),
+          // we can end up with a situation where the out-edge sum count is
+          // larger, as the source BB's count is collected by a post-dominated
+          // block.
+ if (Count->CountValue > OutSum)
+ Total = Count->CountValue - OutSum;
+ setEdgeCount(Count->OutEdges, Total);
+ Changes = true;
+ }
+ if (Count->UnknownCountInEdge == 1) {
+ uint64_t Total = 0;
+ uint64_t InSum = sumEdgeCount(Count->InEdges);
+ if (Count->CountValue > InSum)
+ Total = Count->CountValue - InSum;
+ setEdgeCount(Count->InEdges, Total);
+ Changes = true;
+ }
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Populate counts in " << NumPasses << " passes.\n");
+#ifndef NDEBUG
+ // Assert every BB has a valid counter.
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ assert(BI->CountValid && "BB count is not valid");
+ }
+#endif
+ uint64_t FuncEntryCount = getBBInfo(&*F.begin()).CountValue;
+ uint64_t FuncMaxCount = FuncEntryCount;
+ for (auto &BB : F) {
+ auto BI = findBBInfo(&BB);
+ if (BI == nullptr)
+ continue;
+ FuncMaxCount = std::max(FuncMaxCount, BI->CountValue);
+ }
// Fix the obviously inconsistent entry count.
if (FuncMaxCount > 0 && FuncEntryCount == 0)
FuncEntryCount = 1;
F.setEntryCount(ProfileCount(FuncEntryCount, Function::PCT_Real));
- markFunctionAttributes(FuncEntryCount, FuncMaxCount);
-
- // Now annotate select instructions
- FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
- assert(CountPosition == ProfileCountSize);
-
- LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
-}
-
-// Assign the scaled count values to the BB with multiple out edges.
-void PGOUseFunc::setBranchWeights() {
- // Generate MD_prof metadata for every branch instruction.
- LLVM_DEBUG(dbgs() << "\nSetting branch weights for func " << F.getName()
- << " IsCS=" << IsCS << "\n");
- for (auto &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (TI->getNumSuccessors() < 2)
- continue;
- if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
- isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI)))
- continue;
-
- if (getBBInfo(&BB).CountValue == 0)
- continue;
-
- // We have a non-zero Branch BB.
- const UseBBInfo &BBCountInfo = getBBInfo(&BB);
- unsigned Size = BBCountInfo.OutEdges.size();
- SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
- uint64_t MaxCount = 0;
- for (unsigned s = 0; s < Size; s++) {
- const PGOUseEdge *E = BBCountInfo.OutEdges[s];
- const BasicBlock *SrcBB = E->SrcBB;
- const BasicBlock *DestBB = E->DestBB;
- if (DestBB == nullptr)
- continue;
- unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
- uint64_t EdgeCount = E->CountValue;
- if (EdgeCount > MaxCount)
- MaxCount = EdgeCount;
- EdgeCounts[SuccNum] = EdgeCount;
- }
- setProfMetadata(M, TI, EdgeCounts, MaxCount);
- }
-}
-
-static bool isIndirectBrTarget(BasicBlock *BB) {
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- if (isa<IndirectBrInst>((*PI)->getTerminator()))
- return true;
- }
- return false;
-}
-
-void PGOUseFunc::annotateIrrLoopHeaderWeights() {
- LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
- // Find irr loop headers
- for (auto &BB : F) {
-    // As a heuristic, also annotate indirectbr targets, as they have a high
-    // chance of becoming an irreducible loop header after the indirectbr tail
-    // duplication.
- if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
- Instruction *TI = BB.getTerminator();
- const UseBBInfo &BBCountInfo = getBBInfo(&BB);
- setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
- }
- }
-}
-
-void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
- Module *M = F.getParent();
- IRBuilder<> Builder(&SI);
- Type *Int64Ty = Builder.getInt64Ty();
- Type *I8PtrTy = Builder.getInt8PtrTy();
- auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
- Builder.CreateCall(
- Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
- {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
- Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
- Builder.getInt32(*CurCtrIdx), Step});
- ++(*CurCtrIdx);
-}
-
-void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
- std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
- assert(*CurCtrIdx < CountFromProfile.size() &&
- "Out of bound access of counters");
- uint64_t SCounts[2];
- SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
- ++(*CurCtrIdx);
- uint64_t TotalCount = 0;
- auto BI = UseFunc->findBBInfo(SI.getParent());
- if (BI != nullptr)
- TotalCount = BI->CountValue;
- // False Count
- SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
- uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
- if (MaxCount)
- setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
-}
-
-void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
- if (!PGOInstrSelect)
- return;
-  // FIXME: vector-typed select conditions are not handled yet.
- if (SI.getCondition()->getType()->isVectorTy())
- return;
-
- switch (Mode) {
- case VM_counting:
- NSIs++;
- return;
- case VM_instrument:
- instrumentOneSelectInst(SI);
- return;
- case VM_annotate:
- annotateOneSelectInst(SI);
- return;
- }
-
- llvm_unreachable("Unknown visiting mode");
-}
-
-// Traverse all value sites and annotate the instructions for all value kinds.
-void PGOUseFunc::annotateValueSites() {
- if (DisableValueProfiling)
- return;
-
-  // Create the PGOFuncName metadata.
- createPGOFuncNameMetadata(F, FuncInfo.FuncName);
-
- for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
- annotateValueSites(Kind);
-}
-
-// Annotate the instructions for a specific value kind.
-void PGOUseFunc::annotateValueSites(uint32_t Kind) {
- assert(Kind <= IPVK_Last);
- unsigned ValueSiteIndex = 0;
- auto &ValueSites = FuncInfo.ValueSites[Kind];
- unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
- if (NumValueSites != ValueSites.size()) {
- auto &Ctx = M->getContext();
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- M->getName().data(),
- Twine("Inconsistent number of value sites for ") +
- Twine(ValueProfKindDescr[Kind]) +
- Twine(" profiling in \"") + F.getName().str() +
- Twine("\", possibly due to the use of a stale profile."),
- DS_Warning));
- return;
- }
-
- for (VPCandidateInfo &I : ValueSites) {
- LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
- << "): Index = " << ValueSiteIndex << " out of "
- << NumValueSites << "\n");
- annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
- static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
- Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
- : MaxNumAnnotations);
- ValueSiteIndex++;
- }
-}
-
-// Collect the set of members for each Comdat in module M and store
-// in ComdatMembers.
-static void collectComdatMembers(
- Module &M,
- std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
- if (!DoComdatRenaming)
- return;
- for (Function &F : M)
- if (Comdat *C = F.getComdat())
- ComdatMembers.insert(std::make_pair(C, &F));
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GV));
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat())
- ComdatMembers.insert(std::make_pair(C, &GA));
-}
-
-static bool InstrumentAllFunctions(
- Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
- function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
- function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
-  // For the context-sensitive instrumentation, we should have a separate pass
-  // (before LTO/ThinLTO linking) to create these variables.
- if (!IsCS)
+ markFunctionAttributes(FuncEntryCount, FuncMaxCount);
+
+ // Now annotate select instructions
+ FuncInfo.SIVisitor.annotateSelects(F, this, &CountPosition);
+ assert(CountPosition == ProfileCountSize);
+
+ LLVM_DEBUG(FuncInfo.dumpInfo("after reading profile."));
+}
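// The propagation above is essentially flow conservation on the CFG: once all
// but one of a block's incoming or outgoing edge counts are known, the missing
// value can be solved for, and the loop repeats until a fixed point. Below is
// a minimal, self-contained sketch of that idea using hypothetical BlockInfo
// and Edge types (not the UseBBInfo/PGOUseEdge types used in this file).

#include <cstdint>
#include <vector>

struct Edge {
  uint64_t Count = 0;
  bool Known = false;
};

struct BlockInfo {
  uint64_t Count = 0;
  bool CountValid = false;
  std::vector<Edge *> InEdges, OutEdges;
};

static uint64_t sumKnown(const std::vector<Edge *> &Edges) {
  uint64_t Sum = 0;
  for (const Edge *E : Edges)
    if (E->Known)
      Sum += E->Count;
  return Sum;
}

static unsigned numUnknown(const std::vector<Edge *> &Edges) {
  unsigned N = 0;
  for (const Edge *E : Edges)
    N += !E->Known;
  return N;
}

// Iterate until nothing changes: a block whose in- or out-edges are all known
// takes their sum as its count; a block with a known count and exactly one
// unknown edge assigns that edge the remaining flow (clamped at zero).
static void propagateCounts(std::vector<BlockInfo *> &Blocks) {
  bool Changed = true;
  while (Changed) {
    Changed = false;
    for (BlockInfo *B : Blocks) {
      if (!B->CountValid) {
        if (numUnknown(B->OutEdges) == 0) {
          B->Count = sumKnown(B->OutEdges);
          B->CountValid = Changed = true;
        } else if (numUnknown(B->InEdges) == 0) {
          B->Count = sumKnown(B->InEdges);
          B->CountValid = Changed = true;
        }
      }
      if (!B->CountValid)
        continue;
      for (auto *Edges : {&B->OutEdges, &B->InEdges}) {
        if (numUnknown(*Edges) != 1)
          continue;
        uint64_t KnownSum = sumKnown(*Edges);
        for (Edge *E : *Edges) {
          if (E->Known)
            continue;
          E->Count = B->Count > KnownSum ? B->Count - KnownSum : 0;
          E->Known = true;
          Changed = true;
        }
      }
    }
  }
}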
+
+// Assign the scaled count values to the BB with multiple out edges.
+void PGOUseFunc::setBranchWeights() {
+ // Generate MD_prof metadata for every branch instruction.
+ LLVM_DEBUG(dbgs() << "\nSetting branch weights for func " << F.getName()
+ << " IsCS=" << IsCS << "\n");
+ for (auto &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() < 2)
+ continue;
+ if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
+ isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI)))
+ continue;
+
+ if (getBBInfo(&BB).CountValue == 0)
+ continue;
+
+ // We have a non-zero Branch BB.
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ unsigned Size = BBCountInfo.OutEdges.size();
+ SmallVector<uint64_t, 2> EdgeCounts(Size, 0);
+ uint64_t MaxCount = 0;
+ for (unsigned s = 0; s < Size; s++) {
+ const PGOUseEdge *E = BBCountInfo.OutEdges[s];
+ const BasicBlock *SrcBB = E->SrcBB;
+ const BasicBlock *DestBB = E->DestBB;
+ if (DestBB == nullptr)
+ continue;
+ unsigned SuccNum = GetSuccessorNumber(SrcBB, DestBB);
+ uint64_t EdgeCount = E->CountValue;
+ if (EdgeCount > MaxCount)
+ MaxCount = EdgeCount;
+ EdgeCounts[SuccNum] = EdgeCount;
+ }
+ setProfMetadata(M, TI, EdgeCounts, MaxCount);
+ }
+}
+
+static bool isIndirectBrTarget(BasicBlock *BB) {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ if (isa<IndirectBrInst>((*PI)->getTerminator()))
+ return true;
+ }
+ return false;
+}
+
+void PGOUseFunc::annotateIrrLoopHeaderWeights() {
+ LLVM_DEBUG(dbgs() << "\nAnnotating irreducible loop header weights.\n");
+ // Find irr loop headers
+ for (auto &BB : F) {
+    // As a heuristic, also annotate indirectbr targets, as they have a high
+    // chance of becoming an irreducible loop header after the indirectbr tail
+    // duplication.
+ if (BFI->isIrrLoopHeader(&BB) || isIndirectBrTarget(&BB)) {
+ Instruction *TI = BB.getTerminator();
+ const UseBBInfo &BBCountInfo = getBBInfo(&BB);
+ setIrrLoopHeaderMetadata(M, TI, BBCountInfo.CountValue);
+ }
+ }
+}
+
+void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
+ Module *M = F.getParent();
+ IRBuilder<> Builder(&SI);
+ Type *Int64Ty = Builder.getInt64Ty();
+ Type *I8PtrTy = Builder.getInt8PtrTy();
+ auto *Step = Builder.CreateZExt(SI.getCondition(), Int64Ty);
+ Builder.CreateCall(
+ Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
+ {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+ Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+ Builder.getInt32(*CurCtrIdx), Step});
+ ++(*CurCtrIdx);
+}
+
+void SelectInstVisitor::annotateOneSelectInst(SelectInst &SI) {
+ std::vector<uint64_t> &CountFromProfile = UseFunc->getProfileRecord().Counts;
+ assert(*CurCtrIdx < CountFromProfile.size() &&
+ "Out of bound access of counters");
+ uint64_t SCounts[2];
+ SCounts[0] = CountFromProfile[*CurCtrIdx]; // True count
+ ++(*CurCtrIdx);
+ uint64_t TotalCount = 0;
+ auto BI = UseFunc->findBBInfo(SI.getParent());
+ if (BI != nullptr)
+ TotalCount = BI->CountValue;
+ // False Count
+ SCounts[1] = (TotalCount > SCounts[0] ? TotalCount - SCounts[0] : 0);
+ uint64_t MaxCount = std::max(SCounts[0], SCounts[1]);
+ if (MaxCount)
+ setProfMetadata(F.getParent(), &SI, SCounts, MaxCount);
+}
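// A concrete illustration of the true/false split above, with hypothetical
// numbers: the instrumented step counter adds the zero-extended condition, so
// it records how often the select took its true arm; the false count is the
// remainder of the parent block's count, clamped at zero.

#include <cstdint>
#include <utility>

// Returns {TrueCount, FalseCount} for a select whose parent block ran
// BlockCount times and whose step counter accumulated TrueCounter.
static std::pair<uint64_t, uint64_t> splitSelectCounts(uint64_t BlockCount,
                                                       uint64_t TrueCounter) {
  uint64_t FalseCount =
      BlockCount > TrueCounter ? BlockCount - TrueCounter : 0;
  return {TrueCounter, FalseCount};
}
// e.g. splitSelectCounts(100, 70) yields {70, 30}.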
+
+void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
+ if (!PGOInstrSelect)
+ return;
+  // FIXME: vector-typed select conditions are not handled yet.
+ if (SI.getCondition()->getType()->isVectorTy())
+ return;
+
+ switch (Mode) {
+ case VM_counting:
+ NSIs++;
+ return;
+ case VM_instrument:
+ instrumentOneSelectInst(SI);
+ return;
+ case VM_annotate:
+ annotateOneSelectInst(SI);
+ return;
+ }
+
+ llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
+ if (DisableValueProfiling)
+ return;
+
+  // Create the PGOFuncName metadata.
+ createPGOFuncNameMetadata(F, FuncInfo.FuncName);
+
+ for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+ annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+ assert(Kind <= IPVK_Last);
+ unsigned ValueSiteIndex = 0;
+ auto &ValueSites = FuncInfo.ValueSites[Kind];
+ unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+ if (NumValueSites != ValueSites.size()) {
+ auto &Ctx = M->getContext();
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ M->getName().data(),
+ Twine("Inconsistent number of value sites for ") +
+ Twine(ValueProfKindDescr[Kind]) +
+ Twine(" profiling in \"") + F.getName().str() +
+ Twine("\", possibly due to the use of a stale profile."),
+ DS_Warning));
+ return;
+ }
+
+ for (VPCandidateInfo &I : ValueSites) {
+ LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+ << "): Index = " << ValueSiteIndex << " out of "
+ << NumValueSites << "\n");
+ annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord,
+ static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+ Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+ : MaxNumAnnotations);
+ ValueSiteIndex++;
+ }
+}
+
+// Collect the set of members for each Comdat in module M and store
+// in ComdatMembers.
+static void collectComdatMembers(
+ Module &M,
+ std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers) {
+ if (!DoComdatRenaming)
+ return;
+ for (Function &F : M)
+ if (Comdat *C = F.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &F));
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GV));
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat())
+ ComdatMembers.insert(std::make_pair(C, &GA));
+}
+
+static bool InstrumentAllFunctions(
+ Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
+  // For the context-sensitive instrumentation, we should have a separate pass
+  // (before LTO/ThinLTO linking) to create these variables.
+ if (!IsCS)
createIRLevelProfileFlagVar(M, /* IsCS */ false, PGOInstrumentEntry);
- std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
- collectComdatMembers(M, ComdatMembers);
-
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
if (F.hasFnAttribute(llvm::Attribute::NoProfile))
continue;
- auto &TLI = LookupTLI(F);
- auto *BPI = LookupBPI(F);
- auto *BFI = LookupBFI(F);
- instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
- }
- return true;
-}
-
-PreservedAnalyses
-PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
- createProfileFileNameVar(M, CSInstrName);
+ auto &TLI = LookupTLI(F);
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
+ }
+ return true;
+}
+
+PreservedAnalyses
+PGOInstrumentationGenCreateVar::run(Module &M, ModuleAnalysisManager &AM) {
+ createProfileFileNameVar(M, CSInstrName);
createIRLevelProfileFlagVar(M, /* IsCS */ true, PGOInstrumentEntry);
- return PreservedAnalyses::all();
-}
-
-bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto LookupBPI = [this](Function &F) {
- return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
- };
- auto LookupBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
- return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
-}
-
-PreservedAnalyses PGOInstrumentationGen::run(Module &M,
- ModuleAnalysisManager &AM) {
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto LookupBPI = [&FAM](Function &F) {
- return &FAM.getResult<BranchProbabilityAnalysis>(F);
- };
- auto LookupBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
+ return PreservedAnalyses::all();
+}
+
+bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+ return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
+}
+
+PreservedAnalyses PGOInstrumentationGen::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
// Using the ratio b/w sums of profile count values and BFI count values to
// adjust the func entry count.
static void fixFuncEntryCount(PGOUseFunc &Func, LoopInfo &LI,
@@ -1766,69 +1766,69 @@ static void verifyFuncBFI(PGOUseFunc &Func, LoopInfo &LI,
});
}
-static bool annotateAllFunctions(
- Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
- function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
- function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
- function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
- ProfileSummaryInfo *PSI, bool IsCS) {
- LLVM_DEBUG(dbgs() << "Read in profile counters: ");
- auto &Ctx = M.getContext();
- // Read the counter array from file.
- auto ReaderOrErr =
- IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
- if (Error E = ReaderOrErr.takeError()) {
- handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
- Ctx.diagnose(
- DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
- });
- return false;
- }
-
- std::unique_ptr<IndexedInstrProfReader> PGOReader =
- std::move(ReaderOrErr.get());
- if (!PGOReader) {
- Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
- StringRef("Cannot get PGOReader")));
- return false;
- }
- if (!PGOReader->hasCSIRLevelProfile() && IsCS)
- return false;
-
- // TODO: might need to change the warning once the clang option is finalized.
- if (!PGOReader->isIRLevelProfile()) {
- Ctx.diagnose(DiagnosticInfoPGOProfile(
- ProfileFileName.data(), "Not an IR level instrumentation profile"));
- return false;
- }
-
- // Add the profile summary (read from the header of the indexed summary) here
- // so that we can use it below when reading counters (which checks if the
- // function should be marked with a cold or inlinehint attribute).
- M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
- IsCS ? ProfileSummary::PSK_CSInstr
- : ProfileSummary::PSK_Instr);
- PSI->refresh();
-
- std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
- collectComdatMembers(M, ComdatMembers);
- std::vector<Function *> HotFunctions;
- std::vector<Function *> ColdFunctions;
+static bool annotateAllFunctions(
+ Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
+ function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
+ ProfileSummaryInfo *PSI, bool IsCS) {
+ LLVM_DEBUG(dbgs() << "Read in profile counters: ");
+ auto &Ctx = M.getContext();
+ // Read the counter array from file.
+ auto ReaderOrErr =
+ IndexedInstrProfReader::create(ProfileFileName, ProfileRemappingFileName);
+ if (Error E = ReaderOrErr.takeError()) {
+ handleAllErrors(std::move(E), [&](const ErrorInfoBase &EI) {
+ Ctx.diagnose(
+ DiagnosticInfoPGOProfile(ProfileFileName.data(), EI.message()));
+ });
+ return false;
+ }
+
+ std::unique_ptr<IndexedInstrProfReader> PGOReader =
+ std::move(ReaderOrErr.get());
+ if (!PGOReader) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(ProfileFileName.data(),
+ StringRef("Cannot get PGOReader")));
+ return false;
+ }
+ if (!PGOReader->hasCSIRLevelProfile() && IsCS)
+ return false;
+
+ // TODO: might need to change the warning once the clang option is finalized.
+ if (!PGOReader->isIRLevelProfile()) {
+ Ctx.diagnose(DiagnosticInfoPGOProfile(
+ ProfileFileName.data(), "Not an IR level instrumentation profile"));
+ return false;
+ }
+
+ // Add the profile summary (read from the header of the indexed summary) here
+ // so that we can use it below when reading counters (which checks if the
+ // function should be marked with a cold or inlinehint attribute).
+ M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
+ IsCS ? ProfileSummary::PSK_CSInstr
+ : ProfileSummary::PSK_Instr);
+ PSI->refresh();
+
+ std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
+ collectComdatMembers(M, ComdatMembers);
+ std::vector<Function *> HotFunctions;
+ std::vector<Function *> ColdFunctions;
   // If the profile is marked to always instrument the entry BB, do the
   // same. Note this can be overridden by the internal option in CFGMST.h.
bool InstrumentFuncEntry = PGOReader->instrEntryBBEnabled();
if (PGOInstrumentEntry.getNumOccurrences() > 0)
InstrumentFuncEntry = PGOInstrumentEntry;
- for (auto &F : M) {
- if (F.isDeclaration())
- continue;
- auto &TLI = LookupTLI(F);
- auto *BPI = LookupBPI(F);
- auto *BFI = LookupBFI(F);
- // Split indirectbr critical edges here before computing the MST rather than
- // later in getInstrBB() to avoid invalidating it.
- SplitIndirectBrCriticalEdges(F, BPI, BFI);
+ for (auto &F : M) {
+ if (F.isDeclaration())
+ continue;
+ auto &TLI = LookupTLI(F);
+ auto *BPI = LookupBPI(F);
+ auto *BFI = LookupBFI(F);
+ // Split indirectbr critical edges here before computing the MST rather than
+ // later in getInstrBB() to avoid invalidating it.
+ SplitIndirectBrCriticalEdges(F, BPI, BFI);
PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS,
InstrumentFuncEntry);
// When AllMinusOnes is true, it means the profile for the function
@@ -1836,15 +1836,15 @@ static bool annotateAllFunctions(
// entry count of the function to be multiple times of hot threshold
// and drop all its internal counters.
bool AllMinusOnes = false;
- bool AllZeros = false;
+ bool AllZeros = false;
if (!Func.readCounters(PGOReader.get(), AllZeros, AllMinusOnes))
- continue;
- if (AllZeros) {
- F.setEntryCount(ProfileCount(0, Function::PCT_Real));
- if (Func.getProgramMaxCount() != 0)
- ColdFunctions.push_back(&F);
- continue;
- }
+ continue;
+ if (AllZeros) {
+ F.setEntryCount(ProfileCount(0, Function::PCT_Real));
+ if (Func.getProgramMaxCount() != 0)
+ ColdFunctions.push_back(&F);
+ continue;
+ }
const unsigned MultiplyFactor = 3;
if (AllMinusOnes) {
uint64_t HotThreshold = PSI->getHotCountThreshold();
@@ -1854,43 +1854,43 @@ static bool annotateAllFunctions(
HotFunctions.push_back(&F);
continue;
}
- Func.populateCounters();
- Func.setBranchWeights();
- Func.annotateValueSites();
- Func.annotateIrrLoopHeaderWeights();
- PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
- if (FreqAttr == PGOUseFunc::FFA_Cold)
- ColdFunctions.push_back(&F);
- else if (FreqAttr == PGOUseFunc::FFA_Hot)
- HotFunctions.push_back(&F);
- if (PGOViewCounts != PGOVCT_None &&
- (ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
- LoopInfo LI{DominatorTree(F)};
- std::unique_ptr<BranchProbabilityInfo> NewBPI =
- std::make_unique<BranchProbabilityInfo>(F, LI);
- std::unique_ptr<BlockFrequencyInfo> NewBFI =
- std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
- if (PGOViewCounts == PGOVCT_Graph)
- NewBFI->view();
- else if (PGOViewCounts == PGOVCT_Text) {
- dbgs() << "pgo-view-counts: " << Func.getFunc().getName() << "\n";
- NewBFI->print(dbgs());
- }
- }
- if (PGOViewRawCounts != PGOVCT_None &&
- (ViewBlockFreqFuncName.empty() ||
- F.getName().equals(ViewBlockFreqFuncName))) {
- if (PGOViewRawCounts == PGOVCT_Graph)
- if (ViewBlockFreqFuncName.empty())
- WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
- else
- ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
- else if (PGOViewRawCounts == PGOVCT_Text) {
- dbgs() << "pgo-view-raw-counts: " << Func.getFunc().getName() << "\n";
- Func.dumpInfo();
- }
- }
+ Func.populateCounters();
+ Func.setBranchWeights();
+ Func.annotateValueSites();
+ Func.annotateIrrLoopHeaderWeights();
+ PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
+ if (FreqAttr == PGOUseFunc::FFA_Cold)
+ ColdFunctions.push_back(&F);
+ else if (FreqAttr == PGOUseFunc::FFA_Hot)
+ HotFunctions.push_back(&F);
+ if (PGOViewCounts != PGOVCT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ LoopInfo LI{DominatorTree(F)};
+ std::unique_ptr<BranchProbabilityInfo> NewBPI =
+ std::make_unique<BranchProbabilityInfo>(F, LI);
+ std::unique_ptr<BlockFrequencyInfo> NewBFI =
+ std::make_unique<BlockFrequencyInfo>(F, *NewBPI, LI);
+ if (PGOViewCounts == PGOVCT_Graph)
+ NewBFI->view();
+ else if (PGOViewCounts == PGOVCT_Text) {
+ dbgs() << "pgo-view-counts: " << Func.getFunc().getName() << "\n";
+ NewBFI->print(dbgs());
+ }
+ }
+ if (PGOViewRawCounts != PGOVCT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ if (PGOViewRawCounts == PGOVCT_Graph)
+ if (ViewBlockFreqFuncName.empty())
+ WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else
+ ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
+ else if (PGOViewRawCounts == PGOVCT_Text) {
+ dbgs() << "pgo-view-raw-counts: " << Func.getFunc().getName() << "\n";
+ Func.dumpInfo();
+ }
+ }
if (PGOVerifyBFI || PGOVerifyHotBFI || PGOFixEntryCount) {
LoopInfo LI{DominatorTree(F)};
@@ -1908,18 +1908,18 @@ static bool annotateAllFunctions(
}
verifyFuncBFI(Func, LI, NBPI, HotCountThreshold, ColdCountThreshold);
}
- }
-
- // Set function hotness attribute from the profile.
- // We have to apply these attributes at the end because their presence
- // can affect the BranchProbabilityInfo of any callers, resulting in an
- // inconsistent MST between prof-gen and prof-use.
- for (auto &F : HotFunctions) {
- F->addFnAttr(Attribute::InlineHint);
- LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
- << "\n");
- }
- for (auto &F : ColdFunctions) {
+ }
+
+ // Set function hotness attribute from the profile.
+ // We have to apply these attributes at the end because their presence
+ // can affect the BranchProbabilityInfo of any callers, resulting in an
+ // inconsistent MST between prof-gen and prof-use.
+ for (auto &F : HotFunctions) {
+ F->addFnAttr(Attribute::InlineHint);
+ LLVM_DEBUG(dbgs() << "Set inline attribute to function: " << F->getName()
+ << "\n");
+ }
+ for (auto &F : ColdFunctions) {
     // Only set when there is no Attribute::Hot set by the user. For the Hot
     // attribute, the user's annotation takes precedence over the profile.
if (F->hasFnAttribute(Attribute::Hot)) {
@@ -1931,190 +1931,190 @@ static bool annotateAllFunctions(
DiagnosticInfoPGOProfile(M.getName().data(), Msg, DS_Warning));
continue;
}
- F->addFnAttr(Attribute::Cold);
- LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
- << "\n");
- }
- return true;
-}
-
-PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
- std::string RemappingFilename,
- bool IsCS)
- : ProfileFileName(std::move(Filename)),
- ProfileRemappingFileName(std::move(RemappingFilename)), IsCS(IsCS) {
- if (!PGOTestProfileFile.empty())
- ProfileFileName = PGOTestProfileFile;
- if (!PGOTestProfileRemappingFile.empty())
- ProfileRemappingFileName = PGOTestProfileRemappingFile;
-}
-
-PreservedAnalyses PGOInstrumentationUse::run(Module &M,
- ModuleAnalysisManager &AM) {
-
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
- return FAM.getResult<TargetLibraryAnalysis>(F);
- };
- auto LookupBPI = [&FAM](Function &F) {
- return &FAM.getResult<BranchProbabilityAnalysis>(F);
- };
- auto LookupBFI = [&FAM](Function &F) {
- return &FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
-
- if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
- LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
- if (skipModule(M))
- return false;
-
- auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
- return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- };
- auto LookupBPI = [this](Function &F) {
- return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
- };
- auto LookupBFI = [this](Function &F) {
- return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
- };
-
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
- LookupBFI, PSI, IsCS);
-}
-
-static std::string getSimpleNodeName(const BasicBlock *Node) {
- if (!Node->getName().empty())
- return std::string(Node->getName());
-
- std::string SimpleNodeName;
- raw_string_ostream OS(SimpleNodeName);
- Node->printAsOperand(OS, false);
- return OS.str();
-}
-
-void llvm::setProfMetadata(Module *M, Instruction *TI,
- ArrayRef<uint64_t> EdgeCounts,
- uint64_t MaxCount) {
- MDBuilder MDB(M->getContext());
- assert(MaxCount > 0 && "Bad max count");
- uint64_t Scale = calculateCountScale(MaxCount);
- SmallVector<unsigned, 4> Weights;
- for (const auto &ECI : EdgeCounts)
- Weights.push_back(scaleBranchCount(ECI, Scale));
-
- LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
- : Weights) {
- dbgs() << W << " ";
- } dbgs() << "\n";);
-
- TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
- if (EmitBranchProbability) {
- std::string BrCondStr = getBranchCondString(TI);
- if (BrCondStr.empty())
- return;
-
- uint64_t WSum =
- std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
- [](uint64_t w1, uint64_t w2) { return w1 + w2; });
- uint64_t TotalCount =
- std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
- [](uint64_t c1, uint64_t c2) { return c1 + c2; });
- Scale = calculateCountScale(WSum);
- BranchProbability BP(scaleBranchCount(Weights[0], Scale),
- scaleBranchCount(WSum, Scale));
- std::string BranchProbStr;
- raw_string_ostream OS(BranchProbStr);
- OS << BP;
- OS << " (total count : " << TotalCount << ")";
- OS.flush();
- Function *F = TI->getParent()->getParent();
- OptimizationRemarkEmitter ORE(F);
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "pgo-instrumentation", TI)
- << BrCondStr << " is true with probability : " << BranchProbStr;
- });
- }
-}
-
-namespace llvm {
-
-void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
- MDBuilder MDB(M->getContext());
- TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
- MDB.createIrrLoopHeaderWeight(Count));
-}
-
-template <> struct GraphTraits<PGOUseFunc *> {
- using NodeRef = const BasicBlock *;
- using ChildIteratorType = const_succ_iterator;
- using nodes_iterator = pointer_iterator<Function::const_iterator>;
-
- static NodeRef getEntryNode(const PGOUseFunc *G) {
- return &G->getFunc().front();
- }
-
- static ChildIteratorType child_begin(const NodeRef N) {
- return succ_begin(N);
- }
-
- static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
-
- static nodes_iterator nodes_begin(const PGOUseFunc *G) {
- return nodes_iterator(G->getFunc().begin());
- }
-
- static nodes_iterator nodes_end(const PGOUseFunc *G) {
- return nodes_iterator(G->getFunc().end());
- }
-};
-
-template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
- explicit DOTGraphTraits(bool isSimple = false)
- : DefaultDOTGraphTraits(isSimple) {}
-
- static std::string getGraphName(const PGOUseFunc *G) {
- return std::string(G->getFunc().getName());
- }
-
- std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
- std::string Result;
- raw_string_ostream OS(Result);
-
- OS << getSimpleNodeName(Node) << ":\\l";
- UseBBInfo *BI = Graph->findBBInfo(Node);
- OS << "Count : ";
- if (BI && BI->CountValid)
- OS << BI->CountValue << "\\l";
- else
- OS << "Unknown\\l";
-
- if (!PGOInstrSelect)
- return Result;
-
- for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
- auto *I = &*BI;
- if (!isa<SelectInst>(I))
- continue;
- // Display scaled counts for SELECT instruction:
- OS << "SELECT : { T = ";
- uint64_t TC, FC;
- bool HasProf = I->extractProfMetadata(TC, FC);
- if (!HasProf)
- OS << "Unknown, F = Unknown }\\l";
- else
- OS << TC << ", F = " << FC << " }\\l";
- }
- return Result;
- }
-};
-
-} // end namespace llvm
+ F->addFnAttr(Attribute::Cold);
+ LLVM_DEBUG(dbgs() << "Set cold attribute to function: " << F->getName()
+ << "\n");
+ }
+ return true;
+}
+
+PGOInstrumentationUse::PGOInstrumentationUse(std::string Filename,
+ std::string RemappingFilename,
+ bool IsCS)
+ : ProfileFileName(std::move(Filename)),
+ ProfileRemappingFileName(std::move(RemappingFilename)), IsCS(IsCS) {
+ if (!PGOTestProfileFile.empty())
+ ProfileFileName = PGOTestProfileFile;
+ if (!PGOTestProfileRemappingFile.empty())
+ ProfileRemappingFileName = PGOTestProfileRemappingFile;
+}
+
+PreservedAnalyses PGOInstrumentationUse::run(Module &M,
+ ModuleAnalysisManager &AM) {
+
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+ auto LookupBPI = [&FAM](Function &F) {
+ return &FAM.getResult<BranchProbabilityAnalysis>(F);
+ };
+ auto LookupBFI = [&FAM](Function &F) {
+ return &FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+
+ auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+
+ if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
+ LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+ auto LookupBPI = [this](Function &F) {
+ return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
+ };
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
+ LookupBFI, PSI, IsCS);
+}
+
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+ if (!Node->getName().empty())
+ return std::string(Node->getName());
+
+ std::string SimpleNodeName;
+ raw_string_ostream OS(SimpleNodeName);
+ Node->printAsOperand(OS, false);
+ return OS.str();
+}
+
+void llvm::setProfMetadata(Module *M, Instruction *TI,
+ ArrayRef<uint64_t> EdgeCounts,
+ uint64_t MaxCount) {
+ MDBuilder MDB(M->getContext());
+ assert(MaxCount > 0 && "Bad max count");
+ uint64_t Scale = calculateCountScale(MaxCount);
+ SmallVector<unsigned, 4> Weights;
+ for (const auto &ECI : EdgeCounts)
+ Weights.push_back(scaleBranchCount(ECI, Scale));
+
+ LLVM_DEBUG(dbgs() << "Weight is: "; for (const auto &W
+ : Weights) {
+ dbgs() << W << " ";
+ } dbgs() << "\n";);
+
+ TI->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+ if (EmitBranchProbability) {
+ std::string BrCondStr = getBranchCondString(TI);
+ if (BrCondStr.empty())
+ return;
+
+ uint64_t WSum =
+ std::accumulate(Weights.begin(), Weights.end(), (uint64_t)0,
+ [](uint64_t w1, uint64_t w2) { return w1 + w2; });
+ uint64_t TotalCount =
+ std::accumulate(EdgeCounts.begin(), EdgeCounts.end(), (uint64_t)0,
+ [](uint64_t c1, uint64_t c2) { return c1 + c2; });
+ Scale = calculateCountScale(WSum);
+ BranchProbability BP(scaleBranchCount(Weights[0], Scale),
+ scaleBranchCount(WSum, Scale));
+ std::string BranchProbStr;
+ raw_string_ostream OS(BranchProbStr);
+ OS << BP;
+ OS << " (total count : " << TotalCount << ")";
+ OS.flush();
+ Function *F = TI->getParent()->getParent();
+ OptimizationRemarkEmitter ORE(F);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "pgo-instrumentation", TI)
+ << BrCondStr << " is true with probability : " << BranchProbStr;
+ });
+ }
+}
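// The branch weights written into !prof are the 64-bit edge counts scaled so
// the hottest edge fits in 32 bits. A rough, self-contained sketch of that
// scaling follows; it mirrors the intent of calculateCountScale and
// scaleBranchCount (helpers defined elsewhere in the PGO instrumentation code)
// without claiming their exact formulas.

#include <algorithm>
#include <cstdint>
#include <vector>

static std::vector<uint32_t>
scaleToBranchWeights(const std::vector<uint64_t> &Counts) {
  uint64_t Max = 0;
  for (uint64_t C : Counts)
    Max = std::max(Max, C);
  // Pick a divisor large enough that Max / Scale fits in a uint32_t.
  uint64_t Scale = Max / UINT32_MAX + 1;
  std::vector<uint32_t> Weights;
  Weights.reserve(Counts.size());
  for (uint64_t C : Counts)
    Weights.push_back(static_cast<uint32_t>(C / Scale));
  return Weights;
}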
+
+namespace llvm {
+
+void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
+ MDBuilder MDB(M->getContext());
+ TI->setMetadata(llvm::LLVMContext::MD_irr_loop,
+ MDB.createIrrLoopHeaderWeight(Count));
+}
+
+template <> struct GraphTraits<PGOUseFunc *> {
+ using NodeRef = const BasicBlock *;
+ using ChildIteratorType = const_succ_iterator;
+ using nodes_iterator = pointer_iterator<Function::const_iterator>;
+
+ static NodeRef getEntryNode(const PGOUseFunc *G) {
+ return &G->getFunc().front();
+ }
+
+ static ChildIteratorType child_begin(const NodeRef N) {
+ return succ_begin(N);
+ }
+
+ static ChildIteratorType child_end(const NodeRef N) { return succ_end(N); }
+
+ static nodes_iterator nodes_begin(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().begin());
+ }
+
+ static nodes_iterator nodes_end(const PGOUseFunc *G) {
+ return nodes_iterator(G->getFunc().end());
+ }
+};
+
+template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const PGOUseFunc *G) {
+ return std::string(G->getFunc().getName());
+ }
+
+ std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+
+ OS << getSimpleNodeName(Node) << ":\\l";
+ UseBBInfo *BI = Graph->findBBInfo(Node);
+ OS << "Count : ";
+ if (BI && BI->CountValid)
+ OS << BI->CountValue << "\\l";
+ else
+ OS << "Unknown\\l";
+
+ if (!PGOInstrSelect)
+ return Result;
+
+ for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+ auto *I = &*BI;
+ if (!isa<SelectInst>(I))
+ continue;
+ // Display scaled counts for SELECT instruction:
+ OS << "SELECT : { T = ";
+ uint64_t TC, FC;
+ bool HasProf = I->extractProfMetadata(TC, FC);
+ if (!HasProf)
+ OS << "Unknown, F = Unknown }\\l";
+ else
+ OS << TC << ", F = " << FC << " }\\l";
+ }
+ return Result;
+ }
+};
+
+} // end namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index edc72d79eb..55a93b6152 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -1,527 +1,527 @@
-//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the transformation that optimizes memory intrinsics
-// such as memcpy using the size value profile. When memory intrinsic size
-// value profile metadata is available, a single memory intrinsic is expanded
-// to a sequence of guarded specialized versions that are called with the
-// hottest size(s), for later expansion into more optimal inline sequences.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===-- PGOMemOPSizeOpt.cpp - Optimizations based on value profiling ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the transformation that optimizes memory intrinsics
+// such as memcpy using the size value profile. When memory intrinsic size
+// value profile metadata is available, a single memory intrinsic is expanded
+// to a sequence of guarded specialized versions that are called with the
+// hottest size(s), for later expansion into more optimal inline sequences.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/ProfileData/InstrProf.h"
#define INSTR_PROF_VALUE_PROF_MEMOP_API
#include "llvm/ProfileData/InstrProfData.inc"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "pgo-memop-opt"
-
-STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
-STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
-
-// The minimum call count to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
- cl::init(1000),
- cl::desc("The minimum count to optimize memory "
- "intrinsic calls"));
-
-// Command line option to disable memory intrinsic optimization. The default is
-// false. This is for debugging purposes.
-static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
- cl::Hidden, cl::desc("Disable optimize"));
-
-// The percent threshold to optimize memory intrinsic calls.
-static cl::opt<unsigned>
- MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
- cl::Hidden, cl::ZeroOrMore,
- cl::desc("The percentage threshold for the "
- "memory intrinsic calls optimization"));
-
-// Maximum number of versions for optimizing memory intrinsic call.
-static cl::opt<unsigned>
- MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
- cl::ZeroOrMore,
- cl::desc("The max version for the optimized memory "
- " intrinsic calls"));
-
-// Scale the counts from the annotation using the BB count value.
-static cl::opt<bool>
- MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
- cl::desc("Scale the memop size counts using the basic "
- " block count value"));
-
-cl::opt<bool>
- MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
- cl::Hidden,
- cl::desc("Size-specialize memcmp and bcmp calls"));
-
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/PGOInstrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pgo-memop-opt"
+
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
+
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+ cl::init(1000),
+ cl::desc("The minimum count to optimize memory "
+ "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debugging purposes.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+ cl::Hidden, cl::desc("Disable optimize"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+ MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("The percentage threshold for the "
+ "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+ MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("The max version for the optimized memory "
+ " intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+ MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+ cl::desc("Scale the memop size counts using the basic "
+ " block count value"));
+
+cl::opt<bool>
+ MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
+ cl::Hidden,
+ cl::desc("Size-specialize memcmp and bcmp calls"));
+
static cl::opt<unsigned>
MemOpMaxOptSize("memop-value-prof-max-opt-size", cl::Hidden, cl::init(128),
cl::desc("Optimize the memop size <= this value"));
-namespace {
-class PGOMemOPSizeOptLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
- initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- StringRef getPassName() const override { return "PGOMemOPSize"; }
-
-private:
- bool runOnFunction(Function &F) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-} // end anonymous namespace
-
-char PGOMemOPSizeOptLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
- "Optimize memory intrinsic using its size value profile",
- false, false)
-
-FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
- return new PGOMemOPSizeOptLegacyPass();
-}
-
-namespace {
-
-static const char *getMIName(const MemIntrinsic *MI) {
- switch (MI->getIntrinsicID()) {
- case Intrinsic::memcpy:
- return "memcpy";
- case Intrinsic::memmove:
- return "memmove";
- case Intrinsic::memset:
- return "memset";
- default:
- return "unknown";
- }
-}
-
-// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
-struct MemOp {
- Instruction *I;
- MemOp(MemIntrinsic *MI) : I(MI) {}
- MemOp(CallInst *CI) : I(CI) {}
- MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
- CallInst *asCI() { return cast<CallInst>(I); }
- MemOp clone() {
- if (auto MI = asMI())
- return MemOp(cast<MemIntrinsic>(MI->clone()));
- return MemOp(cast<CallInst>(asCI()->clone()));
- }
- Value *getLength() {
- if (auto MI = asMI())
- return MI->getLength();
- return asCI()->getArgOperand(2);
- }
- void setLength(Value *Length) {
- if (auto MI = asMI())
- return MI->setLength(Length);
- asCI()->setArgOperand(2, Length);
- }
- StringRef getFuncName() {
- if (auto MI = asMI())
- return MI->getCalledFunction()->getName();
- return asCI()->getCalledFunction()->getName();
- }
- bool isMemmove() {
- if (auto MI = asMI())
- if (MI->getIntrinsicID() == Intrinsic::memmove)
- return true;
- return false;
- }
- bool isMemcmp(TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
- Func == LibFunc_memcmp) {
- return true;
- }
- return false;
- }
- bool isBcmp(TargetLibraryInfo &TLI) {
- LibFunc Func;
- if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
- Func == LibFunc_bcmp) {
- return true;
- }
- return false;
- }
- const char *getName(TargetLibraryInfo &TLI) {
- if (auto MI = asMI())
- return getMIName(MI);
- LibFunc Func;
- if (TLI.getLibFunc(*asCI(), Func)) {
- if (Func == LibFunc_memcmp)
- return "memcmp";
- if (Func == LibFunc_bcmp)
- return "bcmp";
- }
- llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
- return nullptr;
- }
-};
-
-class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
-public:
- MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE, DominatorTree *DT,
- TargetLibraryInfo &TLI)
- : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
- ValueDataArray =
- std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
- }
- bool isChanged() const { return Changed; }
- void perform() {
- WorkList.clear();
- visit(Func);
-
- for (auto &MO : WorkList) {
- ++NumOfPGOMemOPAnnotate;
- if (perform(MO)) {
- Changed = true;
- ++NumOfPGOMemOPOpt;
- LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
-                          << " is Transformed.\n");
- }
- }
- }
-
- void visitMemIntrinsic(MemIntrinsic &MI) {
- Value *Length = MI.getLength();
-    // Do not perform the optimization on constant-length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
- WorkList.push_back(MemOp(&MI));
- }
-
- void visitCallInst(CallInst &CI) {
- LibFunc Func;
- if (TLI.getLibFunc(CI, Func) &&
- (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
+namespace {
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+ initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+ bool runOnFunction(Function &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+} // end anonymous namespace
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+ "Optimize memory intrinsic using its size value profile",
+ false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+ return new PGOMemOPSizeOptLegacyPass();
+}
+
+namespace {
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
+struct MemOp {
+ Instruction *I;
+ MemOp(MemIntrinsic *MI) : I(MI) {}
+ MemOp(CallInst *CI) : I(CI) {}
+ MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
+ CallInst *asCI() { return cast<CallInst>(I); }
+ MemOp clone() {
+ if (auto MI = asMI())
+ return MemOp(cast<MemIntrinsic>(MI->clone()));
+ return MemOp(cast<CallInst>(asCI()->clone()));
+ }
+ Value *getLength() {
+ if (auto MI = asMI())
+ return MI->getLength();
+ return asCI()->getArgOperand(2);
+ }
+ void setLength(Value *Length) {
+ if (auto MI = asMI())
+ return MI->setLength(Length);
+ asCI()->setArgOperand(2, Length);
+ }
+ StringRef getFuncName() {
+ if (auto MI = asMI())
+ return MI->getCalledFunction()->getName();
+ return asCI()->getCalledFunction()->getName();
+ }
+ bool isMemmove() {
+ if (auto MI = asMI())
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return true;
+ return false;
+ }
+ bool isMemcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_memcmp) {
+ return true;
+ }
+ return false;
+ }
+ bool isBcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_bcmp) {
+ return true;
+ }
+ return false;
+ }
+ const char *getName(TargetLibraryInfo &TLI) {
+ if (auto MI = asMI())
+ return getMIName(MI);
+ LibFunc Func;
+ if (TLI.getLibFunc(*asCI(), Func)) {
+ if (Func == LibFunc_memcmp)
+ return "memcmp";
+ if (Func == LibFunc_bcmp)
+ return "bcmp";
+ }
+ llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
+ return nullptr;
+ }
+};
+
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+ MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
+ OptimizationRemarkEmitter &ORE, DominatorTree *DT,
+ TargetLibraryInfo &TLI)
+ : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
+ ValueDataArray =
+ std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+ }
+ bool isChanged() const { return Changed; }
+ void perform() {
+ WorkList.clear();
+ visit(Func);
+
+ for (auto &MO : WorkList) {
+ ++NumOfPGOMemOPAnnotate;
+ if (perform(MO)) {
+ Changed = true;
+ ++NumOfPGOMemOPOpt;
+ LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
+                          << " is Transformed.\n");
+ }
+ }
+ }
+
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+    // Do not perform the optimization on constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+ WorkList.push_back(MemOp(&MI));
+ }
+
+ void visitCallInst(CallInst &CI) {
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
!isa<ConstantInt>(CI.getArgOperand(2))) {
- WorkList.push_back(MemOp(&CI));
- }
- }
-
-private:
- Function &Func;
- BlockFrequencyInfo &BFI;
- OptimizationRemarkEmitter &ORE;
- DominatorTree *DT;
- TargetLibraryInfo &TLI;
- bool Changed;
- std::vector<MemOp> WorkList;
- // The space to read the profile annotation.
- std::unique_ptr<InstrProfValueData[]> ValueDataArray;
- bool perform(MemOp MO);
-};
-
-static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
- assert(Count <= TotalCount);
- if (Count < MemOPCountThreshold)
- return false;
- if (Count < TotalCount * MemOPPercentThreshold / 100)
- return false;
- return true;
-}
-
-static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
- uint64_t Denom) {
- if (!MemOPScaleCount)
- return Count;
- bool Overflowed;
- uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
- return ScaleCount / Denom;
-}
-
-bool MemOPSizeOpt::perform(MemOp MO) {
- assert(MO.I);
- if (MO.isMemmove())
- return false;
- if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
- return false;
-
- uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
- uint64_t TotalCount;
- if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
- ValueDataArray.get(), NumVals, TotalCount))
- return false;
-
- uint64_t ActualCount = TotalCount;
- uint64_t SavedTotalCount = TotalCount;
- if (MemOPScaleCount) {
- auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
- if (!BBEdgeCount)
- return false;
- ActualCount = *BBEdgeCount;
- }
-
- ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
- LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
- << ActualCount << "\n");
- LLVM_DEBUG(
- for (auto &VD
- : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
-
- if (ActualCount < MemOPCountThreshold)
- return false;
- // Skip if the total value profiled count is 0, in which case we can't
- // scale up the counts properly (and there is no profitable transformation).
- if (TotalCount == 0)
- return false;
-
- TotalCount = ActualCount;
- if (MemOPScaleCount)
- LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
- << " denominator = " << SavedTotalCount << "\n");
-
- // Keeping track of the count of the default case:
- uint64_t RemainCount = TotalCount;
- uint64_t SavedRemainCount = SavedTotalCount;
- SmallVector<uint64_t, 16> SizeIds;
- SmallVector<uint64_t, 16> CaseCounts;
- uint64_t MaxCount = 0;
- unsigned Version = 0;
- // Default case is in the front -- save the slot here.
- CaseCounts.push_back(0);
- for (auto &VD : VDs) {
- int64_t V = VD.Value;
- uint64_t C = VD.Count;
- if (MemOPScaleCount)
- C = getScaledCount(C, ActualCount, SavedTotalCount);
-
+ WorkList.push_back(MemOp(&CI));
+ }
+ }
+
+private:
+ Function &Func;
+ BlockFrequencyInfo &BFI;
+ OptimizationRemarkEmitter &ORE;
+ DominatorTree *DT;
+ TargetLibraryInfo &TLI;
+ bool Changed;
+ std::vector<MemOp> WorkList;
+ // The space to read the profile annotation.
+ std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+ bool perform(MemOp MO);
+};
+
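+// A profiled size is only worth its own version if its count clears the
+// absolute MemOPCountThreshold and also makes up at least
+// MemOPPercentThreshold percent of TotalCount (the not-yet-versioned
+// remainder at the call site below).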
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+ assert(Count <= TotalCount);
+ if (Count < MemOPCountThreshold)
+ return false;
+ if (Count < TotalCount * MemOPPercentThreshold / 100)
+ return false;
+ return true;
+}
+
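+// Scale a per-size profile count from the instrumented run (Denom is the
+// profiled total) to the current block count from BFI (Num), using a
+// saturating multiply to avoid overflow. This is a no-op unless
+// MemOPScaleCount is set.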
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+ uint64_t Denom) {
+ if (!MemOPScaleCount)
+ return Count;
+ bool Overflowed;
+ uint64_t ScaleCount = SaturatingMultiply(Count, Num, &Overflowed);
+ return ScaleCount / Denom;
+}
+
+bool MemOPSizeOpt::perform(MemOp MO) {
+ assert(MO.I);
+ if (MO.isMemmove())
+ return false;
+ if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
+ return false;
+
+ uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+ uint64_t TotalCount;
+ if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
+ ValueDataArray.get(), NumVals, TotalCount))
+ return false;
+
+ uint64_t ActualCount = TotalCount;
+ uint64_t SavedTotalCount = TotalCount;
+ if (MemOPScaleCount) {
+ auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
+ if (!BBEdgeCount)
+ return false;
+ ActualCount = *BBEdgeCount;
+ }
+
+ ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+ LLVM_DEBUG(dbgs() << "Read one memory intrinsic profile with count "
+ << ActualCount << "\n");
+ LLVM_DEBUG(
+ for (auto &VD
+ : VDs) { dbgs() << " (" << VD.Value << "," << VD.Count << ")\n"; });
+
+ if (ActualCount < MemOPCountThreshold)
+ return false;
+ // Skip if the total value profiled count is 0, in which case we can't
+ // scale up the counts properly (and there is no profitable transformation).
+ if (TotalCount == 0)
+ return false;
+
+ TotalCount = ActualCount;
+ if (MemOPScaleCount)
+ LLVM_DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+ << " denominator = " << SavedTotalCount << "\n");
+
+ // Keeping track of the count of the default case:
+ uint64_t RemainCount = TotalCount;
+ uint64_t SavedRemainCount = SavedTotalCount;
+ SmallVector<uint64_t, 16> SizeIds;
+ SmallVector<uint64_t, 16> CaseCounts;
+ uint64_t MaxCount = 0;
+ unsigned Version = 0;
+ // Default case is in the front -- save the slot here.
+ CaseCounts.push_back(0);
+ for (auto &VD : VDs) {
+ int64_t V = VD.Value;
+ uint64_t C = VD.Count;
+ if (MemOPScaleCount)
+ C = getScaledCount(C, ActualCount, SavedTotalCount);
+
if (!InstrProfIsSingleValRange(V) || V > MemOpMaxOptSize)
- continue;
-
- // ValueCounts are sorted on the count. Break at the first un-profitable
- // value.
- if (!isProfitable(C, RemainCount))
- break;
-
- SizeIds.push_back(V);
- CaseCounts.push_back(C);
- if (C > MaxCount)
- MaxCount = C;
-
- assert(RemainCount >= C);
- RemainCount -= C;
- assert(SavedRemainCount >= VD.Count);
- SavedRemainCount -= VD.Count;
-
- if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
- break;
- }
-
- if (Version == 0)
- return false;
-
- CaseCounts[0] = RemainCount;
- if (RemainCount > MaxCount)
- MaxCount = RemainCount;
-
- uint64_t SumForOpt = TotalCount - RemainCount;
-
- LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
- << " Versions (covering " << SumForOpt << " out of "
- << TotalCount << ")\n");
-
- // mem_op(..., size)
- // ==>
- // switch (size) {
- // case s1:
- // mem_op(..., s1);
- // goto merge_bb;
- // case s2:
- // mem_op(..., s2);
- // goto merge_bb;
- // ...
- // default:
- // mem_op(..., size);
- // goto merge_bb;
- // }
- // merge_bb:
-
- BasicBlock *BB = MO.I->getParent();
- LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
- LLVM_DEBUG(dbgs() << *BB << "\n");
- auto OrigBBFreq = BFI.getBlockFreq(BB);
-
- BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
- BasicBlock::iterator It(*MO.I);
- ++It;
- assert(It != DefaultBB->end());
- BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
- MergeBB->setName("MemOP.Merge");
- BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
- DefaultBB->setName("MemOP.Default");
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- auto &Ctx = Func.getContext();
- IRBuilder<> IRB(BB);
- BB->getTerminator()->eraseFromParent();
- Value *SizeVar = MO.getLength();
- SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
- Type *MemOpTy = MO.I->getType();
- PHINode *PHI = nullptr;
- if (!MemOpTy->isVoidTy()) {
- // Insert a phi for the return values at the merge block.
- IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
- PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
- MO.I->replaceAllUsesWith(PHI);
- PHI->addIncoming(MO.I, DefaultBB);
- }
-
- // Clear the value profile data.
- MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
- // If all promoted, we don't need the MD.prof metadata.
- if (SavedRemainCount > 0 || Version != NumVals)
- // Otherwise we need update with the un-promoted records back.
- annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
- SavedRemainCount, IPVK_MemOPSize, NumVals);
-
- LLVM_DEBUG(dbgs() << "\n\n== Basic Block After==\n");
-
- std::vector<DominatorTree::UpdateType> Updates;
- if (DT)
- Updates.reserve(2 * SizeIds.size());
-
- for (uint64_t SizeId : SizeIds) {
- BasicBlock *CaseBB = BasicBlock::Create(
- Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
- MemOp NewMO = MO.clone();
- // Fix the argument.
- auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
- assert(SizeType && "Expected integer type size argument.");
- ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
- NewMO.setLength(CaseSizeId);
- CaseBB->getInstList().push_back(NewMO.I);
- IRBuilder<> IRBCase(CaseBB);
- IRBCase.CreateBr(MergeBB);
- SI->addCase(CaseSizeId, CaseBB);
- if (!MemOpTy->isVoidTy())
- PHI->addIncoming(NewMO.I, CaseBB);
- if (DT) {
- Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
- Updates.push_back({DominatorTree::Insert, BB, CaseBB});
- }
- LLVM_DEBUG(dbgs() << *CaseBB << "\n");
- }
- DTU.applyUpdates(Updates);
- Updates.clear();
-
- setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
-
- LLVM_DEBUG(dbgs() << *BB << "\n");
- LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
- LLVM_DEBUG(dbgs() << *MergeBB << "\n");
-
- ORE.emit([&]() {
- using namespace ore;
- return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
- << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
- << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
- << " for " << NV("Versions", Version) << " versions";
- });
-
- return true;
-}
-} // namespace
-
-static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE,
- DominatorTree *DT, TargetLibraryInfo &TLI) {
- if (DisableMemOPOPT)
- return false;
-
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
- return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
- MemOPSizeOpt.perform();
- return MemOPSizeOpt.isChanged();
-}
-
-bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
- BlockFrequencyInfo &BFI =
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
-}
-
-namespace llvm {
-char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
-
-PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
- FunctionAnalysisManager &FAM) {
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
- if (!Changed)
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
-} // namespace llvm
+ continue;
+
+    // Value profile records are sorted by count. Break at the first
+    // unprofitable value.
+ if (!isProfitable(C, RemainCount))
+ break;
+
+ SizeIds.push_back(V);
+ CaseCounts.push_back(C);
+ if (C > MaxCount)
+ MaxCount = C;
+
+ assert(RemainCount >= C);
+ RemainCount -= C;
+ assert(SavedRemainCount >= VD.Count);
+ SavedRemainCount -= VD.Count;
+
+ if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+ break;
+ }
+
+ if (Version == 0)
+ return false;
+
+ CaseCounts[0] = RemainCount;
+ if (RemainCount > MaxCount)
+ MaxCount = RemainCount;
+
+ uint64_t SumForOpt = TotalCount - RemainCount;
+
+ LLVM_DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+ << " Versions (covering " << SumForOpt << " out of "
+ << TotalCount << ")\n");
+
+ // mem_op(..., size)
+ // ==>
+ // switch (size) {
+ // case s1:
+ // mem_op(..., s1);
+ // goto merge_bb;
+ // case s2:
+ // mem_op(..., s2);
+ // goto merge_bb;
+ // ...
+ // default:
+ // mem_op(..., size);
+ // goto merge_bb;
+ // }
+ // merge_bb:
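+  //
+  // Illustrative (hypothetical) instance: if the size profile shows that a
+  // memcpy's length is 8 on most executions, the variable-length call
+  //   memcpy(dst, src, n)
+  // gains a "case 8:" arm containing memcpy(dst, src, 8), which later passes
+  // can lower to a fixed-size copy, while the default arm keeps the original
+  // variable-length call for all other sizes.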
+
+ BasicBlock *BB = MO.I->getParent();
+ LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ auto OrigBBFreq = BFI.getBlockFreq(BB);
+
+ BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
+ BasicBlock::iterator It(*MO.I);
+ ++It;
+ assert(It != DefaultBB->end());
+ BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
+ MergeBB->setName("MemOP.Merge");
+ BFI.setBlockFreq(MergeBB, OrigBBFreq.getFrequency());
+ DefaultBB->setName("MemOP.Default");
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ auto &Ctx = Func.getContext();
+ IRBuilder<> IRB(BB);
+ BB->getTerminator()->eraseFromParent();
+ Value *SizeVar = MO.getLength();
+ SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+ Type *MemOpTy = MO.I->getType();
+ PHINode *PHI = nullptr;
+ if (!MemOpTy->isVoidTy()) {
+ // Insert a phi for the return values at the merge block.
+ IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
+ PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
+ MO.I->replaceAllUsesWith(PHI);
+ PHI->addIncoming(MO.I, DefaultBB);
+ }
+
+ // Clear the value profile data.
+ MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
+  // If all values were promoted, we don't need the MD.prof metadata.
+  if (SavedRemainCount > 0 || Version != NumVals)
+    // Otherwise we need to re-annotate the remaining un-promoted records.
+ annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
+ SavedRemainCount, IPVK_MemOPSize, NumVals);
+
+  LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DT)
+ Updates.reserve(2 * SizeIds.size());
+
+ for (uint64_t SizeId : SizeIds) {
+ BasicBlock *CaseBB = BasicBlock::Create(
+ Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+ MemOp NewMO = MO.clone();
+ // Fix the argument.
+ auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
+ assert(SizeType && "Expected integer type size argument.");
+ ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
+ NewMO.setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewMO.I);
+ IRBuilder<> IRBCase(CaseBB);
+ IRBCase.CreateBr(MergeBB);
+ SI->addCase(CaseSizeId, CaseBB);
+ if (!MemOpTy->isVoidTy())
+ PHI->addIncoming(NewMO.I, CaseBB);
+ if (DT) {
+ Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
+ Updates.push_back({DominatorTree::Insert, BB, CaseBB});
+ }
+ LLVM_DEBUG(dbgs() << *CaseBB << "\n");
+ }
+ DTU.applyUpdates(Updates);
+ Updates.clear();
+
+ setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+ LLVM_DEBUG(dbgs() << *BB << "\n");
+ LLVM_DEBUG(dbgs() << *DefaultBB << "\n");
+ LLVM_DEBUG(dbgs() << *MergeBB << "\n");
+
+ ORE.emit([&]() {
+ using namespace ore;
+ return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
+ << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
+ << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
+ << " for " << NV("Versions", Version) << " versions";
+ });
+
+ return true;
+}
+} // namespace
+
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
+ OptimizationRemarkEmitter &ORE,
+ DominatorTree *DT, TargetLibraryInfo &TLI) {
+ if (DisableMemOPOPT)
+ return false;
+
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
+ MemOPSizeOpt.perform();
+ return MemOPSizeOpt.isChanged();
+}
+
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+ BlockFrequencyInfo &BFI =
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
index bb822f7b27..fc52672618 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/PoisonChecking.cpp
@@ -1,359 +1,359 @@
-//===- PoisonChecking.cpp - -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements a transform pass which instruments IR such that poison semantics
-// are made explicit. That is, it provides a (possibly partial) executable
-// semantics for every instruction w.r.t. poison as specified in the LLVM
-// LangRef. There are obvious parallels to the sanitizer tools, but this pass
-// is focused purely on the semantics of LLVM IR, not any particular source
-// language. If you're looking for something to see if your C/C++ contains
-// UB, this is not it.
-//
-// The rewritten semantics of each instruction will include the following
-// components:
-//
-// 1) The original instruction, unmodified.
-// 2) A propagation rule which translates dynamic information about the poison
-// state of each input to whether the dynamic output of the instruction
-// produces poison.
-// 3) A creation rule which validates any poison producing flags on the
-// instruction itself (e.g. checks for overflow on nsw).
-// 4) A check rule which traps (to a handler function) if this instruction must
-//    execute undefined behavior given the poison state of its inputs.
-//
-// This is a must analysis based transform; that is, the resulting code may
-// produce a false negative result (failing to report UB that actually exists
-// according to the LangRef spec), but should never produce a false positive
-// (report UB where it doesn't exist).
-//
-// Use cases for this pass include:
-// - Understanding (and testing!) the implications of the definition of poison
-// from the LangRef.
-// - Validating the output of an IR fuzzer to ensure that all programs produced
-// are well defined on the specific input used.
-// - Finding/confirming poison specific miscompiles by checking the poison
-// status of an input/IR pair is the same before and after an optimization
-// transform.
-// - Checking that a bugpoint reduction does not introduce UB which didn't
-// exist in the original program being reduced.
-//
-// The major sources of inaccuracy are currently:
-// - Most validation rules not yet implemented for instructions with poison
-//   relevant flags. At the moment, only nsw/nuw on add/sub are supported.
-// - UB which is control dependent on a branch on poison is not yet
-// reported. Currently, only data flow dependence is modeled.
-// - Poison which is propagated through memory is not modeled. As such,
-// storing poison to memory and then reloading it will cause a false negative
-// as we consider the reloaded value to not be poisoned.
-// - Poison propagation across function boundaries is not modeled. At the
-// moment, all arguments and return values are assumed not to be poison.
-// - Undef is not modeled. In particular, the optimizer's freedom to pick
-// concrete values for undef bits so as to maximize potential for producing
-// poison is not modeled.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "poison-checking"
-
-static cl::opt<bool>
-LocalCheck("poison-checking-function-local",
- cl::init(false),
- cl::desc("Check that returns are non-poison (for testing)"));
-
-
-static bool isConstantFalse(Value* V) {
- assert(V->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(V))
- return CI->isZero();
- return false;
-}
-
-static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
- if (Ops.size() == 0)
- return B.getFalse();
- unsigned i = 0;
- for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {}
- if (i == Ops.size())
- return B.getFalse();
- Value *Accum = Ops[i++];
- for (; i < Ops.size(); i++)
- if (!isConstantFalse(Ops[i]))
- Accum = B.CreateOr(Accum, Ops[i]);
- return Accum;
-}
-
-static void generateCreationChecksForBinOp(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- assert(isa<BinaryOperator>(I));
-
- IRBuilder<> B(&I);
- Value *LHS = I.getOperand(0);
- Value *RHS = I.getOperand(1);
- switch (I.getOpcode()) {
- default:
- return;
- case Instruction::Add: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Sub: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::Mul: {
- if (I.hasNoSignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- if (I.hasNoUnsignedWrap()) {
- auto *OverflowOp =
- B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
- Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
- }
- break;
- }
- case Instruction::UDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::SDiv: {
- if (I.isExact()) {
- auto *Check =
- B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS),
- ConstantInt::get(LHS->getType(), 0));
- Checks.push_back(Check);
- }
- break;
- }
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::Shl: {
- Value *ShiftCheck =
- B.CreateICmp(ICmpInst::ICMP_UGE, RHS,
- ConstantInt::get(RHS->getType(),
- LHS->getType()->getScalarSizeInBits()));
- Checks.push_back(ShiftCheck);
- break;
- }
- };
-}
-
-/// Given an instruction which can produce poison on non-poison inputs
-/// (i.e. canCreatePoison returns true), generate runtime checks to produce
-/// boolean indicators of when poison would result.
-static void generateCreationChecks(Instruction &I,
- SmallVectorImpl<Value*> &Checks) {
- IRBuilder<> B(&I);
- if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
- generateCreationChecksForBinOp(I, Checks);
-
- // Handle non-binops separately
- switch (I.getOpcode()) {
- default:
- // Note there are a couple of missing cases here, once implemented, this
- // should become an llvm_unreachable.
- break;
- case Instruction::ExtractElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(1);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- case Instruction::InsertElement: {
- Value *Vec = I.getOperand(0);
- auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
- if (!VecVTy)
- break;
- Value *Idx = I.getOperand(2);
- unsigned NumElts = VecVTy->getNumElements();
- Value *Check =
- B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
- ConstantInt::get(Idx->getType(), NumElts));
- Checks.push_back(Check);
- break;
- }
- };
-}
-
-static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
- auto Itr = ValToPoison.find(V);
- if (Itr != ValToPoison.end())
- return Itr->second;
- if (isa<Constant>(V)) {
- return ConstantInt::getFalse(V->getContext());
- }
-  // Return false for unknown values - this implements a non-strict mode where
- // unhandled IR constructs are simply considered to never produce poison. At
- // some point in the future, we probably want a "strict mode" for testing if
- // nothing else.
- return ConstantInt::getFalse(V->getContext());
-}
-
-static void CreateAssert(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- if (auto *CI = dyn_cast<ConstantInt>(Cond))
- if (CI->isAllOnesValue())
- return;
-
- Module *M = B.GetInsertBlock()->getModule();
- M->getOrInsertFunction("__poison_checker_assert",
- Type::getVoidTy(M->getContext()),
- Type::getInt1Ty(M->getContext()));
- Function *TrapFunc = M->getFunction("__poison_checker_assert");
- B.CreateCall(TrapFunc, Cond);
-}
-
-static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
- assert(Cond->getType()->isIntegerTy(1));
- CreateAssert(B, B.CreateNot(Cond));
-}
-
-static bool rewrite(Function &F) {
- auto * const Int1Ty = Type::getInt1Ty(F.getContext());
-
- DenseMap<Value *, Value *> ValToPoison;
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
- NewPHI->addIncoming(UndefValue::get(Int1Ty),
- OldPHI->getIncomingBlock(i));
- NewPHI->insertBefore(OldPHI);
- ValToPoison[OldPHI] = NewPHI;
- }
-
- for (BasicBlock &BB : F)
- for (Instruction &I : BB) {
- if (isa<PHINode>(I)) continue;
-
- IRBuilder<> B(cast<Instruction>(&I));
-
- // Note: There are many more sources of documented UB, but this pass only
- // attempts to find UB triggered by propagation of poison.
+//===- PoisonChecking.cpp - -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements a transform pass which instruments IR such that poison semantics
+// are made explicit. That is, it provides a (possibly partial) executable
+// semantics for every instruction w.r.t. poison as specified in the LLVM
+// LangRef. There are obvious parallels to the sanitizer tools, but this pass
+// is focused purely on the semantics of LLVM IR, not any particular source
+// language. If you're looking for something to see if your C/C++ contains
+// UB, this is not it.
+//
+// The rewritten semantics of each instruction will include the following
+// components:
+//
+// 1) The original instruction, unmodified.
+// 2) A propagation rule which translates dynamic information about the poison
+// state of each input to whether the dynamic output of the instruction
+// produces poison.
+// 3) A creation rule which validates any poison producing flags on the
+// instruction itself (e.g. checks for overflow on nsw).
+// 4) A check rule which traps (to a handler function) if this instruction must
+//    execute undefined behavior given the poison state of its inputs.
+//
+// This is a must analysis based transform; that is, the resulting code may
+// produce a false negative result (failing to report UB that actually exists
+// according to the LangRef spec), but should never produce a false positive
+// (report UB where it doesn't exist).
+//
+// Use cases for this pass include:
+// - Understanding (and testing!) the implications of the definition of poison
+// from the LangRef.
+// - Validating the output of an IR fuzzer to ensure that all programs produced
+// are well defined on the specific input used.
+// - Finding/confirming poison specific miscompiles by checking the poison
+// status of an input/IR pair is the same before and after an optimization
+// transform.
+// - Checking that a bugpoint reduction does not introduce UB which didn't
+// exist in the original program being reduced.
+//
+// The major sources of inaccuracy are currently:
+// - Most validation rules not yet implemented for instructions with poison
+//   relevant flags. At the moment, only nsw/nuw on add/sub are supported.
+// - UB which is control dependent on a branch on poison is not yet
+// reported. Currently, only data flow dependence is modeled.
+// - Poison which is propagated through memory is not modeled. As such,
+// storing poison to memory and then reloading it will cause a false negative
+// as we consider the reloaded value to not be poisoned.
+// - Poison propagation across function boundaries is not modeled. At the
+// moment, all arguments and return values are assumed not to be poison.
+// - Undef is not modeled. In particular, the optimizer's freedom to pick
+// concrete values for undef bits so as to maximize potential for producing
+// poison is not modeled.
+//
+//===----------------------------------------------------------------------===//
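+//
+// Illustrative sketch only (the %poison.* names are hypothetical; the pass
+// tracks these values in a side map rather than emitting them verbatim):
+// for an instruction such as
+//   %r = add nsw i32 %a, %b
+// the rewritten function conceptually computes the overflow bit of
+// llvm.sadd.with.overflow(%a, %b) as the creation check, ORs it with the
+// operand poison flags %poison.a and %poison.b to form %poison.r, and passes
+// the poison flag of any must-not-be-poison operand to the
+// __poison_checker_assert handler.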
+
+#include "llvm/Transforms/Instrumentation/PoisonChecking.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "poison-checking"
+
+static cl::opt<bool>
+LocalCheck("poison-checking-function-local",
+ cl::init(false),
+ cl::desc("Check that returns are non-poison (for testing)"));
+
+
+static bool isConstantFalse(Value* V) {
+ assert(V->getType()->isIntegerTy(1));
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ return CI->isZero();
+ return false;
+}
+
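+// OR together all poison indicators that are not constant-false; returns the
+// i1 false constant when every indicator is trivially false.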
+static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
+ if (Ops.size() == 0)
+ return B.getFalse();
+ unsigned i = 0;
+ for (; i < Ops.size() && isConstantFalse(Ops[i]); i++) {}
+ if (i == Ops.size())
+ return B.getFalse();
+ Value *Accum = Ops[i++];
+ for (; i < Ops.size(); i++)
+ if (!isConstantFalse(Ops[i]))
+ Accum = B.CreateOr(Accum, Ops[i]);
+ return Accum;
+}
+
+static void generateCreationChecksForBinOp(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
+ assert(isa<BinaryOperator>(I));
+
+ IRBuilder<> B(&I);
+ Value *LHS = I.getOperand(0);
+ Value *RHS = I.getOperand(1);
+ switch (I.getOpcode()) {
+ default:
+ return;
+ case Instruction::Add: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::sadd_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::uadd_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::Sub: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::ssub_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::usub_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::Mul: {
+ if (I.hasNoSignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::smul_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ if (I.hasNoUnsignedWrap()) {
+ auto *OverflowOp =
+ B.CreateBinaryIntrinsic(Intrinsic::umul_with_overflow, LHS, RHS);
+ Checks.push_back(B.CreateExtractValue(OverflowOp, 1));
+ }
+ break;
+ }
+ case Instruction::UDiv: {
+ if (I.isExact()) {
+ auto *Check =
+ B.CreateICmp(ICmpInst::ICMP_NE, B.CreateURem(LHS, RHS),
+ ConstantInt::get(LHS->getType(), 0));
+ Checks.push_back(Check);
+ }
+ break;
+ }
+ case Instruction::SDiv: {
+ if (I.isExact()) {
+ auto *Check =
+ B.CreateICmp(ICmpInst::ICMP_NE, B.CreateSRem(LHS, RHS),
+ ConstantInt::get(LHS->getType(), 0));
+ Checks.push_back(Check);
+ }
+ break;
+ }
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::Shl: {
+ Value *ShiftCheck =
+ B.CreateICmp(ICmpInst::ICMP_UGE, RHS,
+ ConstantInt::get(RHS->getType(),
+ LHS->getType()->getScalarSizeInBits()));
+ Checks.push_back(ShiftCheck);
+ break;
+ }
+ };
+}
+
+/// Given an instruction which can produce poison on non-poison inputs
+/// (i.e. canCreatePoison returns true), generate runtime checks to produce
+/// boolean indicators of when poison would result.
+static void generateCreationChecks(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
+ IRBuilder<> B(&I);
+ if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
+ generateCreationChecksForBinOp(I, Checks);
+
+ // Handle non-binops separately
+ switch (I.getOpcode()) {
+ default:
+    // Note: there are a couple of missing cases here; once they are
+    // implemented, this should become an llvm_unreachable.
+ break;
+ case Instruction::ExtractElement: {
+ Value *Vec = I.getOperand(0);
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
+ break;
+ Value *Idx = I.getOperand(1);
+ unsigned NumElts = VecVTy->getNumElements();
+ Value *Check =
+ B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+ ConstantInt::get(Idx->getType(), NumElts));
+ Checks.push_back(Check);
+ break;
+ }
+ case Instruction::InsertElement: {
+ Value *Vec = I.getOperand(0);
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
+ break;
+ Value *Idx = I.getOperand(2);
+ unsigned NumElts = VecVTy->getNumElements();
+ Value *Check =
+ B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
+ ConstantInt::get(Idx->getType(), NumElts));
+ Checks.push_back(Check);
+ break;
+ }
+ };
+}
+
+static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
+ auto Itr = ValToPoison.find(V);
+ if (Itr != ValToPoison.end())
+ return Itr->second;
+ if (isa<Constant>(V)) {
+ return ConstantInt::getFalse(V->getContext());
+ }
+  // Return false for unknown values - this implements a non-strict mode where
+ // unhandled IR constructs are simply considered to never produce poison. At
+ // some point in the future, we probably want a "strict mode" for testing if
+ // nothing else.
+ return ConstantInt::getFalse(V->getContext());
+}
+
+static void CreateAssert(IRBuilder<> &B, Value *Cond) {
+ assert(Cond->getType()->isIntegerTy(1));
+ if (auto *CI = dyn_cast<ConstantInt>(Cond))
+ if (CI->isAllOnesValue())
+ return;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ M->getOrInsertFunction("__poison_checker_assert",
+ Type::getVoidTy(M->getContext()),
+ Type::getInt1Ty(M->getContext()));
+ Function *TrapFunc = M->getFunction("__poison_checker_assert");
+ B.CreateCall(TrapFunc, Cond);
+}
+
+static void CreateAssertNot(IRBuilder<> &B, Value *Cond) {
+ assert(Cond->getType()->isIntegerTy(1));
+ CreateAssert(B, B.CreateNot(Cond));
+}
+
+static bool rewrite(Function &F) {
+ auto * const Int1Ty = Type::getInt1Ty(F.getContext());
+
+ DenseMap<Value *, Value *> ValToPoison;
+
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+ auto *OldPHI = cast<PHINode>(&*I);
+ auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
+ for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
+ NewPHI->addIncoming(UndefValue::get(Int1Ty),
+ OldPHI->getIncomingBlock(i));
+ NewPHI->insertBefore(OldPHI);
+ ValToPoison[OldPHI] = NewPHI;
+ }
+
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB) {
+ if (isa<PHINode>(I)) continue;
+
+ IRBuilder<> B(cast<Instruction>(&I));
+
+ // Note: There are many more sources of documented UB, but this pass only
+ // attempts to find UB triggered by propagation of poison.
SmallPtrSet<const Value *, 4> NonPoisonOps;
getGuaranteedNonPoisonOps(&I, NonPoisonOps);
for (const Value *Op : NonPoisonOps)
CreateAssertNot(B, getPoisonFor(ValToPoison, const_cast<Value *>(Op)));
-
- if (LocalCheck)
- if (auto *RI = dyn_cast<ReturnInst>(&I))
- if (RI->getNumOperands() != 0) {
- Value *Op = RI->getOperand(0);
- CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
- }
-
- SmallVector<Value*, 4> Checks;
+
+ if (LocalCheck)
+ if (auto *RI = dyn_cast<ReturnInst>(&I))
+ if (RI->getNumOperands() != 0) {
+ Value *Op = RI->getOperand(0);
+ CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
+ }
+
+ SmallVector<Value*, 4> Checks;
if (propagatesPoison(cast<Operator>(&I)))
- for (Value *V : I.operands())
- Checks.push_back(getPoisonFor(ValToPoison, V));
-
+ for (Value *V : I.operands())
+ Checks.push_back(getPoisonFor(ValToPoison, V));
+
if (canCreatePoison(cast<Operator>(&I)))
- generateCreationChecks(I, Checks);
- ValToPoison[&I] = buildOrChain(B, Checks);
- }
-
- for (BasicBlock &BB : F)
- for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
- auto *OldPHI = cast<PHINode>(&*I);
- if (!ValToPoison.count(OldPHI))
- continue; // skip the newly inserted phis
- auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
- for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
- auto *OldVal = OldPHI->getIncomingValue(i);
- NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
- }
- }
- return true;
-}
-
-
-PreservedAnalyses PoisonCheckingPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- bool Changed = false;
- for (auto &F : M)
- Changed |= rewrite(F);
-
- return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-PreservedAnalyses PoisonCheckingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
-
-/* Major TODO Items:
- - Control dependent poison UB
- - Strict mode - (i.e. must analyze every operand)
- - Poison through memory
- - Function ABIs
- - Full coverage of intrinsics, etc.. (ouch)
-
- Instructions w/Unclear Semantics:
- - shufflevector - It would seem reasonable for an out of bounds mask element
- to produce poison, but the LangRef does not state.
- - all binary ops w/vector operands - The likely interpretation would be that
- any element overflowing should produce poison for the entire result, but
- the LangRef does not state.
- - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
-   strange that only certain flags should be documented as producing poison.
-
- Cases of clear poison semantics not yet implemented:
- - Exact flags on ashr/lshr produce poison
- - NSW/NUW flags on shl produce poison
- - Inbounds flag on getelementptr produce poison
- - fptosi/fptoui (out of bounds input) produce poison
- - Scalable vector types for insertelement/extractelement
- - Floating point binary ops w/fmf nnan/noinfs flags produce poison
- */
+ generateCreationChecks(I, Checks);
+ ValToPoison[&I] = buildOrChain(B, Checks);
+ }
+
+ for (BasicBlock &BB : F)
+ for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
+ auto *OldPHI = cast<PHINode>(&*I);
+ if (!ValToPoison.count(OldPHI))
+ continue; // skip the newly inserted phis
+ auto *NewPHI = cast<PHINode>(ValToPoison[OldPHI]);
+ for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++) {
+ auto *OldVal = OldPHI->getIncomingValue(i);
+ NewPHI->setIncomingValue(i, getPoisonFor(ValToPoison, OldVal));
+ }
+ }
+ return true;
+}
+
+
+PreservedAnalyses PoisonCheckingPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ for (auto &F : M)
+ Changed |= rewrite(F);
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+PreservedAnalyses PoisonCheckingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
+
+/* Major TODO Items:
+ - Control dependent poison UB
+ - Strict mode - (i.e. must analyze every operand)
+ - Poison through memory
+ - Function ABIs
+ - Full coverage of intrinsics, etc.. (ouch)
+
+ Instructions w/Unclear Semantics:
+ - shufflevector - It would seem reasonable for an out of bounds mask element
+ to produce poison, but the LangRef does not state.
+ - all binary ops w/vector operands - The likely interpretation would be that
+ any element overflowing should produce poison for the entire result, but
+ the LangRef does not state.
+ - Floating point binary ops w/fmf flags other than (nnan, noinfs). It seems
+   strange that only certain flags should be documented as producing poison.
+
+ Cases of clear poison semantics not yet implemented:
+ - Exact flags on ashr/lshr produce poison
+ - NSW/NUW flags on shl produce poison
+ - Inbounds flag on getelementptr produces poison
+ - fptosi/fptoui (out of bounds input) produce poison
+ - Scalable vector types for insertelement/extractelement
+ - Floating point binary ops w/fmf nnan/noinfs flags produce poison
+ */
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 656cf6267b..2d4b079394 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1,50 +1,50 @@
-//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/SpecialCaseList.h"
-#include "llvm/Support/VirtualFileSystem.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "sancov"
-
+//===-- SanitizerCoverage.cpp - coverage instrumentation for sanitizers ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Coverage instrumentation done on LLVM IR level, works with Sanitizers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sancov"
+
const char SanCovTracePCIndirName[] = "__sanitizer_cov_trace_pc_indir";
const char SanCovTracePCName[] = "__sanitizer_cov_trace_pc";
const char SanCovTraceCmp1[] = "__sanitizer_cov_trace_cmp1";
@@ -60,935 +60,935 @@ const char SanCovTraceDiv8[] = "__sanitizer_cov_trace_div8";
const char SanCovTraceGep[] = "__sanitizer_cov_trace_gep";
const char SanCovTraceSwitchName[] = "__sanitizer_cov_trace_switch";
const char SanCovModuleCtorTracePcGuardName[] =
- "sancov.module_ctor_trace_pc_guard";
+ "sancov.module_ctor_trace_pc_guard";
const char SanCovModuleCtor8bitCountersName[] =
- "sancov.module_ctor_8bit_counters";
+ "sancov.module_ctor_8bit_counters";
const char SanCovModuleCtorBoolFlagName[] = "sancov.module_ctor_bool_flag";
-static const uint64_t SanCtorAndDtorPriority = 2;
-
+static const uint64_t SanCtorAndDtorPriority = 2;
+
const char SanCovTracePCGuardName[] = "__sanitizer_cov_trace_pc_guard";
const char SanCovTracePCGuardInitName[] = "__sanitizer_cov_trace_pc_guard_init";
const char SanCov8bitCountersInitName[] = "__sanitizer_cov_8bit_counters_init";
const char SanCovBoolFlagInitName[] = "__sanitizer_cov_bool_flag_init";
const char SanCovPCsInitName[] = "__sanitizer_cov_pcs_init";
-
+
const char SanCovGuardsSectionName[] = "sancov_guards";
const char SanCovCountersSectionName[] = "sancov_cntrs";
const char SanCovBoolFlagSectionName[] = "sancov_bools";
const char SanCovPCsSectionName[] = "sancov_pcs";
-
+
const char SanCovLowestStackName[] = "__sancov_lowest_stack";
-
-static cl::opt<int> ClCoverageLevel(
- "sanitizer-coverage-level",
- cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
- "3: all blocks and critical edges"),
- cl::Hidden, cl::init(0));
-
-static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
- cl::desc("Experimental pc tracing"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
- cl::desc("pc tracing with a guard"),
- cl::Hidden, cl::init(false));
-
-// If true, we create a global variable that contains PCs of all instrumented
-// BBs, put this global into a named section, and pass this section's bounds
-// to __sanitizer_cov_pcs_init.
-// This way the coverage instrumentation does not need to acquire the PCs
-// at run-time. Works with trace-pc-guard, inline-8bit-counters, and
-// inline-bool-flag.
-static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table",
- cl::desc("create a static PC table"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
- cl::desc("increments 8-bit counter for every edge"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
- cl::desc("sets a boolean flag for every edge"), cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool>
- ClCMPTracing("sanitizer-coverage-trace-compares",
- cl::desc("Tracing of CMP and similar instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
- cl::desc("Tracing of DIV instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
- cl::desc("Tracing of GEP instructions"),
- cl::Hidden, cl::init(false));
-
-static cl::opt<bool>
- ClPruneBlocks("sanitizer-coverage-prune-blocks",
- cl::desc("Reduce the number of instrumented blocks"),
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth",
- cl::desc("max stack depth tracing"),
- cl::Hidden, cl::init(false));
-
-namespace {
-
-SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
- SanitizerCoverageOptions Res;
- switch (LegacyCoverageLevel) {
- case 0:
- Res.CoverageType = SanitizerCoverageOptions::SCK_None;
- break;
- case 1:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Function;
- break;
- case 2:
- Res.CoverageType = SanitizerCoverageOptions::SCK_BB;
- break;
- case 3:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
- break;
- case 4:
- Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
- Res.IndirectCalls = true;
- break;
- }
- return Res;
-}
-
-SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
- // Sets CoverageType and IndirectCalls.
- SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
- Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
- Options.IndirectCalls |= CLOpts.IndirectCalls;
- Options.TraceCmp |= ClCMPTracing;
- Options.TraceDiv |= ClDIVTracing;
- Options.TraceGep |= ClGEPTracing;
- Options.TracePC |= ClTracePC;
- Options.TracePCGuard |= ClTracePCGuard;
- Options.Inline8bitCounters |= ClInline8bitCounters;
- Options.InlineBoolFlag |= ClInlineBoolFlag;
- Options.PCTable |= ClCreatePCTable;
- Options.NoPrune |= !ClPruneBlocks;
- Options.StackDepth |= ClStackDepth;
- if (!Options.TracePCGuard && !Options.TracePC &&
- !Options.Inline8bitCounters && !Options.StackDepth &&
- !Options.InlineBoolFlag)
- Options.TracePCGuard = true; // TracePCGuard is default.
- return Options;
-}
-
-using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>;
-using PostDomTreeCallback =
- function_ref<const PostDominatorTree *(Function &F)>;
-
-class ModuleSanitizerCoverage {
-public:
- ModuleSanitizerCoverage(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
- const SpecialCaseList *Allowlist = nullptr,
- const SpecialCaseList *Blocklist = nullptr)
- : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
- Blocklist(Blocklist) {}
- bool instrumentModule(Module &M, DomTreeCallback DTCallback,
- PostDomTreeCallback PDTCallback);
-
-private:
- void instrumentFunction(Function &F, DomTreeCallback DTCallback,
- PostDomTreeCallback PDTCallback);
- void InjectCoverageForIndirectCalls(Function &F,
- ArrayRef<Instruction *> IndirCalls);
- void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
- void InjectTraceForDiv(Function &F,
- ArrayRef<BinaryOperator *> DivTraceTargets);
- void InjectTraceForGep(Function &F,
- ArrayRef<GetElementPtrInst *> GepTraceTargets);
- void InjectTraceForSwitch(Function &F,
- ArrayRef<Instruction *> SwitchTraceTargets);
- bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
- bool IsLeafFunc = true);
- GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
- Function &F, Type *Ty,
- const char *Section);
- GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks);
- void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
- bool IsLeafFunc = true);
- Function *CreateInitCallsForSections(Module &M, const char *CtorName,
- const char *InitFunctionName, Type *Ty,
- const char *Section);
- std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
- Type *Ty);
-
- void SetNoSanitizeMetadata(Instruction *I) {
- I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
- MDNode::get(*C, None));
- }
-
- std::string getSectionName(const std::string &Section) const;
- std::string getSectionStart(const std::string &Section) const;
- std::string getSectionEnd(const std::string &Section) const;
- FunctionCallee SanCovTracePCIndir;
- FunctionCallee SanCovTracePC, SanCovTracePCGuard;
- FunctionCallee SanCovTraceCmpFunction[4];
- FunctionCallee SanCovTraceConstCmpFunction[4];
- FunctionCallee SanCovTraceDivFunction[2];
- FunctionCallee SanCovTraceGepFunction;
- FunctionCallee SanCovTraceSwitchFunction;
- GlobalVariable *SanCovLowestStack;
- Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
- *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
- Module *CurModule;
- std::string CurModuleUniqueId;
- Triple TargetTriple;
- LLVMContext *C;
- const DataLayout *DL;
-
- GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
- GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
- GlobalVariable *FunctionBoolArray; // for inline-bool-flag.
- GlobalVariable *FunctionPCsArray; // for pc-table.
- SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
- SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
-
- SanitizerCoverageOptions Options;
-
- const SpecialCaseList *Allowlist;
- const SpecialCaseList *Blocklist;
-};
-
-class ModuleSanitizerCoverageLegacyPass : public ModulePass {
-public:
- ModuleSanitizerCoverageLegacyPass(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
- const std::vector<std::string> &AllowlistFiles =
- std::vector<std::string>(),
- const std::vector<std::string> &BlocklistFiles =
- std::vector<std::string>())
- : ModulePass(ID), Options(Options) {
- if (AllowlistFiles.size() > 0)
- Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
- *vfs::getRealFileSystem());
- if (BlocklistFiles.size() > 0)
- Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
- *vfs::getRealFileSystem());
- initializeModuleSanitizerCoverageLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnModule(Module &M) override {
- ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
- Blocklist.get());
- auto DTCallback = [this](Function &F) -> const DominatorTree * {
- return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
- };
- auto PDTCallback = [this](Function &F) -> const PostDominatorTree * {
- return &this->getAnalysis<PostDominatorTreeWrapperPass>(F)
- .getPostDomTree();
- };
- return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback);
- }
-
- static char ID; // Pass identification, replacement for typeid
- StringRef getPassName() const override { return "ModuleSanitizerCoverage"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- }
-
-private:
- SanitizerCoverageOptions Options;
-
- std::unique_ptr<SpecialCaseList> Allowlist;
- std::unique_ptr<SpecialCaseList> Blocklist;
-};
-
-} // namespace
-
-PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
- Blocklist.get());
- auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
- return &FAM.getResult<DominatorTreeAnalysis>(F);
- };
- auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * {
- return &FAM.getResult<PostDominatorTreeAnalysis>(F);
- };
- if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-std::pair<Value *, Value *>
-ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
- Type *Ty) {
+
+static cl::opt<int> ClCoverageLevel(
+ "sanitizer-coverage-level",
+ cl::desc("Sanitizer Coverage. 0: none, 1: entry block, 2: all blocks, "
+ "3: all blocks and critical edges"),
+ cl::Hidden, cl::init(0));
+
+static cl::opt<bool> ClTracePC("sanitizer-coverage-trace-pc",
+ cl::desc("Experimental pc tracing"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
+ cl::desc("pc tracing with a guard"),
+ cl::Hidden, cl::init(false));
+
+// If true, we create a global variable that contains PCs of all instrumented
+// BBs, put this global into a named section, and pass this section's bounds
+// to __sanitizer_cov_pcs_init.
+// This way the coverage instrumentation does not need to acquire the PCs
+// at run-time. Works with trace-pc-guard, inline-8bit-counters, and
+// inline-bool-flag.
+static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table",
+ cl::desc("create a static PC table"),
+ cl::Hidden, cl::init(false));
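+// For example, a function with an instrumented entry block and two other
+// instrumented blocks contributes six intptr-sized entries to that section,
+// roughly { &F, 0x1, &BB1, 0x0, &BB2, 0x0 }, i.e. {PC, Flags} pairs where
+// bit 0 of Flags marks a function entry (see CreatePCArray below).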
+
+static cl::opt<bool>
+ ClInline8bitCounters("sanitizer-coverage-inline-8bit-counters",
+ cl::desc("increments 8-bit counter for every edge"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
+ cl::desc("sets a boolean flag for every edge"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool>
+ ClCMPTracing("sanitizer-coverage-trace-compares",
+ cl::desc("Tracing of CMP and similar instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClDIVTracing("sanitizer-coverage-trace-divs",
+ cl::desc("Tracing of DIV instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool> ClGEPTracing("sanitizer-coverage-trace-geps",
+ cl::desc("Tracing of GEP instructions"),
+ cl::Hidden, cl::init(false));
+
+static cl::opt<bool>
+ ClPruneBlocks("sanitizer-coverage-prune-blocks",
+ cl::desc("Reduce the number of instrumented blocks"),
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClStackDepth("sanitizer-coverage-stack-depth",
+ cl::desc("max stack depth tracing"),
+ cl::Hidden, cl::init(false));
+
+namespace {
+
+SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) {
+ SanitizerCoverageOptions Res;
+ switch (LegacyCoverageLevel) {
+ case 0:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_None;
+ break;
+ case 1:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Function;
+ break;
+ case 2:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_BB;
+ break;
+ case 3:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ break;
+ case 4:
+ Res.CoverageType = SanitizerCoverageOptions::SCK_Edge;
+ Res.IndirectCalls = true;
+ break;
+ }
+ return Res;
+}
+
+SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
+ // Sets CoverageType and IndirectCalls.
+ SanitizerCoverageOptions CLOpts = getOptions(ClCoverageLevel);
+ Options.CoverageType = std::max(Options.CoverageType, CLOpts.CoverageType);
+ Options.IndirectCalls |= CLOpts.IndirectCalls;
+ Options.TraceCmp |= ClCMPTracing;
+ Options.TraceDiv |= ClDIVTracing;
+ Options.TraceGep |= ClGEPTracing;
+ Options.TracePC |= ClTracePC;
+ Options.TracePCGuard |= ClTracePCGuard;
+ Options.Inline8bitCounters |= ClInline8bitCounters;
+ Options.InlineBoolFlag |= ClInlineBoolFlag;
+ Options.PCTable |= ClCreatePCTable;
+ Options.NoPrune |= !ClPruneBlocks;
+ Options.StackDepth |= ClStackDepth;
+ if (!Options.TracePCGuard && !Options.TracePC &&
+ !Options.Inline8bitCounters && !Options.StackDepth &&
+ !Options.InlineBoolFlag)
+ Options.TracePCGuard = true; // TracePCGuard is default.
+ return Options;
+}
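+// The command-line flags can only strengthen what the front end requested:
+// e.g. a front end asking for SCK_Function coverage combined with
+// -sanitizer-coverage-level=3 (typically passed via -mllvm) yields SCK_Edge,
+// i.e. all blocks plus split critical edges; the flags never lower the
+// coverage level or clear a tracing option.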
+
+using DomTreeCallback = function_ref<const DominatorTree *(Function &F)>;
+using PostDomTreeCallback =
+ function_ref<const PostDominatorTree *(Function &F)>;
+
+class ModuleSanitizerCoverage {
+public:
+ ModuleSanitizerCoverage(
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const SpecialCaseList *Allowlist = nullptr,
+ const SpecialCaseList *Blocklist = nullptr)
+ : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
+ Blocklist(Blocklist) {}
+ bool instrumentModule(Module &M, DomTreeCallback DTCallback,
+ PostDomTreeCallback PDTCallback);
+
+private:
+ void instrumentFunction(Function &F, DomTreeCallback DTCallback,
+ PostDomTreeCallback PDTCallback);
+ void InjectCoverageForIndirectCalls(Function &F,
+ ArrayRef<Instruction *> IndirCalls);
+ void InjectTraceForCmp(Function &F, ArrayRef<Instruction *> CmpTraceTargets);
+ void InjectTraceForDiv(Function &F,
+ ArrayRef<BinaryOperator *> DivTraceTargets);
+ void InjectTraceForGep(Function &F,
+ ArrayRef<GetElementPtrInst *> GepTraceTargets);
+ void InjectTraceForSwitch(Function &F,
+ ArrayRef<Instruction *> SwitchTraceTargets);
+ bool InjectCoverage(Function &F, ArrayRef<BasicBlock *> AllBlocks,
+ bool IsLeafFunc = true);
+ GlobalVariable *CreateFunctionLocalArrayInSection(size_t NumElements,
+ Function &F, Type *Ty,
+ const char *Section);
+ GlobalVariable *CreatePCArray(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ void CreateFunctionLocalArrays(Function &F, ArrayRef<BasicBlock *> AllBlocks);
+ void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx,
+ bool IsLeafFunc = true);
+ Function *CreateInitCallsForSections(Module &M, const char *CtorName,
+ const char *InitFunctionName, Type *Ty,
+ const char *Section);
+ std::pair<Value *, Value *> CreateSecStartEnd(Module &M, const char *Section,
+ Type *Ty);
+
+ void SetNoSanitizeMetadata(Instruction *I) {
+ I->setMetadata(I->getModule()->getMDKindID("nosanitize"),
+ MDNode::get(*C, None));
+ }
+
+ std::string getSectionName(const std::string &Section) const;
+ std::string getSectionStart(const std::string &Section) const;
+ std::string getSectionEnd(const std::string &Section) const;
+ FunctionCallee SanCovTracePCIndir;
+ FunctionCallee SanCovTracePC, SanCovTracePCGuard;
+ FunctionCallee SanCovTraceCmpFunction[4];
+ FunctionCallee SanCovTraceConstCmpFunction[4];
+ FunctionCallee SanCovTraceDivFunction[2];
+ FunctionCallee SanCovTraceGepFunction;
+ FunctionCallee SanCovTraceSwitchFunction;
+ GlobalVariable *SanCovLowestStack;
+ Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
+ *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
+ Module *CurModule;
+ std::string CurModuleUniqueId;
+ Triple TargetTriple;
+ LLVMContext *C;
+ const DataLayout *DL;
+
+ GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
+ GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
+ GlobalVariable *FunctionBoolArray; // for inline-bool-flag.
+ GlobalVariable *FunctionPCsArray; // for pc-table.
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
+ SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
+
+ SanitizerCoverageOptions Options;
+
+ const SpecialCaseList *Allowlist;
+ const SpecialCaseList *Blocklist;
+};
+
+class ModuleSanitizerCoverageLegacyPass : public ModulePass {
+public:
+ ModuleSanitizerCoverageLegacyPass(
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const std::vector<std::string> &AllowlistFiles =
+ std::vector<std::string>(),
+ const std::vector<std::string> &BlocklistFiles =
+ std::vector<std::string>())
+ : ModulePass(ID), Options(Options) {
+ if (AllowlistFiles.size() > 0)
+ Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
+ *vfs::getRealFileSystem());
+ if (BlocklistFiles.size() > 0)
+ Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
+ *vfs::getRealFileSystem());
+ initializeModuleSanitizerCoverageLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override {
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
+ auto DTCallback = [this](Function &F) -> const DominatorTree * {
+ return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+ };
+ auto PDTCallback = [this](Function &F) -> const PostDominatorTree * {
+ return &this->getAnalysis<PostDominatorTreeWrapperPass>(F)
+ .getPostDomTree();
+ };
+ return ModuleSancov.instrumentModule(M, DTCallback, PDTCallback);
+ }
+
+ static char ID; // Pass identification, replacement for typeid
+ StringRef getPassName() const override { return "ModuleSanitizerCoverage"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ }
+
+private:
+ SanitizerCoverageOptions Options;
+
+ std::unique_ptr<SpecialCaseList> Allowlist;
+ std::unique_ptr<SpecialCaseList> Blocklist;
+};
+
+} // namespace
+
+PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
+ return &FAM.getResult<DominatorTreeAnalysis>(F);
+ };
+ auto PDTCallback = [&FAM](Function &F) -> const PostDominatorTree * {
+ return &FAM.getResult<PostDominatorTreeAnalysis>(F);
+ };
+ if (ModuleSancov.instrumentModule(M, DTCallback, PDTCallback))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+std::pair<Value *, Value *>
+ModuleSanitizerCoverage::CreateSecStartEnd(Module &M, const char *Section,
+ Type *Ty) {
GlobalVariable *SecStart = new GlobalVariable(
M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage,
nullptr, getSectionStart(Section));
- SecStart->setVisibility(GlobalValue::HiddenVisibility);
+ SecStart->setVisibility(GlobalValue::HiddenVisibility);
GlobalVariable *SecEnd = new GlobalVariable(
M, Ty->getPointerElementType(), false, GlobalVariable::ExternalLinkage,
nullptr, getSectionEnd(Section));
- SecEnd->setVisibility(GlobalValue::HiddenVisibility);
- IRBuilder<> IRB(M.getContext());
- if (!TargetTriple.isOSBinFormatCOFF())
+ SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+ IRBuilder<> IRB(M.getContext());
+ if (!TargetTriple.isOSBinFormatCOFF())
return std::make_pair(SecStart, SecEnd);
-
- // Account for the fact that on windows-msvc __start_* symbols actually
- // point to a uint64_t before the start of the array.
- auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
- auto GEP = IRB.CreateGEP(Int8Ty, SecStartI8Ptr,
- ConstantInt::get(IntptrTy, sizeof(uint64_t)));
+
+ // Account for the fact that on windows-msvc __start_* symbols actually
+ // point to a uint64_t before the start of the array.
+ auto SecStartI8Ptr = IRB.CreatePointerCast(SecStart, Int8PtrTy);
+ auto GEP = IRB.CreateGEP(Int8Ty, SecStartI8Ptr,
+ ConstantInt::get(IntptrTy, sizeof(uint64_t)));
return std::make_pair(IRB.CreatePointerCast(GEP, Ty), SecEnd);
-}
-
-Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
- Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
- const char *Section) {
- auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
- auto SecStart = SecStartEnd.first;
- auto SecEnd = SecStartEnd.second;
- Function *CtorFunc;
- std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, CtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
- assert(CtorFunc->getName() == CtorName);
-
- if (TargetTriple.supportsCOMDAT()) {
- // Use comdat to dedup CtorFunc.
- CtorFunc->setComdat(M.getOrInsertComdat(CtorName));
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
- } else {
- appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
- }
-
- if (TargetTriple.isOSBinFormatCOFF()) {
- // In COFF files, if the constructors are set as COMDAT (they are because
- // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
- // functions and data) is used, the constructors get stripped. To prevent
- // this, give the constructors weak ODR linkage and ensure the linker knows
- // to include the sancov constructor. This way the linker can deduplicate
- // the constructors but always leave one copy.
- CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
- appendToUsed(M, CtorFunc);
- }
- return CtorFunc;
-}
-
-bool ModuleSanitizerCoverage::instrumentModule(
- Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
- if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
- return false;
- if (Allowlist &&
- !Allowlist->inSection("coverage", "src", M.getSourceFileName()))
- return false;
- if (Blocklist &&
- Blocklist->inSection("coverage", "src", M.getSourceFileName()))
- return false;
- C = &(M.getContext());
- DL = &M.getDataLayout();
- CurModule = &M;
- CurModuleUniqueId = getUniqueModuleId(CurModule);
- TargetTriple = Triple(M.getTargetTriple());
- FunctionGuardArray = nullptr;
- Function8bitCounterArray = nullptr;
- FunctionBoolArray = nullptr;
- FunctionPCsArray = nullptr;
- IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
- IntptrPtrTy = PointerType::getUnqual(IntptrTy);
- Type *VoidTy = Type::getVoidTy(*C);
- IRBuilder<> IRB(*C);
- Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
- Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
- Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
- Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
- Int64Ty = IRB.getInt64Ty();
- Int32Ty = IRB.getInt32Ty();
- Int16Ty = IRB.getInt16Ty();
- Int8Ty = IRB.getInt8Ty();
- Int1Ty = IRB.getInt1Ty();
-
- SanCovTracePCIndir =
- M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
+}
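+// On ELF the two globals bind to the linker-synthesized __start_<sec> and
+// __stop_<sec> symbols for the corresponding section (see getSectionStart and
+// getSectionEnd below); on Mach-O the section$start/section$end symbols are
+// used instead. Hidden visibility keeps each linked image resolving its own
+// section bounds rather than another DSO's.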
+
+Function *ModuleSanitizerCoverage::CreateInitCallsForSections(
+ Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty,
+ const char *Section) {
+ auto SecStartEnd = CreateSecStartEnd(M, Section, Ty);
+ auto SecStart = SecStartEnd.first;
+ auto SecEnd = SecStartEnd.second;
+ Function *CtorFunc;
+ std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
+ M, CtorName, InitFunctionName, {Ty, Ty}, {SecStart, SecEnd});
+ assert(CtorFunc->getName() == CtorName);
+
+ if (TargetTriple.supportsCOMDAT()) {
+ // Use comdat to dedup CtorFunc.
+ CtorFunc->setComdat(M.getOrInsertComdat(CtorName));
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+ } else {
+ appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+ }
+
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ // In COFF files, if the constructors are set as COMDAT (they are because
+ // COFF supports COMDAT) and the linker flag /OPT:REF (strip unreferenced
+ // functions and data) is used, the constructors get stripped. To prevent
+ // this, give the constructors weak ODR linkage and ensure the linker knows
+ // to include the sancov constructor. This way the linker can deduplicate
+ // the constructors but always leave one copy.
+ CtorFunc->setLinkage(GlobalValue::WeakODRLinkage);
+ appendToUsed(M, CtorFunc);
+ }
+ return CtorFunc;
+}
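+// The generated constructor body is a single call
+//   InitFunctionName(SecStart, SecEnd)
+// (e.g. __sanitizer_cov_trace_pc_guard_init for the trace-pc-guard flavour),
+// registered via llvm.global_ctors and deduplicated through COMDAT where the
+// object format supports it.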
+
+bool ModuleSanitizerCoverage::instrumentModule(
+ Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+ if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
+ return false;
+ if (Allowlist &&
+ !Allowlist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
+ if (Blocklist &&
+ Blocklist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
+ C = &(M.getContext());
+ DL = &M.getDataLayout();
+ CurModule = &M;
+ CurModuleUniqueId = getUniqueModuleId(CurModule);
+ TargetTriple = Triple(M.getTargetTriple());
+ FunctionGuardArray = nullptr;
+ Function8bitCounterArray = nullptr;
+ FunctionBoolArray = nullptr;
+ FunctionPCsArray = nullptr;
+ IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
+ IntptrPtrTy = PointerType::getUnqual(IntptrTy);
+ Type *VoidTy = Type::getVoidTy(*C);
+ IRBuilder<> IRB(*C);
+ Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
+ Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
+ Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+ Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
+ Int64Ty = IRB.getInt64Ty();
+ Int32Ty = IRB.getInt32Ty();
+ Int16Ty = IRB.getInt16Ty();
+ Int8Ty = IRB.getInt8Ty();
+ Int1Ty = IRB.getInt1Ty();
+
+ SanCovTracePCIndir =
+ M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
// Make sure smaller parameters are zero-extended to i64 if required by the
// target ABI.
- AttributeList SanCovTraceCmpZeroExtAL;
+ AttributeList SanCovTraceCmpZeroExtAL;
SanCovTraceCmpZeroExtAL =
SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 0, Attribute::ZExt);
SanCovTraceCmpZeroExtAL =
SanCovTraceCmpZeroExtAL.addParamAttribute(*C, 1, Attribute::ZExt);
-
- SanCovTraceCmpFunction[0] =
- M.getOrInsertFunction(SanCovTraceCmp1, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt8Ty(), IRB.getInt8Ty());
- SanCovTraceCmpFunction[1] =
- M.getOrInsertFunction(SanCovTraceCmp2, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt16Ty(), IRB.getInt16Ty());
- SanCovTraceCmpFunction[2] =
- M.getOrInsertFunction(SanCovTraceCmp4, SanCovTraceCmpZeroExtAL, VoidTy,
- IRB.getInt32Ty(), IRB.getInt32Ty());
- SanCovTraceCmpFunction[3] =
- M.getOrInsertFunction(SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty);
-
- SanCovTraceConstCmpFunction[0] = M.getOrInsertFunction(
- SanCovTraceConstCmp1, SanCovTraceCmpZeroExtAL, VoidTy, Int8Ty, Int8Ty);
- SanCovTraceConstCmpFunction[1] = M.getOrInsertFunction(
- SanCovTraceConstCmp2, SanCovTraceCmpZeroExtAL, VoidTy, Int16Ty, Int16Ty);
- SanCovTraceConstCmpFunction[2] = M.getOrInsertFunction(
- SanCovTraceConstCmp4, SanCovTraceCmpZeroExtAL, VoidTy, Int32Ty, Int32Ty);
- SanCovTraceConstCmpFunction[3] =
- M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty);
-
- {
- AttributeList AL;
+
+ SanCovTraceCmpFunction[0] =
+ M.getOrInsertFunction(SanCovTraceCmp1, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt8Ty(), IRB.getInt8Ty());
+ SanCovTraceCmpFunction[1] =
+ M.getOrInsertFunction(SanCovTraceCmp2, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt16Ty(), IRB.getInt16Ty());
+ SanCovTraceCmpFunction[2] =
+ M.getOrInsertFunction(SanCovTraceCmp4, SanCovTraceCmpZeroExtAL, VoidTy,
+ IRB.getInt32Ty(), IRB.getInt32Ty());
+ SanCovTraceCmpFunction[3] =
+ M.getOrInsertFunction(SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty);
+
+ SanCovTraceConstCmpFunction[0] = M.getOrInsertFunction(
+ SanCovTraceConstCmp1, SanCovTraceCmpZeroExtAL, VoidTy, Int8Ty, Int8Ty);
+ SanCovTraceConstCmpFunction[1] = M.getOrInsertFunction(
+ SanCovTraceConstCmp2, SanCovTraceCmpZeroExtAL, VoidTy, Int16Ty, Int16Ty);
+ SanCovTraceConstCmpFunction[2] = M.getOrInsertFunction(
+ SanCovTraceConstCmp4, SanCovTraceCmpZeroExtAL, VoidTy, Int32Ty, Int32Ty);
+ SanCovTraceConstCmpFunction[3] =
+ M.getOrInsertFunction(SanCovTraceConstCmp8, VoidTy, Int64Ty, Int64Ty);
+
+ {
+ AttributeList AL;
AL = AL.addParamAttribute(*C, 0, Attribute::ZExt);
- SanCovTraceDivFunction[0] =
- M.getOrInsertFunction(SanCovTraceDiv4, AL, VoidTy, IRB.getInt32Ty());
- }
- SanCovTraceDivFunction[1] =
- M.getOrInsertFunction(SanCovTraceDiv8, VoidTy, Int64Ty);
- SanCovTraceGepFunction =
- M.getOrInsertFunction(SanCovTraceGep, VoidTy, IntptrTy);
- SanCovTraceSwitchFunction =
- M.getOrInsertFunction(SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy);
-
- Constant *SanCovLowestStackConstant =
- M.getOrInsertGlobal(SanCovLowestStackName, IntptrTy);
- SanCovLowestStack = dyn_cast<GlobalVariable>(SanCovLowestStackConstant);
- if (!SanCovLowestStack) {
- C->emitError(StringRef("'") + SanCovLowestStackName +
- "' should not be declared by the user");
- return true;
- }
- SanCovLowestStack->setThreadLocalMode(
- GlobalValue::ThreadLocalMode::InitialExecTLSModel);
- if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
- SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));
-
- SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
- SanCovTracePCGuard =
- M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy);
-
- for (auto &F : M)
- instrumentFunction(F, DTCallback, PDTCallback);
-
- Function *Ctor = nullptr;
-
- if (FunctionGuardArray)
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtorTracePcGuardName,
- SanCovTracePCGuardInitName, Int32PtrTy,
- SanCovGuardsSectionName);
- if (Function8bitCounterArray)
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName,
- SanCov8bitCountersInitName, Int8PtrTy,
- SanCovCountersSectionName);
- if (FunctionBoolArray) {
- Ctor = CreateInitCallsForSections(M, SanCovModuleCtorBoolFlagName,
- SanCovBoolFlagInitName, Int1PtrTy,
- SanCovBoolFlagSectionName);
- }
- if (Ctor && Options.PCTable) {
- auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy);
- FunctionCallee InitFunction = declareSanitizerInitFunction(
- M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
- IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
- IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
- }
- // We don't reference these arrays directly in any of our runtime functions,
- // so we need to prevent them from being dead stripped.
- if (TargetTriple.isOSBinFormatMachO())
- appendToUsed(M, GlobalsToAppendToUsed);
- appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
- return true;
-}
-
-// True if block has successors and it dominates all of them.
-static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
+ SanCovTraceDivFunction[0] =
+ M.getOrInsertFunction(SanCovTraceDiv4, AL, VoidTy, IRB.getInt32Ty());
+ }
+ SanCovTraceDivFunction[1] =
+ M.getOrInsertFunction(SanCovTraceDiv8, VoidTy, Int64Ty);
+ SanCovTraceGepFunction =
+ M.getOrInsertFunction(SanCovTraceGep, VoidTy, IntptrTy);
+ SanCovTraceSwitchFunction =
+ M.getOrInsertFunction(SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy);
+
+ Constant *SanCovLowestStackConstant =
+ M.getOrInsertGlobal(SanCovLowestStackName, IntptrTy);
+ SanCovLowestStack = dyn_cast<GlobalVariable>(SanCovLowestStackConstant);
+ if (!SanCovLowestStack) {
+ C->emitError(StringRef("'") + SanCovLowestStackName +
+ "' should not be declared by the user");
+ return true;
+ }
+ SanCovLowestStack->setThreadLocalMode(
+ GlobalValue::ThreadLocalMode::InitialExecTLSModel);
+ if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
+ SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));
+
+ SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
+ SanCovTracePCGuard =
+ M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy);
+
+ for (auto &F : M)
+ instrumentFunction(F, DTCallback, PDTCallback);
+
+ Function *Ctor = nullptr;
+
+ if (FunctionGuardArray)
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtorTracePcGuardName,
+ SanCovTracePCGuardInitName, Int32PtrTy,
+ SanCovGuardsSectionName);
+ if (Function8bitCounterArray)
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName,
+ SanCov8bitCountersInitName, Int8PtrTy,
+ SanCovCountersSectionName);
+ if (FunctionBoolArray) {
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtorBoolFlagName,
+ SanCovBoolFlagInitName, Int1PtrTy,
+ SanCovBoolFlagSectionName);
+ }
+ if (Ctor && Options.PCTable) {
+ auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy);
+ FunctionCallee InitFunction = declareSanitizerInitFunction(
+ M, SanCovPCsInitName, {IntptrPtrTy, IntptrPtrTy});
+ IRBuilder<> IRBCtor(Ctor->getEntryBlock().getTerminator());
+ IRBCtor.CreateCall(InitFunction, {SecStartEnd.first, SecStartEnd.second});
+ }
+ // We don't reference these arrays directly in any of our runtime functions,
+ // so we need to prevent them from being dead stripped.
+ if (TargetTriple.isOSBinFormatMachO())
+ appendToUsed(M, GlobalsToAppendToUsed);
+ appendToCompilerUsed(M, GlobalsToAppendToCompilerUsed);
+ return true;
+}
+
+// True if block has successors and it dominates all of them.
+static bool isFullDominator(const BasicBlock *BB, const DominatorTree *DT) {
if (succ_empty(BB))
- return false;
-
+ return false;
+
return llvm::all_of(successors(BB), [&](const BasicBlock *SUCC) {
return DT->dominates(BB, SUCC);
});
-}
-
-// True if block has predecessors and it postdominates all of them.
-static bool isFullPostDominator(const BasicBlock *BB,
- const PostDominatorTree *PDT) {
+}
+
+// True if block has predecessors and it postdominates all of them.
+static bool isFullPostDominator(const BasicBlock *BB,
+ const PostDominatorTree *PDT) {
if (pred_empty(BB))
- return false;
-
+ return false;
+
return llvm::all_of(predecessors(BB), [&](const BasicBlock *PRED) {
return PDT->dominates(BB, PRED);
});
-}
-
-static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
- const DominatorTree *DT,
- const PostDominatorTree *PDT,
- const SanitizerCoverageOptions &Options) {
- // Don't insert coverage for blocks containing nothing but unreachable: we
- // will never call __sanitizer_cov() for them, so counting them in
- // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
- // percentage. Also, unreachable instructions frequently have no debug
- // locations.
- if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbgOrLifetime()))
- return false;
-
- // Don't insert coverage into blocks without a valid insertion point
- // (catchswitch blocks).
- if (BB->getFirstInsertionPt() == BB->end())
- return false;
-
- if (Options.NoPrune || &F.getEntryBlock() == BB)
- return true;
-
- if (Options.CoverageType == SanitizerCoverageOptions::SCK_Function &&
- &F.getEntryBlock() != BB)
- return false;
-
- // Do not instrument full dominators, or full post-dominators with multiple
- // predecessors.
- return !isFullDominator(BB, DT)
- && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
-}
-
-// Returns true iff From->To is a backedge.
-// A twist here is that we treat From->To as a backedge if
-// * To dominates From or
-// * To->UniqueSuccessor dominates From
-static bool IsBackEdge(BasicBlock *From, BasicBlock *To,
- const DominatorTree *DT) {
- if (DT->dominates(To, From))
- return true;
- if (auto Next = To->getUniqueSuccessor())
- if (DT->dominates(Next, From))
- return true;
- return false;
-}
-
-// Prunes uninteresting Cmp instrumentation:
-// * CMP instructions that feed into loop backedge branch.
-//
-// Note that Cmp pruning is controlled by the same flag as the
-// BB pruning.
-static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT,
- const SanitizerCoverageOptions &Options) {
- if (!Options.NoPrune)
- if (CMP->hasOneUse())
- if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
- for (BasicBlock *B : BR->successors())
- if (IsBackEdge(BR->getParent(), B, DT))
- return false;
- return true;
-}
-
-void ModuleSanitizerCoverage::instrumentFunction(
- Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
- if (F.empty())
- return;
- if (F.getName().find(".module_ctor") != std::string::npos)
- return; // Should not instrument sanitizer init functions.
- if (F.getName().startswith("__sanitizer_"))
- return; // Don't instrument __sanitizer_* callbacks.
- // Don't touch available_externally functions; their actual body is elsewhere.
- if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
- return;
- // Don't instrument MSVC CRT configuration helpers. They may run before normal
- // initialization.
- if (F.getName() == "__local_stdio_printf_options" ||
- F.getName() == "__local_stdio_scanf_options")
- return;
- if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
- return;
- // Don't instrument functions using SEH for now. Splitting basic blocks like
- // we do for coverage breaks WinEHPrepare.
- // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
- if (F.hasPersonalityFn() &&
- isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- return;
- if (Allowlist && !Allowlist->inSection("coverage", "fun", F.getName()))
- return;
- if (Blocklist && Blocklist->inSection("coverage", "fun", F.getName()))
- return;
- if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests());
- SmallVector<Instruction *, 8> IndirCalls;
- SmallVector<BasicBlock *, 16> BlocksToInstrument;
- SmallVector<Instruction *, 8> CmpTraceTargets;
- SmallVector<Instruction *, 8> SwitchTraceTargets;
- SmallVector<BinaryOperator *, 8> DivTraceTargets;
- SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
-
- const DominatorTree *DT = DTCallback(F);
- const PostDominatorTree *PDT = PDTCallback(F);
- bool IsLeafFunc = true;
-
- for (auto &BB : F) {
- if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
- BlocksToInstrument.push_back(&BB);
- for (auto &Inst : BB) {
- if (Options.IndirectCalls) {
- CallBase *CB = dyn_cast<CallBase>(&Inst);
- if (CB && !CB->getCalledFunction())
- IndirCalls.push_back(&Inst);
- }
- if (Options.TraceCmp) {
- if (ICmpInst *CMP = dyn_cast<ICmpInst>(&Inst))
- if (IsInterestingCmp(CMP, DT, Options))
- CmpTraceTargets.push_back(&Inst);
- if (isa<SwitchInst>(&Inst))
- SwitchTraceTargets.push_back(&Inst);
- }
- if (Options.TraceDiv)
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
- if (BO->getOpcode() == Instruction::SDiv ||
- BO->getOpcode() == Instruction::UDiv)
- DivTraceTargets.push_back(BO);
- if (Options.TraceGep)
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
- GepTraceTargets.push_back(GEP);
- if (Options.StackDepth)
- if (isa<InvokeInst>(Inst) ||
- (isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst)))
- IsLeafFunc = false;
- }
- }
-
- InjectCoverage(F, BlocksToInstrument, IsLeafFunc);
- InjectCoverageForIndirectCalls(F, IndirCalls);
- InjectTraceForCmp(F, CmpTraceTargets);
- InjectTraceForSwitch(F, SwitchTraceTargets);
- InjectTraceForDiv(F, DivTraceTargets);
- InjectTraceForGep(F, GepTraceTargets);
-}
-
-GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
- size_t NumElements, Function &F, Type *Ty, const char *Section) {
- ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
- auto Array = new GlobalVariable(
- *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
- Constant::getNullValue(ArrayTy), "__sancov_gen_");
-
- if (TargetTriple.supportsCOMDAT() && !F.isInterposable())
- if (auto Comdat =
- GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
- Array->setComdat(Comdat);
- Array->setSection(getSectionName(Section));
- Array->setAlignment(Align(DL->getTypeStoreSize(Ty).getFixedSize()));
- GlobalsToAppendToUsed.push_back(Array);
- GlobalsToAppendToCompilerUsed.push_back(Array);
- MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
- Array->addMetadata(LLVMContext::MD_associated, *MD);
-
- return Array;
-}
-
-GlobalVariable *
-ModuleSanitizerCoverage::CreatePCArray(Function &F,
- ArrayRef<BasicBlock *> AllBlocks) {
- size_t N = AllBlocks.size();
- assert(N);
- SmallVector<Constant *, 32> PCs;
- IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt());
- for (size_t i = 0; i < N; i++) {
- if (&F.getEntryBlock() == AllBlocks[i]) {
- PCs.push_back((Constant *)IRB.CreatePointerCast(&F, IntptrPtrTy));
- PCs.push_back((Constant *)IRB.CreateIntToPtr(
- ConstantInt::get(IntptrTy, 1), IntptrPtrTy));
- } else {
- PCs.push_back((Constant *)IRB.CreatePointerCast(
- BlockAddress::get(AllBlocks[i]), IntptrPtrTy));
- PCs.push_back((Constant *)IRB.CreateIntToPtr(
- ConstantInt::get(IntptrTy, 0), IntptrPtrTy));
- }
- }
- auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, IntptrPtrTy,
- SanCovPCsSectionName);
- PCArray->setInitializer(
- ConstantArray::get(ArrayType::get(IntptrPtrTy, N * 2), PCs));
- PCArray->setConstant(true);
-
- return PCArray;
-}
-
-void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
- Function &F, ArrayRef<BasicBlock *> AllBlocks) {
- if (Options.TracePCGuard)
- FunctionGuardArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
-
- if (Options.Inline8bitCounters)
- Function8bitCounterArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
- if (Options.InlineBoolFlag)
- FunctionBoolArray = CreateFunctionLocalArrayInSection(
- AllBlocks.size(), F, Int1Ty, SanCovBoolFlagSectionName);
-
- if (Options.PCTable)
- FunctionPCsArray = CreatePCArray(F, AllBlocks);
-}
-
-bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
- ArrayRef<BasicBlock *> AllBlocks,
- bool IsLeafFunc) {
- if (AllBlocks.empty()) return false;
- CreateFunctionLocalArrays(F, AllBlocks);
- for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
- InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc);
- return true;
-}
-
-// On every indirect call we insert a call to the run-time function
-// __sanitizer_cov_trace_pc_indir, passing the callee address as its only
-// explicit parameter. The address of the caller is passed implicitly via the
-// caller's PC.
-void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
- Function &F, ArrayRef<Instruction *> IndirCalls) {
- if (IndirCalls.empty())
- return;
- assert(Options.TracePC || Options.TracePCGuard ||
- Options.Inline8bitCounters || Options.InlineBoolFlag);
- for (auto I : IndirCalls) {
- IRBuilder<> IRB(I);
- CallBase &CB = cast<CallBase>(*I);
- Value *Callee = CB.getCalledOperand();
- if (isa<InlineAsm>(Callee))
- continue;
- IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
- }
-}
-
-// For every switch statement we insert a call:
-// __sanitizer_cov_trace_switch(CondValue,
-// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
-
-void ModuleSanitizerCoverage::InjectTraceForSwitch(
- Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
- for (auto I : SwitchTraceTargets) {
- if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
- IRBuilder<> IRB(I);
- SmallVector<Constant *, 16> Initializers;
- Value *Cond = SI->getCondition();
- if (Cond->getType()->getScalarSizeInBits() >
- Int64Ty->getScalarSizeInBits())
- continue;
- Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases()));
- Initializers.push_back(
- ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits()));
- if (Cond->getType()->getScalarSizeInBits() <
- Int64Ty->getScalarSizeInBits())
- Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
- for (auto It : SI->cases()) {
- Constant *C = It.getCaseValue();
- if (C->getType()->getScalarSizeInBits() <
- Int64Ty->getScalarSizeInBits())
- C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
- Initializers.push_back(C);
- }
+}
+
+static bool shouldInstrumentBlock(const Function &F, const BasicBlock *BB,
+ const DominatorTree *DT,
+ const PostDominatorTree *PDT,
+ const SanitizerCoverageOptions &Options) {
+ // Don't insert coverage for blocks containing nothing but unreachable: we
+ // will never call __sanitizer_cov() for them, so counting them in
+ // NumberOfInstrumentedBlocks() might complicate calculation of code coverage
+ // percentage. Also, unreachable instructions frequently have no debug
+ // locations.
+ if (isa<UnreachableInst>(BB->getFirstNonPHIOrDbgOrLifetime()))
+ return false;
+
+ // Don't insert coverage into blocks without a valid insertion point
+ // (catchswitch blocks).
+ if (BB->getFirstInsertionPt() == BB->end())
+ return false;
+
+ if (Options.NoPrune || &F.getEntryBlock() == BB)
+ return true;
+
+ if (Options.CoverageType == SanitizerCoverageOptions::SCK_Function &&
+ &F.getEntryBlock() != BB)
+ return false;
+
+ // Do not instrument full dominators, or full post-dominators with multiple
+ // predecessors.
+ return !isFullDominator(BB, DT)
+ && !(isFullPostDominator(BB, PDT) && !BB->getSinglePredecessor());
+}
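+// Example: in a simple if/else diamond, the entry block is kept (it is always
+// instrumented), the two branch arms are instrumented, and the join block is
+// pruned because it post-dominates both of its predecessors; covering the
+// arms is enough to reconstruct whether the join executed.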
+
+// Returns true iff From->To is a backedge.
+// A twist here is that we treat From->To as a backedge if
+// * To dominates From or
+// * To->UniqueSuccessor dominates From
+static bool IsBackEdge(BasicBlock *From, BasicBlock *To,
+ const DominatorTree *DT) {
+ if (DT->dominates(To, From))
+ return true;
+ if (auto Next = To->getUniqueSuccessor())
+ if (DT->dominates(Next, From))
+ return true;
+ return false;
+}
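+// Example: for a loop whose latch branches back to the header, latch->header
+// is a backedge because the header dominates the latch; the UniqueSuccessor
+// clause additionally treats an edge into a block that merely falls through
+// into such a dominating header (e.g. a split critical edge) as a backedge.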
+
+// Prunes uninteresting Cmp instrumentation:
+// * CMP instructions that feed into loop backedge branch.
+//
+// Note that Cmp pruning is controlled by the same flag as the
+// BB pruning.
+static bool IsInterestingCmp(ICmpInst *CMP, const DominatorTree *DT,
+ const SanitizerCoverageOptions &Options) {
+ if (!Options.NoPrune)
+ if (CMP->hasOneUse())
+ if (auto BR = dyn_cast<BranchInst>(CMP->user_back()))
+ for (BasicBlock *B : BR->successors())
+ if (IsBackEdge(BR->getParent(), B, DT))
+ return false;
+ return true;
+}
+
+void ModuleSanitizerCoverage::instrumentFunction(
+ Function &F, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
+ if (F.empty())
+ return;
+ if (F.getName().find(".module_ctor") != std::string::npos)
+ return; // Should not instrument sanitizer init functions.
+ if (F.getName().startswith("__sanitizer_"))
+ return; // Don't instrument __sanitizer_* callbacks.
+ // Don't touch available_externally functions; their actual body is elsewhere.
+ if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage)
+ return;
+ // Don't instrument MSVC CRT configuration helpers. They may run before normal
+ // initialization.
+ if (F.getName() == "__local_stdio_printf_options" ||
+ F.getName() == "__local_stdio_scanf_options")
+ return;
+ if (isa<UnreachableInst>(F.getEntryBlock().getTerminator()))
+ return;
+ // Don't instrument functions using SEH for now. Splitting basic blocks like
+ // we do for coverage breaks WinEHPrepare.
+ // FIXME: Remove this when SEH no longer uses landingpad pattern matching.
+ if (F.hasPersonalityFn() &&
+ isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ return;
+ if (Allowlist && !Allowlist->inSection("coverage", "fun", F.getName()))
+ return;
+ if (Blocklist && Blocklist->inSection("coverage", "fun", F.getName()))
+ return;
+ if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
+ SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests());
+ SmallVector<Instruction *, 8> IndirCalls;
+ SmallVector<BasicBlock *, 16> BlocksToInstrument;
+ SmallVector<Instruction *, 8> CmpTraceTargets;
+ SmallVector<Instruction *, 8> SwitchTraceTargets;
+ SmallVector<BinaryOperator *, 8> DivTraceTargets;
+ SmallVector<GetElementPtrInst *, 8> GepTraceTargets;
+
+ const DominatorTree *DT = DTCallback(F);
+ const PostDominatorTree *PDT = PDTCallback(F);
+ bool IsLeafFunc = true;
+
+ for (auto &BB : F) {
+ if (shouldInstrumentBlock(F, &BB, DT, PDT, Options))
+ BlocksToInstrument.push_back(&BB);
+ for (auto &Inst : BB) {
+ if (Options.IndirectCalls) {
+ CallBase *CB = dyn_cast<CallBase>(&Inst);
+ if (CB && !CB->getCalledFunction())
+ IndirCalls.push_back(&Inst);
+ }
+ if (Options.TraceCmp) {
+ if (ICmpInst *CMP = dyn_cast<ICmpInst>(&Inst))
+ if (IsInterestingCmp(CMP, DT, Options))
+ CmpTraceTargets.push_back(&Inst);
+ if (isa<SwitchInst>(&Inst))
+ SwitchTraceTargets.push_back(&Inst);
+ }
+ if (Options.TraceDiv)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&Inst))
+ if (BO->getOpcode() == Instruction::SDiv ||
+ BO->getOpcode() == Instruction::UDiv)
+ DivTraceTargets.push_back(BO);
+ if (Options.TraceGep)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Inst))
+ GepTraceTargets.push_back(GEP);
+ if (Options.StackDepth)
+ if (isa<InvokeInst>(Inst) ||
+ (isa<CallInst>(Inst) && !isa<IntrinsicInst>(Inst)))
+ IsLeafFunc = false;
+ }
+ }
+
+ InjectCoverage(F, BlocksToInstrument, IsLeafFunc);
+ InjectCoverageForIndirectCalls(F, IndirCalls);
+ InjectTraceForCmp(F, CmpTraceTargets);
+ InjectTraceForSwitch(F, SwitchTraceTargets);
+ InjectTraceForDiv(F, DivTraceTargets);
+ InjectTraceForGep(F, GepTraceTargets);
+}
+
+GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
+ size_t NumElements, Function &F, Type *Ty, const char *Section) {
+ ArrayType *ArrayTy = ArrayType::get(Ty, NumElements);
+ auto Array = new GlobalVariable(
+ *CurModule, ArrayTy, false, GlobalVariable::PrivateLinkage,
+ Constant::getNullValue(ArrayTy), "__sancov_gen_");
+
+ if (TargetTriple.supportsCOMDAT() && !F.isInterposable())
+ if (auto Comdat =
+ GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
+ Array->setComdat(Comdat);
+ Array->setSection(getSectionName(Section));
+ Array->setAlignment(Align(DL->getTypeStoreSize(Ty).getFixedSize()));
+ GlobalsToAppendToUsed.push_back(Array);
+ GlobalsToAppendToCompilerUsed.push_back(Array);
+ MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
+ Array->addMetadata(LLVMContext::MD_associated, *MD);
+
+ return Array;
+}
+
+GlobalVariable *
+ModuleSanitizerCoverage::CreatePCArray(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks) {
+ size_t N = AllBlocks.size();
+ assert(N);
+ SmallVector<Constant *, 32> PCs;
+ IRBuilder<> IRB(&*F.getEntryBlock().getFirstInsertionPt());
+ for (size_t i = 0; i < N; i++) {
+ if (&F.getEntryBlock() == AllBlocks[i]) {
+ PCs.push_back((Constant *)IRB.CreatePointerCast(&F, IntptrPtrTy));
+ PCs.push_back((Constant *)IRB.CreateIntToPtr(
+ ConstantInt::get(IntptrTy, 1), IntptrPtrTy));
+ } else {
+ PCs.push_back((Constant *)IRB.CreatePointerCast(
+ BlockAddress::get(AllBlocks[i]), IntptrPtrTy));
+ PCs.push_back((Constant *)IRB.CreateIntToPtr(
+ ConstantInt::get(IntptrTy, 0), IntptrPtrTy));
+ }
+ }
+ auto *PCArray = CreateFunctionLocalArrayInSection(N * 2, F, IntptrPtrTy,
+ SanCovPCsSectionName);
+ PCArray->setInitializer(
+ ConstantArray::get(ArrayType::get(IntptrPtrTy, N * 2), PCs));
+ PCArray->setConstant(true);
+
+ return PCArray;
+}
+
+void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
+ Function &F, ArrayRef<BasicBlock *> AllBlocks) {
+ if (Options.TracePCGuard)
+ FunctionGuardArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int32Ty, SanCovGuardsSectionName);
+
+ if (Options.Inline8bitCounters)
+ Function8bitCounterArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
+ if (Options.InlineBoolFlag)
+ FunctionBoolArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int1Ty, SanCovBoolFlagSectionName);
+
+ if (Options.PCTable)
+ FunctionPCsArray = CreatePCArray(F, AllBlocks);
+}
+
+bool ModuleSanitizerCoverage::InjectCoverage(Function &F,
+ ArrayRef<BasicBlock *> AllBlocks,
+ bool IsLeafFunc) {
+ if (AllBlocks.empty()) return false;
+ CreateFunctionLocalArrays(F, AllBlocks);
+ for (size_t i = 0, N = AllBlocks.size(); i < N; i++)
+ InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc);
+ return true;
+}
+
+// On every indirect call we insert a call to the run-time function
+// __sanitizer_cov_trace_pc_indir, passing the callee address as its only
+// explicit parameter. The address of the caller is passed implicitly via the
+// caller's PC.
+void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
+ Function &F, ArrayRef<Instruction *> IndirCalls) {
+ if (IndirCalls.empty())
+ return;
+ assert(Options.TracePC || Options.TracePCGuard ||
+ Options.Inline8bitCounters || Options.InlineBoolFlag);
+ for (auto I : IndirCalls) {
+ IRBuilder<> IRB(I);
+ CallBase &CB = cast<CallBase>(*I);
+ Value *Callee = CB.getCalledOperand();
+ if (isa<InlineAsm>(Callee))
+ continue;
+ IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
+ }
+}
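+// The injected code for an indirect call through %fp is roughly:
+//   %callee = ptrtoint i8* %fp to iN        ; N = pointer width of the target
+//   call void @__sanitizer_cov_trace_pc_indir(iN %callee)
+// inserted immediately before the original call; inline-asm callees are
+// skipped.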
+
+// For every switch statement we insert a call:
+// __sanitizer_cov_trace_switch(CondValue,
+// {NumCases, ValueSizeInBits, Case0Value, Case1Value, Case2Value, ... })
+
+void ModuleSanitizerCoverage::InjectTraceForSwitch(
+ Function &, ArrayRef<Instruction *> SwitchTraceTargets) {
+ for (auto I : SwitchTraceTargets) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ IRBuilder<> IRB(I);
+ SmallVector<Constant *, 16> Initializers;
+ Value *Cond = SI->getCondition();
+ if (Cond->getType()->getScalarSizeInBits() >
+ Int64Ty->getScalarSizeInBits())
+ continue;
+ Initializers.push_back(ConstantInt::get(Int64Ty, SI->getNumCases()));
+ Initializers.push_back(
+ ConstantInt::get(Int64Ty, Cond->getType()->getScalarSizeInBits()));
+ if (Cond->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ Cond = IRB.CreateIntCast(Cond, Int64Ty, false);
+ for (auto It : SI->cases()) {
+ Constant *C = It.getCaseValue();
+ if (C->getType()->getScalarSizeInBits() <
+ Int64Ty->getScalarSizeInBits())
+ C = ConstantExpr::getCast(CastInst::ZExt, It.getCaseValue(), Int64Ty);
+ Initializers.push_back(C);
+ }
llvm::sort(drop_begin(Initializers, 2),
- [](const Constant *A, const Constant *B) {
- return cast<ConstantInt>(A)->getLimitedValue() <
- cast<ConstantInt>(B)->getLimitedValue();
- });
- ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
- GlobalVariable *GV = new GlobalVariable(
- *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
- ConstantArray::get(ArrayOfInt64Ty, Initializers),
- "__sancov_gen_cov_switch_values");
- IRB.CreateCall(SanCovTraceSwitchFunction,
- {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)});
- }
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForDiv(
- Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
- for (auto BO : DivTraceTargets) {
- IRBuilder<> IRB(BO);
- Value *A1 = BO->getOperand(1);
- if (isa<ConstantInt>(A1)) continue;
- if (!A1->getType()->isIntegerTy())
- continue;
- uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
- int CallbackIdx = TypeSize == 32 ? 0 :
- TypeSize == 64 ? 1 : -1;
- if (CallbackIdx < 0) continue;
- auto Ty = Type::getIntNTy(*C, TypeSize);
- IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
- {IRB.CreateIntCast(A1, Ty, true)});
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForGep(
- Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
- for (auto GEP : GepTraceTargets) {
- IRBuilder<> IRB(GEP);
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
- IRB.CreateCall(SanCovTraceGepFunction,
- {IRB.CreateIntCast(*I, IntptrTy, true)});
- }
-}
-
-void ModuleSanitizerCoverage::InjectTraceForCmp(
- Function &, ArrayRef<Instruction *> CmpTraceTargets) {
- for (auto I : CmpTraceTargets) {
- if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
- IRBuilder<> IRB(ICMP);
- Value *A0 = ICMP->getOperand(0);
- Value *A1 = ICMP->getOperand(1);
- if (!A0->getType()->isIntegerTy())
- continue;
- uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
- int CallbackIdx = TypeSize == 8 ? 0 :
- TypeSize == 16 ? 1 :
- TypeSize == 32 ? 2 :
- TypeSize == 64 ? 3 : -1;
- if (CallbackIdx < 0) continue;
- // Emit __sanitizer_cov_trace_cmp{1,2,4,8}(A0, A1) (or the _const_ variant).
- auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx];
- bool FirstIsConst = isa<ConstantInt>(A0);
- bool SecondIsConst = isa<ConstantInt>(A1);
- // If both are const, then we don't need such a comparison.
- if (FirstIsConst && SecondIsConst) continue;
- // If only one is const, then make it the first callback argument.
- if (FirstIsConst || SecondIsConst) {
- CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx];
- if (SecondIsConst)
- std::swap(A0, A1);
- }
-
- auto Ty = Type::getIntNTy(*C, TypeSize);
- IRB.CreateCall(CallbackFunc, {IRB.CreateIntCast(A0, Ty, true),
- IRB.CreateIntCast(A1, Ty, true)});
- }
- }
-}
-
-void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
- size_t Idx,
- bool IsLeafFunc) {
- BasicBlock::iterator IP = BB.getFirstInsertionPt();
- bool IsEntryBB = &BB == &F.getEntryBlock();
- DebugLoc EntryLoc;
- if (IsEntryBB) {
- if (auto SP = F.getSubprogram())
+ [](const Constant *A, const Constant *B) {
+ return cast<ConstantInt>(A)->getLimitedValue() <
+ cast<ConstantInt>(B)->getLimitedValue();
+ });
+ ArrayType *ArrayOfInt64Ty = ArrayType::get(Int64Ty, Initializers.size());
+ GlobalVariable *GV = new GlobalVariable(
+ *CurModule, ArrayOfInt64Ty, false, GlobalVariable::InternalLinkage,
+ ConstantArray::get(ArrayOfInt64Ty, Initializers),
+ "__sancov_gen_cov_switch_values");
+ IRB.CreateCall(SanCovTraceSwitchFunction,
+ {Cond, IRB.CreatePointerCast(GV, Int64PtrTy)});
+ }
+ }
+}
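+// Example: for `switch i32 %x` with cases 1 and 42 this emits, roughly,
+//   @__sancov_gen_cov_switch_values = internal global [4 x i64]
+//       [i64 2, i64 32, i64 1, i64 42]   ; NumCases, bit width, sorted cases
+//   call void @__sanitizer_cov_trace_switch(i64 %x.zext, i64* <values>)
+// placed immediately before the switch; the condition is zero-extended to 64
+// bits first.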
+
+void ModuleSanitizerCoverage::InjectTraceForDiv(
+ Function &, ArrayRef<BinaryOperator *> DivTraceTargets) {
+ for (auto BO : DivTraceTargets) {
+ IRBuilder<> IRB(BO);
+ Value *A1 = BO->getOperand(1);
+ if (isa<ConstantInt>(A1)) continue;
+ if (!A1->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A1->getType());
+ int CallbackIdx = TypeSize == 32 ? 0 :
+ TypeSize == 64 ? 1 : -1;
+ if (CallbackIdx < 0) continue;
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(SanCovTraceDivFunction[CallbackIdx],
+ {IRB.CreateIntCast(A1, Ty, true)});
+ }
+}
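+// Example: `udiv i64 %a, %b` with a non-constant divisor gets, roughly,
+//   call void @__sanitizer_cov_trace_div8(i64 %b)
+// emitted just before the division; constant divisors, non-integer types, and
+// widths other than 32 and 64 bits are skipped.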
+
+void ModuleSanitizerCoverage::InjectTraceForGep(
+ Function &, ArrayRef<GetElementPtrInst *> GepTraceTargets) {
+ for (auto GEP : GepTraceTargets) {
+ IRBuilder<> IRB(GEP);
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ if (!isa<ConstantInt>(*I) && (*I)->getType()->isIntegerTy())
+ IRB.CreateCall(SanCovTraceGepFunction,
+ {IRB.CreateIntCast(*I, IntptrTy, true)});
+ }
+}
+
+void ModuleSanitizerCoverage::InjectTraceForCmp(
+ Function &, ArrayRef<Instruction *> CmpTraceTargets) {
+ for (auto I : CmpTraceTargets) {
+ if (ICmpInst *ICMP = dyn_cast<ICmpInst>(I)) {
+ IRBuilder<> IRB(ICMP);
+ Value *A0 = ICMP->getOperand(0);
+ Value *A1 = ICMP->getOperand(1);
+ if (!A0->getType()->isIntegerTy())
+ continue;
+ uint64_t TypeSize = DL->getTypeStoreSizeInBits(A0->getType());
+ int CallbackIdx = TypeSize == 8 ? 0 :
+ TypeSize == 16 ? 1 :
+ TypeSize == 32 ? 2 :
+ TypeSize == 64 ? 3 : -1;
+ if (CallbackIdx < 0) continue;
+ // Emit __sanitizer_cov_trace_cmp{1,2,4,8}(A0, A1) (or the _const_ variant).
+ auto CallbackFunc = SanCovTraceCmpFunction[CallbackIdx];
+ bool FirstIsConst = isa<ConstantInt>(A0);
+ bool SecondIsConst = isa<ConstantInt>(A1);
+ // If both are const, then we don't need such a comparison.
+ if (FirstIsConst && SecondIsConst) continue;
+ // If only one is const, then make it the first callback argument.
+ if (FirstIsConst || SecondIsConst) {
+ CallbackFunc = SanCovTraceConstCmpFunction[CallbackIdx];
+ if (SecondIsConst)
+ std::swap(A0, A1);
+ }
+
+ auto Ty = Type::getIntNTy(*C, TypeSize);
+ IRB.CreateCall(CallbackFunc, {IRB.CreateIntCast(A0, Ty, true),
+ IRB.CreateIntCast(A1, Ty, true)});
+ }
+ }
+}
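+// Example: `icmp eq i32 %x, 1234` becomes, roughly,
+//   call void @__sanitizer_cov_trace_const_cmp4(i32 1234, i32 %x)
+// with the constant operand moved into the first argument, while a compare of
+// two non-constant i64 values calls __sanitizer_cov_trace_cmp8 instead.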
+
+void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
+ size_t Idx,
+ bool IsLeafFunc) {
+ BasicBlock::iterator IP = BB.getFirstInsertionPt();
+ bool IsEntryBB = &BB == &F.getEntryBlock();
+ DebugLoc EntryLoc;
+ if (IsEntryBB) {
+ if (auto SP = F.getSubprogram())
EntryLoc = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
- // Keep static allocas and llvm.localescape calls in the entry block. Even
- // if we aren't splitting the block, it's nice for allocas to be before
- // calls.
- IP = PrepareToSplitEntryBlock(BB, IP);
- } else {
- EntryLoc = IP->getDebugLoc();
- }
-
- IRBuilder<> IRB(&*IP);
- IRB.SetCurrentDebugLocation(EntryLoc);
- if (Options.TracePC) {
- IRB.CreateCall(SanCovTracePC)
- ->setCannotMerge(); // gets the PC using GET_CALLER_PC.
- }
- if (Options.TracePCGuard) {
- auto GuardPtr = IRB.CreateIntToPtr(
- IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
- ConstantInt::get(IntptrTy, Idx * 4)),
- Int32PtrTy);
- IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
- }
- if (Options.Inline8bitCounters) {
- auto CounterPtr = IRB.CreateGEP(
- Function8bitCounterArray->getValueType(), Function8bitCounterArray,
- {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
- auto Load = IRB.CreateLoad(Int8Ty, CounterPtr);
- auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
- auto Store = IRB.CreateStore(Inc, CounterPtr);
- SetNoSanitizeMetadata(Load);
- SetNoSanitizeMetadata(Store);
- }
- if (Options.InlineBoolFlag) {
- auto FlagPtr = IRB.CreateGEP(
- FunctionBoolArray->getValueType(), FunctionBoolArray,
- {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
- auto Load = IRB.CreateLoad(Int1Ty, FlagPtr);
- auto ThenTerm =
- SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false);
- IRBuilder<> ThenIRB(ThenTerm);
- auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
- SetNoSanitizeMetadata(Load);
- SetNoSanitizeMetadata(Store);
- }
- if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
- // Check stack depth. If it's the deepest so far, record it.
- Module *M = F.getParent();
- Function *GetFrameAddr = Intrinsic::getDeclaration(
- M, Intrinsic::frameaddress,
- IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
- auto FrameAddrPtr =
- IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)});
- auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
- auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
- auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
- auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false);
- IRBuilder<> ThenIRB(ThenTerm);
- auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
- SetNoSanitizeMetadata(LowestStack);
- SetNoSanitizeMetadata(Store);
- }
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionName(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatCOFF()) {
- if (Section == SanCovCountersSectionName)
- return ".SCOV$CM";
- if (Section == SanCovBoolFlagSectionName)
- return ".SCOV$BM";
- if (Section == SanCovPCsSectionName)
- return ".SCOVP$M";
- return ".SCOV$GM"; // For SanCovGuardsSectionName.
- }
- if (TargetTriple.isOSBinFormatMachO())
- return "__DATA,__" + Section;
- return "__" + Section;
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatMachO())
- return "\1section$start$__DATA$__" + Section;
- return "__start___" + Section;
-}
-
-std::string
-ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const {
- if (TargetTriple.isOSBinFormatMachO())
- return "\1section$end$__DATA$__" + Section;
- return "__stop___" + Section;
-}
-
-char ModuleSanitizerCoverageLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
- "Pass for instrumenting coverage on functions", false,
- false)
-ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
- const SanitizerCoverageOptions &Options,
- const std::vector<std::string> &AllowlistFiles,
- const std::vector<std::string> &BlocklistFiles) {
- return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
- BlocklistFiles);
-}
+ // Keep static allocas and llvm.localescape calls in the entry block. Even
+ // if we aren't splitting the block, it's nice for allocas to be before
+ // calls.
+ IP = PrepareToSplitEntryBlock(BB, IP);
+ } else {
+ EntryLoc = IP->getDebugLoc();
+ }
+
+ IRBuilder<> IRB(&*IP);
+ IRB.SetCurrentDebugLocation(EntryLoc);
+ if (Options.TracePC) {
+ IRB.CreateCall(SanCovTracePC)
+ ->setCannotMerge(); // gets the PC using GET_CALLER_PC.
+ }
+ if (Options.TracePCGuard) {
+ auto GuardPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
+ ConstantInt::get(IntptrTy, Idx * 4)),
+ Int32PtrTy);
+ IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
+ }
+ if (Options.Inline8bitCounters) {
+ auto CounterPtr = IRB.CreateGEP(
+ Function8bitCounterArray->getValueType(), Function8bitCounterArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(Int8Ty, CounterPtr);
+ auto Inc = IRB.CreateAdd(Load, ConstantInt::get(Int8Ty, 1));
+ auto Store = IRB.CreateStore(Inc, CounterPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
+ if (Options.InlineBoolFlag) {
+ auto FlagPtr = IRB.CreateGEP(
+ FunctionBoolArray->getValueType(), FunctionBoolArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(Int1Ty, FlagPtr);
+ auto ThenTerm =
+ SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false);
+ IRBuilder<> ThenIRB(ThenTerm);
+ auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
+ if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
+ // Check stack depth. If it's the deepest so far, record it.
+ Module *M = F.getParent();
+ Function *GetFrameAddr = Intrinsic::getDeclaration(
+ M, Intrinsic::frameaddress,
+ IRB.getInt8PtrTy(M->getDataLayout().getAllocaAddrSpace()));
+ auto FrameAddrPtr =
+ IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)});
+ auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
+ auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
+ auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
+ auto ThenTerm = SplitBlockAndInsertIfThen(IsStackLower, &*IP, false);
+ IRBuilder<> ThenIRB(ThenTerm);
+ auto Store = ThenIRB.CreateStore(FrameAddrInt, SanCovLowestStack);
+ SetNoSanitizeMetadata(LowestStack);
+ SetNoSanitizeMetadata(Store);
+ }
+}
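
Read together, the branches above boil down to a few cheap per-block actions at run time. Below is a minimal standalone C++ sketch of that effect for the inline modes only (Inline8bitCounters, InlineBoolFlag, StackDepth); the TracePC/TracePCGuard modes call into the runtime instead, and the names `Counters`, `Flags`, `LowestStack` are illustrative stand-ins, not the symbols the pass actually emits. The real instrumentation also tags these loads and stores with nosanitize metadata so other sanitizers ignore them.

#include <cstdint>

// Illustrative stand-ins for the per-function globals the pass places in
// dedicated sections (the sanitizer runtime finds the real ones via
// section start/stop symbols).
static uint8_t   Counters[128];                              // Options.Inline8bitCounters
static bool      Flags[128];                                 // Options.InlineBoolFlag
static uintptr_t LowestStack = ~static_cast<uintptr_t>(0);   // Options.StackDepth

inline void coverageHit(unsigned Idx, const void *FrameAddr) {
  Counters[Idx] = Counters[Idx] + 1;   // racy 8-bit hit counter: load, add 1, store
  if (!Flags[Idx])                     // load, test, then a set-once store,
    Flags[Idx] = true;                 // mirroring the guarded bool-flag branch above
  auto FA = reinterpret_cast<uintptr_t>(FrameAddr);
  if (FA < LowestStack)                // a deeper frame has a lower address
    LowestStack = FA;                  // record the deepest stack seen so far
}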
+
+std::string
+ModuleSanitizerCoverage::getSectionName(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatCOFF()) {
+ if (Section == SanCovCountersSectionName)
+ return ".SCOV$CM";
+ if (Section == SanCovBoolFlagSectionName)
+ return ".SCOV$BM";
+ if (Section == SanCovPCsSectionName)
+ return ".SCOVP$M";
+ return ".SCOV$GM"; // For SanCovGuardsSectionName.
+ }
+ if (TargetTriple.isOSBinFormatMachO())
+ return "__DATA,__" + Section;
+ return "__" + Section;
+}
+
+std::string
+ModuleSanitizerCoverage::getSectionStart(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$start$__DATA$__" + Section;
+ return "__start___" + Section;
+}
+
+std::string
+ModuleSanitizerCoverage::getSectionEnd(const std::string &Section) const {
+ if (TargetTriple.isOSBinFormatMachO())
+ return "\1section$end$__DATA$__" + Section;
+ return "__stop___" + Section;
+}
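
To make the three helpers concrete, here is a standalone restatement for one assumed section name, "sancov_cntrs" (the customary value of SanCovCountersSectionName; treat the exact string as an assumption, since its definition is not part of this hunk):

#include <string>

std::string elfSection(const std::string &S)   { return "__" + S; }          // "__sancov_cntrs"
std::string elfStart(const std::string &S)     { return "__start___" + S; }  // "__start___sancov_cntrs"
std::string elfStop(const std::string &S)      { return "__stop___" + S; }   // "__stop___sancov_cntrs"
std::string machoSection(const std::string &S) { return "__DATA,__" + S; }   // "__DATA,__sancov_cntrs"
// On COFF the pass instead emits grouped sections such as ".SCOV$CM"; the
// linker orders the $-suffixed groups, so no start/stop symbols are produced.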
+
+char ModuleSanitizerCoverageLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ModuleSanitizerCoverageLegacyPass, "sancov",
+ "Pass for instrumenting coverage on functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
+ "Pass for instrumenting coverage on functions", false,
+ false)
+ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
+ const SanitizerCoverageOptions &Options,
+ const std::vector<std::string> &AllowlistFiles,
+ const std::vector<std::string> &BlocklistFiles) {
+ return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
+ BlocklistFiles);
+}
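
A hedged usage sketch for this legacy-PM entry point follows; the headers and option fields are the usual LLVM 12 ones and are not introduced by this patch, so treat the field choices as illustrative.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Instrumentation/SanitizerCoverage.h"

void addSanCov(llvm::legacy::PassManagerBase &PM) {
  llvm::SanitizerCoverageOptions Opts;
  Opts.CoverageType = llvm::SanitizerCoverageOptions::SCK_Edge;
  Opts.Inline8bitCounters = true;
  // Empty allow/block lists mean "no special-cased source files".
  PM.add(llvm::createModuleSanitizerCoverageLegacyPassPass(
      Opts, /*AllowlistFiles=*/{}, /*BlocklistFiles=*/{}));
}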
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 0b53ff8a83..783878cf1e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -1,113 +1,113 @@
-//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of ThreadSanitizer, a race detector.
-//
-// The tool is under development; for details about previous versions, see
-// http://code.google.com/p/data-race-test
-//
-// The instrumentation phase is quite simple:
-// - Insert calls to run-time library before every memory access.
-// - Optimizations may apply to avoid instrumenting some of the accesses.
-// - Insert calls at function entry/exit.
-// The rest is handled by the run-time library.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
+//===-- ThreadSanitizer.cpp - race detector -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer, a race detector.
+//
+// The tool is under development; for details about previous versions, see
+// http://code.google.com/p/data-race-test
+//
+// The instrumentation phase is quite simple:
+// - Insert calls to run-time library before every memory access.
+// - Optimizations may apply to avoid instrumenting some of the accesses.
+// - Insert calls at function entry/exit.
+// The rest is handled by the run-time library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "tsan"
-
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "tsan"
+
static cl::opt<bool> ClInstrumentMemoryAccesses(
- "tsan-instrument-memory-accesses", cl::init(true),
- cl::desc("Instrument memory accesses"), cl::Hidden);
+ "tsan-instrument-memory-accesses", cl::init(true),
+ cl::desc("Instrument memory accesses"), cl::Hidden);
static cl::opt<bool>
ClInstrumentFuncEntryExit("tsan-instrument-func-entry-exit", cl::init(true),
cl::desc("Instrument function entry and exit"),
cl::Hidden);
static cl::opt<bool> ClHandleCxxExceptions(
- "tsan-handle-cxx-exceptions", cl::init(true),
- cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
- cl::Hidden);
+ "tsan-handle-cxx-exceptions", cl::init(true),
+ cl::desc("Handle C++ exceptions (insert cleanup blocks for unwinding)"),
+ cl::Hidden);
static cl::opt<bool> ClInstrumentAtomics("tsan-instrument-atomics",
cl::init(true),
cl::desc("Instrument atomics"),
cl::Hidden);
static cl::opt<bool> ClInstrumentMemIntrinsics(
- "tsan-instrument-memintrinsics", cl::init(true),
- cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+ "tsan-instrument-memintrinsics", cl::init(true),
+ cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
static cl::opt<bool> ClDistinguishVolatile(
- "tsan-distinguish-volatile", cl::init(false),
- cl::desc("Emit special instrumentation for accesses to volatiles"),
- cl::Hidden);
+ "tsan-distinguish-volatile", cl::init(false),
+ cl::desc("Emit special instrumentation for accesses to volatiles"),
+ cl::Hidden);
static cl::opt<bool> ClInstrumentReadBeforeWrite(
- "tsan-instrument-read-before-write", cl::init(false),
- cl::desc("Do not eliminate read instrumentation for read-before-writes"),
- cl::Hidden);
+ "tsan-instrument-read-before-write", cl::init(false),
+ cl::desc("Do not eliminate read instrumentation for read-before-writes"),
+ cl::Hidden);
static cl::opt<bool> ClCompoundReadBeforeWrite(
"tsan-compound-read-before-write", cl::init(false),
cl::desc("Emit special compound instrumentation for reads-before-writes"),
cl::Hidden);
-
-STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
-STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
-STATISTIC(NumOmittedReadsBeforeWrite,
- "Number of reads ignored due to following writes");
-STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
-STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
-STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
-STATISTIC(NumOmittedReadsFromConstantGlobals,
- "Number of reads from constant globals");
-STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
-STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
-
+
+STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
+STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
+STATISTIC(NumOmittedReadsBeforeWrite,
+ "Number of reads ignored due to following writes");
+STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size");
+STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes");
+STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads");
+STATISTIC(NumOmittedReadsFromConstantGlobals,
+ "Number of reads from constant globals");
+STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads");
+STATISTIC(NumOmittedNonCaptured, "Number of accesses ignored due to capturing");
+
const char kTsanModuleCtorName[] = "tsan.module_ctor";
const char kTsanInitName[] = "__tsan_init";
-
-namespace {
-
-/// ThreadSanitizer: instrument the code in module to find races.
-///
-/// Instantiating ThreadSanitizer inserts the tsan runtime library API function
-/// declarations into the module if they don't exist already. Instantiating
-/// ensures the __tsan_init function is in the list of global constructors for
-/// the module.
-struct ThreadSanitizer {
+
+namespace {
+
+/// ThreadSanitizer: instrument the code in module to find races.
+///
+/// Instantiating ThreadSanitizer inserts the tsan runtime library API function
+/// declarations into the module if they don't exist already. Instantiating
+/// ensures the __tsan_init function is in the list of global constructors for
+/// the module.
+struct ThreadSanitizer {
ThreadSanitizer() {
// Sanity check options and warn user.
if (ClInstrumentReadBeforeWrite && ClCompoundReadBeforeWrite) {
@@ -117,9 +117,9 @@ struct ThreadSanitizer {
}
}
- bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
-
-private:
+ bool sanitizeFunction(Function &F, const TargetLibraryInfo &TLI);
+
+private:
// Internal Instruction wrapper that contains more information about the
// Instruction from prior analysis.
struct InstructionInfo {
@@ -133,172 +133,172 @@ private:
unsigned Flags = 0;
};
- void initialize(Module &M);
+ void initialize(Module &M);
bool instrumentLoadOrStore(const InstructionInfo &II, const DataLayout &DL);
- bool instrumentAtomic(Instruction *I, const DataLayout &DL);
- bool instrumentMemIntrinsic(Instruction *I);
- void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
+ bool instrumentAtomic(Instruction *I, const DataLayout &DL);
+ bool instrumentMemIntrinsic(Instruction *I);
+ void chooseInstructionsToInstrument(SmallVectorImpl<Instruction *> &Local,
SmallVectorImpl<InstructionInfo> &All,
- const DataLayout &DL);
- bool addrPointsToConstantData(Value *Addr);
- int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
- void InsertRuntimeIgnores(Function &F);
-
- Type *IntptrTy;
- FunctionCallee TsanFuncEntry;
- FunctionCallee TsanFuncExit;
- FunctionCallee TsanIgnoreBegin;
- FunctionCallee TsanIgnoreEnd;
-  // Access sizes are powers of two: 1, 2, 4, 8, 16.
- static const size_t kNumberOfAccessSizes = 5;
- FunctionCallee TsanRead[kNumberOfAccessSizes];
- FunctionCallee TsanWrite[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes];
- FunctionCallee TsanVolatileRead[kNumberOfAccessSizes];
- FunctionCallee TsanVolatileWrite[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedVolatileRead[kNumberOfAccessSizes];
- FunctionCallee TsanUnalignedVolatileWrite[kNumberOfAccessSizes];
+ const DataLayout &DL);
+ bool addrPointsToConstantData(Value *Addr);
+ int getMemoryAccessFuncIndex(Value *Addr, const DataLayout &DL);
+ void InsertRuntimeIgnores(Function &F);
+
+ Type *IntptrTy;
+ FunctionCallee TsanFuncEntry;
+ FunctionCallee TsanFuncExit;
+ FunctionCallee TsanIgnoreBegin;
+ FunctionCallee TsanIgnoreEnd;
+  // Access sizes are powers of two: 1, 2, 4, 8, 16.
+ static const size_t kNumberOfAccessSizes = 5;
+ FunctionCallee TsanRead[kNumberOfAccessSizes];
+ FunctionCallee TsanWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileWrite[kNumberOfAccessSizes];
FunctionCallee TsanCompoundRW[kNumberOfAccessSizes];
FunctionCallee TsanUnalignedCompoundRW[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicStore[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1]
- [kNumberOfAccessSizes];
- FunctionCallee TsanAtomicCAS[kNumberOfAccessSizes];
- FunctionCallee TsanAtomicThreadFence;
- FunctionCallee TsanAtomicSignalFence;
- FunctionCallee TsanVptrUpdate;
- FunctionCallee TsanVptrLoad;
- FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
-};
-
-struct ThreadSanitizerLegacyPass : FunctionPass {
- ThreadSanitizerLegacyPass() : FunctionPass(ID) {
- initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- StringRef getPassName() const override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
- bool doInitialization(Module &M) override;
- static char ID; // Pass identification, replacement for typeid.
-private:
- Optional<ThreadSanitizer> TSan;
-};
-
-void insertModuleCtor(Module &M) {
- getOrCreateSanitizerCtorAndInitFunctions(
- M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
- /*InitArgs=*/{},
- // This callback is invoked when the functions are created the first
- // time. Hook them into the global ctors list in that case:
- [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
-}
-
-} // namespace
-
-PreservedAnalyses ThreadSanitizerPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- ThreadSanitizer TSan;
- if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-PreservedAnalyses ThreadSanitizerPass::run(Module &M,
- ModuleAnalysisManager &MAM) {
- insertModuleCtor(M);
- return PreservedAnalyses::none();
-}
-
-char ThreadSanitizerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan",
- "ThreadSanitizer: detects data races.", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan",
- "ThreadSanitizer: detects data races.", false, false)
-
-StringRef ThreadSanitizerLegacyPass::getPassName() const {
- return "ThreadSanitizerLegacyPass";
-}
-
-void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
-}
-
-bool ThreadSanitizerLegacyPass::doInitialization(Module &M) {
- insertModuleCtor(M);
- TSan.emplace();
- return true;
-}
-
-bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- TSan->sanitizeFunction(F, TLI);
- return true;
-}
-
-FunctionPass *llvm::createThreadSanitizerLegacyPassPass() {
- return new ThreadSanitizerLegacyPass();
-}
-
-void ThreadSanitizer::initialize(Module &M) {
- const DataLayout &DL = M.getDataLayout();
- IntptrTy = DL.getIntPtrType(M.getContext());
-
- IRBuilder<> IRB(M.getContext());
- AttributeList Attr;
- Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- // Initialize the callbacks.
- TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr,
- IRB.getVoidTy(), IRB.getInt8PtrTy());
- TsanFuncExit =
- M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy());
- TsanIgnoreBegin = M.getOrInsertFunction("__tsan_ignore_thread_begin", Attr,
- IRB.getVoidTy());
- TsanIgnoreEnd =
- M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy());
- IntegerType *OrdTy = IRB.getInt32Ty();
- for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
- const unsigned ByteSize = 1U << i;
- const unsigned BitSize = ByteSize * 8;
- std::string ByteSizeStr = utostr(ByteSize);
- std::string BitSizeStr = utostr(BitSize);
- SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
- TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy());
-
- SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
- TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
- TsanUnalignedRead[i] = M.getOrInsertFunction(
- UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
- TsanUnalignedWrite[i] = M.getOrInsertFunction(
- UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> VolatileReadName("__tsan_volatile_read" + ByteSizeStr);
- TsanVolatileRead[i] = M.getOrInsertFunction(
- VolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> VolatileWriteName("__tsan_volatile_write" + ByteSizeStr);
- TsanVolatileWrite[i] = M.getOrInsertFunction(
- VolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedVolatileReadName("__tsan_unaligned_volatile_read" +
- ByteSizeStr);
- TsanUnalignedVolatileRead[i] = M.getOrInsertFunction(
- UnalignedVolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
- SmallString<64> UnalignedVolatileWriteName(
- "__tsan_unaligned_volatile_write" + ByteSizeStr);
- TsanUnalignedVolatileWrite[i] = M.getOrInsertFunction(
- UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
-
+ FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicStore[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1]
+ [kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicCAS[kNumberOfAccessSizes];
+ FunctionCallee TsanAtomicThreadFence;
+ FunctionCallee TsanAtomicSignalFence;
+ FunctionCallee TsanVptrUpdate;
+ FunctionCallee TsanVptrLoad;
+ FunctionCallee MemmoveFn, MemcpyFn, MemsetFn;
+};
+
+struct ThreadSanitizerLegacyPass : FunctionPass {
+ ThreadSanitizerLegacyPass() : FunctionPass(ID) {
+ initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ StringRef getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+ bool doInitialization(Module &M) override;
+ static char ID; // Pass identification, replacement for typeid.
+private:
+ Optional<ThreadSanitizer> TSan;
+};
+
+void insertModuleCtor(Module &M) {
+ getOrCreateSanitizerCtorAndInitFunctions(
+ M, kTsanModuleCtorName, kTsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{},
+ // This callback is invoked when the functions are created the first
+ // time. Hook them into the global ctors list in that case:
+ [&](Function *Ctor, FunctionCallee) { appendToGlobalCtors(M, Ctor, 0); });
+}
+
+} // namespace
+
+PreservedAnalyses ThreadSanitizerPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ ThreadSanitizer TSan;
+ if (TSan.sanitizeFunction(F, FAM.getResult<TargetLibraryAnalysis>(F)))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses ThreadSanitizerPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ insertModuleCtor(M);
+ return PreservedAnalyses::none();
+}
+
+char ThreadSanitizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ThreadSanitizerLegacyPass, "tsan",
+ "ThreadSanitizer: detects data races.", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(ThreadSanitizerLegacyPass, "tsan",
+ "ThreadSanitizer: detects data races.", false, false)
+
+StringRef ThreadSanitizerLegacyPass::getPassName() const {
+ return "ThreadSanitizerLegacyPass";
+}
+
+void ThreadSanitizerLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+bool ThreadSanitizerLegacyPass::doInitialization(Module &M) {
+ insertModuleCtor(M);
+ TSan.emplace();
+ return true;
+}
+
+bool ThreadSanitizerLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ TSan->sanitizeFunction(F, TLI);
+ return true;
+}
+
+FunctionPass *llvm::createThreadSanitizerLegacyPassPass() {
+ return new ThreadSanitizerLegacyPass();
+}
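
For comparison with the legacy wiring above, here is a sketch of how the new-PM entry points shown earlier in this file are typically scheduled; the adaptor and header names are standard LLVM 12 API, not something this change adds.

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Instrumentation/ThreadSanitizer.h"

void addTsan(llvm::ModulePassManager &MPM) {
  // Module-level run(): inserts tsan.module_ctor calling __tsan_init.
  MPM.addPass(llvm::ThreadSanitizerPass());
  // Function-level run(): instruments each function's memory accesses.
  MPM.addPass(llvm::createModuleToFunctionPassAdaptor(llvm::ThreadSanitizerPass()));
}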
+
+void ThreadSanitizer::initialize(Module &M) {
+ const DataLayout &DL = M.getDataLayout();
+ IntptrTy = DL.getIntPtrType(M.getContext());
+
+ IRBuilder<> IRB(M.getContext());
+ AttributeList Attr;
+ Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ // Initialize the callbacks.
+ TsanFuncEntry = M.getOrInsertFunction("__tsan_func_entry", Attr,
+ IRB.getVoidTy(), IRB.getInt8PtrTy());
+ TsanFuncExit =
+ M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy());
+ TsanIgnoreBegin = M.getOrInsertFunction("__tsan_ignore_thread_begin", Attr,
+ IRB.getVoidTy());
+ TsanIgnoreEnd =
+ M.getOrInsertFunction("__tsan_ignore_thread_end", Attr, IRB.getVoidTy());
+ IntegerType *OrdTy = IRB.getInt32Ty();
+ for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
+ const unsigned ByteSize = 1U << i;
+ const unsigned BitSize = ByteSize * 8;
+ std::string ByteSizeStr = utostr(ByteSize);
+ std::string BitSizeStr = utostr(BitSize);
+ SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
+ TsanRead[i] = M.getOrInsertFunction(ReadName, Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy());
+
+ SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
+ TsanWrite[i] = M.getOrInsertFunction(WriteName, Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
+ TsanUnalignedRead[i] = M.getOrInsertFunction(
+ UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
+ TsanUnalignedWrite[i] = M.getOrInsertFunction(
+ UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> VolatileReadName("__tsan_volatile_read" + ByteSizeStr);
+ TsanVolatileRead[i] = M.getOrInsertFunction(
+ VolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> VolatileWriteName("__tsan_volatile_write" + ByteSizeStr);
+ TsanVolatileWrite[i] = M.getOrInsertFunction(
+ VolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileReadName("__tsan_unaligned_volatile_read" +
+ ByteSizeStr);
+ TsanUnalignedVolatileRead[i] = M.getOrInsertFunction(
+ UnalignedVolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileWriteName(
+ "__tsan_unaligned_volatile_write" + ByteSizeStr);
+ TsanUnalignedVolatileWrite[i] = M.getOrInsertFunction(
+ UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
SmallString<64> CompoundRWName("__tsan_read_write" + ByteSizeStr);
TsanCompoundRW[i] = M.getOrInsertFunction(
CompoundRWName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
@@ -308,145 +308,145 @@ void ThreadSanitizer::initialize(Module &M) {
TsanUnalignedCompoundRW[i] = M.getOrInsertFunction(
UnalignedCompoundRWName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
- Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
- TsanAtomicLoad[i] =
- M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy);
-
- SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
- TsanAtomicStore[i] = M.getOrInsertFunction(
- AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy);
-
- for (unsigned Op = AtomicRMWInst::FIRST_BINOP;
- Op <= AtomicRMWInst::LAST_BINOP; ++Op) {
- TsanAtomicRMW[Op][i] = nullptr;
- const char *NamePart = nullptr;
- if (Op == AtomicRMWInst::Xchg)
- NamePart = "_exchange";
- else if (Op == AtomicRMWInst::Add)
- NamePart = "_fetch_add";
- else if (Op == AtomicRMWInst::Sub)
- NamePart = "_fetch_sub";
- else if (Op == AtomicRMWInst::And)
- NamePart = "_fetch_and";
- else if (Op == AtomicRMWInst::Or)
- NamePart = "_fetch_or";
- else if (Op == AtomicRMWInst::Xor)
- NamePart = "_fetch_xor";
- else if (Op == AtomicRMWInst::Nand)
- NamePart = "_fetch_nand";
- else
- continue;
- SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
- TsanAtomicRMW[Op][i] =
- M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy);
- }
-
- SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
- "_compare_exchange_val");
- TsanAtomicCAS[i] = M.getOrInsertFunction(AtomicCASName, Attr, Ty, PtrTy, Ty,
- Ty, OrdTy, OrdTy);
- }
- TsanVptrUpdate =
- M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy());
- TsanVptrLoad = M.getOrInsertFunction("__tsan_vptr_read", Attr,
- IRB.getVoidTy(), IRB.getInt8PtrTy());
- TsanAtomicThreadFence = M.getOrInsertFunction("__tsan_atomic_thread_fence",
- Attr, IRB.getVoidTy(), OrdTy);
- TsanAtomicSignalFence = M.getOrInsertFunction("__tsan_atomic_signal_fence",
- Attr, IRB.getVoidTy(), OrdTy);
-
- MemmoveFn =
- M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
- MemcpyFn =
- M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
- MemsetFn =
- M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(),
- IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
-}
-
-static bool isVtableAccess(Instruction *I) {
- if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
- return Tag->isTBAAVtableAccess();
- return false;
-}
-
-// Do not instrument known races/"benign races" that come from compiler
-// instrumentation. The user has no way of suppressing them.
-static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
- // Peel off GEPs and BitCasts.
- Addr = Addr->stripInBoundsOffsets();
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- if (GV->hasSection()) {
- StringRef SectionName = GV->getSection();
- // Check if the global is in the PGO counters section.
- auto OF = Triple(M->getTargetTriple()).getObjectFormat();
- if (SectionName.endswith(
- getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
- return false;
- }
-
- // Check if the global is private gcov data.
- if (GV->getName().startswith("__llvm_gcov") ||
- GV->getName().startswith("__llvm_gcda"))
- return false;
- }
-
-  // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- if (Addr) {
- Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return false;
- }
-
- return true;
-}
-
-bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
- // If this is a GEP, just analyze its pointer operand.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
- Addr = GEP->getPointerOperand();
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
- if (GV->isConstant()) {
- // Reads from constant globals can not race with any writes.
- NumOmittedReadsFromConstantGlobals++;
- return true;
- }
- } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
- if (isVtableAccess(L)) {
- // Reads from a vtable pointer can not race with any writes.
- NumOmittedReadsFromVtable++;
- return true;
- }
- }
- return false;
-}
-
-// Instrumenting some of the accesses may be proven redundant.
-// Currently handled:
-// - read-before-write (within same BB, no calls between)
-// - not captured variables
-//
-// We do not handle some of the patterns that should not survive
-// after the classic compiler optimizations.
-// E.g. two reads from the same temp should be eliminated by CSE,
-// two writes should be eliminated by DSE, etc.
-//
-// 'Local' is a vector of insns within the same BB (no calls between).
-// 'All' is a vector of insns that will be instrumented.
-void ThreadSanitizer::chooseInstructionsToInstrument(
+ Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
+ TsanAtomicLoad[i] =
+ M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy);
+
+ SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
+ TsanAtomicStore[i] = M.getOrInsertFunction(
+ AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy);
+
+ for (unsigned Op = AtomicRMWInst::FIRST_BINOP;
+ Op <= AtomicRMWInst::LAST_BINOP; ++Op) {
+ TsanAtomicRMW[Op][i] = nullptr;
+ const char *NamePart = nullptr;
+ if (Op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (Op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (Op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (Op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (Op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (Op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else if (Op == AtomicRMWInst::Nand)
+ NamePart = "_fetch_nand";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[Op][i] =
+ M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy);
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = M.getOrInsertFunction(AtomicCASName, Attr, Ty, PtrTy, Ty,
+ Ty, OrdTy, OrdTy);
+ }
+ TsanVptrUpdate =
+ M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy());
+ TsanVptrLoad = M.getOrInsertFunction("__tsan_vptr_read", Attr,
+ IRB.getVoidTy(), IRB.getInt8PtrTy());
+ TsanAtomicThreadFence = M.getOrInsertFunction("__tsan_atomic_thread_fence",
+ Attr, IRB.getVoidTy(), OrdTy);
+ TsanAtomicSignalFence = M.getOrInsertFunction("__tsan_atomic_signal_fence",
+ Attr, IRB.getVoidTy(), OrdTy);
+
+ MemmoveFn =
+ M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+ MemcpyFn =
+ M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy);
+ MemsetFn =
+ M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(),
+ IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy);
+}
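
The loop above builds one family of callbacks per access size; the tiny runnable illustration below only restates the naming scheme it produces (index i covers an access of 1 << i bytes).

#include <cstdio>

int main() {
  for (int i = 0; i < 5; ++i) {                  // kNumberOfAccessSizes == 5
    unsigned Bytes = 1u << i, Bits = Bytes * 8;
    std::printf("__tsan_read%u __tsan_write%u __tsan_atomic%u_load\n",
                Bytes, Bytes, Bits);             // e.g. __tsan_read4 ... __tsan_atomic32_load
  }
  return 0;
}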
+
+static bool isVtableAccess(Instruction *I) {
+ if (MDNode *Tag = I->getMetadata(LLVMContext::MD_tbaa))
+ return Tag->isTBAAVtableAccess();
+ return false;
+}
+
+// Do not instrument known races/"benign races" that come from compiler
+// instrumentation. The user has no way of suppressing them.
+static bool shouldInstrumentReadWriteFromAddress(const Module *M, Value *Addr) {
+ // Peel off GEPs and BitCasts.
+ Addr = Addr->stripInBoundsOffsets();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->hasSection()) {
+ StringRef SectionName = GV->getSection();
+ // Check if the global is in the PGO counters section.
+ auto OF = Triple(M->getTargetTriple()).getObjectFormat();
+ if (SectionName.endswith(
+ getInstrProfSectionName(IPSK_cnts, OF, /*AddSegmentInfo=*/false)))
+ return false;
+ }
+
+ // Check if the global is private gcov data.
+ if (GV->getName().startswith("__llvm_gcov") ||
+ GV->getName().startswith("__llvm_gcda"))
+ return false;
+ }
+
+  // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ if (Addr) {
+ Type *PtrTy = cast<PointerType>(Addr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return false;
+ }
+
+ return true;
+}
+
+bool ThreadSanitizer::addrPointsToConstantData(Value *Addr) {
+ // If this is a GEP, just analyze its pointer operand.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Addr))
+ Addr = GEP->getPointerOperand();
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Addr)) {
+ if (GV->isConstant()) {
+ // Reads from constant globals can not race with any writes.
+ NumOmittedReadsFromConstantGlobals++;
+ return true;
+ }
+ } else if (LoadInst *L = dyn_cast<LoadInst>(Addr)) {
+ if (isVtableAccess(L)) {
+ // Reads from a vtable pointer can not race with any writes.
+ NumOmittedReadsFromVtable++;
+ return true;
+ }
+ }
+ return false;
+}
+
+// Instrumenting some of the accesses may be proven redundant.
+// Currently handled:
+// - read-before-write (within same BB, no calls between)
+// - not captured variables
+//
+// We do not handle some of the patterns that should not survive
+// after the classic compiler optimizations.
+// E.g. two reads from the same temp should be eliminated by CSE,
+// two writes should be eliminated by DSE, etc.
+//
+// 'Local' is a vector of insns within the same BB (no calls between).
+// 'All' is a vector of insns that will be instrumented.
+void ThreadSanitizer::chooseInstructionsToInstrument(
SmallVectorImpl<Instruction *> &Local,
SmallVectorImpl<InstructionInfo> &All, const DataLayout &DL) {
DenseMap<Value *, size_t> WriteTargets; // Map of addresses to index in All
- // Iterate from the end.
- for (Instruction *I : reverse(Local)) {
+ // Iterate from the end.
+ for (Instruction *I : reverse(Local)) {
const bool IsWrite = isa<StoreInst>(*I);
Value *Addr = IsWrite ? cast<StoreInst>(I)->getPointerOperand()
: cast<LoadInst>(I)->getPointerOperand();
@@ -470,22 +470,22 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
NumOmittedReadsBeforeWrite++;
continue;
}
- }
+ }
- if (addrPointsToConstantData(Addr)) {
- // Addr points to some constant data -- it can not race with any writes.
- continue;
- }
- }
+ if (addrPointsToConstantData(Addr)) {
+ // Addr points to some constant data -- it can not race with any writes.
+ continue;
+ }
+ }
if (isa<AllocaInst>(getUnderlyingObject(Addr)) &&
- !PointerMayBeCaptured(Addr, true, true)) {
- // The variable is addressable but not captured, so it cannot be
- // referenced from a different thread and participate in a data race
- // (see llvm/Analysis/CaptureTracking.h for details).
- NumOmittedNonCaptured++;
- continue;
- }
+ !PointerMayBeCaptured(Addr, true, true)) {
+ // The variable is addressable but not captured, so it cannot be
+ // referenced from a different thread and participate in a data race
+ // (see llvm/Analysis/CaptureTracking.h for details).
+ NumOmittedNonCaptured++;
+ continue;
+ }
// Instrument this instruction.
All.emplace_back(I);
@@ -494,160 +494,160 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
// write target, and we can override any previous entry if it exists.
WriteTargets[Addr] = All.size() - 1;
}
- }
- Local.clear();
-}
-
-static bool isAtomic(Instruction *I) {
- // TODO: Ask TTI whether synchronization scope is between threads.
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
- if (isa<AtomicRMWInst>(I))
- return true;
- if (isa<AtomicCmpXchgInst>(I))
- return true;
- if (isa<FenceInst>(I))
- return true;
- return false;
-}
-
-void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
- IRB.CreateCall(TsanIgnoreBegin);
- EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
- while (IRBuilder<> *AtExit = EE.Next()) {
- AtExit->CreateCall(TsanIgnoreEnd);
- }
-}
-
-bool ThreadSanitizer::sanitizeFunction(Function &F,
- const TargetLibraryInfo &TLI) {
- // This is required to prevent instrumenting call to __tsan_init from within
- // the module constructor.
- if (F.getName() == kTsanModuleCtorName)
- return false;
- // Naked functions can not have prologue/epilogue
- // (__tsan_func_entry/__tsan_func_exit) generated, so don't instrument them at
- // all.
- if (F.hasFnAttribute(Attribute::Naked))
- return false;
- initialize(*F.getParent());
+ }
+ Local.clear();
+}
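
A small source-level example of the read-before-write elision described in the comment above; whether the read is actually dropped also depends on -tsan-instrument-read-before-write and on the constant/capture checks, so take it as a typical case rather than a guarantee.

int Shared;  // a global is captured, so only the read-before-write rule applies

void bump() {
  int Tmp = Shared;   // read followed by a write to the same address in the
  Shared = Tmp + 1;   // same basic block: only __tsan_write4(&Shared) is kept
}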
+
+static bool isAtomic(Instruction *I) {
+ // TODO: Ask TTI whether synchronization scope is between threads.
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isAtomic() && LI->getSyncScopeID() != SyncScope::SingleThread;
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isAtomic() && SI->getSyncScopeID() != SyncScope::SingleThread;
+ if (isa<AtomicRMWInst>(I))
+ return true;
+ if (isa<AtomicCmpXchgInst>(I))
+ return true;
+ if (isa<FenceInst>(I))
+ return true;
+ return false;
+}
+
+void ThreadSanitizer::InsertRuntimeIgnores(Function &F) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ IRB.CreateCall(TsanIgnoreBegin);
+ EscapeEnumerator EE(F, "tsan_ignore_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanIgnoreEnd);
+ }
+}
+
+bool ThreadSanitizer::sanitizeFunction(Function &F,
+ const TargetLibraryInfo &TLI) {
+ // This is required to prevent instrumenting call to __tsan_init from within
+ // the module constructor.
+ if (F.getName() == kTsanModuleCtorName)
+ return false;
+ // Naked functions can not have prologue/epilogue
+ // (__tsan_func_entry/__tsan_func_exit) generated, so don't instrument them at
+ // all.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
+ initialize(*F.getParent());
SmallVector<InstructionInfo, 8> AllLoadsAndStores;
- SmallVector<Instruction*, 8> LocalLoadsAndStores;
- SmallVector<Instruction*, 8> AtomicAccesses;
- SmallVector<Instruction*, 8> MemIntrinCalls;
- bool Res = false;
- bool HasCalls = false;
- bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- // Traverse all instructions, collect loads/stores/returns, check for calls.
- for (auto &BB : F) {
- for (auto &Inst : BB) {
- if (isAtomic(&Inst))
- AtomicAccesses.push_back(&Inst);
- else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
- LocalLoadsAndStores.push_back(&Inst);
- else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
- if (CallInst *CI = dyn_cast<CallInst>(&Inst))
- maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
- if (isa<MemIntrinsic>(Inst))
- MemIntrinCalls.push_back(&Inst);
- HasCalls = true;
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
- DL);
- }
- }
- chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
- }
-
- // We have collected all loads and stores.
- // FIXME: many of these accesses do not need to be checked for races
- // (e.g. variables that do not escape, etc).
-
- // Instrument memory accesses only if we want to report bugs in the function.
- if (ClInstrumentMemoryAccesses && SanitizeFunction)
+ SmallVector<Instruction*, 8> LocalLoadsAndStores;
+ SmallVector<Instruction*, 8> AtomicAccesses;
+ SmallVector<Instruction*, 8> MemIntrinCalls;
+ bool Res = false;
+ bool HasCalls = false;
+ bool SanitizeFunction = F.hasFnAttribute(Attribute::SanitizeThread);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Traverse all instructions, collect loads/stores/returns, check for calls.
+ for (auto &BB : F) {
+ for (auto &Inst : BB) {
+ if (isAtomic(&Inst))
+ AtomicAccesses.push_back(&Inst);
+ else if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ LocalLoadsAndStores.push_back(&Inst);
+ else if (isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) {
+ if (CallInst *CI = dyn_cast<CallInst>(&Inst))
+ maybeMarkSanitizerLibraryCallNoBuiltin(CI, &TLI);
+ if (isa<MemIntrinsic>(Inst))
+ MemIntrinCalls.push_back(&Inst);
+ HasCalls = true;
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores,
+ DL);
+ }
+ }
+ chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores, DL);
+ }
+
+ // We have collected all loads and stores.
+ // FIXME: many of these accesses do not need to be checked for races
+ // (e.g. variables that do not escape, etc).
+
+ // Instrument memory accesses only if we want to report bugs in the function.
+ if (ClInstrumentMemoryAccesses && SanitizeFunction)
for (const auto &II : AllLoadsAndStores) {
Res |= instrumentLoadOrStore(II, DL);
- }
-
- // Instrument atomic memory accesses in any case (they can be used to
- // implement synchronization).
- if (ClInstrumentAtomics)
- for (auto Inst : AtomicAccesses) {
- Res |= instrumentAtomic(Inst, DL);
- }
-
- if (ClInstrumentMemIntrinsics && SanitizeFunction)
- for (auto Inst : MemIntrinCalls) {
- Res |= instrumentMemIntrinsic(Inst);
- }
-
- if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
- assert(!F.hasFnAttribute(Attribute::SanitizeThread));
- if (HasCalls)
- InsertRuntimeIgnores(F);
- }
-
- // Instrument function entry/exit points if there were instrumented accesses.
- if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
- IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
- Value *ReturnAddress = IRB.CreateCall(
- Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
- IRB.getInt32(0));
- IRB.CreateCall(TsanFuncEntry, ReturnAddress);
-
- EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
- while (IRBuilder<> *AtExit = EE.Next()) {
- AtExit->CreateCall(TsanFuncExit, {});
- }
- Res = true;
- }
- return Res;
-}
-
+ }
+
+ // Instrument atomic memory accesses in any case (they can be used to
+ // implement synchronization).
+ if (ClInstrumentAtomics)
+ for (auto Inst : AtomicAccesses) {
+ Res |= instrumentAtomic(Inst, DL);
+ }
+
+ if (ClInstrumentMemIntrinsics && SanitizeFunction)
+ for (auto Inst : MemIntrinCalls) {
+ Res |= instrumentMemIntrinsic(Inst);
+ }
+
+ if (F.hasFnAttribute("sanitize_thread_no_checking_at_run_time")) {
+ assert(!F.hasFnAttribute(Attribute::SanitizeThread));
+ if (HasCalls)
+ InsertRuntimeIgnores(F);
+ }
+
+ // Instrument function entry/exit points if there were instrumented accesses.
+ if ((Res || HasCalls) && ClInstrumentFuncEntryExit) {
+ IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+ Value *ReturnAddress = IRB.CreateCall(
+ Intrinsic::getDeclaration(F.getParent(), Intrinsic::returnaddress),
+ IRB.getInt32(0));
+ IRB.CreateCall(TsanFuncEntry, ReturnAddress);
+
+ EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions);
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ AtExit->CreateCall(TsanFuncExit, {});
+ }
+ Res = true;
+ }
+ return Res;
+}
+
bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II,
- const DataLayout &DL) {
+ const DataLayout &DL) {
IRBuilder<> IRB(II.Inst);
const bool IsWrite = isa<StoreInst>(*II.Inst);
Value *Addr = IsWrite ? cast<StoreInst>(II.Inst)->getPointerOperand()
: cast<LoadInst>(II.Inst)->getPointerOperand();
-
- // swifterror memory addresses are mem2reg promoted by instruction selection.
- // As such they cannot have regular uses like an instrumentation function and
- // it makes no sense to track them as memory.
- if (Addr->isSwiftError())
- return false;
-
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
+
+ // swifterror memory addresses are mem2reg promoted by instruction selection.
+ // As such they cannot have regular uses like an instrumentation function and
+ // it makes no sense to track them as memory.
+ if (Addr->isSwiftError())
+ return false;
+
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
if (IsWrite && isVtableAccess(II.Inst)) {
LLVM_DEBUG(dbgs() << " VPTR : " << *II.Inst << "\n");
Value *StoredValue = cast<StoreInst>(II.Inst)->getValueOperand();
- // StoredValue may be a vector type if we are storing several vptrs at once.
- // In this case, just take the first element of the vector since this is
- // enough to find vptr races.
- if (isa<VectorType>(StoredValue->getType()))
- StoredValue = IRB.CreateExtractElement(
- StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
- if (StoredValue->getType()->isIntegerTy())
- StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
- // Call TsanVptrUpdate.
- IRB.CreateCall(TsanVptrUpdate,
- {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
- NumInstrumentedVtableWrites++;
- return true;
- }
+ // StoredValue may be a vector type if we are storing several vptrs at once.
+ // In this case, just take the first element of the vector since this is
+ // enough to find vptr races.
+ if (isa<VectorType>(StoredValue->getType()))
+ StoredValue = IRB.CreateExtractElement(
+ StoredValue, ConstantInt::get(IRB.getInt32Ty(), 0));
+ if (StoredValue->getType()->isIntegerTy())
+ StoredValue = IRB.CreateIntToPtr(StoredValue, IRB.getInt8PtrTy());
+ // Call TsanVptrUpdate.
+ IRB.CreateCall(TsanVptrUpdate,
+ {IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(StoredValue, IRB.getInt8PtrTy())});
+ NumInstrumentedVtableWrites++;
+ return true;
+ }
if (!IsWrite && isVtableAccess(II.Inst)) {
- IRB.CreateCall(TsanVptrLoad,
- IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
- NumInstrumentedVtableReads++;
- return true;
- }
+ IRB.CreateCall(TsanVptrLoad,
+ IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ NumInstrumentedVtableReads++;
+ return true;
+ }
const unsigned Alignment = IsWrite ? cast<StoreInst>(II.Inst)->getAlignment()
: cast<LoadInst>(II.Inst)->getAlignment();
@@ -658,191 +658,191 @@ bool ThreadSanitizer::instrumentLoadOrStore(const InstructionInfo &II,
: cast<LoadInst>(II.Inst)->isVolatile());
assert((!IsVolatile || !IsCompoundRW) && "Compound volatile invalid!");
- Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
- FunctionCallee OnAccessFunc = nullptr;
- if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) {
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ FunctionCallee OnAccessFunc = nullptr;
+ if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) {
if (IsCompoundRW)
OnAccessFunc = TsanCompoundRW[Idx];
else if (IsVolatile)
- OnAccessFunc = IsWrite ? TsanVolatileWrite[Idx] : TsanVolatileRead[Idx];
- else
- OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
- } else {
+ OnAccessFunc = IsWrite ? TsanVolatileWrite[Idx] : TsanVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+ } else {
if (IsCompoundRW)
OnAccessFunc = TsanUnalignedCompoundRW[Idx];
else if (IsVolatile)
- OnAccessFunc = IsWrite ? TsanUnalignedVolatileWrite[Idx]
- : TsanUnalignedVolatileRead[Idx];
- else
- OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
- }
- IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
+ OnAccessFunc = IsWrite ? TsanUnalignedVolatileWrite[Idx]
+ : TsanUnalignedVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+ }
+ IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
if (IsCompoundRW || IsWrite)
NumInstrumentedWrites++;
if (IsCompoundRW || !IsWrite)
NumInstrumentedReads++;
- return true;
-}
-
-static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
- uint32_t v = 0;
- switch (ord) {
- case AtomicOrdering::NotAtomic:
- llvm_unreachable("unexpected atomic ordering!");
- case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
- case AtomicOrdering::Monotonic: v = 0; break;
- // Not specified yet:
- // case AtomicOrdering::Consume: v = 1; break;
- case AtomicOrdering::Acquire: v = 2; break;
- case AtomicOrdering::Release: v = 3; break;
- case AtomicOrdering::AcquireRelease: v = 4; break;
- case AtomicOrdering::SequentiallyConsistent: v = 5; break;
- }
- return IRB->getInt32(v);
-}
-
-// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
-// So, we either need to ensure the intrinsic is not inlined, or instrument it.
-// We do not instrument memset/memmove/memcpy intrinsics (too complicated),
-// instead we simply replace them with regular function calls, which are then
-// intercepted by the run-time.
-// Since tsan is running after everyone else, the calls should not be
-// replaced back with intrinsics. If that becomes wrong at some point,
-// we will need to call e.g. __tsan_memset to avoid the intrinsics.
-bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
- IRBuilder<> IRB(I);
- if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
- IRB.CreateCall(
- MemsetFn,
- {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
- IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
- I->eraseFromParent();
- } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
- IRB.CreateCall(
- isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
- {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
- IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
- IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
- I->eraseFromParent();
- }
- return false;
-}
-
-// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
-// standards. For background see C++11 standard. A slightly older, publicly
-// available draft of the standard (not entirely up-to-date, but close enough
-// for casual browsing) is available here:
-// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
-// The following page contains more background information:
-// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
-
-bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
- IRBuilder<> IRB(I);
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Value *Addr = LI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- createOrdering(&IRB, LI->getOrdering())};
- Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
- Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
- Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
- I->replaceAllUsesWith(Cast);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- Value *Addr = SI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
- createOrdering(&IRB, SI->getOrdering())};
- CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
- ReplaceInstWithInst(I, C);
- } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
- Value *Addr = RMWI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- FunctionCallee F = TsanAtomicRMW[RMWI->getOperation()][Idx];
- if (!F)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
- createOrdering(&IRB, RMWI->getOrdering())};
- CallInst *C = CallInst::Create(F, Args);
- ReplaceInstWithInst(I, C);
- } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
- Value *Addr = CASI->getPointerOperand();
- int Idx = getMemoryAccessFuncIndex(Addr, DL);
- if (Idx < 0)
- return false;
- const unsigned ByteSize = 1U << Idx;
- const unsigned BitSize = ByteSize * 8;
- Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
- Type *PtrTy = Ty->getPointerTo();
- Value *CmpOperand =
- IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
- Value *NewOperand =
- IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
- Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
- CmpOperand,
- NewOperand,
- createOrdering(&IRB, CASI->getSuccessOrdering()),
- createOrdering(&IRB, CASI->getFailureOrdering())};
- CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
- Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
- Value *OldVal = C;
- Type *OrigOldValTy = CASI->getNewValOperand()->getType();
- if (Ty != OrigOldValTy) {
- // The value is a pointer, so we need to cast the return value.
- OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
- }
-
- Value *Res =
- IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
- Res = IRB.CreateInsertValue(Res, Success, 1);
-
- I->replaceAllUsesWith(Res);
- I->eraseFromParent();
- } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
- Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
- FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
- ? TsanAtomicSignalFence
- : TsanAtomicThreadFence;
- CallInst *C = CallInst::Create(F, Args);
- ReplaceInstWithInst(I, C);
- }
- return true;
-}
-
-int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
- const DataLayout &DL) {
- Type *OrigPtrTy = Addr->getType();
- Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
- assert(OrigTy->isSized());
- uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
- if (TypeSize != 8 && TypeSize != 16 &&
- TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
- NumAccessesWithBadSize++;
- // Ignore all unusual sizes.
- return -1;
- }
- size_t Idx = countTrailingZeros(TypeSize / 8);
- assert(Idx < kNumberOfAccessSizes);
- return Idx;
-}
+ return true;
+}
+
+static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
+ uint32_t v = 0;
+ switch (ord) {
+ case AtomicOrdering::NotAtomic:
+ llvm_unreachable("unexpected atomic ordering!");
+ case AtomicOrdering::Unordered: LLVM_FALLTHROUGH;
+ case AtomicOrdering::Monotonic: v = 0; break;
+ // Not specified yet:
+ // case AtomicOrdering::Consume: v = 1; break;
+ case AtomicOrdering::Acquire: v = 2; break;
+ case AtomicOrdering::Release: v = 3; break;
+ case AtomicOrdering::AcquireRelease: v = 4; break;
+ case AtomicOrdering::SequentiallyConsistent: v = 5; break;
+ }
+ return IRB->getInt32(v);
+}
+
+// If a memset intrinsic gets inlined by the code gen, we will miss races on it.
+// So, we either need to ensure the intrinsic is not inlined, or instrument it.
+// We do not instrument memset/memmove/memcpy intrinsics (too complicated);
+// instead we simply replace them with regular function calls, which are then
+// intercepted by the run-time.
+// Since tsan is running after everyone else, the calls should not be
+// replaced back with intrinsics. If that becomes wrong at some point,
+// we will need to call e.g. __tsan_memset to avoid the intrinsics.
+bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) {
+ IRBuilder<> IRB(I);
+ if (MemSetInst *M = dyn_cast<MemSetInst>(I)) {
+ IRB.CreateCall(
+ MemsetFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) {
+ IRB.CreateCall(
+ isa<MemCpyInst>(M) ? MemcpyFn : MemmoveFn,
+ {IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()),
+ IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()),
+ IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)});
+ I->eraseFromParent();
+ }
+ return false;
+}
+
+// Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x
+// standards. For background see C++11 standard. A slightly older, publicly
+// available draft of the standard (not entirely up-to-date, but close enough
+// for casual browsing) is available here:
+// http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2011/n3242.pdf
+// The following page contains more background information:
+// http://www.hpl.hp.com/personal/Hans_Boehm/c++mm/
+
+bool ThreadSanitizer::instrumentAtomic(Instruction *I, const DataLayout &DL) {
+ IRBuilder<> IRB(I);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Value *Addr = LI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ createOrdering(&IRB, LI->getOrdering())};
+ Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
+ Value *C = IRB.CreateCall(TsanAtomicLoad[Idx], Args);
+ Value *Cast = IRB.CreateBitOrPointerCast(C, OrigTy);
+ I->replaceAllUsesWith(Cast);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Addr = SI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateBitOrPointerCast(SI->getValueOperand(), Ty),
+ createOrdering(&IRB, SI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicStore[Idx], Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ FunctionCallee F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (!F)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr, DL);
+ if (Idx < 0)
+ return false;
+ const unsigned ByteSize = 1U << Idx;
+ const unsigned BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *CmpOperand =
+ IRB.CreateBitOrPointerCast(CASI->getCompareOperand(), Ty);
+ Value *NewOperand =
+ IRB.CreateBitOrPointerCast(CASI->getNewValOperand(), Ty);
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ CmpOperand,
+ NewOperand,
+ createOrdering(&IRB, CASI->getSuccessOrdering()),
+ createOrdering(&IRB, CASI->getFailureOrdering())};
+ CallInst *C = IRB.CreateCall(TsanAtomicCAS[Idx], Args);
+ Value *Success = IRB.CreateICmpEQ(C, CmpOperand);
+ Value *OldVal = C;
+ Type *OrigOldValTy = CASI->getNewValOperand()->getType();
+ if (Ty != OrigOldValTy) {
+ // The value is a pointer, so we need to cast the return value.
+ OldVal = IRB.CreateIntToPtr(C, OrigOldValTy);
+ }
+
+ Value *Res =
+ IRB.CreateInsertValue(UndefValue::get(CASI->getType()), OldVal, 0);
+ Res = IRB.CreateInsertValue(Res, Success, 1);
+
+ I->replaceAllUsesWith(Res);
+ I->eraseFromParent();
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ FunctionCallee F = FI->getSyncScopeID() == SyncScope::SingleThread
+ ? TsanAtomicSignalFence
+ : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, Args);
+ ReplaceInstWithInst(I, C);
+ }
+ return true;
+}
+
+int ThreadSanitizer::getMemoryAccessFuncIndex(Value *Addr,
+ const DataLayout &DL) {
+ Type *OrigPtrTy = Addr->getType();
+ Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType();
+ assert(OrigTy->isSized());
+ uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
+ if (TypeSize != 8 && TypeSize != 16 &&
+ TypeSize != 32 && TypeSize != 64 && TypeSize != 128) {
+ NumAccessesWithBadSize++;
+ // Ignore all unusual sizes.
+ return -1;
+ }
+ size_t Idx = countTrailingZeros(TypeSize / 8);
+ assert(Idx < kNumberOfAccessSizes);
+ return Idx;
+}
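
The ThreadSanitizer hunk above routes every atomic access to a size-specific runtime callback: getMemoryAccessFuncIndex turns the access width into an index (log2 of the byte size) and rejects unusual widths, while createOrdering maps LLVM's AtomicOrdering onto the integer memory-order constants the __tsan_atomic* entry points expect. Below is a minimal, self-contained sketch of that size bucketing only; accessSizeIndex is an invented stand-in for the real member function, not part of the pass.

#include <cstdio>

// Invented stand-in for getMemoryAccessFuncIndex: map an access width in bits
// to the index of the size-specific __tsan_* callback (0 -> 1 byte, ...,
// 4 -> 16 bytes), or -1 when the runtime has no entry point for that width.
static int accessSizeIndex(unsigned TypeSizeInBits) {
  switch (TypeSizeInBits) {
  case 8:   return 0;
  case 16:  return 1;
  case 32:  return 2;
  case 64:  return 3;
  case 128: return 4;
  default:  return -1; // unusual size: the pass skips instrumentation
  }
}

int main() {
  // An i32 atomic load would be routed to the index-2 (__tsan_atomic32_*) slot.
  std::printf("i32 -> %d\n", accessSizeIndex(32));
  // A 24-bit access has no runtime counterpart and is left uninstrumented.
  std::printf("i24 -> %d\n", accessSizeIndex(24));
  return 0;
}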
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
index d7d10fb5d5..fb6216bb21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
@@ -1,80 +1,80 @@
-//===- ValueProfileCollector.cpp - determine what to value profile --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl
-//
-//===----------------------------------------------------------------------===//
-
-#include "ValueProfilePlugins.inc"
+//===- ValueProfileCollector.cpp - determine what to value profile --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfilePlugins.inc"
#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/InitializePasses.h"
-#include <cassert>
-
-using namespace llvm;
-
-namespace {
-
-/// A plugin-based class that takes an arbitrary number of Plugin types.
-/// Each plugin type must satisfy the following API:
-/// 1) the constructor must take a `Function &f`. Typically, the plugin would
-/// scan the function looking for candidates.
-/// 2) contain a member function with the following signature and name:
-/// void run(std::vector<CandidateInfo> &Candidates);
-/// such that the plugin would append its result into the vector parameter.
-///
-/// Plugins are defined in ValueProfilePlugins.inc
-template <class... Ts> class PluginChain;
-
-/// The type PluginChainFinal is the final chain of plugins that will be used by
-/// ValueProfileCollectorImpl.
-using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
-
-template <> class PluginChain<> {
-public:
- PluginChain(Function &F, TargetLibraryInfo &TLI) {}
- void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
-};
-
-template <class PluginT, class... Ts>
-class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
- PluginT Plugin;
- using Base = PluginChain<Ts...>;
-
-public:
- PluginChain(Function &F, TargetLibraryInfo &TLI)
- : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
-
- void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
- if (K == PluginT::Kind)
- Plugin.run(Candidates);
- Base::get(K, Candidates);
- }
-};
-
-} // end anonymous namespace
-
-/// ValueProfileCollectorImpl inherits the API of PluginChainFinal.
-class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
-public:
- using PluginChainFinal::PluginChainFinal;
-};
-
-ValueProfileCollector::ValueProfileCollector(Function &F,
- TargetLibraryInfo &TLI)
- : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
-
-ValueProfileCollector::~ValueProfileCollector() = default;
-
-std::vector<CandidateInfo>
-ValueProfileCollector::get(InstrProfValueKind Kind) const {
- std::vector<CandidateInfo> Result;
- PImpl->get(Kind, Result);
- return Result;
-}
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+/// A plugin-based class that takes an arbitrary number of Plugin types.
+/// Each plugin type must satisfy the following API:
+/// 1) the constructor must take a `Function &f`. Typically, the plugin would
+/// scan the function looking for candidates.
+/// 2) contain a member function with the following signature and name:
+/// void run(std::vector<CandidateInfo> &Candidates);
+/// such that the plugin would append its result into the vector parameter.
+///
+/// Plugins are defined in ValueProfilePlugins.inc
+template <class... Ts> class PluginChain;
+
+/// The type PluginChainFinal is the final chain of plugins that will be used by
+/// ValueProfileCollectorImpl.
+using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
+
+template <> class PluginChain<> {
+public:
+ PluginChain(Function &F, TargetLibraryInfo &TLI) {}
+ void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
+};
+
+template <class PluginT, class... Ts>
+class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
+ PluginT Plugin;
+ using Base = PluginChain<Ts...>;
+
+public:
+ PluginChain(Function &F, TargetLibraryInfo &TLI)
+ : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
+
+ void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
+ if (K == PluginT::Kind)
+ Plugin.run(Candidates);
+ Base::get(K, Candidates);
+ }
+};
+
+} // end anonymous namespace
+
+/// ValueProfileCollectorImpl inherits the API of PluginChainFinal.
+class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal {
+public:
+ using PluginChainFinal::PluginChainFinal;
+};
+
+ValueProfileCollector::ValueProfileCollector(Function &F,
+ TargetLibraryInfo &TLI)
+ : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
+
+ValueProfileCollector::~ValueProfileCollector() = default;
+
+std::vector<CandidateInfo>
+ValueProfileCollector::get(InstrProfValueKind Kind) const {
+ std::vector<CandidateInfo> Result;
+ PImpl->get(Kind, Result);
+ return Result;
+}
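
The PluginChain above is ordinary recursive variadic inheritance: each instantiation owns one plugin, runs it when the requested kind matches, and forwards the query to its base, so adding a plugin costs one extra template argument and no dispatch table. A stripped-down sketch of the same pattern follows, with invented toy plugins and integer kinds; the real plugins are constructed from a Function and TargetLibraryInfo instead.

#include <cstdio>
#include <vector>

enum Kind { KindA = 0, KindB = 1 };

struct PluginA {
  static constexpr Kind kind = KindA;
  void run(std::vector<const char *> &Out) { Out.push_back("from A"); }
};
struct PluginB {
  static constexpr Kind kind = KindB;
  void run(std::vector<const char *> &Out) { Out.push_back("from B"); }
};

template <class... Ts> struct Chain;

// Base case: an empty chain answers every query with nothing.
template <> struct Chain<> {
  void get(Kind, std::vector<const char *> &) {}
};

// Recursive case: run the owned plugin if the kind matches, then forward.
template <class P, class... Ts> struct Chain<P, Ts...> : Chain<Ts...> {
  P Plugin;
  void get(Kind K, std::vector<const char *> &Out) {
    if (K == P::kind)
      Plugin.run(Out);
    Chain<Ts...>::get(K, Out); // hand the query to the rest of the chain
  }
};

int main() {
  Chain<PluginA, PluginB> C;
  std::vector<const char *> Out;
  C.get(KindB, Out); // only PluginB contributes
  for (const char *S : Out)
    std::printf("%s\n", S);
}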
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
index 40f5006007..584a60ab45 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -1,83 +1,83 @@
-//===- ValueProfileCollector.h - determine what to value profile ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a utility class, ValueProfileCollector, that is used to
-// determine what kind of llvm::Value's are worth value-profiling, at which
-// point in the program, and which instruction holds the Value Profile metadata.
-// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use]
-// passes.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
-#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
-
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/ProfileData/InstrProf.h"
+//===- ValueProfileCollector.h - determine what to value profile ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a utility class, ValueProfileCollector, that is used to
+// determine what kind of llvm::Value's are worth value-profiling, at which
+// point in the program, and which instruction holds the Value Profile metadata.
+// Currently, the only users of this utility are the PGOInstrumentation[Gen|Use]
+// passes.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/ProfileData/InstrProf.h"
#include <memory>
#include <vector>
-
-namespace llvm {
-
+
+namespace llvm {
+
class Function;
class Instruction;
class Value;
-/// Utility analysis that determines what values are worth profiling.
-/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
-/// populate the Candidates vector.
-///
-/// Value profiling an expression means to track the values that this expression
-/// takes at runtime and the frequency of each value.
-/// It is important to distinguish between two sets of value profiles for a
-/// particular expression:
-/// 1) The set of values at the point of evaluation.
-/// 2) The set of values at the point of use.
-/// In some cases, the two sets are identical, but it's not unusual for the two
-/// to differ.
-///
-/// To elaborate more, consider this C code, and focus on the expression `nn`:
-/// void foo(int nn, bool b) {
-/// if (b) memcpy(x, y, nn);
-/// }
-/// The point of evaluation can be as early as the start of the function, and
-/// let's say the value profile for `nn` is:
-/// total=100; (value,freq) set = {(8,10), (32,50)}
-/// The point of use is right before we call memcpy, and since we execute the
-/// memcpy conditionally, the value profile of `nn` can be:
-/// total=15; (value,freq) set = {(8,10), (4,5)}
-///
-/// For this reason, a plugin is responsible for computing the insertion point
-/// for each value to be profiled. The `CandidateInfo` structure encapsulates
-/// all the information needed for each value profile site.
-class ValueProfileCollector {
-public:
- struct CandidateInfo {
- Value *V; // The value to profile.
- Instruction *InsertPt; // Insert the VP lib call before this instr.
- Instruction *AnnotatedInst; // Where metadata is attached.
- };
-
- ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
- ValueProfileCollector(ValueProfileCollector &&) = delete;
- ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
-
- ValueProfileCollector(const ValueProfileCollector &) = delete;
- ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
- ~ValueProfileCollector();
-
- /// returns a list of value profiling candidates of the given kind
- std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
-
-private:
- class ValueProfileCollectorImpl;
- std::unique_ptr<ValueProfileCollectorImpl> PImpl;
-};
-
-} // namespace llvm
-
-#endif
+/// Utility analysis that determines what values are worth profiling.
+/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to
+/// populate the Candidates vector.
+///
+/// Value profiling an expression means to track the values that this expression
+/// takes at runtime and the frequency of each value.
+/// It is important to distinguish between two sets of value profiles for a
+/// particular expression:
+/// 1) The set of values at the point of evaluation.
+/// 2) The set of values at the point of use.
+/// In some cases, the two sets are identical, but it's not unusual for the two
+/// to differ.
+///
+/// To elaborate more, consider this C code, and focus on the expression `nn`:
+/// void foo(int nn, bool b) {
+/// if (b) memcpy(x, y, nn);
+/// }
+/// The point of evaluation can be as early as the start of the function, and
+/// let's say the value profile for `nn` is:
+/// total=100; (value,freq) set = {(8,10), (32,50)}
+/// The point of use is right before we call memcpy, and since we execute the
+/// memcpy conditionally, the value profile of `nn` can be:
+/// total=15; (value,freq) set = {(8,10), (4,5)}
+///
+/// For this reason, a plugin is responsible for computing the insertion point
+/// for each value to be profiled. The `CandidateInfo` structure encapsulates
+/// all the information needed for each value profile site.
+class ValueProfileCollector {
+public:
+ struct CandidateInfo {
+ Value *V; // The value to profile.
+ Instruction *InsertPt; // Insert the VP lib call before this instr.
+ Instruction *AnnotatedInst; // Where metadata is attached.
+ };
+
+ ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
+ ValueProfileCollector(ValueProfileCollector &&) = delete;
+ ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
+
+ ValueProfileCollector(const ValueProfileCollector &) = delete;
+ ValueProfileCollector &operator=(const ValueProfileCollector &) = delete;
+ ~ValueProfileCollector();
+
+ /// returns a list of value profiling candidates of the given kind
+ std::vector<CandidateInfo> get(InstrProfValueKind Kind) const;
+
+private:
+ class ValueProfileCollectorImpl;
+ std::unique_ptr<ValueProfileCollectorImpl> PImpl;
+};
+
+} // namespace llvm
+
+#endif
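
The header's `nn` example is the crux: a candidate's insertion point is the point of use, not the point of evaluation, so the collected distribution only reflects executions that actually reach the guarded call. The following self-contained illustration shows where that instrumentation conceptually lands; profileValue is a made-up stand-in, whereas the real pass inserts a value-profiling intrinsic at CandidateInfo::InsertPt rather than a direct call like this.

#include <cstdio>
#include <cstring>

// Made-up stand-in for the value-profiling runtime hook.
static void profileValue(long V) { std::printf("profiled nn=%ld\n", V); }

void foo(char *x, const char *y, int nn, bool b) {
  if (b) {
    profileValue(nn); // CandidateInfo::InsertPt: right before the memcpy
    std::memcpy(x, y, static_cast<size_t>(nn)); // the annotated instruction
  }
}

int main() {
  char dst[8], src[8] = "abcdefg";
  foo(dst, src, 4, true);  // recorded: nn = 4
  foo(dst, src, 7, false); // not recorded: the point of use never executes
}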
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index 0277494895..8d0cf5843e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -1,97 +1,97 @@
-//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
-// Each plugin is responsible for collecting Value Profiling candidates for a
-// particular optimization.
-// Each plugin must satisfy the interface described in ValueProfileCollector.cpp
-//
-//===----------------------------------------------------------------------===//
-
-#include "ValueProfileCollector.h"
-#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/IR/InstVisitor.h"
-
-using namespace llvm;
-using CandidateInfo = ValueProfileCollector::CandidateInfo;
-
-extern cl::opt<bool> MemOPOptMemcmpBcmp;
-
-///--------------------------- MemIntrinsicPlugin ------------------------------
-class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
- Function &F;
- TargetLibraryInfo &TLI;
- std::vector<CandidateInfo> *Candidates;
-
-public:
- static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
-
- MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
- : F(Fn), TLI(TLI), Candidates(nullptr) {}
-
- void run(std::vector<CandidateInfo> &Cs) {
- Candidates = &Cs;
- visit(F);
- Candidates = nullptr;
- }
- void visitMemIntrinsic(MemIntrinsic &MI) {
- Value *Length = MI.getLength();
- // Not instrument constant length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
-
- Instruction *InsertPt = &MI;
- Instruction *AnnotatedInst = &MI;
- Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
- }
- void visitCallInst(CallInst &CI) {
- if (!MemOPOptMemcmpBcmp)
- return;
- auto *F = CI.getCalledFunction();
- if (!F)
- return;
- LibFunc Func;
- if (TLI.getLibFunc(CI, Func) &&
- (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
- Value *Length = CI.getArgOperand(2);
- // Not instrument constant length calls.
- if (dyn_cast<ConstantInt>(Length))
- return;
- Instruction *InsertPt = &CI;
- Instruction *AnnotatedInst = &CI;
- Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
- }
- }
-};
-
-///------------------------ IndirectCallPromotionPlugin ------------------------
-class IndirectCallPromotionPlugin {
- Function &F;
-
-public:
- static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
-
- IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
-
- void run(std::vector<CandidateInfo> &Candidates) {
- std::vector<CallBase *> Result = findIndirectCalls(F);
- for (Instruction *I : Result) {
- Value *Callee = cast<CallBase>(I)->getCalledOperand();
- Instruction *InsertPt = I;
- Instruction *AnnotatedInst = I;
- Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
- }
- }
-};
-
-///----------------------- Registration of the plugins -------------------------
-/// For now, registering a plugin with the ValueProfileCollector is done by
-/// adding the plugin type to the VP_PLUGIN_LIST macro.
-#define VP_PLUGIN_LIST \
- MemIntrinsicPlugin, \
- IndirectCallPromotionPlugin
+//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a set of plugin classes used in ValueProfileCollectorImpl.
+// Each plugin is responsible for collecting Value Profiling candidates for a
+// particular optimization.
+// Each plugin must satisfy the interface described in ValueProfileCollector.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "ValueProfileCollector.h"
+#include "llvm/Analysis/IndirectCallVisitor.h"
+#include "llvm/IR/InstVisitor.h"
+
+using namespace llvm;
+using CandidateInfo = ValueProfileCollector::CandidateInfo;
+
+extern cl::opt<bool> MemOPOptMemcmpBcmp;
+
+///--------------------------- MemIntrinsicPlugin ------------------------------
+class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
+ Function &F;
+ TargetLibraryInfo &TLI;
+ std::vector<CandidateInfo> *Candidates;
+
+public:
+ static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
+
+ MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
+ : F(Fn), TLI(TLI), Candidates(nullptr) {}
+
+ void run(std::vector<CandidateInfo> &Cs) {
+ Candidates = &Cs;
+ visit(F);
+ Candidates = nullptr;
+ }
+ void visitMemIntrinsic(MemIntrinsic &MI) {
+ Value *Length = MI.getLength();
+ // Do not instrument constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+
+ Instruction *InsertPt = &MI;
+ Instruction *AnnotatedInst = &MI;
+ Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+ }
+ void visitCallInst(CallInst &CI) {
+ if (!MemOPOptMemcmpBcmp)
+ return;
+ auto *F = CI.getCalledFunction();
+ if (!F)
+ return;
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
+ Value *Length = CI.getArgOperand(2);
+ // Do not instrument constant-length calls.
+ if (dyn_cast<ConstantInt>(Length))
+ return;
+ Instruction *InsertPt = &CI;
+ Instruction *AnnotatedInst = &CI;
+ Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+ }
+ }
+};
+
+///------------------------ IndirectCallPromotionPlugin ------------------------
+class IndirectCallPromotionPlugin {
+ Function &F;
+
+public:
+ static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
+
+ IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
+
+ void run(std::vector<CandidateInfo> &Candidates) {
+ std::vector<CallBase *> Result = findIndirectCalls(F);
+ for (Instruction *I : Result) {
+ Value *Callee = cast<CallBase>(I)->getCalledOperand();
+ Instruction *InsertPt = I;
+ Instruction *AnnotatedInst = I;
+ Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
+ }
+ }
+};
+
+///----------------------- Registration of the plugins -------------------------
+/// For now, registering a plugin with the ValueProfileCollector is done by
+/// adding the plugin type to the VP_PLUGIN_LIST macro.
+#define VP_PLUGIN_LIST \
+ MemIntrinsicPlugin, \
+ IndirectCallPromotionPlugin
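
Registration in the .inc above is just a macro: VP_PLUGIN_LIST enumerates the plugin types once, and ValueProfileCollector.cpp expands it into the PluginChain instantiation. The same trick in miniature, with invented plugin names and a C++17 fold expression standing in for the recursive chain shown earlier:

#include <cstdio>
#include <vector>

struct CountCalls { void run(std::vector<int> &Out) { Out.push_back(1); } };
struct CountLoads { void run(std::vector<int> &Out) { Out.push_back(2); } };

template <class... Ts> struct Chain {
  // Run every plugin in list order (comma fold over the parameter pack).
  void runAll(std::vector<int> &Out) { (Ts().run(Out), ...); }
};

// One macro lists the plugins; one alias instantiates the whole chain from it.
#define TOY_PLUGIN_LIST CountCalls, CountLoads
using ToyChainFinal = Chain<TOY_PLUGIN_LIST>;

int main() {
  std::vector<int> Out;
  ToyChainFinal().runAll(Out);
  std::printf("%zu plugins ran\n", Out.size());
}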
diff --git a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
index 10b7425404..39dab1eb7d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Instrumentation/ya.make
@@ -1,12 +1,12 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(
Apache-2.0 WITH LLVM-exception AND
NCSA
@@ -14,7 +14,7 @@ LICENSE(
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -23,36 +23,36 @@ PEERDIR(
contrib/libs/llvm12/lib/ProfileData
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Instrumentation
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AddressSanitizer.cpp
- BoundsChecking.cpp
- CGProfile.cpp
- ControlHeightReduction.cpp
- DataFlowSanitizer.cpp
- GCOVProfiling.cpp
- HWAddressSanitizer.cpp
- IndirectCallPromotion.cpp
- InstrOrderFile.cpp
- InstrProfiling.cpp
- Instrumentation.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AddressSanitizer.cpp
+ BoundsChecking.cpp
+ CGProfile.cpp
+ ControlHeightReduction.cpp
+ DataFlowSanitizer.cpp
+ GCOVProfiling.cpp
+ HWAddressSanitizer.cpp
+ IndirectCallPromotion.cpp
+ InstrOrderFile.cpp
+ InstrProfiling.cpp
+ Instrumentation.cpp
MemProfiler.cpp
- MemorySanitizer.cpp
- PGOInstrumentation.cpp
- PGOMemOPSizeOpt.cpp
- PoisonChecking.cpp
- SanitizerCoverage.cpp
- ThreadSanitizer.cpp
- ValueProfileCollector.cpp
-)
-
-END()
+ MemorySanitizer.cpp
+ PGOInstrumentation.cpp
+ PGOMemOPSizeOpt.cpp
+ PoisonChecking.cpp
+ SanitizerCoverage.cpp
+ ThreadSanitizer.cpp
+ ValueProfileCollector.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index c02799f3b2..258dc92408 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -1,142 +1,142 @@
-//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization ----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains a class ARCRuntimeEntryPoints for use in
-/// creating/managing references to entry points to the arc objective c runtime.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
-
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/ErrorHandling.h"
-#include <cassert>
-
-namespace llvm {
-
-class Function;
-class Module;
-
-namespace objcarc {
-
-enum class ARCRuntimeEntryPointKind {
- AutoreleaseRV,
- Release,
- Retain,
- RetainBlock,
- Autorelease,
- StoreStrong,
- RetainRV,
- RetainAutorelease,
- RetainAutoreleaseRV,
-};
-
-/// Declarations for ObjC runtime functions and constants. These are initialized
-/// lazily to avoid cluttering up the Module with unused declarations.
-class ARCRuntimeEntryPoints {
-public:
- ARCRuntimeEntryPoints() = default;
-
- void init(Module *M) {
- TheModule = M;
- AutoreleaseRV = nullptr;
- Release = nullptr;
- Retain = nullptr;
- RetainBlock = nullptr;
- Autorelease = nullptr;
- StoreStrong = nullptr;
- RetainRV = nullptr;
- RetainAutorelease = nullptr;
- RetainAutoreleaseRV = nullptr;
- }
-
- Function *get(ARCRuntimeEntryPointKind kind) {
- assert(TheModule != nullptr && "Not initialized.");
-
- switch (kind) {
- case ARCRuntimeEntryPointKind::AutoreleaseRV:
- return getIntrinsicEntryPoint(AutoreleaseRV,
- Intrinsic::objc_autoreleaseReturnValue);
- case ARCRuntimeEntryPointKind::Release:
- return getIntrinsicEntryPoint(Release, Intrinsic::objc_release);
- case ARCRuntimeEntryPointKind::Retain:
- return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain);
- case ARCRuntimeEntryPointKind::RetainBlock:
- return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock);
- case ARCRuntimeEntryPointKind::Autorelease:
- return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease);
- case ARCRuntimeEntryPointKind::StoreStrong:
- return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong);
- case ARCRuntimeEntryPointKind::RetainRV:
- return getIntrinsicEntryPoint(RetainRV,
- Intrinsic::objc_retainAutoreleasedReturnValue);
- case ARCRuntimeEntryPointKind::RetainAutorelease:
- return getIntrinsicEntryPoint(RetainAutorelease,
- Intrinsic::objc_retainAutorelease);
- case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
- return getIntrinsicEntryPoint(RetainAutoreleaseRV,
- Intrinsic::objc_retainAutoreleaseReturnValue);
- }
-
- llvm_unreachable("Switch should be a covered switch.");
- }
-
-private:
- /// Cached reference to the module which we will insert declarations into.
- Module *TheModule = nullptr;
-
- /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
- Function *AutoreleaseRV = nullptr;
-
- /// Declaration for ObjC runtime function objc_release.
- Function *Release = nullptr;
-
- /// Declaration for ObjC runtime function objc_retain.
- Function *Retain = nullptr;
-
- /// Declaration for ObjC runtime function objc_retainBlock.
- Function *RetainBlock = nullptr;
-
- /// Declaration for ObjC runtime function objc_autorelease.
- Function *Autorelease = nullptr;
-
- /// Declaration for objc_storeStrong().
- Function *StoreStrong = nullptr;
-
- /// Declaration for objc_retainAutoreleasedReturnValue().
- Function *RetainRV = nullptr;
-
- /// Declaration for objc_retainAutorelease().
- Function *RetainAutorelease = nullptr;
-
- /// Declaration for objc_retainAutoreleaseReturnValue().
- Function *RetainAutoreleaseRV = nullptr;
-
- Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
- if (Decl)
- return Decl;
-
- return Decl = Intrinsic::getDeclaration(TheModule, IntID);
- }
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+//===- ARCRuntimeEntryPoints.h - ObjC ARC Optimization ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains a class ARCRuntimeEntryPoints for use in
+/// creating/managing references to entry points to the ARC Objective-C runtime.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
+
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+
+namespace llvm {
+
+class Function;
+class Module;
+
+namespace objcarc {
+
+enum class ARCRuntimeEntryPointKind {
+ AutoreleaseRV,
+ Release,
+ Retain,
+ RetainBlock,
+ Autorelease,
+ StoreStrong,
+ RetainRV,
+ RetainAutorelease,
+ RetainAutoreleaseRV,
+};
+
+/// Declarations for ObjC runtime functions and constants. These are initialized
+/// lazily to avoid cluttering up the Module with unused declarations.
+class ARCRuntimeEntryPoints {
+public:
+ ARCRuntimeEntryPoints() = default;
+
+ void init(Module *M) {
+ TheModule = M;
+ AutoreleaseRV = nullptr;
+ Release = nullptr;
+ Retain = nullptr;
+ RetainBlock = nullptr;
+ Autorelease = nullptr;
+ StoreStrong = nullptr;
+ RetainRV = nullptr;
+ RetainAutorelease = nullptr;
+ RetainAutoreleaseRV = nullptr;
+ }
+
+ Function *get(ARCRuntimeEntryPointKind kind) {
+ assert(TheModule != nullptr && "Not initialized.");
+
+ switch (kind) {
+ case ARCRuntimeEntryPointKind::AutoreleaseRV:
+ return getIntrinsicEntryPoint(AutoreleaseRV,
+ Intrinsic::objc_autoreleaseReturnValue);
+ case ARCRuntimeEntryPointKind::Release:
+ return getIntrinsicEntryPoint(Release, Intrinsic::objc_release);
+ case ARCRuntimeEntryPointKind::Retain:
+ return getIntrinsicEntryPoint(Retain, Intrinsic::objc_retain);
+ case ARCRuntimeEntryPointKind::RetainBlock:
+ return getIntrinsicEntryPoint(RetainBlock, Intrinsic::objc_retainBlock);
+ case ARCRuntimeEntryPointKind::Autorelease:
+ return getIntrinsicEntryPoint(Autorelease, Intrinsic::objc_autorelease);
+ case ARCRuntimeEntryPointKind::StoreStrong:
+ return getIntrinsicEntryPoint(StoreStrong, Intrinsic::objc_storeStrong);
+ case ARCRuntimeEntryPointKind::RetainRV:
+ return getIntrinsicEntryPoint(RetainRV,
+ Intrinsic::objc_retainAutoreleasedReturnValue);
+ case ARCRuntimeEntryPointKind::RetainAutorelease:
+ return getIntrinsicEntryPoint(RetainAutorelease,
+ Intrinsic::objc_retainAutorelease);
+ case ARCRuntimeEntryPointKind::RetainAutoreleaseRV:
+ return getIntrinsicEntryPoint(RetainAutoreleaseRV,
+ Intrinsic::objc_retainAutoreleaseReturnValue);
+ }
+
+ llvm_unreachable("Switch should be a covered switch.");
+ }
+
+private:
+ /// Cached reference to the module which we will insert declarations into.
+ Module *TheModule = nullptr;
+
+ /// Declaration for ObjC runtime function objc_autoreleaseReturnValue.
+ Function *AutoreleaseRV = nullptr;
+
+ /// Declaration for ObjC runtime function objc_release.
+ Function *Release = nullptr;
+
+ /// Declaration for ObjC runtime function objc_retain.
+ Function *Retain = nullptr;
+
+ /// Declaration for ObjC runtime function objc_retainBlock.
+ Function *RetainBlock = nullptr;
+
+ /// Declaration for ObjC runtime function objc_autorelease.
+ Function *Autorelease = nullptr;
+
+ /// Declaration for objc_storeStrong().
+ Function *StoreStrong = nullptr;
+
+ /// Declaration for objc_retainAutoreleasedReturnValue().
+ Function *RetainRV = nullptr;
+
+ /// Declaration for objc_retainAutorelease().
+ Function *RetainAutorelease = nullptr;
+
+ /// Declaration for objc_retainAutoreleaseReturnValue().
+ Function *RetainAutoreleaseRV = nullptr;
+
+ Function *getIntrinsicEntryPoint(Function *&Decl, Intrinsic::ID IntID) {
+ if (Decl)
+ return Decl;
+
+ return Decl = Intrinsic::getDeclaration(TheModule, IntID);
+ }
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
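
ARCRuntimeEntryPoints is a small memoization layer: each runtime function is declared at most once per Module, on first request, and the cached declaration is handed back on every later lookup. The same lazy-caching shape in a self-contained sketch, where strings stand in for the intrinsic declarations and the class and names are illustrative only:

#include <cstdio>
#include <cstdlib>
#include <map>
#include <string>

enum class EntryPointKind { Retain, Release };

class EntryPointCache {
  std::map<EntryPointKind, std::string> Cache;

  const std::string &getOrCreate(EntryPointKind K, const char *Name) {
    auto It = Cache.find(K);
    if (It != Cache.end())
      return It->second;                 // already declared: reuse it
    std::printf("declaring %s\n", Name); // happens once per kind
    return Cache.emplace(K, Name).first->second;
  }

public:
  const std::string &get(EntryPointKind K) {
    switch (K) {
    case EntryPointKind::Retain:  return getOrCreate(K, "objc_retain");
    case EntryPointKind::Release: return getOrCreate(K, "objc_release");
    }
    std::abort(); // the switch above is covered
  }
};

int main() {
  EntryPointCache EP;
  EP.get(EntryPointKind::Retain);
  EP.get(EntryPointKind::Retain); // cache hit: nothing is re-declared
  EP.get(EntryPointKind::Release);
}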
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
index 9b144aaac6..2fa07cfb32 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/BlotMapVector.h
@@ -1,117 +1,117 @@
-//===- BlotMapVector.h - A MapVector with the blot operation ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
-
-#include "llvm/ADT/DenseMap.h"
-#include <cassert>
-#include <cstddef>
-#include <utility>
-#include <vector>
-
-namespace llvm {
-
-/// An associative container with fast insertion-order (deterministic)
-/// iteration over its elements. Plus the special blot operation.
-template <class KeyT, class ValueT> class BlotMapVector {
- /// Map keys to indices in Vector.
- using MapTy = DenseMap<KeyT, size_t>;
- MapTy Map;
-
- /// Keys and values.
- using VectorTy = std::vector<std::pair<KeyT, ValueT>>;
- VectorTy Vector;
-
-public:
-#ifdef EXPENSIVE_CHECKS
- ~BlotMapVector() {
- assert(Vector.size() >= Map.size()); // May differ due to blotting.
- for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
- ++I) {
- assert(I->second < Vector.size());
- assert(Vector[I->second].first == I->first);
- }
- for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
- I != E; ++I)
- assert(!I->first || (Map.count(I->first) &&
- Map[I->first] == size_t(I - Vector.begin())));
- }
-#endif
-
- using iterator = typename VectorTy::iterator;
- using const_iterator = typename VectorTy::const_iterator;
-
- iterator begin() { return Vector.begin(); }
- iterator end() { return Vector.end(); }
- const_iterator begin() const { return Vector.begin(); }
- const_iterator end() const { return Vector.end(); }
-
- ValueT &operator[](const KeyT &Arg) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(Arg, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(std::make_pair(Arg, ValueT()));
- return Vector[Num].second;
- }
- return Vector[Pair.first->second].second;
- }
-
- std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
- std::pair<typename MapTy::iterator, bool> Pair =
- Map.insert(std::make_pair(InsertPair.first, size_t(0)));
- if (Pair.second) {
- size_t Num = Vector.size();
- Pair.first->second = Num;
- Vector.push_back(InsertPair);
- return std::make_pair(Vector.begin() + Num, true);
- }
- return std::make_pair(Vector.begin() + Pair.first->second, false);
- }
-
- iterator find(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end())
- return Vector.end();
- return Vector.begin() + It->second;
- }
-
- const_iterator find(const KeyT &Key) const {
- typename MapTy::const_iterator It = Map.find(Key);
- if (It == Map.end())
- return Vector.end();
- return Vector.begin() + It->second;
- }
-
- /// This is similar to erase, but instead of removing the element from the
- /// vector, it just zeros out the key in the vector. This leaves iterators
- /// intact, but clients must be prepared for zeroed-out keys when iterating.
- void blot(const KeyT &Key) {
- typename MapTy::iterator It = Map.find(Key);
- if (It == Map.end())
- return;
- Vector[It->second].first = KeyT();
- Map.erase(It);
- }
-
- void clear() {
- Map.clear();
- Vector.clear();
- }
-
- bool empty() const {
- assert(Map.empty() == Vector.empty());
- return Map.empty();
- }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+//===- BlotMapVector.h - A MapVector with the blot operation ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+/// An associative container with fast insertion-order (deterministic)
+/// iteration over its elements. Plus the special blot operation.
+template <class KeyT, class ValueT> class BlotMapVector {
+ /// Map keys to indices in Vector.
+ using MapTy = DenseMap<KeyT, size_t>;
+ MapTy Map;
+
+ /// Keys and values.
+ using VectorTy = std::vector<std::pair<KeyT, ValueT>>;
+ VectorTy Vector;
+
+public:
+#ifdef EXPENSIVE_CHECKS
+ ~BlotMapVector() {
+ assert(Vector.size() >= Map.size()); // May differ due to blotting.
+ for (typename MapTy::const_iterator I = Map.begin(), E = Map.end(); I != E;
+ ++I) {
+ assert(I->second < Vector.size());
+ assert(Vector[I->second].first == I->first);
+ }
+ for (typename VectorTy::const_iterator I = Vector.begin(), E = Vector.end();
+ I != E; ++I)
+ assert(!I->first || (Map.count(I->first) &&
+ Map[I->first] == size_t(I - Vector.begin())));
+ }
+#endif
+
+ using iterator = typename VectorTy::iterator;
+ using const_iterator = typename VectorTy::const_iterator;
+
+ iterator begin() { return Vector.begin(); }
+ iterator end() { return Vector.end(); }
+ const_iterator begin() const { return Vector.begin(); }
+ const_iterator end() const { return Vector.end(); }
+
+ ValueT &operator[](const KeyT &Arg) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(Arg, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(std::make_pair(Arg, ValueT()));
+ return Vector[Num].second;
+ }
+ return Vector[Pair.first->second].second;
+ }
+
+ std::pair<iterator, bool> insert(const std::pair<KeyT, ValueT> &InsertPair) {
+ std::pair<typename MapTy::iterator, bool> Pair =
+ Map.insert(std::make_pair(InsertPair.first, size_t(0)));
+ if (Pair.second) {
+ size_t Num = Vector.size();
+ Pair.first->second = Num;
+ Vector.push_back(InsertPair);
+ return std::make_pair(Vector.begin() + Num, true);
+ }
+ return std::make_pair(Vector.begin() + Pair.first->second, false);
+ }
+
+ iterator find(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ const_iterator find(const KeyT &Key) const {
+ typename MapTy::const_iterator It = Map.find(Key);
+ if (It == Map.end())
+ return Vector.end();
+ return Vector.begin() + It->second;
+ }
+
+ /// This is similar to erase, but instead of removing the element from the
+ /// vector, it just zeros out the key in the vector. This leaves iterators
+ /// intact, but clients must be prepared for zeroed-out keys when iterating.
+ void blot(const KeyT &Key) {
+ typename MapTy::iterator It = Map.find(Key);
+ if (It == Map.end())
+ return;
+ Vector[It->second].first = KeyT();
+ Map.erase(It);
+ }
+
+ void clear() {
+ Map.clear();
+ Vector.clear();
+ }
+
+ bool empty() const {
+ assert(Map.empty() == Vector.empty());
+ return Map.empty();
+ }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_BLOTMAPVECTOR_H
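
What sets BlotMapVector apart from a plain map-plus-vector is blot(): the entry's slot in the vector is kept, so iterators and the positions of later entries stay valid, while the key is reset to a default value and the map entry is dropped. That is why clients iterating the container must tolerate zeroed-out keys. A cut-down illustration of the same behaviour, using std::unordered_map in place of DenseMap and ints as values:

#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

struct MiniBlotMap {
  std::unordered_map<std::string, size_t> Map; // key -> index in Vector
  std::vector<std::pair<std::string, int>> Vector;

  int &operator[](const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end()) {
      Map[K] = Vector.size();   // record the insertion-order position
      Vector.emplace_back(K, 0);
      return Vector.back().second;
    }
    return Vector[It->second].second;
  }

  void blot(const std::string &K) {
    auto It = Map.find(K);
    if (It == Map.end())
      return;
    Vector[It->second].first.clear(); // zero the key, keep the slot
    Map.erase(It);
  }
};

int main() {
  MiniBlotMap M;
  M["retain"] = 1;
  M["release"] = 2;
  M.blot("retain");
  for (auto &KV : M.Vector) // the blotted slot remains, with an emptied key
    std::printf("[%s] = %d\n", KV.first.c_str(), KV.second);
}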
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index c621c56c05..7f7f2dc89b 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -1,233 +1,233 @@
-//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines special dependency analysis routines used in Objective C
-/// ARC Optimizations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
+//===- DependencyAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines special dependency analysis routines used in Objective-C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/CFG.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-dependency"
-
-/// Test whether the given instruction can result in a reference count
-/// modification (positive or negative) for the pointer's object.
-bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- switch (Class) {
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
- case ARCInstKind::IntrinsicUser:
- case ARCInstKind::User:
- // These operations never directly modify a reference count.
- return false;
- default: break;
- }
-
- const auto *Call = cast<CallBase>(Inst);
-
- // See if AliasAnalysis can help us with the call.
- FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(Call);
- if (AliasAnalysis::onlyReadsMemory(MRB))
- return false;
- if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
- for (const Value *Op : Call->args()) {
+#include "llvm/IR/CFG.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-dependency"
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ switch (Class) {
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // These operations never directly modify a reference count.
+ return false;
+ default: break;
+ }
+
+ const auto *Call = cast<CallBase>(Inst);
+
+ // See if AliasAnalysis can help us with the call.
+ FunctionModRefBehavior MRB = PA.getAA()->getModRefBehavior(Call);
+ if (AliasAnalysis::onlyReadsMemory(MRB))
+ return false;
+ if (AliasAnalysis::onlyAccessesArgPointees(MRB)) {
+ for (const Value *Op : Call->args()) {
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- }
-
- // Assume the worst.
- return true;
-}
-
-bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // First perform a quick check if Class can not touch ref counts.
- if (!CanDecrementRefCount(Class))
- return false;
-
- // Otherwise, just use CanAlterRefCount for now.
- return CanAlterRefCount(Inst, Ptr, PA, Class);
-}
-
-/// Test whether the given instruction can "use" the given pointer's object in a
-/// way that requires the reference count to be positive.
-bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class) {
- // ARCInstKind::Call operations (as opposed to
- // ARCInstKind::CallOrUser) never "use" objc pointers.
- if (Class == ARCInstKind::Call)
- return false;
-
- // Consider various instructions which may have pointer arguments which are
- // not "uses".
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
- // Comparing a pointer with null, or any other constant, isn't really a use,
- // because we don't care what the pointer points to, or about the values
- // of any other dynamic reference-counted pointers.
- if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
- return false;
- } else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
- // For calls, just check the arguments (and not the callee operand).
- for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
- const Value *Op = *OI;
+ return true;
+ }
+ return false;
+ }
+
+ // Assume the worst.
+ return true;
+}
+
+bool llvm::objcarc::CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // First perform a quick check if Class can not touch ref counts.
+ if (!CanDecrementRefCount(Class))
+ return false;
+
+ // Otherwise, just use CanAlterRefCount for now.
+ return CanAlterRefCount(Inst, Ptr, PA, Class);
+}
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class) {
+ // ARCInstKind::Call operations (as opposed to
+ // ARCInstKind::CallOrUser) never "use" objc pointers.
+ if (Class == ARCInstKind::Call)
+ return false;
+
+ // Consider various instructions which may have pointer arguments which are
+ // not "uses".
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+ // Comparing a pointer with null, or any other constant, isn't really a use,
+ // because we don't care what the pointer points to, or about the values
+ // of any other dynamic reference-counted pointers.
+ if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
+ return false;
+ } else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
+ // For calls, just check the arguments (and not the callee operand).
+ for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
+ const Value *Op = *OI;
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- // Special-case stores, because we don't care about the stored value, just
- // the store address.
+ return true;
+ }
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ // Special-case stores, because we don't care about the stored value, just
+ // the store address.
const Value *Op = GetUnderlyingObjCPtr(SI->getPointerOperand());
- // If we can't tell what the underlying object was, assume there is a
- // dependence.
+ // If we can't tell what the underlying object was, assume there is a
+ // dependence.
return IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Op, Ptr);
- }
-
- // Check each operand for a match.
- for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
- OI != OE; ++OI) {
- const Value *Op = *OI;
+ }
+
+ // Check each operand for a match.
+ for (User::const_op_iterator OI = Inst->op_begin(), OE = Inst->op_end();
+ OI != OE; ++OI) {
+ const Value *Op = *OI;
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) && PA.related(Ptr, Op))
- return true;
- }
- return false;
-}
-
-/// Test if there can be dependencies on Inst through Arg. This function only
-/// tests dependencies relevant for removing pairs of calls.
-bool
-llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
- const Value *Arg, ProvenanceAnalysis &PA) {
- // If we've reached the definition of Arg, stop.
- if (Inst == Arg)
- return true;
-
- switch (Flavor) {
- case NeedsPositiveRetainCount: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- return false;
- default:
- return CanUse(Inst, Arg, PA, Class);
- }
- }
-
- case AutoreleasePoolBoundary: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- // These mark the end and begin of an autorelease pool scope.
- return true;
- default:
- // Nothing else does this.
- return false;
- }
- }
-
- case CanChangeRetainCount: {
- ARCInstKind Class = GetARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively assume this can decrement any count.
- return true;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- return false;
- default:
- return CanAlterRefCount(Inst, Arg, PA, Class);
- }
- }
-
- case RetainAutoreleaseDep:
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::AutoreleasepoolPop:
- case ARCInstKind::AutoreleasepoolPush:
- // Don't merge an objc_autorelease with an objc_retain inside a different
- // autoreleasepool scope.
- return true;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetArgRCIdentityRoot(Inst) == Arg;
- default:
- // Nothing else matters for objc_retainAutorelease formation.
- return false;
- }
-
- case RetainAutoreleaseRVDep: {
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- // Check for a retain of the same pointer for merging.
- return GetArgRCIdentityRoot(Inst) == Arg;
- default:
- // Anything that can autorelease interrupts
- // retainAutoreleaseReturnValue formation.
- return CanInterruptRV(Class);
- }
- }
-
- case RetainRVDep:
- return CanInterruptRV(GetBasicARCInstKind(Inst));
- }
-
- llvm_unreachable("Invalid dependence flavor");
-}
-
-/// Walk up the CFG from StartPos (which is in StartBB) and find local and
-/// non-local dependencies on Arg.
-///
-/// TODO: Cache results?
+ return true;
+ }
+ return false;
+}
+
+/// Test if there can be dependencies on Inst through Arg. This function only
+/// tests dependencies relevant for removing pairs of calls.
+bool
+llvm::objcarc::Depends(DependenceKind Flavor, Instruction *Inst,
+ const Value *Arg, ProvenanceAnalysis &PA) {
+ // If we've reached the definition of Arg, stop.
+ if (Inst == Arg)
+ return true;
+
+ switch (Flavor) {
+ case NeedsPositiveRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanUse(Inst, Arg, PA, Class);
+ }
+ }
+
+ case AutoreleasePoolBoundary: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // These mark the end and begin of an autorelease pool scope.
+ return true;
+ default:
+ // Nothing else does this.
+ return false;
+ }
+ }
+
+ case CanChangeRetainCount: {
+ ARCInstKind Class = GetARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively assume this can decrement any count.
+ return true;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ return false;
+ default:
+ return CanAlterRefCount(Inst, Arg, PA, Class);
+ }
+ }
+
+ case RetainAutoreleaseDep:
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPop:
+ case ARCInstKind::AutoreleasepoolPush:
+ // Don't merge an objc_autorelease with an objc_retain inside a different
+ // autoreleasepool scope.
+ return true;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Nothing else matters for objc_retainAutorelease formation.
+ return false;
+ }
+
+ case RetainAutoreleaseRVDep: {
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ // Check for a retain of the same pointer for merging.
+ return GetArgRCIdentityRoot(Inst) == Arg;
+ default:
+ // Anything that can autorelease interrupts
+ // retainAutoreleaseReturnValue formation.
+ return CanInterruptRV(Class);
+ }
+ }
+
+ case RetainRVDep:
+ return CanInterruptRV(GetBasicARCInstKind(Inst));
+ }
+
+ llvm_unreachable("Invalid dependence flavor");
+}
+
+/// Walk up the CFG from StartPos (which is in StartBB) and find local and
+/// non-local dependencies on Arg.
+///
+/// TODO: Cache results?
static bool findDependencies(DependenceKind Flavor, const Value *Arg,
BasicBlock *StartBB, Instruction *StartInst,
SmallPtrSetImpl<Instruction *> &DependingInsts,
ProvenanceAnalysis &PA) {
- BasicBlock::iterator StartPos = StartInst->getIterator();
-
+ BasicBlock::iterator StartPos = StartInst->getIterator();
+
SmallPtrSet<const BasicBlock *, 4> Visited;
- SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
- Worklist.push_back(std::make_pair(StartBB, StartPos));
- do {
- std::pair<BasicBlock *, BasicBlock::iterator> Pair =
- Worklist.pop_back_val();
- BasicBlock *LocalStartBB = Pair.first;
- BasicBlock::iterator LocalStartPos = Pair.second;
- BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
- for (;;) {
- if (LocalStartPos == StartBBBegin) {
- pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
- if (PI == PE)
+ SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
+ Worklist.push_back(std::make_pair(StartBB, StartPos));
+ do {
+ std::pair<BasicBlock *, BasicBlock::iterator> Pair =
+ Worklist.pop_back_val();
+ BasicBlock *LocalStartBB = Pair.first;
+ BasicBlock::iterator LocalStartPos = Pair.second;
+ BasicBlock::iterator StartBBBegin = LocalStartBB->begin();
+ for (;;) {
+ if (LocalStartPos == StartBBBegin) {
+ pred_iterator PI(LocalStartBB), PE(LocalStartBB, false);
+ if (PI == PE)
// Return if we've reached the function entry.
return false;
// Add the predecessors to the worklist.
@@ -236,30 +236,30 @@ static bool findDependencies(DependenceKind Flavor, const Value *Arg,
if (Visited.insert(PredBB).second)
Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
} while (++PI != PE);
- break;
- }
-
- Instruction *Inst = &*--LocalStartPos;
- if (Depends(Flavor, Inst, Arg, PA)) {
- DependingInsts.insert(Inst);
- break;
- }
- }
- } while (!Worklist.empty());
-
- // Determine whether the original StartBB post-dominates all of the blocks we
- // visited. If not, insert a sentinel indicating that most optimizations are
- // not safe.
- for (const BasicBlock *BB : Visited) {
- if (BB == StartBB)
- continue;
- for (const BasicBlock *Succ : successors(BB))
+ break;
+ }
+
+ Instruction *Inst = &*--LocalStartPos;
+ if (Depends(Flavor, Inst, Arg, PA)) {
+ DependingInsts.insert(Inst);
+ break;
+ }
+ }
+ } while (!Worklist.empty());
+
+ // Determine whether the original StartBB post-dominates all of the blocks we
+ // visited. If not, insert a sentinel indicating that most optimizations are
+ // not safe.
+ for (const BasicBlock *BB : Visited) {
+ if (BB == StartBB)
+ continue;
+ for (const BasicBlock *Succ : successors(BB))
if (Succ != StartBB && !Visited.count(Succ))
return false;
- }
+ }
return true;
-}
+}
llvm::Instruction *llvm::objcarc::findSingleDependency(DependenceKind Flavor,
const Value *Arg,
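The restored findDependencies() above is easier to follow in isolation: it walks backwards from StartInst, and whenever it drains a block it resumes from the terminator of every unvisited predecessor, recording the first instruction on each path that Depends() flags. The standalone sketch below reproduces just that worklist walk with the ARC-specific check replaced by a caller-supplied predicate; the names walkBackwards, Pred, and Hits are hypothetical, and the post-domination check that findDependencies() performs afterwards is omitted.

#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Instruction.h"
#include <functional>
#include <utility>

using namespace llvm;

// Visit instructions strictly before StartInst, walking into predecessors
// once a block is exhausted; record the first instruction on each path for
// which Pred returns true.
static void walkBackwards(Instruction *StartInst,
                          const std::function<bool(Instruction *)> &Pred,
                          SmallPtrSetImpl<Instruction *> &Hits) {
  SmallPtrSet<const BasicBlock *, 4> Visited;
  SmallVector<std::pair<BasicBlock *, BasicBlock::iterator>, 4> Worklist;
  Worklist.push_back(std::make_pair(StartInst->getParent(),
                                    StartInst->getIterator()));
  do {
    std::pair<BasicBlock *, BasicBlock::iterator> Pair =
        Worklist.pop_back_val();
    BasicBlock *BB = Pair.first;
    BasicBlock::iterator Pos = Pair.second;
    for (;;) {
      if (Pos == BB->begin()) {
        // Block drained: resume from the end of each not-yet-visited
        // predecessor.
        for (BasicBlock *PredBB : predecessors(BB))
          if (Visited.insert(PredBB).second)
            Worklist.push_back(std::make_pair(PredBB, PredBB->end()));
        break;
      }
      Instruction *I = &*--Pos;
      if (Pred(I)) {
        Hits.insert(I); // First hit ends the search along this path.
        break;
      }
    }
  } while (!Worklist.empty());
}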
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
index 2a51683c38..cf4c05ebe9 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/DependencyAnalysis.h
@@ -1,88 +1,88 @@
-//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file declares special dependency analysis routines used in Objective C
-/// ARC Optimizations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-
-namespace llvm {
- class BasicBlock;
- class Instruction;
- class Value;
-}
-
-namespace llvm {
-namespace objcarc {
-
-class ProvenanceAnalysis;
-
-/// \enum DependenceKind
-/// Defines different dependence kinds among various ARC constructs.
-///
-/// There are several kinds of dependence-like concepts in use here.
-///
-enum DependenceKind {
- NeedsPositiveRetainCount,
- AutoreleasePoolBoundary,
- CanChangeRetainCount,
- RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
- RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
- RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
-};
-
+//===- DependencyAnalysis.h - ObjC ARC Optimization ---*- C++ -*-----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file declares special dependency analysis routines used in Objective C
+/// ARC Optimizations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_DEPENDENCYANALYSIS_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+
+namespace llvm {
+ class BasicBlock;
+ class Instruction;
+ class Value;
+}
+
+namespace llvm {
+namespace objcarc {
+
+class ProvenanceAnalysis;
+
+/// \enum DependenceKind
+/// Defines different dependence kinds among various ARC constructs.
+///
+/// There are several kinds of dependence-like concepts in use here.
+///
+enum DependenceKind {
+ NeedsPositiveRetainCount,
+ AutoreleasePoolBoundary,
+ CanChangeRetainCount,
+ RetainAutoreleaseDep, ///< Blocks objc_retainAutorelease.
+ RetainAutoreleaseRVDep, ///< Blocks objc_retainAutoreleaseReturnValue.
+ RetainRVDep ///< Blocks objc_retainAutoreleasedReturnValue.
+};
+
/// Find dependent instructions. If there is exactly one dependent instruction,
/// return it. Otherwise, return null.
llvm::Instruction *findSingleDependency(DependenceKind Flavor, const Value *Arg,
BasicBlock *StartBB,
Instruction *StartInst,
ProvenanceAnalysis &PA);
-
-bool
-Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
- ProvenanceAnalysis &PA);
-
-/// Test whether the given instruction can "use" the given pointer's object in a
-/// way that requires the reference count to be positive.
-bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
- ARCInstKind Class);
-
-/// Test whether the given instruction can result in a reference count
-/// modification (positive or negative) for the pointer's object.
-bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
-/// Returns true if we can not conservatively prove that Inst can not decrement
-/// the reference count of Ptr. Returns false if we can.
-bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
-static inline bool CanDecrementRefCount(const Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA) {
- return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
-}
-
-} // namespace objcarc
-} // namespace llvm
-
-#endif
+
+bool
+Depends(DependenceKind Flavor, Instruction *Inst, const Value *Arg,
+ ProvenanceAnalysis &PA);
+
+/// Test whether the given instruction can "use" the given pointer's object in a
+/// way that requires the reference count to be positive.
+bool CanUse(const Instruction *Inst, const Value *Ptr, ProvenanceAnalysis &PA,
+ ARCInstKind Class);
+
+/// Test whether the given instruction can result in a reference count
+/// modification (positive or negative) for the pointer's object.
+bool CanAlterRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+/// Returns true if we can not conservatively prove that Inst can not decrement
+/// the reference count of Ptr. Returns false if we can.
+bool CanDecrementRefCount(const Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+static inline bool CanDecrementRefCount(const Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA) {
+ return CanDecrementRefCount(Inst, Ptr, PA, GetARCInstKind(Inst));
+}
+
+} // namespace objcarc
+} // namespace llvm
+
+#endif
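As a usage sketch of the header restored above, the snippet below mirrors how ObjCARCContract::contractAutorelease (further down in this diff) drives findSingleDependency(): it looks upward from an autorelease for the single instruction that blocks objc_retainAutorelease formation and accepts it only if it is a retain of the same RC-identity root. The helper name findMatchingRetain is hypothetical, and the snippet assumes it lives inside lib/Transforms/ObjCARC so the local headers resolve.

#include "DependencyAnalysis.h"
#include "ProvenanceAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;
using namespace llvm::objcarc;

// Hypothetical helper: find the objc_retain that an autorelease of the same
// RC-identity root could be fused with, or nullptr if anything blocks it.
static CallInst *findMatchingRetain(Instruction *Autorelease,
                                    ProvenanceAnalysis &PA) {
  const Value *Arg = GetArgRCIdentityRoot(Autorelease);
  // Ask for the single instruction above the autorelease that blocks
  // objc_retainAutorelease formation.
  Instruction *Dep =
      findSingleDependency(RetainAutoreleaseDep, Arg,
                           Autorelease->getParent(), Autorelease, PA);
  auto *Retain = dyn_cast_or_null<CallInst>(Dep);
  if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
      GetArgRCIdentityRoot(Retain) != Arg)
    return nullptr;
  return Retain;
}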
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
index cbc1eca728..970136392f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.cpp
@@ -1,39 +1,39 @@
-//===-- ObjCARC.cpp -------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
-// implements several scalar transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-
-namespace llvm {
- class PassRegistry;
-}
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-/// initializeObjCARCOptsPasses - Initialize all passes linked into the
-/// ObjCARCOpts library.
-void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
- initializeObjCARCAAWrapperPassPass(Registry);
- initializeObjCARCAPElimPass(Registry);
- initializeObjCARCExpandPass(Registry);
+//===-- ObjCARC.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMObjCARCOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/InitializePasses.h"
+
+namespace llvm {
+ class PassRegistry;
+}
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+/// initializeObjCARCOptsPasses - Initialize all passes linked into the
+/// ObjCARCOpts library.
+void llvm::initializeObjCARCOpts(PassRegistry &Registry) {
+ initializeObjCARCAAWrapperPassPass(Registry);
+ initializeObjCARCAPElimPass(Registry);
+ initializeObjCARCExpandPass(Registry);
initializeObjCARCContractLegacyPassPass(Registry);
initializeObjCARCOptLegacyPassPass(Registry);
- initializePAEvalPass(Registry);
-}
-
-void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
- initializeObjCARCOpts(*unwrap(R));
-}
+ initializePAEvalPass(Registry);
+}
+
+void LLVMInitializeObjCARCOpts(LLVMPassRegistryRef R) {
+ initializeObjCARCOpts(*unwrap(R));
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
index c80f5f597b..8227a8c6f7 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARC.h
@@ -1,93 +1,93 @@
-//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines common definitions/declarations used by the ObjC ARC
-/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
-/// managing reference counts for objects in Objective C.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
-
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-namespace llvm {
-namespace objcarc {
-
-/// Erase the given instruction.
-///
-/// Many ObjC calls return their argument verbatim,
-/// so if it's such a call and the return value has users, replace them with the
-/// argument value.
-///
-static inline void EraseInstruction(Instruction *CI) {
- Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
-
- bool Unused = CI->use_empty();
-
- if (!Unused) {
- // Replace the return value with the argument.
- assert((IsForwarding(GetBasicARCInstKind(CI)) ||
- (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
- IsNullOrUndef(OldArg->stripPointerCasts()))) &&
- "Can't delete non-forwarding instruction with users!");
- CI->replaceAllUsesWith(OldArg);
- }
-
- CI->eraseFromParent();
-
- if (Unused)
- RecursivelyDeleteTriviallyDeadInstructions(OldArg);
-}
-
-/// If Inst is a ReturnRV and its operand is a call or invoke, return the
-/// operand. Otherwise return null.
-static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
- ARCInstKind Class) {
- if (Class != ARCInstKind::RetainRV)
- return nullptr;
-
- const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
- if (const auto *C = dyn_cast<CallInst>(Opnd))
- return C;
- return dyn_cast<InvokeInst>(Opnd);
-}
-
-/// Return the list of PHI nodes that are equivalent to PN.
-template<class PHINodeTy, class VectorTy>
-void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
- auto *BB = PN.getParent();
- for (auto &P : BB->phis()) {
- if (&P == &PN) // Do not add PN to the list.
- continue;
- unsigned I = 0, E = PN.getNumIncomingValues();
- for (; I < E; ++I) {
- auto *BB = PN.getIncomingBlock(I);
- auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
- auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
- if (PNOpnd != POpnd)
- break;
- }
- if (I == E)
- PHIList.push_back(&P);
- }
-}
-
-} // end namespace objcarc
-} // end namespace llvm
-
-#endif
+//===- ObjCARC.h - ObjC ARC Optimization --------------*- C++ -*-----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines common definitions/declarations used by the ObjC ARC
+/// Optimizer. ARC stands for Automatic Reference Counting and is a system for
+/// managing reference counts for objects in Objective C.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
+
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+namespace llvm {
+namespace objcarc {
+
+/// Erase the given instruction.
+///
+/// Many ObjC calls return their argument verbatim,
+/// so if it's such a call and the return value has users, replace them with the
+/// argument value.
+///
+static inline void EraseInstruction(Instruction *CI) {
+ Value *OldArg = cast<CallInst>(CI)->getArgOperand(0);
+
+ bool Unused = CI->use_empty();
+
+ if (!Unused) {
+ // Replace the return value with the argument.
+ assert((IsForwarding(GetBasicARCInstKind(CI)) ||
+ (IsNoopOnNull(GetBasicARCInstKind(CI)) &&
+ IsNullOrUndef(OldArg->stripPointerCasts()))) &&
+ "Can't delete non-forwarding instruction with users!");
+ CI->replaceAllUsesWith(OldArg);
+ }
+
+ CI->eraseFromParent();
+
+ if (Unused)
+ RecursivelyDeleteTriviallyDeadInstructions(OldArg);
+}
+
+/// If Inst is a ReturnRV and its operand is a call or invoke, return the
+/// operand. Otherwise return null.
+static inline const Instruction *getreturnRVOperand(const Instruction &Inst,
+ ARCInstKind Class) {
+ if (Class != ARCInstKind::RetainRV)
+ return nullptr;
+
+ const auto *Opnd = Inst.getOperand(0)->stripPointerCasts();
+ if (const auto *C = dyn_cast<CallInst>(Opnd))
+ return C;
+ return dyn_cast<InvokeInst>(Opnd);
+}
+
+/// Return the list of PHI nodes that are equivalent to PN.
+template<class PHINodeTy, class VectorTy>
+void getEquivalentPHIs(PHINodeTy &PN, VectorTy &PHIList) {
+ auto *BB = PN.getParent();
+ for (auto &P : BB->phis()) {
+ if (&P == &PN) // Do not add PN to the list.
+ continue;
+ unsigned I = 0, E = PN.getNumIncomingValues();
+ for (; I < E; ++I) {
+ auto *BB = PN.getIncomingBlock(I);
+ auto *PNOpnd = PN.getIncomingValue(I)->stripPointerCasts();
+ auto *POpnd = P.getIncomingValueForBlock(BB)->stripPointerCasts();
+ if (PNOpnd != POpnd)
+ break;
+ }
+ if (I == E)
+ PHIList.push_back(&P);
+ }
+}
+
+} // end namespace objcarc
+} // end namespace llvm
+
+#endif
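The getEquivalentPHIs() template restored above collects every PHI in the same block whose incoming values match PN's for each incoming block once pointer casts are stripped, so callers can treat the whole group as one RC-identical value. A minimal, hypothetical wrapper (collectPHIGroup is not part of the tree) showing the intended call shape:

#include "ObjCARC.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;
using namespace llvm::objcarc;

// Gather PN plus every PHI that getEquivalentPHIs() treats as interchangeable
// with it, so a transform can process the whole group uniformly.
static void collectPHIGroup(PHINode &PN,
                            SmallVectorImpl<const Value *> &Group) {
  Group.push_back(&PN);
  getEquivalentPHIs(PN, Group);
}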
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 3a4aea7574..6a928f2c7f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -1,149 +1,149 @@
-//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file implements optimizations which remove extraneous
-/// autorelease pools.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/IR/Constants.h"
+//===- ObjCARCAPElim.cpp - ObjC ARC Optimization --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file implements optimizations which remove extraneous
+/// autorelease pools.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ap-elim"
-
-namespace {
-
-/// Interprocedurally determine if calls made by the given call site can
-/// possibly produce autoreleases.
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ap-elim"
+
+namespace {
+
+/// Interprocedurally determine if calls made by the given call site can
+/// possibly produce autoreleases.
bool MayAutorelease(const CallBase &CB, unsigned Depth = 0) {
- if (const Function *Callee = CB.getCalledFunction()) {
- if (!Callee->hasExactDefinition())
- return true;
- for (const BasicBlock &BB : *Callee) {
- for (const Instruction &I : BB)
- if (const CallBase *JCB = dyn_cast<CallBase>(&I))
- // This recursion depth limit is arbitrary. It's just great
- // enough to cover known interesting testcases.
- if (Depth < 3 && !JCB->onlyReadsMemory() &&
- MayAutorelease(*JCB, Depth + 1))
- return true;
- }
- return false;
- }
-
- return true;
-}
-
+ if (const Function *Callee = CB.getCalledFunction()) {
+ if (!Callee->hasExactDefinition())
+ return true;
+ for (const BasicBlock &BB : *Callee) {
+ for (const Instruction &I : BB)
+ if (const CallBase *JCB = dyn_cast<CallBase>(&I))
+ // This recursion depth limit is arbitrary. It's just great
+ // enough to cover known interesting testcases.
+ if (Depth < 3 && !JCB->onlyReadsMemory() &&
+ MayAutorelease(*JCB, Depth + 1))
+ return true;
+ }
+ return false;
+ }
+
+ return true;
+}
+
bool OptimizeBB(BasicBlock *BB) {
- bool Changed = false;
-
- Instruction *Push = nullptr;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
- Instruction *Inst = &*I++;
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::AutoreleasepoolPush:
- Push = Inst;
- break;
- case ARCInstKind::AutoreleasepoolPop:
- // If this pop matches a push and nothing in between can autorelease,
- // zap the pair.
- if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
- "autorelease pair:\n"
- " Pop: "
- << *Inst << "\n"
- << " Push: " << *Push
- << "\n");
- Inst->eraseFromParent();
- Push->eraseFromParent();
- }
- Push = nullptr;
- break;
- case ARCInstKind::CallOrUser:
- if (MayAutorelease(cast<CallBase>(*Inst)))
- Push = nullptr;
- break;
- default:
- break;
- }
- }
-
- return Changed;
-}
-
+ bool Changed = false;
+
+ Instruction *Push = nullptr;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::AutoreleasepoolPush:
+ Push = Inst;
+ break;
+ case ARCInstKind::AutoreleasepoolPop:
+ // If this pop matches a push and nothing in between can autorelease,
+ // zap the pair.
+ if (Push && cast<CallInst>(Inst)->getArgOperand(0) == Push) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "ObjCARCAPElim::OptimizeBB: Zapping push pop "
+ "autorelease pair:\n"
+ " Pop: "
+ << *Inst << "\n"
+ << " Push: " << *Push
+ << "\n");
+ Inst->eraseFromParent();
+ Push->eraseFromParent();
+ }
+ Push = nullptr;
+ break;
+ case ARCInstKind::CallOrUser:
+ if (MayAutorelease(cast<CallBase>(*Inst)))
+ Push = nullptr;
+ break;
+ default:
+ break;
+ }
+ }
+
+ return Changed;
+}
+
bool runImpl(Module &M) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!ModuleHasARC(M))
- return false;
- // Find the llvm.global_ctors variable, as the first step in
- // identifying the global constructors. In theory, unnecessary autorelease
- // pools could occur anywhere, but in practice it's pretty rare. Global
- // ctors are a place where autorelease pools get inserted automatically,
- // so it's pretty common for them to be unnecessary, and it's pretty
- // profitable to eliminate them.
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return false;
-
- assert(GV->hasDefinitiveInitializer() &&
- "llvm.global_ctors is uncooperative!");
-
- bool Changed = false;
-
- // Dig the constructor functions out of GV's initializer.
- ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
- for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
- OI != OE; ++OI) {
- Value *Op = *OI;
- // llvm.global_ctors is an array of three-field structs where the second
- // members are constructor functions.
- Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
- // If the user used a constructor function with the wrong signature and
- // it got bitcasted or whatever, look the other way.
- if (!F)
- continue;
- // Only look at function definitions.
- if (F->isDeclaration())
- continue;
- // Only look at functions with one basic block.
- if (std::next(F->begin()) != F->end())
- continue;
- // Ok, a single-block constructor function definition. Try to optimize it.
- Changed |= OptimizeBB(&F->front());
- }
-
- return Changed;
-}
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!ModuleHasARC(M))
+ return false;
+ // Find the llvm.global_ctors variable, as the first step in
+ // identifying the global constructors. In theory, unnecessary autorelease
+ // pools could occur anywhere, but in practice it's pretty rare. Global
+ // ctors are a place where autorelease pools get inserted automatically,
+ // so it's pretty common for them to be unnecessary, and it's pretty
+ // profitable to eliminate them.
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return false;
+
+ assert(GV->hasDefinitiveInitializer() &&
+ "llvm.global_ctors is uncooperative!");
+
+ bool Changed = false;
+
+ // Dig the constructor functions out of GV's initializer.
+ ConstantArray *Init = cast<ConstantArray>(GV->getInitializer());
+ for (User::op_iterator OI = Init->op_begin(), OE = Init->op_end();
+ OI != OE; ++OI) {
+ Value *Op = *OI;
+ // llvm.global_ctors is an array of three-field structs where the second
+ // members are constructor functions.
+ Function *F = dyn_cast<Function>(cast<ConstantStruct>(Op)->getOperand(1));
+ // If the user used a constructor function with the wrong signature and
+ // it got bitcasted or whatever, look the other way.
+ if (!F)
+ continue;
+ // Only look at function definitions.
+ if (F->isDeclaration())
+ continue;
+ // Only look at functions with one basic block.
+ if (std::next(F->begin()) != F->end())
+ continue;
+ // Ok, a single-block constructor function definition. Try to optimize it.
+ Changed |= OptimizeBB(&F->front());
+ }
+
+ return Changed;
+}
/// Autorelease pool elimination.
class ObjCARCAPElim : public ModulePass {
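To make the restored OptimizeBB() logic concrete: the pass pairs an objc_autoreleasePoolPop with the push whose token it consumes and, if MayAutorelease() rejects every call in between, erases both. The sketch below merely constructs that redundant pattern with IRBuilder so the shape is visible; emitRedundantPoolPair is a hypothetical helper, and in the actual pass only single-block functions referenced from llvm.global_ctors are scanned.

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Hypothetical helper: emit a push/pop pair with nothing in between that
// could autorelease, which is exactly the pattern OptimizeBB() deletes.
static void emitRedundantPoolPair(Module &M, IRBuilder<> &B) {
  Type *I8Ptr = B.getInt8PtrTy();
  FunctionCallee Push = M.getOrInsertFunction(
      "objc_autoreleasePoolPush", FunctionType::get(I8Ptr, false));
  FunctionCallee Pop = M.getOrInsertFunction(
      "objc_autoreleasePoolPop",
      FunctionType::get(B.getVoidTy(), {I8Ptr}, false));

  // %pool = call i8* @objc_autoreleasePoolPush()
  Value *Pool = B.CreateCall(Push);
  // ... no intervening call for which MayAutorelease() returns true ...
  // call void @objc_autoreleasePoolPop(i8* %pool)
  B.CreateCall(Pop, {Pool});
  // The pass pairs the pop with this push and erases both calls, since the
  // pool can never contain anything.
}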
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 1419e4dacb..86d161116e 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -1,100 +1,100 @@
-//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file mainly deals with ``contracting'' multiple lower level
-/// operations into singular higher level operations through pattern matching.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-// TODO: ObjCARCContract could insert PHI nodes when uses aren't
-// dominated by single calls.
-
-#include "ARCRuntimeEntryPoints.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/Statistic.h"
+//===- ObjCARCContract.cpp - ObjC ARC Optimization ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines late ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file mainly deals with ``contracting'' multiple lower level
+/// operations into singular higher level operations through pattern matching.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+// TODO: ObjCARCContract could insert PHI nodes when uses aren't
+// dominated by single calls.
+
+#include "ARCRuntimeEntryPoints.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Operator.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-contract"
-
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
-
-//===----------------------------------------------------------------------===//
-// Declarations
-//===----------------------------------------------------------------------===//
-
-namespace {
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-contract"
+
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
+
+//===----------------------------------------------------------------------===//
+// Declarations
+//===----------------------------------------------------------------------===//
+
+namespace {
/// Late ARC optimizations
///
/// These change the IR in a way that makes it difficult to be analyzed by
/// ObjCARCOpt, so it's run late.
-
+
class ObjCARCContract {
bool Changed;
AAResults *AA;
DominatorTree *DT;
ProvenanceAnalysis PA;
ARCRuntimeEntryPoints EP;
-
+
/// A flag indicating whether this optimization pass should run.
bool Run;
-
+
/// The inline asm string to insert between calls and RetainRV calls to make
/// the optimization work on targets which need it.
const MDString *RVInstMarker;
-
+
/// The set of inserted objc_storeStrong calls. If at the end of walking the
/// function we have found no alloca instructions, these calls can be marked
/// "tail".
SmallPtrSet<CallInst *, 8> StoreStrongCalls;
-
+
/// Returns true if we eliminated Inst.
bool tryToPeepholeInstruction(
Function &F, Instruction *Inst, inst_iterator &Iter,
bool &TailOkForStoreStrong,
const DenseMap<BasicBlock *, ColorVector> &BlockColors);
-
+
bool optimizeRetainCall(Function &F, Instruction *Retain);
-
+
bool contractAutorelease(Function &F, Instruction *Autorelease,
ARCInstKind Class);
-
+
void tryToContractReleaseIntoStoreStrong(
Instruction *Release, inst_iterator &Iter,
const DenseMap<BasicBlock *, ColorVector> &BlockColors);
-
+
public:
bool init(Module &M);
bool run(Function &F, AAResults *AA, DominatorTree *DT);
@@ -113,426 +113,426 @@ public:
initializeObjCARCContractLegacyPassPass(*PassRegistry::getPassRegistry());
}
};
-}
-
-//===----------------------------------------------------------------------===//
-// Implementation
-//===----------------------------------------------------------------------===//
-
-/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
-/// return value. We do this late so we do not disrupt the dataflow analysis in
-/// ObjCARCOpt.
-bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
- const auto *Call = dyn_cast<CallBase>(GetArgRCIdentityRoot(Retain));
- if (!Call)
- return false;
- if (Call->getParent() != Retain->getParent())
- return false;
-
- // Check that the call is next to the retain.
- BasicBlock::const_iterator I = ++Call->getIterator();
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I != Retain)
- return false;
-
- // Turn it to an objc_retainAutoreleasedReturnValue.
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(
- dbgs() << "Transforming objc_retain => "
- "objc_retainAutoreleasedReturnValue since the operand is a "
- "return value.\nOld: "
- << *Retain << "\n");
-
- // We do not have to worry about tail calls/does not throw since
- // retain/retainRV have the same properties.
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
- cast<CallInst>(Retain)->setCalledFunction(Decl);
-
- LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
- return true;
-}
-
-/// Merge an autorelease with a retain into a fused call.
+}
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a
+/// return value. We do this late so we do not disrupt the dataflow analysis in
+/// ObjCARCOpt.
+bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
+ const auto *Call = dyn_cast<CallBase>(GetArgRCIdentityRoot(Retain));
+ if (!Call)
+ return false;
+ if (Call->getParent() != Retain->getParent())
+ return false;
+
+ // Check that the call is next to the retain.
+ BasicBlock::const_iterator I = ++Call->getIterator();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I != Retain)
+ return false;
+
+ // Turn it to an objc_retainAutoreleasedReturnValue.
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_retain => "
+ "objc_retainAutoreleasedReturnValue since the operand is a "
+ "return value.\nOld: "
+ << *Retain << "\n");
+
+ // We do not have to worry about tail calls/does not throw since
+ // retain/retainRV have the same properties.
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::RetainRV);
+ cast<CallInst>(Retain)->setCalledFunction(Decl);
+
+ LLVM_DEBUG(dbgs() << "New: " << *Retain << "\n");
+ return true;
+}
+
+/// Merge an autorelease with a retain into a fused call.
bool ObjCARCContract::contractAutorelease(Function &F, Instruction *Autorelease,
ARCInstKind Class) {
- const Value *Arg = GetArgRCIdentityRoot(Autorelease);
-
- // Check that there are no instructions between the retain and the autorelease
- // (such as an autorelease_pop) which may change the count.
+ const Value *Arg = GetArgRCIdentityRoot(Autorelease);
+
+ // Check that there are no instructions between the retain and the autorelease
+ // (such as an autorelease_pop) which may change the count.
DependenceKind DK = Class == ARCInstKind::AutoreleaseRV
? RetainAutoreleaseRVDep
: RetainAutoreleaseDep;
auto *Retain = dyn_cast_or_null<CallInst>(
findSingleDependency(DK, Arg, Autorelease->getParent(), Autorelease, PA));
-
- if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
- GetArgRCIdentityRoot(Retain) != Arg)
- return false;
-
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
- " Autorelease:"
- << *Autorelease
- << "\n"
- " Retain: "
- << *Retain << "\n");
-
- Function *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
- ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
- : ARCRuntimeEntryPointKind::RetainAutorelease);
- Retain->setCalledFunction(Decl);
-
- LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
-
- EraseInstruction(Autorelease);
- return true;
-}
-
-static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
- Instruction *Release,
- ProvenanceAnalysis &PA,
+
+ if (!Retain || GetBasicARCInstKind(Retain) != ARCInstKind::Retain ||
+ GetArgRCIdentityRoot(Retain) != Arg)
+ return false;
+
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(dbgs() << " Fusing retain/autorelease!\n"
+ " Autorelease:"
+ << *Autorelease
+ << "\n"
+ " Retain: "
+ << *Retain << "\n");
+
+ Function *Decl = EP.get(Class == ARCInstKind::AutoreleaseRV
+ ? ARCRuntimeEntryPointKind::RetainAutoreleaseRV
+ : ARCRuntimeEntryPointKind::RetainAutorelease);
+ Retain->setCalledFunction(Decl);
+
+ LLVM_DEBUG(dbgs() << " New RetainAutorelease: " << *Retain << "\n");
+
+ EraseInstruction(Autorelease);
+ return true;
+}
+
+static StoreInst *findSafeStoreForStoreStrongContraction(LoadInst *Load,
+ Instruction *Release,
+ ProvenanceAnalysis &PA,
AAResults *AA) {
- StoreInst *Store = nullptr;
- bool SawRelease = false;
-
- // Get the location associated with Load.
- MemoryLocation Loc = MemoryLocation::get(Load);
- auto *LocPtr = Loc.Ptr->stripPointerCasts();
-
- // Walk down to find the store and the release, which may be in either order.
- for (auto I = std::next(BasicBlock::iterator(Load)),
- E = Load->getParent()->end();
- I != E; ++I) {
- // If we found the store we were looking for and saw the release,
- // break. There is no more work to be done.
- if (Store && SawRelease)
- break;
-
- // Now we know that we have not seen either the store or the release. If I
- // is the release, mark that we saw the release and continue.
- Instruction *Inst = &*I;
- if (Inst == Release) {
- SawRelease = true;
- continue;
- }
-
- // Otherwise, we check if Inst is a "good" store. Grab the instruction class
- // of Inst.
- ARCInstKind Class = GetBasicARCInstKind(Inst);
-
- // If Inst is an unrelated retain, we don't care about it.
- //
- // TODO: This is one area where the optimization could be made more
- // aggressive.
- if (IsRetain(Class))
- continue;
-
- // If we have seen the store, but not the release...
- if (Store) {
- // We need to make sure that it is safe to move the release from its
- // current position to the store. This implies proving that any
- // instruction in between Store and the Release conservatively can not use
- // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so
- // continue...
- if (!CanUse(Inst, Load, PA, Class)) {
- continue;
- }
-
- // Otherwise, be conservative and return nullptr.
- return nullptr;
- }
-
- // Ok, now we know we have not seen a store yet. See if Inst can write to
- // our load location, if it can not, just ignore the instruction.
- if (!isModSet(AA->getModRefInfo(Inst, Loc)))
- continue;
-
- Store = dyn_cast<StoreInst>(Inst);
-
- // If Inst can, then check if Inst is a simple store. If Inst is not a
- // store or a store that is not simple, then we have something we do not
- // understand writing to this memory implying we can not move the load
- // over the write to any subsequent store that we may find.
- if (!Store || !Store->isSimple())
- return nullptr;
-
- // Then make sure that the pointer we are storing to is Ptr. If so, we
- // found our Store!
- if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
- continue;
-
- // Otherwise, we have an unknown store to some other ptr that clobbers
- // Loc.Ptr. Bail!
- return nullptr;
- }
-
- // If we did not find the store or did not see the release, fail.
- if (!Store || !SawRelease)
- return nullptr;
-
- // We succeeded!
- return Store;
-}
-
-static Instruction *
-findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
- Instruction *Release,
- ProvenanceAnalysis &PA) {
- // Walk up from the Store to find the retain.
- BasicBlock::iterator I = Store->getIterator();
- BasicBlock::iterator Begin = Store->getParent()->begin();
- while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {
- Instruction *Inst = &*I;
-
- // It is only safe to move the retain to the store if we can prove
- // conservatively that nothing besides the release can decrement reference
- // counts in between the retain and the store.
- if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
- return nullptr;
- --I;
- }
- Instruction *Retain = &*I;
- if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
- return nullptr;
- if (GetArgRCIdentityRoot(Retain) != New)
- return nullptr;
- return Retain;
-}
-
-/// Create a call instruction with the correct funclet token. Should be used
-/// instead of calling CallInst::Create directly.
-static CallInst *
-createCallInst(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
- const Twine &NameStr, Instruction *InsertBefore,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- SmallVector<OperandBundleDef, 1> OpBundles;
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- return CallInst::Create(FTy, Func, Args, OpBundles, NameStr, InsertBefore);
-}
-
-static CallInst *
-createCallInst(FunctionCallee Func, ArrayRef<Value *> Args, const Twine &NameStr,
- Instruction *InsertBefore,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- return createCallInst(Func.getFunctionType(), Func.getCallee(), Args, NameStr,
- InsertBefore, BlockColors);
-}
-
-/// Attempt to merge an objc_release with a store, load, and objc_retain to form
-/// an objc_storeStrong. An objc_storeStrong:
-///
-/// objc_storeStrong(i8** %old_ptr, i8* new_value)
-///
-/// is equivalent to the following IR sequence:
-///
-/// ; Load old value.
-/// %old_value = load i8** %old_ptr (1)
-///
-/// ; Increment the new value and then release the old value. This must occur
-/// ; in order in case old_value releases new_value in its destructor causing
-/// ; us to potentially have a dangling ptr.
-/// tail call i8* @objc_retain(i8* %new_value) (2)
-/// tail call void @objc_release(i8* %old_value) (3)
-///
-/// ; Store the new_value into old_ptr
-/// store i8* %new_value, i8** %old_ptr (4)
-///
-/// The safety of this optimization is based around the following
-/// considerations:
-///
-/// 1. We are forming the store strong at the store. Thus to perform this
-/// optimization it must be safe to move the retain, load, and release to
-/// (4).
-/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
-/// safe.
-void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
- Instruction *Release, inst_iterator &Iter,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- // See if we are releasing something that we just loaded.
- auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
- if (!Load || !Load->isSimple())
- return;
-
- // For now, require everything to be in one basic block.
- BasicBlock *BB = Release->getParent();
- if (Load->getParent() != BB)
- return;
-
- // First scan down the BB from Load, looking for a store of the RCIdentityRoot
- // of Load's
- StoreInst *Store =
- findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
- // If we fail, bail.
- if (!Store)
- return;
-
- // Then find what new_value's RCIdentity Root is.
- Value *New = GetRCIdentityRoot(Store->getValueOperand());
-
- // Then walk up the BB and look for a retain on New without any intervening
- // instructions which conservatively might decrement ref counts.
- Instruction *Retain =
- findRetainForStoreStrongContraction(New, Store, Release, PA);
-
- // If we fail, bail.
- if (!Retain)
- return;
-
- Changed = true;
- ++NumStoreStrongs;
-
- LLVM_DEBUG(
- llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
- << " Old:\n"
- << " Store: " << *Store << "\n"
- << " Release: " << *Release << "\n"
- << " Retain: " << *Retain << "\n"
- << " Load: " << *Load << "\n");
-
- LLVMContext &C = Release->getContext();
- Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
- Type *I8XX = PointerType::getUnqual(I8X);
-
- Value *Args[] = { Load->getPointerOperand(), New };
- if (Args[0]->getType() != I8XX)
- Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
- if (Args[1]->getType() != I8X)
- Args[1] = new BitCastInst(Args[1], I8X, "", Store);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
- CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
- StoreStrong->setDoesNotThrow();
- StoreStrong->setDebugLoc(Store->getDebugLoc());
-
- // We can't set the tail flag yet, because we haven't yet determined
- // whether there are any escaping allocas. Remember this call, so that
- // we can set the tail flag once we know it's safe.
- StoreStrongCalls.insert(StoreStrong);
-
- LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
- << "\n");
-
- if (&*Iter == Retain) ++Iter;
- if (&*Iter == Store) ++Iter;
- Store->eraseFromParent();
- Release->eraseFromParent();
- EraseInstruction(Retain);
- if (Load->use_empty())
- Load->eraseFromParent();
-}
-
-bool ObjCARCContract::tryToPeepholeInstruction(
- Function &F, Instruction *Inst, inst_iterator &Iter,
+ StoreInst *Store = nullptr;
+ bool SawRelease = false;
+
+ // Get the location associated with Load.
+ MemoryLocation Loc = MemoryLocation::get(Load);
+ auto *LocPtr = Loc.Ptr->stripPointerCasts();
+
+ // Walk down to find the store and the release, which may be in either order.
+ for (auto I = std::next(BasicBlock::iterator(Load)),
+ E = Load->getParent()->end();
+ I != E; ++I) {
+ // If we found the store we were looking for and saw the release,
+ // break. There is no more work to be done.
+ if (Store && SawRelease)
+ break;
+
+ // Now we know that we have not seen either the store or the release. If I
+ // is the release, mark that we saw the release and continue.
+ Instruction *Inst = &*I;
+ if (Inst == Release) {
+ SawRelease = true;
+ continue;
+ }
+
+ // Otherwise, we check if Inst is a "good" store. Grab the instruction class
+ // of Inst.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ // If Inst is an unrelated retain, we don't care about it.
+ //
+ // TODO: This is one area where the optimization could be made more
+ // aggressive.
+ if (IsRetain(Class))
+ continue;
+
+ // If we have seen the store, but not the release...
+ if (Store) {
+ // We need to make sure that it is safe to move the release from its
+ // current position to the store. This implies proving that any
+ // instruction in between Store and the Release conservatively can not use
+ // the RCIdentityRoot of Release. If we can prove we can ignore Inst, so
+ // continue...
+ if (!CanUse(Inst, Load, PA, Class)) {
+ continue;
+ }
+
+ // Otherwise, be conservative and return nullptr.
+ return nullptr;
+ }
+
+ // Ok, now we know we have not seen a store yet. See if Inst can write to
+ // our load location, if it can not, just ignore the instruction.
+ if (!isModSet(AA->getModRefInfo(Inst, Loc)))
+ continue;
+
+ Store = dyn_cast<StoreInst>(Inst);
+
+ // If Inst can, then check if Inst is a simple store. If Inst is not a
+ // store or a store that is not simple, then we have something we do not
+ // understand writing to this memory implying we can not move the load
+ // over the write to any subsequent store that we may find.
+ if (!Store || !Store->isSimple())
+ return nullptr;
+
+ // Then make sure that the pointer we are storing to is Ptr. If so, we
+ // found our Store!
+ if (Store->getPointerOperand()->stripPointerCasts() == LocPtr)
+ continue;
+
+ // Otherwise, we have an unknown store to some other ptr that clobbers
+ // Loc.Ptr. Bail!
+ return nullptr;
+ }
+
+ // If we did not find the store or did not see the release, fail.
+ if (!Store || !SawRelease)
+ return nullptr;
+
+ // We succeeded!
+ return Store;
+}
+
+static Instruction *
+findRetainForStoreStrongContraction(Value *New, StoreInst *Store,
+ Instruction *Release,
+ ProvenanceAnalysis &PA) {
+ // Walk up from the Store to find the retain.
+ BasicBlock::iterator I = Store->getIterator();
+ BasicBlock::iterator Begin = Store->getParent()->begin();
+ while (I != Begin && GetBasicARCInstKind(&*I) != ARCInstKind::Retain) {
+ Instruction *Inst = &*I;
+
+ // It is only safe to move the retain to the store if we can prove
+ // conservatively that nothing besides the release can decrement reference
+ // counts in between the retain and the store.
+ if (CanDecrementRefCount(Inst, New, PA) && Inst != Release)
+ return nullptr;
+ --I;
+ }
+ Instruction *Retain = &*I;
+ if (GetBasicARCInstKind(Retain) != ARCInstKind::Retain)
+ return nullptr;
+ if (GetArgRCIdentityRoot(Retain) != New)
+ return nullptr;
+ return Retain;
+}
+
+/// Create a call instruction with the correct funclet token. Should be used
+/// instead of calling CallInst::Create directly.
+static CallInst *
+createCallInst(FunctionType *FTy, Value *Func, ArrayRef<Value *> Args,
+ const Twine &NameStr, Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(InsertBefore->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(FTy, Func, Args, OpBundles, NameStr, InsertBefore);
+}
+
+static CallInst *
+createCallInst(FunctionCallee Func, ArrayRef<Value *> Args, const Twine &NameStr,
+ Instruction *InsertBefore,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ return createCallInst(Func.getFunctionType(), Func.getCallee(), Args, NameStr,
+ InsertBefore, BlockColors);
+}
+
+/// Attempt to merge an objc_release with a store, load, and objc_retain to form
+/// an objc_storeStrong. An objc_storeStrong:
+///
+/// objc_storeStrong(i8** %old_ptr, i8* new_value)
+///
+/// is equivalent to the following IR sequence:
+///
+/// ; Load old value.
+/// %old_value = load i8** %old_ptr (1)
+///
+/// ; Increment the new value and then release the old value. This must occur
+/// ; in order in case old_value releases new_value in its destructor causing
+/// ; us to potentially have a dangling ptr.
+/// tail call i8* @objc_retain(i8* %new_value) (2)
+/// tail call void @objc_release(i8* %old_value) (3)
+///
+/// ; Store the new_value into old_ptr
+/// store i8* %new_value, i8** %old_ptr (4)
+///
+/// The safety of this optimization is based around the following
+/// considerations:
+///
+/// 1. We are forming the store strong at the store. Thus to perform this
+/// optimization it must be safe to move the retain, load, and release to
+/// (4).
+/// 2. We need to make sure that any re-orderings of (1), (2), (3), (4) are
+/// safe.
+void ObjCARCContract::tryToContractReleaseIntoStoreStrong(
+ Instruction *Release, inst_iterator &Iter,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ // See if we are releasing something that we just loaded.
+ auto *Load = dyn_cast<LoadInst>(GetArgRCIdentityRoot(Release));
+ if (!Load || !Load->isSimple())
+ return;
+
+ // For now, require everything to be in one basic block.
+ BasicBlock *BB = Release->getParent();
+ if (Load->getParent() != BB)
+ return;
+
+ // First scan down the BB from Load, looking for a store of the RCIdentityRoot
+ // of Load's
+ StoreInst *Store =
+ findSafeStoreForStoreStrongContraction(Load, Release, PA, AA);
+ // If we fail, bail.
+ if (!Store)
+ return;
+
+ // Then find what new_value's RCIdentity Root is.
+ Value *New = GetRCIdentityRoot(Store->getValueOperand());
+
+ // Then walk up the BB and look for a retain on New without any intervening
+ // instructions which conservatively might decrement ref counts.
+ Instruction *Retain =
+ findRetainForStoreStrongContraction(New, Store, Release, PA);
+
+ // If we fail, bail.
+ if (!Retain)
+ return;
+
+ Changed = true;
+ ++NumStoreStrongs;
+
+ LLVM_DEBUG(
+ llvm::dbgs() << " Contracting retain, release into objc_storeStrong.\n"
+ << " Old:\n"
+ << " Store: " << *Store << "\n"
+ << " Release: " << *Release << "\n"
+ << " Retain: " << *Retain << "\n"
+ << " Load: " << *Load << "\n");
+
+ LLVMContext &C = Release->getContext();
+ Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
+ Type *I8XX = PointerType::getUnqual(I8X);
+
+ Value *Args[] = { Load->getPointerOperand(), New };
+ if (Args[0]->getType() != I8XX)
+ Args[0] = new BitCastInst(Args[0], I8XX, "", Store);
+ if (Args[1]->getType() != I8X)
+ Args[1] = new BitCastInst(Args[1], I8X, "", Store);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::StoreStrong);
+ CallInst *StoreStrong = createCallInst(Decl, Args, "", Store, BlockColors);
+ StoreStrong->setDoesNotThrow();
+ StoreStrong->setDebugLoc(Store->getDebugLoc());
+
+ // We can't set the tail flag yet, because we haven't yet determined
+ // whether there are any escaping allocas. Remember this call, so that
+ // we can set the tail flag once we know it's safe.
+ StoreStrongCalls.insert(StoreStrong);
+
+ LLVM_DEBUG(llvm::dbgs() << " New Store Strong: " << *StoreStrong
+ << "\n");
+
+ if (&*Iter == Retain) ++Iter;
+ if (&*Iter == Store) ++Iter;
+ Store->eraseFromParent();
+ Release->eraseFromParent();
+ EraseInstruction(Retain);
+ if (Load->use_empty())
+ Load->eraseFromParent();
+}
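+
+// On success, the contraction above collapses the load/retain/release/store
+// sequence into a single runtime call, roughly (a sketch using the names from
+// the doc comment above):
+//
+//   call void @objc_storeStrong(i8** %old_ptr, i8* %new_value)
+//
+// plus any bitcasts needed to bring the operands to i8** / i8*.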
+
+bool ObjCARCContract::tryToPeepholeInstruction(
+ Function &F, Instruction *Inst, inst_iterator &Iter,
bool &TailOkForStoreStrongs,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- // Only these library routines return their argument. In particular,
- // objc_retainBlock does not necessarily return its argument.
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- switch (Class) {
- case ARCInstKind::FusedRetainAutorelease:
- case ARCInstKind::FusedRetainAutoreleaseRV:
- return false;
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ // Only these library routines return their argument. In particular,
+ // objc_retainBlock does not necessarily return its argument.
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ switch (Class) {
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV:
+ return false;
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
return contractAutorelease(F, Inst, Class);
- case ARCInstKind::Retain:
- // Attempt to convert retains to retainrvs if they are next to function
- // calls.
- if (!optimizeRetainCall(F, Inst))
- return false;
- // If we succeed in our optimization, fall through.
- LLVM_FALLTHROUGH;
- case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV: {
- // If we're compiling for a target which needs a special inline-asm
- // marker to do the return value optimization, insert it now.
- if (!RVInstMarker)
- return false;
- BasicBlock::iterator BBI = Inst->getIterator();
- BasicBlock *InstParent = Inst->getParent();
-
- // Step up to see if the call immediately precedes the RV call.
- // If it's an invoke, we have to cross a block boundary. And we have
- // to carefully dodge no-op instructions.
- do {
- if (BBI == InstParent->begin()) {
- BasicBlock *Pred = InstParent->getSinglePredecessor();
- if (!Pred)
- goto decline_rv_optimization;
- BBI = Pred->getTerminator()->getIterator();
- break;
- }
- --BBI;
- } while (IsNoopInstruction(&*BBI));
-
+ case ARCInstKind::Retain:
+ // Attempt to convert retains to retainrvs if they are next to function
+ // calls.
+ if (!optimizeRetainCall(F, Inst))
+ return false;
+ // If we succeed in our optimization, fall through.
+ LLVM_FALLTHROUGH;
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV: {
+ // If we're compiling for a target which needs a special inline-asm
+ // marker to do the return value optimization, insert it now.
+ if (!RVInstMarker)
+ return false;
+ BasicBlock::iterator BBI = Inst->getIterator();
+ BasicBlock *InstParent = Inst->getParent();
+
+ // Step up to see if the call immediately precedes the RV call.
+ // If it's an invoke, we have to cross a block boundary. And we have
+ // to carefully dodge no-op instructions.
+ do {
+ if (BBI == InstParent->begin()) {
+ BasicBlock *Pred = InstParent->getSinglePredecessor();
+ if (!Pred)
+ goto decline_rv_optimization;
+ BBI = Pred->getTerminator()->getIterator();
+ break;
+ }
+ --BBI;
+ } while (IsNoopInstruction(&*BBI));
+
if (GetRCIdentityRoot(&*BBI) == GetArgRCIdentityRoot(Inst)) {
- LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
- "optimization.\n");
- Changed = true;
- InlineAsm *IA =
- InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
- /*isVarArg=*/false),
- RVInstMarker->getString(),
- /*Constraints=*/"", /*hasSideEffects=*/true);
-
- createCallInst(IA, None, "", Inst, BlockColors);
- }
- decline_rv_optimization:
- return false;
- }
- case ARCInstKind::InitWeak: {
- // objc_initWeak(p, null) => *p = null
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(1))) {
- Value *Null = ConstantPointerNull::get(cast<PointerType>(CI->getType()));
- Changed = true;
- new StoreInst(Null, CI->getArgOperand(0), CI);
-
- LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
- << " New = " << *Null << "\n");
-
- CI->replaceAllUsesWith(Null);
- CI->eraseFromParent();
- }
- return true;
- }
- case ARCInstKind::Release:
- // Try to form an objc store strong from our release. If we fail, there is
- // nothing further to do below, so continue.
- tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
- return true;
- case ARCInstKind::User:
- // Be conservative if the function has any alloca instructions.
- // Technically we only care about escaping alloca instructions,
- // but this is sufficient to handle some interesting cases.
- if (isa<AllocaInst>(Inst))
- TailOkForStoreStrongs = false;
- return true;
- case ARCInstKind::IntrinsicUser:
- // Remove calls to @llvm.objc.clang.arc.use(...).
- Changed = true;
- Inst->eraseFromParent();
- return true;
- default:
- return true;
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
-
+ LLVM_DEBUG(dbgs() << "Adding inline asm marker for the return value "
+ "optimization.\n");
+ Changed = true;
+ InlineAsm *IA =
+ InlineAsm::get(FunctionType::get(Type::getVoidTy(Inst->getContext()),
+ /*isVarArg=*/false),
+ RVInstMarker->getString(),
+ /*Constraints=*/"", /*hasSideEffects=*/true);
+
+ createCallInst(IA, None, "", Inst, BlockColors);
+ }
+ decline_rv_optimization:
+ return false;
+ }
+ case ARCInstKind::InitWeak: {
+ // objc_initWeak(p, null) => *p = null
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(1))) {
+ Value *Null = ConstantPointerNull::get(cast<PointerType>(CI->getType()));
+ Changed = true;
+ new StoreInst(Null, CI->getArgOperand(0), CI);
+
+ LLVM_DEBUG(dbgs() << "OBJCARCContract: Old = " << *CI << "\n"
+ << " New = " << *Null << "\n");
+
+ CI->replaceAllUsesWith(Null);
+ CI->eraseFromParent();
+ }
+ return true;
+ }
+ case ARCInstKind::Release:
+ // Try to form an objc store strong from our release. If we fail, there is
+ // nothing further to do below, so continue.
+ tryToContractReleaseIntoStoreStrong(Inst, Iter, BlockColors);
+ return true;
+ case ARCInstKind::User:
+ // Be conservative if the function has any alloca instructions.
+ // Technically we only care about escaping alloca instructions,
+ // but this is sufficient to handle some interesting cases.
+ if (isa<AllocaInst>(Inst))
+ TailOkForStoreStrongs = false;
+ return true;
+ case ARCInstKind::IntrinsicUser:
+ // Remove calls to @llvm.objc.clang.arc.use(...).
+ Changed = true;
+ Inst->eraseFromParent();
+ return true;
+ default:
+ return true;
+ }
+}
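+
+// A sketch of the RetainRV marker case above (hypothetical IR, using the
+// llvm.objc.* naming seen elsewhere in this pass): given
+//
+//   %r = call i8* @foo()
+//   %v = call i8* @llvm.objc.retainAutoreleasedReturnValue(i8* %r)
+//
+// the peephole inserts the target's RV marker as a side-effecting inline-asm
+// call immediately before the retainRV, to enable the return-value
+// optimization described in the comment above.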
+
+//===----------------------------------------------------------------------===//
+// Top Level Driver
+//===----------------------------------------------------------------------===//
+
bool ObjCARCContract::init(Module &M) {
// If nothing in the Module uses ARC, don't do anything.
Run = ModuleHasARC(M);
@@ -549,212 +549,212 @@ bool ObjCARCContract::init(Module &M) {
}
bool ObjCARCContract::run(Function &F, AAResults *A, DominatorTree *D) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
AA = A;
DT = D;
PA.setAA(A);
-
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
-
- // Track whether it's ok to mark objc_storeStrong calls with the "tail"
- // keyword. Be conservative if the function has variadic arguments.
- // It seems that functions which "return twice" are also unsafe for the
- // "tail" argument, because they are setjmp, which could need to
- // return to an earlier stack state.
- bool TailOkForStoreStrongs =
- !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
-
- // For ObjC library calls which return their argument, replace uses of the
- // argument with uses of the call return value, if it dominates the use. This
- // reduces register pressure.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
- Instruction *Inst = &*I++;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
-
- // First try to peephole Inst. If there is nothing further we can do in
- // terms of undoing objc-arc-expand, process the next inst.
+
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ LLVM_DEBUG(llvm::dbgs() << "**** ObjCARC Contract ****\n");
+
+  // Track whether it's ok to mark objc_storeStrong calls with the "tail"
+  // keyword. Be conservative if the function has variadic arguments or calls
+  // a function that "returns twice" (such as setjmp), since such a call may
+  // need to return to an earlier stack state.
+ bool TailOkForStoreStrongs =
+ !F.isVarArg() && !F.callsFunctionThatReturnsTwice();
+
+ // For ObjC library calls which return their argument, replace uses of the
+ // argument with uses of the call return value, if it dominates the use. This
+ // reduces register pressure.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
+ Instruction *Inst = &*I++;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ // First try to peephole Inst. If there is nothing further we can do in
+ // terms of undoing objc-arc-expand, process the next inst.
if (tryToPeepholeInstruction(F, Inst, I, TailOkForStoreStrongs,
BlockColors))
- continue;
-
- // Otherwise, try to undo objc-arc-expand.
-
- // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
- // and such; to do the replacement, the argument must have type i8*.
-
- // Function for replacing uses of Arg dominated by Inst.
- auto ReplaceArgUses = [Inst, this](Value *Arg) {
- // If we're compiling bugpointed code, don't get in trouble.
- if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
- return;
-
- // Look through the uses of the pointer.
- for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
- UI != UE; ) {
- // Increment UI now, because we may unlink its element.
- Use &U = *UI++;
- unsigned OperandNo = U.getOperandNo();
-
- // If the call's return value dominates a use of the call's argument
- // value, rewrite the use to use the return value. We check for
- // reachability here because an unreachable call is considered to
- // trivially dominate itself, which would lead us to rewriting its
- // argument in terms of its return value, which would lead to
- // infinite loops in GetArgRCIdentityRoot.
- if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
- continue;
-
- Changed = true;
- Instruction *Replacement = Inst;
- Type *UseTy = U.get()->getType();
- if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
- // For PHI nodes, insert the bitcast in the predecessor block.
- unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
- BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
- if (Replacement->getType() != UseTy) {
- // A catchswitch is both a pad and a terminator, meaning a basic
- // block with a catchswitch has no insertion point. Keep going up
- // the dominator tree until we find a non-catchswitch.
- BasicBlock *InsertBB = IncomingBB;
- while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
- InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
- }
-
- assert(DT->dominates(Inst, &InsertBB->back()) &&
- "Invalid insertion point for bitcast");
- Replacement =
- new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
- }
-
- // While we're here, rewrite all edges for this PHI, rather
- // than just one use at a time, to minimize the number of
- // bitcasts we emit.
- for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
- if (PHI->getIncomingBlock(i) == IncomingBB) {
- // Keep the UI iterator valid.
- if (UI != UE &&
- &PHI->getOperandUse(
- PHINode::getOperandNumForIncomingValue(i)) == &*UI)
- ++UI;
- PHI->setIncomingValue(i, Replacement);
- }
- } else {
- if (Replacement->getType() != UseTy)
- Replacement = new BitCastInst(Replacement, UseTy, "",
- cast<Instruction>(U.getUser()));
- U.set(Replacement);
- }
- }
- };
-
- Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
- Value *OrigArg = Arg;
-
- // TODO: Change this to a do-while.
- for (;;) {
- ReplaceArgUses(Arg);
-
- // If Arg is a no-op casted pointer, strip one level of casts and iterate.
- if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
- Arg = BI->getOperand(0);
- else if (isa<GEPOperator>(Arg) &&
- cast<GEPOperator>(Arg)->hasAllZeroIndices())
- Arg = cast<GEPOperator>(Arg)->getPointerOperand();
- else if (isa<GlobalAlias>(Arg) &&
- !cast<GlobalAlias>(Arg)->isInterposable())
- Arg = cast<GlobalAlias>(Arg)->getAliasee();
- else {
- // If Arg is a PHI node, get PHIs that are equivalent to it and replace
- // their uses.
- if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
- SmallVector<Value *, 1> PHIList;
- getEquivalentPHIs(*PN, PHIList);
- for (Value *PHI : PHIList)
- ReplaceArgUses(PHI);
- }
- break;
- }
- }
-
- // Replace bitcast users of Arg that are dominated by Inst.
- SmallVector<BitCastInst *, 2> BitCastUsers;
-
- // Add all bitcast users of the function argument first.
- for (User *U : OrigArg->users())
- if (auto *BC = dyn_cast<BitCastInst>(U))
- BitCastUsers.push_back(BC);
-
- // Replace the bitcasts with the call return. Iterate until list is empty.
- while (!BitCastUsers.empty()) {
- auto *BC = BitCastUsers.pop_back_val();
- for (User *U : BC->users())
- if (auto *B = dyn_cast<BitCastInst>(U))
- BitCastUsers.push_back(B);
-
- ReplaceArgUses(BC);
- }
- }
-
- // If this function has no escaping allocas or suspicious vararg usage,
- // objc_storeStrong calls can be marked with the "tail" keyword.
- if (TailOkForStoreStrongs)
- for (CallInst *CI : StoreStrongCalls)
- CI->setTailCall();
- StoreStrongCalls.clear();
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// Misc Pass Manager
-//===----------------------------------------------------------------------===//
-
+ continue;
+
+ // Otherwise, try to undo objc-arc-expand.
+
+ // Don't use GetArgRCIdentityRoot because we don't want to look through bitcasts
+ // and such; to do the replacement, the argument must have type i8*.
+
+ // Function for replacing uses of Arg dominated by Inst.
+ auto ReplaceArgUses = [Inst, this](Value *Arg) {
+ // If we're compiling bugpointed code, don't get in trouble.
+ if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
+ return;
+
+ // Look through the uses of the pointer.
+ for (Value::use_iterator UI = Arg->use_begin(), UE = Arg->use_end();
+ UI != UE; ) {
+ // Increment UI now, because we may unlink its element.
+ Use &U = *UI++;
+ unsigned OperandNo = U.getOperandNo();
+
+ // If the call's return value dominates a use of the call's argument
+ // value, rewrite the use to use the return value. We check for
+ // reachability here because an unreachable call is considered to
+ // trivially dominate itself, which would lead us to rewriting its
+ // argument in terms of its return value, which would lead to
+ // infinite loops in GetArgRCIdentityRoot.
+ if (!DT->isReachableFromEntry(U) || !DT->dominates(Inst, U))
+ continue;
+
+ Changed = true;
+ Instruction *Replacement = Inst;
+ Type *UseTy = U.get()->getType();
+ if (PHINode *PHI = dyn_cast<PHINode>(U.getUser())) {
+ // For PHI nodes, insert the bitcast in the predecessor block.
+ unsigned ValNo = PHINode::getIncomingValueNumForOperand(OperandNo);
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(ValNo);
+ if (Replacement->getType() != UseTy) {
+ // A catchswitch is both a pad and a terminator, meaning a basic
+ // block with a catchswitch has no insertion point. Keep going up
+ // the dominator tree until we find a non-catchswitch.
+ BasicBlock *InsertBB = IncomingBB;
+ while (isa<CatchSwitchInst>(InsertBB->getFirstNonPHI())) {
+ InsertBB = DT->getNode(InsertBB)->getIDom()->getBlock();
+ }
+
+ assert(DT->dominates(Inst, &InsertBB->back()) &&
+ "Invalid insertion point for bitcast");
+ Replacement =
+ new BitCastInst(Replacement, UseTy, "", &InsertBB->back());
+ }
+
+ // While we're here, rewrite all edges for this PHI, rather
+ // than just one use at a time, to minimize the number of
+ // bitcasts we emit.
+ for (unsigned i = 0, e = PHI->getNumIncomingValues(); i != e; ++i)
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ // Keep the UI iterator valid.
+ if (UI != UE &&
+ &PHI->getOperandUse(
+ PHINode::getOperandNumForIncomingValue(i)) == &*UI)
+ ++UI;
+ PHI->setIncomingValue(i, Replacement);
+ }
+ } else {
+ if (Replacement->getType() != UseTy)
+ Replacement = new BitCastInst(Replacement, UseTy, "",
+ cast<Instruction>(U.getUser()));
+ U.set(Replacement);
+ }
+ }
+ };
+
+ Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
+ Value *OrigArg = Arg;
+
+ // TODO: Change this to a do-while.
+ for (;;) {
+ ReplaceArgUses(Arg);
+
+ // If Arg is a no-op casted pointer, strip one level of casts and iterate.
+ if (const BitCastInst *BI = dyn_cast<BitCastInst>(Arg))
+ Arg = BI->getOperand(0);
+ else if (isa<GEPOperator>(Arg) &&
+ cast<GEPOperator>(Arg)->hasAllZeroIndices())
+ Arg = cast<GEPOperator>(Arg)->getPointerOperand();
+ else if (isa<GlobalAlias>(Arg) &&
+ !cast<GlobalAlias>(Arg)->isInterposable())
+ Arg = cast<GlobalAlias>(Arg)->getAliasee();
+ else {
+ // If Arg is a PHI node, get PHIs that are equivalent to it and replace
+ // their uses.
+ if (PHINode *PN = dyn_cast<PHINode>(Arg)) {
+ SmallVector<Value *, 1> PHIList;
+ getEquivalentPHIs(*PN, PHIList);
+ for (Value *PHI : PHIList)
+ ReplaceArgUses(PHI);
+ }
+ break;
+ }
+ }
+
+ // Replace bitcast users of Arg that are dominated by Inst.
+ SmallVector<BitCastInst *, 2> BitCastUsers;
+
+ // Add all bitcast users of the function argument first.
+ for (User *U : OrigArg->users())
+ if (auto *BC = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(BC);
+
+ // Replace the bitcasts with the call return. Iterate until list is empty.
+ while (!BitCastUsers.empty()) {
+ auto *BC = BitCastUsers.pop_back_val();
+ for (User *U : BC->users())
+ if (auto *B = dyn_cast<BitCastInst>(U))
+ BitCastUsers.push_back(B);
+
+ ReplaceArgUses(BC);
+ }
+ }
+
+ // If this function has no escaping allocas or suspicious vararg usage,
+ // objc_storeStrong calls can be marked with the "tail" keyword.
+ if (TailOkForStoreStrongs)
+ for (CallInst *CI : StoreStrongCalls)
+ CI->setTailCall();
+ StoreStrongCalls.clear();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Misc Pass Manager
+//===----------------------------------------------------------------------===//
+
char ObjCARCContractLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(ObjCARCContractLegacyPass, "objc-arc-contract",
- "ObjC ARC contraction", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ "ObjC ARC contraction", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(ObjCARCContractLegacyPass, "objc-arc-contract",
- "ObjC ARC contraction", false, false)
-
+ "ObjC ARC contraction", false, false)
+
void ObjCARCContractLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
-}
-
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+}
+
Pass *llvm::createObjCARCContractPass() {
return new ObjCARCContractLegacyPass();
}
-
+
bool ObjCARCContractLegacyPass::doInitialization(Module &M) {
return OCARCC.init(M);
}
-
+
bool ObjCARCContractLegacyPass::runOnFunction(Function &F) {
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return OCARCC.run(F, AA, DT);
}
-
+
PreservedAnalyses ObjCARCContractPass::run(Function &F,
FunctionAnalysisManager &AM) {
ObjCARCContract OCAC;
OCAC.init(*F.getParent());
-
+
bool Changed = OCAC.run(F, &AM.getResult<AAManager>(F),
&AM.getResult<DominatorTreeAnalysis>(F));
if (Changed) {
@@ -763,4 +763,4 @@ PreservedAnalyses ObjCARCContractPass::run(Function &F,
return PA;
}
return PreservedAnalyses::all();
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index 1f757198fe..d2121dcebe 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -1,95 +1,95 @@
-//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// This specific file deals with early optimizations which perform certain
-/// cleanup operations.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-///
-//===----------------------------------------------------------------------===//
-
-#include "ObjCARC.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
+//===- ObjCARCExpand.cpp - ObjC ARC Optimization --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// This specific file deals with early optimizations which perform certain
+/// cleanup operations.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+///
+//===----------------------------------------------------------------------===//
+
+#include "ObjCARC.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-
-#define DEBUG_TYPE "objc-arc-expand"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-namespace {
+
+#define DEBUG_TYPE "objc-arc-expand"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
static bool runImpl(Function &F) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
if (!ModuleHasARC(*F.getParent()))
- return false;
-
- bool Changed = false;
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
- << "\n");
-
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
- Instruction *Inst = &*I;
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
-
- switch (GetBasicARCInstKind(Inst)) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV:
- case ARCInstKind::Autorelease:
- case ARCInstKind::AutoreleaseRV:
- case ARCInstKind::FusedRetainAutorelease:
- case ARCInstKind::FusedRetainAutoreleaseRV: {
- // These calls return their argument verbatim, as a low-level
- // optimization. However, this makes high-level optimizations
- // harder. Undo any uses of this optimization that the front-end
- // emitted here. We'll redo them in the contract pass.
- Changed = true;
- Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
- << "\n"
- " New = "
- << *Value << "\n");
- Inst->replaceAllUsesWith(Value);
- break;
- }
- default:
- break;
- }
- }
-
- LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
-
- return Changed;
-}
+ return false;
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting Function: " << F.getName()
+ << "\n");
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ++I) {
+ Instruction *Inst = &*I;
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Visiting: " << *Inst << "\n");
+
+ switch (GetBasicARCInstKind(Inst)) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::Autorelease:
+ case ARCInstKind::AutoreleaseRV:
+ case ARCInstKind::FusedRetainAutorelease:
+ case ARCInstKind::FusedRetainAutoreleaseRV: {
+ // These calls return their argument verbatim, as a low-level
+ // optimization. However, this makes high-level optimizations
+ // harder. Undo any uses of this optimization that the front-end
+ // emitted here. We'll redo them in the contract pass.
+ Changed = true;
+ Value *Value = cast<CallInst>(Inst)->getArgOperand(0);
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Old = " << *Inst
+ << "\n"
+ " New = "
+ << *Value << "\n");
+ Inst->replaceAllUsesWith(Value);
+ break;
+ }
+ default:
+ break;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "ObjCARCExpand: Finished List.\n\n");
+
+ return Changed;
+}
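+
+// For illustration (a sketch): given
+//   %y = tail call i8* @llvm.objc.retain(i8* %x)
+// the loop above rewrites every use of %y to use %x directly; the retain call
+// itself is left in place for the contract pass to deal with later.
+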
/// Early ARC transformations.
class ObjCARCExpand : public FunctionPass {
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 823d3fad2b..1c44749951 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -1,519 +1,519 @@
-//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file defines ObjC ARC optimizations. ARC stands for Automatic
-/// Reference Counting and is a system for managing reference counts for objects
-/// in Objective C.
-///
-/// The optimizations performed include elimination of redundant, partially
-/// redundant, and inconsequential reference count operations, elimination of
-/// redundant weak pointer operations, and numerous minor simplifications.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARCRuntimeEntryPoints.h"
-#include "BlotMapVector.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "ProvenanceAnalysis.h"
-#include "PtrState.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+//===- ObjCARCOpts.cpp - ObjC ARC Optimization ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file defines ObjC ARC optimizations. ARC stands for Automatic
+/// Reference Counting and is a system for managing reference counts for objects
+/// in Objective C.
+///
+/// The optimizations performed include elimination of redundant, partially
+/// redundant, and inconsequential reference count operations, elimination of
+/// redundant weak pointer operations, and numerous minor simplifications.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARCRuntimeEntryPoints.h"
+#include "BlotMapVector.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "ProvenanceAnalysis.h"
+#include "PtrState.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/ObjCARC.h"
-#include <cassert>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-opts"
-
-static cl::opt<unsigned> MaxPtrStates("arc-opt-max-ptr-states",
- cl::Hidden,
- cl::desc("Maximum number of ptr states the optimizer keeps track of"),
- cl::init(4095));
-
-/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
-/// @{
-
-/// This is similar to GetRCIdentityRoot but it stops as soon
-/// as it finds a value with multiple uses.
-static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
- // ConstantData (like ConstantPointerNull and UndefValue) is used across
- // modules. It's never a single-use value.
- if (isa<ConstantData>(Arg))
- return nullptr;
-
- if (Arg->hasOneUse()) {
- if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
- return FindSingleUseIdentifiedObject(BC->getOperand(0));
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
- if (GEP->hasAllZeroIndices())
- return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
- if (IsForwarding(GetBasicARCInstKind(Arg)))
- return FindSingleUseIdentifiedObject(
- cast<CallInst>(Arg)->getArgOperand(0));
- if (!IsObjCIdentifiedObject(Arg))
- return nullptr;
- return Arg;
- }
-
- // If we found an identifiable object but it has multiple uses, but they are
- // trivial uses, we can still consider this to be a single-use value.
- if (IsObjCIdentifiedObject(Arg)) {
- for (const User *U : Arg->users())
- if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
- return nullptr;
-
- return Arg;
- }
-
- return nullptr;
-}
-
-/// @}
-///
-/// \defgroup ARCOpt ARC Optimization.
-/// @{
-
-// TODO: On code like this:
-//
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-// stuff_that_cannot_release()
-// objc_retain(%x)
-// stuff_that_cannot_release()
-// objc_autorelease(%x)
-//
-// The second retain and autorelease can be deleted.
-
-// TODO: It should be possible to delete
-// objc_autoreleasePoolPush and objc_autoreleasePoolPop
-// pairs if nothing is actually autoreleased between them. Also, autorelease
-// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
-// after inlining) can be turned into plain release calls.
-
-// TODO: Critical-edge splitting. If the optimial insertion point is
-// a critical edge, the current algorithm has to fail, because it doesn't
-// know how to split edges. It should be possible to make the optimizer
-// think in terms of edges, rather than blocks, and then split critical
-// edges on demand.
-
-// TODO: OptimizeSequences could generalized to be Interprocedural.
-
-// TODO: Recognize that a bunch of other objc runtime calls have
-// non-escaping arguments and non-releasing arguments, and may be
-// non-autoreleasing.
-
-// TODO: Sink autorelease calls as far as possible. Unfortunately we
-// usually can't sink them past other calls, which would be the main
-// case where it would be useful.
-
-// TODO: The pointer returned from objc_loadWeakRetained is retained.
-
-// TODO: Delete release+retain pairs (rare).
-
-STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
-STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
-STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
-STATISTIC(NumRets, "Number of return value forwarding "
- "retain+autoreleases eliminated");
-STATISTIC(NumRRs, "Number of retain+release paths eliminated");
-STATISTIC(NumPeeps, "Number of calls peephole-optimized");
-#ifndef NDEBUG
-STATISTIC(NumRetainsBeforeOpt,
- "Number of retains before optimization");
-STATISTIC(NumReleasesBeforeOpt,
- "Number of releases before optimization");
-STATISTIC(NumRetainsAfterOpt,
- "Number of retains after optimization");
-STATISTIC(NumReleasesAfterOpt,
- "Number of releases after optimization");
-#endif
-
-namespace {
-
- /// Per-BasicBlock state.
- class BBState {
- /// The number of unique control paths from the entry which can reach this
- /// block.
- unsigned TopDownPathCount = 0;
-
- /// The number of unique control paths to exits from this block.
- unsigned BottomUpPathCount = 0;
-
- /// The top-down traversal uses this to record information known about a
- /// pointer at the bottom of each block.
- BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
-
- /// The bottom-up traversal uses this to record information known about a
- /// pointer at the top of each block.
- BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
-
- /// Effective predecessors of the current block ignoring ignorable edges and
- /// ignored backedges.
- SmallVector<BasicBlock *, 2> Preds;
-
- /// Effective successors of the current block ignoring ignorable edges and
- /// ignored backedges.
- SmallVector<BasicBlock *, 2> Succs;
-
- public:
- static const unsigned OverflowOccurredValue;
-
- BBState() = default;
-
- using top_down_ptr_iterator = decltype(PerPtrTopDown)::iterator;
- using const_top_down_ptr_iterator = decltype(PerPtrTopDown)::const_iterator;
-
- top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
- top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
- const_top_down_ptr_iterator top_down_ptr_begin() const {
- return PerPtrTopDown.begin();
- }
- const_top_down_ptr_iterator top_down_ptr_end() const {
- return PerPtrTopDown.end();
- }
- bool hasTopDownPtrs() const {
- return !PerPtrTopDown.empty();
- }
-
- unsigned top_down_ptr_list_size() const {
- return std::distance(top_down_ptr_begin(), top_down_ptr_end());
- }
-
- using bottom_up_ptr_iterator = decltype(PerPtrBottomUp)::iterator;
- using const_bottom_up_ptr_iterator =
- decltype(PerPtrBottomUp)::const_iterator;
-
- bottom_up_ptr_iterator bottom_up_ptr_begin() {
- return PerPtrBottomUp.begin();
- }
- bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
- const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
- return PerPtrBottomUp.begin();
- }
- const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
- return PerPtrBottomUp.end();
- }
- bool hasBottomUpPtrs() const {
- return !PerPtrBottomUp.empty();
- }
-
- unsigned bottom_up_ptr_list_size() const {
- return std::distance(bottom_up_ptr_begin(), bottom_up_ptr_end());
- }
-
- /// Mark this block as being an entry block, which has one path from the
- /// entry by definition.
- void SetAsEntry() { TopDownPathCount = 1; }
-
- /// Mark this block as being an exit block, which has one path to an exit by
- /// definition.
- void SetAsExit() { BottomUpPathCount = 1; }
-
- /// Attempt to find the PtrState object describing the top down state for
- /// pointer Arg. Return a new initialized PtrState describing the top down
- /// state for Arg if we do not find one.
- TopDownPtrState &getPtrTopDownState(const Value *Arg) {
- return PerPtrTopDown[Arg];
- }
-
- /// Attempt to find the PtrState object describing the bottom up state for
- /// pointer Arg. Return a new initialized PtrState describing the bottom up
- /// state for Arg if we do not find one.
- BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
- return PerPtrBottomUp[Arg];
- }
-
- /// Attempt to find the PtrState object describing the bottom up state for
- /// pointer Arg.
- bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
- return PerPtrBottomUp.find(Arg);
- }
-
- void clearBottomUpPointers() {
- PerPtrBottomUp.clear();
- }
-
- void clearTopDownPointers() {
- PerPtrTopDown.clear();
- }
-
- void InitFromPred(const BBState &Other);
- void InitFromSucc(const BBState &Other);
- void MergePred(const BBState &Other);
- void MergeSucc(const BBState &Other);
-
- /// Compute the number of possible unique paths from an entry to an exit
- /// which pass through this block. This is only valid after both the
- /// top-down and bottom-up traversals are complete.
- ///
- /// Returns true if overflow occurred. Returns false if overflow did not
- /// occur.
- bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
- if (TopDownPathCount == OverflowOccurredValue ||
- BottomUpPathCount == OverflowOccurredValue)
- return true;
- unsigned long long Product =
- (unsigned long long)TopDownPathCount*BottomUpPathCount;
- // Overflow occurred if any of the upper bits of Product are set or if all
- // the lower bits of Product are all set.
- return (Product >> 32) ||
- ((PathCount = Product) == OverflowOccurredValue);
- }
-
- // Specialized CFG utilities.
- using edge_iterator = SmallVectorImpl<BasicBlock *>::const_iterator;
-
- edge_iterator pred_begin() const { return Preds.begin(); }
- edge_iterator pred_end() const { return Preds.end(); }
- edge_iterator succ_begin() const { return Succs.begin(); }
- edge_iterator succ_end() const { return Succs.end(); }
-
- void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
- void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
-
- bool isExit() const { return Succs.empty(); }
- };
-
-} // end anonymous namespace
-
-const unsigned BBState::OverflowOccurredValue = 0xffffffff;
-
-namespace llvm {
-
-raw_ostream &operator<<(raw_ostream &OS,
- BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
-
-} // end namespace llvm
-
-void BBState::InitFromPred(const BBState &Other) {
- PerPtrTopDown = Other.PerPtrTopDown;
- TopDownPathCount = Other.TopDownPathCount;
-}
-
-void BBState::InitFromSucc(const BBState &Other) {
- PerPtrBottomUp = Other.PerPtrBottomUp;
- BottomUpPathCount = Other.BottomUpPathCount;
-}
-
-/// The top-down traversal uses this to merge information about predecessors to
-/// form the initial state for a new block.
-void BBState::MergePred(const BBState &Other) {
- if (TopDownPathCount == OverflowOccurredValue)
- return;
-
- // Other.TopDownPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- TopDownPathCount += Other.TopDownPathCount;
-
- // In order to be consistent, we clear the top down pointers when by adding
- // TopDownPathCount becomes OverflowOccurredValue even though "true" overflow
- // has not occurred.
- if (TopDownPathCount == OverflowOccurredValue) {
- clearTopDownPointers();
- return;
- }
-
- // Check for overflow. If we have overflow, fall back to conservative
- // behavior.
- if (TopDownPathCount < Other.TopDownPathCount) {
- TopDownPathCount = OverflowOccurredValue;
- clearTopDownPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the same key,
- // merge the entries. Otherwise, copy the entry and merge it with an empty
- // entry.
- for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
- MI != ME; ++MI) {
- auto Pair = PerPtrTopDown.insert(*MI);
- Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
- /*TopDown=*/true);
- }
-
- // For each entry in our set, if the other set doesn't have an entry with the
- // same key, force it to merge with an empty entry.
- for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
- if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
- MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
-}
-
-/// The bottom-up traversal uses this to merge information about successors to
-/// form the initial state for a new block.
-void BBState::MergeSucc(const BBState &Other) {
- if (BottomUpPathCount == OverflowOccurredValue)
- return;
-
- // Other.BottomUpPathCount can be 0, in which case it is either dead or a
- // loop backedge. Loop backedges are special.
- BottomUpPathCount += Other.BottomUpPathCount;
-
- // In order to be consistent, we clear the top down pointers when by adding
- // BottomUpPathCount becomes OverflowOccurredValue even though "true" overflow
- // has not occurred.
- if (BottomUpPathCount == OverflowOccurredValue) {
- clearBottomUpPointers();
- return;
- }
-
- // Check for overflow. If we have overflow, fall back to conservative
- // behavior.
- if (BottomUpPathCount < Other.BottomUpPathCount) {
- BottomUpPathCount = OverflowOccurredValue;
- clearBottomUpPointers();
- return;
- }
-
- // For each entry in the other set, if our set has an entry with the
- // same key, merge the entries. Otherwise, copy the entry and merge
- // it with an empty entry.
- for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
- MI != ME; ++MI) {
- auto Pair = PerPtrBottomUp.insert(*MI);
- Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
- /*TopDown=*/false);
- }
-
- // For each entry in our set, if the other set doesn't have an entry
- // with the same key, force it to merge with an empty entry.
- for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
- ++MI)
- if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
- MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
-}
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
- // Dump the pointers we are tracking.
- OS << " TopDown State:\n";
- if (!BBInfo.hasTopDownPtrs()) {
- LLVM_DEBUG(dbgs() << " NONE!\n");
- } else {
- for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
- I != E; ++I) {
- const PtrState &P = I->second;
- OS << " Ptr: " << *I->first
- << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
- << "\n ImpreciseRelease: "
- << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
- << " HasCFGHazards: "
- << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
- << " KnownPositive: "
- << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
- << " Seq: "
- << P.GetSeq() << "\n";
- }
- }
-
- OS << " BottomUp State:\n";
- if (!BBInfo.hasBottomUpPtrs()) {
- LLVM_DEBUG(dbgs() << " NONE!\n");
- } else {
- for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
- I != E; ++I) {
- const PtrState &P = I->second;
- OS << " Ptr: " << *I->first
- << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
- << "\n ImpreciseRelease: "
- << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
- << " HasCFGHazards: "
- << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
- << " KnownPositive: "
- << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
- << " Seq: "
- << P.GetSeq() << "\n";
- }
- }
-
- return OS;
-}
-
-namespace {
-
- /// The main ARC optimization pass.
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-opts"
+
+static cl::opt<unsigned> MaxPtrStates("arc-opt-max-ptr-states",
+ cl::Hidden,
+ cl::desc("Maximum number of ptr states the optimizer keeps track of"),
+ cl::init(4095));
+
+/// \defgroup ARCUtilities Utility declarations/definitions specific to ARC.
+/// @{
+
+/// This is similar to GetRCIdentityRoot but it stops as soon
+/// as it finds a value with multiple uses.
+static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
+ // ConstantData (like ConstantPointerNull and UndefValue) is used across
+ // modules. It's never a single-use value.
+ if (isa<ConstantData>(Arg))
+ return nullptr;
+
+ if (Arg->hasOneUse()) {
+ if (const BitCastInst *BC = dyn_cast<BitCastInst>(Arg))
+ return FindSingleUseIdentifiedObject(BC->getOperand(0));
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Arg))
+ if (GEP->hasAllZeroIndices())
+ return FindSingleUseIdentifiedObject(GEP->getPointerOperand());
+ if (IsForwarding(GetBasicARCInstKind(Arg)))
+ return FindSingleUseIdentifiedObject(
+ cast<CallInst>(Arg)->getArgOperand(0));
+ if (!IsObjCIdentifiedObject(Arg))
+ return nullptr;
+ return Arg;
+ }
+
+  // If we found an identifiable object that has multiple uses, all of them
+  // trivial, we can still consider this to be a single-use value.
+ if (IsObjCIdentifiedObject(Arg)) {
+ for (const User *U : Arg->users())
+ if (!U->use_empty() || GetRCIdentityRoot(U) != Arg)
+ return nullptr;
+
+ return Arg;
+ }
+
+ return nullptr;
+}
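+
+// A rough example: a single-use bitcast of an ObjC-identified object is looked
+// through and the underlying object is returned; once a value whose uses are
+// not all trivial is reached, the walk gives up and returns nullptr.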
+
+/// @}
+///
+/// \defgroup ARCOpt ARC Optimization.
+/// @{
+
+// TODO: On code like this:
+//
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+// stuff_that_cannot_release()
+// objc_retain(%x)
+// stuff_that_cannot_release()
+// objc_autorelease(%x)
+//
+// The second retain and autorelease can be deleted.
+
+// TODO: It should be possible to delete
+// objc_autoreleasePoolPush and objc_autoreleasePoolPop
+// pairs if nothing is actually autoreleased between them. Also, autorelease
+// calls followed by objc_autoreleasePoolPop calls (perhaps in ObjC++ code
+// after inlining) can be turned into plain release calls.
+
+// TODO: Critical-edge splitting. If the optimal insertion point is
+// a critical edge, the current algorithm has to fail, because it doesn't
+// know how to split edges. It should be possible to make the optimizer
+// think in terms of edges, rather than blocks, and then split critical
+// edges on demand.
+
+// TODO: OptimizeSequences could be generalized to be interprocedural.
+
+// TODO: Recognize that a bunch of other objc runtime calls have
+// non-escaping arguments and non-releasing arguments, and may be
+// non-autoreleasing.
+
+// TODO: Sink autorelease calls as far as possible. Unfortunately we
+// usually can't sink them past other calls, which would be the main
+// case where it would be useful.
+
+// TODO: The pointer returned from objc_loadWeakRetained is retained.
+
+// TODO: Delete release+retain pairs (rare).
+
+STATISTIC(NumNoops, "Number of no-op objc calls eliminated");
+STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
+STATISTIC(NumAutoreleases,"Number of autoreleases converted to releases");
+STATISTIC(NumRets, "Number of return value forwarding "
+ "retain+autoreleases eliminated");
+STATISTIC(NumRRs, "Number of retain+release paths eliminated");
+STATISTIC(NumPeeps, "Number of calls peephole-optimized");
+#ifndef NDEBUG
+STATISTIC(NumRetainsBeforeOpt,
+ "Number of retains before optimization");
+STATISTIC(NumReleasesBeforeOpt,
+ "Number of releases before optimization");
+STATISTIC(NumRetainsAfterOpt,
+ "Number of retains after optimization");
+STATISTIC(NumReleasesAfterOpt,
+ "Number of releases after optimization");
+#endif
+
+namespace {
+
+ /// Per-BasicBlock state.
+ class BBState {
+ /// The number of unique control paths from the entry which can reach this
+ /// block.
+ unsigned TopDownPathCount = 0;
+
+ /// The number of unique control paths to exits from this block.
+ unsigned BottomUpPathCount = 0;
+
+ /// The top-down traversal uses this to record information known about a
+ /// pointer at the bottom of each block.
+ BlotMapVector<const Value *, TopDownPtrState> PerPtrTopDown;
+
+ /// The bottom-up traversal uses this to record information known about a
+ /// pointer at the top of each block.
+ BlotMapVector<const Value *, BottomUpPtrState> PerPtrBottomUp;
+
+ /// Effective predecessors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Preds;
+
+ /// Effective successors of the current block ignoring ignorable edges and
+ /// ignored backedges.
+ SmallVector<BasicBlock *, 2> Succs;
+
+ public:
+ static const unsigned OverflowOccurredValue;
+
+ BBState() = default;
+
+ using top_down_ptr_iterator = decltype(PerPtrTopDown)::iterator;
+ using const_top_down_ptr_iterator = decltype(PerPtrTopDown)::const_iterator;
+
+ top_down_ptr_iterator top_down_ptr_begin() { return PerPtrTopDown.begin(); }
+ top_down_ptr_iterator top_down_ptr_end() { return PerPtrTopDown.end(); }
+ const_top_down_ptr_iterator top_down_ptr_begin() const {
+ return PerPtrTopDown.begin();
+ }
+ const_top_down_ptr_iterator top_down_ptr_end() const {
+ return PerPtrTopDown.end();
+ }
+ bool hasTopDownPtrs() const {
+ return !PerPtrTopDown.empty();
+ }
+
+ unsigned top_down_ptr_list_size() const {
+ return std::distance(top_down_ptr_begin(), top_down_ptr_end());
+ }
+
+ using bottom_up_ptr_iterator = decltype(PerPtrBottomUp)::iterator;
+ using const_bottom_up_ptr_iterator =
+ decltype(PerPtrBottomUp)::const_iterator;
+
+ bottom_up_ptr_iterator bottom_up_ptr_begin() {
+ return PerPtrBottomUp.begin();
+ }
+ bottom_up_ptr_iterator bottom_up_ptr_end() { return PerPtrBottomUp.end(); }
+ const_bottom_up_ptr_iterator bottom_up_ptr_begin() const {
+ return PerPtrBottomUp.begin();
+ }
+ const_bottom_up_ptr_iterator bottom_up_ptr_end() const {
+ return PerPtrBottomUp.end();
+ }
+ bool hasBottomUpPtrs() const {
+ return !PerPtrBottomUp.empty();
+ }
+
+ unsigned bottom_up_ptr_list_size() const {
+ return std::distance(bottom_up_ptr_begin(), bottom_up_ptr_end());
+ }
+
+ /// Mark this block as being an entry block, which has one path from the
+ /// entry by definition.
+ void SetAsEntry() { TopDownPathCount = 1; }
+
+ /// Mark this block as being an exit block, which has one path to an exit by
+ /// definition.
+ void SetAsExit() { BottomUpPathCount = 1; }
+
+ /// Attempt to find the PtrState object describing the top down state for
+ /// pointer Arg. Return a new initialized PtrState describing the top down
+ /// state for Arg if we do not find one.
+ TopDownPtrState &getPtrTopDownState(const Value *Arg) {
+ return PerPtrTopDown[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg. Return a new initialized PtrState describing the bottom up
+ /// state for Arg if we do not find one.
+ BottomUpPtrState &getPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp[Arg];
+ }
+
+ /// Attempt to find the PtrState object describing the bottom up state for
+ /// pointer Arg.
+ bottom_up_ptr_iterator findPtrBottomUpState(const Value *Arg) {
+ return PerPtrBottomUp.find(Arg);
+ }
+
+ void clearBottomUpPointers() {
+ PerPtrBottomUp.clear();
+ }
+
+ void clearTopDownPointers() {
+ PerPtrTopDown.clear();
+ }
+
+ void InitFromPred(const BBState &Other);
+ void InitFromSucc(const BBState &Other);
+ void MergePred(const BBState &Other);
+ void MergeSucc(const BBState &Other);
+
+ /// Compute the number of possible unique paths from an entry to an exit
+ /// which pass through this block. This is only valid after both the
+ /// top-down and bottom-up traversals are complete.
+ ///
+ /// Returns true if overflow occurred. Returns false if overflow did not
+ /// occur.
+ bool GetAllPathCountWithOverflow(unsigned &PathCount) const {
+ if (TopDownPathCount == OverflowOccurredValue ||
+ BottomUpPathCount == OverflowOccurredValue)
+ return true;
+ unsigned long long Product =
+ (unsigned long long)TopDownPathCount*BottomUpPathCount;
+      // Overflow occurred if any of the upper bits of Product are set, or if
+      // the lower bits of Product are all set.
+ return (Product >> 32) ||
+ ((PathCount = Product) == OverflowOccurredValue);
+ }
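+
+    // Worked example (illustrative): TopDownPathCount = 3 and
+    // BottomUpPathCount = 4 give PathCount = 12. If the 64-bit product needs
+    // more than 32 bits, or happens to equal OverflowOccurredValue
+    // (0xffffffff), the function reports overflow instead of a usable count.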
+
+ // Specialized CFG utilities.
+ using edge_iterator = SmallVectorImpl<BasicBlock *>::const_iterator;
+
+ edge_iterator pred_begin() const { return Preds.begin(); }
+ edge_iterator pred_end() const { return Preds.end(); }
+ edge_iterator succ_begin() const { return Succs.begin(); }
+ edge_iterator succ_end() const { return Succs.end(); }
+
+ void addSucc(BasicBlock *Succ) { Succs.push_back(Succ); }
+ void addPred(BasicBlock *Pred) { Preds.push_back(Pred); }
+
+ bool isExit() const { return Succs.empty(); }
+ };
+
+} // end anonymous namespace
+
+const unsigned BBState::OverflowOccurredValue = 0xffffffff;
+
+namespace llvm {
+
+raw_ostream &operator<<(raw_ostream &OS,
+ BBState &BBState) LLVM_ATTRIBUTE_UNUSED;
+
+} // end namespace llvm
+
+void BBState::InitFromPred(const BBState &Other) {
+ PerPtrTopDown = Other.PerPtrTopDown;
+ TopDownPathCount = Other.TopDownPathCount;
+}
+
+void BBState::InitFromSucc(const BBState &Other) {
+ PerPtrBottomUp = Other.PerPtrBottomUp;
+ BottomUpPathCount = Other.BottomUpPathCount;
+}
+
+/// The top-down traversal uses this to merge information about predecessors to
+/// form the initial state for a new block.
+void BBState::MergePred(const BBState &Other) {
+ if (TopDownPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.TopDownPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ TopDownPathCount += Other.TopDownPathCount;
+
+ // In order to be consistent, we clear the top down pointers when, as a result
+ // of this addition, TopDownPathCount becomes OverflowOccurredValue even
+ // though "true" overflow has not occurred.
+ if (TopDownPathCount == OverflowOccurredValue) {
+ clearTopDownPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
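+ // Because the path counts are unsigned, wraparound shows up as the updated
+ // TopDownPathCount being smaller than the value that was just added to it.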
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ TopDownPathCount = OverflowOccurredValue;
+ clearTopDownPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the same key,
+ // merge the entries. Otherwise, copy the entry and merge it with an empty
+ // entry.
+ for (auto MI = Other.top_down_ptr_begin(), ME = Other.top_down_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrTopDown.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? TopDownPtrState() : MI->second,
+ /*TopDown=*/true);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry with the
+ // same key, force it to merge with an empty entry.
+ for (auto MI = top_down_ptr_begin(), ME = top_down_ptr_end(); MI != ME; ++MI)
+ if (Other.PerPtrTopDown.find(MI->first) == Other.PerPtrTopDown.end())
+ MI->second.Merge(TopDownPtrState(), /*TopDown=*/true);
+}
+
+/// The bottom-up traversal uses this to merge information about successors to
+/// form the initial state for a new block.
+void BBState::MergeSucc(const BBState &Other) {
+ if (BottomUpPathCount == OverflowOccurredValue)
+ return;
+
+ // Other.BottomUpPathCount can be 0, in which case it is either dead or a
+ // loop backedge. Loop backedges are special.
+ BottomUpPathCount += Other.BottomUpPathCount;
+
+ // In order to be consistent, we clear the bottom up pointers when, as a
+ // result of this addition, BottomUpPathCount becomes OverflowOccurredValue
+ // even though "true" overflow has not occurred.
+ if (BottomUpPathCount == OverflowOccurredValue) {
+ clearBottomUpPointers();
+ return;
+ }
+
+ // Check for overflow. If we have overflow, fall back to conservative
+ // behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ BottomUpPathCount = OverflowOccurredValue;
+ clearBottomUpPointers();
+ return;
+ }
+
+ // For each entry in the other set, if our set has an entry with the
+ // same key, merge the entries. Otherwise, copy the entry and merge
+ // it with an empty entry.
+ for (auto MI = Other.bottom_up_ptr_begin(), ME = Other.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ auto Pair = PerPtrBottomUp.insert(*MI);
+ Pair.first->second.Merge(Pair.second ? BottomUpPtrState() : MI->second,
+ /*TopDown=*/false);
+ }
+
+ // For each entry in our set, if the other set doesn't have an entry
+ // with the same key, force it to merge with an empty entry.
+ for (auto MI = bottom_up_ptr_begin(), ME = bottom_up_ptr_end(); MI != ME;
+ ++MI)
+ if (Other.PerPtrBottomUp.find(MI->first) == Other.PerPtrBottomUp.end())
+ MI->second.Merge(BottomUpPtrState(), /*TopDown=*/false);
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, BBState &BBInfo) {
+ // Dump the pointers we are tracking.
+ OS << " TopDown State:\n";
+ if (!BBInfo.hasTopDownPtrs()) {
+ LLVM_DEBUG(dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.top_down_ptr_begin(), E = BBInfo.top_down_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ OS << " BottomUp State:\n";
+ if (!BBInfo.hasBottomUpPtrs()) {
+ LLVM_DEBUG(dbgs() << " NONE!\n");
+ } else {
+ for (auto I = BBInfo.bottom_up_ptr_begin(), E = BBInfo.bottom_up_ptr_end();
+ I != E; ++I) {
+ const PtrState &P = I->second;
+ OS << " Ptr: " << *I->first
+ << "\n KnownSafe: " << (P.IsKnownSafe()?"true":"false")
+ << "\n ImpreciseRelease: "
+ << (P.IsTrackingImpreciseReleases()?"true":"false") << "\n"
+ << " HasCFGHazards: "
+ << (P.IsCFGHazardAfflicted()?"true":"false") << "\n"
+ << " KnownPositive: "
+ << (P.HasKnownPositiveRefCount()?"true":"false") << "\n"
+ << " Seq: "
+ << P.GetSeq() << "\n";
+ }
+ }
+
+ return OS;
+}
+
+namespace {
+
+ /// The implementation of the main ARC optimization.
class ObjCARCOpt {
bool Changed;
ProvenanceAnalysis PA;
-
+
/// A cache of references to runtime entry point constants.
ARCRuntimeEntryPoints EP;
-
+
/// A cache of MDKinds that can be passed into other functions to propagate
/// MDKind identifiers.
ARCMDKindCache MDKindCache;
-
+
/// A flag indicating whether this optimization pass should run.
bool Run;
-
+
/// A flag indicating whether the optimization that removes or moves
/// retain/release pairs should be performed.
bool DisableRetainReleasePairing = false;
-
+
/// Flags which determine whether each of the interesting runtime functions
/// is in fact used in the current function.
unsigned UsedInThisFunction;
-
+
bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV);
void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV,
ARCInstKind &Class);
void OptimizeIndividualCalls(Function &F);
-
+
/// Optimize an individual call, optionally passing the result of
/// GetArgRCIdentityRoot if it has already been computed.
void OptimizeIndividualCallImpl(
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, ARCInstKind Class, const Value *Arg);
-
+
/// Try to optimize an AutoreleaseRV with a RetainRV or ClaimRV. If the
/// optimization occurs, returns true to indicate that the caller should
/// assume the instructions are dead.
@@ -521,7 +521,7 @@ class ObjCARCOpt {
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, const Value *&Arg, ARCInstKind Class,
Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg);
-
+
void CheckForCFGHazards(const BasicBlock *BB,
DenseMap<const BasicBlock *, BBState> &BBStates,
BBState &MyStates) const;
@@ -540,12 +540,12 @@ class ObjCARCOpt {
bool Visit(Function &F, DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases);
-
+
void MoveCalls(Value *Arg, RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases,
SmallVectorImpl<Instruction *> &DeadInsts, Module *M);
-
+
bool PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M,
@@ -554,27 +554,27 @@ class ObjCARCOpt {
RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
Value *Arg, bool KnownSafe,
bool &AnyPairsCompletelyEliminated);
-
+
bool PerformCodePlacement(DenseMap<const BasicBlock *, BBState> &BBStates,
BlotMapVector<Value *, RRInfo> &Retains,
DenseMap<Value *, RRInfo> &Releases, Module *M);
-
+
void OptimizeWeakCalls(Function &F);
-
+
bool OptimizeSequences(Function &F);
-
+
void OptimizeReturns(Function &F);
-
-#ifndef NDEBUG
+
+#ifndef NDEBUG
void GatherStatistics(Function &F, bool AfterOptimization = false);
-#endif
-
- public:
+#endif
+
+ public:
void init(Module &M);
bool run(Function &F, AAResults &AA);
void releaseMemory();
};
-
+
/// The main ARC optimization pass.
class ObjCARCOptLegacyPass : public FunctionPass {
public:
@@ -591,1876 +591,1876 @@ public:
}
void releaseMemory() override { OCAO.releaseMemory(); }
static char ID;
-
+
private:
ObjCARCOpt OCAO;
};
-} // end anonymous namespace
-
+} // end anonymous namespace
+
char ObjCARCOptLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(ObjCARCOptLegacyPass, "objc-arc", "ObjC ARC optimization",
false, false)
-INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ObjCARCAAWrapperPass)
INITIALIZE_PASS_END(ObjCARCOptLegacyPass, "objc-arc", "ObjC ARC optimization",
false, false)
-
+
Pass *llvm::createObjCARCOptPass() { return new ObjCARCOptLegacyPass(); }
-
+
void ObjCARCOptLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<ObjCARCAAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- // ARC optimization doesn't currently split critical edges.
- AU.setPreservesCFG();
-}
-
-/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
-/// not a return value.
-bool
-ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
- // Check for the argument being from an immediately preceding call or invoke.
- const Value *Arg = GetArgRCIdentityRoot(RetainRV);
- if (const Instruction *Call = dyn_cast<CallBase>(Arg)) {
- if (Call->getParent() == RetainRV->getParent()) {
- BasicBlock::const_iterator I(Call);
- ++I;
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I == RetainRV)
- return false;
- } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- BasicBlock *RetainRVParent = RetainRV->getParent();
- if (II->getNormalDest() == RetainRVParent) {
- BasicBlock::const_iterator I = RetainRVParent->begin();
- while (IsNoopInstruction(&*I))
- ++I;
- if (&*I == RetainRV)
- return false;
- }
- }
- }
-
- // Turn it into a plain objc_retain.
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
- "objc_retain since the operand is not a return value.\n"
- "Old = "
- << *RetainRV << "\n");
-
- Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
- cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
-
- LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
-
- return false;
-}
-
-bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
- Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
- Instruction *Inst, const Value *&Arg, ARCInstKind Class,
- Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) {
- // Must be in the same basic block.
- assert(Inst->getParent() == AutoreleaseRV->getParent());
-
- // Must operate on the same root.
- Arg = GetArgRCIdentityRoot(Inst);
- AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV);
- if (Arg != AutoreleaseRVArg) {
- // If there isn't an exact match, check if we have equivalent PHIs.
- const PHINode *PN = dyn_cast<PHINode>(Arg);
- if (!PN)
- return false;
-
- SmallVector<const Value *, 4> ArgUsers;
- getEquivalentPHIs(*PN, ArgUsers);
+ AU.addRequired<ObjCARCAAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ // ARC optimization doesn't currently split critical edges.
+ AU.setPreservesCFG();
+}
+
+/// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is
+/// not a return value.
+bool
+ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
+ // Check for the argument being from an immediately preceding call or invoke.
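+ // We walk forward from that call, skipping no-op instructions, to see whether
+ // this retainRV immediately follows it; if it does, the operand really is a
+ // just-returned value and the retainRV is left alone.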
+ const Value *Arg = GetArgRCIdentityRoot(RetainRV);
+ if (const Instruction *Call = dyn_cast<CallBase>(Arg)) {
+ if (Call->getParent() == RetainRV->getParent()) {
+ BasicBlock::const_iterator I(Call);
+ ++I;
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ BasicBlock *RetainRVParent = RetainRV->getParent();
+ if (II->getNormalDest() == RetainRVParent) {
+ BasicBlock::const_iterator I = RetainRVParent->begin();
+ while (IsNoopInstruction(&*I))
+ ++I;
+ if (&*I == RetainRV)
+ return false;
+ }
+ }
+ }
+
+ // Turn it into a plain objc_retain.
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => "
+ "objc_retain since the operand is not a return value.\n"
+ "Old = "
+ << *RetainRV << "\n");
+
+ Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ cast<CallInst>(RetainRV)->setCalledFunction(NewDecl);
+
+ LLVM_DEBUG(dbgs() << "New = " << *RetainRV << "\n");
+
+ return false;
+}
+
+bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
+ Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ Instruction *Inst, const Value *&Arg, ARCInstKind Class,
+ Instruction *AutoreleaseRV, const Value *&AutoreleaseRVArg) {
+ // Must be in the same basic block.
+ assert(Inst->getParent() == AutoreleaseRV->getParent());
+
+ // Must operate on the same root.
+ Arg = GetArgRCIdentityRoot(Inst);
+ AutoreleaseRVArg = GetArgRCIdentityRoot(AutoreleaseRV);
+ if (Arg != AutoreleaseRVArg) {
+ // If there isn't an exact match, check if we have equivalent PHIs.
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN)
+ return false;
+
+ SmallVector<const Value *, 4> ArgUsers;
+ getEquivalentPHIs(*PN, ArgUsers);
if (!llvm::is_contained(ArgUsers, AutoreleaseRVArg))
- return false;
- }
-
- // Okay, this is a match. Merge them.
- ++NumPeeps;
- LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '"
- << *AutoreleaseRV << "' paired with '" << *Inst << "'\n");
-
- // Delete the RV pair, starting with the AutoreleaseRV.
- AutoreleaseRV->replaceAllUsesWith(
- cast<CallInst>(AutoreleaseRV)->getArgOperand(0));
- Changed = true;
- EraseInstruction(AutoreleaseRV);
- if (Class == ARCInstKind::RetainRV) {
- // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV.
- Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
- EraseInstruction(Inst);
- return true;
- }
-
- // ClaimRV is a frontend peephole for RetainRV + Release. Since the
- // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
- assert(Class == ARCInstKind::ClaimRV);
- Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
- CallInst *Release = CallInst::Create(
- EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
- assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
- "Expected ClaimRV to be safe to tail call");
- Release->setTailCall();
- Inst->replaceAllUsesWith(CallArg);
- EraseInstruction(Inst);
-
- // Run the normal optimizations on Release.
- OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release,
- Arg);
- return true;
-}
-
-/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
-/// used as a return value.
-void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
- Instruction *AutoreleaseRV,
- ARCInstKind &Class) {
- // Check for a return of the pointer value.
- const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
-
- // If the argument is ConstantPointerNull or UndefValue, its other users
- // aren't actually interesting to look at.
- if (isa<ConstantData>(Ptr))
- return;
-
- SmallVector<const Value *, 2> Users;
- Users.push_back(Ptr);
-
- // Add PHIs that are equivalent to Ptr to Users.
- if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
- getEquivalentPHIs(*PN, Users);
-
- do {
- Ptr = Users.pop_back_val();
- for (const User *U : Ptr->users()) {
- if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
- return;
- if (isa<BitCastInst>(U))
- Users.push_back(U);
- }
- } while (!Users.empty());
-
- Changed = true;
- ++NumPeeps;
-
- LLVM_DEBUG(
- dbgs() << "Transforming objc_autoreleaseReturnValue => "
- "objc_autorelease since its operand is not used as a return "
- "value.\n"
- "Old = "
- << *AutoreleaseRV << "\n");
-
- CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
- Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
- AutoreleaseRVCI->setCalledFunction(NewDecl);
- AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
- Class = ARCInstKind::Autorelease;
-
- LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
-}
-
-namespace {
-Instruction *
-CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
- const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
- auto Bundle = CI.getOperandBundleAt(I);
- // Funclets will be reassociated in the future.
- if (Bundle.getTagID() == LLVMContext::OB_funclet)
- continue;
- OpBundles.emplace_back(Bundle);
- }
-
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(&BB)->second;
- assert(CV.size() == 1 && "non-unique color for block!");
- Instruction *EHPad = CV.front()->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- return CallInst::Create(&CI, OpBundles);
-}
-}
-
-/// Visit each call, one at a time, and make simplifications without doing any
-/// additional analysis.
-void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
- // Reset all the flags in preparation for recomputing them.
- UsedInThisFunction = 0;
-
- DenseMap<BasicBlock *, ColorVector> BlockColors;
- if (F.hasPersonalityFn() &&
- isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
- BlockColors = colorEHFunclets(F);
-
- // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
- // with RetainRV and ClaimRV.
- Instruction *DelayedAutoreleaseRV = nullptr;
- const Value *DelayedAutoreleaseRVArg = nullptr;
- auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
- assert(!DelayedAutoreleaseRV || !AutoreleaseRV);
- DelayedAutoreleaseRV = AutoreleaseRV;
- DelayedAutoreleaseRVArg = nullptr;
- };
- auto optimizeDelayedAutoreleaseRV = [&]() {
- if (!DelayedAutoreleaseRV)
- return;
- OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV,
- ARCInstKind::AutoreleaseRV,
- DelayedAutoreleaseRVArg);
- setDelayedAutoreleaseRV(nullptr);
- };
- auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) {
- // Nothing to delay, but we may as well skip the logic below.
- if (!DelayedAutoreleaseRV)
- return true;
-
- // If we hit the end of the basic block we're not going to find an RV-pair.
- // Stop delaying.
- if (NonARCInst->isTerminator())
- return false;
-
- // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
- // ClaimRV, it's probably safe to skip over even opaque function calls
- // here since OptimizeInlinedAutoreleaseRVCall will confirm that they
- // have the same RCIdentityRoot. However, what really matters is
- // skipping instructions or intrinsics that the inliner could leave behind;
- // be conservative for now and don't skip over opaque calls, which could
- // potentially include other ARC calls.
- auto *CB = dyn_cast<CallBase>(NonARCInst);
- if (!CB)
- return true;
- return CB->getIntrinsicID() != Intrinsic::not_intrinsic;
- };
-
- // Visit all objc_* calls in F.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
-
- ARCInstKind Class = GetBasicARCInstKind(Inst);
-
- // Skip this loop if this instruction isn't itself an ARC intrinsic.
- const Value *Arg = nullptr;
- switch (Class) {
- default:
- optimizeDelayedAutoreleaseRV();
- break;
- case ARCInstKind::CallOrUser:
- case ARCInstKind::User:
- case ARCInstKind::None:
- // This is a non-ARC instruction. If we're delaying an AutoreleaseRV,
- // check if it's safe to skip over it; if not, optimize the AutoreleaseRV
- // now.
- if (!shouldDelayAutoreleaseRV(Inst))
- optimizeDelayedAutoreleaseRV();
- continue;
- case ARCInstKind::AutoreleaseRV:
- optimizeDelayedAutoreleaseRV();
- setDelayedAutoreleaseRV(Inst);
- continue;
- case ARCInstKind::RetainRV:
- case ARCInstKind::ClaimRV:
- if (DelayedAutoreleaseRV) {
- // We have a potential RV pair. Check if they cancel out.
- if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
- DelayedAutoreleaseRV,
- DelayedAutoreleaseRVArg)) {
- setDelayedAutoreleaseRV(nullptr);
- continue;
- }
- optimizeDelayedAutoreleaseRV();
- }
- break;
- }
-
- OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg);
- }
-
- // Catch the final delayed AutoreleaseRV.
- optimizeDelayedAutoreleaseRV();
-}
-
-/// This function returns true if the value is inert. An ObjC ARC runtime call
-/// taking an inert operand can be safely deleted.
-static bool isInertARCValue(Value *V, SmallPtrSet<Value *, 1> &VisitedPhis) {
- V = V->stripPointerCasts();
-
- if (IsNullOrUndef(V))
- return true;
-
- // See if this is a global variable annotated with the 'objc_arc_inert'
- // attribute.
- if (auto *GV = dyn_cast<GlobalVariable>(V))
- if (GV->hasAttribute("objc_arc_inert"))
- return true;
-
- if (auto PN = dyn_cast<PHINode>(V)) {
- // Ignore this phi if it has already been discovered.
- if (!VisitedPhis.insert(PN).second)
- return true;
- // Look through the phi's incoming values.
- for (Value *Opnd : PN->incoming_values())
- if (!isInertARCValue(Opnd, VisitedPhis))
- return false;
- return true;
- }
-
- return false;
-}
-
-void ObjCARCOpt::OptimizeIndividualCallImpl(
- Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
- Instruction *Inst, ARCInstKind Class, const Value *Arg) {
- LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
-
- // We can delete this call if it takes an inert value.
- SmallPtrSet<Value *, 1> VisitedPhis;
-
- if (IsNoopOnGlobal(Class))
- if (isInertARCValue(Inst->getOperand(0), VisitedPhis)) {
- if (!Inst->getType()->isVoidTy())
- Inst->replaceAllUsesWith(Inst->getOperand(0));
- Inst->eraseFromParent();
- Changed = true;
- return;
- }
-
- switch (Class) {
- default:
- break;
-
- // Delete no-op casts. These function calls have special semantics, but
- // the semantics are entirely implemented via lowering in the front-end,
- // so by the time they reach the optimizer, they are just no-op calls
- // which return their argument.
- //
- // There are gray areas here, as the ability to cast reference-counted
- // pointers to raw void* and back allows code to break ARC assumptions,
- // however these are currently considered to be unimportant.
- case ARCInstKind::NoopCast:
- Changed = true;
- ++NumNoops;
- LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
- EraseInstruction(Inst);
- return;
-
- // If the pointer-to-weak-pointer is null, it's undefined behavior.
- case ARCInstKind::StoreWeak:
- case ARCInstKind::LoadWeak:
- case ARCInstKind::LoadWeakRetained:
- case ARCInstKind::InitWeak:
- case ARCInstKind::DestroyWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(0))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
- Value *NewValue = UndefValue::get(CI->getType());
- LLVM_DEBUG(
- dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = "
- << *CI << "\nNew = " << *NewValue << "\n");
- CI->replaceAllUsesWith(NewValue);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- case ARCInstKind::CopyWeak:
- case ARCInstKind::MoveWeak: {
- CallInst *CI = cast<CallInst>(Inst);
- if (IsNullOrUndef(CI->getArgOperand(0)) ||
- IsNullOrUndef(CI->getArgOperand(1))) {
- Changed = true;
- Type *Ty = CI->getArgOperand(0)->getType();
- new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
- Constant::getNullValue(Ty), CI);
-
- Value *NewValue = UndefValue::get(CI->getType());
- LLVM_DEBUG(
- dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
- "\nOld = "
- << *CI << "\nNew = " << *NewValue << "\n");
-
- CI->replaceAllUsesWith(NewValue);
- CI->eraseFromParent();
- return;
- }
- break;
- }
- case ARCInstKind::RetainRV:
- if (OptimizeRetainRVCall(F, Inst))
- return;
- break;
- case ARCInstKind::AutoreleaseRV:
- OptimizeAutoreleaseRVCall(F, Inst, Class);
- break;
- }
-
- // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
- if (IsAutorelease(Class) && Inst->use_empty()) {
- CallInst *Call = cast<CallInst>(Inst);
- const Value *Arg = Call->getArgOperand(0);
- Arg = FindSingleUseIdentifiedObject(Arg);
- if (Arg) {
- Changed = true;
- ++NumAutoreleases;
-
- // Create the declaration lazily.
- LLVMContext &C = Inst->getContext();
-
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
- CallInst *NewCall =
- CallInst::Create(Decl, Call->getArgOperand(0), "", Call);
- NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
- MDNode::get(C, None));
-
- LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
- "since x is otherwise unused.\nOld: "
- << *Call << "\nNew: " << *NewCall << "\n");
-
- EraseInstruction(Call);
- Inst = NewCall;
- Class = ARCInstKind::Release;
- }
- }
-
- // For functions which can never be passed stack arguments, add
- // a tail keyword.
- if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) {
- Changed = true;
- LLVM_DEBUG(
- dbgs() << "Adding tail keyword to function since it can never be "
- "passed stack args: "
- << *Inst << "\n");
- cast<CallInst>(Inst)->setTailCall();
- }
-
- // Ensure that functions that can never have a "tail" keyword due to the
- // semantics of ARC truly do not do so.
- if (IsNeverTail(Class)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
- << "\n");
- cast<CallInst>(Inst)->setTailCall(false);
- }
-
- // Set nounwind as needed.
- if (IsNoThrow(Class)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
- << "\n");
- cast<CallInst>(Inst)->setDoesNotThrow();
- }
-
- // Note: This catches instructions unrelated to ARC.
- if (!IsNoopOnNull(Class)) {
- UsedInThisFunction |= 1 << unsigned(Class);
- return;
- }
-
- // If we haven't already looked up the root, look it up now.
- if (!Arg)
- Arg = GetArgRCIdentityRoot(Inst);
-
- // ARC calls with null are no-ops. Delete them.
- if (IsNullOrUndef(Arg)) {
- Changed = true;
- ++NumNoops;
- LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
- << "\n");
- EraseInstruction(Inst);
- return;
- }
-
- // Keep track of which of retain, release, autorelease, and retain_block
- // are actually present in this function.
- UsedInThisFunction |= 1 << unsigned(Class);
-
- // If Arg is a PHI, one or more of the PHI's incoming values are null, the
- // call is control-equivalent to the PHI, there are no relevant side effects
- // between the PHI and the call, and the call is either not a release or is a
- // release carrying the clang.imprecise_release tag, then the call could be
- // pushed up to just those paths with non-null incoming values. For now,
- // don't bother splitting critical edges for this.
- if (Class == ARCInstKind::Release &&
- !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease)))
- return;
-
- SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
- Worklist.push_back(std::make_pair(Inst, Arg));
- do {
- std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
- Inst = Pair.first;
- Arg = Pair.second;
-
- const PHINode *PN = dyn_cast<PHINode>(Arg);
- if (!PN)
- continue;
-
- // Determine if the PHI has any null operands, or any incoming
- // critical edges.
- bool HasNull = false;
- bool HasCriticalEdges = false;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
- if (IsNullOrUndef(Incoming))
- HasNull = true;
- else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
- 1) {
- HasCriticalEdges = true;
- break;
- }
- }
- // If we have null operands and no critical edges, optimize.
- if (HasCriticalEdges)
- continue;
- if (!HasNull)
- continue;
-
+ return false;
+ }
+
+ // Okay, this is a match. Merge them.
+ ++NumPeeps;
+ LLVM_DEBUG(dbgs() << "Found inlined objc_autoreleaseReturnValue '"
+ << *AutoreleaseRV << "' paired with '" << *Inst << "'\n");
+
+ // Delete the RV pair, starting with the AutoreleaseRV.
+ AutoreleaseRV->replaceAllUsesWith(
+ cast<CallInst>(AutoreleaseRV)->getArgOperand(0));
+ Changed = true;
+ EraseInstruction(AutoreleaseRV);
+ if (Class == ARCInstKind::RetainRV) {
+ // AutoreleaseRV and RetainRV cancel out. Delete the RetainRV.
+ Inst->replaceAllUsesWith(cast<CallInst>(Inst)->getArgOperand(0));
+ EraseInstruction(Inst);
+ return true;
+ }
+
+ // ClaimRV is a frontend peephole for RetainRV + Release. Since the
+ // AutoreleaseRV and RetainRV cancel out, replace the ClaimRV with a Release.
+ assert(Class == ARCInstKind::ClaimRV);
+ Value *CallArg = cast<CallInst>(Inst)->getArgOperand(0);
+ CallInst *Release = CallInst::Create(
+ EP.get(ARCRuntimeEntryPointKind::Release), CallArg, "", Inst);
+ assert(IsAlwaysTail(ARCInstKind::ClaimRV) &&
+ "Expected ClaimRV to be safe to tail call");
+ Release->setTailCall();
+ Inst->replaceAllUsesWith(CallArg);
+ EraseInstruction(Inst);
+
+ // Run the normal optimizations on Release.
+ OptimizeIndividualCallImpl(F, BlockColors, Release, ARCInstKind::Release,
+ Arg);
+ return true;
+}
+
+/// Turn objc_autoreleaseReturnValue into objc_autorelease if the result is not
+/// used as a return value.
+void ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F,
+ Instruction *AutoreleaseRV,
+ ARCInstKind &Class) {
+ // Check for a return of the pointer value.
+ const Value *Ptr = GetArgRCIdentityRoot(AutoreleaseRV);
+
+ // If the argument is ConstantPointerNull or UndefValue, its other users
+ // aren't actually interesting to look at.
+ if (isa<ConstantData>(Ptr))
+ return;
+
+ SmallVector<const Value *, 2> Users;
+ Users.push_back(Ptr);
+
+ // Add PHIs that are equivalent to Ptr to Users.
+ if (const PHINode *PN = dyn_cast<PHINode>(Ptr))
+ getEquivalentPHIs(*PN, Users);
+
+ do {
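+ // Walk the users, looking through bitcasts and the equivalent PHIs collected
+ // above. If any user is a return or a retainRV, the result is being used as
+ // a return value, so leave the autoreleaseRV alone.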
+ Ptr = Users.pop_back_val();
+ for (const User *U : Ptr->users()) {
+ if (isa<ReturnInst>(U) || GetBasicARCInstKind(U) == ARCInstKind::RetainRV)
+ return;
+ if (isa<BitCastInst>(U))
+ Users.push_back(U);
+ }
+ } while (!Users.empty());
+
+ Changed = true;
+ ++NumPeeps;
+
+ LLVM_DEBUG(
+ dbgs() << "Transforming objc_autoreleaseReturnValue => "
+ "objc_autorelease since its operand is not used as a return "
+ "value.\n"
+ "Old = "
+ << *AutoreleaseRV << "\n");
+
+ CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV);
+ Function *NewDecl = EP.get(ARCRuntimeEntryPointKind::Autorelease);
+ AutoreleaseRVCI->setCalledFunction(NewDecl);
+ AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease.
+ Class = ARCInstKind::Autorelease;
+
+ LLVM_DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n");
+}
+
+namespace {
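+// Clone CI for insertion into BB: copy its operand bundles except any funclet
+// bundle, then, if the function uses a scoped (Windows) EH personality, attach
+// a funclet bundle naming BB's own EH pad color.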
+Instruction *
+CloneCallInstForBB(CallInst &CI, BasicBlock &BB,
+ const DenseMap<BasicBlock *, ColorVector> &BlockColors) {
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned I = 0, E = CI.getNumOperandBundles(); I != E; ++I) {
+ auto Bundle = CI.getOperandBundleAt(I);
+ // Funclets will be reassociated in the future.
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&BB)->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ return CallInst::Create(&CI, OpBundles);
+}
+}
+
+/// Visit each call, one at a time, and make simplifications without doing any
+/// additional analysis.
+void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n");
+ // Reset all the flags in preparation for recomputing them.
+ UsedInThisFunction = 0;
+
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
+ // Store any delayed AutoreleaseRV intrinsics, so they can be easily paired
+ // with RetainRV and ClaimRV.
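+ // The shape being matched is, roughly:
+ // %v = call i8* @objc_autoreleaseReturnValue(i8* %x) ; from an inlined callee
+ // %r = call i8* @objc_retainAutoreleasedReturnValue(i8* %v)
+ // where the two calls can cancel each other out.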
+ Instruction *DelayedAutoreleaseRV = nullptr;
+ const Value *DelayedAutoreleaseRVArg = nullptr;
+ auto setDelayedAutoreleaseRV = [&](Instruction *AutoreleaseRV) {
+ assert(!DelayedAutoreleaseRV || !AutoreleaseRV);
+ DelayedAutoreleaseRV = AutoreleaseRV;
+ DelayedAutoreleaseRVArg = nullptr;
+ };
+ auto optimizeDelayedAutoreleaseRV = [&]() {
+ if (!DelayedAutoreleaseRV)
+ return;
+ OptimizeIndividualCallImpl(F, BlockColors, DelayedAutoreleaseRV,
+ ARCInstKind::AutoreleaseRV,
+ DelayedAutoreleaseRVArg);
+ setDelayedAutoreleaseRV(nullptr);
+ };
+ auto shouldDelayAutoreleaseRV = [&](Instruction *NonARCInst) {
+ // Nothing to delay, but we may as well skip the logic below.
+ if (!DelayedAutoreleaseRV)
+ return true;
+
+ // If we hit the end of the basic block we're not going to find an RV-pair.
+ // Stop delaying.
+ if (NonARCInst->isTerminator())
+ return false;
+
+ // Given the frontend rules for emitting AutoreleaseRV, RetainRV, and
+ // ClaimRV, it's probably safe to skip over even opaque function calls
+ // here since OptimizeInlinedAutoreleaseRVCall will confirm that they
+ // have the same RCIdentityRoot. However, what really matters is
+ // skipping instructions or intrinsics that the inliner could leave behind;
+ // be conservative for now and don't skip over opaque calls, which could
+ // potentially include other ARC calls.
+ auto *CB = dyn_cast<CallBase>(NonARCInst);
+ if (!CB)
+ return true;
+ return CB->getIntrinsicID() != Intrinsic::not_intrinsic;
+ };
+
+ // Visit all objc_* calls in F.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+
+ // Skip this loop if this instruction isn't itself an ARC intrinsic.
+ const Value *Arg = nullptr;
+ switch (Class) {
+ default:
+ optimizeDelayedAutoreleaseRV();
+ break;
+ case ARCInstKind::CallOrUser:
+ case ARCInstKind::User:
+ case ARCInstKind::None:
+ // This is a non-ARC instruction. If we're delaying an AutoreleaseRV,
+ // check if it's safe to skip over it; if not, optimize the AutoreleaseRV
+ // now.
+ if (!shouldDelayAutoreleaseRV(Inst))
+ optimizeDelayedAutoreleaseRV();
+ continue;
+ case ARCInstKind::AutoreleaseRV:
+ optimizeDelayedAutoreleaseRV();
+ setDelayedAutoreleaseRV(Inst);
+ continue;
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::ClaimRV:
+ if (DelayedAutoreleaseRV) {
+ // We have a potential RV pair. Check if they cancel out.
+ if (OptimizeInlinedAutoreleaseRVCall(F, BlockColors, Inst, Arg, Class,
+ DelayedAutoreleaseRV,
+ DelayedAutoreleaseRVArg)) {
+ setDelayedAutoreleaseRV(nullptr);
+ continue;
+ }
+ optimizeDelayedAutoreleaseRV();
+ }
+ break;
+ }
+
+ OptimizeIndividualCallImpl(F, BlockColors, Inst, Class, Arg);
+ }
+
+ // Catch the final delayed AutoreleaseRV.
+ optimizeDelayedAutoreleaseRV();
+}
+
+/// This function returns true if the value is inert. An ObjC ARC runtime call
+/// taking an inert operand can be safely deleted.
+static bool isInertARCValue(Value *V, SmallPtrSet<Value *, 1> &VisitedPhis) {
+ V = V->stripPointerCasts();
+
+ if (IsNullOrUndef(V))
+ return true;
+
+ // See if this is a global variable annotated with the 'objc_arc_inert'
+ // attribute.
+ if (auto *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->hasAttribute("objc_arc_inert"))
+ return true;
+
+ if (auto PN = dyn_cast<PHINode>(V)) {
+ // Ignore this phi if it has already been discovered.
+ if (!VisitedPhis.insert(PN).second)
+ return true;
+ // Look through the phi's incoming values.
+ for (Value *Opnd : PN->incoming_values())
+ if (!isInertARCValue(Opnd, VisitedPhis))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
+void ObjCARCOpt::OptimizeIndividualCallImpl(
+ Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ Instruction *Inst, ARCInstKind Class, const Value *Arg) {
+ LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
+
+ // We can delete this call if it takes an inert value.
+ SmallPtrSet<Value *, 1> VisitedPhis;
+
+ if (IsNoopOnGlobal(Class))
+ if (isInertARCValue(Inst->getOperand(0), VisitedPhis)) {
+ if (!Inst->getType()->isVoidTy())
+ Inst->replaceAllUsesWith(Inst->getOperand(0));
+ Inst->eraseFromParent();
+ Changed = true;
+ return;
+ }
+
+ switch (Class) {
+ default:
+ break;
+
+ // Delete no-op casts. These function calls have special semantics, but
+ // the semantics are entirely implemented via lowering in the front-end,
+ // so by the time they reach the optimizer, they are just no-op calls
+ // which return their argument.
+ //
+ // There are gray areas here, as the ability to cast reference-counted
+ // pointers to raw void* and back allows code to break ARC assumptions,
+ // however these are currently considered to be unimportant.
+ case ARCInstKind::NoopCast:
+ Changed = true;
+ ++NumNoops;
+ LLVM_DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n");
+ EraseInstruction(Inst);
+ return;
+
+ // If the pointer-to-weak-pointer is null, it's undefined behavior.
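+ // We make that explicit by storing undef through the null pointer just before
+ // the call, then replacing the call's result with undef and deleting it.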
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained:
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::DestroyWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty), CI);
+ Value *NewValue = UndefValue::get(CI->getType());
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ case ARCInstKind::CopyWeak:
+ case ARCInstKind::MoveWeak: {
+ CallInst *CI = cast<CallInst>(Inst);
+ if (IsNullOrUndef(CI->getArgOperand(0)) ||
+ IsNullOrUndef(CI->getArgOperand(1))) {
+ Changed = true;
+ Type *Ty = CI->getArgOperand(0)->getType();
+ new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()),
+ Constant::getNullValue(Ty), CI);
+
+ Value *NewValue = UndefValue::get(CI->getType());
+ LLVM_DEBUG(
+ dbgs() << "A null pointer-to-weak-pointer is undefined behavior."
+ "\nOld = "
+ << *CI << "\nNew = " << *NewValue << "\n");
+
+ CI->replaceAllUsesWith(NewValue);
+ CI->eraseFromParent();
+ return;
+ }
+ break;
+ }
+ case ARCInstKind::RetainRV:
+ if (OptimizeRetainRVCall(F, Inst))
+ return;
+ break;
+ case ARCInstKind::AutoreleaseRV:
+ OptimizeAutoreleaseRVCall(F, Inst, Class);
+ break;
+ }
+
+ // objc_autorelease(x) -> objc_release(x) if x is otherwise unused.
+ if (IsAutorelease(Class) && Inst->use_empty()) {
+ CallInst *Call = cast<CallInst>(Inst);
+ const Value *Arg = Call->getArgOperand(0);
+ Arg = FindSingleUseIdentifiedObject(Arg);
+ if (Arg) {
+ Changed = true;
+ ++NumAutoreleases;
+
+ // Create the declaration lazily.
+ LLVMContext &C = Inst->getContext();
+
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *NewCall =
+ CallInst::Create(Decl, Call->getArgOperand(0), "", Call);
+ NewCall->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease),
+ MDNode::get(C, None));
+
+ LLVM_DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
+ "since x is otherwise unused.\nOld: "
+ << *Call << "\nNew: " << *NewCall << "\n");
+
+ EraseInstruction(Call);
+ Inst = NewCall;
+ Class = ARCInstKind::Release;
+ }
+ }
+
+ // For functions which can never be passed stack arguments, add
+ // a tail keyword.
+ if (IsAlwaysTail(Class) && !cast<CallInst>(Inst)->isNoTailCall()) {
+ Changed = true;
+ LLVM_DEBUG(
+ dbgs() << "Adding tail keyword to function since it can never be "
+ "passed stack args: "
+ << *Inst << "\n");
+ cast<CallInst>(Inst)->setTailCall();
+ }
+
+ // Ensure that functions that can never have a "tail" keyword due to the
+ // semantics of ARC truly do not do so.
+ if (IsNeverTail(Class)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst
+ << "\n");
+ cast<CallInst>(Inst)->setTailCall(false);
+ }
+
+ // Set nounwind as needed.
+ if (IsNoThrow(Class)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst
+ << "\n");
+ cast<CallInst>(Inst)->setDoesNotThrow();
+ }
+
+ // Note: This catches instructions unrelated to ARC.
+ if (!IsNoopOnNull(Class)) {
+ UsedInThisFunction |= 1 << unsigned(Class);
+ return;
+ }
+
+ // If we haven't already looked up the root, look it up now.
+ if (!Arg)
+ Arg = GetArgRCIdentityRoot(Inst);
+
+ // ARC calls with null are no-ops. Delete them.
+ if (IsNullOrUndef(Arg)) {
+ Changed = true;
+ ++NumNoops;
+ LLVM_DEBUG(dbgs() << "ARC calls with null are no-ops. Erasing: " << *Inst
+ << "\n");
+ EraseInstruction(Inst);
+ return;
+ }
+
+ // Keep track of which of retain, release, autorelease, and retain_block
+ // are actually present in this function.
+ UsedInThisFunction |= 1 << unsigned(Class);
+
+ // If Arg is a PHI, one or more of the PHI's incoming values are null, the
+ // call is control-equivalent to the PHI, there are no relevant side effects
+ // between the PHI and the call, and the call is either not a release or is a
+ // release carrying the clang.imprecise_release tag, then the call could be
+ // pushed up to just those paths with non-null incoming values. For now,
+ // don't bother splitting critical edges for this.
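+ // For example, given
+ // %p = phi i8* [ null, %bb1 ], [ %obj, %bb2 ]
+ // call void @objc_release(i8* %p), !clang.imprecise_release !0
+ // the release can be cloned into %bb2 alone, since releasing null is a no-op.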
+ if (Class == ARCInstKind::Release &&
+ !Inst->getMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease)))
+ return;
+
+ SmallVector<std::pair<Instruction *, const Value *>, 4> Worklist;
+ Worklist.push_back(std::make_pair(Inst, Arg));
+ do {
+ std::pair<Instruction *, const Value *> Pair = Worklist.pop_back_val();
+ Inst = Pair.first;
+ Arg = Pair.second;
+
+ const PHINode *PN = dyn_cast<PHINode>(Arg);
+ if (!PN)
+ continue;
+
+ // Determine if the PHI has any null operands, or any incoming
+ // critical edges.
+ bool HasNull = false;
+ bool HasCriticalEdges = false;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ HasNull = true;
+ else if (PN->getIncomingBlock(i)->getTerminator()->getNumSuccessors() !=
+ 1) {
+ HasCriticalEdges = true;
+ break;
+ }
+ }
+ // If we have null operands and no critical edges, optimize.
+ if (HasCriticalEdges)
+ continue;
+ if (!HasNull)
+ continue;
+
Instruction *DepInst = nullptr;
-
- // Check that there is nothing that cares about the reference
- // count between the call and the phi.
- switch (Class) {
- case ARCInstKind::Retain:
- case ARCInstKind::RetainBlock:
- // These can always be moved up.
- break;
- case ARCInstKind::Release:
- // These can't be moved across things that care about the retain
- // count.
+
+ // Check that there is nothing that cares about the reference
+ // count between the call and the phi.
+ switch (Class) {
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainBlock:
+ // These can always be moved up.
+ break;
+ case ARCInstKind::Release:
+ // These can't be moved across things that care about the retain
+ // count.
DepInst = findSingleDependency(NeedsPositiveRetainCount, Arg,
Inst->getParent(), Inst, PA);
- break;
- case ARCInstKind::Autorelease:
- // These can't be moved across autorelease pool scope boundaries.
+ break;
+ case ARCInstKind::Autorelease:
+ // These can't be moved across autorelease pool scope boundaries.
DepInst = findSingleDependency(AutoreleasePoolBoundary, Arg,
Inst->getParent(), Inst, PA);
- break;
- case ARCInstKind::ClaimRV:
- case ARCInstKind::RetainRV:
- case ARCInstKind::AutoreleaseRV:
- // Don't move these; the RV optimization depends on the autoreleaseRV
- // being tail called, and the retainRV being immediately after a call
- // (which might still happen if we get lucky with codegen layout, but
- // it's not worth taking the chance).
- continue;
- default:
- llvm_unreachable("Invalid dependence flavor");
- }
-
+ break;
+ case ARCInstKind::ClaimRV:
+ case ARCInstKind::RetainRV:
+ case ARCInstKind::AutoreleaseRV:
+ // Don't move these; the RV optimization depends on the autoreleaseRV
+ // being tail called, and the retainRV being immediately after a call
+ // (which might still happen if we get lucky with codegen layout, but
+ // it's not worth taking the chance).
+ continue;
+ default:
+ llvm_unreachable("Invalid dependence flavor");
+ }
+
if (DepInst != PN)
- continue;
-
- Changed = true;
- ++NumPartialNoops;
- // Clone the call into each predecessor that has a non-null value.
- CallInst *CInst = cast<CallInst>(Inst);
- Type *ParamTy = CInst->getArgOperand(0)->getType();
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
- if (IsNullOrUndef(Incoming))
- continue;
- Value *Op = PN->getIncomingValue(i);
- Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
- CallInst *Clone = cast<CallInst>(
- CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors));
- if (Op->getType() != ParamTy)
- Op = new BitCastInst(Op, ParamTy, "", InsertPos);
- Clone->setArgOperand(0, Op);
- Clone->insertBefore(InsertPos);
-
- LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n"
- "And inserting clone at "
- << *InsertPos << "\n");
- Worklist.push_back(std::make_pair(Clone, Incoming));
- }
- // Erase the original call.
- LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
- EraseInstruction(CInst);
- } while (!Worklist.empty());
-}
-
-/// If we have a top down pointer in the S_Use state, make sure that there are
-/// no CFG hazards by checking the states of various bottom up pointers.
-static void CheckForUseCFGHazard(const Sequence SuccSSeq,
- const bool SuccSRRIKnownSafe,
- TopDownPtrState &S,
- bool &SomeSuccHasSame,
- bool &AllSuccsHaveSame,
- bool &NotAllSeqEqualButKnownSafe,
- bool &ShouldContinue) {
- switch (SuccSSeq) {
- case S_CanRelease: {
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
- S.ClearSequenceProgress();
- break;
- }
- S.SetCFGHazardAfflicted(true);
- ShouldContinue = true;
- break;
- }
- case S_Use:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- else
- NotAllSeqEqualButKnownSafe = true;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- case S_None:
- llvm_unreachable("This should have been handled earlier.");
- }
-}
-
-/// If we have a Top Down pointer in the S_CanRelease state, make sure that
-/// there are no CFG hazards by checking the states of various bottom up
-/// pointers.
-static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
- const bool SuccSRRIKnownSafe,
- TopDownPtrState &S,
- bool &SomeSuccHasSame,
- bool &AllSuccsHaveSame,
- bool &NotAllSeqEqualButKnownSafe) {
- switch (SuccSSeq) {
- case S_CanRelease:
- SomeSuccHasSame = true;
- break;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
- AllSuccsHaveSame = false;
- else
- NotAllSeqEqualButKnownSafe = true;
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- case S_None:
- llvm_unreachable("This should have been handled earlier.");
- }
-}
-
-/// Check for critical edges, loop boundaries, irreducible control flow, or
-/// other CFG structures where moving code across the edge would result in it
-/// being executed more.
-void
-ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BBState &MyStates) const {
- // If any top-down local-use or possible-dec has a succ which is earlier in
- // the sequence, forget it.
- for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
- I != E; ++I) {
- TopDownPtrState &S = I->second;
- const Sequence Seq = I->second.GetSeq();
-
- // We only care about S_Retain, S_CanRelease, and S_Use.
- if (Seq == S_None)
- continue;
-
- // Make sure that, if extra top down states are added in the future, this
- // code is updated to handle them.
- assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
- "Unknown top down sequence state.");
-
- const Value *Arg = I->first;
- bool SomeSuccHasSame = false;
- bool AllSuccsHaveSame = true;
- bool NotAllSeqEqualButKnownSafe = false;
-
- for (const BasicBlock *Succ : successors(BB)) {
- // If VisitBottomUp has pointer information for this successor, take
- // what we know about it.
- const DenseMap<const BasicBlock *, BBState>::iterator BBI =
- BBStates.find(Succ);
- assert(BBI != BBStates.end());
- const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
- const Sequence SuccSSeq = SuccS.GetSeq();
-
- // If, bottom up, the pointer is in an S_None state, clear the sequence
- // progress, since the sequence in the bottom up state finished, which
- // suggests a mismatch between retains and releases. This is true for
- // all three cases that we are handling here: S_Retain, S_Use, and
- // S_CanRelease.
- if (SuccSSeq == S_None) {
- S.ClearSequenceProgress();
- continue;
- }
-
- // If we have S_Use or S_CanRelease, perform our CFG hazard checks.
- const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
-
- // *NOTE* We do not use Seq from above here since we are allowing for
- // S.GetSeq() to change while we are visiting basic blocks.
- switch(S.GetSeq()) {
- case S_Use: {
- bool ShouldContinue = false;
- CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
- AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
- ShouldContinue);
- if (ShouldContinue)
- continue;
- break;
- }
- case S_CanRelease:
- CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
- SomeSuccHasSame, AllSuccsHaveSame,
- NotAllSeqEqualButKnownSafe);
- break;
- case S_Retain:
- case S_None:
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- break;
- }
- }
-
- // If the state at the other end of any of the successor edges
- // matches the current state, require all edges to match. This
- // guards against loops in the middle of a sequence.
- if (SomeSuccHasSame && !AllSuccsHaveSame) {
- S.ClearSequenceProgress();
- } else if (NotAllSeqEqualButKnownSafe) {
- // If we would have cleared the state were it not for the fact that we are
- // known safe, stop code motion. This is because whether or not it is safe to
- // remove RR pairs via KnownSafe is an orthogonal concept to whether we
- // are allowed to perform code motion.
- S.SetCFGHazardAfflicted(true);
- }
- }
-}
-
-bool ObjCARCOpt::VisitInstructionBottomUp(
- Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
- BBState &MyStates) {
- bool NestingDetected = false;
- ARCInstKind Class = GetARCInstKind(Inst);
- const Value *Arg = nullptr;
-
- LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
-
- switch (Class) {
- case ARCInstKind::Release: {
- Arg = GetArgRCIdentityRoot(Inst);
-
- BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
- NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
- break;
- }
- case ARCInstKind::RetainBlock:
- // In OptimizeIndividualCalls, we have strength reduced all optimizable
- // objc_retainBlocks to objc_retains. Thus at this point any
- // objc_retainBlocks that we see are not optimizable.
- break;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV: {
- Arg = GetArgRCIdentityRoot(Inst);
- BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
- if (S.MatchWithRetain()) {
- // Don't do retain+release tracking for ARCInstKind::RetainRV, because
- // it's better to let it remain as the first instruction after a call.
- if (Class != ARCInstKind::RetainRV) {
- LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
- Retains[Inst] = S.GetRRInfo();
- }
- S.ClearSequenceProgress();
- }
- // A retain moving bottom up can be a use.
- break;
- }
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearBottomUpPointers();
- return NestingDetected;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- // These are irrelevant.
- return NestingDetected;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (auto MI = MyStates.bottom_up_ptr_begin(),
- ME = MyStates.bottom_up_ptr_end();
- MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- BottomUpPtrState &S = MI->second;
-
- if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
- continue;
-
- S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
- }
-
- return NestingDetected;
-}
-
-bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
-
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each successor to compute the initial state
- // for the current block.
- BBState::edge_iterator SI(MyStates.succ_begin()),
- SE(MyStates.succ_end());
- if (SI != SE) {
- const BasicBlock *Succ = *SI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.InitFromSucc(I->second);
- ++SI;
- for (; SI != SE; ++SI) {
- Succ = *SI;
- I = BBStates.find(Succ);
- assert(I != BBStates.end());
- MyStates.MergeSucc(I->second);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Before:\n"
- << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
-
- // Visit all the instructions, bottom-up.
- for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
- Instruction *Inst = &*std::prev(I);
-
- // Invoke instructions are visited as part of their successors (below).
- if (isa<InvokeInst>(Inst))
- continue;
-
- LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
-
- NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
-
- // Bail out if the number of pointers being tracked becomes too large so
- // that this pass can complete in a reasonable amount of time.
- if (MyStates.bottom_up_ptr_list_size() > MaxPtrStates) {
- DisableRetainReleasePairing = true;
- return false;
- }
- }
-
- // If there's a predecessor with an invoke, visit the invoke as if it were
- // part of this block, since we can't insert code after an invoke in its own
- // block, and we don't want to split critical edges.
- for (BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end()); PI != PE; ++PI) {
- BasicBlock *Pred = *PI;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
- NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
- }
-
- LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
- DenseMap<Value *, RRInfo> &Releases,
- BBState &MyStates) {
- bool NestingDetected = false;
- ARCInstKind Class = GetARCInstKind(Inst);
- const Value *Arg = nullptr;
-
- LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
-
- switch (Class) {
- case ARCInstKind::RetainBlock:
- // In OptimizeIndividualCalls, we have strength reduced all optimizable
- // objc_retainBlocks to objc_retains. Thus at this point any
- // objc_retainBlocks that we see are not optimizable. We need to break since
- // a retain can be a potential use.
- break;
- case ARCInstKind::Retain:
- case ARCInstKind::RetainRV: {
- Arg = GetArgRCIdentityRoot(Inst);
- TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
- NestingDetected |= S.InitTopDown(Class, Inst);
- // A retain can be a potential use; proceed to the generic checking
- // code below.
- break;
- }
- case ARCInstKind::Release: {
- Arg = GetArgRCIdentityRoot(Inst);
- TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
- // Try to form a tentative pair in between this release instruction and the
- // top down pointers that we are tracking.
- if (S.MatchWithRelease(MDKindCache, Inst)) {
- // If we succeed, copy S's RRInfo into the Release -> {Retain Set
- // Map}. Then we clear S.
- LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
- Releases[Inst] = S.GetRRInfo();
- S.ClearSequenceProgress();
- }
- break;
- }
- case ARCInstKind::AutoreleasepoolPop:
- // Conservatively, clear MyStates for all known pointers.
- MyStates.clearTopDownPointers();
- return false;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- // These cannot be uses of the pointers we are tracking.
- return false;
- default:
- break;
- }
-
- // Consider any other possible effects of this instruction on each
- // pointer being tracked.
- for (auto MI = MyStates.top_down_ptr_begin(),
- ME = MyStates.top_down_ptr_end();
- MI != ME; ++MI) {
- const Value *Ptr = MI->first;
- if (Ptr == Arg)
- continue; // Handled above.
- TopDownPtrState &S = MI->second;
- if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
- continue;
-
- S.HandlePotentialUse(Inst, Ptr, PA, Class);
- }
-
- return NestingDetected;
-}
-
-bool
-ObjCARCOpt::VisitTopDown(BasicBlock *BB,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- DenseMap<Value *, RRInfo> &Releases) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
- bool NestingDetected = false;
- BBState &MyStates = BBStates[BB];
-
- // Merge the states from each predecessor to compute the initial state
- // for the current block.
- BBState::edge_iterator PI(MyStates.pred_begin()),
- PE(MyStates.pred_end());
- if (PI != PE) {
- const BasicBlock *Pred = *PI;
- DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.InitFromPred(I->second);
- ++PI;
- for (; PI != PE; ++PI) {
- Pred = *PI;
- I = BBStates.find(Pred);
- assert(I != BBStates.end());
- MyStates.MergePred(I->second);
- }
- }
-
- // Check that BB and MyStates have the same number of predecessors. This
- // prevents retain calls that live outside a loop from being moved into the
- // loop.
- if (!BB->hasNPredecessors(MyStates.pred_end() - MyStates.pred_begin()))
- for (auto I = MyStates.top_down_ptr_begin(),
- E = MyStates.top_down_ptr_end();
- I != E; ++I)
- I->second.SetCFGHazardAfflicted(true);
-
- LLVM_DEBUG(dbgs() << "Before:\n"
- << BBStates[BB] << "\n"
- << "Performing Dataflow:\n");
-
- // Visit all the instructions, top-down.
- for (Instruction &Inst : *BB) {
- LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
-
- NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
-
- // Bail out if the number of pointers being tracked becomes too large so
- // that this pass can complete in a reasonable amount of time.
- if (MyStates.top_down_ptr_list_size() > MaxPtrStates) {
- DisableRetainReleasePairing = true;
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
- << BBStates[BB] << "\n\n");
- CheckForCFGHazards(BB, BBStates, MyStates);
- LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
- return NestingDetected;
-}
-
-static void
-ComputePostOrders(Function &F,
- SmallVectorImpl<BasicBlock *> &PostOrder,
- SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
- unsigned NoObjCARCExceptionsMDKind,
- DenseMap<const BasicBlock *, BBState> &BBStates) {
- /// The visited set, for doing DFS walks.
- SmallPtrSet<BasicBlock *, 16> Visited;
-
- // Do DFS, computing the PostOrder.
- SmallPtrSet<BasicBlock *, 16> OnStack;
- SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
-
- // Functions always have exactly one entry block, and we don't have
- // any other block that we treat like an entry block.
- BasicBlock *EntryBB = &F.getEntryBlock();
- BBState &MyStates = BBStates[EntryBB];
- MyStates.SetAsEntry();
- Instruction *EntryTI = EntryBB->getTerminator();
- SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
- Visited.insert(EntryBB);
- OnStack.insert(EntryBB);
- do {
- dfs_next_succ:
- BasicBlock *CurrBB = SuccStack.back().first;
- succ_iterator SE(CurrBB->getTerminator(), false);
-
- while (SuccStack.back().second != SE) {
- BasicBlock *SuccBB = *SuccStack.back().second++;
- if (Visited.insert(SuccBB).second) {
- SuccStack.push_back(
- std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
- BBStates[CurrBB].addSucc(SuccBB);
- BBState &SuccStates = BBStates[SuccBB];
- SuccStates.addPred(CurrBB);
- OnStack.insert(SuccBB);
- goto dfs_next_succ;
- }
-
- if (!OnStack.count(SuccBB)) {
- BBStates[CurrBB].addSucc(SuccBB);
- BBStates[SuccBB].addPred(CurrBB);
- }
- }
- OnStack.erase(CurrBB);
- PostOrder.push_back(CurrBB);
- SuccStack.pop_back();
- } while (!SuccStack.empty());
-
- Visited.clear();
-
- // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
- // Functions may have many exits, and there are also blocks which we treat
- // as exits due to ignored edges.
- SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
- for (BasicBlock &ExitBB : F) {
- BBState &MyStates = BBStates[&ExitBB];
- if (!MyStates.isExit())
- continue;
-
- MyStates.SetAsExit();
-
- PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin()));
- Visited.insert(&ExitBB);
- while (!PredStack.empty()) {
- reverse_dfs_next_succ:
- BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
- while (PredStack.back().second != PE) {
- BasicBlock *BB = *PredStack.back().second++;
- if (Visited.insert(BB).second) {
- PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
- goto reverse_dfs_next_succ;
- }
- }
- ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
- }
- }
-}
-
-// Visit the function both top-down and bottom-up.
-bool ObjCARCOpt::Visit(Function &F,
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases) {
- // Use reverse-postorder traversals, because we magically know that loops
- // will be well behaved, i.e. they won't repeatedly call retain on a single
- // pointer without doing a release. We can't use the ReversePostOrderTraversal
- // class here because we want the reverse-CFG postorder to consider each
- // function exit point, and we want to ignore selected cycle edges.
- SmallVector<BasicBlock *, 16> PostOrder;
- SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
- ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
- MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
- BBStates);
-
- // Use reverse-postorder on the reverse CFG for bottom-up.
- bool BottomUpNestingDetected = false;
- for (BasicBlock *BB : llvm::reverse(ReverseCFGPostOrder)) {
- BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
- if (DisableRetainReleasePairing)
- return false;
- }
-
- // Use reverse-postorder for top-down.
- bool TopDownNestingDetected = false;
- for (BasicBlock *BB : llvm::reverse(PostOrder)) {
- TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
- if (DisableRetainReleasePairing)
- return false;
- }
-
- return TopDownNestingDetected && BottomUpNestingDetected;
-}
-
-/// Move the calls in RetainsToMove and ReleasesToMove.
-void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases,
- SmallVectorImpl<Instruction *> &DeadInsts,
- Module *M) {
- Type *ArgTy = Arg->getType();
- Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
-
- LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
-
- // Insert the new retain and release calls.
- for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
- Call->setDoesNotThrow();
- Call->setTailCall();
-
- LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
- << "\n"
- "At insertion point: "
- << *InsertPt << "\n");
- }
- for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
- Value *MyArg = ArgTy == ParamTy ? Arg :
- new BitCastInst(Arg, ParamTy, "", InsertPt);
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
- CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
- // Attach a clang.imprecise_release metadata tag, if appropriate.
- if (MDNode *M = ReleasesToMove.ReleaseMetadata)
- Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
- Call->setDoesNotThrow();
- if (ReleasesToMove.IsTailCallRelease)
- Call->setTailCall();
-
- LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
- << "\n"
- "At insertion point: "
- << *InsertPt << "\n");
- }
-
- // Delete the original retain and release calls.
- for (Instruction *OrigRetain : RetainsToMove.Calls) {
- Retains.blot(OrigRetain);
- DeadInsts.push_back(OrigRetain);
- LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
- }
- for (Instruction *OrigRelease : ReleasesToMove.Calls) {
- Releases.erase(OrigRelease);
- DeadInsts.push_back(OrigRelease);
- LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
- }
-}
-
-bool ObjCARCOpt::PairUpRetainsAndReleases(
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases, Module *M,
- Instruction *Retain,
- SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
- RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
- bool &AnyPairsCompletelyEliminated) {
- // If a pair happens in a region where it is known that the reference count
- // is already incremented, we can similarly ignore possible decrements unless
- // we are dealing with a retainable object with multiple provenance sources.
- bool KnownSafeTD = true, KnownSafeBU = true;
- bool CFGHazardAfflicted = false;
-
- // Connect the dots between the top-down-collected RetainsToMove and
- // bottom-up-collected ReleasesToMove to form sets of related calls.
- // This is an iterative process so that we connect multiple releases
- // to multiple retains if needed.
- unsigned OldDelta = 0;
- unsigned NewDelta = 0;
- unsigned OldCount = 0;
- unsigned NewCount = 0;
- bool FirstRelease = true;
- for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
- SmallVector<Instruction *, 4> NewReleases;
- for (Instruction *NewRetain : NewRetains) {
- auto It = Retains.find(NewRetain);
- assert(It != Retains.end());
- const RRInfo &NewRetainRRI = It->second;
- KnownSafeTD &= NewRetainRRI.KnownSafe;
- CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
- for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
- auto Jt = Releases.find(NewRetainRelease);
- if (Jt == Releases.end())
- return false;
- const RRInfo &NewRetainReleaseRRI = Jt->second;
-
- // If the release does not have a reference to the retain as well,
- // something happened which is unaccounted for. Do not do anything.
- //
- // This can happen if we catch an additive overflow during path count
- // merging.
- if (!NewRetainReleaseRRI.Calls.count(NewRetain))
- return false;
-
- if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
- // If we overflow when we compute the path count, don't remove/move
- // anything.
- const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
- unsigned PathCount = BBState::OverflowOccurredValue;
- if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- OldDelta -= PathCount;
-
- // Merge the ReleaseMetadata and IsTailCallRelease values.
- if (FirstRelease) {
- ReleasesToMove.ReleaseMetadata =
- NewRetainReleaseRRI.ReleaseMetadata;
- ReleasesToMove.IsTailCallRelease =
- NewRetainReleaseRRI.IsTailCallRelease;
- FirstRelease = false;
- } else {
- if (ReleasesToMove.ReleaseMetadata !=
- NewRetainReleaseRRI.ReleaseMetadata)
- ReleasesToMove.ReleaseMetadata = nullptr;
- if (ReleasesToMove.IsTailCallRelease !=
- NewRetainReleaseRRI.IsTailCallRelease)
- ReleasesToMove.IsTailCallRelease = false;
- }
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
- if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
- // If we overflow when we compute the path count, don't
- // remove/move anything.
- const BBState &RIPBBState = BBStates[RIP->getParent()];
- PathCount = BBState::OverflowOccurredValue;
- if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- NewDelta -= PathCount;
- }
- }
- NewReleases.push_back(NewRetainRelease);
- }
- }
- }
- NewRetains.clear();
- if (NewReleases.empty()) break;
-
- // Back the other way.
- for (Instruction *NewRelease : NewReleases) {
- auto It = Releases.find(NewRelease);
- assert(It != Releases.end());
- const RRInfo &NewReleaseRRI = It->second;
- KnownSafeBU &= NewReleaseRRI.KnownSafe;
- CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
- for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
- auto Jt = Retains.find(NewReleaseRetain);
- if (Jt == Retains.end())
- return false;
- const RRInfo &NewReleaseRetainRRI = Jt->second;
-
- // If the retain does not have a reference to the release as well,
- // something happened which is unaccounted for. Do not do anything.
- //
- // This can happen if we catch an additive overflow during path count
- // merging.
- if (!NewReleaseRetainRRI.Calls.count(NewRelease))
- return false;
-
- if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
- // If we overflow when we compute the path count, don't remove/move
- // anything.
- const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
- unsigned PathCount = BBState::OverflowOccurredValue;
- if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- OldDelta += PathCount;
- OldCount += PathCount;
-
- // Collect the optimal insertion points.
- if (!KnownSafe)
- for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
- if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
- // If we overflow when we compute the path count, don't
- // remove/move anything.
- const BBState &RIPBBState = BBStates[RIP->getParent()];
-
- PathCount = BBState::OverflowOccurredValue;
- if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
- return false;
- assert(PathCount != BBState::OverflowOccurredValue &&
- "PathCount at this point can not be "
- "OverflowOccurredValue.");
- NewDelta += PathCount;
- NewCount += PathCount;
- }
- }
- NewRetains.push_back(NewReleaseRetain);
- }
- }
- }
- if (NewRetains.empty()) break;
- }
-
- // We can only remove pointers if we are known safe in both directions.
- bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
- if (UnconditionallySafe) {
- RetainsToMove.ReverseInsertPts.clear();
- ReleasesToMove.ReverseInsertPts.clear();
- NewCount = 0;
- } else {
- // Determine whether the new insertion points we computed preserve the
- // balance of retain and release calls through the program.
- // TODO: If the fully aggressive solution isn't valid, try to find a
- // less aggressive solution which is.
- if (NewDelta != 0)
- return false;
-
- // At this point, we are not going to remove any RR pairs, but we still are
- // able to move RR pairs. If one of our pointers is afflicted with
- // CFGHazards, we cannot perform such code motion so exit early.
- const bool WillPerformCodeMotion =
- !RetainsToMove.ReverseInsertPts.empty() ||
- !ReleasesToMove.ReverseInsertPts.empty();
- if (CFGHazardAfflicted && WillPerformCodeMotion)
- return false;
- }
-
- // Determine whether the original call points are balanced in the retain and
- // release calls through the program. If not, conservatively don't touch
- // them.
- // TODO: It's theoretically possible to do code motion in this case, as
- // long as the existing imbalances are maintained.
- if (OldDelta != 0)
- return false;
-
- Changed = true;
- assert(OldCount != 0 && "Unreachable code?");
- NumRRs += OldCount - NewCount;
- // Set to true if we completely removed any RR pairs.
- AnyPairsCompletelyEliminated = NewCount == 0;
-
- // We can move calls!
- return true;
-}
-
-/// Identify pairings between the retains and releases, and delete and/or move
-/// them.
-bool ObjCARCOpt::PerformCodePlacement(
- DenseMap<const BasicBlock *, BBState> &BBStates,
- BlotMapVector<Value *, RRInfo> &Retains,
- DenseMap<Value *, RRInfo> &Releases, Module *M) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
-
- bool AnyPairsCompletelyEliminated = false;
- SmallVector<Instruction *, 8> DeadInsts;
-
- // Visit each retain.
- for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
- E = Retains.end();
- I != E; ++I) {
- Value *V = I->first;
- if (!V) continue; // blotted
-
- Instruction *Retain = cast<Instruction>(V);
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
-
- Value *Arg = GetArgRCIdentityRoot(Retain);
-
- // If the object being released is in static or stack storage, we know it's
- // not being managed by ObjC reference counting, so we can delete pairs
- // regardless of what possible decrements or uses lie between them.
- bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
-
- // A constant pointer can't be pointing to an object on the heap. It may
- // be reference-counted, but it won't be deleted.
- if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(
- GetRCIdentityRoot(LI->getPointerOperand())))
- if (GV->isConstant())
- KnownSafe = true;
-
- // Connect the dots between the top-down-collected RetainsToMove and
- // bottom-up-collected ReleasesToMove to form sets of related calls.
- RRInfo RetainsToMove, ReleasesToMove;
-
- bool PerformMoveCalls = PairUpRetainsAndReleases(
- BBStates, Retains, Releases, M, Retain, DeadInsts,
- RetainsToMove, ReleasesToMove, Arg, KnownSafe,
- AnyPairsCompletelyEliminated);
-
- if (PerformMoveCalls) {
- // Ok, everything checks out and we're all set. Let's move/delete some
- // code!
- MoveCalls(Arg, RetainsToMove, ReleasesToMove,
- Retains, Releases, DeadInsts, M);
- }
- }
-
- // Now that we're done moving everything, we can delete the newly dead
- // instructions, as we no longer need them as insert points.
- while (!DeadInsts.empty())
- EraseInstruction(DeadInsts.pop_back_val());
-
- return AnyPairsCompletelyEliminated;
-}
-
-/// Weak pointer optimizations.
-void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
-
- // First, do memdep-style RLE and S2L optimizations. We can't use memdep
- // itself because it uses AliasAnalysis and we need to do provenance
- // queries instead.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
-
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- if (Class != ARCInstKind::LoadWeak &&
- Class != ARCInstKind::LoadWeakRetained)
- continue;
-
- // Delete objc_loadWeak calls with no users.
- if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
- Inst->eraseFromParent();
- Changed = true;
- continue;
- }
-
- // TODO: For now, just look for an earlier available version of this value
- // within the same block. Theoretically, we could do memdep-style non-local
- // analysis too, but that would want caching. A better approach would be to
- // use the technique that EarlyCSE uses.
- inst_iterator Current = std::prev(I);
- BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();
- for (BasicBlock::iterator B = CurrentBB->begin(),
- J = Current.getInstructionIterator();
- J != B; --J) {
- Instruction *EarlierInst = &*std::prev(J);
- ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
- switch (EarlierClass) {
- case ARCInstKind::LoadWeak:
- case ARCInstKind::LoadWeakRetained: {
- // If this is loading from the same pointer, replace this load's value
- // with that one.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == ARCInstKind::LoadWeakRetained) {
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall);
- Call->eraseFromParent();
- goto clobbered;
- case MayAlias:
- case PartialAlias:
- goto clobbered;
- case NoAlias:
- break;
- }
- break;
- }
- case ARCInstKind::StoreWeak:
- case ARCInstKind::InitWeak: {
- // If this is storing to the same pointer and has the same size etc.,
- // replace this load's value with the stored value.
- CallInst *Call = cast<CallInst>(Inst);
- CallInst *EarlierCall = cast<CallInst>(EarlierInst);
- Value *Arg = Call->getArgOperand(0);
- Value *EarlierArg = EarlierCall->getArgOperand(0);
- switch (PA.getAA()->alias(Arg, EarlierArg)) {
- case MustAlias:
- Changed = true;
- // If the load has a builtin retain, insert a plain retain for it.
- if (Class == ARCInstKind::LoadWeakRetained) {
- Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
- CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
- CI->setTailCall();
- }
- // Zap the fully redundant load.
- Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
- Call->eraseFromParent();
- goto clobbered;
- case MayAlias:
- case PartialAlias:
- goto clobbered;
- case NoAlias:
- break;
- }
- break;
- }
- case ARCInstKind::MoveWeak:
- case ARCInstKind::CopyWeak:
- // TODO: Grab the copied value.
- goto clobbered;
- case ARCInstKind::AutoreleasepoolPush:
- case ARCInstKind::None:
- case ARCInstKind::IntrinsicUser:
- case ARCInstKind::User:
- // Weak pointers are only modified through the weak entry points
- // (and arbitrary calls, which could call the weak entry points).
- break;
- default:
- // Anything else could modify the weak pointer.
- goto clobbered;
- }
- }
- clobbered:;
- }
-
- // Then, for each destroyWeak with an alloca operand, check to see if
- // the alloca and all its users can be zapped.
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- ARCInstKind Class = GetBasicARCInstKind(Inst);
- if (Class != ARCInstKind::DestroyWeak)
- continue;
-
- CallInst *Call = cast<CallInst>(Inst);
- Value *Arg = Call->getArgOperand(0);
- if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
- for (User *U : Alloca->users()) {
- const Instruction *UserInst = cast<Instruction>(U);
- switch (GetBasicARCInstKind(UserInst)) {
- case ARCInstKind::InitWeak:
- case ARCInstKind::StoreWeak:
- case ARCInstKind::DestroyWeak:
- continue;
- default:
- goto done;
- }
- }
- Changed = true;
- for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
- CallInst *UserInst = cast<CallInst>(*UI++);
- switch (GetBasicARCInstKind(UserInst)) {
- case ARCInstKind::InitWeak:
- case ARCInstKind::StoreWeak:
- // These functions return their second argument.
- UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
- break;
- case ARCInstKind::DestroyWeak:
- // No return value.
- break;
- default:
- llvm_unreachable("alloca really is used!");
- }
- UserInst->eraseFromParent();
- }
- Alloca->eraseFromParent();
- done:;
- }
- }
-}
-
-/// Identify program paths which execute sequences of retains and releases which
-/// can be eliminated.
-bool ObjCARCOpt::OptimizeSequences(Function &F) {
- // Releases, Retains - These are used to store the results of the main flow
- // analysis. These use Value* as the key instead of Instruction* so that the
- // map stays valid when we get around to rewriting code and calls get
- // replaced by arguments.
- DenseMap<Value *, RRInfo> Releases;
- BlotMapVector<Value *, RRInfo> Retains;
-
- // This is used during the traversal of the function to track the
- // states for each identified object at each block.
- DenseMap<const BasicBlock *, BBState> BBStates;
-
- // Analyze the CFG of the function, and all instructions.
- bool NestingDetected = Visit(F, BBStates, Retains, Releases);
-
- if (DisableRetainReleasePairing)
- return false;
-
- // Transform.
- bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
- Releases,
- F.getParent());
-
- return AnyPairsCompletelyEliminated && NestingDetected;
-}
-
-/// Check if there is a dependent call earlier that does not have anything in
-/// between the Retain and the call that can affect the reference count of their
-/// shared pointer argument. Note that Retain need not be in BB.
+ continue;
+
+ Changed = true;
+ ++NumPartialNoops;
+ // Clone the call into each predecessor that has a non-null value.
+ CallInst *CInst = cast<CallInst>(Inst);
+ Type *ParamTy = CInst->getArgOperand(0)->getType();
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = GetRCIdentityRoot(PN->getIncomingValue(i));
+ if (IsNullOrUndef(Incoming))
+ continue;
+ Value *Op = PN->getIncomingValue(i);
+ Instruction *InsertPos = &PN->getIncomingBlock(i)->back();
+ CallInst *Clone = cast<CallInst>(
+ CloneCallInstForBB(*CInst, *InsertPos->getParent(), BlockColors));
+ if (Op->getType() != ParamTy)
+ Op = new BitCastInst(Op, ParamTy, "", InsertPos);
+ Clone->setArgOperand(0, Op);
+ Clone->insertBefore(InsertPos);
+
+ LLVM_DEBUG(dbgs() << "Cloning " << *CInst << "\n"
+ "And inserting clone at "
+ << *InsertPos << "\n");
+ Worklist.push_back(std::make_pair(Clone, Incoming));
+ }
+ // Erase the original call.
+ LLVM_DEBUG(dbgs() << "Erasing: " << *CInst << "\n");
+ EraseInstruction(CInst);
+ } while (!Worklist.empty());
+}
+
+/// If we have a top down pointer in the S_Use state, make sure that there are
+/// no CFG hazards by checking the states of various bottom up pointers.
+static void CheckForUseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe,
+ bool &ShouldContinue) {
+ switch (SuccSSeq) {
+ case S_CanRelease: {
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe) {
+ S.ClearSequenceProgress();
+ break;
+ }
+ S.SetCFGHazardAfflicted(true);
+ ShouldContinue = true;
+ break;
+ }
+ case S_Use:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// If we have a Top Down pointer in the S_CanRelease state, make sure that
+/// there are no CFG hazards by checking the states of various bottom up
+/// pointers.
+static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq,
+ const bool SuccSRRIKnownSafe,
+ TopDownPtrState &S,
+ bool &SomeSuccHasSame,
+ bool &AllSuccsHaveSame,
+ bool &NotAllSeqEqualButKnownSafe) {
+ switch (SuccSSeq) {
+ case S_CanRelease:
+ SomeSuccHasSame = true;
+ break;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ if (!S.IsKnownSafe() && !SuccSRRIKnownSafe)
+ AllSuccsHaveSame = false;
+ else
+ NotAllSeqEqualButKnownSafe = true;
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ case S_None:
+ llvm_unreachable("This should have been handled earlier.");
+ }
+}
+
+/// Check for critical edges, loop boundaries, irreducible control flow, or
+/// other CFG structures where moving code across the edge would result in it
+/// being executed more often than intended.
+void
+ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BBState &MyStates) const {
+ // If any top-down local-use or possible-dec has a succ which is earlier in
+ // the sequence, forget it.
+ for (auto I = MyStates.top_down_ptr_begin(), E = MyStates.top_down_ptr_end();
+ I != E; ++I) {
+ TopDownPtrState &S = I->second;
+ const Sequence Seq = I->second.GetSeq();
+
+ // We only care about S_Retain, S_CanRelease, and S_Use.
+ if (Seq == S_None)
+ continue;
+
+ // Make sure that if extra top-down states are added in the future, this
+ // code is updated to handle them.
+ assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) &&
+ "Unknown top down sequence state.");
+
+ const Value *Arg = I->first;
+ bool SomeSuccHasSame = false;
+ bool AllSuccsHaveSame = true;
+ bool NotAllSeqEqualButKnownSafe = false;
+
+ for (const BasicBlock *Succ : successors(BB)) {
+ // If VisitBottomUp has pointer information for this successor, take
+ // what we know about it.
+ const DenseMap<const BasicBlock *, BBState>::iterator BBI =
+ BBStates.find(Succ);
+ assert(BBI != BBStates.end());
+ const BottomUpPtrState &SuccS = BBI->second.getPtrBottomUpState(Arg);
+ const Sequence SuccSSeq = SuccS.GetSeq();
+
+ // If, bottom-up, the pointer is in the S_None state, clear the sequence
+ // progress, since the bottom-up sequence has finished, suggesting a
+ // mismatch between retains and releases. This is true for all three
+ // cases that we are handling here: S_Retain, S_Use, and S_CanRelease.
+ if (SuccSSeq == S_None) {
+ S.ClearSequenceProgress();
+ continue;
+ }
+
+ // If we have S_Use or S_CanRelease, perform our CFG hazard checks.
+ const bool SuccSRRIKnownSafe = SuccS.IsKnownSafe();
+
+ // *NOTE* We do not use Seq from above here since we are allowing for
+ // S.GetSeq() to change while we are visiting basic blocks.
+ switch(S.GetSeq()) {
+ case S_Use: {
+ bool ShouldContinue = false;
+ CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, SomeSuccHasSame,
+ AllSuccsHaveSame, NotAllSeqEqualButKnownSafe,
+ ShouldContinue);
+ if (ShouldContinue)
+ continue;
+ break;
+ }
+ case S_CanRelease:
+ CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S,
+ SomeSuccHasSame, AllSuccsHaveSame,
+ NotAllSeqEqualButKnownSafe);
+ break;
+ case S_Retain:
+ case S_None:
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ break;
+ }
+ }
+
+ // If the state at the other end of any of the successor edges
+ // matches the current state, require all edges to match. This
+ // guards against loops in the middle of a sequence.
+ if (SomeSuccHasSame && !AllSuccsHaveSame) {
+ S.ClearSequenceProgress();
+ } else if (NotAllSeqEqualButKnownSafe) {
+ // If we would have cleared the state were it not for the fact that we are
+ // known safe, stop code motion. This is because whether or not it is safe
+ // to remove RR pairs via KnownSafe is orthogonal to whether we are allowed
+ // to perform code motion.
+ S.SetCFGHazardAfflicted(true);
+ }
+ }
+}
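As a rough standalone illustration of the final agreement check above (not part of the pass, and deliberately ignoring the per-state handling in CheckForUseCFGHazard and CheckForCanReleaseCFGHazard), the decision reduces to: clear the sequence when only some successor edges agree, and mark the pointer CFG-hazard-afflicted when disagreement is tolerated solely because the pointer is known safe. The enum names and values below are toy stand-ins, not the pass's types.

#include <cstdio>
#include <vector>

enum class Seq { None, Use, CanRelease };
enum class Decision { Keep, ClearSequence, MarkCFGHazard };

// Toy model: Current is the top-down state, SuccSeqs the state seen along
// each successor edge, KnownSafe whether mismatches may be tolerated.
Decision resolveSuccessors(Seq Current, const std::vector<Seq> &SuccSeqs,
                           bool KnownSafe) {
  bool SomeSuccHasSame = false, AllSuccsHaveSame = true;
  bool NotAllSeqEqualButKnownSafe = false;
  for (Seq S : SuccSeqs) {
    if (S == Seq::None)
      return Decision::ClearSequence; // Bottom-up sequence already finished.
    if (S == Current)
      SomeSuccHasSame = true;
    else if (KnownSafe)
      NotAllSeqEqualButKnownSafe = true;
    else
      AllSuccsHaveSame = false;
  }
  if (SomeSuccHasSame && !AllSuccsHaveSame)
    return Decision::ClearSequence;  // Partial agreement: forget the sequence.
  if (NotAllSeqEqualButKnownSafe)
    return Decision::MarkCFGHazard;  // Keep the pair, but block code motion.
  return Decision::Keep;
}

int main() {
  Decision D = resolveSuccessors(Seq::Use, {Seq::Use, Seq::CanRelease}, false);
  std::printf("%d\n", static_cast<int>(D)); // 1 == ClearSequence
}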
+
+bool ObjCARCOpt::VisitInstructionBottomUp(
+ Instruction *Inst, BasicBlock *BB, BlotMapVector<Value *, RRInfo> &Retains,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ NestingDetected |= S.InitBottomUp(MDKindCache, Inst);
+ break;
+ }
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ BottomUpPtrState &S = MyStates.getPtrBottomUpState(Arg);
+ if (S.MatchWithRetain()) {
+ // Don't do retain+release tracking for ARCInstKind::RetainRV, because
+ // it's better to let it remain as the first instruction after a call.
+ if (Class != ARCInstKind::RetainRV) {
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ Retains[Inst] = S.GetRRInfo();
+ }
+ S.ClearSequenceProgress();
+ }
+ // A retain moving bottom up can be a use.
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearBottomUpPointers();
+ return NestingDetected;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These are irrelevant.
+ return NestingDetected;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.bottom_up_ptr_begin(),
+ ME = MyStates.bottom_up_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ BottomUpPtrState &S = MI->second;
+
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(BB, Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n");
+
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each successor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator SI(MyStates.succ_begin()),
+ SE(MyStates.succ_end());
+ if (SI != SE) {
+ const BasicBlock *Succ = *SI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.InitFromSucc(I->second);
+ ++SI;
+ for (; SI != SE; ++SI) {
+ Succ = *SI;
+ I = BBStates.find(Succ);
+ assert(I != BBStates.end());
+ MyStates.MergeSucc(I->second);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, bottom-up.
+ for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) {
+ Instruction *Inst = &*std::prev(I);
+
+ // Invoke instructions are visited as part of their successors (below).
+ if (isa<InvokeInst>(Inst))
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Visiting " << *Inst << "\n");
+
+ NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates);
+
+ // Bail out if the number of pointers being tracked becomes too large so
+ // that this pass can complete in a reasonable amount of time.
+ if (MyStates.bottom_up_ptr_list_size() > MaxPtrStates) {
+ DisableRetainReleasePairing = true;
+ return false;
+ }
+ }
+
+ // If there's a predecessor with an invoke, visit the invoke as if it were
+ // part of this block, since we can't insert code after an invoke in its own
+ // block, and we don't want to split critical edges.
+ for (BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end()); PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&Pred->back()))
+ NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates);
+ }
+
+ LLVM_DEBUG(dbgs() << "\nFinal State:\n" << BBStates[BB] << "\n");
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst,
+ DenseMap<Value *, RRInfo> &Releases,
+ BBState &MyStates) {
+ bool NestingDetected = false;
+ ARCInstKind Class = GetARCInstKind(Inst);
+ const Value *Arg = nullptr;
+
+ LLVM_DEBUG(dbgs() << " Class: " << Class << "\n");
+
+ switch (Class) {
+ case ARCInstKind::RetainBlock:
+ // In OptimizeIndividualCalls, we have strength reduced all optimizable
+ // objc_retainBlocks to objc_retains. Thus at this point any
+ // objc_retainBlocks that we see are not optimizable. We need to break since
+ // a retain can be a potential use.
+ break;
+ case ARCInstKind::Retain:
+ case ARCInstKind::RetainRV: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ NestingDetected |= S.InitTopDown(Class, Inst);
+ // A retain can be a potential use; proceed to the generic checking
+ // code below.
+ break;
+ }
+ case ARCInstKind::Release: {
+ Arg = GetArgRCIdentityRoot(Inst);
+ TopDownPtrState &S = MyStates.getPtrTopDownState(Arg);
+ // Try to form a tentative pair between this release instruction and the
+ // top-down pointers that we are tracking.
+ if (S.MatchWithRelease(MDKindCache, Inst)) {
+ // If we succeed, copy S's RRInfo into the Release -> {Retain Set
+ // Map}. Then we clear S.
+ LLVM_DEBUG(dbgs() << " Matching with: " << *Inst << "\n");
+ Releases[Inst] = S.GetRRInfo();
+ S.ClearSequenceProgress();
+ }
+ break;
+ }
+ case ARCInstKind::AutoreleasepoolPop:
+ // Conservatively, clear MyStates for all known pointers.
+ MyStates.clearTopDownPointers();
+ return false;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ // These cannot be uses of the pointers we are tracking.
+ return false;
+ default:
+ break;
+ }
+
+ // Consider any other possible effects of this instruction on each
+ // pointer being tracked.
+ for (auto MI = MyStates.top_down_ptr_begin(),
+ ME = MyStates.top_down_ptr_end();
+ MI != ME; ++MI) {
+ const Value *Ptr = MI->first;
+ if (Ptr == Arg)
+ continue; // Handled above.
+ TopDownPtrState &S = MI->second;
+ if (S.HandlePotentialAlterRefCount(Inst, Ptr, PA, Class))
+ continue;
+
+ S.HandlePotentialUse(Inst, Ptr, PA, Class);
+ }
+
+ return NestingDetected;
+}
+
+bool
+ObjCARCOpt::VisitTopDown(BasicBlock *BB,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ DenseMap<Value *, RRInfo> &Releases) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n");
+ bool NestingDetected = false;
+ BBState &MyStates = BBStates[BB];
+
+ // Merge the states from each predecessor to compute the initial state
+ // for the current block.
+ BBState::edge_iterator PI(MyStates.pred_begin()),
+ PE(MyStates.pred_end());
+ if (PI != PE) {
+ const BasicBlock *Pred = *PI;
+ DenseMap<const BasicBlock *, BBState>::iterator I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.InitFromPred(I->second);
+ ++PI;
+ for (; PI != PE; ++PI) {
+ Pred = *PI;
+ I = BBStates.find(Pred);
+ assert(I != BBStates.end());
+ MyStates.MergePred(I->second);
+ }
+ }
+
+ // Check that BB and MyStates have the same number of predecessors. This
+ // prevents retain calls that live outside a loop from being moved into the
+ // loop.
+ if (!BB->hasNPredecessors(MyStates.pred_end() - MyStates.pred_begin()))
+ for (auto I = MyStates.top_down_ptr_begin(),
+ E = MyStates.top_down_ptr_end();
+ I != E; ++I)
+ I->second.SetCFGHazardAfflicted(true);
+
+ LLVM_DEBUG(dbgs() << "Before:\n"
+ << BBStates[BB] << "\n"
+ << "Performing Dataflow:\n");
+
+ // Visit all the instructions, top-down.
+ for (Instruction &Inst : *BB) {
+ LLVM_DEBUG(dbgs() << " Visiting " << Inst << "\n");
+
+ NestingDetected |= VisitInstructionTopDown(&Inst, Releases, MyStates);
+
+ // Bail out if the number of pointers being tracked becomes too large so
+ // that this pass can complete in a reasonable amount of time.
+ if (MyStates.top_down_ptr_list_size() > MaxPtrStates) {
+ DisableRetainReleasePairing = true;
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "\nState Before Checking for CFG Hazards:\n"
+ << BBStates[BB] << "\n\n");
+ CheckForCFGHazards(BB, BBStates, MyStates);
+ LLVM_DEBUG(dbgs() << "Final State:\n" << BBStates[BB] << "\n");
+ return NestingDetected;
+}
+
+static void
+ComputePostOrders(Function &F,
+ SmallVectorImpl<BasicBlock *> &PostOrder,
+ SmallVectorImpl<BasicBlock *> &ReverseCFGPostOrder,
+ unsigned NoObjCARCExceptionsMDKind,
+ DenseMap<const BasicBlock *, BBState> &BBStates) {
+ /// The visited set, for doing DFS walks.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+ // Do DFS, computing the PostOrder.
+ SmallPtrSet<BasicBlock *, 16> OnStack;
+ SmallVector<std::pair<BasicBlock *, succ_iterator>, 16> SuccStack;
+
+ // Functions always have exactly one entry block, and we don't have
+ // any other block that we treat like an entry block.
+ BasicBlock *EntryBB = &F.getEntryBlock();
+ BBState &MyStates = BBStates[EntryBB];
+ MyStates.SetAsEntry();
+ Instruction *EntryTI = EntryBB->getTerminator();
+ SuccStack.push_back(std::make_pair(EntryBB, succ_iterator(EntryTI)));
+ Visited.insert(EntryBB);
+ OnStack.insert(EntryBB);
+ do {
+ dfs_next_succ:
+ BasicBlock *CurrBB = SuccStack.back().first;
+ succ_iterator SE(CurrBB->getTerminator(), false);
+
+ while (SuccStack.back().second != SE) {
+ BasicBlock *SuccBB = *SuccStack.back().second++;
+ if (Visited.insert(SuccBB).second) {
+ SuccStack.push_back(
+ std::make_pair(SuccBB, succ_iterator(SuccBB->getTerminator())));
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBState &SuccStates = BBStates[SuccBB];
+ SuccStates.addPred(CurrBB);
+ OnStack.insert(SuccBB);
+ goto dfs_next_succ;
+ }
+
+ if (!OnStack.count(SuccBB)) {
+ BBStates[CurrBB].addSucc(SuccBB);
+ BBStates[SuccBB].addPred(CurrBB);
+ }
+ }
+ OnStack.erase(CurrBB);
+ PostOrder.push_back(CurrBB);
+ SuccStack.pop_back();
+ } while (!SuccStack.empty());
+
+ Visited.clear();
+
+ // Do reverse-CFG DFS, computing the reverse-CFG PostOrder.
+ // Functions may have many exits, and there are also blocks which we treat
+ // as exits due to ignored edges.
+ SmallVector<std::pair<BasicBlock *, BBState::edge_iterator>, 16> PredStack;
+ for (BasicBlock &ExitBB : F) {
+ BBState &MyStates = BBStates[&ExitBB];
+ if (!MyStates.isExit())
+ continue;
+
+ MyStates.SetAsExit();
+
+ PredStack.push_back(std::make_pair(&ExitBB, MyStates.pred_begin()));
+ Visited.insert(&ExitBB);
+ while (!PredStack.empty()) {
+ reverse_dfs_next_succ:
+ BBState::edge_iterator PE = BBStates[PredStack.back().first].pred_end();
+ while (PredStack.back().second != PE) {
+ BasicBlock *BB = *PredStack.back().second++;
+ if (Visited.insert(BB).second) {
+ PredStack.push_back(std::make_pair(BB, BBStates[BB].pred_begin()));
+ goto reverse_dfs_next_succ;
+ }
+ }
+ ReverseCFGPostOrder.push_back(PredStack.pop_back_val().first);
+ }
+ }
+}
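For reference, a minimal standalone sketch (not part of the pass, using a toy adjacency-list graph rather than LLVM basic blocks) of the same explicit-stack, non-recursive DFS shape used above: each stack entry remembers which successor to try next, and a block is emitted once all its successors are done. Reversing the resulting post-order gives a reverse post-order suitable for the top-down walk.

#include <cstdio>
#include <cstddef>
#include <utility>
#include <vector>

using Graph = std::vector<std::vector<int>>; // Succ[v] = successors of v.

std::vector<int> postOrder(const Graph &Succ, int Entry) {
  std::vector<bool> Visited(Succ.size(), false);
  std::vector<std::pair<int, std::size_t>> Stack; // (block, next succ index)
  std::vector<int> Order;
  Stack.push_back({Entry, 0});
  Visited[Entry] = true;
  while (!Stack.empty()) {
    auto &[BB, NextSucc] = Stack.back();
    if (NextSucc < Succ[BB].size()) {
      int S = Succ[BB][NextSucc++];
      if (!Visited[S]) {
        Visited[S] = true;
        Stack.push_back({S, 0}); // Descend without recursion.
      }
    } else {
      Order.push_back(BB); // All successors done: emit in post-order.
      Stack.pop_back();
    }
  }
  return Order;
}

int main() {
  // A small diamond CFG: 0 -> {1,2}, 1 -> 3, 2 -> 3.
  Graph Succ = {{1, 2}, {3}, {3}, {}};
  for (int BB : postOrder(Succ, 0))
    std::printf("%d ", BB); // Prints "3 1 2 0"; reversing gives an RPO.
  std::printf("\n");
}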
+
+// Visit the function both top-down and bottom-up.
+bool ObjCARCOpt::Visit(Function &F,
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases) {
+ // Use reverse-postorder traversals, because we magically know that loops
+ // will be well behaved, i.e. they won't repeatedly call retain on a single
+ // pointer without doing a release. We can't use the ReversePostOrderTraversal
+ // class here because we want the reverse-CFG postorder to consider each
+ // function exit point, and we want to ignore selected cycle edges.
+ SmallVector<BasicBlock *, 16> PostOrder;
+ SmallVector<BasicBlock *, 16> ReverseCFGPostOrder;
+ ComputePostOrders(F, PostOrder, ReverseCFGPostOrder,
+ MDKindCache.get(ARCMDKindID::NoObjCARCExceptions),
+ BBStates);
+
+ // Use reverse-postorder on the reverse CFG for bottom-up.
+ bool BottomUpNestingDetected = false;
+ for (BasicBlock *BB : llvm::reverse(ReverseCFGPostOrder)) {
+ BottomUpNestingDetected |= VisitBottomUp(BB, BBStates, Retains);
+ if (DisableRetainReleasePairing)
+ return false;
+ }
+
+ // Use reverse-postorder for top-down.
+ bool TopDownNestingDetected = false;
+ for (BasicBlock *BB : llvm::reverse(PostOrder)) {
+ TopDownNestingDetected |= VisitTopDown(BB, BBStates, Releases);
+ if (DisableRetainReleasePairing)
+ return false;
+ }
+
+ return TopDownNestingDetected && BottomUpNestingDetected;
+}
+
+/// Move the calls in RetainsToMove and ReleasesToMove.
+void ObjCARCOpt::MoveCalls(Value *Arg, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases,
+ SmallVectorImpl<Instruction *> &DeadInsts,
+ Module *M) {
+ Type *ArgTy = Arg->getType();
+ Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext()));
+
+ LLVM_DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n");
+
+ // Insert the new retain and release calls.
+ for (Instruction *InsertPt : ReleasesToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ Call->setDoesNotThrow();
+ Call->setTailCall();
+
+ LLVM_DEBUG(dbgs() << "Inserting new Retain: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
+ }
+ for (Instruction *InsertPt : RetainsToMove.ReverseInsertPts) {
+ Value *MyArg = ArgTy == ParamTy ? Arg :
+ new BitCastInst(Arg, ParamTy, "", InsertPt);
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Release);
+ CallInst *Call = CallInst::Create(Decl, MyArg, "", InsertPt);
+ // Attach a clang.imprecise_release metadata tag, if appropriate.
+ if (MDNode *M = ReleasesToMove.ReleaseMetadata)
+ Call->setMetadata(MDKindCache.get(ARCMDKindID::ImpreciseRelease), M);
+ Call->setDoesNotThrow();
+ if (ReleasesToMove.IsTailCallRelease)
+ Call->setTailCall();
+
+ LLVM_DEBUG(dbgs() << "Inserting new Release: " << *Call
+ << "\n"
+ "At insertion point: "
+ << *InsertPt << "\n");
+ }
+
+ // Delete the original retain and release calls.
+ for (Instruction *OrigRetain : RetainsToMove.Calls) {
+ Retains.blot(OrigRetain);
+ DeadInsts.push_back(OrigRetain);
+ LLVM_DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n");
+ }
+ for (Instruction *OrigRelease : ReleasesToMove.Calls) {
+ Releases.erase(OrigRelease);
+ DeadInsts.push_back(OrigRelease);
+ LLVM_DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n");
+ }
+}
+
+bool ObjCARCOpt::PairUpRetainsAndReleases(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M,
+ Instruction *Retain,
+ SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
+ RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
+ bool &AnyPairsCompletelyEliminated) {
+ // If a pair happens in a region where it is known that the reference count
+ // is already incremented, we can similarly ignore possible decrements unless
+ // we are dealing with a retainable object with multiple provenance sources.
+ bool KnownSafeTD = true, KnownSafeBU = true;
+ bool CFGHazardAfflicted = false;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ // This is an iterative process so that we connect multiple releases
+ // to multiple retains if needed.
+ unsigned OldDelta = 0;
+ unsigned NewDelta = 0;
+ unsigned OldCount = 0;
+ unsigned NewCount = 0;
+ bool FirstRelease = true;
+ for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+ SmallVector<Instruction *, 4> NewReleases;
+ for (Instruction *NewRetain : NewRetains) {
+ auto It = Retains.find(NewRetain);
+ assert(It != Retains.end());
+ const RRInfo &NewRetainRRI = It->second;
+ KnownSafeTD &= NewRetainRRI.KnownSafe;
+ CFGHazardAfflicted |= NewRetainRRI.CFGHazardAfflicted;
+ for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
+ auto Jt = Releases.find(NewRetainRelease);
+ if (Jt == Releases.end())
+ return false;
+ const RRInfo &NewRetainReleaseRRI = Jt->second;
+
+ // If the release does not have a reference to the retain as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewRetainReleaseRRI.Calls.count(NewRetain))
+ return false;
+
+ if (ReleasesToMove.Calls.insert(NewRetainRelease).second) {
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewRetainRelease->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta -= PathCount;
+
+ // Merge the ReleaseMetadata and IsTailCallRelease values.
+ if (FirstRelease) {
+ ReleasesToMove.ReleaseMetadata =
+ NewRetainReleaseRRI.ReleaseMetadata;
+ ReleasesToMove.IsTailCallRelease =
+ NewRetainReleaseRRI.IsTailCallRelease;
+ FirstRelease = false;
+ } else {
+ if (ReleasesToMove.ReleaseMetadata !=
+ NewRetainReleaseRRI.ReleaseMetadata)
+ ReleasesToMove.ReleaseMetadata = nullptr;
+ if (ReleasesToMove.IsTailCallRelease !=
+ NewRetainReleaseRRI.IsTailCallRelease)
+ ReleasesToMove.IsTailCallRelease = false;
+ }
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewRetainReleaseRRI.ReverseInsertPts) {
+ if (ReleasesToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta -= PathCount;
+ }
+ }
+ NewReleases.push_back(NewRetainRelease);
+ }
+ }
+ }
+ NewRetains.clear();
+ if (NewReleases.empty()) break;
+
+ // Back the other way.
+ for (Instruction *NewRelease : NewReleases) {
+ auto It = Releases.find(NewRelease);
+ assert(It != Releases.end());
+ const RRInfo &NewReleaseRRI = It->second;
+ KnownSafeBU &= NewReleaseRRI.KnownSafe;
+ CFGHazardAfflicted |= NewReleaseRRI.CFGHazardAfflicted;
+ for (Instruction *NewReleaseRetain : NewReleaseRRI.Calls) {
+ auto Jt = Retains.find(NewReleaseRetain);
+ if (Jt == Retains.end())
+ return false;
+ const RRInfo &NewReleaseRetainRRI = Jt->second;
+
+ // If the retain does not have a reference to the release as well,
+ // something happened which is unaccounted for. Do not do anything.
+ //
+ // This can happen if we catch an additive overflow during path count
+ // merging.
+ if (!NewReleaseRetainRRI.Calls.count(NewRelease))
+ return false;
+
+ if (RetainsToMove.Calls.insert(NewReleaseRetain).second) {
+ // If we overflow when we compute the path count, don't remove/move
+ // anything.
+ const BBState &NRRBBState = BBStates[NewReleaseRetain->getParent()];
+ unsigned PathCount = BBState::OverflowOccurredValue;
+ if (NRRBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ OldDelta += PathCount;
+ OldCount += PathCount;
+
+ // Collect the optimal insertion points.
+ if (!KnownSafe)
+ for (Instruction *RIP : NewReleaseRetainRRI.ReverseInsertPts) {
+ if (RetainsToMove.ReverseInsertPts.insert(RIP).second) {
+ // If we overflow when we compute the path count, don't
+ // remove/move anything.
+ const BBState &RIPBBState = BBStates[RIP->getParent()];
+
+ PathCount = BBState::OverflowOccurredValue;
+ if (RIPBBState.GetAllPathCountWithOverflow(PathCount))
+ return false;
+ assert(PathCount != BBState::OverflowOccurredValue &&
+ "PathCount at this point can not be "
+ "OverflowOccurredValue.");
+ NewDelta += PathCount;
+ NewCount += PathCount;
+ }
+ }
+ NewRetains.push_back(NewReleaseRetain);
+ }
+ }
+ }
+ if (NewRetains.empty()) break;
+ }
+
+ // We can only remove pointers if we are known safe in both directions.
+ bool UnconditionallySafe = KnownSafeTD && KnownSafeBU;
+ if (UnconditionallySafe) {
+ RetainsToMove.ReverseInsertPts.clear();
+ ReleasesToMove.ReverseInsertPts.clear();
+ NewCount = 0;
+ } else {
+ // Determine whether the new insertion points we computed preserve the
+ // balance of retain and release calls through the program.
+ // TODO: If the fully aggressive solution isn't valid, try to find a
+ // less aggressive solution which is.
+ if (NewDelta != 0)
+ return false;
+
+ // At this point, we are not going to remove any RR pairs, but we still are
+ // able to move RR pairs. If one of our pointers is afflicted with
+ // CFGHazards, we cannot perform such code motion so exit early.
+ const bool WillPerformCodeMotion =
+ !RetainsToMove.ReverseInsertPts.empty() ||
+ !ReleasesToMove.ReverseInsertPts.empty();
+ if (CFGHazardAfflicted && WillPerformCodeMotion)
+ return false;
+ }
+
+ // Determine whether the original call points are balanced in the retain and
+ // release calls through the program. If not, conservatively don't touch
+ // them.
+ // TODO: It's theoretically possible to do code motion in this case, as
+ // long as the existing imbalances are maintained.
+ if (OldDelta != 0)
+ return false;
+
+ Changed = true;
+ assert(OldCount != 0 && "Unreachable code?");
+ NumRRs += OldCount - NewCount;
+ // Set to true if we completely removed any RR pairs.
+ AnyPairsCompletelyEliminated = NewCount == 0;
+
+ // We can move calls!
+ return true;
+}
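As a worked illustration of the OldDelta bookkeeping above (not part of the pass; the path counts are made-up inputs rather than values computed by BBState), retains add their block's path count, releases subtract theirs, and a nonzero total means the two sides do not cover the same set of paths, so nothing may be removed or moved.

#include <cstdio>
#include <vector>

struct CallSite {
  bool IsRetain;  // true: objc_retain, false: objc_release.
  int PathCount;  // Number of CFG paths through the call's block (made up).
};

// The pairing is only removable when retains and releases cover exactly the
// same set of paths, i.e. the signed sum of path counts is zero.
bool isBalanced(const std::vector<CallSite> &Sites) {
  int Delta = 0;
  for (const CallSite &CS : Sites)
    Delta += CS.IsRetain ? CS.PathCount : -CS.PathCount;
  return Delta == 0;
}

int main() {
  // A retain whose block lies on two paths, matched with one release in each
  // arm of the diamond: 2 - 1 - 1 == 0, so the pair may be removed.
  std::printf("%d\n", isBalanced({{true, 2}, {false, 1}, {false, 1}}));
  // The same retain matched with a release on only one path is unbalanced.
  std::printf("%d\n", isBalanced({{true, 2}, {false, 1}}));
}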
+
+/// Identify pairings between the retains and releases, and delete and/or move
+/// them.
+bool ObjCARCOpt::PerformCodePlacement(
+ DenseMap<const BasicBlock *, BBState> &BBStates,
+ BlotMapVector<Value *, RRInfo> &Retains,
+ DenseMap<Value *, RRInfo> &Releases, Module *M) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
+
+ bool AnyPairsCompletelyEliminated = false;
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ // Visit each retain.
+ for (BlotMapVector<Value *, RRInfo>::const_iterator I = Retains.begin(),
+ E = Retains.end();
+ I != E; ++I) {
+ Value *V = I->first;
+ if (!V) continue; // blotted
+
+ Instruction *Retain = cast<Instruction>(V);
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Retain << "\n");
+
+ Value *Arg = GetArgRCIdentityRoot(Retain);
+
+ // If the object being released is in static or stack storage, we know it's
+ // not being managed by ObjC reference counting, so we can delete pairs
+ // regardless of what possible decrements or uses lie between them.
+ bool KnownSafe = isa<Constant>(Arg) || isa<AllocaInst>(Arg);
+
+ // A constant pointer can't be pointing to an object on the heap. It may
+ // be reference-counted, but it won't be deleted.
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Arg))
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(
+ GetRCIdentityRoot(LI->getPointerOperand())))
+ if (GV->isConstant())
+ KnownSafe = true;
+
+ // Connect the dots between the top-down-collected RetainsToMove and
+ // bottom-up-collected ReleasesToMove to form sets of related calls.
+ RRInfo RetainsToMove, ReleasesToMove;
+
+ bool PerformMoveCalls = PairUpRetainsAndReleases(
+ BBStates, Retains, Releases, M, Retain, DeadInsts,
+ RetainsToMove, ReleasesToMove, Arg, KnownSafe,
+ AnyPairsCompletelyEliminated);
+
+ if (PerformMoveCalls) {
+ // Ok, everything checks out and we're all set. Let's move/delete some
+ // code!
+ MoveCalls(Arg, RetainsToMove, ReleasesToMove,
+ Retains, Releases, DeadInsts, M);
+ }
+ }
+
+ // Now that we're done moving everything, we can delete the newly dead
+ // instructions, as we no longer need them as insert points.
+ while (!DeadInsts.empty())
+ EraseInstruction(DeadInsts.pop_back_val());
+
+ return AnyPairsCompletelyEliminated;
+}
+
+/// Weak pointer optimizations.
+void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n");
+
+ // First, do memdep-style RLE and S2L optimizations. We can't use memdep
+ // itself because it uses AliasAnalysis and we need to do provenance
+ // queries instead.
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Inst << "\n");
+
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::LoadWeak &&
+ Class != ARCInstKind::LoadWeakRetained)
+ continue;
+
+ // Delete objc_loadWeak calls with no users.
+ if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
+ Inst->eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ // TODO: For now, just look for an earlier available version of this value
+ // within the same block. Theoretically, we could do memdep-style non-local
+ // analysis too, but that would want caching. A better approach would be to
+ // use the technique that EarlyCSE uses.
+ inst_iterator Current = std::prev(I);
+ BasicBlock *CurrentBB = &*Current.getBasicBlockIterator();
+ for (BasicBlock::iterator B = CurrentBB->begin(),
+ J = Current.getInstructionIterator();
+ J != B; --J) {
+ Instruction *EarlierInst = &*std::prev(J);
+ ARCInstKind EarlierClass = GetARCInstKind(EarlierInst);
+ switch (EarlierClass) {
+ case ARCInstKind::LoadWeak:
+ case ARCInstKind::LoadWeakRetained: {
+ // If this is loading from the same pointer, replace this load's value
+ // with that one.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall);
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::InitWeak: {
+ // If this is storing to the same pointer and has the same size etc.,
+ // replace this load's value with the stored value.
+ CallInst *Call = cast<CallInst>(Inst);
+ CallInst *EarlierCall = cast<CallInst>(EarlierInst);
+ Value *Arg = Call->getArgOperand(0);
+ Value *EarlierArg = EarlierCall->getArgOperand(0);
+ switch (PA.getAA()->alias(Arg, EarlierArg)) {
+ case MustAlias:
+ Changed = true;
+ // If the load has a builtin retain, insert a plain retain for it.
+ if (Class == ARCInstKind::LoadWeakRetained) {
+ Function *Decl = EP.get(ARCRuntimeEntryPointKind::Retain);
+ CallInst *CI = CallInst::Create(Decl, EarlierCall, "", Call);
+ CI->setTailCall();
+ }
+ // Zap the fully redundant load.
+ Call->replaceAllUsesWith(EarlierCall->getArgOperand(1));
+ Call->eraseFromParent();
+ goto clobbered;
+ case MayAlias:
+ case PartialAlias:
+ goto clobbered;
+ case NoAlias:
+ break;
+ }
+ break;
+ }
+ case ARCInstKind::MoveWeak:
+ case ARCInstKind::CopyWeak:
+        // TODO: Grab the copied value.
+ goto clobbered;
+ case ARCInstKind::AutoreleasepoolPush:
+ case ARCInstKind::None:
+ case ARCInstKind::IntrinsicUser:
+ case ARCInstKind::User:
+ // Weak pointers are only modified through the weak entry points
+ // (and arbitrary calls, which could call the weak entry points).
+ break;
+ default:
+ // Anything else could modify the weak pointer.
+ goto clobbered;
+ }
+ }
+ clobbered:;
+ }
+
+ // Then, for each destroyWeak with an alloca operand, check to see if
+ // the alloca and all its users can be zapped.
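+  // For example (illustrative), a weak slot whose only users are the weak
+  // entry points themselves can be removed outright:
+  //   %w = alloca i8*
+  //   call i8* @objc_initWeak(i8** %w, i8* %v)   ; uses become uses of %v
+  //   call void @objc_destroyWeak(i8** %w)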
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ ARCInstKind Class = GetBasicARCInstKind(Inst);
+ if (Class != ARCInstKind::DestroyWeak)
+ continue;
+
+ CallInst *Call = cast<CallInst>(Inst);
+ Value *Arg = Call->getArgOperand(0);
+ if (AllocaInst *Alloca = dyn_cast<AllocaInst>(Arg)) {
+ for (User *U : Alloca->users()) {
+ const Instruction *UserInst = cast<Instruction>(U);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ case ARCInstKind::DestroyWeak:
+ continue;
+ default:
+ goto done;
+ }
+ }
+ Changed = true;
+ for (auto UI = Alloca->user_begin(), UE = Alloca->user_end(); UI != UE;) {
+ CallInst *UserInst = cast<CallInst>(*UI++);
+ switch (GetBasicARCInstKind(UserInst)) {
+ case ARCInstKind::InitWeak:
+ case ARCInstKind::StoreWeak:
+ // These functions return their second argument.
+ UserInst->replaceAllUsesWith(UserInst->getArgOperand(1));
+ break;
+ case ARCInstKind::DestroyWeak:
+ // No return value.
+ break;
+ default:
+ llvm_unreachable("alloca really is used!");
+ }
+ UserInst->eraseFromParent();
+ }
+ Alloca->eraseFromParent();
+ done:;
+ }
+ }
+}
+
+/// Identify program paths which execute sequences of retains and releases which
+/// can be eliminated.
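+///
+/// For example (illustrative), when the bottom-up and top-down analyses prove
+/// that a pair is balanced on every path and that nothing in between needs the
+/// extra reference, a pair such as
+/// \code
+///   %1 = call i8* @objc_retain(i8* %x)
+///   ...
+///   call void @objc_release(i8* %x)
+/// \endcode
+/// is removed entirely.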
+bool ObjCARCOpt::OptimizeSequences(Function &F) {
+ // Releases, Retains - These are used to store the results of the main flow
+ // analysis. These use Value* as the key instead of Instruction* so that the
+ // map stays valid when we get around to rewriting code and calls get
+ // replaced by arguments.
+ DenseMap<Value *, RRInfo> Releases;
+ BlotMapVector<Value *, RRInfo> Retains;
+
+ // This is used during the traversal of the function to track the
+ // states for each identified object at each block.
+ DenseMap<const BasicBlock *, BBState> BBStates;
+
+ // Analyze the CFG of the function, and all instructions.
+ bool NestingDetected = Visit(F, BBStates, Retains, Releases);
+
+ if (DisableRetainReleasePairing)
+ return false;
+
+ // Transform.
+ bool AnyPairsCompletelyEliminated = PerformCodePlacement(BBStates, Retains,
+ Releases,
+ F.getParent());
+
+ return AnyPairsCompletelyEliminated && NestingDetected;
+}
+
+/// Check for an earlier call on which the Retain depends, such that nothing in
+/// between the Retain and that call can affect the reference count of their
+/// shared pointer argument. Note that Retain need not be in BB.
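+///
+/// For example (illustrative), this matches the case where the retained value
+/// is the return value of the preceding call:
+/// \code
+///   %call = call i8* @something(...)
+///   %0 = call i8* @objc_retain(i8* %call)
+/// \endcode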
static CallInst *HasSafePathToPredecessorCall(const Value *Arg,
Instruction *Retain,
ProvenanceAnalysis &PA) {
auto *Call = dyn_cast_or_null<CallInst>(findSingleDependency(
CanChangeRetainCount, Arg, Retain->getParent(), Retain, PA));
-
- // Check that the pointer is the return value of the call.
- if (!Call || Arg != Call)
+
+ // Check that the pointer is the return value of the call.
+ if (!Call || Arg != Call)
return nullptr;
-
- // Check that the call is a regular call.
- ARCInstKind Class = GetBasicARCInstKind(Call);
+
+ // Check that the call is a regular call.
+ ARCInstKind Class = GetBasicARCInstKind(Call);
return Class == ARCInstKind::CallOrUser || Class == ARCInstKind::Call
? Call
: nullptr;
-}
-
-/// Find a dependent retain that precedes the given autorelease for which there
-/// is nothing in between the two instructions that can affect the ref count of
-/// Arg.
-static CallInst *
-FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
- Instruction *Autorelease,
- ProvenanceAnalysis &PA) {
+}
+
+/// Find a dependent retain that precedes the given autorelease for which there
+/// is nothing in between the two instructions that can affect the ref count of
+/// Arg.
+static CallInst *
+FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB,
+ Instruction *Autorelease,
+ ProvenanceAnalysis &PA) {
auto *Retain = dyn_cast_or_null<CallInst>(
findSingleDependency(CanChangeRetainCount, Arg, BB, Autorelease, PA));
-
- // Check that we found a retain with the same argument.
- if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
- GetArgRCIdentityRoot(Retain) != Arg) {
- return nullptr;
- }
-
- return Retain;
-}
-
-/// Look for an ``autorelease'' instruction dependent on Arg such that there are
-/// no instructions dependent on Arg that need a positive ref count in between
-/// the autorelease and the ret.
-static CallInst *
-FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
- ReturnInst *Ret,
- ProvenanceAnalysis &PA) {
+
+ // Check that we found a retain with the same argument.
+ if (!Retain || !IsRetain(GetBasicARCInstKind(Retain)) ||
+ GetArgRCIdentityRoot(Retain) != Arg) {
+ return nullptr;
+ }
+
+ return Retain;
+}
+
+/// Look for an ``autorelease'' instruction dependent on Arg such that there are
+/// no instructions dependent on Arg that need a positive ref count in between
+/// the autorelease and the ret.
+static CallInst *
+FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB,
+ ReturnInst *Ret,
+ ProvenanceAnalysis &PA) {
SmallPtrSet<Instruction *, 4> DepInsts;
auto *Autorelease = dyn_cast_or_null<CallInst>(
findSingleDependency(NeedsPositiveRetainCount, Arg, BB, Ret, PA));
-
- if (!Autorelease)
- return nullptr;
- ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
- if (!IsAutorelease(AutoreleaseClass))
- return nullptr;
- if (GetArgRCIdentityRoot(Autorelease) != Arg)
- return nullptr;
-
- return Autorelease;
-}
-
-/// Look for this pattern:
-/// \code
-/// %call = call i8* @something(...)
-/// %2 = call i8* @objc_retain(i8* %call)
-/// %3 = call i8* @objc_autorelease(i8* %2)
-/// ret i8* %3
-/// \endcode
-/// And delete the retain and autorelease.
-void ObjCARCOpt::OptimizeReturns(Function &F) {
- if (!F.getReturnType()->isPointerTy())
- return;
-
- LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
-
- for (BasicBlock &BB: F) {
- ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
- if (!Ret)
- continue;
-
- LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
-
- const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
-
- // Look for an ``autorelease'' instruction that is a predecessor of Ret and
- // dependent on Arg such that there are no instructions dependent on Arg
- // that need a positive ref count in between the autorelease and Ret.
+
+ if (!Autorelease)
+ return nullptr;
+ ARCInstKind AutoreleaseClass = GetBasicARCInstKind(Autorelease);
+ if (!IsAutorelease(AutoreleaseClass))
+ return nullptr;
+ if (GetArgRCIdentityRoot(Autorelease) != Arg)
+ return nullptr;
+
+ return Autorelease;
+}
+
+/// Look for this pattern:
+/// \code
+/// %call = call i8* @something(...)
+/// %2 = call i8* @objc_retain(i8* %call)
+/// %3 = call i8* @objc_autorelease(i8* %2)
+/// ret i8* %3
+/// \endcode
+/// And delete the retain and autorelease.
+void ObjCARCOpt::OptimizeReturns(Function &F) {
+ if (!F.getReturnType()->isPointerTy())
+ return;
+
+ LLVM_DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n");
+
+ for (BasicBlock &BB: F) {
+ ReturnInst *Ret = dyn_cast<ReturnInst>(&BB.back());
+ if (!Ret)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Visiting: " << *Ret << "\n");
+
+ const Value *Arg = GetRCIdentityRoot(Ret->getOperand(0));
+
+ // Look for an ``autorelease'' instruction that is a predecessor of Ret and
+ // dependent on Arg such that there are no instructions dependent on Arg
+ // that need a positive ref count in between the autorelease and Ret.
CallInst *Autorelease =
FindPredecessorAutoreleaseWithSafePath(Arg, &BB, Ret, PA);
-
- if (!Autorelease)
- continue;
-
- CallInst *Retain = FindPredecessorRetainWithSafePath(
+
+ if (!Autorelease)
+ continue;
+
+ CallInst *Retain = FindPredecessorRetainWithSafePath(
Arg, Autorelease->getParent(), Autorelease, PA);
-
- if (!Retain)
- continue;
-
- // Check that there is nothing that can affect the reference count
- // between the retain and the call. Note that Retain need not be in BB.
+
+ if (!Retain)
+ continue;
+
+ // Check that there is nothing that can affect the reference count
+ // between the retain and the call. Note that Retain need not be in BB.
CallInst *Call = HasSafePathToPredecessorCall(Arg, Retain, PA);
-
- // Don't remove retainRV/autoreleaseRV pairs if the call isn't a tail call.
+
+ // Don't remove retainRV/autoreleaseRV pairs if the call isn't a tail call.
if (!Call ||
(!Call->isTailCall() &&
GetBasicARCInstKind(Retain) == ARCInstKind::RetainRV &&
GetBasicARCInstKind(Autorelease) == ARCInstKind::AutoreleaseRV))
- continue;
-
- // If so, we can zap the retain and autorelease.
- Changed = true;
- ++NumRets;
- LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
- << "\n");
- EraseInstruction(Retain);
- EraseInstruction(Autorelease);
- }
-}
-
-#ifndef NDEBUG
-void
-ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
- Statistic &NumRetains =
- AfterOptimization ? NumRetainsAfterOpt : NumRetainsBeforeOpt;
- Statistic &NumReleases =
- AfterOptimization ? NumReleasesAfterOpt : NumReleasesBeforeOpt;
-
- for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
- Instruction *Inst = &*I++;
- switch (GetBasicARCInstKind(Inst)) {
- default:
- break;
- case ARCInstKind::Retain:
- ++NumRetains;
- break;
- case ARCInstKind::Release:
- ++NumReleases;
- break;
- }
- }
-}
-#endif
-
+ continue;
+
+ // If so, we can zap the retain and autorelease.
+ Changed = true;
+ ++NumRets;
+ LLVM_DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " << *Autorelease
+ << "\n");
+ EraseInstruction(Retain);
+ EraseInstruction(Autorelease);
+ }
+}
+
+#ifndef NDEBUG
+void
+ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) {
+ Statistic &NumRetains =
+ AfterOptimization ? NumRetainsAfterOpt : NumRetainsBeforeOpt;
+ Statistic &NumReleases =
+ AfterOptimization ? NumReleasesAfterOpt : NumReleasesBeforeOpt;
+
+ for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) {
+ Instruction *Inst = &*I++;
+ switch (GetBasicARCInstKind(Inst)) {
+ default:
+ break;
+ case ARCInstKind::Retain:
+ ++NumRetains;
+ break;
+ case ARCInstKind::Release:
+ ++NumReleases;
+ break;
+ }
+ }
+}
+#endif
+
void ObjCARCOpt::init(Module &M) {
- if (!EnableARCOpts)
+ if (!EnableARCOpts)
return;
-
- // If nothing in the Module uses ARC, don't do anything.
- Run = ModuleHasARC(M);
- if (!Run)
+
+ // If nothing in the Module uses ARC, don't do anything.
+ Run = ModuleHasARC(M);
+ if (!Run)
return;
-
-  // Intuitively, objc_retain and others are nocapture; however, in practice
-  // they are not, because they return their argument value. And objc_release
- // calls finalizers which can have arbitrary side effects.
- MDKindCache.init(&M);
-
- // Initialize our runtime entry point cache.
- EP.init(&M);
-}
-
+
+  // Intuitively, objc_retain and others are nocapture; however, in practice
+  // they are not, because they return their argument value. And objc_release
+ // calls finalizers which can have arbitrary side effects.
+ MDKindCache.init(&M);
+
+ // Initialize our runtime entry point cache.
+ EP.init(&M);
+}
+
bool ObjCARCOpt::run(Function &F, AAResults &AA) {
- if (!EnableARCOpts)
- return false;
-
- // If nothing in the Module uses ARC, don't do anything.
- if (!Run)
- return false;
-
- Changed = false;
-
- LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
- << " >>>"
- "\n");
-
+ if (!EnableARCOpts)
+ return false;
+
+ // If nothing in the Module uses ARC, don't do anything.
+ if (!Run)
+ return false;
+
+ Changed = false;
+
+ LLVM_DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName()
+ << " >>>"
+ "\n");
+
PA.setAA(&AA);
-
-#ifndef NDEBUG
- if (AreStatisticsEnabled()) {
- GatherStatistics(F, false);
- }
-#endif
-
- // This pass performs several distinct transformations. As a compile-time aid
- // when compiling code that isn't ObjC, skip these if the relevant ObjC
- // library functions aren't declared.
-
- // Preliminary optimizations. This also computes UsedInThisFunction.
- OptimizeIndividualCalls(F);
-
- // Optimizations for weak pointers.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
- (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
- (1 << unsigned(ARCInstKind::StoreWeak)) |
- (1 << unsigned(ARCInstKind::InitWeak)) |
- (1 << unsigned(ARCInstKind::CopyWeak)) |
- (1 << unsigned(ARCInstKind::MoveWeak)) |
- (1 << unsigned(ARCInstKind::DestroyWeak))))
- OptimizeWeakCalls(F);
-
- // Optimizations for retain+release pairs.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
- (1 << unsigned(ARCInstKind::RetainRV)) |
- (1 << unsigned(ARCInstKind::RetainBlock))))
- if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
- // Run OptimizeSequences until it either stops making changes or
- // no retain+release pair nesting is detected.
- while (OptimizeSequences(F)) {}
-
- // Optimizations if objc_autorelease is used.
- if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
- (1 << unsigned(ARCInstKind::AutoreleaseRV))))
- OptimizeReturns(F);
-
- // Gather statistics after optimization.
-#ifndef NDEBUG
- if (AreStatisticsEnabled()) {
- GatherStatistics(F, true);
- }
-#endif
-
- LLVM_DEBUG(dbgs() << "\n");
-
- return Changed;
-}
-
-void ObjCARCOpt::releaseMemory() {
- PA.clear();
-}
-
-/// @}
-///
+
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, false);
+ }
+#endif
+
+ // This pass performs several distinct transformations. As a compile-time aid
+ // when compiling code that isn't ObjC, skip these if the relevant ObjC
+ // library functions aren't declared.
+
+ // Preliminary optimizations. This also computes UsedInThisFunction.
+ OptimizeIndividualCalls(F);
+
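+  // UsedInThisFunction is a bit mask with one bit per ARCInstKind, so each of
+  // the phases below can be skipped cheaply when the ARC calls it cares about
+  // never occur in this function.
+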
+ // Optimizations for weak pointers.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::LoadWeak)) |
+ (1 << unsigned(ARCInstKind::LoadWeakRetained)) |
+ (1 << unsigned(ARCInstKind::StoreWeak)) |
+ (1 << unsigned(ARCInstKind::InitWeak)) |
+ (1 << unsigned(ARCInstKind::CopyWeak)) |
+ (1 << unsigned(ARCInstKind::MoveWeak)) |
+ (1 << unsigned(ARCInstKind::DestroyWeak))))
+ OptimizeWeakCalls(F);
+
+ // Optimizations for retain+release pairs.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Retain)) |
+ (1 << unsigned(ARCInstKind::RetainRV)) |
+ (1 << unsigned(ARCInstKind::RetainBlock))))
+ if (UsedInThisFunction & (1 << unsigned(ARCInstKind::Release)))
+ // Run OptimizeSequences until it either stops making changes or
+ // no retain+release pair nesting is detected.
+ while (OptimizeSequences(F)) {}
+
+ // Optimizations if objc_autorelease is used.
+ if (UsedInThisFunction & ((1 << unsigned(ARCInstKind::Autorelease)) |
+ (1 << unsigned(ARCInstKind::AutoreleaseRV))))
+ OptimizeReturns(F);
+
+ // Gather statistics after optimization.
+#ifndef NDEBUG
+ if (AreStatisticsEnabled()) {
+ GatherStatistics(F, true);
+ }
+#endif
+
+ LLVM_DEBUG(dbgs() << "\n");
+
+ return Changed;
+}
+
+void ObjCARCOpt::releaseMemory() {
+ PA.clear();
+}
+
+/// @}
+///
PreservedAnalyses ObjCARCOptPass::run(Function &F,
FunctionAnalysisManager &AM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
index 1e4fb458a3..3d59b2edc5 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.cpp
@@ -1,179 +1,179 @@
-//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file defines a special form of Alias Analysis called ``Provenance
-/// Analysis''. The word ``provenance'' refers to the history of the ownership
-/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
-/// use various techniques to determine if locally visible pointer values could
-/// share a provenance source and thus be related.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
- const Value *B) {
- // If the values are Selects with the same condition, we can do a more precise
- // check: just check for relations between the values on corresponding arms.
- if (const SelectInst *SB = dyn_cast<SelectInst>(B))
- if (A->getCondition() == SB->getCondition())
+//===- ProvenanceAnalysis.cpp - ObjC ARC Optimization ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file defines a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally visible pointer values could
+/// share a provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+bool ProvenanceAnalysis::relatedSelect(const SelectInst *A,
+ const Value *B) {
+ // If the values are Selects with the same condition, we can do a more precise
+ // check: just check for relations between the values on corresponding arms.
+ if (const SelectInst *SB = dyn_cast<SelectInst>(B))
+ if (A->getCondition() == SB->getCondition())
return related(A->getTrueValue(), SB->getTrueValue()) ||
related(A->getFalseValue(), SB->getFalseValue());
-
- // Check both arms of the Select node individually.
+
+ // Check both arms of the Select node individually.
return related(A->getTrueValue(), B) || related(A->getFalseValue(), B);
-}
-
-bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
- const Value *B) {
- // If the values are PHIs in the same block, we can do a more precise as well
- // as efficient check: just check for relations between the values on
- // corresponding edges.
- if (const PHINode *PNB = dyn_cast<PHINode>(B))
- if (PNB->getParent() == A->getParent()) {
- for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
- if (related(A->getIncomingValue(i),
+}
+
+bool ProvenanceAnalysis::relatedPHI(const PHINode *A,
+ const Value *B) {
+ // If the values are PHIs in the same block, we can do a more precise as well
+ // as efficient check: just check for relations between the values on
+ // corresponding edges.
+ if (const PHINode *PNB = dyn_cast<PHINode>(B))
+ if (PNB->getParent() == A->getParent()) {
+ for (unsigned i = 0, e = A->getNumIncomingValues(); i != e; ++i)
+ if (related(A->getIncomingValue(i),
PNB->getIncomingValueForBlock(A->getIncomingBlock(i))))
- return true;
- return false;
- }
-
- // Check each unique source of the PHI node against B.
- SmallPtrSet<const Value *, 4> UniqueSrc;
- for (Value *PV1 : A->incoming_values()) {
+ return true;
+ return false;
+ }
+
+ // Check each unique source of the PHI node against B.
+ SmallPtrSet<const Value *, 4> UniqueSrc;
+ for (Value *PV1 : A->incoming_values()) {
if (UniqueSrc.insert(PV1).second && related(PV1, B))
- return true;
- }
-
- // All of the arms checked out.
- return false;
-}
-
-/// Test if the value of P, or any value covered by its provenance, is ever
-/// stored within the function (not counting callees).
-static bool IsStoredObjCPointer(const Value *P) {
- SmallPtrSet<const Value *, 8> Visited;
- SmallVector<const Value *, 8> Worklist;
- Worklist.push_back(P);
- Visited.insert(P);
- do {
- P = Worklist.pop_back_val();
- for (const Use &U : P->uses()) {
- const User *Ur = U.getUser();
- if (isa<StoreInst>(Ur)) {
- if (U.getOperandNo() == 0)
- // The pointer is stored.
- return true;
-        // The pointer is being stored through, not stored itself.
- continue;
- }
- if (isa<CallInst>(Ur))
- // The pointer is passed as an argument, ignore this.
- continue;
- if (isa<PtrToIntInst>(P))
- // Assume the worst.
- return true;
- if (Visited.insert(Ur).second)
- Worklist.push_back(Ur);
- }
- } while (!Worklist.empty());
-
- // Everything checked out.
- return false;
-}
-
+ return true;
+ }
+
+ // All of the arms checked out.
+ return false;
+}
+
+/// Test if the value of P, or any value covered by its provenance, is ever
+/// stored within the function (not counting callees).
+static bool IsStoredObjCPointer(const Value *P) {
+ SmallPtrSet<const Value *, 8> Visited;
+ SmallVector<const Value *, 8> Worklist;
+ Worklist.push_back(P);
+ Visited.insert(P);
+ do {
+ P = Worklist.pop_back_val();
+ for (const Use &U : P->uses()) {
+ const User *Ur = U.getUser();
+ if (isa<StoreInst>(Ur)) {
+ if (U.getOperandNo() == 0)
+ // The pointer is stored.
+ return true;
+        // The pointer is being stored through, not stored itself.
+ continue;
+ }
+ if (isa<CallInst>(Ur))
+ // The pointer is passed as an argument, ignore this.
+ continue;
+ if (isa<PtrToIntInst>(P))
+ // Assume the worst.
+ return true;
+ if (Visited.insert(Ur).second)
+ Worklist.push_back(Ur);
+ }
+ } while (!Worklist.empty());
+
+ // Everything checked out.
+ return false;
+}
+
bool ProvenanceAnalysis::relatedCheck(const Value *A, const Value *B) {
- // Ask regular AliasAnalysis, for a first approximation.
- switch (AA->alias(A, B)) {
- case NoAlias:
- return false;
- case MustAlias:
- case PartialAlias:
- return true;
- case MayAlias:
- break;
- }
-
- bool AIsIdentified = IsObjCIdentifiedObject(A);
- bool BIsIdentified = IsObjCIdentifiedObject(B);
-
- // An ObjC-Identified object can't alias a load if it is never locally stored.
- if (AIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(B))
- return IsStoredObjCPointer(A);
- if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return IsStoredObjCPointer(B);
- // Both pointers are identified and escapes aren't an evident problem.
- return false;
- }
- } else if (BIsIdentified) {
- // Check for an obvious escape.
- if (isa<LoadInst>(A))
- return IsStoredObjCPointer(B);
- }
-
- // Special handling for PHI and Select.
- if (const PHINode *PN = dyn_cast<PHINode>(A))
- return relatedPHI(PN, B);
- if (const PHINode *PN = dyn_cast<PHINode>(B))
- return relatedPHI(PN, A);
- if (const SelectInst *S = dyn_cast<SelectInst>(A))
- return relatedSelect(S, B);
- if (const SelectInst *S = dyn_cast<SelectInst>(B))
- return relatedSelect(S, A);
-
- // Conservative.
- return true;
-}
-
+ // Ask regular AliasAnalysis, for a first approximation.
+ switch (AA->alias(A, B)) {
+ case NoAlias:
+ return false;
+ case MustAlias:
+ case PartialAlias:
+ return true;
+ case MayAlias:
+ break;
+ }
+
+ bool AIsIdentified = IsObjCIdentifiedObject(A);
+ bool BIsIdentified = IsObjCIdentifiedObject(B);
+
+ // An ObjC-Identified object can't alias a load if it is never locally stored.
+ if (AIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(B))
+ return IsStoredObjCPointer(A);
+ if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ // Both pointers are identified and escapes aren't an evident problem.
+ return false;
+ }
+ } else if (BIsIdentified) {
+ // Check for an obvious escape.
+ if (isa<LoadInst>(A))
+ return IsStoredObjCPointer(B);
+ }
+
+ // Special handling for PHI and Select.
+ if (const PHINode *PN = dyn_cast<PHINode>(A))
+ return relatedPHI(PN, B);
+ if (const PHINode *PN = dyn_cast<PHINode>(B))
+ return relatedPHI(PN, A);
+ if (const SelectInst *S = dyn_cast<SelectInst>(A))
+ return relatedSelect(S, B);
+ if (const SelectInst *S = dyn_cast<SelectInst>(B))
+ return relatedSelect(S, A);
+
+ // Conservative.
+ return true;
+}
+
bool ProvenanceAnalysis::related(const Value *A, const Value *B) {
A = GetUnderlyingObjCPtrCached(A, UnderlyingObjCPtrCache);
B = GetUnderlyingObjCPtrCached(B, UnderlyingObjCPtrCache);
-
- // Quick check.
- if (A == B)
- return true;
-
- // Begin by inserting a conservative value into the map. If the insertion
- // fails, we have the answer already. If it succeeds, leave it there until we
- // compute the real answer to guard against recursive queries.
- if (A > B) std::swap(A, B);
- std::pair<CachedResultsTy::iterator, bool> Pair =
- CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
- if (!Pair.second)
- return Pair.first->second;
-
+
+ // Quick check.
+ if (A == B)
+ return true;
+
+ // Begin by inserting a conservative value into the map. If the insertion
+ // fails, we have the answer already. If it succeeds, leave it there until we
+ // compute the real answer to guard against recursive queries.
+ if (A > B) std::swap(A, B);
+ std::pair<CachedResultsTy::iterator, bool> Pair =
+ CachedResults.insert(std::make_pair(ValuePairTy(A, B), true));
+ if (!Pair.second)
+ return Pair.first->second;
+
bool Result = relatedCheck(A, B);
- CachedResults[ValuePairTy(A, B)] = Result;
- return Result;
-}
+ CachedResults[ValuePairTy(A, B)] = Result;
+ return Result;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
index 0957031aa7..a63e356ce1 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysis.h
@@ -1,86 +1,86 @@
-//===- ProvenanceAnalysis.h - ObjC ARC Optimization -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// This file declares a special form of Alias Analysis called ``Provenance
-/// Analysis''. The word ``provenance'' refers to the history of the ownership
-/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
-/// use various techniques to determine if locally visible pointer values could
-/// share a provenance source and thus be related.
-///
-/// WARNING: This file knows about certain library functions. It recognizes them
-/// by name, and hardwires knowledge of their semantics.
-///
-/// WARNING: This file knows about how certain Objective-C library functions are
-/// used. Naive LLVM IR transformations which would otherwise be
-/// behavior-preserving may break these assumptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/ValueHandle.h"
-#include <utility>
-
-namespace llvm {
-
+//===- ProvenanceAnalysis.h - ObjC ARC Optimization -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// This file declares a special form of Alias Analysis called ``Provenance
+/// Analysis''. The word ``provenance'' refers to the history of the ownership
+/// of an object. Thus ``Provenance Analysis'' is an analysis which attempts to
+/// use various techniques to determine if locally visible pointer values could
+/// share a provenance source and thus be related.
+///
+/// WARNING: This file knows about certain library functions. It recognizes them
+/// by name, and hardwires knowledge of their semantics.
+///
+/// WARNING: This file knows about how certain Objective-C library functions are
+/// used. Naive LLVM IR transformations which would otherwise be
+/// behavior-preserving may break these assumptions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/ValueHandle.h"
+#include <utility>
+
+namespace llvm {
+
class AAResults;
-class DataLayout;
-class PHINode;
-class SelectInst;
-class Value;
-
-namespace objcarc {
-
-/// This is similar to BasicAliasAnalysis, and it uses many of the same
-/// techniques, except it uses special ObjC-specific reasoning about pointer
-/// relationships.
-///
-/// In this context ``Provenance'' is defined as the history of an object's
-/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
-/// an ``independent provenance source'' of a pointer to determine whether or
-/// not two pointers have the same provenance source and thus could
-/// potentially be related.
-class ProvenanceAnalysis {
+class DataLayout;
+class PHINode;
+class SelectInst;
+class Value;
+
+namespace objcarc {
+
+/// This is similar to BasicAliasAnalysis, and it uses many of the same
+/// techniques, except it uses special ObjC-specific reasoning about pointer
+/// relationships.
+///
+/// In this context ``Provenance'' is defined as the history of an object's
+/// ownership. Thus ``Provenance Analysis'' is defined by using the notion of
+/// an ``independent provenance source'' of a pointer to determine whether or
+/// not two pointers have the same provenance source and thus could
+/// potentially be related.
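+///
+/// A minimal usage sketch (illustrative; PtrA, PtrB and AA are placeholders):
+/// \code
+///   ProvenanceAnalysis PA;
+///   PA.setAA(&AA);                      // AA is an AAResults for the function
+///   bool MayBeRelated = PA.related(PtrA, PtrB);
+/// \endcode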
+class ProvenanceAnalysis {
AAResults *AA;
-
- using ValuePairTy = std::pair<const Value *, const Value *>;
- using CachedResultsTy = DenseMap<ValuePairTy, bool>;
-
- CachedResultsTy CachedResults;
-
- DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
-
+
+ using ValuePairTy = std::pair<const Value *, const Value *>;
+ using CachedResultsTy = DenseMap<ValuePairTy, bool>;
+
+ CachedResultsTy CachedResults;
+
+ DenseMap<const Value *, WeakTrackingVH> UnderlyingObjCPtrCache;
+
bool relatedCheck(const Value *A, const Value *B);
- bool relatedSelect(const SelectInst *A, const Value *B);
- bool relatedPHI(const PHINode *A, const Value *B);
-
-public:
- ProvenanceAnalysis() = default;
- ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
- ProvenanceAnalysis &operator=(const ProvenanceAnalysis &) = delete;
-
+ bool relatedSelect(const SelectInst *A, const Value *B);
+ bool relatedPHI(const PHINode *A, const Value *B);
+
+public:
+ ProvenanceAnalysis() = default;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) = delete;
+ ProvenanceAnalysis &operator=(const ProvenanceAnalysis &) = delete;
+
void setAA(AAResults *aa) { AA = aa; }
-
+
AAResults *getAA() const { return AA; }
-
+
bool related(const Value *A, const Value *B);
-
- void clear() {
- CachedResults.clear();
- UnderlyingObjCPtrCache.clear();
- }
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
+
+ void clear() {
+ CachedResults.clear();
+ UnderlyingObjCPtrCache.clear();
+ }
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PROVENANCEANALYSIS_H
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
index fff773908d..6fdfe787d4 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ProvenanceAnalysisEvaluator.cpp
@@ -1,94 +1,94 @@
-//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "ProvenanceAnalysis.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
+//===- ProvenanceAnalysisEvaluator.cpp - ObjC ARC Optimization ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "ProvenanceAnalysis.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-namespace {
-class PAEval : public FunctionPass {
-
-public:
- static char ID;
- PAEval();
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-};
-}
-
-char PAEval::ID = 0;
-PAEval::PAEval() : FunctionPass(ID) {}
-
-void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<AAResultsWrapperPass>();
-}
-
-static StringRef getName(Value *V) {
- StringRef Name = V->getName();
- if (Name.startswith("\1"))
- return Name.substr(1);
- return Name;
-}
-
-static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
- if (!V->hasName())
- return;
- Values.insert(V);
-}
-
-bool PAEval::runOnFunction(Function &F) {
- SetVector<Value *> Values;
-
- for (auto &Arg : F.args())
- insertIfNamed(Values, &Arg);
-
- for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
- insertIfNamed(Values, &*I);
-
- for (auto &Op : I->operands())
- insertIfNamed(Values, Op);
- }
-
- ProvenanceAnalysis PA;
- PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
-
- for (Value *V1 : Values) {
- StringRef NameV1 = getName(V1);
- for (Value *V2 : Values) {
- StringRef NameV2 = getName(V2);
- if (NameV1 >= NameV2)
- continue;
- errs() << NameV1 << " and " << NameV2;
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+namespace {
+class PAEval : public FunctionPass {
+
+public:
+ static char ID;
+ PAEval();
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char PAEval::ID = 0;
+PAEval::PAEval() : FunctionPass(ID) {}
+
+void PAEval::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+}
+
+static StringRef getName(Value *V) {
+ StringRef Name = V->getName();
+ if (Name.startswith("\1"))
+ return Name.substr(1);
+ return Name;
+}
+
+static void insertIfNamed(SetVector<Value *> &Values, Value *V) {
+ if (!V->hasName())
+ return;
+ Values.insert(V);
+}
+
+bool PAEval::runOnFunction(Function &F) {
+ SetVector<Value *> Values;
+
+ for (auto &Arg : F.args())
+ insertIfNamed(Values, &Arg);
+
+ for (auto I = inst_begin(F), E = inst_end(F); I != E; ++I) {
+ insertIfNamed(Values, &*I);
+
+ for (auto &Op : I->operands())
+ insertIfNamed(Values, Op);
+ }
+
+ ProvenanceAnalysis PA;
+ PA.setAA(&getAnalysis<AAResultsWrapperPass>().getAAResults());
+
+ for (Value *V1 : Values) {
+ StringRef NameV1 = getName(V1);
+ for (Value *V2 : Values) {
+ StringRef NameV2 = getName(V2);
+ if (NameV1 >= NameV2)
+ continue;
+ errs() << NameV1 << " and " << NameV2;
if (PA.related(V1, V2))
- errs() << " are related.\n";
- else
- errs() << " are not related.\n";
- }
- }
-
- return false;
-}
-
-FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
-
-INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
- "Evaluate ProvenanceAnalysis on all pairs", false, true)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(PAEval, "pa-eval",
- "Evaluate ProvenanceAnalysis on all pairs", false, true)
+ errs() << " are related.\n";
+ else
+ errs() << " are not related.\n";
+ }
+ }
+
+ return false;
+}
+
+FunctionPass *llvm::createPAEvalPass() { return new PAEval(); }
+
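+// The pass is registered under the name "pa-eval", so it can be run directly,
+// e.g. (illustrative): opt -pa-eval -disable-output input.ll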
+INITIALIZE_PASS_BEGIN(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(PAEval, "pa-eval",
+ "Evaluate ProvenanceAnalysis on all pairs", false, true)
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
index 513be76e6c..6071ec3e4d 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.cpp
@@ -1,436 +1,436 @@
-//===- PtrState.cpp -------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "PtrState.h"
-#include "DependencyAnalysis.h"
-#include "ObjCARC.h"
-#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::objcarc;
-
-#define DEBUG_TYPE "objc-arc-ptr-state"
-
-//===----------------------------------------------------------------------===//
-// Utility
-//===----------------------------------------------------------------------===//
-
-raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
- switch (S) {
- case S_None:
- return OS << "S_None";
- case S_Retain:
- return OS << "S_Retain";
- case S_CanRelease:
- return OS << "S_CanRelease";
- case S_Use:
- return OS << "S_Use";
- case S_Release:
- return OS << "S_Release";
- case S_MovableRelease:
- return OS << "S_MovableRelease";
- case S_Stop:
- return OS << "S_Stop";
- }
- llvm_unreachable("Unknown sequence type.");
-}
-
-//===----------------------------------------------------------------------===//
-// Sequence
-//===----------------------------------------------------------------------===//
-
-static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
- // The easy cases.
- if (A == B)
- return A;
- if (A == S_None || B == S_None)
- return S_None;
-
- if (A > B)
- std::swap(A, B);
- if (TopDown) {
- // Choose the side which is further along in the sequence.
- if ((A == S_Retain || A == S_CanRelease) &&
- (B == S_CanRelease || B == S_Use))
- return B;
- } else {
- // Choose the side which is further along in the sequence.
- if ((A == S_Use || A == S_CanRelease) &&
- (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
- return A;
- // If both sides are releases, choose the more conservative one.
- if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
- return A;
- if (A == S_Release && B == S_MovableRelease)
- return A;
- }
-
- return S_None;
-}
-
-//===----------------------------------------------------------------------===//
-// RRInfo
-//===----------------------------------------------------------------------===//
-
-void RRInfo::clear() {
- KnownSafe = false;
- IsTailCallRelease = false;
- ReleaseMetadata = nullptr;
- Calls.clear();
- ReverseInsertPts.clear();
- CFGHazardAfflicted = false;
-}
-
-bool RRInfo::Merge(const RRInfo &Other) {
- // Conservatively merge the ReleaseMetadata information.
- if (ReleaseMetadata != Other.ReleaseMetadata)
- ReleaseMetadata = nullptr;
-
- // Conservatively merge the boolean state.
- KnownSafe &= Other.KnownSafe;
- IsTailCallRelease &= Other.IsTailCallRelease;
- CFGHazardAfflicted |= Other.CFGHazardAfflicted;
-
- // Merge the call sets.
- Calls.insert(Other.Calls.begin(), Other.Calls.end());
-
- // Merge the insert point sets. If there are any differences,
- // that makes this a partial merge.
- bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
- for (Instruction *Inst : Other.ReverseInsertPts)
- Partial |= ReverseInsertPts.insert(Inst).second;
- return Partial;
-}
-
-//===----------------------------------------------------------------------===//
-// PtrState
-//===----------------------------------------------------------------------===//
-
-void PtrState::SetKnownPositiveRefCount() {
- LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
- KnownPositiveRefCount = true;
-}
-
-void PtrState::ClearKnownPositiveRefCount() {
- LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
- KnownPositiveRefCount = false;
-}
-
-void PtrState::SetSeq(Sequence NewSeq) {
- LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
- << "\n");
- Seq = NewSeq;
-}
-
-void PtrState::ResetSequenceProgress(Sequence NewSeq) {
- LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
- SetSeq(NewSeq);
- Partial = false;
- RRI.clear();
-}
-
-void PtrState::Merge(const PtrState &Other, bool TopDown) {
- Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
- KnownPositiveRefCount &= Other.KnownPositiveRefCount;
-
- // If we're not in a sequence (anymore), drop all associated state.
- if (Seq == S_None) {
- Partial = false;
- RRI.clear();
- } else if (Partial || Other.Partial) {
- // If we're doing a merge on a path that's previously seen a partial
- // merge, conservatively drop the sequence, to avoid doing partial
-    // RR elimination. If the branch predicates for the two merges differ,
- // mixing them is unsafe.
- ClearSequenceProgress();
- } else {
- // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
- // point, we know that currently we are not partial. Stash whether or not
- // the merge operation caused us to undergo a partial merging of reverse
- // insertion points.
- Partial = RRI.Merge(Other.RRI);
- }
-}
-
-//===----------------------------------------------------------------------===//
-// BottomUpPtrState
-//===----------------------------------------------------------------------===//
-
-bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
-  // If we see two releases in a row on the same pointer, make
-  // a note, and we'll circle back to revisit it after we've
- // hopefully eliminated the second release, which may allow us to
- // eliminate the first release too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- bool NestingDetected = false;
- if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
- LLVM_DEBUG(
- dbgs() << " Found nested releases (i.e. a release pair)\n");
- NestingDetected = true;
- }
-
- MDNode *ReleaseMetadata =
- I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
- Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
- ResetSequenceProgress(NewSeq);
- SetReleaseMetadata(ReleaseMetadata);
- SetKnownSafe(HasKnownPositiveRefCount());
- SetTailCallRelease(cast<CallInst>(I)->isTailCall());
- InsertCall(I);
- SetKnownPositiveRefCount();
- return NestingDetected;
-}
-
-bool BottomUpPtrState::MatchWithRetain() {
- SetKnownPositiveRefCount();
-
- Sequence OldSeq = GetSeq();
- switch (OldSeq) {
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- case S_Use:
- // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
- // imprecise release, clear our reverse insertion points.
- if (OldSeq != S_Use || IsTrackingImpreciseReleases())
- ClearReverseInsertPts();
- LLVM_FALLTHROUGH;
- case S_CanRelease:
- return true;
- case S_None:
- return false;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- Sequence S = GetSeq();
-
- // Check for possible releases.
+//===- PtrState.cpp -------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "PtrState.h"
+#include "DependencyAnalysis.h"
+#include "ObjCARC.h"
+#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::objcarc;
+
+#define DEBUG_TYPE "objc-arc-ptr-state"
+
+//===----------------------------------------------------------------------===//
+// Utility
+//===----------------------------------------------------------------------===//
+
+raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, const Sequence S) {
+ switch (S) {
+ case S_None:
+ return OS << "S_None";
+ case S_Retain:
+ return OS << "S_Retain";
+ case S_CanRelease:
+ return OS << "S_CanRelease";
+ case S_Use:
+ return OS << "S_Use";
+ case S_Release:
+ return OS << "S_Release";
+ case S_MovableRelease:
+ return OS << "S_MovableRelease";
+ case S_Stop:
+ return OS << "S_Stop";
+ }
+ llvm_unreachable("Unknown sequence type.");
+}
+
+//===----------------------------------------------------------------------===//
+// Sequence
+//===----------------------------------------------------------------------===//
+
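+/// Merge two sequence states conservatively. For example (illustrative),
+/// merging S_Retain with S_CanRelease while walking top-down yields
+/// S_CanRelease, the state that is further along; states that cannot be
+/// reconciled merge to S_None.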
+static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) {
+ // The easy cases.
+ if (A == B)
+ return A;
+ if (A == S_None || B == S_None)
+ return S_None;
+
+ if (A > B)
+ std::swap(A, B);
+ if (TopDown) {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Retain || A == S_CanRelease) &&
+ (B == S_CanRelease || B == S_Use))
+ return B;
+ } else {
+ // Choose the side which is further along in the sequence.
+ if ((A == S_Use || A == S_CanRelease) &&
+ (B == S_Use || B == S_Release || B == S_Stop || B == S_MovableRelease))
+ return A;
+ // If both sides are releases, choose the more conservative one.
+ if (A == S_Stop && (B == S_Release || B == S_MovableRelease))
+ return A;
+ if (A == S_Release && B == S_MovableRelease)
+ return A;
+ }
+
+ return S_None;
+}
+
+//===----------------------------------------------------------------------===//
+// RRInfo
+//===----------------------------------------------------------------------===//
+
+void RRInfo::clear() {
+ KnownSafe = false;
+ IsTailCallRelease = false;
+ ReleaseMetadata = nullptr;
+ Calls.clear();
+ ReverseInsertPts.clear();
+ CFGHazardAfflicted = false;
+}
+
+bool RRInfo::Merge(const RRInfo &Other) {
+ // Conservatively merge the ReleaseMetadata information.
+ if (ReleaseMetadata != Other.ReleaseMetadata)
+ ReleaseMetadata = nullptr;
+
+ // Conservatively merge the boolean state.
+ KnownSafe &= Other.KnownSafe;
+ IsTailCallRelease &= Other.IsTailCallRelease;
+ CFGHazardAfflicted |= Other.CFGHazardAfflicted;
+
+ // Merge the call sets.
+ Calls.insert(Other.Calls.begin(), Other.Calls.end());
+
+ // Merge the insert point sets. If there are any differences,
+ // that makes this a partial merge.
+ bool Partial = ReverseInsertPts.size() != Other.ReverseInsertPts.size();
+ for (Instruction *Inst : Other.ReverseInsertPts)
+ Partial |= ReverseInsertPts.insert(Inst).second;
+ return Partial;
+}
+
+//===----------------------------------------------------------------------===//
+// PtrState
+//===----------------------------------------------------------------------===//
+
+void PtrState::SetKnownPositiveRefCount() {
+ LLVM_DEBUG(dbgs() << " Setting Known Positive.\n");
+ KnownPositiveRefCount = true;
+}
+
+void PtrState::ClearKnownPositiveRefCount() {
+ LLVM_DEBUG(dbgs() << " Clearing Known Positive.\n");
+ KnownPositiveRefCount = false;
+}
+
+void PtrState::SetSeq(Sequence NewSeq) {
+ LLVM_DEBUG(dbgs() << " Old: " << GetSeq() << "; New: " << NewSeq
+ << "\n");
+ Seq = NewSeq;
+}
+
+void PtrState::ResetSequenceProgress(Sequence NewSeq) {
+ LLVM_DEBUG(dbgs() << " Resetting sequence progress.\n");
+ SetSeq(NewSeq);
+ Partial = false;
+ RRI.clear();
+}
+
+void PtrState::Merge(const PtrState &Other, bool TopDown) {
+ Seq = MergeSeqs(GetSeq(), Other.GetSeq(), TopDown);
+ KnownPositiveRefCount &= Other.KnownPositiveRefCount;
+
+ // If we're not in a sequence (anymore), drop all associated state.
+ if (Seq == S_None) {
+ Partial = false;
+ RRI.clear();
+ } else if (Partial || Other.Partial) {
+ // If we're doing a merge on a path that's previously seen a partial
+ // merge, conservatively drop the sequence, to avoid doing partial
+    // RR elimination. If the branch predicates for the two merges differ,
+ // mixing them is unsafe.
+ ClearSequenceProgress();
+ } else {
+ // Otherwise merge the other PtrState's RRInfo into our RRInfo. At this
+ // point, we know that currently we are not partial. Stash whether or not
+ // the merge operation caused us to undergo a partial merging of reverse
+ // insertion points.
+ Partial = RRI.Merge(Other.RRI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BottomUpPtrState
+//===----------------------------------------------------------------------===//
+
+bool BottomUpPtrState::InitBottomUp(ARCMDKindCache &Cache, Instruction *I) {
+  // If we see two releases in a row on the same pointer, make
+  // a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second release, which may allow us to
+ // eliminate the first release too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ bool NestingDetected = false;
+ if (GetSeq() == S_Release || GetSeq() == S_MovableRelease) {
+ LLVM_DEBUG(
+ dbgs() << " Found nested releases (i.e. a release pair)\n");
+ NestingDetected = true;
+ }
+
+ MDNode *ReleaseMetadata =
+ I->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+ Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release;
+ ResetSequenceProgress(NewSeq);
+ SetReleaseMetadata(ReleaseMetadata);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ SetTailCallRelease(cast<CallInst>(I)->isTailCall());
+ InsertCall(I);
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool BottomUpPtrState::MatchWithRetain() {
+ SetKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+ switch (OldSeq) {
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Use:
+ // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an
+ // imprecise release, clear our reverse insertion points.
+ if (OldSeq != S_Use || IsTrackingImpreciseReleases())
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_CanRelease:
+ return true;
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool BottomUpPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ Sequence S = GetSeq();
+
+ // Check for possible releases.
if (!CanDecrementRefCount(Inst, Ptr, PA, Class))
- return false;
-
- LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
- << *Ptr << "\n");
- switch (S) {
- case S_Use:
- SetSeq(S_CanRelease);
- return true;
- case S_CanRelease:
- case S_Release:
- case S_MovableRelease:
- case S_Stop:
- case S_None:
- return false;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
- assert(!HasReverseInsertPts());
- SetSeq(NewSeq);
- // If this is an invoke instruction, we're scanning it as part of
- // one of its successor blocks, since we can't insert code after it
- // in its own block, and we don't want to split critical edges.
- BasicBlock::iterator InsertAfter;
- if (isa<InvokeInst>(Inst)) {
- const auto IP = BB->getFirstInsertionPt();
- InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
- if (isa<CatchSwitchInst>(InsertAfter))
- // A catchswitch must be the only non-phi instruction in its basic
- // block, so attempting to insert an instruction into such a block would
- // produce invalid IR.
- SetCFGHazardAfflicted(true);
- } else {
- InsertAfter = std::next(Inst->getIterator());
- }
-
- if (InsertAfter != BB->end())
- InsertAfter = skipDebugIntrinsics(InsertAfter);
-
- InsertReverseInsertPt(&*InsertAfter);
- };
-
- // Check for possible direct uses.
- switch (GetSeq()) {
- case S_Release:
- case S_MovableRelease:
- if (CanUse(Inst, Ptr, PA, Class)) {
- LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeqAndInsertReverseInsertPt(S_Use);
- } else if (Seq == S_Release && IsUser(Class)) {
- LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
- << "; " << *Ptr << "\n");
- // Non-movable releases depend on any possible objc pointer use.
- SetSeqAndInsertReverseInsertPt(S_Stop);
- } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
- if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
- LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeqAndInsertReverseInsertPt(S_Stop);
- }
- }
- break;
- case S_Stop:
- if (CanUse(Inst, Ptr, PA, Class)) {
- LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
- << "; " << *Ptr << "\n");
- SetSeq(S_Use);
- }
- break;
- case S_CanRelease:
- case S_Use:
- case S_None:
- break;
- case S_Retain:
- llvm_unreachable("bottom-up pointer in retain state!");
- }
-}
-
-//===----------------------------------------------------------------------===//
-// TopDownPtrState
-//===----------------------------------------------------------------------===//
-
-bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
- bool NestingDetected = false;
-  // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
-  // better to let it remain as the first instruction after a call.
- if (Kind != ARCInstKind::RetainRV) {
-    // Check whether we see two retains in a row on the same pointer. If so,
-    // make a note, and we'll circle back to revisit it after we've
- // hopefully eliminated the second retain, which may allow us to
- // eliminate the first retain too.
- // Theoretically we could implement removal of nested retain+release
- // pairs by making PtrState hold a stack of states, but this is
- // simple and avoids adding overhead for the non-nested case.
- if (GetSeq() == S_Retain)
- NestingDetected = true;
-
- ResetSequenceProgress(S_Retain);
- SetKnownSafe(HasKnownPositiveRefCount());
- InsertCall(I);
- }
-
- SetKnownPositiveRefCount();
- return NestingDetected;
-}
-
-bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
- Instruction *Release) {
- ClearKnownPositiveRefCount();
-
- Sequence OldSeq = GetSeq();
-
- MDNode *ReleaseMetadata =
- Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
-
- switch (OldSeq) {
- case S_Retain:
- case S_CanRelease:
- if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
- ClearReverseInsertPts();
- LLVM_FALLTHROUGH;
- case S_Use:
- SetReleaseMetadata(ReleaseMetadata);
- SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
- return true;
- case S_None:
- return false;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in bottom up state!");
- }
- llvm_unreachable("Sequence unknown enum value");
-}
-
-bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
- const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // Check for possible releases. Treat clang.arc.use as a releasing instruction
- // to prevent sinking a retain past it.
+ return false;
+
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << S << "; "
+ << *Ptr << "\n");
+ switch (S) {
+ case S_Use:
+ SetSeq(S_CanRelease);
+ return true;
+ case S_CanRelease:
+ case S_Release:
+ case S_MovableRelease:
+ case S_Stop:
+ case S_None:
+ return false;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+void BottomUpPtrState::HandlePotentialUse(BasicBlock *BB, Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ auto SetSeqAndInsertReverseInsertPt = [&](Sequence NewSeq){
+ assert(!HasReverseInsertPts());
+ SetSeq(NewSeq);
+ // If this is an invoke instruction, we're scanning it as part of
+ // one of its successor blocks, since we can't insert code after it
+ // in its own block, and we don't want to split critical edges.
+ BasicBlock::iterator InsertAfter;
+ if (isa<InvokeInst>(Inst)) {
+ const auto IP = BB->getFirstInsertionPt();
+ InsertAfter = IP == BB->end() ? std::prev(BB->end()) : IP;
+ if (isa<CatchSwitchInst>(InsertAfter))
+ // A catchswitch must be the only non-phi instruction in its basic
+ // block, so attempting to insert an instruction into such a block would
+ // produce invalid IR.
+ SetCFGHazardAfflicted(true);
+ } else {
+ InsertAfter = std::next(Inst->getIterator());
+ }
+
+ if (InsertAfter != BB->end())
+ InsertAfter = skipDebugIntrinsics(InsertAfter);
+
+ InsertReverseInsertPt(&*InsertAfter);
+ };
+
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_Release:
+ case S_MovableRelease:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Use);
+ } else if (Seq == S_Release && IsUser(Class)) {
+ LLVM_DEBUG(dbgs() << " PreciseReleaseUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
+ // Non-movable releases depend on any possible objc pointer use.
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ } else if (const auto *Call = getreturnRVOperand(*Inst, Class)) {
+ if (CanUse(Call, Ptr, PA, GetBasicARCInstKind(Call))) {
+ LLVM_DEBUG(dbgs() << " ReleaseUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeqAndInsertReverseInsertPt(S_Stop);
+ }
+ }
+ break;
+ case S_Stop:
+ if (CanUse(Inst, Ptr, PA, Class)) {
+ LLVM_DEBUG(dbgs() << " PreciseStopUse: Seq: " << GetSeq()
+ << "; " << *Ptr << "\n");
+ SetSeq(S_Use);
+ }
+ break;
+ case S_CanRelease:
+ case S_Use:
+ case S_None:
+ break;
+ case S_Retain:
+ llvm_unreachable("bottom-up pointer in retain state!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// TopDownPtrState
+//===----------------------------------------------------------------------===//
+
+bool TopDownPtrState::InitTopDown(ARCInstKind Kind, Instruction *I) {
+ bool NestingDetected = false;
+  // Don't do retain+release tracking for ARCInstKind::RetainRV, because it's
+  // better to let it remain as the first instruction after a call.
+ if (Kind != ARCInstKind::RetainRV) {
+    // Check whether we see two retains in a row on the same pointer. If so,
+    // make a note, and we'll circle back to revisit it after we've
+ // hopefully eliminated the second retain, which may allow us to
+ // eliminate the first retain too.
+ // Theoretically we could implement removal of nested retain+release
+ // pairs by making PtrState hold a stack of states, but this is
+ // simple and avoids adding overhead for the non-nested case.
+ if (GetSeq() == S_Retain)
+ NestingDetected = true;
+
+ ResetSequenceProgress(S_Retain);
+ SetKnownSafe(HasKnownPositiveRefCount());
+ InsertCall(I);
+ }
+
+ SetKnownPositiveRefCount();
+ return NestingDetected;
+}
+
+bool TopDownPtrState::MatchWithRelease(ARCMDKindCache &Cache,
+ Instruction *Release) {
+ ClearKnownPositiveRefCount();
+
+ Sequence OldSeq = GetSeq();
+
+ MDNode *ReleaseMetadata =
+ Release->getMetadata(Cache.get(ARCMDKindID::ImpreciseRelease));
+
+ switch (OldSeq) {
+ case S_Retain:
+ case S_CanRelease:
+ if (OldSeq == S_Retain || ReleaseMetadata != nullptr)
+ ClearReverseInsertPts();
+ LLVM_FALLTHROUGH;
+ case S_Use:
+ SetReleaseMetadata(ReleaseMetadata);
+ SetTailCallRelease(cast<CallInst>(Release)->isTailCall());
+ return true;
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in bottom up state!");
+ }
+ llvm_unreachable("Sequence unknown enum value");
+}
+
+bool TopDownPtrState::HandlePotentialAlterRefCount(Instruction *Inst,
+ const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible releases. Treat clang.arc.use as a releasing instruction
+ // to prevent sinking a retain past it.
if (!CanDecrementRefCount(Inst, Ptr, PA, Class) &&
- Class != ARCInstKind::IntrinsicUser)
- return false;
-
- LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- ClearKnownPositiveRefCount();
- switch (GetSeq()) {
- case S_Retain:
- SetSeq(S_CanRelease);
- assert(!HasReverseInsertPts());
- InsertReverseInsertPt(Inst);
-
- // One call can't cause a transition from S_Retain to S_CanRelease
- // and S_CanRelease to S_Use. If we've made the first transition,
- // we're done.
- return true;
- case S_Use:
- case S_CanRelease:
- case S_None:
- return false;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
- llvm_unreachable("covered switch is not covered!?");
-}
-
-void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA,
- ARCInstKind Class) {
- // Check for possible direct uses.
- switch (GetSeq()) {
- case S_CanRelease:
- if (!CanUse(Inst, Ptr, PA, Class))
- return;
- LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
- << *Ptr << "\n");
- SetSeq(S_Use);
- return;
- case S_Retain:
- case S_Use:
- case S_None:
- return;
- case S_Stop:
- case S_Release:
- case S_MovableRelease:
- llvm_unreachable("top-down pointer in release state!");
- }
-}
+ Class != ARCInstKind::IntrinsicUser)
+ return false;
+
+ LLVM_DEBUG(dbgs() << " CanAlterRefCount: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ ClearKnownPositiveRefCount();
+ switch (GetSeq()) {
+ case S_Retain:
+ SetSeq(S_CanRelease);
+ assert(!HasReverseInsertPts());
+ InsertReverseInsertPt(Inst);
+
+ // One call can't cause a transition from S_Retain to S_CanRelease
+ // and S_CanRelease to S_Use. If we've made the first transition,
+ // we're done.
+ return true;
+ case S_Use:
+ case S_CanRelease:
+ case S_None:
+ return false;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+ llvm_unreachable("covered switch is not covered!?");
+}
+
+void TopDownPtrState::HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA,
+ ARCInstKind Class) {
+ // Check for possible direct uses.
+ switch (GetSeq()) {
+ case S_CanRelease:
+ if (!CanUse(Inst, Ptr, PA, Class))
+ return;
+ LLVM_DEBUG(dbgs() << " CanUse: Seq: " << GetSeq() << "; "
+ << *Ptr << "\n");
+ SetSeq(S_Use);
+ return;
+ case S_Retain:
+ case S_Use:
+ case S_None:
+ return;
+ case S_Stop:
+ case S_Release:
+ case S_MovableRelease:
+ llvm_unreachable("top-down pointer in release state!");
+ }
+}
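
The bottom-up and top-down walks restored above are, at heart, small per-pointer state machines over the Sequence values declared in PtrState.h: in the bottom-up direction a release starts tracking (S_Release or S_MovableRelease), a use of the pointer advances to S_Use (or S_Stop for a precise release), a possible reference-count decrement advances to S_CanRelease, and a matching retain closes the candidate pair. A minimal standalone sketch of just the bottom-up flow follows; the Sequence names mirror the header, but the Event type and the driver are invented purely for illustration and are not part of this patch.

// Minimal, self-contained illustration of the bottom-up Sequence walk.
// Only the Sequence names correspond to PtrState.h; Event and the driver
// below are hypothetical.
#include <cstdio>

enum Sequence { S_None, S_Retain, S_CanRelease, S_Use, S_Stop, S_Release, S_MovableRelease };

enum class Event { Release, ImpreciseRelease, Use, MayDecrementRefCount, Retain };

struct ToyBottomUpState {
  Sequence Seq = S_None;
  bool PairFound = false;

  void visit(Event E) {
    switch (E) {
    case Event::Release:          Seq = S_Release;        break; // start tracking a precise release
    case Event::ImpreciseRelease: Seq = S_MovableRelease; break; // !clang.imprecise_release
    case Event::Use:              // the pointer is read or passed somewhere
      if (Seq == S_Release || Seq == S_MovableRelease || Seq == S_Stop)
        Seq = S_Use;
      break;
    case Event::MayDecrementRefCount: // a call that might drop the ref count
      if (Seq == S_Use)
        Seq = S_CanRelease;
      break;
    case Event::Retain:           // mirrors MatchWithRetain(): close the pair
      PairFound = (Seq != S_None);
      Seq = S_None;
      break;
    }
  }
};

int main() {
  // Bottom-up over "retain p; use(p); release p": release first, retain last.
  const Event Events[] = {Event::Release, Event::Use, Event::Retain};
  ToyBottomUpState St;
  for (Event E : Events)
    St.visit(E);
  std::printf("retain/release pair candidate: %s\n", St.PairFound ? "yes" : "no");
}

The real pass layers provenance analysis, known-safety tracking, and CFG merging on top of this skeleton; the sketch only shows the sequence transitions themselves.
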
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
index e6856ba615..66614c06cb 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/PtrState.h
@@ -1,212 +1,212 @@
-//===- PtrState.h - ARC State for a Ptr -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains declarations for the ARC state associated with a ptr. It
-// is only used by the ARC Sequence Dataflow computation. By separating this
-// from the actual dataflow, it is easier to consider the mechanics of the ARC
-// optimization separate from the actual predicates being used.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
-#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class BasicBlock;
-class Instruction;
-class MDNode;
-class raw_ostream;
-class Value;
-
-namespace objcarc {
-
-class ARCMDKindCache;
-class ProvenanceAnalysis;
-
-/// \enum Sequence
-///
-/// A sequence of states that a pointer may go through in which an
-/// objc_retain and objc_release are actually needed.
-enum Sequence {
- S_None,
- S_Retain, ///< objc_retain(x).
- S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
- S_Use, ///< any use of x.
- S_Stop, ///< like S_Release, but code motion is stopped.
- S_Release, ///< objc_release(x).
- S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
-};
-
-raw_ostream &operator<<(raw_ostream &OS,
- const Sequence S) LLVM_ATTRIBUTE_UNUSED;
-
-/// Unidirectional information about either a
-/// retain-decrement-use-release sequence or release-use-decrement-retain
-/// reverse sequence.
-struct RRInfo {
- /// After an objc_retain, the reference count of the referenced
- /// object is known to be positive. Similarly, before an objc_release, the
- /// reference count of the referenced object is known to be positive. If
- /// there are retain-release pairs in code regions where the retain count
- /// is known to be positive, they can be eliminated, regardless of any side
- /// effects between them.
- ///
- /// Also, a retain+release pair nested within another retain+release
-  /// pair, all on the same known pointer value, can be eliminated, regardless
- /// of any intervening side effects.
- ///
- /// KnownSafe is true when either of these conditions is satisfied.
- bool KnownSafe = false;
-
-  /// True if the objc_release calls are all marked with the "tail" keyword.
- bool IsTailCallRelease = false;
-
- /// If the Calls are objc_release calls and they all have a
- /// clang.imprecise_release tag, this is the metadata tag.
- MDNode *ReleaseMetadata = nullptr;
-
- /// For a top-down sequence, the set of objc_retains or
- /// objc_retainBlocks. For bottom-up, the set of objc_releases.
- SmallPtrSet<Instruction *, 2> Calls;
-
- /// The set of optimal insert positions for moving calls in the opposite
- /// sequence.
- SmallPtrSet<Instruction *, 2> ReverseInsertPts;
-
- /// If this is true, we cannot perform code motion but can still remove
- /// retain/release pairs.
- bool CFGHazardAfflicted = false;
-
- RRInfo() = default;
-
- void clear();
-
- /// Conservatively merge the two RRInfo. Returns true if a partial merge has
- /// occurred, false otherwise.
- bool Merge(const RRInfo &Other);
-};
-
-/// This class summarizes several per-pointer runtime properties which
-/// are propagated through the flow graph.
-class PtrState {
-protected:
- /// True if the reference count is known to be incremented.
- bool KnownPositiveRefCount = false;
-
- /// True if we've seen an opportunity for partial RR elimination, such as
- /// pushing calls into a CFG triangle or into one side of a CFG diamond.
- bool Partial = false;
-
- /// The current position in the sequence.
- unsigned char Seq : 8;
-
- /// Unidirectional information about the current sequence.
- RRInfo RRI;
-
- PtrState() : Seq(S_None) {}
-
-public:
- bool IsKnownSafe() const { return RRI.KnownSafe; }
-
- void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
-
- bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
-
- void SetTailCallRelease(const bool NewValue) {
- RRI.IsTailCallRelease = NewValue;
- }
-
- bool IsTrackingImpreciseReleases() const {
- return RRI.ReleaseMetadata != nullptr;
- }
-
- const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
-
- void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
-
- bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
-
- void SetCFGHazardAfflicted(const bool NewValue) {
- RRI.CFGHazardAfflicted = NewValue;
- }
-
- void SetKnownPositiveRefCount();
- void ClearKnownPositiveRefCount();
-
- bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
-
- void SetSeq(Sequence NewSeq);
-
- Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
-
- void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
-
- void ResetSequenceProgress(Sequence NewSeq);
- void Merge(const PtrState &Other, bool TopDown);
-
- void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
-
- void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
-
- void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
-
- bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
-
- const RRInfo &GetRRInfo() const { return RRI; }
-};
-
-struct BottomUpPtrState : PtrState {
- BottomUpPtrState() = default;
-
- /// (Re-)Initialize this bottom up pointer returning true if we detected a
- /// pointer with nested releases.
- bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
-
-  /// Return true if this set of releases can be paired with a retain. Modifies
- /// state appropriately to reflect that the matching occurred if it is
- /// successful.
- ///
- /// It is assumed that one has already checked that the RCIdentity of the
- /// retain and the RCIdentity of this ptr state are the same.
- bool MatchWithRetain();
-
- void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
- bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-};
-
-struct TopDownPtrState : PtrState {
- TopDownPtrState() = default;
-
-  /// (Re-)Initialize this top down pointer, returning true if we detected a
-  /// pointer with nested retains.
- bool InitTopDown(ARCInstKind Kind, Instruction *I);
-
- /// Return true if this set of retains can be paired with the given
- /// release. Modifies state appropriately to reflect that the matching
- /// occurred.
- bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
-
- void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-
- bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
- ProvenanceAnalysis &PA, ARCInstKind Class);
-};
-
-} // end namespace objcarc
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+//===- PtrState.h - ARC State for a Ptr -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains declarations for the ARC state associated with a ptr. It
+// is only used by the ARC Sequence Dataflow computation. By separating this
+// from the actual dataflow, it is easier to consider the mechanics of the ARC
+// optimization separate from the actual predicates being used.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+#define LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ObjCARCInstKind.h"
+#include "llvm/Support/Compiler.h"
+
+namespace llvm {
+
+class BasicBlock;
+class Instruction;
+class MDNode;
+class raw_ostream;
+class Value;
+
+namespace objcarc {
+
+class ARCMDKindCache;
+class ProvenanceAnalysis;
+
+/// \enum Sequence
+///
+/// A sequence of states that a pointer may go through in which an
+/// objc_retain and objc_release are actually needed.
+enum Sequence {
+ S_None,
+ S_Retain, ///< objc_retain(x).
+ S_CanRelease, ///< foo(x) -- x could possibly see a ref count decrement.
+ S_Use, ///< any use of x.
+ S_Stop, ///< like S_Release, but code motion is stopped.
+ S_Release, ///< objc_release(x).
+ S_MovableRelease ///< objc_release(x), !clang.imprecise_release.
+};
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const Sequence S) LLVM_ATTRIBUTE_UNUSED;
+
+/// Unidirectional information about either a
+/// retain-decrement-use-release sequence or release-use-decrement-retain
+/// reverse sequence.
+struct RRInfo {
+ /// After an objc_retain, the reference count of the referenced
+ /// object is known to be positive. Similarly, before an objc_release, the
+ /// reference count of the referenced object is known to be positive. If
+ /// there are retain-release pairs in code regions where the retain count
+ /// is known to be positive, they can be eliminated, regardless of any side
+ /// effects between them.
+ ///
+ /// Also, a retain+release pair nested within another retain+release
+  /// pair, all on the same known pointer value, can be eliminated, regardless
+ /// of any intervening side effects.
+ ///
+ /// KnownSafe is true when either of these conditions is satisfied.
+ bool KnownSafe = false;
+
+  /// True if the objc_release calls are all marked with the "tail" keyword.
+ bool IsTailCallRelease = false;
+
+ /// If the Calls are objc_release calls and they all have a
+ /// clang.imprecise_release tag, this is the metadata tag.
+ MDNode *ReleaseMetadata = nullptr;
+
+ /// For a top-down sequence, the set of objc_retains or
+ /// objc_retainBlocks. For bottom-up, the set of objc_releases.
+ SmallPtrSet<Instruction *, 2> Calls;
+
+ /// The set of optimal insert positions for moving calls in the opposite
+ /// sequence.
+ SmallPtrSet<Instruction *, 2> ReverseInsertPts;
+
+ /// If this is true, we cannot perform code motion but can still remove
+ /// retain/release pairs.
+ bool CFGHazardAfflicted = false;
+
+ RRInfo() = default;
+
+ void clear();
+
+ /// Conservatively merge the two RRInfo. Returns true if a partial merge has
+ /// occurred, false otherwise.
+ bool Merge(const RRInfo &Other);
+};
+
+/// This class summarizes several per-pointer runtime properties which
+/// are propagated through the flow graph.
+class PtrState {
+protected:
+ /// True if the reference count is known to be incremented.
+ bool KnownPositiveRefCount = false;
+
+ /// True if we've seen an opportunity for partial RR elimination, such as
+ /// pushing calls into a CFG triangle or into one side of a CFG diamond.
+ bool Partial = false;
+
+ /// The current position in the sequence.
+ unsigned char Seq : 8;
+
+ /// Unidirectional information about the current sequence.
+ RRInfo RRI;
+
+ PtrState() : Seq(S_None) {}
+
+public:
+ bool IsKnownSafe() const { return RRI.KnownSafe; }
+
+ void SetKnownSafe(const bool NewValue) { RRI.KnownSafe = NewValue; }
+
+ bool IsTailCallRelease() const { return RRI.IsTailCallRelease; }
+
+ void SetTailCallRelease(const bool NewValue) {
+ RRI.IsTailCallRelease = NewValue;
+ }
+
+ bool IsTrackingImpreciseReleases() const {
+ return RRI.ReleaseMetadata != nullptr;
+ }
+
+ const MDNode *GetReleaseMetadata() const { return RRI.ReleaseMetadata; }
+
+ void SetReleaseMetadata(MDNode *NewValue) { RRI.ReleaseMetadata = NewValue; }
+
+ bool IsCFGHazardAfflicted() const { return RRI.CFGHazardAfflicted; }
+
+ void SetCFGHazardAfflicted(const bool NewValue) {
+ RRI.CFGHazardAfflicted = NewValue;
+ }
+
+ void SetKnownPositiveRefCount();
+ void ClearKnownPositiveRefCount();
+
+ bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; }
+
+ void SetSeq(Sequence NewSeq);
+
+ Sequence GetSeq() const { return static_cast<Sequence>(Seq); }
+
+ void ClearSequenceProgress() { ResetSequenceProgress(S_None); }
+
+ void ResetSequenceProgress(Sequence NewSeq);
+ void Merge(const PtrState &Other, bool TopDown);
+
+ void InsertCall(Instruction *I) { RRI.Calls.insert(I); }
+
+ void InsertReverseInsertPt(Instruction *I) { RRI.ReverseInsertPts.insert(I); }
+
+ void ClearReverseInsertPts() { RRI.ReverseInsertPts.clear(); }
+
+ bool HasReverseInsertPts() const { return !RRI.ReverseInsertPts.empty(); }
+
+ const RRInfo &GetRRInfo() const { return RRI; }
+};
+
+struct BottomUpPtrState : PtrState {
+ BottomUpPtrState() = default;
+
+ /// (Re-)Initialize this bottom up pointer returning true if we detected a
+ /// pointer with nested releases.
+ bool InitBottomUp(ARCMDKindCache &Cache, Instruction *I);
+
+  /// Return true if this set of releases can be paired with a retain. Modifies
+ /// state appropriately to reflect that the matching occurred if it is
+ /// successful.
+ ///
+ /// It is assumed that one has already checked that the RCIdentity of the
+ /// retain and the RCIdentity of this ptr state are the same.
+ bool MatchWithRetain();
+
+ void HandlePotentialUse(BasicBlock *BB, Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+struct TopDownPtrState : PtrState {
+ TopDownPtrState() = default;
+
+  /// (Re-)Initialize this top down pointer, returning true if we detected a
+  /// pointer with nested retains.
+ bool InitTopDown(ARCInstKind Kind, Instruction *I);
+
+ /// Return true if this set of retains can be paired with the given
+ /// release. Modifies state appropriately to reflect that the matching
+ /// occurred.
+ bool MatchWithRelease(ARCMDKindCache &Cache, Instruction *Release);
+
+ void HandlePotentialUse(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+
+ bool HandlePotentialAlterRefCount(Instruction *Inst, const Value *Ptr,
+ ProvenanceAnalysis &PA, ARCInstKind Class);
+};
+
+} // end namespace objcarc
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TRANSFORMS_OBJCARC_PTRSTATE_H
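
The RRInfo block restored above is the per-pointer bookkeeping that must be combined whenever two CFG paths reach the same pointer with different histories. As a rough mental model (the authoritative rules are in RRInfo::Merge in PtrState.cpp, outside this hunk, and may differ in detail), boolean safety facts merge pessimistically while the call and insertion-point sets are unioned. The simplified types below are invented solely to make that shape concrete.

// Hypothetical, simplified model of merging per-pointer info at a CFG join.
// Field names follow RRInfo, but the rules shown here are illustrative and
// may not match the exact semantics of RRInfo::Merge.
#include <cstdio>
#include <set>
#include <string>

struct ToyRRInfo {
  bool KnownSafe = false;
  bool IsTailCallRelease = false;
  bool CFGHazardAfflicted = false;
  std::string ReleaseMetadata;        // empty == no !clang.imprecise_release tag
  std::set<int> Calls;                // IDs of the retains/releases seen
  std::set<int> ReverseInsertPts;     // IDs of candidate insertion points

  // Conservatively merge Other into *this; return true if information was lost.
  bool merge(const ToyRRInfo &Other) {
    bool Partial = false;
    KnownSafe &= Other.KnownSafe;                   // safe only if both paths are safe
    IsTailCallRelease &= Other.IsTailCallRelease;
    CFGHazardAfflicted |= Other.CFGHazardAfflicted; // hazardous if either path is
    if (ReleaseMetadata != Other.ReleaseMetadata) { // tags disagree: drop the tag
      ReleaseMetadata.clear();
      Partial = true;
    }
    Calls.insert(Other.Calls.begin(), Other.Calls.end());
    ReverseInsertPts.insert(Other.ReverseInsertPts.begin(),
                            Other.ReverseInsertPts.end());
    return Partial;
  }
};

int main() {
  ToyRRInfo A, B;
  A.KnownSafe = true; A.ReleaseMetadata = "clang.imprecise_release"; A.Calls = {1};
  B.KnownSafe = true; /* B saw a precise release */                  B.Calls = {2};
  bool Partial = A.merge(B);
  std::printf("KnownSafe=%d Partial=%d Calls=%zu\n", A.KnownSafe, Partial, A.Calls.size());
}
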
diff --git a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
index 7c6fa10925..727ec42c3f 100644
--- a/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/ObjCARC/ya.make
@@ -1,43 +1,43 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/ObjCARC
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- DependencyAnalysis.cpp
- ObjCARC.cpp
- ObjCARCAPElim.cpp
- ObjCARCContract.cpp
- ObjCARCExpand.cpp
- ObjCARCOpts.cpp
- ProvenanceAnalysis.cpp
- ProvenanceAnalysisEvaluator.cpp
- PtrState.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ DependencyAnalysis.cpp
+ ObjCARC.cpp
+ ObjCARCAPElim.cpp
+ ObjCARCContract.cpp
+ ObjCARCExpand.cpp
+ ObjCARCOpts.cpp
+ ProvenanceAnalysis.cpp
+ ProvenanceAnalysisEvaluator.cpp
+ PtrState.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
index 768fab6e7d..ce4e5e575f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ADCE.cpp
@@ -1,747 +1,747 @@
-//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Aggressive Dead Code Elimination pass. This pass
-// optimistically assumes that all instructions are dead until proven otherwise,
-// allowing it to eliminate dead computations that other DCE passes do not
-// catch, particularly involving loop computations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/ADCE.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include <cassert>
-#include <cstddef>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "adce"
-
-STATISTIC(NumRemoved, "Number of instructions removed");
-STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
-
-// This is a temporary option until we change the interface to this pass based
-// on optimization level.
-static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
- cl::init(true), cl::Hidden);
-
-// This option enables removing of may-be-infinite loops which have no other
-// effect.
-static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
- cl::Hidden);
-
-namespace {
-
-/// Information about Instructions
-struct InstInfoType {
- /// True if the associated instruction is live.
- bool Live = false;
-
- /// Quick access to information for block containing associated Instruction.
- struct BlockInfoType *Block = nullptr;
-};
-
-/// Information about basic blocks relevant to dead code elimination.
-struct BlockInfoType {
-  /// True when this block contains a live instruction.
- bool Live = false;
-
- /// True when this block ends in an unconditional branch.
- bool UnconditionalBranch = false;
-
- /// True when this block is known to have live PHI nodes.
- bool HasLivePhiNodes = false;
-
- /// Control dependence sources need to be live for this block.
- bool CFLive = false;
-
- /// Quick access to the LiveInfo for the terminator,
- /// holds the value &InstInfo[Terminator]
- InstInfoType *TerminatorLiveInfo = nullptr;
-
- /// Corresponding BasicBlock.
- BasicBlock *BB = nullptr;
-
- /// Cache of BB->getTerminator().
- Instruction *Terminator = nullptr;
-
- /// Post-order numbering of reverse control flow graph.
- unsigned PostOrder;
-
- bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
-};
-
-class AggressiveDeadCodeElimination {
- Function &F;
-
- // ADCE does not use DominatorTree per se, but it updates it to preserve the
- // analysis.
- DominatorTree *DT;
- PostDominatorTree &PDT;
-
- /// Mapping of blocks to associated information, an element in BlockInfoVec.
- /// Use MapVector to get deterministic iteration order.
- MapVector<BasicBlock *, BlockInfoType> BlockInfo;
- bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
-
- /// Mapping of instructions to associated information.
- DenseMap<Instruction *, InstInfoType> InstInfo;
- bool isLive(Instruction *I) { return InstInfo[I].Live; }
-
- /// Instructions known to be live where we need to mark
- /// reaching definitions as live.
- SmallVector<Instruction *, 128> Worklist;
-
- /// Debug info scopes around a live instruction.
- SmallPtrSet<const Metadata *, 32> AliveScopes;
-
-  /// Set of blocks not known to have live terminators.
- SmallSetVector<BasicBlock *, 16> BlocksWithDeadTerminators;
-
-  /// The set of blocks whose control dependence sources we have
-  /// determined must be live and which have not yet had those
-  /// dependences analyzed.
- SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
-
- /// Set up auxiliary data structures for Instructions and BasicBlocks and
-  /// initialize the Worklist to the set of must-be-live Instructions.
- void initialize();
-
- /// Return true for operations which are always treated as live.
- bool isAlwaysLive(Instruction &I);
-
- /// Return true for instrumentation instructions for value profiling.
- bool isInstrumentsConstant(Instruction &I);
-
- /// Propagate liveness to reaching definitions.
- void markLiveInstructions();
-
- /// Mark an instruction as live.
- void markLive(Instruction *I);
-
- /// Mark a block as live.
- void markLive(BlockInfoType &BB);
- void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
-
- /// Mark terminators of control predecessors of a PHI node live.
- void markPhiLive(PHINode *PN);
-
- /// Record the Debug Scopes which surround live debug information.
- void collectLiveScopes(const DILocalScope &LS);
- void collectLiveScopes(const DILocation &DL);
-
- /// Analyze dead branches to find those whose branches are the sources
- /// of control dependences impacting a live block. Those branches are
- /// marked live.
- void markLiveBranchesFromControlDependences();
-
-  /// Remove instructions not marked live; return true if any instruction was
- /// removed.
- bool removeDeadInstructions();
-
- /// Identify connected sections of the control flow graph which have
- /// dead terminators and rewrite the control flow graph to remove them.
- bool updateDeadRegions();
-
- /// Set the BlockInfo::PostOrder field based on a post-order
- /// numbering of the reverse control flow graph.
- void computeReversePostOrder();
-
- /// Make the terminator of this block an unconditional branch to \p Target.
- void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
-
-public:
- AggressiveDeadCodeElimination(Function &F, DominatorTree *DT,
- PostDominatorTree &PDT)
- : F(F), DT(DT), PDT(PDT) {}
-
- bool performDeadCodeElimination();
-};
-
-} // end anonymous namespace
-
-bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
- initialize();
- markLiveInstructions();
- return removeDeadInstructions();
-}
-
-static bool isUnconditionalBranch(Instruction *Term) {
- auto *BR = dyn_cast<BranchInst>(Term);
- return BR && BR->isUnconditional();
-}
-
-void AggressiveDeadCodeElimination::initialize() {
- auto NumBlocks = F.size();
-
- // We will have an entry in the map for each block so we grow the
- // structure to twice that size to keep the load factor low in the hash table.
- BlockInfo.reserve(NumBlocks);
- size_t NumInsts = 0;
-
- // Iterate over blocks and initialize BlockInfoVec entries, count
- // instructions to size the InstInfo hash table.
- for (auto &BB : F) {
- NumInsts += BB.size();
- auto &Info = BlockInfo[&BB];
- Info.BB = &BB;
- Info.Terminator = BB.getTerminator();
- Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
- }
-
- // Initialize instruction map and set pointers to block info.
- InstInfo.reserve(NumInsts);
- for (auto &BBInfo : BlockInfo)
- for (Instruction &I : *BBInfo.second.BB)
- InstInfo[&I].Block = &BBInfo.second;
-
- // Since BlockInfoVec holds pointers into InstInfo and vice-versa, we may not
- // add any more elements to either after this point.
- for (auto &BBInfo : BlockInfo)
- BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
-
- // Collect the set of "root" instructions that are known live.
- for (Instruction &I : instructions(F))
- if (isAlwaysLive(I))
- markLive(&I);
-
- if (!RemoveControlFlowFlag)
- return;
-
- if (!RemoveLoops) {
- // This stores state for the depth-first iterator. In addition
- // to recording which nodes have been visited we also record whether
- // a node is currently on the "stack" of active ancestors of the current
- // node.
- using StatusMap = DenseMap<BasicBlock *, bool>;
-
- class DFState : public StatusMap {
- public:
- std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
- return StatusMap::insert(std::make_pair(BB, true));
- }
-
- // Invoked after we have visited all children of a node.
- void completed(BasicBlock *BB) { (*this)[BB] = false; }
-
- // Return true if \p BB is currently on the active stack
- // of ancestors.
- bool onStack(BasicBlock *BB) {
- auto Iter = find(BB);
- return Iter != end() && Iter->second;
- }
- } State;
-
- State.reserve(F.size());
- // Iterate over blocks in depth-first pre-order and
- // treat all edges to a block already seen as loop back edges
-    // and mark the branch live if there is a back edge.
- for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
- Instruction *Term = BB->getTerminator();
- if (isLive(Term))
- continue;
-
- for (auto *Succ : successors(BB))
- if (State.onStack(Succ)) {
- // back edge....
- markLive(Term);
- break;
- }
- }
- }
-
- // Mark blocks live if there is no path from the block to a
- // return of the function.
- // We do this by seeing which of the postdomtree root children exit the
- // program, and for all others, mark the subtree live.
- for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
- auto *BB = PDTChild->getBlock();
- auto &Info = BlockInfo[BB];
- // Real function return
- if (isa<ReturnInst>(Info.Terminator)) {
- LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
- << '\n';);
- continue;
- }
-
- // This child is something else, like an infinite loop.
- for (auto DFNode : depth_first(PDTChild))
- markLive(BlockInfo[DFNode->getBlock()].Terminator);
- }
-
- // Treat the entry block as always live
- auto *BB = &F.getEntryBlock();
- auto &EntryInfo = BlockInfo[BB];
- EntryInfo.Live = true;
- if (EntryInfo.UnconditionalBranch)
- markLive(EntryInfo.Terminator);
-
- // Build initial collection of blocks with dead terminators
- for (auto &BBInfo : BlockInfo)
- if (!BBInfo.second.terminatorIsLive())
- BlocksWithDeadTerminators.insert(BBInfo.second.BB);
-}
-
-bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
- // TODO -- use llvm::isInstructionTriviallyDead
+//===- ADCE.cpp - Code to perform dead code elimination -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Aggressive Dead Code Elimination pass. This pass
+// optimistically assumes that all instructions are dead until proven otherwise,
+// allowing it to eliminate dead computations that other DCE passes do not
+// catch, particularly involving loop computations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/ADCE.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <cassert>
+#include <cstddef>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "adce"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
+
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
+static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
+ cl::init(true), cl::Hidden);
+
+// This option enables removing of may-be-infinite loops which have no other
+// effect.
+static cl::opt<bool> RemoveLoops("adce-remove-loops", cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+/// Information about Instructions
+struct InstInfoType {
+ /// True if the associated instruction is live.
+ bool Live = false;
+
+ /// Quick access to information for block containing associated Instruction.
+ struct BlockInfoType *Block = nullptr;
+};
+
+/// Information about basic blocks relevant to dead code elimination.
+struct BlockInfoType {
+  /// True when this block contains a live instruction.
+ bool Live = false;
+
+ /// True when this block ends in an unconditional branch.
+ bool UnconditionalBranch = false;
+
+ /// True when this block is known to have live PHI nodes.
+ bool HasLivePhiNodes = false;
+
+ /// Control dependence sources need to be live for this block.
+ bool CFLive = false;
+
+ /// Quick access to the LiveInfo for the terminator,
+ /// holds the value &InstInfo[Terminator]
+ InstInfoType *TerminatorLiveInfo = nullptr;
+
+ /// Corresponding BasicBlock.
+ BasicBlock *BB = nullptr;
+
+ /// Cache of BB->getTerminator().
+ Instruction *Terminator = nullptr;
+
+ /// Post-order numbering of reverse control flow graph.
+ unsigned PostOrder;
+
+ bool terminatorIsLive() const { return TerminatorLiveInfo->Live; }
+};
+
+class AggressiveDeadCodeElimination {
+ Function &F;
+
+ // ADCE does not use DominatorTree per se, but it updates it to preserve the
+ // analysis.
+ DominatorTree *DT;
+ PostDominatorTree &PDT;
+
+ /// Mapping of blocks to associated information, an element in BlockInfoVec.
+ /// Use MapVector to get deterministic iteration order.
+ MapVector<BasicBlock *, BlockInfoType> BlockInfo;
+ bool isLive(BasicBlock *BB) { return BlockInfo[BB].Live; }
+
+ /// Mapping of instructions to associated information.
+ DenseMap<Instruction *, InstInfoType> InstInfo;
+ bool isLive(Instruction *I) { return InstInfo[I].Live; }
+
+ /// Instructions known to be live where we need to mark
+ /// reaching definitions as live.
+ SmallVector<Instruction *, 128> Worklist;
+
+ /// Debug info scopes around a live instruction.
+ SmallPtrSet<const Metadata *, 32> AliveScopes;
+
+  /// Set of blocks not known to have live terminators.
+ SmallSetVector<BasicBlock *, 16> BlocksWithDeadTerminators;
+
+  /// The set of blocks whose control dependence sources we have
+  /// determined must be live and which have not yet had those
+  /// dependences analyzed.
+ SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
+
+ /// Set up auxiliary data structures for Instructions and BasicBlocks and
+  /// initialize the Worklist to the set of must-be-live Instructions.
+ void initialize();
+
+ /// Return true for operations which are always treated as live.
+ bool isAlwaysLive(Instruction &I);
+
+ /// Return true for instrumentation instructions for value profiling.
+ bool isInstrumentsConstant(Instruction &I);
+
+ /// Propagate liveness to reaching definitions.
+ void markLiveInstructions();
+
+ /// Mark an instruction as live.
+ void markLive(Instruction *I);
+
+ /// Mark a block as live.
+ void markLive(BlockInfoType &BB);
+ void markLive(BasicBlock *BB) { markLive(BlockInfo[BB]); }
+
+ /// Mark terminators of control predecessors of a PHI node live.
+ void markPhiLive(PHINode *PN);
+
+ /// Record the Debug Scopes which surround live debug information.
+ void collectLiveScopes(const DILocalScope &LS);
+ void collectLiveScopes(const DILocation &DL);
+
+ /// Analyze dead branches to find those whose branches are the sources
+ /// of control dependences impacting a live block. Those branches are
+ /// marked live.
+ void markLiveBranchesFromControlDependences();
+
+  /// Remove instructions not marked live; return true if any instruction was
+ /// removed.
+ bool removeDeadInstructions();
+
+ /// Identify connected sections of the control flow graph which have
+ /// dead terminators and rewrite the control flow graph to remove them.
+ bool updateDeadRegions();
+
+ /// Set the BlockInfo::PostOrder field based on a post-order
+ /// numbering of the reverse control flow graph.
+ void computeReversePostOrder();
+
+ /// Make the terminator of this block an unconditional branch to \p Target.
+ void makeUnconditional(BasicBlock *BB, BasicBlock *Target);
+
+public:
+ AggressiveDeadCodeElimination(Function &F, DominatorTree *DT,
+ PostDominatorTree &PDT)
+ : F(F), DT(DT), PDT(PDT) {}
+
+ bool performDeadCodeElimination();
+};
+
+} // end anonymous namespace
+
+bool AggressiveDeadCodeElimination::performDeadCodeElimination() {
+ initialize();
+ markLiveInstructions();
+ return removeDeadInstructions();
+}
+
+static bool isUnconditionalBranch(Instruction *Term) {
+ auto *BR = dyn_cast<BranchInst>(Term);
+ return BR && BR->isUnconditional();
+}
+
+void AggressiveDeadCodeElimination::initialize() {
+ auto NumBlocks = F.size();
+
+ // We will have an entry in the map for each block so we grow the
+ // structure to twice that size to keep the load factor low in the hash table.
+ BlockInfo.reserve(NumBlocks);
+ size_t NumInsts = 0;
+
+ // Iterate over blocks and initialize BlockInfoVec entries, count
+ // instructions to size the InstInfo hash table.
+ for (auto &BB : F) {
+ NumInsts += BB.size();
+ auto &Info = BlockInfo[&BB];
+ Info.BB = &BB;
+ Info.Terminator = BB.getTerminator();
+ Info.UnconditionalBranch = isUnconditionalBranch(Info.Terminator);
+ }
+
+ // Initialize instruction map and set pointers to block info.
+ InstInfo.reserve(NumInsts);
+ for (auto &BBInfo : BlockInfo)
+ for (Instruction &I : *BBInfo.second.BB)
+ InstInfo[&I].Block = &BBInfo.second;
+
+ // Since BlockInfoVec holds pointers into InstInfo and vice-versa, we may not
+ // add any more elements to either after this point.
+ for (auto &BBInfo : BlockInfo)
+ BBInfo.second.TerminatorLiveInfo = &InstInfo[BBInfo.second.Terminator];
+
+ // Collect the set of "root" instructions that are known live.
+ for (Instruction &I : instructions(F))
+ if (isAlwaysLive(I))
+ markLive(&I);
+
+ if (!RemoveControlFlowFlag)
+ return;
+
+ if (!RemoveLoops) {
+ // This stores state for the depth-first iterator. In addition
+ // to recording which nodes have been visited we also record whether
+ // a node is currently on the "stack" of active ancestors of the current
+ // node.
+ using StatusMap = DenseMap<BasicBlock *, bool>;
+
+ class DFState : public StatusMap {
+ public:
+ std::pair<StatusMap::iterator, bool> insert(BasicBlock *BB) {
+ return StatusMap::insert(std::make_pair(BB, true));
+ }
+
+ // Invoked after we have visited all children of a node.
+ void completed(BasicBlock *BB) { (*this)[BB] = false; }
+
+ // Return true if \p BB is currently on the active stack
+ // of ancestors.
+ bool onStack(BasicBlock *BB) {
+ auto Iter = find(BB);
+ return Iter != end() && Iter->second;
+ }
+ } State;
+
+ State.reserve(F.size());
+ // Iterate over blocks in depth-first pre-order and
+ // treat all edges to a block already seen as loop back edges
+    // and mark the branch live if there is a back edge.
+ for (auto *BB: depth_first_ext(&F.getEntryBlock(), State)) {
+ Instruction *Term = BB->getTerminator();
+ if (isLive(Term))
+ continue;
+
+ for (auto *Succ : successors(BB))
+ if (State.onStack(Succ)) {
+ // back edge....
+ markLive(Term);
+ break;
+ }
+ }
+ }
+
+ // Mark blocks live if there is no path from the block to a
+ // return of the function.
+ // We do this by seeing which of the postdomtree root children exit the
+ // program, and for all others, mark the subtree live.
+ for (auto &PDTChild : children<DomTreeNode *>(PDT.getRootNode())) {
+ auto *BB = PDTChild->getBlock();
+ auto &Info = BlockInfo[BB];
+ // Real function return
+ if (isa<ReturnInst>(Info.Terminator)) {
+ LLVM_DEBUG(dbgs() << "post-dom root child is a return: " << BB->getName()
+ << '\n';);
+ continue;
+ }
+
+ // This child is something else, like an infinite loop.
+ for (auto DFNode : depth_first(PDTChild))
+ markLive(BlockInfo[DFNode->getBlock()].Terminator);
+ }
+
+ // Treat the entry block as always live
+ auto *BB = &F.getEntryBlock();
+ auto &EntryInfo = BlockInfo[BB];
+ EntryInfo.Live = true;
+ if (EntryInfo.UnconditionalBranch)
+ markLive(EntryInfo.Terminator);
+
+ // Build initial collection of blocks with dead terminators
+ for (auto &BBInfo : BlockInfo)
+ if (!BBInfo.second.terminatorIsLive())
+ BlocksWithDeadTerminators.insert(BBInfo.second.BB);
+}
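
When loop removal is disabled (the default for -adce-remove-loops), initialize() keeps loop-closing branches alive by walking the CFG depth-first and treating any edge into a block that is still on the DFS stack as a back edge. Reduced to a self-contained sketch over a hypothetical adjacency-list graph, rather than LLVM's depth_first_ext and DFState machinery, the check looks roughly like this:

// Sketch of the back-edge test used to keep loop branches alive.
// The adjacency-list Graph is invented for illustration only.
#include <cstdio>
#include <vector>

using Graph = std::vector<std::vector<int>>; // successor lists per block

static void dfs(const Graph &G, int BB, std::vector<int> &Color,
                std::vector<bool> &BranchForcedLive) {
  Color[BB] = 1; // on the active DFS stack
  for (int Succ : G[BB]) {
    if (Color[Succ] == 1)
      BranchForcedLive[BB] = true; // edge to an ancestor: a back edge
    else if (Color[Succ] == 0)
      dfs(G, Succ, Color, BranchForcedLive);
  }
  Color[BB] = 2; // finished
}

int main() {
  // Blocks: 0 -> 1 -> 2 -> {1, 3}; the edge 2 -> 1 closes a loop.
  Graph G = {{1}, {2}, {1, 3}, {}};
  std::vector<int> Color(G.size(), 0);
  std::vector<bool> Live(G.size(), false);
  dfs(G, 0, Color, Live);
  for (unsigned BB = 0; BB < G.size(); ++BB)
    std::printf("block %u terminator forced live: %s\n", BB, Live[BB] ? "yes" : "no");
}
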
+
+bool AggressiveDeadCodeElimination::isAlwaysLive(Instruction &I) {
+ // TODO -- use llvm::isInstructionTriviallyDead
if (I.isEHPad() || I.mayHaveSideEffects() || !I.willReturn()) {
- // Skip any value profile instrumentation calls if they are
- // instrumenting constants.
- if (isInstrumentsConstant(I))
- return false;
- return true;
- }
- if (!I.isTerminator())
- return false;
- if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
- return false;
- return true;
-}
-
-// Check if this instruction is a runtime call for value profiling and
-// if it's instrumenting a constant.
-bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
- // TODO -- move this test into llvm::isInstructionTriviallyDead
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *Callee = CI->getCalledFunction())
- if (Callee->getName().equals(getInstrProfValueProfFuncName()))
- if (isa<Constant>(CI->getArgOperand(0)))
- return true;
- return false;
-}
-
-void AggressiveDeadCodeElimination::markLiveInstructions() {
- // Propagate liveness backwards to operands.
- do {
- // Worklist holds newly discovered live instructions
- // where we need to mark the inputs as live.
- while (!Worklist.empty()) {
- Instruction *LiveInst = Worklist.pop_back_val();
- LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
-
- for (Use &OI : LiveInst->operands())
- if (Instruction *Inst = dyn_cast<Instruction>(OI))
- markLive(Inst);
-
- if (auto *PN = dyn_cast<PHINode>(LiveInst))
- markPhiLive(PN);
- }
-
- // After data flow liveness has been identified, examine which branch
-    // decisions are required to ensure that live instructions are executed.
- markLiveBranchesFromControlDependences();
-
- } while (!Worklist.empty());
-}
-
-void AggressiveDeadCodeElimination::markLive(Instruction *I) {
- auto &Info = InstInfo[I];
- if (Info.Live)
- return;
-
- LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
- Info.Live = true;
- Worklist.push_back(I);
-
- // Collect the live debug info scopes attached to this instruction.
- if (const DILocation *DL = I->getDebugLoc())
- collectLiveScopes(*DL);
-
- // Mark the containing block live
- auto &BBInfo = *Info.Block;
- if (BBInfo.Terminator == I) {
- BlocksWithDeadTerminators.remove(BBInfo.BB);
- // For live terminators, mark destination blocks
-    // live to preserve these control flow edges.
- if (!BBInfo.UnconditionalBranch)
- for (auto *BB : successors(I->getParent()))
- markLive(BB);
- }
- markLive(BBInfo);
-}
-
-void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
- if (BBInfo.Live)
- return;
- LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
- BBInfo.Live = true;
- if (!BBInfo.CFLive) {
- BBInfo.CFLive = true;
- NewLiveBlocks.insert(BBInfo.BB);
- }
-
- // Mark unconditional branches at the end of live
- // blocks as live since there is no work to do for them later
- if (BBInfo.UnconditionalBranch)
- markLive(BBInfo.Terminator);
-}
-
-void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
- if (!AliveScopes.insert(&LS).second)
- return;
-
- if (isa<DISubprogram>(LS))
- return;
-
- // Tail-recurse through the scope chain.
- collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
-}
-
-void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
- // Even though DILocations are not scopes, shove them into AliveScopes so we
- // don't revisit them.
- if (!AliveScopes.insert(&DL).second)
- return;
-
- // Collect live scopes from the scope chain.
- collectLiveScopes(*DL.getScope());
-
- // Tail-recurse through the inlined-at chain.
- if (const DILocation *IA = DL.getInlinedAt())
- collectLiveScopes(*IA);
-}
-
-void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
- auto &Info = BlockInfo[PN->getParent()];
- // Only need to check this once per block.
- if (Info.HasLivePhiNodes)
- return;
- Info.HasLivePhiNodes = true;
-
- // If a predecessor block is not live, mark it as control-flow live
- // which will trigger marking live branches upon which
- // that block is control dependent.
- for (auto *PredBB : predecessors(Info.BB)) {
- auto &Info = BlockInfo[PredBB];
- if (!Info.CFLive) {
- Info.CFLive = true;
- NewLiveBlocks.insert(PredBB);
- }
- }
-}
-
-void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
- if (BlocksWithDeadTerminators.empty())
- return;
-
- LLVM_DEBUG({
- dbgs() << "new live blocks:\n";
- for (auto *BB : NewLiveBlocks)
- dbgs() << "\t" << BB->getName() << '\n';
- dbgs() << "dead terminator blocks:\n";
- for (auto *BB : BlocksWithDeadTerminators)
- dbgs() << "\t" << BB->getName() << '\n';
- });
-
- // The dominance frontier of a live block X in the reverse
- // control graph is the set of blocks upon which X is control
- // dependent. The following sequence computes the set of blocks
- // which currently have dead terminators that are control
- // dependence sources of a block which is in NewLiveBlocks.
-
- const SmallPtrSet<BasicBlock *, 16> BWDT{
- BlocksWithDeadTerminators.begin(),
- BlocksWithDeadTerminators.end()
- };
- SmallVector<BasicBlock *, 32> IDFBlocks;
- ReverseIDFCalculator IDFs(PDT);
- IDFs.setDefiningBlocks(NewLiveBlocks);
- IDFs.setLiveInBlocks(BWDT);
- IDFs.calculate(IDFBlocks);
- NewLiveBlocks.clear();
-
- // Dead terminators which control live blocks are now marked live.
- for (auto *BB : IDFBlocks) {
- LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
- markLive(BB->getTerminator());
- }
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Routines to update the CFG and SSA information before removing dead code.
-//
-//===----------------------------------------------------------------------===//
-bool AggressiveDeadCodeElimination::removeDeadInstructions() {
- // Updates control and dataflow around dead blocks
- bool RegionsUpdated = updateDeadRegions();
-
- LLVM_DEBUG({
- for (Instruction &I : instructions(F)) {
- // Check if the instruction is alive.
- if (isLive(&I))
- continue;
-
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
- // Check if the scope of this variable location is alive.
- if (AliveScopes.count(DII->getDebugLoc()->getScope()))
- continue;
-
- // If intrinsic is pointing at a live SSA value, there may be an
- // earlier optimization bug: if we know the location of the variable,
- // why isn't the scope of the location alive?
- if (Value *V = DII->getVariableLocation())
- if (Instruction *II = dyn_cast<Instruction>(V))
- if (isLive(II))
- dbgs() << "Dropping debug info for " << *DII << "\n";
- }
- }
- });
-
- // The inverse of the live set is the dead set. These are those instructions
- // that have no side effects and do not influence the control flow or return
- // value of the function, and may therefore be deleted safely.
- // NOTE: We reuse the Worklist vector here for memory efficiency.
- for (Instruction &I : instructions(F)) {
- // Check if the instruction is alive.
- if (isLive(&I))
- continue;
-
- if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
- // Check if the scope of this variable location is alive.
- if (AliveScopes.count(DII->getDebugLoc()->getScope()))
- continue;
-
- // Fallthrough and drop the intrinsic.
- }
-
- // Prepare to delete.
- Worklist.push_back(&I);
- I.dropAllReferences();
- }
-
- for (Instruction *&I : Worklist) {
- ++NumRemoved;
- I->eraseFromParent();
- }
-
- return !Worklist.empty() || RegionsUpdated;
-}
-
-// A dead region is the set of dead blocks with a common live post-dominator.
-bool AggressiveDeadCodeElimination::updateDeadRegions() {
- LLVM_DEBUG({
- dbgs() << "final dead terminator blocks: " << '\n';
- for (auto *BB : BlocksWithDeadTerminators)
- dbgs() << '\t' << BB->getName()
- << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
- });
-
- // Don't compute the post ordering unless we needed it.
- bool HavePostOrder = false;
- bool Changed = false;
-
- for (auto *BB : BlocksWithDeadTerminators) {
- auto &Info = BlockInfo[BB];
- if (Info.UnconditionalBranch) {
- InstInfo[Info.Terminator].Live = true;
- continue;
- }
-
- if (!HavePostOrder) {
- computeReversePostOrder();
- HavePostOrder = true;
- }
-
- // Add an unconditional branch to the successor closest to the
-    // end of the function, which ensures a path to the exit for each
- // live edge.
- BlockInfoType *PreferredSucc = nullptr;
- for (auto *Succ : successors(BB)) {
- auto *Info = &BlockInfo[Succ];
- if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
- PreferredSucc = Info;
- }
- assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
- "Failed to find safe successor for dead branch");
-
- // Collect removed successors to update the (Post)DominatorTrees.
- SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
- bool First = true;
- for (auto *Succ : successors(BB)) {
- if (!First || Succ != PreferredSucc->BB) {
- Succ->removePredecessor(BB);
- RemovedSuccessors.insert(Succ);
- } else
- First = false;
- }
- makeUnconditional(BB, PreferredSucc->BB);
-
- // Inform the dominators about the deleted CFG edges.
- SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
- for (auto *Succ : RemovedSuccessors) {
- // It might have happened that the same successor appeared multiple times
- // and the CFG edge wasn't really removed.
- if (Succ != PreferredSucc->BB) {
-        LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion "
- << BB->getName() << " -> " << Succ->getName()
- << "\n");
- DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
- }
- }
-
- DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
- .applyUpdates(DeletedEdges);
-
- NumBranchesRemoved += 1;
- Changed = true;
- }
-
- return Changed;
-}
-
-// reverse top-sort order
-void AggressiveDeadCodeElimination::computeReversePostOrder() {
-  // This provides a post-order numbering of the reverse control flow graph.
-  // Note that it is incomplete in the presence of infinite loops, but we don't
-  // need to number blocks which don't reach the end of the function since
-  // all branches in those blocks are forced live.
-
- // For each block without successors, extend the DFS from the block
- // backward through the graph
- SmallPtrSet<BasicBlock*, 16> Visited;
- unsigned PostOrder = 0;
- for (auto &BB : F) {
+ // Skip any value profile instrumentation calls if they are
+ // instrumenting constants.
+ if (isInstrumentsConstant(I))
+ return false;
+ return true;
+ }
+ if (!I.isTerminator())
+ return false;
+ if (RemoveControlFlowFlag && (isa<BranchInst>(I) || isa<SwitchInst>(I)))
+ return false;
+ return true;
+}
+
+// Check if this instruction is a runtime call for value profiling and
+// if it's instrumenting a constant.
+bool AggressiveDeadCodeElimination::isInstrumentsConstant(Instruction &I) {
+ // TODO -- move this test into llvm::isInstructionTriviallyDead
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ if (Function *Callee = CI->getCalledFunction())
+ if (Callee->getName().equals(getInstrProfValueProfFuncName()))
+ if (isa<Constant>(CI->getArgOperand(0)))
+ return true;
+ return false;
+}
+
+void AggressiveDeadCodeElimination::markLiveInstructions() {
+ // Propagate liveness backwards to operands.
+ do {
+    // The Worklist holds newly discovered live instructions
+    // whose operands still need to be marked live.
+ while (!Worklist.empty()) {
+ Instruction *LiveInst = Worklist.pop_back_val();
+ LLVM_DEBUG(dbgs() << "work live: "; LiveInst->dump(););
+
+ for (Use &OI : LiveInst->operands())
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ markLive(Inst);
+
+ if (auto *PN = dyn_cast<PHINode>(LiveInst))
+ markPhiLive(PN);
+ }
+
+ // After data flow liveness has been identified, examine which branch
+    // decisions are required to ensure that live instructions are executed.
+ markLiveBranchesFromControlDependences();
+
+ } while (!Worklist.empty());
+}
+
+void AggressiveDeadCodeElimination::markLive(Instruction *I) {
+ auto &Info = InstInfo[I];
+ if (Info.Live)
+ return;
+
+ LLVM_DEBUG(dbgs() << "mark live: "; I->dump());
+ Info.Live = true;
+ Worklist.push_back(I);
+
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = I->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Mark the containing block live
+ auto &BBInfo = *Info.Block;
+ if (BBInfo.Terminator == I) {
+ BlocksWithDeadTerminators.remove(BBInfo.BB);
+ // For live terminators, mark destination blocks
+    // live to preserve these control flow edges.
+ if (!BBInfo.UnconditionalBranch)
+ for (auto *BB : successors(I->getParent()))
+ markLive(BB);
+ }
+ markLive(BBInfo);
+}
+
+void AggressiveDeadCodeElimination::markLive(BlockInfoType &BBInfo) {
+ if (BBInfo.Live)
+ return;
+ LLVM_DEBUG(dbgs() << "mark block live: " << BBInfo.BB->getName() << '\n');
+ BBInfo.Live = true;
+ if (!BBInfo.CFLive) {
+ BBInfo.CFLive = true;
+ NewLiveBlocks.insert(BBInfo.BB);
+ }
+
+ // Mark unconditional branches at the end of live
+ // blocks as live since there is no work to do for them later
+ if (BBInfo.UnconditionalBranch)
+ markLive(BBInfo.Terminator);
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocalScope &LS) {
+ if (!AliveScopes.insert(&LS).second)
+ return;
+
+ if (isa<DISubprogram>(LS))
+ return;
+
+ // Tail-recurse through the scope chain.
+ collectLiveScopes(cast<DILocalScope>(*LS.getScope()));
+}
+
+void AggressiveDeadCodeElimination::collectLiveScopes(const DILocation &DL) {
+ // Even though DILocations are not scopes, shove them into AliveScopes so we
+ // don't revisit them.
+ if (!AliveScopes.insert(&DL).second)
+ return;
+
+ // Collect live scopes from the scope chain.
+ collectLiveScopes(*DL.getScope());
+
+ // Tail-recurse through the inlined-at chain.
+ if (const DILocation *IA = DL.getInlinedAt())
+ collectLiveScopes(*IA);
+}
+
+void AggressiveDeadCodeElimination::markPhiLive(PHINode *PN) {
+ auto &Info = BlockInfo[PN->getParent()];
+ // Only need to check this once per block.
+ if (Info.HasLivePhiNodes)
+ return;
+ Info.HasLivePhiNodes = true;
+
+ // If a predecessor block is not live, mark it as control-flow live
+ // which will trigger marking live branches upon which
+ // that block is control dependent.
+ for (auto *PredBB : predecessors(Info.BB)) {
+ auto &Info = BlockInfo[PredBB];
+ if (!Info.CFLive) {
+ Info.CFLive = true;
+ NewLiveBlocks.insert(PredBB);
+ }
+ }
+}
+
+void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
+ if (BlocksWithDeadTerminators.empty())
+ return;
+
+ LLVM_DEBUG({
+ dbgs() << "new live blocks:\n";
+ for (auto *BB : NewLiveBlocks)
+ dbgs() << "\t" << BB->getName() << '\n';
+ dbgs() << "dead terminator blocks:\n";
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << "\t" << BB->getName() << '\n';
+ });
+
+ // The dominance frontier of a live block X in the reverse
+ // control graph is the set of blocks upon which X is control
+ // dependent. The following sequence computes the set of blocks
+ // which currently have dead terminators that are control
+ // dependence sources of a block which is in NewLiveBlocks.
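+  // For illustration: if the 'then' block of an if-statement becomes live, the
+  // block holding the conditional branch lies in the reverse dominance frontier
+  // of that block, so it is found here and its branch is marked live.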
+
+ const SmallPtrSet<BasicBlock *, 16> BWDT{
+ BlocksWithDeadTerminators.begin(),
+ BlocksWithDeadTerminators.end()
+ };
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ ReverseIDFCalculator IDFs(PDT);
+ IDFs.setDefiningBlocks(NewLiveBlocks);
+ IDFs.setLiveInBlocks(BWDT);
+ IDFs.calculate(IDFBlocks);
+ NewLiveBlocks.clear();
+
+ // Dead terminators which control live blocks are now marked live.
+ for (auto *BB : IDFBlocks) {
+ LLVM_DEBUG(dbgs() << "live control in: " << BB->getName() << '\n');
+ markLive(BB->getTerminator());
+ }
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Routines to update the CFG and SSA information before removing dead code.
+//
+//===----------------------------------------------------------------------===//
+bool AggressiveDeadCodeElimination::removeDeadInstructions() {
+ // Updates control and dataflow around dead blocks
+ bool RegionsUpdated = updateDeadRegions();
+
+ LLVM_DEBUG({
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // If intrinsic is pointing at a live SSA value, there may be an
+ // earlier optimization bug: if we know the location of the variable,
+ // why isn't the scope of the location alive?
+ if (Value *V = DII->getVariableLocation())
+ if (Instruction *II = dyn_cast<Instruction>(V))
+ if (isLive(II))
+ dbgs() << "Dropping debug info for " << *DII << "\n";
+ }
+ }
+ });
+
+ // The inverse of the live set is the dead set. These are those instructions
+ // that have no side effects and do not influence the control flow or return
+ // value of the function, and may therefore be deleted safely.
+ // NOTE: We reuse the Worklist vector here for memory efficiency.
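+  // Dropping all references first lets dead instructions that refer to one
+  // another be erased below in any order.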
+ for (Instruction &I : instructions(F)) {
+ // Check if the instruction is alive.
+ if (isLive(&I))
+ continue;
+
+ if (auto *DII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+ // Check if the scope of this variable location is alive.
+ if (AliveScopes.count(DII->getDebugLoc()->getScope()))
+ continue;
+
+ // Fallthrough and drop the intrinsic.
+ }
+
+ // Prepare to delete.
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return !Worklist.empty() || RegionsUpdated;
+}
+
+// A dead region is the set of dead blocks with a common live post-dominator.
+bool AggressiveDeadCodeElimination::updateDeadRegions() {
+ LLVM_DEBUG({
+ dbgs() << "final dead terminator blocks: " << '\n';
+ for (auto *BB : BlocksWithDeadTerminators)
+ dbgs() << '\t' << BB->getName()
+ << (BlockInfo[BB].Live ? " LIVE\n" : "\n");
+ });
+
+ // Don't compute the post ordering unless we needed it.
+ bool HavePostOrder = false;
+ bool Changed = false;
+
+ for (auto *BB : BlocksWithDeadTerminators) {
+ auto &Info = BlockInfo[BB];
+ if (Info.UnconditionalBranch) {
+ InstInfo[Info.Terminator].Live = true;
+ continue;
+ }
+
+ if (!HavePostOrder) {
+ computeReversePostOrder();
+ HavePostOrder = true;
+ }
+
+ // Add an unconditional branch to the successor closest to the
+    // end of the function which ensures a path to the exit for each
+ // live edge.
+ BlockInfoType *PreferredSucc = nullptr;
+ for (auto *Succ : successors(BB)) {
+ auto *Info = &BlockInfo[Succ];
+ if (!PreferredSucc || PreferredSucc->PostOrder < Info->PostOrder)
+ PreferredSucc = Info;
+ }
+ assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
+ "Failed to find safe successor for dead branch");
+
+ // Collect removed successors to update the (Post)DominatorTrees.
+ SmallPtrSet<BasicBlock *, 4> RemovedSuccessors;
+ bool First = true;
+ for (auto *Succ : successors(BB)) {
+ if (!First || Succ != PreferredSucc->BB) {
+ Succ->removePredecessor(BB);
+ RemovedSuccessors.insert(Succ);
+ } else
+ First = false;
+ }
+ makeUnconditional(BB, PreferredSucc->BB);
+
+ // Inform the dominators about the deleted CFG edges.
+ SmallVector<DominatorTree::UpdateType, 4> DeletedEdges;
+ for (auto *Succ : RemovedSuccessors) {
+ // It might have happened that the same successor appeared multiple times
+ // and the CFG edge wasn't really removed.
+ if (Succ != PreferredSucc->BB) {
+        LLVM_DEBUG(dbgs() << "ADCE: (Post)DomTree edge enqueued for deletion "
+ << BB->getName() << " -> " << Succ->getName()
+ << "\n");
+ DeletedEdges.push_back({DominatorTree::Delete, BB, Succ});
+ }
+ }
+
+ DomTreeUpdater(DT, &PDT, DomTreeUpdater::UpdateStrategy::Eager)
+ .applyUpdates(DeletedEdges);
+
+ NumBranchesRemoved += 1;
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// reverse top-sort order
+void AggressiveDeadCodeElimination::computeReversePostOrder() {
+  // This provides a post-order numbering of the reverse control flow graph.
+  // Note that it is incomplete in the presence of infinite loops, but we don't
+  // need to number blocks which don't reach the end of the function since
+  // all branches in those blocks are forced live.
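+  // In this numbering, blocks nearer an exit generally receive higher numbers,
+  // which is what updateDeadRegions relies on when it picks the successor
+  // "closest to the end of the function".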
+
+ // For each block without successors, extend the DFS from the block
+ // backward through the graph
+ SmallPtrSet<BasicBlock*, 16> Visited;
+ unsigned PostOrder = 0;
+ for (auto &BB : F) {
if (!succ_empty(&BB))
- continue;
- for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
- BlockInfo[Block].PostOrder = PostOrder++;
- }
-}
-
-void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
- BasicBlock *Target) {
- Instruction *PredTerm = BB->getTerminator();
- // Collect the live debug info scopes attached to this instruction.
- if (const DILocation *DL = PredTerm->getDebugLoc())
- collectLiveScopes(*DL);
-
- // Just mark live an existing unconditional branch
- if (isUnconditionalBranch(PredTerm)) {
- PredTerm->setSuccessor(0, Target);
- InstInfo[PredTerm].Live = true;
- return;
- }
- LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
- NumBranchesRemoved += 1;
- IRBuilder<> Builder(PredTerm);
- auto *NewTerm = Builder.CreateBr(Target);
- InstInfo[NewTerm].Live = true;
- if (const DILocation *DL = PredTerm->getDebugLoc())
- NewTerm->setDebugLoc(DL);
-
- InstInfo.erase(PredTerm);
- PredTerm->eraseFromParent();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Pass Manager integration code
-//
-//===----------------------------------------------------------------------===//
-PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
-  // ADCE does not need DominatorTree, but requires DominatorTree here
-  // to update the analysis if it is already available.
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
- if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- // TODO: We could track if we have actually done CFG changes.
- if (!RemoveControlFlowFlag)
- PA.preserveSet<CFGAnalyses>();
- else {
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- }
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-
-struct ADCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- ADCELegacyPass() : FunctionPass(ID) {
- initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
-    // ADCE does not need DominatorTree, but requires DominatorTree here
-    // to update the analysis if it is already available.
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return AggressiveDeadCodeElimination(F, DT, PDT)
- .performDeadCodeElimination();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<PostDominatorTreeWrapperPass>();
- if (!RemoveControlFlowFlag)
- AU.setPreservesCFG();
- else {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ADCELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
- "Aggressive Dead Code Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
- false, false)
-
-FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
+ continue;
+ for (BasicBlock *Block : inverse_post_order_ext(&BB,Visited))
+ BlockInfo[Block].PostOrder = PostOrder++;
+ }
+}
+
+void AggressiveDeadCodeElimination::makeUnconditional(BasicBlock *BB,
+ BasicBlock *Target) {
+ Instruction *PredTerm = BB->getTerminator();
+ // Collect the live debug info scopes attached to this instruction.
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ collectLiveScopes(*DL);
+
+ // Just mark live an existing unconditional branch
+ if (isUnconditionalBranch(PredTerm)) {
+ PredTerm->setSuccessor(0, Target);
+ InstInfo[PredTerm].Live = true;
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "making unconditional " << BB->getName() << '\n');
+ NumBranchesRemoved += 1;
+ IRBuilder<> Builder(PredTerm);
+ auto *NewTerm = Builder.CreateBr(Target);
+ InstInfo[NewTerm].Live = true;
+ if (const DILocation *DL = PredTerm->getDebugLoc())
+ NewTerm->setDebugLoc(DL);
+
+ InstInfo.erase(PredTerm);
+ PredTerm->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Pass Manager integration code
+//
+//===----------------------------------------------------------------------===//
+PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
+  // ADCE does not need DominatorTree, but requires DominatorTree here
+  // to update the analysis if it is already available.
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto &PDT = FAM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!AggressiveDeadCodeElimination(F, DT, PDT).performDeadCodeElimination())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ // TODO: We could track if we have actually done CFG changes.
+ if (!RemoveControlFlowFlag)
+ PA.preserveSet<CFGAnalyses>();
+ else {
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ }
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+
+struct ADCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ ADCELegacyPass() : FunctionPass(ID) {
+ initializeADCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+    // ADCE does not need DominatorTree, but requires DominatorTree here
+    // to update the analysis if it is already available.
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return AggressiveDeadCodeElimination(F, DT, PDT)
+ .performDeadCodeElimination();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ if (!RemoveControlFlowFlag)
+ AU.setPreservesCFG();
+ else {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ADCELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ADCELegacyPass, "adce",
+ "Aggressive Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ADCELegacyPass, "adce", "Aggressive Dead Code Elimination",
+ false, false)
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCELegacyPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index b05b073c02..bccf94fc21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -1,217 +1,217 @@
-//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
-// Set Load/Store Alignments From Assumptions
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a ScalarEvolution-based transformation to set
-// the alignments of loads, stores, and memory intrinsics based on the truth
-// expressions of assume intrinsics. The primary motivation is to handle
-// complex alignment assumptions that apply to vector loads and stores that
-// appear after vectorization and unrolling.
-//
-//===----------------------------------------------------------------------===//
-
+//===----------------------- AlignmentFromAssumptions.cpp -----------------===//
+// Set Load/Store Alignments From Assumptions
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a ScalarEvolution-based transformation to set
+// the alignments of loads, stores, and memory intrinsics based on the truth
+// expressions of assume intrinsics. The primary motivation is to handle
+// complex alignment assumptions that apply to vector loads and stores that
+// appear after vectorization and unrolling.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/IR/Instructions.h"
-#include "llvm/InitializePasses.h"
-#define AA_NAME "alignment-from-assumptions"
-#define DEBUG_TYPE AA_NAME
-#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-STATISTIC(NumLoadAlignChanged,
- "Number of loads changed by alignment assumptions");
-STATISTIC(NumStoreAlignChanged,
- "Number of stores changed by alignment assumptions");
-STATISTIC(NumMemIntAlignChanged,
- "Number of memory intrinsics changed by alignment assumptions");
-
-namespace {
-struct AlignmentFromAssumptions : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- AlignmentFromAssumptions() : FunctionPass(ID) {
- initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
-
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- }
-
- AlignmentFromAssumptionsPass Impl;
-};
-}
-
-char AlignmentFromAssumptions::ID = 0;
-static const char aip_name[] = "Alignment from assumptions";
-INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
- aip_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
- aip_name, false, false)
-
-FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
- return new AlignmentFromAssumptions();
-}
-
-// Given an expression for the (constant) alignment, AlignSCEV, and an
-// expression for the displacement between a pointer and the aligned address,
-// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
-// to a constant. Using SCEV to compute alignment handles the case where
-// DiffSCEV is a recurrence with constant start such that the aligned offset
-// is constant. e.g. {16,+,32} % 32 -> 16.
-static MaybeAlign getNewAlignmentDiff(const SCEV *DiffSCEV,
- const SCEV *AlignSCEV,
- ScalarEvolution *SE) {
- // DiffUnits = Diff % int64_t(Alignment)
- const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
-
- LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
- << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
-
- if (const SCEVConstant *ConstDUSCEV =
- dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
- int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
-
- // If the displacement is an exact multiple of the alignment, then the
- // displaced pointer has the same alignment as the aligned pointer, so
- // return the alignment value.
- if (!DiffUnits)
- return cast<SCEVConstant>(AlignSCEV)->getValue()->getAlignValue();
-
- // If the displacement is not an exact multiple, but the remainder is a
- // constant, then return this remainder (but only if it is a power of 2).
- uint64_t DiffUnitsAbs = std::abs(DiffUnits);
- if (isPowerOf2_64(DiffUnitsAbs))
- return Align(DiffUnitsAbs);
- }
-
- return None;
-}
-
-// There is an address given by an offset OffSCEV from AASCEV which has an
-// alignment AlignSCEV. Use that information, if possible, to compute a new
-// alignment for Ptr.
-static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
- const SCEV *OffSCEV, Value *Ptr,
- ScalarEvolution *SE) {
- const SCEV *PtrSCEV = SE->getSCEV(Ptr);
- // On a platform with 32-bit allocas, but 64-bit flat/global pointer sizes
- // (*cough* AMDGPU), the effective SCEV type of AASCEV and PtrSCEV
- // may disagree. Trunc/extend so they agree.
- PtrSCEV = SE->getTruncateOrZeroExtend(
- PtrSCEV, SE->getEffectiveSCEVType(AASCEV->getType()));
- const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
-
- // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
- // sign-extended OffSCEV to i64, so make sure they agree again.
- DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
-
- // What we really want to know is the overall offset to the aligned
- // address. This address is displaced by the provided offset.
- DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
-
- LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
- << *AlignSCEV << " and offset " << *OffSCEV
- << " using diff " << *DiffSCEV << "\n");
-
- if (MaybeAlign NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE)) {
- LLVM_DEBUG(dbgs() << "\tnew alignment: " << DebugStr(NewAlignment) << "\n");
- return *NewAlignment;
- }
-
- if (const SCEVAddRecExpr *DiffARSCEV = dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
- // The relative offset to the alignment assumption did not yield a constant,
- // but we should try harder: if we assume that a is 32-byte aligned, then in
- // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
- // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
- // As a result, the new alignment will not be a constant, but can still
- // be improved over the default (of 4) to 16.
-
- const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
- const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
-
- LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
- << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
-
- // Now compute the new alignment using the displacement to the value in the
- // first iteration, and also the alignment using the per-iteration delta.
- // If these are the same, then use that answer. Otherwise, use the smaller
- // one, but only if it divides the larger one.
- MaybeAlign NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
- MaybeAlign NewIncAlignment =
- getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
-
- LLVM_DEBUG(dbgs() << "\tnew start alignment: " << DebugStr(NewAlignment)
- << "\n");
- LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << DebugStr(NewIncAlignment)
- << "\n");
-
- if (!NewAlignment || !NewIncAlignment)
- return Align(1);
-
- const Align NewAlign = *NewAlignment;
- const Align NewIncAlign = *NewIncAlignment;
- if (NewAlign > NewIncAlign) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: "
- << DebugStr(NewIncAlign) << "\n");
- return NewIncAlign;
- }
- if (NewIncAlign > NewAlign) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
- << "\n");
- return NewAlign;
- }
- assert(NewIncAlign == NewAlign);
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
- << "\n");
- return NewAlign;
- }
-
- return Align(1);
-}
-
-bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+#include "llvm/InitializePasses.h"
+#define AA_NAME "alignment-from-assumptions"
+#define DEBUG_TYPE AA_NAME
+#include "llvm/Transforms/Scalar/AlignmentFromAssumptions.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+STATISTIC(NumLoadAlignChanged,
+ "Number of loads changed by alignment assumptions");
+STATISTIC(NumStoreAlignChanged,
+ "Number of stores changed by alignment assumptions");
+STATISTIC(NumMemIntAlignChanged,
+ "Number of memory intrinsics changed by alignment assumptions");
+
+namespace {
+struct AlignmentFromAssumptions : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ AlignmentFromAssumptions() : FunctionPass(ID) {
+ initializeAlignmentFromAssumptionsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ AlignmentFromAssumptionsPass Impl;
+};
+}
+
+char AlignmentFromAssumptions::ID = 0;
+static const char aip_name[] = "Alignment from assumptions";
+INITIALIZE_PASS_BEGIN(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(AlignmentFromAssumptions, AA_NAME,
+ aip_name, false, false)
+
+FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
+ return new AlignmentFromAssumptions();
+}
+
+// Given an expression for the (constant) alignment, AlignSCEV, and an
+// expression for the displacement between a pointer and the aligned address,
+// DiffSCEV, compute the alignment of the displaced pointer if it can be reduced
+// to a constant. Using SCEV to compute alignment handles the case where
+// DiffSCEV is a recurrence with constant start such that the aligned offset
+// is constant. e.g. {16,+,32} % 32 -> 16.
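+// For illustration: {16,+,32} takes the values 16, 48, 80, ..., each of which
+// is 16 modulo 32, so a pointer displaced by that amount from a 32-byte
+// aligned address is known to be 16-byte aligned.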
+static MaybeAlign getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
+ // DiffUnits = Diff % int64_t(Alignment)
+ const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
+
+ LLVM_DEBUG(dbgs() << "\talignment relative to " << *AlignSCEV << " is "
+ << *DiffUnitsSCEV << " (diff: " << *DiffSCEV << ")\n");
+
+ if (const SCEVConstant *ConstDUSCEV =
+ dyn_cast<SCEVConstant>(DiffUnitsSCEV)) {
+ int64_t DiffUnits = ConstDUSCEV->getValue()->getSExtValue();
+
+ // If the displacement is an exact multiple of the alignment, then the
+ // displaced pointer has the same alignment as the aligned pointer, so
+ // return the alignment value.
+ if (!DiffUnits)
+ return cast<SCEVConstant>(AlignSCEV)->getValue()->getAlignValue();
+
+ // If the displacement is not an exact multiple, but the remainder is a
+ // constant, then return this remainder (but only if it is a power of 2).
+ uint64_t DiffUnitsAbs = std::abs(DiffUnits);
+ if (isPowerOf2_64(DiffUnitsAbs))
+ return Align(DiffUnitsAbs);
+ }
+
+ return None;
+}
+
+// There is an address given by an offset OffSCEV from AASCEV which has an
+// alignment AlignSCEV. Use that information, if possible, to compute a new
+// alignment for Ptr.
+static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
+ const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ // On a platform with 32-bit allocas, but 64-bit flat/global pointer sizes
+ // (*cough* AMDGPU), the effective SCEV type of AASCEV and PtrSCEV
+ // may disagree. Trunc/extend so they agree.
+ PtrSCEV = SE->getTruncateOrZeroExtend(
+ PtrSCEV, SE->getEffectiveSCEVType(AASCEV->getType()));
+ const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
+
+ // On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
+ // sign-extended OffSCEV to i64, so make sure they agree again.
+ DiffSCEV = SE->getNoopOrSignExtend(DiffSCEV, OffSCEV->getType());
+
+ // What we really want to know is the overall offset to the aligned
+ // address. This address is displaced by the provided offset.
+ DiffSCEV = SE->getMinusSCEV(DiffSCEV, OffSCEV);
+
+ LLVM_DEBUG(dbgs() << "AFI: alignment of " << *Ptr << " relative to "
+ << *AlignSCEV << " and offset " << *OffSCEV
+ << " using diff " << *DiffSCEV << "\n");
+
+ if (MaybeAlign NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE)) {
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << DebugStr(NewAlignment) << "\n");
+ return *NewAlignment;
+ }
+
+ if (const SCEVAddRecExpr *DiffARSCEV = dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ // The relative offset to the alignment assumption did not yield a constant,
+ // but we should try harder: if we assume that a is 32-byte aligned, then in
+ // for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
+ // 32-byte aligned, but instead alternate between 32 and 16-byte alignment.
+ // As a result, the new alignment will not be a constant, but can still
+ // be improved over the default (of 4) to 16.
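+    // For illustration, assuming 4-byte elements: the vectorized loads land at
+    // byte offsets 0, 16, 32, 48, ... from the 32-byte aligned base, so the
+    // start offset yields alignment 32 while the 16-byte step yields 16, and
+    // the smaller value, 16, is the one that holds on every iteration.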
+
+ const SCEV *DiffStartSCEV = DiffARSCEV->getStart();
+ const SCEV *DiffIncSCEV = DiffARSCEV->getStepRecurrence(*SE);
+
+ LLVM_DEBUG(dbgs() << "\ttrying start/inc alignment using start "
+ << *DiffStartSCEV << " and inc " << *DiffIncSCEV << "\n");
+
+ // Now compute the new alignment using the displacement to the value in the
+ // first iteration, and also the alignment using the per-iteration delta.
+ // If these are the same, then use that answer. Otherwise, use the smaller
+ // one, but only if it divides the larger one.
+ MaybeAlign NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ MaybeAlign NewIncAlignment =
+ getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << DebugStr(NewAlignment)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << DebugStr(NewIncAlignment)
+ << "\n");
+
+ if (!NewAlignment || !NewIncAlignment)
+ return Align(1);
+
+ const Align NewAlign = *NewAlignment;
+ const Align NewIncAlign = *NewIncAlignment;
+ if (NewAlign > NewIncAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: "
+ << DebugStr(NewIncAlign) << "\n");
+ return NewIncAlign;
+ }
+ if (NewIncAlign > NewAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
+ << "\n");
+ return NewAlign;
+ }
+ assert(NewIncAlign == NewAlign);
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
+ << "\n");
+ return NewAlign;
+ }
+
+ return Align(1);
+}
+
+bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
unsigned Idx,
- Value *&AAPtr,
- const SCEV *&AlignSCEV,
- const SCEV *&OffSCEV) {
+ Value *&AAPtr,
+ const SCEV *&AlignSCEV,
+ const SCEV *&OffSCEV) {
Type *Int64Ty = Type::getInt64Ty(I->getContext());
OperandBundleUse AlignOB = I->getOperandBundleAt(Idx);
if (AlignOB.getTagName() != "align")
- return false;
+ return false;
assert(AlignOB.Inputs.size() >= 2);
AAPtr = AlignOB.Inputs[0].get();
// TODO: Consider accumulating the offset to the base.
@@ -221,139 +221,139 @@ bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
if (AlignOB.Inputs.size() == 3)
OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get());
else
- OffSCEV = SE->getZero(Int64Ty);
+ OffSCEV = SE->getZero(Int64Ty);
OffSCEV = SE->getTruncateOrZeroExtend(OffSCEV, Int64Ty);
- return true;
-}
-
+ return true;
+}
+
bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall,
unsigned Idx) {
- Value *AAPtr;
- const SCEV *AlignSCEV, *OffSCEV;
+ Value *AAPtr;
+ const SCEV *AlignSCEV, *OffSCEV;
if (!extractAlignmentInfo(ACall, Idx, AAPtr, AlignSCEV, OffSCEV))
- return false;
-
- // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
- // affect other users.
- if (isa<ConstantData>(AAPtr))
- return false;
-
- const SCEV *AASCEV = SE->getSCEV(AAPtr);
-
- // Apply the assumption to all other users of the specified pointer.
- SmallPtrSet<Instruction *, 32> Visited;
- SmallVector<Instruction*, 16> WorkList;
- for (User *J : AAPtr->users()) {
- if (J == ACall)
- continue;
-
- if (Instruction *K = dyn_cast<Instruction>(J))
- WorkList.push_back(K);
- }
-
- while (!WorkList.empty()) {
- Instruction *J = WorkList.pop_back_val();
- if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
+ return false;
+
+ // Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
+ // affect other users.
+ if (isa<ConstantData>(AAPtr))
+ return false;
+
+ const SCEV *AASCEV = SE->getSCEV(AAPtr);
+
+ // Apply the assumption to all other users of the specified pointer.
+ SmallPtrSet<Instruction *, 32> Visited;
+ SmallVector<Instruction*, 16> WorkList;
+ for (User *J : AAPtr->users()) {
+ if (J == ACall)
+ continue;
+
+ if (Instruction *K = dyn_cast<Instruction>(J))
+ WorkList.push_back(K);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+ if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- LI->getPointerOperand(), SE);
- if (NewAlignment > LI->getAlign()) {
- LI->setAlignment(NewAlignment);
- ++NumLoadAlignChanged;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+ if (NewAlignment > LI->getAlign()) {
+ LI->setAlignment(NewAlignment);
+ ++NumLoadAlignChanged;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- SI->getPointerOperand(), SE);
- if (NewAlignment > SI->getAlign()) {
- SI->setAlignment(NewAlignment);
- ++NumStoreAlignChanged;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+ if (NewAlignment > SI->getAlign()) {
+ SI->setAlignment(NewAlignment);
+ ++NumStoreAlignChanged;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
if (!isValidAssumeForContext(ACall, J, DT))
continue;
- Align NewDestAlignment =
- getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE);
-
- LLVM_DEBUG(dbgs() << "\tmem inst: " << DebugStr(NewDestAlignment)
- << "\n";);
- if (NewDestAlignment > *MI->getDestAlign()) {
- MI->setDestAlignment(NewDestAlignment);
- ++NumMemIntAlignChanged;
- }
-
- // For memory transfers, there is also a source alignment that
- // can be set.
- if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- Align NewSrcAlignment =
- getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MTI->getSource(), SE);
-
- LLVM_DEBUG(dbgs() << "\tmem trans: " << DebugStr(NewSrcAlignment)
- << "\n";);
-
- if (NewSrcAlignment > *MTI->getSourceAlign()) {
- MTI->setSourceAlignment(NewSrcAlignment);
- ++NumMemIntAlignChanged;
- }
- }
- }
-
- // Now that we've updated that use of the pointer, look for other uses of
- // the pointer to update.
- Visited.insert(J);
- for (User *UJ : J->users()) {
- Instruction *K = cast<Instruction>(UJ);
+ Align NewDestAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE);
+
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << DebugStr(NewDestAlignment)
+ << "\n";);
+ if (NewDestAlignment > *MI->getDestAlign()) {
+ MI->setDestAlignment(NewDestAlignment);
+ ++NumMemIntAlignChanged;
+ }
+
+ // For memory transfers, there is also a source alignment that
+ // can be set.
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Align NewSrcAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MTI->getSource(), SE);
+
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << DebugStr(NewSrcAlignment)
+ << "\n";);
+
+ if (NewSrcAlignment > *MTI->getSourceAlign()) {
+ MTI->setSourceAlignment(NewSrcAlignment);
+ ++NumMemIntAlignChanged;
+ }
+ }
+ }
+
+ // Now that we've updated that use of the pointer, look for other uses of
+ // the pointer to update.
+ Visited.insert(J);
+ for (User *UJ : J->users()) {
+ Instruction *K = cast<Instruction>(UJ);
if (!Visited.count(K))
- WorkList.push_back(K);
- }
- }
-
- return true;
-}
-
-bool AlignmentFromAssumptions::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- return Impl.runImpl(F, AC, SE, DT);
-}
-
-bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
- ScalarEvolution *SE_,
- DominatorTree *DT_) {
- SE = SE_;
- DT = DT_;
-
- bool Changed = false;
- for (auto &AssumeVH : AC.assumptions())
+ WorkList.push_back(K);
+ }
+ }
+
+ return true;
+}
+
+bool AlignmentFromAssumptions::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return Impl.runImpl(F, AC, SE, DT);
+}
+
+bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
+ ScalarEvolution *SE_,
+ DominatorTree *DT_) {
+ SE = SE_;
+ DT = DT_;
+
+ bool Changed = false;
+ for (auto &AssumeVH : AC.assumptions())
if (AssumeVH) {
CallInst *Call = cast<CallInst>(AssumeVH);
for (unsigned Idx = 0; Idx < Call->getNumOperandBundles(); Idx++)
Changed |= processAssumption(Call, Idx);
}
-
- return Changed;
-}
-
-PreservedAnalyses
-AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
-
- AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, AC, &SE, &DT))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+
+ return Changed;
+}
+
+PreservedAnalyses
+AlignmentFromAssumptionsPass::run(Function &F, FunctionAnalysisManager &AM) {
+
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, AC, &SE, &DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
index 9dde869bb6..767c7656dc 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/BDCE.cpp
@@ -1,206 +1,206 @@
-//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Bit-Tracking Dead Code Elimination pass. Some
-// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
-// We track these dead bits and remove instructions that compute only these
-// dead bits. We also simplify sext that generates unused extension bits,
-// converting it to a zext.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/BDCE.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "bdce"
-
-STATISTIC(NumRemoved, "Number of instructions removed (unused)");
-STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
-STATISTIC(NumSExt2ZExt,
- "Number of sign extension instructions converted to zero extension");
-
-/// If an instruction is trivialized (dead), then the chain of users of that
-/// instruction may need to be cleared of assumptions that can no longer be
-/// guaranteed correct.
-static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
- assert(I->getType()->isIntOrIntVectorTy() &&
- "Trivializing a non-integer value?");
-
- // Initialize the worklist with eligible direct users.
- SmallPtrSet<Instruction *, 16> Visited;
- SmallVector<Instruction *, 16> WorkList;
- for (User *JU : I->users()) {
- // If all bits of a user are demanded, then we know that nothing below that
- // in the def-use chain needs to be changed.
- auto *J = dyn_cast<Instruction>(JU);
- if (J && J->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(J).isAllOnesValue()) {
- Visited.insert(J);
- WorkList.push_back(J);
- }
-
- // Note that we need to check for non-int types above before asking for
- // demanded bits. Normally, the only way to reach an instruction with an
- // non-int type is via an instruction that has side effects (or otherwise
- // will demand its input bits). However, if we have a readnone function
- // that returns an unsized type (e.g., void), we must avoid asking for the
- // demanded bits of the function call's return value. A void-returning
- // readnone function is always dead (and so we can stop walking the use/def
- // chain here), but the check is necessary to avoid asserting.
- }
-
- // DFS through subsequent users while tracking visits to avoid cycles.
- while (!WorkList.empty()) {
- Instruction *J = WorkList.pop_back_val();
-
- // NSW, NUW, and exact are based on operands that might have changed.
- J->dropPoisonGeneratingFlags();
-
- // We do not have to worry about llvm.assume or range metadata:
- // 1. llvm.assume demands its operand, so trivializing can't change it.
- // 2. range metadata only applies to memory accesses which demand all bits.
-
- for (User *KU : J->users()) {
- // If all bits of a user are demanded, then we know that nothing below
- // that in the def-use chain needs to be changed.
- auto *K = dyn_cast<Instruction>(KU);
- if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() &&
- !DB.getDemandedBits(K).isAllOnesValue())
- WorkList.push_back(K);
- }
- }
-}
-
-static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
- SmallVector<Instruction*, 128> Worklist;
- bool Changed = false;
- for (Instruction &I : instructions(F)) {
- // If the instruction has side effects and no non-dbg uses,
- // skip it. This way we avoid computing known bits on an instruction
- // that will not help us.
- if (I.mayHaveSideEffects() && I.use_empty())
- continue;
-
- // Remove instructions that are dead, either because they were not reached
- // during analysis or have no demanded bits.
- if (DB.isInstructionDead(&I) ||
- (I.getType()->isIntOrIntVectorTy() &&
- DB.getDemandedBits(&I).isNullValue() &&
- wouldInstructionBeTriviallyDead(&I))) {
- salvageDebugInfo(I);
- Worklist.push_back(&I);
- I.dropAllReferences();
- Changed = true;
- continue;
- }
-
- // Convert SExt into ZExt if none of the extension bits is required
- if (SExtInst *SE = dyn_cast<SExtInst>(&I)) {
- APInt Demanded = DB.getDemandedBits(SE);
- const uint32_t SrcBitSize = SE->getSrcTy()->getScalarSizeInBits();
- auto *const DstTy = SE->getDestTy();
- const uint32_t DestBitSize = DstTy->getScalarSizeInBits();
- if (Demanded.countLeadingZeros() >= (DestBitSize - SrcBitSize)) {
- clearAssumptionsOfUsers(SE, DB);
- IRBuilder<> Builder(SE);
- I.replaceAllUsesWith(
- Builder.CreateZExt(SE->getOperand(0), DstTy, SE->getName()));
- Worklist.push_back(SE);
- Changed = true;
- NumSExt2ZExt++;
- continue;
- }
- }
-
- for (Use &U : I.operands()) {
- // DemandedBits only detects dead integer uses.
- if (!U->getType()->isIntOrIntVectorTy())
- continue;
-
- if (!isa<Instruction>(U) && !isa<Argument>(U))
- continue;
-
- if (!DB.isUseDead(&U))
- continue;
-
- LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << U << " (all bits dead)\n");
-
- clearAssumptionsOfUsers(&I, DB);
-
- // FIXME: In theory we could substitute undef here instead of zero.
- // This should be reconsidered once we settle on the semantics of
- // undef, poison, etc.
- U.set(ConstantInt::get(U->getType(), 0));
- ++NumSimplified;
- Changed = true;
- }
- }
-
- for (Instruction *&I : Worklist) {
- ++NumRemoved;
- I->eraseFromParent();
- }
-
- return Changed;
-}
-
-PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
- if (!bitTrackingDCE(F, DB))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-struct BDCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BDCELegacyPass() : FunctionPass(ID) {
- initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- return bitTrackingDCE(F, DB);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-}
-
-char BDCELegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
- "Bit-Tracking Dead Code Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
- "Bit-Tracking Dead Code Elimination", false, false)
-
-FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
+//===---- BDCE.cpp - Bit-tracking dead code elimination -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Bit-Tracking Dead Code Elimination pass. Some
+// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
+// We track these dead bits and remove instructions that compute only these
+// dead bits. We also simplify sext that generates unused extension bits,
+// converting it to a zext.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/BDCE.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "bdce"
+
+STATISTIC(NumRemoved, "Number of instructions removed (unused)");
+STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+STATISTIC(NumSExt2ZExt,
+ "Number of sign extension instructions converted to zero extension");
+
+/// If an instruction is trivialized (dead), then the chain of users of that
+/// instruction may need to be cleared of assumptions that can no longer be
+/// guaranteed correct.
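+/// For example, nsw/nuw/exact flags on a user that were justified by the
+/// original operand value may no longer hold once a dead operand is replaced
+/// with zero, which is why poison-generating flags are dropped on the users
+/// collected here.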
+static void clearAssumptionsOfUsers(Instruction *I, DemandedBits &DB) {
+ assert(I->getType()->isIntOrIntVectorTy() &&
+ "Trivializing a non-integer value?");
+
+ // Initialize the worklist with eligible direct users.
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 16> WorkList;
+ for (User *JU : I->users()) {
+ // If all bits of a user are demanded, then we know that nothing below that
+ // in the def-use chain needs to be changed.
+ auto *J = dyn_cast<Instruction>(JU);
+ if (J && J->getType()->isIntOrIntVectorTy() &&
+ !DB.getDemandedBits(J).isAllOnesValue()) {
+ Visited.insert(J);
+ WorkList.push_back(J);
+ }
+
+ // Note that we need to check for non-int types above before asking for
+ // demanded bits. Normally, the only way to reach an instruction with an
+ // non-int type is via an instruction that has side effects (or otherwise
+ // will demand its input bits). However, if we have a readnone function
+ // that returns an unsized type (e.g., void), we must avoid asking for the
+ // demanded bits of the function call's return value. A void-returning
+ // readnone function is always dead (and so we can stop walking the use/def
+ // chain here), but the check is necessary to avoid asserting.
+ }
+
+ // DFS through subsequent users while tracking visits to avoid cycles.
+ while (!WorkList.empty()) {
+ Instruction *J = WorkList.pop_back_val();
+
+ // NSW, NUW, and exact are based on operands that might have changed.
+ J->dropPoisonGeneratingFlags();
+
+ // We do not have to worry about llvm.assume or range metadata:
+ // 1. llvm.assume demands its operand, so trivializing can't change it.
+ // 2. range metadata only applies to memory accesses which demand all bits.
+
+ for (User *KU : J->users()) {
+ // If all bits of a user are demanded, then we know that nothing below
+ // that in the def-use chain needs to be changed.
+ auto *K = dyn_cast<Instruction>(KU);
+ if (K && Visited.insert(K).second && K->getType()->isIntOrIntVectorTy() &&
+ !DB.getDemandedBits(K).isAllOnesValue())
+ WorkList.push_back(K);
+ }
+ }
+}
+
+static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
+ SmallVector<Instruction*, 128> Worklist;
+ bool Changed = false;
+ for (Instruction &I : instructions(F)) {
+ // If the instruction has side effects and no non-dbg uses,
+ // skip it. This way we avoid computing known bits on an instruction
+ // that will not help us.
+ if (I.mayHaveSideEffects() && I.use_empty())
+ continue;
+
+ // Remove instructions that are dead, either because they were not reached
+ // during analysis or have no demanded bits.
+ if (DB.isInstructionDead(&I) ||
+ (I.getType()->isIntOrIntVectorTy() &&
+ DB.getDemandedBits(&I).isNullValue() &&
+ wouldInstructionBeTriviallyDead(&I))) {
+ salvageDebugInfo(I);
+ Worklist.push_back(&I);
+ I.dropAllReferences();
+ Changed = true;
+ continue;
+ }
+
+ // Convert SExt into ZExt if none of the extension bits is required
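+    // For illustration (IR not taken from this file): given
+    //   %e = sext i8 %x to i32
+    //   %r = and i32 %e, 255
+    // only the low 8 bits of %e are demanded, so rewriting the sext as a zext
+    // cannot change %r.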
+ if (SExtInst *SE = dyn_cast<SExtInst>(&I)) {
+ APInt Demanded = DB.getDemandedBits(SE);
+ const uint32_t SrcBitSize = SE->getSrcTy()->getScalarSizeInBits();
+ auto *const DstTy = SE->getDestTy();
+ const uint32_t DestBitSize = DstTy->getScalarSizeInBits();
+ if (Demanded.countLeadingZeros() >= (DestBitSize - SrcBitSize)) {
+ clearAssumptionsOfUsers(SE, DB);
+ IRBuilder<> Builder(SE);
+ I.replaceAllUsesWith(
+ Builder.CreateZExt(SE->getOperand(0), DstTy, SE->getName()));
+ Worklist.push_back(SE);
+ Changed = true;
+ NumSExt2ZExt++;
+ continue;
+ }
+ }
+
+ for (Use &U : I.operands()) {
+ // DemandedBits only detects dead integer uses.
+ if (!U->getType()->isIntOrIntVectorTy())
+ continue;
+
+ if (!isa<Instruction>(U) && !isa<Argument>(U))
+ continue;
+
+ if (!DB.isUseDead(&U))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "BDCE: Trivializing: " << U << " (all bits dead)\n");
+
+ clearAssumptionsOfUsers(&I, DB);
+
+ // FIXME: In theory we could substitute undef here instead of zero.
+ // This should be reconsidered once we settle on the semantics of
+ // undef, poison, etc.
+ U.set(ConstantInt::get(U->getType(), 0));
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+
+ for (Instruction *&I : Worklist) {
+ ++NumRemoved;
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses BDCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ if (!bitTrackingDCE(F, DB))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct BDCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BDCELegacyPass() : FunctionPass(ID) {
+ initializeBDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DB = getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ return bitTrackingDCE(F, DB);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char BDCELegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_END(BDCELegacyPass, "bdce",
+ "Bit-Tracking Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createBitTrackingDCEPass() { return new BDCELegacyPass(); }
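+
+// Usage sketch (assuming the standard LLVM opt driver and the usual 'bdce'
+// registration in the new pass manager's registry): the transformation can be
+// exercised on its own with
+//
+//   opt -passes=bdce input.ll -S -o -
+//
+// or programmatically through createBitTrackingDCEPass() when assembling a
+// legacy pass pipeline.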
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
index 743c43d3f3..2eb94b721d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -1,590 +1,590 @@
-//===- CallSiteSplitting.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a transformation that tries to split a call-site to pass
-// more constrained arguments if one of its arguments is predicated in the
-// control flow, so that we can expose better context to later passes (e.g.,
-// inliner, jump threading, or IPA-CP based function cloning).
-// As of now we support two cases:
-//
-// 1) Try to split a call-site with constrained arguments, if any constraints
-// on any argument can be found by following the single predecessors of the
-// call site's predecessors. Currently this pass only handles call-sites with 2
-// predecessors. For example, in the code below, we try to split the call-site
-// since we can predicate the argument (ptr) based on the OR condition.
-//
-// Split from :
-// if (!ptr || c)
-// callee(ptr);
-// to :
-// if (!ptr)
-// callee(null) // set the known constant value
-// else if (c)
-// callee(nonnull ptr) // set non-null attribute in the argument
-//
-// 2) We can also split a call-site based on constant incoming values of a PHI
-// For example,
-// from :
-// Header:
-// %c = icmp eq i32 %i1, %i2
-// br i1 %c, label %Tail, label %TBB
-// TBB:
-// br label %Tail
-// Tail:
-// %p = phi i32 [ 0, %Header], [ 1, %TBB]
-// call void @bar(i32 %p)
-// to
-// Header:
-// %c = icmp eq i32 %i1, %i2
-// br i1 %c, label %Tail-split0, label %TBB
-// TBB:
-// br label %Tail-split1
-// Tail-split0:
-// call void @bar(i32 0)
-// br label %Tail
-// Tail-split1:
-// call void @bar(i32 1)
-// br label %Tail
-// Tail:
-// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "callsite-splitting"
-
-STATISTIC(NumCallSiteSplit, "Number of call-site split");
-
-/// Only allow instructions before a call, if their CodeSize cost is below
-/// DuplicationThreshold. Those instructions need to be duplicated in all
-/// split blocks.
-static cl::opt<unsigned>
- DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
- cl::desc("Only allow instructions before a call, if "
- "their cost is below DuplicationThreshold"),
- cl::init(5));
-
-static void addNonNullAttribute(CallBase &CB, Value *Op) {
- unsigned ArgNo = 0;
- for (auto &I : CB.args()) {
- if (&*I == Op)
- CB.addParamAttr(ArgNo, Attribute::NonNull);
- ++ArgNo;
- }
-}
-
-static void setConstantInArgument(CallBase &CB, Value *Op,
- Constant *ConstValue) {
- unsigned ArgNo = 0;
- for (auto &I : CB.args()) {
- if (&*I == Op) {
- // It is possible we have already added the non-null attribute to the
- // parameter by using an earlier constraining condition.
- CB.removeParamAttr(ArgNo, Attribute::NonNull);
- CB.setArgOperand(ArgNo, ConstValue);
- }
- ++ArgNo;
- }
-}
-
-static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) {
- assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
- Value *Op0 = Cmp->getOperand(0);
- unsigned ArgNo = 0;
- for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I, ++ArgNo) {
- // Don't consider arguments that are constants or already known non-null.
- if (isa<Constant>(*I) || CB.paramHasAttr(ArgNo, Attribute::NonNull))
- continue;
-
- if (*I == Op0)
- return true;
- }
- return false;
-}
-
-typedef std::pair<ICmpInst *, unsigned> ConditionTy;
-typedef SmallVector<ConditionTy, 2> ConditionsTy;
-
-/// If From has a conditional jump to To, add the condition to Conditions,
-/// if it is relevant to any argument at CB.
-static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
- ConditionsTy &Conditions) {
- auto *BI = dyn_cast<BranchInst>(From->getTerminator());
- if (!BI || !BI->isConditional())
- return;
-
- CmpInst::Predicate Pred;
- Value *Cond = BI->getCondition();
- if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
- return;
-
- ICmpInst *Cmp = cast<ICmpInst>(Cond);
- if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
- if (isCondRelevantToAnyCallArgument(Cmp, CB))
- Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
- ? Pred
- : Cmp->getInversePredicate()});
-}
-
-/// Record ICmp conditions relevant to any argument in CB following Pred's
-/// single predecessors. If there are conflicting conditions along a path, like
-/// x == 1 and x == 0, the first condition will be used. We stop once we reach
-/// an edge to StopAt.
-static void recordConditions(CallBase &CB, BasicBlock *Pred,
- ConditionsTy &Conditions, BasicBlock *StopAt) {
- BasicBlock *From = Pred;
- BasicBlock *To = Pred;
- SmallPtrSet<BasicBlock *, 4> Visited;
- while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
- (From = From->getSinglePredecessor())) {
- recordCondition(CB, From, To, Conditions);
- Visited.insert(From);
- To = From;
- }
-}
-
-static void addConditions(CallBase &CB, const ConditionsTy &Conditions) {
- for (auto &Cond : Conditions) {
- Value *Arg = Cond.first->getOperand(0);
- Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
- if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CB, Arg, ConstVal);
- else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
- assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CB, Arg);
- }
- }
-}
-
-static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
- SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
- assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
- return Preds;
-}
-
-static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
- if (CB.isConvergent() || CB.cannotDuplicate())
- return false;
-
- // FIXME: As of now we handle only CallInst. InvokeInst could be handled
- // without too much effort.
- if (!isa<CallInst>(CB))
- return false;
-
- BasicBlock *CallSiteBB = CB.getParent();
- // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
- SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
- if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
- isa<IndirectBrInst>(Preds[1]->getTerminator()))
- return false;
-
- // BasicBlock::canSplitPredecessors is more aggressive, so checking for
- // BasicBlock::isEHPad as well.
- if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
- return false;
-
- // Allow splitting a call-site only when the CodeSize cost of the
- // instructions before the call is less than DuplicationThreshold. The
- // instructions before the call will be duplicated in the split blocks and
- // corresponding uses will be updated.
+//===- CallSiteSplitting.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a transformation that tries to split a call-site to pass
+// more constrained arguments if one of its arguments is predicated in the
+// control flow, so that we can expose better context to later passes (e.g.,
+// inliner, jump threading, or IPA-CP based function cloning).
+// As of now we support two cases:
+//
+// 1) Try to split a call-site with constrained arguments, if any constraints
+// on any argument can be found by following the single predecessors of the
+// call site's predecessors. Currently this pass only handles call-sites with 2
+// predecessors. For example, in the code below, we try to split the call-site
+// since we can predicate the argument (ptr) based on the OR condition.
+//
+// Split from :
+// if (!ptr || c)
+// callee(ptr);
+// to :
+// if (!ptr)
+// callee(null) // set the known constant value
+// else if (c)
+// callee(nonnull ptr) // set non-null attribute in the argument
+//
+// 2) We can also split a call-site based on constant incoming values of a PHI
+// For example,
+// from :
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail, label %TBB
+// TBB:
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Header], [ 1, %TBB]
+// call void @bar(i32 %p)
+// to
+// Header:
+// %c = icmp eq i32 %i1, %i2
+// br i1 %c, label %Tail-split0, label %TBB
+// TBB:
+// br label %Tail-split1
+// Tail-split0:
+// call void @bar(i32 0)
+// br label %Tail
+// Tail-split1:
+// call void @bar(i32 1)
+// br label %Tail
+// Tail:
+// %p = phi i32 [ 0, %Tail-split0 ], [ 1, %Tail-split1 ]
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CallSiteSplitting.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "callsite-splitting"
+
+STATISTIC(NumCallSiteSplit, "Number of call-site split");
+
+/// Only allow instructions before a call, if their CodeSize cost is below
+/// DuplicationThreshold. Those instructions need to be duplicated in all
+/// split blocks.
+static cl::opt<unsigned>
+ DuplicationThreshold("callsite-splitting-duplication-threshold", cl::Hidden,
+ cl::desc("Only allow instructions before a call, if "
+ "their cost is below DuplicationThreshold"),
+ cl::init(5));
+
+static void addNonNullAttribute(CallBase &CB, Value *Op) {
+ unsigned ArgNo = 0;
+ for (auto &I : CB.args()) {
+ if (&*I == Op)
+ CB.addParamAttr(ArgNo, Attribute::NonNull);
+ ++ArgNo;
+ }
+}
+
+static void setConstantInArgument(CallBase &CB, Value *Op,
+ Constant *ConstValue) {
+ unsigned ArgNo = 0;
+ for (auto &I : CB.args()) {
+ if (&*I == Op) {
+ // It is possible we have already added the non-null attribute to the
+ // parameter by using an earlier constraining condition.
+ CB.removeParamAttr(ArgNo, Attribute::NonNull);
+ CB.setArgOperand(ArgNo, ConstValue);
+ }
+ ++ArgNo;
+ }
+}
+
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) {
+ assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
+ Value *Op0 = Cmp->getOperand(0);
+ unsigned ArgNo = 0;
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I, ++ArgNo) {
+ // Don't consider arguments that are constants or already known non-null.
+ if (isa<Constant>(*I) || CB.paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+
+ if (*I == Op0)
+ return true;
+ }
+ return false;
+}
+
+typedef std::pair<ICmpInst *, unsigned> ConditionTy;
+typedef SmallVector<ConditionTy, 2> ConditionsTy;
+
+/// If From has a conditional jump to To, add the condition to Conditions,
+/// if it is relevant to any argument at CB.
+static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
+ ConditionsTy &Conditions) {
+ auto *BI = dyn_cast<BranchInst>(From->getTerminator());
+ if (!BI || !BI->isConditional())
+ return;
+
+ CmpInst::Predicate Pred;
+ Value *Cond = BI->getCondition();
+ if (!match(Cond, m_ICmp(Pred, m_Value(), m_Constant())))
+ return;
+
+ ICmpInst *Cmp = cast<ICmpInst>(Cond);
+ if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
+ if (isCondRelevantToAnyCallArgument(Cmp, CB))
+ Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
+ ? Pred
+ : Cmp->getInversePredicate()});
+}
+
+/// Record ICmp conditions relevant to any argument in CB following Pred's
+/// single predecessors. If there are conflicting conditions along a path, like
+/// x == 1 and x == 0, the first condition will be used. We stop once we reach
+/// an edge to StopAt.
+static void recordConditions(CallBase &CB, BasicBlock *Pred,
+ ConditionsTy &Conditions, BasicBlock *StopAt) {
+ BasicBlock *From = Pred;
+ BasicBlock *To = Pred;
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
+ (From = From->getSinglePredecessor())) {
+ recordCondition(CB, From, To, Conditions);
+ Visited.insert(From);
+ To = From;
+ }
+}
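+
+// Illustrative walk (hypothetical blocks) of the loop above, assuming %x is
+// also passed to the call CB:
+//
+//   A:  %c1 = icmp eq i32 %x, 0
+//       br i1 %c1, label %B, label %Other
+//   B:  br label %CallBB
+//
+// With Pred == B, the loop follows B's single predecessor A and records
+// (%c1, ICMP_EQ) because B is A's true successor, then keeps walking single
+// predecessors until it reaches StopAt, a block without a single predecessor,
+// or a block it has already visited.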
+
+static void addConditions(CallBase &CB, const ConditionsTy &Conditions) {
+ for (auto &Cond : Conditions) {
+ Value *Arg = Cond.first->getOperand(0);
+ Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
+ if (Cond.second == ICmpInst::ICMP_EQ)
+ setConstantInArgument(CB, Arg, ConstVal);
+ else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
+ assert(Cond.second == ICmpInst::ICMP_NE);
+ addNonNullAttribute(CB, Arg);
+ }
+ }
+}
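+
+// Illustrative result (hypothetical IR) of the loop above for the two
+// predicate kinds it handles, assuming the recorded comparison is
+// 'icmp eq i32* %p, null':
+//
+//   ; condition held as ICMP_EQ on this edge -> the constant is substituted
+//   call void @callee(i32* null)
+//
+//   ; condition held as ICMP_NE on this edge -> the pointer is known non-null
+//   call void @callee(i32* nonnull %p)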
+
+static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 2> Preds(predecessors((BB)));
+ assert(Preds.size() == 2 && "Expected exactly 2 predecessors!");
+ return Preds;
+}
+
+static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
+ if (CB.isConvergent() || CB.cannotDuplicate())
+ return false;
+
+ // FIXME: As of now we handle only CallInst. InvokeInst could be handled
+ // without too much effort.
+ if (!isa<CallInst>(CB))
+ return false;
+
+ BasicBlock *CallSiteBB = CB.getParent();
+ // Need 2 predecessors and cannot split an edge from an IndirectBrInst.
+ SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
+ if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
+ isa<IndirectBrInst>(Preds[1]->getTerminator()))
+ return false;
+
+ // BasicBlock::canSplitPredecessors is more aggressive, so checking for
+ // BasicBlock::isEHPad as well.
+ if (!CallSiteBB->canSplitPredecessors() || CallSiteBB->isEHPad())
+ return false;
+
+ // Allow splitting a call-site only when the CodeSize cost of the
+ // instructions before the call is less than DuplicationThreshold. The
+ // instructions before the call will be duplicated in the split blocks and
+ // corresponding uses will be updated.
InstructionCost Cost = 0;
- for (auto &InstBeforeCall :
- llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
- Cost += TTI.getInstructionCost(&InstBeforeCall,
- TargetTransformInfo::TCK_CodeSize);
- if (Cost >= DuplicationThreshold)
- return false;
- }
-
- return true;
-}
-
-static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
- Value *V) {
- Instruction *Copy = I->clone();
- Copy->setName(I->getName());
- Copy->insertBefore(Before);
- if (V)
- Copy->setOperand(0, V);
- return Copy;
-}
-
-/// Copy mandatory `musttail` return sequence that follows original `CI`, and
-/// link it up to `NewCI` value instead:
-///
-/// * (optional) `bitcast NewCI to ...`
-/// * `ret bitcast or NewCI`
-///
-/// Insert this sequence right before `SplitBB`'s terminator, which will be
-/// cleaned up later in `splitCallSite` below.
-static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
- Instruction *NewCI) {
- bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
- auto II = std::next(CI->getIterator());
-
- BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
- if (BCI)
- ++II;
-
- ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
- assert(RI && "`musttail` call must be followed by `ret` instruction");
-
- Instruction *TI = SplitBB->getTerminator();
- Value *V = NewCI;
- if (BCI)
- V = cloneInstForMustTail(BCI, TI, V);
- cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V);
-
- // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug
- // that prevents doing this now.
-}
-
-/// For each (predecessor, conditions from predecessors) pair, it will split the
-/// basic block containing the call site, hook it up to the predecessor and
-/// replace the call instruction with new call instructions, which contain
-/// constraints based on the conditions from their predecessors.
-/// For example, in the IR below with an OR condition, the call-site can
-/// be split. In this case, Preds for Tail is [(Header, a == null),
-/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
-/// CallInst1, which has constraints based on the conditions from Head and
-/// CallInst2, which has constraints based on the conditions coming from TBB.
-///
-/// From :
-///
-/// Header:
-/// %c = icmp eq i32* %a, null
-/// br i1 %c %Tail, %TBB
-/// TBB:
-/// %c2 = icmp eq i32* %b, null
-/// br i1 %c %Tail, %End
-/// Tail:
-/// %ca = call i1 @callee (i32* %a, i32* %b)
-///
-/// to :
-///
-/// Header: // PredBB1 is Header
-/// %c = icmp eq i32* %a, null
-/// br i1 %c %Tail-split1, %TBB
-/// TBB: // PredBB2 is TBB
-/// %c2 = icmp eq i32* %b, null
-/// br i1 %c %Tail-split2, %End
-/// Tail-split1:
-/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
-/// br %Tail
-/// Tail-split2:
-/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
-/// br %Tail
-/// Tail:
-/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
-///
-/// Note that in case any arguments at the call-site are constrained by its
-/// predecessors, new call-sites with more constrained arguments will be
-/// created in createCallSitesOnPredicatedArgument().
-static void splitCallSite(
- CallBase &CB,
- const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
- DomTreeUpdater &DTU) {
- BasicBlock *TailBB = CB.getParent();
- bool IsMustTailCall = CB.isMustTailCall();
-
- PHINode *CallPN = nullptr;
-
- // `musttail` calls must be followed by an optional `bitcast` and a `ret`. The
- // split blocks will be terminated right after that, so there are no users for
- // this phi in `TailBB`.
- if (!IsMustTailCall && !CB.use_empty()) {
- CallPN = PHINode::Create(CB.getType(), Preds.size(), "phi.call");
- CallPN->setDebugLoc(CB.getDebugLoc());
- }
-
- LLVM_DEBUG(dbgs() << "split call-site : " << CB << " into \n");
-
- assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
- // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
- // here.
- ValueToValueMapTy ValueToValueMaps[2];
- for (unsigned i = 0; i < Preds.size(); i++) {
- BasicBlock *PredBB = Preds[i].first;
- BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
- TailBB, PredBB, &*std::next(CB.getIterator()), ValueToValueMaps[i],
- DTU);
- assert(SplitBlock && "Unexpected new basic block split.");
-
- auto *NewCI =
- cast<CallBase>(&*std::prev(SplitBlock->getTerminator()->getIterator()));
- addConditions(*NewCI, Preds[i].second);
-
- // Handle PHIs used as arguments in the call-site.
- for (PHINode &PN : TailBB->phis()) {
- unsigned ArgNo = 0;
- for (auto &CI : CB.args()) {
- if (&*CI == &PN) {
- NewCI->setArgOperand(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
- }
- ++ArgNo;
- }
- }
- LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
- << "\n");
- if (CallPN)
- CallPN->addIncoming(NewCI, SplitBlock);
-
- // Clone and place bitcast and return instructions before `TI`
- if (IsMustTailCall)
- copyMustTailReturn(SplitBlock, &CB, NewCI);
- }
-
- NumCallSiteSplit++;
-
- // FIXME: remove TI in `copyMustTailReturn`
- if (IsMustTailCall) {
- // Remove superfluous `br` terminators from the end of the Split blocks
- // NOTE: Removing terminator removes the SplitBlock from the TailBB's
- // predecessors. Therefore we must get complete list of Splits before
- // attempting removal.
- SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB)));
- assert(Splits.size() == 2 && "Expected exactly 2 splits!");
- for (unsigned i = 0; i < Splits.size(); i++) {
- Splits[i]->getTerminator()->eraseFromParent();
- DTU.applyUpdatesPermissive({{DominatorTree::Delete, Splits[i], TailBB}});
- }
-
- // Erase the tail block once done with musttail patching
- DTU.deleteBB(TailBB);
- return;
- }
-
- auto *OriginalBegin = &*TailBB->begin();
- // Replace users of the original call with a PHI merging the split call-sites.
- if (CallPN) {
- CallPN->insertBefore(OriginalBegin);
- CB.replaceAllUsesWith(CallPN);
- }
-
- // Remove instructions moved to split blocks from TailBB, from the duplicated
- // call instruction to the beginning of the basic block. If an instruction
- // has any uses, add a new PHI node to combine the values coming from the
- // split blocks. The new PHI nodes are placed before the first original
- // instruction, so we do not end up deleting them. By using reverse-order, we
- // do not introduce unnecessary PHI nodes for def-use chains from the call
- // instruction to the beginning of the block.
- auto I = CB.getReverseIterator();
- while (I != TailBB->rend()) {
- Instruction *CurrentI = &*I++;
- if (!CurrentI->use_empty()) {
- // If an existing PHI has users after the call, there is no need to create
- // a new one.
- if (isa<PHINode>(CurrentI))
- continue;
- PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
- NewPN->setDebugLoc(CurrentI->getDebugLoc());
- for (auto &Mapping : ValueToValueMaps)
- NewPN->addIncoming(Mapping[CurrentI],
- cast<Instruction>(Mapping[CurrentI])->getParent());
- NewPN->insertBefore(&*TailBB->begin());
- CurrentI->replaceAllUsesWith(NewPN);
- }
- CurrentI->eraseFromParent();
- // We are done once we handled the first original instruction in TailBB.
- if (CurrentI == OriginalBegin)
- break;
- }
-}
-
-// Return true if the call-site has an argument which is a PHI with only
-// constant incoming values.
-static bool isPredicatedOnPHI(CallBase &CB) {
- BasicBlock *Parent = CB.getParent();
- if (&CB != Parent->getFirstNonPHIOrDbg())
- return false;
-
- for (auto &PN : Parent->phis()) {
- for (auto &Arg : CB.args()) {
- if (&*Arg != &PN)
- continue;
- assert(PN.getNumIncomingValues() == 2 &&
- "Unexpected number of incoming values");
- if (PN.getIncomingBlock(0) == PN.getIncomingBlock(1))
- return false;
- if (PN.getIncomingValue(0) == PN.getIncomingValue(1))
- continue;
- if (isa<Constant>(PN.getIncomingValue(0)) &&
- isa<Constant>(PN.getIncomingValue(1)))
- return true;
- }
- }
- return false;
-}
-
-using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
-
-// Check if any of the arguments in CS are predicated on a PHI node and return
-// the set of predecessors we should use for splitting.
-static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallBase &CB) {
- if (!isPredicatedOnPHI(CB))
- return {};
-
- auto Preds = getTwoPredecessors(CB.getParent());
- return {{Preds[0], {}}, {Preds[1], {}}};
-}
-
-// Checks if any of the arguments in CS are predicated in a predecessor and
-// returns a list of predecessors with the conditions that hold on their edges
-// to CS.
-static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
- DomTreeUpdater &DTU) {
- auto Preds = getTwoPredecessors(CB.getParent());
- if (Preds[0] == Preds[1])
- return {};
-
- // We can stop recording conditions once we reach the immediate dominator of
- // the block containing the call site. Conditions in predecessors of that
- // node will be the same for all paths to the call site, so splitting is not
- // beneficial.
- assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
- auto *CSDTNode = DTU.getDomTree().getNode(CB.getParent());
- BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
-
- SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
- for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
- ConditionsTy Conditions;
- // Record condition on edge BB(CS) <- Pred
- recordCondition(CB, Pred, CB.getParent(), Conditions);
- // Record conditions following Pred's single predecessors.
- recordConditions(CB, Pred, Conditions, StopAt);
- PredsCS.push_back({Pred, Conditions});
- }
-
- if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
- return P.second.empty();
- }))
- return {};
-
- return PredsCS;
-}
-
-static bool tryToSplitCallSite(CallBase &CB, TargetTransformInfo &TTI,
- DomTreeUpdater &DTU) {
- // Check if we can split the call site.
- if (!CB.arg_size() || !canSplitCallSite(CB, TTI))
- return false;
-
- auto PredsWithConds = shouldSplitOnPredicatedArgument(CB, DTU);
- if (PredsWithConds.empty())
- PredsWithConds = shouldSplitOnPHIPredicatedArgument(CB);
- if (PredsWithConds.empty())
- return false;
-
- splitCallSite(CB, PredsWithConds, DTU);
- return true;
-}
-
-static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
- TargetTransformInfo &TTI, DominatorTree &DT) {
-
- DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
- bool Changed = false;
- for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
- BasicBlock &BB = *BI++;
- auto II = BB.getFirstNonPHIOrDbg()->getIterator();
- auto IE = BB.getTerminator()->getIterator();
- // Iterate until we reach the terminator instruction. tryToSplitCallSite
- // can replace BB's terminator in case BB is a successor of itself. In that
- // case, IE will be invalidated and we also have to check the current
- // terminator.
- while (II != IE && &*II != BB.getTerminator()) {
- CallBase *CB = dyn_cast<CallBase>(&*II++);
- if (!CB || isa<IntrinsicInst>(CB) || isInstructionTriviallyDead(CB, &TLI))
- continue;
-
- Function *Callee = CB->getCalledFunction();
- if (!Callee || Callee->isDeclaration())
- continue;
-
- // Successful musttail call-site splits result in erased CI and erased BB.
- // Check if such path is possible before attempting the splitting.
- bool IsMustTail = CB->isMustTailCall();
-
- Changed |= tryToSplitCallSite(*CB, TTI, DTU);
-
- // There are no interesting instructions after this. The call site
- // itself might have been erased on splitting.
- if (IsMustTail)
- break;
- }
- }
- return Changed;
-}
-
-namespace {
-struct CallSiteSplittingLegacyPass : public FunctionPass {
- static char ID;
- CallSiteSplittingLegacyPass() : FunctionPass(ID) {
- initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return doCallSiteSplitting(F, TLI, TTI, DT);
- }
-};
-} // namespace
-
-char CallSiteSplittingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
- "Call-site splitting", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
- "Call-site splitting", false, false)
-FunctionPass *llvm::createCallSiteSplittingPass() {
- return new CallSiteSplittingLegacyPass();
-}
-
-PreservedAnalyses CallSiteSplittingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-
- if (!doCallSiteSplitting(F, TLI, TTI, DT))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
+ for (auto &InstBeforeCall :
+ llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
+ Cost += TTI.getInstructionCost(&InstBeforeCall,
+ TargetTransformInfo::TCK_CodeSize);
+ if (Cost >= DuplicationThreshold)
+ return false;
+ }
+
+ return true;
+}
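+
+// Sizing sketch for the check above (hypothetical costs): with the default
+// callsite-splitting-duplication-threshold of 5, a call-site block that starts
+// with four instructions of TCK_CodeSize cost 1 each before the call can still
+// be split, since all of them must be duplicated into both split blocks; a
+// fifth such instruction pushes the accumulated cost to the threshold and the
+// split is rejected.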
+
+static Instruction *cloneInstForMustTail(Instruction *I, Instruction *Before,
+ Value *V) {
+ Instruction *Copy = I->clone();
+ Copy->setName(I->getName());
+ Copy->insertBefore(Before);
+ if (V)
+ Copy->setOperand(0, V);
+ return Copy;
+}
+
+/// Copy mandatory `musttail` return sequence that follows original `CI`, and
+/// link it up to `NewCI` value instead:
+///
+/// * (optional) `bitcast NewCI to ...`
+/// * `ret bitcast or NewCI`
+///
+/// Insert this sequence right before `SplitBB`'s terminator, which will be
+/// cleaned up later in `splitCallSite` below.
+static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
+ Instruction *NewCI) {
+ bool IsVoid = SplitBB->getParent()->getReturnType()->isVoidTy();
+ auto II = std::next(CI->getIterator());
+
+ BitCastInst* BCI = dyn_cast<BitCastInst>(&*II);
+ if (BCI)
+ ++II;
+
+ ReturnInst* RI = dyn_cast<ReturnInst>(&*II);
+ assert(RI && "`musttail` call must be followed by `ret` instruction");
+
+ Instruction *TI = SplitBB->getTerminator();
+ Value *V = NewCI;
+ if (BCI)
+ V = cloneInstForMustTail(BCI, TI, V);
+ cloneInstForMustTail(RI, TI, IsVoid ? nullptr : V);
+
+ // FIXME: remove TI here, `DuplicateInstructionsInSplitBetween` has a bug
+ // that prevents doing this now.
+}
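+
+// Illustrative shape (hypothetical IR) of the sequence handled above; a
+// musttail call may only be followed by an optional bitcast of its result and
+// the return:
+//
+//   %r = musttail call i8* @callee(i8* %p)
+//   ret i8* %r
+//
+// copyMustTailReturn clones that tail (bitcast included when present) in front
+// of SplitBB's terminator and points it at the new call instead.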
+
+/// For each (predecessor, conditions from predecessors) pair, it will split the
+/// basic block containing the call site, hook it up to the predecessor and
+/// replace the call instruction with new call instructions, which contain
+/// constraints based on the conditions from their predecessors.
+/// For example, in the IR below with an OR condition, the call-site can
+/// be split. In this case, Preds for Tail is [(Header, a == null),
+/// (TBB, a != null, b == null)]. Tail is replaced by 2 split blocks, containing
+/// CallInst1, which has constraints based on the conditions from Head and
+/// CallInst2, which has constraints based on the conditions coming from TBB.
+///
+/// From :
+///
+/// Header:
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail, %TBB
+/// TBB:
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail, %End
+/// Tail:
+/// %ca = call i1 @callee (i32* %a, i32* %b)
+///
+/// to :
+///
+/// Header: // PredBB1 is Header
+/// %c = icmp eq i32* %a, null
+/// br i1 %c %Tail-split1, %TBB
+/// TBB: // PredBB2 is TBB
+/// %c2 = icmp eq i32* %b, null
+/// br i1 %c %Tail-split2, %End
+/// Tail-split1:
+/// %ca1 = call @callee (i32* null, i32* %b) // CallInst1
+/// br %Tail
+/// Tail-split2:
+/// %ca2 = call @callee (i32* nonnull %a, i32* null) // CallInst2
+/// br %Tail
+/// Tail:
+/// %p = phi i1 [%ca1, %Tail-split1],[%ca2, %Tail-split2]
+///
+/// Note that in case any arguments at the call-site are constrained by its
+/// predecessors, new call-sites with more constrained arguments will be
+/// created in createCallSitesOnPredicatedArgument().
+static void splitCallSite(
+ CallBase &CB,
+ const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
+ DomTreeUpdater &DTU) {
+ BasicBlock *TailBB = CB.getParent();
+ bool IsMustTailCall = CB.isMustTailCall();
+
+ PHINode *CallPN = nullptr;
+
+ // `musttail` calls must be followed by an optional `bitcast` and a `ret`. The
+ // split blocks will be terminated right after that, so there are no users for
+ // this phi in `TailBB`.
+ if (!IsMustTailCall && !CB.use_empty()) {
+ CallPN = PHINode::Create(CB.getType(), Preds.size(), "phi.call");
+ CallPN->setDebugLoc(CB.getDebugLoc());
+ }
+
+ LLVM_DEBUG(dbgs() << "split call-site : " << CB << " into \n");
+
+ assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
+ // ValueToValueMapTy is neither copy nor moveable, so we use a simple array
+ // here.
+ ValueToValueMapTy ValueToValueMaps[2];
+ for (unsigned i = 0; i < Preds.size(); i++) {
+ BasicBlock *PredBB = Preds[i].first;
+ BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
+ TailBB, PredBB, &*std::next(CB.getIterator()), ValueToValueMaps[i],
+ DTU);
+ assert(SplitBlock && "Unexpected new basic block split.");
+
+ auto *NewCI =
+ cast<CallBase>(&*std::prev(SplitBlock->getTerminator()->getIterator()));
+ addConditions(*NewCI, Preds[i].second);
+
+ // Handle PHIs used as arguments in the call-site.
+ for (PHINode &PN : TailBB->phis()) {
+ unsigned ArgNo = 0;
+ for (auto &CI : CB.args()) {
+ if (&*CI == &PN) {
+ NewCI->setArgOperand(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ }
+ ++ArgNo;
+ }
+ }
+ LLVM_DEBUG(dbgs() << " " << *NewCI << " in " << SplitBlock->getName()
+ << "\n");
+ if (CallPN)
+ CallPN->addIncoming(NewCI, SplitBlock);
+
+ // Clone and place bitcast and return instructions before `TI`
+ if (IsMustTailCall)
+ copyMustTailReturn(SplitBlock, &CB, NewCI);
+ }
+
+ NumCallSiteSplit++;
+
+ // FIXME: remove TI in `copyMustTailReturn`
+ if (IsMustTailCall) {
+ // Remove superfluous `br` terminators from the end of the Split blocks
+ // NOTE: Removing terminator removes the SplitBlock from the TailBB's
+ // predecessors. Therefore we must get complete list of Splits before
+ // attempting removal.
+ SmallVector<BasicBlock *, 2> Splits(predecessors((TailBB)));
+ assert(Splits.size() == 2 && "Expected exactly 2 splits!");
+ for (unsigned i = 0; i < Splits.size(); i++) {
+ Splits[i]->getTerminator()->eraseFromParent();
+ DTU.applyUpdatesPermissive({{DominatorTree::Delete, Splits[i], TailBB}});
+ }
+
+ // Erase the tail block once done with musttail patching
+ DTU.deleteBB(TailBB);
+ return;
+ }
+
+ auto *OriginalBegin = &*TailBB->begin();
+ // Replace users of the original call with a PHI merging the split call-sites.
+ if (CallPN) {
+ CallPN->insertBefore(OriginalBegin);
+ CB.replaceAllUsesWith(CallPN);
+ }
+
+ // Remove instructions moved to split blocks from TailBB, from the duplicated
+ // call instruction to the beginning of the basic block. If an instruction
+ // has any uses, add a new PHI node to combine the values coming from the
+ // split blocks. The new PHI nodes are placed before the first original
+ // instruction, so we do not end up deleting them. By using reverse-order, we
+ // do not introduce unnecessary PHI nodes for def-use chains from the call
+ // instruction to the beginning of the block.
+ auto I = CB.getReverseIterator();
+ while (I != TailBB->rend()) {
+ Instruction *CurrentI = &*I++;
+ if (!CurrentI->use_empty()) {
+ // If an existing PHI has users after the call, there is no need to create
+ // a new one.
+ if (isa<PHINode>(CurrentI))
+ continue;
+ PHINode *NewPN = PHINode::Create(CurrentI->getType(), Preds.size());
+ NewPN->setDebugLoc(CurrentI->getDebugLoc());
+ for (auto &Mapping : ValueToValueMaps)
+ NewPN->addIncoming(Mapping[CurrentI],
+ cast<Instruction>(Mapping[CurrentI])->getParent());
+ NewPN->insertBefore(&*TailBB->begin());
+ CurrentI->replaceAllUsesWith(NewPN);
+ }
+ CurrentI->eraseFromParent();
+ // We are done once we handled the first original instruction in TailBB.
+ if (CurrentI == OriginalBegin)
+ break;
+ }
+}
+
+// Return true if the call-site has an argument which is a PHI with only
+// constant incoming values.
+static bool isPredicatedOnPHI(CallBase &CB) {
+ BasicBlock *Parent = CB.getParent();
+ if (&CB != Parent->getFirstNonPHIOrDbg())
+ return false;
+
+ for (auto &PN : Parent->phis()) {
+ for (auto &Arg : CB.args()) {
+ if (&*Arg != &PN)
+ continue;
+ assert(PN.getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN.getIncomingBlock(0) == PN.getIncomingBlock(1))
+ return false;
+ if (PN.getIncomingValue(0) == PN.getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN.getIncomingValue(0)) &&
+ isa<Constant>(PN.getIncomingValue(1)))
+ return true;
+ }
+ }
+ return false;
+}
+
+using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
+
+// Check if any of the arguments in CS are predicated on a PHI node and return
+// the set of predecessors we should use for splitting.
+static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallBase &CB) {
+ if (!isPredicatedOnPHI(CB))
+ return {};
+
+ auto Preds = getTwoPredecessors(CB.getParent());
+ return {{Preds[0], {}}, {Preds[1], {}}};
+}
+
+// Checks if any of the arguments in CS are predicated in a predecessor and
+// returns a list of predecessors with the conditions that hold on their edges
+// to CS.
+static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
+ DomTreeUpdater &DTU) {
+ auto Preds = getTwoPredecessors(CB.getParent());
+ if (Preds[0] == Preds[1])
+ return {};
+
+ // We can stop recording conditions once we reach the immediate dominator of
+ // the block containing the call site. Conditions in predecessors of that
+ // node will be the same for all paths to the call site, so splitting is not
+ // beneficial.
+ assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
+ auto *CSDTNode = DTU.getDomTree().getNode(CB.getParent());
+ BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
+
+ SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
+ for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
+ ConditionsTy Conditions;
+ // Record condition on edge BB(CS) <- Pred
+ recordCondition(CB, Pred, CB.getParent(), Conditions);
+ // Record conditions following Pred's single predecessors.
+ recordConditions(CB, Pred, Conditions, StopAt);
+ PredsCS.push_back({Pred, Conditions});
+ }
+
+ if (all_of(PredsCS, [](const std::pair<BasicBlock *, ConditionsTy> &P) {
+ return P.second.empty();
+ }))
+ return {};
+
+ return PredsCS;
+}
+
+static bool tryToSplitCallSite(CallBase &CB, TargetTransformInfo &TTI,
+ DomTreeUpdater &DTU) {
+ // Check if we can split the call site.
+ if (!CB.arg_size() || !canSplitCallSite(CB, TTI))
+ return false;
+
+ auto PredsWithConds = shouldSplitOnPredicatedArgument(CB, DTU);
+ if (PredsWithConds.empty())
+ PredsWithConds = shouldSplitOnPHIPredicatedArgument(CB);
+ if (PredsWithConds.empty())
+ return false;
+
+ splitCallSite(CB, PredsWithConds, DTU);
+ return true;
+}
+
+static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
+ TargetTransformInfo &TTI, DominatorTree &DT) {
+
+ DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ bool Changed = false;
+ for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;) {
+ BasicBlock &BB = *BI++;
+ auto II = BB.getFirstNonPHIOrDbg()->getIterator();
+ auto IE = BB.getTerminator()->getIterator();
+ // Iterate until we reach the terminator instruction. tryToSplitCallSite
+ // can replace BB's terminator in case BB is a successor of itself. In that
+ // case, IE will be invalidated and we also have to check the current
+ // terminator.
+ while (II != IE && &*II != BB.getTerminator()) {
+ CallBase *CB = dyn_cast<CallBase>(&*II++);
+ if (!CB || isa<IntrinsicInst>(CB) || isInstructionTriviallyDead(CB, &TLI))
+ continue;
+
+ Function *Callee = CB->getCalledFunction();
+ if (!Callee || Callee->isDeclaration())
+ continue;
+
+ // Successful musttail call-site splits result in erased CI and erased BB.
+ // Check if such path is possible before attempting the splitting.
+ bool IsMustTail = CB->isMustTailCall();
+
+ Changed |= tryToSplitCallSite(*CB, TTI, DTU);
+
+ // There are no interesting instructions after this. The call site
+ // itself might have been erased on splitting.
+ if (IsMustTail)
+ break;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+struct CallSiteSplittingLegacyPass : public FunctionPass {
+ static char ID;
+ CallSiteSplittingLegacyPass() : FunctionPass(ID) {
+ initializeCallSiteSplittingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return doCallSiteSplitting(F, TLI, TTI, DT);
+ }
+};
+} // namespace
+
+char CallSiteSplittingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(CallSiteSplittingLegacyPass, "callsite-splitting",
+ "Call-site splitting", false, false)
+FunctionPass *llvm::createCallSiteSplittingPass() {
+ return new CallSiteSplittingLegacyPass();
+}
+
+PreservedAnalyses CallSiteSplittingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ if (!doCallSiteSplitting(F, TLI, TTI, DT))
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
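+
+// Usage sketch (assuming the standard opt driver and the usual registration of
+// this pass in the new pass manager's registry): the new-pass-manager spelling
+// is
+//
+//   opt -passes=callsite-splitting input.ll -S -o -
+//
+// while legacy pipelines construct the pass through
+// createCallSiteSplittingPass().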
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
index 57fb8492d7..fdab74fc94 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -1,991 +1,991 @@
-//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass identifies expensive constants to hoist and coalesces them to
-// better prepare them for SelectionDAG-based code generation. This works around
-// the limitations of the basic-block-at-a-time approach.
-//
-// First it scans all instructions for integer constants and calculates their
-// cost. If the constant can be folded into the instruction (the cost is
-// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
-// consider it expensive and leave it alone. This is the default behavior and
-// the default implementation of getIntImmCostInst will always return TCC_Free.
-//
-// If the cost is more than TCC_BASIC, then the integer constant can't be folded
-// into the instruction and it might be beneficial to hoist the constant.
-// Similar constants are coalesced to reduce register pressure and
-// materialization code.
-//
-// When a constant is hoisted, it is also hidden behind a bitcast to force it to
-// be live-out of the basic block. Otherwise the constant would be just
-// duplicated and each basic block would have its own copy in the SelectionDAG.
-// The SelectionDAG recognizes such constants as opaque and doesn't perform
-// certain transformations on them, which would create a new expensive constant.
-//
-// This optimization is only applied to integer constants in instructions and
-// simple (this means not nested) constant cast expressions. For example:
-// %0 = load i64* inttoptr (i64 big_constant to i64*)
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/ConstantHoisting.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-using namespace consthoist;
-
-#define DEBUG_TYPE "consthoist"
-
-STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
-STATISTIC(NumConstantsRebased, "Number of constants rebased");
-
-static cl::opt<bool> ConstHoistWithBlockFrequency(
- "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to reduce the "
- "chance to execute const materialization more frequently than "
- "without hoisting."));
-
-static cl::opt<bool> ConstHoistGEP(
- "consthoist-gep", cl::init(false), cl::Hidden,
- cl::desc("Try hoisting constant gep expressions"));
-
-static cl::opt<unsigned>
-MinNumOfDependentToRebase("consthoist-min-num-to-rebase",
- cl::desc("Do not rebase if number of dependent constants of a Base is less "
- "than this number."),
- cl::init(0), cl::Hidden);
-
-namespace {
-
-/// The constant hoisting pass.
-class ConstantHoistingLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- ConstantHoistingLegacyPass() : FunctionPass(ID) {
- initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &Fn) override;
-
- StringRef getPassName() const override { return "Constant Hoisting"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- if (ConstHoistWithBlockFrequency)
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
-private:
- ConstantHoistingPass Impl;
-};
-
-} // end anonymous namespace
-
-char ConstantHoistingLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
- "Constant Hoisting", false, false)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
- "Constant Hoisting", false, false)
-
-FunctionPass *llvm::createConstantHoistingPass() {
- return new ConstantHoistingLegacyPass();
-}
-
-/// Perform the constant hoisting optimization for the given function.
-bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
- if (skipFunction(Fn))
- return false;
-
- LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
- LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
-
- bool MadeChange =
- Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- ConstHoistWithBlockFrequency
- ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
- : nullptr,
- Fn.getEntryBlock(),
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
-
- if (MadeChange) {
- LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
- << Fn.getName() << '\n');
- LLVM_DEBUG(dbgs() << Fn);
- }
- LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
-
- return MadeChange;
-}
-
-/// Find the constant materialization insertion point.
-Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
- unsigned Idx) const {
- // If the operand is a cast instruction, then we have to materialize the
- // constant before the cast instruction.
- if (Idx != ~0U) {
- Value *Opnd = Inst->getOperand(Idx);
- if (auto CastInst = dyn_cast<Instruction>(Opnd))
- if (CastInst->isCast())
- return CastInst;
- }
-
- // The simple and common case. This also includes constant expressions.
- if (!isa<PHINode>(Inst) && !Inst->isEHPad())
- return Inst;
-
- // We can't insert directly before a phi node or an eh pad. Insert before
- // the terminator of the incoming or dominating block.
- assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
- if (Idx != ~0U && isa<PHINode>(Inst))
- return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
-
- // This must be an EH pad. Iterate over immediate dominators until we find a
- // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
- // and terminators.
- auto IDom = DT->getNode(Inst->getParent())->getIDom();
- while (IDom->getBlock()->isEHPad()) {
- assert(Entry != IDom->getBlock() && "eh pad in entry block");
- IDom = IDom->getIDom();
- }
-
- return IDom->getBlock()->getTerminator();
-}
-
-/// Given \p BBs as input, find another set of BBs which collectively
-/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
-/// set found in \p BBs.
-static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
- BasicBlock *Entry,
- SetVector<BasicBlock *> &BBs) {
- assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
- // Nodes on the current path to the root.
- SmallPtrSet<BasicBlock *, 8> Path;
- // Candidates includes any block 'BB' in set 'BBs' that is not strictly
- // dominated by any other blocks in set 'BBs', and all nodes in the path
- // in the dominator tree from Entry to 'BB'.
- SmallPtrSet<BasicBlock *, 16> Candidates;
- for (auto BB : BBs) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(BB))
- continue;
- Path.clear();
- // Walk up the dominator tree until Entry or another BB in BBs
- // is reached. Insert the nodes on the way to the Path.
- BasicBlock *Node = BB;
- // The "Path" is a candidate path to be added into Candidates set.
- bool isCandidate = false;
- do {
- Path.insert(Node);
- if (Node == Entry || Candidates.count(Node)) {
- isCandidate = true;
- break;
- }
- assert(DT.getNode(Node)->getIDom() &&
- "Entry doens't dominate current Node");
- Node = DT.getNode(Node)->getIDom()->getBlock();
- } while (!BBs.count(Node));
-
- // If isCandidate is false, Node is another Block in BBs dominating
- // current 'BB'. Drop the nodes on the Path.
- if (!isCandidate)
- continue;
-
- // Add nodes on the Path into Candidates.
- Candidates.insert(Path.begin(), Path.end());
- }
-
- // Sort the nodes in Candidates in top-down order and save the nodes
- // in Orders.
- unsigned Idx = 0;
- SmallVector<BasicBlock *, 16> Orders;
- Orders.push_back(Entry);
- while (Idx != Orders.size()) {
- BasicBlock *Node = Orders[Idx++];
- for (auto ChildDomNode : DT.getNode(Node)->children()) {
- if (Candidates.count(ChildDomNode->getBlock()))
- Orders.push_back(ChildDomNode->getBlock());
- }
- }
-
- // Visit Orders in bottom-up order.
- using InsertPtsCostPair =
- std::pair<SetVector<BasicBlock *>, BlockFrequency>;
-
- // InsertPtsMap is a map from a BB to the best insertion points for the
- // subtree of BB (subtree not including the BB itself).
- DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
- InsertPtsMap.reserve(Orders.size() + 1);
- for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
- BasicBlock *Node = *RIt;
- bool NodeInBBs = BBs.count(Node);
- auto &InsertPts = InsertPtsMap[Node].first;
- BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
-
- // Return the optimal insert points in BBs.
- if (Node == Entry) {
- BBs.clear();
- if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
- (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
- BBs.insert(Entry);
- else
- BBs.insert(InsertPts.begin(), InsertPts.end());
- break;
- }
-
- BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
- // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
- // will update its parent's ParentInsertPts and ParentPtsFreq.
- auto &ParentInsertPts = InsertPtsMap[Parent].first;
- BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
- // Choose to insert in Node or in subtree of Node.
- // Don't hoist to EHPad because we may not find a proper place to insert
- // in EHPad.
- // If the total frequency of InsertPts is the same as the frequency of the
- // target Node, and InsertPts contains more than one nodes, choose hoisting
- // to reduce code size.
- if (NodeInBBs ||
- (!Node->isEHPad() &&
- (InsertPtsFreq > BFI.getBlockFreq(Node) ||
- (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
- ParentInsertPts.insert(Node);
- ParentPtsFreq += BFI.getBlockFreq(Node);
- } else {
- ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
- ParentPtsFreq += InsertPtsFreq;
- }
- }
-}
-
-/// Find an insertion point that dominates all uses.
-SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
- const ConstantInfo &ConstInfo) const {
- assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
- // Collect all basic blocks.
- SetVector<BasicBlock *> BBs;
- SetVector<Instruction *> InsertPts;
- for (auto const &RCI : ConstInfo.RebasedConstants)
- for (auto const &U : RCI.Uses)
- BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
-
- if (BBs.count(Entry)) {
- InsertPts.insert(&Entry->front());
- return InsertPts;
- }
-
- if (BFI) {
- findBestInsertionSet(*DT, *BFI, Entry, BBs);
- for (auto BB : BBs) {
- BasicBlock::iterator InsertPt = BB->begin();
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- ;
- InsertPts.insert(&*InsertPt);
- }
- return InsertPts;
- }
-
- while (BBs.size() >= 2) {
- BasicBlock *BB, *BB1, *BB2;
- BB1 = BBs.pop_back_val();
- BB2 = BBs.pop_back_val();
- BB = DT->findNearestCommonDominator(BB1, BB2);
- if (BB == Entry) {
- InsertPts.insert(&Entry->front());
- return InsertPts;
- }
- BBs.insert(BB);
- }
- assert((BBs.size() == 1) && "Expected only one element.");
- Instruction &FirstInst = (*BBs.begin())->front();
- InsertPts.insert(findMatInsertPt(&FirstInst));
- return InsertPts;
-}
-
-/// Record constant integer ConstInt for instruction Inst at operand
-/// index Idx.
-///
-/// The operand at index Idx is not necessarily the constant integer itself. It
-/// could also be a cast instruction or a constant expression that uses the
-/// constant integer.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
- ConstantInt *ConstInt) {
- unsigned Cost;
- // Ask the target about the cost of materializing the constant for the given
- // instruction and operand index.
- if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
- Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
- ConstInt->getValue(), ConstInt->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- else
+//===- ConstantHoisting.cpp - Prepare code for expensive constants --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass identifies expensive constants to hoist and coalesces them to
+// better prepare them for SelectionDAG-based code generation. This works around
+// the limitations of the basic-block-at-a-time approach.
+//
+// First it scans all instructions for integer constants and calculates their
+// cost. If the constant can be folded into the instruction (the cost is
+// TCC_Free) or the cost is just a simple operation (TCC_BASIC), then we don't
+// consider it expensive and leave it alone. This is the default behavior and
+// the default implementation of getIntImmCostInst will always return TCC_Free.
+//
+// If the cost is more than TCC_BASIC, then the integer constant can't be folded
+// into the instruction and it might be beneficial to hoist the constant.
+// Similar constants are coalesced to reduce register pressure and
+// materialization code.
+//
+// When a constant is hoisted, it is also hidden behind a bitcast to force it to
+// be live-out of the basic block. Otherwise the constant would be just
+// duplicated and each basic block would have its own copy in the SelectionDAG.
+// The SelectionDAG recognizes such constants as opaque and doesn't perform
+// certain transformations on them, which would create a new expensive constant.
+//
+// This optimization is only applied to integer constants in instructions and
+// simple (this means not nested) constant cast expressions. For example:
+// %0 = load i64* inttoptr (i64 big_constant to i64*)
+//===----------------------------------------------------------------------===//
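As a rough, standalone sketch of the cost test described above (the helper name is made up, and the TargetTransformInfo reference is assumed to come from the caller; this is an illustration, not part of the pass), an operand is only worth hoisting when the target reports its materialization cost as more than a basic operation:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Constants.h"
  #include "llvm/IR/Instruction.h"

  // Hypothetical helper mirroring the policy above: anything costlier than
  // TCC_Basic is a hoisting candidate; TCC_Free/TCC_Basic constants are left
  // alone because the instruction can absorb them.
  static bool isHoistingCandidate(const llvm::TargetTransformInfo &TTI,
                                  llvm::Instruction *Inst, unsigned Idx,
                                  llvm::ConstantInt *ConstInt) {
    int Cost = TTI.getIntImmCostInst(
        Inst->getOpcode(), Idx, ConstInt->getValue(), ConstInt->getType(),
        llvm::TargetTransformInfo::TCK_SizeAndLatency, Inst);
    return Cost > llvm::TargetTransformInfo::TCC_Basic;
  }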
+
+#include "llvm/Transforms/Scalar/ConstantHoisting.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+using namespace consthoist;
+
+#define DEBUG_TYPE "consthoist"
+
+STATISTIC(NumConstantsHoisted, "Number of constants hoisted");
+STATISTIC(NumConstantsRebased, "Number of constants rebased");
+
+static cl::opt<bool> ConstHoistWithBlockFrequency(
+ "consthoist-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to reduce the "
+ "chance to execute const materialization more frequently than "
+ "without hoisting."));
+
+static cl::opt<bool> ConstHoistGEP(
+ "consthoist-gep", cl::init(false), cl::Hidden,
+ cl::desc("Try hoisting constant gep expressions"));
+
+static cl::opt<unsigned>
+MinNumOfDependentToRebase("consthoist-min-num-to-rebase",
+ cl::desc("Do not rebase if number of dependent constants of a Base is less "
+ "than this number."),
+ cl::init(0), cl::Hidden);
+
+namespace {
+
+/// The constant hoisting pass.
+class ConstantHoistingLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ConstantHoistingLegacyPass() : FunctionPass(ID) {
+ initializeConstantHoistingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &Fn) override;
+
+ StringRef getPassName() const override { return "Constant Hoisting"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ if (ConstHoistWithBlockFrequency)
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+private:
+ ConstantHoistingPass Impl;
+};
+
+} // end anonymous namespace
+
+char ConstantHoistingLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ConstantHoistingLegacyPass, "consthoist",
+ "Constant Hoisting", false, false)
+
+FunctionPass *llvm::createConstantHoistingPass() {
+ return new ConstantHoistingLegacyPass();
+}
+
+/// Perform the constant hoisting optimization for the given function.
+bool ConstantHoistingLegacyPass::runOnFunction(Function &Fn) {
+ if (skipFunction(Fn))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** Begin Constant Hoisting **********\n");
+ LLVM_DEBUG(dbgs() << "********** Function: " << Fn.getName() << '\n');
+
+ bool MadeChange =
+ Impl.runImpl(Fn, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ ConstHoistWithBlockFrequency
+ ? &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI()
+ : nullptr,
+ Fn.getEntryBlock(),
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
+
+ if (MadeChange) {
+ LLVM_DEBUG(dbgs() << "********** Function after Constant Hoisting: "
+ << Fn.getName() << '\n');
+ LLVM_DEBUG(dbgs() << Fn);
+ }
+ LLVM_DEBUG(dbgs() << "********** End Constant Hoisting **********\n");
+
+ return MadeChange;
+}
+
+/// Find the constant materialization insertion point.
+Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
+ unsigned Idx) const {
+ // If the operand is a cast instruction, then we have to materialize the
+ // constant before the cast instruction.
+ if (Idx != ~0U) {
+ Value *Opnd = Inst->getOperand(Idx);
+ if (auto CastInst = dyn_cast<Instruction>(Opnd))
+ if (CastInst->isCast())
+ return CastInst;
+ }
+
+ // The simple and common case. This also includes constant expressions.
+ if (!isa<PHINode>(Inst) && !Inst->isEHPad())
+ return Inst;
+
+ // We can't insert directly before a phi node or an eh pad. Insert before
+ // the terminator of the incoming or dominating block.
+ assert(Entry != Inst->getParent() && "PHI or landing pad in entry block!");
+ if (Idx != ~0U && isa<PHINode>(Inst))
+ return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
+
+ // This must be an EH pad. Iterate over immediate dominators until we find a
+ // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+ // and terminators.
+ auto IDom = DT->getNode(Inst->getParent())->getIDom();
+ while (IDom->getBlock()->isEHPad()) {
+ assert(Entry != IDom->getBlock() && "eh pad in entry block");
+ IDom = IDom->getIDom();
+ }
+
+ return IDom->getBlock()->getTerminator();
+}
+
+/// Given \p BBs as input, find another set of BBs which collectively
+/// dominates \p BBs and have the minimal sum of frequencies. Return the BB
+/// set found in \p BBs.
+static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
+ BasicBlock *Entry,
+ SetVector<BasicBlock *> &BBs) {
+ assert(!BBs.count(Entry) && "Assume Entry is not in BBs");
+ // Nodes on the current path to the root.
+ SmallPtrSet<BasicBlock *, 8> Path;
+ // Candidates includes any block 'BB' in set 'BBs' that is not strictly
+ // dominated by any other blocks in set 'BBs', and all nodes in the path
+ // in the dominator tree from Entry to 'BB'.
+ SmallPtrSet<BasicBlock *, 16> Candidates;
+ for (auto BB : BBs) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(BB))
+ continue;
+ Path.clear();
+ // Walk up the dominator tree until Entry or another BB in BBs
+ // is reached. Insert the nodes along the way into the Path.
+ BasicBlock *Node = BB;
+ // The "Path" is a candidate path to be added into Candidates set.
+ bool isCandidate = false;
+ do {
+ Path.insert(Node);
+ if (Node == Entry || Candidates.count(Node)) {
+ isCandidate = true;
+ break;
+ }
+ assert(DT.getNode(Node)->getIDom() &&
+ "Entry doesn't dominate current Node");
+ Node = DT.getNode(Node)->getIDom()->getBlock();
+ } while (!BBs.count(Node));
+
+ // If isCandidate is false, Node is another Block in BBs dominating
+ // current 'BB'. Drop the nodes on the Path.
+ if (!isCandidate)
+ continue;
+
+ // Add nodes on the Path into Candidates.
+ Candidates.insert(Path.begin(), Path.end());
+ }
+
+ // Sort the nodes in Candidates in top-down order and save the nodes
+ // in Orders.
+ unsigned Idx = 0;
+ SmallVector<BasicBlock *, 16> Orders;
+ Orders.push_back(Entry);
+ while (Idx != Orders.size()) {
+ BasicBlock *Node = Orders[Idx++];
+ for (auto ChildDomNode : DT.getNode(Node)->children()) {
+ if (Candidates.count(ChildDomNode->getBlock()))
+ Orders.push_back(ChildDomNode->getBlock());
+ }
+ }
+
+ // Visit Orders in bottom-up order.
+ using InsertPtsCostPair =
+ std::pair<SetVector<BasicBlock *>, BlockFrequency>;
+
+ // InsertPtsMap is a map from a BB to the best insertion points for the
+ // subtree of BB (subtree not including the BB itself).
+ DenseMap<BasicBlock *, InsertPtsCostPair> InsertPtsMap;
+ InsertPtsMap.reserve(Orders.size() + 1);
+ for (auto RIt = Orders.rbegin(); RIt != Orders.rend(); RIt++) {
+ BasicBlock *Node = *RIt;
+ bool NodeInBBs = BBs.count(Node);
+ auto &InsertPts = InsertPtsMap[Node].first;
+ BlockFrequency &InsertPtsFreq = InsertPtsMap[Node].second;
+
+ // Return the optimal insert points in BBs.
+ if (Node == Entry) {
+ BBs.clear();
+ if (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1))
+ BBs.insert(Entry);
+ else
+ BBs.insert(InsertPts.begin(), InsertPts.end());
+ break;
+ }
+
+ BasicBlock *Parent = DT.getNode(Node)->getIDom()->getBlock();
+ // Initially, ParentInsertPts is empty and ParentPtsFreq is 0. Every child
+ // will update its parent's ParentInsertPts and ParentPtsFreq.
+ auto &ParentInsertPts = InsertPtsMap[Parent].first;
+ BlockFrequency &ParentPtsFreq = InsertPtsMap[Parent].second;
+ // Choose to insert in Node or in subtree of Node.
+ // Don't hoist to EHPad because we may not find a proper place to insert
+ // in EHPad.
+ // If the total frequency of InsertPts is the same as the frequency of the
+ // target Node, and InsertPts contains more than one node, choose hoisting
+ // to reduce code size.
+ if (NodeInBBs ||
+ (!Node->isEHPad() &&
+ (InsertPtsFreq > BFI.getBlockFreq(Node) ||
+ (InsertPtsFreq == BFI.getBlockFreq(Node) && InsertPts.size() > 1)))) {
+ ParentInsertPts.insert(Node);
+ ParentPtsFreq += BFI.getBlockFreq(Node);
+ } else {
+ ParentInsertPts.insert(InsertPts.begin(), InsertPts.end());
+ ParentPtsFreq += InsertPtsFreq;
+ }
+ }
+}
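The comparison driving the loop above can be exercised in isolation. The following standalone sketch (plain C++; the helper name and the frequencies are invented for illustration) applies the same rule: materialize once in the dominating parent when the children are collectively hotter, or equally hot but more numerous:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  // Decide whether to hoist a materialization from a set of child blocks into
  // their common dominator, using the same frequency comparison as above.
  static bool hoistToParent(uint64_t ParentFreq,
                            const std::vector<uint64_t> &ChildFreqs) {
    uint64_t Sum = 0;
    for (uint64_t F : ChildFreqs)
      Sum += F;
    // Hoist if the children run more often in total, or equally often but
    // there is more than one of them (pure code-size win).
    return Sum > ParentFreq || (Sum == ParentFreq && ChildFreqs.size() > 1);
  }

  int main() {
    std::printf("%d\n", hoistToParent(100, {60, 70})); // 1: children are hotter
    std::printf("%d\n", hoistToParent(100, {40}));     // 0: keep it in the child
    return 0;
  }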
+
+/// Find an insertion point that dominates all uses.
+SetVector<Instruction *> ConstantHoistingPass::findConstantInsertionPoint(
+ const ConstantInfo &ConstInfo) const {
+ assert(!ConstInfo.RebasedConstants.empty() && "Invalid constant info entry.");
+ // Collect all basic blocks.
+ SetVector<BasicBlock *> BBs;
+ SetVector<Instruction *> InsertPts;
+ for (auto const &RCI : ConstInfo.RebasedConstants)
+ for (auto const &U : RCI.Uses)
+ BBs.insert(findMatInsertPt(U.Inst, U.OpndIdx)->getParent());
+
+ if (BBs.count(Entry)) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+
+ if (BFI) {
+ findBestInsertionSet(*DT, *BFI, Entry, BBs);
+ for (auto BB : BBs) {
+ BasicBlock::iterator InsertPt = BB->begin();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ ;
+ InsertPts.insert(&*InsertPt);
+ }
+ return InsertPts;
+ }
+
+ while (BBs.size() >= 2) {
+ BasicBlock *BB, *BB1, *BB2;
+ BB1 = BBs.pop_back_val();
+ BB2 = BBs.pop_back_val();
+ BB = DT->findNearestCommonDominator(BB1, BB2);
+ if (BB == Entry) {
+ InsertPts.insert(&Entry->front());
+ return InsertPts;
+ }
+ BBs.insert(BB);
+ }
+ assert((BBs.size() == 1) && "Expected only one element.");
+ Instruction &FirstInst = (*BBs.begin())->front();
+ InsertPts.insert(findMatInsertPt(&FirstInst));
+ return InsertPts;
+}
+
+/// Record constant integer ConstInt for instruction Inst at operand
+/// index Idx.
+///
+/// The operand at index Idx is not necessarily the constant integer itself. It
+/// could also be a cast instruction or a constant expression that uses the
+/// constant integer.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantInt *ConstInt) {
+ unsigned Cost;
+ // Ask the target about the cost of materializing the constant for the given
+ // instruction and operand index.
+ if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
+ Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
+ ConstInt->getValue(), ConstInt->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ else
Cost = TTI->getIntImmCostInst(
Inst->getOpcode(), Idx, ConstInt->getValue(), ConstInt->getType(),
TargetTransformInfo::TCK_SizeAndLatency, Inst);
-
- // Ignore cheap integer constants.
- if (Cost > TargetTransformInfo::TCC_Basic) {
- ConstCandMapType::iterator Itr;
- bool Inserted;
- ConstPtrUnionType Cand = ConstInt;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
- if (Inserted) {
- ConstIntCandVec.push_back(ConstantCandidate(ConstInt));
- Itr->second = ConstIntCandVec.size() - 1;
- }
- ConstIntCandVec[Itr->second].addUser(Inst, Idx, Cost);
- LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
- << "Collect constant " << *ConstInt << " from " << *Inst
- << " with cost " << Cost << '\n';
- else dbgs() << "Collect constant " << *ConstInt
- << " indirectly from " << *Inst << " via "
- << *Inst->getOperand(Idx) << " with cost " << Cost
- << '\n';);
- }
-}
-
-/// Record constant GEP expression for instruction Inst at operand index Idx.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
- ConstantExpr *ConstExpr) {
- // TODO: Handle vector GEPs
- if (ConstExpr->getType()->isVectorTy())
- return;
-
- GlobalVariable *BaseGV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
- if (!BaseGV)
- return;
-
- // Get offset from the base GV.
- PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType());
- IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
- APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
- auto *GEPO = cast<GEPOperator>(ConstExpr);
- if (!GEPO->accumulateConstantOffset(*DL, Offset))
- return;
-
- if (!Offset.isIntN(32))
- return;
-
- // A constant GEP expression that has a GlobalVariable as base pointer is
- // usually lowered to a load from the constant pool. Such an operation is
- // unlikely to be cheaper than computing it as <Base + Offset>, which can be
- // lowered to an ADD instruction or folded into a Load/Store instruction.
+
+ // Ignore cheap integer constants.
+ if (Cost > TargetTransformInfo::TCC_Basic) {
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ ConstPtrUnionType Cand = ConstInt;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ if (Inserted) {
+ ConstIntCandVec.push_back(ConstantCandidate(ConstInt));
+ Itr->second = ConstIntCandVec.size() - 1;
+ }
+ ConstIntCandVec[Itr->second].addUser(Inst, Idx, Cost);
+ LLVM_DEBUG(if (isa<ConstantInt>(Inst->getOperand(Idx))) dbgs()
+ << "Collect constant " << *ConstInt << " from " << *Inst
+ << " with cost " << Cost << '\n';
+ else dbgs() << "Collect constant " << *ConstInt
+ << " indirectly from " << *Inst << " via "
+ << *Inst->getOperand(Idx) << " with cost " << Cost
+ << '\n';);
+ }
+}
+
+/// Record constant GEP expression for instruction Inst at operand index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx,
+ ConstantExpr *ConstExpr) {
+ // TODO: Handle vector GEPs
+ if (ConstExpr->getType()->isVectorTy())
+ return;
+
+ GlobalVariable *BaseGV = dyn_cast<GlobalVariable>(ConstExpr->getOperand(0));
+ if (!BaseGV)
+ return;
+
+ // Get offset from the base GV.
+ PointerType *GVPtrTy = cast<PointerType>(BaseGV->getType());
+ IntegerType *PtrIntTy = DL->getIntPtrType(*Ctx, GVPtrTy->getAddressSpace());
+ APInt Offset(DL->getTypeSizeInBits(PtrIntTy), /*val*/0, /*isSigned*/true);
+ auto *GEPO = cast<GEPOperator>(ConstExpr);
+ if (!GEPO->accumulateConstantOffset(*DL, Offset))
+ return;
+
+ if (!Offset.isIntN(32))
+ return;
+
+ // A constant GEP expression that has a GlobalVariable as base pointer is
+ // usually lowered to a load from the constant pool. Such an operation is
+ // unlikely to be cheaper than computing it as <Base + Offset>, which can be
+ // lowered to an ADD instruction or folded into a Load/Store instruction.
int Cost =
TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
TargetTransformInfo::TCK_SizeAndLatency, Inst);
- ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
- ConstCandMapType::iterator Itr;
- bool Inserted;
- ConstPtrUnionType Cand = ConstExpr;
- std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
- if (Inserted) {
- ExprCandVec.push_back(ConstantCandidate(
- ConstantInt::get(Type::getInt32Ty(*Ctx), Offset.getLimitedValue()),
- ConstExpr));
- Itr->second = ExprCandVec.size() - 1;
- }
- ExprCandVec[Itr->second].addUser(Inst, Idx, Cost);
-}
-
-/// Check the operand for instruction Inst at index Idx.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
- Value *Opnd = Inst->getOperand(Idx);
-
- // Visit constant integers.
- if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
-
- // Visit cast instructions that have constant integers.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- // Only visit cast instructions, which have been skipped. All other
- // instructions should have already been visited.
- if (!CastInst->isCast())
- return;
-
- if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the cast instruction.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
- }
-
- // Visit constant expressions that have constant integers.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- // Handle constant gep expressions.
- if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
-
- // Only visit constant cast expressions.
- if (!ConstExpr->isCast())
- return;
-
- if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
- // Pretend the constant is directly used by the instruction and ignore
- // the constant expression.
- collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
- return;
- }
- }
-}
-
-/// Scan the instruction for expensive integer constants and record them
-/// in the constant candidate vector.
-void ConstantHoistingPass::collectConstantCandidates(
- ConstCandMapType &ConstCandMap, Instruction *Inst) {
- // Skip all cast instructions. They are visited indirectly later on.
- if (Inst->isCast())
- return;
-
- // Scan all operands.
- for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
- // The cost of materializing the constants (defined in
- // `TargetTransformInfo::getIntImmCostInst`) for instructions which only
- // take constant variables is lower than `TargetTransformInfo::TCC_Basic`.
- // So it's safe for us to collect constant candidates from all
- // IntrinsicInsts.
- if (canReplaceOperandWithVariable(Inst, Idx)) {
- collectConstantCandidates(ConstCandMap, Inst, Idx);
- }
- } // end of for all operands
-}
-
-/// Collect all integer constants in the function that cannot be folded
-/// into an instruction itself.
-void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
- ConstCandMapType ConstCandMap;
- for (BasicBlock &BB : Fn) {
- // Ignore unreachable basic blocks.
- if (!DT->isReachableFromEntry(&BB))
- continue;
- for (Instruction &Inst : BB)
- collectConstantCandidates(ConstCandMap, &Inst);
- }
-}
-
-// This helper function is necessary to deal with values that have different
-// bit widths (APInt Operator- does not like that). If the value cannot be
-// represented in uint64 we return an "empty" APInt. This is then interpreted
-// as the value is not in range.
-static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
- Optional<APInt> Res = None;
- unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
- V1.getBitWidth() : V2.getBitWidth();
- uint64_t LimVal1 = V1.getLimitedValue();
- uint64_t LimVal2 = V2.getLimitedValue();
-
- if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
- return Res;
-
- uint64_t Diff = LimVal1 - LimVal2;
- return APInt(BW, Diff, true);
-}
-
-// From a list of constants, one needs to be picked as the base, and the other
-// constants will be transformed into offsets from that base constant. The
-// question is which one is the best pick. For example, consider these constants
-// and their number of uses:
-//
-// Constants| 2 | 4 | 12 | 42 |
-// NumUses | 3 | 2 | 8 | 7 |
-//
-// Selecting constant 12 because it has the most uses will generate negative
-// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
-// offsets lead to less optimal code generation, then there might be better
-// solutions. Suppose immediates in the range of 0..35 are most optimally
-// supported by the architecture, then selecting constant 2 is most optimal
-// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
-// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
-// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
-// selecting the base constant the range of the offsets is a very important
-// factor, too, that we take into account here. This algorithm calculates a
-// total cost for selecting a constant as the base and subtracts the costs of
-// immediates that are out of range. It has quadratic complexity, so we only
-// call this function when we're optimising for size and there are fewer than
-// 100 constants; otherwise we fall back to the straightforward algorithm,
-// which does not do all the offset calculations.
-unsigned
-ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
- ConstCandVecType::iterator E,
- ConstCandVecType::iterator &MaxCostItr) {
- unsigned NumUses = 0;
-
- bool OptForSize = Entry->getParent()->hasOptSize() ||
- llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI,
- PGSOQueryType::IRPass);
- if (!OptForSize || std::distance(S,E) > 100) {
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- NumUses += ConstCand->Uses.size();
- if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
- MaxCostItr = ConstCand;
- }
- return NumUses;
- }
-
- LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
- int MaxCost = -1;
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- auto Value = ConstCand->ConstInt->getValue();
- Type *Ty = ConstCand->ConstInt->getType();
- int Cost = 0;
- NumUses += ConstCand->Uses.size();
- LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
- << "\n");
-
- for (auto User : ConstCand->Uses) {
- unsigned Opcode = User.Inst->getOpcode();
- unsigned OpndIdx = User.OpndIdx;
- Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
- TargetTransformInfo::TCK_SizeAndLatency);
- LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
-
- for (auto C2 = S; C2 != E; ++C2) {
- Optional<APInt> Diff = calculateOffsetDiff(
- C2->ConstInt->getValue(),
- ConstCand->ConstInt->getValue());
- if (Diff) {
- const int ImmCosts =
- TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
- Cost -= ImmCosts;
- LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
- << "has penalty: " << ImmCosts << "\n"
- << "Adjusted cost: " << Cost << "\n");
- }
- }
- }
- LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
- if (Cost > MaxCost) {
- MaxCost = Cost;
- MaxCostItr = ConstCand;
- LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
- << "\n");
- }
- }
- return NumUses;
-}
-
-/// Find the base constant within the given range and rebase all other
-/// constants with respect to the base constant.
-void ConstantHoistingPass::findAndMakeBaseConstant(
- ConstCandVecType::iterator S, ConstCandVecType::iterator E,
- SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec) {
- auto MaxCostItr = S;
- unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
-
- // Don't hoist constants that have only one use.
- if (NumUses <= 1)
- return;
-
- ConstantInt *ConstInt = MaxCostItr->ConstInt;
- ConstantExpr *ConstExpr = MaxCostItr->ConstExpr;
- ConstantInfo ConstInfo;
- ConstInfo.BaseInt = ConstInt;
- ConstInfo.BaseExpr = ConstExpr;
- Type *Ty = ConstInt->getType();
-
- // Rebase the constants with respect to the base constant.
- for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
- APInt Diff = ConstCand->ConstInt->getValue() - ConstInt->getValue();
- Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
- Type *ConstTy =
- ConstCand->ConstExpr ? ConstCand->ConstExpr->getType() : nullptr;
- ConstInfo.RebasedConstants.push_back(
- RebasedConstantInfo(std::move(ConstCand->Uses), Offset, ConstTy));
- }
- ConstInfoVec.push_back(std::move(ConstInfo));
-}
-
-/// Finds and combines constant candidates that can be easily
-/// rematerialized with an add from a common base constant.
-void ConstantHoistingPass::findBaseConstants(GlobalVariable *BaseGV) {
- // If BaseGV is nullptr, find base among candidate constant integers;
- // Otherwise find base among constant GEPs that share the same BaseGV.
- ConstCandVecType &ConstCandVec = BaseGV ?
- ConstGEPCandMap[BaseGV] : ConstIntCandVec;
- ConstInfoVecType &ConstInfoVec = BaseGV ?
- ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
-
- // Sort the constants by value and type. This invalidates the mapping!
- llvm::stable_sort(ConstCandVec, [](const ConstantCandidate &LHS,
- const ConstantCandidate &RHS) {
- if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
- return LHS.ConstInt->getType()->getBitWidth() <
- RHS.ConstInt->getType()->getBitWidth();
- return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
- });
-
- // Simple linear scan through the sorted constant candidate vector for viable
- // merge candidates.
- auto MinValItr = ConstCandVec.begin();
- for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
- CC != E; ++CC) {
- if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
- Type *MemUseValTy = nullptr;
- for (auto &U : CC->Uses) {
- auto *UI = U.Inst;
- if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
- MemUseValTy = LI->getType();
- break;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
- // Make sure the constant is used as pointer operand of the StoreInst.
- if (SI->getPointerOperand() == SI->getOperand(U.OpndIdx)) {
- MemUseValTy = SI->getValueOperand()->getType();
- break;
- }
- }
- }
-
- // Check if the constant is in range of an add with immediate.
- APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
- if ((Diff.getBitWidth() <= 64) &&
- TTI->isLegalAddImmediate(Diff.getSExtValue()) &&
- // Check if Diff can be used as offset in addressing mode of the user
- // memory instruction.
- (!MemUseValTy || TTI->isLegalAddressingMode(MemUseValTy,
- /*BaseGV*/nullptr, /*BaseOffset*/Diff.getSExtValue(),
- /*HasBaseReg*/true, /*Scale*/0)))
- continue;
- }
- // We now either have a different constant type or the constant is no longer
- // in range of an add with immediate.
- findAndMakeBaseConstant(MinValItr, CC, ConstInfoVec);
- // Start a new base constant search.
- MinValItr = CC;
- }
- // Finalize the last base constant search.
- findAndMakeBaseConstant(MinValItr, ConstCandVec.end(), ConstInfoVec);
-}
-
-/// Updates the operand at Idx in instruction Inst with the result of
-/// instruction Mat. If the instruction is a PHI node then special
-/// handling for duplicate values from the same incoming basic block is
-/// required.
-/// \return The update will always succeed, but the return value indicates
-/// whether Mat was used for the update or not.
-static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
- if (auto PHI = dyn_cast<PHINode>(Inst)) {
- // Check if any previous operand of the PHI node has the same incoming basic
- // block. This is a very odd case that happens when the incoming basic block
- // has a switch statement. In this case use the same value as the previous
- // operand(s), otherwise we will fail verification due to different values.
- // The values are actually the same, but the variable names are different
- // and the verifier doesn't like that.
- BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
- for (unsigned i = 0; i < Idx; ++i) {
- if (PHI->getIncomingBlock(i) == IncomingBB) {
- Value *IncomingVal = PHI->getIncomingValue(i);
- Inst->setOperand(Idx, IncomingVal);
- return false;
- }
- }
- }
-
- Inst->setOperand(Idx, Mat);
- return true;
-}
-
-/// Emit materialization code for all rebased constants and update their
-/// users.
-void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
- Constant *Offset,
- Type *Ty,
- const ConstantUser &ConstUser) {
- Instruction *Mat = Base;
-
- // The same offset can be dereferenced to different types in nested struct.
- if (!Offset && Ty && Ty != Base->getType())
- Offset = ConstantInt::get(Type::getInt32Ty(*Ctx), 0);
-
- if (Offset) {
- Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
- ConstUser.OpndIdx);
- if (Ty) {
- // Constant being rebased is a ConstantExpr.
- PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx,
- cast<PointerType>(Ty)->getAddressSpace());
- Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt);
- Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base,
- Offset, "mat_gep", InsertionPt);
- Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt);
- } else
- // Constant being rebased is a ConstantInt.
- Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
- "const_mat", InsertionPt);
-
- LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
- << " + " << *Offset << ") in BB "
- << Mat->getParent()->getName() << '\n'
- << *Mat << '\n');
- Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
- }
- Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
-
- // Visit constant integer.
- if (isa<ConstantInt>(Opnd)) {
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
- Mat->eraseFromParent();
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-
- // Visit cast instruction.
- if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
- assert(CastInst->isCast() && "Expected a cast instruction!");
- // Check if we already have visited this cast instruction before to avoid
- // unnecessary cloning.
- Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
- if (!ClonedCastInst) {
- ClonedCastInst = CastInst->clone();
- ClonedCastInst->setOperand(0, Mat);
- ClonedCastInst->insertAfter(CastInst);
- // Use the same debug location as the original cast instruction.
- ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
- LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
- << "To : " << *ClonedCastInst << '\n');
- }
-
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-
- // Visit constant expression.
- if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
- if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
- // Operand is a ConstantGEP, replace it.
- updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
- return;
- }
-
- // Aside from constant GEPs, only constant cast expressions are collected.
- assert(ConstExpr->isCast() && "ConstExpr should be a cast");
- Instruction *ConstExprInst = ConstExpr->getAsInstruction();
- ConstExprInst->setOperand(0, Mat);
- ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
- ConstUser.OpndIdx));
-
- // Use the same debug location as the instruction we are about to update.
- ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
- << "From : " << *ConstExpr << '\n');
- LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
- if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
- ConstExprInst->eraseFromParent();
- if (Offset)
- Mat->eraseFromParent();
- }
- LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
- return;
- }
-}
-
-/// Hoist and hide the base constant behind a bitcast and emit
-/// materialization code for derived constants.
-bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
- bool MadeChange = false;
- SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
- BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
- for (auto const &ConstInfo : ConstInfoVec) {
- SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo);
- // We can have an empty set if the function contains unreachable blocks.
- if (IPSet.empty())
- continue;
-
- unsigned UsesNum = 0;
- unsigned ReBasesNum = 0;
- unsigned NotRebasedNum = 0;
- for (Instruction *IP : IPSet) {
- // First, collect constants depending on this IP of the base.
- unsigned Uses = 0;
- using RebasedUse = std::tuple<Constant *, Type *, ConstantUser>;
- SmallVector<RebasedUse, 4> ToBeRebased;
- for (auto const &RCI : ConstInfo.RebasedConstants) {
- for (auto const &U : RCI.Uses) {
- Uses++;
- BasicBlock *OrigMatInsertBB =
- findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
- // If Base constant is to be inserted in multiple places,
- // generate rebase for U using the Base dominating U.
- if (IPSet.size() == 1 ||
- DT->dominates(IP->getParent(), OrigMatInsertBB))
- ToBeRebased.push_back(RebasedUse(RCI.Offset, RCI.Ty, U));
- }
- }
- UsesNum = Uses;
-
- // If only a few constants depend on this IP of the base, skip rebasing,
- // assuming the base and the rebased have the same materialization cost.
- if (ToBeRebased.size() < MinNumOfDependentToRebase) {
- NotRebasedNum += ToBeRebased.size();
- continue;
- }
-
- // Emit an instance of the base at this IP.
- Instruction *Base = nullptr;
- // Hoist and hide the base constant behind a bitcast.
- if (ConstInfo.BaseExpr) {
- assert(BaseGV && "A base constant expression must have a base GV");
- Type *Ty = ConstInfo.BaseExpr->getType();
- Base = new BitCastInst(ConstInfo.BaseExpr, Ty, "const", IP);
- } else {
- IntegerType *Ty = ConstInfo.BaseInt->getType();
- Base = new BitCastInst(ConstInfo.BaseInt, Ty, "const", IP);
- }
-
- Base->setDebugLoc(IP->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseInt
- << ") to BB " << IP->getParent()->getName() << '\n'
- << *Base << '\n');
-
- // Emit materialization code for rebased constants depending on this IP.
- for (auto const &R : ToBeRebased) {
- Constant *Off = std::get<0>(R);
- Type *Ty = std::get<1>(R);
- ConstantUser U = std::get<2>(R);
- emitBaseConstants(Base, Off, Ty, U);
- ReBasesNum++;
- // Use the same debug location as the last user of the constant.
- Base->setDebugLoc(DILocation::getMergedLocation(
- Base->getDebugLoc(), U.Inst->getDebugLoc()));
- }
- assert(!Base->use_empty() && "The use list is empty!?");
- assert(isa<Instruction>(Base->user_back()) &&
- "All uses should be instructions.");
- }
- (void)UsesNum;
- (void)ReBasesNum;
- (void)NotRebasedNum;
- // Expect all uses are rebased after rebase is done.
- assert(UsesNum == (ReBasesNum + NotRebasedNum) &&
- "Not all uses are rebased");
-
- NumConstantsHoisted++;
-
- // Base constant is also included in ConstInfo.RebasedConstants, so
- // deduct 1 from ConstInfo.RebasedConstants.size().
- NumConstantsRebased += ConstInfo.RebasedConstants.size() - 1;
-
- MadeChange = true;
- }
- return MadeChange;
-}
-
-/// Check all cast instructions we made a copy of and remove them if they
-/// have no more users.
-void ConstantHoistingPass::deleteDeadCastInst() const {
- for (auto const &I : ClonedCastMap)
- if (I.first->use_empty())
- I.first->eraseFromParent();
-}
-
-/// Optimize expensive integer constants in the given function.
-bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
- DominatorTree &DT, BlockFrequencyInfo *BFI,
- BasicBlock &Entry, ProfileSummaryInfo *PSI) {
- this->TTI = &TTI;
- this->DT = &DT;
- this->BFI = BFI;
- this->DL = &Fn.getParent()->getDataLayout();
- this->Ctx = &Fn.getContext();
- this->Entry = &Entry;
- this->PSI = PSI;
- // Collect all constant candidates.
- collectConstantCandidates(Fn);
-
- // Combine constants that can be easily materialized with an add from a common
- // base constant.
- if (!ConstIntCandVec.empty())
- findBaseConstants(nullptr);
+ ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
+ ConstCandMapType::iterator Itr;
+ bool Inserted;
+ ConstPtrUnionType Cand = ConstExpr;
+ std::tie(Itr, Inserted) = ConstCandMap.insert(std::make_pair(Cand, 0));
+ if (Inserted) {
+ ExprCandVec.push_back(ConstantCandidate(
+ ConstantInt::get(Type::getInt32Ty(*Ctx), Offset.getLimitedValue()),
+ ConstExpr));
+ Itr->second = ExprCandVec.size() - 1;
+ }
+ ExprCandVec[Itr->second].addUser(Inst, Idx, Cost);
+}
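The Offset computed above is simply the byte distance of the constant GEP from its base global. A standalone arithmetic sketch (plain C++; the aggregate type and indices are hypothetical, and no padding is assumed) for a GEP with indices 0, 1, 2 into a global of type { i32, [4 x i32] }:

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t Int32Size = 4;  // size of an i32 in bytes
    uint64_t Offset = 0;
    // First index (0): no displacement from the global itself.
    Offset += Int32Size;           // second index (1): field 1 starts after the leading i32
    Offset += 2 * Int32Size;       // third index (2): element 2 of the i32 array
    // Total: 12 bytes. The pass records this offset against the base global
    // and can later rematerialize the address as <Base + 12>.
    std::printf("constant byte offset: %llu\n", (unsigned long long)Offset);
    return 0;
  }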
+
+/// Check the operand for instruction Inst at index Idx.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst, unsigned Idx) {
+ Value *Opnd = Inst->getOperand(Idx);
+
+ // Visit constant integers.
+ if (auto ConstInt = dyn_cast<ConstantInt>(Opnd)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+
+ // Visit cast instructions that have constant integers.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ // Only visit cast instructions, which have been skipped. All other
+ // instructions should have already been visited.
+ if (!CastInst->isCast())
+ return;
+
+ if (auto *ConstInt = dyn_cast<ConstantInt>(CastInst->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the cast instruction.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+
+ // Visit constant expressions that have constant integers.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ // Handle constant gep expressions.
+ if (ConstHoistGEP && ConstExpr->isGEPWithNoNotionalOverIndexing())
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstExpr);
+
+ // Only visit constant cast expressions.
+ if (!ConstExpr->isCast())
+ return;
+
+ if (auto ConstInt = dyn_cast<ConstantInt>(ConstExpr->getOperand(0))) {
+ // Pretend the constant is directly used by the instruction and ignore
+ // the constant expression.
+ collectConstantCandidates(ConstCandMap, Inst, Idx, ConstInt);
+ return;
+ }
+ }
+}
+
+/// Scan the instruction for expensive integer constants and record them
+/// in the constant candidate vector.
+void ConstantHoistingPass::collectConstantCandidates(
+ ConstCandMapType &ConstCandMap, Instruction *Inst) {
+ // Skip all cast instructions. They are visited indirectly later on.
+ if (Inst->isCast())
+ return;
+
+ // Scan all operands.
+ for (unsigned Idx = 0, E = Inst->getNumOperands(); Idx != E; ++Idx) {
+ // The cost of materializing the constants (defined in
+ // `TargetTransformInfo::getIntImmCostInst`) for instructions which only
+ // take constant variables is lower than `TargetTransformInfo::TCC_Basic`.
+ // So it's safe for us to collect constant candidates from all
+ // IntrinsicInsts.
+ if (canReplaceOperandWithVariable(Inst, Idx)) {
+ collectConstantCandidates(ConstCandMap, Inst, Idx);
+ }
+ } // end of for all operands
+}
+
+/// Collect all integer constants in the function that cannot be folded
+/// into an instruction itself.
+void ConstantHoistingPass::collectConstantCandidates(Function &Fn) {
+ ConstCandMapType ConstCandMap;
+ for (BasicBlock &BB : Fn) {
+ // Ignore unreachable basic blocks.
+ if (!DT->isReachableFromEntry(&BB))
+ continue;
+ for (Instruction &Inst : BB)
+ collectConstantCandidates(ConstCandMap, &Inst);
+ }
+}
+
+// This helper function is necessary to deal with values that have different
+// bit widths (APInt Operator- does not like that). If the value cannot be
+// represented in uint64 we return an "empty" APInt. This is then interpreted
+// as the value is not in range.
+static Optional<APInt> calculateOffsetDiff(const APInt &V1, const APInt &V2) {
+ Optional<APInt> Res = None;
+ unsigned BW = V1.getBitWidth() > V2.getBitWidth() ?
+ V1.getBitWidth() : V2.getBitWidth();
+ uint64_t LimVal1 = V1.getLimitedValue();
+ uint64_t LimVal2 = V2.getLimitedValue();
+
+ if (LimVal1 == ~0ULL || LimVal2 == ~0ULL)
+ return Res;
+
+ uint64_t Diff = LimVal1 - LimVal2;
+ return APInt(BW, Diff, true);
+}
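As a hypothetical usage of the helper above (it is file-static, so the caller below is imagined to live in this same translation unit; APInt.h, Optional.h and raw_ostream.h are already included):

  // Illustrative only: shows the widening to the larger bit width and the
  // None sentinel for values that do not fit in 64 bits.
  static void offsetDiffExample() {
    llvm::Optional<llvm::APInt> D =
        calculateOffsetDiff(llvm::APInt(32, 42), llvm::APInt(64, 12));
    if (D)
      llvm::errs() << "diff = " << *D << "\n"; // prints: diff = 30
    // If either input saturates getLimitedValue() (i.e. needs more than 64
    // bits), None comes back and the caller treats the pair as out of range.
  }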
+
+// From a list of constants, one needs to be picked as the base, and the other
+// constants will be transformed into offsets from that base constant. The
+// question is which one is the best pick. For example, consider these constants
+// and their number of uses:
+//
+// Constants| 2 | 4 | 12 | 42 |
+// NumUses | 3 | 2 | 8 | 7 |
+//
+// Selecting constant 12 because it has the most uses will generate negative
+// offsets for constants 2 and 4 (i.e. -10 and -8 respectively). If negative
+// offsets lead to less optimal code generation, then there might be better
+// solutions. Suppose immediates in the range of 0..35 are most optimally
+// supported by the architecture, then selecting constant 2 is most optimal
+// because this will generate offsets: 0, 2, 10, 40. Offsets 0, 2 and 10 are in
+// range 0..35, and thus 3 + 2 + 8 = 13 uses are in range. Selecting 12 would
+// have only 8 uses in range, so choosing 2 as a base is more optimal. Thus, in
+// selecting the base constant the range of the offsets is a very important
+// factor, too, that we take into account here. This algorithm calculates a
+// total cost for selecting a constant as the base and subtracts the costs of
+// immediates that are out of range. It has quadratic complexity, so we only
+// call this function when we're optimising for size and there are fewer than
+// 100 constants; otherwise we fall back to the straightforward algorithm,
+// which does not do all the offset calculations.
+unsigned
+ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
+ ConstCandVecType::iterator E,
+ ConstCandVecType::iterator &MaxCostItr) {
+ unsigned NumUses = 0;
+
+ bool OptForSize = Entry->getParent()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (!OptForSize || std::distance(S,E) > 100) {
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ NumUses += ConstCand->Uses.size();
+ if (ConstCand->CumulativeCost > MaxCostItr->CumulativeCost)
+ MaxCostItr = ConstCand;
+ }
+ return NumUses;
+ }
+
+ LLVM_DEBUG(dbgs() << "== Maximize constants in range ==\n");
+ int MaxCost = -1;
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ auto Value = ConstCand->ConstInt->getValue();
+ Type *Ty = ConstCand->ConstInt->getType();
+ int Cost = 0;
+ NumUses += ConstCand->Uses.size();
+ LLVM_DEBUG(dbgs() << "= Constant: " << ConstCand->ConstInt->getValue()
+ << "\n");
+
+ for (auto User : ConstCand->Uses) {
+ unsigned Opcode = User.Inst->getOpcode();
+ unsigned OpndIdx = User.OpndIdx;
+ Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
+ TargetTransformInfo::TCK_SizeAndLatency);
+ LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
+
+ for (auto C2 = S; C2 != E; ++C2) {
+ Optional<APInt> Diff = calculateOffsetDiff(
+ C2->ConstInt->getValue(),
+ ConstCand->ConstInt->getValue());
+ if (Diff) {
+ const int ImmCosts =
+ TTI->getIntImmCodeSizeCost(Opcode, OpndIdx, Diff.getValue(), Ty);
+ Cost -= ImmCosts;
+ LLVM_DEBUG(dbgs() << "Offset " << Diff.getValue() << " "
+ << "has penalty: " << ImmCosts << "\n"
+ << "Adjusted cost: " << Cost << "\n");
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Cumulative cost: " << Cost << "\n");
+ if (Cost > MaxCost) {
+ MaxCost = Cost;
+ MaxCostItr = ConstCand;
+ LLVM_DEBUG(dbgs() << "New candidate: " << MaxCostItr->ConstInt->getValue()
+ << "\n");
+ }
+ }
+ return NumUses;
+}
+
+/// Find the base constant within the given range and rebase all other
+/// constants with respect to the base constant.
+void ConstantHoistingPass::findAndMakeBaseConstant(
+ ConstCandVecType::iterator S, ConstCandVecType::iterator E,
+ SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec) {
+ auto MaxCostItr = S;
+ unsigned NumUses = maximizeConstantsInRange(S, E, MaxCostItr);
+
+ // Don't hoist constants that have only one use.
+ if (NumUses <= 1)
+ return;
+
+ ConstantInt *ConstInt = MaxCostItr->ConstInt;
+ ConstantExpr *ConstExpr = MaxCostItr->ConstExpr;
+ ConstantInfo ConstInfo;
+ ConstInfo.BaseInt = ConstInt;
+ ConstInfo.BaseExpr = ConstExpr;
+ Type *Ty = ConstInt->getType();
+
+ // Rebase the constants with respect to the base constant.
+ for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
+ APInt Diff = ConstCand->ConstInt->getValue() - ConstInt->getValue();
+ Constant *Offset = Diff == 0 ? nullptr : ConstantInt::get(Ty, Diff);
+ Type *ConstTy =
+ ConstCand->ConstExpr ? ConstCand->ConstExpr->getType() : nullptr;
+ ConstInfo.RebasedConstants.push_back(
+ RebasedConstantInfo(std::move(ConstCand->Uses), Offset, ConstTy));
+ }
+ ConstInfoVec.push_back(std::move(ConstInfo));
+}
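Numerically, the rebasing loop above only records a difference against the chosen base. A standalone sketch of that bookkeeping (plain C++; the constants are made up):

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    // One expensive base plus two dependents that are a small add away.
    const uint64_t Base = 0xDEADBEEF00000000ULL;
    const std::vector<uint64_t> Constants = {0xDEADBEEF00000000ULL,
                                             0xDEADBEEF00000008ULL,
                                             0xDEADBEEF00000010ULL};
    for (uint64_t C : Constants) {
      uint64_t Offset = C - Base; // 0 means "this use gets the base itself"
      std::printf("0x%016llx = base + %llu\n", (unsigned long long)C,
                  (unsigned long long)Offset);
    }
    return 0;
  }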
+
+/// Finds and combines constant candidates that can be easily
+/// rematerialized with an add from a common base constant.
+void ConstantHoistingPass::findBaseConstants(GlobalVariable *BaseGV) {
+ // If BaseGV is nullptr, find base among candidate constant integers;
+ // Otherwise find base among constant GEPs that share the same BaseGV.
+ ConstCandVecType &ConstCandVec = BaseGV ?
+ ConstGEPCandMap[BaseGV] : ConstIntCandVec;
+ ConstInfoVecType &ConstInfoVec = BaseGV ?
+ ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
+
+ // Sort the constants by value and type. This invalidates the mapping!
+ llvm::stable_sort(ConstCandVec, [](const ConstantCandidate &LHS,
+ const ConstantCandidate &RHS) {
+ if (LHS.ConstInt->getType() != RHS.ConstInt->getType())
+ return LHS.ConstInt->getType()->getBitWidth() <
+ RHS.ConstInt->getType()->getBitWidth();
+ return LHS.ConstInt->getValue().ult(RHS.ConstInt->getValue());
+ });
+
+ // Simple linear scan through the sorted constant candidate vector for viable
+ // merge candidates.
+ auto MinValItr = ConstCandVec.begin();
+ for (auto CC = std::next(ConstCandVec.begin()), E = ConstCandVec.end();
+ CC != E; ++CC) {
+ if (MinValItr->ConstInt->getType() == CC->ConstInt->getType()) {
+ Type *MemUseValTy = nullptr;
+ for (auto &U : CC->Uses) {
+ auto *UI = U.Inst;
+ if (LoadInst *LI = dyn_cast<LoadInst>(UI)) {
+ MemUseValTy = LI->getType();
+ break;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(UI)) {
+ // Make sure the constant is used as pointer operand of the StoreInst.
+ if (SI->getPointerOperand() == SI->getOperand(U.OpndIdx)) {
+ MemUseValTy = SI->getValueOperand()->getType();
+ break;
+ }
+ }
+ }
+
+ // Check if the constant is in range of an add with immediate.
+ APInt Diff = CC->ConstInt->getValue() - MinValItr->ConstInt->getValue();
+ if ((Diff.getBitWidth() <= 64) &&
+ TTI->isLegalAddImmediate(Diff.getSExtValue()) &&
+ // Check if Diff can be used as offset in addressing mode of the user
+ // memory instruction.
+ (!MemUseValTy || TTI->isLegalAddressingMode(MemUseValTy,
+ /*BaseGV*/nullptr, /*BaseOffset*/Diff.getSExtValue(),
+ /*HasBaseReg*/true, /*Scale*/0)))
+ continue;
+ }
+ // We now either have a different constant type or the constant is no longer
+ // in range of an add with immediate.
+ findAndMakeBaseConstant(MinValItr, CC, ConstInfoVec);
+ // Start a new base constant search.
+ MinValItr = CC;
+ }
+ // Finalize the last base constant search.
+ findAndMakeBaseConstant(MinValItr, ConstCandVec.end(), ConstInfoVec);
+}
+
+/// Updates the operand at Idx in instruction Inst with the result of
+/// instruction Mat. If the instruction is a PHI node then special
+/// handling for duplicate values from the same incoming basic block is
+/// required.
+/// \return The update will always succeed, but the return value indicates
+/// whether Mat was used for the update or not.
+static bool updateOperand(Instruction *Inst, unsigned Idx, Instruction *Mat) {
+ if (auto PHI = dyn_cast<PHINode>(Inst)) {
+ // Check if any previous operand of the PHI node has the same incoming basic
+ // block. This is a very odd case that happens when the incoming basic block
+ // has a switch statement. In this case use the same value as the previous
+ // operand(s), otherwise we will fail verification due to different values.
+ // The values are actually the same, but the variable names are different
+ // and the verifier doesn't like that.
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(Idx);
+ for (unsigned i = 0; i < Idx; ++i) {
+ if (PHI->getIncomingBlock(i) == IncomingBB) {
+ Value *IncomingVal = PHI->getIncomingValue(i);
+ Inst->setOperand(Idx, IncomingVal);
+ return false;
+ }
+ }
+ }
+
+ Inst->setOperand(Idx, Mat);
+ return true;
+}
+
+/// Emit materialization code for all rebased constants and update their
+/// users.
+void ConstantHoistingPass::emitBaseConstants(Instruction *Base,
+ Constant *Offset,
+ Type *Ty,
+ const ConstantUser &ConstUser) {
+ Instruction *Mat = Base;
+
+ // The same offset can be dereferenced to different types in nested struct.
+ if (!Offset && Ty && Ty != Base->getType())
+ Offset = ConstantInt::get(Type::getInt32Ty(*Ctx), 0);
+
+ if (Offset) {
+ Instruction *InsertionPt = findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx);
+ if (Ty) {
+ // Constant being rebased is a ConstantExpr.
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(*Ctx,
+ cast<PointerType>(Ty)->getAddressSpace());
+ Base = new BitCastInst(Base, Int8PtrTy, "base_bitcast", InsertionPt);
+ Mat = GetElementPtrInst::Create(Int8PtrTy->getElementType(), Base,
+ Offset, "mat_gep", InsertionPt);
+ Mat = new BitCastInst(Mat, Ty, "mat_bitcast", InsertionPt);
+ } else
+ // Constant being rebased is a ConstantInt.
+ Mat = BinaryOperator::Create(Instruction::Add, Base, Offset,
+ "const_mat", InsertionPt);
+
+ LLVM_DEBUG(dbgs() << "Materialize constant (" << *Base->getOperand(0)
+ << " + " << *Offset << ") in BB "
+ << Mat->getParent()->getName() << '\n'
+ << *Mat << '\n');
+ Mat->setDebugLoc(ConstUser.Inst->getDebugLoc());
+ }
+ Value *Opnd = ConstUser.Inst->getOperand(ConstUser.OpndIdx);
+
+ // Visit constant integer.
+ if (isa<ConstantInt>(Opnd)) {
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat) && Offset)
+ Mat->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit cast instruction.
+ if (auto CastInst = dyn_cast<Instruction>(Opnd)) {
+ assert(CastInst->isCast() && "Expected a cast instruction!");
+ // Check if we already have visited this cast instruction before to avoid
+ // unnecessary cloning.
+ Instruction *&ClonedCastInst = ClonedCastMap[CastInst];
+ if (!ClonedCastInst) {
+ ClonedCastInst = CastInst->clone();
+ ClonedCastInst->setOperand(0, Mat);
+ ClonedCastInst->insertAfter(CastInst);
+ // Use the same debug location as the original cast instruction.
+ ClonedCastInst->setDebugLoc(CastInst->getDebugLoc());
+ LLVM_DEBUG(dbgs() << "Clone instruction: " << *CastInst << '\n'
+ << "To : " << *ClonedCastInst << '\n');
+ }
+
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ClonedCastInst);
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+
+ // Visit constant expression.
+ if (auto ConstExpr = dyn_cast<ConstantExpr>(Opnd)) {
+ if (ConstExpr->isGEPWithNoNotionalOverIndexing()) {
+ // Operand is a ConstantGEP, replace it.
+ updateOperand(ConstUser.Inst, ConstUser.OpndIdx, Mat);
+ return;
+ }
+
+ // Aside from constant GEPs, only constant cast expressions are collected.
+ assert(ConstExpr->isCast() && "ConstExpr should be a cast");
+ Instruction *ConstExprInst = ConstExpr->getAsInstruction();
+ ConstExprInst->setOperand(0, Mat);
+ ConstExprInst->insertBefore(findMatInsertPt(ConstUser.Inst,
+ ConstUser.OpndIdx));
+
+ // Use the same debug location as the instruction we are about to update.
+ ConstExprInst->setDebugLoc(ConstUser.Inst->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Create instruction: " << *ConstExprInst << '\n'
+ << "From : " << *ConstExpr << '\n');
+ LLVM_DEBUG(dbgs() << "Update: " << *ConstUser.Inst << '\n');
+ if (!updateOperand(ConstUser.Inst, ConstUser.OpndIdx, ConstExprInst)) {
+ ConstExprInst->eraseFromParent();
+ if (Offset)
+ Mat->eraseFromParent();
+ }
+ LLVM_DEBUG(dbgs() << "To : " << *ConstUser.Inst << '\n');
+ return;
+ }
+}
+
+/// Hoist and hide the base constant behind a bitcast and emit
+/// materialization code for derived constants.
+bool ConstantHoistingPass::emitBaseConstants(GlobalVariable *BaseGV) {
+ bool MadeChange = false;
+ SmallVectorImpl<consthoist::ConstantInfo> &ConstInfoVec =
+ BaseGV ? ConstGEPInfoMap[BaseGV] : ConstIntInfoVec;
+ for (auto const &ConstInfo : ConstInfoVec) {
+ SetVector<Instruction *> IPSet = findConstantInsertionPoint(ConstInfo);
+ // We can have an empty set if the function contains unreachable blocks.
+ if (IPSet.empty())
+ continue;
+
+ unsigned UsesNum = 0;
+ unsigned ReBasesNum = 0;
+ unsigned NotRebasedNum = 0;
+ for (Instruction *IP : IPSet) {
+ // First, collect constants depending on this IP of the base.
+ unsigned Uses = 0;
+ using RebasedUse = std::tuple<Constant *, Type *, ConstantUser>;
+ SmallVector<RebasedUse, 4> ToBeRebased;
+ for (auto const &RCI : ConstInfo.RebasedConstants) {
+ for (auto const &U : RCI.Uses) {
+ Uses++;
+ BasicBlock *OrigMatInsertBB =
+ findMatInsertPt(U.Inst, U.OpndIdx)->getParent();
+ // If Base constant is to be inserted in multiple places,
+ // generate rebase for U using the Base dominating U.
+ if (IPSet.size() == 1 ||
+ DT->dominates(IP->getParent(), OrigMatInsertBB))
+ ToBeRebased.push_back(RebasedUse(RCI.Offset, RCI.Ty, U));
+ }
+ }
+ UsesNum = Uses;
+
+ // If only a few constants depend on this IP of the base, skip rebasing,
+ // assuming the base and the rebased have the same materialization cost.
+ if (ToBeRebased.size() < MinNumOfDependentToRebase) {
+ NotRebasedNum += ToBeRebased.size();
+ continue;
+ }
+
+ // Emit an instance of the base at this IP.
+ Instruction *Base = nullptr;
+ // Hoist and hide the base constant behind a bitcast.
+ if (ConstInfo.BaseExpr) {
+ assert(BaseGV && "A base constant expression must have a base GV");
+ Type *Ty = ConstInfo.BaseExpr->getType();
+ Base = new BitCastInst(ConstInfo.BaseExpr, Ty, "const", IP);
+ } else {
+ IntegerType *Ty = ConstInfo.BaseInt->getType();
+ Base = new BitCastInst(ConstInfo.BaseInt, Ty, "const", IP);
+ }
+
+ Base->setDebugLoc(IP->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Hoist constant (" << *ConstInfo.BaseInt
+ << ") to BB " << IP->getParent()->getName() << '\n'
+ << *Base << '\n');
+
+ // Emit materialization code for rebased constants depending on this IP.
+ for (auto const &R : ToBeRebased) {
+ Constant *Off = std::get<0>(R);
+ Type *Ty = std::get<1>(R);
+ ConstantUser U = std::get<2>(R);
+ emitBaseConstants(Base, Off, Ty, U);
+ ReBasesNum++;
+ // Use the same debug location as the last user of the constant.
+ Base->setDebugLoc(DILocation::getMergedLocation(
+ Base->getDebugLoc(), U.Inst->getDebugLoc()));
+ }
+ assert(!Base->use_empty() && "The use list is empty!?");
+ assert(isa<Instruction>(Base->user_back()) &&
+ "All uses should be instructions.");
+ }
+ (void)UsesNum;
+ (void)ReBasesNum;
+ (void)NotRebasedNum;
+ // Expect all uses are rebased after rebase is done.
+ assert(UsesNum == (ReBasesNum + NotRebasedNum) &&
+ "Not all uses are rebased");
+
+ NumConstantsHoisted++;
+
+ // Base constant is also included in ConstInfo.RebasedConstants, so
+ // deduct 1 from ConstInfo.RebasedConstants.size().
+ NumConstantsRebased += ConstInfo.RebasedConstants.size() - 1;
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// Check all cast instructions we made a copy of and remove them if they
+/// have no more users.
+void ConstantHoistingPass::deleteDeadCastInst() const {
+ for (auto const &I : ClonedCastMap)
+ if (I.first->use_empty())
+ I.first->eraseFromParent();
+}
+
+/// Optimize expensive integer constants in the given function.
+bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
+ DominatorTree &DT, BlockFrequencyInfo *BFI,
+ BasicBlock &Entry, ProfileSummaryInfo *PSI) {
+ this->TTI = &TTI;
+ this->DT = &DT;
+ this->BFI = BFI;
+ this->DL = &Fn.getParent()->getDataLayout();
+ this->Ctx = &Fn.getContext();
+ this->Entry = &Entry;
+ this->PSI = PSI;
+ // Collect all constant candidates.
+ collectConstantCandidates(Fn);
+
+ // Combine constants that can be easily materialized with an add from a common
+ // base constant.
+ if (!ConstIntCandVec.empty())
+ findBaseConstants(nullptr);
for (const auto &MapEntry : ConstGEPCandMap)
- if (!MapEntry.second.empty())
- findBaseConstants(MapEntry.first);
-
- // Finally hoist the base constant and emit materialization code for dependent
- // constants.
- bool MadeChange = false;
- if (!ConstIntInfoVec.empty())
- MadeChange = emitBaseConstants(nullptr);
+ if (!MapEntry.second.empty())
+ findBaseConstants(MapEntry.first);
+
+ // Finally hoist the base constant and emit materialization code for dependent
+ // constants.
+ bool MadeChange = false;
+ if (!ConstIntInfoVec.empty())
+ MadeChange = emitBaseConstants(nullptr);
for (const auto &MapEntry : ConstGEPInfoMap)
- if (!MapEntry.second.empty())
- MadeChange |= emitBaseConstants(MapEntry.first);
-
-
- // Cleanup dead instructions.
- deleteDeadCastInst();
-
- cleanup();
-
- return MadeChange;
-}
-
-PreservedAnalyses ConstantHoistingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto BFI = ConstHoistWithBlockFrequency
- ? &AM.getResult<BlockFrequencyAnalysis>(F)
- : nullptr;
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+ if (!MapEntry.second.empty())
+ MadeChange |= emitBaseConstants(MapEntry.first);
+
+
+ // Cleanup dead instructions.
+ deleteDeadCastInst();
+
+ cleanup();
+
+ return MadeChange;
+}
+
+PreservedAnalyses ConstantHoistingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto BFI = ConstHoistWithBlockFrequency
+ ? &AM.getResult<BlockFrequencyAnalysis>(F)
+ : nullptr;
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 432c7efe57..b671d68031 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -1,563 +1,563 @@
-//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Correlated Value Propagation pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "correlated-value-propagation"
-
-STATISTIC(NumPhis, "Number of phis propagated");
-STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
-STATISTIC(NumSelects, "Number of selects propagated");
-STATISTIC(NumMemAccess, "Number of memory access targets propagated");
-STATISTIC(NumCmps, "Number of comparisons propagated");
-STATISTIC(NumReturns, "Number of return values propagated");
-STATISTIC(NumDeadCases, "Number of switch cases removed");
+//===- CorrelatedValuePropagation.cpp - Propagate CFG-derived info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Correlated Value Propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/CorrelatedValuePropagation.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "correlated-value-propagation"
+
+STATISTIC(NumPhis, "Number of phis propagated");
+STATISTIC(NumPhiCommon, "Number of phis deleted via common incoming value");
+STATISTIC(NumSelects, "Number of selects propagated");
+STATISTIC(NumMemAccess, "Number of memory access targets propagated");
+STATISTIC(NumCmps, "Number of comparisons propagated");
+STATISTIC(NumReturns, "Number of return values propagated");
+STATISTIC(NumDeadCases, "Number of switch cases removed");
STATISTIC(NumSDivSRemsNarrowed,
"Number of sdivs/srems whose width was decreased");
-STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
+STATISTIC(NumSDivs, "Number of sdiv converted to udiv");
STATISTIC(NumUDivURemsNarrowed,
"Number of udivs/urems whose width was decreased");
-STATISTIC(NumAShrs, "Number of ashr converted to lshr");
-STATISTIC(NumSRems, "Number of srem converted to urem");
-STATISTIC(NumSExt, "Number of sext converted to zext");
-STATISTIC(NumAnd, "Number of ands removed");
-STATISTIC(NumNW, "Number of no-wrap deductions");
-STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
-STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions");
-STATISTIC(NumAddNW, "Number of no-wrap deductions for add");
-STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add");
-STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add");
-STATISTIC(NumSubNW, "Number of no-wrap deductions for sub");
-STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub");
-STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub");
-STATISTIC(NumMulNW, "Number of no-wrap deductions for mul");
-STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul");
-STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul");
-STATISTIC(NumShlNW, "Number of no-wrap deductions for shl");
-STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl");
-STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl");
-STATISTIC(NumOverflows, "Number of overflow checks removed");
-STATISTIC(NumSaturating,
- "Number of saturating arithmetics converted to normal arithmetics");
-
-static cl::opt<bool> DontAddNoWrapFlags("cvp-dont-add-nowrap-flags", cl::init(false));
-
-namespace {
-
- class CorrelatedValuePropagation : public FunctionPass {
- public:
- static char ID;
-
- CorrelatedValuePropagation(): FunctionPass(ID) {
- initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
- }
- };
-
-} // end anonymous namespace
-
-char CorrelatedValuePropagation::ID = 0;
-
-INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
- "Value Propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
-INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
- "Value Propagation", false, false)
-
-// Public interface to the Value Propagation pass
-Pass *llvm::createCorrelatedValuePropagationPass() {
- return new CorrelatedValuePropagation();
-}
-
-static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
- if (S->getType()->isVectorTy()) return false;
- if (isa<Constant>(S->getCondition())) return false;
-
+STATISTIC(NumAShrs, "Number of ashr converted to lshr");
+STATISTIC(NumSRems, "Number of srem converted to urem");
+STATISTIC(NumSExt, "Number of sext converted to zext");
+STATISTIC(NumAnd, "Number of ands removed");
+STATISTIC(NumNW, "Number of no-wrap deductions");
+STATISTIC(NumNSW, "Number of no-signed-wrap deductions");
+STATISTIC(NumNUW, "Number of no-unsigned-wrap deductions");
+STATISTIC(NumAddNW, "Number of no-wrap deductions for add");
+STATISTIC(NumAddNSW, "Number of no-signed-wrap deductions for add");
+STATISTIC(NumAddNUW, "Number of no-unsigned-wrap deductions for add");
+STATISTIC(NumSubNW, "Number of no-wrap deductions for sub");
+STATISTIC(NumSubNSW, "Number of no-signed-wrap deductions for sub");
+STATISTIC(NumSubNUW, "Number of no-unsigned-wrap deductions for sub");
+STATISTIC(NumMulNW, "Number of no-wrap deductions for mul");
+STATISTIC(NumMulNSW, "Number of no-signed-wrap deductions for mul");
+STATISTIC(NumMulNUW, "Number of no-unsigned-wrap deductions for mul");
+STATISTIC(NumShlNW, "Number of no-wrap deductions for shl");
+STATISTIC(NumShlNSW, "Number of no-signed-wrap deductions for shl");
+STATISTIC(NumShlNUW, "Number of no-unsigned-wrap deductions for shl");
+STATISTIC(NumOverflows, "Number of overflow checks removed");
+STATISTIC(NumSaturating,
+ "Number of saturating arithmetics converted to normal arithmetics");
+
+static cl::opt<bool> DontAddNoWrapFlags("cvp-dont-add-nowrap-flags", cl::init(false));
+
+namespace {
+
+ class CorrelatedValuePropagation : public FunctionPass {
+ public:
+ static char ID;
+
+ CorrelatedValuePropagation(): FunctionPass(ID) {
+ initializeCorrelatedValuePropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char CorrelatedValuePropagation::ID = 0;
+
+INITIALIZE_PASS_BEGIN(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(CorrelatedValuePropagation, "correlated-propagation",
+ "Value Propagation", false, false)
+
+// Public interface to the Value Propagation pass
+Pass *llvm::createCorrelatedValuePropagationPass() {
+ return new CorrelatedValuePropagation();
+}
+
+static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
+ if (S->getType()->isVectorTy()) return false;
+ if (isa<Constant>(S->getCondition())) return false;
+
Constant *C = LVI->getConstant(S->getCondition(), S);
- if (!C) return false;
-
- ConstantInt *CI = dyn_cast<ConstantInt>(C);
- if (!CI) return false;
-
- Value *ReplaceWith = CI->isOne() ? S->getTrueValue() : S->getFalseValue();
- S->replaceAllUsesWith(ReplaceWith);
- S->eraseFromParent();
-
- ++NumSelects;
-
- return true;
-}
-
-/// Try to simplify a phi with constant incoming values that match the edge
-/// values of a non-constant value on all other edges:
-/// bb0:
-/// %isnull = icmp eq i8* %x, null
-/// br i1 %isnull, label %bb2, label %bb1
-/// bb1:
-/// br label %bb2
-/// bb2:
-/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
-/// -->
-/// %r = %x
-static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
- DominatorTree *DT) {
- // Collect incoming constants and initialize possible common value.
- SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
- Value *CommonValue = nullptr;
- for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
- Value *Incoming = P->getIncomingValue(i);
- if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
- IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
- } else if (!CommonValue) {
- // The potential common value is initialized to the first non-constant.
- CommonValue = Incoming;
- } else if (Incoming != CommonValue) {
- // There can be only one non-constant common value.
- return false;
- }
- }
-
- if (!CommonValue || IncomingConstants.empty())
- return false;
-
- // The common value must be valid in all incoming blocks.
- BasicBlock *ToBB = P->getParent();
- if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
- if (!DT->dominates(CommonInst, ToBB))
- return false;
-
- // We have a phi with exactly 1 variable incoming value and 1 or more constant
- // incoming values. See if all constant incoming values can be mapped back to
- // the same incoming variable value.
- for (auto &IncomingConstant : IncomingConstants) {
- Constant *C = IncomingConstant.first;
- BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
- if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
- return false;
- }
-
- // All constant incoming values map to the same variable along the incoming
- // edges of the phi. The phi is unnecessary. However, we must drop all
- // poison-generating flags to ensure that no poison is propagated to the phi
- // location by performing this substitution.
- // Warning: If the underlying analysis changes, this may not be enough to
- // guarantee that poison is not propagated.
- // TODO: We may be able to re-infer flags by re-analyzing the instruction.
- if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
- CommonInst->dropPoisonGeneratingFlags();
- P->replaceAllUsesWith(CommonValue);
- P->eraseFromParent();
- ++NumPhiCommon;
- return true;
-}
-
-static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
- const SimplifyQuery &SQ) {
- bool Changed = false;
-
- BasicBlock *BB = P->getParent();
- for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
- Value *Incoming = P->getIncomingValue(i);
- if (isa<Constant>(Incoming)) continue;
-
- Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
-
- // Look if the incoming value is a select with a scalar condition for which
-    // LVI can tell us the value. In that case replace the incoming value with
- // the appropriate value of the select. This often allows us to remove the
- // select later.
- if (!V) {
- SelectInst *SI = dyn_cast<SelectInst>(Incoming);
- if (!SI) continue;
-
- Value *Condition = SI->getCondition();
- if (!Condition->getType()->isVectorTy()) {
- if (Constant *C = LVI->getConstantOnEdge(
- Condition, P->getIncomingBlock(i), BB, P)) {
- if (C->isOneValue()) {
- V = SI->getTrueValue();
- } else if (C->isZeroValue()) {
- V = SI->getFalseValue();
- }
- // Once LVI learns to handle vector types, we could also add support
- // for vector type constants that are not all zeroes or all ones.
- }
- }
-
- // Look if the select has a constant but LVI tells us that the incoming
- // value can never be that constant. In that case replace the incoming
- // value with the other value of the select. This often allows us to
- // remove the select later.
- if (!V) {
- Constant *C = dyn_cast<Constant>(SI->getFalseValue());
- if (!C) continue;
-
- if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
- P->getIncomingBlock(i), BB, P) !=
- LazyValueInfo::False)
- continue;
- V = SI->getTrueValue();
- }
-
- LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
- }
-
- P->setIncomingValue(i, V);
- Changed = true;
- }
-
- if (Value *V = SimplifyInstruction(P, SQ)) {
- P->replaceAllUsesWith(V);
- P->eraseFromParent();
- Changed = true;
- }
-
- if (!Changed)
- Changed = simplifyCommonValuePhi(P, LVI, DT);
-
- if (Changed)
- ++NumPhis;
-
- return Changed;
-}
-
-static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
- Value *Pointer = nullptr;
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- Pointer = L->getPointerOperand();
- else
- Pointer = cast<StoreInst>(I)->getPointerOperand();
-
- if (isa<Constant>(Pointer)) return false;
-
+ if (!C) return false;
+
+ ConstantInt *CI = dyn_cast<ConstantInt>(C);
+ if (!CI) return false;
+
+ Value *ReplaceWith = CI->isOne() ? S->getTrueValue() : S->getFalseValue();
+ S->replaceAllUsesWith(ReplaceWith);
+ S->eraseFromParent();
+
+ ++NumSelects;
+
+ return true;
+}
+
+/// Try to simplify a phi with constant incoming values that match the edge
+/// values of a non-constant value on all other edges:
+/// bb0:
+/// %isnull = icmp eq i8* %x, null
+/// br i1 %isnull, label %bb2, label %bb1
+/// bb1:
+/// br label %bb2
+/// bb2:
+/// %r = phi i8* [ %x, %bb1 ], [ null, %bb0 ]
+/// -->
+/// %r = %x
+static bool simplifyCommonValuePhi(PHINode *P, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ // Collect incoming constants and initialize possible common value.
+ SmallVector<std::pair<Constant *, unsigned>, 4> IncomingConstants;
+ Value *CommonValue = nullptr;
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i != e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (auto *IncomingConstant = dyn_cast<Constant>(Incoming)) {
+ IncomingConstants.push_back(std::make_pair(IncomingConstant, i));
+ } else if (!CommonValue) {
+ // The potential common value is initialized to the first non-constant.
+ CommonValue = Incoming;
+ } else if (Incoming != CommonValue) {
+ // There can be only one non-constant common value.
+ return false;
+ }
+ }
+
+ if (!CommonValue || IncomingConstants.empty())
+ return false;
+
+ // The common value must be valid in all incoming blocks.
+ BasicBlock *ToBB = P->getParent();
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ if (!DT->dominates(CommonInst, ToBB))
+ return false;
+
+ // We have a phi with exactly 1 variable incoming value and 1 or more constant
+ // incoming values. See if all constant incoming values can be mapped back to
+ // the same incoming variable value.
+ for (auto &IncomingConstant : IncomingConstants) {
+ Constant *C = IncomingConstant.first;
+ BasicBlock *IncomingBB = P->getIncomingBlock(IncomingConstant.second);
+ if (C != LVI->getConstantOnEdge(CommonValue, IncomingBB, ToBB, P))
+ return false;
+ }
+
+ // All constant incoming values map to the same variable along the incoming
+ // edges of the phi. The phi is unnecessary. However, we must drop all
+ // poison-generating flags to ensure that no poison is propagated to the phi
+ // location by performing this substitution.
+ // Warning: If the underlying analysis changes, this may not be enough to
+ // guarantee that poison is not propagated.
+ // TODO: We may be able to re-infer flags by re-analyzing the instruction.
+ if (auto *CommonInst = dyn_cast<Instruction>(CommonValue))
+ CommonInst->dropPoisonGeneratingFlags();
+ P->replaceAllUsesWith(CommonValue);
+ P->eraseFromParent();
+ ++NumPhiCommon;
+ return true;
+}
+
+static bool processPHI(PHINode *P, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
+ bool Changed = false;
+
+ BasicBlock *BB = P->getParent();
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ Value *Incoming = P->getIncomingValue(i);
+ if (isa<Constant>(Incoming)) continue;
+
+ Value *V = LVI->getConstantOnEdge(Incoming, P->getIncomingBlock(i), BB, P);
+
+ // Look if the incoming value is a select with a scalar condition for which
+    // LVI can tell us the value. In that case replace the incoming value with
+ // the appropriate value of the select. This often allows us to remove the
+ // select later.
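+    // For illustration (hypothetical IR): with
+    //   %inc = select i1 %c, i32 %t, i32 %f
+    // feeding this phi, if LVI proves %c is true on the incoming edge, the
+    // phi's incoming value is replaced with %t.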
+ if (!V) {
+ SelectInst *SI = dyn_cast<SelectInst>(Incoming);
+ if (!SI) continue;
+
+ Value *Condition = SI->getCondition();
+ if (!Condition->getType()->isVectorTy()) {
+ if (Constant *C = LVI->getConstantOnEdge(
+ Condition, P->getIncomingBlock(i), BB, P)) {
+ if (C->isOneValue()) {
+ V = SI->getTrueValue();
+ } else if (C->isZeroValue()) {
+ V = SI->getFalseValue();
+ }
+ // Once LVI learns to handle vector types, we could also add support
+ // for vector type constants that are not all zeroes or all ones.
+ }
+ }
+
+ // Look if the select has a constant but LVI tells us that the incoming
+ // value can never be that constant. In that case replace the incoming
+ // value with the other value of the select. This often allows us to
+ // remove the select later.
+ if (!V) {
+ Constant *C = dyn_cast<Constant>(SI->getFalseValue());
+ if (!C) continue;
+
+ if (LVI->getPredicateOnEdge(ICmpInst::ICMP_EQ, SI, C,
+ P->getIncomingBlock(i), BB, P) !=
+ LazyValueInfo::False)
+ continue;
+ V = SI->getTrueValue();
+ }
+
+ LLVM_DEBUG(dbgs() << "CVP: Threading PHI over " << *SI << '\n');
+ }
+
+ P->setIncomingValue(i, V);
+ Changed = true;
+ }
+
+ if (Value *V = SimplifyInstruction(P, SQ)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent();
+ Changed = true;
+ }
+
+ if (!Changed)
+ Changed = simplifyCommonValuePhi(P, LVI, DT);
+
+ if (Changed)
+ ++NumPhis;
+
+ return Changed;
+}
+
+static bool processMemAccess(Instruction *I, LazyValueInfo *LVI) {
+ Value *Pointer = nullptr;
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ Pointer = L->getPointerOperand();
+ else
+ Pointer = cast<StoreInst>(I)->getPointerOperand();
+
+ if (isa<Constant>(Pointer)) return false;
+
Constant *C = LVI->getConstant(Pointer, I);
- if (!C) return false;
-
- ++NumMemAccess;
- I->replaceUsesOfWith(Pointer, C);
- return true;
-}
-
-/// See if LazyValueInfo's ability to exploit edge conditions or range
-/// information is sufficient to prove this comparison. Even for local
-/// conditions, this can sometimes prove conditions instcombine can't by
-/// exploiting range information.
-static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
- Value *Op0 = Cmp->getOperand(0);
- auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
- if (!C)
- return false;
-
- LazyValueInfo::Tristate Result =
+ if (!C) return false;
+
+ ++NumMemAccess;
+ I->replaceUsesOfWith(Pointer, C);
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
+/// information is sufficient to prove this comparison. Even for local
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
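+/// For illustration (hypothetical IR): if a dominating branch established
+/// that %x lies in [0, 8) in this block, then
+///   %c = icmp ult i32 %x, 10
+/// is proven true by the range and is replaced with the constant i1 true.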
+static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
+ Value *Op0 = Cmp->getOperand(0);
+ auto *C = dyn_cast<Constant>(Cmp->getOperand(1));
+ if (!C)
+ return false;
+
+ LazyValueInfo::Tristate Result =
LVI->getPredicateAt(Cmp->getPredicate(), Op0, C, Cmp,
/*UseBlockValue=*/true);
- if (Result == LazyValueInfo::Unknown)
- return false;
-
- ++NumCmps;
- Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result);
- Cmp->replaceAllUsesWith(TorF);
- Cmp->eraseFromParent();
- return true;
-}
-
-/// Simplify a switch instruction by removing cases which can never fire. If the
-/// uselessness of a case could be determined locally then constant propagation
-/// would already have figured it out. Instead, walk the predecessors and
-/// statically evaluate cases based on information available on that edge. Cases
-/// that cannot fire no matter what the incoming edge can safely be removed. If
-/// a case fires on every incoming edge then the entire switch can be removed
-/// and replaced with a branch to the case destination.
-static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
- DominatorTree *DT) {
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
- Value *Cond = I->getCondition();
- BasicBlock *BB = I->getParent();
-
- // Analyse each switch case in turn.
- bool Changed = false;
- DenseMap<BasicBlock*, int> SuccessorsCount;
- for (auto *Succ : successors(BB))
- SuccessorsCount[Succ]++;
-
- { // Scope for SwitchInstProfUpdateWrapper. It must not live during
- // ConstantFoldTerminator() as the underlying SwitchInst can be changed.
- SwitchInstProfUpdateWrapper SI(*I);
-
- for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
- ConstantInt *Case = CI->getCaseValue();
+ if (Result == LazyValueInfo::Unknown)
+ return false;
+
+ ++NumCmps;
+ Constant *TorF = ConstantInt::get(Type::getInt1Ty(Cmp->getContext()), Result);
+ Cmp->replaceAllUsesWith(TorF);
+ Cmp->eraseFromParent();
+ return true;
+}
+
+/// Simplify a switch instruction by removing cases which can never fire. If the
+/// uselessness of a case could be determined locally then constant propagation
+/// would already have figured it out. Instead, walk the predecessors and
+/// statically evaluate cases based on information available on that edge. Cases
+/// that cannot fire no matter what the incoming edge can safely be removed. If
+/// a case fires on every incoming edge then the entire switch can be removed
+/// and replaced with a branch to the case destination.
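+/// For illustration (hypothetical IR): if every edge into the switch block
+/// proves %x != 2, then
+///   switch i32 %x, label %default [ i32 1, label %a
+///                                   i32 2, label %b ]
+/// has its `i32 2` case removed; if the edges instead prove %x == 1, the
+/// whole switch is folded into an unconditional branch to %a.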
+static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI,
+ DominatorTree *DT) {
+ DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ Value *Cond = I->getCondition();
+ BasicBlock *BB = I->getParent();
+
+ // Analyse each switch case in turn.
+ bool Changed = false;
+ DenseMap<BasicBlock*, int> SuccessorsCount;
+ for (auto *Succ : successors(BB))
+ SuccessorsCount[Succ]++;
+
+ { // Scope for SwitchInstProfUpdateWrapper. It must not live during
+ // ConstantFoldTerminator() as the underlying SwitchInst can be changed.
+ SwitchInstProfUpdateWrapper SI(*I);
+
+ for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+ ConstantInt *Case = CI->getCaseValue();
LazyValueInfo::Tristate State =
LVI->getPredicateAt(CmpInst::ICMP_EQ, Cond, Case, I,
/* UseBlockValue */ true);
-
- if (State == LazyValueInfo::False) {
- // This case never fires - remove it.
- BasicBlock *Succ = CI->getCaseSuccessor();
- Succ->removePredecessor(BB);
- CI = SI.removeCase(CI);
- CE = SI->case_end();
-
- // The condition can be modified by removePredecessor's PHI simplification
- // logic.
- Cond = SI->getCondition();
-
- ++NumDeadCases;
- Changed = true;
- if (--SuccessorsCount[Succ] == 0)
- DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}});
- continue;
- }
- if (State == LazyValueInfo::True) {
- // This case always fires. Arrange for the switch to be turned into an
- // unconditional branch by replacing the switch condition with the case
- // value.
- SI->setCondition(Case);
- NumDeadCases += SI->getNumCases();
- Changed = true;
- break;
- }
-
- // Increment the case iterator since we didn't delete it.
- ++CI;
- }
- }
-
- if (Changed)
- // If the switch has been simplified to the point where it can be replaced
- // by a branch then do so now.
- ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
- /*TLI = */ nullptr, &DTU);
- return Changed;
-}
-
-// See if we can prove that the given binary op intrinsic will not overflow.
-static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
+
+ if (State == LazyValueInfo::False) {
+ // This case never fires - remove it.
+ BasicBlock *Succ = CI->getCaseSuccessor();
+ Succ->removePredecessor(BB);
+ CI = SI.removeCase(CI);
+ CE = SI->case_end();
+
+ // The condition can be modified by removePredecessor's PHI simplification
+ // logic.
+ Cond = SI->getCondition();
+
+ ++NumDeadCases;
+ Changed = true;
+ if (--SuccessorsCount[Succ] == 0)
+ DTU.applyUpdatesPermissive({{DominatorTree::Delete, BB, Succ}});
+ continue;
+ }
+ if (State == LazyValueInfo::True) {
+ // This case always fires. Arrange for the switch to be turned into an
+ // unconditional branch by replacing the switch condition with the case
+ // value.
+ SI->setCondition(Case);
+ NumDeadCases += SI->getNumCases();
+ Changed = true;
+ break;
+ }
+
+ // Increment the case iterator since we didn't delete it.
+ ++CI;
+ }
+ }
+
+ if (Changed)
+ // If the switch has been simplified to the point where it can be replaced
+ // by a branch then do so now.
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions = */ false,
+ /*TLI = */ nullptr, &DTU);
+ return Changed;
+}
+
+// See if we can prove that the given binary op intrinsic will not overflow.
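+// For illustration (hypothetical ranges): for llvm.uadd.with.overflow.i8
+// with LHS range [0, 100) and RHS range [0, 100), the guaranteed no-wrap
+// region for an unsigned add with that RHS is [0, 157), which contains the
+// LHS range, so the intrinsic cannot overflow.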
+static bool willNotOverflow(BinaryOpIntrinsic *BO, LazyValueInfo *LVI) {
ConstantRange LRange = LVI->getConstantRange(BO->getLHS(), BO);
ConstantRange RRange = LVI->getConstantRange(BO->getRHS(), BO);
- ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
- BO->getBinaryOp(), RRange, BO->getNoWrapKind());
- return NWRegion.contains(LRange);
-}
-
-static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode,
- bool NewNSW, bool NewNUW) {
- Statistic *OpcNW, *OpcNSW, *OpcNUW;
- switch (Opcode) {
- case Instruction::Add:
- OpcNW = &NumAddNW;
- OpcNSW = &NumAddNSW;
- OpcNUW = &NumAddNUW;
- break;
- case Instruction::Sub:
- OpcNW = &NumSubNW;
- OpcNSW = &NumSubNSW;
- OpcNUW = &NumSubNUW;
- break;
- case Instruction::Mul:
- OpcNW = &NumMulNW;
- OpcNSW = &NumMulNSW;
- OpcNUW = &NumMulNUW;
- break;
- case Instruction::Shl:
- OpcNW = &NumShlNW;
- OpcNSW = &NumShlNSW;
- OpcNUW = &NumShlNUW;
- break;
- default:
- llvm_unreachable("Will not be called with other binops");
- }
-
- auto *Inst = dyn_cast<Instruction>(V);
- if (NewNSW) {
- ++NumNW;
- ++*OpcNW;
- ++NumNSW;
- ++*OpcNSW;
- if (Inst)
- Inst->setHasNoSignedWrap();
- }
- if (NewNUW) {
- ++NumNW;
- ++*OpcNW;
- ++NumNUW;
- ++*OpcNUW;
- if (Inst)
- Inst->setHasNoUnsignedWrap();
- }
-}
-
-static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
-
-// Rewrite this with.overflow intrinsic as non-overflowing.
-static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) {
- IRBuilder<> B(WO);
- Instruction::BinaryOps Opcode = WO->getBinaryOp();
- bool NSW = WO->isSigned();
- bool NUW = !WO->isSigned();
-
- Value *NewOp =
- B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName());
- setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW);
-
- StructType *ST = cast<StructType>(WO->getType());
- Constant *Struct = ConstantStruct::get(ST,
- { UndefValue::get(ST->getElementType(0)),
- ConstantInt::getFalse(ST->getElementType(1)) });
- Value *NewI = B.CreateInsertValue(Struct, NewOp, 0);
- WO->replaceAllUsesWith(NewI);
- WO->eraseFromParent();
- ++NumOverflows;
-
- // See if we can infer the other no-wrap too.
- if (auto *BO = dyn_cast<BinaryOperator>(NewOp))
- processBinOp(BO, LVI);
-}
-
-static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
- Instruction::BinaryOps Opcode = SI->getBinaryOp();
- bool NSW = SI->isSigned();
- bool NUW = !SI->isSigned();
- BinaryOperator *BinOp = BinaryOperator::Create(
- Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI);
- BinOp->setDebugLoc(SI->getDebugLoc());
- setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW);
-
- SI->replaceAllUsesWith(BinOp);
- SI->eraseFromParent();
- ++NumSaturating;
-
- // See if we can infer the other no-wrap too.
- if (auto *BO = dyn_cast<BinaryOperator>(BinOp))
- processBinOp(BO, LVI);
-}
-
-/// Infer nonnull attributes for the arguments at the specified callsite.
-static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
-
- if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
- if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
- processOverflowIntrinsic(WO, LVI);
- return true;
- }
- }
-
- if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
- if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
- processSaturatingInst(SI, LVI);
- return true;
- }
- }
-
+ ConstantRange NWRegion = ConstantRange::makeGuaranteedNoWrapRegion(
+ BO->getBinaryOp(), RRange, BO->getNoWrapKind());
+ return NWRegion.contains(LRange);
+}
+
+static void setDeducedOverflowingFlags(Value *V, Instruction::BinaryOps Opcode,
+ bool NewNSW, bool NewNUW) {
+ Statistic *OpcNW, *OpcNSW, *OpcNUW;
+ switch (Opcode) {
+ case Instruction::Add:
+ OpcNW = &NumAddNW;
+ OpcNSW = &NumAddNSW;
+ OpcNUW = &NumAddNUW;
+ break;
+ case Instruction::Sub:
+ OpcNW = &NumSubNW;
+ OpcNSW = &NumSubNSW;
+ OpcNUW = &NumSubNUW;
+ break;
+ case Instruction::Mul:
+ OpcNW = &NumMulNW;
+ OpcNSW = &NumMulNSW;
+ OpcNUW = &NumMulNUW;
+ break;
+ case Instruction::Shl:
+ OpcNW = &NumShlNW;
+ OpcNSW = &NumShlNSW;
+ OpcNUW = &NumShlNUW;
+ break;
+ default:
+ llvm_unreachable("Will not be called with other binops");
+ }
+
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (NewNSW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNSW;
+ ++*OpcNSW;
+ if (Inst)
+ Inst->setHasNoSignedWrap();
+ }
+ if (NewNUW) {
+ ++NumNW;
+ ++*OpcNW;
+ ++NumNUW;
+ ++*OpcNUW;
+ if (Inst)
+ Inst->setHasNoUnsignedWrap();
+ }
+}
+
+static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI);
+
+// Rewrite this with.overflow intrinsic as non-overflowing.
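+// For illustration (hypothetical IR): a call
+//   %s = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
+// that is proven not to overflow becomes, roughly,
+//   %sum = add nsw i32 %a, %b
+//   %s   = insertvalue { i32, i1 } { i32 undef, i1 false }, i32 %sum, 0
+// so the overflow bit is a constant false.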
+static void processOverflowIntrinsic(WithOverflowInst *WO, LazyValueInfo *LVI) {
+ IRBuilder<> B(WO);
+ Instruction::BinaryOps Opcode = WO->getBinaryOp();
+ bool NSW = WO->isSigned();
+ bool NUW = !WO->isSigned();
+
+ Value *NewOp =
+ B.CreateBinOp(Opcode, WO->getLHS(), WO->getRHS(), WO->getName());
+ setDeducedOverflowingFlags(NewOp, Opcode, NSW, NUW);
+
+ StructType *ST = cast<StructType>(WO->getType());
+ Constant *Struct = ConstantStruct::get(ST,
+ { UndefValue::get(ST->getElementType(0)),
+ ConstantInt::getFalse(ST->getElementType(1)) });
+ Value *NewI = B.CreateInsertValue(Struct, NewOp, 0);
+ WO->replaceAllUsesWith(NewI);
+ WO->eraseFromParent();
+ ++NumOverflows;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(NewOp))
+ processBinOp(BO, LVI);
+}
+
+static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
+ Instruction::BinaryOps Opcode = SI->getBinaryOp();
+ bool NSW = SI->isSigned();
+ bool NUW = !SI->isSigned();
+ BinaryOperator *BinOp = BinaryOperator::Create(
+ Opcode, SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+ BinOp->setDebugLoc(SI->getDebugLoc());
+ setDeducedOverflowingFlags(BinOp, Opcode, NSW, NUW);
+
+ SI->replaceAllUsesWith(BinOp);
+ SI->eraseFromParent();
+ ++NumSaturating;
+
+ // See if we can infer the other no-wrap too.
+ if (auto *BO = dyn_cast<BinaryOperator>(BinOp))
+ processBinOp(BO, LVI);
+}
+
+/// Infer nonnull attributes for the arguments at the specified callsite.
+static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
+
+ if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
+ if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
+ processOverflowIntrinsic(WO, LVI);
+ return true;
+ }
+ }
+
+ if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
+ if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
+ processSaturatingInst(SI, LVI);
+ return true;
+ }
+ }
+
bool Changed = false;
- // Deopt bundle operands are intended to capture state with minimal
-  // perturbation of the code otherwise. If we can find a constant value for
-  // any such operand and remove a use of the original value, that's
-  // desirable since it may allow further optimization of that value (e.g. via
-  // single-use rules in instcombine). Since deopt uses tend to, idiomatically,
-  // appear along rare conditional paths, it is reasonably likely that we have
-  // a conditional fact with which LVI can fold.
- if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
- for (const Use &ConstU : DeoptBundle->Inputs) {
- Use &U = const_cast<Use&>(ConstU);
- Value *V = U.get();
- if (V->getType()->isVectorTy()) continue;
- if (isa<Constant>(V)) continue;
-
+ // Deopt bundle operands are intended to capture state with minimal
+  // perturbation of the code otherwise. If we can find a constant value for
+  // any such operand and remove a use of the original value, that's
+  // desirable since it may allow further optimization of that value (e.g. via
+  // single-use rules in instcombine). Since deopt uses tend to, idiomatically,
+  // appear along rare conditional paths, it is reasonably likely that we have
+  // a conditional fact with which LVI can fold.
+ if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
+ for (const Use &ConstU : DeoptBundle->Inputs) {
+ Use &U = const_cast<Use&>(ConstU);
+ Value *V = U.get();
+ if (V->getType()->isVectorTy()) continue;
+ if (isa<Constant>(V)) continue;
+
Constant *C = LVI->getConstant(V, &CB);
- if (!C) continue;
- U.set(C);
+ if (!C) continue;
+ U.set(C);
Changed = true;
- }
- }
-
+ }
+ }
+
SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
- for (Value *V : CB.args()) {
- PointerType *Type = dyn_cast<PointerType>(V->getType());
- // Try to mark pointer typed parameters as non-null. We skip the
- // relatively expensive analysis for constants which are obviously either
- // null or non-null to start with.
- if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
- !isa<Constant>(V) &&
- LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
- ConstantPointerNull::get(Type),
- &CB) == LazyValueInfo::False)
- ArgNos.push_back(ArgNo);
- ArgNo++;
- }
-
- assert(ArgNo == CB.arg_size() && "sanity check");
-
- if (ArgNos.empty())
+ for (Value *V : CB.args()) {
+ PointerType *Type = dyn_cast<PointerType>(V->getType());
+ // Try to mark pointer typed parameters as non-null. We skip the
+ // relatively expensive analysis for constants which are obviously either
+ // null or non-null to start with.
+ if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ !isa<Constant>(V) &&
+ LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
+ ConstantPointerNull::get(Type),
+ &CB) == LazyValueInfo::False)
+ ArgNos.push_back(ArgNo);
+ ArgNo++;
+ }
+
+ assert(ArgNo == CB.arg_size() && "sanity check");
+
+ if (ArgNos.empty())
return Changed;
-
- AttributeList AS = CB.getAttributes();
- LLVMContext &Ctx = CB.getContext();
- AS = AS.addParamAttribute(Ctx, ArgNos,
- Attribute::get(Ctx, Attribute::NonNull));
- CB.setAttributes(AS);
-
- return true;
-}
-
+
+ AttributeList AS = CB.getAttributes();
+ LLVMContext &Ctx = CB.getContext();
+ AS = AS.addParamAttribute(Ctx, ArgNos,
+ Attribute::get(Ctx, Attribute::NonNull));
+ CB.setAttributes(AS);
+
+ return true;
+}
+
static bool isNonNegative(Value *V, LazyValueInfo *LVI, Instruction *CxtI) {
Constant *Zero = ConstantInt::get(V->getType(), 0);
auto Result = LVI->getPredicateAt(ICmpInst::ICMP_SGE, V, Zero, CxtI);
@@ -599,7 +599,7 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
for (auto I : zip(Instr->operands(), CRs)) {
std::get<1>(I) = LVI->getConstantRange(std::get<0>(I), Instr);
MinSignedBits = std::max(std::get<1>(I)->getMinSignedBits(), MinSignedBits);
- }
+ }
 // sdiv/srem is UB if divisor is -1 and dividend is INT_MIN, so unless we can
// prove that such a combination is impossible, we need to bump the bitwidth.
@@ -631,58 +631,58 @@ static bool narrowSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
Instr->replaceAllUsesWith(Sext);
Instr->eraseFromParent();
- return true;
-}
-
-/// Try to shrink a udiv/urem's width down to the smallest power of two that's
-/// sufficient to contain its operands.
-static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
- assert(Instr->getOpcode() == Instruction::UDiv ||
- Instr->getOpcode() == Instruction::URem);
- if (Instr->getType()->isVectorTy())
- return false;
-
- // Find the smallest power of two bitwidth that's sufficient to hold Instr's
- // operands.
+ return true;
+}
+
+/// Try to shrink a udiv/urem's width down to the smallest power of two that's
+/// sufficient to contain its operands.
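+/// For illustration (hypothetical IR): if both operands of
+///   %q = udiv i64 %a, %b
+/// are known to be below 200, only 8 active bits are needed, so the udiv is
+/// rewritten as a trunc of each operand to i8, an i8 udiv, and a zext of the
+/// result back to i64.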
+static bool processUDivOrURem(BinaryOperator *Instr, LazyValueInfo *LVI) {
+ assert(Instr->getOpcode() == Instruction::UDiv ||
+ Instr->getOpcode() == Instruction::URem);
+ if (Instr->getType()->isVectorTy())
+ return false;
+
+ // Find the smallest power of two bitwidth that's sufficient to hold Instr's
+ // operands.
 // What is the smallest bit width that can accommodate the entire value ranges
// of both of the operands?
unsigned MaxActiveBits = 0;
- for (Value *Operand : Instr->operands()) {
+ for (Value *Operand : Instr->operands()) {
ConstantRange CR = LVI->getConstantRange(Operand, Instr);
MaxActiveBits = std::max(CR.getActiveBits(), MaxActiveBits);
- }
- // Don't shrink below 8 bits wide.
+ }
+ // Don't shrink below 8 bits wide.
unsigned NewWidth = std::max<unsigned>(PowerOf2Ceil(MaxActiveBits), 8);
- // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
- // two.
+ // NewWidth might be greater than OrigWidth if OrigWidth is not a power of
+ // two.
if (NewWidth >= Instr->getType()->getIntegerBitWidth())
- return false;
-
+ return false;
+
++NumUDivURemsNarrowed;
- IRBuilder<> B{Instr};
- auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
- auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
- Instr->getName() + ".lhs.trunc");
- auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
- Instr->getName() + ".rhs.trunc");
- auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
- auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext");
- if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
- if (BinOp->getOpcode() == Instruction::UDiv)
- BinOp->setIsExact(Instr->isExact());
-
- Instr->replaceAllUsesWith(Zext);
- Instr->eraseFromParent();
- return true;
-}
-
-static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ IRBuilder<> B{Instr};
+ auto *TruncTy = Type::getIntNTy(Instr->getContext(), NewWidth);
+ auto *LHS = B.CreateTruncOrBitCast(Instr->getOperand(0), TruncTy,
+ Instr->getName() + ".lhs.trunc");
+ auto *RHS = B.CreateTruncOrBitCast(Instr->getOperand(1), TruncTy,
+ Instr->getName() + ".rhs.trunc");
+ auto *BO = B.CreateBinOp(Instr->getOpcode(), LHS, RHS, Instr->getName());
+ auto *Zext = B.CreateZExt(BO, Instr->getType(), Instr->getName() + ".zext");
+ if (auto *BinOp = dyn_cast<BinaryOperator>(BO))
+ if (BinOp->getOpcode() == Instruction::UDiv)
+ BinOp->setIsExact(Instr->isExact());
+
+ Instr->replaceAllUsesWith(Zext);
+ Instr->eraseFromParent();
+ return true;
+}
+
+static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
assert(SDI->getOpcode() == Instruction::SRem);
if (SDI->getType()->isVectorTy())
- return false;
-
+ return false;
+
struct Operand {
Value *V;
Domain D;
@@ -698,7 +698,7 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
// We know domains of both of the operands!
- ++NumSRems;
+ ++NumSRems;
// We need operands to be non-negative, so negate each one that isn't.
for (Operand &Op : Ops) {
@@ -721,24 +721,24 @@ static bool processSRem(BinaryOperator *SDI, LazyValueInfo *LVI) {
Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
SDI->replaceAllUsesWith(Res);
- SDI->eraseFromParent();
-
+ SDI->eraseFromParent();
+
// Try to simplify our new urem.
processUDivOrURem(URem, LVI);
-
- return true;
-}
-
-/// See if LazyValueInfo's ability to exploit edge conditions or range
+
+ return true;
+}
+
+/// See if LazyValueInfo's ability to exploit edge conditions or range
/// information is sufficient to prove the signs of both operands of this SDiv.
/// If this is the case, replace the SDiv with a UDiv. Even for local
-/// conditions, this can sometimes prove conditions instcombine can't by
-/// exploiting range information.
-static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
+/// conditions, this can sometimes prove conditions instcombine can't by
+/// exploiting range information.
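+/// For illustration (hypothetical IR): if %a is known non-negative and %b is
+/// known negative on this path, then
+///   %d = sdiv i32 %a, %b
+/// is rewritten as a udiv of %a and the negation of %b, followed by a
+/// negation of the result, because the signs of both operands are known.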
+static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
assert(SDI->getOpcode() == Instruction::SDiv);
if (SDI->getType()->isVectorTy())
- return false;
-
+ return false;
+
struct Operand {
Value *V;
Domain D;
@@ -754,7 +754,7 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
}
// We know domains of both of the operands!
- ++NumSDivs;
+ ++NumSDivs;
// We need operands to be non-negative, so negate each one that isn't.
for (Operand &Op : Ops) {
@@ -778,14 +778,14 @@ static bool processSDiv(BinaryOperator *SDI, LazyValueInfo *LVI) {
Res = BinaryOperator::CreateNeg(Res, Res->getName() + ".neg", SDI);
SDI->replaceAllUsesWith(Res);
- SDI->eraseFromParent();
-
- // Try to simplify our new udiv.
+ SDI->eraseFromParent();
+
+ // Try to simplify our new udiv.
processUDivOrURem(UDiv, LVI);
-
- return true;
-}
-
+
+ return true;
+}
+
static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
assert(Instr->getOpcode() == Instruction::SDiv ||
Instr->getOpcode() == Instruction::SRem);
@@ -803,234 +803,234 @@ static bool processSDivOrSRem(BinaryOperator *Instr, LazyValueInfo *LVI) {
return narrowSDivOrSRem(Instr, LVI);
}
-static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy())
- return false;
-
+static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
if (!isNonNegative(SDI->getOperand(0), LVI, SDI))
- return false;
-
- ++NumAShrs;
- auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
- SDI->getName(), SDI);
- BO->setDebugLoc(SDI->getDebugLoc());
- BO->setIsExact(SDI->isExact());
- SDI->replaceAllUsesWith(BO);
- SDI->eraseFromParent();
-
- return true;
-}
-
-static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
- if (SDI->getType()->isVectorTy())
- return false;
-
- Value *Base = SDI->getOperand(0);
-
+ return false;
+
+ ++NumAShrs;
+ auto *BO = BinaryOperator::CreateLShr(SDI->getOperand(0), SDI->getOperand(1),
+ SDI->getName(), SDI);
+ BO->setDebugLoc(SDI->getDebugLoc());
+ BO->setIsExact(SDI->isExact());
+ SDI->replaceAllUsesWith(BO);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processSExt(SExtInst *SDI, LazyValueInfo *LVI) {
+ if (SDI->getType()->isVectorTy())
+ return false;
+
+ Value *Base = SDI->getOperand(0);
+
if (!isNonNegative(Base, LVI, SDI))
- return false;
-
- ++NumSExt;
- auto *ZExt =
- CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI);
- ZExt->setDebugLoc(SDI->getDebugLoc());
- SDI->replaceAllUsesWith(ZExt);
- SDI->eraseFromParent();
-
- return true;
-}
-
-static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
- using OBO = OverflowingBinaryOperator;
-
- if (DontAddNoWrapFlags)
- return false;
-
- if (BinOp->getType()->isVectorTy())
- return false;
-
- bool NSW = BinOp->hasNoSignedWrap();
- bool NUW = BinOp->hasNoUnsignedWrap();
- if (NSW && NUW)
- return false;
-
- Instruction::BinaryOps Opcode = BinOp->getOpcode();
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
-
+ return false;
+
+ ++NumSExt;
+ auto *ZExt =
+ CastInst::CreateZExtOrBitCast(Base, SDI->getType(), SDI->getName(), SDI);
+ ZExt->setDebugLoc(SDI->getDebugLoc());
+ SDI->replaceAllUsesWith(ZExt);
+ SDI->eraseFromParent();
+
+ return true;
+}
+
+static bool processBinOp(BinaryOperator *BinOp, LazyValueInfo *LVI) {
+ using OBO = OverflowingBinaryOperator;
+
+ if (DontAddNoWrapFlags)
+ return false;
+
+ if (BinOp->getType()->isVectorTy())
+ return false;
+
+ bool NSW = BinOp->hasNoSignedWrap();
+ bool NUW = BinOp->hasNoUnsignedWrap();
+ if (NSW && NUW)
+ return false;
+
+ Instruction::BinaryOps Opcode = BinOp->getOpcode();
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+
ConstantRange LRange = LVI->getConstantRange(LHS, BinOp);
ConstantRange RRange = LVI->getConstantRange(RHS, BinOp);
-
- bool Changed = false;
- bool NewNUW = false, NewNSW = false;
- if (!NUW) {
- ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- Opcode, RRange, OBO::NoUnsignedWrap);
- NewNUW = NUWRange.contains(LRange);
- Changed |= NewNUW;
- }
- if (!NSW) {
- ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
- Opcode, RRange, OBO::NoSignedWrap);
- NewNSW = NSWRange.contains(LRange);
- Changed |= NewNSW;
- }
-
- setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW);
-
- return Changed;
-}
-
-static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
- if (BinOp->getType()->isVectorTy())
- return false;
-
- // Pattern match (and lhs, C) where C includes a superset of bits which might
- // be set in lhs. This is a common truncation idiom created by instcombine.
- Value *LHS = BinOp->getOperand(0);
- ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (!RHS || !RHS->getValue().isMask())
- return false;
-
- // We can only replace the AND with LHS based on range info if the range does
- // not include undef.
- ConstantRange LRange =
+
+ bool Changed = false;
+ bool NewNUW = false, NewNSW = false;
+ if (!NUW) {
+ ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ Opcode, RRange, OBO::NoUnsignedWrap);
+ NewNUW = NUWRange.contains(LRange);
+ Changed |= NewNUW;
+ }
+ if (!NSW) {
+ ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+ Opcode, RRange, OBO::NoSignedWrap);
+ NewNSW = NSWRange.contains(LRange);
+ Changed |= NewNSW;
+ }
+
+ setDeducedOverflowingFlags(BinOp, Opcode, NewNSW, NewNUW);
+
+ return Changed;
+}
+
+static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
+ if (BinOp->getType()->isVectorTy())
+ return false;
+
+ // Pattern match (and lhs, C) where C includes a superset of bits which might
+ // be set in lhs. This is a common truncation idiom created by instcombine.
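+  // For illustration (hypothetical IR): in
+  //   %r = and i32 %x, 255
+  // where LVI proves %x lies in [0, 256) and the range does not come from
+  // undef, the mask changes nothing and %r is replaced by %x directly.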
+ Value *LHS = BinOp->getOperand(0);
+ ConstantInt *RHS = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!RHS || !RHS->getValue().isMask())
+ return false;
+
+ // We can only replace the AND with LHS based on range info if the range does
+ // not include undef.
+ ConstantRange LRange =
LVI->getConstantRange(LHS, BinOp, /*UndefAllowed=*/false);
- if (!LRange.getUnsignedMax().ule(RHS->getValue()))
- return false;
-
- BinOp->replaceAllUsesWith(LHS);
- BinOp->eraseFromParent();
- NumAnd++;
- return true;
-}
-
-
-static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
+ if (!LRange.getUnsignedMax().ule(RHS->getValue()))
+ return false;
+
+ BinOp->replaceAllUsesWith(LHS);
+ BinOp->eraseFromParent();
+ NumAnd++;
+ return true;
+}
+
+
+static Constant *getConstantAt(Value *V, Instruction *At, LazyValueInfo *LVI) {
if (Constant *C = LVI->getConstant(V, At))
- return C;
-
- // TODO: The following really should be sunk inside LVI's core algorithm, or
- // at least the outer shims around such.
- auto *C = dyn_cast<CmpInst>(V);
- if (!C) return nullptr;
-
- Value *Op0 = C->getOperand(0);
- Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
- if (!Op1) return nullptr;
-
- LazyValueInfo::Tristate Result =
- LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
- if (Result == LazyValueInfo::Unknown)
- return nullptr;
-
- return (Result == LazyValueInfo::True) ?
- ConstantInt::getTrue(C->getContext()) :
- ConstantInt::getFalse(C->getContext());
-}
-
-static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
- const SimplifyQuery &SQ) {
- bool FnChanged = false;
- // Visiting in a pre-order depth-first traversal causes us to simplify early
- // blocks before querying later blocks (which require us to analyze early
- // blocks). Eagerly simplifying shallow blocks means there is strictly less
- // work to do for deep blocks. This also means we don't visit unreachable
- // blocks.
- for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
- bool BBChanged = false;
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
- Instruction *II = &*BI++;
- switch (II->getOpcode()) {
- case Instruction::Select:
- BBChanged |= processSelect(cast<SelectInst>(II), LVI);
- break;
- case Instruction::PHI:
- BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
- break;
- case Instruction::ICmp:
- case Instruction::FCmp:
- BBChanged |= processCmp(cast<CmpInst>(II), LVI);
- break;
- case Instruction::Load:
- case Instruction::Store:
- BBChanged |= processMemAccess(II, LVI);
- break;
- case Instruction::Call:
- case Instruction::Invoke:
- BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
- break;
- case Instruction::SRem:
- case Instruction::SDiv:
+ return C;
+
+ // TODO: The following really should be sunk inside LVI's core algorithm, or
+ // at least the outer shims around such.
+ auto *C = dyn_cast<CmpInst>(V);
+ if (!C) return nullptr;
+
+ Value *Op0 = C->getOperand(0);
+ Constant *Op1 = dyn_cast<Constant>(C->getOperand(1));
+ if (!Op1) return nullptr;
+
+ LazyValueInfo::Tristate Result =
+ LVI->getPredicateAt(C->getPredicate(), Op0, Op1, At);
+ if (Result == LazyValueInfo::Unknown)
+ return nullptr;
+
+ return (Result == LazyValueInfo::True) ?
+ ConstantInt::getTrue(C->getContext()) :
+ ConstantInt::getFalse(C->getContext());
+}
+
+static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
+ const SimplifyQuery &SQ) {
+ bool FnChanged = false;
+ // Visiting in a pre-order depth-first traversal causes us to simplify early
+ // blocks before querying later blocks (which require us to analyze early
+ // blocks). Eagerly simplifying shallow blocks means there is strictly less
+ // work to do for deep blocks. This also means we don't visit unreachable
+ // blocks.
+ for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
+ bool BBChanged = false;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *II = &*BI++;
+ switch (II->getOpcode()) {
+ case Instruction::Select:
+ BBChanged |= processSelect(cast<SelectInst>(II), LVI);
+ break;
+ case Instruction::PHI:
+ BBChanged |= processPHI(cast<PHINode>(II), LVI, DT, SQ);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ BBChanged |= processCmp(cast<CmpInst>(II), LVI);
+ break;
+ case Instruction::Load:
+ case Instruction::Store:
+ BBChanged |= processMemAccess(II, LVI);
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
+ break;
+ case Instruction::SRem:
+ case Instruction::SDiv:
BBChanged |= processSDivOrSRem(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::UDiv:
- case Instruction::URem:
- BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::AShr:
- BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::SExt:
- BBChanged |= processSExt(cast<SExtInst>(II), LVI);
- break;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::Shl:
- BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
- break;
- case Instruction::And:
- BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
- break;
- }
- }
-
- Instruction *Term = BB->getTerminator();
- switch (Term->getOpcode()) {
- case Instruction::Switch:
- BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
- break;
- case Instruction::Ret: {
- auto *RI = cast<ReturnInst>(Term);
- // Try to determine the return value if we can. This is mainly here to
- // simplify the writing of unit tests, but also helps to enable IPO by
- // constant folding the return values of callees.
- auto *RetVal = RI->getReturnValue();
- if (!RetVal) break; // handle "ret void"
- if (isa<Constant>(RetVal)) break; // nothing to do
- if (auto *C = getConstantAt(RetVal, RI, LVI)) {
- ++NumReturns;
- RI->replaceUsesOfWith(RetVal, C);
- BBChanged = true;
- }
- }
- }
-
- FnChanged |= BBChanged;
- }
-
- return FnChanged;
-}
-
-bool CorrelatedValuePropagation::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
-
- return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
-}
-
-PreservedAnalyses
-CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
- LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
-
- bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
-
- PreservedAnalyses PA;
+ break;
+ case Instruction::UDiv:
+ case Instruction::URem:
+ BBChanged |= processUDivOrURem(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::AShr:
+ BBChanged |= processAShr(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::SExt:
+ BBChanged |= processSExt(cast<SExtInst>(II), LVI);
+ break;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ BBChanged |= processBinOp(cast<BinaryOperator>(II), LVI);
+ break;
+ case Instruction::And:
+ BBChanged |= processAnd(cast<BinaryOperator>(II), LVI);
+ break;
+ }
+ }
+
+ Instruction *Term = BB->getTerminator();
+ switch (Term->getOpcode()) {
+ case Instruction::Switch:
+ BBChanged |= processSwitch(cast<SwitchInst>(Term), LVI, DT);
+ break;
+ case Instruction::Ret: {
+ auto *RI = cast<ReturnInst>(Term);
+ // Try to determine the return value if we can. This is mainly here to
+ // simplify the writing of unit tests, but also helps to enable IPO by
+ // constant folding the return values of callees.
+ auto *RetVal = RI->getReturnValue();
+ if (!RetVal) break; // handle "ret void"
+ if (isa<Constant>(RetVal)) break; // nothing to do
+ if (auto *C = getConstantAt(RetVal, RI, LVI)) {
+ ++NumReturns;
+ RI->replaceUsesOfWith(RetVal, C);
+ BBChanged = true;
+ }
+ }
+ }
+
+ FnChanged |= BBChanged;
+ }
+
+ return FnChanged;
+}
+
+bool CorrelatedValuePropagation::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ return runImpl(F, LVI, DT, getBestSimplifyQuery(*this, F));
+}
+
+PreservedAnalyses
+CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
+ LazyValueInfo *LVI = &AM.getResult<LazyValueAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = runImpl(F, LVI, DT, getBestSimplifyQuery(AM, F));
+
+ PreservedAnalyses PA;
if (!Changed) {
PA = PreservedAnalyses::all();
} else {
@@ -1044,5 +1044,5 @@ CorrelatedValuePropagationPass::run(Function &F, FunctionAnalysisManager &AM) {
// LVI, we know that passes after JumpThreading+CVP will not need the result
// of this analysis, so we forcefully discard it early.
PA.abandon<LazyValueAnalysis>();
- return PA;
-}
+ return PA;
+}
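The fold performed by getConstantAt above rests on one idea: if the value information available at the query point already decides a comparison against a constant, the compare can be replaced by true or false. Below is a minimal standalone sketch of that decision, using a hypothetical half-open [Lo, Hi) range in place of LazyValueInfo's real lattice; it illustrates the idea only and is not LLVM's implementation.

#include <cstdint>
#include <optional>

// Hypothetical stand-in for the range a value is known to lie in at the
// query point; not LLVM's API.
struct KnownRange {
  int64_t Lo; // inclusive
  int64_t Hi; // exclusive
};

// Decide the signed compare "X < C" when X is known to lie in [R.Lo, R.Hi).
// Returns true/false when the range settles the predicate, nullopt otherwise.
std::optional<bool> foldSignedLess(const KnownRange &R, int64_t C) {
  if (R.Hi <= C)       // even the largest possible X (R.Hi - 1) is < C
    return true;
  if (R.Lo >= C)       // even the smallest possible X is >= C
    return false;
  return std::nullopt; // the range straddles C: leave the compare alone
}

// Example: with X known to be in [0, 8), "X < 10" folds to true, "X < 0"
// folds to false, and "X < 4" is left in place.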
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
index 20f04a2e14..d55adf7c2d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DCE.cpp
@@ -1,74 +1,74 @@
-//===- DCE.cpp - Code to perform dead code elimination --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements dead inst elimination and dead code elimination.
-//
-// Dead Inst Elimination performs a single pass over the function removing
-// instructions that are obviously dead. Dead Code Elimination is similar, but
-// it rechecks instructions that were used by removed instructions to see if
-// they are newly dead.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DCE.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "dce"
-
-STATISTIC(DCEEliminated, "Number of insts removed");
-DEBUG_COUNTER(DCECounter, "dce-transform",
- "Controls which instructions are eliminated");
-
-//===--------------------------------------------------------------------===//
-// RedundantDbgInstElimination pass implementation
-//
-
-namespace {
-struct RedundantDbgInstElimination : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- RedundantDbgInstElimination() : FunctionPass(ID) {
- initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- bool Changed = false;
- for (auto &BB : F)
- Changed |= RemoveRedundantDbgInstrs(&BB);
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- }
-};
-}
-
-char RedundantDbgInstElimination::ID = 0;
-INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim",
- "Redundant Dbg Instruction Elimination", false, false)
-
-Pass *llvm::createRedundantDbgInstEliminationPass() {
- return new RedundantDbgInstElimination();
-}
-
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead. Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DCE.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dce"
+
+STATISTIC(DCEEliminated, "Number of insts removed");
+DEBUG_COUNTER(DCECounter, "dce-transform",
+ "Controls which instructions are eliminated");
+
+//===--------------------------------------------------------------------===//
+// RedundantDbgInstElimination pass implementation
+//
+
+namespace {
+struct RedundantDbgInstElimination : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ RedundantDbgInstElimination() : FunctionPass(ID) {
+ initializeRedundantDbgInstEliminationPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ bool Changed = false;
+ for (auto &BB : F)
+ Changed |= RemoveRedundantDbgInstrs(&BB);
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char RedundantDbgInstElimination::ID = 0;
+INITIALIZE_PASS(RedundantDbgInstElimination, "redundant-dbg-inst-elim",
+ "Redundant Dbg Instruction Elimination", false, false)
+
+Pass *llvm::createRedundantDbgInstEliminationPass() {
+ return new RedundantDbgInstElimination();
+}
+
PreservedAnalyses
RedundantDbgInstEliminationPass::run(Function &F, FunctionAnalysisManager &AM) {
bool Changed = false;
@@ -81,103 +81,103 @@ RedundantDbgInstEliminationPass::run(Function &F, FunctionAnalysisManager &AM) {
return PA;
}
-//===--------------------------------------------------------------------===//
-// DeadCodeElimination pass implementation
-//
-
-static bool DCEInstruction(Instruction *I,
- SmallSetVector<Instruction *, 16> &WorkList,
- const TargetLibraryInfo *TLI) {
- if (isInstructionTriviallyDead(I, TLI)) {
- if (!DebugCounter::shouldExecute(DCECounter))
- return false;
-
- salvageDebugInfo(*I);
- salvageKnowledge(I);
-
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
-
- if (!OpV->use_empty() || I == OpV)
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- WorkList.insert(OpI);
- }
-
- I->eraseFromParent();
- ++DCEEliminated;
- return true;
- }
- return false;
-}
-
-static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- SmallSetVector<Instruction *, 16> WorkList;
- // Iterate over the original function, only adding insts to the worklist
- // if they actually need to be revisited. This avoids having to pre-init
- // the worklist with the entire function's worth of instructions.
- for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
- Instruction *I = &*FI;
- ++FI;
-
- // We're visiting this instruction now, so make sure it's not in the
- // worklist from an earlier visit.
- if (!WorkList.count(I))
- MadeChange |= DCEInstruction(I, WorkList, TLI);
- }
-
- while (!WorkList.empty()) {
- Instruction *I = WorkList.pop_back_val();
- MadeChange |= DCEInstruction(I, WorkList, TLI);
- }
- return MadeChange;
-}
-
-PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+//===--------------------------------------------------------------------===//
+// DeadCodeElimination pass implementation
+//
+
+static bool DCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ if (!DebugCounter::shouldExecute(DCECounter))
+ return false;
+
+ salvageDebugInfo(*I);
+ salvageKnowledge(I);
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+ ++DCEEliminated;
+ return true;
+ }
+ return false;
+}
+
+static bool eliminateDeadCode(Function &F, TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (inst_iterator FI = inst_begin(F), FE = inst_end(F); FI != FE;) {
+ Instruction *I = &*FI;
+ ++FI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= DCEInstruction(I, WorkList, TLI);
+ }
+ return MadeChange;
+}
+
+PreservedAnalyses DCEPass::run(Function &F, FunctionAnalysisManager &AM) {
if (!eliminateDeadCode(F, &AM.getResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-struct DCELegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- DCELegacyPass() : FunctionPass(ID) {
- initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+struct DCELegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ DCELegacyPass() : FunctionPass(ID) {
+ initializeDCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
TargetLibraryInfo *TLI =
&getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- return eliminateDeadCode(F, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+
+ return eliminateDeadCode(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-}
-
-char DCELegacyPass::ID = 0;
-INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
-
-FunctionPass *llvm::createDeadCodeEliminationPass() {
- return new DCELegacyPass();
-}
+ AU.setPreservesCFG();
+ }
+};
+}
+
+char DCELegacyPass::ID = 0;
+INITIALIZE_PASS(DCELegacyPass, "dce", "Dead Code Elimination", false, false)
+
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+ return new DCELegacyPass();
+}
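The worklist scheme used by DCEInstruction and eliminateDeadCode is easiest to see in isolation: delete a trivially dead instruction, then revisit only the operands that may have just become dead instead of rescanning the whole function. The sketch below reproduces that pattern over a toy instruction type; the Inst struct and all names are hypothetical stand-ins for LLVM's Instruction and its use lists.

#include <unordered_set>
#include <vector>

// Toy "instruction": it uses other instructions and may have side effects.
struct Inst {
  std::vector<Inst *> Operands;
  std::unordered_set<Inst *> Users;
  bool HasSideEffects = false;
  bool Erased = false;
};

static bool triviallyDead(const Inst *I) {
  return !I->Erased && I->Users.empty() && !I->HasSideEffects;
}

// Erase I and push any operand that just became dead onto the worklist,
// mirroring the "null out operands, then revisit" loop in DCEInstruction.
static void eraseAndEnqueue(Inst *I, std::vector<Inst *> &WorkList) {
  I->Erased = true;
  for (Inst *Op : I->Operands) {
    Op->Users.erase(I);       // drop the use, like setOperand(i, nullptr)
    if (triviallyDead(Op))
      WorkList.push_back(Op); // handle it in a later iteration
  }
  I->Operands.clear();
}

// One initial sweep plus worklist draining, as in eliminateDeadCode.
static bool eliminateDead(std::vector<Inst *> &Function) {
  bool Changed = false;
  std::vector<Inst *> WorkList;
  for (Inst *I : Function)
    if (triviallyDead(I)) {
      eraseAndEnqueue(I, WorkList);
      Changed = true;
    }
  while (!WorkList.empty()) {
    Inst *I = WorkList.back();
    WorkList.pop_back();
    if (triviallyDead(I)) {   // it may already have been erased via another path
      eraseAndEnqueue(I, WorkList);
      Changed = true;
    }
  }
  return Changed;
}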
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 3d34beb8d9..2979225c60 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -1,134 +1,134 @@
-//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a trivial dead store elimination that only considers
-// basic-block local redundant stores.
-//
-// FIXME: This should eventually be extended to be a post-dominator tree
-// traversal. Doing so would be pretty trivial.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "dse"
-
-STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
-STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
-STATISTIC(NumFastStores, "Number of stores deleted");
-STATISTIC(NumFastOther, "Number of other instrs removed");
-STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
-STATISTIC(NumModifiedStores, "Number of stores modified");
-STATISTIC(NumCFGChecks, "Number of CFG checks performed");
-STATISTIC(NumCFGTries, "Number of CFG-based elimination attempts");
-STATISTIC(NumCFGSuccess, "Number of successful CFG-based eliminations");
+//===- DeadStoreElimination.cpp - Fast Dead Store Elimination -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal. Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "dse"
+
+STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
+STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther, "Number of other instrs removed");
+STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
+STATISTIC(NumModifiedStores, "Number of stores modified");
+STATISTIC(NumCFGChecks, "Number of CFG checks performed");
+STATISTIC(NumCFGTries, "Number of CFG-based elimination attempts");
+STATISTIC(NumCFGSuccess, "Number of successful CFG-based eliminations");
STATISTIC(NumGetDomMemoryDefPassed,
"Number of times a valid candidate is returned from getDomMemoryDef");
STATISTIC(NumDomMemDefChecks,
"Number iterations check for reads in getDomMemoryDef");
-
-DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
- "Controls which MemoryDefs are eliminated.");
-
-static cl::opt<bool>
-EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
- cl::init(true), cl::Hidden,
- cl::desc("Enable partial-overwrite tracking in DSE"));
-
-static cl::opt<bool>
-EnablePartialStoreMerging("enable-dse-partial-store-merging",
- cl::init(true), cl::Hidden,
- cl::desc("Enable partial store merging in DSE"));
-
-static cl::opt<bool>
+
+DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
+ "Controls which MemoryDefs are eliminated.");
+
+static cl::opt<bool>
+EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial-overwrite tracking in DSE"));
+
+static cl::opt<bool>
+EnablePartialStoreMerging("enable-dse-partial-store-merging",
+ cl::init(true), cl::Hidden,
+ cl::desc("Enable partial store merging in DSE"));
+
+static cl::opt<bool>
EnableMemorySSA("enable-dse-memoryssa", cl::init(true), cl::Hidden,
- cl::desc("Use the new MemorySSA-backed DSE."));
-
-static cl::opt<unsigned>
+ cl::desc("Use the new MemorySSA-backed DSE."));
+
+static cl::opt<unsigned>
MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(150), cl::Hidden,
- cl::desc("The number of memory instructions to scan for "
- "dead store elimination (default = 100)"));
+ cl::desc("The number of memory instructions to scan for "
+ "dead store elimination (default = 100)"));
static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
"dse-memoryssa-walklimit", cl::init(90), cl::Hidden,
cl::desc("The maximum number of steps while walking upwards to find "
"MemoryDefs that may be killed (default = 90)"));
-
+
static cl::opt<unsigned> MemorySSAPartialStoreLimit(
"dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
cl::desc("The maximum number candidates that only partially overwrite the "
"killing MemoryDef to consider"
" (default = 5)"));
-static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
- "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
- cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
- "other stores per basic block (default = 5000)"));
-
+static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
+ "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
+ cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
+ "other stores per basic block (default = 5000)"));
+
static cl::opt<unsigned> MemorySSASameBBStepCost(
"dse-memoryssa-samebb-cost", cl::init(1), cl::Hidden,
cl::desc(
@@ -142,273 +142,273 @@ static cl::opt<unsigned>
"block than the killing MemoryDef"
"(default = 5)"));
-static cl::opt<unsigned> MemorySSAPathCheckLimit(
- "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
- cl::desc("The maximum number of blocks to check when trying to prove that "
- "all paths to an exit go through a killing block (default = 50)"));
-
-//===----------------------------------------------------------------------===//
-// Helper functions
-//===----------------------------------------------------------------------===//
-using OverlapIntervalsTy = std::map<int64_t, int64_t>;
-using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
-
-/// Delete this instruction. Before we do, go through and zero out all the
-/// operands of this instruction. If any of them become dead, delete them and
-/// the computation tree that feeds them.
-/// If ValueSet is non-null, remove any deleted instructions from it as well.
-static void
-deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
- MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst,
- SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
- SmallVector<Instruction*, 32> NowDeadInsts;
-
- NowDeadInsts.push_back(I);
- --NumFastOther;
-
- // Keeping the iterator straight is a pain, so we let this routine tell the
- // caller what the next instruction is after we're done mucking about.
- BasicBlock::iterator NewIter = *BBI;
-
- // Before we touch this instruction, remove it from memdep!
- do {
- Instruction *DeadInst = NowDeadInsts.pop_back_val();
- // Mark the DeadInst as dead in the list of throwable instructions.
- auto It = ThrowableInst.find(DeadInst);
- if (It != ThrowableInst.end())
- ThrowableInst[It->first] = false;
- ++NumFastOther;
-
- // Try to preserve debug information attached to the dead instruction.
- salvageDebugInfo(*DeadInst);
- salvageKnowledge(DeadInst);
-
- // This instruction is dead, zap it, in stages. Start by removing it from
- // MemDep, which needs to know the operands and needs it to be in the
- // function.
- MD.removeInstruction(DeadInst);
-
- for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
- Value *Op = DeadInst->getOperand(op);
- DeadInst->setOperand(op, nullptr);
-
- // If this operand just became dead, add it to the NowDeadInsts list.
- if (!Op->use_empty()) continue;
-
- if (Instruction *OpI = dyn_cast<Instruction>(Op))
- if (isInstructionTriviallyDead(OpI, &TLI))
- NowDeadInsts.push_back(OpI);
- }
-
- if (ValueSet) ValueSet->remove(DeadInst);
- IOL.erase(DeadInst);
-
- if (NewIter == DeadInst->getIterator())
- NewIter = DeadInst->eraseFromParent();
- else
- DeadInst->eraseFromParent();
- } while (!NowDeadInsts.empty());
- *BBI = NewIter;
- // Pop dead entries from back of ThrowableInst till we find an alive entry.
- while (!ThrowableInst.empty() && !ThrowableInst.back().second)
- ThrowableInst.pop_back();
-}
-
-/// Does this instruction write some memory? This only returns true for things
-/// that we can analyze with other helpers below.
-static bool hasAnalyzableMemoryWrite(Instruction *I,
- const TargetLibraryInfo &TLI) {
- if (isa<StoreInst>(I))
- return true;
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
+static cl::opt<unsigned> MemorySSAPathCheckLimit(
+ "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
+ cl::desc("The maximum number of blocks to check when trying to prove that "
+ "all paths to an exit go through a killing block (default = 50)"));
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+//===----------------------------------------------------------------------===//
+using OverlapIntervalsTy = std::map<int64_t, int64_t>;
+using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
+
+/// Delete this instruction. Before we do, go through and zero out all the
+/// operands of this instruction. If any of them become dead, delete them and
+/// the computation tree that feeds them.
+/// If ValueSet is non-null, remove any deleted instructions from it as well.
+static void
+deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
+ MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst,
+ SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
+ SmallVector<Instruction*, 32> NowDeadInsts;
+
+ NowDeadInsts.push_back(I);
+ --NumFastOther;
+
+ // Keeping the iterator straight is a pain, so we let this routine tell the
+ // caller what the next instruction is after we're done mucking about.
+ BasicBlock::iterator NewIter = *BBI;
+
+ // Before we touch this instruction, remove it from memdep!
+ do {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ // Mark the DeadInst as dead in the list of throwable instructions.
+ auto It = ThrowableInst.find(DeadInst);
+ if (It != ThrowableInst.end())
+ ThrowableInst[It->first] = false;
+ ++NumFastOther;
+
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
+
+ // This instruction is dead, zap it, in stages. Start by removing it from
+ // MemDep, which needs to know the operands and needs it to be in the
+ // function.
+ MD.removeInstruction(DeadInst);
+
+ for (unsigned op = 0, e = DeadInst->getNumOperands(); op != e; ++op) {
+ Value *Op = DeadInst->getOperand(op);
+ DeadInst->setOperand(op, nullptr);
+
+ // If this operand just became dead, add it to the NowDeadInsts list.
+ if (!Op->use_empty()) continue;
+
+ if (Instruction *OpI = dyn_cast<Instruction>(Op))
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ if (ValueSet) ValueSet->remove(DeadInst);
+ IOL.erase(DeadInst);
+
+ if (NewIter == DeadInst->getIterator())
+ NewIter = DeadInst->eraseFromParent();
+ else
+ DeadInst->eraseFromParent();
+ } while (!NowDeadInsts.empty());
+ *BBI = NewIter;
+ // Pop dead entries from back of ThrowableInst till we find an alive entry.
+ while (!ThrowableInst.empty() && !ThrowableInst.back().second)
+ ThrowableInst.pop_back();
+}
+
+/// Does this instruction write some memory? This only returns true for things
+/// that we can analyze with other helpers below.
+static bool hasAnalyzableMemoryWrite(Instruction *I,
+ const TargetLibraryInfo &TLI) {
+ if (isa<StoreInst>(I))
+ return true;
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
case Intrinsic::memcpy_inline:
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::init_trampoline:
- case Intrinsic::lifetime_end:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::init_trampoline:
+ case Intrinsic::lifetime_end:
case Intrinsic::masked_store:
- return true;
- }
- }
- if (auto *CB = dyn_cast<CallBase>(I)) {
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
- return true;
- default:
- return false;
- }
- }
- }
- return false;
-}
-
-/// Return a Location stored to by the specified instruction. If isRemovable
-/// returns true, this function and getLocForRead completely describe the memory
-/// operations for this instruction.
+ return true;
+ }
+ }
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ return true;
+ default:
+ return false;
+ }
+ }
+ }
+ return false;
+}
+
+/// Return a Location stored to by the specified instruction. If isRemovable
+/// returns true, this function and getLocForRead completely describe the memory
+/// operations for this instruction.
static MemoryLocation getLocForWrite(Instruction *Inst,
const TargetLibraryInfo &TLI) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- return MemoryLocation::get(SI);
-
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ return MemoryLocation::get(SI);
+
// memcpy/memmove/memset.
if (auto *MI = dyn_cast<AnyMemIntrinsic>(Inst))
return MemoryLocation::getForDest(MI);
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- default:
- return MemoryLocation(); // Unhandled intrinsic.
- case Intrinsic::init_trampoline:
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ return MemoryLocation(); // Unhandled intrinsic.
+ case Intrinsic::init_trampoline:
return MemoryLocation::getAfter(II->getArgOperand(0));
case Intrinsic::masked_store:
return MemoryLocation::getForArgument(II, 1, TLI);
- case Intrinsic::lifetime_end: {
- uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
- return MemoryLocation(II->getArgOperand(1), Len);
- }
- }
- }
- if (auto *CB = dyn_cast<CallBase>(Inst))
- // All the supported TLI functions so far happen to have dest as their
- // first argument.
+ case Intrinsic::lifetime_end: {
+ uint64_t Len = cast<ConstantInt>(II->getArgOperand(0))->getZExtValue();
+ return MemoryLocation(II->getArgOperand(1), Len);
+ }
+ }
+ }
+ if (auto *CB = dyn_cast<CallBase>(Inst))
+ // All the supported TLI functions so far happen to have dest as their
+ // first argument.
return MemoryLocation::getAfter(CB->getArgOperand(0));
- return MemoryLocation();
-}
-
-/// Return the location read by the specified "hasAnalyzableMemoryWrite"
-/// instruction if any.
-static MemoryLocation getLocForRead(Instruction *Inst,
- const TargetLibraryInfo &TLI) {
- assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
-
- // The only instructions that both read and write are the mem transfer
- // instructions (memcpy/memmove).
- if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
- return MemoryLocation::getForSource(MTI);
- return MemoryLocation();
-}
-
-/// If the value of this instruction and the memory it writes to is unused, may
-/// we delete this instruction?
-static bool isRemovable(Instruction *I) {
- // Don't remove volatile/atomic stores.
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
- case Intrinsic::lifetime_end:
- // Never remove dead lifetime_end's, e.g. because it is followed by a
- // free.
- return false;
- case Intrinsic::init_trampoline:
- // Always safe to remove init_trampoline.
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
+ return MemoryLocation();
+}
+
+/// Return the location read by the specified "hasAnalyzableMemoryWrite"
+/// instruction if any.
+static MemoryLocation getLocForRead(Instruction *Inst,
+ const TargetLibraryInfo &TLI) {
+ assert(hasAnalyzableMemoryWrite(Inst, TLI) && "Unknown instruction case");
+
+ // The only instructions that both read and write are the mem transfer
+ // instructions (memcpy/memmove).
+ if (auto *MTI = dyn_cast<AnyMemTransferInst>(Inst))
+ return MemoryLocation::getForSource(MTI);
+ return MemoryLocation();
+}
+
+/// If the value of this instruction and the memory it writes to is unused, may
+/// we delete this instruction?
+static bool isRemovable(Instruction *I) {
+ // Don't remove volatile/atomic stores.
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: llvm_unreachable("doesn't pass 'hasAnalyzableMemoryWrite' predicate");
+ case Intrinsic::lifetime_end:
+ // Never remove dead lifetime_end's, e.g. because it is followed by a
+ // free.
+ return false;
+ case Intrinsic::init_trampoline:
+ // Always safe to remove init_trampoline.
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
case Intrinsic::memcpy_inline:
- // Don't remove volatile memory intrinsics.
- return !cast<MemIntrinsic>(II)->isVolatile();
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
+ // Don't remove volatile memory intrinsics.
+ return !cast<MemIntrinsic>(II)->isVolatile();
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
case Intrinsic::masked_store:
- return true;
- }
- }
-
- // note: only get here for calls with analyzable writes - i.e. libcalls
- if (auto *CB = dyn_cast<CallBase>(I))
- return CB->use_empty();
-
- return false;
-}
-
-/// Returns true if the end of this instruction can be safely shortened in
-/// length.
-static bool isShortenableAtTheEnd(Instruction *I) {
- // Don't shorten stores for now
- if (isa<StoreInst>(I))
- return false;
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default: return false;
- case Intrinsic::memset:
- case Intrinsic::memcpy:
- case Intrinsic::memcpy_element_unordered_atomic:
- case Intrinsic::memset_element_unordered_atomic:
- // Do shorten memory intrinsics.
- // FIXME: Add memmove if it's also safe to transform.
- return true;
- }
- }
-
- // Don't shorten libcalls for now.
-
- return false;
-}
-
-/// Returns true if the beginning of this instruction can be safely shortened
-/// in length.
-static bool isShortenableAtTheBeginning(Instruction *I) {
- // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
- // easily done by offsetting the source address.
- return isa<AnyMemSetInst>(I);
-}
-
-/// Return the pointer that is being written to.
+ return true;
+ }
+ }
+
+ // note: only get here for calls with analyzable writes - i.e. libcalls
+ if (auto *CB = dyn_cast<CallBase>(I))
+ return CB->use_empty();
+
+ return false;
+}
+
+/// Returns true if the end of this instruction can be safely shortened in
+/// length.
+static bool isShortenableAtTheEnd(Instruction *I) {
+ // Don't shorten stores for now
+ if (isa<StoreInst>(I))
+ return false;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default: return false;
+ case Intrinsic::memset:
+ case Intrinsic::memcpy:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ case Intrinsic::memset_element_unordered_atomic:
+ // Do shorten memory intrinsics.
+ // FIXME: Add memmove if it's also safe to transform.
+ return true;
+ }
+ }
+
+ // Don't shorten libcalls for now.
+
+ return false;
+}
+
+/// Returns true if the beginning of this instruction can be safely shortened
+/// in length.
+static bool isShortenableAtTheBeginning(Instruction *I) {
+ // FIXME: Handle only memset for now. Supporting memcpy/memmove should be
+ // easily done by offsetting the source address.
+ return isa<AnyMemSetInst>(I);
+}
+
+/// Return the pointer that is being written to.
static Value *getStoredPointerOperand(Instruction *I,
const TargetLibraryInfo &TLI) {
- //TODO: factor this to reuse getLocForWrite
+ //TODO: factor this to reuse getLocForWrite
MemoryLocation Loc = getLocForWrite(I, TLI);
- assert(Loc.Ptr &&
- "unable to find pointer written for analyzable instruction?");
- // TODO: most APIs don't expect const Value *
- return const_cast<Value*>(Loc.Ptr);
-}
-
-static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
- const TargetLibraryInfo &TLI,
- const Function *F) {
- uint64_t Size;
- ObjectSizeOpts Opts;
- Opts.NullIsUnknownSize = NullPointerIsDefined(F);
-
- if (getObjectSize(V, Size, DL, &TLI, Opts))
- return Size;
- return MemoryLocation::UnknownSize;
-}
-
-namespace {
-
-enum OverwriteResult {
- OW_Begin,
- OW_Complete,
- OW_End,
- OW_PartialEarlierWithFullLater,
+ assert(Loc.Ptr &&
+ "unable to find pointer written for analyzable instruction?");
+ // TODO: most APIs don't expect const Value *
+ return const_cast<Value*>(Loc.Ptr);
+}
+
+static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
+ const TargetLibraryInfo &TLI,
+ const Function *F) {
+ uint64_t Size;
+ ObjectSizeOpts Opts;
+ Opts.NullIsUnknownSize = NullPointerIsDefined(F);
+
+ if (getObjectSize(V, Size, DL, &TLI, Opts))
+ return Size;
+ return MemoryLocation::UnknownSize;
+}
+
+namespace {
+
+enum OverwriteResult {
+ OW_Begin,
+ OW_Complete,
+ OW_End,
+ OW_PartialEarlierWithFullLater,
OW_MaybePartial,
- OW_Unknown
-};
-
-} // end anonymous namespace
-
+ OW_Unknown
+};
+
+} // end anonymous namespace
+
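The OverwriteResult values above are easiest to read as statements about byte intervals. The sketch below classifies a later store against an earlier one purely by interval arithmetic, assuming both sizes are precise and the two pointers are already known to share a base; the real isOverwrite additionally handles aliasing, whole-object overwrites, and the OW_PartialEarlierWithFullLater / OW_MaybePartial cases. All names here are made up for illustration.

#include <cstdint>

enum class Overwrite { Complete, End, Begin, Unknown };

// Classify how a later store [LaterOff, LaterOff+LaterSize) relates to an
// earlier store [EarlierOff, EarlierOff+EarlierSize) on the same base pointer.
// This mirrors only the interval arithmetic of the overwrite check.
Overwrite classify(int64_t EarlierOff, uint64_t EarlierSize,
                   int64_t LaterOff, uint64_t LaterSize) {
  int64_t EarlierEnd = EarlierOff + int64_t(EarlierSize);
  int64_t LaterEnd = LaterOff + int64_t(LaterSize);
  if (LaterOff <= EarlierOff && LaterEnd >= EarlierEnd)
    return Overwrite::Complete;   // earlier store lies fully inside the later one
  if (LaterOff > EarlierOff && LaterOff < EarlierEnd && LaterEnd >= EarlierEnd)
    return Overwrite::End;        // later store clobbers the earlier store's tail
  if (LaterOff <= EarlierOff && LaterEnd > EarlierOff)
    return Overwrite::Begin;      // later store clobbers the earlier store's head
  return Overwrite::Unknown;
}

// Examples: earlier = [0, 8), later = [0, 8)  -> Complete
//           earlier = [0, 8), later = [4, 12) -> End
//           earlier = [4, 12), later = [0, 8) -> Begin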
/// Check if two instructions are masked stores that completely
/// overwrite one another. More specifically, \p Later has to
/// overwrite \p Earlier.
@@ -449,56 +449,56 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
const DataLayout &DL, const TargetLibraryInfo &TLI,
int64_t &EarlierOff, int64_t &LaterOff, AATy &AA,
const Function *F) {
- // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
- // get imprecise values here, though (except for unknown sizes).
+ // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
+ // get imprecise values here, though (except for unknown sizes).
if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
// Masked stores have imprecise locations, but we can reason about them
// to some extent.
return isMaskedStoreOverwrite(LaterI, EarlierI, AA);
}
-
- const uint64_t LaterSize = Later.Size.getValue();
- const uint64_t EarlierSize = Earlier.Size.getValue();
-
- const Value *P1 = Earlier.Ptr->stripPointerCasts();
- const Value *P2 = Later.Ptr->stripPointerCasts();
-
- // If the start pointers are the same, we just have to compare sizes to see if
- // the later store was larger than the earlier store.
- if (P1 == P2 || AA.isMustAlias(P1, P2)) {
- // Make sure that the Later size is >= the Earlier size.
- if (LaterSize >= EarlierSize)
- return OW_Complete;
- }
-
- // Check to see if the later store is to the entire object (either a global,
- // an alloca, or a byval/inalloca argument). If so, then it clearly
- // overwrites any other store to the same object.
+
+ const uint64_t LaterSize = Later.Size.getValue();
+ const uint64_t EarlierSize = Earlier.Size.getValue();
+
+ const Value *P1 = Earlier.Ptr->stripPointerCasts();
+ const Value *P2 = Later.Ptr->stripPointerCasts();
+
+ // If the start pointers are the same, we just have to compare sizes to see if
+ // the later store was larger than the earlier store.
+ if (P1 == P2 || AA.isMustAlias(P1, P2)) {
+ // Make sure that the Later size is >= the Earlier size.
+ if (LaterSize >= EarlierSize)
+ return OW_Complete;
+ }
+
+ // Check to see if the later store is to the entire object (either a global,
+ // an alloca, or a byval/inalloca argument). If so, then it clearly
+ // overwrites any other store to the same object.
const Value *UO1 = getUnderlyingObject(P1), *UO2 = getUnderlyingObject(P2);
-
- // If we can't resolve the same pointers to the same object, then we can't
- // analyze them at all.
- if (UO1 != UO2)
- return OW_Unknown;
-
- // If the "Later" store is to a recognizable object, get its size.
- uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
- if (ObjectSize != MemoryLocation::UnknownSize)
- if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
- return OW_Complete;
-
- // Okay, we have stores to two completely different pointers. Try to
- // decompose the pointer into a "base + constant_offset" form. If the base
- // pointers are equal, then we can reason about the two stores.
- EarlierOff = 0;
- LaterOff = 0;
- const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
- const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
-
- // If the base pointers still differ, we have two completely different stores.
- if (BP1 != BP2)
- return OW_Unknown;
-
+
+ // If we can't resolve the same pointers to the same object, then we can't
+ // analyze them at all.
+ if (UO1 != UO2)
+ return OW_Unknown;
+
+ // If the "Later" store is to a recognizable object, get its size.
+ uint64_t ObjectSize = getPointerSize(UO2, DL, TLI, F);
+ if (ObjectSize != MemoryLocation::UnknownSize)
+ if (ObjectSize == LaterSize && ObjectSize >= EarlierSize)
+ return OW_Complete;
+
+ // Okay, we have stores to two completely different pointers. Try to
+ // decompose the pointer into a "base + constant_offset" form. If the base
+ // pointers are equal, then we can reason about the two stores.
+ EarlierOff = 0;
+ LaterOff = 0;
+ const Value *BP1 = GetPointerBaseWithConstantOffset(P1, EarlierOff, DL);
+ const Value *BP2 = GetPointerBaseWithConstantOffset(P2, LaterOff, DL);
+
+ // If the base pointers still differ, we have two completely different stores.
+ if (BP1 != BP2)
+ return OW_Unknown;
+
// The later access completely overlaps the earlier store if and only if
// both start and end of the earlier one is "inside" the later one:
// |<->|--earlier--|<->|
@@ -510,9 +510,9 @@ isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
// OR
// |----- earlier -----|
// |<->|---later---|<----->|
- //
- // We have to be careful here as *Off is signed while *.Size is unsigned.
-
+ //
+ // We have to be careful here as *Off is signed while *.Size is unsigned.
+
// Check if the earlier access starts "not before" the later one.
if (EarlierOff >= LaterOff) {
// If the earlier access ends "not after" the later access then the earlier
@@ -552,587 +552,587 @@ static OverwriteResult isPartialOverwrite(const MemoryLocation &Later,
InstOverlapIntervalsTy &IOL) {
const uint64_t LaterSize = Later.Size.getValue();
const uint64_t EarlierSize = Earlier.Size.getValue();
- // We may now overlap, although the overlap is not complete. There might also
- // be other incomplete overlaps, and together, they might cover the complete
- // earlier write.
- // Note: The correctness of this logic depends on the fact that this function
- // is never called with a DepWrite when there are any intervening reads.
- if (EnablePartialOverwriteTracking &&
- LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= EarlierOff) {
-
- // Insert our part of the overlap into the map.
- auto &IM = IOL[DepWrite];
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
- << ", " << int64_t(EarlierOff + EarlierSize)
- << ") Later [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
-
- // Make sure that we only insert non-overlapping intervals and combine
- // adjacent intervals. The intervals are stored in the map with the ending
- // offset as the key (in the half-open sense) and the starting offset as
- // the value.
- int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
-
- // Find any intervals ending at, or after, LaterIntStart which start
- // before LaterIntEnd.
- auto ILI = IM.lower_bound(LaterIntStart);
- if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
- // This existing interval is overlapped with the current store somewhere
- // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
- // intervals and adjusting our start and end.
- LaterIntStart = std::min(LaterIntStart, ILI->second);
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
- ILI = IM.erase(ILI);
-
- // Continue erasing and adjusting our end in case other previous
- // intervals are also overlapped with the current store.
- //
- // |--- earlier 1 ---| |--- earlier 2 ---|
- // |------- later---------|
- //
- while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
- assert(ILI->second > LaterIntStart && "Unexpected interval");
- LaterIntEnd = std::max(LaterIntEnd, ILI->first);
- ILI = IM.erase(ILI);
- }
- }
-
- IM[LaterIntEnd] = LaterIntStart;
-
- ILI = IM.begin();
- if (ILI->second <= EarlierOff &&
- ILI->first >= int64_t(EarlierOff + EarlierSize)) {
- LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") Composite Later [" << ILI->second << ", "
- << ILI->first << ")\n");
- ++NumCompletePartials;
- return OW_Complete;
- }
- }
-
- // Check for an earlier store which writes to all the memory locations that
- // the later store writes to.
- if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
- int64_t(EarlierOff + EarlierSize) > LaterOff &&
- uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
- LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
- << EarlierOff << ", "
- << int64_t(EarlierOff + EarlierSize)
- << ") by a later store [" << LaterOff << ", "
- << int64_t(LaterOff + LaterSize) << ")\n");
- // TODO: Maybe come up with a better name?
- return OW_PartialEarlierWithFullLater;
- }
-
- // Another interesting case is if the later store overwrites the end of the
- // earlier store.
- //
- // |--earlier--|
- // |-- later --|
- //
- // In this case we may want to trim the size of earlier to avoid generating
- // writes to addresses which will definitely be overwritten later
- if (!EnablePartialOverwriteTracking &&
- (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
- int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
- return OW_End;
-
- // Finally, we also need to check if the later store overwrites the beginning
- // of the earlier store.
- //
- // |--earlier--|
- // |-- later --|
- //
- // In this case we may want to move the destination address and trim the size
- // of earlier to avoid generating writes to addresses which will definitely
- // be overwritten later.
- if (!EnablePartialOverwriteTracking &&
- (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
- assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
- "Expect to be handled as OW_Complete");
- return OW_Begin;
- }
- // Otherwise, they don't completely overlap.
- return OW_Unknown;
-}
-
-/// If 'Inst' might be a self read (i.e. a noop copy of a
-/// memory region into an identical pointer) then it doesn't actually make its
-/// input dead in the traditional sense. Consider this case:
-///
-/// memmove(A <- B)
-/// memmove(A <- A)
-///
-/// In this case, the second store to A does not make the first store to A dead.
-/// The usual situation isn't an explicit A<-A store like this (which can be
-/// trivially removed) but a case where two pointers may alias.
-///
-/// This function detects when it is unsafe to remove a dependent instruction
-/// because the DSE inducing instruction may be a self-read.
-static bool isPossibleSelfRead(Instruction *Inst,
- const MemoryLocation &InstStoreLoc,
- Instruction *DepWrite,
- const TargetLibraryInfo &TLI,
- AliasAnalysis &AA) {
- // Self reads can only happen for instructions that read memory. Get the
- // location read.
- MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
- if (!InstReadLoc.Ptr)
- return false; // Not a reading instruction.
-
- // If the read and written loc obviously don't alias, it isn't a read.
- if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
- return false;
-
- if (isa<AnyMemCpyInst>(Inst)) {
- // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
- // but in practice memcpy(A <- B) either means that A and B are disjoint or
- // are equal (i.e. there are no partial overlaps). Given that, if we have:
- //
- // memcpy/memmove(A <- B) // DepWrite
- // memcpy(A <- B) // Inst
- //
- // with Inst reading/writing a size >= that of DepWrite, we can reason as
- // follows:
- //
- // - If A == B then both the copies are no-ops, so the DepWrite can be
- // removed.
- // - If A != B then A and B are disjoint locations in Inst. Since
- // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
- // Therefore DepWrite can be removed.
- MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
-
- if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
- return false;
- }
-
- // If DepWrite doesn't read memory or if we can't prove it is a must alias,
- // then it can't be considered dead.
- return true;
-}
-
-/// Returns true if the memory which is accessed by the second instruction is not
-/// modified between the first and the second instruction.
-/// Precondition: Second instruction must be dominated by the first
-/// instruction.
+ // We may now overlap, although the overlap is not complete. There might also
+ // be other incomplete overlaps, and together, they might cover the complete
+ // earlier write.
+ // Note: The correctness of this logic depends on the fact that this function
+ // is never called with a DepWrite when there are any intervening reads.
+ if (EnablePartialOverwriteTracking &&
+ LaterOff < int64_t(EarlierOff + EarlierSize) &&
+ int64_t(LaterOff + LaterSize) >= EarlierOff) {
+
+ // Insert our part of the overlap into the map.
+ auto &IM = IOL[DepWrite];
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite: Earlier [" << EarlierOff
+ << ", " << int64_t(EarlierOff + EarlierSize)
+ << ") Later [" << LaterOff << ", "
+ << int64_t(LaterOff + LaterSize) << ")\n");
+
+ // Make sure that we only insert non-overlapping intervals and combine
+ // adjacent intervals. The intervals are stored in the map with the ending
+ // offset as the key (in the half-open sense) and the starting offset as
+ // the value.
+ int64_t LaterIntStart = LaterOff, LaterIntEnd = LaterOff + LaterSize;
+
+ // Find any intervals ending at, or after, LaterIntStart which start
+ // before LaterIntEnd.
+ auto ILI = IM.lower_bound(LaterIntStart);
+ if (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ // This existing interval is overlapped with the current store somewhere
+ // in [LaterIntStart, LaterIntEnd]. Merge them by erasing the existing
+ // intervals and adjusting our start and end.
+ LaterIntStart = std::min(LaterIntStart, ILI->second);
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+
+ // Continue erasing and adjusting our end in case other previous
+ // intervals are also overlapped with the current store.
+ //
+ // |--- earlier 1 ---| |--- earlier 2 ---|
+ // |------- later---------|
+ //
+ while (ILI != IM.end() && ILI->second <= LaterIntEnd) {
+ assert(ILI->second > LaterIntStart && "Unexpected interval");
+ LaterIntEnd = std::max(LaterIntEnd, ILI->first);
+ ILI = IM.erase(ILI);
+ }
+ }
+
+ IM[LaterIntEnd] = LaterIntStart;
+
+ ILI = IM.begin();
+ if (ILI->second <= EarlierOff &&
+ ILI->first >= int64_t(EarlierOff + EarlierSize)) {
+ LLVM_DEBUG(dbgs() << "DSE: Full overwrite from partials: Earlier ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + EarlierSize)
+ << ") Composite Later [" << ILI->second << ", "
+ << ILI->first << ")\n");
+ ++NumCompletePartials;
+ return OW_Complete;
+ }
+ }
+
+ // Check for an earlier store which writes to all the memory locations that
+ // the later store writes to.
+ if (EnablePartialStoreMerging && LaterOff >= EarlierOff &&
+ int64_t(EarlierOff + EarlierSize) > LaterOff &&
+ uint64_t(LaterOff - EarlierOff) + LaterSize <= EarlierSize) {
+ LLVM_DEBUG(dbgs() << "DSE: Partial overwrite an earlier load ["
+ << EarlierOff << ", "
+ << int64_t(EarlierOff + EarlierSize)
+ << ") by a later store [" << LaterOff << ", "
+ << int64_t(LaterOff + LaterSize) << ")\n");
+ // TODO: Maybe come up with a better name?
+ return OW_PartialEarlierWithFullLater;
+ }
+
+ // Another interesting case is if the later store overwrites the end of the
+ // earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to trim the size of earlier to avoid generating
+ // writes to addresses which will definitely be overwritten later
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + EarlierSize) &&
+ int64_t(LaterOff + LaterSize) >= int64_t(EarlierOff + EarlierSize)))
+ return OW_End;
+
+ // Finally, we also need to check if the later store overwrites the beginning
+ // of the earlier store.
+ //
+ // |--earlier--|
+ // |-- later --|
+ //
+ // In this case we may want to move the destination address and trim the size
+ // of earlier to avoid generating writes to addresses which will definitely
+ // be overwritten later.
+ if (!EnablePartialOverwriteTracking &&
+ (LaterOff <= EarlierOff && int64_t(LaterOff + LaterSize) > EarlierOff)) {
+ assert(int64_t(LaterOff + LaterSize) < int64_t(EarlierOff + EarlierSize) &&
+ "Expect to be handled as OW_Complete");
+ return OW_Begin;
+ }
+ // Otherwise, they don't completely overlap.
+ return OW_Unknown;
+}
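
The interval bookkeeping above is compact but easy to misread, so here is a minimal standalone sketch of the same idea (a plain std::map rather than the OverlapIntervalsTy used in this file, with an invented driver): intervals are keyed by their half-open end offset with the start offset as the value, overlapping or touching intervals are merged on insert, and the earlier write is known to be fully covered once a single merged interval spans it. This is an illustrative reimplementation, not the code from this file.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <map>

// end offset -> start offset, mirroring the layout described in the comments.
using OverlapIntervals = std::map<int64_t, int64_t>;

static void insertInterval(OverlapIntervals &IM, int64_t Start, int64_t End) {
  // lower_bound(Start) finds every existing interval that ends at or after
  // Start; merge with each of those that also starts at or before End.
  auto It = IM.lower_bound(Start);
  while (It != IM.end() && It->second <= End) {
    Start = std::min(Start, It->second);
    End = std::max(End, It->first);
    It = IM.erase(It);
  }
  IM[End] = Start;
}

static bool coversEarlierWrite(const OverlapIntervals &IM, int64_t Off,
                               uint64_t Size) {
  // After merging, full coverage shows up as one interval spanning the write.
  return !IM.empty() && IM.begin()->second <= Off &&
         IM.begin()->first >= Off + (int64_t)Size;
}

int main() {
  OverlapIntervals IM;
  insertInterval(IM, 0, 4);                          // later store over [0, 4)
  insertInterval(IM, 4, 8);                          // later store over [4, 8)
  std::cout << coversEarlierWrite(IM, 0, 8) << '\n'; // 1: earlier [0, 8) is dead
  return 0;
}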
+
+/// If 'Inst' might be a self read (i.e. a noop copy of a
+/// memory region into an identical pointer) then it doesn't actually make its
+/// input dead in the traditional sense. Consider this case:
+///
+/// memmove(A <- B)
+/// memmove(A <- A)
+///
+/// In this case, the second store to A does not make the first store to A dead.
+/// The usual situation isn't an explicit A<-A store like this (which can be
+/// trivially removed) but a case where two pointers may alias.
+///
+/// This function detects when it is unsafe to remove a dependent instruction
+/// because the DSE inducing instruction may be a self-read.
+static bool isPossibleSelfRead(Instruction *Inst,
+ const MemoryLocation &InstStoreLoc,
+ Instruction *DepWrite,
+ const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA) {
+ // Self reads can only happen for instructions that read memory. Get the
+ // location read.
+ MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
+ if (!InstReadLoc.Ptr)
+ return false; // Not a reading instruction.
+
+ // If the read and written loc obviously don't alias, it isn't a read.
+ if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
+ return false;
+
+ if (isa<AnyMemCpyInst>(Inst)) {
+ // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+ // but in practice memcpy(A <- B) either means that A and B are disjoint or
+    // are equal (i.e. there are no partial overlaps). Given that, if we have:
+ //
+ // memcpy/memmove(A <- B) // DepWrite
+ // memcpy(A <- B) // Inst
+ //
+    // with Inst reading/writing a size >= DepWrite's, we can reason as
+ // follows:
+ //
+ // - If A == B then both the copies are no-ops, so the DepWrite can be
+ // removed.
+ // - If A != B then A and B are disjoint locations in Inst. Since
+ // Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
+ // Therefore DepWrite can be removed.
+ MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+
+ if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+ return false;
+ }
+
+ // If DepWrite doesn't read memory or if we can't prove it is a must alias,
+ // then it can't be considered dead.
+ return true;
+}
+
+/// Returns true if the memory which is accessed by the second instruction is not
+/// modified between the first and the second instruction.
+/// Precondition: Second instruction must be dominated by the first
+/// instruction.
template <typename AATy>
static bool
memoryIsNotModifiedBetween(Instruction *FirstI, Instruction *SecondI, AATy &AA,
const DataLayout &DL, DominatorTree *DT) {
- // Do a backwards scan through the CFG from SecondI to FirstI. Look for
- // instructions which can modify the memory location accessed by SecondI.
- //
- // While doing the walk keep track of the address to check. It might be
- // different in different basic blocks due to PHI translation.
- using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
- SmallVector<BlockAddressPair, 16> WorkList;
- // Keep track of the address we visited each block with. Bail out if we
- // visit a block with different addresses.
- DenseMap<BasicBlock *, Value *> Visited;
-
- BasicBlock::iterator FirstBBI(FirstI);
- ++FirstBBI;
- BasicBlock::iterator SecondBBI(SecondI);
- BasicBlock *FirstBB = FirstI->getParent();
- BasicBlock *SecondBB = SecondI->getParent();
- MemoryLocation MemLoc = MemoryLocation::get(SecondI);
- auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
-
- // Start checking the SecondBB.
- WorkList.push_back(
- std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
- bool isFirstBlock = true;
-
- // Check all blocks going backward until we reach the FirstBB.
- while (!WorkList.empty()) {
- BlockAddressPair Current = WorkList.pop_back_val();
- BasicBlock *B = Current.first;
- PHITransAddr &Addr = Current.second;
- Value *Ptr = Addr.getAddr();
-
- // Ignore instructions before FirstI if this is the FirstBB.
- BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
-
- BasicBlock::iterator EI;
- if (isFirstBlock) {
- // Ignore instructions after SecondI if this is the first visit of SecondBB.
- assert(B == SecondBB && "first block is not the store block");
- EI = SecondBBI;
- isFirstBlock = false;
- } else {
- // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
- // In this case we also have to look at instructions after SecondI.
- EI = B->end();
- }
- for (; BI != EI; ++BI) {
- Instruction *I = &*BI;
- if (I->mayWriteToMemory() && I != SecondI)
+ // Do a backwards scan through the CFG from SecondI to FirstI. Look for
+ // instructions which can modify the memory location accessed by SecondI.
+ //
+ // While doing the walk keep track of the address to check. It might be
+ // different in different basic blocks due to PHI translation.
+ using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
+ SmallVector<BlockAddressPair, 16> WorkList;
+ // Keep track of the address we visited each block with. Bail out if we
+ // visit a block with different addresses.
+ DenseMap<BasicBlock *, Value *> Visited;
+
+ BasicBlock::iterator FirstBBI(FirstI);
+ ++FirstBBI;
+ BasicBlock::iterator SecondBBI(SecondI);
+ BasicBlock *FirstBB = FirstI->getParent();
+ BasicBlock *SecondBB = SecondI->getParent();
+ MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+ auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
+
+ // Start checking the SecondBB.
+ WorkList.push_back(
+ std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
+ bool isFirstBlock = true;
+
+ // Check all blocks going backward until we reach the FirstBB.
+ while (!WorkList.empty()) {
+ BlockAddressPair Current = WorkList.pop_back_val();
+ BasicBlock *B = Current.first;
+ PHITransAddr &Addr = Current.second;
+ Value *Ptr = Addr.getAddr();
+
+ // Ignore instructions before FirstI if this is the FirstBB.
+ BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
+
+ BasicBlock::iterator EI;
+ if (isFirstBlock) {
+ // Ignore instructions after SecondI if this is the first visit of SecondBB.
+ assert(B == SecondBB && "first block is not the store block");
+ EI = SecondBBI;
+ isFirstBlock = false;
+ } else {
+ // It's not SecondBB or (in case of a loop) the second visit of SecondBB.
+ // In this case we also have to look at instructions after SecondI.
+ EI = B->end();
+ }
+ for (; BI != EI; ++BI) {
+ Instruction *I = &*BI;
+ if (I->mayWriteToMemory() && I != SecondI)
if (isModSet(AA.getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
- return false;
- }
- if (B != FirstBB) {
- assert(B != &FirstBB->getParent()->getEntryBlock() &&
- "Should not hit the entry block because SI must be dominated by LI");
- for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
- PHITransAddr PredAddr = Addr;
- if (PredAddr.NeedsPHITranslationFromBlock(B)) {
- if (!PredAddr.IsPotentiallyPHITranslatable())
- return false;
- if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
- return false;
- }
- Value *TranslatedPtr = PredAddr.getAddr();
- auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
- if (!Inserted.second) {
- // We already visited this block before. If it was with a different
- // address - bail out!
- if (TranslatedPtr != Inserted.first->second)
- return false;
- // ... otherwise just skip it.
- continue;
- }
- WorkList.push_back(std::make_pair(*PredI, PredAddr));
- }
- }
- }
- return true;
-}
-
-/// Find all blocks that will unconditionally lead to the block BB and append
-/// them to F.
-static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
- BasicBlock *BB, DominatorTree *DT) {
- for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
- BasicBlock *Pred = *I;
- if (Pred == BB) continue;
- Instruction *PredTI = Pred->getTerminator();
- if (PredTI->getNumSuccessors() != 1)
- continue;
-
- if (DT->isReachableFromEntry(Pred))
- Blocks.push_back(Pred);
- }
-}
-
-/// Handle frees of entire structures whose dependency is a store
-/// to a field of that structure.
-static bool handleFree(CallInst *F, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst) {
- bool MadeChange = false;
-
+ return false;
+ }
+ if (B != FirstBB) {
+ assert(B != &FirstBB->getParent()->getEntryBlock() &&
+ "Should not hit the entry block because SI must be dominated by LI");
+ for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
+ PHITransAddr PredAddr = Addr;
+ if (PredAddr.NeedsPHITranslationFromBlock(B)) {
+ if (!PredAddr.IsPotentiallyPHITranslatable())
+ return false;
+ if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
+ return false;
+ }
+ Value *TranslatedPtr = PredAddr.getAddr();
+ auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
+ if (!Inserted.second) {
+ // We already visited this block before. If it was with a different
+ // address - bail out!
+ if (TranslatedPtr != Inserted.first->second)
+ return false;
+ // ... otherwise just skip it.
+ continue;
+ }
+ WorkList.push_back(std::make_pair(*PredI, PredAddr));
+ }
+ }
+ }
+ return true;
+}
+
+/// Find all blocks that will unconditionally lead to the block BB and append
+/// them to F.
+static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
+ BasicBlock *BB, DominatorTree *DT) {
+ for (pred_iterator I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ BasicBlock *Pred = *I;
+ if (Pred == BB) continue;
+ Instruction *PredTI = Pred->getTerminator();
+ if (PredTI->getNumSuccessors() != 1)
+ continue;
+
+ if (DT->isReachableFromEntry(Pred))
+ Blocks.push_back(Pred);
+ }
+}
+
+/// Handle frees of entire structures whose dependency is a store
+/// to a field of that structure.
+static bool handleFree(CallInst *F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst) {
+ bool MadeChange = false;
+
MemoryLocation Loc = MemoryLocation::getAfter(F->getOperand(0));
- SmallVector<BasicBlock *, 16> Blocks;
- Blocks.push_back(F->getParent());
-
- while (!Blocks.empty()) {
- BasicBlock *BB = Blocks.pop_back_val();
- Instruction *InstPt = BB->getTerminator();
- if (BB == F->getParent()) InstPt = F;
-
- MemDepResult Dep =
- MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
- while (Dep.isDef() || Dep.isClobber()) {
- Instruction *Dependency = Dep.getInst();
- if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
- !isRemovable(Dependency))
- break;
-
- Value *DepPointer =
+ SmallVector<BasicBlock *, 16> Blocks;
+ Blocks.push_back(F->getParent());
+
+ while (!Blocks.empty()) {
+ BasicBlock *BB = Blocks.pop_back_val();
+ Instruction *InstPt = BB->getTerminator();
+ if (BB == F->getParent()) InstPt = F;
+
+ MemDepResult Dep =
+ MD->getPointerDependencyFrom(Loc, false, InstPt->getIterator(), BB);
+ while (Dep.isDef() || Dep.isClobber()) {
+ Instruction *Dependency = Dep.getInst();
+ if (!hasAnalyzableMemoryWrite(Dependency, *TLI) ||
+ !isRemovable(Dependency))
+ break;
+
+ Value *DepPointer =
getUnderlyingObject(getStoredPointerOperand(Dependency, *TLI));
-
- // Check for aliasing.
- if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
- break;
-
- LLVM_DEBUG(
- dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
- << *Dependency << '\n');
-
- // DCE instructions only used to calculate that store.
- BasicBlock::iterator BBI(Dependency);
- deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- ++NumFastStores;
- MadeChange = true;
-
- // Inst's old Dependency is now deleted. Compute the next dependency,
- // which may also be dead, as in
- // s[0] = 0;
- // s[1] = 0; // This has just been deleted.
- // free(s);
- Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
- }
-
- if (Dep.isNonLocal())
- findUnconditionalPreds(Blocks, BB, DT);
- }
-
- return MadeChange;
-}
-
-/// Check to see if the specified location may alias any of the stack objects in
-/// the DeadStackObjects set. If so, they become live because the location is
-/// being loaded.
-static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
- SmallSetVector<const Value *, 16> &DeadStackObjects,
- const DataLayout &DL, AliasAnalysis *AA,
- const TargetLibraryInfo *TLI,
- const Function *F) {
+
+ // Check for aliasing.
+ if (!AA->isMustAlias(F->getArgOperand(0), DepPointer))
+ break;
+
+ LLVM_DEBUG(
+ dbgs() << "DSE: Dead Store to soon to be freed memory:\n DEAD: "
+ << *Dependency << '\n');
+
+ // DCE instructions only used to calculate that store.
+ BasicBlock::iterator BBI(Dependency);
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // Inst's old Dependency is now deleted. Compute the next dependency,
+ // which may also be dead, as in
+ // s[0] = 0;
+ // s[1] = 0; // This has just been deleted.
+ // free(s);
+ Dep = MD->getPointerDependencyFrom(Loc, false, BBI, BB);
+ }
+
+ if (Dep.isNonLocal())
+ findUnconditionalPreds(Blocks, BB, DT);
+ }
+
+ return MadeChange;
+}
+
+/// Check to see if the specified location may alias any of the stack objects in
+/// the DeadStackObjects set. If so, they become live because the location is
+/// being loaded.
+static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
+ SmallSetVector<const Value *, 16> &DeadStackObjects,
+ const DataLayout &DL, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI,
+ const Function *F) {
const Value *UnderlyingPointer = getUnderlyingObject(LoadedLoc.Ptr);
-
- // A constant can't be in the dead pointer set.
- if (isa<Constant>(UnderlyingPointer))
- return;
-
- // If the kill pointer can be easily reduced to an alloca, don't bother doing
- // extraneous AA queries.
- if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
- DeadStackObjects.remove(UnderlyingPointer);
- return;
- }
-
- // Remove objects that could alias LoadedLoc.
- DeadStackObjects.remove_if([&](const Value *I) {
- // See if the loaded location could alias the stack location.
- MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
- return !AA->isNoAlias(StackLoc, LoadedLoc);
- });
-}
-
-/// Remove dead stores to stack-allocated locations in the function end block.
-/// Ex:
-/// %A = alloca i32
-/// ...
-/// store i32 1, i32* %A
-/// ret void
-static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
- MemoryDependenceResults *MD,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst) {
- bool MadeChange = false;
-
- // Keep track of all of the stack objects that are dead at the end of the
- // function.
- SmallSetVector<const Value*, 16> DeadStackObjects;
-
- // Find all of the alloca'd pointers in the entry block.
- BasicBlock &Entry = BB.getParent()->front();
- for (Instruction &I : Entry) {
- if (isa<AllocaInst>(&I))
- DeadStackObjects.insert(&I);
-
- // Okay, so these are dead heap objects, but if the pointer never escapes
- // then it's leaked by this function anyways.
- else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
- DeadStackObjects.insert(&I);
- }
-
- // Treat byval or inalloca arguments the same, stores to them are dead at the
- // end of the function.
- for (Argument &AI : BB.getParent()->args())
+
+ // A constant can't be in the dead pointer set.
+ if (isa<Constant>(UnderlyingPointer))
+ return;
+
+ // If the kill pointer can be easily reduced to an alloca, don't bother doing
+ // extraneous AA queries.
+ if (isa<AllocaInst>(UnderlyingPointer) || isa<Argument>(UnderlyingPointer)) {
+ DeadStackObjects.remove(UnderlyingPointer);
+ return;
+ }
+
+ // Remove objects that could alias LoadedLoc.
+ DeadStackObjects.remove_if([&](const Value *I) {
+ // See if the loaded location could alias the stack location.
+ MemoryLocation StackLoc(I, getPointerSize(I, DL, *TLI, F));
+ return !AA->isNoAlias(StackLoc, LoadedLoc);
+ });
+}
+
+/// Remove dead stores to stack-allocated locations in the function end block.
+/// Ex:
+/// %A = alloca i32
+/// ...
+/// store i32 1, i32* %A
+/// ret void
+static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst) {
+ bool MadeChange = false;
+
+ // Keep track of all of the stack objects that are dead at the end of the
+ // function.
+ SmallSetVector<const Value*, 16> DeadStackObjects;
+
+ // Find all of the alloca'd pointers in the entry block.
+ BasicBlock &Entry = BB.getParent()->front();
+ for (Instruction &I : Entry) {
+ if (isa<AllocaInst>(&I))
+ DeadStackObjects.insert(&I);
+
+ // Okay, so these are dead heap objects, but if the pointer never escapes
+ // then it's leaked by this function anyways.
+ else if (isAllocLikeFn(&I, TLI) && !PointerMayBeCaptured(&I, true, true))
+ DeadStackObjects.insert(&I);
+ }
+
+ // Treat byval or inalloca arguments the same, stores to them are dead at the
+ // end of the function.
+ for (Argument &AI : BB.getParent()->args())
if (AI.hasPassPointeeByValueCopyAttr())
- DeadStackObjects.insert(&AI);
-
- const DataLayout &DL = BB.getModule()->getDataLayout();
-
- // Scan the basic block backwards
- for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
- --BBI;
-
- // If we find a store, check to see if it points into a dead stack value.
- if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
- // See through pointer-to-pointer bitcasts
- SmallVector<const Value *, 4> Pointers;
+ DeadStackObjects.insert(&AI);
+
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+
+ // Scan the basic block backwards
+ for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+ --BBI;
+
+ // If we find a store, check to see if it points into a dead stack value.
+ if (hasAnalyzableMemoryWrite(&*BBI, *TLI) && isRemovable(&*BBI)) {
+ // See through pointer-to-pointer bitcasts
+ SmallVector<const Value *, 4> Pointers;
getUnderlyingObjects(getStoredPointerOperand(&*BBI, *TLI), Pointers);
-
- // Stores to stack values are valid candidates for removal.
- bool AllDead = true;
- for (const Value *Pointer : Pointers)
- if (!DeadStackObjects.count(Pointer)) {
- AllDead = false;
- break;
- }
-
- if (AllDead) {
- Instruction *Dead = &*BBI;
-
- LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
- << *Dead << "\n Objects: ";
- for (SmallVectorImpl<const Value *>::iterator I =
- Pointers.begin(),
- E = Pointers.end();
- I != E; ++I) {
- dbgs() << **I;
- if (std::next(I) != E)
- dbgs() << ", ";
- } dbgs()
- << '\n');
-
- // DCE instructions only used to calculate that store.
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
- &DeadStackObjects);
- ++NumFastStores;
- MadeChange = true;
- continue;
- }
- }
-
- // Remove any dead non-memory-mutating instructions.
- if (isInstructionTriviallyDead(&*BBI, TLI)) {
- LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
- << *&*BBI << '\n');
- deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
- &DeadStackObjects);
- ++NumFastOther;
- MadeChange = true;
- continue;
- }
-
- if (isa<AllocaInst>(BBI)) {
- // Remove allocas from the list of dead stack objects; there can't be
- // any references before the definition.
- DeadStackObjects.remove(&*BBI);
- continue;
- }
-
- if (auto *Call = dyn_cast<CallBase>(&*BBI)) {
- // Remove allocation function calls from the list of dead stack objects;
- // there can't be any references before the definition.
- if (isAllocLikeFn(&*BBI, TLI))
- DeadStackObjects.remove(&*BBI);
-
- // If this call does not access memory, it can't be loading any of our
- // pointers.
- if (AA->doesNotAccessMemory(Call))
- continue;
-
- // If the call might load from any of our allocas, then any store above
- // the call is live.
- DeadStackObjects.remove_if([&](const Value *I) {
- // See if the call site touches the value.
- return isRefSet(AA->getModRefInfo(
- Call, I, getPointerSize(I, DL, *TLI, BB.getParent())));
- });
-
- // If all of the allocas were clobbered by the call then we're not going
- // to find anything else to process.
- if (DeadStackObjects.empty())
- break;
-
- continue;
- }
-
- // We can remove the dead stores, irrespective of the fence and its ordering
- // (release/acquire/seq_cst). Fences only constraints the ordering of
- // already visible stores, it does not make a store visible to other
- // threads. So, skipping over a fence does not change a store from being
- // dead.
- if (isa<FenceInst>(*BBI))
- continue;
-
- MemoryLocation LoadedLoc;
-
- // If we encounter a use of the pointer, it is no longer considered dead
- if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
- if (!L->isUnordered()) // Be conservative with atomic/volatile load
- break;
- LoadedLoc = MemoryLocation::get(L);
- } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
- LoadedLoc = MemoryLocation::get(V);
- } else if (!BBI->mayReadFromMemory()) {
- // Instruction doesn't read memory. Note that stores that weren't removed
- // above will hit this case.
- continue;
- } else {
- // Unknown inst; assume it clobbers everything.
- break;
- }
-
- // Remove any allocas from the DeadPointer set that are loaded, as this
- // makes any stores above the access live.
- removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
-
- // If all of the allocas were clobbered by the access then we're not going
- // to find anything else to process.
- if (DeadStackObjects.empty())
- break;
- }
-
- return MadeChange;
-}
-
-static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
+
+ // Stores to stack values are valid candidates for removal.
+ bool AllDead = true;
+ for (const Value *Pointer : Pointers)
+ if (!DeadStackObjects.count(Pointer)) {
+ AllDead = false;
+ break;
+ }
+
+ if (AllDead) {
+ Instruction *Dead = &*BBI;
+
+ LLVM_DEBUG(dbgs() << "DSE: Dead Store at End of Block:\n DEAD: "
+ << *Dead << "\n Objects: ";
+ for (SmallVectorImpl<const Value *>::iterator I =
+ Pointers.begin(),
+ E = Pointers.end();
+ I != E; ++I) {
+ dbgs() << **I;
+ if (std::next(I) != E)
+ dbgs() << ", ";
+ } dbgs()
+ << '\n');
+
+ // DCE instructions only used to calculate that store.
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
+ &DeadStackObjects);
+ ++NumFastStores;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ // Remove any dead non-memory-mutating instructions.
+ if (isInstructionTriviallyDead(&*BBI, TLI)) {
+ LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
+ << *&*BBI << '\n');
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
+ &DeadStackObjects);
+ ++NumFastOther;
+ MadeChange = true;
+ continue;
+ }
+
+ if (isa<AllocaInst>(BBI)) {
+ // Remove allocas from the list of dead stack objects; there can't be
+ // any references before the definition.
+ DeadStackObjects.remove(&*BBI);
+ continue;
+ }
+
+ if (auto *Call = dyn_cast<CallBase>(&*BBI)) {
+ // Remove allocation function calls from the list of dead stack objects;
+ // there can't be any references before the definition.
+ if (isAllocLikeFn(&*BBI, TLI))
+ DeadStackObjects.remove(&*BBI);
+
+ // If this call does not access memory, it can't be loading any of our
+ // pointers.
+ if (AA->doesNotAccessMemory(Call))
+ continue;
+
+ // If the call might load from any of our allocas, then any store above
+ // the call is live.
+ DeadStackObjects.remove_if([&](const Value *I) {
+ // See if the call site touches the value.
+ return isRefSet(AA->getModRefInfo(
+ Call, I, getPointerSize(I, DL, *TLI, BB.getParent())));
+ });
+
+ // If all of the allocas were clobbered by the call then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+
+ continue;
+ }
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+    // (release/acquire/seq_cst). Fences only constrain the ordering of
+    // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(*BBI))
+ continue;
+
+ MemoryLocation LoadedLoc;
+
+ // If we encounter a use of the pointer, it is no longer considered dead
+ if (LoadInst *L = dyn_cast<LoadInst>(BBI)) {
+ if (!L->isUnordered()) // Be conservative with atomic/volatile load
+ break;
+ LoadedLoc = MemoryLocation::get(L);
+ } else if (VAArgInst *V = dyn_cast<VAArgInst>(BBI)) {
+ LoadedLoc = MemoryLocation::get(V);
+ } else if (!BBI->mayReadFromMemory()) {
+ // Instruction doesn't read memory. Note that stores that weren't removed
+ // above will hit this case.
+ continue;
+ } else {
+ // Unknown inst; assume it clobbers everything.
+ break;
+ }
+
+ // Remove any allocas from the DeadPointer set that are loaded, as this
+ // makes any stores above the access live.
+ removeAccessedObjects(LoadedLoc, DeadStackObjects, DL, AA, TLI, BB.getParent());
+
+ // If all of the allocas were clobbered by the access then we're not going
+ // to find anything else to process.
+ if (DeadStackObjects.empty())
+ break;
+ }
+
+ return MadeChange;
+}
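
A rough, self-contained illustration of the backward scan above (toy types and a single block; exact locations only, no calls, fences, or aliasing, all of which the real code must handle): a store into a still-dead stack object is dropped, and a read of the object makes earlier stores to it live again.

#include <cstdio>
#include <set>
#include <vector>

struct Access { bool IsStore; int Object; bool Dead = false; };

// DeadObjects is taken by value: the set starts as "dead at end of function"
// and shrinks as reads are encountered while walking backwards.
static void endBlockCleanup(std::vector<Access> &Block,
                            std::set<int> DeadObjects) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (It->IsStore && DeadObjects.count(It->Object))
      It->Dead = true;                // nothing after this point reads it
    else if (!It->IsStore)
      DeadObjects.erase(It->Object);  // a read keeps earlier stores alive
  }
}

int main() {
  // store %A; load %A; store %A; ret   -> only the final store to %A is dead.
  std::vector<Access> B{{true, 0}, {false, 0}, {true, 0}};
  endBlockCleanup(B, {0});
  std::printf("%d %d %d\n", B[0].Dead, B[1].Dead, B[2].Dead); // prints: 0 0 1
  return 0;
}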
+
+static bool tryToShorten(Instruction *EarlierWrite, int64_t &EarlierOffset,
uint64_t &EarlierSize, int64_t LaterOffset,
uint64_t LaterSize, bool IsOverwriteEnd) {
- // TODO: base this on the target vector size so that if the earlier
- // store was too small to get vector writes anyway then its likely
- // a good idea to shorten it
- // Power of 2 vector writes are probably always a bad idea to optimize
- // as any store/memset/memcpy is likely using vector instructions so
- // shortening it to not vector size is likely to be slower
- auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
- unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
- if (!IsOverwriteEnd)
- LaterOffset = int64_t(LaterOffset + LaterSize);
-
- if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
- !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
- return false;
-
- int64_t NewLength = IsOverwriteEnd
- ? LaterOffset - EarlierOffset
- : EarlierSize - (LaterOffset - EarlierOffset);
-
- if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
- // When shortening an atomic memory intrinsic, the newly shortened
- // length must remain an integer multiple of the element size.
- const uint32_t ElementSize = AMI->getElementSizeInBytes();
- if (0 != NewLength % ElementSize)
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
- << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
- << *EarlierWrite << "\n KILLER (offset " << LaterOffset
- << ", " << EarlierSize << ")\n");
-
- Value *EarlierWriteLength = EarlierIntrinsic->getLength();
- Value *TrimmedLength =
- ConstantInt::get(EarlierWriteLength->getType(), NewLength);
- EarlierIntrinsic->setLength(TrimmedLength);
-
- EarlierSize = NewLength;
- if (!IsOverwriteEnd) {
- int64_t OffsetMoved = (LaterOffset - EarlierOffset);
- Value *Indices[1] = {
- ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
- GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
- EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
- EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
- NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
- EarlierIntrinsic->setDest(NewDestGEP);
- EarlierOffset = EarlierOffset + OffsetMoved;
- }
- return true;
-}
-
-static bool tryToShortenEnd(Instruction *EarlierWrite,
- OverlapIntervalsTy &IntervalMap,
+  // TODO: base this on the target vector size so that if the earlier
+  // store was too small to get vector writes anyway then it's likely
+  // a good idea to shorten it.
+  // Power-of-2 vector writes are probably always a bad idea to optimize,
+  // as any store/memset/memcpy is likely using vector instructions, so
+  // shortening it to a non-vector size is likely to be slower.
+ auto *EarlierIntrinsic = cast<AnyMemIntrinsic>(EarlierWrite);
+ unsigned EarlierWriteAlign = EarlierIntrinsic->getDestAlignment();
+ if (!IsOverwriteEnd)
+ LaterOffset = int64_t(LaterOffset + LaterSize);
+
+ if (!(isPowerOf2_64(LaterOffset) && EarlierWriteAlign <= LaterOffset) &&
+ !((EarlierWriteAlign != 0) && LaterOffset % EarlierWriteAlign == 0))
+ return false;
+
+ int64_t NewLength = IsOverwriteEnd
+ ? LaterOffset - EarlierOffset
+ : EarlierSize - (LaterOffset - EarlierOffset);
+
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(EarlierWrite)) {
+ // When shortening an atomic memory intrinsic, the newly shortened
+ // length must remain an integer multiple of the element size.
+ const uint32_t ElementSize = AMI->getElementSizeInBytes();
+ if (0 != NewLength % ElementSize)
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n OW "
+ << (IsOverwriteEnd ? "END" : "BEGIN") << ": "
+ << *EarlierWrite << "\n KILLER (offset " << LaterOffset
+ << ", " << EarlierSize << ")\n");
+
+ Value *EarlierWriteLength = EarlierIntrinsic->getLength();
+ Value *TrimmedLength =
+ ConstantInt::get(EarlierWriteLength->getType(), NewLength);
+ EarlierIntrinsic->setLength(TrimmedLength);
+
+ EarlierSize = NewLength;
+ if (!IsOverwriteEnd) {
+ int64_t OffsetMoved = (LaterOffset - EarlierOffset);
+ Value *Indices[1] = {
+ ConstantInt::get(EarlierWriteLength->getType(), OffsetMoved)};
+ GetElementPtrInst *NewDestGEP = GetElementPtrInst::CreateInBounds(
+ EarlierIntrinsic->getRawDest()->getType()->getPointerElementType(),
+ EarlierIntrinsic->getRawDest(), Indices, "", EarlierWrite);
+ NewDestGEP->setDebugLoc(EarlierIntrinsic->getDebugLoc());
+ EarlierIntrinsic->setDest(NewDestGEP);
+ EarlierOffset = EarlierOffset + OffsetMoved;
+ }
+ return true;
+}
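
A worked example of the trimming arithmetic above, with made-up offsets and sizes: an earlier 32-byte write whose last 16 bytes are overwritten keeps only its 16-byte prefix (the OW_End case), while an 8-byte overwrite of its start would instead move the start forward and leave 24 bytes (the OW_Begin case).

#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  const int64_t EarlierOff = 0;
  const uint64_t EarlierSize = 32;              // earlier write covers [0, 32)

  // OW_End: later write covers [16, 32); keep the prefix [0, 16).
  const int64_t LaterOffEnd = 16;
  const int64_t NewLengthEnd = LaterOffEnd - EarlierOff;                   // 16

  // OW_Begin: later write covers [0, 8); drop the prefix, so the earlier
  // write now starts at offset 8 and is 24 bytes long.
  const int64_t LaterEndBegin = 0 + 8;          // LaterOff + LaterSize
  const int64_t NewLengthBegin =
      (int64_t)EarlierSize - (LaterEndBegin - EarlierOff);                 // 24
  const int64_t NewEarlierOff = EarlierOff + (LaterEndBegin - EarlierOff); // 8

  assert(NewLengthEnd == 16 && NewLengthBegin == 24 && NewEarlierOff == 8);
  std::printf("%lld %lld %lld\n", (long long)NewLengthEnd,
              (long long)NewLengthBegin, (long long)NewEarlierOff);
  return 0;
}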
+
+static bool tryToShortenEnd(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
- return false;
-
- OverlapIntervalsTy::iterator OII = --IntervalMap.end();
- int64_t LaterStart = OII->second;
+ if (IntervalMap.empty() || !isShortenableAtTheEnd(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = --IntervalMap.end();
+ int64_t LaterStart = OII->second;
uint64_t LaterSize = OII->first - LaterStart;
-
+
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
if (LaterStart > EarlierStart &&
@@ -1142,25 +1142,25 @@ static bool tryToShortenEnd(Instruction *EarlierWrite,
// Note: "EarlierSize - (uint64_t)(LaterStart - EarlierStart)" is known to
// be non negative due to preceding checks.
LaterSize >= EarlierSize - (uint64_t)(LaterStart - EarlierStart)) {
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, true)) {
- IntervalMap.erase(OII);
- return true;
- }
- }
- return false;
-}
-
-static bool tryToShortenBegin(Instruction *EarlierWrite,
- OverlapIntervalsTy &IntervalMap,
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, true)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool tryToShortenBegin(Instruction *EarlierWrite,
+ OverlapIntervalsTy &IntervalMap,
int64_t &EarlierStart, uint64_t &EarlierSize) {
- if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
- return false;
-
- OverlapIntervalsTy::iterator OII = IntervalMap.begin();
- int64_t LaterStart = OII->second;
+ if (IntervalMap.empty() || !isShortenableAtTheBeginning(EarlierWrite))
+ return false;
+
+ OverlapIntervalsTy::iterator OII = IntervalMap.begin();
+ int64_t LaterStart = OII->second;
uint64_t LaterSize = OII->first - LaterStart;
-
+
assert(OII->first - LaterStart >= 0 && "Size expected to be positive");
if (LaterStart <= EarlierStart &&
@@ -1170,433 +1170,433 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
// Note: "LaterSize - (uint64_t)(EarlierStart - LaterStart)" is known to be
// positive due to preceding checks.
assert(LaterSize - (uint64_t)(EarlierStart - LaterStart) < EarlierSize &&
- "Should have been handled as OW_Complete");
- if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
- LaterSize, false)) {
- IntervalMap.erase(OII);
- return true;
- }
- }
- return false;
-}
-
+ "Should have been handled as OW_Complete");
+ if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
+ LaterSize, false)) {
+ IntervalMap.erase(OII);
+ return true;
+ }
+ }
+ return false;
+}
+
static bool removePartiallyOverlappedStores(const DataLayout &DL,
InstOverlapIntervalsTy &IOL,
const TargetLibraryInfo &TLI) {
- bool Changed = false;
- for (auto OI : IOL) {
- Instruction *EarlierWrite = OI.first;
+ bool Changed = false;
+ for (auto OI : IOL) {
+ Instruction *EarlierWrite = OI.first;
MemoryLocation Loc = getLocForWrite(EarlierWrite, TLI);
- assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
-
- const Value *Ptr = Loc.Ptr->stripPointerCasts();
- int64_t EarlierStart = 0;
+ assert(isRemovable(EarlierWrite) && "Expect only removable instruction");
+
+ const Value *Ptr = Loc.Ptr->stripPointerCasts();
+ int64_t EarlierStart = 0;
uint64_t EarlierSize = Loc.Size.getValue();
- GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
- OverlapIntervalsTy &IntervalMap = OI.second;
- Changed |=
- tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- if (IntervalMap.empty())
- continue;
- Changed |=
- tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
- }
- return Changed;
-}
-
-static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
- AliasAnalysis *AA, MemoryDependenceResults *MD,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL,
- MapVector<Instruction *, bool> &ThrowableInst,
- DominatorTree *DT) {
- // Must be a store instruction.
- StoreInst *SI = dyn_cast<StoreInst>(Inst);
- if (!SI)
- return false;
-
- // If we're storing the same value back to a pointer that we just loaded from,
- // then the store can be removed.
- if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
- if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- isRemovable(SI) &&
+ GetPointerBaseWithConstantOffset(Ptr, EarlierStart, DL);
+ OverlapIntervalsTy &IntervalMap = OI.second;
+ Changed |=
+ tryToShortenEnd(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ if (IntervalMap.empty())
+ continue;
+ Changed |=
+ tryToShortenBegin(EarlierWrite, IntervalMap, EarlierStart, EarlierSize);
+ }
+ return Changed;
+}
+
+static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
+ AliasAnalysis *AA, MemoryDependenceResults *MD,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstOverlapIntervalsTy &IOL,
+ MapVector<Instruction *, bool> &ThrowableInst,
+ DominatorTree *DT) {
+ // Must be a store instruction.
+ StoreInst *SI = dyn_cast<StoreInst>(Inst);
+ if (!SI)
+ return false;
+
+ // If we're storing the same value back to a pointer that we just loaded from,
+ // then the store can be removed.
+ if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
+ if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
+ isRemovable(SI) &&
memoryIsNotModifiedBetween(DepLoad, SI, *AA, DL, DT)) {
-
- LLVM_DEBUG(
- dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
- << *DepLoad << "\n STORE: " << *SI << '\n');
-
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
- ++NumRedundantStores;
- return true;
- }
- }
-
- // Remove null stores into the calloc'ed objects
- Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
- if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
- Instruction *UnderlyingPointer =
+
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
+ << *DepLoad << "\n STORE: " << *SI << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+
+ // Remove null stores into the calloc'ed objects
+ Constant *StoredConstant = dyn_cast<Constant>(SI->getValueOperand());
+ if (StoredConstant && StoredConstant->isNullValue() && isRemovable(SI)) {
+ Instruction *UnderlyingPointer =
dyn_cast<Instruction>(getUnderlyingObject(SI->getPointerOperand()));
-
- if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
+
+ if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
memoryIsNotModifiedBetween(UnderlyingPointer, SI, *AA, DL, DT)) {
- LLVM_DEBUG(
- dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
- << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
-
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
- ++NumRedundantStores;
- return true;
- }
- }
- return false;
-}
-
+ LLVM_DEBUG(
+ dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
+ << *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
+
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
+ ++NumRedundantStores;
+ return true;
+ }
+ }
+ return false;
+}
+
template <typename AATy>
static Constant *tryToMergePartialOverlappingStores(
StoreInst *Earlier, StoreInst *Later, int64_t InstWriteOffset,
int64_t DepWriteOffset, const DataLayout &DL, AATy &AA, DominatorTree *DT) {
-
- if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
- Later && isa<ConstantInt>(Later->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
- memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
- // If the store we find is:
- // a) partially overwritten by the store to 'Loc'
- // b) the later store is fully contained in the earlier one and
- // c) they both have a constant value
- // d) none of the two stores need padding
- // Merge the two stores, replacing the earlier store's value with a
- // merge of both values.
- // TODO: Deal with other constant types (vectors, etc), and probably
- // some mem intrinsics (if needed)
-
- APInt EarlierValue =
- cast<ConstantInt>(Earlier->getValueOperand())->getValue();
- APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
- unsigned LaterBits = LaterValue.getBitWidth();
- assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
- LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
-
- // Offset of the smaller store inside the larger store
- unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
- unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
- BitOffsetDiff - LaterBits
- : BitOffsetDiff;
- APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
- LShiftAmount + LaterBits);
- // Clear the bits we'll be replacing, then OR with the smaller
- // store, shifted appropriately.
- APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
- << "\n Later: " << *Later
- << "\n Merged Value: " << Merged << '\n');
- return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
- }
- return nullptr;
-}
-
-static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- const DataLayout &DL = BB.getModule()->getDataLayout();
- bool MadeChange = false;
-
- MapVector<Instruction *, bool> ThrowableInst;
-
- // A map of interval maps representing partially-overwritten value parts.
- InstOverlapIntervalsTy IOL;
-
- // Do a top-down walk on the BB.
- for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
- // Handle 'free' calls specially.
- if (CallInst *F = isFreeCall(&*BBI, TLI)) {
- MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
- // Increment BBI after handleFree has potentially deleted instructions.
- // This ensures we maintain a valid iterator.
- ++BBI;
- continue;
- }
-
- Instruction *Inst = &*BBI++;
-
- if (Inst->mayThrow()) {
- ThrowableInst[Inst] = true;
- continue;
- }
-
- // Check to see if Inst writes to memory. If not, continue.
- if (!hasAnalyzableMemoryWrite(Inst, *TLI))
- continue;
-
- // eliminateNoopStore will update in iterator, if necessary.
- if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
- ThrowableInst, DT)) {
- MadeChange = true;
- continue;
- }
-
- // If we find something that writes memory, get its memory dependence.
- MemDepResult InstDep = MD->getDependency(Inst);
-
- // Ignore any store where we can't find a local dependence.
- // FIXME: cross-block DSE would be fun. :)
- if (!InstDep.isDef() && !InstDep.isClobber())
- continue;
-
- // Figure out what location is being stored to.
+
+ if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
+ memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
+ // If the store we find is:
+ // a) partially overwritten by the store to 'Loc'
+ // b) the later store is fully contained in the earlier one and
+ // c) they both have a constant value
+    //   d) neither of the two stores needs padding
+ // Merge the two stores, replacing the earlier store's value with a
+ // merge of both values.
+ // TODO: Deal with other constant types (vectors, etc), and probably
+ // some mem intrinsics (if needed)
+
+ APInt EarlierValue =
+ cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+ APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
+ unsigned LaterBits = LaterValue.getBitWidth();
+ assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+ LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+ // Offset of the smaller store inside the larger store
+ unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+ unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
+ BitOffsetDiff - LaterBits
+ : BitOffsetDiff;
+ APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + LaterBits);
+ // Clear the bits we'll be replacing, then OR with the smaller
+ // store, shifted appropriately.
+ APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
+ << "\n Later: " << *Later
+ << "\n Merged Value: " << Merged << '\n');
+ return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
+ }
+ return nullptr;
+}
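
The masking arithmetic used above can be checked in isolation with plain integers (little-endian layout and invented values assumed; the real code uses APInt and also handles the big-endian shift): a one-byte later store at byte offset 1 is folded into an earlier four-byte constant store.

#include <cstdint>
#include <cstdio>

int main() {
  const uint32_t EarlierValue = 0xAABBCCDD; // earlier 4-byte store at offset 0
  const uint8_t LaterValue = 0x11;          // later 1-byte store at offset 1
  const unsigned LaterBits = 8;
  const unsigned BitOffsetDiff = (1 - 0) * 8;  // byte offset difference * 8
  const unsigned LShiftAmount = BitOffsetDiff; // little-endian case

  // Clear the bits being replaced, then OR in the smaller store, shifted.
  const uint32_t Mask = ((1u << LaterBits) - 1) << LShiftAmount;
  const uint32_t Merged =
      (EarlierValue & ~Mask) | ((uint32_t)LaterValue << LShiftAmount);
  std::printf("%#x\n", Merged);                // prints: 0xaabb11dd
  return 0;
}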
+
+static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ const DataLayout &DL = BB.getModule()->getDataLayout();
+ bool MadeChange = false;
+
+ MapVector<Instruction *, bool> ThrowableInst;
+
+ // A map of interval maps representing partially-overwritten value parts.
+ InstOverlapIntervalsTy IOL;
+
+ // Do a top-down walk on the BB.
+ for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
+ // Handle 'free' calls specially.
+ if (CallInst *F = isFreeCall(&*BBI, TLI)) {
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
+ // Increment BBI after handleFree has potentially deleted instructions.
+ // This ensures we maintain a valid iterator.
+ ++BBI;
+ continue;
+ }
+
+ Instruction *Inst = &*BBI++;
+
+ if (Inst->mayThrow()) {
+ ThrowableInst[Inst] = true;
+ continue;
+ }
+
+ // Check to see if Inst writes to memory. If not, continue.
+ if (!hasAnalyzableMemoryWrite(Inst, *TLI))
+ continue;
+
+    // eliminateNoopStore will update the iterator, if necessary.
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
+ ThrowableInst, DT)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // If we find something that writes memory, get its memory dependence.
+ MemDepResult InstDep = MD->getDependency(Inst);
+
+ // Ignore any store where we can't find a local dependence.
+ // FIXME: cross-block DSE would be fun. :)
+ if (!InstDep.isDef() && !InstDep.isClobber())
+ continue;
+
+ // Figure out what location is being stored to.
MemoryLocation Loc = getLocForWrite(Inst, *TLI);
-
- // If we didn't get a useful location, fail.
- if (!Loc.Ptr)
- continue;
-
- // Loop until we find a store we can eliminate or a load that
- // invalidates the analysis. Without an upper bound on the number of
- // instructions examined, this analysis can become very time-consuming.
- // However, the potential gain diminishes as we process more instructions
- // without eliminating any of them. Therefore, we limit the number of
- // instructions we look at.
- auto Limit = MD->getDefaultBlockScanLimit();
- while (InstDep.isDef() || InstDep.isClobber()) {
- // Get the memory clobbered by the instruction we depend on. MemDep will
- // skip any instructions that 'Loc' clearly doesn't interact with. If we
- // end up depending on a may- or must-aliased load, then we can't optimize
- // away the store and we bail out. However, if we depend on something
- // that overwrites the memory location we *can* potentially optimize it.
- //
- // Find out what memory location the dependent instruction stores.
- Instruction *DepWrite = InstDep.getInst();
- if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
- break;
+
+ // If we didn't get a useful location, fail.
+ if (!Loc.Ptr)
+ continue;
+
+ // Loop until we find a store we can eliminate or a load that
+ // invalidates the analysis. Without an upper bound on the number of
+ // instructions examined, this analysis can become very time-consuming.
+ // However, the potential gain diminishes as we process more instructions
+ // without eliminating any of them. Therefore, we limit the number of
+ // instructions we look at.
+ auto Limit = MD->getDefaultBlockScanLimit();
+ while (InstDep.isDef() || InstDep.isClobber()) {
+ // Get the memory clobbered by the instruction we depend on. MemDep will
+ // skip any instructions that 'Loc' clearly doesn't interact with. If we
+ // end up depending on a may- or must-aliased load, then we can't optimize
+ // away the store and we bail out. However, if we depend on something
+ // that overwrites the memory location we *can* potentially optimize it.
+ //
+ // Find out what memory location the dependent instruction stores.
+ Instruction *DepWrite = InstDep.getInst();
+ if (!hasAnalyzableMemoryWrite(DepWrite, *TLI))
+ break;
MemoryLocation DepLoc = getLocForWrite(DepWrite, *TLI);
- // If we didn't get a useful location, or if it isn't a size, bail out.
- if (!DepLoc.Ptr)
- break;
-
- // Find the last throwable instruction not removed by call to
- // deleteDeadInstruction.
- Instruction *LastThrowing = nullptr;
- if (!ThrowableInst.empty())
- LastThrowing = ThrowableInst.back().first;
-
- // Make sure we don't look past a call which might throw. This is an
- // issue because MemoryDependenceAnalysis works in the wrong direction:
- // it finds instructions which dominate the current instruction, rather than
- // instructions which are post-dominated by the current instruction.
- //
- // If the underlying object is a non-escaping memory allocation, any store
- // to it is dead along the unwind edge. Otherwise, we need to preserve
- // the store.
- if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
+ // If we didn't get a useful location, or if it isn't a size, bail out.
+ if (!DepLoc.Ptr)
+ break;
+
+ // Find the last throwable instruction not removed by call to
+ // deleteDeadInstruction.
+ Instruction *LastThrowing = nullptr;
+ if (!ThrowableInst.empty())
+ LastThrowing = ThrowableInst.back().first;
+
+ // Make sure we don't look past a call which might throw. This is an
+ // issue because MemoryDependenceAnalysis works in the wrong direction:
+ // it finds instructions which dominate the current instruction, rather than
+ // instructions which are post-dominated by the current instruction.
+ //
+ // If the underlying object is a non-escaping memory allocation, any store
+ // to it is dead along the unwind edge. Otherwise, we need to preserve
+ // the store.
+ if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
const Value *Underlying = getUnderlyingObject(DepLoc.Ptr);
- bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
- if (!IsStoreDeadOnUnwind) {
- // We're looking for a call to an allocation function
- // where the allocation doesn't escape before the last
- // throwing instruction; PointerMayBeCaptured
- // reasonably fast approximation.
- IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
- !PointerMayBeCaptured(Underlying, false, true);
- }
- if (!IsStoreDeadOnUnwind)
- break;
- }
-
- // If we find a write that is a) removable (i.e., non-volatile), b) is
- // completely obliterated by the store to 'Loc', and c) which we know that
- // 'Inst' doesn't load from, then we can remove it.
- // Also try to merge two stores if a later one only touches memory written
- // to by the earlier one.
- if (isRemovable(DepWrite) &&
- !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
- int64_t InstWriteOffset, DepWriteOffset;
+ bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
+ if (!IsStoreDeadOnUnwind) {
+ // We're looking for a call to an allocation function
+ // where the allocation doesn't escape before the last
+          // throwing instruction; PointerMayBeCaptured is a
+          // reasonably fast approximation.
+ IsStoreDeadOnUnwind = isAllocLikeFn(Underlying, TLI) &&
+ !PointerMayBeCaptured(Underlying, false, true);
+ }
+ if (!IsStoreDeadOnUnwind)
+ break;
+ }
+
+ // If we find a write that is a) removable (i.e., non-volatile), b) is
+ // completely obliterated by the store to 'Loc', and c) which we know that
+ // 'Inst' doesn't load from, then we can remove it.
+ // Also try to merge two stores if a later one only touches memory written
+ // to by the earlier one.
+ if (isRemovable(DepWrite) &&
+ !isPossibleSelfRead(Inst, Loc, DepWrite, *TLI, *AA)) {
+ int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR = isOverwrite(Inst, DepWrite, Loc, DepLoc, DL, *TLI,
DepWriteOffset, InstWriteOffset, *AA,
- BB.getParent());
+ BB.getParent());
if (OR == OW_MaybePartial)
OR = isPartialOverwrite(Loc, DepLoc, DepWriteOffset, InstWriteOffset,
DepWrite, IOL);
- if (OR == OW_Complete) {
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
- << "\n KILLER: " << *Inst << '\n');
-
- // Delete the store and now-dead instructions that feed it.
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- ++NumFastStores;
- MadeChange = true;
-
- // We erased DepWrite; start over.
- InstDep = MD->getDependency(Inst);
- continue;
- } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
- ((OR == OW_Begin &&
- isShortenableAtTheBeginning(DepWrite)))) {
- assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
- "when partial-overwrite "
- "tracking is enabled");
- // The overwrite result is known, so these must be known, too.
+ if (OR == OW_Complete) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *DepWrite
+ << "\n KILLER: " << *Inst << '\n');
+
+ // Delete the store and now-dead instructions that feed it.
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ ++NumFastStores;
+ MadeChange = true;
+
+ // We erased DepWrite; start over.
+ InstDep = MD->getDependency(Inst);
+ continue;
+ } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+ ((OR == OW_Begin &&
+ isShortenableAtTheBeginning(DepWrite)))) {
+ assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
+ "when partial-overwrite "
+ "tracking is enabled");
+ // The overwrite result is known, so these must be known, too.
uint64_t EarlierSize = DepLoc.Size.getValue();
uint64_t LaterSize = Loc.Size.getValue();
- bool IsOverwriteEnd = (OR == OW_End);
- MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
- InstWriteOffset, LaterSize, IsOverwriteEnd);
- } else if (EnablePartialStoreMerging &&
- OR == OW_PartialEarlierWithFullLater) {
- auto *Earlier = dyn_cast<StoreInst>(DepWrite);
- auto *Later = dyn_cast<StoreInst>(Inst);
- if (Constant *C = tryToMergePartialOverlappingStores(
+ bool IsOverwriteEnd = (OR == OW_End);
+ MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
+ InstWriteOffset, LaterSize, IsOverwriteEnd);
+ } else if (EnablePartialStoreMerging &&
+ OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(DepWrite);
+ auto *Later = dyn_cast<StoreInst>(Inst);
+ if (Constant *C = tryToMergePartialOverlappingStores(
Earlier, Later, InstWriteOffset, DepWriteOffset, DL, *AA,
- DT)) {
- auto *SI = new StoreInst(
- C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
- Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
-
- unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_nontemporal};
- SI->copyMetadata(*DepWrite, MDToKeep);
- ++NumModifiedStores;
-
- // Delete the old stores and now-dead instructions that feed them.
- deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
- ThrowableInst);
- MadeChange = true;
-
- // We erased DepWrite and Inst (Loc); start over.
- break;
- }
- }
- }
-
- // If this is a may-aliased store that is clobbering the store value, we
- // can keep searching past it for another must-aliased pointer that stores
- // to the same location. For example, in:
- // store -> P
- // store -> Q
- // store -> P
- // we can remove the first store to P even though we don't know if P and Q
- // alias.
- if (DepWrite == &BB.front()) break;
-
- // Can't look past this instruction if it might read 'Loc'.
- if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
- break;
-
- InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
- DepWrite->getIterator(), &BB,
- /*QueryInst=*/ nullptr, &Limit);
- }
- }
-
- if (EnablePartialOverwriteTracking)
+ DT)) {
+ auto *SI = new StoreInst(
+ C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
+ Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
+
+ unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_nontemporal};
+ SI->copyMetadata(*DepWrite, MDToKeep);
+ ++NumModifiedStores;
+
+ // Delete the old stores and now-dead instructions that feed them.
+ deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
+ ThrowableInst);
+ MadeChange = true;
+
+ // We erased DepWrite and Inst (Loc); start over.
+ break;
+ }
+ }
+ }
+
+ // If this is a may-aliased store that is clobbering the store value, we
+ // can keep searching past it for another must-aliased pointer that stores
+ // to the same location. For example, in:
+ // store -> P
+ // store -> Q
+ // store -> P
+ // we can remove the first store to P even though we don't know if P and Q
+ // alias.
+ if (DepWrite == &BB.front()) break;
+
+ // Can't look past this instruction if it might read 'Loc'.
+ if (isRefSet(AA->getModRefInfo(DepWrite, Loc)))
+ break;
+
+ InstDep = MD->getPointerDependencyFrom(Loc, /*isLoad=*/ false,
+ DepWrite->getIterator(), &BB,
+ /*QueryInst=*/ nullptr, &Limit);
+ }
+ }
+
+ if (EnablePartialOverwriteTracking)
MadeChange |= removePartiallyOverlappedStores(DL, IOL, *TLI);
-
- // If this block ends in a return, unwind, or unreachable, all allocas are
- // dead at its end, which means stores to them are also dead.
- if (BB.getTerminator()->getNumSuccessors() == 0)
- MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
-
- return MadeChange;
-}
-
-static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
- MemoryDependenceResults *MD, DominatorTree *DT,
- const TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- for (BasicBlock &BB : F)
- // Only check non-dead blocks. Dead blocks may have strange pointer
- // cycles that will confuse alias analysis.
- if (DT->isReachableFromEntry(&BB))
- MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
-
- return MadeChange;
-}
-
-namespace {
-//=============================================================================
-// MemorySSA backed dead store elimination.
-//
-// The code below implements dead store elimination using MemorySSA. It uses
-// the following general approach: given a MemoryDef, walk upwards to find
-// clobbering MemoryDefs that may be killed by the starting def. Then check
-// that there are no uses that may read the location of the original MemoryDef
-// in between both MemoryDefs. A bit more concretely:
-//
-// For all MemoryDefs StartDef:
+
+ // If this block ends in a return, unwind, or unreachable, all allocas are
+ // dead at its end, which means stores to them are also dead.
+ if (BB.getTerminator()->getNumSuccessors() == 0)
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
+
+ return MadeChange;
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, DominatorTree *DT,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F)
+ // Only check non-dead blocks. Dead blocks may have strange pointer
+ // cycles that will confuse alias analysis.
+ if (DT->isReachableFromEntry(&BB))
+ MadeChange |= eliminateDeadStores(BB, AA, MD, DT, TLI);
+
+ return MadeChange;
+}
+
+namespace {
+//=============================================================================
+// MemorySSA backed dead store elimination.
+//
+// The code below implements dead store elimination using MemorySSA. It uses
+// the following general approach: given a MemoryDef, walk upwards to find
+// clobbering MemoryDefs that may be killed by the starting def. Then check
+// that there are no uses that may read the location of the original MemoryDef
+// in between both MemoryDefs. A bit more concretely:
+//
+// For all MemoryDefs StartDef:
// 1. Get the next dominating clobbering MemoryDef (EarlierAccess) by walking
-// upwards.
+// upwards.
// 2. Check that there are no reads between EarlierAccess and the StartDef by
// checking all uses starting at EarlierAccess and walking until we see
// StartDef.
// 3. For each found CurrentDef, check that:
// 1. There are no barrier instructions between CurrentDef and StartDef (like
-// throws or stores with ordering constraints).
+// throws or stores with ordering constraints).
// 2. StartDef is executed whenever CurrentDef is executed.
// 3. StartDef completely overwrites CurrentDef.
// 4. Erase CurrentDef from the function and MemorySSA.
-
+
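// A minimal, hypothetical source-level sketch of the walk described above:
// the second store is the starting MemoryDef, the first store is a clobbering
// MemoryDef reached while walking upwards, and since nothing reads *p in
// between and the second store completely overwrites the first, the first
// store is erased.
//
//   void sketch(int *p) {
//     *p = 1;   // dead: completely overwritten below, no intervening read
//     *p = 2;   // killing store
//   }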
// Returns true if \p I is an intrinsic that does not read or write memory.
bool isNoopIntrinsic(Instruction *I) {
if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- case Intrinsic::invariant_end:
- case Intrinsic::launder_invariant_group:
- case Intrinsic::assume:
- return true;
- case Intrinsic::dbg_addr:
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_label:
- case Intrinsic::dbg_value:
- llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
- default:
- return false;
- }
- }
- return false;
-}
-
-// Check if we can ignore \p D for DSE.
-bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
- Instruction *DI = D->getMemoryInst();
- // Calls that only access inaccessible memory cannot read or write any memory
- // locations we consider for elimination.
- if (auto *CB = dyn_cast<CallBase>(DI))
- if (CB->onlyAccessesInaccessibleMemory())
- return true;
-
- // We can eliminate stores to locations not visible to the caller across
- // throwing instructions.
- if (DI->mayThrow() && !DefVisibleToCaller)
- return true;
-
- // We can remove the dead stores, irrespective of the fence and its ordering
-  // (release/acquire/seq_cst). Fences only constrain the ordering of
-  // already visible stores; they do not make a store visible to other
- // threads. So, skipping over a fence does not change a store from being
- // dead.
- if (isa<FenceInst>(DI))
- return true;
-
- // Skip intrinsics that do not really read or modify memory.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_end:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::assume:
+ return true;
+ case Intrinsic::dbg_addr:
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_label:
+ case Intrinsic::dbg_value:
+ llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+// Check if we can ignore \p D for DSE.
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+ Instruction *DI = D->getMemoryInst();
+ // Calls that only access inaccessible memory cannot read or write any memory
+ // locations we consider for elimination.
+ if (auto *CB = dyn_cast<CallBase>(DI))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return true;
+
+ // We can eliminate stores to locations not visible to the caller across
+ // throwing instructions.
+ if (DI->mayThrow() && !DefVisibleToCaller)
+ return true;
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+  // (release/acquire/seq_cst). Fences only constrain the ordering of
+  // already visible stores; they do not make a store visible to other
+ // threads. So, skipping over a fence does not change a store from being
+ // dead.
+ if (isa<FenceInst>(DI))
+ return true;
+
+ // Skip intrinsics that do not really read or modify memory.
if (isNoopIntrinsic(D->getMemoryInst()))
- return true;
-
- return false;
-}
-
-struct DSEState {
- Function &F;
- AliasAnalysis &AA;
+ return true;
+
+ return false;
+}
+
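// Hypothetical sketch of the fence case above: the fence only orders stores
// that are already visible, it does not publish x, so the first store stays
// dead (in practice such a local is often promoted to a register even
// earlier).
//
//   #include <atomic>
//   int fence_sketch() {
//     int x = 1;                                            // dead store
//     std::atomic_thread_fence(std::memory_order_seq_cst);  // safe to skip
//     x = 2;
//     return x;
//   }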
+struct DSEState {
+ Function &F;
+ AliasAnalysis &AA;
/// The single BatchAA instance that is used to cache AA queries. It will
/// not be invalidated over the whole run. This is safe, because:
@@ -1607,72 +1607,72 @@ struct DSEState {
/// value pointer.
BatchAAResults BatchAA;
- MemorySSA &MSSA;
- DominatorTree &DT;
- PostDominatorTree &PDT;
- const TargetLibraryInfo &TLI;
+ MemorySSA &MSSA;
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ const TargetLibraryInfo &TLI;
const DataLayout &DL;
-
- // All MemoryDefs that potentially could kill other MemDefs.
- SmallVector<MemoryDef *, 64> MemDefs;
- // Any that should be skipped as they are already deleted
- SmallPtrSet<MemoryAccess *, 4> SkipStores;
- // Keep track of all of the objects that are invisible to the caller before
- // the function returns.
+
+ // All MemoryDefs that potentially could kill other MemDefs.
+ SmallVector<MemoryDef *, 64> MemDefs;
+ // Any that should be skipped as they are already deleted
+ SmallPtrSet<MemoryAccess *, 4> SkipStores;
+ // Keep track of all of the objects that are invisible to the caller before
+ // the function returns.
// SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
DenseMap<const Value *, bool> InvisibleToCallerBeforeRet;
- // Keep track of all of the objects that are invisible to the caller after
- // the function returns.
+ // Keep track of all of the objects that are invisible to the caller after
+ // the function returns.
DenseMap<const Value *, bool> InvisibleToCallerAfterRet;
- // Keep track of blocks with throwing instructions not modeled in MemorySSA.
- SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
- // Post-order numbers for each basic block. Used to figure out if memory
- // accesses are executed before another access.
- DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
-
- /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
- /// basic block.
- DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
-
- DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
- PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+ // Keep track of blocks with throwing instructions not modeled in MemorySSA.
+ SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+ // Post-order numbers for each basic block. Used to figure out if memory
+ // accesses are executed before another access.
+ DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
+
+ /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
+ /// basic block.
+ DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+
+ DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
: F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
DL(F.getParent()->getDataLayout()) {}
-
- static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
- DominatorTree &DT, PostDominatorTree &PDT,
- const TargetLibraryInfo &TLI) {
- DSEState State(F, AA, MSSA, DT, PDT, TLI);
- // Collect blocks with throwing instructions not modeled in MemorySSA and
- // alloc-like objects.
- unsigned PO = 0;
- for (BasicBlock *BB : post_order(&F)) {
- State.PostOrderNumbers[BB] = PO++;
- for (Instruction &I : *BB) {
- MemoryAccess *MA = MSSA.getMemoryAccess(&I);
- if (I.mayThrow() && !MA)
- State.ThrowingBlocks.insert(I.getParent());
-
- auto *MD = dyn_cast_or_null<MemoryDef>(MA);
- if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
- (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
- State.MemDefs.push_back(MD);
- }
- }
-
-    // Treat byval or inalloca arguments the same as Allocas: stores to them are
- // dead at the end of the function.
- for (Argument &AI : F.args())
+
+ static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+ DominatorTree &DT, PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ DSEState State(F, AA, MSSA, DT, PDT, TLI);
+ // Collect blocks with throwing instructions not modeled in MemorySSA and
+ // alloc-like objects.
+ unsigned PO = 0;
+ for (BasicBlock *BB : post_order(&F)) {
+ State.PostOrderNumbers[BB] = PO++;
+ for (Instruction &I : *BB) {
+ MemoryAccess *MA = MSSA.getMemoryAccess(&I);
+ if (I.mayThrow() && !MA)
+ State.ThrowingBlocks.insert(I.getParent());
+
+ auto *MD = dyn_cast_or_null<MemoryDef>(MA);
+ if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
+ (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
+ State.MemDefs.push_back(MD);
+ }
+ }
+
+    // Treat byval or inalloca arguments the same as Allocas: stores to them are
+ // dead at the end of the function.
+ for (Argument &AI : F.args())
if (AI.hasPassPointeeByValueCopyAttr()) {
- // For byval, the caller doesn't know the address of the allocation.
- if (AI.hasByValAttr())
+ // For byval, the caller doesn't know the address of the allocation.
+ if (AI.hasByValAttr())
State.InvisibleToCallerBeforeRet.insert({&AI, true});
State.InvisibleToCallerAfterRet.insert({&AI, true});
- }
-
- return State;
- }
-
+ }
+
+ return State;
+ }
+
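// Hypothetical sketch of the byval case above: a by-value aggregate parameter
// is the callee's private copy, so a write to it that is never read again
// before returning is dead, just like a write to a local alloca.
//
//   struct Big { int a[8]; };
//   int byval_sketch(Big b) {   // 'b' may be passed byval
//     b.a[0] = 42;              // dead: the caller cannot observe this write
//     return 0;
//   }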
bool isInvisibleToCallerAfterRet(const Value *V) {
if (isa<AllocaInst>(V))
return true;
@@ -1705,31 +1705,31 @@ struct DSEState {
return I.first->second;
}
- Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
- if (!I->mayWriteToMemory())
- return None;
-
- if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
- return {MemoryLocation::getForDest(MTI)};
-
- if (auto *CB = dyn_cast<CallBase>(I)) {
+ Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
+ if (!I->mayWriteToMemory())
+ return None;
+
+ if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
+ return {MemoryLocation::getForDest(MTI)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
// If the functions may write to memory we do not know about, bail out.
if (!CB->onlyAccessesArgMemory() &&
!CB->onlyAccessesInaccessibleMemOrArgMem())
return None;
- LibFunc LF;
- if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
return {MemoryLocation::getAfter(CB->getArgOperand(0))};
- default:
- break;
- }
- }
+ default:
+ break;
+ }
+ }
switch (CB->getIntrinsicID()) {
case Intrinsic::init_trampoline:
return {MemoryLocation::getAfter(CB->getArgOperand(0))};
@@ -1738,138 +1738,138 @@ struct DSEState {
default:
break;
}
- return None;
- }
-
- return MemoryLocation::getOrNone(I);
- }
-
+ return None;
+ }
+
+ return MemoryLocation::getOrNone(I);
+ }
+
/// Returns true if \p UseInst completely overwrites \p DefLoc
/// (stored by \p DefInst).
bool isCompleteOverwrite(const MemoryLocation &DefLoc, Instruction *DefInst,
Instruction *UseInst) {
- // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
- // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
- // MemoryDef.
- if (!UseInst->mayWriteToMemory())
- return false;
-
- if (auto *CB = dyn_cast<CallBase>(UseInst))
- if (CB->onlyAccessesInaccessibleMemory())
- return false;
-
- int64_t InstWriteOffset, DepWriteOffset;
+ // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
+ // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
+ // MemoryDef.
+ if (!UseInst->mayWriteToMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
+ int64_t InstWriteOffset, DepWriteOffset;
if (auto CC = getLocForWriteEx(UseInst))
return isOverwrite(UseInst, DefInst, *CC, DefLoc, DL, TLI, DepWriteOffset,
InstWriteOffset, BatchAA, &F) == OW_Complete;
return false;
- }
-
- /// Returns true if \p Def is not read before returning from the function.
- bool isWriteAtEndOfFunction(MemoryDef *Def) {
- LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
- << *Def->getMemoryInst()
-                      << ") is at the end of the function \n");
-
- auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
- if (!MaybeLoc) {
- LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
- return false;
- }
-
- SmallVector<MemoryAccess *, 4> WorkList;
- SmallPtrSet<MemoryAccess *, 8> Visited;
- auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
- if (!Visited.insert(Acc).second)
- return;
- for (Use &U : Acc->uses())
- WorkList.push_back(cast<MemoryAccess>(U.getUser()));
- };
- PushMemUses(Def);
- for (unsigned I = 0; I < WorkList.size(); I++) {
- if (WorkList.size() >= MemorySSAScanLimit) {
- LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
- return false;
- }
-
- MemoryAccess *UseAccess = WorkList[I];
+ }
+
+ /// Returns true if \p Def is not read before returning from the function.
+ bool isWriteAtEndOfFunction(MemoryDef *Def) {
+ LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
+ << *Def->getMemoryInst()
+                      << ") is at the end of the function \n");
+
+ auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
+ if (!MaybeLoc) {
+ LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
+ return false;
+ }
+
+ SmallVector<MemoryAccess *, 4> WorkList;
+ SmallPtrSet<MemoryAccess *, 8> Visited;
+ auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
+ if (!Visited.insert(Acc).second)
+ return;
+ for (Use &U : Acc->uses())
+ WorkList.push_back(cast<MemoryAccess>(U.getUser()));
+ };
+ PushMemUses(Def);
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ if (WorkList.size() >= MemorySSAScanLimit) {
+ LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
+ return false;
+ }
+
+ MemoryAccess *UseAccess = WorkList[I];
// Simply adding the users of MemoryPhi to the worklist is not enough,
// because we might miss read clobbers in different iterations of a loop,
// for example.
// TODO: Add support for phi translation to handle the loop case.
if (isa<MemoryPhi>(UseAccess))
return false;
-
- // TODO: Checking for aliasing is expensive. Consider reducing the amount
- // of times this is called and/or caching it.
- Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
- if (isReadClobber(*MaybeLoc, UseInst)) {
- LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
- return false;
- }
-
- if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
- PushMemUses(UseDef);
- }
- return true;
- }
-
- /// If \p I is a memory terminator like llvm.lifetime.end or free, return a
- /// pair with the MemoryLocation terminated by \p I and a boolean flag
- /// indicating whether \p I is a free-like call.
- Optional<std::pair<MemoryLocation, bool>>
- getLocForTerminator(Instruction *I) const {
- uint64_t Len;
- Value *Ptr;
- if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
- m_Value(Ptr))))
- return {std::make_pair(MemoryLocation(Ptr, Len), false)};
-
- if (auto *CB = dyn_cast<CallBase>(I)) {
- if (isFreeCall(I, &TLI))
+
+ // TODO: Checking for aliasing is expensive. Consider reducing the amount
+ // of times this is called and/or caching it.
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ if (isReadClobber(*MaybeLoc, UseInst)) {
+ LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
+ return false;
+ }
+
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
+ PushMemUses(UseDef);
+ }
+ return true;
+ }
+
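// Hypothetical sketch for the check above: the store into the local buffer is
// never read before the function returns, so it is a write at the end of the
// function and can be removed.
//
//   int end_of_function_sketch() {
//     int buf[4];
//     buf[0] = 7;   // dead: buf is local and never read again
//     return 0;
//   }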
+ /// If \p I is a memory terminator like llvm.lifetime.end or free, return a
+ /// pair with the MemoryLocation terminated by \p I and a boolean flag
+ /// indicating whether \p I is a free-like call.
+ Optional<std::pair<MemoryLocation, bool>>
+ getLocForTerminator(Instruction *I) const {
+ uint64_t Len;
+ Value *Ptr;
+ if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
+ m_Value(Ptr))))
+ return {std::make_pair(MemoryLocation(Ptr, Len), false)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (isFreeCall(I, &TLI))
return {std::make_pair(MemoryLocation::getAfter(CB->getArgOperand(0)),
true)};
- }
-
- return None;
- }
-
- /// Returns true if \p I is a memory terminator instruction like
- /// llvm.lifetime.end or free.
- bool isMemTerminatorInst(Instruction *I) const {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
- return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
- isFreeCall(I, &TLI);
- }
-
+ }
+
+ return None;
+ }
+
+ /// Returns true if \p I is a memory terminator instruction like
+ /// llvm.lifetime.end or free.
+ bool isMemTerminatorInst(Instruction *I) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
+ isFreeCall(I, &TLI);
+ }
+
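// Hypothetical sketch of a memory terminator: the call to free ends the
// lifetime of *p, so the preceding store can never be observed and is dead.
//
//   #include <cstdlib>
//   void terminator_sketch(int *p) {
//     *p = 5;      // dead: the object is freed immediately afterwards
//     std::free(p);
//   }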
/// Returns true if \p MaybeTerm is a memory terminator for \p Loc from
/// instruction \p AccessI.
bool isMemTerminator(const MemoryLocation &Loc, Instruction *AccessI,
Instruction *MaybeTerm) {
- Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
- getLocForTerminator(MaybeTerm);
-
- if (!MaybeTermLoc)
- return false;
-
- // If the terminator is a free-like call, all accesses to the underlying
- // object can be considered terminated.
+ Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
+ getLocForTerminator(MaybeTerm);
+
+ if (!MaybeTermLoc)
+ return false;
+
+ // If the terminator is a free-like call, all accesses to the underlying
+ // object can be considered terminated.
if (getUnderlyingObject(Loc.Ptr) !=
getUnderlyingObject(MaybeTermLoc->first.Ptr))
return false;
auto TermLoc = MaybeTermLoc->first;
- if (MaybeTermLoc->second) {
+ if (MaybeTermLoc->second) {
const Value *LocUO = getUnderlyingObject(Loc.Ptr);
return BatchAA.isMustAlias(TermLoc.Ptr, LocUO);
- }
+ }
int64_t InstWriteOffset, DepWriteOffset;
return isOverwrite(MaybeTerm, AccessI, TermLoc, Loc, DL, TLI,
DepWriteOffset, InstWriteOffset, BatchAA,
&F) == OW_Complete;
- }
-
- // Returns true if \p Use may read from \p DefLoc.
+ }
+
+ // Returns true if \p Use may read from \p DefLoc.
bool isReadClobber(const MemoryLocation &DefLoc, Instruction *UseInst) {
if (isNoopIntrinsic(UseInst))
return false;
@@ -1879,20 +1879,20 @@ struct DSEState {
if (auto SI = dyn_cast<StoreInst>(UseInst))
return isStrongerThan(SI->getOrdering(), AtomicOrdering::Monotonic);
- if (!UseInst->mayReadFromMemory())
- return false;
-
- if (auto *CB = dyn_cast<CallBase>(UseInst))
- if (CB->onlyAccessesInaccessibleMemory())
- return false;
-
+ if (!UseInst->mayReadFromMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
// NOTE: For calls, the number of stores removed could be slightly improved
// by using AA.callCapturesBefore(UseInst, DefLoc, &DT), but that showed to
// be expensive compared to the benefits in practice. For now, avoid more
// expensive analysis to limit compile-time.
return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
- }
-
+ }
+
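// Hypothetical sketch of a read clobber: the load in the middle may observe
// the first store, so that store has to be kept even though it is overwritten
// later.
//
//   int read_clobber_sketch(int *p) {
//     *p = 1;        // kept: the load below reads it
//     int v = *p;
//     *p = v + 1;
//     return v;
//   }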
/// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
/// loop. In particular, this guarantees that it only references a single
/// MemoryLocation during execution of the containing function.
@@ -1925,7 +1925,7 @@ struct DSEState {
// such MemoryDef, return None. The returned value may not (completely)
// overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
// MemoryUse (read).
- Optional<MemoryAccess *>
+ Optional<MemoryAccess *>
getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
const MemoryLocation &DefLoc, const Value *DefUO,
unsigned &ScanLimit, unsigned &WalkerStepLimit,
@@ -1937,13 +1937,13 @@ struct DSEState {
MemoryAccess *Current = StartAccess;
Instruction *KillingI = KillingDef->getMemoryInst();
- bool StepAgain;
+ bool StepAgain;
LLVM_DEBUG(dbgs() << " trying to get dominating access\n");
// Find the next clobbering Mod access for DefLoc, starting at StartAccess.
Optional<MemoryLocation> CurrentLoc;
- do {
- StepAgain = false;
+ do {
+ StepAgain = false;
LLVM_DEBUG({
dbgs() << " visiting " << *Current;
if (!MSSA.isLiveOnEntryDef(Current) && isa<MemoryUseOrDef>(Current))
@@ -1952,12 +1952,12 @@ struct DSEState {
dbgs() << "\n";
});
- // Reached TOP.
+ // Reached TOP.
if (MSSA.isLiveOnEntryDef(Current)) {
LLVM_DEBUG(dbgs() << " ... found LiveOnEntryDef\n");
- return None;
+ return None;
}
-
+
// Cost of a step. Accesses in the same block are more likely to be valid
// candidates for elimination, hence consider them cheaper.
unsigned StepCost = KillingDef->getBlock() == Current->getBlock()
@@ -1971,10 +1971,10 @@ struct DSEState {
// Return for MemoryPhis. They cannot be eliminated directly and the
// caller is responsible for traversing them.
- if (isa<MemoryPhi>(Current)) {
+ if (isa<MemoryPhi>(Current)) {
LLVM_DEBUG(dbgs() << " ... found MemoryPhi\n");
return Current;
- }
+ }
// Below, check if CurrentDef is a valid candidate to be eliminated by
// KillingDef. If it is not, check the next candidate.
@@ -1991,16 +1991,16 @@ struct DSEState {
// instructions that block us from DSEing
if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
- return None;
+ return None;
}
-
+
// Check for anything that looks like it will be a barrier to further
// removal
if (isDSEBarrier(DefUO, CurrentI)) {
LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
return None;
}
-
+
// If Current is known to be on path that reads DefLoc or is a read
// clobber, bail out, as the path is not profitable. We skip this check
// for intrinsic calls, because the code knows how to handle memcpy
@@ -2022,11 +2022,11 @@ struct DSEState {
// If Current cannot be analyzed or is not removable, check the next
// candidate.
if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
- StepAgain = true;
+ StepAgain = true;
Current = CurrentDef->getDefiningAccess();
continue;
- }
-
+ }
+
// If Current does not have an analyzable write location, skip it
CurrentLoc = getLocForWriteEx(CurrentI);
if (!CurrentLoc) {
@@ -2078,11 +2078,11 @@ struct DSEState {
PartialLimit -= 1;
}
}
- } while (StepAgain);
-
- // Accesses to objects accessible after the function returns can only be
- // eliminated if the access is killed along all paths to the exit. Collect
- // the blocks with killing (=completely overwriting MemoryDefs) and check if
+ } while (StepAgain);
+
+ // Accesses to objects accessible after the function returns can only be
+ // eliminated if the access is killed along all paths to the exit. Collect
+ // the blocks with killing (=completely overwriting MemoryDefs) and check if
// they cover all paths from EarlierAccess to any function exit.
SmallPtrSet<Instruction *, 16> KillingDefs;
KillingDefs.insert(KillingDef->getMemoryInst());
@@ -2091,34 +2091,34 @@ struct DSEState {
cast<MemoryDef>(EarlierAccess)->getMemoryInst();
LLVM_DEBUG(dbgs() << " Checking for reads of " << *EarlierAccess << " ("
<< *EarlierMemInst << ")\n");
-
- SmallSetVector<MemoryAccess *, 32> WorkList;
- auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
- for (Use &U : Acc->uses())
- WorkList.insert(cast<MemoryAccess>(U.getUser()));
- };
+
+ SmallSetVector<MemoryAccess *, 32> WorkList;
+ auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
+ for (Use &U : Acc->uses())
+ WorkList.insert(cast<MemoryAccess>(U.getUser()));
+ };
PushMemUses(EarlierAccess);
-
+
// Optimistically collect all accesses for reads. If we do not find any
// read clobbers, add them to the cache.
SmallPtrSet<MemoryAccess *, 16> KnownNoReads;
if (!EarlierMemInst->mayReadFromMemory())
KnownNoReads.insert(EarlierAccess);
// Check if EarlierDef may be read.
- for (unsigned I = 0; I < WorkList.size(); I++) {
- MemoryAccess *UseAccess = WorkList[I];
-
- LLVM_DEBUG(dbgs() << " " << *UseAccess);
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ MemoryAccess *UseAccess = WorkList[I];
+
+ LLVM_DEBUG(dbgs() << " " << *UseAccess);
// Bail out if the number of accesses to check exceeds the scan limit.
if (ScanLimit < (WorkList.size() - I)) {
- LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
- return None;
- }
+ LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
+ return None;
+ }
--ScanLimit;
NumDomMemDefChecks++;
KnownNoReads.insert(UseAccess);
-
- if (isa<MemoryPhi>(UseAccess)) {
+
+ if (isa<MemoryPhi>(UseAccess)) {
if (any_of(KillingDefs, [this, UseAccess](Instruction *KI) {
return DT.properlyDominates(KI->getParent(),
UseAccess->getBlock());
@@ -2126,30 +2126,30 @@ struct DSEState {
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing block\n");
continue;
}
- LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
- PushMemUses(UseAccess);
- continue;
- }
-
- Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
- LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
-
+ LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
+
if (any_of(KillingDefs, [this, UseInst](Instruction *KI) {
return DT.dominates(KI, UseInst);
})) {
LLVM_DEBUG(dbgs() << " ... skipping, dominated by killing def\n");
- continue;
- }
-
-      // A memory terminator kills all preceding MemoryDefs and all succeeding
-      // MemoryAccesses. We do not have to check its users.
+ continue;
+ }
+
+      // A memory terminator kills all preceding MemoryDefs and all succeeding
+      // MemoryAccesses. We do not have to check its users.
if (isMemTerminator(*CurrentLoc, EarlierMemInst, UseInst)) {
LLVM_DEBUG(
dbgs()
<< " ... skipping, memterminator invalidates following accesses\n");
- continue;
+ continue;
}
-
+
if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess)->getMemoryInst())) {
LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
PushMemUses(UseAccess);
@@ -2161,218 +2161,218 @@ struct DSEState {
return None;
}
- // Uses which may read the original MemoryDef mean we cannot eliminate the
- // original MD. Stop walk.
+ // Uses which may read the original MemoryDef mean we cannot eliminate the
+ // original MD. Stop walk.
if (isReadClobber(*CurrentLoc, UseInst)) {
- LLVM_DEBUG(dbgs() << " ... found read clobber\n");
- return None;
- }
-
+ LLVM_DEBUG(dbgs() << " ... found read clobber\n");
+ return None;
+ }
+
// For the KillingDef and EarlierAccess we only have to check if it reads
// the memory location.
- // TODO: It would probably be better to check for self-reads before
- // calling the function.
+ // TODO: It would probably be better to check for self-reads before
+ // calling the function.
if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
- LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
- continue;
- }
-
- // Check all uses for MemoryDefs, except for defs completely overwriting
- // the original location. Otherwise we have to check uses of *all*
- // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
- // miss cases like the following
+ LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
+ continue;
+ }
+
+ // Check all uses for MemoryDefs, except for defs completely overwriting
+ // the original location. Otherwise we have to check uses of *all*
+ // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
+ // miss cases like the following
// 1 = Def(LoE) ; <----- EarlierDef stores [0,1]
- // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
- // Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
- // (The Use points to the *first* Def it may alias)
- // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
- // stores [0,1]
- if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
+ // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
+ // Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
+ // (The Use points to the *first* Def it may alias)
+ // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
+ // stores [0,1]
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
if (!isInvisibleToCallerAfterRet(DefUO) &&
UseAccess != EarlierAccess) {
- BasicBlock *MaybeKillingBlock = UseInst->getParent();
- if (PostOrderNumbers.find(MaybeKillingBlock)->second <
+ BasicBlock *MaybeKillingBlock = UseInst->getParent();
+ if (PostOrderNumbers.find(MaybeKillingBlock)->second <
PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
-
+
LLVM_DEBUG(dbgs()
<< " ... found killing def " << *UseInst << "\n");
KillingDefs.insert(UseInst);
- }
- }
- } else
- PushMemUses(UseDef);
- }
- }
-
- // For accesses to locations visible after the function returns, make sure
+ }
+ }
+ } else
+ PushMemUses(UseDef);
+ }
+ }
+
+ // For accesses to locations visible after the function returns, make sure
// that the location is killed (=overwritten) along all paths from
// EarlierAccess to the exit.
if (!isInvisibleToCallerAfterRet(DefUO)) {
SmallPtrSet<BasicBlock *, 16> KillingBlocks;
for (Instruction *KD : KillingDefs)
KillingBlocks.insert(KD->getParent());
- assert(!KillingBlocks.empty() &&
- "Expected at least a single killing block");
-
- // Find the common post-dominator of all killing blocks.
- BasicBlock *CommonPred = *KillingBlocks.begin();
- for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
- I != E; I++) {
- if (!CommonPred)
- break;
- CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
- }
-
- // If CommonPred is in the set of killing blocks, just check if it
+ assert(!KillingBlocks.empty() &&
+ "Expected at least a single killing block");
+
+ // Find the common post-dominator of all killing blocks.
+ BasicBlock *CommonPred = *KillingBlocks.begin();
+ for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
+ I != E; I++) {
+ if (!CommonPred)
+ break;
+ CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
+ }
+
+ // If CommonPred is in the set of killing blocks, just check if it
// post-dominates EarlierAccess.
- if (KillingBlocks.count(CommonPred)) {
+ if (KillingBlocks.count(CommonPred)) {
if (PDT.dominates(CommonPred, EarlierAccess->getBlock()))
return {EarlierAccess};
- return None;
- }
-
+ return None;
+ }
+
// If the common post-dominator does not post-dominate EarlierAccess,
// there is a path from EarlierAccess to an exit not going through a
// killing block.
if (PDT.dominates(CommonPred, EarlierAccess->getBlock())) {
- SetVector<BasicBlock *> WorkList;
-
- // If CommonPred is null, there are multiple exits from the function.
- // They all have to be added to the worklist.
- if (CommonPred)
- WorkList.insert(CommonPred);
- else
- for (BasicBlock *R : PDT.roots())
- WorkList.insert(R);
-
- NumCFGTries++;
- // Check if all paths starting from an exit node go through one of the
+ SetVector<BasicBlock *> WorkList;
+
+ // If CommonPred is null, there are multiple exits from the function.
+ // They all have to be added to the worklist.
+ if (CommonPred)
+ WorkList.insert(CommonPred);
+ else
+ for (BasicBlock *R : PDT.roots())
+ WorkList.insert(R);
+
+ NumCFGTries++;
+ // Check if all paths starting from an exit node go through one of the
// killing blocks before reaching EarlierAccess.
- for (unsigned I = 0; I < WorkList.size(); I++) {
- NumCFGChecks++;
- BasicBlock *Current = WorkList[I];
- if (KillingBlocks.count(Current))
- continue;
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ NumCFGChecks++;
+ BasicBlock *Current = WorkList[I];
+ if (KillingBlocks.count(Current))
+ continue;
if (Current == EarlierAccess->getBlock())
- return None;
-
+ return None;
+
// EarlierAccess is reachable from the entry, so we don't have to
// explore unreachable blocks further.
- if (!DT.isReachableFromEntry(Current))
- continue;
-
- for (BasicBlock *Pred : predecessors(Current))
- WorkList.insert(Pred);
-
- if (WorkList.size() >= MemorySSAPathCheckLimit)
- return None;
- }
- NumCFGSuccess++;
+ if (!DT.isReachableFromEntry(Current))
+ continue;
+
+ for (BasicBlock *Pred : predecessors(Current))
+ WorkList.insert(Pred);
+
+ if (WorkList.size() >= MemorySSAPathCheckLimit)
+ return None;
+ }
+ NumCFGSuccess++;
return {EarlierAccess};
- }
- return None;
- }
-
+ }
+ return None;
+ }
+
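// Hypothetical sketch of the path check above: the global stays visible after
// the function returns and is overwritten on only one branch, so the first
// store is not killed along every path to the exit and must be kept.
//
//   int g;
//   void path_sketch(bool c) {
//     g = 1;      // kept: not overwritten on the 'else' path
//     if (c)
//       g = 2;
//   }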
// No aliasing MemoryUses of EarlierAccess found, EarlierAccess is
// potentially dead.
return {EarlierAccess};
- }
-
- // Delete dead memory defs
- void deleteDeadInstruction(Instruction *SI) {
- MemorySSAUpdater Updater(&MSSA);
- SmallVector<Instruction *, 32> NowDeadInsts;
- NowDeadInsts.push_back(SI);
- --NumFastOther;
-
- while (!NowDeadInsts.empty()) {
- Instruction *DeadInst = NowDeadInsts.pop_back_val();
- ++NumFastOther;
-
- // Try to preserve debug information attached to the dead instruction.
- salvageDebugInfo(*DeadInst);
- salvageKnowledge(DeadInst);
-
- // Remove the Instruction from MSSA.
- if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
- if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
- SkipStores.insert(MD);
- }
- Updater.removeMemoryAccess(MA);
- }
-
- auto I = IOLs.find(DeadInst->getParent());
- if (I != IOLs.end())
- I->second.erase(DeadInst);
- // Remove its operands
- for (Use &O : DeadInst->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(O)) {
- O = nullptr;
- if (isInstructionTriviallyDead(OpI, &TLI))
- NowDeadInsts.push_back(OpI);
- }
-
- DeadInst->eraseFromParent();
- }
- }
-
- // Check for any extra throws between SI and NI that block DSE. This only
-  // checks extra maythrows (those that aren't MemoryDefs). MemoryDefs that may
- // throw are handled during the walk from one def to the next.
- bool mayThrowBetween(Instruction *SI, Instruction *NI,
+ }
+
+ // Delete dead memory defs
+ void deleteDeadInstruction(Instruction *SI) {
+ MemorySSAUpdater Updater(&MSSA);
+ SmallVector<Instruction *, 32> NowDeadInsts;
+ NowDeadInsts.push_back(SI);
+ --NumFastOther;
+
+ while (!NowDeadInsts.empty()) {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ ++NumFastOther;
+
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
+
+ // Remove the Instruction from MSSA.
+ if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
+ if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+ SkipStores.insert(MD);
+ }
+ Updater.removeMemoryAccess(MA);
+ }
+
+ auto I = IOLs.find(DeadInst->getParent());
+ if (I != IOLs.end())
+ I->second.erase(DeadInst);
+ // Remove its operands
+ for (Use &O : DeadInst->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(O)) {
+ O = nullptr;
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ DeadInst->eraseFromParent();
+ }
+ }
+
+ // Check for any extra throws between SI and NI that block DSE. This only
+  // checks extra maythrows (those that aren't MemoryDefs). MemoryDefs that may
+ // throw are handled during the walk from one def to the next.
+ bool mayThrowBetween(Instruction *SI, Instruction *NI,
const Value *SILocUnd) {
- // First see if we can ignore it by using the fact that SI is an
- // alloca/alloca like object that is not visible to the caller during
- // execution of the function.
+ // First see if we can ignore it by using the fact that SI is an
+ // alloca/alloca like object that is not visible to the caller during
+ // execution of the function.
if (SILocUnd && isInvisibleToCallerBeforeRet(SILocUnd))
- return false;
-
- if (SI->getParent() == NI->getParent())
- return ThrowingBlocks.count(SI->getParent());
- return !ThrowingBlocks.empty();
- }
-
- // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
- // act as barriers:
- // * A memory instruction that may throw and \p SI accesses a non-stack
- // object.
-  // * Atomic stores stronger than monotonic.
+ return false;
+
+ if (SI->getParent() == NI->getParent())
+ return ThrowingBlocks.count(SI->getParent());
+ return !ThrowingBlocks.empty();
+ }
+
+ // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
+ // act as barriers:
+ // * A memory instruction that may throw and \p SI accesses a non-stack
+ // object.
+  // * Atomic stores stronger than monotonic.
bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) {
-    // If NI may throw it acts as a barrier, unless we are storing to an
-    // alloca/alloca-like object that does not escape.
+    // If NI may throw it acts as a barrier, unless we are storing to an
+    // alloca/alloca-like object that does not escape.
if (NI->mayThrow() && !isInvisibleToCallerBeforeRet(SILocUnd))
- return true;
-
- // If NI is an atomic load/store stronger than monotonic, do not try to
- // eliminate/reorder it.
- if (NI->isAtomic()) {
- if (auto *LI = dyn_cast<LoadInst>(NI))
- return isStrongerThanMonotonic(LI->getOrdering());
- if (auto *SI = dyn_cast<StoreInst>(NI))
- return isStrongerThanMonotonic(SI->getOrdering());
+ return true;
+
+ // If NI is an atomic load/store stronger than monotonic, do not try to
+ // eliminate/reorder it.
+ if (NI->isAtomic()) {
+ if (auto *LI = dyn_cast<LoadInst>(NI))
+ return isStrongerThanMonotonic(LI->getOrdering());
+ if (auto *SI = dyn_cast<StoreInst>(NI))
+ return isStrongerThanMonotonic(SI->getOrdering());
if (auto *ARMW = dyn_cast<AtomicRMWInst>(NI))
return isStrongerThanMonotonic(ARMW->getOrdering());
if (auto *CmpXchg = dyn_cast<AtomicCmpXchgInst>(NI))
return isStrongerThanMonotonic(CmpXchg->getSuccessOrdering()) ||
isStrongerThanMonotonic(CmpXchg->getFailureOrdering());
- llvm_unreachable("other instructions should be skipped in MemorySSA");
- }
- return false;
- }
-
- /// Eliminate writes to objects that are not visible in the caller and are not
- /// accessed before returning from the function.
- bool eliminateDeadWritesAtEndOfFunction() {
- bool MadeChange = false;
- LLVM_DEBUG(
- dbgs()
- << "Trying to eliminate MemoryDefs at the end of the function\n");
- for (int I = MemDefs.size() - 1; I >= 0; I--) {
- MemoryDef *Def = MemDefs[I];
+ llvm_unreachable("other instructions should be skipped in MemorySSA");
+ }
+ return false;
+ }
+
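// Hypothetical sketch of a barrier: the release store is stronger than
// monotonic, so the walk stops there and the earlier store through the
// escaped pointer is conservatively kept.
//
//   #include <atomic>
//   void barrier_sketch(int *p, std::atomic<int> &flag) {
//     *p = 1;                                    // kept: barrier below
//     flag.store(1, std::memory_order_release);
//     *p = 2;
//   }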
+ /// Eliminate writes to objects that are not visible in the caller and are not
+ /// accessed before returning from the function.
+ bool eliminateDeadWritesAtEndOfFunction() {
+ bool MadeChange = false;
+ LLVM_DEBUG(
+ dbgs()
+ << "Trying to eliminate MemoryDefs at the end of the function\n");
+ for (int I = MemDefs.size() - 1; I >= 0; I--) {
+ MemoryDef *Def = MemDefs[I];
if (SkipStores.contains(Def) || !isRemovable(Def->getMemoryInst()))
- continue;
-
+ continue;
+
Instruction *DefI = Def->getMemoryInst();
SmallVector<const Value *, 4> Pointers;
auto DefLoc = getLocForWriteEx(DefI);
@@ -2388,30 +2388,30 @@ struct DSEState {
if (!UO || !isInvisibleToCallerAfterRet(UO))
continue;
- if (isWriteAtEndOfFunction(Def)) {
- // See through pointer-to-pointer bitcasts
- LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
- "of the function\n");
+ if (isWriteAtEndOfFunction(Def)) {
+ // See through pointer-to-pointer bitcasts
+ LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
+ "of the function\n");
deleteDeadInstruction(DefI);
++NumFastStores;
MadeChange = true;
- }
- }
- return MadeChange;
- }
-
- /// \returns true if \p Def is a no-op store, either because it
- /// directly stores back a loaded value or stores zero to a calloced object.
+ }
+ }
+ return MadeChange;
+ }
+
+ /// \returns true if \p Def is a no-op store, either because it
+ /// directly stores back a loaded value or stores zero to a calloced object.
bool storeIsNoop(MemoryDef *Def, const MemoryLocation &DefLoc,
const Value *DefUO) {
- StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
- if (!Store)
- return false;
-
- if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
- if (LoadI->getPointerOperand() == Store->getOperand(1)) {
+ StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
+ if (!Store)
+ return false;
+
+ if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
+ if (LoadI->getPointerOperand() == Store->getOperand(1)) {
// Get the defining access for the load.
- auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
+ auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
// Fast path: the defining accesses are the same.
if (LoadAccess == Def->getDefiningAccess())
return true;
@@ -2449,126 +2449,126 @@ struct DSEState {
return false;
}
return true;
- }
- }
-
- Constant *StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
- if (StoredConstant && StoredConstant->isNullValue()) {
- auto *DefUOInst = dyn_cast<Instruction>(DefUO);
- if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
- auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
- // If UnderlyingDef is the clobbering access of Def, no instructions
- // between them can modify the memory location.
- auto *ClobberDef =
- MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
- return UnderlyingDef == ClobberDef;
- }
- }
- return false;
- }
-};
-
-bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
- MemorySSA &MSSA, DominatorTree &DT,
- PostDominatorTree &PDT,
- const TargetLibraryInfo &TLI) {
- bool MadeChange = false;
-
- DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
- // For each store:
- for (unsigned I = 0; I < State.MemDefs.size(); I++) {
- MemoryDef *KillingDef = State.MemDefs[I];
- if (State.SkipStores.count(KillingDef))
- continue;
- Instruction *SI = KillingDef->getMemoryInst();
-
+ }
+ }
+
+ Constant *StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
+ if (StoredConstant && StoredConstant->isNullValue()) {
+ auto *DefUOInst = dyn_cast<Instruction>(DefUO);
+ if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
+ auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
+ // If UnderlyingDef is the clobbering access of Def, no instructions
+ // between them can modify the memory location.
+ auto *ClobberDef =
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
+ return UnderlyingDef == ClobberDef;
+ }
+ }
+ return false;
+ }
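// Hypothetical sketches of the two no-op cases handled above:
//
//   void noop_store_back(int *p) {
//     int v = *p;
//     *p = v;       // no-op: stores back the value that was just loaded
//   }
//
//   #include <cstdlib>
//   int *noop_zero_calloc() {
//     int *q = (int *)std::calloc(4, sizeof(int));
//     if (q)
//       q[0] = 0;   // no-op: calloc'ed memory is already zero
//     return q;
//   }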
+};
+
+bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
+ MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ bool MadeChange = false;
+
+ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+ // For each store:
+ for (unsigned I = 0; I < State.MemDefs.size(); I++) {
+ MemoryDef *KillingDef = State.MemDefs[I];
+ if (State.SkipStores.count(KillingDef))
+ continue;
+ Instruction *SI = KillingDef->getMemoryInst();
+
Optional<MemoryLocation> MaybeSILoc;
- if (State.isMemTerminatorInst(SI))
- MaybeSILoc = State.getLocForTerminator(SI).map(
- [](const std::pair<MemoryLocation, bool> &P) { return P.first; });
- else
- MaybeSILoc = State.getLocForWriteEx(SI);
-
- if (!MaybeSILoc) {
- LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
- << *SI << "\n");
- continue;
- }
- MemoryLocation SILoc = *MaybeSILoc;
- assert(SILoc.Ptr && "SILoc should not be null");
+ if (State.isMemTerminatorInst(SI))
+ MaybeSILoc = State.getLocForTerminator(SI).map(
+ [](const std::pair<MemoryLocation, bool> &P) { return P.first; });
+ else
+ MaybeSILoc = State.getLocForWriteEx(SI);
+
+ if (!MaybeSILoc) {
+ LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
+ << *SI << "\n");
+ continue;
+ }
+ MemoryLocation SILoc = *MaybeSILoc;
+ assert(SILoc.Ptr && "SILoc should not be null");
const Value *SILocUnd = getUnderlyingObject(SILoc.Ptr);
-
- MemoryAccess *Current = KillingDef;
- LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
- << *KillingDef << " (" << *SI << ")\n");
-
+
+ MemoryAccess *Current = KillingDef;
+ LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
+ << *KillingDef << " (" << *SI << ")\n");
+
unsigned ScanLimit = MemorySSAScanLimit;
unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
unsigned PartialLimit = MemorySSAPartialStoreLimit;
- // Worklist of MemoryAccesses that may be killed by KillingDef.
- SetVector<MemoryAccess *> ToCheck;
-
+ // Worklist of MemoryAccesses that may be killed by KillingDef.
+ SetVector<MemoryAccess *> ToCheck;
+
if (SILocUnd)
ToCheck.insert(KillingDef->getDefiningAccess());
bool Shortend = false;
bool IsMemTerm = State.isMemTerminatorInst(SI);
- // Check if MemoryAccesses in the worklist are killed by KillingDef.
- for (unsigned I = 0; I < ToCheck.size(); I++) {
- Current = ToCheck[I];
- if (State.SkipStores.count(Current))
- continue;
-
- Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+ // Check if MemoryAccesses in the worklist are killed by KillingDef.
+ for (unsigned I = 0; I < ToCheck.size(); I++) {
+ Current = ToCheck[I];
+ if (State.SkipStores.count(Current))
+ continue;
+
+ Optional<MemoryAccess *> Next = State.getDomMemoryDef(
KillingDef, Current, SILoc, SILocUnd, ScanLimit, WalkerStepLimit,
IsMemTerm, PartialLimit);
-
- if (!Next) {
- LLVM_DEBUG(dbgs() << " finished walk\n");
- continue;
- }
-
+
+ if (!Next) {
+ LLVM_DEBUG(dbgs() << " finished walk\n");
+ continue;
+ }
+
MemoryAccess *EarlierAccess = *Next;
LLVM_DEBUG(dbgs() << " Checking if we can kill " << *EarlierAccess);
if (isa<MemoryPhi>(EarlierAccess)) {
- LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
+ LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
for (Value *V : cast<MemoryPhi>(EarlierAccess)->incoming_values()) {
- MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
- BasicBlock *IncomingBlock = IncomingAccess->getBlock();
+ MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
+ BasicBlock *IncomingBlock = IncomingAccess->getBlock();
BasicBlock *PhiBlock = EarlierAccess->getBlock();
-
- // We only consider incoming MemoryAccesses that come before the
- // MemoryPhi. Otherwise we could discover candidates that do not
- // strictly dominate our starting def.
- if (State.PostOrderNumbers[IncomingBlock] >
- State.PostOrderNumbers[PhiBlock])
- ToCheck.insert(IncomingAccess);
- }
- continue;
- }
+
+ // We only consider incoming MemoryAccesses that come before the
+ // MemoryPhi. Otherwise we could discover candidates that do not
+ // strictly dominate our starting def.
+ if (State.PostOrderNumbers[IncomingBlock] >
+ State.PostOrderNumbers[PhiBlock])
+ ToCheck.insert(IncomingAccess);
+ }
+ continue;
+ }
auto *NextDef = cast<MemoryDef>(EarlierAccess);
- Instruction *NI = NextDef->getMemoryInst();
- LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
- ToCheck.insert(NextDef->getDefiningAccess());
+ Instruction *NI = NextDef->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
+ ToCheck.insert(NextDef->getDefiningAccess());
NumGetDomMemoryDefPassed++;
-
- if (!DebugCounter::shouldExecute(MemorySSACounter))
- continue;
-
- MemoryLocation NILoc = *State.getLocForWriteEx(NI);
-
+
+ if (!DebugCounter::shouldExecute(MemorySSACounter))
+ continue;
+
+ MemoryLocation NILoc = *State.getLocForWriteEx(NI);
+
if (IsMemTerm) {
const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
if (SILocUnd != NIUnd)
- continue;
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
- ++NumFastStores;
- MadeChange = true;
- } else {
- // Check if NI overwrites SI.
- int64_t InstWriteOffset, DepWriteOffset;
+ continue;
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ } else {
+ // Check if NI overwrites SI.
+ int64_t InstWriteOffset, DepWriteOffset;
OverwriteResult OR =
isOverwrite(SI, NI, SILoc, NILoc, State.DL, TLI, DepWriteOffset,
InstWriteOffset, State.BatchAA, &F);
@@ -2580,10 +2580,10 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
OR = isPartialOverwrite(SILoc, NILoc, DepWriteOffset, InstWriteOffset,
NI, IOL);
}
-
- if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
- auto *Earlier = dyn_cast<StoreInst>(NI);
- auto *Later = dyn_cast<StoreInst>(SI);
+
+ if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(NI);
+ auto *Later = dyn_cast<StoreInst>(SI);
// We are re-using tryToMergePartialOverlappingStores, which requires
// Earlier to dominate Later.
// TODO: implement tryToMergePartialOverlappingStores using MemorySSA.
@@ -2591,12 +2591,12 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
if (Constant *Merged = tryToMergePartialOverlappingStores(
Earlier, Later, InstWriteOffset, DepWriteOffset, State.DL,
State.BatchAA, &DT)) {
-
+
// Update stored value of earlier store to merged constant.
Earlier->setOperand(0, Merged);
++NumModifiedStores;
MadeChange = true;
-
+
Shortend = true;
// Remove later store and remove any outstanding overlap intervals
// for the updated store.
@@ -2606,18 +2606,18 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
I->second.erase(Earlier);
break;
}
- }
- }
-
- if (OR == OW_Complete) {
- LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
- << "\n KILLER: " << *SI << '\n');
- State.deleteDeadInstruction(NI);
- ++NumFastStores;
- MadeChange = true;
- }
- }
- }
+ }
+ }
+
+ if (OR == OW_Complete) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ }
+ }
+ }
// Check if the store is a no-op.
if (!Shortend && isRemovable(SI) &&
@@ -2628,135 +2628,135 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
MadeChange = true;
continue;
}
- }
-
- if (EnablePartialOverwriteTracking)
- for (auto &KV : State.IOLs)
+ }
+
+ if (EnablePartialOverwriteTracking)
+ for (auto &KV : State.IOLs)
MadeChange |= removePartiallyOverlappedStores(State.DL, KV.second, TLI);
-
- MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
- return MadeChange;
-}
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// DSE Pass
-//===----------------------------------------------------------------------===//
-PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
-
- bool Changed = false;
- if (EnableMemorySSA) {
- MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
-
- Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
- } else {
- MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
-
- Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
- }
-
-#ifdef LLVM_ENABLE_STATS
- if (AreStatisticsEnabled())
- for (auto &I : instructions(F))
- NumRemainingStores += isa<StoreInst>(&I);
-#endif
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- if (EnableMemorySSA)
- PA.preserve<MemorySSAAnalysis>();
- else
- PA.preserve<MemoryDependenceAnalysis>();
- return PA;
-}
-
-namespace {
-
-/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
-class DSELegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- DSELegacyPass() : FunctionPass(ID) {
- initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- bool Changed = false;
- if (EnableMemorySSA) {
- MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
- PostDominatorTree &PDT =
- getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
-
- Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
- } else {
- MemoryDependenceResults &MD =
- getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
-
- Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
- }
-
-#ifdef LLVM_ENABLE_STATS
- if (AreStatisticsEnabled())
- for (auto &I : instructions(F))
- NumRemainingStores += isa<StoreInst>(&I);
-#endif
-
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-
- if (EnableMemorySSA) {
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- } else {
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
- }
- }
-};
-
-} // end anonymous namespace
-
-char DSELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
- false)
-
-FunctionPass *llvm::createDeadStoreEliminationPass() {
- return new DSELegacyPass();
-}
+
+ MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+ return MadeChange;
+}
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// DSE Pass
+//===----------------------------------------------------------------------===//
+PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (EnableMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ else
+ PA.preserve<MemoryDependenceAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// A legacy pass for the legacy pass manager that wraps \c DSEPass.
+class DSELegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ DSELegacyPass() : FunctionPass(ID) {
+ initializeDSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ PostDominatorTree &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD =
+ getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ if (EnableMemorySSA) {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ } else {
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
+ }
+};
+
+} // end anonymous namespace
+
+char DSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
+ false)
+
+FunctionPass *llvm::createDeadStoreEliminationPass() {
+ return new DSELegacyPass();
+}
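
For readers who want to exercise the restored new-pass-manager path by itself, the snippet below is a minimal sketch (not part of this commit) of scheduling DSEPass with the LLVM 12 pass-manager APIs; the function name runDSE and the assumption that a populated llvm::Module M already exists are illustrative.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/DeadStoreElimination.h"
#include <utility>

using namespace llvm;

// Run DSE over every function in M using the new pass manager.
void runDSE(Module &M) {
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  PassBuilder PB;
  // Register the analyses (AAManager, DominatorTreeAnalysis, MemorySSAAnalysis,
  // TargetLibraryAnalysis, ...) that DSEPass::run requests from the FAM.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(DSEPass());

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}
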
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
index 0132ac83bb..3c6c444d66 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -1,399 +1,399 @@
-//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists and/or decomposes/recomposes integer division and remainder
-// instructions to enable CFG improvements and better codegen.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/DivRemPairs.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "div-rem-pairs"
-STATISTIC(NumPairs, "Number of div/rem pairs");
-STATISTIC(NumRecomposed, "Number of instructions recomposed");
-STATISTIC(NumHoisted, "Number of instructions hoisted");
-STATISTIC(NumDecomposed, "Number of instructions decomposed");
-DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
- "Controls transformations in div-rem-pairs pass");
-
-namespace {
-struct ExpandedMatch {
- DivRemMapKey Key;
- Instruction *Value;
-};
-} // namespace
-
-/// See if we can match: (which is the form we expand into)
-/// X - ((X ?/ Y) * Y)
-/// which is equivalent to:
-/// X ?% Y
-static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
- Value *Dividend, *XroundedDownToMultipleOfY;
- if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY))))
- return llvm::None;
-
- Value *Divisor;
- Instruction *Div;
- // Look for ((X / Y) * Y)
- if (!match(
- XroundedDownToMultipleOfY,
- m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)),
- m_Instruction(Div)),
- m_Deferred(Divisor))))
- return llvm::None;
-
- ExpandedMatch M;
- M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv;
- M.Key.Dividend = Dividend;
- M.Key.Divisor = Divisor;
- M.Value = &I;
- return M;
-}
-
-namespace {
-/// A thin wrapper to store two values that we matched as div-rem pair.
-/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
-struct DivRemPairWorklistEntry {
- /// The actual udiv/sdiv instruction. Source of truth.
- AssertingVH<Instruction> DivInst;
-
- /// The instruction that we have matched as a remainder instruction.
- /// Should only be used as Value, don't introspect it.
- AssertingVH<Instruction> RemInst;
-
- DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
- : DivInst(DivInst_), RemInst(RemInst_) {
- assert((DivInst->getOpcode() == Instruction::UDiv ||
- DivInst->getOpcode() == Instruction::SDiv) &&
- "Not a division.");
- assert(DivInst->getType() == RemInst->getType() && "Types should match.");
- // We can't check anything else about remainder instruction,
- // it's not strictly required to be a urem/srem.
- }
-
- /// The type for this pair, identical for both the div and rem.
- Type *getType() const { return DivInst->getType(); }
-
- /// Is this pair signed or unsigned?
- bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
-
-  /// In this pair, what are the dividend and divisor?
- Value *getDividend() const { return DivInst->getOperand(0); }
- Value *getDivisor() const { return DivInst->getOperand(1); }
-
- bool isRemExpanded() const {
- switch (RemInst->getOpcode()) {
- case Instruction::SRem:
- case Instruction::URem:
- return false; // single 'rem' instruction - unexpanded form.
- default:
- return true; // anything else means we have remainder in expanded form.
- }
- }
-};
-} // namespace
-using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
-
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). Place those pairs into a worklist for further
-/// processing. This indirection is needed because we have to use TrackingVH<>
-/// because we will be doing RAUW, and if one of the rem instructions we change
-/// happens to be an input to another div/rem in the maps, we'd have problems.
-static DivRemWorklistTy getWorklist(Function &F) {
- // Insert all divide and remainder instructions into maps keyed by their
- // operands and opcode (signed or unsigned).
- DenseMap<DivRemMapKey, Instruction *> DivMap;
- // Use a MapVector for RemMap so that instructions are moved/inserted in a
- // deterministic order.
- MapVector<DivRemMapKey, Instruction *> RemMap;
- for (auto &BB : F) {
- for (auto &I : BB) {
- if (I.getOpcode() == Instruction::SDiv)
- DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::UDiv)
- DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::SRem)
- RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
- else if (I.getOpcode() == Instruction::URem)
- RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
- else if (auto Match = matchExpandedRem(I))
- RemMap[Match->Key] = Match->Value;
- }
- }
-
- // We'll accumulate the matching pairs of div-rem instructions here.
- DivRemWorklistTy Worklist;
-
- // We can iterate over either map because we are only looking for matched
- // pairs. Choose remainders for efficiency because they are usually even more
- // rare than division.
- for (auto &RemPair : RemMap) {
- // Find the matching division instruction from the division map.
+//===- DivRemPairs.cpp - Hoist/[dr]ecompose division and remainder --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists and/or decomposes/recomposes integer division and remainder
+// instructions to enable CFG improvements and better codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "div-rem-pairs"
+STATISTIC(NumPairs, "Number of div/rem pairs");
+STATISTIC(NumRecomposed, "Number of instructions recomposed");
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumDecomposed, "Number of instructions decomposed");
+DEBUG_COUNTER(DRPCounter, "div-rem-pairs-transform",
+ "Controls transformations in div-rem-pairs pass");
+
+namespace {
+struct ExpandedMatch {
+ DivRemMapKey Key;
+ Instruction *Value;
+};
+} // namespace
+
+/// See if we can match: (which is the form we expand into)
+/// X - ((X ?/ Y) * Y)
+/// which is equivalent to:
+/// X ?% Y
+static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
+ Value *Dividend, *XroundedDownToMultipleOfY;
+ if (!match(&I, m_Sub(m_Value(Dividend), m_Value(XroundedDownToMultipleOfY))))
+ return llvm::None;
+
+ Value *Divisor;
+ Instruction *Div;
+ // Look for ((X / Y) * Y)
+ if (!match(
+ XroundedDownToMultipleOfY,
+ m_c_Mul(m_CombineAnd(m_IDiv(m_Specific(Dividend), m_Value(Divisor)),
+ m_Instruction(Div)),
+ m_Deferred(Divisor))))
+ return llvm::None;
+
+ ExpandedMatch M;
+ M.Key.SignedOp = Div->getOpcode() == Instruction::SDiv;
+ M.Key.Dividend = Dividend;
+ M.Key.Divisor = Divisor;
+ M.Value = &I;
+ return M;
+}
+
+namespace {
+/// A thin wrapper to store two values that we matched as div-rem pair.
+/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
+struct DivRemPairWorklistEntry {
+ /// The actual udiv/sdiv instruction. Source of truth.
+ AssertingVH<Instruction> DivInst;
+
+ /// The instruction that we have matched as a remainder instruction.
+ /// Should only be used as Value, don't introspect it.
+ AssertingVH<Instruction> RemInst;
+
+ DivRemPairWorklistEntry(Instruction *DivInst_, Instruction *RemInst_)
+ : DivInst(DivInst_), RemInst(RemInst_) {
+ assert((DivInst->getOpcode() == Instruction::UDiv ||
+ DivInst->getOpcode() == Instruction::SDiv) &&
+ "Not a division.");
+ assert(DivInst->getType() == RemInst->getType() && "Types should match.");
+ // We can't check anything else about remainder instruction,
+ // it's not strictly required to be a urem/srem.
+ }
+
+ /// The type for this pair, identical for both the div and rem.
+ Type *getType() const { return DivInst->getType(); }
+
+ /// Is this pair signed or unsigned?
+ bool isSigned() const { return DivInst->getOpcode() == Instruction::SDiv; }
+
+  /// In this pair, what are the dividend and divisor?
+ Value *getDividend() const { return DivInst->getOperand(0); }
+ Value *getDivisor() const { return DivInst->getOperand(1); }
+
+ bool isRemExpanded() const {
+ switch (RemInst->getOpcode()) {
+ case Instruction::SRem:
+ case Instruction::URem:
+ return false; // single 'rem' instruction - unexpanded form.
+ default:
+ return true; // anything else means we have remainder in expanded form.
+ }
+ }
+};
+} // namespace
+using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). Place those pairs into a worklist for further
+/// processing. This indirection is needed because we have to use TrackingVH<>
+/// because we will be doing RAUW, and if one of the rem instructions we change
+/// happens to be an input to another div/rem in the maps, we'd have problems.
+static DivRemWorklistTy getWorklist(Function &F) {
+ // Insert all divide and remainder instructions into maps keyed by their
+ // operands and opcode (signed or unsigned).
+ DenseMap<DivRemMapKey, Instruction *> DivMap;
+ // Use a MapVector for RemMap so that instructions are moved/inserted in a
+ // deterministic order.
+ MapVector<DivRemMapKey, Instruction *> RemMap;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (I.getOpcode() == Instruction::SDiv)
+ DivMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::UDiv)
+ DivMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::SRem)
+ RemMap[DivRemMapKey(true, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (I.getOpcode() == Instruction::URem)
+ RemMap[DivRemMapKey(false, I.getOperand(0), I.getOperand(1))] = &I;
+ else if (auto Match = matchExpandedRem(I))
+ RemMap[Match->Key] = Match->Value;
+ }
+ }
+
+ // We'll accumulate the matching pairs of div-rem instructions here.
+ DivRemWorklistTy Worklist;
+
+ // We can iterate over either map because we are only looking for matched
+ // pairs. Choose remainders for efficiency because they are usually even more
+ // rare than division.
+ for (auto &RemPair : RemMap) {
+ // Find the matching division instruction from the division map.
auto It = DivMap.find(RemPair.first);
if (It == DivMap.end())
- continue;
-
- // We have a matching pair of div/rem instructions.
- NumPairs++;
- Instruction *RemInst = RemPair.second;
-
- // Place it in the worklist.
+ continue;
+
+ // We have a matching pair of div/rem instructions.
+ NumPairs++;
+ Instruction *RemInst = RemPair.second;
+
+ // Place it in the worklist.
Worklist.emplace_back(It->second, RemInst);
- }
-
- return Worklist;
-}
-
-/// Find matching pairs of integer div/rem ops (they have the same numerator,
-/// denominator, and signedness). If they exist in different basic blocks, bring
-/// them together by hoisting or replace the common division operation that is
-/// implicit in the remainder:
-/// X % Y <--> X - ((X / Y) * Y).
-///
-/// We can largely ignore the normal safety and cost constraints on speculation
-/// of these ops when we find a matching pair. This is because we are already
-/// guaranteed that any exceptions and most cost are already incurred by the
-/// first member of the pair.
-///
-/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
-/// SimplifyCFG, but it's split off on its own because it's different enough
-/// that it doesn't quite match the stated objectives of those passes.
-static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT) {
- bool Changed = false;
-
- // Get the matching pairs of div-rem instructions. We want this extra
- // indirection to avoid dealing with having to RAUW the keys of the maps.
- DivRemWorklistTy Worklist = getWorklist(F);
-
- // Process each entry in the worklist.
- for (DivRemPairWorklistEntry &E : Worklist) {
- if (!DebugCounter::shouldExecute(DRPCounter))
- continue;
-
- bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
-
- auto &DivInst = E.DivInst;
- auto &RemInst = E.RemInst;
-
- const bool RemOriginallyWasInExpandedForm = E.isRemExpanded();
- (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning
-
- if (HasDivRemOp && E.isRemExpanded()) {
- // The target supports div+rem but the rem is expanded.
- // We should recompose it first.
- Value *X = E.getDividend();
- Value *Y = E.getDivisor();
- Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y)
- : BinaryOperator::CreateURem(X, Y);
- // Note that we place it right next to the original expanded instruction,
-      // and let later handling move it if needed.
- RealRem->setName(RemInst->getName() + ".recomposed");
- RealRem->insertAfter(RemInst);
- Instruction *OrigRemInst = RemInst;
- // Update AssertingVH<> with new instruction so it doesn't assert.
- RemInst = RealRem;
- // And replace the original instruction with the new one.
- OrigRemInst->replaceAllUsesWith(RealRem);
- OrigRemInst->eraseFromParent();
- NumRecomposed++;
- // Note that we have left ((X / Y) * Y) around.
- // If it had other uses we could rewrite it as X - X % Y
- Changed = true;
- }
-
- assert((!E.isRemExpanded() || !HasDivRemOp) &&
- "*If* the target supports div-rem, then by now the RemInst *is* "
- "Instruction::[US]Rem.");
-
- // If the target supports div+rem and the instructions are in the same block
- // already, there's nothing to do. The backend should handle this. If the
- // target does not support div+rem, then we will decompose the rem.
- if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
- continue;
-
- bool DivDominates = DT.dominates(DivInst, RemInst);
- if (!DivDominates && !DT.dominates(RemInst, DivInst)) {
-      // We have a matching div-rem pair, but the instructions are in two
-      // different blocks, neither of which dominates the other.
- // FIXME: We could hoist both ops to the common predecessor block?
- continue;
- }
-
- // The target does not have a single div/rem operation,
- // and the rem is already in expanded form. Nothing to do.
- if (!HasDivRemOp && E.isRemExpanded())
- continue;
-
- if (HasDivRemOp) {
- // The target has a single div/rem operation. Hoist the lower instruction
- // to make the matched pair visible to the backend.
- if (DivDominates)
- RemInst->moveAfter(DivInst);
- else
- DivInst->moveAfter(RemInst);
- NumHoisted++;
- } else {
- // The target does not have a single div/rem operation,
-      // and the rem is *not* in an already-expanded form.
- // Decompose the remainder calculation as:
- // X % Y --> X - ((X / Y) * Y).
-
- assert(!RemOriginallyWasInExpandedForm &&
- "We should not be expanding if the rem was in expanded form to "
- "begin with.");
-
- Value *X = E.getDividend();
- Value *Y = E.getDivisor();
- Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
- Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
-
- // If the remainder dominates, then hoist the division up to that block:
- //
- // bb1:
- // %rem = srem %x, %y
- // bb2:
- // %div = sdiv %x, %y
- // -->
- // bb1:
- // %div = sdiv %x, %y
- // %mul = mul %div, %y
- // %rem = sub %x, %mul
- //
- // If the division dominates, it's already in the right place. The mul+sub
- // will be in a different block because we don't assume that they are
- // cheap to speculatively execute:
- //
- // bb1:
- // %div = sdiv %x, %y
- // bb2:
- // %rem = srem %x, %y
- // -->
- // bb1:
- // %div = sdiv %x, %y
- // bb2:
- // %mul = mul %div, %y
- // %rem = sub %x, %mul
- //
- // If the div and rem are in the same block, we do the same transform,
- // but any code movement would be within the same block.
-
- if (!DivDominates)
- DivInst->moveBefore(RemInst);
- Mul->insertAfter(RemInst);
- Sub->insertAfter(Mul);
-
- // If X can be undef, X should be frozen first.
- // For example, let's assume that Y = 1 & X = undef:
- // %div = sdiv undef, 1 // %div = undef
- // %rem = srem undef, 1 // %rem = 0
- // =>
- // %div = sdiv undef, 1 // %div = undef
- // %mul = mul %div, 1 // %mul = undef
- // %rem = sub %x, %mul // %rem = undef - undef = undef
- // If X is not frozen, %rem becomes undef after transformation.
-      // TODO: We need an undef-specific checking function in ValueTracking
+ }
+
+ return Worklist;
+}
+
+/// Find matching pairs of integer div/rem ops (they have the same numerator,
+/// denominator, and signedness). If they exist in different basic blocks, bring
+/// them together by hoisting or replace the common division operation that is
+/// implicit in the remainder:
+/// X % Y <--> X - ((X / Y) * Y).
+///
+/// We can largely ignore the normal safety and cost constraints on speculation
+/// of these ops when we find a matching pair. This is because we are already
+/// guaranteed that any exceptions and most cost are already incurred by the
+/// first member of the pair.
+///
+/// Note: This transform could be an oddball enhancement to EarlyCSE, GVN, or
+/// SimplifyCFG, but it's split off on its own because it's different enough
+/// that it doesn't quite match the stated objectives of those passes.
+static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT) {
+ bool Changed = false;
+
+ // Get the matching pairs of div-rem instructions. We want this extra
+ // indirection to avoid dealing with having to RAUW the keys of the maps.
+ DivRemWorklistTy Worklist = getWorklist(F);
+
+ // Process each entry in the worklist.
+ for (DivRemPairWorklistEntry &E : Worklist) {
+ if (!DebugCounter::shouldExecute(DRPCounter))
+ continue;
+
+ bool HasDivRemOp = TTI.hasDivRemOp(E.getType(), E.isSigned());
+
+ auto &DivInst = E.DivInst;
+ auto &RemInst = E.RemInst;
+
+ const bool RemOriginallyWasInExpandedForm = E.isRemExpanded();
+ (void)RemOriginallyWasInExpandedForm; // suppress unused variable warning
+
+ if (HasDivRemOp && E.isRemExpanded()) {
+ // The target supports div+rem but the rem is expanded.
+ // We should recompose it first.
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
+ Instruction *RealRem = E.isSigned() ? BinaryOperator::CreateSRem(X, Y)
+ : BinaryOperator::CreateURem(X, Y);
+ // Note that we place it right next to the original expanded instruction,
+      // and let later handling move it if needed.
+ RealRem->setName(RemInst->getName() + ".recomposed");
+ RealRem->insertAfter(RemInst);
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = RealRem;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(RealRem);
+ OrigRemInst->eraseFromParent();
+ NumRecomposed++;
+ // Note that we have left ((X / Y) * Y) around.
+ // If it had other uses we could rewrite it as X - X % Y
+ Changed = true;
+ }
+
+ assert((!E.isRemExpanded() || !HasDivRemOp) &&
+ "*If* the target supports div-rem, then by now the RemInst *is* "
+ "Instruction::[US]Rem.");
+
+ // If the target supports div+rem and the instructions are in the same block
+ // already, there's nothing to do. The backend should handle this. If the
+ // target does not support div+rem, then we will decompose the rem.
+ if (HasDivRemOp && RemInst->getParent() == DivInst->getParent())
+ continue;
+
+ bool DivDominates = DT.dominates(DivInst, RemInst);
+ if (!DivDominates && !DT.dominates(RemInst, DivInst)) {
+      // We have a matching div-rem pair, but the instructions are in two
+      // different blocks, neither of which dominates the other.
+ // FIXME: We could hoist both ops to the common predecessor block?
+ continue;
+ }
+
+ // The target does not have a single div/rem operation,
+ // and the rem is already in expanded form. Nothing to do.
+ if (!HasDivRemOp && E.isRemExpanded())
+ continue;
+
+ if (HasDivRemOp) {
+ // The target has a single div/rem operation. Hoist the lower instruction
+ // to make the matched pair visible to the backend.
+ if (DivDominates)
+ RemInst->moveAfter(DivInst);
+ else
+ DivInst->moveAfter(RemInst);
+ NumHoisted++;
+ } else {
+ // The target does not have a single div/rem operation,
+      // and the rem is *not* in an already-expanded form.
+ // Decompose the remainder calculation as:
+ // X % Y --> X - ((X / Y) * Y).
+
+ assert(!RemOriginallyWasInExpandedForm &&
+ "We should not be expanding if the rem was in expanded form to "
+ "begin with.");
+
+ Value *X = E.getDividend();
+ Value *Y = E.getDivisor();
+ Instruction *Mul = BinaryOperator::CreateMul(DivInst, Y);
+ Instruction *Sub = BinaryOperator::CreateSub(X, Mul);
+
+ // If the remainder dominates, then hoist the division up to that block:
+ //
+ // bb1:
+ // %rem = srem %x, %y
+ // bb2:
+ // %div = sdiv %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the division dominates, it's already in the right place. The mul+sub
+ // will be in a different block because we don't assume that they are
+ // cheap to speculatively execute:
+ //
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %rem = srem %x, %y
+ // -->
+ // bb1:
+ // %div = sdiv %x, %y
+ // bb2:
+ // %mul = mul %div, %y
+ // %rem = sub %x, %mul
+ //
+ // If the div and rem are in the same block, we do the same transform,
+ // but any code movement would be within the same block.
+
+ if (!DivDominates)
+ DivInst->moveBefore(RemInst);
+ Mul->insertAfter(RemInst);
+ Sub->insertAfter(Mul);
+
+ // If X can be undef, X should be frozen first.
+ // For example, let's assume that Y = 1 & X = undef:
+ // %div = sdiv undef, 1 // %div = undef
+ // %rem = srem undef, 1 // %rem = 0
+ // =>
+ // %div = sdiv undef, 1 // %div = undef
+ // %mul = mul %div, 1 // %mul = undef
+ // %rem = sub %x, %mul // %rem = undef - undef = undef
+ // If X is not frozen, %rem becomes undef after transformation.
+      // TODO: We need an undef-specific checking function in ValueTracking
if (!isGuaranteedNotToBeUndefOrPoison(X, nullptr, DivInst, &DT)) {
- auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
- DivInst->setOperand(0, FrX);
- Sub->setOperand(0, FrX);
- }
- // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
- // but %rem in tgt can be one of many integer values.
+ auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
+ DivInst->setOperand(0, FrX);
+ Sub->setOperand(0, FrX);
+ }
+ // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
+ // but %rem in tgt can be one of many integer values.
if (!isGuaranteedNotToBeUndefOrPoison(Y, nullptr, DivInst, &DT)) {
- auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
- DivInst->setOperand(1, FrY);
- Mul->setOperand(1, FrY);
- }
-
- // Now kill the explicit remainder. We have replaced it with:
-      //   (sub X, (mul (div X, Y), Y))
- Sub->setName(RemInst->getName() + ".decomposed");
- Instruction *OrigRemInst = RemInst;
- // Update AssertingVH<> with new instruction so it doesn't assert.
- RemInst = Sub;
- // And replace the original instruction with the new one.
- OrigRemInst->replaceAllUsesWith(Sub);
- OrigRemInst->eraseFromParent();
- NumDecomposed++;
- }
- Changed = true;
- }
-
- return Changed;
-}
-
-// Pass manager boilerplate below here.
-
-namespace {
-struct DivRemPairsLegacyPass : public FunctionPass {
- static char ID;
- DivRemPairsLegacyPass() : FunctionPass(ID) {
- initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return optimizeDivRem(F, TTI, DT);
- }
-};
-} // namespace
-
-char DivRemPairsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
- "Hoist/decompose integer division and remainder", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
- "Hoist/decompose integer division and remainder", false,
- false)
-FunctionPass *llvm::createDivRemPairsPass() {
- return new DivRemPairsLegacyPass();
-}
-
-PreservedAnalyses DivRemPairsPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- if (!optimizeDivRem(F, TTI, DT))
- return PreservedAnalyses::all();
- // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+ auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
+ DivInst->setOperand(1, FrY);
+ Mul->setOperand(1, FrY);
+ }
+
+ // Now kill the explicit remainder. We have replaced it with:
+      //   (sub X, (mul (div X, Y), Y))
+ Sub->setName(RemInst->getName() + ".decomposed");
+ Instruction *OrigRemInst = RemInst;
+ // Update AssertingVH<> with new instruction so it doesn't assert.
+ RemInst = Sub;
+ // And replace the original instruction with the new one.
+ OrigRemInst->replaceAllUsesWith(Sub);
+ OrigRemInst->eraseFromParent();
+ NumDecomposed++;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+struct DivRemPairsLegacyPass : public FunctionPass {
+ static char ID;
+ DivRemPairsLegacyPass() : FunctionPass(ID) {
+ initializeDivRemPairsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return optimizeDivRem(F, TTI, DT);
+ }
+};
+} // namespace
+
+char DivRemPairsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(DivRemPairsLegacyPass, "div-rem-pairs",
+ "Hoist/decompose integer division and remainder", false,
+ false)
+FunctionPass *llvm::createDivRemPairsPass() {
+ return new DivRemPairsLegacyPass();
+}
+
+PreservedAnalyses DivRemPairsPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ if (!optimizeDivRem(F, TTI, DT))
+ return PreservedAnalyses::all();
+ // TODO: This pass just hoists/replaces math ops - all analyses are preserved?
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
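
To make the decomposition path above concrete, here is a short IRBuilder sketch (illustrative, not taken from the patch) that emits the same mul/sub pattern the pass creates when the target has no combined div/rem instruction; decomposeURem, B, X and Y are assumed names, and the freeze of possibly-undef operands is omitted for brevity.

#include "llvm/IR/IRBuilder.h"

// Builds X - ((X udiv Y) * Y); in the pass itself the udiv already exists and
// is reused rather than re-created.
llvm::Value *decomposeURem(llvm::IRBuilder<> &B, llvm::Value *X, llvm::Value *Y) {
  llvm::Value *Div = B.CreateUDiv(X, Y, "div");   // %div = udiv %x, %y
  llvm::Value *Mul = B.CreateMul(Div, Y, "mul");  // %mul = mul %div, %y
  return B.CreateSub(X, Mul, "rem.decomposed");   // %rem = sub %x, %mul
}
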
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
index 07a84445eb..180a82917f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -1,274 +1,274 @@
-//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs a simple dominator tree walk that eliminates trivially
-// redundant instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/EarlyCSE.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/ScopedHashTable.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <deque>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "early-cse"
-
-STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
-STATISTIC(NumCSE, "Number of instructions CSE'd");
-STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
-STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
-STATISTIC(NumCSECall, "Number of call instructions CSE'd");
-STATISTIC(NumDSE, "Number of trivial dead stores removed");
-
-DEBUG_COUNTER(CSECounter, "early-cse",
- "Controls which instructions are removed");
-
-static cl::opt<unsigned> EarlyCSEMssaOptCap(
- "earlycse-mssa-optimization-cap", cl::init(500), cl::Hidden,
- cl::desc("Enable imprecision in EarlyCSE in pathological cases, in exchange "
- "for faster compile. Caps the MemorySSA clobbering calls."));
-
-static cl::opt<bool> EarlyCSEDebugHash(
- "earlycse-debug-hash", cl::init(false), cl::Hidden,
- cl::desc("Perform extra assertion checking to verify that SimpleValue's hash "
- "function is well-behaved w.r.t. its isEqual predicate"));
-
-//===----------------------------------------------------------------------===//
-// SimpleValue
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// Struct representing the available values in the scoped hash table.
-struct SimpleValue {
- Instruction *Inst;
-
- SimpleValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
-
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static bool canHandle(Instruction *Inst) {
- // This can only handle non-void readnone functions.
- if (CallInst *CI = dyn_cast<CallInst>(Inst))
- return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
- return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
- isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
- }
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<SimpleValue> {
- static inline SimpleValue getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline SimpleValue getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(SimpleValue Val);
- static bool isEqual(SimpleValue LHS, SimpleValue RHS);
-};
-
-} // end namespace llvm
-
-/// Match a 'select' including an optional 'not' of the condition.
-static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
- Value *&B,
- SelectPatternFlavor &Flavor) {
- // Return false if V is not even a select.
- if (!match(V, m_Select(m_Value(Cond), m_Value(A), m_Value(B))))
- return false;
-
- // Look through a 'not' of the condition operand by swapping A/B.
- Value *CondNot;
- if (match(Cond, m_Not(m_Value(CondNot)))) {
- Cond = CondNot;
- std::swap(A, B);
- }
-
+//===- EarlyCSE.cpp - Simple and fast CSE pass ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a simple dominator tree walk that eliminates trivially
+// redundant instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/EarlyCSE.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <deque>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "early-cse"
+
+STATISTIC(NumSimplify, "Number of instructions simplified or DCE'd");
+STATISTIC(NumCSE, "Number of instructions CSE'd");
+STATISTIC(NumCSECVP, "Number of compare instructions CVP'd");
+STATISTIC(NumCSELoad, "Number of load instructions CSE'd");
+STATISTIC(NumCSECall, "Number of call instructions CSE'd");
+STATISTIC(NumDSE, "Number of trivial dead stores removed");
+
+DEBUG_COUNTER(CSECounter, "early-cse",
+ "Controls which instructions are removed");
+
+static cl::opt<unsigned> EarlyCSEMssaOptCap(
+ "earlycse-mssa-optimization-cap", cl::init(500), cl::Hidden,
+ cl::desc("Enable imprecision in EarlyCSE in pathological cases, in exchange "
+ "for faster compile. Caps the MemorySSA clobbering calls."));
+
+static cl::opt<bool> EarlyCSEDebugHash(
+ "earlycse-debug-hash", cl::init(false), cl::Hidden,
+ cl::desc("Perform extra assertion checking to verify that SimpleValue's hash "
+ "function is well-behaved w.r.t. its isEqual predicate"));
+
+//===----------------------------------------------------------------------===//
+// SimpleValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Struct representing the available values in the scoped hash table.
+struct SimpleValue {
+ Instruction *Inst;
+
+ SimpleValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // This can only handle non-void readnone functions.
+ if (CallInst *CI = dyn_cast<CallInst>(Inst))
+ return CI->doesNotAccessMemory() && !CI->getType()->isVoidTy();
+ return isa<CastInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<BinaryOperator>(Inst) || isa<GetElementPtrInst>(Inst) ||
+ isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
+ isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<SimpleValue> {
+ static inline SimpleValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline SimpleValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(SimpleValue Val);
+ static bool isEqual(SimpleValue LHS, SimpleValue RHS);
+};
+
+} // end namespace llvm
+
+/// Match a 'select' including an optional 'not' of the condition.
+static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
+ Value *&B,
+ SelectPatternFlavor &Flavor) {
+ // Return false if V is not even a select.
+ if (!match(V, m_Select(m_Value(Cond), m_Value(A), m_Value(B))))
+ return false;
+
+ // Look through a 'not' of the condition operand by swapping A/B.
+ Value *CondNot;
+ if (match(Cond, m_Not(m_Value(CondNot)))) {
+ Cond = CondNot;
+ std::swap(A, B);
+ }
+
// Match canonical forms of min/max. We are not using ValueTracking's
- // more powerful matchSelectPattern() because it may rely on instruction flags
- // such as "nsw". That would be incompatible with the current hashing
- // mechanism that may remove flags to increase the likelihood of CSE.
-
- Flavor = SPF_UNKNOWN;
- CmpInst::Predicate Pred;
-
- if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
- // Check for commuted variants of min/max by swapping predicate.
- // If we do not match the standard or commuted patterns, this is not a
- // recognized form of min/max, but it is still a select, so return true.
- if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
- return true;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- switch (Pred) {
- case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
- case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
- case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
- case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
+ // more powerful matchSelectPattern() because it may rely on instruction flags
+ // such as "nsw". That would be incompatible with the current hashing
+ // mechanism that may remove flags to increase the likelihood of CSE.
+
+ Flavor = SPF_UNKNOWN;
+ CmpInst::Predicate Pred;
+
+ if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
+ // Check for commuted variants of min/max by swapping predicate.
+ // If we do not match the standard or commuted patterns, this is not a
+ // recognized form of min/max, but it is still a select, so return true.
+ if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
+ return true;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ switch (Pred) {
+ case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
+ case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
+ case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
+ case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
// Non-strict inequalities.
case CmpInst::ICMP_ULE: Flavor = SPF_UMIN; break;
case CmpInst::ICMP_UGE: Flavor = SPF_UMAX; break;
case CmpInst::ICMP_SLE: Flavor = SPF_SMIN; break;
case CmpInst::ICMP_SGE: Flavor = SPF_SMAX; break;
- default: break;
- }
-
- return true;
-}
-
-static unsigned getHashValueImpl(SimpleValue Val) {
- Instruction *Inst = Val.Inst;
- // Hash in all of the operands as pointers.
- if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
- if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
- std::swap(LHS, RHS);
-
- return hash_combine(BinOp->getOpcode(), LHS, RHS);
- }
-
- if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
- // Compares can be commuted by swapping the comparands and
- // updating the predicate. Choose the form that has the
- // comparands in sorted order, or in the case of a tie, the
- // one with the lower predicate.
- Value *LHS = CI->getOperand(0);
- Value *RHS = CI->getOperand(1);
- CmpInst::Predicate Pred = CI->getPredicate();
- CmpInst::Predicate SwappedPred = CI->getSwappedPredicate();
- if (std::tie(LHS, Pred) > std::tie(RHS, SwappedPred)) {
- std::swap(LHS, RHS);
- Pred = SwappedPred;
- }
- return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
- }
-
- // Hash general selects to allow matching commuted true/false operands.
- SelectPatternFlavor SPF;
- Value *Cond, *A, *B;
- if (matchSelectWithOptionalNotCond(Inst, Cond, A, B, SPF)) {
+ default: break;
+ }
+
+ return true;
+}
+
+static unsigned getHashValueImpl(SimpleValue Val) {
+ Instruction *Inst = Val.Inst;
+ // Hash in all of the operands as pointers.
+ if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst)) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ if (BinOp->isCommutative() && BinOp->getOperand(0) > BinOp->getOperand(1))
+ std::swap(LHS, RHS);
+
+ return hash_combine(BinOp->getOpcode(), LHS, RHS);
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+ // Compares can be commuted by swapping the comparands and
+ // updating the predicate. Choose the form that has the
+ // comparands in sorted order, or in the case of a tie, the
+ // one with the lower predicate.
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ CmpInst::Predicate Pred = CI->getPredicate();
+ CmpInst::Predicate SwappedPred = CI->getSwappedPredicate();
+ if (std::tie(LHS, Pred) > std::tie(RHS, SwappedPred)) {
+ std::swap(LHS, RHS);
+ Pred = SwappedPred;
+ }
+ return hash_combine(Inst->getOpcode(), Pred, LHS, RHS);
+ }
+
+ // Hash general selects to allow matching commuted true/false operands.
+ SelectPatternFlavor SPF;
+ Value *Cond, *A, *B;
+ if (matchSelectWithOptionalNotCond(Inst, Cond, A, B, SPF)) {
// Hash min/max (cmp + select) to allow for commuted operands.
- // Min/max may also have non-canonical compare predicate (eg, the compare for
- // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
- // compare.
- // TODO: We should also detect FP min/max.
- if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
- SPF == SPF_UMIN || SPF == SPF_UMAX) {
- if (A > B)
- std::swap(A, B);
- return hash_combine(Inst->getOpcode(), SPF, A, B);
- }
-
- // Hash general selects to allow matching commuted true/false operands.
-
- // If we do not have a compare as the condition, just hash in the condition.
- CmpInst::Predicate Pred;
- Value *X, *Y;
- if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y))))
- return hash_combine(Inst->getOpcode(), Cond, A, B);
-
- // Similar to cmp normalization (above) - canonicalize the predicate value:
- // select (icmp Pred, X, Y), A, B --> select (icmp InvPred, X, Y), B, A
- if (CmpInst::getInversePredicate(Pred) < Pred) {
- Pred = CmpInst::getInversePredicate(Pred);
- std::swap(A, B);
- }
- return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B);
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(Inst))
- return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
-
- if (FreezeInst *FI = dyn_cast<FreezeInst>(Inst))
- return hash_combine(FI->getOpcode(), FI->getOperand(0));
-
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
- return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
- hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
-
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
- return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
- IVI->getOperand(1),
- hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
-
- assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
- isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
- isa<FreezeInst>(Inst)) &&
- "Invalid/unknown instruction");
-
+ // Min/max may also have non-canonical compare predicate (eg, the compare for
+ // smin may use 'sgt' rather than 'slt'), and non-canonical operands in the
+ // compare.
+ // TODO: We should also detect FP min/max.
+ if (SPF == SPF_SMIN || SPF == SPF_SMAX ||
+ SPF == SPF_UMIN || SPF == SPF_UMAX) {
+ if (A > B)
+ std::swap(A, B);
+ return hash_combine(Inst->getOpcode(), SPF, A, B);
+ }
+
+ // Hash general selects to allow matching commuted true/false operands.
+
+ // If we do not have a compare as the condition, just hash in the condition.
+ CmpInst::Predicate Pred;
+ Value *X, *Y;
+ if (!match(Cond, m_Cmp(Pred, m_Value(X), m_Value(Y))))
+ return hash_combine(Inst->getOpcode(), Cond, A, B);
+
+ // Similar to cmp normalization (above) - canonicalize the predicate value:
+ // select (icmp Pred, X, Y), A, B --> select (icmp InvPred, X, Y), B, A
+ if (CmpInst::getInversePredicate(Pred) < Pred) {
+ Pred = CmpInst::getInversePredicate(Pred);
+ std::swap(A, B);
+ }
+ return hash_combine(Inst->getOpcode(), Pred, X, Y, A, B);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(Inst))
+ return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(Inst))
+ return hash_combine(FI->getOpcode(), FI->getOperand(0));
+
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
+ return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
+ hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
+
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(Inst))
+ return hash_combine(IVI->getOpcode(), IVI->getOperand(0),
+ IVI->getOperand(1),
+ hash_combine_range(IVI->idx_begin(), IVI->idx_end()));
+
+ assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
+ isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
+ isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<FreezeInst>(Inst)) &&
+ "Invalid/unknown instruction");
+
// Handle intrinsics with commutative operands.
// TODO: Extend this to handle intrinsics with >2 operands where the 1st
// 2 operands are commutative.
@@ -280,58 +280,58 @@ static unsigned getHashValueImpl(SimpleValue Val) {
return hash_combine(II->getOpcode(), LHS, RHS);
}
- // Mix in the opcode.
- return hash_combine(
- Inst->getOpcode(),
- hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
-}
-
-unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
-#ifndef NDEBUG
- // If -earlycse-debug-hash was specified, return a constant -- this
- // will force all hashing to collide, so we'll exhaustively search
- // the table for a match, and the assertion in isEqual will fire if
- // there's a bug causing equal keys to hash differently.
- if (EarlyCSEDebugHash)
- return 0;
-#endif
- return getHashValueImpl(Val);
-}
-
-static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
- Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
-
- if (LHS.isSentinel() || RHS.isSentinel())
- return LHSI == RHSI;
-
- if (LHSI->getOpcode() != RHSI->getOpcode())
- return false;
- if (LHSI->isIdenticalToWhenDefined(RHSI))
- return true;
-
- // If we're not strictly identical, we still might be a commutable instruction
- if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
- if (!LHSBinOp->isCommutative())
- return false;
-
- assert(isa<BinaryOperator>(RHSI) &&
- "same opcode, but different instruction type?");
- BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
-
- // Commuted equality
- return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
- LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
- }
- if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
- assert(isa<CmpInst>(RHSI) &&
- "same opcode, but different instruction type?");
- CmpInst *RHSCmp = cast<CmpInst>(RHSI);
- // Commuted equality
- return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
- LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
- LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
- }
-
+ // Mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+unsigned DenseMapInfo<SimpleValue>::getHashValue(SimpleValue Val) {
+#ifndef NDEBUG
+ // If -earlycse-debug-hash was specified, return a constant -- this
+ // will force all hashing to collide, so we'll exhaustively search
+ // the table for a match, and the assertion in isEqual will fire if
+ // there's a bug causing equal keys to hash differently.
+ if (EarlyCSEDebugHash)
+ return 0;
+#endif
+ return getHashValueImpl(Val);
+}
+
+static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ if (LHSI->getOpcode() != RHSI->getOpcode())
+ return false;
+ if (LHSI->isIdenticalToWhenDefined(RHSI))
+ return true;
+
+ // If we're not strictly identical, we still might be a commutable instruction
+ if (BinaryOperator *LHSBinOp = dyn_cast<BinaryOperator>(LHSI)) {
+ if (!LHSBinOp->isCommutative())
+ return false;
+
+ assert(isa<BinaryOperator>(RHSI) &&
+ "same opcode, but different instruction type?");
+ BinaryOperator *RHSBinOp = cast<BinaryOperator>(RHSI);
+
+ // Commuted equality
+ return LHSBinOp->getOperand(0) == RHSBinOp->getOperand(1) &&
+ LHSBinOp->getOperand(1) == RHSBinOp->getOperand(0);
+ }
+ if (CmpInst *LHSCmp = dyn_cast<CmpInst>(LHSI)) {
+ assert(isa<CmpInst>(RHSI) &&
+ "same opcode, but different instruction type?");
+ CmpInst *RHSCmp = cast<CmpInst>(RHSI);
+ // Commuted equality
+ return LHSCmp->getOperand(0) == RHSCmp->getOperand(1) &&
+ LHSCmp->getOperand(1) == RHSCmp->getOperand(0) &&
+ LHSCmp->getSwappedPredicate() == RHSCmp->getPredicate();
+ }
+
// TODO: Extend this for >2 args by matching the trailing N-2 args.
auto *LII = dyn_cast<IntrinsicInst>(LHSI);
auto *RII = dyn_cast<IntrinsicInst>(RHSI);
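
The commuted-compare handling above is easiest to see on an example. The helper below is an illustrative sketch (not from the patch) that mirrors the cmp branch of getHashValueImpl: it picks one canonical orientation so that, e.g., %c1 = icmp slt i32 %a, %b and %c2 = icmp sgt i32 %b, %a produce the same hash key, while isEqualImpl then confirms the match via getSwappedPredicate().

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instructions.h"
#include <tuple>
#include <utility>

// Canonicalize (operands, predicate) so commuted compares hash alike.
static llvm::hash_code hashCanonicalCmp(llvm::CmpInst *CI) {
  llvm::Value *LHS = CI->getOperand(0);
  llvm::Value *RHS = CI->getOperand(1);
  llvm::CmpInst::Predicate Pred = CI->getPredicate();
  llvm::CmpInst::Predicate Swapped = CI->getSwappedPredicate();
  // Prefer the orientation with the smaller (operand, predicate) tuple so
  // both commuted forms of the same compare pick one representative.
  if (std::tie(LHS, Pred) > std::tie(RHS, Swapped)) {
    std::swap(LHS, RHS);
    Pred = Swapped;
  }
  return llvm::hash_combine(CI->getOpcode(), Pred, LHS, RHS);
}
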
@@ -342,326 +342,326 @@ static bool isEqualImpl(SimpleValue LHS, SimpleValue RHS) {
}
// Min/max can occur with commuted operands, non-canonical predicates,
- // and/or non-canonical operands.
- // Selects can be non-trivially equivalent via inverted conditions and swaps.
- SelectPatternFlavor LSPF, RSPF;
- Value *CondL, *CondR, *LHSA, *RHSA, *LHSB, *RHSB;
- if (matchSelectWithOptionalNotCond(LHSI, CondL, LHSA, LHSB, LSPF) &&
- matchSelectWithOptionalNotCond(RHSI, CondR, RHSA, RHSB, RSPF)) {
- if (LSPF == RSPF) {
- // TODO: We should also detect FP min/max.
- if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
- LSPF == SPF_UMIN || LSPF == SPF_UMAX)
- return ((LHSA == RHSA && LHSB == RHSB) ||
- (LHSA == RHSB && LHSB == RHSA));
-
- // select Cond, A, B <--> select not(Cond), B, A
- if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
- return true;
- }
-
- // If the true/false operands are swapped and the conditions are compares
- // with inverted predicates, the selects are equal:
- // select (icmp Pred, X, Y), A, B <--> select (icmp InvPred, X, Y), B, A
- //
- // This also handles patterns with a double-negation in the sense of not +
- // inverse, because we looked through a 'not' in the matching function and
- // swapped A/B:
- // select (cmp Pred, X, Y), A, B <--> select (not (cmp InvPred, X, Y)), B, A
- //
- // This intentionally does NOT handle patterns with a double-negation in
- // the sense of not + not, because doing so could result in values
- // comparing
+ // and/or non-canonical operands.
+ // Selects can be non-trivially equivalent via inverted conditions and swaps.
+ SelectPatternFlavor LSPF, RSPF;
+ Value *CondL, *CondR, *LHSA, *RHSA, *LHSB, *RHSB;
+ if (matchSelectWithOptionalNotCond(LHSI, CondL, LHSA, LHSB, LSPF) &&
+ matchSelectWithOptionalNotCond(RHSI, CondR, RHSA, RHSB, RSPF)) {
+ if (LSPF == RSPF) {
+ // TODO: We should also detect FP min/max.
+ if (LSPF == SPF_SMIN || LSPF == SPF_SMAX ||
+ LSPF == SPF_UMIN || LSPF == SPF_UMAX)
+ return ((LHSA == RHSA && LHSB == RHSB) ||
+ (LHSA == RHSB && LHSB == RHSA));
+
+ // select Cond, A, B <--> select not(Cond), B, A
+ if (CondL == CondR && LHSA == RHSA && LHSB == RHSB)
+ return true;
+ }
+
+ // If the true/false operands are swapped and the conditions are compares
+ // with inverted predicates, the selects are equal:
+ // select (icmp Pred, X, Y), A, B <--> select (icmp InvPred, X, Y), B, A
+ //
+ // This also handles patterns with a double-negation in the sense of not +
+ // inverse, because we looked through a 'not' in the matching function and
+ // swapped A/B:
+ // select (cmp Pred, X, Y), A, B <--> select (not (cmp InvPred, X, Y)), B, A
+ //
+ // This intentionally does NOT handle patterns with a double-negation in
+ // the sense of not + not, because doing so could result in values
+ // comparing
// as equal that hash differently in the min/max cases like:
- // select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
- // ^ hashes as min ^ would not hash as min
- // In the context of the EarlyCSE pass, however, such cases never reach
- // this code, as we simplify the double-negation before hashing the second
- // select (and so still succeed at CSEing them).
- if (LHSA == RHSB && LHSB == RHSA) {
- CmpInst::Predicate PredL, PredR;
- Value *X, *Y;
- if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) &&
- match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) &&
- CmpInst::getInversePredicate(PredL) == PredR)
- return true;
- }
- }
-
- return false;
-}
-
-bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
- // These comparisons are nontrivial, so assert that equality implies
- // hash equality (DenseMap demands this as an invariant).
- bool Result = isEqualImpl(LHS, RHS);
- assert(!Result || (LHS.isSentinel() && LHS.Inst == RHS.Inst) ||
- getHashValueImpl(LHS) == getHashValueImpl(RHS));
- return Result;
-}
-
-//===----------------------------------------------------------------------===//
-// CallValue
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// Struct representing the available call values in the scoped hash
-/// table.
-struct CallValue {
- Instruction *Inst;
-
- CallValue(Instruction *I) : Inst(I) {
- assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
- }
-
- bool isSentinel() const {
- return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
- Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static bool canHandle(Instruction *Inst) {
- // Don't value number anything that returns void.
- if (Inst->getType()->isVoidTy())
- return false;
-
- CallInst *CI = dyn_cast<CallInst>(Inst);
- if (!CI || !CI->onlyReadsMemory())
- return false;
- return true;
- }
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-template <> struct DenseMapInfo<CallValue> {
- static inline CallValue getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline CallValue getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(CallValue Val);
- static bool isEqual(CallValue LHS, CallValue RHS);
-};
-
-} // end namespace llvm
-
-unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
- Instruction *Inst = Val.Inst;
-
- // gc.relocate is 'special' call: its second and third operands are
- // not real values, but indices into statepoint's argument list.
- // Get values they point to.
- if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Inst))
- return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
- GCR->getBasePtr(), GCR->getDerivedPtr());
-
- // Hash all of the operands as pointers and mix in the opcode.
- return hash_combine(
- Inst->getOpcode(),
- hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
-}
-
-bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
- Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
- if (LHS.isSentinel() || RHS.isSentinel())
- return LHSI == RHSI;
-
- // See comment above in `getHashValue()`.
- if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(LHSI))
- if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(RHSI))
- return GCR1->getOperand(0) == GCR2->getOperand(0) &&
- GCR1->getBasePtr() == GCR2->getBasePtr() &&
- GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
-
- return LHSI->isIdenticalTo(RHSI);
-}
-
-//===----------------------------------------------------------------------===//
-// EarlyCSE implementation
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-/// A simple and fast domtree-based CSE pass.
-///
-/// This pass does a simple depth-first walk over the dominator tree,
-/// eliminating trivially redundant instructions and using instsimplify to
-/// canonicalize things as it goes. It is intended to be fast and catch obvious
-/// cases so that instcombine and other passes are more effective. It is
-/// expected that a later pass of GVN will catch the interesting/hard cases.
-class EarlyCSE {
-public:
- const TargetLibraryInfo &TLI;
- const TargetTransformInfo &TTI;
- DominatorTree &DT;
- AssumptionCache &AC;
- const SimplifyQuery SQ;
- MemorySSA *MSSA;
- std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
-
- using AllocatorTy =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<SimpleValue, Value *>>;
- using ScopedHTType =
- ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
- AllocatorTy>;
-
- /// A scoped hash table of the current values of all of our simple
- /// scalar expressions.
- ///
- /// As we walk down the domtree, we look to see if instructions are in this:
- /// if so, we replace them with what we find, otherwise we insert them so
- /// that dominated values can succeed in their lookup.
- ScopedHTType AvailableValues;
-
- /// A scoped hash table of the current values of previously encountered
- /// memory locations.
- ///
- /// This allows us to get efficient access to dominating loads or stores when
- /// we have a fully redundant load. In addition to the most recent load, we
- /// keep track of a generation count of the read, which is compared against
- /// the current generation count. The current generation count is incremented
- /// after every possibly writing memory operation, which ensures that we only
- /// CSE loads with other loads that have no intervening store. Ordering
- /// events (such as fences or atomic instructions) increment the generation
- /// count as well; essentially, we model these as writes to all possible
- /// locations. Note that atomic and/or volatile loads and stores can be
- /// present the table; it is the responsibility of the consumer to inspect
- /// the atomicity/volatility if needed.
- struct LoadValue {
- Instruction *DefInst = nullptr;
- unsigned Generation = 0;
- int MatchingId = -1;
- bool IsAtomic = false;
-
- LoadValue() = default;
- LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
- bool IsAtomic)
- : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
- IsAtomic(IsAtomic) {}
- };
-
- using LoadMapAllocator =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<Value *, LoadValue>>;
- using LoadHTType =
- ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
- LoadMapAllocator>;
-
- LoadHTType AvailableLoads;
-
- // A scoped hash table mapping memory locations (represented as typed
- // addresses) to generation numbers at which that memory location became
- // (henceforth indefinitely) invariant.
- using InvariantMapAllocator =
- RecyclingAllocator<BumpPtrAllocator,
- ScopedHashTableVal<MemoryLocation, unsigned>>;
- using InvariantHTType =
- ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
- InvariantMapAllocator>;
- InvariantHTType AvailableInvariants;
-
- /// A scoped hash table of the current values of read-only call
- /// values.
- ///
- /// It uses the same generation count as loads.
- using CallHTType =
- ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
- CallHTType AvailableCalls;
-
- /// This is the current generation of the memory value.
- unsigned CurrentGeneration = 0;
-
- /// Set up the EarlyCSE runner for a particular function.
- EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
- const TargetTransformInfo &TTI, DominatorTree &DT,
- AssumptionCache &AC, MemorySSA *MSSA)
- : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
- MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
-
- bool run();
-
-private:
- unsigned ClobberCounter = 0;
- // Almost a POD, but needs to call the constructors for the scoped hash
- // tables so that a new scope gets pushed on. These are RAII so that the
- // scope gets popped when the NodeScope is destroyed.
- class NodeScope {
- public:
- NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
- : Scope(AvailableValues), LoadScope(AvailableLoads),
- InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
- NodeScope(const NodeScope &) = delete;
- NodeScope &operator=(const NodeScope &) = delete;
-
- private:
- ScopedHTType::ScopeTy Scope;
- LoadHTType::ScopeTy LoadScope;
- InvariantHTType::ScopeTy InvariantScope;
- CallHTType::ScopeTy CallScope;
- };
-
- // Contains all the needed information to create a stack for doing a depth
- // first traversal of the tree. This includes scopes for values, loads, and
- // calls as well as the generation. There is a child iterator so that the
- // children do not need to be store separately.
- class StackNode {
- public:
- StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
- InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
- unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
- DomTreeNode::const_iterator end)
- : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
- EndIter(end),
- Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls)
- {}
- StackNode(const StackNode &) = delete;
- StackNode &operator=(const StackNode &) = delete;
-
- // Accessors.
+ // select (cmp slt, X, Y), X, Y <--> select (not (not (cmp slt, X, Y))), X, Y
+ // ^ hashes as min ^ would not hash as min
+ // In the context of the EarlyCSE pass, however, such cases never reach
+ // this code, as we simplify the double-negation before hashing the second
+ // select (and so still succeed at CSEing them).
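+  //
+  // Concrete instance of the swapped-operand / inverted-predicate case
+  // (illustrative; not part of the upstream comment):
+  //   select (icmp slt i32 %x, %y), i32 %a, i32 %b
+  //     <--> select (icmp sge i32 %x, %y), i32 %b, i32 %a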
+ if (LHSA == RHSB && LHSB == RHSA) {
+ CmpInst::Predicate PredL, PredR;
+ Value *X, *Y;
+ if (match(CondL, m_Cmp(PredL, m_Value(X), m_Value(Y))) &&
+ match(CondR, m_Cmp(PredR, m_Specific(X), m_Specific(Y))) &&
+ CmpInst::getInversePredicate(PredL) == PredR)
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool DenseMapInfo<SimpleValue>::isEqual(SimpleValue LHS, SimpleValue RHS) {
+ // These comparisons are nontrivial, so assert that equality implies
+ // hash equality (DenseMap demands this as an invariant).
+ bool Result = isEqualImpl(LHS, RHS);
+ assert(!Result || (LHS.isSentinel() && LHS.Inst == RHS.Inst) ||
+ getHashValueImpl(LHS) == getHashValueImpl(RHS));
+ return Result;
+}
+
+//===----------------------------------------------------------------------===//
+// CallValue
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// Struct representing the available call values in the scoped hash
+/// table.
+struct CallValue {
+ Instruction *Inst;
+
+ CallValue(Instruction *I) : Inst(I) {
+ assert((isSentinel() || canHandle(I)) && "Inst can't be handled!");
+ }
+
+ bool isSentinel() const {
+ return Inst == DenseMapInfo<Instruction *>::getEmptyKey() ||
+ Inst == DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static bool canHandle(Instruction *Inst) {
+ // Don't value number anything that returns void.
+ if (Inst->getType()->isVoidTy())
+ return false;
+
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI || !CI->onlyReadsMemory())
+ return false;
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <> struct DenseMapInfo<CallValue> {
+ static inline CallValue getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline CallValue getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(CallValue Val);
+ static bool isEqual(CallValue LHS, CallValue RHS);
+};
+
+} // end namespace llvm
+
+unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
+ Instruction *Inst = Val.Inst;
+
+  // gc.relocate is a 'special' call: its second and third operands are
+  // not real values, but indices into the statepoint's argument list.
+  // Get the values they point to.
+ if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Inst))
+ return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
+ GCR->getBasePtr(), GCR->getDerivedPtr());
+
+ // Hash all of the operands as pointers and mix in the opcode.
+ return hash_combine(
+ Inst->getOpcode(),
+ hash_combine_range(Inst->value_op_begin(), Inst->value_op_end()));
+}
+
+bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
+ Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
+ if (LHS.isSentinel() || RHS.isSentinel())
+ return LHSI == RHSI;
+
+ // See comment above in `getHashValue()`.
+ if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(LHSI))
+ if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(RHSI))
+ return GCR1->getOperand(0) == GCR2->getOperand(0) &&
+ GCR1->getBasePtr() == GCR2->getBasePtr() &&
+ GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
+
+ return LHSI->isIdenticalTo(RHSI);
+}
+
+//===----------------------------------------------------------------------===//
+// EarlyCSE implementation
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+class EarlyCSE {
+public:
+ const TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+ const SimplifyQuery SQ;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+
+ using AllocatorTy =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<SimpleValue, Value *>>;
+ using ScopedHTType =
+ ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
+ AllocatorTy>;
+
+ /// A scoped hash table of the current values of all of our simple
+ /// scalar expressions.
+ ///
+ /// As we walk down the domtree, we look to see if instructions are in this:
+ /// if so, we replace them with what we find, otherwise we insert them so
+ /// that dominated values can succeed in their lookup.
+ ScopedHTType AvailableValues;
+
+ /// A scoped hash table of the current values of previously encountered
+ /// memory locations.
+ ///
+ /// This allows us to get efficient access to dominating loads or stores when
+ /// we have a fully redundant load. In addition to the most recent load, we
+ /// keep track of a generation count of the read, which is compared against
+ /// the current generation count. The current generation count is incremented
+ /// after every possibly writing memory operation, which ensures that we only
+ /// CSE loads with other loads that have no intervening store. Ordering
+ /// events (such as fences or atomic instructions) increment the generation
+ /// count as well; essentially, we model these as writes to all possible
+ /// locations. Note that atomic and/or volatile loads and stores can be
+  /// present in the table; it is the responsibility of the consumer to inspect
+ /// the atomicity/volatility if needed.
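+  ///
+  /// A minimal sketch of the generation scheme (illustrative; the values and
+  /// names are made up):
+  ///   %a = load i32, i32* %p   ; recorded at generation G
+  ///   %b = load i32, i32* %p   ; still generation G -> CSE'd to %a
+  ///   store i32 0, i32* %q     ; possibly-writing op, generation becomes G+1
+  ///   %c = load i32, i32* %p   ; generation mismatch -> the simple check
+  ///                            ; alone no longer allows CSE (MemorySSA may)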
+ struct LoadValue {
+ Instruction *DefInst = nullptr;
+ unsigned Generation = 0;
+ int MatchingId = -1;
+ bool IsAtomic = false;
+
+ LoadValue() = default;
+ LoadValue(Instruction *Inst, unsigned Generation, unsigned MatchingId,
+ bool IsAtomic)
+ : DefInst(Inst), Generation(Generation), MatchingId(MatchingId),
+ IsAtomic(IsAtomic) {}
+ };
+
+ using LoadMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<Value *, LoadValue>>;
+ using LoadHTType =
+ ScopedHashTable<Value *, LoadValue, DenseMapInfo<Value *>,
+ LoadMapAllocator>;
+
+ LoadHTType AvailableLoads;
+
+ // A scoped hash table mapping memory locations (represented as typed
+ // addresses) to generation numbers at which that memory location became
+ // (henceforth indefinitely) invariant.
+ using InvariantMapAllocator =
+ RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MemoryLocation, unsigned>>;
+ using InvariantHTType =
+ ScopedHashTable<MemoryLocation, unsigned, DenseMapInfo<MemoryLocation>,
+ InvariantMapAllocator>;
+ InvariantHTType AvailableInvariants;
+
+ /// A scoped hash table of the current values of read-only call
+ /// values.
+ ///
+ /// It uses the same generation count as loads.
+ using CallHTType =
+ ScopedHashTable<CallValue, std::pair<Instruction *, unsigned>>;
+ CallHTType AvailableCalls;
+
+ /// This is the current generation of the memory value.
+ unsigned CurrentGeneration = 0;
+
+ /// Set up the EarlyCSE runner for a particular function.
+ EarlyCSE(const DataLayout &DL, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, DominatorTree &DT,
+ AssumptionCache &AC, MemorySSA *MSSA)
+ : TLI(TLI), TTI(TTI), DT(DT), AC(AC), SQ(DL, &TLI, &DT, &AC), MSSA(MSSA),
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
+
+ bool run();
+
+private:
+ unsigned ClobberCounter = 0;
+ // Almost a POD, but needs to call the constructors for the scoped hash
+ // tables so that a new scope gets pushed on. These are RAII so that the
+ // scope gets popped when the NodeScope is destroyed.
+ class NodeScope {
+ public:
+ NodeScope(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls)
+ : Scope(AvailableValues), LoadScope(AvailableLoads),
+ InvariantScope(AvailableInvariants), CallScope(AvailableCalls) {}
+ NodeScope(const NodeScope &) = delete;
+ NodeScope &operator=(const NodeScope &) = delete;
+
+ private:
+ ScopedHTType::ScopeTy Scope;
+ LoadHTType::ScopeTy LoadScope;
+ InvariantHTType::ScopeTy InvariantScope;
+ CallHTType::ScopeTy CallScope;
+ };
+
+ // Contains all the needed information to create a stack for doing a depth
+ // first traversal of the tree. This includes scopes for values, loads, and
+ // calls as well as the generation. There is a child iterator so that the
+  // children do not need to be stored separately.
+ class StackNode {
+ public:
+ StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
+ InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
+ unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+ DomTreeNode::const_iterator end)
+ : CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
+ EndIter(end),
+ Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls)
+ {}
+ StackNode(const StackNode &) = delete;
+ StackNode &operator=(const StackNode &) = delete;
+
+ // Accessors.
unsigned currentGeneration() const { return CurrentGeneration; }
unsigned childGeneration() const { return ChildGeneration; }
- void childGeneration(unsigned generation) { ChildGeneration = generation; }
- DomTreeNode *node() { return Node; }
+ void childGeneration(unsigned generation) { ChildGeneration = generation; }
+ DomTreeNode *node() { return Node; }
DomTreeNode::const_iterator childIter() const { return ChildIter; }
-
- DomTreeNode *nextChild() {
- DomTreeNode *child = *ChildIter;
- ++ChildIter;
- return child;
- }
-
+
+ DomTreeNode *nextChild() {
+ DomTreeNode *child = *ChildIter;
+ ++ChildIter;
+ return child;
+ }
+
DomTreeNode::const_iterator end() const { return EndIter; }
bool isProcessed() const { return Processed; }
- void process() { Processed = true; }
-
- private:
- unsigned CurrentGeneration;
- unsigned ChildGeneration;
- DomTreeNode *Node;
- DomTreeNode::const_iterator ChildIter;
- DomTreeNode::const_iterator EndIter;
- NodeScope Scopes;
- bool Processed = false;
- };
-
- /// Wrapper class to handle memory instructions, including loads,
- /// stores and intrinsic loads and stores defined by the target.
- class ParseMemoryInst {
- public:
- ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
- : Inst(Inst) {
+ void process() { Processed = true; }
+
+ private:
+ unsigned CurrentGeneration;
+ unsigned ChildGeneration;
+ DomTreeNode *Node;
+ DomTreeNode::const_iterator ChildIter;
+ DomTreeNode::const_iterator EndIter;
+ NodeScope Scopes;
+ bool Processed = false;
+ };
+
+ /// Wrapper class to handle memory instructions, including loads,
+ /// stores and intrinsic loads and stores defined by the target.
+ class ParseMemoryInst {
+ public:
+ ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
+ : Inst(Inst) {
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
IntrID = II->getIntrinsicID();
- if (TTI.getTgtMemIntrinsic(II, Info))
+ if (TTI.getTgtMemIntrinsic(II, Info))
return;
if (isHandledNonTargetIntrinsic(IntrID)) {
switch (IntrID) {
@@ -688,97 +688,97 @@ private:
}
}
}
- }
-
+ }
+
Instruction *get() { return Inst; }
const Instruction *get() const { return Inst; }
- bool isLoad() const {
+ bool isLoad() const {
if (IntrID != 0)
return Info.ReadMem;
- return isa<LoadInst>(Inst);
- }
-
- bool isStore() const {
+ return isa<LoadInst>(Inst);
+ }
+
+ bool isStore() const {
if (IntrID != 0)
return Info.WriteMem;
- return isa<StoreInst>(Inst);
- }
-
- bool isAtomic() const {
+ return isa<StoreInst>(Inst);
+ }
+
+ bool isAtomic() const {
if (IntrID != 0)
- return Info.Ordering != AtomicOrdering::NotAtomic;
- return Inst->isAtomic();
- }
-
- bool isUnordered() const {
+ return Info.Ordering != AtomicOrdering::NotAtomic;
+ return Inst->isAtomic();
+ }
+
+ bool isUnordered() const {
if (IntrID != 0)
- return Info.isUnordered();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->isUnordered();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->isUnordered();
- }
- // Conservative answer
- return !Inst->isAtomic();
- }
-
- bool isVolatile() const {
+ return Info.isUnordered();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isUnordered();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isUnordered();
+ }
+ // Conservative answer
+ return !Inst->isAtomic();
+ }
+
+ bool isVolatile() const {
if (IntrID != 0)
- return Info.IsVolatile;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- return LI->isVolatile();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- return SI->isVolatile();
- }
- // Conservative answer
- return true;
- }
-
- bool isInvariantLoad() const {
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- return false;
- }
-
- bool isValid() const { return getPointerOperand() != nullptr; }
-
- // For regular (non-intrinsic) loads/stores, this is set to -1. For
- // intrinsic loads/stores, the id is retrieved from the corresponding
- // field in the MemIntrinsicInfo structure. That field contains
- // non-negative values only.
- int getMatchingId() const {
+ return Info.IsVolatile;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ return LI->isVolatile();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ return SI->isVolatile();
+ }
+ // Conservative answer
+ return true;
+ }
+
+ bool isInvariantLoad() const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
+ return false;
+ }
+
+ bool isValid() const { return getPointerOperand() != nullptr; }
+
+ // For regular (non-intrinsic) loads/stores, this is set to -1. For
+ // intrinsic loads/stores, the id is retrieved from the corresponding
+ // field in the MemIntrinsicInfo structure. That field contains
+ // non-negative values only.
+ int getMatchingId() const {
if (IntrID != 0)
return Info.MatchingId;
- return -1;
- }
-
- Value *getPointerOperand() const {
+ return -1;
+ }
+
+ Value *getPointerOperand() const {
if (IntrID != 0)
return Info.PtrVal;
- return getLoadStorePointerOperand(Inst);
- }
-
- bool mayReadFromMemory() const {
+ return getLoadStorePointerOperand(Inst);
+ }
+
+ bool mayReadFromMemory() const {
if (IntrID != 0)
return Info.ReadMem;
- return Inst->mayReadFromMemory();
- }
-
- bool mayWriteToMemory() const {
+ return Inst->mayReadFromMemory();
+ }
+
+ bool mayWriteToMemory() const {
if (IntrID != 0)
return Info.WriteMem;
- return Inst->mayWriteToMemory();
- }
-
- private:
+ return Inst->mayWriteToMemory();
+ }
+
+ private:
Intrinsic::ID IntrID = 0;
- MemIntrinsicInfo Info;
- Instruction *Inst;
- };
-
+ MemIntrinsicInfo Info;
+ Instruction *Inst;
+ };
+
// This function is to prevent accidentally passing a non-target
// intrinsic ID to TargetTransformInfo.
static bool isHandledNonTargetIntrinsic(Intrinsic::ID ID) {
@@ -795,29 +795,29 @@ private:
return false;
}
- bool processNode(DomTreeNode *Node);
-
- bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
- const BasicBlock *BB, const BasicBlock *Pred);
-
+ bool processNode(DomTreeNode *Node);
+
+ bool handleBranchCondition(Instruction *CondInst, const BranchInst *BI,
+ const BasicBlock *BB, const BasicBlock *Pred);
+
Value *getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
unsigned CurrentGeneration);
bool overridingStores(const ParseMemoryInst &Earlier,
const ParseMemoryInst &Later);
- Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return LI;
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return SI->getValueOperand();
- assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
+ Value *getOrCreateResult(Value *Inst, Type *ExpectedType) const {
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return LI;
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return SI->getValueOperand();
+ assert(isa<IntrinsicInst>(Inst) && "Instruction not supported");
auto *II = cast<IntrinsicInst>(Inst);
if (isHandledNonTargetIntrinsic(II->getIntrinsicID()))
return getOrCreateResultNonTargetMemIntrinsic(II, ExpectedType);
return TTI.getOrCreateResultFromMemIntrinsic(II, ExpectedType);
- }
-
+ }
+
Value *getOrCreateResultNonTargetMemIntrinsic(IntrinsicInst *II,
Type *ExpectedType) const {
switch (II->getIntrinsicID()) {
@@ -829,13 +829,13 @@ private:
return nullptr;
}
- /// Return true if the instruction is known to only operate on memory
- /// provably invariant in the given "generation".
- bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
-
- bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
- Instruction *EarlierInst, Instruction *LaterInst);
-
+ /// Return true if the instruction is known to only operate on memory
+ /// provably invariant in the given "generation".
+ bool isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt);
+
+ bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
+ Instruction *EarlierInst, Instruction *LaterInst);
+
bool isNonTargetIntrinsicMatch(const IntrinsicInst *Earlier,
const IntrinsicInst *Later) {
auto IsSubmask = [](const Value *Mask0, const Value *Mask1) {
@@ -931,108 +931,108 @@ private:
return false;
}
- void removeMSSA(Instruction &Inst) {
- if (!MSSA)
- return;
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- // Removing a store here can leave MemorySSA in an unoptimized state by
- // creating MemoryPhis that have identical arguments and by creating
- // MemoryUses whose defining access is not an actual clobber. The phi case
- // is handled by MemorySSA when passing OptimizePhis = true to
- // removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
- // by MemorySSA's getClobberingMemoryAccess.
- MSSAUpdater->removeMemoryAccess(&Inst, true);
- }
-};
-
-} // end anonymous namespace
-
-/// Determine if the memory referenced by LaterInst is from the same heap
-/// version as EarlierInst.
-/// This is currently called in two scenarios:
-///
-/// load p
-/// ...
-/// load p
-///
-/// and
-///
-/// x = load p
-/// ...
-/// store x, p
-///
-/// in both cases we want to verify that there are no possible writes to the
-/// memory referenced by p between the earlier and later instruction.
-bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
- unsigned LaterGeneration,
- Instruction *EarlierInst,
- Instruction *LaterInst) {
- // Check the simple memory generation tracking first.
- if (EarlierGeneration == LaterGeneration)
- return true;
-
- if (!MSSA)
- return false;
-
- // If MemorySSA has determined that one of EarlierInst or LaterInst does not
- // read/write memory, then we can safely return true here.
- // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
- // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
- // by also checking the MemorySSA MemoryAccess on the instruction. Initial
- // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
- // with the default optimization pipeline.
- auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
- if (!EarlierMA)
- return true;
- auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
- if (!LaterMA)
- return true;
-
- // Since we know LaterDef dominates LaterInst and EarlierInst dominates
- // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
- // EarlierInst and LaterInst and neither can any other write that potentially
- // clobbers LaterInst.
- MemoryAccess *LaterDef;
- if (ClobberCounter < EarlyCSEMssaOptCap) {
- LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
- ClobberCounter++;
- } else
- LaterDef = LaterMA->getDefiningAccess();
-
- return MSSA->dominates(LaterDef, EarlierMA);
-}
-
-bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
- // A location loaded from with an invariant_load is assumed to *never* change
- // within the visible scope of the compilation.
- if (auto *LI = dyn_cast<LoadInst>(I))
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
-
- auto MemLocOpt = MemoryLocation::getOrNone(I);
- if (!MemLocOpt)
- // "target" intrinsic forms of loads aren't currently known to
- // MemoryLocation::get. TODO
- return false;
- MemoryLocation MemLoc = *MemLocOpt;
- if (!AvailableInvariants.count(MemLoc))
- return false;
-
- // Is the generation at which this became invariant older than the
- // current one?
- return AvailableInvariants.lookup(MemLoc) <= GenAt;
-}
-
-bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
- const BranchInst *BI, const BasicBlock *BB,
- const BasicBlock *Pred) {
- assert(BI->isConditional() && "Should be a conditional branch!");
- assert(BI->getCondition() == CondInst && "Wrong condition?");
- assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
- auto *TorF = (BI->getSuccessor(0) == BB)
- ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
+ void removeMSSA(Instruction &Inst) {
+ if (!MSSA)
+ return;
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ // Removing a store here can leave MemorySSA in an unoptimized state by
+ // creating MemoryPhis that have identical arguments and by creating
+ // MemoryUses whose defining access is not an actual clobber. The phi case
+ // is handled by MemorySSA when passing OptimizePhis = true to
+ // removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
+ // by MemorySSA's getClobberingMemoryAccess.
+ MSSAUpdater->removeMemoryAccess(&Inst, true);
+ }
+};
+
+} // end anonymous namespace
+
+/// Determine if the memory referenced by LaterInst is from the same heap
+/// version as EarlierInst.
+/// This is currently called in two scenarios:
+///
+/// load p
+/// ...
+/// load p
+///
+/// and
+///
+/// x = load p
+/// ...
+/// store x, p
+///
+/// in both cases we want to verify that there are no possible writes to the
+/// memory referenced by p between the earlier and later instruction.
+bool EarlyCSE::isSameMemGeneration(unsigned EarlierGeneration,
+ unsigned LaterGeneration,
+ Instruction *EarlierInst,
+ Instruction *LaterInst) {
+ // Check the simple memory generation tracking first.
+ if (EarlierGeneration == LaterGeneration)
+ return true;
+
+ if (!MSSA)
+ return false;
+
+ // If MemorySSA has determined that one of EarlierInst or LaterInst does not
+ // read/write memory, then we can safely return true here.
+ // FIXME: We could be more aggressive when checking doesNotAccessMemory(),
+ // onlyReadsMemory(), mayReadFromMemory(), and mayWriteToMemory() in this pass
+ // by also checking the MemorySSA MemoryAccess on the instruction. Initial
+ // experiments suggest this isn't worthwhile, at least for C/C++ code compiled
+ // with the default optimization pipeline.
+ auto *EarlierMA = MSSA->getMemoryAccess(EarlierInst);
+ if (!EarlierMA)
+ return true;
+ auto *LaterMA = MSSA->getMemoryAccess(LaterInst);
+ if (!LaterMA)
+ return true;
+
+ // Since we know LaterDef dominates LaterInst and EarlierInst dominates
+ // LaterInst, if LaterDef dominates EarlierInst then it can't occur between
+ // EarlierInst and LaterInst and neither can any other write that potentially
+ // clobbers LaterInst.
+ MemoryAccess *LaterDef;
+ if (ClobberCounter < EarlyCSEMssaOptCap) {
+ LaterDef = MSSA->getWalker()->getClobberingMemoryAccess(LaterInst);
+ ClobberCounter++;
+ } else
+ LaterDef = LaterMA->getDefiningAccess();
+
+ return MSSA->dominates(LaterDef, EarlierMA);
+}
+
+bool EarlyCSE::isOperatingOnInvariantMemAt(Instruction *I, unsigned GenAt) {
+ // A location loaded from with an invariant_load is assumed to *never* change
+ // within the visible scope of the compilation.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ auto MemLocOpt = MemoryLocation::getOrNone(I);
+ if (!MemLocOpt)
+ // "target" intrinsic forms of loads aren't currently known to
+ // MemoryLocation::get. TODO
+ return false;
+ MemoryLocation MemLoc = *MemLocOpt;
+ if (!AvailableInvariants.count(MemLoc))
+ return false;
+
+ // Is the generation at which this became invariant older than the
+ // current one?
+ return AvailableInvariants.lookup(MemLoc) <= GenAt;
+}
+
+bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
+ const BranchInst *BI, const BasicBlock *BB,
+ const BasicBlock *Pred) {
+ assert(BI->isConditional() && "Should be a conditional branch!");
+ assert(BI->getCondition() == CondInst && "Wrong condition?");
+ assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+ auto *TorF = (BI->getSuccessor(0) == BB)
+ ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
auto MatchBinOp = [](Instruction *I, unsigned Opcode, Value *&LHS,
Value *&RHS) {
if (Opcode == Instruction::And &&
@@ -1041,47 +1041,47 @@ bool EarlyCSE::handleBranchCondition(Instruction *CondInst,
else if (Opcode == Instruction::Or &&
match(I, m_LogicalOr(m_Value(LHS), m_Value(RHS))))
return true;
- return false;
- };
- // If the condition is AND operation, we can propagate its operands into the
- // true branch. If it is OR operation, we can propagate them into the false
- // branch.
- unsigned PropagateOpcode =
- (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
-
- bool MadeChanges = false;
- SmallVector<Instruction *, 4> WorkList;
- SmallPtrSet<Instruction *, 4> Visited;
- WorkList.push_back(CondInst);
- while (!WorkList.empty()) {
- Instruction *Curr = WorkList.pop_back_val();
-
- AvailableValues.insert(Curr, TorF);
- LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
- << Curr->getName() << "' as " << *TorF << " in "
- << BB->getName() << "\n");
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- // Replace all dominated uses with the known value.
- if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
- BasicBlockEdge(Pred, BB))) {
- NumCSECVP += Count;
- MadeChanges = true;
- }
- }
-
+ return false;
+ };
+  // If the condition is an AND operation, we can propagate its operands into
+  // the true branch. If it is an OR operation, we can propagate them into the
+  // false branch.
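+  //
+  // Illustrative sketch (assumed example, not part of the upstream comment):
+  //   %c = and i1 %a, %b
+  //   br i1 %c, label %then, label %else
+  // On the edge into %then, %c, %a and %b are all known to be true, so their
+  // dominated uses can be replaced with 'true'.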
+ unsigned PropagateOpcode =
+ (BI->getSuccessor(0) == BB) ? Instruction::And : Instruction::Or;
+
+ bool MadeChanges = false;
+ SmallVector<Instruction *, 4> WorkList;
+ SmallPtrSet<Instruction *, 4> Visited;
+ WorkList.push_back(CondInst);
+ while (!WorkList.empty()) {
+ Instruction *Curr = WorkList.pop_back_val();
+
+ AvailableValues.insert(Curr, TorF);
+ LLVM_DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+ << Curr->getName() << "' as " << *TorF << " in "
+ << BB->getName() << "\n");
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ // Replace all dominated uses with the known value.
+ if (unsigned Count = replaceDominatedUsesWith(Curr, TorF, DT,
+ BasicBlockEdge(Pred, BB))) {
+ NumCSECVP += Count;
+ MadeChanges = true;
+ }
+ }
+
Value *LHS, *RHS;
if (MatchBinOp(Curr, PropagateOpcode, LHS, RHS))
for (auto &Op : { LHS, RHS })
- if (Instruction *OPI = dyn_cast<Instruction>(Op))
- if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
- WorkList.push_back(OPI);
- }
-
- return MadeChanges;
-}
-
+ if (Instruction *OPI = dyn_cast<Instruction>(Op))
+ if (SimpleValue::canHandle(OPI) && Visited.insert(OPI).second)
+ WorkList.push_back(OPI);
+ }
+
+ return MadeChanges;
+}
+
Value *EarlyCSE::getMatchingValue(LoadValue &InVal, ParseMemoryInst &MemInst,
unsigned CurrentGeneration) {
if (InVal.DefInst == nullptr)
@@ -1162,76 +1162,76 @@ bool EarlyCSE::overridingStores(const ParseMemoryInst &Earlier,
return ENTI == LNTI;
}
-bool EarlyCSE::processNode(DomTreeNode *Node) {
- bool Changed = false;
- BasicBlock *BB = Node->getBlock();
-
- // If this block has a single predecessor, then the predecessor is the parent
- // of the domtree node and all of the live out memory values are still current
- // in this block. If this block has multiple predecessors, then they could
- // have invalidated the live-out memory values of our parent value. For now,
- // just be conservative and invalidate memory if this block has multiple
- // predecessors.
- if (!BB->getSinglePredecessor())
- ++CurrentGeneration;
-
- // If this node has a single predecessor which ends in a conditional branch,
- // we can infer the value of the branch condition given that we took this
- // path. We need the single predecessor to ensure there's not another path
- // which reaches this block where the condition might hold a different
- // value. Since we're adding this to the scoped hash table (like any other
- // def), it will have been popped if we encounter a future merge block.
- if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
- if (BI && BI->isConditional()) {
- auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
- if (CondInst && SimpleValue::canHandle(CondInst))
- Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
- }
- }
-
- /// LastStore - Keep track of the last non-volatile store that we saw... for
- /// as long as there in no instruction that reads memory. If we see a store
- /// to the same location, we delete the dead store. This zaps trivial dead
- /// stores which can occur in bitfield code among other things.
- Instruction *LastStore = nullptr;
-
- // See if any instructions in the block can be eliminated. If so, do it. If
- // not, add them to AvailableValues.
- for (Instruction &Inst : make_early_inc_range(BB->getInstList())) {
- // Dead instructions should just be removed.
- if (isInstructionTriviallyDead(&Inst, &TLI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
-
- salvageKnowledge(&Inst, &AC);
- salvageDebugInfo(Inst);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumSimplify;
- continue;
- }
-
- // Skip assume intrinsics, they don't really have side effects (although
- // they're marked as such to ensure preservation of control dependencies),
- // and this pass will not bother with its removal. However, we should mark
- // its condition as true for all dominated blocks.
- if (match(&Inst, m_Intrinsic<Intrinsic::assume>())) {
- auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0));
- if (CondI && SimpleValue::canHandle(CondI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
- << '\n');
- AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
- } else
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
- continue;
- }
-
+bool EarlyCSE::processNode(DomTreeNode *Node) {
+ bool Changed = false;
+ BasicBlock *BB = Node->getBlock();
+
+ // If this block has a single predecessor, then the predecessor is the parent
+ // of the domtree node and all of the live out memory values are still current
+ // in this block. If this block has multiple predecessors, then they could
+ // have invalidated the live-out memory values of our parent value. For now,
+ // just be conservative and invalidate memory if this block has multiple
+ // predecessors.
+ if (!BB->getSinglePredecessor())
+ ++CurrentGeneration;
+
+ // If this node has a single predecessor which ends in a conditional branch,
+ // we can infer the value of the branch condition given that we took this
+ // path. We need the single predecessor to ensure there's not another path
+ // which reaches this block where the condition might hold a different
+ // value. Since we're adding this to the scoped hash table (like any other
+ // def), it will have been popped if we encounter a future merge block.
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (BI && BI->isConditional()) {
+ auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+ if (CondInst && SimpleValue::canHandle(CondInst))
+ Changed |= handleBranchCondition(CondInst, BI, BB, Pred);
+ }
+ }
+
+ /// LastStore - Keep track of the last non-volatile store that we saw... for
+    /// as long as there is no instruction that reads memory. If we see a store
+ /// to the same location, we delete the dead store. This zaps trivial dead
+ /// stores which can occur in bitfield code among other things.
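+    ///
+    /// Illustrative example (assumed, for clarity):
+    ///   store i32 %x, i32* %p   ; becomes LastStore
+    ///   store i32 %y, i32* %p   ; same location, no intervening read, so
+    ///                           ; the first store is deleted as dead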
+ Instruction *LastStore = nullptr;
+
+ // See if any instructions in the block can be eliminated. If so, do it. If
+ // not, add them to AvailableValues.
+ for (Instruction &Inst : make_early_inc_range(BB->getInstList())) {
+ // Dead instructions should just be removed.
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+
+ salvageKnowledge(&Inst, &AC);
+ salvageDebugInfo(Inst);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumSimplify;
+ continue;
+ }
+
+ // Skip assume intrinsics, they don't really have side effects (although
+ // they're marked as such to ensure preservation of control dependencies),
+ // and this pass will not bother with its removal. However, we should mark
+ // its condition as true for all dominated blocks.
+ if (match(&Inst, m_Intrinsic<Intrinsic::assume>())) {
+ auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0));
+ if (CondI && SimpleValue::canHandle(CondI)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
+ << '\n');
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ } else
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
+ continue;
+ }
+
// Likewise, noalias intrinsics don't actually write.
if (match(&Inst,
m_Intrinsic<Intrinsic::experimental_noalias_scope_decl>())) {
@@ -1240,159 +1240,159 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
continue;
}
- // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
- if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
- continue;
- }
-
- // We can skip all invariant.start intrinsics since they only read memory,
- // and we can forward values across it. For invariant starts without
- // invariant ends, we can use the fact that the invariantness never ends to
- // start a scope in the current generaton which is true for all future
- // generations. Also, we dont need to consume the last store since the
- // semantics of invariant.start allow us to perform DSE of the last
- // store, if there was a store following invariant.start. Consider:
- //
- // store 30, i8* p
- // invariant.start(p)
- // store 40, i8* p
- // We can DSE the store to 30, since the store 40 to invariant location p
- // causes undefined behaviour.
- if (match(&Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
- // If there are any uses, the scope might end.
- if (!Inst.use_empty())
- continue;
- MemoryLocation MemLoc =
- MemoryLocation::getForArgument(&cast<CallInst>(Inst), 1, TLI);
- // Don't start a scope if we already have a better one pushed
- if (!AvailableInvariants.count(MemLoc))
- AvailableInvariants.insert(MemLoc, CurrentGeneration);
- continue;
- }
-
- if (isGuard(&Inst)) {
- if (auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0))) {
- if (SimpleValue::canHandle(CondI)) {
- // Do we already know the actual value of this condition?
- if (auto *KnownCond = AvailableValues.lookup(CondI)) {
- // Is the condition known to be true?
- if (isa<ConstantInt>(KnownCond) &&
- cast<ConstantInt>(KnownCond)->isOne()) {
- LLVM_DEBUG(dbgs()
- << "EarlyCSE removing guard: " << Inst << '\n');
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- continue;
- } else
- // Use the known value if it wasn't true.
- cast<CallInst>(Inst).setArgOperand(0, KnownCond);
- }
- // The condition we're on guarding here is true for all dominated
- // locations.
- AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
- }
- }
-
- // Guard intrinsics read all memory, but don't write any memory.
- // Accordingly, don't update the generation but consume the last store (to
- // avoid an incorrect DSE).
- LastStore = nullptr;
- continue;
- }
-
- // If the instruction can be simplified (e.g. X+0 = X) then replace it with
- // its simpler value.
- if (Value *V = SimplifyInstruction(&Inst, SQ)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
- << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- bool Killed = false;
- if (!Inst.use_empty()) {
- Inst.replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(&Inst, &TLI)) {
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- Killed = true;
- }
- if (Changed)
- ++NumSimplify;
- if (Killed)
- continue;
- }
- }
-
- // If this is a simple instruction that we can value number, process it.
- if (SimpleValue::canHandle(&Inst)) {
- // See if the instruction has an available value. If so, use it.
- if (Value *V = AvailableValues.lookup(&Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
- << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- if (auto *I = dyn_cast<Instruction>(V))
- I->andIRFlags(&Inst);
- Inst.replaceAllUsesWith(V);
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumCSE;
- continue;
- }
-
- // Otherwise, just remember that this value is available.
- AvailableValues.insert(&Inst, &Inst);
- continue;
- }
-
- ParseMemoryInst MemInst(&Inst, TTI);
- // If this is a non-volatile load, process it.
- if (MemInst.isValid() && MemInst.isLoad()) {
- // (conservatively) we can't peak past the ordering implied by this
- // operation, but we can add this load to our set of available values
- if (MemInst.isVolatile() || !MemInst.isUnordered()) {
- LastStore = nullptr;
- ++CurrentGeneration;
- }
-
- if (MemInst.isInvariantLoad()) {
- // If we pass an invariant load, we know that memory location is
- // indefinitely constant from the moment of first dereferenceability.
- // We conservatively treat the invariant_load as that moment. If we
- // pass a invariant load after already establishing a scope, don't
- // restart it since we want to preserve the earliest point seen.
- auto MemLoc = MemoryLocation::get(&Inst);
- if (!AvailableInvariants.count(MemLoc))
- AvailableInvariants.insert(MemLoc, CurrentGeneration);
- }
-
- // If we have an available version of this load, and if it is the right
- // generation or the load is known to be from an invariant location,
- // replace this instruction.
- //
- // If either the dominating load or the current load are invariant, then
- // we can assume the current load loads the same value as the dominating
- // load.
- LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ // Skip sideeffect intrinsics, for the same reason as assume intrinsics.
+ if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
+ continue;
+ }
+
+ // We can skip all invariant.start intrinsics since they only read memory,
+ // and we can forward values across it. For invariant starts without
+ // invariant ends, we can use the fact that the invariantness never ends to
+    // start a scope in the current generation which is true for all future
+    // generations. Also, we don't need to consume the last store since the
+ // semantics of invariant.start allow us to perform DSE of the last
+ // store, if there was a store following invariant.start. Consider:
+ //
+ // store 30, i8* p
+ // invariant.start(p)
+ // store 40, i8* p
+ // We can DSE the store to 30, since the store 40 to invariant location p
+ // causes undefined behaviour.
+ if (match(&Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ // If there are any uses, the scope might end.
+ if (!Inst.use_empty())
+ continue;
+ MemoryLocation MemLoc =
+ MemoryLocation::getForArgument(&cast<CallInst>(Inst), 1, TLI);
+ // Don't start a scope if we already have a better one pushed
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ continue;
+ }
+
+ if (isGuard(&Inst)) {
+ if (auto *CondI =
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0))) {
+ if (SimpleValue::canHandle(CondI)) {
+ // Do we already know the actual value of this condition?
+ if (auto *KnownCond = AvailableValues.lookup(CondI)) {
+ // Is the condition known to be true?
+ if (isa<ConstantInt>(KnownCond) &&
+ cast<ConstantInt>(KnownCond)->isOne()) {
+ LLVM_DEBUG(dbgs()
+ << "EarlyCSE removing guard: " << Inst << '\n');
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ continue;
+ } else
+ // Use the known value if it wasn't true.
+ cast<CallInst>(Inst).setArgOperand(0, KnownCond);
+ }
+          // The condition we're guarding on here is true for all dominated
+ // locations.
+ AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
+ }
+ }
+
+ // Guard intrinsics read all memory, but don't write any memory.
+ // Accordingly, don't update the generation but consume the last store (to
+ // avoid an incorrect DSE).
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If the instruction can be simplified (e.g. X+0 = X) then replace it with
+ // its simpler value.
+ if (Value *V = SimplifyInstruction(&Inst, SQ)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ bool Killed = false;
+ if (!Inst.use_empty()) {
+ Inst.replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ Killed = true;
+ }
+ if (Changed)
+ ++NumSimplify;
+ if (Killed)
+ continue;
+ }
+ }
+
+ // If this is a simple instruction that we can value number, process it.
+ if (SimpleValue::canHandle(&Inst)) {
+ // See if the instruction has an available value. If so, use it.
+ if (Value *V = AvailableValues.lookup(&Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
+ << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ if (auto *I = dyn_cast<Instruction>(V))
+ I->andIRFlags(&Inst);
+ Inst.replaceAllUsesWith(V);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSE;
+ continue;
+ }
+
+ // Otherwise, just remember that this value is available.
+ AvailableValues.insert(&Inst, &Inst);
+ continue;
+ }
+
+ ParseMemoryInst MemInst(&Inst, TTI);
+ // If this is a non-volatile load, process it.
+ if (MemInst.isValid() && MemInst.isLoad()) {
+      // (Conservatively) we can't peek past the ordering implied by this
+      // operation, but we can add this load to our set of available values.
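+      // e.g. (illustrative sketch; names are made up):
+      //   %a = load i32, i32* %p
+      //   %v = load volatile i32, i32* %q  ; bumps the generation
+      //   %b = load i32, i32* %p           ; no longer matched by the simple
+      //                                    ; generation check alone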
+ if (MemInst.isVolatile() || !MemInst.isUnordered()) {
+ LastStore = nullptr;
+ ++CurrentGeneration;
+ }
+
+ if (MemInst.isInvariantLoad()) {
+ // If we pass an invariant load, we know that memory location is
+ // indefinitely constant from the moment of first dereferenceability.
+ // We conservatively treat the invariant_load as that moment. If we
+        // pass an invariant load after already establishing a scope, don't
+ // restart it since we want to preserve the earliest point seen.
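+        // e.g. (illustrative):
+        //   %v = load i32, i32* %p, !invariant.load !0
+        // marks the location of %p as invariant from the current generation
+        // onward, so later loads from %p can be CSE'd even across generation
+        // bumps.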
+ auto MemLoc = MemoryLocation::get(&Inst);
+ if (!AvailableInvariants.count(MemLoc))
+ AvailableInvariants.insert(MemLoc, CurrentGeneration);
+ }
+
+ // If we have an available version of this load, and if it is the right
+ // generation or the load is known to be from an invariant location,
+ // replace this instruction.
+ //
+ // If either the dominating load or the current load are invariant, then
+ // we can assume the current load loads the same value as the dominating
+ // load.
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (Value *Op = getMatchingValue(InVal, MemInst, CurrentGeneration)) {
LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
<< " to: " << *InVal.DefInst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
+ continue;
+ }
if (!Inst.use_empty())
Inst.replaceAllUsesWith(Op);
salvageKnowledge(&Inst, &AC);
@@ -1401,317 +1401,317 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
Changed = true;
++NumCSELoad;
continue;
- }
-
- // Otherwise, remember that we have this instruction.
- AvailableLoads.insert(MemInst.getPointerOperand(),
- LoadValue(&Inst, CurrentGeneration,
- MemInst.getMatchingId(),
- MemInst.isAtomic()));
- LastStore = nullptr;
- continue;
- }
-
- // If this instruction may read from memory or throw (and potentially read
- // from memory in the exception handler), forget LastStore. Load/store
- // intrinsics will indicate both a read and a write to memory. The target
- // may override this (e.g. so that a store intrinsic does not read from
- // memory, and thus will be treated the same as a regular store for
- // commoning purposes).
- if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
- !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
- LastStore = nullptr;
-
- // If this is a read-only call, process it.
- if (CallValue::canHandle(&Inst)) {
- // If we have an available version of this call, and if it is the right
- // generation, replace this instruction.
- std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(&Inst);
- if (InVal.first != nullptr &&
- isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
- &Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
- << " to: " << *InVal.first << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- if (!Inst.use_empty())
- Inst.replaceAllUsesWith(InVal.first);
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumCSECall;
- continue;
- }
-
- // Otherwise, remember that we have this instruction.
- AvailableCalls.insert(&Inst, std::make_pair(&Inst, CurrentGeneration));
- continue;
- }
-
- // A release fence requires that all stores complete before it, but does
- // not prevent the reordering of following loads 'before' the fence. As a
- // result, we don't need to consider it as writing to memory and don't need
- // to advance the generation. We do need to prevent DSE across the fence,
- // but that's handled above.
- if (auto *FI = dyn_cast<FenceInst>(&Inst))
- if (FI->getOrdering() == AtomicOrdering::Release) {
- assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
- continue;
- }
-
- // write back DSE - If we write back the same value we just loaded from
- // the same location and haven't passed any intervening writes or ordering
- // operations, we can remove the write. The primary benefit is in allowing
- // the available load table to remain valid and value forward past where
- // the store originally was.
- if (MemInst.isValid() && MemInst.isStore()) {
- LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
- if (InVal.DefInst &&
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+ LastStore = nullptr;
+ continue;
+ }
+
+ // If this instruction may read from memory or throw (and potentially read
+ // from memory in the exception handler), forget LastStore. Load/store
+ // intrinsics will indicate both a read and a write to memory. The target
+ // may override this (e.g. so that a store intrinsic does not read from
+ // memory, and thus will be treated the same as a regular store for
+ // commoning purposes).
+ if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
+ !(MemInst.isValid() && !MemInst.mayReadFromMemory()))
+ LastStore = nullptr;
+
+ // If this is a read-only call, process it.
+ if (CallValue::canHandle(&Inst)) {
+ // If we have an available version of this call, and if it is the right
+ // generation, replace this instruction.
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(&Inst);
+ if (InVal.first != nullptr &&
+ isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
+ &Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
+ << " to: " << *InVal.first << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(InVal.first);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumCSECall;
+ continue;
+ }
+
+ // Otherwise, remember that we have this instruction.
+ AvailableCalls.insert(&Inst, std::make_pair(&Inst, CurrentGeneration));
+ continue;
+ }
+
+ // A release fence requires that all stores complete before it, but does
+ // not prevent the reordering of following loads 'before' the fence. As a
+ // result, we don't need to consider it as writing to memory and don't need
+ // to advance the generation. We do need to prevent DSE across the fence,
+ // but that's handled above.
+ if (auto *FI = dyn_cast<FenceInst>(&Inst))
+ if (FI->getOrdering() == AtomicOrdering::Release) {
+ assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
+ continue;
+ }
+
+ // write back DSE - If we write back the same value we just loaded from
+ // the same location and haven't passed any intervening writes or ordering
+ // operations, we can remove the write. The primary benefit is in allowing
+ // the available load table to remain valid and value forward past where
+ // the store originally was.
+ if (MemInst.isValid() && MemInst.isStore()) {
+ LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
+ if (InVal.DefInst &&
InVal.DefInst == getMatchingValue(InVal, MemInst, CurrentGeneration)) {
- // It is okay to have a LastStore to a different pointer here if MemorySSA
- // tells us that the load and store are from the same memory generation.
- // In that case, LastStore should keep its present value since we're
- // removing the current store.
- assert((!LastStore ||
- ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
- MemInst.getPointerOperand() ||
- MSSA) &&
- "can't have an intervening store if not using MemorySSA!");
- LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- continue;
- }
- salvageKnowledge(&Inst, &AC);
- removeMSSA(Inst);
- Inst.eraseFromParent();
- Changed = true;
- ++NumDSE;
- // We can avoid incrementing the generation count since we were able
- // to eliminate this store.
- continue;
- }
- }
-
- // Okay, this isn't something we can CSE at all. Check to see if it is
- // something that could modify memory. If so, our available memory values
- // cannot be used so bump the generation count.
- if (Inst.mayWriteToMemory()) {
- ++CurrentGeneration;
-
- if (MemInst.isValid() && MemInst.isStore()) {
- // We do a trivial form of DSE if there are two stores to the same
- // location with no intervening loads. Delete the earlier store.
- if (LastStore) {
+ // It is okay to have a LastStore to a different pointer here if MemorySSA
+ // tells us that the load and store are from the same memory generation.
+ // In that case, LastStore should keep its present value since we're
+ // removing the current store.
+ assert((!LastStore ||
+ ParseMemoryInst(LastStore, TTI).getPointerOperand() ==
+ MemInst.getPointerOperand() ||
+ MSSA) &&
+ "can't have an intervening store if not using MemorySSA!");
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ continue;
+ }
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(Inst);
+ Inst.eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ // We can avoid incrementing the generation count since we were able
+ // to eliminate this store.
+ continue;
+ }
+ }
+
+ // Okay, this isn't something we can CSE at all. Check to see if it is
+ // something that could modify memory. If so, our available memory values
+ // cannot be used so bump the generation count.
+ if (Inst.mayWriteToMemory()) {
+ ++CurrentGeneration;
+
+ if (MemInst.isValid() && MemInst.isStore()) {
+ // We do a trivial form of DSE if there are two stores to the same
+ // location with no intervening loads. Delete the earlier store.
+ if (LastStore) {
if (overridingStores(ParseMemoryInst(LastStore, TTI), MemInst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << Inst << '\n');
- if (!DebugCounter::shouldExecute(CSECounter)) {
- LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
- } else {
- salvageKnowledge(&Inst, &AC);
- removeMSSA(*LastStore);
- LastStore->eraseFromParent();
- Changed = true;
- ++NumDSE;
- LastStore = nullptr;
- }
- }
- // fallthrough - we can exploit information about this store
- }
-
- // Okay, we just invalidated anything we knew about loaded values. Try
- // to salvage *something* by remembering that the stored value is a live
- // version of the pointer. It is safe to forward from volatile stores
- // to non-volatile loads, so we don't have to check for volatility of
- // the store.
- AvailableLoads.insert(MemInst.getPointerOperand(),
- LoadValue(&Inst, CurrentGeneration,
- MemInst.getMatchingId(),
- MemInst.isAtomic()));
-
- // Remember that this was the last unordered store we saw for DSE. We
- // don't yet handle DSE on ordered or volatile stores since we don't
- // have a good way to model the ordering requirement for following
- // passes once the store is removed. We could insert a fence, but
- // since fences are slightly stronger than stores in their ordering,
- // it's not clear this is a profitable transform. Another option would
- // be to merge the ordering with that of the post dominating store.
- if (MemInst.isUnordered() && !MemInst.isVolatile())
- LastStore = &Inst;
- else
- LastStore = nullptr;
- }
- }
- }
-
- return Changed;
-}
-
-bool EarlyCSE::run() {
-  // Note, deque is being used here because there are significant performance
- // gains over vector when the container becomes very large due to the
- // specific access patterns. For more information see the mailing list
- // discussion on this:
- // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
- std::deque<StackNode *> nodesToProcess;
-
- bool Changed = false;
-
- // Process the root node.
- nodesToProcess.push_back(new StackNode(
- AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
- CurrentGeneration, DT.getRootNode(),
- DT.getRootNode()->begin(), DT.getRootNode()->end()));
-
- assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
-
- // Process the stack.
- while (!nodesToProcess.empty()) {
- // Grab the first item off the stack. Set the current generation, remove
- // the node from the stack, and process it.
- StackNode *NodeToProcess = nodesToProcess.back();
-
- // Initialize class members.
- CurrentGeneration = NodeToProcess->currentGeneration();
-
- // Check if the node needs to be processed.
- if (!NodeToProcess->isProcessed()) {
- // Process the node.
- Changed |= processNode(NodeToProcess->node());
- NodeToProcess->childGeneration(CurrentGeneration);
- NodeToProcess->process();
- } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
- // Push the next child onto the stack.
- DomTreeNode *child = NodeToProcess->nextChild();
- nodesToProcess.push_back(
- new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
- AvailableCalls, NodeToProcess->childGeneration(),
- child, child->begin(), child->end()));
- } else {
- // It has been processed, and there are no more children to process,
- // so delete it and pop it off the stack.
- delete NodeToProcess;
- nodesToProcess.pop_back();
- }
- } // while (!nodes...)
-
- return Changed;
-}
-
-PreservedAnalyses EarlyCSEPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto *MSSA =
- UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
-
- EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
-
- if (!CSE.run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- if (UseMemorySSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-/// A simple and fast domtree-based CSE pass.
-///
-/// This pass does a simple depth-first walk over the dominator tree,
-/// eliminating trivially redundant instructions and using instsimplify to
-/// canonicalize things as it goes. It is intended to be fast and catch obvious
-/// cases so that instcombine and other passes are more effective. It is
-/// expected that a later pass of GVN will catch the interesting/hard cases.
-template<bool UseMemorySSA>
-class EarlyCSELegacyCommonPass : public FunctionPass {
-public:
- static char ID;
-
- EarlyCSELegacyCommonPass() : FunctionPass(ID) {
- if (UseMemorySSA)
- initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
- else
- initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *MSSA =
- UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
-
- EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
-
- return CSE.run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (UseMemorySSA) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
+ << " due to: " << Inst << '\n');
+ if (!DebugCounter::shouldExecute(CSECounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
+ } else {
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(*LastStore);
+ LastStore->eraseFromParent();
+ Changed = true;
+ ++NumDSE;
+ LastStore = nullptr;
+ }
+ }
+ // fallthrough - we can exploit information about this store
+ }
+
+ // Okay, we just invalidated anything we knew about loaded values. Try
+ // to salvage *something* by remembering that the stored value is a live
+ // version of the pointer. It is safe to forward from volatile stores
+ // to non-volatile loads, so we don't have to check for volatility of
+ // the store.
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
+
+ // Remember that this was the last unordered store we saw for DSE. We
+ // don't yet handle DSE on ordered or volatile stores since we don't
+ // have a good way to model the ordering requirement for following
+ // passes once the store is removed. We could insert a fence, but
+ // since fences are slightly stronger than stores in their ordering,
+ // it's not clear this is a profitable transform. Another option would
+ // be to merge the ordering with that of the post dominating store.
+ if (MemInst.isUnordered() && !MemInst.isVolatile())
+ LastStore = &Inst;
+ else
+ LastStore = nullptr;
+ }
+ }
+ }
+
+ return Changed;
+}
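
A source-level analogue may make the two trivial DSE forms handled above easier to see. This is only an illustration with made-up function names; the real transform runs on LLVM IR, not C++:

  // Both stores below are the kind EarlyCSE deletes.
  inline void writeBackDSE(int *P) {
    int V = *P;
    *P = V;   // write-back DSE: stores back the value just loaded, nothing in between
  }
  inline void overridingStoreDSE(int *P) {
    *P = 1;   // overriding-store DSE: overwritten by the next store, no intervening load
    *P = 2;
  }
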
+
+bool EarlyCSE::run() {
+  // Note, deque is being used here because there are significant performance
+ // gains over vector when the container becomes very large due to the
+ // specific access patterns. For more information see the mailing list
+ // discussion on this:
+ // http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20120116/135228.html
+ std::deque<StackNode *> nodesToProcess;
+
+ bool Changed = false;
+
+ // Process the root node.
+ nodesToProcess.push_back(new StackNode(
+ AvailableValues, AvailableLoads, AvailableInvariants, AvailableCalls,
+ CurrentGeneration, DT.getRootNode(),
+ DT.getRootNode()->begin(), DT.getRootNode()->end()));
+
+ assert(!CurrentGeneration && "Create a new EarlyCSE instance to rerun it.");
+
+ // Process the stack.
+ while (!nodesToProcess.empty()) {
+ // Grab the first item off the stack. Set the current generation, remove
+ // the node from the stack, and process it.
+ StackNode *NodeToProcess = nodesToProcess.back();
+
+ // Initialize class members.
+ CurrentGeneration = NodeToProcess->currentGeneration();
+
+ // Check if the node needs to be processed.
+ if (!NodeToProcess->isProcessed()) {
+ // Process the node.
+ Changed |= processNode(NodeToProcess->node());
+ NodeToProcess->childGeneration(CurrentGeneration);
+ NodeToProcess->process();
+ } else if (NodeToProcess->childIter() != NodeToProcess->end()) {
+ // Push the next child onto the stack.
+ DomTreeNode *child = NodeToProcess->nextChild();
+ nodesToProcess.push_back(
+ new StackNode(AvailableValues, AvailableLoads, AvailableInvariants,
+ AvailableCalls, NodeToProcess->childGeneration(),
+ child, child->begin(), child->end()));
+ } else {
+ // It has been processed, and there are no more children to process,
+ // so delete it and pop it off the stack.
+ delete NodeToProcess;
+ nodesToProcess.pop_back();
+ }
+ } // while (!nodes...)
+
+ return Changed;
+}
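
The deque comment at the top of run() is about avoiding recursion: the dominator tree is walked with an explicit stack of per-node frames. A minimal, self-contained sketch of that pattern follows; TreeNode, Frame and walkIterative are illustrative names, not LLVM's API.

  #include <cstddef>
  #include <deque>
  #include <vector>

  struct TreeNode {
    std::vector<TreeNode *> Children;
  };

  // Depth-first walk without recursion: each frame records whether its node has
  // been processed and which child to push next, mirroring how StackNode tracks
  // isProcessed() and childIter() above.
  inline void walkIterative(TreeNode *Root, void (*Visit)(TreeNode *)) {
    struct Frame {
      TreeNode *Node;
      std::size_t NextChild = 0;
      bool Processed = false;
    };
    std::deque<Frame> Stack;
    Stack.push_back({Root});
    while (!Stack.empty()) {
      Frame &Top = Stack.back();
      if (!Top.Processed) {
        Visit(Top.Node);                 // corresponds to processNode(...)
        Top.Processed = true;
      } else if (Top.NextChild < Top.Node->Children.size()) {
        TreeNode *Child = Top.Node->Children[Top.NextChild++];
        Stack.push_back({Child});        // descend into the next child
      } else {
        Stack.pop_back();                // all children handled
      }
    }
  }

A vector would work just as well for correctness; the deque is purely the performance choice the comment describes.
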
+
+PreservedAnalyses EarlyCSEPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSA =
+ UseMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ if (!CSE.run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ if (UseMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+/// A simple and fast domtree-based CSE pass.
+///
+/// This pass does a simple depth-first walk over the dominator tree,
+/// eliminating trivially redundant instructions and using instsimplify to
+/// canonicalize things as it goes. It is intended to be fast and catch obvious
+/// cases so that instcombine and other passes are more effective. It is
+/// expected that a later pass of GVN will catch the interesting/hard cases.
+template<bool UseMemorySSA>
+class EarlyCSELegacyCommonPass : public FunctionPass {
+public:
+ static char ID;
+
+ EarlyCSELegacyCommonPass() : FunctionPass(ID) {
+ if (UseMemorySSA)
+ initializeEarlyCSEMemSSALegacyPassPass(*PassRegistry::getPassRegistry());
+ else
+ initializeEarlyCSELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *MSSA =
+ UseMemorySSA ? &getAnalysis<MemorySSAWrapperPass>().getMSSA() : nullptr;
+
+ EarlyCSE CSE(F.getParent()->getDataLayout(), TLI, TTI, DT, AC, MSSA);
+
+ return CSE.run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (UseMemorySSA) {
AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
-
-template<>
-char EarlyCSELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
-
-using EarlyCSEMemSSALegacyPass =
- EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
-
-template<>
-char EarlyCSEMemSSALegacyPass::ID = 0;
-
-FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
- if (UseMemorySSA)
- return new EarlyCSEMemSSALegacyPass();
- else
- return new EarlyCSELegacyPass();
-}
-
-INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
- "Early CSE w/ MemorySSA", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+using EarlyCSELegacyPass = EarlyCSELegacyCommonPass</*UseMemorySSA=*/false>;
+
+template<>
+char EarlyCSELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(EarlyCSELegacyPass, "early-cse", "Early CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(EarlyCSELegacyPass, "early-cse", "Early CSE", false, false)
+
+using EarlyCSEMemSSALegacyPass =
+ EarlyCSELegacyCommonPass</*UseMemorySSA=*/true>;
+
+template<>
+char EarlyCSEMemSSALegacyPass::ID = 0;
+
+FunctionPass *llvm::createEarlyCSEPass(bool UseMemorySSA) {
+ if (UseMemorySSA)
+ return new EarlyCSEMemSSALegacyPass();
+ else
+ return new EarlyCSELegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
- "Early CSE w/ MemorySSA", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(EarlyCSEMemSSALegacyPass, "early-cse-memssa",
+ "Early CSE w/ MemorySSA", false, false)
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
index e2c126223d..e54a270fb2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/FlattenCFGPass.cpp
@@ -1,91 +1,91 @@
-//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements flattening of CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/IR/CFG.h"
+//===- FlattenCFGPass.cpp - CFG Flatten Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements flattening of CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "flattencfg"
-
-namespace {
-struct FlattenCFGPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-public:
- FlattenCFGPass() : FunctionPass(ID) {
- initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- }
-
-private:
- AliasAnalysis *AA;
-};
-}
-
-char FlattenCFGPass::ID = 0;
-INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
- false)
-
-// Public interface to the FlattenCFG pass
-FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
-
-/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
-/// iterating until no more changes are made.
-static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
- bool Changed = false;
- bool LocalChange = true;
-
- // Use block handles instead of iterating over function blocks directly
- // to avoid using iterators invalidated by erasing blocks.
- std::vector<WeakVH> Blocks;
- Blocks.reserve(F.size());
- for (auto &BB : F)
- Blocks.push_back(&BB);
-
- while (LocalChange) {
- LocalChange = false;
-
- // Loop over all of the basic blocks and try to flatten them.
- for (WeakVH &BlockHandle : Blocks) {
- // Skip blocks erased by FlattenCFG.
- if (auto *BB = cast_or_null<BasicBlock>(BlockHandle))
- if (FlattenCFG(BB, AA))
- LocalChange = true;
- }
- Changed |= LocalChange;
- }
- return Changed;
-}
-
-bool FlattenCFGPass::runOnFunction(Function &F) {
- AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- bool EverChanged = false;
- // iterativelyFlattenCFG can make some blocks dead.
- while (iterativelyFlattenCFG(F, AA)) {
- removeUnreachableBlocks(F);
- EverChanged = true;
- }
- return EverChanged;
-}
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+struct FlattenCFGPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+public:
+ FlattenCFGPass() : FunctionPass(ID) {
+ initializeFlattenCFGPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ }
+
+private:
+ AliasAnalysis *AA;
+};
+}
+
+char FlattenCFGPass::ID = 0;
+INITIALIZE_PASS_BEGIN(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(FlattenCFGPass, "flattencfg", "Flatten the CFG", false,
+ false)
+
+// Public interface to the FlattenCFG pass
+FunctionPass *llvm::createFlattenCFGPass() { return new FlattenCFGPass(); }
+
+/// iterativelyFlattenCFG - Call FlattenCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelyFlattenCFG(Function &F, AliasAnalysis *AA) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ // Use block handles instead of iterating over function blocks directly
+ // to avoid using iterators invalidated by erasing blocks.
+ std::vector<WeakVH> Blocks;
+ Blocks.reserve(F.size());
+ for (auto &BB : F)
+ Blocks.push_back(&BB);
+
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and try to flatten them.
+ for (WeakVH &BlockHandle : Blocks) {
+ // Skip blocks erased by FlattenCFG.
+ if (auto *BB = cast_or_null<BasicBlock>(BlockHandle))
+ if (FlattenCFG(BB, AA))
+ LocalChange = true;
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
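
The WeakVH handles above can be easier to picture with standard-library types. Below, std::weak_ptr stands in for WeakVH, and Block/eraseMarkedBlocks are illustrative names only; the point is simply that the snapshot of handles survives erasure from the owning container.

  #include <algorithm>
  #include <memory>
  #include <vector>

  struct Block {
    bool ShouldErase = false;
  };

  // Snapshot handles first, then mutate the owning container freely: erasing a
  // block cannot invalidate this loop, and an already-erased block shows up as
  // an expired handle and is skipped (like cast_or_null returning null above).
  inline int eraseMarkedBlocks(std::vector<std::shared_ptr<Block>> &Blocks) {
    std::vector<std::weak_ptr<Block>> Handles(Blocks.begin(), Blocks.end());
    int NumErased = 0;
    for (const auto &H : Handles) {
      std::shared_ptr<Block> BB = H.lock();
      if (!BB)
        continue;                        // block no longer exists, skip it
      if (BB->ShouldErase) {
        Blocks.erase(std::remove(Blocks.begin(), Blocks.end(), BB), Blocks.end());
        ++NumErased;
      }
    }
    return NumErased;
  }
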
+
+bool FlattenCFGPass::runOnFunction(Function &F) {
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ bool EverChanged = false;
+ // iterativelyFlattenCFG can make some blocks dead.
+ while (iterativelyFlattenCFG(F, AA)) {
+ removeUnreachableBlocks(F);
+ EverChanged = true;
+ }
+ return EverChanged;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
index 341a4c8220..b6d82685e8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Float2Int.cpp
@@ -1,550 +1,550 @@
-//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Float2Int pass, which aims to demote floating
-// point operations to work on integers, where that is losslessly possible.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#define DEBUG_TYPE "float2int"
-
-#include "llvm/Transforms/Scalar/Float2Int.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include <deque>
-#include <functional> // For std::function
-using namespace llvm;
-
-// The algorithm is simple. Start at instructions that convert from the
-// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
-// graph, using an equivalence datastructure to unify graphs that interfere.
-//
-// Mappable instructions are those with an integer corollary that, given
-// integer domain inputs, produce an integer output; fadd, for example.
-//
-// If a non-mappable instruction is seen, this entire def-use graph is marked
-// as non-transformable. If we see an instruction that converts from the
-// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
-
-/// The largest integer type worth dealing with.
-static cl::opt<unsigned>
-MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
-             cl::desc("Max integer bitwidth to consider in float2int "
- "(default=64)"));
-
-namespace {
- struct Float2IntLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- Float2IntLegacyPass() : FunctionPass(ID) {
- initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return Impl.runImpl(F, DT);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-
- private:
- Float2IntPass Impl;
- };
-}
-
-char Float2IntLegacyPass::ID = 0;
-INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
-
-// Given a FCmp predicate, return a matching ICmp predicate if one
-// exists, otherwise return BAD_ICMP_PREDICATE.
-static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
- switch (P) {
- case CmpInst::FCMP_OEQ:
- case CmpInst::FCMP_UEQ:
- return CmpInst::ICMP_EQ;
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_UGT:
- return CmpInst::ICMP_SGT;
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGE:
- return CmpInst::ICMP_SGE;
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_ULT:
- return CmpInst::ICMP_SLT;
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULE:
- return CmpInst::ICMP_SLE;
- case CmpInst::FCMP_ONE:
- case CmpInst::FCMP_UNE:
- return CmpInst::ICMP_NE;
- default:
- return CmpInst::BAD_ICMP_PREDICATE;
- }
-}
-
-// Given a floating point binary operator, return the matching
-// integer version.
-static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
- switch (Opcode) {
- default: llvm_unreachable("Unhandled opcode!");
- case Instruction::FAdd: return Instruction::Add;
- case Instruction::FSub: return Instruction::Sub;
- case Instruction::FMul: return Instruction::Mul;
- }
-}
-
-// Find the roots - instructions that convert from the FP domain to
-// integer domain.
-void Float2IntPass::findRoots(Function &F, const DominatorTree &DT) {
- for (BasicBlock &BB : F) {
- // Unreachable code can take on strange forms that we are not prepared to
- // handle. For example, an instruction may have itself as an operand.
- if (!DT.isReachableFromEntry(&BB))
- continue;
-
- for (Instruction &I : BB) {
- if (isa<VectorType>(I.getType()))
- continue;
- switch (I.getOpcode()) {
- default: break;
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- Roots.insert(&I);
- break;
- case Instruction::FCmp:
- if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
- CmpInst::BAD_ICMP_PREDICATE)
- Roots.insert(&I);
- break;
- }
- }
- }
-}
-
-// Helper - mark I as having been traversed, having range R.
-void Float2IntPass::seen(Instruction *I, ConstantRange R) {
- LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
- auto IT = SeenInsts.find(I);
- if (IT != SeenInsts.end())
- IT->second = std::move(R);
- else
- SeenInsts.insert(std::make_pair(I, std::move(R)));
-}
-
-// Helper - get a range representing a poison value.
-ConstantRange Float2IntPass::badRange() {
- return ConstantRange::getFull(MaxIntegerBW + 1);
-}
-ConstantRange Float2IntPass::unknownRange() {
- return ConstantRange::getEmpty(MaxIntegerBW + 1);
-}
-ConstantRange Float2IntPass::validateRange(ConstantRange R) {
- if (R.getBitWidth() > MaxIntegerBW + 1)
- return badRange();
- return R;
-}
-
-// The most obvious way to structure the search is a depth-first, eager
-// search from each root. However, that require direct recursion and so
-// can only handle small instruction sequences. Instead, we split the search
-// up into two phases:
-// - walkBackwards: A breadth-first walk of the use-def graph starting from
-// the roots. Populate "SeenInsts" with interesting
-// instructions and poison values if they're obvious and
-//                  cheap to compute. Calculate the equivalence set structure
-// while we're here too.
-// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
-// defs before their uses. Calculate the real range info.
-
-// Breadth-first walk of the use-def graph; determine the set of nodes
-// we care about and eagerly determine if some of them are poisonous.
-void Float2IntPass::walkBackwards() {
- std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
- while (!Worklist.empty()) {
- Instruction *I = Worklist.back();
- Worklist.pop_back();
-
- if (SeenInsts.find(I) != SeenInsts.end())
- // Seen already.
- continue;
-
- switch (I->getOpcode()) {
- // FIXME: Handle select and phi nodes.
- default:
- // Path terminated uncleanly.
- seen(I, badRange());
- break;
-
- case Instruction::UIToFP:
- case Instruction::SIToFP: {
- // Path terminated cleanly - use the type of the integer input to seed
- // the analysis.
- unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- auto Input = ConstantRange::getFull(BW);
- auto CastOp = (Instruction::CastOps)I->getOpcode();
- seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
- continue;
- }
-
- case Instruction::FNeg:
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FCmp:
- seen(I, unknownRange());
- break;
- }
-
- for (Value *O : I->operands()) {
- if (Instruction *OI = dyn_cast<Instruction>(O)) {
- // Unify def-use chains if they interfere.
- ECs.unionSets(I, OI);
- if (SeenInsts.find(I)->second != badRange())
- Worklist.push_back(OI);
- } else if (!isa<ConstantFP>(O)) {
- // Not an instruction or ConstantFP? we can't do anything.
- seen(I, badRange());
- }
- }
- }
-}
-
-// Walk forwards down the list of seen instructions, so we visit defs before
-// uses.
-void Float2IntPass::walkForwards() {
- for (auto &It : reverse(SeenInsts)) {
- if (It.second != unknownRange())
- continue;
-
- Instruction *I = It.first;
- std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
- switch (I->getOpcode()) {
- // FIXME: Handle select and phi nodes.
- default:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
-      llvm_unreachable("Should have been handled in walkBackwards!");
-
- case Instruction::FNeg:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 1 && "FNeg is a unary operator!");
- unsigned Size = Ops[0].getBitWidth();
- auto Zero = ConstantRange(APInt::getNullValue(Size));
- return Zero.sub(Ops[0]);
- };
- break;
-
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- Op = [I](ArrayRef<ConstantRange> Ops) {
-        assert(Ops.size() == 2 && "it's a binary operator!");
- auto BinOp = (Instruction::BinaryOps) I->getOpcode();
- return Ops[0].binaryOp(BinOp, Ops[1]);
- };
- break;
-
- //
- // Root-only instructions - we'll only see these if they're the
- // first node in a walk.
- //
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- Op = [I](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
-        // Note: We're ignoring the cast's output size here as that's what the
- // caller expects.
- auto CastOp = (Instruction::CastOps)I->getOpcode();
- return Ops[0].castOp(CastOp, MaxIntegerBW+1);
- };
- break;
-
- case Instruction::FCmp:
- Op = [](ArrayRef<ConstantRange> Ops) {
- assert(Ops.size() == 2 && "FCmp is a binary operator!");
- return Ops[0].unionWith(Ops[1]);
- };
- break;
- }
-
- bool Abort = false;
- SmallVector<ConstantRange,4> OpRanges;
- for (Value *O : I->operands()) {
- if (Instruction *OI = dyn_cast<Instruction>(O)) {
- assert(SeenInsts.find(OI) != SeenInsts.end() &&
- "def not seen before use!");
- OpRanges.push_back(SeenInsts.find(OI)->second);
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
- // Work out if the floating point number can be losslessly represented
- // as an integer.
- // APFloat::convertToInteger(&Exact) purports to do what we want, but
- // the exactness can be too precise. For example, negative zero can
- // never be exactly converted to an integer.
- //
- // Instead, we ask APFloat to round itself to an integral value - this
- // preserves sign-of-zero - then compare the result with the original.
- //
- const APFloat &F = CF->getValueAPF();
-
- // First, weed out obviously incorrect values. Non-finite numbers
- // can't be represented and neither can negative zero, unless
- // we're in fast math mode.
- if (!F.isFinite() ||
- (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
- !I->hasNoSignedZeros())) {
- seen(I, badRange());
- Abort = true;
- break;
- }
-
- APFloat NewF = F;
- auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (Res != APFloat::opOK || NewF != F) {
- seen(I, badRange());
- Abort = true;
- break;
- }
- // OK, it's representable. Now get it.
- APSInt Int(MaxIntegerBW+1, false);
- bool Exact;
- CF->getValueAPF().convertToInteger(Int,
- APFloat::rmNearestTiesToEven,
- &Exact);
- OpRanges.push_back(ConstantRange(Int));
- } else {
- llvm_unreachable("Should have already marked this as badRange!");
- }
- }
-
- // Reduce the operands' ranges to a single range and return.
- if (!Abort)
- seen(I, Op(OpRanges));
- }
-}
-
-// If there is a valid transform to be done, do it.
-bool Float2IntPass::validateAndTransform() {
- bool MadeChange = false;
-
- // Iterate over every disjoint partition of the def-use graph.
- for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
- ConstantRange R(MaxIntegerBW + 1, false);
- bool Fail = false;
- Type *ConvertedToTy = nullptr;
-
- // For every member of the partition, union all the ranges together.
- for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
- MI != ME; ++MI) {
- Instruction *I = *MI;
- auto SeenI = SeenInsts.find(I);
- if (SeenI == SeenInsts.end())
- continue;
-
- R = R.unionWith(SeenI->second);
- // We need to ensure I has no users that have not been seen.
- // If it does, transformation would be illegal.
- //
- // Don't count the roots, as they terminate the graphs.
- if (Roots.count(I) == 0) {
- // Set the type of the conversion while we're here.
- if (!ConvertedToTy)
- ConvertedToTy = I->getType();
- for (User *U : I->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
- LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
- Fail = true;
- break;
- }
- }
- }
- if (Fail)
- break;
- }
-
- // If the set was empty, or we failed, or the range is poisonous,
- // bail out.
- if (ECs.member_begin(It) == ECs.member_end() || Fail ||
- R.isFullSet() || R.isSignWrappedSet())
- continue;
- assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
-
- // The number of bits required is the maximum of the upper and
- // lower limits, plus one so it can be signed.
- unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
- R.getUpper().getMinSignedBits()) + 1;
- LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
-
- // If we've run off the realms of the exactly representable integers,
- // the floating point result will differ from an integer approximation.
-
- // Do we need more bits than are in the mantissa of the type we converted
- // to? semanticsPrecision returns the number of mantissa bits plus one
- // for the sign bit.
- unsigned MaxRepresentableBits
- = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
- if (MinBW > MaxRepresentableBits) {
- LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
- continue;
- }
- if (MinBW > 64) {
- LLVM_DEBUG(
- dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
- continue;
- }
-
- // OK, R is known to be representable. Now pick a type for it.
- // FIXME: Pick the smallest legal type that will fit.
- Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
-
- for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
- MI != ME; ++MI)
- convert(*MI, Ty);
- MadeChange = true;
- }
-
- return MadeChange;
-}
-
-Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
- if (ConvertedInsts.find(I) != ConvertedInsts.end())
- // Already converted this instruction.
- return ConvertedInsts[I];
-
- SmallVector<Value*,4> NewOperands;
- for (Value *V : I->operands()) {
- // Don't recurse if we're an instruction that terminates the path.
- if (I->getOpcode() == Instruction::UIToFP ||
- I->getOpcode() == Instruction::SIToFP) {
- NewOperands.push_back(V);
- } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
- NewOperands.push_back(convert(VI, ToTy));
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
- APSInt Val(ToTy->getPrimitiveSizeInBits(), /*isUnsigned=*/false);
- bool Exact;
- CF->getValueAPF().convertToInteger(Val,
- APFloat::rmNearestTiesToEven,
- &Exact);
- NewOperands.push_back(ConstantInt::get(ToTy, Val));
- } else {
- llvm_unreachable("Unhandled operand type?");
- }
- }
-
- // Now create a new instruction.
- IRBuilder<> IRB(I);
- Value *NewV = nullptr;
- switch (I->getOpcode()) {
- default: llvm_unreachable("Unhandled instruction!");
-
- case Instruction::FPToUI:
- NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
- break;
-
- case Instruction::FPToSI:
- NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
- break;
-
- case Instruction::FCmp: {
- CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
- assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
- NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
- break;
- }
-
- case Instruction::UIToFP:
- NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
- break;
-
- case Instruction::SIToFP:
- NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
- break;
-
- case Instruction::FNeg:
- NewV = IRB.CreateNeg(NewOperands[0], I->getName());
- break;
-
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
- NewOperands[0], NewOperands[1],
- I->getName());
- break;
- }
-
- // If we're a root instruction, RAUW.
- if (Roots.count(I))
- I->replaceAllUsesWith(NewV);
-
- ConvertedInsts[I] = NewV;
- return NewV;
-}
-
-// Perform dead code elimination on the instructions we just modified.
-void Float2IntPass::cleanup() {
- for (auto &I : reverse(ConvertedInsts))
- I.first->eraseFromParent();
-}
-
-bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
- // Clear out all state.
- ECs = EquivalenceClasses<Instruction*>();
- SeenInsts.clear();
- ConvertedInsts.clear();
- Roots.clear();
-
- Ctx = &F.getParent()->getContext();
-
- findRoots(F, DT);
-
- walkBackwards();
- walkForwards();
-
- bool Modified = validateAndTransform();
- if (Modified)
- cleanup();
- return Modified;
-}
-
-namespace llvm {
-FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
-
-PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) {
- const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, DT))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-} // End namespace llvm
+//===- Float2Int.cpp - Demote floating point ops to work on integers ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Float2Int pass, which aims to demote floating
+// point operations to work on integers, where that is losslessly possible.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#define DEBUG_TYPE "float2int"
+
+#include "llvm/Transforms/Scalar/Float2Int.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <deque>
+#include <functional> // For std::function
+using namespace llvm;
+
+// The algorithm is simple. Start at instructions that convert from the
+// float to the int domain: fptoui, fptosi and fcmp. Walk up the def-use
+// graph, using an equivalence datastructure to unify graphs that interfere.
+//
+// Mappable instructions are those with an integer corollary that, given
+// integer domain inputs, produce an integer output; fadd, for example.
+//
+// If a non-mappable instruction is seen, this entire def-use graph is marked
+// as non-transformable. If we see an instruction that converts from the
+// integer domain to FP domain (uitofp,sitofp), we terminate our walk.
+
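
A concrete instance of the demotion described above may help before the implementation. This is a C++-level sketch of the pattern the pass looks for (sitofp -> fadd -> fptosi in IR terms); the function names are made up, and the equivalence holds because every intermediate value fits in float's 24-bit significand.

  #include <cassert>
  #include <cstdint>

  // Shape of the input computation: integers pushed through float arithmetic
  // and converted straight back.
  inline int32_t sumViaFloat(int16_t A, int16_t B) {
    return static_cast<int32_t>(static_cast<float>(A) + static_cast<float>(B));
  }

  // What the pass effectively rewrites it to: the same arithmetic kept in
  // integers, lossless here because the 16-bit inputs (and their sum) are
  // exactly representable in float.
  inline int32_t sumViaInt(int16_t A, int16_t B) {
    return static_cast<int32_t>(A) + static_cast<int32_t>(B);
  }

  inline void demonstrate() {
    assert(sumViaFloat(30000, 12345) == sumViaInt(30000, 12345));
    assert(sumViaFloat(-20000, -5) == sumViaInt(-20000, -5));
  }
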
+/// The largest integer type worth dealing with.
+static cl::opt<unsigned>
+MaxIntegerBW("float2int-max-integer-bw", cl::init(64), cl::Hidden,
+             cl::desc("Max integer bitwidth to consider in float2int "
+ "(default=64)"));
+
+namespace {
+ struct Float2IntLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ Float2IntLegacyPass() : FunctionPass(ID) {
+ initializeFloat2IntLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return Impl.runImpl(F, DT);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+
+ private:
+ Float2IntPass Impl;
+ };
+}
+
+char Float2IntLegacyPass::ID = 0;
+INITIALIZE_PASS(Float2IntLegacyPass, "float2int", "Float to int", false, false)
+
+// Given a FCmp predicate, return a matching ICmp predicate if one
+// exists, otherwise return BAD_ICMP_PREDICATE.
+static CmpInst::Predicate mapFCmpPred(CmpInst::Predicate P) {
+ switch (P) {
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ:
+ return CmpInst::ICMP_EQ;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT:
+ return CmpInst::ICMP_SGT;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE:
+ return CmpInst::ICMP_SGE;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT:
+ return CmpInst::ICMP_SLT;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE:
+ return CmpInst::ICMP_SLE;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE:
+ return CmpInst::ICMP_NE;
+ default:
+ return CmpInst::BAD_ICMP_PREDICATE;
+ }
+}
+
+// Given a floating point binary operator, return the matching
+// integer version.
+static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unhandled opcode!");
+ case Instruction::FAdd: return Instruction::Add;
+ case Instruction::FSub: return Instruction::Sub;
+ case Instruction::FMul: return Instruction::Mul;
+ }
+}
+
+// Find the roots - instructions that convert from the FP domain to
+// integer domain.
+void Float2IntPass::findRoots(Function &F, const DominatorTree &DT) {
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ if (isa<VectorType>(I.getType()))
+ continue;
+ switch (I.getOpcode()) {
+ default: break;
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Roots.insert(&I);
+ break;
+ case Instruction::FCmp:
+ if (mapFCmpPred(cast<CmpInst>(&I)->getPredicate()) !=
+ CmpInst::BAD_ICMP_PREDICATE)
+ Roots.insert(&I);
+ break;
+ }
+ }
+ }
+}
+
+// Helper - mark I as having been traversed, having range R.
+void Float2IntPass::seen(Instruction *I, ConstantRange R) {
+ LLVM_DEBUG(dbgs() << "F2I: " << *I << ":" << R << "\n");
+ auto IT = SeenInsts.find(I);
+ if (IT != SeenInsts.end())
+ IT->second = std::move(R);
+ else
+ SeenInsts.insert(std::make_pair(I, std::move(R)));
+}
+
+// Helper - get a range representing a poison value.
+ConstantRange Float2IntPass::badRange() {
+ return ConstantRange::getFull(MaxIntegerBW + 1);
+}
+ConstantRange Float2IntPass::unknownRange() {
+ return ConstantRange::getEmpty(MaxIntegerBW + 1);
+}
+ConstantRange Float2IntPass::validateRange(ConstantRange R) {
+ if (R.getBitWidth() > MaxIntegerBW + 1)
+ return badRange();
+ return R;
+}
+
+// The most obvious way to structure the search is a depth-first, eager
+// search from each root. However, that require direct recursion and so
+// can only handle small instruction sequences. Instead, we split the search
+// up into two phases:
+// - walkBackwards: A breadth-first walk of the use-def graph starting from
+// the roots. Populate "SeenInsts" with interesting
+// instructions and poison values if they're obvious and
+//                  cheap to compute. Calculate the equivalence set structure
+// while we're here too.
+// - walkForwards: Iterate over SeenInsts in reverse order, so we visit
+// defs before their uses. Calculate the real range info.
+
+// Breadth-first walk of the use-def graph; determine the set of nodes
+// we care about and eagerly determine if some of them are poisonous.
+void Float2IntPass::walkBackwards() {
+ std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ if (SeenInsts.find(I) != SeenInsts.end())
+ // Seen already.
+ continue;
+
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ // Path terminated uncleanly.
+ seen(I, badRange());
+ break;
+
+ case Instruction::UIToFP:
+ case Instruction::SIToFP: {
+ // Path terminated cleanly - use the type of the integer input to seed
+ // the analysis.
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ auto Input = ConstantRange::getFull(BW);
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ seen(I, validateRange(Input.castOp(CastOp, MaxIntegerBW+1)));
+ continue;
+ }
+
+ case Instruction::FNeg:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FCmp:
+ seen(I, unknownRange());
+ break;
+ }
+
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ // Unify def-use chains if they interfere.
+ ECs.unionSets(I, OI);
+ if (SeenInsts.find(I)->second != badRange())
+ Worklist.push_back(OI);
+ } else if (!isa<ConstantFP>(O)) {
+ // Not an instruction or ConstantFP? we can't do anything.
+ seen(I, badRange());
+ }
+ }
+ }
+}
+
+// Walk forwards down the list of seen instructions, so we visit defs before
+// uses.
+void Float2IntPass::walkForwards() {
+ for (auto &It : reverse(SeenInsts)) {
+ if (It.second != unknownRange())
+ continue;
+
+ Instruction *I = It.first;
+ std::function<ConstantRange(ArrayRef<ConstantRange>)> Op;
+ switch (I->getOpcode()) {
+ // FIXME: Handle select and phi nodes.
+ default:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+      llvm_unreachable("Should have been handled in walkBackwards!");
+
+ case Instruction::FNeg:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FNeg is a unary operator!");
+ unsigned Size = Ops[0].getBitWidth();
+ auto Zero = ConstantRange(APInt::getNullValue(Size));
+ return Zero.sub(Ops[0]);
+ };
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+        assert(Ops.size() == 2 && "it's a binary operator!");
+ auto BinOp = (Instruction::BinaryOps) I->getOpcode();
+ return Ops[0].binaryOp(BinOp, Ops[1]);
+ };
+ break;
+
+ //
+ // Root-only instructions - we'll only see these if they're the
+ // first node in a walk.
+ //
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ Op = [I](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 1 && "FPTo[US]I is a unary operator!");
+        // Note: We're ignoring the cast's output size here as that's what the
+ // caller expects.
+ auto CastOp = (Instruction::CastOps)I->getOpcode();
+ return Ops[0].castOp(CastOp, MaxIntegerBW+1);
+ };
+ break;
+
+ case Instruction::FCmp:
+ Op = [](ArrayRef<ConstantRange> Ops) {
+ assert(Ops.size() == 2 && "FCmp is a binary operator!");
+ return Ops[0].unionWith(Ops[1]);
+ };
+ break;
+ }
+
+ bool Abort = false;
+ SmallVector<ConstantRange,4> OpRanges;
+ for (Value *O : I->operands()) {
+ if (Instruction *OI = dyn_cast<Instruction>(O)) {
+ assert(SeenInsts.find(OI) != SeenInsts.end() &&
+ "def not seen before use!");
+ OpRanges.push_back(SeenInsts.find(OI)->second);
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(O)) {
+ // Work out if the floating point number can be losslessly represented
+ // as an integer.
+ // APFloat::convertToInteger(&Exact) purports to do what we want, but
+ // the exactness can be too precise. For example, negative zero can
+ // never be exactly converted to an integer.
+ //
+ // Instead, we ask APFloat to round itself to an integral value - this
+ // preserves sign-of-zero - then compare the result with the original.
+ //
+ const APFloat &F = CF->getValueAPF();
+
+ // First, weed out obviously incorrect values. Non-finite numbers
+ // can't be represented and neither can negative zero, unless
+ // we're in fast math mode.
+ if (!F.isFinite() ||
+ (F.isZero() && F.isNegative() && isa<FPMathOperator>(I) &&
+ !I->hasNoSignedZeros())) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+
+ APFloat NewF = F;
+ auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
+ if (Res != APFloat::opOK || NewF != F) {
+ seen(I, badRange());
+ Abort = true;
+ break;
+ }
+ // OK, it's representable. Now get it.
+ APSInt Int(MaxIntegerBW+1, false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Int,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ OpRanges.push_back(ConstantRange(Int));
+ } else {
+ llvm_unreachable("Should have already marked this as badRange!");
+ }
+ }
+
+ // Reduce the operands' ranges to a single range and return.
+ if (!Abort)
+ seen(I, Op(OpRanges));
+ }
+}
+
+// If there is a valid transform to be done, do it.
+bool Float2IntPass::validateAndTransform() {
+ bool MadeChange = false;
+
+ // Iterate over every disjoint partition of the def-use graph.
+ for (auto It = ECs.begin(), E = ECs.end(); It != E; ++It) {
+ ConstantRange R(MaxIntegerBW + 1, false);
+ bool Fail = false;
+ Type *ConvertedToTy = nullptr;
+
+ // For every member of the partition, union all the ranges together.
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI) {
+ Instruction *I = *MI;
+ auto SeenI = SeenInsts.find(I);
+ if (SeenI == SeenInsts.end())
+ continue;
+
+ R = R.unionWith(SeenI->second);
+ // We need to ensure I has no users that have not been seen.
+ // If it does, transformation would be illegal.
+ //
+ // Don't count the roots, as they terminate the graphs.
+ if (Roots.count(I) == 0) {
+ // Set the type of the conversion while we're here.
+ if (!ConvertedToTy)
+ ConvertedToTy = I->getType();
+ for (User *U : I->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || SeenInsts.find(UI) == SeenInsts.end()) {
+ LLVM_DEBUG(dbgs() << "F2I: Failing because of " << *U << "\n");
+ Fail = true;
+ break;
+ }
+ }
+ }
+ if (Fail)
+ break;
+ }
+
+ // If the set was empty, or we failed, or the range is poisonous,
+ // bail out.
+ if (ECs.member_begin(It) == ECs.member_end() || Fail ||
+ R.isFullSet() || R.isSignWrappedSet())
+ continue;
+ assert(ConvertedToTy && "Must have set the convertedtoty by this point!");
+
+ // The number of bits required is the maximum of the upper and
+ // lower limits, plus one so it can be signed.
+ unsigned MinBW = std::max(R.getLower().getMinSignedBits(),
+ R.getUpper().getMinSignedBits()) + 1;
+ LLVM_DEBUG(dbgs() << "F2I: MinBitwidth=" << MinBW << ", R: " << R << "\n");
+
+ // If we've run off the realms of the exactly representable integers,
+ // the floating point result will differ from an integer approximation.
+
+ // Do we need more bits than are in the mantissa of the type we converted
+ // to? semanticsPrecision returns the number of mantissa bits plus one
+ // for the sign bit.
+ unsigned MaxRepresentableBits
+ = APFloat::semanticsPrecision(ConvertedToTy->getFltSemantics()) - 1;
+ if (MinBW > MaxRepresentableBits) {
+ LLVM_DEBUG(dbgs() << "F2I: Value not guaranteed to be representable!\n");
+ continue;
+ }
+ if (MinBW > 64) {
+ LLVM_DEBUG(
+ dbgs() << "F2I: Value requires more than 64 bits to represent!\n");
+ continue;
+ }
+
+ // OK, R is known to be representable. Now pick a type for it.
+ // FIXME: Pick the smallest legal type that will fit.
+ Type *Ty = (MinBW > 32) ? Type::getInt64Ty(*Ctx) : Type::getInt32Ty(*Ctx);
+
+ for (auto MI = ECs.member_begin(It), ME = ECs.member_end();
+ MI != ME; ++MI)
+ convert(*MI, Ty);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
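The bit-width computation above is the crux of the legality check, so here it is restated on its own. This is a hedged sketch using APInt directly; the helper name and example bounds are invented, not part of the pass.

#include "llvm/ADT/APInt.h"
#include <algorithm>
using namespace llvm;

// MinBW as validateAndTransform computes it: enough bits for the wider of
// the two signed bounds, plus one spare bit so the value stays signed.
unsigned minSignedBitsForRange(const APInt &Lower, const APInt &Upper) {
  return std::max(Lower.getMinSignedBits(), Upper.getMinSignedBits()) + 1;
}

For example, bounds that fit in 9 signed bits give MinBW = 10, which is no more than 32, so the loop above would convert that whole partition to i32.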
+Value *Float2IntPass::convert(Instruction *I, Type *ToTy) {
+ if (ConvertedInsts.find(I) != ConvertedInsts.end())
+ // Already converted this instruction.
+ return ConvertedInsts[I];
+
+ SmallVector<Value*,4> NewOperands;
+ for (Value *V : I->operands()) {
+ // Don't recurse if we're an instruction that terminates the path.
+ if (I->getOpcode() == Instruction::UIToFP ||
+ I->getOpcode() == Instruction::SIToFP) {
+ NewOperands.push_back(V);
+ } else if (Instruction *VI = dyn_cast<Instruction>(V)) {
+ NewOperands.push_back(convert(VI, ToTy));
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ APSInt Val(ToTy->getPrimitiveSizeInBits(), /*isUnsigned=*/false);
+ bool Exact;
+ CF->getValueAPF().convertToInteger(Val,
+ APFloat::rmNearestTiesToEven,
+ &Exact);
+ NewOperands.push_back(ConstantInt::get(ToTy, Val));
+ } else {
+ llvm_unreachable("Unhandled operand type?");
+ }
+ }
+
+ // Now create a new instruction.
+ IRBuilder<> IRB(I);
+ Value *NewV = nullptr;
+ switch (I->getOpcode()) {
+ default: llvm_unreachable("Unhandled instruction!");
+
+ case Instruction::FPToUI:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FPToSI:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], I->getType());
+ break;
+
+ case Instruction::FCmp: {
+ CmpInst::Predicate P = mapFCmpPred(cast<CmpInst>(I)->getPredicate());
+ assert(P != CmpInst::BAD_ICMP_PREDICATE && "Unhandled predicate!");
+ NewV = IRB.CreateICmp(P, NewOperands[0], NewOperands[1], I->getName());
+ break;
+ }
+
+ case Instruction::UIToFP:
+ NewV = IRB.CreateZExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::SIToFP:
+ NewV = IRB.CreateSExtOrTrunc(NewOperands[0], ToTy);
+ break;
+
+ case Instruction::FNeg:
+ NewV = IRB.CreateNeg(NewOperands[0], I->getName());
+ break;
+
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ NewV = IRB.CreateBinOp(mapBinOpcode(I->getOpcode()),
+ NewOperands[0], NewOperands[1],
+ I->getName());
+ break;
+ }
+
+ // If we're a root instruction, RAUW.
+ if (Roots.count(I))
+ I->replaceAllUsesWith(NewV);
+
+ ConvertedInsts[I] = NewV;
+ return NewV;
+}
+
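One non-obvious detail in convert(): CreateSExtOrTrunc and CreateZExtOrTrunc return the operand unchanged when the types already match, so no equal-width special case is needed. A small hypothetical usage of the same IRBuilder helpers, not taken from the patch:

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Widen (or narrow) an integer value to i64 at the given insertion point,
// mirroring how convert() rewrites the operand of an SIToFP root.
Value *toI64Signed(Instruction *InsertBefore, Value *V) {
  IRBuilder<> IRB(InsertBefore);
  return IRB.CreateSExtOrTrunc(V, IRB.getInt64Ty());
}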
+// Perform dead code elimination on the instructions we just modified.
+void Float2IntPass::cleanup() {
+ for (auto &I : reverse(ConvertedInsts))
+ I.first->eraseFromParent();
+}
+
+bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << "F2I: Looking at function " << F.getName() << "\n");
+ // Clear out all state.
+ ECs = EquivalenceClasses<Instruction*>();
+ SeenInsts.clear();
+ ConvertedInsts.clear();
+ Roots.clear();
+
+ Ctx = &F.getParent()->getContext();
+
+ findRoots(F, DT);
+
+ walkBackwards();
+ walkForwards();
+
+ bool Modified = validateAndTransform();
+ if (Modified)
+ cleanup();
+ return Modified;
+}
+
+namespace llvm {
+FunctionPass *createFloat2IntPass() { return new Float2IntLegacyPass(); }
+
+PreservedAnalyses Float2IntPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, DT))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+} // End namespace llvm
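createFloat2IntPass() above only serves the legacy pass manager; under the new pass manager the pass is scheduled through a FunctionPassManager instead. A rough sketch of that wiring, assuming the standard LLVM 12 PassBuilder setup (the function name is illustrative):

#include "llvm/IR/Function.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Float2Int.h"
using namespace llvm;

void runFloat2IntOn(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register all analyses so DominatorTreeAnalysis is available to the pass.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(Float2IntPass());
  FPM.run(F, FAM);
}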
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
index 90795c40d6..c6b6d75aef 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVN.cpp
@@ -1,104 +1,104 @@
-//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs global value numbering to eliminate fully redundant
-// instructions. It also performs simple dead load elimination.
-//
-// Note that this pass does the value numbering itself; it does not use the
-// ValueNumbering analysis passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+//===- GVN.cpp - Eliminate redundant values and loads ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global value numbering to eliminate fully redundant
+// instructions. It also performs simple dead load elimination.
+//
+// Note that this pass does the value numbering itself; it does not use the
+// ValueNumbering analysis passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PHITransAddr.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::gvn;
-using namespace llvm::VNCoercion;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "gvn"
-
-STATISTIC(NumGVNInstr, "Number of instructions deleted");
-STATISTIC(NumGVNLoad, "Number of loads deleted");
-STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
-STATISTIC(NumGVNBlocks, "Number of blocks merged");
-STATISTIC(NumGVNSimpl, "Number of instructions simplified");
-STATISTIC(NumGVNEqProp, "Number of equalities propagated");
-STATISTIC(NumPRELoad, "Number of loads PRE'd");
-
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PHITransAddr.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "gvn"
+
+STATISTIC(NumGVNInstr, "Number of instructions deleted");
+STATISTIC(NumGVNLoad, "Number of loads deleted");
+STATISTIC(NumGVNPRE, "Number of instructions PRE'd");
+STATISTIC(NumGVNBlocks, "Number of blocks merged");
+STATISTIC(NumGVNSimpl, "Number of instructions simplified");
+STATISTIC(NumGVNEqProp, "Number of equalities propagated");
+STATISTIC(NumPRELoad, "Number of loads PRE'd");
+
STATISTIC(IsValueFullyAvailableInBlockNumSpeculationsMax,
"Number of blocks speculated as available in "
"IsValueFullyAvailableInBlock(), max");
@@ -106,19 +106,19 @@ STATISTIC(MaxBBSpeculationCutoffReachedTimes,
"Number of times we we reached gvn-max-block-speculations cut-off "
"preventing further exploration");
-static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
-static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
-static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
- cl::init(true));
+static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
+static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
+static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
+ cl::init(true));
static cl::opt<bool>
GVNEnableSplitBackedgeInLoadPRE("enable-split-backedge-in-load-pre",
cl::init(true));
-static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
-
-static cl::opt<uint32_t> MaxNumDeps(
- "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
- cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
-
+static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
+
+static cl::opt<uint32_t> MaxNumDeps(
+ "gvn-max-num-deps", cl::Hidden, cl::init(100), cl::ZeroOrMore,
+ cl::desc("Max number of dependences to attempt Load PRE (default = 100)"));
+
// This is based on IsValueFullyAvailableInBlockNumSpeculationsMax stat.
static cl::opt<uint32_t> MaxBBSpeculations(
"gvn-max-block-speculations", cl::Hidden, cl::init(600), cl::ZeroOrMore,
@@ -126,570 +126,570 @@ static cl::opt<uint32_t> MaxBBSpeculations(
"into) when deducing if a value is fully available or not in GVN "
"(default = 600)"));
-struct llvm::GVN::Expression {
- uint32_t opcode;
- bool commutative = false;
- Type *type = nullptr;
- SmallVector<uint32_t, 4> varargs;
-
- Expression(uint32_t o = ~2U) : opcode(o) {}
-
- bool operator==(const Expression &other) const {
- if (opcode != other.opcode)
- return false;
- if (opcode == ~0U || opcode == ~1U)
- return true;
- if (type != other.type)
- return false;
- if (varargs != other.varargs)
- return false;
- return true;
- }
-
- friend hash_code hash_value(const Expression &Value) {
- return hash_combine(
- Value.opcode, Value.type,
- hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
- }
-};
-
-namespace llvm {
-
-template <> struct DenseMapInfo<GVN::Expression> {
- static inline GVN::Expression getEmptyKey() { return ~0U; }
- static inline GVN::Expression getTombstoneKey() { return ~1U; }
-
- static unsigned getHashValue(const GVN::Expression &e) {
- using llvm::hash_value;
-
- return static_cast<unsigned>(hash_value(e));
- }
-
- static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
- return LHS == RHS;
- }
-};
-
-} // end namespace llvm
-
-/// Represents a particular available value that we know how to materialize.
-/// Materialization of an AvailableValue never fails. An AvailableValue is
-/// implicitly associated with a rematerialization point which is the
-/// location of the instruction from which it was formed.
-struct llvm::gvn::AvailableValue {
- enum ValType {
- SimpleVal, // A simple offsetted value that is accessed.
- LoadVal, // A value produced by a load.
- MemIntrin, // A memory intrinsic which is loaded from.
- UndefVal // A UndefValue representing a value from dead block (which
- // is not yet physically removed from the CFG).
- };
-
- /// V - The value that is live out of the block.
- PointerIntPair<Value *, 2, ValType> Val;
-
- /// Offset - The byte offset in Val that is interesting for the load query.
- unsigned Offset = 0;
-
- static AvailableValue get(Value *V, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(V);
- Res.Val.setInt(SimpleVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(MI);
- Res.Val.setInt(MemIntrin);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
- AvailableValue Res;
- Res.Val.setPointer(LI);
- Res.Val.setInt(LoadVal);
- Res.Offset = Offset;
- return Res;
- }
-
- static AvailableValue getUndef() {
- AvailableValue Res;
- Res.Val.setPointer(nullptr);
- Res.Val.setInt(UndefVal);
- Res.Offset = 0;
- return Res;
- }
-
- bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
- bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
- bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
- bool isUndefValue() const { return Val.getInt() == UndefVal; }
-
- Value *getSimpleValue() const {
- assert(isSimpleValue() && "Wrong accessor");
- return Val.getPointer();
- }
-
- LoadInst *getCoercedLoadValue() const {
- assert(isCoercedLoadValue() && "Wrong accessor");
- return cast<LoadInst>(Val.getPointer());
- }
-
- MemIntrinsic *getMemIntrinValue() const {
- assert(isMemIntrinValue() && "Wrong accessor");
- return cast<MemIntrinsic>(Val.getPointer());
- }
-
- /// Emit code at the specified insertion point to adjust the value defined
- /// here to the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
- GVN &gvn) const;
-};
-
-/// Represents an AvailableValue which can be rematerialized at the end of
-/// the associated BasicBlock.
-struct llvm::gvn::AvailableValueInBlock {
- /// BB - The basic block in question.
- BasicBlock *BB = nullptr;
-
- /// AV - The actual available value
- AvailableValue AV;
-
- static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
- AvailableValueInBlock Res;
- Res.BB = BB;
- Res.AV = std::move(AV);
- return Res;
- }
-
- static AvailableValueInBlock get(BasicBlock *BB, Value *V,
- unsigned Offset = 0) {
- return get(BB, AvailableValue::get(V, Offset));
- }
-
- static AvailableValueInBlock getUndef(BasicBlock *BB) {
- return get(BB, AvailableValue::getUndef());
- }
-
- /// Emit code at the end of this block to adjust the value defined here to
- /// the specified type. This handles various coercion cases.
- Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
- return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
- }
-};
-
-//===----------------------------------------------------------------------===//
-// ValueTable Internal Functions
-//===----------------------------------------------------------------------===//
-
-GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
- Expression e;
- e.type = I->getType();
- e.opcode = I->getOpcode();
- for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
- OI != OE; ++OI)
- e.varargs.push_back(lookupOrAdd(*OI));
- if (I->isCommutative()) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
+struct llvm::GVN::Expression {
+ uint32_t opcode;
+ bool commutative = false;
+ Type *type = nullptr;
+ SmallVector<uint32_t, 4> varargs;
+
+ Expression(uint32_t o = ~2U) : opcode(o) {}
+
+ bool operator==(const Expression &other) const {
+ if (opcode != other.opcode)
+ return false;
+ if (opcode == ~0U || opcode == ~1U)
+ return true;
+ if (type != other.type)
+ return false;
+ if (varargs != other.varargs)
+ return false;
+ return true;
+ }
+
+ friend hash_code hash_value(const Expression &Value) {
+ return hash_combine(
+ Value.opcode, Value.type,
+ hash_combine_range(Value.varargs.begin(), Value.varargs.end()));
+ }
+};
+
+namespace llvm {
+
+template <> struct DenseMapInfo<GVN::Expression> {
+ static inline GVN::Expression getEmptyKey() { return ~0U; }
+ static inline GVN::Expression getTombstoneKey() { return ~1U; }
+
+ static unsigned getHashValue(const GVN::Expression &e) {
+ using llvm::hash_value;
+
+ return static_cast<unsigned>(hash_value(e));
+ }
+
+ static bool isEqual(const GVN::Expression &LHS, const GVN::Expression &RHS) {
+ return LHS == RHS;
+ }
+};
+
+} // end namespace llvm
+
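The DenseMapInfo specialization above follows the usual contract: a DenseMap key type must reserve two values that can never occur as real keys (here the ~0U and ~1U opcodes) and provide hashing plus equality. A generic, hypothetical illustration of the same pattern for an unrelated key type:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/Hashing.h"

struct TinyKey { unsigned Tag = 0; }; // stand-in for GVN::Expression

namespace llvm {
template <> struct DenseMapInfo<TinyKey> {
  static TinyKey getEmptyKey() { return {~0U}; }     // reserved, never a real Tag
  static TinyKey getTombstoneKey() { return {~1U}; } // reserved, never a real Tag
  static unsigned getHashValue(const TinyKey &K) {
    return static_cast<unsigned>(hash_value(K.Tag));
  }
  static bool isEqual(const TinyKey &L, const TinyKey &R) {
    return L.Tag == R.Tag;
  }
};
} // end namespace llvm

// llvm::DenseMap<TinyKey, int> is now a usable map type.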
+/// Represents a particular available value that we know how to materialize.
+/// Materialization of an AvailableValue never fails. An AvailableValue is
+/// implicitly associated with a rematerialization point which is the
+/// location of the instruction from which it was formed.
+struct llvm::gvn::AvailableValue {
+ enum ValType {
+ SimpleVal, // A simple offsetted value that is accessed.
+ LoadVal, // A value produced by a load.
+ MemIntrin, // A memory intrinsic which is loaded from.
+ UndefVal // A UndefValue representing a value from dead block (which
+ // is not yet physically removed from the CFG).
+ };
+
+ /// V - The value that is live out of the block.
+ PointerIntPair<Value *, 2, ValType> Val;
+
+ /// Offset - The byte offset in Val that is interesting for the load query.
+ unsigned Offset = 0;
+
+ static AvailableValue get(Value *V, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(V);
+ Res.Val.setInt(SimpleVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getMI(MemIntrinsic *MI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(MI);
+ Res.Val.setInt(MemIntrin);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getLoad(LoadInst *LI, unsigned Offset = 0) {
+ AvailableValue Res;
+ Res.Val.setPointer(LI);
+ Res.Val.setInt(LoadVal);
+ Res.Offset = Offset;
+ return Res;
+ }
+
+ static AvailableValue getUndef() {
+ AvailableValue Res;
+ Res.Val.setPointer(nullptr);
+ Res.Val.setInt(UndefVal);
+ Res.Offset = 0;
+ return Res;
+ }
+
+ bool isSimpleValue() const { return Val.getInt() == SimpleVal; }
+ bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; }
+ bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; }
+ bool isUndefValue() const { return Val.getInt() == UndefVal; }
+
+ Value *getSimpleValue() const {
+ assert(isSimpleValue() && "Wrong accessor");
+ return Val.getPointer();
+ }
+
+ LoadInst *getCoercedLoadValue() const {
+ assert(isCoercedLoadValue() && "Wrong accessor");
+ return cast<LoadInst>(Val.getPointer());
+ }
+
+ MemIntrinsic *getMemIntrinValue() const {
+ assert(isMemIntrinValue() && "Wrong accessor");
+ return cast<MemIntrinsic>(Val.getPointer());
+ }
+
+ /// Emit code at the specified insertion point to adjust the value defined
+ /// here to the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, Instruction *InsertPt,
+ GVN &gvn) const;
+};
+
+/// Represents an AvailableValue which can be rematerialized at the end of
+/// the associated BasicBlock.
+struct llvm::gvn::AvailableValueInBlock {
+ /// BB - The basic block in question.
+ BasicBlock *BB = nullptr;
+
+ /// AV - The actual available value
+ AvailableValue AV;
+
+ static AvailableValueInBlock get(BasicBlock *BB, AvailableValue &&AV) {
+ AvailableValueInBlock Res;
+ Res.BB = BB;
+ Res.AV = std::move(AV);
+ return Res;
+ }
+
+ static AvailableValueInBlock get(BasicBlock *BB, Value *V,
+ unsigned Offset = 0) {
+ return get(BB, AvailableValue::get(V, Offset));
+ }
+
+ static AvailableValueInBlock getUndef(BasicBlock *BB) {
+ return get(BB, AvailableValue::getUndef());
+ }
+
+ /// Emit code at the end of this block to adjust the value defined here to
+ /// the specified type. This handles various coercion cases.
+ Value *MaterializeAdjustedValue(LoadInst *LI, GVN &gvn) const {
+ return AV.MaterializeAdjustedValue(LI, BB->getTerminator(), gvn);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+
+GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
+ Expression e;
+ e.type = I->getType();
+ e.opcode = I->getOpcode();
+ for (Instruction::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
// numbers. Since commutative operands are the 1st two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
+ // efficient to sort by hand rather than using, say, std::sort.
assert(I->getNumOperands() >= 2 && "Unsupported commutative instruction!");
- if (e.varargs[0] > e.varargs[1])
- std::swap(e.varargs[0], e.varargs[1]);
- e.commutative = true;
- }
-
- if (auto *C = dyn_cast<CmpInst>(I)) {
- // Sort the operand value numbers so x<y and y>x get the same value number.
- CmpInst::Predicate Predicate = C->getPredicate();
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- e.opcode = (C->getOpcode() << 8) | Predicate;
- e.commutative = true;
- } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
- e.varargs.append(E->idx_begin(), E->idx_end());
- } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
- ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
- e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
- }
-
- return e;
-}
-
-GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
- assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- "Not a comparison!");
- Expression e;
- e.type = CmpInst::makeCmpResultType(LHS->getType());
- e.varargs.push_back(lookupOrAdd(LHS));
- e.varargs.push_back(lookupOrAdd(RHS));
-
- // Sort the operand value numbers so x<y and y>x get the same value number.
- if (e.varargs[0] > e.varargs[1]) {
- std::swap(e.varargs[0], e.varargs[1]);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- e.opcode = (Opcode << 8) | Predicate;
- e.commutative = true;
- return e;
-}
-
-GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
- assert(EI && "Not an ExtractValueInst?");
- Expression e;
- e.type = EI->getType();
- e.opcode = 0;
-
- WithOverflowInst *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
- if (WO != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
- // EI is an extract from one of our with.overflow intrinsics. Synthesize
- // a semantically equivalent expression instead of an extract value
- // expression.
- e.opcode = WO->getBinaryOp();
- e.varargs.push_back(lookupOrAdd(WO->getLHS()));
- e.varargs.push_back(lookupOrAdd(WO->getRHS()));
- return e;
- }
-
- // Not a recognised intrinsic. Fall back to producing an extract value
- // expression.
- e.opcode = EI->getOpcode();
- for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
- OI != OE; ++OI)
- e.varargs.push_back(lookupOrAdd(*OI));
-
+ if (e.varargs[0] > e.varargs[1])
+ std::swap(e.varargs[0], e.varargs[1]);
+ e.commutative = true;
+ }
+
+ if (auto *C = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ CmpInst::Predicate Predicate = C->getPredicate();
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (C->getOpcode() << 8) | Predicate;
+ e.commutative = true;
+ } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
+ e.varargs.append(E->idx_begin(), E->idx_end());
+ } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
+ e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
+ }
+
+ return e;
+}
+
+GVN::Expression GVN::ValueTable::createCmpExpr(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Not a comparison!");
+ Expression e;
+ e.type = CmpInst::makeCmpResultType(LHS->getType());
+ e.varargs.push_back(lookupOrAdd(LHS));
+ e.varargs.push_back(lookupOrAdd(RHS));
+
+ // Sort the operand value numbers so x<y and y>x get the same value number.
+ if (e.varargs[0] > e.varargs[1]) {
+ std::swap(e.varargs[0], e.varargs[1]);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ e.opcode = (Opcode << 8) | Predicate;
+ e.commutative = true;
+ return e;
+}
+
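The swap-and-flip step in createExpr and createCmpExpr is what lets x < y and y > x share a value number. Pulled out of context as a hedged sketch; the Key struct and function name below are invented for illustration:

#include "llvm/IR/InstrTypes.h"
#include <cstdint>
#include <utility>
using namespace llvm;

struct Key { uint32_t Opcode; uint32_t LHS, RHS; };

// Canonicalize a compare so commuted forms produce the same key.
Key makeCmpKey(unsigned Opcode, CmpInst::Predicate Pred,
               uint32_t LHSNum, uint32_t RHSNum) {
  if (LHSNum > RHSNum) {
    std::swap(LHSNum, RHSNum);
    Pred = CmpInst::getSwappedPredicate(Pred); // e.g. SLT becomes SGT
  }
  return {(Opcode << 8) | Pred, LHSNum, RHSNum};
}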
+GVN::Expression GVN::ValueTable::createExtractvalueExpr(ExtractValueInst *EI) {
+ assert(EI && "Not an ExtractValueInst?");
+ Expression e;
+ e.type = EI->getType();
+ e.opcode = 0;
+
+ WithOverflowInst *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
+ if (WO != nullptr && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
+ // EI is an extract from one of our with.overflow intrinsics. Synthesize
+ // a semantically equivalent expression instead of an extract value
+ // expression.
+ e.opcode = WO->getBinaryOp();
+ e.varargs.push_back(lookupOrAdd(WO->getLHS()));
+ e.varargs.push_back(lookupOrAdd(WO->getRHS()));
+ return e;
+ }
+
+ // Not a recognised intrinsic. Fall back to producing an extract value
+ // expression.
+ e.opcode = EI->getOpcode();
+ for (Instruction::op_iterator OI = EI->op_begin(), OE = EI->op_end();
+ OI != OE; ++OI)
+ e.varargs.push_back(lookupOrAdd(*OI));
+
append_range(e.varargs, EI->indices());
-
- return e;
-}
-
-//===----------------------------------------------------------------------===//
-// ValueTable External Functions
-//===----------------------------------------------------------------------===//
-
-GVN::ValueTable::ValueTable() = default;
-GVN::ValueTable::ValueTable(const ValueTable &) = default;
-GVN::ValueTable::ValueTable(ValueTable &&) = default;
-GVN::ValueTable::~ValueTable() = default;
-GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default;
-
-/// add - Insert a value into the table with a specified value number.
-void GVN::ValueTable::add(Value *V, uint32_t num) {
- valueNumbering.insert(std::make_pair(V, num));
- if (PHINode *PN = dyn_cast<PHINode>(V))
- NumberingPhi[num] = PN;
-}
-
-uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
- if (AA->doesNotAccessMemory(C)) {
- Expression exp = createExpr(C);
- uint32_t e = assignExpNewValueNum(exp).first;
- valueNumbering[C] = e;
- return e;
- } else if (MD && AA->onlyReadsMemory(C)) {
- Expression exp = createExpr(C);
- auto ValNum = assignExpNewValueNum(exp);
- if (ValNum.second) {
- valueNumbering[C] = ValNum.first;
- return ValNum.first;
- }
-
- MemDepResult local_dep = MD->getDependency(C);
-
- if (!local_dep.isDef() && !local_dep.isNonLocal()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- if (local_dep.isDef()) {
+
+ return e;
+}
+
+//===----------------------------------------------------------------------===//
+// ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+GVN::ValueTable::ValueTable() = default;
+GVN::ValueTable::ValueTable(const ValueTable &) = default;
+GVN::ValueTable::ValueTable(ValueTable &&) = default;
+GVN::ValueTable::~ValueTable() = default;
+GVN::ValueTable &GVN::ValueTable::operator=(const GVN::ValueTable &Arg) = default;
+
+/// add - Insert a value into the table with a specified value number.
+void GVN::ValueTable::add(Value *V, uint32_t num) {
+ valueNumbering.insert(std::make_pair(V, num));
+ if (PHINode *PN = dyn_cast<PHINode>(V))
+ NumberingPhi[num] = PN;
+}
+
+uint32_t GVN::ValueTable::lookupOrAddCall(CallInst *C) {
+ if (AA->doesNotAccessMemory(C)) {
+ Expression exp = createExpr(C);
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[C] = e;
+ return e;
+ } else if (MD && AA->onlyReadsMemory(C)) {
+ Expression exp = createExpr(C);
+ auto ValNum = assignExpNewValueNum(exp);
+ if (ValNum.second) {
+ valueNumbering[C] = ValNum.first;
+ return ValNum.first;
+ }
+
+ MemDepResult local_dep = MD->getDependency(C);
+
+ if (!local_dep.isDef() && !local_dep.isNonLocal()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (local_dep.isDef()) {
       // For masked load/store intrinsics, the local_dep may actually be
// a normal load or store instruction.
CallInst *local_cdep = dyn_cast<CallInst>(local_dep.getInst());
-
+
if (!local_cdep ||
local_cdep->getNumArgOperands() != C->getNumArgOperands()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
- uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
- if (c_vn != cd_vn) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- }
-
- uint32_t v = lookupOrAdd(local_cdep);
- valueNumbering[C] = v;
- return v;
- }
-
- // Non-local case.
- const MemoryDependenceResults::NonLocalDepInfo &deps =
- MD->getNonLocalCallDependency(C);
- // FIXME: Move the checking logic to MemDep!
- CallInst* cdep = nullptr;
-
- // Check to see if we have a single dominating call instruction that is
- // identical to C.
- for (unsigned i = 0, e = deps.size(); i != e; ++i) {
- const NonLocalDepEntry *I = &deps[i];
- if (I->getResult().isNonLocal())
- continue;
-
- // We don't handle non-definitions. If we already have a call, reject
- // instruction dependencies.
- if (!I->getResult().isDef() || cdep != nullptr) {
- cdep = nullptr;
- break;
- }
-
- CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
- // FIXME: All duplicated with non-local case.
- if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
- cdep = NonLocalDepCall;
- continue;
- }
-
- cdep = nullptr;
- break;
- }
-
- if (!cdep) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-
- if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
- uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
- uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
- if (c_vn != cd_vn) {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
- }
-
- uint32_t v = lookupOrAdd(cdep);
- valueNumbering[C] = v;
- return v;
- } else {
- valueNumbering[C] = nextValueNumber;
- return nextValueNumber++;
- }
-}
-
-/// Returns true if a value number exists for the specified value.
-bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
-
-/// lookup_or_add - Returns the value number for the specified value, assigning
-/// it a new number if it did not have one before.
-uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
- DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
- if (VI != valueNumbering.end())
- return VI->second;
-
- if (!isa<Instruction>(V)) {
- valueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- Instruction* I = cast<Instruction>(V);
- Expression exp;
- switch (I->getOpcode()) {
- case Instruction::Call:
- return lookupOrAddCall(cast<CallInst>(I));
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::AddrSpaceCast:
- case Instruction::BitCast:
- case Instruction::Select:
- case Instruction::Freeze:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::ShuffleVector:
- case Instruction::InsertValue:
- case Instruction::GetElementPtr:
- exp = createExpr(I);
- break;
- case Instruction::ExtractValue:
- exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
- break;
- case Instruction::PHI:
- valueNumbering[V] = nextValueNumber;
- NumberingPhi[nextValueNumber] = cast<PHINode>(V);
- return nextValueNumber++;
- default:
- valueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- uint32_t e = assignExpNewValueNum(exp).first;
- valueNumbering[V] = e;
- return e;
-}
-
-/// Returns the value number of the specified value. Fails if
-/// the value has not yet been numbered.
-uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
- DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
- if (Verify) {
- assert(VI != valueNumbering.end() && "Value not numbered?");
- return VI->second;
- }
- return (VI != valueNumbering.end()) ? VI->second : 0;
-}
-
-/// Returns the value number of the given comparison,
-/// assigning it a new number if it did not have one before. Useful when
-/// we deduced the result of a comparison, but don't immediately have an
-/// instruction realizing that comparison to hand.
-uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
- CmpInst::Predicate Predicate,
- Value *LHS, Value *RHS) {
- Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
- return assignExpNewValueNum(exp).first;
-}
-
-/// Remove all entries from the ValueTable.
-void GVN::ValueTable::clear() {
- valueNumbering.clear();
- expressionNumbering.clear();
- NumberingPhi.clear();
- PhiTranslateTable.clear();
- nextValueNumber = 1;
- Expressions.clear();
- ExprIdx.clear();
- nextExprNumber = 0;
-}
-
-/// Remove a value from the value numbering.
-void GVN::ValueTable::erase(Value *V) {
- uint32_t Num = valueNumbering.lookup(V);
- valueNumbering.erase(V);
- // If V is PHINode, V <--> value number is an one-to-one mapping.
- if (isa<PHINode>(V))
- NumberingPhi.erase(Num);
-}
-
-/// verifyRemoved - Verify that the value is removed from all internal data
-/// structures.
-void GVN::ValueTable::verifyRemoved(const Value *V) const {
- for (DenseMap<Value*, uint32_t>::const_iterator
- I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
- assert(I->first != V && "Inst still occurs in value numbering map!");
- }
-}
-
-//===----------------------------------------------------------------------===//
-// GVN Pass
-//===----------------------------------------------------------------------===//
-
-bool GVN::isPREEnabled() const {
- return Options.AllowPRE.getValueOr(GVNEnablePRE);
-}
-
-bool GVN::isLoadPREEnabled() const {
- return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
-}
-
-bool GVN::isLoadInLoopPREEnabled() const {
- return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
-}
-
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(local_cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(local_cdep);
+ valueNumbering[C] = v;
+ return v;
+ }
+
+ // Non-local case.
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(C);
+ // FIXME: Move the checking logic to MemDep!
+ CallInst* cdep = nullptr;
+
+ // Check to see if we have a single dominating call instruction that is
+ // identical to C.
+ for (unsigned i = 0, e = deps.size(); i != e; ++i) {
+ const NonLocalDepEntry *I = &deps[i];
+ if (I->getResult().isNonLocal())
+ continue;
+
+ // We don't handle non-definitions. If we already have a call, reject
+ // instruction dependencies.
+ if (!I->getResult().isDef() || cdep != nullptr) {
+ cdep = nullptr;
+ break;
+ }
+
+ CallInst *NonLocalDepCall = dyn_cast<CallInst>(I->getResult().getInst());
+ // FIXME: All duplicated with non-local case.
+ if (NonLocalDepCall && DT->properlyDominates(I->getBB(), C->getParent())){
+ cdep = NonLocalDepCall;
+ continue;
+ }
+
+ cdep = nullptr;
+ break;
+ }
+
+ if (!cdep) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ if (cdep->getNumArgOperands() != C->getNumArgOperands()) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ for (unsigned i = 0, e = C->getNumArgOperands(); i < e; ++i) {
+ uint32_t c_vn = lookupOrAdd(C->getArgOperand(i));
+ uint32_t cd_vn = lookupOrAdd(cdep->getArgOperand(i));
+ if (c_vn != cd_vn) {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+ }
+
+ uint32_t v = lookupOrAdd(cdep);
+ valueNumbering[C] = v;
+ return v;
+ } else {
+ valueNumbering[C] = nextValueNumber;
+ return nextValueNumber++;
+ }
+}
+
+/// Returns true if a value number exists for the specified value.
+bool GVN::ValueTable::exists(Value *V) const { return valueNumbering.count(V) != 0; }
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
+ DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+ if (VI != valueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction* I = cast<Instruction>(V);
+ Expression exp;
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ return lookupOrAddCall(cast<CallInst>(I));
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::BitCast:
+ case Instruction::Select:
+ case Instruction::Freeze:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ case Instruction::ExtractValue:
+ exp = createExtractvalueExpr(cast<ExtractValueInst>(I));
+ break;
+ case Instruction::PHI:
+ valueNumbering[V] = nextValueNumber;
+ NumberingPhi[nextValueNumber] = cast<PHINode>(V);
+ return nextValueNumber++;
+ default:
+ valueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = assignExpNewValueNum(exp).first;
+ valueNumbering[V] = e;
+ return e;
+}
+
+/// Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t GVN::ValueTable::lookup(Value *V, bool Verify) const {
+ DenseMap<Value*, uint32_t>::const_iterator VI = valueNumbering.find(V);
+ if (Verify) {
+ assert(VI != valueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+ return (VI != valueNumbering.end()) ? VI->second : 0;
+}
+
+/// Returns the value number of the given comparison,
+/// assigning it a new number if it did not have one before. Useful when
+/// we deduced the result of a comparison, but don't immediately have an
+/// instruction realizing that comparison to hand.
+uint32_t GVN::ValueTable::lookupOrAddCmp(unsigned Opcode,
+ CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS) {
+ Expression exp = createCmpExpr(Opcode, Predicate, LHS, RHS);
+ return assignExpNewValueNum(exp).first;
+}
+
+/// Remove all entries from the ValueTable.
+void GVN::ValueTable::clear() {
+ valueNumbering.clear();
+ expressionNumbering.clear();
+ NumberingPhi.clear();
+ PhiTranslateTable.clear();
+ nextValueNumber = 1;
+ Expressions.clear();
+ ExprIdx.clear();
+ nextExprNumber = 0;
+}
+
+/// Remove a value from the value numbering.
+void GVN::ValueTable::erase(Value *V) {
+ uint32_t Num = valueNumbering.lookup(V);
+ valueNumbering.erase(V);
+  // If V is a PHINode, V <--> value number is a one-to-one mapping.
+ if (isa<PHINode>(V))
+ NumberingPhi.erase(Num);
+}
+
+/// verifyRemoved - Verify that the value is removed from all internal data
+/// structures.
+void GVN::ValueTable::verifyRemoved(const Value *V) const {
+ for (DenseMap<Value*, uint32_t>::const_iterator
+ I = valueNumbering.begin(), E = valueNumbering.end(); I != E; ++I) {
+ assert(I->first != V && "Inst still occurs in value numbering map!");
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+bool GVN::isPREEnabled() const {
+ return Options.AllowPRE.getValueOr(GVNEnablePRE);
+}
+
+bool GVN::isLoadPREEnabled() const {
+ return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
+}
+
+bool GVN::isLoadInLoopPREEnabled() const {
+ return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
+}
+
bool GVN::isLoadPRESplitBackedgeEnabled() const {
return Options.AllowLoadPRESplitBackedge.getValueOr(
GVNEnableSplitBackedgeInLoadPRE);
}
-bool GVN::isMemDepEnabled() const {
- return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
-}
-
-PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
- // FIXME: The order of evaluation of these 'getResult' calls is very
- // significant! Re-ordering these variables will cause GVN when run alone to
- // be less effective! We should fix memdep and basic-aa to not exhibit this
- // behavior, but until then don't change the order here.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto *MemDep =
- isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+bool GVN::isMemDepEnabled() const {
+ return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
+}
+
+PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
+ // FIXME: The order of evaluation of these 'getResult' calls is very
+ // significant! Re-ordering these variables will cause GVN when run alone to
+ // be less effective! We should fix memdep and basic-aa to not exhibit this
+ // behavior, but until then don't change the order here.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto *MemDep =
+ isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *MSSA = AM.getCachedResult<MemorySSAAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE,
MSSA ? &MSSA->getMSSA() : nullptr);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- PA.preserve<TargetLibraryAnalysis>();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<TargetLibraryAnalysis>();
if (MSSA)
PA.preserve<MemorySSAAnalysis>();
- if (LI)
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
- errs() << "{\n";
- for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
- E = d.end(); I != E; ++I) {
- errs() << I->first << "\n";
- I->second->dump();
- }
- errs() << "}\n";
-}
-#endif
-
+ if (LI)
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
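The isPREEnabled/isLoadPREEnabled/isMemDepEnabled accessors consult GVNOptions before falling back to the cl::opt defaults above, so a new-PM user can pin the behaviour per pass instance. A tentative sketch, assuming the GVNOptions setters declared in the LLVM 12 GVN.h header:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/GVN.h"
using namespace llvm;

// Add a GVN pass with PRE, load PRE and MemDep forced on, overriding the
// -enable-pre / -enable-load-pre / -enable-gvn-memdep command-line defaults.
void addTunedGVN(FunctionPassManager &FPM) {
  GVNOptions Opts;
  Opts.setPRE(true).setLoadPRE(true).setMemDep(true);
  FPM.addPass(GVN(Opts));
}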
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void GVN::dump(DenseMap<uint32_t, Value*>& d) const {
+ errs() << "{\n";
+ for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
+ E = d.end(); I != E; ++I) {
+ errs() << I->first << "\n";
+ I->second->dump();
+ }
+ errs() << "}\n";
+}
+#endif
+
enum class AvailabilityState : char {
/// We know the block *is not* fully available. This is a fixpoint.
Unavailable = 0,
@@ -702,29 +702,29 @@ enum class AvailabilityState : char {
SpeculativelyAvailable = 2,
};
-/// Return true if we can prove that the value
-/// we're analyzing is fully available in the specified block. As we go, keep
-/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
-/// map is actually a tri-state map with the following values:
-/// 0) we know the block *is not* fully available.
-/// 1) we know the block *is* fully available.
-/// 2) we do not know whether the block is fully available or not, but we are
-/// currently speculating that it will be.
+/// Return true if we can prove that the value
+/// we're analyzing is fully available in the specified block. As we go, keep
+/// track of which blocks we know are fully alive in FullyAvailableBlocks. This
+/// map is actually a tri-state map with the following values:
+/// 0) we know the block *is not* fully available.
+/// 1) we know the block *is* fully available.
+/// 2) we do not know whether the block is fully available or not, but we are
+/// currently speculating that it will be.
static bool IsValueFullyAvailableInBlock(
BasicBlock *BB,
DenseMap<BasicBlock *, AvailabilityState> &FullyAvailableBlocks) {
SmallVector<BasicBlock *, 32> Worklist;
Optional<BasicBlock *> UnavailableBB;
-
+
// The number of times we didn't find an entry for a block in a map and
// optimistically inserted an entry marking block as speculatively available.
unsigned NumNewNewSpeculativelyAvailableBBs = 0;
-
+
#ifndef NDEBUG
SmallSet<BasicBlock *, 32> NewSpeculativelyAvailableBBs;
SmallVector<BasicBlock *, 32> AvailableBBs;
#endif
-
+
Worklist.emplace_back(BB);
while (!Worklist.empty()) {
BasicBlock *CurrBB = Worklist.pop_back_val(); // LIFO - depth-first!
@@ -734,24 +734,24 @@ static bool IsValueFullyAvailableInBlock(
FullyAvailableBlocks.try_emplace(
CurrBB, AvailabilityState::SpeculativelyAvailable);
AvailabilityState &State = IV.first->second;
-
+
// Did the entry already exist for this block?
if (!IV.second) {
if (State == AvailabilityState::Unavailable) {
UnavailableBB = CurrBB;
break; // Backpropagate unavailability info.
}
-
+
#ifndef NDEBUG
AvailableBBs.emplace_back(CurrBB);
#endif
continue; // Don't recurse further, but continue processing worklist.
}
-
+
// No entry found for block.
++NumNewNewSpeculativelyAvailableBBs;
bool OutOfBudget = NumNewNewSpeculativelyAvailableBBs > MaxBBSpeculations;
-
+
// If we have exhausted our budget, mark this block as unavailable.
// Also, if this block has no predecessors, the value isn't live-in here.
if (OutOfBudget || pred_empty(CurrBB)) {
@@ -760,20 +760,20 @@ static bool IsValueFullyAvailableInBlock(
UnavailableBB = CurrBB;
break; // Backpropagate unavailability info.
}
-
+
// Tentatively consider this block as speculatively available.
#ifndef NDEBUG
NewSpeculativelyAvailableBBs.insert(CurrBB);
#endif
// And further recurse into block's predecessors, in depth-first order!
Worklist.append(pred_begin(CurrBB), pred_end(CurrBB));
- }
-
+ }
+
#if LLVM_ENABLE_STATS
IsValueFullyAvailableInBlockNumSpeculationsMax.updateMax(
NumNewNewSpeculativelyAvailableBBs);
#endif
-
+
// If the block isn't marked as fixpoint yet
// (the Unavailable and Available states are fixpoints)
auto MarkAsFixpointAndEnqueueSuccessors =
@@ -796,7 +796,7 @@ static bool IsValueFullyAvailableInBlock(
return;
}
};
-
+
if (UnavailableBB) {
// Okay, we have encountered an unavailable block.
// Mark speculatively available blocks reachable from UnavailableBB as
@@ -808,7 +808,7 @@ static bool IsValueFullyAvailableInBlock(
MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
AvailabilityState::Unavailable);
}
-
+
#ifndef NDEBUG
Worklist.clear();
for (BasicBlock *AvailableBB : AvailableBBs)
@@ -816,418 +816,418 @@ static bool IsValueFullyAvailableInBlock(
while (!Worklist.empty())
MarkAsFixpointAndEnqueueSuccessors(Worklist.pop_back_val(),
AvailabilityState::Available);
-
+
assert(NewSpeculativelyAvailableBBs.empty() &&
"Must have fixed all the new speculatively available blocks.");
#endif
return !UnavailableBB;
-}
-
-/// Given a set of loads specified by ValuesPerBlock,
-/// construct SSA form, allowing us to eliminate LI. This returns the value
-/// that should be used at LI's definition site.
-static Value *ConstructSSAForLoadSet(LoadInst *LI,
- SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
- GVN &gvn) {
- // Check for the fully redundant, dominating load case. In this case, we can
- // just use the dominating value directly.
- if (ValuesPerBlock.size() == 1 &&
- gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
- LI->getParent())) {
- assert(!ValuesPerBlock[0].AV.isUndefValue() &&
- "Dead BB dominate this block");
- return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
- }
-
- // Otherwise, we have to construct SSA form.
- SmallVector<PHINode*, 8> NewPHIs;
- SSAUpdater SSAUpdate(&NewPHIs);
- SSAUpdate.Initialize(LI->getType(), LI->getName());
-
- for (const AvailableValueInBlock &AV : ValuesPerBlock) {
- BasicBlock *BB = AV.BB;
-
- if (SSAUpdate.HasValueForBlock(BB))
- continue;
-
- // If the value is the load that we will be eliminating, and the block it's
- // available in is the block that the load is in, then don't add it as
- // SSAUpdater will resolve the value to the relevant phi which may let it
- // avoid phi construction entirely if there's actually only one value.
- if (BB == LI->getParent() &&
- ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
- (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
- continue;
-
- SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
- }
-
- // Perform PHI construction.
- return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
-}
-
-Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
- Instruction *InsertPt,
- GVN &gvn) const {
- Value *Res;
- Type *LoadTy = LI->getType();
- const DataLayout &DL = LI->getModule()->getDataLayout();
- if (isSimpleValue()) {
- Res = getSimpleValue();
- if (Res->getType() != LoadTy) {
- Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
-
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
- << " " << *getSimpleValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- }
- } else if (isCoercedLoadValue()) {
- LoadInst *Load = getCoercedLoadValue();
- if (Load->getType() == LoadTy && Offset == 0) {
- Res = Load;
- } else {
- Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
- // We would like to use gvn.markInstructionForDeletion here, but we can't
- // because the load is already memoized into the leader map table that GVN
- // tracks. It is potentially possible to remove the load from the table,
- // but then there all of the operations based on it would need to be
- // rehashed. Just leave the dead load around.
- gvn.getMemDep().removeInstruction(Load);
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
- << " " << *getCoercedLoadValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- }
- } else if (isMemIntrinValue()) {
- Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
- InsertPt, DL);
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
- << " " << *getMemIntrinValue() << '\n'
- << *Res << '\n'
- << "\n\n\n");
- } else {
- assert(isUndefValue() && "Should be UndefVal");
- LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
- return UndefValue::get(LoadTy);
- }
- assert(Res && "failed to materialize?");
- return Res;
-}
-
-static bool isLifetimeStart(const Instruction *Inst) {
- if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
- return II->getIntrinsicID() == Intrinsic::lifetime_start;
- return false;
-}
-
-/// Try to locate the three instruction involved in a missed
-/// load-elimination case that is due to an intervening store.
-static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
- DominatorTree *DT,
- OptimizationRemarkEmitter *ORE) {
- using namespace ore;
-
- User *OtherAccess = nullptr;
-
- OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
- R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
- << setExtraArgs();
-
- for (auto *U : LI->getPointerOperand()->users())
- if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
- DT->dominates(cast<Instruction>(U), LI)) {
- // FIXME: for now give up if there are multiple memory accesses that
- // dominate the load. We need further analysis to decide which one is
- // that we're forwarding from.
- if (OtherAccess)
- OtherAccess = nullptr;
- else
- OtherAccess = U;
- }
-
- if (OtherAccess)
- R << " in favor of " << NV("OtherAccess", OtherAccess);
-
- R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
-
- ORE->emit(R);
-}
-
-bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
- Value *Address, AvailableValue &Res) {
- assert((DepInfo.isDef() || DepInfo.isClobber()) &&
- "expected a local dependence");
- assert(LI->isUnordered() && "rules below are incorrect for ordered access");
-
- const DataLayout &DL = LI->getModule()->getDataLayout();
-
- Instruction *DepInst = DepInfo.getInst();
- if (DepInfo.isClobber()) {
- // If the dependence is to a store that writes to a superset of the bits
- // read by the load, we can extract the bits we need for the load from the
- // stored value.
- if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
- int Offset =
- analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
- if (Offset != -1) {
- Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
- return true;
- }
- }
- }
-
- // Check to see if we have something like this:
- // load i32* P
- // load i8* (P+1)
- // if we have this, replace the later with an extraction from the former.
- if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
- // If this is a clobber and L is the first instruction in its block, then
- // we have the first instruction in the entry block.
- // Can't forward from non-atomic to atomic without violating memory model.
- if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
- int Offset =
- analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
-
- if (Offset != -1) {
- Res = AvailableValue::getLoad(DepLI, Offset);
- return true;
- }
- }
- }
-
- // If the clobbering value is a memset/memcpy/memmove, see if we can
- // forward a value on from it.
- if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
- if (Address && !LI->isAtomic()) {
- int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
- DepMI, DL);
- if (Offset != -1) {
- Res = AvailableValue::getMI(DepMI, Offset);
- return true;
- }
- }
- }
- // Nothing known about this clobber, have to be conservative
- LLVM_DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
- dbgs() << " is clobbered by " << *DepInst << '\n';);
- if (ORE->allowExtraAnalysis(DEBUG_TYPE))
- reportMayClobberedLoad(LI, DepInfo, DT, ORE);
-
- return false;
- }
- assert(DepInfo.isDef() && "follows from above");
-
- // Loading the allocation -> undef.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI) ||
- // Loading immediately after lifetime begin -> undef.
- isLifetimeStart(DepInst)) {
- Res = AvailableValue::get(UndefValue::get(LI->getType()));
- return true;
- }
-
- // Loading from calloc (which zero initializes memory) -> zero
- if (isCallocLikeFn(DepInst, TLI)) {
- Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
- return true;
- }
-
- if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
- // Reject loads and stores that are to the same address but are of
+}
+
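// A minimal, self-contained sketch (plain C++, not LLVM API code) of the
// availability fixed-point idea used by IsValueFullyAvailableInBlock above:
// walk backwards through predecessors, optimistically marking visited blocks
// as speculatively available, and fall back to Unavailable once a path with
// no definition is found. Block numbers and the helper name are illustrative
// only; the real pass additionally caps speculation and fixes speculative
// entries back to a final Available/Unavailable state afterwards.
#include <cstdio>
#include <map>
#include <vector>

enum class Avail { Unavailable, Available, Speculative };

static bool fullyAvailable(int BB, const std::map<int, std::vector<int>> &Preds,
                           std::map<int, Avail> &State) {
  auto It = State.find(BB);
  if (It != State.end())
    return It->second != Avail::Unavailable;
  State[BB] = Avail::Speculative; // optimistic assumption for cycles
  auto PI = Preds.find(BB);
  if (PI == Preds.end() || PI->second.empty()) {
    State[BB] = Avail::Unavailable; // reached entry without a definition
    return false;
  }
  for (int Pred : PI->second)
    if (!fullyAvailable(Pred, Preds, State)) {
      State[BB] = Avail::Unavailable;
      return false;
    }
  State[BB] = Avail::Available;
  return true;
}

int main() {
  // Diamond CFG: 0 -> {1, 2} -> 3; the value is available in blocks 1 and 2,
  // so it is fully available on every path into block 3.
  std::map<int, std::vector<int>> Preds = {{1, {0}}, {2, {0}}, {3, {1, 2}}};
  std::map<int, Avail> State = {{1, Avail::Available}, {2, Avail::Available}};
  std::printf("block 3 fully available: %d\n",
              fullyAvailable(3, Preds, State) ? 1 : 0);
}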
+/// Given a set of loads specified by ValuesPerBlock,
+/// construct SSA form, allowing us to eliminate LI. This returns the value
+/// that should be used at LI's definition site.
+static Value *ConstructSSAForLoadSet(LoadInst *LI,
+ SmallVectorImpl<AvailableValueInBlock> &ValuesPerBlock,
+ GVN &gvn) {
+ // Check for the fully redundant, dominating load case. In this case, we can
+ // just use the dominating value directly.
+ if (ValuesPerBlock.size() == 1 &&
+ gvn.getDominatorTree().properlyDominates(ValuesPerBlock[0].BB,
+ LI->getParent())) {
+ assert(!ValuesPerBlock[0].AV.isUndefValue() &&
+ "Dead BB dominate this block");
+ return ValuesPerBlock[0].MaterializeAdjustedValue(LI, gvn);
+ }
+
+ // Otherwise, we have to construct SSA form.
+ SmallVector<PHINode*, 8> NewPHIs;
+ SSAUpdater SSAUpdate(&NewPHIs);
+ SSAUpdate.Initialize(LI->getType(), LI->getName());
+
+ for (const AvailableValueInBlock &AV : ValuesPerBlock) {
+ BasicBlock *BB = AV.BB;
+
+ if (SSAUpdate.HasValueForBlock(BB))
+ continue;
+
+ // If the value is the load that we will be eliminating, and the block it's
+ // available in is the block that the load is in, then don't add it as
+ // SSAUpdater will resolve the value to the relevant phi which may let it
+ // avoid phi construction entirely if there's actually only one value.
+ if (BB == LI->getParent() &&
+ ((AV.AV.isSimpleValue() && AV.AV.getSimpleValue() == LI) ||
+ (AV.AV.isCoercedLoadValue() && AV.AV.getCoercedLoadValue() == LI)))
+ continue;
+
+ SSAUpdate.AddAvailableValue(BB, AV.MaterializeAdjustedValue(LI, gvn));
+ }
+
+ // Perform PHI construction.
+ return SSAUpdate.GetValueInMiddleOfBlock(LI->getParent());
+}
+
+Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
+ Instruction *InsertPt,
+ GVN &gvn) const {
+ Value *Res;
+ Type *LoadTy = LI->getType();
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ if (isSimpleValue()) {
+ Res = getSimpleValue();
+ if (Res->getType() != LoadTy) {
+ Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
+ << " " << *getSimpleValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isCoercedLoadValue()) {
+ LoadInst *Load = getCoercedLoadValue();
+ if (Load->getType() == LoadTy && Offset == 0) {
+ Res = Load;
+ } else {
+ Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+ // We would like to use gvn.markInstructionForDeletion here, but we can't
+ // because the load is already memoized into the leader map table that GVN
+ // tracks. It is potentially possible to remove the load from the table,
+ // but then all of the operations based on it would need to be
+ // rehashed. Just leave the dead load around.
+ gvn.getMemDep().removeInstruction(Load);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset
+ << " " << *getCoercedLoadValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ }
+ } else if (isMemIntrinValue()) {
+ Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+ InsertPt, DL);
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
+ << " " << *getMemIntrinValue() << '\n'
+ << *Res << '\n'
+ << "\n\n\n");
+ } else {
+ assert(isUndefValue() && "Should be UndefVal");
+ LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL Undef:\n";);
+ return UndefValue::get(LoadTy);
+ }
+ assert(Res && "failed to materialize?");
+ return Res;
+}
+
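// A standalone sketch of the coercion that MaterializeAdjustedValue relies on
// (getStoreValueForLoad / getLoadValueForLoad): when a wider store covers a
// narrower load at a known byte offset, the loaded bits can be carved out of
// the stored value instead of re-reading memory. Little-endian layout is
// assumed here and the helper name is illustrative; the real helpers consult
// the DataLayout and also handle pointers and vectors.
#include <cstdint>
#include <cstdio>

// Extract LoadBytes bytes of Stored starting at byte Offset (little-endian).
static uint64_t extractAtOffset(uint64_t Stored, unsigned Offset,
                                unsigned LoadBytes) {
  uint64_t Shifted = Stored >> (8 * Offset);
  uint64_t Mask = LoadBytes >= 8 ? ~0ULL : ((1ULL << (8 * LoadBytes)) - 1);
  return Shifted & Mask;
}

int main() {
  // store i32 0x11223344 to P; a later "load i8 from (P+1)" forwards 0x33.
  std::printf("forwarded byte: 0x%llx\n",
              (unsigned long long)extractAtOffset(0x11223344u, 1, 1));
}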
+static bool isLifetimeStart(const Instruction *Inst) {
+ if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst))
+ return II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return false;
+}
+
+/// Try to locate the three instructions involved in a missed
+/// load-elimination case that is due to an intervening store.
+static void reportMayClobberedLoad(LoadInst *LI, MemDepResult DepInfo,
+ DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ User *OtherAccess = nullptr;
+
+ OptimizationRemarkMissed R(DEBUG_TYPE, "LoadClobbered", LI);
+ R << "load of type " << NV("Type", LI->getType()) << " not eliminated"
+ << setExtraArgs();
+
+ for (auto *U : LI->getPointerOperand()->users())
+ if (U != LI && (isa<LoadInst>(U) || isa<StoreInst>(U)) &&
+ DT->dominates(cast<Instruction>(U), LI)) {
+ // FIXME: for now give up if there are multiple memory accesses that
+ // dominate the load. We need further analysis to decide which one
+ // we're forwarding from.
+ if (OtherAccess)
+ OtherAccess = nullptr;
+ else
+ OtherAccess = U;
+ }
+
+ if (OtherAccess)
+ R << " in favor of " << NV("OtherAccess", OtherAccess);
+
+ R << " because it is clobbered by " << NV("ClobberedBy", DepInfo.getInst());
+
+ ORE->emit(R);
+}
+
+bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
+ Value *Address, AvailableValue &Res) {
+ assert((DepInfo.isDef() || DepInfo.isClobber()) &&
+ "expected a local dependence");
+ assert(LI->isUnordered() && "rules below are incorrect for ordered access");
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ Instruction *DepInst = DepInfo.getInst();
+ if (DepInfo.isClobber()) {
+ // If the dependence is to a store that writes to a superset of the bits
+ // read by the load, we can extract the bits we need for the load from the
+ // stored value.
+ if (StoreInst *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
+ return true;
+ }
+ }
+ }
+
+ // Check to see if we have something like this:
+ // load i32* P
+ // load i8* (P+1)
+ // if we have this, replace the latter with an extraction from the former.
+ if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // If this is a clobber and L is the first instruction in its block, then
+ // we have the first instruction in the entry block.
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
+ int Offset =
+ analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+
+ if (Offset != -1) {
+ Res = AvailableValue::getLoad(DepLI, Offset);
+ return true;
+ }
+ }
+ }
+
+ // If the clobbering value is a memset/memcpy/memmove, see if we can
+ // forward a value on from it.
+ if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ if (Address && !LI->isAtomic()) {
+ int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
+ DepMI, DL);
+ if (Offset != -1) {
+ Res = AvailableValue::getMI(DepMI, Offset);
+ return true;
+ }
+ }
+ }
+ // Nothing known about this clobber, have to be conservative
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " is clobbered by " << *DepInst << '\n';);
+ if (ORE->allowExtraAnalysis(DEBUG_TYPE))
+ reportMayClobberedLoad(LI, DepInfo, DT, ORE);
+
+ return false;
+ }
+ assert(DepInfo.isDef() && "follows from above");
+
+ // Loading the allocation -> undef.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI) ||
+ // Loading immediately after lifetime begin -> undef.
+ isLifetimeStart(DepInst)) {
+ Res = AvailableValue::get(UndefValue::get(LI->getType()));
+ return true;
+ }
+
+ // Loading from calloc (which zero initializes memory) -> zero
+ if (isCallocLikeFn(DepInst, TLI)) {
+ Res = AvailableValue::get(Constant::getNullValue(LI->getType()));
+ return true;
+ }
+
+ if (StoreInst *S = dyn_cast<StoreInst>(DepInst)) {
+ // Reject loads and stores that are to the same address but are of
// different types if we have to. If the stored value is convertible to
- // the loaded value, we can reuse it.
- if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(),
- DL))
- return false;
-
- // Can't forward from non-atomic to atomic without violating memory model.
- if (S->isAtomic() < LI->isAtomic())
- return false;
-
- Res = AvailableValue::get(S->getValueOperand());
- return true;
- }
-
- if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
- // If the types mismatch and we can't handle it, reject reuse of the load.
- // If the stored value is larger or equal to the loaded value, we can reuse
- // it.
- if (!canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
- return false;
-
- // Can't forward from non-atomic to atomic without violating memory model.
- if (LD->isAtomic() < LI->isAtomic())
- return false;
-
- Res = AvailableValue::getLoad(LD);
- return true;
- }
-
- // Unknown def - must be conservative
- LLVM_DEBUG(
- // fast print dep, using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
- dbgs() << " has unknown def " << *DepInst << '\n';);
- return false;
-}
-
-void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
- AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
- // Filter out useless results (non-locals, etc). Keep track of the blocks
- // where we have a value available in repl, also keep track of whether we see
- // dependencies that produce an unknown value for the load (such as a call
- // that could potentially clobber the load).
- unsigned NumDeps = Deps.size();
- for (unsigned i = 0, e = NumDeps; i != e; ++i) {
- BasicBlock *DepBB = Deps[i].getBB();
- MemDepResult DepInfo = Deps[i].getResult();
-
- if (DeadBlocks.count(DepBB)) {
- // Dead dependent mem-op disguise as a load evaluating the same value
- // as the load in question.
- ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
- continue;
- }
-
- if (!DepInfo.isDef() && !DepInfo.isClobber()) {
- UnavailableBlocks.push_back(DepBB);
- continue;
- }
-
- // The address being loaded in this non-local block may not be the same as
- // the pointer operand of the load if PHI translation occurs. Make sure
- // to consider the right address.
- Value *Address = Deps[i].getAddress();
-
- AvailableValue AV;
- if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
- // subtlety: because we know this was a non-local dependency, we know
- // it's safe to materialize anywhere between the instruction within
- // DepInfo and the end of it's block.
- ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
- std::move(AV)));
- } else {
- UnavailableBlocks.push_back(DepBB);
- }
- }
-
- assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
- "post condition violation");
-}
-
-bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
- UnavailBlkVect &UnavailableBlocks) {
- // Okay, we have *some* definitions of the value. This means that the value
- // is available in some of our (transitive) predecessors. Lets think about
- // doing PRE of this load. This will involve inserting a new load into the
- // predecessor when it's not available. We could do this in general, but
- // prefer to not increase code size. As such, we only do this when we know
- // that we only have to insert *one* load (which means we're basically moving
- // the load, not inserting a new one).
-
- SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
- UnavailableBlocks.end());
-
- // Let's find the first basic block with more than one predecessor. Walk
- // backwards through predecessors if needed.
- BasicBlock *LoadBB = LI->getParent();
- BasicBlock *TmpBB = LoadBB;
-
- // Check that there is no implicit control flow instructions above our load in
- // its block. If there is an instruction that doesn't always pass the
- // execution to the following instruction, then moving through it may become
- // invalid. For example:
- //
- // int arr[LEN];
- // int index = ???;
- // ...
- // guard(0 <= index && index < LEN);
- // use(arr[index]);
- //
- // It is illegal to move the array access to any point above the guard,
- // because if the index is out of bounds we should deoptimize rather than
- // access the array.
- // Check that there is no guard in this block above our instruction.
+ // the loaded value, we can reuse it.
+ if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), LI->getType(),
+ DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (S->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::get(S->getValueOperand());
+ return true;
+ }
+
+ if (LoadInst *LD = dyn_cast<LoadInst>(DepInst)) {
+ // If the types mismatch and we can't handle it, reject reuse of the load.
+ // If the stored value is larger or equal to the loaded value, we can reuse
+ // it.
+ if (!canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+ return false;
+
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LD->isAtomic() < LI->isAtomic())
+ return false;
+
+ Res = AvailableValue::getLoad(LD);
+ return true;
+ }
+
+ // Unknown def - must be conservative
+ LLVM_DEBUG(
+ // fast print dep, using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown def " << *DepInst << '\n';);
+ return false;
+}
+
+void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps,
+ AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Filter out useless results (non-locals, etc). Keep track of the blocks
+ // where we have a value available in repl, also keep track of whether we see
+ // dependencies that produce an unknown value for the load (such as a call
+ // that could potentially clobber the load).
+ unsigned NumDeps = Deps.size();
+ for (unsigned i = 0, e = NumDeps; i != e; ++i) {
+ BasicBlock *DepBB = Deps[i].getBB();
+ MemDepResult DepInfo = Deps[i].getResult();
+
+ if (DeadBlocks.count(DepBB)) {
+ // A dead dependent mem-op masquerades as a load evaluating the same value
+ // as the load in question.
+ ValuesPerBlock.push_back(AvailableValueInBlock::getUndef(DepBB));
+ continue;
+ }
+
+ if (!DepInfo.isDef() && !DepInfo.isClobber()) {
+ UnavailableBlocks.push_back(DepBB);
+ continue;
+ }
+
+ // The address being loaded in this non-local block may not be the same as
+ // the pointer operand of the load if PHI translation occurs. Make sure
+ // to consider the right address.
+ Value *Address = Deps[i].getAddress();
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(LI, DepInfo, Address, AV)) {
+ // subtlety: because we know this was a non-local dependency, we know
+ // it's safe to materialize anywhere between the instruction within
+ // DepInfo and the end of its block.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(DepBB,
+ std::move(AV)));
+ } else {
+ UnavailableBlocks.push_back(DepBB);
+ }
+ }
+
+ assert(NumDeps == ValuesPerBlock.size() + UnavailableBlocks.size() &&
+ "post condition violation");
+}
+
+bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
+ UnavailBlkVect &UnavailableBlocks) {
+ // Okay, we have *some* definitions of the value. This means that the value
+ // is available in some of our (transitive) predecessors. Let's think about
+ // doing PRE of this load. This will involve inserting a new load into the
+ // predecessor when it's not available. We could do this in general, but
+ // prefer to not increase code size. As such, we only do this when we know
+ // that we only have to insert *one* load (which means we're basically moving
+ // the load, not inserting a new one).
+
+ SmallPtrSet<BasicBlock *, 4> Blockers(UnavailableBlocks.begin(),
+ UnavailableBlocks.end());
+
+ // Let's find the first basic block with more than one predecessor. Walk
+ // backwards through predecessors if needed.
+ BasicBlock *LoadBB = LI->getParent();
+ BasicBlock *TmpBB = LoadBB;
+
+ // Check that there are no implicit control flow instructions above our load in
+ // its block. If there is an instruction that doesn't always pass the
+ // execution to the following instruction, then moving through it may become
+ // invalid. For example:
+ //
+ // int arr[LEN];
+ // int index = ???;
+ // ...
+ // guard(0 <= index && index < LEN);
+ // use(arr[index]);
+ //
+ // It is illegal to move the array access to any point above the guard,
+ // because if the index is out of bounds we should deoptimize rather than
+ // access the array.
+ // Check that there is no guard in this block above our instruction.
bool MustEnsureSafetyOfSpeculativeExecution =
ICF->isDominatedByICFIFromSameBlock(LI);
- while (TmpBB->getSinglePredecessor()) {
- TmpBB = TmpBB->getSinglePredecessor();
- if (TmpBB == LoadBB) // Infinite (unreachable) loop.
- return false;
- if (Blockers.count(TmpBB))
- return false;
-
- // If any of these blocks has more than one successor (i.e. if the edge we
- // just traversed was critical), then there are other paths through this
- // block along which the load may not be anticipated. Hoisting the load
- // above this block would be adding the load to execution paths along
- // which it was not previously executed.
- if (TmpBB->getTerminator()->getNumSuccessors() != 1)
- return false;
-
- // Check that there is no implicit control flow in a block above.
+ while (TmpBB->getSinglePredecessor()) {
+ TmpBB = TmpBB->getSinglePredecessor();
+ if (TmpBB == LoadBB) // Infinite (unreachable) loop.
+ return false;
+ if (Blockers.count(TmpBB))
+ return false;
+
+ // If any of these blocks has more than one successor (i.e. if the edge we
+ // just traversed was critical), then there are other paths through this
+ // block along which the load may not be anticipated. Hoisting the load
+ // above this block would be adding the load to execution paths along
+ // which it was not previously executed.
+ if (TmpBB->getTerminator()->getNumSuccessors() != 1)
+ return false;
+
+ // Check that there is no implicit control flow in a block above.
MustEnsureSafetyOfSpeculativeExecution =
MustEnsureSafetyOfSpeculativeExecution || ICF->hasICF(TmpBB);
- }
-
- assert(TmpBB);
- LoadBB = TmpBB;
-
- // Check to see how many predecessors have the loaded value fully
- // available.
- MapVector<BasicBlock *, Value *> PredLoads;
+ }
+
+ assert(TmpBB);
+ LoadBB = TmpBB;
+
+ // Check to see how many predecessors have the loaded value fully
+ // available.
+ MapVector<BasicBlock *, Value *> PredLoads;
DenseMap<BasicBlock *, AvailabilityState> FullyAvailableBlocks;
- for (const AvailableValueInBlock &AV : ValuesPerBlock)
+ for (const AvailableValueInBlock &AV : ValuesPerBlock)
FullyAvailableBlocks[AV.BB] = AvailabilityState::Available;
- for (BasicBlock *UnavailableBB : UnavailableBlocks)
+ for (BasicBlock *UnavailableBB : UnavailableBlocks)
FullyAvailableBlocks[UnavailableBB] = AvailabilityState::Unavailable;
-
- SmallVector<BasicBlock *, 4> CriticalEdgePred;
- for (BasicBlock *Pred : predecessors(LoadBB)) {
- // If any predecessor block is an EH pad that does not allow non-PHI
- // instructions before the terminator, we can't PRE the load.
- if (Pred->getTerminator()->isEHPad()) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
+
+ SmallVector<BasicBlock *, 4> CriticalEdgePred;
+ for (BasicBlock *Pred : predecessors(LoadBB)) {
+ // If any predecessor block is an EH pad that does not allow non-PHI
+ // instructions before the terminator, we can't PRE the load.
+ if (Pred->getTerminator()->isEHPad()) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD PREDECESSOR '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
if (IsValueFullyAvailableInBlock(Pred, FullyAvailableBlocks)) {
- continue;
- }
-
- if (Pred->getTerminator()->getNumSuccessors() != 1) {
- if (isa<IndirectBrInst>(Pred->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
- // FIXME: Can we support the fallthrough edge?
- if (isa<CallBrInst>(Pred->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
- if (LoadBB->isEHPad()) {
- LLVM_DEBUG(
- dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
- << Pred->getName() << "': " << *LI << '\n');
- return false;
- }
-
+ continue;
+ }
+
+ if (Pred->getTerminator()->getNumSuccessors() != 1) {
+ if (isa<IndirectBrInst>(Pred->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF INDBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ // FIXME: Can we support the fallthrough edge?
+ if (isa<CallBrInst>(Pred->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF CALLBR CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
+ if (LoadBB->isEHPad()) {
+ LLVM_DEBUG(
+ dbgs() << "COULD NOT PRE LOAD BECAUSE OF AN EH PAD CRITICAL EDGE '"
+ << Pred->getName() << "': " << *LI << '\n');
+ return false;
+ }
+
// Do not split backedge as it will break the canonical loop form.
if (!isLoadPRESplitBackedgeEnabled())
if (DT->dominates(LoadBB, Pred)) {
@@ -1238,25 +1238,25 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- CriticalEdgePred.push_back(Pred);
- } else {
- // Only add the predecessors that will not be split for now.
- PredLoads[Pred] = nullptr;
- }
- }
-
- // Decide whether PRE is profitable for this load.
- unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
- assert(NumUnavailablePreds != 0 &&
- "Fully available value should already be eliminated!");
-
- // If this load is unavailable in multiple predecessors, reject it.
- // FIXME: If we could restructure the CFG, we could make a common pred with
- // all the preds that don't have an available LI and insert a new load into
- // that one block.
- if (NumUnavailablePreds != 1)
- return false;
-
+ CriticalEdgePred.push_back(Pred);
+ } else {
+ // Only add the predecessors that will not be split for now.
+ PredLoads[Pred] = nullptr;
+ }
+ }
+
+ // Decide whether PRE is profitable for this load.
+ unsigned NumUnavailablePreds = PredLoads.size() + CriticalEdgePred.size();
+ assert(NumUnavailablePreds != 0 &&
+ "Fully available value should already be eliminated!");
+
+ // If this load is unavailable in multiple predecessors, reject it.
+ // FIXME: If we could restructure the CFG, we could make a common pred with
+ // all the preds that don't have an available LI and insert a new load into
+ // that one block.
+ if (NumUnavailablePreds != 1)
+ return false;
+
// Now we know where we will insert load. We must ensure that it is safe
// to speculatively execute the load at that points.
if (MustEnsureSafetyOfSpeculativeExecution) {
@@ -1268,105 +1268,105 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
return false;
}
- // Split critical edges, and update the unavailable predecessors accordingly.
- for (BasicBlock *OrigPred : CriticalEdgePred) {
- BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
- assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
- PredLoads[NewPred] = nullptr;
- LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
- << LoadBB->getName() << '\n');
- }
-
- // Check if the load can safely be moved to all the unavailable predecessors.
- bool CanDoPRE = true;
- const DataLayout &DL = LI->getModule()->getDataLayout();
- SmallVector<Instruction*, 8> NewInsts;
- for (auto &PredLoad : PredLoads) {
- BasicBlock *UnavailablePred = PredLoad.first;
-
- // Do PHI translation to get its value in the predecessor if necessary. The
- // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
- // We do the translation for each edge we skipped by going from LI's block
- // to LoadBB, otherwise we might miss pieces needing translation.
-
- // If all preds have a single successor, then we know it is safe to insert
- // the load on the pred (?!?), so we can insert code to materialize the
- // pointer if it is not available.
- Value *LoadPtr = LI->getPointerOperand();
- BasicBlock *Cur = LI->getParent();
- while (Cur != LoadBB) {
- PHITransAddr Address(LoadPtr, DL, AC);
- LoadPtr = Address.PHITranslateWithInsertion(
- Cur, Cur->getSinglePredecessor(), *DT, NewInsts);
- if (!LoadPtr) {
- CanDoPRE = false;
- break;
- }
- Cur = Cur->getSinglePredecessor();
- }
-
- if (LoadPtr) {
- PHITransAddr Address(LoadPtr, DL, AC);
- LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT,
- NewInsts);
- }
- // If we couldn't find or insert a computation of this phi translated value,
- // we fail PRE.
- if (!LoadPtr) {
- LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
- << *LI->getPointerOperand() << "\n");
- CanDoPRE = false;
- break;
- }
-
- PredLoad.second = LoadPtr;
- }
-
- if (!CanDoPRE) {
- while (!NewInsts.empty()) {
- // Erase instructions generated by the failed PHI translation before
- // trying to number them. PHI translation might insert instructions
- // in basic blocks other than the current one, and we delete them
- // directly, as markInstructionForDeletion only allows removing from the
- // current basic block.
- NewInsts.pop_back_val()->eraseFromParent();
- }
- // HINT: Don't revert the edge-splitting as following transformation may
- // also need to split these critical edges.
- return !CriticalEdgePred.empty();
- }
-
- // Okay, we can eliminate this load by inserting a reload in the predecessor
- // and using PHI construction to get the value in the other predecessors, do
- // it.
- LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
- LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
- << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
- << '\n');
-
- // Assign value numbers to the new instructions.
- for (Instruction *I : NewInsts) {
- // Instructions that have been inserted in predecessor(s) to materialize
- // the load address do not retain their original debug locations. Doing
- // so could lead to confusing (but correct) source attributions.
+ // Split critical edges, and update the unavailable predecessors accordingly.
+ for (BasicBlock *OrigPred : CriticalEdgePred) {
+ BasicBlock *NewPred = splitCriticalEdges(OrigPred, LoadBB);
+ assert(!PredLoads.count(OrigPred) && "Split edges shouldn't be in map!");
+ PredLoads[NewPred] = nullptr;
+ LLVM_DEBUG(dbgs() << "Split critical edge " << OrigPred->getName() << "->"
+ << LoadBB->getName() << '\n');
+ }
+
+ // Check if the load can safely be moved to all the unavailable predecessors.
+ bool CanDoPRE = true;
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ SmallVector<Instruction*, 8> NewInsts;
+ for (auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+
+ // Do PHI translation to get its value in the predecessor if necessary. The
+ // returned pointer (if non-null) is guaranteed to dominate UnavailablePred.
+ // We do the translation for each edge we skipped by going from LI's block
+ // to LoadBB, otherwise we might miss pieces needing translation.
+
+ // If all preds have a single successor, then we know it is safe to insert
+ // the load on the pred (?!?), so we can insert code to materialize the
+ // pointer if it is not available.
+ Value *LoadPtr = LI->getPointerOperand();
+ BasicBlock *Cur = LI->getParent();
+ while (Cur != LoadBB) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(
+ Cur, Cur->getSinglePredecessor(), *DT, NewInsts);
+ if (!LoadPtr) {
+ CanDoPRE = false;
+ break;
+ }
+ Cur = Cur->getSinglePredecessor();
+ }
+
+ if (LoadPtr) {
+ PHITransAddr Address(LoadPtr, DL, AC);
+ LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, *DT,
+ NewInsts);
+ }
+ // If we couldn't find or insert a computation of this phi translated value,
+ // we fail PRE.
+ if (!LoadPtr) {
+ LLVM_DEBUG(dbgs() << "COULDN'T INSERT PHI TRANSLATED VALUE OF: "
+ << *LI->getPointerOperand() << "\n");
+ CanDoPRE = false;
+ break;
+ }
+
+ PredLoad.second = LoadPtr;
+ }
+
+ if (!CanDoPRE) {
+ while (!NewInsts.empty()) {
+ // Erase instructions generated by the failed PHI translation before
+ // trying to number them. PHI translation might insert instructions
+ // in basic blocks other than the current one, and we delete them
+ // directly, as markInstructionForDeletion only allows removing from the
+ // current basic block.
+ NewInsts.pop_back_val()->eraseFromParent();
+ }
+ // HINT: Don't revert the edge-splitting, as the following transformation may
+ // also need to split these critical edges.
+ return !CriticalEdgePred.empty();
+ }
+
+ // Okay, we can eliminate this load by inserting a reload in the predecessor
+ // and using PHI construction to get the value in the other predecessors, do
+ // it.
+ LLVM_DEBUG(dbgs() << "GVN REMOVING PRE LOAD: " << *LI << '\n');
+ LLVM_DEBUG(if (!NewInsts.empty()) dbgs()
+ << "INSERTED " << NewInsts.size() << " INSTS: " << *NewInsts.back()
+ << '\n');
+
+ // Assign value numbers to the new instructions.
+ for (Instruction *I : NewInsts) {
+ // Instructions that have been inserted in predecessor(s) to materialize
+ // the load address do not retain their original debug locations. Doing
+ // so could lead to confusing (but correct) source attributions.
I->updateLocationAfterHoist();
-
- // FIXME: We really _ought_ to insert these value numbers into their
- // parent's availability map. However, in doing so, we risk getting into
- // ordering issues. If a block hasn't been processed yet, we would be
- // marking a value as AVAIL-IN, which isn't what we intend.
- VN.lookupOrAdd(I);
- }
-
- for (const auto &PredLoad : PredLoads) {
- BasicBlock *UnavailablePred = PredLoad.first;
- Value *LoadPtr = PredLoad.second;
-
- auto *NewLoad = new LoadInst(
- LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
- LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
- UnavailablePred->getTerminator());
- NewLoad->setDebugLoc(LI->getDebugLoc());
+
+ // FIXME: We really _ought_ to insert these value numbers into their
+ // parent's availability map. However, in doing so, we risk getting into
+ // ordering issues. If a block hasn't been processed yet, we would be
+ // marking a value as AVAIL-IN, which isn't what we intend.
+ VN.lookupOrAdd(I);
+ }
+
+ for (const auto &PredLoad : PredLoads) {
+ BasicBlock *UnavailablePred = PredLoad.first;
+ Value *LoadPtr = PredLoad.second;
+
+ auto *NewLoad = new LoadInst(
+ LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
+ LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewLoad->setDebugLoc(LI->getDebugLoc());
if (MSSAU) {
auto *MSSA = MSSAU->getMemorySSA();
// Get the defining access of the original load or use the load if it is a
@@ -1383,223 +1383,223 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
else
MSSAU->insertUse(cast<MemoryUse>(NewAccess), /*RenameUses=*/true);
}
-
- // Transfer the old load's AA tags to the new load.
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
- if (Tags)
- NewLoad->setAAMetadata(Tags);
-
- if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
- NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
- if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
- NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
- if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
- NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
-
- // We do not propagate the old load's debug location, because the new
- // load now lives in a different BB, and we want to avoid a jumpy line
- // table.
- // FIXME: How do we retain source locations without causing poor debugging
- // behavior?
-
- // Add the newly created load.
- ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
- NewLoad));
- MD->invalidateCachedPointerInfo(LoadPtr);
- LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
- }
-
- // Perform PHI construction.
- Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
- LI->replaceAllUsesWith(V);
- if (isa<PHINode>(V))
- V->takeName(LI);
- if (Instruction *I = dyn_cast<Instruction>(V))
- I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- markInstructionForDeletion(LI);
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
- << "load eliminated by PRE";
- });
- ++NumPRELoad;
- return true;
-}
-
-static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
- OptimizationRemarkEmitter *ORE) {
- using namespace ore;
-
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
- << "load of type " << NV("Type", LI->getType()) << " eliminated"
- << setExtraArgs() << " in favor of "
- << NV("InfavorOfValue", AvailableValue);
- });
-}
-
-/// Attempt to eliminate a load whose dependencies are
-/// non-local by performing PHI construction.
-bool GVN::processNonLocalLoad(LoadInst *LI) {
- // non-local speculations are not allowed under asan.
- if (LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress) ||
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeHWAddress))
- return false;
-
- // Step 1: Find the non-local dependencies of the load.
- LoadDepVect Deps;
- MD->getNonLocalPointerDependency(LI, Deps);
-
- // If we had to process more than one hundred blocks to find the
- // dependencies, this load isn't worth worrying about. Optimizing
- // it will be too expensive.
- unsigned NumDeps = Deps.size();
- if (NumDeps > MaxNumDeps)
- return false;
-
- // If we had a phi translation failure, we'll have a single entry which is a
- // clobber in the current block. Reject this early.
- if (NumDeps == 1 &&
- !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
- LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
- dbgs() << " has unknown dependencies\n";);
- return false;
- }
-
+
+ // Transfer the old load's AA tags to the new load.
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags)
+ NewLoad->setAAMetadata(Tags);
+
+ if (auto *MD = LI->getMetadata(LLVMContext::MD_invariant_load))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_load, MD);
+ if (auto *InvGroupMD = LI->getMetadata(LLVMContext::MD_invariant_group))
+ NewLoad->setMetadata(LLVMContext::MD_invariant_group, InvGroupMD);
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range))
+ NewLoad->setMetadata(LLVMContext::MD_range, RangeMD);
+
+ // We do not propagate the old load's debug location, because the new
+ // load now lives in a different BB, and we want to avoid a jumpy line
+ // table.
+ // FIXME: How do we retain source locations without causing poor debugging
+ // behavior?
+
+ // Add the newly created load.
+ ValuesPerBlock.push_back(AvailableValueInBlock::get(UnavailablePred,
+ NewLoad));
+ MD->invalidateCachedPointerInfo(LoadPtr);
+ LLVM_DEBUG(dbgs() << "GVN INSERTED " << *NewLoad << '\n');
+ }
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadPRE", LI)
+ << "load eliminated by PRE";
+ });
+ ++NumPRELoad;
+ return true;
+}
+
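// A toy model (plain C++, hypothetical names) of the profitability gate in
// PerformLoadPRE above: the transformation is only attempted when exactly one
// predecessor lacks the value, so the load is effectively moved rather than
// duplicated. Critical-edge splitting, speculation safety and PHI translation
// are deliberately ignored in this sketch.
#include <cstdio>
#include <vector>

struct PredInfo { const char *Name; bool HasValue; };

// Returns the index of the single predecessor to insert the reload into,
// or -1 when PRE should be rejected (zero or more than one unavailable
// predecessor).
static int choosePREPredecessor(const std::vector<PredInfo> &Preds) {
  int Candidate = -1;
  for (int I = 0, E = (int)Preds.size(); I != E; ++I) {
    if (Preds[I].HasValue)
      continue;
    if (Candidate != -1)
      return -1; // more than one unavailable predecessor: would grow code
    Candidate = I;
  }
  return Candidate;
}

int main() {
  std::vector<PredInfo> Preds = {{"if.then", true}, {"if.else", false}};
  int Idx = choosePREPredecessor(Preds);
  if (Idx >= 0)
    std::printf("insert reload into %s and merge with a phi\n",
                Preds[Idx].Name);
}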
+static void reportLoadElim(LoadInst *LI, Value *AvailableValue,
+ OptimizationRemarkEmitter *ORE) {
+ using namespace ore;
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "LoadElim", LI)
+ << "load of type " << NV("Type", LI->getType()) << " eliminated"
+ << setExtraArgs() << " in favor of "
+ << NV("InfavorOfValue", AvailableValue);
+ });
+}
+
+/// Attempt to eliminate a load whose dependencies are
+/// non-local by performing PHI construction.
+bool GVN::processNonLocalLoad(LoadInst *LI) {
+ // non-local speculations are not allowed under asan.
+ if (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress))
+ return false;
+
+ // Step 1: Find the non-local dependencies of the load.
+ LoadDepVect Deps;
+ MD->getNonLocalPointerDependency(LI, Deps);
+
+ // If we had to process more than one hundred blocks to find the
+ // dependencies, this load isn't worth worrying about. Optimizing
+ // it will be too expensive.
+ unsigned NumDeps = Deps.size();
+ if (NumDeps > MaxNumDeps)
+ return false;
+
+ // If we had a phi translation failure, we'll have a single entry which is a
+ // clobber in the current block. Reject this early.
+ if (NumDeps == 1 &&
+ !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) {
+ LLVM_DEBUG(dbgs() << "GVN: non-local load "; LI->printAsOperand(dbgs());
+ dbgs() << " has unknown dependencies\n";);
+ return false;
+ }
+
bool Changed = false;
- // If this load follows a GEP, see if we can PRE the indices before analyzing.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
- for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
- OE = GEP->idx_end();
- OI != OE; ++OI)
- if (Instruction *I = dyn_cast<Instruction>(OI->get()))
+ // If this load follows a GEP, see if we can PRE the indices before analyzing.
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getOperand(0))) {
+ for (GetElementPtrInst::op_iterator OI = GEP->idx_begin(),
+ OE = GEP->idx_end();
+ OI != OE; ++OI)
+ if (Instruction *I = dyn_cast<Instruction>(OI->get()))
Changed |= performScalarPRE(I);
- }
-
- // Step 2: Analyze the availability of the load
- AvailValInBlkVect ValuesPerBlock;
- UnavailBlkVect UnavailableBlocks;
- AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
-
- // If we have no predecessors that produce a known value for this load, exit
- // early.
- if (ValuesPerBlock.empty())
+ }
+
+ // Step 2: Analyze the availability of the load
+ AvailValInBlkVect ValuesPerBlock;
+ UnavailBlkVect UnavailableBlocks;
+ AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks);
+
+ // If we have no predecessors that produce a known value for this load, exit
+ // early.
+ if (ValuesPerBlock.empty())
return Changed;
-
- // Step 3: Eliminate fully redundancy.
- //
- // If all of the instructions we depend on produce a known value for this
- // load, then it is fully redundant and we can use PHI insertion to compute
- // its value. Insert PHIs and remove the fully redundant value now.
- if (UnavailableBlocks.empty()) {
- LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
-
- // Perform PHI construction.
- Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
- LI->replaceAllUsesWith(V);
-
- if (isa<PHINode>(V))
- V->takeName(LI);
- if (Instruction *I = dyn_cast<Instruction>(V))
- // If instruction I has debug info, then we should not update it.
- // Also, if I has a null DebugLoc, then it is still potentially incorrect
- // to propagate LI's DebugLoc because LI may not post-dominate I.
- if (LI->getDebugLoc() && LI->getParent() == I->getParent())
- I->setDebugLoc(LI->getDebugLoc());
- if (V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- markInstructionForDeletion(LI);
- ++NumGVNLoad;
- reportLoadElim(LI, V, ORE);
- return true;
- }
-
- // Step 4: Eliminate partial redundancy.
- if (!isPREEnabled() || !isLoadPREEnabled())
+
+ // Step 3: Eliminate full redundancy.
+ //
+ // If all of the instructions we depend on produce a known value for this
+ // load, then it is fully redundant and we can use PHI insertion to compute
+ // its value. Insert PHIs and remove the fully redundant value now.
+ if (UnavailableBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n');
+
+ // Perform PHI construction.
+ Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this);
+ LI->replaceAllUsesWith(V);
+
+ if (isa<PHINode>(V))
+ V->takeName(LI);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ // If instruction I has debug info, then we should not update it.
+ // Also, if I has a null DebugLoc, then it is still potentially incorrect
+ // to propagate LI's DebugLoc because LI may not post-dominate I.
+ if (LI->getDebugLoc() && LI->getParent() == I->getParent())
+ I->setDebugLoc(LI->getDebugLoc());
+ if (V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ markInstructionForDeletion(LI);
+ ++NumGVNLoad;
+ reportLoadElim(LI, V, ORE);
+ return true;
+ }
+
+ // Step 4: Eliminate partial redundancy.
+ if (!isPREEnabled() || !isLoadPREEnabled())
return Changed;
- if (!isLoadInLoopPREEnabled() && this->LI &&
- this->LI->getLoopFor(LI->getParent()))
+ if (!isLoadInLoopPREEnabled() && this->LI &&
+ this->LI->getLoopFor(LI->getParent()))
return Changed;
-
+
return Changed || PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
-}
-
-static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
- if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ)
- return true;
-
- // Floating point comparisons can be equal, but not equivalent. Cases:
- // NaNs for unordered operators
- // +0.0 vs 0.0 for all operators
- if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
- (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
- Cmp->getFastMathFlags().noNaNs())) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- // If we can prove either side non-zero, then equality must imply
- // equivalence.
- // FIXME: We should do this optimization if 'no signed zeros' is
- // applicable via an instruction-level fast-math-flag or some other
- // indicator that relaxed FP semantics are being used.
- if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
- return true;
- if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
- return true;;
- // TODO: Handle vector floating point constants
- }
- return false;
-}
-
-static bool impliesEquivalanceIfFalse(CmpInst* Cmp) {
- if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE)
- return true;
-
- // Floating point comparisons can be equal, but not equivelent. Cases:
- // NaNs for unordered operators
- // +0.0 vs 0.0 for all operators
- if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE &&
- Cmp->getFastMathFlags().noNaNs()) ||
- Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) {
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- // If we can prove either side non-zero, then equality must imply
- // equivalence.
- // FIXME: We should do this optimization if 'no signed zeros' is
- // applicable via an instruction-level fast-math-flag or some other
- // indicator that relaxed FP semantics are being used.
- if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
- return true;
- if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
- return true;;
- // TODO: Handle vector floating point constants
- }
- return false;
-}
-
-
-static bool hasUsersIn(Value *V, BasicBlock *BB) {
- for (User *U : V->users())
- if (isa<Instruction>(U) &&
- cast<Instruction>(U)->getParent() == BB)
- return true;
- return false;
-}
-
-bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
- assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
- "This function can only be called with llvm.assume intrinsic");
- Value *V = IntrinsicI->getArgOperand(0);
-
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
- if (Cond->isZero()) {
- Type *Int8Ty = Type::getInt8Ty(V->getContext());
- // Insert a new store to null instruction before the load to indicate that
- // this code is not reachable. FIXME: We could insert unreachable
- // instruction directly because we can modify the CFG.
+}
+
+static bool impliesEquivalanceIfTrue(CmpInst* Cmp) {
+ if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_EQ)
+ return true;
+
+ // Floating point comparisons can be equal, but not equivalent. Cases:
+ // NaNs for unordered operators
+ // +0.0 vs 0.0 for all operators
+ if (Cmp->getPredicate() == CmpInst::Predicate::FCMP_OEQ ||
+ (Cmp->getPredicate() == CmpInst::Predicate::FCMP_UEQ &&
+ Cmp->getFastMathFlags().noNaNs())) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ // If we can prove either side non-zero, then equality must imply
+ // equivalence.
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
+ return true;
+ if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
+ return true;
+ // TODO: Handle vector floating point constants
+ }
+ return false;
+}
+
+static bool impliesEquivalanceIfFalse(CmpInst* Cmp) {
+ if (Cmp->getPredicate() == CmpInst::Predicate::ICMP_NE)
+ return true;
+
+ // Floating point comparisons can be equal, but not equivalent. Cases:
+ // NaNs for unordered operators
+ // +0.0 vs 0.0 for all operators
+ if ((Cmp->getPredicate() == CmpInst::Predicate::FCMP_ONE &&
+ Cmp->getFastMathFlags().noNaNs()) ||
+ Cmp->getPredicate() == CmpInst::Predicate::FCMP_UNE) {
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ // If we can prove either side non-zero, then equality must imply
+ // equivalence.
+ // FIXME: We should do this optimization if 'no signed zeros' is
+ // applicable via an instruction-level fast-math-flag or some other
+ // indicator that relaxed FP semantics are being used.
+ if (isa<ConstantFP>(LHS) && !cast<ConstantFP>(LHS)->isZero())
+ return true;
+ if (isa<ConstantFP>(RHS) && !cast<ConstantFP>(RHS)->isZero())
+ return true;
+ // TODO: Handle vector floating point constants
+ }
+ return false;
+}
+
+
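// A standalone demonstration of why impliesEquivalanceIfTrue and
// impliesEquivalanceIfFalse above refuse to treat floating-point equality as
// value equivalence unless one side is a known non-zero constant: +0.0 and
// -0.0 compare equal yet are not interchangeable, and unordered predicates
// are satisfied by NaN. Assumes IEEE-754 semantics on the host.
#include <cmath>
#include <cstdio>

int main() {
  double PosZero = 0.0, NegZero = -0.0;
  // "oeq" is true for the two zeros...
  std::printf("0.0 == -0.0 : %d\n", PosZero == NegZero ? 1 : 0);
  // ...but substituting one for the other flips the sign of a division.
  std::printf("1.0 /  0.0  : %f\n", 1.0 / PosZero); // +inf
  std::printf("1.0 / -0.0  : %f\n", 1.0 / NegZero); // -inf
  // "ueq" without nnan is also true for NaN operands, yet NaN is never a
  // usable replacement value.
  double QNaN = std::nan("");
  std::printf("NaN ueq NaN : %d\n",
              (std::isnan(QNaN) || QNaN == QNaN) ? 1 : 0);
  return 0;
}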
+static bool hasUsersIn(Value *V, BasicBlock *BB) {
+ for (User *U : V->users())
+ if (isa<Instruction>(U) &&
+ cast<Instruction>(U)->getParent() == BB)
+ return true;
+ return false;
+}
+
+bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
+ assert(IntrinsicI->getIntrinsicID() == Intrinsic::assume &&
+ "This function can only be called with llvm.assume intrinsic");
+ Value *V = IntrinsicI->getArgOperand(0);
+
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(V)) {
+ if (Cond->isZero()) {
+ Type *Int8Ty = Type::getInt8Ty(V->getContext());
+ // Insert a store-to-null instruction before this assume to indicate that
+ // this code is not reachable. FIXME: We could insert an unreachable
+ // instruction directly because we can modify the CFG.
auto *NewS = new StoreInst(UndefValue::get(Int8Ty),
Constant::getNullValue(Int8Ty->getPointerTo()),
IntrinsicI);
@@ -1634,1312 +1634,1312 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
MSSAU->insertDef(cast<MemoryDef>(NewDef), /*RenameUses=*/false);
}
- }
- if (isAssumeWithEmptyBundle(*IntrinsicI))
- markInstructionForDeletion(IntrinsicI);
- return false;
- } else if (isa<Constant>(V)) {
- // If it's not false, and constant, it must evaluate to true. This means our
- // assume is assume(true), and thus, pointless, and we don't want to do
- // anything more here.
- return false;
- }
-
- Constant *True = ConstantInt::getTrue(V->getContext());
- bool Changed = false;
-
- for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
- BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
-
- // This property is only true in dominated successors, propagateEquality
- // will check dominance for us.
- Changed |= propagateEquality(V, True, Edge, false);
- }
-
- // We can replace assume value with true, which covers cases like this:
- // call void @llvm.assume(i1 %cmp)
- // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
- ReplaceOperandsWithMap[V] = True;
-
+ }
+ if (isAssumeWithEmptyBundle(*IntrinsicI))
+ markInstructionForDeletion(IntrinsicI);
+ return false;
+ } else if (isa<Constant>(V)) {
+ // If it's not false, and constant, it must evaluate to true. This means our
+ // assume is assume(true), and thus, pointless, and we don't want to do
+ // anything more here.
+ return false;
+ }
+
+ Constant *True = ConstantInt::getTrue(V->getContext());
+ bool Changed = false;
+
+ for (BasicBlock *Successor : successors(IntrinsicI->getParent())) {
+ BasicBlockEdge Edge(IntrinsicI->getParent(), Successor);
+
+ // This property is only true in dominated successors, propagateEquality
+ // will check dominance for us.
+ Changed |= propagateEquality(V, True, Edge, false);
+ }
+
+ // We can replace assume value with true, which covers cases like this:
+ // call void @llvm.assume(i1 %cmp)
+ // br i1 %cmp, label %bb1, label %bb2 ; will change %cmp to true
+ ReplaceOperandsWithMap[V] = True;
+
// Similarly, after assume(!NotV) we know that NotV == false.
Value *NotV;
if (match(V, m_Not(m_Value(NotV))))
ReplaceOperandsWithMap[NotV] = ConstantInt::getFalse(V->getContext());
- // If we find an equality fact, canonicalize all dominated uses in this block
- // to one of the two values. We heuristically choice the "oldest" of the
- // two where age is determined by value number. (Note that propagateEquality
- // above handles the cross block case.)
- //
- // Key case to cover are:
- // 1)
- // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
- // call void @llvm.assume(i1 %cmp)
- // ret float %0 ; will change it to ret float 3.000000e+00
- // 2)
- // %load = load float, float* %addr
- // %cmp = fcmp oeq float %load, %0
- // call void @llvm.assume(i1 %cmp)
- // ret float %load ; will change it to ret float %0
- if (auto *CmpI = dyn_cast<CmpInst>(V)) {
- if (impliesEquivalanceIfTrue(CmpI)) {
- Value *CmpLHS = CmpI->getOperand(0);
- Value *CmpRHS = CmpI->getOperand(1);
- // Heuristically pick the better replacement -- the choice of heuristic
- // isn't terribly important here, but the fact we canonicalize on some
- // replacement is for exposing other simplifications.
- // TODO: pull this out as a helper function and reuse w/existing
- // (slightly different) logic.
- if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
- std::swap(CmpLHS, CmpRHS);
- if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
- (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
- // Move the 'oldest' value to the right-hand side, using the value
- // number as a proxy for age.
- uint32_t LVN = VN.lookupOrAdd(CmpLHS);
- uint32_t RVN = VN.lookupOrAdd(CmpRHS);
- if (LVN < RVN)
- std::swap(CmpLHS, CmpRHS);
- }
-
- // Handle degenerate case where we either haven't pruned a dead path or a
- // removed a trivial assume yet.
- if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
- return Changed;
-
- LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
- << *CmpLHS << " with "
- << *CmpRHS << " in block "
- << IntrinsicI->getParent()->getName() << "\n");
-
-
- // Setup the replacement map - this handles uses within the same block
- if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
- ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
-
- // NOTE: The non-block local cases are handled by the call to
- // propagateEquality above; this block is just about handling the block
- // local cases. TODO: There's a bunch of logic in propagateEqualiy which
- // isn't duplicated for the block local case, can we share it somehow?
- }
- }
- return Changed;
-}
-
-static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
- patchReplacementInstruction(I, Repl);
- I->replaceAllUsesWith(Repl);
-}
-
-/// Attempt to eliminate a load, first by eliminating it
-/// locally, and then attempting non-local elimination if that fails.
-bool GVN::processLoad(LoadInst *L) {
- if (!MD)
- return false;
-
- // This code hasn't been audited for ordered or volatile memory access
- if (!L->isUnordered())
- return false;
-
- if (L->use_empty()) {
- markInstructionForDeletion(L);
- return true;
- }
-
- // ... to a pointer that has been loaded from before...
- MemDepResult Dep = MD->getDependency(L);
-
- // If it is defined in another block, try harder.
- if (Dep.isNonLocal())
- return processNonLocalLoad(L);
-
- // Only handle the local case below
- if (!Dep.isDef() && !Dep.isClobber()) {
- // This might be a NonFuncLocal or an Unknown
- LLVM_DEBUG(
- // fast print dep; using operator<< on instruction is too slow.
- dbgs() << "GVN: load "; L->printAsOperand(dbgs());
- dbgs() << " has unknown dependence\n";);
- return false;
- }
-
- AvailableValue AV;
- if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
- Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
-
- // Replace the load!
- patchAndReplaceAllUsesWith(L, AvailableValue);
- markInstructionForDeletion(L);
+ // If we find an equality fact, canonicalize all dominated uses in this block
+ // to one of the two values. We heuristically choose the "oldest" of the
+ // two where age is determined by value number. (Note that propagateEquality
+ // above handles the cross block case.)
+ //
+ // Key cases to cover are:
+ // 1)
+ // %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %0 ; will change it to ret float 3.000000e+00
+ // 2)
+ // %load = load float, float* %addr
+ // %cmp = fcmp oeq float %load, %0
+ // call void @llvm.assume(i1 %cmp)
+ // ret float %load ; will change it to ret float %0
+ if (auto *CmpI = dyn_cast<CmpInst>(V)) {
+ if (impliesEquivalanceIfTrue(CmpI)) {
+ Value *CmpLHS = CmpI->getOperand(0);
+ Value *CmpRHS = CmpI->getOperand(1);
+ // Heuristically pick the better replacement -- the choice of heuristic
+ // isn't terribly important here, but the fact we canonicalize on some
+ // replacement is for exposing other simplifications.
+ // TODO: pull this out as a helper function and reuse w/existing
+ // (slightly different) logic.
+ if (isa<Constant>(CmpLHS) && !isa<Constant>(CmpRHS))
+ std::swap(CmpLHS, CmpRHS);
+ if (!isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))
+ std::swap(CmpLHS, CmpRHS);
+ if ((isa<Argument>(CmpLHS) && isa<Argument>(CmpRHS)) ||
+ (isa<Instruction>(CmpLHS) && isa<Instruction>(CmpRHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value
+ // number as a proxy for age.
+ uint32_t LVN = VN.lookupOrAdd(CmpLHS);
+ uint32_t RVN = VN.lookupOrAdd(CmpRHS);
+ if (LVN < RVN)
+ std::swap(CmpLHS, CmpRHS);
+ }
+
+ // Handle degenerate case where we either haven't pruned a dead path or
+ // removed a trivial assume yet.
+ if (isa<Constant>(CmpLHS) && isa<Constant>(CmpRHS))
+ return Changed;
+
+ LLVM_DEBUG(dbgs() << "Replacing dominated uses of "
+ << *CmpLHS << " with "
+ << *CmpRHS << " in block "
+ << IntrinsicI->getParent()->getName() << "\n");
+
+
+ // Set up the replacement map - this handles uses within the same block
+ if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
+ ReplaceOperandsWithMap[CmpLHS] = CmpRHS;
+
+ // NOTE: The non-block local cases are handled by the call to
+ // propagateEquality above; this block is just about handling the block
+ // local cases. TODO: There's a bunch of logic in propagateEquality which
+ // isn't duplicated for the block local case, can we share it somehow?
+ }
+ }
+ return Changed;
+}
+
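The swap heuristic in processAssumeIntrinsic above only fixes a replacement direction; any consistent rule would do. As a rough standalone illustration in plain C++ (the Val struct and the pickCanonicalPair helper are invented for this sketch, not LLVM API), the rule is: keep constants on the right-hand side, and otherwise keep the value with the smaller (older) value number on the right, so newer values get rewritten in terms of older ones.

#include <cstdint>
#include <utility>

// Hypothetical stand-ins for illustration only.
struct Val { bool IsConstant; uint32_t ValueNumber; };

// Returns {ToReplace, ReplaceWith}: uses of the first element are rewritten
// in terms of the second, mirroring the heuristic sketched above.
std::pair<Val, Val> pickCanonicalPair(Val LHS, Val RHS) {
  if (LHS.IsConstant && !RHS.IsConstant)
    std::swap(LHS, RHS); // keep the constant on the right-hand side
  if (!LHS.IsConstant && !RHS.IsConstant &&
      LHS.ValueNumber < RHS.ValueNumber)
    std::swap(LHS, RHS); // keep the 'oldest' value on the right-hand side
  return {LHS, RHS};
}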
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+/// Attempt to eliminate a load, first by eliminating it
+/// locally, and then attempting non-local elimination if that fails.
+bool GVN::processLoad(LoadInst *L) {
+ if (!MD)
+ return false;
+
+ // This code hasn't been audited for ordered or volatile memory access
+ if (!L->isUnordered())
+ return false;
+
+ if (L->use_empty()) {
+ markInstructionForDeletion(L);
+ return true;
+ }
+
+ // ... to a pointer that has been loaded from before...
+ MemDepResult Dep = MD->getDependency(L);
+
+ // If it is defined in another block, try harder.
+ if (Dep.isNonLocal())
+ return processNonLocalLoad(L);
+
+ // Only handle the local case below
+ if (!Dep.isDef() && !Dep.isClobber()) {
+ // This might be a NonFuncLocal or an Unknown
+ LLVM_DEBUG(
+ // fast print dep; using operator<< on instruction is too slow.
+ dbgs() << "GVN: load "; L->printAsOperand(dbgs());
+ dbgs() << " has unknown dependence\n";);
+ return false;
+ }
+
+ AvailableValue AV;
+ if (AnalyzeLoadAvailability(L, Dep, L->getPointerOperand(), AV)) {
+ Value *AvailableValue = AV.MaterializeAdjustedValue(L, L, *this);
+
+ // Replace the load!
+ patchAndReplaceAllUsesWith(L, AvailableValue);
+ markInstructionForDeletion(L);
if (MSSAU)
MSSAU->removeMemoryAccess(L);
- ++NumGVNLoad;
- reportLoadElim(L, AvailableValue, ORE);
- // Tell MDA to re-examine the reused pointer since we might have more
- // information after forwarding it.
- if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(AvailableValue);
- return true;
- }
-
- return false;
-}
-
- /// Return a pair whose first field is the value number of \p Exp and whose
- /// second field indicates whether the value number was newly created.
-std::pair<uint32_t, bool>
-GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
- uint32_t &e = expressionNumbering[Exp];
- bool CreateNewValNum = !e;
- if (CreateNewValNum) {
- Expressions.push_back(Exp);
- if (ExprIdx.size() < nextValueNumber + 1)
- ExprIdx.resize(nextValueNumber * 2);
- e = nextValueNumber;
- ExprIdx[nextValueNumber++] = nextExprNumber++;
- }
- return {e, CreateNewValNum};
-}
-
- /// Return whether all the values related to the same \p Num are
-/// defined in \p BB.
-bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
- GVN &Gvn) {
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals && Vals->BB == BB)
- Vals = Vals->Next;
- return !Vals;
-}
-
-/// Wrap phiTranslateImpl to provide caching functionality.
-uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
- const BasicBlock *PhiBlock, uint32_t Num,
- GVN &Gvn) {
- auto FindRes = PhiTranslateTable.find({Num, Pred});
- if (FindRes != PhiTranslateTable.end())
- return FindRes->second;
- uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
- PhiTranslateTable.insert({{Num, Pred}, NewNum});
- return NewNum;
-}
-
- // Return true if the value numbers \p Num and \p NewNum represent equal values.
-// Return false if the result is unknown.
-bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
- const BasicBlock *Pred,
- const BasicBlock *PhiBlock, GVN &Gvn) {
- CallInst *Call = nullptr;
- LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
- while (Vals) {
- Call = dyn_cast<CallInst>(Vals->Val);
- if (Call && Call->getParent() == PhiBlock)
- break;
- Vals = Vals->Next;
- }
-
- if (AA->doesNotAccessMemory(Call))
- return true;
-
- if (!MD || !AA->onlyReadsMemory(Call))
- return false;
-
- MemDepResult local_dep = MD->getDependency(Call);
- if (!local_dep.isNonLocal())
- return false;
-
- const MemoryDependenceResults::NonLocalDepInfo &deps =
- MD->getNonLocalCallDependency(Call);
-
- // Check to see if the Call has no function local clobber.
- for (unsigned i = 0; i < deps.size(); i++) {
- if (deps[i].getResult().isNonFuncLocal())
- return true;
- }
- return false;
-}
-
-/// Translate value number \p Num using phis, so that it has the values of
-/// the phis in BB.
-uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
- const BasicBlock *PhiBlock,
- uint32_t Num, GVN &Gvn) {
- if (PHINode *PN = NumberingPhi[Num]) {
- for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
- if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
- return TransVal;
- }
- return Num;
- }
-
- // If any value related to Num is defined in a BB other than
- // PhiBlock, it cannot depend on a phi in PhiBlock without going through
- // a backedge. We can do an early exit in that case to save compile time.
- if (!areAllValsInBB(Num, PhiBlock, Gvn))
- return Num;
-
- if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
- return Num;
- Expression Exp = Expressions[ExprIdx[Num]];
-
- for (unsigned i = 0; i < Exp.varargs.size(); i++) {
- // For InsertValue and ExtractValue, some varargs are index numbers
- // instead of value numbers. Those index numbers should not be
- // translated.
- if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
- (i > 0 && Exp.opcode == Instruction::ExtractValue) ||
- (i > 1 && Exp.opcode == Instruction::ShuffleVector))
- continue;
- Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
- }
-
- if (Exp.commutative) {
+ ++NumGVNLoad;
+ reportLoadElim(L, AvailableValue, ORE);
+ // Tell MDA to re-examine the reused pointer since we might have more
+ // information after forwarding it.
+ if (MD && AvailableValue->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(AvailableValue);
+ return true;
+ }
+
+ return false;
+}
+
+ /// Return a pair whose first field is the value number of \p Exp and whose
+ /// second field indicates whether the value number was newly created.
+std::pair<uint32_t, bool>
+GVN::ValueTable::assignExpNewValueNum(Expression &Exp) {
+ uint32_t &e = expressionNumbering[Exp];
+ bool CreateNewValNum = !e;
+ if (CreateNewValNum) {
+ Expressions.push_back(Exp);
+ if (ExprIdx.size() < nextValueNumber + 1)
+ ExprIdx.resize(nextValueNumber * 2);
+ e = nextValueNumber;
+ ExprIdx[nextValueNumber++] = nextExprNumber++;
+ }
+ return {e, CreateNewValNum};
+}
+
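For a rough sense of what assignExpNewValueNum above is doing, here is a minimal self-contained C++ sketch (the std::string key and the numberExpression name are simplifications invented for this sketch, not the real Expression hashing): looking an expression up in the map default-constructs its slot to 0, and 0 doubles as "not yet numbered", so a fresh number is handed out exactly once per distinct expression.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <utility>

// Keys stand in for canonicalized expressions; the real code hashes an
// Expression struct and also maintains an ExprIdx side table.
static std::unordered_map<std::string, uint32_t> ExprNumbering;
static uint32_t NextValueNumber = 1;

// Returns {value number, whether it was newly created}.
std::pair<uint32_t, bool> numberExpression(const std::string &Key) {
  uint32_t &N = ExprNumbering[Key]; // inserts 0 if the key was absent
  bool IsNew = (N == 0);
  if (IsNew)
    N = NextValueNumber++;
  return {N, IsNew};
}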
+ /// Return whether all the values related to the same \p Num are
+/// defined in \p BB.
+bool GVN::ValueTable::areAllValsInBB(uint32_t Num, const BasicBlock *BB,
+ GVN &Gvn) {
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals && Vals->BB == BB)
+ Vals = Vals->Next;
+ return !Vals;
+}
+
+/// Wrap phiTranslateImpl to provide caching functionality.
+uint32_t GVN::ValueTable::phiTranslate(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, uint32_t Num,
+ GVN &Gvn) {
+ auto FindRes = PhiTranslateTable.find({Num, Pred});
+ if (FindRes != PhiTranslateTable.end())
+ return FindRes->second;
+ uint32_t NewNum = phiTranslateImpl(Pred, PhiBlock, Num, Gvn);
+ PhiTranslateTable.insert({{Num, Pred}, NewNum});
+ return NewNum;
+}
+
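The wrapper above is plain memoization keyed on the (value number, predecessor) pair. A minimal standalone C++ sketch of the same shape (Block, translateImpl and translate are hypothetical names, and the placeholder translateImpl simply returns its input):

#include <cstdint>
#include <map>
#include <utility>

using Block = const void *;

// Placeholder for the real per-edge translation (phiTranslateImpl).
static uint32_t translateImpl(Block, Block, uint32_t Num) { return Num; }

static std::map<std::pair<uint32_t, Block>, uint32_t> TranslateCache;

// Memoized wrapper: one cached result per (Num, Pred) pair.
uint32_t translate(Block Pred, Block PhiBlock, uint32_t Num) {
  auto It = TranslateCache.find({Num, Pred});
  if (It != TranslateCache.end())
    return It->second;
  uint32_t NewNum = translateImpl(Pred, PhiBlock, Num);
  TranslateCache.insert({{Num, Pred}, NewNum});
  return NewNum;
}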
+ // Return true if the value numbers \p Num and \p NewNum represent equal values.
+// Return false if the result is unknown.
+bool GVN::ValueTable::areCallValsEqual(uint32_t Num, uint32_t NewNum,
+ const BasicBlock *Pred,
+ const BasicBlock *PhiBlock, GVN &Gvn) {
+ CallInst *Call = nullptr;
+ LeaderTableEntry *Vals = &Gvn.LeaderTable[Num];
+ while (Vals) {
+ Call = dyn_cast<CallInst>(Vals->Val);
+ if (Call && Call->getParent() == PhiBlock)
+ break;
+ Vals = Vals->Next;
+ }
+
+ if (AA->doesNotAccessMemory(Call))
+ return true;
+
+ if (!MD || !AA->onlyReadsMemory(Call))
+ return false;
+
+ MemDepResult local_dep = MD->getDependency(Call);
+ if (!local_dep.isNonLocal())
+ return false;
+
+ const MemoryDependenceResults::NonLocalDepInfo &deps =
+ MD->getNonLocalCallDependency(Call);
+
+ // Check to see if the Call has no function local clobber.
+ for (unsigned i = 0; i < deps.size(); i++) {
+ if (deps[i].getResult().isNonFuncLocal())
+ return true;
+ }
+ return false;
+}
+
+/// Translate value number \p Num using phis, so that it has the values of
+/// the phis in BB.
+uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
+ const BasicBlock *PhiBlock,
+ uint32_t Num, GVN &Gvn) {
+ if (PHINode *PN = NumberingPhi[Num]) {
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (PN->getParent() == PhiBlock && PN->getIncomingBlock(i) == Pred)
+ if (uint32_t TransVal = lookup(PN->getIncomingValue(i), false))
+ return TransVal;
+ }
+ return Num;
+ }
+
+ // If any value related to Num is defined in a BB other than
+ // PhiBlock, it cannot depend on a phi in PhiBlock without going through
+ // a backedge. We can do an early exit in that case to save compile time.
+ if (!areAllValsInBB(Num, PhiBlock, Gvn))
+ return Num;
+
+ if (Num >= ExprIdx.size() || ExprIdx[Num] == 0)
+ return Num;
+ Expression Exp = Expressions[ExprIdx[Num]];
+
+ for (unsigned i = 0; i < Exp.varargs.size(); i++) {
+ // For InsertValue and ExtractValue, some varargs are index numbers
+ // instead of value numbers. Those index numbers should not be
+ // translated.
+ if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
+ (i > 0 && Exp.opcode == Instruction::ExtractValue) ||
+ (i > 1 && Exp.opcode == Instruction::ShuffleVector))
+ continue;
+ Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
+ }
+
+ if (Exp.commutative) {
assert(Exp.varargs.size() >= 2 && "Unsupported commutative instruction!");
- if (Exp.varargs[0] > Exp.varargs[1]) {
- std::swap(Exp.varargs[0], Exp.varargs[1]);
- uint32_t Opcode = Exp.opcode >> 8;
- if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
- Exp.opcode = (Opcode << 8) |
- CmpInst::getSwappedPredicate(
- static_cast<CmpInst::Predicate>(Exp.opcode & 255));
- }
- }
-
- if (uint32_t NewNum = expressionNumbering[Exp]) {
- if (Exp.opcode == Instruction::Call && NewNum != Num)
- return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num;
- return NewNum;
- }
- return Num;
-}
-
-/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
-/// again.
-void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
- const BasicBlock &CurrBlock) {
+ if (Exp.varargs[0] > Exp.varargs[1]) {
+ std::swap(Exp.varargs[0], Exp.varargs[1]);
+ uint32_t Opcode = Exp.opcode >> 8;
+ if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp)
+ Exp.opcode = (Opcode << 8) |
+ CmpInst::getSwappedPredicate(
+ static_cast<CmpInst::Predicate>(Exp.opcode & 255));
+ }
+ }
+
+ if (uint32_t NewNum = expressionNumbering[Exp]) {
+ if (Exp.opcode == Instruction::Call && NewNum != Num)
+ return areCallValsEqual(Num, NewNum, Pred, PhiBlock, Gvn) ? NewNum : Num;
+ return NewNum;
+ }
+ return Num;
+}
+
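One detail worth isolating from phiTranslateImpl above is the commutative canonicalization: after translating the operands, the smaller operand number is put first, and for comparisons the predicate is swapped so that both operand orders produce the same expression. A toy standalone version follows (the Pred enum and the swapped/canonicalizeCmp helpers are stand-ins invented for this sketch, not CmpInst::Predicate):

#include <algorithm>
#include <cstdint>

enum class Pred { LT, GT, LE, GE, EQ, NE };

// Reverse the comparison direction; EQ and NE are symmetric.
static Pred swapped(Pred P) {
  switch (P) {
  case Pred::LT: return Pred::GT;
  case Pred::GT: return Pred::LT;
  case Pred::LE: return Pred::GE;
  case Pred::GE: return Pred::LE;
  default:       return P;
  }
}

// Keep the smaller operand number first so equal comparisons written in
// either order receive the same value number.
void canonicalizeCmp(uint32_t &Op0, uint32_t &Op1, Pred &P) {
  if (Op0 > Op1) {
    std::swap(Op0, Op1);
    P = swapped(P);
  }
}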
+/// Erase stale entry from phiTranslate cache so phiTranslate can be computed
+/// again.
+void GVN::ValueTable::eraseTranslateCacheEntry(uint32_t Num,
+ const BasicBlock &CurrBlock) {
for (const BasicBlock *Pred : predecessors(&CurrBlock))
PhiTranslateTable.erase({Num, Pred});
-}
-
-// In order to find a leader for a given value number at a
-// specific basic block, we first obtain the list of all Values for that number,
-// and then scan the list to find one whose block dominates the block in
-// question. This is fast because dominator tree queries consist of only
-// a few comparisons of DFS numbers.
-Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
- LeaderTableEntry Vals = LeaderTable[num];
- if (!Vals.Val) return nullptr;
-
- Value *Val = nullptr;
- if (DT->dominates(Vals.BB, BB)) {
- Val = Vals.Val;
- if (isa<Constant>(Val)) return Val;
- }
-
- LeaderTableEntry* Next = Vals.Next;
- while (Next) {
- if (DT->dominates(Next->BB, BB)) {
- if (isa<Constant>(Next->Val)) return Next->Val;
- if (!Val) Val = Next->Val;
- }
-
- Next = Next->Next;
- }
-
- return Val;
-}
-
-/// There is an edge from 'Src' to 'Dst'. Return
-/// true if every path from the entry block to 'Dst' passes via this edge. In
-/// particular 'Dst' must not be reachable via another edge from 'Src'.
-static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
- DominatorTree *DT) {
- // While in theory it is interesting to consider the case in which Dst has
- // more than one predecessor, because Dst might be part of a loop which is
- // only reachable from Src, in practice it is pointless since at the time
- // GVN runs all such loops have preheaders, which means that Dst will have
- // been changed to have only one predecessor, namely Src.
- const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
- assert((!Pred || Pred == E.getStart()) &&
- "No edge between these basic blocks!");
- return Pred != nullptr;
-}
-
-void GVN::assignBlockRPONumber(Function &F) {
- BlockRPONumber.clear();
- uint32_t NextBlockNumber = 1;
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (BasicBlock *BB : RPOT)
- BlockRPONumber[BB] = NextBlockNumber++;
- InvalidBlockRPONumbers = false;
-}
-
-bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
- bool Changed = false;
- for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
- Value *Operand = Instr->getOperand(OpNum);
- auto it = ReplaceOperandsWithMap.find(Operand);
- if (it != ReplaceOperandsWithMap.end()) {
- LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
- << *it->second << " in instruction " << *Instr << '\n');
- Instr->setOperand(OpNum, it->second);
- Changed = true;
- }
- }
- return Changed;
-}
-
-/// The given values are known to be equal in every block
-/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
-/// 'RHS' everywhere in the scope. Returns whether a change was made.
- /// If DominatesByEdge is false, the RHS value is propagated starting from the
- /// end of Root.Start.
-bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
- bool DominatesByEdge) {
- SmallVector<std::pair<Value*, Value*>, 4> Worklist;
- Worklist.push_back(std::make_pair(LHS, RHS));
- bool Changed = false;
- // For speed, compute a conservative fast approximation to
- // DT->dominates(Root, Root.getEnd());
- const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
-
- while (!Worklist.empty()) {
- std::pair<Value*, Value*> Item = Worklist.pop_back_val();
- LHS = Item.first; RHS = Item.second;
-
- if (LHS == RHS)
- continue;
- assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
-
- // Don't try to propagate equalities between constants.
- if (isa<Constant>(LHS) && isa<Constant>(RHS))
- continue;
-
- // Prefer a constant on the right-hand side, or an Argument if no constants.
- if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
- std::swap(LHS, RHS);
- assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
-
- // If there is no obvious reason to prefer the left-hand side over the
- // right-hand side, ensure the longest lived term is on the right-hand side,
- // so the shortest lived term will be replaced by the longest lived.
- // This tends to expose more simplifications.
- uint32_t LVN = VN.lookupOrAdd(LHS);
- if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
- (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
- // Move the 'oldest' value to the right-hand side, using the value number
- // as a proxy for age.
- uint32_t RVN = VN.lookupOrAdd(RHS);
- if (LVN < RVN) {
- std::swap(LHS, RHS);
- LVN = RVN;
- }
- }
-
- // If value numbering later sees that an instruction in the scope is equal
- // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
- // the invariant that instructions only occur in the leader table for their
- // own value number (this is used by removeFromLeaderTable), do not do this
- // if RHS is an instruction (if an instruction in the scope is morphed into
- // LHS then it will be turned into RHS by the next GVN iteration anyway, so
- // using the leader table is about compiling faster, not optimizing better).
- // The leader table only tracks basic blocks, not edges. Only add to it if we
- // have the simple case where the edge dominates the end.
- if (RootDominatesEnd && !isa<Instruction>(RHS))
- addToLeaderTable(LVN, RHS, Root.getEnd());
-
- // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
- // LHS always has at least one use that is not dominated by Root, this will
- // never do anything if LHS has only one use.
- if (!LHS->hasOneUse()) {
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
- : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
-
- Changed |= NumReplacements > 0;
- NumGVNEqProp += NumReplacements;
- // Cached information for anything that uses LHS will be invalid.
- if (MD)
- MD->invalidateCachedPointerInfo(LHS);
- }
-
- // Now try to deduce additional equalities from this one. For example, if
- // the known equality was "(A != B)" == "false" then it follows that A and B
- // are equal in the scope. Only boolean equalities with an explicit true or
- // false RHS are currently supported.
- if (!RHS->getType()->isIntegerTy(1))
- // Not a boolean equality - bail out.
- continue;
- ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
- if (!CI)
- // RHS neither 'true' nor 'false' - bail out.
- continue;
- // Whether RHS equals 'true'. Otherwise it equals 'false'.
- bool isKnownTrue = CI->isMinusOne();
- bool isKnownFalse = !isKnownTrue;
-
- // If "A && B" is known true then both A and B are known true. If "A || B"
- // is known false then both A and B are known false.
- Value *A, *B;
+}
+
+// In order to find a leader for a given value number at a
+// specific basic block, we first obtain the list of all Values for that number,
+// and then scan the list to find one whose block dominates the block in
+// question. This is fast because dominator tree queries consist of only
+// a few comparisons of DFS numbers.
+Value *GVN::findLeader(const BasicBlock *BB, uint32_t num) {
+ LeaderTableEntry Vals = LeaderTable[num];
+ if (!Vals.Val) return nullptr;
+
+ Value *Val = nullptr;
+ if (DT->dominates(Vals.BB, BB)) {
+ Val = Vals.Val;
+ if (isa<Constant>(Val)) return Val;
+ }
+
+ LeaderTableEntry* Next = Vals.Next;
+ while (Next) {
+ if (DT->dominates(Next->BB, BB)) {
+ if (isa<Constant>(Next->Val)) return Next->Val;
+ if (!Val) Val = Next->Val;
+ }
+
+ Next = Next->Next;
+ }
+
+ return Val;
+}
+
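findLeader above walks a small intrusive list of candidates for one value number, keeping the first entry whose block dominates the query block and returning a constant immediately if one shows up. A simplified standalone walk of the same shape (the LeaderEntry struct and the DominatesQueryBlock flag stand in for the real entries and the DominatorTree query):

#include <cstdint>

struct LeaderEntry {
  const void *Val = nullptr;
  bool IsConstant = false;
  bool DominatesQueryBlock = false; // stands in for DT->dominates(...)
  const LeaderEntry *Next = nullptr;
};

// Return a constant leader if a dominating one exists, otherwise the first
// dominating leader, otherwise nullptr.
const void *findLeaderSketch(const LeaderEntry *Head) {
  const void *Result = nullptr;
  for (const LeaderEntry *E = Head; E; E = E->Next) {
    if (!E->DominatesQueryBlock)
      continue;
    if (E->IsConstant)
      return E->Val;
    if (!Result)
      Result = E->Val;
  }
  return Result;
}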
+/// There is an edge from 'Src' to 'Dst'. Return
+/// true if every path from the entry block to 'Dst' passes via this edge. In
+/// particular 'Dst' must not be reachable via another edge from 'Src'.
+static bool isOnlyReachableViaThisEdge(const BasicBlockEdge &E,
+ DominatorTree *DT) {
+ // While in theory it is interesting to consider the case in which Dst has
+ // more than one predecessor, because Dst might be part of a loop which is
+ // only reachable from Src, in practice it is pointless since at the time
+ // GVN runs all such loops have preheaders, which means that Dst will have
+ // been changed to have only one predecessor, namely Src.
+ const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
+ assert((!Pred || Pred == E.getStart()) &&
+ "No edge between these basic blocks!");
+ return Pred != nullptr;
+}
+
+void GVN::assignBlockRPONumber(Function &F) {
+ BlockRPONumber.clear();
+ uint32_t NextBlockNumber = 1;
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT)
+ BlockRPONumber[BB] = NextBlockNumber++;
+ InvalidBlockRPONumbers = false;
+}
+
+bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
+ bool Changed = false;
+ for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
+ Value *Operand = Instr->getOperand(OpNum);
+ auto it = ReplaceOperandsWithMap.find(Operand);
+ if (it != ReplaceOperandsWithMap.end()) {
+ LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
+ << *it->second << " in instruction " << *Instr << '\n');
+ Instr->setOperand(OpNum, it->second);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
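replaceOperandsForInBlockEquality above is a straight map-driven operand rewrite. A minimal standalone analogue (operands are plain ints here, and replaceOperands is a made-up name for this sketch):

#include <unordered_map>
#include <vector>

// Rewrite every operand that has an entry in the block-local equality map;
// return whether anything changed.
bool replaceOperands(std::vector<int> &Operands,
                     const std::unordered_map<int, int> &Rewrites) {
  bool Changed = false;
  for (int &Op : Operands) {
    auto It = Rewrites.find(Op);
    if (It != Rewrites.end()) {
      Op = It->second;
      Changed = true;
    }
  }
  return Changed;
}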
+/// The given values are known to be equal in every block
+/// dominated by 'Root'. Exploit this, for example by replacing 'LHS' with
+/// 'RHS' everywhere in the scope. Returns whether a change was made.
+ /// If DominatesByEdge is false, the RHS value is propagated starting from the
+ /// end of Root.Start.
+bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
+ bool DominatesByEdge) {
+ SmallVector<std::pair<Value*, Value*>, 4> Worklist;
+ Worklist.push_back(std::make_pair(LHS, RHS));
+ bool Changed = false;
+ // For speed, compute a conservative fast approximation to
+ // DT->dominates(Root, Root.getEnd());
+ const bool RootDominatesEnd = isOnlyReachableViaThisEdge(Root, DT);
+
+ while (!Worklist.empty()) {
+ std::pair<Value*, Value*> Item = Worklist.pop_back_val();
+ LHS = Item.first; RHS = Item.second;
+
+ if (LHS == RHS)
+ continue;
+ assert(LHS->getType() == RHS->getType() && "Equality but unequal types!");
+
+ // Don't try to propagate equalities between constants.
+ if (isa<Constant>(LHS) && isa<Constant>(RHS))
+ continue;
+
+ // Prefer a constant on the right-hand side, or an Argument if no constants.
+ if (isa<Constant>(LHS) || (isa<Argument>(LHS) && !isa<Constant>(RHS)))
+ std::swap(LHS, RHS);
+ assert((isa<Argument>(LHS) || isa<Instruction>(LHS)) && "Unexpected value!");
+
+ // If there is no obvious reason to prefer the left-hand side over the
+ // right-hand side, ensure the longest lived term is on the right-hand side,
+ // so the shortest lived term will be replaced by the longest lived.
+ // This tends to expose more simplifications.
+ uint32_t LVN = VN.lookupOrAdd(LHS);
+ if ((isa<Argument>(LHS) && isa<Argument>(RHS)) ||
+ (isa<Instruction>(LHS) && isa<Instruction>(RHS))) {
+ // Move the 'oldest' value to the right-hand side, using the value number
+ // as a proxy for age.
+ uint32_t RVN = VN.lookupOrAdd(RHS);
+ if (LVN < RVN) {
+ std::swap(LHS, RHS);
+ LVN = RVN;
+ }
+ }
+
+ // If value numbering later sees that an instruction in the scope is equal
+ // to 'LHS' then ensure it will be turned into 'RHS'. In order to preserve
+ // the invariant that instructions only occur in the leader table for their
+ // own value number (this is used by removeFromLeaderTable), do not do this
+ // if RHS is an instruction (if an instruction in the scope is morphed into
+ // LHS then it will be turned into RHS by the next GVN iteration anyway, so
+ // using the leader table is about compiling faster, not optimizing better).
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd && !isa<Instruction>(RHS))
+ addToLeaderTable(LVN, RHS, Root.getEnd());
+
+ // Replace all occurrences of 'LHS' with 'RHS' everywhere in the scope. As
+ // LHS always has at least one use that is not dominated by Root, this will
+ // never do anything if LHS has only one use.
+ if (!LHS->hasOneUse()) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(LHS, RHS, *DT, Root)
+ : replaceDominatedUsesWith(LHS, RHS, *DT, Root.getStart());
+
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ // Cached information for anything that uses LHS will be invalid.
+ if (MD)
+ MD->invalidateCachedPointerInfo(LHS);
+ }
+
+ // Now try to deduce additional equalities from this one. For example, if
+ // the known equality was "(A != B)" == "false" then it follows that A and B
+ // are equal in the scope. Only boolean equalities with an explicit true or
+ // false RHS are currently supported.
+ if (!RHS->getType()->isIntegerTy(1))
+ // Not a boolean equality - bail out.
+ continue;
+ ConstantInt *CI = dyn_cast<ConstantInt>(RHS);
+ if (!CI)
+ // RHS neither 'true' nor 'false' - bail out.
+ continue;
+ // Whether RHS equals 'true'. Otherwise it equals 'false'.
+ bool isKnownTrue = CI->isMinusOne();
+ bool isKnownFalse = !isKnownTrue;
+
+ // If "A && B" is known true then both A and B are known true. If "A || B"
+ // is known false then both A and B are known false.
+ Value *A, *B;
if ((isKnownTrue && match(LHS, m_LogicalAnd(m_Value(A), m_Value(B)))) ||
(isKnownFalse && match(LHS, m_LogicalOr(m_Value(A), m_Value(B))))) {
- Worklist.push_back(std::make_pair(A, RHS));
- Worklist.push_back(std::make_pair(B, RHS));
- continue;
- }
-
- // If we are propagating an equality like "(A == B)" == "true" then also
- // propagate the equality A == B. When propagating a comparison such as
- // "(A >= B)" == "true", replace all instances of "A < B" with "false".
- if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
- Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
-
- // If "A == B" is known true, or "A != B" is known false, then replace
- // A with B everywhere in the scope. For floating point operations, we
- // have to be careful since equality does not always imply equivalence.
- if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) ||
- (isKnownFalse && impliesEquivalanceIfFalse(Cmp)))
- Worklist.push_back(std::make_pair(Op0, Op1));
-
- // If "A >= B" is known true, replace "A < B" with false everywhere.
- CmpInst::Predicate NotPred = Cmp->getInversePredicate();
- Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
- // Since we don't have the instruction "A < B" immediately to hand, work
- // out the value number that it would have and use that to find an
- // appropriate instruction (if any).
- uint32_t NextNum = VN.getNextUnusedValueNumber();
- uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
- // If the number we were assigned was brand new then there is no point in
- // looking for an instruction realizing it: there cannot be one!
- if (Num < NextNum) {
- Value *NotCmp = findLeader(Root.getEnd(), Num);
- if (NotCmp && isa<Instruction>(NotCmp)) {
- unsigned NumReplacements =
- DominatesByEdge
- ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
- : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
- Root.getStart());
- Changed |= NumReplacements > 0;
- NumGVNEqProp += NumReplacements;
- // Cached information for anything that uses NotCmp will be invalid.
- if (MD)
- MD->invalidateCachedPointerInfo(NotCmp);
- }
- }
- // Ensure that any instruction in scope that gets the "A < B" value number
- // is replaced with false.
- // The leader table only tracks basic blocks, not edges. Only add to it if we
- // have the simple case where the edge dominates the end.
- if (RootDominatesEnd)
- addToLeaderTable(Num, NotVal, Root.getEnd());
-
- continue;
- }
- }
-
- return Changed;
-}
-
-/// When calculating availability, handle an instruction
-/// by inserting it into the appropriate sets
-bool GVN::processInstruction(Instruction *I) {
- // Ignore dbg info intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- return false;
-
- // If the instruction can be easily simplified then do so now in preference
- // to value numbering it. Value numbering often exposes redundancies, for
- // example if it determines that %y is equal to %x then the instruction
- // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
- const DataLayout &DL = I->getModule()->getDataLayout();
- if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
- bool Changed = false;
- if (!I->use_empty()) {
- I->replaceAllUsesWith(V);
- Changed = true;
- }
- if (isInstructionTriviallyDead(I, TLI)) {
- markInstructionForDeletion(I);
- Changed = true;
- }
- if (Changed) {
- if (MD && V->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(V);
- ++NumGVNSimpl;
- return true;
- }
- }
-
- if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
- if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
- return processAssumeIntrinsic(IntrinsicI);
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (processLoad(LI))
- return true;
-
- unsigned Num = VN.lookupOrAdd(LI);
- addToLeaderTable(Num, LI, LI->getParent());
- return false;
- }
-
- // For conditional branches, we can perform simple conditional propagation on
- // the condition value itself.
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (!BI->isConditional())
- return false;
-
- if (isa<Constant>(BI->getCondition()))
- return processFoldableCondBr(BI);
-
- Value *BranchCond = BI->getCondition();
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
- // Avoid multiple edges early.
- if (TrueSucc == FalseSucc)
- return false;
-
- BasicBlock *Parent = BI->getParent();
- bool Changed = false;
-
- Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
- BasicBlockEdge TrueE(Parent, TrueSucc);
- Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
-
- Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
- BasicBlockEdge FalseE(Parent, FalseSucc);
- Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
-
- return Changed;
- }
-
- // For switches, propagate the case values into the case destinations.
- if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
- Value *SwitchCond = SI->getCondition();
- BasicBlock *Parent = SI->getParent();
- bool Changed = false;
-
- // Remember how many outgoing edges there are to every successor.
- SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
- for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
- ++SwitchEdges[SI->getSuccessor(i)];
-
- for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
- i != e; ++i) {
- BasicBlock *Dst = i->getCaseSuccessor();
- // If there is only a single edge, propagate the case value into it.
- if (SwitchEdges.lookup(Dst) == 1) {
- BasicBlockEdge E(Parent, Dst);
- Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
- }
- }
- return Changed;
- }
-
- // Instructions with void type don't return a value, so there's
- // no point in trying to find redundancies in them.
- if (I->getType()->isVoidTy())
- return false;
-
- uint32_t NextNum = VN.getNextUnusedValueNumber();
- unsigned Num = VN.lookupOrAdd(I);
-
- // Allocations are always uniquely numbered, so we can save time and memory
- // by fast failing them.
- if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
- addToLeaderTable(Num, I, I->getParent());
- return false;
- }
-
- // If the number we were assigned was a brand new VN, then we don't
- // need to do a lookup to see if the number already exists
- // somewhere in the domtree: it can't!
- if (Num >= NextNum) {
- addToLeaderTable(Num, I, I->getParent());
- return false;
- }
-
- // Perform fast-path value-number based elimination of values inherited from
- // dominators.
- Value *Repl = findLeader(I->getParent(), Num);
- if (!Repl) {
- // Failure, just remember this instance for future use.
- addToLeaderTable(Num, I, I->getParent());
- return false;
- } else if (Repl == I) {
- // If I was the result of a shortcut PRE, it might already be in the table
- // and the best replacement for itself. Nothing to do.
- return false;
- }
-
- // Remove it!
- patchAndReplaceAllUsesWith(I, Repl);
- if (MD && Repl->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(Repl);
- markInstructionForDeletion(I);
- return true;
-}
-
-/// runOnFunction - This is the main transformation entry point for a function.
-bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
- const TargetLibraryInfo &RunTLI, AAResults &RunAA,
- MemoryDependenceResults *RunMD, LoopInfo *LI,
+ Worklist.push_back(std::make_pair(A, RHS));
+ Worklist.push_back(std::make_pair(B, RHS));
+ continue;
+ }
+
+ // If we are propagating an equality like "(A == B)" == "true" then also
+ // propagate the equality A == B. When propagating a comparison such as
+ // "(A >= B)" == "true", replace all instances of "A < B" with "false".
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(LHS)) {
+ Value *Op0 = Cmp->getOperand(0), *Op1 = Cmp->getOperand(1);
+
+ // If "A == B" is known true, or "A != B" is known false, then replace
+ // A with B everywhere in the scope. For floating point operations, we
+ // have to be careful since equality does not always imply equivalence.
+ if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) ||
+ (isKnownFalse && impliesEquivalanceIfFalse(Cmp)))
+ Worklist.push_back(std::make_pair(Op0, Op1));
+
+ // If "A >= B" is known true, replace "A < B" with false everywhere.
+ CmpInst::Predicate NotPred = Cmp->getInversePredicate();
+ Constant *NotVal = ConstantInt::get(Cmp->getType(), isKnownFalse);
+ // Since we don't have the instruction "A < B" immediately to hand, work
+ // out the value number that it would have and use that to find an
+ // appropriate instruction (if any).
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ uint32_t Num = VN.lookupOrAddCmp(Cmp->getOpcode(), NotPred, Op0, Op1);
+ // If the number we were assigned was brand new then there is no point in
+ // looking for an instruction realizing it: there cannot be one!
+ if (Num < NextNum) {
+ Value *NotCmp = findLeader(Root.getEnd(), Num);
+ if (NotCmp && isa<Instruction>(NotCmp)) {
+ unsigned NumReplacements =
+ DominatesByEdge
+ ? replaceDominatedUsesWith(NotCmp, NotVal, *DT, Root)
+ : replaceDominatedUsesWith(NotCmp, NotVal, *DT,
+ Root.getStart());
+ Changed |= NumReplacements > 0;
+ NumGVNEqProp += NumReplacements;
+ // Cached information for anything that uses NotCmp will be invalid.
+ if (MD)
+ MD->invalidateCachedPointerInfo(NotCmp);
+ }
+ }
+ // Ensure that any instruction in scope that gets the "A < B" value number
+ // is replaced with false.
+ // The leader table only tracks basic blocks, not edges. Only add to it if we
+ // have the simple case where the edge dominates the end.
+ if (RootDominatesEnd)
+ addToLeaderTable(Num, NotVal, Root.getEnd());
+
+ continue;
+ }
+ }
+
+ return Changed;
+}
+
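The worklist structure of propagateEquality above is easiest to see in isolation: each deduced fact may spawn further facts (a true AND makes both operands true, a false OR makes both operands false). Below is a toy self-contained version over an index-based expression pool; the Kind/Expr types and propagateBooleanFacts are invented for this sketch and ignore the dominance checks and use replacement entirely.

#include <utility>
#include <vector>

enum class Kind { Leaf, And, Or };
struct Expr { Kind K = Kind::Leaf; int A = -1, B = -1; };

// Deduce boolean facts transitively: returns (expression index, known value)
// pairs, starting from one root fact.
std::vector<std::pair<int, bool>>
propagateBooleanFacts(const std::vector<Expr> &Pool, int Root, bool RootVal) {
  std::vector<std::pair<int, bool>> Facts;
  std::vector<std::pair<int, bool>> Worklist{{Root, RootVal}};
  while (!Worklist.empty()) {
    std::pair<int, bool> Item = Worklist.back();
    Worklist.pop_back();
    Facts.push_back(Item);
    const Expr &E = Pool[Item.first];
    if ((E.K == Kind::And && Item.second) ||
        (E.K == Kind::Or && !Item.second)) {
      // A true AND implies both operands are true; a false OR implies both
      // operands are false.
      Worklist.push_back({E.A, Item.second});
      Worklist.push_back({E.B, Item.second});
    }
  }
  return Facts;
}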
+/// When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+bool GVN::processInstruction(Instruction *I) {
+ // Ignore dbg info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // If the instruction can be easily simplified then do so now in preference
+ // to value numbering it. Value numbering often exposes redundancies, for
+ // example if it determines that %y is equal to %x then the instruction
+ // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify.
+ const DataLayout &DL = I->getModule()->getDataLayout();
+ if (Value *V = SimplifyInstruction(I, {DL, TLI, DT, AC})) {
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(V);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ markInstructionForDeletion(I);
+ Changed = true;
+ }
+ if (Changed) {
+ if (MD && V->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(V);
+ ++NumGVNSimpl;
+ return true;
+ }
+ }
+
+ if (IntrinsicInst *IntrinsicI = dyn_cast<IntrinsicInst>(I))
+ if (IntrinsicI->getIntrinsicID() == Intrinsic::assume)
+ return processAssumeIntrinsic(IntrinsicI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ if (processLoad(LI))
+ return true;
+
+ unsigned Num = VN.lookupOrAdd(LI);
+ addToLeaderTable(Num, LI, LI->getParent());
+ return false;
+ }
+
+ // For conditional branches, we can perform simple conditional propagation on
+ // the condition value itself.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (!BI->isConditional())
+ return false;
+
+ if (isa<Constant>(BI->getCondition()))
+ return processFoldableCondBr(BI);
+
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ // Avoid multiple edges early.
+ if (TrueSucc == FalseSucc)
+ return false;
+
+ BasicBlock *Parent = BI->getParent();
+ bool Changed = false;
+
+ Value *TrueVal = ConstantInt::getTrue(TrueSucc->getContext());
+ BasicBlockEdge TrueE(Parent, TrueSucc);
+ Changed |= propagateEquality(BranchCond, TrueVal, TrueE, true);
+
+ Value *FalseVal = ConstantInt::getFalse(FalseSucc->getContext());
+ BasicBlockEdge FalseE(Parent, FalseSucc);
+ Changed |= propagateEquality(BranchCond, FalseVal, FalseE, true);
+
+ return Changed;
+ }
+
+ // For switches, propagate the case values into the case destinations.
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+ Value *SwitchCond = SI->getCondition();
+ BasicBlock *Parent = SI->getParent();
+ bool Changed = false;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, n = SI->getNumSuccessors(); i != n; ++i)
+ ++SwitchEdges[SI->getSuccessor(i)];
+
+ for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
+ i != e; ++i) {
+ BasicBlock *Dst = i->getCaseSuccessor();
+ // If there is only a single edge, propagate the case value into it.
+ if (SwitchEdges.lookup(Dst) == 1) {
+ BasicBlockEdge E(Parent, Dst);
+ Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
+ }
+ }
+ return Changed;
+ }
+
+ // Instructions with void type don't return a value, so there's
+ // no point in trying to find redundancies in them.
+ if (I->getType()->isVoidTy())
+ return false;
+
+ uint32_t NextNum = VN.getNextUnusedValueNumber();
+ unsigned Num = VN.lookupOrAdd(I);
+
+ // Allocations are always uniquely numbered, so we can save time and memory
+ // by fast failing them.
+ if (isa<AllocaInst>(I) || I->isTerminator() || isa<PHINode>(I)) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // If the number we were assigned was a brand new VN, then we don't
+ // need to do a lookup to see if the number already exists
+ // somewhere in the domtree: it can't!
+ if (Num >= NextNum) {
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ }
+
+ // Perform fast-path value-number based elimination of values inherited from
+ // dominators.
+ Value *Repl = findLeader(I->getParent(), Num);
+ if (!Repl) {
+ // Failure, just remember this instance for future use.
+ addToLeaderTable(Num, I, I->getParent());
+ return false;
+ } else if (Repl == I) {
+ // If I was the result of a shortcut PRE, it might already be in the table
+ // and the best replacement for itself. Nothing to do.
+ return false;
+ }
+
+ // Remove it!
+ patchAndReplaceAllUsesWith(I, Repl);
+ if (MD && Repl->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Repl);
+ markInstructionForDeletion(I);
+ return true;
+}
+
+/// runOnFunction - This is the main transformation entry point for a function.
+bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
+ const TargetLibraryInfo &RunTLI, AAResults &RunAA,
+ MemoryDependenceResults *RunMD, LoopInfo *LI,
OptimizationRemarkEmitter *RunORE, MemorySSA *MSSA) {
- AC = &RunAC;
- DT = &RunDT;
- VN.setDomTree(DT);
- TLI = &RunTLI;
- VN.setAliasAnalysis(&RunAA);
- MD = RunMD;
- ImplicitControlFlowTracking ImplicitCFT;
- ICF = &ImplicitCFT;
- this->LI = LI;
- VN.setMemDep(MD);
- ORE = RunORE;
- InvalidBlockRPONumbers = true;
+ AC = &RunAC;
+ DT = &RunDT;
+ VN.setDomTree(DT);
+ TLI = &RunTLI;
+ VN.setAliasAnalysis(&RunAA);
+ MD = RunMD;
+ ImplicitControlFlowTracking ImplicitCFT;
+ ICF = &ImplicitCFT;
+ this->LI = LI;
+ VN.setMemDep(MD);
+ ORE = RunORE;
+ InvalidBlockRPONumbers = true;
MemorySSAUpdater Updater(MSSA);
MSSAU = MSSA ? &Updater : nullptr;
-
- bool Changed = false;
- bool ShouldContinue = true;
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- // Merge unconditional branches, allowing PRE to catch more
- // optimization opportunities.
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
- BasicBlock *BB = &*FI++;
-
+
+ bool Changed = false;
+ bool ShouldContinue = true;
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ) {
+ BasicBlock *BB = &*FI++;
+
bool removedBlock = MergeBlockIntoPredecessor(BB, &DTU, LI, MSSAU, MD);
- if (removedBlock)
- ++NumGVNBlocks;
-
- Changed |= removedBlock;
- }
-
- unsigned Iteration = 0;
- while (ShouldContinue) {
- LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
- ShouldContinue = iterateOnFunction(F);
- Changed |= ShouldContinue;
- ++Iteration;
- }
-
- if (isPREEnabled()) {
- // Fabricate val-num for dead-code in order to suppress assertion in
- // performPRE().
- assignValNumForDeadCode();
- bool PREChanged = true;
- while (PREChanged) {
- PREChanged = performPRE(F);
- Changed |= PREChanged;
- }
- }
-
- // FIXME: Should perform GVN again after PRE does something. PRE can move
- // computations into blocks where they become fully redundant. Note that
- // we can't do this until PRE's critical edge splitting updates memdep.
- // Actually, when this happens, we should just fully integrate PRE into GVN.
-
- cleanupGlobalSets();
- // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
- // iteration.
- DeadBlocks.clear();
-
+ if (removedBlock)
+ ++NumGVNBlocks;
+
+ Changed |= removedBlock;
+ }
+
+ unsigned Iteration = 0;
+ while (ShouldContinue) {
+ LLVM_DEBUG(dbgs() << "GVN iteration: " << Iteration << "\n");
+ ShouldContinue = iterateOnFunction(F);
+ Changed |= ShouldContinue;
+ ++Iteration;
+ }
+
+ if (isPREEnabled()) {
+ // Fabricate val-num for dead-code in order to suppress assertion in
+ // performPRE().
+ assignValNumForDeadCode();
+ bool PREChanged = true;
+ while (PREChanged) {
+ PREChanged = performPRE(F);
+ Changed |= PREChanged;
+ }
+ }
+
+ // FIXME: Should perform GVN again after PRE does something. PRE can move
+ // computations into blocks where they become fully redundant. Note that
+ // we can't do this until PRE's critical edge splitting updates memdep.
+ // Actually, when this happens, we should just fully integrate PRE into GVN.
+
+ cleanupGlobalSets();
+ // Do not cleanup DeadBlocks in cleanupGlobalSets() as it's called for each
+ // iteration.
+ DeadBlocks.clear();
+
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
- return Changed;
-}
-
-bool GVN::processBlock(BasicBlock *BB) {
- // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
- // (and incrementing BI before processing an instruction).
- assert(InstrsToErase.empty() &&
- "We expect InstrsToErase to be empty across iterations");
- if (DeadBlocks.count(BB))
- return false;
-
- // Clearing map before every BB because it can be used only for single BB.
- ReplaceOperandsWithMap.clear();
- bool ChangedFunction = false;
-
- for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
- BI != BE;) {
- if (!ReplaceOperandsWithMap.empty())
- ChangedFunction |= replaceOperandsForInBlockEquality(&*BI);
- ChangedFunction |= processInstruction(&*BI);
-
- if (InstrsToErase.empty()) {
- ++BI;
- continue;
- }
-
- // If we need some instructions deleted, do it now.
- NumGVNInstr += InstrsToErase.size();
-
- // Avoid iterator invalidation.
- bool AtStart = BI == BB->begin();
- if (!AtStart)
- --BI;
-
- for (auto *I : InstrsToErase) {
- assert(I->getParent() == BB && "Removing instruction from wrong block?");
- LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
- salvageKnowledge(I, AC);
- salvageDebugInfo(*I);
- if (MD) MD->removeInstruction(I);
+ return Changed;
+}
+
+bool GVN::processBlock(BasicBlock *BB) {
+ // FIXME: Kill off InstrsToErase by doing erasing eagerly in a helper function
+ // (and incrementing BI before processing an instruction).
+ assert(InstrsToErase.empty() &&
+ "We expect InstrsToErase to be empty across iterations");
+ if (DeadBlocks.count(BB))
+ return false;
+
+ // Clearing map before every BB because it can be used only for single BB.
+ ReplaceOperandsWithMap.clear();
+ bool ChangedFunction = false;
+
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+ BI != BE;) {
+ if (!ReplaceOperandsWithMap.empty())
+ ChangedFunction |= replaceOperandsForInBlockEquality(&*BI);
+ ChangedFunction |= processInstruction(&*BI);
+
+ if (InstrsToErase.empty()) {
+ ++BI;
+ continue;
+ }
+
+ // If we need some instructions deleted, do it now.
+ NumGVNInstr += InstrsToErase.size();
+
+ // Avoid iterator invalidation.
+ bool AtStart = BI == BB->begin();
+ if (!AtStart)
+ --BI;
+
+ for (auto *I : InstrsToErase) {
+ assert(I->getParent() == BB && "Removing instruction from wrong block?");
+ LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageKnowledge(I, AC);
+ salvageDebugInfo(*I);
+ if (MD) MD->removeInstruction(I);
if (MSSAU)
MSSAU->removeMemoryAccess(I);
- LLVM_DEBUG(verifyRemoved(I));
- ICF->removeInstruction(I);
- I->eraseFromParent();
- }
- InstrsToErase.clear();
-
- if (AtStart)
- BI = BB->begin();
- else
- ++BI;
- }
-
- return ChangedFunction;
-}
-
-// Instantiate an expression in a predecessor that lacked it.
-bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
- BasicBlock *Curr, unsigned int ValNo) {
- // Because we are going top-down through the block, all value numbers
- // will be available in the predecessor by the time we need them. Any
- // that weren't originally present will have been instantiated earlier
- // in this loop.
- bool success = true;
- for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
- Value *Op = Instr->getOperand(i);
- if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
- continue;
- // This could be a newly inserted instruction, in which case, we won't
- // find a value number, and should give up before we hurt ourselves.
- // FIXME: Rewrite the infrastructure to make it easier to value number
- // and process newly inserted instructions.
- if (!VN.exists(Op)) {
- success = false;
- break;
- }
- uint32_t TValNo =
- VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
- if (Value *V = findLeader(Pred, TValNo)) {
- Instr->setOperand(i, V);
- } else {
- success = false;
- break;
- }
- }
-
- // Fail out if we encounter an operand that is not available in
- // the PRE predecessor. This is typically because of loads which
- // are not value numbered precisely.
- if (!success)
- return false;
-
- Instr->insertBefore(Pred->getTerminator());
- Instr->setName(Instr->getName() + ".pre");
- Instr->setDebugLoc(Instr->getDebugLoc());
-
- unsigned Num = VN.lookupOrAdd(Instr);
- VN.add(Instr, Num);
-
- // Update the availability map to include the new instruction.
- addToLeaderTable(Num, Instr, Pred);
- return true;
-}
-
-bool GVN::performScalarPRE(Instruction *CurInst) {
- if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
- isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
- CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
- isa<DbgInfoIntrinsic>(CurInst))
- return false;
-
- // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
- // sinking the compare again, and it would force the code generator to
- // move the i1 from processor flags or predicate registers into a general
- // purpose register.
- if (isa<CmpInst>(CurInst))
- return false;
-
- // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from
- // sinking the addressing mode computation back to its uses. Extending the
- // GEP's live range increases the register pressure, and therefore it can
- // introduce unnecessary spills.
- //
- // This doesn't prevent Load PRE. PHI translation will make the GEP available
- // to the load by moving it to the predecessor block if necessary.
- if (isa<GetElementPtrInst>(CurInst))
- return false;
-
+ LLVM_DEBUG(verifyRemoved(I));
+ ICF->removeInstruction(I);
+ I->eraseFromParent();
+ }
+ InstrsToErase.clear();
+
+ if (AtStart)
+ BI = BB->begin();
+ else
+ ++BI;
+ }
+
+ return ChangedFunction;
+}
+
+// Instantiate an expression in a predecessor that lacked it.
+bool GVN::performScalarPREInsertion(Instruction *Instr, BasicBlock *Pred,
+ BasicBlock *Curr, unsigned int ValNo) {
+ // Because we are going top-down through the block, all value numbers
+ // will be available in the predecessor by the time we need them. Any
+ // that weren't originally present will have been instantiated earlier
+ // in this loop.
+ bool success = true;
+ for (unsigned i = 0, e = Instr->getNumOperands(); i != e; ++i) {
+ Value *Op = Instr->getOperand(i);
+ if (isa<Argument>(Op) || isa<Constant>(Op) || isa<GlobalValue>(Op))
+ continue;
+ // This could be a newly inserted instruction, in which case, we won't
+ // find a value number, and should give up before we hurt ourselves.
+ // FIXME: Rewrite the infrastructure to make it easier to value number
+ // and process newly inserted instructions.
+ if (!VN.exists(Op)) {
+ success = false;
+ break;
+ }
+ uint32_t TValNo =
+ VN.phiTranslate(Pred, Curr, VN.lookup(Op), *this);
+ if (Value *V = findLeader(Pred, TValNo)) {
+ Instr->setOperand(i, V);
+ } else {
+ success = false;
+ break;
+ }
+ }
+
+ // Fail out if we encounter an operand that is not available in
+ // the PRE predecessor. This is typically because of loads which
+ // are not value numbered precisely.
+ if (!success)
+ return false;
+
+ Instr->insertBefore(Pred->getTerminator());
+ Instr->setName(Instr->getName() + ".pre");
+ Instr->setDebugLoc(Instr->getDebugLoc());
+
+ unsigned Num = VN.lookupOrAdd(Instr);
+ VN.add(Instr, Num);
+
+ // Update the availability map to include the new instruction.
+ addToLeaderTable(Num, Instr, Pred);
+ return true;
+}
+
+bool GVN::performScalarPRE(Instruction *CurInst) {
+ if (isa<AllocaInst>(CurInst) || CurInst->isTerminator() ||
+ isa<PHINode>(CurInst) || CurInst->getType()->isVoidTy() ||
+ CurInst->mayReadFromMemory() || CurInst->mayHaveSideEffects() ||
+ isa<DbgInfoIntrinsic>(CurInst))
+ return false;
+
+ // Don't do PRE on compares. The PHI would prevent CodeGenPrepare from
+ // sinking the compare again, and it would force the code generator to
+ // move the i1 from processor flags or predicate registers into a general
+ // purpose register.
+ if (isa<CmpInst>(CurInst))
+ return false;
+
+ // Don't do PRE on GEPs. The inserted PHI would prevent CodeGenPrepare from
+ // sinking the addressing mode computation back to its uses. Extending the
+ // GEP's live range increases the register pressure, and therefore it can
+ // introduce unnecessary spills.
+ //
+ // This doesn't prevent Load PRE. PHI translation will make the GEP available
+ // to the load by moving it to the predecessor block if necessary.
+ if (isa<GetElementPtrInst>(CurInst))
+ return false;
+
if (auto *CallB = dyn_cast<CallBase>(CurInst)) {
// We don't currently value number ANY inline asm calls.
- if (CallB->isInlineAsm())
- return false;
+ if (CallB->isInlineAsm())
+ return false;
// Don't do PRE on convergent calls.
if (CallB->isConvergent())
return false;
}
-
- uint32_t ValNo = VN.lookup(CurInst);
-
- // Look for the predecessors for PRE opportunities. We're
- // only trying to solve the basic diamond case, where
- // a value is computed in the successor and one predecessor,
- // but not the other. We also explicitly disallow cases
- // where the successor is its own predecessor, because they're
- // more complicated to get right.
- unsigned NumWith = 0;
- unsigned NumWithout = 0;
- BasicBlock *PREPred = nullptr;
- BasicBlock *CurrentBlock = CurInst->getParent();
-
- // Update the RPO numbers for this function.
- if (InvalidBlockRPONumbers)
- assignBlockRPONumber(*CurrentBlock->getParent());
-
- SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
- for (BasicBlock *P : predecessors(CurrentBlock)) {
- // We're not interested in PRE of blocks whose predecessors are
- // not reachable.
- if (!DT->isReachableFromEntry(P)) {
- NumWithout = 2;
- break;
- }
- // It is not safe to do PRE when P->CurrentBlock is a loop backedge, and
- // when CurInst has an operand defined in CurrentBlock (so it may be defined
- // by phi in the loop header).
- assert(BlockRPONumber.count(P) && BlockRPONumber.count(CurrentBlock) &&
- "Invalid BlockRPONumber map.");
- if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
- llvm::any_of(CurInst->operands(), [&](const Use &U) {
- if (auto *Inst = dyn_cast<Instruction>(U.get()))
- return Inst->getParent() == CurrentBlock;
- return false;
- })) {
- NumWithout = 2;
- break;
- }
-
- uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
- Value *predV = findLeader(P, TValNo);
- if (!predV) {
- predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
- PREPred = P;
- ++NumWithout;
- } else if (predV == CurInst) {
- /* CurInst dominates this predecessor. */
- NumWithout = 2;
- break;
- } else {
- predMap.push_back(std::make_pair(predV, P));
- ++NumWith;
- }
- }
-
- // Don't do PRE when it might increase code size, i.e. when
- // we would need to insert instructions in more than one pred.
- if (NumWithout > 1 || NumWith == 0)
- return false;
-
- // We may have a case where all predecessors have the instruction,
- // and we just need to insert a phi node. Otherwise, perform
- // insertion.
- Instruction *PREInstr = nullptr;
-
- if (NumWithout != 0) {
- if (!isSafeToSpeculativelyExecute(CurInst)) {
- // It is only valid to insert a new instruction if the current instruction
- // is always executed. An instruction with implicit control flow could
- // prevent us from doing it. If we cannot speculate the execution, then
- // PRE should be prohibited.
- if (ICF->isDominatedByICFIFromSameBlock(CurInst))
- return false;
- }
-
- // Don't do PRE across indirect branch.
- if (isa<IndirectBrInst>(PREPred->getTerminator()))
- return false;
-
- // Don't do PRE across callbr.
- // FIXME: Can we do this across the fallthrough edge?
- if (isa<CallBrInst>(PREPred->getTerminator()))
- return false;
-
- // We can't do PRE safely on a critical edge, so instead we schedule
- // the edge to be split and perform the PRE the next time we iterate
- // on the function.
- unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
- if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
- toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
- return false;
- }
- // We need to insert somewhere, so let's give it a shot
- PREInstr = CurInst->clone();
- if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
- // If we failed insertion, make sure we remove the instruction.
- LLVM_DEBUG(verifyRemoved(PREInstr));
- PREInstr->deleteValue();
- return false;
- }
- }
-
- // Either we should have filled in the PRE instruction, or we should
- // not have needed insertions.
- assert(PREInstr != nullptr || NumWithout == 0);
-
- ++NumGVNPRE;
-
- // Create a PHI to make the value available in this block.
- PHINode *Phi =
- PHINode::Create(CurInst->getType(), predMap.size(),
- CurInst->getName() + ".pre-phi", &CurrentBlock->front());
- for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
- if (Value *V = predMap[i].first) {
- // If we use an existing value in this phi, we have to patch the original
- // value because the phi will be used to replace a later value.
- patchReplacementInstruction(CurInst, V);
- Phi->addIncoming(V, predMap[i].second);
- } else
- Phi->addIncoming(PREInstr, PREPred);
- }
-
- VN.add(Phi, ValNo);
- // After creating a new PHI for ValNo, the phi translate result for ValNo will
- // be changed, so erase the related stale entries in phi translate cache.
- VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
- addToLeaderTable(ValNo, Phi, CurrentBlock);
- Phi->setDebugLoc(CurInst->getDebugLoc());
- CurInst->replaceAllUsesWith(Phi);
- if (MD && Phi->getType()->isPtrOrPtrVectorTy())
- MD->invalidateCachedPointerInfo(Phi);
- VN.erase(CurInst);
- removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
-
- LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
- if (MD)
- MD->removeInstruction(CurInst);
+
+ uint32_t ValNo = VN.lookup(CurInst);
+
+ // Look for the predecessors for PRE opportunities. We're
+ // only trying to solve the basic diamond case, where
+ // a value is computed in the successor and one predecessor,
+ // but not the other. We also explicitly disallow cases
+ // where the successor is its own predecessor, because they're
+ // more complicated to get right.
+ unsigned NumWith = 0;
+ unsigned NumWithout = 0;
+ BasicBlock *PREPred = nullptr;
+ BasicBlock *CurrentBlock = CurInst->getParent();
+
+ // Update the RPO numbers for this function.
+ if (InvalidBlockRPONumbers)
+ assignBlockRPONumber(*CurrentBlock->getParent());
+
+ SmallVector<std::pair<Value *, BasicBlock *>, 8> predMap;
+ for (BasicBlock *P : predecessors(CurrentBlock)) {
+    // We're not interested in PRE when one of the predecessors is not
+    // reachable from the entry block.
+ if (!DT->isReachableFromEntry(P)) {
+ NumWithout = 2;
+ break;
+ }
+    // It is not safe to do PRE when P->CurrentBlock is a loop backedge and
+    // CurInst has an operand defined in CurrentBlock (so it may be defined
+    // by a phi in the loop header).
+ assert(BlockRPONumber.count(P) && BlockRPONumber.count(CurrentBlock) &&
+ "Invalid BlockRPONumber map.");
+ if (BlockRPONumber[P] >= BlockRPONumber[CurrentBlock] &&
+ llvm::any_of(CurInst->operands(), [&](const Use &U) {
+ if (auto *Inst = dyn_cast<Instruction>(U.get()))
+ return Inst->getParent() == CurrentBlock;
+ return false;
+ })) {
+ NumWithout = 2;
+ break;
+ }
+
+ uint32_t TValNo = VN.phiTranslate(P, CurrentBlock, ValNo, *this);
+ Value *predV = findLeader(P, TValNo);
+ if (!predV) {
+ predMap.push_back(std::make_pair(static_cast<Value *>(nullptr), P));
+ PREPred = P;
+ ++NumWithout;
+ } else if (predV == CurInst) {
+ /* CurInst dominates this predecessor. */
+ NumWithout = 2;
+ break;
+ } else {
+ predMap.push_back(std::make_pair(predV, P));
+ ++NumWith;
+ }
+ }
+
+ // Don't do PRE when it might increase code size, i.e. when
+ // we would need to insert instructions in more than one pred.
+ if (NumWithout > 1 || NumWith == 0)
+ return false;
+
+ // We may have a case where all predecessors have the instruction,
+ // and we just need to insert a phi node. Otherwise, perform
+ // insertion.
+ Instruction *PREInstr = nullptr;
+
+ if (NumWithout != 0) {
+ if (!isSafeToSpeculativelyExecute(CurInst)) {
+ // It is only valid to insert a new instruction if the current instruction
+ // is always executed. An instruction with implicit control flow could
+ // prevent us from doing it. If we cannot speculate the execution, then
+ // PRE should be prohibited.
+ if (ICF->isDominatedByICFIFromSameBlock(CurInst))
+ return false;
+ }
+
+ // Don't do PRE across indirect branch.
+ if (isa<IndirectBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // Don't do PRE across callbr.
+ // FIXME: Can we do this across the fallthrough edge?
+ if (isa<CallBrInst>(PREPred->getTerminator()))
+ return false;
+
+ // We can't do PRE safely on a critical edge, so instead we schedule
+ // the edge to be split and perform the PRE the next time we iterate
+ // on the function.
+ unsigned SuccNum = GetSuccessorNumber(PREPred, CurrentBlock);
+ if (isCriticalEdge(PREPred->getTerminator(), SuccNum)) {
+ toSplit.push_back(std::make_pair(PREPred->getTerminator(), SuccNum));
+ return false;
+ }
+ // We need to insert somewhere, so let's give it a shot
+ PREInstr = CurInst->clone();
+ if (!performScalarPREInsertion(PREInstr, PREPred, CurrentBlock, ValNo)) {
+ // If we failed insertion, make sure we remove the instruction.
+ LLVM_DEBUG(verifyRemoved(PREInstr));
+ PREInstr->deleteValue();
+ return false;
+ }
+ }
+
+ // Either we should have filled in the PRE instruction, or we should
+ // not have needed insertions.
+ assert(PREInstr != nullptr || NumWithout == 0);
+
+ ++NumGVNPRE;
+
+ // Create a PHI to make the value available in this block.
+ PHINode *Phi =
+ PHINode::Create(CurInst->getType(), predMap.size(),
+ CurInst->getName() + ".pre-phi", &CurrentBlock->front());
+ for (unsigned i = 0, e = predMap.size(); i != e; ++i) {
+ if (Value *V = predMap[i].first) {
+ // If we use an existing value in this phi, we have to patch the original
+ // value because the phi will be used to replace a later value.
+ patchReplacementInstruction(CurInst, V);
+ Phi->addIncoming(V, predMap[i].second);
+ } else
+ Phi->addIncoming(PREInstr, PREPred);
+ }
+
+ VN.add(Phi, ValNo);
+ // After creating a new PHI for ValNo, the phi translate result for ValNo will
+ // be changed, so erase the related stale entries in phi translate cache.
+ VN.eraseTranslateCacheEntry(ValNo, *CurrentBlock);
+ addToLeaderTable(ValNo, Phi, CurrentBlock);
+ Phi->setDebugLoc(CurInst->getDebugLoc());
+ CurInst->replaceAllUsesWith(Phi);
+ if (MD && Phi->getType()->isPtrOrPtrVectorTy())
+ MD->invalidateCachedPointerInfo(Phi);
+ VN.erase(CurInst);
+ removeFromLeaderTable(ValNo, CurInst, CurrentBlock);
+
+ LLVM_DEBUG(dbgs() << "GVN PRE removed: " << *CurInst << '\n');
+ if (MD)
+ MD->removeInstruction(CurInst);
if (MSSAU)
MSSAU->removeMemoryAccess(CurInst);
- LLVM_DEBUG(verifyRemoved(CurInst));
- // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
- // some assertion failures.
- ICF->removeInstruction(CurInst);
- CurInst->eraseFromParent();
- ++NumGVNInstr;
-
- return true;
-}
-
-/// Perform a purely local form of PRE that looks for diamond
-/// control flow patterns and attempts to perform simple PRE at the join point.
-bool GVN::performPRE(Function &F) {
- bool Changed = false;
- for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
- // Nothing to PRE in the entry block.
- if (CurrentBlock == &F.getEntryBlock())
- continue;
-
- // Don't perform PRE on an EH pad.
- if (CurrentBlock->isEHPad())
- continue;
-
- for (BasicBlock::iterator BI = CurrentBlock->begin(),
- BE = CurrentBlock->end();
- BI != BE;) {
- Instruction *CurInst = &*BI++;
- Changed |= performScalarPRE(CurInst);
- }
- }
-
- if (splitCriticalEdges())
- Changed = true;
-
- return Changed;
-}
-
-/// Split the critical edge connecting the given two blocks, and return
-/// the block inserted to the critical edge.
-BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
- // GVN does not require loop-simplify, do not try to preserve it if it is not
- // possible.
- BasicBlock *BB = SplitCriticalEdge(
- Pred, Succ,
+ LLVM_DEBUG(verifyRemoved(CurInst));
+ // FIXME: Intended to be markInstructionForDeletion(CurInst), but it causes
+ // some assertion failures.
+ ICF->removeInstruction(CurInst);
+ CurInst->eraseFromParent();
+ ++NumGVNInstr;
+
+ return true;
+}
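+
+// Illustrative sketch (hypothetical block and value names): the "diamond"
+// that performScalarPRE targets looks roughly like this in IR:
+//
+//   then:                          ; computes the expression
+//     %v1 = add i32 %a, %b
+//     br label %merge
+//   else:                          ; does not compute it
+//     br label %merge
+//   merge:
+//     %v2 = add i32 %a, %b         ; partially redundant
+//
+// PRE clones the add into %else and then replaces %v2 with
+//   %v2.pre-phi = phi i32 [ %v1, %then ], [ %v1.clone, %else ]
+// so the expression is evaluated at most once on every path through %merge.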
+
+/// Perform a purely local form of PRE that looks for diamond
+/// control flow patterns and attempts to perform simple PRE at the join point.
+bool GVN::performPRE(Function &F) {
+ bool Changed = false;
+ for (BasicBlock *CurrentBlock : depth_first(&F.getEntryBlock())) {
+ // Nothing to PRE in the entry block.
+ if (CurrentBlock == &F.getEntryBlock())
+ continue;
+
+ // Don't perform PRE on an EH pad.
+ if (CurrentBlock->isEHPad())
+ continue;
+
+ for (BasicBlock::iterator BI = CurrentBlock->begin(),
+ BE = CurrentBlock->end();
+ BI != BE;) {
+ Instruction *CurInst = &*BI++;
+ Changed |= performScalarPRE(CurInst);
+ }
+ }
+
+ if (splitCriticalEdges())
+ Changed = true;
+
+ return Changed;
+}
+
+/// Split the critical edge connecting the given two blocks, and return
+/// the block inserted to the critical edge.
+BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
+ // GVN does not require loop-simplify, do not try to preserve it if it is not
+ // possible.
+ BasicBlock *BB = SplitCriticalEdge(
+ Pred, Succ,
CriticalEdgeSplittingOptions(DT, LI, MSSAU).unsetPreserveLoopSimplify());
if (BB) {
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
}
- return BB;
-}
-
-/// Split critical edges found during the previous
-/// iteration that may enable further optimization.
-bool GVN::splitCriticalEdges() {
- if (toSplit.empty())
- return false;
+ return BB;
+}
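+
+// Illustrative note (hypothetical block names): an edge Pred->Succ is critical
+// when Pred has several successors and Succ has several predecessors, e.g.
+//
+//   pred:
+//     br i1 %c, label %succ, label %other   ; pred has two successors
+//   succ:                                   ; succ is also reached from elsewhere
+//
+// Splitting inserts a fresh block on that edge so that code (such as a PRE
+// copy) can be placed on the pred->succ path without affecting other paths.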
+
+/// Split critical edges found during the previous
+/// iteration that may enable further optimization.
+bool GVN::splitCriticalEdges() {
+ if (toSplit.empty())
+ return false;
bool Changed = false;
- do {
- std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
+ do {
+ std::pair<Instruction *, unsigned> Edge = toSplit.pop_back_val();
Changed |= SplitCriticalEdge(Edge.first, Edge.second,
CriticalEdgeSplittingOptions(DT, LI, MSSAU)) !=
nullptr;
- } while (!toSplit.empty());
+ } while (!toSplit.empty());
if (Changed) {
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
}
return Changed;
-}
-
-/// Executes one iteration of GVN
-bool GVN::iterateOnFunction(Function &F) {
- cleanupGlobalSets();
-
- // Top-down walk of the dominator tree
- bool Changed = false;
- // Needed for value numbering with phi construction to work.
- // RPOT walks the graph in its constructor and will not be invalidated during
- // processBlock.
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- for (BasicBlock *BB : RPOT)
- Changed |= processBlock(BB);
-
- return Changed;
-}
-
-void GVN::cleanupGlobalSets() {
- VN.clear();
- LeaderTable.clear();
- BlockRPONumber.clear();
- TableAllocator.Reset();
- ICF->clear();
- InvalidBlockRPONumbers = true;
-}
-
-/// Verify that the specified instruction does not occur in our
-/// internal data structures.
-void GVN::verifyRemoved(const Instruction *Inst) const {
- VN.verifyRemoved(Inst);
-
- // Walk through the value number scope to make sure the instruction isn't
- // ferreted away in it.
- for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
- I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
- const LeaderTableEntry *Node = &I->second;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
-
- while (Node->Next) {
- Node = Node->Next;
- assert(Node->Val != Inst && "Inst still in value numbering scope!");
- }
- }
-}
-
-/// BB is declared dead, which implied other blocks become dead as well. This
-/// function is to add all these blocks to "DeadBlocks". For the dead blocks'
-/// live successors, update their phi nodes by replacing the operands
-/// corresponding to dead blocks with UndefVal.
-void GVN::addDeadBlock(BasicBlock *BB) {
- SmallVector<BasicBlock *, 4> NewDead;
- SmallSetVector<BasicBlock *, 4> DF;
-
- NewDead.push_back(BB);
- while (!NewDead.empty()) {
- BasicBlock *D = NewDead.pop_back_val();
- if (DeadBlocks.count(D))
- continue;
-
- // All blocks dominated by D are dead.
- SmallVector<BasicBlock *, 8> Dom;
- DT->getDescendants(D, Dom);
- DeadBlocks.insert(Dom.begin(), Dom.end());
-
- // Figure out the dominance-frontier(D).
- for (BasicBlock *B : Dom) {
- for (BasicBlock *S : successors(B)) {
- if (DeadBlocks.count(S))
- continue;
-
- bool AllPredDead = true;
- for (BasicBlock *P : predecessors(S))
- if (!DeadBlocks.count(P)) {
- AllPredDead = false;
- break;
- }
-
- if (!AllPredDead) {
- // S could be proved dead later on. That is why we don't update phi
- // operands at this moment.
- DF.insert(S);
- } else {
- // While S is not dominated by D, it is dead by now. This could take
- // place if S already have a dead predecessor before D is declared
- // dead.
- NewDead.push_back(S);
- }
- }
- }
- }
-
- // For the dead blocks' live successors, update their phi nodes by replacing
- // the operands corresponding to dead blocks with UndefVal.
- for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
- I != E; I++) {
- BasicBlock *B = *I;
- if (DeadBlocks.count(B))
- continue;
-
- // First, split the critical edges. This might also create additional blocks
- // to preserve LoopSimplify form and adjust edges accordingly.
+}
+
+/// Executes one iteration of GVN
+bool GVN::iterateOnFunction(Function &F) {
+ cleanupGlobalSets();
+
+ // Top-down walk of the dominator tree
+ bool Changed = false;
+ // Needed for value numbering with phi construction to work.
+ // RPOT walks the graph in its constructor and will not be invalidated during
+ // processBlock.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ for (BasicBlock *BB : RPOT)
+ Changed |= processBlock(BB);
+
+ return Changed;
+}
+
+void GVN::cleanupGlobalSets() {
+ VN.clear();
+ LeaderTable.clear();
+ BlockRPONumber.clear();
+ TableAllocator.Reset();
+ ICF->clear();
+ InvalidBlockRPONumbers = true;
+}
+
+/// Verify that the specified instruction does not occur in our
+/// internal data structures.
+void GVN::verifyRemoved(const Instruction *Inst) const {
+ VN.verifyRemoved(Inst);
+
+ // Walk through the value number scope to make sure the instruction isn't
+ // ferreted away in it.
+ for (DenseMap<uint32_t, LeaderTableEntry>::const_iterator
+ I = LeaderTable.begin(), E = LeaderTable.end(); I != E; ++I) {
+ const LeaderTableEntry *Node = &I->second;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+
+ while (Node->Next) {
+ Node = Node->Next;
+ assert(Node->Val != Inst && "Inst still in value numbering scope!");
+ }
+ }
+}
+
+/// BB is declared dead, which implies other blocks become dead as well. This
+/// function adds all these blocks to "DeadBlocks". For the dead blocks'
+/// live successors, update their phi nodes by replacing the operands
+/// corresponding to dead blocks with UndefVal.
+void GVN::addDeadBlock(BasicBlock *BB) {
+ SmallVector<BasicBlock *, 4> NewDead;
+ SmallSetVector<BasicBlock *, 4> DF;
+
+ NewDead.push_back(BB);
+ while (!NewDead.empty()) {
+ BasicBlock *D = NewDead.pop_back_val();
+ if (DeadBlocks.count(D))
+ continue;
+
+ // All blocks dominated by D are dead.
+ SmallVector<BasicBlock *, 8> Dom;
+ DT->getDescendants(D, Dom);
+ DeadBlocks.insert(Dom.begin(), Dom.end());
+
+ // Figure out the dominance-frontier(D).
+ for (BasicBlock *B : Dom) {
+ for (BasicBlock *S : successors(B)) {
+ if (DeadBlocks.count(S))
+ continue;
+
+ bool AllPredDead = true;
+ for (BasicBlock *P : predecessors(S))
+ if (!DeadBlocks.count(P)) {
+ AllPredDead = false;
+ break;
+ }
+
+ if (!AllPredDead) {
+ // S could be proved dead later on. That is why we don't update phi
+ // operands at this moment.
+ DF.insert(S);
+ } else {
+          // Although S is not dominated by D, it is dead by now. This could
+          // happen if S already had a dead predecessor before D was declared
+          // dead.
+ NewDead.push_back(S);
+ }
+ }
+ }
+ }
+
+ // For the dead blocks' live successors, update their phi nodes by replacing
+ // the operands corresponding to dead blocks with UndefVal.
+ for(SmallSetVector<BasicBlock *, 4>::iterator I = DF.begin(), E = DF.end();
+ I != E; I++) {
+ BasicBlock *B = *I;
+ if (DeadBlocks.count(B))
+ continue;
+
+ // First, split the critical edges. This might also create additional blocks
+ // to preserve LoopSimplify form and adjust edges accordingly.
SmallVector<BasicBlock *, 4> Preds(predecessors(B));
- for (BasicBlock *P : Preds) {
- if (!DeadBlocks.count(P))
- continue;
-
+ for (BasicBlock *P : Preds) {
+ if (!DeadBlocks.count(P))
+ continue;
+
if (llvm::is_contained(successors(P), B) &&
- isCriticalEdge(P->getTerminator(), B)) {
- if (BasicBlock *S = splitCriticalEdges(P, B))
- DeadBlocks.insert(P = S);
- }
- }
-
- // Now undef the incoming values from the dead predecessors.
- for (BasicBlock *P : predecessors(B)) {
- if (!DeadBlocks.count(P))
- continue;
- for (PHINode &Phi : B->phis()) {
- Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
- if (MD)
- MD->invalidateCachedPointerInfo(&Phi);
- }
- }
- }
-}
-
-// If the given branch is recognized as a foldable branch (i.e. conditional
-// branch with constant condition), it will perform following analyses and
-// transformation.
-// 1) If the dead out-coming edge is a critical-edge, split it. Let
-// R be the target of the dead out-coming edge.
-// 1) Identify the set of dead blocks implied by the branch's dead outcoming
-// edge. The result of this step will be {X| X is dominated by R}
-// 2) Identify those blocks which haves at least one dead predecessor. The
-// result of this step will be dominance-frontier(R).
-// 3) Update the PHIs in DF(R) by replacing the operands corresponding to
-// dead blocks with "UndefVal" in an hope these PHIs will optimized away.
-//
-// Return true iff *NEW* dead code are found.
-bool GVN::processFoldableCondBr(BranchInst *BI) {
- if (!BI || BI->isUnconditional())
- return false;
-
- // If a branch has two identical successors, we cannot declare either dead.
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- return false;
-
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- if (!Cond)
- return false;
-
- BasicBlock *DeadRoot =
- Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
- if (DeadBlocks.count(DeadRoot))
- return false;
-
- if (!DeadRoot->getSinglePredecessor())
- DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
-
- addDeadBlock(DeadRoot);
- return true;
-}
-
-// performPRE() will trigger assert if it comes across an instruction without
-// associated val-num. As it normally has far more live instructions than dead
-// instructions, it makes more sense just to "fabricate" a val-number for the
-// dead code than checking if instruction involved is dead or not.
-void GVN::assignValNumForDeadCode() {
- for (BasicBlock *BB : DeadBlocks) {
- for (Instruction &Inst : *BB) {
- unsigned ValNum = VN.lookupOrAdd(&Inst);
- addToLeaderTable(ValNum, &Inst, BB);
- }
- }
-}
-
-class llvm::gvn::GVNLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
- : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
- initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
-
+ isCriticalEdge(P->getTerminator(), B)) {
+ if (BasicBlock *S = splitCriticalEdges(P, B))
+ DeadBlocks.insert(P = S);
+ }
+ }
+
+ // Now undef the incoming values from the dead predecessors.
+ for (BasicBlock *P : predecessors(B)) {
+ if (!DeadBlocks.count(P))
+ continue;
+ for (PHINode &Phi : B->phis()) {
+ Phi.setIncomingValueForBlock(P, UndefValue::get(Phi.getType()));
+ if (MD)
+ MD->invalidateCachedPointerInfo(&Phi);
+ }
+ }
+ }
+}
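+
+// Illustrative sketch (hypothetical names): once a predecessor %dead of a
+// still-live block has been marked dead above, a phi such as
+//
+//   %p = phi i32 [ %x, %dead ], [ %y, %ok ]
+//
+// is rewritten to
+//
+//   %p = phi i32 [ undef, %dead ], [ %y, %ok ]
+//
+// which later simplification can usually fold to %y once the dead edge goes
+// away.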
+
+// If the given branch is recognized as a foldable branch (i.e. a conditional
+// branch with a constant condition), this function performs the following
+// analyses and transformations:
+//  1) If the dead outgoing edge is a critical edge, split it. Let
+//     R be the target of the dead outgoing edge.
+//  2) Identify the set of dead blocks implied by the branch's dead outgoing
+//     edge. The result of this step will be {X | X is dominated by R}.
+//  3) Identify those blocks which have at least one dead predecessor. The
+//     result of this step will be dominance-frontier(R).
+//  4) Update the PHIs in DF(R) by replacing the operands corresponding to
+//     dead blocks with "UndefVal", in the hope that these PHIs will be
+//     optimized away.
+//
+// Return true iff *NEW* dead code is found.
+bool GVN::processFoldableCondBr(BranchInst *BI) {
+ if (!BI || BI->isUnconditional())
+ return false;
+
+ // If a branch has two identical successors, we cannot declare either dead.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return false;
+
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ BasicBlock *DeadRoot =
+ Cond->getZExtValue() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ if (DeadBlocks.count(DeadRoot))
+ return false;
+
+ if (!DeadRoot->getSinglePredecessor())
+ DeadRoot = splitCriticalEdges(BI->getParent(), DeadRoot);
+
+ addDeadBlock(DeadRoot);
+ return true;
+}
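+
+// Illustrative example (hypothetical labels): a branch such as
+//
+//   br i1 true, label %taken, label %nottaken
+//
+// is foldable; %nottaken becomes the dead root R, and addDeadBlock above then
+// marks everything dominated by %nottaken as dead.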
+
+// performPRE() will trigger an assert if it comes across an instruction
+// without an associated value number. As a function normally has far more live
+// instructions than dead instructions, it makes more sense just to "fabricate"
+// a value number for the dead code than to check whether each instruction
+// involved is dead or not.
+void GVN::assignValNumForDeadCode() {
+ for (BasicBlock *BB : DeadBlocks) {
+ for (Instruction &Inst : *BB) {
+ unsigned ValNum = VN.lookupOrAdd(&Inst);
+ addToLeaderTable(ValNum, &Inst, BB);
+ }
+ }
+}
+
+class llvm::gvn::GVNLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
+ : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
+ initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+
auto *MSSAWP = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- return Impl.runImpl(
- F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
- getAnalysis<AAResultsWrapperPass>().getAAResults(),
- Impl.isMemDepEnabled()
- ? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
- : nullptr,
- LIWP ? &LIWP->getLoopInfo() : nullptr,
+ return Impl.runImpl(
+ F, getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
+ getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ Impl.isMemDepEnabled()
+ ? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
+ : nullptr,
+ LIWP ? &LIWP->getLoopInfo() : nullptr,
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(),
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- if (Impl.isMemDepEnabled())
- AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ if (Impl.isMemDepEnabled())
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
- }
-
-private:
- GVN Impl;
-};
-
-char GVNLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
-
-// The public interface to this file...
-FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) {
- return new GVNLegacyPass(NoMemDepAnalysis);
-}
+ }
+
+private:
+ GVN Impl;
+};
+
+char GVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(GVNLegacyPass, "gvn", "Global Value Numbering", false, false)
+
+// The public interface to this file...
+FunctionPass *llvm::createGVNPass(bool NoMemDepAnalysis) {
+ return new GVNLegacyPass(NoMemDepAnalysis);
+}
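+
+// Usage sketch (illustrative; assumes a Module M and the usual includes): with
+// the legacy pass manager the pass created here is typically scheduled as
+//
+//   legacy::PassManager PM;
+//   PM.add(createGVNPass(/*NoMemDepAnalysis=*/false));
+//   PM.run(M);
+//
+// With the new pass manager, the same transform is the GVN class defined in
+// this file and is named "gvn" in a pass pipeline string.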
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
index 136058877c..8d0bd56749 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNHoist.cpp
@@ -1,247 +1,247 @@
-//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists expressions from branches to a common dominator. It uses
-// GVN (global value numbering) to discover expressions computing the same
-// values. The primary goals of code-hoisting are:
-// 1. To reduce the code size.
-// 2. In some cases reduce critical path (by exposing more ILP).
-//
-// The algorithm factors out the reachability of values such that multiple
-// queries to find reachability of values are fast. This is based on finding the
-// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
-// are basically the dominance-frontiers in the inverse graph. So we introduce a
-// data structure (CHI nodes) to keep track of values flowing out of a basic
-// block. We only do this for values with multiple occurrences in the function
-// as they are the potential hoistable candidates. This approach allows us to
-// hoist instructions to a basic block with more than two successors, as well as
-// deal with infinite loops in a trivial way.
-//
-// Limitations: This pass does not hoist fully redundant expressions because
-// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
-// and after gvn-pre because gvn-pre creates opportunities for more instructions
-// to be hoisted.
-//
-// Hoisting may affect the performance in some cases. To mitigate that, hoisting
-// is disabled in the following cases.
-// 1. Scalars across calls.
-// 2. geps when corresponding load/store cannot be hoisted.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "gvn-hoist"
-
-STATISTIC(NumHoisted, "Number of instructions hoisted");
-STATISTIC(NumRemoved, "Number of instructions removed");
-STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
-STATISTIC(NumLoadsRemoved, "Number of loads removed");
-STATISTIC(NumStoresHoisted, "Number of stores hoisted");
-STATISTIC(NumStoresRemoved, "Number of stores removed");
-STATISTIC(NumCallsHoisted, "Number of calls hoisted");
-STATISTIC(NumCallsRemoved, "Number of calls removed");
-
-static cl::opt<int>
- MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
- cl::desc("Max number of instructions to hoist "
- "(default unlimited = -1)"));
-
-static cl::opt<int> MaxNumberOfBBSInPath(
- "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
- cl::desc("Max number of basic blocks on the path between "
- "hoisting locations (default = 4, unlimited = -1)"));
-
-static cl::opt<int> MaxDepthInBB(
- "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
- cl::desc("Hoist instructions from the beginning of the BB up to the "
- "maximum specified depth (default = 100, unlimited = -1)"));
-
-static cl::opt<int>
- MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
- cl::desc("Maximum length of dependent chains to hoist "
- "(default = 10, unlimited = -1)"));
-
-namespace llvm {
-
-using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
-using SmallVecInsn = SmallVector<Instruction *, 4>;
-using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
-
-// Each element of a hoisting list contains the basic block where to hoist and
-// a list of instructions to be hoisted.
-using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
-
-using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
-
-// A map from a pair of VNs to all the instructions with those VNs.
-using VNType = std::pair<unsigned, unsigned>;
-
-using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
-
-// CHI keeps information about values flowing out of a basic block. It is
-// similar to PHI but in the inverse graph, and used for outgoing values on each
-// edge. For conciseness, it is computed only for instructions with multiple
-// occurrences in the CFG because they are the only hoistable candidates.
-// A (CHI[{V, B, I1}, {V, C, I2}]
-// / \
-// / \
-// B(I1) C (I2)
-// The Value number for both I1 and I2 is V, the CHI node will save the
-// instruction as well as the edge where the value is flowing to.
-struct CHIArg {
- VNType VN;
-
- // Edge destination (shows the direction of flow), may not be where the I is.
- BasicBlock *Dest;
-
- // The instruction (VN) which uses the values flowing out of CHI.
- Instruction *I;
-
- bool operator==(const CHIArg &A) const { return VN == A.VN; }
- bool operator!=(const CHIArg &A) const { return !(*this == A); }
-};
-
-using CHIIt = SmallVectorImpl<CHIArg>::iterator;
-using CHIArgs = iterator_range<CHIIt>;
-using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
-using InValuesType =
- DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
-
-// An invalid value number Used when inserting a single value number into
-// VNtoInsns.
-enum : unsigned { InvalidVN = ~2U };
-
-// Records all scalar instructions candidate for code hoisting.
-class InsnInfo {
- VNtoInsns VNtoScalars;
-
-public:
- // Inserts I and its value number in VNtoScalars.
- void insert(Instruction *I, GVN::ValueTable &VN) {
- // Scalar instruction.
- unsigned V = VN.lookupOrAdd(I);
- VNtoScalars[{V, InvalidVN}].push_back(I);
- }
-
- const VNtoInsns &getVNTable() const { return VNtoScalars; }
-};
-
-// Records all load instructions candidate for code hoisting.
-class LoadInfo {
- VNtoInsns VNtoLoads;
-
-public:
- // Insert Load and the value number of its memory address in VNtoLoads.
- void insert(LoadInst *Load, GVN::ValueTable &VN) {
- if (Load->isSimple()) {
- unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
- VNtoLoads[{V, InvalidVN}].push_back(Load);
- }
- }
-
- const VNtoInsns &getVNTable() const { return VNtoLoads; }
-};
-
-// Records all store instructions candidate for code hoisting.
-class StoreInfo {
- VNtoInsns VNtoStores;
-
-public:
- // Insert the Store and a hash number of the store address and the stored
- // value in VNtoStores.
- void insert(StoreInst *Store, GVN::ValueTable &VN) {
- if (!Store->isSimple())
- return;
- // Hash the store address and the stored value.
- Value *Ptr = Store->getPointerOperand();
- Value *Val = Store->getValueOperand();
- VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
- }
-
- const VNtoInsns &getVNTable() const { return VNtoStores; }
-};
-
-// Records all call instructions candidate for code hoisting.
-class CallInfo {
- VNtoInsns VNtoCallsScalars;
- VNtoInsns VNtoCallsLoads;
- VNtoInsns VNtoCallsStores;
-
-public:
- // Insert Call and its value numbering in one of the VNtoCalls* containers.
- void insert(CallInst *Call, GVN::ValueTable &VN) {
- // A call that doesNotAccessMemory is handled as a Scalar,
- // onlyReadsMemory will be handled as a Load instruction,
- // all other calls will be handled as stores.
- unsigned V = VN.lookupOrAdd(Call);
- auto Entry = std::make_pair(V, InvalidVN);
-
- if (Call->doesNotAccessMemory())
- VNtoCallsScalars[Entry].push_back(Call);
- else if (Call->onlyReadsMemory())
- VNtoCallsLoads[Entry].push_back(Call);
- else
- VNtoCallsStores[Entry].push_back(Call);
- }
-
- const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
- const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
- const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
-};
-
-static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
+//===- GVNHoist.cpp - Hoist scalar and load expressions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists expressions from branches to a common dominator. It uses
+// GVN (global value numbering) to discover expressions computing the same
+// values. The primary goals of code-hoisting are:
+// 1. To reduce the code size.
+// 2. In some cases, to reduce the critical path (by exposing more ILP).
+//
+// The algorithm factors out the reachability of values such that multiple
+// queries to find reachability of values are fast. This is based on finding the
+// ANTIC points in the CFG which do not change during hoisting. The ANTIC points
+// are basically the dominance-frontiers in the inverse graph. So we introduce a
+// data structure (CHI nodes) to keep track of values flowing out of a basic
+// block. We only do this for values with multiple occurrences in the function
+// as they are the potential hoistable candidates. This approach allows us to
+// hoist instructions to a basic block with more than two successors, as well as
+// deal with infinite loops in a trivial way.
+//
+// Limitations: This pass does not hoist fully redundant expressions because
+// they are already handled by GVN-PRE. It is advisable to run gvn-hoist before
+// and after gvn-pre because gvn-pre creates opportunities for more instructions
+// to be hoisted.
+//
+// Hoisting may hurt performance in some cases. To mitigate that, hoisting
+// is disabled in the following cases:
+// 1. Scalars across calls.
+// 2. GEPs, when the corresponding load/store cannot be hoisted.
+//===----------------------------------------------------------------------===//
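+
+// Illustrative example (hypothetical values): if both arms of a diamond
+// compute
+//
+//   %x = add i32 %a, %b
+//
+// gvn-hoist keeps a single copy of the add in the common dominator and erases
+// the duplicates, shrinking code size without adding work to any path.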
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-hoist"
+
+STATISTIC(NumHoisted, "Number of instructions hoisted");
+STATISTIC(NumRemoved, "Number of instructions removed");
+STATISTIC(NumLoadsHoisted, "Number of loads hoisted");
+STATISTIC(NumLoadsRemoved, "Number of loads removed");
+STATISTIC(NumStoresHoisted, "Number of stores hoisted");
+STATISTIC(NumStoresRemoved, "Number of stores removed");
+STATISTIC(NumCallsHoisted, "Number of calls hoisted");
+STATISTIC(NumCallsRemoved, "Number of calls removed");
+
+static cl::opt<int>
+ MaxHoistedThreshold("gvn-max-hoisted", cl::Hidden, cl::init(-1),
+ cl::desc("Max number of instructions to hoist "
+ "(default unlimited = -1)"));
+
+static cl::opt<int> MaxNumberOfBBSInPath(
+ "gvn-hoist-max-bbs", cl::Hidden, cl::init(4),
+ cl::desc("Max number of basic blocks on the path between "
+ "hoisting locations (default = 4, unlimited = -1)"));
+
+static cl::opt<int> MaxDepthInBB(
+ "gvn-hoist-max-depth", cl::Hidden, cl::init(100),
+ cl::desc("Hoist instructions from the beginning of the BB up to the "
+ "maximum specified depth (default = 100, unlimited = -1)"));
+
+static cl::opt<int>
+ MaxChainLength("gvn-hoist-max-chain-length", cl::Hidden, cl::init(10),
+ cl::desc("Maximum length of dependent chains to hoist "
+ "(default = 10, unlimited = -1)"));
+
+namespace llvm {
+
+using BBSideEffectsSet = DenseMap<const BasicBlock *, bool>;
+using SmallVecInsn = SmallVector<Instruction *, 4>;
+using SmallVecImplInsn = SmallVectorImpl<Instruction *>;
+
+// Each element of a hoisting list contains the basic block to hoist into and
+// a list of instructions to be hoisted.
+using HoistingPointInfo = std::pair<BasicBlock *, SmallVecInsn>;
+
+using HoistingPointList = SmallVector<HoistingPointInfo, 4>;
+
+// A map from a pair of VNs to all the instructions with those VNs.
+using VNType = std::pair<unsigned, unsigned>;
+
+using VNtoInsns = DenseMap<VNType, SmallVector<Instruction *, 4>>;
+
+// CHI keeps information about values flowing out of a basic block. It is
+// similar to PHI but in the inverse graph, and used for outgoing values on each
+// edge. For conciseness, it is computed only for instructions with multiple
+// occurrences in the CFG because they are the only hoistable candidates.
+// A (CHI[{V, B, I1}, {V, C, I2}])
+// / \
+// / \
+// B(I1) C (I2)
+// The value number for both I1 and I2 is V; the CHI node saves the
+// instruction as well as the edge the value is flowing to.
+struct CHIArg {
+ VNType VN;
+
+ // Edge destination (shows the direction of flow), may not be where the I is.
+ BasicBlock *Dest;
+
+ // The instruction (VN) which uses the values flowing out of CHI.
+ Instruction *I;
+
+ bool operator==(const CHIArg &A) const { return VN == A.VN; }
+ bool operator!=(const CHIArg &A) const { return !(*this == A); }
+};
+
+using CHIIt = SmallVectorImpl<CHIArg>::iterator;
+using CHIArgs = iterator_range<CHIIt>;
+using OutValuesType = DenseMap<BasicBlock *, SmallVector<CHIArg, 2>>;
+using InValuesType =
+ DenseMap<BasicBlock *, SmallVector<std::pair<VNType, Instruction *>, 2>>;
+
+// An invalid value number, used when inserting a single value number into
+// VNtoInsns.
+enum : unsigned { InvalidVN = ~2U };
+
+// Records all scalar instructions that are candidates for code hoisting.
+class InsnInfo {
+ VNtoInsns VNtoScalars;
+
+public:
+ // Inserts I and its value number in VNtoScalars.
+ void insert(Instruction *I, GVN::ValueTable &VN) {
+ // Scalar instruction.
+ unsigned V = VN.lookupOrAdd(I);
+ VNtoScalars[{V, InvalidVN}].push_back(I);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoScalars; }
+};
+
+// Records all load instructions that are candidates for code hoisting.
+class LoadInfo {
+ VNtoInsns VNtoLoads;
+
+public:
+ // Insert Load and the value number of its memory address in VNtoLoads.
+ void insert(LoadInst *Load, GVN::ValueTable &VN) {
+ if (Load->isSimple()) {
+ unsigned V = VN.lookupOrAdd(Load->getPointerOperand());
+ VNtoLoads[{V, InvalidVN}].push_back(Load);
+ }
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoLoads; }
+};
+
+// Records all store instructions that are candidates for code hoisting.
+class StoreInfo {
+ VNtoInsns VNtoStores;
+
+public:
+ // Insert the Store and a hash number of the store address and the stored
+ // value in VNtoStores.
+ void insert(StoreInst *Store, GVN::ValueTable &VN) {
+ if (!Store->isSimple())
+ return;
+ // Hash the store address and the stored value.
+ Value *Ptr = Store->getPointerOperand();
+ Value *Val = Store->getValueOperand();
+ VNtoStores[{VN.lookupOrAdd(Ptr), VN.lookupOrAdd(Val)}].push_back(Store);
+ }
+
+ const VNtoInsns &getVNTable() const { return VNtoStores; }
+};
+
+// Records all call instructions that are candidates for code hoisting.
+class CallInfo {
+ VNtoInsns VNtoCallsScalars;
+ VNtoInsns VNtoCallsLoads;
+ VNtoInsns VNtoCallsStores;
+
+public:
+ // Insert Call and its value numbering in one of the VNtoCalls* containers.
+ void insert(CallInst *Call, GVN::ValueTable &VN) {
+    // A call that doesNotAccessMemory is handled as a Scalar, a call that
+    // onlyReadsMemory is handled as a Load instruction, and all other calls
+    // are handled as Stores.
+ unsigned V = VN.lookupOrAdd(Call);
+ auto Entry = std::make_pair(V, InvalidVN);
+
+ if (Call->doesNotAccessMemory())
+ VNtoCallsScalars[Entry].push_back(Call);
+ else if (Call->onlyReadsMemory())
+ VNtoCallsLoads[Entry].push_back(Call);
+ else
+ VNtoCallsStores[Entry].push_back(Call);
+ }
+
+ const VNtoInsns &getScalarVNTable() const { return VNtoCallsScalars; }
+ const VNtoInsns &getLoadVNTable() const { return VNtoCallsLoads; }
+ const VNtoInsns &getStoreVNTable() const { return VNtoCallsStores; }
+};
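+
+// Illustrative sketch (hypothetical IR values %p and %v) of how the VN-pair
+// keys above are formed:
+//
+//   %l = load i32, i32* %p      -> VNtoLoads key   {VN(%p), InvalidVN}
+//   store i32 %v, i32* %p       -> VNtoStores key  {VN(%p), VN(%v)}
+//   %s = add i32 %v, 1          -> VNtoScalars key {VN(%s), InvalidVN}
+//
+// Two instructions land in the same bucket exactly when their keys match,
+// which is what makes them candidates for hoisting to a common point.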
+
+static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
static const unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
LLVMContext::MD_noalias,
@@ -250,248 +250,248 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
LLVMContext::MD_invariant_load,
LLVMContext::MD_invariant_group,
LLVMContext::MD_access_group};
- combineMetadata(ReplInst, I, KnownIDs, true);
-}
-
-// This pass hoists common computations across branches sharing common
-// dominator. The primary goal is to reduce the code size, and in some
-// cases reduce critical path (by exposing more ILP).
-class GVNHoist {
-public:
- GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
- MemoryDependenceResults *MD, MemorySSA *MSSA)
- : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
- MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
-
+ combineMetadata(ReplInst, I, KnownIDs, true);
+}
+
+// This pass hoists common computations across branches sharing common
+// dominator. The primary goal is to reduce the code size, and in some
+// cases reduce critical path (by exposing more ILP).
+class GVNHoist {
+public:
+ GVNHoist(DominatorTree *DT, PostDominatorTree *PDT, AliasAnalysis *AA,
+ MemoryDependenceResults *MD, MemorySSA *MSSA)
+ : DT(DT), PDT(PDT), AA(AA), MD(MD), MSSA(MSSA),
+ MSSAUpdater(std::make_unique<MemorySSAUpdater>(MSSA)) {}
+
bool run(Function &F);
-
- // Copied from NewGVN.cpp
- // This function provides global ranking of operations so that we can place
- // them in a canonical order. Note that rank alone is not necessarily enough
- // for a complete ordering, as constants all have the same rank. However,
- // generally, we will simplify an operation with all constants so that it
- // doesn't matter what order they appear in.
+
+ // Copied from NewGVN.cpp
+ // This function provides global ranking of operations so that we can place
+ // them in a canonical order. Note that rank alone is not necessarily enough
+ // for a complete ordering, as constants all have the same rank. However,
+ // generally, we will simplify an operation with all constants so that it
+ // doesn't matter what order they appear in.
unsigned int rank(const Value *V) const;
-
-private:
- GVN::ValueTable VN;
- DominatorTree *DT;
- PostDominatorTree *PDT;
- AliasAnalysis *AA;
- MemoryDependenceResults *MD;
- MemorySSA *MSSA;
- std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
- DenseMap<const Value *, unsigned> DFSNumber;
- BBSideEffectsSet BBSideEffects;
- DenseSet<const BasicBlock *> HoistBarrier;
- SmallVector<BasicBlock *, 32> IDFBlocks;
- unsigned NumFuncArgs;
- const bool HoistingGeps = false;
-
- enum InsKind { Unknown, Scalar, Load, Store };
-
- // Return true when there are exception handling in BB.
+
+private:
+ GVN::ValueTable VN;
+ DominatorTree *DT;
+ PostDominatorTree *PDT;
+ AliasAnalysis *AA;
+ MemoryDependenceResults *MD;
+ MemorySSA *MSSA;
+ std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
+ DenseMap<const Value *, unsigned> DFSNumber;
+ BBSideEffectsSet BBSideEffects;
+ DenseSet<const BasicBlock *> HoistBarrier;
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ unsigned NumFuncArgs;
+ const bool HoistingGeps = false;
+
+ enum InsKind { Unknown, Scalar, Load, Store };
+
+  // Return true when there is exception handling in BB.
bool hasEH(const BasicBlock *BB);
-
- // Return true when I1 appears before I2 in the instructions of BB.
- bool firstInBB(const Instruction *I1, const Instruction *I2) {
- assert(I1->getParent() == I2->getParent());
- unsigned I1DFS = DFSNumber.lookup(I1);
- unsigned I2DFS = DFSNumber.lookup(I2);
- assert(I1DFS && I2DFS);
- return I1DFS < I2DFS;
- }
-
- // Return true when there are memory uses of Def in BB.
- bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
+
+ // Return true when I1 appears before I2 in the instructions of BB.
+ bool firstInBB(const Instruction *I1, const Instruction *I2) {
+ assert(I1->getParent() == I2->getParent());
+ unsigned I1DFS = DFSNumber.lookup(I1);
+ unsigned I2DFS = DFSNumber.lookup(I2);
+ assert(I1DFS && I2DFS);
+ return I1DFS < I2DFS;
+ }
+
+ // Return true when there are memory uses of Def in BB.
+ bool hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
const BasicBlock *BB);
-
- bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
+
+ bool hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
int &NBBsOnAllPaths);
-
- // Return true when there are exception handling or loads of memory Def
- // between Def and NewPt. This function is only called for stores: Def is
- // the MemoryDef of the store to be hoisted.
-
- // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
- // return true when the counter NBBsOnAllPaths reaces 0, except when it is
- // initialized to -1 which is unlimited.
- bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
+
+  // Return true when there is exception handling or a load of memory Def
+  // between Def and NewPt. This function is only called for stores: Def is
+  // the MemoryDef of the store to be hoisted.
+
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1, which means unlimited.
+ bool hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
int &NBBsOnAllPaths);
-
- // Return true when there are exception handling between HoistPt and BB.
- // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
- // return true when the counter NBBsOnAllPaths reaches 0, except when it is
- // initialized to -1 which is unlimited.
- bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
+
+  // Return true when there is exception handling between HoistPt and BB.
+  // Decrement NBBsOnAllPaths by 1 for each block between HoistPt and BB, and
+  // return true when the counter NBBsOnAllPaths reaches 0, except when it is
+  // initialized to -1, which means unlimited.
+ bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
int &NBBsOnAllPaths);
-
- // Return true when it is safe to hoist a memory load or store U from OldPt
- // to NewPt.
- bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
+
+ // Return true when it is safe to hoist a memory load or store U from OldPt
+ // to NewPt.
+ bool safeToHoistLdSt(const Instruction *NewPt, const Instruction *OldPt,
MemoryUseOrDef *U, InsKind K, int &NBBsOnAllPaths);
-
- // Return true when it is safe to hoist scalar instructions from all blocks in
- // WL to HoistBB.
- bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
- int &NBBsOnAllPaths) {
- return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
- }
-
- // In the inverse CFG, the dominance frontier of basic block (BB) is the
- // point where ANTIC needs to be computed for instructions which are going
- // to be hoisted. Since this point does not change during gvn-hoist,
- // we compute it only once (on demand).
- // The ides is inspired from:
- // "Partial Redundancy Elimination in SSA Form"
- // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
- // They use similar idea in the forward graph to find fully redundant and
- // partially redundant expressions, here it is used in the inverse graph to
- // find fully anticipable instructions at merge point (post-dominator in
- // the inverse CFG).
- // Returns the edge via which an instruction in BB will get the values from.
-
- // Returns true when the values are flowing out to each edge.
+
+ // Return true when it is safe to hoist scalar instructions from all blocks in
+ // WL to HoistBB.
+ bool safeToHoistScalar(const BasicBlock *HoistBB, const BasicBlock *BB,
+ int &NBBsOnAllPaths) {
+ return !hasEHOnPath(HoistBB, BB, NBBsOnAllPaths);
+ }
+
+  // In the inverse CFG, the dominance frontier of a basic block (BB) is the
+  // point where ANTIC needs to be computed for instructions which are going
+  // to be hoisted. Since this point does not change during gvn-hoist,
+  // we compute it only once (on demand).
+  // The idea is inspired by:
+  // "Partial Redundancy Elimination in SSA Form"
+  // ROBERT KENNEDY, SUN CHAN, SHIN-MING LIU, RAYMOND LO, PENG TU and FRED CHOW
+  // They use a similar idea in the forward graph to find fully redundant and
+  // partially redundant expressions; here it is used in the inverse graph to
+  // find fully anticipable instructions at the merge point (post-dominator in
+  // the inverse CFG).
+  // Returns the edge via which an instruction in BB gets its values.
+
+ // Returns true when the values are flowing out to each edge.
bool valueAnticipable(CHIArgs C, Instruction *TI) const;
-
- // Check if it is safe to hoist values tracked by CHI in the range
- // [Begin, End) and accumulate them in Safe.
- void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
+
+ // Check if it is safe to hoist values tracked by CHI in the range
+ // [Begin, End) and accumulate them in Safe.
+ void checkSafety(CHIArgs C, BasicBlock *BB, InsKind K,
SmallVectorImpl<CHIArg> &Safe);
-
- using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
-
- // Push all the VNs corresponding to BB into RenameStack.
- void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
+
+ using RenameStackType = DenseMap<VNType, SmallVector<Instruction *, 2>>;
+
+ // Push all the VNs corresponding to BB into RenameStack.
+ void fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
RenameStackType &RenameStack);
-
- void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
+
+ void fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
RenameStackType &RenameStack);
-
- // Walk the post-dominator tree top-down and use a stack for each value to
- // store the last value you see. When you hit a CHI from a given edge, the
- // value to use as the argument is at the top of the stack, add the value to
- // CHI and pop.
- void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
- auto Root = PDT->getNode(nullptr);
- if (!Root)
- return;
- // Depth first walk on PDom tree to fill the CHIargs at each PDF.
- RenameStackType RenameStack;
- for (auto Node : depth_first(Root)) {
- BasicBlock *BB = Node->getBlock();
- if (!BB)
- continue;
-
- // Collect all values in BB and push to stack.
- fillRenameStack(BB, ValueBBs, RenameStack);
-
- // Fill outgoing values in each CHI corresponding to BB.
- fillChiArgs(BB, CHIBBs, RenameStack);
- }
- }
-
- // Walk all the CHI-nodes to find ones which have a empty-entry and remove
- // them Then collect all the instructions which are safe to hoist and see if
- // they form a list of anticipable values. OutValues contains CHIs
- // corresponding to each basic block.
- void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
+
+ // Walk the post-dominator tree top-down and use a stack for each value to
+ // store the last value you see. When you hit a CHI from a given edge, the
+  // value to use as the argument is at the top of the stack; add the value to
+  // the CHI and pop.
+ void insertCHI(InValuesType &ValueBBs, OutValuesType &CHIBBs) {
+ auto Root = PDT->getNode(nullptr);
+ if (!Root)
+ return;
+ // Depth first walk on PDom tree to fill the CHIargs at each PDF.
+ RenameStackType RenameStack;
+ for (auto Node : depth_first(Root)) {
+ BasicBlock *BB = Node->getBlock();
+ if (!BB)
+ continue;
+
+ // Collect all values in BB and push to stack.
+ fillRenameStack(BB, ValueBBs, RenameStack);
+
+ // Fill outgoing values in each CHI corresponding to BB.
+ fillChiArgs(BB, CHIBBs, RenameStack);
+ }
+ }
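A minimal sketch of the rename-stack mechanism this walk drives, written against the types declared above (VN and I are placeholders for a value number seen in the current block and its defining instruction, not variables from this file):

    RenameStackType RenameStack;
    // fillRenameStack: while visiting BB top-down in the post-dominator tree,
    // push every definition BB provides for its value number.
    RenameStack[VN].push_back(I);
    // fillChiArgs: a CHI on an edge leaving BB takes the innermost dominating
    // definition, i.e. whatever is currently on top of that value's stack.
    Instruction *Arg = RenameStack[VN].back();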
+
+  // Walk all the CHI-nodes to find ones which have an empty entry and remove
+  // them. Then collect all the instructions which are safe to hoist and see if
+  // they form a list of anticipable values. OutValues contains CHIs
+  // corresponding to each basic block.
+ void findHoistableCandidates(OutValuesType &CHIBBs, InsKind K,
HoistingPointList &HPL);
-
-  // Compute insertion points for each value which can be fully anticipated at
- // a dominator. HPL contains all such values.
- void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
- InsKind K) {
- // Sort VNs based on their rankings
- std::vector<VNType> Ranks;
- for (const auto &Entry : Map) {
- Ranks.push_back(Entry.first);
- }
-
- // TODO: Remove fully-redundant expressions.
-  // Get the instruction from the Map; assume that all the instructions
-  // with the same VN have the same rank (this is an approximation).
- llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) {
- return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin()));
- });
-
- // - Sort VNs according to their rank, and start with lowest ranked VN
- // - Take a VN and for each instruction with same VN
- // - Find the dominance frontier in the inverse graph (PDF)
- // - Insert the chi-node at PDF
- // - Remove the chi-nodes with missing entries
- // - Remove values from CHI-nodes which do not truly flow out, e.g.,
- // modified along the path.
- // - Collect the remaining values that are still anticipable
- SmallVector<BasicBlock *, 2> IDFBlocks;
- ReverseIDFCalculator IDFs(*PDT);
- OutValuesType OutValue;
- InValuesType InValue;
- for (const auto &R : Ranks) {
- const SmallVecInsn &V = Map.lookup(R);
- if (V.size() < 2)
- continue;
- const VNType &VN = R;
- SmallPtrSet<BasicBlock *, 2> VNBlocks;
- for (auto &I : V) {
- BasicBlock *BBI = I->getParent();
- if (!hasEH(BBI))
- VNBlocks.insert(BBI);
- }
-    // Compute the post-dominance frontier of the basic blocks that
-    // contain this VN. The dominance frontier of a block X in the
-    // reverse control-flow graph is the set of blocks upon which X is
-    // control dependent; these are the blocks where a CHI may need to
-    // be inserted.
- IDFs.setDefiningBlocks(VNBlocks);
- IDFBlocks.clear();
- IDFs.calculate(IDFBlocks);
-
- // Make a map of BB vs instructions to be hoisted.
- for (unsigned i = 0; i < V.size(); ++i) {
- InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
- }
- // Insert empty CHI node for this VN. This is used to factor out
- // basic blocks where the ANTIC can potentially change.
+
+  // Compute insertion points for each value which can be fully anticipated at
+ // a dominator. HPL contains all such values.
+ void computeInsertionPoints(const VNtoInsns &Map, HoistingPointList &HPL,
+ InsKind K) {
+ // Sort VNs based on their rankings
+ std::vector<VNType> Ranks;
+ for (const auto &Entry : Map) {
+ Ranks.push_back(Entry.first);
+ }
+
+ // TODO: Remove fully-redundant expressions.
+  // Get the instruction from the Map; assume that all the instructions
+  // with the same VN have the same rank (this is an approximation).
+ llvm::sort(Ranks, [this, &Map](const VNType &r1, const VNType &r2) {
+ return (rank(*Map.lookup(r1).begin()) < rank(*Map.lookup(r2).begin()));
+ });
+
+ // - Sort VNs according to their rank, and start with lowest ranked VN
+ // - Take a VN and for each instruction with same VN
+ // - Find the dominance frontier in the inverse graph (PDF)
+ // - Insert the chi-node at PDF
+ // - Remove the chi-nodes with missing entries
+ // - Remove values from CHI-nodes which do not truly flow out, e.g.,
+ // modified along the path.
+ // - Collect the remaining values that are still anticipable
+ SmallVector<BasicBlock *, 2> IDFBlocks;
+ ReverseIDFCalculator IDFs(*PDT);
+ OutValuesType OutValue;
+ InValuesType InValue;
+ for (const auto &R : Ranks) {
+ const SmallVecInsn &V = Map.lookup(R);
+ if (V.size() < 2)
+ continue;
+ const VNType &VN = R;
+ SmallPtrSet<BasicBlock *, 2> VNBlocks;
+ for (auto &I : V) {
+ BasicBlock *BBI = I->getParent();
+ if (!hasEH(BBI))
+ VNBlocks.insert(BBI);
+ }
+    // Compute the post-dominance frontier of the basic blocks that
+    // contain this VN. The dominance frontier of a block X in the
+    // reverse control-flow graph is the set of blocks upon which X is
+    // control dependent; these are the blocks where a CHI may need to
+    // be inserted.
+ IDFs.setDefiningBlocks(VNBlocks);
+ IDFBlocks.clear();
+ IDFs.calculate(IDFBlocks);
+
+ // Make a map of BB vs instructions to be hoisted.
+ for (unsigned i = 0; i < V.size(); ++i) {
+ InValue[V[i]->getParent()].push_back(std::make_pair(VN, V[i]));
+ }
+ // Insert empty CHI node for this VN. This is used to factor out
+ // basic blocks where the ANTIC can potentially change.
CHIArg EmptyChi = {VN, nullptr, nullptr};
for (auto *IDFBB : IDFBlocks) {
- for (unsigned i = 0; i < V.size(); ++i) {
+ for (unsigned i = 0; i < V.size(); ++i) {
// Ignore spurious PDFs.
if (DT->properlyDominates(IDFBB, V[i]->getParent())) {
OutValue[IDFBB].push_back(EmptyChi);
LLVM_DEBUG(dbgs() << "\nInserting a CHI for BB: "
<< IDFBB->getName() << ", for Insn: " << *V[i]);
- }
- }
- }
- }
-
- // Insert CHI args at each PDF to iterate on factored graph of
- // control dependence.
- insertCHI(InValue, OutValue);
- // Using the CHI args inserted at each PDF, find fully anticipable values.
- findHoistableCandidates(OutValue, K, HPL);
- }
-
- // Return true when all operands of Instr are available at insertion point
- // HoistPt. When limiting the number of hoisted expressions, one could hoist
- // a load without hoisting its access function. So before hoisting any
- // expression, make sure that all its operands are available at insert point.
- bool allOperandsAvailable(const Instruction *I,
+ }
+ }
+ }
+ }
+
+ // Insert CHI args at each PDF to iterate on factored graph of
+ // control dependence.
+ insertCHI(InValue, OutValue);
+ // Using the CHI args inserted at each PDF, find fully anticipable values.
+ findHoistableCandidates(OutValue, K, HPL);
+ }
+
+ // Return true when all operands of Instr are available at insertion point
+ // HoistPt. When limiting the number of hoisted expressions, one could hoist
+ // a load without hoisting its access function. So before hoisting any
+ // expression, make sure that all its operands are available at insert point.
+ bool allOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const;
-
- // Same as allOperandsAvailable with recursive check for GEP operands.
- bool allGepOperandsAvailable(const Instruction *I,
+
+ // Same as allOperandsAvailable with recursive check for GEP operands.
+ bool allGepOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const;
-
- // Make all operands of the GEP available.
- void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
- const SmallVecInsn &InstructionsToHoist,
+
+ // Make all operands of the GEP available.
+ void makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
+ const SmallVecInsn &InstructionsToHoist,
Instruction *Gep) const;
-
+
void updateAlignment(Instruction *I, Instruction *Repl);
-
+
// Remove all the instructions in Candidates and replace their usage with
// Repl. Returns the number of instructions removed.
unsigned rauw(const SmallVecInsn &Candidates, Instruction *Repl,
@@ -655,20 +655,20 @@ bool GVNHoist::hasMemoryUse(const Instruction *NewPt, MemoryDef *Def,
continue;
ReachedNewPt = true;
}
- }
+ }
if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
return true;
}
-
+
return false;
}
-
+
bool GVNHoist::hasEHhelper(const BasicBlock *BB, const BasicBlock *SrcBB,
int &NBBsOnAllPaths) {
// Stop walk once the limit is reached.
if (NBBsOnAllPaths == 0)
return true;
-
+
// Impossible to hoist with exceptions on the path.
if (hasEH(BB))
return true;
@@ -700,8 +700,8 @@ bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
// Stop traversal when reaching HoistPt.
I.skipChildren();
continue;
- }
-
+ }
+
if (hasEHhelper(BB, OldBB, NBBsOnAllPaths))
return true;
@@ -714,8 +714,8 @@ bool GVNHoist::hasEHOrLoadsOnPath(const Instruction *NewPt, MemoryDef *Def,
--NBBsOnAllPaths;
++I;
- }
-
+ }
+
return false;
}
@@ -734,7 +734,7 @@ bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
// Stop traversal when reaching NewHoistPt.
I.skipChildren();
continue;
- }
+ }
if (hasEHhelper(BB, SrcBB, NBBsOnAllPaths))
return true;
@@ -744,11 +744,11 @@ bool GVNHoist::hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
--NBBsOnAllPaths;
++I;
- }
-
+ }
+
return false;
}
-
+
bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt,
const Instruction *OldPt, MemoryUseOrDef *U,
GVNHoist::InsKind K, int &NBBsOnAllPaths) {
@@ -785,12 +785,12 @@ bool GVNHoist::safeToHoistLdSt(const Instruction *NewPt,
return true;
assert(UBB == DBB);
assert(MSSA->locallyDominates(D, U));
- }
-
+ }
+
// No side effects: it is safe to hoist.
return true;
}
-
+
bool GVNHoist::valueAnticipable(CHIArgs C, Instruction *TI) const {
if (TI->getNumSuccessors() > (unsigned)size(C))
return false; // Not enough args in this CHI.
@@ -818,10 +818,10 @@ void GVNHoist::checkSafety(CHIArgs C, BasicBlock *BB, GVNHoist::InsKind K,
if (MemoryUseOrDef *UD = MSSA->getMemoryAccess(Insn))
if (safeToHoistLdSt(T, Insn, UD, K, NumBBsOnAllPaths))
Safe.push_back(CHI);
- }
- }
+ }
+ }
}
-
+
void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
GVNHoist::RenameStackType &RenameStack) {
auto it1 = ValueBBs.find(BB);
@@ -831,10 +831,10 @@ void GVNHoist::fillRenameStack(BasicBlock *BB, InValuesType &ValueBBs,
// Get the value of instruction I
LLVM_DEBUG(dbgs() << "\nPushing on stack: " << *VI.second);
RenameStack[VI.first].push_back(VI.second);
- }
+ }
}
}
-
+
void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
GVNHoist::RenameStackType &RenameStack) {
// For each *predecessor* (because Post-DOM) of BB check if it has a CHI
@@ -869,7 +869,7 @@ void GVNHoist::fillChiArgs(BasicBlock *BB, OutValuesType &CHIBBs,
}
}
}
-
+
void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs,
GVNHoist::InsKind K,
HoistingPointList &HPL) {
@@ -911,9 +911,9 @@ void GVNHoist::findHoistableCandidates(OutValuesType &CHIBBs,
PHIIt = std::find_if(PrevIt, CHIs.end(),
[PrevIt](CHIArg &A) { return A != *PrevIt; });
}
- }
+ }
}
-
+
bool GVNHoist::allOperandsAvailable(const Instruction *I,
const BasicBlock *HoistPt) const {
for (const Use &Op : I->operands())
@@ -932,14 +932,14 @@ bool GVNHoist::allGepOperandsAvailable(const Instruction *I,
if (const GetElementPtrInst *GepOp =
dyn_cast<GetElementPtrInst>(Inst)) {
if (!allGepOperandsAvailable(GepOp, HoistPt))
- return false;
+ return false;
// Gep is available if all operands of GepOp are available.
} else {
// Gep is not available if it has operands other than GEPs that are
// defined in blocks not dominating HoistPt.
- return false;
+ return false;
}
- }
+ }
return true;
}
@@ -959,15 +959,15 @@ void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
// of this GEP available at HoistPt.
if (GetElementPtrInst *GepOp = dyn_cast<GetElementPtrInst>(Op))
makeGepsAvailable(ClonedGep, HoistPt, InstructionsToHoist, GepOp);
- }
-
+ }
+
// Copy Gep and replace its uses in Repl with ClonedGep.
ClonedGep->insertBefore(HoistPt->getTerminator());
-
+
// Conservatively discard any optimization hints, they may differ on the
// other paths.
ClonedGep->dropUnknownNonDebugMetadata();
-
+
// If we have optimization hints which agree with each other along different
// paths, preserve them.
for (const Instruction *OtherInst : InstructionsToHoist) {
@@ -979,7 +979,7 @@ void GVNHoist::makeGepsAvailable(Instruction *Repl, BasicBlock *HoistPt,
cast<StoreInst>(OtherInst)->getPointerOperand());
ClonedGep->andIRFlags(OtherGep);
}
-
+
// Replace uses of Gep with ClonedGep in Repl.
Repl->replaceUsesOfWith(Gep, ClonedGep);
}
@@ -998,9 +998,9 @@ void GVNHoist::updateAlignment(Instruction *I, Instruction *Repl) {
cast<AllocaInst>(I)->getAlign()));
} else if (isa<CallInst>(Repl)) {
++NumCallsRemoved;
- }
+ }
}
-
+
unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
MemoryUseOrDef *NewMemAcc) {
unsigned NR = 0;
@@ -1014,7 +1014,7 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
OldMA->replaceAllUsesWith(NewMemAcc);
MSSAUpdater->removeMemoryAccess(OldMA);
}
-
+
Repl->andIRFlags(I);
combineKnownMetadata(Repl, I);
I->replaceAllUsesWith(Repl);
@@ -1025,13 +1025,13 @@ unsigned GVNHoist::rauw(const SmallVecInsn &Candidates, Instruction *Repl,
}
return NR;
}
-
+
void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) {
SmallPtrSet<MemoryPhi *, 4> UsePhis;
for (User *U : NewMemAcc->users())
if (MemoryPhi *Phi = dyn_cast<MemoryPhi>(U))
UsePhis.insert(Phi);
-
+
for (MemoryPhi *Phi : UsePhis) {
auto In = Phi->incoming_values();
if (llvm::all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
@@ -1040,7 +1040,7 @@ void GVNHoist::raMPHIuw(MemoryUseOrDef *NewMemAcc) {
}
}
}
-
+
unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates,
Instruction *Repl, BasicBlock *DestBB,
bool MoveAccess) {
@@ -1050,16 +1050,16 @@ unsigned GVNHoist::removeAndReplace(const SmallVecInsn &Candidates,
// legal when the ld/st is not moved past its current definition.
MSSAUpdater->moveToPlace(NewMemAcc, DestBB, MemorySSA::BeforeTerminator);
}
-
+
// Replace all other instructions with Repl with memory access NewMemAcc.
unsigned NR = rauw(Candidates, Repl, NewMemAcc);
-
+
// Remove MemorySSA phi nodes with the same arguments.
if (NewMemAcc)
raMPHIuw(NewMemAcc);
return NR;
}
-
+
bool GVNHoist::makeGepOperandsAvailable(
Instruction *Repl, BasicBlock *HoistPt,
const SmallVecInsn &InstructionsToHoist) const {
@@ -1079,21 +1079,21 @@ bool GVNHoist::makeGepOperandsAvailable(
return false;
} else if (!DT->dominates(Val->getParent(), HoistPt))
return false;
- }
+ }
}
-
+
// Check whether we can compute the Gep at HoistPt.
if (!Gep || !allGepOperandsAvailable(Gep, HoistPt))
return false;
-
+
makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Gep);
-
+
if (Val && isa<GetElementPtrInst>(Val))
makeGepsAvailable(Repl, HoistPt, InstructionsToHoist, Val);
-
+
return true;
}
-
+
std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
unsigned NI = 0, NL = 0, NS = 0, NC = 0, NR = 0;
for (const HoistingPointInfo &HP : HPL) {
@@ -1109,7 +1109,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// of the second based on the first.
if (!Repl || firstInBB(I, Repl))
Repl = I;
-
+
// Keep track of whether we moved the instruction so we know whether we
// should move the MemoryAccess.
bool MoveAccess = true;
@@ -1122,7 +1122,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// When we do not find Repl in HoistPt, select the first in the list
// and move it to HoistPt.
Repl = InstructionsToHoist.front();
-
+
// We can move Repl in HoistPt only when all operands are available.
// The order in which hoistings are done may influence the availability
// of operands.
@@ -1135,7 +1135,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
// When not HoistingGeps we need to copy the GEPs.
if (!makeGepOperandsAvailable(Repl, DestBB, InstructionsToHoist))
continue;
- }
+ }
// Move the instruction at the end of HoistPt.
Instruction *Last = DestBB->getTerminator();
@@ -1143,8 +1143,8 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
Repl->moveBefore(Last);
DFSNumber[Repl] = DFSNumber[Last]++;
- }
-
+ }
+
NR += removeAndReplace(InstructionsToHoist, Repl, DestBB, MoveAccess);
if (isa<LoadInst>(Repl))
@@ -1155,11 +1155,11 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
++NC;
else // Scalar
++NI;
- }
-
+ }
+
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
-
+
NumHoisted += NL + NS + NC + NI;
NumRemoved += NR;
NumLoadsHoisted += NL;
@@ -1167,7 +1167,7 @@ std::pair<unsigned, unsigned> GVNHoist::hoist(HoistingPointList &HPL) {
NumCallsHoisted += NC;
return {NI, NL + NC + NS};
}
-
+
std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
InsnInfo II;
LoadInfo LI;
@@ -1186,11 +1186,11 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
// deeper may increase the register pressure and compilation time.
if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
break;
-
+
// Do not value number terminator instructions.
if (I1.isTerminator())
break;
-
+
if (auto *Load = dyn_cast<LoadInst>(&I1))
LI.insert(Load, VN);
else if (auto *Store = dyn_cast<StoreInst>(&I1))
@@ -1216,8 +1216,8 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
// registers than X86.
II.insert(&I1, VN);
}
- }
-
+ }
+
HoistingPointList HPL;
computeInsertionPoints(II.getVNTable(), HPL, InsKind::Scalar);
computeInsertionPoints(LI.getVNTable(), HPL, InsKind::Load);
@@ -1228,35 +1228,35 @@ std::pair<unsigned, unsigned> GVNHoist::hoistExpressions(Function &F) {
return hoist(HPL);
}
-} // end namespace llvm
-
-PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
- MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
- if (!G.run(F))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<MemorySSAAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char GVNHoistLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
- "Early GVN Hoisting of Expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
- "Early GVN Hoisting of Expressions", false, false)
-
-FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
+} // end namespace llvm
+
+PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ GVNHoist G(&DT, &PDT, &AA, &MD, &MSSA);
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNHoistLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(GVNHoistLegacyPass, "gvn-hoist",
+ "Early GVN Hoisting of Expressions", false, false)
+
+FunctionPass *llvm::createGVNHoistPass() { return new GVNHoistLegacyPass(); }
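For orientation, a minimal sketch of scheduling this pass under the new pass manager; the setup below is the usual PassBuilder boilerplate and is not part of this file (GVNHoistPass is declared in llvm/Transforms/Scalar/GVN.h):

    #include "llvm/IR/Module.h"
    #include "llvm/IR/PassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/Scalar/GVN.h" // GVNHoistPass
    using namespace llvm;

    // Run GVNHoist over every function in M.
    static void runGVNHoist(Module &M) {
      LoopAnalysisManager LAM;
      FunctionAnalysisManager FAM;
      CGSCCAnalysisManager CGAM;
      ModuleAnalysisManager MAM;
      PassBuilder PB;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      ModulePassManager MPM;
      MPM.addPass(createModuleToFunctionPassAdaptor(GVNHoistPass()));
      MPM.run(M, MAM);
    }

The createGVNHoistPass() entry point defined above plays the same role for the legacy pass manager.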
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
index c1d1c06eab..aef927ab65 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GVNSink.cpp
@@ -1,930 +1,930 @@
-//===- GVNSink.cpp - sink expressions into successors ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file GVNSink.cpp
-/// This pass attempts to sink instructions into successors, reducing static
-/// instruction count and enabling if-conversion.
-///
-/// We use a variant of global value numbering to decide what can be sunk.
-/// Consider:
-///
-/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
-/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
-/// \ /
-/// [ %e = phi i32 %a2, %c2 ]
-/// [ add i32 %e, 4 ]
-///
-///
-/// GVN would number %a1 and %c1 differently because they compute different
-/// results - the VN of an instruction is a function of its opcode and the
-/// transitive closure of its operands. This is the key property for hoisting
-/// and CSE.
-///
-/// What we want when sinking however is for a numbering that is a function of
-/// the *uses* of an instruction, which allows us to answer the question "if I
-/// replace %a1 with %c1, will it contribute in an equivalent way to all
-/// successive instructions?". The PostValueTable class in GVN provides this
-/// mapping.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/ArrayRecycler.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "gvn-sink"
-
-STATISTIC(NumRemoved, "Number of instructions removed");
-
-namespace llvm {
-namespace GVNExpression {
-
-LLVM_DUMP_METHOD void Expression::dump() const {
- print(dbgs());
- dbgs() << "\n";
-}
-
-} // end namespace GVNExpression
-} // end namespace llvm
-
-namespace {
-
-static bool isMemoryInst(const Instruction *I) {
- return isa<LoadInst>(I) || isa<StoreInst>(I) ||
- (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
- (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
-}
-
-/// Iterates through instructions in a set of blocks in reverse order from the
-/// first non-terminator. For example (assume all blocks have size n):
-/// LockstepReverseIterator I([B1, B2, B3]);
-/// *I-- = [B1[n], B2[n], B3[n]];
-/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
-/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
-/// ...
-///
-/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
-/// to determine which blocks are still going and the order they appear in the
-/// list returned by operator*.
-class LockstepReverseIterator {
- ArrayRef<BasicBlock *> Blocks;
- SmallSetVector<BasicBlock *, 4> ActiveBlocks;
- SmallVector<Instruction *, 4> Insts;
- bool Fail;
-
-public:
- LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
- reset();
- }
-
- void reset() {
- Fail = false;
- ActiveBlocks.clear();
- for (BasicBlock *BB : Blocks)
- ActiveBlocks.insert(BB);
- Insts.clear();
- for (BasicBlock *BB : Blocks) {
- if (BB->size() <= 1) {
- // Block wasn't big enough - only contained a terminator.
- ActiveBlocks.remove(BB);
- continue;
- }
- Insts.push_back(BB->getTerminator()->getPrevNode());
- }
- if (Insts.empty())
- Fail = true;
- }
-
- bool isValid() const { return !Fail; }
- ArrayRef<Instruction *> operator*() const { return Insts; }
-
- // Note: This needs to return a SmallSetVector as the elements of
- // ActiveBlocks will be later copied to Blocks using std::copy. The
- // resultant order of elements in Blocks needs to be deterministic.
- // Using SmallPtrSet instead causes non-deterministic order while
- // copying. And we cannot simply sort Blocks as they need to match the
- // corresponding Values.
- SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
-
- void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
- for (auto II = Insts.begin(); II != Insts.end();) {
+//===- GVNSink.cpp - sink expressions into successors ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file GVNSink.cpp
+/// This pass attempts to sink instructions into successors, reducing static
+/// instruction count and enabling if-conversion.
+///
+/// We use a variant of global value numbering to decide what can be sunk.
+/// Consider:
+///
+/// [ %a1 = add i32 %b, 1 ] [ %c1 = add i32 %d, 1 ]
+/// [ %a2 = xor i32 %a1, 1 ] [ %c2 = xor i32 %c1, 1 ]
+/// \ /
+/// [ %e = phi i32 %a2, %c2 ]
+/// [ add i32 %e, 4 ]
+///
+///
+/// GVN would number %a1 and %c1 differently because they compute different
+/// results - the VN of an instruction is a function of its opcode and the
+/// transitive closure of its operands. This is the key property for hoisting
+/// and CSE.
+///
+/// What we want when sinking however is for a numbering that is a function of
+/// the *uses* of an instruction, which allows us to answer the question "if I
+/// replace %a1 with %c1, will it contribute in an equivalent way to all
+/// successive instructions?". The PostValueTable class in GVN provides this
+/// mapping.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gvn-sink"
+
+STATISTIC(NumRemoved, "Number of instructions removed");
+
+namespace llvm {
+namespace GVNExpression {
+
+LLVM_DUMP_METHOD void Expression::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+static bool isMemoryInst(const Instruction *I) {
+ return isa<LoadInst>(I) || isa<StoreInst>(I) ||
+ (isa<InvokeInst>(I) && !cast<InvokeInst>(I)->doesNotAccessMemory()) ||
+ (isa<CallInst>(I) && !cast<CallInst>(I)->doesNotAccessMemory());
+}
+
+/// Iterates through instructions in a set of blocks in reverse order from the
+/// first non-terminator. For example (assume all blocks have size n):
+/// LockstepReverseIterator I([B1, B2, B3]);
+/// *I-- = [B1[n], B2[n], B3[n]];
+/// *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+/// *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+/// ...
+///
+/// It continues until all blocks have been exhausted. Use \c getActiveBlocks()
+/// to determine which blocks are still going and the order they appear in the
+/// list returned by operator*.
+class LockstepReverseIterator {
+ ArrayRef<BasicBlock *> Blocks;
+ SmallSetVector<BasicBlock *, 4> ActiveBlocks;
+ SmallVector<Instruction *, 4> Insts;
+ bool Fail;
+
+public:
+ LockstepReverseIterator(ArrayRef<BasicBlock *> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ ActiveBlocks.clear();
+ for (BasicBlock *BB : Blocks)
+ ActiveBlocks.insert(BB);
+ Insts.clear();
+ for (BasicBlock *BB : Blocks) {
+ if (BB->size() <= 1) {
+ // Block wasn't big enough - only contained a terminator.
+ ActiveBlocks.remove(BB);
+ continue;
+ }
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ }
+ if (Insts.empty())
+ Fail = true;
+ }
+
+ bool isValid() const { return !Fail; }
+ ArrayRef<Instruction *> operator*() const { return Insts; }
+
+ // Note: This needs to return a SmallSetVector as the elements of
+ // ActiveBlocks will be later copied to Blocks using std::copy. The
+ // resultant order of elements in Blocks needs to be deterministic.
+ // Using SmallPtrSet instead causes non-deterministic order while
+ // copying. And we cannot simply sort Blocks as they need to match the
+ // corresponding Values.
+ SmallSetVector<BasicBlock *, 4> &getActiveBlocks() { return ActiveBlocks; }
+
+ void restrictToBlocks(SmallSetVector<BasicBlock *, 4> &Blocks) {
+ for (auto II = Insts.begin(); II != Insts.end();) {
if (!llvm::is_contained(Blocks, (*II)->getParent())) {
- ActiveBlocks.remove((*II)->getParent());
- II = Insts.erase(II);
- } else {
- ++II;
- }
- }
- }
-
- void operator--() {
- if (Fail)
- return;
- SmallVector<Instruction *, 4> NewInsts;
- for (auto *Inst : Insts) {
- if (Inst == &Inst->getParent()->front())
- ActiveBlocks.remove(Inst->getParent());
- else
- NewInsts.push_back(Inst->getPrevNode());
- }
- if (NewInsts.empty()) {
- Fail = true;
- return;
- }
- Insts = NewInsts;
- }
-};
-
-//===----------------------------------------------------------------------===//
-
-/// Candidate solution for sinking. There may be different ways to
-/// sink instructions, differing in the number of instructions sunk,
-/// the number of predecessors sunk from and the number of PHIs
-/// required.
-struct SinkingInstructionCandidate {
- unsigned NumBlocks;
- unsigned NumInstructions;
- unsigned NumPHIs;
- unsigned NumMemoryInsts;
- int Cost = -1;
- SmallVector<BasicBlock *, 4> Blocks;
-
- void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
- unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
- unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
- Cost = (NumInstructions * (NumBlocks - 1)) -
- (NumExtraPHIs *
- NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
- - SplitEdgeCost;
- }
-
- bool operator>(const SinkingInstructionCandidate &Other) const {
- return Cost > Other.Cost;
- }
-};
-
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
- OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
- << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
- return OS;
-}
-#endif
-
-//===----------------------------------------------------------------------===//
-
-/// Describes a PHI node that may or may not exist. These track the PHIs
-/// that must be created if we were to sink a sequence of instructions. It provides
-/// a hash function for efficient equality comparisons.
-class ModelledPHI {
- SmallVector<Value *, 4> Values;
- SmallVector<BasicBlock *, 4> Blocks;
-
-public:
- ModelledPHI() = default;
-
- ModelledPHI(const PHINode *PN) {
- // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order.
- SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
- for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
- Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
- llvm::sort(Ops);
- for (auto &P : Ops) {
- Blocks.push_back(P.first);
- Values.push_back(P.second);
- }
- }
-
- /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
- /// without the same ID.
- /// \note This is specifically for DenseMapInfo - do not use this!
- static ModelledPHI createDummy(size_t ID) {
- ModelledPHI M;
- M.Values.push_back(reinterpret_cast<Value*>(ID));
- return M;
- }
-
- /// Create a PHI from an array of incoming values and incoming blocks.
- template <typename VArray, typename BArray>
- ModelledPHI(const VArray &V, const BArray &B) {
- llvm::copy(V, std::back_inserter(Values));
- llvm::copy(B, std::back_inserter(Blocks));
- }
-
- /// Create a PHI from [I[OpNum] for I in Insts].
- template <typename BArray>
- ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
- llvm::copy(B, std::back_inserter(Blocks));
- for (auto *I : Insts)
- Values.push_back(I->getOperand(OpNum));
- }
-
- /// Restrict the PHI's contents down to only \c NewBlocks.
- /// \c NewBlocks must be a subset of \c this->Blocks.
- void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
- auto BI = Blocks.begin();
- auto VI = Values.begin();
- while (BI != Blocks.end()) {
- assert(VI != Values.end());
+ ActiveBlocks.remove((*II)->getParent());
+ II = Insts.erase(II);
+ } else {
+ ++II;
+ }
+ }
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *Inst : Insts) {
+ if (Inst == &Inst->getParent()->front())
+ ActiveBlocks.remove(Inst->getParent());
+ else
+ NewInsts.push_back(Inst->getPrevNode());
+ }
+ if (NewInsts.empty()) {
+ Fail = true;
+ return;
+ }
+ Insts = NewInsts;
+ }
+};
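A minimal usage sketch of the iterator above; Preds is a placeholder ArrayRef of the predecessor blocks being considered for sinking:

    LockstepReverseIterator LRI(Preds);
    while (LRI.isValid()) {
      // One instruction per still-active block, all the same distance from
      // their block's terminator.
      ArrayRef<Instruction *> Row = *LRI;
      // ... compare the instructions in Row, e.g. by value number ...
      --LRI; // step every active block back by one instruction
    }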
+
+//===----------------------------------------------------------------------===//
+
+/// Candidate solution for sinking. There may be different ways to
+/// sink instructions, differing in the number of instructions sunk,
+/// the number of predecessors sunk from and the number of PHIs
+/// required.
+struct SinkingInstructionCandidate {
+ unsigned NumBlocks;
+ unsigned NumInstructions;
+ unsigned NumPHIs;
+ unsigned NumMemoryInsts;
+ int Cost = -1;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+ void calculateCost(unsigned NumOrigPHIs, unsigned NumOrigBlocks) {
+ unsigned NumExtraPHIs = NumPHIs - NumOrigPHIs;
+ unsigned SplitEdgeCost = (NumOrigBlocks > NumBlocks) ? 2 : 0;
+ Cost = (NumInstructions * (NumBlocks - 1)) -
+ (NumExtraPHIs *
+ NumExtraPHIs) // PHIs are expensive, so make sure they're worth it.
+ - SplitEdgeCost;
+ }
+
+ bool operator>(const SinkingInstructionCandidate &Other) const {
+ return Cost > Other.Cost;
+ }
+};
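A worked example with hypothetical numbers, to make the cost formula concrete: sinking three instructions from two predecessor blocks, creating one PHI where the successor originally had none, with no edge splitting required:

    SinkingInstructionCandidate C;
    C.NumBlocks = 2;
    C.NumInstructions = 3;
    C.NumPHIs = 1;
    C.calculateCost(/*NumOrigPHIs=*/0, /*NumOrigBlocks=*/2);
    // NumExtraPHIs = 1 and SplitEdgeCost = 0, so
    // Cost = 3 * (2 - 1) - 1 * 1 - 0 = 2; candidates are compared by this
    // score, higher being better (see operator> above).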
+
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const SinkingInstructionCandidate &C) {
+ OS << "<Candidate Cost=" << C.Cost << " #Blocks=" << C.NumBlocks
+ << " #Insts=" << C.NumInstructions << " #PHIs=" << C.NumPHIs << ">";
+ return OS;
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+
+/// Describes a PHI node that may or may not exist. These track the PHIs
+/// that must be created if we were to sink a sequence of instructions. It provides
+/// a hash function for efficient equality comparisons.
+class ModelledPHI {
+ SmallVector<Value *, 4> Values;
+ SmallVector<BasicBlock *, 4> Blocks;
+
+public:
+ ModelledPHI() = default;
+
+ ModelledPHI(const PHINode *PN) {
+ // BasicBlock comes first so we sort by basic block pointer order, then by value pointer order.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Ops;
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I)
+ Ops.push_back({PN->getIncomingBlock(I), PN->getIncomingValue(I)});
+ llvm::sort(Ops);
+ for (auto &P : Ops) {
+ Blocks.push_back(P.first);
+ Values.push_back(P.second);
+ }
+ }
+
+ /// Create a dummy ModelledPHI that will compare unequal to any other ModelledPHI
+ /// without the same ID.
+ /// \note This is specifically for DenseMapInfo - do not use this!
+ static ModelledPHI createDummy(size_t ID) {
+ ModelledPHI M;
+ M.Values.push_back(reinterpret_cast<Value*>(ID));
+ return M;
+ }
+
+ /// Create a PHI from an array of incoming values and incoming blocks.
+ template <typename VArray, typename BArray>
+ ModelledPHI(const VArray &V, const BArray &B) {
+ llvm::copy(V, std::back_inserter(Values));
+ llvm::copy(B, std::back_inserter(Blocks));
+ }
+
+ /// Create a PHI from [I[OpNum] for I in Insts].
+ template <typename BArray>
+ ModelledPHI(ArrayRef<Instruction *> Insts, unsigned OpNum, const BArray &B) {
+ llvm::copy(B, std::back_inserter(Blocks));
+ for (auto *I : Insts)
+ Values.push_back(I->getOperand(OpNum));
+ }
+
+ /// Restrict the PHI's contents down to only \c NewBlocks.
+ /// \c NewBlocks must be a subset of \c this->Blocks.
+ void restrictToBlocks(const SmallSetVector<BasicBlock *, 4> &NewBlocks) {
+ auto BI = Blocks.begin();
+ auto VI = Values.begin();
+ while (BI != Blocks.end()) {
+ assert(VI != Values.end());
if (!llvm::is_contained(NewBlocks, *BI)) {
- BI = Blocks.erase(BI);
- VI = Values.erase(VI);
- } else {
- ++BI;
- ++VI;
- }
- }
- assert(Blocks.size() == NewBlocks.size());
- }
-
- ArrayRef<Value *> getValues() const { return Values; }
-
- bool areAllIncomingValuesSame() const {
- return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
- }
-
- bool areAllIncomingValuesSameType() const {
- return llvm::all_of(
- Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
- }
-
- bool areAnyIncomingValuesConstant() const {
- return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
- }
-
- // Hash functor
- unsigned hash() const {
- return (unsigned)hash_combine_range(Values.begin(), Values.end());
- }
-
- bool operator==(const ModelledPHI &Other) const {
- return Values == Other.Values && Blocks == Other.Blocks;
- }
-};
-
-template <typename ModelledPHI> struct DenseMapInfo {
- static inline ModelledPHI &getEmptyKey() {
- static ModelledPHI Dummy = ModelledPHI::createDummy(0);
- return Dummy;
- }
-
- static inline ModelledPHI &getTombstoneKey() {
- static ModelledPHI Dummy = ModelledPHI::createDummy(1);
- return Dummy;
- }
-
- static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
-
- static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
- return LHS == RHS;
- }
-};
-
-using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
-
-//===----------------------------------------------------------------------===//
-// ValueTable
-//===----------------------------------------------------------------------===//
-// This is a value number table where the value number is a function of the
-// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
-// that the program would be equivalent if we replaced A with PHI(A, B).
-//===----------------------------------------------------------------------===//
-
-/// A GVN expression describing how an instruction is used. The operands
-/// field of BasicExpression is used to store uses, not operands.
-///
-/// This class also contains fields for discriminators used when determining
-/// equivalence of instructions with side effects.
-class InstructionUseExpr : public GVNExpression::BasicExpression {
- unsigned MemoryUseOrder = -1;
- bool Volatile = false;
- ArrayRef<int> ShuffleMask;
-
-public:
- InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
- BumpPtrAllocator &A)
- : GVNExpression::BasicExpression(I->getNumUses()) {
- allocateOperands(R, A);
- setOpcode(I->getOpcode());
- setType(I->getType());
-
- if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
- ShuffleMask = SVI->getShuffleMask().copy(A);
-
- for (auto &U : I->uses())
- op_push_back(U.getUser());
- llvm::sort(op_begin(), op_end());
- }
-
- void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
- void setVolatile(bool V) { Volatile = V; }
-
- hash_code getHashValue() const override {
- return hash_combine(GVNExpression::BasicExpression::getHashValue(),
- MemoryUseOrder, Volatile, ShuffleMask);
- }
-
- template <typename Function> hash_code getHashValue(Function MapFn) {
- hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile,
- ShuffleMask);
- for (auto *V : operands())
- H = hash_combine(H, MapFn(V));
- return H;
- }
-};
-
-class ValueTable {
- DenseMap<Value *, uint32_t> ValueNumbering;
- DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
- DenseMap<size_t, uint32_t> HashNumbering;
- BumpPtrAllocator Allocator;
- ArrayRecycler<Value *> Recycler;
- uint32_t nextValueNumber = 1;
-
- /// Create an expression for I based on its opcode and its uses. If I
- /// touches or reads memory, the expression is also based upon its memory
- /// order - see \c getMemoryUseOrder().
- InstructionUseExpr *createExpr(Instruction *I) {
- InstructionUseExpr *E =
- new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
- if (isMemoryInst(I))
- E->setMemoryUseOrder(getMemoryUseOrder(I));
-
- if (CmpInst *C = dyn_cast<CmpInst>(I)) {
- CmpInst::Predicate Predicate = C->getPredicate();
- E->setOpcode((C->getOpcode() << 8) | Predicate);
- }
- return E;
- }
-
- /// Helper to compute the value number for a memory instruction
- /// (LoadInst/StoreInst), including checking the memory ordering and
- /// volatility.
- template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
- if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
- return nullptr;
- InstructionUseExpr *E = createExpr(I);
- E->setVolatile(I->isVolatile());
- return E;
- }
-
-public:
- ValueTable() = default;
-
- /// Returns the value number for the specified value, assigning
- /// it a new number if it did not have one before.
- uint32_t lookupOrAdd(Value *V) {
- auto VI = ValueNumbering.find(V);
- if (VI != ValueNumbering.end())
- return VI->second;
-
- if (!isa<Instruction>(V)) {
- ValueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- Instruction *I = cast<Instruction>(V);
- InstructionUseExpr *exp = nullptr;
- switch (I->getOpcode()) {
- case Instruction::Load:
- exp = createMemoryExpr(cast<LoadInst>(I));
- break;
- case Instruction::Store:
- exp = createMemoryExpr(cast<StoreInst>(I));
- break;
- case Instruction::Call:
- case Instruction::Invoke:
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::ICmp:
- case Instruction::FCmp:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::Select:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::ShuffleVector:
- case Instruction::InsertValue:
- case Instruction::GetElementPtr:
- exp = createExpr(I);
- break;
- default:
- break;
- }
-
- if (!exp) {
- ValueNumbering[V] = nextValueNumber;
- return nextValueNumber++;
- }
-
- uint32_t e = ExpressionNumbering[exp];
- if (!e) {
- hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
- auto I = HashNumbering.find(H);
- if (I != HashNumbering.end()) {
- e = I->second;
- } else {
- e = nextValueNumber++;
- HashNumbering[H] = e;
- ExpressionNumbering[exp] = e;
- }
- }
- ValueNumbering[V] = e;
- return e;
- }
-
- /// Returns the value number of the specified value. Fails if the value has
- /// not yet been numbered.
- uint32_t lookup(Value *V) const {
- auto VI = ValueNumbering.find(V);
- assert(VI != ValueNumbering.end() && "Value not numbered?");
- return VI->second;
- }
-
- /// Removes all value numberings and resets the value table.
- void clear() {
- ValueNumbering.clear();
- ExpressionNumbering.clear();
- HashNumbering.clear();
- Recycler.clear(Allocator);
- nextValueNumber = 1;
- }
-
- /// \c Inst uses or touches memory. Return an ID describing the memory state
- /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
- /// the exact same memory operations happen after I1 and I2.
- ///
- /// This is a very hard problem in general, so we use domain-specific
- /// knowledge that we only ever check for equivalence between blocks sharing a
- /// single immediate successor that is common, and when determining if I1 ==
- /// I2 we will have already determined that next(I1) == next(I2). This
- /// inductive property allows us to simply return the value number of the next
- /// instruction that defines memory.
- uint32_t getMemoryUseOrder(Instruction *Inst) {
- auto *BB = Inst->getParent();
- for (auto I = std::next(Inst->getIterator()), E = BB->end();
- I != E && !I->isTerminator(); ++I) {
- if (!isMemoryInst(&*I))
- continue;
- if (isa<LoadInst>(&*I))
- continue;
- CallInst *CI = dyn_cast<CallInst>(&*I);
- if (CI && CI->onlyReadsMemory())
- continue;
- InvokeInst *II = dyn_cast<InvokeInst>(&*I);
- if (II && II->onlyReadsMemory())
- continue;
- return lookupOrAdd(&*I);
- }
- return 0;
- }
-};
-
-//===----------------------------------------------------------------------===//
-
-class GVNSink {
-public:
- GVNSink() = default;
-
- bool run(Function &F) {
- LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
- << "\n");
-
- unsigned NumSunk = 0;
- ReversePostOrderTraversal<Function*> RPOT(&F);
- for (auto *N : RPOT)
- NumSunk += sinkBB(N);
-
- return NumSunk > 0;
- }
-
-private:
- ValueTable VN;
-
- bool shouldAvoidSinkingInstruction(Instruction *I) {
- // These instructions may change or break semantics if moved.
- if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
- I->getType()->isTokenTy())
- return true;
- return false;
- }
-
- /// The main heuristic function. Analyze the set of instructions pointed to by
- /// LRI and return a candidate solution if these instructions can be sunk, or
- /// None otherwise.
- Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
- LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
- ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
-
- /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
- void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
- SmallPtrSetImpl<Value *> &PHIContents) {
- for (PHINode &PN : BB->phis()) {
- auto MPHI = ModelledPHI(&PN);
- PHIs.insert(MPHI);
- for (auto *V : MPHI.getValues())
- PHIContents.insert(V);
- }
- }
-
- /// The main instruction sinking driver. Set up state and try and sink
- /// instructions into BBEnd from its predecessors.
- unsigned sinkBB(BasicBlock *BBEnd);
-
- /// Perform the actual mechanics of sinking an instruction from Blocks into
- /// BBEnd, which is their only successor.
- void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
-
- /// Remove PHIs that all have the same incoming value.
- void foldPointlessPHINodes(BasicBlock *BB) {
- auto I = BB->begin();
- while (PHINode *PN = dyn_cast<PHINode>(I++)) {
- if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
- return V == PN->getIncomingValue(0);
- }))
- continue;
- if (PN->getIncomingValue(0) != PN)
- PN->replaceAllUsesWith(PN->getIncomingValue(0));
- else
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
- PN->eraseFromParent();
- }
- }
-};
-
-Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
- LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
- ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
- auto Insts = *LRI;
- LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
- : Insts) {
- I->dump();
- } dbgs() << " ]\n";);
-
- DenseMap<uint32_t, unsigned> VNums;
- for (auto *I : Insts) {
- uint32_t N = VN.lookupOrAdd(I);
- LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
- if (N == ~0U)
- return None;
- VNums[N]++;
- }
- unsigned VNumToSink =
- std::max_element(VNums.begin(), VNums.end(),
- [](const std::pair<uint32_t, unsigned> &I,
- const std::pair<uint32_t, unsigned> &J) {
- return I.second < J.second;
- })
- ->first;
-
- if (VNums[VNumToSink] == 1)
- // Can't sink anything!
- return None;
-
- // Now restrict the number of incoming blocks down to only those with
- // VNumToSink.
- auto &ActivePreds = LRI.getActiveBlocks();
- unsigned InitialActivePredSize = ActivePreds.size();
- SmallVector<Instruction *, 4> NewInsts;
- for (auto *I : Insts) {
- if (VN.lookup(I) != VNumToSink)
- ActivePreds.remove(I->getParent());
- else
- NewInsts.push_back(I);
- }
- for (auto *I : NewInsts)
- if (shouldAvoidSinkingInstruction(I))
- return None;
-
- // If we've restricted the incoming blocks, restrict all needed PHIs also
- // to that set.
- bool RecomputePHIContents = false;
- if (ActivePreds.size() != InitialActivePredSize) {
- ModelledPHISet NewNeededPHIs;
- for (auto P : NeededPHIs) {
- P.restrictToBlocks(ActivePreds);
- NewNeededPHIs.insert(P);
- }
- NeededPHIs = NewNeededPHIs;
- LRI.restrictToBlocks(ActivePreds);
- RecomputePHIContents = true;
- }
-
- // The sunk instruction's results.
- ModelledPHI NewPHI(NewInsts, ActivePreds);
-
- // Does sinking this instruction render previous PHIs redundant?
+ BI = Blocks.erase(BI);
+ VI = Values.erase(VI);
+ } else {
+ ++BI;
+ ++VI;
+ }
+ }
+ assert(Blocks.size() == NewBlocks.size());
+ }
+
+ ArrayRef<Value *> getValues() const { return Values; }
+
+ bool areAllIncomingValuesSame() const {
+ return llvm::all_of(Values, [&](Value *V) { return V == Values[0]; });
+ }
+
+ bool areAllIncomingValuesSameType() const {
+ return llvm::all_of(
+ Values, [&](Value *V) { return V->getType() == Values[0]->getType(); });
+ }
+
+ bool areAnyIncomingValuesConstant() const {
+ return llvm::any_of(Values, [&](Value *V) { return isa<Constant>(V); });
+ }
+
+ // Hash functor
+ unsigned hash() const {
+ return (unsigned)hash_combine_range(Values.begin(), Values.end());
+ }
+
+ bool operator==(const ModelledPHI &Other) const {
+ return Values == Other.Values && Blocks == Other.Blocks;
+ }
+};
+
+template <typename ModelledPHI> struct DenseMapInfo {
+ static inline ModelledPHI &getEmptyKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(0);
+ return Dummy;
+ }
+
+ static inline ModelledPHI &getTombstoneKey() {
+ static ModelledPHI Dummy = ModelledPHI::createDummy(1);
+ return Dummy;
+ }
+
+ static unsigned getHashValue(const ModelledPHI &V) { return V.hash(); }
+
+ static bool isEqual(const ModelledPHI &LHS, const ModelledPHI &RHS) {
+ return LHS == RHS;
+ }
+};
+
+using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
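A minimal sketch of how the set type above can be used; PN and PN2 are placeholder PHINode pointers from the common successor block:

    ModelledPHISet Seen;
    Seen.insert(ModelledPHI(PN)); // model a PHI that already exists
    // A PHI that sinking would have to create costs nothing if an identical
    // one (same incoming blocks and values) is already modelled:
    bool AlreadyThere = Seen.count(ModelledPHI(PN2)) != 0;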
+
+//===----------------------------------------------------------------------===//
+// ValueTable
+//===----------------------------------------------------------------------===//
+// This is a value number table where the value number is a function of the
+// *uses* of a value, rather than its operands. Thus, if VN(A) == VN(B) we know
+// that the program would be equivalent if we replaced A with PHI(A, B).
+//===----------------------------------------------------------------------===//
+
+/// A GVN expression describing how an instruction is used. The operands
+/// field of BasicExpression is used to store uses, not operands.
+///
+/// This class also contains fields for discriminators used when determining
+/// equivalence of instructions with side effects.
+class InstructionUseExpr : public GVNExpression::BasicExpression {
+ unsigned MemoryUseOrder = -1;
+ bool Volatile = false;
+ ArrayRef<int> ShuffleMask;
+
+public:
+ InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
+ BumpPtrAllocator &A)
+ : GVNExpression::BasicExpression(I->getNumUses()) {
+ allocateOperands(R, A);
+ setOpcode(I->getOpcode());
+ setType(I->getType());
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+ ShuffleMask = SVI->getShuffleMask().copy(A);
+
+ for (auto &U : I->uses())
+ op_push_back(U.getUser());
+ llvm::sort(op_begin(), op_end());
+ }
+
+ void setMemoryUseOrder(unsigned MUO) { MemoryUseOrder = MUO; }
+ void setVolatile(bool V) { Volatile = V; }
+
+ hash_code getHashValue() const override {
+ return hash_combine(GVNExpression::BasicExpression::getHashValue(),
+ MemoryUseOrder, Volatile, ShuffleMask);
+ }
+
+ template <typename Function> hash_code getHashValue(Function MapFn) {
+ hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile,
+ ShuffleMask);
+ for (auto *V : operands())
+ H = hash_combine(H, MapFn(V));
+ return H;
+ }
+};
+
+class ValueTable {
+ DenseMap<Value *, uint32_t> ValueNumbering;
+ DenseMap<GVNExpression::Expression *, uint32_t> ExpressionNumbering;
+ DenseMap<size_t, uint32_t> HashNumbering;
+ BumpPtrAllocator Allocator;
+ ArrayRecycler<Value *> Recycler;
+ uint32_t nextValueNumber = 1;
+
+ /// Create an expression for I based on its opcode and its uses. If I
+ /// touches or reads memory, the expression is also based upon its memory
+ /// order - see \c getMemoryUseOrder().
+ InstructionUseExpr *createExpr(Instruction *I) {
+ InstructionUseExpr *E =
+ new (Allocator) InstructionUseExpr(I, Recycler, Allocator);
+ if (isMemoryInst(I))
+ E->setMemoryUseOrder(getMemoryUseOrder(I));
+
+ if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ CmpInst::Predicate Predicate = C->getPredicate();
+ E->setOpcode((C->getOpcode() << 8) | Predicate);
+ }
+ return E;
+ }
+
+ /// Helper to compute the value number for a memory instruction
+ /// (LoadInst/StoreInst), including checking the memory ordering and
+ /// volatility.
+ template <class Inst> InstructionUseExpr *createMemoryExpr(Inst *I) {
+ if (isStrongerThanUnordered(I->getOrdering()) || I->isAtomic())
+ return nullptr;
+ InstructionUseExpr *E = createExpr(I);
+ E->setVolatile(I->isVolatile());
+ return E;
+ }
+
+public:
+ ValueTable() = default;
+
+ /// Returns the value number for the specified value, assigning
+ /// it a new number if it did not have one before.
+ uint32_t lookupOrAdd(Value *V) {
+ auto VI = ValueNumbering.find(V);
+ if (VI != ValueNumbering.end())
+ return VI->second;
+
+ if (!isa<Instruction>(V)) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ Instruction *I = cast<Instruction>(V);
+ InstructionUseExpr *exp = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ exp = createMemoryExpr(cast<LoadInst>(I));
+ break;
+ case Instruction::Store:
+ exp = createMemoryExpr(cast<StoreInst>(I));
+ break;
+ case Instruction::Call:
+ case Instruction::Invoke:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::ShuffleVector:
+ case Instruction::InsertValue:
+ case Instruction::GetElementPtr:
+ exp = createExpr(I);
+ break;
+ default:
+ break;
+ }
+
+ if (!exp) {
+ ValueNumbering[V] = nextValueNumber;
+ return nextValueNumber++;
+ }
+
+ uint32_t e = ExpressionNumbering[exp];
+ if (!e) {
+ hash_code H = exp->getHashValue([=](Value *V) { return lookupOrAdd(V); });
+ auto I = HashNumbering.find(H);
+ if (I != HashNumbering.end()) {
+ e = I->second;
+ } else {
+ e = nextValueNumber++;
+ HashNumbering[H] = e;
+ ExpressionNumbering[exp] = e;
+ }
+ }
+ ValueNumbering[V] = e;
+ return e;
+ }
+
+ /// Returns the value number of the specified value. Fails if the value has
+ /// not yet been numbered.
+ uint32_t lookup(Value *V) const {
+ auto VI = ValueNumbering.find(V);
+ assert(VI != ValueNumbering.end() && "Value not numbered?");
+ return VI->second;
+ }
+
+ /// Removes all value numberings and resets the value table.
+ void clear() {
+ ValueNumbering.clear();
+ ExpressionNumbering.clear();
+ HashNumbering.clear();
+ Recycler.clear(Allocator);
+ nextValueNumber = 1;
+ }
+
+ /// \c Inst uses or touches memory. Return an ID describing the memory state
+ /// at \c Inst such that if getMemoryUseOrder(I1) == getMemoryUseOrder(I2),
+ /// the exact same memory operations happen after I1 and I2.
+ ///
+ /// This is a very hard problem in general, so we use domain-specific
+ /// knowledge that we only ever check for equivalence between blocks sharing a
+ /// single immediate successor that is common, and when determining if I1 ==
+ /// I2 we will have already determined that next(I1) == next(I2). This
+ /// inductive property allows us to simply return the value number of the next
+ /// instruction that defines memory.
+ uint32_t getMemoryUseOrder(Instruction *Inst) {
+ auto *BB = Inst->getParent();
+ for (auto I = std::next(Inst->getIterator()), E = BB->end();
+ I != E && !I->isTerminator(); ++I) {
+ if (!isMemoryInst(&*I))
+ continue;
+ if (isa<LoadInst>(&*I))
+ continue;
+ CallInst *CI = dyn_cast<CallInst>(&*I);
+ if (CI && CI->onlyReadsMemory())
+ continue;
+ InvokeInst *II = dyn_cast<InvokeInst>(&*I);
+ if (II && II->onlyReadsMemory())
+ continue;
+ return lookupOrAdd(&*I);
+ }
+ return 0;
+ }
+};
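// Illustration (a hand-written sketch; block and value names are invented):
// unlike operand-based GVN numbering, the ValueTable above numbers an
// instruction by its opcode, type, uses and memory order, so two instructions
// in sibling predecessors share a value number when the common successor uses
// them identically, even though their operands differ:
//
//   left:                               right:
//     %a = add i32 %x, 1                  %b = add i32 %y, 1
//     br label %merge                     br label %merge
//   merge:
//     %p = phi i32 [ %a, %left ], [ %b, %right ]
//
// Here lookupOrAdd(%a) == lookupOrAdd(%b), which is what later makes %a and %b
// candidates for sinking into %merge with a PHI for the differing operand.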
+
+//===----------------------------------------------------------------------===//
+
+class GVNSink {
+public:
+ GVNSink() = default;
+
+ bool run(Function &F) {
+ LLVM_DEBUG(dbgs() << "GVNSink: running on function @" << F.getName()
+ << "\n");
+
+ unsigned NumSunk = 0;
+ ReversePostOrderTraversal<Function*> RPOT(&F);
+ for (auto *N : RPOT)
+ NumSunk += sinkBB(N);
+
+ return NumSunk > 0;
+ }
+
+private:
+ ValueTable VN;
+
+ bool shouldAvoidSinkingInstruction(Instruction *I) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return true;
+ return false;
+ }
+
+ /// The main heuristic function. Analyze the set of instructions pointed to by
+ /// LRI and return a candidate solution if these instructions can be sunk, or
+ /// None otherwise.
+ Optional<SinkingInstructionCandidate> analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents);
+
+ /// Create a ModelledPHI for each PHI in BB, adding to PHIs.
+ void analyzeInitialPHIs(BasicBlock *BB, ModelledPHISet &PHIs,
+ SmallPtrSetImpl<Value *> &PHIContents) {
+ for (PHINode &PN : BB->phis()) {
+ auto MPHI = ModelledPHI(&PN);
+ PHIs.insert(MPHI);
+ for (auto *V : MPHI.getValues())
+ PHIContents.insert(V);
+ }
+ }
+
+ /// The main instruction sinking driver. Set up state and try and sink
+ /// instructions into BBEnd from its predecessors.
+ unsigned sinkBB(BasicBlock *BBEnd);
+
+ /// Perform the actual mechanics of sinking an instruction from Blocks into
+ /// BBEnd, which is their only successor.
+ void sinkLastInstruction(ArrayRef<BasicBlock *> Blocks, BasicBlock *BBEnd);
+
+ /// Remove PHIs that all have the same incoming value.
+ void foldPointlessPHINodes(BasicBlock *BB) {
+ auto I = BB->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(I++)) {
+ if (!llvm::all_of(PN->incoming_values(), [&](const Value *V) {
+ return V == PN->getIncomingValue(0);
+ }))
+ continue;
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+ }
+};
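// Sketch of the transform the class above drives (illustrative IR with
// made-up names; whether a candidate is actually sunk also depends on the
// cost computed per candidate in sinkBB below): trailing instructions with
// equal value numbers are sunk into the common successor and a PHI is
// introduced for each operand position that differs:
//
//   if.then:                            if.else:
//     store i32 1, i32* %p                store i32 2, i32* %p
//     br label %end                       br label %end
//
//   =>
//
//   end:
//     %s = phi i32 [ 1, %if.then ], [ 2, %if.else ]
//     store i32 %s, i32* %p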
+
+Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
+ LockstepReverseIterator &LRI, unsigned &InstNum, unsigned &MemoryInstNum,
+ ModelledPHISet &NeededPHIs, SmallPtrSetImpl<Value *> &PHIContents) {
+ auto Insts = *LRI;
+ LLVM_DEBUG(dbgs() << " -- Analyzing instruction set: [\n"; for (auto *I
+ : Insts) {
+ I->dump();
+ } dbgs() << " ]\n";);
+
+ DenseMap<uint32_t, unsigned> VNums;
+ for (auto *I : Insts) {
+ uint32_t N = VN.lookupOrAdd(I);
+ LLVM_DEBUG(dbgs() << " VN=" << Twine::utohexstr(N) << " for" << *I << "\n");
+ if (N == ~0U)
+ return None;
+ VNums[N]++;
+ }
+ unsigned VNumToSink =
+ std::max_element(VNums.begin(), VNums.end(),
+ [](const std::pair<uint32_t, unsigned> &I,
+ const std::pair<uint32_t, unsigned> &J) {
+ return I.second < J.second;
+ })
+ ->first;
+
+ if (VNums[VNumToSink] == 1)
+ // Can't sink anything!
+ return None;
+
+ // Now restrict the number of incoming blocks down to only those with
+ // VNumToSink.
+ auto &ActivePreds = LRI.getActiveBlocks();
+ unsigned InitialActivePredSize = ActivePreds.size();
+ SmallVector<Instruction *, 4> NewInsts;
+ for (auto *I : Insts) {
+ if (VN.lookup(I) != VNumToSink)
+ ActivePreds.remove(I->getParent());
+ else
+ NewInsts.push_back(I);
+ }
+ for (auto *I : NewInsts)
+ if (shouldAvoidSinkingInstruction(I))
+ return None;
+
+ // If we've restricted the incoming blocks, restrict all needed PHIs also
+ // to that set.
+ bool RecomputePHIContents = false;
+ if (ActivePreds.size() != InitialActivePredSize) {
+ ModelledPHISet NewNeededPHIs;
+ for (auto P : NeededPHIs) {
+ P.restrictToBlocks(ActivePreds);
+ NewNeededPHIs.insert(P);
+ }
+ NeededPHIs = NewNeededPHIs;
+ LRI.restrictToBlocks(ActivePreds);
+ RecomputePHIContents = true;
+ }
+
+ // The sunk instruction's results.
+ ModelledPHI NewPHI(NewInsts, ActivePreds);
+
+ // Does sinking this instruction render previous PHIs redundant?
if (NeededPHIs.erase(NewPHI))
- RecomputePHIContents = true;
-
- if (RecomputePHIContents) {
- // The needed PHIs have changed, so recompute the set of all needed
- // values.
- PHIContents.clear();
- for (auto &PHI : NeededPHIs)
- PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
- }
-
- // Is this instruction required by a later PHI that doesn't match this PHI?
- // If so, we can't sink this instruction.
- for (auto *V : NewPHI.getValues())
- if (PHIContents.count(V))
- // V exists in this PHI, but the whole PHI is different to NewPHI
- // (else it would have been removed earlier). We cannot continue
- // because this isn't representable.
- return None;
-
- // Which operands need PHIs?
- // FIXME: If any of these fail, we should partition up the candidates to
- // try and continue making progress.
- Instruction *I0 = NewInsts[0];
-
- // If all instructions that are going to participate don't have the same
- // number of operands, we can't do any useful PHI analysis for all operands.
- auto hasDifferentNumOperands = [&I0](Instruction *I) {
- return I->getNumOperands() != I0->getNumOperands();
- };
- if (any_of(NewInsts, hasDifferentNumOperands))
- return None;
-
- for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
- ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
- if (PHI.areAllIncomingValuesSame())
- continue;
- if (!canReplaceOperandWithVariable(I0, OpNum))
- // We can't create a PHI from this instruction!
- return None;
- if (NeededPHIs.count(PHI))
- continue;
- if (!PHI.areAllIncomingValuesSameType())
- return None;
- // Don't create indirect calls! The called value is the final operand.
- if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
- PHI.areAnyIncomingValuesConstant())
- return None;
-
- NeededPHIs.reserve(NeededPHIs.size());
- NeededPHIs.insert(PHI);
- PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
- }
-
- if (isMemoryInst(NewInsts[0]))
- ++MemoryInstNum;
-
- SinkingInstructionCandidate Cand;
- Cand.NumInstructions = ++InstNum;
- Cand.NumMemoryInsts = MemoryInstNum;
- Cand.NumBlocks = ActivePreds.size();
- Cand.NumPHIs = NeededPHIs.size();
+ RecomputePHIContents = true;
+
+ if (RecomputePHIContents) {
+ // The needed PHIs have changed, so recompute the set of all needed
+ // values.
+ PHIContents.clear();
+ for (auto &PHI : NeededPHIs)
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ // Is this instruction required by a later PHI that doesn't match this PHI?
+ // If so, we can't sink this instruction.
+ for (auto *V : NewPHI.getValues())
+ if (PHIContents.count(V))
+ // V exists in this PHI, but the whole PHI is different to NewPHI
+ // (else it would have been removed earlier). We cannot continue
+ // because this isn't representable.
+ return None;
+
+ // Which operands need PHIs?
+ // FIXME: If any of these fail, we should partition up the candidates to
+ // try and continue making progress.
+ Instruction *I0 = NewInsts[0];
+
+ // If all instructions that are going to participate don't have the same
+ // number of operands, we can't do any useful PHI analysis for all operands.
+ auto hasDifferentNumOperands = [&I0](Instruction *I) {
+ return I->getNumOperands() != I0->getNumOperands();
+ };
+ if (any_of(NewInsts, hasDifferentNumOperands))
+ return None;
+
+ for (unsigned OpNum = 0, E = I0->getNumOperands(); OpNum != E; ++OpNum) {
+ ModelledPHI PHI(NewInsts, OpNum, ActivePreds);
+ if (PHI.areAllIncomingValuesSame())
+ continue;
+ if (!canReplaceOperandWithVariable(I0, OpNum))
+ // We can't create a PHI from this instruction!
+ return None;
+ if (NeededPHIs.count(PHI))
+ continue;
+ if (!PHI.areAllIncomingValuesSameType())
+ return None;
+ // Don't create indirect calls! The called value is the final operand.
+ if ((isa<CallInst>(I0) || isa<InvokeInst>(I0)) && OpNum == E - 1 &&
+ PHI.areAnyIncomingValuesConstant())
+ return None;
+
+ NeededPHIs.reserve(NeededPHIs.size());
+ NeededPHIs.insert(PHI);
+ PHIContents.insert(PHI.getValues().begin(), PHI.getValues().end());
+ }
+
+ if (isMemoryInst(NewInsts[0]))
+ ++MemoryInstNum;
+
+ SinkingInstructionCandidate Cand;
+ Cand.NumInstructions = ++InstNum;
+ Cand.NumMemoryInsts = MemoryInstNum;
+ Cand.NumBlocks = ActivePreds.size();
+ Cand.NumPHIs = NeededPHIs.size();
append_range(Cand.Blocks, ActivePreds);
-
- return Cand;
-}
-
-unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
- LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
- SmallVector<BasicBlock *, 4> Preds;
- for (auto *B : predecessors(BBEnd)) {
- auto *T = B->getTerminator();
- if (isa<BranchInst>(T) || isa<SwitchInst>(T))
- Preds.push_back(B);
- else
- return 0;
- }
- if (Preds.size() < 2)
- return 0;
- llvm::sort(Preds);
-
- unsigned NumOrigPreds = Preds.size();
- // We can only sink instructions through unconditional branches.
- for (auto I = Preds.begin(); I != Preds.end();) {
- if ((*I)->getTerminator()->getNumSuccessors() != 1)
- I = Preds.erase(I);
- else
- ++I;
- }
-
- LockstepReverseIterator LRI(Preds);
- SmallVector<SinkingInstructionCandidate, 4> Candidates;
- unsigned InstNum = 0, MemoryInstNum = 0;
- ModelledPHISet NeededPHIs;
- SmallPtrSet<Value *, 4> PHIContents;
- analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
- unsigned NumOrigPHIs = NeededPHIs.size();
-
- while (LRI.isValid()) {
- auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
- NeededPHIs, PHIContents);
- if (!Cand)
- break;
- Cand->calculateCost(NumOrigPHIs, Preds.size());
- Candidates.emplace_back(*Cand);
- --LRI;
- }
-
- llvm::stable_sort(Candidates, std::greater<SinkingInstructionCandidate>());
- LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
- : Candidates) dbgs()
- << " " << C << "\n";);
-
- // Pick the top candidate, as long as it is positive!
- if (Candidates.empty() || Candidates.front().Cost <= 0)
- return 0;
- auto C = Candidates.front();
-
- LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
- BasicBlock *InsertBB = BBEnd;
- if (C.Blocks.size() < NumOrigPreds) {
- LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
- BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
- InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
- if (!InsertBB) {
- LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
- // Edge couldn't be split.
- return 0;
- }
- }
-
- for (unsigned I = 0; I < C.NumInstructions; ++I)
- sinkLastInstruction(C.Blocks, InsertBB);
-
- return C.NumInstructions;
-}
-
-void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
- BasicBlock *BBEnd) {
- SmallVector<Instruction *, 4> Insts;
- for (BasicBlock *BB : Blocks)
- Insts.push_back(BB->getTerminator()->getPrevNode());
- Instruction *I0 = Insts.front();
-
- SmallVector<Value *, 4> NewOperands;
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
- bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
- return I->getOperand(O) != I0->getOperand(O);
- });
- if (!NeedPHI) {
- NewOperands.push_back(I0->getOperand(O));
- continue;
- }
-
- // Create a new PHI in the successor block and populate it.
- auto *Op = I0->getOperand(O);
- assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
- auto *PN = PHINode::Create(Op->getType(), Insts.size(),
- Op->getName() + ".sink", &BBEnd->front());
- for (auto *I : Insts)
- PN->addIncoming(I->getOperand(O), I->getParent());
- NewOperands.push_back(PN);
- }
-
- // Arbitrarily use I0 as the new "common" instruction; remap its operands
- // and move it to the start of the successor block.
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
- I0->getOperandUse(O).set(NewOperands[O]);
- I0->moveBefore(&*BBEnd->getFirstInsertionPt());
-
- // Update metadata and IR flags.
- for (auto *I : Insts)
- if (I != I0) {
- combineMetadataForCSE(I0, I, true);
- I0->andIRFlags(I);
- }
-
- for (auto *I : Insts)
- if (I != I0)
- I->replaceAllUsesWith(I0);
- foldPointlessPHINodes(BBEnd);
-
- // Finally nuke all instructions apart from the common instruction.
- for (auto *I : Insts)
- if (I != I0)
- I->eraseFromParent();
-
- NumRemoved += Insts.size() - 1;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Pass machinery / boilerplate
-
-class GVNSinkLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- GVNSinkLegacyPass() : FunctionPass(ID) {
- initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- GVNSink G;
- return G.run(F);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
- GVNSink G;
- if (!G.run(F))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char GVNSinkLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
- "Early GVN sinking of Expressions", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
- "Early GVN sinking of Expressions", false, false)
-
-FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
+
+ return Cand;
+}
+
+unsigned GVNSink::sinkBB(BasicBlock *BBEnd) {
+ LLVM_DEBUG(dbgs() << "GVNSink: running on basic block ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ SmallVector<BasicBlock *, 4> Preds;
+ for (auto *B : predecessors(BBEnd)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) || isa<SwitchInst>(T))
+ Preds.push_back(B);
+ else
+ return 0;
+ }
+ if (Preds.size() < 2)
+ return 0;
+ llvm::sort(Preds);
+
+ unsigned NumOrigPreds = Preds.size();
+ // We can only sink instructions through unconditional branches.
+ for (auto I = Preds.begin(); I != Preds.end();) {
+ if ((*I)->getTerminator()->getNumSuccessors() != 1)
+ I = Preds.erase(I);
+ else
+ ++I;
+ }
+
+ LockstepReverseIterator LRI(Preds);
+ SmallVector<SinkingInstructionCandidate, 4> Candidates;
+ unsigned InstNum = 0, MemoryInstNum = 0;
+ ModelledPHISet NeededPHIs;
+ SmallPtrSet<Value *, 4> PHIContents;
+ analyzeInitialPHIs(BBEnd, NeededPHIs, PHIContents);
+ unsigned NumOrigPHIs = NeededPHIs.size();
+
+ while (LRI.isValid()) {
+ auto Cand = analyzeInstructionForSinking(LRI, InstNum, MemoryInstNum,
+ NeededPHIs, PHIContents);
+ if (!Cand)
+ break;
+ Cand->calculateCost(NumOrigPHIs, Preds.size());
+ Candidates.emplace_back(*Cand);
+ --LRI;
+ }
+
+ llvm::stable_sort(Candidates, std::greater<SinkingInstructionCandidate>());
+ LLVM_DEBUG(dbgs() << " -- Sinking candidates:\n"; for (auto &C
+ : Candidates) dbgs()
+ << " " << C << "\n";);
+
+ // Pick the top candidate, as long as it is positive!
+ if (Candidates.empty() || Candidates.front().Cost <= 0)
+ return 0;
+ auto C = Candidates.front();
+
+ LLVM_DEBUG(dbgs() << " -- Sinking: " << C << "\n");
+ BasicBlock *InsertBB = BBEnd;
+ if (C.Blocks.size() < NumOrigPreds) {
+ LLVM_DEBUG(dbgs() << " -- Splitting edge to ";
+ BBEnd->printAsOperand(dbgs()); dbgs() << "\n");
+ InsertBB = SplitBlockPredecessors(BBEnd, C.Blocks, ".gvnsink.split");
+ if (!InsertBB) {
+ LLVM_DEBUG(dbgs() << " -- FAILED to split edge!\n");
+ // Edge couldn't be split.
+ return 0;
+ }
+ }
+
+ for (unsigned I = 0; I < C.NumInstructions; ++I)
+ sinkLastInstruction(C.Blocks, InsertBB);
+
+ return C.NumInstructions;
+}
+
+void GVNSink::sinkLastInstruction(ArrayRef<BasicBlock *> Blocks,
+ BasicBlock *BBEnd) {
+ SmallVector<Instruction *, 4> Insts;
+ for (BasicBlock *BB : Blocks)
+ Insts.push_back(BB->getTerminator()->getPrevNode());
+ Instruction *I0 = Insts.front();
+
+ SmallVector<Value *, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ bool NeedPHI = llvm::any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags.
+ for (auto *I : Insts)
+ if (I != I0) {
+ combineMetadataForCSE(I0, I, true);
+ I0->andIRFlags(I);
+ }
+
+ for (auto *I : Insts)
+ if (I != I0)
+ I->replaceAllUsesWith(I0);
+ foldPointlessPHINodes(BBEnd);
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ NumRemoved += Insts.size() - 1;
+}
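// Note (a sketch with invented operands): because the surviving instruction
// takes the intersection of IR flags via andIRFlags, sinking
//
//   %a = add nsw i32 %x, 1      and      %b = add i32 %y, 1
//
// produces a sunk add without nsw; combineMetadataForCSE likewise keeps only
// metadata that remains valid for every original copy.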
+
+////////////////////////////////////////////////////////////////////////////////
+// Pass machinery / boilerplate
+
+class GVNSinkLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ GVNSinkLegacyPass() : FunctionPass(ID) {
+ initializeGVNSinkLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ GVNSink G;
+ return G.run(F);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses GVNSinkPass::run(Function &F, FunctionAnalysisManager &AM) {
+ GVNSink G;
+ if (!G.run(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char GVNSinkLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_END(GVNSinkLegacyPass, "gvn-sink",
+ "Early GVN sinking of Expressions", false, false)
+
+FunctionPass *llvm::createGVNSinkPass() { return new GVNSinkLegacyPass(); }
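// Usage note (a minimal sketch, not part of the source above): besides the
// legacy "gvn-sink" FunctionPass created by llvm::createGVNSinkPass(), the
// pass is exposed to the new pass manager as GVNSinkPass and can be scheduled
// programmatically, e.g.
//
//   #include "llvm/IR/PassManager.h"
//   #include "llvm/Transforms/Scalar/GVNSink.h"
//
//   void addGVNSink(llvm::FunctionPassManager &FPM) { // hypothetical helper
//     FPM.addPass(llvm::GVNSinkPass());
//   }
//
// which corresponds to `opt -passes=gvn-sink` on the command line.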
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
index 12363b373a..61eb4ce0ed 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/GuardWidening.cpp
@@ -1,881 +1,881 @@
-//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the guard widening pass. The semantics of the
-// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
-// more often than it did before the transform. This optimization is called
-// "widening" and can be used to hoist and common runtime checks in situations like
-// these:
-//
-// %cmp0 = 7 u< Length
-// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
-// call @unknown_side_effects()
-// %cmp1 = 9 u< Length
-// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
-// ...
-//
-// =>
-//
-// %cmp0 = 9 u< Length
-// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
-// call @unknown_side_effects()
-// ...
-//
-// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
-// generic implementation of the same function, which will have the correct
-// semantics from that point onward. It is always _legal_ to deoptimize (so
-// replacing %cmp0 with false is "correct"), though it may not always be
-// profitable to do so.
-//
-// NB! This pass is a work in progress. It hasn't been tuned to be "production
-// ready" yet. It is known to have quadratic running time and will not scale
-// to large numbers of guards.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/GuardWidening.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <functional>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "guard-widening"
-
-STATISTIC(GuardsEliminated, "Number of eliminated guards");
-STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches");
-
-static cl::opt<bool>
- WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden,
- cl::desc("Whether or not we should widen guards "
- "expressed as branches by widenable conditions"),
- cl::init(true));
-
-namespace {
-
-// Get the condition of \p I. It can either be a guard or a conditional branch.
-static Value *getCondition(Instruction *I) {
- if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
- assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
- "Bad guard intrinsic?");
- return GI->getArgOperand(0);
- }
- Value *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB))
- return Cond;
-
- return cast<BranchInst>(I)->getCondition();
-}
-
-// Set the condition for \p I to \p NewCond. \p I can either be a guard or a
-// conditional branch.
-static void setCondition(Instruction *I, Value *NewCond) {
- if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
- assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
- "Bad guard intrinsic?");
- GI->setArgOperand(0, NewCond);
- return;
- }
- cast<BranchInst>(I)->setCondition(NewCond);
-}
-
-// Eliminates the guard instruction properly.
-static void eliminateGuard(Instruction *GuardInst) {
- GuardInst->eraseFromParent();
- ++GuardsEliminated;
-}
-
-class GuardWideningImpl {
- DominatorTree &DT;
- PostDominatorTree *PDT;
- LoopInfo &LI;
-
- /// Together, these describe the region of interest. This might be all of
- /// the blocks within a function, or only a given loop's blocks and preheader.
- DomTreeNode *Root;
- std::function<bool(BasicBlock*)> BlockFilter;
-
- /// The set of guards and conditional branches whose conditions have been
- /// widened into dominating guards.
- SmallVector<Instruction *, 16> EliminatedGuardsAndBranches;
-
- /// The set of guards which have been widened to include conditions to other
- /// guards.
- DenseSet<Instruction *> WidenedGuards;
-
- /// Try to eliminate instruction \p Instr by widening it into an earlier
- /// dominating guard. \p DFSI is the DFS iterator on the dominator tree that
- /// is currently visiting the block containing \p Instr, and \p GuardsPerBlock
- /// maps BasicBlocks to the set of guards seen in that block.
- bool eliminateInstrViaWidening(
- Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
- const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
- GuardsPerBlock, bool InvertCondition = false);
-
- /// Used to keep track of which widening potential is more effective.
- enum WideningScore {
- /// Don't widen.
- WS_IllegalOrNegative,
-
- /// Widening is performance neutral as far as the cycles spent in check
- /// conditions goes (but can still help, e.g., code layout, having less
- /// deopt state).
- WS_Neutral,
-
- /// Widening is profitable.
- WS_Positive,
-
- /// Widening is very profitable. Not significantly different from \c
- /// WS_Positive, except by the order.
- WS_VeryPositive
- };
-
- static StringRef scoreTypeToString(WideningScore WS);
-
- /// Compute the score for widening the condition in \p DominatedInstr
- /// into \p DominatingGuard. If \p InvertCond is set, then we widen the
- /// inverted condition of the dominating guard.
- WideningScore computeWideningScore(Instruction *DominatedInstr,
- Instruction *DominatingGuard,
- bool InvertCond);
-
- /// Helper to check if \p V can be hoisted to \p InsertPos.
- bool isAvailableAt(const Value *V, const Instruction *InsertPos) const {
- SmallPtrSet<const Instruction *, 8> Visited;
- return isAvailableAt(V, InsertPos, Visited);
- }
-
- bool isAvailableAt(const Value *V, const Instruction *InsertPos,
- SmallPtrSetImpl<const Instruction *> &Visited) const;
-
- /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
- /// isAvailableAt returned true.
- void makeAvailableAt(Value *V, Instruction *InsertPos) const;
-
- /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
- /// to generate an expression computing the logical AND of \p Cond0 and (\p
- /// Cond1 XOR \p InvertCondition).
- /// Return true if the expression computing the AND is only as
- /// expensive as computing one of the two. If \p InsertPt is non-null then
- /// actually generate the resulting expression, make it available at \p
- /// InsertPt and return it in \p Result (else no change to the IR is made).
- bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
- Value *&Result, bool InvertCondition);
-
- /// Represents a range check of the form \c Base + \c Offset u< \c Length,
- /// with the constraint that \c Length is not negative. \c CheckInst is the
- /// pre-existing instruction in the IR that computes the result of this range
- /// check.
- class RangeCheck {
- const Value *Base;
- const ConstantInt *Offset;
- const Value *Length;
- ICmpInst *CheckInst;
-
- public:
- explicit RangeCheck(const Value *Base, const ConstantInt *Offset,
- const Value *Length, ICmpInst *CheckInst)
- : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
-
- void setBase(const Value *NewBase) { Base = NewBase; }
- void setOffset(const ConstantInt *NewOffset) { Offset = NewOffset; }
-
- const Value *getBase() const { return Base; }
- const ConstantInt *getOffset() const { return Offset; }
- const APInt &getOffsetValue() const { return getOffset()->getValue(); }
- const Value *getLength() const { return Length; };
- ICmpInst *getCheckInst() const { return CheckInst; }
-
- void print(raw_ostream &OS, bool PrintTypes = false) {
- OS << "Base: ";
- Base->printAsOperand(OS, PrintTypes);
- OS << " Offset: ";
- Offset->printAsOperand(OS, PrintTypes);
- OS << " Length: ";
- Length->printAsOperand(OS, PrintTypes);
- }
-
- LLVM_DUMP_METHOD void dump() {
- print(dbgs());
- dbgs() << "\n";
- }
- };
-
- /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and
- /// append them to \p Checks. Returns true on success, may clobber \c Checks
- /// on failure.
- bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
- SmallPtrSet<const Value *, 8> Visited;
- return parseRangeChecks(CheckCond, Checks, Visited);
- }
-
- bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
- SmallPtrSetImpl<const Value *> &Visited);
-
- /// Combine the checks in \p Checks into a smaller set of checks and append
- /// them into \p CombinedChecks. Return true on success (i.e. all of the checks
- /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
- /// and \p CombinedChecks on success and on failure.
- bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
- SmallVectorImpl<RangeCheck> &CombinedChecks) const;
-
- /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
- /// computing only one of the two expressions?
- bool isWideningCondProfitable(Value *Cond0, Value *Cond1, bool InvertCond) {
- Value *ResultUnused;
- return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused,
- InvertCond);
- }
-
- /// If \p InvertCondition is false, widen \p ToWiden to fail if
- /// \p NewCondition is false, otherwise make it fail if \p NewCondition is
- /// true (in addition to whatever it is already checking).
- void widenGuard(Instruction *ToWiden, Value *NewCondition,
- bool InvertCondition) {
- Value *Result;
-
- widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result,
- InvertCondition);
- if (isGuardAsWidenableBranch(ToWiden)) {
- setWidenableBranchCond(cast<BranchInst>(ToWiden), Result);
- return;
- }
- setCondition(ToWiden, Result);
- }
-
-public:
-
- explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
- LoopInfo &LI, DomTreeNode *Root,
- std::function<bool(BasicBlock*)> BlockFilter)
- : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter)
- {}
-
- /// The entry point for this pass.
- bool run();
-};
-}
-
-static bool isSupportedGuardInstruction(const Instruction *Insn) {
- if (isGuard(Insn))
- return true;
- if (WidenBranchGuards && isGuardAsWidenableBranch(Insn))
- return true;
- return false;
-}
-
-bool GuardWideningImpl::run() {
- DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock;
- bool Changed = false;
- for (auto DFI = df_begin(Root), DFE = df_end(Root);
- DFI != DFE; ++DFI) {
- auto *BB = (*DFI)->getBlock();
- if (!BlockFilter(BB))
- continue;
-
- auto &CurrentList = GuardsInBlock[BB];
-
- for (auto &I : *BB)
- if (isSupportedGuardInstruction(&I))
- CurrentList.push_back(cast<Instruction>(&I));
-
- for (auto *II : CurrentList)
- Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock);
- }
-
- assert(EliminatedGuardsAndBranches.empty() || Changed);
- for (auto *I : EliminatedGuardsAndBranches)
- if (!WidenedGuards.count(I)) {
- assert(isa<ConstantInt>(getCondition(I)) && "Should be!");
- if (isSupportedGuardInstruction(I))
- eliminateGuard(I);
- else {
- assert(isa<BranchInst>(I) &&
- "Eliminated something other than guard or branch?");
- ++CondBranchEliminated;
- }
- }
-
- return Changed;
-}
-
-bool GuardWideningImpl::eliminateInstrViaWidening(
- Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
- const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
- GuardsInBlock, bool InvertCondition) {
- // Ignore trivial true or false conditions. These instructions will be
- // trivially eliminated by any cleanup pass. Do not erase them because other
- // guards can possibly be widened into them.
- if (isa<ConstantInt>(getCondition(Instr)))
- return false;
-
- Instruction *BestSoFar = nullptr;
- auto BestScoreSoFar = WS_IllegalOrNegative;
-
- // In the set of dominating guards, find the one we can merge GuardInst with
- // for the most profit.
- for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
- auto *CurBB = DFSI.getPath(i)->getBlock();
- if (!BlockFilter(CurBB))
- break;
- assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
- const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
-
- auto I = GuardsInCurBB.begin();
+//===- GuardWidening.cpp - ---- Guard widening ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the guard widening pass. The semantics of the
+// @llvm.experimental.guard intrinsic lets LLVM transform it so that it fails
+// more often than it did before the transform. This optimization is called
+// "widening" and can be used to hoist and common runtime checks in situations like
+// these:
+//
+// %cmp0 = 7 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// %cmp1 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp1) [ "deopt"(...) ]
+// ...
+//
+// =>
+//
+// %cmp0 = 9 u< Length
+// call @llvm.experimental.guard(i1 %cmp0) [ "deopt"(...) ]
+// call @unknown_side_effects()
+// ...
+//
+// If %cmp0 is false, @llvm.experimental.guard will "deoptimize" back to a
+// generic implementation of the same function, which will have the correct
+// semantics from that point onward. It is always _legal_ to deoptimize (so
+// replacing %cmp0 with false is "correct"), though it may not always be
+// profitable to do so.
+//
+// NB! This pass is a work in progress. It hasn't been tuned to be "production
+// ready" yet. It is known to have quadratic running time and will not scale
+// to large numbers of guards.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/GuardWidening.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <functional>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "guard-widening"
+
+STATISTIC(GuardsEliminated, "Number of eliminated guards");
+STATISTIC(CondBranchEliminated, "Number of eliminated conditional branches");
+
+static cl::opt<bool>
+ WidenBranchGuards("guard-widening-widen-branch-guards", cl::Hidden,
+ cl::desc("Whether or not we should widen guards "
+ "expressed as branches by widenable conditions"),
+ cl::init(true));
+
+namespace {
+
+// Get the condition of \p I. It can either be a guard or a conditional branch.
+static Value *getCondition(Instruction *I) {
+ if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
+ assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
+ "Bad guard intrinsic?");
+ return GI->getArgOperand(0);
+ }
+ Value *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(I, Cond, WC, IfTrueBB, IfFalseBB))
+ return Cond;
+
+ return cast<BranchInst>(I)->getCondition();
+}
+
+// Set the condition for \p I to \p NewCond. \p I can either be a guard or a
+// conditional branch.
+static void setCondition(Instruction *I, Value *NewCond) {
+ if (IntrinsicInst *GI = dyn_cast<IntrinsicInst>(I)) {
+ assert(GI->getIntrinsicID() == Intrinsic::experimental_guard &&
+ "Bad guard intrinsic?");
+ GI->setArgOperand(0, NewCond);
+ return;
+ }
+ cast<BranchInst>(I)->setCondition(NewCond);
+}
+
+// Eliminates the guard instruction properly.
+static void eliminateGuard(Instruction *GuardInst) {
+ GuardInst->eraseFromParent();
+ ++GuardsEliminated;
+}
+
+class GuardWideningImpl {
+ DominatorTree &DT;
+ PostDominatorTree *PDT;
+ LoopInfo &LI;
+
+ /// Together, these describe the region of interest. This might be all of
+ /// the blocks within a function, or only a given loop's blocks and preheader.
+ DomTreeNode *Root;
+ std::function<bool(BasicBlock*)> BlockFilter;
+
+ /// The set of guards and conditional branches whose conditions have been
+ /// widened into dominating guards.
+ SmallVector<Instruction *, 16> EliminatedGuardsAndBranches;
+
+ /// The set of guards which have been widened to include conditions to other
+ /// guards.
+ DenseSet<Instruction *> WidenedGuards;
+
+ /// Try to eliminate instruction \p Instr by widening it into an earlier
+ /// dominating guard. \p DFSI is the DFS iterator on the dominator tree that
+ /// is currently visiting the block containing \p Instr, and \p GuardsPerBlock
+ /// maps BasicBlocks to the set of guards seen in that block.
+ bool eliminateInstrViaWidening(
+ Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
+ GuardsPerBlock, bool InvertCondition = false);
+
+ /// Used to keep track of which widening potential is more effective.
+ enum WideningScore {
+ /// Don't widen.
+ WS_IllegalOrNegative,
+
+ /// Widening is performance neutral as far as the cycles spent in check
+ /// conditions goes (but can still help, e.g., code layout, having less
+ /// deopt state).
+ WS_Neutral,
+
+ /// Widening is profitable.
+ WS_Positive,
+
+ /// Widening is very profitable. Not significantly different from \c
+ /// WS_Positive, except by the order.
+ WS_VeryPositive
+ };
+
+ static StringRef scoreTypeToString(WideningScore WS);
+
+ /// Compute the score for widening the condition in \p DominatedInstr
+ /// into \p DominatingGuard. If \p InvertCond is set, then we widen the
+ /// inverted condition of the dominating guard.
+ WideningScore computeWideningScore(Instruction *DominatedInstr,
+ Instruction *DominatingGuard,
+ bool InvertCond);
+
+ /// Helper to check if \p V can be hoisted to \p InsertPos.
+ bool isAvailableAt(const Value *V, const Instruction *InsertPos) const {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ return isAvailableAt(V, InsertPos, Visited);
+ }
+
+ bool isAvailableAt(const Value *V, const Instruction *InsertPos,
+ SmallPtrSetImpl<const Instruction *> &Visited) const;
+
+ /// Helper to hoist \p V to \p InsertPos. Guaranteed to succeed if \c
+ /// isAvailableAt returned true.
+ void makeAvailableAt(Value *V, Instruction *InsertPos) const;
+
+ /// Common helper used by \c widenGuard and \c isWideningCondProfitable. Try
+ /// to generate an expression computing the logical AND of \p Cond0 and (\p
+ /// Cond1 XOR \p InvertCondition).
+ /// Return true if the expression computing the AND is only as
+ /// expensive as computing one of the two. If \p InsertPt is non-null then
+ /// actually generate the resulting expression, make it available at \p
+ /// InsertPt and return it in \p Result (else no change to the IR is made).
+ bool widenCondCommon(Value *Cond0, Value *Cond1, Instruction *InsertPt,
+ Value *&Result, bool InvertCondition);
+
+ /// Represents a range check of the form \c Base + \c Offset u< \c Length,
+ /// with the constraint that \c Length is not negative. \c CheckInst is the
+ /// pre-existing instruction in the IR that computes the result of this range
+ /// check.
+ class RangeCheck {
+ const Value *Base;
+ const ConstantInt *Offset;
+ const Value *Length;
+ ICmpInst *CheckInst;
+
+ public:
+ explicit RangeCheck(const Value *Base, const ConstantInt *Offset,
+ const Value *Length, ICmpInst *CheckInst)
+ : Base(Base), Offset(Offset), Length(Length), CheckInst(CheckInst) {}
+
+ void setBase(const Value *NewBase) { Base = NewBase; }
+ void setOffset(const ConstantInt *NewOffset) { Offset = NewOffset; }
+
+ const Value *getBase() const { return Base; }
+ const ConstantInt *getOffset() const { return Offset; }
+ const APInt &getOffsetValue() const { return getOffset()->getValue(); }
+ const Value *getLength() const { return Length; };
+ ICmpInst *getCheckInst() const { return CheckInst; }
+
+ void print(raw_ostream &OS, bool PrintTypes = false) {
+ OS << "Base: ";
+ Base->printAsOperand(OS, PrintTypes);
+ OS << " Offset: ";
+ Offset->printAsOperand(OS, PrintTypes);
+ OS << " Length: ";
+ Length->printAsOperand(OS, PrintTypes);
+ }
+
+ LLVM_DUMP_METHOD void dump() {
+ print(dbgs());
+ dbgs() << "\n";
+ }
+ };
+
+ /// Parse \p CheckCond into a conjunction (logical-and) of range checks; and
+ /// append them to \p Checks. Returns true on success, may clobber \c Checks
+ /// on failure.
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks) {
+ SmallPtrSet<const Value *, 8> Visited;
+ return parseRangeChecks(CheckCond, Checks, Visited);
+ }
+
+ bool parseRangeChecks(Value *CheckCond, SmallVectorImpl<RangeCheck> &Checks,
+ SmallPtrSetImpl<const Value *> &Visited);
+
+ /// Combine the checks in \p Checks into a smaller set of checks and append
+ /// them into \p CombinedChecks. Return true on success (i.e. all of the checks
+ /// in \p Checks were combined into \p CombinedChecks). Clobbers \p Checks
+ /// and \p CombinedChecks on success and on failure.
+ bool combineRangeChecks(SmallVectorImpl<RangeCheck> &Checks,
+ SmallVectorImpl<RangeCheck> &CombinedChecks) const;
+
+ /// Can we compute the logical AND of \p Cond0 and \p Cond1 for the price of
+ /// computing only one of the two expressions?
+ bool isWideningCondProfitable(Value *Cond0, Value *Cond1, bool InvertCond) {
+ Value *ResultUnused;
+ return widenCondCommon(Cond0, Cond1, /*InsertPt=*/nullptr, ResultUnused,
+ InvertCond);
+ }
+
+ /// If \p InvertCondition is false, widen \p ToWiden to fail if
+ /// \p NewCondition is false, otherwise make it fail if \p NewCondition is
+ /// true (in addition to whatever it is already checking).
+ void widenGuard(Instruction *ToWiden, Value *NewCondition,
+ bool InvertCondition) {
+ Value *Result;
+
+ widenCondCommon(getCondition(ToWiden), NewCondition, ToWiden, Result,
+ InvertCondition);
+ if (isGuardAsWidenableBranch(ToWiden)) {
+ setWidenableBranchCond(cast<BranchInst>(ToWiden), Result);
+ return;
+ }
+ setCondition(ToWiden, Result);
+ }
+
+public:
+
+ explicit GuardWideningImpl(DominatorTree &DT, PostDominatorTree *PDT,
+ LoopInfo &LI, DomTreeNode *Root,
+ std::function<bool(BasicBlock*)> BlockFilter)
+ : DT(DT), PDT(PDT), LI(LI), Root(Root), BlockFilter(BlockFilter)
+ {}
+
+ /// The entry point for this pass.
+ bool run();
+};
+}
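// Illustration (a sketch with made-up values): the ConstantRange fast path of
// widenCondCommon merges two compares of the same value against constants
// into a single check, so widening
//
//   %c0 = icmp ult i64 %len, 9      ; dominating guard's condition
//   %c1 = icmp ult i64 %len, 7      ; dominated guard's condition
//
// yields one combined condition
//
//   %wide.chk = icmp ult i64 %len, 7
//
// because [0, 7) is exactly the intersection of [0, 9) and [0, 7); the
// dominated guard's own condition is then replaced by a constant in
// eliminateInstrViaWidening below.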
+
+static bool isSupportedGuardInstruction(const Instruction *Insn) {
+ if (isGuard(Insn))
+ return true;
+ if (WidenBranchGuards && isGuardAsWidenableBranch(Insn))
+ return true;
+ return false;
+}
+
+bool GuardWideningImpl::run() {
+ DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> GuardsInBlock;
+ bool Changed = false;
+ for (auto DFI = df_begin(Root), DFE = df_end(Root);
+ DFI != DFE; ++DFI) {
+ auto *BB = (*DFI)->getBlock();
+ if (!BlockFilter(BB))
+ continue;
+
+ auto &CurrentList = GuardsInBlock[BB];
+
+ for (auto &I : *BB)
+ if (isSupportedGuardInstruction(&I))
+ CurrentList.push_back(cast<Instruction>(&I));
+
+ for (auto *II : CurrentList)
+ Changed |= eliminateInstrViaWidening(II, DFI, GuardsInBlock);
+ }
+
+ assert(EliminatedGuardsAndBranches.empty() || Changed);
+ for (auto *I : EliminatedGuardsAndBranches)
+ if (!WidenedGuards.count(I)) {
+ assert(isa<ConstantInt>(getCondition(I)) && "Should be!");
+ if (isSupportedGuardInstruction(I))
+ eliminateGuard(I);
+ else {
+ assert(isa<BranchInst>(I) &&
+ "Eliminated something other than guard or branch?");
+ ++CondBranchEliminated;
+ }
+ }
+
+ return Changed;
+}
+
+bool GuardWideningImpl::eliminateInstrViaWidening(
+ Instruction *Instr, const df_iterator<DomTreeNode *> &DFSI,
+ const DenseMap<BasicBlock *, SmallVector<Instruction *, 8>> &
+ GuardsInBlock, bool InvertCondition) {
+ // Ignore trivial true or false conditions. These instructions will be
+ // trivially eliminated by any cleanup pass. Do not erase them because other
+ // guards can possibly be widened into them.
+ if (isa<ConstantInt>(getCondition(Instr)))
+ return false;
+
+ Instruction *BestSoFar = nullptr;
+ auto BestScoreSoFar = WS_IllegalOrNegative;
+
+ // In the set of dominating guards, find the one we can merge GuardInst with
+ // for the most profit.
+ for (unsigned i = 0, e = DFSI.getPathLength(); i != e; ++i) {
+ auto *CurBB = DFSI.getPath(i)->getBlock();
+ if (!BlockFilter(CurBB))
+ break;
+ assert(GuardsInBlock.count(CurBB) && "Must have been populated by now!");
+ const auto &GuardsInCurBB = GuardsInBlock.find(CurBB)->second;
+
+ auto I = GuardsInCurBB.begin();
auto E = Instr->getParent() == CurBB ? find(GuardsInCurBB, Instr)
: GuardsInCurBB.end();
-
-#ifndef NDEBUG
- {
- unsigned Index = 0;
- for (auto &I : *CurBB) {
- if (Index == GuardsInCurBB.size())
- break;
- if (GuardsInCurBB[Index] == &I)
- Index++;
- }
- assert(Index == GuardsInCurBB.size() &&
- "Guards expected to be in order!");
- }
-#endif
-
- assert((i == (e - 1)) == (Instr->getParent() == CurBB) && "Bad DFS?");
-
- for (auto *Candidate : make_range(I, E)) {
- auto Score = computeWideningScore(Instr, Candidate, InvertCondition);
- LLVM_DEBUG(dbgs() << "Score between " << *getCondition(Instr)
- << " and " << *getCondition(Candidate) << " is "
- << scoreTypeToString(Score) << "\n");
- if (Score > BestScoreSoFar) {
- BestScoreSoFar = Score;
- BestSoFar = Candidate;
- }
- }
- }
-
- if (BestScoreSoFar == WS_IllegalOrNegative) {
- LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *Instr << "\n");
- return false;
- }
-
- assert(BestSoFar != Instr && "Should have never visited same guard!");
- assert(DT.dominates(BestSoFar, Instr) && "Should be!");
-
- LLVM_DEBUG(dbgs() << "Widening " << *Instr << " into " << *BestSoFar
- << " with score " << scoreTypeToString(BestScoreSoFar)
- << "\n");
- widenGuard(BestSoFar, getCondition(Instr), InvertCondition);
- auto NewGuardCondition = InvertCondition
- ? ConstantInt::getFalse(Instr->getContext())
- : ConstantInt::getTrue(Instr->getContext());
- setCondition(Instr, NewGuardCondition);
- EliminatedGuardsAndBranches.push_back(Instr);
- WidenedGuards.insert(BestSoFar);
- return true;
-}
-
-GuardWideningImpl::WideningScore
-GuardWideningImpl::computeWideningScore(Instruction *DominatedInstr,
- Instruction *DominatingGuard,
- bool InvertCond) {
- Loop *DominatedInstrLoop = LI.getLoopFor(DominatedInstr->getParent());
- Loop *DominatingGuardLoop = LI.getLoopFor(DominatingGuard->getParent());
- bool HoistingOutOfLoop = false;
-
- if (DominatingGuardLoop != DominatedInstrLoop) {
- // Be conservative and don't widen into a sibling loop. TODO: If the
- // sibling is colder, we should consider allowing this.
- if (DominatingGuardLoop &&
- !DominatingGuardLoop->contains(DominatedInstrLoop))
- return WS_IllegalOrNegative;
-
- HoistingOutOfLoop = true;
- }
-
- if (!isAvailableAt(getCondition(DominatedInstr), DominatingGuard))
- return WS_IllegalOrNegative;
-
- // If the guard was conditionally executed, it may never be reached
- // dynamically. There are two potential downsides to hoisting it out of the
- // conditionally executed region: 1) we may spuriously deopt without need and
- // 2) we have the extra cost of computing the guard condition in the common
- // case. At the moment, we really only consider the second in our heuristic
- // here. TODO: evaluate cost model for spurious deopt
- // NOTE: As written, this also lets us hoist right over another guard which
- // is essentially just another spelling for control flow.
- if (isWideningCondProfitable(getCondition(DominatedInstr),
- getCondition(DominatingGuard), InvertCond))
- return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
-
- if (HoistingOutOfLoop)
- return WS_Positive;
-
- // Returns true if we might be hoisting above explicit control flow. Note
- // that this completely ignores implicit control flow (guards, calls which
- // throw, etc...). That choice appears arbitrary.
- auto MaybeHoistingOutOfIf = [&]() {
- auto *DominatingBlock = DominatingGuard->getParent();
- auto *DominatedBlock = DominatedInstr->getParent();
- if (isGuardAsWidenableBranch(DominatingGuard))
- DominatingBlock = cast<BranchInst>(DominatingGuard)->getSuccessor(0);
-
- // Same Block?
- if (DominatedBlock == DominatingBlock)
- return false;
- // Obvious successor (common loop header/preheader case)
- if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
- return false;
- // TODO: diamond, triangle cases
- if (!PDT) return true;
- return !PDT->dominates(DominatedBlock, DominatingBlock);
- };
-
- return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
-}
-
-bool GuardWideningImpl::isAvailableAt(
- const Value *V, const Instruction *Loc,
- SmallPtrSetImpl<const Instruction *> &Visited) const {
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
- return true;
-
- if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
- Inst->mayReadFromMemory())
- return false;
-
- Visited.insert(Inst);
-
- // We only want to go _up_ the dominance chain when recursing.
- assert(!isa<PHINode>(Loc) &&
- "PHIs should return false for isSafeToSpeculativelyExecute");
- assert(DT.isReachableFromEntry(Inst->getParent()) &&
- "We did a DFS from the block entry!");
- return all_of(Inst->operands(),
- [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
-}
-
-void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const {
- auto *Inst = dyn_cast<Instruction>(V);
- if (!Inst || DT.dominates(Inst, Loc))
- return;
-
- assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
- !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
-
- for (Value *Op : Inst->operands())
- makeAvailableAt(Op, Loc);
-
- Inst->moveBefore(Loc);
-}
-
-bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
- Instruction *InsertPt, Value *&Result,
- bool InvertCondition) {
- using namespace llvm::PatternMatch;
-
- {
- // L >u C0 && L >u C1 -> L >u max(C0, C1)
- ConstantInt *RHS0, *RHS1;
- Value *LHS;
- ICmpInst::Predicate Pred0, Pred1;
- if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
- match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
- if (InvertCondition)
- Pred1 = ICmpInst::getInversePredicate(Pred1);
-
- ConstantRange CR0 =
- ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
- ConstantRange CR1 =
- ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
-
- // SubsetIntersect is a subset of the actual mathematical intersection of
- // CR0 and CR1, while SupersetIntersect is a superset of the actual
- // mathematical intersection. If these two ConstantRanges are equal, then
- // we know we were able to represent the actual mathematical intersection
- // of CR0 and CR1, and can use the same to generate an icmp instruction.
- //
- // Given what we're doing here and the semantics of guards, it would
- // actually be correct to just use SubsetIntersect, but that may be too
- // aggressive in cases we care about.
- auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
- auto SupersetIntersect = CR0.intersectWith(CR1);
-
- APInt NewRHSAP;
- CmpInst::Predicate Pred;
- if (SubsetIntersect == SupersetIntersect &&
- SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
- if (InsertPt) {
- ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
- Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
- }
- return true;
- }
- }
- }
-
- {
- SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
- // TODO: Support InvertCondition case?
- if (!InvertCondition &&
- parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
- combineRangeChecks(Checks, CombinedChecks)) {
- if (InsertPt) {
- Result = nullptr;
- for (auto &RC : CombinedChecks) {
- makeAvailableAt(RC.getCheckInst(), InsertPt);
- if (Result)
- Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
- InsertPt);
- else
- Result = RC.getCheckInst();
- }
- assert(Result && "Failed to find result value");
- Result->setName("wide.chk");
- }
- return true;
- }
- }
-
- // Base case -- just logical-and the two conditions together.
-
- if (InsertPt) {
- makeAvailableAt(Cond0, InsertPt);
- makeAvailableAt(Cond1, InsertPt);
- if (InvertCondition)
- Cond1 = BinaryOperator::CreateNot(Cond1, "inverted", InsertPt);
- Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
- }
-
- // We were not able to compute Cond0 AND Cond1 for the price of one.
- return false;
-}
-
-bool GuardWideningImpl::parseRangeChecks(
- Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
- SmallPtrSetImpl<const Value *> &Visited) {
- if (!Visited.insert(CheckCond).second)
- return true;
-
- using namespace llvm::PatternMatch;
-
- {
- Value *AndLHS, *AndRHS;
- if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
- return parseRangeChecks(AndLHS, Checks) &&
- parseRangeChecks(AndRHS, Checks);
- }
-
- auto *IC = dyn_cast<ICmpInst>(CheckCond);
- if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
- (IC->getPredicate() != ICmpInst::ICMP_ULT &&
- IC->getPredicate() != ICmpInst::ICMP_UGT))
- return false;
-
- const Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
- if (IC->getPredicate() == ICmpInst::ICMP_UGT)
- std::swap(CmpLHS, CmpRHS);
-
- auto &DL = IC->getModule()->getDataLayout();
-
- GuardWideningImpl::RangeCheck Check(
- CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
- CmpRHS, IC);
-
- if (!isKnownNonNegative(Check.getLength(), DL))
- return false;
-
- // What we have in \c Check now is a correct interpretation of \p CheckCond.
- // Try to see if we can move some constant offsets into the \c Offset field.
-
- bool Changed;
- auto &Ctx = CheckCond->getContext();
-
- do {
- Value *OpLHS;
- ConstantInt *OpRHS;
- Changed = false;
-
-#ifndef NDEBUG
- auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
- assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
- "Unreachable instruction?");
-#endif
-
- if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- Check.setBase(OpLHS);
- APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
- Check.setOffset(ConstantInt::get(Ctx, NewOffset));
- Changed = true;
- } else if (match(Check.getBase(),
- m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
- KnownBits Known = computeKnownBits(OpLHS, DL);
- if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
- Check.setBase(OpLHS);
- APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
- Check.setOffset(ConstantInt::get(Ctx, NewOffset));
- Changed = true;
- }
- }
- } while (Changed);
-
- Checks.push_back(Check);
- return true;
-}
-
-bool GuardWideningImpl::combineRangeChecks(
- SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
- SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) const {
- unsigned OldCount = Checks.size();
- while (!Checks.empty()) {
- // Pick all of the range checks with a specific base and length, and try to
- // merge them.
- const Value *CurrentBase = Checks.front().getBase();
- const Value *CurrentLength = Checks.front().getLength();
-
- SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
-
- auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
- return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
- };
-
- copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
+
+#ifndef NDEBUG
+ {
+ unsigned Index = 0;
+ for (auto &I : *CurBB) {
+ if (Index == GuardsInCurBB.size())
+ break;
+ if (GuardsInCurBB[Index] == &I)
+ Index++;
+ }
+ assert(Index == GuardsInCurBB.size() &&
+ "Guards expected to be in order!");
+ }
+#endif
+
+ assert((i == (e - 1)) == (Instr->getParent() == CurBB) && "Bad DFS?");
+
+ for (auto *Candidate : make_range(I, E)) {
+ auto Score = computeWideningScore(Instr, Candidate, InvertCondition);
+ LLVM_DEBUG(dbgs() << "Score between " << *getCondition(Instr)
+ << " and " << *getCondition(Candidate) << " is "
+ << scoreTypeToString(Score) << "\n");
+ if (Score > BestScoreSoFar) {
+ BestScoreSoFar = Score;
+ BestSoFar = Candidate;
+ }
+ }
+ }
+
+ if (BestScoreSoFar == WS_IllegalOrNegative) {
+ LLVM_DEBUG(dbgs() << "Did not eliminate guard " << *Instr << "\n");
+ return false;
+ }
+
+ assert(BestSoFar != Instr && "Should have never visited same guard!");
+ assert(DT.dominates(BestSoFar, Instr) && "Should be!");
+
+ LLVM_DEBUG(dbgs() << "Widening " << *Instr << " into " << *BestSoFar
+ << " with score " << scoreTypeToString(BestScoreSoFar)
+ << "\n");
+ widenGuard(BestSoFar, getCondition(Instr), InvertCondition);
+ auto NewGuardCondition = InvertCondition
+ ? ConstantInt::getFalse(Instr->getContext())
+ : ConstantInt::getTrue(Instr->getContext());
+ setCondition(Instr, NewGuardCondition);
+ EliminatedGuardsAndBranches.push_back(Instr);
+ WidenedGuards.insert(BestSoFar);
+ return true;
+}
+
+GuardWideningImpl::WideningScore
+GuardWideningImpl::computeWideningScore(Instruction *DominatedInstr,
+ Instruction *DominatingGuard,
+ bool InvertCond) {
+ Loop *DominatedInstrLoop = LI.getLoopFor(DominatedInstr->getParent());
+ Loop *DominatingGuardLoop = LI.getLoopFor(DominatingGuard->getParent());
+ bool HoistingOutOfLoop = false;
+
+ if (DominatingGuardLoop != DominatedInstrLoop) {
+ // Be conservative and don't widen into a sibling loop. TODO: If the
+ // sibling is colder, we should consider allowing this.
+ if (DominatingGuardLoop &&
+ !DominatingGuardLoop->contains(DominatedInstrLoop))
+ return WS_IllegalOrNegative;
+
+ HoistingOutOfLoop = true;
+ }
+
+ if (!isAvailableAt(getCondition(DominatedInstr), DominatingGuard))
+ return WS_IllegalOrNegative;
+
+  // If the guard was conditionally executed, it may never be reached
+ // dynamically. There are two potential downsides to hoisting it out of the
+ // conditionally executed region: 1) we may spuriously deopt without need and
+ // 2) we have the extra cost of computing the guard condition in the common
+ // case. At the moment, we really only consider the second in our heuristic
+ // here. TODO: evaluate cost model for spurious deopt
+ // NOTE: As written, this also lets us hoist right over another guard which
+ // is essentially just another spelling for control flow.
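+  // For example, if this guard checks (len u> 8) and the dominating guard
+  // checks (len u> 16), the two conditions fold into the single check
+  // (len u> 16), so hoisting adds no extra work on the common path.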
+ if (isWideningCondProfitable(getCondition(DominatedInstr),
+ getCondition(DominatingGuard), InvertCond))
+ return HoistingOutOfLoop ? WS_VeryPositive : WS_Positive;
+
+ if (HoistingOutOfLoop)
+ return WS_Positive;
+
+ // Returns true if we might be hoisting above explicit control flow. Note
+ // that this completely ignores implicit control flow (guards, calls which
+ // throw, etc...). That choice appears arbitrary.
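+  // For example, a guard sitting in one arm of an if below the dominating
+  // guard does not post-dominate the dominating block, so the check below
+  // reports true and the guard is scored WS_IllegalOrNegative; a dominated
+  // block that post-dominates the dominating block scores WS_Neutral.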
+ auto MaybeHoistingOutOfIf = [&]() {
+ auto *DominatingBlock = DominatingGuard->getParent();
+ auto *DominatedBlock = DominatedInstr->getParent();
+ if (isGuardAsWidenableBranch(DominatingGuard))
+ DominatingBlock = cast<BranchInst>(DominatingGuard)->getSuccessor(0);
+
+ // Same Block?
+ if (DominatedBlock == DominatingBlock)
+ return false;
+ // Obvious successor (common loop header/preheader case)
+ if (DominatedBlock == DominatingBlock->getUniqueSuccessor())
+ return false;
+ // TODO: diamond, triangle cases
+ if (!PDT) return true;
+ return !PDT->dominates(DominatedBlock, DominatingBlock);
+ };
+
+ return MaybeHoistingOutOfIf() ? WS_IllegalOrNegative : WS_Neutral;
+}
+
+bool GuardWideningImpl::isAvailableAt(
+ const Value *V, const Instruction *Loc,
+ SmallPtrSetImpl<const Instruction *> &Visited) const {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc) || Visited.count(Inst))
+ return true;
+
+ if (!isSafeToSpeculativelyExecute(Inst, Loc, &DT) ||
+ Inst->mayReadFromMemory())
+ return false;
+
+ Visited.insert(Inst);
+
+ // We only want to go _up_ the dominance chain when recursing.
+ assert(!isa<PHINode>(Loc) &&
+ "PHIs should return false for isSafeToSpeculativelyExecute");
+ assert(DT.isReachableFromEntry(Inst->getParent()) &&
+ "We did a DFS from the block entry!");
+ return all_of(Inst->operands(),
+ [&](Value *Op) { return isAvailableAt(Op, Loc, Visited); });
+}
+
+void GuardWideningImpl::makeAvailableAt(Value *V, Instruction *Loc) const {
+ auto *Inst = dyn_cast<Instruction>(V);
+ if (!Inst || DT.dominates(Inst, Loc))
+ return;
+
+ assert(isSafeToSpeculativelyExecute(Inst, Loc, &DT) &&
+ !Inst->mayReadFromMemory() && "Should've checked with isAvailableAt!");
+
+ for (Value *Op : Inst->operands())
+ makeAvailableAt(Op, Loc);
+
+ Inst->moveBefore(Loc);
+}
+
+bool GuardWideningImpl::widenCondCommon(Value *Cond0, Value *Cond1,
+ Instruction *InsertPt, Value *&Result,
+ bool InvertCondition) {
+ using namespace llvm::PatternMatch;
+
+ {
+ // L >u C0 && L >u C1 -> L >u max(C0, C1)
+ ConstantInt *RHS0, *RHS1;
+ Value *LHS;
+ ICmpInst::Predicate Pred0, Pred1;
+ if (match(Cond0, m_ICmp(Pred0, m_Value(LHS), m_ConstantInt(RHS0))) &&
+ match(Cond1, m_ICmp(Pred1, m_Specific(LHS), m_ConstantInt(RHS1)))) {
+ if (InvertCondition)
+ Pred1 = ICmpInst::getInversePredicate(Pred1);
+
+ ConstantRange CR0 =
+ ConstantRange::makeExactICmpRegion(Pred0, RHS0->getValue());
+ ConstantRange CR1 =
+ ConstantRange::makeExactICmpRegion(Pred1, RHS1->getValue());
+
+ // SubsetIntersect is a subset of the actual mathematical intersection of
+ // CR0 and CR1, while SupersetIntersect is a superset of the actual
+ // mathematical intersection. If these two ConstantRanges are equal, then
+ // we know we were able to represent the actual mathematical intersection
+ // of CR0 and CR1, and can use the same to generate an icmp instruction.
+ //
+ // Given what we're doing here and the semantics of guards, it would
+ // actually be correct to just use SubsetIntersect, but that may be too
+ // aggressive in cases we care about.
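+      // For example, for CR0 = [0, 14) (i.e. x u< 14) and CR1 = [0, 10), both
+      // intersections are [0, 10), so the two checks collapse into the single
+      // icmp x u< 10.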
+ auto SubsetIntersect = CR0.inverse().unionWith(CR1.inverse()).inverse();
+ auto SupersetIntersect = CR0.intersectWith(CR1);
+
+ APInt NewRHSAP;
+ CmpInst::Predicate Pred;
+ if (SubsetIntersect == SupersetIntersect &&
+ SubsetIntersect.getEquivalentICmp(Pred, NewRHSAP)) {
+ if (InsertPt) {
+ ConstantInt *NewRHS = ConstantInt::get(Cond0->getContext(), NewRHSAP);
+ Result = new ICmpInst(InsertPt, Pred, LHS, NewRHS, "wide.chk");
+ }
+ return true;
+ }
+ }
+ }
+
+ {
+ SmallVector<GuardWideningImpl::RangeCheck, 4> Checks, CombinedChecks;
+ // TODO: Support InvertCondition case?
+ if (!InvertCondition &&
+ parseRangeChecks(Cond0, Checks) && parseRangeChecks(Cond1, Checks) &&
+ combineRangeChecks(Checks, CombinedChecks)) {
+ if (InsertPt) {
+ Result = nullptr;
+ for (auto &RC : CombinedChecks) {
+ makeAvailableAt(RC.getCheckInst(), InsertPt);
+ if (Result)
+ Result = BinaryOperator::CreateAnd(RC.getCheckInst(), Result, "",
+ InsertPt);
+ else
+ Result = RC.getCheckInst();
+ }
+ assert(Result && "Failed to find result value");
+ Result->setName("wide.chk");
+ }
+ return true;
+ }
+ }
+
+ // Base case -- just logical-and the two conditions together.
+
+ if (InsertPt) {
+ makeAvailableAt(Cond0, InsertPt);
+ makeAvailableAt(Cond1, InsertPt);
+ if (InvertCondition)
+ Cond1 = BinaryOperator::CreateNot(Cond1, "inverted", InsertPt);
+ Result = BinaryOperator::CreateAnd(Cond0, Cond1, "wide.chk", InsertPt);
+ }
+
+ // We were not able to compute Cond0 AND Cond1 for the price of one.
+ return false;
+}
+
+bool GuardWideningImpl::parseRangeChecks(
+ Value *CheckCond, SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ if (!Visited.insert(CheckCond).second)
+ return true;
+
+ using namespace llvm::PatternMatch;
+
+ {
+ Value *AndLHS, *AndRHS;
+ if (match(CheckCond, m_And(m_Value(AndLHS), m_Value(AndRHS))))
+ return parseRangeChecks(AndLHS, Checks) &&
+ parseRangeChecks(AndRHS, Checks);
+ }
+
+ auto *IC = dyn_cast<ICmpInst>(CheckCond);
+ if (!IC || !IC->getOperand(0)->getType()->isIntegerTy() ||
+ (IC->getPredicate() != ICmpInst::ICMP_ULT &&
+ IC->getPredicate() != ICmpInst::ICMP_UGT))
+ return false;
+
+ const Value *CmpLHS = IC->getOperand(0), *CmpRHS = IC->getOperand(1);
+ if (IC->getPredicate() == ICmpInst::ICMP_UGT)
+ std::swap(CmpLHS, CmpRHS);
+
+ auto &DL = IC->getModule()->getDataLayout();
+
+ GuardWideningImpl::RangeCheck Check(
+ CmpLHS, cast<ConstantInt>(ConstantInt::getNullValue(CmpRHS->getType())),
+ CmpRHS, IC);
+
+ if (!isKnownNonNegative(Check.getLength(), DL))
+ return false;
+
+ // What we have in \c Check now is a correct interpretation of \p CheckCond.
+ // Try to see if we can move some constant offsets into the \c Offset field.
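+  // For example, a check of the form (i + 4) u< len starts out as base = i+4,
+  // offset = 0, length = len, and the loop below rewrites it to base = i,
+  // offset = 4, length = len.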
+
+ bool Changed;
+ auto &Ctx = CheckCond->getContext();
+
+ do {
+ Value *OpLHS;
+ ConstantInt *OpRHS;
+ Changed = false;
+
+#ifndef NDEBUG
+ auto *BaseInst = dyn_cast<Instruction>(Check.getBase());
+ assert((!BaseInst || DT.isReachableFromEntry(BaseInst->getParent())) &&
+ "Unreachable instruction?");
+#endif
+
+ if (match(Check.getBase(), m_Add(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ } else if (match(Check.getBase(),
+ m_Or(m_Value(OpLHS), m_ConstantInt(OpRHS)))) {
+ KnownBits Known = computeKnownBits(OpLHS, DL);
+ if ((OpRHS->getValue() & Known.Zero) == OpRHS->getValue()) {
+ Check.setBase(OpLHS);
+ APInt NewOffset = Check.getOffsetValue() + OpRHS->getValue();
+ Check.setOffset(ConstantInt::get(Ctx, NewOffset));
+ Changed = true;
+ }
+ }
+ } while (Changed);
+
+ Checks.push_back(Check);
+ return true;
+}
+
+bool GuardWideningImpl::combineRangeChecks(
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &Checks,
+ SmallVectorImpl<GuardWideningImpl::RangeCheck> &RangeChecksOut) const {
+ unsigned OldCount = Checks.size();
+ while (!Checks.empty()) {
+ // Pick all of the range checks with a specific base and length, and try to
+ // merge them.
+ const Value *CurrentBase = Checks.front().getBase();
+ const Value *CurrentLength = Checks.front().getLength();
+
+ SmallVector<GuardWideningImpl::RangeCheck, 3> CurrentChecks;
+
+ auto IsCurrentCheck = [&](GuardWideningImpl::RangeCheck &RC) {
+ return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
+ };
+
+ copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
erase_if(Checks, IsCurrentCheck);
-
- assert(CurrentChecks.size() != 0 && "We know we have at least one!");
-
- if (CurrentChecks.size() < 3) {
+
+ assert(CurrentChecks.size() != 0 && "We know we have at least one!");
+
+ if (CurrentChecks.size() < 3) {
llvm::append_range(RangeChecksOut, CurrentChecks);
- continue;
- }
-
- // CurrentChecks.size() will typically be 3 here, but so far there has been
- // no need to hard-code that fact.
-
- llvm::sort(CurrentChecks, [&](const GuardWideningImpl::RangeCheck &LHS,
- const GuardWideningImpl::RangeCheck &RHS) {
- return LHS.getOffsetValue().slt(RHS.getOffsetValue());
- });
-
- // Note: std::sort should not invalidate the ChecksStart iterator.
-
- const ConstantInt *MinOffset = CurrentChecks.front().getOffset();
- const ConstantInt *MaxOffset = CurrentChecks.back().getOffset();
-
- unsigned BitWidth = MaxOffset->getValue().getBitWidth();
- if ((MaxOffset->getValue() - MinOffset->getValue())
- .ugt(APInt::getSignedMinValue(BitWidth)))
- return false;
-
- APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
- const APInt &HighOffset = MaxOffset->getValue();
- auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
- return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
- };
-
+ continue;
+ }
+
+ // CurrentChecks.size() will typically be 3 here, but so far there has been
+ // no need to hard-code that fact.
+
+ llvm::sort(CurrentChecks, [&](const GuardWideningImpl::RangeCheck &LHS,
+ const GuardWideningImpl::RangeCheck &RHS) {
+ return LHS.getOffsetValue().slt(RHS.getOffsetValue());
+ });
+
+ // Note: std::sort should not invalidate the ChecksStart iterator.
+
+ const ConstantInt *MinOffset = CurrentChecks.front().getOffset();
+ const ConstantInt *MaxOffset = CurrentChecks.back().getOffset();
+
+ unsigned BitWidth = MaxOffset->getValue().getBitWidth();
+ if ((MaxOffset->getValue() - MinOffset->getValue())
+ .ugt(APInt::getSignedMinValue(BitWidth)))
+ return false;
+
+ APInt MaxDiff = MaxOffset->getValue() - MinOffset->getValue();
+ const APInt &HighOffset = MaxOffset->getValue();
+ auto OffsetOK = [&](const GuardWideningImpl::RangeCheck &RC) {
+ return (HighOffset - RC.getOffsetValue()).ult(MaxDiff);
+ };
+
if (MaxDiff.isMinValue() || !all_of(drop_begin(CurrentChecks), OffsetOK))
- return false;
-
- // We have a series of f+1 checks as:
- //
- // I+k_0 u< L ... Chk_0
- // I+k_1 u< L ... Chk_1
- // ...
- // I+k_f u< L ... Chk_f
- //
- // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
- // k_f-k_0 u< INT_MIN+k_f ... Precond_1
- // k_f != k_0 ... Precond_2
- //
- // Claim:
- // Chk_0 AND Chk_f implies all the other checks
- //
- // Informal proof sketch:
- //
- // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
- // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
- // thus I+k_f is the greatest unsigned value in that range.
- //
-    // This combined with Chk_f shows that everything in that range is u< L.
-    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
-    // lie in [I+k_0,I+k_f], thus proving our claim.
- //
- // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
- // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
- // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
- // range by definition, and the latter case is impossible:
- //
- // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
- // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
- //
-    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
-    // would have to be >u INT_MIN.
-
- RangeChecksOut.emplace_back(CurrentChecks.front());
- RangeChecksOut.emplace_back(CurrentChecks.back());
- }
-
- assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
- return RangeChecksOut.size() != OldCount;
-}
-
-#ifndef NDEBUG
-StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
- switch (WS) {
- case WS_IllegalOrNegative:
- return "IllegalOrNegative";
- case WS_Neutral:
- return "Neutral";
- case WS_Positive:
- return "Positive";
- case WS_VeryPositive:
- return "VeryPositive";
- }
-
- llvm_unreachable("Fully covered switch above!");
-}
-#endif
-
-PreservedAnalyses GuardWideningPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run())
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- BasicBlock *RootBB = L.getLoopPredecessor();
- if (!RootBB)
- RootBB = L.getHeader();
- auto BlockFilter = [&](BasicBlock *BB) {
- return BB == RootBB || L.contains(BB);
- };
- if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB),
- BlockFilter).run())
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-namespace {
-struct GuardWideningLegacyPass : public FunctionPass {
- static char ID;
-
- GuardWideningLegacyPass() : FunctionPass(ID) {
- initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
- [](BasicBlock*) { return true; } ).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- }
-};
-
-/// Same as above, but restricted to a single loop at a time. Can be
-/// scheduled with other loop passes w/o breaking out of LPM
-struct LoopGuardWideningLegacyPass : public LoopPass {
- static char ID;
-
- LoopGuardWideningLegacyPass() : LoopPass(ID) {
- initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
- BasicBlock *RootBB = L->getLoopPredecessor();
- if (!RootBB)
- RootBB = L->getHeader();
- auto BlockFilter = [&](BasicBlock *BB) {
- return BB == RootBB || L->contains(BB);
- };
- return GuardWideningImpl(DT, PDT, LI,
- DT.getNode(RootBB), BlockFilter).run();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- getLoopAnalysisUsage(AU);
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-};
-}
-
-char GuardWideningLegacyPass::ID = 0;
-char LoopGuardWideningLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
- false, false)
-
-INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
- "Widen guards (within a single loop, as a loop pass)",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
- "Widen guards (within a single loop, as a loop pass)",
- false, false)
-
-FunctionPass *llvm::createGuardWideningPass() {
- return new GuardWideningLegacyPass();
-}
-
-Pass *llvm::createLoopGuardWideningPass() {
- return new LoopGuardWideningLegacyPass();
-}
+ return false;
+
+ // We have a series of f+1 checks as:
+ //
+ // I+k_0 u< L ... Chk_0
+ // I+k_1 u< L ... Chk_1
+ // ...
+ // I+k_f u< L ... Chk_f
+ //
+ // with forall i in [0,f]: k_f-k_i u< k_f-k_0 ... Precond_0
+ // k_f-k_0 u< INT_MIN+k_f ... Precond_1
+ // k_f != k_0 ... Precond_2
+ //
+ // Claim:
+ // Chk_0 AND Chk_f implies all the other checks
+ //
+ // Informal proof sketch:
+ //
+ // We will show that the integer range [I+k_0,I+k_f] does not unsigned-wrap
+ // (i.e. going from I+k_0 to I+k_f does not cross the -1,0 boundary) and
+ // thus I+k_f is the greatest unsigned value in that range.
+ //
+    // This combined with Chk_f shows that everything in that range is u< L.
+    // Via Precond_0 we know that all of the indices in Chk_0 through Chk_f
+    // lie in [I+k_0,I+k_f], thus proving our claim.
+ //
+ // To see that [I+k_0,I+k_f] is not a wrapping range, note that there are
+ // two possibilities: I+k_0 u< I+k_f or I+k_0 >u I+k_f (they can't be equal
+ // since k_0 != k_f). In the former case, [I+k_0,I+k_f] is not a wrapping
+ // range by definition, and the latter case is impossible:
+ //
+ // 0-----I+k_f---I+k_0----L---INT_MAX,INT_MIN------------------(-1)
+ // xxxxxx xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+ //
+    // For Chk_0 to succeed, k_f-k_0 (the range highlighted with 'x' above)
+    // would have to be >u INT_MIN.
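+    // For example, the three checks {I u< L, I+2 u< L, I+5 u< L} (k_0 = 0,
+    // k_f = 5) pass the conditions checked above, so only the first and the
+    // last check are emitted below.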
+
+ RangeChecksOut.emplace_back(CurrentChecks.front());
+ RangeChecksOut.emplace_back(CurrentChecks.back());
+ }
+
+ assert(RangeChecksOut.size() <= OldCount && "We pessimized!");
+ return RangeChecksOut.size() != OldCount;
+}
+
+#ifndef NDEBUG
+StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) {
+ switch (WS) {
+ case WS_IllegalOrNegative:
+ return "IllegalOrNegative";
+ case WS_Neutral:
+ return "Neutral";
+ case WS_Positive:
+ return "Positive";
+ case WS_VeryPositive:
+ return "VeryPositive";
+ }
+
+ llvm_unreachable("Fully covered switch above!");
+}
+#endif
+
+PreservedAnalyses GuardWideningPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ if (!GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run())
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+PreservedAnalyses GuardWideningPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ BasicBlock *RootBB = L.getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L.getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L.contains(BB);
+ };
+ if (!GuardWideningImpl(AR.DT, nullptr, AR.LI, AR.DT.getNode(RootBB),
+ BlockFilter).run())
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+namespace {
+struct GuardWideningLegacyPass : public FunctionPass {
+ static char ID;
+
+ GuardWideningLegacyPass() : FunctionPass(ID) {
+ initializeGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ return GuardWideningImpl(DT, &PDT, LI, DT.getRootNode(),
+ [](BasicBlock*) { return true; } ).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+};
+
+/// Same as above, but restricted to a single loop at a time. Can be
+/// scheduled with other loop passes w/o breaking out of LPM
+struct LoopGuardWideningLegacyPass : public LoopPass {
+ static char ID;
+
+ LoopGuardWideningLegacyPass() : LoopPass(ID) {
+ initializeLoopGuardWideningLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+ BasicBlock *RootBB = L->getLoopPredecessor();
+ if (!RootBB)
+ RootBB = L->getHeader();
+ auto BlockFilter = [&](BasicBlock *BB) {
+ return BB == RootBB || L->contains(BB);
+ };
+ return GuardWideningImpl(DT, PDT, LI,
+ DT.getNode(RootBB), BlockFilter).run();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ getLoopAnalysisUsage(AU);
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+};
+}
+
+char GuardWideningLegacyPass::ID = 0;
+char LoopGuardWideningLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(GuardWideningLegacyPass, "guard-widening", "Widen guards",
+ false, false)
+
+INITIALIZE_PASS_BEGIN(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopGuardWideningLegacyPass, "loop-guard-widening",
+ "Widen guards (within a single loop, as a loop pass)",
+ false, false)
+
+FunctionPass *llvm::createGuardWideningPass() {
+ return new GuardWideningLegacyPass();
+}
+
+Pass *llvm::createLoopGuardWideningPass() {
+ return new LoopGuardWideningLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
index 36deb00b5a..e2022aba97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/IVUsersPrinter.cpp
@@ -1,21 +1,21 @@
-//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Support/Debug.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "iv-users"
-
-PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
- return PreservedAnalyses::all();
-}
+//===- IVUsersPrinter.cpp - Induction Variable Users Printer ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IVUsersPrinter.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "iv-users"
+
+PreservedAnalyses IVUsersPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ AM.getResult<IVUsersAnalysis>(L, AR).print(OS);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
index bba2f76e77..ae1fff0fa8 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -1,641 +1,641 @@
-//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation analyzes and transforms the induction variables (and
-// computations derived from them) into simpler forms suitable for subsequent
-// analysis and transformation.
-//
-// If the trip count of a loop is computable, this pass also makes the following
-// changes:
-// 1. The exit condition for the loop is canonicalized to compare the
-// induction value against the exit value. This turns loops like:
-// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
-// 2. Any use outside of the loop of an expression derived from the indvar
-// is changed to compute the derived value outside of the loop, eliminating
-// the dependence on the exit value of the induction variable. If the only
-// purpose of the loop is to compute the exit value of some derived
-// expression, this transformation will make the loop dead.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/IndVarSimplify.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include <cassert>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "indvars"
-
-STATISTIC(NumWidened , "Number of indvars widened");
-STATISTIC(NumReplaced , "Number of exit values replaced");
-STATISTIC(NumLFTR , "Number of loop exit tests replaced");
-STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
-STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
-
-// Trip count verification can be enabled by default under NDEBUG if we
-// implement a strong expression equivalence checker in SCEV. Until then, we
-// use the verify-indvars flag, which may assert in some cases.
-static cl::opt<bool> VerifyIndvars(
- "verify-indvars", cl::Hidden,
- cl::desc("Verify the ScalarEvolution result after running indvars. Has no "
- "effect in release builds. (Note: this adds additional SCEV "
- "queries potentially changing the analysis result)"));
-
-static cl::opt<ReplaceExitVal> ReplaceExitValue(
- "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
- cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
- cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
- clEnumValN(OnlyCheapRepl, "cheap",
- "only replace exit value when the cost is cheap"),
- clEnumValN(NoHardUse, "noharduse",
- "only replace exit values when loop def likely dead"),
- clEnumValN(AlwaysRepl, "always",
- "always replace exit value whenever possible")));
-
-static cl::opt<bool> UsePostIncrementRanges(
- "indvars-post-increment-ranges", cl::Hidden,
- cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
- cl::init(true));
-
-static cl::opt<bool>
-DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
- cl::desc("Disable Linear Function Test Replace optimization"));
-
-static cl::opt<bool>
-LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
- cl::desc("Predicate conditions in read only loops"));
-
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+// 1. The exit condition for the loop is canonicalized to compare the
+// induction value against the exit value. This turns loops like:
+// 'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+// 2. Any use outside of the loop of an expression derived from the indvar
+// is changed to compute the derived value outside of the loop, eliminating
+// the dependence on the exit value of the induction variable. If the only
+// purpose of the loop is to compute the exit value of some derived
+// expression, this transformation will make the loop dead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/IndVarSimplify.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumWidened , "Number of indvars widened");
+STATISTIC(NumReplaced , "Number of exit values replaced");
+STATISTIC(NumLFTR , "Number of loop exit tests replaced");
+STATISTIC(NumElimExt , "Number of IV sign/zero extends eliminated");
+STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
+
+// Trip count verification can be enabled by default under NDEBUG if we
+// implement a strong expression equivalence checker in SCEV. Until then, we
+// use the verify-indvars flag, which may assert in some cases.
+static cl::opt<bool> VerifyIndvars(
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars. Has no "
+ "effect in release builds. (Note: this adds additional SCEV "
+ "queries potentially changing the analysis result)"));
+
+static cl::opt<ReplaceExitVal> ReplaceExitValue(
+ "replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
+ cl::desc("Choose the strategy to replace exit value in IndVarSimplify"),
+ cl::values(clEnumValN(NeverRepl, "never", "never replace exit value"),
+ clEnumValN(OnlyCheapRepl, "cheap",
+ "only replace exit value when the cost is cheap"),
+ clEnumValN(NoHardUse, "noharduse",
+ "only replace exit values when loop def likely dead"),
+ clEnumValN(AlwaysRepl, "always",
+ "always replace exit value whenever possible")));
+
+static cl::opt<bool> UsePostIncrementRanges(
+ "indvars-post-increment-ranges", cl::Hidden,
+ cl::desc("Use post increment control-dependent ranges in IndVarSimplify"),
+ cl::init(true));
+
+static cl::opt<bool>
+DisableLFTR("disable-lftr", cl::Hidden, cl::init(false),
+ cl::desc("Disable Linear Function Test Replace optimization"));
+
+static cl::opt<bool>
+LoopPredication("indvars-predicate-loops", cl::Hidden, cl::init(true),
+ cl::desc("Predicate conditions in read only loops"));
+
static cl::opt<bool>
AllowIVWidening("indvars-widen-indvars", cl::Hidden, cl::init(true),
cl::desc("Allow widening of indvars to eliminate s/zext"));
-namespace {
-
-struct RewritePhi;
-
-class IndVarSimplify {
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- const DataLayout &DL;
- TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
-
- SmallVector<WeakTrackingVH, 16> DeadInsts;
+namespace {
+
+struct RewritePhi;
+
+class IndVarSimplify {
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const DataLayout &DL;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
bool WidenIndVars;
-
- bool handleFloatingPointIV(Loop *L, PHINode *PH);
- bool rewriteNonIntegerIVs(Loop *L);
-
- bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
- /// Try to eliminate loop exits based on analyzeable exit counts
- bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
- /// Try to form loop invariant tests for loop exits by changing how many
- /// iterations of the loop run when that is unobservable.
- bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
-
- bool rewriteFirstIterationLoopExitValues(Loop *L);
-
- bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
- const SCEV *ExitCount,
- PHINode *IndVar, SCEVExpander &Rewriter);
-
- bool sinkUnusedInvariants(Loop *L);
-
-public:
- IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- const DataLayout &DL, TargetLibraryInfo *TLI,
+
+ bool handleFloatingPointIV(Loop *L, PHINode *PH);
+ bool rewriteNonIntegerIVs(Loop *L);
+
+ bool simplifyAndExtend(Loop *L, SCEVExpander &Rewriter, LoopInfo *LI);
+ /// Try to eliminate loop exits based on analyzeable exit counts
+ bool optimizeLoopExits(Loop *L, SCEVExpander &Rewriter);
+ /// Try to form loop invariant tests for loop exits by changing how many
+ /// iterations of the loop run when that is unobservable.
+ bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
+
+ bool rewriteFirstIterationLoopExitValues(Loop *L);
+
+ bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *ExitCount,
+ PHINode *IndVar, SCEVExpander &Rewriter);
+
+ bool sinkUnusedInvariants(Loop *L);
+
+public:
+ IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ const DataLayout &DL, TargetLibraryInfo *TLI,
TargetTransformInfo *TTI, MemorySSA *MSSA, bool WidenIndVars)
: LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI),
WidenIndVars(WidenIndVars) {
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
- bool run(Loop *L);
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
-//===----------------------------------------------------------------------===//
-
-/// Convert APF to an integer, if possible.
-static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
- bool isExact = false;
- // See if we can convert this to an int64_t
- uint64_t UIntVal;
- if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
- APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
- !isExact)
- return false;
- IntVal = UIntVal;
- return true;
-}
-
-/// If the loop has a floating-point induction variable then insert a
-/// corresponding integer induction variable if possible.
-/// For example,
-/// for(double i = 0; i < 10000; ++i)
-/// bar(i)
-/// is converted into
-/// for(int i = 0; i < 10000; ++i)
-/// bar((double)i);
-bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
- unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
- unsigned BackEdge = IncomingEdge^1;
-
- // Check incoming value.
- auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
-
- int64_t InitValue;
- if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
- return false;
-
-  // Check the IV increment. Reject this PN if the increment operation is not
-  // an add or the increment value cannot be represented by an integer.
- auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
- if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;
-
-  // If this is not an add of the PHI with a ConstantFP, or if the constant FP
-  // value is not an integer, bail out.
- ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
- int64_t IncValue;
- if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
- !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
- return false;
-
- // Check Incr uses. One user is PN and the other user is an exit condition
- // used by the conditional terminator.
- Value::user_iterator IncrUse = Incr->user_begin();
- Instruction *U1 = cast<Instruction>(*IncrUse++);
- if (IncrUse == Incr->user_end()) return false;
- Instruction *U2 = cast<Instruction>(*IncrUse++);
- if (IncrUse != Incr->user_end()) return false;
-
- // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
- // only used by a branch, we can't transform it.
- FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
- if (!Compare)
- Compare = dyn_cast<FCmpInst>(U2);
- if (!Compare || !Compare->hasOneUse() ||
- !isa<BranchInst>(Compare->user_back()))
- return false;
-
- BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
-
- // We need to verify that the branch actually controls the iteration count
- // of the loop. If not, the new IV can overflow and no one will notice.
- // The branch block must be in the loop and one of the successors must be out
- // of the loop.
- assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
- if (!L->contains(TheBr->getParent()) ||
- (L->contains(TheBr->getSuccessor(0)) &&
- L->contains(TheBr->getSuccessor(1))))
- return false;
-
- // If it isn't a comparison with an integer-as-fp (the exit value), we can't
- // transform it.
- ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
- int64_t ExitValue;
- if (ExitValueVal == nullptr ||
- !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
- return false;
-
- // Find new predicate for integer comparison.
- CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
- switch (Compare->getPredicate()) {
- default: return false; // Unknown comparison.
- case CmpInst::FCMP_OEQ:
- case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
- case CmpInst::FCMP_ONE:
- case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
- case CmpInst::FCMP_OGT:
- case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
- case CmpInst::FCMP_OGE:
- case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
- case CmpInst::FCMP_OLT:
- case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
- case CmpInst::FCMP_OLE:
- case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
- }
-
- // We convert the floating point induction variable to a signed i32 value if
- // we can. This is only safe if the comparison will not overflow in a way
- // that won't be trapped by the integer equivalent operations. Check for this
- // now.
- // TODO: We could use i64 if it is native and the range requires it.
-
- // The start/stride/exit values must all fit in signed i32.
- if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
- return false;
-
- // If not actually striding (add x, 0.0), avoid touching the code.
- if (IncValue == 0)
- return false;
-
- // Positive and negative strides have different safety conditions.
- if (IncValue > 0) {
- // If we have a positive stride, we require the init to be less than the
- // exit value.
- if (InitValue >= ExitValue)
- return false;
-
- uint32_t Range = uint32_t(ExitValue-InitValue);
- // Check for infinite loop, either:
- // while (i <= Exit) or until (i > Exit)
- if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
- if (++Range == 0) return false; // Range overflows.
- }
-
- unsigned Leftover = Range % uint32_t(IncValue);
-
- // If this is an equality comparison, we require that the strided value
- // exactly land on the exit value, otherwise the IV condition will wrap
- // around and do things the fp IV wouldn't.
- if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
- Leftover != 0)
- return false;
-
- // If the stride would wrap around the i32 before exiting, we can't
- // transform the IV.
- if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
- return false;
- } else {
- // If we have a negative stride, we require the init to be greater than the
- // exit value.
- if (InitValue <= ExitValue)
- return false;
-
- uint32_t Range = uint32_t(InitValue-ExitValue);
- // Check for infinite loop, either:
- // while (i >= Exit) or until (i < Exit)
- if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
- if (++Range == 0) return false; // Range overflows.
- }
-
- unsigned Leftover = Range % uint32_t(-IncValue);
-
- // If this is an equality comparison, we require that the strided value
- // exactly land on the exit value, otherwise the IV condition will wrap
- // around and do things the fp IV wouldn't.
- if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
- Leftover != 0)
- return false;
-
- // If the stride would wrap around the i32 before exiting, we can't
- // transform the IV.
- if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
- return false;
- }
-
- IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
-
- // Insert new integer induction variable.
- PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
- NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
- PN->getIncomingBlock(IncomingEdge));
-
- Value *NewAdd =
- BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
- Incr->getName()+".int", Incr);
- NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
-
- ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
- ConstantInt::get(Int32Ty, ExitValue),
- Compare->getName());
-
- // In the following deletions, PN may become dead and may be deleted.
- // Use a WeakTrackingVH to observe whether this happens.
- WeakTrackingVH WeakPH = PN;
-
- // Delete the old floating point exit comparison. The branch starts using the
- // new comparison.
- NewCompare->takeName(Compare);
- Compare->replaceAllUsesWith(NewCompare);
- RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());
-
- // Delete the old floating point increment.
- Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());
-
- // If the FP induction variable still has uses, this is because something else
- // in the loop uses its value. In order to canonicalize the induction
- // variable, we chose to eliminate the IV and rewrite it in terms of an
- // int->fp cast.
- //
- // We give preference to sitofp over uitofp because it is faster on most
- // platforms.
- if (WeakPH) {
- Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
- &*PN->getParent()->getFirstInsertionPt());
- PN->replaceAllUsesWith(Conv);
- RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
- }
- return true;
-}
-
-bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
- // First step. Check to see if there are any floating-point recurrences.
- // If there are, change them into integer recurrences, permitting analysis by
- // the SCEV routines.
- BasicBlock *Header = L->getHeader();
-
- SmallVector<WeakTrackingVH, 8> PHIs;
- for (PHINode &PN : Header->phis())
- PHIs.push_back(&PN);
-
- bool Changed = false;
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
- if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
- Changed |= handleFloatingPointIV(L, PN);
-
- // If the loop previously had floating-point IV, ScalarEvolution
- // may not have been able to compute a trip count. Now that we've done some
- // re-writing, the trip count may be computable.
- if (Changed)
- SE->forgetLoop(L);
- return Changed;
-}
-
-//===---------------------------------------------------------------------===//
-// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
-// they will exit at the first iteration.
-//===---------------------------------------------------------------------===//
-
-/// Check to see if this loop has loop invariant conditions which lead to loop
-/// exits. If so, we know that if the exit path is taken, it is at the first
-/// loop iteration. This lets us predict exit values of PHI nodes that live in
-/// loop header.
-bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
- // Verify the input to the pass is already in LCSSA form.
- assert(L->isLCSSAForm(*DT));
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- bool MadeAnyChanges = false;
- for (auto *ExitBB : ExitBlocks) {
- // If there are no more PHI nodes in this exit block, then no more
- // values defined inside the loop are used on this path.
- for (PHINode &PN : ExitBB->phis()) {
- for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
- IncomingValIdx != E; ++IncomingValIdx) {
- auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
-
- // Can we prove that the exit must run on the first iteration if it
- // runs at all? (i.e. early exits are fine for our purposes, but
- // traces which lead to this exit being taken on the 2nd iteration
- // aren't.) Note that this is about whether the exit branch is
- // executed, not about whether it is taken.
- if (!L->getLoopLatch() ||
- !DT->dominates(IncomingBB, L->getLoopLatch()))
- continue;
-
- // Get condition that leads to the exit path.
- auto *TermInst = IncomingBB->getTerminator();
-
- Value *Cond = nullptr;
- if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
- // Must be a conditional branch, otherwise the block
- // should not be in the loop.
- Cond = BI->getCondition();
- } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
- Cond = SI->getCondition();
- else
- continue;
-
- if (!L->isLoopInvariant(Cond))
- continue;
-
- auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
-
- // Only deal with PHIs in the loop header.
- if (!ExitVal || ExitVal->getParent() != L->getHeader())
- continue;
-
- // If ExitVal is a PHI on the loop header, then we know its
- // value along this exit because the exit can only be taken
- // on the first iteration.
- auto *LoopPreheader = L->getLoopPreheader();
- assert(LoopPreheader && "Invalid loop");
- int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
- if (PreheaderIdx != -1) {
- assert(ExitVal->getParent() == L->getHeader() &&
- "ExitVal must be in loop header");
- MadeAnyChanges = true;
- PN.setIncomingValue(IncomingValIdx,
- ExitVal->getIncomingValue(PreheaderIdx));
- }
- }
- }
- }
- return MadeAnyChanges;
-}
-
-//===----------------------------------------------------------------------===//
-// IV Widening - Extend the width of an IV to cover its widest uses.
-//===----------------------------------------------------------------------===//
-
-/// Update information about the induction variable that is extended by this
-/// sign or zero extend operation. This is used to determine the final width of
-/// the IV before actually widening it.
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+ bool run(Loop *L);
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// rewriteNonIntegerIVs and helpers. Prefer integer IVs.
+//===----------------------------------------------------------------------===//
+
+/// Convert APF to an integer, if possible.
+static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
+ bool isExact = false;
+ // See if we can convert this to an int64_t
+ uint64_t UIntVal;
+ if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+ APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+ !isExact)
+ return false;
+ IntVal = UIntVal;
+ return true;
+}
+
+/// If the loop has a floating-point induction variable then insert a
+/// corresponding integer induction variable if possible.
+/// For example,
+/// for(double i = 0; i < 10000; ++i)
+/// bar(i)
+/// is converted into
+/// for(int i = 0; i < 10000; ++i)
+/// bar((double)i);
+bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
+ unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
+ unsigned BackEdge = IncomingEdge^1;
+
+ // Check incoming value.
+ auto *InitValueVal = dyn_cast<ConstantFP>(PN->getIncomingValue(IncomingEdge));
+
+ int64_t InitValue;
+ if (!InitValueVal || !ConvertToSInt(InitValueVal->getValueAPF(), InitValue))
+ return false;
+
+  // Check the IV increment. Reject this PN if the increment operation is not
+  // an add or the increment value cannot be represented by an integer.
+ auto *Incr = dyn_cast<BinaryOperator>(PN->getIncomingValue(BackEdge));
+ if (Incr == nullptr || Incr->getOpcode() != Instruction::FAdd) return false;
+
+  // If this is not an add of the PHI with a ConstantFP, or if the constant FP
+  // value is not an integer, bail out.
+ ConstantFP *IncValueVal = dyn_cast<ConstantFP>(Incr->getOperand(1));
+ int64_t IncValue;
+ if (IncValueVal == nullptr || Incr->getOperand(0) != PN ||
+ !ConvertToSInt(IncValueVal->getValueAPF(), IncValue))
+ return false;
+
+ // Check Incr uses. One user is PN and the other user is an exit condition
+ // used by the conditional terminator.
+ Value::user_iterator IncrUse = Incr->user_begin();
+ Instruction *U1 = cast<Instruction>(*IncrUse++);
+ if (IncrUse == Incr->user_end()) return false;
+ Instruction *U2 = cast<Instruction>(*IncrUse++);
+ if (IncrUse != Incr->user_end()) return false;
+
+ // Find exit condition, which is an fcmp. If it doesn't exist, or if it isn't
+ // only used by a branch, we can't transform it.
+ FCmpInst *Compare = dyn_cast<FCmpInst>(U1);
+ if (!Compare)
+ Compare = dyn_cast<FCmpInst>(U2);
+ if (!Compare || !Compare->hasOneUse() ||
+ !isa<BranchInst>(Compare->user_back()))
+ return false;
+
+ BranchInst *TheBr = cast<BranchInst>(Compare->user_back());
+
+ // We need to verify that the branch actually controls the iteration count
+ // of the loop. If not, the new IV can overflow and no one will notice.
+ // The branch block must be in the loop and one of the successors must be out
+ // of the loop.
+ assert(TheBr->isConditional() && "Can't use fcmp if not conditional");
+ if (!L->contains(TheBr->getParent()) ||
+ (L->contains(TheBr->getSuccessor(0)) &&
+ L->contains(TheBr->getSuccessor(1))))
+ return false;
+
+ // If it isn't a comparison with an integer-as-fp (the exit value), we can't
+ // transform it.
+ ConstantFP *ExitValueVal = dyn_cast<ConstantFP>(Compare->getOperand(1));
+ int64_t ExitValue;
+ if (ExitValueVal == nullptr ||
+ !ConvertToSInt(ExitValueVal->getValueAPF(), ExitValue))
+ return false;
+
+ // Find new predicate for integer comparison.
+ CmpInst::Predicate NewPred = CmpInst::BAD_ICMP_PREDICATE;
+ switch (Compare->getPredicate()) {
+ default: return false; // Unknown comparison.
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_UEQ: NewPred = CmpInst::ICMP_EQ; break;
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_UNE: NewPred = CmpInst::ICMP_NE; break;
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_UGT: NewPred = CmpInst::ICMP_SGT; break;
+ case CmpInst::FCMP_OGE:
+ case CmpInst::FCMP_UGE: NewPred = CmpInst::ICMP_SGE; break;
+ case CmpInst::FCMP_OLT:
+ case CmpInst::FCMP_ULT: NewPred = CmpInst::ICMP_SLT; break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_ULE: NewPred = CmpInst::ICMP_SLE; break;
+ }
+
+ // We convert the floating point induction variable to a signed i32 value if
+ // we can. This is only safe if the comparison will not overflow in a way
+ // that won't be trapped by the integer equivalent operations. Check for this
+ // now.
+ // TODO: We could use i64 if it is native and the range requires it.
+
+ // The start/stride/exit values must all fit in signed i32.
+ if (!isInt<32>(InitValue) || !isInt<32>(IncValue) || !isInt<32>(ExitValue))
+ return false;
+
+ // If not actually striding (add x, 0.0), avoid touching the code.
+ if (IncValue == 0)
+ return false;
+
+ // Positive and negative strides have different safety conditions.
+ if (IncValue > 0) {
+ // If we have a positive stride, we require the init to be less than the
+ // exit value.
+ if (InitValue >= ExitValue)
+ return false;
+
+ uint32_t Range = uint32_t(ExitValue-InitValue);
+ // Check for infinite loop, either:
+ // while (i <= Exit) or until (i > Exit)
+ if (NewPred == CmpInst::ICMP_SLE || NewPred == CmpInst::ICMP_SGT) {
+ if (++Range == 0) return false; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return false;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) < ExitValue)
+ return false;
+ } else {
+ // If we have a negative stride, we require the init to be greater than the
+ // exit value.
+ if (InitValue <= ExitValue)
+ return false;
+
+ uint32_t Range = uint32_t(InitValue-ExitValue);
+ // Check for infinite loop, either:
+ // while (i >= Exit) or until (i < Exit)
+ if (NewPred == CmpInst::ICMP_SGE || NewPred == CmpInst::ICMP_SLT) {
+ if (++Range == 0) return false; // Range overflows.
+ }
+
+ unsigned Leftover = Range % uint32_t(-IncValue);
+
+ // If this is an equality comparison, we require that the strided value
+ // exactly land on the exit value, otherwise the IV condition will wrap
+ // around and do things the fp IV wouldn't.
+ if ((NewPred == CmpInst::ICMP_EQ || NewPred == CmpInst::ICMP_NE) &&
+ Leftover != 0)
+ return false;
+
+ // If the stride would wrap around the i32 before exiting, we can't
+ // transform the IV.
+ if (Leftover != 0 && int32_t(ExitValue+IncValue) > ExitValue)
+ return false;
+ }
+
+ IntegerType *Int32Ty = Type::getInt32Ty(PN->getContext());
+
+ // Insert new integer induction variable.
+ PHINode *NewPHI = PHINode::Create(Int32Ty, 2, PN->getName()+".int", PN);
+ NewPHI->addIncoming(ConstantInt::get(Int32Ty, InitValue),
+ PN->getIncomingBlock(IncomingEdge));
+
+ Value *NewAdd =
+ BinaryOperator::CreateAdd(NewPHI, ConstantInt::get(Int32Ty, IncValue),
+ Incr->getName()+".int", Incr);
+ NewPHI->addIncoming(NewAdd, PN->getIncomingBlock(BackEdge));
+
+ ICmpInst *NewCompare = new ICmpInst(TheBr, NewPred, NewAdd,
+ ConstantInt::get(Int32Ty, ExitValue),
+ Compare->getName());
+
+ // In the following deletions, PN may become dead and may be deleted.
+ // Use a WeakTrackingVH to observe whether this happens.
+ WeakTrackingVH WeakPH = PN;
+
+ // Delete the old floating point exit comparison. The branch starts using the
+ // new comparison.
+ NewCompare->takeName(Compare);
+ Compare->replaceAllUsesWith(NewCompare);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());
+
+ // Delete the old floating point increment.
+ Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());
+
+ // If the FP induction variable still has uses, this is because something else
+ // in the loop uses its value. In order to canonicalize the induction
+ // variable, we chose to eliminate the IV and rewrite it in terms of an
+ // int->fp cast.
+ //
+ // We give preference to sitofp over uitofp because it is faster on most
+ // platforms.
+ if (WeakPH) {
+ Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
+ &*PN->getParent()->getFirstInsertionPt());
+ PN->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
+ }
+ return true;
+}
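+
+// Illustrative sketch only (hypothetical helper, plain C++, kept out of the
+// build): the positive-stride legality check above for an EQ/NE exit test.
+// The strided integer IV must land exactly on the exit value, otherwise it
+// would step past it and wrap where the FP IV would have exited.
+#if 0
+#include <cstdint>
+static bool positiveStrideExitsExactly(int64_t Init, int64_t Inc,
+                                       int64_t Exit) {
+  if (Inc <= 0 || Init >= Exit)
+    return false;                        // not the positive-stride case
+  uint32_t Range = uint32_t(Exit - Init);
+  return Range % uint32_t(Inc) == 0;     // must hit Exit exactly
+}
+// e.g. (0, 3, 9)  -> true  (0, 3, 6, 9 reaches the exit value)
+//      (0, 3, 10) -> false (..., 9, 12 steps past 10 and would wrap)
+#endif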
+
+bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
+ // First step. Check to see if there are any floating-point recurrences.
+ // If there are, change them into integer recurrences, permitting analysis by
+ // the SCEV routines.
+ BasicBlock *Header = L->getHeader();
+
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (PHINode &PN : Header->phis())
+ PHIs.push_back(&PN);
+
+ bool Changed = false;
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(&*PHIs[i]))
+ Changed |= handleFloatingPointIV(L, PN);
+
+  // If the loop previously had a floating-point IV, ScalarEvolution
+  // may not have been able to compute a trip count. Now that we've done some
+  // rewriting, the trip count may be computable.
+ if (Changed)
+ SE->forgetLoop(L);
+ return Changed;
+}
+
+//===---------------------------------------------------------------------===//
+// rewriteFirstIterationLoopExitValues: Rewrite loop exit values if we know
+// they will exit at the first iteration.
+//===---------------------------------------------------------------------===//
+
+/// Check to see if this loop has loop invariant conditions which lead to loop
+/// exits. If so, we know that if the exit path is taken, it is at the first
+/// loop iteration. This lets us predict exit values of PHI nodes that live in
+/// loop header.
+bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
+ // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm(*DT));
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ bool MadeAnyChanges = false;
+ for (auto *ExitBB : ExitBlocks) {
+ // If there are no more PHI nodes in this exit block, then no more
+ // values defined inside the loop are used on this path.
+ for (PHINode &PN : ExitBB->phis()) {
+ for (unsigned IncomingValIdx = 0, E = PN.getNumIncomingValues();
+ IncomingValIdx != E; ++IncomingValIdx) {
+ auto *IncomingBB = PN.getIncomingBlock(IncomingValIdx);
+
+ // Can we prove that the exit must run on the first iteration if it
+ // runs at all? (i.e. early exits are fine for our purposes, but
+ // traces which lead to this exit being taken on the 2nd iteration
+ // aren't.) Note that this is about whether the exit branch is
+ // executed, not about whether it is taken.
+ if (!L->getLoopLatch() ||
+ !DT->dominates(IncomingBB, L->getLoopLatch()))
+ continue;
+
+ // Get condition that leads to the exit path.
+ auto *TermInst = IncomingBB->getTerminator();
+
+ Value *Cond = nullptr;
+ if (auto *BI = dyn_cast<BranchInst>(TermInst)) {
+ // Must be a conditional branch, otherwise the block
+ // should not be in the loop.
+ Cond = BI->getCondition();
+ } else if (auto *SI = dyn_cast<SwitchInst>(TermInst))
+ Cond = SI->getCondition();
+ else
+ continue;
+
+ if (!L->isLoopInvariant(Cond))
+ continue;
+
+ auto *ExitVal = dyn_cast<PHINode>(PN.getIncomingValue(IncomingValIdx));
+
+ // Only deal with PHIs in the loop header.
+ if (!ExitVal || ExitVal->getParent() != L->getHeader())
+ continue;
+
+ // If ExitVal is a PHI on the loop header, then we know its
+ // value along this exit because the exit can only be taken
+ // on the first iteration.
+ auto *LoopPreheader = L->getLoopPreheader();
+ assert(LoopPreheader && "Invalid loop");
+ int PreheaderIdx = ExitVal->getBasicBlockIndex(LoopPreheader);
+ if (PreheaderIdx != -1) {
+ assert(ExitVal->getParent() == L->getHeader() &&
+ "ExitVal must be in loop header");
+ MadeAnyChanges = true;
+ PN.setIncomingValue(IncomingValIdx,
+ ExitVal->getIncomingValue(PreheaderIdx));
+ }
+ }
+ }
+ }
+ return MadeAnyChanges;
+}
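+
+// Illustrative sketch only (hypothetical function, kept out of the build): a
+// source-level view of the rewrite above.  Because the exit condition is loop
+// invariant, the early exit can only be taken on the first iteration, so the
+// header PHI ('x') still holds its preheader value there and the exit use can
+// be rewritten to that value.
+#if 0
+static int firstIterationExitValueSketch(bool InvariantCond) {
+  int x = 0;                      // header PHI: 0 from the preheader
+  for (int i = 0; i < 100; ++i) {
+    if (InvariantCond)            // loop-invariant exit condition
+      return x;                   // may be rewritten to 'return 0;'
+    x = x + i;                    // backedge value of the PHI
+  }
+  return -1;
+}
+#endif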
+
+//===----------------------------------------------------------------------===//
+// IV Widening - Extend the width of an IV to cover its widest uses.
+//===----------------------------------------------------------------------===//
+
+/// Update information about the induction variable that is extended by this
+/// sign or zero extend operation. This is used to determine the final width of
+/// the IV before actually widening it.
static void visitIVCast(CastInst *Cast, WideIVInfo &WI,
ScalarEvolution *SE,
- const TargetTransformInfo *TTI) {
- bool IsSigned = Cast->getOpcode() == Instruction::SExt;
- if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
- return;
-
- Type *Ty = Cast->getType();
- uint64_t Width = SE->getTypeSizeInBits(Ty);
- if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
- return;
-
- // Check that `Cast` actually extends the induction variable (we rely on this
- // later). This takes care of cases where `Cast` is extending a truncation of
- // the narrow induction variable, and thus can end up being narrower than the
- // "narrow" induction variable.
- uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
- if (NarrowIVWidth >= Width)
- return;
-
- // Cast is either an sext or zext up to this point.
-  // We should not widen an indvar if arithmetic on the wider indvar is more
-  // expensive than on the narrower indvar. We check only the cost of ADD
- // because at least an ADD is required to increment the induction variable. We
- // could compute more comprehensively the cost of all instructions on the
- // induction variable when necessary.
- if (TTI &&
- TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
- TTI->getArithmeticInstrCost(Instruction::Add,
- Cast->getOperand(0)->getType())) {
- return;
- }
-
- if (!WI.WidestNativeType) {
- WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
- WI.IsSigned = IsSigned;
- return;
- }
-
- // We extend the IV to satisfy the sign of its first user, arbitrarily.
- if (WI.IsSigned != IsSigned)
- return;
-
- if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
- WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
-}
-
-//===----------------------------------------------------------------------===//
-// Live IV Reduction - Minimize IVs live across the loop.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Simplification of IV users based on SCEV evaluation.
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class IndVarSimplifyVisitor : public IVVisitor {
- ScalarEvolution *SE;
- const TargetTransformInfo *TTI;
- PHINode *IVPhi;
-
-public:
- WideIVInfo WI;
-
- IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
- const TargetTransformInfo *TTI,
- const DominatorTree *DTree)
- : SE(SCEV), TTI(TTI), IVPhi(IV) {
- DT = DTree;
- WI.NarrowIV = IVPhi;
- }
-
- // Implement the interface used by simplifyUsersOfIV.
- void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
-};
-
-} // end anonymous namespace
-
-/// Iteratively perform simplification on a worklist of IV users. Each
-/// successive simplification may push more users which may themselves be
-/// candidates for simplification.
-///
-/// Sign/Zero extend elimination is interleaved with IV simplification.
-bool IndVarSimplify::simplifyAndExtend(Loop *L,
- SCEVExpander &Rewriter,
- LoopInfo *LI) {
- SmallVector<WideIVInfo, 8> WideIVs;
-
- auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- bool HasGuards = GuardDecl && !GuardDecl->use_empty();
-
- SmallVector<PHINode*, 8> LoopPhis;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- LoopPhis.push_back(cast<PHINode>(I));
- }
- // Each round of simplification iterates through the SimplifyIVUsers worklist
- // for all current phis, then determines whether any IVs can be
- // widened. Widening adds new phis to LoopPhis, inducing another round of
- // simplification on the wide IVs.
- bool Changed = false;
- while (!LoopPhis.empty()) {
- // Evaluate as many IV expressions as possible before widening any IVs. This
- // forces SCEV to set no-wrap flags before evaluating sign/zero
- // extension. The first time SCEV attempts to normalize sign/zero extension,
- // the result becomes final. So for the most predictable results, we delay
-    // evaluation of sign/zero extensions until needed, and avoid running
- // other SCEV based analysis prior to simplifyAndExtend.
- do {
- PHINode *CurrIV = LoopPhis.pop_back_val();
-
- // Information about sign/zero extensions of CurrIV.
- IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
-
- Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
- &Visitor);
-
- if (Visitor.WI.WidestNativeType) {
- WideIVs.push_back(Visitor.WI);
- }
- } while(!LoopPhis.empty());
-
+ const TargetTransformInfo *TTI) {
+ bool IsSigned = Cast->getOpcode() == Instruction::SExt;
+ if (!IsSigned && Cast->getOpcode() != Instruction::ZExt)
+ return;
+
+ Type *Ty = Cast->getType();
+ uint64_t Width = SE->getTypeSizeInBits(Ty);
+ if (!Cast->getModule()->getDataLayout().isLegalInteger(Width))
+ return;
+
+ // Check that `Cast` actually extends the induction variable (we rely on this
+ // later). This takes care of cases where `Cast` is extending a truncation of
+ // the narrow induction variable, and thus can end up being narrower than the
+ // "narrow" induction variable.
+ uint64_t NarrowIVWidth = SE->getTypeSizeInBits(WI.NarrowIV->getType());
+ if (NarrowIVWidth >= Width)
+ return;
+
+ // Cast is either an sext or zext up to this point.
+  // We should not widen an indvar if arithmetic on the wider indvar is more
+  // expensive than on the narrower indvar. We check only the cost of ADD
+ // because at least an ADD is required to increment the induction variable. We
+ // could compute more comprehensively the cost of all instructions on the
+ // induction variable when necessary.
+ if (TTI &&
+ TTI->getArithmeticInstrCost(Instruction::Add, Ty) >
+ TTI->getArithmeticInstrCost(Instruction::Add,
+ Cast->getOperand(0)->getType())) {
+ return;
+ }
+
+ if (!WI.WidestNativeType) {
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+ WI.IsSigned = IsSigned;
+ return;
+ }
+
+ // We extend the IV to satisfy the sign of its first user, arbitrarily.
+ if (WI.IsSigned != IsSigned)
+ return;
+
+ if (Width > SE->getTypeSizeInBits(WI.WidestNativeType))
+ WI.WidestNativeType = SE->getEffectiveSCEVType(Ty);
+}
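+
+// Illustrative sketch only (hypothetical types, plain C++, kept out of the
+// build; the TTI cost comparison is omitted): the widening policy above.  The
+// first extension fixes the signedness, and only wider legal extensions of
+// the same signedness grow the recorded widest type.
+#if 0
+#include <cstdint>
+struct WideIVInfoSketch { uint64_t WidestBits = 0; bool IsSigned = false; };
+static void noteExtension(WideIVInfoSketch &WI, uint64_t NarrowBits,
+                          uint64_t ExtBits, bool ExtIsSigned,
+                          bool ExtTypeIsLegal) {
+  if (!ExtTypeIsLegal || ExtBits <= NarrowBits)
+    return;                       // not a (legal) widening of the narrow IV
+  if (WI.WidestBits == 0) {       // first extension chooses the signedness
+    WI.WidestBits = ExtBits;
+    WI.IsSigned = ExtIsSigned;
+    return;
+  }
+  if (WI.IsSigned != ExtIsSigned) // mismatched sign: keep the first choice
+    return;
+  if (ExtBits > WI.WidestBits)
+    WI.WidestBits = ExtBits;
+}
+#endif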
+
+//===----------------------------------------------------------------------===//
+// Live IV Reduction - Minimize IVs live across the loop.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Simplification of IV users based on SCEV evaluation.
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class IndVarSimplifyVisitor : public IVVisitor {
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ PHINode *IVPhi;
+
+public:
+ WideIVInfo WI;
+
+ IndVarSimplifyVisitor(PHINode *IV, ScalarEvolution *SCEV,
+ const TargetTransformInfo *TTI,
+ const DominatorTree *DTree)
+ : SE(SCEV), TTI(TTI), IVPhi(IV) {
+ DT = DTree;
+ WI.NarrowIV = IVPhi;
+ }
+
+ // Implement the interface used by simplifyUsersOfIV.
+ void visitCast(CastInst *Cast) override { visitIVCast(Cast, WI, SE, TTI); }
+};
+
+} // end anonymous namespace
+
+/// Iteratively perform simplification on a worklist of IV users. Each
+/// successive simplification may push more users which may themselves be
+/// candidates for simplification.
+///
+/// Sign/Zero extend elimination is interleaved with IV simplification.
+bool IndVarSimplify::simplifyAndExtend(Loop *L,
+ SCEVExpander &Rewriter,
+ LoopInfo *LI) {
+ SmallVector<WideIVInfo, 8> WideIVs;
+
+ auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasGuards = GuardDecl && !GuardDecl->use_empty();
+
+ SmallVector<PHINode*, 8> LoopPhis;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ LoopPhis.push_back(cast<PHINode>(I));
+ }
+ // Each round of simplification iterates through the SimplifyIVUsers worklist
+ // for all current phis, then determines whether any IVs can be
+ // widened. Widening adds new phis to LoopPhis, inducing another round of
+ // simplification on the wide IVs.
+ bool Changed = false;
+ while (!LoopPhis.empty()) {
+ // Evaluate as many IV expressions as possible before widening any IVs. This
+ // forces SCEV to set no-wrap flags before evaluating sign/zero
+ // extension. The first time SCEV attempts to normalize sign/zero extension,
+ // the result becomes final. So for the most predictable results, we delay
+    // evaluation of sign/zero extensions until needed, and avoid running
+ // other SCEV based analysis prior to simplifyAndExtend.
+ do {
+ PHINode *CurrIV = LoopPhis.pop_back_val();
+
+ // Information about sign/zero extensions of CurrIV.
+ IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
+
+ Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
+ &Visitor);
+
+ if (Visitor.WI.WidestNativeType) {
+ WideIVs.push_back(Visitor.WI);
+ }
+ } while(!LoopPhis.empty());
+
// Continue if we disallowed widening.
if (!WidenIndVars)
continue;
- for (; !WideIVs.empty(); WideIVs.pop_back()) {
+ for (; !WideIVs.empty(); WideIVs.pop_back()) {
unsigned ElimExt;
unsigned Widened;
if (PHINode *WidePhi = createWideIV(WideIVs.back(), LI, SE, Rewriter,
@@ -643,653 +643,653 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
HasGuards, UsePostIncrementRanges)) {
NumElimExt += ElimExt;
NumWidened += Widened;
- Changed = true;
- LoopPhis.push_back(WidePhi);
- }
- }
- }
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
-//===----------------------------------------------------------------------===//
-
-/// Given a Value which is hoped to be part of an add recurrence in the given
-/// loop, return the associated Phi node if so. Otherwise, return null. Note
-/// that this is less general than SCEV's AddRec checking.
-static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
- Instruction *IncI = dyn_cast<Instruction>(IncV);
- if (!IncI)
- return nullptr;
-
- switch (IncI->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- break;
- case Instruction::GetElementPtr:
- // An IV counter must preserve its type.
- if (IncI->getNumOperands() == 2)
- break;
- LLVM_FALLTHROUGH;
- default:
- return nullptr;
- }
-
- PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
- if (Phi && Phi->getParent() == L->getHeader()) {
- if (L->isLoopInvariant(IncI->getOperand(1)))
- return Phi;
- return nullptr;
- }
- if (IncI->getOpcode() == Instruction::GetElementPtr)
- return nullptr;
-
- // Allow add/sub to be commuted.
- Phi = dyn_cast<PHINode>(IncI->getOperand(1));
- if (Phi && Phi->getParent() == L->getHeader()) {
- if (L->isLoopInvariant(IncI->getOperand(0)))
- return Phi;
- }
- return nullptr;
-}
-
-/// Whether the current loop exit test is based on this value. Currently this
-/// is limited to a direct use in the loop condition.
-static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
- // TODO: Allow non-icmp loop test.
- if (!ICmp)
- return false;
-
- // TODO: Allow indirect use.
- return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
-}
-
-/// linearFunctionTestReplace policy. Return true unless we can show that the
-/// current exit test is already sufficiently canonical.
-static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
- assert(L->getLoopLatch() && "Must be in simplified form");
-
- // Avoid converting a constant or loop invariant test back to a runtime
- // test. This is critical for when SCEV's cached ExitCount is less precise
- // than the current IR (such as after we've proven a particular exit is
- // actually dead and thus the BE count never reaches our ExitCount.)
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- if (L->isLoopInvariant(BI->getCondition()))
- return false;
-
- // Do LFTR to simplify the exit condition to an ICMP.
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cond)
- return true;
-
- // Do LFTR to simplify the exit ICMP to EQ/NE
- ICmpInst::Predicate Pred = Cond->getPredicate();
- if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
- return true;
-
- // Look for a loop invariant RHS
- Value *LHS = Cond->getOperand(0);
- Value *RHS = Cond->getOperand(1);
- if (!L->isLoopInvariant(RHS)) {
- if (!L->isLoopInvariant(LHS))
- return true;
- std::swap(LHS, RHS);
- }
- // Look for a simple IV counter LHS
- PHINode *Phi = dyn_cast<PHINode>(LHS);
- if (!Phi)
- Phi = getLoopPhiForCounter(LHS, L);
-
- if (!Phi)
- return true;
-
- // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
- int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
- if (Idx < 0)
- return true;
-
- // Do LFTR if the exit condition's IV is *not* a simple counter.
- Value *IncV = Phi->getIncomingValue(Idx);
- return Phi != getLoopPhiForCounter(IncV, L);
-}
-
-/// Return true if undefined behavior would provably be executed on the path to
-/// OnPathTo if Root produced a poison result. Note that this doesn't say
-/// anything about whether OnPathTo is actually executed or whether Root is
-/// actually poison. This can be used to assess whether a new use of Root can
-/// be added at a location which is control equivalent with OnPathTo (such as
-/// immediately before it) without introducing UB which didn't previously
-/// exist. Note that a false result conveys no information.
-static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
- Instruction *OnPathTo,
- DominatorTree *DT) {
- // Basic approach is to assume Root is poison, propagate poison forward
- // through all users we can easily track, and then check whether any of those
-  // users are provably UB and must execute before our exiting block might
- // exit.
-
- // The set of all recursive users we've visited (which are assumed to all be
- // poison because of said visit)
- SmallSet<const Value *, 16> KnownPoison;
- SmallVector<const Instruction*, 16> Worklist;
- Worklist.push_back(Root);
- while (!Worklist.empty()) {
- const Instruction *I = Worklist.pop_back_val();
-
-    // If we know this must trigger UB on a path leading to our target.
- if (mustTriggerUB(I, KnownPoison) && DT->dominates(I, OnPathTo))
- return true;
-
- // If we can't analyze propagation through this instruction, just skip it
- // and transitive users. Safe as false is a conservative result.
+ Changed = true;
+ LoopPhis.push_back(WidePhi);
+ }
+ }
+ }
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// linearFunctionTestReplace and its kin. Rewrite the loop exit condition.
+//===----------------------------------------------------------------------===//
+
+/// Given a Value which is hoped to be part of an add recurrence in the given
+/// loop, return the associated Phi node if so. Otherwise, return null. Note
+/// that this is less general than SCEV's AddRec checking.
+static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
+ Instruction *IncI = dyn_cast<Instruction>(IncV);
+ if (!IncI)
+ return nullptr;
+
+ switch (IncI->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Sub:
+ break;
+ case Instruction::GetElementPtr:
+ // An IV counter must preserve its type.
+ if (IncI->getNumOperands() == 2)
+ break;
+ LLVM_FALLTHROUGH;
+ default:
+ return nullptr;
+ }
+
+ PHINode *Phi = dyn_cast<PHINode>(IncI->getOperand(0));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (L->isLoopInvariant(IncI->getOperand(1)))
+ return Phi;
+ return nullptr;
+ }
+ if (IncI->getOpcode() == Instruction::GetElementPtr)
+ return nullptr;
+
+ // Allow add/sub to be commuted.
+ Phi = dyn_cast<PHINode>(IncI->getOperand(1));
+ if (Phi && Phi->getParent() == L->getHeader()) {
+ if (L->isLoopInvariant(IncI->getOperand(0)))
+ return Phi;
+ }
+ return nullptr;
+}
+
+/// Whether the current loop exit test is based on this value. Currently this
+/// is limited to a direct use in the loop condition.
+static bool isLoopExitTestBasedOn(Value *V, BasicBlock *ExitingBB) {
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ ICmpInst *ICmp = dyn_cast<ICmpInst>(BI->getCondition());
+ // TODO: Allow non-icmp loop test.
+ if (!ICmp)
+ return false;
+
+ // TODO: Allow indirect use.
+ return ICmp->getOperand(0) == V || ICmp->getOperand(1) == V;
+}
+
+/// linearFunctionTestReplace policy. Return true unless we can show that the
+/// current exit test is already sufficiently canonical.
+static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
+ assert(L->getLoopLatch() && "Must be in simplified form");
+
+ // Avoid converting a constant or loop invariant test back to a runtime
+ // test. This is critical for when SCEV's cached ExitCount is less precise
+ // than the current IR (such as after we've proven a particular exit is
+ // actually dead and thus the BE count never reaches our ExitCount.)
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ if (L->isLoopInvariant(BI->getCondition()))
+ return false;
+
+ // Do LFTR to simplify the exit condition to an ICMP.
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return true;
+
+ // Do LFTR to simplify the exit ICMP to EQ/NE
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if (Pred != ICmpInst::ICMP_NE && Pred != ICmpInst::ICMP_EQ)
+ return true;
+
+ // Look for a loop invariant RHS
+ Value *LHS = Cond->getOperand(0);
+ Value *RHS = Cond->getOperand(1);
+ if (!L->isLoopInvariant(RHS)) {
+ if (!L->isLoopInvariant(LHS))
+ return true;
+ std::swap(LHS, RHS);
+ }
+ // Look for a simple IV counter LHS
+ PHINode *Phi = dyn_cast<PHINode>(LHS);
+ if (!Phi)
+ Phi = getLoopPhiForCounter(LHS, L);
+
+ if (!Phi)
+ return true;
+
+ // Do LFTR if PHI node is defined in the loop, but is *not* a counter.
+ int Idx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ if (Idx < 0)
+ return true;
+
+ // Do LFTR if the exit condition's IV is *not* a simple counter.
+ Value *IncV = Phi->getIncomingValue(Idx);
+ return Phi != getLoopPhiForCounter(IncV, L);
+}
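+
+// Illustrative sketch only (hypothetical pre-computed facts, kept out of the
+// build): the LFTR policy above reduced to a pure predicate.
+#if 0
+static bool needsLFTRSketch(bool CondIsLoopInvariant, bool CondIsICmp,
+                            bool PredIsEqNe, bool HasInvariantOperand,
+                            bool OtherOpIsSimpleCounter) {
+  if (CondIsLoopInvariant)
+    return false;                  // don't reintroduce a runtime test
+  if (!CondIsICmp)
+    return true;                   // canonicalize the exit test to an icmp
+  if (!PredIsEqNe)
+    return true;                   // canonicalize the predicate to eq/ne
+  if (!HasInvariantOperand)
+    return true;                   // need a loop-invariant limit
+  return !OtherOpIsSimpleCounter;  // need a simple counter on the other side
+}
+#endif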
+
+/// Return true if undefined behavior would provably be executed on the path to
+/// OnPathTo if Root produced a poison result. Note that this doesn't say
+/// anything about whether OnPathTo is actually executed or whether Root is
+/// actually poison. This can be used to assess whether a new use of Root can
+/// be added at a location which is control equivalent with OnPathTo (such as
+/// immediately before it) without introducing UB which didn't previously
+/// exist. Note that a false result conveys no information.
+static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
+ Instruction *OnPathTo,
+ DominatorTree *DT) {
+ // Basic approach is to assume Root is poison, propagate poison forward
+ // through all users we can easily track, and then check whether any of those
+  // users are provably UB and must execute before our exiting block might
+ // exit.
+
+ // The set of all recursive users we've visited (which are assumed to all be
+ // poison because of said visit)
+ SmallSet<const Value *, 16> KnownPoison;
+ SmallVector<const Instruction*, 16> Worklist;
+ Worklist.push_back(Root);
+ while (!Worklist.empty()) {
+ const Instruction *I = Worklist.pop_back_val();
+
+    // If we know this must trigger UB on a path leading to our target.
+ if (mustTriggerUB(I, KnownPoison) && DT->dominates(I, OnPathTo))
+ return true;
+
+ // If we can't analyze propagation through this instruction, just skip it
+ // and transitive users. Safe as false is a conservative result.
if (!propagatesPoison(cast<Operator>(I)) && I != Root)
- continue;
-
- if (KnownPoison.insert(I).second)
- for (const User *User : I->users())
- Worklist.push_back(cast<Instruction>(User));
- }
-
- // Might be non-UB, or might have a path we couldn't prove must execute on
- // way to exiting bb.
- return false;
-}
-
-/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
-/// down to checking that all operands are constant and listing instructions
-/// that may hide undef.
-static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
- unsigned Depth) {
- if (isa<Constant>(V))
- return !isa<UndefValue>(V);
-
- if (Depth >= 6)
- return false;
-
- // Conservatively handle non-constant non-instructions. For example, Arguments
- // may be undef.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- // Load and return values may be undef.
- if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
- return false;
-
- // Optimistically handle other instructions.
- for (Value *Op : I->operands()) {
- if (!Visited.insert(Op).second)
- continue;
- if (!hasConcreteDefImpl(Op, Visited, Depth+1))
- return false;
- }
- return true;
-}
-
-/// Return true if the given value is concrete. We must prove that undef can
-/// never reach it.
-///
-/// TODO: If we decide that this is a good approach to checking for undef, we
-/// may factor it into a common location.
-static bool hasConcreteDef(Value *V) {
- SmallPtrSet<Value*, 8> Visited;
- Visited.insert(V);
- return hasConcreteDefImpl(V, Visited, 0);
-}
-
-/// Return true if this IV has any uses other than the (soon to be rewritten)
-/// loop exit test.
-static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
- int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
- Value *IncV = Phi->getIncomingValue(LatchIdx);
-
- for (User *U : Phi->users())
- if (U != Cond && U != IncV) return false;
-
- for (User *U : IncV->users())
- if (U != Cond && U != Phi) return false;
- return true;
-}
-
-/// Return true if the given phi is a "counter" in L. A counter is an
-/// add recurrence (of integer or pointer type) with an arbitrary start, and a
-/// step of 1. Note that L must have exactly one latch.
-static bool isLoopCounter(PHINode* Phi, Loop *L,
- ScalarEvolution *SE) {
- assert(Phi->getParent() == L->getHeader());
- assert(L->getLoopLatch());
-
- if (!SE->isSCEVable(Phi->getType()))
- return false;
-
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
- if (!AR || AR->getLoop() != L || !AR->isAffine())
- return false;
-
- const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
- if (!Step || !Step->isOne())
- return false;
-
- int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
- Value *IncV = Phi->getIncomingValue(LatchIdx);
- return (getLoopPhiForCounter(IncV, L) == Phi);
-}
-
-/// Search the loop header for a loop counter (an add rec w/ step of one)
-/// suitable for use by LFTR. If multiple counters are available, select the
-/// "best" one based on profitability heuristics.
-///
-/// BECount may be an i8* pointer type. The pointer difference is already
-/// valid count without scaling the address stride, so it remains a pointer
-/// expression as far as SCEV is concerned.
-static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
- const SCEV *BECount,
- ScalarEvolution *SE, DominatorTree *DT) {
- uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
-
- Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();
-
- // Loop over all of the PHI nodes, looking for a simple counter.
- PHINode *BestPhi = nullptr;
- const SCEV *BestInit = nullptr;
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "Must be in simplified form");
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *Phi = cast<PHINode>(I);
- if (!isLoopCounter(Phi, L, SE))
- continue;
-
- // Avoid comparing an integer IV against a pointer Limit.
- if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
- continue;
-
- const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
-
- // AR may be a pointer type, while BECount is an integer type.
- // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
- // AR may not be a narrower type, or we may never exit.
- uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
- if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
- continue;
-
- // Avoid reusing a potentially undef value to compute other values that may
- // have originally had a concrete definition.
- if (!hasConcreteDef(Phi)) {
- // We explicitly allow unknown phis as long as they are already used by
- // the loop exit test. This is legal since performing LFTR could not
- // increase the number of undef users.
- Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
- if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
- !isLoopExitTestBasedOn(IncPhi, ExitingBB))
- continue;
- }
-
- // Avoid introducing undefined behavior due to poison which didn't exist in
- // the original program. (Annoyingly, the rules for poison and undef
- // propagation are distinct, so this does NOT cover the undef case above.)
- // We have to ensure that we don't introduce UB by introducing a use on an
- // iteration where said IV produces poison. Our strategy here differs for
- // pointers and integer IVs. For integers, we strip and reinfer as needed,
- // see code in linearFunctionTestReplace. For pointers, we restrict
- // transforms as there is no good way to reinfer inbounds once lost.
- if (!Phi->getType()->isIntegerTy() &&
- !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
- continue;
-
- const SCEV *Init = AR->getStart();
-
- if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
- // Don't force a live loop counter if another IV can be used.
- if (AlmostDeadIV(Phi, LatchBlock, Cond))
- continue;
-
- // Prefer to count-from-zero. This is a more "canonical" counter form. It
- // also prefers integer to pointer IVs.
- if (BestInit->isZero() != Init->isZero()) {
- if (BestInit->isZero())
- continue;
- }
- // If two IVs both count from zero or both count from nonzero then the
- // narrower is likely a dead phi that has been widened. Use the wider phi
- // to allow the other to be eliminated.
- else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
- continue;
- }
- BestPhi = Phi;
- BestInit = Init;
- }
- return BestPhi;
-}
-
-/// Insert an IR expression which computes the value held by the IV IndVar
-/// (which must be a loop counter w/ unit stride) after the backedge of loop L
-/// is taken ExitCount times.
-static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
- const SCEV *ExitCount, bool UsePostInc, Loop *L,
- SCEVExpander &Rewriter, ScalarEvolution *SE) {
- assert(isLoopCounter(IndVar, L, SE));
- const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
- const SCEV *IVInit = AR->getStart();
-
- // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
- // finds a valid pointer IV. Sign extend ExitCount in order to materialize a
- // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
- // the existing GEPs whenever possible.
- if (IndVar->getType()->isPointerTy() &&
- !ExitCount->getType()->isPointerTy()) {
- // IVOffset will be the new GEP offset that is interpreted by GEP as a
- // signed value. ExitCount on the other hand represents the loop trip count,
- // which is an unsigned value. FindLoopCounter only allows induction
- // variables that have a positive unit stride of one. This means we don't
- // have to handle the case of negative offsets (yet) and just need to zero
- // extend ExitCount.
- Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
- const SCEV *IVOffset = SE->getTruncateOrZeroExtend(ExitCount, OfsTy);
- if (UsePostInc)
- IVOffset = SE->getAddExpr(IVOffset, SE->getOne(OfsTy));
-
- // Expand the code for the iteration count.
- assert(SE->isLoopInvariant(IVOffset, L) &&
- "Computed iteration count is not loop invariant!");
-
- // We could handle pointer IVs other than i8*, but we need to compensate for
- // gep index scaling.
- assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
- cast<PointerType>(IndVar->getType())
- ->getElementType())->isOne() &&
- "unit stride pointer IV must be i8*");
-
- const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
- } else {
- // In any other case, convert both IVInit and ExitCount to integers before
- // comparing. This may result in SCEV expansion of pointers, but in practice
- // SCEV will fold the pointer arithmetic away as such:
- // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
- //
- // Valid Cases: (1) both integers is most common; (2) both may be pointers
- // for simple memset-style loops.
- //
- // IVInit integer and ExitCount pointer would only occur if a canonical IV
- // were generated on top of case #2, which is not expected.
-
- assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
- // For unit stride, IVCount = Start + ExitCount with 2's complement
- // overflow.
-
- // For integer IVs, truncate the IV before computing IVInit + BECount,
-    // unless we know a priori that the limit must be a constant when evaluated
- // in the bitwidth of the IV. We prefer (potentially) keeping a truncate
- // of the IV in the loop over a (potentially) expensive expansion of the
- // widened exit count add(zext(add)) expression.
- if (SE->getTypeSizeInBits(IVInit->getType())
- > SE->getTypeSizeInBits(ExitCount->getType())) {
- if (isa<SCEVConstant>(IVInit) && isa<SCEVConstant>(ExitCount))
- ExitCount = SE->getZeroExtendExpr(ExitCount, IVInit->getType());
- else
- IVInit = SE->getTruncateExpr(IVInit, ExitCount->getType());
- }
-
- const SCEV *IVLimit = SE->getAddExpr(IVInit, ExitCount);
-
- if (UsePostInc)
- IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType()));
-
- // Expand the code for the iteration count.
- assert(SE->isLoopInvariant(IVLimit, L) &&
- "Computed iteration count is not loop invariant!");
- // Ensure that we generate the same type as IndVar, or a smaller integer
- // type. In the presence of null pointer values, we have an integer type
- // SCEV expression (IVInit) for a pointer type IV value (IndVar).
- Type *LimitTy = ExitCount->getType()->isPointerTy() ?
- IndVar->getType() : ExitCount->getType();
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
- }
-}
-
-/// This method rewrites the exit condition of the loop to be a canonical !=
-/// comparison against the incremented loop induction variable. This pass is
-/// able to rewrite the exit tests of any loop where the SCEV analysis can
-/// determine a loop-invariant trip count of the loop, which is actually a much
-/// broader range than just linear tests.
-bool IndVarSimplify::
-linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
- const SCEV *ExitCount,
- PHINode *IndVar, SCEVExpander &Rewriter) {
- assert(L->getLoopLatch() && "Loop no longer in simplified form?");
- assert(isLoopCounter(IndVar, L, SE));
- Instruction * const IncVar =
- cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));
-
- // Initialize CmpIndVar to the preincremented IV.
- Value *CmpIndVar = IndVar;
- bool UsePostInc = false;
-
- // If the exiting block is the same as the backedge block, we prefer to
- // compare against the post-incremented value, otherwise we must compare
- // against the preincremented value.
- if (ExitingBB == L->getLoopLatch()) {
- // For pointer IVs, we chose to not strip inbounds which requires us not
- // to add a potentially UB introducing use. We need to either a) show
- // the loop test we're modifying is already in post-inc form, or b) show
- // that adding a use must not introduce UB.
- bool SafeToPostInc =
- IndVar->getType()->isIntegerTy() ||
- isLoopExitTestBasedOn(IncVar, ExitingBB) ||
- mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
- if (SafeToPostInc) {
- UsePostInc = true;
- CmpIndVar = IncVar;
- }
- }
-
- // It may be necessary to drop nowrap flags on the incrementing instruction
- // if either LFTR moves from a pre-inc check to a post-inc check (in which
- // case the increment might have previously been poison on the last iteration
- // only) or if LFTR switches to a different IV that was previously dynamically
- // dead (and as such may be arbitrarily poison). We remove any nowrap flags
- // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
- // check), because the pre-inc addrec flags may be adopted from the original
- // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
- // TODO: This handling is inaccurate for one case: If we switch to a
- // dynamically dead IV that wraps on the first loop iteration only, which is
- // not covered by the post-inc addrec. (If the new IV was not dynamically
- // dead, it could not be poison on the first iteration in the first place.)
- if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
- const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
- if (BO->hasNoUnsignedWrap())
- BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
- if (BO->hasNoSignedWrap())
- BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
- }
-
- Value *ExitCnt = genLoopLimit(
- IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
- assert(ExitCnt->getType()->isPointerTy() ==
- IndVar->getType()->isPointerTy() &&
- "genLoopLimit missed a cast");
-
- // Insert a new icmp_ne or icmp_eq instruction before the branch.
- BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
- ICmpInst::Predicate P;
- if (L->contains(BI->getSuccessor(0)))
- P = ICmpInst::ICMP_NE;
- else
- P = ICmpInst::ICMP_EQ;
-
- IRBuilder<> Builder(BI);
-
- // The new loop exit condition should reuse the debug location of the
- // original loop exit condition.
- if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
- Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
-
- // For integer IVs, if we evaluated the limit in the narrower bitwidth to
- // avoid the expensive expansion of the limit expression in the wider type,
- // emit a truncate to narrow the IV to the ExitCount type. This is safe
- // since we know (from the exit count bitwidth), that we can't self-wrap in
- // the narrower type.
- unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
- unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
- if (CmpIndVarSize > ExitCntSize) {
- assert(!CmpIndVar->getType()->isPointerTy() &&
- !ExitCnt->getType()->isPointerTy());
-
- // Before resorting to actually inserting the truncate, use the same
- // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
- // the other side of the comparison instead. We still evaluate the limit
- // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
-    // a truncate within the loop.
- bool Extended = false;
- const SCEV *IV = SE->getSCEV(CmpIndVar);
- const SCEV *TruncatedIV = SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
- ExitCnt->getType());
- const SCEV *ZExtTrunc =
- SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());
-
- if (ZExtTrunc == IV) {
- Extended = true;
- ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
- "wide.trip.count");
- } else {
- const SCEV *SExtTrunc =
- SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
- if (SExtTrunc == IV) {
- Extended = true;
- ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
- "wide.trip.count");
- }
- }
-
- if (Extended) {
- bool Discard;
- L->makeLoopInvariant(ExitCnt, Discard);
- } else
- CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
- "lftr.wideiv");
- }
- LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
- << " LHS:" << *CmpIndVar << '\n'
- << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
- << "\n"
- << " RHS:\t" << *ExitCnt << "\n"
- << "ExitCount:\t" << *ExitCount << "\n"
- << " was: " << *BI->getCondition() << "\n");
-
- Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
- Value *OrigCond = BI->getCondition();
- // It's tempting to use replaceAllUsesWith here to fully replace the old
- // comparison, but that's not immediately safe, since users of the old
- // comparison may not be dominated by the new comparison. Instead, just
- // update the branch to use the new comparison; in the common case this
- // will make old comparison dead.
- BI->setCondition(Cond);
- DeadInsts.emplace_back(OrigCond);
-
- ++NumLFTR;
- return true;
-}
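+
+// Illustrative sketch only (hypothetical functions, kept out of the build;
+// assumes N >= 1 so both forms agree): what the rewrite above does to a
+// bottom-tested loop at source level.  The backedge is taken N - 1 times
+// (ExitCount), so the post-incremented counter is compared with != against
+// Start + ExitCount + 1 == N.
+#if 0
+#include <cstdint>
+static int32_t sumBeforeLFTR(int32_t N) {
+  int32_t S = 0, i = 0;
+  do { S += i; ++i; } while (i < N);       // original exit test
+  return S;
+}
+static int32_t sumAfterLFTR(int32_t N) {
+  int32_t S = 0, i = 0;
+  const int32_t Limit = 0 + (N - 1) + 1;   // genLoopLimit(): Start+ExitCount+1
+  do { S += i; ++i; } while (i != Limit);  // canonical != exit test
+  return S;
+}
+#endif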
-
-//===----------------------------------------------------------------------===//
-// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
-//===----------------------------------------------------------------------===//
-
-/// If there's a single exit block, sink any loop-invariant values that
-/// were defined in the preheader but not used inside the loop into the
-/// exit block to reduce register pressure in the loop.
-bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
- BasicBlock *ExitBlock = L->getExitBlock();
- if (!ExitBlock) return false;
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) return false;
-
- bool MadeAnyChanges = false;
- BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
- BasicBlock::iterator I(Preheader->getTerminator());
- while (I != Preheader->begin()) {
- --I;
- // New instructions were inserted at the end of the preheader.
- if (isa<PHINode>(I))
- break;
-
- // Don't move instructions which might have side effects, since the side
- // effects need to complete before instructions inside the loop. Also don't
- // move instructions which might read memory, since the loop may modify
- // memory. Note that it's okay if the instruction might have undefined
- // behavior: LoopSimplify guarantees that the preheader dominates the exit
- // block.
- if (I->mayHaveSideEffects() || I->mayReadFromMemory())
- continue;
-
- // Skip debug info intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- // Skip eh pad instructions.
- if (I->isEHPad())
- continue;
-
- // Don't sink alloca: we never want to sink static alloca's out of the
- // entry block, and correctly sinking dynamic alloca's requires
- // checks for stacksave/stackrestore intrinsics.
- // FIXME: Refactor this check somehow?
- if (isa<AllocaInst>(I))
- continue;
-
- // Determine if there is a use in or before the loop (direct or
- // otherwise).
- bool UsedInLoop = false;
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UseBB = User->getParent();
- if (PHINode *P = dyn_cast<PHINode>(User)) {
- unsigned i =
- PHINode::getIncomingValueNumForOperand(U.getOperandNo());
- UseBB = P->getIncomingBlock(i);
- }
- if (UseBB == Preheader || L->contains(UseBB)) {
- UsedInLoop = true;
- break;
- }
- }
-
- // If there is, the def must remain in the preheader.
- if (UsedInLoop)
- continue;
-
- // Otherwise, sink it to the exit block.
- Instruction *ToMove = &*I;
- bool Done = false;
-
- if (I != Preheader->begin()) {
- // Skip debug info intrinsics.
- do {
- --I;
- } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
-
- if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
- Done = true;
- } else {
- Done = true;
- }
-
- MadeAnyChanges = true;
- ToMove->moveBefore(*ExitBlock, InsertPt);
- if (Done) break;
- InsertPt = ToMove->getIterator();
- }
-
- return MadeAnyChanges;
-}
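+
+// Illustrative sketch only (hypothetical pre-computed facts, kept out of the
+// build): the per-instruction sinking criteria above as a pure predicate.
+#if 0
+struct PreheaderInstInfo {
+  bool MayHaveSideEffects, MayReadMemory, IsPHI, IsDbgOrEHPad, IsAlloca,
+      UsedInOrBeforeLoop; // any use in the preheader or inside the loop
+};
+static bool canSinkToExitBlock(const PreheaderInstInfo &I) {
+  return !I.MayHaveSideEffects && !I.MayReadMemory && !I.IsPHI &&
+         !I.IsDbgOrEHPad && !I.IsAlloca && !I.UsedInOrBeforeLoop;
+}
+#endif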
-
+ continue;
+
+ if (KnownPoison.insert(I).second)
+ for (const User *User : I->users())
+ Worklist.push_back(cast<Instruction>(User));
+ }
+
+ // Might be non-UB, or might have a path we couldn't prove must execute on
+ // way to exiting bb.
+ return false;
+}
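+
+// Illustrative sketch only (toy types standing in for Instruction,
+// mustTriggerUB, propagatesPoison and the dominance query; kept out of the
+// build): the same propagate-and-check worklist over a use graph.
+#if 0
+#include <set>
+#include <vector>
+struct UseNode {
+  std::vector<UseNode *> Users;
+  bool PropagatesPoison = true;   // stand-in for propagatesPoison(I)
+  bool MustTriggerUB = false;     // stand-in for mustTriggerUB(I, KnownPoison)
+  bool DominatesTarget = false;   // stand-in for DT->dominates(I, OnPathTo)
+};
+static bool mustExecuteUBIfPoisonSketch(UseNode *Root) {
+  std::set<UseNode *> KnownPoison;
+  std::vector<UseNode *> Worklist{Root};
+  while (!Worklist.empty()) {
+    UseNode *N = Worklist.back();
+    Worklist.pop_back();
+    if (N->MustTriggerUB && N->DominatesTarget)
+      return true;                // poison here is guaranteed to reach UB
+    if (!N->PropagatesPoison && N != Root)
+      continue;                   // can't track propagation; stay conservative
+    if (KnownPoison.insert(N).second)
+      for (UseNode *U : N->Users)
+        Worklist.push_back(U);
+  }
+  return false;                   // conveys no information
+}
+#endif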
+
+/// Recursive helper for hasConcreteDef(). Unfortunately, this currently boils
+/// down to checking that all operands are constant and listing instructions
+/// that may hide undef.
+static bool hasConcreteDefImpl(Value *V, SmallPtrSetImpl<Value*> &Visited,
+ unsigned Depth) {
+ if (isa<Constant>(V))
+ return !isa<UndefValue>(V);
+
+ if (Depth >= 6)
+ return false;
+
+ // Conservatively handle non-constant non-instructions. For example, Arguments
+ // may be undef.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // Load and return values may be undef.
+ if(I->mayReadFromMemory() || isa<CallInst>(I) || isa<InvokeInst>(I))
+ return false;
+
+ // Optimistically handle other instructions.
+ for (Value *Op : I->operands()) {
+ if (!Visited.insert(Op).second)
+ continue;
+ if (!hasConcreteDefImpl(Op, Visited, Depth+1))
+ return false;
+ }
+ return true;
+}
+
+/// Return true if the given value is concrete. We must prove that undef can
+/// never reach it.
+///
+/// TODO: If we decide that this is a good approach to checking for undef, we
+/// may factor it into a common location.
+static bool hasConcreteDef(Value *V) {
+ SmallPtrSet<Value*, 8> Visited;
+ Visited.insert(V);
+ return hasConcreteDefImpl(V, Visited, 0);
+}
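+
+// Illustrative sketch only (toy operand DAG, kept out of the build): the same
+// "undef cannot reach this value" walk, with the depth limit and visited set
+// used above.
+#if 0
+#include <set>
+#include <vector>
+struct ValNode {
+  bool IsConstant = false, IsUndef = false;
+  bool MayBeUndefSource = false;  // loads, calls, arguments, ...
+  std::vector<ValNode *> Operands;
+};
+static bool hasConcreteDefSketchImpl(ValNode *V, std::set<ValNode *> &Visited,
+                                     unsigned Depth) {
+  if (V->IsConstant)
+    return !V->IsUndef;
+  if (Depth >= 6 || V->MayBeUndefSource)
+    return false;
+  for (ValNode *Op : V->Operands)
+    if (Visited.insert(Op).second &&
+        !hasConcreteDefSketchImpl(Op, Visited, Depth + 1))
+      return false;
+  return true;
+}
+static bool hasConcreteDefSketch(ValNode *V) {
+  std::set<ValNode *> Visited{V};
+  return hasConcreteDefSketchImpl(V, Visited, 0);
+}
+#endif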
+
+/// Return true if this IV has any uses other than the (soon to be rewritten)
+/// loop exit test.
+static bool AlmostDeadIV(PHINode *Phi, BasicBlock *LatchBlock, Value *Cond) {
+ int LatchIdx = Phi->getBasicBlockIndex(LatchBlock);
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+
+ for (User *U : Phi->users())
+ if (U != Cond && U != IncV) return false;
+
+ for (User *U : IncV->users())
+ if (U != Cond && U != Phi) return false;
+ return true;
+}
+
+/// Return true if the given phi is a "counter" in L. A counter is an
+/// add recurrence (of integer or pointer type) with an arbitrary start, and a
+/// step of 1. Note that L must have exactly one latch.
+static bool isLoopCounter(PHINode* Phi, Loop *L,
+ ScalarEvolution *SE) {
+ assert(Phi->getParent() == L->getHeader());
+ assert(L->getLoopLatch());
+
+ if (!SE->isSCEVable(Phi->getType()))
+ return false;
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+ if (!AR || AR->getLoop() != L || !AR->isAffine())
+ return false;
+
+ const SCEV *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ if (!Step || !Step->isOne())
+ return false;
+
+ int LatchIdx = Phi->getBasicBlockIndex(L->getLoopLatch());
+ Value *IncV = Phi->getIncomingValue(LatchIdx);
+ return (getLoopPhiForCounter(IncV, L) == Phi);
+}
+
+/// Search the loop header for a loop counter (an add rec w/ step of one)
+/// suitable for use by LFTR. If multiple counters are available, select the
+/// "best" one based on profitability heuristics.
+///
+/// BECount may be an i8* pointer type. The pointer difference is already
+/// valid count without scaling the address stride, so it remains a pointer
+/// expression as far as SCEV is concerned.
+static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *BECount,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ uint64_t BCWidth = SE->getTypeSizeInBits(BECount->getType());
+
+ Value *Cond = cast<BranchInst>(ExitingBB->getTerminator())->getCondition();
+
+ // Loop over all of the PHI nodes, looking for a simple counter.
+ PHINode *BestPhi = nullptr;
+ const SCEV *BestInit = nullptr;
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Must be in simplified form");
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *Phi = cast<PHINode>(I);
+ if (!isLoopCounter(Phi, L, SE))
+ continue;
+
+ // Avoid comparing an integer IV against a pointer Limit.
+ if (BECount->getType()->isPointerTy() && !Phi->getType()->isPointerTy())
+ continue;
+
+ const auto *AR = cast<SCEVAddRecExpr>(SE->getSCEV(Phi));
+
+ // AR may be a pointer type, while BECount is an integer type.
+ // AR may be wider than BECount. With eq/ne tests overflow is immaterial.
+ // AR may not be a narrower type, or we may never exit.
+ uint64_t PhiWidth = SE->getTypeSizeInBits(AR->getType());
+ if (PhiWidth < BCWidth || !DL.isLegalInteger(PhiWidth))
+ continue;
+
+ // Avoid reusing a potentially undef value to compute other values that may
+ // have originally had a concrete definition.
+ if (!hasConcreteDef(Phi)) {
+ // We explicitly allow unknown phis as long as they are already used by
+ // the loop exit test. This is legal since performing LFTR could not
+ // increase the number of undef users.
+ Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
+ if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
+ !isLoopExitTestBasedOn(IncPhi, ExitingBB))
+ continue;
+ }
+
+ // Avoid introducing undefined behavior due to poison which didn't exist in
+ // the original program. (Annoyingly, the rules for poison and undef
+ // propagation are distinct, so this does NOT cover the undef case above.)
+ // We have to ensure that we don't introduce UB by introducing a use on an
+ // iteration where said IV produces poison. Our strategy here differs for
+ // pointers and integer IVs. For integers, we strip and reinfer as needed,
+ // see code in linearFunctionTestReplace. For pointers, we restrict
+ // transforms as there is no good way to reinfer inbounds once lost.
+ if (!Phi->getType()->isIntegerTy() &&
+ !mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
+ continue;
+
+ const SCEV *Init = AR->getStart();
+
+ if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
+ // Don't force a live loop counter if another IV can be used.
+ if (AlmostDeadIV(Phi, LatchBlock, Cond))
+ continue;
+
+ // Prefer to count-from-zero. This is a more "canonical" counter form. It
+ // also prefers integer to pointer IVs.
+ if (BestInit->isZero() != Init->isZero()) {
+ if (BestInit->isZero())
+ continue;
+ }
+ // If two IVs both count from zero or both count from nonzero then the
+ // narrower is likely a dead phi that has been widened. Use the wider phi
+ // to allow the other to be eliminated.
+ else if (PhiWidth <= SE->getTypeSizeInBits(BestPhi->getType()))
+ continue;
+ }
+ BestPhi = Phi;
+ BestInit = Init;
+ }
+ return BestPhi;
+}
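+
+// Illustrative sketch only (hypothetical summary struct, kept out of the
+// build): the tie-breaking order used above when two suitable counters are
+// found.
+#if 0
+#include <cstdint>
+struct CounterCandidate { bool AlmostDead; bool CountsFromZero; uint64_t Bits; };
+// Returns true if 'New' should replace 'Best' as the LFTR counter.
+static bool preferNewCounter(const CounterCandidate &Best,
+                             const CounterCandidate &New) {
+  if (Best.AlmostDead)
+    return true;                   // anything beats an almost-dead best
+  if (New.AlmostDead)
+    return false;                  // don't force an extra live counter
+  if (Best.CountsFromZero != New.CountsFromZero)
+    return New.CountsFromZero;     // prefer the count-from-zero form
+  return New.Bits > Best.Bits;     // otherwise prefer the wider phi
+}
+#endif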
+
+/// Insert an IR expression which computes the value held by the IV IndVar
+/// (which must be a loop counter w/ unit stride) after the backedge of loop L
+/// is taken ExitCount times.
+static Value *genLoopLimit(PHINode *IndVar, BasicBlock *ExitingBB,
+ const SCEV *ExitCount, bool UsePostInc, Loop *L,
+ SCEVExpander &Rewriter, ScalarEvolution *SE) {
+ assert(isLoopCounter(IndVar, L, SE));
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IndVar));
+ const SCEV *IVInit = AR->getStart();
+
+ // IVInit may be a pointer while ExitCount is an integer when FindLoopCounter
+ // finds a valid pointer IV. Sign extend ExitCount in order to materialize a
+ // GEP. Avoid running SCEVExpander on a new pointer value, instead reusing
+ // the existing GEPs whenever possible.
+ if (IndVar->getType()->isPointerTy() &&
+ !ExitCount->getType()->isPointerTy()) {
+ // IVOffset will be the new GEP offset that is interpreted by GEP as a
+ // signed value. ExitCount on the other hand represents the loop trip count,
+ // which is an unsigned value. FindLoopCounter only allows induction
+ // variables that have a positive unit stride of one. This means we don't
+ // have to handle the case of negative offsets (yet) and just need to zero
+ // extend ExitCount.
+ Type *OfsTy = SE->getEffectiveSCEVType(IVInit->getType());
+ const SCEV *IVOffset = SE->getTruncateOrZeroExtend(ExitCount, OfsTy);
+ if (UsePostInc)
+ IVOffset = SE->getAddExpr(IVOffset, SE->getOne(OfsTy));
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVOffset, L) &&
+ "Computed iteration count is not loop invariant!");
+
+ // We could handle pointer IVs other than i8*, but we need to compensate for
+ // gep index scaling.
+ assert(SE->getSizeOfExpr(IntegerType::getInt64Ty(IndVar->getContext()),
+ cast<PointerType>(IndVar->getType())
+ ->getElementType())->isOne() &&
+ "unit stride pointer IV must be i8*");
+
+ const SCEV *IVLimit = SE->getAddExpr(IVInit, IVOffset);
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ return Rewriter.expandCodeFor(IVLimit, IndVar->getType(), BI);
+ } else {
+ // In any other case, convert both IVInit and ExitCount to integers before
+ // comparing. This may result in SCEV expansion of pointers, but in practice
+ // SCEV will fold the pointer arithmetic away as such:
+ // BECount = (IVEnd - IVInit - 1) => IVLimit = IVInit (postinc).
+ //
+ // Valid Cases: (1) both integers is most common; (2) both may be pointers
+ // for simple memset-style loops.
+ //
+ // IVInit integer and ExitCount pointer would only occur if a canonical IV
+ // were generated on top of case #2, which is not expected.
+
+ assert(AR->getStepRecurrence(*SE)->isOne() && "only handles unit stride");
+ // For unit stride, IVCount = Start + ExitCount with 2's complement
+ // overflow.
+
+ // For integer IVs, truncate the IV before computing IVInit + BECount,
+ // unless we know a priori that the limit must be a constant when evaluated
+ // in the bitwidth of the IV. We prefer (potentially) keeping a truncate
+ // of the IV in the loop over a (potentially) expensive expansion of the
+ // widened exit count add(zext(add)) expression.
+ if (SE->getTypeSizeInBits(IVInit->getType())
+ > SE->getTypeSizeInBits(ExitCount->getType())) {
+ if (isa<SCEVConstant>(IVInit) && isa<SCEVConstant>(ExitCount))
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IVInit->getType());
+ else
+ IVInit = SE->getTruncateExpr(IVInit, ExitCount->getType());
+ }
+
+ const SCEV *IVLimit = SE->getAddExpr(IVInit, ExitCount);
+
+ if (UsePostInc)
+ IVLimit = SE->getAddExpr(IVLimit, SE->getOne(IVLimit->getType()));
+
+ // Expand the code for the iteration count.
+ assert(SE->isLoopInvariant(IVLimit, L) &&
+ "Computed iteration count is not loop invariant!");
+ // Ensure that we generate the same type as IndVar, or a smaller integer
+ // type. In the presence of null pointer values, we have an integer type
+ // SCEV expression (IVInit) for a pointer type IV value (IndVar).
+ Type *LimitTy = ExitCount->getType()->isPointerTy() ?
+ IndVar->getType() : ExitCount->getType();
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ return Rewriter.expandCodeFor(IVLimit, LimitTy, BI);
+ }
+}
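For the integer branch above, the limit arithmetic reduces to a simple two's-complement add; a minimal sketch under that assumption (hypothetical helper, not part of the patched file):

#include <cstdint>

// Sketch of the value genLoopLimit materializes for an integer IV with unit
// stride: Start + ExitCount for a pre-increment compare, plus one more for a
// post-increment compare. Wraparound is intentional (two's complement).
uint64_t loopLimit(uint64_t Start, uint64_t ExitCount, bool UsePostInc) {
  uint64_t Limit = Start + ExitCount;  // IVInit + ExitCount
  if (UsePostInc)
    Limit += 1;                        // compare against the incremented IV
  return Limit;
}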
+
+/// This method rewrites the exit condition of the loop to be a canonical !=
+/// comparison against the incremented loop induction variable. This pass is
+/// able to rewrite the exit tests of any loop where the SCEV analysis can
+/// determine a loop-invariant trip count of the loop, which is actually a much
+/// broader range than just linear tests.
+bool IndVarSimplify::
+linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
+ const SCEV *ExitCount,
+ PHINode *IndVar, SCEVExpander &Rewriter) {
+ assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+ assert(isLoopCounter(IndVar, L, SE));
+ Instruction * const IncVar =
+ cast<Instruction>(IndVar->getIncomingValueForBlock(L->getLoopLatch()));
+
+ // Initialize CmpIndVar to the preincremented IV.
+ Value *CmpIndVar = IndVar;
+ bool UsePostInc = false;
+
+ // If the exiting block is the same as the backedge block, we prefer to
+ // compare against the post-incremented value, otherwise we must compare
+ // against the preincremented value.
+ if (ExitingBB == L->getLoopLatch()) {
+ // For pointer IVs, we chose to not strip inbounds which requires us not
+ // to add a potentially UB introducing use. We need to either a) show
+ // the loop test we're modifying is already in post-inc form, or b) show
+ // that adding a use must not introduce UB.
+ bool SafeToPostInc =
+ IndVar->getType()->isIntegerTy() ||
+ isLoopExitTestBasedOn(IncVar, ExitingBB) ||
+ mustExecuteUBIfPoisonOnPathTo(IncVar, ExitingBB->getTerminator(), DT);
+ if (SafeToPostInc) {
+ UsePostInc = true;
+ CmpIndVar = IncVar;
+ }
+ }
+
+ // It may be necessary to drop nowrap flags on the incrementing instruction
+ // if either LFTR moves from a pre-inc check to a post-inc check (in which
+ // case the increment might have previously been poison on the last iteration
+ // only) or if LFTR switches to a different IV that was previously dynamically
+ // dead (and as such may be arbitrarily poison). We remove any nowrap flags
+ // that SCEV didn't infer for the post-inc addrec (even if we use a pre-inc
+ // check), because the pre-inc addrec flags may be adopted from the original
+ // instruction, while SCEV has to explicitly prove the post-inc nowrap flags.
+ // TODO: This handling is inaccurate for one case: If we switch to a
+ // dynamically dead IV that wraps on the first loop iteration only, which is
+ // not covered by the post-inc addrec. (If the new IV was not dynamically
+ // dead, it could not be poison on the first iteration in the first place.)
+ if (auto *BO = dyn_cast<BinaryOperator>(IncVar)) {
+ const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(SE->getSCEV(IncVar));
+ if (BO->hasNoUnsignedWrap())
+ BO->setHasNoUnsignedWrap(AR->hasNoUnsignedWrap());
+ if (BO->hasNoSignedWrap())
+ BO->setHasNoSignedWrap(AR->hasNoSignedWrap());
+ }
+
+ Value *ExitCnt = genLoopLimit(
+ IndVar, ExitingBB, ExitCount, UsePostInc, L, Rewriter, SE);
+ assert(ExitCnt->getType()->isPointerTy() ==
+ IndVar->getType()->isPointerTy() &&
+ "genLoopLimit missed a cast");
+
+ // Insert a new icmp_ne or icmp_eq instruction before the branch.
+ BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ ICmpInst::Predicate P;
+ if (L->contains(BI->getSuccessor(0)))
+ P = ICmpInst::ICMP_NE;
+ else
+ P = ICmpInst::ICMP_EQ;
+
+ IRBuilder<> Builder(BI);
+
+ // The new loop exit condition should reuse the debug location of the
+ // original loop exit condition.
+ if (auto *Cond = dyn_cast<Instruction>(BI->getCondition()))
+ Builder.SetCurrentDebugLocation(Cond->getDebugLoc());
+
+ // For integer IVs, if we evaluated the limit in the narrower bitwidth to
+ // avoid the expensive expansion of the limit expression in the wider type,
+ // emit a truncate to narrow the IV to the ExitCount type. This is safe
+ // since we know (from the exit count bitwidth), that we can't self-wrap in
+ // the narrower type.
+ unsigned CmpIndVarSize = SE->getTypeSizeInBits(CmpIndVar->getType());
+ unsigned ExitCntSize = SE->getTypeSizeInBits(ExitCnt->getType());
+ if (CmpIndVarSize > ExitCntSize) {
+ assert(!CmpIndVar->getType()->isPointerTy() &&
+ !ExitCnt->getType()->isPointerTy());
+
+ // Before resorting to actually inserting the truncate, use the same
+ // reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
+ // the other side of the comparison instead. We still evaluate the limit
+ // in the narrower bitwidth, we just prefer a zext/sext outside the loop to
+ // a truncate within it.
+ bool Extended = false;
+ const SCEV *IV = SE->getSCEV(CmpIndVar);
+ const SCEV *TruncatedIV = SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
+ ExitCnt->getType());
+ const SCEV *ZExtTrunc =
+ SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());
+
+ if (ZExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ } else {
+ const SCEV *SExtTrunc =
+ SE->getSignExtendExpr(TruncatedIV, CmpIndVar->getType());
+ if (SExtTrunc == IV) {
+ Extended = true;
+ ExitCnt = Builder.CreateSExt(ExitCnt, IndVar->getType(),
+ "wide.trip.count");
+ }
+ }
+
+ if (Extended) {
+ bool Discard;
+ L->makeLoopInvariant(ExitCnt, Discard);
+ } else
+ CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
+ "lftr.wideiv");
+ }
+ LLVM_DEBUG(dbgs() << "INDVARS: Rewriting loop exit condition to:\n"
+ << " LHS:" << *CmpIndVar << '\n'
+ << " op:\t" << (P == ICmpInst::ICMP_NE ? "!=" : "==")
+ << "\n"
+ << " RHS:\t" << *ExitCnt << "\n"
+ << "ExitCount:\t" << *ExitCount << "\n"
+ << " was: " << *BI->getCondition() << "\n");
+
+ Value *Cond = Builder.CreateICmp(P, CmpIndVar, ExitCnt, "exitcond");
+ Value *OrigCond = BI->getCondition();
+ // It's tempting to use replaceAllUsesWith here to fully replace the old
+ // comparison, but that's not immediately safe, since users of the old
+ // comparison may not be dominated by the new comparison. Instead, just
+ // update the branch to use the new comparison; in the common case this
+ // will make old comparison dead.
+ BI->setCondition(Cond);
+ DeadInsts.emplace_back(OrigCond);
+
+ ++NumLFTR;
+ return true;
+}
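As a source-level illustration of the rewrite (a sketch only; beforeLFTR/afterLFTR are hypothetical names, and the loop is assumed to be known to run n >= 1 times), the relational exit test becomes an equality test of the post-incremented IV against the limit produced by genLoopLimit:

void body(int i);

// Before: the exit test is a signed order comparison against n.
void beforeLFTR(int n) {
  for (int i = 0; i < n; ++i)
    body(i);
}

// After (conceptual): the exit test is a canonical != check of the
// post-incremented IV against the trip count.
void afterLFTR(int n) {
  int i = 0;
  do {
    body(i);
  } while (++i != n);
}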
+
+//===----------------------------------------------------------------------===//
+// sinkUnusedInvariants. A late subpass to cleanup loop preheaders.
+//===----------------------------------------------------------------------===//
+
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+bool IndVarSimplify::sinkUnusedInvariants(Loop *L) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return false;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) return false;
+
+ bool MadeAnyChanges = false;
+ BasicBlock::iterator InsertPt = ExitBlock->getFirstInsertionPt();
+ BasicBlock::iterator I(Preheader->getTerminator());
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader.
+ if (isa<PHINode>(I))
+ break;
+
+ // Don't move instructions which might have side effects, since the side
+ // effects need to complete before instructions inside the loop. Also don't
+ // move instructions which might read memory, since the loop may modify
+ // memory. Note that it's okay if the instruction might have undefined
+ // behavior: LoopSimplify guarantees that the preheader dominates the exit
+ // block.
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory())
+ continue;
+
+ // Skip debug info intrinsics.
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ // Skip eh pad instructions.
+ if (I->isEHPad())
+ continue;
+
+ // Don't sink alloca: we never want to sink static alloca's out of the
+ // entry block, and correctly sinking dynamic alloca's requires
+ // checks for stacksave/stackrestore intrinsics.
+ // FIXME: Refactor this check somehow?
+ if (isa<AllocaInst>(I))
+ continue;
+
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UseBB = User->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(User)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = &*I;
+ bool Done = false;
+
+ if (I != Preheader->begin()) {
+ // Skip debug info intrinsics.
+ do {
+ --I;
+ } while (isa<DbgInfoIntrinsic>(I) && I != Preheader->begin());
+
+ if (isa<DbgInfoIntrinsic>(I) && I == Preheader->begin())
+ Done = true;
+ } else {
+ Done = true;
+ }
+
+ MadeAnyChanges = true;
+ ToMove->moveBefore(*ExitBlock, InsertPt);
+ if (Done) break;
+ InsertPt = ToMove->getIterator();
+ }
+
+ return MadeAnyChanges;
+}
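A short sketch of the effect (illustrative code only, assuming a single exit block): a loop-invariant computation that lives in the preheader but is only consumed after the loop is moved past the loop, shortening its live range.

int use(int);

// Before sinking: t is computed ahead of the loop but never used inside it,
// so a register stays live across the whole loop body.
int beforeSink(int a, int b, int n, int *p) {
  int t = a * b;
  for (int i = 0; i < n; ++i)
    p[i] = i;
  return use(t);
}

// After sinkUnusedInvariants (conceptually): the multiply sits in the exit
// block instead, after the loop.
int afterSink(int a, int b, int n, int *p) {
  for (int i = 0; i < n; ++i)
    p[i] = i;
  int t = a * b;
  return use(t);
}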
+
static void replaceExitCond(BranchInst *BI, Value *NewCond,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
auto *OldCond = BI->getCondition();
@@ -1297,7 +1297,7 @@ static void replaceExitCond(BranchInst *BI, Value *NewCond,
if (OldCond->use_empty())
DeadInsts.emplace_back(OldCond);
}
-
+
static void foldExit(const Loop *L, BasicBlock *ExitingBB, bool IsTaken,
SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
@@ -1355,7 +1355,7 @@ static bool optimizeLoopExitWithUnknownExitCount(
if (SE->isKnownPredicateAt(Pred, LHSS, RHSS, BI)) {
foldExit(L, ExitingBB, Inverted, DeadInsts);
return true;
- }
+ }
// Further logic works for non-inverted condition only.
if (Inverted)
return false;
@@ -1391,52 +1391,52 @@ static bool optimizeLoopExitWithUnknownExitCount(
Rewriter, DeadInsts);
return true;
-}
-
-bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
+}
+
+bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
// Remove all exits which aren't both rewriteable and execute on every
// iteration.
llvm::erase_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- return true;
-
- // Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- return true;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- return true;
-
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ return true;
+
+ // Can't rewrite non-branch yet.
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ return true;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ return true;
+
// Likewise, the loop latch must be dominated by the exiting BB.
if (!DT->dominates(ExitingBB, L->getLoopLatch()))
- return true;
-
- return false;
- });
-
- if (ExitingBlocks.empty())
- return false;
-
- // Get a symbolic upper bound on the loop backedge taken count.
+ return true;
+
+ return false;
+ });
+
+ if (ExitingBlocks.empty())
+ return false;
+
+ // Get a symbolic upper bound on the loop backedge taken count.
const SCEV *MaxExitCount = SE->getSymbolicMaxBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(MaxExitCount))
- return false;
-
+ if (isa<SCEVCouldNotCompute>(MaxExitCount))
+ return false;
+
// Visit our exit blocks in order of dominance. We know from the fact that
// all exits must dominate the latch, so there is a total dominance order
// between them.
llvm::sort(ExitingBlocks, [&](BasicBlock *A, BasicBlock *B) {
- // std::sort sorts in ascending order, so we want the inverse of
- // the normal dominance relation.
- if (A == B) return false;
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation.
+ if (A == B) return false;
if (DT->properlyDominates(A, B))
return true;
else {
@@ -1445,17 +1445,17 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
return false;
}
});
-#ifndef NDEBUG
- for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
- assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
- }
-#endif
-
- bool Changed = false;
+#ifndef NDEBUG
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++) {
+ assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
+ }
+#endif
+
+ bool Changed = false;
bool SkipLastIter = false;
- SmallSet<const SCEV*, 8> DominatingExitCounts;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ SmallSet<const SCEV*, 8> DominatingExitCounts;
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
if (isa<SCEVCouldNotCompute>(ExitCount)) {
// Okay, we do not know the exit count here. Can we at least prove that it
// will remain the same within iteration space?
@@ -1465,7 +1465,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
L, BI, ExitingBB, MaxExitCount, Inverted, SkipLastIter, SE,
Rewriter, DeadInsts);
};
-
+
// TODO: We might have proved that we can skip the last iteration for
// this check. In this case, we only want to check the condition on the
// pre-last iteration (MaxExitCount - 1). However, there is a nasty
@@ -1495,496 +1495,496 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// executed 1 iteration less.
SkipLastIter = true;
- // If we know we'd exit on the first iteration, rewrite the exit to
- // reflect this. This does not imply the loop must exit through this
- // exit; there may be an earlier one taken on the first iteration.
- // TODO: Given we know the backedge can't be taken, we should go ahead
- // and break it. Or at least, kill all the header phis and simplify.
- if (ExitCount->isZero()) {
+ // If we know we'd exit on the first iteration, rewrite the exit to
+ // reflect this. This does not imply the loop must exit through this
+ // exit; there may be an earlier one taken on the first iteration.
+ // TODO: Given we know the backedge can't be taken, we should go ahead
+ // and break it. Or at least, kill all the header phis and simplify.
+ if (ExitCount->isZero()) {
foldExit(L, ExitingBB, true, DeadInsts);
- Changed = true;
- continue;
- }
-
- // If we end up with a pointer exit count, bail. Note that we can end up
- // with a pointer exit count for one exiting block, and not for another in
- // the same loop.
- if (!ExitCount->getType()->isIntegerTy() ||
- !MaxExitCount->getType()->isIntegerTy())
- continue;
-
- Type *WiderType =
- SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
- ExitCount = SE->getNoopOrZeroExtend(ExitCount, WiderType);
- MaxExitCount = SE->getNoopOrZeroExtend(MaxExitCount, WiderType);
- assert(MaxExitCount->getType() == ExitCount->getType());
-
- // Can we prove that some other exit must be taken strictly before this
- // one?
- if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
- MaxExitCount, ExitCount)) {
+ Changed = true;
+ continue;
+ }
+
+ // If we end up with a pointer exit count, bail. Note that we can end up
+ // with a pointer exit count for one exiting block, and not for another in
+ // the same loop.
+ if (!ExitCount->getType()->isIntegerTy() ||
+ !MaxExitCount->getType()->isIntegerTy())
+ continue;
+
+ Type *WiderType =
+ SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
+ ExitCount = SE->getNoopOrZeroExtend(ExitCount, WiderType);
+ MaxExitCount = SE->getNoopOrZeroExtend(MaxExitCount, WiderType);
+ assert(MaxExitCount->getType() == ExitCount->getType());
+
+ // Can we prove that some other exit must be taken strictly before this
+ // one?
+ if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
+ MaxExitCount, ExitCount)) {
foldExit(L, ExitingBB, false, DeadInsts);
- Changed = true;
- continue;
- }
-
- // As we run, keep track of which exit counts we've encountered. If we
- // find a duplicate, we've found an exit which would have exited on the
- // exiting iteration, but (from the visit order) strictly follows another
- // which does the same and is thus dead.
- if (!DominatingExitCounts.insert(ExitCount).second) {
+ Changed = true;
+ continue;
+ }
+
+ // As we run, keep track of which exit counts we've encountered. If we
+ // find a duplicate, we've found an exit which would have exited on the
+ // exiting iteration, but (from the visit order) strictly follows another
+ // which does the same and is thus dead.
+ if (!DominatingExitCounts.insert(ExitCount).second) {
foldExit(L, ExitingBB, false, DeadInsts);
- Changed = true;
- continue;
- }
-
- // TODO: There might be another opportunity to leverage SCEV's reasoning
- // here. If we kept track of the min of dominating exits so far, we could
- // discharge exits with EC >= MDEC. This is less powerful than the existing
- // transform (since later exits aren't considered), but potentially more
- // powerful for any case where SCEV can prove a >=u b, but neither a == b
- // nor a >u b. Such a case is not currently known.
- }
- return Changed;
-}
-
-bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // Finally, see if we can rewrite our exit conditions into a loop invariant
- // form. If we have a read-only loop, and we can tell that we must exit down
- // a path which does not need any of the values computed within the loop, we
- // can rewrite the loop to exit on the first iteration. Note that this
- // doesn't either a) tell us the loop exits on the first iteration (unless
- // *all* exits are predicateable) or b) tell us *which* exit might be taken.
- // This transformation looks a lot like a restricted form of dead loop
- // elimination, but limited to read-only loops and without necessarily
- // needing to kill the loop entirely.
- if (!LoopPredication)
- return false;
-
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return false;
-
- // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
- // through *explicit* control flow. We have to eliminate the possibility of
- // implicit exits (see below) before we know it's truly exact.
- const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(ExactBTC) ||
- !SE->isLoopInvariant(ExactBTC, L) ||
- !isSafeToExpand(ExactBTC, *SE))
- return false;
-
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExactBTC->getType()->isIntegerTy())
- return false;
-
- auto BadExit = [&](BasicBlock *ExitingBB) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- return true;
-
- // Can't rewrite non-branch yet.
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- return true;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- return true;
-
- // If the exit block has phis, we need to be able to compute the values
- // within the loop which contains them. This assumes trivially lcssa phis
- // have already been removed; TODO: generalize
- BasicBlock *ExitBlock =
- BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
- if (!ExitBlock->phis().empty())
- return true;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count");
- if (!SE->isLoopInvariant(ExitCount, L) ||
- !isSafeToExpand(ExitCount, *SE))
- return true;
-
- // If we end up with a pointer exit count, bail. It may be unsized.
- if (!ExitCount->getType()->isIntegerTy())
- return true;
-
- return false;
- };
-
- // If we have any exits which can't be predicated themselves, then we can't
- // predicate any exit which isn't guaranteed to execute before it. Consider
- // two exits (a) and (b) which would both exit on the same iteration. If we
- // can predicate (b), but not (a), and (a) precedes (b) along some path, then
- // we could convert a loop from exiting through (a) to one exiting through
- // (b). Note that this problem exists only for exits with the same exit
- // count, and we could be more aggressive when exit counts are known to be unequal.
- llvm::sort(ExitingBlocks,
- [&](BasicBlock *A, BasicBlock *B) {
- // std::sort sorts in ascending order, so we want the inverse of
- // the normal dominance relation, plus a tie breaker for blocks
- // unordered by dominance.
- if (DT->properlyDominates(A, B)) return true;
- if (DT->properlyDominates(B, A)) return false;
- return A->getName() < B->getName();
- });
- // Check to see if our exit blocks are a total order (i.e. a linear chain of
- // exits before the backedge). If they aren't, reasoning about reachability
- // is complicated and we choose not to for now.
- for (unsigned i = 1; i < ExitingBlocks.size(); i++)
- if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
- return false;
-
- // Given our sorted total order, we know that exit[j] must be evaluated
- // after all exit[i] such that j > i.
- for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
- if (BadExit(ExitingBlocks[i])) {
- ExitingBlocks.resize(i);
- break;
- }
-
- if (ExitingBlocks.empty())
- return false;
-
- // We rely on not being able to reach an exiting block on a later iteration
- // than its statically computed exit count. The implementation of
- // getExitCount currently has this invariant, but assert it here so that
- // breakage is obvious if this ever changes.
- assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
- return DT->dominates(ExitingBB, L->getLoopLatch());
- }));
-
- // At this point, ExitingBlocks consists of only those blocks which are
- // predicatable. Given that, we know we have at least one exit we can
- // predicate if the loop doesn't have side effects and doesn't have any
- // implicit exits (because then our exact BTC isn't actually exact).
- // @Reviewers - As structured, this is O(I^2) for loop nests. Any
- // suggestions on how to improve this? I can obviously bail out for outer
- // loops, but that seems less than ideal. MemorySSA can find memory writes,
- // is that enough for *all* side effects?
- for (BasicBlock *BB : L->blocks())
- for (auto &I : *BB)
- // TODO:isGuaranteedToTransfer
- if (I.mayHaveSideEffects() || I.mayThrow())
- return false;
-
- bool Changed = false;
- // Finally, do the actual predication for all predicatable blocks. A couple
- // of notes here:
- // 1) We don't bother to constant fold dominated exits with identical exit
- // counts; that's simply a form of CSE/equality propagation and we leave
- // it for dedicated passes.
- // 2) We insert the comparison at the branch. Hoisting introduces additional
- // legality constraints and we leave that to dedicated logic. We want to
- // predicate even if we can't insert a loop invariant expression as
- // peeling or unrolling will likely reduce the cost of the otherwise loop
- // varying check.
- Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
- IRBuilder<> B(L->getLoopPreheader()->getTerminator());
- Value *ExactBTCV = nullptr; // Lazily generated if needed.
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
-
- auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
- Value *NewCond;
- if (ExitCount == ExactBTC) {
- NewCond = L->contains(BI->getSuccessor(0)) ?
- B.getFalse() : B.getTrue();
- } else {
- Value *ECV = Rewriter.expandCodeFor(ExitCount);
- if (!ExactBTCV)
- ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
- Value *RHS = ExactBTCV;
- if (ECV->getType() != RHS->getType()) {
- Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
- ECV = B.CreateZExt(ECV, WiderTy);
- RHS = B.CreateZExt(RHS, WiderTy);
- }
- auto Pred = L->contains(BI->getSuccessor(0)) ?
- ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
- NewCond = B.CreateICmp(Pred, ECV, RHS);
- }
- Value *OldCond = BI->getCondition();
- BI->setCondition(NewCond);
- if (OldCond->use_empty())
- DeadInsts.emplace_back(OldCond);
- Changed = true;
- }
-
- return Changed;
-}
-
-//===----------------------------------------------------------------------===//
-// IndVarSimplify driver. Manage several subpasses of IV simplification.
-//===----------------------------------------------------------------------===//
-
-bool IndVarSimplify::run(Loop *L) {
- // We need (and expect!) the incoming loop to be in LCSSA.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "LCSSA required to run indvars!");
-
- // If LoopSimplify form is not available, stay out of trouble. Some notes:
- // - LSR currently only supports LoopSimplify-form loops. Indvars'
- // canonicalization can be a pessimization without LSR to "clean up"
- // afterwards.
- // - We depend on having a preheader; in particular,
- // Loop::getCanonicalInductionVariable only supports loops with preheaders,
- // and we're in trouble if we can't find the induction variable even when
- // we've manually inserted one.
- // - LFTR relies on having a single backedge.
- if (!L->isLoopSimplifyForm())
- return false;
-
-#ifndef NDEBUG
- // Used below for a consistency check only
- // Note: Since the result returned by ScalarEvolution may depend on the order
- // in which previous results are added to its cache, the call to
- // getBackedgeTakenCount() may change following SCEV queries.
- const SCEV *BackedgeTakenCount;
- if (VerifyIndvars)
- BackedgeTakenCount = SE->getBackedgeTakenCount(L);
-#endif
-
- bool Changed = false;
- // If there are any floating-point recurrences, attempt to
- // transform them to use integer recurrences.
- Changed |= rewriteNonIntegerIVs(L);
-
- // Create a rewriter object which we'll use to transform the code with.
- SCEVExpander Rewriter(*SE, DL, "indvars");
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
-
- // Eliminate redundant IV users.
- //
- // Simplification works best when run before other consumers of SCEV. We
- // attempt to avoid evaluating SCEVs for sign/zero extend operations until
- // other expressions involving loop IVs have been evaluated. This helps SCEV
- // set no-wrap flags before normalizing sign/zero extension.
- Rewriter.disableCanonicalMode();
- Changed |= simplifyAndExtend(L, Rewriter, LI);
-
- // Check to see if we can compute the final value of any expressions
- // that are recurrent in the loop, and substitute the exit values from the
- // loop into any instructions outside of the loop that use the final values
- // of the current expressions.
- if (ReplaceExitValue != NeverRepl) {
- if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
- ReplaceExitValue, DeadInsts)) {
- NumReplaced += Rewrites;
- Changed = true;
- }
- }
-
- // Eliminate redundant IV cycles.
- NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
-
- // Try to eliminate loop exits based on analyzable exit counts
- if (optimizeLoopExits(L, Rewriter)) {
- Changed = true;
- // Given we've changed exit counts, notify SCEV
+ Changed = true;
+ continue;
+ }
+
+ // TODO: There might be another opportunity to leverage SCEV's reasoning
+ // here. If we kept track of the min of dominating exits so far, we could
+ // discharge exits with EC >= MDEC. This is less powerful than the existing
+ // transform (since later exits aren't considered), but potentially more
+ // powerful for any case where SCEV can prove a >=u b, but neither a == b
+ // nor a >u b. Such a case is not currently known.
+ }
+ return Changed;
+}
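A hedged example of the duplicate-exit-count case handled above (hypothetical code, assuming n >= 0 so both exits have the loop-invariant exit count n): the second test can never be the exit actually taken, so foldExit rewrites its branch so the exit is never taken.

void work(int i);

void example(int n) {
  // Both exits have SCEV exit count n; the first dominates the second, so
  // optimizeLoopExits folds the second exit so it can never be taken.
  for (int i = 0; ; ++i) {
    if (i == n)
      return;
    if (i >= n)   // provably follows the i == n exit; folded away
      return;
    work(i);
  }
}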
+
+bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // Finally, see if we can rewrite our exit conditions into a loop invariant
+ // form. If we have a read-only loop, and we can tell that we must exit down
+ // a path which does not need any of the values computed within the loop, we
+ // can rewrite the loop to exit on the first iteration. Note that this
+ // doesn't either a) tell us the loop exits on the first iteration (unless
+ // *all* exits are predicateable) or b) tell us *which* exit might be taken.
+ // This transformation looks a lot like a restricted form of dead loop
+ // elimination, but limited to read-only loops and without necessarily
+ // needing to kill the loop entirely.
+ if (!LoopPredication)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ // Note: ExactBTC is the exact backedge taken count *iff* the loop exits
+ // through *explicit* control flow. We have to eliminate the possibility of
+ // implicit exits (see below) before we know it's truly exact.
+ const SCEV *ExactBTC = SE->getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(ExactBTC) ||
+ !SE->isLoopInvariant(ExactBTC, L) ||
+ !isSafeToExpand(ExactBTC, *SE))
+ return false;
+
+ // If we end up with a pointer exit count, bail. It may be unsized.
+ if (!ExactBTC->getType()->isIntegerTy())
+ return false;
+
+ auto BadExit = [&](BasicBlock *ExitingBB) {
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ return true;
+
+ // Can't rewrite non-branch yet.
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ return true;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ return true;
+
+ // If the exit block has phis, we need to be able to compute the values
+ // within the loop which contains them. This assumes trivially lcssa phis
+ // have already been removed; TODO: generalize
+ BasicBlock *ExitBlock =
+ BI->getSuccessor(L->contains(BI->getSuccessor(0)) ? 1 : 0);
+ if (!ExitBlock->phis().empty())
+ return true;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ assert(!isa<SCEVCouldNotCompute>(ExactBTC) && "implied by having exact trip count");
+ if (!SE->isLoopInvariant(ExitCount, L) ||
+ !isSafeToExpand(ExitCount, *SE))
+ return true;
+
+ // If we end up with a pointer exit count, bail. It may be unsized.
+ if (!ExitCount->getType()->isIntegerTy())
+ return true;
+
+ return false;
+ };
+
+ // If we have any exits which can't be predicated themselves, then we can't
+ // predicate any exit which isn't guaranteed to execute before it. Consider
+ // two exits (a) and (b) which would both exit on the same iteration. If we
+ // can predicate (b), but not (a), and (a) precedes (b) along some path, then
+ // we could convert a loop from exiting through (a) to one exiting through
+ // (b). Note that this problem exists only for exits with the same exit
+ // count, and we could be more aggressive when exit counts are known to be unequal.
+ llvm::sort(ExitingBlocks,
+ [&](BasicBlock *A, BasicBlock *B) {
+ // std::sort sorts in ascending order, so we want the inverse of
+ // the normal dominance relation, plus a tie breaker for blocks
+ // unordered by dominance.
+ if (DT->properlyDominates(A, B)) return true;
+ if (DT->properlyDominates(B, A)) return false;
+ return A->getName() < B->getName();
+ });
+ // Check to see if our exit blocks are a total order (i.e. a linear chain of
+ // exits before the backedge). If they aren't, reasoning about reachability
+ // is complicated and we choose not to for now.
+ for (unsigned i = 1; i < ExitingBlocks.size(); i++)
+ if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
+ return false;
+
+ // Given our sorted total order, we know that exit[j] must be evaluated
+ // after all exit[i] such that j > i.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
+ if (BadExit(ExitingBlocks[i])) {
+ ExitingBlocks.resize(i);
+ break;
+ }
+
+ if (ExitingBlocks.empty())
+ return false;
+
+ // We rely on not being able to reach an exiting block on a later iteration
+ // than its statically computed exit count. The implementation of
+ // getExitCount currently has this invariant, but assert it here so that
+ // breakage is obvious if this ever changes.
+ assert(llvm::all_of(ExitingBlocks, [&](BasicBlock *ExitingBB) {
+ return DT->dominates(ExitingBB, L->getLoopLatch());
+ }));
+
+ // At this point, ExitingBlocks consists of only those blocks which are
+ // predicatable. Given that, we know we have at least one exit we can
+ // predicate if the loop doesn't have side effects and doesn't have any
+ // implicit exits (because then our exact BTC isn't actually exact).
+ // @Reviewers - As structured, this is O(I^2) for loop nests. Any
+ // suggestions on how to improve this? I can obviously bail out for outer
+ // loops, but that seems less than ideal. MemorySSA can find memory writes,
+ // is that enough for *all* side effects?
+ for (BasicBlock *BB : L->blocks())
+ for (auto &I : *BB)
+ // TODO:isGuaranteedToTransfer
+ if (I.mayHaveSideEffects() || I.mayThrow())
+ return false;
+
+ bool Changed = false;
+ // Finally, do the actual predication for all predicatable blocks. A couple
+ // of notes here:
+ // 1) We don't bother to constant fold dominated exits with identical exit
+ // counts; that's simply a form of CSE/equality propagation and we leave
+ // it for dedicated passes.
+ // 2) We insert the comparison at the branch. Hoisting introduces additional
+ // legality constraints and we leave that to dedicated logic. We want to
+ // predicate even if we can't insert a loop invariant expression as
+ // peeling or unrolling will likely reduce the cost of the otherwise loop
+ // varying check.
+ Rewriter.setInsertPoint(L->getLoopPreheader()->getTerminator());
+ IRBuilder<> B(L->getLoopPreheader()->getTerminator());
+ Value *ExactBTCV = nullptr; // Lazily generated if needed.
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+
+ auto *BI = cast<BranchInst>(ExitingBB->getTerminator());
+ Value *NewCond;
+ if (ExitCount == ExactBTC) {
+ NewCond = L->contains(BI->getSuccessor(0)) ?
+ B.getFalse() : B.getTrue();
+ } else {
+ Value *ECV = Rewriter.expandCodeFor(ExitCount);
+ if (!ExactBTCV)
+ ExactBTCV = Rewriter.expandCodeFor(ExactBTC);
+ Value *RHS = ExactBTCV;
+ if (ECV->getType() != RHS->getType()) {
+ Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+ ECV = B.CreateZExt(ECV, WiderTy);
+ RHS = B.CreateZExt(RHS, WiderTy);
+ }
+ auto Pred = L->contains(BI->getSuccessor(0)) ?
+ ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ;
+ NewCond = B.CreateICmp(Pred, ECV, RHS);
+ }
+ Value *OldCond = BI->getCondition();
+ BI->setCondition(NewCond);
+ if (OldCond->use_empty())
+ DeadInsts.emplace_back(OldCond);
+ Changed = true;
+ }
+
+ return Changed;
+}
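The per-exit rewrite at the bottom of the function boils down to installing a loop-invariant condition; a minimal sketch of that condition (illustration only, with plain integers standing in for the expanded SCEV values):

#include <cstdint>

// 'exitOnFalse' models L->contains(BI->getSuccessor(0)): the branch keeps
// looping while the condition is true and exits when it is false.
bool predicatedExitCondition(uint64_t ExitCount, uint64_t ExactBTC,
                             bool exitOnFalse) {
  // Keep looping through this exit only if some earlier exit must fire
  // first (ExitCount != ExactBTC); otherwise take it immediately, which is
  // unobservable in a side-effect-free, read-only loop.
  bool keepLooping = (ExitCount != ExactBTC);
  return exitOnFalse ? keepLooping : !keepLooping;
}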
+
+//===----------------------------------------------------------------------===//
+// IndVarSimplify driver. Manage several subpasses of IV simplification.
+//===----------------------------------------------------------------------===//
+
+bool IndVarSimplify::run(Loop *L) {
+ // We need (and expect!) the incoming loop to be in LCSSA.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA required to run indvars!");
+
+ // If LoopSimplify form is not available, stay out of trouble. Some notes:
+ // - LSR currently only supports LoopSimplify-form loops. Indvars'
+ // canonicalization can be a pessimization without LSR to "clean up"
+ // afterwards.
+ // - We depend on having a preheader; in particular,
+ // Loop::getCanonicalInductionVariable only supports loops with preheaders,
+ // and we're in trouble if we can't find the induction variable even when
+ // we've manually inserted one.
+ // - LFTR relies on having a single backedge.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+#ifndef NDEBUG
+ // Used below for a consistency check only
+ // Note: Since the result returned by ScalarEvolution may depend on the order
+ // in which previous results are added to its cache, the call to
+ // getBackedgeTakenCount() may change following SCEV queries.
+ const SCEV *BackedgeTakenCount;
+ if (VerifyIndvars)
+ BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+#endif
+
+ bool Changed = false;
+ // If there are any floating-point recurrences, attempt to
+ // transform them to use integer recurrences.
+ Changed |= rewriteNonIntegerIVs(L);
+
+ // Create a rewriter object which we'll use to transform the code with.
+ SCEVExpander Rewriter(*SE, DL, "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+
+ // Eliminate redundant IV users.
+ //
+ // Simplification works best when run before other consumers of SCEV. We
+ // attempt to avoid evaluating SCEVs for sign/zero extend operations until
+ // other expressions involving loop IVs have been evaluated. This helps SCEV
+ // set no-wrap flags before normalizing sign/zero extension.
+ Rewriter.disableCanonicalMode();
+ Changed |= simplifyAndExtend(L, Rewriter, LI);
+
+ // Check to see if we can compute the final value of any expressions
+ // that are recurrent in the loop, and substitute the exit values from the
+ // loop into any instructions outside of the loop that use the final values
+ // of the current expressions.
+ if (ReplaceExitValue != NeverRepl) {
+ if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
+ ReplaceExitValue, DeadInsts)) {
+ NumReplaced += Rewrites;
+ Changed = true;
+ }
+ }
+
+ // Eliminate redundant IV cycles.
+ NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
+
+ // Try to eliminate loop exits based on analyzable exit counts
+ if (optimizeLoopExits(L, Rewriter)) {
+ Changed = true;
+ // Given we've changed exit counts, notify SCEV
// Some nested loops may share same folded exit basic block,
// thus we need to notify top most loop.
SE->forgetTopmostLoop(L);
- }
-
- // Try to form loop invariant tests for loop exits by changing how many
- // iterations of the loop run when that is unobservable.
- if (predicateLoopExits(L, Rewriter)) {
- Changed = true;
- // Given we've changed exit counts, notify SCEV
- SE->forgetLoop(L);
- }
-
- // If we have a trip count expression, rewrite the loop's exit condition
- // using it.
- if (!DisableLFTR) {
- BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
-
- SmallVector<BasicBlock*, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- // Can't rewrite non-branch yet.
- if (!isa<BranchInst>(ExitingBB->getTerminator()))
- continue;
-
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- if (!needsLFTR(L, ExitingBB))
- continue;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
-
- // This was handled above, but as we form SCEVs, we can sometimes refine
- // existing ones; this allows exit counts to be folded to zero which
- // weren't when optimizeLoopExits saw them. Arguably, we should iterate
- // until stable to handle cases like this better.
- if (ExitCount->isZero())
- continue;
-
- PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
- if (!IndVar)
- continue;
-
- // Avoid high cost expansions. Note: This heuristic is questionable in
- // that our definition of "high cost" is not exactly principled.
- if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
- TTI, PreHeaderBR))
- continue;
-
- // Check preconditions for proper SCEVExpander operation. SCEV does not
- // express SCEVExpander's dependencies, such as LoopSimplify. Instead
- // any pass that uses the SCEVExpander must do it. This does not work
- // well for loop passes because SCEVExpander makes assumptions about
- // all loops, while LoopPassManager only forces the current loop to be
- // simplified.
- //
- // FIXME: SCEV expansion has no way to bail out, so the caller must
- // explicitly check any assumptions made by SCEV. Brittle.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ExitCount);
- if (!AR || AR->getLoop()->getLoopPreheader())
- Changed |= linearFunctionTestReplace(L, ExitingBB,
- ExitCount, IndVar,
- Rewriter);
- }
- }
- // Clear the rewriter cache, because values that are in the rewriter's cache
- // can be deleted in the loop below, causing the AssertingVH in the cache to
- // trigger.
- Rewriter.clear();
-
- // Now that we're done iterating through lists, clean up any instructions
- // which are now dead.
+ }
+
+ // Try to form loop invariant tests for loop exits by changing how many
+ // iterations of the loop run when that is unobservable.
+ if (predicateLoopExits(L, Rewriter)) {
+ Changed = true;
+ // Given we've changed exit counts, notify SCEV
+ SE->forgetLoop(L);
+ }
+
+ // If we have a trip count expression, rewrite the loop's exit condition
+ // using it.
+ if (!DisableLFTR) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+
+ SmallVector<BasicBlock*, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ // Can't rewrite non-branch yet.
+ if (!isa<BranchInst>(ExitingBB->getTerminator()))
+ continue;
+
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ if (!needsLFTR(L, ExitingBB))
+ continue;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+
+ // This was handled above, but as we form SCEVs, we can sometimes refine
+ // existing ones; this allows exit counts to be folded to zero which
+ // weren't when optimizeLoopExits saw them. Arguably, we should iterate
+ // until stable to handle cases like this better.
+ if (ExitCount->isZero())
+ continue;
+
+ PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
+ if (!IndVar)
+ continue;
+
+ // Avoid high cost expansions. Note: This heuristic is questionable in
+ // that our definition of "high cost" is not exactly principled.
+ if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR))
+ continue;
+
+ // Check preconditions for proper SCEVExpander operation. SCEV does not
+ // express SCEVExpander's dependencies, such as LoopSimplify. Instead
+ // any pass that uses the SCEVExpander must do it. This does not work
+ // well for loop passes because SCEVExpander makes assumptions about
+ // all loops, while LoopPassManager only forces the current loop to be
+ // simplified.
+ //
+ // FIXME: SCEV expansion has no way to bail out, so the caller must
+ // explicitly check any assumptions made by SCEV. Brittle.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ExitCount);
+ if (!AR || AR->getLoop()->getLoopPreheader())
+ Changed |= linearFunctionTestReplace(L, ExitingBB,
+ ExitCount, IndVar,
+ Rewriter);
+ }
+ }
+ // Clear the rewriter cache, because values that are in the rewriter's cache
+ // can be deleted in the loop below, causing the AssertingVH in the cache to
+ // trigger.
+ Rewriter.clear();
+
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
while (!DeadInsts.empty()) {
Value *V = DeadInsts.pop_back_val();
if (PHINode *PHI = dyn_cast_or_null<PHINode>(V))
Changed |= RecursivelyDeleteDeadPHINode(PHI, TLI, MSSAU.get());
else if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
- Changed |=
- RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
+ Changed |=
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
}
-
- // The Rewriter may not be used from this point on.
-
- // Loop-invariant instructions in the preheader that aren't used in the
- // loop may be sunk below the loop to reduce register pressure.
- Changed |= sinkUnusedInvariants(L);
-
- // rewriteFirstIterationLoopExitValues does not rely on the computation of
- // trip count and therefore can further simplify exit values in addition to
- // rewriteLoopExitValues.
- Changed |= rewriteFirstIterationLoopExitValues(L);
-
- // Clean up dead instructions.
- Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());
-
- // Check a post-condition.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Indvars did not preserve LCSSA!");
-
- // Verify that LFTR and any other changes have not interfered with SCEV's
- // ability to compute trip count. We may have *changed* the exit count, but
- // only by reducing it.
-#ifndef NDEBUG
- if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
- SE->forgetLoop(L);
- const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
- SE->getTypeSizeInBits(NewBECount->getType()))
- NewBECount = SE->getTruncateOrNoop(NewBECount,
- BackedgeTakenCount->getType());
- else
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
- NewBECount->getType());
- assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount,
- NewBECount) && "indvars must preserve SCEV");
- }
- if (VerifyMemorySSA && MSSAU)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-#endif
-
- return Changed;
-}
-
-PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- Function *F = L.getHeader()->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
+
+ // The Rewriter may not be used from this point on.
+
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ Changed |= sinkUnusedInvariants(L);
+
+ // rewriteFirstIterationLoopExitValues does not rely on the computation of
+ // trip count and therefore can further simplify exit values in addition to
+ // rewriteLoopExitValues.
+ Changed |= rewriteFirstIterationLoopExitValues(L);
+
+ // Clean up dead instructions.
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());
+
+ // Check a post-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ // Verify that LFTR and any other changes have not interfered with SCEV's
+ // ability to compute trip count. We may have *changed* the exit count, but
+ // only by reducing it.
+#ifndef NDEBUG
+ if (VerifyIndvars && !isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ SE->forgetLoop(L);
+ const SCEV *NewBECount = SE->getBackedgeTakenCount(L);
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) <
+ SE->getTypeSizeInBits(NewBECount->getType()))
+ NewBECount = SE->getTruncateOrNoop(NewBECount,
+ BackedgeTakenCount->getType());
+ else
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount,
+ NewBECount->getType());
+ assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount,
+ NewBECount) && "indvars must preserve SCEV");
+ }
+ if (VerifyMemorySSA && MSSAU)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+#endif
+
+ return Changed;
+}
+
+PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Function *F = L.getHeader()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA,
WidenIndVars && AllowIVWidening);
- if (!IVS.run(&L))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-struct IndVarSimplifyLegacyPass : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
-
- IndVarSimplifyLegacyPass() : LoopPass(ID) {
- initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
- auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
- auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
-
+ if (!IVS.run(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+struct IndVarSimplifyLegacyPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ IndVarSimplifyLegacyPass() : LoopPass(ID) {
+ initializeIndVarSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
+ auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
+ auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+
IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI, MSSA, AllowIVWidening);
- return IVS.run(L);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char IndVarSimplifyLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
- "Induction Variable Simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
- "Induction Variable Simplification", false, false)
-
-Pass *llvm::createIndVarSimplifyPass() {
- return new IndVarSimplifyLegacyPass();
-}
+ return IVS.run(L);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char IndVarSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(IndVarSimplifyLegacyPass, "indvars",
+ "Induction Variable Simplification", false, false)
+
+Pass *llvm::createIndVarSimplifyPass() {
+ return new IndVarSimplifyLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 006523ecf4..6e09dec198 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -1,241 +1,241 @@
-//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The InductiveRangeCheckElimination pass splits a loop's iteration space into
-// three disjoint ranges. It does that in a way such that the loop running in
-// the middle loop provably does not need range checks. As an example, it will
-// convert
-//
-// len = < known positive >
-// for (i = 0; i < n; i++) {
-// if (0 <= i && i < len) {
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-//
-// to
-//
-// len = < known positive >
-// limit = smin(n, len)
-// // no first segment
-// for (i = 0; i < limit; i++) {
-// if (0 <= i && i < len) { // this check is fully redundant
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-// for (i = limit; i < n; i++) {
-// if (0 <= i && i < len) {
-// do_something();
-// } else {
-// throw_out_of_bounds();
-// }
-// }
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
+//===- InductiveRangeCheckElimination.cpp - -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The InductiveRangeCheckElimination pass splits a loop's iteration space into
+// three disjoint ranges. It does that in a way such that the loop running in
+// the middle loop provably does not need range checks. As an example, it will
+// convert
+//
+// len = < known positive >
+// for (i = 0; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+// to
+//
+// len = < known positive >
+// limit = smin(n, len)
+// // no first segment
+// for (i = 0; i < limit; i++) {
+// if (0 <= i && i < len) { // this check is fully redundant
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+// for (i = limit; i < n; i++) {
+// if (0 <= i && i < len) {
+// do_something();
+// } else {
+// throw_out_of_bounds();
+// }
+// }
+//
+//===----------------------------------------------------------------------===//
+
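Editor's note: a plain-C++ sketch of the shape the pass produces for the example in the header comment above. do_something and throw_out_of_bounds are hypothetical placeholders, and the first segment is empty because the induction variable starts at 0.

    #include <algorithm>

    inline void do_something() {}          // hypothetical hot path
    inline void throw_out_of_bounds() {}   // hypothetical cold path

    void after_irce(int n, int len /* known positive */) {
      int limit = std::min(n, len);
      for (int i = 0; i < limit; ++i)
        do_something();                    // range check folded away here
      for (int i = limit; i < n; ++i) {    // leftover iterations keep the check
        if (0 <= i && i < len)
          do_something();
        else
          throw_out_of_bounds();
      }
    }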
+#include "llvm/Transforms/Scalar/InductiveRangeCheckElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <limits>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
- cl::init(64));
-
-static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
- cl::init(false));
-
-static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
- cl::Hidden, cl::init(false));
-
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<unsigned> LoopSizeCutoff("irce-loop-size-cutoff", cl::Hidden,
+ cl::init(64));
+
+static cl::opt<bool> PrintChangedLoops("irce-print-changed-loops", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> PrintRangeChecks("irce-print-range-checks", cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> SkipProfitabilityChecks("irce-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
static cl::opt<unsigned> MinRuntimeIterations("irce-min-runtime-iterations",
cl::Hidden, cl::init(10));
-static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> AllowNarrowLatchCondition(
- "irce-allow-narrow-latch", cl::Hidden, cl::init(true),
- cl::desc("If set to true, IRCE may eliminate wide range checks in loops "
- "with narrow latch condition."));
-
-static const char *ClonedLoopTag = "irce.loop.clone";
-
-#define DEBUG_TYPE "irce"
-
-namespace {
-
-/// An inductive range check is a conditional branch in a loop with
-///
-/// 1. a very cold successor (i.e. the branch jumps to that successor very
-/// rarely)
-///
-/// and
-///
-/// 2. a condition that is provably true for some contiguous range of values
-/// taken by the containing loop's induction variable.
-///
-class InductiveRangeCheck {
-
- const SCEV *Begin = nullptr;
- const SCEV *Step = nullptr;
- const SCEV *End = nullptr;
- Use *CheckUse = nullptr;
-
- static bool parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE,
- Value *&Index, Value *&Length,
- bool &IsSigned);
-
- static void
- extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
- SmallVectorImpl<InductiveRangeCheck> &Checks,
- SmallPtrSetImpl<Value *> &Visited);
-
-public:
- const SCEV *getBegin() const { return Begin; }
- const SCEV *getStep() const { return Step; }
- const SCEV *getEnd() const { return End; }
-
- void print(raw_ostream &OS) const {
- OS << "InductiveRangeCheck:\n";
- OS << " Begin: ";
- Begin->print(OS);
- OS << " Step: ";
- Step->print(OS);
- OS << " End: ";
- End->print(OS);
- OS << "\n CheckUse: ";
- getCheckUse()->getUser()->print(OS);
- OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
- }
-
- LLVM_DUMP_METHOD
- void dump() {
- print(dbgs());
- }
-
- Use *getCheckUse() const { return CheckUse; }
-
- /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
- /// R.getEnd() <= R.getBegin(), then R denotes the empty range.
-
- class Range {
- const SCEV *Begin;
- const SCEV *End;
-
- public:
- Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
- assert(Begin->getType() == End->getType() && "ill-typed range!");
- }
-
- Type *getType() const { return Begin->getType(); }
- const SCEV *getBegin() const { return Begin; }
- const SCEV *getEnd() const { return End; }
- bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
- if (Begin == End)
- return true;
- if (IsSigned)
- return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
- else
- return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
- }
- };
-
- /// This is the value the condition of the branch needs to evaluate to for the
- /// branch to take the hot successor (see (1) above).
- bool getPassingDirection() { return true; }
-
- /// Computes a range for the induction variable (IndVar) in which the range
- /// check is redundant and can be constant-folded away. The induction
- /// variable is not required to be the canonical {0,+,1} induction variable.
- Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
- const SCEVAddRecExpr *IndVar,
- bool IsLatchSigned) const;
-
- /// Parse out a set of inductive range checks from \p BI and append them to \p
- /// Checks.
- ///
- /// NB! There may be conditions feeding into \p BI that aren't inductive range
- /// checks, and hence don't end up in \p Checks.
- static void
- extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
- BranchProbabilityInfo *BPI,
- SmallVectorImpl<InductiveRangeCheck> &Checks);
-};
-
+static cl::opt<bool> AllowUnsignedLatchCondition("irce-allow-unsigned-latch",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> AllowNarrowLatchCondition(
+ "irce-allow-narrow-latch", cl::Hidden, cl::init(true),
+ cl::desc("If set to true, IRCE may eliminate wide range checks in loops "
+ "with narrow latch condition."));
+
+static const char *ClonedLoopTag = "irce.loop.clone";
+
+#define DEBUG_TYPE "irce"
+
+namespace {
+
+/// An inductive range check is a conditional branch in a loop with
+///
+/// 1. a very cold successor (i.e. the branch jumps to that successor very
+/// rarely)
+///
+/// and
+///
+/// 2. a condition that is provably true for some contiguous range of values
+/// taken by the containing loop's induction variable.
+///
+class InductiveRangeCheck {
+
+ const SCEV *Begin = nullptr;
+ const SCEV *Step = nullptr;
+ const SCEV *End = nullptr;
+ Use *CheckUse = nullptr;
+
+ static bool parseRangeCheckICmp(Loop *L, ICmpInst *ICI, ScalarEvolution &SE,
+ Value *&Index, Value *&Length,
+ bool &IsSigned);
+
+ static void
+ extractRangeChecksFromCond(Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited);
+
+public:
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getStep() const { return Step; }
+ const SCEV *getEnd() const { return End; }
+
+ void print(raw_ostream &OS) const {
+ OS << "InductiveRangeCheck:\n";
+ OS << " Begin: ";
+ Begin->print(OS);
+ OS << " Step: ";
+ Step->print(OS);
+ OS << " End: ";
+ End->print(OS);
+ OS << "\n CheckUse: ";
+ getCheckUse()->getUser()->print(OS);
+ OS << " Operand: " << getCheckUse()->getOperandNo() << "\n";
+ }
+
+ LLVM_DUMP_METHOD
+ void dump() {
+ print(dbgs());
+ }
+
+ Use *getCheckUse() const { return CheckUse; }
+
+ /// Represents a signed integer range [Range.getBegin(), Range.getEnd()). If
+ /// R.getEnd() <= R.getBegin(), then R denotes the empty range.
+
+ class Range {
+ const SCEV *Begin;
+ const SCEV *End;
+
+ public:
+ Range(const SCEV *Begin, const SCEV *End) : Begin(Begin), End(End) {
+ assert(Begin->getType() == End->getType() && "ill-typed range!");
+ }
+
+ Type *getType() const { return Begin->getType(); }
+ const SCEV *getBegin() const { return Begin; }
+ const SCEV *getEnd() const { return End; }
+ bool isEmpty(ScalarEvolution &SE, bool IsSigned) const {
+ if (Begin == End)
+ return true;
+ if (IsSigned)
+ return SE.isKnownPredicate(ICmpInst::ICMP_SGE, Begin, End);
+ else
+ return SE.isKnownPredicate(ICmpInst::ICMP_UGE, Begin, End);
+ }
+ };
+
+ /// This is the value the condition of the branch needs to evaluate to for the
+ /// branch to take the hot successor (see (1) above).
+ bool getPassingDirection() { return true; }
+
+ /// Computes a range for the induction variable (IndVar) in which the range
+ /// check is redundant and can be constant-folded away. The induction
+ /// variable is not required to be the canonical {0,+,1} induction variable.
+ Optional<Range> computeSafeIterationSpace(ScalarEvolution &SE,
+ const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const;
+
+ /// Parse out a set of inductive range checks from \p BI and append them to \p
+ /// Checks.
+ ///
+ /// NB! There may be conditions feeding into \p BI that aren't inductive range
+ /// checks, and hence don't end up in \p Checks.
+ static void
+ extractRangeChecksFromBranch(BranchInst *BI, Loop *L, ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks);
+};
+
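Editor's note: a standalone sketch of the emptiness test Range::isEmpty encodes, with fixed-width integers standing in for SCEVs; with concrete values the "known predicate" query becomes an exact comparison. The function name is ours.

    #include <cstdint>

    inline bool rangeIsEmpty(int64_t Begin, int64_t End, bool IsSigned) {
      if (Begin == End)
        return true;
      // Empty when Begin is already at or past End under the chosen signedness.
      if (IsSigned)
        return Begin >= End;
      return static_cast<uint64_t>(Begin) >= static_cast<uint64_t>(End);
    }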
struct LoopStructure;
-class InductiveRangeCheckElimination {
- ScalarEvolution &SE;
- BranchProbabilityInfo *BPI;
- DominatorTree &DT;
- LoopInfo &LI;
-
+class InductiveRangeCheckElimination {
+ ScalarEvolution &SE;
+ BranchProbabilityInfo *BPI;
+ DominatorTree &DT;
+ LoopInfo &LI;
+
using GetBFIFunc =
llvm::Optional<llvm::function_ref<llvm::BlockFrequencyInfo &()> >;
GetBFIFunc GetBFI;
@@ -244,1538 +244,1538 @@ class InductiveRangeCheckElimination {
// number of iterations.
bool isProfitableToTransform(const Loop &L, LoopStructure &LS);
-public:
- InductiveRangeCheckElimination(ScalarEvolution &SE,
- BranchProbabilityInfo *BPI, DominatorTree &DT,
+public:
+ InductiveRangeCheckElimination(ScalarEvolution &SE,
+ BranchProbabilityInfo *BPI, DominatorTree &DT,
LoopInfo &LI, GetBFIFunc GetBFI = None)
: SE(SE), BPI(BPI), DT(DT), LI(LI), GetBFI(GetBFI) {}
-
- bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
-};
-
-class IRCELegacyPass : public FunctionPass {
-public:
- static char ID;
-
- IRCELegacyPass() : FunctionPass(ID) {
- initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-
-} // end anonymous namespace
-
-char IRCELegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
- "Inductive range check elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
- false, false)
-
-/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
-/// be interpreted as a range check, return false and set `Index` and `Length`
-/// to `nullptr`. Otherwise set `Index` to the value being range checked, and
-/// set `Length` to the upper limit `Index` is being range checked against.
-bool
-InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
- ScalarEvolution &SE, Value *&Index,
- Value *&Length, bool &IsSigned) {
- auto IsLoopInvariant = [&SE, L](Value *V) {
- return SE.isLoopInvariant(SE.getSCEV(V), L);
- };
-
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *LHS = ICI->getOperand(0);
- Value *RHS = ICI->getOperand(1);
-
- switch (Pred) {
- default:
- return false;
-
- case ICmpInst::ICMP_SLE:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SGE:
- IsSigned = true;
- if (match(RHS, m_ConstantInt<0>())) {
- Index = LHS;
- return true; // Lower.
- }
- return false;
-
- case ICmpInst::ICMP_SLT:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_SGT:
- IsSigned = true;
- if (match(RHS, m_ConstantInt<-1>())) {
- Index = LHS;
- return true; // Lower.
- }
-
- if (IsLoopInvariant(LHS)) {
- Index = RHS;
- Length = LHS;
- return true; // Upper.
- }
- return false;
-
- case ICmpInst::ICMP_ULT:
- std::swap(LHS, RHS);
- LLVM_FALLTHROUGH;
- case ICmpInst::ICMP_UGT:
- IsSigned = false;
- if (IsLoopInvariant(LHS)) {
- Index = RHS;
- Length = LHS;
- return true; // Both lower and upper.
- }
- return false;
- }
-
- llvm_unreachable("default clause returns!");
-}
-
-void InductiveRangeCheck::extractRangeChecksFromCond(
- Loop *L, ScalarEvolution &SE, Use &ConditionUse,
- SmallVectorImpl<InductiveRangeCheck> &Checks,
- SmallPtrSetImpl<Value *> &Visited) {
- Value *Condition = ConditionUse.get();
- if (!Visited.insert(Condition).second)
- return;
-
- // TODO: Do the same for OR, XOR, NOT etc?
- if (match(Condition, m_And(m_Value(), m_Value()))) {
- extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
- Checks, Visited);
- extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
- Checks, Visited);
- return;
- }
-
- ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
- if (!ICI)
- return;
-
- Value *Length = nullptr, *Index;
- bool IsSigned;
- if (!parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned))
- return;
-
- const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
- bool IsAffineIndex =
- IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
-
- if (!IsAffineIndex)
- return;
-
- const SCEV *End = nullptr;
- // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
- // We can potentially do much better here.
- if (Length)
- End = SE.getSCEV(Length);
- else {
- // So far we can only reach this point for a signed range check. This may
- // change in the future; in that case we will need to pick the unsigned max
- // for the unsigned range check.
- unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
- const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
- End = SIntMax;
- }
-
- InductiveRangeCheck IRC;
- IRC.End = End;
- IRC.Begin = IndexAddRec->getStart();
- IRC.Step = IndexAddRec->getStepRecurrence(SE);
- IRC.CheckUse = &ConditionUse;
- Checks.push_back(IRC);
-}
-
-void InductiveRangeCheck::extractRangeChecksFromBranch(
- BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
- SmallVectorImpl<InductiveRangeCheck> &Checks) {
- if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
- return;
-
- BranchProbability LikelyTaken(15, 16);
-
- if (!SkipProfitabilityChecks && BPI &&
- BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
- return;
-
- SmallPtrSet<Value *, 8> Visited;
- InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
- Checks, Visited);
-}
-
-// Add metadata to the loop L to disable loop optimizations. Callers need to
-// confirm that optimizing loop L is not beneficial.
-static void DisableAllLoopOptsOnLoop(Loop &L) {
- // We do not care about any existing loopID related metadata for L, since we
- // are setting all loop metadata to false.
- LLVMContext &Context = L.getHeader()->getContext();
- // Reserve first location for self reference to the LoopID metadata node.
- MDNode *Dummy = MDNode::get(Context, {});
- MDNode *DisableUnroll = MDNode::get(
- Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
- Metadata *FalseVal =
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
- MDNode *DisableVectorize = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
- MDNode *DisableLICMVersioning = MDNode::get(
- Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
- MDNode *DisableDistribution = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
- MDNode *NewLoopID =
- MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
- DisableLICMVersioning, DisableDistribution});
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L.setLoopID(NewLoopID);
-}
-
-namespace {
-
-// Keeps track of the structure of a loop. This is similar to llvm::Loop,
-// except that it is more lightweight and can track the state of a loop through
-// changing and potentially invalid IR. This structure also formalizes the
-// kinds of loops we can deal with -- ones that have a single latch that is also
-// an exiting block *and* have a canonical induction variable.
-struct LoopStructure {
- const char *Tag = "";
-
- BasicBlock *Header = nullptr;
- BasicBlock *Latch = nullptr;
-
- // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
- // successor is `LatchExit', the exit block of the loop.
- BranchInst *LatchBr = nullptr;
- BasicBlock *LatchExit = nullptr;
- unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
-
- // The loop represented by this instance of LoopStructure is semantically
- // equivalent to:
- //
- // intN_ty inc = IndVarIncreasing ? 1 : -1;
- // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
- //
- // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
- // ... body ...
-
- Value *IndVarBase = nullptr;
- Value *IndVarStart = nullptr;
- Value *IndVarStep = nullptr;
- Value *LoopExitAt = nullptr;
- bool IndVarIncreasing = false;
- bool IsSignedPredicate = true;
-
- LoopStructure() = default;
-
- template <typename M> LoopStructure map(M Map) const {
- LoopStructure Result;
- Result.Tag = Tag;
- Result.Header = cast<BasicBlock>(Map(Header));
- Result.Latch = cast<BasicBlock>(Map(Latch));
- Result.LatchBr = cast<BranchInst>(Map(LatchBr));
- Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
- Result.LatchBrExitIdx = LatchBrExitIdx;
- Result.IndVarBase = Map(IndVarBase);
- Result.IndVarStart = Map(IndVarStart);
- Result.IndVarStep = Map(IndVarStep);
- Result.LoopExitAt = Map(LoopExitAt);
- Result.IndVarIncreasing = IndVarIncreasing;
- Result.IsSignedPredicate = IsSignedPredicate;
- return Result;
- }
-
+
+ bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
+};
+
+class IRCELegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ IRCELegacyPass() : FunctionPass(ID) {
+ initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char IRCELegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
+ "Inductive range check elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
+ false, false)
+
+/// Parse a single ICmp instruction, `ICI`, into a range check. If `ICI` cannot
+/// be interpreted as a range check, return false and set `Index` and `Length`
+/// to `nullptr`. Otherwise set `Index` to the value being range checked, and
+/// set `Length` to the upper limit `Index` is being range checked against.
+bool
+InductiveRangeCheck::parseRangeCheckICmp(Loop *L, ICmpInst *ICI,
+ ScalarEvolution &SE, Value *&Index,
+ Value *&Length, bool &IsSigned) {
+ auto IsLoopInvariant = [&SE, L](Value *V) {
+ return SE.isLoopInvariant(SE.getSCEV(V), L);
+ };
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LHS = ICI->getOperand(0);
+ Value *RHS = ICI->getOperand(1);
+
+ switch (Pred) {
+ default:
+ return false;
+
+ case ICmpInst::ICMP_SLE:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGE:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<0>())) {
+ Index = LHS;
+ return true; // Lower.
+ }
+ return false;
+
+ case ICmpInst::ICMP_SLT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_SGT:
+ IsSigned = true;
+ if (match(RHS, m_ConstantInt<-1>())) {
+ Index = LHS;
+ return true; // Lower.
+ }
+
+ if (IsLoopInvariant(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return true; // Upper.
+ }
+ return false;
+
+ case ICmpInst::ICMP_ULT:
+ std::swap(LHS, RHS);
+ LLVM_FALLTHROUGH;
+ case ICmpInst::ICMP_UGT:
+ IsSigned = false;
+ if (IsLoopInvariant(LHS)) {
+ Index = RHS;
+ Length = LHS;
+ return true; // Both lower and upper.
+ }
+ return false;
+ }
+
+ llvm_unreachable("default clause returns!");
+}
+
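Editor's note: the unsigned case above relies on the usual trick that one unsigned compare encodes both bounds. A concrete-integer sketch, assuming Length is non-negative (as array lengths are); the function name is ours.

    #include <cstdint>

    // "Index u< Length" is equivalent to "0 <= Index && Index < Length" in the
    // signed sense whenever Length >= 0, which is why the parser records it as
    // both a lower and an upper bound check.
    inline bool inBoundsUnsignedTrick(int64_t Index, int64_t Length) {
      return static_cast<uint64_t>(Index) < static_cast<uint64_t>(Length);
    }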
+void InductiveRangeCheck::extractRangeChecksFromCond(
+ Loop *L, ScalarEvolution &SE, Use &ConditionUse,
+ SmallVectorImpl<InductiveRangeCheck> &Checks,
+ SmallPtrSetImpl<Value *> &Visited) {
+ Value *Condition = ConditionUse.get();
+ if (!Visited.insert(Condition).second)
+ return;
+
+ // TODO: Do the same for OR, XOR, NOT etc?
+ if (match(Condition, m_And(m_Value(), m_Value()))) {
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(0),
+ Checks, Visited);
+ extractRangeChecksFromCond(L, SE, cast<User>(Condition)->getOperandUse(1),
+ Checks, Visited);
+ return;
+ }
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(Condition);
+ if (!ICI)
+ return;
+
+ Value *Length = nullptr, *Index;
+ bool IsSigned;
+ if (!parseRangeCheckICmp(L, ICI, SE, Index, Length, IsSigned))
+ return;
+
+ const auto *IndexAddRec = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Index));
+ bool IsAffineIndex =
+ IndexAddRec && (IndexAddRec->getLoop() == L) && IndexAddRec->isAffine();
+
+ if (!IsAffineIndex)
+ return;
+
+ const SCEV *End = nullptr;
+ // We strengthen "0 <= I" to "0 <= I < INT_SMAX" and "I < L" to "0 <= I < L".
+ // We can potentially do much better here.
+ if (Length)
+ End = SE.getSCEV(Length);
+ else {
+ // So far we can only reach this point for a signed range check. This may
+ // change in the future; in that case we will need to pick the unsigned max
+ // for the unsigned range check.
+ unsigned BitWidth = cast<IntegerType>(IndexAddRec->getType())->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+ End = SIntMax;
+ }
+
+ InductiveRangeCheck IRC;
+ IRC.End = End;
+ IRC.Begin = IndexAddRec->getStart();
+ IRC.Step = IndexAddRec->getStepRecurrence(SE);
+ IRC.CheckUse = &ConditionUse;
+ Checks.push_back(IRC);
+}
+
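Editor's note: a toy sketch of the recursion above on a hand-rolled condition tree (all names are ours). And nodes are split and both operands visited; everything else is treated as a leaf that the real pass would then try to parse as a range check. The Visited set that prevents re-walking shared sub-conditions is omitted here.

    #include <vector>

    struct Cond {
      bool IsAnd = false;
      const Cond *LHS = nullptr, *RHS = nullptr;  // used when IsAnd
      int LeafId = -1;                            // used for leaf comparisons
    };

    void collectLeafChecks(const Cond &C, std::vector<int> &Out) {
      if (C.IsAnd) {
        collectLeafChecks(*C.LHS, Out);
        collectLeafChecks(*C.RHS, Out);
        return;
      }
      Out.push_back(C.LeafId);
    }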
+void InductiveRangeCheck::extractRangeChecksFromBranch(
+ BranchInst *BI, Loop *L, ScalarEvolution &SE, BranchProbabilityInfo *BPI,
+ SmallVectorImpl<InductiveRangeCheck> &Checks) {
+ if (BI->isUnconditional() || BI->getParent() == L->getLoopLatch())
+ return;
+
+ BranchProbability LikelyTaken(15, 16);
+
+ if (!SkipProfitabilityChecks && BPI &&
+ BPI->getEdgeProbability(BI->getParent(), (unsigned)0) < LikelyTaken)
+ return;
+
+ SmallPtrSet<Value *, 8> Visited;
+ InductiveRangeCheck::extractRangeChecksFromCond(L, SE, BI->getOperandUse(0),
+ Checks, Visited);
+}
+
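Editor's note: the LikelyTaken threshold above is 15/16, i.e. the hot successor must be predicted taken at least 93.75% of the time. An integer-only sketch of the same comparison (our own helper, not the pass's API):

    // True when Taken/Total >= 15/16, written without floating point; the pass
    // bails out on the branch when this does not hold (unless profitability
    // checks are skipped).
    inline bool hotEdgeLikelyEnough(unsigned Taken, unsigned Total) {
      return static_cast<unsigned long long>(Taken) * 16 >=
             static_cast<unsigned long long>(Total) * 15;
    }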
+// Add metadata to the loop L to disable loop optimizations. Callers need to
+// confirm that optimizing loop L is not beneficial.
+static void DisableAllLoopOptsOnLoop(Loop &L) {
+ // We do not care about any existing loopID related metadata for L, since we
+ // are setting all loop metadata to false.
+ LLVMContext &Context = L.getHeader()->getContext();
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDNode *Dummy = MDNode::get(Context, {});
+ MDNode *DisableUnroll = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.unroll.disable")});
+ Metadata *FalseVal =
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt1Ty(Context), 0));
+ MDNode *DisableVectorize = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.vectorize.enable"), FalseVal});
+ MDNode *DisableLICMVersioning = MDNode::get(
+ Context, {MDString::get(Context, "llvm.loop.licm_versioning.disable")});
+ MDNode *DisableDistribution = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.distribute.enable"), FalseVal});
+ MDNode *NewLoopID =
+ MDNode::get(Context, {Dummy, DisableUnroll, DisableVectorize,
+ DisableLICMVersioning, DisableDistribution});
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L.setLoopID(NewLoopID);
+}
+
+namespace {
+
+// Keeps track of the structure of a loop. This is similar to llvm::Loop,
+// except that it is more lightweight and can track the state of a loop through
+// changing and potentially invalid IR. This structure also formalizes the
+// kinds of loops we can deal with -- ones that have a single latch that is also
+// an exiting block *and* have a canonical induction variable.
+struct LoopStructure {
+ const char *Tag = "";
+
+ BasicBlock *Header = nullptr;
+ BasicBlock *Latch = nullptr;
+
+ // `Latch's terminator instruction is `LatchBr', and its `LatchBrExitIdx'th
+ // successor is `LatchExit', the exit block of the loop.
+ BranchInst *LatchBr = nullptr;
+ BasicBlock *LatchExit = nullptr;
+ unsigned LatchBrExitIdx = std::numeric_limits<unsigned>::max();
+
+ // The loop represented by this instance of LoopStructure is semantically
+ // equivalent to:
+ //
+ // intN_ty inc = IndVarIncreasing ? 1 : -1;
+ // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+ //
+ // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarBase)
+ // ... body ...
+
+ Value *IndVarBase = nullptr;
+ Value *IndVarStart = nullptr;
+ Value *IndVarStep = nullptr;
+ Value *LoopExitAt = nullptr;
+ bool IndVarIncreasing = false;
+ bool IsSignedPredicate = true;
+
+ LoopStructure() = default;
+
+ template <typename M> LoopStructure map(M Map) const {
+ LoopStructure Result;
+ Result.Tag = Tag;
+ Result.Header = cast<BasicBlock>(Map(Header));
+ Result.Latch = cast<BasicBlock>(Map(Latch));
+ Result.LatchBr = cast<BranchInst>(Map(LatchBr));
+ Result.LatchExit = cast<BasicBlock>(Map(LatchExit));
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarBase = Map(IndVarBase);
+ Result.IndVarStart = Map(IndVarStart);
+ Result.IndVarStep = Map(IndVarStep);
+ Result.LoopExitAt = Map(LoopExitAt);
+ Result.IndVarIncreasing = IndVarIncreasing;
+ Result.IsSignedPredicate = IsSignedPredicate;
+ return Result;
+ }
+
static Optional<LoopStructure> parseLoopStructure(ScalarEvolution &, Loop &,
const char *&);
-};
-
-/// This class is used to constrain loops to run within a given iteration space.
-/// The algorithm this class implements is given a Loop and a range [Begin,
-/// End). The algorithm then tries to break a "main loop" out of the loop
-/// it is given in a way that the "main loop" runs with the induction variable
-/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
-/// loops to run any remaining iterations. The pre loop runs any iterations in
-/// which the induction variable is < Begin, and the post loop runs any
-/// iterations in which the induction variable is >= End.
-class LoopConstrainer {
- // The representation of a clone of the original loop we started out with.
- struct ClonedLoop {
- // The cloned blocks
- std::vector<BasicBlock *> Blocks;
-
- // `Map` maps values in the clonee into values in the cloned version
- ValueToValueMapTy Map;
-
- // An instance of `LoopStructure` for the cloned loop
- LoopStructure Structure;
- };
-
- // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
- // more details on what these fields mean.
- struct RewrittenRangeInfo {
- BasicBlock *PseudoExit = nullptr;
- BasicBlock *ExitSelector = nullptr;
- std::vector<PHINode *> PHIValuesAtPseudoExit;
- PHINode *IndVarEnd = nullptr;
-
- RewrittenRangeInfo() = default;
- };
-
- // Calculated subranges we restrict the iteration space of the main loop to.
- // See the implementation of `calculateSubRanges' for more details on how
- // these fields are computed. `LowLimit` is None if there is no restriction
- // on low end of the restricted iteration space of the main loop. `HighLimit`
- // is None if there is no restriction on high end of the restricted iteration
- // space of the main loop.
-
- struct SubRanges {
- Optional<const SCEV *> LowLimit;
- Optional<const SCEV *> HighLimit;
- };
-
- // Compute a safe set of limits for the main loop to run in -- effectively the
- // intersection of `Range' and the iteration space of the original loop.
- // Return None if unable to compute the set of subranges.
- Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
-
- // Clone `OriginalLoop' and return the result in CLResult. The IR after
- // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
- // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
- // but there is no such edge.
- void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
-
- // Create the appropriate loop structure needed to describe a cloned copy of
- // `Original`. The clone is described by `VM`.
- Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM, bool IsSubloop);
-
- // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
- // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
- // iteration space is not changed. `ExitLoopAt' is assumed to be slt
- // `OriginalHeaderCount'.
- //
- // If there are iterations left to execute, control is made to jump to
- // `ContinuationBlock', otherwise they take the normal loop exit. The
- // returned `RewrittenRangeInfo' object is populated as follows:
- //
- // .PseudoExit is a basic block that unconditionally branches to
- // `ContinuationBlock'.
- //
- // .ExitSelector is a basic block that decides, on exit from the loop,
- // whether to branch to the "true" exit or to `PseudoExit'.
- //
- // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
- // for each PHINode in the loop header on taking the pseudo exit.
- //
- // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
- // preheader because it is made to branch to the loop header only
- // conditionally.
- RewrittenRangeInfo
- changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
- Value *ExitLoopAt,
- BasicBlock *ContinuationBlock) const;
-
- // The loop denoted by `LS' has `OldPreheader' as its preheader. This
- // function creates a new preheader for `LS' and returns it.
- BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
- const char *Tag) const;
-
- // `ContinuationBlockAndPreheader' was the continuation block for some call to
- // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
- // This function rewrites the PHI nodes in `LS.Header' to start with the
- // correct value.
- void rewriteIncomingValuesForPHIs(
- LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
- const LoopConstrainer::RewrittenRangeInfo &RRI) const;
-
- // Even though we do not preserve any passes at this time, we at least need to
- // keep the parent loop structure consistent. The `LPPassManager' seems to
- // verify this after running a loop pass. This function adds the list of
- // blocks denoted by BBs to this loop's parent loop if required.
- void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
-
- // Some global state.
- Function &F;
- LLVMContext &Ctx;
- ScalarEvolution &SE;
- DominatorTree &DT;
- LoopInfo &LI;
- function_ref<void(Loop *, bool)> LPMAddNewLoop;
-
- // Information about the original loop we started out with.
- Loop &OriginalLoop;
-
- const SCEV *LatchTakenCount = nullptr;
- BasicBlock *OriginalPreheader = nullptr;
-
- // The preheader of the main loop. This may or may not be different from
- // `OriginalPreheader'.
- BasicBlock *MainLoopPreheader = nullptr;
-
- // The range we need to run the main loop in.
- InductiveRangeCheck::Range Range;
-
- // The structure of the main loop (see comment at the beginning of this class
- // for a definition)
- LoopStructure MainLoopStructure;
-
-public:
- LoopConstrainer(Loop &L, LoopInfo &LI,
- function_ref<void(Loop *, bool)> LPMAddNewLoop,
- const LoopStructure &LS, ScalarEvolution &SE,
- DominatorTree &DT, InductiveRangeCheck::Range R)
- : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
- SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
- Range(R), MainLoopStructure(LS) {}
-
- // Entry point for the algorithm. Returns true on success.
- bool run();
-};
-
-} // end anonymous namespace
-
-/// Given a loop with a decreasing induction variable, is it possible to
-/// safely calculate the bounds of a new loop using the given Predicate.
-static bool isSafeDecreasingBound(const SCEV *Start,
- const SCEV *BoundSCEV, const SCEV *Step,
- ICmpInst::Predicate Pred,
- unsigned LatchBrExitIdx,
- Loop *L, ScalarEvolution &SE) {
- if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
- return false;
-
- if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
- return false;
-
- assert(SE.isKnownNegative(Step) && "expecting negative step");
-
- LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
- LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
- LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
- LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
- LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
- << "\n");
- LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
-
- bool IsSigned = ICmpInst::isSigned(Pred);
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
-
- if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
-
- assert(LatchBrExitIdx == 0 &&
- "LatchBrExitIdx should be either 0 or 1");
-
- const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
- unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
- APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
- APInt::getMinValue(BitWidth);
- const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
-
- const SCEV *MinusOne =
- SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
-
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
-
-}
-
-/// Given a loop with an increasing induction variable, is it possible to
-/// safely calculate the bounds of a new loop using the given Predicate.
-static bool isSafeIncreasingBound(const SCEV *Start,
- const SCEV *BoundSCEV, const SCEV *Step,
- ICmpInst::Predicate Pred,
- unsigned LatchBrExitIdx,
- Loop *L, ScalarEvolution &SE) {
- if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
- return false;
-
- if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
- return false;
-
- LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
- LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
- LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
- LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
- LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
- << "\n");
- LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
-
- bool IsSigned = ICmpInst::isSigned(Pred);
- // The predicate that we need to check that the induction variable lies
- // within bounds.
- ICmpInst::Predicate BoundPred =
- IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
-
- if (LatchBrExitIdx == 1)
- return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
-
- assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
-
- const SCEV *StepMinusOne =
- SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
- unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
- APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
- APInt::getMaxValue(BitWidth);
- const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
-
- return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
- SE.getAddExpr(BoundSCEV, Step)) &&
- SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
-}
-
-Optional<LoopStructure>
+};
+
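Editor's note: a concrete instance of the loop shape the comment inside LoopStructure describes, for an increasing 32-bit induction variable with inc = 1 and a signed less-than latch predicate (names and body are ours):

    long sumIterationSpace(int IndVarStart, int LoopExitAt) {
      long Sum = 0;
      for (int IV = IndVarStart; IV < LoopExitAt; IV += 1)
        Sum += IV;                 // "... body ..."; IndVarBase here is IV + 1
      return Sum;
    }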
+/// This class is used to constrain loops to run within a given iteration space.
+/// The algorithm this class implements is given a Loop and a range [Begin,
+/// End). The algorithm then tries to break a "main loop" out of the loop
+/// it is given in a way that the "main loop" runs with the induction variable
+/// in a subset of [Begin, End). The algorithm emits appropriate pre and post
+/// loops to run any remaining iterations. The pre loop runs any iterations in
+/// which the induction variable is < Begin, and the post loop runs any
+/// iterations in which the induction variable is >= End.
+class LoopConstrainer {
+ // The representation of a clone of the original loop we started out with.
+ struct ClonedLoop {
+ // The cloned blocks
+ std::vector<BasicBlock *> Blocks;
+
+ // `Map` maps values in the clonee into values in the cloned version
+ ValueToValueMapTy Map;
+
+ // An instance of `LoopStructure` for the cloned loop
+ LoopStructure Structure;
+ };
+
+ // Result of rewriting the range of a loop. See changeIterationSpaceEnd for
+ // more details on what these fields mean.
+ struct RewrittenRangeInfo {
+ BasicBlock *PseudoExit = nullptr;
+ BasicBlock *ExitSelector = nullptr;
+ std::vector<PHINode *> PHIValuesAtPseudoExit;
+ PHINode *IndVarEnd = nullptr;
+
+ RewrittenRangeInfo() = default;
+ };
+
+ // Calculated subranges we restrict the iteration space of the main loop to.
+ // See the implementation of `calculateSubRanges' for more details on how
+ // these fields are computed. `LowLimit` is None if there is no restriction
+ // on low end of the restricted iteration space of the main loop. `HighLimit`
+ // is None if there is no restriction on high end of the restricted iteration
+ // space of the main loop.
+
+ struct SubRanges {
+ Optional<const SCEV *> LowLimit;
+ Optional<const SCEV *> HighLimit;
+ };
+
+ // Compute a safe set of limits for the main loop to run in -- effectively the
+ // intersection of `Range' and the iteration space of the original loop.
+ // Return None if unable to compute the set of subranges.
+ Optional<SubRanges> calculateSubRanges(bool IsSignedPredicate) const;
+
+ // Clone `OriginalLoop' and return the result in CLResult. The IR after
+ // running `cloneLoop' is well formed except for the PHI nodes in CLResult --
+ // the PHI nodes say that there is an incoming edge from `OriginalPreheader`
+ // but there is no such edge.
+ void cloneLoop(ClonedLoop &CLResult, const char *Tag) const;
+
+ // Create the appropriate loop structure needed to describe a cloned copy of
+ // `Original`. The clone is described by `VM`.
+ Loop *createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM, bool IsSubloop);
+
+ // Rewrite the iteration space of the loop denoted by (LS, Preheader). The
+ // iteration space of the rewritten loop ends at ExitLoopAt. The start of the
+ // iteration space is not changed. `ExitLoopAt' is assumed to be slt
+ // `OriginalHeaderCount'.
+ //
+ // If there are iterations left to execute, control is made to jump to
+ // `ContinuationBlock', otherwise they take the normal loop exit. The
+ // returned `RewrittenRangeInfo' object is populated as follows:
+ //
+ // .PseudoExit is a basic block that unconditionally branches to
+ // `ContinuationBlock'.
+ //
+ // .ExitSelector is a basic block that decides, on exit from the loop,
+ // whether to branch to the "true" exit or to `PseudoExit'.
+ //
+ // .PHIValuesAtPseudoExit are PHINodes in `PseudoExit' that compute the value
+ // for each PHINode in the loop header on taking the pseudo exit.
+ //
+ // After changeIterationSpaceEnd, `Preheader' is no longer a legitimate
+ // preheader because it is made to branch to the loop header only
+ // conditionally.
+ RewrittenRangeInfo
+ changeIterationSpaceEnd(const LoopStructure &LS, BasicBlock *Preheader,
+ Value *ExitLoopAt,
+ BasicBlock *ContinuationBlock) const;
+
+ // The loop denoted by `LS' has `OldPreheader' as its preheader. This
+ // function creates a new preheader for `LS' and returns it.
+ BasicBlock *createPreheader(const LoopStructure &LS, BasicBlock *OldPreheader,
+ const char *Tag) const;
+
+ // `ContinuationBlockAndPreheader' was the continuation block for some call to
+ // `changeIterationSpaceEnd' and is the preheader to the loop denoted by `LS'.
+ // This function rewrites the PHI nodes in `LS.Header' to start with the
+ // correct value.
+ void rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlockAndPreheader,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const;
+
+ // Even though we do not preserve any passes at this time, we at least need to
+ // keep the parent loop structure consistent. The `LPPassManager' seems to
+ // verify this after running a loop pass. This function adds the list of
+ // blocks denoted by BBs to this loop's parent loop if required.
+ void addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs);
+
+ // Some global state.
+ Function &F;
+ LLVMContext &Ctx;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ function_ref<void(Loop *, bool)> LPMAddNewLoop;
+
+ // Information about the original loop we started out with.
+ Loop &OriginalLoop;
+
+ const SCEV *LatchTakenCount = nullptr;
+ BasicBlock *OriginalPreheader = nullptr;
+
+ // The preheader of the main loop. This may or may not be different from
+ // `OriginalPreheader'.
+ BasicBlock *MainLoopPreheader = nullptr;
+
+ // The range we need to run the main loop in.
+ InductiveRangeCheck::Range Range;
+
+ // The structure of the main loop (see comment at the beginning of this class
+ // for a definition)
+ LoopStructure MainLoopStructure;
+
+public:
+ LoopConstrainer(Loop &L, LoopInfo &LI,
+ function_ref<void(Loop *, bool)> LPMAddNewLoop,
+ const LoopStructure &LS, ScalarEvolution &SE,
+ DominatorTree &DT, InductiveRangeCheck::Range R)
+ : F(*L.getHeader()->getParent()), Ctx(L.getHeader()->getContext()),
+ SE(SE), DT(DT), LI(LI), LPMAddNewLoop(LPMAddNewLoop), OriginalLoop(L),
+ Range(R), MainLoopStructure(LS) {}
+
+ // Entry point for the algorithm. Returns true on success.
+ bool run();
+};
+
+} // end anonymous namespace
+
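Editor's note: a simplified, signed, increasing-IV sketch of the control-flow shape LoopConstrainer aims for when constraining a loop to [Begin, End); body is a hypothetical stand-in for the original loop body with its checks.

    inline void body(int) {}   // hypothetical original loop body

    void constrainedShape(int Start, int Exit, int Begin, int End) {
      int i = Start;
      for (; i < Exit && i < Begin; ++i)   // pre loop:  iterations with i < Begin
        body(i);
      for (; i < Exit && i < End; ++i)     // main loop: range check is redundant
        body(i);
      for (; i < Exit; ++i)                // post loop: iterations with i >= End
        body(i);
    }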
+/// Given a loop with a decreasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeDecreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ assert(SE.isKnownNegative(Step) && "expecting negative step");
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeDecreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SGT : CmpInst::ICMP_UGT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 &&
+ "LatchBrExitIdx should be either 0 or 1");
+
+ const SCEV *StepPlusOne = SE.getAddExpr(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Min = IsSigned ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Min), StepPlusOne);
+
+ const SCEV *MinusOne =
+ SE.getMinusSCEV(BoundSCEV, SE.getOne(BoundSCEV->getType()));
+
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, MinusOne) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit);
+
+}
+
+/// Given a loop with an increasing induction variable, is it possible to
+/// safely calculate the bounds of a new loop using the given Predicate.
+static bool isSafeIncreasingBound(const SCEV *Start,
+ const SCEV *BoundSCEV, const SCEV *Step,
+ ICmpInst::Predicate Pred,
+ unsigned LatchBrExitIdx,
+ Loop *L, ScalarEvolution &SE) {
+ if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_UGT)
+ return false;
+
+ if (!SE.isAvailableAtLoopEntry(BoundSCEV, L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "irce: isSafeIncreasingBound with:\n");
+ LLVM_DEBUG(dbgs() << "irce: Start: " << *Start << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Step: " << *Step << "\n");
+ LLVM_DEBUG(dbgs() << "irce: BoundSCEV: " << *BoundSCEV << "\n");
+ LLVM_DEBUG(dbgs() << "irce: Pred: " << ICmpInst::getPredicateName(Pred)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "irce: LatchExitBrIdx: " << LatchBrExitIdx << "\n");
+
+ bool IsSigned = ICmpInst::isSigned(Pred);
+ // The predicate that we need to check that the induction variable lies
+ // within bounds.
+ ICmpInst::Predicate BoundPred =
+ IsSigned ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT;
+
+ if (LatchBrExitIdx == 1)
+ return SE.isLoopEntryGuardedByCond(L, BoundPred, Start, BoundSCEV);
+
+ assert(LatchBrExitIdx == 0 && "LatchBrExitIdx should be 0 or 1");
+
+ const SCEV *StepMinusOne =
+ SE.getMinusSCEV(Step, SE.getOne(Step->getType()));
+ unsigned BitWidth = cast<IntegerType>(BoundSCEV->getType())->getBitWidth();
+ APInt Max = IsSigned ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ const SCEV *Limit = SE.getMinusSCEV(SE.getConstant(Max), StepMinusOne);
+
+ return (SE.isLoopEntryGuardedByCond(L, BoundPred, Start,
+ SE.getAddExpr(BoundSCEV, Step)) &&
+ SE.isLoopEntryGuardedByCond(L, BoundPred, BoundSCEV, Limit));
+}
+
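Editor's note: a worked 8-bit illustration of the Limit guard above for the signed, LatchBrExitIdx == 0 case. With Step = 1 the bound must stay strictly below INT8_MAX so that bound + step can still be formed without wrapping; this reading of the guard, and the helper name, are ours.

    #include <cstdint>

    // Mirrors "BoundSCEV <s Limit" with Limit = SINT_MAX - (Step - 1).
    inline bool boundLeavesRoomForStep(int8_t Bound, int8_t Step) {
      const int Limit = INT8_MAX - (Step - 1);   // 127 when Step == 1
      return Bound < Limit;                      // e.g. 126 passes, 127 fails
    }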
+Optional<LoopStructure>
LoopStructure::parseLoopStructure(ScalarEvolution &SE, Loop &L,
- const char *&FailureReason) {
- if (!L.isLoopSimplifyForm()) {
- FailureReason = "loop not in LoopSimplify form";
- return None;
- }
-
- BasicBlock *Latch = L.getLoopLatch();
- assert(Latch && "Simplified loops only have one latch!");
-
- if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
- FailureReason = "loop has already been cloned";
- return None;
- }
-
- if (!L.isLoopExiting(Latch)) {
- FailureReason = "no loop latch";
- return None;
- }
-
- BasicBlock *Header = L.getHeader();
- BasicBlock *Preheader = L.getLoopPreheader();
- if (!Preheader) {
- FailureReason = "no preheader";
- return None;
- }
-
- BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
- FailureReason = "latch terminator not conditional branch";
- return None;
- }
-
- unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
-
- ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
- if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
- FailureReason = "latch terminator branch not conditional on integral icmp";
- return None;
- }
-
- const SCEV *LatchCount = SE.getExitCount(&L, Latch);
- if (isa<SCEVCouldNotCompute>(LatchCount)) {
- FailureReason = "could not compute latch count";
- return None;
- }
-
- ICmpInst::Predicate Pred = ICI->getPredicate();
- Value *LeftValue = ICI->getOperand(0);
- const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
- IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
-
- Value *RightValue = ICI->getOperand(1);
- const SCEV *RightSCEV = SE.getSCEV(RightValue);
-
- // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
- if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
- if (isa<SCEVAddRecExpr>(RightSCEV)) {
- std::swap(LeftSCEV, RightSCEV);
- std::swap(LeftValue, RightValue);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- } else {
- FailureReason = "no add recurrences in the icmp";
- return None;
- }
- }
-
- auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
- if (AR->getNoWrapFlags(SCEV::FlagNSW))
- return true;
-
- IntegerType *Ty = cast<IntegerType>(AR->getType());
- IntegerType *WideTy =
- IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
-
- const SCEVAddRecExpr *ExtendAfterOp =
- dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
- if (ExtendAfterOp) {
- const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
- const SCEV *ExtendedStep =
- SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
-
- bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
- ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
-
- if (NoSignedWrap)
- return true;
- }
-
- // We may have proved this when computing the sign extension above.
- return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
- };
-
- // `ICI` is interpreted as taking the backedge if the *next* value of the
- // induction variable satisfies some constraint.
-
- const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
- if (!IndVarBase->isAffine()) {
- FailureReason = "LHS in icmp not induction variable";
- return None;
- }
- const SCEV* StepRec = IndVarBase->getStepRecurrence(SE);
- if (!isa<SCEVConstant>(StepRec)) {
- FailureReason = "LHS in icmp not induction variable";
- return None;
- }
- ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
-
- if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
- FailureReason = "LHS in icmp needs nsw for equality predicates";
- return None;
- }
-
- assert(!StepCI->isZero() && "Zero step?");
- bool IsIncreasing = !StepCI->isNegative();
- bool IsSignedPredicate;
- const SCEV *StartNext = IndVarBase->getStart();
- const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
- const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
- const SCEV *Step = SE.getSCEV(StepCI);
-
- const SCEV *FixedRightSCEV = nullptr;
-
- // If RightValue resides within loop (but still being loop invariant),
- // regenerate it as preheader.
- if (auto *I = dyn_cast<Instruction>(RightValue))
- if (L.contains(I->getParent()))
- FixedRightSCEV = RightSCEV;
-
- if (IsIncreasing) {
- bool DecreasedRightValueByOne = false;
- if (StepCI->isOne()) {
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (++i != len) { while (++i < len) {
- // ... ---> ...
- // } }
- // If both parts are known non-negative, it is profitable to use
- // unsigned comparison in increasing loop. This allows us to make the
- // comparison check against "RightSCEV + 1" more optimistic.
- if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
- isKnownNonNegativeInLoop(RightSCEV, &L, SE))
- Pred = ICmpInst::ICMP_ULT;
- else
- Pred = ICmpInst::ICMP_SLT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
- // while (true) { while (true) {
- // if (++i == len) ---> if (++i > len - 1)
- // break; break;
- // ... ...
- // } }
- if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
- cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
- Pred = ICmpInst::ICMP_UGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV,
- SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
- } else if (cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
- Pred = ICmpInst::ICMP_SGT;
- RightSCEV = SE.getMinusSCEV(RightSCEV,
- SE.getOne(RightSCEV->getType()));
- DecreasedRightValueByOne = true;
- }
- }
- }
-
- bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
- bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
- bool FoundExpectedPred =
- (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
-
- if (!FoundExpectedPred) {
- FailureReason = "expected icmp slt semantically, found something else";
- return None;
- }
-
- IsSignedPredicate = ICmpInst::isSigned(Pred);
- if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
- FailureReason = "unsigned latch conditions are explicitly prohibited";
- return None;
- }
-
- if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
- LatchBrExitIdx, &L, SE)) {
- FailureReason = "Unsafe loop bounds";
- return None;
- }
- if (LatchBrExitIdx == 0) {
- // We need to increase the right value unless we have already decreased
- // it virtually when we replaced EQ with SGT.
- if (!DecreasedRightValueByOne)
- FixedRightSCEV =
- SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- } else {
- assert(!DecreasedRightValueByOne &&
- "Right value can be decreased only for LatchBrExitIdx == 0!");
- }
- } else {
- bool IncreasedRightValueByOne = false;
- if (StepCI->isMinusOne()) {
- // Try to turn eq/ne predicates to those we can work with.
- if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
- // while (--i != len) { while (--i > len) {
- // ... ---> ...
- // } }
- // We intentionally don't turn the predicate into UGT even if we know
- // that both operands are non-negative, because it will only pessimize
- // our check against "RightSCEV - 1".
- Pred = ICmpInst::ICMP_SGT;
- else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
- // while (true) { while (true) {
- // if (--i == len) ---> if (--i < len + 1)
- // break; break;
- // ... ...
- // } }
- if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
- cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
- Pred = ICmpInst::ICMP_ULT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
- } else if (cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
- Pred = ICmpInst::ICMP_SLT;
- RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
- IncreasedRightValueByOne = true;
- }
- }
- }
-
- bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
- bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
-
- bool FoundExpectedPred =
- (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
-
- if (!FoundExpectedPred) {
- FailureReason = "expected icmp sgt semantically, found something else";
- return None;
- }
-
- IsSignedPredicate =
- Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
-
- if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
- FailureReason = "unsigned latch conditions are explicitly prohibited";
- return None;
- }
-
- if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
- LatchBrExitIdx, &L, SE)) {
- FailureReason = "Unsafe bounds";
- return None;
- }
-
- if (LatchBrExitIdx == 0) {
- // We need to decrease the right value unless we have already increased
- // it virtually when we replaced EQ with SLT.
- if (!IncreasedRightValueByOne)
- FixedRightSCEV =
- SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
- } else {
- assert(!IncreasedRightValueByOne &&
- "Right value can be increased only for LatchBrExitIdx == 0!");
- }
- }
- BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
-
- assert(SE.getLoopDisposition(LatchCount, &L) ==
- ScalarEvolution::LoopInvariant &&
- "loop variant exit count doesn't make sense!");
-
- assert(!L.contains(LatchExit) && "expected an exit block!");
- const DataLayout &DL = Preheader->getModule()->getDataLayout();
- SCEVExpander Expander(SE, DL, "irce");
- Instruction *Ins = Preheader->getTerminator();
-
- if (FixedRightSCEV)
- RightValue =
- Expander.expandCodeFor(FixedRightSCEV, FixedRightSCEV->getType(), Ins);
-
- Value *IndVarStartV = Expander.expandCodeFor(IndVarStart, IndVarTy, Ins);
- IndVarStartV->setName("indvar.start");
-
- LoopStructure Result;
-
- Result.Tag = "main";
- Result.Header = Header;
- Result.Latch = Latch;
- Result.LatchBr = LatchBr;
- Result.LatchExit = LatchExit;
- Result.LatchBrExitIdx = LatchBrExitIdx;
- Result.IndVarStart = IndVarStartV;
- Result.IndVarStep = StepCI;
- Result.IndVarBase = LeftValue;
- Result.IndVarIncreasing = IsIncreasing;
- Result.LoopExitAt = RightValue;
- Result.IsSignedPredicate = IsSignedPredicate;
-
- FailureReason = nullptr;
-
- return Result;
-}
-
-/// If the type of \p S matches with \p Ty, return \p S. Otherwise, return
-/// signed or unsigned extension of \p S to type \p Ty.
-static const SCEV *NoopOrExtend(const SCEV *S, Type *Ty, ScalarEvolution &SE,
- bool Signed) {
- return Signed ? SE.getNoopOrSignExtend(S, Ty) : SE.getNoopOrZeroExtend(S, Ty);
-}
-
-Optional<LoopConstrainer::SubRanges>
-LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
- IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
-
- auto *RTy = cast<IntegerType>(Range.getType());
-
- // We only support wide range checks and narrow latches.
- if (!AllowNarrowLatchCondition && RTy != Ty)
- return None;
- if (RTy->getBitWidth() < Ty->getBitWidth())
- return None;
-
- LoopConstrainer::SubRanges Result;
-
- // I think we can be more aggressive here and make this nuw / nsw if the
- // addition that feeds into the icmp for the latch's terminating branch is nuw
- // / nsw. In any case, a wrapping 2's complement addition is safe.
- const SCEV *Start = NoopOrExtend(SE.getSCEV(MainLoopStructure.IndVarStart),
- RTy, SE, IsSignedPredicate);
- const SCEV *End = NoopOrExtend(SE.getSCEV(MainLoopStructure.LoopExitAt), RTy,
- SE, IsSignedPredicate);
-
- bool Increasing = MainLoopStructure.IndVarIncreasing;
-
- // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
- // [Smallest, GreatestSeen] is the range of values the induction variable
- // takes.
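- //
- // For an increasing induction variable running from Start to End, this is
- // simply [Start, End) with GreatestSeen = End - 1; the decreasing case
- // below needs extra care because of possible sign overflow.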
-
- const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
-
- const SCEV *One = SE.getOne(RTy);
- if (Increasing) {
- Smallest = Start;
- Greatest = End;
- // No overflow, because the range [Smallest, GreatestSeen] is not empty.
- GreatestSeen = SE.getMinusSCEV(End, One);
- } else {
- // These two computations may sign-overflow. Here is why that is okay:
- //
- // We know that the induction variable does not sign-overflow on any
- // iteration except the last one, and it starts at `Start` and ends at
- // `End`, decrementing by one every time.
- //
- // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
- // induction variable is decreasing we know that the smallest value
- // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
- //
- // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
- // that case, `Clamp` will always return `Smallest` and
- // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
- // will be an empty range. Returning an empty range is always safe.
-
- Smallest = SE.getAddExpr(End, One);
- Greatest = SE.getAddExpr(Start, One);
- GreatestSeen = Start;
- }
-
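- // Clamp `S` into the range [Smallest, Greatest], using signed or unsigned
- // min/max expressions to match the signedness of the latch predicate.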
- auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
- return IsSignedPredicate
- ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
- : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
- };
-
- // In some cases we can prove that we don't need a pre or post loop.
- ICmpInst::Predicate PredLE =
- IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
- ICmpInst::Predicate PredLT =
- IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
-
- bool ProvablyNoPreloop =
- SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
- if (!ProvablyNoPreloop)
- Result.LowLimit = Clamp(Range.getBegin());
-
- bool ProvablyNoPostLoop =
- SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
- if (!ProvablyNoPostLoop)
- Result.HighLimit = Clamp(Range.getEnd());
-
- return Result;
-}
-
-void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
- const char *Tag) const {
- for (BasicBlock *BB : OriginalLoop.getBlocks()) {
- BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
- Result.Blocks.push_back(Clone);
- Result.Map[BB] = Clone;
- }
-
- auto GetClonedValue = [&Result](Value *V) {
- assert(V && "null values not in domain!");
- auto It = Result.Map.find(V);
- if (It == Result.Map.end())
- return V;
- return static_cast<Value *>(It->second);
- };
-
- auto *ClonedLatch =
- cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
- ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
- MDNode::get(Ctx, {}));
-
- Result.Structure = MainLoopStructure.map(GetClonedValue);
- Result.Structure.Tag = Tag;
-
- for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
- BasicBlock *ClonedBB = Result.Blocks[i];
- BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
-
- assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
-
- for (Instruction &I : *ClonedBB)
- RemapInstruction(&I, Result.Map,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Exit blocks will now have one more predecessor and their PHI nodes need
- // to be edited to reflect that. No phi nodes need to be introduced because
- // the loop is in LCSSA.
-
- for (auto *SBB : successors(OriginalBB)) {
- if (OriginalLoop.contains(SBB))
- continue; // not an exit block
-
- for (PHINode &PN : SBB->phis()) {
- Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
- PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
- }
- }
- }
-}
-
-LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
- const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
- BasicBlock *ContinuationBlock) const {
- // We start with a loop with a single latch:
- //
- // +--------------------+
- // | |
- // | preheader |
- // | |
- // +--------+-----------+
- // | ----------------\
- // | / |
- // +--------v----v------+ |
- // | | |
- // | header | |
- // | | |
- // +--------------------+ |
- // |
- // ..... |
- // |
- // +--------------------+ |
- // | | |
- // | latch >----------/
- // | |
- // +-------v------------+
- // |
- // |
- // | +--------------------+
- // | | |
- // +---> original exit |
- // | |
- // +--------------------+
- //
- // We change the control flow to look like
- //
- //
- // +--------------------+
- // | |
- // | preheader >-------------------------+
- // | | |
- // +--------v-----------+ |
- // | /-------------+ |
- // | / | |
- // +--------v--v--------+ | |
- // | | | |
- // | header | | +--------+ |
- // | | | | | |
- // +--------------------+ | | +-----v-----v-----------+
- // | | | |
- // | | | .pseudo.exit |
- // | | | |
- // | | +-----------v-----------+
- // | | |
- // ..... | | |
- // | | +--------v-------------+
- // +--------------------+ | | | |
- // | | | | | ContinuationBlock |
- // | latch >------+ | | |
- // | | | +----------------------+
- // +---------v----------+ |
- // | |
- // | |
- // | +---------------^-----+
- // | | |
- // +-----> .exit.selector |
- // | |
- // +----------v----------+
- // |
- // +--------------------+ |
- // | | |
- // | original exit <----+
- // | |
- // +--------------------+
-
- RewrittenRangeInfo RRI;
-
- BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
- RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
- &F, BBInsertLocation);
- RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
- BBInsertLocation);
-
- BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
- bool Increasing = LS.IndVarIncreasing;
- bool IsSignedPredicate = LS.IsSignedPredicate;
-
- IRBuilder<> B(PreheaderJump);
- auto *RangeTy = Range.getBegin()->getType();
- auto NoopOrExt = [&](Value *V) {
- if (V->getType() == RangeTy)
- return V;
- return IsSignedPredicate ? B.CreateSExt(V, RangeTy, "wide." + V->getName())
- : B.CreateZExt(V, RangeTy, "wide." + V->getName());
- };
-
- // EnterLoopCond - is it okay to start executing this `LS'?
- Value *EnterLoopCond = nullptr;
- auto Pred =
- Increasing
- ? (IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT)
- : (IsSignedPredicate ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
- Value *IndVarStart = NoopOrExt(LS.IndVarStart);
- EnterLoopCond = B.CreateICmp(Pred, IndVarStart, ExitSubloopAt);
-
- B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
- PreheaderJump->eraseFromParent();
-
- LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
- B.SetInsertPoint(LS.LatchBr);
- Value *IndVarBase = NoopOrExt(LS.IndVarBase);
- Value *TakeBackedgeLoopCond = B.CreateICmp(Pred, IndVarBase, ExitSubloopAt);
-
- Value *CondForBranch = LS.LatchBrExitIdx == 1
- ? TakeBackedgeLoopCond
- : B.CreateNot(TakeBackedgeLoopCond);
-
- LS.LatchBr->setCondition(CondForBranch);
-
- B.SetInsertPoint(RRI.ExitSelector);
-
- // IterationsLeft - are there any more iterations left, given the original
- // upper bound on the induction variable? If not, we branch to the "real"
- // exit.
- Value *LoopExitAt = NoopOrExt(LS.LoopExitAt);
- Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt);
- B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
-
- BranchInst *BranchToContinuation =
- BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
-
- // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
- // each of the PHI nodes in the loop header. This feeds into the initial
- // value of the same PHI nodes if/when we continue execution.
- for (PHINode &PN : LS.Header->phis()) {
- PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
- BranchToContinuation);
-
- NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
- NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
- RRI.ExitSelector);
- RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
- }
-
- RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
- BranchToContinuation);
- RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
- RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
-
- // The latch exit now has a branch from `RRI.ExitSelector' instead of
- // `LS.Latch'. The PHI nodes need to be updated to reflect that.
- LS.LatchExit->replacePhiUsesWith(LS.Latch, RRI.ExitSelector);
-
- return RRI;
-}
-
-void LoopConstrainer::rewriteIncomingValuesForPHIs(
- LoopStructure &LS, BasicBlock *ContinuationBlock,
- const LoopConstrainer::RewrittenRangeInfo &RRI) const {
- unsigned PHIIndex = 0;
- for (PHINode &PN : LS.Header->phis())
- PN.setIncomingValueForBlock(ContinuationBlock,
- RRI.PHIValuesAtPseudoExit[PHIIndex++]);
-
- LS.IndVarStart = RRI.IndVarEnd;
-}
-
-BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
- BasicBlock *OldPreheader,
- const char *Tag) const {
- BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
- BranchInst::Create(LS.Header, Preheader);
-
- LS.Header->replacePhiUsesWith(OldPreheader, Preheader);
-
- return Preheader;
-}
-
-void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
- Loop *ParentLoop = OriginalLoop.getParentLoop();
- if (!ParentLoop)
- return;
-
- for (BasicBlock *BB : BBs)
- ParentLoop->addBasicBlockToLoop(BB, LI);
-}
-
-Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
- ValueToValueMapTy &VM,
- bool IsSubloop) {
- Loop &New = *LI.AllocateLoop();
- if (Parent)
- Parent->addChildLoop(&New);
- else
- LI.addTopLevelLoop(&New);
- LPMAddNewLoop(&New, IsSubloop);
-
- // Add all of the blocks in Original to the new loop.
- for (auto *BB : Original->blocks())
- if (LI.getLoopFor(BB) == Original)
- New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
-
- // Add all of the subloops to the new loop.
- for (Loop *SubLoop : *Original)
- createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
-
- return &New;
-}
-
-bool LoopConstrainer::run() {
- BasicBlock *Preheader = nullptr;
- LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
- Preheader = OriginalLoop.getLoopPreheader();
- assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
- "preconditions!");
-
- OriginalPreheader = Preheader;
- MainLoopPreheader = Preheader;
-
- bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
- Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
- if (!MaybeSR.hasValue()) {
- LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
- return false;
- }
-
- SubRanges SR = MaybeSR.getValue();
- bool Increasing = MainLoopStructure.IndVarIncreasing;
- IntegerType *IVTy =
- cast<IntegerType>(Range.getBegin()->getType());
-
- SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
- Instruction *InsertPt = OriginalPreheader->getTerminator();
-
- // It would have been better to make `PreLoop' and `PostLoop'
- // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
- // constructor.
- ClonedLoop PreLoop, PostLoop;
- bool NeedsPreLoop =
- Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
- bool NeedsPostLoop =
- Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
-
- Value *ExitPreLoopAt = nullptr;
- Value *ExitMainLoopAt = nullptr;
- const SCEVConstant *MinusOneS =
- cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
-
- if (NeedsPreLoop) {
- const SCEV *ExitPreLoopAtSCEV = nullptr;
-
- if (Increasing)
- ExitPreLoopAtSCEV = *SR.LowLimit;
- else if (cannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
- IsSignedPredicate))
- ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
- else {
- LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "preloop exit limit. HighLimit = "
- << *(*SR.HighLimit) << "\n");
- return false;
- }
-
- if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " preloop exit limit " << *ExitPreLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName()
- << "\n");
- return false;
- }
-
- ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
- ExitPreLoopAt->setName("exit.preloop.at");
- }
-
- if (NeedsPostLoop) {
- const SCEV *ExitMainLoopAtSCEV = nullptr;
-
- if (Increasing)
- ExitMainLoopAtSCEV = *SR.HighLimit;
- else if (cannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
- IsSignedPredicate))
- ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
- else {
- LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
- << "mainloop exit limit. LowLimit = "
- << *(*SR.LowLimit) << "\n");
- return false;
- }
-
- if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
- LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
- << " main loop exit limit " << *ExitMainLoopAtSCEV
- << " at block " << InsertPt->getParent()->getName()
- << "\n");
- return false;
- }
-
- ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
- ExitMainLoopAt->setName("exit.mainloop.at");
- }
-
- // We clone these ahead of time so that we don't have to deal with changing
- // and temporarily invalid IR as we transform the loops.
- if (NeedsPreLoop)
- cloneLoop(PreLoop, "preloop");
- if (NeedsPostLoop)
- cloneLoop(PostLoop, "postloop");
-
- RewrittenRangeInfo PreLoopRRI;
-
- if (NeedsPreLoop) {
- Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
- PreLoop.Structure.Header);
-
- MainLoopPreheader =
- createPreheader(MainLoopStructure, Preheader, "mainloop");
- PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
- ExitPreLoopAt, MainLoopPreheader);
- rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
- PreLoopRRI);
- }
-
- BasicBlock *PostLoopPreheader = nullptr;
- RewrittenRangeInfo PostLoopRRI;
-
- if (NeedsPostLoop) {
- PostLoopPreheader =
- createPreheader(PostLoop.Structure, Preheader, "postloop");
- PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
- ExitMainLoopAt, PostLoopPreheader);
- rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
- PostLoopRRI);
- }
-
- BasicBlock *NewMainLoopPreheader =
- MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
- BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
- PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
- PostLoopRRI.ExitSelector, NewMainLoopPreheader};
-
- // Some of the above may be nullptr, filter them out before passing to
- // addToParentLoopIfNeeded.
- auto NewBlocksEnd =
- std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
-
- addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
-
- DT.recalculate(F);
-
- // We need to first add all the pre and post loop blocks into the loop
- // structures (as part of createClonedLoopStructure), and then update the
- // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
- // LI when LoopSimplifyForm is generated.
- Loop *PreL = nullptr, *PostL = nullptr;
- if (!PreLoop.Blocks.empty()) {
- PreL = createClonedLoopStructure(&OriginalLoop,
- OriginalLoop.getParentLoop(), PreLoop.Map,
- /* IsSubLoop */ false);
- }
-
- if (!PostLoop.Blocks.empty()) {
- PostL =
- createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
- PostLoop.Map, /* IsSubLoop */ false);
- }
-
- // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
- auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
- formLCSSARecursively(*L, DT, &LI, &SE);
- simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr, true);
- // Pre/post loops are slow paths; we do not need to perform any loop
- // optimizations on them.
- if (!IsOriginalLoop)
- DisableAllLoopOptsOnLoop(*L);
- };
- if (PreL)
- CanonicalizeLoop(PreL, false);
- if (PostL)
- CanonicalizeLoop(PostL, false);
- CanonicalizeLoop(&OriginalLoop, true);
-
- return true;
-}
-
-/// Computes and returns a range of values for the induction variable (IndVar)
-/// in which the range check can be safely elided. If it cannot compute such a
-/// range, returns None.
-Optional<InductiveRangeCheck::Range>
-InductiveRangeCheck::computeSafeIterationSpace(
- ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
- bool IsLatchSigned) const {
- // We can deal with the case when the types of the latch check and the range
- // checks don't match, as long as the latch check is narrower.
- auto *IVType = cast<IntegerType>(IndVar->getType());
- auto *RCType = cast<IntegerType>(getBegin()->getType());
- if (IVType->getBitWidth() > RCType->getBitWidth())
- return None;
- // IndVar is of the form "A + B * I" (where "I" is the canonical induction
- // variable, that may or may not exist as a real llvm::Value in the loop) and
- // this inductive range check is a range check on the "C + D * I" ("C" is
- // getBegin() and "D" is getStep()). We rewrite the value being range
- // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
- //
- // The actual inequalities we solve are of the form
- //
- // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
- //
- // Here L stands for upper limit of the safe iteration space.
- // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
- // overflows when calculating (0 - M) and (L - M) we, depending on type of
- // IV's iteration space, limit the calculations by borders of the iteration
- // space. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0.
- // If we figured out that "anything greater than (-M) is safe", we strengthen
- // this to "everything greater than 0 is safe", assuming that values between
- // -M and 0 just do not exist in unsigned iteration space, and we don't want
- // to deal with overflown values.
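- //
- // For example, if IndVar is {5,+,1} and the range check tests {2,+,1}
- // against an upper bound L, then M = 2 - 5 = -3, and the check passes
- // exactly when 3 <= IndVar < L + 3 (before any clamping is applied).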
-
- if (!IndVar->isAffine())
- return None;
-
- const SCEV *A = NoopOrExtend(IndVar->getStart(), RCType, SE, IsLatchSigned);
- const SCEVConstant *B = dyn_cast<SCEVConstant>(
- NoopOrExtend(IndVar->getStepRecurrence(SE), RCType, SE, IsLatchSigned));
- if (!B)
- return None;
- assert(!B->isZero() && "Recurrence with zero step?");
-
- const SCEV *C = getBegin();
- const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
- if (D != B)
- return None;
-
- assert(!D->getValue()->isZero() && "Recurrence with zero step?");
- unsigned BitWidth = RCType->getBitWidth();
- const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
-
- // Subtract Y from X so that it does not go through the border of the IV
- // iteration space. Mathematically, it is equivalent to:
- //
- // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
- //
- // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
- // any width of bit grid). But after we take min/max, the result is
- // guaranteed to be within [INT_MIN, INT_MAX].
- //
- // In [1], INT_MAX and INT_MIN are the signed or unsigned max/min values,
- // depending on the type of latch condition that defines the IV iteration
- // space.
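- //
- // For example, in a signed i8 iteration space ClampedSubtract(100, -100)
- // is min(max(100 - (-100), -128), 127) = min(max(200, -128), 127) = 127.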
- auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
- // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
- // This is required to ensure that SINT_MAX - X does not overflow signed and
- // that X - Y does not overflow unsigned if Y is negative. Can we lift this
- // restriction and make it work for negative X as well?
- if (IsLatchSigned) {
- // X is a number from signed range, Y is interpreted as signed.
- // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
- // thing we should care about is that we didn't cross SINT_MAX.
- // So, if Y is positive, we subtract Y safely.
- // Rule 1: Y > 0 ---> Y.
- // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
- // Rule 2: Y >=s (X - SINT_MAX) ---> Y.
- // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
- // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
- // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
- const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
- return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
- SCEV::FlagNSW);
- } else
- // X is a number from unsigned range, Y is interpreted as signed.
- // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
- // thing we should care about is that we didn't cross zero.
- // So, if Y is negative, we subtract Y safely.
- // Rule 1: Y <s 0 ---> Y.
- // If 0 <= Y <= X, we subtract Y safely.
- // Rule 2: Y <=s X ---> Y.
- // If 0 <= X < Y, we should stop at 0 and can only subtract X.
- // Rule 3: Y >s X ---> X.
- // It gives us smin(X, Y) to subtract in all cases.
- return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
- };
- const SCEV *M = SE.getMinusSCEV(C, A);
- const SCEV *Zero = SE.getZero(M->getType());
-
- // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
- auto SCEVCheckNonNegative = [&](const SCEV *X) {
- const Loop *L = IndVar->getLoop();
- const SCEV *One = SE.getOne(X->getType());
- // Can we trivially prove that X is a non-negative or negative value?
- if (isKnownNonNegativeInLoop(X, L, SE))
- return One;
- else if (isKnownNegativeInLoop(X, L, SE))
- return Zero;
- // If not, we will have to figure it out during the execution.
- // The expression smax(smin(X, 0), -1) + 1 equals 1 if X >= 0 and 0 if X < 0.
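- // For instance, X = 7 gives smax(smin(7, 0), -1) + 1 = smax(0, -1) + 1 = 1,
- // while X = -4 gives smax(smin(-4, 0), -1) + 1 = smax(-4, -1) + 1 = 0.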
- const SCEV *NegOne = SE.getNegativeSCEV(One);
- return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
- };
- // FIXME: Current implementation of ClampedSubtract implicitly assumes that
- // X is non-negative (in sense of a signed value). We need to re-implement
- // this function in a way that it will correctly handle negative X as well.
- // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
- // end up with a negative X and produce wrong results. So currently we ensure
- // that if getEnd() is negative then both ends of the safe range are zero.
- // Note that this may pessimize elimination of unsigned range checks against
- // negative values.
- const SCEV *REnd = getEnd();
- const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
-
- const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
- const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
- return InductiveRangeCheck::Range(Begin, End);
-}
-
-static Optional<InductiveRangeCheck::Range>
-IntersectSignedRange(ScalarEvolution &SE,
- const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2) {
- if (R2.isEmpty(SE, /* IsSigned */ true))
- return None;
- if (!R1.hasValue())
- return R2;
- auto &R1Value = R1.getValue();
- // We never return empty ranges from this function, and R1 is supposed to be
- // a result of intersection. Thus, R1 is never empty.
- assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
- "We should never have empty R1!");
-
- // TODO: we could widen the smaller range and have this work; but for now we
- // bail out to keep things simple.
- if (R1Value.getType() != R2.getType())
- return None;
-
- const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
- const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
-
- // If the resulting range is empty, just return None.
- auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
- if (Ret.isEmpty(SE, /* IsSigned */ true))
- return None;
- return Ret;
-}
-
-static Optional<InductiveRangeCheck::Range>
-IntersectUnsignedRange(ScalarEvolution &SE,
- const Optional<InductiveRangeCheck::Range> &R1,
- const InductiveRangeCheck::Range &R2) {
- if (R2.isEmpty(SE, /* IsSigned */ false))
- return None;
- if (!R1.hasValue())
- return R2;
- auto &R1Value = R1.getValue();
- // We never return empty ranges from this function, and R1 is supposed to be
- // a result of intersection. Thus, R1 is never empty.
- assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
- "We should never have empty R1!");
-
- // TODO: we could widen the smaller range and have this work; but for now we
- // bail out to keep things simple.
- if (R1Value.getType() != R2.getType())
- return None;
-
- const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
- const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
-
- // If the resulting range is empty, just return None.
- auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
- if (Ret.isEmpty(SE, /* IsSigned */ false))
- return None;
- return Ret;
-}
-
-PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
- LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
-
+ const char *&FailureReason) {
+ if (!L.isLoopSimplifyForm()) {
+ FailureReason = "loop not in LoopSimplify form";
+ return None;
+ }
+
+ BasicBlock *Latch = L.getLoopLatch();
+ assert(Latch && "Simplified loops only have one latch!");
+
+ if (Latch->getTerminator()->getMetadata(ClonedLoopTag)) {
+ FailureReason = "loop has already been cloned";
+ return None;
+ }
+
+ if (!L.isLoopExiting(Latch)) {
+ FailureReason = "no loop latch";
+ return None;
+ }
+
+ BasicBlock *Header = L.getHeader();
+ BasicBlock *Preheader = L.getLoopPreheader();
+ if (!Preheader) {
+ FailureReason = "no preheader";
+ return None;
+ }
+
+ BranchInst *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ FailureReason = "latch terminator not conditional branch";
+ return None;
+ }
+
+ unsigned LatchBrExitIdx = LatchBr->getSuccessor(0) == Header ? 1 : 0;
+
+ ICmpInst *ICI = dyn_cast<ICmpInst>(LatchBr->getCondition());
+ if (!ICI || !isa<IntegerType>(ICI->getOperand(0)->getType())) {
+ FailureReason = "latch terminator branch not conditional on integral icmp";
+ return None;
+ }
+
+ const SCEV *LatchCount = SE.getExitCount(&L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchCount)) {
+ FailureReason = "could not compute latch count";
+ return None;
+ }
+
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ Value *LeftValue = ICI->getOperand(0);
+ const SCEV *LeftSCEV = SE.getSCEV(LeftValue);
+ IntegerType *IndVarTy = cast<IntegerType>(LeftValue->getType());
+
+ Value *RightValue = ICI->getOperand(1);
+ const SCEV *RightSCEV = SE.getSCEV(RightValue);
+
+ // We canonicalize `ICI` such that `LeftSCEV` is an add recurrence.
+ if (!isa<SCEVAddRecExpr>(LeftSCEV)) {
+ if (isa<SCEVAddRecExpr>(RightSCEV)) {
+ std::swap(LeftSCEV, RightSCEV);
+ std::swap(LeftValue, RightValue);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ } else {
+ FailureReason = "no add recurrences in the icmp";
+ return None;
+ }
+ }
+
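+ // Returns true if `AR` provably does not wrap in the signed sense: either
+ // its nsw flag is already set, or sign-extending the whole recurrence to a
+ // doubly-wide type yields the recurrence of the sign-extended start and
+ // step.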
+ auto HasNoSignedWrap = [&](const SCEVAddRecExpr *AR) {
+ if (AR->getNoWrapFlags(SCEV::FlagNSW))
+ return true;
+
+ IntegerType *Ty = cast<IntegerType>(AR->getType());
+ IntegerType *WideTy =
+ IntegerType::get(Ty->getContext(), Ty->getBitWidth() * 2);
+
+ const SCEVAddRecExpr *ExtendAfterOp =
+ dyn_cast<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+ if (ExtendAfterOp) {
+ const SCEV *ExtendedStart = SE.getSignExtendExpr(AR->getStart(), WideTy);
+ const SCEV *ExtendedStep =
+ SE.getSignExtendExpr(AR->getStepRecurrence(SE), WideTy);
+
+ bool NoSignedWrap = ExtendAfterOp->getStart() == ExtendedStart &&
+ ExtendAfterOp->getStepRecurrence(SE) == ExtendedStep;
+
+ if (NoSignedWrap)
+ return true;
+ }
+
+ // We may have proved this when computing the sign extension above.
+ return AR->getNoWrapFlags(SCEV::FlagNSW) != SCEV::FlagAnyWrap;
+ };
+
+ // `ICI` is interpreted as taking the backedge if the *next* value of the
+ // induction variable satisfies some constraint.
+
+ const SCEVAddRecExpr *IndVarBase = cast<SCEVAddRecExpr>(LeftSCEV);
+ if (!IndVarBase->isAffine()) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ const SCEV *StepRec = IndVarBase->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(StepRec)) {
+ FailureReason = "LHS in icmp not induction variable";
+ return None;
+ }
+ ConstantInt *StepCI = cast<SCEVConstant>(StepRec)->getValue();
+
+ if (ICI->isEquality() && !HasNoSignedWrap(IndVarBase)) {
+ FailureReason = "LHS in icmp needs nsw for equality predicates";
+ return None;
+ }
+
+ assert(!StepCI->isZero() && "Zero step?");
+ bool IsIncreasing = !StepCI->isNegative();
+ bool IsSignedPredicate;
+ const SCEV *StartNext = IndVarBase->getStart();
+ const SCEV *Addend = SE.getNegativeSCEV(IndVarBase->getStepRecurrence(SE));
+ const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+ const SCEV *Step = SE.getSCEV(StepCI);
+
+ const SCEV *FixedRightSCEV = nullptr;
+
+ // If RightValue resides within the loop (while still being loop invariant),
+ // regenerate it in the preheader.
+ if (auto *I = dyn_cast<Instruction>(RightValue))
+ if (L.contains(I->getParent()))
+ FixedRightSCEV = RightSCEV;
+
+ if (IsIncreasing) {
+ bool DecreasedRightValueByOne = false;
+ if (StepCI->isOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (++i != len) { while (++i < len) {
+ // ... ---> ...
+ // } }
+ // If both parts are known non-negative, it is profitable to use
+ // unsigned comparison in an increasing loop. This allows us to make the
+ // comparison check against "RightSCEV + 1" more optimistic.
+ if (isKnownNonNegativeInLoop(IndVarStart, &L, SE) &&
+ isKnownNonNegativeInLoop(RightSCEV, &L, SE))
+ Pred = ICmpInst::ICMP_ULT;
+ else
+ Pred = ICmpInst::ICMP_SLT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
+ // while (true) { while (true) {
+ // if (++i == len) ---> if (++i > len - 1)
+ // break; break;
+ // ... ...
+ // } }
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/false)) {
+ Pred = ICmpInst::ICMP_UGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ } else if (cannotBeMinInLoop(RightSCEV, &L, SE, /*Signed*/true)) {
+ Pred = ICmpInst::ICMP_SGT;
+ RightSCEV = SE.getMinusSCEV(RightSCEV,
+ SE.getOne(RightSCEV->getType()));
+ DecreasedRightValueByOne = true;
+ }
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+ bool FoundExpectedPred =
+ (LTPred && LatchBrExitIdx == 1) || (GTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp slt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate = ICmpInst::isSigned(Pred);
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ if (!isSafeIncreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe loop bounds";
+ return None;
+ }
+ if (LatchBrExitIdx == 0) {
+ // We need to increase the right value unless we have already decreased
+ // it virtually when we replaced EQ with SGT.
+ if (!DecreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ } else {
+ assert(!DecreasedRightValueByOne &&
+ "Right value can be decreased only for LatchBrExitIdx == 0!");
+ }
+ } else {
+ bool IncreasedRightValueByOne = false;
+ if (StepCI->isMinusOne()) {
+ // Try to turn eq/ne predicates to those we can work with.
+ if (Pred == ICmpInst::ICMP_NE && LatchBrExitIdx == 1)
+ // while (--i != len) { while (--i > len) {
+ // ... ---> ...
+ // } }
+ // We intentionally don't turn the predicate into UGT even if we know
+ // that both operands are non-negative, because it will only pessimize
+ // our check against "RightSCEV - 1".
+ Pred = ICmpInst::ICMP_SGT;
+ else if (Pred == ICmpInst::ICMP_EQ && LatchBrExitIdx == 0) {
+ // while (true) { while (true) {
+ // if (--i == len) ---> if (--i < len + 1)
+ // break; break;
+ // ... ...
+ // } }
+ if (IndVarBase->getNoWrapFlags(SCEV::FlagNUW) &&
+ cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ false)) {
+ Pred = ICmpInst::ICMP_ULT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ } else if (cannotBeMaxInLoop(RightSCEV, &L, SE, /* Signed */ true)) {
+ Pred = ICmpInst::ICMP_SLT;
+ RightSCEV = SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
+ IncreasedRightValueByOne = true;
+ }
+ }
+ }
+
+ bool LTPred = (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_ULT);
+ bool GTPred = (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_UGT);
+
+ bool FoundExpectedPred =
+ (GTPred && LatchBrExitIdx == 1) || (LTPred && LatchBrExitIdx == 0);
+
+ if (!FoundExpectedPred) {
+ FailureReason = "expected icmp sgt semantically, found something else";
+ return None;
+ }
+
+ IsSignedPredicate =
+ Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGT;
+
+ if (!IsSignedPredicate && !AllowUnsignedLatchCondition) {
+ FailureReason = "unsigned latch conditions are explicitly prohibited";
+ return None;
+ }
+
+ if (!isSafeDecreasingBound(IndVarStart, RightSCEV, Step, Pred,
+ LatchBrExitIdx, &L, SE)) {
+ FailureReason = "Unsafe bounds";
+ return None;
+ }
+
+ if (LatchBrExitIdx == 0) {
+ // We need to decrease the right value unless we have already increased
+ // it virtually when we replaced EQ with SLT.
+ if (!IncreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
+ } else {
+ assert(!IncreasedRightValueByOne &&
+ "Right value can be increased only for LatchBrExitIdx == 0!");
+ }
+ }
+ BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
+
+ assert(SE.getLoopDisposition(LatchCount, &L) ==
+ ScalarEvolution::LoopInvariant &&
+ "loop variant exit count doesn't make sense!");
+
+ assert(!L.contains(LatchExit) && "expected an exit block!");
+ const DataLayout &DL = Preheader->getModule()->getDataLayout();
+ SCEVExpander Expander(SE, DL, "irce");
+ Instruction *Ins = Preheader->getTerminator();
+
+ if (FixedRightSCEV)
+ RightValue =
+ Expander.expandCodeFor(FixedRightSCEV, FixedRightSCEV->getType(), Ins);
+
+ Value *IndVarStartV = Expander.expandCodeFor(IndVarStart, IndVarTy, Ins);
+ IndVarStartV->setName("indvar.start");
+
+ LoopStructure Result;
+
+ Result.Tag = "main";
+ Result.Header = Header;
+ Result.Latch = Latch;
+ Result.LatchBr = LatchBr;
+ Result.LatchExit = LatchExit;
+ Result.LatchBrExitIdx = LatchBrExitIdx;
+ Result.IndVarStart = IndVarStartV;
+ Result.IndVarStep = StepCI;
+ Result.IndVarBase = LeftValue;
+ Result.IndVarIncreasing = IsIncreasing;
+ Result.LoopExitAt = RightValue;
+ Result.IsSignedPredicate = IsSignedPredicate;
+
+ FailureReason = nullptr;
+
+ return Result;
+}
+
+/// If the type of \p S matches with \p Ty, return \p S. Otherwise, return
+/// signed or unsigned extension of \p S to type \p Ty.
+static const SCEV *NoopOrExtend(const SCEV *S, Type *Ty, ScalarEvolution &SE,
+ bool Signed) {
+ return Signed ? SE.getNoopOrSignExtend(S, Ty) : SE.getNoopOrZeroExtend(S, Ty);
+}
+
+Optional<LoopConstrainer::SubRanges>
+LoopConstrainer::calculateSubRanges(bool IsSignedPredicate) const {
+ IntegerType *Ty = cast<IntegerType>(LatchTakenCount->getType());
+
+ auto *RTy = cast<IntegerType>(Range.getType());
+
+ // We only support wide range checks and narrow latches.
+ if (!AllowNarrowLatchCondition && RTy != Ty)
+ return None;
+ if (RTy->getBitWidth() < Ty->getBitWidth())
+ return None;
+
+ LoopConstrainer::SubRanges Result;
+
+ // I think we can be more aggressive here and make this nuw / nsw if the
+ // addition that feeds into the icmp for the latch's terminating branch is nuw
+ // / nsw. In any case, a wrapping 2's complement addition is safe.
+ const SCEV *Start = NoopOrExtend(SE.getSCEV(MainLoopStructure.IndVarStart),
+ RTy, SE, IsSignedPredicate);
+ const SCEV *End = NoopOrExtend(SE.getSCEV(MainLoopStructure.LoopExitAt), RTy,
+ SE, IsSignedPredicate);
+
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+
+ // We compute `Smallest` and `Greatest` such that [Smallest, Greatest), or
+ // [Smallest, GreatestSeen] is the range of values the induction variable
+ // takes.
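+ //
+ // For an increasing induction variable running from Start to End, this is
+ // simply [Start, End) with GreatestSeen = End - 1; the decreasing case
+ // below needs extra care because of possible sign overflow.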
+
+ const SCEV *Smallest = nullptr, *Greatest = nullptr, *GreatestSeen = nullptr;
+
+ const SCEV *One = SE.getOne(RTy);
+ if (Increasing) {
+ Smallest = Start;
+ Greatest = End;
+ // No overflow, because the range [Smallest, GreatestSeen] is not empty.
+ GreatestSeen = SE.getMinusSCEV(End, One);
+ } else {
+ // These two computations may sign-overflow. Here is why that is okay:
+ //
+ // We know that the induction variable does not sign-overflow on any
+ // iteration except the last one, and it starts at `Start` and ends at
+ // `End`, decrementing by one every time.
+ //
+ // * if `Smallest` sign-overflows we know `End` is `INT_SMAX`. Since the
+ // induction variable is decreasing we know that the smallest value
+ // the loop body is actually executed with is `INT_SMIN` == `Smallest`.
+ //
+ // * if `Greatest` sign-overflows, we know it can only be `INT_SMIN`. In
+ // that case, `Clamp` will always return `Smallest` and
+ // [`Result.LowLimit`, `Result.HighLimit`) = [`Smallest`, `Smallest`)
+ // will be an empty range. Returning an empty range is always safe.
+
+ Smallest = SE.getAddExpr(End, One);
+ Greatest = SE.getAddExpr(Start, One);
+ GreatestSeen = Start;
+ }
+
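+ // Clamp `S` into the range [Smallest, Greatest], using signed or unsigned
+ // min/max expressions to match the signedness of the latch predicate.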
+ auto Clamp = [this, Smallest, Greatest, IsSignedPredicate](const SCEV *S) {
+ return IsSignedPredicate
+ ? SE.getSMaxExpr(Smallest, SE.getSMinExpr(Greatest, S))
+ : SE.getUMaxExpr(Smallest, SE.getUMinExpr(Greatest, S));
+ };
+
+ // In some cases we can prove that we don't need a pre or post loop.
+ ICmpInst::Predicate PredLE =
+ IsSignedPredicate ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_ULE;
+ ICmpInst::Predicate PredLT =
+ IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+
+ bool ProvablyNoPreloop =
+ SE.isKnownPredicate(PredLE, Range.getBegin(), Smallest);
+ if (!ProvablyNoPreloop)
+ Result.LowLimit = Clamp(Range.getBegin());
+
+ bool ProvablyNoPostLoop =
+ SE.isKnownPredicate(PredLT, GreatestSeen, Range.getEnd());
+ if (!ProvablyNoPostLoop)
+ Result.HighLimit = Clamp(Range.getEnd());
+
+ return Result;
+}
+
+void LoopConstrainer::cloneLoop(LoopConstrainer::ClonedLoop &Result,
+ const char *Tag) const {
+ for (BasicBlock *BB : OriginalLoop.getBlocks()) {
+ BasicBlock *Clone = CloneBasicBlock(BB, Result.Map, Twine(".") + Tag, &F);
+ Result.Blocks.push_back(Clone);
+ Result.Map[BB] = Clone;
+ }
+
+ auto GetClonedValue = [&Result](Value *V) {
+ assert(V && "null values not in domain!");
+ auto It = Result.Map.find(V);
+ if (It == Result.Map.end())
+ return V;
+ return static_cast<Value *>(It->second);
+ };
+
+ auto *ClonedLatch =
+ cast<BasicBlock>(GetClonedValue(OriginalLoop.getLoopLatch()));
+ ClonedLatch->getTerminator()->setMetadata(ClonedLoopTag,
+ MDNode::get(Ctx, {}));
+
+ Result.Structure = MainLoopStructure.map(GetClonedValue);
+ Result.Structure.Tag = Tag;
+
+ for (unsigned i = 0, e = Result.Blocks.size(); i != e; ++i) {
+ BasicBlock *ClonedBB = Result.Blocks[i];
+ BasicBlock *OriginalBB = OriginalLoop.getBlocks()[i];
+
+ assert(Result.Map[OriginalBB] == ClonedBB && "invariant!");
+
+ for (Instruction &I : *ClonedBB)
+ RemapInstruction(&I, Result.Map,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Exit blocks will now have one more predecessor and their PHI nodes need
+ // to be edited to reflect that. No phi nodes need to be introduced because
+ // the loop is in LCSSA.
+
+ for (auto *SBB : successors(OriginalBB)) {
+ if (OriginalLoop.contains(SBB))
+ continue; // not an exit block
+
+ for (PHINode &PN : SBB->phis()) {
+ Value *OldIncoming = PN.getIncomingValueForBlock(OriginalBB);
+ PN.addIncoming(GetClonedValue(OldIncoming), ClonedBB);
+ }
+ }
+ }
+}
+
+LoopConstrainer::RewrittenRangeInfo LoopConstrainer::changeIterationSpaceEnd(
+ const LoopStructure &LS, BasicBlock *Preheader, Value *ExitSubloopAt,
+ BasicBlock *ContinuationBlock) const {
+ // We start with a loop with a single latch:
+ //
+ // +--------------------+
+ // | |
+ // | preheader |
+ // | |
+ // +--------+-----------+
+ // | ----------------\
+ // | / |
+ // +--------v----v------+ |
+ // | | |
+ // | header | |
+ // | | |
+ // +--------------------+ |
+ // |
+ // ..... |
+ // |
+ // +--------------------+ |
+ // | | |
+ // | latch >----------/
+ // | |
+ // +-------v------------+
+ // |
+ // |
+ // | +--------------------+
+ // | | |
+ // +---> original exit |
+ // | |
+ // +--------------------+
+ //
+ // We change the control flow to look like
+ //
+ //
+ // +--------------------+
+ // | |
+ // | preheader >-------------------------+
+ // | | |
+ // +--------v-----------+ |
+ // | /-------------+ |
+ // | / | |
+ // +--------v--v--------+ | |
+ // | | | |
+ // | header | | +--------+ |
+ // | | | | | |
+ // +--------------------+ | | +-----v-----v-----------+
+ // | | | |
+ // | | | .pseudo.exit |
+ // | | | |
+ // | | +-----------v-----------+
+ // | | |
+ // ..... | | |
+ // | | +--------v-------------+
+ // +--------------------+ | | | |
+ // | | | | | ContinuationBlock |
+ // | latch >------+ | | |
+ // | | | +----------------------+
+ // +---------v----------+ |
+ // | |
+ // | |
+ // | +---------------^-----+
+ // | | |
+ // +-----> .exit.selector |
+ // | |
+ // +----------v----------+
+ // |
+ // +--------------------+ |
+ // | | |
+ // | original exit <----+
+ // | |
+ // +--------------------+
+
+ RewrittenRangeInfo RRI;
+
+ BasicBlock *BBInsertLocation = LS.Latch->getNextNode();
+ RRI.ExitSelector = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".exit.selector",
+ &F, BBInsertLocation);
+ RRI.PseudoExit = BasicBlock::Create(Ctx, Twine(LS.Tag) + ".pseudo.exit", &F,
+ BBInsertLocation);
+
+ BranchInst *PreheaderJump = cast<BranchInst>(Preheader->getTerminator());
+ bool Increasing = LS.IndVarIncreasing;
+ bool IsSignedPredicate = LS.IsSignedPredicate;
+
+ IRBuilder<> B(PreheaderJump);
+ auto *RangeTy = Range.getBegin()->getType();
+ auto NoopOrExt = [&](Value *V) {
+ if (V->getType() == RangeTy)
+ return V;
+ return IsSignedPredicate ? B.CreateSExt(V, RangeTy, "wide." + V->getName())
+ : B.CreateZExt(V, RangeTy, "wide." + V->getName());
+ };
+
+ // EnterLoopCond - is it okay to start executing this `LS'?
+ Value *EnterLoopCond = nullptr;
+ auto Pred =
+ Increasing
+ ? (IsSignedPredicate ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT)
+ : (IsSignedPredicate ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
+ Value *IndVarStart = NoopOrExt(LS.IndVarStart);
+ EnterLoopCond = B.CreateICmp(Pred, IndVarStart, ExitSubloopAt);
+
+ B.CreateCondBr(EnterLoopCond, LS.Header, RRI.PseudoExit);
+ PreheaderJump->eraseFromParent();
+
+ LS.LatchBr->setSuccessor(LS.LatchBrExitIdx, RRI.ExitSelector);
+ B.SetInsertPoint(LS.LatchBr);
+ Value *IndVarBase = NoopOrExt(LS.IndVarBase);
+ Value *TakeBackedgeLoopCond = B.CreateICmp(Pred, IndVarBase, ExitSubloopAt);
+
+ Value *CondForBranch = LS.LatchBrExitIdx == 1
+ ? TakeBackedgeLoopCond
+ : B.CreateNot(TakeBackedgeLoopCond);
+
+ LS.LatchBr->setCondition(CondForBranch);
+
+ B.SetInsertPoint(RRI.ExitSelector);
+
+ // IterationsLeft - are there any more iterations left, given the original
+ // upper bound on the induction variable? If not, we branch to the "real"
+ // exit.
+ Value *LoopExitAt = NoopOrExt(LS.LoopExitAt);
+ Value *IterationsLeft = B.CreateICmp(Pred, IndVarBase, LoopExitAt);
+ B.CreateCondBr(IterationsLeft, RRI.PseudoExit, LS.LatchExit);
+
+ BranchInst *BranchToContinuation =
+ BranchInst::Create(ContinuationBlock, RRI.PseudoExit);
+
+ // We emit PHI nodes into `RRI.PseudoExit' that compute the "latest" value of
+ // each of the PHI nodes in the loop header. This feeds into the initial
+ // value of the same PHI nodes if/when we continue execution.
+ for (PHINode &PN : LS.Header->phis()) {
+ PHINode *NewPHI = PHINode::Create(PN.getType(), 2, PN.getName() + ".copy",
+ BranchToContinuation);
+
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(Preheader), Preheader);
+ NewPHI->addIncoming(PN.getIncomingValueForBlock(LS.Latch),
+ RRI.ExitSelector);
+ RRI.PHIValuesAtPseudoExit.push_back(NewPHI);
+ }
+
+ RRI.IndVarEnd = PHINode::Create(IndVarBase->getType(), 2, "indvar.end",
+ BranchToContinuation);
+ RRI.IndVarEnd->addIncoming(IndVarStart, Preheader);
+ RRI.IndVarEnd->addIncoming(IndVarBase, RRI.ExitSelector);
+
+ // The latch exit now has a branch from `RRI.ExitSelector' instead of
+ // `LS.Latch'. The PHI nodes need to be updated to reflect that.
+ LS.LatchExit->replacePhiUsesWith(LS.Latch, RRI.ExitSelector);
+
+ return RRI;
+}
+
+void LoopConstrainer::rewriteIncomingValuesForPHIs(
+ LoopStructure &LS, BasicBlock *ContinuationBlock,
+ const LoopConstrainer::RewrittenRangeInfo &RRI) const {
+ unsigned PHIIndex = 0;
+ for (PHINode &PN : LS.Header->phis())
+ PN.setIncomingValueForBlock(ContinuationBlock,
+ RRI.PHIValuesAtPseudoExit[PHIIndex++]);
+
+ LS.IndVarStart = RRI.IndVarEnd;
+}
+
+BasicBlock *LoopConstrainer::createPreheader(const LoopStructure &LS,
+ BasicBlock *OldPreheader,
+ const char *Tag) const {
+ BasicBlock *Preheader = BasicBlock::Create(Ctx, Tag, &F, LS.Header);
+ BranchInst::Create(LS.Header, Preheader);
+
+ LS.Header->replacePhiUsesWith(OldPreheader, Preheader);
+
+ return Preheader;
+}
+
+void LoopConstrainer::addToParentLoopIfNeeded(ArrayRef<BasicBlock *> BBs) {
+ Loop *ParentLoop = OriginalLoop.getParentLoop();
+ if (!ParentLoop)
+ return;
+
+ for (BasicBlock *BB : BBs)
+ ParentLoop->addBasicBlockToLoop(BB, LI);
+}
+
+Loop *LoopConstrainer::createClonedLoopStructure(Loop *Original, Loop *Parent,
+ ValueToValueMapTy &VM,
+ bool IsSubloop) {
+ Loop &New = *LI.AllocateLoop();
+ if (Parent)
+ Parent->addChildLoop(&New);
+ else
+ LI.addTopLevelLoop(&New);
+ LPMAddNewLoop(&New, IsSubloop);
+
+ // Add all of the blocks in Original to the new loop.
+ for (auto *BB : Original->blocks())
+ if (LI.getLoopFor(BB) == Original)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[BB]), LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *SubLoop : *Original)
+ createClonedLoopStructure(SubLoop, &New, VM, /* IsSubloop */ true);
+
+ return &New;
+}
+
+bool LoopConstrainer::run() {
+ BasicBlock *Preheader = nullptr;
+ LatchTakenCount = SE.getExitCount(&OriginalLoop, MainLoopStructure.Latch);
+ Preheader = OriginalLoop.getLoopPreheader();
+ assert(!isa<SCEVCouldNotCompute>(LatchTakenCount) && Preheader != nullptr &&
+ "preconditions!");
+
+ OriginalPreheader = Preheader;
+ MainLoopPreheader = Preheader;
+
+ bool IsSignedPredicate = MainLoopStructure.IsSignedPredicate;
+ Optional<SubRanges> MaybeSR = calculateSubRanges(IsSignedPredicate);
+ if (!MaybeSR.hasValue()) {
+ LLVM_DEBUG(dbgs() << "irce: could not compute subranges\n");
+ return false;
+ }
+
+ SubRanges SR = MaybeSR.getValue();
+ bool Increasing = MainLoopStructure.IndVarIncreasing;
+ IntegerType *IVTy =
+ cast<IntegerType>(Range.getBegin()->getType());
+
+ SCEVExpander Expander(SE, F.getParent()->getDataLayout(), "irce");
+ Instruction *InsertPt = OriginalPreheader->getTerminator();
+
+ // It would have been better to make `PreLoop' and `PostLoop'
+ // `Optional<ClonedLoop>'s, but `ValueToValueMapTy' does not have a copy
+ // constructor.
+ ClonedLoop PreLoop, PostLoop;
+ bool NeedsPreLoop =
+ Increasing ? SR.LowLimit.hasValue() : SR.HighLimit.hasValue();
+ bool NeedsPostLoop =
+ Increasing ? SR.HighLimit.hasValue() : SR.LowLimit.hasValue();
+
+ Value *ExitPreLoopAt = nullptr;
+ Value *ExitMainLoopAt = nullptr;
+ const SCEVConstant *MinusOneS =
+ cast<SCEVConstant>(SE.getConstant(IVTy, -1, true /* isSigned */));
+
+ if (NeedsPreLoop) {
+ const SCEV *ExitPreLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitPreLoopAtSCEV = *SR.LowLimit;
+ else if (cannotBeMinInLoop(*SR.HighLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitPreLoopAtSCEV = SE.getAddExpr(*SR.HighLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "preloop exit limit. HighLimit = "
+ << *(*SR.HighLimit) << "\n");
+ return false;
+ }
+
+ if (!isSafeToExpandAt(ExitPreLoopAtSCEV, InsertPt, SE)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " preloop exit limit " << *ExitPreLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
+ return false;
+ }
+
+ ExitPreLoopAt = Expander.expandCodeFor(ExitPreLoopAtSCEV, IVTy, InsertPt);
+ ExitPreLoopAt->setName("exit.preloop.at");
+ }
+
+ if (NeedsPostLoop) {
+ const SCEV *ExitMainLoopAtSCEV = nullptr;
+
+ if (Increasing)
+ ExitMainLoopAtSCEV = *SR.HighLimit;
+ else if (cannotBeMinInLoop(*SR.LowLimit, &OriginalLoop, SE,
+ IsSignedPredicate))
+ ExitMainLoopAtSCEV = SE.getAddExpr(*SR.LowLimit, MinusOneS);
+ else {
+ LLVM_DEBUG(dbgs() << "irce: could not prove no-overflow when computing "
+ << "mainloop exit limit. LowLimit = "
+ << *(*SR.LowLimit) << "\n");
+ return false;
+ }
+
+ if (!isSafeToExpandAt(ExitMainLoopAtSCEV, InsertPt, SE)) {
+ LLVM_DEBUG(dbgs() << "irce: could not prove that it is safe to expand the"
+ << " main loop exit limit " << *ExitMainLoopAtSCEV
+ << " at block " << InsertPt->getParent()->getName()
+ << "\n");
+ return false;
+ }
+
+ ExitMainLoopAt = Expander.expandCodeFor(ExitMainLoopAtSCEV, IVTy, InsertPt);
+ ExitMainLoopAt->setName("exit.mainloop.at");
+ }
+
+ // We clone these ahead of time so that we don't have to deal with changing
+ // and temporarily invalid IR as we transform the loops.
+ if (NeedsPreLoop)
+ cloneLoop(PreLoop, "preloop");
+ if (NeedsPostLoop)
+ cloneLoop(PostLoop, "postloop");
+
+ RewrittenRangeInfo PreLoopRRI;
+
+ if (NeedsPreLoop) {
+ Preheader->getTerminator()->replaceUsesOfWith(MainLoopStructure.Header,
+ PreLoop.Structure.Header);
+
+ MainLoopPreheader =
+ createPreheader(MainLoopStructure, Preheader, "mainloop");
+ PreLoopRRI = changeIterationSpaceEnd(PreLoop.Structure, Preheader,
+ ExitPreLoopAt, MainLoopPreheader);
+ rewriteIncomingValuesForPHIs(MainLoopStructure, MainLoopPreheader,
+ PreLoopRRI);
+ }
+
+ BasicBlock *PostLoopPreheader = nullptr;
+ RewrittenRangeInfo PostLoopRRI;
+
+ if (NeedsPostLoop) {
+ PostLoopPreheader =
+ createPreheader(PostLoop.Structure, Preheader, "postloop");
+ PostLoopRRI = changeIterationSpaceEnd(MainLoopStructure, MainLoopPreheader,
+ ExitMainLoopAt, PostLoopPreheader);
+ rewriteIncomingValuesForPHIs(PostLoop.Structure, PostLoopPreheader,
+ PostLoopRRI);
+ }
+
+ BasicBlock *NewMainLoopPreheader =
+ MainLoopPreheader != Preheader ? MainLoopPreheader : nullptr;
+ BasicBlock *NewBlocks[] = {PostLoopPreheader, PreLoopRRI.PseudoExit,
+ PreLoopRRI.ExitSelector, PostLoopRRI.PseudoExit,
+ PostLoopRRI.ExitSelector, NewMainLoopPreheader};
+
+ // Some of the above may be nullptr, filter them out before passing to
+ // addToParentLoopIfNeeded.
+ auto NewBlocksEnd =
+ std::remove(std::begin(NewBlocks), std::end(NewBlocks), nullptr);
+
+ addToParentLoopIfNeeded(makeArrayRef(std::begin(NewBlocks), NewBlocksEnd));
+
+ DT.recalculate(F);
+
+ // We need to first add all the pre and post loop blocks into the loop
+ // structures (as part of createClonedLoopStructure), and then update the
+ // LCSSA form and LoopSimplifyForm. This is necessary for correctly updating
+ // LI when LoopSimplifyForm is generated.
+ Loop *PreL = nullptr, *PostL = nullptr;
+ if (!PreLoop.Blocks.empty()) {
+ PreL = createClonedLoopStructure(&OriginalLoop,
+ OriginalLoop.getParentLoop(), PreLoop.Map,
+ /* IsSubLoop */ false);
+ }
+
+ if (!PostLoop.Blocks.empty()) {
+ PostL =
+ createClonedLoopStructure(&OriginalLoop, OriginalLoop.getParentLoop(),
+ PostLoop.Map, /* IsSubLoop */ false);
+ }
+
+ // This function canonicalizes the loop into Loop-Simplify and LCSSA forms.
+ auto CanonicalizeLoop = [&] (Loop *L, bool IsOriginalLoop) {
+ formLCSSARecursively(*L, DT, &LI, &SE);
+ simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr, true);
+ // Pre/post loops are slow paths, we do not need to perform any loop
+ // optimizations on them.
+ if (!IsOriginalLoop)
+ DisableAllLoopOptsOnLoop(*L);
+ };
+ if (PreL)
+ CanonicalizeLoop(PreL, false);
+ if (PostL)
+ CanonicalizeLoop(PostL, false);
+ CanonicalizeLoop(&OriginalLoop, true);
+
+ return true;
+}
+
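To make the effect of the split concrete, here is a small illustrative sketch in plain C++ (not code from this diff; the array, bounds, and names are invented). The main loop covers only the iterations where the range check provably passes; a cloned post-loop keeps the check for the rest. Only a post-loop is shown because the safe range here starts at 0.

#include <algorithm>
#include <cstdio>

// Hypothetical original loop: every iteration pays for the range check.
void original(const int *a, int n, int len) {
  for (int i = 0; i < n; ++i) {
    if (i < 0 || i >= len)        // inductive range check
      return;                     // slow path (throw/deoptimize in real code)
    std::printf("%d\n", a[i]);
  }
}

// IRCE-style split: the main loop is check-free up to exit.mainloop.at,
// and the cloned post-loop keeps the check for the remaining iterations.
void constrained(const int *a, int n, int len) {
  int exit_mainloop_at = std::min(n, std::max(len, 0));
  int i = 0;
  for (; i < exit_mainloop_at; ++i)  // main loop: check elided
    std::printf("%d\n", a[i]);
  for (; i < n; ++i) {               // post-loop: slow path preserved
    if (i < 0 || i >= len)
      return;
    std::printf("%d\n", a[i]);
  }
}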
+/// Computes and returns a range of values for the induction variable (IndVar)
+/// in which the range check can be safely elided. If it cannot compute such a
+/// range, returns None.
+Optional<InductiveRangeCheck::Range>
+InductiveRangeCheck::computeSafeIterationSpace(
+ ScalarEvolution &SE, const SCEVAddRecExpr *IndVar,
+ bool IsLatchSigned) const {
+  // We can handle the case when the types of the latch check and the range
+  // checks don't match, as long as the latch check type is narrower.
+ auto *IVType = cast<IntegerType>(IndVar->getType());
+ auto *RCType = cast<IntegerType>(getBegin()->getType());
+ if (IVType->getBitWidth() > RCType->getBitWidth())
+ return None;
+ // IndVar is of the form "A + B * I" (where "I" is the canonical induction
+ // variable, that may or may not exist as a real llvm::Value in the loop) and
+ // this inductive range check is a range check on the "C + D * I" ("C" is
+ // getBegin() and "D" is getStep()). We rewrite the value being range
+ // checked to "M + N * IndVar" where "N" = "D * B^(-1)" and "M" = "C - NA".
+ //
+ // The actual inequalities we solve are of the form
+ //
+ // 0 <= M + 1 * IndVar < L given L >= 0 (i.e. N == 1)
+ //
+ // Here L stands for upper limit of the safe iteration space.
+ // The inequality is satisfied by (0 - M) <= IndVar < (L - M). To avoid
+ // overflows when calculating (0 - M) and (L - M) we, depending on type of
+ // IV's iteration space, limit the calculations by borders of the iteration
+ // space. For example, if IndVar is unsigned, (0 - M) overflows for any M > 0.
+ // If we figured out that "anything greater than (-M) is safe", we strengthen
+ // this to "everything greater than 0 is safe", assuming that values between
+ // -M and 0 just do not exist in unsigned iteration space, and we don't want
+ // to deal with overflown values.
+
+ if (!IndVar->isAffine())
+ return None;
+
+ const SCEV *A = NoopOrExtend(IndVar->getStart(), RCType, SE, IsLatchSigned);
+ const SCEVConstant *B = dyn_cast<SCEVConstant>(
+ NoopOrExtend(IndVar->getStepRecurrence(SE), RCType, SE, IsLatchSigned));
+ if (!B)
+ return None;
+ assert(!B->isZero() && "Recurrence with zero step?");
+
+ const SCEV *C = getBegin();
+ const SCEVConstant *D = dyn_cast<SCEVConstant>(getStep());
+ if (D != B)
+ return None;
+
+ assert(!D->getValue()->isZero() && "Recurrence with zero step?");
+ unsigned BitWidth = RCType->getBitWidth();
+ const SCEV *SIntMax = SE.getConstant(APInt::getSignedMaxValue(BitWidth));
+
+ // Subtract Y from X so that it does not go through border of the IV
+ // iteration space. Mathematically, it is equivalent to:
+ //
+ // ClampedSubtract(X, Y) = min(max(X - Y, INT_MIN), INT_MAX). [1]
+ //
+ // In [1], 'X - Y' is a mathematical subtraction (result is not bounded to
+ // any width of bit grid). But after we take min/max, the result is
+ // guaranteed to be within [INT_MIN, INT_MAX].
+ //
+ // In [1], INT_MAX and INT_MIN are respectively signed and unsigned max/min
+ // values, depending on type of latch condition that defines IV iteration
+ // space.
+ auto ClampedSubtract = [&](const SCEV *X, const SCEV *Y) {
+ // FIXME: The current implementation assumes that X is in [0, SINT_MAX].
+ // This is required to ensure that SINT_MAX - X does not overflow signed and
+ // that X - Y does not overflow unsigned if Y is negative. Can we lift this
+    // restriction and make it work for negative X as well?
+ if (IsLatchSigned) {
+ // X is a number from signed range, Y is interpreted as signed.
+ // Even if Y is SINT_MAX, (X - Y) does not reach SINT_MIN. So the only
+ // thing we should care about is that we didn't cross SINT_MAX.
+ // So, if Y is positive, we subtract Y safely.
+ // Rule 1: Y > 0 ---> Y.
+ // If 0 <= -Y <= (SINT_MAX - X), we subtract Y safely.
+ // Rule 2: Y >=s (X - SINT_MAX) ---> Y.
+ // If 0 <= (SINT_MAX - X) < -Y, we can only subtract (X - SINT_MAX).
+ // Rule 3: Y <s (X - SINT_MAX) ---> (X - SINT_MAX).
+ // It gives us smax(Y, X - SINT_MAX) to subtract in all cases.
+ const SCEV *XMinusSIntMax = SE.getMinusSCEV(X, SIntMax);
+ return SE.getMinusSCEV(X, SE.getSMaxExpr(Y, XMinusSIntMax),
+ SCEV::FlagNSW);
+ } else
+ // X is a number from unsigned range, Y is interpreted as signed.
+ // Even if Y is SINT_MIN, (X - Y) does not reach UINT_MAX. So the only
+ // thing we should care about is that we didn't cross zero.
+ // So, if Y is negative, we subtract Y safely.
+ // Rule 1: Y <s 0 ---> Y.
+ // If 0 <= Y <= X, we subtract Y safely.
+ // Rule 2: Y <=s X ---> Y.
+ // If 0 <= X < Y, we should stop at 0 and can only subtract X.
+ // Rule 3: Y >s X ---> X.
+ // It gives us smin(X, Y) to subtract in all cases.
+ return SE.getMinusSCEV(X, SE.getSMinExpr(X, Y), SCEV::FlagNUW);
+ };
+ const SCEV *M = SE.getMinusSCEV(C, A);
+ const SCEV *Zero = SE.getZero(M->getType());
+
+  // This function returns a SCEV equal to 1 if X is non-negative, 0 otherwise.
+ auto SCEVCheckNonNegative = [&](const SCEV *X) {
+ const Loop *L = IndVar->getLoop();
+ const SCEV *One = SE.getOne(X->getType());
+ // Can we trivially prove that X is a non-negative or negative value?
+ if (isKnownNonNegativeInLoop(X, L, SE))
+ return One;
+ else if (isKnownNegativeInLoop(X, L, SE))
+ return Zero;
+ // If not, we will have to figure it out during the execution.
+ // Function smax(smin(X, 0), -1) + 1 equals to 1 if X >= 0 and 0 if X < 0.
+ const SCEV *NegOne = SE.getNegativeSCEV(One);
+ return SE.getAddExpr(SE.getSMaxExpr(SE.getSMinExpr(X, Zero), NegOne), One);
+ };
+ // FIXME: Current implementation of ClampedSubtract implicitly assumes that
+ // X is non-negative (in sense of a signed value). We need to re-implement
+ // this function in a way that it will correctly handle negative X as well.
+ // We use it twice: for X = 0 everything is fine, but for X = getEnd() we can
+ // end up with a negative X and produce wrong results. So currently we ensure
+ // that if getEnd() is negative then both ends of the safe range are zero.
+ // Note that this may pessimize elimination of unsigned range checks against
+ // negative values.
+ const SCEV *REnd = getEnd();
+ const SCEV *EndIsNonNegative = SCEVCheckNonNegative(REnd);
+
+ const SCEV *Begin = SE.getMulExpr(ClampedSubtract(Zero, M), EndIsNonNegative);
+ const SCEV *End = SE.getMulExpr(ClampedSubtract(REnd, M), EndIsNonNegative);
+ return InductiveRangeCheck::Range(Begin, End);
+}
+
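The ClampedSubtract formula [1] above is easy to check on concrete numbers. Below is a self-contained sketch of the signed case (illustrative only): it performs the mathematical subtraction in a wider type and then clamps to the 32-bit signed range.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Mathematical X - Y for 32-bit signed inputs, clamped to the 32-bit range,
// i.e. min(max(X - Y, INT_MIN), INT_MAX) from formula [1].
int32_t clampedSubtractSigned(int32_t X, int32_t Y) {
  int64_t Wide = static_cast<int64_t>(X) - static_cast<int64_t>(Y);
  Wide = std::clamp<int64_t>(Wide, INT32_MIN, INT32_MAX);
  return static_cast<int32_t>(Wide);
}

int main() {
  assert(clampedSubtractSigned(10, 3) == 7);
  // 10 - INT32_MIN overflows mathematically; the result saturates at INT32_MAX.
  assert(clampedSubtractSigned(10, INT32_MIN) == INT32_MAX);
}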
+static Optional<InductiveRangeCheck::Range>
+IntersectSignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ true) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getSMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getSMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ true))
+ return None;
+ return Ret;
+}
+
+static Optional<InductiveRangeCheck::Range>
+IntersectUnsignedRange(ScalarEvolution &SE,
+ const Optional<InductiveRangeCheck::Range> &R1,
+ const InductiveRangeCheck::Range &R2) {
+ if (R2.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ if (!R1.hasValue())
+ return R2;
+ auto &R1Value = R1.getValue();
+ // We never return empty ranges from this function, and R1 is supposed to be
+ // a result of intersection. Thus, R1 is never empty.
+ assert(!R1Value.isEmpty(SE, /* IsSigned */ false) &&
+ "We should never have empty R1!");
+
+ // TODO: we could widen the smaller range and have this work; but for now we
+ // bail out to keep things simple.
+ if (R1Value.getType() != R2.getType())
+ return None;
+
+ const SCEV *NewBegin = SE.getUMaxExpr(R1Value.getBegin(), R2.getBegin());
+ const SCEV *NewEnd = SE.getUMinExpr(R1Value.getEnd(), R2.getEnd());
+
+ // If the resulting range is empty, just return None.
+ auto Ret = InductiveRangeCheck::Range(NewBegin, NewEnd);
+ if (Ret.isEmpty(SE, /* IsSigned */ false))
+ return None;
+ return Ret;
+}
+
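Both intersection helpers reduce to the same max-of-begins, min-of-ends pattern and differ only in using signed or unsigned comparisons. On plain unsigned half-open ranges the idea looks like this (a hedged sketch with invented types, not the SCEV-based code above):

#include <algorithm>
#include <cstdint>
#include <optional>

struct Range { uint64_t Begin, End; };   // half-open [Begin, End)

// Returns the intersection, or std::nullopt if it is empty, mirroring the
// "return None on an empty result" convention used above.
std::optional<Range> intersect(const Range &A, const Range &B) {
  Range R{std::max(A.Begin, B.Begin), std::min(A.End, B.End)};
  if (R.Begin >= R.End)
    return std::nullopt;
  return R;
}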
+PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+
// Get BFI analysis result on demand. Please note that modification of
// CFG invalidates this analysis and we should handle it.
auto getBFI = [&F, &AM ]()->BlockFrequencyInfo & {
return AM.getResult<BlockFrequencyAnalysis>(F);
};
InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI, { getBFI });
-
- bool Changed = false;
+
+ bool Changed = false;
{
bool CFGChanged = false;
for (const auto &L : LI) {
@@ -1784,65 +1784,65 @@ PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
}
Changed |= CFGChanged;
-
+
if (CFGChanged && !SkipProfitabilityChecks)
AM.invalidate<BlockFrequencyAnalysis>(F);
- }
-
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- auto LPMAddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
- if (!IsSubloop)
- appendLoopsToWorklist(*NL, Worklist);
- };
-
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ appendLoopsToWorklist(*NL, Worklist);
+ };
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
if (IRCE.run(L, LPMAddNewLoop)) {
Changed = true;
if (!SkipProfitabilityChecks)
AM.invalidate<BlockFrequencyAnalysis>(F);
}
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
-}
-
-bool IRCELegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
-
- bool Changed = false;
-
- for (const auto &L : LI) {
- Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
- /*PreserveLCSSA=*/false);
- Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- auto LPMAddNewLoop = [&](Loop *NL, bool IsSubloop) {
- if (!IsSubloop)
- appendLoopsToWorklist(*NL, Worklist);
- };
-
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
- Changed |= IRCE.run(L, LPMAddNewLoop);
- }
- return Changed;
-}
-
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ return getLoopPassPreservedAnalyses();
+}
+
+bool IRCELegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+
+ bool Changed = false;
+
+ for (const auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
+ /*PreserveLCSSA=*/false);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ appendLoopsToWorklist(*NL, Worklist);
+ };
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ Changed |= IRCE.run(L, LPMAddNewLoop);
+ }
+ return Changed;
+}
+
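Both entry points above drive the transform with the same worklist idiom: seed the worklist with every existing loop and let IRCE push newly created top-level loops back for another visit. Stripped of the LLVM types, the pattern is roughly the following (illustrative sketch with a stand-in Loop type):

#include <functional>
#include <vector>

struct Loop { /* stand-in for llvm::Loop */ };

bool processAllLoops(
    std::vector<Loop *> TopLevelLoops,
    const std::function<bool(Loop *, std::function<void(Loop *, bool)>)> &RunOnLoop) {
  std::vector<Loop *> Worklist(TopLevelLoops);   // seed with existing loops
  auto AddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
    if (!IsSubloop)                              // only top-level clones are revisited
      Worklist.push_back(NL);
  };
  bool Changed = false;
  while (!Worklist.empty()) {
    Loop *L = Worklist.back();
    Worklist.pop_back();
    Changed |= RunOnLoop(L, AddNewLoop);         // may append loops via AddNewLoop
  }
  return Changed;
}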
bool
InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L,
LoopStructure &LS) {
@@ -1874,118 +1874,118 @@ InductiveRangeCheckElimination::isProfitableToTransform(const Loop &L,
return true;
}
-bool InductiveRangeCheckElimination::run(
- Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
- if (L->getBlocks().size() >= LoopSizeCutoff) {
- LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
- return false;
- }
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) {
- LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
- return false;
- }
-
- LLVMContext &Context = Preheader->getContext();
- SmallVector<InductiveRangeCheck, 16> RangeChecks;
-
- for (auto BBI : L->getBlocks())
- if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
- InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
- RangeChecks);
-
- if (RangeChecks.empty())
- return false;
-
- auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
- OS << "irce: looking at loop "; L->print(OS);
- OS << "irce: loop has " << RangeChecks.size()
- << " inductive range checks: \n";
- for (InductiveRangeCheck &IRC : RangeChecks)
- IRC.print(OS);
- };
-
- LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
-
- if (PrintRangeChecks)
- PrintRecognizedRangeChecks(errs());
-
- const char *FailureReason = nullptr;
- Optional<LoopStructure> MaybeLoopStructure =
+bool InductiveRangeCheckElimination::run(
+ Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop) {
+ if (L->getBlocks().size() >= LoopSizeCutoff) {
+ LLVM_DEBUG(dbgs() << "irce: giving up constraining loop, too large\n");
+ return false;
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ LLVM_DEBUG(dbgs() << "irce: loop has no preheader, leaving\n");
+ return false;
+ }
+
+ LLVMContext &Context = Preheader->getContext();
+ SmallVector<InductiveRangeCheck, 16> RangeChecks;
+
+ for (auto BBI : L->getBlocks())
+ if (BranchInst *TBI = dyn_cast<BranchInst>(BBI->getTerminator()))
+ InductiveRangeCheck::extractRangeChecksFromBranch(TBI, L, SE, BPI,
+ RangeChecks);
+
+ if (RangeChecks.empty())
+ return false;
+
+ auto PrintRecognizedRangeChecks = [&](raw_ostream &OS) {
+ OS << "irce: looking at loop "; L->print(OS);
+ OS << "irce: loop has " << RangeChecks.size()
+ << " inductive range checks: \n";
+ for (InductiveRangeCheck &IRC : RangeChecks)
+ IRC.print(OS);
+ };
+
+ LLVM_DEBUG(PrintRecognizedRangeChecks(dbgs()));
+
+ if (PrintRangeChecks)
+ PrintRecognizedRangeChecks(errs());
+
+ const char *FailureReason = nullptr;
+ Optional<LoopStructure> MaybeLoopStructure =
LoopStructure::parseLoopStructure(SE, *L, FailureReason);
- if (!MaybeLoopStructure.hasValue()) {
- LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
- << FailureReason << "\n";);
- return false;
- }
- LoopStructure LS = MaybeLoopStructure.getValue();
+ if (!MaybeLoopStructure.hasValue()) {
+ LLVM_DEBUG(dbgs() << "irce: could not parse loop structure: "
+ << FailureReason << "\n";);
+ return false;
+ }
+ LoopStructure LS = MaybeLoopStructure.getValue();
if (!isProfitableToTransform(*L, LS))
return false;
- const SCEVAddRecExpr *IndVar =
- cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
-
- Optional<InductiveRangeCheck::Range> SafeIterRange;
- Instruction *ExprInsertPt = Preheader->getTerminator();
-
- SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
-  // Based on the type of the latch predicate, we interpret the IV iteration range
- // as signed or unsigned range. We use different min/max functions (signed or
- // unsigned) when intersecting this range with safe iteration ranges implied
- // by range checks.
- auto IntersectRange =
- LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
-
- IRBuilder<> B(ExprInsertPt);
- for (InductiveRangeCheck &IRC : RangeChecks) {
- auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
- LS.IsSignedPredicate);
- if (Result.hasValue()) {
- auto MaybeSafeIterRange =
- IntersectRange(SE, SafeIterRange, Result.getValue());
- if (MaybeSafeIterRange.hasValue()) {
- assert(
- !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
- "We should never return empty ranges!");
- RangeChecksToEliminate.push_back(IRC);
- SafeIterRange = MaybeSafeIterRange.getValue();
- }
- }
- }
-
- if (!SafeIterRange.hasValue())
- return false;
-
- LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
- SafeIterRange.getValue());
- bool Changed = LC.run();
-
- if (Changed) {
- auto PrintConstrainedLoopInfo = [L]() {
- dbgs() << "irce: in function ";
- dbgs() << L->getHeader()->getParent()->getName() << ": ";
- dbgs() << "constrained ";
- L->print(dbgs());
- };
-
- LLVM_DEBUG(PrintConstrainedLoopInfo());
-
- if (PrintChangedLoops)
- PrintConstrainedLoopInfo();
-
- // Optimize away the now-redundant range checks.
-
- for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
- ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
- ? ConstantInt::getTrue(Context)
- : ConstantInt::getFalse(Context);
- IRC.getCheckUse()->set(FoldedRangeCheck);
- }
- }
-
- return Changed;
-}
-
-Pass *llvm::createInductiveRangeCheckEliminationPass() {
- return new IRCELegacyPass();
-}
+ const SCEVAddRecExpr *IndVar =
+ cast<SCEVAddRecExpr>(SE.getMinusSCEV(SE.getSCEV(LS.IndVarBase), SE.getSCEV(LS.IndVarStep)));
+
+ Optional<InductiveRangeCheck::Range> SafeIterRange;
+ Instruction *ExprInsertPt = Preheader->getTerminator();
+
+ SmallVector<InductiveRangeCheck, 4> RangeChecksToEliminate;
+  // Based on the type of the latch predicate, we interpret the IV iteration range
+ // as signed or unsigned range. We use different min/max functions (signed or
+ // unsigned) when intersecting this range with safe iteration ranges implied
+ // by range checks.
+ auto IntersectRange =
+ LS.IsSignedPredicate ? IntersectSignedRange : IntersectUnsignedRange;
+
+ IRBuilder<> B(ExprInsertPt);
+ for (InductiveRangeCheck &IRC : RangeChecks) {
+ auto Result = IRC.computeSafeIterationSpace(SE, IndVar,
+ LS.IsSignedPredicate);
+ if (Result.hasValue()) {
+ auto MaybeSafeIterRange =
+ IntersectRange(SE, SafeIterRange, Result.getValue());
+ if (MaybeSafeIterRange.hasValue()) {
+ assert(
+ !MaybeSafeIterRange.getValue().isEmpty(SE, LS.IsSignedPredicate) &&
+ "We should never return empty ranges!");
+ RangeChecksToEliminate.push_back(IRC);
+ SafeIterRange = MaybeSafeIterRange.getValue();
+ }
+ }
+ }
+
+ if (!SafeIterRange.hasValue())
+ return false;
+
+ LoopConstrainer LC(*L, LI, LPMAddNewLoop, LS, SE, DT,
+ SafeIterRange.getValue());
+ bool Changed = LC.run();
+
+ if (Changed) {
+ auto PrintConstrainedLoopInfo = [L]() {
+ dbgs() << "irce: in function ";
+ dbgs() << L->getHeader()->getParent()->getName() << ": ";
+ dbgs() << "constrained ";
+ L->print(dbgs());
+ };
+
+ LLVM_DEBUG(PrintConstrainedLoopInfo());
+
+ if (PrintChangedLoops)
+ PrintConstrainedLoopInfo();
+
+ // Optimize away the now-redundant range checks.
+
+ for (InductiveRangeCheck &IRC : RangeChecksToEliminate) {
+ ConstantInt *FoldedRangeCheck = IRC.getPassingDirection()
+ ? ConstantInt::getTrue(Context)
+ : ConstantInt::getFalse(Context);
+ IRC.getCheckUse()->set(FoldedRangeCheck);
+ }
+ }
+
+ return Changed;
+}
+
+Pass *llvm::createInductiveRangeCheckEliminationPass() {
+ return new IRCELegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
index d8df431486..332eb10ac1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -1,171 +1,171 @@
-//===- InferAddressSpace.cpp - --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// CUDA C/C++ includes memory space designation as variable type qualifiers (such
-// as __global__ and __shared__). Knowing the space of a memory access allows
-// CUDA compilers to emit faster PTX loads and stores. For example, a load from
-// shared memory can be translated to `ld.shared` which is roughly 10% faster
-// than a generic `ld` on an NVIDIA Tesla K40c.
-//
-// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
-// compilers must infer the memory space of an address expression from
-// type-qualified variables.
-//
-// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
-// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
-// places only type-qualified variables in specific address spaces, and then
-// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
-// (so-called the generic address space) for other instructions to use.
-//
-// For example, the Clang translates the following CUDA code
-// __shared__ float a[10];
-// float v = a[i];
-// to
-// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
-// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
-// %v = load float, float* %1 ; emits ld.f32
-// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
-// redirected to %0 (the generic version of @a).
-//
-// The optimization implemented in this file propagates specific address spaces
-// from type-qualified variable declarations to its users. For example, it
-// optimizes the above IR to
-// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
-// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
-// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
-// codegen is able to emit ld.shared.f32 for %v.
-//
-// Address space inference works in two steps. First, it uses a data-flow
-// analysis to infer as many generic pointers as possible to point to only one
-// specific address space. In the above example, it can prove that %1 only
-// points to addrspace(3). This algorithm was published in
-// CUDA: Compiling and optimizing for a GPU platform
-// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
-// ICCS 2012
-//
-// Then, address space inference replaces all refinable generic pointers with
-// equivalent specific pointers.
-//
-// The major challenge of implementing this optimization is handling PHINodes,
-// which may create loops in the data flow graph. This brings two complications.
-//
-// First, the data flow analysis in Step 1 needs to be circular. For example,
-// %generic.input = addrspacecast float addrspace(3)* %input to float*
-// loop:
-// %y = phi [ %generic.input, %y2 ]
-// %y2 = getelementptr %y, 1
-// %v = load %y2
-// br ..., label %loop, ...
-// proving %y specific requires proving both %generic.input and %y2 specific,
-// but proving %y2 specific circles back to %y. To address this complication,
-// the data flow analysis operates on a lattice:
-// uninitialized > specific address spaces > generic.
-// All address expressions (our implementation only considers phi, bitcast,
-// addrspacecast, and getelementptr) start with the uninitialized address space.
-// The monotone transfer function moves the address space of a pointer down a
-// lattice path from uninitialized to specific and then to generic. A join
-// operation of two different specific address spaces pushes the expression down
-// to the generic address space. The analysis completes once it reaches a fixed
-// point.
-//
-// Second, IR rewriting in Step 2 also needs to be circular. For example,
-// converting %y to addrspace(3) requires the compiler to know the converted
-// %y2, but converting %y2 needs the converted %y. To address this complication,
-// we break these cycles using "undef" placeholders. When converting an
-// instruction `I` to a new address space, if its operand `Op` is not converted
-// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
-// For instance, our algorithm first converts %y to
-// %y' = phi float addrspace(3)* [ %input, undef ]
-// Then, it converts %y2 to
-// %y2' = getelementptr %y', 1
-// Finally, it fixes the undef in %y' so that
-// %y' = phi float addrspace(3)* [ %input, %y2' ]
-//
-//===----------------------------------------------------------------------===//
-
+//===- InferAddressSpace.cpp - --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// CUDA C/C++ includes memory space designation as variable type qualifiers (such
+// as __global__ and __shared__). Knowing the space of a memory access allows
+// CUDA compilers to emit faster PTX loads and stores. For example, a load from
+// shared memory can be translated to `ld.shared` which is roughly 10% faster
+// than a generic `ld` on an NVIDIA Tesla K40c.
+//
+// Unfortunately, type qualifiers only apply to variable declarations, so CUDA
+// compilers must infer the memory space of an address expression from
+// type-qualified variables.
+//
+// LLVM IR uses non-zero (so-called) specific address spaces to represent memory
+// spaces (e.g. addrspace(3) means shared memory). The Clang frontend
+// places only type-qualified variables in specific address spaces, and then
+// conservatively `addrspacecast`s each type-qualified variable to addrspace(0)
+// (so-called the generic address space) for other instructions to use.
+//
+// For example, the Clang translates the following CUDA code
+// __shared__ float a[10];
+// float v = a[i];
+// to
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float], [10 x float]* %0, i64 0, i64 %i
+// %v = load float, float* %1 ; emits ld.f32
+// @a is in addrspace(3) since it's type-qualified, but its use from %1 is
+// redirected to %0 (the generic version of @a).
+//
+// The optimization implemented in this file propagates specific address spaces
+// from type-qualified variable declarations to its users. For example, it
+// optimizes the above IR to
+// %1 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %v = load float addrspace(3)* %1 ; emits ld.shared.f32
+// propagating the addrspace(3) from @a to %1. As the result, the NVPTX
+// codegen is able to emit ld.shared.f32 for %v.
+//
+// Address space inference works in two steps. First, it uses a data-flow
+// analysis to infer as many generic pointers as possible to point to only one
+// specific address space. In the above example, it can prove that %1 only
+// points to addrspace(3). This algorithm was published in
+// CUDA: Compiling and optimizing for a GPU platform
+// Chakrabarti, Grover, Aarts, Kong, Kudlur, Lin, Marathe, Murphy, Wang
+// ICCS 2012
+//
+// Then, address space inference replaces all refinable generic pointers with
+// equivalent specific pointers.
+//
+// The major challenge of implementing this optimization is handling PHINodes,
+// which may create loops in the data flow graph. This brings two complications.
+//
+// First, the data flow analysis in Step 1 needs to be circular. For example,
+// %generic.input = addrspacecast float addrspace(3)* %input to float*
+// loop:
+// %y = phi [ %generic.input, %y2 ]
+// %y2 = getelementptr %y, 1
+// %v = load %y2
+// br ..., label %loop, ...
+// proving %y specific requires proving both %generic.input and %y2 specific,
+// but proving %y2 specific circles back to %y. To address this complication,
+// the data flow analysis operates on a lattice:
+// uninitialized > specific address spaces > generic.
+// All address expressions (our implementation only considers phi, bitcast,
+// addrspacecast, and getelementptr) start with the uninitialized address space.
+// The monotone transfer function moves the address space of a pointer down a
+// lattice path from uninitialized to specific and then to generic. A join
+// operation of two different specific address spaces pushes the expression down
+// to the generic address space. The analysis completes once it reaches a fixed
+// point.
+//
+// Second, IR rewriting in Step 2 also needs to be circular. For example,
+// converting %y to addrspace(3) requires the compiler to know the converted
+// %y2, but converting %y2 needs the converted %y. To address this complication,
+// we break these cycles using "undef" placeholders. When converting an
+// instruction `I` to a new address space, if its operand `Op` is not converted
+// yet, we let `I` temporarily use `undef` and fix all the uses of undef later.
+// For instance, our algorithm first converts %y to
+// %y' = phi float addrspace(3)* [ %input, undef ]
+// Then, it converts %y2 to
+// %y2' = getelementptr %y', 1
+// Finally, it fixes the undef in %y' so that
+// %y' = phi float addrspace(3)* [ %input, %y2' ]
+//
+//===----------------------------------------------------------------------===//
+
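The lattice sketched in the header comment (uninitialized > specific address spaces > generic) comes down to a small join operation. Here is a stand-alone illustration of that join; it reuses the pass's encoding of "uninitialized" as the largest unsigned value and, purely for the sketch, takes the flat space to be 0.

#include <cassert>
#include <limits>

constexpr unsigned FlatAS = 0;   // flat/generic space, assumed 0 for illustration
constexpr unsigned UninitAS = std::numeric_limits<unsigned>::max();

// Join of the inference lattice: uninitialized is the identity, equal
// specific spaces stay put, and any disagreement falls down to flat.
unsigned joinAS(unsigned A, unsigned B) {
  if (A == UninitAS) return B;
  if (B == UninitAS) return A;
  return A == B ? A : FlatAS;
}

int main() {
  assert(joinAS(UninitAS, 3) == 3);   // first evidence wins
  assert(joinAS(3, 3) == 3);          // agreement keeps the specific space
  assert(joinAS(3, 1) == FlatAS);     // disagreement pushes down to generic
}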
#include "llvm/Transforms/Scalar/InferAddressSpaces.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Operator.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <iterator>
-#include <limits>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "infer-address-spaces"
-
-using namespace llvm;
-
-static cl::opt<bool> AssumeDefaultIsFlatAddressSpace(
- "assume-default-is-flat-addrspace", cl::init(false), cl::ReallyHidden,
- cl::desc("The default address space is assumed as the flat address space. "
- "This is mainly for test purpose."));
-
-static const unsigned UninitializedAddressSpace =
- std::numeric_limits<unsigned>::max();
-
-namespace {
-
-using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
-using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
-
-class InferAddressSpaces : public FunctionPass {
- unsigned FlatAddrSpace = 0;
-
-public:
- static char ID;
-
- InferAddressSpaces() :
- FunctionPass(ID), FlatAddrSpace(UninitializedAddressSpace) {}
- InferAddressSpaces(unsigned AS) : FunctionPass(ID), FlatAddrSpace(AS) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "infer-address-spaces"
+
+using namespace llvm;
+
+static cl::opt<bool> AssumeDefaultIsFlatAddressSpace(
+ "assume-default-is-flat-addrspace", cl::init(false), cl::ReallyHidden,
+ cl::desc("The default address space is assumed as the flat address space. "
+ "This is mainly for test purpose."));
+
+static const unsigned UninitializedAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
+using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
+
+class InferAddressSpaces : public FunctionPass {
+ unsigned FlatAddrSpace = 0;
+
+public:
+ static char ID;
+
+ InferAddressSpaces() :
+ FunctionPass(ID), FlatAddrSpace(UninitializedAddressSpace) {}
+ InferAddressSpaces(unsigned AS) : FunctionPass(ID), FlatAddrSpace(AS) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
};
-
+
class InferAddressSpacesImpl {
const TargetTransformInfo *TTI = nullptr;
const DataLayout *DL = nullptr;
@@ -174,400 +174,400 @@ class InferAddressSpacesImpl {
/// possible.
unsigned FlatAddrSpace = 0;
- // Returns the new address space of V if updated; otherwise, returns None.
- Optional<unsigned>
- updateAddressSpace(const Value &V,
- const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
-
- // Tries to infer the specific address space of each address expression in
- // Postorder.
- void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const;
-
- bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
-
- Value *cloneInstructionWithNewAddressSpace(
- Instruction *I, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const;
-
- // Changes the flat address expressions in function F to point to specific
- // address spaces if InferredAddrSpace says so. Postorder is the postorder of
- // all flat expressions in the use-def graph of function F.
- bool rewriteWithNewAddressSpaces(
- const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
-
- void appendsFlatAddressExpressionToPostorderStack(
- Value *V, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const;
-
- bool rewriteIntrinsicOperands(IntrinsicInst *II,
- Value *OldV, Value *NewV) const;
- void collectRewritableIntrinsicOperands(IntrinsicInst *II,
- PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const;
-
- std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
-
- Value *cloneValueWithNewAddressSpace(
- Value *V, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const;
- unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
+ // Returns the new address space of V if updated; otherwise, returns None.
+ Optional<unsigned>
+ updateAddressSpace(const Value &V,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace) const;
+
+ // Tries to infer the specific address space of each address expression in
+ // Postorder.
+ void inferAddressSpaces(ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const;
+
+ bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
+ Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+
+ // Changes the flat address expressions in function F to point to specific
+ // address spaces if InferredAddrSpace says so. Postorder is the postorder of
+ // all flat expressions in the use-def graph of function F.
+ bool rewriteWithNewAddressSpaces(
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
+
+ void appendsFlatAddressExpressionToPostorderStack(
+ Value *V, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ bool rewriteIntrinsicOperands(IntrinsicInst *II,
+ Value *OldV, Value *NewV) const;
+ void collectRewritableIntrinsicOperands(IntrinsicInst *II,
+ PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
+
+ std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
+
+ Value *cloneValueWithNewAddressSpace(
+ Value *V, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+ unsigned joinAddressSpaces(unsigned AS1, unsigned AS2) const;
public:
InferAddressSpacesImpl(const TargetTransformInfo *TTI, unsigned FlatAddrSpace)
: TTI(TTI), FlatAddrSpace(FlatAddrSpace) {}
bool run(Function &F);
-};
-
-} // end anonymous namespace
-
-char InferAddressSpaces::ID = 0;
-
-namespace llvm {
-
-void initializeInferAddressSpacesPass(PassRegistry &);
-
-} // end namespace llvm
-
-INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
- false, false)
-
-// Check whether that's a no-op pointer bitcast using a pair of
-// `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over
-// different address spaces.
-static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- assert(I2P->getOpcode() == Instruction::IntToPtr);
- auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
- if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
- return false;
- // Check it's really safe to treat that pair of `ptrtoint`/`inttoptr` as a
- // no-op cast. Besides checking both of them are no-op casts, as the
- // reinterpreted pointer may be used in other pointer arithmetic, we also
- // need to double-check that through the target-specific hook. That ensures
- // the underlying target also agrees that's a no-op address space cast and
- // pointer bits are preserved.
- // The current IR spec doesn't have clear rules on address space casts,
- // especially a clear definition for pointer bits in non-default address
- // spaces. It would be undefined if that pointer is dereferenced after an
- // invalid reinterpret cast. Also, due to the unclearness for the meaning of
- // bits in non-default address spaces in the current spec, the pointer
- // arithmetic may also be undefined after invalid pointer reinterpret cast.
- // However, as we confirm through the target hooks that it's a no-op
- // addrspacecast, it doesn't matter since the bits should be the same.
- return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
- I2P->getOperand(0)->getType(), I2P->getType(),
- DL) &&
- CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
- P2I->getOperand(0)->getType(), P2I->getType(),
- DL) &&
- TTI->isNoopAddrSpaceCast(
- P2I->getOperand(0)->getType()->getPointerAddressSpace(),
- I2P->getType()->getPointerAddressSpace());
-}
-
-// Returns true if V is an address expression.
-// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
-// getelementptr operators.
-static bool isAddressExpression(const Value &V, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- const Operator *Op = dyn_cast<Operator>(&V);
- if (!Op)
- return false;
-
- switch (Op->getOpcode()) {
- case Instruction::PHI:
- assert(Op->getType()->isPointerTy());
- return true;
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return true;
- case Instruction::Select:
- return Op->getType()->isPointerTy();
- case Instruction::Call: {
- const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
- return II && II->getIntrinsicID() == Intrinsic::ptrmask;
- }
- case Instruction::IntToPtr:
- return isNoopPtrIntCastPair(Op, DL, TTI);
- default:
+};
+
+} // end anonymous namespace
+
+char InferAddressSpaces::ID = 0;
+
+namespace llvm {
+
+void initializeInferAddressSpacesPass(PassRegistry &);
+
+} // end namespace llvm
+
+INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
+ false, false)
+
+// Check whether that's a no-op pointer bitcast using a pair of
+// `ptrtoint`/`inttoptr` due to the missing no-op pointer bitcast over
+// different address spaces.
+static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ assert(I2P->getOpcode() == Instruction::IntToPtr);
+ auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
+ if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
+ return false;
+ // Check it's really safe to treat that pair of `ptrtoint`/`inttoptr` as a
+ // no-op cast. Besides checking both of them are no-op casts, as the
+ // reinterpreted pointer may be used in other pointer arithmetic, we also
+ // need to double-check that through the target-specific hook. That ensures
+ // the underlying target also agrees that's a no-op address space cast and
+ // pointer bits are preserved.
+ // The current IR spec doesn't have clear rules on address space casts,
+ // especially a clear definition for pointer bits in non-default address
+ // spaces. It would be undefined if that pointer is dereferenced after an
+ // invalid reinterpret cast. Also, due to the unclearness for the meaning of
+ // bits in non-default address spaces in the current spec, the pointer
+ // arithmetic may also be undefined after invalid pointer reinterpret cast.
+ // However, as we confirm through the target hooks that it's a no-op
+ // addrspacecast, it doesn't matter since the bits should be the same.
+ return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
+ I2P->getOperand(0)->getType(), I2P->getType(),
+ DL) &&
+ CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
+ P2I->getOperand(0)->getType(), P2I->getType(),
+ DL) &&
+ TTI->isNoopAddrSpaceCast(
+ P2I->getOperand(0)->getType()->getPointerAddressSpace(),
+ I2P->getType()->getPointerAddressSpace());
+}
+
+// Returns true if V is an address expression.
+// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
+// getelementptr operators.
+static bool isAddressExpression(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ const Operator *Op = dyn_cast<Operator>(&V);
+ if (!Op)
+ return false;
+
+ switch (Op->getOpcode()) {
+ case Instruction::PHI:
+ assert(Op->getType()->isPointerTy());
+ return true;
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return true;
+ case Instruction::Select:
+ return Op->getType()->isPointerTy();
+ case Instruction::Call: {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
+ return II && II->getIntrinsicID() == Intrinsic::ptrmask;
+ }
+ case Instruction::IntToPtr:
+ return isNoopPtrIntCastPair(Op, DL, TTI);
+ default:
// That value is an address expression if it has an assumed address space.
return TTI->getAssumedAddrSpace(&V) != UninitializedAddressSpace;
- }
-}
-
-// Returns the pointer operands of V.
-//
-// Precondition: V is an address expression.
-static SmallVector<Value *, 2>
-getPointerOperands(const Value &V, const DataLayout &DL,
- const TargetTransformInfo *TTI) {
- const Operator &Op = cast<Operator>(V);
- switch (Op.getOpcode()) {
- case Instruction::PHI: {
- auto IncomingValues = cast<PHINode>(Op).incoming_values();
- return SmallVector<Value *, 2>(IncomingValues.begin(),
- IncomingValues.end());
- }
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::GetElementPtr:
- return {Op.getOperand(0)};
- case Instruction::Select:
- return {Op.getOperand(1), Op.getOperand(2)};
- case Instruction::Call: {
- const IntrinsicInst &II = cast<IntrinsicInst>(Op);
- assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
- "unexpected intrinsic call");
- return {II.getArgOperand(0)};
- }
- case Instruction::IntToPtr: {
- assert(isNoopPtrIntCastPair(&Op, DL, TTI));
- auto *P2I = cast<Operator>(Op.getOperand(0));
- return {P2I->getOperand(0)};
- }
- default:
- llvm_unreachable("Unexpected instruction type.");
- }
-}
-
+ }
+}
+
+// Returns the pointer operands of V.
+//
+// Precondition: V is an address expression.
+static SmallVector<Value *, 2>
+getPointerOperands(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ const Operator &Op = cast<Operator>(V);
+ switch (Op.getOpcode()) {
+ case Instruction::PHI: {
+ auto IncomingValues = cast<PHINode>(Op).incoming_values();
+ return SmallVector<Value *, 2>(IncomingValues.begin(),
+ IncomingValues.end());
+ }
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ case Instruction::GetElementPtr:
+ return {Op.getOperand(0)};
+ case Instruction::Select:
+ return {Op.getOperand(1), Op.getOperand(2)};
+ case Instruction::Call: {
+ const IntrinsicInst &II = cast<IntrinsicInst>(Op);
+ assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
+ "unexpected intrinsic call");
+ return {II.getArgOperand(0)};
+ }
+ case Instruction::IntToPtr: {
+ assert(isNoopPtrIntCastPair(&Op, DL, TTI));
+ auto *P2I = cast<Operator>(Op.getOperand(0));
+ return {P2I->getOperand(0)};
+ }
+ default:
+ llvm_unreachable("Unexpected instruction type.");
+ }
+}
+
bool InferAddressSpacesImpl::rewriteIntrinsicOperands(IntrinsicInst *II,
Value *OldV,
Value *NewV) const {
- Module *M = II->getParent()->getParent()->getParent();
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::objectsize: {
- Type *DestTy = II->getType();
- Type *SrcTy = NewV->getType();
- Function *NewDecl =
- Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
- II->setArgOperand(0, NewV);
- II->setCalledFunction(NewDecl);
- return true;
- }
- case Intrinsic::ptrmask:
- // This is handled as an address expression, not as a use memory operation.
- return false;
- default: {
- Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
- if (!Rewrite)
- return false;
- if (Rewrite != II)
- II->replaceAllUsesWith(Rewrite);
- return true;
- }
- }
-}
-
+ Module *M = II->getParent()->getParent()->getParent();
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::objectsize: {
+ Type *DestTy = II->getType();
+ Type *SrcTy = NewV->getType();
+ Function *NewDecl =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+ II->setArgOperand(0, NewV);
+ II->setCalledFunction(NewDecl);
+ return true;
+ }
+ case Intrinsic::ptrmask:
+ // This is handled as an address expression, not as a use memory operation.
+ return false;
+ default: {
+ Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+ if (!Rewrite)
+ return false;
+ if (Rewrite != II)
+ II->replaceAllUsesWith(Rewrite);
+ return true;
+ }
+ }
+}
+
void InferAddressSpacesImpl::collectRewritableIntrinsicOperands(
- IntrinsicInst *II, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const {
- auto IID = II->getIntrinsicID();
- switch (IID) {
- case Intrinsic::ptrmask:
- case Intrinsic::objectsize:
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
- PostorderStack, Visited);
- break;
- default:
- SmallVector<int, 2> OpIndexes;
- if (TTI->collectFlatAddressOperands(OpIndexes, IID)) {
- for (int Idx : OpIndexes) {
- appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx),
- PostorderStack, Visited);
- }
- }
- break;
- }
-}
-
-// If V is an unvisited flat address expression, appends V to PostorderStack
-// and marks it as visited.
+ IntrinsicInst *II, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ auto IID = II->getIntrinsicID();
+ switch (IID) {
+ case Intrinsic::ptrmask:
+ case Intrinsic::objectsize:
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+ PostorderStack, Visited);
+ break;
+ default:
+ SmallVector<int, 2> OpIndexes;
+ if (TTI->collectFlatAddressOperands(OpIndexes, IID)) {
+ for (int Idx : OpIndexes) {
+ appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(Idx),
+ PostorderStack, Visited);
+ }
+ }
+ break;
+ }
+}
+
+// If V is an unvisited flat address expression, appends V to PostorderStack
+// and marks it as visited.
void InferAddressSpacesImpl::appendsFlatAddressExpressionToPostorderStack(
- Value *V, PostorderStackTy &PostorderStack,
- DenseSet<Value *> &Visited) const {
- assert(V->getType()->isPointerTy());
-
- // Generic addressing expressions may be hidden in nested constant
- // expressions.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
- // TODO: Look in non-address parts, like icmp operands.
- if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
- PostorderStack.emplace_back(CE, false);
-
- return;
- }
-
+ Value *V, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const {
+ assert(V->getType()->isPointerTy());
+
+ // Generic addressing expressions may be hidden in nested constant
+ // expressions.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
+ // TODO: Look in non-address parts, like icmp operands.
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+
+ return;
+ }
+
if (V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI)) {
- if (Visited.insert(V).second) {
- PostorderStack.emplace_back(V, false);
-
- Operator *Op = cast<Operator>(V);
- for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
- if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
- PostorderStack.emplace_back(CE, false);
- }
- }
- }
- }
-}
-
-// Returns all flat address expressions in function F. The elements are
-// ordered in postorder.
-std::vector<WeakTrackingVH>
+ if (Visited.insert(V).second) {
+ PostorderStack.emplace_back(V, false);
+
+ Operator *Op = cast<Operator>(V);
+ for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
+ }
+ }
+ }
+ }
+}
+
+// Returns all flat address expressions in function F. The elements are
+// ordered in postorder.
+std::vector<WeakTrackingVH>
InferAddressSpacesImpl::collectFlatAddressExpressions(Function &F) const {
- // This function implements a non-recursive postorder traversal of a partial
- // use-def graph of function F.
- PostorderStackTy PostorderStack;
- // The set of visited expressions.
- DenseSet<Value *> Visited;
-
- auto PushPtrOperand = [&](Value *Ptr) {
- appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
- Visited);
- };
-
-  // Look at operations that may be interesting to accelerate by moving them to
-  // a known address space. We mainly aim at loads and stores, but pure
-  // addressing calculations may also be faster.
- for (Instruction &I : instructions(F)) {
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (!GEP->getType()->isVectorTy())
- PushPtrOperand(GEP->getPointerOperand());
- } else if (auto *LI = dyn_cast<LoadInst>(&I))
- PushPtrOperand(LI->getPointerOperand());
- else if (auto *SI = dyn_cast<StoreInst>(&I))
- PushPtrOperand(SI->getPointerOperand());
- else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
- PushPtrOperand(RMW->getPointerOperand());
- else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
- PushPtrOperand(CmpX->getPointerOperand());
- else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
- // For memset/memcpy/memmove, any pointer operand can be replaced.
- PushPtrOperand(MI->getRawDest());
-
- // Handle 2nd operand for memcpy/memmove.
- if (auto *MTI = dyn_cast<MemTransferInst>(MI))
- PushPtrOperand(MTI->getRawSource());
- } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
- collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
- else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
- // FIXME: Handle vectors of pointers
- if (Cmp->getOperand(0)->getType()->isPointerTy()) {
- PushPtrOperand(Cmp->getOperand(0));
- PushPtrOperand(Cmp->getOperand(1));
- }
- } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (!ASC->getType()->isVectorTy())
- PushPtrOperand(ASC->getPointerOperand());
- } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
- if (isNoopPtrIntCastPair(cast<Operator>(I2P), *DL, TTI))
- PushPtrOperand(
- cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
- }
- }
-
- std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
- while (!PostorderStack.empty()) {
- Value *TopVal = PostorderStack.back().getPointer();
- // If the operands of the expression on the top are already explored,
- // adds that expression to the resultant postorder.
- if (PostorderStack.back().getInt()) {
- if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
- Postorder.push_back(TopVal);
- PostorderStack.pop_back();
- continue;
- }
- // Otherwise, adds its operands to the stack and explores them.
- PostorderStack.back().setInt(true);
+ // This function implements a non-recursive postorder traversal of a partial
+ // use-def graph of function F.
+ PostorderStackTy PostorderStack;
+ // The set of visited expressions.
+ DenseSet<Value *> Visited;
+
+ auto PushPtrOperand = [&](Value *Ptr) {
+ appendsFlatAddressExpressionToPostorderStack(Ptr, PostorderStack,
+ Visited);
+ };
+
+  // Look at operations that may be interesting to accelerate by moving them to
+  // a known address space. We mainly aim at loads and stores, but pure
+  // addressing calculations may also be faster.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (!GEP->getType()->isVectorTy())
+ PushPtrOperand(GEP->getPointerOperand());
+ } else if (auto *LI = dyn_cast<LoadInst>(&I))
+ PushPtrOperand(LI->getPointerOperand());
+ else if (auto *SI = dyn_cast<StoreInst>(&I))
+ PushPtrOperand(SI->getPointerOperand());
+ else if (auto *RMW = dyn_cast<AtomicRMWInst>(&I))
+ PushPtrOperand(RMW->getPointerOperand());
+ else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
+ PushPtrOperand(CmpX->getPointerOperand());
+ else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+ // For memset/memcpy/memmove, any pointer operand can be replaced.
+ PushPtrOperand(MI->getRawDest());
+
+ // Handle 2nd operand for memcpy/memmove.
+ if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+ PushPtrOperand(MTI->getRawSource());
+ } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ collectRewritableIntrinsicOperands(II, PostorderStack, Visited);
+ else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+ // FIXME: Handle vectors of pointers
+ if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+ PushPtrOperand(Cmp->getOperand(0));
+ PushPtrOperand(Cmp->getOperand(1));
+ }
+ } else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
+ if (!ASC->getType()->isVectorTy())
+ PushPtrOperand(ASC->getPointerOperand());
+ } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+ if (isNoopPtrIntCastPair(cast<Operator>(I2P), *DL, TTI))
+ PushPtrOperand(
+ cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
+ }
+ }
+
+ std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
+ while (!PostorderStack.empty()) {
+ Value *TopVal = PostorderStack.back().getPointer();
+ // If the operands of the expression on the top are already explored,
+ // adds that expression to the resultant postorder.
+ if (PostorderStack.back().getInt()) {
+ if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ Postorder.push_back(TopVal);
+ PostorderStack.pop_back();
+ continue;
+ }
+ // Otherwise, adds its operands to the stack and explores them.
+ PostorderStack.back().setInt(true);
// Skip values with an assumed address space.
if (TTI->getAssumedAddrSpace(TopVal) == UninitializedAddressSpace) {
for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
Visited);
}
- }
- }
- return Postorder;
-}
-
-// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
-// of OperandUse.get() in the new address space. If the clone is not ready yet,
-// returns an undef in the new address space as a placeholder.
-static Value *operandWithNewAddressSpaceOrCreateUndef(
- const Use &OperandUse, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
- Value *Operand = OperandUse.get();
-
- Type *NewPtrTy =
- Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (Constant *C = dyn_cast<Constant>(Operand))
- return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
-
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
- return NewOperand;
-
- UndefUsesToFix->push_back(&OperandUse);
- return UndefValue::get(NewPtrTy);
-}
-
-// Returns a clone of `I` with its operands converted to those specified in
-// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
-// operand whose address space needs to be modified might not exist in
-// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
-// adds that operand use to UndefUsesToFix so that caller can fix them later.
-//
-// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
-// from a pointer whose type already matches. Therefore, this function returns a
-// Value* instead of an Instruction*.
-//
-// This may also return nullptr in the case the instruction could not be
-// rewritten.
+ }
+ }
+ return Postorder;
+}
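The two-phase stack above (push a value unexplored, flip its flag, emit it on the second visit) is a standard way to obtain a postorder without recursion. Below is a minimal standalone sketch of the same idea over a toy successor list, using plain integers instead of LLVM values; all names and numbers are illustrative and not part of the patched sources.

#include <cstdio>
#include <unordered_set>
#include <utility>
#include <vector>

// Iterative postorder over a DAG, mirroring the PostorderStack used above:
// each stack entry carries an "operands already pushed" flag instead of
// relying on recursion.
static std::vector<int> postorder(const std::vector<std::vector<int>> &Succ,
                                  const std::vector<int> &Roots) {
  std::vector<std::pair<int, bool>> Stack; // (node, operands explored?)
  std::unordered_set<int> Visited;
  std::vector<int> Order;

  for (int R : Roots)
    if (Visited.insert(R).second)
      Stack.push_back({R, false});

  while (!Stack.empty()) {
    if (Stack.back().second) {      // operands already explored: emit node
      Order.push_back(Stack.back().first);
      Stack.pop_back();
      continue;
    }
    Stack.back().second = true;     // the next visit of this entry emits it
    int N = Stack.back().first;     // copy before push_back may reallocate
    for (int S : Succ[N])
      if (Visited.insert(S).second)
        Stack.push_back({S, false});
  }
  return Order;
}

int main() {
  // 0 -> {1, 2}, 1 -> {2}, 2 -> {}
  std::vector<std::vector<int>> Succ = {{1, 2}, {2}, {}};
  for (int N : postorder(Succ, {0}))
    std::printf("%d ", N);          // prints: 2 1 0
  std::printf("\n");
}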
+
+// A helper function for cloneInstructionWithNewAddressSpace. Returns the clone
+// of OperandUse.get() in the new address space. If the clone is not ready yet,
+// returns an undef in the new address space as a placeholder.
+static Value *operandWithNewAddressSpaceOrCreateUndef(
+ const Use &OperandUse, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ Value *Operand = OperandUse.get();
+
+ Type *NewPtrTy =
+ Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (Constant *C = dyn_cast<Constant>(Operand))
+ return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
+ return NewOperand;
+
+ UndefUsesToFix->push_back(&OperandUse);
+ return UndefValue::get(NewPtrTy);
+}
+
+// Returns a clone of `I` with its operands converted to those specified in
+// ValueWithNewAddrSpace. Due to potential cycles in the data flow graph, an
+// operand whose address space needs to be modified might not exist in
+// ValueWithNewAddrSpace. In that case, uses undef as a placeholder operand and
+// adds that operand use to UndefUsesToFix so that caller can fix them later.
+//
+// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
+// from a pointer whose type already matches. Therefore, this function returns a
+// Value* instead of an Instruction*.
+//
+// This may also return nullptr in the case the instruction could not be
+// rewritten.
Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
- Instruction *I, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) const {
- Type *NewPtrType =
- I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (I->getOpcode() == Instruction::AddrSpaceCast) {
- Value *Src = I->getOperand(0);
- // Because `I` is flat, the source address space must be specific.
- // Therefore, the inferred address space must be the source space, according
- // to our algorithm.
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- if (Src->getType() != NewPtrType)
- return new BitCastInst(Src, NewPtrType);
- return Src;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Technically the intrinsic ID is a pointer typed argument, so specially
- // handle calls early.
- assert(II->getIntrinsicID() == Intrinsic::ptrmask);
- Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
- II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
- UndefUsesToFix);
- Value *Rewrite =
- TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
- if (Rewrite) {
- assert(Rewrite != II && "cannot modify this pointer operation in place");
- return Rewrite;
- }
-
- return nullptr;
- }
-
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
+ Type *NewPtrType =
+ I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (I->getOpcode() == Instruction::AddrSpaceCast) {
+ Value *Src = I->getOperand(0);
+ // Because `I` is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space, according
+ // to our algorithm.
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // Technically the intrinsic ID is a pointer typed argument, so specially
+ // handle calls early.
+ assert(II->getIntrinsicID() == Intrinsic::ptrmask);
+ Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
+ II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
+ UndefUsesToFix);
+ Value *Rewrite =
+ TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
+ if (Rewrite) {
+ assert(Rewrite != II && "cannot modify this pointer operation in place");
+ return Rewrite;
+ }
+
+ return nullptr;
+ }
+
unsigned AS = TTI->getAssumedAddrSpace(I);
if (AS != UninitializedAddressSpace) {
// For the assumed address space, insert an `addrspacecast` to make that
@@ -578,295 +578,295 @@ Value *InferAddressSpacesImpl::cloneInstructionWithNewAddressSpace(
return NewI;
}
- // Computes the converted pointer operands.
- SmallVector<Value *, 4> NewPointerOperands;
- for (const Use &OperandUse : I->operands()) {
- if (!OperandUse.get()->getType()->isPointerTy())
- NewPointerOperands.push_back(nullptr);
- else
- NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
- OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
- }
-
- switch (I->getOpcode()) {
- case Instruction::BitCast:
- return new BitCastInst(NewPointerOperands[0], NewPtrType);
- case Instruction::PHI: {
- assert(I->getType()->isPointerTy());
- PHINode *PHI = cast<PHINode>(I);
- PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
- for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
- unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
- NewPHI->addIncoming(NewPointerOperands[OperandNo],
- PHI->getIncomingBlock(Index));
- }
- return NewPHI;
- }
- case Instruction::GetElementPtr: {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
- GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
- GEP->getSourceElementType(), NewPointerOperands[0],
+ // Computes the converted pointer operands.
+ SmallVector<Value *, 4> NewPointerOperands;
+ for (const Use &OperandUse : I->operands()) {
+ if (!OperandUse.get()->getType()->isPointerTy())
+ NewPointerOperands.push_back(nullptr);
+ else
+ NewPointerOperands.push_back(operandWithNewAddressSpaceOrCreateUndef(
+ OperandUse, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix));
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ return new BitCastInst(NewPointerOperands[0], NewPtrType);
+ case Instruction::PHI: {
+ assert(I->getType()->isPointerTy());
+ PHINode *PHI = cast<PHINode>(I);
+ PHINode *NewPHI = PHINode::Create(NewPtrType, PHI->getNumIncomingValues());
+ for (unsigned Index = 0; Index < PHI->getNumIncomingValues(); ++Index) {
+ unsigned OperandNo = PHINode::getOperandNumForIncomingValue(Index);
+ NewPHI->addIncoming(NewPointerOperands[OperandNo],
+ PHI->getIncomingBlock(Index));
+ }
+ return NewPHI;
+ }
+ case Instruction::GetElementPtr: {
+ GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
+ GEP->getSourceElementType(), NewPointerOperands[0],
SmallVector<Value *, 4>(GEP->indices()));
- NewGEP->setIsInBounds(GEP->isInBounds());
- return NewGEP;
- }
- case Instruction::Select:
- assert(I->getType()->isPointerTy());
- return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
- NewPointerOperands[2], "", nullptr, I);
- case Instruction::IntToPtr: {
- assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI));
- Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- if (Src->getType() != NewPtrType)
- return new BitCastInst(Src, NewPtrType);
- return Src;
- }
- default:
- llvm_unreachable("Unexpected opcode");
- }
-}
-
-// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
-// constant expression `CE` with its operands replaced as specified in
-// ValueWithNewAddrSpace.
-static Value *cloneConstantExprWithNewAddressSpace(
- ConstantExpr *CE, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
- const TargetTransformInfo *TTI) {
- Type *TargetType =
- CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
-
- if (CE->getOpcode() == Instruction::AddrSpaceCast) {
- // Because CE is flat, the source address space must be specific.
- // Therefore, the inferred address space must be the source space according
- // to our algorithm.
- assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
- NewAddrSpace);
- return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
- }
-
- if (CE->getOpcode() == Instruction::BitCast) {
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
- return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
- return ConstantExpr::getAddrSpaceCast(CE, TargetType);
- }
-
- if (CE->getOpcode() == Instruction::Select) {
- Constant *Src0 = CE->getOperand(1);
- Constant *Src1 = CE->getOperand(2);
- if (Src0->getType()->getPointerAddressSpace() ==
- Src1->getType()->getPointerAddressSpace()) {
-
- return ConstantExpr::getSelect(
- CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
- ConstantExpr::getAddrSpaceCast(Src1, TargetType));
- }
- }
-
- if (CE->getOpcode() == Instruction::IntToPtr) {
- assert(isNoopPtrIntCastPair(cast<Operator>(CE), *DL, TTI));
- Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
- assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
- return ConstantExpr::getBitCast(Src, TargetType);
- }
-
- // Computes the operands of the new constant expression.
- bool IsNew = false;
- SmallVector<Constant *, 4> NewOperands;
- for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
- Constant *Operand = CE->getOperand(Index);
- // If the address space of `Operand` needs to be modified, the new operand
- // with the new address space should already be in ValueWithNewAddrSpace
- // because (1) the constant expressions we consider (i.e. addrspacecast,
- // bitcast, and getelementptr) do not incur cycles in the data flow graph
- // and (2) this function is called on constant expressions in postorder.
- if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
- IsNew = true;
- NewOperands.push_back(cast<Constant>(NewOperand));
- continue;
- }
- if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
- if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
- CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
- IsNew = true;
- NewOperands.push_back(cast<Constant>(NewOperand));
- continue;
- }
- // Otherwise, reuses the old operand.
- NewOperands.push_back(Operand);
- }
-
- // If !IsNew, we will replace the Value with itself. However, replaced values
-  // are assumed to be wrapped in an addrspacecast later, so drop it now.
- if (!IsNew)
- return nullptr;
-
- if (CE->getOpcode() == Instruction::GetElementPtr) {
- // Needs to specify the source type while constructing a getelementptr
- // constant expression.
- return CE->getWithOperands(
- NewOperands, TargetType, /*OnlyIfReduced=*/false,
- NewOperands[0]->getType()->getPointerElementType());
- }
-
- return CE->getWithOperands(NewOperands, TargetType);
-}
-
-// Returns a clone of the value `V`, with its operands replaced as specified in
-// ValueWithNewAddrSpace. This function is called on every flat address
-// expression whose address space needs to be modified, in postorder.
-//
-// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ return NewGEP;
+ }
+ case Instruction::Select:
+ assert(I->getType()->isPointerTy());
+ return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+ NewPointerOperands[2], "", nullptr, I);
+ case Instruction::IntToPtr: {
+ assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI));
+ Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
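Because PHIs can make the use-def graph cyclic, an operand's clone may not exist yet when its user is cloned; the code above parks an undef there and records the use in UndefUsesToFix for a later patch-up. The same clone-with-placeholder-then-fix pattern is sketched standalone below on a toy node graph; the types and names are illustrative only, not LLVM API.

#include <cstddef>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Node {
  int Id;
  std::vector<Node *> Ops; // may form cycles, like PHIs in a loop
};

// Clone every node in Nodes. Operand edges whose clone is not ready yet get a
// null placeholder and are recorded for a second fix-up pass, in the same
// spirit as UndefUsesToFix above. Assumes every operand is itself in Nodes.
static std::vector<Node *> cloneAll(const std::vector<Node *> &Nodes) {
  std::unordered_map<Node *, Node *> CloneOf;
  struct Fixup { Node *Clone; std::size_t OpIdx; Node *OldOp; };
  std::vector<Fixup> Fixups;

  for (Node *N : Nodes) {
    Node *C = new Node{N->Id + 100, {}};
    for (std::size_t I = 0; I < N->Ops.size(); ++I) {
      auto It = CloneOf.find(N->Ops[I]);
      if (It != CloneOf.end()) {
        C->Ops.push_back(It->second);      // the clone already exists
      } else {
        C->Ops.push_back(nullptr);         // placeholder, fixed up below
        Fixups.push_back({C, I, N->Ops[I]});
      }
    }
    CloneOf[N] = C;
  }

  // Second pass: every placeholder can now be resolved.
  for (const Fixup &F : Fixups)
    F.Clone->Ops[F.OpIdx] = CloneOf.at(F.OldOp);

  std::vector<Node *> Clones;
  for (Node *N : Nodes)
    Clones.push_back(CloneOf.at(N));
  return Clones;
}

int main() {
  // Two nodes referencing each other, like two PHIs in a loop header.
  Node A{0, {}}, B{1, {}};
  A.Ops.push_back(&B);
  B.Ops.push_back(&A);
  std::vector<Node *> Clones = cloneAll({&A, &B});
  std::printf("clone of 0 points at %d\n", Clones[0]->Ops[0]->Id); // 101
  std::printf("clone of 1 points at %d\n", Clones[1]->Ops[0]->Id); // 100
  // The clones are intentionally leaked; this is a throwaway sketch.
}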
+
+// Similar to cloneInstructionWithNewAddressSpace, returns a clone of the
+// constant expression `CE` with its operands replaced as specified in
+// ValueWithNewAddrSpace.
+static Value *cloneConstantExprWithNewAddressSpace(
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
+ const TargetTransformInfo *TTI) {
+ Type *TargetType =
+ CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+ if (CE->getOpcode() == Instruction::AddrSpaceCast) {
+ // Because CE is flat, the source address space must be specific.
+ // Therefore, the inferred address space must be the source space according
+ // to our algorithm.
+ assert(CE->getOperand(0)->getType()->getPointerAddressSpace() ==
+ NewAddrSpace);
+ return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::BitCast) {
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+ return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+ return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+ }
+
+ if (CE->getOpcode() == Instruction::Select) {
+ Constant *Src0 = CE->getOperand(1);
+ Constant *Src1 = CE->getOperand(2);
+ if (Src0->getType()->getPointerAddressSpace() ==
+ Src1->getType()->getPointerAddressSpace()) {
+
+ return ConstantExpr::getSelect(
+ CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+ ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+ }
+ }
+
+ if (CE->getOpcode() == Instruction::IntToPtr) {
+ assert(isNoopPtrIntCastPair(cast<Operator>(CE), *DL, TTI));
+ Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ return ConstantExpr::getBitCast(Src, TargetType);
+ }
+
+ // Computes the operands of the new constant expression.
+ bool IsNew = false;
+ SmallVector<Constant *, 4> NewOperands;
+ for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
+ Constant *Operand = CE->getOperand(Index);
+ // If the address space of `Operand` needs to be modified, the new operand
+ // with the new address space should already be in ValueWithNewAddrSpace
+ // because (1) the constant expressions we consider (i.e. addrspacecast,
+ // bitcast, and getelementptr) do not incur cycles in the data flow graph
+ // and (2) this function is called on constant expressions in postorder.
+ if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ continue;
+ }
+ if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
+ if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
+ CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
+ IsNew = true;
+ NewOperands.push_back(cast<Constant>(NewOperand));
+ continue;
+ }
+ // Otherwise, reuses the old operand.
+ NewOperands.push_back(Operand);
+ }
+
+ // If !IsNew, we will replace the Value with itself. However, replaced values
+  // are assumed to be wrapped in an addrspacecast later, so drop it now.
+ if (!IsNew)
+ return nullptr;
+
+ if (CE->getOpcode() == Instruction::GetElementPtr) {
+ // Needs to specify the source type while constructing a getelementptr
+ // constant expression.
+ return CE->getWithOperands(
+ NewOperands, TargetType, /*OnlyIfReduced=*/false,
+ NewOperands[0]->getType()->getPointerElementType());
+ }
+
+ return CE->getWithOperands(NewOperands, TargetType);
+}
+
+// Returns a clone of the value `V`, with its operands replaced as specified in
+// ValueWithNewAddrSpace. This function is called on every flat address
+// expression whose address space needs to be modified, in postorder.
+//
+// See cloneInstructionWithNewAddressSpace for the meaning of UndefUsesToFix.
Value *InferAddressSpacesImpl::cloneValueWithNewAddressSpace(
Value *V, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
- // All values in Postorder are flat address expressions.
+ // All values in Postorder are flat address expressions.
assert(V->getType()->getPointerAddressSpace() == FlatAddrSpace &&
isAddressExpression(*V, *DL, TTI));
-
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- Value *NewV = cloneInstructionWithNewAddressSpace(
- I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
- if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
- if (NewI->getParent() == nullptr) {
- NewI->insertBefore(I);
- NewI->takeName(I);
- }
- }
- return NewV;
- }
-
- return cloneConstantExprWithNewAddressSpace(
- cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
-}
-
-// Defines the join operation on the address space lattice (see the file header
-// comments).
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ Value *NewV = cloneInstructionWithNewAddressSpace(
+ I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
+ if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
+ if (NewI->getParent() == nullptr) {
+ NewI->insertBefore(I);
+ NewI->takeName(I);
+ }
+ }
+ return NewV;
+ }
+
+ return cloneConstantExprWithNewAddressSpace(
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
+}
+
+// Defines the join operation on the address space lattice (see the file header
+// comments).
unsigned InferAddressSpacesImpl::joinAddressSpaces(unsigned AS1,
unsigned AS2) const {
- if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
- return FlatAddrSpace;
-
- if (AS1 == UninitializedAddressSpace)
- return AS2;
- if (AS2 == UninitializedAddressSpace)
- return AS1;
-
- // The join of two different specific address spaces is flat.
- return (AS1 == AS2) ? AS1 : FlatAddrSpace;
-}
-
+ if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
+ return FlatAddrSpace;
+
+ if (AS1 == UninitializedAddressSpace)
+ return AS2;
+ if (AS2 == UninitializedAddressSpace)
+ return AS1;
+
+ // The join of two different specific address spaces is flat.
+ return (AS1 == AS2) ? AS1 : FlatAddrSpace;
+}
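The join above is a meet on a flat lattice: the uninitialized state acts as the identity, the flat space absorbs everything, and two distinct specific spaces collapse to flat. A self-contained restatement with a few example joins follows; the concrete values 0, 3 and 5 are made up for illustration, with 0 merely standing in for the flat space.

#include <cassert>
#include <limits>

// Lattice: Uninitialized (top) > each specific address space > Flat (bottom).
constexpr unsigned Uninitialized = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0; // 0 merely stands in for the flat space here

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat)
    return Flat;              // join(flat, *) = flat
  if (A == Uninitialized)
    return B;                 // top is the identity element
  if (B == Uninitialized)
    return A;
  return A == B ? A : Flat;   // two different specific spaces collapse to flat
}

int main() {
  assert(join(Uninitialized, 3) == 3); // the first specific space seen wins
  assert(join(3, 3) == 3);             // agreement stays specific
  assert(join(3, 5) == Flat);          // disagreement falls to flat
  assert(join(Flat, 3) == Flat);       // flat absorbs everything
  return 0;
}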
+
bool InferAddressSpacesImpl::run(Function &F) {
- DL = &F.getParent()->getDataLayout();
-
- if (AssumeDefaultIsFlatAddressSpace)
- FlatAddrSpace = 0;
-
- if (FlatAddrSpace == UninitializedAddressSpace) {
- FlatAddrSpace = TTI->getFlatAddressSpace();
- if (FlatAddrSpace == UninitializedAddressSpace)
- return false;
- }
-
- // Collects all flat address expressions in postorder.
- std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
-
- // Runs a data-flow analysis to refine the address spaces of every expression
- // in Postorder.
- ValueToAddrSpaceMapTy InferredAddrSpace;
- inferAddressSpaces(Postorder, &InferredAddrSpace);
-
-  // Changes the address spaces of the flat address expressions that are inferred
- // to point to a specific address space.
- return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
-}
-
-// Constants need to be tracked through RAUW to handle cases with nested
-// constant expressions, so wrap values in WeakTrackingVH.
+ DL = &F.getParent()->getDataLayout();
+
+ if (AssumeDefaultIsFlatAddressSpace)
+ FlatAddrSpace = 0;
+
+ if (FlatAddrSpace == UninitializedAddressSpace) {
+ FlatAddrSpace = TTI->getFlatAddressSpace();
+ if (FlatAddrSpace == UninitializedAddressSpace)
+ return false;
+ }
+
+ // Collects all flat address expressions in postorder.
+ std::vector<WeakTrackingVH> Postorder = collectFlatAddressExpressions(F);
+
+ // Runs a data-flow analysis to refine the address spaces of every expression
+ // in Postorder.
+ ValueToAddrSpaceMapTy InferredAddrSpace;
+ inferAddressSpaces(Postorder, &InferredAddrSpace);
+
+  // Changes the address spaces of the flat address expressions that are inferred
+ // to point to a specific address space.
+ return rewriteWithNewAddressSpaces(*TTI, Postorder, InferredAddrSpace, &F);
+}
+
+// Constants need to be tracked through RAUW to handle cases with nested
+// constant expressions, so wrap values in WeakTrackingVH.
void InferAddressSpacesImpl::inferAddressSpaces(
- ArrayRef<WeakTrackingVH> Postorder,
- ValueToAddrSpaceMapTy *InferredAddrSpace) const {
- SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
- // Initially, all expressions are in the uninitialized address space.
- for (Value *V : Postorder)
- (*InferredAddrSpace)[V] = UninitializedAddressSpace;
-
- while (!Worklist.empty()) {
- Value *V = Worklist.pop_back_val();
-
- // Tries to update the address space of the stack top according to the
- // address spaces of its operands.
- LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
- Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
- if (!NewAS.hasValue())
- continue;
-    // If any updates are made, adds its users to the worklist because
-    // their address spaces may also need to be updated.
- LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
- (*InferredAddrSpace)[V] = NewAS.getValue();
-
- for (Value *User : V->users()) {
- // Skip if User is already in the worklist.
- if (Worklist.count(User))
- continue;
-
- auto Pos = InferredAddrSpace->find(User);
- // Our algorithm only updates the address spaces of flat address
- // expressions, which are those in InferredAddrSpace.
- if (Pos == InferredAddrSpace->end())
- continue;
-
- // Function updateAddressSpace moves the address space down a lattice
- // path. Therefore, nothing to do if User is already inferred as flat (the
- // bottom element in the lattice).
- if (Pos->second == FlatAddrSpace)
- continue;
-
- Worklist.insert(User);
- }
- }
-}
-
+ ArrayRef<WeakTrackingVH> Postorder,
+ ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+ SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
+ // Initially, all expressions are in the uninitialized address space.
+ for (Value *V : Postorder)
+ (*InferredAddrSpace)[V] = UninitializedAddressSpace;
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+
+ // Tries to update the address space of the stack top according to the
+ // address spaces of its operands.
+ LLVM_DEBUG(dbgs() << "Updating the address space of\n " << *V << '\n');
+ Optional<unsigned> NewAS = updateAddressSpace(*V, *InferredAddrSpace);
+ if (!NewAS.hasValue())
+ continue;
+    // If any updates are made, adds its users to the worklist because
+    // their address spaces may also need to be updated.
+ LLVM_DEBUG(dbgs() << " to " << NewAS.getValue() << '\n');
+ (*InferredAddrSpace)[V] = NewAS.getValue();
+
+ for (Value *User : V->users()) {
+ // Skip if User is already in the worklist.
+ if (Worklist.count(User))
+ continue;
+
+ auto Pos = InferredAddrSpace->find(User);
+ // Our algorithm only updates the address spaces of flat address
+ // expressions, which are those in InferredAddrSpace.
+ if (Pos == InferredAddrSpace->end())
+ continue;
+
+ // Function updateAddressSpace moves the address space down a lattice
+ // path. Therefore, nothing to do if User is already inferred as flat (the
+ // bottom element in the lattice).
+ if (Pos->second == FlatAddrSpace)
+ continue;
+
+ Worklist.insert(User);
+ }
+ }
+}
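The loop above is a monotone fixed-point computation: updateAddressSpace only ever moves a value down the lattice, so every value is re-queued a bounded number of times and the worklist drains. Below is a compact standalone sketch of the same propagation over a toy def-use graph, with the fallback to an operand's own pointer-type space omitted for brevity; all names and numbers are illustrative.

#include <cstdio>
#include <deque>
#include <limits>
#include <unordered_map>
#include <vector>

constexpr unsigned Uninit = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0;

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat) return Flat;
  if (A == Uninit) return B;
  if (B == Uninit) return A;
  return A == B ? A : Flat;
}

int main() {
  // Toy def-use graph: nodes 0 and 1 are sources known to be in space 3,
  // node 2 is a "PHI" of them, node 3 is a "GEP" on node 2.
  std::vector<std::vector<int>> Operands = {{}, {}, {0, 1}, {2}};
  std::vector<std::vector<int>> Users    = {{2}, {2}, {3}, {}};
  std::unordered_map<int, unsigned> AS = {{0, 3u}, {1, 3u}, {2, Uninit}, {3, Uninit}};

  std::deque<int> Worklist = {2, 3};
  while (!Worklist.empty()) {
    int N = Worklist.front();
    Worklist.pop_front();
    unsigned New = Uninit;
    for (int Op : Operands[N])
      New = join(New, AS.at(Op));  // join over all pointer operands
    if (New == AS.at(N))
      continue;                    // no change, nothing to propagate
    AS[N] = New;                   // refine, then revisit the users
    for (int U : Users[N])
      Worklist.push_back(U);
  }

  std::printf("space of the PHI (node 2): %u\n", AS[2]); // prints 3
  std::printf("space of the GEP (node 3): %u\n", AS[3]); // prints 3
}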
+
Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
- const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
- assert(InferredAddrSpace.count(&V));
-
- // The new inferred address space equals the join of the address spaces
- // of all its pointer operands.
- unsigned NewAS = UninitializedAddressSpace;
-
- const Operator &Op = cast<Operator>(V);
- if (Op.getOpcode() == Instruction::Select) {
- Value *Src0 = Op.getOperand(1);
- Value *Src1 = Op.getOperand(2);
-
- auto I = InferredAddrSpace.find(Src0);
- unsigned Src0AS = (I != InferredAddrSpace.end()) ?
- I->second : Src0->getType()->getPointerAddressSpace();
-
- auto J = InferredAddrSpace.find(Src1);
- unsigned Src1AS = (J != InferredAddrSpace.end()) ?
- J->second : Src1->getType()->getPointerAddressSpace();
-
- auto *C0 = dyn_cast<Constant>(Src0);
- auto *C1 = dyn_cast<Constant>(Src1);
-
- // If one of the inputs is a constant, we may be able to do a constant
- // addrspacecast of it. Defer inferring the address space until the input
- // address space is known.
- if ((C1 && Src0AS == UninitializedAddressSpace) ||
- (C0 && Src1AS == UninitializedAddressSpace))
- return None;
-
- if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
- NewAS = Src1AS;
- else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
- NewAS = Src0AS;
- else
- NewAS = joinAddressSpaces(Src0AS, Src1AS);
- } else {
+ const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+ assert(InferredAddrSpace.count(&V));
+
+ // The new inferred address space equals the join of the address spaces
+ // of all its pointer operands.
+ unsigned NewAS = UninitializedAddressSpace;
+
+ const Operator &Op = cast<Operator>(V);
+ if (Op.getOpcode() == Instruction::Select) {
+ Value *Src0 = Op.getOperand(1);
+ Value *Src1 = Op.getOperand(2);
+
+ auto I = InferredAddrSpace.find(Src0);
+ unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+ I->second : Src0->getType()->getPointerAddressSpace();
+
+ auto J = InferredAddrSpace.find(Src1);
+ unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+ J->second : Src1->getType()->getPointerAddressSpace();
+
+ auto *C0 = dyn_cast<Constant>(Src0);
+ auto *C1 = dyn_cast<Constant>(Src1);
+
+ // If one of the inputs is a constant, we may be able to do a constant
+ // addrspacecast of it. Defer inferring the address space until the input
+ // address space is known.
+ if ((C1 && Src0AS == UninitializedAddressSpace) ||
+ (C0 && Src1AS == UninitializedAddressSpace))
+ return None;
+
+ if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+ NewAS = Src1AS;
+ else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+ NewAS = Src0AS;
+ else
+ NewAS = joinAddressSpaces(Src0AS, Src1AS);
+ } else {
unsigned AS = TTI->getAssumedAddrSpace(&V);
if (AS != UninitializedAddressSpace) {
// Use the assumed address space directly.
@@ -879,313 +879,313 @@ Optional<unsigned> InferAddressSpacesImpl::updateAddressSpace(
I != InferredAddrSpace.end()
? I->second
: PtrOperand->getType()->getPointerAddressSpace();
-
+
// join(flat, *) = flat. So we can break if NewAS is already flat.
NewAS = joinAddressSpaces(NewAS, OperandAS);
if (NewAS == FlatAddrSpace)
break;
}
- }
- }
-
- unsigned OldAS = InferredAddrSpace.lookup(&V);
- assert(OldAS != FlatAddrSpace);
- if (OldAS == NewAS)
- return None;
- return NewAS;
-}
-
-/// \returns true if \p U is the pointer operand of a memory instruction with
-/// a single pointer operand that can have its address space changed by simply
-/// mutating the use to a new value. If the memory instruction is volatile,
-/// return true only if the target allows the memory instruction to be volatile
-/// in the new address space.
-static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
- Use &U, unsigned AddrSpace) {
- User *Inst = U.getUser();
- unsigned OpNo = U.getOperandNo();
- bool VolatileIsAllowed = false;
- if (auto *I = dyn_cast<Instruction>(Inst))
- VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
-
- if (auto *LI = dyn_cast<LoadInst>(Inst))
- return OpNo == LoadInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !LI->isVolatile());
-
- if (auto *SI = dyn_cast<StoreInst>(Inst))
- return OpNo == StoreInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !SI->isVolatile());
-
- if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
- return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !RMW->isVolatile());
-
- if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
- return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
- (VolatileIsAllowed || !CmpX->isVolatile());
-
- return false;
-}
-
-/// Update memory intrinsic uses that require more complex processing than
-/// simple memory instructions. These require re-mangling and may have multiple
-/// pointer operands.
-static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
- Value *NewV) {
- IRBuilder<> B(MI);
- MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
- MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
- MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
-
- if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
- B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(),
- MaybeAlign(MSI->getDestAlignment()),
- false, // isVolatile
- TBAA, ScopeMD, NoAliasMD);
- } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
- Value *Src = MTI->getRawSource();
- Value *Dest = MTI->getRawDest();
-
- // Be careful in case this is a self-to-self copy.
- if (Src == OldV)
- Src = NewV;
-
- if (Dest == OldV)
- Dest = NewV;
-
- if (isa<MemCpyInst>(MTI)) {
- MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
- B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
- MTI->getLength(),
- false, // isVolatile
- TBAA, TBAAStruct, ScopeMD, NoAliasMD);
- } else {
- assert(isa<MemMoveInst>(MTI));
- B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
- MTI->getLength(),
- false, // isVolatile
- TBAA, ScopeMD, NoAliasMD);
- }
- } else
- llvm_unreachable("unhandled MemIntrinsic");
-
- MI->eraseFromParent();
- return true;
-}
-
-// \returns true if it is OK to change the address space of constant \p C with
-// a ConstantExpr addrspacecast.
+ }
+ }
+
+ unsigned OldAS = InferredAddrSpace.lookup(&V);
+ assert(OldAS != FlatAddrSpace);
+ if (OldAS == NewAS)
+ return None;
+ return NewAS;
+}
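One subtlety above: for a select with a constant arm, the routine returns None while the other arm is still uninitialized, so the constant can later be addrspacecast into whatever specific space that arm settles on instead of forcing the select to flat. The tiny standalone illustration below models that decision logic with a deliberately permissive stand-in for isSafeToCastConstAddrSpace; everything here is illustrative only.

#include <cstdio>
#include <limits>
#include <optional>

constexpr unsigned Uninit = std::numeric_limits<unsigned>::max();
constexpr unsigned Flat = 0; // stand-in for the flat/generic space

static unsigned join(unsigned A, unsigned B) {
  if (A == Flat || B == Flat) return Flat;
  if (A == Uninit) return B;
  if (B == Uninit) return A;
  return A == B ? A : Flat;
}

// Deliberately permissive stand-in: pretend the constant (e.g. null or undef)
// may be cast into any destination space.
static bool safeToCastConst(unsigned /*DestAS*/) { return true; }

// Decide the address space of `select %c, %src0, %src1`.
static std::optional<unsigned> selectAS(unsigned Src0AS, bool Src0IsConst,
                                        unsigned Src1AS, bool Src1IsConst) {
  // Defer while the non-constant arm is still unknown.
  if ((Src1IsConst && Src0AS == Uninit) || (Src0IsConst && Src1AS == Uninit))
    return std::nullopt;
  if (Src0IsConst && safeToCastConst(Src1AS))
    return Src1AS; // cast the constant instead of widening to flat
  if (Src1IsConst && safeToCastConst(Src0AS))
    return Src0AS;
  return join(Src0AS, Src1AS);
}

int main() {
  // select %c, %p, null  while %p is still uninferred: defer (no answer yet).
  std::printf("deferred: %d\n", !selectAS(Uninit, false, Flat, true).has_value());
  // Once %p is known to live in space 3, the select can be in space 3 too.
  std::printf("resolved to: %u\n", *selectAS(3, false, Flat, true));
}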
+
+/// \returns true if \p U is the pointer operand of a memory instruction with
+/// a single pointer operand that can have its address space changed by simply
+/// mutating the use to a new value. If the memory instruction is volatile,
+/// return true only if the target allows the memory instruction to be volatile
+/// in the new address space.
+static bool isSimplePointerUseValidToReplace(const TargetTransformInfo &TTI,
+ Use &U, unsigned AddrSpace) {
+ User *Inst = U.getUser();
+ unsigned OpNo = U.getOperandNo();
+ bool VolatileIsAllowed = false;
+ if (auto *I = dyn_cast<Instruction>(Inst))
+ VolatileIsAllowed = TTI.hasVolatileVariant(I, AddrSpace);
+
+ if (auto *LI = dyn_cast<LoadInst>(Inst))
+ return OpNo == LoadInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !LI->isVolatile());
+
+ if (auto *SI = dyn_cast<StoreInst>(Inst))
+ return OpNo == StoreInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !SI->isVolatile());
+
+ if (auto *RMW = dyn_cast<AtomicRMWInst>(Inst))
+ return OpNo == AtomicRMWInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !RMW->isVolatile());
+
+ if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst))
+ return OpNo == AtomicCmpXchgInst::getPointerOperandIndex() &&
+ (VolatileIsAllowed || !CmpX->isVolatile());
+
+ return false;
+}
+
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+ Value *NewV) {
+ IRBuilder<> B(MI);
+ MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+ MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+ MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+ if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+ B.CreateMemSet(NewV, MSI->getValue(), MSI->getLength(),
+ MaybeAlign(MSI->getDestAlignment()),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+ Value *Src = MTI->getRawSource();
+ Value *Dest = MTI->getRawDest();
+
+ // Be careful in case this is a self-to-self copy.
+ if (Src == OldV)
+ Src = NewV;
+
+ if (Dest == OldV)
+ Dest = NewV;
+
+ if (isa<MemCpyInst>(MTI)) {
+ MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+ B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
+ MTI->getLength(),
+ false, // isVolatile
+ TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+ } else {
+ assert(isa<MemMoveInst>(MTI));
+ B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
+ MTI->getLength(),
+ false, // isVolatile
+ TBAA, ScopeMD, NoAliasMD);
+ }
+ } else
+ llvm_unreachable("unhandled MemIntrinsic");
+
+ MI->eraseFromParent();
+ return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
bool InferAddressSpacesImpl::isSafeToCastConstAddrSpace(Constant *C,
unsigned NewAS) const {
- assert(NewAS != UninitializedAddressSpace);
-
- unsigned SrcAS = C->getType()->getPointerAddressSpace();
- if (SrcAS == NewAS || isa<UndefValue>(C))
- return true;
-
- // Prevent illegal casts between different non-flat address spaces.
- if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
- return false;
-
- if (isa<ConstantPointerNull>(C))
- return true;
-
- if (auto *Op = dyn_cast<Operator>(C)) {
- // If we already have a constant addrspacecast, it should be safe to cast it
- // off.
- if (Op->getOpcode() == Instruction::AddrSpaceCast)
- return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
-
- if (Op->getOpcode() == Instruction::IntToPtr &&
- Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
- return true;
- }
-
- return false;
-}
-
-static Value::use_iterator skipToNextUser(Value::use_iterator I,
- Value::use_iterator End) {
- User *CurUser = I->getUser();
- ++I;
-
- while (I != End && I->getUser() == CurUser)
- ++I;
-
- return I;
-}
-
+ assert(NewAS != UninitializedAddressSpace);
+
+ unsigned SrcAS = C->getType()->getPointerAddressSpace();
+ if (SrcAS == NewAS || isa<UndefValue>(C))
+ return true;
+
+ // Prevent illegal casts between different non-flat address spaces.
+ if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+ return false;
+
+ if (isa<ConstantPointerNull>(C))
+ return true;
+
+ if (auto *Op = dyn_cast<Operator>(C)) {
+ // If we already have a constant addrspacecast, it should be safe to cast it
+ // off.
+ if (Op->getOpcode() == Instruction::AddrSpaceCast)
+ return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+ if (Op->getOpcode() == Instruction::IntToPtr &&
+ Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+ return true;
+ }
+
+ return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+ Value::use_iterator End) {
+ User *CurUser = I->getUser();
+ ++I;
+
+ while (I != End && I->getUser() == CurUser)
+ ++I;
+
+ return I;
+}
+
bool InferAddressSpacesImpl::rewriteWithNewAddressSpaces(
- const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
- const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
- // For each address expression to be modified, creates a clone of it with its
- // pointer operands converted to the new address space. Since the pointer
- // operands are converted, the clone is naturally in the new address space by
- // construction.
- ValueToValueMapTy ValueWithNewAddrSpace;
- SmallVector<const Use *, 32> UndefUsesToFix;
- for (Value* V : Postorder) {
- unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
+ const TargetTransformInfo &TTI, ArrayRef<WeakTrackingVH> Postorder,
+ const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
+ // For each address expression to be modified, creates a clone of it with its
+ // pointer operands converted to the new address space. Since the pointer
+ // operands are converted, the clone is naturally in the new address space by
+ // construction.
+ ValueToValueMapTy ValueWithNewAddrSpace;
+ SmallVector<const Use *, 32> UndefUsesToFix;
+ for (Value* V : Postorder) {
+ unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
// In some degenerate cases (e.g. invalid IR in unreachable code), we may
// not even infer the value to have its original address space.
if (NewAddrSpace == UninitializedAddressSpace)
continue;
- if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
- Value *New = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
- if (New)
- ValueWithNewAddrSpace[V] = New;
- }
- }
-
- if (ValueWithNewAddrSpace.empty())
- return false;
-
- // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
- for (const Use *UndefUse : UndefUsesToFix) {
- User *V = UndefUse->getUser();
- User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
- if (!NewV)
- continue;
-
- unsigned OperandNo = UndefUse->getOperandNo();
- assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
- NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
- }
-
- SmallVector<Instruction *, 16> DeadInstructions;
-
- // Replaces the uses of the old address expressions with the new ones.
- for (const WeakTrackingVH &WVH : Postorder) {
- assert(WVH && "value was unexpectedly deleted");
- Value *V = WVH;
- Value *NewV = ValueWithNewAddrSpace.lookup(V);
- if (NewV == nullptr)
- continue;
-
- LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
- << *NewV << '\n');
-
- if (Constant *C = dyn_cast<Constant>(V)) {
- Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- C->getType());
- if (C != Replace) {
- LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
- << ": " << *Replace << '\n');
- C->replaceAllUsesWith(Replace);
- V = Replace;
- }
- }
-
- Value::use_iterator I, E, Next;
- for (I = V->use_begin(), E = V->use_end(); I != E; ) {
- Use &U = *I;
-
- // Some users may see the same pointer operand in multiple operands. Skip
- // to the next instruction.
- I = skipToNextUser(I, E);
-
- if (isSimplePointerUseValidToReplace(
- TTI, U, V->getType()->getPointerAddressSpace())) {
- // If V is used as the pointer operand of a compatible memory operation,
- // sets the pointer operand to NewV. This replacement does not change
- // the element type, so the resultant load/store is still valid.
- U.set(NewV);
- continue;
- }
-
- User *CurUser = U.getUser();
+ if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
+ Value *New = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ if (New)
+ ValueWithNewAddrSpace[V] = New;
+ }
+ }
+
+ if (ValueWithNewAddrSpace.empty())
+ return false;
+
+ // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
+ for (const Use *UndefUse : UndefUsesToFix) {
+ User *V = UndefUse->getUser();
+ User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
+ if (!NewV)
+ continue;
+
+ unsigned OperandNo = UndefUse->getOperandNo();
+ assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
+ NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
+ }
+
+ SmallVector<Instruction *, 16> DeadInstructions;
+
+ // Replaces the uses of the old address expressions with the new ones.
+ for (const WeakTrackingVH &WVH : Postorder) {
+ assert(WVH && "value was unexpectedly deleted");
+ Value *V = WVH;
+ Value *NewV = ValueWithNewAddrSpace.lookup(V);
+ if (NewV == nullptr)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Replacing the uses of " << *V << "\n with\n "
+ << *NewV << '\n');
+
+ if (Constant *C = dyn_cast<Constant>(V)) {
+ Constant *Replace = ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ C->getType());
+ if (C != Replace) {
+ LLVM_DEBUG(dbgs() << "Inserting replacement const cast: " << Replace
+ << ": " << *Replace << '\n');
+ C->replaceAllUsesWith(Replace);
+ V = Replace;
+ }
+ }
+
+ Value::use_iterator I, E, Next;
+ for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+ Use &U = *I;
+
+ // Some users may see the same pointer operand in multiple operands. Skip
+ // to the next instruction.
+ I = skipToNextUser(I, E);
+
+ if (isSimplePointerUseValidToReplace(
+ TTI, U, V->getType()->getPointerAddressSpace())) {
+ // If V is used as the pointer operand of a compatible memory operation,
+ // sets the pointer operand to NewV. This replacement does not change
+ // the element type, so the resultant load/store is still valid.
+ U.set(NewV);
+ continue;
+ }
+
+ User *CurUser = U.getUser();
// Skip if the current user is the new value itself.
if (CurUser == NewV)
continue;
-      // Handle more complex cases like intrinsics that need to be remangled.
- if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
- if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
- continue;
- }
-
- if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
- if (rewriteIntrinsicOperands(II, V, NewV))
- continue;
- }
-
- if (isa<Instruction>(CurUser)) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
- // If we can infer that both pointers are in the same addrspace,
- // transform e.g.
- // %cmp = icmp eq float* %p, %q
- // into
- // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
-
- unsigned NewAS = NewV->getType()->getPointerAddressSpace();
- int SrcIdx = U.getOperandNo();
- int OtherIdx = (SrcIdx == 0) ? 1 : 0;
- Value *OtherSrc = Cmp->getOperand(OtherIdx);
-
- if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
- if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
- Cmp->setOperand(OtherIdx, OtherNewV);
- Cmp->setOperand(SrcIdx, NewV);
- continue;
- }
- }
-
- // Even if the type mismatches, we can cast the constant.
- if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
- if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
- Cmp->setOperand(SrcIdx, NewV);
- Cmp->setOperand(OtherIdx,
- ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
- continue;
- }
- }
- }
-
- if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
- unsigned NewAS = NewV->getType()->getPointerAddressSpace();
- if (ASC->getDestAddressSpace() == NewAS) {
- if (ASC->getType()->getPointerElementType() !=
- NewV->getType()->getPointerElementType()) {
- NewV = CastInst::Create(Instruction::BitCast, NewV,
- ASC->getType(), "", ASC);
- }
- ASC->replaceAllUsesWith(NewV);
- DeadInstructions.push_back(ASC);
- continue;
- }
- }
-
- // Otherwise, replaces the use with flat(NewV).
- if (Instruction *Inst = dyn_cast<Instruction>(V)) {
- // Don't create a copy of the original addrspacecast.
- if (U == V && isa<AddrSpaceCastInst>(V))
- continue;
-
- BasicBlock::iterator InsertPos = std::next(Inst->getIterator());
- while (isa<PHINode>(InsertPos))
- ++InsertPos;
- U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
- } else {
- U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
- V->getType()));
- }
- }
- }
-
- if (V->use_empty()) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- DeadInstructions.push_back(I);
- }
- }
-
- for (Instruction *I : DeadInstructions)
- RecursivelyDeleteTriviallyDeadInstructions(I);
-
- return true;
-}
-
+      // Handle more complex cases like intrinsics that need to be remangled.
+ if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+ if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+ continue;
+ }
+
+ if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+ if (rewriteIntrinsicOperands(II, V, NewV))
+ continue;
+ }
+
+ if (isa<Instruction>(CurUser)) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+ // If we can infer that both pointers are in the same addrspace,
+ // transform e.g.
+ // %cmp = icmp eq float* %p, %q
+ // into
+ // %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ int SrcIdx = U.getOperandNo();
+ int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+ Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+ if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+ if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+ Cmp->setOperand(OtherIdx, OtherNewV);
+ Cmp->setOperand(SrcIdx, NewV);
+ continue;
+ }
+ }
+
+ // Even if the type mismatches, we can cast the constant.
+ if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+ if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+ Cmp->setOperand(SrcIdx, NewV);
+ Cmp->setOperand(OtherIdx,
+ ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+ continue;
+ }
+ }
+ }
+
+ if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(CurUser)) {
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ if (ASC->getDestAddressSpace() == NewAS) {
+ if (ASC->getType()->getPointerElementType() !=
+ NewV->getType()->getPointerElementType()) {
+ NewV = CastInst::Create(Instruction::BitCast, NewV,
+ ASC->getType(), "", ASC);
+ }
+ ASC->replaceAllUsesWith(NewV);
+ DeadInstructions.push_back(ASC);
+ continue;
+ }
+ }
+
+ // Otherwise, replaces the use with flat(NewV).
+ if (Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Don't create a copy of the original addrspacecast.
+ if (U == V && isa<AddrSpaceCastInst>(V))
+ continue;
+
+ BasicBlock::iterator InsertPos = std::next(Inst->getIterator());
+ while (isa<PHINode>(InsertPos))
+ ++InsertPos;
+ U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+ } else {
+ U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+ V->getType()));
+ }
+ }
+ }
+
+ if (V->use_empty()) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ DeadInstructions.push_back(I);
+ }
+ }
+
+ for (Instruction *I : DeadInstructions)
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+
+ return true;
+}
+
bool InferAddressSpaces::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -1196,9 +1196,9 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
.run(F);
}
-FunctionPass *llvm::createInferAddressSpacesPass(unsigned AddressSpace) {
- return new InferAddressSpaces(AddressSpace);
-}
+FunctionPass *llvm::createInferAddressSpacesPass(unsigned AddressSpace) {
+ return new InferAddressSpaces(AddressSpace);
+}
InferAddressSpacesPass::InferAddressSpacesPass()
: FlatAddrSpace(UninitializedAddressSpace) {}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
index 2c47a99985..c11d2e4c1d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -1,148 +1,148 @@
-//===- InstSimplifyPass.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
+//===- InstSimplifyPass.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/InstSimplifyPass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "instsimplify"
-
-STATISTIC(NumSimplified, "Number of redundant instructions removed");
-
-static bool runImpl(Function &F, const SimplifyQuery &SQ,
- OptimizationRemarkEmitter *ORE) {
- SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
- bool Changed = false;
-
- do {
- for (BasicBlock &BB : F) {
- // Unreachable code can take on strange forms that we are not prepared to
- // handle. For example, an instruction may have itself as an operand.
- if (!SQ.DT->isReachableFromEntry(&BB))
- continue;
-
- SmallVector<WeakTrackingVH, 8> DeadInstsInBB;
- for (Instruction &I : BB) {
- // The first time through the loop, ToSimplify is empty and we try to
- // simplify all instructions. On later iterations, ToSimplify is not
- // empty and we only bother simplifying instructions that are in it.
- if (!ToSimplify->empty() && !ToSimplify->count(&I))
- continue;
-
- // Don't waste time simplifying dead/unused instructions.
- if (isInstructionTriviallyDead(&I)) {
- DeadInstsInBB.push_back(&I);
- Changed = true;
- } else if (!I.use_empty()) {
- if (Value *V = SimplifyInstruction(&I, SQ, ORE)) {
- // Mark all uses for resimplification next time round the loop.
- for (User *U : I.users())
- Next->insert(cast<Instruction>(U));
- I.replaceAllUsesWith(V);
- ++NumSimplified;
- Changed = true;
- // A call can get simplified, but it may not be trivially dead.
- if (isInstructionTriviallyDead(&I))
- DeadInstsInBB.push_back(&I);
- }
- }
- }
- RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI);
- }
-
- // Place the list of instructions to simplify on the next loop iteration
- // into ToSimplify.
- std::swap(ToSimplify, Next);
- Next->clear();
- } while (!ToSimplify->empty());
-
- return Changed;
-}
-
-namespace {
-struct InstSimplifyLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- InstSimplifyLegacyPass() : FunctionPass(ID) {
- initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-
- /// Remove instructions that simplify.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- const DominatorTree *DT =
- &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, TLI, DT, AC);
- return runImpl(F, SQ, ORE);
- }
-};
-} // namespace
-
-char InstSimplifyLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
- "Remove redundant instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
- "Remove redundant instructions", false, false)
-
-// Public interface to the simplify instructions pass.
-FunctionPass *llvm::createInstSimplifyLegacyPass() {
- return new InstSimplifyLegacyPass();
-}
-
-PreservedAnalyses InstSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- const DataLayout &DL = F.getParent()->getDataLayout();
- const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
- bool Changed = runImpl(F, SQ, &ORE);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions removed");
+
+static bool runImpl(Function &F, const SimplifyQuery &SQ,
+ OptimizationRemarkEmitter *ORE) {
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+ bool Changed = false;
+
+ do {
+ for (BasicBlock &BB : F) {
+ // Unreachable code can take on strange forms that we are not prepared to
+ // handle. For example, an instruction may have itself as an operand.
+ if (!SQ.DT->isReachableFromEntry(&BB))
+ continue;
+
+ SmallVector<WeakTrackingVH, 8> DeadInstsInBB;
+ for (Instruction &I : BB) {
+ // The first time through the loop, ToSimplify is empty and we try to
+ // simplify all instructions. On later iterations, ToSimplify is not
+ // empty and we only bother simplifying instructions that are in it.
+ if (!ToSimplify->empty() && !ToSimplify->count(&I))
+ continue;
+
+ // Don't waste time simplifying dead/unused instructions.
+ if (isInstructionTriviallyDead(&I)) {
+ DeadInstsInBB.push_back(&I);
+ Changed = true;
+ } else if (!I.use_empty()) {
+ if (Value *V = SimplifyInstruction(&I, SQ, ORE)) {
+ // Mark all uses for resimplification next time round the loop.
+ for (User *U : I.users())
+ Next->insert(cast<Instruction>(U));
+ I.replaceAllUsesWith(V);
+ ++NumSimplified;
+ Changed = true;
+ // A call can get simplified, but it may not be trivially dead.
+ if (isInstructionTriviallyDead(&I))
+ DeadInstsInBB.push_back(&I);
+ }
+ }
+ }
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInstsInBB, SQ.TLI);
+ }
+
+ // Place the list of instructions to simplify on the next loop iteration
+ // into ToSimplify.
+ std::swap(ToSimplify, Next);
+ Next->clear();
+ } while (!ToSimplify->empty());
+
+ return Changed;
+}
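The two-set ping-pong in runImpl (simplify everything on the first pass, then only revisit users of whatever changed, swapping ToSimplify and Next each round) is a reusable pattern in its own right. Below is a standalone sketch of that control flow with integers standing in for instructions; the "simplification" rule is made up purely to drive the loop and is not part of the patched sources.

#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Five pretend "instructions"; Users[i] lists who consumes i's result.
  std::vector<std::vector<int>> Users = {{1}, {2}, {}, {4}, {}};
  // Pretend simplification rule: an odd-numbered instruction simplifies once.
  std::vector<bool> Simplified(5, false);

  std::set<int> S1, S2, *ToSimplify = &S1, *Next = &S2;
  int Rounds = 0;
  do {
    ++Rounds;
    for (int I = 0; I < 5; ++I) {
      // First round: look at everything. Later rounds: only queued items.
      if (!ToSimplify->empty() && !ToSimplify->count(I))
        continue;
      if (!Simplified[I] && I % 2 == 1) {
        Simplified[I] = true;
        for (int U : Users[I])      // its users may simplify next round
          Next->insert(U);
      }
    }
    std::swap(ToSimplify, Next);    // ping-pong the two sets
    Next->clear();
  } while (!ToSimplify->empty());

  std::printf("rounds: %d\n", Rounds); // prints "rounds: 2"
}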
+
+namespace {
+struct InstSimplifyLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ InstSimplifyLegacyPass() : FunctionPass(ID) {
+ initializeInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+
+ /// Remove instructions that simplify.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ const DominatorTree *DT =
+ &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, TLI, DT, AC);
+ return runImpl(F, SQ, ORE);
+ }
+};
+} // namespace
+
+char InstSimplifyLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(InstSimplifyLegacyPass, "instsimplify",
+ "Remove redundant instructions", false, false)
+
+// Public interface to the simplify instructions pass.
+FunctionPass *llvm::createInstSimplifyLegacyPass() {
+ return new InstSimplifyLegacyPass();
+}
+
+PreservedAnalyses InstSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+ bool Changed = runImpl(F, SQ, &ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
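
For readers unfamiliar with the new pass manager entry point above, a rough, hypothetical driver that runs instsimplify on a single function could be set up as follows (assuming the usual LLVM 12 new-PM boilerplate; the analysis managers must be registered and cross-wired before FPM.run, since InstSimplifyPass::run pulls DominatorTree, TargetLibraryInfo, AssumptionCache and ORE out of the FunctionAnalysisManager):

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/InstSimplifyPass.h"

  // Sketch only: assumes F lives inside a valid llvm::Module.
  static bool simplifyFunction(llvm::Function &F) {
    llvm::LoopAnalysisManager LAM;
    llvm::FunctionAnalysisManager FAM;
    llvm::CGSCCAnalysisManager CGAM;
    llvm::ModuleAnalysisManager MAM;

    llvm::PassBuilder PB;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    llvm::FunctionPassManager FPM;
    FPM.addPass(llvm::InstSimplifyPass());

    llvm::PreservedAnalyses PA = FPM.run(F, FAM);
    return !PA.areAllPreserved(); // all-preserved only when nothing was simplified
  }

The same pass is reachable from the command line as opt -passes=instsimplify.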
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
index 4ba1bea9f0..10b08b4e22 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1,724 +1,724 @@
-//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Jump Threading pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/JumpThreading.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
+//===- JumpThreading.cpp - Thread control through conditional blocks ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Jump Threading pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/JumpThreading.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-using namespace jumpthreading;
-
-#define DEBUG_TYPE "jump-threading"
-
-STATISTIC(NumThreads, "Number of jumps threaded");
-STATISTIC(NumFolds, "Number of terminators folded");
-STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
-
-static cl::opt<unsigned>
-BBDuplicateThreshold("jump-threading-threshold",
- cl::desc("Max block size to duplicate for jump threading"),
- cl::init(6), cl::Hidden);
-
-static cl::opt<unsigned>
-ImplicationSearchThreshold(
- "jump-threading-implication-search-threshold",
- cl::desc("The number of predecessors to search for a stronger "
- "condition to use to thread over a weaker condition"),
- cl::init(3), cl::Hidden);
-
-static cl::opt<bool> PrintLVIAfterJumpThreading(
- "print-lvi-after-jump-threading",
- cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
- cl::Hidden);
-
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+using namespace jumpthreading;
+
+#define DEBUG_TYPE "jump-threading"
+
+STATISTIC(NumThreads, "Number of jumps threaded");
+STATISTIC(NumFolds, "Number of terminators folded");
+STATISTIC(NumDupes, "Number of branch blocks duplicated to eliminate phi");
+
+static cl::opt<unsigned>
+BBDuplicateThreshold("jump-threading-threshold",
+ cl::desc("Max block size to duplicate for jump threading"),
+ cl::init(6), cl::Hidden);
+
+static cl::opt<unsigned>
+ImplicationSearchThreshold(
+ "jump-threading-implication-search-threshold",
+ cl::desc("The number of predecessors to search for a stronger "
+ "condition to use to thread over a weaker condition"),
+ cl::init(3), cl::Hidden);
+
+static cl::opt<bool> PrintLVIAfterJumpThreading(
+ "print-lvi-after-jump-threading",
+ cl::desc("Print the LazyValueInfo cache after JumpThreading"), cl::init(false),
+ cl::Hidden);
+
static cl::opt<bool> JumpThreadingFreezeSelectCond(
"jump-threading-freeze-select-cond",
cl::desc("Freeze the condition when unfolding select"), cl::init(false),
cl::Hidden);
-static cl::opt<bool> ThreadAcrossLoopHeaders(
- "jump-threading-across-loop-headers",
- cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
- cl::init(false), cl::Hidden);
-
-
-namespace {
-
- /// This pass performs 'jump threading', which looks at blocks that have
- /// multiple predecessors and multiple successors. If one or more of the
- /// predecessors of the block can be proven to always jump to one of the
- /// successors, we forward the edge from the predecessor to the successor by
- /// duplicating the contents of this block.
- ///
- /// An example of when this can occur is code like this:
- ///
- /// if () { ...
- /// X = 4;
- /// }
- /// if (X < 3) {
- ///
- /// In this case, the unconditional branch at the end of the first if can be
- /// revectored to the false side of the second if.
- class JumpThreading : public FunctionPass {
- JumpThreadingPass Impl;
-
- public:
- static char ID; // Pass identification
-
+static cl::opt<bool> ThreadAcrossLoopHeaders(
+ "jump-threading-across-loop-headers",
+ cl::desc("Allow JumpThreading to thread across loop headers, for testing"),
+ cl::init(false), cl::Hidden);
+
+
+namespace {
+
+ /// This pass performs 'jump threading', which looks at blocks that have
+ /// multiple predecessors and multiple successors. If one or more of the
+ /// predecessors of the block can be proven to always jump to one of the
+ /// successors, we forward the edge from the predecessor to the successor by
+ /// duplicating the contents of this block.
+ ///
+ /// An example of when this can occur is code like this:
+ ///
+ /// if () { ...
+ /// X = 4;
+ /// }
+ /// if (X < 3) {
+ ///
+ /// In this case, the unconditional branch at the end of the first if can be
+ /// revectored to the false side of the second if.
+ class JumpThreading : public FunctionPass {
+ JumpThreadingPass Impl;
+
+ public:
+ static char ID; // Pass identification
+
JumpThreading(bool InsertFreezeWhenUnfoldingSelect = false, int T = -1)
: FunctionPass(ID), Impl(InsertFreezeWhenUnfoldingSelect, T) {
- initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LazyValueInfoWrapperPass>();
- AU.addPreserved<LazyValueInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ initializeJumpThreadingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LazyValueInfoWrapperPass>();
+ AU.addPreserved<LazyValueInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- void releaseMemory() override { Impl.releaseMemory(); }
- };
-
-} // end anonymous namespace
-
-char JumpThreading::ID = 0;
-
-INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
- "Jump Threading", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(JumpThreading, "jump-threading",
- "Jump Threading", false, false)
-
-// Public interface to the Jump Threading pass
+ }
+
+ void releaseMemory() override { Impl.releaseMemory(); }
+ };
+
+} // end anonymous namespace
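
A hypothetical source-level picture of the transformation the class comment above describes (made-up function names; the real pass works on the CFG, not on C++ source):

  int before(bool p) {
    int X = 0;
    if (p)
      X = 4;      // on this path the later test "X < 3" is provably false
    if (X < 3)
      return 1;
    return 2;
  }

  // After threading, the predecessor that stores 4 branches directly to the
  // block the second test would have selected, so that test is only evaluated
  // on the path where its outcome is genuinely unknown.
  int after(bool p) {
    if (p)
      return 2;   // forwarded past the dead "X < 3" check
    return 1;     // X == 0 here, so "X < 3" is true
  }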
+
+char JumpThreading::ID = 0;
+
+INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(JumpThreading, "jump-threading",
+ "Jump Threading", false, false)
+
+// Public interface to the Jump Threading pass
FunctionPass *llvm::createJumpThreadingPass(bool InsertFr, int Threshold) {
return new JumpThreading(InsertFr, Threshold);
-}
-
+}
+
JumpThreadingPass::JumpThreadingPass(bool InsertFr, int T) {
InsertFreezeWhenUnfoldingSelect = JumpThreadingFreezeSelectCond | InsertFr;
- DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
-}
-
-// Update branch probability information according to conditional
-// branch probability. This is usually made possible for cloned branches
-// in inline instances by the context specific profile in the caller.
-// For instance,
-//
-// [Block PredBB]
-// [Branch PredBr]
-// if (t) {
-// Block A;
-// } else {
-// Block B;
-// }
-//
-// [Block BB]
-// cond = PN([true, %A], [..., %B]); // PHI node
-// [Branch CondBr]
-// if (cond) {
-// ... // P(cond == true) = 1%
-// }
-//
-// Here we know that when block A is taken, cond must be true, which means
-// P(cond == true | A) = 1
-//
-// Given that P(cond == true) = P(cond == true | A) * P(A) +
-// P(cond == true | B) * P(B)
-// we get:
-// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
-//
-// which gives us:
-// P(A) is less than P(cond == true), i.e.
-// P(t == true) <= P(cond == true)
-//
-// In other words, if we know P(cond == true) is unlikely, we know
-// that P(t == true) is also unlikely.
-//
-static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- if (!CondBr)
- return;
-
- uint64_t TrueWeight, FalseWeight;
- if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
- return;
-
- if (TrueWeight + FalseWeight == 0)
- // Zero branch_weights do not give a hint for getting branch probabilities.
- // Technically it would result in division by zero denominator, which is
- // TrueWeight + FalseWeight.
- return;
-
- // Returns the outgoing edge of the dominating predecessor block
- // that leads to the PhiNode's incoming block:
- auto GetPredOutEdge =
- [](BasicBlock *IncomingBB,
- BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
- auto *PredBB = IncomingBB;
- auto *SuccBB = PhiBB;
- SmallPtrSet<BasicBlock *, 16> Visited;
- while (true) {
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (PredBr && PredBr->isConditional())
- return {PredBB, SuccBB};
- Visited.insert(PredBB);
- auto *SinglePredBB = PredBB->getSinglePredecessor();
- if (!SinglePredBB)
- return {nullptr, nullptr};
-
- // Stop searching when SinglePredBB has been visited. It means we see
- // an unreachable loop.
- if (Visited.count(SinglePredBB))
- return {nullptr, nullptr};
-
- SuccBB = PredBB;
- PredBB = SinglePredBB;
- }
- };
-
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *PhiOpnd = PN->getIncomingValue(i);
- ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
-
- if (!CI || !CI->getType()->isIntegerTy(1))
- continue;
-
- BranchProbability BP =
- (CI->isOne() ? BranchProbability::getBranchProbability(
- TrueWeight, TrueWeight + FalseWeight)
- : BranchProbability::getBranchProbability(
- FalseWeight, TrueWeight + FalseWeight));
-
- auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
- if (!PredOutEdge.first)
- return;
-
- BasicBlock *PredBB = PredOutEdge.first;
- BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (!PredBr)
- return;
-
- uint64_t PredTrueWeight, PredFalseWeight;
- // FIXME: We currently only set the profile data when it is missing.
- // With PGO, this can be used to refine even existing profile data with
- // context information. This needs to be done after more performance
- // testing.
- if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
- continue;
-
- // We can not infer anything useful when BP >= 50%, because BP is the
- // upper bound probability value.
- if (BP >= BranchProbability(50, 100))
- continue;
-
- SmallVector<uint32_t, 2> Weights;
- if (PredBr->getSuccessor(0) == PredOutEdge.second) {
- Weights.push_back(BP.getNumerator());
- Weights.push_back(BP.getCompl().getNumerator());
- } else {
- Weights.push_back(BP.getCompl().getNumerator());
- Weights.push_back(BP.getNumerator());
- }
- PredBr->setMetadata(LLVMContext::MD_prof,
- MDBuilder(PredBr->getParent()->getContext())
- .createBranchWeights(Weights));
- }
-}
-
-/// runOnFunction - Toplevel algorithm.
-bool JumpThreading::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
+ DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+}
+
+// Update branch probability information according to conditional
+// branch probability. This is usually made possible for cloned branches
+// in inline instances by the context specific profile in the caller.
+// For instance,
+//
+// [Block PredBB]
+// [Branch PredBr]
+// if (t) {
+// Block A;
+// } else {
+// Block B;
+// }
+//
+// [Block BB]
+// cond = PN([true, %A], [..., %B]); // PHI node
+// [Branch CondBr]
+// if (cond) {
+// ... // P(cond == true) = 1%
+// }
+//
+// Here we know that when block A is taken, cond must be true, which means
+// P(cond == true | A) = 1
+//
+// Given that P(cond == true) = P(cond == true | A) * P(A) +
+// P(cond == true | B) * P(B)
+// we get:
+// P(cond == true ) = P(A) + P(cond == true | B) * P(B)
+//
+// which gives us:
+// P(A) is less than P(cond == true), i.e.
+// P(t == true) <= P(cond == true)
+//
+// In other words, if we know P(cond == true) is unlikely, we know
+// that P(t == true) is also unlikely.
+//
+static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return;
+
+ uint64_t TrueWeight, FalseWeight;
+ if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
+ return;
+
+ if (TrueWeight + FalseWeight == 0)
+ // Zero branch_weights do not give a hint for getting branch probabilities.
+ // Technically it would result in division by zero denominator, which is
+ // TrueWeight + FalseWeight.
+ return;
+
+ // Returns the outgoing edge of the dominating predecessor block
+ // that leads to the PhiNode's incoming block:
+ auto GetPredOutEdge =
+ [](BasicBlock *IncomingBB,
+ BasicBlock *PhiBB) -> std::pair<BasicBlock *, BasicBlock *> {
+ auto *PredBB = IncomingBB;
+ auto *SuccBB = PhiBB;
+ SmallPtrSet<BasicBlock *, 16> Visited;
+ while (true) {
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (PredBr && PredBr->isConditional())
+ return {PredBB, SuccBB};
+ Visited.insert(PredBB);
+ auto *SinglePredBB = PredBB->getSinglePredecessor();
+ if (!SinglePredBB)
+ return {nullptr, nullptr};
+
+ // Stop searching when SinglePredBB has been visited. It means we see
+ // an unreachable loop.
+ if (Visited.count(SinglePredBB))
+ return {nullptr, nullptr};
+
+ SuccBB = PredBB;
+ PredBB = SinglePredBB;
+ }
+ };
+
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *PhiOpnd = PN->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+
+ if (!CI || !CI->getType()->isIntegerTy(1))
+ continue;
+
+ BranchProbability BP =
+ (CI->isOne() ? BranchProbability::getBranchProbability(
+ TrueWeight, TrueWeight + FalseWeight)
+ : BranchProbability::getBranchProbability(
+ FalseWeight, TrueWeight + FalseWeight));
+
+ auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
+ if (!PredOutEdge.first)
+ return;
+
+ BasicBlock *PredBB = PredOutEdge.first;
+ BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBr)
+ return;
+
+ uint64_t PredTrueWeight, PredFalseWeight;
+ // FIXME: We currently only set the profile data when it is missing.
+ // With PGO, this can be used to refine even existing profile data with
+ // context information. This needs to be done after more performance
+ // testing.
+ if (PredBr->extractProfMetadata(PredTrueWeight, PredFalseWeight))
+ continue;
+
+ // We can not infer anything useful when BP >= 50%, because BP is the
+ // upper bound probability value.
+ if (BP >= BranchProbability(50, 100))
+ continue;
+
+ SmallVector<uint32_t, 2> Weights;
+ if (PredBr->getSuccessor(0) == PredOutEdge.second) {
+ Weights.push_back(BP.getNumerator());
+ Weights.push_back(BP.getCompl().getNumerator());
+ } else {
+ Weights.push_back(BP.getCompl().getNumerator());
+ Weights.push_back(BP.getNumerator());
+ }
+ PredBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(PredBr->getParent()->getContext())
+ .createBranchWeights(Weights));
+ }
+}
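
A worked instance of the bound derived in the comment block above, with illustrative numbers: suppose the profile says P(cond == true) = 0.01 and the PHI feeds cond the constant true exactly on the edge from A. Then

  \[
  P(\mathrm{cond}=\mathrm{true})
    = \underbrace{P(\mathrm{cond}=\mathrm{true}\mid A)}_{=\,1} P(A)
      + P(\mathrm{cond}=\mathrm{true}\mid B)\,P(B)
    \;\ge\; P(A),
  \]

so P(A) <= 0.01. The loop above therefore writes branch weights of 1:99 on PredBr, putting BP on the successor that leads toward A and its complement on the other, and it deliberately bails out when BP >= 50%, where this upper bound says nothing useful.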
+
+/// runOnFunction - Toplevel algorithm.
+bool JumpThreading::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
auto TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
 // Jump threading makes no sense for targets with divergent CF
if (TTI->hasBranchDivergence())
return false;
- auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
- auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
- if (F.hasProfileData()) {
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
- }
-
- bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
- std::move(BFI), std::move(BPI));
- if (PrintLVIAfterJumpThreading) {
- dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, DTU.getDomTree(), dbgs());
- }
- return Changed;
-}
-
-PreservedAnalyses JumpThreadingPass::run(Function &F,
- FunctionAnalysisManager &AM) {
+ auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+ auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DomTreeUpdater DTU(*DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ if (F.hasProfileData()) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = Impl.runImpl(F, TLI, LVI, AA, &DTU, F.hasProfileData(),
+ std::move(BFI), std::move(BPI));
+ if (PrintLVIAfterJumpThreading) {
+ dbgs() << "LVI for function '" << F.getName() << "':\n";
+ LVI->printLVI(F, DTU.getDomTree(), dbgs());
+ }
+ return Changed;
+}
+
+PreservedAnalyses JumpThreadingPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
 // Jump threading makes no sense for targets with divergent CF
if (TTI.hasBranchDivergence())
return PreservedAnalyses::all();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LVI = AM.getResult<LazyValueAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
-
- std::unique_ptr<BlockFrequencyInfo> BFI;
- std::unique_ptr<BranchProbabilityInfo> BPI;
- if (F.hasProfileData()) {
- LoopInfo LI{DominatorTree(F)};
- BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
- BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
- }
-
- bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
- std::move(BFI), std::move(BPI));
-
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+
+ std::unique_ptr<BlockFrequencyInfo> BFI;
+ std::unique_ptr<BranchProbabilityInfo> BPI;
+ if (F.hasProfileData()) {
+ LoopInfo LI{DominatorTree(F)};
+ BPI.reset(new BranchProbabilityInfo(F, LI, &TLI));
+ BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
+ }
+
+ bool Changed = runImpl(F, &TLI, &LVI, &AA, &DTU, F.hasProfileData(),
+ std::move(BFI), std::move(BPI));
+
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
LVI.printLVI(F, DTU.getDomTree(), dbgs());
}
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LazyValueAnalysis>();
- return PA;
-}
-
-bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
- LazyValueInfo *LVI_, AliasAnalysis *AA_,
- DomTreeUpdater *DTU_, bool HasProfileData_,
- std::unique_ptr<BlockFrequencyInfo> BFI_,
- std::unique_ptr<BranchProbabilityInfo> BPI_) {
- LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
- TLI = TLI_;
- LVI = LVI_;
- AA = AA_;
- DTU = DTU_;
- BFI.reset();
- BPI.reset();
- // When profile data is available, we need to update edge weights after
- // successful jump threading, which requires both BPI and BFI being available.
- HasProfileData = HasProfileData_;
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- HasGuards = GuardDecl && !GuardDecl->use_empty();
- if (HasProfileData) {
- BPI = std::move(BPI_);
- BFI = std::move(BFI_);
- }
-
- // Reduce the number of instructions duplicated when optimizing strictly for
- // size.
- if (BBDuplicateThreshold.getNumOccurrences())
- BBDupThreshold = BBDuplicateThreshold;
- else if (F.hasFnAttribute(Attribute::MinSize))
- BBDupThreshold = 3;
- else
- BBDupThreshold = DefaultBBDupThreshold;
-
- // JumpThreading must not process blocks unreachable from entry. It's a
- // waste of compute time and can potentially lead to hangs.
- SmallPtrSet<BasicBlock *, 16> Unreachable;
- assert(DTU && "DTU isn't passed into JumpThreading before using it.");
- assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
- DominatorTree &DT = DTU->getDomTree();
- for (auto &BB : F)
- if (!DT.isReachableFromEntry(&BB))
- Unreachable.insert(&BB);
-
- if (!ThreadAcrossLoopHeaders)
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LazyValueAnalysis>();
+ return PA;
+}
+
+bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
+ LazyValueInfo *LVI_, AliasAnalysis *AA_,
+ DomTreeUpdater *DTU_, bool HasProfileData_,
+ std::unique_ptr<BlockFrequencyInfo> BFI_,
+ std::unique_ptr<BranchProbabilityInfo> BPI_) {
+ LLVM_DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
+ TLI = TLI_;
+ LVI = LVI_;
+ AA = AA_;
+ DTU = DTU_;
+ BFI.reset();
+ BPI.reset();
+ // When profile data is available, we need to update edge weights after
+ // successful jump threading, which requires both BPI and BFI being available.
+ HasProfileData = HasProfileData_;
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ HasGuards = GuardDecl && !GuardDecl->use_empty();
+ if (HasProfileData) {
+ BPI = std::move(BPI_);
+ BFI = std::move(BFI_);
+ }
+
+ // Reduce the number of instructions duplicated when optimizing strictly for
+ // size.
+ if (BBDuplicateThreshold.getNumOccurrences())
+ BBDupThreshold = BBDuplicateThreshold;
+ else if (F.hasFnAttribute(Attribute::MinSize))
+ BBDupThreshold = 3;
+ else
+ BBDupThreshold = DefaultBBDupThreshold;
+
+ // JumpThreading must not process blocks unreachable from entry. It's a
+ // waste of compute time and can potentially lead to hangs.
+ SmallPtrSet<BasicBlock *, 16> Unreachable;
+ assert(DTU && "DTU isn't passed into JumpThreading before using it.");
+ assert(DTU->hasDomTree() && "JumpThreading relies on DomTree to proceed.");
+ DominatorTree &DT = DTU->getDomTree();
+ for (auto &BB : F)
+ if (!DT.isReachableFromEntry(&BB))
+ Unreachable.insert(&BB);
+
+ if (!ThreadAcrossLoopHeaders)
findLoopHeaders(F);
-
- bool EverChanged = false;
- bool Changed;
- do {
- Changed = false;
- for (auto &BB : F) {
- if (Unreachable.count(&BB))
- continue;
+
+ bool EverChanged = false;
+ bool Changed;
+ do {
+ Changed = false;
+ for (auto &BB : F) {
+ if (Unreachable.count(&BB))
+ continue;
while (processBlock(&BB)) // Thread all of the branches we can over BB.
- Changed = true;
-
- // Jump threading may have introduced redundant debug values into BB
- // which should be removed.
- if (Changed)
- RemoveRedundantDbgInstrs(&BB);
-
- // Stop processing BB if it's the entry or is now deleted. The following
- // routines attempt to eliminate BB, and locating a suitable replacement
- // for the entry is non-trivial.
- if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
- continue;
-
- if (pred_empty(&BB)) {
+ Changed = true;
+
+ // Jump threading may have introduced redundant debug values into BB
+ // which should be removed.
+ if (Changed)
+ RemoveRedundantDbgInstrs(&BB);
+
+ // Stop processing BB if it's the entry or is now deleted. The following
+ // routines attempt to eliminate BB, and locating a suitable replacement
+ // for the entry is non-trivial.
+ if (&BB == &F.getEntryBlock() || DTU->isBBPendingDeletion(&BB))
+ continue;
+
+ if (pred_empty(&BB)) {
// When processBlock makes BB unreachable it doesn't bother to fix up
- // the instructions in it. We must remove BB to prevent invalid IR.
- LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
- << "' with terminator: " << *BB.getTerminator()
- << '\n');
- LoopHeaders.erase(&BB);
- LVI->eraseBlock(&BB);
- DeleteDeadBlock(&BB, DTU);
- Changed = true;
- continue;
- }
-
+ // the instructions in it. We must remove BB to prevent invalid IR.
+ LLVM_DEBUG(dbgs() << " JT: Deleting dead block '" << BB.getName()
+ << "' with terminator: " << *BB.getTerminator()
+ << '\n');
+ LoopHeaders.erase(&BB);
+ LVI->eraseBlock(&BB);
+ DeleteDeadBlock(&BB, DTU);
+ Changed = true;
+ continue;
+ }
+
// processBlock doesn't thread BBs with unconditional TIs. However, if BB
- // is "almost empty", we attempt to merge BB with its sole successor.
- auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
- if (BI && BI->isUnconditional()) {
- BasicBlock *Succ = BI->getSuccessor(0);
- if (
- // The terminator must be the only non-phi instruction in BB.
- BB.getFirstNonPHIOrDbg()->isTerminator() &&
- // Don't alter Loop headers and latches to ensure another pass can
- // detect and transform nested loops later.
- !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
- TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
- RemoveRedundantDbgInstrs(Succ);
- // BB is valid for cleanup here because we passed in DTU. F remains
- // BB's parent until a DTU->getDomTree() event.
- LVI->eraseBlock(&BB);
- Changed = true;
- }
- }
- }
- EverChanged |= Changed;
- } while (Changed);
-
- LoopHeaders.clear();
- return EverChanged;
-}
-
-// Replace uses of Cond with ToVal when safe to do so. If all uses are
-// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
-// because we may incorrectly replace uses when guards/assumes are uses
-// of `Cond` and we used the guards/assume to reason about the `Cond` value
-// at the end of block. RAUW unconditionally replaces all uses
-// including the guards/assumes themselves and the uses before the
-// guard/assume.
+ // is "almost empty", we attempt to merge BB with its sole successor.
+ auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
+ if (BI && BI->isUnconditional()) {
+ BasicBlock *Succ = BI->getSuccessor(0);
+ if (
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
+ RemoveRedundantDbgInstrs(Succ);
+ // BB is valid for cleanup here because we passed in DTU. F remains
+ // BB's parent until a DTU->getDomTree() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
+ }
+ }
+ }
+ EverChanged |= Changed;
+ } while (Changed);
+
+ LoopHeaders.clear();
+ return EverChanged;
+}
+
+// Replace uses of Cond with ToVal when safe to do so. If all uses are
+// replaced, we can remove Cond. We cannot blindly replace all uses of Cond
+// because we may incorrectly replace uses when guards/assumes are uses
+// of `Cond` and we used the guards/assume to reason about the `Cond` value
+// at the end of block. RAUW unconditionally replaces all uses
+// including the guards/assumes themselves and the uses before the
+// guard/assume.
static void replaceFoldableUses(Instruction *Cond, Value *ToVal) {
- assert(Cond->getType() == ToVal->getType());
- auto *BB = Cond->getParent();
- // We can unconditionally replace all uses in non-local blocks (i.e. uses
- // strictly dominated by BB), since LVI information is true from the
- // terminator of BB.
- replaceNonLocalUsesWith(Cond, ToVal);
- for (Instruction &I : reverse(*BB)) {
- // Reached the Cond whose uses we are trying to replace, so there are no
- // more uses.
- if (&I == Cond)
- break;
- // We only replace uses in instructions that are guaranteed to reach the end
- // of BB, where we know Cond is ToVal.
- if (!isGuaranteedToTransferExecutionToSuccessor(&I))
- break;
- I.replaceUsesOfWith(Cond, ToVal);
- }
- if (Cond->use_empty() && !Cond->mayHaveSideEffects())
- Cond->eraseFromParent();
-}
-
-/// Return the cost of duplicating a piece of this block from first non-phi
-/// and before StopAt instruction to thread across it. Stop scanning the block
-/// when exceeding the threshold. If duplication is impossible, returns ~0U.
-static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
- Instruction *StopAt,
- unsigned Threshold) {
- assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
- /// Ignore PHI nodes, these will be flattened when duplication happens.
- BasicBlock::const_iterator I(BB->getFirstNonPHI());
-
- // FIXME: THREADING will delete values that are just used to compute the
- // branch, so they shouldn't count against the duplication cost.
-
- unsigned Bonus = 0;
- if (BB->getTerminator() == StopAt) {
- // Threading through a switch statement is particularly profitable. If this
- // block ends in a switch, decrease its cost to make it more likely to
- // happen.
- if (isa<SwitchInst>(StopAt))
- Bonus = 6;
-
- // The same holds for indirect branches, but slightly more so.
- if (isa<IndirectBrInst>(StopAt))
- Bonus = 8;
- }
-
- // Bump the threshold up so the early exit from the loop doesn't skip the
- // terminator-based Size adjustment at the end.
- Threshold += Bonus;
-
- // Sum up the cost of each instruction until we get to the terminator. Don't
- // include the terminator because the copy won't include it.
- unsigned Size = 0;
- for (; &*I != StopAt; ++I) {
-
- // Stop scanning the block if we've reached the threshold.
- if (Size > Threshold)
- return Size;
-
- // Debugger intrinsics don't incur code size.
- if (isa<DbgInfoIntrinsic>(I)) continue;
-
+ assert(Cond->getType() == ToVal->getType());
+ auto *BB = Cond->getParent();
+ // We can unconditionally replace all uses in non-local blocks (i.e. uses
+ // strictly dominated by BB), since LVI information is true from the
+ // terminator of BB.
+ replaceNonLocalUsesWith(Cond, ToVal);
+ for (Instruction &I : reverse(*BB)) {
+ // Reached the Cond whose uses we are trying to replace, so there are no
+ // more uses.
+ if (&I == Cond)
+ break;
+ // We only replace uses in instructions that are guaranteed to reach the end
+ // of BB, where we know Cond is ToVal.
+ if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+ break;
+ I.replaceUsesOfWith(Cond, ToVal);
+ }
+ if (Cond->use_empty() && !Cond->mayHaveSideEffects())
+ Cond->eraseFromParent();
+}
+
+/// Return the cost of duplicating a piece of this block from first non-phi
+/// and before StopAt instruction to thread across it. Stop scanning the block
+/// when exceeding the threshold. If duplication is impossible, returns ~0U.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+ Instruction *StopAt,
+ unsigned Threshold) {
+ assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
+ /// Ignore PHI nodes, these will be flattened when duplication happens.
+ BasicBlock::const_iterator I(BB->getFirstNonPHI());
+
+ // FIXME: THREADING will delete values that are just used to compute the
+ // branch, so they shouldn't count against the duplication cost.
+
+ unsigned Bonus = 0;
+ if (BB->getTerminator() == StopAt) {
+ // Threading through a switch statement is particularly profitable. If this
+ // block ends in a switch, decrease its cost to make it more likely to
+ // happen.
+ if (isa<SwitchInst>(StopAt))
+ Bonus = 6;
+
+ // The same holds for indirect branches, but slightly more so.
+ if (isa<IndirectBrInst>(StopAt))
+ Bonus = 8;
+ }
+
+ // Bump the threshold up so the early exit from the loop doesn't skip the
+ // terminator-based Size adjustment at the end.
+ Threshold += Bonus;
+
+ // Sum up the cost of each instruction until we get to the terminator. Don't
+ // include the terminator because the copy won't include it.
+ unsigned Size = 0;
+ for (; &*I != StopAt; ++I) {
+
+ // Stop scanning the block if we've reached the threshold.
+ if (Size > Threshold)
+ return Size;
+
+ // Debugger intrinsics don't incur code size.
+ if (isa<DbgInfoIntrinsic>(I)) continue;
+
// Pseudo-probes don't incur code size.
if (isa<PseudoProbeInst>(I))
continue;
- // If this is a pointer->pointer bitcast, it is free.
- if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
- continue;
-
+ // If this is a pointer->pointer bitcast, it is free.
+ if (isa<BitCastInst>(I) && I->getType()->isPointerTy())
+ continue;
+
// Freeze instruction is free, too.
if (isa<FreezeInst>(I))
continue;
- // Bail out if this instruction gives back a token type, it is not possible
- // to duplicate it if it is used outside this BB.
- if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
- return ~0U;
-
- // All other instructions count for at least one unit.
- ++Size;
-
- // Calls are more expensive. If they are non-intrinsic calls, we model them
- // as having cost of 4. If they are a non-vector intrinsic, we model them
- // as having cost of 2 total, and if they are a vector intrinsic, we model
- // them as having cost 1.
- if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (CI->cannotDuplicate() || CI->isConvergent())
- // Blocks with NoDuplicate are modelled as having infinite cost, so they
- // are never duplicated.
- return ~0U;
- else if (!isa<IntrinsicInst>(CI))
- Size += 3;
- else if (!CI->getType()->isVectorTy())
- Size += 1;
- }
- }
-
- return Size > Bonus ? Size - Bonus : 0;
-}
-
+ // Bail out if this instruction gives back a token type, it is not possible
+ // to duplicate it if it is used outside this BB.
+ if (I->getType()->isTokenTy() && I->isUsedOutsideOfBlock(BB))
+ return ~0U;
+
+ // All other instructions count for at least one unit.
+ ++Size;
+
+ // Calls are more expensive. If they are non-intrinsic calls, we model them
+ // as having cost of 4. If they are a non-vector intrinsic, we model them
+ // as having cost of 2 total, and if they are a vector intrinsic, we model
+ // them as having cost 1.
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (CI->cannotDuplicate() || CI->isConvergent())
+ // Blocks with NoDuplicate are modelled as having infinite cost, so they
+ // are never duplicated.
+ return ~0U;
+ else if (!isa<IntrinsicInst>(CI))
+ Size += 3;
+ else if (!CI->getType()->isVectorTy())
+ Size += 1;
+ }
+ }
+
+ return Size > Bonus ? Size - Bonus : 0;
+}
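
A worked example of the cost model above, for a hypothetical block that ends in a switch and contains one non-intrinsic call, one scalar intrinsic call, three ordinary instructions, and one pointer-to-pointer bitcast:

  \[
  \mathrm{Size} = \underbrace{4}_{\text{call}} + \underbrace{2}_{\text{scalar intrinsic}}
    + \underbrace{3}_{\text{plain instructions}} + \underbrace{0}_{\text{bitcast}} = 9,
  \qquad \mathrm{cost} = \mathrm{Size} - \mathrm{Bonus} = 9 - 6 = 3.
  \]

With an indirect branch instead of the switch the bonus is 8 (cost 1), and a call marked cannotDuplicate or convergent makes the block non-duplicable (~0U) no matter how small it is. The threshold was bumped by the same bonus before the scan, so the early exit and this final adjustment remain consistent.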
+
/// findLoopHeaders - We do not want jump threading to turn proper loop
-/// structures into irreducible loops. Doing this breaks up the loop nesting
-/// hierarchy and pessimizes later transformations. To prevent this from
-/// happening, we first have to find the loop headers. Here we approximate this
-/// by finding targets of backedges in the CFG.
-///
-/// Note that there definitely are cases when we want to allow threading of
-/// edges across a loop header. For example, threading a jump from outside the
-/// loop (the preheader) to an exit block of the loop is definitely profitable.
-/// It is also almost always profitable to thread backedges from within the loop
-/// to exit blocks, and is often profitable to thread backedges to other blocks
-/// within the loop (forming a nested loop). This simple analysis is not rich
-/// enough to track all of these properties and keep it up-to-date as the CFG
-/// mutates, so we don't allow any of these transformations.
+/// structures into irreducible loops. Doing this breaks up the loop nesting
+/// hierarchy and pessimizes later transformations. To prevent this from
+/// happening, we first have to find the loop headers. Here we approximate this
+/// by finding targets of backedges in the CFG.
+///
+/// Note that there definitely are cases when we want to allow threading of
+/// edges across a loop header. For example, threading a jump from outside the
+/// loop (the preheader) to an exit block of the loop is definitely profitable.
+/// It is also almost always profitable to thread backedges from within the loop
+/// to exit blocks, and is often profitable to thread backedges to other blocks
+/// within the loop (forming a nested loop). This simple analysis is not rich
+/// enough to track all of these properties and keep it up-to-date as the CFG
+/// mutates, so we don't allow any of these transformations.
void JumpThreadingPass::findLoopHeaders(Function &F) {
- SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
- FindFunctionBackedges(F, Edges);
-
- for (const auto &Edge : Edges)
- LoopHeaders.insert(Edge.second);
-}
-
-/// getKnownConstant - Helper method to determine if we can thread over a
-/// terminator with the given value as its condition, and if so what value to
-/// use for that. What kind of value this is depends on whether we want an
-/// integer or a block address, but an undef is always accepted.
-/// Returns null if Val is null or not an appropriate constant.
-static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
- if (!Val)
- return nullptr;
-
- // Undef is "known" enough.
- if (UndefValue *U = dyn_cast<UndefValue>(Val))
- return U;
-
- if (Preference == WantBlockAddress)
- return dyn_cast<BlockAddress>(Val->stripPointerCasts());
-
- return dyn_cast<ConstantInt>(Val);
-}
-
+ SmallVector<std::pair<const BasicBlock*,const BasicBlock*>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
+
+ for (const auto &Edge : Edges)
+ LoopHeaders.insert(Edge.second);
+}
+
+/// getKnownConstant - Helper method to determine if we can thread over a
+/// terminator with the given value as its condition, and if so what value to
+/// use for that. What kind of value this is depends on whether we want an
+/// integer or a block address, but an undef is always accepted.
+/// Returns null if Val is null or not an appropriate constant.
+static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
+ if (!Val)
+ return nullptr;
+
+ // Undef is "known" enough.
+ if (UndefValue *U = dyn_cast<UndefValue>(Val))
+ return U;
+
+ if (Preference == WantBlockAddress)
+ return dyn_cast<BlockAddress>(Val->stripPointerCasts());
+
+ return dyn_cast<ConstantInt>(Val);
+}
+
/// computeValueKnownInPredecessors - Given a basic block BB and a value V, see
-/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
-/// in any of our predecessors. If so, return the known list of value and pred
-/// BB in the result vector.
-///
-/// This returns true if there were any known values.
+/// if we can infer that the value is a known ConstantInt/BlockAddress or undef
+/// in any of our predecessors. If so, return the known list of value and pred
+/// BB in the result vector.
+///
+/// This returns true if there were any known values.
bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
- Value *V, BasicBlock *BB, PredValueInfo &Result,
- ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
- Instruction *CxtI) {
- // This method walks up use-def chains recursively. Because of this, we could
- // get into an infinite loop going around loops in the use-def chain. To
- // prevent this, keep track of what (value, block) pairs we've already visited
- // and terminate the search if we loop back to them
- if (!RecursionSet.insert(V).second)
- return false;
-
- // If V is a constant, then it is known in all predecessors.
- if (Constant *KC = getKnownConstant(V, Preference)) {
- for (BasicBlock *Pred : predecessors(BB))
- Result.emplace_back(KC, Pred);
-
- return !Result.empty();
- }
-
- // If V is a non-instruction value, or an instruction in a different block,
- // then it can't be derived from a PHI.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || I->getParent() != BB) {
-
- // Okay, if this is a live-in value, see if it has a known value at the end
- // of any of our predecessors.
- //
- // FIXME: This should be an edge property, not a block end property.
- /// TODO: Per PR2563, we could infer value range information about a
- /// predecessor based on its terminator.
- //
- // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
- // "I" is a non-local compare-with-a-constant instruction. This would be
- // able to handle value inequalities better, for example if the compare is
- // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
- // Perhaps getConstantOnEdge should be smart enough to do this?
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a constant in a
- // predecessor, use that information to try to thread this block.
- Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
- if (Constant *KC = getKnownConstant(PredCst, Preference))
- Result.emplace_back(KC, P);
- }
-
- return !Result.empty();
- }
-
- /// If I is a PHI node, then we know the incoming values for any constants.
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *InVal = PN->getIncomingValue(i);
- if (Constant *KC = getKnownConstant(InVal, Preference)) {
- Result.emplace_back(KC, PN->getIncomingBlock(i));
- } else {
- Constant *CI = LVI->getConstantOnEdge(InVal,
- PN->getIncomingBlock(i),
- BB, CxtI);
- if (Constant *KC = getKnownConstant(CI, Preference))
- Result.emplace_back(KC, PN->getIncomingBlock(i));
- }
- }
-
- return !Result.empty();
- }
-
+ Value *V, BasicBlock *BB, PredValueInfo &Result,
+ ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
+ Instruction *CxtI) {
+ // This method walks up use-def chains recursively. Because of this, we could
+ // get into an infinite loop going around loops in the use-def chain. To
+ // prevent this, keep track of what (value, block) pairs we've already visited
+ // and terminate the search if we loop back to them
+ if (!RecursionSet.insert(V).second)
+ return false;
+
+ // If V is a constant, then it is known in all predecessors.
+ if (Constant *KC = getKnownConstant(V, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.emplace_back(KC, Pred);
+
+ return !Result.empty();
+ }
+
+ // If V is a non-instruction value, or an instruction in a different block,
+ // then it can't be derived from a PHI.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getParent() != BB) {
+
+ // Okay, if this is a live-in value, see if it has a known value at the end
+ // of any of our predecessors.
+ //
+ // FIXME: This should be an edge property, not a block end property.
+ /// TODO: Per PR2563, we could infer value range information about a
+ /// predecessor based on its terminator.
+ //
+ // FIXME: change this to use the more-rich 'getPredicateOnEdge' method if
+ // "I" is a non-local compare-with-a-constant instruction. This would be
+ // able to handle value inequalities better, for example if the compare is
+ // "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
+ // Perhaps getConstantOnEdge should be smart enough to do this?
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
+ if (Constant *KC = getKnownConstant(PredCst, Preference))
+ Result.emplace_back(KC, P);
+ }
+
+ return !Result.empty();
+ }
+
+ /// If I is a PHI node, then we know the incoming values for any constants.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *InVal = PN->getIncomingValue(i);
+ if (Constant *KC = getKnownConstant(InVal, Preference)) {
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
+ } else {
+ Constant *CI = LVI->getConstantOnEdge(InVal,
+ PN->getIncomingBlock(i),
+ BB, CxtI);
+ if (Constant *KC = getKnownConstant(CI, Preference))
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
+ }
+ }
+
+ return !Result.empty();
+ }
+
// Handle Cast instructions.
- if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Value *Source = CI->getOperand(0);
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Source = CI->getOperand(0);
computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
- RecursionSet, CxtI);
- if (Result.empty())
- return false;
-
- // Convert the known values.
- for (auto &R : Result)
- R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
-
- return true;
- }
-
+ RecursionSet, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Convert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getCast(CI->getOpcode(), R.first, CI->getType());
+
+ return true;
+ }
+
if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
Value *Source = FI->getOperand(0);
computeValueKnownInPredecessorsImpl(Source, BB, Result, Preference,
@@ -731,1351 +731,1351 @@ bool JumpThreadingPass::computeValueKnownInPredecessorsImpl(
return !Result.empty();
}
- // Handle some boolean conditions.
- if (I->getType()->getPrimitiveSizeInBits() == 1) {
- assert(Preference == WantInteger && "One-bit non-integer type?");
- // X | true -> true
- // X & false -> false
- if (I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::And) {
- PredValueInfoTy LHSVals, RHSVals;
-
+ // Handle some boolean conditions.
+ if (I->getType()->getPrimitiveSizeInBits() == 1) {
+ assert(Preference == WantInteger && "One-bit non-integer type?");
+ // X | true -> true
+ // X & false -> false
+ if (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And) {
+ PredValueInfoTy LHSVals, RHSVals;
+
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
+ WantInteger, RecursionSet, CxtI);
computeValueKnownInPredecessorsImpl(I->getOperand(1), BB, RHSVals,
- WantInteger, RecursionSet, CxtI);
-
- if (LHSVals.empty() && RHSVals.empty())
- return false;
-
- ConstantInt *InterestingVal;
- if (I->getOpcode() == Instruction::Or)
- InterestingVal = ConstantInt::getTrue(I->getContext());
- else
- InterestingVal = ConstantInt::getFalse(I->getContext());
-
- SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
-
- // Scan for the sentinel. If we find an undef, force it to the
- // interesting value: x|undef -> true and x&undef -> false.
- for (const auto &LHSVal : LHSVals)
- if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
- Result.emplace_back(InterestingVal, LHSVal.second);
- LHSKnownBBs.insert(LHSVal.second);
- }
- for (const auto &RHSVal : RHSVals)
- if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
- // If we already inferred a value for this block on the LHS, don't
- // re-add it.
- if (!LHSKnownBBs.count(RHSVal.second))
- Result.emplace_back(InterestingVal, RHSVal.second);
- }
-
- return !Result.empty();
- }
-
- // Handle the NOT form of XOR.
- if (I->getOpcode() == Instruction::Xor &&
- isa<ConstantInt>(I->getOperand(1)) &&
- cast<ConstantInt>(I->getOperand(1))->isOne()) {
+ WantInteger, RecursionSet, CxtI);
+
+ if (LHSVals.empty() && RHSVals.empty())
+ return false;
+
+ ConstantInt *InterestingVal;
+ if (I->getOpcode() == Instruction::Or)
+ InterestingVal = ConstantInt::getTrue(I->getContext());
+ else
+ InterestingVal = ConstantInt::getFalse(I->getContext());
+
+ SmallPtrSet<BasicBlock*, 4> LHSKnownBBs;
+
+ // Scan for the sentinel. If we find an undef, force it to the
+ // interesting value: x|undef -> true and x&undef -> false.
+ for (const auto &LHSVal : LHSVals)
+ if (LHSVal.first == InterestingVal || isa<UndefValue>(LHSVal.first)) {
+ Result.emplace_back(InterestingVal, LHSVal.second);
+ LHSKnownBBs.insert(LHSVal.second);
+ }
+ for (const auto &RHSVal : RHSVals)
+ if (RHSVal.first == InterestingVal || isa<UndefValue>(RHSVal.first)) {
+ // If we already inferred a value for this block on the LHS, don't
+ // re-add it.
+ if (!LHSKnownBBs.count(RHSVal.second))
+ Result.emplace_back(InterestingVal, RHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle the NOT form of XOR.
+ if (I->getOpcode() == Instruction::Xor &&
+ isa<ConstantInt>(I->getOperand(1)) &&
+ cast<ConstantInt>(I->getOperand(1))->isOne()) {
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, Result,
- WantInteger, RecursionSet, CxtI);
- if (Result.empty())
- return false;
-
- // Invert the known values.
- for (auto &R : Result)
- R.first = ConstantExpr::getNot(R.first);
-
- return true;
- }
-
- // Try to simplify some other binary operator values.
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
- assert(Preference != WantBlockAddress
- && "A binary operator creating a block address?");
- if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
- PredValueInfoTy LHSVals;
+ WantInteger, RecursionSet, CxtI);
+ if (Result.empty())
+ return false;
+
+ // Invert the known values.
+ for (auto &R : Result)
+ R.first = ConstantExpr::getNot(R.first);
+
+ return true;
+ }
+
+ // Try to simplify some other binary operator values.
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+ assert(Preference != WantBlockAddress
+ && "A binary operator creating a block address?");
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+ PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(BO->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
-
- // Try to use constant folding to simplify the binary operator.
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
-
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.emplace_back(KC, LHSVal.second);
- }
- }
-
- return !Result.empty();
- }
-
- // Handle compare with phi operand, where the PHI is defined in this block.
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- assert(Preference == WantInteger && "Compares only produce integers");
- Type *CmpType = Cmp->getType();
- Value *CmpLHS = Cmp->getOperand(0);
- Value *CmpRHS = Cmp->getOperand(1);
- CmpInst::Predicate Pred = Cmp->getPredicate();
-
- PHINode *PN = dyn_cast<PHINode>(CmpLHS);
- if (!PN)
- PN = dyn_cast<PHINode>(CmpRHS);
- if (PN && PN->getParent() == BB) {
- const DataLayout &DL = PN->getModule()->getDataLayout();
- // We can do this simplification if any comparisons fold to true or false.
- // See if any do.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- Value *LHS, *RHS;
- if (PN == CmpLHS) {
- LHS = PN->getIncomingValue(i);
- RHS = CmpRHS->DoPHITranslation(BB, PredBB);
- } else {
- LHS = CmpLHS->DoPHITranslation(BB, PredBB);
- RHS = PN->getIncomingValue(i);
- }
- Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
- if (!Res) {
- if (!isa<Constant>(RHS))
- continue;
-
- // The getPredicateOnEdge call makes no sense if LHS is defined in BB.
- auto LHSInst = dyn_cast<Instruction>(LHS);
- if (LHSInst && LHSInst->getParent() == BB)
- continue;
-
- LazyValueInfo::Tristate
- ResT = LVI->getPredicateOnEdge(Pred, LHS,
- cast<Constant>(RHS), PredBB, BB,
- CxtI ? CxtI : Cmp);
- if (ResT == LazyValueInfo::Unknown)
- continue;
- Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
- }
-
- if (Constant *KC = getKnownConstant(Res, WantInteger))
- Result.emplace_back(KC, PredBB);
- }
-
- return !Result.empty();
- }
-
- // If comparing a live-in value against a constant, see if we know the
- // live-in value on any predecessors.
- if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
- Constant *CmpConst = cast<Constant>(CmpRHS);
-
- if (!isa<Instruction>(CmpLHS) ||
- cast<Instruction>(CmpLHS)->getParent() != BB) {
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a constant in a
- // predecessor, use that information to try to thread this block.
- LazyValueInfo::Tristate Res =
- LVI->getPredicateOnEdge(Pred, CmpLHS,
- CmpConst, P, BB, CxtI ? CxtI : Cmp);
- if (Res == LazyValueInfo::Unknown)
- continue;
-
- Constant *ResC = ConstantInt::get(CmpType, Res);
- Result.emplace_back(ResC, P);
- }
-
- return !Result.empty();
- }
-
- // InstCombine can fold some forms of constant range checks into
- // (icmp (add x, C1), C2). See if we have such a thing with
- // x as a live-in.
- {
- using namespace PatternMatch;
-
- Value *AddLHS;
- ConstantInt *AddConst;
- if (isa<ConstantInt>(CmpConst) &&
- match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
- if (!isa<Instruction>(AddLHS) ||
- cast<Instruction>(AddLHS)->getParent() != BB) {
- for (BasicBlock *P : predecessors(BB)) {
- // If the value is known by LazyValueInfo to be a ConstantRange in
- // a predecessor, use that information to try to thread this
- // block.
- ConstantRange CR = LVI->getConstantRangeOnEdge(
- AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
- // Propagate the range through the addition.
- CR = CR.add(AddConst->getValue());
-
- // Get the range where the compare returns true.
- ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
- Pred, cast<ConstantInt>(CmpConst)->getValue());
-
- Constant *ResC;
- if (CmpRange.contains(CR))
- ResC = ConstantInt::getTrue(CmpType);
- else if (CmpRange.inverse().contains(CR))
- ResC = ConstantInt::getFalse(CmpType);
- else
- continue;
-
- Result.emplace_back(ResC, P);
- }
-
- return !Result.empty();
- }
- }
- }
-
- // Try to find a constant value for the LHS of a comparison,
- // and evaluate it statically if we can.
- PredValueInfoTy LHSVals;
+ WantInteger, RecursionSet, CxtI);
+
+ // Try to use constant folding to simplify the binary operator.
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
+
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.emplace_back(KC, LHSVal.second);
+ }
+ }
+
+ return !Result.empty();
+ }
+
+ // Handle compare with phi operand, where the PHI is defined in this block.
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ assert(Preference == WantInteger && "Compares only produce integers");
+ Type *CmpType = Cmp->getType();
+ Value *CmpLHS = Cmp->getOperand(0);
+ Value *CmpRHS = Cmp->getOperand(1);
+ CmpInst::Predicate Pred = Cmp->getPredicate();
+
+ PHINode *PN = dyn_cast<PHINode>(CmpLHS);
+ if (!PN)
+ PN = dyn_cast<PHINode>(CmpRHS);
+ if (PN && PN->getParent() == BB) {
+ const DataLayout &DL = PN->getModule()->getDataLayout();
+ // We can do this simplification if any comparisons fold to true or false.
+ // See if any do.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ Value *LHS, *RHS;
+ if (PN == CmpLHS) {
+ LHS = PN->getIncomingValue(i);
+ RHS = CmpRHS->DoPHITranslation(BB, PredBB);
+ } else {
+ LHS = CmpLHS->DoPHITranslation(BB, PredBB);
+ RHS = PN->getIncomingValue(i);
+ }
+ Value *Res = SimplifyCmpInst(Pred, LHS, RHS, {DL});
+ if (!Res) {
+ if (!isa<Constant>(RHS))
+ continue;
+
+ // The getPredicateOnEdge call makes no sense if LHS is defined in BB.
+ auto LHSInst = dyn_cast<Instruction>(LHS);
+ if (LHSInst && LHSInst->getParent() == BB)
+ continue;
+
+ LazyValueInfo::Tristate
+ ResT = LVI->getPredicateOnEdge(Pred, LHS,
+ cast<Constant>(RHS), PredBB, BB,
+ CxtI ? CxtI : Cmp);
+ if (ResT == LazyValueInfo::Unknown)
+ continue;
+ Res = ConstantInt::get(Type::getInt1Ty(LHS->getContext()), ResT);
+ }
+
+ if (Constant *KC = getKnownConstant(Res, WantInteger))
+ Result.emplace_back(KC, PredBB);
+ }
+
+ return !Result.empty();
+ }
+
+ // If comparing a live-in value against a constant, see if we know the
+ // live-in value on any predecessors.
+ if (isa<Constant>(CmpRHS) && !CmpType->isVectorTy()) {
+ Constant *CmpConst = cast<Constant>(CmpRHS);
+
+ if (!isa<Instruction>(CmpLHS) ||
+ cast<Instruction>(CmpLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a constant in a
+ // predecessor, use that information to try to thread this block.
+ LazyValueInfo::Tristate Res =
+ LVI->getPredicateOnEdge(Pred, CmpLHS,
+ CmpConst, P, BB, CxtI ? CxtI : Cmp);
+ if (Res == LazyValueInfo::Unknown)
+ continue;
+
+ Constant *ResC = ConstantInt::get(CmpType, Res);
+ Result.emplace_back(ResC, P);
+ }
+
+ return !Result.empty();
+ }
+
+ // InstCombine can fold some forms of constant range checks into
+ // (icmp (add x, C1), C2). See if we have such a thing with
+ // x as a live-in.
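+ // Illustrative example (hypothetical numbers, not from the original code):
+ // if LVI reports that the live-in x has the range [0, 6) on the edge
+ // P -> BB, and the compare is (icmp ult (add x, 2), 8), then propagating
+ // the range through the add gives [2, 8). The "true" region of the compare
+ // is [0, 8), which contains [2, 8), so the compare folds to true for the
+ // predecessor P.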
+ {
+ using namespace PatternMatch;
+
+ Value *AddLHS;
+ ConstantInt *AddConst;
+ if (isa<ConstantInt>(CmpConst) &&
+ match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
+ if (!isa<Instruction>(AddLHS) ||
+ cast<Instruction>(AddLHS)->getParent() != BB) {
+ for (BasicBlock *P : predecessors(BB)) {
+ // If the value is known by LazyValueInfo to be a ConstantRange in
+ // a predecessor, use that information to try to thread this
+ // block.
+ ConstantRange CR = LVI->getConstantRangeOnEdge(
+ AddLHS, P, BB, CxtI ? CxtI : cast<Instruction>(CmpLHS));
+ // Propagate the range through the addition.
+ CR = CR.add(AddConst->getValue());
+
+ // Get the range where the compare returns true.
+ ConstantRange CmpRange = ConstantRange::makeExactICmpRegion(
+ Pred, cast<ConstantInt>(CmpConst)->getValue());
+
+ Constant *ResC;
+ if (CmpRange.contains(CR))
+ ResC = ConstantInt::getTrue(CmpType);
+ else if (CmpRange.inverse().contains(CR))
+ ResC = ConstantInt::getFalse(CmpType);
+ else
+ continue;
+
+ Result.emplace_back(ResC, P);
+ }
+
+ return !Result.empty();
+ }
+ }
+ }
+
+ // Try to find a constant value for the LHS of a comparison,
+ // and evaluate it statically if we can.
+ PredValueInfoTy LHSVals;
computeValueKnownInPredecessorsImpl(I->getOperand(0), BB, LHSVals,
- WantInteger, RecursionSet, CxtI);
-
- for (const auto &LHSVal : LHSVals) {
- Constant *V = LHSVal.first;
- Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
- if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.emplace_back(KC, LHSVal.second);
- }
-
- return !Result.empty();
- }
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- // Handle select instructions where at least one operand is a known constant
- // and we can figure out the condition value for any predecessor block.
- Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
- Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
- PredValueInfoTy Conds;
- if ((TrueVal || FalseVal) &&
+ WantInteger, RecursionSet, CxtI);
+
+ for (const auto &LHSVal : LHSVals) {
+ Constant *V = LHSVal.first;
+ Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
+ if (Constant *KC = getKnownConstant(Folded, WantInteger))
+ Result.emplace_back(KC, LHSVal.second);
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // Handle select instructions where at least one operand is a known constant
+ // and we can figure out the condition value for any predecessor block.
+ Constant *TrueVal = getKnownConstant(SI->getTrueValue(), Preference);
+ Constant *FalseVal = getKnownConstant(SI->getFalseValue(), Preference);
+ PredValueInfoTy Conds;
+ if ((TrueVal || FalseVal) &&
computeValueKnownInPredecessorsImpl(SI->getCondition(), BB, Conds,
- WantInteger, RecursionSet, CxtI)) {
- for (auto &C : Conds) {
- Constant *Cond = C.first;
-
- // Figure out what value to use for the condition.
- bool KnownCond;
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
- // A known boolean.
- KnownCond = CI->isOne();
- } else {
- assert(isa<UndefValue>(Cond) && "Unexpected condition value");
- // Either operand will do, so be sure to pick the one that's a known
- // constant.
- // FIXME: Do this more cleverly if both values are known constants?
- KnownCond = (TrueVal != nullptr);
- }
-
- // See if the select has a known constant value for this predecessor.
- if (Constant *Val = KnownCond ? TrueVal : FalseVal)
- Result.emplace_back(Val, C.second);
- }
-
- return !Result.empty();
- }
- }
-
- // If all else fails, see if LVI can figure out a constant value for us.
+ WantInteger, RecursionSet, CxtI)) {
+ for (auto &C : Conds) {
+ Constant *Cond = C.first;
+
+ // Figure out what value to use for the condition.
+ bool KnownCond;
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Cond)) {
+ // A known boolean.
+ KnownCond = CI->isOne();
+ } else {
+ assert(isa<UndefValue>(Cond) && "Unexpected condition value");
+ // Either operand will do, so be sure to pick the one that's a known
+ // constant.
+ // FIXME: Do this more cleverly if both values are known constants?
+ KnownCond = (TrueVal != nullptr);
+ }
+
+ // See if the select has a known constant value for this predecessor.
+ if (Constant *Val = KnownCond ? TrueVal : FalseVal)
+ Result.emplace_back(Val, C.second);
+ }
+
+ return !Result.empty();
+ }
+ }
+
+ // If all else fails, see if LVI can figure out a constant value for us.
assert(CxtI->getParent() == BB && "CxtI should be in BB");
Constant *CI = LVI->getConstant(V, CxtI);
- if (Constant *KC = getKnownConstant(CI, Preference)) {
- for (BasicBlock *Pred : predecessors(BB))
- Result.emplace_back(KC, Pred);
- }
-
- return !Result.empty();
-}
-
-/// getBestDestForJumpOnUndef - If we determine that the specified block ends
-/// in an undefined jump, decide which block is best to revector to.
-///
-/// Since we can pick an arbitrary destination, we pick the successor with the
-/// fewest predecessors. This should reduce the in-degree of the others.
+ if (Constant *KC = getKnownConstant(CI, Preference)) {
+ for (BasicBlock *Pred : predecessors(BB))
+ Result.emplace_back(KC, Pred);
+ }
+
+ return !Result.empty();
+}
+
+/// getBestDestForJumpOnUndef - If we determine that the specified block ends
+/// in an undefined jump, decide which block is best to revector to.
+///
+/// Since we can pick an arbitrary destination, we pick the successor with the
+/// fewest predecessors. This should reduce the in-degree of the others.
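+///
+/// For example (illustrative only): if the block ends in a switch on undef
+/// whose three successors have 3, 1 and 2 predecessors respectively, we
+/// return index 1, the successor with a single predecessor.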
static unsigned getBestDestForJumpOnUndef(BasicBlock *BB) {
- Instruction *BBTerm = BB->getTerminator();
- unsigned MinSucc = 0;
- BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
- // Compute the successor with the minimum number of predecessors.
- unsigned MinNumPreds = pred_size(TestBB);
- for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
- TestBB = BBTerm->getSuccessor(i);
- unsigned NumPreds = pred_size(TestBB);
- if (NumPreds < MinNumPreds) {
- MinSucc = i;
- MinNumPreds = NumPreds;
- }
- }
-
- return MinSucc;
-}
-
-static bool hasAddressTakenAndUsed(BasicBlock *BB) {
- if (!BB->hasAddressTaken()) return false;
-
- // If the block has its address taken, it may be a tree of dead constants
- // hanging off of it. These shouldn't keep the block alive.
- BlockAddress *BA = BlockAddress::get(BB);
- BA->removeDeadConstantUsers();
- return !BA->use_empty();
-}
-
+ Instruction *BBTerm = BB->getTerminator();
+ unsigned MinSucc = 0;
+ BasicBlock *TestBB = BBTerm->getSuccessor(MinSucc);
+ // Compute the successor with the minimum number of predecessors.
+ unsigned MinNumPreds = pred_size(TestBB);
+ for (unsigned i = 1, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ TestBB = BBTerm->getSuccessor(i);
+ unsigned NumPreds = pred_size(TestBB);
+ if (NumPreds < MinNumPreds) {
+ MinSucc = i;
+ MinNumPreds = NumPreds;
+ }
+ }
+
+ return MinSucc;
+}
+
+static bool hasAddressTakenAndUsed(BasicBlock *BB) {
+ if (!BB->hasAddressTaken()) return false;
+
+ // If the block has its address taken, it may be a tree of dead constants
+ // hanging off of it. These shouldn't keep the block alive.
+ BlockAddress *BA = BlockAddress::get(BB);
+ BA->removeDeadConstantUsers();
+ return !BA->use_empty();
+}
+
/// processBlock - If there are any predecessors whose control can be threaded
-/// through to a successor, transform them now.
+/// through to a successor, transform them now.
bool JumpThreadingPass::processBlock(BasicBlock *BB) {
- // If the block is trivially dead, just return and let the caller nuke it.
- // This simplifies other transformations.
- if (DTU->isBBPendingDeletion(BB) ||
- (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
- return false;
-
- // If this block has a single predecessor, and if that pred has a single
- // successor, merge the blocks. This encourages recursive jump threading
- // because now the condition in this block can be threaded through
- // predecessors of our predecessor block.
+ // If the block is trivially dead, just return and let the caller nuke it.
+ // This simplifies other transformations.
+ if (DTU->isBBPendingDeletion(BB) ||
+ (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()))
+ return false;
+
+ // If this block has a single predecessor, and if that pred has a single
+ // successor, merge the blocks. This encourages recursive jump threading
+ // because now the condition in this block can be threaded through
+ // predecessors of our predecessor block.
if (maybeMergeBasicBlockIntoOnlyPred(BB))
- return true;
-
+ return true;
+
if (tryToUnfoldSelectInCurrBB(BB))
- return true;
-
- // Look if we can propagate guards to predecessors.
+ return true;
+
+ // Look if we can propagate guards to predecessors.
if (HasGuards && processGuards(BB))
- return true;
-
- // What kind of constant we're looking for.
- ConstantPreference Preference = WantInteger;
-
- // Look to see if the terminator is a conditional branch, switch or indirect
- // branch; if not, we can't thread it.
- Value *Condition;
- Instruction *Terminator = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
- // Can't thread an unconditional jump.
- if (BI->isUnconditional()) return false;
- Condition = BI->getCondition();
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
- Condition = SI->getCondition();
- } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
- // Can't thread indirect branch with no successors.
- if (IB->getNumSuccessors() == 0) return false;
- Condition = IB->getAddress()->stripPointerCasts();
- Preference = WantBlockAddress;
- } else {
- return false; // Must be an invoke or callbr.
- }
-
+ return true;
+
+ // What kind of constant we're looking for.
+ ConstantPreference Preference = WantInteger;
+
+ // Look to see if the terminator is a conditional branch, switch or indirect
+ // branch; if not, we can't thread it.
+ Value *Condition;
+ Instruction *Terminator = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(Terminator)) {
+ // Can't thread an unconditional jump.
+ if (BI->isUnconditional()) return false;
+ Condition = BI->getCondition();
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(Terminator)) {
+ Condition = SI->getCondition();
+ } else if (IndirectBrInst *IB = dyn_cast<IndirectBrInst>(Terminator)) {
+ // Can't thread indirect branch with no successors.
+ if (IB->getNumSuccessors() == 0) return false;
+ Condition = IB->getAddress()->stripPointerCasts();
+ Preference = WantBlockAddress;
+ } else {
+ return false; // Must be an invoke or callbr.
+ }
+
// Keep track if we constant folded the condition in this invocation.
bool ConstantFolded = false;
- // Run constant folding to see if we can reduce the condition to a simple
- // constant.
- if (Instruction *I = dyn_cast<Instruction>(Condition)) {
- Value *SimpleVal =
- ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
- if (SimpleVal) {
- I->replaceAllUsesWith(SimpleVal);
- if (isInstructionTriviallyDead(I, TLI))
- I->eraseFromParent();
- Condition = SimpleVal;
+ // Run constant folding to see if we can reduce the condition to a simple
+ // constant.
+ if (Instruction *I = dyn_cast<Instruction>(Condition)) {
+ Value *SimpleVal =
+ ConstantFoldInstruction(I, BB->getModule()->getDataLayout(), TLI);
+ if (SimpleVal) {
+ I->replaceAllUsesWith(SimpleVal);
+ if (isInstructionTriviallyDead(I, TLI))
+ I->eraseFromParent();
+ Condition = SimpleVal;
ConstantFolded = true;
- }
- }
-
+ }
+ }
+
// If the terminator is branching on an undef or freeze undef, we can pick any
// of the successors to branch to. Let getBestDestForJumpOnUndef decide.
auto *FI = dyn_cast<FreezeInst>(Condition);
if (isa<UndefValue>(Condition) ||
(FI && isa<UndefValue>(FI->getOperand(0)) && FI->hasOneUse())) {
unsigned BestSucc = getBestDestForJumpOnUndef(BB);
- std::vector<DominatorTree::UpdateType> Updates;
-
- // Fold the branch/switch.
- Instruction *BBTerm = BB->getTerminator();
- Updates.reserve(BBTerm->getNumSuccessors());
- for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
- if (i == BestSucc) continue;
- BasicBlock *Succ = BBTerm->getSuccessor(i);
- Succ->removePredecessor(BB, true);
- Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding undef terminator: " << *BBTerm << '\n');
- BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
- BBTerm->eraseFromParent();
- DTU->applyUpdatesPermissive(Updates);
+ std::vector<DominatorTree::UpdateType> Updates;
+
+ // Fold the branch/switch.
+ Instruction *BBTerm = BB->getTerminator();
+ Updates.reserve(BBTerm->getNumSuccessors());
+ for (unsigned i = 0, e = BBTerm->getNumSuccessors(); i != e; ++i) {
+ if (i == BestSucc) continue;
+ BasicBlock *Succ = BBTerm->getSuccessor(i);
+ Succ->removePredecessor(BB, true);
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ }
+
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding undef terminator: " << *BBTerm << '\n');
+ BranchInst::Create(BBTerm->getSuccessor(BestSucc), BBTerm);
+ BBTerm->eraseFromParent();
+ DTU->applyUpdatesPermissive(Updates);
if (FI)
FI->eraseFromParent();
- return true;
- }
-
- // If the terminator of this block is branching on a constant, simplify the
- // terminator to an unconditional branch. This can occur due to threading in
- // other blocks.
- if (getKnownConstant(Condition, Preference)) {
- LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
- << "' folding terminator: " << *BB->getTerminator()
- << '\n');
- ++NumFolds;
- ConstantFoldTerminator(BB, true, nullptr, DTU);
+ return true;
+ }
+
+ // If the terminator of this block is branching on a constant, simplify the
+ // terminator to an unconditional branch. This can occur due to threading in
+ // other blocks.
+ if (getKnownConstant(Condition, Preference)) {
+ LLVM_DEBUG(dbgs() << " In block '" << BB->getName()
+ << "' folding terminator: " << *BB->getTerminator()
+ << '\n');
+ ++NumFolds;
+ ConstantFoldTerminator(BB, true, nullptr, DTU);
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
-
- Instruction *CondInst = dyn_cast<Instruction>(Condition);
-
- // All the rest of our checks depend on the condition being an instruction.
- if (!CondInst) {
- // FIXME: Unify this with code below.
+ return true;
+ }
+
+ Instruction *CondInst = dyn_cast<Instruction>(Condition);
+
+ // All the rest of our checks depend on the condition being an instruction.
+ if (!CondInst) {
+ // FIXME: Unify this with code below.
if (processThreadableEdges(Condition, BB, Preference, Terminator))
- return true;
+ return true;
return ConstantFolded;
- }
-
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
- // If we're branching on a conditional, LVI might be able to determine
- // its value at the branch instruction. We only handle comparisons
- // against a constant at this time.
- // TODO: This should be extended to handle switches as well.
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
- if (CondBr && CondConst) {
- // We should have returned as soon as we turned a conditional branch into
- // an unconditional one, because it's no longer interesting as far as jump
- // threading is concerned.
- assert(CondBr->isConditional() && "Threading on unconditional terminator");
-
- LazyValueInfo::Tristate Ret =
- LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
- CondConst, CondBr);
- if (Ret != LazyValueInfo::Unknown) {
- unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
- unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
- BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
- ToRemoveSucc->removePredecessor(BB, true);
- BranchInst *UncondBr =
- BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
- UncondBr->setDebugLoc(CondBr->getDebugLoc());
- CondBr->eraseFromParent();
- if (CondCmp->use_empty())
- CondCmp->eraseFromParent();
- // We can safely replace *some* uses of the CondInst if it has
- // exactly one value as returned by LVI. RAUW is incorrect in the
- // presence of guards and assumes that have `Cond` as a use. This
- // is because we use the guards/assumes to reason about the `Cond` value
- // at the end of the block, but RAUW unconditionally replaces all uses,
- // including the guards/assumes themselves and the uses before the
- // guard/assume.
- else if (CondCmp->getParent() == BB) {
- auto *CI = Ret == LazyValueInfo::True ?
- ConstantInt::getTrue(CondCmp->getType()) :
- ConstantInt::getFalse(CondCmp->getType());
+ }
+
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
+ // If we're branching on a conditional, LVI might be able to determine
+ // its value at the branch instruction. We only handle comparisons
+ // against a constant at this time.
+ // TODO: This should be extended to handle switches as well.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
+ if (CondBr && CondConst) {
+ // We should have returned as soon as we turned a conditional branch into
+ // an unconditional one, because it's no longer interesting as far as jump
+ // threading is concerned.
+ assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
+ LazyValueInfo::Tristate Ret =
+ LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
+ CondConst, CondBr);
+ if (Ret != LazyValueInfo::Unknown) {
+ unsigned ToRemove = Ret == LazyValueInfo::True ? 1 : 0;
+ unsigned ToKeep = Ret == LazyValueInfo::True ? 0 : 1;
+ BasicBlock *ToRemoveSucc = CondBr->getSuccessor(ToRemove);
+ ToRemoveSucc->removePredecessor(BB, true);
+ BranchInst *UncondBr =
+ BranchInst::Create(CondBr->getSuccessor(ToKeep), CondBr);
+ UncondBr->setDebugLoc(CondBr->getDebugLoc());
+ CondBr->eraseFromParent();
+ if (CondCmp->use_empty())
+ CondCmp->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+ // presence of guards and assumes that have `Cond` as a use. This
+ // is because we use the guards/assumes to reason about the `Cond` value
+ // at the end of the block, but RAUW unconditionally replaces all uses,
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (CondCmp->getParent() == BB) {
+ auto *CI = Ret == LazyValueInfo::True ?
+ ConstantInt::getTrue(CondCmp->getType()) :
+ ConstantInt::getFalse(CondCmp->getType());
replaceFoldableUses(CondCmp, CI);
- }
- DTU->applyUpdatesPermissive(
- {{DominatorTree::Delete, BB, ToRemoveSucc}});
+ }
+ DTU->applyUpdatesPermissive(
+ {{DominatorTree::Delete, BB, ToRemoveSucc}});
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
-
- // We did not manage to simplify this branch; try to see whether
- // CondCmp depends on a known phi-select pattern.
+ return true;
+ }
+
+ // We did not manage to simplify this branch; try to see whether
+ // CondCmp depends on a known phi-select pattern.
if (tryToUnfoldSelect(CondCmp, BB))
- return true;
- }
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
+ return true;
+ }
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator()))
if (tryToUnfoldSelect(SI, BB))
- return true;
-
- // Check for some cases that are worth simplifying. Right now we want to look
- // for loads that are used by a switch or by the condition for the branch. If
- // we see one, check to see if it's partially redundant. If so, insert a PHI
- // which can then be used to thread the values.
- Value *SimplifyValue = CondInst;
+ return true;
+
+ // Check for some cases that are worth simplifying. Right now we want to look
+ // for loads that are used by a switch or by the condition for the branch. If
+ // we see one, check to see if it's partially redundant. If so, insert a PHI
+ // which can then be used to thread the values.
+ Value *SimplifyValue = CondInst;
if (auto *FI = dyn_cast<FreezeInst>(SimplifyValue))
// Look into freeze's operand
SimplifyValue = FI->getOperand(0);
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
- if (isa<Constant>(CondCmp->getOperand(1)))
- SimplifyValue = CondCmp->getOperand(0);
-
- // TODO: There are other places where load PRE would be profitable, such as
- // more complex comparisons.
- if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(SimplifyValue))
+ if (isa<Constant>(CondCmp->getOperand(1)))
+ SimplifyValue = CondCmp->getOperand(0);
+
+ // TODO: There are other places where load PRE would be profitable, such as
+ // more complex comparisons.
+ if (LoadInst *LoadI = dyn_cast<LoadInst>(SimplifyValue))
if (simplifyPartiallyRedundantLoad(LoadI))
- return true;
-
- // Before threading, try to propagate profile data backwards:
- if (PHINode *PN = dyn_cast<PHINode>(CondInst))
- if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
- updatePredecessorProfileMetadata(PN, BB);
-
- // Handle a variety of cases where we are branching on something derived from
- // a PHI node in the current block. If we can prove that any predecessors
- // compute a predictable value based on a PHI node, thread those predecessors.
+ return true;
+
+ // Before threading, try to propagate profile data backwards:
+ if (PHINode *PN = dyn_cast<PHINode>(CondInst))
+ if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ updatePredecessorProfileMetadata(PN, BB);
+
+ // Handle a variety of cases where we are branching on something derived from
+ // a PHI node in the current block. If we can prove that any predecessors
+ // compute a predictable value based on a PHI node, thread those predecessors.
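+ // Illustrative sketch (hypothetical IR, not part of this change):
+ //   bb:   %cond = phi i1 [ true, %pred1 ], [ %c, %pred2 ]
+ //         br i1 %cond, label %then, label %else
+ // On the edge pred1 -> bb the condition is known to be true, so pred1 can
+ // be rewired to branch straight to %then (duplicating bb's non-terminator
+ // instructions into pred1 if necessary).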
if (processThreadableEdges(CondInst, BB, Preference, Terminator))
- return true;
-
+ return true;
+
// If this is an otherwise-unfoldable branch on a phi node or freeze(phi) in
// the current block, see if we can simplify.
PHINode *PN = dyn_cast<PHINode>(
isa<FreezeInst>(CondInst) ? cast<FreezeInst>(CondInst)->getOperand(0)
: CondInst);
-
+
if (PN && PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return processBranchOnPHI(PN);
- // If this is an otherwise-unfoldable branch on an XOR, see if we can simplify.
- if (CondInst->getOpcode() == Instruction::Xor &&
- CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
+ // If this is an otherwise-unfoldable branch on an XOR, see if we can simplify.
+ if (CondInst->getOpcode() == Instruction::Xor &&
+ CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
return processBranchOnXOR(cast<BinaryOperator>(CondInst));
-
- // Search for a stronger dominating condition that can be used to simplify a
- // conditional branch leaving BB.
+
+ // Search for a stronger dominating condition that can be used to simplify a
+ // conditional branch leaving BB.
if (processImpliedCondition(BB))
- return true;
-
- return false;
-}
-
+ return true;
+
+ return false;
+}
+
bool JumpThreadingPass::processImpliedCondition(BasicBlock *BB) {
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
- return false;
-
- Value *Cond = BI->getCondition();
- BasicBlock *CurrentBB = BB;
- BasicBlock *CurrentPred = BB->getSinglePredecessor();
- unsigned Iter = 0;
-
- auto &DL = BB->getModule()->getDataLayout();
-
- while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
- auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
- if (!PBI || !PBI->isConditional())
- return false;
- if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
- return false;
-
- bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
- Optional<bool> Implication =
- isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
- if (Implication) {
- BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
- BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
- RemoveSucc->removePredecessor(BB);
- BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
- UncondBI->setDebugLoc(BI->getDebugLoc());
- BI->eraseFromParent();
- DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ Value *Cond = BI->getCondition();
+ BasicBlock *CurrentBB = BB;
+ BasicBlock *CurrentPred = BB->getSinglePredecessor();
+ unsigned Iter = 0;
+
+ auto &DL = BB->getModule()->getDataLayout();
+
+ while (CurrentPred && Iter++ < ImplicationSearchThreshold) {
+ auto *PBI = dyn_cast<BranchInst>(CurrentPred->getTerminator());
+ if (!PBI || !PBI->isConditional())
+ return false;
+ if (PBI->getSuccessor(0) != CurrentBB && PBI->getSuccessor(1) != CurrentBB)
+ return false;
+
+ bool CondIsTrue = PBI->getSuccessor(0) == CurrentBB;
+ Optional<bool> Implication =
+ isImpliedCondition(PBI->getCondition(), Cond, DL, CondIsTrue);
+ if (Implication) {
+ BasicBlock *KeepSucc = BI->getSuccessor(*Implication ? 0 : 1);
+ BasicBlock *RemoveSucc = BI->getSuccessor(*Implication ? 1 : 0);
+ RemoveSucc->removePredecessor(BB);
+ BranchInst *UncondBI = BranchInst::Create(KeepSucc, BI);
+ UncondBI->setDebugLoc(BI->getDebugLoc());
+ BI->eraseFromParent();
+ DTU->applyUpdatesPermissive({{DominatorTree::Delete, BB, RemoveSucc}});
if (HasProfileData)
BPI->eraseBlock(BB);
- return true;
- }
- CurrentBB = CurrentPred;
- CurrentPred = CurrentBB->getSinglePredecessor();
- }
-
- return false;
-}
-
-/// Return true if Op is an instruction defined in the given block.
-static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
- if (Instruction *OpInst = dyn_cast<Instruction>(Op))
- if (OpInst->getParent() == BB)
- return true;
- return false;
-}
-
+ return true;
+ }
+ CurrentBB = CurrentPred;
+ CurrentPred = CurrentBB->getSinglePredecessor();
+ }
+
+ return false;
+}
+
+/// Return true if Op is an instruction defined in the given block.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->getParent() == BB)
+ return true;
+ return false;
+}
+
/// simplifyPartiallyRedundantLoad - If LoadI is an obviously partially
-/// redundant load instruction, eliminate it by replacing it with a PHI node.
-/// This is an important optimization that encourages jump threading, and needs
-/// to be run interlaced with other jump threading tasks.
+/// redundant load instruction, eliminate it by replacing it with a PHI node.
+/// This is an important optimization that encourages jump threading, and needs
+/// to be run interlaced with other jump threading tasks.
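+///
+/// Illustrative sketch (hypothetical IR): if predecessor %p1 of %bb already
+/// contains "%v1 = load i32, i32* %ptr" but predecessor %p2 does not, the
+/// load in %bb is partially redundant; we insert a load "%v2" at the end of
+/// %p2 and replace the load in %bb with "phi i32 [ %v1, %p1 ], [ %v2, %p2 ]".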
bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) {
- // Don't hack volatile and ordered loads.
- if (!LoadI->isUnordered()) return false;
-
- // If the load is defined in a block with exactly one predecessor, it can't be
- // partially redundant.
- BasicBlock *LoadBB = LoadI->getParent();
- if (LoadBB->getSinglePredecessor())
- return false;
-
- // If the load is defined in an EH pad, it can't be partially redundant,
- // because the edges between the invoke and the EH pad cannot have other
- // instructions between them.
- if (LoadBB->isEHPad())
- return false;
-
- Value *LoadedPtr = LoadI->getOperand(0);
-
- // If the loaded operand is defined in the LoadBB and it's not a phi,
- // it can't be available in predecessors.
- if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
- return false;
-
- // Scan a few instructions up from the load, to see if it is obviously live at
- // the entry to its block.
- BasicBlock::iterator BBIt(LoadI);
- bool IsLoadCSE;
- if (Value *AvailableVal = FindAvailableLoadedValue(
- LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
- // If the value of the load is locally available within the block, just use
- // it. This frequently occurs for reg2mem'd allocas.
-
- if (IsLoadCSE) {
- LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
- combineMetadataForCSE(NLoadI, LoadI, false);
- }
-
- // If the returned value is the load itself, replace with an undef. This can
- // only happen in dead loops.
- if (AvailableVal == LoadI)
- AvailableVal = UndefValue::get(LoadI->getType());
- if (AvailableVal->getType() != LoadI->getType())
- AvailableVal = CastInst::CreateBitOrPointerCast(
- AvailableVal, LoadI->getType(), "", LoadI);
- LoadI->replaceAllUsesWith(AvailableVal);
- LoadI->eraseFromParent();
- return true;
- }
-
- // Otherwise, if we scanned the whole block and got to the top of the block,
- // we know the block is locally transparent to the load. If not, something
- // might clobber its value.
- if (BBIt != LoadBB->begin())
- return false;
-
- // If all of the loads and stores that feed the value have the same AA tags,
- // then we can propagate them onto any newly inserted loads.
- AAMDNodes AATags;
- LoadI->getAAMetadata(AATags);
-
- SmallPtrSet<BasicBlock*, 8> PredsScanned;
-
- using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
-
- AvailablePredsTy AvailablePreds;
- BasicBlock *OneUnavailablePred = nullptr;
- SmallVector<LoadInst*, 8> CSELoads;
-
- // If we got here, the loaded value is transparent through to the start of the
- // block. Check to see if it is available in any of the predecessor blocks.
- for (BasicBlock *PredBB : predecessors(LoadBB)) {
- // If we already scanned this predecessor, skip it.
- if (!PredsScanned.insert(PredBB).second)
- continue;
-
- BBIt = PredBB->end();
- unsigned NumScanedInst = 0;
- Value *PredAvailable = nullptr;
- // NOTE: We don't CSE a load that is volatile or anything stronger than
- // unordered; that should have been checked when we entered the function.
- assert(LoadI->isUnordered() &&
- "Attempting to CSE volatile or atomic loads");
- // If this is a load on a phi pointer, phi-translate it and search
- // for available load/store to the pointer in predecessors.
- Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
- PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
- DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
-
- // If PredBB has a single predecessor, continue scanning through the
- // single predecessor.
- BasicBlock *SinglePredBB = PredBB;
- while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
- NumScanedInst < DefMaxInstsToScan) {
- SinglePredBB = SinglePredBB->getSinglePredecessor();
- if (SinglePredBB) {
- BBIt = SinglePredBB->end();
- PredAvailable = FindAvailablePtrLoadStore(
- Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
- (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
- &NumScanedInst);
- }
- }
-
- if (!PredAvailable) {
- OneUnavailablePred = PredBB;
- continue;
- }
-
- if (IsLoadCSE)
- CSELoads.push_back(cast<LoadInst>(PredAvailable));
-
- // If so, this load is partially redundant. Remember this info so that we
- // can create a PHI node.
- AvailablePreds.emplace_back(PredBB, PredAvailable);
- }
-
- // If the loaded value isn't available in any predecessor, it isn't partially
- // redundant.
- if (AvailablePreds.empty()) return false;
-
- // Okay, the loaded value is available in at least one (and maybe all!)
- // predecessors. If the value is unavailable in more than one unique
- // predecessor, we want to insert a merge block for those common predecessors.
- // This ensures that we only have to insert one reload, thus not increasing
- // code size.
- BasicBlock *UnavailablePred = nullptr;
-
- // If the value is unavailable in any of the predecessors, we will end up
- // inserting a new instruction into them. It is only valid if all the
- // instructions before LoadI are guaranteed to pass execution to its
- // successor, or if LoadI is safe to speculate.
- // TODO: If this logic becomes more complex, and we will perform PRE insertion
- // farther than to a predecessor, we need to reuse the code from GVN's PRE.
- // It requires dominator tree analysis, so for this simple case it would be
- // overkill.
- if (PredsScanned.size() != AvailablePreds.size() &&
- !isSafeToSpeculativelyExecute(LoadI))
- for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
- return false;
-
- // If there is exactly one predecessor where the value is unavailable, the
- // already computed 'OneUnavailablePred' block is it. If it ends in an
- // unconditional branch, we know that it isn't a critical edge.
- if (PredsScanned.size() == AvailablePreds.size()+1 &&
- OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
- UnavailablePred = OneUnavailablePred;
- } else if (PredsScanned.size() != AvailablePreds.size()) {
- // Otherwise, we had multiple unavailable predecessors or we had a critical
- // edge from the one.
- SmallVector<BasicBlock*, 8> PredsToSplit;
- SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
-
- for (const auto &AvailablePred : AvailablePreds)
- AvailablePredSet.insert(AvailablePred.first);
-
- // Add all the unavailable predecessors to the PredsToSplit list.
- for (BasicBlock *P : predecessors(LoadBB)) {
- // If the predecessor is an indirect goto, we can't split the edge.
- // Same for CallBr.
- if (isa<IndirectBrInst>(P->getTerminator()) ||
- isa<CallBrInst>(P->getTerminator()))
- return false;
-
- if (!AvailablePredSet.count(P))
- PredsToSplit.push_back(P);
- }
-
- // Split them out to their own block.
+ // Don't hack volatile and ordered loads.
+ if (!LoadI->isUnordered()) return false;
+
+ // If the load is defined in a block with exactly one predecessor, it can't be
+ // partially redundant.
+ BasicBlock *LoadBB = LoadI->getParent();
+ if (LoadBB->getSinglePredecessor())
+ return false;
+
+ // If the load is defined in an EH pad, it can't be partially redundant,
+ // because the edges between the invoke and the EH pad cannot have other
+ // instructions between them.
+ if (LoadBB->isEHPad())
+ return false;
+
+ Value *LoadedPtr = LoadI->getOperand(0);
+
+ // If the loaded operand is defined in the LoadBB and it's not a phi,
+ // it can't be available in predecessors.
+ if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+ return false;
+
+ // Scan a few instructions up from the load, to see if it is obviously live at
+ // the entry to its block.
+ BasicBlock::iterator BBIt(LoadI);
+ bool IsLoadCSE;
+ if (Value *AvailableVal = FindAvailableLoadedValue(
+ LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
+ // If the value of the load is locally available within the block, just use
+ // it. This frequently occurs for reg2mem'd allocas.
+
+ if (IsLoadCSE) {
+ LoadInst *NLoadI = cast<LoadInst>(AvailableVal);
+ combineMetadataForCSE(NLoadI, LoadI, false);
+ }
+
+ // If the returned value is the load itself, replace with an undef. This can
+ // only happen in dead loops.
+ if (AvailableVal == LoadI)
+ AvailableVal = UndefValue::get(LoadI->getType());
+ if (AvailableVal->getType() != LoadI->getType())
+ AvailableVal = CastInst::CreateBitOrPointerCast(
+ AvailableVal, LoadI->getType(), "", LoadI);
+ LoadI->replaceAllUsesWith(AvailableVal);
+ LoadI->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, if we scanned the whole block and got to the top of the block,
+ // we know the block is locally transparent to the load. If not, something
+ // might clobber its value.
+ if (BBIt != LoadBB->begin())
+ return false;
+
+ // If all of the loads and stores that feed the value have the same AA tags,
+ // then we can propagate them onto any newly inserted loads.
+ AAMDNodes AATags;
+ LoadI->getAAMetadata(AATags);
+
+ SmallPtrSet<BasicBlock*, 8> PredsScanned;
+
+ using AvailablePredsTy = SmallVector<std::pair<BasicBlock *, Value *>, 8>;
+
+ AvailablePredsTy AvailablePreds;
+ BasicBlock *OneUnavailablePred = nullptr;
+ SmallVector<LoadInst*, 8> CSELoads;
+
+ // If we got here, the loaded value is transparent through to the start of the
+ // block. Check to see if it is available in any of the predecessor blocks.
+ for (BasicBlock *PredBB : predecessors(LoadBB)) {
+ // If we already scanned this predecessor, skip it.
+ if (!PredsScanned.insert(PredBB).second)
+ continue;
+
+ BBIt = PredBB->end();
+ unsigned NumScanedInst = 0;
+ Value *PredAvailable = nullptr;
+ // NOTE: We don't CSE a load that is volatile or anything stronger than
+ // unordered; that should have been checked when we entered the function.
+ assert(LoadI->isUnordered() &&
+ "Attempting to CSE volatile or atomic loads");
+ // If this is a load on a phi pointer, phi-translate it and search
+ // for available load/store to the pointer in predecessors.
+ Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LoadI->getType(), LoadI->isAtomic(), PredBB, BBIt,
+ DefMaxInstsToScan, AA, &IsLoadCSE, &NumScanedInst);
+
+ // If PredBB has a single predecessor, continue scanning through the
+ // single predecessor.
+ BasicBlock *SinglePredBB = PredBB;
+ while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+ NumScanedInst < DefMaxInstsToScan) {
+ SinglePredBB = SinglePredBB->getSinglePredecessor();
+ if (SinglePredBB) {
+ BBIt = SinglePredBB->end();
+ PredAvailable = FindAvailablePtrLoadStore(
+ Ptr, LoadI->getType(), LoadI->isAtomic(), SinglePredBB, BBIt,
+ (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+ &NumScanedInst);
+ }
+ }
+
+ if (!PredAvailable) {
+ OneUnavailablePred = PredBB;
+ continue;
+ }
+
+ if (IsLoadCSE)
+ CSELoads.push_back(cast<LoadInst>(PredAvailable));
+
+ // If so, this load is partially redundant. Remember this info so that we
+ // can create a PHI node.
+ AvailablePreds.emplace_back(PredBB, PredAvailable);
+ }
+
+ // If the loaded value isn't available in any predecessor, it isn't partially
+ // redundant.
+ if (AvailablePreds.empty()) return false;
+
+ // Okay, the loaded value is available in at least one (and maybe all!)
+ // predecessors. If the value is unavailable in more than one unique
+ // predecessor, we want to insert a merge block for those common predecessors.
+ // This ensures that we only have to insert one reload, thus not increasing
+ // code size.
+ BasicBlock *UnavailablePred = nullptr;
+
+ // If the value is unavailable in any of the predecessors, we will end up
+ // inserting a new instruction into them. It is only valid if all the
+ // instructions before LoadI are guaranteed to pass execution to its
+ // successor, or if LoadI is safe to speculate.
+ // TODO: If this logic becomes more complex, and we will perform PRE insertion
+ // farther than to a predecessor, we need to reuse the code from GVN's PRE.
+ // It requires dominator tree analysis, so for this simple case it would be
+ // overkill.
+ if (PredsScanned.size() != AvailablePreds.size() &&
+ !isSafeToSpeculativelyExecute(LoadI))
+ for (auto I = LoadBB->begin(); &*I != LoadI; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+
+ // If there is exactly one predecessor where the value is unavailable, the
+ // already computed 'OneUnavailablePred' block is it. If it ends in an
+ // unconditional branch, we know that it isn't a critical edge.
+ if (PredsScanned.size() == AvailablePreds.size()+1 &&
+ OneUnavailablePred->getTerminator()->getNumSuccessors() == 1) {
+ UnavailablePred = OneUnavailablePred;
+ } else if (PredsScanned.size() != AvailablePreds.size()) {
+ // Otherwise, we had multiple unavailable predecessors or we had a critical
+ // edge from the one.
+ SmallVector<BasicBlock*, 8> PredsToSplit;
+ SmallPtrSet<BasicBlock*, 8> AvailablePredSet;
+
+ for (const auto &AvailablePred : AvailablePreds)
+ AvailablePredSet.insert(AvailablePred.first);
+
+ // Add all the unavailable predecessors to the PredsToSplit list.
+ for (BasicBlock *P : predecessors(LoadBB)) {
+ // If the predecessor is an indirect goto, we can't split the edge.
+ // Same for CallBr.
+ if (isa<IndirectBrInst>(P->getTerminator()) ||
+ isa<CallBrInst>(P->getTerminator()))
+ return false;
+
+ if (!AvailablePredSet.count(P))
+ PredsToSplit.push_back(P);
+ }
+
+ // Split them out to their own block.
UnavailablePred = splitBlockPreds(LoadBB, PredsToSplit, "thread-pre-split");
- }
-
- // If the value isn't available in all predecessors, then there will be
- // exactly one where it isn't available. Insert a load on that edge and add
- // it to the AvailablePreds list.
- if (UnavailablePred) {
- assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
- "Can't handle critical edge here!");
- LoadInst *NewVal = new LoadInst(
- LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LoadI->getName() + ".pr", false, LoadI->getAlign(),
- LoadI->getOrdering(), LoadI->getSyncScopeID(),
- UnavailablePred->getTerminator());
- NewVal->setDebugLoc(LoadI->getDebugLoc());
- if (AATags)
- NewVal->setAAMetadata(AATags);
-
- AvailablePreds.emplace_back(UnavailablePred, NewVal);
- }
-
- // Now we know that each predecessor of this block has a value in
- // AvailablePreds, sort them for efficient access as we're walking the preds.
- array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
-
- // Create a PHI node at the start of the block for the PRE'd load value.
- pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
- PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
- &LoadBB->front());
- PN->takeName(LoadI);
- PN->setDebugLoc(LoadI->getDebugLoc());
-
- // Insert new entries into the PHI for each predecessor. A single block may
- // have multiple entries here.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- AvailablePredsTy::iterator I =
- llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
-
- assert(I != AvailablePreds.end() && I->first == P &&
- "Didn't find entry for predecessor!");
-
- // If we have an available predecessor but it requires casting, insert the
- // cast in the predecessor and use the cast. Note that we have to update the
- // AvailablePreds vector as we go so that all of the PHI entries for this
- // predecessor use the same bitcast.
- Value *&PredV = I->second;
- if (PredV->getType() != LoadI->getType())
- PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
- P->getTerminator());
-
- PN->addIncoming(PredV, I->first);
- }
-
- for (LoadInst *PredLoadI : CSELoads) {
- combineMetadataForCSE(PredLoadI, LoadI, true);
- }
-
- LoadI->replaceAllUsesWith(PN);
- LoadI->eraseFromParent();
-
- return true;
-}
-
+ }
+
+ // If the value isn't available in all predecessors, then there will be
+ // exactly one where it isn't available. Insert a load on that edge and add
+ // it to the AvailablePreds list.
+ if (UnavailablePred) {
+ assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
+ "Can't handle critical edge here!");
+ LoadInst *NewVal = new LoadInst(
+ LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+ LoadI->getName() + ".pr", false, LoadI->getAlign(),
+ LoadI->getOrdering(), LoadI->getSyncScopeID(),
+ UnavailablePred->getTerminator());
+ NewVal->setDebugLoc(LoadI->getDebugLoc());
+ if (AATags)
+ NewVal->setAAMetadata(AATags);
+
+ AvailablePreds.emplace_back(UnavailablePred, NewVal);
+ }
+
+ // Now we know that each predecessor of this block has a value in
+ // AvailablePreds, sort them for efficient access as we're walking the preds.
+ array_pod_sort(AvailablePreds.begin(), AvailablePreds.end());
+
+ // Create a PHI node at the start of the block for the PRE'd load value.
+ pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB);
+ PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), "",
+ &LoadBB->front());
+ PN->takeName(LoadI);
+ PN->setDebugLoc(LoadI->getDebugLoc());
+
+ // Insert new entries into the PHI for each predecessor. A single block may
+ // have multiple entries here.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ AvailablePredsTy::iterator I =
+ llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr));
+
+ assert(I != AvailablePreds.end() && I->first == P &&
+ "Didn't find entry for predecessor!");
+
+ // If we have an available predecessor but it requires casting, insert the
+ // cast in the predecessor and use the cast. Note that we have to update the
+ // AvailablePreds vector as we go so that all of the PHI entries for this
+ // predecessor use the same bitcast.
+ Value *&PredV = I->second;
+ if (PredV->getType() != LoadI->getType())
+ PredV = CastInst::CreateBitOrPointerCast(PredV, LoadI->getType(), "",
+ P->getTerminator());
+
+ PN->addIncoming(PredV, I->first);
+ }
+
+ for (LoadInst *PredLoadI : CSELoads) {
+ combineMetadataForCSE(PredLoadI, LoadI, true);
+ }
+
+ LoadI->replaceAllUsesWith(PN);
+ LoadI->eraseFromParent();
+
+ return true;
+}
+
/// findMostPopularDest - The specified list contains multiple possible
-/// threadable destinations. Pick the one that occurs the most frequently in
-/// the list.
-static BasicBlock *
+/// threadable destinations. Pick the one that occurs the most frequently in
+/// the list.
+static BasicBlock *
findMostPopularDest(BasicBlock *BB,
- const SmallVectorImpl<std::pair<BasicBlock *,
- BasicBlock *>> &PredToDestList) {
- assert(!PredToDestList.empty());
-
- // Determine popularity. If there are multiple possible destinations, we
- // explicitly choose to ignore 'undef' destinations. We prefer to thread
- // blocks with known and real destinations to threading undef. We'll handle
- // them later if interesting.
- MapVector<BasicBlock *, unsigned> DestPopularity;
-
- // Populate DestPopularity with the successors in the order they appear in the
- // successor list. This way, we ensure determinism by iterating it in the
- // same order in std::max_element below. We map nullptr to 0 so that we can
- // return nullptr when PredToDestList contains nullptr only.
- DestPopularity[nullptr] = 0;
- for (auto *SuccBB : successors(BB))
- DestPopularity[SuccBB] = 0;
-
- for (const auto &PredToDest : PredToDestList)
- if (PredToDest.second)
- DestPopularity[PredToDest.second]++;
-
- // Find the most popular dest.
- using VT = decltype(DestPopularity)::value_type;
- auto MostPopular = std::max_element(
- DestPopularity.begin(), DestPopularity.end(),
- [](const VT &L, const VT &R) { return L.second < R.second; });
-
- // Okay, we have finally picked the most popular destination.
- return MostPopular->first;
-}
-
-// Try to evaluate the value of V when the control flows from PredPredBB to
-// BB->getSinglePredecessor() and then on to BB.
+ const SmallVectorImpl<std::pair<BasicBlock *,
+ BasicBlock *>> &PredToDestList) {
+ assert(!PredToDestList.empty());
+
+ // Determine popularity. If there are multiple possible destinations, we
+ // explicitly choose to ignore 'undef' destinations. We prefer to thread
+ // blocks with known and real destinations to threading undef. We'll handle
+ // them later if interesting.
+ MapVector<BasicBlock *, unsigned> DestPopularity;
+
+ // Populate DestPopularity with the successors in the order they appear in the
+ // successor list. This way, we ensure determinism by iterating it in the
+ // same order in std::max_element below. We map nullptr to 0 so that we can
+ // return nullptr when PredToDestList contains nullptr only.
+ DestPopularity[nullptr] = 0;
+ for (auto *SuccBB : successors(BB))
+ DestPopularity[SuccBB] = 0;
+
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second)
+ DestPopularity[PredToDest.second]++;
+
+ // Find the most popular dest.
+ using VT = decltype(DestPopularity)::value_type;
+ auto MostPopular = std::max_element(
+ DestPopularity.begin(), DestPopularity.end(),
+ [](const VT &L, const VT &R) { return L.second < R.second; });
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopular->first;
+}
+
+// Try to evaluate the value of V when the control flows from PredPredBB to
+// BB->getSinglePredecessor() and then on to BB.
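+// For example (illustrative only): given the chain PredPredBB -> PredBB -> BB,
+// a phi in PredBB tells us which value V takes when control enters from
+// PredPredBB, and a compare defined in BB can then be folded separately for
+// that incoming edge.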
Constant *JumpThreadingPass::evaluateOnPredecessorEdge(BasicBlock *BB,
- BasicBlock *PredPredBB,
- Value *V) {
- BasicBlock *PredBB = BB->getSinglePredecessor();
- assert(PredBB && "Expected a single predecessor");
-
- if (Constant *Cst = dyn_cast<Constant>(V)) {
- return Cst;
- }
-
- // Consult LVI if V is not an instruction in BB or PredBB.
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
- return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
- }
-
- // Look into a PHI argument.
- if (PHINode *PHI = dyn_cast<PHINode>(V)) {
- if (PHI->getParent() == PredBB)
- return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
- return nullptr;
- }
-
- // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
- if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
- if (CondCmp->getParent() == BB) {
- Constant *Op0 =
+ BasicBlock *PredPredBB,
+ Value *V) {
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ assert(PredBB && "Expected a single predecessor");
+
+ if (Constant *Cst = dyn_cast<Constant>(V)) {
+ return Cst;
+ }
+
+ // Consult LVI if V is not an instruction in BB or PredBB.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
+ return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
+ }
+
+ // Look into a PHI argument.
+ if (PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (PHI->getParent() == PredBB)
+ return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
+ return nullptr;
+ }
+
+ // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
+ if (CondCmp->getParent() == BB) {
+ Constant *Op0 =
evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
- Constant *Op1 =
+ Constant *Op1 =
evaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
- if (Op0 && Op1) {
- return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
- }
- }
- return nullptr;
- }
-
- return nullptr;
-}
-
+ if (Op0 && Op1) {
+ return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
+ }
+ }
+ return nullptr;
+ }
+
+ return nullptr;
+}
+
bool JumpThreadingPass::processThreadableEdges(Value *Cond, BasicBlock *BB,
- ConstantPreference Preference,
- Instruction *CxtI) {
- // If threading this would thread across a loop header, don't even try to
- // thread the edge.
- if (LoopHeaders.count(BB))
- return false;
-
- PredValueInfoTy PredValues;
+ ConstantPreference Preference,
+ Instruction *CxtI) {
+ // If threading this would thread across a loop header, don't even try to
+ // thread the edge.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ PredValueInfoTy PredValues;
if (!computeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
- CxtI)) {
- // We don't have known values in predecessors. See if we can thread through
- // BB and its sole predecessor.
+ CxtI)) {
+ // We don't have known values in predecessors. See if we can thread through
+ // BB and its sole predecessor.
return maybethreadThroughTwoBasicBlocks(BB, Cond);
- }
-
- assert(!PredValues.empty() &&
+ }
+
+ assert(!PredValues.empty() &&
"computeValueKnownInPredecessors returned true with no values");
-
- LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
- for (const auto &PredValue : PredValues) {
- dbgs() << " BB '" << BB->getName()
- << "': FOUND condition = " << *PredValue.first
- << " for pred '" << PredValue.second->getName() << "'.\n";
- });
-
- // Decide what we want to thread through. Convert our list of known values to
- // a list of known destinations for each pred. This also discards duplicate
- // predecessors and keeps track of the undefined inputs (which are represented
- // as a null dest in the PredToDestList).
- SmallPtrSet<BasicBlock*, 16> SeenPreds;
- SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
-
- BasicBlock *OnlyDest = nullptr;
- BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
- Constant *OnlyVal = nullptr;
- Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
-
- for (const auto &PredValue : PredValues) {
- BasicBlock *Pred = PredValue.second;
- if (!SeenPreds.insert(Pred).second)
- continue; // Duplicate predecessor entry.
-
- Constant *Val = PredValue.first;
-
- BasicBlock *DestBB;
- if (isa<UndefValue>(Val))
- DestBB = nullptr;
- else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
- DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
- DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
- } else {
- assert(isa<IndirectBrInst>(BB->getTerminator())
- && "Unexpected terminator");
- assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
- DestBB = cast<BlockAddress>(Val)->getBasicBlock();
- }
-
- // If we have exactly one destination, remember it for efficiency below.
- if (PredToDestList.empty()) {
- OnlyDest = DestBB;
- OnlyVal = Val;
- } else {
- if (OnlyDest != DestBB)
- OnlyDest = MultipleDestSentinel;
-      // It is possible we have the same destination but a different value, e.g.
-      // the default case in a switchinst.
- if (Val != OnlyVal)
- OnlyVal = MultipleVal;
- }
-
- // If the predecessor ends with an indirect goto, we can't change its
- // destination. Same for CallBr.
- if (isa<IndirectBrInst>(Pred->getTerminator()) ||
- isa<CallBrInst>(Pred->getTerminator()))
- continue;
-
- PredToDestList.emplace_back(Pred, DestBB);
- }
-
- // If all edges were unthreadable, we fail.
- if (PredToDestList.empty())
- return false;
-
- // If all the predecessors go to a single known successor, we want to fold,
-  // not thread. By doing so, we do not need to duplicate the current block, and
-  // we also do not miss potential opportunities in case we don't/can't duplicate.
- if (OnlyDest && OnlyDest != MultipleDestSentinel) {
- if (BB->hasNPredecessors(PredToDestList.size())) {
- bool SeenFirstBranchToOnlyDest = false;
- std::vector <DominatorTree::UpdateType> Updates;
- Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
- for (BasicBlock *SuccBB : successors(BB)) {
- if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
- SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
- } else {
-          SuccBB->removePredecessor(BB, true); // This is an unreachable successor.
- Updates.push_back({DominatorTree::Delete, BB, SuccBB});
- }
- }
-
- // Finally update the terminator.
- Instruction *Term = BB->getTerminator();
- BranchInst::Create(OnlyDest, Term);
- Term->eraseFromParent();
- DTU->applyUpdatesPermissive(Updates);
+
+ LLVM_DEBUG(dbgs() << "IN BB: " << *BB;
+ for (const auto &PredValue : PredValues) {
+ dbgs() << " BB '" << BB->getName()
+ << "': FOUND condition = " << *PredValue.first
+ << " for pred '" << PredValue.second->getName() << "'.\n";
+ });
+
+ // Decide what we want to thread through. Convert our list of known values to
+ // a list of known destinations for each pred. This also discards duplicate
+ // predecessors and keeps track of the undefined inputs (which are represented
+ // as a null dest in the PredToDestList).
+ SmallPtrSet<BasicBlock*, 16> SeenPreds;
+ SmallVector<std::pair<BasicBlock*, BasicBlock*>, 16> PredToDestList;
+
+ BasicBlock *OnlyDest = nullptr;
+ BasicBlock *MultipleDestSentinel = (BasicBlock*)(intptr_t)~0ULL;
+ Constant *OnlyVal = nullptr;
+ Constant *MultipleVal = (Constant *)(intptr_t)~0ULL;
+
+ for (const auto &PredValue : PredValues) {
+ BasicBlock *Pred = PredValue.second;
+ if (!SeenPreds.insert(Pred).second)
+ continue; // Duplicate predecessor entry.
+
+ Constant *Val = PredValue.first;
+
+ BasicBlock *DestBB;
+ if (isa<UndefValue>(Val))
+ DestBB = nullptr;
+ else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ assert(isa<ConstantInt>(Val) && "Expecting a constant integer");
+ DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
+ } else {
+ assert(isa<IndirectBrInst>(BB->getTerminator())
+ && "Unexpected terminator");
+ assert(isa<BlockAddress>(Val) && "Expecting a constant blockaddress");
+ DestBB = cast<BlockAddress>(Val)->getBasicBlock();
+ }
+
+ // If we have exactly one destination, remember it for efficiency below.
+ if (PredToDestList.empty()) {
+ OnlyDest = DestBB;
+ OnlyVal = Val;
+ } else {
+ if (OnlyDest != DestBB)
+ OnlyDest = MultipleDestSentinel;
+      // It is possible we have the same destination but a different value, e.g.
+      // the default case in a switchinst.
+ if (Val != OnlyVal)
+ OnlyVal = MultipleVal;
+ }
+
+ // If the predecessor ends with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator()))
+ continue;
+
+ PredToDestList.emplace_back(Pred, DestBB);
+ }
+
+ // If all edges were unthreadable, we fail.
+ if (PredToDestList.empty())
+ return false;
+
+ // If all the predecessors go to a single known successor, we want to fold,
+  // not thread. By doing so, we do not need to duplicate the current block, and
+  // we also do not miss potential opportunities in case we don't/can't duplicate.
+ if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+ if (BB->hasNPredecessors(PredToDestList.size())) {
+ bool SeenFirstBranchToOnlyDest = false;
+ std::vector <DominatorTree::UpdateType> Updates;
+ Updates.reserve(BB->getTerminator()->getNumSuccessors() - 1);
+ for (BasicBlock *SuccBB : successors(BB)) {
+ if (SuccBB == OnlyDest && !SeenFirstBranchToOnlyDest) {
+ SeenFirstBranchToOnlyDest = true; // Don't modify the first branch.
+ } else {
+          SuccBB->removePredecessor(BB, true); // This is an unreachable successor.
+ Updates.push_back({DominatorTree::Delete, BB, SuccBB});
+ }
+ }
+
+ // Finally update the terminator.
+ Instruction *Term = BB->getTerminator();
+ BranchInst::Create(OnlyDest, Term);
+ Term->eraseFromParent();
+ DTU->applyUpdatesPermissive(Updates);
if (HasProfileData)
BPI->eraseBlock(BB);
-
- // If the condition is now dead due to the removal of the old terminator,
- // erase it.
- if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
- if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
- CondInst->eraseFromParent();
- // We can safely replace *some* uses of the CondInst if it has
- // exactly one value as returned by LVI. RAUW is incorrect in the
-      // presence of guards and assumes that have `Cond` as a use. This
-      // is because we use the guards/assumes to reason about the `Cond` value
-      // at the end of the block, but RAUW unconditionally replaces all uses
- // including the guards/assumes themselves and the uses before the
- // guard/assume.
- else if (OnlyVal && OnlyVal != MultipleVal &&
- CondInst->getParent() == BB)
+
+ // If the condition is now dead due to the removal of the old terminator,
+ // erase it.
+ if (auto *CondInst = dyn_cast<Instruction>(Cond)) {
+ if (CondInst->use_empty() && !CondInst->mayHaveSideEffects())
+ CondInst->eraseFromParent();
+ // We can safely replace *some* uses of the CondInst if it has
+ // exactly one value as returned by LVI. RAUW is incorrect in the
+      // presence of guards and assumes that have `Cond` as a use. This
+      // is because we use the guards/assumes to reason about the `Cond` value
+      // at the end of the block, but RAUW unconditionally replaces all uses
+ // including the guards/assumes themselves and the uses before the
+ // guard/assume.
+ else if (OnlyVal && OnlyVal != MultipleVal &&
+ CondInst->getParent() == BB)
replaceFoldableUses(CondInst, OnlyVal);
- }
- return true;
- }
- }
-
- // Determine which is the most common successor. If we have many inputs and
- // this block is a switch, we want to start by threading the batch that goes
- // to the most popular destination first. If we only know about one
- // threadable destination (the common case) we can avoid this.
- BasicBlock *MostPopularDest = OnlyDest;
-
- if (MostPopularDest == MultipleDestSentinel) {
+ }
+ return true;
+ }
+ }
+
+ // Determine which is the most common successor. If we have many inputs and
+ // this block is a switch, we want to start by threading the batch that goes
+ // to the most popular destination first. If we only know about one
+ // threadable destination (the common case) we can avoid this.
+ BasicBlock *MostPopularDest = OnlyDest;
+
+ if (MostPopularDest == MultipleDestSentinel) {
// Remove any loop headers from the Dest list, threadEdge conservatively
-  // won't process them, but we might have other destinations that are eligible
-  // and that we still want to process.
- erase_if(PredToDestList,
- [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
+  // won't process them, but we might have other destinations that are eligible
+  // and that we still want to process.
+ erase_if(PredToDestList,
+ [&](const std::pair<BasicBlock *, BasicBlock *> &PredToDest) {
return LoopHeaders.contains(PredToDest.second);
- });
-
- if (PredToDestList.empty())
- return false;
-
+ });
+
+ if (PredToDestList.empty())
+ return false;
+
MostPopularDest = findMostPopularDest(BB, PredToDestList);
- }
-
- // Now that we know what the most popular destination is, factor all
- // predecessors that will jump to it into a single predecessor.
- SmallVector<BasicBlock*, 16> PredsToFactor;
- for (const auto &PredToDest : PredToDestList)
- if (PredToDest.second == MostPopularDest) {
- BasicBlock *Pred = PredToDest.first;
-
- // This predecessor may be a switch or something else that has multiple
- // edges to the block. Factor each of these edges by listing them
- // according to # occurrences in PredsToFactor.
- for (BasicBlock *Succ : successors(Pred))
- if (Succ == BB)
- PredsToFactor.push_back(Pred);
- }
-
- // If the threadable edges are branching on an undefined value, we get to pick
- // the destination that these predecessors should get to.
- if (!MostPopularDest)
- MostPopularDest = BB->getTerminator()->
+ }
+
+ // Now that we know what the most popular destination is, factor all
+ // predecessors that will jump to it into a single predecessor.
+ SmallVector<BasicBlock*, 16> PredsToFactor;
+ for (const auto &PredToDest : PredToDestList)
+ if (PredToDest.second == MostPopularDest) {
+ BasicBlock *Pred = PredToDest.first;
+
+ // This predecessor may be a switch or something else that has multiple
+ // edges to the block. Factor each of these edges by listing them
+ // according to # occurrences in PredsToFactor.
+ for (BasicBlock *Succ : successors(Pred))
+ if (Succ == BB)
+ PredsToFactor.push_back(Pred);
+ }
+
+ // If the threadable edges are branching on an undefined value, we get to pick
+ // the destination that these predecessors should get to.
+ if (!MostPopularDest)
+ MostPopularDest = BB->getTerminator()->
getSuccessor(getBestDestForJumpOnUndef(BB));
-
- // Ok, try to thread it!
+
+ // Ok, try to thread it!
return tryThreadEdge(BB, PredsToFactor, MostPopularDest);
-}
-
+}
+
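A standalone sketch (not from the patch; the data is made up) of the OnlyDest/MultipleDestSentinel bookkeeping used above: one pointer encodes three states, null for "nothing recorded yet", a real block for "exactly one destination so far", and an all-ones sentinel for "multiple destinations", using the same implementation-defined cast trick as the pass.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  int A = 0, B = 1;
  // Same trick as MultipleDestSentinel: an address no real object can have.
  int *MultipleSentinel = reinterpret_cast<int *>(~uintptr_t(0));

  std::vector<int *> DestsSeen = {&A, &A, &B}; // per-predecessor destinations
  int *OnlyDest = nullptr;
  for (int *D : DestsSeen) {
    if (!OnlyDest)
      OnlyDest = D;                // first destination seen
    else if (OnlyDest != D)
      OnlyDest = MultipleSentinel; // disagreement: remember "multiple"
  }
  std::printf("single known destination: %s\n",
              (OnlyDest && OnlyDest != MultipleSentinel) ? "yes" : "no");
}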
/// processBranchOnPHI - We have an otherwise unthreadable conditional branch on
/// a PHI node (or freeze PHI) in the current block. See if there are any
/// simplifications we can do based on inputs to the phi node.
bool JumpThreadingPass::processBranchOnPHI(PHINode *PN) {
- BasicBlock *BB = PN->getParent();
-
- // TODO: We could make use of this to do it once for blocks with common PHI
- // values.
- SmallVector<BasicBlock*, 1> PredBBs;
- PredBBs.resize(1);
-
- // If any of the predecessor blocks end in an unconditional branch, we can
- // *duplicate* the conditional branch into that block in order to further
- // encourage jump threading and to eliminate cases where we have branch on a
- // phi of an icmp (branch on icmp is much better).
+ BasicBlock *BB = PN->getParent();
+
+ // TODO: We could make use of this to do it once for blocks with common PHI
+ // values.
+ SmallVector<BasicBlock*, 1> PredBBs;
+ PredBBs.resize(1);
+
+ // If any of the predecessor blocks end in an unconditional branch, we can
+ // *duplicate* the conditional branch into that block in order to further
+ // encourage jump threading and to eliminate cases where we have branch on a
+ // phi of an icmp (branch on icmp is much better).
// This is still beneficial when a frozen phi is used as the branch condition
// because it allows CodeGenPrepare to further canonicalize br(freeze(icmp))
// to br(icmp(freeze ...)).
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
- if (PredBr->isUnconditional()) {
- PredBBs[0] = PredBB;
- // Try to duplicate BB into PredBB.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ if (BranchInst *PredBr = dyn_cast<BranchInst>(PredBB->getTerminator()))
+ if (PredBr->isUnconditional()) {
+ PredBBs[0] = PredBB;
+ // Try to duplicate BB into PredBB.
if (duplicateCondBranchOnPHIIntoPred(BB, PredBBs))
- return true;
- }
- }
-
- return false;
-}
-
+ return true;
+ }
+ }
+
+ return false;
+}
+
/// processBranchOnXOR - We have an otherwise unthreadable conditional branch on
-/// a xor instruction in the current block. See if there are any
-/// simplifications we can do based on inputs to the xor.
+/// a xor instruction in the current block. See if there are any
+/// simplifications we can do based on inputs to the xor.
bool JumpThreadingPass::processBranchOnXOR(BinaryOperator *BO) {
- BasicBlock *BB = BO->getParent();
-
- // If either the LHS or RHS of the xor is a constant, don't do this
- // optimization.
- if (isa<ConstantInt>(BO->getOperand(0)) ||
- isa<ConstantInt>(BO->getOperand(1)))
- return false;
-
- // If the first instruction in BB isn't a phi, we won't be able to infer
- // anything special about any particular predecessor.
- if (!isa<PHINode>(BB->front()))
- return false;
-
- // If this BB is a landing pad, we won't be able to split the edge into it.
- if (BB->isEHPad())
- return false;
-
- // If we have a xor as the branch input to this block, and we know that the
- // LHS or RHS of the xor in any predecessor is true/false, then we can clone
- // the condition into the predecessor and fix that value to true, saving some
- // logical ops on that path and encouraging other paths to simplify.
- //
- // This copies something like this:
- //
- // BB:
- // %X = phi i1 [1], [%X']
- // %Y = icmp eq i32 %A, %B
- // %Z = xor i1 %X, %Y
- // br i1 %Z, ...
- //
- // Into:
- // BB':
- // %Y = icmp ne i32 %A, %B
- // br i1 %Y, ...
-
- PredValueInfoTy XorOpValues;
- bool isLHS = true;
+ BasicBlock *BB = BO->getParent();
+
+ // If either the LHS or RHS of the xor is a constant, don't do this
+ // optimization.
+ if (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1)))
+ return false;
+
+ // If the first instruction in BB isn't a phi, we won't be able to infer
+ // anything special about any particular predecessor.
+ if (!isa<PHINode>(BB->front()))
+ return false;
+
+ // If this BB is a landing pad, we won't be able to split the edge into it.
+ if (BB->isEHPad())
+ return false;
+
+ // If we have a xor as the branch input to this block, and we know that the
+ // LHS or RHS of the xor in any predecessor is true/false, then we can clone
+ // the condition into the predecessor and fix that value to true, saving some
+ // logical ops on that path and encouraging other paths to simplify.
+ //
+ // This copies something like this:
+ //
+ // BB:
+ // %X = phi i1 [1], [%X']
+ // %Y = icmp eq i32 %A, %B
+ // %Z = xor i1 %X, %Y
+ // br i1 %Z, ...
+ //
+ // Into:
+ // BB':
+ // %Y = icmp ne i32 %A, %B
+ // br i1 %Y, ...
+
+ PredValueInfoTy XorOpValues;
+ bool isLHS = true;
if (!computeValueKnownInPredecessors(BO->getOperand(0), BB, XorOpValues,
- WantInteger, BO)) {
- assert(XorOpValues.empty());
+ WantInteger, BO)) {
+ assert(XorOpValues.empty());
if (!computeValueKnownInPredecessors(BO->getOperand(1), BB, XorOpValues,
- WantInteger, BO))
- return false;
- isLHS = false;
- }
-
- assert(!XorOpValues.empty() &&
+ WantInteger, BO))
+ return false;
+ isLHS = false;
+ }
+
+ assert(!XorOpValues.empty() &&
"computeValueKnownInPredecessors returned true with no values");
-
- // Scan the information to see which is most popular: true or false. The
- // predecessors can be of the set true, false, or undef.
- unsigned NumTrue = 0, NumFalse = 0;
- for (const auto &XorOpValue : XorOpValues) {
- if (isa<UndefValue>(XorOpValue.first))
- // Ignore undefs for the count.
- continue;
- if (cast<ConstantInt>(XorOpValue.first)->isZero())
- ++NumFalse;
- else
- ++NumTrue;
- }
-
- // Determine which value to split on, true, false, or undef if neither.
- ConstantInt *SplitVal = nullptr;
- if (NumTrue > NumFalse)
- SplitVal = ConstantInt::getTrue(BB->getContext());
- else if (NumTrue != 0 || NumFalse != 0)
- SplitVal = ConstantInt::getFalse(BB->getContext());
-
- // Collect all of the blocks that this can be folded into so that we can
- // factor this once and clone it once.
- SmallVector<BasicBlock*, 8> BlocksToFoldInto;
- for (const auto &XorOpValue : XorOpValues) {
- if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
- continue;
-
- BlocksToFoldInto.push_back(XorOpValue.second);
- }
-
- // If we inferred a value for all of the predecessors, then duplication won't
- // help us. However, we can just replace the LHS or RHS with the constant.
- if (BlocksToFoldInto.size() ==
- cast<PHINode>(BB->front()).getNumIncomingValues()) {
- if (!SplitVal) {
- // If all preds provide undef, just nuke the xor, because it is undef too.
- BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
- BO->eraseFromParent();
- } else if (SplitVal->isZero()) {
- // If all preds provide 0, replace the xor with the other input.
- BO->replaceAllUsesWith(BO->getOperand(isLHS));
- BO->eraseFromParent();
- } else {
- // If all preds provide 1, set the computed value to 1.
- BO->setOperand(!isLHS, SplitVal);
- }
-
- return true;
- }
-
-  // If any of the predecessors ends with an indirect goto, we can't change its
- // destination. Same for CallBr.
- if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
- return isa<IndirectBrInst>(Pred->getTerminator()) ||
- isa<CallBrInst>(Pred->getTerminator());
- }))
- return false;
-
- // Try to duplicate BB into PredBB.
+
+ // Scan the information to see which is most popular: true or false. The
+ // predecessors can be of the set true, false, or undef.
+ unsigned NumTrue = 0, NumFalse = 0;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (isa<UndefValue>(XorOpValue.first))
+ // Ignore undefs for the count.
+ continue;
+ if (cast<ConstantInt>(XorOpValue.first)->isZero())
+ ++NumFalse;
+ else
+ ++NumTrue;
+ }
+
+ // Determine which value to split on, true, false, or undef if neither.
+ ConstantInt *SplitVal = nullptr;
+ if (NumTrue > NumFalse)
+ SplitVal = ConstantInt::getTrue(BB->getContext());
+ else if (NumTrue != 0 || NumFalse != 0)
+ SplitVal = ConstantInt::getFalse(BB->getContext());
+
+ // Collect all of the blocks that this can be folded into so that we can
+ // factor this once and clone it once.
+ SmallVector<BasicBlock*, 8> BlocksToFoldInto;
+ for (const auto &XorOpValue : XorOpValues) {
+ if (XorOpValue.first != SplitVal && !isa<UndefValue>(XorOpValue.first))
+ continue;
+
+ BlocksToFoldInto.push_back(XorOpValue.second);
+ }
+
+ // If we inferred a value for all of the predecessors, then duplication won't
+ // help us. However, we can just replace the LHS or RHS with the constant.
+ if (BlocksToFoldInto.size() ==
+ cast<PHINode>(BB->front()).getNumIncomingValues()) {
+ if (!SplitVal) {
+ // If all preds provide undef, just nuke the xor, because it is undef too.
+ BO->replaceAllUsesWith(UndefValue::get(BO->getType()));
+ BO->eraseFromParent();
+ } else if (SplitVal->isZero()) {
+ // If all preds provide 0, replace the xor with the other input.
+ BO->replaceAllUsesWith(BO->getOperand(isLHS));
+ BO->eraseFromParent();
+ } else {
+ // If all preds provide 1, set the computed value to 1.
+ BO->setOperand(!isLHS, SplitVal);
+ }
+
+ return true;
+ }
+
+  // If any of the predecessors ends with an indirect goto, we can't change its
+ // destination. Same for CallBr.
+ if (any_of(BlocksToFoldInto, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator()) ||
+ isa<CallBrInst>(Pred->getTerminator());
+ }))
+ return false;
+
+ // Try to duplicate BB into PredBB.
return duplicateCondBranchOnPHIIntoPred(BB, BlocksToFoldInto);
-}
-
+}
+
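A standalone sketch (not from the patch; the boolean values are hypothetical) of the payoff processBranchOnXOR is after: once the phi side of the xor is pinned to a constant on a duplicated path, the xor collapses to the other operand (for 0) or to its negation (for 1), so the cloned branch ends up testing a plain compare, as in the BB/BB' example in the comment above.

#include <cstdio>

int main() {
  bool Y = true;                       // stands in for %Y = icmp eq i32 %A, %B
  for (bool KnownX : {false, true}) {  // phi input proven constant on the path
    bool Z = KnownX ^ Y;               // %Z = xor i1 %X, %Y
    // KnownX == 0: Z is just Y; KnownX == 1: Z is !Y. Either way, no xor left.
    std::printf("KnownX=%d -> Z=%d (Y=%d, !Y=%d)\n", KnownX, Z, Y, !Y);
  }
}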
/// addPHINodeEntriesForMappedBlock - We're adding 'NewPred' as a new
-/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
-/// NewPred using the entries from OldPred (suitably mapped).
+/// predecessor to the PHIBB block. If it has PHI nodes, add entries for
+/// NewPred using the entries from OldPred (suitably mapped).
static void addPHINodeEntriesForMappedBlock(BasicBlock *PHIBB,
- BasicBlock *OldPred,
- BasicBlock *NewPred,
- DenseMap<Instruction*, Value*> &ValueMap) {
- for (PHINode &PN : PHIBB->phis()) {
- // Ok, we have a PHI node. Figure out what the incoming value was for the
- // DestBlock.
- Value *IV = PN.getIncomingValueForBlock(OldPred);
-
- // Remap the value if necessary.
- if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
- if (I != ValueMap.end())
- IV = I->second;
- }
-
- PN.addIncoming(IV, NewPred);
- }
-}
-
-/// Merge basic block BB into its sole predecessor if possible.
+ BasicBlock *OldPred,
+ BasicBlock *NewPred,
+ DenseMap<Instruction*, Value*> &ValueMap) {
+ for (PHINode &PN : PHIBB->phis()) {
+ // Ok, we have a PHI node. Figure out what the incoming value was for the
+ // DestBlock.
+ Value *IV = PN.getIncomingValueForBlock(OldPred);
+
+ // Remap the value if necessary.
+ if (Instruction *Inst = dyn_cast<Instruction>(IV)) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMap.find(Inst);
+ if (I != ValueMap.end())
+ IV = I->second;
+ }
+
+ PN.addIncoming(IV, NewPred);
+ }
+}
+
+/// Merge basic block BB into its sole predecessor if possible.
bool JumpThreadingPass::maybeMergeBasicBlockIntoOnlyPred(BasicBlock *BB) {
- BasicBlock *SinglePred = BB->getSinglePredecessor();
- if (!SinglePred)
- return false;
-
- const Instruction *TI = SinglePred->getTerminator();
- if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
- SinglePred == BB || hasAddressTakenAndUsed(BB))
- return false;
-
- // If SinglePred was a loop header, BB becomes one.
- if (LoopHeaders.erase(SinglePred))
- LoopHeaders.insert(BB);
-
- LVI->eraseBlock(SinglePred);
- MergeBasicBlockIntoOnlyPred(BB, DTU);
-
- // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
- // BB code within one basic block `BB`), we need to invalidate the LVI
- // information associated with BB, because the LVI information need not be
- // true for all of BB after the merge. For example,
- // Before the merge, LVI info and code is as follows:
- // SinglePred: <LVI info1 for %p val>
- // %y = use of %p
- // call @exit() // need not transfer execution to successor.
- // assume(%p) // from this point on %p is true
- // br label %BB
- // BB: <LVI info2 for %p val, i.e. %p is true>
- // %x = use of %p
- // br label exit
- //
-  // Note that this LVI info for blocks BB and SinglePred is correct for %p
-  // (info2 and info1 respectively). After the merge and the deletion of the
-  // LVI info1 for SinglePred, we have the following code:
- // BB: <LVI info2 for %p val>
- // %y = use of %p
- // call @exit()
- // assume(%p)
- // %x = use of %p <-- LVI info2 is correct from here onwards.
- // br label exit
- // LVI info2 for BB is incorrect at the beginning of BB.
-
- // Invalidate LVI information for BB if the LVI is not provably true for
- // all of BB.
- if (!isGuaranteedToTransferExecutionToSuccessor(BB))
- LVI->eraseBlock(BB);
- return true;
-}
-
-/// Update the SSA form. NewBB contains instructions that are copied from BB.
-/// ValueMapping maps old values in BB to new ones in NewBB.
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+ if (!SinglePred)
+ return false;
+
+ const Instruction *TI = SinglePred->getTerminator();
+ if (TI->isExceptionalTerminator() || TI->getNumSuccessors() != 1 ||
+ SinglePred == BB || hasAddressTakenAndUsed(BB))
+ return false;
+
+ // If SinglePred was a loop header, BB becomes one.
+ if (LoopHeaders.erase(SinglePred))
+ LoopHeaders.insert(BB);
+
+ LVI->eraseBlock(SinglePred);
+ MergeBasicBlockIntoOnlyPred(BB, DTU);
+
+ // Now that BB is merged into SinglePred (i.e. SinglePred code followed by
+ // BB code within one basic block `BB`), we need to invalidate the LVI
+ // information associated with BB, because the LVI information need not be
+ // true for all of BB after the merge. For example,
+ // Before the merge, LVI info and code is as follows:
+ // SinglePred: <LVI info1 for %p val>
+ // %y = use of %p
+ // call @exit() // need not transfer execution to successor.
+ // assume(%p) // from this point on %p is true
+ // br label %BB
+ // BB: <LVI info2 for %p val, i.e. %p is true>
+ // %x = use of %p
+ // br label exit
+ //
+  // Note that this LVI info for blocks BB and SinglePred is correct for %p
+  // (info2 and info1 respectively). After the merge and the deletion of the
+  // LVI info1 for SinglePred, we have the following code:
+ // BB: <LVI info2 for %p val>
+ // %y = use of %p
+ // call @exit()
+ // assume(%p)
+ // %x = use of %p <-- LVI info2 is correct from here onwards.
+ // br label exit
+ // LVI info2 for BB is incorrect at the beginning of BB.
+
+ // Invalidate LVI information for BB if the LVI is not provably true for
+ // all of BB.
+ if (!isGuaranteedToTransferExecutionToSuccessor(BB))
+ LVI->eraseBlock(BB);
+ return true;
+}
+
+/// Update the SSA form. NewBB contains instructions that are copied from BB.
+/// ValueMapping maps old values in BB to new ones in NewBB.
void JumpThreadingPass::updateSSA(
- BasicBlock *BB, BasicBlock *NewBB,
- DenseMap<Instruction *, Value *> &ValueMapping) {
- // If there were values defined in BB that are used outside the block, then we
- // now have to update all uses of the value to use either the original value,
- // the cloned value, or some PHI derived value. This can require arbitrary
-  // PHI insertion, which we are prepared to do; clean these up now.
- SSAUpdater SSAUpdate;
- SmallVector<Use *, 16> UsesToRename;
-
- for (Instruction &I : *BB) {
- // Scan all uses of this instruction to see if it is used outside of its
- // block, and if so, record them in UsesToRename.
- for (Use &U : I.uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
- if (UserPN->getIncomingBlock(U) == BB)
- continue;
- } else if (User->getParent() == BB)
- continue;
-
- UsesToRename.push_back(&U);
- }
-
- // If there are no uses outside the block, we're done with this instruction.
- if (UsesToRename.empty())
- continue;
- LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
-
- // We found a use of I outside of BB. Rename all uses of I that are outside
- // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
- // with the two values we know.
- SSAUpdate.Initialize(I.getType(), I.getName());
- SSAUpdate.AddAvailableValue(BB, &I);
- SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
-
- while (!UsesToRename.empty())
- SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-
-/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
-/// arguments that come from PredBB. Return the map from the variables in the
-/// source basic block to the variables in the newly created basic block.
-DenseMap<Instruction *, Value *>
+ BasicBlock *BB, BasicBlock *NewBB,
+ DenseMap<Instruction *, Value *> &ValueMapping) {
+ // If there were values defined in BB that are used outside the block, then we
+ // now have to update all uses of the value to use either the original value,
+ // the cloned value, or some PHI derived value. This can require arbitrary
+  // PHI insertion, which we are prepared to do; clean these up now.
+ SSAUpdater SSAUpdate;
+ SmallVector<Use *, 16> UsesToRename;
+
+ for (Instruction &I : *BB) {
+ // Scan all uses of this instruction to see if it is used outside of its
+ // block, and if so, record them in UsesToRename.
+ for (Use &U : I.uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ } else if (User->getParent() == BB)
+ continue;
+
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the block, we're done with this instruction.
+ if (UsesToRename.empty())
+ continue;
+ LLVM_DEBUG(dbgs() << "JT: Renaming non-local uses of: " << I << "\n");
+
+ // We found a use of I outside of BB. Rename all uses of I that are outside
+ // its block to be uses of the appropriate PHI node etc. See ValuesInBlocks
+ // with the two values we know.
+ SSAUpdate.Initialize(I.getType(), I.getName());
+ SSAUpdate.AddAvailableValue(BB, &I);
+ SSAUpdate.AddAvailableValue(NewBB, ValueMapping[&I]);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUse(*UsesToRename.pop_back_val());
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+
+/// Clone instructions in range [BI, BE) to NewBB. For PHI nodes, we only clone
+/// arguments that come from PredBB. Return the map from the variables in the
+/// source basic block to the variables in the newly created basic block.
+DenseMap<Instruction *, Value *>
JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
- BasicBlock::iterator BE, BasicBlock *NewBB,
- BasicBlock *PredBB) {
- // We are going to have to map operands from the source basic block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in the source basic
- // block, evaluate them to account for entry from PredBB.
- DenseMap<Instruction *, Value *> ValueMapping;
-
- // Clone the phi nodes of the source basic block into NewBB. The resulting
- // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
- // might need to rewrite the operand of the cloned phi.
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
- NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
- ValueMapping[PN] = NewPN;
- }
-
+ BasicBlock::iterator BE, BasicBlock *NewBB,
+ BasicBlock *PredBB) {
+ // We are going to have to map operands from the source basic block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in the source basic
+ // block, evaluate them to account for entry from PredBB.
+ DenseMap<Instruction *, Value *> ValueMapping;
+
+ // Clone the phi nodes of the source basic block into NewBB. The resulting
+ // phi nodes are trivial since NewBB only has one predecessor, but SSAUpdater
+ // might need to rewrite the operand of the cloned phi.
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ PHINode *NewPN = PHINode::Create(PN->getType(), 1, PN->getName(), NewBB);
+ NewPN->addIncoming(PN->getIncomingValueForBlock(PredBB), PredBB);
+ ValueMapping[PN] = NewPN;
+ }
+
// Clone noalias scope declarations in the threaded block. When threading a
  // loop exit, we would otherwise end up with two identical scope declarations
// visible at the same time.
@@ -2085,974 +2085,974 @@ JumpThreadingPass::cloneInstructions(BasicBlock::iterator BI,
identifyNoAliasScopesToClone(BI, BE, NoAliasScopes);
cloneNoAliasScopes(NoAliasScopes, ClonedScopes, "thread", Context);
- // Clone the non-phi instructions of the source basic block into NewBB,
- // keeping track of the mapping and using it to remap operands in the cloned
- // instructions.
- for (; BI != BE; ++BI) {
- Instruction *New = BI->clone();
- New->setName(BI->getName());
- NewBB->getInstList().push_back(New);
- ValueMapping[&*BI] = New;
+ // Clone the non-phi instructions of the source basic block into NewBB,
+ // keeping track of the mapping and using it to remap operands in the cloned
+ // instructions.
+ for (; BI != BE; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ NewBB->getInstList().push_back(New);
+ ValueMapping[&*BI] = New;
adaptNoAliasScopes(New, ClonedScopes, Context);
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
- }
-
- return ValueMapping;
-}
-
-/// Attempt to thread through two successive basic blocks.
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction *, Value *>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return ValueMapping;
+}
+
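A standalone sketch (not from the patch; the Node type and names are invented) of the clone-and-remap pattern cloneInstructions uses above: copy each item, record the old-to-new mapping, and rewrite any operand that refers to something already cloned so the copies reference each other rather than the originals.

#include <cstdio>
#include <memory>
#include <unordered_map>
#include <utility>
#include <vector>

struct Node {
  const char *Name;
  std::vector<Node *> Operands;
};

int main() {
  Node A{"a", {}}, B{"b", {&A}}, C{"c", {&B, &A}};
  std::vector<Node *> Block = {&A, &B, &C};

  std::unordered_map<Node *, Node *> ValueMapping;
  std::vector<std::unique_ptr<Node>> Clones;
  for (Node *Old : Block) {
    auto New = std::make_unique<Node>(*Old); // like Inst->clone(): operands still
    for (Node *&Op : New->Operands) {        // point into the original block...
      auto It = ValueMapping.find(Op);
      if (It != ValueMapping.end())
        Op = It->second;                     // ...so patch intra-block references
    }
    ValueMapping[Old] = New.get();
    Clones.push_back(std::move(New));
  }
  // The clone of "c" now uses the clones of "b" and "a", not the originals.
  std::printf("%s -> %s, %s\n", Clones[2]->Name, Clones[2]->Operands[0]->Name,
              Clones[2]->Operands[1]->Name);
}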
+/// Attempt to thread through two successive basic blocks.
bool JumpThreadingPass::maybethreadThroughTwoBasicBlocks(BasicBlock *BB,
- Value *Cond) {
- // Consider:
- //
- // PredBB:
- // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
- // %tobool = icmp eq i32 %cond, 0
- // br i1 %tobool, label %BB, label ...
- //
- // BB:
- // %cmp = icmp eq i32* %var, null
- // br i1 %cmp, label ..., label ...
- //
- // We don't know the value of %var at BB even if we know which incoming edge
- // we take to BB. However, once we duplicate PredBB for each of its incoming
- // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
- // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
-
- // Require that BB end with a Branch for simplicity.
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- if (!CondBr)
- return false;
-
- // BB must have exactly one predecessor.
- BasicBlock *PredBB = BB->getSinglePredecessor();
- if (!PredBB)
- return false;
-
- // Require that PredBB end with a conditional Branch. If PredBB ends with an
- // unconditional branch, we should be merging PredBB and BB instead. For
- // simplicity, we don't deal with a switch.
- BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
- if (!PredBBBranch || PredBBBranch->isUnconditional())
- return false;
-
- // If PredBB has exactly one incoming edge, we don't gain anything by copying
- // PredBB.
- if (PredBB->getSinglePredecessor())
- return false;
-
- // Don't thread through PredBB if it contains a successor edge to itself, in
- // which case we would infinite loop. Suppose we are threading an edge from
- // PredPredBB through PredBB and BB to SuccBB with PredBB containing a
- // successor edge to itself. If we allowed jump threading in this case, we
- // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
- // PredBB.thread has a successor edge to PredBB, we would immediately come up
- // with another jump threading opportunity from PredBB.thread through PredBB
- // and BB to SuccBB. This jump threading would repeatedly occur. That is, we
- // would keep peeling one iteration from PredBB.
- if (llvm::is_contained(successors(PredBB), PredBB))
- return false;
-
- // Don't thread across a loop header.
- if (LoopHeaders.count(PredBB))
- return false;
-
- // Avoid complication with duplicating EH pads.
- if (PredBB->isEHPad())
- return false;
-
- // Find a predecessor that we can thread. For simplicity, we only consider a
- // successor edge out of BB to which we thread exactly one incoming edge into
- // PredBB.
- unsigned ZeroCount = 0;
- unsigned OneCount = 0;
- BasicBlock *ZeroPred = nullptr;
- BasicBlock *OnePred = nullptr;
- for (BasicBlock *P : predecessors(PredBB)) {
- if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
+ Value *Cond) {
+ // Consider:
+ //
+ // PredBB:
+ // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
+ // %tobool = icmp eq i32 %cond, 0
+ // br i1 %tobool, label %BB, label ...
+ //
+ // BB:
+ // %cmp = icmp eq i32* %var, null
+ // br i1 %cmp, label ..., label ...
+ //
+ // We don't know the value of %var at BB even if we know which incoming edge
+ // we take to BB. However, once we duplicate PredBB for each of its incoming
+ // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
+ // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
+
+ // Require that BB end with a Branch for simplicity.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return false;
+
+ // BB must have exactly one predecessor.
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ if (!PredBB)
+ return false;
+
+ // Require that PredBB end with a conditional Branch. If PredBB ends with an
+ // unconditional branch, we should be merging PredBB and BB instead. For
+ // simplicity, we don't deal with a switch.
+ BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBBBranch || PredBBBranch->isUnconditional())
+ return false;
+
+ // If PredBB has exactly one incoming edge, we don't gain anything by copying
+ // PredBB.
+ if (PredBB->getSinglePredecessor())
+ return false;
+
+ // Don't thread through PredBB if it contains a successor edge to itself, in
+ // which case we would infinite loop. Suppose we are threading an edge from
+ // PredPredBB through PredBB and BB to SuccBB with PredBB containing a
+ // successor edge to itself. If we allowed jump threading in this case, we
+ // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
+ // PredBB.thread has a successor edge to PredBB, we would immediately come up
+ // with another jump threading opportunity from PredBB.thread through PredBB
+ // and BB to SuccBB. This jump threading would repeatedly occur. That is, we
+ // would keep peeling one iteration from PredBB.
+ if (llvm::is_contained(successors(PredBB), PredBB))
+ return false;
+
+ // Don't thread across a loop header.
+ if (LoopHeaders.count(PredBB))
+ return false;
+
+ // Avoid complication with duplicating EH pads.
+ if (PredBB->isEHPad())
+ return false;
+
+ // Find a predecessor that we can thread. For simplicity, we only consider a
+ // successor edge out of BB to which we thread exactly one incoming edge into
+ // PredBB.
+ unsigned ZeroCount = 0;
+ unsigned OneCount = 0;
+ BasicBlock *ZeroPred = nullptr;
+ BasicBlock *OnePred = nullptr;
+ for (BasicBlock *P : predecessors(PredBB)) {
+ if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
evaluateOnPredecessorEdge(BB, P, Cond))) {
- if (CI->isZero()) {
- ZeroCount++;
- ZeroPred = P;
- } else if (CI->isOne()) {
- OneCount++;
- OnePred = P;
- }
- }
- }
-
- // Disregard complicated cases where we have to thread multiple edges.
- BasicBlock *PredPredBB;
- if (ZeroCount == 1) {
- PredPredBB = ZeroPred;
- } else if (OneCount == 1) {
- PredPredBB = OnePred;
- } else {
- return false;
- }
-
- BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
-
- // If threading to the same block as we come from, we would infinite loop.
- if (SuccBB == BB) {
- LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
- return false;
- }
-
- // If threading this would thread across a loop header, don't thread the edge.
+ if (CI->isZero()) {
+ ZeroCount++;
+ ZeroPred = P;
+ } else if (CI->isOne()) {
+ OneCount++;
+ OnePred = P;
+ }
+ }
+ }
+
+ // Disregard complicated cases where we have to thread multiple edges.
+ BasicBlock *PredPredBB;
+ if (ZeroCount == 1) {
+ PredPredBB = ZeroPred;
+ } else if (OneCount == 1) {
+ PredPredBB = OnePred;
+ } else {
+ return false;
+ }
+
+ BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
+
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- LLVM_DEBUG({
- bool BBIsHeader = LoopHeaders.count(BB);
- bool SuccIsHeader = LoopHeaders.count(SuccBB);
- dbgs() << " Not threading across "
- << (BBIsHeader ? "loop header BB '" : "block BB '")
- << BB->getName() << "' to dest "
- << (SuccIsHeader ? "loop header BB '" : "block BB '")
- << SuccBB->getName()
- << "' - it might create an irreducible loop!\n";
- });
- return false;
- }
-
- // Compute the cost of duplicating BB and PredBB.
- unsigned BBCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- unsigned PredBBCost = getJumpThreadDuplicationCost(
- PredBB, PredBB->getTerminator(), BBDupThreshold);
-
- // Give up if costs are too high. We need to check BBCost and PredBBCost
- // individually before checking their sum because getJumpThreadDuplicationCost
-  // returns (unsigned)~0 for those basic blocks that cannot be duplicated.
- if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
- BBCost + PredBBCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << PredBBCost
-                      << " for PredBB, " << BBCost << " for BB\n");
- return false;
- }
-
- // Now we are ready to duplicate PredBB.
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ LLVM_DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '")
+ << BB->getName() << "' to dest "
+ << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName()
+ << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ // Compute the cost of duplicating BB and PredBB.
+ unsigned BBCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned PredBBCost = getJumpThreadDuplicationCost(
+ PredBB, PredBB->getTerminator(), BBDupThreshold);
+
+ // Give up if costs are too high. We need to check BBCost and PredBBCost
+ // individually before checking their sum because getJumpThreadDuplicationCost
+  // returns (unsigned)~0 for those basic blocks that cannot be duplicated.
+ if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
+ BBCost + PredBBCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << PredBBCost
+                      << " for PredBB, " << BBCost << " for BB\n");
+ return false;
+ }
+
+ // Now we are ready to duplicate PredBB.
threadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
- return true;
-}
-
+ return true;
+}
+
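A standalone sketch (not from the patch; threshold and costs are made-up numbers) of why maybethreadThroughTwoBasicBlocks checks BBCost and PredBBCost individually before checking their sum: the cost helper reports "cannot duplicate" as (unsigned)~0, so a sum-only test can wrap around in unsigned arithmetic and pass the threshold.

#include <cstdio>

int main() {
  unsigned Threshold = 6;
  unsigned BBCost = ~0u;    // "cannot be duplicated" marker from the cost helper
  unsigned PredBBCost = 3;

  unsigned Sum = BBCost + PredBBCost;        // wraps around to 2
  bool SumOnlyPasses = Sum <= Threshold;     // would wrongly allow threading
  bool IndividualPass = BBCost <= Threshold && PredBBCost <= Threshold;

  std::printf("sum-only: %d, individual-then-sum: %d\n", SumOnlyPasses,
              IndividualPass && Sum <= Threshold);
}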
void JumpThreadingPass::threadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
- BasicBlock *PredBB,
- BasicBlock *BB,
- BasicBlock *SuccBB) {
- LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
- << BB->getName() << "'\n");
-
- BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
- BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
-
- BasicBlock *NewBB =
- BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
- PredBB->getParent(), PredBB);
- NewBB->moveAfter(PredBB);
-
- // Set the block frequency of NewBB.
- if (HasProfileData) {
- auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
- BPI->getEdgeProbability(PredPredBB, PredBB);
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- // We are going to have to map operands from the original BB block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
- // to account for entry from PredPredBB.
- DenseMap<Instruction *, Value *> ValueMapping =
+ BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *SuccBB) {
+ LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
+ << BB->getName() << "'\n");
+
+ BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
+ BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
+
+ BasicBlock *NewBB =
+ BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
+ PredBB->getParent(), PredBB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
+ BPI->getEdgeProbability(PredPredBB, PredBB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate them
+ // to account for entry from PredPredBB.
+ DenseMap<Instruction *, Value *> ValueMapping =
cloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
-
+
// Copy the edge probabilities from PredBB to NewBB.
if (HasProfileData)
BPI->copyEdgeProbabilities(PredBB, NewBB);
- // Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
- // This eliminates predecessors from PredPredBB, which requires us to simplify
- // any PHI nodes in PredBB.
- Instruction *PredPredTerm = PredPredBB->getTerminator();
- for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
- if (PredPredTerm->getSuccessor(i) == PredBB) {
- PredBB->removePredecessor(PredPredBB, true);
- PredPredTerm->setSuccessor(i, NewBB);
- }
-
+ // Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
+ // This eliminates predecessors from PredPredBB, which requires us to simplify
+ // any PHI nodes in PredBB.
+ Instruction *PredPredTerm = PredPredBB->getTerminator();
+ for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredPredTerm->getSuccessor(i) == PredBB) {
+ PredBB->removePredecessor(PredPredBB, true);
+ PredPredTerm->setSuccessor(i, NewBB);
+ }
+
addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
- ValueMapping);
+ ValueMapping);
addPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
- ValueMapping);
-
- DTU->applyUpdatesPermissive(
- {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
- {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
- {DominatorTree::Insert, PredPredBB, NewBB},
- {DominatorTree::Delete, PredPredBB, PredBB}});
-
+ ValueMapping);
+
+ DTU->applyUpdatesPermissive(
+ {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
+ {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
+ {DominatorTree::Insert, PredPredBB, NewBB},
+ {DominatorTree::Delete, PredPredBB, PredBB}});
+
updateSSA(PredBB, NewBB, ValueMapping);
-
- // Clean up things like PHI nodes with single operands, dead instructions,
- // etc.
- SimplifyInstructionsInBlock(NewBB, TLI);
- SimplifyInstructionsInBlock(PredBB, TLI);
-
- SmallVector<BasicBlock *, 1> PredsToFactor;
- PredsToFactor.push_back(NewBB);
+
+ // Clean up things like PHI nodes with single operands, dead instructions,
+ // etc.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+ SimplifyInstructionsInBlock(PredBB, TLI);
+
+ SmallVector<BasicBlock *, 1> PredsToFactor;
+ PredsToFactor.push_back(NewBB);
threadEdge(BB, PredsToFactor, SuccBB);
-}
-
+}
+
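A standalone sketch (not from the patch; the profile numbers are hypothetical, and the real pass uses BlockFrequencyInfo/BranchProbabilityInfo fixed-point types) of how threadThroughTwoBasicBlocks seeds the frequency of NewBB: the copy only receives the flow that used to travel along the PredPredBB to PredBB edge, i.e. Freq(PredPredBB) scaled by that edge's branch probability.

#include <cstdio>

int main() {
  double FreqPredPredBB = 1000.0; // profiled executions of PredPredBB
  double ProbToPredBB = 0.25;     // P(PredPredBB branches into PredBB)

  // NewBB replaces PredBB only on this one incoming edge, so it inherits just
  // that edge's share of the flow.
  double NewBBFreq = FreqPredPredBB * ProbToPredBB;
  std::printf("NewBB frequency: %.0f\n", NewBBFreq); // 250
}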
/// tryThreadEdge - Thread an edge if it's safe and profitable to do so.
bool JumpThreadingPass::tryThreadEdge(
- BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
- BasicBlock *SuccBB) {
- // If threading to the same block as we come from, we would infinite loop.
- if (SuccBB == BB) {
- LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
- << "' - would thread to self!\n");
- return false;
- }
-
- // If threading this would thread across a loop header, don't thread the edge.
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
- LLVM_DEBUG({
- bool BBIsHeader = LoopHeaders.count(BB);
- bool SuccIsHeader = LoopHeaders.count(SuccBB);
- dbgs() << " Not threading across "
- << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
- << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
- << SuccBB->getName() << "' - it might create an irreducible loop!\n";
- });
- return false;
- }
-
- unsigned JumpThreadCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- if (JumpThreadCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
- << "' - Cost is too high: " << JumpThreadCost << "\n");
- return false;
- }
-
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ LLVM_DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '") << BB->getName()
+ << "' to dest " << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName() << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ unsigned JumpThreadCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (JumpThreadCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << JumpThreadCost << "\n");
+ return false;
+ }
+
threadEdge(BB, PredBBs, SuccBB);
- return true;
-}
-
+ return true;
+}
+
/// threadEdge - We have decided that it is safe and profitable to factor the
-/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
-/// across BB. Transform the IR to reflect this change.
+/// blocks in PredBBs to one predecessor, then thread an edge from it to SuccBB
+/// across BB. Transform the IR to reflect this change.
void JumpThreadingPass::threadEdge(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &PredBBs,
- BasicBlock *SuccBB) {
- assert(SuccBB != BB && "Don't create an infinite loop");
-
- assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
- "Don't thread across loop headers");
-
- // And finally, do it! Start by factoring the predecessors if needed.
- BasicBlock *PredBB;
- if (PredBBs.size() == 1)
- PredBB = PredBBs[0];
- else {
- LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ const SmallVectorImpl<BasicBlock *> &PredBBs,
+ BasicBlock *SuccBB) {
+ assert(SuccBB != BB && "Don't create an infinite loop");
+
+ assert(!LoopHeaders.count(BB) && !LoopHeaders.count(SuccBB) &&
+ "Don't thread across loop headers");
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
- }
-
- // And finally, do it!
- LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
- << "' to '" << SuccBB->getName()
- << ", across block:\n " << *BB << "\n");
-
- LVI->threadEdge(PredBB, BB, SuccBB);
-
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
- BB->getName()+".thread",
- BB->getParent(), BB);
- NewBB->moveAfter(PredBB);
-
- // Set the block frequency of NewBB.
- if (HasProfileData) {
- auto NewBBFreq =
- BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- // Copy all the instructions from BB to NewBB except the terminator.
- DenseMap<Instruction *, Value *> ValueMapping =
+ }
+
+ // And finally, do it!
+ LLVM_DEBUG(dbgs() << " Threading edge from '" << PredBB->getName()
+ << "' to '" << SuccBB->getName()
+ << ", across block:\n " << *BB << "\n");
+
+ LVI->threadEdge(PredBB, BB, SuccBB);
+
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
+ BB->getName()+".thread",
+ BB->getParent(), BB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq =
+ BFI->getBlockFreq(PredBB) * BPI->getEdgeProbability(PredBB, BB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ // Copy all the instructions from BB to NewBB except the terminator.
+ DenseMap<Instruction *, Value *> ValueMapping =
cloneInstructions(BB->begin(), std::prev(BB->end()), NewBB, PredBB);
-
- // We didn't copy the terminator from BB over to NewBB, because there is now
- // an unconditional jump to SuccBB. Insert the unconditional jump.
- BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
- NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
-
- // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
- // PHI nodes for NewBB now.
+
+ // We didn't copy the terminator from BB over to NewBB, because there is now
+ // an unconditional jump to SuccBB. Insert the unconditional jump.
+ BranchInst *NewBI = BranchInst::Create(SuccBB, NewBB);
+ NewBI->setDebugLoc(BB->getTerminator()->getDebugLoc());
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to the
+ // PHI nodes for NewBB now.
addPHINodeEntriesForMappedBlock(SuccBB, BB, NewBB, ValueMapping);
-
- // Update the terminator of PredBB to jump to NewBB instead of BB. This
- // eliminates predecessors from BB, which requires us to simplify any PHI
- // nodes in BB.
- Instruction *PredTerm = PredBB->getTerminator();
- for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
- if (PredTerm->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB, true);
- PredTerm->setSuccessor(i, NewBB);
- }
-
- // Enqueue required DT updates.
- DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
- {DominatorTree::Insert, PredBB, NewBB},
- {DominatorTree::Delete, PredBB, BB}});
-
+
+ // Update the terminator of PredBB to jump to NewBB instead of BB. This
+ // eliminates predecessors from BB, which requires us to simplify any PHI
+ // nodes in BB.
+ Instruction *PredTerm = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredTerm->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB, true);
+ PredTerm->setSuccessor(i, NewBB);
+ }
+
+ // Enqueue required DT updates.
+ DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, SuccBB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Delete, PredBB, BB}});
+
updateSSA(BB, NewBB, ValueMapping);
-
- // At this point, the IR is fully up to date and consistent. Do a quick scan
- // over the new instructions and zap any that are constants or dead. This
- // frequently happens because of phi translation.
- SimplifyInstructionsInBlock(NewBB, TLI);
-
- // Update the edge weight from BB to SuccBB, which should be less than before.
+
+ // At this point, the IR is fully up to date and consistent. Do a quick scan
+ // over the new instructions and zap any that are constants or dead. This
+ // frequently happens because of phi translation.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+
+ // Update the edge weight from BB to SuccBB, which should be less than before.
updateBlockFreqAndEdgeWeight(PredBB, BB, NewBB, SuccBB);
-
- // Threaded an edge!
- ++NumThreads;
-}
-
-/// Create a new basic block that will be the predecessor of BB and successor of
-/// all blocks in Preds. When profile data is available, update the frequency of
-/// this new block.
+
+ // Threaded an edge!
+ ++NumThreads;
+}
+
+/// Create a new basic block that will be the predecessor of BB and successor of
+/// all blocks in Preds. When profile data is available, update the frequency of
+/// this new block.
BasicBlock *JumpThreadingPass::splitBlockPreds(BasicBlock *BB,
- ArrayRef<BasicBlock *> Preds,
- const char *Suffix) {
- SmallVector<BasicBlock *, 2> NewBBs;
-
- // Collect the frequencies of all predecessors of BB, which will be used to
- // update the edge weight of the result of splitting predecessors.
- DenseMap<BasicBlock *, BlockFrequency> FreqMap;
- if (HasProfileData)
- for (auto Pred : Preds)
- FreqMap.insert(std::make_pair(
- Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
-
- // In the case when BB is a LandingPad block we create 2 new predecessors
- // instead of just one.
- if (BB->isLandingPad()) {
- std::string NewName = std::string(Suffix) + ".split-lp";
- SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
- } else {
- NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
- }
-
- std::vector<DominatorTree::UpdateType> Updates;
- Updates.reserve((2 * Preds.size()) + NewBBs.size());
- for (auto NewBB : NewBBs) {
- BlockFrequency NewBBFreq(0);
- Updates.push_back({DominatorTree::Insert, NewBB, BB});
- for (auto Pred : predecessors(NewBB)) {
- Updates.push_back({DominatorTree::Delete, Pred, BB});
- Updates.push_back({DominatorTree::Insert, Pred, NewBB});
- if (HasProfileData) // Update frequencies between Pred -> NewBB.
- NewBBFreq += FreqMap.lookup(Pred);
- }
- if (HasProfileData) // Apply the summed frequency to NewBB.
- BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
- }
-
- DTU->applyUpdatesPermissive(Updates);
- return NewBBs[0];
-}
-
-bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
- const Instruction *TI = BB->getTerminator();
- assert(TI->getNumSuccessors() > 1 && "not a split");
-
- MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
- if (!WeightsNode)
- return false;
-
- MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
- if (MDName->getString() != "branch_weights")
- return false;
-
- // Ensure there are weights for all of the successors. Note that the first
- // operand to the metadata node is a name, not a weight.
- return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
-}
-
-/// Update the block frequency of BB and branch weight and the metadata on the
-/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
-/// Freq(PredBB->BB) / Freq(BB->SuccBB).
+ ArrayRef<BasicBlock *> Preds,
+ const char *Suffix) {
+ SmallVector<BasicBlock *, 2> NewBBs;
+
+ // Collect the frequencies of all predecessors of BB, which will be used to
+ // update the edge weight of the result of splitting predecessors.
+ DenseMap<BasicBlock *, BlockFrequency> FreqMap;
+ if (HasProfileData)
+ for (auto Pred : Preds)
+ FreqMap.insert(std::make_pair(
+ Pred, BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, BB)));
+
+ // In the case when BB is a LandingPad block we create 2 new predecessors
+ // instead of just one.
+ if (BB->isLandingPad()) {
+ std::string NewName = std::string(Suffix) + ".split-lp";
+ SplitLandingPadPredecessors(BB, Preds, Suffix, NewName.c_str(), NewBBs);
+ } else {
+ NewBBs.push_back(SplitBlockPredecessors(BB, Preds, Suffix));
+ }
+
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * Preds.size()) + NewBBs.size());
+ for (auto NewBB : NewBBs) {
+ BlockFrequency NewBBFreq(0);
+ Updates.push_back({DominatorTree::Insert, NewBB, BB});
+ for (auto Pred : predecessors(NewBB)) {
+ Updates.push_back({DominatorTree::Delete, Pred, BB});
+ Updates.push_back({DominatorTree::Insert, Pred, NewBB});
+ if (HasProfileData) // Update frequencies between Pred -> NewBB.
+ NewBBFreq += FreqMap.lookup(Pred);
+ }
+ if (HasProfileData) // Apply the summed frequency to NewBB.
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ DTU->applyUpdatesPermissive(Updates);
+ return NewBBs[0];
+}
+
+bool JumpThreadingPass::doesBlockHaveProfileData(BasicBlock *BB) {
+ const Instruction *TI = BB->getTerminator();
+ assert(TI->getNumSuccessors() > 1 && "not a split");
+
+ MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
+ if (!WeightsNode)
+ return false;
+
+ MDString *MDName = cast<MDString>(WeightsNode->getOperand(0));
+ if (MDName->getString() != "branch_weights")
+ return false;
+
+ // Ensure there are weights for all of the successors. Note that the first
+ // operand to the metadata node is a name, not a weight.
+ return WeightsNode->getNumOperands() == TI->getNumSuccessors() + 1;
+}
+
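+// [Editor's note: illustrative sketch, not part of the original source.]
+// doesBlockHaveProfileData() above checks the shape of the !prof metadata on
+// the terminator. For a two-successor conditional branch, the metadata it
+// accepts looks like this in textual IR (the weights are arbitrary example
+// values):
+//
+//   br i1 %cond, label %then, label %else, !prof !0
+//   ...
+//   !0 = !{!"branch_weights", i32 20, i32 80}
+//
+// The node carries one string operand ("branch_weights") followed by one
+// weight per successor, which is why the function requires
+// getNumOperands() == getNumSuccessors() + 1.
+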
+/// Update the block frequency of BB and branch weight and the metadata on the
+/// edge BB->SuccBB. This is done by scaling the weight of BB->SuccBB by 1 -
+/// Freq(PredBB->BB) / Freq(BB->SuccBB).
void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
- BasicBlock *BB,
- BasicBlock *NewBB,
- BasicBlock *SuccBB) {
- if (!HasProfileData)
- return;
-
- assert(BFI && BPI && "BFI & BPI should have been created here");
-
- // As the edge from PredBB to BB is deleted, we have to update the block
- // frequency of BB.
- auto BBOrigFreq = BFI->getBlockFreq(BB);
- auto NewBBFreq = BFI->getBlockFreq(NewBB);
- auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
- auto BBNewFreq = BBOrigFreq - NewBBFreq;
- BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
-
- // Collect updated outgoing edges' frequencies from BB and use them to update
- // edge probabilities.
- SmallVector<uint64_t, 4> BBSuccFreq;
- for (BasicBlock *Succ : successors(BB)) {
- auto SuccFreq = (Succ == SuccBB)
- ? BB2SuccBBFreq - NewBBFreq
- : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
- BBSuccFreq.push_back(SuccFreq.getFrequency());
- }
-
- uint64_t MaxBBSuccFreq =
- *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
-
- SmallVector<BranchProbability, 4> BBSuccProbs;
- if (MaxBBSuccFreq == 0)
- BBSuccProbs.assign(BBSuccFreq.size(),
- {1, static_cast<unsigned>(BBSuccFreq.size())});
- else {
- for (uint64_t Freq : BBSuccFreq)
- BBSuccProbs.push_back(
- BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
- // Normalize edge probabilities so that they sum up to one.
- BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
- BBSuccProbs.end());
- }
-
- // Update edge probabilities in BPI.
- BPI->setEdgeProbability(BB, BBSuccProbs);
-
- // Update the profile metadata as well.
- //
- // Don't do this if the profile of the transformed blocks was statically
- // estimated. (This could occur despite the function having an entry
- // frequency in completely cold parts of the CFG.)
- //
- // In this case we don't want to suggest to subsequent passes that the
- // calculated weights are fully consistent. Consider this graph:
- //
- // check_1
- // 50% / |
- // eq_1 | 50%
- // \ |
- // check_2
- // 50% / |
- // eq_2 | 50%
- // \ |
- // check_3
- // 50% / |
- // eq_3 | 50%
- // \ |
- //
- // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
- // the overall probabilities are inconsistent; the total probability that the
- // value is either 1, 2 or 3 is 150%.
- //
- // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
- // becomes 0%. This is even worse if the edge whose probability becomes 0% is
- // the loop exit edge. Then based solely on static estimation we would assume
- // the loop was extremely hot.
- //
- // FIXME this locally as well so that BPI and BFI are consistent as well. We
- // shouldn't make edges extremely likely or unlikely based solely on static
- // estimation.
- if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
- SmallVector<uint32_t, 4> Weights;
- for (auto Prob : BBSuccProbs)
- Weights.push_back(Prob.getNumerator());
-
- auto TI = BB->getTerminator();
- TI->setMetadata(
- LLVMContext::MD_prof,
- MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
- }
-}
-
+ BasicBlock *BB,
+ BasicBlock *NewBB,
+ BasicBlock *SuccBB) {
+ if (!HasProfileData)
+ return;
+
+ assert(BFI && BPI && "BFI & BPI should have been created here");
+
+ // As the edge from PredBB to BB is deleted, we have to update the block
+ // frequency of BB.
+ auto BBOrigFreq = BFI->getBlockFreq(BB);
+ auto NewBBFreq = BFI->getBlockFreq(NewBB);
+ auto BB2SuccBBFreq = BBOrigFreq * BPI->getEdgeProbability(BB, SuccBB);
+ auto BBNewFreq = BBOrigFreq - NewBBFreq;
+ BFI->setBlockFreq(BB, BBNewFreq.getFrequency());
+
+ // Collect updated outgoing edges' frequencies from BB and use them to update
+ // edge probabilities.
+ SmallVector<uint64_t, 4> BBSuccFreq;
+ for (BasicBlock *Succ : successors(BB)) {
+ auto SuccFreq = (Succ == SuccBB)
+ ? BB2SuccBBFreq - NewBBFreq
+ : BBOrigFreq * BPI->getEdgeProbability(BB, Succ);
+ BBSuccFreq.push_back(SuccFreq.getFrequency());
+ }
+
+ uint64_t MaxBBSuccFreq =
+ *std::max_element(BBSuccFreq.begin(), BBSuccFreq.end());
+
+ SmallVector<BranchProbability, 4> BBSuccProbs;
+ if (MaxBBSuccFreq == 0)
+ BBSuccProbs.assign(BBSuccFreq.size(),
+ {1, static_cast<unsigned>(BBSuccFreq.size())});
+ else {
+ for (uint64_t Freq : BBSuccFreq)
+ BBSuccProbs.push_back(
+ BranchProbability::getBranchProbability(Freq, MaxBBSuccFreq));
+ // Normalize edge probabilities so that they sum up to one.
+ BranchProbability::normalizeProbabilities(BBSuccProbs.begin(),
+ BBSuccProbs.end());
+ }
+
+ // Update edge probabilities in BPI.
+ BPI->setEdgeProbability(BB, BBSuccProbs);
+
+ // Update the profile metadata as well.
+ //
+ // Don't do this if the profile of the transformed blocks was statically
+ // estimated. (This could occur despite the function having an entry
+ // frequency in completely cold parts of the CFG.)
+ //
+ // In this case we don't want to suggest to subsequent passes that the
+ // calculated weights are fully consistent. Consider this graph:
+ //
+ // check_1
+ // 50% / |
+ // eq_1 | 50%
+ // \ |
+ // check_2
+ // 50% / |
+ // eq_2 | 50%
+ // \ |
+ // check_3
+ // 50% / |
+ // eq_3 | 50%
+ // \ |
+ //
+ // Assuming the blocks check_* all compare the same value against 1, 2 and 3,
+ // the overall probabilities are inconsistent; the total probability that the
+ // value is either 1, 2 or 3 is 150%.
+ //
+ // As a consequence if we thread eq_1 -> check_2 to check_3, check_2->check_3
+ // becomes 0%. This is even worse if the edge whose probability becomes 0% is
+ // the loop exit edge. Then based solely on static estimation we would assume
+ // the loop was extremely hot.
+ //
+  // FIXME: Handle this case locally as well so that BPI and BFI stay
+  // consistent. We shouldn't make edges extremely likely or unlikely based
+  // solely on static estimation.
+ if (BBSuccProbs.size() >= 2 && doesBlockHaveProfileData(BB)) {
+ SmallVector<uint32_t, 4> Weights;
+ for (auto Prob : BBSuccProbs)
+ Weights.push_back(Prob.getNumerator());
+
+ auto TI = BB->getTerminator();
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getParent()->getContext()).createBranchWeights(Weights));
+ }
+}
+
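+// [Editor's note: a worked example with made-up numbers, not part of the
+// original source.] Suppose BB originally has frequency 100, both of its
+// outgoing edges have probability 50%, and threading gave NewBB a frequency
+// of 40. The code above then computes:
+//   BBNewFreq         = 100 - 40       = 60  (remaining frequency of BB)
+//   freq(BB->SuccBB)  = 100 * 0.5 - 40 = 10  (the part not bypassed via NewBB)
+//   freq(BB->OtherBB) = 100 * 0.5      = 50
+// and the two outgoing frequencies are renormalized into probabilities of
+// roughly 1/6 and 5/6 for the edges out of BB.
+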
/// duplicateCondBranchOnPHIIntoPred - PredBB contains an unconditional branch
-/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
-/// If we can duplicate the contents of BB up into PredBB do so now, this
-/// improves the odds that the branch will be on an analyzable instruction like
-/// a compare.
+/// to BB which contains an i1 PHI node and a conditional branch on that PHI.
+/// If we can duplicate the contents of BB up into PredBB do so now, this
+/// improves the odds that the branch will be on an analyzable instruction like
+/// a compare.
bool JumpThreadingPass::duplicateCondBranchOnPHIIntoPred(
- BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
- assert(!PredBBs.empty() && "Can't handle an empty set");
-
- // If BB is a loop header, then duplicating this block outside the loop would
- // cause us to transform this into an irreducible loop, don't do this.
+ BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs) {
+ assert(!PredBBs.empty() && "Can't handle an empty set");
+
+ // If BB is a loop header, then duplicating this block outside the loop would
+ // cause us to transform this into an irreducible loop, don't do this.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB)) {
- LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
- << "' into predecessor block '" << PredBBs[0]->getName()
- << "' - it might create an irreducible loop!\n");
- return false;
- }
-
- unsigned DuplicationCost =
- getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
- if (DuplicationCost > BBDupThreshold) {
- LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
- << "' - Cost is too high: " << DuplicationCost << "\n");
- return false;
- }
-
- // And finally, do it! Start by factoring the predecessors if needed.
- std::vector<DominatorTree::UpdateType> Updates;
- BasicBlock *PredBB;
- if (PredBBs.size() == 1)
- PredBB = PredBBs[0];
- else {
- LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
- << " common predecessors.\n");
+ if (LoopHeaders.count(BB)) {
+ LLVM_DEBUG(dbgs() << " Not duplicating loop header '" << BB->getName()
+ << "' into predecessor block '" << PredBBs[0]->getName()
+ << "' - it might create an irreducible loop!\n");
+ return false;
+ }
+
+ unsigned DuplicationCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ if (DuplicationCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not duplicating BB '" << BB->getName()
+ << "' - Cost is too high: " << DuplicationCost << "\n");
+ return false;
+ }
+
+ // And finally, do it! Start by factoring the predecessors if needed.
+ std::vector<DominatorTree::UpdateType> Updates;
+ BasicBlock *PredBB;
+ if (PredBBs.size() == 1)
+ PredBB = PredBBs[0];
+ else {
+ LLVM_DEBUG(dbgs() << " Factoring out " << PredBBs.size()
+ << " common predecessors.\n");
PredBB = splitBlockPreds(BB, PredBBs, ".thr_comm");
- }
- Updates.push_back({DominatorTree::Delete, PredBB, BB});
-
- // Okay, we decided to do this! Clone all the instructions in BB onto the end
- // of PredBB.
- LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
- << "' into end of '" << PredBB->getName()
- << "' to eliminate branch on phi. Cost: "
- << DuplicationCost << " block is:" << *BB << "\n");
-
- // Unless PredBB ends with an unconditional branch, split the edge so that we
- // can just clone the bits from BB into the end of the new PredBB.
- BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
-
- if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
- BasicBlock *OldPredBB = PredBB;
- PredBB = SplitEdge(OldPredBB, BB);
- Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
- Updates.push_back({DominatorTree::Insert, PredBB, BB});
- Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
- OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
- }
-
- // We are going to have to map operands from the original BB block into the
- // PredBB block. Evaluate PHI nodes in BB.
- DenseMap<Instruction*, Value*> ValueMapping;
-
- BasicBlock::iterator BI = BB->begin();
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
- // Clone the non-phi instructions of BB into PredBB, keeping track of the
- // mapping and using it to remap operands in the cloned instructions.
- for (; BI != BB->end(); ++BI) {
- Instruction *New = BI->clone();
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
-
- // If this instruction can be simplified after the operands are updated,
- // just use the simplified value instead. This frequently happens due to
- // phi translation.
- if (Value *IV = SimplifyInstruction(
- New,
- {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
- ValueMapping[&*BI] = IV;
- if (!New->mayHaveSideEffects()) {
- New->deleteValue();
- New = nullptr;
- }
- } else {
- ValueMapping[&*BI] = New;
- }
- if (New) {
- // Otherwise, insert the new instruction into the block.
- New->setName(BI->getName());
- PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
- // Update Dominance from simplified New instruction operands.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
- Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
- }
- }
-
- // Check to see if the targets of the branch had PHI nodes. If so, we need to
- // add entries to the PHI nodes for branch from PredBB now.
- BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
+ }
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+
+ // Okay, we decided to do this! Clone all the instructions in BB onto the end
+ // of PredBB.
+ LLVM_DEBUG(dbgs() << " Duplicating block '" << BB->getName()
+ << "' into end of '" << PredBB->getName()
+ << "' to eliminate branch on phi. Cost: "
+ << DuplicationCost << " block is:" << *BB << "\n");
+
+ // Unless PredBB ends with an unconditional branch, split the edge so that we
+ // can just clone the bits from BB into the end of the new PredBB.
+ BranchInst *OldPredBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+
+ if (!OldPredBranch || !OldPredBranch->isUnconditional()) {
+ BasicBlock *OldPredBB = PredBB;
+ PredBB = SplitEdge(OldPredBB, BB);
+ Updates.push_back({DominatorTree::Insert, OldPredBB, PredBB});
+ Updates.push_back({DominatorTree::Insert, PredBB, BB});
+ Updates.push_back({DominatorTree::Delete, OldPredBB, BB});
+ OldPredBranch = cast<BranchInst>(PredBB->getTerminator());
+ }
+
+ // We are going to have to map operands from the original BB block into the
+ // PredBB block. Evaluate PHI nodes in BB.
+ DenseMap<Instruction*, Value*> ValueMapping;
+
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+ // Clone the non-phi instructions of BB into PredBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ for (; BI != BB->end(); ++BI) {
+ Instruction *New = BI->clone();
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ DenseMap<Instruction*, Value*>::iterator I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+
+ // If this instruction can be simplified after the operands are updated,
+ // just use the simplified value instead. This frequently happens due to
+ // phi translation.
+ if (Value *IV = SimplifyInstruction(
+ New,
+ {BB->getModule()->getDataLayout(), TLI, nullptr, nullptr, New})) {
+ ValueMapping[&*BI] = IV;
+ if (!New->mayHaveSideEffects()) {
+ New->deleteValue();
+ New = nullptr;
+ }
+ } else {
+ ValueMapping[&*BI] = New;
+ }
+ if (New) {
+ // Otherwise, insert the new instruction into the block.
+ New->setName(BI->getName());
+ PredBB->getInstList().insert(OldPredBranch->getIterator(), New);
+ // Update Dominance from simplified New instruction operands.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (BasicBlock *SuccBB = dyn_cast<BasicBlock>(New->getOperand(i)))
+ Updates.push_back({DominatorTree::Insert, PredBB, SuccBB});
+ }
+ }
+
+ // Check to see if the targets of the branch had PHI nodes. If so, we need to
+ // add entries to the PHI nodes for branch from PredBB now.
+ BranchInst *BBBranch = cast<BranchInst>(BB->getTerminator());
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(0), BB, PredBB,
- ValueMapping);
+ ValueMapping);
addPHINodeEntriesForMappedBlock(BBBranch->getSuccessor(1), BB, PredBB,
- ValueMapping);
-
+ ValueMapping);
+
updateSSA(BB, PredBB, ValueMapping);
-
- // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
- // that we nuked.
- BB->removePredecessor(PredBB, true);
-
- // Remove the unconditional branch at the end of the PredBB block.
- OldPredBranch->eraseFromParent();
+
+ // PredBB no longer jumps to BB, remove entries in the PHI node for the edge
+ // that we nuked.
+ BB->removePredecessor(PredBB, true);
+
+ // Remove the unconditional branch at the end of the PredBB block.
+ OldPredBranch->eraseFromParent();
if (HasProfileData)
BPI->copyEdgeProbabilities(BB, PredBB);
- DTU->applyUpdatesPermissive(Updates);
-
- ++NumDupes;
- return true;
-}
-
-// Pred is a predecessor of BB with an unconditional branch to BB. SI is
-// a Select instruction in Pred. BB has other predecessors and SI is used in
-// a PHI node in BB. SI has no other use.
-// A new basic block, NewBB, is created and SI is converted to compare and
-// conditional branch. SI is erased from parent.
+ DTU->applyUpdatesPermissive(Updates);
+
+ ++NumDupes;
+ return true;
+}
+
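+// [Editor's note: illustrative IR sketch, not part of the original source;
+// block and value names are hypothetical.] duplicateCondBranchOnPHIIntoPred
+// turns a pattern like
+//
+//   pred:
+//     br label %bb
+//   bb:
+//     %p = phi i1 [ true, %pred ], [ %x, %other ]
+//     br i1 %p, label %t, label %f
+//
+// into a copy of bb's branch at the end of pred, with %p replaced by the
+// incoming value for that edge:
+//
+//   pred:
+//     br i1 true, label %t, label %f   ; can later be folded to "br label %t"
+//
+// which exposes the branch to constant folding and further threading, while
+// bb itself stays in place for its remaining predecessors.
+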
+// Pred is a predecessor of BB with an unconditional branch to BB. SI is
+// a Select instruction in Pred. BB has other predecessors and SI is used in
+// a PHI node in BB. SI has no other use.
+// A new basic block, NewBB, is created and SI is converted to compare and
+// conditional branch. SI is erased from parent.
void JumpThreadingPass::unfoldSelectInstr(BasicBlock *Pred, BasicBlock *BB,
- SelectInst *SI, PHINode *SIUse,
- unsigned Idx) {
- // Expand the select.
- //
- // Pred --
- // | v
- // | NewBB
- // | |
- // |-----
- // v
- // BB
- BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
- BB->getParent(), BB);
- // Move the unconditional branch to NewBB.
- PredTerm->removeFromParent();
- NewBB->getInstList().insert(NewBB->end(), PredTerm);
- // Create a conditional branch and update PHI nodes.
- BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
- SIUse->setIncomingValue(Idx, SI->getFalseValue());
- SIUse->addIncoming(SI->getTrueValue(), NewBB);
-
- // The select is now dead.
- SI->eraseFromParent();
- DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
- {DominatorTree::Insert, Pred, NewBB}});
-
- // Update any other PHI nodes in BB.
- for (BasicBlock::iterator BI = BB->begin();
- PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
- if (Phi != SIUse)
- Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
-}
-
+ SelectInst *SI, PHINode *SIUse,
+ unsigned Idx) {
+ // Expand the select.
+ //
+ // Pred --
+ // | v
+ // | NewBB
+ // | |
+ // |-----
+ // v
+ // BB
+ BranchInst *PredTerm = cast<BranchInst>(Pred->getTerminator());
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "select.unfold",
+ BB->getParent(), BB);
+ // Move the unconditional branch to NewBB.
+ PredTerm->removeFromParent();
+ NewBB->getInstList().insert(NewBB->end(), PredTerm);
+ // Create a conditional branch and update PHI nodes.
+ BranchInst::Create(NewBB, BB, SI->getCondition(), Pred);
+ SIUse->setIncomingValue(Idx, SI->getFalseValue());
+ SIUse->addIncoming(SI->getTrueValue(), NewBB);
+
+ // The select is now dead.
+ SI->eraseFromParent();
+ DTU->applyUpdatesPermissive({{DominatorTree::Insert, NewBB, BB},
+ {DominatorTree::Insert, Pred, NewBB}});
+
+ // Update any other PHI nodes in BB.
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *Phi = dyn_cast<PHINode>(BI); ++BI)
+ if (Phi != SIUse)
+ Phi->addIncoming(Phi->getIncomingValueForBlock(Pred), NewBB);
+}
+
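+// [Editor's note: illustrative IR sketch, not part of the original source;
+// value names are hypothetical.] unfoldSelectInstr() above rewrites
+//
+//   pred:
+//     %s = select i1 %c, i32 %a, i32 %b
+//     br label %bb
+//   bb:
+//     %p = phi i32 [ %s, %pred ], ...
+//
+// into
+//
+//   pred:
+//     br i1 %c, label %select.unfold, label %bb
+//   select.unfold:                       ; the NewBB created above
+//     br label %bb
+//   bb:
+//     %p = phi i32 [ %b, %pred ], [ %a, %select.unfold ], ...
+//
+// so the value that used to flow through the select now arrives along two
+// distinct, threadable edges.
+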
bool JumpThreadingPass::tryToUnfoldSelect(SwitchInst *SI, BasicBlock *BB) {
- PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
-
- if (!CondPHI || CondPHI->getParent() != BB)
- return false;
-
- for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
- BasicBlock *Pred = CondPHI->getIncomingBlock(I);
- SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));
-
- // The second and third condition can be potentially relaxed. Currently
- // the conditions help to simplify the code and allow us to reuse existing
+ PHINode *CondPHI = dyn_cast<PHINode>(SI->getCondition());
+
+ if (!CondPHI || CondPHI->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondPHI->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondPHI->getIncomingBlock(I);
+ SelectInst *PredSI = dyn_cast<SelectInst>(CondPHI->getIncomingValue(I));
+
+ // The second and third condition can be potentially relaxed. Currently
+ // the conditions help to simplify the code and allow us to reuse existing
// code, developed for tryToUnfoldSelect(CmpInst *, BasicBlock *)
- if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
- continue;
-
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
- continue;
-
+ if (!PredSI || PredSI->getParent() != Pred || !PredSI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
unfoldSelectInstr(Pred, BB, PredSI, CondPHI, I);
- return true;
- }
- return false;
-}
-
+ return true;
+ }
+ return false;
+}
+
/// tryToUnfoldSelect - Look for blocks of the form
-/// bb1:
-/// %a = select
-/// br bb2
-///
-/// bb2:
-/// %p = phi [%a, %bb1] ...
-/// %c = icmp %p
-/// br i1 %c
-///
-/// And expand the select into a branch structure if one of its arms allows %c
-/// to be folded. This later enables threading from bb1 over bb2.
+/// bb1:
+/// %a = select
+/// br bb2
+///
+/// bb2:
+/// %p = phi [%a, %bb1] ...
+/// %c = icmp %p
+/// br i1 %c
+///
+/// And expand the select into a branch structure if one of its arms allows %c
+/// to be folded. This later enables threading from bb1 over bb2.
bool JumpThreadingPass::tryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
- BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
- PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
- Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
-
- if (!CondBr || !CondBr->isConditional() || !CondLHS ||
- CondLHS->getParent() != BB)
- return false;
-
- for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
- BasicBlock *Pred = CondLHS->getIncomingBlock(I);
- SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
-
- // Look if one of the incoming values is a select in the corresponding
- // predecessor.
- if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
- continue;
-
- BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
- if (!PredTerm || !PredTerm->isUnconditional())
- continue;
-
- // Now check if one of the select values would allow us to constant fold the
- // terminator in BB. We don't do the transform if both sides fold, those
- // cases will be threaded in any case.
- LazyValueInfo::Tristate LHSFolds =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
- CondRHS, Pred, BB, CondCmp);
- LazyValueInfo::Tristate RHSFolds =
- LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
- CondRHS, Pred, BB, CondCmp);
- if ((LHSFolds != LazyValueInfo::Unknown ||
- RHSFolds != LazyValueInfo::Unknown) &&
- LHSFolds != RHSFolds) {
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ PHINode *CondLHS = dyn_cast<PHINode>(CondCmp->getOperand(0));
+ Constant *CondRHS = cast<Constant>(CondCmp->getOperand(1));
+
+ if (!CondBr || !CondBr->isConditional() || !CondLHS ||
+ CondLHS->getParent() != BB)
+ return false;
+
+ for (unsigned I = 0, E = CondLHS->getNumIncomingValues(); I != E; ++I) {
+ BasicBlock *Pred = CondLHS->getIncomingBlock(I);
+ SelectInst *SI = dyn_cast<SelectInst>(CondLHS->getIncomingValue(I));
+
+ // Look if one of the incoming values is a select in the corresponding
+ // predecessor.
+ if (!SI || SI->getParent() != Pred || !SI->hasOneUse())
+ continue;
+
+ BranchInst *PredTerm = dyn_cast<BranchInst>(Pred->getTerminator());
+ if (!PredTerm || !PredTerm->isUnconditional())
+ continue;
+
+ // Now check if one of the select values would allow us to constant fold the
+ // terminator in BB. We don't do the transform if both sides fold, those
+ // cases will be threaded in any case.
+ LazyValueInfo::Tristate LHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
+ CondRHS, Pred, BB, CondCmp);
+ LazyValueInfo::Tristate RHSFolds =
+ LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(2),
+ CondRHS, Pred, BB, CondCmp);
+ if ((LHSFolds != LazyValueInfo::Unknown ||
+ RHSFolds != LazyValueInfo::Unknown) &&
+ LHSFolds != RHSFolds) {
unfoldSelectInstr(Pred, BB, SI, CondLHS, I);
- return true;
- }
- }
- return false;
-}
-
+ return true;
+ }
+ }
+ return false;
+}
+
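+// [Editor's note: illustrative example, not part of the original source;
+// names are hypothetical.] For the pattern documented above, suppose
+//   bb1:  %a = select i1 %flag, i32 0, i32 %x
+// and bb2 compares %p (which equals %a on the edge from bb1) against 0:
+//   %c = icmp eq i32 %p, 0
+// LVI can prove the compare is true when the select takes its first arm
+// (operand 1, the constant 0) but knows nothing about %x, so LHSFolds is a
+// known value while RHSFolds is Unknown. The select is therefore unfolded,
+// and the branch on %c can then be threaded from the new block.
+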
/// tryToUnfoldSelectInCurrBB - Look for PHI/Select or PHI/CMP/Select in the
-/// same BB in the form
-/// bb:
-/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
-/// %s = select %p, trueval, falseval
-///
-/// or
-///
-/// bb:
-/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
-/// %c = cmp %p, 0
-/// %s = select %c, trueval, falseval
-///
-/// And expand the select into a branch structure. This later enables
-/// jump-threading over bb in this pass.
-///
-/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
-/// select if the associated PHI has at least one constant. If the unfolded
-/// select is not jump-threaded, it will be folded again in the later
-/// optimizations.
+/// same BB in the form
+/// bb:
+/// %p = phi [false, %bb1], [true, %bb2], [false, %bb3], [true, %bb4], ...
+/// %s = select %p, trueval, falseval
+///
+/// or
+///
+/// bb:
+/// %p = phi [0, %bb1], [1, %bb2], [0, %bb3], [1, %bb4], ...
+/// %c = cmp %p, 0
+/// %s = select %c, trueval, falseval
+///
+/// And expand the select into a branch structure. This later enables
+/// jump-threading over bb in this pass.
+///
+/// Using the similar approach of SimplifyCFG::FoldCondBranchOnPHI(), unfold
+/// select if the associated PHI has at least one constant. If the unfolded
+/// select is not jump-threaded, it will be folded again in the later
+/// optimizations.
bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) {
// This transform would reduce the quality of msan diagnostics.
- // Disable this transform under MemorySanitizer.
- if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- // If threading this would thread across a loop header, don't thread the edge.
+ // Disable this transform under MemorySanitizer.
+ if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ // If threading this would thread across a loop header, don't thread the edge.
// See the comments above findLoopHeaders for justifications and caveats.
- if (LoopHeaders.count(BB))
- return false;
-
- for (BasicBlock::iterator BI = BB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
- // Look for a Phi having at least one constant incoming value.
- if (llvm::all_of(PN->incoming_values(),
- [](Value *V) { return !isa<ConstantInt>(V); }))
- continue;
-
- auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
- // Check if SI is in BB and use V as condition.
- if (SI->getParent() != BB)
- return false;
- Value *Cond = SI->getCondition();
- return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
- };
-
- SelectInst *SI = nullptr;
- for (Use &U : PN->uses()) {
- if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
- // Look for a ICmp in BB that compares PN with a constant and is the
- // condition of a Select.
- if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
- isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
- if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
- if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
- SI = SelectI;
- break;
- }
- } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
- // Look for a Select in BB that uses PN as condition.
- if (isUnfoldCandidate(SelectI, U.get())) {
- SI = SelectI;
- break;
- }
- }
- }
-
- if (!SI)
- continue;
- // Expand the select.
+ if (LoopHeaders.count(BB))
+ return false;
+
+ for (BasicBlock::iterator BI = BB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI) {
+ // Look for a Phi having at least one constant incoming value.
+ if (llvm::all_of(PN->incoming_values(),
+ [](Value *V) { return !isa<ConstantInt>(V); }))
+ continue;
+
+ auto isUnfoldCandidate = [BB](SelectInst *SI, Value *V) {
+      // Check that SI is in BB and uses V as its condition.
+ if (SI->getParent() != BB)
+ return false;
+ Value *Cond = SI->getCondition();
+ return (Cond && Cond == V && Cond->getType()->isIntegerTy(1));
+ };
+
+ SelectInst *SI = nullptr;
+ for (Use &U : PN->uses()) {
+ if (ICmpInst *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+        // Look for an ICmp in BB that compares PN with a constant and is the
+ // condition of a Select.
+ if (Cmp->getParent() == BB && Cmp->hasOneUse() &&
+ isa<ConstantInt>(Cmp->getOperand(1 - U.getOperandNo())))
+ if (SelectInst *SelectI = dyn_cast<SelectInst>(Cmp->user_back()))
+ if (isUnfoldCandidate(SelectI, Cmp->use_begin()->get())) {
+ SI = SelectI;
+ break;
+ }
+ } else if (SelectInst *SelectI = dyn_cast<SelectInst>(U.getUser())) {
+ // Look for a Select in BB that uses PN as condition.
+ if (isUnfoldCandidate(SelectI, U.get())) {
+ SI = SelectI;
+ break;
+ }
+ }
+ }
+
+ if (!SI)
+ continue;
+ // Expand the select.
Value *Cond = SI->getCondition();
if (InsertFreezeWhenUnfoldingSelect &&
!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI,
&DTU->getDomTree()))
Cond = new FreezeInst(Cond, "cond.fr", SI);
Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false);
- BasicBlock *SplitBB = SI->getParent();
- BasicBlock *NewBB = Term->getParent();
- PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
- NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
- NewPN->addIncoming(SI->getFalseValue(), BB);
- SI->replaceAllUsesWith(NewPN);
- SI->eraseFromParent();
- // NewBB and SplitBB are newly created blocks which require insertion.
- std::vector<DominatorTree::UpdateType> Updates;
- Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
- Updates.push_back({DominatorTree::Insert, BB, SplitBB});
- Updates.push_back({DominatorTree::Insert, BB, NewBB});
- Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
- // BB's successors were moved to SplitBB, update DTU accordingly.
- for (auto *Succ : successors(SplitBB)) {
- Updates.push_back({DominatorTree::Delete, BB, Succ});
- Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
- }
- DTU->applyUpdatesPermissive(Updates);
- return true;
- }
- return false;
-}
-
-/// Try to propagate a guard from the current BB into one of its predecessors
-/// in case if another branch of execution implies that the condition of this
-/// guard is always true. Currently we only process the simplest case that
-/// looks like:
-///
-/// Start:
-/// %cond = ...
-/// br i1 %cond, label %T1, label %F1
-/// T1:
-/// br label %Merge
-/// F1:
-/// br label %Merge
-/// Merge:
-/// %condGuard = ...
-/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
-///
-/// And cond either implies condGuard or !condGuard. In this case all the
-/// instructions before the guard can be duplicated in both branches, and the
-/// guard is then threaded to one of them.
+ BasicBlock *SplitBB = SI->getParent();
+ BasicBlock *NewBB = Term->getParent();
+ PHINode *NewPN = PHINode::Create(SI->getType(), 2, "", SI);
+ NewPN->addIncoming(SI->getTrueValue(), Term->getParent());
+ NewPN->addIncoming(SI->getFalseValue(), BB);
+ SI->replaceAllUsesWith(NewPN);
+ SI->eraseFromParent();
+ // NewBB and SplitBB are newly created blocks which require insertion.
+ std::vector<DominatorTree::UpdateType> Updates;
+ Updates.reserve((2 * SplitBB->getTerminator()->getNumSuccessors()) + 3);
+ Updates.push_back({DominatorTree::Insert, BB, SplitBB});
+ Updates.push_back({DominatorTree::Insert, BB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, SplitBB});
+ // BB's successors were moved to SplitBB, update DTU accordingly.
+ for (auto *Succ : successors(SplitBB)) {
+ Updates.push_back({DominatorTree::Delete, BB, Succ});
+ Updates.push_back({DominatorTree::Insert, SplitBB, Succ});
+ }
+ DTU->applyUpdatesPermissive(Updates);
+ return true;
+ }
+ return false;
+}
+
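+// [Editor's note: illustrative IR sketch of the transform above, not part of
+// the original source; block and value names are hypothetical.] For
+//
+//   bb:
+//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
+//     %s = select i1 %p, i32 %tv, i32 %fv
+//
+// SplitBlockAndInsertIfThen() yields roughly
+//
+//   bb:
+//     %p = phi i1 [ false, %bb1 ], [ true, %bb2 ]
+//     br i1 %p, label %then, label %tail     ; %p may be frozen first
+//   then:
+//     br label %tail
+//   tail:
+//     %s = phi i32 [ %tv, %then ], [ %fv, %bb ]
+//
+// after which the branch on %p in bb can be jump-threaded from %bb1/%bb2.
+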
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// in case another branch of execution implies that the condition of this
+/// guard is always true. Currently we only process the simplest case that
+/// looks like:
+///
+/// Start:
+/// %cond = ...
+/// br i1 %cond, label %T1, label %F1
+/// T1:
+/// br label %Merge
+/// F1:
+/// br label %Merge
+/// Merge:
+/// %condGuard = ...
+/// call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// And cond either implies condGuard or !condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
bool JumpThreadingPass::processGuards(BasicBlock *BB) {
- using namespace PatternMatch;
-
- // We only want to deal with two predecessors.
- BasicBlock *Pred1, *Pred2;
- auto PI = pred_begin(BB), PE = pred_end(BB);
- if (PI == PE)
- return false;
- Pred1 = *PI++;
- if (PI == PE)
- return false;
- Pred2 = *PI++;
- if (PI != PE)
- return false;
- if (Pred1 == Pred2)
- return false;
-
- // Try to thread one of the guards of the block.
- // TODO: Look up deeper than to immediate predecessor?
- auto *Parent = Pred1->getSinglePredecessor();
- if (!Parent || Parent != Pred2->getSinglePredecessor())
- return false;
-
- if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
- for (auto &I : *BB)
+ using namespace PatternMatch;
+
+ // We only want to deal with two predecessors.
+ BasicBlock *Pred1, *Pred2;
+ auto PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE)
+ return false;
+ Pred1 = *PI++;
+ if (PI == PE)
+ return false;
+ Pred2 = *PI++;
+ if (PI != PE)
+ return false;
+ if (Pred1 == Pred2)
+ return false;
+
+ // Try to thread one of the guards of the block.
+ // TODO: Look up deeper than to immediate predecessor?
+ auto *Parent = Pred1->getSinglePredecessor();
+ if (!Parent || Parent != Pred2->getSinglePredecessor())
+ return false;
+
+ if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+ for (auto &I : *BB)
if (isGuard(&I) && threadGuard(BB, cast<IntrinsicInst>(&I), BI))
- return true;
-
- return false;
-}
-
-/// Try to propagate the guard from BB which is the lower block of a diamond
-/// to one of its branches, in case if diamond's condition implies guard's
-/// condition.
+ return true;
+
+ return false;
+}
+
+/// Try to propagate the guard from BB which is the lower block of a diamond
+/// to one of its branches, in case the diamond's condition implies the
+/// guard's condition.
bool JumpThreadingPass::threadGuard(BasicBlock *BB, IntrinsicInst *Guard,
- BranchInst *BI) {
- assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
- assert(BI->isConditional() && "Unconditional branch has 2 successors?");
- Value *GuardCond = Guard->getArgOperand(0);
- Value *BranchCond = BI->getCondition();
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
-
- auto &DL = BB->getModule()->getDataLayout();
- bool TrueDestIsSafe = false;
- bool FalseDestIsSafe = false;
-
- // True dest is safe if BranchCond => GuardCond.
- auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
- if (Impl && *Impl)
- TrueDestIsSafe = true;
- else {
- // False dest is safe if !BranchCond => GuardCond.
- Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
- if (Impl && *Impl)
- FalseDestIsSafe = true;
- }
-
- if (!TrueDestIsSafe && !FalseDestIsSafe)
- return false;
-
- BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
- BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
-
- ValueToValueMapTy UnguardedMapping, GuardedMapping;
- Instruction *AfterGuard = Guard->getNextNode();
- unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
- if (Cost > BBDupThreshold)
- return false;
- // Duplicate all instructions before the guard and the guard itself to the
- // branch where implication is not proved.
- BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
- BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
- assert(GuardedBlock && "Could not create the guarded block?");
- // Duplicate all instructions before the guard in the unguarded branch.
- // Since we have successfully duplicated the guarded block and this block
- // has fewer instructions, we expect it to succeed.
- BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
- BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
- assert(UnguardedBlock && "Could not create the unguarded block?");
- LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
- << GuardedBlock->getName() << "\n");
- // Some instructions before the guard may still have uses. For them, we need
- // to create Phi nodes merging their copies in both guarded and unguarded
- // branches. Those instructions that have no uses can be just removed.
- SmallVector<Instruction *, 4> ToRemove;
- for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
- if (!isa<PHINode>(&*BI))
- ToRemove.push_back(&*BI);
-
- Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
- assert(InsertionPoint && "Empty block?");
- // Substitute with Phis & remove.
- for (auto *Inst : reverse(ToRemove)) {
- if (!Inst->use_empty()) {
- PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
- NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
- NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
- NewPN->insertBefore(InsertionPoint);
- Inst->replaceAllUsesWith(NewPN);
- }
- Inst->eraseFromParent();
- }
- return true;
-}
+ BranchInst *BI) {
+ assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+ assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+ Value *GuardCond = Guard->getArgOperand(0);
+ Value *BranchCond = BI->getCondition();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+
+ auto &DL = BB->getModule()->getDataLayout();
+ bool TrueDestIsSafe = false;
+ bool FalseDestIsSafe = false;
+
+ // True dest is safe if BranchCond => GuardCond.
+ auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+ if (Impl && *Impl)
+ TrueDestIsSafe = true;
+ else {
+ // False dest is safe if !BranchCond => GuardCond.
+ Impl = isImpliedCondition(BranchCond, GuardCond, DL, /* LHSIsTrue */ false);
+ if (Impl && *Impl)
+ FalseDestIsSafe = true;
+ }
+
+ if (!TrueDestIsSafe && !FalseDestIsSafe)
+ return false;
+
+ BasicBlock *PredUnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+ BasicBlock *PredGuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+ ValueToValueMapTy UnguardedMapping, GuardedMapping;
+ Instruction *AfterGuard = Guard->getNextNode();
+ unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+ if (Cost > BBDupThreshold)
+ return false;
+ // Duplicate all instructions before the guard and the guard itself to the
+ // branch where implication is not proved.
+ BasicBlock *GuardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredGuardedBlock, AfterGuard, GuardedMapping, *DTU);
+ assert(GuardedBlock && "Could not create the guarded block?");
+ // Duplicate all instructions before the guard in the unguarded branch.
+ // Since we have successfully duplicated the guarded block and this block
+ // has fewer instructions, we expect it to succeed.
+ BasicBlock *UnguardedBlock = DuplicateInstructionsInSplitBetween(
+ BB, PredUnguardedBlock, Guard, UnguardedMapping, *DTU);
+ assert(UnguardedBlock && "Could not create the unguarded block?");
+ LLVM_DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+ << GuardedBlock->getName() << "\n");
+ // Some instructions before the guard may still have uses. For them, we need
+ // to create Phi nodes merging their copies in both guarded and unguarded
+ // branches. Those instructions that have no uses can be just removed.
+ SmallVector<Instruction *, 4> ToRemove;
+ for (auto BI = BB->begin(); &*BI != AfterGuard; ++BI)
+ if (!isa<PHINode>(&*BI))
+ ToRemove.push_back(&*BI);
+
+ Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+ assert(InsertionPoint && "Empty block?");
+ // Substitute with Phis & remove.
+ for (auto *Inst : reverse(ToRemove)) {
+ if (!Inst->use_empty()) {
+ PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+ NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+ NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+ NewPN->insertBefore(InsertionPoint);
+ Inst->replaceAllUsesWith(NewPN);
+ }
+ Inst->eraseFromParent();
+ }
+ return true;
+}
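+// [Editor's note: illustrative sketch of the guard-threading result, not part
+// of the original source.] For the Start/T1/F1/Merge example documented above
+// processGuards(): if %cond implies %condGuard, threadGuard() splits the two
+// incoming edges of Merge, copies the instructions that precede the guard into
+// both new blocks, keeps the guard only on the path coming from F1 (where the
+// implication is not proved), drops it on the path from T1, and rewrites the
+// remaining uses in Merge through phi nodes that merge the two copies.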
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
index 5276b77f8c..d2b4ba296f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LICM.cpp
@@ -1,17 +1,17 @@
-//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs loop invariant code motion, attempting to remove as much
-// code from the body of a loop as possible. It does this by either hoisting
-// code into the preheader block, or by sinking code to the exit blocks if it is
-// safe. This pass also promotes must-aliased memory locations in the loop to
-// live in registers, thus hoisting and sinking "invariant" loads and stores.
-//
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible. It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe. This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
// Hoisting operations out of loops is a canonicalization transform. It
// enables and simplifies subsequent optimizations in the middle-end.
// Rematerialization of hoisted instructions to reduce register pressure is the
@@ -19,223 +19,223 @@
// register pressure and also handles other optimizations than LICM that
// increase live-ranges.
//
-// This pass uses alias analysis for two purposes:
-//
-// 1. Moving loop invariant loads and calls out of loops. If we can determine
-// that a load or call inside of a loop never aliases anything stored to,
-// we can hoist it or sink it like any other instruction.
-// 2. Scalar Promotion of Memory - If there is a store instruction inside of
-// the loop, we try to move the store to happen AFTER the loop instead of
-// inside of the loop. This can only happen if a few conditions are true:
-// A. The pointer stored through is loop invariant
-// B. There are no stores or loads in the loop which _may_ alias the
-// pointer. There are no calls in the loop which mod/ref the pointer.
-// If these conditions are true, we can promote the loads and stores in the
-// loop of the pointer to use a temporary alloca'd variable. We then use
-// the SSAUpdater to construct the appropriate SSA form for the value.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LICM.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
+// This pass uses alias analysis for two purposes:
+//
+// 1. Moving loop invariant loads and calls out of loops. If we can determine
+// that a load or call inside of a loop never aliases anything stored to,
+// we can hoist it or sink it like any other instruction.
+// 2. Scalar Promotion of Memory - If there is a store instruction inside of
+// the loop, we try to move the store to happen AFTER the loop instead of
+// inside of the loop. This can only happen if a few conditions are true:
+// A. The pointer stored through is loop invariant
+// B. There are no stores or loads in the loop which _may_ alias the
+// pointer. There are no calls in the loop which mod/ref the pointer.
+// If these conditions are true, we can promote the loads and stores in the
+// loop of the pointer to use a temporary alloca'd variable. We then use
+// the SSAUpdater to construct the appropriate SSA form for the value.
+//
+//===----------------------------------------------------------------------===//
+
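+// [Editor's note: illustrative example of the scalar promotion described
+// above, not part of the original source; names are hypothetical.] Given a
+// loop-invariant pointer %p that nothing else in the loop may alias:
+//
+//   loop:
+//     %v   = load i32, i32* %p
+//     %inc = add i32 %v, 1
+//     store i32 %inc, i32* %p
+//     br i1 %cond, label %loop, label %exit
+//
+// LICM promotes the location to a register: the load becomes a value carried
+// around the loop by a phi, and a single store of the final value is placed in
+// %exit, with SSAUpdater building the required phi nodes.
+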
+#include "llvm/Transforms/Scalar/LICM.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <utility>
-using namespace llvm;
-
-#define DEBUG_TYPE "licm"
-
-STATISTIC(NumCreatedBlocks, "Number of blocks created");
-STATISTIC(NumClonedBranches, "Number of branches cloned");
-STATISTIC(NumSunk, "Number of instructions sunk out of loop");
-STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
-STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
-STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
-STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
-
-/// Memory promotion is enabled by default.
-static cl::opt<bool>
- DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
- cl::desc("Disable memory promotion in LICM pass"));
-
-static cl::opt<bool> ControlFlowHoisting(
- "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
- cl::desc("Enable control flow (and PHI) hoisting in LICM"));
-
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "licm"
+
+STATISTIC(NumCreatedBlocks, "Number of blocks created");
+STATISTIC(NumClonedBranches, "Number of branches cloned");
+STATISTIC(NumSunk, "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted, "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
+
+/// Memory promotion is enabled by default.
+static cl::opt<bool>
+ DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable memory promotion in LICM pass"));
+
+static cl::opt<bool> ControlFlowHoisting(
+ "licm-control-flow-hoisting", cl::Hidden, cl::init(false),
+ cl::desc("Enable control flow (and PHI) hoisting in LICM"));
+
static cl::opt<unsigned> HoistSinkColdnessThreshold(
"licm-coldness-threshold", cl::Hidden, cl::init(4),
cl::desc("Relative coldness Threshold of hoisting/sinking destination "
"block for LICM to be considered beneficial"));
-static cl::opt<uint32_t> MaxNumUsesTraversed(
- "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
- cl::desc("Max num uses visited for identifying load "
- "invariance in loop using invariant start (default = 8)"));
-
-// Default value of zero implies we use the regular alias set tracker mechanism
-// instead of the cross product using AA to identify aliasing of the memory
-// location we are interested in.
-static cl::opt<int>
-LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0),
- cl::desc("How many instruction to cross product using AA"));
-
-// Experimental option to allow imprecision in LICM in pathological cases, in
-// exchange for faster compile. This is to be removed if MemorySSA starts to
-// address the same issue. This flag applies only when LICM uses MemorySSA
-// instead on AliasSetTracker. LICM calls MemorySSAWalker's
-// getClobberingMemoryAccess, up to the value of the Cap, getting perfect
-// accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess,
-// which may not be precise, since optimizeUses is capped. The result is
-// correct, but we may not get as "far up" as possible to get which access is
-// clobbering the one queried.
-cl::opt<unsigned> llvm::SetLicmMssaOptCap(
- "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
- cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
- "for faster compile. Caps the MemorySSA clobbering calls."));
-
-// Experimentally, memory promotion carries less importance than sinking and
-// hoisting. Limit when we do promotion when using MemorySSA, in order to save
-// compile time.
-cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
- "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
- cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
- "effect. When MSSA in LICM is enabled, then this is the maximum "
- "number of accesses allowed to be present in a loop in order to "
- "enable memory promotion."));
-
-static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
-static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop);
-static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE);
-static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+ "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+ cl::desc("Max num uses visited for identifying load "
+ "invariance in loop using invariant start (default = 8)"));
+
+// Default value of zero implies we use the regular alias set tracker mechanism
+// instead of the cross product using AA to identify aliasing of the memory
+// location we are interested in.
+static cl::opt<int>
+LICMN2Theshold("licm-n2-threshold", cl::Hidden, cl::init(0),
+ cl::desc("How many instruction to cross product using AA"));
+
+// Experimental option to allow imprecision in LICM in pathological cases, in
+// exchange for a faster compile. This is to be removed if MemorySSA starts to
+// address the same issue. This flag applies only when LICM uses MemorySSA
+// instead of AliasSetTracker. LICM calls MemorySSAWalker's
+// getClobberingMemoryAccess, up to the value of the Cap, getting perfect
+// accuracy. Afterwards, LICM will call into MemorySSA's getDefiningAccess,
+// which may not be precise, since optimizeUses is capped. The result is
+// correct, but we may not get as "far up" as possible to get which access is
+// clobbering the one queried.
+cl::opt<unsigned> llvm::SetLicmMssaOptCap(
+ "licm-mssa-optimization-cap", cl::init(100), cl::Hidden,
+ cl::desc("Enable imprecision in LICM in pathological cases, in exchange "
+ "for faster compile. Caps the MemorySSA clobbering calls."));
+
+// Experimentally, memory promotion carries less importance than sinking and
+// hoisting. When using MemorySSA, limit the cases in which we do promotion, in
+// order to save compile time.
+cl::opt<unsigned> llvm::SetLicmMssaNoAccForPromotionCap(
+ "licm-mssa-max-acc-promotion", cl::init(250), cl::Hidden,
+ cl::desc("[LICM & MemorySSA] When MSSA in LICM is disabled, this has no "
+ "effect. When MSSA in LICM is enabled, then this is the maximum "
+ "number of accesses allowed to be present in a loop in order to "
+ "enable memory promotion."));
+
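+// For illustration only (an assumed invocation, not mandated by this file),
+// the caps above can be tightened from the opt command line when compile time
+// matters more than precision:
+//
+//   opt -licm -licm-mssa-optimization-cap=50 \
+//       -licm-mssa-max-acc-promotion=100 input.ll -S -o out.ll
+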
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop);
+static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE);
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, const Loop *CurLoop,
ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
OptimizationRemarkEmitter *ORE);
-static bool isSafeToExecuteUnconditionally(Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE,
- const Instruction *CtxI = nullptr);
-static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
- AliasSetTracker *CurAST, Loop *CurLoop,
- AAResults *AA);
-static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI = nullptr);
+static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
+ AliasSetTracker *CurAST, Loop *CurLoop,
+ AAResults *AA);
+static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
Loop *CurLoop, Instruction &I,
- SinkAndHoistLICMFlags &Flags);
+ SinkAndHoistLICMFlags &Flags);
static bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
MemoryUse &MU);
-static Instruction *cloneInstructionInExitBlock(
- Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
- const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
-
-static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU);
-
-static void moveInstructionBefore(Instruction &I, Instruction &Dest,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE);
-
-namespace {
-struct LoopInvariantCodeMotion {
- bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
+static Instruction *cloneInstructionInExitBlock(
+ Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
+
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+ AliasSetTracker *AST, MemorySSAUpdater *MSSAU);
+
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE);
+
+namespace {
+struct LoopInvariantCodeMotion {
+ bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI,
TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA,
- OptimizationRemarkEmitter *ORE);
-
- LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap)
- : LicmMssaOptCap(LicmMssaOptCap),
- LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
-
-private:
- unsigned LicmMssaOptCap;
- unsigned LicmMssaNoAccForPromotionCap;
-
- std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
- std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoopWithMSSA(Loop *L, AAResults *AA,
- MemorySSAUpdater *MSSAU);
-};
-
-struct LegacyLICMPass : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
- LegacyLICMPass(
- unsigned LicmMssaOptCap = SetLicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap)
- : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) {
- initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
+ OptimizationRemarkEmitter *ORE);
+
+ LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap)
+ : LicmMssaOptCap(LicmMssaOptCap),
+ LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
+
+private:
+ unsigned LicmMssaOptCap;
+ unsigned LicmMssaNoAccForPromotionCap;
+
+ std::unique_ptr<AliasSetTracker>
+ collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
+ std::unique_ptr<AliasSetTracker>
+ collectAliasInfoForLoopWithMSSA(Loop *L, AAResults *AA,
+ MemorySSAUpdater *MSSAU);
+};
+
+struct LegacyLICMPass : public LoopPass {
+ static char ID; // Pass identification, replacement for typeid
+ LegacyLICMPass(
+ unsigned LicmMssaOptCap = SetLicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap)
+ : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) {
+ initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
LLVM_DEBUG(dbgs() << "Perform LICM on Loop with header at block "
<< L->getHeader()->getNameOrAsOperand() << "\n");
- auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
- : nullptr;
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? (&getAnalysis<MemorySSAWrapperPass>().getMSSA())
+ : nullptr;
bool hasProfileData = L->getHeader()->getParent()->hasProfileData();
BlockFrequencyInfo *BFI =
hasProfileData ? &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI()
: nullptr;
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
return LICM.runOnLoop(
L, &getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
@@ -245,70 +245,70 @@ struct LegacyLICMPass : public LoopPass {
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent()),
SE ? &SE->getSE() : nullptr, MSSA, &ORE);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addRequired<TargetTransformInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
AU.addPreserved<LazyBlockFrequencyInfoPass>();
AU.addPreserved<LazyBranchProbabilityInfoPass>();
- }
-
-private:
- LoopInvariantCodeMotion LICM;
-};
-} // namespace
-
-PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &) {
- // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
+ }
+
+private:
+ LoopInvariantCodeMotion LICM;
+};
+} // namespace
+
+PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI,
&AR.SE, AR.MSSA, &ORE))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
-
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
-
- return PA;
-}
-
-char LegacyLICMPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+
+ return PA;
+}
+
+char LegacyLICMPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLICMPass, "licm", "Loop Invariant Code Motion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LazyBFIPass)
-INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
- false)
-
-Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
-Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
- unsigned LicmMssaNoAccForPromotionCap) {
- return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
-}
-
+INITIALIZE_PASS_END(LegacyLICMPass, "licm", "Loop Invariant Code Motion", false,
+ false)
+
+Pass *llvm::createLICMPass() { return new LegacyLICMPass(); }
+Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
+ unsigned LicmMssaNoAccForPromotionCap) {
+ return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
+}
+
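+// A minimal sketch of scheduling this pass through the legacy pass manager,
+// assuming a Module M has already been parsed (illustrative only; the cap
+// values are examples, not recommendations):
+//
+//   legacy::PassManager PM;
+//   PM.add(createLICMPass(/*LicmMssaOptCap=*/50,
+//                         /*LicmMssaNoAccForPromotionCap=*/100));
+//   PM.run(M);
+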
llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L,
MemorySSA *MSSA)
: SinkAndHoistLICMFlags(SetLicmMssaOptCap, SetLicmMssaNoAccForPromotionCap,
@@ -338,456 +338,456 @@ llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(
}
}
-/// Hoist expressions out of the specified loop. Note, alias info for inner
-/// loop is not preserved so it is not a good idea to run LICM multiple
-/// times on one loop.
-bool LoopInvariantCodeMotion::runOnLoop(
- Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
+/// Hoist expressions out of the specified loop. Note that alias info for the
+/// inner loop is not preserved, so it is not a good idea to run LICM multiple
+/// times on one loop.
+bool LoopInvariantCodeMotion::runOnLoop(
+ Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
- bool Changed = false;
-
- assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
-
- // If this loop has metadata indicating that LICM is not to be performed then
- // just exit.
- if (hasDisableLICMTransformsHint(L)) {
- return false;
- }
-
- std::unique_ptr<AliasSetTracker> CurAST;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
+ bool Changed = false;
+
+ assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+
+ // If this loop has metadata indicating that LICM is not to be performed then
+ // just exit.
+ if (hasDisableLICMTransformsHint(L)) {
+ return false;
+ }
+
+ std::unique_ptr<AliasSetTracker> CurAST;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
std::unique_ptr<SinkAndHoistLICMFlags> Flags;
-
- if (!MSSA) {
- LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
- CurAST = collectAliasInfoForLoop(L, LI, AA);
+
+ if (!MSSA) {
+ LLVM_DEBUG(dbgs() << "LICM: Using Alias Set Tracker.\n");
+ CurAST = collectAliasInfoForLoop(L, LI, AA);
Flags = std::make_unique<SinkAndHoistLICMFlags>(
LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true);
- } else {
- LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ } else {
+ LLVM_DEBUG(dbgs() << "LICM: Using MemorySSA.\n");
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
Flags = std::make_unique<SinkAndHoistLICMFlags>(
LicmMssaOptCap, LicmMssaNoAccForPromotionCap, /*IsSink=*/true, L, MSSA);
- }
-
- // Get the preheader block to move instructions into...
- BasicBlock *Preheader = L->getLoopPreheader();
-
- // Compute loop safety information.
- ICFLoopSafetyInfo SafetyInfo;
- SafetyInfo.computeLoopSafetyInfo(L);
-
- // We want to visit all of the instructions in this loop... that are not parts
- // of our subloops (they have already had their invariants hoisted out of
- // their loop, into this loop, so there is no need to process the BODIES of
- // the subloops).
- //
- // Traverse the body of the loop in depth first order on the dominator tree so
- // that we are guaranteed to see definitions before we see uses. This allows
- // us to sink instructions in one pass, without iteration. After sinking
- // instructions, we perform another pass to hoist them out of the loop.
- if (L->hasDedicatedExits())
+ }
+
+ // Get the preheader block to move instructions into...
+ BasicBlock *Preheader = L->getLoopPreheader();
+
+ // Compute loop safety information.
+ ICFLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(L);
+
+  // We want to visit all of the instructions in this loop... that are not part
+ // of our subloops (they have already had their invariants hoisted out of
+ // their loop, into this loop, so there is no need to process the BODIES of
+ // the subloops).
+ //
+ // Traverse the body of the loop in depth first order on the dominator tree so
+ // that we are guaranteed to see definitions before we see uses. This allows
+ // us to sink instructions in one pass, without iteration. After sinking
+ // instructions, we perform another pass to hoist them out of the loop.
+ if (L->hasDedicatedExits())
Changed |=
sinkRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, TTI, L,
CurAST.get(), MSSAU.get(), &SafetyInfo, *Flags.get(), ORE);
Flags->setIsSink(false);
- if (Preheader)
+ if (Preheader)
Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L,
CurAST.get(), MSSAU.get(), SE, &SafetyInfo,
*Flags.get(), ORE);
-
- // Now that all loop invariants have been removed from the loop, promote any
- // memory references to scalars that we can.
- // Don't sink stores from loops without dedicated block exits. Exits
- // containing indirect branches are not transformed by loop simplify,
- // make sure we catch that. An additional load may be generated in the
- // preheader for SSA updater, so also avoid sinking when no preheader
- // is available.
- if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
+
+ // Now that all loop invariants have been removed from the loop, promote any
+ // memory references to scalars that we can.
+ // Don't sink stores from loops without dedicated block exits. Exits
+  // containing indirect branches are not transformed by loop simplify, so
+  // make sure we catch that. An additional load may be generated in the
+  // preheader for the SSA updater, so also avoid sinking when no preheader
+ // is available.
+ if (!DisablePromotion && Preheader && L->hasDedicatedExits() &&
!Flags->tooManyMemoryAccesses()) {
- // Figure out the loop exits and their insertion points
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- // We can't insert into a catchswitch.
- bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
- return isa<CatchSwitchInst>(Exit->getTerminator());
- });
-
- if (!HasCatchSwitch) {
- SmallVector<Instruction *, 8> InsertPts;
- SmallVector<MemoryAccess *, 8> MSSAInsertPts;
- InsertPts.reserve(ExitBlocks.size());
- if (MSSAU)
- MSSAInsertPts.reserve(ExitBlocks.size());
- for (BasicBlock *ExitBlock : ExitBlocks) {
- InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
- if (MSSAU)
- MSSAInsertPts.push_back(nullptr);
- }
-
- PredIteratorCache PIC;
-
- bool Promoted = false;
-
- // Build an AST using MSSA.
- if (!CurAST.get())
- CurAST = collectAliasInfoForLoopWithMSSA(L, AA, MSSAU.get());
-
- // Loop over all of the alias sets in the tracker object.
- for (AliasSet &AS : *CurAST) {
- // We can promote this alias set if it has a store, if it is a "Must"
- // alias set, if the pointer is loop invariant, and if we are not
- // eliminating any volatile loads or stores.
- if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
- !L->isLoopInvariant(AS.begin()->getValue()))
- continue;
-
- assert(
- !AS.empty() &&
- "Must alias set should have at least one pointer element in it!");
-
- SmallSetVector<Value *, 8> PointerMustAliases;
- for (const auto &ASI : AS)
- PointerMustAliases.insert(ASI.getValue());
-
- Promoted |= promoteLoopAccessesToScalars(
- PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
- DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
- }
-
- // Once we have promoted values across the loop body we have to
- // recursively reform LCSSA as any nested loop may now have values defined
- // within the loop used in the outer loop.
- // FIXME: This is really heavy handed. It would be a bit better to use an
- // SSAUpdater strategy during promotion that was LCSSA aware and reformed
- // it as it went.
- if (Promoted)
- formLCSSARecursively(*L, *DT, LI, SE);
-
- Changed |= Promoted;
- }
- }
-
- // Check that neither this loop nor its parent have had LCSSA broken. LICM is
- // specifically moving instructions across the loop boundary and so it is
- // especially in need of sanity checking here.
- assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
+ // Figure out the loop exits and their insertion points
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // We can't insert into a catchswitch.
+ bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) {
+ return isa<CatchSwitchInst>(Exit->getTerminator());
+ });
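+    // (A catchswitch must be the only non-PHI instruction in its block, so
+    // such an exit block offers no insertion point for the stores that
+    // promotion would need to place there.)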
+
+ if (!HasCatchSwitch) {
+ SmallVector<Instruction *, 8> InsertPts;
+ SmallVector<MemoryAccess *, 8> MSSAInsertPts;
+ InsertPts.reserve(ExitBlocks.size());
+ if (MSSAU)
+ MSSAInsertPts.reserve(ExitBlocks.size());
+ for (BasicBlock *ExitBlock : ExitBlocks) {
+ InsertPts.push_back(&*ExitBlock->getFirstInsertionPt());
+ if (MSSAU)
+ MSSAInsertPts.push_back(nullptr);
+ }
+
+ PredIteratorCache PIC;
+
+ bool Promoted = false;
+
+ // Build an AST using MSSA.
+ if (!CurAST.get())
+ CurAST = collectAliasInfoForLoopWithMSSA(L, AA, MSSAU.get());
+
+ // Loop over all of the alias sets in the tracker object.
+ for (AliasSet &AS : *CurAST) {
+ // We can promote this alias set if it has a store, if it is a "Must"
+ // alias set, if the pointer is loop invariant, and if we are not
+ // eliminating any volatile loads or stores.
+ if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+ !L->isLoopInvariant(AS.begin()->getValue()))
+ continue;
+
+ assert(
+ !AS.empty() &&
+ "Must alias set should have at least one pointer element in it!");
+
+ SmallSetVector<Value *, 8> PointerMustAliases;
+ for (const auto &ASI : AS)
+ PointerMustAliases.insert(ASI.getValue());
+
+ Promoted |= promoteLoopAccessesToScalars(
+ PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI,
+ DT, TLI, L, CurAST.get(), MSSAU.get(), &SafetyInfo, ORE);
+ }
+
+      // Once we have promoted values across the loop body we have to
+      // recursively reform LCSSA, as any nested loop may now have values defined
+      // within the loop that are used in the outer loop.
+      // FIXME: This is really heavy-handed. It would be a bit better to use an
+      // SSAUpdater strategy during promotion that was LCSSA-aware and reformed
+ // it as it went.
+ if (Promoted)
+ formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= Promoted;
+ }
+ }
+
+  // Check that neither this loop nor its parent has had LCSSA broken. LICM is
+ // specifically moving instructions across the loop boundary and so it is
+ // especially in need of sanity checking here.
+ assert(L->isLCSSAForm(*DT) && "Loop not left in LCSSA form after LICM!");
assert((L->isOutermost() || L->getParentLoop()->isLCSSAForm(*DT)) &&
- "Parent loop not left in LCSSA form after LICM!");
-
- if (MSSAU.get() && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- if (Changed && SE)
- SE->forgetLoopDispositions(L);
- return Changed;
-}
-
-/// Walk the specified region of the CFG (defined by all blocks dominated by
-/// the specified block, and that are in the current loop) in reverse depth
-/// first order w.r.t the DominatorTree. This allows us to visit uses before
-/// definitions, allowing us to sink a loop body in one pass without iteration.
-///
-bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
+ "Parent loop not left in LCSSA form after LICM!");
+
+ if (MSSAU.get() && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(L);
+ return Changed;
+}
+
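+// As a C-level sketch of the overall effect of runOnLoop (illustrative only;
+// it assumes *p is not otherwise aliased or clobbered inside the loop):
+//
+//   // before
+//   for (int i = 0; i < n; ++i)
+//     *p += a * b;
+//
+//   // after: "a * b" hoisted to the preheader, *p promoted to a register
+//   int inv = a * b;
+//   int tmp = *p;
+//   for (int i = 0; i < n; ++i)
+//     tmp += inv;
+//   *p = tmp;
+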
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in reverse depth
+/// first order w.r.t. the DominatorTree. This allows us to visit uses before
+/// definitions, allowing us to sink a loop body in one pass without iteration.
+///
+bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
Loop *CurLoop, AliasSetTracker *CurAST,
MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo,
- SinkAndHoistLICMFlags &Flags,
- OptimizationRemarkEmitter *ORE) {
-
- // Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
- "Unexpected input to sinkRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
-
- // We want to visit children before parents. We will enque all the parents
- // before their children in the worklist and process the worklist in reverse
- // order.
- SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
-
- bool Changed = false;
- for (DomTreeNode *DTN : reverse(Worklist)) {
- BasicBlock *BB = DTN->getBlock();
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
- if (inSubLoop(BB, CurLoop, LI))
- continue;
-
- for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
- Instruction &I = *--II;
-
- // If the instruction is dead, we would try to sink it because it isn't
- // used in the loop, instead, just delete it.
- if (isInstructionTriviallyDead(&I, TLI)) {
- LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
- salvageKnowledge(&I);
- salvageDebugInfo(I);
- ++II;
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- Changed = true;
- continue;
- }
-
- // Check to see if we can sink this instruction to the exit blocks
- // of the loop. We can do this if the all users of the instruction are
- // outside of the loop. In this case, it doesn't even matter if the
- // operands of the instruction are loop invariant.
- //
- bool FreeInLoop = false;
- if (!I.mayHaveSideEffects() &&
- isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE)) {
+ SinkAndHoistLICMFlags &Flags,
+ OptimizationRemarkEmitter *ORE) {
+
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to sinkRegion.");
+ assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
+ "Either AliasSetTracker or MemorySSA should be initialized.");
+
+  // We want to visit children before parents. We will enqueue all the parents
+ // before their children in the worklist and process the worklist in reverse
+ // order.
+ SmallVector<DomTreeNode *, 16> Worklist = collectChildrenInLoop(N, CurLoop);
+
+ bool Changed = false;
+ for (DomTreeNode *DTN : reverse(Worklist)) {
+ BasicBlock *BB = DTN->getBlock();
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
+
+ for (BasicBlock::iterator II = BB->end(); II != BB->begin();) {
+ Instruction &I = *--II;
+
+      // If the instruction is dead, we would try to sink it because it isn't
+      // used in the loop; instead, just delete it.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageKnowledge(&I);
+ salvageDebugInfo(I);
+ ++II;
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ Changed = true;
+ continue;
+ }
+
+ // Check to see if we can sink this instruction to the exit blocks
+      // of the loop. We can do this if all the users of the instruction are
+ // outside of the loop. In this case, it doesn't even matter if the
+ // operands of the instruction are loop invariant.
+ //
+ bool FreeInLoop = false;
+ if (!I.mayHaveSideEffects() &&
+ isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
+ ORE)) {
if (sink(I, LI, DT, BFI, CurLoop, SafetyInfo, MSSAU, ORE)) {
- if (!FreeInLoop) {
- ++II;
- salvageDebugInfo(I);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- }
- Changed = true;
- }
- }
- }
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- return Changed;
-}
-
-namespace {
-// This is a helper class for hoistRegion to make it able to hoist control flow
-// in order to be able to hoist phis. The way this works is that we initially
-// start hoisting to the loop preheader, and when we see a loop invariant branch
-// we make note of this. When we then come to hoist an instruction that's
-// conditional on such a branch we duplicate the branch and the relevant control
-// flow, then hoist the instruction into the block corresponding to its original
-// block in the duplicated control flow.
-class ControlFlowHoister {
-private:
- // Information about the loop we are hoisting from
- LoopInfo *LI;
- DominatorTree *DT;
- Loop *CurLoop;
- MemorySSAUpdater *MSSAU;
-
- // A map of blocks in the loop to the block their instructions will be hoisted
- // to.
- DenseMap<BasicBlock *, BasicBlock *> HoistDestinationMap;
-
- // The branches that we can hoist, mapped to the block that marks a
- // convergence point of their control flow.
- DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
-
-public:
- ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop,
- MemorySSAUpdater *MSSAU)
- : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {}
-
- void registerPossiblyHoistableBranch(BranchInst *BI) {
- // We can only hoist conditional branches with loop invariant operands.
- if (!ControlFlowHoisting || !BI->isConditional() ||
- !CurLoop->hasLoopInvariantOperands(BI))
- return;
-
- // The branch destinations need to be in the loop, and we don't gain
- // anything by duplicating conditional branches with duplicate successors,
- // as it's essentially the same as an unconditional branch.
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
- if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) ||
- TrueDest == FalseDest)
- return;
-
- // We can hoist BI if one branch destination is the successor of the other,
- // or both have common successor which we check by seeing if the
- // intersection of their successors is non-empty.
- // TODO: This could be expanded to allowing branches where both ends
- // eventually converge to a single block.
- SmallPtrSet<BasicBlock *, 4> TrueDestSucc, FalseDestSucc;
- TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest));
- FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest));
- BasicBlock *CommonSucc = nullptr;
- if (TrueDestSucc.count(FalseDest)) {
- CommonSucc = FalseDest;
- } else if (FalseDestSucc.count(TrueDest)) {
- CommonSucc = TrueDest;
- } else {
- set_intersect(TrueDestSucc, FalseDestSucc);
- // If there's one common successor use that.
- if (TrueDestSucc.size() == 1)
- CommonSucc = *TrueDestSucc.begin();
- // If there's more than one pick whichever appears first in the block list
- // (we can't use the value returned by TrueDestSucc.begin() as it's
- // unpredicatable which element gets returned).
- else if (!TrueDestSucc.empty()) {
- Function *F = TrueDest->getParent();
- auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
+ if (!FreeInLoop) {
+ ++II;
+ salvageDebugInfo(I);
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ }
+ Changed = true;
+ }
+ }
+ }
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ return Changed;
+}
+
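+// A typical candidate that sinkRegion moves is an instruction whose only use
+// is outside the loop. Sketch (names are illustrative):
+//
+//   for (...) { q = x / y; ... }   // before: q unused inside the loop
+//   use(q);
+//
+// becomes
+//
+//   for (...) { ... }
+//   q = x / y;                     // after: computed once, in the exit block
+//   use(q);
+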
+namespace {
+// This is a helper class that enables hoistRegion to hoist control flow, in
+// order to be able to hoist phis. The way this works is that we initially
+// start hoisting to the loop preheader, and when we see a loop-invariant
+// branch we make note of it. When we then come to hoist an instruction that is
+// conditional on such a branch, we duplicate the branch and the relevant
+// control flow, then hoist the instruction into the block corresponding to its
+// original block in the duplicated control flow.
+class ControlFlowHoister {
+private:
+ // Information about the loop we are hoisting from
+ LoopInfo *LI;
+ DominatorTree *DT;
+ Loop *CurLoop;
+ MemorySSAUpdater *MSSAU;
+
+ // A map of blocks in the loop to the block their instructions will be hoisted
+ // to.
+ DenseMap<BasicBlock *, BasicBlock *> HoistDestinationMap;
+
+ // The branches that we can hoist, mapped to the block that marks a
+ // convergence point of their control flow.
+ DenseMap<BranchInst *, BasicBlock *> HoistableBranches;
+
+public:
+ ControlFlowHoister(LoopInfo *LI, DominatorTree *DT, Loop *CurLoop,
+ MemorySSAUpdater *MSSAU)
+ : LI(LI), DT(DT), CurLoop(CurLoop), MSSAU(MSSAU) {}
+
+ void registerPossiblyHoistableBranch(BranchInst *BI) {
+ // We can only hoist conditional branches with loop invariant operands.
+ if (!ControlFlowHoisting || !BI->isConditional() ||
+ !CurLoop->hasLoopInvariantOperands(BI))
+ return;
+
+ // The branch destinations need to be in the loop, and we don't gain
+ // anything by duplicating conditional branches with duplicate successors,
+ // as it's essentially the same as an unconditional branch.
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ if (!CurLoop->contains(TrueDest) || !CurLoop->contains(FalseDest) ||
+ TrueDest == FalseDest)
+ return;
+
+ // We can hoist BI if one branch destination is the successor of the other,
+    // or both have a common successor, which we check by seeing if the
+ // intersection of their successors is non-empty.
+ // TODO: This could be expanded to allowing branches where both ends
+ // eventually converge to a single block.
+ SmallPtrSet<BasicBlock *, 4> TrueDestSucc, FalseDestSucc;
+ TrueDestSucc.insert(succ_begin(TrueDest), succ_end(TrueDest));
+ FalseDestSucc.insert(succ_begin(FalseDest), succ_end(FalseDest));
+ BasicBlock *CommonSucc = nullptr;
+ if (TrueDestSucc.count(FalseDest)) {
+ CommonSucc = FalseDest;
+ } else if (FalseDestSucc.count(TrueDest)) {
+ CommonSucc = TrueDest;
+ } else {
+ set_intersect(TrueDestSucc, FalseDestSucc);
+      // If there's one common successor, use that.
+ if (TrueDestSucc.size() == 1)
+ CommonSucc = *TrueDestSucc.begin();
+      // If there's more than one, pick whichever appears first in the block list
+      // (we can't use the value returned by TrueDestSucc.begin() as it's
+      // unpredictable which element gets returned).
+ else if (!TrueDestSucc.empty()) {
+ Function *F = TrueDest->getParent();
+ auto IsSucc = [&](BasicBlock &BB) { return TrueDestSucc.count(&BB); };
auto It = llvm::find_if(*F, IsSucc);
- assert(It != F->end() && "Could not find successor in function");
- CommonSucc = &*It;
- }
- }
- // The common successor has to be dominated by the branch, as otherwise
- // there will be some other path to the successor that will not be
- // controlled by this branch so any phi we hoist would be controlled by the
- // wrong condition. This also takes care of avoiding hoisting of loop back
- // edges.
- // TODO: In some cases this could be relaxed if the successor is dominated
- // by another block that's been hoisted and we can guarantee that the
- // control flow has been replicated exactly.
- if (CommonSucc && DT->dominates(BI, CommonSucc))
- HoistableBranches[BI] = CommonSucc;
- }
-
- bool canHoistPHI(PHINode *PN) {
- // The phi must have loop invariant operands.
- if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN))
- return false;
- // We can hoist phis if the block they are in is the target of hoistable
- // branches which cover all of the predecessors of the block.
- SmallPtrSet<BasicBlock *, 8> PredecessorBlocks;
- BasicBlock *BB = PN->getParent();
- for (BasicBlock *PredBB : predecessors(BB))
- PredecessorBlocks.insert(PredBB);
- // If we have less predecessor blocks than predecessors then the phi will
- // have more than one incoming value for the same block which we can't
- // handle.
- // TODO: This could be handled be erasing some of the duplicate incoming
- // values.
- if (PredecessorBlocks.size() != pred_size(BB))
- return false;
- for (auto &Pair : HoistableBranches) {
- if (Pair.second == BB) {
- // Which blocks are predecessors via this branch depends on if the
- // branch is triangle-like or diamond-like.
- if (Pair.first->getSuccessor(0) == BB) {
- PredecessorBlocks.erase(Pair.first->getParent());
- PredecessorBlocks.erase(Pair.first->getSuccessor(1));
- } else if (Pair.first->getSuccessor(1) == BB) {
- PredecessorBlocks.erase(Pair.first->getParent());
- PredecessorBlocks.erase(Pair.first->getSuccessor(0));
- } else {
- PredecessorBlocks.erase(Pair.first->getSuccessor(0));
- PredecessorBlocks.erase(Pair.first->getSuccessor(1));
- }
- }
- }
- // PredecessorBlocks will now be empty if for every predecessor of BB we
- // found a hoistable branch source.
- return PredecessorBlocks.empty();
- }
-
- BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) {
- if (!ControlFlowHoisting)
- return CurLoop->getLoopPreheader();
- // If BB has already been hoisted, return that
- if (HoistDestinationMap.count(BB))
- return HoistDestinationMap[BB];
-
- // Check if this block is conditional based on a pending branch
- auto HasBBAsSuccessor =
- [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
- return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
- Pair.first->getSuccessor(1) == BB);
- };
+ assert(It != F->end() && "Could not find successor in function");
+ CommonSucc = &*It;
+ }
+ }
+ // The common successor has to be dominated by the branch, as otherwise
+ // there will be some other path to the successor that will not be
+    // controlled by this branch, so any phi we hoist would be controlled by the
+ // wrong condition. This also takes care of avoiding hoisting of loop back
+ // edges.
+ // TODO: In some cases this could be relaxed if the successor is dominated
+ // by another block that's been hoisted and we can guarantee that the
+ // control flow has been replicated exactly.
+ if (CommonSucc && DT->dominates(BI, CommonSucc))
+ HoistableBranches[BI] = CommonSucc;
+ }
+
+ bool canHoistPHI(PHINode *PN) {
+ // The phi must have loop invariant operands.
+ if (!ControlFlowHoisting || !CurLoop->hasLoopInvariantOperands(PN))
+ return false;
+ // We can hoist phis if the block they are in is the target of hoistable
+ // branches which cover all of the predecessors of the block.
+ SmallPtrSet<BasicBlock *, 8> PredecessorBlocks;
+ BasicBlock *BB = PN->getParent();
+ for (BasicBlock *PredBB : predecessors(BB))
+ PredecessorBlocks.insert(PredBB);
+    // If we have fewer predecessor blocks than predecessors then the phi will
+    // have more than one incoming value for the same block, which we can't
+    // handle.
+    // TODO: This could be handled by erasing some of the duplicate incoming
+    // values.
+ if (PredecessorBlocks.size() != pred_size(BB))
+ return false;
+ for (auto &Pair : HoistableBranches) {
+ if (Pair.second == BB) {
+        // Which blocks are predecessors via this branch depends on whether the
+ // branch is triangle-like or diamond-like.
+ if (Pair.first->getSuccessor(0) == BB) {
+ PredecessorBlocks.erase(Pair.first->getParent());
+ PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+ } else if (Pair.first->getSuccessor(1) == BB) {
+ PredecessorBlocks.erase(Pair.first->getParent());
+ PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+ } else {
+ PredecessorBlocks.erase(Pair.first->getSuccessor(0));
+ PredecessorBlocks.erase(Pair.first->getSuccessor(1));
+ }
+ }
+ }
+ // PredecessorBlocks will now be empty if for every predecessor of BB we
+ // found a hoistable branch source.
+ return PredecessorBlocks.empty();
+ }
+
+ BasicBlock *getOrCreateHoistedBlock(BasicBlock *BB) {
+ if (!ControlFlowHoisting)
+ return CurLoop->getLoopPreheader();
+ // If BB has already been hoisted, return that
+ if (HoistDestinationMap.count(BB))
+ return HoistDestinationMap[BB];
+
+ // Check if this block is conditional based on a pending branch
+ auto HasBBAsSuccessor =
+ [&](DenseMap<BranchInst *, BasicBlock *>::value_type &Pair) {
+ return BB != Pair.second && (Pair.first->getSuccessor(0) == BB ||
+ Pair.first->getSuccessor(1) == BB);
+ };
auto It = llvm::find_if(HoistableBranches, HasBBAsSuccessor);
-
- // If not involved in a pending branch, hoist to preheader
- BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
- if (It == HoistableBranches.end()) {
+
+ // If not involved in a pending branch, hoist to preheader
+ BasicBlock *InitialPreheader = CurLoop->getLoopPreheader();
+ if (It == HoistableBranches.end()) {
LLVM_DEBUG(dbgs() << "LICM using "
<< InitialPreheader->getNameOrAsOperand()
<< " as hoist destination for "
<< BB->getNameOrAsOperand() << "\n");
- HoistDestinationMap[BB] = InitialPreheader;
- return InitialPreheader;
- }
- BranchInst *BI = It->first;
- assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) ==
- HoistableBranches.end() &&
- "BB is expected to be the target of at most one branch");
-
- LLVMContext &C = BB->getContext();
- BasicBlock *TrueDest = BI->getSuccessor(0);
- BasicBlock *FalseDest = BI->getSuccessor(1);
- BasicBlock *CommonSucc = HoistableBranches[BI];
- BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent());
-
- // Create hoisted versions of blocks that currently don't have them
- auto CreateHoistedBlock = [&](BasicBlock *Orig) {
- if (HoistDestinationMap.count(Orig))
- return HoistDestinationMap[Orig];
- BasicBlock *New =
- BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent());
- HoistDestinationMap[Orig] = New;
- DT->addNewBlock(New, HoistTarget);
- if (CurLoop->getParentLoop())
- CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI);
- ++NumCreatedBlocks;
- LLVM_DEBUG(dbgs() << "LICM created " << New->getName()
- << " as hoist destination for " << Orig->getName()
- << "\n");
- return New;
- };
- BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest);
- BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest);
- BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc);
-
- // Link up these blocks with branches.
- if (!HoistCommonSucc->getTerminator()) {
- // The new common successor we've generated will branch to whatever that
- // hoist target branched to.
- BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
- assert(TargetSucc && "Expected hoist target to have a single successor");
- HoistCommonSucc->moveBefore(TargetSucc);
- BranchInst::Create(TargetSucc, HoistCommonSucc);
- }
- if (!HoistTrueDest->getTerminator()) {
- HoistTrueDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistTrueDest);
- }
- if (!HoistFalseDest->getTerminator()) {
- HoistFalseDest->moveBefore(HoistCommonSucc);
- BranchInst::Create(HoistCommonSucc, HoistFalseDest);
- }
-
- // If BI is being cloned to what was originally the preheader then
- // HoistCommonSucc will now be the new preheader.
- if (HoistTarget == InitialPreheader) {
- // Phis in the loop header now need to use the new preheader.
- InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
- HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
- // The new preheader dominates the loop header.
- DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
- DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
- DT->changeImmediateDominator(HeaderNode, PreheaderNode);
- // The preheader hoist destination is now the new preheader, with the
- // exception of the hoist destination of this branch.
- for (auto &Pair : HoistDestinationMap)
- if (Pair.second == InitialPreheader && Pair.first != BI->getParent())
- Pair.second = HoistCommonSucc;
- }
-
- // Now finally clone BI.
- ReplaceInstWithInst(
- HoistTarget->getTerminator(),
- BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
- ++NumClonedBranches;
-
- assert(CurLoop->getLoopPreheader() &&
- "Hoisting blocks should not have destroyed preheader");
- return HoistDestinationMap[BB];
- }
-};
-} // namespace
-
+ HoistDestinationMap[BB] = InitialPreheader;
+ return InitialPreheader;
+ }
+ BranchInst *BI = It->first;
+ assert(std::find_if(++It, HoistableBranches.end(), HasBBAsSuccessor) ==
+ HoistableBranches.end() &&
+ "BB is expected to be the target of at most one branch");
+
+ LLVMContext &C = BB->getContext();
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ BasicBlock *FalseDest = BI->getSuccessor(1);
+ BasicBlock *CommonSucc = HoistableBranches[BI];
+ BasicBlock *HoistTarget = getOrCreateHoistedBlock(BI->getParent());
+
+ // Create hoisted versions of blocks that currently don't have them
+ auto CreateHoistedBlock = [&](BasicBlock *Orig) {
+ if (HoistDestinationMap.count(Orig))
+ return HoistDestinationMap[Orig];
+ BasicBlock *New =
+ BasicBlock::Create(C, Orig->getName() + ".licm", Orig->getParent());
+ HoistDestinationMap[Orig] = New;
+ DT->addNewBlock(New, HoistTarget);
+ if (CurLoop->getParentLoop())
+ CurLoop->getParentLoop()->addBasicBlockToLoop(New, *LI);
+ ++NumCreatedBlocks;
+ LLVM_DEBUG(dbgs() << "LICM created " << New->getName()
+ << " as hoist destination for " << Orig->getName()
+ << "\n");
+ return New;
+ };
+ BasicBlock *HoistTrueDest = CreateHoistedBlock(TrueDest);
+ BasicBlock *HoistFalseDest = CreateHoistedBlock(FalseDest);
+ BasicBlock *HoistCommonSucc = CreateHoistedBlock(CommonSucc);
+
+ // Link up these blocks with branches.
+ if (!HoistCommonSucc->getTerminator()) {
+      // The new common successor we've generated will branch to whatever the
+ // hoist target branched to.
+ BasicBlock *TargetSucc = HoistTarget->getSingleSuccessor();
+ assert(TargetSucc && "Expected hoist target to have a single successor");
+ HoistCommonSucc->moveBefore(TargetSucc);
+ BranchInst::Create(TargetSucc, HoistCommonSucc);
+ }
+ if (!HoistTrueDest->getTerminator()) {
+ HoistTrueDest->moveBefore(HoistCommonSucc);
+ BranchInst::Create(HoistCommonSucc, HoistTrueDest);
+ }
+ if (!HoistFalseDest->getTerminator()) {
+ HoistFalseDest->moveBefore(HoistCommonSucc);
+ BranchInst::Create(HoistCommonSucc, HoistFalseDest);
+ }
+
+ // If BI is being cloned to what was originally the preheader then
+ // HoistCommonSucc will now be the new preheader.
+ if (HoistTarget == InitialPreheader) {
+ // Phis in the loop header now need to use the new preheader.
+ InitialPreheader->replaceSuccessorsPhiUsesWith(HoistCommonSucc);
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
+ HoistTarget->getSingleSuccessor(), HoistCommonSucc, {HoistTarget});
+ // The new preheader dominates the loop header.
+ DomTreeNode *PreheaderNode = DT->getNode(HoistCommonSucc);
+ DomTreeNode *HeaderNode = DT->getNode(CurLoop->getHeader());
+ DT->changeImmediateDominator(HeaderNode, PreheaderNode);
+ // The preheader hoist destination is now the new preheader, with the
+ // exception of the hoist destination of this branch.
+ for (auto &Pair : HoistDestinationMap)
+ if (Pair.second == InitialPreheader && Pair.first != BI->getParent())
+ Pair.second = HoistCommonSucc;
+ }
+
+ // Now finally clone BI.
+ ReplaceInstWithInst(
+ HoistTarget->getTerminator(),
+ BranchInst::Create(HoistTrueDest, HoistFalseDest, BI->getCondition()));
+ ++NumClonedBranches;
+
+ assert(CurLoop->getLoopPreheader() &&
+ "Hoisting blocks should not have destroyed preheader");
+ return HoistDestinationMap[BB];
+ }
+};
+} // namespace
+
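+// A sketch of what ControlFlowHoister produces for a loop-invariant diamond
+// (block and value names are illustrative):
+//
+//   if.then:  %a = ...
+//   if.else:  %b = ...
+//   if.end:   %p = phi i32 [ %a, %if.then ], [ %b, %if.else ]
+//
+// The branch, both arms, and the convergence block are cloned in front of the
+// loop (if.then.licm, if.else.licm, if.end.licm), the clone of if.end becomes
+// the new preheader, and %p is hoisted into it.
+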
// Hoisting/sinking an instruction out of a loop isn't always beneficial. It's
// only worthwhile if the destination block is actually colder than the current
// block.
@@ -817,205 +817,205 @@ static bool worthSinkOrHoistInst(Instruction &I, BasicBlock *DstBlock,
return true;
}
-/// Walk the specified region of the CFG (defined by all blocks dominated by
-/// the specified block, and that are in the current loop) in depth first
-/// order w.r.t the DominatorTree. This allows us to visit definitions before
-/// uses, allowing us to hoist a loop body in one pass without iteration.
-///
-bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
+/// Walk the specified region of the CFG (defined by all blocks dominated by
+/// the specified block, and that are in the current loop) in depth first
+/// order w.r.t. the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, BlockFrequencyInfo *BFI,
TargetLibraryInfo *TLI, Loop *CurLoop,
- AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
- SinkAndHoistLICMFlags &Flags,
- OptimizationRemarkEmitter *ORE) {
- // Verify inputs.
- assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
- CurLoop != nullptr && SafetyInfo != nullptr &&
- "Unexpected input to hoistRegion.");
- assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
- "Either AliasSetTracker or MemorySSA should be initialized.");
-
- ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU);
-
- // Keep track of instructions that have been hoisted, as they may need to be
- // re-hoisted if they end up not dominating all of their uses.
- SmallVector<Instruction *, 16> HoistedInstructions;
-
- // For PHI hoisting to work we need to hoist blocks before their successors.
- // We can do this by iterating through the blocks in the loop in reverse
- // post-order.
- LoopBlocksRPO Worklist(CurLoop);
- Worklist.perform(LI);
- bool Changed = false;
- for (BasicBlock *BB : Worklist) {
- // Only need to process the contents of this block if it is not part of a
- // subloop (which would already have been processed).
- if (inSubLoop(BB, CurLoop, LI))
- continue;
-
- for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
- Instruction &I = *II++;
- // Try constant folding this instruction. If all the operands are
- // constants, it is technically hoistable, but it would be better to
- // just fold it.
- if (Constant *C = ConstantFoldInstruction(
- &I, I.getModule()->getDataLayout(), TLI)) {
- LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
- << '\n');
- if (CurAST)
- CurAST->copyValue(&I, C);
- // FIXME MSSA: Such replacements may make accesses unoptimized (D51960).
- I.replaceAllUsesWith(C);
- if (isInstructionTriviallyDead(&I, TLI))
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
- Changed = true;
- continue;
- }
-
- // Try hoisting the instruction out to the preheader. We can only do
- // this if all of the operands of the instruction are loop invariant and
+ AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
+ ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
+ SinkAndHoistLICMFlags &Flags,
+ OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr &&
+ CurLoop != nullptr && SafetyInfo != nullptr &&
+ "Unexpected input to hoistRegion.");
+ assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
+ "Either AliasSetTracker or MemorySSA should be initialized.");
+
+ ControlFlowHoister CFH(LI, DT, CurLoop, MSSAU);
+
+ // Keep track of instructions that have been hoisted, as they may need to be
+ // re-hoisted if they end up not dominating all of their uses.
+ SmallVector<Instruction *, 16> HoistedInstructions;
+
+ // For PHI hoisting to work we need to hoist blocks before their successors.
+ // We can do this by iterating through the blocks in the loop in reverse
+ // post-order.
+ LoopBlocksRPO Worklist(CurLoop);
+ Worklist.perform(LI);
+ bool Changed = false;
+ for (BasicBlock *BB : Worklist) {
+ // Only need to process the contents of this block if it is not part of a
+ // subloop (which would already have been processed).
+ if (inSubLoop(BB, CurLoop, LI))
+ continue;
+
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ Instruction &I = *II++;
+ // Try constant folding this instruction. If all the operands are
+ // constants, it is technically hoistable, but it would be better to
+ // just fold it.
+ if (Constant *C = ConstantFoldInstruction(
+ &I, I.getModule()->getDataLayout(), TLI)) {
+ LLVM_DEBUG(dbgs() << "LICM folding inst: " << I << " --> " << *C
+ << '\n');
+ if (CurAST)
+ CurAST->copyValue(&I, C);
+ // FIXME MSSA: Such replacements may make accesses unoptimized (D51960).
+ I.replaceAllUsesWith(C);
+ if (isInstructionTriviallyDead(&I, TLI))
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+ Changed = true;
+ continue;
+ }
+
+ // Try hoisting the instruction out to the preheader. We can only do
+ // this if all of the operands of the instruction are loop invariant and
// if it is safe to hoist the instruction. We also check block frequency
      // to make sure the instruction only gets hoisted into colder blocks.
- // TODO: It may be safe to hoist if we are hoisting to a conditional block
- // and we have accurately duplicated the control flow from the loop header
- // to that block.
- if (CurLoop->hasLoopInvariantOperands(&I) &&
- canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE) &&
+ // TODO: It may be safe to hoist if we are hoisting to a conditional block
+ // and we have accurately duplicated the control flow from the loop header
+ // to that block.
+ if (CurLoop->hasLoopInvariantOperands(&I) &&
+ canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
+ ORE) &&
worthSinkOrHoistInst(I, CurLoop->getLoopPreheader(), ORE, BFI) &&
- isSafeToExecuteUnconditionally(
- I, DT, CurLoop, SafetyInfo, ORE,
- CurLoop->getLoopPreheader()->getTerminator())) {
- hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- HoistedInstructions.push_back(&I);
- Changed = true;
- continue;
- }
-
- // Attempt to remove floating point division out of the loop by
- // converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
- CurLoop->isLoopInvariant(I.getOperand(1))) {
- auto Divisor = I.getOperand(1);
- auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
- auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
- ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
- SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
- ReciprocalDivisor->insertBefore(&I);
-
- auto Product =
- BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
- Product->setFastMathFlags(I.getFastMathFlags());
- SafetyInfo->insertInstructionTo(Product, I.getParent());
- Product->insertAfter(&I);
- I.replaceAllUsesWith(Product);
- eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
-
- hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
- SafetyInfo, MSSAU, SE, ORE);
- HoistedInstructions.push_back(ReciprocalDivisor);
- Changed = true;
- continue;
- }
-
- auto IsInvariantStart = [&](Instruction &I) {
- using namespace PatternMatch;
- return I.use_empty() &&
- match(&I, m_Intrinsic<Intrinsic::invariant_start>());
- };
- auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
- return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
- SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
- };
- if ((IsInvariantStart(I) || isGuard(&I)) &&
- CurLoop->hasLoopInvariantOperands(&I) &&
- MustExecuteWithoutWritesBefore(I)) {
- hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- HoistedInstructions.push_back(&I);
- Changed = true;
- continue;
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(&I)) {
- if (CFH.canHoistPHI(PN)) {
- // Redirect incoming blocks first to ensure that we create hoisted
- // versions of those blocks before we hoist the phi.
- for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
- PN->setIncomingBlock(
- i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
- hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
- Changed = true;
- continue;
- }
- }
-
- // Remember possibly hoistable branches so we can actually hoist them
- // later if needed.
- if (BranchInst *BI = dyn_cast<BranchInst>(&I))
- CFH.registerPossiblyHoistableBranch(BI);
- }
- }
-
- // If we hoisted instructions to a conditional block they may not dominate
- // their uses that weren't hoisted (such as phis where some operands are not
- // loop invariant). If so make them unconditional by moving them to their
- // immediate dominator. We iterate through the instructions in reverse order
- // which ensures that when we rehoist an instruction we rehoist its operands,
- // and also keep track of where in the block we are rehoisting to, to make sure
- // that we rehoist instructions before the instructions that use them.
- Instruction *HoistPoint = nullptr;
- if (ControlFlowHoisting) {
- for (Instruction *I : reverse(HoistedInstructions)) {
- if (!llvm::all_of(I->uses(),
- [&](Use &U) { return DT->dominates(I, U); })) {
- BasicBlock *Dominator =
- DT->getNode(I->getParent())->getIDom()->getBlock();
- if (!HoistPoint || !DT->dominates(HoistPoint->getParent(), Dominator)) {
- if (HoistPoint)
- assert(DT->dominates(Dominator, HoistPoint->getParent()) &&
- "New hoist point expected to dominate old hoist point");
- HoistPoint = Dominator->getTerminator();
- }
- LLVM_DEBUG(dbgs() << "LICM rehoisting to "
+ isSafeToExecuteUnconditionally(
+ I, DT, CurLoop, SafetyInfo, ORE,
+ CurLoop->getLoopPreheader()->getTerminator())) {
+ hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ HoistedInstructions.push_back(&I);
+ Changed = true;
+ continue;
+ }
+
+ // Attempt to remove floating point division out of the loop by
+ // converting it to a reciprocal multiplication.
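+ // For example (illustrative IR, hypothetical names): with a loop-invariant
+ // divisor %d,
+ //   %q = fdiv fast double %x, %d
+ // becomes
+ //   %r = fdiv fast double 1.000000e+00, %d   ; hoistable to the preheader
+ //   %q = fmul fast double %x, %r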
+ if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
+ CurLoop->isLoopInvariant(I.getOperand(1))) {
+ auto Divisor = I.getOperand(1);
+ auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+ auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+ ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+ SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
+ ReciprocalDivisor->insertBefore(&I);
+
+ auto Product =
+ BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+ Product->setFastMathFlags(I.getFastMathFlags());
+ SafetyInfo->insertInstructionTo(Product, I.getParent());
+ Product->insertAfter(&I);
+ I.replaceAllUsesWith(Product);
+ eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
+
+ hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
+ SafetyInfo, MSSAU, SE, ORE);
+ HoistedInstructions.push_back(ReciprocalDivisor);
+ Changed = true;
+ continue;
+ }
+
+ auto IsInvariantStart = [&](Instruction &I) {
+ using namespace PatternMatch;
+ return I.use_empty() &&
+ match(&I, m_Intrinsic<Intrinsic::invariant_start>());
+ };
+ auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
+ return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+ SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
+ };
+ if ((IsInvariantStart(I) || isGuard(&I)) &&
+ CurLoop->hasLoopInvariantOperands(&I) &&
+ MustExecuteWithoutWritesBefore(I)) {
+ hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ HoistedInstructions.push_back(&I);
+ Changed = true;
+ continue;
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ if (CFH.canHoistPHI(PN)) {
+ // Redirect incoming blocks first to ensure that we create hoisted
+ // versions of those blocks before we hoist the phi.
+ for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
+ PN->setIncomingBlock(
+ i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
+ hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+ MSSAU, SE, ORE);
+ assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
+ Changed = true;
+ continue;
+ }
+ }
+
+ // Remember possibly hoistable branches so we can actually hoist them
+ // later if needed.
+ if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+ CFH.registerPossiblyHoistableBranch(BI);
+ }
+ }
+
+ // If we hoisted instructions to a conditional block they may not dominate
+ // their uses that weren't hoisted (such as phis where some operands are not
+ // loop invariant). If so make them unconditional by moving them to their
+ // immediate dominator. We iterate through the instructions in reverse order
+ // which ensures that when we rehoist an instruction we rehoist its operands,
+ // and also keep track of where in the block we are rehoisting to, to make sure
+ // that we rehoist instructions before the instructions that use them.
+ Instruction *HoistPoint = nullptr;
+ if (ControlFlowHoisting) {
+ for (Instruction *I : reverse(HoistedInstructions)) {
+ if (!llvm::all_of(I->uses(),
+ [&](Use &U) { return DT->dominates(I, U); })) {
+ BasicBlock *Dominator =
+ DT->getNode(I->getParent())->getIDom()->getBlock();
+ if (!HoistPoint || !DT->dominates(HoistPoint->getParent(), Dominator)) {
+ if (HoistPoint)
+ assert(DT->dominates(Dominator, HoistPoint->getParent()) &&
+ "New hoist point expected to dominate old hoist point");
+ HoistPoint = Dominator->getTerminator();
+ }
+ LLVM_DEBUG(dbgs() << "LICM rehoisting to "
<< HoistPoint->getParent()->getNameOrAsOperand()
- << ": " << *I << "\n");
- moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE);
- HoistPoint = I;
- Changed = true;
- }
- }
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Now that we've finished hoisting make sure that LI and DT are still
- // valid.
-#ifdef EXPENSIVE_CHECKS
- if (Changed) {
- assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
- "Dominator tree verification failed");
- LI->verify(*DT);
- }
-#endif
-
- return Changed;
-}
-
-// Return true if LI is invariant within scope of the loop. LI is invariant if
-// CurLoop is dominated by an invariant.start representing the same memory
-// location and size as the memory location LI loads from, and also the
-// invariant.start has no uses.
-static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
- Loop *CurLoop) {
- Value *Addr = LI->getOperand(0);
- const DataLayout &DL = LI->getModule()->getDataLayout();
+ << ": " << *I << "\n");
+ moveInstructionBefore(*I, *HoistPoint, *SafetyInfo, MSSAU, SE);
+ HoistPoint = I;
+ Changed = true;
+ }
+ }
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that we've finished hoisting make sure that LI and DT are still
+ // valid.
+#ifdef EXPENSIVE_CHECKS
+ if (Changed) {
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
+ "Dominator tree verification failed");
+ LI->verify(*DT);
+ }
+#endif
+
+ return Changed;
+}
+
+// Return true if LI is invariant within scope of the loop. LI is invariant if
+// CurLoop is dominated by an invariant.start representing the same memory
+// location and size as the memory location LI loads from, and also the
+// invariant.start has no uses.
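+// For example (illustrative IR, hypothetical names and sizes):
+//   %c = bitcast i32* %p to i8*
+//   call {}* @llvm.invariant.start.p0i8(i64 4, i8* %c)
+//   br label %loop
+// loop:
+//   %v = load i32, i32* %p    ; covered by the dominating, unused invariant.start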
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+ Loop *CurLoop) {
+ Value *Addr = LI->getOperand(0);
+ const DataLayout &DL = LI->getModule()->getDataLayout();
const TypeSize LocSizeInBits = DL.getTypeSizeInBits(LI->getType());
-
+
// It is not currently possible for clang to generate an invariant.start
// intrinsic with scalable vector types because we don't support thread local
// sizeless types and we don't permit sizeless types in structs or classes.
@@ -1028,166 +1028,166 @@ static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
if (LocSizeInBits.isScalable())
return false;
- // If the type is i8 addrspace(x)*, we know this is the type of the
- // llvm.invariant.start operand.
- auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
- LI->getPointerAddressSpace());
- unsigned BitcastsVisited = 0;
- // Look through bitcasts until we reach the i8* type (this is invariant.start
- // operand type).
- while (Addr->getType() != PtrInt8Ty) {
- auto *BC = dyn_cast<BitCastInst>(Addr);
- // Avoid traversing high number of bitcast uses.
- if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
- return false;
- Addr = BC->getOperand(0);
- }
-
- unsigned UsesVisited = 0;
- // Traverse all uses of the load operand value, to see if invariant.start is
- // one of the uses, and whether it dominates the load instruction.
- for (auto *U : Addr->users()) {
- // Avoid traversing for Load operand with high number of users.
- if (++UsesVisited > MaxNumUsesTraversed)
- return false;
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
- // If there are escaping uses of the invariant.start instruction, the load may
- // be non-invariant.
- if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
- !II->use_empty())
- continue;
+ // If the type is i8 addrspace(x)*, we know this is the type of the
+ // llvm.invariant.start operand.
+ auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+ LI->getPointerAddressSpace());
+ unsigned BitcastsVisited = 0;
+ // Look through bitcasts until we reach the i8* type (this is invariant.start
+ // operand type).
+ while (Addr->getType() != PtrInt8Ty) {
+ auto *BC = dyn_cast<BitCastInst>(Addr);
+ // Avoid traversing high number of bitcast uses.
+ if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+ return false;
+ Addr = BC->getOperand(0);
+ }
+
+ unsigned UsesVisited = 0;
+ // Traverse all uses of the load operand value, to see if invariant.start is
+ // one of the uses, and whether it dominates the load instruction.
+ for (auto *U : Addr->users()) {
+ // Avoid traversing for Load operand with high number of users.
+ if (++UsesVisited > MaxNumUsesTraversed)
+ return false;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+ // If there are escaping uses of the invariant.start instruction, the load may
+ // be non-invariant.
+ if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+ !II->use_empty())
+ continue;
ConstantInt *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
// The intrinsic supports having a -1 argument for variable sized objects
// so we should check for that here.
if (InvariantSize->isNegative())
continue;
uint64_t InvariantSizeInBits = InvariantSize->getSExtValue() * 8;
- // Confirm the invariant.start location size contains the load operand size
- // in bits. Also, the invariant.start should dominate the load, and we
- // should not hoist the load out of a loop that contains this dominating
- // invariant.start.
+ // Confirm the invariant.start location size contains the load operand size
+ // in bits. Also, the invariant.start should dominate the load, and we
+ // should not hoist the load out of a loop that contains this dominating
+ // invariant.start.
if (LocSizeInBits.getFixedSize() <= InvariantSizeInBits &&
- DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
- return true;
- }
-
- return false;
-}
-
-namespace {
-/// Return true if-and-only-if we know how to (mechanically) both hoist and
-/// sink a given instruction out of a loop. Does not address legality
-/// concerns such as aliasing or speculation safety.
-bool isHoistableAndSinkableInst(Instruction &I) {
- // Only these instructions are hoistable/sinkable.
- return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
- isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
- isa<BinaryOperator>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
- isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
- isa<InsertValueInst>(I) || isa<FreezeInst>(I));
-}
-/// Return true if all of the alias sets within this AST are known not to
-/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop.
-bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU,
- const Loop *L) {
- if (CurAST) {
- for (AliasSet &AS : *CurAST) {
- if (!AS.isForwardingAliasSet() && AS.isMod()) {
- return false;
- }
- }
- return true;
- } else { /*MSSAU*/
- for (auto *BB : L->getBlocks())
- if (MSSAU->getMemorySSA()->getBlockDefs(BB))
- return false;
- return true;
- }
-}
-
-/// Return true if I is the only Instruction with a MemoryAccess in L.
-bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
- const MemorySSAUpdater *MSSAU) {
- for (auto *BB : L->getBlocks())
- if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) {
- int NotAPhi = 0;
- for (const auto &Acc : *Accs) {
- if (isa<MemoryPhi>(&Acc))
- continue;
- const auto *MUD = cast<MemoryUseOrDef>(&Acc);
- if (MUD->getMemoryInst() != I || NotAPhi++ == 1)
- return false;
- }
- }
- return true;
-}
-}
-
-bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
- Loop *CurLoop, AliasSetTracker *CurAST,
- MemorySSAUpdater *MSSAU,
- bool TargetExecutesOncePerLoop,
- SinkAndHoistLICMFlags *Flags,
- OptimizationRemarkEmitter *ORE) {
+ DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+ return true;
+ }
+
+ return false;
+}
+
+namespace {
+/// Return true if-and-only-if we know how to (mechanically) both hoist and
+/// sink a given instruction out of a loop. Does not address legality
+/// concerns such as aliasing or speculation safety.
+bool isHoistableAndSinkableInst(Instruction &I) {
+ // Only these instructions are hoistable/sinkable.
+ return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
+ isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
+ isa<BinaryOperator>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
+ isa<InsertValueInst>(I) || isa<FreezeInst>(I));
+}
+/// Return true if all of the alias sets within this AST are known not to
+/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop.
+bool isReadOnly(AliasSetTracker *CurAST, const MemorySSAUpdater *MSSAU,
+ const Loop *L) {
+ if (CurAST) {
+ for (AliasSet &AS : *CurAST) {
+ if (!AS.isForwardingAliasSet() && AS.isMod()) {
+ return false;
+ }
+ }
+ return true;
+ } else { /*MSSAU*/
+ for (auto *BB : L->getBlocks())
+ if (MSSAU->getMemorySSA()->getBlockDefs(BB))
+ return false;
+ return true;
+ }
+}
+
+/// Return true if I is the only Instruction with a MemoryAccess in L.
+bool isOnlyMemoryAccess(const Instruction *I, const Loop *L,
+ const MemorySSAUpdater *MSSAU) {
+ for (auto *BB : L->getBlocks())
+ if (auto *Accs = MSSAU->getMemorySSA()->getBlockAccesses(BB)) {
+ int NotAPhi = 0;
+ for (const auto &Acc : *Accs) {
+ if (isa<MemoryPhi>(&Acc))
+ continue;
+ const auto *MUD = cast<MemoryUseOrDef>(&Acc);
+ if (MUD->getMemoryInst() != I || NotAPhi++ == 1)
+ return false;
+ }
+ }
+ return true;
+}
+}
+
+bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
+ Loop *CurLoop, AliasSetTracker *CurAST,
+ MemorySSAUpdater *MSSAU,
+ bool TargetExecutesOncePerLoop,
+ SinkAndHoistLICMFlags *Flags,
+ OptimizationRemarkEmitter *ORE) {
assert(((CurAST != nullptr) ^ (MSSAU != nullptr)) &&
"Either AliasSetTracker or MemorySSA should be initialized.");
- // If we don't understand the instruction, bail early.
- if (!isHoistableAndSinkableInst(I))
- return false;
-
- MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
- if (MSSA)
- assert(Flags != nullptr && "Flags cannot be null.");
-
- // Loads have extra constraints we have to verify before we can hoist them.
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isUnordered())
- return false; // Don't sink/hoist volatile or ordered atomic loads!
-
- // Loads from constant memory are always safe to move, even if they end up
- // in the same alias set as something that ends up being modified.
- if (AA->pointsToConstantMemory(LI->getOperand(0)))
- return true;
- if (LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
-
- if (LI->isAtomic() && !TargetExecutesOncePerLoop)
- return false; // Don't risk duplicating unordered loads
-
- // This checks for an invariant.start dominating the load.
- if (isLoadInvariantInLoop(LI, DT, CurLoop))
- return true;
-
- bool Invalidated;
- if (CurAST)
- Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST,
- CurLoop, AA);
- else
- Invalidated = pointerInvalidatedByLoopWithMSSA(
+ // If we don't understand the instruction, bail early.
+ if (!isHoistableAndSinkableInst(I))
+ return false;
+
+ MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
+ if (MSSA)
+ assert(Flags != nullptr && "Flags cannot be null.");
+
+ // Loads have extra constraints we have to verify before we can hoist them.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isUnordered())
+ return false; // Don't sink/hoist volatile or ordered atomic loads!
+
+ // Loads from constant memory are always safe to move, even if they end up
+ // in the same alias set as something that ends up being modified.
+ if (AA->pointsToConstantMemory(LI->getOperand(0)))
+ return true;
+ if (LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+
+ if (LI->isAtomic() && !TargetExecutesOncePerLoop)
+ return false; // Don't risk duplicating unordered loads
+
+ // This checks for an invariant.start dominating the load.
+ if (isLoadInvariantInLoop(LI, DT, CurLoop))
+ return true;
+
+ bool Invalidated;
+ if (CurAST)
+ Invalidated = pointerInvalidatedByLoop(MemoryLocation::get(LI), CurAST,
+ CurLoop, AA);
+ else
+ Invalidated = pointerInvalidatedByLoopWithMSSA(
MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(LI)), CurLoop, I, *Flags);
- // Check loop-invariant address because this may also be a sinkable load
- // whose address is not necessarily loop-invariant.
- if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
- << "failed to move load with loop-invariant address "
- "because the loop may invalidate its value";
- });
-
- return !Invalidated;
- } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // Don't sink or hoist dbg info; it's legal, but not useful.
- if (isa<DbgInfoIntrinsic>(I))
- return false;
-
- // Don't sink calls which can throw.
- if (CI->mayThrow())
- return false;
-
+ // Check loop-invariant address because this may also be a sinkable load
+ // whose address is not necessarily loop-invariant.
+ if (ORE && Invalidated && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressInvalidated", LI)
+ << "failed to move load with loop-invariant address "
+ "because the loop may invalidate its value";
+ });
+
+ return !Invalidated;
+ } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // Don't sink or hoist dbg info; it's legal, but not useful.
+ if (isa<DbgInfoIntrinsic>(I))
+ return false;
+
+ // Don't sink calls which can throw.
+ if (CI->mayThrow())
+ return false;
+
// Convergent attribute has been used on operations that involve
// inter-thread communication which results are implicitly affected by the
// enclosing control flows. It is not safe to hoist or sink such operations
@@ -1195,526 +1195,526 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
if (CI->isConvergent())
return false;
- using namespace PatternMatch;
- if (match(CI, m_Intrinsic<Intrinsic::assume>()))
- // Assumes don't actually alias anything or throw
- return true;
-
- if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
- // Widenable conditions don't actually alias anything or throw
- return true;
-
- // Handle simple cases by querying alias analysis.
- FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
- if (Behavior == FMRB_DoesNotAccessMemory)
- return true;
- if (AAResults::onlyReadsMemory(Behavior)) {
- // A readonly argmemonly function only reads from memory pointed to by
- // its arguments with arbitrary offsets. If we can prove there are no
- // writes to this memory in the loop, we can hoist or sink.
- if (AAResults::onlyAccessesArgPointees(Behavior)) {
- // TODO: expand to writeable arguments
- for (Value *Op : CI->arg_operands())
- if (Op->getType()->isPointerTy()) {
- bool Invalidated;
- if (CurAST)
- Invalidated = pointerInvalidatedByLoop(
+ using namespace PatternMatch;
+ if (match(CI, m_Intrinsic<Intrinsic::assume>()))
+ // Assumes don't actually alias anything or throw
+ return true;
+
+ if (match(CI, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
+ // Widenable conditions don't actually alias anything or throw
+ return true;
+
+ // Handle simple cases by querying alias analysis.
+ FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
+ if (Behavior == FMRB_DoesNotAccessMemory)
+ return true;
+ if (AAResults::onlyReadsMemory(Behavior)) {
+ // A readonly argmemonly function only reads from memory pointed to by
+      // its arguments with arbitrary offsets. If we can prove there are no
+ // writes to this memory in the loop, we can hoist or sink.
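+      // For example (illustrative; assumes @cmp is declared readonly argmemonly):
+      //   %r = call i32 @cmp(i8* %p, i8* %q)
+      // can be hoisted or sunk if neither %p nor %q is written inside the loop.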
+ if (AAResults::onlyAccessesArgPointees(Behavior)) {
+ // TODO: expand to writeable arguments
+ for (Value *Op : CI->arg_operands())
+ if (Op->getType()->isPointerTy()) {
+ bool Invalidated;
+ if (CurAST)
+ Invalidated = pointerInvalidatedByLoop(
MemoryLocation::getBeforeOrAfter(Op), CurAST, CurLoop, AA);
- else
- Invalidated = pointerInvalidatedByLoopWithMSSA(
+ else
+ Invalidated = pointerInvalidatedByLoopWithMSSA(
MSSA, cast<MemoryUse>(MSSA->getMemoryAccess(CI)), CurLoop, I,
- *Flags);
- if (Invalidated)
- return false;
- }
- return true;
- }
-
- // If this call only reads from memory and there are no writes to memory
- // in the loop, we can hoist or sink the call as appropriate.
- if (isReadOnly(CurAST, MSSAU, CurLoop))
- return true;
- }
-
- // FIXME: This should use mod/ref information to see if we can hoist or
- // sink the call.
-
- return false;
- } else if (auto *FI = dyn_cast<FenceInst>(&I)) {
- // Fences alias (most) everything to provide ordering. For the moment,
- // just give up if there are any other memory operations in the loop.
- if (CurAST) {
- auto Begin = CurAST->begin();
- assert(Begin != CurAST->end() && "must contain FI");
- if (std::next(Begin) != CurAST->end())
- // constant memory for instance, TODO: handle better
- return false;
- auto *UniqueI = Begin->getUniqueInstruction();
- if (!UniqueI)
- // other memory op, give up
- return false;
- (void)FI; // suppress unused variable warning
- assert(UniqueI == FI && "AS must contain FI");
- return true;
- } else // MSSAU
- return isOnlyMemoryAccess(FI, CurLoop, MSSAU);
- } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isUnordered())
- return false; // Don't sink/hoist volatile or ordered atomic store!
-
- // We can only hoist a store that we can prove writes a value which is not
- // read or overwritten within the loop. For those cases, we fall back to
- // load/store promotion instead. TODO: We can extend this to cases where
- // there is exactly one write to the location and that write dominates an
- // arbitrary number of reads in the loop.
- if (CurAST) {
- auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI));
-
- if (AS.isRef() || !AS.isMustAlias())
- // Quick exit test, handled by the full path below as well.
- return false;
- auto *UniqueI = AS.getUniqueInstruction();
- if (!UniqueI)
- // other memory op, give up
- return false;
- assert(UniqueI == SI && "AS must contain SI");
- return true;
- } else { // MSSAU
- if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
- return true;
+ *Flags);
+ if (Invalidated)
+ return false;
+ }
+ return true;
+ }
+
+ // If this call only reads from memory and there are no writes to memory
+ // in the loop, we can hoist or sink the call as appropriate.
+ if (isReadOnly(CurAST, MSSAU, CurLoop))
+ return true;
+ }
+
+ // FIXME: This should use mod/ref information to see if we can hoist or
+ // sink the call.
+
+ return false;
+ } else if (auto *FI = dyn_cast<FenceInst>(&I)) {
+ // Fences alias (most) everything to provide ordering. For the moment,
+ // just give up if there are any other memory operations in the loop.
+ if (CurAST) {
+ auto Begin = CurAST->begin();
+ assert(Begin != CurAST->end() && "must contain FI");
+ if (std::next(Begin) != CurAST->end())
+ // constant memory for instance, TODO: handle better
+ return false;
+ auto *UniqueI = Begin->getUniqueInstruction();
+ if (!UniqueI)
+ // other memory op, give up
+ return false;
+ (void)FI; // suppress unused variable warning
+ assert(UniqueI == FI && "AS must contain FI");
+ return true;
+ } else // MSSAU
+ return isOnlyMemoryAccess(FI, CurLoop, MSSAU);
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isUnordered())
+ return false; // Don't sink/hoist volatile or ordered atomic store!
+
+ // We can only hoist a store that we can prove writes a value which is not
+    // read or overwritten within the loop. For those cases, we fall back to
+    // load/store promotion instead. TODO: We can extend this to cases where
+ // there is exactly one write to the location and that write dominates an
+ // arbitrary number of reads in the loop.
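+    // For example (illustrative IR): if the only memory access in the loop is
+    //   store i32 %inv, i32* %g
+    // the store is a hoisting/sinking candidate; if the loop also reads %g,
+    // load/store promotion handles it instead.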
+ if (CurAST) {
+ auto &AS = CurAST->getAliasSetFor(MemoryLocation::get(SI));
+
+ if (AS.isRef() || !AS.isMustAlias())
+ // Quick exit test, handled by the full path below as well.
+ return false;
+ auto *UniqueI = AS.getUniqueInstruction();
+ if (!UniqueI)
+ // other memory op, give up
+ return false;
+ assert(UniqueI == SI && "AS must contain SI");
+ return true;
+ } else { // MSSAU
+ if (isOnlyMemoryAccess(SI, CurLoop, MSSAU))
+ return true;
// If there are more accesses than the Promotion cap or no "quota" to
// check clobber, then give up as we're not walking a list that long.
if (Flags->tooManyMemoryAccesses() || Flags->tooManyClobberingCalls())
- return false;
- // If there are interfering Uses (i.e. their defining access is in the
- // loop), or ordered loads (stored as Defs!), don't move this store.
- // Could do better here, but this is conservatively correct.
- // TODO: Cache set of Uses on the first walk in runOnLoop, update when
- // moving accesses. Can also extend to dominating uses.
- auto *SIMD = MSSA->getMemoryAccess(SI);
- for (auto *BB : CurLoop->getBlocks())
- if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
- for (const auto &MA : *Accesses)
- if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
- auto *MD = MU->getDefiningAccess();
- if (!MSSA->isLiveOnEntryDef(MD) &&
- CurLoop->contains(MD->getBlock()))
- return false;
- // Disable hoisting past potentially interfering loads. Optimized
- // Uses may point to an access outside the loop, as getClobbering
- // checks the previous iteration when walking the backedge.
- // FIXME: More precise: no Uses that alias SI.
+ return false;
+ // If there are interfering Uses (i.e. their defining access is in the
+ // loop), or ordered loads (stored as Defs!), don't move this store.
+ // Could do better here, but this is conservatively correct.
+ // TODO: Cache set of Uses on the first walk in runOnLoop, update when
+ // moving accesses. Can also extend to dominating uses.
+ auto *SIMD = MSSA->getMemoryAccess(SI);
+ for (auto *BB : CurLoop->getBlocks())
+ if (auto *Accesses = MSSA->getBlockAccesses(BB)) {
+ for (const auto &MA : *Accesses)
+ if (const auto *MU = dyn_cast<MemoryUse>(&MA)) {
+ auto *MD = MU->getDefiningAccess();
+ if (!MSSA->isLiveOnEntryDef(MD) &&
+ CurLoop->contains(MD->getBlock()))
+ return false;
+ // Disable hoisting past potentially interfering loads. Optimized
+ // Uses may point to an access outside the loop, as getClobbering
+ // checks the previous iteration when walking the backedge.
+ // FIXME: More precise: no Uses that alias SI.
if (!Flags->getIsSink() && !MSSA->dominates(SIMD, MU))
- return false;
- } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
- if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
- (void)LI; // Silence warning.
- assert(!LI->isUnordered() && "Expected unordered load");
- return false;
- }
- // Any call, while it may not be clobbering SI, may still be a use.
- if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
- // Check if the call may read from the memory location written
- // to by SI. Check CI's attributes and arguments; the number of
- // such checks performed is limited above by NoOfMemAccTooLarge.
- ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
- if (isModOrRefSet(MRI))
- return false;
- }
- }
- }
- auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
+ return false;
+ } else if (const auto *MD = dyn_cast<MemoryDef>(&MA)) {
+ if (auto *LI = dyn_cast<LoadInst>(MD->getMemoryInst())) {
+ (void)LI; // Silence warning.
+ assert(!LI->isUnordered() && "Expected unordered load");
+ return false;
+ }
+            // Any call, while it may not be clobbering SI, may still be a use.
+ if (auto *CI = dyn_cast<CallInst>(MD->getMemoryInst())) {
+              // Check if the call may read from the memory location written
+ // to by SI. Check CI's attributes and arguments; the number of
+ // such checks performed is limited above by NoOfMemAccTooLarge.
+ ModRefInfo MRI = AA->getModRefInfo(CI, MemoryLocation::get(SI));
+ if (isModOrRefSet(MRI))
+ return false;
+ }
+ }
+ }
+ auto *Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(SI);
Flags->incrementClobberingCalls();
- // If there are no clobbering Defs in the loop, store is safe to hoist.
- return MSSA->isLiveOnEntryDef(Source) ||
- !CurLoop->contains(Source->getBlock());
- }
- }
-
- assert(!I.mayReadOrWriteMemory() && "unhandled aliasing");
-
- // We've established mechanical ability and aliasing, it's up to the caller
- // to check fault safety
- return true;
-}
-
-/// Returns true if a PHINode is trivially replaceable with an
-/// Instruction.
-/// This is true when all incoming values are that instruction.
-/// This pattern occurs most often with LCSSA PHI nodes.
-///
-static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
- for (const Value *IncValue : PN.incoming_values())
- if (IncValue != &I)
- return false;
-
- return true;
-}
-
-/// Return true if the instruction is free in the loop.
-static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const TargetTransformInfo *TTI) {
-
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
- TargetTransformInfo::TCC_Free)
- return false;
- // For a GEP, we cannot simply use getUserCost because currently it
- // optimistically assumes that a GEP will fold into an addressing mode
- // regardless of its users.
- const BasicBlock *BB = GEP->getParent();
- for (const User *U : GEP->users()) {
- const Instruction *UI = cast<Instruction>(U);
- if (CurLoop->contains(UI) &&
- (BB != UI->getParent() ||
- (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
- return false;
- }
- return true;
- } else
- return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
- TargetTransformInfo::TCC_Free;
-}
-
-/// Return true if the only users of this instruction are outside of
-/// the loop. If this is true, we can sink the instruction to the exit
-/// blocks of the loop.
-///
-/// We also return true if the instruction could be folded away in lowering.
-/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
-static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- TargetTransformInfo *TTI, bool &FreeInLoop) {
- const auto &BlockColors = SafetyInfo->getBlockColors();
- bool IsFree = isFreeInLoop(I, CurLoop, TTI);
- for (const User *U : I.users()) {
- const Instruction *UI = cast<Instruction>(U);
- if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
- const BasicBlock *BB = PN->getParent();
- // We cannot sink uses in catchswitches.
- if (isa<CatchSwitchInst>(BB->getTerminator()))
- return false;
-
- // We need to sink a callsite to a unique funclet. Avoid sinking if the
- // phi use is too muddled.
- if (isa<CallInst>(I))
- if (!BlockColors.empty() &&
- BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
- return false;
- }
-
- if (CurLoop->contains(UI)) {
- if (IsFree) {
- FreeInLoop = true;
- continue;
- }
- return false;
- }
- }
- return true;
-}
-
-static Instruction *cloneInstructionInExitBlock(
- Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
- const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) {
- Instruction *New;
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- const auto &BlockColors = SafetyInfo->getBlockColors();
-
- // Sinking call-sites need to be handled differently from other
- // instructions. The cloned call-site needs a funclet bundle operand
- // appropriate for its location in the CFG.
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
- BundleIdx != BundleEnd; ++BundleIdx) {
- OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
- if (Bundle.getTagID() == LLVMContext::OB_funclet)
- continue;
-
- OpBundles.emplace_back(Bundle);
- }
-
- if (!BlockColors.empty()) {
- const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
- assert(CV.size() == 1 && "non-unique color for exit block!");
- BasicBlock *BBColor = CV.front();
- Instruction *EHPad = BBColor->getFirstNonPHI();
- if (EHPad->isEHPad())
- OpBundles.emplace_back("funclet", EHPad);
- }
-
- New = CallInst::Create(CI, OpBundles);
- } else {
- New = I.clone();
- }
-
- ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
- if (!I.getName().empty())
- New->setName(I.getName() + ".le");
-
- if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
- // Create a new MemoryAccess and let MemorySSA set its defining access.
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- New, nullptr, New->getParent(), MemorySSA::Beginning);
- if (NewMemAcc) {
- if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
- MSSAU->insertDef(MemDef, /*RenameUses=*/true);
- else {
- auto *MemUse = cast<MemoryUse>(NewMemAcc);
- MSSAU->insertUse(MemUse, /*RenameUses=*/true);
- }
- }
- }
-
- // Build LCSSA PHI nodes for any in-loop operands. Note that this is
- // particularly cheap because we can rip off the PHI node that we're
- // replacing for the number and blocks of the predecessors.
- // OPT: If this shows up in a profile, we can instead finish sinking all
- // invariant instructions, and then walk their operands to re-establish
- // LCSSA. That will eliminate creating PHI nodes just to nuke them when
- // sinking bottom-up.
- for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
- ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(*OI))
- if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
- if (!OLoop->contains(&PN)) {
- PHINode *OpPN =
- PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
- OInst->getName() + ".lcssa", &ExitBlock.front());
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
- OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
- *OI = OpPN;
- }
- return New;
-}
-
-static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
- AliasSetTracker *AST, MemorySSAUpdater *MSSAU) {
- if (AST)
- AST->deleteValue(&I);
- if (MSSAU)
- MSSAU->removeMemoryAccess(&I);
- SafetyInfo.removeInstruction(&I);
- I.eraseFromParent();
-}
-
-static void moveInstructionBefore(Instruction &I, Instruction &Dest,
- ICFLoopSafetyInfo &SafetyInfo,
- MemorySSAUpdater *MSSAU,
- ScalarEvolution *SE) {
- SafetyInfo.removeInstruction(&I);
- SafetyInfo.insertInstructionTo(&I, Dest.getParent());
- I.moveBefore(&Dest);
- if (MSSAU)
- if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(&I)))
- MSSAU->moveToPlace(OldMemAcc, Dest.getParent(),
- MemorySSA::BeforeTerminator);
- if (SE)
- SE->forgetValue(&I);
-}
-
-static Instruction *sinkThroughTriviallyReplaceablePHI(
- PHINode *TPN, Instruction *I, LoopInfo *LI,
- SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
- const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop,
- MemorySSAUpdater *MSSAU) {
- assert(isTriviallyReplaceablePHI(*TPN, *I) &&
- "Expect only trivially replaceable PHI");
- BasicBlock *ExitBlock = TPN->getParent();
- Instruction *New;
- auto It = SunkCopies.find(ExitBlock);
- if (It != SunkCopies.end())
- New = It->second;
- else
- New = SunkCopies[ExitBlock] = cloneInstructionInExitBlock(
- *I, *ExitBlock, *TPN, LI, SafetyInfo, MSSAU);
- return New;
-}
-
-static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
- BasicBlock *BB = PN->getParent();
- if (!BB->canSplitPredecessors())
- return false;
- // It's not impossible to split EHPad blocks, but if BlockColors already exist
- // it requires updating BlockColors for all offspring blocks accordingly. By
- // skipping such a corner case, we can make updating BlockColors after splitting
- // predecessor fairly simple.
- if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
- return false;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *BBPred = *PI;
- if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
- isa<CallBrInst>(BBPred->getTerminator()))
- return false;
- }
- return true;
-}
-
-static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
- LoopInfo *LI, const Loop *CurLoop,
- LoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU) {
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
- BasicBlock *ExitBB = PN->getParent();
- assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
-
- // Split predecessors of the loop exit so that instructions in the loop are
- // exposed to exit blocks through trivially replaceable PHIs while keeping the
- // loop in the canonical form where each predecessor of each exit block should
- // be contained within the loop. For example, this will convert the loop below
- // from
- //
- // LB1:
- // %v1 =
- // br %LE, %LB2
- // LB2:
- // %v2 =
- // br %LE, %LB1
- // LE:
- // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
- //
- // to
- //
- // LB1:
- // %v1 =
- // br %LE.split, %LB2
- // LB2:
- // %v2 =
- // br %LE.split2, %LB1
- // LE.split:
- // %p1 = phi [%v1, %LB1] <-- trivially replaceable
- // br %LE
- // LE.split2:
- // %p2 = phi [%v2, %LB2] <-- trivially replaceable
- // br %LE
- // LE:
- // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
- //
- const auto &BlockColors = SafetyInfo->getBlockColors();
- SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
- while (!PredBBs.empty()) {
- BasicBlock *PredBB = *PredBBs.begin();
- assert(CurLoop->contains(PredBB) &&
- "Expect all predecessors are in the loop");
- if (PN->getBasicBlockIndex(PredBB) >= 0) {
- BasicBlock *NewPred = SplitBlockPredecessors(
- ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true);
- // Since we do not allow splitting EH-block with BlockColors in
- // canSplitPredecessors(), we can simply assign predecessor's color to
- // the new block.
- if (!BlockColors.empty())
- // Grab a reference to the ColorVector to be inserted before getting the
- // reference to the vector we are copying because inserting the new
- // element in BlockColors might cause the map to be reallocated.
- SafetyInfo->copyColors(NewPred, PredBB);
- }
- PredBBs.remove(PredBB);
- }
-}
-
-/// When an instruction is found to only be used outside of the loop, this
-/// function moves it to the exit blocks and patches up SSA form as needed.
-/// This method is guaranteed to remove the original instruction from its
-/// position, and may either delete it or move it to outside of the loop.
-///
-static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
+ // If there are no clobbering Defs in the loop, store is safe to hoist.
+ return MSSA->isLiveOnEntryDef(Source) ||
+ !CurLoop->contains(Source->getBlock());
+ }
+ }
+
+ assert(!I.mayReadOrWriteMemory() && "unhandled aliasing");
+
+ // We've established mechanical ability and aliasing, it's up to the caller
+ // to check fault safety
+ return true;
+}
+
+/// Returns true if a PHINode is trivially replaceable with an
+/// Instruction.
+/// This is true when all incoming values are that instruction.
+/// This pattern occurs most often with LCSSA PHI nodes.
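+/// For example (illustrative LCSSA IR): for an in-loop value %inv,
+///   %inv.lcssa = phi i32 [ %inv, %exiting1 ], [ %inv, %exiting2 ]
+/// is trivially replaceable because every incoming value is %inv itself.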
+///
+static bool isTriviallyReplaceablePHI(const PHINode &PN, const Instruction &I) {
+ for (const Value *IncValue : PN.incoming_values())
+ if (IncValue != &I)
+ return false;
+
+ return true;
+}
+
+/// Return true if the instruction is free in the loop.
+static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const TargetTransformInfo *TTI) {
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
+ TargetTransformInfo::TCC_Free)
+ return false;
+ // For a GEP, we cannot simply use getUserCost because currently it
+    // optimistically assumes that a GEP will fold into an addressing mode
+ // regardless of its users.
+ const BasicBlock *BB = GEP->getParent();
+ for (const User *U : GEP->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (CurLoop->contains(UI) &&
+ (BB != UI->getParent() ||
+ (!isa<StoreInst>(UI) && !isa<LoadInst>(UI))))
+ return false;
+ }
+ return true;
+ } else
+ return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+ TargetTransformInfo::TCC_Free;
+}
+
+/// Return true if the only users of this instruction are outside of
+/// the loop. If this is true, we can sink the instruction to the exit
+/// blocks of the loop.
+///
+/// We also return true if the instruction could be folded away in lowering.
+/// (e.g., a GEP can be folded into a load as an addressing mode in the loop).
+static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ TargetTransformInfo *TTI, bool &FreeInLoop) {
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+ bool IsFree = isFreeInLoop(I, CurLoop, TTI);
+ for (const User *U : I.users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (const PHINode *PN = dyn_cast<PHINode>(UI)) {
+ const BasicBlock *BB = PN->getParent();
+ // We cannot sink uses in catchswitches.
+ if (isa<CatchSwitchInst>(BB->getTerminator()))
+ return false;
+
+ // We need to sink a callsite to a unique funclet. Avoid sinking if the
+ // phi use is too muddled.
+ if (isa<CallInst>(I))
+ if (!BlockColors.empty() &&
+ BlockColors.find(const_cast<BasicBlock *>(BB))->second.size() != 1)
+ return false;
+ }
+
+ if (CurLoop->contains(UI)) {
+ if (IsFree) {
+ FreeInLoop = true;
+ continue;
+ }
+ return false;
+ }
+ }
+ return true;
+}
+
+static Instruction *cloneInstructionInExitBlock(
+ Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
+ const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) {
+ Instruction *New;
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+
+ // Sinking call-sites need to be handled differently from other
+ // instructions. The cloned call-site needs a funclet bundle operand
+ // appropriate for its location in the CFG.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (unsigned BundleIdx = 0, BundleEnd = CI->getNumOperandBundles();
+ BundleIdx != BundleEnd; ++BundleIdx) {
+ OperandBundleUse Bundle = CI->getOperandBundleAt(BundleIdx);
+ if (Bundle.getTagID() == LLVMContext::OB_funclet)
+ continue;
+
+ OpBundles.emplace_back(Bundle);
+ }
+
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(&ExitBlock)->second;
+ assert(CV.size() == 1 && "non-unique color for exit block!");
+ BasicBlock *BBColor = CV.front();
+ Instruction *EHPad = BBColor->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+
+ New = CallInst::Create(CI, OpBundles);
+ } else {
+ New = I.clone();
+ }
+
+ ExitBlock.getInstList().insert(ExitBlock.getFirstInsertionPt(), New);
+ if (!I.getName().empty())
+ New->setName(I.getName() + ".le");
+
+ if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
+ // Create a new MemoryAccess and let MemorySSA set its defining access.
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ New, nullptr, New->getParent(), MemorySSA::Beginning);
+ if (NewMemAcc) {
+ if (auto *MemDef = dyn_cast<MemoryDef>(NewMemAcc))
+ MSSAU->insertDef(MemDef, /*RenameUses=*/true);
+ else {
+ auto *MemUse = cast<MemoryUse>(NewMemAcc);
+ MSSAU->insertUse(MemUse, /*RenameUses=*/true);
+ }
+ }
+ }
+
+ // Build LCSSA PHI nodes for any in-loop operands. Note that this is
+ // particularly cheap because we can rip off the PHI node that we're
+ // replacing for the number and blocks of the predecessors.
+ // OPT: If this shows up in a profile, we can instead finish sinking all
+ // invariant instructions, and then walk their operands to re-establish
+ // LCSSA. That will eliminate creating PHI nodes just to nuke them when
+ // sinking bottom-up.
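+  // For example (illustrative): if the clone uses an in-loop value %op, an
+  // LCSSA PHI such as
+  //   %op.lcssa = phi i32 [ %op, %pred1 ], [ %op, %pred2 ]
+  // is created at the top of the exit block and the clone's operand is rewired
+  // to it, keeping the function in LCSSA form.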
+ for (User::op_iterator OI = New->op_begin(), OE = New->op_end(); OI != OE;
+ ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(*OI))
+ if (Loop *OLoop = LI->getLoopFor(OInst->getParent()))
+ if (!OLoop->contains(&PN)) {
+ PHINode *OpPN =
+ PHINode::Create(OInst->getType(), PN.getNumIncomingValues(),
+ OInst->getName() + ".lcssa", &ExitBlock.front());
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ OpPN->addIncoming(OInst, PN.getIncomingBlock(i));
+ *OI = OpPN;
+ }
+ return New;
+}
+
+static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo,
+ AliasSetTracker *AST, MemorySSAUpdater *MSSAU) {
+ if (AST)
+ AST->deleteValue(&I);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(&I);
+ SafetyInfo.removeInstruction(&I);
+ I.eraseFromParent();
+}
+
+static void moveInstructionBefore(Instruction &I, Instruction &Dest,
+ ICFLoopSafetyInfo &SafetyInfo,
+ MemorySSAUpdater *MSSAU,
+ ScalarEvolution *SE) {
+ SafetyInfo.removeInstruction(&I);
+ SafetyInfo.insertInstructionTo(&I, Dest.getParent());
+ I.moveBefore(&Dest);
+ if (MSSAU)
+ if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(&I)))
+ MSSAU->moveToPlace(OldMemAcc, Dest.getParent(),
+ MemorySSA::BeforeTerminator);
+ if (SE)
+ SE->forgetValue(&I);
+}
+
+static Instruction *sinkThroughTriviallyReplaceablePHI(
+ PHINode *TPN, Instruction *I, LoopInfo *LI,
+ SmallDenseMap<BasicBlock *, Instruction *, 32> &SunkCopies,
+ const LoopSafetyInfo *SafetyInfo, const Loop *CurLoop,
+ MemorySSAUpdater *MSSAU) {
+ assert(isTriviallyReplaceablePHI(*TPN, *I) &&
+ "Expect only trivially replaceable PHI");
+ BasicBlock *ExitBlock = TPN->getParent();
+ Instruction *New;
+ auto It = SunkCopies.find(ExitBlock);
+ if (It != SunkCopies.end())
+ New = It->second;
+ else
+ New = SunkCopies[ExitBlock] = cloneInstructionInExitBlock(
+ *I, *ExitBlock, *TPN, LI, SafetyInfo, MSSAU);
+ return New;
+}
+
+static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
+ BasicBlock *BB = PN->getParent();
+ if (!BB->canSplitPredecessors())
+ return false;
+ // It's not impossible to split EHPad blocks, but if BlockColors already exist
+  // it requires updating BlockColors for all offspring blocks accordingly. By
+  // skipping such a corner case, we can make updating BlockColors after splitting
+ // predecessor fairly simple.
+ if (!SafetyInfo->getBlockColors().empty() && BB->getFirstNonPHI()->isEHPad())
+ return false;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *BBPred = *PI;
+ if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
+ isa<CallBrInst>(BBPred->getTerminator()))
+ return false;
+ }
+ return true;
+}
+
+static void splitPredecessorsOfLoopExit(PHINode *PN, DominatorTree *DT,
+ LoopInfo *LI, const Loop *CurLoop,
+ LoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU) {
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+ BasicBlock *ExitBB = PN->getParent();
+ assert(ExitBlockSet.count(ExitBB) && "Expect the PHI is in an exit block.");
+
+  // Split predecessors of the loop exit so that instructions in the loop are
+ // exposed to exit blocks through trivially replaceable PHIs while keeping the
+ // loop in the canonical form where each predecessor of each exit block should
+ // be contained within the loop. For example, this will convert the loop below
+ // from
+ //
+ // LB1:
+ // %v1 =
+ // br %LE, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE, %LB1
+ // LE:
+ // %p = phi [%v1, %LB1], [%v2, %LB2] <-- non-trivially replaceable
+ //
+ // to
+ //
+ // LB1:
+ // %v1 =
+ // br %LE.split, %LB2
+ // LB2:
+ // %v2 =
+ // br %LE.split2, %LB1
+ // LE.split:
+ // %p1 = phi [%v1, %LB1] <-- trivially replaceable
+ // br %LE
+ // LE.split2:
+ // %p2 = phi [%v2, %LB2] <-- trivially replaceable
+ // br %LE
+ // LE:
+ // %p = phi [%p1, %LE.split], [%p2, %LE.split2]
+ //
+ const auto &BlockColors = SafetyInfo->getBlockColors();
+ SmallSetVector<BasicBlock *, 8> PredBBs(pred_begin(ExitBB), pred_end(ExitBB));
+ while (!PredBBs.empty()) {
+ BasicBlock *PredBB = *PredBBs.begin();
+ assert(CurLoop->contains(PredBB) &&
+ "Expect all predecessors are in the loop");
+ if (PN->getBasicBlockIndex(PredBB) >= 0) {
+ BasicBlock *NewPred = SplitBlockPredecessors(
+ ExitBB, PredBB, ".split.loop.exit", DT, LI, MSSAU, true);
+ // Since we do not allow splitting EH-block with BlockColors in
+ // canSplitPredecessors(), we can simply assign predecessor's color to
+ // the new block.
+ if (!BlockColors.empty())
+ // Grab a reference to the ColorVector to be inserted before getting the
+ // reference to the vector we are copying because inserting the new
+ // element in BlockColors might cause the map to be reallocated.
+ SafetyInfo->copyColors(NewPred, PredBB);
+ }
+ PredBBs.remove(PredBB);
+ }
+}
+
+/// When an instruction is found to only be used outside of the loop, this
+/// function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
BlockFrequencyInfo *BFI, const Loop *CurLoop,
ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU,
OptimizationRemarkEmitter *ORE) {
- LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
- << "sinking " << ore::NV("Inst", &I);
- });
- bool Changed = false;
- if (isa<LoadInst>(I))
- ++NumMovedLoads;
- else if (isa<CallInst>(I))
- ++NumMovedCalls;
- ++NumSunk;
-
- // Iterate over users to be ready for actual sinking. Replace uses in (or via)
- // unreachable blocks with undef and make all user PHIs trivially replaceable.
- SmallPtrSet<Instruction *, 8> VisitedUsers;
- for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
- auto *User = cast<Instruction>(*UI);
- Use &U = UI.getUse();
- ++UI;
-
- if (VisitedUsers.count(User) || CurLoop->contains(User))
- continue;
-
- if (!DT->isReachableFromEntry(User->getParent())) {
- U = UndefValue::get(I.getType());
- Changed = true;
- continue;
- }
-
- // The user must be a PHI node.
- PHINode *PN = cast<PHINode>(User);
-
- // Surprisingly, instructions can be used outside of loops without any
- // exits. This can only happen in PHI nodes if the incoming block is
- // unreachable.
- BasicBlock *BB = PN->getIncomingBlock(U);
- if (!DT->isReachableFromEntry(BB)) {
- U = UndefValue::get(I.getType());
- Changed = true;
- continue;
- }
-
- VisitedUsers.insert(PN);
- if (isTriviallyReplaceablePHI(*PN, I))
- continue;
-
- if (!canSplitPredecessors(PN, SafetyInfo))
- return Changed;
-
- // Split predecessors of the PHI so that we can make users trivially
- // replaceable.
- splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU);
-
- // Should rebuild the iterators, as they may be invalidated by
- // splitPredecessorsOfLoopExit().
- UI = I.user_begin();
- UE = I.user_end();
- }
-
- if (VisitedUsers.empty())
- return Changed;
-
-#ifndef NDEBUG
- SmallVector<BasicBlock *, 32> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
- ExitBlocks.end());
-#endif
-
- // Clones of this instruction. Don't create more than one per exit block!
- SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
-
- // If this instruction is only used outside of the loop, then all users are
- // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
- // the instruction.
+ LLVM_DEBUG(dbgs() << "LICM sinking instruction: " << I << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "InstSunk", &I)
+ << "sinking " << ore::NV("Inst", &I);
+ });
+ bool Changed = false;
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumSunk;
+
+  // Iterate over users to be ready for actual sinking. Replace uses in (or via)
+ // unreachable blocks with undef and make all user PHIs trivially replaceable.
+ SmallPtrSet<Instruction *, 8> VisitedUsers;
+ for (Value::user_iterator UI = I.user_begin(), UE = I.user_end(); UI != UE;) {
+ auto *User = cast<Instruction>(*UI);
+ Use &U = UI.getUse();
+ ++UI;
+
+ if (VisitedUsers.count(User) || CurLoop->contains(User))
+ continue;
+
+ if (!DT->isReachableFromEntry(User->getParent())) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ // The user must be a PHI node.
+ PHINode *PN = cast<PHINode>(User);
+
+ // Surprisingly, instructions can be used outside of loops without any
+ // exits. This can only happen in PHI nodes if the incoming block is
+ // unreachable.
+ BasicBlock *BB = PN->getIncomingBlock(U);
+ if (!DT->isReachableFromEntry(BB)) {
+ U = UndefValue::get(I.getType());
+ Changed = true;
+ continue;
+ }
+
+ VisitedUsers.insert(PN);
+ if (isTriviallyReplaceablePHI(*PN, I))
+ continue;
+
+ if (!canSplitPredecessors(PN, SafetyInfo))
+ return Changed;
+
+ // Split predecessors of the PHI so that we can make users trivially
+ // replaceable.
+ splitPredecessorsOfLoopExit(PN, DT, LI, CurLoop, SafetyInfo, MSSAU);
+
+ // Should rebuild the iterators, as they may be invalidated by
+ // splitPredecessorsOfLoopExit().
+ UI = I.user_begin();
+ UE = I.user_end();
+ }
+
+ if (VisitedUsers.empty())
+ return Changed;
+
+#ifndef NDEBUG
+ SmallVector<BasicBlock *, 32> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 32> ExitBlockSet(ExitBlocks.begin(),
+ ExitBlocks.end());
+#endif
+
+ // Clones of this instruction. Don't create more than one per exit block!
+ SmallDenseMap<BasicBlock *, Instruction *, 32> SunkCopies;
+
+ // If this instruction is only used outside of the loop, then all users are
+ // PHI nodes in exit blocks due to LCSSA form. Just RAUW them with clones of
+ // the instruction.
// First check if I is worth sinking for all uses. Sink only when it is worth
// across all uses.
- SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
+ SmallSetVector<User*, 8> Users(I.user_begin(), I.user_end());
SmallVector<PHINode *, 8> ExitPNs;
- for (auto *UI : Users) {
- auto *User = cast<Instruction>(UI);
-
- if (CurLoop->contains(User))
- continue;
-
- PHINode *PN = cast<PHINode>(User);
- assert(ExitBlockSet.count(PN->getParent()) &&
- "The LCSSA PHI is not in an exit block!");
+ for (auto *UI : Users) {
+ auto *User = cast<Instruction>(UI);
+
+ if (CurLoop->contains(User))
+ continue;
+
+ PHINode *PN = cast<PHINode>(User);
+ assert(ExitBlockSet.count(PN->getParent()) &&
+ "The LCSSA PHI is not in an exit block!");
if (!worthSinkOrHoistInst(I, PN->getParent(), ORE, BFI)) {
return Changed;
}
@@ -1724,622 +1724,622 @@ static bool sink(Instruction &I, LoopInfo *LI, DominatorTree *DT,
for (auto *PN : ExitPNs) {
- // The PHI must be trivially replaceable.
- Instruction *New = sinkThroughTriviallyReplaceablePHI(
- PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
- PN->replaceAllUsesWith(New);
- eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr);
- Changed = true;
- }
- return Changed;
-}
-
-/// When an instruction is found to only use loop invariant operands and it is
-/// safe to hoist, this function is called to do the dirty work.
-///
-static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
- BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE) {
+ // The PHI must be trivially replaceable.
+ Instruction *New = sinkThroughTriviallyReplaceablePHI(
+ PN, &I, LI, SunkCopies, SafetyInfo, CurLoop, MSSAU);
+ PN->replaceAllUsesWith(New);
+ eraseInstruction(*PN, *SafetyInfo, nullptr, nullptr);
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// When an instruction that only uses loop-invariant operands is found to be
+/// safe to hoist, this function is called to do the dirty work.
+///
+static void hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
+ BasicBlock *Dest, ICFLoopSafetyInfo *SafetyInfo,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE) {
LLVM_DEBUG(dbgs() << "LICM hoisting to " << Dest->getNameOrAsOperand() << ": "
<< I << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
- << ore::NV("Inst", &I);
- });
-
- // Metadata can be dependent on conditions we are hoisting above.
- // Conservatively strip all metadata on the instruction unless we were
- // guaranteed to execute I if we entered the loop, in which case the metadata
- // is valid in the loop preheader.
- if (I.hasMetadataOtherThanDebugLoc() &&
- // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
- // time in isGuaranteedToExecute if we don't actually have anything to
- // drop. It is a compile time optimization, not required for correctness.
- !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
- I.dropUnknownNonDebugMetadata();
-
- if (isa<PHINode>(I))
- // Move the new node to the end of the phi list in the destination block.
- moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE);
- else
- // Move the new node to the destination block, before its terminator.
- moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE);
-
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Hoisted", &I) << "hoisting "
+ << ore::NV("Inst", &I);
+ });
+
+ // Metadata can be dependent on conditions we are hoisting above.
+ // Conservatively strip all metadata on the instruction unless we were
+ // guaranteed to execute I if we entered the loop, in which case the metadata
+ // is valid in the loop preheader.
+ if (I.hasMetadataOtherThanDebugLoc() &&
+ // The check on hasMetadataOtherThanDebugLoc is to prevent us from burning
+ // time in isGuaranteedToExecute if we don't actually have anything to
+ // drop. It is a compile time optimization, not required for correctness.
+ !SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop))
+ I.dropUnknownNonDebugMetadata();
+
+ if (isa<PHINode>(I))
+ // Move the new node to the end of the phi list in the destination block.
+ moveInstructionBefore(I, *Dest->getFirstNonPHI(), *SafetyInfo, MSSAU, SE);
+ else
+ // Move the new node to the destination block, before its terminator.
+ moveInstructionBefore(I, *Dest->getTerminator(), *SafetyInfo, MSSAU, SE);
+
I.updateLocationAfterHoist();
-
- if (isa<LoadInst>(I))
- ++NumMovedLoads;
- else if (isa<CallInst>(I))
- ++NumMovedCalls;
- ++NumHoisted;
-}
-
-/// Only sink or hoist an instruction if it is not a trapping instruction,
-/// or if the instruction is known not to trap when moved to the preheader,
-/// or if it is a trapping instruction and is guaranteed to execute.
-static bool isSafeToExecuteUnconditionally(Instruction &Inst,
- const DominatorTree *DT,
- const Loop *CurLoop,
- const LoopSafetyInfo *SafetyInfo,
- OptimizationRemarkEmitter *ORE,
- const Instruction *CtxI) {
- if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
- return true;
-
- bool GuaranteedToExecute =
- SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
-
- if (!GuaranteedToExecute) {
- auto *LI = dyn_cast<LoadInst>(&Inst);
- if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
- << "failed to hoist load with loop-invariant address "
- "because load is conditionally executed";
- });
- }
-
- return GuaranteedToExecute;
-}
-
-namespace {
-class LoopPromoter : public LoadAndStorePromoter {
- Value *SomePtr; // Designated pointer to store to.
- const SmallSetVector<Value *, 8> &PointerMustAliases;
- SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
- SmallVectorImpl<Instruction *> &LoopInsertPts;
- SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
- PredIteratorCache &PredCache;
+
+ if (isa<LoadInst>(I))
+ ++NumMovedLoads;
+ else if (isa<CallInst>(I))
+ ++NumMovedCalls;
+ ++NumHoisted;
+}
+
+/// Only sink or hoist an instruction if it is not a trapping instruction,
+/// or if the instruction is known not to trap when moved to the preheader,
+/// or if it is a trapping instruction and is guaranteed to execute.
+static bool isSafeToExecuteUnconditionally(Instruction &Inst,
+ const DominatorTree *DT,
+ const Loop *CurLoop,
+ const LoopSafetyInfo *SafetyInfo,
+ OptimizationRemarkEmitter *ORE,
+ const Instruction *CtxI) {
+ if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT))
+ return true;
+
+ bool GuaranteedToExecute =
+ SafetyInfo->isGuaranteedToExecute(Inst, DT, CurLoop);
+
+ if (!GuaranteedToExecute) {
+ auto *LI = dyn_cast<LoadInst>(&Inst);
+ if (LI && CurLoop->isLoopInvariant(LI->getPointerOperand()))
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "LoadWithLoopInvariantAddressCondExecuted", LI)
+ << "failed to hoist load with loop-invariant address "
+ "because load is conditionally executed";
+ });
+ }
+
+ return GuaranteedToExecute;
+}
+
+namespace {
+class LoopPromoter : public LoadAndStorePromoter {
+ Value *SomePtr; // Designated pointer to store to.
+ const SmallSetVector<Value *, 8> &PointerMustAliases;
+ SmallVectorImpl<BasicBlock *> &LoopExitBlocks;
+ SmallVectorImpl<Instruction *> &LoopInsertPts;
+ SmallVectorImpl<MemoryAccess *> &MSSAInsertPts;
+ PredIteratorCache &PredCache;
AliasSetTracker *AST;
- MemorySSAUpdater *MSSAU;
- LoopInfo &LI;
- DebugLoc DL;
- int Alignment;
- bool UnorderedAtomic;
- AAMDNodes AATags;
- ICFLoopSafetyInfo &SafetyInfo;
-
- Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (Loop *L = LI.getLoopFor(I->getParent()))
- if (!L->contains(BB)) {
- // We need to create an LCSSA PHI node for the incoming value and
- // store that.
- PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
- I->getName() + ".lcssa", &BB->front());
- for (BasicBlock *Pred : PredCache.get(BB))
- PN->addIncoming(I, Pred);
- return PN;
- }
- return V;
- }
-
-public:
- LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
- const SmallSetVector<Value *, 8> &PMA,
- SmallVectorImpl<BasicBlock *> &LEB,
- SmallVectorImpl<Instruction *> &LIP,
- SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
+ MemorySSAUpdater *MSSAU;
+ LoopInfo &LI;
+ DebugLoc DL;
+ int Alignment;
+ bool UnorderedAtomic;
+ AAMDNodes AATags;
+ ICFLoopSafetyInfo &SafetyInfo;
+
+ Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Loop *L = LI.getLoopFor(I->getParent()))
+ if (!L->contains(BB)) {
+ // We need to create an LCSSA PHI node for the incoming value and
+ // store that.
+ PHINode *PN = PHINode::Create(I->getType(), PredCache.size(BB),
+ I->getName() + ".lcssa", &BB->front());
+ for (BasicBlock *Pred : PredCache.get(BB))
+ PN->addIncoming(I, Pred);
+ return PN;
+ }
+ return V;
+ }
+
+public:
+ LoopPromoter(Value *SP, ArrayRef<const Instruction *> Insts, SSAUpdater &S,
+ const SmallSetVector<Value *, 8> &PMA,
+ SmallVectorImpl<BasicBlock *> &LEB,
+ SmallVectorImpl<Instruction *> &LIP,
+ SmallVectorImpl<MemoryAccess *> &MSSAIP, PredIteratorCache &PIC,
AliasSetTracker *ast, MemorySSAUpdater *MSSAU, LoopInfo &li,
- DebugLoc dl, int alignment, bool UnorderedAtomic,
- const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
- : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
- LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
- PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
- Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
- SafetyInfo(SafetyInfo) {}
-
- bool isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction *> &) const override {
- Value *Ptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- Ptr = LI->getOperand(0);
- else
- Ptr = cast<StoreInst>(I)->getPointerOperand();
- return PointerMustAliases.count(Ptr);
- }
-
- void doExtraRewritesBeforeFinalDeletion() override {
- // Insert stores in the loop exit blocks. Each exit block gets a
- // store of the live-out values that feed them. Since we've already told
- // the SSA updater about the defs in the loop and the preheader
- // definition, it is all set and we can start using it.
- for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = LoopExitBlocks[i];
- Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
- LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
- Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
- Instruction *InsertPos = LoopInsertPts[i];
- StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
- if (UnorderedAtomic)
- NewSI->setOrdering(AtomicOrdering::Unordered);
- NewSI->setAlignment(Align(Alignment));
- NewSI->setDebugLoc(DL);
- if (AATags)
- NewSI->setAAMetadata(AATags);
-
- if (MSSAU) {
- MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
- MemoryAccess *NewMemAcc;
- if (!MSSAInsertPoint) {
- NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
- } else {
- NewMemAcc =
- MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
- }
- MSSAInsertPts[i] = NewMemAcc;
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- // FIXME: true for safety, false may still be correct.
- }
- }
- }
-
- void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
- // Update alias analysis.
+ DebugLoc dl, int alignment, bool UnorderedAtomic,
+ const AAMDNodes &AATags, ICFLoopSafetyInfo &SafetyInfo)
+ : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
+ LoopExitBlocks(LEB), LoopInsertPts(LIP), MSSAInsertPts(MSSAIP),
+ PredCache(PIC), AST(ast), MSSAU(MSSAU), LI(li), DL(std::move(dl)),
+ Alignment(alignment), UnorderedAtomic(UnorderedAtomic), AATags(AATags),
+ SafetyInfo(SafetyInfo) {}
+
+ bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &) const override {
+ Value *Ptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ Ptr = LI->getOperand(0);
+ else
+ Ptr = cast<StoreInst>(I)->getPointerOperand();
+ return PointerMustAliases.count(Ptr);
+ }
+
+ void doExtraRewritesBeforeFinalDeletion() override {
+ // Insert stores in the loop exit blocks. Each exit block gets a
+ // store of the live-out values that feed them. Since we've already told
+ // the SSA updater about the defs in the loop and the preheader
+ // definition, it is all set and we can start using it.
+ for (unsigned i = 0, e = LoopExitBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitBlock = LoopExitBlocks[i];
+ Value *LiveInValue = SSA.GetValueInMiddleOfBlock(ExitBlock);
+ LiveInValue = maybeInsertLCSSAPHI(LiveInValue, ExitBlock);
+ Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
+ Instruction *InsertPos = LoopInsertPts[i];
+ StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+ if (UnorderedAtomic)
+ NewSI->setOrdering(AtomicOrdering::Unordered);
+ NewSI->setAlignment(Align(Alignment));
+ NewSI->setDebugLoc(DL);
+ if (AATags)
+ NewSI->setAAMetadata(AATags);
+
+ if (MSSAU) {
+ MemoryAccess *MSSAInsertPoint = MSSAInsertPts[i];
+ MemoryAccess *NewMemAcc;
+ if (!MSSAInsertPoint) {
+ NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewSI, nullptr, NewSI->getParent(), MemorySSA::Beginning);
+ } else {
+ NewMemAcc =
+ MSSAU->createMemoryAccessAfter(NewSI, nullptr, MSSAInsertPoint);
+ }
+ MSSAInsertPts[i] = NewMemAcc;
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ // FIXME: true for safety, false may still be correct.
+ }
+ }
+ }
+
+ void replaceLoadWithValue(LoadInst *LI, Value *V) const override {
+ // Update alias analysis.
if (AST)
AST->copyValue(LI, V);
- }
- void instructionDeleted(Instruction *I) const override {
- SafetyInfo.removeInstruction(I);
+ }
+ void instructionDeleted(Instruction *I) const override {
+ SafetyInfo.removeInstruction(I);
if (AST)
AST->deleteValue(I);
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- }
-};
-
-
-/// Return true iff we can prove that a caller of this function can not inspect
-/// the contents of the provided object in a well defined program.
-bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
- if (isa<AllocaInst>(Object))
- // Since the alloca goes out of scope, we know the caller can't retain a
- // reference to it and be well defined. Thus, we don't need to check for
- // capture.
- return true;
-
- // For all other objects we need to know that the caller can't possibly
- // have gotten a reference to the object. There are two components of
- // that:
- // 1) Object can't be escaped by this function. This is what
- // PointerMayBeCaptured checks.
- // 2) Object can't have been captured at definition site. For this, we
- // need to know the return value is noalias. At the moment, we use a
- // weaker condition and handle only AllocLikeFunctions (which are
- // known to be noalias). TODO
- return isAllocLikeFn(Object, TLI) &&
- !PointerMayBeCaptured(Object, true, true);
-}
-
-} // namespace
-
-/// Try to promote memory values to scalars by sinking stores out of the
-/// loop and moving loads to before the loop. We do this by looping over
-/// the stores in the loop, looking for stores to Must pointers which are
-/// loop invariant.
-///
-bool llvm::promoteLoopAccessesToScalars(
- const SmallSetVector<Value *, 8> &PointerMustAliases,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- SmallVectorImpl<Instruction *> &InsertPts,
- SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC,
- LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
- Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
- ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
- // Verify inputs.
- assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ }
+};
+
+
+/// Return true iff we can prove that a caller of this function can not inspect
+/// the contents of the provided object in a well defined program.
+bool isKnownNonEscaping(Value *Object, const TargetLibraryInfo *TLI) {
+ if (isa<AllocaInst>(Object))
+ // Since the alloca goes out of scope, we know the caller can't retain a
+ // reference to it and be well defined. Thus, we don't need to check for
+ // capture.
+ return true;
+
+ // For all other objects we need to know that the caller can't possibly
+ // have gotten a reference to the object. There are two components of
+ // that:
+ // 1) Object can't be escaped by this function. This is what
+ // PointerMayBeCaptured checks.
+ // 2) Object can't have been captured at definition site. For this, we
+ // need to know the return value is noalias. At the moment, we use a
+ // weaker condition and handle only AllocLikeFunctions (which are
+ // known to be noalias). TODO
+ return isAllocLikeFn(Object, TLI) &&
+ !PointerMayBeCaptured(Object, true, true);
+}
+
+} // namespace
+
+/// Try to promote memory values to scalars by sinking stores out of the
+/// loop and moving loads to before the loop. We do this by looping over
+/// the stores in the loop, looking for stores to Must pointers which are
+/// loop invariant.
+///
+bool llvm::promoteLoopAccessesToScalars(
+ const SmallSetVector<Value *, 8> &PointerMustAliases,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ SmallVectorImpl<Instruction *> &InsertPts,
+ SmallVectorImpl<MemoryAccess *> &MSSAInsertPts, PredIteratorCache &PIC,
+ LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
+ Loop *CurLoop, AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
+ ICFLoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE) {
+ // Verify inputs.
+ assert(LI != nullptr && DT != nullptr && CurLoop != nullptr &&
SafetyInfo != nullptr &&
- "Unexpected Input to promoteLoopAccessesToScalars");
-
- Value *SomePtr = *PointerMustAliases.begin();
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
-
- // It is not safe to promote a load/store from the loop if the load/store is
- // conditional. For example, turning:
- //
- // for () { if (c) *P += 1; }
- //
- // into:
- //
- // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
- //
- // is not safe, because *P may only be valid to access if 'c' is true.
- //
- // The safety property divides into two parts:
- // p1) The memory may not be dereferenceable on entry to the loop. In this
- // case, we can't insert the required load in the preheader.
- // p2) The memory model does not allow us to insert a store along any dynamic
- // path which did not originally have one.
- //
- // If at least one store is guaranteed to execute, both properties are
- // satisfied, and promotion is legal.
- //
- // This, however, is not a necessary condition. Even if no store/load is
- // guaranteed to execute, we can still establish these properties.
- // We can establish (p1) by proving that hoisting the load into the preheader
- // is safe (i.e. proving dereferenceability on all paths through the loop). We
- // can use any access within the alias set to prove dereferenceability,
- // since they're all must alias.
- //
- // There are two ways to establish (p2):
- // a) Prove the location is thread-local. In this case the memory model
- // requirement does not apply, and stores are safe to insert.
- // b) Prove a store dominates every exit block. In this case, if an exit
- // block is reached, the original dynamic path would have taken us through
- // the store, so inserting a store into the exit block is safe. Note that this
- // is different from the store being guaranteed to execute. For instance,
- // if an exception is thrown on the first iteration of the loop, the original
- // store is never executed, but the exit blocks are not executed either.
-
- bool DereferenceableInPH = false;
- bool SafeToInsertStore = false;
-
- SmallVector<Instruction *, 64> LoopUses;
-
- // We start with an alignment of one and try to find instructions that allow
- // us to prove better alignment.
- Align Alignment;
- // Keep track of which types of access we see
- bool SawUnorderedAtomic = false;
- bool SawNotAtomic = false;
- AAMDNodes AATags;
-
- const DataLayout &MDL = Preheader->getModule()->getDataLayout();
-
- bool IsKnownThreadLocalObject = false;
- if (SafetyInfo->anyBlockMayThrow()) {
- // If a loop can throw, we have to insert a store along each unwind edge.
- // That said, we can't actually make the unwind edge explicit. Therefore,
- // we have to prove that the store is dead along the unwind edge. We do
- // this by proving that the caller can't have a reference to the object
- // after return and thus can't possibly load from the object.
+ "Unexpected Input to promoteLoopAccessesToScalars");
+
+ Value *SomePtr = *PointerMustAliases.begin();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+
+ // It is not safe to promote a load/store from the loop if the load/store is
+ // conditional. For example, turning:
+ //
+ // for () { if (c) *P += 1; }
+ //
+ // into:
+ //
+ // tmp = *P; for () { if (c) tmp +=1; } *P = tmp;
+ //
+ // is not safe, because *P may only be valid to access if 'c' is true.
+ //
+ // The safety property divides into two parts:
+ // p1) The memory may not be dereferenceable on entry to the loop. In this
+ // case, we can't insert the required load in the preheader.
+ // p2) The memory model does not allow us to insert a store along any dynamic
+ // path which did not originally have one.
+ //
+ // If at least one store is guaranteed to execute, both properties are
+ // satisfied, and promotion is legal.
+ //
+ // This, however, is not a necessary condition. Even if no store/load is
+ // guaranteed to execute, we can still establish these properties.
+ // We can establish (p1) by proving that hoisting the load into the preheader
+ // is safe (i.e. proving dereferenceability on all paths through the loop). We
+ // can use any access within the alias set to prove dereferenceability,
+ // since they're all must alias.
+ //
+ // There are two ways to establish (p2):
+ // a) Prove the location is thread-local. In this case the memory model
+ // requirement does not apply, and stores are safe to insert.
+ // b) Prove a store dominates every exit block. In this case, if an exit
+ // block is reached, the original dynamic path would have taken us through
+ // the store, so inserting a store into the exit block is safe. Note that this
+ // is different from the store being guaranteed to execute. For instance,
+ // if an exception is thrown on the first iteration of the loop, the original
+ // store is never executed, but the exit blocks are not executed either.
+
+ bool DereferenceableInPH = false;
+ bool SafeToInsertStore = false;
+
+ SmallVector<Instruction *, 64> LoopUses;
+
+ // We start with an alignment of one and try to find instructions that allow
+ // us to prove better alignment.
+ Align Alignment;
+ // Keep track of which types of access we see
+ bool SawUnorderedAtomic = false;
+ bool SawNotAtomic = false;
+ AAMDNodes AATags;
+
+ const DataLayout &MDL = Preheader->getModule()->getDataLayout();
+
+ bool IsKnownThreadLocalObject = false;
+ if (SafetyInfo->anyBlockMayThrow()) {
+ // If a loop can throw, we have to insert a store along each unwind edge.
+ // That said, we can't actually make the unwind edge explicit. Therefore,
+ // we have to prove that the store is dead along the unwind edge. We do
+ // this by proving that the caller can't have a reference to the object
+ // after return and thus can't possibly load from the object.
Value *Object = getUnderlyingObject(SomePtr);
- if (!isKnownNonEscaping(Object, TLI))
- return false;
- // Subtlety: Alloca's aren't visible to callers, but *are* potentially
- // visible to other threads if captured and used during their lifetimes.
- IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
- }
-
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
- // different sizes. While we are at it, collect alignment and AA info.
- for (Value *ASIV : PointerMustAliases) {
- // Check that all of the pointers in the alias set have the same type. We
- // cannot (yet) promote a memory location that is loaded and stored in
- // different sizes.
- if (SomePtr->getType() != ASIV->getType())
- return false;
-
- for (User *U : ASIV->users()) {
- // Ignore instructions that are outside the loop.
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !CurLoop->contains(UI))
- continue;
-
- // If there is a non-load/store instruction in the loop, we can't promote
- // it.
- if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
- if (!Load->isUnordered())
- return false;
-
- SawUnorderedAtomic |= Load->isAtomic();
- SawNotAtomic |= !Load->isAtomic();
-
- Align InstAlignment = Load->getAlign();
-
- // Note that proving a load safe to speculate requires proving
- // sufficient alignment at the target location. Proving it guaranteed
- // to execute does as well. Thus we can increase our guaranteed
- // alignment as well.
- if (!DereferenceableInPH || (InstAlignment > Alignment))
- if (isSafeToExecuteUnconditionally(*Load, DT, CurLoop, SafetyInfo,
- ORE, Preheader->getTerminator())) {
- DereferenceableInPH = true;
- Alignment = std::max(Alignment, InstAlignment);
- }
- } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
- // Stores *of* the pointer are not interesting, only stores *to* the
- // pointer.
- if (UI->getOperand(1) != ASIV)
- continue;
- if (!Store->isUnordered())
- return false;
-
- SawUnorderedAtomic |= Store->isAtomic();
- SawNotAtomic |= !Store->isAtomic();
-
- // If the store is guaranteed to execute, both properties are satisfied.
- // We may want to check if a store is guaranteed to execute even if we
- // already know that promotion is safe, since it may have higher
- // alignment than any other guaranteed stores, in which case we can
- // raise the alignment on the promoted store.
- Align InstAlignment = Store->getAlign();
-
- if (!DereferenceableInPH || !SafeToInsertStore ||
- (InstAlignment > Alignment)) {
- if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
- DereferenceableInPH = true;
- SafeToInsertStore = true;
- Alignment = std::max(Alignment, InstAlignment);
- }
- }
-
- // If a store dominates all exit blocks, it is safe to sink.
- // As explained above, if an exit block was executed, a dominating
- // store must have been executed at least once, so we are not
- // introducing stores on paths that did not have them.
- // Note that this only looks at explicit exit blocks. If we ever
- // start sinking stores into unwind edges (see above), this will break.
- if (!SafeToInsertStore)
- SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
- return DT->dominates(Store->getParent(), Exit);
- });
-
- // If the store is not guaranteed to execute, we may still get
- // deref info through it.
- if (!DereferenceableInPH) {
- DereferenceableInPH = isDereferenceableAndAlignedPointer(
- Store->getPointerOperand(), Store->getValueOperand()->getType(),
- Store->getAlign(), MDL, Preheader->getTerminator(), DT);
- }
- } else
- return false; // Not a load or store.
-
- // Merge the AA tags.
- if (LoopUses.empty()) {
- // On the first load/store, just take its AA tags.
- UI->getAAMetadata(AATags);
- } else if (AATags) {
- UI->getAAMetadata(AATags, /* Merge = */ true);
- }
-
- LoopUses.push_back(UI);
- }
- }
-
- // If we found both an unordered atomic instruction and a non-atomic memory
- // access, bail. We can't blindly promote non-atomic to atomic since we
- // might not be able to lower the result. We can't downgrade since that
- // would violate memory model. Also, align 0 is an error for atomics.
- if (SawUnorderedAtomic && SawNotAtomic)
- return false;
-
- // If we're inserting an atomic load in the preheader, we must be able to
- // lower it. We're only guaranteed to be able to lower naturally aligned
- // atomics.
- auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
- if (SawUnorderedAtomic &&
- Alignment < MDL.getTypeStoreSize(SomePtrElemType))
- return false;
-
- // If we couldn't prove we can hoist the load, bail.
- if (!DereferenceableInPH)
- return false;
-
- // We know we can hoist the load, but don't have a guaranteed store.
- // Check whether the location is thread-local. If it is, then we can insert
- // stores along paths which originally didn't have them without violating the
- // memory model.
- if (!SafeToInsertStore) {
- if (IsKnownThreadLocalObject)
- SafeToInsertStore = true;
- else {
+ if (!isKnownNonEscaping(Object, TLI))
+ return false;
+ // Subtlety: Alloca's aren't visible to callers, but *are* potentially
+ // visible to other threads if captured and used during their lifetimes.
+ IsKnownThreadLocalObject = !isa<AllocaInst>(Object);
+ }
+
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes. While we are at it, collect alignment and AA info.
+ for (Value *ASIV : PointerMustAliases) {
+ // Check that all of the pointers in the alias set have the same type. We
+ // cannot (yet) promote a memory location that is loaded and stored in
+ // different sizes.
+ if (SomePtr->getType() != ASIV->getType())
+ return false;
+
+ for (User *U : ASIV->users()) {
+ // Ignore instructions that are outside the loop.
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !CurLoop->contains(UI))
+ continue;
+
+ // If there is a non-load/store instruction in the loop, we can't promote
+ // it.
+ if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
+ if (!Load->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Load->isAtomic();
+ SawNotAtomic |= !Load->isAtomic();
+
+ Align InstAlignment = Load->getAlign();
+
+ // Note that proving a load safe to speculate requires proving
+ // sufficient alignment at the target location. Proving it guaranteed
+ // to execute does as well. Thus we can increase our guaranteed
+ // alignment as well.
+ if (!DereferenceableInPH || (InstAlignment > Alignment))
+ if (isSafeToExecuteUnconditionally(*Load, DT, CurLoop, SafetyInfo,
+ ORE, Preheader->getTerminator())) {
+ DereferenceableInPH = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ } else if (const StoreInst *Store = dyn_cast<StoreInst>(UI)) {
+ // Stores *of* the pointer are not interesting, only stores *to* the
+ // pointer.
+ if (UI->getOperand(1) != ASIV)
+ continue;
+ if (!Store->isUnordered())
+ return false;
+
+ SawUnorderedAtomic |= Store->isAtomic();
+ SawNotAtomic |= !Store->isAtomic();
+
+ // If the store is guaranteed to execute, both properties are satisfied.
+ // We may want to check if a store is guaranteed to execute even if we
+ // already know that promotion is safe, since it may have higher
+ // alignment than any other guaranteed stores, in which case we can
+ // raise the alignment on the promoted store.
+ Align InstAlignment = Store->getAlign();
+
+ if (!DereferenceableInPH || !SafeToInsertStore ||
+ (InstAlignment > Alignment)) {
+ if (SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop)) {
+ DereferenceableInPH = true;
+ SafeToInsertStore = true;
+ Alignment = std::max(Alignment, InstAlignment);
+ }
+ }
+
+ // If a store dominates all exit blocks, it is safe to sink.
+ // As explained above, if an exit block was executed, a dominating
+ // store must have been executed at least once, so we are not
+ // introducing stores on paths that did not have them.
+ // Note that this only looks at explicit exit blocks. If we ever
+ // start sinking stores into unwind edges (see above), this will break.
+ if (!SafeToInsertStore)
+ SafeToInsertStore = llvm::all_of(ExitBlocks, [&](BasicBlock *Exit) {
+ return DT->dominates(Store->getParent(), Exit);
+ });
+
+ // If the store is not guaranteed to execute, we may still get
+ // deref info through it.
+ if (!DereferenceableInPH) {
+ DereferenceableInPH = isDereferenceableAndAlignedPointer(
+ Store->getPointerOperand(), Store->getValueOperand()->getType(),
+ Store->getAlign(), MDL, Preheader->getTerminator(), DT);
+ }
+ } else
+ return false; // Not a load or store.
+
+ // Merge the AA tags.
+ if (LoopUses.empty()) {
+ // On the first load/store, just take its AA tags.
+ UI->getAAMetadata(AATags);
+ } else if (AATags) {
+ UI->getAAMetadata(AATags, /* Merge = */ true);
+ }
+
+ LoopUses.push_back(UI);
+ }
+ }
+
+ // If we found both an unordered atomic instruction and a non-atomic memory
+ // access, bail. We can't blindly promote non-atomic to atomic since we
+ // might not be able to lower the result. We can't downgrade since that
+ // would violate memory model. Also, align 0 is an error for atomics.
+ if (SawUnorderedAtomic && SawNotAtomic)
+ return false;
+
+ // If we're inserting an atomic load in the preheader, we must be able to
+ // lower it. We're only guaranteed to be able to lower naturally aligned
+ // atomics.
+ auto *SomePtrElemType = SomePtr->getType()->getPointerElementType();
+ if (SawUnorderedAtomic &&
+ Alignment < MDL.getTypeStoreSize(SomePtrElemType))
+ return false;
+
+ // If we couldn't prove we can hoist the load, bail.
+ if (!DereferenceableInPH)
+ return false;
+
+ // We know we can hoist the load, but don't have a guaranteed store.
+ // Check whether the location is thread-local. If it is, then we can insert
+ // stores along paths which originally didn't have them without violating the
+ // memory model.
+ if (!SafeToInsertStore) {
+ if (IsKnownThreadLocalObject)
+ SafeToInsertStore = true;
+ else {
Value *Object = getUnderlyingObject(SomePtr);
- SafeToInsertStore =
- (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
- !PointerMayBeCaptured(Object, true, true);
- }
- }
-
- // If we've still failed to prove we can sink the store, give up.
- if (!SafeToInsertStore)
- return false;
-
- // Otherwise, this is safe to promote, let's do it!
- LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
- << '\n');
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
- LoopUses[0])
- << "Moving accesses to memory location out of the loop";
- });
- ++NumPromoted;
-
- // Look at all the loop uses, and try to merge their locations.
- std::vector<const DILocation *> LoopUsesLocs;
- for (auto U : LoopUses)
- LoopUsesLocs.push_back(U->getDebugLoc().get());
- auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
-
- // We use the SSAUpdater interface to insert phi nodes as required.
- SmallVector<PHINode *, 16> NewPHIs;
- SSAUpdater SSA(&NewPHIs);
- LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
+ SafeToInsertStore =
+ (isAllocLikeFn(Object, TLI) || isa<AllocaInst>(Object)) &&
+ !PointerMayBeCaptured(Object, true, true);
+ }
+ }
+
+ // If we've still failed to prove we can sink the store, give up.
+ if (!SafeToInsertStore)
+ return false;
+
+ // Otherwise, this is safe to promote, let's do it!
+ LLVM_DEBUG(dbgs() << "LICM: Promoting value stored to in loop: " << *SomePtr
+ << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "PromoteLoopAccessesToScalar",
+ LoopUses[0])
+ << "Moving accesses to memory location out of the loop";
+ });
+ ++NumPromoted;
+
+ // Look at all the loop uses, and try to merge their locations.
+ std::vector<const DILocation *> LoopUsesLocs;
+ for (auto U : LoopUses)
+ LoopUsesLocs.push_back(U->getDebugLoc().get());
+ auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
+
+ // We use the SSAUpdater interface to insert phi nodes as required.
+ SmallVector<PHINode *, 16> NewPHIs;
+ SSAUpdater SSA(&NewPHIs);
+ LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
InsertPts, MSSAInsertPts, PIC, CurAST, MSSAU, *LI, DL,
- Alignment.value(), SawUnorderedAtomic, AATags,
- *SafetyInfo);
-
- // Set up the preheader to have a definition of the value. It is the live-out
- // value from the preheader that uses in the loop will use.
- LoadInst *PreheaderLoad = new LoadInst(
- SomePtr->getType()->getPointerElementType(), SomePtr,
- SomePtr->getName() + ".promoted", Preheader->getTerminator());
- if (SawUnorderedAtomic)
- PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
- PreheaderLoad->setAlignment(Alignment);
- PreheaderLoad->setDebugLoc(DebugLoc());
- if (AATags)
- PreheaderLoad->setAAMetadata(AATags);
- SSA.AddAvailableValue(Preheader, PreheaderLoad);
-
- if (MSSAU) {
- MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
- PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
- MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
- MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- // Rewrite all the loads in the loop and remember all the definitions from
- // stores in the loop.
- Promoter.run(LoopUses);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- // If the SSAUpdater didn't use the load in the preheader, just zap it now.
- if (PreheaderLoad->use_empty())
- eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU);
-
- return true;
-}
-
-/// Returns an owning pointer to an alias set which incorporates aliasing info
-/// from L and all subloops of L.
-std::unique_ptr<AliasSetTracker>
-LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
- AAResults *AA) {
- auto CurAST = std::make_unique<AliasSetTracker>(*AA);
-
- // Add everything from all the sub loops.
- for (Loop *InnerL : L->getSubLoops())
- for (BasicBlock *BB : InnerL->blocks())
- CurAST->add(*BB);
-
- // And merge in this loop (without anything from inner loops).
- for (BasicBlock *BB : L->blocks())
- if (LI->getLoopFor(BB) == L)
- CurAST->add(*BB);
-
- return CurAST;
-}
-
-std::unique_ptr<AliasSetTracker>
-LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
- Loop *L, AAResults *AA, MemorySSAUpdater *MSSAU) {
- auto *MSSA = MSSAU->getMemorySSA();
- auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
- CurAST->addAllInstructionsInLoopUsingMSSA();
- return CurAST;
-}
-
-static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
- AliasSetTracker *CurAST, Loop *CurLoop,
- AAResults *AA) {
- // First check to see if any of the basic blocks in CurLoop invalidate *V.
- bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
-
- if (!isInvalidatedAccordingToAST || !LICMN2Theshold)
- return isInvalidatedAccordingToAST;
-
- // Check with a diagnostic analysis if we can refine the information above.
- // This is to identify the limitations of using the AST.
- // The alias set mechanism used by LICM has a major weakness in that it
- // combines all things which may alias into a single set *before* asking
- // modref questions. As a result, a single readonly call within a loop will
- // collapse all loads and stores into a single alias set and report
- // invalidation if the loop contains any store. For example, readonly calls
- // with deopt states have this form and create a general alias set with all
- // loads and stores. In order to get any LICM in loops containing possible
- // deopt states we need a more precise invalidation of checking the mod ref
- // info of each instruction within the loop and LI. This has a complexity of
- // O(N^2), so currently, it is used only as a diagnostic tool since the
- // default value of LICMN2Threshold is zero.
-
- // Don't look at nested loops.
- if (CurLoop->begin() != CurLoop->end())
- return true;
-
- int N = 0;
- for (BasicBlock *BB : CurLoop->getBlocks())
- for (Instruction &I : *BB) {
- if (N >= LICMN2Theshold) {
- LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- N++;
- auto Res = AA->getModRefInfo(&I, MemLoc);
- if (isModSet(Res)) {
- LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for "
- << *(MemLoc.Ptr) << "\n");
- return true;
- }
- }
- LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n");
- return false;
-}
-
+ Alignment.value(), SawUnorderedAtomic, AATags,
+ *SafetyInfo);
+
+ // Set up the preheader to have a definition of the value. It is the live-out
+ // value from the preheader that uses in the loop will use.
+ LoadInst *PreheaderLoad = new LoadInst(
+ SomePtr->getType()->getPointerElementType(), SomePtr,
+ SomePtr->getName() + ".promoted", Preheader->getTerminator());
+ if (SawUnorderedAtomic)
+ PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DebugLoc());
+ if (AATags)
+ PreheaderLoad->setAAMetadata(AATags);
+ SSA.AddAvailableValue(Preheader, PreheaderLoad);
+
+ if (MSSAU) {
+ MemoryAccess *PreheaderLoadMemoryAccess = MSSAU->createMemoryAccessInBB(
+ PreheaderLoad, nullptr, PreheaderLoad->getParent(), MemorySSA::End);
+ MemoryUse *NewMemUse = cast<MemoryUse>(PreheaderLoadMemoryAccess);
+ MSSAU->insertUse(NewMemUse, /*RenameUses=*/true);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ // Rewrite all the loads in the loop and remember all the definitions from
+ // stores in the loop.
+ Promoter.run(LoopUses);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ // If the SSAUpdater didn't use the load in the preheader, just zap it now.
+ if (PreheaderLoad->use_empty())
+ eraseInstruction(*PreheaderLoad, *SafetyInfo, CurAST, MSSAU);
+
+ return true;
+}
+
+/// Returns an owning pointer to an alias set which incorporates aliasing info
+/// from L and all subloops of L.
+std::unique_ptr<AliasSetTracker>
+LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
+ AAResults *AA) {
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA);
+
+ // Add everything from all the sub loops.
+ for (Loop *InnerL : L->getSubLoops())
+ for (BasicBlock *BB : InnerL->blocks())
+ CurAST->add(*BB);
+
+ // And merge in this loop (without anything from inner loops).
+ for (BasicBlock *BB : L->blocks())
+ if (LI->getLoopFor(BB) == L)
+ CurAST->add(*BB);
+
+ return CurAST;
+}
+
+std::unique_ptr<AliasSetTracker>
+LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
+ Loop *L, AAResults *AA, MemorySSAUpdater *MSSAU) {
+ auto *MSSA = MSSAU->getMemorySSA();
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
+ CurAST->addAllInstructionsInLoopUsingMSSA();
+ return CurAST;
+}
+
+static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
+ AliasSetTracker *CurAST, Loop *CurLoop,
+ AAResults *AA) {
+ // First check to see if any of the basic blocks in CurLoop invalidate *V.
+ bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
+
+ if (!isInvalidatedAccordingToAST || !LICMN2Theshold)
+ return isInvalidatedAccordingToAST;
+
+ // Check with a diagnostic analysis if we can refine the information above.
+ // This is to identify the limitations of using the AST.
+ // The alias set mechanism used by LICM has a major weakness in that it
+ // combines all things which may alias into a single set *before* asking
+ // modref questions. As a result, a single readonly call within a loop will
+ // collapse all loads and stores into a single alias set and report
+ // invalidation if the loop contains any store. For example, readonly calls
+ // with deopt states have this form and create a general alias set with all
+ // loads and stores. In order to get any LICM in loops containing possible
+ // deopt states we need a more precise invalidation of checking the mod ref
+ // info of each instruction within the loop and LI. This has a complexity of
+ // O(N^2), so currently, it is used only as a diagnostic tool since the
+ // default value of LICMN2Threshold is zero.
+
+ // Don't look at nested loops.
+ if (CurLoop->begin() != CurLoop->end())
+ return true;
+
+ int N = 0;
+ for (BasicBlock *BB : CurLoop->getBlocks())
+ for (Instruction &I : *BB) {
+ if (N >= LICMN2Theshold) {
+ LLVM_DEBUG(dbgs() << "Alasing N2 threshold exhausted for "
+ << *(MemLoc.Ptr) << "\n");
+ return true;
+ }
+ N++;
+ auto Res = AA->getModRefInfo(&I, MemLoc);
+ if (isModSet(Res)) {
+ LLVM_DEBUG(dbgs() << "Aliasing failed on " << I << " for "
+ << *(MemLoc.Ptr) << "\n");
+ return true;
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Aliasing okay for " << *(MemLoc.Ptr) << "\n");
+ return false;
+}
+
bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
Loop *CurLoop, Instruction &I,
SinkAndHoistLICMFlags &Flags) {
- // For hoisting, use the walker to determine safety
+ // For hoisting, use the walker to determine safety
if (!Flags.getIsSink()) {
- MemoryAccess *Source;
- // See declaration of SetLicmMssaOptCap for usage details.
+ MemoryAccess *Source;
+ // See declaration of SetLicmMssaOptCap for usage details.
if (Flags.tooManyClobberingCalls())
- Source = MU->getDefiningAccess();
- else {
- Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(MU);
+ Source = MU->getDefiningAccess();
+ else {
+ Source = MSSA->getSkipSelfWalker()->getClobberingMemoryAccess(MU);
Flags.incrementClobberingCalls();
- }
- return !MSSA->isLiveOnEntryDef(Source) &&
- CurLoop->contains(Source->getBlock());
- }
-
- // For sinking, we'd need to check all Defs below this use. The getClobbering
- // call will look on the backedge of the loop, but will check aliasing with
- // the instructions on the previous iteration.
- // For example:
- // for (i ... )
- // load a[i] ( Use (LoE)
- // store a[i] ( 1 = Def (2), with 2 = Phi for the loop.
- // i++;
- // The load sees no clobbering inside the loop, as the backedge alias check
- // does phi translation, and will check aliasing against store a[i-1].
- // However sinking the load outside the loop, below the store is incorrect.
-
- // For now, only sink if there are no Defs in the loop, and the existing ones
- // precede the use and are in the same block.
- // FIXME: Increase precision: Safe to sink if Use post dominates the Def;
- // needs PostDominatorTreeAnalysis.
- // FIXME: More precise: no Defs that alias this Use.
+ }
+ return !MSSA->isLiveOnEntryDef(Source) &&
+ CurLoop->contains(Source->getBlock());
+ }
+
+ // For sinking, we'd need to check all Defs below this use. The getClobbering
+ // call will look on the backedge of the loop, but will check aliasing with
+ // the instructions on the previous iteration.
+ // For example:
+ // for (i ... )
+ // load a[i] ( Use (LoE)
+ // store a[i] ( 1 = Def (2), with 2 = Phi for the loop.
+ // i++;
+ // The load sees no clobbering inside the loop, as the backedge alias check
+ // does phi translation, and will check aliasing against store a[i-1].
+ // However sinking the load outside the loop, below the store is incorrect.
+
+ // For now, only sink if there are no Defs in the loop, and the existing ones
+ // precede the use and are in the same block.
+ // FIXME: Increase precision: Safe to sink if Use post dominates the Def;
+ // needs PostDominatorTreeAnalysis.
+ // FIXME: More precise: no Defs that alias this Use.
if (Flags.tooManyMemoryAccesses())
- return true;
- for (auto *BB : CurLoop->getBlocks())
+ return true;
+ for (auto *BB : CurLoop->getBlocks())
if (pointerInvalidatedByBlockWithMSSA(*BB, *MSSA, *MU))
return true;
// When sinking, the source block may not be part of the loop so check it.
if (!CurLoop->contains(&I))
return pointerInvalidatedByBlockWithMSSA(*I.getParent(), *MSSA, *MU);
- return false;
-}
-
+ return false;
+}
+
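The sinking restriction that the comments in pointerInvalidatedByLoopWithMSSA describe can be seen in a small C-style sketch (hypothetical names, not code from this patch): the loaded value is only used after the loop, so the load looks like a sinking candidate, yet moving it below the loop would observe the values written by the store.

    int lastOriginalElement(int *A, int N) {
      int Last = 0;
      for (int I = 0; I < N; ++I) {
        Last = A[I]; // Use: reads the original A[I]; the backedge clobber check
                     // phi-translates to A[I-1] and sees no conflict.
        A[I] = I;    // Def: overwrites the slot the load just read.
      }
      return Last;   // Sinking the load to this point would read the stored
                     // A[N-1] (that is, N-1) instead of the original value.
    }
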
bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
MemoryUse &MU) {
if (const auto *Accesses = MSSA.getBlockDefs(&BB))
@@ -2350,10 +2350,10 @@ bool pointerInvalidatedByBlockWithMSSA(BasicBlock &BB, MemorySSA &MSSA,
return false;
}
-/// Little predicate that returns true if the specified basic block is in
-/// a subloop of the current one, not the current one itself.
-///
-static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
- assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
- return LI->getLoopFor(BB) != CurLoop;
-}
+/// Little predicate that returns true if the specified basic block is in
+/// a subloop of the current one, not the current one itself.
+///
+static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI) {
+ assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+ return LI->getLoopFor(BB) != CurLoop;
+}
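As a source-level illustration of the transformation described in the comments of promoteLoopAccessesToScalars above, here is a minimal sketch in C++ (function and variable names are illustrative, and the snippet is not part of this patch). It shows the unconditional variant of the "*P += 1" example from those comments, the case where promotion is legal because the store is guaranteed to execute whenever the loop is entered:

    // Before promotion: *P is loaded and stored on every iteration.
    void before(int *P, int N) {
      for (int I = 0; I < N; ++I)
        *P += 1;
    }

    // After promotion: the load is hoisted into the preheader, the loop body
    // operates on a scalar, and a single store is sunk into the exit block.
    // The guard mirrors the fact that the preheader and exit block are only
    // reached when the loop actually runs; without it, a store would be
    // introduced on a path that originally had none, violating property (p2).
    void after(int *P, int N) {
      if (N > 0) {
        int Tmp = *P;
        for (int I = 0; I < N; ++I)
          Tmp += 1;
        *P = Tmp;
      }
    }

The conditional form "if (c) *P += 1;" must not be rewritten this way, for the reasons spelled out in those comments.
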
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
index 32e6cd4e93..1c3ff1a61b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopAccessAnalysisPrinter.cpp
@@ -1,24 +1,24 @@
-//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-accesses"
-
-PreservedAnalyses
-LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &) {
- Function &F = *L.getHeader()->getParent();
- auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
- OS << "Loop access info in function '" << F.getName() << "':\n";
- OS.indent(2) << L.getHeader()->getName() << ":\n";
- LAI.print(OS, 4);
- return PreservedAnalyses::all();
-}
+//===- LoopAccessAnalysisPrinter.cpp - Loop Access Analysis Printer --------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopAccessAnalysisPrinter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-accesses"
+
+PreservedAnalyses
+LoopAccessInfoPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &) {
+ Function &F = *L.getHeader()->getParent();
+ auto &LAI = AM.getResult<LoopAccessAnalysis>(L, AR);
+ OS << "Loop access info in function '" << F.getName() << "':\n";
+ OS.indent(2) << L.getHeader()->getName() << ":\n";
+ LAI.print(OS, 4);
+ return PreservedAnalyses::all();
+}
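For reference, the run() method above assembles its report from the two literal format strings plus LoopAccessInfo::print. Assuming a function named foo whose loop header block is for.body (both names hypothetical), the emitted text would begin roughly as follows, with the per-loop access report indented underneath:

    Loop access info in function 'foo':
      for.body:
        <output of LAI.print(OS, 4)>
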
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 63b79c9caa..45cdcb2f37 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -1,419 +1,419 @@
-//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements a Loop Data Prefetching Pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
-#include "llvm/InitializePasses.h"
-
-#define DEBUG_TYPE "loop-data-prefetch"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-// By default, we limit this to creating 16 PHIs (which is a little over half
-// of the allocatable register set).
-static cl::opt<bool>
-PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
- cl::desc("Prefetch write addresses"));
-
-static cl::opt<unsigned>
- PrefetchDistance("prefetch-distance",
- cl::desc("Number of instructions to prefetch ahead"),
- cl::Hidden);
-
-static cl::opt<unsigned>
- MinPrefetchStride("min-prefetch-stride",
- cl::desc("Min stride to add prefetches"), cl::Hidden);
-
-static cl::opt<unsigned> MaxPrefetchIterationsAhead(
- "max-prefetch-iters-ahead",
- cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
-
-STATISTIC(NumPrefetches, "Number of prefetches inserted");
-
-namespace {
-
-/// Loop prefetch implementation class.
-class LoopDataPrefetch {
-public:
- LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE)
- : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
-
- bool run();
-
-private:
- bool runOnLoop(Loop *L);
-
- /// Check if the stride of the accesses is large enough to
- /// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
-
- unsigned getMinPrefetchStride(unsigned NumMemAccesses,
- unsigned NumStridedMemAccesses,
- unsigned NumPrefetches,
- bool HasCall) {
- if (MinPrefetchStride.getNumOccurrences() > 0)
- return MinPrefetchStride;
- return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
- NumPrefetches, HasCall);
- }
-
- unsigned getPrefetchDistance() {
- if (PrefetchDistance.getNumOccurrences() > 0)
- return PrefetchDistance;
- return TTI->getPrefetchDistance();
- }
-
- unsigned getMaxPrefetchIterationsAhead() {
- if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
- return MaxPrefetchIterationsAhead;
- return TTI->getMaxPrefetchIterationsAhead();
- }
-
- bool doPrefetchWrites() {
- if (PrefetchWrites.getNumOccurrences() > 0)
- return PrefetchWrites;
- return TTI->enableWritePrefetching();
- }
-
- AssumptionCache *AC;
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
- const TargetTransformInfo *TTI;
- OptimizationRemarkEmitter *ORE;
-};
-
-/// Legacy class for inserting loop data prefetches.
-class LoopDataPrefetchLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
- initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
- };
-}
-
-char LoopDataPrefetchLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
- "Loop Data Prefetch", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
- "Loop Data Prefetch", false, false)
-
-FunctionPass *llvm::createLoopDataPrefetchPass() {
- return new LoopDataPrefetchLegacyPass();
-}
-
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
- unsigned TargetMinStride) {
- // No need to check if any stride goes.
- if (TargetMinStride <= 1)
- return true;
-
- const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
- // If MinStride is set, don't prefetch unless we can ensure that stride is
- // larger.
- if (!ConstStride)
- return false;
-
- unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
- return TargetMinStride <= AbsStride;
-}
-
-PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
- ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- OptimizationRemarkEmitter *ORE =
- &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
- bool Changed = LDP.run();
-
- if (Changed) {
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- OptimizationRemarkEmitter *ORE =
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
- return LDP.run();
-}
-
-bool LoopDataPrefetch::run() {
- // If PrefetchDistance is not set, don't run the pass. This gives an
- // opportunity for targets to run this pass for selected subtargets only
- // (whose TTI sets PrefetchDistance).
- if (getPrefetchDistance() == 0)
- return false;
- assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
-
- bool MadeChange = false;
-
- for (Loop *I : *LI)
- for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
- MadeChange |= runOnLoop(*L);
-
- return MadeChange;
-}
-
-/// A record for a potential prefetch made during the initial scan of the
-/// loop. This is used to let a single prefetch target multiple memory accesses.
-struct Prefetch {
- /// The address formula for this prefetch as returned by ScalarEvolution.
- const SCEVAddRecExpr *LSCEVAddRec;
- /// The point of insertion for the prefetch instruction.
- Instruction *InsertPt;
- /// True if targeting a write memory access.
- bool Writes;
- /// The (first seen) prefetched instruction.
- Instruction *MemI;
-
- /// Constructor to create a new Prefetch for \p I.
- Prefetch(const SCEVAddRecExpr *L, Instruction *I)
- : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
- addInstruction(I);
- };
-
- /// Add the instruction \param I to this prefetch. If it's not the first
- /// one, 'InsertPt' and 'Writes' will be updated as required.
- /// \param PtrDiff the known constant address difference to the first added
- /// instruction.
- void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
- int64_t PtrDiff = 0) {
- if (!InsertPt) {
- MemI = I;
- InsertPt = I;
- Writes = isa<StoreInst>(I);
- } else {
- BasicBlock *PrefBB = InsertPt->getParent();
- BasicBlock *InsBB = I->getParent();
- if (PrefBB != InsBB) {
- BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
- if (DomBB != PrefBB)
- InsertPt = DomBB->getTerminator();
- }
-
- if (isa<StoreInst>(I) && PtrDiff == 0)
- Writes = true;
- }
- }
-};
-
-bool LoopDataPrefetch::runOnLoop(Loop *L) {
- bool MadeChange = false;
-
- // Only prefetch in the inner-most loop
+//===-------- LoopDataPrefetch.cpp - Loop Data Prefetching Pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a Loop Data Prefetching Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
+#include "llvm/InitializePasses.h"
+
+#define DEBUG_TYPE "loop-data-prefetch"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+// By default, we limit this to creating 16 PHIs (which is a little over half
+// of the allocatable register set).
+static cl::opt<bool>
+PrefetchWrites("loop-prefetch-writes", cl::Hidden, cl::init(false),
+ cl::desc("Prefetch write addresses"));
+
+static cl::opt<unsigned>
+ PrefetchDistance("prefetch-distance",
+ cl::desc("Number of instructions to prefetch ahead"),
+ cl::Hidden);
+
+static cl::opt<unsigned>
+ MinPrefetchStride("min-prefetch-stride",
+ cl::desc("Min stride to add prefetches"), cl::Hidden);
+
+static cl::opt<unsigned> MaxPrefetchIterationsAhead(
+ "max-prefetch-iters-ahead",
+ cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden);
+
+STATISTIC(NumPrefetches, "Number of prefetches inserted");
+
+namespace {
+
+/// Loop prefetch implementation class.
+class LoopDataPrefetch {
+public:
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE)
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+
+ bool run();
+
+private:
+ bool runOnLoop(Loop *L);
+
+ /// Check if the stride of the accesses is large enough to
+ /// warrant a prefetch.
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
+
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
+ if (MinPrefetchStride.getNumOccurrences() > 0)
+ return MinPrefetchStride;
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
+ }
+
+ unsigned getPrefetchDistance() {
+ if (PrefetchDistance.getNumOccurrences() > 0)
+ return PrefetchDistance;
+ return TTI->getPrefetchDistance();
+ }
+
+ unsigned getMaxPrefetchIterationsAhead() {
+ if (MaxPrefetchIterationsAhead.getNumOccurrences() > 0)
+ return MaxPrefetchIterationsAhead;
+ return TTI->getMaxPrefetchIterationsAhead();
+ }
+
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ const TargetTransformInfo *TTI;
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// Legacy class for inserting loop data prefetches.
+class LoopDataPrefetchLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDataPrefetchLegacyPass() : FunctionPass(ID) {
+ initializeLoopDataPrefetchLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LoopDataPrefetchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(LoopDataPrefetchLegacyPass, "loop-data-prefetch",
+ "Loop Data Prefetch", false, false)
+
+FunctionPass *llvm::createLoopDataPrefetchPass() {
+ return new LoopDataPrefetchLegacyPass();
+}
+
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
+ // No need to check; any stride satisfies the minimum.
+ if (TargetMinStride <= 1)
+ return true;
+
+ const auto *ConstStride = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE));
+ // If MinStride is set, don't prefetch unless we can ensure that stride is
+ // larger.
+ if (!ConstStride)
+ return false;
+
+ unsigned AbsStride = std::abs(ConstStride->getAPInt().getSExtValue());
+ return TargetMinStride <= AbsStride;
+}
+
+PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ OptimizationRemarkEmitter *ORE =
+ &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
+ bool Changed = LDP.run();
+
+ if (Changed) {
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ OptimizationRemarkEmitter *ORE =
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
+ return LDP.run();
+}
+
+bool LoopDataPrefetch::run() {
+ // If PrefetchDistance is not set, don't run the pass. This gives an
+ // opportunity for targets to run this pass for selected subtargets only
+ // (whose TTI sets PrefetchDistance).
+ if (getPrefetchDistance() == 0)
+ return false;
+ assert(TTI->getCacheLineSize() && "Cache line size is not set for target");
+
+ bool MadeChange = false;
+
+ for (Loop *I : *LI)
+ for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+ MadeChange |= runOnLoop(*L);
+
+ return MadeChange;
+}
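// Worked example (illustrative numbers, not from this source): if the target,
// or the -prefetch-distance flag, reports a prefetch distance of 300
// instructions and runOnLoop() below measures a 25-instruction loop body, the
// pass prefetches ItersAhead = 300 / 25 = 12 iterations ahead, provided 12
// does not exceed getMaxPrefetchIterationsAhead() and any known constant max
// trip count is at least ItersAhead + 1 = 13.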
+
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+ /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+ };
+
+ /// Add the instruction \param I to this prefetch. If it's not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
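// Illustrative example (assumed 64-byte cache line and 8-byte elements, not
// from this source): the two loads below are a constant 32 bytes apart, which
// is less than the cache-line size, so runOnLoop() folds the second access
// into the first access's Prefetch record via addInstruction() instead of
// emitting a second prefetch; had the accesses been in different blocks,
// InsertPt would move to their nearest common dominator.
//
//   for (long I = 0; I < N; ++I)
//     Sum += A[I] + A[I + 4];   // one llvm.prefetch covers both loads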
+
+bool LoopDataPrefetch::runOnLoop(Loop *L) {
+ bool MadeChange = false;
+
+ // Only prefetch in the inner-most loop
if (!L->isInnermost())
- return MadeChange;
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- // Calculate the number of iterations ahead to prefetch
- CodeMetrics Metrics;
- bool HasCall = false;
- for (const auto BB : L->blocks()) {
- // If the loop already has prefetches, then assume that the user knows
- // what they are doing and don't add any more.
- for (auto &I : *BB) {
- if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
- if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
- if (F->getIntrinsicID() == Intrinsic::prefetch)
- return MadeChange;
- if (TTI->isLoweredToCall(F))
- HasCall = true;
- } else { // indirect call.
- HasCall = true;
- }
- }
- }
- Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
- }
- unsigned LoopSize = Metrics.NumInsts;
- if (!LoopSize)
- LoopSize = 1;
-
- unsigned ItersAhead = getPrefetchDistance() / LoopSize;
- if (!ItersAhead)
- ItersAhead = 1;
-
- if (ItersAhead > getMaxPrefetchIterationsAhead())
- return MadeChange;
-
- unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
- if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
- return MadeChange;
-
- unsigned NumMemAccesses = 0;
- unsigned NumStridedMemAccesses = 0;
- SmallVector<Prefetch, 16> Prefetches;
- for (const auto BB : L->blocks())
- for (auto &I : *BB) {
- Value *PtrValue;
- Instruction *MemI;
-
- if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
- MemI = LMemI;
- PtrValue = LMemI->getPointerOperand();
- } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!doPrefetchWrites()) continue;
- MemI = SMemI;
- PtrValue = SMemI->getPointerOperand();
- } else continue;
-
- unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
- if (PtrAddrSpace)
- continue;
- NumMemAccesses++;
- if (L->isLoopInvariant(PtrValue))
- continue;
-
- const SCEV *LSCEV = SE->getSCEV(PtrValue);
- const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
- if (!LSCEVAddRec)
- continue;
- NumStridedMemAccesses++;
-
- // We don't want to double prefetch individual cache lines. If this
- // access is known to be within one cache line of some other one that
- // has already been prefetched, then don't prefetch this one as well.
- bool DupPref = false;
- for (auto &Pref : Prefetches) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
- if (const SCEVConstant *ConstPtrDiff =
- dyn_cast<SCEVConstant>(PtrDiff)) {
- int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
- if (PD < (int64_t) TTI->getCacheLineSize()) {
- Pref.addInstruction(MemI, DT, PD);
- DupPref = true;
- break;
- }
- }
- }
- if (!DupPref)
- Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
- }
-
- unsigned TargetMinStride =
- getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
- Prefetches.size(), HasCall);
-
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
- LLVM_DEBUG(dbgs() << "Loop has: "
- << NumMemAccesses << " memory accesses, "
- << NumStridedMemAccesses << " strided memory accesses, "
- << Prefetches.size() << " potential prefetch(es), "
- << "a minimum stride of " << TargetMinStride << ", "
- << (HasCall ? "calls" : "no calls") << ".\n");
-
- for (auto &P : Prefetches) {
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
- continue;
-
- const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
- P.LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
-
- BasicBlock *BB = P.InsertPt->getParent();
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
- SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
-
- IRBuilder<> Builder(P.InsertPt);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, P.Writes),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: "
- << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
- << ", SCEV: " << *P.LSCEVAddRec << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
- << "prefetched memory access";
- });
-
- MadeChange = true;
- }
-
- return MadeChange;
-}
+ return MadeChange;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // Calculate the number of iterations ahead to prefetch
+ CodeMetrics Metrics;
+ bool HasCall = false;
+ for (const auto BB : L->blocks()) {
+ // If the loop already has prefetches, then assume that the user knows
+ // what they are doing and don't add any more.
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
+ if (F->getIntrinsicID() == Intrinsic::prefetch)
+ return MadeChange;
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
+ Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+ }
+ unsigned LoopSize = Metrics.NumInsts;
+ if (!LoopSize)
+ LoopSize = 1;
+
+ unsigned ItersAhead = getPrefetchDistance() / LoopSize;
+ if (!ItersAhead)
+ ItersAhead = 1;
+
+ if (ItersAhead > getMaxPrefetchIterationsAhead())
+ return MadeChange;
+
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
+
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
+ for (auto &I : *BB) {
+ Value *PtrValue;
+ Instruction *MemI;
+
+ if (LoadInst *LMemI = dyn_cast<LoadInst>(&I)) {
+ MemI = LMemI;
+ PtrValue = LMemI->getPointerOperand();
+ } else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
+ if (!doPrefetchWrites()) continue;
+ MemI = SMemI;
+ PtrValue = SMemI->getPointerOperand();
+ } else continue;
+
+ unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
+ if (PtrAddrSpace)
+ continue;
+ NumMemAccesses++;
+ if (L->isLoopInvariant(PtrValue))
+ continue;
+
+ const SCEV *LSCEV = SE->getSCEV(PtrValue);
+ const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+ if (!LSCEVAddRec)
+ continue;
+ NumStridedMemAccesses++;
+
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
+ bool DupPref = false;
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
+ if (const SCEVConstant *ConstPtrDiff =
+ dyn_cast<SCEVConstant>(PtrDiff)) {
+ int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
+ if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
+ DupPref = true;
+ break;
+ }
+ }
+ }
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
+
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
+
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
+ });
+
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
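The hunk above re-adds the whole pass: runOnLoop() sizes the loop body with CodeMetrics, computes ItersAhead as the target prefetch distance divided by that size, folds candidates whose constant address difference to an existing candidate is smaller than the cache line, and emits one llvm.prefetch call per surviving candidate with the rw flag taken from P.Writes, locality 3, and cache type 1. Below is a minimal sketch, assuming the standard LLVM 12 PassBuilder boilerplate, of how the new-PM LoopDataPrefetchPass defined above could be driven over a module; without a TargetMachine-backed TTI the pass is a no-op unless -prefetch-distance is set.

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LoopDataPrefetch.h"
using namespace llvm;

void runLoopDataPrefetch(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the analyses the pass queries (DominatorTree, LoopInfo, SCEV,
  // AssumptionCache, ORE, TTI) and wire the managers together.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(LoopDataPrefetchPass());   // the pass defined above
  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);                       // may insert llvm.prefetch calls
}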
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
index 59873b0352..1266c93316 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -1,44 +1,44 @@
-//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Dead Loop Deletion Pass. This pass is responsible
-// for eliminating loops with non-infinite computable trip counts that have no
-// side effects or volatile instructions, and do not contribute to the
-// computation of the function's return value.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDeletion.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-delete"
-
-STATISTIC(NumDeleted, "Number of loops deleted");
-
-enum class LoopDeletionResult {
- Unmodified,
- Modified,
- Deleted,
-};
-
+//===- LoopDeletion.cpp - Dead Loop Deletion Pass ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dead Loop Deletion Pass. This pass is responsible
+// for eliminating loops with non-infinite computable trip counts that have no
+// side effects or volatile instructions, and do not contribute to the
+// computation of the function's return value.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDeletion.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-delete"
+
+STATISTIC(NumDeleted, "Number of loops deleted");
+
+enum class LoopDeletionResult {
+ Unmodified,
+ Modified,
+ Deleted,
+};
+
static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) {
if (A == LoopDeletionResult::Deleted || B == LoopDeletionResult::Deleted)
return LoopDeletionResult::Deleted;
@@ -47,25 +47,25 @@ static LoopDeletionResult merge(LoopDeletionResult A, LoopDeletionResult B) {
return LoopDeletionResult::Unmodified;
}
-/// Determines if a loop is dead.
-///
-/// This assumes that we've already checked for unique exit and exiting blocks,
-/// and that the code is in LCSSA form.
-static bool isLoopDead(Loop *L, ScalarEvolution &SE,
- SmallVectorImpl<BasicBlock *> &ExitingBlocks,
- BasicBlock *ExitBlock, bool &Changed,
- BasicBlock *Preheader) {
- // Make sure that all PHI entries coming from the loop are loop invariant.
- // Because the code is in LCSSA form, any values used outside of the loop
- // must pass through a PHI in the exit block, meaning that this check is
- // sufficient to guarantee that no loop-variant values are used outside
- // of the loop.
- bool AllEntriesInvariant = true;
- bool AllOutgoingValuesSame = true;
+/// Determines if a loop is dead.
+///
+/// This assumes that we've already checked for unique exit and exiting blocks,
+/// and that the code is in LCSSA form.
+static bool isLoopDead(Loop *L, ScalarEvolution &SE,
+ SmallVectorImpl<BasicBlock *> &ExitingBlocks,
+ BasicBlock *ExitBlock, bool &Changed,
+ BasicBlock *Preheader) {
+ // Make sure that all PHI entries coming from the loop are loop invariant.
+ // Because the code is in LCSSA form, any values used outside of the loop
+ // must pass through a PHI in the exit block, meaning that this check is
+ // sufficient to guarantee that no loop-variant values are used outside
+ // of the loop.
+ bool AllEntriesInvariant = true;
+ bool AllOutgoingValuesSame = true;
if (!L->hasNoExitBlocks()) {
for (PHINode &P : ExitBlock->phis()) {
Value *incoming = P.getIncomingValueForBlock(ExitingBlocks[0]);
-
+
// Make sure all exiting blocks produce the same incoming value for the
// block. If there are different incoming values for different exiting
// blocks, then it is impossible to statically determine which value
@@ -74,67 +74,67 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
all_of(makeArrayRef(ExitingBlocks).slice(1), [&](BasicBlock *BB) {
return incoming == P.getIncomingValueForBlock(BB);
});
-
+
if (!AllOutgoingValuesSame)
break;
-
+
if (Instruction *I = dyn_cast<Instruction>(incoming))
if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) {
AllEntriesInvariant = false;
break;
}
}
- }
-
- if (Changed)
- SE.forgetLoopDispositions(L);
-
- if (!AllEntriesInvariant || !AllOutgoingValuesSame)
- return false;
-
- // Make sure that no instructions in the block have potential side-effects.
- // This includes instructions that could write to memory, and loads that are
- // marked volatile.
- for (auto &I : L->blocks())
+ }
+
+ if (Changed)
+ SE.forgetLoopDispositions(L);
+
+ if (!AllEntriesInvariant || !AllOutgoingValuesSame)
+ return false;
+
+ // Make sure that no instructions in the block have potential side-effects.
+ // This includes instructions that could write to memory, and loads that are
+ // marked volatile.
+ for (auto &I : L->blocks())
if (any_of(*I, [](Instruction &I) {
return I.mayHaveSideEffects() && !I.isDroppable();
}))
- return false;
- return true;
-}
-
-/// This function returns true if there is no viable path from the
-/// entry block to the header of \p L. Right now, it only does
-/// a local search to save compile time.
-static bool isLoopNeverExecuted(Loop *L) {
- using namespace PatternMatch;
-
- auto *Preheader = L->getLoopPreheader();
- // TODO: We can relax this constraint, since we just need a loop
- // predecessor.
- assert(Preheader && "Needs preheader!");
-
- if (Preheader == &Preheader->getParent()->getEntryBlock())
- return false;
- // All predecessors of the preheader should have a constant conditional
- // branch, with the loop's preheader as not-taken.
- for (auto *Pred: predecessors(Preheader)) {
- BasicBlock *Taken, *NotTaken;
- ConstantInt *Cond;
- if (!match(Pred->getTerminator(),
- m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
- return false;
- if (!Cond->getZExtValue())
- std::swap(Taken, NotTaken);
- if (Taken == Preheader)
- return false;
- }
- assert(!pred_empty(Preheader) &&
- "Preheader should have predecessors at this point!");
- // All the predecessors have the loop preheader as not-taken target.
- return true;
-}
-
+ return false;
+ return true;
+}
+
+/// This function returns true if there is no viable path from the
+/// entry block to the header of \p L. Right now, it only does
+/// a local search to save compile time.
+static bool isLoopNeverExecuted(Loop *L) {
+ using namespace PatternMatch;
+
+ auto *Preheader = L->getLoopPreheader();
+ // TODO: We can relax this constraint, since we just need a loop
+ // predecessor.
+ assert(Preheader && "Needs preheader!");
+
+ if (Preheader == &Preheader->getParent()->getEntryBlock())
+ return false;
+ // All predecessors of the preheader should have a constant conditional
+ // branch, with the loop's preheader as not-taken.
+ for (auto *Pred: predecessors(Preheader)) {
+ BasicBlock *Taken, *NotTaken;
+ ConstantInt *Cond;
+ if (!match(Pred->getTerminator(),
+ m_Br(m_ConstantInt(Cond), Taken, NotTaken)))
+ return false;
+ if (!Cond->getZExtValue())
+ std::swap(Taken, NotTaken);
+ if (Taken == Preheader)
+ return false;
+ }
+ assert(!pred_empty(Preheader) &&
+ "Preheader should have predecessors at this point!");
+ // All the predecessors have the loop preheader as not-taken target.
+ return true;
+}
+
/// If we can prove the backedge is untaken, remove it. This destroys the
/// loop, but leaves the (now trivially loop invariant) control flow and
/// side effects (if any) in place.
@@ -155,116 +155,116 @@ breakBackedgeIfNotTaken(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
return LoopDeletionResult::Deleted;
}
-/// Remove a loop if it is dead.
-///
+/// Remove a loop if it is dead.
+///
/// A loop is considered dead either if it does not impact the observable
/// behavior of the program other than finite running time, or if it is
/// required to make progress by an attribute such as 'mustprogress' or
/// 'llvm.loop.mustprogress' and does not make any. This may remove
/// infinite loops that have been required to make progress.
-///
-/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
-/// order to make various safety checks work.
-///
-/// \returns true if any changes were made. This may mutate the loop even if it
-/// is unable to delete it due to hoisting trivially loop invariant
-/// instructions out of the loop.
-static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
- ScalarEvolution &SE, LoopInfo &LI,
- MemorySSA *MSSA,
- OptimizationRemarkEmitter &ORE) {
- assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
-
- // We can only remove the loop if there is a preheader that we can branch from
- // after removing it. Also, if LoopSimplify form is not available, stay out
- // of trouble.
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader || !L->hasDedicatedExits()) {
- LLVM_DEBUG(
- dbgs()
- << "Deletion requires Loop with preheader and dedicated exits.\n");
- return LoopDeletionResult::Unmodified;
- }
-
- BasicBlock *ExitBlock = L->getUniqueExitBlock();
-
- if (ExitBlock && isLoopNeverExecuted(L)) {
- LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
+///
+/// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
+/// order to make various safety checks work.
+///
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
+static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
+ ScalarEvolution &SE, LoopInfo &LI,
+ MemorySSA *MSSA,
+ OptimizationRemarkEmitter &ORE) {
+ assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
+
+ // We can only remove the loop if there is a preheader that we can branch from
+ // after removing it. Also, if LoopSimplify form is not available, stay out
+ // of trouble.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader || !L->hasDedicatedExits()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Deletion requires Loop with preheader and dedicated exits.\n");
+ return LoopDeletionResult::Unmodified;
+ }
+
+ BasicBlock *ExitBlock = L->getUniqueExitBlock();
+
+ if (ExitBlock && isLoopNeverExecuted(L)) {
+ LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!");
// We need to forget the loop before setting the incoming values of the exit
// phis to undef, so we properly invalidate the SCEV expressions for those
// phis.
SE.forgetLoop(L);
- // Set incoming value to undef for phi nodes in the exit block.
- for (PHINode &P : ExitBlock->phis()) {
- std::fill(P.incoming_values().begin(), P.incoming_values().end(),
- UndefValue::get(P.getType()));
- }
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(),
- L->getHeader())
- << "Loop deleted because it never executes";
- });
- deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
- ++NumDeleted;
- return LoopDeletionResult::Deleted;
- }
-
- // The remaining checks below are for a loop being dead because all statements
- // in the loop are invariant.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
+ // Set incoming value to undef for phi nodes in the exit block.
+ for (PHINode &P : ExitBlock->phis()) {
+ std::fill(P.incoming_values().begin(), P.incoming_values().end(),
+ UndefValue::get(P.getType()));
+ }
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it never executes";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
+ ++NumDeleted;
+ return LoopDeletionResult::Deleted;
+ }
+
+ // The remaining checks below are for a loop being dead because all statements
+ // in the loop are invariant.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
// We require that the loop has at most one exit block. Otherwise, we'd be in
// the situation of needing to be able to solve statically which exit block
// will be branched to, or trying to preserve the branching logic in a loop
// invariant manner.
if (!ExitBlock && !L->hasNoExitBlocks()) {
LLVM_DEBUG(dbgs() << "Deletion requires at most one exit block.\n");
- return LoopDeletionResult::Unmodified;
- }
- // Finally, we have to check that the loop really is dead.
- bool Changed = false;
- if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
- LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
- return Changed ? LoopDeletionResult::Modified
- : LoopDeletionResult::Unmodified;
- }
-
+ return LoopDeletionResult::Unmodified;
+ }
+ // Finally, we have to check that the loop really is dead.
+ bool Changed = false;
+ if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader)) {
+ LLVM_DEBUG(dbgs() << "Loop is not invariant, cannot delete.\n");
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
// Don't remove loops for which we can't solve the trip count unless the loop
// was required to make progress but has been determined to be dead.
- const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
+ const SCEV *S = SE.getConstantMaxBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(S) &&
!L->getHeader()->getParent()->mustProgress() && !hasMustProgress(L)) {
LLVM_DEBUG(dbgs() << "Could not compute SCEV MaxBackedgeTakenCount and was "
"not required to make progress.\n");
- return Changed ? LoopDeletionResult::Modified
- : LoopDeletionResult::Unmodified;
- }
-
- LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Invariant", L->getStartLoc(),
- L->getHeader())
- << "Loop deleted because it is invariant";
- });
- deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
- ++NumDeleted;
-
- return LoopDeletionResult::Deleted;
-}
-
-PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &Updater) {
-
- LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- LLVM_DEBUG(L.dump());
- std::string LoopName = std::string(L.getName());
- // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
- auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
+ return Changed ? LoopDeletionResult::Modified
+ : LoopDeletionResult::Unmodified;
+ }
+
+ LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Invariant", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it is invariant";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
+ ++NumDeleted;
+
+ return LoopDeletionResult::Deleted;
+}
+
+PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L.dump());
+ std::string LoopName = std::string(L.getName());
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+ auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
// If we can prove the backedge isn't taken, just break it and be done. This
// leaves the loop structure in place which means it can handle dispatching
@@ -273,73 +273,73 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
Result = merge(Result, breakBackedgeIfNotTaken(&L, AR.DT, AR.SE, AR.LI,
AR.MSSA, ORE));
- if (Result == LoopDeletionResult::Unmodified)
- return PreservedAnalyses::all();
-
- if (Result == LoopDeletionResult::Deleted)
- Updater.markLoopAsDeleted(L, LoopName);
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-class LoopDeletionLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopDeletionLegacyPass() : LoopPass(ID) {
- initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // Possibly eliminate loop L if it is dead.
- bool runOnLoop(Loop *L, LPPassManager &) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-}
-
-char LoopDeletionLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
- "Delete dead loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
- "Delete dead loops", false, false)
-
-Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
-
-bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
-
- LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
- LLVM_DEBUG(L->dump());
-
- LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
-
+ if (Result == LoopDeletionResult::Unmodified)
+ return PreservedAnalyses::all();
+
+ if (Result == LoopDeletionResult::Deleted)
+ Updater.markLoopAsDeleted(L, LoopName);
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+class LoopDeletionLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopDeletionLegacyPass() : LoopPass(ID) {
+ initializeLoopDeletionLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Possibly eliminate loop L if it is dead.
+ bool runOnLoop(Loop *L, LPPassManager &) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+}
+
+char LoopDeletionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopDeletionLegacyPass, "loop-deletion",
+ "Delete dead loops", false, false)
+
+Pass *llvm::createLoopDeletionPass() { return new LoopDeletionLegacyPass(); }
+
+bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+
+ LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
+ LLVM_DEBUG(L->dump());
+
+ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
+
// If we can prove the backedge isn't taken, just break it and be done. This
// leaves the loop structure in place which means it can handle dispatching
// to the right exit based on whatever loop invariant structure remains.
if (Result != LoopDeletionResult::Deleted)
Result = merge(Result, breakBackedgeIfNotTaken(L, DT, SE, LI, MSSA, ORE));
- if (Result == LoopDeletionResult::Deleted)
- LPM.markLoopAsDeleted(*L);
-
- return Result != LoopDeletionResult::Unmodified;
-}
+ if (Result == LoopDeletionResult::Deleted)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopDeletionResult::Unmodified;
+}
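Taken together, deleteLoopIfDead() above only fires when the loop has a preheader and dedicated exits, at most one exit block, exit PHIs whose incoming values are loop-invariant and identical across exiting blocks, no side-effecting instructions, and either a computable max backedge-taken count or a must-progress function/loop. A hedged source-level sketch of a loop that fits that shape (other passes in a full -O2 pipeline may well remove it first):

// Illustrative sketch, not from this source: the loop has a computable trip
// count, no side effects, and nothing it computes reaches the return value,
// so the loop-deletion pass can drop it entirely.
static int ignoresItsLoop(int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I)
    Sum += I;   // Sum never escapes and is not returned.
  (void)Sum;    // Silence unused-variable warnings; emits no IR.
  return 0;     // The loop does not affect the observable result.
}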
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
index d4b83c0fc3..1bd2529891 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -1,1088 +1,1088 @@
-//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Loop Distribution Pass. Its main focus is to
-// distribute loops that cannot be vectorized due to dependence cycles. It
-// tries to isolate the offending dependences into a new loop allowing
-// vectorization of the remaining parts.
-//
-// For dependence analysis, the pass uses the LoopVectorizer's
-// LoopAccessAnalysis. Because this analysis presumes no change in the order of
-// memory operations, special care is taken to preserve the lexical order of
-// these operations.
-//
-// Similarly to the Vectorizer, the pass also supports loop versioning to
-// run-time disambiguate potentially overlapping arrays.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopDistribute.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <cassert>
-#include <functional>
-#include <list>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LDIST_NAME "loop-distribute"
-#define DEBUG_TYPE LDIST_NAME
-
-/// @{
-/// Metadata attribute names
-static const char *const LLVMLoopDistributeFollowupAll =
- "llvm.loop.distribute.followup_all";
-static const char *const LLVMLoopDistributeFollowupCoincident =
- "llvm.loop.distribute.followup_coincident";
-static const char *const LLVMLoopDistributeFollowupSequential =
- "llvm.loop.distribute.followup_sequential";
-static const char *const LLVMLoopDistributeFollowupFallback =
- "llvm.loop.distribute.followup_fallback";
-/// @}
-
-static cl::opt<bool>
- LDistVerify("loop-distribute-verify", cl::Hidden,
- cl::desc("Turn on DominatorTree and LoopInfo verification "
- "after Loop Distribution"),
- cl::init(false));
-
-static cl::opt<bool> DistributeNonIfConvertible(
- "loop-distribute-non-if-convertible", cl::Hidden,
- cl::desc("Whether to distribute into a loop that may not be "
- "if-convertible by the loop vectorizer"),
- cl::init(false));
-
-static cl::opt<unsigned> DistributeSCEVCheckThreshold(
- "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed for Loop "
- "Distribution"));
-
-static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
- "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
- cl::Hidden,
- cl::desc(
- "The maximum number of SCEV checks allowed for Loop "
- "Distribution for loop marked with #pragma loop distribute(enable)"));
-
-static cl::opt<bool> EnableLoopDistribute(
- "enable-loop-distribute", cl::Hidden,
- cl::desc("Enable the new, experimental LoopDistribution Pass"),
- cl::init(false));
-
-STATISTIC(NumLoopsDistributed, "Number of loops distributed");
-
-namespace {
-
-/// Maintains the set of instructions of the loop for a partition before
-/// cloning. After cloning, it hosts the new loop.
-class InstPartition {
- using InstructionSet = SmallPtrSet<Instruction *, 8>;
-
-public:
- InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
- : DepCycle(DepCycle), OrigLoop(L) {
- Set.insert(I);
- }
-
- /// Returns whether this partition contains a dependence cycle.
- bool hasDepCycle() const { return DepCycle; }
-
- /// Adds an instruction to this partition.
- void add(Instruction *I) { Set.insert(I); }
-
- /// Collection accessors.
- InstructionSet::iterator begin() { return Set.begin(); }
- InstructionSet::iterator end() { return Set.end(); }
- InstructionSet::const_iterator begin() const { return Set.begin(); }
- InstructionSet::const_iterator end() const { return Set.end(); }
- bool empty() const { return Set.empty(); }
-
- /// Moves this partition into \p Other. This partition becomes empty
- /// after this.
- void moveTo(InstPartition &Other) {
- Other.Set.insert(Set.begin(), Set.end());
- Set.clear();
- Other.DepCycle |= DepCycle;
- }
-
- /// Populates the partition with a transitive closure of all the
- /// instructions that the seeded instructions depend on.
- void populateUsedSet() {
- // FIXME: We currently don't use control-dependence but simply include all
- // blocks (possibly empty at the end) and let simplifycfg mostly clean this
- // up.
- for (auto *B : OrigLoop->getBlocks())
- Set.insert(B->getTerminator());
-
- // Follow the use-def chains to form a transitive closure of all the
- // instructions that the originally seeded instructions depend on.
- SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
- // Insert instructions from the loop that we depend on.
- for (Value *V : I->operand_values()) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
- Worklist.push_back(I);
- }
- }
- }
-
- /// Clones the original loop.
- ///
- /// Updates LoopInfo and DominatorTree using the information that block \p
- /// LoopDomBB dominates the loop.
- Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
- unsigned Index, LoopInfo *LI,
- DominatorTree *DT) {
- ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
- VMap, Twine(".ldist") + Twine(Index),
- LI, DT, ClonedLoopBlocks);
- return ClonedLoop;
- }
-
- /// The cloned loop. If this partition is mapped to the original loop,
- /// this is null.
- const Loop *getClonedLoop() const { return ClonedLoop; }
-
- /// Returns the loop where this partition ends up after distribution.
- /// If this partition is mapped to the original loop then use the block from
- /// the loop.
- Loop *getDistributedLoop() const {
- return ClonedLoop ? ClonedLoop : OrigLoop;
- }
-
- /// The VMap that is populated by cloning and then used in
- /// remapinstruction to remap the cloned instructions.
- ValueToValueMapTy &getVMap() { return VMap; }
-
- /// Remaps the cloned instructions using VMap.
- void remapInstructions() {
- remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
- }
-
- /// Based on the set of instructions selected for this partition,
- /// removes the unnecessary ones.
- void removeUnusedInsts() {
- SmallVector<Instruction *, 8> Unused;
-
- for (auto *Block : OrigLoop->getBlocks())
- for (auto &Inst : *Block)
- if (!Set.count(&Inst)) {
- Instruction *NewInst = &Inst;
- if (!VMap.empty())
- NewInst = cast<Instruction>(VMap[NewInst]);
-
- assert(!isa<BranchInst>(NewInst) &&
- "Branches are marked used early on");
- Unused.push_back(NewInst);
- }
-
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- for (auto *Inst : reverse(Unused)) {
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- Inst->eraseFromParent();
- }
- }
-
- void print() const {
- if (DepCycle)
- dbgs() << " (cycle)\n";
- for (auto *I : Set)
- // Prefix with the block name.
- dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
- }
-
- void printBlocks() const {
- for (auto *BB : getDistributedLoop()->getBlocks())
- dbgs() << *BB;
- }
-
-private:
- /// Instructions from OrigLoop selected for this partition.
- InstructionSet Set;
-
- /// Whether this partition contains a dependence cycle.
- bool DepCycle;
-
- /// The original loop.
- Loop *OrigLoop;
-
- /// The cloned loop. If this partition is mapped to the original loop,
- /// this is null.
- Loop *ClonedLoop = nullptr;
-
- /// The blocks of ClonedLoop including the preheader. If this
- /// partition is mapped to the original loop, this is empty.
- SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
-
- /// This gets populated once the set of instructions has been
- /// finalized. If this partition is mapped to the original loop, it is not
- /// set.
- ValueToValueMapTy VMap;
-};
-
-/// Holds the set of Partitions. It populates them, merges them and then
-/// clones the loops.
-class InstPartitionContainer {
- using InstToPartitionIdT = DenseMap<Instruction *, int>;
-
-public:
- InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
- : L(L), LI(LI), DT(DT) {}
-
- /// Returns the number of partitions.
- unsigned getSize() const { return PartitionContainer.size(); }
-
- /// Adds \p Inst into the current partition if that is marked to
- /// contain cycles. Otherwise start a new partition for it.
- void addToCyclicPartition(Instruction *Inst) {
- // If the current partition is non-cyclic. Start a new one.
- if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
- PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
- else
- PartitionContainer.back().add(Inst);
- }
-
- /// Adds \p Inst into a partition that is not marked to contain
- /// dependence cycles.
- ///
- // Initially we isolate memory instructions into as many partitions as
- // possible, then later we may merge them back together.
- void addToNewNonCyclicPartition(Instruction *Inst) {
- PartitionContainer.emplace_back(Inst, L);
- }
-
- /// Merges adjacent non-cyclic partitions.
- ///
- /// The idea is that we currently only want to isolate the non-vectorizable
- /// partition. We could later allow more distribution among these partition
- /// too.
- void mergeAdjacentNonCyclic() {
- mergeAdjacentPartitionsIf(
- [](const InstPartition *P) { return !P->hasDepCycle(); });
- }
-
- /// If a partition contains only conditional stores, we won't vectorize
- /// it. Try to merge it with a previous cyclic partition.
- void mergeNonIfConvertible() {
- mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
- if (Partition->hasDepCycle())
- return true;
-
- // Now, check if all stores are conditional in this partition.
- bool seenStore = false;
-
- for (auto *Inst : *Partition)
- if (isa<StoreInst>(Inst)) {
- seenStore = true;
- if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
- return false;
- }
- return seenStore;
- });
- }
-
- /// Merges the partitions according to various heuristics.
- void mergeBeforePopulating() {
- mergeAdjacentNonCyclic();
- if (!DistributeNonIfConvertible)
- mergeNonIfConvertible();
- }
-
- /// Merges partitions in order to ensure that no loads are duplicated.
- ///
- /// We can't duplicate loads because that could potentially reorder them.
- /// LoopAccessAnalysis provides dependency information with the context that
- /// the order of memory operation is preserved.
- ///
- /// Return if any partitions were merged.
- bool mergeToAvoidDuplicatedLoads() {
- using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
- using ToBeMergedT = EquivalenceClasses<InstPartition *>;
-
- LoadToPartitionT LoadToPartition;
- ToBeMergedT ToBeMerged;
-
- // Step through the partitions and create equivalence between partitions
- // that contain the same load. Also put partitions in between them in the
- // same equivalence class to avoid reordering of memory operations.
- for (PartitionContainerT::iterator I = PartitionContainer.begin(),
- E = PartitionContainer.end();
- I != E; ++I) {
- auto *PartI = &*I;
-
- // If a load occurs in two partitions PartI and PartJ, merge all
- // partitions (PartI, PartJ] into PartI.
- for (Instruction *Inst : *PartI)
- if (isa<LoadInst>(Inst)) {
- bool NewElt;
- LoadToPartitionT::iterator LoadToPart;
-
- std::tie(LoadToPart, NewElt) =
- LoadToPartition.insert(std::make_pair(Inst, PartI));
- if (!NewElt) {
- LLVM_DEBUG(dbgs()
- << "Merging partitions due to this load in multiple "
- << "partitions: " << PartI << ", " << LoadToPart->second
- << "\n"
- << *Inst << "\n");
-
- auto PartJ = I;
- do {
- --PartJ;
- ToBeMerged.unionSets(PartI, &*PartJ);
- } while (&*PartJ != LoadToPart->second);
- }
- }
- }
- if (ToBeMerged.empty())
- return false;
-
- // Merge the member of an equivalence class into its class leader. This
- // makes the members empty.
- for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
- I != E; ++I) {
- if (!I->isLeader())
- continue;
-
- auto PartI = I->getData();
- for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
- ToBeMerged.member_end())) {
- PartJ->moveTo(*PartI);
- }
- }
-
- // Remove the empty partitions.
- PartitionContainer.remove_if(
- [](const InstPartition &P) { return P.empty(); });
-
- return true;
- }
-
- /// Sets up the mapping from instructions to partitions. If the
- /// instruction is duplicated across multiple partitions, set the entry to -1.
- void setupPartitionIdOnInstructions() {
- int PartitionID = 0;
- for (const auto &Partition : PartitionContainer) {
- for (Instruction *Inst : Partition) {
- bool NewElt;
- InstToPartitionIdT::iterator Iter;
-
- std::tie(Iter, NewElt) =
- InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
- if (!NewElt)
- Iter->second = -1;
- }
- ++PartitionID;
- }
- }
-
- /// Populates the partition with everything that the seeding
- /// instructions require.
- void populateUsedSet() {
- for (auto &P : PartitionContainer)
- P.populateUsedSet();
- }
-
- /// This performs the main chunk of the work of cloning the loops for
- /// the partitions.
- void cloneLoops() {
- BasicBlock *OrigPH = L->getLoopPreheader();
- // At this point the predecessor of the preheader is either the memcheck
- // block or the top part of the original preheader.
- BasicBlock *Pred = OrigPH->getSinglePredecessor();
- assert(Pred && "Preheader does not have a single predecessor");
- BasicBlock *ExitBlock = L->getExitBlock();
- assert(ExitBlock && "No single exit block");
- Loop *NewLoop;
-
- assert(!PartitionContainer.empty() && "at least two partitions expected");
- // We're cloning the preheader along with the loop so we already made sure
- // it was empty.
- assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
- "preheader not empty");
-
- // Preserve the original loop ID for use after the transformation.
- MDNode *OrigLoopID = L->getLoopID();
-
- // Create a loop for each partition except the last. Clone the original
- // loop before PH and add a preheader for the cloned loop. Then
- // update PH to point to the newly added preheader.
- BasicBlock *TopPH = OrigPH;
- unsigned Index = getSize() - 1;
- for (auto I = std::next(PartitionContainer.rbegin()),
- E = PartitionContainer.rend();
- I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
- auto *Part = &*I;
-
- NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
-
- Part->getVMap()[ExitBlock] = TopPH;
- Part->remapInstructions();
- setNewLoopID(OrigLoopID, Part);
- }
- Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
-
- // Also set a new loop ID for the last loop.
- setNewLoopID(OrigLoopID, &PartitionContainer.back());
-
- // Now go in forward order and update the immediate dominator for the
- // preheaders with the exiting block of the previous loop. Dominance
- // within the loop is updated in cloneLoopWithPreheader.
- for (auto Curr = PartitionContainer.cbegin(),
- Next = std::next(PartitionContainer.cbegin()),
- E = PartitionContainer.cend();
- Next != E; ++Curr, ++Next)
- DT->changeImmediateDominator(
- Next->getDistributedLoop()->getLoopPreheader(),
- Curr->getDistributedLoop()->getExitingBlock());
- }
-
- /// Removes the dead instructions from the cloned loops.
- void removeUnusedInsts() {
- for (auto &Partition : PartitionContainer)
- Partition.removeUnusedInsts();
- }
-
- /// For each memory pointer, it computes the partitionId the pointer is
- /// used in.
- ///
- /// This returns an array of int where the I-th entry corresponds to the I-th
- /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in multiple
- /// partitions its entry is set to -1.
- SmallVector<int, 8>
- computePartitionSetForPointers(const LoopAccessInfo &LAI) {
- const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
-
- unsigned N = RtPtrCheck->Pointers.size();
- SmallVector<int, 8> PtrToPartitions(N);
- for (unsigned I = 0; I < N; ++I) {
- Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
- auto Instructions =
- LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
-
- int &Partition = PtrToPartitions[I];
- // First set it to uninitialized.
- Partition = -2;
- for (Instruction *Inst : Instructions) {
- // Note that this could be -1 if Inst is duplicated across multiple
- // partitions.
- int ThisPartition = this->InstToPartitionId[Inst];
- if (Partition == -2)
- Partition = ThisPartition;
- // -1 means belonging to multiple partitions.
- else if (Partition == -1)
- break;
- else if (Partition != (int)ThisPartition)
- Partition = -1;
- }
- assert(Partition != -2 && "Pointer not belonging to any partition");
- }
-
- return PtrToPartitions;
- }
-
- void print(raw_ostream &OS) const {
- unsigned Index = 0;
- for (const auto &P : PartitionContainer) {
- OS << "Partition " << Index++ << " (" << &P << "):\n";
- P.print();
- }
- }
-
- void dump() const { print(dbgs()); }
-
-#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &OS,
- const InstPartitionContainer &Partitions) {
- Partitions.print(OS);
- return OS;
- }
-#endif
-
- void printBlocks() const {
- unsigned Index = 0;
- for (const auto &P : PartitionContainer) {
- dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
- P.printBlocks();
- }
- }
-
-private:
- using PartitionContainerT = std::list<InstPartition>;
-
- /// List of partitions.
- PartitionContainerT PartitionContainer;
-
- /// Mapping from Instruction to partition Id. If the instruction
- /// belongs to multiple partitions the entry contains -1.
- InstToPartitionIdT InstToPartitionId;
-
- Loop *L;
- LoopInfo *LI;
- DominatorTree *DT;
-
- /// The control structure to merge adjacent partitions if both satisfy
- /// the \p Predicate.
- template <class UnaryPredicate>
- void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
- InstPartition *PrevMatch = nullptr;
- for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
- auto DoesMatch = Predicate(&*I);
- if (PrevMatch == nullptr && DoesMatch) {
- PrevMatch = &*I;
- ++I;
- } else if (PrevMatch != nullptr && DoesMatch) {
- I->moveTo(*PrevMatch);
- I = PartitionContainer.erase(I);
- } else {
- PrevMatch = nullptr;
- ++I;
- }
- }
- }
-
- /// Assign new LoopIDs for the partition's cloned loop.
- void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
- Optional<MDNode *> PartitionID = makeFollowupLoopID(
- OrigLoopID,
- {LLVMLoopDistributeFollowupAll,
- Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
- : LLVMLoopDistributeFollowupCoincident});
- if (PartitionID.hasValue()) {
- Loop *NewLoop = Part->getDistributedLoop();
- NewLoop->setLoopID(PartitionID.getValue());
- }
- }
-};
-
-/// For each memory instruction, this class maintains the difference between the
-/// number of unsafe dependences that start at this instruction and the number
-/// that end here.
-///
-/// By traversing the memory instructions in program order and accumulating this
-/// number, we know whether any unsafe dependence crosses over a program point.
-class MemoryInstructionDependences {
- using Dependence = MemoryDepChecker::Dependence;
-
-public:
- struct Entry {
- Instruction *Inst;
- unsigned NumUnsafeDependencesStartOrEnd = 0;
-
- Entry(Instruction *Inst) : Inst(Inst) {}
- };
-
- using AccessesType = SmallVector<Entry, 8>;
-
- AccessesType::const_iterator begin() const { return Accesses.begin(); }
- AccessesType::const_iterator end() const { return Accesses.end(); }
-
- MemoryInstructionDependences(
- const SmallVectorImpl<Instruction *> &Instructions,
- const SmallVectorImpl<Dependence> &Dependences) {
- Accesses.append(Instructions.begin(), Instructions.end());
-
- LLVM_DEBUG(dbgs() << "Backward dependences:\n");
- for (auto &Dep : Dependences)
- if (Dep.isPossiblyBackward()) {
- // Note that the designations source and destination follow the program
- // order, i.e. source is always first. (The direction is given by the
- // DepType.)
- ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
- --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
-
- LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
- }
- }
-
-private:
- AccessesType Accesses;
-};
-
-/// The actual class performing the per-loop work.
-class LoopDistributeForLoop {
-public:
- LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
- : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
- setForced();
- }
-
- /// Try to distribute an inner-most loop.
- bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+//===- LoopDistribute.cpp - Loop Distribution Pass ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop Distribution Pass. Its main focus is to
+// distribute loops that cannot be vectorized due to dependence cycles. It
+// tries to isolate the offending dependences into a new loop allowing
+// vectorization of the remaining parts.
+//
+// For dependence analysis, the pass uses the LoopVectorizer's
+// LoopAccessAnalysis. Because this analysis presumes no change in the order of
+// memory operations, special care is taken to preserve the lexical order of
+// these operations.
+//
+// Similarly to the Vectorizer, the pass also supports loop versioning to
+// disambiguate potentially overlapping arrays at run time.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopDistribute.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <cassert>
+#include <functional>
+#include <list>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LDIST_NAME "loop-distribute"
+#define DEBUG_TYPE LDIST_NAME
+
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopDistributeFollowupAll =
+ "llvm.loop.distribute.followup_all";
+static const char *const LLVMLoopDistributeFollowupCoincident =
+ "llvm.loop.distribute.followup_coincident";
+static const char *const LLVMLoopDistributeFollowupSequential =
+ "llvm.loop.distribute.followup_sequential";
+static const char *const LLVMLoopDistributeFollowupFallback =
+ "llvm.loop.distribute.followup_fallback";
+/// @}
+
+static cl::opt<bool>
+ LDistVerify("loop-distribute-verify", cl::Hidden,
+ cl::desc("Turn on DominatorTree and LoopInfo verification "
+ "after Loop Distribution"),
+ cl::init(false));
+
+static cl::opt<bool> DistributeNonIfConvertible(
+ "loop-distribute-non-if-convertible", cl::Hidden,
+ cl::desc("Whether to distribute into a loop that may not be "
+ "if-convertible by the loop vectorizer"),
+ cl::init(false));
+
+static cl::opt<unsigned> DistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Distribution"));
+
+static cl::opt<unsigned> PragmaDistributeSCEVCheckThreshold(
+ "loop-distribute-scev-check-threshold-with-pragma", cl::init(128),
+ cl::Hidden,
+ cl::desc(
+ "The maximum number of SCEV checks allowed for Loop "
+ "Distribution for loop marked with #pragma loop distribute(enable)"));
+
+static cl::opt<bool> EnableLoopDistribute(
+ "enable-loop-distribute", cl::Hidden,
+ cl::desc("Enable the new, experimental LoopDistribution Pass"),
+ cl::init(false));
+
+STATISTIC(NumLoopsDistributed, "Number of loops distributed");
+
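As a minimal source-level sketch of the kind of loop this pass targets (the function and array names below are invented for illustration, and the arrays are assumed to be disjoint): the first statement carries a loop-carried dependence and cannot be vectorized, while the second is independent; distribution splits them into two loops so that the second can be vectorized. The pragma is the Clang spelling behind the "#pragma loop distribute(enable)" text in the option description above; it is what ends up as the "llvm.loop.distribute.enable" loop metadata consulted by setForced() further down.

  void example(int N, float *A, float *B, float *D, float *E) {
  #pragma clang loop distribute(enable)
    for (int i = 0; i < N - 1; ++i) {
      A[i + 1] = A[i] + B[i]; // dependence cycle: stays in a sequential loop
      D[i] = E[i] + B[i];     // independent: can be vectorized after distribution
    }
  }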
+namespace {
+
+/// Maintains the set of instructions of the loop for a partition before
+/// cloning. After cloning, it hosts the new loop.
+class InstPartition {
+ using InstructionSet = SmallPtrSet<Instruction *, 8>;
+
+public:
+ InstPartition(Instruction *I, Loop *L, bool DepCycle = false)
+ : DepCycle(DepCycle), OrigLoop(L) {
+ Set.insert(I);
+ }
+
+ /// Returns whether this partition contains a dependence cycle.
+ bool hasDepCycle() const { return DepCycle; }
+
+ /// Adds an instruction to this partition.
+ void add(Instruction *I) { Set.insert(I); }
+
+ /// Collection accessors.
+ InstructionSet::iterator begin() { return Set.begin(); }
+ InstructionSet::iterator end() { return Set.end(); }
+ InstructionSet::const_iterator begin() const { return Set.begin(); }
+ InstructionSet::const_iterator end() const { return Set.end(); }
+ bool empty() const { return Set.empty(); }
+
+ /// Moves this partition into \p Other. This partition becomes empty
+ /// after this.
+ void moveTo(InstPartition &Other) {
+ Other.Set.insert(Set.begin(), Set.end());
+ Set.clear();
+ Other.DepCycle |= DepCycle;
+ }
+
+ /// Populates the partition with a transitive closure of all the
+ /// instructions that the seeded instructions depend on.
+ void populateUsedSet() {
+ // FIXME: We currently don't use control-dependence but simply include all
+ // blocks (possibly empty at the end) and let simplifycfg mostly clean this
+ // up.
+ for (auto *B : OrigLoop->getBlocks())
+ Set.insert(B->getTerminator());
+
+ // Follow the use-def chains to form a transitive closure of all the
+ // instructions that the originally seeded instructions depend on.
+ SmallVector<Instruction *, 8> Worklist(Set.begin(), Set.end());
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ // Insert instructions from the loop that we depend on.
+ for (Value *V : I->operand_values()) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && OrigLoop->contains(I->getParent()) && Set.insert(I).second)
+ Worklist.push_back(I);
+ }
+ }
+ }
+
+ /// Clones the original loop.
+ ///
+ /// Updates LoopInfo and DominatorTree using the information that block \p
+ /// LoopDomBB dominates the loop.
+ Loop *cloneLoopWithPreheader(BasicBlock *InsertBefore, BasicBlock *LoopDomBB,
+ unsigned Index, LoopInfo *LI,
+ DominatorTree *DT) {
+ ClonedLoop = ::cloneLoopWithPreheader(InsertBefore, LoopDomBB, OrigLoop,
+ VMap, Twine(".ldist") + Twine(Index),
+ LI, DT, ClonedLoopBlocks);
+ return ClonedLoop;
+ }
+
+ /// The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ const Loop *getClonedLoop() const { return ClonedLoop; }
+
+ /// Returns the loop where this partition ends up after distribution.
+ /// If this partition is mapped to the original loop, the original loop is
+ /// returned.
+ Loop *getDistributedLoop() const {
+ return ClonedLoop ? ClonedLoop : OrigLoop;
+ }
+
+ /// The VMap that is populated by cloning and then used by
+ /// remapInstructions() to remap the cloned instructions.
+ ValueToValueMapTy &getVMap() { return VMap; }
+
+ /// Remaps the cloned instructions using VMap.
+ void remapInstructions() {
+ remapInstructionsInBlocks(ClonedLoopBlocks, VMap);
+ }
+
+ /// Based on the set of instructions selected for this partition,
+ /// removes the unnecessary ones.
+ void removeUnusedInsts() {
+ SmallVector<Instruction *, 8> Unused;
+
+ for (auto *Block : OrigLoop->getBlocks())
+ for (auto &Inst : *Block)
+ if (!Set.count(&Inst)) {
+ Instruction *NewInst = &Inst;
+ if (!VMap.empty())
+ NewInst = cast<Instruction>(VMap[NewInst]);
+
+ assert(!isa<BranchInst>(NewInst) &&
+ "Branches are marked used early on");
+ Unused.push_back(NewInst);
+ }
+
+ // Delete the instructions backwards, as this reduces the number of def-use
+ // and use-def chain updates that are needed.
+ for (auto *Inst : reverse(Unused)) {
+ if (!Inst->use_empty())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ Inst->eraseFromParent();
+ }
+ }
+
+ void print() const {
+ if (DepCycle)
+ dbgs() << " (cycle)\n";
+ for (auto *I : Set)
+ // Prefix with the block name.
+ dbgs() << " " << I->getParent()->getName() << ":" << *I << "\n";
+ }
+
+ void printBlocks() const {
+ for (auto *BB : getDistributedLoop()->getBlocks())
+ dbgs() << *BB;
+ }
+
+private:
+ /// Instructions from OrigLoop selected for this partition.
+ InstructionSet Set;
+
+ /// Whether this partition contains a dependence cycle.
+ bool DepCycle;
+
+ /// The original loop.
+ Loop *OrigLoop;
+
+ /// The cloned loop. If this partition is mapped to the original loop,
+ /// this is null.
+ Loop *ClonedLoop = nullptr;
+
+ /// The blocks of ClonedLoop including the preheader. If this
+ /// partition is mapped to the original loop, this is empty.
+ SmallVector<BasicBlock *, 8> ClonedLoopBlocks;
+
+ /// This gets populated once the set of instructions has been
+ /// finalized. If this partition is mapped to the original loop, it is not
+ /// set.
+ ValueToValueMapTy VMap;
+};
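A condensed sketch of how the container below drives a single partition; it mirrors cloneLoops() and removeUnusedInsts() further down, with SeedInst, Pred, NextPH, ExitBlock and Idx standing in for values the container supplies:

  InstPartition Part(SeedInst, L, /*DepCycle=*/true); // seed with one instruction
  Part.populateUsedSet();                             // pull in everything the seed needs
  Loop *NewL = Part.cloneLoopWithPreheader(NextPH, Pred, Idx, LI, DT);
  Part.getVMap()[ExitBlock] = NextPH;                 // cloned loop falls through to the next loop
  Part.remapInstructions();
  Part.removeUnusedInsts();                           // drop instructions this partition does not own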
+
+/// Holds the set of Partitions. It populates them, merges them and then
+/// clones the loops.
+class InstPartitionContainer {
+ using InstToPartitionIdT = DenseMap<Instruction *, int>;
+
+public:
+ InstPartitionContainer(Loop *L, LoopInfo *LI, DominatorTree *DT)
+ : L(L), LI(LI), DT(DT) {}
+
+ /// Returns the number of partitions.
+ unsigned getSize() const { return PartitionContainer.size(); }
+
+ /// Adds \p Inst into the current partition if that partition is marked to
+ /// contain cycles; otherwise starts a new partition for it.
+ void addToCyclicPartition(Instruction *Inst) {
+ // If the current partition is non-cyclic, start a new one.
+ if (PartitionContainer.empty() || !PartitionContainer.back().hasDepCycle())
+ PartitionContainer.emplace_back(Inst, L, /*DepCycle=*/true);
+ else
+ PartitionContainer.back().add(Inst);
+ }
+
+ /// Adds \p Inst into a partition that is not marked to contain
+ /// dependence cycles.
+ ///
+ // Initially we isolate memory instructions into as many partitions as
+ // possible, then later we may merge them back together.
+ void addToNewNonCyclicPartition(Instruction *Inst) {
+ PartitionContainer.emplace_back(Inst, L);
+ }
+
+ /// Merges adjacent non-cyclic partitions.
+ ///
+ /// The idea is that we currently only want to isolate the non-vectorizable
+ /// partition. We could later allow more distribution among these partitions
+ /// too.
+ void mergeAdjacentNonCyclic() {
+ mergeAdjacentPartitionsIf(
+ [](const InstPartition *P) { return !P->hasDepCycle(); });
+ }
+
+ /// If a partition contains only conditional stores, we won't vectorize
+ /// it. Try to merge it with a previous cyclic partition.
+ void mergeNonIfConvertible() {
+ mergeAdjacentPartitionsIf([&](const InstPartition *Partition) {
+ if (Partition->hasDepCycle())
+ return true;
+
+ // Now, check if all stores are conditional in this partition.
+ bool seenStore = false;
+
+ for (auto *Inst : *Partition)
+ if (isa<StoreInst>(Inst)) {
+ seenStore = true;
+ if (!LoopAccessInfo::blockNeedsPredication(Inst->getParent(), L, DT))
+ return false;
+ }
+ return seenStore;
+ });
+ }
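A hedged example of the case this heuristic guards against (Mask, Out and In are invented names): a partition whose only store is conditional would force the vectorizer to if-convert it, so unless -loop-distribute-non-if-convertible is set, such a partition is merged with an adjacent matching partition rather than isolated.

  for (int i = 0; i < N; ++i)
    if (Mask[i])      // every store in this partition is predicated
      Out[i] = In[i];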
+
+ /// Merges the partitions according to various heuristics.
+ void mergeBeforePopulating() {
+ mergeAdjacentNonCyclic();
+ if (!DistributeNonIfConvertible)
+ mergeNonIfConvertible();
+ }
+
+ /// Merges partitions in order to ensure that no loads are duplicated.
+ ///
+ /// We can't duplicate loads because that could potentially reorder them.
+ /// LoopAccessAnalysis provides dependency information with the context that
+ /// the order of memory operations is preserved.
+ ///
+ /// Returns true if any partitions were merged.
+ bool mergeToAvoidDuplicatedLoads() {
+ using LoadToPartitionT = DenseMap<Instruction *, InstPartition *>;
+ using ToBeMergedT = EquivalenceClasses<InstPartition *>;
+
+ LoadToPartitionT LoadToPartition;
+ ToBeMergedT ToBeMerged;
+
+ // Step through the partitions and create equivalence between partitions
+ // that contain the same load. Also put partitions in between them in the
+ // same equivalence class to avoid reordering of memory operations.
+ for (PartitionContainerT::iterator I = PartitionContainer.begin(),
+ E = PartitionContainer.end();
+ I != E; ++I) {
+ auto *PartI = &*I;
+
+ // If a load occurs in two partitions PartI and PartJ, merge all
+ // partitions (PartI, PartJ] into PartI.
+ for (Instruction *Inst : *PartI)
+ if (isa<LoadInst>(Inst)) {
+ bool NewElt;
+ LoadToPartitionT::iterator LoadToPart;
+
+ std::tie(LoadToPart, NewElt) =
+ LoadToPartition.insert(std::make_pair(Inst, PartI));
+ if (!NewElt) {
+ LLVM_DEBUG(dbgs()
+ << "Merging partitions due to this load in multiple "
+ << "partitions: " << PartI << ", " << LoadToPart->second
+ << "\n"
+ << *Inst << "\n");
+
+ auto PartJ = I;
+ do {
+ --PartJ;
+ ToBeMerged.unionSets(PartI, &*PartJ);
+ } while (&*PartJ != LoadToPart->second);
+ }
+ }
+ }
+ if (ToBeMerged.empty())
+ return false;
+
+ // Merge the members of an equivalence class into its class leader. This
+ // makes the members empty.
+ for (ToBeMergedT::iterator I = ToBeMerged.begin(), E = ToBeMerged.end();
+ I != E; ++I) {
+ if (!I->isLeader())
+ continue;
+
+ auto PartI = I->getData();
+ for (auto PartJ : make_range(std::next(ToBeMerged.member_begin(I)),
+ ToBeMerged.member_end())) {
+ PartJ->moveTo(*PartI);
+ }
+ }
+
+ // Remove the empty partitions.
+ PartitionContainer.remove_if(
+ [](const InstPartition &P) { return P.empty(); });
+
+ return true;
+ }
+
+ /// Sets up the mapping from instructions to partitions. If the
+ /// instruction is duplicated across multiple partitions, set the entry to -1.
+ void setupPartitionIdOnInstructions() {
+ int PartitionID = 0;
+ for (const auto &Partition : PartitionContainer) {
+ for (Instruction *Inst : Partition) {
+ bool NewElt;
+ InstToPartitionIdT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ InstToPartitionId.insert(std::make_pair(Inst, PartitionID));
+ if (!NewElt)
+ Iter->second = -1;
+ }
+ ++PartitionID;
+ }
+ }
+
+ /// Populates the partition with everything that the seeding
+ /// instructions require.
+ void populateUsedSet() {
+ for (auto &P : PartitionContainer)
+ P.populateUsedSet();
+ }
+
+ /// This performs the main chunk of the work of cloning the loops for
+ /// the partitions.
+ void cloneLoops() {
+ BasicBlock *OrigPH = L->getLoopPreheader();
+ // At this point the predecessor of the preheader is either the memcheck
+ // block or the top part of the original preheader.
+ BasicBlock *Pred = OrigPH->getSinglePredecessor();
+ assert(Pred && "Preheader does not have a single predecessor");
+ BasicBlock *ExitBlock = L->getExitBlock();
+ assert(ExitBlock && "No single exit block");
+ Loop *NewLoop;
+
+ assert(!PartitionContainer.empty() && "at least two partitions expected");
+ // We're cloning the preheader along with the loop so we already made sure
+ // it was empty.
+ assert(&*OrigPH->begin() == OrigPH->getTerminator() &&
+ "preheader not empty");
+
+ // Preserve the original loop ID for use after the transformation.
+ MDNode *OrigLoopID = L->getLoopID();
+
+ // Create a loop for each partition except the last. Clone the original
+ // loop before PH and add a preheader for the cloned loop. Then
+ // update PH to point to the newly added preheader.
+ BasicBlock *TopPH = OrigPH;
+ unsigned Index = getSize() - 1;
+ for (auto I = std::next(PartitionContainer.rbegin()),
+ E = PartitionContainer.rend();
+ I != E; ++I, --Index, TopPH = NewLoop->getLoopPreheader()) {
+ auto *Part = &*I;
+
+ NewLoop = Part->cloneLoopWithPreheader(TopPH, Pred, Index, LI, DT);
+
+ Part->getVMap()[ExitBlock] = TopPH;
+ Part->remapInstructions();
+ setNewLoopID(OrigLoopID, Part);
+ }
+ Pred->getTerminator()->replaceUsesOfWith(OrigPH, TopPH);
+
+ // Also set a new loop ID for the last loop.
+ setNewLoopID(OrigLoopID, &PartitionContainer.back());
+
+ // Now go in forward order and update the immediate dominator for the
+ // preheaders with the exiting block of the previous loop. Dominance
+ // within the loop is updated in cloneLoopWithPreheader.
+ for (auto Curr = PartitionContainer.cbegin(),
+ Next = std::next(PartitionContainer.cbegin()),
+ E = PartitionContainer.cend();
+ Next != E; ++Curr, ++Next)
+ DT->changeImmediateDominator(
+ Next->getDistributedLoop()->getLoopPreheader(),
+ Curr->getDistributedLoop()->getExitingBlock());
+ }
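Sketching the result for two partitions, under the assumptions asserted above (a single preheader predecessor and a single exit block), the blocks end up chained roughly as follows; the ".ldist<Index>" suffix comes from the Twine passed to cloneLoopWithPreheader():

  Pred -> PH.ldist1 -> cloned loop (first partition) -> OrigPH -> original loop (last partition) -> ExitBlock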
+
+ /// Removes the dead instructions from the cloned loops.
+ void removeUnusedInsts() {
+ for (auto &Partition : PartitionContainer)
+ Partition.removeUnusedInsts();
+ }
+
+ /// For each memory pointer, it computes the partitionId the pointer is
+ /// used in.
+ ///
+ /// This returns an array of int where the I-th entry corresponds to the I-th
+ /// entry in LAI.getRuntimePointerChecking(). If the pointer is used in multiple
+ /// partitions its entry is set to -1.
+ SmallVector<int, 8>
+ computePartitionSetForPointers(const LoopAccessInfo &LAI) {
+ const RuntimePointerChecking *RtPtrCheck = LAI.getRuntimePointerChecking();
+
+ unsigned N = RtPtrCheck->Pointers.size();
+ SmallVector<int, 8> PtrToPartitions(N);
+ for (unsigned I = 0; I < N; ++I) {
+ Value *Ptr = RtPtrCheck->Pointers[I].PointerValue;
+ auto Instructions =
+ LAI.getInstructionsForAccess(Ptr, RtPtrCheck->Pointers[I].IsWritePtr);
+
+ int &Partition = PtrToPartitions[I];
+ // First set it to uninitialized.
+ Partition = -2;
+ for (Instruction *Inst : Instructions) {
+ // Note that this could be -1 if Inst is duplicated across multiple
+ // partitions.
+ int ThisPartition = this->InstToPartitionId[Inst];
+ if (Partition == -2)
+ Partition = ThisPartition;
+ // -1 means belonging to multiple partitions.
+ else if (Partition == -1)
+ break;
+ else if (Partition != (int)ThisPartition)
+ Partition = -1;
+ }
+ assert(Partition != -2 && "Pointer not belonging to any partition");
+ }
+
+ return PtrToPartitions;
+ }
+
+ void print(raw_ostream &OS) const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ OS << "Partition " << Index++ << " (" << &P << "):\n";
+ P.print();
+ }
+ }
+
+ void dump() const { print(dbgs()); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const InstPartitionContainer &Partitions) {
+ Partitions.print(OS);
+ return OS;
+ }
+#endif
+
+ void printBlocks() const {
+ unsigned Index = 0;
+ for (const auto &P : PartitionContainer) {
+ dbgs() << "\nPartition " << Index++ << " (" << &P << "):\n";
+ P.printBlocks();
+ }
+ }
+
+private:
+ using PartitionContainerT = std::list<InstPartition>;
+
+ /// List of partitions.
+ PartitionContainerT PartitionContainer;
+
+ /// Mapping from Instruction to partition Id. If the instruction
+ /// belongs to multiple partitions the entry contains -1.
+ InstToPartitionIdT InstToPartitionId;
+
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+
+ /// The control structure to merge adjacent partitions if both satisfy
+ /// the \p Predicate.
+ template <class UnaryPredicate>
+ void mergeAdjacentPartitionsIf(UnaryPredicate Predicate) {
+ InstPartition *PrevMatch = nullptr;
+ for (auto I = PartitionContainer.begin(); I != PartitionContainer.end();) {
+ auto DoesMatch = Predicate(&*I);
+ if (PrevMatch == nullptr && DoesMatch) {
+ PrevMatch = &*I;
+ ++I;
+ } else if (PrevMatch != nullptr && DoesMatch) {
+ I->moveTo(*PrevMatch);
+ I = PartitionContainer.erase(I);
+ } else {
+ PrevMatch = nullptr;
+ ++I;
+ }
+ }
+ }
+
+ /// Assign new LoopIDs for the partition's cloned loop.
+ void setNewLoopID(MDNode *OrigLoopID, InstPartition *Part) {
+ Optional<MDNode *> PartitionID = makeFollowupLoopID(
+ OrigLoopID,
+ {LLVMLoopDistributeFollowupAll,
+ Part->hasDepCycle() ? LLVMLoopDistributeFollowupSequential
+ : LLVMLoopDistributeFollowupCoincident});
+ if (PartitionID.hasValue()) {
+ Loop *NewLoop = Part->getDistributedLoop();
+ NewLoop->setLoopID(PartitionID.getValue());
+ }
+ }
+};
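Condensed from processLoop() further down, the container is typically driven in this order (seeding of the individual memory instructions, debug output and the various bail-outs omitted):

  InstPartitionContainer Partitions(L, LI, DT);
  // ... addToCyclicPartition(I) / addToNewNonCyclicPartition(I) per memory instruction ...
  Partitions.mergeBeforePopulating();          // adjacency / if-conversion heuristics
  Partitions.populateUsedSet();                // pull in the non-memory instructions
  Partitions.mergeToAvoidDuplicatedLoads();    // keep each load in a single partition
  Partitions.setupPartitionIdOnInstructions();
  Partitions.cloneLoops();                     // one loop per partition, chained via preheaders
  Partitions.removeUnusedInsts();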
+
+/// For each memory instruction, this class maintains the difference between the
+/// number of unsafe dependences that start at this instruction and the number
+/// that end here.
+///
+/// By traversing the memory instructions in program order and accumulating this
+/// number, we know whether any unsafe dependence crosses over a program point.
+class MemoryInstructionDependences {
+ using Dependence = MemoryDepChecker::Dependence;
+
+public:
+ struct Entry {
+ Instruction *Inst;
+ unsigned NumUnsafeDependencesStartOrEnd = 0;
+
+ Entry(Instruction *Inst) : Inst(Inst) {}
+ };
+
+ using AccessesType = SmallVector<Entry, 8>;
+
+ AccessesType::const_iterator begin() const { return Accesses.begin(); }
+ AccessesType::const_iterator end() const { return Accesses.end(); }
+
+ MemoryInstructionDependences(
+ const SmallVectorImpl<Instruction *> &Instructions,
+ const SmallVectorImpl<Dependence> &Dependences) {
+ Accesses.append(Instructions.begin(), Instructions.end());
+
+ LLVM_DEBUG(dbgs() << "Backward dependences:\n");
+ for (auto &Dep : Dependences)
+ if (Dep.isPossiblyBackward()) {
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ ++Accesses[Dep.Source].NumUnsafeDependencesStartOrEnd;
+ --Accesses[Dep.Destination].NumUnsafeDependencesStartOrEnd;
+
+ LLVM_DEBUG(Dep.print(dbgs(), 2, Instructions));
+ }
+ }
+
+private:
+ AccessesType Accesses;
+};
+
+/// The actual class performing the per-loop work.
+class LoopDistributeForLoop {
+public:
+ LoopDistributeForLoop(Loop *L, Function *F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE)
+ : L(L), F(F), LI(LI), DT(DT), SE(SE), ORE(ORE) {
+ setForced();
+ }
+
+ /// Try to distribute an inner-most loop.
+ bool processLoop(std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
assert(L->isInnermost() && "Only process inner loops.");
-
- LLVM_DEBUG(dbgs() << "\nLDist: In \""
- << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
-
+
+ LLVM_DEBUG(dbgs() << "\nLDist: In \""
+ << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
// Having a single exit block implies there's also one exiting block.
- if (!L->getExitBlock())
- return fail("MultipleExitBlocks", "multiple exit blocks");
- if (!L->isLoopSimplifyForm())
- return fail("NotLoopSimplifyForm",
- "loop is not in loop-simplify form");
+ if (!L->getExitBlock())
+ return fail("MultipleExitBlocks", "multiple exit blocks");
+ if (!L->isLoopSimplifyForm())
+ return fail("NotLoopSimplifyForm",
+ "loop is not in loop-simplify form");
if (!L->isRotatedForm())
return fail("NotBottomTested", "loop is not bottom tested");
-
- BasicBlock *PH = L->getLoopPreheader();
-
- LAI = &GetLAA(*L);
-
- // Currently, we only distribute to isolate the part of the loop with
- // dependence cycles to enable partial vectorization.
- if (LAI->canVectorizeMemory())
- return fail("MemOpsCanBeVectorized",
- "memory operations are safe for vectorization");
-
- auto *Dependences = LAI->getDepChecker().getDependences();
- if (!Dependences || Dependences->empty())
- return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
-
- InstPartitionContainer Partitions(L, LI, DT);
-
- // First, go through the memory operations and assign them to consecutive
- // partitions (the order of partitions follows program order). Put those
- // with unsafe dependences into a "cyclic" partition; otherwise put each store
- // in its own "non-cyclic" partition (we'll merge these later).
- //
- // Note that a memory operation (e.g. Load2 below) at a program point that
- // has an unsafe dependence (Store3->Load1) spanning over it must be
- // included in the same cyclic partition as the dependent operations. This
- // is to preserve the original program order after distribution. E.g.:
- //
- // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
- // Load1 -. 1 0->1
- // Load2 | /Unsafe/ 0 1
- // Store3 -' -1 1->0
- // Load4 0 0
- //
- // NumUnsafeDependencesActive > 0 indicates this situation and in this case
- // we just keep assigning to the same cyclic partition until
- // NumUnsafeDependencesActive reaches 0.
- const MemoryDepChecker &DepChecker = LAI->getDepChecker();
- MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
- *Dependences);
-
- int NumUnsafeDependencesActive = 0;
- for (auto &InstDep : MID) {
- Instruction *I = InstDep.Inst;
- // We update NumUnsafeDependencesActive post-instruction, catch the
- // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
- if (NumUnsafeDependencesActive ||
- InstDep.NumUnsafeDependencesStartOrEnd > 0)
- Partitions.addToCyclicPartition(I);
- else
- Partitions.addToNewNonCyclicPartition(I);
- NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
- assert(NumUnsafeDependencesActive >= 0 &&
- "Negative number of dependences active");
- }
-
- // Add partitions for values used outside. These partitions can be out of
- // order from the original program order. This is OK because if the
- // partition uses a load we will merge this partition with the original
- // partition of the load that we set up in the previous loop (see
- // mergeToAvoidDuplicatedLoads).
- auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
- for (auto *Inst : DefsUsedOutside)
- Partitions.addToNewNonCyclicPartition(Inst);
-
- LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
-
- // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
- // should be able to vectorize these together.
- Partitions.mergeBeforePopulating();
- LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
-
- // Now, populate the partitions with non-memory operations.
- Partitions.populateUsedSet();
- LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
-
- // In order to preserve original lexical order for loads, keep them in the
- // partition that we set up in the MemoryInstructionDependences loop.
- if (Partitions.mergeToAvoidDuplicatedLoads()) {
- LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
- << Partitions);
- if (Partitions.getSize() < 2)
- return fail("CantIsolateUnsafeDeps",
- "cannot isolate unsafe dependencies");
- }
-
- // Don't distribute the loop if we need too many SCEV run-time checks, or
- // any checks at all when inserting them would be illegal.
- const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
- if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
- return fail("RuntimeCheckWithConvergent",
- "may not insert runtime check with convergent operation");
- }
-
- if (Pred.getComplexity() > (IsForced.getValueOr(false)
- ? PragmaDistributeSCEVCheckThreshold
- : DistributeSCEVCheckThreshold))
- return fail("TooManySCEVRuntimeChecks",
- "too many SCEV run-time checks needed.\n");
-
- if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
- return fail("HeuristicDisabled", "distribution heuristic disabled");
-
- LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
- // We're done forming the partitions; set up the reverse mapping from
- // instructions to partitions.
- Partitions.setupPartitionIdOnInstructions();
-
- // If we need run-time checks, version the loop now.
- auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
- const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
- const auto &AllChecks = RtPtrChecking->getChecks();
- auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
- RtPtrChecking);
-
- if (LAI->hasConvergentOp() && !Checks.empty()) {
- return fail("RuntimeCheckWithConvergent",
- "may not insert runtime check with convergent operation");
- }
-
- // To keep things simple have an empty preheader before we version or clone
- // the loop. (Also split if this has no predecessor, i.e. entry, because we
- // rely on PH having a predecessor.)
- if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
- SplitBlock(PH, PH->getTerminator(), DT, LI);
-
- if (!Pred.isAlwaysTrue() || !Checks.empty()) {
- assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
-
- MDNode *OrigLoopID = L->getLoopID();
-
- LLVM_DEBUG(dbgs() << "\nPointers:\n");
- LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ BasicBlock *PH = L->getLoopPreheader();
+
+ LAI = &GetLAA(*L);
+
+ // Currently, we only distribute to isolate the part of the loop with
+ // dependence cycles to enable partial vectorization.
+ if (LAI->canVectorizeMemory())
+ return fail("MemOpsCanBeVectorized",
+ "memory operations are safe for vectorization");
+
+ auto *Dependences = LAI->getDepChecker().getDependences();
+ if (!Dependences || Dependences->empty())
+ return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
+
+ InstPartitionContainer Partitions(L, LI, DT);
+
+ // First, go through the memory operations and assign them to consecutive
+ // partitions (the order of partitions follows program order). Put those
+ // with unsafe dependences into a "cyclic" partition; otherwise put each store
+ // in its own "non-cyclic" partition (we'll merge these later).
+ //
+ // Note that a memory operation (e.g. Load2 below) at a program point that
+ // has an unsafe dependence (Store3->Load1) spanning over it must be
+ // included in the same cyclic partition as the dependent operations. This
+ // is to preserve the original program order after distribution. E.g.:
+ //
+ // NumUnsafeDependencesStartOrEnd NumUnsafeDependencesActive
+ // Load1 -. 1 0->1
+ // Load2 | /Unsafe/ 0 1
+ // Store3 -' -1 1->0
+ // Load4 0 0
+ //
+ // NumUnsafeDependencesActive > 0 indicates this situation and in this case
+ // we just keep assigning to the same cyclic partition until
+ // NumUnsafeDependencesActive reaches 0.
+ const MemoryDepChecker &DepChecker = LAI->getDepChecker();
+ MemoryInstructionDependences MID(DepChecker.getMemoryInstructions(),
+ *Dependences);
+
+ int NumUnsafeDependencesActive = 0;
+ for (auto &InstDep : MID) {
+ Instruction *I = InstDep.Inst;
+ // We update NumUnsafeDependencesActive post-instruction, catch the
+ // start of a dependence directly via NumUnsafeDependencesStartOrEnd.
+ if (NumUnsafeDependencesActive ||
+ InstDep.NumUnsafeDependencesStartOrEnd > 0)
+ Partitions.addToCyclicPartition(I);
+ else
+ Partitions.addToNewNonCyclicPartition(I);
+ NumUnsafeDependencesActive += InstDep.NumUnsafeDependencesStartOrEnd;
+ assert(NumUnsafeDependencesActive >= 0 &&
+ "Negative number of dependences active");
+ }
+
+ // Add partitions for values used outside. These partitions can be out of
+ // order from the original program order. This is OK because if the
+ // partition uses a load we will merge this partition with the original
+ // partition of the load that we set up in the previous loop (see
+ // mergeToAvoidDuplicatedLoads).
+ auto DefsUsedOutside = findDefsUsedOutsideOfLoop(L);
+ for (auto *Inst : DefsUsedOutside)
+ Partitions.addToNewNonCyclicPartition(Inst);
+
+ LLVM_DEBUG(dbgs() << "Seeded partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Run the merge heuristics: Merge non-cyclic adjacent partitions since we
+ // should be able to vectorize these together.
+ Partitions.mergeBeforePopulating();
+ LLVM_DEBUG(dbgs() << "\nMerged partitions:\n" << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+
+ // Now, populate the partitions with non-memory operations.
+ Partitions.populateUsedSet();
+ LLVM_DEBUG(dbgs() << "\nPopulated partitions:\n" << Partitions);
+
+ // In order to preserve original lexical order for loads, keep them in the
+ // partition that we set up in the MemoryInstructionDependences loop.
+ if (Partitions.mergeToAvoidDuplicatedLoads()) {
+ LLVM_DEBUG(dbgs() << "\nPartitions merged to ensure unique loads:\n"
+ << Partitions);
+ if (Partitions.getSize() < 2)
+ return fail("CantIsolateUnsafeDeps",
+ "cannot isolate unsafe dependencies");
+ }
+
+ // Don't distribute the loop if we need too many SCEV run-time checks, or
+ // any checks at all when inserting them would be illegal.
+ const SCEVUnionPredicate &Pred = LAI->getPSE().getUnionPredicate();
+ if (LAI->hasConvergentOp() && !Pred.isAlwaysTrue()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
+ if (Pred.getComplexity() > (IsForced.getValueOr(false)
+ ? PragmaDistributeSCEVCheckThreshold
+ : DistributeSCEVCheckThreshold))
+ return fail("TooManySCEVRuntimeChecks",
+ "too many SCEV run-time checks needed.\n");
+
+ if (!IsForced.getValueOr(false) && hasDisableAllTransformsHint(L))
+ return fail("HeuristicDisabled", "distribution heuristic disabled");
+
+ LLVM_DEBUG(dbgs() << "\nDistributing loop: " << *L << "\n");
+ // We're done forming the partitions; set up the reverse mapping from
+ // instructions to partitions.
+ Partitions.setupPartitionIdOnInstructions();
+
+ // If we need run-time checks, version the loop now.
+ auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
+ const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
+ const auto &AllChecks = RtPtrChecking->getChecks();
+ auto Checks = includeOnlyCrossPartitionChecks(AllChecks, PtrToPartition,
+ RtPtrChecking);
+
+ if (LAI->hasConvergentOp() && !Checks.empty()) {
+ return fail("RuntimeCheckWithConvergent",
+ "may not insert runtime check with convergent operation");
+ }
+
+ // To keep things simple have an empty preheader before we version or clone
+ // the loop. (Also split if this has no predecessor, i.e. entry, because we
+ // rely on PH having a predecessor.)
+ if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+ SplitBlock(PH, PH->getTerminator(), DT, LI);
+
+ if (!Pred.isAlwaysTrue() || !Checks.empty()) {
+ assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
+
+ MDNode *OrigLoopID = L->getLoopID();
+
+ LLVM_DEBUG(dbgs() << "\nPointers:\n");
+ LLVM_DEBUG(LAI->getRuntimePointerChecking()->printChecks(dbgs(), Checks));
LoopVersioning LVer(*LAI, Checks, L, LI, DT, SE);
- LVer.versionLoop(DefsUsedOutside);
- LVer.annotateLoopWithNoAlias();
-
- // The unversioned loop will not be changed, so we inherit all attributes
- // from the original loop, but remove the loop distribution metadata to
- // avoid distributing it again.
- MDNode *UnversionedLoopID =
- makeFollowupLoopID(OrigLoopID,
- {LLVMLoopDistributeFollowupAll,
- LLVMLoopDistributeFollowupFallback},
- "llvm.loop.distribute.", true)
- .getValue();
- LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
- }
-
- // Create identical copies of the original loop for each partition and hook
- // them up sequentially.
- Partitions.cloneLoops();
-
- // Now, we remove the instructions from each loop that don't belong to that
- // partition.
- Partitions.removeUnusedInsts();
- LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
- LLVM_DEBUG(Partitions.printBlocks());
-
- if (LDistVerify) {
- LI->verify(*DT);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- }
-
- ++NumLoopsDistributed;
- // Report the success.
- ORE->emit([&]() {
- return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
- L->getHeader())
- << "distributed loop";
- });
- return true;
- }
-
- /// Provide diagnostics, then \return false.
- bool fail(StringRef RemarkName, StringRef Message) {
- LLVMContext &Ctx = F->getContext();
- bool Forced = isForced().getValueOr(false);
-
- LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
-
- // With -Rpass-missed, report that distribution failed.
- ORE->emit([&]() {
- return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
- L->getStartLoc(), L->getHeader())
- << "loop not distributed: use -Rpass-analysis=loop-distribute for "
- "more "
- "info";
- });
-
- // With -Rpass-analysis, report why. This is on by default if distribution
- // was requested explicitly.
- ORE->emit(OptimizationRemarkAnalysis(
- Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
- RemarkName, L->getStartLoc(), L->getHeader())
- << "loop not distributed: " << Message);
-
- // Also issue a warning if distribution was requested explicitly but it
- // failed.
- if (Forced)
- Ctx.diagnose(DiagnosticInfoOptimizationFailure(
- *F, L->getStartLoc(), "loop not distributed: failed "
- "explicitly specified loop distribution"));
-
- return false;
- }
-
- /// Return whether distribution is forced to be enabled/disabled for the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value
- /// distribution was not forced either way.
- const Optional<bool> &isForced() const { return IsForced; }
-
-private:
- /// Filter out checks between pointers from the same partition.
- ///
- /// \p PtrToPartition contains the partition number for pointers. Partition
- /// number -1 means that the pointer is used in multiple partitions. In this
- /// case we can't safely omit the check.
- SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
- const SmallVectorImpl<int> &PtrToPartition,
- const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerCheck, 4> Checks;
-
- copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerCheck &Check) {
- for (unsigned PtrIdx1 : Check.first->Members)
- for (unsigned PtrIdx2 : Check.second->Members)
- // Only include this check if there is a pair of pointers
- // that require checking and the pointers fall into
- // separate partitions.
- //
- // (Note that we already know at this point that the two
- // pointer groups need checking but it doesn't follow
- // that each pair of pointers within the two groups need
- // checking as well.
- //
- // In other words we don't want to include a check just
- // because there is a pair of pointers between the two
- // pointer groups that require checks and a different
- // pair whose pointers fall into different partitions.)
- if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
- !RuntimePointerChecking::arePointersInSamePartition(
- PtrToPartition, PtrIdx1, PtrIdx2))
- return true;
- return false;
- });
-
- return Checks;
- }
-
- /// Check whether the loop metadata is forcing distribution to be
- /// enabled/disabled.
- void setForced() {
- Optional<const MDOperand *> Value =
- findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
- if (!Value)
- return;
-
- const MDOperand *Op = *Value;
- assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
- IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
- }
-
- Loop *L;
- Function *F;
-
- // Analyses used.
- LoopInfo *LI;
- const LoopAccessInfo *LAI = nullptr;
- DominatorTree *DT;
- ScalarEvolution *SE;
- OptimizationRemarkEmitter *ORE;
-
- /// Indicates whether distribution is forced to be enabled/disabled for
- /// the loop.
- ///
- /// If the optional has a value, it indicates whether distribution was forced
- /// to be enabled (true) or disabled (false). If the optional has no value
- /// distribution was not forced either way.
- Optional<bool> IsForced;
-};
-
-} // end anonymous namespace
-
-/// Shared implementation between new and old PMs.
-static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
- ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
- // Build up a worklist of inner loops to distribute. This is necessary as the
- // act of distributing a loop creates new loops and can invalidate iterators
- // across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *TopLevelLoop : *LI)
- for (Loop *L : depth_first(TopLevelLoop))
- // We only handle inner-most loops.
+ LVer.versionLoop(DefsUsedOutside);
+ LVer.annotateLoopWithNoAlias();
+
+ // The unversioned loop will not be changed, so we inherit all attributes
+ // from the original loop, but remove the loop distribution metadata to
+ // avoid distributing it again.
+ MDNode *UnversionedLoopID =
+ makeFollowupLoopID(OrigLoopID,
+ {LLVMLoopDistributeFollowupAll,
+ LLVMLoopDistributeFollowupFallback},
+ "llvm.loop.distribute.", true)
+ .getValue();
+ LVer.getNonVersionedLoop()->setLoopID(UnversionedLoopID);
+ }
+
+ // Create identical copies of the original loop for each partition and hook
+ // them up sequentially.
+ Partitions.cloneLoops();
+
+ // Now, we remove the instructions from each loop that don't belong to that
+ // partition.
+ Partitions.removeUnusedInsts();
+ LLVM_DEBUG(dbgs() << "\nAfter removing unused Instrs:\n");
+ LLVM_DEBUG(Partitions.printBlocks());
+
+ if (LDistVerify) {
+ LI->verify(*DT);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ }
+
+ ++NumLoopsDistributed;
+ // Report the success.
+ ORE->emit([&]() {
+ return OptimizationRemark(LDIST_NAME, "Distribute", L->getStartLoc(),
+ L->getHeader())
+ << "distributed loop";
+ });
+ return true;
+ }
+
+ /// Provide diagnostics, then \return false.
+ bool fail(StringRef RemarkName, StringRef Message) {
+ LLVMContext &Ctx = F->getContext();
+ bool Forced = isForced().getValueOr(false);
+
+ LLVM_DEBUG(dbgs() << "Skipping; " << Message << "\n");
+
+ // With -Rpass-missed, report that distribution failed.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LDIST_NAME, "NotDistributed",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: use -Rpass-analysis=loop-distribute for "
+ "more "
+ "info";
+ });
+
+ // With -Rpass-analysis, report why. This is on by default if distribution
+ // was requested explicitly.
+ ORE->emit(OptimizationRemarkAnalysis(
+ Forced ? OptimizationRemarkAnalysis::AlwaysPrint : LDIST_NAME,
+ RemarkName, L->getStartLoc(), L->getHeader())
+ << "loop not distributed: " << Message);
+
+ // Also issue a warning if distribution was requested explicitly but it
+ // failed.
+ if (Forced)
+ Ctx.diagnose(DiagnosticInfoOptimizationFailure(
+ *F, L->getStartLoc(), "loop not distributed: failed "
+ "explicitly specified loop distribution"));
+
+ return false;
+ }
+
+ /// Return whether distribution is forced to be enabled/disabled for the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ const Optional<bool> &isForced() const { return IsForced; }
+
+private:
+ /// Filter out checks between pointers from the same partition.
+ ///
+ /// \p PtrToPartition contains the partition number for pointers. Partition
+ /// number -1 means that the pointer is used in multiple partitions. In this
+ /// case we can't safely omit the check.
+ SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
+ const SmallVectorImpl<int> &PtrToPartition,
+ const RuntimePointerChecking *RtPtrChecking) {
+ SmallVector<RuntimePointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerCheck &Check) {
+ for (unsigned PtrIdx1 : Check.first->Members)
+ for (unsigned PtrIdx2 : Check.second->Members)
+ // Only include this check if there is a pair of pointers
+ // that require checking and the pointers fall into
+ // separate partitions.
+ //
+ // (Note that we already know at this point that the two
+ // pointer groups need checking but it doesn't follow
+ // that each pair of pointers within the two groups need
+ // checking as well.
+ //
+ // In other words we don't want to include a check just
+ // because there is a pair of pointers between the two
+ // pointer groups that require checks and a different
+ // pair whose pointers fall into different partitions.)
+ if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+ !RuntimePointerChecking::arePointersInSamePartition(
+ PtrToPartition, PtrIdx1, PtrIdx2))
+ return true;
+ return false;
+ });
+
+ return Checks;
+ }
+
+ /// Check whether the loop metadata is forcing distribution to be
+ /// enabled/disabled.
+ void setForced() {
+ Optional<const MDOperand *> Value =
+ findStringMetadataForLoop(L, "llvm.loop.distribute.enable");
+ if (!Value)
+ return;
+
+ const MDOperand *Op = *Value;
+ assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+ IsForced = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+ }
+
+ Loop *L;
+ Function *F;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo *LAI = nullptr;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ OptimizationRemarkEmitter *ORE;
+
+ /// Indicates whether distribution is forced to be enabled/disabled for
+ /// the loop.
+ ///
+ /// If the optional has a value, it indicates whether distribution was forced
+ /// to be enabled (true) or disabled (false). If the optional has no value
+ /// distribution was not forced either way.
+ Optional<bool> IsForced;
+};
+
+} // end anonymous namespace
+
+/// Shared implementation between new and old PMs.
+static bool runImpl(Function &F, LoopInfo *LI, DominatorTree *DT,
+ ScalarEvolution *SE, OptimizationRemarkEmitter *ORE,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA) {
+ // Build up a worklist of inner loops to distribute. This is necessary as the
+ // act of distributing a loop creates new loops and can invalidate iterators
+ // across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *TopLevelLoop : *LI)
+ for (Loop *L : depth_first(TopLevelLoop))
+ // We only handle inner-most loops.
if (L->isInnermost())
- Worklist.push_back(L);
-
- // Now walk the identified inner loops.
- bool Changed = false;
- for (Loop *L : Worklist) {
- LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
-
- // If distribution was forced for the specific loop to be
- // enabled/disabled, follow that. Otherwise use the global flag.
- if (LDL.isForced().getValueOr(EnableLoopDistribute))
- Changed |= LDL.processLoop(GetLAA);
- }
-
- // Process each loop nest in the function.
- return Changed;
-}
-
-namespace {
-
-/// The pass class.
-class LoopDistributeLegacy : public FunctionPass {
-public:
- static char ID;
-
- LoopDistributeLegacy() : FunctionPass(ID) {
- // The default is set by the caller.
- initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return runImpl(F, LI, DT, SE, ORE, GetLAA);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses LoopDistributePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- // We don't directly need these analyses but they're required for loop
- // analyses so provide them below.
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
+ Worklist.push_back(L);
+
+ // Now walk the identified inner loops.
+ bool Changed = false;
+ for (Loop *L : Worklist) {
+ LoopDistributeForLoop LDL(L, &F, LI, DT, SE, ORE);
+
+ // If distribution was forced for the specific loop to be
+ // enabled/disabled, follow that. Otherwise use the global flag.
+ if (LDL.isForced().getValueOr(EnableLoopDistribute))
+ Changed |= LDL.processLoop(GetLAA);
+ }
+
+ // Process each loop nest in the function.
+ return Changed;
+}
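
At the source level, the distribution that runImpl drives splits a loop so that a dependence blocking vectorization is isolated from independent work; a hand-written illustration (assuming the arrays do not alias, or that the runtime checks the pass can emit guard the aliasing case):

void before(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 1; i < n; ++i) {
    a[i] = a[i - 1] + b[i]; // loop-carried recurrence: blocks vectorization
    c[i] = d[i] * 2;        // independent of the statement above
  }
}

// After distribution the independent statement gets its own loop, which can be
// vectorized even though the recurrence loop cannot.
void after(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] + b[i];
  for (int i = 1; i < n; ++i)
    c[i] = d[i] * 2;
}
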
+
+namespace {
+
+/// The pass class.
+class LoopDistributeLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ LoopDistributeLegacy() : FunctionPass(ID) {
+ // The default is set by the caller.
+ initializeLoopDistributeLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return runImpl(F, LI, DT, SE, ORE, GetLAA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopDistributePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ // We don't directly need these analyses but they're required for loop
+ // analyses so provide them below.
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, nullptr};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
-
- bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-char LoopDistributeLegacy::ID;
-
-static const char ldist_name[] = "Loop Distribution";
-
-INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
- false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
-
-FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+
+ bool Changed = runImpl(F, &LI, &DT, &SE, &ORE, GetLAA);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+char LoopDistributeLegacy::ID;
+
+static const char ldist_name[] = "Loop Distribution";
+
+INITIALIZE_PASS_BEGIN(LoopDistributeLegacy, LDIST_NAME, ldist_name, false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopDistributeLegacy, LDIST_NAME, ldist_name, false, false)
+
+FunctionPass *llvm::createLoopDistributePass() { return new LoopDistributeLegacy(); }
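
A sketch of how a client might schedule the two entry points defined above. Header paths follow the upstream LLVM layout, and "loop-distribute" is the name LDIST_NAME expands to upstream, so from the command line the pass is typically reached with opt -passes=loop-distribute.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar.h"                // createLoopDistributePass
#include "llvm/Transforms/Scalar/LoopDistribute.h" // LoopDistributePass

void scheduleLoopDistribute(llvm::FunctionPassManager &FPM,
                            llvm::legacy::PassManager &LPM) {
  FPM.addPass(llvm::LoopDistributePass());   // new pass manager
  LPM.add(llvm::createLoopDistributePass()); // legacy pass manager
}
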
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
index aa754a7077..b5f8dfa9aa 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopFuse.cpp
@@ -1,264 +1,264 @@
-//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the loop fusion pass.
-/// The implementation is largely based on the following document:
-///
-/// Code Transformations to Augment the Scope of Loop Fusion in a
-/// Production Compiler
-/// Christopher Mark Barton
-/// MSc Thesis
-/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
-///
-/// The general approach taken is to collect sets of control flow equivalent
-/// loops and test whether they can be fused. The necessary conditions for
-/// fusion are:
-/// 1. The loops must be adjacent (there cannot be any statements between
-/// the two loops).
-/// 2. The loops must be conforming (they must execute the same number of
-/// iterations).
-/// 3. The loops must be control flow equivalent (if one loop executes, the
-/// other is guaranteed to execute).
-/// 4. There cannot be any negative distance dependencies between the loops.
-/// If all of these conditions are satisfied, it is safe to fuse the loops.
-///
-/// This implementation creates FusionCandidates that represent the loop and the
-/// necessary information needed by fusion. It then operates on the fusion
-/// candidates, first confirming that the candidate is eligible for fusion. The
-/// candidates are then collected into control flow equivalent sets, sorted in
-/// dominance order. Each set of control flow equivalent candidates is then
-/// traversed, attempting to fuse pairs of candidates in the set. If all
-/// requirements for fusion are met, the two candidates are fused, creating a
-/// new (fused) candidate which is then added back into the set to consider for
-/// additional fusion.
-///
-/// This implementation currently does not make any modifications to remove
-/// conditions for fusion. Code transformations to make loops conform to each of
-/// the conditions for fusion are discussed in more detail in the document
-/// above. These can be added to the current implementation in the future.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopFuse.h"
-#include "llvm/ADT/Statistic.h"
+//===- LoopFuse.cpp - Loop Fusion Pass ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the loop fusion pass.
+/// The implementation is largely based on the following document:
+///
+/// Code Transformations to Augment the Scope of Loop Fusion in a
+/// Production Compiler
+/// Christopher Mark Barton
+/// MSc Thesis
+/// https://webdocs.cs.ualberta.ca/~amaral/thesis/ChristopherBartonMSc.pdf
+///
+/// The general approach taken is to collect sets of control flow equivalent
+/// loops and test whether they can be fused. The necessary conditions for
+/// fusion are:
+/// 1. The loops must be adjacent (there cannot be any statements between
+/// the two loops).
+/// 2. The loops must be conforming (they must execute the same number of
+/// iterations).
+/// 3. The loops must be control flow equivalent (if one loop executes, the
+/// other is guaranteed to execute).
+/// 4. There cannot be any negative distance dependencies between the loops.
+/// If all of these conditions are satisfied, it is safe to fuse the loops.
+///
+/// This implementation creates FusionCandidates that represent the loop and the
+/// necessary information needed by fusion. It then operates on the fusion
+/// candidates, first confirming that the candidate is eligible for fusion. The
+/// candidates are then collected into control flow equivalent sets, sorted in
+/// dominance order. Each set of control flow equivalent candidates is then
+/// traversed, attempting to fuse pairs of candidates in the set. If all
+/// requirements for fusion are met, the two candidates are fused, creating a
+/// new (fused) candidate which is then added back into the set to consider for
+/// additional fusion.
+///
+/// This implementation currently does not make any modifications to remove
+/// conditions for fusion. Code transformations to make loops conform to each of
+/// the conditions for fusion are discussed in more detail in the document
+/// above. These can be added to the current implementation in the future.
+//===----------------------------------------------------------------------===//
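
A hand-written pair of loops meeting the four conditions above, together with the legal fused form (assuming the arrays are distinct, so the only cross-loop dependence is the forward one on a[i]):

void unfused(int n, int *a, const int *b, int *c) {
  for (int i = 0; i < n; ++i) // adjacent, conforming (same trip count), and
    a[i] = b[i] + 1;          // control flow equivalent with the loop below
  for (int i = 0; i < n; ++i)
    c[i] = a[i] * 2;          // forward dependence on a[i]; not negative
}

void fused(int n, int *a, const int *b, int *c) {
  for (int i = 0; i < n; ++i) {
    a[i] = b[i] + 1;
    c[i] = a[i] * 2;
  }
}
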
+
+#include "llvm/Transforms/Scalar/LoopFuse.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-fusion"
-
-STATISTIC(FuseCounter, "Loops fused");
-STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
-STATISTIC(InvalidPreheader, "Loop has invalid preheader");
-STATISTIC(InvalidHeader, "Loop has invalid header");
-STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
-STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
-STATISTIC(InvalidLatch, "Loop has invalid latch");
-STATISTIC(InvalidLoop, "Loop is invalid");
-STATISTIC(AddressTakenBB, "Basic block has address taken");
-STATISTIC(MayThrowException, "Loop may throw an exception");
-STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
-STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
-STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
-STATISTIC(UnknownTripCount, "Loop has unknown trip count");
-STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
-STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
-STATISTIC(NonAdjacent, "Loops are not adjacent");
-STATISTIC(
- NonEmptyPreheader,
- "Loop has a non-empty preheader with instructions that cannot be moved");
-STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
-STATISTIC(NonIdenticalGuards, "Candidates have different guards");
-STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
- "instructions that cannot be moved");
-STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
- "instructions that cannot be moved");
-STATISTIC(NotRotated, "Candidate is not rotated");
-
-enum FusionDependenceAnalysisChoice {
- FUSION_DEPENDENCE_ANALYSIS_SCEV,
- FUSION_DEPENDENCE_ANALYSIS_DA,
- FUSION_DEPENDENCE_ANALYSIS_ALL,
-};
-
-static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
- "loop-fusion-dependence-analysis",
- cl::desc("Which dependence analysis should loop fusion use?"),
- cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
- "Use the scalar evolution interface"),
- clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
- "Use the dependence analysis interface"),
- clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
- "Use all available analyses")),
- cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
-
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-fusion"
+
+STATISTIC(FuseCounter, "Loops fused");
+STATISTIC(NumFusionCandidates, "Number of candidates for loop fusion");
+STATISTIC(InvalidPreheader, "Loop has invalid preheader");
+STATISTIC(InvalidHeader, "Loop has invalid header");
+STATISTIC(InvalidExitingBlock, "Loop has invalid exiting blocks");
+STATISTIC(InvalidExitBlock, "Loop has invalid exit block");
+STATISTIC(InvalidLatch, "Loop has invalid latch");
+STATISTIC(InvalidLoop, "Loop is invalid");
+STATISTIC(AddressTakenBB, "Basic block has address taken");
+STATISTIC(MayThrowException, "Loop may throw an exception");
+STATISTIC(ContainsVolatileAccess, "Loop contains a volatile access");
+STATISTIC(NotSimplifiedForm, "Loop is not in simplified form");
+STATISTIC(InvalidDependencies, "Dependencies prevent fusion");
+STATISTIC(UnknownTripCount, "Loop has unknown trip count");
+STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
+STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
+STATISTIC(NonAdjacent, "Loops are not adjacent");
+STATISTIC(
+ NonEmptyPreheader,
+ "Loop has a non-empty preheader with instructions that cannot be moved");
+STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
+STATISTIC(NonIdenticalGuards, "Candidates have different guards");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
+ "instructions that cannot be moved");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
+ "instructions that cannot be moved");
+STATISTIC(NotRotated, "Candidate is not rotated");
+
+enum FusionDependenceAnalysisChoice {
+ FUSION_DEPENDENCE_ANALYSIS_SCEV,
+ FUSION_DEPENDENCE_ANALYSIS_DA,
+ FUSION_DEPENDENCE_ANALYSIS_ALL,
+};
+
+static cl::opt<FusionDependenceAnalysisChoice> FusionDependenceAnalysis(
+ "loop-fusion-dependence-analysis",
+ cl::desc("Which dependence analysis should loop fusion use?"),
+ cl::values(clEnumValN(FUSION_DEPENDENCE_ANALYSIS_SCEV, "scev",
+ "Use the scalar evolution interface"),
+ clEnumValN(FUSION_DEPENDENCE_ANALYSIS_DA, "da",
+ "Use the dependence analysis interface"),
+ clEnumValN(FUSION_DEPENDENCE_ANALYSIS_ALL, "all",
+ "Use all available analyses")),
+ cl::Hidden, cl::init(FUSION_DEPENDENCE_ANALYSIS_ALL), cl::ZeroOrMore);
+
static cl::opt<unsigned> FusionPeelMaxCount(
"loop-fusion-peel-max-count", cl::init(0), cl::Hidden,
cl::desc("Max number of iterations to be peeled from a loop, such that "
"fusion can take place"));
-#ifndef NDEBUG
-static cl::opt<bool>
- VerboseFusionDebugging("loop-fusion-verbose-debug",
- cl::desc("Enable verbose debugging for Loop Fusion"),
- cl::Hidden, cl::init(false), cl::ZeroOrMore);
-#endif
-
-namespace {
-/// This class is used to represent a candidate for loop fusion. When it is
-/// constructed, it checks the conditions for loop fusion to ensure that it
-/// represents a valid candidate. It caches several parts of a loop that are
-/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
-/// of continually querying the underlying Loop to retrieve these values. It is
-/// assumed these will not change throughout loop fusion.
-///
-/// The invalidate method should be used to indicate that the FusionCandidate is
-/// no longer a valid candidate for fusion. Similarly, the isValid() method can
-/// be used to ensure that the FusionCandidate is still valid for fusion.
-struct FusionCandidate {
- /// Cache of parts of the loop used throughout loop fusion. These should not
- /// need to change throughout the analysis and transformation.
- /// These parts are cached to avoid repeatedly looking up in the Loop class.
-
- /// Preheader of the loop this candidate represents
- BasicBlock *Preheader;
- /// Header of the loop this candidate represents
- BasicBlock *Header;
- /// Blocks in the loop that exit the loop
- BasicBlock *ExitingBlock;
- /// The successor block of this loop (where the exiting blocks go to)
- BasicBlock *ExitBlock;
- /// Latch of the loop
- BasicBlock *Latch;
- /// The loop that this fusion candidate represents
- Loop *L;
- /// Vector of instructions in this loop that read from memory
- SmallVector<Instruction *, 16> MemReads;
- /// Vector of instructions in this loop that write to memory
- SmallVector<Instruction *, 16> MemWrites;
- /// Are all of the members of this fusion candidate still valid
- bool Valid;
- /// Guard branch of the loop, if it exists
- BranchInst *GuardBranch;
+#ifndef NDEBUG
+static cl::opt<bool>
+ VerboseFusionDebugging("loop-fusion-verbose-debug",
+ cl::desc("Enable verbose debugging for Loop Fusion"),
+ cl::Hidden, cl::init(false), cl::ZeroOrMore);
+#endif
+
+namespace {
+/// This class is used to represent a candidate for loop fusion. When it is
+/// constructed, it checks the conditions for loop fusion to ensure that it
+/// represents a valid candidate. It caches several parts of a loop that are
+/// used throughout loop fusion (e.g., loop preheader, loop header, etc) instead
+/// of continually querying the underlying Loop to retrieve these values. It is
+/// assumed these will not change throughout loop fusion.
+///
+/// The invalidate method should be used to indicate that the FusionCandidate is
+/// no longer a valid candidate for fusion. Similarly, the isValid() method can
+/// be used to ensure that the FusionCandidate is still valid for fusion.
+struct FusionCandidate {
+ /// Cache of parts of the loop used throughout loop fusion. These should not
+ /// need to change throughout the analysis and transformation.
+ /// These parts are cached to avoid repeatedly looking up in the Loop class.
+
+ /// Preheader of the loop this candidate represents
+ BasicBlock *Preheader;
+ /// Header of the loop this candidate represents
+ BasicBlock *Header;
+ /// Blocks in the loop that exit the loop
+ BasicBlock *ExitingBlock;
+ /// The successor block of this loop (where the exiting blocks go to)
+ BasicBlock *ExitBlock;
+ /// Latch of the loop
+ BasicBlock *Latch;
+ /// The loop that this fusion candidate represents
+ Loop *L;
+ /// Vector of instructions in this loop that read from memory
+ SmallVector<Instruction *, 16> MemReads;
+ /// Vector of instructions in this loop that write to memory
+ SmallVector<Instruction *, 16> MemWrites;
+ /// Are all of the members of this fusion candidate still valid
+ bool Valid;
+ /// Guard branch of the loop, if it exists
+ BranchInst *GuardBranch;
  /// Peeling Parameters of the Loop.
TTI::PeelingPreferences PP;
/// Can you Peel this Loop?
bool AbleToPeel;
  /// Has this loop been peeled?
bool Peeled;
-
- /// Dominator and PostDominator trees are needed for the
- /// FusionCandidateCompare function, required by FusionCandidateSet to
- /// determine where the FusionCandidate should be inserted into the set. These
- /// are used to establish ordering of the FusionCandidates based on dominance.
- const DominatorTree *DT;
- const PostDominatorTree *PDT;
-
- OptimizationRemarkEmitter &ORE;
-
- FusionCandidate(Loop *L, const DominatorTree *DT,
+
+ /// Dominator and PostDominator trees are needed for the
+ /// FusionCandidateCompare function, required by FusionCandidateSet to
+ /// determine where the FusionCandidate should be inserted into the set. These
+ /// are used to establish ordering of the FusionCandidates based on dominance.
+ const DominatorTree *DT;
+ const PostDominatorTree *PDT;
+
+ OptimizationRemarkEmitter &ORE;
+
+ FusionCandidate(Loop *L, const DominatorTree *DT,
const PostDominatorTree *PDT, OptimizationRemarkEmitter &ORE,
TTI::PeelingPreferences PP)
- : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
- ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
- Latch(L->getLoopLatch()), L(L), Valid(true),
+ : Preheader(L->getLoopPreheader()), Header(L->getHeader()),
+ ExitingBlock(L->getExitingBlock()), ExitBlock(L->getExitBlock()),
+ Latch(L->getLoopLatch()), L(L), Valid(true),
GuardBranch(L->getLoopGuardBranch()), PP(PP), AbleToPeel(canPeel(L)),
Peeled(false), DT(DT), PDT(PDT), ORE(ORE) {
-
- // Walk over all blocks in the loop and check for conditions that may
- // prevent fusion. For each block, walk over all instructions and collect
-  // the memory reads and writes. If any instructions that prevent fusion are
- // found, invalidate this object and return.
- for (BasicBlock *BB : L->blocks()) {
- if (BB->hasAddressTaken()) {
- invalidate();
- reportInvalidCandidate(AddressTakenBB);
- return;
- }
-
- for (Instruction &I : *BB) {
- if (I.mayThrow()) {
- invalidate();
- reportInvalidCandidate(MayThrowException);
- return;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (SI->isVolatile()) {
- invalidate();
- reportInvalidCandidate(ContainsVolatileAccess);
- return;
- }
- }
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (LI->isVolatile()) {
- invalidate();
- reportInvalidCandidate(ContainsVolatileAccess);
- return;
- }
- }
- if (I.mayWriteToMemory())
- MemWrites.push_back(&I);
- if (I.mayReadFromMemory())
- MemReads.push_back(&I);
- }
- }
- }
-
- /// Check if all members of the class are valid.
- bool isValid() const {
- return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
- !L->isInvalid() && Valid;
- }
-
- /// Verify that all members are in sync with the Loop object.
- void verify() const {
- assert(isValid() && "Candidate is not valid!!");
- assert(!L->isInvalid() && "Loop is invalid!");
- assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
- assert(Header == L->getHeader() && "Header is out of sync");
- assert(ExitingBlock == L->getExitingBlock() &&
- "Exiting Blocks is out of sync");
- assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
- assert(Latch == L->getLoopLatch() && "Latch is out of sync");
- }
-
- /// Get the entry block for this fusion candidate.
- ///
- /// If this fusion candidate represents a guarded loop, the entry block is the
- /// loop guard block. If it represents an unguarded loop, the entry block is
- /// the preheader of the loop.
- BasicBlock *getEntryBlock() const {
- if (GuardBranch)
- return GuardBranch->getParent();
- else
- return Preheader;
- }
-
+
+ // Walk over all blocks in the loop and check for conditions that may
+ // prevent fusion. For each block, walk over all instructions and collect
+  // the memory reads and writes. If any instructions that prevent fusion are
+ // found, invalidate this object and return.
+ for (BasicBlock *BB : L->blocks()) {
+ if (BB->hasAddressTaken()) {
+ invalidate();
+ reportInvalidCandidate(AddressTakenBB);
+ return;
+ }
+
+ for (Instruction &I : *BB) {
+ if (I.mayThrow()) {
+ invalidate();
+ reportInvalidCandidate(MayThrowException);
+ return;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (SI->isVolatile()) {
+ invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
+ return;
+ }
+ }
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (LI->isVolatile()) {
+ invalidate();
+ reportInvalidCandidate(ContainsVolatileAccess);
+ return;
+ }
+ }
+ if (I.mayWriteToMemory())
+ MemWrites.push_back(&I);
+ if (I.mayReadFromMemory())
+ MemReads.push_back(&I);
+ }
+ }
+ }
+
+ /// Check if all members of the class are valid.
+ bool isValid() const {
+ return Preheader && Header && ExitingBlock && ExitBlock && Latch && L &&
+ !L->isInvalid() && Valid;
+ }
+
+ /// Verify that all members are in sync with the Loop object.
+ void verify() const {
+ assert(isValid() && "Candidate is not valid!!");
+ assert(!L->isInvalid() && "Loop is invalid!");
+ assert(Preheader == L->getLoopPreheader() && "Preheader is out of sync");
+ assert(Header == L->getHeader() && "Header is out of sync");
+ assert(ExitingBlock == L->getExitingBlock() &&
+ "Exiting Blocks is out of sync");
+ assert(ExitBlock == L->getExitBlock() && "Exit block is out of sync");
+ assert(Latch == L->getLoopLatch() && "Latch is out of sync");
+ }
+
+ /// Get the entry block for this fusion candidate.
+ ///
+ /// If this fusion candidate represents a guarded loop, the entry block is the
+ /// loop guard block. If it represents an unguarded loop, the entry block is
+ /// the preheader of the loop.
+ BasicBlock *getEntryBlock() const {
+ if (GuardBranch)
+ return GuardBranch->getParent();
+ else
+ return Preheader;
+ }
+
/// After Peeling the loop is modified quite a bit, hence all of the Blocks
/// need to be updated accordingly.
void updateAfterPeeling() {
@@ -270,427 +270,427 @@ struct FusionCandidate {
verify();
}
- /// Given a guarded loop, get the successor of the guard that is not in the
- /// loop.
- ///
- /// This method returns the successor of the loop guard that is not located
- /// within the loop (i.e., the successor of the guard that is not the
- /// preheader).
- /// This method is only valid for guarded loops.
- BasicBlock *getNonLoopBlock() const {
- assert(GuardBranch && "Only valid on guarded loops.");
- assert(GuardBranch->isConditional() &&
- "Expecting guard to be a conditional branch.");
+ /// Given a guarded loop, get the successor of the guard that is not in the
+ /// loop.
+ ///
+ /// This method returns the successor of the loop guard that is not located
+ /// within the loop (i.e., the successor of the guard that is not the
+ /// preheader).
+ /// This method is only valid for guarded loops.
+ BasicBlock *getNonLoopBlock() const {
+ assert(GuardBranch && "Only valid on guarded loops.");
+ assert(GuardBranch->isConditional() &&
+ "Expecting guard to be a conditional branch.");
if (Peeled)
return GuardBranch->getSuccessor(1);
- return (GuardBranch->getSuccessor(0) == Preheader)
- ? GuardBranch->getSuccessor(1)
- : GuardBranch->getSuccessor(0);
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD void dump() const {
- dbgs() << "\tGuardBranch: ";
- if (GuardBranch)
- dbgs() << *GuardBranch;
- else
- dbgs() << "nullptr";
- dbgs() << "\n"
- << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
- << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
- << "\n"
- << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
- << "\tExitingBB: "
- << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
- << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
- << "\n"
- << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
- << "\tEntryBlock: "
- << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
- << "\n";
- }
-#endif
-
- /// Determine if a fusion candidate (representing a loop) is eligible for
- /// fusion. Note that this only checks whether a single loop can be fused - it
- /// does not check whether it is *legal* to fuse two loops together.
- bool isEligibleForFusion(ScalarEvolution &SE) const {
- if (!isValid()) {
- LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
- if (!Preheader)
- ++InvalidPreheader;
- if (!Header)
- ++InvalidHeader;
- if (!ExitingBlock)
- ++InvalidExitingBlock;
- if (!ExitBlock)
- ++InvalidExitBlock;
- if (!Latch)
- ++InvalidLatch;
- if (L->isInvalid())
- ++InvalidLoop;
-
- return false;
- }
-
- // Require ScalarEvolution to be able to determine a trip count.
- if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName()
- << " trip count not computable!\n");
- return reportInvalidCandidate(UnknownTripCount);
- }
-
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName()
- << " is not in simplified form!\n");
- return reportInvalidCandidate(NotSimplifiedForm);
- }
-
- if (!L->isRotatedForm()) {
- LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
- return reportInvalidCandidate(NotRotated);
- }
-
- return true;
- }
-
-private:
-  // This is only used internally for now, to clear the MemWrites and MemReads
-  // lists and set Valid to false. I can't envision other uses of this right
- // now, since once FusionCandidates are put into the FusionCandidateSet they
- // are immutable. Thus, any time we need to change/update a FusionCandidate,
- // we must create a new one and insert it into the FusionCandidateSet to
- // ensure the FusionCandidateSet remains ordered correctly.
- void invalidate() {
- MemWrites.clear();
- MemReads.clear();
- Valid = false;
- }
-
- bool reportInvalidCandidate(llvm::Statistic &Stat) const {
- using namespace ore;
- assert(L && Preheader && "Fusion candidate not initialized properly!");
- ++Stat;
- ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
- L->getStartLoc(), Preheader)
- << "[" << Preheader->getParent()->getName() << "]: "
- << "Loop is not a candidate for fusion: " << Stat.getDesc());
- return false;
- }
-};
-
-struct FusionCandidateCompare {
- /// Comparison functor to sort two Control Flow Equivalent fusion candidates
- /// into dominance order.
- /// If LHS dominates RHS and RHS post-dominates LHS, return true;
-  /// If RHS dominates LHS and LHS post-dominates RHS, return false;
- bool operator()(const FusionCandidate &LHS,
- const FusionCandidate &RHS) const {
- const DominatorTree *DT = LHS.DT;
-
- BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
- BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
-
- // Do not save PDT to local variable as it is only used in asserts and thus
- // will trigger an unused variable warning if building without asserts.
- assert(DT && LHS.PDT && "Expecting valid dominator tree");
-
- // Do this compare first so if LHS == RHS, function returns false.
- if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
- // RHS dominates LHS
- // Verify LHS post-dominates RHS
- assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
- return false;
- }
-
- if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
- // Verify RHS Postdominates LHS
- assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
- return true;
- }
-
- // If LHS does not dominate RHS and RHS does not dominate LHS then there is
- // no dominance relationship between the two FusionCandidates. Thus, they
- // should not be in the same set together.
- llvm_unreachable(
- "No dominance relationship between these fusion candidates!");
- }
-};
-
-using LoopVector = SmallVector<Loop *, 4>;
-
-// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
-// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
-// dominates FC1 and FC1 post-dominates FC0.
-// std::set was chosen because we want a sorted data structure with stable
-// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
-// loops by moving intervening code around. When this intervening code contains
-// loops, those loops will be moved also. The corresponding FusionCandidates
-// will also need to be moved accordingly. As this is done, having stable
-// iterators will simplify the logic. Similarly, having an efficient insert that
-// keeps the FusionCandidateSet sorted will also simplify the implementation.
-using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
-using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
-
-#if !defined(NDEBUG)
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidate &FC) {
- if (FC.isValid())
- OS << FC.Preheader->getName();
- else
- OS << "<Invalid>";
-
- return OS;
-}
-
-static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const FusionCandidateSet &CandSet) {
- for (const FusionCandidate &FC : CandSet)
- OS << FC << '\n';
-
- return OS;
-}
-
-static void
-printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
- dbgs() << "Fusion Candidates: \n";
- for (const auto &CandidateSet : FusionCandidates) {
- dbgs() << "*** Fusion Candidate Set ***\n";
- dbgs() << CandidateSet;
- dbgs() << "****************************\n";
- }
-}
-#endif
-
-/// Collect all loops in function at the same nest level, starting at the
-/// outermost level.
-///
-/// This data structure collects all loops at the same nest level for a
-/// given function (specified by the LoopInfo object). It starts at the
-/// outermost level.
-struct LoopDepthTree {
- using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
- using iterator = LoopsOnLevelTy::iterator;
- using const_iterator = LoopsOnLevelTy::const_iterator;
-
- LoopDepthTree(LoopInfo &LI) : Depth(1) {
- if (!LI.empty())
- LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
- }
-
- /// Test whether a given loop has been removed from the function, and thus is
- /// no longer valid.
- bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
-
- /// Record that a given loop has been removed from the function and is no
- /// longer valid.
- void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
-
- /// Descend the tree to the next (inner) nesting level
- void descend() {
- LoopsOnLevelTy LoopsOnNextLevel;
-
- for (const LoopVector &LV : *this)
- for (Loop *L : LV)
- if (!isRemovedLoop(L) && L->begin() != L->end())
- LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
-
- LoopsOnLevel = LoopsOnNextLevel;
- RemovedLoops.clear();
- Depth++;
- }
-
- bool empty() const { return size() == 0; }
- size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
- unsigned getDepth() const { return Depth; }
-
- iterator begin() { return LoopsOnLevel.begin(); }
- iterator end() { return LoopsOnLevel.end(); }
- const_iterator begin() const { return LoopsOnLevel.begin(); }
- const_iterator end() const { return LoopsOnLevel.end(); }
-
-private:
- /// Set of loops that have been removed from the function and are no longer
- /// valid.
- SmallPtrSet<const Loop *, 8> RemovedLoops;
-
- /// Depth of the current level, starting at 1 (outermost loops).
- unsigned Depth;
-
- /// Vector of loops at the current depth level that have the same parent loop
- LoopsOnLevelTy LoopsOnLevel;
-};
-
-#ifndef NDEBUG
-static void printLoopVector(const LoopVector &LV) {
- dbgs() << "****************************\n";
- for (auto L : LV)
- printLoop(*L, dbgs());
- dbgs() << "****************************\n";
-}
-#endif
-
-struct LoopFuser {
-private:
- // Sets of control flow equivalent fusion candidates for a given nest level.
- FusionCandidateCollection FusionCandidates;
-
- LoopDepthTree LDT;
- DomTreeUpdater DTU;
-
- LoopInfo &LI;
- DominatorTree &DT;
- DependenceInfo &DI;
- ScalarEvolution &SE;
- PostDominatorTree &PDT;
- OptimizationRemarkEmitter &ORE;
+ return (GuardBranch->getSuccessor(0) == Preheader)
+ ? GuardBranch->getSuccessor(1)
+ : GuardBranch->getSuccessor(0);
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << "\tGuardBranch: ";
+ if (GuardBranch)
+ dbgs() << *GuardBranch;
+ else
+ dbgs() << "nullptr";
+ dbgs() << "\n"
+ << (GuardBranch ? GuardBranch->getName() : "nullptr") << "\n"
+ << "\tPreheader: " << (Preheader ? Preheader->getName() : "nullptr")
+ << "\n"
+ << "\tHeader: " << (Header ? Header->getName() : "nullptr") << "\n"
+ << "\tExitingBB: "
+ << (ExitingBlock ? ExitingBlock->getName() : "nullptr") << "\n"
+ << "\tExitBB: " << (ExitBlock ? ExitBlock->getName() : "nullptr")
+ << "\n"
+ << "\tLatch: " << (Latch ? Latch->getName() : "nullptr") << "\n"
+ << "\tEntryBlock: "
+ << (getEntryBlock() ? getEntryBlock()->getName() : "nullptr")
+ << "\n";
+ }
+#endif
+
+ /// Determine if a fusion candidate (representing a loop) is eligible for
+ /// fusion. Note that this only checks whether a single loop can be fused - it
+ /// does not check whether it is *legal* to fuse two loops together.
+ bool isEligibleForFusion(ScalarEvolution &SE) const {
+ if (!isValid()) {
+ LLVM_DEBUG(dbgs() << "FC has invalid CFG requirements!\n");
+ if (!Preheader)
+ ++InvalidPreheader;
+ if (!Header)
+ ++InvalidHeader;
+ if (!ExitingBlock)
+ ++InvalidExitingBlock;
+ if (!ExitBlock)
+ ++InvalidExitBlock;
+ if (!Latch)
+ ++InvalidLatch;
+ if (L->isInvalid())
+ ++InvalidLoop;
+
+ return false;
+ }
+
+ // Require ScalarEvolution to be able to determine a trip count.
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " trip count not computable!\n");
+ return reportInvalidCandidate(UnknownTripCount);
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName()
+ << " is not in simplified form!\n");
+ return reportInvalidCandidate(NotSimplifiedForm);
+ }
+
+ if (!L->isRotatedForm()) {
+ LLVM_DEBUG(dbgs() << "Loop " << L->getName() << " is not rotated!\n");
+ return reportInvalidCandidate(NotRotated);
+ }
+
+ return true;
+ }
+
+private:
+  // This is only used internally for now, to clear the MemWrites and MemReads
+  // lists and set Valid to false. I can't envision other uses of this right
+ // now, since once FusionCandidates are put into the FusionCandidateSet they
+ // are immutable. Thus, any time we need to change/update a FusionCandidate,
+ // we must create a new one and insert it into the FusionCandidateSet to
+ // ensure the FusionCandidateSet remains ordered correctly.
+ void invalidate() {
+ MemWrites.clear();
+ MemReads.clear();
+ Valid = false;
+ }
+
+ bool reportInvalidCandidate(llvm::Statistic &Stat) const {
+ using namespace ore;
+ assert(L && Preheader && "Fusion candidate not initialized properly!");
+ ++Stat;
+ ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, Stat.getName(),
+ L->getStartLoc(), Preheader)
+ << "[" << Preheader->getParent()->getName() << "]: "
+ << "Loop is not a candidate for fusion: " << Stat.getDesc());
+ return false;
+ }
+};
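
A hand-written contrast of a loop these eligibility checks would reject against one they would accept (assuming the usual loop-simplify/loop-rotate canonicalization has already run on the IR):

const char *rejected(const char *p) {
  while (*p)   // exit depends on memory contents: no loop-invariant
    ++p;       // backedge-taken count -> rejected as UnknownTripCount
  return p;
}

void accepted(int n, int *a, const int *b) {
  for (int i = 0; i < n; ++i) // simplified, rotated form with a SCEV-computable
    a[i] = b[i];              // trip count -> passes the checks above
}
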
+
+struct FusionCandidateCompare {
+ /// Comparison functor to sort two Control Flow Equivalent fusion candidates
+ /// into dominance order.
+ /// If LHS dominates RHS and RHS post-dominates LHS, return true;
+  /// If RHS dominates LHS and LHS post-dominates RHS, return false;
+ bool operator()(const FusionCandidate &LHS,
+ const FusionCandidate &RHS) const {
+ const DominatorTree *DT = LHS.DT;
+
+ BasicBlock *LHSEntryBlock = LHS.getEntryBlock();
+ BasicBlock *RHSEntryBlock = RHS.getEntryBlock();
+
+ // Do not save PDT to local variable as it is only used in asserts and thus
+ // will trigger an unused variable warning if building without asserts.
+ assert(DT && LHS.PDT && "Expecting valid dominator tree");
+
+ // Do this compare first so if LHS == RHS, function returns false.
+ if (DT->dominates(RHSEntryBlock, LHSEntryBlock)) {
+ // RHS dominates LHS
+ // Verify LHS post-dominates RHS
+ assert(LHS.PDT->dominates(LHSEntryBlock, RHSEntryBlock));
+ return false;
+ }
+
+ if (DT->dominates(LHSEntryBlock, RHSEntryBlock)) {
+ // Verify RHS Postdominates LHS
+ assert(LHS.PDT->dominates(RHSEntryBlock, LHSEntryBlock));
+ return true;
+ }
+
+ // If LHS does not dominate RHS and RHS does not dominate LHS then there is
+ // no dominance relationship between the two FusionCandidates. Thus, they
+ // should not be in the same set together.
+ llvm_unreachable(
+ "No dominance relationship between these fusion candidates!");
+ }
+};
+
+using LoopVector = SmallVector<Loop *, 4>;
+
+// Set of Control Flow Equivalent (CFE) Fusion Candidates, sorted in dominance
+// order. Thus, if FC0 comes *before* FC1 in a FusionCandidateSet, then FC0
+// dominates FC1 and FC1 post-dominates FC0.
+// std::set was chosen because we want a sorted data structure with stable
+// iterators. A subsequent patch to loop fusion will enable fusing non-adjacent
+// loops by moving intervening code around. When this intervening code contains
+// loops, those loops will be moved also. The corresponding FusionCandidates
+// will also need to be moved accordingly. As this is done, having stable
+// iterators will simplify the logic. Similarly, having an efficient insert that
+// keeps the FusionCandidateSet sorted will also simplify the implementation.
+using FusionCandidateSet = std::set<FusionCandidate, FusionCandidateCompare>;
+using FusionCandidateCollection = SmallVector<FusionCandidateSet, 4>;
+
+#if !defined(NDEBUG)
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const FusionCandidate &FC) {
+ if (FC.isValid())
+ OS << FC.Preheader->getName();
+ else
+ OS << "<Invalid>";
+
+ return OS;
+}
+
+static llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const FusionCandidateSet &CandSet) {
+ for (const FusionCandidate &FC : CandSet)
+ OS << FC << '\n';
+
+ return OS;
+}
+
+static void
+printFusionCandidates(const FusionCandidateCollection &FusionCandidates) {
+ dbgs() << "Fusion Candidates: \n";
+ for (const auto &CandidateSet : FusionCandidates) {
+ dbgs() << "*** Fusion Candidate Set ***\n";
+ dbgs() << CandidateSet;
+ dbgs() << "****************************\n";
+ }
+}
+#endif
+
+/// Collect all loops in function at the same nest level, starting at the
+/// outermost level.
+///
+/// This data structure collects all loops at the same nest level for a
+/// given function (specified by the LoopInfo object). It starts at the
+/// outermost level.
+struct LoopDepthTree {
+ using LoopsOnLevelTy = SmallVector<LoopVector, 4>;
+ using iterator = LoopsOnLevelTy::iterator;
+ using const_iterator = LoopsOnLevelTy::const_iterator;
+
+ LoopDepthTree(LoopInfo &LI) : Depth(1) {
+ if (!LI.empty())
+ LoopsOnLevel.emplace_back(LoopVector(LI.rbegin(), LI.rend()));
+ }
+
+ /// Test whether a given loop has been removed from the function, and thus is
+ /// no longer valid.
+ bool isRemovedLoop(const Loop *L) const { return RemovedLoops.count(L); }
+
+ /// Record that a given loop has been removed from the function and is no
+ /// longer valid.
+ void removeLoop(const Loop *L) { RemovedLoops.insert(L); }
+
+ /// Descend the tree to the next (inner) nesting level
+ void descend() {
+ LoopsOnLevelTy LoopsOnNextLevel;
+
+ for (const LoopVector &LV : *this)
+ for (Loop *L : LV)
+ if (!isRemovedLoop(L) && L->begin() != L->end())
+ LoopsOnNextLevel.emplace_back(LoopVector(L->begin(), L->end()));
+
+ LoopsOnLevel = LoopsOnNextLevel;
+ RemovedLoops.clear();
+ Depth++;
+ }
+
+ bool empty() const { return size() == 0; }
+ size_t size() const { return LoopsOnLevel.size() - RemovedLoops.size(); }
+ unsigned getDepth() const { return Depth; }
+
+ iterator begin() { return LoopsOnLevel.begin(); }
+ iterator end() { return LoopsOnLevel.end(); }
+ const_iterator begin() const { return LoopsOnLevel.begin(); }
+ const_iterator end() const { return LoopsOnLevel.end(); }
+
+private:
+ /// Set of loops that have been removed from the function and are no longer
+ /// valid.
+ SmallPtrSet<const Loop *, 8> RemovedLoops;
+
+ /// Depth of the current level, starting at 1 (outermost loops).
+ unsigned Depth;
+
+ /// Vector of loops at the current depth level that have the same parent loop
+ LoopsOnLevelTy LoopsOnLevel;
+};
+
+#ifndef NDEBUG
+static void printLoopVector(const LoopVector &LV) {
+ dbgs() << "****************************\n";
+ for (auto L : LV)
+ printLoop(*L, dbgs());
+ dbgs() << "****************************\n";
+}
+#endif
+
+struct LoopFuser {
+private:
+ // Sets of control flow equivalent fusion candidates for a given nest level.
+ FusionCandidateCollection FusionCandidates;
+
+ LoopDepthTree LDT;
+ DomTreeUpdater DTU;
+
+ LoopInfo &LI;
+ DominatorTree &DT;
+ DependenceInfo &DI;
+ ScalarEvolution &SE;
+ PostDominatorTree &PDT;
+ OptimizationRemarkEmitter &ORE;
AssumptionCache &AC;
-
+
const TargetTransformInfo &TTI;
-public:
- LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
- ScalarEvolution &SE, PostDominatorTree &PDT,
+public:
+ LoopFuser(LoopInfo &LI, DominatorTree &DT, DependenceInfo &DI,
+ ScalarEvolution &SE, PostDominatorTree &PDT,
OptimizationRemarkEmitter &ORE, const DataLayout &DL,
AssumptionCache &AC, const TargetTransformInfo &TTI)
- : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
+ : LDT(LI), DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Lazy), LI(LI),
DT(DT), DI(DI), SE(SE), PDT(PDT), ORE(ORE), AC(AC), TTI(TTI) {}
-
- /// This is the main entry point for loop fusion. It will traverse the
- /// specified function and collect candidate loops to fuse, starting at the
- /// outermost nesting level and working inwards.
- bool fuseLoops(Function &F) {
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LI.print(dbgs());
- }
-#endif
-
- LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
- << "\n");
- bool Changed = false;
-
- while (!LDT.empty()) {
- LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
- << LDT.getDepth() << "\n";);
-
- for (const LoopVector &LV : LDT) {
-        assert(LV.size() > 0 && "Empty loop set was built!");
-
- // Skip singleton loop sets as they do not offer fusion opportunities on
- // this level.
- if (LV.size() == 1)
- continue;
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG({
- dbgs() << " Visit loop set (#" << LV.size() << "):\n";
- printLoopVector(LV);
- });
- }
-#endif
-
- collectFusionCandidates(LV);
- Changed |= fuseCandidates();
- }
-
- // Finished analyzing candidates at this level.
- // Descend to the next level and clear all of the candidates currently
- // collected. Note that it will not be possible to fuse any of the
- // existing candidates with new candidates because the new candidates will
- // be at a different nest level and thus not be control flow equivalent
- // with all of the candidates collected so far.
- LLVM_DEBUG(dbgs() << "Descend one level!\n");
- LDT.descend();
- FusionCandidates.clear();
- }
-
- if (Changed)
- LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
-
-#ifndef NDEBUG
- assert(DT.verify());
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
- return Changed;
- }
-
-private:
- /// Determine if two fusion candidates are control flow equivalent.
- ///
- /// Two fusion candidates are control flow equivalent if when one executes,
- /// the other is guaranteed to execute. This is determined using dominators
- /// and post-dominators: if A dominates B and B post-dominates A then A and B
- /// are control-flow equivalent.
- bool isControlFlowEquivalent(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
-
- return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
- DT, PDT);
- }
-
- /// Iterate over all loops in the given loop set and identify the loops that
- /// are eligible for fusion. Place all eligible fusion candidates into Control
- /// Flow Equivalent sets, sorted by dominance.
- void collectFusionCandidates(const LoopVector &LV) {
- for (Loop *L : LV) {
+
+ /// This is the main entry point for loop fusion. It will traverse the
+ /// specified function and collect candidate loops to fuse, starting at the
+ /// outermost nesting level and working inwards.
+ bool fuseLoops(Function &F) {
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LI.print(dbgs());
+ }
+#endif
+
+ LLVM_DEBUG(dbgs() << "Performing Loop Fusion on function " << F.getName()
+ << "\n");
+ bool Changed = false;
+
+ while (!LDT.empty()) {
+ LLVM_DEBUG(dbgs() << "Got " << LDT.size() << " loop sets for depth "
+ << LDT.getDepth() << "\n";);
+
+ for (const LoopVector &LV : LDT) {
+        assert(LV.size() > 0 && "Empty loop set was built!");
+
+ // Skip singleton loop sets as they do not offer fusion opportunities on
+ // this level.
+ if (LV.size() == 1)
+ continue;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG({
+ dbgs() << " Visit loop set (#" << LV.size() << "):\n";
+ printLoopVector(LV);
+ });
+ }
+#endif
+
+ collectFusionCandidates(LV);
+ Changed |= fuseCandidates();
+ }
+
+ // Finished analyzing candidates at this level.
+ // Descend to the next level and clear all of the candidates currently
+ // collected. Note that it will not be possible to fuse any of the
+ // existing candidates with new candidates because the new candidates will
+ // be at a different nest level and thus not be control flow equivalent
+      // with any of the candidates collected so far.
+ LLVM_DEBUG(dbgs() << "Descend one level!\n");
+ LDT.descend();
+ FusionCandidates.clear();
+ }
+
+ if (Changed)
+ LLVM_DEBUG(dbgs() << "Function after Loop Fusion: \n"; F.dump(););
+
+#ifndef NDEBUG
+ assert(DT.verify());
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Loop Fusion complete\n");
+ return Changed;
+ }
+
+private:
+ /// Determine if two fusion candidates are control flow equivalent.
+ ///
+ /// Two fusion candidates are control flow equivalent if when one executes,
+ /// the other is guaranteed to execute. This is determined using dominators
+ /// and post-dominators: if A dominates B and B post-dominates A then A and B
+ /// are control-flow equivalent.
+ bool isControlFlowEquivalent(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ assert(FC0.Preheader && FC1.Preheader && "Expecting valid preheaders");
+
+ return ::isControlFlowEquivalent(*FC0.getEntryBlock(), *FC1.getEntryBlock(),
+ DT, PDT);
+ }
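
A hand-written example of how control flow equivalence partitions candidates into sets:

void example(bool cond, int n, int *a, int *b) {
  for (int i = 0; i < n; ++i) // L0
    a[i] = 0;
  for (int i = 0; i < n; ++i) // L1: whenever L0 runs, L1 runs and vice versa,
    b[i] = 0;                 // so L0 and L1 land in the same CFE set
  if (cond)
    for (int i = 0; i < n; ++i) // L2: runs only when cond holds, so it is not
      a[i] += b[i];             // control flow equivalent with L0/L1
}
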
+
+ /// Iterate over all loops in the given loop set and identify the loops that
+ /// are eligible for fusion. Place all eligible fusion candidates into Control
+ /// Flow Equivalent sets, sorted by dominance.
+ void collectFusionCandidates(const LoopVector &LV) {
+ for (Loop *L : LV) {
TTI::PeelingPreferences PP =
gatherPeelingPreferences(L, SE, TTI, None, None);
FusionCandidate CurrCand(L, &DT, &PDT, ORE, PP);
- if (!CurrCand.isEligibleForFusion(SE))
- continue;
-
- // Go through each list in FusionCandidates and determine if L is control
-      // flow equivalent with the first loop in that list. If it is, add the
-      // candidate to that list. If not, go to the next list.
- // If no suitable list is found, start another list and add it to
- // FusionCandidates.
- bool FoundSet = false;
-
- for (auto &CurrCandSet : FusionCandidates) {
- if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
- CurrCandSet.insert(CurrCand);
- FoundSet = true;
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << "Adding " << CurrCand
- << " to existing candidate set\n");
-#endif
- break;
- }
- }
- if (!FoundSet) {
- // No set was found. Create a new set and add to FusionCandidates
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
-#endif
- FusionCandidateSet NewCandSet;
- NewCandSet.insert(CurrCand);
- FusionCandidates.push_back(NewCandSet);
- }
- NumFusionCandidates++;
- }
- }
-
- /// Determine if it is beneficial to fuse two loops.
- ///
- /// For now, this method simply returns true because we want to fuse as much
- /// as possible (primarily to test the pass). This method will evolve, over
- /// time, to add heuristics for profitability of fusion.
- bool isBeneficialFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- return true;
- }
-
- /// Determine if two fusion candidates have the same trip count (i.e., they
- /// execute the same number of iterations).
- ///
+ if (!CurrCand.isEligibleForFusion(SE))
+ continue;
+
+ // Go through each list in FusionCandidates and determine if L is control
+      // flow equivalent with the first loop in that list. If it is, add the
+      // candidate to that list. If not, go to the next list.
+ // If no suitable list is found, start another list and add it to
+ // FusionCandidates.
+ bool FoundSet = false;
+
+ for (auto &CurrCandSet : FusionCandidates) {
+ if (isControlFlowEquivalent(*CurrCandSet.begin(), CurrCand)) {
+ CurrCandSet.insert(CurrCand);
+ FoundSet = true;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << "Adding " << CurrCand
+ << " to existing candidate set\n");
+#endif
+ break;
+ }
+ }
+ if (!FoundSet) {
+ // No set was found. Create a new set and add to FusionCandidates
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << "Adding " << CurrCand << " to new set\n");
+#endif
+ FusionCandidateSet NewCandSet;
+ NewCandSet.insert(CurrCand);
+ FusionCandidates.push_back(NewCandSet);
+ }
+ NumFusionCandidates++;
+ }
+ }
+
+ /// Determine if it is beneficial to fuse two loops.
+ ///
+ /// For now, this method simply returns true because we want to fuse as much
+ /// as possible (primarily to test the pass). This method will evolve, over
+ /// time, to add heuristics for profitability of fusion.
+ bool isBeneficialFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ return true;
+ }
+
+ /// Determine if two fusion candidates have the same trip count (i.e., they
+ /// execute the same number of iterations).
+ ///
/// This function will return a pair of values. The first is a boolean,
/// stating whether or not the two candidates are known at compile time to
/// have the same TripCount. The second is the difference in the two
@@ -700,25 +700,25 @@ private:
haveIdenticalTripCounts(const FusionCandidate &FC0,
const FusionCandidate &FC1) const {
- const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
- if (isa<SCEVCouldNotCompute>(TripCount0)) {
- UncomputableTripCount++;
- LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
+ const SCEV *TripCount0 = SE.getBackedgeTakenCount(FC0.L);
+ if (isa<SCEVCouldNotCompute>(TripCount0)) {
+ UncomputableTripCount++;
+ LLVM_DEBUG(dbgs() << "Trip count of first loop could not be computed!");
return {false, None};
- }
-
- const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
- if (isa<SCEVCouldNotCompute>(TripCount1)) {
- UncomputableTripCount++;
- LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
+ }
+
+ const SCEV *TripCount1 = SE.getBackedgeTakenCount(FC1.L);
+ if (isa<SCEVCouldNotCompute>(TripCount1)) {
+ UncomputableTripCount++;
+ LLVM_DEBUG(dbgs() << "Trip count of second loop could not be computed!");
return {false, None};
- }
+ }
+
+ LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
+ << *TripCount1 << " are "
+ << (TripCount0 == TripCount1 ? "identical" : "different")
+ << "\n");
- LLVM_DEBUG(dbgs() << "\tTrip counts: " << *TripCount0 << " & "
- << *TripCount1 << " are "
- << (TripCount0 == TripCount1 ? "identical" : "different")
- << "\n");
-
if (TripCount0 == TripCount1)
return {true, 0};
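
Illustrative shapes for the two outcomes (hand-written; the sign of the difference and which candidate is peeled are handled in the surrounding hunks):

void identical(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 0; i < n; ++i) a[i] = b[i]; // both backedge-taken counts are the
  for (int j = 0; j < n; ++j) c[j] = d[j]; // same SCEV -> {true, 0}
}

void offByTwo(int n, int *a, const int *b, int *c, const int *d) {
  for (int i = 0; i < n + 2; ++i) a[i] = b[i]; // counts differ by a constant 2;
  for (int j = 0; j < n; ++j)     c[j] = d[j]; // peeling the first (dominating)
                                               // loop by two iterations makes
                                               // the remaining counts match
}
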
@@ -754,8 +754,8 @@ private:
<< "\n");
return {false, Difference};
- }
-
+ }
+
void peelFusionCandidate(FusionCandidate &FC0, const FusionCandidate &FC1,
unsigned PeelCount) {
assert(FC0.AbleToPeel && "Should be able to peel loop");
@@ -820,37 +820,37 @@ private:
}
}
- /// Walk each set of control flow equivalent fusion candidates and attempt to
- /// fuse them. This does a single linear traversal of all candidates in the
- /// set. The conditions for legal fusion are checked at this point. If a pair
- /// of fusion candidates passes all legality checks, they are fused together
- /// and a new fusion candidate is created and added to the FusionCandidateSet.
- /// The original fusion candidates are then removed, as they are no longer
- /// valid.
- bool fuseCandidates() {
- bool Fused = false;
- LLVM_DEBUG(printFusionCandidates(FusionCandidates));
- for (auto &CandidateSet : FusionCandidates) {
- if (CandidateSet.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
- << CandidateSet << "\n");
-
- for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
- assert(!LDT.isRemovedLoop(FC0->L) &&
- "Should not have removed loops in CandidateSet!");
- auto FC1 = FC0;
- for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
- assert(!LDT.isRemovedLoop(FC1->L) &&
- "Should not have removed loops in CandidateSet!");
-
- LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
- dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
-
- FC0->verify();
- FC1->verify();
-
+ /// Walk each set of control flow equivalent fusion candidates and attempt to
+ /// fuse them. This does a single linear traversal of all candidates in the
+ /// set. The conditions for legal fusion are checked at this point. If a pair
+ /// of fusion candidates passes all legality checks, they are fused together
+ /// and a new fusion candidate is created and added to the FusionCandidateSet.
+ /// The original fusion candidates are then removed, as they are no longer
+ /// valid.
+ bool fuseCandidates() {
+ bool Fused = false;
+ LLVM_DEBUG(printFusionCandidates(FusionCandidates));
+ for (auto &CandidateSet : FusionCandidates) {
+ if (CandidateSet.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Attempting fusion on Candidate Set:\n"
+ << CandidateSet << "\n");
+
+ for (auto FC0 = CandidateSet.begin(); FC0 != CandidateSet.end(); ++FC0) {
+ assert(!LDT.isRemovedLoop(FC0->L) &&
+ "Should not have removed loops in CandidateSet!");
+ auto FC1 = FC0;
+ for (++FC1; FC1 != CandidateSet.end(); ++FC1) {
+ assert(!LDT.isRemovedLoop(FC1->L) &&
+ "Should not have removed loops in CandidateSet!");
+
+ LLVM_DEBUG(dbgs() << "Attempting to fuse candidate \n"; FC0->dump();
+ dbgs() << " with\n"; FC1->dump(); dbgs() << "\n");
+
+ FC0->verify();
+ FC1->verify();
+
// Check if the candidates have identical tripcounts (first value of
// pair), and if not check the difference in the tripcounts between
// the loops (second value of pair). The difference is not equal to
@@ -877,92 +877,92 @@ private:
}
if (!SameTripCount) {
- LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
- "counts. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEqualTripCount);
- continue;
- }
-
- if (!isAdjacent(*FC0, *FC1)) {
- LLVM_DEBUG(dbgs()
- << "Fusion candidates are not adjacent. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
- continue;
- }
-
- // Ensure that FC0 and FC1 have identical guards.
- // If one (or both) are not guarded, this check is not necessary.
- if (FC0->GuardBranch && FC1->GuardBranch &&
+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical trip "
+ "counts. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEqualTripCount);
+ continue;
+ }
+
+ if (!isAdjacent(*FC0, *FC1)) {
+ LLVM_DEBUG(dbgs()
+ << "Fusion candidates are not adjacent. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1, NonAdjacent);
+ continue;
+ }
+
+ // Ensure that FC0 and FC1 have identical guards.
+ // If one (or both) are not guarded, this check is not necessary.
+ if (FC0->GuardBranch && FC1->GuardBranch &&
!haveIdenticalGuards(*FC0, *FC1) && !TCDifference) {
- LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
- "guards. Not Fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonIdenticalGuards);
- continue;
- }
-
- if (!isSafeToMoveBefore(*FC1->Preheader,
- *FC0->Preheader->getTerminator(), DT, &PDT,
- &DI)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
- "instructions in preheader. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyPreheader);
- continue;
- }
-
- if (FC0->GuardBranch) {
- assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
-
- if (!isSafeToMoveBefore(*FC0->ExitBlock,
- *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
- &PDT, &DI)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
- "instructions in exit block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyExitBlock);
- continue;
- }
-
- if (!isSafeToMoveBefore(
- *FC1->GuardBranch->getParent(),
- *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
- &DI)) {
- LLVM_DEBUG(dbgs()
- << "Fusion candidate contains unsafe "
- "instructions in guard block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyGuardBlock);
- continue;
- }
- }
-
- // Check the dependencies across the loops and do not fuse if it would
- // violate them.
- if (!dependencesAllowFusion(*FC0, *FC1)) {
- LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- InvalidDependencies);
- continue;
- }
-
- bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
- LLVM_DEBUG(dbgs()
- << "\tFusion appears to be "
- << (BeneficialToFuse ? "" : "un") << "profitable!\n");
- if (!BeneficialToFuse) {
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- FusionNotBeneficial);
- continue;
- }
- // All analysis has completed and has determined that fusion is legal
- // and profitable. At this point, start transforming the code and
- // perform fusion.
-
- LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
- << *FC1 << "\n");
-
+ LLVM_DEBUG(dbgs() << "Fusion candidates do not have identical "
+ "guards. Not Fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonIdenticalGuards);
+ continue;
+ }
+
+ if (!isSafeToMoveBefore(*FC1->Preheader,
+ *FC0->Preheader->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in preheader. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyPreheader);
+ continue;
+ }
+
+ if (FC0->GuardBranch) {
+ assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
+
+ if (!isSafeToMoveBefore(*FC0->ExitBlock,
+ *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
+ &PDT, &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in exit block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyExitBlock);
+ continue;
+ }
+
+ if (!isSafeToMoveBefore(
+ *FC1->GuardBranch->getParent(),
+ *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs()
+ << "Fusion candidate contains unsafe "
+ "instructions in guard block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyGuardBlock);
+ continue;
+ }
+ }
+
+ // Check the dependencies across the loops and do not fuse if it would
+ // violate them.
+ if (!dependencesAllowFusion(*FC0, *FC1)) {
+ LLVM_DEBUG(dbgs() << "Memory dependencies do not allow fusion!\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ InvalidDependencies);
+ continue;
+ }
+
+ bool BeneficialToFuse = isBeneficialFusion(*FC0, *FC1);
+ LLVM_DEBUG(dbgs()
+ << "\tFusion appears to be "
+ << (BeneficialToFuse ? "" : "un") << "profitable!\n");
+ if (!BeneficialToFuse) {
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ FusionNotBeneficial);
+ continue;
+ }
+ // All analysis has completed and has determined that fusion is legal
+ // and profitable. At this point, start transforming the code and
+ // perform fusion.
+
+ LLVM_DEBUG(dbgs() << "\tFusion is performed: " << *FC0 << " and "
+ << *FC1 << "\n");
+
FusionCandidate FC0Copy = *FC0;
// Peel the loop after determining that fusion is legal. The Loops
// will still be safe to fuse after the peeling is performed.
@@ -970,405 +970,405 @@ private:
if (Peel)
peelFusionCandidate(FC0Copy, *FC1, *TCDifference);
- // Report fusion to the Optimization Remarks.
- // Note this needs to be done *before* performFusion because
- // performFusion will change the original loops, making it not
- // possible to identify them after fusion is complete.
+ // Report fusion to the Optimization Remarks.
+ // Note this needs to be done *before* performFusion because
+ // performFusion will change the original loops, making it not
+ // possible to identify them after fusion is complete.
reportLoopFusion<OptimizationRemark>((Peel ? FC0Copy : *FC0), *FC1,
FuseCounter);
-
+
FusionCandidate FusedCand(
performFusion((Peel ? FC0Copy : *FC0), *FC1), &DT, &PDT, ORE,
FC0Copy.PP);
- FusedCand.verify();
- assert(FusedCand.isEligibleForFusion(SE) &&
- "Fused candidate should be eligible for fusion!");
-
- // Notify the loop-depth-tree that these loops are not valid objects
- LDT.removeLoop(FC1->L);
-
- CandidateSet.erase(FC0);
- CandidateSet.erase(FC1);
-
- auto InsertPos = CandidateSet.insert(FusedCand);
-
- assert(InsertPos.second &&
- "Unable to insert TargetCandidate in CandidateSet!");
-
-          // Reset FC0 and FC1 to the new (fused) candidate. Subsequent iterations
- // of the FC1 loop will attempt to fuse the new (fused) loop with the
- // remaining candidates in the current candidate set.
- FC0 = FC1 = InsertPos.first;
-
- LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
- << "\n");
-
- Fused = true;
- }
- }
- }
- return Fused;
- }
-
- /// Rewrite all additive recurrences in a SCEV to use a new loop.
- class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
- public:
- AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
- bool UseMax = true)
- : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
- NewL(NewL) {}
-
- const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
- const Loop *ExprL = Expr->getLoop();
- SmallVector<const SCEV *, 2> Operands;
- if (ExprL == &OldL) {
- Operands.append(Expr->op_begin(), Expr->op_end());
- return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
- }
-
- if (OldL.contains(ExprL)) {
- bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
- if (!UseMax || !Pos || !Expr->isAffine()) {
- Valid = false;
- return Expr;
- }
- return visit(Expr->getStart());
- }
-
- for (const SCEV *Op : Expr->operands())
- Operands.push_back(visit(Op));
- return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
- }
-
- bool wasValidSCEV() const { return Valid; }
-
- private:
- bool Valid, UseMax;
- const Loop &OldL, &NewL;
- };
-
- /// Return false if the access functions of \p I0 and \p I1 could cause
- /// a negative dependence.
- bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
- Instruction &I1, bool EqualIsInvalid) {
- Value *Ptr0 = getLoadStorePointerOperand(&I0);
- Value *Ptr1 = getLoadStorePointerOperand(&I1);
- if (!Ptr0 || !Ptr1)
- return false;
-
- const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
- const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
- << *SCEVPtr1 << "\n");
-#endif
- AddRecLoopReplacer Rewriter(SE, L0, L1);
- SCEVPtr0 = Rewriter.visit(SCEVPtr0);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
- << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
-#endif
- if (!Rewriter.wasValidSCEV())
- return false;
-
-    // TODO: isKnownPredicate doesn't work well when one SCEV is loop carried (by
- // L0) and the other is not. We could check if it is monotone and test
- // the beginning and end value instead.
-
- BasicBlock *L0Header = L0.getHeader();
- auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
- if (!AddRec)
- return false;
- return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
- !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
- };
- if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
- return false;
-
- ICmpInst::Predicate Pred =
- EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
- bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
-#ifndef NDEBUG
- if (VerboseFusionDebugging)
- LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
- << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
- << "\n");
-#endif
- return IsAlwaysGE;
- }
-
- /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
- /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
- /// specified by @p DepChoice are used to determine this.
- bool dependencesAllowFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1, Instruction &I0,
- Instruction &I1, bool AnyDep,
- FusionDependenceAnalysisChoice DepChoice) {
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
- << DepChoice << "\n");
- }
-#endif
- switch (DepChoice) {
- case FUSION_DEPENDENCE_ANALYSIS_SCEV:
- return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
- case FUSION_DEPENDENCE_ANALYSIS_DA: {
- auto DepResult = DI.depends(&I0, &I1, true);
- if (!DepResult)
- return true;
-#ifndef NDEBUG
- if (VerboseFusionDebugging) {
- LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
- dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
- << (DepResult->isOrdered() ? "true" : "false")
- << "]\n");
- LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
- << "\n");
- }
-#endif
-
- if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
- LLVM_DEBUG(
- dbgs() << "TODO: Implement pred/succ dependence handling!\n");
-
- // TODO: Can we actually use the dependence info analysis here?
- return false;
- }
-
- case FUSION_DEPENDENCE_ANALYSIS_ALL:
- return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
- FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
- dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
- FUSION_DEPENDENCE_ANALYSIS_DA);
- }
-
- llvm_unreachable("Unknown fusion dependence analysis choice!");
- }
-
- /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
- bool dependencesAllowFusion(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
- << "\n");
- assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
- assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
-
- for (Instruction *WriteL0 : FC0.MemWrites) {
- for (Instruction *WriteL1 : FC1.MemWrites)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- for (Instruction *ReadL1 : FC1.MemReads)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- }
-
- for (Instruction *WriteL1 : FC1.MemWrites) {
- for (Instruction *WriteL0 : FC0.MemWrites)
- if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- for (Instruction *ReadL0 : FC0.MemReads)
- if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
- /* AnyDep */ false,
- FusionDependenceAnalysis)) {
- InvalidDependencies++;
- return false;
- }
- }
-
- // Walk through all uses in FC1. For each use, find the reaching def. If the
-    // def is located in FC0 then it is not safe to fuse.
- for (BasicBlock *BB : FC1.L->blocks())
- for (Instruction &I : *BB)
- for (auto &Op : I.operands())
- if (Instruction *Def = dyn_cast<Instruction>(Op))
- if (FC0.L->contains(Def->getParent())) {
- InvalidDependencies++;
- return false;
- }
-
- return true;
- }
-
- /// Determine if two fusion candidates are adjacent in the CFG.
- ///
- /// This method will determine if there are additional basic blocks in the CFG
- /// between the exit of \p FC0 and the entry of \p FC1.
- /// If the two candidates are guarded loops, then it checks whether the
- /// non-loop successor of the \p FC0 guard branch is the entry block of \p
- /// FC1. If not, then the loops are not adjacent. If the two candidates are
- /// not guarded loops, then it checks whether the exit block of \p FC0 is the
- /// preheader of \p FC1.
- bool isAdjacent(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- // If the successor of the guard branch is FC1, then the loops are adjacent
- if (FC0.GuardBranch)
- return FC0.getNonLoopBlock() == FC1.getEntryBlock();
- else
- return FC0.ExitBlock == FC1.getEntryBlock();
- }
-
- /// Determine if two fusion candidates have identical guards
- ///
- /// This method will determine if two fusion candidates have the same guards.
- /// The guards are considered the same if:
- /// 1. The instructions to compute the condition used in the compare are
- /// identical.
- /// 2. The successors of the guard have the same flow into/around the loop.
- /// If the compare instructions are identical, then the first successor of the
- /// guard must go to the same place (either the preheader of the loop or the
-  /// NonLoopBlock). In other words, the first successor of both loops must
- /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
- /// the NonLoopBlock). The same must be true for the second successor.
- bool haveIdenticalGuards(const FusionCandidate &FC0,
- const FusionCandidate &FC1) const {
- assert(FC0.GuardBranch && FC1.GuardBranch &&
- "Expecting FC0 and FC1 to be guarded loops.");
-
- if (auto FC0CmpInst =
- dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
- if (auto FC1CmpInst =
- dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
- if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
- return false;
-
- // The compare instructions are identical.
- // Now make sure the successor of the guards have the same flow into/around
- // the loop
- if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
- return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
- else
- return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
- }
-
+ FusedCand.verify();
+ assert(FusedCand.isEligibleForFusion(SE) &&
+ "Fused candidate should be eligible for fusion!");
+
+ // Notify the loop-depth-tree that these loops are not valid objects
+ LDT.removeLoop(FC1->L);
+
+ CandidateSet.erase(FC0);
+ CandidateSet.erase(FC1);
+
+ auto InsertPos = CandidateSet.insert(FusedCand);
+
+ assert(InsertPos.second &&
+ "Unable to insert TargetCandidate in CandidateSet!");
+
+          // Reset FC0 and FC1 to the new (fused) candidate. Subsequent iterations
+ // of the FC1 loop will attempt to fuse the new (fused) loop with the
+ // remaining candidates in the current candidate set.
+ FC0 = FC1 = InsertPos.first;
+
+ LLVM_DEBUG(dbgs() << "Candidate Set (after fusion): " << CandidateSet
+ << "\n");
+
+ Fused = true;
+ }
+ }
+ }
+ return Fused;
+ }
+
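At the source level, the end state this driver works toward looks roughly like the example below. The arrays, bound, and bodies are arbitrary; the point is two control-flow-equivalent, adjacent loops with identical trip counts and only forward dependences collapsing into a single loop.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N], B[N];

  // Before fusion: two adjacent loops with identical trip counts.
  for (int i = 0; i < N; ++i)
    A[i] = i * 2;
  for (int i = 0; i < N; ++i)
    B[i] = A[i] + 1;

  // After fusion (the result the pass aims for): one loop, one set of
  // header/latch overhead, and better locality on A. Legal here because the
  // second body only reads the A element written by the same iteration.
  int A2[N], B2[N];
  for (int i = 0; i < N; ++i) {
    A2[i] = i * 2;
    B2[i] = A2[i] + 1;
  }

  std::printf("%d %d\n", B[N - 1], B2[N - 1]);
  return 0;
}
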
+ /// Rewrite all additive recurrences in a SCEV to use a new loop.
+ class AddRecLoopReplacer : public SCEVRewriteVisitor<AddRecLoopReplacer> {
+ public:
+ AddRecLoopReplacer(ScalarEvolution &SE, const Loop &OldL, const Loop &NewL,
+ bool UseMax = true)
+ : SCEVRewriteVisitor(SE), Valid(true), UseMax(UseMax), OldL(OldL),
+ NewL(NewL) {}
+
+ const SCEV *visitAddRecExpr(const SCEVAddRecExpr *Expr) {
+ const Loop *ExprL = Expr->getLoop();
+ SmallVector<const SCEV *, 2> Operands;
+ if (ExprL == &OldL) {
+ Operands.append(Expr->op_begin(), Expr->op_end());
+ return SE.getAddRecExpr(Operands, &NewL, Expr->getNoWrapFlags());
+ }
+
+ if (OldL.contains(ExprL)) {
+ bool Pos = SE.isKnownPositive(Expr->getStepRecurrence(SE));
+ if (!UseMax || !Pos || !Expr->isAffine()) {
+ Valid = false;
+ return Expr;
+ }
+ return visit(Expr->getStart());
+ }
+
+ for (const SCEV *Op : Expr->operands())
+ Operands.push_back(visit(Op));
+ return SE.getAddRecExpr(Operands, ExprL, Expr->getNoWrapFlags());
+ }
+
+ bool wasValidSCEV() const { return Valid; }
+
+ private:
+ bool Valid, UseMax;
+ const Loop &OldL, &NewL;
+ };
+
+ /// Return false if the access functions of \p I0 and \p I1 could cause
+ /// a negative dependence.
+ bool accessDiffIsPositive(const Loop &L0, const Loop &L1, Instruction &I0,
+ Instruction &I1, bool EqualIsInvalid) {
+ Value *Ptr0 = getLoadStorePointerOperand(&I0);
+ Value *Ptr1 = getLoadStorePointerOperand(&I1);
+ if (!Ptr0 || !Ptr1)
+ return false;
+
+ const SCEV *SCEVPtr0 = SE.getSCEVAtScope(Ptr0, &L0);
+ const SCEV *SCEVPtr1 = SE.getSCEVAtScope(Ptr1, &L1);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Access function check: " << *SCEVPtr0 << " vs "
+ << *SCEVPtr1 << "\n");
+#endif
+ AddRecLoopReplacer Rewriter(SE, L0, L1);
+ SCEVPtr0 = Rewriter.visit(SCEVPtr0);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Access function after rewrite: " << *SCEVPtr0
+ << " [Valid: " << Rewriter.wasValidSCEV() << "]\n");
+#endif
+ if (!Rewriter.wasValidSCEV())
+ return false;
+
+    // TODO: isKnownPredicate doesn't work well when one SCEV is loop carried (by
+ // L0) and the other is not. We could check if it is monotone and test
+ // the beginning and end value instead.
+
+ BasicBlock *L0Header = L0.getHeader();
+ auto HasNonLinearDominanceRelation = [&](const SCEV *S) {
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AddRec)
+ return false;
+ return !DT.dominates(L0Header, AddRec->getLoop()->getHeader()) &&
+ !DT.dominates(AddRec->getLoop()->getHeader(), L0Header);
+ };
+ if (SCEVExprContains(SCEVPtr1, HasNonLinearDominanceRelation))
+ return false;
+
+ ICmpInst::Predicate Pred =
+ EqualIsInvalid ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_SGE;
+ bool IsAlwaysGE = SE.isKnownPredicate(Pred, SCEVPtr0, SCEVPtr1);
+#ifndef NDEBUG
+ if (VerboseFusionDebugging)
+ LLVM_DEBUG(dbgs() << " Relation: " << *SCEVPtr0
+ << (IsAlwaysGE ? " >= " : " may < ") << *SCEVPtr1
+ << "\n");
+#endif
+ return IsAlwaysGE;
+ }
+
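A source-level picture of the check above, with arbitrary names: the write in the first loop and the read in the second form a candidate pair, and fusion is only acceptable when the write's access function stays at or ahead of the read's.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N + 1] = {0};

  // The first loop writes A[i].
  for (int i = 0; i < N; ++i)
    A[i] = i;

  // Pair 1: this loop reads A[i]. At iteration i the write address equals
  // the read address, so the access difference is non-negative and fusing
  // the two loops would not change the values observed.
  int sumSame = 0;
  for (int i = 0; i < N; ++i)
    sumSame += A[i];

  // Pair 2: this loop reads A[i + 1], i.e. ahead of what the first loop has
  // written by iteration i. Fused, it would read not-yet-written elements,
  // so a check like accessDiffIsPositive() has to reject this pair.
  int sumAhead = 0;
  for (int i = 0; i < N; ++i)
    sumAhead += A[i + 1];

  std::printf("%d %d\n", sumSame, sumAhead);
  return 0;
}
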
+ /// Return true if the dependences between @p I0 (in @p L0) and @p I1 (in
+ /// @p L1) allow loop fusion of @p L0 and @p L1. The dependence analyses
+ /// specified by @p DepChoice are used to determine this.
+ bool dependencesAllowFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1, Instruction &I0,
+ Instruction &I1, bool AnyDep,
+ FusionDependenceAnalysisChoice DepChoice) {
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG(dbgs() << "Check dep: " << I0 << " vs " << I1 << " : "
+ << DepChoice << "\n");
+ }
+#endif
+ switch (DepChoice) {
+ case FUSION_DEPENDENCE_ANALYSIS_SCEV:
+ return accessDiffIsPositive(*FC0.L, *FC1.L, I0, I1, AnyDep);
+ case FUSION_DEPENDENCE_ANALYSIS_DA: {
+ auto DepResult = DI.depends(&I0, &I1, true);
+ if (!DepResult)
+ return true;
+#ifndef NDEBUG
+ if (VerboseFusionDebugging) {
+ LLVM_DEBUG(dbgs() << "DA res: "; DepResult->dump(dbgs());
+ dbgs() << " [#l: " << DepResult->getLevels() << "][Ordered: "
+ << (DepResult->isOrdered() ? "true" : "false")
+ << "]\n");
+ LLVM_DEBUG(dbgs() << "DepResult Levels: " << DepResult->getLevels()
+ << "\n");
+ }
+#endif
+
+ if (DepResult->getNextPredecessor() || DepResult->getNextSuccessor())
+ LLVM_DEBUG(
+ dbgs() << "TODO: Implement pred/succ dependence handling!\n");
+
+ // TODO: Can we actually use the dependence info analysis here?
+ return false;
+ }
+
+ case FUSION_DEPENDENCE_ANALYSIS_ALL:
+ return dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
+ FUSION_DEPENDENCE_ANALYSIS_SCEV) ||
+ dependencesAllowFusion(FC0, FC1, I0, I1, AnyDep,
+ FUSION_DEPENDENCE_ANALYSIS_DA);
+ }
+
+ llvm_unreachable("Unknown fusion dependence analysis choice!");
+ }
+
+ /// Perform a dependence check and return if @p FC0 and @p FC1 can be fused.
+ bool dependencesAllowFusion(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ LLVM_DEBUG(dbgs() << "Check if " << FC0 << " can be fused with " << FC1
+ << "\n");
+ assert(FC0.L->getLoopDepth() == FC1.L->getLoopDepth());
+ assert(DT.dominates(FC0.getEntryBlock(), FC1.getEntryBlock()));
+
+ for (Instruction *WriteL0 : FC0.MemWrites) {
+ for (Instruction *WriteL1 : FC1.MemWrites)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ for (Instruction *ReadL1 : FC1.MemReads)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *ReadL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ }
+
+ for (Instruction *WriteL1 : FC1.MemWrites) {
+ for (Instruction *WriteL0 : FC0.MemWrites)
+ if (!dependencesAllowFusion(FC0, FC1, *WriteL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ for (Instruction *ReadL0 : FC0.MemReads)
+ if (!dependencesAllowFusion(FC0, FC1, *ReadL0, *WriteL1,
+ /* AnyDep */ false,
+ FusionDependenceAnalysis)) {
+ InvalidDependencies++;
+ return false;
+ }
+ }
+
+ // Walk through all uses in FC1. For each use, find the reaching def. If the
+    // def is located in FC0 then it is not safe to fuse.
+ for (BasicBlock *BB : FC1.L->blocks())
+ for (Instruction &I : *BB)
+ for (auto &Op : I.operands())
+ if (Instruction *Def = dyn_cast<Instruction>(Op))
+ if (FC0.L->contains(Def->getParent())) {
+ InvalidDependencies++;
+ return false;
+ }
+
+ return true;
+ }
+
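Besides the memory-access pairs, the routine above also rejects candidates when a value defined inside FC0 is used inside FC1. At the source level that corresponds roughly to the following; the names are arbitrary and the example only illustrates why such a pair is unsafe to fuse.

#include <cstdio>

int main() {
  const int N = 8;
  int A[N], B[N];
  int last = 0;

  // First loop: redefines `last` every iteration; only its final value is
  // meaningful once the loop has finished.
  for (int i = 0; i < N; ++i) {
    A[i] = i;
    last = A[i];
  }

  // Second loop: uses `last`, whose reaching definition lives inside the
  // first loop. If the loops were fused, each B[i] would observe a
  // partially updated `last`, changing the program's result, so the
  // reaching-def walk described above is meant to reject this pair.
  for (int i = 0; i < N; ++i)
    B[i] = last + i;

  std::printf("%d\n", B[N - 1]);
  return 0;
}
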
+ /// Determine if two fusion candidates are adjacent in the CFG.
+ ///
+ /// This method will determine if there are additional basic blocks in the CFG
+ /// between the exit of \p FC0 and the entry of \p FC1.
+ /// If the two candidates are guarded loops, then it checks whether the
+ /// non-loop successor of the \p FC0 guard branch is the entry block of \p
+ /// FC1. If not, then the loops are not adjacent. If the two candidates are
+ /// not guarded loops, then it checks whether the exit block of \p FC0 is the
+ /// preheader of \p FC1.
+ bool isAdjacent(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ // If the successor of the guard branch is FC1, then the loops are adjacent
+ if (FC0.GuardBranch)
+ return FC0.getNonLoopBlock() == FC1.getEntryBlock();
+ else
+ return FC0.ExitBlock == FC1.getEntryBlock();
+ }
+
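Roughly, at the source level, the adjacency requirement distinguishes the two shapes below; the arrays and the intervening branch are arbitrary and only illustrate the extra basic blocks that break adjacency.

#include <cstdio>

int main(int argc, char **) {
  const int N = 8;
  int A[N], B[N], C[N];

  // Adjacent: the exit of the first loop leads straight into the entry of
  // the second, so a pair like this can satisfy isAdjacent().
  for (int i = 0; i < N; ++i)
    A[i] = i;
  for (int i = 0; i < N; ++i)
    B[i] = A[i] * 2;

  // Not adjacent: the branch between the loops introduces extra basic
  // blocks between the exit of the second loop and the entry of the third,
  // so that pair would be reported as NonAdjacent and skipped.
  if (argc > 1)
    std::printf("separator\n");
  for (int i = 0; i < N; ++i)
    C[i] = B[i] + 1;

  std::printf("%d\n", C[N - 1]);
  return 0;
}
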
+ /// Determine if two fusion candidates have identical guards
+ ///
+ /// This method will determine if two fusion candidates have the same guards.
+ /// The guards are considered the same if:
+ /// 1. The instructions to compute the condition used in the compare are
+ /// identical.
+ /// 2. The successors of the guard have the same flow into/around the loop.
+ /// If the compare instructions are identical, then the first successor of the
+ /// guard must go to the same place (either the preheader of the loop or the
+  /// NonLoopBlock). In other words, the first successor of both loops must
+ /// both go into the loop (i.e., the preheader) or go around the loop (i.e.,
+ /// the NonLoopBlock). The same must be true for the second successor.
+ bool haveIdenticalGuards(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) const {
+ assert(FC0.GuardBranch && FC1.GuardBranch &&
+ "Expecting FC0 and FC1 to be guarded loops.");
+
+ if (auto FC0CmpInst =
+ dyn_cast<Instruction>(FC0.GuardBranch->getCondition()))
+ if (auto FC1CmpInst =
+ dyn_cast<Instruction>(FC1.GuardBranch->getCondition()))
+ if (!FC0CmpInst->isIdenticalTo(FC1CmpInst))
+ return false;
+
+ // The compare instructions are identical.
+ // Now make sure the successor of the guards have the same flow into/around
+ // the loop
+ if (FC0.GuardBranch->getSuccessor(0) == FC0.Preheader)
+ return (FC1.GuardBranch->getSuccessor(0) == FC1.Preheader);
+ else
+ return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
+ }
+
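A rough source-level picture of identical guards, with invented names: both loops are protected by the same test, so after loop rotation each carries a guard branch with the same compare and the same into-the-loop/around-the-loop successor layout, which is what the check above looks for.

#include <cstdio>

void compute(int *A, int *B, int n) {
  // Both loops are guarded by the same `n > 0` condition. Their rotated
  // forms carry guard branches that compare the same values and route the
  // "skip the loop" path the same way, so the guards can be treated as
  // identical and merged when the loops are fused.
  if (n > 0)
    for (int i = 0; i < n; ++i)
      A[i] = i;
  if (n > 0)
    for (int i = 0; i < n; ++i)
      B[i] = A[i] + 1;
}

int main() {
  int A[8], B[8];
  compute(A, B, 8);
  std::printf("%d\n", B[7]);
  return 0;
}
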
/// Modify the latch branch of FC to be unconditional since successors of the
/// branch are the same.
- void simplifyLatchBranch(const FusionCandidate &FC) const {
- BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
- if (FCLatchBranch) {
- assert(FCLatchBranch->isConditional() &&
- FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
- "Expecting the two successors of FCLatchBranch to be the same");
+ void simplifyLatchBranch(const FusionCandidate &FC) const {
+ BranchInst *FCLatchBranch = dyn_cast<BranchInst>(FC.Latch->getTerminator());
+ if (FCLatchBranch) {
+ assert(FCLatchBranch->isConditional() &&
+ FCLatchBranch->getSuccessor(0) == FCLatchBranch->getSuccessor(1) &&
+ "Expecting the two successors of FCLatchBranch to be the same");
BranchInst *NewBranch =
BranchInst::Create(FCLatchBranch->getSuccessor(0));
ReplaceInstWithInst(FCLatchBranch, NewBranch);
- }
- }
-
-  /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
- /// successor, then merge FC0.Latch with its unique successor.
- void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
- moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
- if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
- MergeBlockIntoPredecessor(Succ, &DTU, &LI);
- DTU.flush();
- }
- }
-
- /// Fuse two fusion candidates, creating a new fused loop.
- ///
- /// This method contains the mechanics of fusing two loops, represented by \p
- /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
- /// postdominates \p FC0 (making them control flow equivalent). It also
- /// assumes that the other conditions for fusion have been met: adjacent,
- /// identical trip counts, and no negative distance dependencies exist that
- /// would prevent fusion. Thus, there is no checking for these conditions in
- /// this method.
- ///
- /// Fusion is performed by rewiring the CFG to update successor blocks of the
-  /// components of the loops. Specifically, the following changes are done:
- ///
- /// 1. The preheader of \p FC1 is removed as it is no longer necessary
- /// (because it is currently only a single statement block).
- /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
-  /// 3. The latch of \p FC1 is modified to jump to the header of \p FC0.
- /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
- ///
- /// All of these modifications are done with dominator tree updates, thus
- /// keeping the dominator (and post dominator) information up-to-date.
- ///
- /// This can be improved in the future by actually merging blocks during
- /// fusion. For example, the preheader of \p FC1 can be merged with the
- /// preheader of \p FC0. This would allow loops with more than a single
- /// statement in the preheader to be fused. Similarly, the latch blocks of the
- /// two loops could also be fused into a single block. This will require
- /// analysis to prove it is safe to move the contents of the block past
- /// existing code, which currently has not been implemented.
- Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
- assert(FC0.isValid() && FC1.isValid() &&
- "Expecting valid fusion candidates");
-
- LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
- dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
-
- // Move instructions from the preheader of FC1 to the end of the preheader
- // of FC0.
- moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
-
- // Fusing guarded loops is handled slightly differently than non-guarded
- // loops and has been broken out into a separate method instead of trying to
- // intersperse the logic within a single method.
- if (FC0.GuardBranch)
- return fuseGuardedLoops(FC0, FC1);
-
+ }
+ }
+
+  /// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
+ /// successor, then merge FC0.Latch with its unique successor.
+ void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
+ moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
+ if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
+ MergeBlockIntoPredecessor(Succ, &DTU, &LI);
+ DTU.flush();
+ }
+ }
+
+ /// Fuse two fusion candidates, creating a new fused loop.
+ ///
+ /// This method contains the mechanics of fusing two loops, represented by \p
+ /// FC0 and \p FC1. It is assumed that \p FC0 dominates \p FC1 and \p FC1
+ /// postdominates \p FC0 (making them control flow equivalent). It also
+ /// assumes that the other conditions for fusion have been met: adjacent,
+ /// identical trip counts, and no negative distance dependencies exist that
+ /// would prevent fusion. Thus, there is no checking for these conditions in
+ /// this method.
+ ///
+ /// Fusion is performed by rewiring the CFG to update successor blocks of the
+  /// components of the loops. Specifically, the following changes are done:
+ ///
+ /// 1. The preheader of \p FC1 is removed as it is no longer necessary
+ /// (because it is currently only a single statement block).
+ /// 2. The latch of \p FC0 is modified to jump to the header of \p FC1.
+  /// 3. The latch of \p FC1 is modified to jump to the header of \p FC0.
+ /// 4. All blocks from \p FC1 are removed from FC1 and added to FC0.
+ ///
+ /// All of these modifications are done with dominator tree updates, thus
+ /// keeping the dominator (and post dominator) information up-to-date.
+ ///
+ /// This can be improved in the future by actually merging blocks during
+ /// fusion. For example, the preheader of \p FC1 can be merged with the
+ /// preheader of \p FC0. This would allow loops with more than a single
+ /// statement in the preheader to be fused. Similarly, the latch blocks of the
+ /// two loops could also be fused into a single block. This will require
+ /// analysis to prove it is safe to move the contents of the block past
+ /// existing code, which currently has not been implemented.
+ Loop *performFusion(const FusionCandidate &FC0, const FusionCandidate &FC1) {
+ assert(FC0.isValid() && FC1.isValid() &&
+ "Expecting valid fusion candidates");
+
+ LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
+ dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
+
+ // Move instructions from the preheader of FC1 to the end of the preheader
+ // of FC0.
+ moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
+
+ // Fusing guarded loops is handled slightly differently than non-guarded
+ // loops and has been broken out into a separate method instead of trying to
+ // intersperse the logic within a single method.
+ if (FC0.GuardBranch)
+ return fuseGuardedLoops(FC0, FC1);
+
assert(FC1.Preheader ==
(FC0.Peeled ? FC0.ExitBlock->getUniqueSuccessor() : FC0.ExitBlock));
- assert(FC1.Preheader->size() == 1 &&
- FC1.Preheader->getSingleSuccessor() == FC1.Header);
-
- // Remember the phi nodes originally in the header of FC0 in order to rewire
- // them later. However, this is only necessary if the new loop carried
- // values might not dominate the exiting branch. While we do not generally
- // test if this is the case but simply insert intermediate phi nodes, we
- // need to make sure these intermediate phi nodes have different
- // predecessors. To this end, we filter the special case where the exiting
- // block is the latch block of the first loop. Nothing needs to be done
- // anyway as all loop carried values dominate the latch and thereby also the
- // exiting branch.
- SmallVector<PHINode *, 8> OriginalFC0PHIs;
- if (FC0.ExitingBlock != FC0.Latch)
- for (PHINode &PHI : FC0.Header->phis())
- OriginalFC0PHIs.push_back(&PHI);
-
- // Replace incoming blocks for header PHIs first.
- FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
- FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
-
- // Then modify the control flow and update DT and PDT.
- SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
-
- // The old exiting block of the first loop (FC0) has to jump to the header
- // of the second as we need to execute the code in the second header block
- // regardless of the trip count. That is, if the trip count is 0, so the
- // back edge is never taken, we still have to execute both loop headers,
- // especially (but not only!) if the second is a do-while style loop.
- // However, doing so might invalidate the phi nodes of the first loop as
- // the new values do only need to dominate their latch and not the exiting
- // predicate. To remedy this potential problem we always introduce phi
- // nodes in the header of the second loop later that select the loop carried
- // value, if the second header was reached through an old latch of the
- // first, or undef otherwise. This is sound as exiting the first implies the
-    // second will exit too, __without__ taking the back-edge (their
-    // trip-counts are equal after all).
-    // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
- // to FC1.Header? I think this is basically what the three sequences are
- // trying to accomplish; however, doing this directly in the CFG may mean
- // the DT/PDT becomes invalid
+ assert(FC1.Preheader->size() == 1 &&
+ FC1.Preheader->getSingleSuccessor() == FC1.Header);
+
+ // Remember the phi nodes originally in the header of FC0 in order to rewire
+ // them later. However, this is only necessary if the new loop carried
+ // values might not dominate the exiting branch. While we do not generally
+ // test if this is the case but simply insert intermediate phi nodes, we
+ // need to make sure these intermediate phi nodes have different
+ // predecessors. To this end, we filter the special case where the exiting
+ // block is the latch block of the first loop. Nothing needs to be done
+ // anyway as all loop carried values dominate the latch and thereby also the
+ // exiting branch.
+ SmallVector<PHINode *, 8> OriginalFC0PHIs;
+ if (FC0.ExitingBlock != FC0.Latch)
+ for (PHINode &PHI : FC0.Header->phis())
+ OriginalFC0PHIs.push_back(&PHI);
+
+ // Replace incoming blocks for header PHIs first.
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // Then modify the control flow and update DT and PDT.
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+ // the new values do only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+    // second will exit too, __without__ taking the back-edge (their
+    // trip-counts are equal after all).
+    // KB: Would this sequence be simpler to just make FC0.ExitingBlock go
+ // to FC1.Header? I think this is basically what the three sequences are
+ // trying to accomplish; however, doing this directly in the CFG may mean
+ // the DT/PDT becomes invalid
if (!FC0.Peeled) {
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC1.Preheader,
FC1.Header);
@@ -1379,7 +1379,7 @@ private:
} else {
TreeUpdates.emplace_back(DominatorTree::UpdateType(
DominatorTree::Delete, FC0.ExitBlock, FC1.Preheader));
-
+
// Remove the ExitBlock of the first Loop (also not needed)
FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
FC1.Header);
@@ -1391,215 +1391,215 @@ private:
new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
}
- // The pre-header of L1 is not necessary anymore.
+ // The pre-header of L1 is not necessary anymore.
assert(pred_empty(FC1.Preheader));
- FC1.Preheader->getTerminator()->eraseFromParent();
- new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1.Preheader, FC1.Header));
-
-    // Moves the phi nodes from the second to the first loop's header block.
- while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
- if (SE.isSCEVable(PHI->getType()))
- SE.forgetValue(PHI);
- if (PHI->hasNUsesOrMore(1))
- PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
- else
- PHI->eraseFromParent();
- }
-
- // Introduce new phi nodes in the second loop header to ensure
- // exiting the first and jumping to the header of the second does not break
- // the SSA property of the phis originally in the first loop. See also the
- // comment above.
- Instruction *L1HeaderIP = &FC1.Header->front();
- for (PHINode *LCPHI : OriginalFC0PHIs) {
- int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
- assert(L1LatchBBIdx >= 0 &&
- "Expected loop carried value to be rewired at this point!");
-
- Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
-
- PHINode *L1HeaderPHI = PHINode::Create(
- LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
- L1HeaderPHI->addIncoming(LCV, FC0.Latch);
- L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
- FC0.ExitingBlock);
-
- LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
- }
-
- // Replace latch terminator destinations.
- FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
- FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
-
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+    // Moves the phi nodes from the second to the first loop's header block.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
// Modify the latch branch of FC0 to be unconditional as both successors of
- // the branch are the same.
- simplifyLatchBranch(FC0);
-
- // If FC0.Latch and FC0.ExitingBlock are the same then we have already
- // performed the updates above.
- if (FC0.Latch != FC0.ExitingBlock)
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.Latch, FC1.Header));
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC0.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
- FC1.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC1.Latch, FC1.Header));
-
- // Update DT/PDT
- DTU.applyUpdates(TreeUpdates);
-
- LI.removeBlock(FC1.Preheader);
- DTU.deleteBB(FC1.Preheader);
+ // the branch are the same.
+ simplifyLatchBranch(FC0);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1.Preheader);
+ DTU.deleteBB(FC1.Preheader);
if (FC0.Peeled) {
LI.removeBlock(FC0.ExitBlock);
DTU.deleteBB(FC0.ExitBlock);
}
- DTU.flush();
-
- // Is there a way to keep SE up-to-date so we don't need to forget the loops
- // and rebuild the information in subsequent passes of fusion?
- // Note: Need to forget the loops before merging the loop latches, as
- // mergeLatch may remove the only block in FC1.
- SE.forgetLoop(FC1.L);
- SE.forgetLoop(FC0.L);
-
- // Move instructions from FC0.Latch to FC1.Latch.
- // Note: mergeLatch requires an updated DT.
- mergeLatch(FC0, FC1);
-
- // Merge the loops.
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ // Note: Need to forget the loops before merging the loop latches, as
+ // mergeLatch may remove the only block in FC1.
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Move instructions from FC0.Latch to FC1.Latch.
+ // Note: mergeLatch requires an updated DT.
+ mergeLatch(FC0, FC1);
+
+ // Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
- for (BasicBlock *BB : Blocks) {
- FC0.L->addBlockEntry(BB);
- FC1.L->removeBlockFromLoop(BB);
- if (LI.getLoopFor(BB) != FC1.L)
- continue;
- LI.changeLoopFor(BB, FC0.L);
- }
+ for (BasicBlock *BB : Blocks) {
+ FC0.L->addBlockEntry(BB);
+ FC1.L->removeBlockFromLoop(BB);
+ if (LI.getLoopFor(BB) != FC1.L)
+ continue;
+ LI.changeLoopFor(BB, FC0.L);
+ }
while (!FC1.L->isInnermost()) {
- const auto &ChildLoopIt = FC1.L->begin();
- Loop *ChildLoop = *ChildLoopIt;
- FC1.L->removeChildLoop(ChildLoopIt);
- FC0.L->addChildLoop(ChildLoop);
- }
-
- // Delete the now empty loop L1.
- LI.erase(FC1.L);
-
-#ifndef NDEBUG
- assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Fusion done:\n");
-
- return FC0.L;
- }
-
- /// Report details on loop fusion opportunities.
- ///
- /// This template function can be used to report both successful and missed
- /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
- /// be one of:
- /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
- /// given two valid fusion candidates.
- /// - OptimizationRemark to report successful fusion of two fusion
- /// candidates.
- /// The remarks will be printed using the form:
- /// <path/filename>:<line number>:<column number>: [<function name>]:
- /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
- template <typename RemarkKind>
- void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
- llvm::Statistic &Stat) {
- assert(FC0.Preheader && FC1.Preheader &&
- "Expecting valid fusion candidates");
- using namespace ore;
- ++Stat;
- ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
- FC0.Preheader)
- << "[" << FC0.Preheader->getParent()->getName()
- << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
- << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
- << ": " << Stat.getDesc());
- }
-
- /// Fuse two guarded fusion candidates, creating a new fused loop.
- ///
- /// Fusing guarded loops is handled much the same way as fusing non-guarded
- /// loops. The rewiring of the CFG is slightly different though, because of
- /// the presence of the guards around the loops and the exit blocks after the
- /// loop body. As such, the new loop is rewired as follows:
- /// 1. Keep the guard branch from FC0 and use the non-loop block target
- /// from the FC1 guard branch.
- /// 2. Remove the exit block from FC0 (this exit block should be empty
- /// right now).
- /// 3. Remove the guard branch for FC1
- /// 4. Remove the preheader for FC1.
- /// The exit block successor for the latch of FC0 is updated to be the header
- /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
- /// be the header of FC0, thus creating the fused loop.
- Loop *fuseGuardedLoops(const FusionCandidate &FC0,
- const FusionCandidate &FC1) {
- assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
-
- BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
- BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
- BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
- BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+ const auto &ChildLoopIt = FC1.L->begin();
+ Loop *ChildLoop = *ChildLoopIt;
+ FC1.L->removeChildLoop(ChildLoopIt);
+ FC0.L->addChildLoop(ChildLoop);
+ }
+
+ // Delete the now empty loop L1.
+ LI.erase(FC1.L);
+
+#ifndef NDEBUG
+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+ return FC0.L;
+ }
+
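The edge rewiring that the comments above describe can be emulated at the source level with labels and gotos, purely as an illustration of where the edges end up: the old FC0 latch now reaches the FC1 header, and the old FC1 latch carries the single back edge. The names and trip count are arbitrary.

#include <cstdio>

int main() {
  const int N = 3;
  int i = 0; // set up in the merged preheader
  int j = 0;

fc0_header:
  std::printf("FC0 body %d\n", i);
  ++i; // old FC0 latch: now unconditional, falls through to the FC1 header
  std::printf("FC1 body %d\n", j); // FC1 header/body
  if (++j < N)
    goto fc0_header; // old FC1 latch: the single back edge to the FC0 header
  return 0;          // common exit
}
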
+ /// Report details on loop fusion opportunities.
+ ///
+ /// This template function can be used to report both successful and missed
+ /// loop fusion opportunities, based on the RemarkKind. The RemarkKind should
+ /// be one of:
+ /// - OptimizationRemarkMissed to report when loop fusion is unsuccessful
+ /// given two valid fusion candidates.
+ /// - OptimizationRemark to report successful fusion of two fusion
+ /// candidates.
+ /// The remarks will be printed using the form:
+ /// <path/filename>:<line number>:<column number>: [<function name>]:
+ /// <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
+ template <typename RemarkKind>
+ void reportLoopFusion(const FusionCandidate &FC0, const FusionCandidate &FC1,
+ llvm::Statistic &Stat) {
+ assert(FC0.Preheader && FC1.Preheader &&
+ "Expecting valid fusion candidates");
+ using namespace ore;
+ ++Stat;
+ ORE.emit(RemarkKind(DEBUG_TYPE, Stat.getName(), FC0.L->getStartLoc(),
+ FC0.Preheader)
+ << "[" << FC0.Preheader->getParent()->getName()
+ << "]: " << NV("Cand1", StringRef(FC0.Preheader->getName()))
+ << " and " << NV("Cand2", StringRef(FC1.Preheader->getName()))
+ << ": " << Stat.getDesc());
+ }
+
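For readability, here is what a remark following the documented format could look like once rendered; every concrete value (path, line, column, function and block names, and the description text) is made up for this illustration.

#include <cstdio>

int main() {
  // Rendering of the documented pattern:
  //   <path/filename>:<line number>:<column number>: [<function name>]:
  //   <Cand1 Preheader> and <Cand2 Preheader>: <Stat Description>
  std::printf("%s:%d:%d: [%s]: %s and %s: %s\n", "example.c", 12, 3,
              "compute", "for.body.preheader", "for.body7.preheader",
              "Loop fusion candidates are not adjacent");
  return 0;
}
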
+ /// Fuse two guarded fusion candidates, creating a new fused loop.
+ ///
+ /// Fusing guarded loops is handled much the same way as fusing non-guarded
+ /// loops. The rewiring of the CFG is slightly different though, because of
+ /// the presence of the guards around the loops and the exit blocks after the
+ /// loop body. As such, the new loop is rewired as follows:
+ /// 1. Keep the guard branch from FC0 and use the non-loop block target
+ /// from the FC1 guard branch.
+ /// 2. Remove the exit block from FC0 (this exit block should be empty
+ /// right now).
+ /// 3. Remove the guard branch for FC1
+ /// 4. Remove the preheader for FC1.
+ /// The exit block successor for the latch of FC0 is updated to be the header
+ /// of FC1 and the non-exit block successor of the latch of FC1 is updated to
+ /// be the header of FC0, thus creating the fused loop.
+ Loop *fuseGuardedLoops(const FusionCandidate &FC0,
+ const FusionCandidate &FC1) {
+ assert(FC0.GuardBranch && FC1.GuardBranch && "Expecting guarded loops");
+
+ BasicBlock *FC0GuardBlock = FC0.GuardBranch->getParent();
+ BasicBlock *FC1GuardBlock = FC1.GuardBranch->getParent();
+ BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
+ BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
BasicBlock *FC0ExitBlockSuccessor = FC0.ExitBlock->getUniqueSuccessor();
-
- // Move instructions from the exit block of FC0 to the beginning of the exit
+
+ // Move instructions from the exit block of FC0 to the beginning of the exit
// block of FC1, in the case that the FC0 loop has not been peeled. In the
// case that FC0 loop is peeled, then move the instructions of the successor
// of the FC0 Exit block to the beginning of the exit block of FC1.
moveInstructionsToTheBeginning(
(FC0.Peeled ? *FC0ExitBlockSuccessor : *FC0.ExitBlock), *FC1.ExitBlock,
DT, PDT, DI);
-
- // Move instructions from the guard block of FC1 to the end of the guard
- // block of FC0.
- moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
-
- assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
-
- SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
-
- ////////////////////////////////////////////////////////////////////////////
- // Update the Loop Guard
- ////////////////////////////////////////////////////////////////////////////
- // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
- // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
- // Thus, one path from the guard goes to the preheader for FC0 (and thus
- // executes the new fused loop) and the other path goes to the NonLoopBlock
- // for FC1 (where FC1 guard would have gone if FC1 was not executed).
- FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
- FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
-
+
+ // Move instructions from the guard block of FC1 to the end of the guard
+ // block of FC0.
+ moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
+
+ assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
+
+ SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
+
+ ////////////////////////////////////////////////////////////////////////////
+ // Update the Loop Guard
+ ////////////////////////////////////////////////////////////////////////////
+ // The guard for FC0 is updated to guard both FC0 and FC1. This is done by
+ // changing the NonLoopGuardBlock for FC0 to the NonLoopGuardBlock for FC1.
+ // Thus, one path from the guard goes to the preheader for FC0 (and thus
+ // executes the new fused loop) and the other path goes to the NonLoopBlock
+ // for FC1 (where FC1 guard would have gone if FC1 was not executed).
+ FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
+ FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
+
BasicBlock *BBToUpdate = FC0.Peeled ? FC0ExitBlockSuccessor : FC0.ExitBlock;
BBToUpdate->getTerminator()->replaceUsesOfWith(FC1GuardBlock, FC1.Header);
- // The guard of FC1 is not necessary anymore.
- FC1.GuardBranch->eraseFromParent();
- new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
-
+ // The guard of FC1 is not necessary anymore.
+ FC1.GuardBranch->eraseFromParent();
+ new UnreachableInst(FC1GuardBlock->getContext(), FC1GuardBlock);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1.Preheader));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1GuardBlock, FC1NonLoopBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0GuardBlock, FC1GuardBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0GuardBlock, FC1NonLoopBlock));
+
if (FC0.Peeled) {
// Remove the Block after the ExitBlock of FC0
TreeUpdates.emplace_back(DominatorTree::UpdateType(
@@ -1610,273 +1610,273 @@ private:
}
assert(pred_empty(FC1GuardBlock) &&
- "Expecting guard block to have no predecessors");
+ "Expecting guard block to have no predecessors");
assert(succ_empty(FC1GuardBlock) &&
- "Expecting guard block to have no successors");
-
- // Remember the phi nodes originally in the header of FC0 in order to rewire
- // them later. However, this is only necessary if the new loop carried
- // values might not dominate the exiting branch. While we do not generally
- // test if this is the case but simply insert intermediate phi nodes, we
- // need to make sure these intermediate phi nodes have different
- // predecessors. To this end, we filter the special case where the exiting
- // block is the latch block of the first loop. Nothing needs to be done
- // anyway as all loop carried values dominate the latch and thereby also the
- // exiting branch.
- // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
-    // (because the loops are rotated). Thus, nothing will ever be added to
- // OriginalFC0PHIs.
- SmallVector<PHINode *, 8> OriginalFC0PHIs;
- if (FC0.ExitingBlock != FC0.Latch)
- for (PHINode &PHI : FC0.Header->phis())
- OriginalFC0PHIs.push_back(&PHI);
-
- assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
-
- // Replace incoming blocks for header PHIs first.
- FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
- FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
-
- // The old exiting block of the first loop (FC0) has to jump to the header
- // of the second as we need to execute the code in the second header block
- // regardless of the trip count. That is, if the trip count is 0, so the
- // back edge is never taken, we still have to execute both loop headers,
- // especially (but not only!) if the second is a do-while style loop.
- // However, doing so might invalidate the phi nodes of the first loop as
- // the new values do only need to dominate their latch and not the exiting
- // predicate. To remedy this potential problem we always introduce phi
- // nodes in the header of the second loop later that select the loop carried
- // value, if the second header was reached through an old latch of the
- // first, or undef otherwise. This is sound as exiting the first implies the
- // second will exit too, __without__ taking the back-edge (their
- // trip-counts are equal after all).
- FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
- FC1.Header);
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
-
- // Remove FC0 Exit Block
- // The exit block for FC0 is no longer needed since control will flow
- // directly to the header of FC1. Since it is an empty block, it can be
- // removed at this point.
- // TODO: In the future, we can handle non-empty exit blocks my merging any
- // instructions from FC0 exit block into FC1 exit block prior to removing
- // the block.
+ "Expecting guard block to have no successors");
+
+ // Remember the phi nodes originally in the header of FC0 in order to rewire
+ // them later. However, this is only necessary if the new loop carried
+ // values might not dominate the exiting branch. While we do not generally
+ // test if this is the case but simply insert intermediate phi nodes, we
+ // need to make sure these intermediate phi nodes have different
+ // predecessors. To this end, we filter the special case where the exiting
+ // block is the latch block of the first loop. Nothing needs to be done
+ // anyway as all loop carried values dominate the latch and thereby also the
+ // exiting branch.
+ // KB: This is no longer necessary because FC0.ExitingBlock == FC0.Latch
+  // (because the loops are rotated). Thus, nothing will ever be added to
+ // OriginalFC0PHIs.
+ SmallVector<PHINode *, 8> OriginalFC0PHIs;
+ if (FC0.ExitingBlock != FC0.Latch)
+ for (PHINode &PHI : FC0.Header->phis())
+ OriginalFC0PHIs.push_back(&PHI);
+
+ assert(OriginalFC0PHIs.empty() && "Expecting OriginalFC0PHIs to be empty!");
+
+ // Replace incoming blocks for header PHIs first.
+ FC1.Preheader->replaceSuccessorsPhiUsesWith(FC0.Preheader);
+ FC0.Latch->replaceSuccessorsPhiUsesWith(FC1.Latch);
+
+ // The old exiting block of the first loop (FC0) has to jump to the header
+ // of the second as we need to execute the code in the second header block
+ // regardless of the trip count. That is, if the trip count is 0, so the
+ // back edge is never taken, we still have to execute both loop headers,
+ // especially (but not only!) if the second is a do-while style loop.
+ // However, doing so might invalidate the phi nodes of the first loop as
+  // the new values only need to dominate their latch and not the exiting
+ // predicate. To remedy this potential problem we always introduce phi
+ // nodes in the header of the second loop later that select the loop carried
+ // value, if the second header was reached through an old latch of the
+ // first, or undef otherwise. This is sound as exiting the first implies the
+ // second will exit too, __without__ taking the back-edge (their
+ // trip-counts are equal after all).
+ FC0.ExitingBlock->getTerminator()->replaceUsesOfWith(FC0.ExitBlock,
+ FC1.Header);
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC0.ExitingBlock, FC0.ExitBlock));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.ExitingBlock, FC1.Header));
+
+ // Remove FC0 Exit Block
+ // The exit block for FC0 is no longer needed since control will flow
+ // directly to the header of FC1. Since it is an empty block, it can be
+ // removed at this point.
+  // TODO: In the future, we can handle non-empty exit blocks by merging any
+ // instructions from FC0 exit block into FC1 exit block prior to removing
+ // the block.
assert(pred_empty(FC0.ExitBlock) && "Expecting exit block to be empty");
- FC0.ExitBlock->getTerminator()->eraseFromParent();
- new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
-
- // Remove FC1 Preheader
- // The pre-header of L1 is not necessary anymore.
+ FC0.ExitBlock->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC0.ExitBlock->getContext(), FC0.ExitBlock);
+
+ // Remove FC1 Preheader
+ // The pre-header of L1 is not necessary anymore.
assert(pred_empty(FC1.Preheader));
- FC1.Preheader->getTerminator()->eraseFromParent();
- new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Delete, FC1.Preheader, FC1.Header));
-
- // Moves the phi nodes from the second to the first loops header block.
- while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
- if (SE.isSCEVable(PHI->getType()))
- SE.forgetValue(PHI);
- if (PHI->hasNUsesOrMore(1))
- PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
- else
- PHI->eraseFromParent();
- }
-
- // Introduce new phi nodes in the second loop header to ensure
- // exiting the first and jumping to the header of the second does not break
- // the SSA property of the phis originally in the first loop. See also the
- // comment above.
- Instruction *L1HeaderIP = &FC1.Header->front();
- for (PHINode *LCPHI : OriginalFC0PHIs) {
- int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
- assert(L1LatchBBIdx >= 0 &&
- "Expected loop carried value to be rewired at this point!");
-
- Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
-
- PHINode *L1HeaderPHI = PHINode::Create(
- LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
- L1HeaderPHI->addIncoming(LCV, FC0.Latch);
- L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
- FC0.ExitingBlock);
-
- LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
- }
-
- // Update the latches
-
- // Replace latch terminator destinations.
- FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
- FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
-
+ FC1.Preheader->getTerminator()->eraseFromParent();
+ new UnreachableInst(FC1.Preheader->getContext(), FC1.Preheader);
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Delete, FC1.Preheader, FC1.Header));
+
+  // Move the phi nodes from the second loop's header block to the first loop's.
+ while (PHINode *PHI = dyn_cast<PHINode>(&FC1.Header->front())) {
+ if (SE.isSCEVable(PHI->getType()))
+ SE.forgetValue(PHI);
+ if (PHI->hasNUsesOrMore(1))
+ PHI->moveBefore(&*FC0.Header->getFirstInsertionPt());
+ else
+ PHI->eraseFromParent();
+ }
+
+ // Introduce new phi nodes in the second loop header to ensure
+ // exiting the first and jumping to the header of the second does not break
+ // the SSA property of the phis originally in the first loop. See also the
+ // comment above.
+ Instruction *L1HeaderIP = &FC1.Header->front();
+ for (PHINode *LCPHI : OriginalFC0PHIs) {
+ int L1LatchBBIdx = LCPHI->getBasicBlockIndex(FC1.Latch);
+ assert(L1LatchBBIdx >= 0 &&
+ "Expected loop carried value to be rewired at this point!");
+
+ Value *LCV = LCPHI->getIncomingValue(L1LatchBBIdx);
+
+ PHINode *L1HeaderPHI = PHINode::Create(
+ LCV->getType(), 2, LCPHI->getName() + ".afterFC0", L1HeaderIP);
+ L1HeaderPHI->addIncoming(LCV, FC0.Latch);
+ L1HeaderPHI->addIncoming(UndefValue::get(LCV->getType()),
+ FC0.ExitingBlock);
+
+ LCPHI->setIncomingValue(L1LatchBBIdx, L1HeaderPHI);
+ }
+
+ // Update the latches
+
+ // Replace latch terminator destinations.
+ FC0.Latch->getTerminator()->replaceUsesOfWith(FC0.Header, FC1.Header);
+ FC1.Latch->getTerminator()->replaceUsesOfWith(FC1.Header, FC0.Header);
+
// Modify the latch branch of FC0 to be unconditional as both successors of
- // the branch are the same.
- simplifyLatchBranch(FC0);
-
- // If FC0.Latch and FC0.ExitingBlock are the same then we have already
- // performed the updates above.
- if (FC0.Latch != FC0.ExitingBlock)
- TreeUpdates.emplace_back(DominatorTree::UpdateType(
- DominatorTree::Insert, FC0.Latch, FC1.Header));
-
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC0.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
- FC1.Latch, FC0.Header));
- TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
- FC1.Latch, FC1.Header));
-
- // All done
- // Apply the updates to the Dominator Tree and cleanup.
-
+ // the branch are the same.
+ simplifyLatchBranch(FC0);
+
+ // If FC0.Latch and FC0.ExitingBlock are the same then we have already
+ // performed the updates above.
+ if (FC0.Latch != FC0.ExitingBlock)
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(
+ DominatorTree::Insert, FC0.Latch, FC1.Header));
+
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC0.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Insert,
+ FC1.Latch, FC0.Header));
+ TreeUpdates.emplace_back(DominatorTree::UpdateType(DominatorTree::Delete,
+ FC1.Latch, FC1.Header));
+
+ // All done
+ // Apply the updates to the Dominator Tree and cleanup.
+
assert(succ_empty(FC1GuardBlock) && "FC1GuardBlock has successors!!");
assert(pred_empty(FC1GuardBlock) && "FC1GuardBlock has predecessors!!");
-
- // Update DT/PDT
- DTU.applyUpdates(TreeUpdates);
-
- LI.removeBlock(FC1GuardBlock);
- LI.removeBlock(FC1.Preheader);
- LI.removeBlock(FC0.ExitBlock);
+
+ // Update DT/PDT
+ DTU.applyUpdates(TreeUpdates);
+
+ LI.removeBlock(FC1GuardBlock);
+ LI.removeBlock(FC1.Preheader);
+ LI.removeBlock(FC0.ExitBlock);
if (FC0.Peeled) {
LI.removeBlock(FC0ExitBlockSuccessor);
DTU.deleteBB(FC0ExitBlockSuccessor);
}
- DTU.deleteBB(FC1GuardBlock);
- DTU.deleteBB(FC1.Preheader);
- DTU.deleteBB(FC0.ExitBlock);
- DTU.flush();
-
- // Is there a way to keep SE up-to-date so we don't need to forget the loops
- // and rebuild the information in subsequent passes of fusion?
- // Note: Need to forget the loops before merging the loop latches, as
- // mergeLatch may remove the only block in FC1.
- SE.forgetLoop(FC1.L);
- SE.forgetLoop(FC0.L);
-
- // Move instructions from FC0.Latch to FC1.Latch.
- // Note: mergeLatch requires an updated DT.
- mergeLatch(FC0, FC1);
-
- // Merge the loops.
+ DTU.deleteBB(FC1GuardBlock);
+ DTU.deleteBB(FC1.Preheader);
+ DTU.deleteBB(FC0.ExitBlock);
+ DTU.flush();
+
+ // Is there a way to keep SE up-to-date so we don't need to forget the loops
+ // and rebuild the information in subsequent passes of fusion?
+ // Note: Need to forget the loops before merging the loop latches, as
+ // mergeLatch may remove the only block in FC1.
+ SE.forgetLoop(FC1.L);
+ SE.forgetLoop(FC0.L);
+
+ // Move instructions from FC0.Latch to FC1.Latch.
+ // Note: mergeLatch requires an updated DT.
+ mergeLatch(FC0, FC1);
+
+ // Merge the loops.
SmallVector<BasicBlock *, 8> Blocks(FC1.L->blocks());
- for (BasicBlock *BB : Blocks) {
- FC0.L->addBlockEntry(BB);
- FC1.L->removeBlockFromLoop(BB);
- if (LI.getLoopFor(BB) != FC1.L)
- continue;
- LI.changeLoopFor(BB, FC0.L);
- }
+ for (BasicBlock *BB : Blocks) {
+ FC0.L->addBlockEntry(BB);
+ FC1.L->removeBlockFromLoop(BB);
+ if (LI.getLoopFor(BB) != FC1.L)
+ continue;
+ LI.changeLoopFor(BB, FC0.L);
+ }
while (!FC1.L->isInnermost()) {
- const auto &ChildLoopIt = FC1.L->begin();
- Loop *ChildLoop = *ChildLoopIt;
- FC1.L->removeChildLoop(ChildLoopIt);
- FC0.L->addChildLoop(ChildLoop);
- }
-
- // Delete the now empty loop L1.
- LI.erase(FC1.L);
-
-#ifndef NDEBUG
- assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
- assert(PDT.verify());
- LI.verify(DT);
- SE.verify();
-#endif
-
- LLVM_DEBUG(dbgs() << "Fusion done:\n");
-
- return FC0.L;
- }
-};
-
-struct LoopFuseLegacy : public FunctionPass {
-
- static char ID;
-
- LoopFuseLegacy() : FunctionPass(ID) {
- initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<PostDominatorTreeWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<DependenceAnalysisWrapperPass>();
+ const auto &ChildLoopIt = FC1.L->begin();
+ Loop *ChildLoop = *ChildLoopIt;
+ FC1.L->removeChildLoop(ChildLoopIt);
+ FC0.L->addChildLoop(ChildLoop);
+ }
+
+ // Delete the now empty loop L1.
+ LI.erase(FC1.L);
+
+#ifndef NDEBUG
+ assert(!verifyFunction(*FC0.Header->getParent(), &errs()));
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+ assert(PDT.verify());
+ LI.verify(DT);
+ SE.verify();
+#endif
+
+ LLVM_DEBUG(dbgs() << "Fusion done:\n");
+
+ return FC0.L;
+ }
+};
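Note on the hunk above: the fusion routine never mutates the dominator trees directly; every CFG edge change is first recorded as a DominatorTree::UpdateType in TreeUpdates and the whole batch is handed to DTU.applyUpdates() once the rewiring is done. The following is a minimal editorial sketch of that batching pattern, not part of the patch; retargetEdge and the parameter names are hypothetical, while the LLVM APIs shown are the ones used above.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/DomTreeUpdater.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/IR/Dominators.h"

    // Redirect the From -> OldTo edge to From -> NewTo and keep DT/PDT consistent.
    static void retargetEdge(llvm::DomTreeUpdater &DTU, llvm::BasicBlock *From,
                             llvm::BasicBlock *OldTo, llvm::BasicBlock *NewTo) {
      // 1. Rewrite the CFG first.
      From->getTerminator()->replaceUsesOfWith(OldTo, NewTo);
      // 2. Describe the change as edge deletions/insertions.
      llvm::SmallVector<llvm::DominatorTree::UpdateType, 2> Updates;
      Updates.push_back({llvm::DominatorTree::Delete, From, OldTo});
      Updates.push_back({llvm::DominatorTree::Insert, From, NewTo});
      // 3. Apply the whole batch; the updater propagates it to DT (and PDT).
      DTU.applyUpdates(Updates);
    }

Batching matters here because several edges change before the dominator information is consistent again; applying the updates one by one midway through the rewiring would let the verifiers above fire.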
+
+struct LoopFuseLegacy : public FunctionPass {
+
+ static char ID;
+
+ LoopFuseLegacy() : FunctionPass(ID) {
+ initializeLoopFuseLegacyPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
-
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
const TargetTransformInfo &TTI =
getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
const DataLayout &DL = F.getParent()->getDataLayout();
-
+
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
- return LF.fuseLoops(F);
- }
-};
-} // namespace
-
-PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &DI = AM.getResult<DependenceAnalysis>(F);
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ return LF.fuseLoops(F);
+ }
+};
+} // namespace
+
+PreservedAnalyses LoopFusePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &DI = AM.getResult<DependenceAnalysis>(F);
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
const TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
const DataLayout &DL = F.getParent()->getDataLayout();
-
+
LoopFuser LF(LI, DT, DI, SE, PDT, ORE, DL, AC, TTI);
- bool Changed = LF.fuseLoops(F);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-char LoopFuseLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+ bool Changed = LF.fuseLoops(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
+char LoopFuseLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
-
-FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
+INITIALIZE_PASS_END(LoopFuseLegacy, "loop-fusion", "Loop Fusion", false, false)
+
+FunctionPass *llvm::createLoopFusePass() { return new LoopFuseLegacy(); }
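Before moving on to the next file, the transformation implemented in LoopFuse.cpp can be pictured at the source level roughly as below. This is an editorial sketch in plain C++, not part of the patch; before() and after() are hypothetical names, and the legality conditions (equal trip counts, adjacency as asserted above, and no backward dependence between the bodies) are the ones the pass establishes before fusing.

    // Two adjacent guarded loops with the same trip count...
    void before(int *a, int *b, int n) {
      if (n > 0)                      // FC0 guard
        for (int i = 0; i < n; ++i)
          a[i] = i;
      if (n > 0)                      // FC1 guard; made redundant by fusion
        for (int i = 0; i < n; ++i)
          b[i] = a[i] + 1;
    }

    // ...become a single guarded loop with one header and one latch.
    void after(int *a, int *b, int n) {
      if (n > 0)
        for (int i = 0; i < n; ++i) {
          a[i] = i;
          b[i] = a[i] + 1;            // safe only because no backward dependence exists
        }
    }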
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 7c55efb78d..8064c02e2b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1,117 +1,117 @@
-//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements an idiom recognizer that transforms simple loops into a
-// non-loop form. In cases that this kicks in, it can be a significant
-// performance win.
-//
-// If compiling for code size we avoid idiom recognition if the resulting
-// code could be larger than the code for the original loop. One way this could
-// happen is if the loop is not removable after idiom recognition due to the
-// presence of non-idiom instructions. The initial implementation of the
-// heuristics applies to idioms in multi-block loops.
-//
-//===----------------------------------------------------------------------===//
-//
-// TODO List:
-//
-// Future loop memory idioms to recognize:
-// memcmp, memmove, strlen, etc.
-// Future floating point idioms to recognize in -ffast-math mode:
-// fpowi
-// Future integer operation idioms to recognize:
-// ctpop
-//
-// Beware that isel's default lowering for ctpop is highly inefficient for
-// i64 and larger types when i64 is legal and the value has few bits set. It
-// would be good to enhance isel to emit a loop for ctpop in this case.
-//
-// This could recognize common matrix multiplies and dot product idioms and
-// replace them with calls to BLAS (if linked in??).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements an idiom recognizer that transforms simple loops into a
+// non-loop form. In cases that this kicks in, it can be a significant
+// performance win.
+//
+// If compiling for code size we avoid idiom recognition if the resulting
+// code could be larger than the code for the original loop. One way this could
+// happen is if the loop is not removable after idiom recognition due to the
+// presence of non-idiom instructions. The initial implementation of the
+// heuristics applies to idioms in multi-block loops.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO List:
+//
+// Future loop memory idioms to recognize:
+// memcmp, memmove, strlen, etc.
+// Future floating point idioms to recognize in -ffast-math mode:
+// fpowi
+// Future integer operation idioms to recognize:
+// ctpop
+//
+// Beware that isel's default lowering for ctpop is highly inefficient for
+// i64 and larger types when i64 is legal and the value has few bits set. It
+// would be good to enhance isel to emit a loop for ctpop in this case.
+//
+// This could recognize common matrix multiplies and dot product idioms and
+// replace them with calls to BLAS (if linked in??).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CmpInstAnalysis.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-idiom"
-
-STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
-STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-idiom"
+
+STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
+STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(
NumShiftUntilBitTest,
"Number of uncountable loops recognized as 'shift until bitttest' idiom");
-
+
bool DisableLIRP::All;
static cl::opt<bool, true>
DisableLIRPAll("disable-" DEBUG_TYPE "-all",
@@ -135,841 +135,841 @@ static cl::opt<bool, true>
cl::location(DisableLIRP::Memcpy), cl::init(false),
cl::ReallyHidden);
-static cl::opt<bool> UseLIRCodeSizeHeurs(
- "use-lir-code-size-heurs",
- cl::desc("Use loop idiom recognition code size heuristics when compiling"
- "with -Os/-Oz"),
- cl::init(true), cl::Hidden);
-
-namespace {
-
-class LoopIdiomRecognize {
- Loop *CurLoop = nullptr;
- AliasAnalysis *AA;
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- const TargetTransformInfo *TTI;
- const DataLayout *DL;
- OptimizationRemarkEmitter &ORE;
- bool ApplyCodeSizeHeuristics;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
-
-public:
- explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
- LoopInfo *LI, ScalarEvolution *SE,
- TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, MemorySSA *MSSA,
- const DataLayout *DL,
- OptimizationRemarkEmitter &ORE)
- : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
- bool runOnLoop(Loop *L);
-
-private:
- using StoreList = SmallVector<StoreInst *, 8>;
- using StoreListMap = MapVector<Value *, StoreList>;
-
- StoreListMap StoreRefsForMemset;
- StoreListMap StoreRefsForMemsetPattern;
- StoreList StoreRefsForMemcpy;
- bool HasMemset;
- bool HasMemsetPattern;
- bool HasMemcpy;
-
- /// Return code for isLegalStore()
- enum LegalStoreKind {
- None = 0,
- Memset,
- MemsetPattern,
- Memcpy,
- UnorderedAtomicMemcpy,
- DontUse // Dummy retval never to be used. Allows catching errors in retval
- // handling.
- };
-
- /// \name Countable Loop Idiom Handling
- /// @{
-
- bool runOnCountableLoop();
- bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
- SmallVectorImpl<BasicBlock *> &ExitBlocks);
-
- void collectStores(BasicBlock *BB);
- LegalStoreKind isLegalStore(StoreInst *SI);
- enum class ForMemset { No, Yes };
- bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
- ForMemset For);
- bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
-
- bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
- MaybeAlign StoreAlignment, Value *StoredVal,
- Instruction *TheStore,
- SmallPtrSetImpl<Instruction *> &Stores,
- const SCEVAddRecExpr *Ev, const SCEV *BECount,
- bool NegStride, bool IsLoopMemset = false);
- bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
- bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
- bool IsLoopMemset = false);
-
- /// @}
- /// \name Noncountable Loop Idiom Handling
- /// @{
-
- bool runOnNoncountableLoop();
-
- bool recognizePopcount();
- void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
- PHINode *CntPhi, Value *Var);
- bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
- void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
- Instruction *CntInst, PHINode *CntPhi,
- Value *Var, Instruction *DefX,
- const DebugLoc &DL, bool ZeroCheck,
- bool IsCntPhiUsedOutsideLoop);
-
+static cl::opt<bool> UseLIRCodeSizeHeurs(
+ "use-lir-code-size-heurs",
+    cl::desc("Use loop idiom recognition code size heuristics when compiling "
+ "with -Os/-Oz"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
+class LoopIdiomRecognize {
+ Loop *CurLoop = nullptr;
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ const TargetTransformInfo *TTI;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter &ORE;
+ bool ApplyCodeSizeHeuristics;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+
+public:
+ explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
+ LoopInfo *LI, ScalarEvolution *SE,
+ TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, MemorySSA *MSSA,
+ const DataLayout *DL,
+ OptimizationRemarkEmitter &ORE)
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+ bool runOnLoop(Loop *L);
+
+private:
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using StoreListMap = MapVector<Value *, StoreList>;
+
+ StoreListMap StoreRefsForMemset;
+ StoreListMap StoreRefsForMemsetPattern;
+ StoreList StoreRefsForMemcpy;
+ bool HasMemset;
+ bool HasMemsetPattern;
+ bool HasMemcpy;
+
+ /// Return code for isLegalStore()
+ enum LegalStoreKind {
+ None = 0,
+ Memset,
+ MemsetPattern,
+ Memcpy,
+ UnorderedAtomicMemcpy,
+ DontUse // Dummy retval never to be used. Allows catching errors in retval
+ // handling.
+ };
+
+ /// \name Countable Loop Idiom Handling
+ /// @{
+
+ bool runOnCountableLoop();
+ bool runOnLoopBlock(BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ void collectStores(BasicBlock *BB);
+ LegalStoreKind isLegalStore(StoreInst *SI);
+ enum class ForMemset { No, Yes };
+ bool processLoopStores(SmallVectorImpl<StoreInst *> &SL, const SCEV *BECount,
+ ForMemset For);
+ bool processLoopMemSet(MemSetInst *MSI, const SCEV *BECount);
+
+ bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
+ MaybeAlign StoreAlignment, Value *StoredVal,
+ Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ const SCEVAddRecExpr *Ev, const SCEV *BECount,
+ bool NegStride, bool IsLoopMemset = false);
+ bool processLoopStoreOfLoopLoad(StoreInst *SI, const SCEV *BECount);
+ bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
+ bool IsLoopMemset = false);
+
+ /// @}
+ /// \name Noncountable Loop Idiom Handling
+ /// @{
+
+ bool runOnNoncountableLoop();
+
+ bool recognizePopcount();
+ void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
+ PHINode *CntPhi, Value *Var);
+ bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
+ void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
+ Instruction *CntInst, PHINode *CntPhi,
+ Value *Var, Instruction *DefX,
+ const DebugLoc &DL, bool ZeroCheck,
+ bool IsCntPhiUsedOutsideLoop);
+
bool recognizeShiftUntilBitTest();
- /// @}
-};
-
-class LoopIdiomRecognizeLegacyPass : public LoopPass {
-public:
- static char ID;
-
- explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
- initializeLoopIdiomRecognizeLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ /// @}
+};
+
+class LoopIdiomRecognizeLegacyPass : public LoopPass {
+public:
+ static char ID;
+
+ explicit LoopIdiomRecognizeLegacyPass() : LoopPass(ID) {
+ initializeLoopIdiomRecognizeLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
if (DisableLIRP::All)
return false;
- if (skipLoop(L))
- return false;
-
- AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
-
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
-
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
- return LIR.runOnLoop(L);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopIdiomRecognizeLegacyPass::ID = 0;
-
-PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+ if (skipLoop(L))
+ return false;
+
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
+
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
+ return LIR.runOnLoop(L);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopIdiomRecognizeLegacyPass::ID = 0;
+
+PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
if (DisableLIRP::All)
return PreservedAnalyses::all();
- const auto *DL = &L.getHeader()->getModule()->getDataLayout();
-
- // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
- AR.MSSA, DL, ORE);
- if (!LIR.runOnLoop(&L))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
- "Recognize loop idioms", false, false)
-
-Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
-
-static void deleteDeadInstruction(Instruction *I) {
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- I->eraseFromParent();
-}
-
-//===----------------------------------------------------------------------===//
-//
-// Implementation of LoopIdiomRecognize
-//
-//===----------------------------------------------------------------------===//
-
-bool LoopIdiomRecognize::runOnLoop(Loop *L) {
- CurLoop = L;
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!L->getLoopPreheader())
- return false;
-
- // Disable loop idiom recognition if the function's name is a common idiom.
- StringRef Name = L->getHeader()->getParent()->getName();
- if (Name == "memset" || Name == "memcpy")
- return false;
-
- // Determine if code size heuristics need to be applied.
- ApplyCodeSizeHeuristics =
- L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
-
- HasMemset = TLI->has(LibFunc_memset);
- HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
- HasMemcpy = TLI->has(LibFunc_memcpy);
-
- if (HasMemset || HasMemsetPattern || HasMemcpy)
- if (SE->hasLoopInvariantBackedgeTakenCount(L))
- return runOnCountableLoop();
-
- return runOnNoncountableLoop();
-}
-
-bool LoopIdiomRecognize::runOnCountableLoop() {
- const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
- assert(!isa<SCEVCouldNotCompute>(BECount) &&
- "runOnCountableLoop() called on a loop without a predictable"
- "backedge-taken count");
-
- // If this loop executes exactly one time, then it should be peeled, not
- // optimized by this pass.
- if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
- if (BECst->getAPInt() == 0)
- return false;
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- CurLoop->getUniqueExitBlocks(ExitBlocks);
-
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
- << CurLoop->getHeader()->getParent()->getName()
- << "] Countable Loop %" << CurLoop->getHeader()->getName()
- << "\n");
-
- // The following transforms hoist stores/memsets into the loop pre-header.
- // Give up if the loop has instructions that may throw.
- SimpleLoopSafetyInfo SafetyInfo;
- SafetyInfo.computeLoopSafetyInfo(CurLoop);
- if (SafetyInfo.anyBlockMayThrow())
- return false;
-
- bool MadeChange = false;
-
- // Scan all the blocks in the loop that are not in subloops.
- for (auto *BB : CurLoop->getBlocks()) {
- // Ignore blocks in subloops.
- if (LI->getLoopFor(BB) != CurLoop)
- continue;
-
- MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
- }
- return MadeChange;
-}
-
-static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
- const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
- return ConstStride->getAPInt();
-}
-
-/// getMemSetPatternValue - If a strided store of the specified value is safe to
-/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
-/// be passed in. Otherwise, return null.
-///
-/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
-/// just replicate their input array and then pass on to memset_pattern16.
-static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
- // FIXME: This could check for UndefValue because it can be merged into any
- // other valid pattern.
-
- // If the value isn't a constant, we can't promote it to being in a constant
- // array. We could theoretically do a store to an alloca or something, but
- // that doesn't seem worthwhile.
- Constant *C = dyn_cast<Constant>(V);
- if (!C)
- return nullptr;
-
- // Only handle simple values that are a power of two bytes in size.
- uint64_t Size = DL->getTypeSizeInBits(V->getType());
- if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
- return nullptr;
-
- // Don't care enough about darwin/ppc to implement this.
- if (DL->isBigEndian())
- return nullptr;
-
- // Convert to size in bytes.
- Size /= 8;
-
- // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
- // if the top and bottom are the same (e.g. for vectors and large integers).
- if (Size > 16)
- return nullptr;
-
- // If the constant is exactly 16 bytes, just use it.
- if (Size == 16)
- return C;
-
- // Otherwise, we'll use an array of the constants.
- unsigned ArraySize = 16 / Size;
- ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
- return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
-}
-
-LoopIdiomRecognize::LegalStoreKind
-LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
- // Don't touch volatile stores.
- if (SI->isVolatile())
- return LegalStoreKind::None;
- // We only want simple or unordered-atomic stores.
- if (!SI->isUnordered())
- return LegalStoreKind::None;
-
- // Avoid merging nontemporal stores.
- if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return LegalStoreKind::None;
-
- Value *StoredVal = SI->getValueOperand();
- Value *StorePtr = SI->getPointerOperand();
-
+ const auto *DL = &L.getHeader()->getModule()->getDataLayout();
+
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
+ AR.MSSA, DL, ORE);
+ if (!LIR.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopIdiomRecognizeLegacyPass, "loop-idiom",
+ "Recognize loop idioms", false, false)
+
+Pass *llvm::createLoopIdiomPass() { return new LoopIdiomRecognizeLegacyPass(); }
+
+static void deleteDeadInstruction(Instruction *I) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L) {
+ CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
+ // Disable loop idiom recognition if the function's name is a common idiom.
+ StringRef Name = L->getHeader()->getParent()->getName();
+ if (Name == "memset" || Name == "memcpy")
+ return false;
+
+ // Determine if code size heuristics need to be applied.
+ ApplyCodeSizeHeuristics =
+ L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
+
+ HasMemset = TLI->has(LibFunc_memset);
+ HasMemsetPattern = TLI->has(LibFunc_memset_pattern16);
+ HasMemcpy = TLI->has(LibFunc_memcpy);
+
+ if (HasMemset || HasMemsetPattern || HasMemcpy)
+ if (SE->hasLoopInvariantBackedgeTakenCount(L))
+ return runOnCountableLoop();
+
+ return runOnNoncountableLoop();
+}
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+ const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
+ assert(!isa<SCEVCouldNotCompute>(BECount) &&
+         "runOnCountableLoop() called on a loop without a predictable "
+ "backedge-taken count");
+
+ // If this loop executes exactly one time, then it should be peeled, not
+ // optimized by this pass.
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ if (BECst->getAPInt() == 0)
+ return false;
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Countable Loop %" << CurLoop->getHeader()->getName()
+ << "\n");
+
+ // The following transforms hoist stores/memsets into the loop pre-header.
+ // Give up if the loop has instructions that may throw.
+ SimpleLoopSafetyInfo SafetyInfo;
+ SafetyInfo.computeLoopSafetyInfo(CurLoop);
+ if (SafetyInfo.anyBlockMayThrow())
+ return false;
+
+ bool MadeChange = false;
+
+ // Scan all the blocks in the loop that are not in subloops.
+ for (auto *BB : CurLoop->getBlocks()) {
+ // Ignore blocks in subloops.
+ if (LI->getLoopFor(BB) != CurLoop)
+ continue;
+
+ MadeChange |= runOnLoopBlock(BB, BECount, ExitBlocks);
+ }
+ return MadeChange;
+}
+
+static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
+ const SCEVConstant *ConstStride = cast<SCEVConstant>(StoreEv->getOperand(1));
+ return ConstStride->getAPInt();
+}
+
+/// getMemSetPatternValue - If a strided store of the specified value is safe to
+/// turn into a memset_pattern16, return a ConstantArray of 16 bytes that should
+/// be passed in. Otherwise, return null.
+///
+/// Note that we don't ever attempt to use memset_pattern8 or 4, because these
+/// just replicate their input array and then pass on to memset_pattern16.
+static Constant *getMemSetPatternValue(Value *V, const DataLayout *DL) {
+ // FIXME: This could check for UndefValue because it can be merged into any
+ // other valid pattern.
+
+ // If the value isn't a constant, we can't promote it to being in a constant
+ // array. We could theoretically do a store to an alloca or something, but
+ // that doesn't seem worthwhile.
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return nullptr;
+
+ // Only handle simple values that are a power of two bytes in size.
+ uint64_t Size = DL->getTypeSizeInBits(V->getType());
+ if (Size == 0 || (Size & 7) || (Size & (Size - 1)))
+ return nullptr;
+
+ // Don't care enough about darwin/ppc to implement this.
+ if (DL->isBigEndian())
+ return nullptr;
+
+ // Convert to size in bytes.
+ Size /= 8;
+
+ // TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
+ // if the top and bottom are the same (e.g. for vectors and large integers).
+ if (Size > 16)
+ return nullptr;
+
+ // If the constant is exactly 16 bytes, just use it.
+ if (Size == 16)
+ return C;
+
+ // Otherwise, we'll use an array of the constants.
+ unsigned ArraySize = 16 / Size;
+ ArrayType *AT = ArrayType::get(V->getType(), ArraySize);
+ return ConstantArray::get(AT, std::vector<Constant *>(ArraySize, C));
+}
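As a concrete reading of the doc comment above (editorial sketch, not part of the patch): a stored constant smaller than 16 bytes is replicated out to the 16 bytes memset_pattern16 expects, so the i32 0x01020304 store mentioned earlier yields a ConstantArray of four copies of that value. In plain C++ terms, with patternFor as a hypothetical stand-in:

    #include <array>
    #include <cstdint>

    // 4-byte value -> 16-byte pattern, mirroring the ArraySize = 16 / Size
    // replication getMemSetPatternValue performs for small constants.
    std::array<uint32_t, 4> patternFor(uint32_t V) {
      return {V, V, V, V};
    }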
+
+LoopIdiomRecognize::LegalStoreKind
+LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
+ // Don't touch volatile stores.
+ if (SI->isVolatile())
+ return LegalStoreKind::None;
+ // We only want simple or unordered-atomic stores.
+ if (!SI->isUnordered())
+ return LegalStoreKind::None;
+
+ // Avoid merging nontemporal stores.
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return LegalStoreKind::None;
+
+ Value *StoredVal = SI->getValueOperand();
+ Value *StorePtr = SI->getPointerOperand();
+
// Don't convert stores of non-integral pointer types to memsets (which stores
// integers).
if (DL->isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
return LegalStoreKind::None;
- // Reject stores that are so large that they overflow an unsigned.
+ // Reject stores that are so large that they overflow an unsigned.
// When storing out scalable vectors we bail out for now, since the code
// below currently only works for constant strides.
TypeSize SizeInBits = DL->getTypeSizeInBits(StoredVal->getType());
if (SizeInBits.isScalable() || (SizeInBits.getFixedSize() & 7) ||
(SizeInBits.getFixedSize() >> 32) != 0)
- return LegalStoreKind::None;
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided store. If we have something else, it's a
- // random store we can't handle.
- const SCEVAddRecExpr *StoreEv =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
- return LegalStoreKind::None;
-
- // Check to see if we have a constant stride.
- if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
- return LegalStoreKind::None;
-
- // See if the store can be turned into a memset.
-
- // If the stored value is a byte-wise value (like i32 -1), then it may be
- // turned into a memset of i8 -1, assuming that all the consecutive bytes
- // are stored. A store of i32 0x01020304 can never be turned into a memset,
- // but it can be turned into memset_pattern if the target supports it.
- Value *SplatValue = isBytewiseValue(StoredVal, *DL);
- Constant *PatternValue = nullptr;
-
- // Note: memset and memset_pattern on unordered-atomic is yet not supported
- bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
-
- // If we're allowed to form a memset, and the stored value would be
- // acceptable for memset, use it.
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *StoreEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ if (!StoreEv || StoreEv->getLoop() != CurLoop || !StoreEv->isAffine())
+ return LegalStoreKind::None;
+
+ // Check to see if we have a constant stride.
+ if (!isa<SCEVConstant>(StoreEv->getOperand(1)))
+ return LegalStoreKind::None;
+
+ // See if the store can be turned into a memset.
+
+ // If the stored value is a byte-wise value (like i32 -1), then it may be
+ // turned into a memset of i8 -1, assuming that all the consecutive bytes
+ // are stored. A store of i32 0x01020304 can never be turned into a memset,
+ // but it can be turned into memset_pattern if the target supports it.
+ Value *SplatValue = isBytewiseValue(StoredVal, *DL);
+ Constant *PatternValue = nullptr;
+
+  // Note: memset and memset_pattern on unordered-atomic stores are not yet supported
+ bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
+
+ // If we're allowed to form a memset, and the stored value would be
+ // acceptable for memset, use it.
if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
- // Verify that the stored value is loop invariant. If not, we can't
- // promote the memset.
- CurLoop->isLoopInvariant(SplatValue)) {
- // It looks like we can use SplatValue.
- return LegalStoreKind::Memset;
+ // Verify that the stored value is loop invariant. If not, we can't
+ // promote the memset.
+ CurLoop->isLoopInvariant(SplatValue)) {
+ // It looks like we can use SplatValue.
+ return LegalStoreKind::Memset;
} else if (!UnorderedAtomic && HasMemsetPattern && !DisableLIRP::Memset &&
- // Don't create memset_pattern16s with address spaces.
- StorePtr->getType()->getPointerAddressSpace() == 0 &&
- (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
- // It looks like we can use PatternValue!
- return LegalStoreKind::MemsetPattern;
- }
-
- // Otherwise, see if the store can be turned into a memcpy.
+ // Don't create memset_pattern16s with address spaces.
+ StorePtr->getType()->getPointerAddressSpace() == 0 &&
+ (PatternValue = getMemSetPatternValue(StoredVal, DL))) {
+ // It looks like we can use PatternValue!
+ return LegalStoreKind::MemsetPattern;
+ }
+
+ // Otherwise, see if the store can be turned into a memcpy.
if (HasMemcpy && !DisableLIRP::Memcpy) {
- // Check to see if the stride matches the size of the store. If so, then we
- // know that every byte is touched in the loop.
- APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
- if (StoreSize != Stride && StoreSize != -Stride)
- return LegalStoreKind::None;
-
- // The store must be feeding a non-volatile load.
- LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
-
- // Only allow non-volatile loads
- if (!LI || LI->isVolatile())
- return LegalStoreKind::None;
- // Only allow simple or unordered-atomic loads
- if (!LI->isUnordered())
- return LegalStoreKind::None;
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided load. If we have something else, it's a
- // random load we can't handle.
- const SCEVAddRecExpr *LoadEv =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
- if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
- return LegalStoreKind::None;
-
- // The store and load must share the same stride.
- if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
- return LegalStoreKind::None;
-
- // Success. This store can be converted into a memcpy.
- UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
- return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
- : LegalStoreKind::Memcpy;
- }
- // This store can't be transformed into a memset/memcpy.
- return LegalStoreKind::None;
-}
-
-void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
- StoreRefsForMemset.clear();
- StoreRefsForMemsetPattern.clear();
- StoreRefsForMemcpy.clear();
- for (Instruction &I : *BB) {
- StoreInst *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- continue;
-
- // Make sure this is a strided store with a constant stride.
- switch (isLegalStore(SI)) {
- case LegalStoreKind::None:
- // Nothing to do
- break;
- case LegalStoreKind::Memset: {
- // Find the base pointer.
+ // Check to see if the stride matches the size of the store. If so, then we
+ // know that every byte is touched in the loop.
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ if (StoreSize != Stride && StoreSize != -Stride)
+ return LegalStoreKind::None;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand());
+
+ // Only allow non-volatile loads
+ if (!LI || LI->isVolatile())
+ return LegalStoreKind::None;
+ // Only allow simple or unordered-atomic loads
+ if (!LI->isUnordered())
+ return LegalStoreKind::None;
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+ if (!LoadEv || LoadEv->getLoop() != CurLoop || !LoadEv->isAffine())
+ return LegalStoreKind::None;
+
+ // The store and load must share the same stride.
+ if (StoreEv->getOperand(1) != LoadEv->getOperand(1))
+ return LegalStoreKind::None;
+
+ // Success. This store can be converted into a memcpy.
+ UnorderedAtomic = UnorderedAtomic || LI->isAtomic();
+ return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
+ : LegalStoreKind::Memcpy;
+ }
+ // This store can't be transformed into a memset/memcpy.
+ return LegalStoreKind::None;
+}
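// A hedged, illustrative sketch (hypothetical function, not taken from this
// diff) of the kind of source loop whose store the LegalStoreKind::Memcpy
// path above is looking for: a simple store fed by a non-volatile load with
// the same stride.
void copyLoop(int *A, const int *B, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    A[i] = B[i]; // strided store of a same-stride load -> memcpy candidate
}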
+
+void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
+ StoreRefsForMemset.clear();
+ StoreRefsForMemsetPattern.clear();
+ StoreRefsForMemcpy.clear();
+ for (Instruction &I : *BB) {
+ StoreInst *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ continue;
+
+ // Make sure this is a strided store with a constant stride.
+ switch (isLegalStore(SI)) {
+ case LegalStoreKind::None:
+ // Nothing to do
+ break;
+ case LegalStoreKind::Memset: {
+ // Find the base pointer.
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
- StoreRefsForMemset[Ptr].push_back(SI);
- } break;
- case LegalStoreKind::MemsetPattern: {
- // Find the base pointer.
+ StoreRefsForMemset[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::MemsetPattern: {
+ // Find the base pointer.
Value *Ptr = getUnderlyingObject(SI->getPointerOperand());
- StoreRefsForMemsetPattern[Ptr].push_back(SI);
- } break;
- case LegalStoreKind::Memcpy:
- case LegalStoreKind::UnorderedAtomicMemcpy:
- StoreRefsForMemcpy.push_back(SI);
- break;
- default:
- assert(false && "unhandled return value");
- break;
- }
- }
-}
-
-/// runOnLoopBlock - Process the specified block, which lives in a counted loop
-/// with the specified backedge count. This block is known to be in the current
-/// loop and not in any subloops.
-bool LoopIdiomRecognize::runOnLoopBlock(
- BasicBlock *BB, const SCEV *BECount,
- SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- // We can only promote stores in this block if they are unconditionally
- // executed in the loop. For a block to be unconditionally executed, it has
- // to dominate all the exit blocks of the loop. Verify this now.
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (!DT->dominates(BB, ExitBlocks[i]))
- return false;
-
- bool MadeChange = false;
- // Look for store instructions, which may be optimized to memset/memcpy.
- collectStores(BB);
-
- // Look for a single store or sets of stores with a common base, which can be
- // optimized into a memset (memset_pattern). The latter most commonly happens
- // with structs and hand-unrolled loops.
- for (auto &SL : StoreRefsForMemset)
- MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
-
- for (auto &SL : StoreRefsForMemsetPattern)
- MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
-
- // Optimize the store into a memcpy, if it feeds a similarly strided load.
- for (auto &SI : StoreRefsForMemcpy)
- MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
-
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
- // Look for memset instructions, which may be optimized to a larger memset.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
- WeakTrackingVH InstPtr(&*I);
- if (!processLoopMemSet(MSI, BECount))
- continue;
- MadeChange = true;
-
- // If processing the memset invalidated our iterator, start over from the
- // top of the block.
- if (!InstPtr)
- I = BB->begin();
- continue;
- }
- }
-
- return MadeChange;
-}
-
-/// See if this store(s) can be promoted to a memset.
-bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
- const SCEV *BECount, ForMemset For) {
- // Try to find consecutive stores that can be transformed into memsets.
- SetVector<StoreInst *> Heads, Tails;
- SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
-
- // Do a quadratic search on all of the given stores and find
- // all of the pairs of stores that follow each other.
- SmallVector<unsigned, 16> IndexQueue;
- for (unsigned i = 0, e = SL.size(); i < e; ++i) {
- assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
-
- Value *FirstStoredVal = SL[i]->getValueOperand();
- Value *FirstStorePtr = SL[i]->getPointerOperand();
- const SCEVAddRecExpr *FirstStoreEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
- APInt FirstStride = getStoreStride(FirstStoreEv);
- unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
-
- // See if we can optimize just this store in isolation.
- if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
- Heads.insert(SL[i]);
- continue;
- }
-
- Value *FirstSplatValue = nullptr;
- Constant *FirstPatternValue = nullptr;
-
- if (For == ForMemset::Yes)
- FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
- else
- FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
-
- assert((FirstSplatValue || FirstPatternValue) &&
- "Expected either splat value or pattern value.");
-
- IndexQueue.clear();
- // If a store has multiple consecutive store candidates, search Stores
- // array according to the sequence: from i+1 to e, then from i-1 to 0.
- // This is because pairing with the immediately succeeding or preceding
- // candidate usually creates the best chance of finding a memset opportunity.
- unsigned j = 0;
- for (j = i + 1; j < e; ++j)
- IndexQueue.push_back(j);
- for (j = i; j > 0; --j)
- IndexQueue.push_back(j - 1);
-
- for (auto &k : IndexQueue) {
- assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
- Value *SecondStorePtr = SL[k]->getPointerOperand();
- const SCEVAddRecExpr *SecondStoreEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
- APInt SecondStride = getStoreStride(SecondStoreEv);
-
- if (FirstStride != SecondStride)
- continue;
-
- Value *SecondStoredVal = SL[k]->getValueOperand();
- Value *SecondSplatValue = nullptr;
- Constant *SecondPatternValue = nullptr;
-
- if (For == ForMemset::Yes)
- SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
- else
- SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
-
- assert((SecondSplatValue || SecondPatternValue) &&
- "Expected either splat value or pattern value.");
-
- if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
- if (For == ForMemset::Yes) {
- if (isa<UndefValue>(FirstSplatValue))
- FirstSplatValue = SecondSplatValue;
- if (FirstSplatValue != SecondSplatValue)
- continue;
- } else {
- if (isa<UndefValue>(FirstPatternValue))
- FirstPatternValue = SecondPatternValue;
- if (FirstPatternValue != SecondPatternValue)
- continue;
- }
- Tails.insert(SL[k]);
- Heads.insert(SL[i]);
- ConsecutiveChain[SL[i]] = SL[k];
- break;
- }
- }
- }
-
- // We may run into multiple chains that merge into a single chain. We mark the
- // stores that we transformed so that we don't visit the same store twice.
- SmallPtrSet<Value *, 16> TransformedStores;
- bool Changed = false;
-
- // For stores that start but don't end a link in the chain:
- for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
- it != e; ++it) {
- if (Tails.count(*it))
- continue;
-
- // We found a store instr that starts a chain. Now follow the chain and try
- // to transform it.
- SmallPtrSet<Instruction *, 8> AdjacentStores;
- StoreInst *I = *it;
-
- StoreInst *HeadStore = I;
- unsigned StoreSize = 0;
-
- // Collect the chain into a list.
- while (Tails.count(I) || Heads.count(I)) {
- if (TransformedStores.count(I))
- break;
- AdjacentStores.insert(I);
-
- StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
- // Move to the next value in the chain.
- I = ConsecutiveChain[I];
- }
-
- Value *StoredVal = HeadStore->getValueOperand();
- Value *StorePtr = HeadStore->getPointerOperand();
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- APInt Stride = getStoreStride(StoreEv);
-
- // Check to see if the stride matches the size of the stores. If so, then
- // we know that every byte is touched in the loop.
- if (StoreSize != Stride && StoreSize != -Stride)
- continue;
-
- bool NegStride = StoreSize == -Stride;
-
- if (processLoopStridedStore(StorePtr, StoreSize,
- MaybeAlign(HeadStore->getAlignment()),
- StoredVal, HeadStore, AdjacentStores, StoreEv,
- BECount, NegStride)) {
- TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
- Changed = true;
- }
- }
-
- return Changed;
-}
-
-/// processLoopMemSet - See if this memset can be promoted to a large memset.
-bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
- const SCEV *BECount) {
- // We can only handle non-volatile memsets with a constant size.
- if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
- return false;
-
- // If we're not allowed to hack on memset, we fail.
- if (!HasMemset)
- return false;
-
- Value *Pointer = MSI->getDest();
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided store. If we have something else, it's a
- // random store we can't handle.
- const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
- if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
- return false;
-
- // Reject memsets that are so large that they overflow an unsigned.
- uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- if ((SizeInBytes >> 32) != 0)
- return false;
-
- // Check to see if the stride matches the size of the memset. If so, then we
- // know that every byte is touched in the loop.
- const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
- if (!ConstStride)
- return false;
-
- APInt Stride = ConstStride->getAPInt();
- if (SizeInBytes != Stride && SizeInBytes != -Stride)
- return false;
-
- // Verify that the memset value is loop invariant. If not, we can't promote
- // the memset.
- Value *SplatValue = MSI->getValue();
- if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
- return false;
-
- SmallPtrSet<Instruction *, 1> MSIs;
- MSIs.insert(MSI);
- bool NegStride = SizeInBytes == -Stride;
- return processLoopStridedStore(
- Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
- SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
-}
-
-/// mayLoopAccessLocation - Return true if the specified loop might access the
-/// specified pointer location, which is a loop-strided access. The 'Access'
-/// argument specifies what the verboten forms of access are (read or write).
-static bool
-mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
- const SCEV *BECount, unsigned StoreSize,
- AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &IgnoredStores) {
- // Get the location that may be stored across the loop. Since the access is
- // strided positively through memory, we say that the modified location starts
- // at the pointer and has infinite size.
+ StoreRefsForMemsetPattern[Ptr].push_back(SI);
+ } break;
+ case LegalStoreKind::Memcpy:
+ case LegalStoreKind::UnorderedAtomicMemcpy:
+ StoreRefsForMemcpy.push_back(SI);
+ break;
+ default:
+ assert(false && "unhandled return value");
+ break;
+ }
+ }
+}
+
+/// runOnLoopBlock - Process the specified block, which lives in a counted loop
+/// with the specified backedge count. This block is known to be in the current
+/// loop and not in any subloops.
+bool LoopIdiomRecognize::runOnLoopBlock(
+ BasicBlock *BB, const SCEV *BECount,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ // We can only promote stores in this block if they are unconditionally
+ // executed in the loop. For a block to be unconditionally executed, it has
+ // to dominate all the exit blocks of the loop. Verify this now.
+ for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+ if (!DT->dominates(BB, ExitBlocks[i]))
+ return false;
+
+ bool MadeChange = false;
+ // Look for store instructions, which may be optimized to memset/memcpy.
+ collectStores(BB);
+
+ // Look for a single store or sets of stores with a common base, which can be
+ // optimized into a memset (memset_pattern). The latter most commonly happens
+ // with structs and hand-unrolled loops.
+ for (auto &SL : StoreRefsForMemset)
+ MadeChange |= processLoopStores(SL.second, BECount, ForMemset::Yes);
+
+ for (auto &SL : StoreRefsForMemsetPattern)
+ MadeChange |= processLoopStores(SL.second, BECount, ForMemset::No);
+
+ // Optimize the store into a memcpy, if it feeds a similarly strided load.
+ for (auto &SI : StoreRefsForMemcpy)
+ MadeChange |= processLoopStoreOfLoopLoad(SI, BECount);
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+ // Look for memset instructions, which may be optimized to a larger memset.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {
+ WeakTrackingVH InstPtr(&*I);
+ if (!processLoopMemSet(MSI, BECount))
+ continue;
+ MadeChange = true;
+
+ // If processing the memset invalidated our iterator, start over from the
+ // top of the block.
+ if (!InstPtr)
+ I = BB->begin();
+ continue;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// See if this store(s) can be promoted to a memset.
+bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
+ const SCEV *BECount, ForMemset For) {
+ // Try to find consecutive stores that can be transformed into memsets.
+ SetVector<StoreInst *> Heads, Tails;
+ SmallDenseMap<StoreInst *, StoreInst *> ConsecutiveChain;
+
+ // Do a quadratic search on all of the given stores and find
+ // all of the pairs of stores that follow each other.
+ SmallVector<unsigned, 16> IndexQueue;
+ for (unsigned i = 0, e = SL.size(); i < e; ++i) {
+ assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
+
+ Value *FirstStoredVal = SL[i]->getValueOperand();
+ Value *FirstStorePtr = SL[i]->getPointerOperand();
+ const SCEVAddRecExpr *FirstStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(FirstStorePtr));
+ APInt FirstStride = getStoreStride(FirstStoreEv);
+ unsigned FirstStoreSize = DL->getTypeStoreSize(SL[i]->getValueOperand()->getType());
+
+ // See if we can optimize just this store in isolation.
+ if (FirstStride == FirstStoreSize || -FirstStride == FirstStoreSize) {
+ Heads.insert(SL[i]);
+ continue;
+ }
+
+ Value *FirstSplatValue = nullptr;
+ Constant *FirstPatternValue = nullptr;
+
+ if (For == ForMemset::Yes)
+ FirstSplatValue = isBytewiseValue(FirstStoredVal, *DL);
+ else
+ FirstPatternValue = getMemSetPatternValue(FirstStoredVal, DL);
+
+ assert((FirstSplatValue || FirstPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ IndexQueue.clear();
+ // If a store has multiple consecutive store candidates, search Stores
+ // array according to the sequence: from i+1 to e, then from i-1 to 0.
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance of finding a memset opportunity.
+ unsigned j = 0;
+ for (j = i + 1; j < e; ++j)
+ IndexQueue.push_back(j);
+ for (j = i; j > 0; --j)
+ IndexQueue.push_back(j - 1);
+
+ for (auto &k : IndexQueue) {
+ assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
+ Value *SecondStorePtr = SL[k]->getPointerOperand();
+ const SCEVAddRecExpr *SecondStoreEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(SecondStorePtr));
+ APInt SecondStride = getStoreStride(SecondStoreEv);
+
+ if (FirstStride != SecondStride)
+ continue;
+
+ Value *SecondStoredVal = SL[k]->getValueOperand();
+ Value *SecondSplatValue = nullptr;
+ Constant *SecondPatternValue = nullptr;
+
+ if (For == ForMemset::Yes)
+ SecondSplatValue = isBytewiseValue(SecondStoredVal, *DL);
+ else
+ SecondPatternValue = getMemSetPatternValue(SecondStoredVal, DL);
+
+ assert((SecondSplatValue || SecondPatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ if (isConsecutiveAccess(SL[i], SL[k], *DL, *SE, false)) {
+ if (For == ForMemset::Yes) {
+ if (isa<UndefValue>(FirstSplatValue))
+ FirstSplatValue = SecondSplatValue;
+ if (FirstSplatValue != SecondSplatValue)
+ continue;
+ } else {
+ if (isa<UndefValue>(FirstPatternValue))
+ FirstPatternValue = SecondPatternValue;
+ if (FirstPatternValue != SecondPatternValue)
+ continue;
+ }
+ Tails.insert(SL[k]);
+ Heads.insert(SL[i]);
+ ConsecutiveChain[SL[i]] = SL[k];
+ break;
+ }
+ }
+ }
+
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we transformed so that we don't visit the same store twice.
+ SmallPtrSet<Value *, 16> TransformedStores;
+ bool Changed = false;
+
+ // For stores that start but don't end a link in the chain:
+ for (SetVector<StoreInst *>::iterator it = Heads.begin(), e = Heads.end();
+ it != e; ++it) {
+ if (Tails.count(*it))
+ continue;
+
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to transform it.
+ SmallPtrSet<Instruction *, 8> AdjacentStores;
+ StoreInst *I = *it;
+
+ StoreInst *HeadStore = I;
+ unsigned StoreSize = 0;
+
+ // Collect the chain into a list.
+ while (Tails.count(I) || Heads.count(I)) {
+ if (TransformedStores.count(I))
+ break;
+ AdjacentStores.insert(I);
+
+ StoreSize += DL->getTypeStoreSize(I->getValueOperand()->getType());
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ Value *StoredVal = HeadStore->getValueOperand();
+ Value *StorePtr = HeadStore->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+
+ // Check to see if the stride matches the size of the stores. If so, then
+ // we know that every byte is touched in the loop.
+ if (StoreSize != Stride && StoreSize != -Stride)
+ continue;
+
+ bool NegStride = StoreSize == -Stride;
+
+ if (processLoopStridedStore(StorePtr, StoreSize,
+ MaybeAlign(HeadStore->getAlignment()),
+ StoredVal, HeadStore, AdjacentStores, StoreEv,
+ BECount, NegStride)) {
+ TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
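// A hedged sketch (hypothetical function) of the "stores with a common base"
// case processLoopStores handles: a hand-unrolled loop whose adjacent stores
// of the same splat value are chained via Heads/Tails/ConsecutiveChain and
// emitted as one memset, even though neither store covers every byte alone.
void clearPairs(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i) {
    P[2 * i] = 0;     // stride 2, size 1: not a memset in isolation
    P[2 * i + 1] = 0; // consecutive store of the same value completes the chain
  }
}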
+
+/// processLoopMemSet - See if this memset can be promoted to a large memset.
+bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
+ const SCEV *BECount) {
+ // We can only handle non-volatile memsets with a constant size.
+ if (MSI->isVolatile() || !isa<ConstantInt>(MSI->getLength()))
+ return false;
+
+ // If we're not allowed to hack on memset, we fail.
+ if (!HasMemset)
+ return false;
+
+ Value *Pointer = MSI->getDest();
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided store. If we have something else, it's a
+ // random store we can't handle.
+ const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
+ if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine())
+ return false;
+
+ // Reject memsets that are so large that they overflow an unsigned.
+ uint64_t SizeInBytes = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ if ((SizeInBytes >> 32) != 0)
+ return false;
+
+ // Check to see if the stride matches the size of the memset. If so, then we
+ // know that every byte is touched in the loop.
+ const SCEVConstant *ConstStride = dyn_cast<SCEVConstant>(Ev->getOperand(1));
+ if (!ConstStride)
+ return false;
+
+ APInt Stride = ConstStride->getAPInt();
+ if (SizeInBytes != Stride && SizeInBytes != -Stride)
+ return false;
+
+ // Verify that the memset value is loop invariant. If not, we can't promote
+ // the memset.
+ Value *SplatValue = MSI->getValue();
+ if (!SplatValue || !CurLoop->isLoopInvariant(SplatValue))
+ return false;
+
+ SmallPtrSet<Instruction *, 1> MSIs;
+ MSIs.insert(MSI);
+ bool NegStride = SizeInBytes == -Stride;
+ return processLoopStridedStore(
+ Pointer, (unsigned)SizeInBytes, MaybeAlign(MSI->getDestAlignment()),
+ SplatValue, MSI, MSIs, Ev, BECount, NegStride, /*IsLoopMemset=*/true);
+}
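// A hedged sketch (hypothetical function) of the loop-memset case handled by
// processLoopMemSet: a constant-size memset whose destination advances by
// exactly its own size each iteration, so the loop can collapse into a single
// larger memset of N * 16 bytes.
#include <cstring>
void clearRows(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    std::memset(P + (unsigned long long)i * 16, 0, 16);
}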
+
+/// mayLoopAccessLocation - Return true if the specified loop might access the
+/// specified pointer location, which is a loop-strided access. The 'Access'
+/// argument specifies what the verboten forms of access are (read or write).
+static bool
+mayLoopAccessLocation(Value *Ptr, ModRefInfo Access, Loop *L,
+ const SCEV *BECount, unsigned StoreSize,
+ AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &IgnoredStores) {
+ // Get the location that may be stored across the loop. Since the access is
+ // strided positively through memory, we say that the modified location starts
+ // at the pointer and has infinite size.
LocationSize AccessSize = LocationSize::afterPointer();
-
- // If the loop iterates a fixed number of times, we can refine the access size
- // to be exactly the size of the memset, which is (BECount+1)*StoreSize
- if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
- AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
- StoreSize);
-
- // TODO: For this to be really effective, we have to dive into the pointer
- // operand in the store. A store to &A[i] of 100 will always be reported as
- // may-alias with a store to &A[100]; we need StoreLoc to be "A" with a size
- // of 100, which will then no-alias a store to &A[100].
- MemoryLocation StoreLoc(Ptr, AccessSize);
-
- for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
- ++BI)
- for (Instruction &I : **BI)
- if (IgnoredStores.count(&I) == 0 &&
- isModOrRefSet(
- intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
- return true;
-
- return false;
-}
-
-// If we have a negative stride, Start refers to the end of the memory location
-// we're trying to memset. Therefore, we need to recompute the base pointer,
-// which is just Start - BECount*Size.
-static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
- Type *IntPtr, unsigned StoreSize,
- ScalarEvolution *SE) {
- const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
- if (StoreSize != 1)
- Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- return SE->getMinusSCEV(Start, Index);
-}
-
-/// Compute the number of bytes as a SCEV from the backedge taken count.
-///
-/// This also maps the SCEV into the provided type and tries to handle the
-/// computation in a way that will fold cleanly.
-static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
- unsigned StoreSize, Loop *CurLoop,
- const DataLayout *DL, ScalarEvolution *SE) {
- const SCEV *NumBytesS;
- // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
- // pointer size if it isn't already.
- //
- // If we're going to need to zero extend the BE count, check if we can add
- // one to it prior to zero extending without overflow. Provided this is safe,
- // it allows better simplification of the +1.
+
+ // If the loop iterates a fixed number of times, we can refine the access size
+ // to be exactly the size of the memset, which is (BECount+1)*StoreSize
+ if (const SCEVConstant *BECst = dyn_cast<SCEVConstant>(BECount))
+ AccessSize = LocationSize::precise((BECst->getValue()->getZExtValue() + 1) *
+ StoreSize);
+
+ // TODO: For this to be really effective, we have to dive into the pointer
+ // operand in the store. A store to &A[i] of 100 will always be reported as
+ // may-alias with a store to &A[100]; we need StoreLoc to be "A" with a size
+ // of 100, which will then no-alias a store to &A[100].
+ MemoryLocation StoreLoc(Ptr, AccessSize);
+
+ for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
+ ++BI)
+ for (Instruction &I : **BI)
+ if (IgnoredStores.count(&I) == 0 &&
+ isModOrRefSet(
+ intersectModRef(AA.getModRefInfo(&I, StoreLoc), Access)))
+ return true;
+
+ return false;
+}
+
+// If we have a negative stride, Start refers to the end of the memory location
+// we're trying to memset. Therefore, we need to recompute the base pointer,
+// which is just Start - BECount*Size.
+static const SCEV *getStartForNegStride(const SCEV *Start, const SCEV *BECount,
+ Type *IntPtr, unsigned StoreSize,
+ ScalarEvolution *SE) {
+ const SCEV *Index = SE->getTruncateOrZeroExtend(BECount, IntPtr);
+ if (StoreSize != 1)
+ Index = SE->getMulExpr(Index, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ return SE->getMinusSCEV(Start, Index);
+}
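// A hedged numeric sketch of getStartForNegStride: for a loop storing 4-byte
// elements downward, e.g. "for (int i = 999; i >= 0; --i) A[i] = 0;", the
// addrec start is &A[999] and BECount is 999, so the memset base becomes
// Start - BECount * StoreSize == &A[999] - 999 * 4 bytes == &A[0].
// Plain-integer model of the same formula (hypothetical helper):
unsigned long long negStrideBase(unsigned long long Start,
                                 unsigned long long BECount,
                                 unsigned StoreSize) {
  return Start - BECount * StoreSize; // mirrors SE->getMinusSCEV(Start, Index)
}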
+
+/// Compute the number of bytes as a SCEV from the backedge taken count.
+///
+/// This also maps the SCEV into the provided type and tries to handle the
+/// computation in a way that will fold cleanly.
+static const SCEV *getNumBytes(const SCEV *BECount, Type *IntPtr,
+ unsigned StoreSize, Loop *CurLoop,
+ const DataLayout *DL, ScalarEvolution *SE) {
+ const SCEV *NumBytesS;
+ // The # stored bytes is (BECount+1)*Size. Expand the trip count out to
+ // pointer size if it isn't already.
+ //
+ // If we're going to need to zero extend the BE count, check if we can add
+ // one to it prior to zero extending without overflow. Provided this is safe,
+ // it allows better simplification of the +1.
if (DL->getTypeSizeInBits(BECount->getType()).getFixedSize() <
DL->getTypeSizeInBits(IntPtr).getFixedSize() &&
- SE->isLoopEntryGuardedByCond(
- CurLoop, ICmpInst::ICMP_NE, BECount,
- SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
- NumBytesS = SE->getZeroExtendExpr(
- SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
- IntPtr);
- } else {
- NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
- SE->getOne(IntPtr), SCEV::FlagNUW);
- }
-
- // And scale it based on the store size.
- if (StoreSize != 1) {
- NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
- SCEV::FlagNUW);
- }
- return NumBytesS;
-}
-
-/// processLoopStridedStore - We see a strided store of some value. If we can
-/// transform this into a memset or memset_pattern in the loop preheader, do so.
-bool LoopIdiomRecognize::processLoopStridedStore(
- Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
- Value *StoredVal, Instruction *TheStore,
- SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
- const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
- Value *SplatValue = isBytewiseValue(StoredVal, *DL);
- Constant *PatternValue = nullptr;
-
- if (!SplatValue)
- PatternValue = getMemSetPatternValue(StoredVal, DL);
-
- assert((SplatValue || PatternValue) &&
- "Expected either splat value or pattern value.");
-
- // The trip count of the loop and the base pointer of the addrec SCEV is
- // guaranteed to be loop invariant, which means that it should dominate the
- // header. This allows us to insert code for it in the preheader.
- unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
- IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, *DL, "loop-idiom");
+ SE->isLoopEntryGuardedByCond(
+ CurLoop, ICmpInst::ICMP_NE, BECount,
+ SE->getNegativeSCEV(SE->getOne(BECount->getType())))) {
+ NumBytesS = SE->getZeroExtendExpr(
+ SE->getAddExpr(BECount, SE->getOne(BECount->getType()), SCEV::FlagNUW),
+ IntPtr);
+ } else {
+ NumBytesS = SE->getAddExpr(SE->getTruncateOrZeroExtend(BECount, IntPtr),
+ SE->getOne(IntPtr), SCEV::FlagNUW);
+ }
+
+ // And scale it based on the store size.
+ if (StoreSize != 1) {
+ NumBytesS = SE->getMulExpr(NumBytesS, SE->getConstant(IntPtr, StoreSize),
+ SCEV::FlagNUW);
+ }
+ return NumBytesS;
+}
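// A hedged numeric sketch of getNumBytes: a loop that runs 1000 iterations
// over 4-byte elements has BECount == 999, so the emitted length is
// (BECount + 1) * StoreSize == 1000 * 4 == 4000 bytes. Plain-integer model of
// the same formula (hypothetical helper, ignoring the zero-extension logic):
unsigned long long numStoredBytes(unsigned long long BECount,
                                  unsigned StoreSize) {
  return (BECount + 1) * StoreSize;
}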
+
+/// processLoopStridedStore - We see a strided store of some value. If we can
+/// transform this into a memset or memset_pattern in the loop preheader, do so.
+bool LoopIdiomRecognize::processLoopStridedStore(
+ Value *DestPtr, unsigned StoreSize, MaybeAlign StoreAlignment,
+ Value *StoredVal, Instruction *TheStore,
+ SmallPtrSetImpl<Instruction *> &Stores, const SCEVAddRecExpr *Ev,
+ const SCEV *BECount, bool NegStride, bool IsLoopMemset) {
+ Value *SplatValue = isBytewiseValue(StoredVal, *DL);
+ Constant *PatternValue = nullptr;
+
+ if (!SplatValue)
+ PatternValue = getMemSetPatternValue(StoredVal, DL);
+
+ assert((SplatValue || PatternValue) &&
+ "Expected either splat value or pattern value.");
+
+ // The trip count of the loop and the base pointer of the addrec SCEV is
+ // guaranteed to be loop invariant, which means that it should dominate the
+ // header. This allows us to insert code for it in the preheader.
+ unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
SCEVExpanderCleaner ExpCleaner(Expander, *DT);
-
- Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
- Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
-
+
+ Type *DestInt8PtrTy = Builder.getInt8PtrTy(DestAS);
+ Type *IntIdxTy = DL->getIndexType(DestPtr->getType());
+
bool Changed = false;
- const SCEV *Start = Ev->getStart();
- // Handle negative strided loops.
- if (NegStride)
- Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
-
- // TODO: ideally we should still be able to generate memset if SCEV expander
- // is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(Start, *SE))
+ const SCEV *Start = Ev->getStart();
+ // Handle negative strided loops.
+ if (NegStride)
+ Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSize, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(Start, *SE))
return Changed;
-
- // Okay, we have a strided store "p[i]" of a splattable value. We can turn
- // this into a memset in the loop preheader now if we want. However, this
- // would be unsafe to do if there is anything else in the loop that may read
- // or write to the aliased location. Check for any overlap by generating the
- // base pointer and checking the region.
- Value *BasePtr =
- Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
+
+ // Okay, we have a strided store "p[i]" of a splattable value. We can turn
+ // this into a memset in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write to the aliased location. Check for any overlap by generating the
+ // base pointer and checking the region.
+ Value *BasePtr =
+ Expander.expandCodeFor(Start, DestInt8PtrTy, Preheader->getTerminator());
// From here on out, conservatively report to the pass manager that we've
// changed the IR, even if we later clean up these added instructions. There
@@ -980,134 +980,134 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// the return value will read this comment, and leave them alone.
Changed = true;
- if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ if (mayLoopAccessLocation(BasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))
return Changed;
-
- if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
+
+ if (avoidLIRForMultiBlockLoop(/*IsMemset=*/true, IsLoopMemset))
return Changed;
-
- // Okay, everything looks good, insert the memset.
-
- const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
-
- // TODO: ideally we should still be able to generate memset if SCEV expander
- // is taught to generate the dependencies at the latest point.
- if (!isSafeToExpand(NumBytesS, *SE))
+
+ // Okay, everything looks good, insert the memset.
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+
+ // TODO: ideally we should still be able to generate memset if SCEV expander
+ // is taught to generate the dependencies at the latest point.
+ if (!isSafeToExpand(NumBytesS, *SE))
return Changed;
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
-
- CallInst *NewCall;
- if (SplatValue) {
- NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
- MaybeAlign(StoreAlignment));
- } else {
- // Everything is emitted in default address space
- Type *Int8PtrTy = DestInt8PtrTy;
-
- Module *M = TheStore->getModule();
- StringRef FuncName = "memset_pattern16";
- FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
- Int8PtrTy, Int8PtrTy, IntIdxTy);
- inferLibFuncAttributes(M, FuncName, *TLI);
-
- // Otherwise we should form a memset_pattern16. PatternValue is known to be
- // a constant array of 16 bytes. Plop the value into a mergeable global.
- GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
- GlobalValue::PrivateLinkage,
- PatternValue, ".memset_pattern");
- GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
- GV->setAlignment(Align(16));
- Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
- NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
- }
- NewCall->setDebugLoc(TheStore->getDebugLoc());
-
- if (MSSAU) {
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- }
-
- LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
- << " from store to: " << *Ev << " at: " << *TheStore
- << "\n");
-
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
- NewCall->getDebugLoc(), Preheader)
- << "Transformed loop-strided store into a call to "
- << ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
- });
-
- // Okay, the memset has been formed. Zap the original store and anything that
- // feeds into it.
- for (auto *I : Stores) {
- if (MSSAU)
- MSSAU->removeMemoryAccess(I, true);
- deleteDeadInstruction(I);
- }
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- ++NumMemSet;
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+
+ CallInst *NewCall;
+ if (SplatValue) {
+ NewCall = Builder.CreateMemSet(BasePtr, SplatValue, NumBytes,
+ MaybeAlign(StoreAlignment));
+ } else {
+ // Everything is emitted in default address space
+ Type *Int8PtrTy = DestInt8PtrTy;
+
+ Module *M = TheStore->getModule();
+ StringRef FuncName = "memset_pattern16";
+ FunctionCallee MSP = M->getOrInsertFunction(FuncName, Builder.getVoidTy(),
+ Int8PtrTy, Int8PtrTy, IntIdxTy);
+ inferLibFuncAttributes(M, FuncName, *TLI);
+
+ // Otherwise we should form a memset_pattern16. PatternValue is known to be
+ // a constant array of 16 bytes. Plop the value into a mergeable global.
+ GlobalVariable *GV = new GlobalVariable(*M, PatternValue->getType(), true,
+ GlobalValue::PrivateLinkage,
+ PatternValue, ".memset_pattern");
+ GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global); // Ok to merge these.
+ GV->setAlignment(Align(16));
+ Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
+ NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
+ }
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
+
+ LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
+ << " from store to: " << *Ev << " at: " << *TheStore
+ << "\n");
+
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
+ NewCall->getDebugLoc(), Preheader)
+ << "Transformed loop-strided store into a call to "
+ << ore::NV("NewFunction", NewCall->getCalledFunction())
+ << "() function";
+ });
+
+ // Okay, the memset has been formed. Zap the original store and anything that
+ // feeds into it.
+ for (auto *I : Stores) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I, true);
+ deleteDeadInstruction(I);
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ ++NumMemSet;
ExpCleaner.markResultUsed();
- return true;
-}
-
-/// If the stored value is a strided load in the same loop with the same stride
-/// this may be transformable into a memcpy. This kicks in for stuff like
-/// for (i) A[i] = B[i];
-bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
- const SCEV *BECount) {
- assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
-
- Value *StorePtr = SI->getPointerOperand();
- const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
- APInt Stride = getStoreStride(StoreEv);
- unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
- bool NegStride = StoreSize == -Stride;
-
- // The store must be feeding a non-volatile load.
- LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
- assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
-
- // See if the pointer expression is an AddRec like {base,+,1} on the current
- // loop, which indicates a strided load. If we have something else, it's a
- // random load we can't handle.
- const SCEVAddRecExpr *LoadEv =
- cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
-
- // The trip count of the loop and the base pointer of the addrec SCEV is
- // guaranteed to be loop invariant, which means that it should dominate the
- // header. This allows us to insert code for it in the preheader.
- BasicBlock *Preheader = CurLoop->getLoopPreheader();
- IRBuilder<> Builder(Preheader->getTerminator());
- SCEVExpander Expander(*SE, *DL, "loop-idiom");
-
+ return true;
+}
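// A hedged before/after sketch (hypothetical functions) of the net effect of
// processLoopStridedStore at the source level: a splat-value strided store
// loop is replaced by one memset emitted from the preheader, with the
// original stores deleted and MemorySSA updated.
#include <cstring>
void zeroLoopBefore(unsigned char *P, unsigned N) {
  for (unsigned i = 0; i != N; ++i)
    P[i] = 0; // strided store of a splattable value
}
void zeroLoopAfter(unsigned char *P, unsigned N) {
  if (N != 0)
    std::memset(P, 0, N); // conceptually what the pass emits
}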
+
+/// If the stored value is a strided load in the same loop with the same stride
+/// this may be transformable into a memcpy. This kicks in for stuff like
+/// for (i) A[i] = B[i];
+bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
+ const SCEV *BECount) {
+ assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
+
+ Value *StorePtr = SI->getPointerOperand();
+ const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
+ APInt Stride = getStoreStride(StoreEv);
+ unsigned StoreSize = DL->getTypeStoreSize(SI->getValueOperand()->getType());
+ bool NegStride = StoreSize == -Stride;
+
+ // The store must be feeding a non-volatile load.
+ LoadInst *LI = cast<LoadInst>(SI->getValueOperand());
+ assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
+
+ // See if the pointer expression is an AddRec like {base,+,1} on the current
+ // loop, which indicates a strided load. If we have something else, it's a
+ // random load we can't handle.
+ const SCEVAddRecExpr *LoadEv =
+ cast<SCEVAddRecExpr>(SE->getSCEV(LI->getPointerOperand()));
+
+ // The trip count of the loop and the base pointer of the addrec SCEV is
+ // guaranteed to be loop invariant, which means that it should dominate the
+ // header. This allows us to insert code for it in the preheader.
+ BasicBlock *Preheader = CurLoop->getLoopPreheader();
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SCEVExpander Expander(*SE, *DL, "loop-idiom");
+
SCEVExpanderCleaner ExpCleaner(Expander, *DT);
-
+
bool Changed = false;
- const SCEV *StrStart = StoreEv->getStart();
- unsigned StrAS = SI->getPointerAddressSpace();
- Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
-
- // Handle negative strided loops.
- if (NegStride)
- StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
-
- // Okay, we have a strided store "p[i]" of a loaded value. We can turn
- // this into a memcpy in the loop preheader now if we want. However, this
- // would be unsafe to do if there is anything else in the loop that may read
- // or write the memory region we're storing to. This includes the load that
- // feeds the stores. Check for an alias by generating the base address and
- // checking everything.
- Value *StoreBasePtr = Expander.expandCodeFor(
- StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
-
+ const SCEV *StrStart = StoreEv->getStart();
+ unsigned StrAS = SI->getPointerAddressSpace();
+ Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
+
+ // Handle negative strided loops.
+ if (NegStride)
+ StrStart = getStartForNegStride(StrStart, BECount, IntIdxTy, StoreSize, SE);
+
+ // Okay, we have a strided store "p[i]" of a loaded value. We can turn
+ // this into a memcpy in the loop preheader now if we want. However, this
+ // would be unsafe to do if there is anything else in the loop that may read
+ // or write the memory region we're storing to. This includes the load that
+ // feeds the stores. Check for an alias by generating the base address and
+ // checking everything.
+ Value *StoreBasePtr = Expander.expandCodeFor(
+ StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+
// From here on out, conservatively report to the pass manager that we've
// changed the IR, even if we later clean up these added instructions. There
// may be structural differences e.g. in the order of use lists not accounted
@@ -1117,650 +1117,650 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// the return value will read this comment, and leave them alone.
Changed = true;
- SmallPtrSet<Instruction *, 1> Stores;
- Stores.insert(SI);
- if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores))
+ SmallPtrSet<Instruction *, 1> Stores;
+ Stores.insert(SI);
+ if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
+ StoreSize, *AA, Stores))
return Changed;
-
- const SCEV *LdStart = LoadEv->getStart();
- unsigned LdAS = LI->getPointerAddressSpace();
-
- // Handle negative strided loops.
- if (NegStride)
- LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
-
- // For a memcpy, we have to make sure that the input array is not being
- // mutated by the loop.
- Value *LoadBasePtr = Expander.expandCodeFor(
- LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
-
- if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores))
+
+ const SCEV *LdStart = LoadEv->getStart();
+ unsigned LdAS = LI->getPointerAddressSpace();
+
+ // Handle negative strided loops.
+ if (NegStride)
+ LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);
+
+ // For a memcpy, we have to make sure that the input array is not being
+ // mutated by the loop.
+ Value *LoadBasePtr = Expander.expandCodeFor(
+ LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+
+ if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
+ StoreSize, *AA, Stores))
return Changed;
-
- if (avoidLIRForMultiBlockLoop())
+
+ if (avoidLIRForMultiBlockLoop())
return Changed;
-
- // Okay, everything is safe, we can transform this!
-
- const SCEV *NumBytesS =
- getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
-
- Value *NumBytes =
- Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
-
- CallInst *NewCall = nullptr;
- // Check whether to generate an unordered atomic memcpy:
- // If the load or store are atomic, then they must necessarily be unordered
- // by previous checks.
- if (!SI->isAtomic() && !LI->isAtomic())
- NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
- LI->getAlign(), NumBytes);
- else {
- // We cannot allow unaligned ops for unordered load/store, so reject
- // anything where the alignment isn't at least the element size.
- const Align StoreAlign = SI->getAlign();
- const Align LoadAlign = LI->getAlign();
- if (StoreAlign < StoreSize || LoadAlign < StoreSize)
+
+ // Okay, everything is safe, we can transform this!
+
+ const SCEV *NumBytesS =
+ getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
+
+ Value *NumBytes =
+ Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+
+ CallInst *NewCall = nullptr;
+ // Check whether to generate an unordered atomic memcpy:
+ // If the load or store are atomic, then they must necessarily be unordered
+ // by previous checks.
+ if (!SI->isAtomic() && !LI->isAtomic())
+ NewCall = Builder.CreateMemCpy(StoreBasePtr, SI->getAlign(), LoadBasePtr,
+ LI->getAlign(), NumBytes);
+ else {
+ // We cannot allow unaligned ops for unordered load/store, so reject
+ // anything where the alignment isn't at least the element size.
+ const Align StoreAlign = SI->getAlign();
+ const Align LoadAlign = LI->getAlign();
+ if (StoreAlign < StoreSize || LoadAlign < StoreSize)
return Changed;
-
- // If the element.atomic memcpy is not lowered into explicit
- // loads/stores later, then it will be lowered into an element-size
- // specific lib call. If the lib call doesn't exist for our store size, then
- // we shouldn't generate the memcpy.
- if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
+
+ // If the element.atomic memcpy is not lowered into explicit
+ // loads/stores later, then it will be lowered into an element-size
+ // specific lib call. If the lib call doesn't exist for our store size, then
+ // we shouldn't generate the memcpy.
+ if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return Changed;
-
- // Create the call.
- // Note that unordered atomic loads/stores are *required* by the spec to
- // have an alignment but non-atomic loads/stores may not.
- NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
- StoreSize);
- }
- NewCall->setDebugLoc(SI->getDebugLoc());
-
- if (MSSAU) {
- MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
- NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
- MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
- }
-
- LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
- << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
- << " from store ptr=" << *StoreEv << " at: " << *SI
- << "\n");
-
- ORE.emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
- NewCall->getDebugLoc(), Preheader)
- << "Formed a call to "
- << ore::NV("NewFunction", NewCall->getCalledFunction())
- << "() function";
- });
-
- // Okay, the memcpy has been formed. Zap the original store and anything that
- // feeds into it.
- if (MSSAU)
- MSSAU->removeMemoryAccess(SI, true);
- deleteDeadInstruction(SI);
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- ++NumMemCpy;
+
+ // Create the call.
+ // Note that unordered atomic loads/stores are *required* by the spec to
+ // have an alignment but non-atomic loads/stores may not.
+ NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
+ StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
+ StoreSize);
+ }
+ NewCall->setDebugLoc(SI->getDebugLoc());
+
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
+
+ LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
+ << " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
+ << " from store ptr=" << *StoreEv << " at: " << *SI
+ << "\n");
+
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
+ NewCall->getDebugLoc(), Preheader)
+ << "Formed a call to "
+ << ore::NV("NewFunction", NewCall->getCalledFunction())
+ << "() function";
+ });
+
+ // Okay, the memcpy has been formed. Zap the original store and anything that
+ // feeds into it.
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(SI, true);
+ deleteDeadInstruction(SI);
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ ++NumMemCpy;
ExpCleaner.markResultUsed();
- return true;
-}
-
- // When compiling for code size we avoid idiom recognition for a multi-block loop
-// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
-//
-bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
- bool IsLoopMemset) {
- if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
+ return true;
+}
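// A hedged sketch (hypothetical function) of the net effect of
// processLoopStoreOfLoopLoad: once mayLoopAccessLocation has shown that
// nothing else in the loop reads or writes the destination and nothing
// mutates the source, "for (i) A[i] = B[i];" becomes a single memcpy in the
// preheader.
#include <cstring>
void copyAfter(int *A, const int *B, unsigned N) {
  if (N != 0)
    std::memcpy(A, B, (unsigned long long)N * sizeof(int));
}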
+
+ // When compiling for code size we avoid idiom recognition for a multi-block loop
+// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
+//
+bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
+ bool IsLoopMemset) {
+ if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > 1) {
if (CurLoop->isOutermost() && (!IsMemset || !IsLoopMemset)) {
- LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
- << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
- << " avoided: multi-block top-level loop\n");
- return true;
- }
- }
-
- return false;
-}
-
-bool LoopIdiomRecognize::runOnNoncountableLoop() {
- LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
- << CurLoop->getHeader()->getParent()->getName()
- << "] Noncountable Loop %"
- << CurLoop->getHeader()->getName() << "\n");
-
+ LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
+ << " : LIR " << (IsMemset ? "Memset" : "Memcpy")
+ << " avoided: multi-block top-level loop\n");
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
+ << CurLoop->getHeader()->getParent()->getName()
+ << "] Noncountable Loop %"
+ << CurLoop->getHeader()->getName() << "\n");
+
return recognizePopcount() || recognizeAndInsertFFS() ||
recognizeShiftUntilBitTest();
-}
-
-/// Check if the given conditional branch is based on the comparison between
-/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
-/// true), the control yields to the loop entry. If the branch matches the
-/// behavior, the variable involved in the comparison is returned. This function
-/// will be called to see if the precondition and postcondition of the loop are
-/// in desirable form.
-static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
- bool JmpOnZero = false) {
- if (!BI || !BI->isConditional())
- return nullptr;
-
- ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
- if (!Cond)
- return nullptr;
-
- ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
- if (!CmpZero || !CmpZero->isZero())
- return nullptr;
-
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
- if (JmpOnZero)
- std::swap(TrueSucc, FalseSucc);
-
- ICmpInst::Predicate Pred = Cond->getPredicate();
- if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
- (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
- return Cond->getOperand(0);
-
- return nullptr;
-}
-
-// Check if the recurrence variable `VarX` is in the right form to create
-// the idiom. Returns the value coerced to a PHINode if so.
-static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
- BasicBlock *LoopEntry) {
- auto *PhiX = dyn_cast<PHINode>(VarX);
- if (PhiX && PhiX->getParent() == LoopEntry &&
- (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
- return PhiX;
- return nullptr;
-}
-
-/// Return true iff the idiom is detected in the loop.
-///
-/// Additionally:
-/// 1) \p CntInst is set to the instruction counting the population bit.
-/// 2) \p CntPhi is set to the corresponding phi node.
-/// 3) \p Var is set to the value whose population bits are being counted.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-/// if (x0 != 0)
-/// goto loop-exit // the precondition of the loop
-/// cnt0 = init-val;
-/// do {
-/// x1 = phi (x0, x2);
-/// cnt1 = phi(cnt0, cnt2);
-///
-/// cnt2 = cnt1 + 1;
-/// ...
-/// x2 = x1 & (x1 - 1);
-/// ...
-/// } while(x != 0);
-///
-/// loop-exit:
-/// \endcode
-static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
- Instruction *&CntInst, PHINode *&CntPhi,
- Value *&Var) {
- // step 1: Check to see if the look-back branch matches this pattern:
- // "if (a!=0) goto loop-entry".
- BasicBlock *LoopEntry;
- Instruction *DefX2, *CountInst;
- Value *VarX1, *VarX0;
- PHINode *PhiX, *CountPhi;
-
- DefX2 = CountInst = nullptr;
- VarX1 = VarX0 = nullptr;
- PhiX = CountPhi = nullptr;
- LoopEntry = *(CurLoop->block_begin());
-
- // step 1: Check if the loop-back branch is in desirable form.
- {
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX2 = dyn_cast<Instruction>(T);
- else
- return false;
- }
-
- // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
- {
- if (!DefX2 || DefX2->getOpcode() != Instruction::And)
- return false;
-
- BinaryOperator *SubOneOp;
-
- if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
- VarX1 = DefX2->getOperand(1);
- else {
- VarX1 = DefX2->getOperand(0);
- SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
- }
- if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
- return false;
-
- ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
- if (!Dec ||
- !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
- (SubOneOp->getOpcode() == Instruction::Add &&
- Dec->isMinusOne()))) {
- return false;
- }
- }
-
- // step 3: Check the recurrence of variable X
- PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
- if (!PhiX)
- return false;
-
- // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
- {
- CountInst = nullptr;
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
- continue;
-
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
- if (!Inc || !Inc->isOne())
- continue;
-
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
- if (!Phi)
- continue;
-
- // Check if the result of the instruction is live of the loop.
- bool LiveOutLoop = false;
- for (User *U : Inst->users()) {
- if ((cast<Instruction>(U))->getParent() != LoopEntry) {
- LiveOutLoop = true;
- break;
- }
- }
-
- if (LiveOutLoop) {
- CountInst = Inst;
- CountPhi = Phi;
- break;
- }
- }
-
- if (!CountInst)
- return false;
- }
-
- // step 5: check if the precondition is in this form:
- // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
- {
- auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
- if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
- return false;
-
- CntInst = CountInst;
- CntPhi = CountPhi;
- Var = T;
- }
-
- return true;
-}
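// A hedged sketch (hypothetical function) of the source-level idiom that
// detectPopcountIdiom matches: counting set bits by repeatedly clearing the
// lowest one, which the pass later rewrites in terms of llvm.ctpop.
unsigned popcountLoop(unsigned X) {
  unsigned Cnt = 0;
  while (X != 0) {
    X = X & (X - 1); // clear the lowest set bit
    ++Cnt;
  }
  return Cnt;
}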
-
-/// Return true if the idiom is detected in the loop.
-///
-/// Additionally:
-/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
-/// or nullptr if there is no such.
-/// 2) \p CntPhi is set to the corresponding phi node
-/// or nullptr if there is no such.
-/// 3) \p Var is set to the value whose CTLZ could be used.
-/// 4) \p DefX is set to the instruction calculating Loop exit condition.
-///
-/// The core idiom we are trying to detect is:
-/// \code
-/// if (x0 == 0)
-/// goto loop-exit // the precondition of the loop
-/// cnt0 = init-val;
-/// do {
-/// x = phi (x0, x.next); //PhiX
-/// cnt = phi(cnt0, cnt.next);
-///
-/// cnt.next = cnt + 1;
-/// ...
-/// x.next = x >> 1; // DefX
-/// ...
-/// } while(x.next != 0);
-///
-/// loop-exit:
-/// \endcode
-static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
- Intrinsic::ID &IntrinID, Value *&InitX,
- Instruction *&CntInst, PHINode *&CntPhi,
- Instruction *&DefX) {
- BasicBlock *LoopEntry;
- Value *VarX = nullptr;
-
- DefX = nullptr;
- CntInst = nullptr;
- CntPhi = nullptr;
- LoopEntry = *(CurLoop->block_begin());
-
- // step 1: Check if the loop-back branch is in desirable form.
- if (Value *T = matchCondition(
- dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
- DefX = dyn_cast<Instruction>(T);
- else
- return false;
-
- // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
- if (!DefX || !DefX->isShift())
- return false;
- IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
- Intrinsic::ctlz;
- ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
- if (!Shft || !Shft->isOne())
- return false;
- VarX = DefX->getOperand(0);
-
- // step 3: Check the recurrence of variable X
- PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
- if (!PhiX)
- return false;
-
- InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
-
- // Make sure the initial value can't be negative otherwise the ashr in the
- // loop might never reach zero which would make the loop infinite.
- if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
- return false;
-
- // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
+}
+
+/// Check if the given conditional branch is based on a comparison between
+/// a variable and zero, and if the variable is non-zero (or zero, when
+/// JmpOnZero is true), control transfers to the loop entry. If the branch
+/// matches this behavior, the variable involved in the comparison is returned.
+/// This function is called to check whether the precondition and postcondition
+/// of the loop are in the desired form.
+static Value *matchCondition(BranchInst *BI, BasicBlock *LoopEntry,
+ bool JmpOnZero = false) {
+ if (!BI || !BI->isConditional())
+ return nullptr;
+
+ ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+
+ ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+ if (!CmpZero || !CmpZero->isZero())
+ return nullptr;
+
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
+ if (JmpOnZero)
+ std::swap(TrueSucc, FalseSucc);
+
+ ICmpInst::Predicate Pred = Cond->getPredicate();
+ if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) ||
+ (Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
+ return Cond->getOperand(0);
+
+ return nullptr;
+}
+
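For illustration, a minimal sketch of the branch shape matchCondition accepts, built with IRBuilder. It is not taken from the LLVM sources; the helper name emitZeroGuard and its parameters are assumptions made for the example.

#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emits: %guard = icmp ne %x, 0 ; br i1 %guard, label %LoopEntry, label %Exit
// matchCondition(Br, LoopEntry) returns %x for this shape, while
// matchCondition(Br, LoopEntry, /*JmpOnZero=*/true) returns nullptr because the
// successors are treated as swapped.
static BranchInst *emitZeroGuard(IRBuilder<> &B, Value *X,
                                 BasicBlock *LoopEntry, BasicBlock *Exit) {
  Value *Guard = B.CreateICmpNE(X, ConstantInt::get(X->getType(), 0), "guard");
  return B.CreateCondBr(Guard, LoopEntry, Exit);
}

The detectors below apply matchCondition to both the loop-back branch (the postcondition) and the precondition block's terminator.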
+// Check if the recurrence variable `VarX` is in the right form to create
+// the idiom. Returns the value coerced to a PHINode if so.
+static PHINode *getRecurrenceVar(Value *VarX, Instruction *DefX,
+ BasicBlock *LoopEntry) {
+ auto *PhiX = dyn_cast<PHINode>(VarX);
+ if (PhiX && PhiX->getParent() == LoopEntry &&
+ (PhiX->getOperand(0) == DefX || PhiX->getOperand(1) == DefX))
+ return PhiX;
+ return nullptr;
+}
+
+/// Return true iff the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the set bits (population).
+/// 2) \p CntPhi is set to the corresponding phi node.
+/// 3) \p Var is set to the value whose set bits are being counted.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x1 = phi (x0, x2);
+/// cnt1 = phi(cnt0, cnt2);
+///
+/// cnt2 = cnt1 + 1;
+/// ...
+/// x2 = x1 & (x1 - 1);
+/// ...
+/// } while(x2 != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Value *&Var) {
+ // step 1: Check to see if the loop-back branch matches this pattern:
+ // "if (a!=0) goto loop-entry".
+ BasicBlock *LoopEntry;
+ Instruction *DefX2, *CountInst;
+ Value *VarX1, *VarX0;
+ PHINode *PhiX, *CountPhi;
+
+ DefX2 = CountInst = nullptr;
+ VarX1 = VarX0 = nullptr;
+ PhiX = CountPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ {
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX2 = dyn_cast<Instruction>(T);
+ else
+ return false;
+ }
+
+ // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+ {
+ if (!DefX2 || DefX2->getOpcode() != Instruction::And)
+ return false;
+
+ BinaryOperator *SubOneOp;
+
+ if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+ VarX1 = DefX2->getOperand(1);
+ else {
+ VarX1 = DefX2->getOperand(0);
+ SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+ }
+ if (!SubOneOp || SubOneOp->getOperand(0) != VarX1)
+ return false;
+
+ ConstantInt *Dec = dyn_cast<ConstantInt>(SubOneOp->getOperand(1));
+ if (!Dec ||
+ !((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+ (SubOneOp->getOpcode() == Instruction::Add &&
+ Dec->isMinusOne()))) {
+ return false;
+ }
+ }
+
+ // step 3: Check the recurrence of variable X
+ PhiX = getRecurrenceVar(VarX1, DefX2, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ // step 4: Find the instruction which counts the population: cnt2 = cnt1 + 1
+ {
+ CountInst = nullptr;
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ if (!Inc || !Inc->isOne())
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ // Check if the result of the instruction is live out of the loop.
+ bool LiveOutLoop = false;
+ for (User *U : Inst->users()) {
+ if ((cast<Instruction>(U))->getParent() != LoopEntry) {
+ LiveOutLoop = true;
+ break;
+ }
+ }
+
+ if (LiveOutLoop) {
+ CountInst = Inst;
+ CountPhi = Phi;
+ break;
+ }
+ }
+
+ if (!CountInst)
+ return false;
+ }
+
+ // step 5: check if the precondition is in this form:
+ // "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+ {
+ auto *PreCondBr = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ Value *T = matchCondition(PreCondBr, CurLoop->getLoopPreheader());
+ if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+ return false;
+
+ CntInst = CountInst;
+ CntPhi = CountPhi;
+ Var = T;
+ }
+
+ return true;
+}
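As a concrete illustration of the idiom described above, the following C-style function is a sketch (not taken from the vendored sources; with clang at -O1 or higher it typically lowers to a single-block loop of this shape) that maps the source constructs onto the values the detector reports:

unsigned count_set_bits(unsigned x0) {
  unsigned cnt = 0;            // cnt0 = init-val
  if (x0 != 0) {               // precondition checked in step 5 (Var = x0)
    unsigned x = x0;
    do {
      cnt = cnt + 1;           // CntInst, with CntPhi as its recurrence
      x = x & (x - 1);         // DefX2: clears the lowest set bit
    } while (x != 0);          // loop-back branch matched in step 1
  }
  return cnt;                  // cnt is live out of the loop (step 4)
}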
+
+/// Return true if the idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
+/// or nullptr if there is none.
+/// 2) \p CntPhi is set to the corresponding phi node
+/// or nullptr if there is none.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+/// if (x0 == 0)
+/// goto loop-exit // the precondition of the loop
+/// cnt0 = init-val;
+/// do {
+/// x = phi (x0, x.next); //PhiX
+/// cnt = phi(cnt0, cnt.next);
+///
+/// cnt.next = cnt + 1;
+/// ...
+/// x.next = x >> 1; // DefX
+/// ...
+/// } while(x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectShiftUntilZeroIdiom(Loop *CurLoop, const DataLayout &DL,
+ Intrinsic::ID &IntrinID, Value *&InitX,
+ Instruction *&CntInst, PHINode *&CntPhi,
+ Instruction *&DefX) {
+ BasicBlock *LoopEntry;
+ Value *VarX = nullptr;
+
+ DefX = nullptr;
+ CntInst = nullptr;
+ CntPhi = nullptr;
+ LoopEntry = *(CurLoop->block_begin());
+
+ // step 1: Check if the loop-back branch is in desirable form.
+ if (Value *T = matchCondition(
+ dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+ DefX = dyn_cast<Instruction>(T);
+ else
+ return false;
+
+ // step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
+ if (!DefX || !DefX->isShift())
+ return false;
+ IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
+ Intrinsic::ctlz;
+ ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+ if (!Shft || !Shft->isOne())
+ return false;
+ VarX = DefX->getOperand(0);
+
+ // step 3: Check the recurrence of variable X
+ PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
+ if (!PhiX)
+ return false;
+
+ InitX = PhiX->getIncomingValueForBlock(CurLoop->getLoopPreheader());
+
+ // Make sure the initial value can't be negative otherwise the ashr in the
+ // loop might never reach zero which would make the loop infinite.
+ if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(InitX, DL))
+ return false;
+
+ // step 4: Find the instruction which counts the iterations: cnt.next = cnt + 1
// or cnt.next = cnt + -1.
- // TODO: We can skip the step. If loop trip count is known (CTLZ),
- // then all uses of "cnt.next" could be optimized to the trip count
- // plus "cnt0". Currently it is not optimized.
- // This step could be used to detect POPCNT instruction:
- // cnt.next = cnt + (x.next & 1)
- for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
- IterE = LoopEntry->end();
- Iter != IterE; Iter++) {
- Instruction *Inst = &*Iter;
- if (Inst->getOpcode() != Instruction::Add)
- continue;
-
- ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+ // TODO: We can skip the step. If loop trip count is known (CTLZ),
+ // then all uses of "cnt.next" could be optimized to the trip count
+ // plus "cnt0". Currently it is not optimized.
+ // This step could be used to detect POPCNT instruction:
+ // cnt.next = cnt + (x.next & 1)
+ for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+ IterE = LoopEntry->end();
+ Iter != IterE; Iter++) {
+ Instruction *Inst = &*Iter;
+ if (Inst->getOpcode() != Instruction::Add)
+ continue;
+
+ ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
if (!Inc || (!Inc->isOne() && !Inc->isMinusOne()))
- continue;
-
- PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
- if (!Phi)
- continue;
-
- CntInst = Inst;
- CntPhi = Phi;
- break;
- }
- if (!CntInst)
- return false;
-
- return true;
-}
-
-/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
-/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
-/// trip count returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizeAndInsertFFS() {
- // Give up if the loop has multiple blocks or multiple backedges.
- if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
- return false;
-
- Intrinsic::ID IntrinID;
- Value *InitX;
- Instruction *DefX = nullptr;
- PHINode *CntPhi = nullptr;
- Instruction *CntInst = nullptr;
- // Help decide if transformation is profitable. For ShiftUntilZero idiom,
- // this is always 6.
- size_t IdiomCanonicalSize = 6;
-
- if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
- CntInst, CntPhi, DefX))
- return false;
-
- bool IsCntPhiUsedOutsideLoop = false;
- for (User *U : CntPhi->users())
- if (!CurLoop->contains(cast<Instruction>(U))) {
- IsCntPhiUsedOutsideLoop = true;
- break;
- }
- bool IsCntInstUsedOutsideLoop = false;
- for (User *U : CntInst->users())
- if (!CurLoop->contains(cast<Instruction>(U))) {
- IsCntInstUsedOutsideLoop = true;
- break;
- }
- // If both CntInst and CntPhi are used outside the loop the profitability
- // is questionable.
- if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
- return false;
-
- // For some CPUs the result of the CTLZ(X) intrinsic is undefined
- // when X is 0. If we cannot guarantee X != 0, we need to check for this
- // when expanding the intrinsic.
- bool ZeroCheck = false;
- // It is safe to assume the preheader exists, as it was checked in the
- // parent function runOnLoop.
- BasicBlock *PH = CurLoop->getLoopPreheader();
-
- // If we are using the count instruction outside the loop, make sure we
- // have a zero check as a precondition. Without the check the loop would run
- // one iteration before any check of the input value. Since 0 and 1 would then
- // behave identically, a zero check is needed for the transformed count to be correct.
- if (!IsCntPhiUsedOutsideLoop) {
- auto *PreCondBB = PH->getSinglePredecessor();
- if (!PreCondBB)
- return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- if (!PreCondBI)
- return false;
- if (matchCondition(PreCondBI, PH) != InitX)
- return false;
- ZeroCheck = true;
- }
-
- // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
- // profitable if we delete the loop.
-
- // the loop has only 6 instructions:
- // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
- // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
- // %shr = ashr %n.addr.0, 1
- // %tobool = icmp eq %shr, 0
- // %inc = add nsw %i.0, 1
- // br i1 %tobool
-
- const Value *Args[] = {
- InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
- : ConstantInt::getFalse(InitX->getContext())};
-
- // @llvm.dbg intrinsics don't count as they have no semantic effect.
- auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
- uint32_t HeaderSize =
- std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
-
- IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
- int Cost =
- TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
- if (HeaderSize != IdiomCanonicalSize &&
- Cost > TargetTransformInfo::TCC_Basic)
- return false;
-
- transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
- DefX->getDebugLoc(), ZeroCheck,
- IsCntPhiUsedOutsideLoop);
- return true;
-}
-
-/// Recognizes a population count idiom in a non-countable loop.
-///
-/// If detected, transforms the relevant code to issue the popcount intrinsic
-/// function call, and returns true; otherwise, returns false.
-bool LoopIdiomRecognize::recognizePopcount() {
- if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
- return false;
-
- // Counting the population is usually done with a few arithmetic instructions.
- // Such instructions can be easily "absorbed" by vacant slots in a
- // non-compact loop. Therefore, recognizing the popcount idiom only makes sense
- // in a compact loop.
-
- // Give up if the loop has multiple blocks or multiple backedges.
- if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
- return false;
-
- BasicBlock *LoopBody = *(CurLoop->block_begin());
- if (LoopBody->size() >= 20) {
- // The loop is too big, bail out.
- return false;
- }
-
- // It should have a preheader containing nothing but an unconditional branch.
- BasicBlock *PH = CurLoop->getLoopPreheader();
- if (!PH || &PH->front() != PH->getTerminator())
- return false;
- auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
- if (!EntryBI || EntryBI->isConditional())
- return false;
-
- // It should have a precondition block where the generated popcount intrinsic
- // function can be inserted.
- auto *PreCondBB = PH->getSinglePredecessor();
- if (!PreCondBB)
- return false;
- auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
- if (!PreCondBI || PreCondBI->isUnconditional())
- return false;
-
- Instruction *CntInst;
- PHINode *CntPhi;
- Value *Val;
- if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
- return false;
-
- transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
- return true;
-}
-
-static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
- const DebugLoc &DL) {
- Value *Ops[] = {Val};
- Type *Tys[] = {Val->getType()};
-
- Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
- Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
- CallInst *CI = IRBuilder.CreateCall(Func, Ops);
- CI->setDebugLoc(DL);
-
- return CI;
-}
-
-static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
- const DebugLoc &DL, bool ZeroCheck,
- Intrinsic::ID IID) {
- Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
- Type *Tys[] = {Val->getType()};
-
- Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
- Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
- CallInst *CI = IRBuilder.CreateCall(Func, Ops);
- CI->setDebugLoc(DL);
-
- return CI;
-}
-
-/// Transform the following loop (Using CTLZ, CTTZ is similar):
-/// loop:
-/// CntPhi = PHI [Cnt0, CntInst]
-/// PhiX = PHI [InitX, DefX]
-/// CntInst = CntPhi + 1
-/// DefX = PhiX >> 1
-/// LOOP_BODY
-/// Br: loop if (DefX != 0)
-/// Use(CntPhi) or Use(CntInst)
-///
-/// Into:
-/// If CntPhi used outside the loop:
-/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
-/// Count = CountPrev + 1
-/// else
-/// Count = BitWidth(InitX) - CTLZ(InitX)
-/// loop:
-/// CntPhi = PHI [Cnt0, CntInst]
-/// PhiX = PHI [InitX, DefX]
-/// PhiCount = PHI [Count, Dec]
-/// CntInst = CntPhi + 1
-/// DefX = PhiX >> 1
-/// Dec = PhiCount - 1
-/// LOOP_BODY
-/// Br: loop if (Dec != 0)
-/// Use(CountPrev + Cnt0) // Use(CntPhi)
-/// or
-/// Use(Count + Cnt0) // Use(CntInst)
-///
-/// If LOOP_BODY is empty the loop will be deleted.
-/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
-void LoopIdiomRecognize::transformLoopToCountable(
- Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
- PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
- bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
- BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
-
- // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
- IRBuilder<> Builder(PreheaderBr);
- Builder.SetCurrentDebugLocation(DL);
-
- // Count = BitWidth - CTLZ(InitX);
+ continue;
+
+ PHINode *Phi = getRecurrenceVar(Inst->getOperand(0), Inst, LoopEntry);
+ if (!Phi)
+ continue;
+
+ CntInst = Inst;
+ CntPhi = Phi;
+ break;
+ }
+ if (!CntInst)
+ return false;
+
+ return true;
+}
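For comparison with the popcount case, here is a sketch (not from the vendored sources) of a loop this detector matches. Per step 2, a left shift would select Intrinsic::cttz, while the right shift below selects Intrinsic::ctlz, and the eventual trip count is BitWidth(x0) - ctlz(x0):

unsigned shifts_until_zero(unsigned x0) {
  unsigned cnt = 0;            // cnt0
  unsigned x = x0;             // InitX: PhiX's incoming value from the preheader
  if (x0 != 0) {               // zero check; recognizeAndInsertFFS below requires
                               // it when cnt (CntInst) is used after the loop
    do {
      cnt = cnt + 1;           // CntInst / CntPhi
      x = x >> 1;              // DefX: x.next = x >> 1 (lshr, so ctlz is chosen)
    } while (x != 0);          // loop-back branch
  }
  return cnt;
}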
+
+/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
+/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
+/// trip count returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizeAndInsertFFS() {
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ Intrinsic::ID IntrinID;
+ Value *InitX;
+ Instruction *DefX = nullptr;
+ PHINode *CntPhi = nullptr;
+ Instruction *CntInst = nullptr;
+ // Help decide if transformation is profitable. For ShiftUntilZero idiom,
+ // this is always 6.
+ size_t IdiomCanonicalSize = 6;
+
+ if (!detectShiftUntilZeroIdiom(CurLoop, *DL, IntrinID, InitX,
+ CntInst, CntPhi, DefX))
+ return false;
+
+ bool IsCntPhiUsedOutsideLoop = false;
+ for (User *U : CntPhi->users())
+ if (!CurLoop->contains(cast<Instruction>(U))) {
+ IsCntPhiUsedOutsideLoop = true;
+ break;
+ }
+ bool IsCntInstUsedOutsideLoop = false;
+ for (User *U : CntInst->users())
+ if (!CurLoop->contains(cast<Instruction>(U))) {
+ IsCntInstUsedOutsideLoop = true;
+ break;
+ }
+ // If both CntInst and CntPhi are used outside the loop the profitability
+ // is questionable.
+ if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+ return false;
+
+ // For some CPUs the result of the CTLZ(X) intrinsic is undefined
+ // when X is 0. If we cannot guarantee X != 0, we need to check for this
+ // when expanding the intrinsic.
+ bool ZeroCheck = false;
+ // It is safe to assume the preheader exists, as it was checked in the
+ // parent function runOnLoop.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+
+ // If we are using the count instruction outside the loop, make sure we
+ // have a zero check as a precondition. Without the check the loop would run
+ // one iteration before any check of the input value. Since 0 and 1 would then
+ // behave identically, a zero check is needed for the transformed count to be correct.
+ if (!IsCntPhiUsedOutsideLoop) {
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI)
+ return false;
+ if (matchCondition(PreCondBI, PH) != InitX)
+ return false;
+ ZeroCheck = true;
+ }
+
+ // Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
+ // profitable if we delete the loop.
+
+ // the loop has only 6 instructions:
+ // %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+ // %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+ // %shr = ashr %n.addr.0, 1
+ // %tobool = icmp eq %shr, 0
+ // %inc = add nsw %i.0, 1
+ // br i1 %tobool
+
+ const Value *Args[] = {
+ InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
+
+ // @llvm.dbg intrinsics don't count as they have no semantic effect.
+ auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
+ uint32_t HeaderSize =
+ std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
+
+ IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+ int Cost =
+ TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
+ if (HeaderSize != IdiomCanonicalSize &&
+ Cost > TargetTransformInfo::TCC_Basic)
+ return false;
+
+ transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
+ DefX->getDebugLoc(), ZeroCheck,
+ IsCntPhiUsedOutsideLoop);
+ return true;
+}
+
+/// Recognizes a population count idiom in a non-countable loop.
+///
+/// If detected, transforms the relevant code to issue the popcount intrinsic
+/// function call, and returns true; otherwise, returns false.
+bool LoopIdiomRecognize::recognizePopcount() {
+ if (TTI->getPopcntSupport(32) != TargetTransformInfo::PSK_FastHardware)
+ return false;
+
+ // Counting the population is usually done with a few arithmetic instructions.
+ // Such instructions can be easily "absorbed" by vacant slots in a
+ // non-compact loop. Therefore, recognizing the popcount idiom only makes sense
+ // in a compact loop.
+
+ // Give up if the loop has multiple blocks or multiple backedges.
+ if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+ return false;
+
+ BasicBlock *LoopBody = *(CurLoop->block_begin());
+ if (LoopBody->size() >= 20) {
+ // The loop is too big, bail out.
+ return false;
+ }
+
+ // It should have a preheader containing nothing but an unconditional branch.
+ BasicBlock *PH = CurLoop->getLoopPreheader();
+ if (!PH || &PH->front() != PH->getTerminator())
+ return false;
+ auto *EntryBI = dyn_cast<BranchInst>(PH->getTerminator());
+ if (!EntryBI || EntryBI->isConditional())
+ return false;
+
+ // It should have a precondition block where the generated popcount intrinsic
+ // function can be inserted.
+ auto *PreCondBB = PH->getSinglePredecessor();
+ if (!PreCondBB)
+ return false;
+ auto *PreCondBI = dyn_cast<BranchInst>(PreCondBB->getTerminator());
+ if (!PreCondBI || PreCondBI->isUnconditional())
+ return false;
+
+ Instruction *CntInst;
+ PHINode *CntPhi;
+ Value *Val;
+ if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Val))
+ return false;
+
+ transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Val);
+ return true;
+}
+
+static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL) {
+ Value *Ops[] = {Val};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Function *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
+
+static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
+ const DebugLoc &DL, bool ZeroCheck,
+ Intrinsic::ID IID) {
+ Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()};
+ Type *Tys[] = {Val->getType()};
+
+ Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
+ Function *Func = Intrinsic::getDeclaration(M, IID, Tys);
+ CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+ CI->setDebugLoc(DL);
+
+ return CI;
+}
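A hypothetical usage sketch of the helper above; the wrapper name emitTripCountFFS and the assumption that Preheader, InitX, DefX and IntrinID are already in scope are illustrative only (transformLoopToCountable below is the real caller):

static CallInst *emitTripCountFFS(BasicBlock *Preheader, Value *InitX,
                                  Instruction *DefX, Intrinsic::ID IntrinID) {
  // Insert just before the preheader's terminator, as the transform does.
  IRBuilder<> Builder(Preheader->getTerminator());
  // With ZeroCheck=true the call asserts a non-zero input, producing e.g.
  //   %ffs = call i32 @llvm.cttz.i32(i32 %init, i1 true)
  return createFFSIntrinsic(Builder, InitX, DefX->getDebugLoc(),
                            /*ZeroCheck=*/true, IntrinID);
}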
+
+/// Transform the following loop (Using CTLZ, CTTZ is similar):
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// LOOP_BODY
+/// Br: loop if (DefX != 0)
+/// Use(CntPhi) or Use(CntInst)
+///
+/// Into:
+/// If CntPhi used outside the loop:
+/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
+/// Count = CountPrev + 1
+/// else
+/// Count = BitWidth(InitX) - CTLZ(InitX)
+/// loop:
+/// CntPhi = PHI [Cnt0, CntInst]
+/// PhiX = PHI [InitX, DefX]
+/// PhiCount = PHI [Count, Dec]
+/// CntInst = CntPhi + 1
+/// DefX = PhiX >> 1
+/// Dec = PhiCount - 1
+/// LOOP_BODY
+/// Br: loop if (Dec != 0)
+/// Use(CountPrev + Cnt0) // Use(CntPhi)
+/// or
+/// Use(Count + Cnt0) // Use(CntInst)
+///
+/// If LOOP_BODY is empty the loop will be deleted.
+/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
+void LoopIdiomRecognize::transformLoopToCountable(
+ Intrinsic::ID IntrinID, BasicBlock *Preheader, Instruction *CntInst,
+ PHINode *CntPhi, Value *InitX, Instruction *DefX, const DebugLoc &DL,
+ bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) {
+ BranchInst *PreheaderBr = cast<BranchInst>(Preheader->getTerminator());
+
+ // Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
+ IRBuilder<> Builder(PreheaderBr);
+ Builder.SetCurrentDebugLocation(DL);
+
+ // Count = BitWidth - CTLZ(InitX);
// NewCount = Count;
- // If there are uses of CntPhi create:
+ // If there are uses of CntPhi create:
// NewCount = BitWidth - CTLZ(InitX >> 1);
// Count = NewCount + 1;
Value *InitXNext;
- if (IsCntPhiUsedOutsideLoop) {
- if (DefX->getOpcode() == Instruction::AShr)
- InitXNext =
- Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
- else if (DefX->getOpcode() == Instruction::LShr)
- InitXNext =
- Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
- else if (DefX->getOpcode() == Instruction::Shl) // cttz
- InitXNext =
- Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
- else
- llvm_unreachable("Unexpected opcode!");
- } else
- InitXNext = InitX;
+ if (IsCntPhiUsedOutsideLoop) {
+ if (DefX->getOpcode() == Instruction::AShr)
+ InitXNext =
+ Builder.CreateAShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::LShr)
+ InitXNext =
+ Builder.CreateLShr(InitX, ConstantInt::get(InitX->getType(), 1));
+ else if (DefX->getOpcode() == Instruction::Shl) // cttz
+ InitXNext =
+ Builder.CreateShl(InitX, ConstantInt::get(InitX->getType(), 1));
+ else
+ llvm_unreachable("Unexpected opcode!");
+ } else
+ InitXNext = InitX;
Value *FFS = createFFSIntrinsic(Builder, InitXNext, DL, ZeroCheck, IntrinID);
Value *Count = Builder.CreateSub(
ConstantInt::get(FFS->getType(), FFS->getType()->getIntegerBitWidth()),
- FFS);
+ FFS);
Value *NewCount = Count;
- if (IsCntPhiUsedOutsideLoop) {
+ if (IsCntPhiUsedOutsideLoop) {
NewCount = Count;
Count = Builder.CreateAdd(Count, ConstantInt::get(Count->getType(), 1));
- }
-
+ }
+
NewCount = Builder.CreateZExtOrTrunc(NewCount,
cast<IntegerType>(CntInst->getType()));
-
- Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
+
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader);
if (cast<ConstantInt>(CntInst->getOperand(1))->isOne()) {
// If the counter was being incremented in the loop, add NewCount to the
// counter's initial value, but only if the initial value is not zero.
@@ -1772,153 +1772,153 @@ void LoopIdiomRecognize::transformLoopToCountable(
// the counter's initial value.
NewCount = Builder.CreateSub(CntInitVal, NewCount);
}
-
- // Step 2: Insert new IV and loop condition:
- // loop:
- // ...
- // PhiCount = PHI [Count, Dec]
- // ...
- // Dec = PhiCount - 1
- // ...
- // Br: loop if (Dec != 0)
- BasicBlock *Body = *(CurLoop->block_begin());
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
- ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
- Type *Ty = Count->getType();
-
- PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
-
- Builder.SetInsertPoint(LbCond);
- Instruction *TcDec = cast<Instruction>(
- Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
- "tcdec", false, true));
-
- TcPhi->addIncoming(Count, Preheader);
- TcPhi->addIncoming(TcDec, Body);
-
- CmpInst::Predicate Pred =
- (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
- LbCond->setPredicate(Pred);
- LbCond->setOperand(0, TcDec);
- LbCond->setOperand(1, ConstantInt::get(Ty, 0));
-
- // Step 3: All the references to the original counter outside
- // the loop are replaced with the NewCount
- if (IsCntPhiUsedOutsideLoop)
- CntPhi->replaceUsesOutsideBlock(NewCount, Body);
- else
- CntInst->replaceUsesOutsideBlock(NewCount, Body);
-
- // step 4: Forget the "non-computable" trip-count SCEV associated with the
- // loop. The loop would otherwise not be deleted even if it becomes empty.
- SE->forgetLoop(CurLoop);
-}
-
-void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
- Instruction *CntInst,
- PHINode *CntPhi, Value *Var) {
- BasicBlock *PreHead = CurLoop->getLoopPreheader();
- auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
- const DebugLoc &DL = CntInst->getDebugLoc();
-
- // Assume that before the transformation the loop looks like the following:
- // if (x) // the precondition
- // do { cnt++; x &= x - 1; } while(x);
-
- // Step 1: Insert the ctpop instruction at the end of the precondition block
- IRBuilder<> Builder(PreCondBr);
- Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
- {
- PopCnt = createPopcntIntrinsic(Builder, Var, DL);
- NewCount = PopCntZext =
- Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
-
- if (NewCount != PopCnt)
- (cast<Instruction>(NewCount))->setDebugLoc(DL);
-
- // TripCnt is exactly the number of iterations the loop has
- TripCnt = NewCount;
-
- // If the population counter's initial value is not zero, insert Add Inst.
- Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
- ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
- if (!InitConst || !InitConst->isZero()) {
- NewCount = Builder.CreateAdd(NewCount, CntInitVal);
- (cast<Instruction>(NewCount))->setDebugLoc(DL);
- }
- }
-
- // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
- // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
- // function would be partial dead code, and downstream passes will drag
- // it back from the precondition block to the preheader.
- {
- ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
-
- Value *Opnd0 = PopCntZext;
- Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
- if (PreCond->getOperand(0) != Var)
- std::swap(Opnd0, Opnd1);
-
- ICmpInst *NewPreCond = cast<ICmpInst>(
- Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
- PreCondBr->setCondition(NewPreCond);
-
- RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
- }
-
- // Step 3: Note that the population count is exactly the trip count of the
- // loop in question, which enables us to convert the loop from a noncountable
- // loop into a countable one. The benefit is twofold:
- //
- // - If the loop only counts population, the entire loop becomes dead after
- // the transformation. It is a lot easier to prove a countable loop dead
- // than to prove a noncountable one. (In some C dialects, an infinite loop
- // isn't dead even if it computes nothing useful. In general, DCE needs
- // to prove a noncountable loop finite before it can safely delete it.)
- //
- // - If the loop also performs something else, it remains alive.
- // Since it is transformed to countable form, it can be aggressively
- // optimized by some optimizations which are in general not applicable
- // to a noncountable loop.
- //
- // After this step, this loop (conceptually) would look like the following:
- // newcnt = __builtin_ctpop(x);
- // t = newcnt;
- // if (x)
- // do { cnt++; x &= x-1; t--; } while (t > 0);
- BasicBlock *Body = *(CurLoop->block_begin());
- {
- auto *LbBr = cast<BranchInst>(Body->getTerminator());
- ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
- Type *Ty = TripCnt->getType();
-
- PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
-
- Builder.SetInsertPoint(LbCond);
- Instruction *TcDec = cast<Instruction>(
- Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
- "tcdec", false, true));
-
- TcPhi->addIncoming(TripCnt, PreHead);
- TcPhi->addIncoming(TcDec, Body);
-
- CmpInst::Predicate Pred =
- (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
- LbCond->setPredicate(Pred);
- LbCond->setOperand(0, TcDec);
- LbCond->setOperand(1, ConstantInt::get(Ty, 0));
- }
-
- // Step 4: All the references to the original population counter outside
- // the loop are replaced with the NewCount -- the value returned from
- // __builtin_ctpop().
- CntInst->replaceUsesOutsideBlock(NewCount, Body);
-
- // step 5: Forget the "non-computable" trip-count SCEV associated with the
- // loop. The loop would otherwise not be deleted even if it becomes empty.
- SE->forgetLoop(CurLoop);
-}
+
+ // Step 2: Insert new IV and loop condition:
+ // loop:
+ // ...
+ // PhiCount = PHI [Count, Dec]
+ // ...
+ // Dec = PhiCount - 1
+ // ...
+ // Br: loop if (Dec != 0)
+ BasicBlock *Body = *(CurLoop->block_begin());
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = Count->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(Count, Preheader);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+
+ // Step 3: All the references to the original counter outside
+ // the loop are replaced with the NewCount
+ if (IsCntPhiUsedOutsideLoop)
+ CntPhi->replaceUsesOutsideBlock(NewCount, Body);
+ else
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 4: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
+
+void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
+ Instruction *CntInst,
+ PHINode *CntPhi, Value *Var) {
+ BasicBlock *PreHead = CurLoop->getLoopPreheader();
+ auto *PreCondBr = cast<BranchInst>(PreCondBB->getTerminator());
+ const DebugLoc &DL = CntInst->getDebugLoc();
+
+ // Assume that before the transformation the loop looks like the following:
+ // if (x) // the precondition
+ // do { cnt++; x &= x - 1; } while(x);
+
+ // Step 1: Insert the ctpop instruction at the end of the precondition block
+ IRBuilder<> Builder(PreCondBr);
+ Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+ {
+ PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+ NewCount = PopCntZext =
+ Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+ if (NewCount != PopCnt)
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+ // TripCnt is exactly the number of iterations the loop has
+ TripCnt = NewCount;
+
+ // If the population counter's initial value is not zero, insert Add Inst.
+ Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+ ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+ if (!InitConst || !InitConst->isZero()) {
+ NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+ (cast<Instruction>(NewCount))->setDebugLoc(DL);
+ }
+ }
+
+ // Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
+ // "if (NewCount == 0) loop-exit". Without this change, the intrinsic
+ // function would be partial dead code, and downstream passes will drag
+ // it back from the precondition block to the preheader.
+ {
+ ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+ Value *Opnd0 = PopCntZext;
+ Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+ if (PreCond->getOperand(0) != Var)
+ std::swap(Opnd0, Opnd1);
+
+ ICmpInst *NewPreCond = cast<ICmpInst>(
+ Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+ PreCondBr->setCondition(NewPreCond);
+
+ RecursivelyDeleteTriviallyDeadInstructions(PreCond, TLI);
+ }
+
+ // Step 3: Note that the population count is exactly the trip count of the
+ // loop in question, which enables us to convert the loop from a noncountable
+ // loop into a countable one. The benefit is twofold:
+ //
+ // - If the loop only counts population, the entire loop becomes dead after
+ // the transformation. It is a lot easier to prove a countable loop dead
+ // than to prove a noncountable one. (In some C dialects, an infinite loop
+ // isn't dead even if it computes nothing useful. In general, DCE needs
+ // to prove a noncountable loop finite before it can safely delete it.)
+ //
+ // - If the loop also performs something else, it remains alive.
+ // Since it is transformed to countable form, it can be aggressively
+ // optimized by some optimizations which are in general not applicable
+ // to a noncountable loop.
+ //
+ // After this step, this loop (conceptually) would look like the following:
+ // newcnt = __builtin_ctpop(x);
+ // t = newcnt;
+ // if (x)
+ // do { cnt++; x &= x-1; t--; } while (t > 0);
+ BasicBlock *Body = *(CurLoop->block_begin());
+ {
+ auto *LbBr = cast<BranchInst>(Body->getTerminator());
+ ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+ Type *Ty = TripCnt->getType();
+
+ PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front());
+
+ Builder.SetInsertPoint(LbCond);
+ Instruction *TcDec = cast<Instruction>(
+ Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1),
+ "tcdec", false, true));
+
+ TcPhi->addIncoming(TripCnt, PreHead);
+ TcPhi->addIncoming(TcDec, Body);
+
+ CmpInst::Predicate Pred =
+ (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+ LbCond->setPredicate(Pred);
+ LbCond->setOperand(0, TcDec);
+ LbCond->setOperand(1, ConstantInt::get(Ty, 0));
+ }
+
+ // Step 4: All the references to the original population counter outside
+ // the loop are replaced with the NewCount -- the value returned from
+ // __builtin_ctpop().
+ CntInst->replaceUsesOutsideBlock(NewCount, Body);
+
+ // step 5: Forget the "non-computable" trip-count SCEV associated with the
+ // loop. The loop would otherwise not be deleted even if it becomes empty.
+ SE->forgetLoop(CurLoop);
+}
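To summarize the transformation at the source level, a before/after sketch (conceptual only: the pass operates on IR, and the "after" form is the shape described in Step 3 above, not literal compiler output):

// Before: non-countable counting loop behind the precondition.
unsigned popcount_before(unsigned x) {
  unsigned cnt = 0;
  if (x)
    do { cnt++; x &= x - 1; } while (x);
  return cnt;
}

// After (conceptually): the precondition tests the ctpop result and the loop
// is driven by the countable trip count t; uses outside the loop see NewCount.
unsigned popcount_after(unsigned x) {
  unsigned cnt = 0;
  unsigned t = __builtin_popcount(x);   // createPopcntIntrinsic in the IR
  unsigned newcount = t;                // NewCount (plus cnt's initial value)
  if (newcount != 0)
    do { cnt++; x &= x - 1; t--; } while (t > 0);
  return newcount;
}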
/// Match loop-invariant value.
template <typename SubPattern_t> struct match_LoopInvariant {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 219f7f38b6..3153a87211 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -1,257 +1,257 @@
-//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs lightweight instruction simplification on loop bodies.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/User.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <algorithm>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-instsimplify"
-
-STATISTIC(NumSimplified, "Number of redundant instructions simplified");
-
-static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, const TargetLibraryInfo &TLI,
- MemorySSAUpdater *MSSAU) {
- const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
- SimplifyQuery SQ(DL, &TLI, &DT, &AC);
-
- // On the first pass over the loop body we try to simplify every instruction.
- // On subsequent passes, we can restrict this to only simplifying instructions
- // where the inputs have been updated. We end up needing two sets: one
- // containing the instructions we are simplifying in *this* pass, and one for
- // the instructions we will want to simplify in the *next* pass. We use
- // pointers so we can swap between two stably allocated sets.
- SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
-
- // Track the PHI nodes that have already been visited during each iteration so
- // that we can identify when it is necessary to iterate.
- SmallPtrSet<PHINode *, 4> VisitedPHIs;
-
- // While simplifying we may discover dead code or cause code to become dead.
- // Keep track of all such instructions and we will delete them at the end.
- SmallVector<WeakTrackingVH, 8> DeadInsts;
-
- // First we want to create an RPO traversal of the loop body. By processing in
- // RPO we can ensure that definitions are processed prior to uses (for non PHI
- // uses) in all cases. This ensures we maximize the simplifications in each
- // iteration over the loop and minimizes the possible causes for continuing to
- // iterate.
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
-
- bool Changed = false;
- for (;;) {
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- for (BasicBlock *BB : RPOT) {
- for (Instruction &I : *BB) {
- if (auto *PI = dyn_cast<PHINode>(&I))
- VisitedPHIs.insert(PI);
-
- if (I.use_empty()) {
- if (isInstructionTriviallyDead(&I, &TLI))
- DeadInsts.push_back(&I);
- continue;
- }
-
- // We special case the first iteration which we can detect due to the
- // empty `ToSimplify` set.
- bool IsFirstIteration = ToSimplify->empty();
-
- if (!IsFirstIteration && !ToSimplify->count(&I))
- continue;
-
- Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
- if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
- continue;
-
- for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *UserI = cast<Instruction>(U.getUser());
- U.set(V);
-
- // If the instruction is used by a PHI node we have already processed
- // we'll need to iterate on the loop body to converge, so add it to
- // the next set.
- if (auto *UserPI = dyn_cast<PHINode>(UserI))
- if (VisitedPHIs.count(UserPI)) {
- Next->insert(UserPI);
- continue;
- }
-
- // If we are only simplifying targeted instructions and the user is an
- // instruction in the loop body, add it to our set of targeted
- // instructions. Because we process defs before uses (outside of PHIs)
- // we won't have visited it yet.
- //
- // We also skip any uses outside of the loop being simplified. Those
- // should always be PHI nodes due to LCSSA form, and we don't want to
- // try to simplify those away.
- assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
- "Uses outside the loop should be PHI nodes due to LCSSA!");
- if (!IsFirstIteration && L.contains(UserI))
- ToSimplify->insert(UserI);
- }
-
- if (MSSAU)
- if (Instruction *SimpleI = dyn_cast_or_null<Instruction>(V))
- if (MemoryAccess *MA = MSSA->getMemoryAccess(&I))
- if (MemoryAccess *ReplacementMA = MSSA->getMemoryAccess(SimpleI))
- MA->replaceAllUsesWith(ReplacementMA);
-
- assert(I.use_empty() && "Should always have replaced all uses!");
- if (isInstructionTriviallyDead(&I, &TLI))
- DeadInsts.push_back(&I);
- ++NumSimplified;
- Changed = true;
- }
- }
-
- // Delete any dead instructions found thus far now that we've finished an
- // iteration over all instructions in all the loop blocks.
- if (!DeadInsts.empty()) {
- Changed = true;
- RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // If we never found a PHI that needs to be simplified in the next
- // iteration, we're done.
- if (Next->empty())
- break;
-
- // Otherwise, put the next set in place for the next iteration and reset it
- // and the visited PHIs for that iteration.
- std::swap(Next, ToSimplify);
- Next->clear();
- VisitedPHIs.clear();
- DeadInsts.clear();
- }
-
- return Changed;
-}
-
-namespace {
-
-class LoopInstSimplifyLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- LoopInstSimplifyLegacyPass() : LoopPass(ID) {
- initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
-
- return simplifyLoopInst(*L, DT, LI, AC, TLI,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.setPreservesCFG();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA) {
- MSSAU = MemorySSAUpdater(AR.MSSA);
- if (VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
- }
- if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- PA.preserveSet<CFGAnalyses>();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-char LoopInstSimplifyLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
- "Simplify instructions in loops", false, false)
-
-Pass *llvm::createLoopInstSimplifyPass() {
- return new LoopInstSimplifyLegacyPass();
-}
+//===- LoopInstSimplify.cpp - Loop Instruction Simplification Pass --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs lightweight instruction simplification on loop bodies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopInstSimplify.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/User.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <algorithm>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-instsimplify"
+
+STATISTIC(NumSimplified, "Number of redundant instructions simplified");
+
+static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, const TargetLibraryInfo &TLI,
+ MemorySSAUpdater *MSSAU) {
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ SimplifyQuery SQ(DL, &TLI, &DT, &AC);
+
+ // On the first pass over the loop body we try to simplify every instruction.
+ // On subsequent passes, we can restrict this to only simplifying instructions
+ // where the inputs have been updated. We end up needing two sets: one
+ // containing the instructions we are simplifying in *this* pass, and one for
+ // the instructions we will want to simplify in the *next* pass. We use
+ // pointers so we can swap between two stably allocated sets.
+ SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
+
+ // Track the PHI nodes that have already been visited during each iteration so
+ // that we can identify when it is necessary to iterate.
+ SmallPtrSet<PHINode *, 4> VisitedPHIs;
+
+ // While simplifying we may discover dead code or cause code to become dead.
+ // Keep track of all such instructions and we will delete them at the end.
+ SmallVector<WeakTrackingVH, 8> DeadInsts;
+
+ // First we want to create an RPO traversal of the loop body. By processing in
+ // RPO we can ensure that definitions are processed prior to uses (for non PHI
+ // uses) in all cases. This ensures we maximize the simplifications in each
+ // iteration over the loop and minimizes the possible causes for continuing to
+ // iterate.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ MemorySSA *MSSA = MSSAU ? MSSAU->getMemorySSA() : nullptr;
+
+ bool Changed = false;
+ for (;;) {
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I : *BB) {
+ if (auto *PI = dyn_cast<PHINode>(&I))
+ VisitedPHIs.insert(PI);
+
+ if (I.use_empty()) {
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ continue;
+ }
+
+ // We special case the first iteration which we can detect due to the
+ // empty `ToSimplify` set.
+ bool IsFirstIteration = ToSimplify->empty();
+
+ if (!IsFirstIteration && !ToSimplify->count(&I))
+ continue;
+
+ Value *V = SimplifyInstruction(&I, SQ.getWithInstruction(&I));
+ if (!V || !LI.replacementPreservesLCSSAForm(&I, V))
+ continue;
+
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UserI = cast<Instruction>(U.getUser());
+ U.set(V);
+
+ // If the instruction is used by a PHI node we have already processed
+ // we'll need to iterate on the loop body to converge, so add it to
+ // the next set.
+ if (auto *UserPI = dyn_cast<PHINode>(UserI))
+ if (VisitedPHIs.count(UserPI)) {
+ Next->insert(UserPI);
+ continue;
+ }
+
+ // If we are only simplifying targeted instructions and the user is an
+ // instruction in the loop body, add it to our set of targeted
+ // instructions. Because we process defs before uses (outside of PHIs)
+ // we won't have visited it yet.
+ //
+ // We also skip any uses outside of the loop being simplified. Those
+ // should always be PHI nodes due to LCSSA form, and we don't want to
+ // try to simplify those away.
+ assert((L.contains(UserI) || isa<PHINode>(UserI)) &&
+ "Uses outside the loop should be PHI nodes due to LCSSA!");
+ if (!IsFirstIteration && L.contains(UserI))
+ ToSimplify->insert(UserI);
+ }
+
+ if (MSSAU)
+ if (Instruction *SimpleI = dyn_cast_or_null<Instruction>(V))
+ if (MemoryAccess *MA = MSSA->getMemoryAccess(&I))
+ if (MemoryAccess *ReplacementMA = MSSA->getMemoryAccess(SimpleI))
+ MA->replaceAllUsesWith(ReplacementMA);
+
+ assert(I.use_empty() && "Should always have replaced all uses!");
+ if (isInstructionTriviallyDead(&I, &TLI))
+ DeadInsts.push_back(&I);
+ ++NumSimplified;
+ Changed = true;
+ }
+ }
+
+ // Delete any dead instructions found thus far now that we've finished an
+ // iteration over all instructions in all the loop blocks.
+ if (!DeadInsts.empty()) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, &TLI, MSSAU);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // If we never found a PHI that needs to be simplified in the next
+ // iteration, we're done.
+ if (Next->empty())
+ break;
+
+ // Otherwise, put the next set in place for the next iteration and reset it
+ // and the visited PHIs for that iteration.
+ std::swap(Next, ToSimplify);
+ Next->clear();
+ VisitedPHIs.clear();
+ DeadInsts.clear();
+ }
+
+ return Changed;
+}
+
+namespace {
+
+class LoopInstSimplifyLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopInstSimplifyLegacyPass() : LoopPass(ID) {
+ initializeLoopInstSimplifyLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ MemorySSA *MSSA = nullptr;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ }
+
+ return simplifyLoopInst(*L, DT, LI, AC, TLI,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.setPreservesCFG();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses LoopInstSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+ if (!simplifyLoopInst(L, AR.DT, AR.LI, AR.AC, AR.TLI,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ PA.preserveSet<CFGAnalyses>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+char LoopInstSimplifyLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LoopInstSimplifyLegacyPass, "loop-instsimplify",
+ "Simplify instructions in loops", false, false)
+
+Pass *llvm::createLoopInstSimplifyPass() {
+ return new LoopInstSimplifyLegacyPass();
+}
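The core of simplifyLoopInst above is the two-set worklist scheme its comments describe: simplify everything on the first sweep, then only revisit instructions whose inputs changed, swapping two stably allocated sets between sweeps. The following standalone sketch (not from the LLVM sources; the Visit callback and the sweepToFixpoint name are assumptions) shows the same control structure with standard containers standing in for SmallPtrSet:

#include <unordered_set>
#include <utility>
#include <vector>

// Visit(Item) processes one item and returns the items whose inputs changed
// and therefore need to be revisited on the next sweep.
template <typename T, typename VisitFn>
void sweepToFixpoint(const std::vector<T> &Items, VisitFn Visit) {
  std::unordered_set<T> S1, S2;
  std::unordered_set<T> *ToSimplify = &S1, *Next = &S2;
  for (;;) {
    // The first sweep is detected by an empty ToSimplify set and visits all.
    bool IsFirstSweep = ToSimplify->empty();
    for (const T &I : Items) {
      if (!IsFirstSweep && !ToSimplify->count(I))
        continue;
      for (const T &Changed : Visit(I))
        Next->insert(Changed);
    }
    if (Next->empty())
      break;                      // fixpoint: nothing left to revisit
    std::swap(Next, ToSimplify);  // reuse the two allocations across sweeps
    Next->clear();
  }
}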
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
index 9b23343a0f..d9dbc0deb4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -1,616 +1,616 @@
-//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass handles the loop interchange transform.
-// It interchanges loops to provide more cache-friendly memory access
-// patterns.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopInterchange.cpp - Loop interchange pass-------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass handles the loop interchange transform.
+// It interchanges loops to provide more cache-friendly memory access
+// patterns.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopInterchange.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include <cassert>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-interchange"
-
-STATISTIC(LoopsInterchanged, "Number of loops interchanged");
-
-static cl::opt<int> LoopInterchangeCostThreshold(
- "loop-interchange-threshold", cl::init(0), cl::Hidden,
- cl::desc("Interchange if you gain more than this number"));
-
-namespace {
-
-using LoopVector = SmallVector<Loop *, 8>;
-
-// TODO: Check if we can use a sparse matrix here.
-using CharMatrix = std::vector<std::vector<char>>;
-
-} // end anonymous namespace
-
-// Maximum number of dependencies that can be handled in the dependency matrix.
-static const unsigned MaxMemInstrCount = 100;
-
-// Maximum loop depth supported.
-static const unsigned MaxLoopNestDepth = 10;
-
-#ifdef DUMP_DEP_MATRICIES
-static void printDepMatrix(CharMatrix &DepMatrix) {
- for (auto &Row : DepMatrix) {
- for (auto D : Row)
- LLVM_DEBUG(dbgs() << D << " ");
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-#endif
-
-static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
- Loop *L, DependenceInfo *DI) {
- using ValueVector = SmallVector<Value *, 16>;
-
- ValueVector MemInstr;
-
- // For each block.
- for (BasicBlock *BB : L->blocks()) {
- // Scan the BB and collect legal loads and stores.
- for (Instruction &I : *BB) {
- if (!isa<Instruction>(I))
- return false;
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- if (!Ld->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (auto *St = dyn_cast<StoreInst>(&I)) {
- if (!St->isSimple())
- return false;
- MemInstr.push_back(&I);
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
- << " Loads and Stores to analyze\n");
-
- ValueVector::iterator I, IE, J, JE;
-
- for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
- for (J = I, JE = MemInstr.end(); J != JE; ++J) {
- std::vector<char> Dep;
- Instruction *Src = cast<Instruction>(*I);
- Instruction *Dst = cast<Instruction>(*J);
- if (Src == Dst)
- continue;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- continue;
- // Track Output, Flow, and Anti dependencies.
- if (auto D = DI->depends(Src, Dst, true)) {
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
- LLVM_DEBUG(StringRef DepType =
- D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
- dbgs() << "Found " << DepType
- << " dependency between Src and Dst\n"
- << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
- unsigned Levels = D->getLevels();
- char Direction;
- for (unsigned II = 1; II <= Levels; ++II) {
- const SCEV *Distance = D->getDistance(II);
- const SCEVConstant *SCEVConst =
- dyn_cast_or_null<SCEVConstant>(Distance);
- if (SCEVConst) {
- const ConstantInt *CI = SCEVConst->getValue();
- if (CI->isNegative())
- Direction = '<';
- else if (CI->isZero())
- Direction = '=';
- else
- Direction = '>';
- Dep.push_back(Direction);
- } else if (D->isScalar(II)) {
- Direction = 'S';
- Dep.push_back(Direction);
- } else {
- unsigned Dir = D->getDirection(II);
- if (Dir == Dependence::DVEntry::LT ||
- Dir == Dependence::DVEntry::LE)
- Direction = '<';
- else if (Dir == Dependence::DVEntry::GT ||
- Dir == Dependence::DVEntry::GE)
- Direction = '>';
- else if (Dir == Dependence::DVEntry::EQ)
- Direction = '=';
- else
- Direction = '*';
- Dep.push_back(Direction);
- }
- }
- while (Dep.size() != Level) {
- Dep.push_back('I');
- }
-
- DepMatrix.push_back(Dep);
- if (DepMatrix.size() > MaxMemInstrCount) {
- LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
- << " dependencies inside loop\n");
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
-// A loop is moved from index 'from' to an index 'to'. Update the Dependence
-// matrix by exchanging the two columns.
-static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
- unsigned ToIndx) {
- unsigned numRows = DepMatrix.size();
- for (unsigned i = 0; i < numRows; ++i) {
- char TmpVal = DepMatrix[i][ToIndx];
- DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
- DepMatrix[i][FromIndx] = TmpVal;
- }
-}
-
-// Checks if outermost non '=','S'or'I' dependence in the dependence matrix is
-// '>'
-static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
- unsigned Column) {
- for (unsigned i = 0; i <= Column; ++i) {
- if (DepMatrix[Row][i] == '<')
- return false;
- if (DepMatrix[Row][i] == '>')
- return true;
- }
- // All dependencies were '=','S' or 'I'
- return false;
-}
-
-// Checks if no dependence exist in the dependency matrix in Row before Column.
-static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
- unsigned Column) {
- for (unsigned i = 0; i < Column; ++i) {
- if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
- DepMatrix[Row][i] != 'I')
- return false;
- }
- return true;
-}
-
-static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
- unsigned OuterLoopId, char InnerDep,
- char OuterDep) {
- if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
- return false;
-
- if (InnerDep == OuterDep)
- return true;
-
- // It is legal to interchange if and only if after interchange no row has a
- // '>' direction as the leftmost non-'='.
-
- if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
- return true;
-
- if (InnerDep == '<')
- return true;
-
- if (InnerDep == '>') {
- // If OuterLoopId represents outermost loop then interchanging will make the
- // 1st dependency as '>'
- if (OuterLoopId == 0)
- return false;
-
- // If all dependencies before OuterloopId are '=','S'or 'I'. Then
- // interchanging will result in this row having an outermost non '='
- // dependency of '>'
- if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
- return true;
- }
-
- return false;
-}
-
-// Checks if it is legal to interchange 2 loops.
-// [Theorem] A permutation of the loops in a perfect nest is legal if and only
-// if the direction matrix, after the same permutation is applied to its
-// columns, has no ">" direction as the leftmost non-"=" direction in any row.
-static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
- unsigned InnerLoopId,
- unsigned OuterLoopId) {
- unsigned NumRows = DepMatrix.size();
- // For each row check if it is valid to interchange.
- for (unsigned Row = 0; Row < NumRows; ++Row) {
- char InnerDep = DepMatrix[Row][InnerLoopId];
- char OuterDep = DepMatrix[Row][OuterLoopId];
- if (InnerDep == '*' || OuterDep == '*')
- return false;
- if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
- return false;
- }
- return true;
-}
-
-static LoopVector populateWorklist(Loop &L) {
- LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
- << L.getHeader()->getParent()->getName() << " Loop: %"
- << L.getHeader()->getName() << '\n');
- LoopVector LoopList;
- Loop *CurrentLoop = &L;
- const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
- while (!Vec->empty()) {
- // The current loop has multiple subloops in it hence it is not tightly
- // nested.
- // Discard all loops above it added into Worklist.
- if (Vec->size() != 1)
- return {};
-
- LoopList.push_back(CurrentLoop);
- CurrentLoop = Vec->front();
- Vec = &CurrentLoop->getSubLoops();
- }
- LoopList.push_back(CurrentLoop);
- return LoopList;
-}
-
-static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
- PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
- if (InnerIndexVar)
- return InnerIndexVar;
- if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
- return nullptr;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- PHINode *PhiVar = cast<PHINode>(I);
- Type *PhiTy = PhiVar->getType();
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy())
- return nullptr;
- const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
- if (!AddRec || !AddRec->isAffine())
- continue;
- const SCEV *Step = AddRec->getStepRecurrence(*SE);
- if (!isa<SCEVConstant>(Step))
- continue;
- // Found the induction variable.
- // FIXME: Handle loops with more than one induction variable. Note that,
- // currently, legality makes sure we have only one induction variable.
- return PhiVar;
- }
- return nullptr;
-}
-
-namespace {
-
-/// LoopInterchangeLegality checks if it is legal to interchange the loop.
-class LoopInterchangeLegality {
-public:
- LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
-
- /// Check if the loops can be interchanged.
- bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
- CharMatrix &DepMatrix);
-
- /// Check if the loop structure is understood. We do not handle triangular
- /// loops for now.
- bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
-
- bool currentLimitations();
-
- const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
- return OuterInnerReductions;
- }
-
-private:
- bool tightlyNested(Loop *Outer, Loop *Inner);
- bool containsUnsafeInstructions(BasicBlock *BB);
-
- /// Discover induction and reduction PHIs in the header of \p L. Induction
- /// PHIs are added to \p Inductions, reductions are added to
- /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
- /// to be passed as \p InnerLoop.
- bool findInductionAndReductions(Loop *L,
- SmallVector<PHINode *, 8> &Inductions,
- Loop *InnerLoop);
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- ScalarEvolution *SE;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// Set of reduction PHIs taking part of a reduction across the inner and
- /// outer loop.
- SmallPtrSet<PHINode *, 4> OuterInnerReductions;
-};
-
-/// LoopInterchangeProfitability checks if it is profitable to interchange the
-/// loop.
-class LoopInterchangeProfitability {
-public:
- LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- OptimizationRemarkEmitter *ORE)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
-
- /// Check if the loop interchange is profitable.
- bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
- CharMatrix &DepMatrix);
-
-private:
- int getInstrOrderCost();
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- /// Scev analysis.
- ScalarEvolution *SE;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-};
-
-/// LoopInterchangeTransform interchanges the loop.
-class LoopInterchangeTransform {
-public:
- LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
- LoopInfo *LI, DominatorTree *DT,
- BasicBlock *LoopNestExit,
- const LoopInterchangeLegality &LIL)
- : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
- LoopExit(LoopNestExit), LIL(LIL) {}
-
- /// Interchange OuterLoop and InnerLoop.
- bool transform();
- void restructureLoops(Loop *NewInner, Loop *NewOuter,
- BasicBlock *OrigInnerPreHeader,
- BasicBlock *OrigOuterPreHeader);
- void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
-
-private:
- bool adjustLoopLinks();
- bool adjustLoopBranches();
-
- Loop *OuterLoop;
- Loop *InnerLoop;
-
- /// Scev analysis.
- ScalarEvolution *SE;
-
- LoopInfo *LI;
- DominatorTree *DT;
- BasicBlock *LoopExit;
-
- const LoopInterchangeLegality &LIL;
-};
-
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-interchange"
+
+STATISTIC(LoopsInterchanged, "Number of loops interchanged");
+
+static cl::opt<int> LoopInterchangeCostThreshold(
+ "loop-interchange-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Interchange if you gain more than this number"));
+
+namespace {
+
+using LoopVector = SmallVector<Loop *, 8>;
+
+// TODO: Check if we can use a sparse matrix here.
+using CharMatrix = std::vector<std::vector<char>>;
+
+} // end anonymous namespace
+
+// Maximum number of dependencies that can be handled in the dependency matrix.
+static const unsigned MaxMemInstrCount = 100;
+
+// Maximum loop depth supported.
+static const unsigned MaxLoopNestDepth = 10;
+
+#ifdef DUMP_DEP_MATRICIES
+static void printDepMatrix(CharMatrix &DepMatrix) {
+ for (auto &Row : DepMatrix) {
+ for (auto D : Row)
+ LLVM_DEBUG(dbgs() << D << " ");
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+#endif
+
+static bool populateDependencyMatrix(CharMatrix &DepMatrix, unsigned Level,
+ Loop *L, DependenceInfo *DI) {
+ using ValueVector = SmallVector<Value *, 16>;
+
+ ValueVector MemInstr;
+
+ // For each block.
+ for (BasicBlock *BB : L->blocks()) {
+ // Scan the BB and collect legal loads and stores.
+ for (Instruction &I : *BB) {
+ if (!isa<Instruction>(I))
+ return false;
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Found " << MemInstr.size()
+ << " Loads and Stores to analyze\n");
+
+ ValueVector::iterator I, IE, J, JE;
+
+ for (I = MemInstr.begin(), IE = MemInstr.end(); I != IE; ++I) {
+ for (J = I, JE = MemInstr.end(); J != JE; ++J) {
+ std::vector<char> Dep;
+ Instruction *Src = cast<Instruction>(*I);
+ Instruction *Dst = cast<Instruction>(*J);
+ if (Src == Dst)
+ continue;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ continue;
+ // Track Output, Flow, and Anti dependencies.
+ if (auto D = DI->depends(Src, Dst, true)) {
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+ LLVM_DEBUG(StringRef DepType =
+ D->isFlow() ? "flow" : D->isAnti() ? "anti" : "output";
+ dbgs() << "Found " << DepType
+ << " dependency between Src and Dst\n"
+ << " Src:" << *Src << "\n Dst:" << *Dst << '\n');
+ unsigned Levels = D->getLevels();
+ char Direction;
+ for (unsigned II = 1; II <= Levels; ++II) {
+ const SCEV *Distance = D->getDistance(II);
+ const SCEVConstant *SCEVConst =
+ dyn_cast_or_null<SCEVConstant>(Distance);
+ if (SCEVConst) {
+ const ConstantInt *CI = SCEVConst->getValue();
+ if (CI->isNegative())
+ Direction = '<';
+ else if (CI->isZero())
+ Direction = '=';
+ else
+ Direction = '>';
+ Dep.push_back(Direction);
+ } else if (D->isScalar(II)) {
+ Direction = 'S';
+ Dep.push_back(Direction);
+ } else {
+ unsigned Dir = D->getDirection(II);
+ if (Dir == Dependence::DVEntry::LT ||
+ Dir == Dependence::DVEntry::LE)
+ Direction = '<';
+ else if (Dir == Dependence::DVEntry::GT ||
+ Dir == Dependence::DVEntry::GE)
+ Direction = '>';
+ else if (Dir == Dependence::DVEntry::EQ)
+ Direction = '=';
+ else
+ Direction = '*';
+ Dep.push_back(Direction);
+ }
+ }
+ while (Dep.size() != Level) {
+ Dep.push_back('I');
+ }
+
+ DepMatrix.push_back(Dep);
+ if (DepMatrix.size() > MaxMemInstrCount) {
+ LLVM_DEBUG(dbgs() << "Cannot handle more than " << MaxMemInstrCount
+ << " dependencies inside loop\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
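populateDependencyMatrix produces one row per ordered pair of memory accesses that DependenceAnalysis can relate, with one column per loop level and entries drawn from '<', '>', '=', 'S', 'I' and '*'. As an illustrative sketch (the exact '<' vs '>' character depends on the sign convention of getDistance), the single flow dependence in the following nest yields one row whose outer entry is '=' (distance 0 in i) and whose inner entry is a directional character for the constant distance of 1 in j:

enum { N = 64, M = 64 };
int A[N][M];

void innerCarriedDependence() {
  for (int i = 0; i < N; ++i)      // outer level: distance 0 -> '='
    for (int j = 1; j < M; ++j)    // inner level: constant distance 1
      A[i][j] = A[i][j - 1] + 1;   // value stored at j is read at j + 1
}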
+
+// A loop is moved from index 'from' to an index 'to'. Update the Dependence
+// matrix by exchanging the two columns.
+static void interChangeDependencies(CharMatrix &DepMatrix, unsigned FromIndx,
+ unsigned ToIndx) {
+ unsigned numRows = DepMatrix.size();
+ for (unsigned i = 0; i < numRows; ++i) {
+ char TmpVal = DepMatrix[i][ToIndx];
+ DepMatrix[i][ToIndx] = DepMatrix[i][FromIndx];
+ DepMatrix[i][FromIndx] = TmpVal;
+ }
+}
+
+// Checks if the outermost non-'=', non-'S', non-'I' dependence in the
+// dependence matrix is '>'.
+static bool isOuterMostDepPositive(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i <= Column; ++i) {
+ if (DepMatrix[Row][i] == '<')
+ return false;
+ if (DepMatrix[Row][i] == '>')
+ return true;
+ }
+ // All dependencies were '=','S' or 'I'
+ return false;
+}
+
+// Checks that no dependence exists in the dependency matrix in Row before Column.
+static bool containsNoDependence(CharMatrix &DepMatrix, unsigned Row,
+ unsigned Column) {
+ for (unsigned i = 0; i < Column; ++i) {
+ if (DepMatrix[Row][i] != '=' && DepMatrix[Row][i] != 'S' &&
+ DepMatrix[Row][i] != 'I')
+ return false;
+ }
+ return true;
+}
+
+static bool validDepInterchange(CharMatrix &DepMatrix, unsigned Row,
+ unsigned OuterLoopId, char InnerDep,
+ char OuterDep) {
+ if (isOuterMostDepPositive(DepMatrix, Row, OuterLoopId))
+ return false;
+
+ if (InnerDep == OuterDep)
+ return true;
+
+ // It is legal to interchange if and only if after interchange no row has a
+ // '>' direction as the leftmost non-'='.
+
+ if (InnerDep == '=' || InnerDep == 'S' || InnerDep == 'I')
+ return true;
+
+ if (InnerDep == '<')
+ return true;
+
+ if (InnerDep == '>') {
+ // If OuterLoopId represents the outermost loop, then interchanging will
+ // make the first dependency '>'.
+ if (OuterLoopId == 0)
+ return false;
+
+ // If all dependencies before OuterLoopId are '=', 'S' or 'I', then
+ // interchanging will result in this row having an outermost non-'='
+ // dependency of '>'.
+ if (!containsNoDependence(DepMatrix, Row, OuterLoopId))
+ return true;
+ }
+
+ return false;
+}
+
+// Checks if it is legal to interchange 2 loops.
+// [Theorem] A permutation of the loops in a perfect nest is legal if and only
+// if the direction matrix, after the same permutation is applied to its
+// columns, has no ">" direction as the leftmost non-"=" direction in any row.
+static bool isLegalToInterChangeLoops(CharMatrix &DepMatrix,
+ unsigned InnerLoopId,
+ unsigned OuterLoopId) {
+ unsigned NumRows = DepMatrix.size();
+ // For each row check if it is valid to interchange.
+ for (unsigned Row = 0; Row < NumRows; ++Row) {
+ char InnerDep = DepMatrix[Row][InnerLoopId];
+ char OuterDep = DepMatrix[Row][OuterLoopId];
+ if (InnerDep == '*' || OuterDep == '*')
+ return false;
+ if (!validDepInterchange(DepMatrix, Row, OuterLoopId, InnerDep, OuterDep))
+ return false;
+ }
+ return true;
+}
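A worked example of the theorem: take a two-level nest whose direction matrix has the single row ['<', '>'] with columns ordered (outer, inner). Interchanging the loops permutes the row to ['>', '<'], whose leftmost non-'=' entry is '>', so the interchange is rejected. A row of ['=', '<'] permutes to ['<', '='] and stays legal, and rows consisting only of '=', 'S' and 'I' are ignored because the helpers above treat 'S' and 'I' like '='.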
+
+static LoopVector populateWorklist(Loop &L) {
+ LLVM_DEBUG(dbgs() << "Calling populateWorklist on Func: "
+ << L.getHeader()->getParent()->getName() << " Loop: %"
+ << L.getHeader()->getName() << '\n');
+ LoopVector LoopList;
+ Loop *CurrentLoop = &L;
+ const std::vector<Loop *> *Vec = &CurrentLoop->getSubLoops();
+ while (!Vec->empty()) {
+ // The current loop has multiple subloops, hence it is not tightly
+ // nested.
+ // Discard all loops above it that were added to the worklist.
+ if (Vec->size() != 1)
+ return {};
+
+ LoopList.push_back(CurrentLoop);
+ CurrentLoop = Vec->front();
+ Vec = &CurrentLoop->getSubLoops();
+ }
+ LoopList.push_back(CurrentLoop);
+ return LoopList;
+}
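An illustrative sketch of the two shapes populateWorklist distinguishes; the function and array names are made up for the example:

void accepted(int A[8][8][8]) {
  for (int i = 0; i < 8; ++i)      // every level has exactly one subloop, so
    for (int j = 0; j < 8; ++j)    // the chain i -> j -> k is returned as the
      for (int k = 0; k < 8; ++k)  // LoopVector
        A[i][j][k] = 0;
}

void rejected(int A[8][8], int B[8][8]) {
  for (int i = 0; i < 8; ++i) {    // two sibling subloops under i, so
    for (int j = 0; j < 8; ++j)    // Vec->size() != 1 and an empty worklist
      A[i][j] = 0;                 // is returned
    for (int k = 0; k < 8; ++k)
      B[i][k] = 0;
  }
}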
+
+static PHINode *getInductionVariable(Loop *L, ScalarEvolution *SE) {
+ PHINode *InnerIndexVar = L->getCanonicalInductionVariable();
+ if (InnerIndexVar)
+ return InnerIndexVar;
+ if (L->getLoopLatch() == nullptr || L->getLoopPredecessor() == nullptr)
+ return nullptr;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PhiVar = cast<PHINode>(I);
+ Type *PhiTy = PhiVar->getType();
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy())
+ return nullptr;
+ const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(PhiVar));
+ if (!AddRec || !AddRec->isAffine())
+ continue;
+ const SCEV *Step = AddRec->getStepRecurrence(*SE);
+ if (!isa<SCEVConstant>(Step))
+ continue;
+ // Found the induction variable.
+ // FIXME: Handle loops with more than one induction variable. Note that,
+ // currently, legality makes sure we have only one induction variable.
+ return PhiVar;
+ }
+ return nullptr;
+}
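The search above accepts any header PHI whose SCEV is an affine add-recurrence with a constant step, not only the canonical 0,+1 form. An illustrative sketch (names are made up):

void affineStep(int *A, int n) {
  for (int i = 0; i < n; i += 2)   // i is the add-recurrence {0,+,2}: affine
    A[i] = 0;                      // with a constant step, so its PHI is used
}

void nonAffineStep(int *A, int n) {
  for (int i = 1; i < n; i *= 2)   // i doubles each iteration: not an affine
    A[i] = 0;                      // add-recurrence, so this PHI is skipped
}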
+
+namespace {
+
+/// LoopInterchangeLegality checks if it is legal to interchange the loop.
+class LoopInterchangeLegality {
+public:
+ LoopInterchangeLegality(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+
+ /// Check if the loops can be interchanged.
+ bool canInterchangeLoops(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+ /// Check if the loop structure is understood. We do not handle triangular
+ /// loops for now.
+ bool isLoopStructureUnderstood(PHINode *InnerInductionVar);
+
+ bool currentLimitations();
+
+ const SmallPtrSetImpl<PHINode *> &getOuterInnerReductions() const {
+ return OuterInnerReductions;
+ }
+
+private:
+ bool tightlyNested(Loop *Outer, Loop *Inner);
+ bool containsUnsafeInstructions(BasicBlock *BB);
+
+ /// Discover induction and reduction PHIs in the header of \p L. Induction
+ /// PHIs are added to \p Inductions, reductions are added to
+ /// OuterInnerReductions. When the outer loop is passed, the inner loop needs
+ /// to be passed as \p InnerLoop.
+ bool findInductionAndReductions(Loop *L,
+ SmallVector<PHINode *, 8> &Inductions,
+ Loop *InnerLoop);
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ ScalarEvolution *SE;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// Set of reduction PHIs taking part of a reduction across the inner and
+ /// outer loop.
+ SmallPtrSet<PHINode *, 4> OuterInnerReductions;
+};
+
+/// LoopInterchangeProfitability checks if it is profitable to interchange the
+/// loop.
+class LoopInterchangeProfitability {
+public:
+ LoopInterchangeProfitability(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ OptimizationRemarkEmitter *ORE)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), ORE(ORE) {}
+
+ /// Check if the loop interchange is profitable.
+ bool isProfitable(unsigned InnerLoopId, unsigned OuterLoopId,
+ CharMatrix &DepMatrix);
+
+private:
+ int getInstrOrderCost();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+};
+
+/// LoopInterchangeTransform interchanges the loop.
+class LoopInterchangeTransform {
+public:
+ LoopInterchangeTransform(Loop *Outer, Loop *Inner, ScalarEvolution *SE,
+ LoopInfo *LI, DominatorTree *DT,
+ BasicBlock *LoopNestExit,
+ const LoopInterchangeLegality &LIL)
+ : OuterLoop(Outer), InnerLoop(Inner), SE(SE), LI(LI), DT(DT),
+ LoopExit(LoopNestExit), LIL(LIL) {}
+
+ /// Interchange OuterLoop and InnerLoop.
+ bool transform();
+ void restructureLoops(Loop *NewInner, Loop *NewOuter,
+ BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader);
+ void removeChildLoop(Loop *OuterLoop, Loop *InnerLoop);
+
+private:
+ bool adjustLoopLinks();
+ bool adjustLoopBranches();
+
+ Loop *OuterLoop;
+ Loop *InnerLoop;
+
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ LoopInfo *LI;
+ DominatorTree *DT;
+ BasicBlock *LoopExit;
+
+ const LoopInterchangeLegality &LIL;
+};
+
struct LoopInterchange {
- ScalarEvolution *SE = nullptr;
- LoopInfo *LI = nullptr;
- DependenceInfo *DI = nullptr;
- DominatorTree *DT = nullptr;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
+ ScalarEvolution *SE = nullptr;
+ LoopInfo *LI = nullptr;
+ DependenceInfo *DI = nullptr;
+ DominatorTree *DT = nullptr;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
LoopInterchange(ScalarEvolution *SE, LoopInfo *LI, DependenceInfo *DI,
DominatorTree *DT, OptimizationRemarkEmitter *ORE)
: SE(SE), LI(LI), DI(DI), DT(DT), ORE(ORE) {}
-
+
bool run(Loop *L) {
if (L->getParentLoop())
- return false;
-
- return processLoopList(populateWorklist(*L));
- }
-
- bool isComputableLoopNest(LoopVector LoopList) {
- for (Loop *L : LoopList) {
- const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
+ return false;
+
+ return processLoopList(populateWorklist(*L));
+ }
+
+ bool isComputableLoopNest(LoopVector LoopList) {
+ for (Loop *L : LoopList) {
+ const SCEV *ExitCountOuter = SE->getBackedgeTakenCount(L);
if (isa<SCEVCouldNotCompute>(ExitCountOuter)) {
- LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
- return false;
- }
- if (L->getNumBackEdges() != 1) {
- LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
- return false;
- }
- if (!L->getExitingBlock()) {
- LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
- return false;
- }
- }
- return true;
- }
-
- unsigned selectLoopForInterchange(const LoopVector &LoopList) {
- // TODO: Add a better heuristic to select the loop to be interchanged based
- // on the dependence matrix. Currently we select the innermost loop.
- return LoopList.size() - 1;
- }
-
- bool processLoopList(LoopVector LoopList) {
- bool Changed = false;
- unsigned LoopNestDepth = LoopList.size();
- if (LoopNestDepth < 2) {
- LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
- return false;
- }
- if (LoopNestDepth > MaxLoopNestDepth) {
- LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
- << MaxLoopNestDepth << "\n");
- return false;
- }
- if (!isComputableLoopNest(LoopList)) {
- LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
- << "\n");
-
- CharMatrix DependencyMatrix;
- Loop *OuterMostLoop = *(LoopList.begin());
- if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
- OuterMostLoop, DI)) {
- LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
- return false;
- }
-#ifdef DUMP_DEP_MATRICIES
- LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
- printDepMatrix(DependencyMatrix);
-#endif
-
- // Get the Outermost loop exit.
- BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
- if (!LoopNestExit) {
- LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
- return false;
- }
-
- unsigned SelecLoopId = selectLoopForInterchange(LoopList);
- // Move the selected loop outwards to the best possible position.
- for (unsigned i = SelecLoopId; i > 0; i--) {
- bool Interchanged =
- processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
- if (!Interchanged)
- return Changed;
- // Loops interchanged reflect the same in LoopList
- std::swap(LoopList[i - 1], LoopList[i]);
-
- // Update the DependencyMatrix
- interChangeDependencies(DependencyMatrix, i, i - 1);
-#ifdef DUMP_DEP_MATRICIES
- LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
- printDepMatrix(DependencyMatrix);
-#endif
- Changed |= Interchanged;
- }
- return Changed;
- }
-
- bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
- unsigned OuterLoopId, BasicBlock *LoopNestExit,
- std::vector<std::vector<char>> &DependencyMatrix) {
- LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId << "\n");
- Loop *InnerLoop = LoopList[InnerLoopId];
- Loop *OuterLoop = LoopList[OuterLoopId];
-
- LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
- if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
- return false;
- }
- LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
- LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
- if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
- LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
- return false;
- }
-
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Interchanged",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Loop interchanged with enclosing loop.";
- });
-
- LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
- LIL);
- LIT.transform();
- LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
- LoopsInterchanged++;
-
- assert(InnerLoop->isLCSSAForm(*DT) &&
- "Inner loop not left in LCSSA form after loop interchange!");
- assert(OuterLoop->isLCSSAForm(*DT) &&
- "Outer loop not left in LCSSA form after loop interchange!");
-
- return true;
- }
-};
-
-} // end anonymous namespace
-
-bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
- return any_of(*BB, [](const Instruction &I) {
- return I.mayHaveSideEffects() || I.mayReadFromMemory();
- });
-}
-
-bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
-
- LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
-
- // A perfectly nested loop will not have any branch in between the outer and
- // inner block i.e. outer header will branch to either inner preheader and
- // outerloop latch.
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- if (!OuterLoopHeaderBI)
- return false;
-
- for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
- if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
- Succ != OuterLoopLatch)
- return false;
-
- LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
- // We do not have any basic block in between now make sure the outer header
- // and outer loop latch doesn't contain any unsafe instructions.
- if (containsUnsafeInstructions(OuterLoopHeader) ||
- containsUnsafeInstructions(OuterLoopLatch))
- return false;
-
+ LLVM_DEBUG(dbgs() << "Couldn't compute backedge count\n");
+ return false;
+ }
+ if (L->getNumBackEdges() != 1) {
+ LLVM_DEBUG(dbgs() << "NumBackEdges is not equal to 1\n");
+ return false;
+ }
+ if (!L->getExitingBlock()) {
+ LLVM_DEBUG(dbgs() << "Loop doesn't have unique exit block\n");
+ return false;
+ }
+ }
+ return true;
+ }
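A nest is rejected by this check when any level has an uncomputable trip count, more than one backedge, or no unique exiting block. A sketch of a loop that typically fails the first test, because its trip count depends on memory contents:

int countUntilZero(const int *A) {
  int i = 0;
  while (A[i] != 0)   // the backedge-taken count depends on the data, so
    ++i;              // ScalarEvolution usually returns SCEVCouldNotCompute
  return i;
}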
+
+ unsigned selectLoopForInterchange(const LoopVector &LoopList) {
+ // TODO: Add a better heuristic to select the loop to be interchanged based
+ // on the dependence matrix. Currently we select the innermost loop.
+ return LoopList.size() - 1;
+ }
+
+ bool processLoopList(LoopVector LoopList) {
+ bool Changed = false;
+ unsigned LoopNestDepth = LoopList.size();
+ if (LoopNestDepth < 2) {
+ LLVM_DEBUG(dbgs() << "Loop doesn't contain minimum nesting level.\n");
+ return false;
+ }
+ if (LoopNestDepth > MaxLoopNestDepth) {
+ LLVM_DEBUG(dbgs() << "Cannot handle loops of depth greater than "
+ << MaxLoopNestDepth << "\n");
+ return false;
+ }
+ if (!isComputableLoopNest(LoopList)) {
+ LLVM_DEBUG(dbgs() << "Not valid loop candidate for interchange\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Processing LoopList of size = " << LoopNestDepth
+ << "\n");
+
+ CharMatrix DependencyMatrix;
+ Loop *OuterMostLoop = *(LoopList.begin());
+ if (!populateDependencyMatrix(DependencyMatrix, LoopNestDepth,
+ OuterMostLoop, DI)) {
+ LLVM_DEBUG(dbgs() << "Populating dependency matrix failed\n");
+ return false;
+ }
+#ifdef DUMP_DEP_MATRICIES
+ LLVM_DEBUG(dbgs() << "Dependence before interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+
+ // Get the Outermost loop exit.
+ BasicBlock *LoopNestExit = OuterMostLoop->getExitBlock();
+ if (!LoopNestExit) {
+ LLVM_DEBUG(dbgs() << "OuterMostLoop needs an unique exit block");
+ return false;
+ }
+
+ unsigned SelecLoopId = selectLoopForInterchange(LoopList);
+ // Move the selected loop outwards to the best possible position.
+ for (unsigned i = SelecLoopId; i > 0; i--) {
+ bool Interchanged =
+ processLoop(LoopList, i, i - 1, LoopNestExit, DependencyMatrix);
+ if (!Interchanged)
+ return Changed;
+ // Loops were interchanged; reflect the same in LoopList.
+ std::swap(LoopList[i - 1], LoopList[i]);
+
+ // Update the DependencyMatrix
+ interChangeDependencies(DependencyMatrix, i, i - 1);
+#ifdef DUMP_DEP_MATRICIES
+ LLVM_DEBUG(dbgs() << "Dependence after interchange\n");
+ printDepMatrix(DependencyMatrix);
+#endif
+ Changed |= Interchanged;
+ }
+ return Changed;
+ }
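As a concrete walk-through: for a triply nested loop the selected loop is the innermost one (index 2), so the driver first attempts the interchange of levels (2, 1) and, only if that succeeds, (1, 0). After each successful step the two entries of LoopList and the two corresponding columns of the dependency matrix are swapped, so the next legality query sees the already-interchanged nest.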
+
+ bool processLoop(LoopVector LoopList, unsigned InnerLoopId,
+ unsigned OuterLoopId, BasicBlock *LoopNestExit,
+ std::vector<std::vector<char>> &DependencyMatrix) {
+ LLVM_DEBUG(dbgs() << "Processing Inner Loop Id = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId << "\n");
+ Loop *InnerLoop = LoopList[InnerLoopId];
+ Loop *OuterLoop = LoopList[OuterLoopId];
+
+ LoopInterchangeLegality LIL(OuterLoop, InnerLoop, SE, ORE);
+ if (!LIL.canInterchangeLoops(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ LLVM_DEBUG(dbgs() << "Not interchanging loops. Cannot prove legality.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "Loops are legal to interchange\n");
+ LoopInterchangeProfitability LIP(OuterLoop, InnerLoop, SE, ORE);
+ if (!LIP.isProfitable(InnerLoopId, OuterLoopId, DependencyMatrix)) {
+ LLVM_DEBUG(dbgs() << "Interchanging loops not profitable.\n");
+ return false;
+ }
+
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Interchanged",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Loop interchanged with enclosing loop.";
+ });
+
+ LoopInterchangeTransform LIT(OuterLoop, InnerLoop, SE, LI, DT, LoopNestExit,
+ LIL);
+ LIT.transform();
+ LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
+ LoopsInterchanged++;
+
+ assert(InnerLoop->isLCSSAForm(*DT) &&
+ "Inner loop not left in LCSSA form after loop interchange!");
+ assert(OuterLoop->isLCSSAForm(*DT) &&
+ "Outer loop not left in LCSSA form after loop interchange!");
+
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+bool LoopInterchangeLegality::containsUnsafeInstructions(BasicBlock *BB) {
+ return any_of(*BB, [](const Instruction &I) {
+ return I.mayHaveSideEffects() || I.mayReadFromMemory();
+ });
+}
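For the tightly-nested check, any instruction that may read memory or may have side effects marks a block as unsafe. An illustrative sketch:

#include <cstdio>

void unsafeBlock(int *p) {
  int x = *p;            // a plain load: mayReadFromMemory() is true, so even
  std::printf("%d", x);  // a read-only block counts as unsafe; the call also
}                        // satisfies mayHaveSideEffects()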
+
+bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+
+ LLVM_DEBUG(dbgs() << "Checking if loops are tightly nested\n");
+
+ // A perfectly nested loop will not have any branch in between the outer and
+ // inner block, i.e. the outer header will branch only to the inner preheader
+ // or the outer loop latch.
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ if (!OuterLoopHeaderBI)
+ return false;
+
+ for (BasicBlock *Succ : successors(OuterLoopHeaderBI))
+ if (Succ != InnerLoopPreHeader && Succ != InnerLoop->getHeader() &&
+ Succ != OuterLoopLatch)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Checking instructions in Loop header and Loop latch\n");
+ // We do not have any basic block in between; now make sure the outer header
+ // and the outer loop latch do not contain any unsafe instructions.
+ if (containsUnsafeInstructions(OuterLoopHeader) ||
+ containsUnsafeInstructions(OuterLoopLatch))
+ return false;
+
// Also make sure the inner loop preheader does not contain any unsafe
// instructions. Note that all instructions in the preheader will be moved to
// the outer loop header when interchanging.
@@ -618,694 +618,694 @@ bool LoopInterchangeLegality::tightlyNested(Loop *OuterLoop, Loop *InnerLoop) {
containsUnsafeInstructions(InnerLoopPreHeader))
return false;
- LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
- // We have a perfect loop nest.
- return true;
-}
-
-bool LoopInterchangeLegality::isLoopStructureUnderstood(
- PHINode *InnerInduction) {
- unsigned Num = InnerInduction->getNumOperands();
- BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
- for (unsigned i = 0; i < Num; ++i) {
- Value *Val = InnerInduction->getOperand(i);
- if (isa<Constant>(Val))
- continue;
- Instruction *I = dyn_cast<Instruction>(Val);
- if (!I)
- return false;
- // TODO: Handle triangular loops.
- // e.g. for(int i=0;i<N;i++)
- // for(int j=i;j<N;j++)
- unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
- if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
- InnerLoopPreheader &&
- !OuterLoop->isLoopInvariant(I)) {
- return false;
- }
- }
- return true;
-}
-
-// If SV is a LCSSA PHI node with a single incoming value, return the incoming
-// value.
-static Value *followLCSSA(Value *SV) {
- PHINode *PHI = dyn_cast<PHINode>(SV);
- if (!PHI)
- return SV;
-
- if (PHI->getNumIncomingValues() != 1)
- return SV;
- return followLCSSA(PHI->getIncomingValue(0));
-}
-
-// Check V's users to see if it is involved in a reduction in L.
-static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
+ LLVM_DEBUG(dbgs() << "Loops are perfectly nested\n");
+ // We have a perfect loop nest.
+ return true;
+}
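An illustrative source-level sketch of the distinction this check draws; in the second nest the extra store typically ends up in the outer header region or in a block between the two loops, which keeps the nest from being perfectly nested:

void tight(int A[8][8]) {
  for (int i = 0; i < 8; ++i)
    for (int j = 0; j < 8; ++j)
      A[i][j] = i + j;            // only the inner body touches memory
}

void notTight(int A[8][8], int *S) {
  for (int i = 0; i < 8; ++i) {
    S[i] = 0;                     // extra work per outer iteration defeats
    for (int j = 0; j < 8; ++j)   // the perfect-nest requirement
      A[i][j] = i + j;
  }
}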
+
+bool LoopInterchangeLegality::isLoopStructureUnderstood(
+ PHINode *InnerInduction) {
+ unsigned Num = InnerInduction->getNumOperands();
+ BasicBlock *InnerLoopPreheader = InnerLoop->getLoopPreheader();
+ for (unsigned i = 0; i < Num; ++i) {
+ Value *Val = InnerInduction->getOperand(i);
+ if (isa<Constant>(Val))
+ continue;
+ Instruction *I = dyn_cast<Instruction>(Val);
+ if (!I)
+ return false;
+ // TODO: Handle triangular loops.
+ // e.g. for(int i=0;i<N;i++)
+ // for(int j=i;j<N;j++)
+ unsigned IncomBlockIndx = PHINode::getIncomingValueNumForOperand(i);
+ if (InnerInduction->getIncomingBlock(IncomBlockIndx) ==
+ InnerLoopPreheader &&
+ !OuterLoop->isLoopInvariant(I)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+// If SV is a LCSSA PHI node with a single incoming value, return the incoming
+// value.
+static Value *followLCSSA(Value *SV) {
+ PHINode *PHI = dyn_cast<PHINode>(SV);
+ if (!PHI)
+ return SV;
+
+ if (PHI->getNumIncomingValues() != 1)
+ return SV;
+ return followLCSSA(PHI->getIncomingValue(0));
+}
+
+// Check V's users to see if it is involved in a reduction in L.
+static PHINode *findInnerReductionPhi(Loop *L, Value *V) {
// Reduction variables cannot be constants.
if (isa<Constant>(V))
return nullptr;
- for (Value *User : V->users()) {
- if (PHINode *PHI = dyn_cast<PHINode>(User)) {
- if (PHI->getNumIncomingValues() == 1)
- continue;
- RecurrenceDescriptor RD;
- if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
- return PHI;
- return nullptr;
- }
- }
-
- return nullptr;
-}
-
-bool LoopInterchangeLegality::findInductionAndReductions(
- Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
- if (!L->getLoopLatch() || !L->getLoopPredecessor())
- return false;
- for (PHINode &PHI : L->getHeader()->phis()) {
- RecurrenceDescriptor RD;
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
- Inductions.push_back(&PHI);
- else {
- // PHIs in inner loops need to be part of a reduction in the outer loop,
- // discovered when checking the PHIs of the outer loop earlier.
- if (!InnerLoop) {
- if (!OuterInnerReductions.count(&PHI)) {
- LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
- "across the outer loop.\n");
- return false;
- }
- } else {
- assert(PHI.getNumIncomingValues() == 2 &&
- "Phis in loop header should have exactly 2 incoming values");
- // Check if we have a PHI node in the outer loop that has a reduction
- // result from the inner loop as an incoming value.
- Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
- PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
- if (!InnerRedPhi ||
+ for (Value *User : V->users()) {
+ if (PHINode *PHI = dyn_cast<PHINode>(User)) {
+ if (PHI->getNumIncomingValues() == 1)
+ continue;
+ RecurrenceDescriptor RD;
+ if (RecurrenceDescriptor::isReductionPHI(PHI, L, RD))
+ return PHI;
+ return nullptr;
+ }
+ }
+
+ return nullptr;
+}
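findInnerReductionPhi is what lets a sum carried across both loops pass the legality checks: the outer header PHI reaches the inner header PHI, the inner latch value flows back out through an LCSSA PHI, and both header PHIs are meant to land in OuterInnerReductions. An illustrative source-level sketch:

int nestSum(int A[8][8]) {
  int sum = 0;                    // sum has one PHI in the outer header and
  for (int i = 0; i < 8; ++i)     // one in the inner header; the inner PHI is
    for (int j = 0; j < 8; ++j)   // recognized via RecurrenceDescriptor, so
      sum += A[i][j];             // the pair is recorded as an outer/inner
  return sum;                     // reduction
}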
+
+bool LoopInterchangeLegality::findInductionAndReductions(
+ Loop *L, SmallVector<PHINode *, 8> &Inductions, Loop *InnerLoop) {
+ if (!L->getLoopLatch() || !L->getLoopPredecessor())
+ return false;
+ for (PHINode &PHI : L->getHeader()->phis()) {
+ RecurrenceDescriptor RD;
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&PHI, L, SE, ID))
+ Inductions.push_back(&PHI);
+ else {
+ // PHIs in inner loops need to be part of a reduction in the outer loop,
+ // discovered when checking the PHIs of the outer loop earlier.
+ if (!InnerLoop) {
+ if (!OuterInnerReductions.count(&PHI)) {
+ LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
+ "across the outer loop.\n");
+ return false;
+ }
+ } else {
+ assert(PHI.getNumIncomingValues() == 2 &&
+ "Phis in loop header should have exactly 2 incoming values");
+ // Check if we have a PHI node in the outer loop that has a reduction
+ // result from the inner loop as an incoming value.
+ Value *V = followLCSSA(PHI.getIncomingValueForBlock(L->getLoopLatch()));
+ PHINode *InnerRedPhi = findInnerReductionPhi(InnerLoop, V);
+ if (!InnerRedPhi ||
!llvm::is_contained(InnerRedPhi->incoming_values(), &PHI)) {
- LLVM_DEBUG(
- dbgs()
- << "Failed to recognize PHI as an induction or reduction.\n");
- return false;
- }
- OuterInnerReductions.insert(&PHI);
- OuterInnerReductions.insert(InnerRedPhi);
- }
- }
- }
- return true;
-}
-
-// This function indicates the current limitations in the transform as a result
-// of which we do not proceed.
-bool LoopInterchangeLegality::currentLimitations() {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
-
- // transform currently expects the loop latches to also be the exiting
- // blocks.
- if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
- OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
- !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
- !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
- LLVM_DEBUG(
- dbgs() << "Loops where the latch is not the exiting block are not"
- << " supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Loops where the latch is not the exiting block cannot be"
- " interchange currently.";
- });
- return true;
- }
-
- PHINode *InnerInductionVar;
- SmallVector<PHINode *, 8> Inductions;
- if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
- LLVM_DEBUG(
- dbgs() << "Only outer loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with induction or reduction PHI nodes can be"
- " interchanged currently.";
- });
- return true;
- }
-
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
- << "supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Only outer loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
-
- Inductions.clear();
- if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
- LLVM_DEBUG(
- dbgs() << "Only inner loops with induction or reduction PHI nodes "
- << "are supported currently.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with induction or reduction PHI nodes can be"
- " interchange currently.";
- });
- return true;
- }
-
- // TODO: Currently we handle only loops with 1 induction variable.
- if (Inductions.size() != 1) {
- LLVM_DEBUG(
- dbgs() << "We currently only support loops with 1 induction variable."
- << "Failed to interchange due to current limitation\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Only inner loops with 1 induction variable can be "
- "interchanged currently.";
- });
- return true;
- }
- InnerInductionVar = Inductions.pop_back_val();
-
- // TODO: Triangular loops are not handled for now.
- if (!isLoopStructureUnderstood(InnerInductionVar)) {
- LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Inner loop structure not understood currently.";
- });
- return true;
- }
-
- // TODO: Current limitation: Since we split the inner loop latch at the point
- // were induction variable is incremented (induction.next); We cannot have
- // more than 1 user of induction.next since it would result in broken code
- // after split.
- // e.g.
- // for(i=0;i<N;i++) {
- // for(j = 0;j<M;j++) {
- // A[j+1][i+2] = A[j][i]+k;
- // }
- // }
- Instruction *InnerIndexVarInc = nullptr;
- if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
- else
- InnerIndexVarInc =
- dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
-
- if (!InnerIndexVarInc) {
- LLVM_DEBUG(
- dbgs() << "Did not find an instruction to increment the induction "
- << "variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "The inner loop does not increment the induction variable.";
- });
- return true;
- }
-
- // Since we split the inner loop latch on this induction variable. Make sure
- // we do not have any instruction between the induction variable and branch
- // instruction.
-
- bool FoundInduction = false;
- for (const Instruction &I :
- llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
- if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
- isa<ZExtInst>(I))
- continue;
-
- // We found an instruction. If this is not induction variable then it is not
- // safe to split this loop latch.
- if (!I.isIdenticalTo(InnerIndexVarInc)) {
- LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
- << "variable increment and branch.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "UnsupportedInsBetweenInduction",
- InnerLoop->getStartLoc(), InnerLoop->getHeader())
- << "Found unsupported instruction between induction variable "
- "increment and branch.";
- });
- return true;
- }
-
- FoundInduction = true;
- break;
- }
- // The loop latch ended and we didn't find the induction variable return as
- // current limitation.
- if (!FoundInduction) {
- LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Did not find the induction variable.";
- });
- return true;
- }
- return false;
-}
-
-// We currently only support LCSSA PHI nodes in the inner loop exit, if their
-// users are either reduction PHIs or PHIs outside the outer loop (which means
-// the we are only interested in the final value after the loop).
-static bool
-areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
- SmallPtrSetImpl<PHINode *> &Reductions) {
- BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
- for (PHINode &PHI : InnerExit->phis()) {
- // Reduction lcssa phi will have only 1 incoming block that from loop latch.
- if (PHI.getNumIncomingValues() > 1)
- return false;
- if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
- PHINode *PN = dyn_cast<PHINode>(U);
- return !PN ||
- (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
- })) {
- return false;
- }
- }
- return true;
-}
-
-// We currently support LCSSA PHI nodes in the outer loop exit, if their
-// incoming values do not come from the outer loop latch or if the
-// outer loop latch has a single predecessor. In that case, the value will
-// be available if both the inner and outer loop conditions are true, which
-// will still be true after interchanging. If we have multiple predecessor,
-// that may not be the case, e.g. because the outer loop latch may be executed
-// if the inner loop is not executed.
-static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
- BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
- for (PHINode &PHI : LoopNestExit->phis()) {
- // FIXME: We currently are not able to detect floating point reductions
- // and have to use floating point PHIs as a proxy to prevent
- // interchanging in the presence of floating point reductions.
- if (PHI.getType()->isFloatingPointTy())
- return false;
- for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
- Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
- if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
- continue;
-
- // The incoming value is defined in the outer loop latch. Currently we
- // only support that in case the outer loop latch has a single predecessor.
- // This guarantees that the outer loop latch is executed if and only if
- // the inner loop is executed (because tightlyNested() guarantees that the
- // outer loop header only branches to the inner loop or the outer loop
- // latch).
- // FIXME: We could weaken this logic and allow multiple predecessors,
- // if the values are produced outside the loop latch. We would need
- // additional logic to update the PHI nodes in the exit block as
- // well.
- if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
- return false;
- }
- }
- return true;
-}
-
-bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
- LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
- << " and OuterLoopId = " << OuterLoopId
- << " due to dependence\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops due to dependences.";
- });
- return false;
- }
- // Check if outer and inner loop contain legal instructions only.
- for (auto *BB : OuterLoop->blocks())
- for (Instruction &I : BB->instructionsWithoutDebug())
- if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- // readnone functions do not prevent interchanging.
- if (CI->doesNotReadMemory())
- continue;
- LLVM_DEBUG(
- dbgs() << "Loops with call instructions cannot be interchanged "
- << "safely.");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
- CI->getDebugLoc(),
- CI->getParent())
- << "Cannot interchange loops due to call instruction.";
- });
-
- return false;
- }
-
- // TODO: The loops could not be interchanged due to current limitations in the
- // transform module.
- if (currentLimitations()) {
- LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
- return false;
- }
-
- // Check if the loops are tightly nested.
- if (!tightlyNested(OuterLoop, InnerLoop)) {
- LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Cannot interchange loops because they are not tightly "
- "nested.";
- });
- return false;
- }
-
- if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
- OuterInnerReductions)) {
- LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Found unsupported PHI node in loop exit.";
- });
- return false;
- }
-
- if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
- LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
- OuterLoop->getStartLoc(),
- OuterLoop->getHeader())
- << "Found unsupported PHI node in loop exit.";
- });
- return false;
- }
-
- return true;
-}
-
-int LoopInterchangeProfitability::getInstrOrderCost() {
- unsigned GoodOrder, BadOrder;
- BadOrder = GoodOrder = 0;
- for (BasicBlock *BB : InnerLoop->blocks()) {
- for (Instruction &Ins : *BB) {
- if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
- unsigned NumOp = GEP->getNumOperands();
- bool FoundInnerInduction = false;
- bool FoundOuterInduction = false;
- for (unsigned i = 0; i < NumOp; ++i) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Failed to recognize PHI as an induction or reduction.\n");
+ return false;
+ }
+ OuterInnerReductions.insert(&PHI);
+ OuterInnerReductions.insert(InnerRedPhi);
+ }
+ }
+ }
+ return true;
+}
+
+// This function indicates the current limitations in the transform as a result
+// of which we do not proceed.
+bool LoopInterchangeLegality::currentLimitations() {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+
+ // The transform currently expects the loop latches to also be the exiting
+ // blocks.
+ if (InnerLoop->getExitingBlock() != InnerLoopLatch ||
+ OuterLoop->getExitingBlock() != OuterLoop->getLoopLatch() ||
+ !isa<BranchInst>(InnerLoopLatch->getTerminator()) ||
+ !isa<BranchInst>(OuterLoop->getLoopLatch()->getTerminator())) {
+ LLVM_DEBUG(
+ dbgs() << "Loops where the latch is not the exiting block are not"
+ << " supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ExitingNotLatch",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Loops where the latch is not the exiting block cannot be"
+ " interchange currently.";
+ });
+ return true;
+ }
+
+ PHINode *InnerInductionVar;
+ SmallVector<PHINode *, 8> Inductions;
+ if (!findInductionAndReductions(OuterLoop, Inductions, InnerLoop)) {
+ LLVM_DEBUG(
+ dbgs() << "Only outer loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with induction or reduction PHI nodes can be"
+ " interchanged currently.";
+ });
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ LLVM_DEBUG(dbgs() << "Loops with more than 1 induction variables are not "
+ << "supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiIndutionOuter",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Only outer loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+
+ Inductions.clear();
+ if (!findInductionAndReductions(InnerLoop, Inductions, nullptr)) {
+ LLVM_DEBUG(
+ dbgs() << "Only inner loops with induction or reduction PHI nodes "
+ << "are supported currently.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedPHIInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with induction or reduction PHI nodes can be"
+                " interchanged currently.";
+ });
+ return true;
+ }
+
+ // TODO: Currently we handle only loops with 1 induction variable.
+ if (Inductions.size() != 1) {
+ LLVM_DEBUG(
+        dbgs() << "We currently only support loops with 1 induction variable. "
+               << "Failed to interchange due to current limitation.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "MultiInductionInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Only inner loops with 1 induction variable can be "
+ "interchanged currently.";
+ });
+ return true;
+ }
+ InnerInductionVar = Inductions.pop_back_val();
+
+ // TODO: Triangular loops are not handled for now.
+ if (!isLoopStructureUnderstood(InnerInductionVar)) {
+ LLVM_DEBUG(dbgs() << "Loop structure not understood by pass\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedStructureInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Inner loop structure not understood currently.";
+ });
+ return true;
+ }
+
+  // TODO: Current limitation: Since we split the inner loop latch at the point
+  // where the induction variable is incremented (induction.next), we cannot
+  // have more than 1 user of induction.next, since that would result in broken
+  // code after the split.
+ // e.g.
+ // for(i=0;i<N;i++) {
+ // for(j = 0;j<M;j++) {
+ // A[j+1][i+2] = A[j][i]+k;
+ // }
+ // }
+ Instruction *InnerIndexVarInc = nullptr;
+ if (InnerInductionVar->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(1));
+ else
+ InnerIndexVarInc =
+ dyn_cast<Instruction>(InnerInductionVar->getIncomingValue(0));
+
+ if (!InnerIndexVarInc) {
+ LLVM_DEBUG(
+ dbgs() << "Did not find an instruction to increment the induction "
+ << "variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIncrementInInner",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "The inner loop does not increment the induction variable.";
+ });
+ return true;
+ }
+
+  // Since we split the inner loop latch on this induction variable, make sure
+  // we do not have any instruction between the induction variable and the
+  // branch instruction.
+
+ bool FoundInduction = false;
+ for (const Instruction &I :
+ llvm::reverse(InnerLoopLatch->instructionsWithoutDebug())) {
+ if (isa<BranchInst>(I) || isa<CmpInst>(I) || isa<TruncInst>(I) ||
+ isa<ZExtInst>(I))
+ continue;
+
+    // We found an instruction. If this is not the induction variable then it
+    // is not safe to split this loop latch.
+ if (!I.isIdenticalTo(InnerIndexVarInc)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported instructions between induction "
+ << "variable increment and branch.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "UnsupportedInsBetweenInduction",
+ InnerLoop->getStartLoc(), InnerLoop->getHeader())
+ << "Found unsupported instruction between induction variable "
+ "increment and branch.";
+ });
+ return true;
+ }
+
+ FoundInduction = true;
+ break;
+ }
+  // The loop latch ended and we didn't find the induction variable; return as
+  // a current limitation.
+ if (!FoundInduction) {
+ LLVM_DEBUG(dbgs() << "Did not find the induction variable.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NoIndutionVariable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Did not find the induction variable.";
+ });
+ return true;
+ }
+ return false;
+}
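+
+// Illustrative example (assuming rotated loops whose latches are also the
+// exiting blocks): a nest such as
+//   for (int i = 0; i < N; ++i)     // single induction variable
+//     for (int j = 0; j < M; ++j)   // single induction variable
+//       A[j][i] = A[j][i] + k;
+// keeps only the compare between the j increment and the latch branch, so
+// none of the limitations above apply.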
+
+// We currently only support LCSSA PHI nodes in the inner loop exit, if their
+// users are either reduction PHIs or PHIs outside the outer loop (which means
+// we are only interested in the final value after the loop).
+static bool
+areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
+ SmallPtrSetImpl<PHINode *> &Reductions) {
+ BasicBlock *InnerExit = OuterL->getUniqueExitBlock();
+ for (PHINode &PHI : InnerExit->phis()) {
+    // A reduction LCSSA PHI will have only 1 incoming block, the loop latch.
+ if (PHI.getNumIncomingValues() > 1)
+ return false;
+ if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
+ PHINode *PN = dyn_cast<PHINode>(U);
+ return !PN ||
+ (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
+ })) {
+ return false;
+ }
+ }
+ return true;
+}
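+
+// For illustration (an assumed source pattern, not an extra check): a
+// reduction spanning both loops, e.g.
+//   int Sum = 0;
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       Sum += A[j][i];
+// yields an LCSSA PHI for Sum in the inner loop's exit block whose users are
+// the reduction PHI in the outer loop header and, possibly, a PHI outside the
+// loop nest; both kinds of user are accepted above.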
+
+// We currently support LCSSA PHI nodes in the outer loop exit, if their
+// incoming values do not come from the outer loop latch or if the
+// outer loop latch has a single predecessor. In that case, the value will
+// be available if both the inner and outer loop conditions are true, which
+// will still be true after interchanging. If we have multiple predecessors,
+// that may not be the case, e.g. because the outer loop latch may be executed
+// even if the inner loop is not executed.
+static bool areOuterLoopExitPHIsSupported(Loop *OuterLoop, Loop *InnerLoop) {
+ BasicBlock *LoopNestExit = OuterLoop->getUniqueExitBlock();
+ for (PHINode &PHI : LoopNestExit->phis()) {
+ // FIXME: We currently are not able to detect floating point reductions
+ // and have to use floating point PHIs as a proxy to prevent
+ // interchanging in the presence of floating point reductions.
+ if (PHI.getType()->isFloatingPointTy())
+ return false;
+ for (unsigned i = 0; i < PHI.getNumIncomingValues(); i++) {
+ Instruction *IncomingI = dyn_cast<Instruction>(PHI.getIncomingValue(i));
+ if (!IncomingI || IncomingI->getParent() != OuterLoop->getLoopLatch())
+ continue;
+
+ // The incoming value is defined in the outer loop latch. Currently we
+ // only support that in case the outer loop latch has a single predecessor.
+ // This guarantees that the outer loop latch is executed if and only if
+ // the inner loop is executed (because tightlyNested() guarantees that the
+ // outer loop header only branches to the inner loop or the outer loop
+ // latch).
+ // FIXME: We could weaken this logic and allow multiple predecessors,
+ // if the values are produced outside the loop latch. We would need
+ // additional logic to update the PHI nodes in the exit block as
+ // well.
+ if (OuterLoop->getLoopLatch()->getUniquePredecessor() == nullptr)
+ return false;
+ }
+ }
+ return true;
+}
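+
+// For illustration (an assumed source pattern): a floating point accumulation
+// such as
+//   double S = 0.0;
+//   for (int i = 0; i < N; ++i)
+//     for (int j = 0; j < M; ++j)
+//       S += B[j][i];
+// leaves a float-typed LCSSA PHI in the loop nest exit, which the FIXME above
+// conservatively rejects.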
+
+bool LoopInterchangeLegality::canInterchangeLoops(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ if (!isLegalToInterChangeLoops(DepMatrix, InnerLoopId, OuterLoopId)) {
+ LLVM_DEBUG(dbgs() << "Failed interchange InnerLoopId = " << InnerLoopId
+ << " and OuterLoopId = " << OuterLoopId
+ << " due to dependence\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "Dependence",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops due to dependences.";
+ });
+ return false;
+ }
+ // Check if outer and inner loop contain legal instructions only.
+ for (auto *BB : OuterLoop->blocks())
+ for (Instruction &I : BB->instructionsWithoutDebug())
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ // readnone functions do not prevent interchanging.
+ if (CI->doesNotReadMemory())
+ continue;
+ LLVM_DEBUG(
+ dbgs() << "Loops with call instructions cannot be interchanged "
+                   << "safely.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CallInst",
+ CI->getDebugLoc(),
+ CI->getParent())
+ << "Cannot interchange loops due to call instruction.";
+ });
+
+ return false;
+ }
+
+  // TODO: Lift these restrictions. For now, bail out if the loops cannot be
+  // interchanged due to current limitations in the transform.
+ if (currentLimitations()) {
+ LLVM_DEBUG(dbgs() << "Not legal because of current transform limitation\n");
+ return false;
+ }
+
+ // Check if the loops are tightly nested.
+ if (!tightlyNested(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Loops not tightly nested\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotTightlyNested",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Cannot interchange loops because they are not tightly "
+ "nested.";
+ });
+ return false;
+ }
+
+ if (!areInnerLoopExitPHIsSupported(OuterLoop, InnerLoop,
+ OuterInnerReductions)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in inner loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
+ if (!areOuterLoopExitPHIsSupported(OuterLoop, InnerLoop)) {
+ LLVM_DEBUG(dbgs() << "Found unsupported PHI nodes in outer loop exit.\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnsupportedExitPHI",
+ OuterLoop->getStartLoc(),
+ OuterLoop->getHeader())
+ << "Found unsupported PHI node in loop exit.";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+int LoopInterchangeProfitability::getInstrOrderCost() {
+ unsigned GoodOrder, BadOrder;
+ BadOrder = GoodOrder = 0;
+ for (BasicBlock *BB : InnerLoop->blocks()) {
+ for (Instruction &Ins : *BB) {
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&Ins)) {
+ unsigned NumOp = GEP->getNumOperands();
+ bool FoundInnerInduction = false;
+ bool FoundOuterInduction = false;
+ for (unsigned i = 0; i < NumOp; ++i) {
// Skip operands that are not SCEV-able.
if (!SE->isSCEVable(GEP->getOperand(i)->getType()))
continue;
- const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
- if (!AR)
- continue;
-
- // If we find the inner induction after an outer induction e.g.
- // for(int i=0;i<N;i++)
- // for(int j=0;j<N;j++)
- // A[i][j] = A[i-1][j-1]+k;
- // then it is a good order.
- if (AR->getLoop() == InnerLoop) {
- // We found an InnerLoop induction after OuterLoop induction. It is
- // a good order.
- FoundInnerInduction = true;
- if (FoundOuterInduction) {
- GoodOrder++;
- break;
- }
- }
- // If we find the outer induction after an inner induction e.g.
- // for(int i=0;i<N;i++)
- // for(int j=0;j<N;j++)
- // A[j][i] = A[j-1][i-1]+k;
- // then it is a bad order.
- if (AR->getLoop() == OuterLoop) {
- // We found an OuterLoop induction after InnerLoop induction. It is
- // a bad order.
- FoundOuterInduction = true;
- if (FoundInnerInduction) {
- BadOrder++;
- break;
- }
- }
- }
- }
- }
- }
- return GoodOrder - BadOrder;
-}
-
-static bool isProfitableForVectorization(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- // TODO: Improve this heuristic to catch more cases.
- // If the inner loop is loop independent or doesn't carry any dependency it is
- // profitable to move this to outer position.
- for (auto &Row : DepMatrix) {
- if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
- return false;
- // TODO: We need to improve this heuristic.
- if (Row[OuterLoopId] != '=')
- return false;
- }
- // If outer loop has dependence and inner loop is loop independent then it is
- // profitable to interchange to enable parallelism.
- // If there are no dependences, interchanging will not improve anything.
- return !DepMatrix.empty();
-}
-
-bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
- unsigned OuterLoopId,
- CharMatrix &DepMatrix) {
- // TODO: Add better profitability checks.
- // e.g
- // 1) Construct dependency matrix and move the one with no loop carried dep
- // inside to enable vectorization.
-
- // This is rough cost estimation algorithm. It counts the good and bad order
- // of induction variables in the instruction and allows reordering if number
- // of bad orders is more than good.
- int Cost = getInstrOrderCost();
- LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
- if (Cost < -LoopInterchangeCostThreshold)
- return true;
-
- // It is not profitable as per current cache profitability model. But check if
- // we can move this loop outside to improve parallelism.
- if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
- return true;
-
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
- InnerLoop->getStartLoc(),
- InnerLoop->getHeader())
- << "Interchanging loops is too costly (cost="
- << ore::NV("Cost", Cost) << ", threshold="
- << ore::NV("Threshold", LoopInterchangeCostThreshold)
- << ") and it does not improve parallelism.";
- });
- return false;
-}
-
-void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
- Loop *InnerLoop) {
- for (Loop *L : *OuterLoop)
- if (L == InnerLoop) {
- OuterLoop->removeChildLoop(L);
- return;
- }
- llvm_unreachable("Couldn't find loop");
-}
-
-/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
-/// new inner and outer loop after interchanging: NewInner is the original
-/// outer loop and NewOuter is the original inner loop.
-///
-/// Before interchanging, we have the following structure
-/// Outer preheader
-// Outer header
-// Inner preheader
-// Inner header
-// Inner body
-// Inner latch
-// outer bbs
-// Outer latch
-//
-// After interchanging:
-// Inner preheader
-// Inner header
-// Outer preheader
-// Outer header
-// Inner body
-// outer bbs
-// Outer latch
-// Inner latch
-void LoopInterchangeTransform::restructureLoops(
- Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
- BasicBlock *OrigOuterPreHeader) {
- Loop *OuterLoopParent = OuterLoop->getParentLoop();
- // The original inner loop preheader moves from the new inner loop to
- // the parent loop, if there is one.
- NewInner->removeBlockFromLoop(OrigInnerPreHeader);
- LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
-
- // Switch the loop levels.
- if (OuterLoopParent) {
- // Remove the loop from its parent loop.
- removeChildLoop(OuterLoopParent, NewInner);
- removeChildLoop(NewInner, NewOuter);
- OuterLoopParent->addChildLoop(NewOuter);
- } else {
- removeChildLoop(NewInner, NewOuter);
- LI->changeTopLevelLoop(NewInner, NewOuter);
- }
+ const SCEV *OperandVal = SE->getSCEV(GEP->getOperand(i));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(OperandVal);
+ if (!AR)
+ continue;
+
+ // If we find the inner induction after an outer induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[i][j] = A[i-1][j-1]+k;
+ // then it is a good order.
+ if (AR->getLoop() == InnerLoop) {
+ // We found an InnerLoop induction after OuterLoop induction. It is
+ // a good order.
+ FoundInnerInduction = true;
+ if (FoundOuterInduction) {
+ GoodOrder++;
+ break;
+ }
+ }
+ // If we find the outer induction after an inner induction e.g.
+ // for(int i=0;i<N;i++)
+ // for(int j=0;j<N;j++)
+ // A[j][i] = A[j-1][i-1]+k;
+ // then it is a bad order.
+ if (AR->getLoop() == OuterLoop) {
+ // We found an OuterLoop induction after InnerLoop induction. It is
+ // a bad order.
+ FoundOuterInduction = true;
+ if (FoundInnerInduction) {
+ BadOrder++;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ return GoodOrder - BadOrder;
+}
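+
+// Worked example (illustrative): with i as the outer and j as the inner
+// induction variable, a body containing only
+//   A[j][i] = A[j-1][i-1] + k;
+// has two GEPs (the load's and the store's), and each visits the inner-loop
+// AddRec before the outer-loop one, so BadOrder becomes 2, GoodOrder stays 0,
+// and the function returns -2.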
+
+static bool isProfitableForVectorization(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+  // TODO: Improve this heuristic to catch more cases.
+  // If the inner loop is loop independent or doesn't carry any dependence, it
+  // is profitable to move it to the outer position.
+ for (auto &Row : DepMatrix) {
+ if (Row[InnerLoopId] != 'S' && Row[InnerLoopId] != 'I')
+ return false;
+ // TODO: We need to improve this heuristic.
+ if (Row[OuterLoopId] != '=')
+ return false;
+ }
+  // If the outer loop carries a dependence and the inner loop is loop
+  // independent, then it is profitable to interchange to enable parallelism.
+  // If there are no dependences, interchanging will not improve anything.
+ return !DepMatrix.empty();
+}
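+
+// For illustration: each dependence row has one direction entry per loop. A
+// row whose entry at OuterLoopId is '=' and whose entry at InnerLoopId is 'I'
+// or 'S' passes the checks above, so a non-empty matrix made up entirely of
+// such rows makes moving the inner loop outwards look profitable.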
+
+bool LoopInterchangeProfitability::isProfitable(unsigned InnerLoopId,
+ unsigned OuterLoopId,
+ CharMatrix &DepMatrix) {
+ // TODO: Add better profitability checks.
+  // e.g.
+  // 1) Construct the dependency matrix and move the loop with no loop-carried
+  // dependences inside to enable vectorization.
+
+  // This is a rough cost estimation algorithm. It counts the good and bad
+  // orders of induction variables in the instructions and allows reordering if
+  // the number of bad orders exceeds the number of good ones.
+ int Cost = getInstrOrderCost();
+ LLVM_DEBUG(dbgs() << "Cost = " << Cost << "\n");
+ if (Cost < -LoopInterchangeCostThreshold)
+ return true;
+
+  // It is not profitable as per the current cache profitability model, but
+  // check if we can move this loop outside to improve parallelism.
+ if (isProfitableForVectorization(InnerLoopId, OuterLoopId, DepMatrix))
+ return true;
+
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InterchangeNotProfitable",
+ InnerLoop->getStartLoc(),
+ InnerLoop->getHeader())
+ << "Interchanging loops is too costly (cost="
+ << ore::NV("Cost", Cost) << ", threshold="
+ << ore::NV("Threshold", LoopInterchangeCostThreshold)
+ << ") and it does not improve parallelism.";
+ });
+ return false;
+}
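+
+// Worked example (assuming the default cost threshold of 0): with GoodOrder = 0
+// and BadOrder = 2, getInstrOrderCost() returns -2, which is below
+// -LoopInterchangeCostThreshold, so the interchange is reported profitable
+// without consulting the vectorization heuristic.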
+
+void LoopInterchangeTransform::removeChildLoop(Loop *OuterLoop,
+ Loop *InnerLoop) {
+ for (Loop *L : *OuterLoop)
+ if (L == InnerLoop) {
+ OuterLoop->removeChildLoop(L);
+ return;
+ }
+ llvm_unreachable("Couldn't find loop");
+}
+
+/// Update LoopInfo, after interchanging. NewInner and NewOuter refer to the
+/// new inner and outer loop after interchanging: NewInner is the original
+/// outer loop and NewOuter is the original inner loop.
+///
+/// Before interchanging, we have the following structure
+/// Outer preheader
+// Outer header
+// Inner preheader
+// Inner header
+// Inner body
+// Inner latch
+// outer bbs
+// Outer latch
+//
+// After interchanging:
+// Inner preheader
+// Inner header
+// Outer preheader
+// Outer header
+// Inner body
+// outer bbs
+// Outer latch
+// Inner latch
+void LoopInterchangeTransform::restructureLoops(
+ Loop *NewInner, Loop *NewOuter, BasicBlock *OrigInnerPreHeader,
+ BasicBlock *OrigOuterPreHeader) {
+ Loop *OuterLoopParent = OuterLoop->getParentLoop();
+ // The original inner loop preheader moves from the new inner loop to
+ // the parent loop, if there is one.
+ NewInner->removeBlockFromLoop(OrigInnerPreHeader);
+ LI->changeLoopFor(OrigInnerPreHeader, OuterLoopParent);
+
+ // Switch the loop levels.
+ if (OuterLoopParent) {
+ // Remove the loop from its parent loop.
+ removeChildLoop(OuterLoopParent, NewInner);
+ removeChildLoop(NewInner, NewOuter);
+ OuterLoopParent->addChildLoop(NewOuter);
+ } else {
+ removeChildLoop(NewInner, NewOuter);
+ LI->changeTopLevelLoop(NewInner, NewOuter);
+ }
while (!NewOuter->isInnermost())
- NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
- NewOuter->addChildLoop(NewInner);
-
- // BBs from the original inner loop.
- SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
-
- // Add BBs from the original outer loop to the original inner loop (excluding
- // BBs already in inner loop)
- for (BasicBlock *BB : NewInner->blocks())
- if (LI->getLoopFor(BB) == NewInner)
- NewOuter->addBlockEntry(BB);
-
- // Now remove inner loop header and latch from the new inner loop and move
- // other BBs (the loop body) to the new inner loop.
- BasicBlock *OuterHeader = NewOuter->getHeader();
- BasicBlock *OuterLatch = NewOuter->getLoopLatch();
- for (BasicBlock *BB : OrigInnerBBs) {
- // Nothing will change for BBs in child loops.
- if (LI->getLoopFor(BB) != NewOuter)
- continue;
- // Remove the new outer loop header and latch from the new inner loop.
- if (BB == OuterHeader || BB == OuterLatch)
- NewInner->removeBlockFromLoop(BB);
- else
- LI->changeLoopFor(BB, NewInner);
- }
-
- // The preheader of the original outer loop becomes part of the new
- // outer loop.
- NewOuter->addBlockEntry(OrigOuterPreHeader);
- LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
-
- // Tell SE that we move the loops around.
- SE->forgetLoop(NewOuter);
- SE->forgetLoop(NewInner);
-}
-
-bool LoopInterchangeTransform::transform() {
- bool Transformed = false;
- Instruction *InnerIndexVar;
-
- if (InnerLoop->getSubLoops().empty()) {
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
- PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
- if (!InductionPHI) {
- LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
- return false;
- }
-
- if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
- else
- InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
-
- // Ensure that InductionPHI is the first Phi node.
- if (&InductionPHI->getParent()->front() != InductionPHI)
- InductionPHI->moveBefore(&InductionPHI->getParent()->front());
-
- // Create a new latch block for the inner loop. We split at the
- // current latch's terminator and then move the condition and all
- // operands that are not either loop-invariant or the induction PHI into the
- // new latch block.
- BasicBlock *NewLatch =
- SplitBlock(InnerLoop->getLoopLatch(),
- InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
-
- SmallSetVector<Instruction *, 4> WorkList;
- unsigned i = 0;
- auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
- for (; i < WorkList.size(); i++) {
- // Duplicate instruction and move it the new latch. Update uses that
- // have been moved.
- Instruction *NewI = WorkList[i]->clone();
- NewI->insertBefore(NewLatch->getFirstNonPHI());
- assert(!NewI->mayHaveSideEffects() &&
- "Moving instructions with side-effects may change behavior of "
- "the loop nest!");
- for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end();
- UI != UE;) {
- Use &U = *UI++;
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (!InnerLoop->contains(UserI->getParent()) ||
- UserI->getParent() == NewLatch || UserI == InductionPHI)
- U.set(NewI);
- }
- // Add operands of moved instruction to the worklist, except if they are
- // outside the inner loop or are the induction PHI.
- for (Value *Op : WorkList[i]->operands()) {
- Instruction *OpI = dyn_cast<Instruction>(Op);
- if (!OpI ||
- this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
- OpI == InductionPHI)
- continue;
- WorkList.insert(OpI);
- }
- }
- };
-
- // FIXME: Should we interchange when we have a constant condition?
- Instruction *CondI = dyn_cast<Instruction>(
- cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
- ->getCondition());
- if (CondI)
- WorkList.insert(CondI);
- MoveInstructions();
- WorkList.insert(cast<Instruction>(InnerIndexVar));
- MoveInstructions();
-
- // Splits the inner loops phi nodes out into a separate basic block.
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
- LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
- }
-
+ NewInner->addChildLoop(NewOuter->removeChildLoop(NewOuter->begin()));
+ NewOuter->addChildLoop(NewInner);
+
+ // BBs from the original inner loop.
+ SmallVector<BasicBlock *, 8> OrigInnerBBs(NewOuter->blocks());
+
+ // Add BBs from the original outer loop to the original inner loop (excluding
+ // BBs already in inner loop)
+ for (BasicBlock *BB : NewInner->blocks())
+ if (LI->getLoopFor(BB) == NewInner)
+ NewOuter->addBlockEntry(BB);
+
+ // Now remove inner loop header and latch from the new inner loop and move
+ // other BBs (the loop body) to the new inner loop.
+ BasicBlock *OuterHeader = NewOuter->getHeader();
+ BasicBlock *OuterLatch = NewOuter->getLoopLatch();
+ for (BasicBlock *BB : OrigInnerBBs) {
+ // Nothing will change for BBs in child loops.
+ if (LI->getLoopFor(BB) != NewOuter)
+ continue;
+ // Remove the new outer loop header and latch from the new inner loop.
+ if (BB == OuterHeader || BB == OuterLatch)
+ NewInner->removeBlockFromLoop(BB);
+ else
+ LI->changeLoopFor(BB, NewInner);
+ }
+
+ // The preheader of the original outer loop becomes part of the new
+ // outer loop.
+ NewOuter->addBlockEntry(OrigOuterPreHeader);
+ LI->changeLoopFor(OrigOuterPreHeader, NewOuter);
+
+  // Tell SE that we have moved the loops around.
+ SE->forgetLoop(NewOuter);
+ SE->forgetLoop(NewInner);
+}
+
+bool LoopInterchangeTransform::transform() {
+ bool Transformed = false;
+ Instruction *InnerIndexVar;
+
+ if (InnerLoop->getSubLoops().empty()) {
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ LLVM_DEBUG(dbgs() << "Splitting the inner loop latch\n");
+ PHINode *InductionPHI = getInductionVariable(InnerLoop, SE);
+ if (!InductionPHI) {
+ LLVM_DEBUG(dbgs() << "Failed to find the point to split loop latch \n");
+ return false;
+ }
+
+ if (InductionPHI->getIncomingBlock(0) == InnerLoopPreHeader)
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(1));
+ else
+ InnerIndexVar = dyn_cast<Instruction>(InductionPHI->getIncomingValue(0));
+
+ // Ensure that InductionPHI is the first Phi node.
+ if (&InductionPHI->getParent()->front() != InductionPHI)
+ InductionPHI->moveBefore(&InductionPHI->getParent()->front());
+
+    // Create a new latch block for the inner loop. We split at the
+    // current latch's terminator and then move the condition and all
+    // operands that are neither loop-invariant nor the induction PHI into
+    // the new latch block.
+ BasicBlock *NewLatch =
+ SplitBlock(InnerLoop->getLoopLatch(),
+ InnerLoop->getLoopLatch()->getTerminator(), DT, LI);
+
+ SmallSetVector<Instruction *, 4> WorkList;
+ unsigned i = 0;
+ auto MoveInstructions = [&i, &WorkList, this, InductionPHI, NewLatch]() {
+ for (; i < WorkList.size(); i++) {
+        // Duplicate the instruction and move it to the new latch. Update uses
+        // that have been moved.
+ Instruction *NewI = WorkList[i]->clone();
+ NewI->insertBefore(NewLatch->getFirstNonPHI());
+ assert(!NewI->mayHaveSideEffects() &&
+ "Moving instructions with side-effects may change behavior of "
+ "the loop nest!");
+ for (auto UI = WorkList[i]->use_begin(), UE = WorkList[i]->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (!InnerLoop->contains(UserI->getParent()) ||
+ UserI->getParent() == NewLatch || UserI == InductionPHI)
+ U.set(NewI);
+ }
+        // Add operands of the moved instruction to the worklist, unless they
+        // are outside the inner loop or are the induction PHI.
+ for (Value *Op : WorkList[i]->operands()) {
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI ||
+ this->LI->getLoopFor(OpI->getParent()) != this->InnerLoop ||
+ OpI == InductionPHI)
+ continue;
+ WorkList.insert(OpI);
+ }
+ }
+ };
+
+ // FIXME: Should we interchange when we have a constant condition?
+ Instruction *CondI = dyn_cast<Instruction>(
+ cast<BranchInst>(InnerLoop->getLoopLatch()->getTerminator())
+ ->getCondition());
+ if (CondI)
+ WorkList.insert(CondI);
+ MoveInstructions();
+ WorkList.insert(cast<Instruction>(InnerIndexVar));
+ MoveInstructions();
+
+    // Split the inner loop's PHI nodes out into a separate basic block.
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ SplitBlock(InnerLoopHeader, InnerLoopHeader->getFirstNonPHI(), DT, LI);
+ LLVM_DEBUG(dbgs() << "splitting InnerLoopHeader done\n");
+ }
+
// Instructions in the original inner loop preheader may depend on values
// defined in the outer loop header. Move them there, because the original
// inner loop preheader will become the entry into the interchanged loop nest.
@@ -1321,295 +1321,295 @@ bool LoopInterchangeTransform::transform() {
I.moveBefore(OuterLoopHeader->getTerminator());
}
- Transformed |= adjustLoopLinks();
- if (!Transformed) {
- LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
- return false;
- }
-
- return true;
-}
-
-/// \brief Move all instructions except the terminator from FromBB right before
-/// InsertBefore
-static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
- auto &ToList = InsertBefore->getParent()->getInstList();
- auto &FromList = FromBB->getInstList();
-
- ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
- FromBB->getTerminator()->getIterator());
-}
-
-/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
-static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
- // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
- // from BB1 afterwards.
- auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
- SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
- for (Instruction *I : TempInstrs)
- I->removeFromParent();
-
- // Move instructions from BB2 to BB1.
- moveBBContents(BB2, BB1->getTerminator());
-
- // Move instructions from TempInstrs to BB2.
- for (Instruction *I : TempInstrs)
- I->insertBefore(BB2->getTerminator());
-}
-
-// Update BI to jump to NewBB instead of OldBB. Records updates to the
-// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
-// \p OldBB is exactly once in BI's successor list.
-static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
- BasicBlock *NewBB,
- std::vector<DominatorTree::UpdateType> &DTUpdates,
- bool MustUpdateOnce = true) {
- assert((!MustUpdateOnce ||
- llvm::count_if(successors(BI),
- [OldBB](BasicBlock *BB) {
- return BB == OldBB;
- }) == 1) && "BI must jump to OldBB exactly once.");
- bool Changed = false;
- for (Use &Op : BI->operands())
- if (Op == OldBB) {
- Op.set(NewBB);
- Changed = true;
- }
-
- if (Changed) {
- DTUpdates.push_back(
- {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
- DTUpdates.push_back(
- {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
- }
- assert(Changed && "Expected a successor to be updated");
-}
-
-// Move Lcssa PHIs to the right place.
-static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
- BasicBlock *InnerLatch, BasicBlock *OuterHeader,
- BasicBlock *OuterLatch, BasicBlock *OuterExit,
- Loop *InnerLoop, LoopInfo *LI) {
-
- // Deal with LCSSA PHI nodes in the exit block of the inner loop, that are
- // defined either in the header or latch. Those blocks will become header and
- // latch of the new outer loop, and the only possible users can PHI nodes
- // in the exit block of the loop nest or the outer loop header (reduction
- // PHIs, in that case, the incoming value must be defined in the inner loop
- // header). We can just substitute the user with the incoming value and remove
- // the PHI.
- for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
- assert(P.getNumIncomingValues() == 1 &&
- "Only loops with a single exit are supported!");
-
- // Incoming values are guaranteed be instructions currently.
- auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
- // Skip phis with incoming values from the inner loop body, excluding the
- // header and latch.
- if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
- continue;
-
- assert(all_of(P.users(),
- [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
- return (cast<PHINode>(U)->getParent() == OuterHeader &&
- IncI->getParent() == InnerHeader) ||
- cast<PHINode>(U)->getParent() == OuterExit;
- }) &&
- "Can only replace phis iff the uses are in the loop nest exit or "
- "the incoming value is defined in the inner header (it will "
- "dominate all loop blocks after interchanging)");
- P.replaceAllUsesWith(IncI);
- P.eraseFromParent();
- }
-
- SmallVector<PHINode *, 8> LcssaInnerExit;
- for (PHINode &P : InnerExit->phis())
- LcssaInnerExit.push_back(&P);
-
- SmallVector<PHINode *, 8> LcssaInnerLatch;
- for (PHINode &P : InnerLatch->phis())
- LcssaInnerLatch.push_back(&P);
-
- // Lcssa PHIs for values used outside the inner loop are in InnerExit.
- // If a PHI node has users outside of InnerExit, it has a use outside the
- // interchanged loop and we have to preserve it. We move these to
- // InnerLatch, which will become the new exit block for the innermost
- // loop after interchanging.
- for (PHINode *P : LcssaInnerExit)
- P->moveBefore(InnerLatch->getFirstNonPHI());
-
- // If the inner loop latch contains LCSSA PHIs, those come from a child loop
- // and we have to move them to the new inner latch.
- for (PHINode *P : LcssaInnerLatch)
- P->moveBefore(InnerExit->getFirstNonPHI());
-
- // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
- // incoming values defined in the outer loop, we have to add a new PHI
- // in the inner loop latch, which became the exit block of the outer loop,
- // after interchanging.
- if (OuterExit) {
- for (PHINode &P : OuterExit->phis()) {
- if (P.getNumIncomingValues() != 1)
- continue;
- // Skip Phis with incoming values defined in the inner loop. Those should
- // already have been updated.
- auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
- if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
- continue;
-
- PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
- NewPhi->setIncomingValue(0, P.getIncomingValue(0));
- NewPhi->setIncomingBlock(0, OuterLatch);
- NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
- P.setIncomingValue(0, NewPhi);
- }
- }
-
- // Now adjust the incoming blocks for the LCSSA PHIs.
- // For PHIs moved from Inner's exit block, we need to replace Inner's latch
- // with the new latch.
- InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
-}
-
-bool LoopInterchangeTransform::adjustLoopBranches() {
- LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
- std::vector<DominatorTree::UpdateType> DTUpdates;
-
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
-
- assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
- InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
- InnerLoopPreHeader && "Guaranteed by loop-simplify form");
- // Ensure that both preheaders do not contain PHI nodes and have single
- // predecessors. This allows us to move them easily. We use
- // InsertPreHeaderForLoop to create an 'extra' preheader, if the existing
- // preheaders do not satisfy those conditions.
- if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
- !OuterLoopPreHeader->getUniquePredecessor())
- OuterLoopPreHeader =
- InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
- if (InnerLoopPreHeader == OuterLoop->getHeader())
- InnerLoopPreHeader =
- InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
-
- // Adjust the loop preheader
- BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
- BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
- BasicBlock *InnerLoopLatchPredecessor =
- InnerLoopLatch->getUniquePredecessor();
- BasicBlock *InnerLoopLatchSuccessor;
- BasicBlock *OuterLoopLatchSuccessor;
-
- BranchInst *OuterLoopLatchBI =
- dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
- BranchInst *InnerLoopLatchBI =
- dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
- BranchInst *OuterLoopHeaderBI =
- dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
- BranchInst *InnerLoopHeaderBI =
- dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
-
- if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
- !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
- !InnerLoopHeaderBI)
- return false;
-
- BranchInst *InnerLoopLatchPredecessorBI =
- dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
- BranchInst *OuterLoopPredecessorBI =
- dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
-
- if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
- return false;
- BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
- if (!InnerLoopHeaderSuccessor)
- return false;
-
- // Adjust Loop Preheader and headers.
- // The branches in the outer loop predecessor and the outer loop header can
- // be unconditional branches or conditional branches with duplicates. Consider
- // this when updating the successors.
- updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
- InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
- // The outer loop header might or might not branch to the outer latch.
- // We are guaranteed to branch to the inner loop preheader.
+ Transformed |= adjustLoopLinks();
+ if (!Transformed) {
+ LLVM_DEBUG(dbgs() << "adjustLoopLinks failed\n");
+ return false;
+ }
+
+ return true;
+}
+
+/// \brief Move all instructions except the terminator from FromBB right before
+/// InsertBefore
+static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
+ auto &ToList = InsertBefore->getParent()->getInstList();
+ auto &FromList = FromBB->getInstList();
+
+ ToList.splice(InsertBefore->getIterator(), FromList, FromList.begin(),
+ FromBB->getTerminator()->getIterator());
+}
+
+/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
+static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
+ // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
+ // from BB1 afterwards.
+ auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
+ SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
+ for (Instruction *I : TempInstrs)
+ I->removeFromParent();
+
+ // Move instructions from BB2 to BB1.
+ moveBBContents(BB2, BB1->getTerminator());
+
+ // Move instructions from TempInstrs to BB2.
+ for (Instruction *I : TempInstrs)
+ I->insertBefore(BB2->getTerminator());
+}
+
+// Update BI to jump to NewBB instead of OldBB. Records updates to the
+// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
+// \p OldBB occurs exactly once in BI's successor list.
+static void updateSuccessor(BranchInst *BI, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ std::vector<DominatorTree::UpdateType> &DTUpdates,
+ bool MustUpdateOnce = true) {
+ assert((!MustUpdateOnce ||
+ llvm::count_if(successors(BI),
+ [OldBB](BasicBlock *BB) {
+ return BB == OldBB;
+ }) == 1) && "BI must jump to OldBB exactly once.");
+ bool Changed = false;
+ for (Use &Op : BI->operands())
+ if (Op == OldBB) {
+ Op.set(NewBB);
+ Changed = true;
+ }
+
+ if (Changed) {
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Insert, BI->getParent(), NewBB});
+ DTUpdates.push_back(
+ {DominatorTree::UpdateKind::Delete, BI->getParent(), OldBB});
+ }
+ assert(Changed && "Expected a successor to be updated");
+}
+
+// Move Lcssa PHIs to the right place.
+static void moveLCSSAPhis(BasicBlock *InnerExit, BasicBlock *InnerHeader,
+ BasicBlock *InnerLatch, BasicBlock *OuterHeader,
+ BasicBlock *OuterLatch, BasicBlock *OuterExit,
+ Loop *InnerLoop, LoopInfo *LI) {
+
+  // Deal with LCSSA PHI nodes in the exit block of the inner loop that are
+  // defined either in the header or the latch. Those blocks will become the
+  // header and latch of the new outer loop, and the only possible users can be
+  // PHI nodes in the exit block of the loop nest or the outer loop header
+  // (reduction PHIs; in that case, the incoming value must be defined in the
+  // inner loop header). We can just substitute the user with the incoming
+  // value and remove the PHI.
+ for (PHINode &P : make_early_inc_range(InnerExit->phis())) {
+ assert(P.getNumIncomingValues() == 1 &&
+ "Only loops with a single exit are supported!");
+
+    // Incoming values are currently guaranteed to be instructions.
+ auto IncI = cast<Instruction>(P.getIncomingValueForBlock(InnerLatch));
+ // Skip phis with incoming values from the inner loop body, excluding the
+ // header and latch.
+ if (IncI->getParent() != InnerLatch && IncI->getParent() != InnerHeader)
+ continue;
+
+ assert(all_of(P.users(),
+ [OuterHeader, OuterExit, IncI, InnerHeader](User *U) {
+ return (cast<PHINode>(U)->getParent() == OuterHeader &&
+ IncI->getParent() == InnerHeader) ||
+ cast<PHINode>(U)->getParent() == OuterExit;
+ }) &&
+ "Can only replace phis iff the uses are in the loop nest exit or "
+ "the incoming value is defined in the inner header (it will "
+ "dominate all loop blocks after interchanging)");
+ P.replaceAllUsesWith(IncI);
+ P.eraseFromParent();
+ }
+
+ SmallVector<PHINode *, 8> LcssaInnerExit;
+ for (PHINode &P : InnerExit->phis())
+ LcssaInnerExit.push_back(&P);
+
+ SmallVector<PHINode *, 8> LcssaInnerLatch;
+ for (PHINode &P : InnerLatch->phis())
+ LcssaInnerLatch.push_back(&P);
+
+ // Lcssa PHIs for values used outside the inner loop are in InnerExit.
+ // If a PHI node has users outside of InnerExit, it has a use outside the
+ // interchanged loop and we have to preserve it. We move these to
+ // InnerLatch, which will become the new exit block for the innermost
+ // loop after interchanging.
+ for (PHINode *P : LcssaInnerExit)
+ P->moveBefore(InnerLatch->getFirstNonPHI());
+
+ // If the inner loop latch contains LCSSA PHIs, those come from a child loop
+ // and we have to move them to the new inner latch.
+ for (PHINode *P : LcssaInnerLatch)
+ P->moveBefore(InnerExit->getFirstNonPHI());
+
+ // Deal with LCSSA PHI nodes in the loop nest exit block. For PHIs that have
+ // incoming values defined in the outer loop, we have to add a new PHI
+ // in the inner loop latch, which became the exit block of the outer loop,
+ // after interchanging.
+ if (OuterExit) {
+ for (PHINode &P : OuterExit->phis()) {
+ if (P.getNumIncomingValues() != 1)
+ continue;
+ // Skip Phis with incoming values defined in the inner loop. Those should
+ // already have been updated.
+ auto I = dyn_cast<Instruction>(P.getIncomingValue(0));
+ if (!I || LI->getLoopFor(I->getParent()) == InnerLoop)
+ continue;
+
+ PHINode *NewPhi = dyn_cast<PHINode>(P.clone());
+ NewPhi->setIncomingValue(0, P.getIncomingValue(0));
+ NewPhi->setIncomingBlock(0, OuterLatch);
+ NewPhi->insertBefore(InnerLatch->getFirstNonPHI());
+ P.setIncomingValue(0, NewPhi);
+ }
+ }
+
+ // Now adjust the incoming blocks for the LCSSA PHIs.
+ // For PHIs moved from Inner's exit block, we need to replace Inner's latch
+ // with the new latch.
+ InnerLatch->replacePhiUsesWith(InnerLatch, OuterLatch);
+}
+
+bool LoopInterchangeTransform::adjustLoopBranches() {
+ LLVM_DEBUG(dbgs() << "adjustLoopBranches called\n");
+ std::vector<DominatorTree::UpdateType> DTUpdates;
+
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+
+ assert(OuterLoopPreHeader != OuterLoop->getHeader() &&
+ InnerLoopPreHeader != InnerLoop->getHeader() && OuterLoopPreHeader &&
+ InnerLoopPreHeader && "Guaranteed by loop-simplify form");
+ // Ensure that both preheaders do not contain PHI nodes and have single
+ // predecessors. This allows us to move them easily. We use
+  // InsertPreheaderForLoop to create an 'extra' preheader, if the existing
+ // preheaders do not satisfy those conditions.
+ if (isa<PHINode>(OuterLoopPreHeader->begin()) ||
+ !OuterLoopPreHeader->getUniquePredecessor())
+ OuterLoopPreHeader =
+ InsertPreheaderForLoop(OuterLoop, DT, LI, nullptr, true);
+ if (InnerLoopPreHeader == OuterLoop->getHeader())
+ InnerLoopPreHeader =
+ InsertPreheaderForLoop(InnerLoop, DT, LI, nullptr, true);
+
+ // Adjust the loop preheader
+ BasicBlock *InnerLoopHeader = InnerLoop->getHeader();
+ BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ BasicBlock *OuterLoopLatch = OuterLoop->getLoopLatch();
+ BasicBlock *OuterLoopPredecessor = OuterLoopPreHeader->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchPredecessor =
+ InnerLoopLatch->getUniquePredecessor();
+ BasicBlock *InnerLoopLatchSuccessor;
+ BasicBlock *OuterLoopLatchSuccessor;
+
+ BranchInst *OuterLoopLatchBI =
+ dyn_cast<BranchInst>(OuterLoopLatch->getTerminator());
+ BranchInst *InnerLoopLatchBI =
+ dyn_cast<BranchInst>(InnerLoopLatch->getTerminator());
+ BranchInst *OuterLoopHeaderBI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+ BranchInst *InnerLoopHeaderBI =
+ dyn_cast<BranchInst>(InnerLoopHeader->getTerminator());
+
+ if (!OuterLoopPredecessor || !InnerLoopLatchPredecessor ||
+ !OuterLoopLatchBI || !InnerLoopLatchBI || !OuterLoopHeaderBI ||
+ !InnerLoopHeaderBI)
+ return false;
+
+ BranchInst *InnerLoopLatchPredecessorBI =
+ dyn_cast<BranchInst>(InnerLoopLatchPredecessor->getTerminator());
+ BranchInst *OuterLoopPredecessorBI =
+ dyn_cast<BranchInst>(OuterLoopPredecessor->getTerminator());
+
+ if (!OuterLoopPredecessorBI || !InnerLoopLatchPredecessorBI)
+ return false;
+ BasicBlock *InnerLoopHeaderSuccessor = InnerLoopHeader->getUniqueSuccessor();
+ if (!InnerLoopHeaderSuccessor)
+ return false;
+
+  // Adjust the loop preheaders and headers.
+ // The branches in the outer loop predecessor and the outer loop header can
+ // be unconditional branches or conditional branches with duplicates. Consider
+ // this when updating the successors.
+ updateSuccessor(OuterLoopPredecessorBI, OuterLoopPreHeader,
+ InnerLoopPreHeader, DTUpdates, /*MustUpdateOnce=*/false);
+ // The outer loop header might or might not branch to the outer latch.
+ // We are guaranteed to branch to the inner loop preheader.
if (llvm::is_contained(OuterLoopHeaderBI->successors(), OuterLoopLatch))
- updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
- /*MustUpdateOnce=*/false);
- updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
- InnerLoopHeaderSuccessor, DTUpdates,
- /*MustUpdateOnce=*/false);
-
- // Adjust reduction PHI's now that the incoming block has changed.
- InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
- OuterLoopHeader);
-
- updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
- OuterLoopPreHeader, DTUpdates);
-
- // -------------Adjust loop latches-----------
- if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
- InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
- else
- InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
-
- updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
- InnerLoopLatchSuccessor, DTUpdates);
-
-
- if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
- OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
- else
- OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
-
- updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
- OuterLoopLatchSuccessor, DTUpdates);
- updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
- DTUpdates);
-
- DT->applyUpdates(DTUpdates);
- restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
- OuterLoopPreHeader);
-
- moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
- OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
- InnerLoop, LI);
- // For PHIs in the exit block of the outer loop, outer's latch has been
- // replaced by Inners'.
- OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
-
- // Now update the reduction PHIs in the inner and outer loop headers.
- SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
+ updateSuccessor(OuterLoopHeaderBI, OuterLoopLatch, LoopExit, DTUpdates,
+ /*MustUpdateOnce=*/false);
+ updateSuccessor(OuterLoopHeaderBI, InnerLoopPreHeader,
+ InnerLoopHeaderSuccessor, DTUpdates,
+ /*MustUpdateOnce=*/false);
+
+  // Adjust reduction PHIs now that the incoming block has changed.
+ InnerLoopHeaderSuccessor->replacePhiUsesWith(InnerLoopHeader,
+ OuterLoopHeader);
+
+ updateSuccessor(InnerLoopHeaderBI, InnerLoopHeaderSuccessor,
+ OuterLoopPreHeader, DTUpdates);
+
+ // -------------Adjust loop latches-----------
+ if (InnerLoopLatchBI->getSuccessor(0) == InnerLoopHeader)
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(1);
+ else
+ InnerLoopLatchSuccessor = InnerLoopLatchBI->getSuccessor(0);
+
+ updateSuccessor(InnerLoopLatchPredecessorBI, InnerLoopLatch,
+ InnerLoopLatchSuccessor, DTUpdates);
+
+
+ if (OuterLoopLatchBI->getSuccessor(0) == OuterLoopHeader)
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(1);
+ else
+ OuterLoopLatchSuccessor = OuterLoopLatchBI->getSuccessor(0);
+
+ updateSuccessor(InnerLoopLatchBI, InnerLoopLatchSuccessor,
+ OuterLoopLatchSuccessor, DTUpdates);
+ updateSuccessor(OuterLoopLatchBI, OuterLoopLatchSuccessor, InnerLoopLatch,
+ DTUpdates);
+
+ DT->applyUpdates(DTUpdates);
+ restructureLoops(OuterLoop, InnerLoop, InnerLoopPreHeader,
+ OuterLoopPreHeader);
+
+ moveLCSSAPhis(InnerLoopLatchSuccessor, InnerLoopHeader, InnerLoopLatch,
+ OuterLoopHeader, OuterLoopLatch, InnerLoop->getExitBlock(),
+ InnerLoop, LI);
+  // For PHIs in the exit block of the outer loop, the outer loop's latch has
+  // been replaced by the inner loop's latch.
+ OuterLoopLatchSuccessor->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
+
+ // Now update the reduction PHIs in the inner and outer loop headers.
+ SmallVector<PHINode *, 4> InnerLoopPHIs, OuterLoopPHIs;
for (PHINode &PHI : drop_begin(InnerLoopHeader->phis()))
- InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
+ InnerLoopPHIs.push_back(cast<PHINode>(&PHI));
for (PHINode &PHI : drop_begin(OuterLoopHeader->phis()))
- OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
-
- auto &OuterInnerReductions = LIL.getOuterInnerReductions();
- (void)OuterInnerReductions;
-
- // Now move the remaining reduction PHIs from outer to inner loop header and
- // vice versa. The PHI nodes must be part of a reduction across the inner and
- // outer loop and all the remains to do is and updating the incoming blocks.
- for (PHINode *PHI : OuterLoopPHIs) {
- PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
- }
- for (PHINode *PHI : InnerLoopPHIs) {
- PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
- }
-
- // Update the incoming blocks for moved PHI nodes.
- OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
- OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
- InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
- InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
-
+ OuterLoopPHIs.push_back(cast<PHINode>(&PHI));
+
+ auto &OuterInnerReductions = LIL.getOuterInnerReductions();
+ (void)OuterInnerReductions;
+
+ // Now move the remaining reduction PHIs from outer to inner loop header and
+ // vice versa. The PHI nodes must be part of a reduction across the inner and
+  // outer loop; all that remains to be done is updating the incoming blocks.
+ for (PHINode *PHI : OuterLoopPHIs) {
+ PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
+ }
+ for (PHINode *PHI : InnerLoopPHIs) {
+ PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
+ }
+
+ // Update the incoming blocks for moved PHI nodes.
+ OuterLoopHeader->replacePhiUsesWith(InnerLoopPreHeader, OuterLoopPreHeader);
+ OuterLoopHeader->replacePhiUsesWith(InnerLoopLatch, OuterLoopLatch);
+ InnerLoopHeader->replacePhiUsesWith(OuterLoopPreHeader, InnerLoopPreHeader);
+ InnerLoopHeader->replacePhiUsesWith(OuterLoopLatch, InnerLoopLatch);
+
// Values defined in the outer loop header could be used in the inner loop
// latch. In that case, we need to create LCSSA phis for them, because after
// interchanging they will be defined in the new inner loop and used in the
@@ -1621,27 +1621,27 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
MayNeedLCSSAPhis.push_back(&I);
formLCSSAForInstructions(MayNeedLCSSAPhis, *DT, *LI, SE, Builder);
- return true;
-}
-
-bool LoopInterchangeTransform::adjustLoopLinks() {
- // Adjust all branches in the inner and outer loop.
- bool Changed = adjustLoopBranches();
- if (Changed) {
- // We have interchanged the preheaders so we need to interchange the data in
- // the preheaders as well. This is because the content of the inner
- // preheader was previously executed inside the outer loop.
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
- }
- return Changed;
-}
-
+ return true;
+}
+
+bool LoopInterchangeTransform::adjustLoopLinks() {
+ // Adjust all branches in the inner and outer loop.
+ bool Changed = adjustLoopBranches();
+ if (Changed) {
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheaders as well. This is because the content of the inner
+ // preheader was previously executed inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
+ }
+ return Changed;
+}
+
/// Main LoopInterchange Pass.
struct LoopInterchangeLegacyPass : public LoopPass {
static char ID;
-
+
LoopInterchangeLegacyPass() : LoopPass(ID) {
initializeLoopInterchangeLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -1670,14 +1670,14 @@ struct LoopInterchangeLegacyPass : public LoopPass {
char LoopInterchangeLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopInterchangeLegacyPass, "loop-interchange",
- "Interchanges loops for cache reuse", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-
+ "Interchanges loops for cache reuse", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+
INITIALIZE_PASS_END(LoopInterchangeLegacyPass, "loop-interchange",
- "Interchanges loops for cache reuse", false, false)
-
+ "Interchanges loops for cache reuse", false, false)
+
Pass *llvm::createLoopInterchangePass() {
return new LoopInterchangeLegacyPass();
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
index e82d9f5407..058612149a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -1,510 +1,510 @@
-//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implement a loop-aware load elimination pass.
-//
-// It uses LoopAccessAnalysis to identify loop-carried dependences with a
-// distance of one between stores and loads. These form the candidates for the
-// transformation. The source value of each store then propagated to the user
-// of the corresponding load. This makes the load dead.
-//
-// The pass can also version the loop and add memchecks in order to prove that
-// may-aliasing stores can't change the value in memory before it's read by the
-// load.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
+//===- LoopLoadElimination.cpp - Loop Load Elimination Pass ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a loop-aware load elimination pass.
+//
+// It uses LoopAccessAnalysis to identify loop-carried dependences with a
+// distance of one between stores and loads. These form the candidates for the
+// transformation. The source value of each store is then propagated to the user
+// of the corresponding load. This makes the load dead.
+//
+// The pass can also version the loop and add memchecks in order to prove that
+// may-aliasing stores can't change the value in memory before it's read by the
+// load.
+//
+//===----------------------------------------------------------------------===//
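// A minimal source-level sketch of the transformation, assuming a simple int
// array loop; the function names and the payload computation below are
// illustrative only and do not appear anywhere in LLVM.

#include <cstddef>

void beforeLLE(int *A, std::size_t N) {
  for (std::size_t I = 0; I + 1 < N; ++I) {
    int X = A[I];      // load fed by the store of the previous iteration
    A[I] = X + 1;      // some use of the loaded value
    A[I + 1] = (int)I; // store with a dependence distance of one
  }
}

void afterLLE(int *A, std::size_t N) {
  if (N < 2)
    return;
  int Fwd = A[0];      // "load_initial", hoisted into the preheader
  for (std::size_t I = 0; I + 1 < N; ++I) {
    A[I] = Fwd + 1;    // the now-dead load is replaced by the forwarded value
    Fwd = (int)I;      // the stored value, carried across the backedge
    A[I + 1] = Fwd;
  }
}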
+
+#include "llvm/Transforms/Scalar/LoopLoadElimination.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include <algorithm>
-#include <cassert>
-#include <forward_list>
-#include <set>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LLE_OPTION "loop-load-elim"
-#define DEBUG_TYPE LLE_OPTION
-
-static cl::opt<unsigned> CheckPerElim(
- "runtime-check-per-loop-load-elim", cl::Hidden,
- cl::desc("Max number of memchecks allowed per eliminated load on average"),
- cl::init(1));
-
-static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
- "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed for Loop "
- "Load Elimination"));
-
-STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
-
-namespace {
-
-/// Represent a store-to-load forwarding candidate.
-struct StoreToLoadForwardingCandidate {
- LoadInst *Load;
- StoreInst *Store;
-
- StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
- : Load(Load), Store(Store) {}
-
- /// Return true if the dependence from the store to the load has a
- /// distance of one. E.g. A[i+1] = A[i]
- bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
- Loop *L) const {
- Value *LoadPtr = Load->getPointerOperand();
- Value *StorePtr = Store->getPointerOperand();
- Type *LoadPtrType = LoadPtr->getType();
- Type *LoadType = LoadPtrType->getPointerElementType();
-
- assert(LoadPtrType->getPointerAddressSpace() ==
- StorePtr->getType()->getPointerAddressSpace() &&
- LoadType == StorePtr->getType()->getPointerElementType() &&
- "Should be a known dependence");
-
- // Currently we only support accesses with unit stride. FIXME: we should be
- // able to handle non-unit stride as well as long as the stride is equal to
- // the dependence distance.
- if (getPtrStride(PSE, LoadPtr, L) != 1 ||
- getPtrStride(PSE, StorePtr, L) != 1)
- return false;
-
- auto &DL = Load->getParent()->getModule()->getDataLayout();
- unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
-
- auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
- auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
-
- // We don't need to check non-wrapping here because forward/backward
- // dependence wouldn't be valid if these weren't monotonic accesses.
- auto *Dist = cast<SCEVConstant>(
- PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
- const APInt &Val = Dist->getAPInt();
- return Val == TypeByteSize;
- }
-
- Value *getLoadPtr() const { return Load->getPointerOperand(); }
-
-#ifndef NDEBUG
- friend raw_ostream &operator<<(raw_ostream &OS,
- const StoreToLoadForwardingCandidate &Cand) {
- OS << *Cand.Store << " -->\n";
- OS.indent(2) << *Cand.Load << "\n";
- return OS;
- }
-#endif
-};
-
-} // end anonymous namespace
-
-/// Check if the store dominates all latches, so as long as there is no
-/// intervening store this value will be loaded in the next iteration.
-static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
- DominatorTree *DT) {
- SmallVector<BasicBlock *, 8> Latches;
- L->getLoopLatches(Latches);
- return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
- return DT->dominates(StoreBlock, Latch);
- });
-}
-
-/// Return true if the load is not executed on all paths in the loop.
-static bool isLoadConditional(LoadInst *Load, Loop *L) {
- return Load->getParent() != L->getHeader();
-}
-
-namespace {
-
-/// The per-loop class that does most of the work.
-class LoadEliminationForLoop {
-public:
- LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
- DominatorTree *DT, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo* PSI)
- : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
-
- /// Look through the loop-carried and loop-independent dependences in
- /// this loop and find store->load dependences.
- ///
- /// Note that no candidate is returned if LAA has failed to analyze the loop
- /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
- std::forward_list<StoreToLoadForwardingCandidate>
- findStoreToLoadDependences(const LoopAccessInfo &LAI) {
- std::forward_list<StoreToLoadForwardingCandidate> Candidates;
-
- const auto *Deps = LAI.getDepChecker().getDependences();
- if (!Deps)
- return Candidates;
-
- // Find store->load dependences (consequently true dep). Both lexically
- // forward and backward dependences qualify. Disqualify loads that have
- // other unknown dependences.
-
- SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
-
- for (const auto &Dep : *Deps) {
- Instruction *Source = Dep.getSource(LAI);
- Instruction *Destination = Dep.getDestination(LAI);
-
- if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
- if (isa<LoadInst>(Source))
- LoadsWithUnknownDepedence.insert(Source);
- if (isa<LoadInst>(Destination))
- LoadsWithUnknownDepedence.insert(Destination);
- continue;
- }
-
- if (Dep.isBackward())
- // Note that the designations source and destination follow the program
- // order, i.e. source is always first. (The direction is given by the
- // DepType.)
- std::swap(Source, Destination);
- else
- assert(Dep.isForward() && "Needs to be a forward dependence");
-
- auto *Store = dyn_cast<StoreInst>(Source);
- if (!Store)
- continue;
- auto *Load = dyn_cast<LoadInst>(Destination);
- if (!Load)
- continue;
-
- // Only propagate the value if they are of the same type.
- if (Store->getPointerOperandType() != Load->getPointerOperandType())
- continue;
-
- Candidates.emplace_front(Load, Store);
- }
-
- if (!LoadsWithUnknownDepedence.empty())
- Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
- return LoadsWithUnknownDepedence.count(C.Load);
- });
-
- return Candidates;
- }
-
- /// Return the index of the instruction according to program order.
- unsigned getInstrIndex(Instruction *Inst) {
- auto I = InstOrder.find(Inst);
- assert(I != InstOrder.end() && "No index for instruction");
- return I->second;
- }
-
- /// If a load has multiple candidates associated (i.e. different
- /// stores), it means that it could be forwarding from multiple stores
- /// depending on control flow. Remove these candidates.
- ///
- /// Here, we rely on LAA to include the relevant loop-independent dependences.
- /// LAA is known to omit these in the very simple case when the read and the
- /// write within an alias set always take place using the *same* pointer.
- ///
- /// However, we know that this is not the case here, i.e. we can rely on LAA
- /// to provide us with loop-independent dependences for the cases we're
- /// interested in. Consider, for example, the case where a loop-independent
- /// dependence S1->S2 invalidates the forwarding S3->S2.
- ///
- /// A[i] = ... (S1)
- /// ... = A[i] (S2)
- /// A[i+1] = ... (S3)
- ///
- /// LAA will perform dependence analysis here because there are two
- /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
- void removeDependencesFromMultipleStores(
- std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
- // If Store is nullptr it means that we have multiple stores forwarding to
- // this store.
- using LoadToSingleCandT =
- DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
- LoadToSingleCandT LoadToSingleCand;
-
- for (const auto &Cand : Candidates) {
- bool NewElt;
- LoadToSingleCandT::iterator Iter;
-
- std::tie(Iter, NewElt) =
- LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
- if (!NewElt) {
- const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
- // Already multiple stores forward to this load.
- if (OtherCand == nullptr)
- continue;
-
- // Handle the very basic case when the two stores are in the same block
- // so deciding which one forwards is easy. The later one forwards as
- // long as they both have a dependence distance of one to the load.
- if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
- Cand.isDependenceDistanceOfOne(PSE, L) &&
- OtherCand->isDependenceDistanceOfOne(PSE, L)) {
- // They are in the same block, the later one will forward to the load.
- if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
- OtherCand = &Cand;
- } else
- OtherCand = nullptr;
- }
- }
-
- Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
- if (LoadToSingleCand[Cand.Load] != &Cand) {
- LLVM_DEBUG(
- dbgs() << "Removing from candidates: \n"
- << Cand
- << " The load may have multiple stores forwarding to "
- << "it\n");
- return true;
- }
- return false;
- });
- }
-
- /// Given two pointer operations by their RuntimePointerChecking
- /// indices, return true if they require an alias check.
- ///
- /// We need a check if one is a pointer for a candidate load and the other is
- /// a pointer for a possibly intervening store.
- bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include <algorithm>
+#include <cassert>
+#include <forward_list>
+#include <set>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LLE_OPTION "loop-load-elim"
+#define DEBUG_TYPE LLE_OPTION
+
+static cl::opt<unsigned> CheckPerElim(
+ "runtime-check-per-loop-load-elim", cl::Hidden,
+ cl::desc("Max number of memchecks allowed per eliminated load on average"),
+ cl::init(1));
+
+static cl::opt<unsigned> LoadElimSCEVCheckThreshold(
+ "loop-load-elimination-scev-check-threshold", cl::init(8), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed for Loop "
+ "Load Elimination"));
+
+STATISTIC(NumLoopLoadEliminted, "Number of loads eliminated by LLE");
+
+namespace {
+
+/// Represent a store-to-load forwarding candidate.
+struct StoreToLoadForwardingCandidate {
+ LoadInst *Load;
+ StoreInst *Store;
+
+ StoreToLoadForwardingCandidate(LoadInst *Load, StoreInst *Store)
+ : Load(Load), Store(Store) {}
+
+ /// Return true if the dependence from the store to the load has a
+ /// distance of one. E.g. A[i+1] = A[i]
+ bool isDependenceDistanceOfOne(PredicatedScalarEvolution &PSE,
+ Loop *L) const {
+ Value *LoadPtr = Load->getPointerOperand();
+ Value *StorePtr = Store->getPointerOperand();
+ Type *LoadPtrType = LoadPtr->getType();
+ Type *LoadType = LoadPtrType->getPointerElementType();
+
+ assert(LoadPtrType->getPointerAddressSpace() ==
+ StorePtr->getType()->getPointerAddressSpace() &&
+ LoadType == StorePtr->getType()->getPointerElementType() &&
+ "Should be a known dependence");
+
+ // Currently we only support accesses with unit stride. FIXME: we should be
+ // able to handle non-unit stride as well as long as the stride is equal to
+ // the dependence distance.
+ if (getPtrStride(PSE, LoadPtr, L) != 1 ||
+ getPtrStride(PSE, StorePtr, L) != 1)
+ return false;
+
+ auto &DL = Load->getParent()->getModule()->getDataLayout();
+ unsigned TypeByteSize = DL.getTypeAllocSize(const_cast<Type *>(LoadType));
+
+ auto *LoadPtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(LoadPtr));
+ auto *StorePtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(StorePtr));
+
+ // We don't need to check non-wrapping here because forward/backward
+ // dependence wouldn't be valid if these weren't monotonic accesses.
+ auto *Dist = cast<SCEVConstant>(
+ PSE.getSE()->getMinusSCEV(StorePtrSCEV, LoadPtrSCEV));
+ const APInt &Val = Dist->getAPInt();
+ return Val == TypeByteSize;
+ }
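  // Worked example (assuming an i32 element type, not taken from the patch):
  // for "A[i + 1] = A[i]" the load pointer is &A[i] and the store pointer is
  // &A[i + 1], both unit-stride add recurrences, so Dist is 4 bytes, which
  // matches DL.getTypeAllocSize(i32), and the check above succeeds. A pattern
  // like "A[i + 2] = A[i]" yields Dist == 8 and is rejected here, even though
  // forwarding across two iterations is conceivable.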
+
+ Value *getLoadPtr() const { return Load->getPointerOperand(); }
+
+#ifndef NDEBUG
+ friend raw_ostream &operator<<(raw_ostream &OS,
+ const StoreToLoadForwardingCandidate &Cand) {
+ OS << *Cand.Store << " -->\n";
+ OS.indent(2) << *Cand.Load << "\n";
+ return OS;
+ }
+#endif
+};
+
+} // end anonymous namespace
+
+/// Check if the store dominates all latches, so as long as there is no
+/// intervening store this value will be loaded in the next iteration.
+static bool doesStoreDominatesAllLatches(BasicBlock *StoreBlock, Loop *L,
+ DominatorTree *DT) {
+ SmallVector<BasicBlock *, 8> Latches;
+ L->getLoopLatches(Latches);
+ return llvm::all_of(Latches, [&](const BasicBlock *Latch) {
+ return DT->dominates(StoreBlock, Latch);
+ });
+}
+
+/// Return true if the load is not executed on all paths in the loop.
+static bool isLoadConditional(LoadInst *Load, Loop *L) {
+ return Load->getParent() != L->getHeader();
+}
+
+namespace {
+
+/// The per-loop class that does most of the work.
+class LoadEliminationForLoop {
+public:
+ LoadEliminationForLoop(Loop *L, LoopInfo *LI, const LoopAccessInfo &LAI,
+ DominatorTree *DT, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo* PSI)
+ : L(L), LI(LI), LAI(LAI), DT(DT), BFI(BFI), PSI(PSI), PSE(LAI.getPSE()) {}
+
+ /// Look through the loop-carried and loop-independent dependences in
+ /// this loop and find store->load dependences.
+ ///
+ /// Note that no candidate is returned if LAA has failed to analyze the loop
+ /// (e.g. if it's not bottom-tested, contains volatile memops, etc.)
+ std::forward_list<StoreToLoadForwardingCandidate>
+ findStoreToLoadDependences(const LoopAccessInfo &LAI) {
+ std::forward_list<StoreToLoadForwardingCandidate> Candidates;
+
+ const auto *Deps = LAI.getDepChecker().getDependences();
+ if (!Deps)
+ return Candidates;
+
+ // Find store->load dependences (consequently true dep). Both lexically
+ // forward and backward dependences qualify. Disqualify loads that have
+ // other unknown dependences.
+
+ SmallPtrSet<Instruction *, 4> LoadsWithUnknownDepedence;
+
+ for (const auto &Dep : *Deps) {
+ Instruction *Source = Dep.getSource(LAI);
+ Instruction *Destination = Dep.getDestination(LAI);
+
+ if (Dep.Type == MemoryDepChecker::Dependence::Unknown) {
+ if (isa<LoadInst>(Source))
+ LoadsWithUnknownDepedence.insert(Source);
+ if (isa<LoadInst>(Destination))
+ LoadsWithUnknownDepedence.insert(Destination);
+ continue;
+ }
+
+ if (Dep.isBackward())
+ // Note that the designations source and destination follow the program
+ // order, i.e. source is always first. (The direction is given by the
+ // DepType.)
+ std::swap(Source, Destination);
+ else
+ assert(Dep.isForward() && "Needs to be a forward dependence");
+
+ auto *Store = dyn_cast<StoreInst>(Source);
+ if (!Store)
+ continue;
+ auto *Load = dyn_cast<LoadInst>(Destination);
+ if (!Load)
+ continue;
+
+ // Only propagate the value if they are of the same type.
+ if (Store->getPointerOperandType() != Load->getPointerOperandType())
+ continue;
+
+ Candidates.emplace_front(Load, Store);
+ }
+
+ if (!LoadsWithUnknownDepedence.empty())
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &C) {
+ return LoadsWithUnknownDepedence.count(C.Load);
+ });
+
+ return Candidates;
+ }
+
+ /// Return the index of the instruction according to program order.
+ unsigned getInstrIndex(Instruction *Inst) {
+ auto I = InstOrder.find(Inst);
+ assert(I != InstOrder.end() && "No index for instruction");
+ return I->second;
+ }
+
+ /// If a load has multiple candidates associated (i.e. different
+ /// stores), it means that it could be forwarding from multiple stores
+ /// depending on control flow. Remove these candidates.
+ ///
+ /// Here, we rely on LAA to include the relevant loop-independent dependences.
+ /// LAA is known to omit these in the very simple case when the read and the
+ /// write within an alias set always take place using the *same* pointer.
+ ///
+ /// However, we know that this is not the case here, i.e. we can rely on LAA
+ /// to provide us with loop-independent dependences for the cases we're
+ /// interested in. Consider, for example, the case where a loop-independent
+ /// dependence S1->S2 invalidates the forwarding S3->S2.
+ ///
+ /// A[i] = ... (S1)
+ /// ... = A[i] (S2)
+ /// A[i+1] = ... (S3)
+ ///
+ /// LAA will perform dependence analysis here because there are two
+ /// *different* pointers involved in the same alias set (&A[i] and &A[i+1]).
+ void removeDependencesFromMultipleStores(
+ std::forward_list<StoreToLoadForwardingCandidate> &Candidates) {
+ // If Store is nullptr it means that we have multiple stores forwarding to
+ // this store.
+ using LoadToSingleCandT =
+ DenseMap<LoadInst *, const StoreToLoadForwardingCandidate *>;
+ LoadToSingleCandT LoadToSingleCand;
+
+ for (const auto &Cand : Candidates) {
+ bool NewElt;
+ LoadToSingleCandT::iterator Iter;
+
+ std::tie(Iter, NewElt) =
+ LoadToSingleCand.insert(std::make_pair(Cand.Load, &Cand));
+ if (!NewElt) {
+ const StoreToLoadForwardingCandidate *&OtherCand = Iter->second;
+ // Already multiple stores forward to this load.
+ if (OtherCand == nullptr)
+ continue;
+
+ // Handle the very basic case when the two stores are in the same block
+ // so deciding which one forwards is easy. The later one forwards as
+ // long as they both have a dependence distance of one to the load.
+ if (Cand.Store->getParent() == OtherCand->Store->getParent() &&
+ Cand.isDependenceDistanceOfOne(PSE, L) &&
+ OtherCand->isDependenceDistanceOfOne(PSE, L)) {
+ // They are in the same block, the later one will forward to the load.
+ if (getInstrIndex(OtherCand->Store) < getInstrIndex(Cand.Store))
+ OtherCand = &Cand;
+ } else
+ OtherCand = nullptr;
+ }
+ }
+
+ Candidates.remove_if([&](const StoreToLoadForwardingCandidate &Cand) {
+ if (LoadToSingleCand[Cand.Load] != &Cand) {
+ LLVM_DEBUG(
+ dbgs() << "Removing from candidates: \n"
+ << Cand
+ << " The load may have multiple stores forwarding to "
+ << "it\n");
+ return true;
+ }
+ return false;
+ });
+ }
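  // Illustrative example (assumed source, not from the patch) of the
  // same-block disambiguation above:
  //   A[i + 1] = X;  // S1
  //   A[i + 1] = Y;  // S2, later in the same block
  //   ...  = A[i];   // the forwarded-to load on the next iteration
  // Both S1 and S2 are distance-one candidates for the load; S2 is kept
  // because its value is the one that actually reaches the load.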
+
+ /// Given two pointer operations by their RuntimePointerChecking
+ /// indices, return true if they require an alias check.
+ ///
+ /// We need a check if one is a pointer for a candidate load and the other is
+ /// a pointer for a possibly intervening store.
+ bool needsChecking(unsigned PtrIdx1, unsigned PtrIdx2,
const SmallPtrSetImpl<Value *> &PtrsWrittenOnFwdingPath,
const SmallPtrSetImpl<Value *> &CandLoadPtrs) {
- Value *Ptr1 =
- LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
- Value *Ptr2 =
- LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
- return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
- (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
- }
-
- /// Return pointers that are possibly written to on the path from a
- /// forwarding store to a load.
- ///
- /// These pointers need to be alias-checked against the forwarding candidates.
- SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
- const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
- // From FirstStore to LastLoad neither of the elimination candidate loads
- // should overlap with any of the stores.
- //
- // E.g.:
- //
- // st1 C[i]
- // ld1 B[i] <-------,
- // ld0 A[i] <----, | * LastLoad
- // ... | |
- // st2 E[i] | |
- // st3 B[i+1] -- | -' * FirstStore
- // st0 A[i+1] ---'
- // st4 D[i]
- //
- // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
- // ld0.
-
- LoadInst *LastLoad =
- std::max_element(Candidates.begin(), Candidates.end(),
- [&](const StoreToLoadForwardingCandidate &A,
- const StoreToLoadForwardingCandidate &B) {
- return getInstrIndex(A.Load) < getInstrIndex(B.Load);
- })
- ->Load;
- StoreInst *FirstStore =
- std::min_element(Candidates.begin(), Candidates.end(),
- [&](const StoreToLoadForwardingCandidate &A,
- const StoreToLoadForwardingCandidate &B) {
- return getInstrIndex(A.Store) <
- getInstrIndex(B.Store);
- })
- ->Store;
-
- // We're looking for stores after the first forwarding store until the end
- // of the loop, then from the beginning of the loop until the last
- // forwarded-to load. Collect the pointers of these stores.
- SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
-
- auto InsertStorePtr = [&](Instruction *I) {
- if (auto *S = dyn_cast<StoreInst>(I))
- PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
- };
- const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
- std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
- MemInstrs.end(), InsertStorePtr);
- std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
- InsertStorePtr);
-
- return PtrsWrittenOnFwdingPath;
- }
-
- /// Determine the pointer alias checks to prove that there are no
- /// intervening stores.
- SmallVector<RuntimePointerCheck, 4> collectMemchecks(
- const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
-
- SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
- findPointersWrittenOnForwardingPath(Candidates);
-
- // Collect the pointers of the candidate loads.
+ Value *Ptr1 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx1).PointerValue;
+ Value *Ptr2 =
+ LAI.getRuntimePointerChecking()->getPointerInfo(PtrIdx2).PointerValue;
+ return ((PtrsWrittenOnFwdingPath.count(Ptr1) && CandLoadPtrs.count(Ptr2)) ||
+ (PtrsWrittenOnFwdingPath.count(Ptr2) && CandLoadPtrs.count(Ptr1)));
+ }
+
+ /// Return pointers that are possibly written to on the path from a
+ /// forwarding store to a load.
+ ///
+ /// These pointers need to be alias-checked against the forwarding candidates.
+ SmallPtrSet<Value *, 4> findPointersWrittenOnForwardingPath(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+ // From FirstStore to LastLoad neither of the elimination candidate loads
+ // should overlap with any of the stores.
+ //
+ // E.g.:
+ //
+ // st1 C[i]
+ // ld1 B[i] <-------,
+ // ld0 A[i] <----, | * LastLoad
+ // ... | |
+ // st2 E[i] | |
+ // st3 B[i+1] -- | -' * FirstStore
+ // st0 A[i+1] ---'
+ // st4 D[i]
+ //
+ // st0 forwards to ld0 if the accesses in st4 and st1 don't overlap with
+ // ld0.
+
+ LoadInst *LastLoad =
+ std::max_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Load) < getInstrIndex(B.Load);
+ })
+ ->Load;
+ StoreInst *FirstStore =
+ std::min_element(Candidates.begin(), Candidates.end(),
+ [&](const StoreToLoadForwardingCandidate &A,
+ const StoreToLoadForwardingCandidate &B) {
+ return getInstrIndex(A.Store) <
+ getInstrIndex(B.Store);
+ })
+ ->Store;
+
+ // We're looking for stores after the first forwarding store until the end
+ // of the loop, then from the beginning of the loop until the last
+ // forwarded-to load. Collect the pointers of these stores.
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath;
+
+ auto InsertStorePtr = [&](Instruction *I) {
+ if (auto *S = dyn_cast<StoreInst>(I))
+ PtrsWrittenOnFwdingPath.insert(S->getPointerOperand());
+ };
+ const auto &MemInstrs = LAI.getDepChecker().getMemoryInstructions();
+ std::for_each(MemInstrs.begin() + getInstrIndex(FirstStore) + 1,
+ MemInstrs.end(), InsertStorePtr);
+ std::for_each(MemInstrs.begin(), &MemInstrs[getInstrIndex(LastLoad)],
+ InsertStorePtr);
+
+ return PtrsWrittenOnFwdingPath;
+ }
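  // Note: the two std::for_each ranges above together walk the memory
  // instructions "around the backedge": from just after the first forwarding
  // store to the end of the loop body, and then from the start of the body up
  // to the last forwarded-to load. That is precisely the region an intervening
  // store would have to occupy to clobber the forwarded value.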
+
+ /// Determine the pointer alias checks to prove that there are no
+ /// intervening stores.
+ SmallVector<RuntimePointerCheck, 4> collectMemchecks(
+ const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
+
+ SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
+ findPointersWrittenOnForwardingPath(Candidates);
+
+ // Collect the pointers of the candidate loads.
SmallPtrSet<Value *, 4> CandLoadPtrs;
for (const auto &Candidate : Candidates)
CandLoadPtrs.insert(Candidate.getLoadPtr());
-
- const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
- SmallVector<RuntimePointerCheck, 4> Checks;
-
- copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerCheck &Check) {
- for (auto PtrIdx1 : Check.first->Members)
- for (auto PtrIdx2 : Check.second->Members)
- if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
- CandLoadPtrs))
- return true;
- return false;
- });
-
- LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
- << "):\n");
- LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
-
- return Checks;
- }
-
- /// Perform the transformation for a candidate.
- void
- propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
- SCEVExpander &SEE) {
- // loop:
- // %x = load %gep_i
- // = ... %x
- // store %y, %gep_i_plus_1
- //
- // =>
- //
- // ph:
- // %x.initial = load %gep_0
- // loop:
- // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
- // %x = load %gep_i <---- now dead
- // = ... %x.storeforward
- // store %y, %gep_i_plus_1
-
- Value *Ptr = Cand.Load->getPointerOperand();
- auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
- auto *PH = L->getLoopPreheader();
- assert(PH && "Preheader should exist!");
- Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
- PH->getTerminator());
- Value *Initial = new LoadInst(
- Cand.Load->getType(), InitialPtr, "load_initial",
- /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator());
-
- PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
- &L->getHeader()->front());
- PHI->addIncoming(Initial, PH);
- PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
-
- Cand.Load->replaceAllUsesWith(PHI);
- }
-
- /// Top-level driver for each loop: find store->load forwarding
- /// candidates, add run-time checks and perform transformation.
- bool processLoop() {
- LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
- << "\" checking " << *L << "\n");
-
- // Look for store-to-load forwarding cases across the
- // backedge. E.g.:
- //
- // loop:
- // %x = load %gep_i
- // = ... %x
- // store %y, %gep_i_plus_1
- //
- // =>
- //
- // ph:
- // %x.initial = load %gep_0
- // loop:
- // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
- // %x = load %gep_i <---- now dead
- // = ... %x.storeforward
- // store %y, %gep_i_plus_1
-
- // First start with store->load dependences.
- auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
- if (StoreToLoadDependences.empty())
- return false;
-
- // Generate an index for each load and store according to the original
- // program order. This will be used later.
- InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
-
- // To keep things simple for now, remove those where the load is potentially
- // fed by multiple stores.
- removeDependencesFromMultipleStores(StoreToLoadDependences);
- if (StoreToLoadDependences.empty())
- return false;
-
- // Filter the candidates further.
- SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
- for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
- LLVM_DEBUG(dbgs() << "Candidate " << Cand);
-
- // Make sure that the stored value is available everywhere in the loop in
- // the next iteration.
- if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
- continue;
-
- // If the load is conditional we can't hoist its 0-iteration instance to
- // the preheader because that would make it unconditional. Thus we would
- // access a memory location that the original loop did not access.
- if (isLoadConditional(Cand.Load, L))
- continue;
-
- // Check whether the SCEV difference is the same as the induction step,
- // thus we load the value in the next iteration.
- if (!Cand.isDependenceDistanceOfOne(PSE, L))
- continue;
-
+
+ const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
+ SmallVector<RuntimePointerCheck, 4> Checks;
+
+ copy_if(AllChecks, std::back_inserter(Checks),
+ [&](const RuntimePointerCheck &Check) {
+ for (auto PtrIdx1 : Check.first->Members)
+ for (auto PtrIdx2 : Check.second->Members)
+ if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+ CandLoadPtrs))
+ return true;
+ return false;
+ });
+
+ LLVM_DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size()
+ << "):\n");
+ LLVM_DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
+
+ return Checks;
+ }
+
+ /// Perform the transformation for a candidate.
+ void
+ propagateStoredValueToLoadUsers(const StoreToLoadForwardingCandidate &Cand,
+ SCEVExpander &SEE) {
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ Value *Ptr = Cand.Load->getPointerOperand();
+ auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
+ auto *PH = L->getLoopPreheader();
+ assert(PH && "Preheader should exist!");
+ Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
+ PH->getTerminator());
+ Value *Initial = new LoadInst(
+ Cand.Load->getType(), InitialPtr, "load_initial",
+ /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator());
+
+ PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
+ &L->getHeader()->front());
+ PHI->addIncoming(Initial, PH);
+ PHI->addIncoming(Cand.Store->getOperand(0), L->getLoopLatch());
+
+ Cand.Load->replaceAllUsesWith(PHI);
+ }
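  // Note: the code above relies on the loop being in loop-simplify form (a
  // preheader and a single latch); processLoop() below bails out on loops that
  // are not in that form, so getLoopPreheader() and getLoopLatch() return
  // non-null blocks by the time this runs.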
+
+ /// Top-level driver for each loop: find store->load forwarding
+ /// candidates, add run-time checks and perform transformation.
+ bool processLoop() {
+ LLVM_DEBUG(dbgs() << "\nIn \"" << L->getHeader()->getParent()->getName()
+ << "\" checking " << *L << "\n");
+
+ // Look for store-to-load forwarding cases across the
+ // backedge. E.g.:
+ //
+ // loop:
+ // %x = load %gep_i
+ // = ... %x
+ // store %y, %gep_i_plus_1
+ //
+ // =>
+ //
+ // ph:
+ // %x.initial = load %gep_0
+ // loop:
+ // %x.storeforward = phi [%x.initial, %ph] [%y, %loop]
+ // %x = load %gep_i <---- now dead
+ // = ... %x.storeforward
+ // store %y, %gep_i_plus_1
+
+ // First start with store->load dependences.
+ auto StoreToLoadDependences = findStoreToLoadDependences(LAI);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Generate an index for each load and store according to the original
+ // program order. This will be used later.
+ InstOrder = LAI.getDepChecker().generateInstructionOrderMap();
+
+ // To keep things simple for now, remove those where the load is potentially
+ // fed by multiple stores.
+ removeDependencesFromMultipleStores(StoreToLoadDependences);
+ if (StoreToLoadDependences.empty())
+ return false;
+
+ // Filter the candidates further.
+ SmallVector<StoreToLoadForwardingCandidate, 4> Candidates;
+ for (const StoreToLoadForwardingCandidate &Cand : StoreToLoadDependences) {
+ LLVM_DEBUG(dbgs() << "Candidate " << Cand);
+
+ // Make sure that the stored value is available everywhere in the loop in
+ // the next iteration.
+ if (!doesStoreDominatesAllLatches(Cand.Store->getParent(), L, DT))
+ continue;
+
+ // If the load is conditional we can't hoist its 0-iteration instance to
+ // the preheader because that would make it unconditional. Thus we would
+ // access a memory location that the original loop did not access.
+ if (isLoadConditional(Cand.Load, L))
+ continue;
+
+ // Check whether the SCEV difference is the same as the induction step,
+ // thus we load the value in the next iteration.
+ if (!Cand.isDependenceDistanceOfOne(PSE, L))
+ continue;
+
assert(isa<SCEVAddRecExpr>(PSE.getSCEV(Cand.Load->getPointerOperand())) &&
"Loading from something other than indvar?");
assert(
@@ -512,59 +512,59 @@ public:
"Storing to something other than indvar?");
Candidates.push_back(Cand);
- LLVM_DEBUG(
- dbgs()
+ LLVM_DEBUG(
+ dbgs()
<< Candidates.size()
- << ". Valid store-to-load forwarding across the loop backedge\n");
- }
- if (Candidates.empty())
- return false;
-
- // Check intervening may-alias stores. These need runtime checks for alias
- // disambiguation.
- SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
-
- // Too many checks are likely to outweigh the benefits of forwarding.
- if (Checks.size() > Candidates.size() * CheckPerElim) {
- LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
- return false;
- }
-
- if (LAI.getPSE().getUnionPredicate().getComplexity() >
- LoadElimSCEVCheckThreshold) {
- LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
- return false;
- }
-
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
- return false;
- }
-
- if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
- if (LAI.hasConvergentOp()) {
- LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
- "convergent calls\n");
- return false;
- }
-
- auto *HeaderBB = L->getHeader();
- auto *F = HeaderBB->getParent();
- bool OptForSize = F->hasOptSize() ||
- llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI,
- PGSOQueryType::IRPass);
- if (OptForSize) {
- LLVM_DEBUG(
- dbgs() << "Versioning is needed but not allowed when optimizing "
- "for size.\n");
- return false;
- }
-
- // Point of no-return, start the transformation. First, version the loop
- // if necessary.
-
+ << ". Valid store-to-load forwarding across the loop backedge\n");
+ }
+ if (Candidates.empty())
+ return false;
+
+ // Check intervening may-alias stores. These need runtime checks for alias
+ // disambiguation.
+ SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
+
+ // Too many checks are likely to outweigh the benefits of forwarding.
+ if (Checks.size() > Candidates.size() * CheckPerElim) {
+ LLVM_DEBUG(dbgs() << "Too many run-time checks needed.\n");
+ return false;
+ }
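      // Example: with the default -runtime-check-per-loop-load-elim=1,
      // forwarding two loads tolerates at most two runtime pointer checks
      // before the transformation is judged not worth the cost.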
+
+ if (LAI.getPSE().getUnionPredicate().getComplexity() >
+ LoadElimSCEVCheckThreshold) {
+ LLVM_DEBUG(dbgs() << "Too many SCEV run-time checks needed.\n");
+ return false;
+ }
+
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
+ return false;
+ }
+
+ if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
+ if (LAI.hasConvergentOp()) {
+ LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
+ "convergent calls\n");
+ return false;
+ }
+
+ auto *HeaderBB = L->getHeader();
+ auto *F = HeaderBB->getParent();
+ bool OptForSize = F->hasOptSize() ||
+ llvm::shouldOptimizeForSize(HeaderBB, PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (OptForSize) {
+ LLVM_DEBUG(
+ dbgs() << "Versioning is needed but not allowed when optimizing "
+ "for size.\n");
+ return false;
+ }
+
+ // Point of no-return, start the transformation. First, version the loop
+ // if necessary.
+
LoopVersioning LV(LAI, Checks, L, LI, DT, PSE.getSE());
- LV.versionLoop();
+ LV.versionLoop();
// After versioning, some of the candidates' pointers could stop being
// SCEVAddRecs. We need to filter them out.
@@ -576,163 +576,163 @@ public:
PSE.getSCEV(Cand.Store->getPointerOperand()));
};
llvm::erase_if(Candidates, NoLongerGoodCandidate);
- }
-
- // Next, propagate the value stored by the store to the users of the load.
- // Also for the first iteration, generate the initial value of the load.
- SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
- "storeforward");
- for (const auto &Cand : Candidates)
- propagateStoredValueToLoadUsers(Cand, SEE);
+ }
+
+ // Next, propagate the value stored by the store to the users of the load.
+ // Also for the first iteration, generate the initial value of the load.
+ SCEVExpander SEE(*PSE.getSE(), L->getHeader()->getModule()->getDataLayout(),
+ "storeforward");
+ for (const auto &Cand : Candidates)
+ propagateStoredValueToLoadUsers(Cand, SEE);
NumLoopLoadEliminted += Candidates.size();
-
- return true;
- }
-
-private:
- Loop *L;
-
- /// Maps the load/store instructions to their index according to
- /// program order.
- DenseMap<Instruction *, unsigned> InstOrder;
-
- // Analyses used.
- LoopInfo *LI;
- const LoopAccessInfo &LAI;
- DominatorTree *DT;
- BlockFrequencyInfo *BFI;
- ProfileSummaryInfo *PSI;
- PredicatedScalarEvolution PSE;
-};
-
-} // end anonymous namespace
-
-static bool
-eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+
+ return true;
+ }
+
+private:
+ Loop *L;
+
+ /// Maps the load/store instructions to their index according to
+ /// program order.
+ DenseMap<Instruction *, unsigned> InstOrder;
+
+ // Analyses used.
+ LoopInfo *LI;
+ const LoopAccessInfo &LAI;
+ DominatorTree *DT;
+ BlockFrequencyInfo *BFI;
+ ProfileSummaryInfo *PSI;
+ PredicatedScalarEvolution PSE;
+};
+
+} // end anonymous namespace
+
+static bool
+eliminateLoadsAcrossLoops(Function &F, LoopInfo &LI, DominatorTree &DT,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
ScalarEvolution *SE, AssumptionCache *AC,
- function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
- // Build up a worklist of inner-loops to transform to avoid iterator
- // invalidation.
- // FIXME: This logic comes from other passes that actually change the loop
- // nest structure. It isn't clear this is necessary (or useful) for a pass
- // which merely optimizes the use of loads in a loop.
- SmallVector<Loop *, 8> Worklist;
-
+ function_ref<const LoopAccessInfo &(Loop &)> GetLAI) {
+ // Build up a worklist of inner-loops to transform to avoid iterator
+ // invalidation.
+ // FIXME: This logic comes from other passes that actually change the loop
+ // nest structure. It isn't clear this is necessary (or useful) for a pass
+ // which merely optimizes the use of loads in a loop.
+ SmallVector<Loop *, 8> Worklist;
+
bool Changed = false;
- for (Loop *TopLevelLoop : LI)
+ for (Loop *TopLevelLoop : LI)
for (Loop *L : depth_first(TopLevelLoop)) {
Changed |= simplifyLoop(L, &DT, &LI, SE, AC, /*MSSAU*/ nullptr, false);
- // We only handle inner-most loops.
+ // We only handle inner-most loops.
if (L->isInnermost())
- Worklist.push_back(L);
+ Worklist.push_back(L);
}
-
- // Now walk the identified inner loops.
- for (Loop *L : Worklist) {
+
+ // Now walk the identified inner loops.
+ for (Loop *L : Worklist) {
// Match historical behavior
if (!L->isRotatedForm() || !L->getExitingBlock())
continue;
- // The actual work is performed by LoadEliminationForLoop.
- LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
- Changed |= LEL.processLoop();
- }
- return Changed;
-}
-
-namespace {
-
-/// The pass. Most of the work is delegated to the per-loop
-/// LoadEliminationForLoop class.
-class LoopLoadElimination : public FunctionPass {
-public:
- static char ID;
-
- LoopLoadElimination() : FunctionPass(ID) {
- initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
-
- // Process each loop nest in the function.
- return eliminateLoadsAcrossLoops(
+ // The actual work is performed by LoadEliminationForLoop.
+ LoadEliminationForLoop LEL(L, &LI, GetLAI(*L), &DT, BFI, PSI);
+ Changed |= LEL.processLoop();
+ }
+ return Changed;
+}
+
+namespace {
+
+/// The pass. Most of the work is delegated to the per-loop
+/// LoadEliminationForLoop class.
+class LoopLoadElimination : public FunctionPass {
+public:
+ static char ID;
+
+ LoopLoadElimination() : FunctionPass(ID) {
+ initializeLoopLoadEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &LAA = getAnalysis<LoopAccessLegacyAnalysis>();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
+ // Process each loop nest in the function.
+ return eliminateLoadsAcrossLoops(
F, LI, DT, BFI, PSI, /*SE*/ nullptr, /*AC*/ nullptr,
- [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopLoadElimination::ID;
-
-static const char LLE_name[] = "Loop Load Elimination";
-
-INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
-INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
-
-FunctionPass *llvm::createLoopLoadEliminationPass() {
- return new LoopLoadElimination();
-}
-
-PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- bool Changed = eliminateLoadsAcrossLoops(
+ [&LAA](Loop &L) -> const LoopAccessInfo & { return LAA.getInfo(&L); });
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopLoadElimination::ID;
+
+static const char LLE_name[] = "Loop Load Elimination";
+
+INITIALIZE_PASS_BEGIN(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LazyBlockFrequencyInfoPass)
+INITIALIZE_PASS_END(LoopLoadElimination, LLE_OPTION, LLE_name, false, false)
+
+FunctionPass *llvm::createLoopLoadEliminationPass() {
+ return new LoopLoadElimination();
+}
+
+PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ bool Changed = eliminateLoadsAcrossLoops(
F, LI, DT, BFI, PSI, &SE, &AC, [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, MSSA};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- });
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- return PA;
-}
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ });
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
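For reference, the new-pass-manager entry point above is an ordinary function
pass, so a minimal wiring sketch (an assumed usage example, not code from this
repository) would be:

#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LoopLoadElimination.h"

static void addLoopLoadElim(llvm::FunctionPassManager &FPM) {
  // LoopLoadEliminationPass::run(Function &, FunctionAnalysisManager &) is
  // defined above, so it composes like any other function pass.
  FPM.addPass(llvm::LoopLoadEliminationPass());
}

The legacy pass is created through createLoopLoadEliminationPass() and is
registered under the "loop-load-elim" option string (LLE_OPTION).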
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
index 18ab347d1b..3fe8e72591 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -1,11 +1,11 @@
-//===- LoopPassManager.cpp - Loop pass management -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopPassManager.cpp - Loop pass management -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
@@ -14,22 +14,22 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Support/TimeProfiler.h"
-
-using namespace llvm;
-
-namespace llvm {
-
-/// Explicitly specialize the pass manager's run method to handle loop nest
-/// structure updates.
-PreservedAnalyses
-PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
- LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR, LPMUpdater &U) {
-
- if (DebugLogging)
- dbgs() << "Starting Loop pass manager run.\n";
-
+#include "llvm/Support/TimeProfiler.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+/// Explicitly specialize the pass manager's run method to handle loop nest
+/// structure updates.
+PreservedAnalyses
+PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
+ LPMUpdater &>::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR, LPMUpdater &U) {
+
+ if (DebugLogging)
+ dbgs() << "Starting Loop pass manager run.\n";
+
// Runs loop-nest passes only when the current loop is a top-level one.
PreservedAnalyses PA = (L.isOutermost() && !LoopNestPasses.empty())
? runWithLoopNestPasses(L, AM, AR, U)
@@ -59,12 +59,12 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
"Loop-nest passes should only run on top-level loops.");
PreservedAnalyses PA = PreservedAnalyses::all();
- // Request PassInstrumentation from the analysis manager; we will use it to
- // run the instrumenting callbacks for the passes later.
- PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
-
+ // Request PassInstrumentation from the analysis manager; we will use it to
+ // run the instrumenting callbacks for the passes later.
+ PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
+
unsigned LoopPassIndex = 0, LoopNestPassIndex = 0;
-
+
// `LoopNestPtr` points to the `LoopNest` object for the current top-level
// loop and `IsLoopNestPtrValid` indicates whether the pointer is still valid.
// The `LoopNest` object will have to be re-constructed if the pointer is
@@ -89,24 +89,24 @@ LoopPassManager::runWithLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
IsLoopNestPtrValid = true;
}
PassPA = runSinglePass(*LoopNestPtr, Pass, AM, AR, U, PI);
- }
-
+ }
+
   // `PassPA` being `None` means that the before-pass callbacks in
   // `PassInstrumentation` returned false. The pass does not run in this case,
// so we can skip the following procedure.
if (!PassPA)
continue;
-
- // If the loop was deleted, abort the run and return to the outer walk.
- if (U.skipCurrentLoop()) {
+
+ // If the loop was deleted, abort the run and return to the outer walk.
+ if (U.skipCurrentLoop()) {
PA.intersect(std::move(*PassPA));
- break;
- }
-
+ break;
+ }
+
// Update the analysis manager as each pass runs and potentially
// invalidates analyses.
AM.invalidate(L, *PassPA);
-
+
// Finally, we intersect the final preserved analyses to compute the
// aggregate preserved set for this pass manager.
PA.intersect(std::move(*PassPA));
@@ -150,24 +150,24 @@ LoopPassManager::runWithoutLoopNestPasses(Loop &L, LoopAnalysisManager &AM,
break;
}
- // Update the analysis manager as each pass runs and potentially
- // invalidates analyses.
+ // Update the analysis manager as each pass runs and potentially
+ // invalidates analyses.
AM.invalidate(L, *PassPA);
-
- // Finally, we intersect the final preserved analyses to compute the
- // aggregate preserved set for this pass manager.
+
+ // Finally, we intersect the final preserved analyses to compute the
+ // aggregate preserved set for this pass manager.
PA.intersect(std::move(*PassPA));
-
- // FIXME: Historically, the pass managers all called the LLVM context's
- // yield function here. We don't have a generic way to acquire the
- // context and it isn't yet clear what the right pattern is for yielding
- // in the new pass manager so it is currently omitted.
- // ...getContext().yield();
- }
+
+ // FIXME: Historically, the pass managers all called the LLVM context's
+ // yield function here. We don't have a generic way to acquire the
+ // context and it isn't yet clear what the right pattern is for yielding
+ // in the new pass manager so it is currently omitted.
+ // ...getContext().yield();
+ }
return PA;
}
} // namespace llvm
-
+
PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
FunctionAnalysisManager &AM) {
// Before we even compute any loop analyses, first run a miniature function
@@ -175,7 +175,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
// directly build up function analyses after this as the function pass
// manager handles all the invalidation at that layer.
PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(F);
-
+
PreservedAnalyses PA = PreservedAnalyses::all();
// Check the PassInstrumentation's BeforePass callbacks before running the
// canonicalization pipeline.
@@ -183,7 +183,7 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
PA = LoopCanonicalizationFPM.run(F, AM);
PI.runAfterPass<Function>(LoopCanonicalizationFPM, F, PA);
}
-
+
// Get the loop structure for this function
LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
@@ -320,16 +320,16 @@ PreservedAnalyses FunctionToLoopPassAdaptor::run(Function &F,
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
PA.preserve<SCEVAA>();
- return PA;
-}
-
-PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
-PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
- : OS(OS), Banner(Banner) {}
-
-PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
- LoopStandardAnalysisResults &,
- LPMUpdater &) {
- printLoop(L, OS, Banner);
- return PreservedAnalyses::all();
-}
+ return PA;
+}
+
+PrintLoopPass::PrintLoopPass() : OS(dbgs()) {}
+PrintLoopPass::PrintLoopPass(raw_ostream &OS, const std::string &Banner)
+ : OS(OS), Banner(Banner) {}
+
+PreservedAnalyses PrintLoopPass::run(Loop &L, LoopAnalysisManager &,
+ LoopStandardAnalysisResults &,
+ LPMUpdater &) {
+ printLoop(L, OS, Banner);
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
index 34f5868699..4f97641e20 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopPredication.cpp
@@ -1,1246 +1,1246 @@
-//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LoopPredication pass tries to convert loop variant range checks to loop
-// invariant by widening checks across loop iterations. For example, it will
-// convert
-//
-// for (i = 0; i < n; i++) {
-// guard(i < len);
-// ...
-// }
-//
-// to
-//
-// for (i = 0; i < n; i++) {
-// guard(n - 1 < len);
-// ...
-// }
-//
-// After this transformation the condition of the guard is loop invariant, so
-// loop-unswitch can later unswitch the loop by this condition which basically
-// predicates the loop by the widened condition:
-//
-// if (n - 1 < len)
-// for (i = 0; i < n; i++) {
-// ...
-// }
-// else
-// deoptimize
-//
-// It's tempting to rely on SCEV here, but it has proven to be problematic.
-// Generally the facts SCEV provides about the increment step of add
-// recurrences are true if the backedge of the loop is taken, which implicitly
-// assumes that the guard doesn't fail. Using these facts to optimize the
-// guard results in circular logic where the guard is optimized under the
-// assumption that it never fails.
-//
-// For example, in the loop below the induction variable will be marked as nuw
-// based on the guard. Based on nuw, the guard predicate will be considered
-// monotonic. Given a monotonic condition it's tempting to replace the induction
-// variable in the condition with its value on the last iteration. But this
-// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
-//
-// for (int i = b; i != e; i++)
-// guard(i u< len)
-//
-// One of the ways to reason about this problem is to use an inductive proof
-// approach. Given the loop:
-//
-// if (B(0)) {
-// do {
-// I = PHI(0, I.INC)
-// I.INC = I + Step
-// guard(G(I));
-// } while (B(I));
-// }
-//
-// where B(x) and G(x) are predicates that map integers to booleans, we want a
-// loop invariant expression M such that the following program has the same semantics
-// as the above:
-//
-// if (B(0)) {
-// do {
-// I = PHI(0, I.INC)
-// I.INC = I + Step
-// guard(G(0) && M);
-// } while (B(I));
-// }
-//
-// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
-//
-// Informal proof that the transformation above is correct:
-//
-// By the definition of guards we can rewrite the guard condition to:
-// G(I) && G(0) && M
-//
-// Let's prove that for each iteration of the loop:
-// G(0) && M => G(I)
-// And the condition above can be simplified to G(Start) && M.
-//
-// Induction base.
-// G(0) && M => G(0)
-//
-// Induction step. Assuming G(0) && M => G(I) on the subsequent
-// iteration:
-//
-// B(I) is true because it's the backedge condition.
-// G(I) is true because the backedge is guarded by this condition.
-//
-// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
-//
-// Note that we can use anything stronger than M, i.e. any condition which
-// implies M.
-//
-// When S = 1 (i.e. forward iterating loop), the transformation is supported
-// when:
-// * The loop has a single latch with the condition of the form:
-// B(X) = latchStart + X <pred> latchLimit,
-// where <pred> is u<, u<=, s<, or s<=.
-// * The guard condition is of the form
-// G(X) = guardStart + X u< guardLimit
-//
-// For the ult latch comparison case M is:
-// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit =>
-// guardStart + X + 1 u< guardLimit
-//
-// The only way the antecedent can be true and the consequent can be false is
-// if
-// X == guardLimit - 1 - guardStart
-// (and guardLimit is non-zero, but we won't use this latter fact).
-// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
-// latchStart + guardLimit - 1 - guardStart u< latchLimit
-// and its negation is
-// latchStart + guardLimit - 1 - guardStart u>= latchLimit
-//
-// In other words, if
-// latchLimit u<= latchStart + guardLimit - 1 - guardStart
-// then:
-// (the ranges below are written in ConstantRange notation, where [A, B) is the
-// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
-//
-// forall X . guardStart + X u< guardLimit &&
-// latchStart + X u< latchLimit =>
-// guardStart + X + 1 u< guardLimit
-// == forall X . guardStart + X u< guardLimit &&
-// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
-// guardStart + X + 1 u< guardLimit
-// == forall X . (guardStart + X) in [0, guardLimit) &&
-// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
-// (guardStart + X + 1) in [0, guardLimit)
-// == forall X . X in [-guardStart, guardLimit - guardStart) &&
-// X in [-latchStart, guardLimit - 1 - guardStart) =>
-// X in [-guardStart - 1, guardLimit - guardStart - 1)
-// == true
-//
-// So the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart u>= latchLimit
-// Similarly for ule condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart u> latchLimit
-// For slt condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart s>= latchLimit
-// For sle condition the widened condition is:
-// guardStart u< guardLimit &&
-// latchStart + guardLimit - 1 - guardStart s> latchLimit
-//
-// When S = -1 (i.e. reverse iterating loop), the transformation is supported
-// when:
-// * The loop has a single latch with the condition of the form:
-// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
-// * The guard condition is of the form
-// G(X) = X - 1 u< guardLimit
-//
-// For the ugt latch comparison case M is:
-// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
-//
-// The only way the antecedent can be true and the consequent can be false is if
-// X == 1.
-// If X == 1 then the second half of the antecedent is
-// 1 u> latchLimit, and its negation is latchLimit u>= 1.
-//
-// So the widened condition is:
-// guardStart u< guardLimit && latchLimit u>= 1.
-// Similarly for sgt condition the widened condition is:
-// guardStart u< guardLimit && latchLimit s>= 1.
-// For uge condition the widened condition is:
-// guardStart u< guardLimit && latchLimit u> 1.
-// For sge condition the widened condition is:
-// guardStart u< guardLimit && latchLimit s> 1.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopPredication.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-#define DEBUG_TYPE "loop-predication"
-
-STATISTIC(TotalConsidered, "Number of guards considered");
-STATISTIC(TotalWidened, "Number of checks widened");
-
-using namespace llvm;
-
-static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
- cl::Hidden, cl::init(true));
-
-static cl::opt<bool>
- SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
- cl::Hidden, cl::init(false));
-
-// This is the scale factor for the latch probability. We use this during
-// profitability analysis to find other exiting blocks that have a much higher
-// probability of exiting the loop instead of loop exiting via latch.
-// This value should be greater than 1 for a sane profitability check.
-static cl::opt<float> LatchExitProbabilityScale(
- "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
- cl::desc("scale factor for the latch probability. Value should be greater "
- "than 1. Lower values are ignored"));
-
-static cl::opt<bool> PredicateWidenableBranchGuards(
- "loop-predication-predicate-widenable-branches-to-deopt", cl::Hidden,
- cl::desc("Whether or not we should predicate guards "
- "expressed as widenable branches to deoptimize blocks"),
- cl::init(true));
-
-namespace {
-/// Represents an induction variable check:
-/// icmp Pred, <induction variable>, <loop invariant limit>
-struct LoopICmp {
- ICmpInst::Predicate Pred;
- const SCEVAddRecExpr *IV;
- const SCEV *Limit;
- LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
- const SCEV *Limit)
- : Pred(Pred), IV(IV), Limit(Limit) {}
- LoopICmp() {}
- void dump() {
- dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
- << ", Limit = " << *Limit << "\n";
- }
-};
-
-class LoopPredication {
- AliasAnalysis *AA;
- DominatorTree *DT;
- ScalarEvolution *SE;
- LoopInfo *LI;
- BranchProbabilityInfo *BPI;
-
- Loop *L;
- const DataLayout *DL;
- BasicBlock *Preheader;
- LoopICmp LatchCheck;
-
- bool isSupportedStep(const SCEV* Step);
- Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
- Optional<LoopICmp> parseLoopLatchICmp();
-
- /// Return an insertion point suitable for inserting a safe to speculate
- /// instruction whose only user will be 'User' which has operands 'Ops'. A
-  /// trivial result would be at the User itself, but we try to return a
- /// loop invariant location if possible.
- Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
- /// Same as above, *except* that this uses the SCEV definition of invariant
- /// which is that an expression *can be made* invariant via SCEVExpander.
-  /// Thus, this version is only suitable for finding an insert point to be
- /// passed to SCEVExpander!
- Instruction *findInsertPt(Instruction *User, ArrayRef<const SCEV*> Ops);
-
- /// Return true if the value is known to produce a single fixed value across
- /// all iterations on which it executes. Note that this does not imply
- /// speculation safety. That must be established separately.
- bool isLoopInvariantValue(const SCEV* S);
-
- Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
- ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS);
-
- Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
- Instruction *Guard);
- Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
- LoopICmp RangeCheck,
- SCEVExpander &Expander,
- Instruction *Guard);
- Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
- LoopICmp RangeCheck,
- SCEVExpander &Expander,
- Instruction *Guard);
- unsigned collectChecks(SmallVectorImpl<Value *> &Checks, Value *Condition,
- SCEVExpander &Expander, Instruction *Guard);
- bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
- bool widenWidenableBranchGuardConditions(BranchInst *Guard, SCEVExpander &Expander);
- // If the loop always exits through another block in the loop, we should not
- // predicate based on the latch check. For example, the latch check can be a
- // very coarse grained check and there can be more fine grained exit checks
- // within the loop. We identify such unprofitable loops through BPI.
- bool isLoopProfitableToPredicate();
-
- bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
-
-public:
- LoopPredication(AliasAnalysis *AA, DominatorTree *DT,
- ScalarEvolution *SE, LoopInfo *LI,
- BranchProbabilityInfo *BPI)
- : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {};
- bool runOnLoop(Loop *L);
-};
-
-class LoopPredicationLegacyPass : public LoopPass {
-public:
- static char ID;
- LoopPredicationLegacyPass() : LoopPass(ID) {
- initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<BranchProbabilityInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- BranchProbabilityInfo &BPI =
- getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- LoopPredication LP(AA, DT, SE, LI, &BPI);
- return LP.runOnLoop(L);
- }
-};
-
-char LoopPredicationLegacyPass::ID = 0;
-} // end namespace
-
-INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
- "Loop predication", false, false)
-INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
- "Loop predication", false, false)
-
-Pass *llvm::createLoopPredicationPass() {
- return new LoopPredicationLegacyPass();
-}
-
-PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- Function *F = L.getHeader()->getParent();
- // For the new PM, we also can't use BranchProbabilityInfo as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but BPI is not preserved, hence a newly built one is needed.
+//===-- LoopPredication.cpp - Guard based loop predication pass -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LoopPredication pass tries to convert loop variant range checks to loop
+// invariant by widening checks across loop iterations. For example, it will
+// convert
+//
+// for (i = 0; i < n; i++) {
+// guard(i < len);
+// ...
+// }
+//
+// to
+//
+// for (i = 0; i < n; i++) {
+// guard(n - 1 < len);
+// ...
+// }
+//
+// After this transformation the condition of the guard is loop invariant, so
+// loop-unswitch can later unswitch the loop by this condition which basically
+// predicates the loop by the widened condition:
+//
+// if (n - 1 < len)
+// for (i = 0; i < n; i++) {
+// ...
+// }
+// else
+// deoptimize
+//
+// It's tempting to rely on SCEV here, but it has proven to be problematic.
+// Generally the facts SCEV provides about the increment step of add
+// recurrences are true if the backedge of the loop is taken, which implicitly
+// assumes that the guard doesn't fail. Using these facts to optimize the
+// guard results in a circular logic where the guard is optimized under the
+// assumption that it never fails.
+//
+// For example, in the loop below the induction variable will be marked as nuw
+// based on the guard. Based on nuw, the guard predicate will be considered
+// monotonic. Given a monotonic condition it's tempting to replace the induction
+// variable in the condition with its value on the last iteration. But this
+// transformation is not correct, e.g. e = 4, b = 5 breaks the loop.
+//
+// for (int i = b; i != e; i++)
+// guard(i u< len)
+//
+// One of the ways to reason about this problem is to use an inductive proof
+// approach. Given the loop:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(I));
+// } while (B(I));
+// }
+//
+// where B(x) and G(x) are predicates that map integers to booleans, we want a
+// loop invariant expression M such that the following program has the same semantics
+// as the above:
+//
+// if (B(0)) {
+// do {
+// I = PHI(0, I.INC)
+// I.INC = I + Step
+// guard(G(0) && M);
+// } while (B(I));
+// }
+//
+// One solution for M is M = forall X . (G(X) && B(X)) => G(X + Step)
+//
+// Informal proof that the transformation above is correct:
+//
+// By the definition of guards we can rewrite the guard condition to:
+// G(I) && G(0) && M
+//
+// Let's prove that for each iteration of the loop:
+// G(0) && M => G(I)
+// And the condition above can be simplified to G(Start) && M.
+//
+// Induction base.
+// G(0) && M => G(0)
+//
+// Induction step. Assuming G(0) && M => G(I) on the subsequent
+// iteration:
+//
+// B(I) is true because it's the backedge condition.
+// G(I) is true because the backedge is guarded by this condition.
+//
+// So M = forall X . (G(X) && B(X)) => G(X + Step) implies G(I + Step).
+//
+// Note that we can use anything stronger than M, i.e. any condition which
+// implies M.
+//
+// When S = 1 (i.e. forward iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = latchStart + X <pred> latchLimit,
+// where <pred> is u<, u<=, s<, or s<=.
+// * The guard condition is of the form
+// G(X) = guardStart + X u< guardLimit
+//
+// For the ult latch comparison case M is:
+// forall X . guardStart + X u< guardLimit && latchStart + X <u latchLimit =>
+// guardStart + X + 1 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is
+// if
+// X == guardLimit - 1 - guardStart
+// (and guardLimit is non-zero, but we won't use this latter fact).
+// If X == guardLimit - 1 - guardStart then the second half of the antecedent is
+// latchStart + guardLimit - 1 - guardStart u< latchLimit
+// and its negation is
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+//
+// In other words, if
+// latchLimit u<= latchStart + guardLimit - 1 - guardStart
+// then:
+// (the ranges below are written in ConstantRange notation, where [A, B) is the
+// set for (I = A; I != B; I++ /*maywrap*/) yield(I);)
+//
+// forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchLimit =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . guardStart + X u< guardLimit &&
+// latchStart + X u< latchStart + guardLimit - 1 - guardStart =>
+// guardStart + X + 1 u< guardLimit
+// == forall X . (guardStart + X) in [0, guardLimit) &&
+// (latchStart + X) in [0, latchStart + guardLimit - 1 - guardStart) =>
+// (guardStart + X + 1) in [0, guardLimit)
+// == forall X . X in [-guardStart, guardLimit - guardStart) &&
+// X in [-latchStart, guardLimit - 1 - guardStart) =>
+// X in [-guardStart - 1, guardLimit - guardStart - 1)
+// == true
+//
+// So the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u>= latchLimit
+// Similarly for ule condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart u> latchLimit
+// For slt condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s>= latchLimit
+// For sle condition the widened condition is:
+// guardStart u< guardLimit &&
+// latchStart + guardLimit - 1 - guardStart s> latchLimit
+//
+// When S = -1 (i.e. reverse iterating loop), the transformation is supported
+// when:
+// * The loop has a single latch with the condition of the form:
+// B(X) = X <pred> latchLimit, where <pred> is u>, u>=, s>, or s>=.
+// * The guard condition is of the form
+// G(X) = X - 1 u< guardLimit
+//
+// For the ugt latch comparison case M is:
+// forall X. X-1 u< guardLimit and X u> latchLimit => X-2 u< guardLimit
+//
+// The only way the antecedent can be true and the consequent can be false is if
+// X == 1.
+// If X == 1 then the second half of the antecedent is
+// 1 u> latchLimit, and its negation is latchLimit u>= 1.
+//
+// So the widened condition is:
+// guardStart u< guardLimit && latchLimit u>= 1.
+// Similarly for sgt condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s>= 1.
+// For uge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit u> 1.
+// For sge condition the widened condition is:
+// guardStart u< guardLimit && latchLimit s> 1.
+//===----------------------------------------------------------------------===//
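//
// A worked instance of the ult case above (a sketch, assuming the rotated
// loop's latch tests the incremented IV, i.e. B(X) = 1 + X u< n, so
// latchStart = 1 and latchLimit = n, while the guard gives guardStart = 0
// and guardLimit = len):
//
//   widened = guardStart u< guardLimit &&
//             latchStart + guardLimit - 1 - guardStart u>= latchLimit
//           = 0 u< len && 1 + len - 1 - 0 u>= n
//           = 0 u< len && len u>= n
//
// For n > 0 (the only case in which the guard executes) this is equivalent
// to the loop-invariant guard(n - 1 < len) from the introductory example.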
+
+#include "llvm/Transforms/Scalar/LoopPredication.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+#define DEBUG_TYPE "loop-predication"
+
+STATISTIC(TotalConsidered, "Number of guards considered");
+STATISTIC(TotalWidened, "Number of checks widened");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableIVTruncation("loop-predication-enable-iv-truncation",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool> EnableCountDownLoop("loop-predication-enable-count-down-loop",
+ cl::Hidden, cl::init(true));
+
+static cl::opt<bool>
+ SkipProfitabilityChecks("loop-predication-skip-profitability-checks",
+ cl::Hidden, cl::init(false));
+
+// This is the scale factor for the latch probability. We use this during
+// profitability analysis to find other exiting blocks that have a much higher
+// probability of exiting the loop instead of loop exiting via latch.
+// This value should be greater than 1 for a sane profitability check.
+static cl::opt<float> LatchExitProbabilityScale(
+ "loop-predication-latch-probability-scale", cl::Hidden, cl::init(2.0),
+ cl::desc("scale factor for the latch probability. Value should be greater "
+ "than 1. Lower values are ignored"));
+
+static cl::opt<bool> PredicateWidenableBranchGuards(
+ "loop-predication-predicate-widenable-branches-to-deopt", cl::Hidden,
+ cl::desc("Whether or not we should predicate guards "
+ "expressed as widenable branches to deoptimize blocks"),
+ cl::init(true));
+
+namespace {
+/// Represents an induction variable check:
+/// icmp Pred, <induction variable>, <loop invariant limit>
+struct LoopICmp {
+ ICmpInst::Predicate Pred;
+ const SCEVAddRecExpr *IV;
+ const SCEV *Limit;
+ LoopICmp(ICmpInst::Predicate Pred, const SCEVAddRecExpr *IV,
+ const SCEV *Limit)
+ : Pred(Pred), IV(IV), Limit(Limit) {}
+ LoopICmp() {}
+ void dump() {
+ dbgs() << "LoopICmp Pred = " << Pred << ", IV = " << *IV
+ << ", Limit = " << *Limit << "\n";
+ }
+};
+
+class LoopPredication {
+ AliasAnalysis *AA;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ BranchProbabilityInfo *BPI;
+
+ Loop *L;
+ const DataLayout *DL;
+ BasicBlock *Preheader;
+ LoopICmp LatchCheck;
+
+ bool isSupportedStep(const SCEV* Step);
+ Optional<LoopICmp> parseLoopICmp(ICmpInst *ICI);
+ Optional<LoopICmp> parseLoopLatchICmp();
+
+ /// Return an insertion point suitable for inserting a safe to speculate
+ /// instruction whose only user will be 'User' which has operands 'Ops'. A
+  /// trivial result would be at the User itself, but we try to return a
+ /// loop invariant location if possible.
+ Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
+ /// Same as above, *except* that this uses the SCEV definition of invariant
+ /// which is that an expression *can be made* invariant via SCEVExpander.
+  /// Thus, this version is only suitable for finding an insert point to be
+ /// passed to SCEVExpander!
+ Instruction *findInsertPt(Instruction *User, ArrayRef<const SCEV*> Ops);
+
+ /// Return true if the value is known to produce a single fixed value across
+ /// all iterations on which it executes. Note that this does not imply
+ /// speculation safety. That must be established separately.
+ bool isLoopInvariantValue(const SCEV* S);
+
+ Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS);
+
+ Optional<Value *> widenICmpRangeCheck(ICmpInst *ICI, SCEVExpander &Expander,
+ Instruction *Guard);
+ Optional<Value *> widenICmpRangeCheckIncrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ Instruction *Guard);
+ Optional<Value *> widenICmpRangeCheckDecrementingLoop(LoopICmp LatchCheck,
+ LoopICmp RangeCheck,
+ SCEVExpander &Expander,
+ Instruction *Guard);
+ unsigned collectChecks(SmallVectorImpl<Value *> &Checks, Value *Condition,
+ SCEVExpander &Expander, Instruction *Guard);
+ bool widenGuardConditions(IntrinsicInst *II, SCEVExpander &Expander);
+ bool widenWidenableBranchGuardConditions(BranchInst *Guard, SCEVExpander &Expander);
+ // If the loop always exits through another block in the loop, we should not
+ // predicate based on the latch check. For example, the latch check can be a
+ // very coarse grained check and there can be more fine grained exit checks
+ // within the loop. We identify such unprofitable loops through BPI.
+ bool isLoopProfitableToPredicate();
+
+ bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
+
+public:
+ LoopPredication(AliasAnalysis *AA, DominatorTree *DT,
+ ScalarEvolution *SE, LoopInfo *LI,
+ BranchProbabilityInfo *BPI)
+ : AA(AA), DT(DT), SE(SE), LI(LI), BPI(BPI) {};
+ bool runOnLoop(Loop *L);
+};
+
+class LoopPredicationLegacyPass : public LoopPass {
+public:
+ static char ID;
+ LoopPredicationLegacyPass() : LoopPass(ID) {
+ initializeLoopPredicationLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ BranchProbabilityInfo &BPI =
+ getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LoopPredication LP(AA, DT, SE, LI, &BPI);
+ return LP.runOnLoop(L);
+ }
+};
+
+char LoopPredicationLegacyPass::ID = 0;
+} // end namespace
+
+INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_END(LoopPredicationLegacyPass, "loop-predication",
+ "Loop predication", false, false)
+
+Pass *llvm::createLoopPredicationPass() {
+ return new LoopPredicationLegacyPass();
+}
+
+PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function *F = L.getHeader()->getParent();
+ // For the new PM, we also can't use BranchProbabilityInfo as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but BPI is not preserved, hence a newly built one is needed.
BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI, &AR.DT, nullptr);
- LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
- if (!LP.runOnLoop(&L))
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-Optional<LoopICmp>
-LoopPredication::parseLoopICmp(ICmpInst *ICI) {
- auto Pred = ICI->getPredicate();
- auto *LHS = ICI->getOperand(0);
- auto *RHS = ICI->getOperand(1);
-
- const SCEV *LHSS = SE->getSCEV(LHS);
- if (isa<SCEVCouldNotCompute>(LHSS))
- return None;
- const SCEV *RHSS = SE->getSCEV(RHS);
- if (isa<SCEVCouldNotCompute>(RHSS))
- return None;
-
- // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV
- if (SE->isLoopInvariant(LHSS, L)) {
- std::swap(LHS, RHS);
- std::swap(LHSS, RHSS);
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
- if (!AR || AR->getLoop() != L)
- return None;
-
- return LoopICmp(Pred, AR, RHSS);
-}
-
-Value *LoopPredication::expandCheck(SCEVExpander &Expander,
- Instruction *Guard,
- ICmpInst::Predicate Pred, const SCEV *LHS,
- const SCEV *RHS) {
- Type *Ty = LHS->getType();
- assert(Ty == RHS->getType() && "expandCheck operands have different types?");
-
- if (SE->isLoopInvariant(LHS, L) && SE->isLoopInvariant(RHS, L)) {
- IRBuilder<> Builder(Guard);
- if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
- return Builder.getTrue();
- if (SE->isLoopEntryGuardedByCond(L, ICmpInst::getInversePredicate(Pred),
- LHS, RHS))
- return Builder.getFalse();
- }
-
- Value *LHSV = Expander.expandCodeFor(LHS, Ty, findInsertPt(Guard, {LHS}));
- Value *RHSV = Expander.expandCodeFor(RHS, Ty, findInsertPt(Guard, {RHS}));
- IRBuilder<> Builder(findInsertPt(Guard, {LHSV, RHSV}));
- return Builder.CreateICmp(Pred, LHSV, RHSV);
-}
-
-
-// Returns true if it's safe to truncate the IV to RangeCheckType.
-// When the IV type is wider than the range operand type, we can still do loop
-// predication, by generating SCEVs for the range and latch that are of the
-// same type. We achieve this by generating a SCEV truncate expression for the
-// latch IV. This is done iff truncation of the IV is a safe operation,
-// without loss of information.
-// Another way to achieve this is by generating a wider type SCEV for the
-// range check operand, however, this needs a more involved check that
-// operands do not overflow. This can lead to loss of information when the
-// range operand is of the form: add i32 %offset, %iv. We need to prove that
-// sext(x + y) is same as sext(x) + sext(y).
-// This function returns true if we can safely represent the IV type in
-// the RangeCheckType without loss of information.
-static bool isSafeToTruncateWideIVType(const DataLayout &DL,
- ScalarEvolution &SE,
- const LoopICmp LatchCheck,
- Type *RangeCheckType) {
- if (!EnableIVTruncation)
- return false;
+ LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
+ if (!LP.runOnLoop(&L))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+Optional<LoopICmp>
+LoopPredication::parseLoopICmp(ICmpInst *ICI) {
+ auto Pred = ICI->getPredicate();
+ auto *LHS = ICI->getOperand(0);
+ auto *RHS = ICI->getOperand(1);
+
+ const SCEV *LHSS = SE->getSCEV(LHS);
+ if (isa<SCEVCouldNotCompute>(LHSS))
+ return None;
+ const SCEV *RHSS = SE->getSCEV(RHS);
+ if (isa<SCEVCouldNotCompute>(RHSS))
+ return None;
+
+ // Canonicalize RHS to be loop invariant bound, LHS - a loop computable IV
+ if (SE->isLoopInvariant(LHSS, L)) {
+ std::swap(LHS, RHS);
+ std::swap(LHSS, RHSS);
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHSS);
+ if (!AR || AR->getLoop() != L)
+ return None;
+
+ return LoopICmp(Pred, AR, RHSS);
+}
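// A minimal sketch of the canonicalization above (value names and the start
// value are hypothetical): for a guard condition `icmp ugt %len, %i`, where
// %len is loop invariant and %i is a step-one IV starting at 0, the loop
// invariant bound is on the LHS, so the operands are swapped and the
// predicate flipped via getSwappedPredicate, yielding
//   LoopICmp{ Pred = ICMP_ULT, IV = {0,+,1}<%loop>, Limit = SCEV(%len) }
// i.e. the canonical "IV <pred> invariant limit" form the rest of the pass
// works with.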
+
+Value *LoopPredication::expandCheck(SCEVExpander &Expander,
+ Instruction *Guard,
+ ICmpInst::Predicate Pred, const SCEV *LHS,
+ const SCEV *RHS) {
+ Type *Ty = LHS->getType();
+ assert(Ty == RHS->getType() && "expandCheck operands have different types?");
+
+ if (SE->isLoopInvariant(LHS, L) && SE->isLoopInvariant(RHS, L)) {
+ IRBuilder<> Builder(Guard);
+ if (SE->isLoopEntryGuardedByCond(L, Pred, LHS, RHS))
+ return Builder.getTrue();
+ if (SE->isLoopEntryGuardedByCond(L, ICmpInst::getInversePredicate(Pred),
+ LHS, RHS))
+ return Builder.getFalse();
+ }
+
+ Value *LHSV = Expander.expandCodeFor(LHS, Ty, findInsertPt(Guard, {LHS}));
+ Value *RHSV = Expander.expandCodeFor(RHS, Ty, findInsertPt(Guard, {RHS}));
+ IRBuilder<> Builder(findInsertPt(Guard, {LHSV, RHSV}));
+ return Builder.CreateICmp(Pred, LHSV, RHSV);
+}
+
+
+// Returns true if it's safe to truncate the IV to RangeCheckType.
+// When the IV type is wider than the range operand type, we can still do loop
+// predication, by generating SCEVs for the range and latch that are of the
+// same type. We achieve this by generating a SCEV truncate expression for the
+// latch IV. This is done iff truncation of the IV is a safe operation,
+// without loss of information.
+// Another way to achieve this is by generating a wider type SCEV for the
+// range check operand, however, this needs a more involved check that
+// operands do not overflow. This can lead to loss of information when the
+// range operand is of the form: add i32 %offset, %iv. We need to prove that
+// sext(x + y) is same as sext(x) + sext(y).
+// This function returns true if we can safely represent the IV type in
+// the RangeCheckType without loss of information.
+static bool isSafeToTruncateWideIVType(const DataLayout &DL,
+ ScalarEvolution &SE,
+ const LoopICmp LatchCheck,
+ Type *RangeCheckType) {
+ if (!EnableIVTruncation)
+ return false;
assert(DL.getTypeSizeInBits(LatchCheck.IV->getType()).getFixedSize() >
DL.getTypeSizeInBits(RangeCheckType).getFixedSize() &&
- "Expected latch check IV type to be larger than range check operand "
- "type!");
- // The start and end values of the IV should be known. This is to guarantee
- // that truncating the wide type will not lose information.
- auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
- auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
- if (!Limit || !Start)
- return false;
- // This check makes sure that the IV does not change sign during loop
- // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
- // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
- // IV wraps around, and the truncation of the IV would lose the range of
- // iterations between 2^32 and 2^64.
+ "Expected latch check IV type to be larger than range check operand "
+ "type!");
+ // The start and end values of the IV should be known. This is to guarantee
+ // that truncating the wide type will not lose information.
+ auto *Limit = dyn_cast<SCEVConstant>(LatchCheck.Limit);
+ auto *Start = dyn_cast<SCEVConstant>(LatchCheck.IV->getStart());
+ if (!Limit || !Start)
+ return false;
+ // This check makes sure that the IV does not change sign during loop
+ // iterations. Consider latchType = i64, LatchStart = 5, Pred = ICMP_SGE,
+ // LatchEnd = 2, rangeCheckType = i32. If it's not a monotonic predicate, the
+ // IV wraps around, and the truncation of the IV would lose the range of
+ // iterations between 2^32 and 2^64.
if (!SE.getMonotonicPredicateType(LatchCheck.IV, LatchCheck.Pred))
- return false;
- // The active bits should be less than the bits in the RangeCheckType. This
- // guarantees that truncating the latch check to RangeCheckType is a safe
- // operation.
+ return false;
+ // The active bits should be less than the bits in the RangeCheckType. This
+ // guarantees that truncating the latch check to RangeCheckType is a safe
+ // operation.
auto RangeCheckTypeBitSize =
DL.getTypeSizeInBits(RangeCheckType).getFixedSize();
- return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
- Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
-}
-
-
-// Return a LoopICmp describing a latch check equivalent to LatchCheck but with
-// the requested type if safe to do so. May involve the use of a new IV.
-static Optional<LoopICmp> generateLoopLatchCheck(const DataLayout &DL,
- ScalarEvolution &SE,
- const LoopICmp LatchCheck,
- Type *RangeCheckType) {
-
- auto *LatchType = LatchCheck.IV->getType();
- if (RangeCheckType == LatchType)
- return LatchCheck;
- // For now, bail out if latch type is narrower than range type.
+ return Start->getAPInt().getActiveBits() < RangeCheckTypeBitSize &&
+ Limit->getAPInt().getActiveBits() < RangeCheckTypeBitSize;
+}
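// A numeric sketch of the active-bits check above (the constants are
// hypothetical): with an i64 latch IV starting at 0 and a constant latch
// limit of 1000, both constants need fewer than 32 active bits, so the latch
// check can be truncated to match an i32 range check. A limit such as 2^40
// needs more than 32 active bits, the truncation would be lossy, and the
// function returns false.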
+
+
+// Return a LoopICmp describing a latch check equivalent to LatchCheck but with
+// the requested type if safe to do so. May involve the use of a new IV.
+static Optional<LoopICmp> generateLoopLatchCheck(const DataLayout &DL,
+ ScalarEvolution &SE,
+ const LoopICmp LatchCheck,
+ Type *RangeCheckType) {
+
+ auto *LatchType = LatchCheck.IV->getType();
+ if (RangeCheckType == LatchType)
+ return LatchCheck;
+ // For now, bail out if latch type is narrower than range type.
if (DL.getTypeSizeInBits(LatchType).getFixedSize() <
DL.getTypeSizeInBits(RangeCheckType).getFixedSize())
- return None;
- if (!isSafeToTruncateWideIVType(DL, SE, LatchCheck, RangeCheckType))
- return None;
- // We can now safely identify the truncated version of the IV and limit for
- // RangeCheckType.
- LoopICmp NewLatchCheck;
- NewLatchCheck.Pred = LatchCheck.Pred;
- NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
- SE.getTruncateExpr(LatchCheck.IV, RangeCheckType));
- if (!NewLatchCheck.IV)
- return None;
- NewLatchCheck.Limit = SE.getTruncateExpr(LatchCheck.Limit, RangeCheckType);
- LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
- << "can be represented as range check type:"
- << *RangeCheckType << "\n");
- LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
- LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
- return NewLatchCheck;
-}
-
-bool LoopPredication::isSupportedStep(const SCEV* Step) {
- return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
-}
-
-Instruction *LoopPredication::findInsertPt(Instruction *Use,
- ArrayRef<Value*> Ops) {
- for (Value *Op : Ops)
- if (!L->isLoopInvariant(Op))
- return Use;
- return Preheader->getTerminator();
-}
-
-Instruction *LoopPredication::findInsertPt(Instruction *Use,
- ArrayRef<const SCEV*> Ops) {
- // Subtlety: SCEV considers things to be invariant if the value produced is
- // the same across iterations. This is not the same as being able to
- // evaluate outside the loop, which is what we actually need here.
- for (const SCEV *Op : Ops)
- if (!SE->isLoopInvariant(Op, L) ||
- !isSafeToExpandAt(Op, Preheader->getTerminator(), *SE))
- return Use;
- return Preheader->getTerminator();
-}
-
-bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
- // Handling expressions which produce invariant results, but *haven't* yet
- // been removed from the loop serves two important purposes.
- // 1) Most importantly, it resolves a pass ordering cycle which would
-  // otherwise need us to iterate licm, loop-predication, and either
- // loop-unswitch or loop-peeling to make progress on examples with lots of
- // predicable range checks in a row. (Since, in the general case, we can't
- // hoist the length checks until the dominating checks have been discharged
- // as we can't prove doing so is safe.)
- // 2) As a nice side effect, this exposes the value of peeling or unswitching
- // much more obviously in the IR. Otherwise, the cost modeling for other
- // transforms would end up needing to duplicate all of this logic to model a
- // check which becomes predictable based on a modeled peel or unswitch.
- //
- // The cost of doing so in the worst case is an extra fill from the stack in
- // the loop to materialize the loop invariant test value instead of checking
-  // against the original IV which is presumably in a register inside the loop.
-  // Such cases are presumably rare, and hint at missing opportunities for
- // other passes.
-
- if (SE->isLoopInvariant(S, L))
-    // Note: This is the SCEV variant, so the original Value* may be within the
- // loop even though SCEV has proven it is loop invariant.
- return true;
-
- // Handle a particular important case which SCEV doesn't yet know about which
- // shows up in range checks on arrays with immutable lengths.
- // TODO: This should be sunk inside SCEV.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
- if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
- if (LI->isUnordered() && L->hasLoopInvariantOperands(LI))
- if (AA->pointsToConstantMemory(LI->getOperand(0)) ||
- LI->hasMetadata(LLVMContext::MD_invariant_load))
- return true;
- return false;
-}
-
-Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
- LoopICmp LatchCheck, LoopICmp RangeCheck,
- SCEVExpander &Expander, Instruction *Guard) {
- auto *Ty = RangeCheck.IV->getType();
- // Generate the widened condition for the forward loop:
- // guardStart u< guardLimit &&
- // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
- // where <pred> depends on the latch condition predicate. See the file
- // header comment for the reasoning.
- // guardLimit - guardStart + latchStart - 1
- const SCEV *GuardStart = RangeCheck.IV->getStart();
- const SCEV *GuardLimit = RangeCheck.Limit;
- const SCEV *LatchStart = LatchCheck.IV->getStart();
- const SCEV *LatchLimit = LatchCheck.Limit;
- // Subtlety: We need all the values to be *invariant* across all iterations,
- // but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
- if (!isLoopInvariantValue(GuardStart) ||
- !isLoopInvariantValue(GuardLimit) ||
- !isLoopInvariantValue(LatchStart) ||
- !isLoopInvariantValue(LatchLimit)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
-
- // guardLimit - guardStart + latchStart - 1
- const SCEV *RHS =
- SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
- SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
- auto LimitCheckPred =
- ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
-
- LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
- LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
- LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
-
- auto *LimitCheck =
- expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
- auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
- GuardStart, GuardLimit);
- IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
- return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
-}
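// Tying the code above back to the header derivation (using the hypothetical
// values guardStart = 0, guardLimit = len, latchStart = 1, latchLimit = n and
// a u< latch predicate): RHS = (len - 0) + (1 - 1) = len, and the flipped
// strictness predicate of u< is u<=, so LimitCheck is "n u<= len" while
// FirstIterationCheck is "0 u< len", matching the widened condition
// "latchStart + guardLimit - 1 - guardStart u>= latchLimit" from the file
// header.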
-
-Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
- LoopICmp LatchCheck, LoopICmp RangeCheck,
- SCEVExpander &Expander, Instruction *Guard) {
- auto *Ty = RangeCheck.IV->getType();
- const SCEV *GuardStart = RangeCheck.IV->getStart();
- const SCEV *GuardLimit = RangeCheck.Limit;
- const SCEV *LatchStart = LatchCheck.IV->getStart();
- const SCEV *LatchLimit = LatchCheck.Limit;
- // Subtlety: We need all the values to be *invariant* across all iterations,
- // but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
- if (!isLoopInvariantValue(GuardStart) ||
- !isLoopInvariantValue(GuardLimit) ||
- !isLoopInvariantValue(LatchStart) ||
- !isLoopInvariantValue(LatchLimit)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
- !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
- LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
- return None;
- }
- // The decrement of the latch check IV should be the same as the
- // rangeCheckIV.
- auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
- if (RangeCheck.IV != PostDecLatchCheckIV) {
- LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
- << *PostDecLatchCheckIV
- << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
- return None;
- }
-
- // Generate the widened condition for CountDownLoop:
- // guardStart u< guardLimit &&
- // latchLimit <pred> 1.
- // See the header comment for reasoning of the checks.
- auto LimitCheckPred =
- ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
- auto *FirstIterationCheck = expandCheck(Expander, Guard,
- ICmpInst::ICMP_ULT,
- GuardStart, GuardLimit);
- auto *LimitCheck = expandCheck(Expander, Guard, LimitCheckPred, LatchLimit,
- SE->getOne(Ty));
- IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
- return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
-}
-
-static void normalizePredicate(ScalarEvolution *SE, Loop *L,
- LoopICmp& RC) {
- // LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
- // ULT/UGE form for ease of handling by our caller.
- if (ICmpInst::isEquality(RC.Pred) &&
- RC.IV->getStepRecurrence(*SE)->isOne() &&
- SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
- RC.Pred = RC.Pred == ICmpInst::ICMP_NE ?
- ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
-}
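// For example (hypothetical shapes): after LFTR the check may be
// "i != limit" for a step-one IV starting at 0. Since 0 u<= limit is
// trivially known, ICMP_NE is rewritten to ICMP_ULT (and ICMP_EQ would
// become ICMP_UGE), which is the form the widening logic expects.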
-
-
-/// If ICI can be widened to a loop invariant condition emits the loop
-/// invariant condition in the loop preheader and return it, otherwise
-/// returns None.
-Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
- SCEVExpander &Expander,
- Instruction *Guard) {
- LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
- LLVM_DEBUG(ICI->dump());
-
- // parseLoopStructure guarantees that the latch condition is:
- // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
- // We are looking for the range checks of the form:
- // i u< guardLimit
- auto RangeCheck = parseLoopICmp(ICI);
- if (!RangeCheck) {
- LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
- return None;
- }
- LLVM_DEBUG(dbgs() << "Guard check:\n");
- LLVM_DEBUG(RangeCheck->dump());
- if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
- LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
- << RangeCheck->Pred << ")!\n");
- return None;
- }
- auto *RangeCheckIV = RangeCheck->IV;
- if (!RangeCheckIV->isAffine()) {
- LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
- return None;
- }
- auto *Step = RangeCheckIV->getStepRecurrence(*SE);
- // We cannot just compare with latch IV step because the latch and range IVs
- // may have different types.
- if (!isSupportedStep(Step)) {
-    LLVM_DEBUG(dbgs() << "Range check and latch IVs have different steps!\n");
- return None;
- }
- auto *Ty = RangeCheckIV->getType();
- auto CurrLatchCheckOpt = generateLoopLatchCheck(*DL, *SE, LatchCheck, Ty);
- if (!CurrLatchCheckOpt) {
- LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
- "corresponding to range type: "
- << *Ty << "\n");
- return None;
- }
-
- LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
- // At this point, the range and latch step should have the same type, but need
- // not have the same value (we support both 1 and -1 steps).
- assert(Step->getType() ==
- CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
- "Range and latch steps should be of same type!");
- if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
- LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
- return None;
- }
-
- if (Step->isOne())
- return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
- Expander, Guard);
- else {
- assert(Step->isAllOnesValue() && "Step should be -1!");
- return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
- Expander, Guard);
- }
-}
-
-unsigned LoopPredication::collectChecks(SmallVectorImpl<Value *> &Checks,
- Value *Condition,
- SCEVExpander &Expander,
- Instruction *Guard) {
- unsigned NumWidened = 0;
- // The guard condition is expected to be in form of:
- // cond1 && cond2 && cond3 ...
- // Iterate over subconditions looking for icmp conditions which can be
-  // widened across loop iterations. While widening these conditions, remember
-  // the resulting list of subconditions in the Checks vector.
- SmallVector<Value *, 4> Worklist(1, Condition);
- SmallPtrSet<Value *, 4> Visited;
- Value *WideableCond = nullptr;
- do {
- Value *Condition = Worklist.pop_back_val();
- if (!Visited.insert(Condition).second)
- continue;
-
- Value *LHS, *RHS;
- using namespace llvm::PatternMatch;
- if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
- Worklist.push_back(LHS);
- Worklist.push_back(RHS);
- continue;
- }
-
- if (match(Condition,
- m_Intrinsic<Intrinsic::experimental_widenable_condition>())) {
- // Pick any, we don't care which
- WideableCond = Condition;
- continue;
- }
-
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
- if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander,
- Guard)) {
- Checks.push_back(NewRangeCheck.getValue());
- NumWidened++;
- continue;
- }
- }
-
- // Save the condition as is if we can't widen it
- Checks.push_back(Condition);
- } while (!Worklist.empty());
- // At the moment, our matching logic for wideable conditions implicitly
- // assumes we preserve the form: (br (and Cond, WC())). FIXME
- // Note that if there were multiple calls to wideable condition in the
- // traversal, we only need to keep one, and which one is arbitrary.
- if (WideableCond)
- Checks.push_back(WideableCond);
- return NumWidened;
-}
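// Sketch of the traversal above on a hypothetical guard condition
//   and(and(%rc1, %rc2), %wc)
// where %rc1 and %rc2 are icmps and %wc is a call to
// llvm.experimental.widenable.condition(): the and-tree is split into
// {%rc1, %rc2, %wc}; each icmp is passed to widenICmpRangeCheck and replaced
// by its widened form when that succeeds (otherwise kept as is), and a single
// widenable-condition call is appended to Checks at the end.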
-
-bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
- SCEVExpander &Expander) {
- LLVM_DEBUG(dbgs() << "Processing guard:\n");
- LLVM_DEBUG(Guard->dump());
-
- TotalConsidered++;
- SmallVector<Value *, 4> Checks;
- unsigned NumWidened = collectChecks(Checks, Guard->getOperand(0), Expander,
- Guard);
- if (NumWidened == 0)
- return false;
-
- TotalWidened += NumWidened;
-
- // Emit the new guard condition
- IRBuilder<> Builder(findInsertPt(Guard, Checks));
- Value *AllChecks = Builder.CreateAnd(Checks);
- auto *OldCond = Guard->getOperand(0);
- Guard->setOperand(0, AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
-
- LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
- return true;
-}
-
-bool LoopPredication::widenWidenableBranchGuardConditions(
- BranchInst *BI, SCEVExpander &Expander) {
- assert(isGuardAsWidenableBranch(BI) && "Must be!");
- LLVM_DEBUG(dbgs() << "Processing guard:\n");
- LLVM_DEBUG(BI->dump());
-
- TotalConsidered++;
- SmallVector<Value *, 4> Checks;
- unsigned NumWidened = collectChecks(Checks, BI->getCondition(),
- Expander, BI);
- if (NumWidened == 0)
- return false;
-
- TotalWidened += NumWidened;
-
- // Emit the new guard condition
- IRBuilder<> Builder(findInsertPt(BI, Checks));
- Value *AllChecks = Builder.CreateAnd(Checks);
- auto *OldCond = BI->getCondition();
- BI->setCondition(AllChecks);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- assert(isGuardAsWidenableBranch(BI) &&
- "Stopped being a guard after transform?");
-
- LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
- return true;
-}
-
-Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
- using namespace PatternMatch;
-
- BasicBlock *LoopLatch = L->getLoopLatch();
- if (!LoopLatch) {
- LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
- return None;
- }
-
- auto *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
- if (!BI || !BI->isConditional()) {
- LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
- return None;
- }
- BasicBlock *TrueDest = BI->getSuccessor(0);
- assert(
- (TrueDest == L->getHeader() || BI->getSuccessor(1) == L->getHeader()) &&
- "One of the latch's destinations must be the header");
-
- auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
- if (!ICI) {
- LLVM_DEBUG(dbgs() << "Failed to match the latch condition!\n");
- return None;
- }
- auto Result = parseLoopICmp(ICI);
- if (!Result) {
- LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
- return None;
- }
-
- if (TrueDest != L->getHeader())
- Result->Pred = ICmpInst::getInversePredicate(Result->Pred);
-
- // Check affine first, so if it's not we don't try to compute the step
- // recurrence.
- if (!Result->IV->isAffine()) {
- LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
- return None;
- }
-
- auto *Step = Result->IV->getStepRecurrence(*SE);
- if (!isSupportedStep(Step)) {
- LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
- return None;
- }
-
- auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
- if (Step->isOne()) {
- return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
- Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
- } else {
- assert(Step->isAllOnesValue() && "Step should be -1!");
- return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
- Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
- }
- };
-
- normalizePredicate(SE, L, *Result);
- if (IsUnsupportedPredicate(Step, Result->Pred)) {
- LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
- << ")!\n");
- return None;
- }
-
- return Result;
-}
-
-
-bool LoopPredication::isLoopProfitableToPredicate() {
- if (SkipProfitabilityChecks || !BPI)
- return true;
-
- SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges;
- L->getExitEdges(ExitEdges);
- // If there is only one exiting edge in the loop, it is always profitable to
- // predicate the loop.
- if (ExitEdges.size() == 1)
- return true;
-
- // Calculate the exiting probabilities of all exiting edges from the loop,
- // starting with the LatchExitProbability.
- // Heuristic for profitability: If any of the exiting blocks' probability of
- // exiting the loop is larger than exiting through the latch block, it's not
- // profitable to predicate the loop.
- auto *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "Should have a single latch at this point!");
- auto *LatchTerm = LatchBlock->getTerminator();
- assert(LatchTerm->getNumSuccessors() == 2 &&
- "expected to be an exiting block with 2 succs!");
- unsigned LatchBrExitIdx =
- LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
- BranchProbability LatchExitProbability =
- BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
-
- // Protect against degenerate inputs provided by the user. Providing a value
-  // less than one can invert the definition of profitable loop predication.
- float ScaleFactor = LatchExitProbabilityScale;
- if (ScaleFactor < 1) {
- LLVM_DEBUG(
- dbgs()
- << "Ignored user setting for loop-predication-latch-probability-scale: "
- << LatchExitProbabilityScale << "\n");
- LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
- ScaleFactor = 1.0;
- }
- const auto LatchProbabilityThreshold =
- LatchExitProbability * ScaleFactor;
-
- for (const auto &ExitEdge : ExitEdges) {
- BranchProbability ExitingBlockProbability =
- BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
- // Some exiting edge has higher probability than the latch exiting edge.
- // No longer profitable to predicate.
- if (ExitingBlockProbability > LatchProbabilityThreshold)
- return false;
- }
- // Using BPI, we have concluded that the most probable way to exit from the
- // loop is through the latch (or there's no profile information and all
- // exits are equally likely).
- return true;
-}
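// A numeric sketch of this heuristic (the probabilities are hypothetical):
// with the default scale of 2.0, a latch exit probability of 10% yields a
// threshold of 20%. An exiting block whose exit probability is 30% exceeds
// the threshold, so predication is deemed unprofitable; if every other exit
// stays at or below 20%, the latch remains the dominant exit and predication
// proceeds.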
-
-/// If we can (cheaply) find a widenable branch which controls entry into the
-/// loop, return it.
-static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
- // Walk back through any unconditional executed blocks and see if we can find
- // a widenable condition which seems to control execution of this loop. Note
- // that we predict that maythrow calls are likely untaken and thus that it's
- // profitable to widen a branch before a maythrow call with a condition
- // afterwards even though that may cause the slow path to run in a case where
- // it wouldn't have otherwise.
- BasicBlock *BB = L->getLoopPreheader();
- if (!BB)
- return nullptr;
- do {
- if (BasicBlock *Pred = BB->getSinglePredecessor())
- if (BB == Pred->getSingleSuccessor()) {
- BB = Pred;
- continue;
- }
- break;
- } while (true);
-
- if (BasicBlock *Pred = BB->getSinglePredecessor()) {
- auto *Term = Pred->getTerminator();
-
- Value *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) &&
- IfTrueBB == BB)
- return cast<BranchInst>(Term);
- }
- return nullptr;
-}
-
-/// Return the minimum of all analyzeable exit counts. This is an upper bound
-/// on the actual exit count. If there are not at least two analyzeable exits,
-/// returns SCEVCouldNotCompute.
-static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE,
- DominatorTree &DT,
- Loop *L) {
- SmallVector<BasicBlock *, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- SmallVector<const SCEV *, 4> ExitCounts;
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
- assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
- "We should only have known counts for exiting blocks that "
- "dominate latch!");
- ExitCounts.push_back(ExitCount);
- }
- if (ExitCounts.size() < 2)
- return SE.getCouldNotCompute();
- return SE.getUMinFromMismatchedTypes(ExitCounts);
-}
-
-/// This implements an analogous, but entirely distinct transform from the main
-/// loop predication transform. This one is phrased in terms of using a
-/// widenable branch *outside* the loop to allow us to simplify loop exits in a
-/// following loop. This is close in spirit to the IndVarSimplify transform
-/// of the same name, but is materially different in that widening loosens
-/// legality sharply.
-bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
- // The transformation performed here aims to widen a widenable condition
-  // above the loop such that all analyzeable exits leading to deopt are dead.
- // It assumes that the latch is the dominant exit for profitability and that
- // exits branching to deoptimizing blocks are rarely taken. It relies on the
- // semantics of widenable expressions for legality. (i.e. being able to fall
- // down the widenable path spuriously allows us to ignore exit order,
- // unanalyzeable exits, side effects, exceptional exits, and other challenges
- // which restrict the applicability of the non-WC based version of this
- // transform in IndVarSimplify.)
- //
- // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may
- // imply flags on the expression being hoisted and inserting new uses (flags
- // are only correct for current uses). The result is that we may be
- // inserting a branch on the value which can be either poison or undef. In
- // this case, the branch can legally go either way; we just need to avoid
- // introducing UB. This is achieved through the use of the freeze
- // instruction.
-
- SmallVector<BasicBlock *, 16> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- if (ExitingBlocks.empty())
- return false; // Nothing to do.
-
- auto *Latch = L->getLoopLatch();
- if (!Latch)
- return false;
-
- auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI);
- if (!WidenableBR)
- return false;
-
- const SCEV *LatchEC = SE->getExitCount(L, Latch);
- if (isa<SCEVCouldNotCompute>(LatchEC))
- return false; // profitability - want hot exit in analyzeable set
-
- // At this point, we have found an analyzeable latch, and a widenable
- // condition above the loop. If we have a widenable exit within the loop
- // (for which we can't compute exit counts), drop the ability to further
- // widen so that we gain the ability to analyze its exit count and perform this
- // transform. TODO: It'd be nice to know for sure the exit became
- // analyzeable after dropping widenability.
- {
- bool Invalidate = false;
-
- for (auto *ExitingBB : ExitingBlocks) {
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- continue;
-
- Use *Cond, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
- L->contains(IfTrueBB)) {
- WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
- Invalidate = true;
- }
- }
- if (Invalidate)
- SE->forgetLoop(L);
- }
-
- // The use of umin(all analyzeable exits) instead of latch is subtle, but
- // important for profitability. We may have a loop which hasn't been fully
- // canonicalized just yet. If the exit we chose to widen is provably never
- // taken, we want the widened form to *also* be provably never taken. We
- // can't guarantee this as a current unanalyzeable exit may later become
- // analyzeable, but we can at least avoid the obvious cases.
- const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
- if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
- !SE->isLoopInvariant(MinEC, L) ||
- !isSafeToExpandAt(MinEC, WidenableBR, *SE))
- return false;
-
- // Subtlety: We need to avoid inserting additional uses of the WC. We know
- // that it can only have one transitive use at the moment, and thus moving
- // that use to just before the branch and inserting code before it and then
- // modifying the operand is legal.
- auto *IP = cast<Instruction>(WidenableBR->getCondition());
- IP->moveBefore(WidenableBR);
- Rewriter.setInsertPoint(IP);
- IRBuilder<> B(IP);
-
- bool Changed = false;
- Value *MinECV = nullptr; // lazily generated if needed
- for (BasicBlock *ExitingBB : ExitingBlocks) {
- // If our exiting block exits multiple loops, we can only rewrite the
- // innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
- if (LI->getLoopFor(ExitingBB) != L)
- continue;
-
- // Can't rewrite non-branch yet.
- auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
- if (!BI)
- continue;
-
- // If already constant, nothing to do.
- if (isa<Constant>(BI->getCondition()))
- continue;
-
- const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
- if (isa<SCEVCouldNotCompute>(ExitCount) ||
- ExitCount->getType()->isPointerTy() ||
- !isSafeToExpandAt(ExitCount, WidenableBR, *SE))
- continue;
-
- const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
- BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1);
- if (!ExitBB->getPostdominatingDeoptimizeCall())
- continue;
-
- /// Here we can be fairly sure that executing this exit will most likely
- /// lead to executing llvm.experimental.deoptimize.
- /// This is a profitability heuristic, not a legality constraint.
-
- // If we found a widenable exit condition, do two things:
- // 1) fold the widened exit test into the widenable condition
- // 2) fold the branch to untaken - avoids infinite looping
-
- Value *ECV = Rewriter.expandCodeFor(ExitCount);
- if (!MinECV)
- MinECV = Rewriter.expandCodeFor(MinEC);
- Value *RHS = MinECV;
- if (ECV->getType() != RHS->getType()) {
- Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
- ECV = B.CreateZExt(ECV, WiderTy);
- RHS = B.CreateZExt(RHS, WiderTy);
- }
- assert(!Latch || DT->dominates(ExitingBB, Latch));
- Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS);
- // Freeze poison or undef to an arbitrary bit pattern to ensure we can
- // branch without introducing UB. See NOTE ON POISON/UNDEF above for
- // context.
- NewCond = B.CreateFreeze(NewCond);
-
- widenWidenableBranch(WidenableBR, NewCond);
-
- Value *OldCond = BI->getCondition();
- BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue));
- Changed = true;
- }
-
- if (Changed)
- // We just mutated a bunch of loop exits changing their exit counts
- // widely. We need to force recomputation of the exit counts given these
- // changes. Note that all of the inserted exits are never taken, and
- // should be removed next time the CFG is modified.
- SE->forgetLoop(L);
- return Changed;
-}
-
-bool LoopPredication::runOnLoop(Loop *Loop) {
- L = Loop;
-
- LLVM_DEBUG(dbgs() << "Analyzing ");
- LLVM_DEBUG(L->dump());
-
- Module *M = L->getHeader()->getModule();
-
- // There is nothing to do if the module doesn't use guards
- auto *GuardDecl =
- M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
- bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
- auto *WCDecl = M->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
- bool HasWidenableConditions =
- PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
- if (!HasIntrinsicGuards && !HasWidenableConditions)
- return false;
-
- DL = &M->getDataLayout();
-
- Preheader = L->getLoopPreheader();
- if (!Preheader)
- return false;
-
- auto LatchCheckOpt = parseLoopLatchICmp();
- if (!LatchCheckOpt)
- return false;
- LatchCheck = *LatchCheckOpt;
-
- LLVM_DEBUG(dbgs() << "Latch check:\n");
- LLVM_DEBUG(LatchCheck.dump());
-
- if (!isLoopProfitableToPredicate()) {
- LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
- return false;
- }
- // Collect all the guards into a vector and process later, so as not
- // to invalidate the instruction iterator.
- SmallVector<IntrinsicInst *, 4> Guards;
- SmallVector<BranchInst *, 4> GuardsAsWidenableBranches;
- for (const auto BB : L->blocks()) {
- for (auto &I : *BB)
- if (isGuard(&I))
- Guards.push_back(cast<IntrinsicInst>(&I));
- if (PredicateWidenableBranchGuards &&
- isGuardAsWidenableBranch(BB->getTerminator()))
- GuardsAsWidenableBranches.push_back(
- cast<BranchInst>(BB->getTerminator()));
- }
-
- SCEVExpander Expander(*SE, *DL, "loop-predication");
- bool Changed = false;
- for (auto *Guard : Guards)
- Changed |= widenGuardConditions(Guard, Expander);
- for (auto *Guard : GuardsAsWidenableBranches)
- Changed |= widenWidenableBranchGuardConditions(Guard, Expander);
- Changed |= predicateLoopExits(L, Expander);
- return Changed;
-}
+ return None;
+ if (!isSafeToTruncateWideIVType(DL, SE, LatchCheck, RangeCheckType))
+ return None;
+ // We can now safely identify the truncated version of the IV and limit for
+ // RangeCheckType.
+ LoopICmp NewLatchCheck;
+ NewLatchCheck.Pred = LatchCheck.Pred;
+ NewLatchCheck.IV = dyn_cast<SCEVAddRecExpr>(
+ SE.getTruncateExpr(LatchCheck.IV, RangeCheckType));
+ if (!NewLatchCheck.IV)
+ return None;
+ NewLatchCheck.Limit = SE.getTruncateExpr(LatchCheck.Limit, RangeCheckType);
+ LLVM_DEBUG(dbgs() << "IV of type: " << *LatchType
+ << "can be represented as range check type:"
+ << *RangeCheckType << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.IV: " << *NewLatchCheck.IV << "\n");
+ LLVM_DEBUG(dbgs() << "LatchCheck.Limit: " << *NewLatchCheck.Limit << "\n");
+ return NewLatchCheck;
+}
+
+bool LoopPredication::isSupportedStep(const SCEV* Step) {
+ return Step->isOne() || (Step->isAllOnesValue() && EnableCountDownLoop);
+}
+
+Instruction *LoopPredication::findInsertPt(Instruction *Use,
+ ArrayRef<Value*> Ops) {
+ for (Value *Op : Ops)
+ if (!L->isLoopInvariant(Op))
+ return Use;
+ return Preheader->getTerminator();
+}
+
+Instruction *LoopPredication::findInsertPt(Instruction *Use,
+ ArrayRef<const SCEV*> Ops) {
+ // Subtlety: SCEV considers things to be invariant if the value produced is
+ // the same across iterations. This is not the same as being able to
+ // evaluate outside the loop, which is what we actually need here.
+ for (const SCEV *Op : Ops)
+ if (!SE->isLoopInvariant(Op, L) ||
+ !isSafeToExpandAt(Op, Preheader->getTerminator(), *SE))
+ return Use;
+ return Preheader->getTerminator();
+}
+
+bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
+ // Handling expressions which produce invariant results, but *haven't* yet
+ // been removed from the loop serves two important purposes.
+ // 1) Most importantly, it resolves a pass ordering cycle which would
+ // otherwise need us to iterate licm, loop-predication, and either
+ // loop-unswitch or loop-peeling to make progress on examples with lots of
+ // predicable range checks in a row. (Since, in the general case, we can't
+ // hoist the length checks until the dominating checks have been discharged
+ // as we can't prove doing so is safe.)
+ // 2) As a nice side effect, this exposes the value of peeling or unswitching
+ // much more obviously in the IR. Otherwise, the cost modeling for other
+ // transforms would end up needing to duplicate all of this logic to model a
+ // check which becomes predictable based on a modeled peel or unswitch.
+ //
+ // The cost of doing so in the worst case is an extra fill from the stack in
+ // the loop to materialize the loop invariant test value instead of checking
+ // against the original IV which is presumably in a register inside the loop.
+ // Such cases are presumably rare, and hint at missing opportunities for
+ // other passes.
+
+ if (SE->isLoopInvariant(S, L))
+ // Note: This is the SCEV variant, so the original Value* may be within the
+ // loop even though SCEV has proven it is loop invariant.
+ return true;
+
+ // Handle a particular important case which SCEV doesn't yet know about which
+ // shows up in range checks on arrays with immutable lengths.
+ // TODO: This should be sunk inside SCEV.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
+ if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
+ if (LI->isUnordered() && L->hasLoopInvariantOperands(LI))
+ if (AA->pointsToConstantMemory(LI->getOperand(0)) ||
+ LI->hasMetadata(LLVMContext::MD_invariant_load))
+ return true;
+ return false;
+}
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
+ LoopICmp LatchCheck, LoopICmp RangeCheck,
+ SCEVExpander &Expander, Instruction *Guard) {
+ auto *Ty = RangeCheck.IV->getType();
+ // Generate the widened condition for the forward loop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> guardLimit - 1 - guardStart + latchStart
+ // where <pred> depends on the latch condition predicate. See the file
+ // header comment for the reasoning.
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ // Subtlety: We need all the values to be *invariant* across all iterations,
+ // but we only need to check expansion safety for those which *aren't*
+ // already guaranteed to dominate the guard.
+ if (!isLoopInvariantValue(GuardStart) ||
+ !isLoopInvariantValue(GuardLimit) ||
+ !isLoopInvariantValue(LatchStart) ||
+ !isLoopInvariantValue(LatchLimit)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
+ !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+
+ // guardLimit - guardStart + latchStart - 1
+ const SCEV *RHS =
+ SE->getAddExpr(SE->getMinusSCEV(GuardLimit, GuardStart),
+ SE->getMinusSCEV(LatchStart, SE->getOne(Ty)));
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
+
+ LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
+ LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
+ LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
+
+ auto *LimitCheck =
+ expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
+ auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
+ GuardStart, GuardLimit);
+ IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
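
The arithmetic above is easiest to trust with concrete numbers. Below is a small standalone sketch (not the pass itself) that exhaustively checks the hoisted condition for an incrementing, unit-step loop with a u< latch predicate, whose flipped-strictness form is u<=; the variable names and tiny search range are illustrative assumptions. When the hoisted check fails, the widenable branch would simply take the slow path, so only the passing case is simulated.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 8; // small bound keeps the search exhaustive and fast
  for (uint64_t guardStart = 0; guardStart < N; ++guardStart)
    for (uint64_t guardLimit = 0; guardLimit < N; ++guardLimit)
      for (uint64_t latchStart = 0; latchStart < N; ++latchStart)
        for (uint64_t latchLimit = 0; latchLimit < N; ++latchLimit) {
          // Hoisted check: guardStart u< guardLimit &&
          //                latchLimit u<= guardLimit - 1 - guardStart + latchStart
          bool Hoisted = guardStart < guardLimit &&
                         latchLimit <= guardLimit - 1 - guardStart + latchStart;
          if (!Hoisted)
            continue;
          // Simulate the loop: iteration k performs the range check
          // (guardStart + k) u< guardLimit, then the latch tests
          // ++latchIV u< latchLimit to decide whether to run iteration k+1.
          for (uint64_t k = 0;; ++k) {
            assert(guardStart + k < guardLimit && "range check must hold");
            if (!(latchStart + k + 1 < latchLimit))
              break;
          }
        }
  return 0;
}
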
+
+Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
+ LoopICmp LatchCheck, LoopICmp RangeCheck,
+ SCEVExpander &Expander, Instruction *Guard) {
+ auto *Ty = RangeCheck.IV->getType();
+ const SCEV *GuardStart = RangeCheck.IV->getStart();
+ const SCEV *GuardLimit = RangeCheck.Limit;
+ const SCEV *LatchStart = LatchCheck.IV->getStart();
+ const SCEV *LatchLimit = LatchCheck.Limit;
+ // Subtlety: We need all the values to be *invariant* across all iterations,
+ // but we only need to check expansion safety for those which *aren't*
+ // already guaranteed to dominate the guard.
+ if (!isLoopInvariantValue(GuardStart) ||
+ !isLoopInvariantValue(GuardLimit) ||
+ !isLoopInvariantValue(LatchStart) ||
+ !isLoopInvariantValue(LatchLimit)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ if (!isSafeToExpandAt(LatchStart, Guard, *SE) ||
+ !isSafeToExpandAt(LatchLimit, Guard, *SE)) {
+ LLVM_DEBUG(dbgs() << "Can't expand limit check!\n");
+ return None;
+ }
+ // The decrement of the latch check IV should be the same as the
+ // rangeCheckIV.
+ auto *PostDecLatchCheckIV = LatchCheck.IV->getPostIncExpr(*SE);
+ if (RangeCheck.IV != PostDecLatchCheckIV) {
+ LLVM_DEBUG(dbgs() << "Not the same. PostDecLatchCheckIV: "
+ << *PostDecLatchCheckIV
+ << " and RangeCheckIV: " << *RangeCheck.IV << "\n");
+ return None;
+ }
+
+ // Generate the widened condition for CountDownLoop:
+ // guardStart u< guardLimit &&
+ // latchLimit <pred> 1.
+ // See the header comment for reasoning of the checks.
+ auto LimitCheckPred =
+ ICmpInst::getFlippedStrictnessPredicate(LatchCheck.Pred);
+ auto *FirstIterationCheck = expandCheck(Expander, Guard,
+ ICmpInst::ICMP_ULT,
+ GuardStart, GuardLimit);
+ auto *LimitCheck = expandCheck(Expander, Guard, LimitCheckPred, LatchLimit,
+ SE->getOne(Ty));
+ IRBuilder<> Builder(findInsertPt(Guard, {FirstIterationCheck, LimitCheck}));
+ return Builder.CreateAnd(FirstIterationCheck, LimitCheck);
+}
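
The countdown case admits the same kind of exhaustive sanity check. The sketch below fixes a u> latch predicate and a -1 step, so the flipped-strictness limit check is latchLimit u>= 1, and the range-check IV is the post-decremented latch IV (guardStart == latchStart - 1), mirroring the requirement enforced above; names and bounds are illustrative.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 8;
  for (uint64_t latchStart = 1; latchStart < N; ++latchStart)
    for (uint64_t guardLimit = 0; guardLimit < N; ++guardLimit)
      for (uint64_t latchLimit = 0; latchLimit < N; ++latchLimit) {
        const uint64_t guardStart = latchStart - 1; // post-decremented start
        // Hoisted check: guardStart u< guardLimit && latchLimit u>= 1.
        if (!(guardStart < guardLimit && latchLimit >= 1))
          continue;
        uint64_t latchIV = latchStart;
        for (;;) {
          uint64_t guardIV = latchIV - 1;          // post-decremented latch IV
          assert(guardIV < guardLimit && "range check must hold");
          --latchIV;
          if (!(latchIV > latchLimit))             // latch: --i u> latchLimit
            break;
        }
      }
  return 0;
}
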
+
+static void normalizePredicate(ScalarEvolution *SE, Loop *L,
+ LoopICmp& RC) {
+ // LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
+ // ULT/UGE form for ease of handling by our caller.
+ if (ICmpInst::isEquality(RC.Pred) &&
+ RC.IV->getStepRecurrence(*SE)->isOne() &&
+ SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
+ RC.Pred = RC.Pred == ICmpInst::ICMP_NE ?
+ ICmpInst::ICMP_ULT : ICmpInst::ICMP_UGE;
+}
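
A tiny sketch of why this normalization is sound: for a unit-step IV known to start at or below the limit, equality and u< disagree on no value the IV can take before leaving the loop. The ranges below are illustrative.

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t N = 16;
  for (uint64_t start = 0; start < N; ++start)
    for (uint64_t limit = start; limit < N; ++limit) // start u<= limit is known
      for (uint64_t iv = start; iv <= limit; ++iv)   // values the IV takes
        assert((iv != limit) == (iv < limit));
  return 0;
}
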
+
+
+/// If ICI can be widened to a loop-invariant condition, emit that condition
+/// in the loop preheader and return it; otherwise return None.
+Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
+ SCEVExpander &Expander,
+ Instruction *Guard) {
+ LLVM_DEBUG(dbgs() << "Analyzing ICmpInst condition:\n");
+ LLVM_DEBUG(ICI->dump());
+
+ // parseLoopStructure guarantees that the latch condition is:
+ // ++i <pred> latchLimit, where <pred> is u<, u<=, s<, or s<=.
+ // We are looking for the range checks of the form:
+ // i u< guardLimit
+ auto RangeCheck = parseLoopICmp(ICI);
+ if (!RangeCheck) {
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+ LLVM_DEBUG(dbgs() << "Guard check:\n");
+ LLVM_DEBUG(RangeCheck->dump());
+ if (RangeCheck->Pred != ICmpInst::ICMP_ULT) {
+ LLVM_DEBUG(dbgs() << "Unsupported range check predicate("
+ << RangeCheck->Pred << ")!\n");
+ return None;
+ }
+ auto *RangeCheckIV = RangeCheck->IV;
+ if (!RangeCheckIV->isAffine()) {
+ LLVM_DEBUG(dbgs() << "Range check IV is not affine!\n");
+ return None;
+ }
+ auto *Step = RangeCheckIV->getStepRecurrence(*SE);
+ // We cannot just compare with latch IV step because the latch and range IVs
+ // may have different types.
+ if (!isSupportedStep(Step)) {
+ LLVM_DEBUG(dbgs() << "Range check and latch have IVs different steps!\n");
+ return None;
+ }
+ auto *Ty = RangeCheckIV->getType();
+ auto CurrLatchCheckOpt = generateLoopLatchCheck(*DL, *SE, LatchCheck, Ty);
+ if (!CurrLatchCheckOpt) {
+ LLVM_DEBUG(dbgs() << "Failed to generate a loop latch check "
+ "corresponding to range type: "
+ << *Ty << "\n");
+ return None;
+ }
+
+ LoopICmp CurrLatchCheck = *CurrLatchCheckOpt;
+ // At this point, the range and latch step should have the same type, but need
+ // not have the same value (we support both 1 and -1 steps).
+ assert(Step->getType() ==
+ CurrLatchCheck.IV->getStepRecurrence(*SE)->getType() &&
+ "Range and latch steps should be of same type!");
+ if (Step != CurrLatchCheck.IV->getStepRecurrence(*SE)) {
+ LLVM_DEBUG(dbgs() << "Range and latch have different step values!\n");
+ return None;
+ }
+
+ if (Step->isOne())
+ return widenICmpRangeCheckIncrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Guard);
+ else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return widenICmpRangeCheckDecrementingLoop(CurrLatchCheck, *RangeCheck,
+ Expander, Guard);
+ }
+}
+
+unsigned LoopPredication::collectChecks(SmallVectorImpl<Value *> &Checks,
+ Value *Condition,
+ SCEVExpander &Expander,
+ Instruction *Guard) {
+ unsigned NumWidened = 0;
+ // The guard condition is expected to be in form of:
+ // cond1 && cond2 && cond3 ...
+ // Iterate over subconditions looking for icmp conditions which can be
+ // widened across loop iterations. While widening these conditions, remember
+ // the resulting list of subconditions in the Checks vector.
+ SmallVector<Value *, 4> Worklist(1, Condition);
+ SmallPtrSet<Value *, 4> Visited;
+ Value *WideableCond = nullptr;
+ do {
+ Value *Condition = Worklist.pop_back_val();
+ if (!Visited.insert(Condition).second)
+ continue;
+
+ Value *LHS, *RHS;
+ using namespace llvm::PatternMatch;
+ if (match(Condition, m_And(m_Value(LHS), m_Value(RHS)))) {
+ Worklist.push_back(LHS);
+ Worklist.push_back(RHS);
+ continue;
+ }
+
+ if (match(Condition,
+ m_Intrinsic<Intrinsic::experimental_widenable_condition>())) {
+ // Pick any, we don't care which
+ WideableCond = Condition;
+ continue;
+ }
+
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(Condition)) {
+ if (auto NewRangeCheck = widenICmpRangeCheck(ICI, Expander,
+ Guard)) {
+ Checks.push_back(NewRangeCheck.getValue());
+ NumWidened++;
+ continue;
+ }
+ }
+
+ // Save the condition as is if we can't widen it
+ Checks.push_back(Condition);
+ } while (!Worklist.empty());
+ // At the moment, our matching logic for widenable conditions implicitly
+ // assumes we preserve the form: (br (and Cond, WC())). FIXME
+ // Note that if there were multiple calls to the widenable condition in the
+ // traversal, we only need to keep one, and which one is arbitrary.
+ if (WideableCond)
+ Checks.push_back(WideableCond);
+ return NumWidened;
+}
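
Stripped of the LLVM machinery, the traversal above is a plain worklist walk over an and-tree: conjunctions are split and re-queued, everything else is treated as a leaf check. The toy types below are illustrative; the real code additionally de-duplicates via a Visited set and remembers at most one widenable-condition call.

#include <cassert>
#include <vector>

struct Cond {
  bool IsAnd;        // true for an "and" of two subconditions
  Cond *LHS, *RHS;   // children when IsAnd; ignored otherwise
  int Id;            // leaf identifier otherwise
};

static std::vector<int> collectLeaves(Cond *Root) {
  std::vector<int> Leaves;
  std::vector<Cond *> Worklist;
  Worklist.push_back(Root);
  while (!Worklist.empty()) {
    Cond *C = Worklist.back();
    Worklist.pop_back();
    if (C->IsAnd) {
      Worklist.push_back(C->LHS);
      Worklist.push_back(C->RHS);
      continue;
    }
    Leaves.push_back(C->Id); // a leaf check: keep it (widened or as-is)
  }
  return Leaves;
}

int main() {
  // (1 && 2) && 3 decomposes into the three leaf checks.
  Cond C1{false, nullptr, nullptr, 1};
  Cond C2{false, nullptr, nullptr, 2};
  Cond C3{false, nullptr, nullptr, 3};
  Cond Inner{true, &C1, &C2, 0};
  Cond Root{true, &Inner, &C3, 0};
  assert(collectLeaves(&Root).size() == 3);
  return 0;
}
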
+
+bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
+ SCEVExpander &Expander) {
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(Guard->dump());
+
+ TotalConsidered++;
+ SmallVector<Value *, 4> Checks;
+ unsigned NumWidened = collectChecks(Checks, Guard->getOperand(0), Expander,
+ Guard);
+ if (NumWidened == 0)
+ return false;
+
+ TotalWidened += NumWidened;
+
+ // Emit the new guard condition
+ IRBuilder<> Builder(findInsertPt(Guard, Checks));
+ Value *AllChecks = Builder.CreateAnd(Checks);
+ auto *OldCond = Guard->getOperand(0);
+ Guard->setOperand(0, AllChecks);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+bool LoopPredication::widenWidenableBranchGuardConditions(
+ BranchInst *BI, SCEVExpander &Expander) {
+ assert(isGuardAsWidenableBranch(BI) && "Must be!");
+ LLVM_DEBUG(dbgs() << "Processing guard:\n");
+ LLVM_DEBUG(BI->dump());
+
+ TotalConsidered++;
+ SmallVector<Value *, 4> Checks;
+ unsigned NumWidened = collectChecks(Checks, BI->getCondition(),
+ Expander, BI);
+ if (NumWidened == 0)
+ return false;
+
+ TotalWidened += NumWidened;
+
+ // Emit the new guard condition
+ IRBuilder<> Builder(findInsertPt(BI, Checks));
+ Value *AllChecks = Builder.CreateAnd(Checks);
+ auto *OldCond = BI->getCondition();
+ BI->setCondition(AllChecks);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ assert(isGuardAsWidenableBranch(BI) &&
+ "Stopped being a guard after transform?");
+
+ LLVM_DEBUG(dbgs() << "Widened checks = " << NumWidened << "\n");
+ return true;
+}
+
+Optional<LoopICmp> LoopPredication::parseLoopLatchICmp() {
+ using namespace PatternMatch;
+
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ LLVM_DEBUG(dbgs() << "The loop doesn't have a single latch!\n");
+ return None;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
+ if (!BI || !BI->isConditional()) {
+ LLVM_DEBUG(dbgs() << "Failed to match the latch terminator!\n");
+ return None;
+ }
+ BasicBlock *TrueDest = BI->getSuccessor(0);
+ assert(
+ (TrueDest == L->getHeader() || BI->getSuccessor(1) == L->getHeader()) &&
+ "One of the latch's destinations must be the header");
+
+ auto *ICI = dyn_cast<ICmpInst>(BI->getCondition());
+ if (!ICI) {
+ LLVM_DEBUG(dbgs() << "Failed to match the latch condition!\n");
+ return None;
+ }
+ auto Result = parseLoopICmp(ICI);
+ if (!Result) {
+ LLVM_DEBUG(dbgs() << "Failed to parse the loop latch condition!\n");
+ return None;
+ }
+
+ if (TrueDest != L->getHeader())
+ Result->Pred = ICmpInst::getInversePredicate(Result->Pred);
+
+ // Check affine first, so if it's not we don't try to compute the step
+ // recurrence.
+ if (!Result->IV->isAffine()) {
+ LLVM_DEBUG(dbgs() << "The induction variable is not affine!\n");
+ return None;
+ }
+
+ auto *Step = Result->IV->getStepRecurrence(*SE);
+ if (!isSupportedStep(Step)) {
+ LLVM_DEBUG(dbgs() << "Unsupported loop stride(" << *Step << ")!\n");
+ return None;
+ }
+
+ auto IsUnsupportedPredicate = [](const SCEV *Step, ICmpInst::Predicate Pred) {
+ if (Step->isOne()) {
+ return Pred != ICmpInst::ICMP_ULT && Pred != ICmpInst::ICMP_SLT &&
+ Pred != ICmpInst::ICMP_ULE && Pred != ICmpInst::ICMP_SLE;
+ } else {
+ assert(Step->isAllOnesValue() && "Step should be -1!");
+ return Pred != ICmpInst::ICMP_UGT && Pred != ICmpInst::ICMP_SGT &&
+ Pred != ICmpInst::ICMP_UGE && Pred != ICmpInst::ICMP_SGE;
+ }
+ };
+
+ normalizePredicate(SE, L, *Result);
+ if (IsUnsupportedPredicate(Step, Result->Pred)) {
+ LLVM_DEBUG(dbgs() << "Unsupported loop latch predicate(" << Result->Pred
+ << ")!\n");
+ return None;
+ }
+
+ return Result;
+}
+
+
+bool LoopPredication::isLoopProfitableToPredicate() {
+ if (SkipProfitabilityChecks || !BPI)
+ return true;
+
+ SmallVector<std::pair<BasicBlock *, BasicBlock *>, 8> ExitEdges;
+ L->getExitEdges(ExitEdges);
+ // If there is only one exiting edge in the loop, it is always profitable to
+ // predicate the loop.
+ if (ExitEdges.size() == 1)
+ return true;
+
+ // Calculate the exiting probabilities of all exiting edges from the loop,
+ // starting with the LatchExitProbability.
+ // Heuristic for profitability: If any of the exiting blocks' probability of
+ // exiting the loop is larger than exiting through the latch block, it's not
+ // profitable to predicate the loop.
+ auto *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "Should have a single latch at this point!");
+ auto *LatchTerm = LatchBlock->getTerminator();
+ assert(LatchTerm->getNumSuccessors() == 2 &&
+ "expected to be an exiting block with 2 succs!");
+ unsigned LatchBrExitIdx =
+ LatchTerm->getSuccessor(0) == L->getHeader() ? 1 : 0;
+ BranchProbability LatchExitProbability =
+ BPI->getEdgeProbability(LatchBlock, LatchBrExitIdx);
+
+ // Protect against degenerate inputs provided by the user. Providing a value
+ // less than one can invert the definition of profitable loop predication.
+ float ScaleFactor = LatchExitProbabilityScale;
+ if (ScaleFactor < 1) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Ignored user setting for loop-predication-latch-probability-scale: "
+ << LatchExitProbabilityScale << "\n");
+ LLVM_DEBUG(dbgs() << "The value is set to 1.0\n");
+ ScaleFactor = 1.0;
+ }
+ const auto LatchProbabilityThreshold =
+ LatchExitProbability * ScaleFactor;
+
+ for (const auto &ExitEdge : ExitEdges) {
+ BranchProbability ExitingBlockProbability =
+ BPI->getEdgeProbability(ExitEdge.first, ExitEdge.second);
+ // Some exiting edge has higher probability than the latch exiting edge.
+ // No longer profitable to predicate.
+ if (ExitingBlockProbability > LatchProbabilityThreshold)
+ return false;
+ }
+ // Using BPI, we have concluded that the most probable way to exit from the
+ // loop is through the latch (or there's no profile information and all
+ // exits are equally likely).
+ return true;
+}
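
Restated over plain doubles, the rule is: predication is considered profitable only when no exit edge is more likely than the latch exit scaled by the (clamped to >= 1.0) user factor. A minimal sketch with illustrative names and probabilities; the pass itself works on BranchProbability values from BPI.

#include <cassert>
#include <vector>

static bool profitableToPredicate(double LatchExitProb,
                                  const std::vector<double> &OtherExitProbs,
                                  double ScaleFactor) {
  if (ScaleFactor < 1.0) // degenerate user setting: clamp, as the pass does
    ScaleFactor = 1.0;
  const double Threshold = LatchExitProb * ScaleFactor;
  for (double P : OtherExitProbs)
    if (P > Threshold)
      return false; // some exit dominates the latch exit; don't predicate
  return true;
}

int main() {
  assert(profitableToPredicate(0.05, {0.01, 0.02}, 2.0));
  assert(!profitableToPredicate(0.01, {0.20}, 2.0));
  return 0;
}
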
+
+/// If we can (cheaply) find a widenable branch which controls entry into the
+/// loop, return it.
+static BranchInst *FindWidenableTerminatorAboveLoop(Loop *L, LoopInfo &LI) {
+ // Walk back through any unconditionally executed blocks and see if we can find
+ // a widenable condition which seems to control execution of this loop. Note
+ // that we predict that maythrow calls are likely untaken and thus that it's
+ // profitable to widen a branch before a maythrow call with a condition
+ // afterwards even though that may cause the slow path to run in a case where
+ // it wouldn't have otherwise.
+ BasicBlock *BB = L->getLoopPreheader();
+ if (!BB)
+ return nullptr;
+ do {
+ if (BasicBlock *Pred = BB->getSinglePredecessor())
+ if (BB == Pred->getSingleSuccessor()) {
+ BB = Pred;
+ continue;
+ }
+ break;
+ } while (true);
+
+ if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+ auto *Term = Pred->getTerminator();
+
+ Value *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(Term, Cond, WC, IfTrueBB, IfFalseBB) &&
+ IfTrueBB == BB)
+ return cast<BranchInst>(Term);
+ }
+ return nullptr;
+}
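
A minimal sketch of the walk-back above over a toy CFG node that records only optional single-predecessor/single-successor links; the struct and names are assumptions made for illustration.

#include <cassert>

struct Block {
  Block *SinglePred = nullptr; // null if the block has 0 or >1 predecessors
  Block *SingleSucc = nullptr; // null if the block has 0 or >1 successors
};

// Walk up through blocks that are unconditionally executed before Start.
static Block *walkToTopOfStraightLineRegion(Block *Start) {
  Block *BB = Start;
  while (Block *Pred = BB->SinglePred) {
    if (Pred->SingleSucc != BB)
      break; // Pred branches elsewhere too; stop here
    BB = Pred;
  }
  return BB;
}

int main() {
  Block A, B, C; // A -> B -> C, each edge unconditional
  A.SingleSucc = &B;
  B.SinglePred = &A;
  B.SingleSucc = &C;
  C.SinglePred = &B;
  assert(walkToTopOfStraightLineRegion(&C) == &A);
  return 0;
}
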
+
+/// Return the minimum of all analyzeable exit counts. This is an upper bound
+/// on the actual exit count. If there are not at least two analyzeable exits,
+/// returns SCEVCouldNotCompute.
+static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE,
+ DominatorTree &DT,
+ Loop *L) {
+ SmallVector<BasicBlock *, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ SmallVector<const SCEV *, 4> ExitCounts;
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ const SCEV *ExitCount = SE.getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ assert(DT.dominates(ExitingBB, L->getLoopLatch()) &&
+ "We should only have known counts for exiting blocks that "
+ "dominate latch!");
+ ExitCounts.push_back(ExitCount);
+ }
+ if (ExitCounts.size() < 2)
+ return SE.getCouldNotCompute();
+ return SE.getUMinFromMismatchedTypes(ExitCounts);
+}
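
The same idea over plain integers, with an unanalyzeable count modeled as an empty optional and the two-known-counts requirement preserved; names are illustrative.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

static std::optional<uint64_t>
minKnownExitCount(const std::vector<std::optional<uint64_t>> &Counts) {
  std::vector<uint64_t> Known;
  for (const auto &C : Counts)
    if (C)
      Known.push_back(*C);
  if (Known.size() < 2)
    return std::nullopt; // analogous to returning SCEVCouldNotCompute
  return *std::min_element(Known.begin(), Known.end());
}

int main() {
  std::vector<std::optional<uint64_t>> Counts = {7, std::nullopt, 3};
  assert(minKnownExitCount(Counts) == 3u);
  Counts = {7, std::nullopt};
  assert(!minKnownExitCount(Counts));
  return 0;
}
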
+
+/// This implements an analogous, but entirely distinct transform from the main
+/// loop predication transform. This one is phrased in terms of using a
+/// widenable branch *outside* the loop to allow us to simplify loop exits in a
+/// following loop. This is close in spirit to the IndVarSimplify transform
+/// of the same name, but is materially different in that widening loosens
+/// legality sharply.
+bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
+ // The transformation performed here aims to widen a widenable condition
+ // above the loop such that all analyzeable exits leading to deopt are dead.
+ // It assumes that the latch is the dominant exit for profitability and that
+ // exits branching to deoptimizing blocks are rarely taken. It relies on the
+ // semantics of widenable expressions for legality. (i.e. being able to fall
+ // down the widenable path spuriously allows us to ignore exit order,
+ // unanalyzeable exits, side effects, exceptional exits, and other challenges
+ // which restrict the applicability of the non-WC based version of this
+ // transform in IndVarSimplify.)
+ //
+ // NOTE ON POISON/UNDEF - We're hoisting an expression above guards which may
+ // imply flags on the expression being hoisted and inserting new uses (flags
+ // are only correct for current uses). The result is that we may be
+ // inserting a branch on the value which can be either poison or undef. In
+ // this case, the branch can legally go either way; we just need to avoid
+ // introducing UB. This is achieved through the use of the freeze
+ // instruction.
+
+ SmallVector<BasicBlock *, 16> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ if (ExitingBlocks.empty())
+ return false; // Nothing to do.
+
+ auto *Latch = L->getLoopLatch();
+ if (!Latch)
+ return false;
+
+ auto *WidenableBR = FindWidenableTerminatorAboveLoop(L, *LI);
+ if (!WidenableBR)
+ return false;
+
+ const SCEV *LatchEC = SE->getExitCount(L, Latch);
+ if (isa<SCEVCouldNotCompute>(LatchEC))
+ return false; // profitability - want hot exit in analyzeable set
+
+ // At this point, we have found an analyzeable latch, and a widenable
+ // condition above the loop. If we have a widenable exit within the loop
+ // (for which we can't compute exit counts), drop the ability to further
+ // widen so that we gain the ability to analyze its exit count and perform this
+ // transform. TODO: It'd be nice to know for sure the exit became
+ // analyzeable after dropping widenability.
+ {
+ bool Invalidate = false;
+
+ for (auto *ExitingBB : ExitingBlocks) {
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+
+ Use *Cond, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (parseWidenableBranch(BI, Cond, WC, IfTrueBB, IfFalseBB) &&
+ L->contains(IfTrueBB)) {
+ WC->set(ConstantInt::getTrue(IfTrueBB->getContext()));
+ Invalidate = true;
+ }
+ }
+ if (Invalidate)
+ SE->forgetLoop(L);
+ }
+
+ // The use of umin(all analyzeable exits) instead of latch is subtle, but
+ // important for profitability. We may have a loop which hasn't been fully
+ // canonicalized just yet. If the exit we chose to widen is provably never
+ // taken, we want the widened form to *also* be provably never taken. We
+ // can't guarantee this as a current unanalyzeable exit may later become
+ // analyzeable, but we can at least avoid the obvious cases.
+ const SCEV *MinEC = getMinAnalyzeableBackedgeTakenCount(*SE, *DT, L);
+ if (isa<SCEVCouldNotCompute>(MinEC) || MinEC->getType()->isPointerTy() ||
+ !SE->isLoopInvariant(MinEC, L) ||
+ !isSafeToExpandAt(MinEC, WidenableBR, *SE))
+ return false;
+
+ // Subtlety: We need to avoid inserting additional uses of the WC. We know
+ // that it can only have one transitive use at the moment, and thus moving
+ // that use to just before the branch and inserting code before it and then
+ // modifying the operand is legal.
+ auto *IP = cast<Instruction>(WidenableBR->getCondition());
+ IP->moveBefore(WidenableBR);
+ Rewriter.setInsertPoint(IP);
+ IRBuilder<> B(IP);
+
+ bool Changed = false;
+ Value *MinECV = nullptr; // lazily generated if needed
+ for (BasicBlock *ExitingBB : ExitingBlocks) {
+ // If our exiting block exits multiple loops, we can only rewrite the
+ // innermost one. Otherwise, we're changing how many times the innermost
+ // loop runs before it exits.
+ if (LI->getLoopFor(ExitingBB) != L)
+ continue;
+
+ // Can't rewrite non-branch yet.
+ auto *BI = dyn_cast<BranchInst>(ExitingBB->getTerminator());
+ if (!BI)
+ continue;
+
+ // If already constant, nothing to do.
+ if (isa<Constant>(BI->getCondition()))
+ continue;
+
+ const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
+ if (isa<SCEVCouldNotCompute>(ExitCount) ||
+ ExitCount->getType()->isPointerTy() ||
+ !isSafeToExpandAt(ExitCount, WidenableBR, *SE))
+ continue;
+
+ const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
+ BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1);
+ if (!ExitBB->getPostdominatingDeoptimizeCall())
+ continue;
+
+ /// Here we can be fairly sure that executing this exit will most likely
+ /// lead to executing llvm.experimental.deoptimize.
+ /// This is a profitability heuristic, not a legality constraint.
+
+ // If we found a widenable exit condition, do two things:
+ // 1) fold the widened exit test into the widenable condition
+ // 2) fold the branch to untaken - avoids infinite looping
+
+ Value *ECV = Rewriter.expandCodeFor(ExitCount);
+ if (!MinECV)
+ MinECV = Rewriter.expandCodeFor(MinEC);
+ Value *RHS = MinECV;
+ if (ECV->getType() != RHS->getType()) {
+ Type *WiderTy = SE->getWiderType(ECV->getType(), RHS->getType());
+ ECV = B.CreateZExt(ECV, WiderTy);
+ RHS = B.CreateZExt(RHS, WiderTy);
+ }
+ assert(!Latch || DT->dominates(ExitingBB, Latch));
+ Value *NewCond = B.CreateICmp(ICmpInst::ICMP_UGT, ECV, RHS);
+ // Freeze poison or undef to an arbitrary bit pattern to ensure we can
+ // branch without introducing UB. See NOTE ON POISON/UNDEF above for
+ // context.
+ NewCond = B.CreateFreeze(NewCond);
+
+ widenWidenableBranch(WidenableBR, NewCond);
+
+ Value *OldCond = BI->getCondition();
+ BI->setCondition(ConstantInt::get(OldCond->getType(), !ExitIfTrue));
+ Changed = true;
+ }
+
+ if (Changed)
+ // We just mutated a bunch of loop exits changing their exit counts
+ // widely. We need to force recomputation of the exit counts given these
+ // changes. Note that all of the inserted exits are never taken, and
+ // should be removed next time the CFG is modified.
+ SE->forgetLoop(L);
+ return Changed;
+}
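
The reason an exit whose count exceeds the minimum analyzeable count can be folded to untaken is worth spelling out. Under the simplifying assumption that exit i would first fire at iteration Counts[i] and the loop leaves through whichever exit fires first, only an exit whose count equals the minimum can ever be the one taken; the exhaustive sketch below (illustrative names and ranges) checks exactly that.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const uint64_t N = 6;
  for (uint64_t A = 0; A < N; ++A)
    for (uint64_t B = 0; B < N; ++B)
      for (uint64_t C = 0; C < N; ++C) {
        std::vector<uint64_t> Counts = {A, B, C};
        uint64_t MinEC = *std::min_element(Counts.begin(), Counts.end());
        // Simulate the loop: iteration K leaves through the first exit whose
        // count equals K; earlier iterations take no exit.
        size_t TakenExit = Counts.size();
        for (uint64_t K = 0; TakenExit == Counts.size(); ++K)
          for (size_t I = 0; I < Counts.size(); ++I)
            if (Counts[I] == K) {
              TakenExit = I;
              break;
            }
        // Any exit whose count is strictly greater than MinEC is never taken,
        // which is why its branch can be folded to the untaken direction once
        // the hoisted "Count u> MinEC" test guards the fast path.
        assert(Counts[TakenExit] == MinEC);
      }
  return 0;
}
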
+
+bool LoopPredication::runOnLoop(Loop *Loop) {
+ L = Loop;
+
+ LLVM_DEBUG(dbgs() << "Analyzing ");
+ LLVM_DEBUG(L->dump());
+
+ Module *M = L->getHeader()->getModule();
+
+ // There is nothing to do if the module doesn't use guards
+ auto *GuardDecl =
+ M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard));
+ bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty();
+ auto *WCDecl = M->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ bool HasWidenableConditions =
+ PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty();
+ if (!HasIntrinsicGuards && !HasWidenableConditions)
+ return false;
+
+ DL = &M->getDataLayout();
+
+ Preheader = L->getLoopPreheader();
+ if (!Preheader)
+ return false;
+
+ auto LatchCheckOpt = parseLoopLatchICmp();
+ if (!LatchCheckOpt)
+ return false;
+ LatchCheck = *LatchCheckOpt;
+
+ LLVM_DEBUG(dbgs() << "Latch check:\n");
+ LLVM_DEBUG(LatchCheck.dump());
+
+ if (!isLoopProfitableToPredicate()) {
+ LLVM_DEBUG(dbgs() << "Loop not profitable to predicate!\n");
+ return false;
+ }
+ // Collect all the guards into a vector and process later, so as not
+ // to invalidate the instruction iterator.
+ SmallVector<IntrinsicInst *, 4> Guards;
+ SmallVector<BranchInst *, 4> GuardsAsWidenableBranches;
+ for (const auto BB : L->blocks()) {
+ for (auto &I : *BB)
+ if (isGuard(&I))
+ Guards.push_back(cast<IntrinsicInst>(&I));
+ if (PredicateWidenableBranchGuards &&
+ isGuardAsWidenableBranch(BB->getTerminator()))
+ GuardsAsWidenableBranches.push_back(
+ cast<BranchInst>(BB->getTerminator()));
+ }
+
+ SCEVExpander Expander(*SE, *DL, "loop-predication");
+ bool Changed = false;
+ for (auto *Guard : Guards)
+ Changed |= widenGuardConditions(Guard, Expander);
+ for (auto *Guard : GuardsAsWidenableBranches)
+ Changed |= widenWidenableBranchGuardConditions(Guard, Expander);
+ Changed |= predicateLoopExits(L, Expander);
+ return Changed;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
index cd8e046fb8..65a6205f03 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -1,183 +1,183 @@
-//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop reroller.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+//===- LoopReroll.cpp - Loop rerolling pass -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop reroller.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopReroll.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-reroll"
-
-STATISTIC(NumRerolledLoops, "Number of rerolled loops");
-
-static cl::opt<unsigned>
-NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
- cl::Hidden,
- cl::desc("The maximum number of failures to tolerate"
- " during fuzzy matching. (default: 400)"));
-
-// This loop re-rolling transformation aims to transform loops like this:
-//
-// int foo(int a);
-// void bar(int *x) {
-// for (int i = 0; i < 500; i += 3) {
-// foo(i);
-// foo(i+1);
-// foo(i+2);
-// }
-// }
-//
-// into a loop like this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i)
-// foo(i);
-// }
-//
-// It does this by looking for loops that, besides the latch code, are composed
-// of isomorphic DAGs of instructions, with each DAG rooted at some increment
-// to the induction variable, and where each DAG is isomorphic to the DAG
-// rooted at the induction variable (excepting the sub-DAGs which root the
-// other induction-variable increments). In other words, we're looking for loop
-// bodies of the form:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// where each f(i) is a set of instructions that, collectively, are a function
-// only of i (and other loop-invariant values).
-//
-// As a special case, we can also reroll loops like this:
-//
-// int foo(int);
-// void bar(int *x) {
-// for (int i = 0; i < 500; ++i) {
-// x[3*i] = foo(0);
-// x[3*i+1] = foo(0);
-// x[3*i+2] = foo(0);
-// }
-// }
-//
-// into this:
-//
-// void bar(int *x) {
-// for (int i = 0; i < 1500; ++i)
-// x[i] = foo(0);
-// }
-//
-// in which case, we're looking for inputs like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// %scaled.iv = mul %iv, scale
-// f(%scaled.iv)
-// %scaled.iv.1 = add %scaled.iv, 1
-// f(%scaled.iv.1)
-// %scaled.iv.2 = add %scaled.iv, 2
-// f(%scaled.iv.2)
-// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
-// f(%scaled.iv.scale_m_1)
-// ...
-// %iv.next = add %iv, 1
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-
-namespace {
-
- enum IterationLimits {
- /// The maximum number of iterations that we'll try and reroll.
- IL_MaxRerollIterations = 32,
- /// The bitvector index used by loop induction variables and other
- /// instructions that belong to all iterations.
- IL_All,
- IL_End
- };
-
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reroll"
+
+STATISTIC(NumRerolledLoops, "Number of rerolled loops");
+
+static cl::opt<unsigned>
+NumToleratedFailedMatches("reroll-num-tolerated-failed-matches", cl::init(400),
+ cl::Hidden,
+ cl::desc("The maximum number of failures to tolerate"
+ " during fuzzy matching. (default: 400)"));
+
+// This loop re-rolling transformation aims to transform loops like this:
+//
+// int foo(int a);
+// void bar(int *x) {
+// for (int i = 0; i < 500; i += 3) {
+// foo(i);
+// foo(i+1);
+// foo(i+2);
+// }
+// }
+//
+// into a loop like this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i)
+// foo(i);
+// }
+//
+// It does this by looking for loops that, besides the latch code, are composed
+// of isomorphic DAGs of instructions, with each DAG rooted at some increment
+// to the induction variable, and where each DAG is isomorphic to the DAG
+// rooted at the induction variable (excepting the sub-DAGs which root the
+// other induction-variable increments). In other words, we're looking for loop
+// bodies of the form:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// where each f(i) is a set of instructions that, collectively, are a function
+// only of i (and other loop-invariant values).
+//
+// As a special case, we can also reroll loops like this:
+//
+// int foo(int);
+// void bar(int *x) {
+// for (int i = 0; i < 500; ++i) {
+// x[3*i] = foo(0);
+// x[3*i+1] = foo(0);
+// x[3*i+2] = foo(0);
+// }
+// }
+//
+// into this:
+//
+// void bar(int *x) {
+// for (int i = 0; i < 1500; ++i)
+// x[i] = foo(0);
+// }
+//
+// in which case, we're looking for inputs like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// %scaled.iv = mul %iv, scale
+// f(%scaled.iv)
+// %scaled.iv.1 = add %scaled.iv, 1
+// f(%scaled.iv.1)
+// %scaled.iv.2 = add %scaled.iv, 2
+// f(%scaled.iv.2)
+// %scaled.iv.scale_m_1 = add %scaled.iv, scale-1
+// f(%scaled.iv.scale_m_1)
+// ...
+// %iv.next = add %iv, 1
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+
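The first example above, restated as a compilable check that the two shapes visit the same values; the bound is chosen here as a multiple of the scale so they match exactly, and pushing into a vector stands in for the calls to foo.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> Unrolled, Rerolled;
  // Original shape: three copies of the body, induction variable stepping by 3.
  for (int i = 0; i < 9; i += 3) {
    Unrolled.push_back(i);
    Unrolled.push_back(i + 1);
    Unrolled.push_back(i + 2);
  }
  // Rerolled shape: a single copy of the body with a unit step.
  for (int i = 0; i < 9; ++i)
    Rerolled.push_back(i);
  assert(Unrolled == Rerolled);
  return 0;
}
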
+namespace {
+
+ enum IterationLimits {
+ /// The maximum number of iterations that we'll try and reroll.
+ IL_MaxRerollIterations = 32,
+ /// The bitvector index used by loop induction variables and other
+ /// instructions that belong to all iterations.
+ IL_All,
+ IL_End
+ };
+
class LoopRerollLegacyPass : public LoopPass {
- public:
- static char ID; // Pass ID, replacement for typeid
-
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
LoopRerollLegacyPass() : LoopPass(ID) {
initializeLoopRerollLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
};
-
+
class LoopReroll {
public:
LoopReroll(AliasAnalysis *AA, LoopInfo *LI, ScalarEvolution *SE,
@@ -186,1529 +186,1529 @@ namespace {
PreserveLCSSA(PreserveLCSSA) {}
bool runOnLoop(Loop *L);
- protected:
- AliasAnalysis *AA;
- LoopInfo *LI;
- ScalarEvolution *SE;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- bool PreserveLCSSA;
-
- using SmallInstructionVector = SmallVector<Instruction *, 16>;
- using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> IVToIncMap;
-
- // For loops with multiple induction variables, remember the one used only to
- // control the loop.
- Instruction *LoopControlIV;
-
- // A chain of isomorphic instructions, identified by a single-use PHI
- // representing a reduction. Only the last value may be used outside the
- // loop.
- struct SimpleLoopReduction {
- SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
- assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
- add(L);
- }
-
- bool valid() const {
- return Valid;
- }
-
- Instruction *getPHI() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.front();
- }
-
- Instruction *getReducedValue() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.back();
- }
-
- Instruction *get(size_t i) const {
- assert(Valid && "Using invalid reduction");
- return Instructions[i+1];
- }
-
- Instruction *operator [] (size_t i) const { return get(i); }
-
- // The size, ignoring the initial PHI.
- size_t size() const {
- assert(Valid && "Using invalid reduction");
- return Instructions.size()-1;
- }
-
- using iterator = SmallInstructionVector::iterator;
- using const_iterator = SmallInstructionVector::const_iterator;
-
- iterator begin() {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- const_iterator begin() const {
- assert(Valid && "Using invalid reduction");
- return std::next(Instructions.begin());
- }
-
- iterator end() { return Instructions.end(); }
- const_iterator end() const { return Instructions.end(); }
-
- protected:
- bool Valid = false;
- SmallInstructionVector Instructions;
-
- void add(Loop *L);
- };
-
- // The set of all reductions, and state tracking of possible reductions
- // during loop instruction processing.
- struct ReductionTracker {
- using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
-
- // Add a new possible reduction.
- void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
-
- // Setup to track possible reductions corresponding to the provided
- // rerolling scale. Only reductions with a number of non-PHI instructions
- // that is divisible by the scale are considered. Three instruction sets
- // are filled in:
- // - A set of all possible instructions in eligible reductions.
- // - A set of all PHIs in eligible reductions
- // - A set of all reduced values (last instructions) in eligible
- // reductions.
- void restrictToScale(uint64_t Scale,
- SmallInstructionSet &PossibleRedSet,
- SmallInstructionSet &PossibleRedPHISet,
- SmallInstructionSet &PossibleRedLastSet) {
- PossibleRedIdx.clear();
- PossibleRedIter.clear();
- Reds.clear();
-
- for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
- if (PossibleReds[i].size() % Scale == 0) {
- PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
- PossibleRedPHISet.insert(PossibleReds[i].getPHI());
-
- PossibleRedSet.insert(PossibleReds[i].getPHI());
- PossibleRedIdx[PossibleReds[i].getPHI()] = i;
- for (Instruction *J : PossibleReds[i]) {
- PossibleRedSet.insert(J);
- PossibleRedIdx[J] = i;
- }
- }
- }
-
- // The functions below are used while processing the loop instructions.
-
- // Are the two instructions both from reductions, and furthermore, from
- // the same reduction?
- bool isPairInSame(Instruction *J1, Instruction *J2) {
- DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
- if (J1I != PossibleRedIdx.end()) {
- DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
- if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
- return true;
- }
-
- return false;
- }
-
- // The two provided instructions, the first from the base iteration, and
- // the second from iteration i, form a matched pair. If these are part of
- // a reduction, record that fact.
- void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
- if (PossibleRedIdx.count(J1)) {
- assert(PossibleRedIdx.count(J2) &&
- "Recording reduction vs. non-reduction instruction?");
-
- PossibleRedIter[J1] = 0;
- PossibleRedIter[J2] = i;
-
- int Idx = PossibleRedIdx[J1];
- assert(Idx == PossibleRedIdx[J2] &&
- "Recording pair from different reductions?");
- Reds.insert(Idx);
- }
- }
-
- // The functions below can be called after we've finished processing all
- // instructions in the loop, and we know which reductions were selected.
-
- bool validateSelected();
- void replaceSelected();
-
- protected:
- // The vector of all possible reductions (for any scale).
- SmallReductionVector PossibleReds;
-
- DenseMap<Instruction *, int> PossibleRedIdx;
- DenseMap<Instruction *, int> PossibleRedIter;
- DenseSet<int> Reds;
- };
-
- // A DAGRootSet models an induction variable being used in a rerollable
- // loop. For example,
- //
- // x[i*3+0] = y1
- // x[i*3+1] = y2
- // x[i*3+2] = y3
- //
- // Base instruction -> i*3
- // +---+----+
- // / | \
- // ST[y1] +1 +2 <-- Roots
- // | |
- // ST[y2] ST[y3]
- //
- // There may be multiple DAGRoots, for example:
- //
- // x[i*2+0] = ... (1)
- // x[i*2+1] = ... (1)
- // x[i*2+4] = ... (2)
- // x[i*2+5] = ... (2)
- // x[(i+1234)*2+5678] = ... (3)
- // x[(i+1234)*2+5679] = ... (3)
- //
- // The loop will be rerolled by adding a new loop induction variable,
- // one for the Base instruction in each DAGRootSet.
- //
- struct DAGRootSet {
- Instruction *BaseInst;
- SmallInstructionVector Roots;
-
- // The instructions between IV and BaseInst (but not including BaseInst).
- SmallInstructionSet SubsumedInsts;
- };
-
- // The set of all DAG roots, and state tracking of all roots
- // for a particular induction variable.
- struct DAGRootTracker {
- DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
- ScalarEvolution *SE, AliasAnalysis *AA,
- TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
- bool PreserveLCSSA,
- DenseMap<Instruction *, int64_t> &IncrMap,
- Instruction *LoopCtrlIV)
- : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
- PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
- LoopControlIV(LoopCtrlIV) {}
-
- /// Stage 1: Find all the DAG roots for the induction variable.
- bool findRoots();
-
- /// Stage 2: Validate if the found roots are valid.
- bool validate(ReductionTracker &Reductions);
-
- /// Stage 3: Assuming validate() returned true, perform the
- /// replacement.
- /// @param BackedgeTakenCount The backedge-taken count of L.
- void replace(const SCEV *BackedgeTakenCount);
-
- protected:
- using UsesTy = MapVector<Instruction *, BitVector>;
-
- void findRootsRecursive(Instruction *IVU,
- SmallInstructionSet SubsumedInsts);
- bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
- bool collectPossibleRoots(Instruction *Base,
- std::map<int64_t,Instruction*> &Roots);
- bool validateRootSet(DAGRootSet &DRS);
-
- bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
- void collectInLoopUserSet(const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
- void collectInLoopUserSet(Instruction *Root,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users);
-
- UsesTy::iterator nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI=nullptr);
- bool isBaseInst(Instruction *I);
- bool isRootInst(Instruction *I);
- bool instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End);
- void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
-
- LoopReroll *Parent;
-
- // Members of Parent, replicated here for brevity.
- Loop *L;
- ScalarEvolution *SE;
- AliasAnalysis *AA;
- TargetLibraryInfo *TLI;
- DominatorTree *DT;
- LoopInfo *LI;
- bool PreserveLCSSA;
-
- // The loop induction variable.
- Instruction *IV;
-
- // Loop step amount.
- int64_t Inc;
-
- // Loop reroll count; if Inc == 1, this records the scaling applied
- // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
- // If Inc is not 1, Scale = Inc.
- uint64_t Scale;
-
- // The roots themselves.
- SmallVector<DAGRootSet,16> RootSets;
-
- // All increment instructions for IV.
- SmallInstructionVector LoopIncs;
-
- // Map of all instructions in the loop (in order) to the iterations
- // they are used in (or specially, IL_All for instructions
- // used in the loop increment mechanism).
- UsesTy Uses;
-
- // Map between induction variable and its increment
- DenseMap<Instruction *, int64_t> &IVToIncMap;
-
- Instruction *LoopControlIV;
- };
-
- // Check if it is a compare-like instruction whose user is a branch
- bool isCompareUsedByBranch(Instruction *I) {
- auto *TI = I->getParent()->getTerminator();
- if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
- return false;
- return I->hasOneUse() && TI->getOperand(0) == I;
- };
-
- bool isLoopControlIV(Loop *L, Instruction *IV);
- void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
- void collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions);
- bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
- };
-
-} // end anonymous namespace
-
+ protected:
+ AliasAnalysis *AA;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ bool PreserveLCSSA;
+
+ using SmallInstructionVector = SmallVector<Instruction *, 16>;
+ using SmallInstructionSet = SmallPtrSet<Instruction *, 16>;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> IVToIncMap;
+
+ // For loops with multiple induction variables, remember the one used only to
+ // control the loop.
+ Instruction *LoopControlIV;
+
+ // A chain of isomorphic instructions, identified by a single-use PHI
+ // representing a reduction. Only the last value may be used outside the
+ // loop.
+ struct SimpleLoopReduction {
+ SimpleLoopReduction(Instruction *P, Loop *L) : Instructions(1, P) {
+ assert(isa<PHINode>(P) && "First reduction instruction must be a PHI");
+ add(L);
+ }
+
+ bool valid() const {
+ return Valid;
+ }
+
+ Instruction *getPHI() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.front();
+ }
+
+ Instruction *getReducedValue() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.back();
+ }
+
+ Instruction *get(size_t i) const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions[i+1];
+ }
+
+ Instruction *operator [] (size_t i) const { return get(i); }
+
+ // The size, ignoring the initial PHI.
+ size_t size() const {
+ assert(Valid && "Using invalid reduction");
+ return Instructions.size()-1;
+ }
+
+ using iterator = SmallInstructionVector::iterator;
+ using const_iterator = SmallInstructionVector::const_iterator;
+
+ iterator begin() {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ const_iterator begin() const {
+ assert(Valid && "Using invalid reduction");
+ return std::next(Instructions.begin());
+ }
+
+ iterator end() { return Instructions.end(); }
+ const_iterator end() const { return Instructions.end(); }
+
+ protected:
+ bool Valid = false;
+ SmallInstructionVector Instructions;
+
+ void add(Loop *L);
+ };
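+
+ // Illustrative sketch (not from the original source; value names are
+ // invented): for a source loop such as
+ //   for (int i = 0; i < n; i += 2) { s += a[i]; s += a[i+1]; }
+ // the chain accepted by SimpleLoopReduction is the single-use PHI followed
+ // by the two adds:
+ //   %s   = phi i32 [ 0, %ph ], [ %s.2, %body ]
+ //   %s.1 = add i32 %s,   %a.i    ; single use: %s.2
+ //   %s.2 = add i32 %s.1, %a.i1   ; used by the PHI and outside the loop
+ // Instructions == { %s, %s.1, %s.2 } and getReducedValue() returns %s.2.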
+
+ // The set of all reductions, and state tracking of possible reductions
+ // during loop instruction processing.
+ struct ReductionTracker {
+ using SmallReductionVector = SmallVector<SimpleLoopReduction, 16>;
+
+ // Add a new possible reduction.
+ void addSLR(SimpleLoopReduction &SLR) { PossibleReds.push_back(SLR); }
+
+ // Set up to track possible reductions corresponding to the provided
+ // rerolling scale. Only reductions with a number of non-PHI instructions
+ // that is divisible by the scale are considered. Three instruction sets
+ // are filled in:
+ // - A set of all possible instructions in eligible reductions.
+ // - A set of all PHIs in eligible reductions.
+ // - A set of all reduced values (last instructions) in eligible
+ // reductions.
+ void restrictToScale(uint64_t Scale,
+ SmallInstructionSet &PossibleRedSet,
+ SmallInstructionSet &PossibleRedPHISet,
+ SmallInstructionSet &PossibleRedLastSet) {
+ PossibleRedIdx.clear();
+ PossibleRedIter.clear();
+ Reds.clear();
+
+ for (unsigned i = 0, e = PossibleReds.size(); i != e; ++i)
+ if (PossibleReds[i].size() % Scale == 0) {
+ PossibleRedLastSet.insert(PossibleReds[i].getReducedValue());
+ PossibleRedPHISet.insert(PossibleReds[i].getPHI());
+
+ PossibleRedSet.insert(PossibleReds[i].getPHI());
+ PossibleRedIdx[PossibleReds[i].getPHI()] = i;
+ for (Instruction *J : PossibleReds[i]) {
+ PossibleRedSet.insert(J);
+ PossibleRedIdx[J] = i;
+ }
+ }
+ }
+
+ // The functions below are used while processing the loop instructions.
+
+ // Are the two instructions both from reductions, and furthermore, from
+ // the same reduction?
+ bool isPairInSame(Instruction *J1, Instruction *J2) {
+ DenseMap<Instruction *, int>::iterator J1I = PossibleRedIdx.find(J1);
+ if (J1I != PossibleRedIdx.end()) {
+ DenseMap<Instruction *, int>::iterator J2I = PossibleRedIdx.find(J2);
+ if (J2I != PossibleRedIdx.end() && J1I->second == J2I->second)
+ return true;
+ }
+
+ return false;
+ }
+
+ // The two provided instructions, the first from the base iteration, and
+ // the second from iteration i, form a matched pair. If these are part of
+ // a reduction, record that fact.
+ void recordPair(Instruction *J1, Instruction *J2, unsigned i) {
+ if (PossibleRedIdx.count(J1)) {
+ assert(PossibleRedIdx.count(J2) &&
+ "Recording reduction vs. non-reduction instruction?");
+
+ PossibleRedIter[J1] = 0;
+ PossibleRedIter[J2] = i;
+
+ int Idx = PossibleRedIdx[J1];
+ assert(Idx == PossibleRedIdx[J2] &&
+ "Recording pair from different reductions?");
+ Reds.insert(Idx);
+ }
+ }
+
+ // The functions below can be called after we've finished processing all
+ // instructions in the loop, and we know which reductions were selected.
+
+ bool validateSelected();
+ void replaceSelected();
+
+ protected:
+ // The vector of all possible reductions (for any scale).
+ SmallReductionVector PossibleReds;
+
+ DenseMap<Instruction *, int> PossibleRedIdx;
+ DenseMap<Instruction *, int> PossibleRedIter;
+ DenseSet<int> Reds;
+ };
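+
+ // Worked example (illustrative, continuing the hypothetical chain sketched
+ // above): with a reduction of two non-PHI adds, restrictToScale(2, ...)
+ // keeps it because 2 % 2 == 0, putting the PHI into PossibleRedPHISet, the
+ // final add into PossibleRedLastSet, and all three instructions into
+ // PossibleRedSet with PossibleRedIdx pointing at this reduction;
+ // restrictToScale(3, ...) would drop the same chain because 2 % 3 != 0.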
+
+ // A DAGRootSet models an induction variable being used in a rerollable
+ // loop. For example,
+ //
+ // x[i*3+0] = y1
+ // x[i*3+1] = y2
+ // x[i*3+2] = y3
+ //
+ //   Base instruction -> i*3
+ //                  +---+----+
+ //                 /    |     \
+ //             ST[y1]  +1     +2  <-- Roots
+ //                      |      |
+ //                    ST[y2] ST[y3]
+ //
+ // There may be multiple DAGRoots, for example:
+ //
+ // x[i*2+0] = ... (1)
+ // x[i*2+1] = ... (1)
+ // x[i*2+4] = ... (2)
+ // x[i*2+5] = ... (2)
+ // x[(i+1234)*2+5678] = ... (3)
+ // x[(i+1234)*2+5679] = ... (3)
+ //
+ // The loop will be rerolled by adding a new loop induction variable,
+ // one for the Base instruction in each DAGRootSet.
+ //
+ struct DAGRootSet {
+ Instruction *BaseInst;
+ SmallInstructionVector Roots;
+
+ // The instructions between IV and BaseInst (but not including BaseInst).
+ SmallInstructionSet SubsumedInsts;
+ };
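+
+ // Illustrative sketch of the transformation a DAGRootSet models
+ // (hypothetical C source; foo is assumed to have no side effects):
+ //
+ //   for (int i = 0; i < 300; i += 3) {   // before rerolling
+ //     x[i + 0] = foo(0);
+ //     x[i + 1] = foo(0);
+ //     x[i + 2] = foo(0);
+ //   }
+ //
+ //   for (int i = 0; i < 300; ++i)        // after rerolling
+ //     x[i] = foo(0);
+ //
+ // The x[i + 0] address computation acts as the BaseInst, and the +1 and +2
+ // computations are the Roots of a single DAGRootSet.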
+
+ // The set of all DAG roots, and state tracking of all roots
+ // for a particular induction variable.
+ struct DAGRootTracker {
+ DAGRootTracker(LoopReroll *Parent, Loop *L, Instruction *IV,
+ ScalarEvolution *SE, AliasAnalysis *AA,
+ TargetLibraryInfo *TLI, DominatorTree *DT, LoopInfo *LI,
+ bool PreserveLCSSA,
+ DenseMap<Instruction *, int64_t> &IncrMap,
+ Instruction *LoopCtrlIV)
+ : Parent(Parent), L(L), SE(SE), AA(AA), TLI(TLI), DT(DT), LI(LI),
+ PreserveLCSSA(PreserveLCSSA), IV(IV), IVToIncMap(IncrMap),
+ LoopControlIV(LoopCtrlIV) {}
+
+ /// Stage 1: Find all the DAG roots for the induction variable.
+ bool findRoots();
+
+ /// Stage 2: Validate if the found roots are valid.
+ bool validate(ReductionTracker &Reductions);
+
+ /// Stage 3: Assuming validate() returned true, perform the
+ /// replacement.
+ /// @param BackedgeTakenCount The backedge-taken count of L.
+ void replace(const SCEV *BackedgeTakenCount);
+
+ protected:
+ using UsesTy = MapVector<Instruction *, BitVector>;
+
+ void findRootsRecursive(Instruction *IVU,
+ SmallInstructionSet SubsumedInsts);
+ bool findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts);
+ bool collectPossibleRoots(Instruction *Base,
+ std::map<int64_t,Instruction*> &Roots);
+ bool validateRootSet(DAGRootSet &DRS);
+
+ bool collectUsedInstructions(SmallInstructionSet &PossibleRedSet);
+ void collectInLoopUserSet(const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+ void collectInLoopUserSet(Instruction *Root,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users);
+
+ UsesTy::iterator nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI=nullptr);
+ bool isBaseInst(Instruction *I);
+ bool isRootInst(Instruction *I);
+ bool instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End);
+ void replaceIV(DAGRootSet &DRS, const SCEV *Start, const SCEV *IncrExpr);
+
+ LoopReroll *Parent;
+
+ // Members of Parent, replicated here for brevity.
+ Loop *L;
+ ScalarEvolution *SE;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
+ DominatorTree *DT;
+ LoopInfo *LI;
+ bool PreserveLCSSA;
+
+ // The loop induction variable.
+ Instruction *IV;
+
+ // Loop step amount.
+ int64_t Inc;
+
+ // Loop reroll count; if Inc == 1, this records the scaling applied
+ // to the indvar: a[i*2+0] = ...; a[i*2+1] = ... ;
+ // If Inc is not 1, Scale = Inc.
+ uint64_t Scale;
+
+ // The roots themselves.
+ SmallVector<DAGRootSet,16> RootSets;
+
+ // All increment instructions for IV.
+ SmallInstructionVector LoopIncs;
+
+ // Map of all instructions in the loop (in order) to the iterations
+ // they are used in (or specially, IL_All for instructions
+ // used in the loop increment mechanism).
+ UsesTy Uses;
+
+ // Map between induction variable and its increment
+ DenseMap<Instruction *, int64_t> &IVToIncMap;
+
+ Instruction *LoopControlIV;
+ };
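+
+ // Minimal usage sketch (hypothetical; the arguments mirror the members
+ // declared above, and the real sequencing lives behind the reroll
+ // declaration below):
+ //   DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI,
+ //                           PreserveLCSSA, IVToIncMap, LoopControlIV);
+ //   if (!DAGRoots.findRoots()) return false;           // stage 1
+ //   if (!DAGRoots.validate(Reductions)) return false;  // stage 2
+ //   DAGRoots.replace(BackedgeTakenCount);              // stage 3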
+
+ // Check if it is a compare-like instruction whose user is a branch
+ bool isCompareUsedByBranch(Instruction *I) {
+ auto *TI = I->getParent()->getTerminator();
+ if (!isa<BranchInst>(TI) || !isa<CmpInst>(I))
+ return false;
+ return I->hasOneUse() && TI->getOperand(0) == I;
+ };
+
+ bool isLoopControlIV(Loop *L, Instruction *IV);
+ void collectPossibleIVs(Loop *L, SmallInstructionVector &PossibleIVs);
+ void collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions);
+ bool reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount, ReductionTracker &Reductions);
+ };
+
+} // end anonymous namespace
+
char LoopRerollLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(LoopRerollLegacyPass, "loop-reroll", "Reroll loops",
false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(LoopRerollLegacyPass, "loop-reroll", "Reroll loops", false,
false)
-
+
Pass *llvm::createLoopRerollPass() { return new LoopRerollLegacyPass; }
-
-// Returns true if the provided instruction is used outside the given loop.
-// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
-// non-loop blocks to be outside the loop.
-static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
- for (User *U : I->users()) {
- if (!L->contains(cast<Instruction>(U)))
- return true;
- }
- return false;
-}
-
-// Check if an IV is only used to control the loop. There are two cases:
-// 1. It has a single use, which is the loop increment; the increment is used
-// only by the comparison and the PHI (possibly with a sext with nsw in
-// between), and the comparison is used only by the branch.
-// 2. It is used by the loop increment and the comparison; the loop increment
-// is used only by the PHI, and the comparison is used only by the branch.
-bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
- unsigned IVUses = IV->getNumUses();
- if (IVUses != 2 && IVUses != 1)
- return false;
-
- for (auto *User : IV->users()) {
- int32_t IncOrCmpUses = User->getNumUses();
- bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
-
- // User can only have one or two uses.
- if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
- return false;
-
- // Case 1
- if (IVUses == 1) {
- // The only user must be the loop increment.
- // The loop increment must have two uses.
- if (IsCompInst || IncOrCmpUses != 2)
- return false;
- }
-
- // Case 2
- if (IVUses == 2 && IncOrCmpUses != 1)
- return false;
-
- // Each user of the IV must be either a binary operation or a comparison.
- if (auto *BO = dyn_cast<BinaryOperator>(User)) {
- if (BO->getOpcode() == Instruction::Add) {
- // Loop Increment
- // User of Loop Increment should be either PHI or CMP
- for (auto *UU : User->users()) {
- if (PHINode *PN = dyn_cast<PHINode>(UU)) {
- if (PN != IV)
- return false;
- }
- // Must be a CMP or an ext (of a value with nsw) then CMP
- else {
- Instruction *UUser = dyn_cast<Instruction>(UU);
- // Skip SExt if we are extending an nsw value
- // TODO: Allow ZExt too
- if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
- isa<SExtInst>(UUser))
- UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
- if (!isCompareUsedByBranch(UUser))
- return false;
- }
- }
- } else
- return false;
- // Compare : can only have one use, and must be branch
- } else if (!IsCompInst)
- return false;
- }
- return true;
-}
-
-// Collect the list of loop induction variables with respect to which it might
-// be possible to reroll the loop.
-void LoopReroll::collectPossibleIVs(Loop *L,
- SmallInstructionVector &PossibleIVs) {
- BasicBlock *Header = L->getHeader();
- for (BasicBlock::iterator I = Header->begin(),
- IE = Header->getFirstInsertionPt(); I != IE; ++I) {
- if (!isa<PHINode>(I))
- continue;
- if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
- continue;
-
- if (const SCEVAddRecExpr *PHISCEV =
- dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
- if (PHISCEV->getLoop() != L)
- continue;
- if (!PHISCEV->isAffine())
- continue;
- auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
- if (IncSCEV) {
- IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
- LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
- << "\n");
-
- if (isLoopControlIV(L, &*I)) {
- assert(!LoopControlIV && "Found two loop control only IV");
- LoopControlIV = &(*I);
- LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
- << " = " << *PHISCEV << "\n");
- } else
- PossibleIVs.push_back(&*I);
- }
- }
- }
-}
-
-// Add the remainder of the reduction-variable chain to the instruction vector
-// (the initial PHINode has already been added). If successful, the object is
-// marked as valid.
-void LoopReroll::SimpleLoopReduction::add(Loop *L) {
- assert(!Valid && "Cannot add to an already-valid chain");
-
- // The reduction variable must be a chain of single-use instructions
- // (including the PHI), except for the last value (which is used by the PHI
- // and also outside the loop).
- Instruction *C = Instructions.front();
- if (C->user_empty())
- return;
-
- do {
- C = cast<Instruction>(*C->user_begin());
- if (C->hasOneUse()) {
- if (!C->isBinaryOp())
- return;
-
- if (!(isa<PHINode>(Instructions.back()) ||
- C->isSameOperationAs(Instructions.back())))
- return;
-
- Instructions.push_back(C);
- }
- } while (C->hasOneUse());
-
- if (Instructions.size() < 2 ||
- !C->isSameOperationAs(Instructions.back()) ||
- C->use_empty())
- return;
-
- // C is now the (potential) last instruction in the reduction chain.
- for (User *U : C->users()) {
- // The only in-loop user can be the initial PHI.
- if (L->contains(cast<Instruction>(U)))
- if (cast<Instruction>(U) != Instructions.front())
- return;
- }
-
- Instructions.push_back(C);
- Valid = true;
-}
-
-// Collect the vector of possible reduction variables.
-void LoopReroll::collectPossibleReductions(Loop *L,
- ReductionTracker &Reductions) {
- BasicBlock *Header = L->getHeader();
- for (BasicBlock::iterator I = Header->begin(),
- IE = Header->getFirstInsertionPt(); I != IE; ++I) {
- if (!isa<PHINode>(I))
- continue;
- if (!I->getType()->isSingleValueType())
- continue;
-
- SimpleLoopReduction SLR(&*I, L);
- if (!SLR.valid())
- continue;
-
- LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
- << SLR.size() << " chained instructions)\n");
- Reductions.addSLR(SLR);
- }
-}
-
-// Collect the set of all users of the provided root instruction. This set of
-// users contains not only the direct users of the root instruction, but also
-// all users of those users, and so on. There are two exceptions:
-//
-// 1. Instructions in the set of excluded instructions are never added to the
-// use set (even if they are users). This is used, for example, to keep the
-// root increments out of the use set of the primary IV.
-//
-// 2. Instructions in the set of final instructions are added to the use set
-// if they are users, but their users are not added. This is used, for
-// example, to prevent a reduction update from forcing all later reduction
-// updates into the use set.
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- Instruction *Root, const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- SmallInstructionVector Queue(1, Root);
- while (!Queue.empty()) {
- Instruction *I = Queue.pop_back_val();
- if (!Users.insert(I).second)
- continue;
-
- if (!Final.count(I))
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- if (PHINode *PN = dyn_cast<PHINode>(User)) {
- // Ignore "wrap-around" uses to PHIs of this loop's header.
- if (PN->getIncomingBlock(U) == L->getHeader())
- continue;
- }
-
- if (L->contains(User) && !Exclude.count(User)) {
- Queue.push_back(User);
- }
- }
-
- // We also want to collect single-user "feeder" values.
- for (User::op_iterator OI = I->op_begin(),
- OIE = I->op_end(); OI != OIE; ++OI) {
- if (Instruction *Op = dyn_cast<Instruction>(*OI))
- if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
- !Final.count(Op))
- Queue.push_back(Op);
- }
- }
-}
-
-// Collect all of the users of all of the provided root instructions (combined
-// into a single set).
-void LoopReroll::DAGRootTracker::collectInLoopUserSet(
- const SmallInstructionVector &Roots,
- const SmallInstructionSet &Exclude,
- const SmallInstructionSet &Final,
- DenseSet<Instruction *> &Users) {
- for (Instruction *Root : Roots)
- collectInLoopUserSet(Root, Exclude, Final, Users);
-}
-
-static bool isUnorderedLoadStore(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isUnordered();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isUnordered();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return !MI->isVolatile();
- return false;
-}
-
-/// Return true if IVU is a "simple" arithmetic operation.
-/// This is used for narrowing the search space for DAGRoots; only arithmetic
-/// and GEPs can be part of a DAGRoot.
-static bool isSimpleArithmeticOp(User *IVU) {
- if (Instruction *I = dyn_cast<Instruction>(IVU)) {
- switch (I->getOpcode()) {
- default: return false;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::Shl:
- case Instruction::AShr:
- case Instruction::LShr:
- case Instruction::GetElementPtr:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- return true;
- }
- }
- return false;
-}
-
-static bool isLoopIncrement(User *U, Instruction *IV) {
- BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
-
- if ((BO && BO->getOpcode() != Instruction::Add) ||
- (!BO && !isa<GetElementPtrInst>(U)))
- return false;
-
- for (auto *UU : U->users()) {
- PHINode *PN = dyn_cast<PHINode>(UU);
- if (PN && PN == IV)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::
-collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
- SmallInstructionVector BaseUsers;
-
- for (auto *I : Base->users()) {
- ConstantInt *CI = nullptr;
-
- if (isLoopIncrement(I, IV)) {
- LoopIncs.push_back(cast<Instruction>(I));
- continue;
- }
-
- // The root nodes must be either GEPs, ORs or ADDs.
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Or)
- CI = dyn_cast<ConstantInt>(BO->getOperand(1));
- } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
- CI = dyn_cast<ConstantInt>(LastOperand);
- }
-
- if (!CI) {
- if (Instruction *II = dyn_cast<Instruction>(I)) {
- BaseUsers.push_back(II);
- continue;
- } else {
- LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
- << "\n");
- return false;
- }
- }
-
- int64_t V = std::abs(CI->getValue().getSExtValue());
- if (Roots.find(V) != Roots.end())
- // No duplicates, please.
- return false;
-
- Roots[V] = cast<Instruction>(I);
- }
-
- // Make sure we have at least two roots.
- if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
- return false;
-
- // If we found non-loop-inc, non-root users of Base, assume they are
- // for the zeroth root index. This is because "add %a, 0" gets optimized
- // away.
- if (BaseUsers.size()) {
- if (Roots.find(0) != Roots.end()) {
- LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
- return false;
- }
- Roots[0] = Base;
- }
-
- // Calculate the number of users of the base, or lowest indexed, iteration.
- unsigned NumBaseUses = BaseUsers.size();
- if (NumBaseUses == 0)
- NumBaseUses = Roots.begin()->second->getNumUses();
-
- // Check that every node has the same number of users.
- for (auto &KV : Roots) {
- if (KV.first == 0)
- continue;
- if (!KV.second->hasNUses(NumBaseUses)) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
- << "#Base=" << NumBaseUses
- << ", #Root=" << KV.second->getNumUses() << "\n");
- return false;
- }
- }
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::
-findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
- // Does the user look like it could be part of a root set?
- // All its users must be simple arithmetic ops.
- if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
- return;
-
- if (I != IV && findRootsBase(I, SubsumedInsts))
- return;
-
- SubsumedInsts.insert(I);
-
- for (User *V : I->users()) {
- Instruction *I = cast<Instruction>(V);
- if (is_contained(LoopIncs, I))
- continue;
-
- if (!isSimpleArithmeticOp(I))
- continue;
-
- // The recursive call makes a copy of SubsumedInsts.
- findRootsRecursive(I, SubsumedInsts);
- }
-}
-
-bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
- if (DRS.Roots.empty())
- return false;
-
- // If the value of the base instruction is used outside the loop, we cannot
- // reroll the loop. Checking the other root instructions is unnecessary
- // because they do not match any base instruction if their values are used
- // outside the loop.
- if (hasUsesOutsideLoop(DRS.BaseInst, L))
- return false;
-
- // Consider a DAGRootSet with N-1 roots (so N different values including
- // BaseInst).
- // Define d = Roots[0] - BaseInst, which should be the same as
- // Roots[I] - Roots[I-1] for all I in [1..N).
- // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
- // loop iteration J.
- //
- // Now, for the loop iterations to be consecutive:
- // D = d * N
- const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- if (!ADR)
- return false;
-
- // Check that the first root is evenly spaced.
- unsigned N = DRS.Roots.size() + 1;
- const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
- const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
- if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
- return false;
-
- // Check that the remaining roots are evenly spaced.
- for (unsigned i = 1; i < N - 1; ++i) {
- const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
- SE->getSCEV(DRS.Roots[i-1]));
- if (NewStepSCEV != StepSCEV)
- return false;
- }
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::
-findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
- // The base of a RootSet must be an AddRec, so it can be erased.
- const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
- if (!IVU_ADR || IVU_ADR->getLoop() != L)
- return false;
-
- std::map<int64_t, Instruction*> V;
- if (!collectPossibleRoots(IVU, V))
- return false;
-
- // If we didn't get a root for index zero, then IVU must be
- // subsumed.
- if (V.find(0) == V.end())
- SubsumedInsts.insert(IVU);
-
- // Partition the vector into monotonically increasing indexes.
- DAGRootSet DRS;
- DRS.BaseInst = nullptr;
-
- SmallVector<DAGRootSet, 16> PotentialRootSets;
-
- for (auto &KV : V) {
- if (!DRS.BaseInst) {
- DRS.BaseInst = KV.second;
- DRS.SubsumedInsts = SubsumedInsts;
- } else if (DRS.Roots.empty()) {
- DRS.Roots.push_back(KV.second);
- } else if (V.find(KV.first - 1) != V.end()) {
- DRS.Roots.push_back(KV.second);
- } else {
- // Linear sequence terminated.
- if (!validateRootSet(DRS))
- return false;
-
- // Construct a new DAGRootSet with the next sequence.
- PotentialRootSets.push_back(DRS);
- DRS.BaseInst = KV.second;
- DRS.Roots.clear();
- }
- }
-
- if (!validateRootSet(DRS))
- return false;
-
- PotentialRootSets.push_back(DRS);
-
- RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::findRoots() {
- Inc = IVToIncMap[IV];
-
- assert(RootSets.empty() && "Unclean state!");
- if (std::abs(Inc) == 1) {
- for (auto *IVU : IV->users()) {
- if (isLoopIncrement(IVU, IV))
- LoopIncs.push_back(cast<Instruction>(IVU));
- }
- findRootsRecursive(IV, SmallInstructionSet());
- LoopIncs.push_back(IV);
- } else {
- if (!findRootsBase(IV, SmallInstructionSet()))
- return false;
- }
-
- // Ensure all sets have the same size.
- if (RootSets.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
- return false;
- }
- for (auto &V : RootSets) {
- if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
- LLVM_DEBUG(
- dbgs()
- << "LRR: Aborting because not all root sets have the same size\n");
- return false;
- }
- }
-
- Scale = RootSets[0].Roots.size() + 1;
-
- if (Scale > IL_MaxRerollIterations) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
- << "#Found=" << Scale
- << ", #Max=" << IL_MaxRerollIterations << "\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
- << "\n");
-
- return true;
-}
-
-bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
- // Populate the MapVector with all instructions in the block, in order first,
- // so we can iterate over the contents later in perfect order.
- for (auto &I : *L->getHeader()) {
- Uses[&I].resize(IL_End);
- }
-
- SmallInstructionSet Exclude;
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
- Exclude.insert(LoopIncs.begin(), LoopIncs.end());
-
- for (auto &DRS : RootSets) {
- DenseSet<Instruction*> VBase;
- collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
- for (auto *I : VBase) {
- Uses[I].set(0);
- }
-
- unsigned Idx = 1;
- for (auto *Root : DRS.Roots) {
- DenseSet<Instruction*> V;
- collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
-
- // While we're here, check the use sets are the same size.
- if (V.size() != VBase.size()) {
- LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
- return false;
- }
-
- for (auto *I : V) {
- Uses[I].set(Idx);
- }
- ++Idx;
- }
-
- // Make sure our subsumed instructions are remembered too.
- for (auto *I : DRS.SubsumedInsts) {
- Uses[I].set(IL_All);
- }
- }
-
- // Make sure the loop increments are also accounted for.
-
- Exclude.clear();
- for (auto &DRS : RootSets) {
- Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
- Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
- Exclude.insert(DRS.BaseInst);
- }
-
- DenseSet<Instruction*> V;
- collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
- for (auto *I : V) {
+
+// Returns true if the provided instruction is used outside the given loop.
+// This operates like Instruction::isUsedOutsideOfBlock, but considers PHIs in
+// non-loop blocks to be outside the loop.
+static bool hasUsesOutsideLoop(Instruction *I, Loop *L) {
+ for (User *U : I->users()) {
+ if (!L->contains(cast<Instruction>(U)))
+ return true;
+ }
+ return false;
+}
+
+// Check if an IV is only used to control the loop. There are two cases:
+// 1. It has a single use, which is the loop increment; the increment is used
+// only by the comparison and the PHI (possibly with a sext with nsw in
+// between), and the comparison is used only by the branch.
+// 2. It is used by the loop increment and the comparison; the loop increment
+// is used only by the PHI, and the comparison is used only by the branch.
+bool LoopReroll::isLoopControlIV(Loop *L, Instruction *IV) {
+ unsigned IVUses = IV->getNumUses();
+ if (IVUses != 2 && IVUses != 1)
+ return false;
+
+ for (auto *User : IV->users()) {
+ int32_t IncOrCmpUses = User->getNumUses();
+ bool IsCompInst = isCompareUsedByBranch(cast<Instruction>(User));
+
+ // User can only have one or two uses.
+ if (IncOrCmpUses != 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Case 1
+ if (IVUses == 1) {
+ // The only user must be the loop increment.
+ // The loop increment must have two uses.
+ if (IsCompInst || IncOrCmpUses != 2)
+ return false;
+ }
+
+ // Case 2
+ if (IVUses == 2 && IncOrCmpUses != 1)
+ return false;
+
+ // Each user of the IV must be either a binary operation or a comparison.
+ if (auto *BO = dyn_cast<BinaryOperator>(User)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Loop Increment
+ // User of Loop Increment should be either PHI or CMP
+ for (auto *UU : User->users()) {
+ if (PHINode *PN = dyn_cast<PHINode>(UU)) {
+ if (PN != IV)
+ return false;
+ }
+ // Must be a CMP or an ext (of a value with nsw) then CMP
+ else {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // Skip SExt if we are extending an nsw value
+ // TODO: Allow ZExt too
+ if (BO->hasNoSignedWrap() && UUser && UUser->hasOneUse() &&
+ isa<SExtInst>(UUser))
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ if (!isCompareUsedByBranch(UUser))
+ return false;
+ }
+ }
+ } else
+ return false;
+ // Compare : can only have one use, and must be branch
+ } else if (!IsCompInst)
+ return false;
+ }
+ return true;
+}
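+
+// Illustrative sketch of the two accepted shapes (hypothetical IR, value
+// names invented). Case 1: the IV's only user is the increment, which feeds
+// both the PHI and the exit compare:
+//   %iv      = phi i32 [ 0, %ph ], [ %iv.next, %body ]
+//   %iv.next = add nsw i32 %iv, 1
+//   %cmp     = icmp eq i32 %iv.next, %n
+//   br i1 %cmp, label %exit, label %body
+// Case 2: the IV itself feeds both the increment and the compare, i.e. the
+// compare is "%cmp = icmp eq i32 %iv, %n" instead. In both cases nothing
+// outside the loop-control chain uses the IV.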
+
+// Collect the list of loop induction variables with respect to which it might
+// be possible to reroll the loop.
+void LoopReroll::collectPossibleIVs(Loop *L,
+ SmallInstructionVector &PossibleIVs) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isIntegerTy() && !I->getType()->isPointerTy())
+ continue;
+
+ if (const SCEVAddRecExpr *PHISCEV =
+ dyn_cast<SCEVAddRecExpr>(SE->getSCEV(&*I))) {
+ if (PHISCEV->getLoop() != L)
+ continue;
+ if (!PHISCEV->isAffine())
+ continue;
+ auto IncSCEV = dyn_cast<SCEVConstant>(PHISCEV->getStepRecurrence(*SE));
+ if (IncSCEV) {
+ IVToIncMap[&*I] = IncSCEV->getValue()->getSExtValue();
+ LLVM_DEBUG(dbgs() << "LRR: Possible IV: " << *I << " = " << *PHISCEV
+ << "\n");
+
+ if (isLoopControlIV(L, &*I)) {
+ assert(!LoopControlIV && "Found two loop control only IV");
+ LoopControlIV = &(*I);
+ LLVM_DEBUG(dbgs() << "LRR: Possible loop control only IV: " << *I
+ << " = " << *PHISCEV << "\n");
+ } else
+ PossibleIVs.push_back(&*I);
+ }
+ }
+ }
+}
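+
+// Example of what qualifies (illustrative; names invented): a header PHI
+// whose SCEV is an affine AddRec for this loop with a constant step, e.g.
+//   %i = phi i64 [ 0, %preheader ], [ %i.next, %body ]   ; SCEV: {0,+,3}<%L>
+// is recorded with IVToIncMap[%i] == 3, whereas a PHI whose step is not a
+// constant (e.g. {0,+,%m}<%L>) is skipped.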
+
+// Add the remainder of the reduction-variable chain to the instruction vector
+// (the initial PHINode has already been added). If successful, the object is
+// marked as valid.
+void LoopReroll::SimpleLoopReduction::add(Loop *L) {
+ assert(!Valid && "Cannot add to an already-valid chain");
+
+ // The reduction variable must be a chain of single-use instructions
+ // (including the PHI), except for the last value (which is used by the PHI
+ // and also outside the loop).
+ Instruction *C = Instructions.front();
+ if (C->user_empty())
+ return;
+
+ do {
+ C = cast<Instruction>(*C->user_begin());
+ if (C->hasOneUse()) {
+ if (!C->isBinaryOp())
+ return;
+
+ if (!(isa<PHINode>(Instructions.back()) ||
+ C->isSameOperationAs(Instructions.back())))
+ return;
+
+ Instructions.push_back(C);
+ }
+ } while (C->hasOneUse());
+
+ if (Instructions.size() < 2 ||
+ !C->isSameOperationAs(Instructions.back()) ||
+ C->use_empty())
+ return;
+
+ // C is now the (potential) last instruction in the reduction chain.
+ for (User *U : C->users()) {
+ // The only in-loop user can be the initial PHI.
+ if (L->contains(cast<Instruction>(U)))
+ if (cast<Instruction>(U) != Instructions.front())
+ return;
+ }
+
+ Instructions.push_back(C);
+ Valid = true;
+}
+
+// Collect the vector of possible reduction variables.
+void LoopReroll::collectPossibleReductions(Loop *L,
+ ReductionTracker &Reductions) {
+ BasicBlock *Header = L->getHeader();
+ for (BasicBlock::iterator I = Header->begin(),
+ IE = Header->getFirstInsertionPt(); I != IE; ++I) {
+ if (!isa<PHINode>(I))
+ continue;
+ if (!I->getType()->isSingleValueType())
+ continue;
+
+ SimpleLoopReduction SLR(&*I, L);
+ if (!SLR.valid())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "LRR: Possible reduction: " << *I << " (with "
+ << SLR.size() << " chained instructions)\n");
+ Reductions.addSLR(SLR);
+ }
+}
+
+// Collect the set of all users of the provided root instruction. This set of
+// users contains not only the direct users of the root instruction, but also
+// all users of those users, and so on. There are two exceptions:
+//
+// 1. Instructions in the set of excluded instructions are never added to the
+// use set (even if they are users). This is used, for example, to keep the
+// root increments out of the use set of the primary IV.
+//
+// 2. Instructions in the set of final instructions are added to the use set
+// if they are users, but their users are not added. This is used, for
+// example, to prevent a reduction update from forcing all later reduction
+// updates into the use set.
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ Instruction *Root, const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ SmallInstructionVector Queue(1, Root);
+ while (!Queue.empty()) {
+ Instruction *I = Queue.pop_back_val();
+ if (!Users.insert(I).second)
+ continue;
+
+ if (!Final.count(I))
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ // Ignore "wrap-around" uses to PHIs of this loop's header.
+ if (PN->getIncomingBlock(U) == L->getHeader())
+ continue;
+ }
+
+ if (L->contains(User) && !Exclude.count(User)) {
+ Queue.push_back(User);
+ }
+ }
+
+ // We also want to collect single-user "feeder" values.
+ for (User::op_iterator OI = I->op_begin(),
+ OIE = I->op_end(); OI != OIE; ++OI) {
+ if (Instruction *Op = dyn_cast<Instruction>(*OI))
+ if (Op->hasOneUse() && L->contains(Op) && !Exclude.count(Op) &&
+ !Final.count(Op))
+ Queue.push_back(Op);
+ }
+ }
+}
+
+// Collect all of the users of all of the provided root instructions (combined
+// into a single set).
+void LoopReroll::DAGRootTracker::collectInLoopUserSet(
+ const SmallInstructionVector &Roots,
+ const SmallInstructionSet &Exclude,
+ const SmallInstructionSet &Final,
+ DenseSet<Instruction *> &Users) {
+ for (Instruction *Root : Roots)
+ collectInLoopUserSet(Root, Exclude, Final, Users);
+}
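+
+// Worked example (illustrative; value names invented):
+//   %g   = getelementptr i32, i32* %x, i64 %idx   ; the Root
+//   %ld  = load i32, i32* %g
+//   %red = add i32 %phi, %ld                      ; listed in Final
+// collectInLoopUserSet(%g, Exclude, Final, Users) yields { %g, %ld, %red }
+// (plus %idx when it is a single-use in-loop feeder); the users of %red are
+// not visited, and anything in Exclude is never added at all.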
+
+static bool isUnorderedLoadStore(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isUnordered();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isUnordered();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return false;
+}
+
+/// Return true if IVU is a "simple" arithmetic operation.
+/// This is used for narrowing the search space for DAGRoots; only arithmetic
+/// and GEPs can be part of a DAGRoot.
+static bool isSimpleArithmeticOp(User *IVU) {
+ if (Instruction *I = dyn_cast<Instruction>(IVU)) {
+ switch (I->getOpcode()) {
+ default: return false;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr:
+ case Instruction::GetElementPtr:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool isLoopIncrement(User *U, Instruction *IV) {
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(U);
+
+ if ((BO && BO->getOpcode() != Instruction::Add) ||
+ (!BO && !isa<GetElementPtrInst>(U)))
+ return false;
+
+ for (auto *UU : U->users()) {
+ PHINode *PN = dyn_cast<PHINode>(UU);
+ if (PN && PN == IV)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::
+collectPossibleRoots(Instruction *Base, std::map<int64_t,Instruction*> &Roots) {
+ SmallInstructionVector BaseUsers;
+
+ for (auto *I : Base->users()) {
+ ConstantInt *CI = nullptr;
+
+ if (isLoopIncrement(I, IV)) {
+ LoopIncs.push_back(cast<Instruction>(I));
+ continue;
+ }
+
+ // The root nodes must be either GEPs, ORs or ADDs.
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Or)
+ CI = dyn_cast<ConstantInt>(BO->getOperand(1));
+ } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ Value *LastOperand = GEP->getOperand(GEP->getNumOperands()-1);
+ CI = dyn_cast<ConstantInt>(LastOperand);
+ }
+
+ if (!CI) {
+ if (Instruction *II = dyn_cast<Instruction>(I)) {
+ BaseUsers.push_back(II);
+ continue;
+ } else {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting due to non-instruction: " << *I
+ << "\n");
+ return false;
+ }
+ }
+
+ int64_t V = std::abs(CI->getValue().getSExtValue());
+ if (Roots.find(V) != Roots.end())
+ // No duplicates, please.
+ return false;
+
+ Roots[V] = cast<Instruction>(I);
+ }
+
+ // Make sure we have at least two roots.
+ if (Roots.empty() || (Roots.size() == 1 && BaseUsers.empty()))
+ return false;
+
+ // If we found non-loop-inc, non-root users of Base, assume they are
+ // for the zeroth root index. This is because "add %a, 0" gets optimized
+ // away.
+ if (BaseUsers.size()) {
+ if (Roots.find(0) != Roots.end()) {
+ LLVM_DEBUG(dbgs() << "LRR: Multiple roots found for base - aborting!\n");
+ return false;
+ }
+ Roots[0] = Base;
+ }
+
+ // Calculate the number of users of the base, or lowest indexed, iteration.
+ unsigned NumBaseUses = BaseUsers.size();
+ if (NumBaseUses == 0)
+ NumBaseUses = Roots.begin()->second->getNumUses();
+
+ // Check that every node has the same number of users.
+ for (auto &KV : Roots) {
+ if (KV.first == 0)
+ continue;
+ if (!KV.second->hasNUses(NumBaseUses)) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - Root and Base #users not the same: "
+ << "#Base=" << NumBaseUses
+ << ", #Root=" << KV.second->getNumUses() << "\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::
+findRootsRecursive(Instruction *I, SmallInstructionSet SubsumedInsts) {
+ // Does the user look like it could be part of a root set?
+ // All its users must be simple arithmetic ops.
+ if (I->hasNUsesOrMore(IL_MaxRerollIterations + 1))
+ return;
+
+ if (I != IV && findRootsBase(I, SubsumedInsts))
+ return;
+
+ SubsumedInsts.insert(I);
+
+ for (User *V : I->users()) {
+ Instruction *I = cast<Instruction>(V);
+ if (is_contained(LoopIncs, I))
+ continue;
+
+ if (!isSimpleArithmeticOp(I))
+ continue;
+
+ // The recursive call makes a copy of SubsumedInsts.
+ findRootsRecursive(I, SubsumedInsts);
+ }
+}
+
+bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
+ if (DRS.Roots.empty())
+ return false;
+
+ // If the value of the base instruction is used outside the loop, we cannot
+ // reroll the loop. Checking the other root instructions is unnecessary
+ // because they do not match any base instruction if their values are used
+ // outside the loop.
+ if (hasUsesOutsideLoop(DRS.BaseInst, L))
+ return false;
+
+ // Consider a DAGRootSet with N-1 roots (so N different values including
+ // BaseInst).
+ // Define d = Roots[0] - BaseInst, which should be the same as
+ // Roots[I] - Roots[I-1] for all I in [1..N).
+ // Define D = BaseInst@J - BaseInst@J-1, where "@J" means the value at the
+ // loop iteration J.
+ //
+ // Now, for the loop iterations to be consecutive:
+ // D = d * N
+ const auto *ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ if (!ADR)
+ return false;
+
+ // Check that the first root is evenly spaced.
+ unsigned N = DRS.Roots.size() + 1;
+ const SCEV *StepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), ADR);
+ const SCEV *ScaleSCEV = SE->getConstant(StepSCEV->getType(), N);
+ if (ADR->getStepRecurrence(*SE) != SE->getMulExpr(StepSCEV, ScaleSCEV))
+ return false;
+
+ // Check that the remaining roots are evenly spaced.
+ for (unsigned i = 1; i < N - 1; ++i) {
+ const SCEV *NewStepSCEV = SE->getMinusSCEV(SE->getSCEV(DRS.Roots[i]),
+ SE->getSCEV(DRS.Roots[i-1]));
+ if (NewStepSCEV != StepSCEV)
+ return false;
+ }
+
+ return true;
+}
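+
+// Worked example (illustrative): for the x[i*3 + {0,1,2}] pattern described
+// above, BaseInst (i*3) has an AddRec step of 3, so D == 3; Roots[0] is one
+// past BaseInst, so d == 1; with one base and two roots, N == 3, and
+// D == d * N holds (3 == 1 * 3), so the set is accepted. A root one element
+// further out (i*3+4 instead of i*3+2) would break the even spacing and be
+// rejected by the per-root check above.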
+
+bool LoopReroll::DAGRootTracker::
+findRootsBase(Instruction *IVU, SmallInstructionSet SubsumedInsts) {
+ // The base of a RootSet must be an AddRec, so it can be erased.
+ const auto *IVU_ADR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IVU));
+ if (!IVU_ADR || IVU_ADR->getLoop() != L)
+ return false;
+
+ std::map<int64_t, Instruction*> V;
+ if (!collectPossibleRoots(IVU, V))
+ return false;
+
+ // If we didn't get a root for index zero, then IVU must be
+ // subsumed.
+ if (V.find(0) == V.end())
+ SubsumedInsts.insert(IVU);
+
+ // Partition the vector into monotonically increasing indexes.
+ DAGRootSet DRS;
+ DRS.BaseInst = nullptr;
+
+ SmallVector<DAGRootSet, 16> PotentialRootSets;
+
+ for (auto &KV : V) {
+ if (!DRS.BaseInst) {
+ DRS.BaseInst = KV.second;
+ DRS.SubsumedInsts = SubsumedInsts;
+ } else if (DRS.Roots.empty()) {
+ DRS.Roots.push_back(KV.second);
+ } else if (V.find(KV.first - 1) != V.end()) {
+ DRS.Roots.push_back(KV.second);
+ } else {
+ // Linear sequence terminated.
+ if (!validateRootSet(DRS))
+ return false;
+
+ // Construct a new DAGRootSet with the next sequence.
+ PotentialRootSets.push_back(DRS);
+ DRS.BaseInst = KV.second;
+ DRS.Roots.clear();
+ }
+ }
+
+ if (!validateRootSet(DRS))
+ return false;
+
+ PotentialRootSets.push_back(DRS);
+
+ RootSets.append(PotentialRootSets.begin(), PotentialRootSets.end());
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::findRoots() {
+ Inc = IVToIncMap[IV];
+
+ assert(RootSets.empty() && "Unclean state!");
+ if (std::abs(Inc) == 1) {
+ for (auto *IVU : IV->users()) {
+ if (isLoopIncrement(IVU, IV))
+ LoopIncs.push_back(cast<Instruction>(IVU));
+ }
+ findRootsRecursive(IV, SmallInstructionSet());
+ LoopIncs.push_back(IV);
+ } else {
+ if (!findRootsBase(IV, SmallInstructionSet()))
+ return false;
+ }
+
+ // Ensure all sets have the same size.
+ if (RootSets.empty()) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting because no root sets found!\n");
+ return false;
+ }
+ for (auto &V : RootSets) {
+ if (V.Roots.empty() || V.Roots.size() != RootSets[0].Roots.size()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LRR: Aborting because not all root sets have the same size\n");
+ return false;
+ }
+ }
+
+ Scale = RootSets[0].Roots.size() + 1;
+
+ if (Scale > IL_MaxRerollIterations) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - too many iterations found. "
+ << "#Found=" << Scale
+ << ", #Max=" << IL_MaxRerollIterations << "\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LRR: Successfully found roots: Scale=" << Scale
+ << "\n");
+
+ return true;
+}
+
+bool LoopReroll::DAGRootTracker::collectUsedInstructions(SmallInstructionSet &PossibleRedSet) {
+ // Populate the MapVector with all instructions in the block, in order first,
+ // so we can iterate over the contents later in perfect order.
+ for (auto &I : *L->getHeader()) {
+ Uses[&I].resize(IL_End);
+ }
+
+ SmallInstructionSet Exclude;
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+ Exclude.insert(LoopIncs.begin(), LoopIncs.end());
+
+ for (auto &DRS : RootSets) {
+ DenseSet<Instruction*> VBase;
+ collectInLoopUserSet(DRS.BaseInst, Exclude, PossibleRedSet, VBase);
+ for (auto *I : VBase) {
+ Uses[I].set(0);
+ }
+
+ unsigned Idx = 1;
+ for (auto *Root : DRS.Roots) {
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(Root, Exclude, PossibleRedSet, V);
+
+ // While we're here, check the use sets are the same size.
+ if (V.size() != VBase.size()) {
+ LLVM_DEBUG(dbgs() << "LRR: Aborting - use sets are different sizes\n");
+ return false;
+ }
+
+ for (auto *I : V) {
+ Uses[I].set(Idx);
+ }
+ ++Idx;
+ }
+
+ // Make sure our subsumed instructions are remembered too.
+ for (auto *I : DRS.SubsumedInsts) {
+ Uses[I].set(IL_All);
+ }
+ }
+
+ // Make sure the loop increments are also accounted for.
+
+ Exclude.clear();
+ for (auto &DRS : RootSets) {
+ Exclude.insert(DRS.Roots.begin(), DRS.Roots.end());
+ Exclude.insert(DRS.SubsumedInsts.begin(), DRS.SubsumedInsts.end());
+ Exclude.insert(DRS.BaseInst);
+ }
+
+ DenseSet<Instruction*> V;
+ collectInLoopUserSet(LoopIncs, Exclude, PossibleRedSet, V);
+ for (auto *I : V) {
if (I->mayHaveSideEffects()) {
LLVM_DEBUG(dbgs() << "LRR: Aborting - "
<< "An instruction which does not belong to any root "
<< "sets must not have side effects: " << *I);
return false;
}
- Uses[I].set(IL_All);
- }
-
- return true;
-}
-
-/// Get the next instruction in "In" that is a member of set Val.
-/// Start searching from StartI, and do not return anything in Exclude.
-/// If StartI is not given, start from In.begin().
-LoopReroll::DAGRootTracker::UsesTy::iterator
-LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
- const SmallInstructionSet &Exclude,
- UsesTy::iterator *StartI) {
- UsesTy::iterator I = StartI ? *StartI : In.begin();
- while (I != In.end() && (I->second.test(Val) == 0 ||
+ Uses[I].set(IL_All);
+ }
+
+ return true;
+}
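+
+// Illustrative example of the resulting Uses map (hypothetical, for the
+// x[i*3 + {0,1,2}] pattern with Scale == 3): instructions reachable only
+// from BaseInst get bit 0 set, those reachable from the +1 and +2 roots get
+// bits 1 and 2 respectively, and the loop increments and subsumed
+// instructions are marked IL_All, i.e. shared by every unrolled iteration.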
+
+/// Get the next instruction in "In" that is a member of set Val.
+/// Start searching from StartI, and do not return anything in Exclude.
+/// If StartI is not given, start from In.begin().
+LoopReroll::DAGRootTracker::UsesTy::iterator
+LoopReroll::DAGRootTracker::nextInstr(int Val, UsesTy &In,
+ const SmallInstructionSet &Exclude,
+ UsesTy::iterator *StartI) {
+ UsesTy::iterator I = StartI ? *StartI : In.begin();
+ while (I != In.end() && (I->second.test(Val) == 0 ||
Exclude.contains(I->first)))
- ++I;
- return I;
-}
-
-bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (DRS.BaseInst == I)
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
- for (auto &DRS : RootSets) {
- if (is_contained(DRS.Roots, I))
- return true;
- }
- return false;
-}
-
-/// Return true if instruction I depends on any instruction between
-/// Start and End.
-bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
- UsesTy::iterator Start,
- UsesTy::iterator End) {
- for (auto *U : I->users()) {
- for (auto It = Start; It != End; ++It)
- if (U == It->first)
- return true;
- }
- return false;
-}
-
-static bool isIgnorableInst(const Instruction *I) {
- if (isa<DbgInfoIntrinsic>(I))
- return true;
- const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::annotation:
- case Intrinsic::ptr_annotation:
- case Intrinsic::var_annotation:
- // TODO: the following intrinsics may also be allowed:
- // lifetime_start, lifetime_end, invariant_start, invariant_end
- return true;
- }
- return false;
-}
-
-bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
- // We now need to check for equivalence of the use graph of each root with
- // that of the primary induction variable (excluding the roots). Our goal
- // here is not to solve the full graph isomorphism problem, but rather to
- // catch common cases without a lot of work. As a result, we will assume
- // that the relative order of the instructions in each unrolled iteration
- // is the same (although we will not make an assumption about how the
- // different iterations are intermixed). Note that while the order must be
- // the same, the instructions may not be in the same basic block.
-
- // An array of just the possible reductions for this scale factor. When we
- // collect the set of all users of some root instructions, these reduction
- // instructions are treated as 'final' (their uses are not considered).
- // This is important because we don't want the root use set to search down
- // the reduction chain.
- SmallInstructionSet PossibleRedSet;
- SmallInstructionSet PossibleRedLastSet;
- SmallInstructionSet PossibleRedPHISet;
- Reductions.restrictToScale(Scale, PossibleRedSet,
- PossibleRedPHISet, PossibleRedLastSet);
-
- // Populate "Uses" with where each instruction is used.
- if (!collectUsedInstructions(PossibleRedSet))
- return false;
-
- // Make sure we mark the reduction PHIs as used in all iterations.
- for (auto *I : PossibleRedPHISet) {
- Uses[I].set(IL_All);
- }
-
- // Make sure we mark loop-control-only PHIs as used in all iterations. See
- // comment above LoopReroll::isLoopControlIV for more information.
- BasicBlock *Header = L->getHeader();
- if (LoopControlIV && LoopControlIV != IV) {
- for (auto *U : LoopControlIV->users()) {
- Instruction *IVUser = dyn_cast<Instruction>(U);
- // IVUser could be loop increment or compare
- Uses[IVUser].set(IL_All);
- for (auto *UU : IVUser->users()) {
- Instruction *UUser = dyn_cast<Instruction>(UU);
- // UUser could be compare, PHI or branch
- Uses[UUser].set(IL_All);
- // Skip SExt
- if (isa<SExtInst>(UUser)) {
- UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
- Uses[UUser].set(IL_All);
- }
- // Is UUser a compare instruction?
- if (UU->hasOneUse()) {
- Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
- if (BI == cast<BranchInst>(Header->getTerminator()))
- Uses[BI].set(IL_All);
- }
- }
- }
- }
-
- // Make sure all instructions in the loop are in one and only one
- // set.
- for (auto &KV : Uses) {
- if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
- LLVM_DEBUG(
- dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
- << *KV.first << " (#uses=" << KV.second.count() << ")\n");
- return false;
- }
- }
-
- LLVM_DEBUG(for (auto &KV
- : Uses) {
- dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
- });
-
- for (unsigned Iter = 1; Iter < Scale; ++Iter) {
- // In addition to regular aliasing information, we need to look for
- // instructions from later (future) iterations that have side effects
- // preventing us from reordering them past other instructions with side
- // effects.
- bool FutureSideEffects = false;
- AliasSetTracker AST(*AA);
- // The map between instructions in f(%iv.(i+1)) and f(%iv).
- DenseMap<Value *, Value *> BaseMap;
-
- // Compare iteration Iter to the base.
- SmallInstructionSet Visited;
- auto BaseIt = nextInstr(0, Uses, Visited);
- auto RootIt = nextInstr(Iter, Uses, Visited);
- auto LastRootIt = Uses.begin();
-
- while (BaseIt != Uses.end() && RootIt != Uses.end()) {
- Instruction *BaseInst = BaseIt->first;
- Instruction *RootInst = RootIt->first;
-
- // Skip over the IV or root instructions; only match their users.
- bool Continue = false;
- if (isBaseInst(BaseInst)) {
- Visited.insert(BaseInst);
- BaseIt = nextInstr(0, Uses, Visited);
- Continue = true;
- }
- if (isRootInst(RootInst)) {
- LastRootIt = RootIt;
- Visited.insert(RootInst);
- RootIt = nextInstr(Iter, Uses, Visited);
- Continue = true;
- }
- if (Continue) continue;
-
- if (!BaseInst->isSameOperationAs(RootInst)) {
- // Last chance saloon. We don't try and solve the full isomorphism
- // problem, but try and at least catch the case where two instructions
- // *of different types* are round the wrong way. We won't be able to
- // efficiently tell, given two ADD instructions, which way around we
- // should match them, but given an ADD and a SUB, we can at least infer
- // which one is which.
- //
- // This should allow us to deal with a greater subset of the isomorphism
- // problem. It does however change a linear algorithm into a quadratic
- // one, so limit the number of probes we do.
- auto TryIt = RootIt;
- unsigned N = NumToleratedFailedMatches;
- while (TryIt != Uses.end() &&
- !BaseInst->isSameOperationAs(TryIt->first) &&
- N--) {
- ++TryIt;
- TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
- }
-
- if (TryIt == Uses.end() || TryIt == RootIt ||
- instrDependsOn(TryIt->first, RootIt, TryIt)) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst << "\n");
- return false;
- }
-
- RootIt = TryIt;
- RootInst = TryIt->first;
- }
-
- // All instructions between the last root and this root
- // may belong to some other iteration. If they belong to a
- // future iteration, then they're dangerous to alias with.
- //
- // Note that because we allow a limited amount of flexibility in the order
- // that we visit nodes, LastRootIt might be *before* RootIt, in which
- // case we've already checked this set of instructions so we shouldn't
- // do anything.
- for (; LastRootIt < RootIt; ++LastRootIt) {
- Instruction *I = LastRootIt->first;
- if (LastRootIt->second.find_first() < (int)Iter)
- continue;
- if (I->mayWriteToMemory())
- AST.add(I);
- // Note: This is specifically guarded by a check on isa<PHINode>,
- // which, while a valid (somewhat arbitrary) micro-optimization, is
- // needed because otherwise isSafeToSpeculativelyExecute returns
- // false on PHI nodes.
- if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
- !isSafeToSpeculativelyExecute(I))
- // Intervening instructions cause side effects.
- FutureSideEffects = true;
- }
-
- // Make sure that this instruction, which is in the use set of this
- // root instruction, does not also belong to the base set or the set of
- // some other root instruction.
- if (RootIt->second.count() > 1) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (prev. case overlap)\n");
- return false;
- }
-
- // Make sure that we don't alias with any instruction in the alias set
- // tracker. If we do, then we depend on a future iteration, and we
- // can't reroll.
- if (RootInst->mayReadFromMemory())
- for (auto &K : AST) {
- if (K.aliasesUnknownInst(RootInst, *AA)) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
- << *BaseInst << " vs. " << *RootInst
- << " (depends on future store)\n");
- return false;
- }
- }
-
- // If we've passed an instruction from a future iteration that may have
- // side effects, and this instruction might also, then we can't reorder
- // them, and this matching fails. As an exception, we allow the alias
- // set tracker to handle regular (unordered) load/store dependencies.
- if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
- !isSafeToSpeculativelyExecute(BaseInst)) ||
- (!isUnorderedLoadStore(RootInst) &&
- !isSafeToSpeculativelyExecute(RootInst)))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst
- << " (side effects prevent reordering)\n");
- return false;
- }
-
- // For instructions that are part of a reduction, if the operation is
- // associative, then don't bother matching the operands (because we
- // already know that the instructions are isomorphic, and the order
- // within the iteration does not matter). For non-associative reductions,
- // we do need to match the operands, because we need to reject
- // out-of-order instructions within an iteration!
- // For example (assume floating-point addition), we need to reject this:
- // x += a[i]; x += b[i];
- // x += a[i+1]; x += b[i+1];
- // x += b[i+2]; x += a[i+2];
- bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
-
- if (!(InReduction && BaseInst->isAssociative())) {
- bool Swapped = false, SomeOpMatched = false;
- for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
- Value *Op2 = RootInst->getOperand(j);
-
- // If this is part of a reduction (and the operation is not
- // associative), then we match all operands, but not those that are
- // part of the reduction.
- if (InReduction)
- if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
- if (Reductions.isPairInSame(RootInst, Op2I))
- continue;
-
- DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
- if (BMI != BaseMap.end()) {
- Op2 = BMI->second;
- } else {
- for (auto &DRS : RootSets) {
- if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
- Op2 = DRS.BaseInst;
- break;
- }
- }
- }
-
- if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
- // If we've not already decided to swap the matched operands, and
- // we've not already matched our first operand (note that we could
- // have skipped matching the first operand because it is part of a
- // reduction above), and the instruction is commutative, then try
- // the swapped match.
- if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
- BaseInst->getOperand(!j) == Op2) {
- Swapped = true;
- } else {
- LLVM_DEBUG(dbgs()
- << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (operand " << j << ")\n");
- return false;
- }
- }
-
- SomeOpMatched = true;
- }
- }
-
- if ((!PossibleRedLastSet.count(BaseInst) &&
- hasUsesOutsideLoop(BaseInst, L)) ||
- (!PossibleRedLastSet.count(RootInst) &&
- hasUsesOutsideLoop(RootInst, L))) {
- LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
- << " vs. " << *RootInst << " (uses outside loop)\n");
- return false;
- }
-
- Reductions.recordPair(BaseInst, RootInst, Iter);
- BaseMap.insert(std::make_pair(RootInst, BaseInst));
-
- LastRootIt = RootIt;
- Visited.insert(BaseInst);
- Visited.insert(RootInst);
- BaseIt = nextInstr(0, Uses, Visited);
- RootIt = nextInstr(Iter, Uses, Visited);
- }
- assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
- "Mismatched set sizes!");
- }
-
- LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
- << "\n");
-
- return true;
-}
-
-void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
- BasicBlock *Header = L->getHeader();
-
- // Compute the start and increment for each BaseInst before we start erasing
- // instructions.
- SmallVector<const SCEV *, 8> StartExprs;
- SmallVector<const SCEV *, 8> IncrExprs;
- for (auto &DRS : RootSets) {
- const SCEVAddRecExpr *IVSCEV =
- cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
- StartExprs.push_back(IVSCEV->getStart());
- IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
- }
-
- // Remove instructions associated with non-base iterations.
- for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
- J != JE;) {
- unsigned I = Uses[&*J].find_first();
- if (I > 0 && I < IL_All) {
- LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
- J++->eraseFromParent();
- continue;
- }
-
- ++J;
- }
-
- // Rewrite each BaseInst using SCEV.
- for (size_t i = 0, e = RootSets.size(); i != e; ++i)
- // Insert the new induction variable.
- replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
-
- { // Limit the lifetime of SCEVExpander.
- BranchInst *BI = cast<BranchInst>(Header->getTerminator());
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- auto Zero = SE->getZero(BackedgeTakenCount->getType());
- auto One = SE->getOne(BackedgeTakenCount->getType());
- auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
- Value *NewIV =
- Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- // FIXME: This arithmetic can overflow.
- auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
- auto ScaledTripCount = SE->getMulExpr(
- TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
- auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
- Value *TakenCount =
- Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
- Header->getFirstNonPHIOrDbg());
- Value *Cond =
- new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
- BI->setCondition(Cond);
-
- if (BI->getSuccessor(1) != Header)
- BI->swapSuccessors();
- }
-
- SimplifyInstructionsInBlock(Header, TLI);
- DeleteDeadPHIs(Header, TLI);
-}
-
-void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
- const SCEV *Start,
- const SCEV *IncrExpr) {
- BasicBlock *Header = L->getHeader();
- Instruction *Inst = DRS.BaseInst;
-
- const SCEV *NewIVSCEV =
- SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
-
- { // Limit the lifetime of SCEVExpander.
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "reroll");
- Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
- Header->getFirstNonPHIOrDbg());
-
- for (auto &KV : Uses)
- if (KV.second.find_first() == 0)
- KV.first->replaceUsesOfWith(Inst, NewIV);
- }
-}
-
-// Validate the selected reductions. All iterations must have an isomorphic
-// part of the reduction chain and, for non-associative reductions, the chain
-// entries must appear in order.
-bool LoopReroll::ReductionTracker::validateSelected() {
- // For a non-associative reduction, the chain entries must appear in order.
- for (int i : Reds) {
- int PrevIter = 0, BaseCount = 0, Count = 0;
- for (Instruction *J : PossibleReds[i]) {
- // Note that all instructions in the chain must have been found because
- // all instructions in the function must have been assigned to some
- // iteration.
- int Iter = PossibleRedIter[J];
- if (Iter != PrevIter && Iter != PrevIter + 1 &&
- !PossibleReds[i].getReducedValue()->isAssociative()) {
- LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
- << J << "\n");
- return false;
- }
-
- if (Iter != PrevIter) {
- if (Count != BaseCount) {
- LLVM_DEBUG(dbgs()
- << "LRR: Iteration " << PrevIter << " reduction use count "
- << Count << " is not equal to the base use count "
- << BaseCount << "\n");
- return false;
- }
-
- Count = 0;
- }
-
- ++Count;
- if (Iter == 0)
- ++BaseCount;
-
- PrevIter = Iter;
- }
- }
-
- return true;
-}
-
-// For all selected reductions, remove all parts except those in the first
-// iteration (and the PHI). Replace outside uses of the reduced value with uses
-// of the first-iteration reduced value (in other words, reroll the selected
-// reductions).
-void LoopReroll::ReductionTracker::replaceSelected() {
- // Fix up reductions to refer to the last instruction associated with the
- // first iteration (not the last).
- for (int i : Reds) {
- int j = 0;
- for (int e = PossibleReds[i].size(); j != e; ++j)
- if (PossibleRedIter[PossibleReds[i][j]] != 0) {
- --j;
- break;
- }
-
- // Replace users with the new end-of-chain value.
- SmallInstructionVector Users;
- for (User *U : PossibleReds[i].getReducedValue()->users()) {
- Users.push_back(cast<Instruction>(U));
- }
-
- for (Instruction *User : Users)
- User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
- PossibleReds[i][j]);
- }
-}
-
-// Reroll the provided loop with respect to the provided induction variable.
-// Generally, we're looking for a loop like this:
-//
-// %iv = phi [ (preheader, ...), (body, %iv.next) ]
-// f(%iv)
-// %iv.1 = add %iv, 1 <-- a root increment
-// f(%iv.1)
-// %iv.2 = add %iv, 2 <-- a root increment
-// f(%iv.2)
-// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
-// f(%iv.scale_m_1)
-// ...
-// %iv.next = add %iv, scale
-// %cmp = icmp(%iv, ...)
-// br %cmp, header, exit
-//
-// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
-// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
- // be intermixed with each other. The restriction imposed by this algorithm is
-// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
-// etc. be the same.
-//
-// First, we collect the use set of %iv, excluding the other increment roots.
-// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
-// times, having collected the use set of f(%iv.(i+1)), during which we:
-// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
-// the next unmatched instruction in f(%iv.(i+1)).
-// - Ensure that both matched instructions don't have any external users
-// (with the exception of last-in-chain reduction instructions).
-// - Track the (aliasing) write set, and other side effects, of all
-// instructions that belong to future iterations that come before the matched
-// instructions. If the matched instructions read from that write set, then
-// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
-// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
-// if any of these future instructions had side effects (could not be
- // speculatively executed), and so do the matched instructions, then we
-// cannot reorder those side-effect-producing instructions, and rerolling
-// fails.
-//
-// Finally, we make sure that all loop instructions are either loop increment
-// roots, belong to simple latch code, parts of validated reductions, part of
-// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
-// have been validated), then we reroll the loop.
-bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
- const SCEV *BackedgeTakenCount,
- ReductionTracker &Reductions) {
- DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
- IVToIncMap, LoopControlIV);
-
- if (!DAGRoots.findRoots())
- return false;
- LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
- << "\n");
-
- if (!DAGRoots.validate(Reductions))
- return false;
- if (!Reductions.validateSelected())
- return false;
- // At this point, we've validated the rerolling, and we're committed to
- // making changes!
-
- Reductions.replaceSelected();
- DAGRoots.replace(BackedgeTakenCount);
-
- ++NumRerolledLoops;
- return true;
-}
-
+ ++I;
+ return I;
+}
+
+bool LoopReroll::DAGRootTracker::isBaseInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (DRS.BaseInst == I)
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::isRootInst(Instruction *I) {
+ for (auto &DRS : RootSets) {
+ if (is_contained(DRS.Roots, I))
+ return true;
+ }
+ return false;
+}
+
+/// Return true if instruction I depends on any instruction between
+/// Start and End.
+bool LoopReroll::DAGRootTracker::instrDependsOn(Instruction *I,
+ UsesTy::iterator Start,
+ UsesTy::iterator End) {
+ for (auto *U : I->users()) {
+ for (auto It = Start; It != End; ++It)
+ if (U == It->first)
+ return true;
+ }
+ return false;
+}
+
+static bool isIgnorableInst(const Instruction *I) {
+ if (isa<DbgInfoIntrinsic>(I))
+ return true;
+ const IntrinsicInst* II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ case Intrinsic::var_annotation:
+ // TODO: the following intrinsics may also be allowed:
+ // lifetime_start, lifetime_end, invariant_start, invariant_end
+ return true;
+ }
+ return false;
+}
+
+bool LoopReroll::DAGRootTracker::validate(ReductionTracker &Reductions) {
+ // We now need to check for equivalence of the use graph of each root with
+ // that of the primary induction variable (excluding the roots). Our goal
+ // here is not to solve the full graph isomorphism problem, but rather to
+ // catch common cases without a lot of work. As a result, we will assume
+ // that the relative order of the instructions in each unrolled iteration
+ // is the same (although we will not make an assumption about how the
+ // different iterations are intermixed). Note that while the order must be
+ // the same, the instructions may not be in the same basic block.
+
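  // An illustrative sketch of that ordering assumption (hypothetical
  // operations, not taken from this diff), for a scale factor of 2: if the
  // base iteration performs "store a[i]; store b[i]" and the root iteration
  // performs "store a[i+1]; store b[i+1]", then the interleaving
  //   a[i], a[i+1], b[i], b[i+1]
  // still matches, because each iteration keeps its internal order, whereas
  //   a[i], b[i+1], a[i+1], b[i]
  // reverses the root iteration's order and is rejected by the matching
  // loop below.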
+ // An array of just the possible reductions for this scale factor. When we
+ // collect the set of all users of some root instructions, these reduction
+ // instructions are treated as 'final' (their uses are not considered).
+ // This is important because we don't want the root use set to search down
+ // the reduction chain.
+ SmallInstructionSet PossibleRedSet;
+ SmallInstructionSet PossibleRedLastSet;
+ SmallInstructionSet PossibleRedPHISet;
+ Reductions.restrictToScale(Scale, PossibleRedSet,
+ PossibleRedPHISet, PossibleRedLastSet);
+
+ // Populate "Uses" with where each instruction is used.
+ if (!collectUsedInstructions(PossibleRedSet))
+ return false;
+
+ // Make sure we mark the reduction PHIs as used in all iterations.
+ for (auto *I : PossibleRedPHISet) {
+ Uses[I].set(IL_All);
+ }
+
+ // Make sure we mark loop-control-only PHIs as used in all iterations. See
+ // comment above LoopReroll::isLoopControlIV for more information.
+ BasicBlock *Header = L->getHeader();
+ if (LoopControlIV && LoopControlIV != IV) {
+ for (auto *U : LoopControlIV->users()) {
+ Instruction *IVUser = dyn_cast<Instruction>(U);
+ // IVUser could be loop increment or compare
+ Uses[IVUser].set(IL_All);
+ for (auto *UU : IVUser->users()) {
+ Instruction *UUser = dyn_cast<Instruction>(UU);
+ // UUser could be compare, PHI or branch
+ Uses[UUser].set(IL_All);
+ // Skip SExt
+ if (isa<SExtInst>(UUser)) {
+ UUser = dyn_cast<Instruction>(*(UUser->user_begin()));
+ Uses[UUser].set(IL_All);
+ }
+ // Is UUser a compare instruction?
+ if (UU->hasOneUse()) {
+ Instruction *BI = dyn_cast<BranchInst>(*UUser->user_begin());
+ if (BI == cast<BranchInst>(Header->getTerminator()))
+ Uses[BI].set(IL_All);
+ }
+ }
+ }
+ }
+
+ // Make sure all instructions in the loop are in one and only one
+ // set.
+ for (auto &KV : Uses) {
+ if (KV.second.count() != 1 && !isIgnorableInst(KV.first)) {
+ LLVM_DEBUG(
+ dbgs() << "LRR: Aborting - instruction is not used in 1 iteration: "
+ << *KV.first << " (#uses=" << KV.second.count() << ")\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(for (auto &KV
+ : Uses) {
+ dbgs() << "LRR: " << KV.second.find_first() << "\t" << *KV.first << "\n";
+ });
+
+ for (unsigned Iter = 1; Iter < Scale; ++Iter) {
+ // In addition to regular aliasing information, we need to look for
+ // instructions from later (future) iterations that have side effects
+ // preventing us from reordering them past other instructions with side
+ // effects.
+ bool FutureSideEffects = false;
+ AliasSetTracker AST(*AA);
+ // The map between instructions in f(%iv.(i+1)) and f(%iv).
+ DenseMap<Value *, Value *> BaseMap;
+
+ // Compare iteration Iter to the base.
+ SmallInstructionSet Visited;
+ auto BaseIt = nextInstr(0, Uses, Visited);
+ auto RootIt = nextInstr(Iter, Uses, Visited);
+ auto LastRootIt = Uses.begin();
+
+ while (BaseIt != Uses.end() && RootIt != Uses.end()) {
+ Instruction *BaseInst = BaseIt->first;
+ Instruction *RootInst = RootIt->first;
+
+ // Skip over the IV or root instructions; only match their users.
+ bool Continue = false;
+ if (isBaseInst(BaseInst)) {
+ Visited.insert(BaseInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ Continue = true;
+ }
+ if (isRootInst(RootInst)) {
+ LastRootIt = RootIt;
+ Visited.insert(RootInst);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ Continue = true;
+ }
+ if (Continue) continue;
+
+ if (!BaseInst->isSameOperationAs(RootInst)) {
+ // Last chance saloon. We don't try and solve the full isomorphism
+ // problem, but try and at least catch the case where two instructions
+ // *of different types* are round the wrong way. We won't be able to
+ // efficiently tell, given two ADD instructions, which way around we
+ // should match them, but given an ADD and a SUB, we can at least infer
+ // which one is which.
+ //
+ // This should allow us to deal with a greater subset of the isomorphism
+ // problem. It does however change a linear algorithm into a quadratic
+ // one, so limit the number of probes we do.
+ auto TryIt = RootIt;
+ unsigned N = NumToleratedFailedMatches;
+ while (TryIt != Uses.end() &&
+ !BaseInst->isSameOperationAs(TryIt->first) &&
+ N--) {
+ ++TryIt;
+ TryIt = nextInstr(Iter, Uses, Visited, &TryIt);
+ }
+
+ if (TryIt == Uses.end() || TryIt == RootIt ||
+ instrDependsOn(TryIt->first, RootIt, TryIt)) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst << "\n");
+ return false;
+ }
+
+ RootIt = TryIt;
+ RootInst = TryIt->first;
+ }
+
+ // All instructions between the last root and this root
+ // may belong to some other iteration. If they belong to a
+ // future iteration, then they're dangerous to alias with.
+ //
+ // Note that because we allow a limited amount of flexibility in the order
+ // that we visit nodes, LastRootIt might be *before* RootIt, in which
+ // case we've already checked this set of instructions so we shouldn't
+ // do anything.
+ for (; LastRootIt < RootIt; ++LastRootIt) {
+ Instruction *I = LastRootIt->first;
+ if (LastRootIt->second.find_first() < (int)Iter)
+ continue;
+ if (I->mayWriteToMemory())
+ AST.add(I);
+ // Note: This is specifically guarded by a check on isa<PHINode>,
+ // which while a valid (somewhat arbitrary) micro-optimization, is
+ // needed because otherwise isSafeToSpeculativelyExecute returns
+ // false on PHI nodes.
+ if (!isa<PHINode>(I) && !isUnorderedLoadStore(I) &&
+ !isSafeToSpeculativelyExecute(I))
+ // Intervening instructions cause side effects.
+ FutureSideEffects = true;
+ }
+
+ // Make sure that this instruction, which is in the use set of this
+ // root instruction, does not also belong to the base set or the set of
+ // some other root instruction.
+ if (RootIt->second.count() > 1) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (prev. case overlap)\n");
+ return false;
+ }
+
+ // Make sure that we don't alias with any instruction in the alias set
+ // tracker. If we do, then we depend on a future iteration, and we
+ // can't reroll.
+ if (RootInst->mayReadFromMemory())
+ for (auto &K : AST) {
+ if (K.aliasesUnknownInst(RootInst, *AA)) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at "
+ << *BaseInst << " vs. " << *RootInst
+ << " (depends on future store)\n");
+ return false;
+ }
+ }
+
+ // If we've passed an instruction from a future iteration that may have
+ // side effects, and this instruction might also, then we can't reorder
+ // them, and this matching fails. As an exception, we allow the alias
+ // set tracker to handle regular (unordered) load/store dependencies.
+ if (FutureSideEffects && ((!isUnorderedLoadStore(BaseInst) &&
+ !isSafeToSpeculativelyExecute(BaseInst)) ||
+ (!isUnorderedLoadStore(RootInst) &&
+ !isSafeToSpeculativelyExecute(RootInst)))) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst
+ << " (side effects prevent reordering)\n");
+ return false;
+ }
+
+ // For instructions that are part of a reduction, if the operation is
+ // associative, then don't bother matching the operands (because we
+ // already know that the instructions are isomorphic, and the order
+ // within the iteration does not matter). For non-associative reductions,
+ // we do need to match the operands, because we need to reject
+ // out-of-order instructions within an iteration!
+ // For example (assume floating-point addition), we need to reject this:
+ // x += a[i]; x += b[i];
+ // x += a[i+1]; x += b[i+1];
+ // x += b[i+2]; x += a[i+2];
+ bool InReduction = Reductions.isPairInSame(BaseInst, RootInst);
+
+ if (!(InReduction && BaseInst->isAssociative())) {
+ bool Swapped = false, SomeOpMatched = false;
+ for (unsigned j = 0; j < BaseInst->getNumOperands(); ++j) {
+ Value *Op2 = RootInst->getOperand(j);
+
+ // If this is part of a reduction (and the operation is not
+ // associative), then we match all operands, but not those that are
+ // part of the reduction.
+ if (InReduction)
+ if (Instruction *Op2I = dyn_cast<Instruction>(Op2))
+ if (Reductions.isPairInSame(RootInst, Op2I))
+ continue;
+
+ DenseMap<Value *, Value *>::iterator BMI = BaseMap.find(Op2);
+ if (BMI != BaseMap.end()) {
+ Op2 = BMI->second;
+ } else {
+ for (auto &DRS : RootSets) {
+ if (DRS.Roots[Iter-1] == (Instruction*) Op2) {
+ Op2 = DRS.BaseInst;
+ break;
+ }
+ }
+ }
+
+ if (BaseInst->getOperand(Swapped ? unsigned(!j) : j) != Op2) {
+ // If we've not already decided to swap the matched operands, and
+ // we've not already matched our first operand (note that we could
+ // have skipped matching the first operand because it is part of a
+ // reduction above), and the instruction is commutative, then try
+ // the swapped match.
+ if (!Swapped && BaseInst->isCommutative() && !SomeOpMatched &&
+ BaseInst->getOperand(!j) == Op2) {
+ Swapped = true;
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (operand " << j << ")\n");
+ return false;
+ }
+ }
+
+ SomeOpMatched = true;
+ }
+ }
+
+ if ((!PossibleRedLastSet.count(BaseInst) &&
+ hasUsesOutsideLoop(BaseInst, L)) ||
+ (!PossibleRedLastSet.count(RootInst) &&
+ hasUsesOutsideLoop(RootInst, L))) {
+ LLVM_DEBUG(dbgs() << "LRR: iteration root match failed at " << *BaseInst
+ << " vs. " << *RootInst << " (uses outside loop)\n");
+ return false;
+ }
+
+ Reductions.recordPair(BaseInst, RootInst, Iter);
+ BaseMap.insert(std::make_pair(RootInst, BaseInst));
+
+ LastRootIt = RootIt;
+ Visited.insert(BaseInst);
+ Visited.insert(RootInst);
+ BaseIt = nextInstr(0, Uses, Visited);
+ RootIt = nextInstr(Iter, Uses, Visited);
+ }
+ assert(BaseIt == Uses.end() && RootIt == Uses.end() &&
+ "Mismatched set sizes!");
+ }
+
+ LLVM_DEBUG(dbgs() << "LRR: Matched all iteration increments for " << *IV
+ << "\n");
+
+ return true;
+}
+
+void LoopReroll::DAGRootTracker::replace(const SCEV *BackedgeTakenCount) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the start and increment for each BaseInst before we start erasing
+ // instructions.
+ SmallVector<const SCEV *, 8> StartExprs;
+ SmallVector<const SCEV *, 8> IncrExprs;
+ for (auto &DRS : RootSets) {
+ const SCEVAddRecExpr *IVSCEV =
+ cast<SCEVAddRecExpr>(SE->getSCEV(DRS.BaseInst));
+ StartExprs.push_back(IVSCEV->getStart());
+ IncrExprs.push_back(SE->getMinusSCEV(SE->getSCEV(DRS.Roots[0]), IVSCEV));
+ }
+
+ // Remove instructions associated with non-base iterations.
+ for (BasicBlock::reverse_iterator J = Header->rbegin(), JE = Header->rend();
+ J != JE;) {
+ unsigned I = Uses[&*J].find_first();
+ if (I > 0 && I < IL_All) {
+ LLVM_DEBUG(dbgs() << "LRR: removing: " << *J << "\n");
+ J++->eraseFromParent();
+ continue;
+ }
+
+ ++J;
+ }
+
+ // Rewrite each BaseInst using SCEV.
+ for (size_t i = 0, e = RootSets.size(); i != e; ++i)
+ // Insert the new induction variable.
+ replaceIV(RootSets[i], StartExprs[i], IncrExprs[i]);
+
+ { // Limit the lifetime of SCEVExpander.
+ BranchInst *BI = cast<BranchInst>(Header->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ auto Zero = SE->getZero(BackedgeTakenCount->getType());
+ auto One = SE->getOne(BackedgeTakenCount->getType());
+ auto NewIVSCEV = SE->getAddRecExpr(Zero, One, L, SCEV::FlagAnyWrap);
+ Value *NewIV =
+ Expander.expandCodeFor(NewIVSCEV, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ // FIXME: This arithmetic can overflow.
+ auto TripCount = SE->getAddExpr(BackedgeTakenCount, One);
+ auto ScaledTripCount = SE->getMulExpr(
+ TripCount, SE->getConstant(BackedgeTakenCount->getType(), Scale));
+ auto ScaledBECount = SE->getMinusSCEV(ScaledTripCount, One);
+ Value *TakenCount =
+ Expander.expandCodeFor(ScaledBECount, BackedgeTakenCount->getType(),
+ Header->getFirstNonPHIOrDbg());
+ Value *Cond =
+ new ICmpInst(BI, CmpInst::ICMP_EQ, NewIV, TakenCount, "exitcond");
+ BI->setCondition(Cond);
+
+ if (BI->getSuccessor(1) != Header)
+ BI->swapSuccessors();
+ }
+
+ SimplifyInstructionsInBlock(Header, TLI);
+ DeleteDeadPHIs(Header, TLI);
+}
+
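A worked example of the exit-count arithmetic in replace() above, with illustrative numbers: for Scale = 3 and an original backedge-taken count of 9 (ten executions of the unrolled body), TripCount = 9 + 1 = 10, ScaledTripCount = 10 * 3 = 30, and ScaledBECount = 30 - 1 = 29. The rerolled loop therefore starts its new induction variable at 0 and exits once the variable compares equal to 29, i.e. after 30 executions of the single remaining body.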
+void LoopReroll::DAGRootTracker::replaceIV(DAGRootSet &DRS,
+ const SCEV *Start,
+ const SCEV *IncrExpr) {
+ BasicBlock *Header = L->getHeader();
+ Instruction *Inst = DRS.BaseInst;
+
+ const SCEV *NewIVSCEV =
+ SE->getAddRecExpr(Start, IncrExpr, L, SCEV::FlagAnyWrap);
+
+ { // Limit the lifetime of SCEVExpander.
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "reroll");
+ Value *NewIV = Expander.expandCodeFor(NewIVSCEV, Inst->getType(),
+ Header->getFirstNonPHIOrDbg());
+
+ for (auto &KV : Uses)
+ if (KV.second.find_first() == 0)
+ KV.first->replaceUsesOfWith(Inst, NewIV);
+ }
+}
+
+// Validate the selected reductions. All iterations must have an isomorphic
+// part of the reduction chain and, for non-associative reductions, the chain
+// entries must appear in order.
+bool LoopReroll::ReductionTracker::validateSelected() {
+ // For a non-associative reduction, the chain entries must appear in order.
+ for (int i : Reds) {
+ int PrevIter = 0, BaseCount = 0, Count = 0;
+ for (Instruction *J : PossibleReds[i]) {
+ // Note that all instructions in the chain must have been found because
+ // all instructions in the function must have been assigned to some
+ // iteration.
+ int Iter = PossibleRedIter[J];
+ if (Iter != PrevIter && Iter != PrevIter + 1 &&
+ !PossibleReds[i].getReducedValue()->isAssociative()) {
+ LLVM_DEBUG(dbgs() << "LRR: Out-of-order non-associative reduction: "
+ << J << "\n");
+ return false;
+ }
+
+ if (Iter != PrevIter) {
+ if (Count != BaseCount) {
+ LLVM_DEBUG(dbgs()
+ << "LRR: Iteration " << PrevIter << " reduction use count "
+ << Count << " is not equal to the base use count "
+ << BaseCount << "\n");
+ return false;
+ }
+
+ Count = 0;
+ }
+
+ ++Count;
+ if (Iter == 0)
+ ++BaseCount;
+
+ PrevIter = Iter;
+ }
+ }
+
+ return true;
+}
+
+// For all selected reductions, remove all parts except those in the first
+// iteration (and the PHI). Replace outside uses of the reduced value with uses
+// of the first-iteration reduced value (in other words, reroll the selected
+// reductions).
+void LoopReroll::ReductionTracker::replaceSelected() {
+ // Fix up reductions to refer to the last instruction associated with the
+ // first iteration (not the last).
+ for (int i : Reds) {
+ int j = 0;
+ for (int e = PossibleReds[i].size(); j != e; ++j)
+ if (PossibleRedIter[PossibleReds[i][j]] != 0) {
+ --j;
+ break;
+ }
+
+ // Replace users with the new end-of-chain value.
+ SmallInstructionVector Users;
+ for (User *U : PossibleReds[i].getReducedValue()->users()) {
+ Users.push_back(cast<Instruction>(U));
+ }
+
+ for (Instruction *User : Users)
+ User->replaceUsesOfWith(PossibleReds[i].getReducedValue(),
+ PossibleReds[i][j]);
+ }
+}
+
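To make the reduction rewrite above concrete, a hypothetical sketch (scale factor 3, C-level names not taken from this diff):

    x1 = x0 + a[i];
    x2 = x1 + a[i+1];
    x3 = x2 + a[i+2];

replaceSelected() rewires every user of the reduced value x3 to x1, the end of the first-iteration chain; the second- and third-iteration adds then become dead and are erased when DAGRootTracker::replace() removes the non-base iterations.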
+// Reroll the provided loop with respect to the provided induction variable.
+// Generally, we're looking for a loop like this:
+//
+// %iv = phi [ (preheader, ...), (body, %iv.next) ]
+// f(%iv)
+// %iv.1 = add %iv, 1 <-- a root increment
+// f(%iv.1)
+// %iv.2 = add %iv, 2 <-- a root increment
+// f(%iv.2)
+// %iv.scale_m_1 = add %iv, scale-1 <-- a root increment
+// f(%iv.scale_m_1)
+// ...
+// %iv.next = add %iv, scale
+// %cmp = icmp(%iv, ...)
+// br %cmp, header, exit
+//
+// Notably, we do not require that f(%iv), f(%iv.1), etc. be isolated groups of
+// instructions. In other words, the instructions in f(%iv), f(%iv.1), etc. can
+ // be intermixed with each other. The restriction imposed by this algorithm is
+// that the relative order of the isomorphic instructions in f(%iv), f(%iv.1),
+// etc. be the same.
+//
+// First, we collect the use set of %iv, excluding the other increment roots.
+// This gives us f(%iv). Then we iterate over the loop instructions (scale-1)
+// times, having collected the use set of f(%iv.(i+1)), during which we:
+// - Ensure that the next unmatched instruction in f(%iv) is isomorphic to
+// the next unmatched instruction in f(%iv.(i+1)).
+// - Ensure that both matched instructions don't have any external users
+// (with the exception of last-in-chain reduction instructions).
+// - Track the (aliasing) write set, and other side effects, of all
+// instructions that belong to future iterations that come before the matched
+// instructions. If the matched instructions read from that write set, then
+// f(%iv) or f(%iv.(i+1)) has some dependency on instructions in
+// f(%iv.(j+1)) for some j > i, and we cannot reroll the loop. Similarly,
+// if any of these future instructions had side effects (could not be
+ // speculatively executed), and so do the matched instructions, then we
+// cannot reorder those side-effect-producing instructions, and rerolling
+// fails.
+//
+// Finally, we make sure that all loop instructions are either loop increment
+// roots, belong to simple latch code, parts of validated reductions, part of
+// f(%iv) or part of some f(%iv.i). If all of that is true (and all reductions
+// have been validated), then we reroll the loop.
+bool LoopReroll::reroll(Instruction *IV, Loop *L, BasicBlock *Header,
+ const SCEV *BackedgeTakenCount,
+ ReductionTracker &Reductions) {
+ DAGRootTracker DAGRoots(this, L, IV, SE, AA, TLI, DT, LI, PreserveLCSSA,
+ IVToIncMap, LoopControlIV);
+
+ if (!DAGRoots.findRoots())
+ return false;
+ LLVM_DEBUG(dbgs() << "LRR: Found all root induction increments for: " << *IV
+ << "\n");
+
+ if (!DAGRoots.validate(Reductions))
+ return false;
+ if (!Reductions.validateSelected())
+ return false;
+ // At this point, we've validated the rerolling, and we're committed to
+ // making changes!
+
+ Reductions.replaceSelected();
+ DAGRoots.replace(BackedgeTakenCount);
+
+ ++NumRerolledLoops;
+ return true;
+}
+
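A minimal before/after sketch of the transformation described in the comment above reroll(), written at the C level with a scale factor of 3 (illustrative code; the arrays and bound are assumptions):

    // Before: a loop that was manually unrolled by 3.
    for (int i = 0; i < 3 * n; i += 3) {
      a[i]     += b[i];
      a[i + 1] += b[i + 1];
      a[i + 2] += b[i + 2];
    }

    // After rerolling: one copy of the body, three times as many iterations.
    for (int i = 0; i < 3 * n; ++i)
      a[i] += b[i];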
bool LoopReroll::runOnLoop(Loop *L) {
- BasicBlock *Header = L->getHeader();
- LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
- << Header->getName() << " (" << L->getNumBlocks()
- << " block(s))\n");
-
- // For now, we'll handle only single BB loops.
- if (L->getNumBlocks() > 1)
- return false;
-
- if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return false;
-
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
- LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
- LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
- << "\n");
-
- // First, we need to find the induction variable with respect to which we can
- // reroll (there may be several possible options).
- SmallInstructionVector PossibleIVs;
- IVToIncMap.clear();
- LoopControlIV = nullptr;
- collectPossibleIVs(L, PossibleIVs);
-
- if (PossibleIVs.empty()) {
- LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
- return false;
- }
-
- ReductionTracker Reductions;
- collectPossibleReductions(L, Reductions);
- bool Changed = false;
-
- // For each possible IV, collect the associated possible set of 'root' nodes
- // (i+1, i+2, etc.).
- for (Instruction *PossibleIV : PossibleIVs)
- if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
- Changed = true;
- break;
- }
- LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
-
- // Trip count of L has changed so SE must be re-evaluated.
- if (Changed)
- SE->forgetLoop(L);
-
- return Changed;
-}
+ BasicBlock *Header = L->getHeader();
+ LLVM_DEBUG(dbgs() << "LRR: F[" << Header->getParent()->getName() << "] Loop %"
+ << Header->getName() << " (" << L->getNumBlocks()
+ << " block(s))\n");
+
+ // For now, we'll handle only single BB loops.
+ if (L->getNumBlocks() > 1)
+ return false;
+
+ if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+ return false;
+
+ const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ LLVM_DEBUG(dbgs() << "\n Before Reroll:\n" << *(L->getHeader()) << "\n");
+ LLVM_DEBUG(dbgs() << "LRR: backedge-taken count = " << *BackedgeTakenCount
+ << "\n");
+
+ // First, we need to find the induction variable with respect to which we can
+ // reroll (there may be several possible options).
+ SmallInstructionVector PossibleIVs;
+ IVToIncMap.clear();
+ LoopControlIV = nullptr;
+ collectPossibleIVs(L, PossibleIVs);
+
+ if (PossibleIVs.empty()) {
+ LLVM_DEBUG(dbgs() << "LRR: No possible IVs found\n");
+ return false;
+ }
+
+ ReductionTracker Reductions;
+ collectPossibleReductions(L, Reductions);
+ bool Changed = false;
+
+ // For each possible IV, collect the associated possible set of 'root' nodes
+ // (i+1, i+2, etc.).
+ for (Instruction *PossibleIV : PossibleIVs)
+ if (reroll(PossibleIV, L, Header, BackedgeTakenCount, Reductions)) {
+ Changed = true;
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "\n After Reroll:\n" << *(L->getHeader()) << "\n");
+
+ // Trip count of L has changed so SE must be re-evaluated.
+ if (Changed)
+ SE->forgetLoop(L);
+
+ return Changed;
+}
bool LoopRerollLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
index ff63d625d8..ad1cfc68ec 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopRotation.cpp
@@ -1,51 +1,51 @@
-//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements Loop Rotation Pass.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopRotation.h"
-#include "llvm/ADT/Statistic.h"
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopRotation.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/LoopRotationUtils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-rotate"
-
-static cl::opt<unsigned> DefaultRotationThreshold(
- "rotation-max-header-size", cl::init(16), cl::Hidden,
- cl::desc("The default maximum header size for automatic loop rotation"));
-
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
+static cl::opt<unsigned> DefaultRotationThreshold(
+ "rotation-max-header-size", cl::init(16), cl::Hidden,
+ cl::desc("The default maximum header size for automatic loop rotation"));
+
static cl::opt<bool> PrepareForLTOOption(
"rotation-prepare-for-lto", cl::init(false), cl::Hidden,
cl::desc("Run loop-rotation in the prepare-for-lto stage. This option "
"should be used for testing only."));
-
+
LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication, bool PrepareForLTO)
: EnableHeaderDuplication(EnableHeaderDuplication),
PrepareForLTO(PrepareForLTO) {}
-PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
+PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
// Vectorization requires loop-rotation. Use default threshold for loops the
// user explicitly marked for vectorization, even when header duplication is
// disabled.
@@ -53,75 +53,75 @@ PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
hasVectorizeTransformation(&L) == TM_ForcedByUser
? DefaultRotationThreshold
: 0;
- const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
- const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
-
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA)
- MSSAU = MemorySSAUpdater(AR.MSSA);
+ const DataLayout &DL = L.getHeader()->getModule()->getDataLayout();
+ const SimplifyQuery SQ = getBestSimplifyQuery(AR, DL);
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = MemorySSAUpdater(AR.MSSA);
bool Changed =
LoopRotation(&L, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE,
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ, false,
Threshold, false, PrepareForLTO || PrepareForLTOOption);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- if (AR.MSSA && VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-class LoopRotateLegacyPass : public LoopPass {
- unsigned MaxHeaderSize;
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class LoopRotateLegacyPass : public LoopPass {
+ unsigned MaxHeaderSize;
bool PrepareForLTO;
-
-public:
- static char ID; // Pass ID, replacement for typeid
+
+public:
+ static char ID; // Pass ID, replacement for typeid
LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1,
bool PrepareForLTO = false)
: LoopPass(ID), PrepareForLTO(PrepareForLTO) {
- initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
- if (SpecifiedMaxHeaderSize == -1)
- MaxHeaderSize = DefaultRotationThreshold;
- else
- MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
- }
-
- // LCSSA form makes instruction renaming easier.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
- Function &F = *L->getHeader()->getParent();
-
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- // Not requiring MemorySSA and getting it only if available will split
- // the loop pass pipeline when LoopRotate is being run first.
- auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAA)
- MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
- }
+ initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
+ if (SpecifiedMaxHeaderSize == -1)
+ MaxHeaderSize = DefaultRotationThreshold;
+ else
+ MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
+ }
+
+ // LCSSA form makes instruction renaming easier.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addPreserved<MemorySSAWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+ Function &F = *L->getHeader()->getParent();
+
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ // Not requiring MemorySSA and getting it only if available will split
+ // the loop pass pipeline when LoopRotate is being run first.
+ auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAA)
+ MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
+ }
// Vectorization requires loop-rotation. Use default threshold for loops the
// user explicitly marked for vectorization, even when header duplication is
// disabled.
@@ -129,24 +129,24 @@ public:
? DefaultRotationThreshold
: MaxHeaderSize;
- return LoopRotation(L, LI, TTI, AC, &DT, &SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
+ return LoopRotation(L, LI, TTI, AC, &DT, &SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
false, Threshold, false,
PrepareForLTO || PrepareForLTOOption);
- }
-};
-} // end namespace
-
-char LoopRotateLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
- false)
-
+ }
+};
+} // end namespace
+
+char LoopRotateLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
+ false)
+
Pass *llvm::createLoopRotatePass(int MaxHeaderSize, bool PrepareForLTO) {
return new LoopRotateLegacyPass(MaxHeaderSize, PrepareForLTO);
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index c5d3c4519b..cc6d112208 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -1,773 +1,773 @@
-//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
-// basic loop CFG cleanup, primarily to assist other loop passes. If you
-// encounter a noncanonical CFG construct that causes another loop pass to
-// perform suboptimally, this is the place to fix it up.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-simplifycfg"
-
-static cl::opt<bool> EnableTermFolding("enable-loop-simplifycfg-term-folding",
- cl::init(true));
-
-STATISTIC(NumTerminatorsFolded,
- "Number of terminators folded to unconditional branches");
-STATISTIC(NumLoopBlocksDeleted,
- "Number of loop blocks deleted");
-STATISTIC(NumLoopExitsDeleted,
- "Number of loop exiting edges deleted");
-
-/// If \p BB is a switch or a conditional branch, but only one of its successors
-/// can be reached from this block in runtime, return this successor. Otherwise,
-/// return nullptr.
-static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
- Instruction *TI = BB->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isUnconditional())
- return nullptr;
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- return BI->getSuccessor(0);
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- if (!Cond)
- return nullptr;
- return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
- }
-
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
- if (!CI)
- return nullptr;
- for (auto Case : SI->cases())
- if (Case.getCaseValue() == CI)
- return Case.getCaseSuccessor();
- return SI->getDefaultDest();
- }
-
- return nullptr;
-}
-
-/// Removes \p BB from all loops from [FirstLoop, LastLoop) in parent chain.
-static void removeBlockFromLoops(BasicBlock *BB, Loop *FirstLoop,
- Loop *LastLoop = nullptr) {
- assert((!LastLoop || LastLoop->contains(FirstLoop->getHeader())) &&
- "First loop is supposed to be inside of last loop!");
- assert(FirstLoop->contains(BB) && "Must be a loop block!");
- for (Loop *Current = FirstLoop; Current != LastLoop;
- Current = Current->getParentLoop())
- Current->removeBlockFromLoop(BB);
-}
-
-/// Find innermost loop that contains at least one block from \p BBs and
-/// contains the header of loop \p L.
-static Loop *getInnermostLoopFor(SmallPtrSetImpl<BasicBlock *> &BBs,
- Loop &L, LoopInfo &LI) {
- Loop *Innermost = nullptr;
- for (BasicBlock *BB : BBs) {
- Loop *BBL = LI.getLoopFor(BB);
- while (BBL && !BBL->contains(L.getHeader()))
- BBL = BBL->getParentLoop();
- if (BBL == &L)
- BBL = BBL->getParentLoop();
- if (!BBL)
- continue;
- if (!Innermost || BBL->getLoopDepth() > Innermost->getLoopDepth())
- Innermost = BBL;
- }
- return Innermost;
-}
-
-namespace {
-/// Helper class that can turn branches and switches with constant conditions
-/// into unconditional branches.
-class ConstantTerminatorFoldingImpl {
-private:
- Loop &L;
- LoopInfo &LI;
- DominatorTree &DT;
- ScalarEvolution &SE;
- MemorySSAUpdater *MSSAU;
- LoopBlocksDFS DFS;
- DomTreeUpdater DTU;
- SmallVector<DominatorTree::UpdateType, 16> DTUpdates;
-
- // Whether or not the current loop has irreducible CFG.
- bool HasIrreducibleCFG = false;
- // Whether or not the current loop will still exist after terminator constant
- // folding is done. In theory, there are two ways this can happen:
- // 1. Loop's latch(es) become unreachable from loop header;
- // 2. Loop's header becomes unreachable from method entry.
- // In practice, the second situation is impossible because we only modify the
- // current loop and its preheader and do not affect the preheader's
- // reachability from any other block. So this variable being set to true
- // means that the loop's latch has become unreachable from the loop header.
- bool DeleteCurrentLoop = false;
-
- // The blocks of the original loop that will still be reachable from entry
- // after the constant folding.
- SmallPtrSet<BasicBlock *, 8> LiveLoopBlocks;
- // The blocks of the original loop that will become unreachable from entry
- // after the constant folding.
- SmallVector<BasicBlock *, 8> DeadLoopBlocks;
- // The exits of the original loop that will still be reachable from entry
- // after the constant folding.
- SmallPtrSet<BasicBlock *, 8> LiveExitBlocks;
- // The exits of the original loop that will become unreachable from entry
- // after the constant folding.
- SmallVector<BasicBlock *, 8> DeadExitBlocks;
- // The blocks that will still be a part of the current loop after folding.
- SmallPtrSet<BasicBlock *, 8> BlocksInLoopAfterFolding;
- // The blocks that have terminators with constant condition that can be
- // folded. Note: fold candidates should be in L but not in any of its
- // subloops to avoid complex LI updates.
- SmallVector<BasicBlock *, 8> FoldCandidates;
-
- void dump() const {
- dbgs() << "Constant terminator folding for loop " << L << "\n";
- dbgs() << "After terminator constant-folding, the loop will";
- if (!DeleteCurrentLoop)
- dbgs() << " not";
- dbgs() << " be destroyed\n";
- auto PrintOutVector = [&](const char *Message,
- const SmallVectorImpl<BasicBlock *> &S) {
- dbgs() << Message << "\n";
- for (const BasicBlock *BB : S)
- dbgs() << "\t" << BB->getName() << "\n";
- };
- auto PrintOutSet = [&](const char *Message,
- const SmallPtrSetImpl<BasicBlock *> &S) {
- dbgs() << Message << "\n";
- for (const BasicBlock *BB : S)
- dbgs() << "\t" << BB->getName() << "\n";
- };
- PrintOutVector("Blocks in which we can constant-fold terminator:",
- FoldCandidates);
- PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks);
- PrintOutVector("Dead blocks from the original loop:", DeadLoopBlocks);
- PrintOutSet("Live exit blocks:", LiveExitBlocks);
- PrintOutVector("Dead exit blocks:", DeadExitBlocks);
- if (!DeleteCurrentLoop)
- PrintOutSet("The following blocks will still be part of the loop:",
- BlocksInLoopAfterFolding);
- }
-
- /// Whether or not the current loop has irreducible CFG.
- bool hasIrreducibleCFG(LoopBlocksDFS &DFS) {
- assert(DFS.isComplete() && "DFS is expected to be finished");
- // Index of a basic block in RPO traversal.
- DenseMap<const BasicBlock *, unsigned> RPO;
- unsigned Current = 0;
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I)
- RPO[*I] = Current++;
-
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
- BasicBlock *BB = *I;
- for (auto *Succ : successors(BB))
- if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ])
- // If an edge goes from a block with a greater order number into a block
- // with a lesser number, and it is not a loop backedge, then it can only
- // be part of an irreducible non-loop cycle.
- return true;
- }
- return false;
- }
-
- /// Fill all information about status of blocks and exits of the current loop
- /// if constant folding of all branches will be done.
- void analyze() {
- DFS.perform(&LI);
- assert(DFS.isComplete() && "DFS is expected to be finished");
-
- // TODO: The algorithm below relies on both RPO and Postorder traversals.
- // When the loop has only reducible CFG inside, then the invariant "all
- // predecessors of X are processed before X in RPO" is preserved. However
- // an irreducible loop can break this invariant (e.g. latch does not have to
- // be the last block in the traversal in this case, and the algorithm relies
- // on this). We can later decide to support such cases by altering the
- // algorithms, but so far we just give up analyzing them.
- if (hasIrreducibleCFG(DFS)) {
- HasIrreducibleCFG = true;
- return;
- }
-
- // Collect live and dead loop blocks and exits.
- LiveLoopBlocks.insert(L.getHeader());
- for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
- BasicBlock *BB = *I;
-
- // If a loop block wasn't marked as live so far, then it's dead.
- if (!LiveLoopBlocks.count(BB)) {
- DeadLoopBlocks.push_back(BB);
- continue;
- }
-
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
-
- // If a block has only one live successor, it's a candidate on constant
- // folding. Only handle blocks from current loop: branches in child loops
- // are skipped because if they can be folded, they should be folded during
- // the processing of child loops.
- bool TakeFoldCandidate = TheOnlySucc && LI.getLoopFor(BB) == &L;
- if (TakeFoldCandidate)
- FoldCandidates.push_back(BB);
-
- // Handle successors.
- for (BasicBlock *Succ : successors(BB))
- if (!TakeFoldCandidate || TheOnlySucc == Succ) {
- if (L.contains(Succ))
- LiveLoopBlocks.insert(Succ);
- else
- LiveExitBlocks.insert(Succ);
- }
- }
-
- // Sanity check: amount of dead and live loop blocks should match the total
- // number of blocks in loop.
- assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
- "Malformed block sets?");
-
- // Now, all exit blocks that are not marked as live are dead.
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- SmallPtrSet<BasicBlock *, 8> UniqueDeadExits;
- for (auto *ExitBlock : ExitBlocks)
- if (!LiveExitBlocks.count(ExitBlock) &&
- UniqueDeadExits.insert(ExitBlock).second)
- DeadExitBlocks.push_back(ExitBlock);
-
- // Whether or not the edge From->To will still be present in graph after the
- // folding.
- auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) {
- if (!LiveLoopBlocks.count(From))
- return false;
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From);
- return !TheOnlySucc || TheOnlySucc == To || LI.getLoopFor(From) != &L;
- };
-
- // The loop will not be destroyed if its latch is live.
- DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader());
-
- // If we are going to delete the current loop completely, no extra analysis
- // is needed.
- if (DeleteCurrentLoop)
- return;
-
- // Otherwise, we should check which blocks will still be a part of the
- // current loop after the transform.
- BlocksInLoopAfterFolding.insert(L.getLoopLatch());
- // If the loop is live, then we should compute what blocks are still in
- // loop after all branch folding has been done. A block is in loop if
- // it has a live edge to another block that is in the loop; by definition,
- // latch is in the loop.
- auto BlockIsInLoop = [&](BasicBlock *BB) {
- return any_of(successors(BB), [&](BasicBlock *Succ) {
- return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ);
- });
- };
- for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) {
- BasicBlock *BB = *I;
- if (BlockIsInLoop(BB))
- BlocksInLoopAfterFolding.insert(BB);
- }
-
- // Sanity check: header must be in loop.
- assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
- "Header not in loop?");
- assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
- "All blocks that stay in loop should be live!");
- }
-
- /// We need to preserve static reachability of all loop exit blocks (this is
- /// required by the loop pass manager). In order to do it, we use the following
- /// trick:
- ///
- /// preheader:
- /// <preheader code>
- /// br label %loop_header
- ///
- /// loop_header:
- /// ...
- /// br i1 false, label %dead_exit, label %loop_block
- /// ...
- ///
- /// We cannot simply remove edge from the loop to dead exit because in this
- /// case dead_exit (and its successors) may become unreachable. To avoid that,
- /// we insert the following fictive preheader:
- ///
- /// preheader:
- /// <preheader code>
- /// switch i32 0, label %preheader-split,
- /// [i32 1, label %dead_exit_1],
- /// [i32 2, label %dead_exit_2],
- /// ...
- /// [i32 N, label %dead_exit_N],
- ///
- /// preheader-split:
- /// br label %loop_header
- ///
- /// loop_header:
- /// ...
- /// br i1 false, label %dead_exit_N, label %loop_block
- /// ...
- ///
- /// Doing so, we preserve static reachability of all dead exits and can later
- /// remove edges from the loop to these blocks.
- void handleDeadExits() {
- // If no dead exits, nothing to do.
- if (DeadExitBlocks.empty())
- return;
-
- // Construct split preheader and the dummy switch to thread edges from it to
- // dead exits.
- BasicBlock *Preheader = L.getLoopPreheader();
- BasicBlock *NewPreheader = llvm::SplitBlock(
- Preheader, Preheader->getTerminator(), &DT, &LI, MSSAU);
-
- IRBuilder<> Builder(Preheader->getTerminator());
- SwitchInst *DummySwitch =
- Builder.CreateSwitch(Builder.getInt32(0), NewPreheader);
- Preheader->getTerminator()->eraseFromParent();
-
- unsigned DummyIdx = 1;
- for (BasicBlock *BB : DeadExitBlocks) {
+//===--------- LoopSimplifyCFG.cpp - Loop CFG Simplification Pass ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Loop SimplifyCFG Pass. This pass is responsible for
+// basic loop CFG cleanup, primarily to assist other loop passes. If you
+// encounter a noncanonical CFG construct that causes another loop pass to
+// perform suboptimally, this is the place to fix it up.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopSimplifyCFG.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplifycfg"
+
+static cl::opt<bool> EnableTermFolding("enable-loop-simplifycfg-term-folding",
+ cl::init(true));
+
+STATISTIC(NumTerminatorsFolded,
+ "Number of terminators folded to unconditional branches");
+STATISTIC(NumLoopBlocksDeleted,
+ "Number of loop blocks deleted");
+STATISTIC(NumLoopExitsDeleted,
+ "Number of loop exiting edges deleted");
+
+/// If \p BB is a switch or a conditional branch, but only one of its successors
+/// can be reached from this block in runtime, return this successor. Otherwise,
+/// return nullptr.
+static BasicBlock *getOnlyLiveSuccessor(BasicBlock *BB) {
+ Instruction *TI = BB->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isUnconditional())
+ return nullptr;
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ return BI->getSuccessor(0);
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ if (!Cond)
+ return nullptr;
+ return Cond->isZero() ? BI->getSuccessor(1) : BI->getSuccessor(0);
+ }
+
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!CI)
+ return nullptr;
+ for (auto Case : SI->cases())
+ if (Case.getCaseValue() == CI)
+ return Case.getCaseSuccessor();
+ return SI->getDefaultDest();
+ }
+
+ return nullptr;
+}
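+
+// For illustration (a hand-written IR sketch, not taken from this file): given
+//   br i1 false, label %dead, label %live
+// the only live successor is %live, and given
+//   switch i32 2, label %default [ i32 1, label %a
+//                                  i32 2, label %b ]
+// it is %b. If the condition is not a ConstantInt, the function returns
+// nullptr.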
+
+/// Removes \p BB from all loops from [FirstLoop, LastLoop) in parent chain.
+static void removeBlockFromLoops(BasicBlock *BB, Loop *FirstLoop,
+ Loop *LastLoop = nullptr) {
+ assert((!LastLoop || LastLoop->contains(FirstLoop->getHeader())) &&
+ "First loop is supposed to be inside of last loop!");
+ assert(FirstLoop->contains(BB) && "Must be a loop block!");
+ for (Loop *Current = FirstLoop; Current != LastLoop;
+ Current = Current->getParentLoop())
+ Current->removeBlockFromLoop(BB);
+}
+
+/// Find innermost loop that contains at least one block from \p BBs and
+/// contains the header of loop \p L.
+static Loop *getInnermostLoopFor(SmallPtrSetImpl<BasicBlock *> &BBs,
+ Loop &L, LoopInfo &LI) {
+ Loop *Innermost = nullptr;
+ for (BasicBlock *BB : BBs) {
+ Loop *BBL = LI.getLoopFor(BB);
+ while (BBL && !BBL->contains(L.getHeader()))
+ BBL = BBL->getParentLoop();
+ if (BBL == &L)
+ BBL = BBL->getParentLoop();
+ if (!BBL)
+ continue;
+ if (!Innermost || BBL->getLoopDepth() > Innermost->getLoopDepth())
+ Innermost = BBL;
+ }
+ return Innermost;
+}
+
+namespace {
+/// Helper class that can turn branches and switches with constant conditions
+/// into unconditional branches.
+class ConstantTerminatorFoldingImpl {
+private:
+ Loop &L;
+ LoopInfo &LI;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ MemorySSAUpdater *MSSAU;
+ LoopBlocksDFS DFS;
+ DomTreeUpdater DTU;
+ SmallVector<DominatorTree::UpdateType, 16> DTUpdates;
+
+ // Whether or not the current loop has irreducible CFG.
+ bool HasIrreducibleCFG = false;
+  // Whether or not the current loop will still exist after terminator constant
+  // folding is done. In theory, there are two ways this can happen:
+  // 1. The loop's latch(es) become unreachable from the loop header;
+  // 2. The loop's header becomes unreachable from the method entry.
+  // In practice, the second situation is impossible because we only modify
+  // the current loop and its preheader and do not affect the preheader's
+  // reachability from any other block. So this variable being set to true
+  // means that the loop's latch has become unreachable from the loop header.
+ bool DeleteCurrentLoop = false;
+
+ // The blocks of the original loop that will still be reachable from entry
+ // after the constant folding.
+ SmallPtrSet<BasicBlock *, 8> LiveLoopBlocks;
+ // The blocks of the original loop that will become unreachable from entry
+ // after the constant folding.
+ SmallVector<BasicBlock *, 8> DeadLoopBlocks;
+ // The exits of the original loop that will still be reachable from entry
+ // after the constant folding.
+ SmallPtrSet<BasicBlock *, 8> LiveExitBlocks;
+ // The exits of the original loop that will become unreachable from entry
+ // after the constant folding.
+ SmallVector<BasicBlock *, 8> DeadExitBlocks;
+ // The blocks that will still be a part of the current loop after folding.
+ SmallPtrSet<BasicBlock *, 8> BlocksInLoopAfterFolding;
+ // The blocks that have terminators with constant condition that can be
+ // folded. Note: fold candidates should be in L but not in any of its
+ // subloops to avoid complex LI updates.
+ SmallVector<BasicBlock *, 8> FoldCandidates;
+
+ void dump() const {
+ dbgs() << "Constant terminator folding for loop " << L << "\n";
+ dbgs() << "After terminator constant-folding, the loop will";
+ if (!DeleteCurrentLoop)
+ dbgs() << " not";
+ dbgs() << " be destroyed\n";
+ auto PrintOutVector = [&](const char *Message,
+ const SmallVectorImpl<BasicBlock *> &S) {
+ dbgs() << Message << "\n";
+ for (const BasicBlock *BB : S)
+ dbgs() << "\t" << BB->getName() << "\n";
+ };
+ auto PrintOutSet = [&](const char *Message,
+ const SmallPtrSetImpl<BasicBlock *> &S) {
+ dbgs() << Message << "\n";
+ for (const BasicBlock *BB : S)
+ dbgs() << "\t" << BB->getName() << "\n";
+ };
+ PrintOutVector("Blocks in which we can constant-fold terminator:",
+ FoldCandidates);
+ PrintOutSet("Live blocks from the original loop:", LiveLoopBlocks);
+ PrintOutVector("Dead blocks from the original loop:", DeadLoopBlocks);
+ PrintOutSet("Live exit blocks:", LiveExitBlocks);
+ PrintOutVector("Dead exit blocks:", DeadExitBlocks);
+ if (!DeleteCurrentLoop)
+ PrintOutSet("The following blocks will still be part of the loop:",
+ BlocksInLoopAfterFolding);
+ }
+
+ /// Whether or not the current loop has irreducible CFG.
+ bool hasIrreducibleCFG(LoopBlocksDFS &DFS) {
+ assert(DFS.isComplete() && "DFS is expected to be finished");
+ // Index of a basic block in RPO traversal.
+ DenseMap<const BasicBlock *, unsigned> RPO;
+ unsigned Current = 0;
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I)
+ RPO[*I] = Current++;
+
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ for (auto *Succ : successors(BB))
+ if (L.contains(Succ) && !LI.isLoopHeader(Succ) && RPO[BB] > RPO[Succ])
+          // If an edge goes from a block with a greater order number into a
+          // block with a lesser number, and it is not a loop backedge, then it
+          // can only be a part of an irreducible non-loop cycle.
+ return true;
+ }
+ return false;
+ }
+
+  /// Fill in all information about the status of blocks and exits of the
+  /// current loop, assuming constant folding of all branches is performed.
+ void analyze() {
+ DFS.perform(&LI);
+ assert(DFS.isComplete() && "DFS is expected to be finished");
+
+ // TODO: The algorithm below relies on both RPO and Postorder traversals.
+ // When the loop has only reducible CFG inside, then the invariant "all
+ // predecessors of X are processed before X in RPO" is preserved. However
+ // an irreducible loop can break this invariant (e.g. latch does not have to
+ // be the last block in the traversal in this case, and the algorithm relies
+ // on this). We can later decide to support such cases by altering the
+ // algorithms, but so far we just give up analyzing them.
+ if (hasIrreducibleCFG(DFS)) {
+ HasIrreducibleCFG = true;
+ return;
+ }
+
+ // Collect live and dead loop blocks and exits.
+ LiveLoopBlocks.insert(L.getHeader());
+ for (auto I = DFS.beginRPO(), E = DFS.endRPO(); I != E; ++I) {
+ BasicBlock *BB = *I;
+
+ // If a loop block wasn't marked as live so far, then it's dead.
+ if (!LiveLoopBlocks.count(BB)) {
+ DeadLoopBlocks.push_back(BB);
+ continue;
+ }
+
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+
+      // If a block has only one live successor, it's a candidate for constant
+ // folding. Only handle blocks from current loop: branches in child loops
+ // are skipped because if they can be folded, they should be folded during
+ // the processing of child loops.
+ bool TakeFoldCandidate = TheOnlySucc && LI.getLoopFor(BB) == &L;
+ if (TakeFoldCandidate)
+ FoldCandidates.push_back(BB);
+
+ // Handle successors.
+ for (BasicBlock *Succ : successors(BB))
+ if (!TakeFoldCandidate || TheOnlySucc == Succ) {
+ if (L.contains(Succ))
+ LiveLoopBlocks.insert(Succ);
+ else
+ LiveExitBlocks.insert(Succ);
+ }
+ }
+
+    // Sanity check: the number of dead and live loop blocks should match the
+    // total number of blocks in the loop.
+ assert(L.getNumBlocks() == LiveLoopBlocks.size() + DeadLoopBlocks.size() &&
+ "Malformed block sets?");
+
+ // Now, all exit blocks that are not marked as live are dead.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ SmallPtrSet<BasicBlock *, 8> UniqueDeadExits;
+ for (auto *ExitBlock : ExitBlocks)
+ if (!LiveExitBlocks.count(ExitBlock) &&
+ UniqueDeadExits.insert(ExitBlock).second)
+ DeadExitBlocks.push_back(ExitBlock);
+
+    // Whether or not the edge From->To will still be present in the graph
+    // after the folding.
+ auto IsEdgeLive = [&](BasicBlock *From, BasicBlock *To) {
+ if (!LiveLoopBlocks.count(From))
+ return false;
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(From);
+ return !TheOnlySucc || TheOnlySucc == To || LI.getLoopFor(From) != &L;
+ };
+
+ // The loop will not be destroyed if its latch is live.
+ DeleteCurrentLoop = !IsEdgeLive(L.getLoopLatch(), L.getHeader());
+
+ // If we are going to delete the current loop completely, no extra analysis
+ // is needed.
+ if (DeleteCurrentLoop)
+ return;
+
+ // Otherwise, we should check which blocks will still be a part of the
+ // current loop after the transform.
+ BlocksInLoopAfterFolding.insert(L.getLoopLatch());
+    // If the loop is live, then we should compute which blocks are still in
+    // the loop after all branch folding has been done. A block is in the loop
+    // if it has a live edge to another block that is in the loop; by
+    // definition, the latch is in the loop.
+ auto BlockIsInLoop = [&](BasicBlock *BB) {
+ return any_of(successors(BB), [&](BasicBlock *Succ) {
+ return BlocksInLoopAfterFolding.count(Succ) && IsEdgeLive(BB, Succ);
+ });
+ };
+ for (auto I = DFS.beginPostorder(), E = DFS.endPostorder(); I != E; ++I) {
+ BasicBlock *BB = *I;
+ if (BlockIsInLoop(BB))
+ BlocksInLoopAfterFolding.insert(BB);
+ }
+
+ // Sanity check: header must be in loop.
+ assert(BlocksInLoopAfterFolding.count(L.getHeader()) &&
+ "Header not in loop?");
+ assert(BlocksInLoopAfterFolding.size() <= LiveLoopBlocks.size() &&
+ "All blocks that stay in loop should be live!");
+ }
+
+  /// We need to preserve static reachability of all loop exit blocks (this is
+  /// required by the loop pass manager). In order to do it, we use the
+  /// following trick:
+ ///
+ /// preheader:
+ /// <preheader code>
+ /// br label %loop_header
+ ///
+ /// loop_header:
+ /// ...
+ /// br i1 false, label %dead_exit, label %loop_block
+ /// ...
+ ///
+  /// We cannot simply remove the edge from the loop to the dead exit because
+  /// in this case dead_exit (and its successors) may become unreachable. To
+  /// avoid that, we insert the following dummy preheader:
+ ///
+ /// preheader:
+ /// <preheader code>
+ /// switch i32 0, label %preheader-split,
+ /// [i32 1, label %dead_exit_1],
+ /// [i32 2, label %dead_exit_2],
+ /// ...
+ /// [i32 N, label %dead_exit_N],
+ ///
+ /// preheader-split:
+ /// br label %loop_header
+ ///
+ /// loop_header:
+ /// ...
+ /// br i1 false, label %dead_exit_N, label %loop_block
+ /// ...
+ ///
+  /// Doing so, we preserve static reachability of all dead exits and can later
+ /// remove edges from the loop to these blocks.
+ void handleDeadExits() {
+ // If no dead exits, nothing to do.
+ if (DeadExitBlocks.empty())
+ return;
+
+ // Construct split preheader and the dummy switch to thread edges from it to
+ // dead exits.
+ BasicBlock *Preheader = L.getLoopPreheader();
+ BasicBlock *NewPreheader = llvm::SplitBlock(
+ Preheader, Preheader->getTerminator(), &DT, &LI, MSSAU);
+
+ IRBuilder<> Builder(Preheader->getTerminator());
+ SwitchInst *DummySwitch =
+ Builder.CreateSwitch(Builder.getInt32(0), NewPreheader);
+ Preheader->getTerminator()->eraseFromParent();
+
+ unsigned DummyIdx = 1;
+ for (BasicBlock *BB : DeadExitBlocks) {
// Eliminate all Phis and LandingPads from dead exits.
// TODO: Consider removing all instructions in this dead block.
SmallVector<Instruction *, 4> DeadInstructions;
- for (auto &PN : BB->phis())
+ for (auto &PN : BB->phis())
DeadInstructions.push_back(&PN);
-
+
if (auto *LandingPad = dyn_cast<LandingPadInst>(BB->getFirstNonPHI()))
DeadInstructions.emplace_back(LandingPad);
for (Instruction *I : DeadInstructions) {
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
- }
-
- assert(DummyIdx != 0 && "Too many dead exits!");
- DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB);
- DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
- ++NumLoopExitsDeleted;
- }
-
- assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
- if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
- // When we break dead edges, the outer loop may become unreachable from
- // the current loop. We need to fix loop info accordingly. For this, we
- // find the most nested loop that still contains L and remove L from all
- // loops that are inside of it.
- Loop *StillReachable = getInnermostLoopFor(LiveExitBlocks, L, LI);
-
- // Okay, our loop is no longer in the outer loop (and maybe not in some of
- // its parents as well). Make the fixup.
- if (StillReachable != OuterLoop) {
- LI.changeLoopFor(NewPreheader, StillReachable);
- removeBlockFromLoops(NewPreheader, OuterLoop, StillReachable);
- for (auto *BB : L.blocks())
- removeBlockFromLoops(BB, OuterLoop, StillReachable);
- OuterLoop->removeChildLoop(&L);
- if (StillReachable)
- StillReachable->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
-
- // Some values from loops in [OuterLoop, StillReachable) could be used
- // in the current loop. Now it is not their child anymore, so such uses
- // require LCSSA Phis.
- Loop *FixLCSSALoop = OuterLoop;
- while (FixLCSSALoop->getParentLoop() != StillReachable)
- FixLCSSALoop = FixLCSSALoop->getParentLoop();
- assert(FixLCSSALoop && "Should be a loop!");
- // We need all DT updates to be done before forming LCSSA.
- if (MSSAU)
+ }
+
+ assert(DummyIdx != 0 && "Too many dead exits!");
+ DummySwitch->addCase(Builder.getInt32(DummyIdx++), BB);
+ DTUpdates.push_back({DominatorTree::Insert, Preheader, BB});
+ ++NumLoopExitsDeleted;
+ }
+
+ assert(L.getLoopPreheader() == NewPreheader && "Malformed CFG?");
+ if (Loop *OuterLoop = LI.getLoopFor(Preheader)) {
+ // When we break dead edges, the outer loop may become unreachable from
+ // the current loop. We need to fix loop info accordingly. For this, we
+ // find the most nested loop that still contains L and remove L from all
+ // loops that are inside of it.
+ Loop *StillReachable = getInnermostLoopFor(LiveExitBlocks, L, LI);
+
+ // Okay, our loop is no longer in the outer loop (and maybe not in some of
+ // its parents as well). Make the fixup.
+ if (StillReachable != OuterLoop) {
+ LI.changeLoopFor(NewPreheader, StillReachable);
+ removeBlockFromLoops(NewPreheader, OuterLoop, StillReachable);
+ for (auto *BB : L.blocks())
+ removeBlockFromLoops(BB, OuterLoop, StillReachable);
+ OuterLoop->removeChildLoop(&L);
+ if (StillReachable)
+ StillReachable->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Some values from loops in [OuterLoop, StillReachable) could be used
+ // in the current loop. Now it is not their child anymore, so such uses
+ // require LCSSA Phis.
+ Loop *FixLCSSALoop = OuterLoop;
+ while (FixLCSSALoop->getParentLoop() != StillReachable)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+ assert(FixLCSSALoop && "Should be a loop!");
+ // We need all DT updates to be done before forming LCSSA.
+ if (MSSAU)
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
else
DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- formLCSSARecursively(*FixLCSSALoop, DT, &LI, &SE);
- }
- }
-
- if (MSSAU) {
- // Clear all updates now. Facilitates deletes that follow.
+ DTUpdates.clear();
+ formLCSSARecursively(*FixLCSSALoop, DT, &LI, &SE);
+ }
+ }
+
+ if (MSSAU) {
+ // Clear all updates now. Facilitates deletes that follow.
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
- DTUpdates.clear();
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- }
- }
-
- /// Delete loop blocks that have become unreachable after folding. Make all
- /// relevant updates to DT and LI.
- void deleteDeadLoopBlocks() {
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> DeadLoopBlocksSet(DeadLoopBlocks.begin(),
- DeadLoopBlocks.end());
- MSSAU->removeBlocks(DeadLoopBlocksSet);
- }
-
- // The function LI.erase has some invariants that need to be preserved when
- // it tries to remove a loop which is not the top-level loop. In particular,
- // it requires loop's preheader to be strictly in loop's parent. We cannot
- // just remove blocks one by one, because after removal of preheader we may
- // break this invariant for the dead loop. So we detatch and erase all dead
- // loops beforehand.
- for (auto *BB : DeadLoopBlocks)
- if (LI.isLoopHeader(BB)) {
- assert(LI.getLoopFor(BB) != &L && "Attempt to remove current loop!");
- Loop *DL = LI.getLoopFor(BB);
+ DTUpdates.clear();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ }
+ }
+
+ /// Delete loop blocks that have become unreachable after folding. Make all
+ /// relevant updates to DT and LI.
+ void deleteDeadLoopBlocks() {
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> DeadLoopBlocksSet(DeadLoopBlocks.begin(),
+ DeadLoopBlocks.end());
+ MSSAU->removeBlocks(DeadLoopBlocksSet);
+ }
+
+ // The function LI.erase has some invariants that need to be preserved when
+ // it tries to remove a loop which is not the top-level loop. In particular,
+    // it requires the loop's preheader to be strictly inside the loop's
+    // parent. We cannot just remove blocks one by one, because after removal
+    // of the preheader we may break this invariant for the dead loop. So we
+    // detach and erase all dead loops beforehand.
+ for (auto *BB : DeadLoopBlocks)
+ if (LI.isLoopHeader(BB)) {
+ assert(LI.getLoopFor(BB) != &L && "Attempt to remove current loop!");
+ Loop *DL = LI.getLoopFor(BB);
if (!DL->isOutermost()) {
- for (auto *PL = DL->getParentLoop(); PL; PL = PL->getParentLoop())
- for (auto *BB : DL->getBlocks())
- PL->removeBlockFromLoop(BB);
- DL->getParentLoop()->removeChildLoop(DL);
- LI.addTopLevelLoop(DL);
- }
- LI.erase(DL);
- }
-
- for (auto *BB : DeadLoopBlocks) {
- assert(BB != L.getHeader() &&
- "Header of the current loop cannot be dead!");
- LLVM_DEBUG(dbgs() << "Deleting dead loop block " << BB->getName()
- << "\n");
- LI.removeBlock(BB);
- }
-
- DetatchDeadBlocks(DeadLoopBlocks, &DTUpdates, /*KeepOneInputPHIs*/true);
- DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- for (auto *BB : DeadLoopBlocks)
- DTU.deleteBB(BB);
-
- NumLoopBlocksDeleted += DeadLoopBlocks.size();
- }
-
- /// Constant-fold terminators of blocks acculumated in FoldCandidates into the
- /// unconditional branches.
- void foldTerminators() {
- for (BasicBlock *BB : FoldCandidates) {
- assert(LI.getLoopFor(BB) == &L && "Should be a loop block!");
- BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
- assert(TheOnlySucc && "Should have one live successor!");
-
- LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName()
- << " with an unconditional branch to the block "
- << TheOnlySucc->getName() << "\n");
-
- SmallPtrSet<BasicBlock *, 2> DeadSuccessors;
- // Remove all BB's successors except for the live one.
- unsigned TheOnlySuccDuplicates = 0;
- for (auto *Succ : successors(BB))
- if (Succ != TheOnlySucc) {
- DeadSuccessors.insert(Succ);
- // If our successor lies in a different loop, we don't want to remove
- // the one-input Phi because it is a LCSSA Phi.
- bool PreserveLCSSAPhi = !L.contains(Succ);
- Succ->removePredecessor(BB, PreserveLCSSAPhi);
- if (MSSAU)
- MSSAU->removeEdge(BB, Succ);
- } else
- ++TheOnlySuccDuplicates;
-
- assert(TheOnlySuccDuplicates > 0 && "Should be!");
- // If TheOnlySucc was BB's successor more than once, after transform it
- // will be its successor only once. Remove redundant inputs from
- // TheOnlySucc's Phis.
- bool PreserveLCSSAPhi = !L.contains(TheOnlySucc);
- for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup)
- TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi);
- if (MSSAU && TheOnlySuccDuplicates > 1)
- MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc);
-
- IRBuilder<> Builder(BB->getContext());
- Instruction *Term = BB->getTerminator();
- Builder.SetInsertPoint(Term);
- Builder.CreateBr(TheOnlySucc);
- Term->eraseFromParent();
-
- for (auto *DeadSucc : DeadSuccessors)
- DTUpdates.push_back({DominatorTree::Delete, BB, DeadSucc});
-
- ++NumTerminatorsFolded;
- }
- }
-
-public:
- ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT,
- ScalarEvolution &SE,
- MemorySSAUpdater *MSSAU)
- : L(L), LI(LI), DT(DT), SE(SE), MSSAU(MSSAU), DFS(&L),
- DTU(DT, DomTreeUpdater::UpdateStrategy::Eager) {}
- bool run() {
- assert(L.getLoopLatch() && "Should be single latch!");
-
- // Collect all available information about status of blocks after constant
- // folding.
- analyze();
- BasicBlock *Header = L.getHeader();
- (void)Header;
-
- LLVM_DEBUG(dbgs() << "In function " << Header->getParent()->getName()
- << ": ");
-
- if (HasIrreducibleCFG) {
- LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n");
- return false;
- }
-
- // Nothing to constant-fold.
- if (FoldCandidates.empty()) {
- LLVM_DEBUG(
- dbgs() << "No constant terminator folding candidates found in loop "
- << Header->getName() << "\n");
- return false;
- }
-
- // TODO: Support deletion of the current loop.
- if (DeleteCurrentLoop) {
- LLVM_DEBUG(
- dbgs()
- << "Give up constant terminator folding in loop " << Header->getName()
- << ": we don't currently support deletion of the current loop.\n");
- return false;
- }
-
- // TODO: Support blocks that are not dead, but also not in loop after the
- // folding.
- if (BlocksInLoopAfterFolding.size() + DeadLoopBlocks.size() !=
- L.getNumBlocks()) {
- LLVM_DEBUG(
- dbgs() << "Give up constant terminator folding in loop "
- << Header->getName() << ": we don't currently"
- " support blocks that are not dead, but will stop "
- "being a part of the loop after constant-folding.\n");
- return false;
- }
-
- SE.forgetTopmostLoop(&L);
- // Dump analysis results.
- LLVM_DEBUG(dump());
-
- LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size()
- << " terminators in loop " << Header->getName() << "\n");
-
- // Make the actual transforms.
- handleDeadExits();
- foldTerminators();
-
- if (!DeadLoopBlocks.empty()) {
- LLVM_DEBUG(dbgs() << "Deleting " << DeadLoopBlocks.size()
- << " dead blocks in loop " << Header->getName() << "\n");
- deleteDeadLoopBlocks();
- } else {
- // If we didn't do updates inside deleteDeadLoopBlocks, do them here.
- DTU.applyUpdates(DTUpdates);
- DTUpdates.clear();
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
-#ifndef NDEBUG
- // Make sure that we have preserved all data structures after the transform.
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full) &&
- "DT broken after transform!");
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast) &&
- "DT broken after transform!");
-#endif
- assert(DT.isReachableFromEntry(Header));
- LI.verify(DT);
-#endif
-
- return true;
- }
-
- bool foldingBreaksCurrentLoop() const {
- return DeleteCurrentLoop;
- }
-};
-} // namespace
-
-/// Turn branches and switches with known constant conditions into unconditional
-/// branches.
-static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE,
- MemorySSAUpdater *MSSAU,
- bool &IsLoopDeleted) {
- if (!EnableTermFolding)
- return false;
-
- // To keep things simple, only process loops with single latch. We
- // canonicalize most loops to this form. We can support multi-latch if needed.
- if (!L.getLoopLatch())
- return false;
-
- ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, SE, MSSAU);
- bool Changed = BranchFolder.run();
- IsLoopDeleted = Changed && BranchFolder.foldingBreaksCurrentLoop();
- return Changed;
-}
-
-static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
- LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- bool Changed = false;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- // Copy blocks into a temporary array to avoid iterator invalidation issues
- // as we remove them.
- SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
-
- for (auto &Block : Blocks) {
- // Attempt to merge blocks in the trivial case. Don't modify blocks which
- // belong to other loops.
- BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
- if (!Succ)
- continue;
-
- BasicBlock *Pred = Succ->getSinglePredecessor();
- if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
- continue;
-
- // Merge Succ into Pred and delete it.
- MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- Changed = true;
- }
-
- return Changed;
-}
-
-static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE, MemorySSAUpdater *MSSAU,
- bool &IsLoopDeleted) {
- bool Changed = false;
-
- // Constant-fold terminators with known constant conditions.
- Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, IsLoopDeleted);
-
- if (IsLoopDeleted)
- return true;
-
- // Eliminate unconditional branches by merging blocks into their predecessors.
- Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
-
- if (Changed)
- SE.forgetTopmostLoop(&L);
-
- return Changed;
-}
-
-PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &LPMU) {
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA)
- MSSAU = MemorySSAUpdater(AR.MSSA);
- bool DeleteCurrentLoop = false;
- if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
- DeleteCurrentLoop))
- return PreservedAnalyses::all();
-
- if (DeleteCurrentLoop)
- LPMU.markLoopAsDeleted(L, "loop-simplifycfg");
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-class LoopSimplifyCFGLegacyPass : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
- initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
- bool DeleteCurrentLoop = false;
- bool Changed = simplifyLoopCFG(
- *L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
- DeleteCurrentLoop);
- if (DeleteCurrentLoop)
- LPM.markLoopAsDeleted(*L);
- return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- AU.addPreserved<DependenceAnalysisWrapperPass>();
- getLoopAnalysisUsage(AU);
- }
-};
-} // end namespace
-
-char LoopSimplifyCFGLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
- "Simplify loop CFG", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
- "Simplify loop CFG", false, false)
-
-Pass *llvm::createLoopSimplifyCFGPass() {
- return new LoopSimplifyCFGLegacyPass();
-}
+ for (auto *PL = DL->getParentLoop(); PL; PL = PL->getParentLoop())
+ for (auto *BB : DL->getBlocks())
+ PL->removeBlockFromLoop(BB);
+ DL->getParentLoop()->removeChildLoop(DL);
+ LI.addTopLevelLoop(DL);
+ }
+ LI.erase(DL);
+ }
+
+ for (auto *BB : DeadLoopBlocks) {
+ assert(BB != L.getHeader() &&
+ "Header of the current loop cannot be dead!");
+ LLVM_DEBUG(dbgs() << "Deleting dead loop block " << BB->getName()
+ << "\n");
+ LI.removeBlock(BB);
+ }
+
+ DetatchDeadBlocks(DeadLoopBlocks, &DTUpdates, /*KeepOneInputPHIs*/true);
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+ for (auto *BB : DeadLoopBlocks)
+ DTU.deleteBB(BB);
+
+ NumLoopBlocksDeleted += DeadLoopBlocks.size();
+ }
+
+  /// Constant-fold terminators of blocks accumulated in FoldCandidates into
+  /// unconditional branches.
+ void foldTerminators() {
+ for (BasicBlock *BB : FoldCandidates) {
+ assert(LI.getLoopFor(BB) == &L && "Should be a loop block!");
+ BasicBlock *TheOnlySucc = getOnlyLiveSuccessor(BB);
+ assert(TheOnlySucc && "Should have one live successor!");
+
+ LLVM_DEBUG(dbgs() << "Replacing terminator of " << BB->getName()
+ << " with an unconditional branch to the block "
+ << TheOnlySucc->getName() << "\n");
+
+ SmallPtrSet<BasicBlock *, 2> DeadSuccessors;
+ // Remove all BB's successors except for the live one.
+ unsigned TheOnlySuccDuplicates = 0;
+ for (auto *Succ : successors(BB))
+ if (Succ != TheOnlySucc) {
+ DeadSuccessors.insert(Succ);
+ // If our successor lies in a different loop, we don't want to remove
+ // the one-input Phi because it is a LCSSA Phi.
+ bool PreserveLCSSAPhi = !L.contains(Succ);
+ Succ->removePredecessor(BB, PreserveLCSSAPhi);
+ if (MSSAU)
+ MSSAU->removeEdge(BB, Succ);
+ } else
+ ++TheOnlySuccDuplicates;
+
+ assert(TheOnlySuccDuplicates > 0 && "Should be!");
+ // If TheOnlySucc was BB's successor more than once, after transform it
+ // will be its successor only once. Remove redundant inputs from
+ // TheOnlySucc's Phis.
+ bool PreserveLCSSAPhi = !L.contains(TheOnlySucc);
+ for (unsigned Dup = 1; Dup < TheOnlySuccDuplicates; ++Dup)
+ TheOnlySucc->removePredecessor(BB, PreserveLCSSAPhi);
+ if (MSSAU && TheOnlySuccDuplicates > 1)
+ MSSAU->removeDuplicatePhiEdgesBetween(BB, TheOnlySucc);
+
+ IRBuilder<> Builder(BB->getContext());
+ Instruction *Term = BB->getTerminator();
+ Builder.SetInsertPoint(Term);
+ Builder.CreateBr(TheOnlySucc);
+ Term->eraseFromParent();
+
+ for (auto *DeadSucc : DeadSuccessors)
+ DTUpdates.push_back({DominatorTree::Delete, BB, DeadSucc});
+
+ ++NumTerminatorsFolded;
+ }
+ }
+
+public:
+ ConstantTerminatorFoldingImpl(Loop &L, LoopInfo &LI, DominatorTree &DT,
+ ScalarEvolution &SE,
+ MemorySSAUpdater *MSSAU)
+ : L(L), LI(LI), DT(DT), SE(SE), MSSAU(MSSAU), DFS(&L),
+ DTU(DT, DomTreeUpdater::UpdateStrategy::Eager) {}
+ bool run() {
+ assert(L.getLoopLatch() && "Should be single latch!");
+
+ // Collect all available information about status of blocks after constant
+ // folding.
+ analyze();
+ BasicBlock *Header = L.getHeader();
+ (void)Header;
+
+ LLVM_DEBUG(dbgs() << "In function " << Header->getParent()->getName()
+ << ": ");
+
+ if (HasIrreducibleCFG) {
+ LLVM_DEBUG(dbgs() << "Loops with irreducible CFG are not supported!\n");
+ return false;
+ }
+
+ // Nothing to constant-fold.
+ if (FoldCandidates.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "No constant terminator folding candidates found in loop "
+ << Header->getName() << "\n");
+ return false;
+ }
+
+ // TODO: Support deletion of the current loop.
+ if (DeleteCurrentLoop) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Give up constant terminator folding in loop " << Header->getName()
+ << ": we don't currently support deletion of the current loop.\n");
+ return false;
+ }
+
+ // TODO: Support blocks that are not dead, but also not in loop after the
+ // folding.
+ if (BlocksInLoopAfterFolding.size() + DeadLoopBlocks.size() !=
+ L.getNumBlocks()) {
+ LLVM_DEBUG(
+ dbgs() << "Give up constant terminator folding in loop "
+ << Header->getName() << ": we don't currently"
+ " support blocks that are not dead, but will stop "
+ "being a part of the loop after constant-folding.\n");
+ return false;
+ }
+
+ SE.forgetTopmostLoop(&L);
+ // Dump analysis results.
+ LLVM_DEBUG(dump());
+
+ LLVM_DEBUG(dbgs() << "Constant-folding " << FoldCandidates.size()
+ << " terminators in loop " << Header->getName() << "\n");
+
+ // Make the actual transforms.
+ handleDeadExits();
+ foldTerminators();
+
+ if (!DeadLoopBlocks.empty()) {
+ LLVM_DEBUG(dbgs() << "Deleting " << DeadLoopBlocks.size()
+ << " dead blocks in loop " << Header->getName() << "\n");
+ deleteDeadLoopBlocks();
+ } else {
+ // If we didn't do updates inside deleteDeadLoopBlocks, do them here.
+ DTU.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+#ifndef NDEBUG
+ // Make sure that we have preserved all data structures after the transform.
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full) &&
+ "DT broken after transform!");
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast) &&
+ "DT broken after transform!");
+#endif
+ assert(DT.isReachableFromEntry(Header));
+ LI.verify(DT);
+#endif
+
+ return true;
+ }
+
+ bool foldingBreaksCurrentLoop() const {
+ return DeleteCurrentLoop;
+ }
+};
+} // namespace
+
+/// Turn branches and switches with known constant conditions into unconditional
+/// branches.
+static bool constantFoldTerminators(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE,
+ MemorySSAUpdater *MSSAU,
+ bool &IsLoopDeleted) {
+ if (!EnableTermFolding)
+ return false;
+
+  // To keep things simple, only process loops with a single latch. We
+ // canonicalize most loops to this form. We can support multi-latch if needed.
+ if (!L.getLoopLatch())
+ return false;
+
+ ConstantTerminatorFoldingImpl BranchFolder(L, LI, DT, SE, MSSAU);
+ bool Changed = BranchFolder.run();
+ IsLoopDeleted = Changed && BranchFolder.foldingBreaksCurrentLoop();
+ return Changed;
+}
+
+static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
+ LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ bool Changed = false;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ // Copy blocks into a temporary array to avoid iterator invalidation issues
+ // as we remove them.
+ SmallVector<WeakTrackingVH, 16> Blocks(L.blocks());
+
+ for (auto &Block : Blocks) {
+ // Attempt to merge blocks in the trivial case. Don't modify blocks which
+ // belong to other loops.
+ BasicBlock *Succ = cast_or_null<BasicBlock>(Block);
+ if (!Succ)
+ continue;
+
+ BasicBlock *Pred = Succ->getSinglePredecessor();
+ if (!Pred || !Pred->getSingleSuccessor() || LI.getLoopFor(Pred) != &L)
+ continue;
+
+ // Merge Succ into Pred and delete it.
+ MergeBlockIntoPredecessor(Succ, &DTU, &LI, MSSAU);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE, MemorySSAUpdater *MSSAU,
+ bool &IsLoopDeleted) {
+ bool Changed = false;
+
+ // Constant-fold terminators with known constant conditions.
+ Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, IsLoopDeleted);
+
+ if (IsLoopDeleted)
+ return true;
+
+ // Eliminate unconditional branches by merging blocks into their predecessors.
+ Changed |= mergeBlocksIntoPredecessors(L, DT, LI, MSSAU);
+
+ if (Changed)
+ SE.forgetTopmostLoop(&L);
+
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyCFGPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &LPMU) {
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA)
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ bool DeleteCurrentLoop = false;
+ if (!simplifyLoopCFG(L, AR.DT, AR.LI, AR.SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+ DeleteCurrentLoop))
+ return PreservedAnalyses::all();
+
+ if (DeleteCurrentLoop)
+ LPMU.markLoopAsDeleted(L, "loop-simplifycfg");
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+class LoopSimplifyCFGLegacyPass : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ LoopSimplifyCFGLegacyPass() : LoopPass(ID) {
+ initializeLoopSimplifyCFGLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+ bool DeleteCurrentLoop = false;
+ bool Changed = simplifyLoopCFG(
+ *L, DT, LI, SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr,
+ DeleteCurrentLoop);
+ if (DeleteCurrentLoop)
+ LPM.markLoopAsDeleted(*L);
+ return Changed;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ getLoopAnalysisUsage(AU);
+ }
+};
+} // end namespace
+
+char LoopSimplifyCFGLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
+ "Simplify loop CFG", false, false)
+
+Pass *llvm::createLoopSimplifyCFGPass() {
+ return new LoopSimplifyCFGLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
index 5ea1f430c3..47698fdde6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopSink.cpp
@@ -1,74 +1,74 @@
-//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass does the inverse transformation of what LICM does.
-// It traverses all of the instructions in the loop's preheader and sinks
-// them to the loop body where frequency is lower than the loop's preheader.
-// This pass is a reverse-transformation of LICM. It differs from the Sink
-// pass in the following ways:
-//
-// * It only handles sinking of instructions from the loop's preheader to the
-// loop's body
-// * It uses alias set tracker to get more accurate alias info
-// * It uses block frequency info to find the optimal sinking locations
-//
-// Overall algorithm:
-//
-// For I in Preheader:
-// InsertBBs = BBs that uses I
-// For BB in sorted(LoopBBs):
-// DomBBs = BBs in InsertBBs that are dominated by BB
-// if freq(DomBBs) > freq(BB)
-// InsertBBs = UseBBs - DomBBs + BB
-// For BB in InsertBBs:
-// Insert I at BB's beginning
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopSink.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
+//===-- LoopSink.cpp - Loop Sink Pass -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does the inverse transformation of what LICM does.
+// It traverses all of the instructions in the loop's preheader and sinks
+// them into loop-body blocks whose frequency is lower than the preheader's.
+// This pass is a reverse transformation of LICM. It differs from the Sink
+// pass in the following ways:
+//
+// * It only handles sinking of instructions from the loop's preheader to the
+// loop's body
+// * It uses alias set tracker to get more accurate alias info
+// * It uses block frequency info to find the optimal sinking locations
+//
+// Overall algorithm:
+//
+// For I in Preheader:
+//   InsertBBs = BBs that use I
+// For BB in sorted(LoopBBs):
+// DomBBs = BBs in InsertBBs that are dominated by BB
+// if freq(DomBBs) > freq(BB)
+// InsertBBs = UseBBs - DomBBs + BB
+// For BB in InsertBBs:
+// Insert I at BB's beginning
+//
+//===----------------------------------------------------------------------===//
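+
+// Illustrative example (a sketch under assumed block frequencies, not taken
+// from a test): if %t = add i64 %a, %b is computed in the preheader but its
+// only use sits in a loop block that is colder than the preheader, the add is
+// sunk to the beginning of that block; if the adjusted total frequency of its
+// use blocks exceeds the preheader's, the instruction stays in the preheader.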
+
+#include "llvm/Transforms/Scalar/LoopSink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loopsink"
-
-STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
-STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
-
-static cl::opt<unsigned> SinkFrequencyPercentThreshold(
- "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
- cl::desc("Do not sink instructions that require cloning unless they "
- "execute less than this percent of the time."));
-
-static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
- "max-uses-for-sinking", cl::Hidden, cl::init(30),
- cl::desc("Do not sink instructions that have too many uses."));
-
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loopsink"
+
+STATISTIC(NumLoopSunk, "Number of instructions sunk into loop");
+STATISTIC(NumLoopSunkCloned, "Number of cloned instructions sunk into loop");
+
+static cl::opt<unsigned> SinkFrequencyPercentThreshold(
+ "sink-freq-percent-threshold", cl::Hidden, cl::init(90),
+ cl::desc("Do not sink instructions that require cloning unless they "
+ "execute less than this percent of the time."));
+
+static cl::opt<unsigned> MaxNumberOfUseBBsForSinking(
+ "max-uses-for-sinking", cl::Hidden, cl::init(30),
+ cl::desc("Do not sink instructions that have too many uses."));
+
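+// Illustrative tuning sketch (not part of the original file; the flag names
+// are the ones defined above, and "loop-sink" is assumed to be the pass name
+// in the new pass manager):
+//   opt -passes=loop-sink -sink-freq-percent-threshold=80 \
+//       -max-uses-for-sinking=10 input.ll -S
+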
static cl::opt<bool> EnableMSSAInLoopSink(
"enable-mssa-in-loop-sink", cl::Hidden, cl::init(true),
cl::desc("Enable MemorySSA for LoopSink in new pass manager"));
@@ -77,167 +77,167 @@ static cl::opt<bool> EnableMSSAInLegacyLoopSink(
"enable-mssa-in-legacy-loop-sink", cl::Hidden, cl::init(false),
cl::desc("Enable MemorySSA for LoopSink in legacy pass manager"));
-/// Return adjusted total frequency of \p BBs.
-///
-/// * If there is only one BB, sinking instruction will not introduce code
-/// size increase. Thus there is no need to adjust the frequency.
-/// * If there are more than one BB, sinking would lead to code size increase.
-/// In this case, we add some "tax" to the total frequency to make it harder
-/// to sink. E.g.
-/// Freq(Preheader) = 100
-/// Freq(BBs) = sum(50, 49) = 99
-/// Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheade to
-/// BBs as the difference is too small to justify the code size increase.
-/// To model this, The adjusted Freq(BBs) will be:
-/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
-static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
- BlockFrequencyInfo &BFI) {
- BlockFrequency T = 0;
- for (BasicBlock *B : BBs)
- T += BFI.getBlockFreq(B);
- if (BBs.size() > 1)
- T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
- return T;
-}
-
-/// Return a set of basic blocks to insert sinked instructions.
-///
-/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
-///
-/// * Inside the loop \p L
-/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
-/// that domintates the UseBB
-/// * Has minimum total frequency that is no greater than preheader frequency
-///
-/// The purpose of the function is to find the optimal sinking points to
-/// minimize execution cost, which is defined as "sum of frequency of
-/// BBsToSinkInto".
-/// As a result, the returned BBsToSinkInto needs to have minimum total
-/// frequency.
-/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
-/// frequency, the optimal solution is not sinking (return empty set).
-///
-/// \p ColdLoopBBs is used to help find the optimal sinking locations.
-/// It stores a list of BBs that is:
-///
-/// * Inside the loop \p L
-/// * Has a frequency no larger than the loop's preheader
-/// * Sorted by BB frequency
-///
-/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
-/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
-/// caller.
-static SmallPtrSet<BasicBlock *, 2>
-findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
- const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
- DominatorTree &DT, BlockFrequencyInfo &BFI) {
- SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
- if (UseBBs.size() == 0)
- return BBsToSinkInto;
-
- BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
- SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
-
- // For every iteration:
- // * Pick the ColdestBB from ColdLoopBBs
- // * Find the set BBsDominatedByColdestBB that satisfy:
- // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
- // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
- // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
- // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
- // BBsToSinkInto
- for (BasicBlock *ColdestBB : ColdLoopBBs) {
- BBsDominatedByColdestBB.clear();
- for (BasicBlock *SinkedBB : BBsToSinkInto)
- if (DT.dominates(ColdestBB, SinkedBB))
- BBsDominatedByColdestBB.insert(SinkedBB);
- if (BBsDominatedByColdestBB.size() == 0)
- continue;
- if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
- BFI.getBlockFreq(ColdestBB)) {
- for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
- BBsToSinkInto.erase(DominatedBB);
- }
- BBsToSinkInto.insert(ColdestBB);
- }
- }
-
- // Can't sink into blocks that have no valid insertion point.
- for (BasicBlock *BB : BBsToSinkInto) {
- if (BB->getFirstInsertionPt() == BB->end()) {
- BBsToSinkInto.clear();
- break;
- }
- }
-
- // If the total frequency of BBsToSinkInto is larger than preheader frequency,
- // do not sink.
- if (adjustedSumFreq(BBsToSinkInto, BFI) >
- BFI.getBlockFreq(L.getLoopPreheader()))
- BBsToSinkInto.clear();
- return BBsToSinkInto;
-}
-
-// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
-// sinking is successful.
-// \p LoopBlockNumber is used to sort the insertion blocks to ensure
-// determinism.
+/// Return adjusted total frequency of \p BBs.
+///
+/// * If there is only one BB, sinking an instruction will not introduce a
+///   code size increase. Thus there is no need to adjust the frequency.
+/// * If there is more than one BB, sinking would lead to a code size increase.
+/// In this case, we add some "tax" to the total frequency to make it harder
+/// to sink. E.g.
+/// Freq(Preheader) = 100
+/// Freq(BBs) = sum(50, 49) = 99
+///   Even if Freq(BBs) < Freq(Preheader), we will not sink from Preheader to
+/// BBs as the difference is too small to justify the code size increase.
+///   To model this, the adjusted Freq(BBs) will be:
+/// AdjustedFreq(BBs) = 99 / SinkFrequencyPercentThreshold%
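+///   For the numbers above this gives AdjustedFreq(BBs) = 99 / 0.9 = 110,
+///   which exceeds Freq(Preheader) = 100, so sinking is rejected (an
+///   illustrative continuation of the example above).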
+static BlockFrequency adjustedSumFreq(SmallPtrSetImpl<BasicBlock *> &BBs,
+ BlockFrequencyInfo &BFI) {
+ BlockFrequency T = 0;
+ for (BasicBlock *B : BBs)
+ T += BFI.getBlockFreq(B);
+ if (BBs.size() > 1)
+ T /= BranchProbability(SinkFrequencyPercentThreshold, 100);
+ return T;
+}
+
+/// Return a set of basic blocks into which to insert sunk instructions.
+///
+/// The returned set of basic blocks (BBsToSinkInto) should satisfy:
+///
+/// * Inside the loop \p L
+/// * For each UseBB in \p UseBBs, there is at least one BB in BBsToSinkInto
+///   that dominates the UseBB
+/// * Has minimum total frequency that is no greater than preheader frequency
+///
+/// The purpose of the function is to find the optimal sinking points to
+/// minimize execution cost, which is defined as "sum of frequency of
+/// BBsToSinkInto".
+/// As a result, the returned BBsToSinkInto needs to have minimum total
+/// frequency.
+/// Additionally, if the total frequency of BBsToSinkInto exceeds preheader
+/// frequency, the optimal solution is not sinking (return empty set).
+///
+/// \p ColdLoopBBs is used to help find the optimal sinking locations.
+/// It stores a list of BBs that is:
+///
+/// * Inside the loop \p L
+/// * Has a frequency no larger than the loop's preheader
+/// * Sorted by BB frequency
+///
+/// The complexity of the function is O(UseBBs.size() * ColdLoopBBs.size()).
+/// To avoid expensive computation, we cap the maximum UseBBs.size() in its
+/// caller.
+static SmallPtrSet<BasicBlock *, 2>
+findBBsToSinkInto(const Loop &L, const SmallPtrSetImpl<BasicBlock *> &UseBBs,
+ const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
+ DominatorTree &DT, BlockFrequencyInfo &BFI) {
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto;
+ if (UseBBs.size() == 0)
+ return BBsToSinkInto;
+
+ BBsToSinkInto.insert(UseBBs.begin(), UseBBs.end());
+ SmallPtrSet<BasicBlock *, 2> BBsDominatedByColdestBB;
+
+ // For every iteration:
+ // * Pick the ColdestBB from ColdLoopBBs
+ // * Find the set BBsDominatedByColdestBB that satisfy:
+ // - BBsDominatedByColdestBB is a subset of BBsToSinkInto
+ // - Every BB in BBsDominatedByColdestBB is dominated by ColdestBB
+ // * If Freq(ColdestBB) < Freq(BBsDominatedByColdestBB), remove
+ // BBsDominatedByColdestBB from BBsToSinkInto, add ColdestBB to
+ // BBsToSinkInto
+ for (BasicBlock *ColdestBB : ColdLoopBBs) {
+ BBsDominatedByColdestBB.clear();
+ for (BasicBlock *SinkedBB : BBsToSinkInto)
+ if (DT.dominates(ColdestBB, SinkedBB))
+ BBsDominatedByColdestBB.insert(SinkedBB);
+ if (BBsDominatedByColdestBB.size() == 0)
+ continue;
+ if (adjustedSumFreq(BBsDominatedByColdestBB, BFI) >
+ BFI.getBlockFreq(ColdestBB)) {
+ for (BasicBlock *DominatedBB : BBsDominatedByColdestBB) {
+ BBsToSinkInto.erase(DominatedBB);
+ }
+ BBsToSinkInto.insert(ColdestBB);
+ }
+ }
+
+ // Can't sink into blocks that have no valid insertion point.
+ for (BasicBlock *BB : BBsToSinkInto) {
+ if (BB->getFirstInsertionPt() == BB->end()) {
+ BBsToSinkInto.clear();
+ break;
+ }
+ }
+
+ // If the total frequency of BBsToSinkInto is larger than preheader frequency,
+ // do not sink.
+ if (adjustedSumFreq(BBsToSinkInto, BFI) >
+ BFI.getBlockFreq(L.getLoopPreheader()))
+ BBsToSinkInto.clear();
+ return BBsToSinkInto;
+}
+
+// Sinks \p I from the loop \p L's preheader to its uses. Returns true if
+// sinking is successful.
+// \p LoopBlockNumber is used to sort the insertion blocks to ensure
+// determinism.
static bool sinkInstruction(
Loop &L, Instruction &I, const SmallVectorImpl<BasicBlock *> &ColdLoopBBs,
const SmallDenseMap<BasicBlock *, int, 16> &LoopBlockNumber, LoopInfo &LI,
DominatorTree &DT, BlockFrequencyInfo &BFI, MemorySSAUpdater *MSSAU) {
- // Compute the set of blocks in loop L which contain a use of I.
- SmallPtrSet<BasicBlock *, 2> BBs;
- for (auto &U : I.uses()) {
- Instruction *UI = cast<Instruction>(U.getUser());
- // We cannot sink I to PHI-uses.
- if (dyn_cast<PHINode>(UI))
- return false;
- // We cannot sink I if it has uses outside of the loop.
- if (!L.contains(LI.getLoopFor(UI->getParent())))
- return false;
- BBs.insert(UI->getParent());
- }
-
- // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
- // BBs.size() to avoid expensive computation.
- // FIXME: Handle code size growth for min_size and opt_size.
- if (BBs.size() > MaxNumberOfUseBBsForSinking)
- return false;
-
- // Find the set of BBs that we should insert a copy of I.
- SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
- findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
- if (BBsToSinkInto.empty())
- return false;
-
- // Return if any of the candidate blocks to sink into is non-cold.
- if (BBsToSinkInto.size() > 1) {
- for (auto *BB : BBsToSinkInto)
- if (!LoopBlockNumber.count(BB))
- return false;
- }
-
- // Copy the final BBs into a vector and sort them using the total ordering
- // of the loop block numbers as iterating the set doesn't give a useful
- // order. No need to stable sort as the block numbers are a total ordering.
- SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
+ // Compute the set of blocks in loop L which contain a use of I.
+ SmallPtrSet<BasicBlock *, 2> BBs;
+ for (auto &U : I.uses()) {
+ Instruction *UI = cast<Instruction>(U.getUser());
+ // We cannot sink I to PHI-uses.
+ if (dyn_cast<PHINode>(UI))
+ return false;
+ // We cannot sink I if it has uses outside of the loop.
+ if (!L.contains(LI.getLoopFor(UI->getParent())))
+ return false;
+ BBs.insert(UI->getParent());
+ }
+
+ // findBBsToSinkInto is O(BBs.size() * ColdLoopBBs.size()). We cap the max
+ // BBs.size() to avoid expensive computation.
+ // FIXME: Handle code size growth for min_size and opt_size.
+ if (BBs.size() > MaxNumberOfUseBBsForSinking)
+ return false;
+
+  // Find the set of BBs into which we should insert a copy of I.
+ SmallPtrSet<BasicBlock *, 2> BBsToSinkInto =
+ findBBsToSinkInto(L, BBs, ColdLoopBBs, DT, BFI);
+ if (BBsToSinkInto.empty())
+ return false;
+
+ // Return if any of the candidate blocks to sink into is non-cold.
+ if (BBsToSinkInto.size() > 1) {
+ for (auto *BB : BBsToSinkInto)
+ if (!LoopBlockNumber.count(BB))
+ return false;
+ }
+
+ // Copy the final BBs into a vector and sort them using the total ordering
+ // of the loop block numbers as iterating the set doesn't give a useful
+ // order. No need to stable sort as the block numbers are a total ordering.
+ SmallVector<BasicBlock *, 2> SortedBBsToSinkInto;
llvm::append_range(SortedBBsToSinkInto, BBsToSinkInto);
- llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
- return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
- });
-
- BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
- // FIXME: Optimize the efficiency for cloned value replacement. The current
- // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
- for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
- assert(LoopBlockNumber.find(N)->second >
- LoopBlockNumber.find(MoveBB)->second &&
- "BBs not sorted!");
- // Clone I and replace its uses.
- Instruction *IC = I.clone();
- IC->setName(I.getName());
- IC->insertBefore(&*N->getFirstInsertionPt());
+ llvm::sort(SortedBBsToSinkInto, [&](BasicBlock *A, BasicBlock *B) {
+ return LoopBlockNumber.find(A)->second < LoopBlockNumber.find(B)->second;
+ });
+
+ BasicBlock *MoveBB = *SortedBBsToSinkInto.begin();
+ // FIXME: Optimize the efficiency for cloned value replacement. The current
+ // implementation is O(SortedBBsToSinkInto.size() * I.num_uses()).
+ for (BasicBlock *N : makeArrayRef(SortedBBsToSinkInto).drop_front(1)) {
+ assert(LoopBlockNumber.find(N)->second >
+ LoopBlockNumber.find(MoveBB)->second &&
+ "BBs not sorted!");
+ // Clone I and replace its uses.
+ Instruction *IC = I.clone();
+ IC->setName(I.getName());
+ IC->insertBefore(&*N->getFirstInsertionPt());
if (MSSAU && MSSAU->getMemorySSA()->getMemoryAccess(&I)) {
// Create a new MemoryAccess and let MemorySSA set its defining access.
@@ -253,51 +253,51 @@ static bool sinkInstruction(
}
}
- // Replaces uses of I with IC in N
- I.replaceUsesWithIf(IC, [N](Use &U) {
- return cast<Instruction>(U.getUser())->getParent() == N;
- });
- // Replaces uses of I with IC in blocks dominated by N
- replaceDominatedUsesWith(&I, IC, DT, N);
- LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
- << '\n');
- NumLoopSunkCloned++;
- }
- LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
- NumLoopSunk++;
- I.moveBefore(&*MoveBB->getFirstInsertionPt());
-
+ // Replaces uses of I with IC in N
+ I.replaceUsesWithIf(IC, [N](Use &U) {
+ return cast<Instruction>(U.getUser())->getParent() == N;
+ });
+ // Replaces uses of I with IC in blocks dominated by N
+ replaceDominatedUsesWith(&I, IC, DT, N);
+ LLVM_DEBUG(dbgs() << "Sinking a clone of " << I << " To: " << N->getName()
+ << '\n');
+ NumLoopSunkCloned++;
+ }
+ LLVM_DEBUG(dbgs() << "Sinking " << I << " To: " << MoveBB->getName() << '\n');
+ NumLoopSunk++;
+ I.moveBefore(&*MoveBB->getFirstInsertionPt());
+
if (MSSAU)
if (MemoryUseOrDef *OldMemAcc = cast_or_null<MemoryUseOrDef>(
MSSAU->getMemorySSA()->getMemoryAccess(&I)))
MSSAU->moveToPlace(OldMemAcc, MoveBB, MemorySSA::Beginning);
- return true;
-}
-
-/// Sinks instructions from the loop's preheader to the loop body if the total
-/// frequency of the inserted copies is smaller than the preheader's frequency.
-static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
- DominatorTree &DT,
- BlockFrequencyInfo &BFI,
+ return true;
+}
+
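To make the scheme above concrete (an illustration inferred from the code, not additional behaviour): if the sorted cold candidate blocks are B1 < B4 < B6 by loop block number, the original instruction I is moved to B1, a clone of I is inserted at the first insertion point of B4 and of B6, uses of I located in or dominated by B4 or B6 are rewired to the corresponding clone, and MemorySSA, when present, is updated for both the clones and the moved instruction.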
+/// Sinks instructions from the loop's preheader to the loop body if the total
+/// frequency of the inserted copies is smaller than the preheader's frequency.
+static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
+ DominatorTree &DT,
+ BlockFrequencyInfo &BFI,
ScalarEvolution *SE,
AliasSetTracker *CurAST,
MemorySSA *MSSA) {
- BasicBlock *Preheader = L.getLoopPreheader();
+ BasicBlock *Preheader = L.getLoopPreheader();
assert(Preheader && "Expected loop to have preheader");
-
+
assert(Preheader->getParent()->hasProfileData() &&
"Unexpected call when profile data unavailable.");
-
- const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
- // If there are no basic blocks with lower frequency than the preheader then
- // we can avoid the detailed analysis as we will never find profitable sinking
- // opportunities.
- if (all_of(L.blocks(), [&](const BasicBlock *BB) {
- return BFI.getBlockFreq(BB) > PreheaderFreq;
- }))
- return false;
-
+
+ const BlockFrequency PreheaderFreq = BFI.getBlockFreq(Preheader);
+ // If there are no basic blocks with lower frequency than the preheader then
+ // we can avoid the detailed analysis as we will never find profitable sinking
+ // opportunities.
+ if (all_of(L.blocks(), [&](const BasicBlock *BB) {
+ return BFI.getBlockFreq(BB) > PreheaderFreq;
+ }))
+ return false;
+
std::unique_ptr<MemorySSAUpdater> MSSAU;
std::unique_ptr<SinkAndHoistLICMFlags> LICMFlags;
if (MSSA) {
@@ -306,42 +306,42 @@ static bool sinkLoopInvariantInstructions(Loop &L, AAResults &AA, LoopInfo &LI,
std::make_unique<SinkAndHoistLICMFlags>(/*IsSink=*/true, &L, MSSA);
}
- bool Changed = false;
-
- // Sort loop's basic blocks by frequency
- SmallVector<BasicBlock *, 10> ColdLoopBBs;
- SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
- int i = 0;
- for (BasicBlock *B : L.blocks())
- if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
- ColdLoopBBs.push_back(B);
- LoopBlockNumber[B] = ++i;
- }
- llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
- return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
- });
-
-  // Traverse preheader's instructions in reverse order because if A depends
-  // on B (A appears after B), A needs to be sunk first before B can be
-  // sunk.
- for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
- Instruction *I = &*II++;
-    // No need to check whether the instruction's operands are loop invariant.
- assert(L.hasLoopInvariantOperands(I) &&
- "Insts in a loop's preheader should have loop invariant operands!");
+ bool Changed = false;
+
+ // Sort loop's basic blocks by frequency
+ SmallVector<BasicBlock *, 10> ColdLoopBBs;
+ SmallDenseMap<BasicBlock *, int, 16> LoopBlockNumber;
+ int i = 0;
+ for (BasicBlock *B : L.blocks())
+ if (BFI.getBlockFreq(B) < BFI.getBlockFreq(L.getLoopPreheader())) {
+ ColdLoopBBs.push_back(B);
+ LoopBlockNumber[B] = ++i;
+ }
+ llvm::stable_sort(ColdLoopBBs, [&](BasicBlock *A, BasicBlock *B) {
+ return BFI.getBlockFreq(A) < BFI.getBlockFreq(B);
+ });
+
+  // Traverse preheader's instructions in reverse order because if A depends
+  // on B (A appears after B), A needs to be sunk first before B can be
+  // sunk.
+ for (auto II = Preheader->rbegin(), E = Preheader->rend(); II != E;) {
+ Instruction *I = &*II++;
+    // No need to check whether the instruction's operands are loop invariant.
+ assert(L.hasLoopInvariantOperands(I) &&
+ "Insts in a loop's preheader should have loop invariant operands!");
if (!canSinkOrHoistInst(*I, &AA, &DT, &L, CurAST, MSSAU.get(), false,
LICMFlags.get()))
- continue;
+ continue;
if (sinkInstruction(L, *I, ColdLoopBBs, LoopBlockNumber, LI, DT, BFI,
MSSAU.get()))
- Changed = true;
- }
-
- if (Changed && SE)
- SE->forgetLoopDispositions(&L);
- return Changed;
-}
-
+ Changed = true;
+ }
+
+ if (Changed && SE)
+ SE->forgetLoopDispositions(&L);
+ return Changed;
+}
+
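The decision that both the early exit above and findBBsToSinkInto implement is a frequency comparison: sinking only pays off when the summed frequency of the blocks that will hold the instruction (or its clones) stays below the preheader's frequency. A minimal standalone sketch of that rule, using plain integers rather than the BlockFrequency values BFI actually provides:

#include <cstdint>
#include <vector>

// Toy model of the LoopSink profitability rule: the copies together must be
// executed less often than the single instruction in the preheader would be.
static bool sinkIsProfitable(uint64_t PreheaderFreq,
                             const std::vector<uint64_t> &SinkBlockFreqs) {
  uint64_t Sum = 0;
  for (uint64_t Freq : SinkBlockFreqs)
    Sum += Freq;
  return Sum < PreheaderFreq;
}

// Example: preheader frequency 100 with copies in blocks of frequencies 30
// and 20 (sum 50) is profitable; frequencies 80 and 40 (sum 120) is not.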
static void computeAliasSet(Loop &L, BasicBlock &Preheader,
AliasSetTracker &CurAST) {
for (BasicBlock *BB : L.blocks())
@@ -349,31 +349,31 @@ static void computeAliasSet(Loop &L, BasicBlock &Preheader,
CurAST.add(Preheader);
}
-PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
- LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
- // Nothing to do if there are no loops.
- if (LI.empty())
- return PreservedAnalyses::all();
-
- AAResults &AA = FAM.getResult<AAManager>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
-
+PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
+ LoopInfo &LI = FAM.getResult<LoopAnalysis>(F);
+ // Nothing to do if there are no loops.
+ if (LI.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = FAM.getResult<AAManager>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ BlockFrequencyInfo &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+
MemorySSA *MSSA = EnableMSSAInLoopSink
? &FAM.getResult<MemorySSAAnalysis>(F).getMSSA()
: nullptr;
- // We want to do a postorder walk over the loops. Since loops are a tree this
- // is equivalent to a reversed preorder walk and preorder is easy to compute
- // without recursion. Since we reverse the preorder, we will visit siblings
- // in reverse program order. This isn't expected to matter at all but is more
- // consistent with sinking algorithms which generally work bottom-up.
- SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
-
- bool Changed = false;
- do {
- Loop &L = *PreorderLoops.pop_back_val();
-
+ // We want to do a postorder walk over the loops. Since loops are a tree this
+ // is equivalent to a reversed preorder walk and preorder is easy to compute
+ // without recursion. Since we reverse the preorder, we will visit siblings
+ // in reverse program order. This isn't expected to matter at all but is more
+ // consistent with sinking algorithms which generally work bottom-up.
+ SmallVector<Loop *, 4> PreorderLoops = LI.getLoopsInPreorder();
+
+ bool Changed = false;
+ do {
+ Loop &L = *PreorderLoops.pop_back_val();
+
BasicBlock *Preheader = L.getLoopPreheader();
if (!Preheader)
continue;
@@ -389,19 +389,19 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
computeAliasSet(L, *Preheader, *CurAST.get());
}
- // Note that we don't pass SCEV here because it is only used to invalidate
- // loops in SCEV and we don't preserve (or request) SCEV at all making that
- // unnecessary.
- Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
+ // Note that we don't pass SCEV here because it is only used to invalidate
+ // loops in SCEV and we don't preserve (or request) SCEV at all making that
+ // unnecessary.
+ Changed |= sinkLoopInvariantInstructions(L, AA, LI, DT, BFI,
/*ScalarEvolution*/ nullptr,
CurAST.get(), MSSA);
- } while (!PreorderLoops.empty());
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
+ } while (!PreorderLoops.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
if (MSSA) {
PA.preserve<MemorySSAAnalysis>();
@@ -410,20 +410,20 @@ PreservedAnalyses LoopSinkPass::run(Function &F, FunctionAnalysisManager &FAM) {
MSSA->verifyMemorySSA();
}
- return PA;
-}
-
-namespace {
-struct LegacyLoopSinkPass : public LoopPass {
- static char ID;
- LegacyLoopSinkPass() : LoopPass(ID) {
- initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
+ return PA;
+}
+
+namespace {
+struct LegacyLoopSinkPass : public LoopPass {
+ static char ID;
+ LegacyLoopSinkPass() : LoopPass(ID) {
+ initializeLegacyLoopSinkPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
BasicBlock *Preheader = L->getLoopPreheader();
if (!Preheader)
return false;
@@ -434,7 +434,7 @@ struct LegacyLoopSinkPass : public LoopPass {
return false;
AAResults &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
std::unique_ptr<AliasSetTracker> CurAST;
MemorySSA *MSSA = nullptr;
if (EnableMSSAInLegacyLoopSink)
@@ -446,34 +446,34 @@ struct LegacyLoopSinkPass : public LoopPass {
bool Changed = sinkLoopInvariantInstructions(
*L, AA, getAnalysis<LoopInfoWrapperPass>().getLoopInfo(),
- getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
+ getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
SE ? &SE->getSE() : nullptr, CurAST.get(), MSSA);
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
return Changed;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ getLoopAnalysisUsage(AU);
if (EnableMSSAInLegacyLoopSink) {
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
- }
-};
-}
-
-char LegacyLoopSinkPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+ }
+};
+}
+
+char LegacyLoopSinkPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
-
-Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
+INITIALIZE_PASS_END(LegacyLoopSinkPass, "loop-sink", "Loop Sink", false, false)
+
+Pass *llvm::createLoopSinkPass() { return new LegacyLoopSinkPass(); }
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index a528f34c15..5dec9b5420 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -1,5631 +1,5631 @@
-//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation analyzes and transforms the induction variables (and
-// computations derived from them) into forms suitable for efficient execution
-// on the target.
-//
-// This pass performs a strength reduction on array references inside loops that
-// have as one or more of their components the loop induction variable, it
-// rewrites expressions to take advantage of scaled-index addressing modes
-// available on the target, and it performs a variety of other optimizations
-// related to loop induction variables.
-//
-// Terminology note: this code has a lot of handling for "post-increment" or
-// "post-inc" users. This is not talking about post-increment addressing modes;
-// it is instead talking about code like this:
-//
-// %i = phi [ 0, %entry ], [ %i.next, %latch ]
-// ...
-// %i.next = add %i, 1
-// %c = icmp eq %i.next, %n
-//
-// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
-// it's useful to think about these as the same register, with some uses using
-// the value of the register before the add and some using it after. In this
-// example, the icmp is a post-increment user, since it uses %i.next, which is
-// the value of the induction variable after the increment. The other common
-// case of post-increment users is users outside the loop.
-//
-// TODO: More sophistication in the way Formulae are generated and filtered.
-//
-// TODO: Handle multiple loops at a time.
-//
-// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
-// of a GlobalValue?
-//
-// TODO: When truncation is free, truncate ICmp users' operands to make it a
-// smaller encoding (on x86 at least).
-//
-// TODO: When a negated register is used by an add (such as in a list of
-// multiple base registers, or as the increment expression in an addrec),
-// we may not actually need both reg and (-1 * reg) in registers; the
-// negation can be implemented by using a sub instead of an add. The
-// lack of support for taking this into consideration when making
-// register pressure decisions is partly worked around by the "Special"
-// use kind.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionNormalization.h"
+//===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into forms suitable for efficient execution
+// on the target.
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable, it
+// rewrites expressions to take advantage of scaled-index addressing modes
+// available on the target, and it performs a variety of other optimizations
+// related to loop induction variables.
+//
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+// %i = phi [ 0, %entry ], [ %i.next, %latch ]
+// ...
+// %i.next = add %i, 1
+// %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated and filtered.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: Should the addressing mode BaseGV be changed to a ConstantExpr instead
+// of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+// smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+// multiple base registers, or as the increment expression in an addrec),
+// we may not actually need both reg and (-1 * reg) in registers; the
+// negation can be implemented by using a sub instead of an add. The
+// lack of support for taking this into consideration when making
+// register pressure decisions is partly worked around by the "Special"
+// use kind.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopStrengthReduce.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <cstdlib>
-#include <iterator>
-#include <limits>
-#include <map>
-#include <numeric>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-reduce"
-
-/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
-/// bail out. This threshold is far beyond the number of users that LSR can
-/// conceivably solve, so it should not affect generated code, but catches the
-/// worst cases before LSR burns too much compile time and stack space.
-static const unsigned MaxIVUsers = 200;
-
-// Temporary flag to clean up congruent phis after LSR phi expansion.
-// It's currently disabled until we can determine whether it's truly useful or
-// not. The flag should be removed after the v3.0 release.
-// This is now needed for ivchains.
-static cl::opt<bool> EnablePhiElim(
- "enable-lsr-phielim", cl::Hidden, cl::init(true),
- cl::desc("Enable LSR phi elimination"));
-
-// The flag adds instruction count to the solution cost comparison.
-static cl::opt<bool> InsnsCost(
- "lsr-insns-cost", cl::Hidden, cl::init(true),
- cl::desc("Add instruction count to a LSR cost model"));
-
-// Flag to choose how to narrow the complex LSR solution.
-static cl::opt<bool> LSRExpNarrow(
- "lsr-exp-narrow", cl::Hidden, cl::init(false),
- cl::desc("Narrow LSR complex solution using"
- " expectation of registers number"));
-
-// Flag to narrow search space by filtering non-optimal formulae with
-// the same ScaledReg and Scale.
-static cl::opt<bool> FilterSameScaledReg(
- "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
- cl::desc("Narrow LSR search space by filtering non-optimal formulae"
- " with the same ScaledReg and Scale"));
-
-static cl::opt<bool> EnableBackedgeIndexing(
- "lsr-backedge-indexing", cl::Hidden, cl::init(true),
- cl::desc("Enable the generation of cross iteration indexed memops"));
-
-static cl::opt<unsigned> ComplexityLimit(
- "lsr-complexity-limit", cl::Hidden,
- cl::init(std::numeric_limits<uint16_t>::max()),
- cl::desc("LSR search space complexity limit"));
-
-static cl::opt<unsigned> SetupCostDepthLimit(
- "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
- cl::desc("The limit on recursion depth for LSRs setup cost"));
-
-#ifndef NDEBUG
-// Stress test IV chain generation.
-static cl::opt<bool> StressIVChain(
- "stress-ivchain", cl::Hidden, cl::init(false),
- cl::desc("Stress test LSR IV chains"));
-#else
-static bool StressIVChain = false;
-#endif
-
-namespace {
-
-struct MemAccessTy {
- /// Used in situations where the accessed memory type is unknown.
- static const unsigned UnknownAddressSpace =
- std::numeric_limits<unsigned>::max();
-
- Type *MemTy = nullptr;
- unsigned AddrSpace = UnknownAddressSpace;
-
- MemAccessTy() = default;
- MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
-
- bool operator==(MemAccessTy Other) const {
- return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
- }
-
- bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
-
- static MemAccessTy getUnknown(LLVMContext &Ctx,
- unsigned AS = UnknownAddressSpace) {
- return MemAccessTy(Type::getVoidTy(Ctx), AS);
- }
-
- Type *getType() { return MemTy; }
-};
-
-/// This class holds data which is used to order reuse candidates.
-class RegSortData {
-public:
- /// This represents the set of LSRUse indices which reference
- /// a particular register.
- SmallBitVector UsedByIndices;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void RegSortData::print(raw_ostream &OS) const {
- OS << "[NumUses=" << UsedByIndices.count() << ']';
-}
-
-LLVM_DUMP_METHOD void RegSortData::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-namespace {
-
-/// Map register candidates to information about how they are used.
-class RegUseTracker {
- using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
-
- RegUsesTy RegUsesMap;
- SmallVector<const SCEV *, 16> RegSequence;
-
-public:
- void countRegister(const SCEV *Reg, size_t LUIdx);
- void dropRegister(const SCEV *Reg, size_t LUIdx);
- void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
-
- bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
-
- const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
-
- void clear();
-
- using iterator = SmallVectorImpl<const SCEV *>::iterator;
- using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
-
- iterator begin() { return RegSequence.begin(); }
- iterator end() { return RegSequence.end(); }
- const_iterator begin() const { return RegSequence.begin(); }
- const_iterator end() const { return RegSequence.end(); }
-};
-
-} // end anonymous namespace
-
-void
-RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
- std::pair<RegUsesTy::iterator, bool> Pair =
- RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
- RegSortData &RSD = Pair.first->second;
- if (Pair.second)
- RegSequence.push_back(Reg);
- RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
- RSD.UsedByIndices.set(LUIdx);
-}
-
-void
-RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
- RegUsesTy::iterator It = RegUsesMap.find(Reg);
- assert(It != RegUsesMap.end());
- RegSortData &RSD = It->second;
- assert(RSD.UsedByIndices.size() > LUIdx);
- RSD.UsedByIndices.reset(LUIdx);
-}
-
-void
-RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
- assert(LUIdx <= LastLUIdx);
-
- // Update RegUses. The data structure is not optimized for this purpose;
- // we must iterate through it and update each of the bit vectors.
- for (auto &Pair : RegUsesMap) {
- SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
- if (LUIdx < UsedByIndices.size())
- UsedByIndices[LUIdx] =
- LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
- UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
- }
-}
-
-bool
-RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
- RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
- if (I == RegUsesMap.end())
- return false;
- const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
- int i = UsedByIndices.find_first();
- if (i == -1) return false;
- if ((size_t)i != LUIdx) return true;
- return UsedByIndices.find_next(i) != -1;
-}
-
-const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
- RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
- assert(I != RegUsesMap.end() && "Unknown register!");
- return I->second.UsedByIndices;
-}
-
-void RegUseTracker::clear() {
- RegUsesMap.clear();
- RegSequence.clear();
-}
-
-namespace {
-
-/// This class holds information that describes a formula for computing a
-/// value that satisfies a use. It may include broken-out immediates and scaled registers.
-struct Formula {
- /// Global base address used for complex addressing.
- GlobalValue *BaseGV = nullptr;
-
- /// Base offset for complex addressing.
- int64_t BaseOffset = 0;
-
- /// Whether any complex addressing has a base register.
- bool HasBaseReg = false;
-
- /// The scale of any complex addressing.
- int64_t Scale = 0;
-
-  /// The list of "base" registers for this use. When this is non-empty, the
- /// canonical representation of a formula is
- /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
- /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
-  /// 3. The reg containing the recurrent expr related to the current loop in the
- /// formula should be put in the ScaledReg.
- /// #1 enforces that the scaled register is always used when at least two
- /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
- /// #2 enforces that 1 * reg is reg.
- /// #3 ensures invariant regs with respect to current loop can be combined
- /// together in LSR codegen.
- /// This invariant can be temporarily broken while building a formula.
- /// However, every formula inserted into the LSRInstance must be in canonical
- /// form.
- SmallVector<const SCEV *, 4> BaseRegs;
-
- /// The 'scaled' register for this use. This should be non-null when Scale is
- /// not zero.
- const SCEV *ScaledReg = nullptr;
-
-  /// An additional constant offset which is added near the use. This requires a
- /// temporary register, but the offset itself can live in an add immediate
- /// field rather than a register.
- int64_t UnfoldedOffset = 0;
-
- Formula() = default;
-
- void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
-
- bool isCanonical(const Loop &L) const;
-
- void canonicalize(const Loop &L);
-
- bool unscale();
-
- bool hasZeroEnd() const;
-
- size_t getNumRegs() const;
- Type *getType() const;
-
- void deleteBaseReg(const SCEV *&S);
-
- bool referencesReg(const SCEV *S) const;
- bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
- const RegUseTracker &RegUses) const;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-/// Recursion helper for initialMatch.
-static void DoInitialMatch(const SCEV *S, Loop *L,
- SmallVectorImpl<const SCEV *> &Good,
- SmallVectorImpl<const SCEV *> &Bad,
- ScalarEvolution &SE) {
- // Collect expressions which properly dominate the loop header.
- if (SE.properlyDominates(S, L->getHeader())) {
- Good.push_back(S);
- return;
- }
-
- // Look at add operands.
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (const SCEV *S : Add->operands())
- DoInitialMatch(S, L, Good, Bad, SE);
- return;
- }
-
- // Look at addrec operands.
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- if (!AR->getStart()->isZero() && AR->isAffine()) {
- DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
- DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
- AR->getStepRecurrence(SE),
- // FIXME: AR->getNoWrapFlags()
- AR->getLoop(), SCEV::FlagAnyWrap),
- L, Good, Bad, SE);
- return;
- }
-
- // Handle a multiplication by -1 (negation) if it didn't fold.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
- if (Mul->getOperand(0)->isAllOnesValue()) {
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <iterator>
+#include <limits>
+#include <map>
+#include <numeric>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-reduce"
+
+/// MaxIVUsers is an arbitrary threshold that provides an early opportunity to
+/// bail out. This threshold is far beyond the number of users that LSR can
+/// conceivably solve, so it should not affect generated code, but catches the
+/// worst cases before LSR burns too much compile time and stack space.
+static const unsigned MaxIVUsers = 200;
+
+// Temporary flag to clean up congruent phis after LSR phi expansion.
+// It's currently disabled until we can determine whether it's truly useful or
+// not. The flag should be removed after the v3.0 release.
+// This is now needed for ivchains.
+static cl::opt<bool> EnablePhiElim(
+ "enable-lsr-phielim", cl::Hidden, cl::init(true),
+ cl::desc("Enable LSR phi elimination"));
+
+// The flag adds instruction count to the solution cost comparison.
+static cl::opt<bool> InsnsCost(
+ "lsr-insns-cost", cl::Hidden, cl::init(true),
+ cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow the complex LSR solution.
+static cl::opt<bool> LSRExpNarrow(
+ "lsr-exp-narrow", cl::Hidden, cl::init(false),
+ cl::desc("Narrow LSR complex solution using"
+ " expectation of registers number"));
+
+// Flag to narrow search space by filtering non-optimal formulae with
+// the same ScaledReg and Scale.
+static cl::opt<bool> FilterSameScaledReg(
+ "lsr-filter-same-scaled-reg", cl::Hidden, cl::init(true),
+ cl::desc("Narrow LSR search space by filtering non-optimal formulae"
+ " with the same ScaledReg and Scale"));
+
+static cl::opt<bool> EnableBackedgeIndexing(
+ "lsr-backedge-indexing", cl::Hidden, cl::init(true),
+ cl::desc("Enable the generation of cross iteration indexed memops"));
+
+static cl::opt<unsigned> ComplexityLimit(
+ "lsr-complexity-limit", cl::Hidden,
+ cl::init(std::numeric_limits<uint16_t>::max()),
+ cl::desc("LSR search space complexity limit"));
+
+static cl::opt<unsigned> SetupCostDepthLimit(
+ "lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
+ cl::desc("The limit on recursion depth for LSRs setup cost"));
+
+#ifndef NDEBUG
+// Stress test IV chain generation.
+static cl::opt<bool> StressIVChain(
+ "stress-ivchain", cl::Hidden, cl::init(false),
+ cl::desc("Stress test LSR IV chains"));
+#else
+static bool StressIVChain = false;
+#endif
+
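Since these are ordinary cl::opt switches, they can be flipped when experimenting with LSR, for example through clang's -mllvm pass-through (something like -mllvm -lsr-complexity-limit=100); the exact invocation is given as an illustration rather than a documented interface.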
+namespace {
+
+struct MemAccessTy {
+ /// Used in situations where the accessed memory type is unknown.
+ static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+ Type *MemTy = nullptr;
+ unsigned AddrSpace = UnknownAddressSpace;
+
+ MemAccessTy() = default;
+ MemAccessTy(Type *Ty, unsigned AS) : MemTy(Ty), AddrSpace(AS) {}
+
+ bool operator==(MemAccessTy Other) const {
+ return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+ }
+
+ bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+ static MemAccessTy getUnknown(LLVMContext &Ctx,
+ unsigned AS = UnknownAddressSpace) {
+ return MemAccessTy(Type::getVoidTy(Ctx), AS);
+ }
+
+ Type *getType() { return MemTy; }
+};
+
+/// This class holds data which is used to order reuse candidates.
+class RegSortData {
+public:
+ /// This represents the set of LSRUse indices which reference
+ /// a particular register.
+ SmallBitVector UsedByIndices;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void RegSortData::print(raw_ostream &OS) const {
+ OS << "[NumUses=" << UsedByIndices.count() << ']';
+}
+
+LLVM_DUMP_METHOD void RegSortData::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+/// Map register candidates to information about how they are used.
+class RegUseTracker {
+ using RegUsesTy = DenseMap<const SCEV *, RegSortData>;
+
+ RegUsesTy RegUsesMap;
+ SmallVector<const SCEV *, 16> RegSequence;
+
+public:
+ void countRegister(const SCEV *Reg, size_t LUIdx);
+ void dropRegister(const SCEV *Reg, size_t LUIdx);
+ void swapAndDropUse(size_t LUIdx, size_t LastLUIdx);
+
+ bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const;
+
+ const SmallBitVector &getUsedByIndices(const SCEV *Reg) const;
+
+ void clear();
+
+ using iterator = SmallVectorImpl<const SCEV *>::iterator;
+ using const_iterator = SmallVectorImpl<const SCEV *>::const_iterator;
+
+ iterator begin() { return RegSequence.begin(); }
+ iterator end() { return RegSequence.end(); }
+ const_iterator begin() const { return RegSequence.begin(); }
+ const_iterator end() const { return RegSequence.end(); }
+};
+
+} // end anonymous namespace
+
+void
+RegUseTracker::countRegister(const SCEV *Reg, size_t LUIdx) {
+ std::pair<RegUsesTy::iterator, bool> Pair =
+ RegUsesMap.insert(std::make_pair(Reg, RegSortData()));
+ RegSortData &RSD = Pair.first->second;
+ if (Pair.second)
+ RegSequence.push_back(Reg);
+ RSD.UsedByIndices.resize(std::max(RSD.UsedByIndices.size(), LUIdx + 1));
+ RSD.UsedByIndices.set(LUIdx);
+}
+
+void
+RegUseTracker::dropRegister(const SCEV *Reg, size_t LUIdx) {
+ RegUsesTy::iterator It = RegUsesMap.find(Reg);
+ assert(It != RegUsesMap.end());
+ RegSortData &RSD = It->second;
+ assert(RSD.UsedByIndices.size() > LUIdx);
+ RSD.UsedByIndices.reset(LUIdx);
+}
+
+void
+RegUseTracker::swapAndDropUse(size_t LUIdx, size_t LastLUIdx) {
+ assert(LUIdx <= LastLUIdx);
+
+ // Update RegUses. The data structure is not optimized for this purpose;
+ // we must iterate through it and update each of the bit vectors.
+ for (auto &Pair : RegUsesMap) {
+ SmallBitVector &UsedByIndices = Pair.second.UsedByIndices;
+ if (LUIdx < UsedByIndices.size())
+ UsedByIndices[LUIdx] =
+ LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
+ UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
+ }
+}
+
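The update above is the usual swap-with-last deletion idiom applied, per register, to a bit vector of use indices. A standalone sketch of the per-register step on std::vector<bool> (illustrative only; the real code uses SmallBitVector):

#include <algorithm>
#include <cstddef>
#include <vector>

// Copy the bit of the last use index into the slot being removed, then shrink
// the vector so the last index disappears.
static void swapAndDropBit(std::vector<bool> &UsedByIndices, size_t LUIdx,
                           size_t LastLUIdx) {
  if (LUIdx < UsedByIndices.size())
    UsedByIndices[LUIdx] =
        LastLUIdx < UsedByIndices.size() ? UsedByIndices[LastLUIdx] : false;
  UsedByIndices.resize(std::min(UsedByIndices.size(), LastLUIdx));
}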
+bool
+RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ if (I == RegUsesMap.end())
+ return false;
+ const SmallBitVector &UsedByIndices = I->second.UsedByIndices;
+ int i = UsedByIndices.find_first();
+ if (i == -1) return false;
+ if ((size_t)i != LUIdx) return true;
+ return UsedByIndices.find_next(i) != -1;
+}
+
+const SmallBitVector &RegUseTracker::getUsedByIndices(const SCEV *Reg) const {
+ RegUsesTy::const_iterator I = RegUsesMap.find(Reg);
+ assert(I != RegUsesMap.end() && "Unknown register!");
+ return I->second.UsedByIndices;
+}
+
+void RegUseTracker::clear() {
+ RegUsesMap.clear();
+ RegSequence.clear();
+}
+
+namespace {
+
+/// This class holds information that describes a formula for computing a
+/// value that satisfies a use. It may include broken-out immediates and scaled registers.
+struct Formula {
+ /// Global base address used for complex addressing.
+ GlobalValue *BaseGV = nullptr;
+
+ /// Base offset for complex addressing.
+ int64_t BaseOffset = 0;
+
+ /// Whether any complex addressing has a base register.
+ bool HasBaseReg = false;
+
+ /// The scale of any complex addressing.
+ int64_t Scale = 0;
+
+  /// The list of "base" registers for this use. When this is non-empty, the
+ /// canonical representation of a formula is
+ /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
+ /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// 3. The reg containing the recurrent expr related to the current loop in the
+ /// formula should be put in the ScaledReg.
+ /// #1 enforces that the scaled register is always used when at least two
+ /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
+ /// #2 enforces that 1 * reg is reg.
+ /// #3 ensures invariant regs with respect to current loop can be combined
+ /// together in LSR codegen.
+ /// This invariant can be temporarily broken while building a formula.
+ /// However, every formula inserted into the LSRInstance must be in canonical
+ /// form.
+ SmallVector<const SCEV *, 4> BaseRegs;
+
+ /// The 'scaled' register for this use. This should be non-null when Scale is
+ /// not zero.
+ const SCEV *ScaledReg = nullptr;
+
+  /// An additional constant offset which is added near the use. This requires a
+ /// temporary register, but the offset itself can live in an add immediate
+ /// field rather than a register.
+ int64_t UnfoldedOffset = 0;
+
+ Formula() = default;
+
+ void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
+
+ bool isCanonical(const Loop &L) const;
+
+ void canonicalize(const Loop &L);
+
+ bool unscale();
+
+ bool hasZeroEnd() const;
+
+ size_t getNumRegs() const;
+ Type *getType() const;
+
+ void deleteBaseReg(const SCEV *&S);
+
+ bool referencesReg(const SCEV *S) const;
+ bool hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// Recursion helper for initialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+ SmallVectorImpl<const SCEV *> &Good,
+ SmallVectorImpl<const SCEV *> &Bad,
+ ScalarEvolution &SE) {
+ // Collect expressions which properly dominate the loop header.
+ if (SE.properlyDominates(S, L->getHeader())) {
+ Good.push_back(S);
+ return;
+ }
+
+ // Look at add operands.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands())
+ DoInitialMatch(S, L, Good, Bad, SE);
+ return;
+ }
+
+ // Look at addrec operands.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ if (!AR->getStart()->isZero() && AR->isAffine()) {
+ DoInitialMatch(AR->getStart(), L, Good, Bad, SE);
+ DoInitialMatch(SE.getAddRecExpr(SE.getConstant(AR->getType(), 0),
+ AR->getStepRecurrence(SE),
+ // FIXME: AR->getNoWrapFlags()
+ AR->getLoop(), SCEV::FlagAnyWrap),
+ L, Good, Bad, SE);
+ return;
+ }
+
+ // Handle a multiplication by -1 (negation) if it didn't fold.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+ if (Mul->getOperand(0)->isAllOnesValue()) {
SmallVector<const SCEV *, 4> Ops(drop_begin(Mul->operands()));
- const SCEV *NewMul = SE.getMulExpr(Ops);
-
- SmallVector<const SCEV *, 4> MyGood;
- SmallVector<const SCEV *, 4> MyBad;
- DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
- const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
- SE.getEffectiveSCEVType(NewMul->getType())));
- for (const SCEV *S : MyGood)
- Good.push_back(SE.getMulExpr(NegOne, S));
- for (const SCEV *S : MyBad)
- Bad.push_back(SE.getMulExpr(NegOne, S));
- return;
- }
-
- // Ok, we can't do anything interesting. Just stuff the whole thing into a
- // register and hope for the best.
- Bad.push_back(S);
-}
-
-/// Incorporate loop-variant parts of S into this Formula, attempting to keep
-/// all loop-invariant and loop-computable values in a single base register.
-void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
- SmallVector<const SCEV *, 4> Good;
- SmallVector<const SCEV *, 4> Bad;
- DoInitialMatch(S, L, Good, Bad, SE);
- if (!Good.empty()) {
- const SCEV *Sum = SE.getAddExpr(Good);
- if (!Sum->isZero())
- BaseRegs.push_back(Sum);
- HasBaseReg = true;
- }
- if (!Bad.empty()) {
- const SCEV *Sum = SE.getAddExpr(Bad);
- if (!Sum->isZero())
- BaseRegs.push_back(Sum);
- HasBaseReg = true;
- }
- canonicalize(*L);
-}
-
-/// Check whether or not this formula satisfies the canonical
-/// representation.
-/// \see Formula::BaseRegs.
-bool Formula::isCanonical(const Loop &L) const {
- if (!ScaledReg)
- return BaseRegs.size() <= 1;
-
- if (Scale != 1)
- return true;
-
- if (Scale == 1 && BaseRegs.empty())
- return false;
-
- const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
- if (SAR && SAR->getLoop() == &L)
- return true;
-
- // If ScaledReg is not a recurrent expr, or it is but its loop is not current
- // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
- // loop, we want to swap the reg in BaseRegs with ScaledReg.
+ const SCEV *NewMul = SE.getMulExpr(Ops);
+
+ SmallVector<const SCEV *, 4> MyGood;
+ SmallVector<const SCEV *, 4> MyBad;
+ DoInitialMatch(NewMul, L, MyGood, MyBad, SE);
+ const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+ SE.getEffectiveSCEVType(NewMul->getType())));
+ for (const SCEV *S : MyGood)
+ Good.push_back(SE.getMulExpr(NegOne, S));
+ for (const SCEV *S : MyBad)
+ Bad.push_back(SE.getMulExpr(NegOne, S));
+ return;
+ }
+
+ // Ok, we can't do anything interesting. Just stuff the whole thing into a
+ // register and hope for the best.
+ Bad.push_back(S);
+}
+
+/// Incorporate loop-variant parts of S into this Formula, attempting to keep
+/// all loop-invariant and loop-computable values in a single base register.
+void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
+ SmallVector<const SCEV *, 4> Good;
+ SmallVector<const SCEV *, 4> Bad;
+ DoInitialMatch(S, L, Good, Bad, SE);
+ if (!Good.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Good);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ if (!Bad.empty()) {
+ const SCEV *Sum = SE.getAddExpr(Bad);
+ if (!Sum->isZero())
+ BaseRegs.push_back(Sum);
+ HasBaseReg = true;
+ }
+ canonicalize(*L);
+}
+
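A concrete reading of the split (illustrative, assuming %base is defined outside the loop): for S = %base + {0,+,4}<%loop>, DoInitialMatch files %base under Good because it properly dominates the loop header, while the zero-start addrec falls through to Bad; initialMatch therefore records two base registers, %base and {0,+,4}<%loop>, and the final canonicalize(*L) call is what moves one of them into the ScaledReg slot.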
+/// Check whether or not this formula satisfies the canonical
+/// representation.
+/// \see Formula::BaseRegs.
+bool Formula::isCanonical(const Loop &L) const {
+ if (!ScaledReg)
+ return BaseRegs.size() <= 1;
+
+ if (Scale != 1)
+ return true;
+
+ if (Scale == 1 && BaseRegs.empty())
+ return false;
+
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (SAR && SAR->getLoop() == &L)
+ return true;
+
+ // If ScaledReg is not a recurrent expr, or it is but its loop is not current
+ // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
+ // loop, we want to swap the reg in BaseRegs with ScaledReg.
auto I = find_if(BaseRegs, [&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
- return I == BaseRegs.end();
-}
-
-/// Helper method to morph a formula into its canonical representation.
-/// \see Formula::BaseRegs.
-/// Every formula having more than one base register must use the ScaledReg
-/// field. Otherwise, we would have to do special cases everywhere in LSR
-/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
-/// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize(const Loop &L) {
- if (isCanonical(L))
- return;
- // So far we did not need this case. This is easy to implement but it is
-  // useless to maintain dead code. Besides, it could hurt compile time.
- assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
-
- // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
- if (!ScaledReg) {
+ return I == BaseRegs.end();
+}
+
+/// Helper method to morph a formula into its canonical representation.
+/// \see Formula::BaseRegs.
+/// Every formula having more than one base register must use the ScaledReg
+/// field. Otherwise, we would have to do special cases everywhere in LSR
+/// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
+/// On the other hand, 1*reg should be canonicalized into reg.
+void Formula::canonicalize(const Loop &L) {
+ if (isCanonical(L))
+ return;
+ // So far we did not need this case. This is easy to implement but it is
+  // useless to maintain dead code. Besides, it could hurt compile time.
+ assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
+ // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
+ if (!ScaledReg) {
ScaledReg = BaseRegs.pop_back_val();
- Scale = 1;
- }
-
- // If ScaledReg is an invariant with respect to L, find the reg from
- // BaseRegs containing the recurrent expr related with Loop L. Swap the
- // reg with ScaledReg.
- const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
- if (!SAR || SAR->getLoop() != &L) {
+ Scale = 1;
+ }
+
+ // If ScaledReg is an invariant with respect to L, find the reg from
+ // BaseRegs containing the recurrent expr related with Loop L. Swap the
+ // reg with ScaledReg.
+ const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+ if (!SAR || SAR->getLoop() != &L) {
auto I = find_if(BaseRegs, [&](const SCEV *S) {
return isa<const SCEVAddRecExpr>(S) &&
(cast<SCEVAddRecExpr>(S)->getLoop() == &L);
});
- if (I != BaseRegs.end())
- std::swap(ScaledReg, *I);
- }
-}
-
-/// Get rid of the scale in the formula.
-/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
-/// \return true if it was possible to get rid of the scale, false otherwise.
-/// \note After this operation the formula may not be in the canonical form.
-bool Formula::unscale() {
- if (Scale != 1)
- return false;
- Scale = 0;
- BaseRegs.push_back(ScaledReg);
- ScaledReg = nullptr;
- return true;
-}
-
-bool Formula::hasZeroEnd() const {
- if (UnfoldedOffset || BaseOffset)
- return false;
- if (BaseRegs.size() != 1 || ScaledReg)
- return false;
- return true;
-}
-
-/// Return the total number of register operands used by this formula. This does
-/// not include register uses implied by non-constant addrec strides.
-size_t Formula::getNumRegs() const {
- return !!ScaledReg + BaseRegs.size();
-}
-
-/// Return the type of this formula, if it has one, or null otherwise. This type
-/// is meaningless except for the bit size.
-Type *Formula::getType() const {
- return !BaseRegs.empty() ? BaseRegs.front()->getType() :
- ScaledReg ? ScaledReg->getType() :
- BaseGV ? BaseGV->getType() :
- nullptr;
-}
-
-/// Delete the given base reg from the BaseRegs list.
-void Formula::deleteBaseReg(const SCEV *&S) {
- if (&S != &BaseRegs.back())
- std::swap(S, BaseRegs.back());
- BaseRegs.pop_back();
-}
-
-/// Test if this formula references the given register.
-bool Formula::referencesReg(const SCEV *S) const {
- return S == ScaledReg || is_contained(BaseRegs, S);
-}
-
-/// Test whether this formula uses registers which are used by uses other than
-/// the use with the given index.
-bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
- const RegUseTracker &RegUses) const {
- if (ScaledReg)
- if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
- return true;
- for (const SCEV *BaseReg : BaseRegs)
- if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
- return true;
- return false;
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Formula::print(raw_ostream &OS) const {
- bool First = true;
- if (BaseGV) {
- if (!First) OS << " + "; else First = false;
- BaseGV->printAsOperand(OS, /*PrintType=*/false);
- }
- if (BaseOffset != 0) {
- if (!First) OS << " + "; else First = false;
- OS << BaseOffset;
- }
- for (const SCEV *BaseReg : BaseRegs) {
- if (!First) OS << " + "; else First = false;
- OS << "reg(" << *BaseReg << ')';
- }
- if (HasBaseReg && BaseRegs.empty()) {
- if (!First) OS << " + "; else First = false;
- OS << "**error: HasBaseReg**";
- } else if (!HasBaseReg && !BaseRegs.empty()) {
- if (!First) OS << " + "; else First = false;
- OS << "**error: !HasBaseReg**";
- }
- if (Scale != 0) {
- if (!First) OS << " + "; else First = false;
- OS << Scale << "*reg(";
- if (ScaledReg)
- OS << *ScaledReg;
- else
- OS << "<unknown>";
- OS << ')';
- }
- if (UnfoldedOffset != 0) {
- if (!First) OS << " + ";
- OS << "imm(" << UnfoldedOffset << ')';
- }
-}
-
-LLVM_DUMP_METHOD void Formula::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Return true if the given addrec can be sign-extended without changing its
-/// value.
-static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
- return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
-}
-
-/// Return true if the given add can be sign-extended without changing its
-/// value.
-static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
- return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
-}
-
-/// Return true if the given mul can be sign-extended without changing its
-/// value.
-static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
- Type *WideTy =
- IntegerType::get(SE.getContext(),
- SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
- return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
-}
-
-/// Return an expression for LHS /s RHS, if it can be determined and if the
-/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
-/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
-/// the multiplication may overflow, which is useful when the result will be
-/// used in a context where the most significant bits are ignored.
-static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
- ScalarEvolution &SE,
- bool IgnoreSignificantBits = false) {
- // Handle the trivial case, which works for any SCEV type.
- if (LHS == RHS)
- return SE.getConstant(LHS->getType(), 1);
-
- // Handle a few RHS special cases.
- const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
- if (RC) {
- const APInt &RA = RC->getAPInt();
- // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
- // some folding.
- if (RA.isAllOnesValue())
- return SE.getMulExpr(LHS, RC);
- // Handle x /s 1 as x.
- if (RA == 1)
- return LHS;
- }
-
- // Check for a division of a constant by a constant.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
- if (!RC)
- return nullptr;
- const APInt &LA = C->getAPInt();
- const APInt &RA = RC->getAPInt();
- if (LA.srem(RA) != 0)
- return nullptr;
- return SE.getConstant(LA.sdiv(RA));
- }
-
- // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
- if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
- const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
- IgnoreSignificantBits);
- if (!Step) return nullptr;
- const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
- IgnoreSignificantBits);
- if (!Start) return nullptr;
- // FlagNW is independent of the start value, step direction, and is
- // preserved with smaller magnitude steps.
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
- }
- return nullptr;
- }
-
- // Distribute the sdiv over add operands, if the add doesn't overflow.
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
- if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
- SmallVector<const SCEV *, 8> Ops;
- for (const SCEV *S : Add->operands()) {
- const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
- if (!Op) return nullptr;
- Ops.push_back(Op);
- }
- return SE.getAddExpr(Ops);
- }
- return nullptr;
- }
-
- // Check for a multiply operand that we can pull RHS out of.
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
- if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
- SmallVector<const SCEV *, 4> Ops;
- bool Found = false;
- for (const SCEV *S : Mul->operands()) {
- if (!Found)
- if (const SCEV *Q = getExactSDiv(S, RHS, SE,
- IgnoreSignificantBits)) {
- S = Q;
- Found = true;
- }
- Ops.push_back(S);
- }
- return Found ? SE.getMulExpr(Ops) : nullptr;
- }
- return nullptr;
- }
-
- // Otherwise we don't know.
- return nullptr;
-}
-
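The constant-by-constant branch is the simplest place to see the "exact quotient or nothing" contract of getExactSDiv. A standalone sketch with plain 64-bit integers (toy code; the real implementation works on APInt/SCEV and also handles the symbolic cases above):

#include <cstdint>
#include <optional>

// Return LHS / RHS only when the remainder is exactly zero; otherwise report
// "no result", mirroring the nullptr returns in getExactSDiv.
static std::optional<int64_t> exactSDiv(int64_t LHS, int64_t RHS) {
  if (RHS == 0 || LHS % RHS != 0)
    return std::nullopt;
  return LHS / RHS;
}

// exactSDiv(12, 4) yields 3; exactSDiv(13, 4) yields no value.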
-/// If S involves the addition of a constant integer value, return that integer
-/// value, and mutate S to point to a new SCEV with that value excluded.
-static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
- if (C->getAPInt().getMinSignedBits() <= 64) {
- S = SE.getConstant(C->getType(), 0);
- return C->getValue()->getSExtValue();
- }
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ if (I != BaseRegs.end())
+ std::swap(ScaledReg, *I);
+ }
+}
+
+/// Get rid of the scale in the formula.
+/// In other words, this method morphs reg1 + 1*reg2 into reg1 + reg2.
+/// \return true if it was possible to get rid of the scale, false otherwise.
+/// \note After this operation the formula may not be in the canonical form.
+bool Formula::unscale() {
+ if (Scale != 1)
+ return false;
+ Scale = 0;
+ BaseRegs.push_back(ScaledReg);
+ ScaledReg = nullptr;
+ return true;
+}
+
+bool Formula::hasZeroEnd() const {
+ if (UnfoldedOffset || BaseOffset)
+ return false;
+ if (BaseRegs.size() != 1 || ScaledReg)
+ return false;
+ return true;
+}
+
+/// Return the total number of register operands used by this formula. This does
+/// not include register uses implied by non-constant addrec strides.
+size_t Formula::getNumRegs() const {
+ return !!ScaledReg + BaseRegs.size();
+}
+
+/// Return the type of this formula, if it has one, or null otherwise. This type
+/// is meaningless except for the bit size.
+Type *Formula::getType() const {
+ return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+ ScaledReg ? ScaledReg->getType() :
+ BaseGV ? BaseGV->getType() :
+ nullptr;
+}
+
+/// Delete the given base reg from the BaseRegs list.
+void Formula::deleteBaseReg(const SCEV *&S) {
+ if (&S != &BaseRegs.back())
+ std::swap(S, BaseRegs.back());
+ BaseRegs.pop_back();
+}
+
+/// Test if this formula references the given register.
+bool Formula::referencesReg(const SCEV *S) const {
+ return S == ScaledReg || is_contained(BaseRegs, S);
+}
+
+/// Test whether this formula uses registers which are used by uses other than
+/// the use with the given index.
+bool Formula::hasRegsUsedByUsesOtherThan(size_t LUIdx,
+ const RegUseTracker &RegUses) const {
+ if (ScaledReg)
+ if (RegUses.isRegUsedByUsesOtherThan(ScaledReg, LUIdx))
+ return true;
+ for (const SCEV *BaseReg : BaseRegs)
+ if (RegUses.isRegUsedByUsesOtherThan(BaseReg, LUIdx))
+ return true;
+ return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Formula::print(raw_ostream &OS) const {
+ bool First = true;
+ if (BaseGV) {
+ if (!First) OS << " + "; else First = false;
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ }
+ if (BaseOffset != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << BaseOffset;
+ }
+ for (const SCEV *BaseReg : BaseRegs) {
+ if (!First) OS << " + "; else First = false;
+ OS << "reg(" << *BaseReg << ')';
+ }
+ if (HasBaseReg && BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: HasBaseReg**";
+ } else if (!HasBaseReg && !BaseRegs.empty()) {
+ if (!First) OS << " + "; else First = false;
+ OS << "**error: !HasBaseReg**";
+ }
+ if (Scale != 0) {
+ if (!First) OS << " + "; else First = false;
+ OS << Scale << "*reg(";
+ if (ScaledReg)
+ OS << *ScaledReg;
+ else
+ OS << "<unknown>";
+ OS << ')';
+ }
+ if (UnfoldedOffset != 0) {
+ if (!First) OS << " + ";
+ OS << "imm(" << UnfoldedOffset << ')';
+ }
+}
+
+LLVM_DUMP_METHOD void Formula::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Return true if the given addrec can be sign-extended without changing its
+/// value.
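+/// (ScalarEvolution folds the sign extension into the recurrence only when it
+/// can prove the recurrence does not wrap in the signed sense, so the isa<>
+/// check below doubles as a no-overflow test.)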
+static bool isAddRecSExtable(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(AR->getType()) + 1);
+ return isa<SCEVAddRecExpr>(SE.getSignExtendExpr(AR, WideTy));
+}
+
+/// Return true if the given add can be sign-extended without changing its
+/// value.
+static bool isAddSExtable(const SCEVAddExpr *A, ScalarEvolution &SE) {
+ Type *WideTy =
+ IntegerType::get(SE.getContext(), SE.getTypeSizeInBits(A->getType()) + 1);
+ return isa<SCEVAddExpr>(SE.getSignExtendExpr(A, WideTy));
+}
+
+/// Return true if the given mul can be sign-extended without changing its
+/// value.
+static bool isMulSExtable(const SCEVMulExpr *M, ScalarEvolution &SE) {
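+  // A product of N W-bit signed values always fits in N*W bits, so if the
+  // sign extension to that width still folds into a mul, no significant bits
+  // can have been lost.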
+ Type *WideTy =
+ IntegerType::get(SE.getContext(),
+ SE.getTypeSizeInBits(M->getType()) * M->getNumOperands());
+ return isa<SCEVMulExpr>(SE.getSignExtendExpr(M, WideTy));
+}
+
+/// Return an expression for LHS /s RHS, if it can be determined and if the
+/// remainder is known to be zero, or null otherwise. If IgnoreSignificantBits
+/// is true, expressions like (X * Y) /s Y are simplified to X, ignoring that
+/// the multiplication may overflow, which is useful when the result will be
+/// used in a context where the most significant bits are ignored.
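+/// For example, assuming overflow can be ruled out (or ignored), {8,+,4} /s 4
+/// yields {2,+,1}, while {8,+,3} /s 4 yields null because the step leaves a
+/// remainder.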
+static const SCEV *getExactSDiv(const SCEV *LHS, const SCEV *RHS,
+ ScalarEvolution &SE,
+ bool IgnoreSignificantBits = false) {
+ // Handle the trivial case, which works for any SCEV type.
+ if (LHS == RHS)
+ return SE.getConstant(LHS->getType(), 1);
+
+ // Handle a few RHS special cases.
+ const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS);
+ if (RC) {
+ const APInt &RA = RC->getAPInt();
+ // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do
+ // some folding.
+ if (RA.isAllOnesValue())
+ return SE.getMulExpr(LHS, RC);
+ // Handle x /s 1 as x.
+ if (RA == 1)
+ return LHS;
+ }
+
+ // Check for a division of a constant by a constant.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) {
+ if (!RC)
+ return nullptr;
+ const APInt &LA = C->getAPInt();
+ const APInt &RA = RC->getAPInt();
+ if (LA.srem(RA) != 0)
+ return nullptr;
+ return SE.getConstant(LA.sdiv(RA));
+ }
+
+ // Distribute the sdiv over addrec operands, if the addrec doesn't overflow.
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) {
+ if ((IgnoreSignificantBits || isAddRecSExtable(AR, SE)) && AR->isAffine()) {
+ const SCEV *Step = getExactSDiv(AR->getStepRecurrence(SE), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Step) return nullptr;
+ const SCEV *Start = getExactSDiv(AR->getStart(), RHS, SE,
+ IgnoreSignificantBits);
+ if (!Start) return nullptr;
+ // FlagNW is independent of the start value, step direction, and is
+ // preserved with smaller magnitude steps.
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ return SE.getAddRecExpr(Start, Step, AR->getLoop(), SCEV::FlagAnyWrap);
+ }
+ return nullptr;
+ }
+
+ // Distribute the sdiv over add operands, if the add doesn't overflow.
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) {
+ if (IgnoreSignificantBits || isAddSExtable(Add, SE)) {
+ SmallVector<const SCEV *, 8> Ops;
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Op = getExactSDiv(S, RHS, SE, IgnoreSignificantBits);
+ if (!Op) return nullptr;
+ Ops.push_back(Op);
+ }
+ return SE.getAddExpr(Ops);
+ }
+ return nullptr;
+ }
+
+ // Check for a multiply operand that we can pull RHS out of.
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) {
+ if (IgnoreSignificantBits || isMulSExtable(Mul, SE)) {
+ SmallVector<const SCEV *, 4> Ops;
+ bool Found = false;
+ for (const SCEV *S : Mul->operands()) {
+ if (!Found)
+ if (const SCEV *Q = getExactSDiv(S, RHS, SE,
+ IgnoreSignificantBits)) {
+ S = Q;
+ Found = true;
+ }
+ Ops.push_back(S);
+ }
+ return Found ? SE.getMulExpr(Ops) : nullptr;
+ }
+ return nullptr;
+ }
+
+ // Otherwise we don't know.
+ return nullptr;
+}
+
+/// If S involves the addition of a constant integer value, return that integer
+/// value, and mutate S to point to a new SCEV with that value excluded.
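+/// For example, given S = (4 + %x), this returns 4 and rewrites S to an
+/// expression equivalent to %x.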
+static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ if (C->getAPInt().getMinSignedBits() <= 64) {
+ S = SE.getConstant(C->getType(), 0);
+ return C->getValue()->getSExtValue();
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(Add->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
- S = SE.getAddExpr(NewOps);
- return Result;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(AR->operands());
- int64_t Result = ExtractImmediate(NewOps.front(), SE);
- if (Result != 0)
- S = SE.getAddRecExpr(NewOps, AR->getLoop(),
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- return Result;
- }
- return 0;
-}
-
-/// If S involves the addition of a GlobalValue address, return that symbol, and
-/// mutate S to point to a new SCEV with that value excluded.
-static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
- S = SE.getConstant(GV->getType(), 0);
- return GV;
- }
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ int64_t Result = ExtractImmediate(NewOps.front(), SE);
+ if (Result != 0)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return 0;
+}
+
+/// If S involves the addition of a GlobalValue address, return that symbol, and
+/// mutate S to point to a new SCEV with that value excluded.
+static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) {
+ S = SE.getConstant(GV->getType(), 0);
+ return GV;
+ }
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(Add->operands());
- GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
- if (Result)
- S = SE.getAddExpr(NewOps);
- return Result;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ GlobalValue *Result = ExtractSymbol(NewOps.back(), SE);
+ if (Result)
+ S = SE.getAddExpr(NewOps);
+ return Result;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
SmallVector<const SCEV *, 8> NewOps(AR->operands());
- GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
- if (Result)
- S = SE.getAddRecExpr(NewOps, AR->getLoop(),
- // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- return Result;
- }
- return nullptr;
-}
-
-/// Returns true if the specified instruction is using the specified value as an
-/// address.
-static bool isAddressUse(const TargetTransformInfo &TTI,
- Instruction *Inst, Value *OperandVal) {
- bool isAddress = isa<LoadInst>(Inst);
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- if (SI->getPointerOperand() == OperandVal)
- isAddress = true;
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- // Addressing modes can also be folded into prefetches and a variety
- // of intrinsics.
- switch (II->getIntrinsicID()) {
- case Intrinsic::memset:
- case Intrinsic::prefetch:
- case Intrinsic::masked_load:
- if (II->getArgOperand(0) == OperandVal)
- isAddress = true;
- break;
- case Intrinsic::masked_store:
- if (II->getArgOperand(1) == OperandVal)
- isAddress = true;
- break;
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- if (II->getArgOperand(0) == OperandVal ||
- II->getArgOperand(1) == OperandVal)
- isAddress = true;
- break;
- default: {
- MemIntrinsicInfo IntrInfo;
- if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
- if (IntrInfo.PtrVal == OperandVal)
- isAddress = true;
- }
- }
- }
- } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
- if (RMW->getPointerOperand() == OperandVal)
- isAddress = true;
- } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- if (CmpX->getPointerOperand() == OperandVal)
- isAddress = true;
- }
- return isAddress;
-}
-
-/// Return the type of the memory being accessed.
-static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
- Instruction *Inst, Value *OperandVal) {
- MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
- if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
- AccessTy.MemTy = SI->getOperand(0)->getType();
- AccessTy.AddrSpace = SI->getPointerAddressSpace();
- } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
- AccessTy.AddrSpace = LI->getPointerAddressSpace();
- } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
- AccessTy.AddrSpace = RMW->getPointerAddressSpace();
- } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
- AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::prefetch:
- case Intrinsic::memset:
- AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
- AccessTy.MemTy = OperandVal->getType();
- break;
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
- AccessTy.MemTy = OperandVal->getType();
- break;
- case Intrinsic::masked_load:
- AccessTy.AddrSpace =
- II->getArgOperand(0)->getType()->getPointerAddressSpace();
- break;
- case Intrinsic::masked_store:
- AccessTy.MemTy = II->getOperand(0)->getType();
- AccessTy.AddrSpace =
- II->getArgOperand(1)->getType()->getPointerAddressSpace();
- break;
- default: {
- MemIntrinsicInfo IntrInfo;
- if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
- AccessTy.AddrSpace
- = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
- }
-
- break;
- }
- }
- }
-
- // All pointers have the same requirements, so canonicalize them to an
- // arbitrary pointer type to minimize variation.
- if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
- AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
- PTy->getAddressSpace());
-
- return AccessTy;
-}
-
-/// Return true if this AddRec is already a phi in its loop.
-static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
- for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
- if (SE.isSCEVable(PN.getType()) &&
- (SE.getEffectiveSCEVType(PN.getType()) ==
- SE.getEffectiveSCEVType(AR->getType())) &&
- SE.getSCEV(&PN) == AR)
- return true;
- }
- return false;
-}
-
-/// Check if expanding this expression is likely to incur significant cost. This
-/// is tricky because SCEV doesn't track which expressions are actually computed
-/// by the current IR.
-///
-/// We currently allow expansion of IV increments that involve adds,
-/// multiplication by constants, and AddRecs from existing phis.
-///
-/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
-/// obvious multiple of the UDivExpr.
-static bool isHighCostExpansion(const SCEV *S,
- SmallPtrSetImpl<const SCEV*> &Processed,
- ScalarEvolution &SE) {
- // Zero/One operand expressions
- switch (S->getSCEVType()) {
- case scUnknown:
- case scConstant:
- return false;
- case scTruncate:
- return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
- Processed, SE);
- case scZeroExtend:
- return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
- Processed, SE);
- case scSignExtend:
- return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
- Processed, SE);
+ GlobalValue *Result = ExtractSymbol(NewOps.front(), SE);
+ if (Result)
+ S = SE.getAddRecExpr(NewOps, AR->getLoop(),
+ // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ return Result;
+ }
+ return nullptr;
+}
+
+/// Returns true if the specified instruction is using the specified value as an
+/// address.
+static bool isAddressUse(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
+ bool isAddress = isa<LoadInst>(Inst);
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ if (SI->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ // Addressing modes can also be folded into prefetches and a variety
+ // of intrinsics.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::memset:
+ case Intrinsic::prefetch:
+ case Intrinsic::masked_load:
+ if (II->getArgOperand(0) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::masked_store:
+ if (II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (II->getArgOperand(0) == OperandVal ||
+ II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo)) {
+ if (IntrInfo.PtrVal == OperandVal)
+ isAddress = true;
+ }
+ }
+ }
+ } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ if (RMW->getPointerOperand() == OperandVal)
+ isAddress = true;
+ } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ if (CmpX->getPointerOperand() == OperandVal)
+ isAddress = true;
+ }
+ return isAddress;
+}
+
+/// Return the type of the memory being accessed.
+static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
+ Instruction *Inst, Value *OperandVal) {
+ MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+ if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ AccessTy.MemTy = SI->getOperand(0)->getType();
+ AccessTy.AddrSpace = SI->getPointerAddressSpace();
+ } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ AccessTy.AddrSpace = LI->getPointerAddressSpace();
+ } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+ AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+ AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::prefetch:
+ case Intrinsic::memset:
+ AccessTy.AddrSpace = II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
+ AccessTy.MemTy = OperandVal->getType();
+ break;
+ case Intrinsic::masked_load:
+ AccessTy.AddrSpace =
+ II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ break;
+ case Intrinsic::masked_store:
+ AccessTy.MemTy = II->getOperand(0)->getType();
+ AccessTy.AddrSpace =
+ II->getArgOperand(1)->getType()->getPointerAddressSpace();
+ break;
+ default: {
+ MemIntrinsicInfo IntrInfo;
+ if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
+ AccessTy.AddrSpace
+ = IntrInfo.PtrVal->getType()->getPointerAddressSpace();
+ }
+
+ break;
+ }
+ }
+ }
+
+ // All pointers have the same requirements, so canonicalize them to an
+ // arbitrary pointer type to minimize variation.
+ if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+ AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+ PTy->getAddressSpace());
+
+ return AccessTy;
+}
+
+/// Return true if this AddRec is already a phi in its loop.
+static bool isExistingPhi(const SCEVAddRecExpr *AR, ScalarEvolution &SE) {
+ for (PHINode &PN : AR->getLoop()->getHeader()->phis()) {
+ if (SE.isSCEVable(PN.getType()) &&
+ (SE.getEffectiveSCEVType(PN.getType()) ==
+ SE.getEffectiveSCEVType(AR->getType())) &&
+ SE.getSCEV(&PN) == AR)
+ return true;
+ }
+ return false;
+}
+
+/// Check if expanding this expression is likely to incur significant cost. This
+/// is tricky because SCEV doesn't track which expressions are actually computed
+/// by the current IR.
+///
+/// We currently allow expansion of IV increments that involve adds,
+/// multiplication by constants, and AddRecs from existing phis.
+///
+/// TODO: Allow UDivExpr if we can find an existing IV increment that is an
+/// obvious multiple of the UDivExpr.
+static bool isHighCostExpansion(const SCEV *S,
+ SmallPtrSetImpl<const SCEV*> &Processed,
+ ScalarEvolution &SE) {
+ // Zero/One operand expressions
+ switch (S->getSCEVType()) {
+ case scUnknown:
+ case scConstant:
+ return false;
+ case scTruncate:
+ return isHighCostExpansion(cast<SCEVTruncateExpr>(S)->getOperand(),
+ Processed, SE);
+ case scZeroExtend:
+ return isHighCostExpansion(cast<SCEVZeroExtendExpr>(S)->getOperand(),
+ Processed, SE);
+ case scSignExtend:
+ return isHighCostExpansion(cast<SCEVSignExtendExpr>(S)->getOperand(),
+ Processed, SE);
default:
break;
- }
-
- if (!Processed.insert(S).second)
- return false;
-
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- for (const SCEV *S : Add->operands()) {
- if (isHighCostExpansion(S, Processed, SE))
- return true;
- }
- return false;
- }
-
- if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
- if (Mul->getNumOperands() == 2) {
- // Multiplication by a constant is ok
- if (isa<SCEVConstant>(Mul->getOperand(0)))
- return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
-
- // If we have the value of one operand, check if an existing
- // multiplication already generates this expression.
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
- Value *UVal = U->getValue();
- for (User *UR : UVal->users()) {
- // If U is a constant, it may be used by a ConstantExpr.
- Instruction *UI = dyn_cast<Instruction>(UR);
- if (UI && UI->getOpcode() == Instruction::Mul &&
- SE.isSCEVable(UI->getType())) {
- return SE.getSCEV(UI) == Mul;
- }
- }
- }
- }
- }
-
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- if (isExistingPhi(AR, SE))
- return false;
- }
-
-  // For now, consider any other type of expression (div/mul/min/max) high cost.
- return true;
-}
-
-namespace {
-
-class LSRUse;
-
-} // end anonymous namespace
-
-/// Check if the addressing mode defined by \p F is completely
-/// folded in \p LU at isel time.
-/// This includes address-mode folding and special icmp tricks.
-/// This function returns true if \p LU can accommodate what \p F
-/// defines and up to 1 base + 1 scaled + offset.
-/// In other words, if \p F has several base registers, this function may
-/// still return true. Therefore, users still need to account for
-/// additional base registers and/or unfolded offsets to derive an
-/// accurate cost model.
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F);
-
-// Get the cost of the scaling factor used in F for LU.
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F,
- const Loop &L);
-
-namespace {
-
-/// This class is used to measure and compare candidate formulae.
-class Cost {
- const Loop *L = nullptr;
- ScalarEvolution *SE = nullptr;
- const TargetTransformInfo *TTI = nullptr;
- TargetTransformInfo::LSRCost C;
-
-public:
- Cost() = delete;
- Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
- L(L), SE(&SE), TTI(&TTI) {
- C.Insns = 0;
- C.NumRegs = 0;
- C.AddRecCost = 0;
- C.NumIVMuls = 0;
- C.NumBaseAdds = 0;
- C.ImmCost = 0;
- C.SetupCost = 0;
- C.ScaleCost = 0;
- }
-
- bool isLess(Cost &Other);
-
- void Lose();
-
-#ifndef NDEBUG
- // Once any of the metrics loses, they must all remain losers.
- bool isValid() {
- return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
- | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
- || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
- & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
- }
-#endif
-
- bool isLoser() {
- assert(isValid() && "invalid cost");
- return C.NumRegs == ~0u;
- }
-
- void RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
- SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
-
- void print(raw_ostream &OS) const;
- void dump() const;
-
-private:
- void RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs);
- void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- SmallPtrSetImpl<const SCEV *> *LoserRegs);
-};
-
-/// An operand value in an instruction which is to be replaced with some
-/// equivalent, possibly strength-reduced, replacement.
-struct LSRFixup {
- /// The instruction which will be updated.
- Instruction *UserInst = nullptr;
-
- /// The operand of the instruction which will be replaced. The operand may be
- /// used more than once; every instance will be replaced.
- Value *OperandValToReplace = nullptr;
-
- /// If this user is to use the post-incremented value of an induction
- /// variable, this set is non-empty and holds the loops associated with the
- /// induction variable.
- PostIncLoopSet PostIncLoops;
-
- /// A constant offset to be added to the LSRUse expression. This allows
- /// multiple fixups to share the same LSRUse with different offsets, for
- /// example in an unrolled loop.
- int64_t Offset = 0;
-
- LSRFixup() = default;
-
- bool isUseFullyOutsideLoop(const Loop *L) const;
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
-/// SmallVectors of const SCEV*.
-struct UniquifierDenseMapInfo {
- static SmallVector<const SCEV *, 4> getEmptyKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-1));
- return V;
- }
-
- static SmallVector<const SCEV *, 4> getTombstoneKey() {
- SmallVector<const SCEV *, 4> V;
- V.push_back(reinterpret_cast<const SCEV *>(-2));
- return V;
- }
-
- static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
- const SmallVector<const SCEV *, 4> &RHS) {
- return LHS == RHS;
- }
-};
-
-/// This class holds the state that LSR keeps for each use in IVUsers, as well
-/// as uses invented by LSR itself. It includes information about what kinds of
-/// things can be folded into the user, information about the user itself, and
-/// information about how the use may be satisfied. TODO: Represent multiple
-/// users of the same expression in common?
-class LSRUse {
- DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
-
-public:
- /// An enum for a kind of use, indicating what types of scaled and immediate
- /// operands it might support.
- enum KindType {
- Basic, ///< A normal use, with no folding.
- Special, ///< A special case of basic, allowing -1 scales.
- Address, ///< An address use; folding according to TargetLowering
- ICmpZero ///< An equality icmp with both operands folded into one.
- // TODO: Add a generic icmp too?
- };
-
- using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
-
- KindType Kind;
- MemAccessTy AccessTy;
-
- /// The list of operands which are to be replaced.
- SmallVector<LSRFixup, 8> Fixups;
-
- /// Keep track of the min and max offsets of the fixups.
- int64_t MinOffset = std::numeric_limits<int64_t>::max();
- int64_t MaxOffset = std::numeric_limits<int64_t>::min();
-
- /// This records whether all of the fixups using this LSRUse are outside of
- /// the loop, in which case some special-case heuristics may be used.
- bool AllFixupsOutsideLoop = true;
-
- /// RigidFormula is set to true to guarantee that this use will be associated
- /// with a single formula--the one that initially matched. Some SCEV
- /// expressions cannot be expanded. This allows LSR to consider the registers
- /// used by those expressions without the need to expand them later after
- /// changing the formula.
- bool RigidFormula = false;
-
- /// This records the widest use type for any fixup using this
- /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
- /// fixup widths to be equivalent, because the narrower one may be relying on
- /// the implicit truncation to truncate away bogus bits.
- Type *WidestFixupType = nullptr;
-
- /// A list of ways to build a value that can satisfy this user. After the
- /// list is populated, one of these is selected heuristically and used to
- /// formulate a replacement for OperandValToReplace in UserInst.
- SmallVector<Formula, 12> Formulae;
-
- /// The set of register candidates used by all formulae in this LSRUse.
- SmallPtrSet<const SCEV *, 4> Regs;
-
- LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
-
- LSRFixup &getNewFixup() {
- Fixups.push_back(LSRFixup());
- return Fixups.back();
- }
-
- void pushFixup(LSRFixup &f) {
- Fixups.push_back(f);
- if (f.Offset > MaxOffset)
- MaxOffset = f.Offset;
- if (f.Offset < MinOffset)
- MinOffset = f.Offset;
- }
-
- bool HasFormulaWithSameRegs(const Formula &F) const;
- float getNotSelectedProbability(const SCEV *Reg) const;
- bool InsertFormula(const Formula &F, const Loop &L);
- void DeleteFormula(Formula &F);
- void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale,
- Instruction *Fixup = nullptr);
-
-static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
- if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
- return 1;
- if (Depth == 0)
- return 0;
- if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
- return getSetupCost(S->getStart(), Depth - 1);
+ }
+
+ if (!Processed.insert(S).second)
+ return false;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ for (const SCEV *S : Add->operands()) {
+ if (isHighCostExpansion(S, Processed, SE))
+ return true;
+ }
+ return false;
+ }
+
+ if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ if (Mul->getNumOperands() == 2) {
+ // Multiplication by a constant is ok
+ if (isa<SCEVConstant>(Mul->getOperand(0)))
+ return isHighCostExpansion(Mul->getOperand(1), Processed, SE);
+
+ // If we have the value of one operand, check if an existing
+ // multiplication already generates this expression.
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(Mul->getOperand(1))) {
+ Value *UVal = U->getValue();
+ for (User *UR : UVal->users()) {
+ // If U is a constant, it may be used by a ConstantExpr.
+ Instruction *UI = dyn_cast<Instruction>(UR);
+ if (UI && UI->getOpcode() == Instruction::Mul &&
+ SE.isSCEVable(UI->getType())) {
+ return SE.getSCEV(UI) == Mul;
+ }
+ }
+ }
+ }
+ }
+
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (isExistingPhi(AR, SE))
+ return false;
+ }
+
+  // For now, consider any other type of expression (div/mul/min/max) high cost.
+ return true;
+}
+
+namespace {
+
+class LSRUse;
+
+} // end anonymous namespace
+
+/// Check if the addressing mode defined by \p F is completely
+/// folded in \p LU at isel time.
+/// This includes address-mode folding and special icmp tricks.
+/// This function returns true if \p LU can accommodate what \p F
+/// defines and up to 1 base + 1 scaled + offset.
+/// In other words, if \p F has several base registers, this function may
+/// still return true. Therefore, users still need to account for
+/// additional base registers and/or unfolded offsets to derive an
+/// accurate cost model.
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F);
+
+// Get the cost of the scaling factor used in F for LU.
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L);
+
+namespace {
+
+/// This class is used to measure and compare candidate formulae.
+class Cost {
+ const Loop *L = nullptr;
+ ScalarEvolution *SE = nullptr;
+ const TargetTransformInfo *TTI = nullptr;
+ TargetTransformInfo::LSRCost C;
+
+public:
+ Cost() = delete;
+ Cost(const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI) :
+ L(L), SE(&SE), TTI(&TTI) {
+ C.Insns = 0;
+ C.NumRegs = 0;
+ C.AddRecCost = 0;
+ C.NumIVMuls = 0;
+ C.NumBaseAdds = 0;
+ C.ImmCost = 0;
+ C.SetupCost = 0;
+ C.ScaleCost = 0;
+ }
+
+ bool isLess(Cost &Other);
+
+ void Lose();
+
+#ifndef NDEBUG
+ // Once any of the metrics loses, they must all remain losers.
+ bool isValid() {
+ return ((C.Insns | C.NumRegs | C.AddRecCost | C.NumIVMuls | C.NumBaseAdds
+ | C.ImmCost | C.SetupCost | C.ScaleCost) != ~0u)
+ || ((C.Insns & C.NumRegs & C.AddRecCost & C.NumIVMuls & C.NumBaseAdds
+ & C.ImmCost & C.SetupCost & C.ScaleCost) == ~0u);
+ }
+#endif
+
+ bool isLoser() {
+ assert(isValid() && "invalid cost");
+ return C.NumRegs == ~0u;
+ }
+
+ void RateFormula(const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs = nullptr);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+private:
+ void RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs);
+ void RatePrimaryRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs);
+};
+
+/// An operand value in an instruction which is to be replaced with some
+/// equivalent, possibly strength-reduced, replacement.
+struct LSRFixup {
+ /// The instruction which will be updated.
+ Instruction *UserInst = nullptr;
+
+ /// The operand of the instruction which will be replaced. The operand may be
+ /// used more than once; every instance will be replaced.
+ Value *OperandValToReplace = nullptr;
+
+ /// If this user is to use the post-incremented value of an induction
+ /// variable, this set is non-empty and holds the loops associated with the
+ /// induction variable.
+ PostIncLoopSet PostIncLoops;
+
+ /// A constant offset to be added to the LSRUse expression. This allows
+ /// multiple fixups to share the same LSRUse with different offsets, for
+ /// example in an unrolled loop.
+ int64_t Offset = 0;
+
+ LSRFixup() = default;
+
+ bool isUseFullyOutsideLoop(const Loop *L) const;
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+/// A DenseMapInfo implementation for holding DenseMaps and DenseSets of sorted
+/// SmallVectors of const SCEV*.
+struct UniquifierDenseMapInfo {
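+  // The sentinel keys below are bit patterns that can never be real SCEV
+  // pointers, which is all DenseMap requires of empty and tombstone keys.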
+ static SmallVector<const SCEV *, 4> getEmptyKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-1));
+ return V;
+ }
+
+ static SmallVector<const SCEV *, 4> getTombstoneKey() {
+ SmallVector<const SCEV *, 4> V;
+ V.push_back(reinterpret_cast<const SCEV *>(-2));
+ return V;
+ }
+
+ static unsigned getHashValue(const SmallVector<const SCEV *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<const SCEV *, 4> &LHS,
+ const SmallVector<const SCEV *, 4> &RHS) {
+ return LHS == RHS;
+ }
+};
+
+/// This class holds the state that LSR keeps for each use in IVUsers, as well
+/// as uses invented by LSR itself. It includes information about what kinds of
+/// things can be folded into the user, information about the user itself, and
+/// information about how the use may be satisfied. TODO: Represent multiple
+/// users of the same expression in common?
+class LSRUse {
+ DenseSet<SmallVector<const SCEV *, 4>, UniquifierDenseMapInfo> Uniquifier;
+
+public:
+ /// An enum for a kind of use, indicating what types of scaled and immediate
+ /// operands it might support.
+ enum KindType {
+ Basic, ///< A normal use, with no folding.
+ Special, ///< A special case of basic, allowing -1 scales.
+ Address, ///< An address use; folding according to TargetLowering
+ ICmpZero ///< An equality icmp with both operands folded into one.
+ // TODO: Add a generic icmp too?
+ };
+
+ using SCEVUseKindPair = PointerIntPair<const SCEV *, 2, KindType>;
+
+ KindType Kind;
+ MemAccessTy AccessTy;
+
+ /// The list of operands which are to be replaced.
+ SmallVector<LSRFixup, 8> Fixups;
+
+ /// Keep track of the min and max offsets of the fixups.
+ int64_t MinOffset = std::numeric_limits<int64_t>::max();
+ int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+
+ /// This records whether all of the fixups using this LSRUse are outside of
+ /// the loop, in which case some special-case heuristics may be used.
+ bool AllFixupsOutsideLoop = true;
+
+ /// RigidFormula is set to true to guarantee that this use will be associated
+ /// with a single formula--the one that initially matched. Some SCEV
+ /// expressions cannot be expanded. This allows LSR to consider the registers
+ /// used by those expressions without the need to expand them later after
+ /// changing the formula.
+ bool RigidFormula = false;
+
+ /// This records the widest use type for any fixup using this
+ /// LSRUse. FindUseWithSimilarFormula can't consider uses with different max
+ /// fixup widths to be equivalent, because the narrower one may be relying on
+ /// the implicit truncation to truncate away bogus bits.
+ Type *WidestFixupType = nullptr;
+
+ /// A list of ways to build a value that can satisfy this user. After the
+ /// list is populated, one of these is selected heuristically and used to
+ /// formulate a replacement for OperandValToReplace in UserInst.
+ SmallVector<Formula, 12> Formulae;
+
+ /// The set of register candidates used by all formulae in this LSRUse.
+ SmallPtrSet<const SCEV *, 4> Regs;
+
+ LSRUse(KindType K, MemAccessTy AT) : Kind(K), AccessTy(AT) {}
+
+ LSRFixup &getNewFixup() {
+ Fixups.push_back(LSRFixup());
+ return Fixups.back();
+ }
+
+ void pushFixup(LSRFixup &f) {
+ Fixups.push_back(f);
+ if (f.Offset > MaxOffset)
+ MaxOffset = f.Offset;
+ if (f.Offset < MinOffset)
+ MinOffset = f.Offset;
+ }
+
+ bool HasFormulaWithSameRegs(const Formula &F) const;
+ float getNotSelectedProbability(const SCEV *Reg) const;
+ bool InsertFormula(const Formula &F, const Loop &L);
+ void DeleteFormula(Formula &F);
+ void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup = nullptr);
+
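+/// Roughly estimate how many instructions it would take to materialize Reg
+/// outside the loop: leaf SCEVs (constants and unknowns) count as one, and
+/// compound nodes recurse into their operands, bounded by the given Depth.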
+static unsigned getSetupCost(const SCEV *Reg, unsigned Depth) {
+ if (isa<SCEVUnknown>(Reg) || isa<SCEVConstant>(Reg))
+ return 1;
+ if (Depth == 0)
+ return 0;
+ if (const auto *S = dyn_cast<SCEVAddRecExpr>(Reg))
+ return getSetupCost(S->getStart(), Depth - 1);
if (auto S = dyn_cast<SCEVIntegralCastExpr>(Reg))
- return getSetupCost(S->getOperand(), Depth - 1);
- if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
- return std::accumulate(S->op_begin(), S->op_end(), 0,
- [&](unsigned i, const SCEV *Reg) {
- return i + getSetupCost(Reg, Depth - 1);
- });
- if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
- return getSetupCost(S->getLHS(), Depth - 1) +
- getSetupCost(S->getRHS(), Depth - 1);
- return 0;
-}
-
-/// Tally up interesting quantities from the given register.
-void Cost::RateRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs) {
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
- // If this is an addrec for another loop, it should be an invariant
- // with respect to L since L is the innermost loop (at least
- // for now LSR only handles innermost loops).
- if (AR->getLoop() != L) {
-      // If the AddRec exists, consider its register free and leave it alone.
- if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
- return;
-
-      // It is bad to allow LSR for the current loop to add induction variables
-      // for its sibling loops.
- if (!AR->getLoop()->contains(L)) {
- Lose();
- return;
- }
-
- // Otherwise, it will be an invariant with respect to Loop L.
- ++C.NumRegs;
- return;
- }
-
- unsigned LoopCost = 1;
- if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
- TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
-
- // If the step size matches the base offset, we could use pre-indexed
- // addressing.
- if (TTI->shouldFavorBackedgeIndex(L)) {
- if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
- if (Step->getAPInt() == F.BaseOffset)
- LoopCost = 0;
- }
-
- if (TTI->shouldFavorPostInc()) {
- const SCEV *LoopStep = AR->getStepRecurrence(*SE);
- if (isa<SCEVConstant>(LoopStep)) {
- const SCEV *LoopStart = AR->getStart();
- if (!isa<SCEVConstant>(LoopStart) &&
- SE->isLoopInvariant(LoopStart, L))
- LoopCost = 0;
- }
- }
- }
- C.AddRecCost += LoopCost;
-
- // Add the step value register, if it needs one.
- // TODO: The non-affine case isn't precisely modeled here.
- if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
- if (!Regs.count(AR->getOperand(1))) {
- RateRegister(F, AR->getOperand(1), Regs);
- if (isLoser())
- return;
- }
- }
- }
- ++C.NumRegs;
-
- // Rough heuristic; favor registers which don't require extra setup
- // instructions in the preheader.
- C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
-  // Ensure we don't, even with the recursion limit, produce invalid costs.
- C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
-
- C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
- SE->hasComputableLoopEvolution(Reg, L);
-}
-
-/// Record this register in the set. If we haven't seen it before, rate
-/// it. Optional LoserRegs provides a way to declare any formula that refers to
-/// one of those regs an instant loser.
-void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
- SmallPtrSetImpl<const SCEV *> &Regs,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- if (LoserRegs && LoserRegs->count(Reg)) {
- Lose();
- return;
- }
- if (Regs.insert(Reg).second) {
- RateRegister(F, Reg, Regs);
- if (LoserRegs && isLoser())
- LoserRegs->insert(Reg);
- }
-}
-
-void Cost::RateFormula(const Formula &F,
- SmallPtrSetImpl<const SCEV *> &Regs,
- const DenseSet<const SCEV *> &VisitedRegs,
- const LSRUse &LU,
- SmallPtrSetImpl<const SCEV *> *LoserRegs) {
- assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
- // Tally up the registers.
- unsigned PrevAddRecCost = C.AddRecCost;
- unsigned PrevNumRegs = C.NumRegs;
- unsigned PrevNumBaseAdds = C.NumBaseAdds;
- if (const SCEV *ScaledReg = F.ScaledReg) {
- if (VisitedRegs.count(ScaledReg)) {
- Lose();
- return;
- }
- RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
- if (isLoser())
- return;
- }
- for (const SCEV *BaseReg : F.BaseRegs) {
- if (VisitedRegs.count(BaseReg)) {
- Lose();
- return;
- }
- RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
- if (isLoser())
- return;
- }
-
- // Determine how many (unfolded) adds we'll need inside the loop.
- size_t NumBaseParts = F.getNumRegs();
- if (NumBaseParts > 1)
-    // Do not count the base and a possible second register if the target
-    // allows folding two registers.
- C.NumBaseAdds +=
- NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
- C.NumBaseAdds += (F.UnfoldedOffset != 0);
-
- // Accumulate non-free scaling amounts.
- C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
-
- // Tally up the non-zero immediates.
- for (const LSRFixup &Fixup : LU.Fixups) {
- int64_t O = Fixup.Offset;
- int64_t Offset = (uint64_t)O + F.BaseOffset;
- if (F.BaseGV)
- C.ImmCost += 64; // Handle symbolic values conservatively.
- // TODO: This should probably be the pointer size.
- else if (Offset != 0)
- C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
-
- // Check with target if this offset with this instruction is
- // specifically not supported.
- if (LU.Kind == LSRUse::Address && Offset != 0 &&
- !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
- Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
- C.NumBaseAdds++;
- }
-
- // If we don't count instruction cost exit here.
- if (!InsnsCost) {
- assert(isValid() && "invalid cost");
- return;
- }
-
-  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
-  // an additional instruction (at least a fill).
-  // TODO: Need to distinguish register classes?
- unsigned TTIRegNum = TTI->getNumberOfRegisters(
- TTI->getRegisterClassForType(false, F.getType())) - 1;
- if (C.NumRegs > TTIRegNum) {
-    // The cost already exceeded TTIRegNum, so only the newly added registers
-    // can add new instructions.
- if (PrevNumRegs > TTIRegNum)
- C.Insns += (C.NumRegs - PrevNumRegs);
- else
- C.Insns += (C.NumRegs - TTIRegNum);
- }
-
-  // If an ICmpZero formula does not end in 0, it cannot be replaced by just an
-  // add or sub; we'll need to compare the final result of the AddRec.
- // That means we'll need an additional instruction. But if the target can
- // macro-fuse a compare with a branch, don't count this extra instruction.
- // For -10 + {0, +, 1}:
- // i = i + 1;
- // cmp i, 10
- //
- // For {-10, +, 1}:
- // i = i + 1;
- if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
- !TTI->canMacroFuseCmp())
- C.Insns++;
- // Each new AddRec adds 1 instruction to calculation.
- C.Insns += (C.AddRecCost - PrevAddRecCost);
-
- // BaseAdds adds instructions for unfolded registers.
- if (LU.Kind != LSRUse::ICmpZero)
- C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
- assert(isValid() && "invalid cost");
-}
-
-/// Set this cost to a losing value.
-void Cost::Lose() {
- C.Insns = std::numeric_limits<unsigned>::max();
- C.NumRegs = std::numeric_limits<unsigned>::max();
- C.AddRecCost = std::numeric_limits<unsigned>::max();
- C.NumIVMuls = std::numeric_limits<unsigned>::max();
- C.NumBaseAdds = std::numeric_limits<unsigned>::max();
- C.ImmCost = std::numeric_limits<unsigned>::max();
- C.SetupCost = std::numeric_limits<unsigned>::max();
- C.ScaleCost = std::numeric_limits<unsigned>::max();
-}
-
-/// Choose the lower cost.
-bool Cost::isLess(Cost &Other) {
- if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
- C.Insns != Other.C.Insns)
- return C.Insns < Other.C.Insns;
- return TTI->isLSRCostLess(C, Other.C);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void Cost::print(raw_ostream &OS) const {
- if (InsnsCost)
- OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
- OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
- if (C.AddRecCost != 0)
- OS << ", with addrec cost " << C.AddRecCost;
- if (C.NumIVMuls != 0)
- OS << ", plus " << C.NumIVMuls << " IV mul"
- << (C.NumIVMuls == 1 ? "" : "s");
- if (C.NumBaseAdds != 0)
- OS << ", plus " << C.NumBaseAdds << " base add"
- << (C.NumBaseAdds == 1 ? "" : "s");
- if (C.ScaleCost != 0)
- OS << ", plus " << C.ScaleCost << " scale cost";
- if (C.ImmCost != 0)
- OS << ", plus " << C.ImmCost << " imm cost";
- if (C.SetupCost != 0)
- OS << ", plus " << C.SetupCost << " setup cost";
-}
-
-LLVM_DUMP_METHOD void Cost::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Test whether this fixup always uses its value outside of the given loop.
-bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
- // PHI nodes use their value in their incoming blocks.
- if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == OperandValToReplace &&
- L->contains(PN->getIncomingBlock(i)))
- return false;
- return true;
- }
-
- return !L->contains(UserInst);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRFixup::print(raw_ostream &OS) const {
- OS << "UserInst=";
- // Store is common and interesting enough to be worth special-casing.
- if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
- OS << "store ";
- Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
- } else if (UserInst->getType()->isVoidTy())
- OS << UserInst->getOpcodeName();
- else
- UserInst->printAsOperand(OS, /*PrintType=*/false);
-
- OS << ", OperandValToReplace=";
- OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
-
- for (const Loop *PIL : PostIncLoops) {
- OS << ", PostIncLoop=";
- PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
- }
-
- if (Offset != 0)
- OS << ", Offset=" << Offset;
-}
-
-LLVM_DUMP_METHOD void LSRFixup::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Test whether this use has a formula with the same registers as the given
-/// formula.
-bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
- SmallVector<const SCEV *, 4> Key = F.BaseRegs;
- if (F.ScaledReg) Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for uniquifying.
- llvm::sort(Key);
- return Uniquifier.count(Key);
-}
-
-/// Return the probability of selecting a formula that does not reference Reg
-/// (e.g., 0.75 when exactly one of four formulae references Reg).
-float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
- unsigned FNum = 0;
- for (const Formula &F : Formulae)
- if (F.referencesReg(Reg))
- FNum++;
- return ((float)(Formulae.size() - FNum)) / Formulae.size();
-}
-
-/// If the given formula has not yet been inserted, add it to the list, and
-/// return true. Return false otherwise. The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
- assert(F.isCanonical(L) && "Invalid canonical representation");
-
- if (!Formulae.empty() && RigidFormula)
- return false;
-
- SmallVector<const SCEV *, 4> Key = F.BaseRegs;
- if (F.ScaledReg) Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for uniquifying.
- llvm::sort(Key);
-
- if (!Uniquifier.insert(Key).second)
- return false;
-
- // Using a register to hold the value of 0 is not profitable.
- assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
- "Zero allocated in a scaled register!");
-#ifndef NDEBUG
- for (const SCEV *BaseReg : F.BaseRegs)
- assert(!BaseReg->isZero() && "Zero allocated in a base register!");
-#endif
-
- // Add the formula to the list.
- Formulae.push_back(F);
-
- // Record registers now being used by this use.
- Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- if (F.ScaledReg)
- Regs.insert(F.ScaledReg);
-
- return true;
-}
-
-/// Remove the given formula from this use's list.
-void LSRUse::DeleteFormula(Formula &F) {
- if (&F != &Formulae.back())
- std::swap(F, Formulae.back());
- Formulae.pop_back();
-}
-
-/// Recompute the Regs field, and update RegUses.
-void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
- // Now that we've filtered out some formulae, recompute the Regs set.
- SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
- Regs.clear();
- for (const Formula &F : Formulae) {
- if (F.ScaledReg) Regs.insert(F.ScaledReg);
- Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- }
-
- // Update the RegTracker.
- for (const SCEV *S : OldRegs)
- if (!Regs.count(S))
- RegUses.dropRegister(S, LUIdx);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRUse::print(raw_ostream &OS) const {
- OS << "LSR Use: Kind=";
- switch (Kind) {
- case Basic: OS << "Basic"; break;
- case Special: OS << "Special"; break;
- case ICmpZero: OS << "ICmpZero"; break;
- case Address:
- OS << "Address of ";
- if (AccessTy.MemTy->isPointerTy())
- OS << "pointer"; // the full pointer type could be really verbose
- else {
- OS << *AccessTy.MemTy;
- }
-
- OS << " in addrspace(" << AccessTy.AddrSpace << ')';
- }
-
- OS << ", Offsets={";
- bool NeedComma = false;
- for (const LSRFixup &Fixup : Fixups) {
- if (NeedComma) OS << ',';
- OS << Fixup.Offset;
- NeedComma = true;
- }
- OS << '}';
-
- if (AllFixupsOutsideLoop)
- OS << ", all-fixups-outside-loop";
-
- if (WidestFixupType)
- OS << ", widest fixup type: " << *WidestFixupType;
-}
-
-LLVM_DUMP_METHOD void LSRUse::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale,
- Instruction *Fixup/*= nullptr*/) {
- switch (Kind) {
- case LSRUse::Address:
- return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
- HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
-
- case LSRUse::ICmpZero:
- // There's not even a target hook for querying whether it would be legal to
- // fold a GV into an ICmp.
- if (BaseGV)
- return false;
-
- // ICmp only has two operands; don't allow more than two non-trivial parts.
- if (Scale != 0 && HasBaseReg && BaseOffset != 0)
- return false;
-
- // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
- // putting the scaled register in the other operand of the icmp.
- if (Scale != 0 && Scale != -1)
- return false;
-
- // If we have low-level target information, ask the target if it can fold an
- // integer immediate on an icmp.
- if (BaseOffset != 0) {
- // We have one of:
- // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
- // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
- // Offs is the ICmp immediate.
- if (Scale == 0)
- // The cast does the right thing with
- // std::numeric_limits<int64_t>::min().
- BaseOffset = -(uint64_t)BaseOffset;
- return TTI.isLegalICmpImmediate(BaseOffset);
- }
-
- // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
- return true;
-
- case LSRUse::Basic:
- // Only handle single-register values.
- return !BaseGV && Scale == 0 && BaseOffset == 0;
-
- case LSRUse::Special:
- // Special case Basic to handle -1 scales.
- return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
- }
-
- llvm_unreachable("Invalid LSRUse Kind!");
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg, int64_t Scale) {
- // Check for overflow.
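-  // (Adding MinOffset/MaxOffset must move the sum in the direction of the
-  // offset's sign; if it does not, the signed addition wrapped.)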
- if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
- (MinOffset > 0))
- return false;
- MinOffset = (uint64_t)BaseOffset + MinOffset;
- if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
- (MaxOffset > 0))
- return false;
- MaxOffset = (uint64_t)BaseOffset + MaxOffset;
-
- return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
- HasBaseReg, Scale) &&
- isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
- HasBaseReg, Scale);
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- int64_t MinOffset, int64_t MaxOffset,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- const Formula &F, const Loop &L) {
- // For the purpose of isAMCompletelyFolded either having a canonical formula
- // or a scale not equal to zero is correct.
-  // Problems may arise from non-canonical formulae having a scale == 0.
-  // Strictly speaking, it would be best to just rely on canonical formulae.
-  // However, when we generate the scaled formulae, we first check that the
-  // scaling factor is profitable before computing the actual ScaledReg, for
-  // compile time's sake.
- assert((F.isCanonical(L) || F.Scale != 0));
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
- F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
-}
-
-/// Test whether we know how to expand the current formula.
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, GlobalValue *BaseGV,
- int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
- // We know how to expand completely foldable formulae.
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
- BaseOffset, HasBaseReg, Scale) ||
- // Or formulae that use a base register produced by a sum of base
- // registers.
- (Scale == 1 &&
- isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
- BaseGV, BaseOffset, true, 0));
-}
-
-static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, const Formula &F) {
- return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
- F.BaseOffset, F.HasBaseReg, F.Scale);
-}
-
-static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F) {
- // Target may want to look at the user instructions.
- if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
- for (const LSRFixup &Fixup : LU.Fixups)
- if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
- (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
- F.Scale, Fixup.UserInst))
- return false;
- return true;
- }
-
- return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
- F.Scale);
-}
-
-static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
- const LSRUse &LU, const Formula &F,
- const Loop &L) {
- if (!F.Scale)
- return 0;
-
- // If the use is not completely folded in that instruction, we will have to
- // pay an extra cost only for scale != 1.
- if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, F, L))
- return F.Scale != 1;
-
- switch (LU.Kind) {
- case LSRUse::Address: {
- // Check the scaling factor cost with both the min and max offsets.
- int ScaleCostMinOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
- int ScaleCostMaxOffset = TTI.getScalingFactorCost(
- LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
- F.Scale, LU.AccessTy.AddrSpace);
-
- assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
- "Legal addressing mode has an illegal cost!");
- return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
- }
- case LSRUse::ICmpZero:
- case LSRUse::Basic:
- case LSRUse::Special:
- // The use is completely folded, i.e., everything is folded into the
- // instruction.
- return 0;
- }
-
- llvm_unreachable("Invalid LSRUse Kind!");
-}
-
-static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
- LSRUse::KindType Kind, MemAccessTy AccessTy,
- GlobalValue *BaseGV, int64_t BaseOffset,
- bool HasBaseReg) {
- // Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
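-  // (The -1 scale for ICmpZero reflects that a scaled register can be folded
-  // by moving it, negated, to the other operand of the icmp.)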
- int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
-
- // Canonicalize a scale of 1 to a base register if the formula doesn't
- // already have a base register.
- if (!HasBaseReg && Scale == 1) {
- Scale = 0;
- HasBaseReg = true;
- }
-
- return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
- HasBaseReg, Scale);
-}
-
-static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
- ScalarEvolution &SE, int64_t MinOffset,
- int64_t MaxOffset, LSRUse::KindType Kind,
- MemAccessTy AccessTy, const SCEV *S,
- bool HasBaseReg) {
- // Fast-path: zero is always foldable.
- if (S->isZero()) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
- int64_t BaseOffset = ExtractImmediate(S, SE);
- GlobalValue *BaseGV = ExtractSymbol(S, SE);
-
- // If there's anything else involved, it's not foldable.
- if (!S->isZero()) return false;
-
- // Fast-path: zero is always foldable.
- if (BaseOffset == 0 && !BaseGV) return true;
-
- // Conservatively, create an address with an immediate and a
- // base and a scale.
- int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
-
- return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
- BaseOffset, HasBaseReg, Scale);
-}
-
-namespace {
-
-/// An individual increment in a Chain of IV increments. Relate an IV user to
-/// an expression that computes the IV it uses from the IV used by the previous
-/// link in the Chain.
-///
-/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
-/// original IVOperand. The head of the chain's IVOperand is only valid during
-/// chain collection, before LSR replaces IV users. During chain generation,
-/// IncExpr can be used to find the new IVOperand that computes the same
-/// expression.
-struct IVInc {
- Instruction *UserInst;
- Value* IVOperand;
- const SCEV *IncExpr;
-
- IVInc(Instruction *U, Value *O, const SCEV *E)
- : UserInst(U), IVOperand(O), IncExpr(E) {}
-};
-
-// The list of IV increments in program order. We typically add the head of a
-// chain without finding subsequent links.
-struct IVChain {
- SmallVector<IVInc, 1> Incs;
- const SCEV *ExprBase = nullptr;
-
- IVChain() = default;
- IVChain(const IVInc &Head, const SCEV *Base)
- : Incs(1, Head), ExprBase(Base) {}
-
- using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
-
- // Return the first increment in the chain.
- const_iterator begin() const {
- assert(!Incs.empty());
- return std::next(Incs.begin());
- }
- const_iterator end() const {
- return Incs.end();
- }
-
- // Returns true if this chain contains any increments.
- bool hasIncs() const { return Incs.size() >= 2; }
-
- // Add an IVInc to the end of this chain.
- void add(const IVInc &X) { Incs.push_back(X); }
-
- // Returns the last UserInst in the chain.
- Instruction *tailUserInst() const { return Incs.back().UserInst; }
-
- // Returns true if IncExpr can be profitably added to this chain.
- bool isProfitableIncrement(const SCEV *OperExpr,
- const SCEV *IncExpr,
- ScalarEvolution&);
-};
-
-/// Helper for CollectChains to track multiple IV increment uses. Distinguish
-/// between FarUsers that definitely cross IV increments and NearUsers that may
-/// be used between IV increments.
-struct ChainUsers {
- SmallPtrSet<Instruction*, 4> FarUsers;
- SmallPtrSet<Instruction*, 4> NearUsers;
-};
-
-/// This class holds state for the main loop strength reduction logic.
-class LSRInstance {
- IVUsers &IU;
- ScalarEvolution &SE;
- DominatorTree &DT;
- LoopInfo &LI;
- AssumptionCache &AC;
- TargetLibraryInfo &TLI;
- const TargetTransformInfo &TTI;
- Loop *const L;
- MemorySSAUpdater *MSSAU;
- bool FavorBackedgeIndex = false;
- bool Changed = false;
-
- /// This is the insert position at which the current loop's induction variable
- /// increment should be placed. In simple loops, this is the latch block's
- /// terminator. But in more complicated cases, this is a position which will
- /// dominate all the in-loop post-increment users.
- Instruction *IVIncInsertPos = nullptr;
-
- /// Interesting factors between use strides.
- ///
- /// We explicitly use a SetVector which contains a SmallSet, instead of the
- /// default, a SmallDenseSet, because we need to use the full range of
- /// int64_ts, and there's currently no good way of doing that with
- /// SmallDenseSet.
- SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
-
- /// Interesting use types, to facilitate truncation reuse.
- SmallSetVector<Type *, 4> Types;
-
- /// The list of interesting uses.
- mutable SmallVector<LSRUse, 16> Uses;
-
- /// Track which uses use which register candidates.
- RegUseTracker RegUses;
-
- // Limit the number of chains to avoid quadratic behavior. We don't expect to
- // have more than a few IV increment chains in a loop. Missing a Chain falls
- // back to normal LSR behavior for those uses.
- static const unsigned MaxChains = 8;
-
- /// IV users can form a chain of IV increments.
- SmallVector<IVChain, MaxChains> IVChainVec;
-
- /// IV users that belong to profitable IVChains.
- SmallPtrSet<Use*, MaxChains> IVIncSet;
-
- void OptimizeShadowIV();
- bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
- ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
- void OptimizeLoopTermCond();
-
- void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
- SmallVectorImpl<ChainUsers> &ChainUsersVec);
- void FinalizeChain(IVChain &Chain);
- void CollectChains();
- void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts);
-
- void CollectInterestingTypesAndFactors();
- void CollectFixupsAndInitialFormulae();
-
- // Support for sharing of LSRUses between LSRFixups.
- using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
- UseMapTy UseMap;
-
- bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
- LSRUse::KindType Kind, MemAccessTy AccessTy);
-
- std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
- MemAccessTy AccessTy);
-
- void DeleteUse(LSRUse &LU, size_t LUIdx);
-
- LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
-
- void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
- void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
- void CountRegisters(const Formula &F, size_t LUIdx);
- bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
-
- void CollectLoopInvariantFixupsAndFormulae();
-
- void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
- unsigned Depth = 0);
-
- void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, unsigned Depth,
- size_t Idx, bool IsScaledReg = false);
- void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, size_t Idx,
- bool IsScaledReg = false);
- void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist,
- size_t Idx, bool IsScaledReg = false);
- void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
- void GenerateCrossUseConstantOffsets();
- void GenerateAllReuseFormulae();
-
- void FilterOutUndesirableDedicatedRegisters();
-
- size_t EstimateSearchSpaceComplexity() const;
- void NarrowSearchSpaceByDetectingSupersets();
- void NarrowSearchSpaceByCollapsingUnrolledCode();
- void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
- void NarrowSearchSpaceByFilterPostInc();
- void NarrowSearchSpaceByDeletingCostlyFormulas();
- void NarrowSearchSpaceByPickingWinnerRegs();
- void NarrowSearchSpaceUsingHeuristics();
-
- void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
- Cost &SolutionCost,
- SmallVectorImpl<const Formula *> &Workspace,
- const Cost &CurCost,
- const SmallPtrSet<const SCEV *, 16> &CurRegs,
- DenseSet<const SCEV *> &VisitedRegs) const;
- void Solve(SmallVectorImpl<const Formula *> &Solution) const;
-
- BasicBlock::iterator
- HoistInsertPosition(BasicBlock::iterator IP,
- const SmallVectorImpl<Instruction *> &Inputs) const;
- BasicBlock::iterator
- AdjustInsertPositionForExpand(BasicBlock::iterator IP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const;
-
- Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- BasicBlock::iterator IP, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
- void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
-
-public:
- LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
- LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
-
- bool getChanged() const { return Changed; }
-
- void print_factors_and_types(raw_ostream &OS) const;
- void print_fixups(raw_ostream &OS) const;
- void print_uses(raw_ostream &OS) const;
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-/// If IV is used in an int-to-float cast inside the loop then try to eliminate
-/// the cast operation.
-void LSRInstance::OptimizeShadowIV() {
- const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
- return;
-
- for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
- UI != E; /* empty */) {
- IVUsers::const_iterator CandidateUI = UI;
- ++UI;
- Instruction *ShadowUse = CandidateUI->getUser();
- Type *DestTy = nullptr;
- bool IsSigned = false;
-
- /* If shadow use is an int->float cast then insert a second IV
- to eliminate this cast.
-
- for (unsigned i = 0; i < n; ++i)
- foo((double)i);
-
- is transformed into
-
- double d = 0.0;
- for (unsigned i = 0; i < n; ++i, ++d)
- foo(d);
- */
- if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
- IsSigned = false;
- DestTy = UCast->getDestTy();
- }
- else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
- IsSigned = true;
- DestTy = SCast->getDestTy();
- }
- if (!DestTy) continue;
-
- // If target does not support DestTy natively then do not apply
- // this transformation.
- if (!TTI.isTypeLegal(DestTy)) continue;
-
- PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
- if (!PH) continue;
- if (PH->getNumIncomingValues() != 2) continue;
-
- // If the calculation in integers overflows, the result in FP type will
- // differ. So we can only do this transformation if we are guaranteed not to
- // deal with overflowing values.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
- if (!AR) continue;
- if (IsSigned && !AR->hasNoSignedWrap()) continue;
- if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
-
- Type *SrcTy = PH->getType();
- int Mantissa = DestTy->getFPMantissaWidth();
- if (Mantissa == -1) continue;
- if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
- continue;
-
- unsigned Entry, Latch;
- if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
- Entry = 0;
- Latch = 1;
- } else {
- Entry = 1;
- Latch = 0;
- }
-
- ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
- if (!Init) continue;
- Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
- (double)Init->getSExtValue() :
- (double)Init->getZExtValue());
-
- BinaryOperator *Incr =
- dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
- if (!Incr) continue;
- if (Incr->getOpcode() != Instruction::Add
- && Incr->getOpcode() != Instruction::Sub)
- continue;
-
- /* Initialize new IV, double d = 0.0 in above example. */
- ConstantInt *C = nullptr;
- if (Incr->getOperand(0) == PH)
- C = dyn_cast<ConstantInt>(Incr->getOperand(1));
- else if (Incr->getOperand(1) == PH)
- C = dyn_cast<ConstantInt>(Incr->getOperand(0));
- else
- continue;
-
- if (!C) continue;
-
- // Ignore negative constants, as the code below doesn't handle them
- // correctly. TODO: Remove this restriction.
- if (!C->getValue().isStrictlyPositive()) continue;
-
- /* Add new PHINode. */
- PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
-
- /* create new increment. '++d' in above example. */
- Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
- BinaryOperator *NewIncr =
- BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
- Instruction::FAdd : Instruction::FSub,
- NewPH, CFP, "IV.S.next.", Incr);
-
- NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
- NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
-
- /* Remove cast operation */
- ShadowUse->replaceAllUsesWith(NewPH);
- ShadowUse->eraseFromParent();
- Changed = true;
- break;
- }
-}
-
-/// If Cond has an operand that is an expression of an IV, set the IV user and
-/// stride information and return true, otherwise return false.
-bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
- for (IVStrideUse &U : IU)
- if (U.getUser() == Cond) {
- // NOTE: we could handle setcc instructions with multiple uses here, but
- // InstCombine does it as well for simple uses, it's not clear that it
- // occurs enough in real life to handle.
- CondUse = &U;
- return true;
- }
- return false;
-}
-
-/// Rewrite the loop's terminating condition if it uses a max computation.
-///
-/// This is a narrow solution to a specific, but acute, problem. For loops
-/// like this:
-///
-/// i = 0;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i < n);
-///
-/// the trip count isn't just 'n', because 'n' might not be positive. And
-/// unfortunately this can come up even for loops where the user didn't use
-/// a C do-while loop. For example, seemingly well-behaved top-test loops
-/// will commonly be lowered like this:
-///
-/// if (n > 0) {
-/// i = 0;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i < n);
-/// }
-///
-/// and then it's possible for subsequent optimization to obscure the if
-/// test in such a way that indvars can't find it.
-///
-/// When indvars can't find the if test in loops like this, it creates a
-/// max expression, which allows it to give the loop a canonical
-/// induction variable:
-///
-/// i = 0;
-/// max = n < 1 ? 1 : n;
-/// do {
-/// p[i] = 0.0;
-/// } while (++i != max);
-///
-/// Canonical induction variables are necessary because the loop passes
-/// are designed around them. The most obvious example of this is the
-/// LoopInfo analysis, which doesn't remember trip count values. It
-/// expects to be able to rediscover the trip count each time it is
-/// needed, and it does this using a simple analysis that only succeeds if
-/// the loop has a canonical induction variable.
-///
-/// However, when it comes time to generate code, the maximum operation
-/// can be quite costly, especially if it's inside of an outer loop.
-///
-/// This function solves this problem by detecting this type of loop and
-/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting
-/// the instructions for the maximum computation.
-ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
- // Check that the loop matches the pattern we're looking for.
- if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
- Cond->getPredicate() != CmpInst::ICMP_NE)
- return Cond;
-
- SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
- if (!Sel || !Sel->hasOneUse()) return Cond;
-
- const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
- return Cond;
- const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
-
- // Add one to the backedge-taken count to get the trip count.
- const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
- if (IterationCount != SE.getSCEV(Sel)) return Cond;
-
- // Check for a max calculation that matches the pattern. There's no check
- // for ICMP_ULE here because the comparison would be with zero, which
- // isn't interesting.
- CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
- const SCEVNAryExpr *Max = nullptr;
- if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
- Pred = ICmpInst::ICMP_SLE;
- Max = S;
- } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
- Pred = ICmpInst::ICMP_SLT;
- Max = S;
- } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
- Pred = ICmpInst::ICMP_ULT;
- Max = U;
- } else {
- // No match; bail.
- return Cond;
- }
-
- // To handle a max with more than two operands, this optimization would
- // require additional checking and setup.
- if (Max->getNumOperands() != 2)
- return Cond;
-
- const SCEV *MaxLHS = Max->getOperand(0);
- const SCEV *MaxRHS = Max->getOperand(1);
-
- // ScalarEvolution canonicalizes constants to the left. For < and >, look
- // for a comparison with 1. For <= and >=, a comparison with zero.
- if (!MaxLHS ||
- (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
- return Cond;
-
- // Check the relevant induction variable for conformance to
- // the pattern.
- const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
- if (!AR || !AR->isAffine() ||
- AR->getStart() != One ||
- AR->getStepRecurrence(SE) != One)
- return Cond;
-
- assert(AR->getLoop() == L &&
- "Loop condition operand is an addrec in a different loop!");
-
- // Check the right operand of the select, and remember it, as it will
- // be used in the new comparison instruction.
- Value *NewRHS = nullptr;
- if (ICmpInst::isTrueWhenEqual(Pred)) {
- // Look for n+1, and grab n.
- if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
- if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
- if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
- if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
- if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
- if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
- NewRHS = BO->getOperand(0);
- if (!NewRHS)
- return Cond;
- } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
- NewRHS = Sel->getOperand(1);
- else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
- NewRHS = Sel->getOperand(2);
- else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
- NewRHS = SU->getValue();
- else
- // Max doesn't match expected pattern.
- return Cond;
-
- // Determine the new comparison opcode. It may be signed or unsigned,
- // and the original comparison may be either equality or inequality.
- if (Cond->getPredicate() == CmpInst::ICMP_EQ)
- Pred = CmpInst::getInversePredicate(Pred);
-
- // Ok, everything looks ok to change the condition into an SLT or SGE and
- // delete the max calculation.
- ICmpInst *NewCond =
- new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
-
- // Delete the max calculation instructions.
- Cond->replaceAllUsesWith(NewCond);
- CondUse->setUser(NewCond);
- Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
- Cond->eraseFromParent();
- Sel->eraseFromParent();
- if (Cmp->use_empty())
- Cmp->eraseFromParent();
- return NewCond;
-}
-
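Read against the do-while example in the comment above: the exit test ++i != max with max = n < 1 ? 1 : n matches the signed-max pattern, n is recovered from the select as the new right-hand operand, the predicate stays SLT because the original compare was NE (only EQ is inverted), and the condition is rebuilt as ++i < n; the select and its feeding compare, now dead, are erased.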
-/// Change loop terminating condition to use the postinc iv when possible.
-void
-LSRInstance::OptimizeLoopTermCond() {
- SmallPtrSet<Instruction *, 4> PostIncs;
-
- // We need a different set of heuristics for rotated and non-rotated loops.
- // If a loop is rotated then the latch is also the backedge, so inserting
- // post-inc expressions just before the latch is ideal. To reduce live ranges
- // it also makes sense to rewrite terminating conditions to use post-inc
- // expressions.
- //
- // If the loop is not rotated then the latch is not a backedge; the latch
- // check is done in the loop head. Adding post-inc expressions before the
- // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
- // in the loop body. In this case we do *not* want to use post-inc expressions
- // in the latch check, and we want to insert post-inc expressions before
- // the backedge.
- BasicBlock *LatchBlock = L->getLoopLatch();
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
- return LatchBlock != BB;
- })) {
- // The backedge doesn't exit the loop; treat this as a head-tested loop.
- IVIncInsertPos = LatchBlock->getTerminator();
- return;
- }
-
- // Otherwise treat this as a rotated loop.
- for (BasicBlock *ExitingBlock : ExitingBlocks) {
- // Get the terminating condition for the loop if possible. If we
- // can, we want to change it to use a post-incremented version of its
- // induction variable, to allow coalescing the live ranges for the IV into
- // one register value.
-
- BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!TermBr)
- continue;
- // FIXME: Overly conservative; the termination condition could be an 'or', etc.
- if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
- continue;
-
- // Search IVUsesByStride to find Cond's IVUse if there is one.
- IVStrideUse *CondUse = nullptr;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
- if (!FindIVUserForCond(Cond, CondUse))
- continue;
-
- // If the trip count is computed in terms of a max (due to ScalarEvolution
- // being unable to find a sufficient guard, for example), change the loop
- // comparison to use SLT or ULT instead of NE.
- // One consequence of doing this now is that it disrupts the count-down
- // optimization. That's not always a bad thing though, because in such
- // cases it may still be worthwhile to avoid a max.
- Cond = OptimizeMax(Cond, CondUse);
-
- // If this exiting block dominates the latch block, it may also use
- // the post-inc value if it won't be shared with other uses.
- // Check for dominance.
- if (!DT.dominates(ExitingBlock, LatchBlock))
- continue;
-
- // Conservatively avoid trying to use the post-inc value in non-latch
- // exits if there may be pre-inc users in intervening blocks.
- if (LatchBlock != ExitingBlock)
- for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
- // Test if the use is reachable from the exiting block. This dominator
- // query is a conservative approximation of reachability.
- if (&*UI != CondUse &&
- !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
- // Conservatively assume there may be reuse if the quotient of their
- // strides could be a legal scale.
- const SCEV *A = IU.getStride(*CondUse, L);
- const SCEV *B = IU.getStride(*UI, L);
- if (!A || !B) continue;
- if (SE.getTypeSizeInBits(A->getType()) !=
- SE.getTypeSizeInBits(B->getType())) {
- if (SE.getTypeSizeInBits(A->getType()) >
- SE.getTypeSizeInBits(B->getType()))
- B = SE.getSignExtendExpr(B, A->getType());
- else
- A = SE.getSignExtendExpr(A, B->getType());
- }
- if (const SCEVConstant *D =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
- const ConstantInt *C = D->getValue();
- // Stride of one or negative one can have reuse with non-addresses.
- if (C->isOne() || C->isMinusOne())
- goto decline_post_inc;
- // Avoid weird situations.
- if (C->getValue().getMinSignedBits() >= 64 ||
- C->getValue().isMinSignedValue())
- goto decline_post_inc;
- // Check for possible scaled-address reuse.
- if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
- MemAccessTy AccessTy = getAccessType(
- TTI, UI->getUser(), UI->getOperandValToReplace());
- int64_t Scale = C->getSExtValue();
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- Scale = -Scale;
- if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
- /*BaseOffset=*/0,
- /*HasBaseReg=*/false, Scale,
- AccessTy.AddrSpace))
- goto decline_post_inc;
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
- << *Cond << '\n');
-
- // It's possible for the setcc instruction to be anywhere in the loop, and
- // possible for it to have multiple users. If it is not immediately before
- // the exiting block branch, move it.
- if (&*++BasicBlock::iterator(Cond) != TermBr) {
- if (Cond->hasOneUse()) {
- Cond->moveBefore(TermBr);
- } else {
- // Clone the terminating condition and insert into the loopend.
- ICmpInst *OldCond = Cond;
- Cond = cast<ICmpInst>(Cond->clone());
- Cond->setName(L->getHeader()->getName() + ".termcond");
- ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
-
- // Clone the IVUse, as the old use still exists!
- CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
- TermBr->replaceUsesOfWith(OldCond, Cond);
- }
- }
-
- // If we get to here, we know that we can transform the setcc instruction to
- // use the post-incremented version of the IV, allowing us to coalesce the
- // live ranges for the IV correctly.
- CondUse->transformToPostInc(L);
- Changed = true;
-
- PostIncs.insert(Cond);
- decline_post_inc:;
- }
-
- // Determine an insertion point for the loop induction variable increment. It
- // must dominate all the post-inc comparisons we just set up, and it must
- // dominate the loop latch edge.
- IVIncInsertPos = L->getLoopLatch()->getTerminator();
- for (Instruction *Inst : PostIncs) {
- BasicBlock *BB =
- DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
- Inst->getParent());
- if (BB == Inst->getParent())
- IVIncInsertPos = Inst;
- else if (BB != IVIncInsertPos->getParent())
- IVIncInsertPos = BB->getTerminator();
- }
-}
-
-/// Determine if the given use can accommodate a fixup at the given offset and
-/// other details. If so, update the use and return true.
-bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
- bool HasBaseReg, LSRUse::KindType Kind,
- MemAccessTy AccessTy) {
- int64_t NewMinOffset = LU.MinOffset;
- int64_t NewMaxOffset = LU.MaxOffset;
- MemAccessTy NewAccessTy = AccessTy;
-
- // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
- // something conservative; however, this can pessimize in the case that one of
- // the uses will have all its uses outside the loop, for example.
- if (LU.Kind != Kind)
- return false;
-
- // Check for a mismatched access type, and fall back conservatively as needed.
- // TODO: Be less conservative when the type is similar and can use the same
- // addressing modes.
- if (Kind == LSRUse::Address) {
- if (AccessTy.MemTy != LU.AccessTy.MemTy) {
- NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
- AccessTy.AddrSpace);
- }
- }
-
- // Conservatively assume HasBaseReg is true for now.
- if (NewOffset < LU.MinOffset) {
- if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
- LU.MaxOffset - NewOffset, HasBaseReg))
- return false;
- NewMinOffset = NewOffset;
- } else if (NewOffset > LU.MaxOffset) {
- if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
- NewOffset - LU.MinOffset, HasBaseReg))
- return false;
- NewMaxOffset = NewOffset;
- }
-
- // Update the use.
- LU.MinOffset = NewMinOffset;
- LU.MaxOffset = NewMaxOffset;
- LU.AccessTy = NewAccessTy;
- return true;
-}
-
-/// Return an LSRUse index and an offset value for a fixup which needs the given
-/// expression, with the given kind and optional access type. Either reuse an
-/// existing use or create a new one, as needed.
-std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
- LSRUse::KindType Kind,
- MemAccessTy AccessTy) {
- const SCEV *Copy = Expr;
- int64_t Offset = ExtractImmediate(Expr, SE);
-
- // Basic uses can't accept any offset, for example.
- if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
- Offset, /*HasBaseReg=*/ true)) {
- Expr = Copy;
- Offset = 0;
- }
-
- std::pair<UseMapTy::iterator, bool> P =
- UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
- if (!P.second) {
- // A use already existed with this base.
- size_t LUIdx = P.first->second;
- LSRUse &LU = Uses[LUIdx];
- if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
- // Reuse this use.
- return std::make_pair(LUIdx, Offset);
- }
-
- // Create a new use.
- size_t LUIdx = Uses.size();
- P.first->second = LUIdx;
- Uses.push_back(LSRUse(Kind, AccessTy));
- LSRUse &LU = Uses[LUIdx];
-
- LU.MinOffset = Offset;
- LU.MaxOffset = Offset;
- return std::make_pair(LUIdx, Offset);
-}
-
-/// Delete the given use from the Uses list.
-void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
- if (&LU != &Uses.back())
- std::swap(LU, Uses.back());
- Uses.pop_back();
-
- // Update RegUses.
- RegUses.swapAndDropUse(LUIdx, Uses.size());
-}
-
-/// Look for a use distinct from OrigLU which has a formula with the same
-/// registers as the given formula.
-LSRUse *
-LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
- const LSRUse &OrigLU) {
- // Search all uses for the formula. This could be more clever.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- // Check whether this use is close enough to OrigLU, to see whether it's
- // worthwhile looking through its formulae.
- // Ignore ICmpZero uses because they may contain formulae generated by
- // GenerateICmpZeroScales, in which case adding fixup offsets may
- // be invalid.
- if (&LU != &OrigLU &&
- LU.Kind != LSRUse::ICmpZero &&
- LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
- LU.WidestFixupType == OrigLU.WidestFixupType &&
- LU.HasFormulaWithSameRegs(OrigF)) {
- // Scan through this use's formulae.
- for (const Formula &F : LU.Formulae) {
- // Check to see if this formula has the same registers and symbols
- // as OrigF.
- if (F.BaseRegs == OrigF.BaseRegs &&
- F.ScaledReg == OrigF.ScaledReg &&
- F.BaseGV == OrigF.BaseGV &&
- F.Scale == OrigF.Scale &&
- F.UnfoldedOffset == OrigF.UnfoldedOffset) {
- if (F.BaseOffset == 0)
- return &LU;
- // This is the formula where all the registers and symbols matched;
- // there aren't going to be any others. Since we declined it, we
- // can skip the rest of the formulae and proceed to the next LSRUse.
- break;
- }
- }
- }
- }
-
- // Nothing looked good.
- return nullptr;
-}
-
-void LSRInstance::CollectInterestingTypesAndFactors() {
- SmallSetVector<const SCEV *, 4> Strides;
-
- // Collect interesting types and strides.
- SmallVector<const SCEV *, 4> Worklist;
- for (const IVStrideUse &U : IU) {
- const SCEV *Expr = IU.getExpr(U);
-
- // Collect interesting types.
- Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
-
- // Add strides for mentioned loops.
- Worklist.push_back(Expr);
- do {
- const SCEV *S = Worklist.pop_back_val();
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- if (AR->getLoop() == L)
- Strides.insert(AR->getStepRecurrence(SE));
- Worklist.push_back(AR->getStart());
- } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- Worklist.append(Add->op_begin(), Add->op_end());
- }
- } while (!Worklist.empty());
- }
-
- // Compute interesting factors from the set of interesting strides.
- for (SmallSetVector<const SCEV *, 4>::const_iterator
- I = Strides.begin(), E = Strides.end(); I != E; ++I)
- for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
- std::next(I); NewStrideIter != E; ++NewStrideIter) {
- const SCEV *OldStride = *I;
- const SCEV *NewStride = *NewStrideIter;
-
- if (SE.getTypeSizeInBits(OldStride->getType()) !=
- SE.getTypeSizeInBits(NewStride->getType())) {
- if (SE.getTypeSizeInBits(OldStride->getType()) >
- SE.getTypeSizeInBits(NewStride->getType()))
- NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
- else
- OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
- }
- if (const SCEVConstant *Factor =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
- SE, true))) {
- if (Factor->getAPInt().getMinSignedBits() <= 64)
- Factors.insert(Factor->getAPInt().getSExtValue());
- } else if (const SCEVConstant *Factor =
- dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
- NewStride,
- SE, true))) {
- if (Factor->getAPInt().getMinSignedBits() <= 64)
- Factors.insert(Factor->getAPInt().getSExtValue());
- }
- }
-
- // If all uses use the same type, don't bother looking for truncation-based
- // reuse.
- if (Types.size() == 1)
- Types.clear();
-
- LLVM_DEBUG(print_factors_and_types(dbgs()));
-}
-
-/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
-/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
-/// IVStrideUses, we could partially skip this.
-static User::op_iterator
-findIVOperand(User::op_iterator OI, User::op_iterator OE,
- Loop *L, ScalarEvolution &SE) {
- for(; OI != OE; ++OI) {
- if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
- if (!SE.isSCEVable(Oper->getType()))
- continue;
-
- if (const SCEVAddRecExpr *AR =
- dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
- if (AR->getLoop() == L)
- break;
- }
- }
- }
- return OI;
-}
-
-/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
-/// a convenient helper.
-static Value *getWideOperand(Value *Oper) {
- if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
- return Trunc->getOperand(0);
- return Oper;
-}
-
-/// Return true if we allow an IV chain to include both types.
-static bool isCompatibleIVType(Value *LVal, Value *RVal) {
- Type *LType = LVal->getType();
- Type *RType = RVal->getType();
- return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
- // Different address spaces mean (possibly)
- // different types of the pointer implementation,
- // e.g. i16 vs i32 so disallow that.
- (LType->getPointerAddressSpace() ==
- RType->getPointerAddressSpace()));
-}
-
-/// Return an approximation of this SCEV expression's "base", or NULL for any
-/// constant. Returning the expression itself is conservative. Returning a
-/// deeper subexpression is more precise and valid as long as it isn't less
-/// complex than another subexpression. For expressions involving multiple
-/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
-/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
-/// IVInc==b-a.
-///
-/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
-/// SCEVUnknown, we simply return the rightmost SCEV operand.
-static const SCEV *getExprBase(const SCEV *S) {
- switch (S->getSCEVType()) {
- default: // including scUnknown.
- return S;
- case scConstant:
- return nullptr;
- case scTruncate:
- return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
- case scZeroExtend:
- return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
- case scSignExtend:
- return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
- case scAddExpr: {
- // Skip over scaled operands (scMulExpr) to follow add operands as long as
- // there's nothing more complex.
- // FIXME: not sure if we want to recognize negation.
- const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
- E(Add->op_begin()); I != E; ++I) {
- const SCEV *SubExpr = *I;
- if (SubExpr->getSCEVType() == scAddExpr)
- return getExprBase(SubExpr);
-
- if (SubExpr->getSCEVType() != scMulExpr)
- return SubExpr;
- }
- return S; // all operands are scaled, be conservative.
- }
- case scAddRecExpr:
- return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
- }
+ return getSetupCost(S->getOperand(), Depth - 1);
+ if (auto S = dyn_cast<SCEVNAryExpr>(Reg))
+ return std::accumulate(S->op_begin(), S->op_end(), 0,
+ [&](unsigned i, const SCEV *Reg) {
+ return i + getSetupCost(Reg, Depth - 1);
+ });
+ if (auto S = dyn_cast<SCEVUDivExpr>(Reg))
+ return getSetupCost(S->getLHS(), Depth - 1) +
+ getSetupCost(S->getRHS(), Depth - 1);
+ return 0;
+}
+
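For intuition, the recursion above can be read as a depth-limited walk over an expression tree that counts how many leaf values would need a setup instruction in the preheader. Below is a minimal standalone sketch of that shape; Expr, IsValue and setupCost are hypothetical names invented for illustration, not LSR's SCEV classes, and the real getSetupCost walks SCEV nodes and handles more cases than shown here:

  #include <numeric>
  #include <vector>

  struct Expr {
    bool IsValue = false;            // leaf that would need a setup instruction
    std::vector<const Expr *> Kids;  // operands of an interior node
  };

  // Count value leaves, giving up (cost 0) once the depth budget is exhausted.
  unsigned setupCost(const Expr *E, unsigned Depth) {
    if (E->IsValue)
      return 1;
    if (Depth == 0)
      return 0;
    return std::accumulate(E->Kids.begin(), E->Kids.end(), 0u,
                           [&](unsigned Acc, const Expr *K) {
                             return Acc + setupCost(K, Depth - 1);
                           });
  }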
+/// Tally up interesting quantities from the given register.
+void Cost::RateRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs) {
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+ // If this is an addrec for another loop, it should be an invariant
+ // with respect to L since L is the innermost loop (at least
+ // for now LSR only handles innermost loops).
+ if (AR->getLoop() != L) {
+ // If the AddRec exists, consider its register free and leave it alone.
+ if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
+ return;
+
+ // It is bad to allow LSR for current loop to add induction variables
+ // for its sibling loops.
+ if (!AR->getLoop()->contains(L)) {
+ Lose();
+ return;
+ }
+
+ // Otherwise, it will be an invariant with respect to Loop L.
+ ++C.NumRegs;
+ return;
+ }
+
+ unsigned LoopCost = 1;
+ if (TTI->isIndexedLoadLegal(TTI->MIM_PostInc, AR->getType()) ||
+ TTI->isIndexedStoreLegal(TTI->MIM_PostInc, AR->getType())) {
+
+ // If the step size matches the base offset, we could use pre-indexed
+ // addressing.
+ if (TTI->shouldFavorBackedgeIndex(L)) {
+ if (auto *Step = dyn_cast<SCEVConstant>(AR->getStepRecurrence(*SE)))
+ if (Step->getAPInt() == F.BaseOffset)
+ LoopCost = 0;
+ }
+
+ if (TTI->shouldFavorPostInc()) {
+ const SCEV *LoopStep = AR->getStepRecurrence(*SE);
+ if (isa<SCEVConstant>(LoopStep)) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) &&
+ SE->isLoopInvariant(LoopStart, L))
+ LoopCost = 0;
+ }
+ }
+ }
+ C.AddRecCost += LoopCost;
+
+ // Add the step value register, if it needs one.
+ // TODO: The non-affine case isn't precisely modeled here.
+ if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) {
+ if (!Regs.count(AR->getOperand(1))) {
+ RateRegister(F, AR->getOperand(1), Regs);
+ if (isLoser())
+ return;
+ }
+ }
+ }
+ ++C.NumRegs;
+
+ // Rough heuristic; favor registers which don't require extra setup
+ // instructions in the preheader.
+ C.SetupCost += getSetupCost(Reg, SetupCostDepthLimit);
+ // Ensure we don't, even with the recursion limit, produce invalid costs.
+ C.SetupCost = std::min<unsigned>(C.SetupCost, 1 << 16);
+
+ C.NumIVMuls += isa<SCEVMulExpr>(Reg) &&
+ SE->hasComputableLoopEvolution(Reg, L);
+}
+
+/// Record this register in the set. If we haven't seen it before, rate
+/// it. Optional LoserRegs provides a way to declare any formula that refers to
+/// one of those regs an instant loser.
+void Cost::RatePrimaryRegister(const Formula &F, const SCEV *Reg,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ if (LoserRegs && LoserRegs->count(Reg)) {
+ Lose();
+ return;
+ }
+ if (Regs.insert(Reg).second) {
+ RateRegister(F, Reg, Regs);
+ if (LoserRegs && isLoser())
+ LoserRegs->insert(Reg);
+ }
+}
+
+void Cost::RateFormula(const Formula &F,
+ SmallPtrSetImpl<const SCEV *> &Regs,
+ const DenseSet<const SCEV *> &VisitedRegs,
+ const LSRUse &LU,
+ SmallPtrSetImpl<const SCEV *> *LoserRegs) {
+ assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
+ // Tally up the registers.
+ unsigned PrevAddRecCost = C.AddRecCost;
+ unsigned PrevNumRegs = C.NumRegs;
+ unsigned PrevNumBaseAdds = C.NumBaseAdds;
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (VisitedRegs.count(ScaledReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(F, ScaledReg, Regs, LoserRegs);
+ if (isLoser())
+ return;
+ }
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (VisitedRegs.count(BaseReg)) {
+ Lose();
+ return;
+ }
+ RatePrimaryRegister(F, BaseReg, Regs, LoserRegs);
+ if (isLoser())
+ return;
+ }
+
+ // Determine how many (unfolded) adds we'll need inside the loop.
+ size_t NumBaseParts = F.getNumRegs();
+ if (NumBaseParts > 1)
+ // Do not count the base and a possible second register if the target
+ // allows folding 2 registers.
+ C.NumBaseAdds +=
+ NumBaseParts - (1 + (F.Scale && isAMCompletelyFolded(*TTI, LU, F)));
+ C.NumBaseAdds += (F.UnfoldedOffset != 0);
+
+ // Accumulate non-free scaling amounts.
+ C.ScaleCost += getScalingFactorCost(*TTI, LU, F, *L);
+
+ // Tally up the non-zero immediates.
+ for (const LSRFixup &Fixup : LU.Fixups) {
+ int64_t O = Fixup.Offset;
+ int64_t Offset = (uint64_t)O + F.BaseOffset;
+ if (F.BaseGV)
+ C.ImmCost += 64; // Handle symbolic values conservatively.
+ // TODO: This should probably be the pointer size.
+ else if (Offset != 0)
+ C.ImmCost += APInt(64, Offset, true).getMinSignedBits();
+
+ // Check with target if this offset with this instruction is
+ // specifically not supported.
+ if (LU.Kind == LSRUse::Address && Offset != 0 &&
+ !isAMCompletelyFolded(*TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ Offset, F.HasBaseReg, F.Scale, Fixup.UserInst))
+ C.NumBaseAdds++;
+ }
+
+ // If we don't count instruction cost, exit here.
+ if (!InsnsCost) {
+ assert(isValid() && "invalid cost");
+ return;
+ }
+
+ // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+ // an additional instruction (at least a fill).
+ // TODO: Need to distinguish register class?
+ unsigned TTIRegNum = TTI->getNumberOfRegisters(
+ TTI->getRegisterClassForType(false, F.getType())) - 1;
+ if (C.NumRegs > TTIRegNum) {
+ // If the cost already exceeded TTIRegNum, only newly added registers can add
+ // new instructions.
+ if (PrevNumRegs > TTIRegNum)
+ C.Insns += (C.NumRegs - PrevNumRegs);
+ else
+ C.Insns += (C.NumRegs - TTIRegNum);
+ }
+
+ // If an ICmpZero formula does not end in 0, it cannot be replaced by
+ // just an add or sub. We'll need to compare the final result of the AddRec.
+ // That means we'll need an additional instruction. But if the target can
+ // macro-fuse a compare with a branch, don't count this extra instruction.
+ // For -10 + {0, +, 1}:
+ // i = i + 1;
+ // cmp i, 10
+ //
+ // For {-10, +, 1}:
+ // i = i + 1;
+ if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd() &&
+ !TTI->canMacroFuseCmp())
+ C.Insns++;
+ // Each new AddRec adds 1 instruction to the calculation.
+ C.Insns += (C.AddRecCost - PrevAddRecCost);
+
+ // BaseAdds adds instructions for unfolded registers.
+ if (LU.Kind != LSRUse::ICmpZero)
+ C.Insns += C.NumBaseAdds - PrevNumBaseAdds;
+ assert(isValid() && "invalid cost");
+}
+
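To make the register-pressure term in RateFormula above concrete: assuming a register class with 8 registers, TTIRegNum is 7. A formula that raises NumRegs from 6 to 9 charges 9 - 7 = 2 extra Insns, because only the registers beyond the budget count, while a formula that raises NumRegs from 8 to 9 charges just 9 - 8 = 1, since the budget was already exceeded before this formula was rated.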
+/// Set this cost to a losing value.
+void Cost::Lose() {
+ C.Insns = std::numeric_limits<unsigned>::max();
+ C.NumRegs = std::numeric_limits<unsigned>::max();
+ C.AddRecCost = std::numeric_limits<unsigned>::max();
+ C.NumIVMuls = std::numeric_limits<unsigned>::max();
+ C.NumBaseAdds = std::numeric_limits<unsigned>::max();
+ C.ImmCost = std::numeric_limits<unsigned>::max();
+ C.SetupCost = std::numeric_limits<unsigned>::max();
+ C.ScaleCost = std::numeric_limits<unsigned>::max();
+}
+
+/// Choose the lower cost.
+bool Cost::isLess(Cost &Other) {
+ if (InsnsCost.getNumOccurrences() > 0 && InsnsCost &&
+ C.Insns != Other.C.Insns)
+ return C.Insns < Other.C.Insns;
+ return TTI->isLSRCostLess(C, Other.C);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void Cost::print(raw_ostream &OS) const {
+ if (InsnsCost)
+ OS << C.Insns << " instruction" << (C.Insns == 1 ? " " : "s ");
+ OS << C.NumRegs << " reg" << (C.NumRegs == 1 ? "" : "s");
+ if (C.AddRecCost != 0)
+ OS << ", with addrec cost " << C.AddRecCost;
+ if (C.NumIVMuls != 0)
+ OS << ", plus " << C.NumIVMuls << " IV mul"
+ << (C.NumIVMuls == 1 ? "" : "s");
+ if (C.NumBaseAdds != 0)
+ OS << ", plus " << C.NumBaseAdds << " base add"
+ << (C.NumBaseAdds == 1 ? "" : "s");
+ if (C.ScaleCost != 0)
+ OS << ", plus " << C.ScaleCost << " scale cost";
+ if (C.ImmCost != 0)
+ OS << ", plus " << C.ImmCost << " imm cost";
+ if (C.SetupCost != 0)
+ OS << ", plus " << C.SetupCost << " setup cost";
+}
+
+LLVM_DUMP_METHOD void Cost::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this fixup always uses its value outside of the given loop.
+bool LSRFixup::isUseFullyOutsideLoop(const Loop *L) const {
+ // PHI nodes use their value in their incoming blocks.
+ if (const PHINode *PN = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == OperandValToReplace &&
+ L->contains(PN->getIncomingBlock(i)))
+ return false;
+ return true;
+ }
+
+ return !L->contains(UserInst);
+}
+
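The PHI special case above reflects where a PHI operand is actually consumed: the value is live out of the corresponding incoming block rather than at the PHI itself. So if any incoming block that supplies OperandValToReplace lies inside L, the value is still needed inside the loop even when the PHI sits in an exit block, and the fixup is not fully outside the loop.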
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRFixup::print(raw_ostream &OS) const {
+ OS << "UserInst=";
+ // Store is common and interesting enough to be worth special-casing.
+ if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) {
+ OS << "store ";
+ Store->getOperand(0)->printAsOperand(OS, /*PrintType=*/false);
+ } else if (UserInst->getType()->isVoidTy())
+ OS << UserInst->getOpcodeName();
+ else
+ UserInst->printAsOperand(OS, /*PrintType=*/false);
+
+ OS << ", OperandValToReplace=";
+ OperandValToReplace->printAsOperand(OS, /*PrintType=*/false);
+
+ for (const Loop *PIL : PostIncLoops) {
+ OS << ", PostIncLoop=";
+ PIL->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ if (Offset != 0)
+ OS << ", Offset=" << Offset;
+}
+
+LLVM_DUMP_METHOD void LSRFixup::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Test whether this use has a formula with the same registers as the given
+/// formula.
+bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ llvm::sort(Key);
+ return Uniquifier.count(Key);
+}
+
+/// Returns the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+ unsigned FNum = 0;
+ for (const Formula &F : Formulae)
+ if (F.referencesReg(Reg))
+ FNum++;
+ return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
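For example, if 2 of a use's 5 formulae reference Reg, getNotSelectedProbability returns (5 - 2) / 5 = 0.6.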
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise. The formula must be in canonical form.
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+ assert(F.isCanonical(L) && "Invalid canonical representation");
+
+ if (!Formulae.empty() && RigidFormula)
+ return false;
+
+ SmallVector<const SCEV *, 4> Key = F.BaseRegs;
+ if (F.ScaledReg) Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for uniquifying.
+ llvm::sort(Key);
+
+ if (!Uniquifier.insert(Key).second)
+ return false;
+
+ // Using a register to hold the value of 0 is not profitable.
+ assert((!F.ScaledReg || !F.ScaledReg->isZero()) &&
+ "Zero allocated in a scaled register!");
+#ifndef NDEBUG
+ for (const SCEV *BaseReg : F.BaseRegs)
+ assert(!BaseReg->isZero() && "Zero allocated in a base register!");
+#endif
+
+ // Add the formula to the list.
+ Formulae.push_back(F);
+
+ // Record registers now being used by this use.
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ Regs.insert(F.ScaledReg);
+
+ return true;
+}
+
+/// Remove the given formula from this use's list.
+void LSRUse::DeleteFormula(Formula &F) {
+ if (&F != &Formulae.back())
+ std::swap(F, Formulae.back());
+ Formulae.pop_back();
+}
+
+/// Recompute the Regs field, and update RegUses.
+void LSRUse::RecomputeRegs(size_t LUIdx, RegUseTracker &RegUses) {
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ SmallPtrSet<const SCEV *, 4> OldRegs = std::move(Regs);
+ Regs.clear();
+ for (const Formula &F : Formulae) {
+ if (F.ScaledReg) Regs.insert(F.ScaledReg);
+ Regs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ }
+
+ // Update the RegTracker.
+ for (const SCEV *S : OldRegs)
+ if (!Regs.count(S))
+ RegUses.dropRegister(S, LUIdx);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRUse::print(raw_ostream &OS) const {
+ OS << "LSR Use: Kind=";
+ switch (Kind) {
+ case Basic: OS << "Basic"; break;
+ case Special: OS << "Special"; break;
+ case ICmpZero: OS << "ICmpZero"; break;
+ case Address:
+ OS << "Address of ";
+ if (AccessTy.MemTy->isPointerTy())
+ OS << "pointer"; // the full pointer type could be really verbose
+ else {
+ OS << *AccessTy.MemTy;
+ }
+
+ OS << " in addrspace(" << AccessTy.AddrSpace << ')';
+ }
+
+ OS << ", Offsets={";
+ bool NeedComma = false;
+ for (const LSRFixup &Fixup : Fixups) {
+ if (NeedComma) OS << ',';
+ OS << Fixup.Offset;
+ NeedComma = true;
+ }
+ OS << '}';
+
+ if (AllFixupsOutsideLoop)
+ OS << ", all-fixups-outside-loop";
+
+ if (WidestFixupType)
+ OS << ", widest fixup type: " << *WidestFixupType;
+}
+
+LLVM_DUMP_METHOD void LSRUse::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale,
+ Instruction *Fixup/*= nullptr*/) {
+ switch (Kind) {
+ case LSRUse::Address:
+ return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale, AccessTy.AddrSpace, Fixup);
+
+ case LSRUse::ICmpZero:
+ // There's not even a target hook for querying whether it would be legal to
+ // fold a GV into an ICmp.
+ if (BaseGV)
+ return false;
+
+ // ICmp only has two operands; don't allow more than two non-trivial parts.
+ if (Scale != 0 && HasBaseReg && BaseOffset != 0)
+ return false;
+
+ // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale by
+ // putting the scaled register in the other operand of the icmp.
+ if (Scale != 0 && Scale != -1)
+ return false;
+
+ // If we have low-level target information, ask the target if it can fold an
+ // integer immediate on an icmp.
+ if (BaseOffset != 0) {
+ // We have one of:
+ // ICmpZero BaseReg + BaseOffset => ICmp BaseReg, -BaseOffset
+ // ICmpZero -1*ScaleReg + BaseOffset => ICmp ScaleReg, BaseOffset
+ // Offs is the ICmp immediate.
+ if (Scale == 0)
+ // The cast does the right thing with
+ // std::numeric_limits<int64_t>::min().
+ BaseOffset = -(uint64_t)BaseOffset;
+ return TTI.isLegalICmpImmediate(BaseOffset);
+ }
+
+ // ICmpZero BaseReg + -1*ScaleReg => ICmp BaseReg, ScaleReg
+ return true;
+
+ case LSRUse::Basic:
+ // Only handle single-register values.
+ return !BaseGV && Scale == 0 && BaseOffset == 0;
+
+ case LSRUse::Special:
+ // Special case Basic to handle -1 scales.
+ return !BaseGV && (Scale == 0 || Scale == -1) && BaseOffset == 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
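A concrete reading of the ICmpZero case above: a use modeled as ICmpZero with base register %iv and BaseOffset 7 (no scale) stands for comparing %iv + 7 against zero. It is treated as completely foldable exactly when the target accepts the negated immediate, i.e. when isLegalICmpImmediate(-7) holds, since the compare can then be emitted as icmp eq %iv, -7.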
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg, int64_t Scale) {
+ // Check for overflow.
+ if (((int64_t)((uint64_t)BaseOffset + MinOffset) > BaseOffset) !=
+ (MinOffset > 0))
+ return false;
+ MinOffset = (uint64_t)BaseOffset + MinOffset;
+ if (((int64_t)((uint64_t)BaseOffset + MaxOffset) > BaseOffset) !=
+ (MaxOffset > 0))
+ return false;
+ MaxOffset = (uint64_t)BaseOffset + MaxOffset;
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MinOffset,
+ HasBaseReg, Scale) &&
+ isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, MaxOffset,
+ HasBaseReg, Scale);
+}
+
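The two early returns at the top of the overload above are signed-overflow guards on the widened offset range: with BaseOffset = INT64_MAX and MinOffset = 1, for example, the wrapped sum is INT64_MIN, which is not greater than BaseOffset even though MinOffset is positive, so the two sides of the comparison disagree and the range is rejected.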
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ int64_t MinOffset, int64_t MaxOffset,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ const Formula &F, const Loop &L) {
+ // For the purpose of isAMCompletelyFolded, either having a canonical formula
+ // or a scale not equal to zero is correct.
+ // Problems may arise from non-canonical formulae having a scale == 0.
+ // Strictly speaking, it would be best to just rely on canonical formulae.
+ // However, when we generate the scaled formulae, we first check that the
+ // scaling factor is profitable before computing the actual ScaledReg, for
+ // compile time's sake.
+ assert((F.isCanonical(L) || F.Scale != 0));
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+/// Test whether we know how to expand the current formula.
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, GlobalValue *BaseGV,
+ int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
+ // We know how to expand completely foldable formulae.
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale) ||
+ // Or formulae that use a base register produced by a sum of base
+ // registers.
+ (Scale == 1 &&
+ isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
+ BaseGV, BaseOffset, true, 0));
+}
+
+static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const Formula &F) {
+ return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
+ F.BaseOffset, F.HasBaseReg, F.Scale);
+}
+
+static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F) {
+ // Target may want to look at the user instructions.
+ if (LU.Kind == LSRUse::Address && TTI.LSRWithInstrQueries()) {
+ for (const LSRFixup &Fixup : LU.Fixups)
+ if (!isAMCompletelyFolded(TTI, LSRUse::Address, LU.AccessTy, F.BaseGV,
+ (F.BaseOffset + Fixup.Offset), F.HasBaseReg,
+ F.Scale, Fixup.UserInst))
+ return false;
+ return true;
+ }
+
+ return isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F.BaseGV, F.BaseOffset, F.HasBaseReg,
+ F.Scale);
+}
+
+static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
+ const LSRUse &LU, const Formula &F,
+ const Loop &L) {
+ if (!F.Scale)
+ return 0;
+
+ // If the use is not completely folded in that instruction, we will have to
+ // pay an extra cost only for scale != 1.
+ if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, F, L))
+ return F.Scale != 1;
+
+ switch (LU.Kind) {
+ case LSRUse::Address: {
+ // Check the scaling factor cost with both the min and max offsets.
+ int ScaleCostMinOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+ int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+ LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+ F.Scale, LU.AccessTy.AddrSpace);
+
+ assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
+ "Legal addressing mode has an illegal cost!");
+ return std::max(ScaleCostMinOffset, ScaleCostMaxOffset);
+ }
+ case LSRUse::ICmpZero:
+ case LSRUse::Basic:
+ case LSRUse::Special:
+ // The use is completely folded, i.e., everything is folded into the
+ // instruction.
+ return 0;
+ }
+
+ llvm_unreachable("Invalid LSRUse Kind!");
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ LSRUse::KindType Kind, MemAccessTy AccessTy,
+ GlobalValue *BaseGV, int64_t BaseOffset,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ // Canonicalize a scale of 1 to a base register if the formula doesn't
+ // already have a base register.
+ if (!HasBaseReg && Scale == 1) {
+ Scale = 0;
+ HasBaseReg = true;
+ }
+
+ return isAMCompletelyFolded(TTI, Kind, AccessTy, BaseGV, BaseOffset,
+ HasBaseReg, Scale);
+}
+
+static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
+ ScalarEvolution &SE, int64_t MinOffset,
+ int64_t MaxOffset, LSRUse::KindType Kind,
+ MemAccessTy AccessTy, const SCEV *S,
+ bool HasBaseReg) {
+ // Fast-path: zero is always foldable.
+ if (S->isZero()) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t BaseOffset = ExtractImmediate(S, SE);
+ GlobalValue *BaseGV = ExtractSymbol(S, SE);
+
+ // If there's anything else involved, it's not foldable.
+ if (!S->isZero()) return false;
+
+ // Fast-path: zero is always foldable.
+ if (BaseOffset == 0 && !BaseGV) return true;
+
+ // Conservatively, create an address with an immediate and a
+ // base and a scale.
+ int64_t Scale = Kind == LSRUse::ICmpZero ? -1 : 1;
+
+ return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy, BaseGV,
+ BaseOffset, HasBaseReg, Scale);
+}
+
+namespace {
+
+/// An individual increment in a Chain of IV increments. Relate an IV user to
+/// an expression that computes the IV it uses from the IV used by the previous
+/// link in the Chain.
+///
+/// For the head of a chain, IncExpr holds the absolute SCEV expression for the
+/// original IVOperand. The head of the chain's IVOperand is only valid during
+/// chain collection, before LSR replaces IV users. During chain generation,
+/// IncExpr can be used to find the new IVOperand that computes the same
+/// expression.
+struct IVInc {
+ Instruction *UserInst;
+ Value* IVOperand;
+ const SCEV *IncExpr;
+
+ IVInc(Instruction *U, Value *O, const SCEV *E)
+ : UserInst(U), IVOperand(O), IncExpr(E) {}
+};
+
+// The list of IV increments in program order. We typically add the head of a
+// chain without finding subsequent links.
+struct IVChain {
+ SmallVector<IVInc, 1> Incs;
+ const SCEV *ExprBase = nullptr;
+
+ IVChain() = default;
+ IVChain(const IVInc &Head, const SCEV *Base)
+ : Incs(1, Head), ExprBase(Base) {}
+
+ using const_iterator = SmallVectorImpl<IVInc>::const_iterator;
+
+ // Return the first increment in the chain.
+ const_iterator begin() const {
+ assert(!Incs.empty());
+ return std::next(Incs.begin());
+ }
+ const_iterator end() const {
+ return Incs.end();
+ }
+
+  // Returns true if this chain contains any increments beyond the head.
+ bool hasIncs() const { return Incs.size() >= 2; }
+
+ // Add an IVInc to the end of this chain.
+ void add(const IVInc &X) { Incs.push_back(X); }
+
+ // Returns the last UserInst in the chain.
+ Instruction *tailUserInst() const { return Incs.back().UserInst; }
+
+ // Returns true if IncExpr can be profitably added to this chain.
+ bool isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution&);
+};
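+
+// Editor's note: the following is an illustrative sketch added for this
+// document, not part of LSR. It shows, at the source level, what an IV chain
+// captures: several users whose addresses differ by a loop-invariant offset
+// can each be computed from the previous one instead of from the primary IV.
+// The sketch assumes 'n' and 'stride' keep every access in bounds.
+#if 0
+static long sumUnchained(const long *p, long n, long stride) {
+  long s = 0;
+  for (long i = 0; i < n; ++i) {
+    s += p[i];              // IV user 1: address = p + i
+    s += p[i + stride];     // IV user 2: address = p + i + stride
+    s += p[i + 2 * stride]; // IV user 3: address = p + i + 2*stride
+  }
+  return s;
+}
+
+static long sumChained(const long *p, long n, long stride) {
+  long s = 0;
+  for (long i = 0; i < n; ++i) {
+    const long *q0 = p + i;       // head of the chain
+    const long *q1 = q0 + stride; // IVInc: q1 computed from q0
+    const long *q2 = q1 + stride; // IVInc: q2 computed from q1
+    s += *q0 + *q1 + *q2;
+  }
+  return s;
+}
+#endif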
+
+/// Helper for CollectChains to track multiple IV increment uses. Distinguish
+/// between FarUsers that definitely cross IV increments and NearUsers that may
+/// be used between IV increments.
+struct ChainUsers {
+ SmallPtrSet<Instruction*, 4> FarUsers;
+ SmallPtrSet<Instruction*, 4> NearUsers;
+};
+
+/// This class holds state for the main loop strength reduction logic.
+class LSRInstance {
+ IVUsers &IU;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ AssumptionCache &AC;
+ TargetLibraryInfo &TLI;
+ const TargetTransformInfo &TTI;
+ Loop *const L;
+ MemorySSAUpdater *MSSAU;
+ bool FavorBackedgeIndex = false;
+ bool Changed = false;
+
+  /// This is the insert position at which the current loop's induction
+  /// variable increment should be placed. In simple loops, this is the latch
+  /// block's terminator. But in more complicated cases, this is a position
+  /// which will dominate all the in-loop post-increment users.
+ Instruction *IVIncInsertPos = nullptr;
+
+ /// Interesting factors between use strides.
+ ///
+ /// We explicitly use a SetVector which contains a SmallSet, instead of the
+ /// default, a SmallDenseSet, because we need to use the full range of
+ /// int64_ts, and there's currently no good way of doing that with
+ /// SmallDenseSet.
+ SetVector<int64_t, SmallVector<int64_t, 8>, SmallSet<int64_t, 8>> Factors;
+
+ /// Interesting use types, to facilitate truncation reuse.
+ SmallSetVector<Type *, 4> Types;
+
+ /// The list of interesting uses.
+ mutable SmallVector<LSRUse, 16> Uses;
+
+ /// Track which uses use which register candidates.
+ RegUseTracker RegUses;
+
+ // Limit the number of chains to avoid quadratic behavior. We don't expect to
+ // have more than a few IV increment chains in a loop. Missing a Chain falls
+ // back to normal LSR behavior for those uses.
+ static const unsigned MaxChains = 8;
+
+ /// IV users can form a chain of IV increments.
+ SmallVector<IVChain, MaxChains> IVChainVec;
+
+ /// IV users that belong to profitable IVChains.
+ SmallPtrSet<Use*, MaxChains> IVIncSet;
+
+ void OptimizeShadowIV();
+ bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse);
+ ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse);
+ void OptimizeLoopTermCond();
+
+ void ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec);
+ void FinalizeChain(IVChain &Chain);
+ void CollectChains();
+ void GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts);
+
+ void CollectInterestingTypesAndFactors();
+ void CollectFixupsAndInitialFormulae();
+
+ // Support for sharing of LSRUses between LSRFixups.
+ using UseMapTy = DenseMap<LSRUse::SCEVUseKindPair, size_t>;
+ UseMapTy UseMap;
+
+ bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
+ LSRUse::KindType Kind, MemAccessTy AccessTy);
+
+ std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+ MemAccessTy AccessTy);
+
+ void DeleteUse(LSRUse &LU, size_t LUIdx);
+
+ LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU);
+
+ void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx);
+ void CountRegisters(const Formula &F, size_t LUIdx);
+ bool InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F);
+
+ void CollectLoopInvariantFixupsAndFormulae();
+
+ void GenerateReassociations(LSRUse &LU, unsigned LUIdx, Formula Base,
+ unsigned Depth = 0);
+
+ void GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, unsigned Depth,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateCombinations(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg = false);
+ void GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateConstantOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist,
+ size_t Idx, bool IsScaledReg = false);
+ void GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base);
+ void GenerateCrossUseConstantOffsets();
+ void GenerateAllReuseFormulae();
+
+ void FilterOutUndesirableDedicatedRegisters();
+
+ size_t EstimateSearchSpaceComplexity() const;
+ void NarrowSearchSpaceByDetectingSupersets();
+ void NarrowSearchSpaceByCollapsingUnrolledCode();
+ void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByFilterPostInc();
+ void NarrowSearchSpaceByDeletingCostlyFormulas();
+ void NarrowSearchSpaceByPickingWinnerRegs();
+ void NarrowSearchSpaceUsingHeuristics();
+
+ void SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const;
+ void Solve(SmallVectorImpl<const Formula *> &Solution) const;
+
+ BasicBlock::iterator
+ HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs) const;
+ BasicBlock::iterator
+ AdjustInsertPositionForExpand(BasicBlock::iterator IP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const;
+
+ Value *Expand(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ BasicBlock::iterator IP, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void RewriteForPHI(PHINode *PN, const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void Rewrite(const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const;
+ void ImplementSolution(const SmallVectorImpl<const Formula *> &Solution);
+
+public:
+ LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
+ LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
+
+ bool getChanged() const { return Changed; }
+
+ void print_factors_and_types(raw_ostream &OS) const;
+ void print_fixups(raw_ostream &OS) const;
+ void print_uses(raw_ostream &OS) const;
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+/// If IV is used in an int-to-float cast inside the loop then try to eliminate
+/// the cast operation.
+void LSRInstance::OptimizeShadowIV() {
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return;
+
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end();
+ UI != E; /* empty */) {
+ IVUsers::const_iterator CandidateUI = UI;
+ ++UI;
+ Instruction *ShadowUse = CandidateUI->getUser();
+ Type *DestTy = nullptr;
+ bool IsSigned = false;
+
+    /* If shadow use is an int->float cast then insert a second IV
+ to eliminate this cast.
+
+ for (unsigned i = 0; i < n; ++i)
+ foo((double)i);
+
+ is transformed into
+
+ double d = 0.0;
+ for (unsigned i = 0; i < n; ++i, ++d)
+ foo(d);
+ */
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = false;
+ DestTy = UCast->getDestTy();
+ }
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) {
+ IsSigned = true;
+ DestTy = SCast->getDestTy();
+ }
+ if (!DestTy) continue;
+
+    // If the target does not support DestTy natively then do not apply
+    // this transformation.
+ if (!TTI.isTypeLegal(DestTy)) continue;
+
+ PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0));
+ if (!PH) continue;
+ if (PH->getNumIncomingValues() != 2) continue;
+
+    // If the calculation in integers overflows, the result in FP type will
+    // differ. So we can only do this transformation if we are guaranteed not
+    // to deal with overflowing values.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(PH));
+ if (!AR) continue;
+ if (IsSigned && !AR->hasNoSignedWrap()) continue;
+ if (!IsSigned && !AR->hasNoUnsignedWrap()) continue;
+
+ Type *SrcTy = PH->getType();
+ int Mantissa = DestTy->getFPMantissaWidth();
+ if (Mantissa == -1) continue;
+ if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa)
+ continue;
+
+ unsigned Entry, Latch;
+ if (PH->getIncomingBlock(0) == L->getLoopPreheader()) {
+ Entry = 0;
+ Latch = 1;
+ } else {
+ Entry = 1;
+ Latch = 0;
+ }
+
+ ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry));
+ if (!Init) continue;
+ Constant *NewInit = ConstantFP::get(DestTy, IsSigned ?
+ (double)Init->getSExtValue() :
+ (double)Init->getZExtValue());
+
+ BinaryOperator *Incr =
+ dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch));
+ if (!Incr) continue;
+ if (Incr->getOpcode() != Instruction::Add
+ && Incr->getOpcode() != Instruction::Sub)
+ continue;
+
+ /* Initialize new IV, double d = 0.0 in above example. */
+ ConstantInt *C = nullptr;
+ if (Incr->getOperand(0) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(1));
+ else if (Incr->getOperand(1) == PH)
+ C = dyn_cast<ConstantInt>(Incr->getOperand(0));
+ else
+ continue;
+
+ if (!C) continue;
+
+ // Ignore negative constants, as the code below doesn't handle them
+ // correctly. TODO: Remove this restriction.
+ if (!C->getValue().isStrictlyPositive()) continue;
+
+ /* Add new PHINode. */
+ PHINode *NewPH = PHINode::Create(DestTy, 2, "IV.S.", PH);
+
+ /* create new increment. '++d' in above example. */
+ Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue());
+ BinaryOperator *NewIncr =
+ BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ?
+ Instruction::FAdd : Instruction::FSub,
+ NewPH, CFP, "IV.S.next.", Incr);
+
+ NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry));
+ NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch));
+
+ /* Remove cast operation */
+ ShadowUse->replaceAllUsesWith(NewPH);
+ ShadowUse->eraseFromParent();
+ Changed = true;
+ break;
+ }
+}
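+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It renders the shadow-IV transformation described above as compilable
+// source. The rewrite is only valid because a 32-bit 'i' fits in double's
+// 53-bit mantissa and the increment cannot wrap before the exit test.
+#if 0
+static void shadowIVBefore(double *out, unsigned n) {
+  for (unsigned i = 0; i < n; ++i)
+    out[i] = (double)i; // int-to-float cast of the IV on every iteration
+}
+
+static void shadowIVAfter(double *out, unsigned n) {
+  double d = 0.0; // shadow IV, initialized from the integer IV's start value
+  for (unsigned i = 0; i < n; ++i, d += 1.0)
+    out[i] = d; // the cast is gone; 'd' is incremented in lock-step with 'i'
+}
+#endif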
+
+/// If Cond has an operand that is an expression of an IV, set the IV user and
+/// stride information and return true, otherwise return false.
+bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse) {
+ for (IVStrideUse &U : IU)
+ if (U.getUser() == Cond) {
+ // NOTE: we could handle setcc instructions with multiple uses here, but
+ // InstCombine does it as well for simple uses, it's not clear that it
+ // occurs enough in real life to handle.
+ CondUse = &U;
+ return true;
+ }
+ return false;
+}
+
+/// Rewrite the loop's terminating condition if it uses a max computation.
+///
+/// This is a narrow solution to a specific, but acute, problem. For loops
+/// like this:
+///
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+///
+/// the trip count isn't just 'n', because 'n' might not be positive. And
+/// unfortunately this can come up even for loops where the user didn't use
+/// a C do-while loop. For example, seemingly well-behaved top-test loops
+/// will commonly be lowered like this:
+///
+/// if (n > 0) {
+/// i = 0;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i < n);
+/// }
+///
+/// and then it's possible for subsequent optimization to obscure the if
+/// test in such a way that indvars can't find it.
+///
+/// When indvars can't find the if test in loops like this, it creates a
+/// max expression, which allows it to give the loop a canonical
+/// induction variable:
+///
+/// i = 0;
+/// max = n < 1 ? 1 : n;
+/// do {
+/// p[i] = 0.0;
+/// } while (++i != max);
+///
+/// Canonical induction variables are necessary because the loop passes
+/// are designed around them. The most obvious example of this is the
+/// LoopInfo analysis, which doesn't remember trip count values. It
+/// expects to be able to rediscover the trip count each time it is
+/// needed, and it does this using a simple analysis that only succeeds if
+/// the loop has a canonical induction variable.
+///
+/// However, when it comes time to generate code, the maximum operation
+/// can be quite costly, especially if it's inside of an outer loop.
+///
+/// This function solves this problem by detecting loops of this type and
+/// rewriting their conditions from ICMP_NE back to ICMP_SLT, and by deleting
+/// the instructions for the maximum computation.
+ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) {
+ // Check that the loop matches the pattern we're looking for.
+ if (Cond->getPredicate() != CmpInst::ICMP_EQ &&
+ Cond->getPredicate() != CmpInst::ICMP_NE)
+ return Cond;
+
+ SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1));
+ if (!Sel || !Sel->hasOneUse()) return Cond;
+
+ const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ return Cond;
+ const SCEV *One = SE.getConstant(BackedgeTakenCount->getType(), 1);
+
+ // Add one to the backedge-taken count to get the trip count.
+ const SCEV *IterationCount = SE.getAddExpr(One, BackedgeTakenCount);
+ if (IterationCount != SE.getSCEV(Sel)) return Cond;
+
+ // Check for a max calculation that matches the pattern. There's no check
+ // for ICMP_ULE here because the comparison would be with zero, which
+ // isn't interesting.
+ CmpInst::Predicate Pred = ICmpInst::BAD_ICMP_PREDICATE;
+ const SCEVNAryExpr *Max = nullptr;
+ if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(BackedgeTakenCount)) {
+ Pred = ICmpInst::ICMP_SLE;
+ Max = S;
+ } else if (const SCEVSMaxExpr *S = dyn_cast<SCEVSMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_SLT;
+ Max = S;
+ } else if (const SCEVUMaxExpr *U = dyn_cast<SCEVUMaxExpr>(IterationCount)) {
+ Pred = ICmpInst::ICMP_ULT;
+ Max = U;
+ } else {
+ // No match; bail.
+ return Cond;
+ }
+
+ // To handle a max with more than two operands, this optimization would
+ // require additional checking and setup.
+ if (Max->getNumOperands() != 2)
+ return Cond;
+
+ const SCEV *MaxLHS = Max->getOperand(0);
+ const SCEV *MaxRHS = Max->getOperand(1);
+
+ // ScalarEvolution canonicalizes constants to the left. For < and >, look
+ // for a comparison with 1. For <= and >=, a comparison with zero.
+ if (!MaxLHS ||
+ (ICmpInst::isTrueWhenEqual(Pred) ? !MaxLHS->isZero() : (MaxLHS != One)))
+ return Cond;
+
+ // Check the relevant induction variable for conformance to
+ // the pattern.
+ const SCEV *IV = SE.getSCEV(Cond->getOperand(0));
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV);
+ if (!AR || !AR->isAffine() ||
+ AR->getStart() != One ||
+ AR->getStepRecurrence(SE) != One)
+ return Cond;
+
+ assert(AR->getLoop() == L &&
+ "Loop condition operand is an addrec in a different loop!");
+
+ // Check the right operand of the select, and remember it, as it will
+ // be used in the new comparison instruction.
+ Value *NewRHS = nullptr;
+ if (ICmpInst::isTrueWhenEqual(Pred)) {
+ // Look for n+1, and grab n.
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2)))
+ if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1)))
+ if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS)
+ NewRHS = BO->getOperand(0);
+ if (!NewRHS)
+ return Cond;
+ } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS)
+ NewRHS = Sel->getOperand(1);
+ else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS)
+ NewRHS = Sel->getOperand(2);
+ else if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(MaxRHS))
+ NewRHS = SU->getValue();
+ else
+ // Max doesn't match expected pattern.
+ return Cond;
+
+ // Determine the new comparison opcode. It may be signed or unsigned,
+ // and the original comparison may be either equality or inequality.
+ if (Cond->getPredicate() == CmpInst::ICMP_EQ)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ // Ok, everything looks ok to change the condition into an SLT or SGE and
+ // delete the max calculation.
+ ICmpInst *NewCond =
+ new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp");
+
+ // Delete the max calculation instructions.
+ Cond->replaceAllUsesWith(NewCond);
+ CondUse->setUser(NewCond);
+ Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
+ Sel->eraseFromParent();
+ if (Cmp->use_empty())
+ Cmp->eraseFromParent();
+ return NewCond;
+}
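+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the rewrite performed above at the source level: 'max' was
+// introduced to give the loop a canonical IV, and OptimizeMax turns the !=
+// test back into a signed comparison so the max computation can be deleted.
+// Both versions assume 'p' has at least one element.
+#if 0
+static void withMax(double *p, int n) {
+  int i = 0;
+  int max = n < 1 ? 1 : n; // smax(n, 1), materialized before the loop
+  do {
+    p[i] = 0.0;
+  } while (++i != max);    // ICMP_NE against the max
+}
+
+static void withoutMax(double *p, int n) {
+  int i = 0;
+  do {
+    p[i] = 0.0;
+  } while (++i < n);       // ICMP_SLT against the original bound; no max
+}
+#endif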
+
+/// Change loop terminating condition to use the postinc iv when possible.
+void
+LSRInstance::OptimizeLoopTermCond() {
+ SmallPtrSet<Instruction *, 4> PostIncs;
+
+ // We need a different set of heuristics for rotated and non-rotated loops.
+ // If a loop is rotated then the latch is also the backedge, so inserting
+ // post-inc expressions just before the latch is ideal. To reduce live ranges
+ // it also makes sense to rewrite terminating conditions to use post-inc
+ // expressions.
+ //
+ // If the loop is not rotated then the latch is not a backedge; the latch
+ // check is done in the loop head. Adding post-inc expressions before the
+ // latch will cause overlapping live-ranges of pre-inc and post-inc expressions
+ // in the loop body. In this case we do *not* want to use post-inc expressions
+ // in the latch check, and we want to insert post-inc expressions before
+ // the backedge.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (llvm::all_of(ExitingBlocks, [&LatchBlock](const BasicBlock *BB) {
+ return LatchBlock != BB;
+ })) {
+ // The backedge doesn't exit the loop; treat this as a head-tested loop.
+ IVIncInsertPos = LatchBlock->getTerminator();
+ return;
+ }
+
+ // Otherwise treat this as a rotated loop.
+ for (BasicBlock *ExitingBlock : ExitingBlocks) {
+ // Get the terminating condition for the loop if possible. If we
+ // can, we want to change it to use a post-incremented version of its
+ // induction variable, to allow coalescing the live ranges for the IV into
+ // one register value.
+
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!TermBr)
+ continue;
+ // FIXME: Overly conservative, termination condition could be an 'or' etc..
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
+ continue;
+
+ // Search IVUsesByStride to find Cond's IVUse if there is one.
+ IVStrideUse *CondUse = nullptr;
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+ if (!FindIVUserForCond(Cond, CondUse))
+ continue;
+
+ // If the trip count is computed in terms of a max (due to ScalarEvolution
+ // being unable to find a sufficient guard, for example), change the loop
+ // comparison to use SLT or ULT instead of NE.
+ // One consequence of doing this now is that it disrupts the count-down
+ // optimization. That's not always a bad thing though, because in such
+ // cases it may still be worthwhile to avoid a max.
+ Cond = OptimizeMax(Cond, CondUse);
+
+ // If this exiting block dominates the latch block, it may also use
+ // the post-inc value if it won't be shared with other uses.
+ // Check for dominance.
+ if (!DT.dominates(ExitingBlock, LatchBlock))
+ continue;
+
+ // Conservatively avoid trying to use the post-inc value in non-latch
+ // exits if there may be pre-inc users in intervening blocks.
+ if (LatchBlock != ExitingBlock)
+ for (IVUsers::const_iterator UI = IU.begin(), E = IU.end(); UI != E; ++UI)
+ // Test if the use is reachable from the exiting block. This dominator
+ // query is a conservative approximation of reachability.
+ if (&*UI != CondUse &&
+ !DT.properlyDominates(UI->getUser()->getParent(), ExitingBlock)) {
+ // Conservatively assume there may be reuse if the quotient of their
+ // strides could be a legal scale.
+ const SCEV *A = IU.getStride(*CondUse, L);
+ const SCEV *B = IU.getStride(*UI, L);
+ if (!A || !B) continue;
+ if (SE.getTypeSizeInBits(A->getType()) !=
+ SE.getTypeSizeInBits(B->getType())) {
+ if (SE.getTypeSizeInBits(A->getType()) >
+ SE.getTypeSizeInBits(B->getType()))
+ B = SE.getSignExtendExpr(B, A->getType());
+ else
+ A = SE.getSignExtendExpr(A, B->getType());
+ }
+ if (const SCEVConstant *D =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(B, A, SE))) {
+ const ConstantInt *C = D->getValue();
+ // Stride of one or negative one can have reuse with non-addresses.
+ if (C->isOne() || C->isMinusOne())
+ goto decline_post_inc;
+ // Avoid weird situations.
+ if (C->getValue().getMinSignedBits() >= 64 ||
+ C->getValue().isMinSignedValue())
+ goto decline_post_inc;
+ // Check for possible scaled-address reuse.
+ if (isAddressUse(TTI, UI->getUser(), UI->getOperandValToReplace())) {
+ MemAccessTy AccessTy = getAccessType(
+ TTI, UI->getUser(), UI->getOperandValToReplace());
+ int64_t Scale = C->getSExtValue();
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ Scale = -Scale;
+ if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+ /*BaseOffset=*/0,
+ /*HasBaseReg=*/false, Scale,
+ AccessTy.AddrSpace))
+ goto decline_post_inc;
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: "
+ << *Cond << '\n');
+
+ // It's possible for the setcc instruction to be anywhere in the loop, and
+ // possible for it to have multiple users. If it is not immediately before
+ // the exiting block branch, move it.
+ if (&*++BasicBlock::iterator(Cond) != TermBr) {
+ if (Cond->hasOneUse()) {
+ Cond->moveBefore(TermBr);
+ } else {
+        // Clone the terminating condition and insert it at the loop end.
+ ICmpInst *OldCond = Cond;
+ Cond = cast<ICmpInst>(Cond->clone());
+ Cond->setName(L->getHeader()->getName() + ".termcond");
+ ExitingBlock->getInstList().insert(TermBr->getIterator(), Cond);
+
+ // Clone the IVUse, as the old use still exists!
+ CondUse = &IU.AddUser(Cond, CondUse->getOperandValToReplace());
+ TermBr->replaceUsesOfWith(OldCond, Cond);
+ }
+ }
+
+ // If we get to here, we know that we can transform the setcc instruction to
+ // use the post-incremented version of the IV, allowing us to coalesce the
+ // live ranges for the IV correctly.
+ CondUse->transformToPostInc(L);
+ Changed = true;
+
+ PostIncs.insert(Cond);
+ decline_post_inc:;
+ }
+
+ // Determine an insertion point for the loop induction variable increment. It
+ // must dominate all the post-inc comparisons we just set up, and it must
+ // dominate the loop latch edge.
+ IVIncInsertPos = L->getLoopLatch()->getTerminator();
+ for (Instruction *Inst : PostIncs) {
+ BasicBlock *BB =
+ DT.findNearestCommonDominator(IVIncInsertPos->getParent(),
+ Inst->getParent());
+ if (BB == Inst->getParent())
+ IVIncInsertPos = Inst;
+ else if (BB != IVIncInsertPos->getParent())
+ IVIncInsertPos = BB->getTerminator();
+ }
+}
+
+/// Determine if the given use can accommodate a fixup at the given offset and
+/// other details. If so, update the use and return true.
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+ bool HasBaseReg, LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ int64_t NewMinOffset = LU.MinOffset;
+ int64_t NewMaxOffset = LU.MaxOffset;
+ MemAccessTy NewAccessTy = AccessTy;
+
+ // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
+ // something conservative, however this can pessimize in the case that one of
+ // the uses will have all its uses outside the loop, for example.
+ if (LU.Kind != Kind)
+ return false;
+
+ // Check for a mismatched access type, and fall back conservatively as needed.
+ // TODO: Be less conservative when the type is similar and can use the same
+ // addressing modes.
+ if (Kind == LSRUse::Address) {
+ if (AccessTy.MemTy != LU.AccessTy.MemTy) {
+ NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext(),
+ AccessTy.AddrSpace);
+ }
+ }
+
+ // Conservatively assume HasBaseReg is true for now.
+ if (NewOffset < LU.MinOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ LU.MaxOffset - NewOffset, HasBaseReg))
+ return false;
+ NewMinOffset = NewOffset;
+ } else if (NewOffset > LU.MaxOffset) {
+ if (!isAlwaysFoldable(TTI, Kind, NewAccessTy, /*BaseGV=*/nullptr,
+ NewOffset - LU.MinOffset, HasBaseReg))
+ return false;
+ NewMaxOffset = NewOffset;
+ }
+
+ // Update the use.
+ LU.MinOffset = NewMinOffset;
+ LU.MaxOffset = NewMaxOffset;
+ LU.AccessTy = NewAccessTy;
+ return true;
+}
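+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It reduces the offset reconciliation above to its arithmetic;
+// 'spanIsFoldable' is a hypothetical stand-in for the target query made
+// through isAlwaysFoldable(), not an LLVM API.
+#if 0
+struct OffsetRange { long Min, Max; };
+
+static bool widenIfFoldable(OffsetRange &R, long NewOffset,
+                            bool (*spanIsFoldable)(long Span)) {
+  if (NewOffset < R.Min) {
+    // Every fixup must fit one addressing mode, so the span from the new
+    // minimum to the existing maximum has to be foldable.
+    if (!spanIsFoldable(R.Max - NewOffset))
+      return false;
+    R.Min = NewOffset;
+  } else if (NewOffset > R.Max) {
+    if (!spanIsFoldable(NewOffset - R.Min))
+      return false;
+    R.Max = NewOffset;
+  }
+  return true; // NewOffset now lies within [Min, Max].
+}
+#endif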
+
+/// Return an LSRUse index and an offset value for a fixup which needs the given
+/// expression, with the given kind and optional access type. Either reuse an
+/// existing use or create a new one, as needed.
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+ LSRUse::KindType Kind,
+ MemAccessTy AccessTy) {
+ const SCEV *Copy = Expr;
+ int64_t Offset = ExtractImmediate(Expr, SE);
+
+ // Basic uses can't accept any offset, for example.
+ if (!isAlwaysFoldable(TTI, Kind, AccessTy, /*BaseGV=*/ nullptr,
+ Offset, /*HasBaseReg=*/ true)) {
+ Expr = Copy;
+ Offset = 0;
+ }
+
+ std::pair<UseMapTy::iterator, bool> P =
+ UseMap.insert(std::make_pair(LSRUse::SCEVUseKindPair(Expr, Kind), 0));
+ if (!P.second) {
+ // A use already existed with this base.
+ size_t LUIdx = P.first->second;
+ LSRUse &LU = Uses[LUIdx];
+ if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy))
+ // Reuse this use.
+ return std::make_pair(LUIdx, Offset);
+ }
+
+ // Create a new use.
+ size_t LUIdx = Uses.size();
+ P.first->second = LUIdx;
+ Uses.push_back(LSRUse(Kind, AccessTy));
+ LSRUse &LU = Uses[LUIdx];
+
+ LU.MinOffset = Offset;
+ LU.MaxOffset = Offset;
+ return std::make_pair(LUIdx, Offset);
+}
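+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the use-sharing scheme above with the constant offset peeled
+// off first, so expressions differing only by a foldable constant map to one
+// key and share a use. 'UseKey' and 'UseInfo' are hypothetical stand-ins; the
+// sketch omits the reconcileNewOffset() check and the fallback of creating a
+// fresh use when reconciliation fails. Assumes <map>, <vector>, and <utility>
+// are available.
+#if 0
+using UseKey = std::pair<const void *, int>; // (stripped expr, kind)
+struct UseInfo { long MinOffset, MaxOffset; };
+
+static std::pair<size_t, long>
+getOrCreateUse(std::map<UseKey, size_t> &UseMap, std::vector<UseInfo> &Uses,
+               UseKey K, long Offset) {
+  auto Ins = UseMap.insert({K, Uses.size()});
+  if (!Ins.second)
+    return {Ins.first->second, Offset}; // share the existing use
+  Uses.push_back({Offset, Offset});     // new use; its offset range starts as
+  return {Uses.size() - 1, Offset};     // the single point [Offset, Offset]
+}
+#endif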
+
+/// Delete the given use from the Uses list.
+void LSRInstance::DeleteUse(LSRUse &LU, size_t LUIdx) {
+ if (&LU != &Uses.back())
+ std::swap(LU, Uses.back());
+ Uses.pop_back();
+
+ // Update RegUses.
+ RegUses.swapAndDropUse(LUIdx, Uses.size());
+}
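+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It shows the swap-and-pop idiom used above: because the order of Uses
+// does not matter, removal is O(1). Assumes <vector> and <utility> are
+// available.
+#if 0
+static void swapAndPop(std::vector<int> &V, size_t Idx) {
+  if (Idx + 1 != V.size())
+    std::swap(V[Idx], V.back()); // the old last element now lives at Idx...
+  V.pop_back();
+  // ...so any saved index pointing at it must be remapped, which is what
+  // RegUses.swapAndDropUse() does for the register-use bookkeeping.
+}
+#endif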
+
+/// Look for a use distinct from OrigLU which has a formula with the same
+/// registers as the given formula.
+LSRUse *
+LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
+ const LSRUse &OrigLU) {
+ // Search all uses for the formula. This could be more clever.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // Check whether this use is close enough to OrigLU, to see whether it's
+ // worthwhile looking through its formulae.
+ // Ignore ICmpZero uses because they may contain formulae generated by
+ // GenerateICmpZeroScales, in which case adding fixup offsets may
+ // be invalid.
+ if (&LU != &OrigLU &&
+ LU.Kind != LSRUse::ICmpZero &&
+ LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy &&
+ LU.WidestFixupType == OrigLU.WidestFixupType &&
+ LU.HasFormulaWithSameRegs(OrigF)) {
+ // Scan through this use's formulae.
+ for (const Formula &F : LU.Formulae) {
+ // Check to see if this formula has the same registers and symbols
+ // as OrigF.
+ if (F.BaseRegs == OrigF.BaseRegs &&
+ F.ScaledReg == OrigF.ScaledReg &&
+ F.BaseGV == OrigF.BaseGV &&
+ F.Scale == OrigF.Scale &&
+ F.UnfoldedOffset == OrigF.UnfoldedOffset) {
+ if (F.BaseOffset == 0)
+ return &LU;
+ // This is the formula where all the registers and symbols matched;
+ // there aren't going to be any others. Since we declined it, we
+ // can skip the rest of the formulae and proceed to the next LSRUse.
+ break;
+ }
+ }
+ }
+ }
+
+ // Nothing looked good.
+ return nullptr;
+}
+
+void LSRInstance::CollectInterestingTypesAndFactors() {
+ SmallSetVector<const SCEV *, 4> Strides;
+
+ // Collect interesting types and strides.
+ SmallVector<const SCEV *, 4> Worklist;
+ for (const IVStrideUse &U : IU) {
+ const SCEV *Expr = IU.getExpr(U);
+
+ // Collect interesting types.
+ Types.insert(SE.getEffectiveSCEVType(Expr->getType()));
+
+ // Add strides for mentioned loops.
+ Worklist.push_back(Expr);
+ do {
+ const SCEV *S = Worklist.pop_back_val();
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (AR->getLoop() == L)
+ Strides.insert(AR->getStepRecurrence(SE));
+ Worklist.push_back(AR->getStart());
+ } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ Worklist.append(Add->op_begin(), Add->op_end());
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Compute interesting factors from the set of interesting strides.
+ for (SmallSetVector<const SCEV *, 4>::const_iterator
+ I = Strides.begin(), E = Strides.end(); I != E; ++I)
+ for (SmallSetVector<const SCEV *, 4>::const_iterator NewStrideIter =
+ std::next(I); NewStrideIter != E; ++NewStrideIter) {
+ const SCEV *OldStride = *I;
+ const SCEV *NewStride = *NewStrideIter;
+
+ if (SE.getTypeSizeInBits(OldStride->getType()) !=
+ SE.getTypeSizeInBits(NewStride->getType())) {
+ if (SE.getTypeSizeInBits(OldStride->getType()) >
+ SE.getTypeSizeInBits(NewStride->getType()))
+ NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType());
+ else
+ OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType());
+ }
+ if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(NewStride, OldStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ } else if (const SCEVConstant *Factor =
+ dyn_cast_or_null<SCEVConstant>(getExactSDiv(OldStride,
+ NewStride,
+ SE, true))) {
+ if (Factor->getAPInt().getMinSignedBits() <= 64)
+ Factors.insert(Factor->getAPInt().getSExtValue());
+ }
+ }
+
+ // If all uses use the same type, don't bother looking for truncation-based
+ // reuse.
+ if (Types.size() == 1)
+ Types.clear();
+
+ LLVM_DEBUG(print_factors_and_types(dbgs()));
+}
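+
+// Editor's note: an illustrative sketch added for this document, not part of
+// LSR. It specializes the factor collection above to constant strides; the
+// real code works on SCEVs, sign-extends mismatched widths, and uses
+// getExactSDiv(). Assumes <set> and <cstdint> are available.
+#if 0
+static std::set<int64_t> collectFactors(const std::set<int64_t> &Strides) {
+  std::set<int64_t> Factors;
+  for (int64_t Old : Strides)
+    for (int64_t New : Strides) {
+      if (Old == New || Old == 0 || New == 0)
+        continue;
+      if (New % Old == 0)
+        Factors.insert(New / Old); // e.g. strides {4, 8, 12} yield {2, 3}
+      else if (Old % New == 0)
+        Factors.insert(Old / New);
+    }
+  return Factors;
+}
+#endif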
+
+/// Helper for CollectChains that finds an IV operand (computed by an AddRec in
+/// this loop) within [OI,OE) or returns OE. If IVUsers mapped Instructions to
+/// IVStrideUses, we could partially skip this.
+static User::op_iterator
+findIVOperand(User::op_iterator OI, User::op_iterator OE,
+ Loop *L, ScalarEvolution &SE) {
+ for(; OI != OE; ++OI) {
+ if (Instruction *Oper = dyn_cast<Instruction>(*OI)) {
+ if (!SE.isSCEVable(Oper->getType()))
+ continue;
+
+ if (const SCEVAddRecExpr *AR =
+ dyn_cast<SCEVAddRecExpr>(SE.getSCEV(Oper))) {
+ if (AR->getLoop() == L)
+ break;
+ }
+ }
+ }
+ return OI;
+}
+
+/// IVChain logic must consistently peek base TruncInst operands, so wrap it in
+/// a convenient helper.
+static Value *getWideOperand(Value *Oper) {
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Oper))
+ return Trunc->getOperand(0);
+ return Oper;
+}
+
+/// Return true if we allow an IV chain to include both types.
+static bool isCompatibleIVType(Value *LVal, Value *RVal) {
+ Type *LType = LVal->getType();
+ Type *RType = RVal->getType();
+ return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+ // Different address spaces means (possibly)
+ // different types of the pointer implementation,
+ // e.g. i16 vs i32 so disallow that.
+ (LType->getPointerAddressSpace() ==
+ RType->getPointerAddressSpace()));
+}
+
+/// Return an approximation of this SCEV expression's "base", or NULL for any
+/// constant. Returning the expression itself is conservative. Returning a
+/// deeper subexpression is more precise and valid as long as it isn't less
+/// complex than another subexpression. For expressions involving multiple
+/// unscaled values, we need to return the pointer-type SCEVUnknown. This avoids
+/// forming chains across objects, such as: PrevOper==a[i], IVOper==b[i],
+/// IVInc==b-a.
+///
+/// Since SCEVUnknown is the rightmost type, and pointers are the rightmost
+/// SCEVUnknown, we simply return the rightmost SCEV operand.
+static const SCEV *getExprBase(const SCEV *S) {
+ switch (S->getSCEVType()) {
+  default: // including scUnknown.
+ return S;
+ case scConstant:
+ return nullptr;
+ case scTruncate:
+ return getExprBase(cast<SCEVTruncateExpr>(S)->getOperand());
+ case scZeroExtend:
+ return getExprBase(cast<SCEVZeroExtendExpr>(S)->getOperand());
+ case scSignExtend:
+ return getExprBase(cast<SCEVSignExtendExpr>(S)->getOperand());
+ case scAddExpr: {
+ // Skip over scaled operands (scMulExpr) to follow add operands as long as
+ // there's nothing more complex.
+ // FIXME: not sure if we want to recognize negation.
+ const SCEVAddExpr *Add = cast<SCEVAddExpr>(S);
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(Add->op_end()),
+ E(Add->op_begin()); I != E; ++I) {
+ const SCEV *SubExpr = *I;
+ if (SubExpr->getSCEVType() == scAddExpr)
+ return getExprBase(SubExpr);
+
+ if (SubExpr->getSCEVType() != scMulExpr)
+ return SubExpr;
+ }
+ return S; // all operands are scaled, be conservative.
+ }
+ case scAddRecExpr:
+ return getExprBase(cast<SCEVAddRecExpr>(S)->getStart());
+ }
llvm_unreachable("Unknown SCEV kind!");
-}
-
-/// Return true if the chain increment is profitable to expand into a loop
-/// invariant value, which may require its own register. A profitable chain
-/// increment will be an offset relative to the same base. We allow such offsets
-/// to potentially be used as a chain increment as long as it's not obviously
-/// expensive to expand using real instructions.
-bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
- const SCEV *IncExpr,
- ScalarEvolution &SE) {
- // Aggressively form chains when -stress-ivchain.
- if (StressIVChain)
- return true;
-
- // Do not replace a constant offset from IV head with a nonconstant IV
- // increment.
- if (!isa<SCEVConstant>(IncExpr)) {
- const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
- if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
- return false;
- }
-
- SmallPtrSet<const SCEV*, 8> Processed;
- return !isHighCostExpansion(IncExpr, Processed, SE);
-}
-
-/// Return true if the number of registers needed for the chain is estimated to
-/// be less than the number required for the individual IV users. First prohibit
-/// any IV users that keep the IV live across increments (the Users set should
-/// be empty). Next count the number and type of increments in the chain.
-///
-/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
-/// effectively use postinc addressing modes. Only consider it profitable if the
-/// increments can be computed in fewer registers when chained.
-///
-/// TODO: Consider IVInc free if it's already used in other chains.
-static bool isProfitableChain(IVChain &Chain,
- SmallPtrSetImpl<Instruction *> &Users,
- ScalarEvolution &SE,
- const TargetTransformInfo &TTI) {
- if (StressIVChain)
- return true;
-
- if (!Chain.hasIncs())
- return false;
-
- if (!Users.empty()) {
- LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
- for (Instruction *Inst
- : Users) { dbgs() << " " << *Inst << "\n"; });
- return false;
- }
- assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
-
-  // The chain itself may require a register, so initialize cost to 1.
- int cost = 1;
-
- // A complete chain likely eliminates the need for keeping the original IV in
- // a register. LSR does not currently know how to form a complete chain unless
- // the header phi already exists.
- if (isa<PHINode>(Chain.tailUserInst())
- && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
- --cost;
- }
- const SCEV *LastIncExpr = nullptr;
- unsigned NumConstIncrements = 0;
- unsigned NumVarIncrements = 0;
- unsigned NumReusedIncrements = 0;
-
- if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
- return true;
-
- for (const IVInc &Inc : Chain) {
- if (TTI.isProfitableLSRChainElement(Inc.UserInst))
- return true;
- if (Inc.IncExpr->isZero())
- continue;
-
- // Incrementing by zero or some constant is neutral. We assume constants can
- // be folded into an addressing mode or an add's immediate operand.
- if (isa<SCEVConstant>(Inc.IncExpr)) {
- ++NumConstIncrements;
- continue;
- }
-
- if (Inc.IncExpr == LastIncExpr)
- ++NumReusedIncrements;
- else
- ++NumVarIncrements;
-
- LastIncExpr = Inc.IncExpr;
- }
- // An IV chain with a single increment is handled by LSR's postinc
- // uses. However, a chain with multiple increments requires keeping the IV's
- // value live longer than it needs to be if chained.
- if (NumConstIncrements > 1)
- --cost;
-
- // Materializing increment expressions in the preheader that didn't exist in
- // the original code may cost a register. For example, sign-extended array
- // indices can produce ridiculous increments like this:
- // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
- cost += NumVarIncrements;
-
- // Reusing variable increments likely saves a register to hold the multiple of
- // the stride.
- cost -= NumReusedIncrements;
-
- LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
- << "\n");
-
- return cost < 0;
-}
-
-/// Add this IV user to an existing chain or make it the head of a new chain.
-void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
- SmallVectorImpl<ChainUsers> &ChainUsersVec) {
- // When IVs are used as types of varying widths, they are generally converted
- // to a wider type with some uses remaining narrow under a (free) trunc.
- Value *const NextIV = getWideOperand(IVOper);
- const SCEV *const OperExpr = SE.getSCEV(NextIV);
- const SCEV *const OperExprBase = getExprBase(OperExpr);
-
- // Visit all existing chains. Check if its IVOper can be computed as a
- // profitable loop invariant increment from the last link in the Chain.
- unsigned ChainIdx = 0, NChains = IVChainVec.size();
- const SCEV *LastIncExpr = nullptr;
- for (; ChainIdx < NChains; ++ChainIdx) {
- IVChain &Chain = IVChainVec[ChainIdx];
-
- // Prune the solution space aggressively by checking that both IV operands
- // are expressions that operate on the same unscaled SCEVUnknown. This
- // "base" will be canceled by the subsequent getMinusSCEV call. Checking
- // first avoids creating extra SCEV expressions.
- if (!StressIVChain && Chain.ExprBase != OperExprBase)
- continue;
-
- Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
- if (!isCompatibleIVType(PrevIV, NextIV))
- continue;
-
- // A phi node terminates a chain.
- if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
- continue;
-
- // The increment must be loop-invariant so it can be kept in a register.
- const SCEV *PrevExpr = SE.getSCEV(PrevIV);
- const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
- if (!SE.isLoopInvariant(IncExpr, L))
- continue;
-
- if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
- LastIncExpr = IncExpr;
- break;
- }
- }
- // If we haven't found a chain, create a new one, unless we hit the max. Don't
- // bother for phi nodes, because they must be last in the chain.
- if (ChainIdx == NChains) {
- if (isa<PHINode>(UserInst))
- return;
- if (NChains >= MaxChains && !StressIVChain) {
- LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
- return;
- }
- LastIncExpr = OperExpr;
- // IVUsers may have skipped over sign/zero extensions. We don't currently
- // attempt to form chains involving extensions unless they can be hoisted
- // into this loop's AddRec.
- if (!isa<SCEVAddRecExpr>(LastIncExpr))
- return;
- ++NChains;
- IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
- OperExprBase));
- ChainUsersVec.resize(NChains);
- LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
- << ") IV=" << *LastIncExpr << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
- << ") IV+" << *LastIncExpr << "\n");
- // Add this IV user to the end of the chain.
- IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
- }
- IVChain &Chain = IVChainVec[ChainIdx];
-
- SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
- // This chain's NearUsers become FarUsers.
- if (!LastIncExpr->isZero()) {
- ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
- NearUsers.end());
- NearUsers.clear();
- }
-
- // All other uses of IVOperand become near uses of the chain.
- // We currently ignore intermediate values within SCEV expressions, assuming
-  // they will eventually be used by the current chain, or can be computed
-  // from one of the chain increments. To be more precise we could
-  // transitively follow their users and only add leaf IV users to the set.
- for (User *U : IVOper->users()) {
- Instruction *OtherUse = dyn_cast<Instruction>(U);
- if (!OtherUse)
- continue;
- // Uses in the chain will no longer be uses if the chain is formed.
- // Include the head of the chain in this iteration (not Chain.begin()).
- IVChain::const_iterator IncIter = Chain.Incs.begin();
- IVChain::const_iterator IncEnd = Chain.Incs.end();
- for( ; IncIter != IncEnd; ++IncIter) {
- if (IncIter->UserInst == OtherUse)
- break;
- }
- if (IncIter != IncEnd)
- continue;
-
- if (SE.isSCEVable(OtherUse->getType())
- && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
- && IU.isIVUserOrOperand(OtherUse)) {
- continue;
- }
- NearUsers.insert(OtherUse);
- }
-
- // Since this user is part of the chain, it's no longer considered a use
- // of the chain.
- ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
-}
-
-/// Populate the vector of Chains.
-///
-/// This decreases ILP at the architecture level. Targets with ample registers,
-/// multiple memory ports, and no register renaming probably don't want
-/// this. However, such targets should probably disable LSR altogether.
-///
-/// The job of LSR is to make a reasonable choice of induction variables across
-/// the loop. Subsequent passes can easily "unchain" computation exposing more
-/// ILP *within the loop* if the target wants it.
-///
-/// Finding the best IV chain is potentially a scheduling problem. Since LSR
-/// will not reorder memory operations, it will recognize this as a chain, but
-/// will generate redundant IV increments. Ideally this would be corrected later
-/// by a smart scheduler:
-/// = A[i]
-/// = A[i+x]
-/// A[i] =
-/// A[i+x] =
-///
-/// TODO: Walk the entire domtree within this loop, not just the path to the
-/// loop latch. This will discover chains on side paths, but requires
-/// maintaining multiple copies of the Chains state.
-void LSRInstance::CollectChains() {
- LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
- SmallVector<ChainUsers, 8> ChainUsersVec;
-
- SmallVector<BasicBlock *,8> LatchPath;
- BasicBlock *LoopHeader = L->getHeader();
- for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
- Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
- LatchPath.push_back(Rung->getBlock());
- }
- LatchPath.push_back(LoopHeader);
-
- // Walk the instruction stream from the loop header to the loop latch.
- for (BasicBlock *BB : reverse(LatchPath)) {
- for (Instruction &I : *BB) {
- // Skip instructions that weren't seen by IVUsers analysis.
- if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
- continue;
-
- // Ignore users that are part of a SCEV expression. This way we only
- // consider leaf IV Users. This effectively rediscovers a portion of
- // IVUsers analysis but in program order this time.
- if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
- continue;
-
- // Remove this instruction from any NearUsers set it may be in.
- for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
- ChainIdx < NChains; ++ChainIdx) {
- ChainUsersVec[ChainIdx].NearUsers.erase(&I);
- }
- // Search for operands that can be chained.
- SmallPtrSet<Instruction*, 4> UniqueOperands;
- User::op_iterator IVOpEnd = I.op_end();
- User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
- while (IVOpIter != IVOpEnd) {
- Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
- if (UniqueOperands.insert(IVOpInst).second)
- ChainInstruction(&I, IVOpInst, ChainUsersVec);
- IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
- }
- } // Continue walking down the instructions.
- } // Continue walking down the domtree.
- // Visit phi backedges to determine if the chain can generate the IV postinc.
- for (PHINode &PN : L->getHeader()->phis()) {
- if (!SE.isSCEVable(PN.getType()))
- continue;
-
- Instruction *IncV =
- dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
- if (IncV)
- ChainInstruction(&PN, IncV, ChainUsersVec);
- }
- // Remove any unprofitable chains.
- unsigned ChainIdx = 0;
- for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
- UsersIdx < NChains; ++UsersIdx) {
- if (!isProfitableChain(IVChainVec[UsersIdx],
- ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
- continue;
-    // Preserve the chain at UsersIdx.
- if (ChainIdx != UsersIdx)
- IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
- FinalizeChain(IVChainVec[ChainIdx]);
- ++ChainIdx;
- }
- IVChainVec.resize(ChainIdx);
-}
-
-void LSRInstance::FinalizeChain(IVChain &Chain) {
- assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
- LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
-
- for (const IVInc &Inc : Chain) {
- LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
- auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
- assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
- IVIncSet.insert(UseI);
- }
-}
-
-/// Return true if the IVInc can be folded into an addressing mode.
-static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
- Value *Operand, const TargetTransformInfo &TTI) {
- const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
- if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
- return false;
-
- if (IncConst->getAPInt().getMinSignedBits() > 64)
- return false;
-
- MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
- int64_t IncOffset = IncConst->getValue()->getSExtValue();
- if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
- IncOffset, /*HasBaseReg=*/false))
- return false;
-
- return true;
-}
-
-/// Generate an add or subtract for each IVInc in a chain to materialize the IV
-/// user's operand from the previous IV user's operand.
-void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
- // Find the new IVOperand for the head of the chain. It may have been replaced
- // by LSR.
- const IVInc &Head = Chain.Incs[0];
- User::op_iterator IVOpEnd = Head.UserInst->op_end();
- // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
- User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
- IVOpEnd, L, SE);
- Value *IVSrc = nullptr;
- while (IVOpIter != IVOpEnd) {
- IVSrc = getWideOperand(*IVOpIter);
-
- // If this operand computes the expression that the chain needs, we may use
- // it. (Check this after setting IVSrc which is used below.)
- //
- // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
- // narrow for the chain, so we can no longer use it. We do allow using a
- // wider phi, assuming the LSR checked for free truncation. In that case we
- // should already have a truncate on this operand such that
- // getSCEV(IVSrc) == IncExpr.
- if (SE.getSCEV(*IVOpIter) == Head.IncExpr
- || SE.getSCEV(IVSrc) == Head.IncExpr) {
- break;
- }
- IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
- }
- if (IVOpIter == IVOpEnd) {
- // Gracefully give up on this chain.
- LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
- return;
- }
- assert(IVSrc && "Failed to find IV chain source");
-
- LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
- Type *IVTy = IVSrc->getType();
- Type *IntTy = SE.getEffectiveSCEVType(IVTy);
- const SCEV *LeftOverExpr = nullptr;
- for (const IVInc &Inc : Chain) {
- Instruction *InsertPt = Inc.UserInst;
- if (isa<PHINode>(InsertPt))
- InsertPt = L->getLoopLatch()->getTerminator();
-
- // IVOper will replace the current IV User's operand. IVSrc is the IV
- // value currently held in a register.
- Value *IVOper = IVSrc;
- if (!Inc.IncExpr->isZero()) {
- // IncExpr was the result of subtraction of two narrow values, so must
- // be signed.
- const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
- LeftOverExpr = LeftOverExpr ?
- SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
- }
- if (LeftOverExpr && !LeftOverExpr->isZero()) {
- // Expand the IV increment.
- Rewriter.clearPostInc();
- Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
- const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
- SE.getUnknown(IncV));
- IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
-
- // If an IV increment can't be folded, use it as the next IV value.
- if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
- assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
- IVSrc = IVOper;
- LeftOverExpr = nullptr;
- }
- }
- Type *OperTy = Inc.IVOperand->getType();
- if (IVTy != OperTy) {
- assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
- "cannot extend a chained IV");
- IRBuilder<> Builder(InsertPt);
- IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
- }
- Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
- if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
- DeadInsts.emplace_back(OperandIsInstr);
- }
- // If LSR created a new, wider phi, we may also replace its postinc. We only
- // do this if we also found a wide value for the head of the chain.
- if (isa<PHINode>(Chain.tailUserInst())) {
- for (PHINode &Phi : L->getHeader()->phis()) {
- if (!isCompatibleIVType(&Phi, IVSrc))
- continue;
- Instruction *PostIncV = dyn_cast<Instruction>(
- Phi.getIncomingValueForBlock(L->getLoopLatch()));
- if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
- continue;
- Value *IVOper = IVSrc;
- Type *PostIncTy = PostIncV->getType();
- if (IVTy != PostIncTy) {
- assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
- IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
- Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
- IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
- }
- Phi.replaceUsesOfWith(PostIncV, IVOper);
- DeadInsts.emplace_back(PostIncV);
- }
- }
-}
-
-void LSRInstance::CollectFixupsAndInitialFormulae() {
- BranchInst *ExitBranch = nullptr;
- bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
-
- for (const IVStrideUse &U : IU) {
- Instruction *UserInst = U.getUser();
- // Skip IV users that are part of profitable IV Chains.
- User::op_iterator UseI =
- find(UserInst->operands(), U.getOperandValToReplace());
- assert(UseI != UserInst->op_end() && "cannot find IV operand");
- if (IVIncSet.count(UseI)) {
- LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
- continue;
- }
-
- LSRUse::KindType Kind = LSRUse::Basic;
- MemAccessTy AccessTy;
- if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
- Kind = LSRUse::Address;
- AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
- }
-
- const SCEV *S = IU.getExpr(U);
- PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
-
- // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
- // (N - i == 0), and this allows (N - i) to be the expression that we work
- // with rather than just N or i, so we can consider the register
- // requirements for both N and i at the same time. Limiting this code to
- // equality icmps is not a problem because all interesting loops use
- // equality icmps, thanks to IndVarSimplify.
- if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
- // If CI can be saved in some target, like replaced inside hardware loop
- // in PowerPC, no need to generate initial formulae for it.
- if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
- continue;
- if (CI->isEquality()) {
- // Swap the operands if needed to put the OperandValToReplace on the
- // left, for consistency.
- Value *NV = CI->getOperand(1);
- if (NV == U.getOperandValToReplace()) {
- CI->setOperand(1, CI->getOperand(0));
- CI->setOperand(0, NV);
- NV = CI->getOperand(1);
- Changed = true;
- }
-
- // x == y --> x - y == 0
- const SCEV *N = SE.getSCEV(NV);
- if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
- // S is normalized, so normalize N before folding it into S
- // to keep the result normalized.
- N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
- Kind = LSRUse::ICmpZero;
- S = SE.getMinusSCEV(N, S);
- }
-
- // -1 and the negations of all interesting strides (except the negation
- // of -1) are now also interesting.
- for (size_t i = 0, e = Factors.size(); i != e; ++i)
- if (Factors[i] != -1)
- Factors.insert(-(uint64_t)Factors[i]);
- Factors.insert(-1);
- }
- }
-
- // Get or create an LSRUse.
- std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
- size_t LUIdx = P.first;
- int64_t Offset = P.second;
- LSRUse &LU = Uses[LUIdx];
-
- // Record the fixup.
- LSRFixup &LF = LU.getNewFixup();
- LF.UserInst = UserInst;
- LF.OperandValToReplace = U.getOperandValToReplace();
- LF.PostIncLoops = TmpPostIncLoops;
- LF.Offset = Offset;
- LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
-
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
-
- // If this is the first use of this LSRUse, give it a formula.
- if (LU.Formulae.empty()) {
- InsertInitialFormula(S, LU, LUIdx);
- CountRegisters(LU.Formulae.back(), LUIdx);
- }
- }
-
- LLVM_DEBUG(print_fixups(dbgs()));
-}
-
-/// Insert a formula for the given expression into the given use, separating out
-/// loop-variant portions from loop-invariant and loop-computable portions.
-void
-LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
- // Mark uses whose expressions cannot be expanded.
- if (!isSafeToExpand(S, SE))
- LU.RigidFormula = true;
-
- Formula F;
- F.initialMatch(S, L, SE);
- bool Inserted = InsertFormula(LU, LUIdx, F);
- assert(Inserted && "Initial formula already exists!"); (void)Inserted;
-}
-
-/// Insert a simple single-register formula for the given expression into the
-/// given use.
-void
-LSRInstance::InsertSupplementalFormula(const SCEV *S,
- LSRUse &LU, size_t LUIdx) {
- Formula F;
- F.BaseRegs.push_back(S);
- F.HasBaseReg = true;
- bool Inserted = InsertFormula(LU, LUIdx, F);
- assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
-}
-
-/// Note which registers are used by the given formula, updating RegUses.
-void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
- if (F.ScaledReg)
- RegUses.countRegister(F.ScaledReg, LUIdx);
- for (const SCEV *BaseReg : F.BaseRegs)
- RegUses.countRegister(BaseReg, LUIdx);
-}
-
-/// If the given formula has not yet been inserted, add it to the list, and
-/// return true. Return false otherwise.
-bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
- // Do not insert formula that we will not be able to expand.
- assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
- "Formula is illegal");
-
- if (!LU.InsertFormula(F, *L))
- return false;
-
- CountRegisters(F, LUIdx);
- return true;
-}
-
-/// Check for other uses of loop-invariant values which we're tracking. These
-/// other uses will pin these values in registers, making them less profitable
-/// for elimination.
-/// TODO: This currently misses non-constant addrec step registers.
-/// TODO: Should this give more weight to users inside the loop?
-void
-LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
- SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
- SmallPtrSet<const SCEV *, 32> Visited;
-
- while (!Worklist.empty()) {
- const SCEV *S = Worklist.pop_back_val();
-
- // Don't process the same SCEV twice
- if (!Visited.insert(S).second)
- continue;
-
- if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
- Worklist.append(N->op_begin(), N->op_end());
+}
+
+/// Return true if the chain increment is profitable to expand into a loop
+/// invariant value, which may require its own register. A profitable chain
+/// increment will be an offset relative to the same base. We allow such offsets
+/// to potentially be used as a chain increment as long as it's not obviously
+/// expensive to expand using real instructions.
+bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
+ const SCEV *IncExpr,
+ ScalarEvolution &SE) {
+ // Aggressively form chains when -stress-ivchain.
+ if (StressIVChain)
+ return true;
+
+ // Do not replace a constant offset from IV head with a nonconstant IV
+ // increment.
+ if (!isa<SCEVConstant>(IncExpr)) {
+ const SCEV *HeadExpr = SE.getSCEV(getWideOperand(Incs[0].IVOperand));
+ if (isa<SCEVConstant>(SE.getMinusSCEV(OperExpr, HeadExpr)))
+ return false;
+ }
+
+ SmallPtrSet<const SCEV*, 8> Processed;
+ return !isHighCostExpansion(IncExpr, Processed, SE);
+}
+
+/// Return true if the number of registers needed for the chain is estimated to
+/// be less than the number required for the individual IV users. First prohibit
+/// any IV users that keep the IV live across increments (the Users set should
+/// be empty). Next count the number and type of increments in the chain.
+///
+/// Chaining IVs can lead to considerable code bloat if ISEL doesn't
+/// effectively use postinc addressing modes. Only consider it profitable if the
+/// increments can be computed in fewer registers when chained.
+///
+/// TODO: Consider IVInc free if it's already used in other chains.
+static bool isProfitableChain(IVChain &Chain,
+ SmallPtrSetImpl<Instruction *> &Users,
+ ScalarEvolution &SE,
+ const TargetTransformInfo &TTI) {
+ if (StressIVChain)
+ return true;
+
+ if (!Chain.hasIncs())
+ return false;
+
+ if (!Users.empty()) {
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " users:\n";
+ for (Instruction *Inst
+ : Users) { dbgs() << " " << *Inst << "\n"; });
+ return false;
+ }
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+
+  // The chain itself may require a register, so initialize cost to 1.
+ int cost = 1;
+
+ // A complete chain likely eliminates the need for keeping the original IV in
+ // a register. LSR does not currently know how to form a complete chain unless
+ // the header phi already exists.
+ if (isa<PHINode>(Chain.tailUserInst())
+ && SE.getSCEV(Chain.tailUserInst()) == Chain.Incs[0].IncExpr) {
+ --cost;
+ }
+ const SCEV *LastIncExpr = nullptr;
+ unsigned NumConstIncrements = 0;
+ unsigned NumVarIncrements = 0;
+ unsigned NumReusedIncrements = 0;
+
+ if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
+ return true;
+
+ for (const IVInc &Inc : Chain) {
+ if (TTI.isProfitableLSRChainElement(Inc.UserInst))
+ return true;
+ if (Inc.IncExpr->isZero())
+ continue;
+
+ // Incrementing by zero or some constant is neutral. We assume constants can
+ // be folded into an addressing mode or an add's immediate operand.
+ if (isa<SCEVConstant>(Inc.IncExpr)) {
+ ++NumConstIncrements;
+ continue;
+ }
+
+ if (Inc.IncExpr == LastIncExpr)
+ ++NumReusedIncrements;
+ else
+ ++NumVarIncrements;
+
+ LastIncExpr = Inc.IncExpr;
+ }
+ // An IV chain with a single increment is handled by LSR's postinc
+ // uses. However, a chain with multiple increments requires keeping the IV's
+ // value live longer than it needs to be if chained.
+ if (NumConstIncrements > 1)
+ --cost;
+
+ // Materializing increment expressions in the preheader that didn't exist in
+ // the original code may cost a register. For example, sign-extended array
+ // indices can produce ridiculous increments like this:
+ // IV + ((sext i32 (2 * %s) to i64) + (-1 * (sext i32 %s to i64)))
+ cost += NumVarIncrements;
+
+ // Reusing variable increments likely saves a register to hold the multiple of
+ // the stride.
+ cost -= NumReusedIncrements;
+
+ LLVM_DEBUG(dbgs() << "Chain: " << *Chain.Incs[0].UserInst << " Cost: " << cost
+ << "\n");
+
+ return cost < 0;
+}
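The register accounting above is compact enough to restate on its own. The following is a minimal, standalone sketch of that heuristic; ChainSummary, estimateChainCost and isProfitable are invented names for illustration and are not part of LSRInstance or TTI.

// Standalone sketch of the chain-profitability arithmetic used by
// isProfitableChain: start at one register for the chain itself, credit a
// chain completed by the header phi and folded constant increments, charge
// each distinct variable increment, credit reused increments, and keep the
// chain only when the estimate goes negative.
#include <cassert>

struct ChainSummary {
  bool CompletesHeaderPhi;      // tail user is the loop-header phi of the IV
  unsigned NumConstIncrements;  // increments assumed foldable into addressing
  unsigned NumVarIncrements;    // distinct loop-invariant increment expressions
  unsigned NumReusedIncrements; // increments identical to the previous one
};

static int estimateChainCost(const ChainSummary &C) {
  int Cost = 1;                  // the chain itself may require a register
  if (C.CompletesHeaderPhi)
    --Cost;                      // the original IV register is likely freed
  if (C.NumConstIncrements > 1)
    --Cost;                      // constants fold into postinc/immediate fields
  Cost += C.NumVarIncrements;    // each variable step may need its own register
  Cost -= C.NumReusedIncrements; // reuse saves a multiple of the stride
  return Cost;
}

static bool isProfitable(const ChainSummary &C) {
  return estimateChainCost(C) < 0;
}

int main() {
  // Ends in the header phi, two foldable constants, one reused increment.
  ChainSummary Kept{true, 2, 0, 1};
  assert(estimateChainCost(Kept) == -2 && isProfitable(Kept));
  // Two distinct variable increments likely cost extra registers.
  ChainSummary Rejected{false, 0, 2, 0};
  assert(!isProfitable(Rejected));
  return 0;
}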
+
+/// Add this IV user to an existing chain or make it the head of a new chain.
+void LSRInstance::ChainInstruction(Instruction *UserInst, Instruction *IVOper,
+ SmallVectorImpl<ChainUsers> &ChainUsersVec) {
+ // When IVs are used as types of varying widths, they are generally converted
+ // to a wider type with some uses remaining narrow under a (free) trunc.
+ Value *const NextIV = getWideOperand(IVOper);
+ const SCEV *const OperExpr = SE.getSCEV(NextIV);
+ const SCEV *const OperExprBase = getExprBase(OperExpr);
+
+ // Visit all existing chains. Check if its IVOper can be computed as a
+ // profitable loop invariant increment from the last link in the Chain.
+ unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ const SCEV *LastIncExpr = nullptr;
+ for (; ChainIdx < NChains; ++ChainIdx) {
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ // Prune the solution space aggressively by checking that both IV operands
+ // are expressions that operate on the same unscaled SCEVUnknown. This
+ // "base" will be canceled by the subsequent getMinusSCEV call. Checking
+ // first avoids creating extra SCEV expressions.
+ if (!StressIVChain && Chain.ExprBase != OperExprBase)
+ continue;
+
+ Value *PrevIV = getWideOperand(Chain.Incs.back().IVOperand);
+ if (!isCompatibleIVType(PrevIV, NextIV))
+ continue;
+
+ // A phi node terminates a chain.
+ if (isa<PHINode>(UserInst) && isa<PHINode>(Chain.tailUserInst()))
+ continue;
+
+ // The increment must be loop-invariant so it can be kept in a register.
+ const SCEV *PrevExpr = SE.getSCEV(PrevIV);
+ const SCEV *IncExpr = SE.getMinusSCEV(OperExpr, PrevExpr);
+ if (!SE.isLoopInvariant(IncExpr, L))
+ continue;
+
+ if (Chain.isProfitableIncrement(OperExpr, IncExpr, SE)) {
+ LastIncExpr = IncExpr;
+ break;
+ }
+ }
+ // If we haven't found a chain, create a new one, unless we hit the max. Don't
+ // bother for phi nodes, because they must be last in the chain.
+ if (ChainIdx == NChains) {
+ if (isa<PHINode>(UserInst))
+ return;
+ if (NChains >= MaxChains && !StressIVChain) {
+ LLVM_DEBUG(dbgs() << "IV Chain Limit\n");
+ return;
+ }
+ LastIncExpr = OperExpr;
+ // IVUsers may have skipped over sign/zero extensions. We don't currently
+ // attempt to form chains involving extensions unless they can be hoisted
+ // into this loop's AddRec.
+ if (!isa<SCEVAddRecExpr>(LastIncExpr))
+ return;
+ ++NChains;
+ IVChainVec.push_back(IVChain(IVInc(UserInst, IVOper, LastIncExpr),
+ OperExprBase));
+ ChainUsersVec.resize(NChains);
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Head: (" << *UserInst
+ << ") IV=" << *LastIncExpr << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "IV Chain#" << ChainIdx << " Inc: (" << *UserInst
+ << ") IV+" << *LastIncExpr << "\n");
+ // Add this IV user to the end of the chain.
+ IVChainVec[ChainIdx].add(IVInc(UserInst, IVOper, LastIncExpr));
+ }
+ IVChain &Chain = IVChainVec[ChainIdx];
+
+ SmallPtrSet<Instruction*,4> &NearUsers = ChainUsersVec[ChainIdx].NearUsers;
+ // This chain's NearUsers become FarUsers.
+ if (!LastIncExpr->isZero()) {
+ ChainUsersVec[ChainIdx].FarUsers.insert(NearUsers.begin(),
+ NearUsers.end());
+ NearUsers.clear();
+ }
+
+ // All other uses of IVOperand become near uses of the chain.
+ // We currently ignore intermediate values within SCEV expressions, assuming
+  // they will eventually be used by the current chain, or can be computed
+ // from one of the chain increments. To be more precise we could
+ // transitively follow its user and only add leaf IV users to the set.
+ for (User *U : IVOper->users()) {
+ Instruction *OtherUse = dyn_cast<Instruction>(U);
+ if (!OtherUse)
+ continue;
+ // Uses in the chain will no longer be uses if the chain is formed.
+ // Include the head of the chain in this iteration (not Chain.begin()).
+ IVChain::const_iterator IncIter = Chain.Incs.begin();
+ IVChain::const_iterator IncEnd = Chain.Incs.end();
+ for( ; IncIter != IncEnd; ++IncIter) {
+ if (IncIter->UserInst == OtherUse)
+ break;
+ }
+ if (IncIter != IncEnd)
+ continue;
+
+ if (SE.isSCEVable(OtherUse->getType())
+ && !isa<SCEVUnknown>(SE.getSCEV(OtherUse))
+ && IU.isIVUserOrOperand(OtherUse)) {
+ continue;
+ }
+ NearUsers.insert(OtherUse);
+ }
+
+ // Since this user is part of the chain, it's no longer considered a use
+ // of the chain.
+ ChainUsersVec[ChainIdx].FarUsers.erase(UserInst);
+}
+
+/// Populate the vector of Chains.
+///
+/// This decreases ILP at the architecture level. Targets with ample registers,
+/// multiple memory ports, and no register renaming probably don't want
+/// this. However, such targets should probably disable LSR altogether.
+///
+/// The job of LSR is to make a reasonable choice of induction variables across
+/// the loop. Subsequent passes can easily "unchain" computation exposing more
+/// ILP *within the loop* if the target wants it.
+///
+/// Finding the best IV chain is potentially a scheduling problem. Since LSR
+/// will not reorder memory operations, it will recognize this as a chain, but
+/// will generate redundant IV increments. Ideally this would be corrected later
+/// by a smart scheduler:
+/// = A[i]
+/// = A[i+x]
+/// A[i] =
+/// A[i+x] =
+///
+/// TODO: Walk the entire domtree within this loop, not just the path to the
+/// loop latch. This will discover chains on side paths, but requires
+/// maintaining multiple copies of the Chains state.
+void LSRInstance::CollectChains() {
+ LLVM_DEBUG(dbgs() << "Collecting IV Chains.\n");
+ SmallVector<ChainUsers, 8> ChainUsersVec;
+
+ SmallVector<BasicBlock *,8> LatchPath;
+ BasicBlock *LoopHeader = L->getHeader();
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopLatch());
+ Rung->getBlock() != LoopHeader; Rung = Rung->getIDom()) {
+ LatchPath.push_back(Rung->getBlock());
+ }
+ LatchPath.push_back(LoopHeader);
+
+ // Walk the instruction stream from the loop header to the loop latch.
+ for (BasicBlock *BB : reverse(LatchPath)) {
+ for (Instruction &I : *BB) {
+ // Skip instructions that weren't seen by IVUsers analysis.
+ if (isa<PHINode>(I) || !IU.isIVUserOrOperand(&I))
+ continue;
+
+ // Ignore users that are part of a SCEV expression. This way we only
+ // consider leaf IV Users. This effectively rediscovers a portion of
+ // IVUsers analysis but in program order this time.
+ if (SE.isSCEVable(I.getType()) && !isa<SCEVUnknown>(SE.getSCEV(&I)))
+ continue;
+
+ // Remove this instruction from any NearUsers set it may be in.
+ for (unsigned ChainIdx = 0, NChains = IVChainVec.size();
+ ChainIdx < NChains; ++ChainIdx) {
+ ChainUsersVec[ChainIdx].NearUsers.erase(&I);
+ }
+ // Search for operands that can be chained.
+ SmallPtrSet<Instruction*, 4> UniqueOperands;
+ User::op_iterator IVOpEnd = I.op_end();
+ User::op_iterator IVOpIter = findIVOperand(I.op_begin(), IVOpEnd, L, SE);
+ while (IVOpIter != IVOpEnd) {
+ Instruction *IVOpInst = cast<Instruction>(*IVOpIter);
+ if (UniqueOperands.insert(IVOpInst).second)
+ ChainInstruction(&I, IVOpInst, ChainUsersVec);
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ } // Continue walking down the instructions.
+ } // Continue walking down the domtree.
+ // Visit phi backedges to determine if the chain can generate the IV postinc.
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
+ Instruction *IncV =
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(L->getLoopLatch()));
+ if (IncV)
+ ChainInstruction(&PN, IncV, ChainUsersVec);
+ }
+ // Remove any unprofitable chains.
+ unsigned ChainIdx = 0;
+ for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
+ UsersIdx < NChains; ++UsersIdx) {
+ if (!isProfitableChain(IVChainVec[UsersIdx],
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
+ continue;
+    // Preserve the chain at UsersIdx.
+ if (ChainIdx != UsersIdx)
+ IVChainVec[ChainIdx] = IVChainVec[UsersIdx];
+ FinalizeChain(IVChainVec[ChainIdx]);
+ ++ChainIdx;
+ }
+ IVChainVec.resize(ChainIdx);
+}
+
+void LSRInstance::FinalizeChain(IVChain &Chain) {
+ assert(!Chain.Incs.empty() && "empty IV chains are not allowed");
+ LLVM_DEBUG(dbgs() << "Final Chain: " << *Chain.Incs[0].UserInst << "\n");
+
+ for (const IVInc &Inc : Chain) {
+ LLVM_DEBUG(dbgs() << " Inc: " << *Inc.UserInst << "\n");
+ auto UseI = find(Inc.UserInst->operands(), Inc.IVOperand);
+ assert(UseI != Inc.UserInst->op_end() && "cannot find IV operand");
+ IVIncSet.insert(UseI);
+ }
+}
+
+/// Return true if the IVInc can be folded into an addressing mode.
+static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
+ Value *Operand, const TargetTransformInfo &TTI) {
+ const SCEVConstant *IncConst = dyn_cast<SCEVConstant>(IncExpr);
+ if (!IncConst || !isAddressUse(TTI, UserInst, Operand))
+ return false;
+
+ if (IncConst->getAPInt().getMinSignedBits() > 64)
+ return false;
+
+ MemAccessTy AccessTy = getAccessType(TTI, UserInst, Operand);
+ int64_t IncOffset = IncConst->getValue()->getSExtValue();
+ if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+ IncOffset, /*HasBaseReg=*/false))
+ return false;
+
+ return true;
+}
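Whether a constant increment folds is decided entirely by the target through isAlwaysFoldable and TTI; the snippet below only illustrates the shape of that question. The [-255, 255] immediate range and the helper name are invented for the example and do not describe any real target.

// Toy model of the question canFoldIVIncExpr delegates to TTI: does this
// constant increment fit the addressing mode's immediate field?
#include <cassert>
#include <cstdint>
#include <optional>

static bool fitsToyImmediateField(std::optional<int64_t> IncConst) {
  if (!IncConst)                 // non-constant increments never fold
    return false;
  return *IncConst >= -255 && *IncConst <= 255; // invented example range
}

int main() {
  assert(fitsToyImmediateField(8));             // small stride: folds
  assert(!fitsToyImmediateField(4096));         // large stride: needs an add
  assert(!fitsToyImmediateField(std::nullopt)); // variable stride: never folds
  return 0;
}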
+
+/// Generate an add or subtract for each IVInc in a chain to materialize the IV
+/// user's operand from the previous IV user's operand.
+void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
+ // Find the new IVOperand for the head of the chain. It may have been replaced
+ // by LSR.
+ const IVInc &Head = Chain.Incs[0];
+ User::op_iterator IVOpEnd = Head.UserInst->op_end();
+ // findIVOperand returns IVOpEnd if it can no longer find a valid IV user.
+ User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(),
+ IVOpEnd, L, SE);
+ Value *IVSrc = nullptr;
+ while (IVOpIter != IVOpEnd) {
+ IVSrc = getWideOperand(*IVOpIter);
+
+ // If this operand computes the expression that the chain needs, we may use
+ // it. (Check this after setting IVSrc which is used below.)
+ //
+ // Note that if Head.IncExpr is wider than IVSrc, then this phi is too
+ // narrow for the chain, so we can no longer use it. We do allow using a
+ // wider phi, assuming the LSR checked for free truncation. In that case we
+ // should already have a truncate on this operand such that
+ // getSCEV(IVSrc) == IncExpr.
+ if (SE.getSCEV(*IVOpIter) == Head.IncExpr
+ || SE.getSCEV(IVSrc) == Head.IncExpr) {
+ break;
+ }
+ IVOpIter = findIVOperand(std::next(IVOpIter), IVOpEnd, L, SE);
+ }
+ if (IVOpIter == IVOpEnd) {
+ // Gracefully give up on this chain.
+ LLVM_DEBUG(dbgs() << "Concealed chain head: " << *Head.UserInst << "\n");
+ return;
+ }
+ assert(IVSrc && "Failed to find IV chain source");
+
+ LLVM_DEBUG(dbgs() << "Generate chain at: " << *IVSrc << "\n");
+ Type *IVTy = IVSrc->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(IVTy);
+ const SCEV *LeftOverExpr = nullptr;
+ for (const IVInc &Inc : Chain) {
+ Instruction *InsertPt = Inc.UserInst;
+ if (isa<PHINode>(InsertPt))
+ InsertPt = L->getLoopLatch()->getTerminator();
+
+ // IVOper will replace the current IV User's operand. IVSrc is the IV
+ // value currently held in a register.
+ Value *IVOper = IVSrc;
+ if (!Inc.IncExpr->isZero()) {
+ // IncExpr was the result of subtraction of two narrow values, so must
+ // be signed.
+ const SCEV *IncExpr = SE.getNoopOrSignExtend(Inc.IncExpr, IntTy);
+ LeftOverExpr = LeftOverExpr ?
+ SE.getAddExpr(LeftOverExpr, IncExpr) : IncExpr;
+ }
+ if (LeftOverExpr && !LeftOverExpr->isZero()) {
+ // Expand the IV increment.
+ Rewriter.clearPostInc();
+ Value *IncV = Rewriter.expandCodeFor(LeftOverExpr, IntTy, InsertPt);
+ const SCEV *IVOperExpr = SE.getAddExpr(SE.getUnknown(IVSrc),
+ SE.getUnknown(IncV));
+ IVOper = Rewriter.expandCodeFor(IVOperExpr, IVTy, InsertPt);
+
+ // If an IV increment can't be folded, use it as the next IV value.
+ if (!canFoldIVIncExpr(LeftOverExpr, Inc.UserInst, Inc.IVOperand, TTI)) {
+ assert(IVTy == IVOper->getType() && "inconsistent IV increment type");
+ IVSrc = IVOper;
+ LeftOverExpr = nullptr;
+ }
+ }
+ Type *OperTy = Inc.IVOperand->getType();
+ if (IVTy != OperTy) {
+ assert(SE.getTypeSizeInBits(IVTy) >= SE.getTypeSizeInBits(OperTy) &&
+ "cannot extend a chained IV");
+ IRBuilder<> Builder(InsertPt);
+ IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
+ }
+ Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
+ DeadInsts.emplace_back(OperandIsInstr);
+ }
+ // If LSR created a new, wider phi, we may also replace its postinc. We only
+ // do this if we also found a wide value for the head of the chain.
+ if (isa<PHINode>(Chain.tailUserInst())) {
+ for (PHINode &Phi : L->getHeader()->phis()) {
+ if (!isCompatibleIVType(&Phi, IVSrc))
+ continue;
+ Instruction *PostIncV = dyn_cast<Instruction>(
+ Phi.getIncomingValueForBlock(L->getLoopLatch()));
+ if (!PostIncV || (SE.getSCEV(PostIncV) != SE.getSCEV(IVSrc)))
+ continue;
+ Value *IVOper = IVSrc;
+ Type *PostIncTy = PostIncV->getType();
+ if (IVTy != PostIncTy) {
+ assert(PostIncTy->isPointerTy() && "mixing int/ptr IV types");
+ IRBuilder<> Builder(L->getLoopLatch()->getTerminator());
+ Builder.SetCurrentDebugLocation(PostIncV->getDebugLoc());
+ IVOper = Builder.CreatePointerCast(IVSrc, PostIncTy, "lsr.chain");
+ }
+ Phi.replaceUsesOfWith(PostIncV, IVOper);
+ DeadInsts.emplace_back(PostIncV);
+ }
+ }
+}
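At the source level, the rewrite performed by GenerateIVChain corresponds to deriving each address from the previous access rather than recomputing it from the induction variable. The sketch below is illustrative C++ only, written by hand to show the two shapes; the function names are made up and the code is not LSR output.

#include <cassert>
#include <cstddef>

// Unchained form: three independent IV users, each address recomputed
// from the induction variable i.
long sum_unchained(const int *A, size_t N) {
  long S = 0;
  for (size_t i = 0; i + 2 < N; i += 3)
    S += A[i] + A[i + 1] + A[i + 2];
  return S;
}

// Chained form: each address is the previous address plus a small constant,
// and a single pointer increment per iteration carries the chain forward,
// which postinc-capable targets can fold into the memory operations.
long sum_chained(const int *A, size_t N) {
  long S = 0;
  const int *P = A;
  for (size_t i = 0; i + 2 < N; i += 3, P += 3) {
    S += P[0]; // head of the chain
    S += P[1]; // +1 relative to the previous access
    S += P[2]; // +1 again
  }
  return S;
}

int main() {
  int Data[12] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  assert(sum_unchained(Data, 12) == sum_chained(Data, 12));
  return 0;
}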
+
+void LSRInstance::CollectFixupsAndInitialFormulae() {
+ BranchInst *ExitBranch = nullptr;
+ bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
+
+ for (const IVStrideUse &U : IU) {
+ Instruction *UserInst = U.getUser();
+ // Skip IV users that are part of profitable IV Chains.
+ User::op_iterator UseI =
+ find(UserInst->operands(), U.getOperandValToReplace());
+ assert(UseI != UserInst->op_end() && "cannot find IV operand");
+ if (IVIncSet.count(UseI)) {
+ LLVM_DEBUG(dbgs() << "Use is in profitable chain: " << **UseI << '\n');
+ continue;
+ }
+
+ LSRUse::KindType Kind = LSRUse::Basic;
+ MemAccessTy AccessTy;
+ if (isAddressUse(TTI, UserInst, U.getOperandValToReplace())) {
+ Kind = LSRUse::Address;
+ AccessTy = getAccessType(TTI, UserInst, U.getOperandValToReplace());
+ }
+
+ const SCEV *S = IU.getExpr(U);
+ PostIncLoopSet TmpPostIncLoops = U.getPostIncLoops();
+
+ // Equality (== and !=) ICmps are special. We can rewrite (i == N) as
+ // (N - i == 0), and this allows (N - i) to be the expression that we work
+ // with rather than just N or i, so we can consider the register
+ // requirements for both N and i at the same time. Limiting this code to
+ // equality icmps is not a problem because all interesting loops use
+ // equality icmps, thanks to IndVarSimplify.
+ if (ICmpInst *CI = dyn_cast<ICmpInst>(UserInst)) {
+ // If CI can be saved in some target, like replaced inside hardware loop
+ // in PowerPC, no need to generate initial formulae for it.
+ if (SaveCmp && CI == dyn_cast<ICmpInst>(ExitBranch->getCondition()))
+ continue;
+ if (CI->isEquality()) {
+ // Swap the operands if needed to put the OperandValToReplace on the
+ // left, for consistency.
+ Value *NV = CI->getOperand(1);
+ if (NV == U.getOperandValToReplace()) {
+ CI->setOperand(1, CI->getOperand(0));
+ CI->setOperand(0, NV);
+ NV = CI->getOperand(1);
+ Changed = true;
+ }
+
+ // x == y --> x - y == 0
+ const SCEV *N = SE.getSCEV(NV);
+ if (SE.isLoopInvariant(N, L) && isSafeToExpand(N, SE)) {
+ // S is normalized, so normalize N before folding it into S
+ // to keep the result normalized.
+ N = normalizeForPostIncUse(N, TmpPostIncLoops, SE);
+ Kind = LSRUse::ICmpZero;
+ S = SE.getMinusSCEV(N, S);
+ }
+
+ // -1 and the negations of all interesting strides (except the negation
+ // of -1) are now also interesting.
+ for (size_t i = 0, e = Factors.size(); i != e; ++i)
+ if (Factors[i] != -1)
+ Factors.insert(-(uint64_t)Factors[i]);
+ Factors.insert(-1);
+ }
+ }
+
+ // Get or create an LSRUse.
+ std::pair<size_t, int64_t> P = getUse(S, Kind, AccessTy);
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+
+ // Record the fixup.
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = UserInst;
+ LF.OperandValToReplace = U.getOperandValToReplace();
+ LF.PostIncLoops = TmpPostIncLoops;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+
+ // If this is the first use of this LSRUse, give it a formula.
+ if (LU.Formulae.empty()) {
+ InsertInitialFormula(S, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), LUIdx);
+ }
+ }
+
+ LLVM_DEBUG(print_fixups(dbgs()));
+}
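The ICmpZero handling above relies on a simple identity: comparing i against a loop-invariant bound N is equivalent to comparing N - i against zero, which lets the exit test share the expression LSR is already strength-reducing instead of keeping both i and N live. A minimal standalone check of that identity, using plain integers rather than SCEVs:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t N = 100;
  for (int64_t i = 0; i <= N; ++i) {
    bool Direct   = (i == N);       // the original exit comparison
    bool IcmpZero = ((N - i) == 0); // the rewritten, zero-compared form
    assert(Direct == IcmpZero);
  }
  return 0;
}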
+
+/// Insert a formula for the given expression into the given use, separating out
+/// loop-variant portions from loop-invariant and loop-computable portions.
+void
+LSRInstance::InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx) {
+ // Mark uses whose expressions cannot be expanded.
+ if (!isSafeToExpand(S, SE))
+ LU.RigidFormula = true;
+
+ Formula F;
+ F.initialMatch(S, L, SE);
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Initial formula already exists!"); (void)Inserted;
+}
+
+/// Insert a simple single-register formula for the given expression into the
+/// given use.
+void
+LSRInstance::InsertSupplementalFormula(const SCEV *S,
+ LSRUse &LU, size_t LUIdx) {
+ Formula F;
+ F.BaseRegs.push_back(S);
+ F.HasBaseReg = true;
+ bool Inserted = InsertFormula(LU, LUIdx, F);
+ assert(Inserted && "Supplemental formula already exists!"); (void)Inserted;
+}
+
+/// Note which registers are used by the given formula, updating RegUses.
+void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) {
+ if (F.ScaledReg)
+ RegUses.countRegister(F.ScaledReg, LUIdx);
+ for (const SCEV *BaseReg : F.BaseRegs)
+ RegUses.countRegister(BaseReg, LUIdx);
+}
+
+/// If the given formula has not yet been inserted, add it to the list, and
+/// return true. Return false otherwise.
+bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
+ // Do not insert formula that we will not be able to expand.
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
+ "Formula is illegal");
+
+ if (!LU.InsertFormula(F, *L))
+ return false;
+
+ CountRegisters(F, LUIdx);
+ return true;
+}
+
+/// Check for other uses of loop-invariant values which we're tracking. These
+/// other uses will pin these values in registers, making them less profitable
+/// for elimination.
+/// TODO: This currently misses non-constant addrec step registers.
+/// TODO: Should this give more weight to users inside the loop?
+void
+LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
+ SmallVector<const SCEV *, 8> Worklist(RegUses.begin(), RegUses.end());
+ SmallPtrSet<const SCEV *, 32> Visited;
+
+ while (!Worklist.empty()) {
+ const SCEV *S = Worklist.pop_back_val();
+
+ // Don't process the same SCEV twice
+ if (!Visited.insert(S).second)
+ continue;
+
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S))
+ Worklist.append(N->op_begin(), N->op_end());
else if (const SCEVIntegralCastExpr *C = dyn_cast<SCEVIntegralCastExpr>(S))
- Worklist.push_back(C->getOperand());
- else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- Worklist.push_back(D->getLHS());
- Worklist.push_back(D->getRHS());
- } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
- const Value *V = US->getValue();
- if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
- // Look for instructions defined outside the loop.
- if (L->contains(Inst)) continue;
- } else if (isa<UndefValue>(V))
- // Undef doesn't have a live range, so it doesn't matter.
- continue;
- for (const Use &U : V->uses()) {
- const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
- // Ignore non-instructions.
- if (!UserInst)
- continue;
- // Ignore instructions in other functions (as can happen with
- // Constants).
- if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
- continue;
- // Ignore instructions not dominated by the loop.
- const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
- UserInst->getParent() :
- cast<PHINode>(UserInst)->getIncomingBlock(
- PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
- if (!DT.dominates(L->getHeader(), UseBB))
- continue;
- // Don't bother if the instruction is in a BB which ends in an EHPad.
- if (UseBB->getTerminator()->isEHPad())
- continue;
- // Don't bother rewriting PHIs in catchswitch blocks.
- if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
- continue;
- // Ignore uses which are part of other SCEV expressions, to avoid
- // analyzing them multiple times.
- if (SE.isSCEVable(UserInst->getType())) {
- const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
- // If the user is a no-op, look through to its uses.
- if (!isa<SCEVUnknown>(UserS))
- continue;
- if (UserS == US) {
- Worklist.push_back(
- SE.getUnknown(const_cast<Instruction *>(UserInst)));
- continue;
- }
- }
- // Ignore icmp instructions which are already being analyzed.
- if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
- unsigned OtherIdx = !U.getOperandNo();
- Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
- if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
- continue;
- }
-
- std::pair<size_t, int64_t> P = getUse(
- S, LSRUse::Basic, MemAccessTy());
- size_t LUIdx = P.first;
- int64_t Offset = P.second;
- LSRUse &LU = Uses[LUIdx];
- LSRFixup &LF = LU.getNewFixup();
- LF.UserInst = const_cast<Instruction *>(UserInst);
- LF.OperandValToReplace = U;
- LF.Offset = Offset;
- LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
- if (!LU.WidestFixupType ||
- SE.getTypeSizeInBits(LU.WidestFixupType) <
- SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
- LU.WidestFixupType = LF.OperandValToReplace->getType();
- InsertSupplementalFormula(US, LU, LUIdx);
- CountRegisters(LU.Formulae.back(), Uses.size() - 1);
- break;
- }
- }
- }
-}
-
-/// Split S into subexpressions which can be pulled out into separate
-/// registers. If C is non-null, multiply each subexpression by C.
-///
-/// Return remainder expression after factoring the subexpressions captured by
-/// Ops. If Ops is complete, return NULL.
-static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
- SmallVectorImpl<const SCEV *> &Ops,
- const Loop *L,
- ScalarEvolution &SE,
- unsigned Depth = 0) {
- // Arbitrarily cap recursion to protect compile time.
- if (Depth >= 3)
- return S;
-
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
- // Break out add operands.
- for (const SCEV *S : Add->operands()) {
- const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
- if (Remainder)
- Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
- }
- return nullptr;
- } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- // Split a non-zero base out of an addrec.
- if (AR->getStart()->isZero() || !AR->isAffine())
- return S;
-
- const SCEV *Remainder = CollectSubexprs(AR->getStart(),
- C, Ops, L, SE, Depth+1);
- // Split the non-zero AddRec unless it is part of a nested recurrence that
- // does not pertain to this loop.
- if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
- Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
- Remainder = nullptr;
- }
- if (Remainder != AR->getStart()) {
- if (!Remainder)
- Remainder = SE.getConstant(AR->getType(), 0);
- return SE.getAddRecExpr(Remainder,
- AR->getStepRecurrence(SE),
- AR->getLoop(),
- //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
- SCEV::FlagAnyWrap);
- }
- } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
- // Break (C * (a + b + c)) into C*a + C*b + C*c.
- if (Mul->getNumOperands() != 2)
- return S;
- if (const SCEVConstant *Op0 =
- dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
- C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
- const SCEV *Remainder =
- CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
- if (Remainder)
- Ops.push_back(SE.getMulExpr(C, Remainder));
- return nullptr;
- }
- }
- return S;
-}
-
-/// Return true if the SCEV represents a value that may end up as a
-/// post-increment operation.
-static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
- LSRUse &LU, const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- if (LU.Kind != LSRUse::Address ||
- !LU.AccessTy.getType()->isIntOrIntVectorTy())
- return false;
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (!AR)
- return false;
- const SCEV *LoopStep = AR->getStepRecurrence(SE);
- if (!isa<SCEVConstant>(LoopStep))
- return false;
- // Check if a post-indexed load/store can be used.
- if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
- TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
- const SCEV *LoopStart = AR->getStart();
- if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
- return true;
- }
- return false;
-}
-
-/// Helper function for LSRInstance::GenerateReassociations.
-void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base,
- unsigned Depth, size_t Idx,
- bool IsScaledReg) {
- const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
- // Don't generate reassociations for the base register of a value that
- // may generate a post-increment operator. The reason is that the
- // reassociations cause extra base+register formula to be created,
- // and possibly chosen, but the post-increment is more efficient.
- if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
- return;
- SmallVector<const SCEV *, 8> AddOps;
- const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
- if (Remainder)
- AddOps.push_back(Remainder);
-
- if (AddOps.size() == 1)
- return;
-
- for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
- JE = AddOps.end();
- J != JE; ++J) {
- // Loop-variant "unknown" values are uninteresting; we won't be able to
- // do anything meaningful with them.
- if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
- continue;
-
- // Don't pull a constant into a register if the constant could be folded
- // into an immediate field.
- if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, *J, Base.getNumRegs() > 1))
- continue;
-
- // Collect all operands except *J.
- SmallVector<const SCEV *, 8> InnerAddOps(
- ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
- InnerAddOps.append(std::next(J),
- ((const SmallVector<const SCEV *, 8> &)AddOps).end());
-
- // Don't leave just a constant behind in a register if the constant could
- // be folded into an immediate field.
- if (InnerAddOps.size() == 1 &&
- isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
- LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
- continue;
-
- const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
- if (InnerSum->isZero())
- continue;
- Formula F = Base;
-
- // Add the remaining pieces of the add back into the new formula.
- const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
- if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- InnerSumSC->getValue()->getZExtValue())) {
- F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
- if (IsScaledReg)
- F.ScaledReg = nullptr;
- else
- F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
- } else if (IsScaledReg)
- F.ScaledReg = InnerSum;
- else
- F.BaseRegs[Idx] = InnerSum;
-
- // Add J as its own register, or an unfolded immediate.
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
- if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
- TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
- SC->getValue()->getZExtValue()))
- F.UnfoldedOffset =
- (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
- else
- F.BaseRegs.push_back(*J);
-    // We may have changed the number of registers in base regs, adjust the
- // formula accordingly.
- F.canonicalize(*L);
-
- if (InsertFormula(LU, LUIdx, F))
- // If that formula hadn't been seen before, recurse to find more like
- // it.
-      // Add a term based on Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2),
-      // because Depth alone is not enough to bound compile time.
-      // This means that every time AddOps.size() exceeds 16^x we add
-      // x to Depth.
- GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
- Depth + 1 + (Log2_32(AddOps.size()) >> 2));
- }
-}
-
-/// Split out subexpressions from adds and the bases of addrecs.
-void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
- Formula Base, unsigned Depth) {
- assert(Base.isCanonical(*L) && "Input must be in the canonical form");
- // Arbitrarily cap recursion to protect compile time.
- if (Depth >= 3)
- return;
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
-
- if (Base.Scale == 1)
- GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
- /* Idx */ -1, /* IsScaledReg */ true);
-}
-
-/// Generate a formula consisting of all of the loop-dominating registers added
-/// into a single register.
-void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // This method is only interesting on a plurality of registers.
- if (Base.BaseRegs.size() + (Base.Scale == 1) +
- (Base.UnfoldedOffset != 0) <= 1)
- return;
-
- // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
- // processing the formula.
- Base.unscale();
- SmallVector<const SCEV *, 4> Ops;
- Formula NewBase = Base;
- NewBase.BaseRegs.clear();
- Type *CombinedIntegerType = nullptr;
- for (const SCEV *BaseReg : Base.BaseRegs) {
- if (SE.properlyDominates(BaseReg, L->getHeader()) &&
- !SE.hasComputableLoopEvolution(BaseReg, L)) {
- if (!CombinedIntegerType)
- CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
- Ops.push_back(BaseReg);
- }
- else
- NewBase.BaseRegs.push_back(BaseReg);
- }
-
- // If no register is relevant, we're done.
- if (Ops.size() == 0)
- return;
-
- // Utility function for generating the required variants of the combined
- // registers.
- auto GenerateFormula = [&](const SCEV *Sum) {
- Formula F = NewBase;
-
- // TODO: If Sum is zero, it probably means ScalarEvolution missed an
- // opportunity to fold something. For now, just ignore such cases
- // rather than proceed with zero in a register.
- if (Sum->isZero())
- return;
-
- F.BaseRegs.push_back(Sum);
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- };
-
- // If we collected at least two registers, generate a formula combining them.
- if (Ops.size() > 1) {
- SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
- GenerateFormula(SE.getAddExpr(OpsCopy));
- }
-
- // If we have an unfolded offset, generate a formula combining it with the
- // registers collected.
- if (NewBase.UnfoldedOffset) {
- assert(CombinedIntegerType && "Missing a type for the unfolded offset");
- Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
- true));
- NewBase.UnfoldedOffset = 0;
- GenerateFormula(SE.getAddExpr(Ops));
- }
-}
-
-/// Helper function for LSRInstance::GenerateSymbolicOffsets.
-void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
- const Formula &Base, size_t Idx,
- bool IsScaledReg) {
- const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
- GlobalValue *GV = ExtractSymbol(G, SE);
- if (G->isZero() || !GV)
- return;
- Formula F = Base;
- F.BaseGV = GV;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
- return;
- if (IsScaledReg)
- F.ScaledReg = G;
- else
- F.BaseRegs[Idx] = G;
- (void)InsertFormula(LU, LUIdx, F);
-}
-
-/// Generate reuse formulae using symbolic offsets.
-void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // We can't add a symbolic offset if the address already contains one.
- if (Base.BaseGV) return;
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
- if (Base.Scale == 1)
- GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
- /* IsScaledReg */ true);
-}
-
-/// Helper function for LSRInstance::GenerateConstantOffsets.
-void LSRInstance::GenerateConstantOffsetsImpl(
- LSRUse &LU, unsigned LUIdx, const Formula &Base,
- const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
-
- auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
- Formula F = Base;
- F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
-
- if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
- LU.AccessTy, F)) {
- // Add the offset to the base register.
- const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
- // If it cancelled out, drop the base register, otherwise update it.
- if (NewG->isZero()) {
- if (IsScaledReg) {
- F.Scale = 0;
- F.ScaledReg = nullptr;
- } else
- F.deleteBaseReg(F.BaseRegs[Idx]);
- F.canonicalize(*L);
- } else if (IsScaledReg)
- F.ScaledReg = NewG;
- else
- F.BaseRegs[Idx] = NewG;
-
- (void)InsertFormula(LU, LUIdx, F);
- }
- };
-
- const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
-
- // With constant offsets and constant steps, we can generate pre-inc
- // accesses by having the offset equal the step. So, for access #0 with a
- // step of 8, we generate a G - 8 base which would require the first access
- // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
-  // for itself and hopefully becomes the base for other accesses. This means
-  // that a single pre-indexed access can be generated to become the new
- // base pointer for each iteration of the loop, resulting in no extra add/sub
- // instructions for pointer updating.
- if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
- if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
- if (auto *StepRec =
- dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
- const APInt &StepInt = StepRec->getAPInt();
- int64_t Step = StepInt.isNegative() ?
- StepInt.getSExtValue() : StepInt.getZExtValue();
-
- for (int64_t Offset : Worklist) {
- Offset -= Step;
- GenerateOffset(G, Offset);
- }
- }
- }
- }
- for (int64_t Offset : Worklist)
- GenerateOffset(G, Offset);
-
- int64_t Imm = ExtractImmediate(G, SE);
- if (G->isZero() || Imm == 0)
- return;
- Formula F = Base;
- F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
- return;
+ Worklist.push_back(C->getOperand());
+ else if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ Worklist.push_back(D->getLHS());
+ Worklist.push_back(D->getRHS());
+ } else if (const SCEVUnknown *US = dyn_cast<SCEVUnknown>(S)) {
+ const Value *V = US->getValue();
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ // Look for instructions defined outside the loop.
+ if (L->contains(Inst)) continue;
+ } else if (isa<UndefValue>(V))
+ // Undef doesn't have a live range, so it doesn't matter.
+ continue;
+ for (const Use &U : V->uses()) {
+ const Instruction *UserInst = dyn_cast<Instruction>(U.getUser());
+ // Ignore non-instructions.
+ if (!UserInst)
+ continue;
+ // Ignore instructions in other functions (as can happen with
+ // Constants).
+ if (UserInst->getParent()->getParent() != L->getHeader()->getParent())
+ continue;
+ // Ignore instructions not dominated by the loop.
+ const BasicBlock *UseBB = !isa<PHINode>(UserInst) ?
+ UserInst->getParent() :
+ cast<PHINode>(UserInst)->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ if (!DT.dominates(L->getHeader(), UseBB))
+ continue;
+ // Don't bother if the instruction is in a BB which ends in an EHPad.
+ if (UseBB->getTerminator()->isEHPad())
+ continue;
+ // Don't bother rewriting PHIs in catchswitch blocks.
+ if (isa<CatchSwitchInst>(UserInst->getParent()->getTerminator()))
+ continue;
+ // Ignore uses which are part of other SCEV expressions, to avoid
+ // analyzing them multiple times.
+ if (SE.isSCEVable(UserInst->getType())) {
+ const SCEV *UserS = SE.getSCEV(const_cast<Instruction *>(UserInst));
+ // If the user is a no-op, look through to its uses.
+ if (!isa<SCEVUnknown>(UserS))
+ continue;
+ if (UserS == US) {
+ Worklist.push_back(
+ SE.getUnknown(const_cast<Instruction *>(UserInst)));
+ continue;
+ }
+ }
+ // Ignore icmp instructions which are already being analyzed.
+ if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) {
+ unsigned OtherIdx = !U.getOperandNo();
+ Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx));
+ if (SE.hasComputableLoopEvolution(SE.getSCEV(OtherOp), L))
+ continue;
+ }
+
+ std::pair<size_t, int64_t> P = getUse(
+ S, LSRUse::Basic, MemAccessTy());
+ size_t LUIdx = P.first;
+ int64_t Offset = P.second;
+ LSRUse &LU = Uses[LUIdx];
+ LSRFixup &LF = LU.getNewFixup();
+ LF.UserInst = const_cast<Instruction *>(UserInst);
+ LF.OperandValToReplace = U;
+ LF.Offset = Offset;
+ LU.AllFixupsOutsideLoop &= LF.isUseFullyOutsideLoop(L);
+ if (!LU.WidestFixupType ||
+ SE.getTypeSizeInBits(LU.WidestFixupType) <
+ SE.getTypeSizeInBits(LF.OperandValToReplace->getType()))
+ LU.WidestFixupType = LF.OperandValToReplace->getType();
+ InsertSupplementalFormula(US, LU, LUIdx);
+ CountRegisters(LU.Formulae.back(), Uses.size() - 1);
+ break;
+ }
+ }
+ }
+}
+
+/// Split S into subexpressions which can be pulled out into separate
+/// registers. If C is non-null, multiply each subexpression by C.
+///
+/// Return remainder expression after factoring the subexpressions captured by
+/// Ops. If Ops is complete, return NULL.
+static const SCEV *CollectSubexprs(const SCEV *S, const SCEVConstant *C,
+ SmallVectorImpl<const SCEV *> &Ops,
+ const Loop *L,
+ ScalarEvolution &SE,
+ unsigned Depth = 0) {
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return S;
+
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+ // Break out add operands.
+ for (const SCEV *S : Add->operands()) {
+ const SCEV *Remainder = CollectSubexprs(S, C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ }
+ return nullptr;
+ } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ // Split a non-zero base out of an addrec.
+ if (AR->getStart()->isZero() || !AR->isAffine())
+ return S;
+
+ const SCEV *Remainder = CollectSubexprs(AR->getStart(),
+ C, Ops, L, SE, Depth+1);
+ // Split the non-zero AddRec unless it is part of a nested recurrence that
+ // does not pertain to this loop.
+ if (Remainder && (AR->getLoop() == L || !isa<SCEVAddRecExpr>(Remainder))) {
+ Ops.push_back(C ? SE.getMulExpr(C, Remainder) : Remainder);
+ Remainder = nullptr;
+ }
+ if (Remainder != AR->getStart()) {
+ if (!Remainder)
+ Remainder = SE.getConstant(AR->getType(), 0);
+ return SE.getAddRecExpr(Remainder,
+ AR->getStepRecurrence(SE),
+ AR->getLoop(),
+ //FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
+ SCEV::FlagAnyWrap);
+ }
+ } else if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) {
+ // Break (C * (a + b + c)) into C*a + C*b + C*c.
+ if (Mul->getNumOperands() != 2)
+ return S;
+ if (const SCEVConstant *Op0 =
+ dyn_cast<SCEVConstant>(Mul->getOperand(0))) {
+ C = C ? cast<SCEVConstant>(SE.getMulExpr(C, Op0)) : Op0;
+ const SCEV *Remainder =
+ CollectSubexprs(Mul->getOperand(1), C, Ops, L, SE, Depth+1);
+ if (Remainder)
+ Ops.push_back(SE.getMulExpr(C, Remainder));
+ return nullptr;
+ }
+ }
+ return S;
+}
+
+/// Return true if the SCEV represents a value that may end up as a
+/// post-increment operation.
+static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
+ LSRUse &LU, const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ if (LU.Kind != LSRUse::Address ||
+ !LU.AccessTy.getType()->isIntOrIntVectorTy())
+ return false;
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (!AR)
+ return false;
+ const SCEV *LoopStep = AR->getStepRecurrence(SE);
+ if (!isa<SCEVConstant>(LoopStep))
+ return false;
+ // Check if a post-indexed load/store can be used.
+ if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
+ TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
+ const SCEV *LoopStart = AR->getStart();
+ if (!isa<SCEVConstant>(LoopStart) && SE.isLoopInvariant(LoopStart, L))
+ return true;
+ }
+ return false;
+}
+
+/// Helper function for LSRInstance::GenerateReassociations.
+void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base,
+ unsigned Depth, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *BaseReg = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ // Don't generate reassociations for the base register of a value that
+ // may generate a post-increment operator. The reason is that the
+ // reassociations cause extra base+register formula to be created,
+ // and possibly chosen, but the post-increment is more efficient.
+ if (TTI.shouldFavorPostInc() && mayUsePostIncMode(TTI, LU, BaseReg, L, SE))
+ return;
+ SmallVector<const SCEV *, 8> AddOps;
+ const SCEV *Remainder = CollectSubexprs(BaseReg, nullptr, AddOps, L, SE);
+ if (Remainder)
+ AddOps.push_back(Remainder);
+
+ if (AddOps.size() == 1)
+ return;
+
+ for (SmallVectorImpl<const SCEV *>::const_iterator J = AddOps.begin(),
+ JE = AddOps.end();
+ J != JE; ++J) {
+ // Loop-variant "unknown" values are uninteresting; we won't be able to
+ // do anything meaningful with them.
+ if (isa<SCEVUnknown>(*J) && !SE.isLoopInvariant(*J, L))
+ continue;
+
+ // Don't pull a constant into a register if the constant could be folded
+ // into an immediate field.
+ if (isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, *J, Base.getNumRegs() > 1))
+ continue;
+
+ // Collect all operands except *J.
+ SmallVector<const SCEV *, 8> InnerAddOps(
+ ((const SmallVector<const SCEV *, 8> &)AddOps).begin(), J);
+ InnerAddOps.append(std::next(J),
+ ((const SmallVector<const SCEV *, 8> &)AddOps).end());
+
+ // Don't leave just a constant behind in a register if the constant could
+ // be folded into an immediate field.
+ if (InnerAddOps.size() == 1 &&
+ isAlwaysFoldable(TTI, SE, LU.MinOffset, LU.MaxOffset, LU.Kind,
+ LU.AccessTy, InnerAddOps[0], Base.getNumRegs() > 1))
+ continue;
+
+ const SCEV *InnerSum = SE.getAddExpr(InnerAddOps);
+ if (InnerSum->isZero())
+ continue;
+ Formula F = Base;
+
+ // Add the remaining pieces of the add back into the new formula.
+ const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+ if (InnerSumSC && SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ InnerSumSC->getValue()->getZExtValue())) {
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + InnerSumSC->getValue()->getZExtValue();
+ if (IsScaledReg)
+ F.ScaledReg = nullptr;
+ else
+ F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
+ } else if (IsScaledReg)
+ F.ScaledReg = InnerSum;
+ else
+ F.BaseRegs[Idx] = InnerSum;
+
+ // Add J as its own register, or an unfolded immediate.
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+ if (SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+ TTI.isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+ SC->getValue()->getZExtValue()))
+ F.UnfoldedOffset =
+ (uint64_t)F.UnfoldedOffset + SC->getValue()->getZExtValue();
+ else
+ F.BaseRegs.push_back(*J);
+    // We may have changed the number of registers in base regs, adjust the
+ // formula accordingly.
+ F.canonicalize(*L);
+
+ if (InsertFormula(LU, LUIdx, F))
+ // If that formula hadn't been seen before, recurse to find more like
+ // it.
+      // Add a term based on Log16(AddOps.size()) (i.e. Log2_32(AddOps.size()) >> 2),
+      // because Depth alone is not enough to bound compile time.
+      // This means that every time AddOps.size() exceeds 16^x we add
+      // x to Depth.
+ GenerateReassociations(LU, LUIdx, LU.Formulae.back(),
+ Depth + 1 + (Log2_32(AddOps.size()) >> 2));
+ }
+}
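The recursion bound mentioned in the comment above charges one level plus log16 of the operand count, so wide adds exhaust the depth budget of 3 quickly. A standalone sketch of that bookkeeping follows; log2_32 is reimplemented here only so the example compiles on its own and mirrors llvm::Log2_32 for the positive inputs used.

#include <cassert>
#include <cstdint>

static unsigned log2_32(uint32_t X) { // stand-in for llvm::Log2_32
  unsigned R = 0;
  while (X >>= 1)
    ++R;
  return R;
}

static unsigned nextDepth(unsigned Depth, unsigned NumAddOps) {
  // One level per recursion plus log16(NumAddOps), as in the call above.
  return Depth + 1 + (log2_32(NumAddOps) >> 2);
}

int main() {
  assert(nextDepth(0, 2) == 1);   // narrow adds: plain +1 per level
  assert(nextDepth(0, 16) == 2);  // 16 operands: one extra level charged
  assert(nextDepth(0, 256) == 3); // 256 operands: the depth cap is reached
  return 0;
}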
+
+/// Split out subexpressions from adds and the bases of addrecs.
+void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
+ Formula Base, unsigned Depth) {
+ assert(Base.isCanonical(*L) && "Input must be in the canonical form");
+ // Arbitrarily cap recursion to protect compile time.
+ if (Depth >= 3)
+ return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth, i);
+
+ if (Base.Scale == 1)
+ GenerateReassociationsImpl(LU, LUIdx, Base, Depth,
+ /* Idx */ -1, /* IsScaledReg */ true);
+}
+
+/// Generate a formula consisting of all of the loop-dominating registers added
+/// into a single register.
+void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // This method is only interesting on a plurality of registers.
+ if (Base.BaseRegs.size() + (Base.Scale == 1) +
+ (Base.UnfoldedOffset != 0) <= 1)
+ return;
+
+ // Flatten the representation, i.e., reg1 + 1*reg2 => reg1 + reg2, before
+ // processing the formula.
+ Base.unscale();
+ SmallVector<const SCEV *, 4> Ops;
+ Formula NewBase = Base;
+ NewBase.BaseRegs.clear();
+ Type *CombinedIntegerType = nullptr;
+ for (const SCEV *BaseReg : Base.BaseRegs) {
+ if (SE.properlyDominates(BaseReg, L->getHeader()) &&
+ !SE.hasComputableLoopEvolution(BaseReg, L)) {
+ if (!CombinedIntegerType)
+ CombinedIntegerType = SE.getEffectiveSCEVType(BaseReg->getType());
+ Ops.push_back(BaseReg);
+ }
+ else
+ NewBase.BaseRegs.push_back(BaseReg);
+ }
+
+ // If no register is relevant, we're done.
+ if (Ops.size() == 0)
+ return;
+
+ // Utility function for generating the required variants of the combined
+ // registers.
+ auto GenerateFormula = [&](const SCEV *Sum) {
+ Formula F = NewBase;
+
+ // TODO: If Sum is zero, it probably means ScalarEvolution missed an
+ // opportunity to fold something. For now, just ignore such cases
+ // rather than proceed with zero in a register.
+ if (Sum->isZero())
+ return;
+
+ F.BaseRegs.push_back(Sum);
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ };
+
+ // If we collected at least two registers, generate a formula combining them.
+ if (Ops.size() > 1) {
+ SmallVector<const SCEV *, 4> OpsCopy(Ops); // Don't let SE modify Ops.
+ GenerateFormula(SE.getAddExpr(OpsCopy));
+ }
+
+ // If we have an unfolded offset, generate a formula combining it with the
+ // registers collected.
+ if (NewBase.UnfoldedOffset) {
+ assert(CombinedIntegerType && "Missing a type for the unfolded offset");
+ Ops.push_back(SE.getConstant(CombinedIntegerType, NewBase.UnfoldedOffset,
+ true));
+ NewBase.UnfoldedOffset = 0;
+ GenerateFormula(SE.getAddExpr(Ops));
+ }
+}
+
+/// Helper function for LSRInstance::GenerateSymbolicOffsets.
+void LSRInstance::GenerateSymbolicOffsetsImpl(LSRUse &LU, unsigned LUIdx,
+ const Formula &Base, size_t Idx,
+ bool IsScaledReg) {
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+ GlobalValue *GV = ExtractSymbol(G, SE);
+ if (G->isZero() || !GV)
+ return;
+ Formula F = Base;
+ F.BaseGV = GV;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
+ if (IsScaledReg)
+ F.ScaledReg = G;
+ else
+ F.BaseRegs[Idx] = G;
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// Generate reuse formulae using symbolic offsets.
+void LSRInstance::GenerateSymbolicOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // We can't add a symbolic offset if the address already contains one.
+ if (Base.BaseGV) return;
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, i);
+ if (Base.Scale == 1)
+ GenerateSymbolicOffsetsImpl(LU, LUIdx, Base, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// Helper function for LSRInstance::GenerateConstantOffsets.
+void LSRInstance::GenerateConstantOffsetsImpl(
+ LSRUse &LU, unsigned LUIdx, const Formula &Base,
+ const SmallVectorImpl<int64_t> &Worklist, size_t Idx, bool IsScaledReg) {
+
+ auto GenerateOffset = [&](const SCEV *G, int64_t Offset) {
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)Base.BaseOffset - Offset;
+
+ if (isLegalUse(TTI, LU.MinOffset - Offset, LU.MaxOffset - Offset, LU.Kind,
+ LU.AccessTy, F)) {
+ // Add the offset to the base register.
+ const SCEV *NewG = SE.getAddExpr(SE.getConstant(G->getType(), Offset), G);
+ // If it cancelled out, drop the base register, otherwise update it.
+ if (NewG->isZero()) {
+ if (IsScaledReg) {
+ F.Scale = 0;
+ F.ScaledReg = nullptr;
+ } else
+ F.deleteBaseReg(F.BaseRegs[Idx]);
+ F.canonicalize(*L);
+ } else if (IsScaledReg)
+ F.ScaledReg = NewG;
+ else
+ F.BaseRegs[Idx] = NewG;
+
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ };
+
+ const SCEV *G = IsScaledReg ? Base.ScaledReg : Base.BaseRegs[Idx];
+
+ // With constant offsets and constant steps, we can generate pre-inc
+ // accesses by having the offset equal the step. So, for access #0 with a
+ // step of 8, we generate a G - 8 base which would require the first access
+ // to be ((G - 8) + 8),+,8. The pre-indexed access then updates the pointer
+  // for itself and hopefully becomes the base for other accesses. This means
+  // that a single pre-indexed access can be generated to become the new
+ // base pointer for each iteration of the loop, resulting in no extra add/sub
+ // instructions for pointer updating.
+ if (FavorBackedgeIndex && LU.Kind == LSRUse::Address) {
+ if (auto *GAR = dyn_cast<SCEVAddRecExpr>(G)) {
+ if (auto *StepRec =
+ dyn_cast<SCEVConstant>(GAR->getStepRecurrence(SE))) {
+ const APInt &StepInt = StepRec->getAPInt();
+ int64_t Step = StepInt.isNegative() ?
+ StepInt.getSExtValue() : StepInt.getZExtValue();
+
+ for (int64_t Offset : Worklist) {
+ Offset -= Step;
+ GenerateOffset(G, Offset);
+ }
+ }
+ }
+ }
+ for (int64_t Offset : Worklist)
+ GenerateOffset(G, Offset);
+
+ int64_t Imm = ExtractImmediate(G, SE);
+ if (G->isZero() || Imm == 0)
+ return;
+ Formula F = Base;
+ F.BaseOffset = (uint64_t)F.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F))
+ return;
if (IsScaledReg) {
- F.ScaledReg = G;
+ F.ScaledReg = G;
} else {
- F.BaseRegs[Idx] = G;
+ F.BaseRegs[Idx] = G;
// We may generate a non-canonical Formula if G is a recurrent expr reg
// related to the current loop while F.ScaledReg is not.
F.canonicalize(*L);
}
- (void)InsertFormula(LU, LUIdx, F);
-}
-
-/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
-void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- // TODO: For now, just add the min and max offset, because it usually isn't
-  // worthwhile looking at everything in between.
- SmallVector<int64_t, 2> Worklist;
- Worklist.push_back(LU.MinOffset);
- if (LU.MaxOffset != LU.MinOffset)
- Worklist.push_back(LU.MaxOffset);
-
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
- GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
- if (Base.Scale == 1)
- GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
- /* IsScaledReg */ true);
-}
-
-/// For ICmpZero, check to see if we can scale up the comparison. For example, x
-/// == y -> x*c == y*c.
-void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
- Formula Base) {
- if (LU.Kind != LSRUse::ICmpZero) return;
-
- // Determine the integer type for the base formula.
- Type *IntTy = Base.getType();
- if (!IntTy) return;
- if (SE.getTypeSizeInBits(IntTy) > 64) return;
-
- // Don't do this if there is more than one offset.
- if (LU.MinOffset != LU.MaxOffset) return;
-
-  // Check if the transformation is valid. It is illegal to multiply a pointer.
- if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
- return;
- for (const SCEV *BaseReg : Base.BaseRegs)
- if (BaseReg->getType()->isPointerTy())
- return;
- assert(!Base.BaseGV && "ICmpZero use is not legal!");
-
- // Check each interesting stride.
- for (int64_t Factor : Factors) {
- // Check that the multiplication doesn't overflow.
- if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
- continue;
- int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
- if (NewBaseOffset / Factor != Base.BaseOffset)
- continue;
- // If the offset will be truncated at this use, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
- continue;
-
- // Check that multiplying with the use offset doesn't overflow.
- int64_t Offset = LU.MinOffset;
- if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
- continue;
- Offset = (uint64_t)Offset * Factor;
- if (Offset / Factor != LU.MinOffset)
- continue;
- // If the offset will be truncated at this use, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, Offset))
- continue;
-
- Formula F = Base;
- F.BaseOffset = NewBaseOffset;
-
- // Check that this scale is legal.
- if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
- continue;
-
- // Compensate for the use having MinOffset built into it.
- F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
-
- const SCEV *FactorS = SE.getConstant(IntTy, Factor);
-
- // Check that multiplying with each base register doesn't overflow.
- for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
- F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
- if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
- goto next;
- }
-
- // Check that multiplying with the scaled register doesn't overflow.
- if (F.ScaledReg) {
- F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
- if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
- continue;
- }
-
- // Check that multiplying with the unfolded offset doesn't overflow.
- if (F.UnfoldedOffset != 0) {
- if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
- Factor == -1)
- continue;
- F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
- if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
- continue;
- // If the offset will be truncated, check that it is in bounds.
- if (!IntTy->isPointerTy() &&
- !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
- continue;
- }
-
- // If we make it here and it's legal, add it.
- (void)InsertFormula(LU, LUIdx, F);
- next:;
- }
-}
-
-/// Generate stride factor reuse formulae by making use of scaled-offset address
-/// modes, for example.
-void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
- // Determine the integer type for the base formula.
- Type *IntTy = Base.getType();
- if (!IntTy) return;
-
- // If this Formula already has a scaled register, we can't add another one.
- // Try to unscale the formula to generate a better scale.
- if (Base.Scale != 0 && !Base.unscale())
- return;
-
-  assert(Base.Scale == 0 && "unscale did not do its job!");
-
- // Check each interesting stride.
- for (int64_t Factor : Factors) {
- Base.Scale = Factor;
- Base.HasBaseReg = Base.BaseRegs.size() > 1;
- // Check whether this scale is going to be legal.
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- Base)) {
-      // As a special case, handle out-of-loop Basic users specially.
- // TODO: Reconsider this special case.
- if (LU.Kind == LSRUse::Basic &&
- isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
- LU.AccessTy, Base) &&
- LU.AllFixupsOutsideLoop)
- LU.Kind = LSRUse::Special;
- else
- continue;
- }
- // For an ICmpZero, negating a solitary base register won't lead to
- // new solutions.
- if (LU.Kind == LSRUse::ICmpZero &&
- !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
- continue;
- // For each addrec base reg, if its loop is current loop, apply the scale.
- for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
- if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
- const SCEV *FactorS = SE.getConstant(IntTy, Factor);
- if (FactorS->isZero())
- continue;
- // Divide out the factor, ignoring high bits, since we'll be
- // scaling the value back up in the end.
- if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
- // TODO: This could be optimized to avoid all the copying.
- Formula F = Base;
- F.ScaledReg = Quotient;
- F.deleteBaseReg(F.BaseRegs[i]);
- // The canonical representation of 1*reg is reg, which is already in
- // Base. In that case, do not try to insert the formula, it will be
- // rejected anyway.
- if (F.Scale == 1 && (F.BaseRegs.empty() ||
- (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
- continue;
- // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
-          // non-canonical Formula with ScaledReg's loop not being L.
- if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- }
- }
- }
- }
-}
-
-/// Generate reuse formulae from different IV types.
-void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
- // Don't bother truncating symbolic values.
- if (Base.BaseGV) return;
-
- // Determine the integer type for the base formula.
- Type *DstTy = Base.getType();
- if (!DstTy) return;
- DstTy = SE.getEffectiveSCEVType(DstTy);
-
- for (Type *SrcTy : Types) {
- if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
- Formula F = Base;
-
- // Sometimes SCEV is able to prove zero during ext transform. It may
- // happen if SCEV did not do all possible transforms while creating the
- // initial node (maybe due to depth limitations), but it can do them while
- // taking ext.
- if (F.ScaledReg) {
- const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
- if (NewScaledReg->isZero())
- continue;
- F.ScaledReg = NewScaledReg;
- }
- bool HasZeroBaseReg = false;
- for (const SCEV *&BaseReg : F.BaseRegs) {
- const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
- if (NewBaseReg->isZero()) {
- HasZeroBaseReg = true;
- break;
- }
- BaseReg = NewBaseReg;
- }
- if (HasZeroBaseReg)
- continue;
-
- // TODO: This assumes we've done basic processing on all uses and
- // have an idea what the register usage is.
- if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
- continue;
-
- F.canonicalize(*L);
- (void)InsertFormula(LU, LUIdx, F);
- }
- }
-}
-
-namespace {
-
-/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
-/// modifications so that the search phase doesn't have to worry about the data
-/// structures moving underneath it.
-struct WorkItem {
- size_t LUIdx;
- int64_t Imm;
- const SCEV *OrigReg;
-
- WorkItem(size_t LI, int64_t I, const SCEV *R)
- : LUIdx(LI), Imm(I), OrigReg(R) {}
-
- void print(raw_ostream &OS) const;
- void dump() const;
-};
-
-} // end anonymous namespace
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void WorkItem::print(raw_ostream &OS) const {
- OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
- << " , add offset " << Imm;
-}
-
-LLVM_DUMP_METHOD void WorkItem::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-/// Look for registers which are a constant distance apart and try to form reuse
-/// opportunities between them.
-void LSRInstance::GenerateCrossUseConstantOffsets() {
- // Group the registers by their value without any added constant offset.
- using ImmMapTy = std::map<int64_t, const SCEV *>;
-
- DenseMap<const SCEV *, ImmMapTy> Map;
- DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
- SmallVector<const SCEV *, 8> Sequence;
- for (const SCEV *Use : RegUses) {
- const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
- int64_t Imm = ExtractImmediate(Reg, SE);
- auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
- if (Pair.second)
- Sequence.push_back(Reg);
- Pair.first->second.insert(std::make_pair(Imm, Use));
- UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
- }
-
- // Now examine each set of registers with the same base value. Build up
- // a list of work to do and do the work in a separate step so that we're
- // not adding formulae and register counts while we're searching.
- SmallVector<WorkItem, 32> WorkItems;
- SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
- for (const SCEV *Reg : Sequence) {
- const ImmMapTy &Imms = Map.find(Reg)->second;
-
- // It's not worthwhile looking for reuse if there's only one offset.
- if (Imms.size() == 1)
- continue;
-
- LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
- for (const auto &Entry
- : Imms) dbgs()
- << ' ' << Entry.first;
- dbgs() << '\n');
-
- // Examine each offset.
- for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
- J != JE; ++J) {
- const SCEV *OrigReg = J->second;
-
- int64_t JImm = J->first;
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
-
- if (!isa<SCEVConstant>(OrigReg) &&
- UsedByIndicesMap[Reg].count() == 1) {
- LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
- << '\n');
- continue;
- }
-
-      // Conservatively examine offsets between this orig reg and a few selected
- // other orig regs.
- int64_t First = Imms.begin()->first;
- int64_t Last = std::prev(Imms.end())->first;
- // Compute (First + Last) / 2 without overflow using the fact that
-      // First + Last = 2 * (First & Last) + (First ^ Last).
- int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
- // If the result is negative and First is odd and Last even (or vice versa),
- // we rounded towards -inf. Add 1 in that case, to round towards 0.
- Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
- ImmMapTy::const_iterator OtherImms[] = {
- Imms.begin(), std::prev(Imms.end()),
- Imms.lower_bound(Avg)};
- for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
- ImmMapTy::const_iterator M = OtherImms[i];
- if (M == J || M == JE) continue;
-
- // Compute the difference between the two.
- int64_t Imm = (uint64_t)JImm - M->first;
- for (unsigned LUIdx : UsedByIndices.set_bits())
- // Make a memo of this use, offset, and register tuple.
- if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
- WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
- }
- }
- }
-
- Map.clear();
- Sequence.clear();
- UsedByIndicesMap.clear();
- UniqueItems.clear();
-
- // Now iterate through the worklist and add new formulae.
- for (const WorkItem &WI : WorkItems) {
- size_t LUIdx = WI.LUIdx;
- LSRUse &LU = Uses[LUIdx];
- int64_t Imm = WI.Imm;
- const SCEV *OrigReg = WI.OrigReg;
-
- Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
- const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
- unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
-
- // TODO: Use a more targeted data structure.
- for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
- Formula F = LU.Formulae[L];
- // FIXME: The code for the scaled and unscaled registers looks
- // very similar but slightly different. Investigate if they
- // could be merged. That way, we would not have to unscale the
- // Formula.
- F.unscale();
- // Use the immediate in the scaled register.
- if (F.ScaledReg == OrigReg) {
- int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
- // Don't create 50 + reg(-50).
- if (F.referencesReg(SE.getSCEV(
- ConstantInt::get(IntTy, -(uint64_t)Offset))))
- continue;
- Formula NewF = F;
- NewF.BaseOffset = Offset;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- NewF))
- continue;
- NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
-
- // If the new scale is a constant in a register, and adding the constant
- // value to the immediate would produce a value closer to zero than the
- // immediate itself, then the formula isn't worthwhile.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
- if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
- (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
- .ule(std::abs(NewF.BaseOffset)))
- continue;
-
- // OK, looks good.
- NewF.canonicalize(*this->L);
- (void)InsertFormula(LU, LUIdx, NewF);
- } else {
- // Use the immediate in a base register.
- for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
- const SCEV *BaseReg = F.BaseRegs[N];
- if (BaseReg != OrigReg)
- continue;
- Formula NewF = F;
- NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
- if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
- LU.Kind, LU.AccessTy, NewF)) {
- if (TTI.shouldFavorPostInc() &&
- mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
- continue;
- if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
- continue;
- NewF = F;
- NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
- }
- NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
-
- // If the new formula has a constant in a register, and adding the
- // constant value to the immediate would produce a value closer to
- // zero than the immediate itself, then the formula isn't worthwhile.
- for (const SCEV *NewReg : NewF.BaseRegs)
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
- if ((C->getAPInt() + NewF.BaseOffset)
- .abs()
- .slt(std::abs(NewF.BaseOffset)) &&
- (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
- countTrailingZeros<uint64_t>(NewF.BaseOffset))
- goto skip_formula;
-
- // Ok, looks good.
- NewF.canonicalize(*this->L);
- (void)InsertFormula(LU, LUIdx, NewF);
- break;
- skip_formula:;
- }
- }
- }
- }
-}
-
-/// Generate formulae for each use.
-void
-LSRInstance::GenerateAllReuseFormulae() {
- // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
- // queries are more precise.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
- }
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateScales(LU, LUIdx, LU.Formulae[i]);
- }
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
- GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
- }
-
- GenerateCrossUseConstantOffsets();
-
- LLVM_DEBUG(dbgs() << "\n"
- "After generating reuse formulae:\n";
- print_uses(dbgs()));
-}
-
-/// If there are multiple formulae with the same set of registers used
-/// by other uses, pick the best one and delete the others.
-void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
- DenseSet<const SCEV *> VisitedRegs;
- SmallPtrSet<const SCEV *, 16> Regs;
- SmallPtrSet<const SCEV *, 16> LoserRegs;
-#ifndef NDEBUG
- bool ChangedFormulae = false;
-#endif
-
- // Collect the best formula for each unique set of shared registers. This
- // is reset for each use.
- using BestFormulaeTy =
- DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
-
- BestFormulaeTy BestFormulae;
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
- dbgs() << '\n');
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size();
- FIdx != NumForms; ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
-
- // Some formulas are instant losers. For example, they may depend on
- // nonexistent AddRecs from other loops. These need to be filtered
- // immediately, otherwise heuristics could choose them over others leading
- // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
- // avoids the need to recompute this information across formulae using the
- // same bad AddRec. Passing LoserRegs is also essential unless we remove
- // the corresponding bad register from the Regs set.
- Cost CostF(L, SE, TTI);
- Regs.clear();
- CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
- if (CostF.isLoser()) {
- // During initial formula generation, undesirable formulae are generated
- // by uses within other loops that have some non-trivial address mode or
- // use the postinc form of the IV. LSR needs to provide these formulae
- // as the basis of rediscovering the desired formula that uses an AddRec
- // corresponding to the existing phi. Once all formulae have been
- // generated, these initial losers may be pruned.
- LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
- dbgs() << "\n");
- }
- else {
- SmallVector<const SCEV *, 4> Key;
- for (const SCEV *Reg : F.BaseRegs) {
- if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
- Key.push_back(Reg);
- }
- if (F.ScaledReg &&
- RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
- Key.push_back(F.ScaledReg);
- // Unstable sort by host order ok, because this is only used for
- // uniquifying.
- llvm::sort(Key);
-
- std::pair<BestFormulaeTy::const_iterator, bool> P =
- BestFormulae.insert(std::make_pair(Key, FIdx));
- if (P.second)
- continue;
-
- Formula &Best = LU.Formulae[P.first->second];
-
- Cost CostBest(L, SE, TTI);
- Regs.clear();
- CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
- if (CostF.isLess(CostBest))
- std::swap(F, Best);
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
- }
-#ifndef NDEBUG
- ChangedFormulae = true;
-#endif
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
-
- // Now that we've filtered out some formulae, recompute the Regs set.
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- // Reset this to prepare for the next use.
- BestFormulae.clear();
- }
-
- LLVM_DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
-}
-
-/// Estimate the worst-case number of solutions the solver might have to
-/// consider. It almost never considers this many solutions because it prunes the
-/// search space, but the pruning isn't always sufficient.
-size_t LSRInstance::EstimateSearchSpaceComplexity() const {
- size_t Power = 1;
- for (const LSRUse &LU : Uses) {
- size_t FSize = LU.Formulae.size();
- if (FSize >= ComplexityLimit) {
- Power = ComplexityLimit;
- break;
- }
- Power *= FSize;
- if (Power >= ComplexityLimit)
- break;
- }
- return Power;
-}
-
-/// When one formula uses a superset of the registers of another formula, it
-/// won't help reduce register pressure (though it may not necessarily hurt
-/// register pressure); remove it to simplify the system.
-void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
- if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
- "which use a superset of registers used by other "
- "formulae.\n");
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- bool Any = false;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- // Look for a formula with a constant or GV in a register. If the use
- // also has a formula with that same value in an immediate field,
- // delete the one that uses a register.
- for (SmallVectorImpl<const SCEV *>::const_iterator
- I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
- Formula NewF = F;
- //FIXME: Formulas should store bitwidth to do wrapping properly.
- // See PR41034.
- NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
- NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
- (I - F.BaseRegs.begin()));
- if (LU.HasFormulaWithSameRegs(NewF)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
- LU.DeleteFormula(F);
- --i;
- --e;
- Any = true;
- break;
- }
- } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
- if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
- if (!F.BaseGV) {
- Formula NewF = F;
- NewF.BaseGV = GV;
- NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
- (I - F.BaseRegs.begin()));
- if (LU.HasFormulaWithSameRegs(NewF)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
- dbgs() << '\n');
- LU.DeleteFormula(F);
- --i;
- --e;
- Any = true;
- break;
- }
- }
- }
- }
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// When there are many registers for expressions like A, A+1, A+2, etc.,
-/// allocate a single register for them.
-void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(
- dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by assuming that uses separated "
- "by a constant offset will use the same registers.\n");
-
- // This is especially useful for unrolled loops.
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- for (const Formula &F : LU.Formulae) {
- if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
- continue;
-
- LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
- if (!LUThatHas)
- continue;
-
- if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
- LU.Kind, LU.AccessTy))
- continue;
-
- LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
-
- LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
-
- // Transfer the fixups of LU to LUThatHas.
- for (LSRFixup &Fixup : LU.Fixups) {
- Fixup.Offset += F.BaseOffset;
- LUThatHas->pushFixup(Fixup);
- LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
- }
-
- // Delete formulae from the new use which are no longer legal.
- bool Any = false;
- for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
- Formula &F = LUThatHas->Formulae[i];
- if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
- LUThatHas->Kind, LUThatHas->AccessTy, F)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
- LUThatHas->DeleteFormula(F);
- --i;
- --e;
- Any = true;
- }
- }
-
- if (Any)
- LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
-
- // Delete the old use.
- DeleteUse(LU, LUIdx);
- --LUIdx;
- --NumUses;
- break;
- }
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
-
-/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
-/// we've done more filtering, as it may be able to find more formulae to
-/// eliminate.
-void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
- if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
- "undesirable dedicated registers.\n");
-
- FilterOutUndesirableDedicatedRegisters();
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
-/// pick the best one and delete the others.
-/// This narrowing heuristic keeps as many formulae with different
-/// Scale and ScaledReg pairs as possible while narrowing the search space.
-/// The benefit is that a better solution is more likely to be found in a
-/// formula set with more Scale and ScaledReg variations than in one where
-/// every formula has the same Scale and ScaledReg. The winner-reg-picking
-/// heuristic tends to keep the formulae with the same Scale and ScaledReg
-/// and filter out the others, and we want to avoid that if possible.
-void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(
- dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the best Formula "
- "from the Formulae with the same Scale and ScaledReg.\n");
-
- // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
- using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
-
- BestFormulaeTy BestFormulae;
-#ifndef NDEBUG
- bool ChangedFormulae = false;
-#endif
- DenseSet<const SCEV *> VisitedRegs;
- SmallPtrSet<const SCEV *, 16> Regs;
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
- dbgs() << '\n');
-
- // Return true if Formula FA is better than Formula FB.
- auto IsBetterThan = [&](Formula &FA, Formula &FB) {
- // First we will try to choose the Formula with fewer new registers.
- // For a register used by current Formula, the more the register is
- // shared among LSRUses, the less we increase the register number
- // counter of the formula.
- size_t FARegNum = 0;
- for (const SCEV *Reg : FA.BaseRegs) {
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
- FARegNum += (NumUses - UsedByIndices.count() + 1);
- }
- size_t FBRegNum = 0;
- for (const SCEV *Reg : FB.BaseRegs) {
- const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
- FBRegNum += (NumUses - UsedByIndices.count() + 1);
- }
- if (FARegNum != FBRegNum)
- return FARegNum < FBRegNum;
-
- // If the new register numbers are the same, choose the Formula with
- // less Cost.
- Cost CostFA(L, SE, TTI);
- Cost CostFB(L, SE, TTI);
- Regs.clear();
- CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
- Regs.clear();
- CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
- return CostFA.isLess(CostFB);
- };
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
- ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
- if (!F.ScaledReg)
- continue;
- auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
- if (P.second)
- continue;
-
- Formula &Best = LU.Formulae[P.first->second];
- if (IsBetterThan(F, Best))
- std::swap(F, Best);
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n"
- " in favor of formula ";
- Best.print(dbgs()); dbgs() << '\n');
-#ifndef NDEBUG
- ChangedFormulae = true;
-#endif
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- // Reset this to prepare for the next use.
- BestFormulae.clear();
- }
-
- LLVM_DEBUG(if (ChangedFormulae) {
- dbgs() << "\n"
- "After filtering out undesirable candidates:\n";
- print_uses(dbgs());
- });
-}
-
-/// If we are over the complexity limit, filter out any post-inc preferring
-/// variables to only post-inc values.
-void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
- if (!TTI.shouldFavorPostInc())
- return;
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
- "Narrowing the search space by choosing the lowest "
- "register Formula for PostInc Uses.\n");
-
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
-
- if (LU.Kind != LSRUse::Address)
- continue;
- if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
- !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
- continue;
-
- size_t MinRegs = std::numeric_limits<size_t>::max();
- for (const Formula &F : LU.Formulae)
- MinRegs = std::min(F.getNumRegs(), MinRegs);
-
- bool Any = false;
- for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
- ++FIdx) {
- Formula &F = LU.Formulae[FIdx];
- if (F.getNumRegs() > MinRegs) {
- LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
- dbgs() << "\n");
- LU.DeleteFormula(F);
- --FIdx;
- --NumForms;
- Any = true;
- }
- }
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
-
- if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- break;
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
-
-/// This function deletes formulas with a high expected number of registers.
-/// Assuming we don't know the value of each formula (the inefficient ones
-/// have already been deleted), compute the probability of not selecting each
-/// register.
-/// For example,
-/// Use1:
-/// reg(a) + reg({0,+,1})
-/// reg(a) + reg({-1,+,1}) + 1
-/// reg({a,+,1})
-/// Use2:
-/// reg(b) + reg({0,+,1})
-/// reg(b) + reg({-1,+,1}) + 1
-/// reg({b,+,1})
-/// Use3:
-/// reg(c) + reg(b) + reg({0,+,1})
-/// reg(c) + reg({b,+,1})
-///
-/// Probability of not selecting
-/// Use1 Use2 Use3
-/// reg(a) (1/3) * 1 * 1
-/// reg(b) 1 * (1/3) * (1/2)
-/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
-/// reg({-1,+,1}) (2/3) * (2/3) * 1
-/// reg({a,+,1}) (2/3) * 1 * 1
-/// reg({b,+,1}) 1 * (2/3) * (2/3)
-/// reg(c) 1 * 1 * 0
-///
-/// Now compute the expected number of registers for each formula.
-/// Note that for each use we exclude the probability of not selecting for that
-/// use. For example, for Use1 the probability for reg(a) would be just 1 * 1
-/// (excluding the probability 1/3 of not selecting reg(a) for Use1).
-/// Use1:
-/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
-/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
-/// reg({a,+,1}) 1
-/// Use2:
-/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
-/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
-/// reg({b,+,1}) 2/3
-/// Use3:
-/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
-/// reg(c) + reg({b,+,1}) 1 + 2/3
-void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ (void)InsertFormula(LU, LUIdx, F);
+}
+
+/// GenerateConstantOffsets - Generate reuse formulae using constant offsets.
+void LSRInstance::GenerateConstantOffsets(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ // TODO: For now, just add the min and max offset, because it usually isn't
+  // worthwhile looking at everything in between.
+ SmallVector<int64_t, 2> Worklist;
+ Worklist.push_back(LU.MinOffset);
+ if (LU.MaxOffset != LU.MinOffset)
+ Worklist.push_back(LU.MaxOffset);
+
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, i);
+ if (Base.Scale == 1)
+ GenerateConstantOffsetsImpl(LU, LUIdx, Base, Worklist, /* Idx */ -1,
+ /* IsScaledReg */ true);
+}
+
+/// For ICmpZero, check to see if we can scale up the comparison. For example, x
+/// == y -> x*c == y*c.
+void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
+ Formula Base) {
+ if (LU.Kind != LSRUse::ICmpZero) return;
+
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+ if (SE.getTypeSizeInBits(IntTy) > 64) return;
+
+ // Don't do this if there is more than one offset.
+ if (LU.MinOffset != LU.MaxOffset) return;
+
+  // Check if transformation is valid. It is illegal to multiply a pointer.
+ if (Base.ScaledReg && Base.ScaledReg->getType()->isPointerTy())
+ return;
+ for (const SCEV *BaseReg : Base.BaseRegs)
+ if (BaseReg->getType()->isPointerTy())
+ return;
+ assert(!Base.BaseGV && "ICmpZero use is not legal!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ // Check that the multiplication doesn't overflow.
+ if (Base.BaseOffset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ int64_t NewBaseOffset = (uint64_t)Base.BaseOffset * Factor;
+ if (NewBaseOffset / Factor != Base.BaseOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, NewBaseOffset))
+ continue;
+
+ // Check that multiplying with the use offset doesn't overflow.
+ int64_t Offset = LU.MinOffset;
+ if (Offset == std::numeric_limits<int64_t>::min() && Factor == -1)
+ continue;
+ Offset = (uint64_t)Offset * Factor;
+ if (Offset / Factor != LU.MinOffset)
+ continue;
+ // If the offset will be truncated at this use, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, Offset))
+ continue;
+
+ Formula F = Base;
+ F.BaseOffset = NewBaseOffset;
+
+ // Check that this scale is legal.
+ if (!isLegalUse(TTI, Offset, Offset, LU.Kind, LU.AccessTy, F))
+ continue;
+
+ // Compensate for the use having MinOffset built into it.
+ F.BaseOffset = (uint64_t)F.BaseOffset + Offset - LU.MinOffset;
+
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+
+ // Check that multiplying with each base register doesn't overflow.
+ for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) {
+ F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS);
+ if (getExactSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i])
+ goto next;
+ }
+
+ // Check that multiplying with the scaled register doesn't overflow.
+ if (F.ScaledReg) {
+ F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS);
+ if (getExactSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg)
+ continue;
+ }
+
+ // Check that multiplying with the unfolded offset doesn't overflow.
+ if (F.UnfoldedOffset != 0) {
+ if (F.UnfoldedOffset == std::numeric_limits<int64_t>::min() &&
+ Factor == -1)
+ continue;
+ F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+ if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+ continue;
+ // If the offset will be truncated, check that it is in bounds.
+ if (!IntTy->isPointerTy() &&
+ !ConstantInt::isValueValidForType(IntTy, F.UnfoldedOffset))
+ continue;
+ }
+
+ // If we make it here and it's legal, add it.
+ (void)InsertFormula(LU, LUIdx, F);
+ next:;
+ }
+}
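
The scaling loop above relies on one idiom throughout: multiply in unsigned 64-bit
arithmetic, then divide back and compare to detect signed overflow. A minimal
standalone sketch of that check (not LLVM code; mulWouldOverflow is a hypothetical
helper, the pass inlines the pattern instead):

#include <cstdint>
#include <limits>

// Hypothetical helper illustrating the overflow check used in
// GenerateICmpZeroScales: wrap-around multiply, then a divide-back round trip.
static bool mulWouldOverflow(int64_t Val, int64_t Factor) {
  if (Factor == 0)
    return false; // a zero factor cannot overflow (and must not be divided by)
  if (Val == std::numeric_limits<int64_t>::min() && Factor == -1)
    return true;  // INT64_MIN * -1 is not representable in int64_t
  int64_t Product = (uint64_t)Val * Factor; // wraps instead of invoking UB
  return Product / Factor != Val;           // round trip fails iff it wrapped
}
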
+
+/// Generate stride factor reuse formulae by making use of scaled-offset address
+/// modes, for example.
+void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Determine the integer type for the base formula.
+ Type *IntTy = Base.getType();
+ if (!IntTy) return;
+
+ // If this Formula already has a scaled register, we can't add another one.
+ // Try to unscale the formula to generate a better scale.
+ if (Base.Scale != 0 && !Base.unscale())
+ return;
+
+  assert(Base.Scale == 0 && "unscale did not do its job!");
+
+ // Check each interesting stride.
+ for (int64_t Factor : Factors) {
+ Base.Scale = Factor;
+ Base.HasBaseReg = Base.BaseRegs.size() > 1;
+ // Check whether this scale is going to be legal.
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ Base)) {
+      // As a special case, handle out-of-loop Basic users specially.
+ // TODO: Reconsider this special case.
+ if (LU.Kind == LSRUse::Basic &&
+ isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LSRUse::Special,
+ LU.AccessTy, Base) &&
+ LU.AllFixupsOutsideLoop)
+ LU.Kind = LSRUse::Special;
+ else
+ continue;
+ }
+ // For an ICmpZero, negating a solitary base register won't lead to
+ // new solutions.
+ if (LU.Kind == LSRUse::ICmpZero &&
+ !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
+ continue;
+ // For each addrec base reg, if its loop is current loop, apply the scale.
+ for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+ if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
+ const SCEV *FactorS = SE.getConstant(IntTy, Factor);
+ if (FactorS->isZero())
+ continue;
+ // Divide out the factor, ignoring high bits, since we'll be
+ // scaling the value back up in the end.
+ if (const SCEV *Quotient = getExactSDiv(AR, FactorS, SE, true)) {
+ // TODO: This could be optimized to avoid all the copying.
+ Formula F = Base;
+ F.ScaledReg = Quotient;
+ F.deleteBaseReg(F.BaseRegs[i]);
+ // The canonical representation of 1*reg is reg, which is already in
+ // Base. In that case, do not try to insert the formula, it will be
+ // rejected anyway.
+ if (F.Scale == 1 && (F.BaseRegs.empty() ||
+ (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
+ continue;
+ // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+          // non-canonical Formula with ScaledReg's loop not being L.
+ if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+ }
+ }
+}
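
GenerateScales divides each addrec base register by the candidate factor and only
keeps the result when the division succeeds (getExactSDiv returns null otherwise).
A standalone sketch of the exact-division idea on plain integers (not LLVM's
getExactSDiv, which operates on SCEV expressions and can optionally ignore high
bits; requires C++17 for std::optional):

#include <cstdint>
#include <optional>

// Only accept the division if it is exact; otherwise report failure, the way
// getExactSDiv returns null when the factor cannot be divided out cleanly.
static std::optional<int64_t> exactSDiv(int64_t Num, int64_t Den) {
  if (Den == 0 || (Num == INT64_MIN && Den == -1))
    return std::nullopt; // undefined or overflowing division
  if (Num % Den != 0)
    return std::nullopt; // not exact: e.g. 7 / 2 is rejected
  return Num / Den;      // exactSDiv(12, 4) == 3, and 3 * 4 == 12 again
}
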
+
+/// Generate reuse formulae from different IV types.
+void LSRInstance::GenerateTruncates(LSRUse &LU, unsigned LUIdx, Formula Base) {
+ // Don't bother truncating symbolic values.
+ if (Base.BaseGV) return;
+
+ // Determine the integer type for the base formula.
+ Type *DstTy = Base.getType();
+ if (!DstTy) return;
+ DstTy = SE.getEffectiveSCEVType(DstTy);
+
+ for (Type *SrcTy : Types) {
+ if (SrcTy != DstTy && TTI.isTruncateFree(SrcTy, DstTy)) {
+ Formula F = Base;
+
+ // Sometimes SCEV is able to prove zero during ext transform. It may
+ // happen if SCEV did not do all possible transforms while creating the
+ // initial node (maybe due to depth limitations), but it can do them while
+ // taking ext.
+ if (F.ScaledReg) {
+ const SCEV *NewScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy);
+ if (NewScaledReg->isZero())
+ continue;
+ F.ScaledReg = NewScaledReg;
+ }
+ bool HasZeroBaseReg = false;
+ for (const SCEV *&BaseReg : F.BaseRegs) {
+ const SCEV *NewBaseReg = SE.getAnyExtendExpr(BaseReg, SrcTy);
+ if (NewBaseReg->isZero()) {
+ HasZeroBaseReg = true;
+ break;
+ }
+ BaseReg = NewBaseReg;
+ }
+ if (HasZeroBaseReg)
+ continue;
+
+ // TODO: This assumes we've done basic processing on all uses and
+ // have an idea what the register usage is.
+ if (!F.hasRegsUsedByUsesOtherThan(LUIdx, RegUses))
+ continue;
+
+ F.canonicalize(*L);
+ (void)InsertFormula(LU, LUIdx, F);
+ }
+ }
+}
+
+namespace {
+
+/// Helper class for GenerateCrossUseConstantOffsets. It's used to defer
+/// modifications so that the search phase doesn't have to worry about the data
+/// structures moving underneath it.
+struct WorkItem {
+ size_t LUIdx;
+ int64_t Imm;
+ const SCEV *OrigReg;
+
+ WorkItem(size_t LI, int64_t I, const SCEV *R)
+ : LUIdx(LI), Imm(I), OrigReg(R) {}
+
+ void print(raw_ostream &OS) const;
+ void dump() const;
+};
+
+} // end anonymous namespace
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void WorkItem::print(raw_ostream &OS) const {
+ OS << "in formulae referencing " << *OrigReg << " in use " << LUIdx
+ << " , add offset " << Imm;
+}
+
+LLVM_DUMP_METHOD void WorkItem::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+/// Look for registers which are a constant distance apart and try to form reuse
+/// opportunities between them.
+void LSRInstance::GenerateCrossUseConstantOffsets() {
+ // Group the registers by their value without any added constant offset.
+ using ImmMapTy = std::map<int64_t, const SCEV *>;
+
+ DenseMap<const SCEV *, ImmMapTy> Map;
+ DenseMap<const SCEV *, SmallBitVector> UsedByIndicesMap;
+ SmallVector<const SCEV *, 8> Sequence;
+ for (const SCEV *Use : RegUses) {
+ const SCEV *Reg = Use; // Make a copy for ExtractImmediate to modify.
+ int64_t Imm = ExtractImmediate(Reg, SE);
+ auto Pair = Map.insert(std::make_pair(Reg, ImmMapTy()));
+ if (Pair.second)
+ Sequence.push_back(Reg);
+ Pair.first->second.insert(std::make_pair(Imm, Use));
+ UsedByIndicesMap[Reg] |= RegUses.getUsedByIndices(Use);
+ }
+
+ // Now examine each set of registers with the same base value. Build up
+ // a list of work to do and do the work in a separate step so that we're
+ // not adding formulae and register counts while we're searching.
+ SmallVector<WorkItem, 32> WorkItems;
+ SmallSet<std::pair<size_t, int64_t>, 32> UniqueItems;
+ for (const SCEV *Reg : Sequence) {
+ const ImmMapTy &Imms = Map.find(Reg)->second;
+
+ // It's not worthwhile looking for reuse if there's only one offset.
+ if (Imms.size() == 1)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Generating cross-use offsets for " << *Reg << ':';
+ for (const auto &Entry
+ : Imms) dbgs()
+ << ' ' << Entry.first;
+ dbgs() << '\n');
+
+ // Examine each offset.
+ for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end();
+ J != JE; ++J) {
+ const SCEV *OrigReg = J->second;
+
+ int64_t JImm = J->first;
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(OrigReg);
+
+ if (!isa<SCEVConstant>(OrigReg) &&
+ UsedByIndicesMap[Reg].count() == 1) {
+ LLVM_DEBUG(dbgs() << "Skipping cross-use reuse for " << *OrigReg
+ << '\n');
+ continue;
+ }
+
+      // Conservatively examine offsets between this orig reg and a few selected
+ // other orig regs.
+ int64_t First = Imms.begin()->first;
+ int64_t Last = std::prev(Imms.end())->first;
+ // Compute (First + Last) / 2 without overflow using the fact that
+      // First + Last = 2 * (First & Last) + (First ^ Last).
+ int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
+ // If the result is negative and First is odd and Last even (or vice versa),
+ // we rounded towards -inf. Add 1 in that case, to round towards 0.
+ Avg = Avg + ((First ^ Last) & ((uint64_t)Avg >> 63));
+ ImmMapTy::const_iterator OtherImms[] = {
+ Imms.begin(), std::prev(Imms.end()),
+ Imms.lower_bound(Avg)};
+ for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
+ ImmMapTy::const_iterator M = OtherImms[i];
+ if (M == J || M == JE) continue;
+
+ // Compute the difference between the two.
+ int64_t Imm = (uint64_t)JImm - M->first;
+ for (unsigned LUIdx : UsedByIndices.set_bits())
+ // Make a memo of this use, offset, and register tuple.
+ if (UniqueItems.insert(std::make_pair(LUIdx, Imm)).second)
+ WorkItems.push_back(WorkItem(LUIdx, Imm, OrigReg));
+ }
+ }
+ }
+
+ Map.clear();
+ Sequence.clear();
+ UsedByIndicesMap.clear();
+ UniqueItems.clear();
+
+ // Now iterate through the worklist and add new formulae.
+ for (const WorkItem &WI : WorkItems) {
+ size_t LUIdx = WI.LUIdx;
+ LSRUse &LU = Uses[LUIdx];
+ int64_t Imm = WI.Imm;
+ const SCEV *OrigReg = WI.OrigReg;
+
+ Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType());
+ const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, -(uint64_t)Imm));
+ unsigned BitWidth = SE.getTypeSizeInBits(IntTy);
+
+ // TODO: Use a more targeted data structure.
+ for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) {
+ Formula F = LU.Formulae[L];
+ // FIXME: The code for the scaled and unscaled registers looks
+ // very similar but slightly different. Investigate if they
+ // could be merged. That way, we would not have to unscale the
+ // Formula.
+ F.unscale();
+ // Use the immediate in the scaled register.
+ if (F.ScaledReg == OrigReg) {
+ int64_t Offset = (uint64_t)F.BaseOffset + Imm * (uint64_t)F.Scale;
+ // Don't create 50 + reg(-50).
+ if (F.referencesReg(SE.getSCEV(
+ ConstantInt::get(IntTy, -(uint64_t)Offset))))
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = Offset;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ NewF))
+ continue;
+ NewF.ScaledReg = SE.getAddExpr(NegImmS, NewF.ScaledReg);
+
+ // If the new scale is a constant in a register, and adding the constant
+ // value to the immediate would produce a value closer to zero than the
+ // immediate itself, then the formula isn't worthwhile.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewF.ScaledReg))
+ if (C->getValue()->isNegative() != (NewF.BaseOffset < 0) &&
+ (C->getAPInt().abs() * APInt(BitWidth, F.Scale))
+ .ule(std::abs(NewF.BaseOffset)))
+ continue;
+
+ // OK, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ } else {
+ // Use the immediate in a base register.
+ for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) {
+ const SCEV *BaseReg = F.BaseRegs[N];
+ if (BaseReg != OrigReg)
+ continue;
+ Formula NewF = F;
+ NewF.BaseOffset = (uint64_t)NewF.BaseOffset + Imm;
+ if (!isLegalUse(TTI, LU.MinOffset, LU.MaxOffset,
+ LU.Kind, LU.AccessTy, NewF)) {
+ if (TTI.shouldFavorPostInc() &&
+ mayUsePostIncMode(TTI, LU, OrigReg, this->L, SE))
+ continue;
+ if (!TTI.isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+ continue;
+ NewF = F;
+ NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+ }
+ NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
+
+ // If the new formula has a constant in a register, and adding the
+ // constant value to the immediate would produce a value closer to
+ // zero than the immediate itself, then the formula isn't worthwhile.
+ for (const SCEV *NewReg : NewF.BaseRegs)
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(NewReg))
+ if ((C->getAPInt() + NewF.BaseOffset)
+ .abs()
+ .slt(std::abs(NewF.BaseOffset)) &&
+ (C->getAPInt() + NewF.BaseOffset).countTrailingZeros() >=
+ countTrailingZeros<uint64_t>(NewF.BaseOffset))
+ goto skip_formula;
+
+ // Ok, looks good.
+ NewF.canonicalize(*this->L);
+ (void)InsertFormula(LU, LUIdx, NewF);
+ break;
+ skip_formula:;
+ }
+ }
+ }
+ }
+}
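
The offset-averaging step above computes (First + Last) / 2 without risking
int64_t overflow and then adjusts the rounding direction toward zero. A
standalone demonstration of the identity (not LLVM code; relies on arithmetic
right shift of negative values, as the code above does):

#include <cassert>
#include <cstdint>

static int64_t midpointTowardZero(int64_t First, int64_t Last) {
  // First + Last == 2 * (First & Last) + (First ^ Last), so this never
  // overflows even when the plain sum would; the shift rounds toward -inf.
  int64_t Avg = (First & Last) + ((First ^ Last) >> 1);
  // If the average is negative and First and Last differ in parity,
  // add 1 so that we round toward zero instead of toward -inf.
  Avg += (First ^ Last) & ((uint64_t)Avg >> 63);
  return Avg;
}

int main() {
  assert(midpointTowardZero(3, 8) == 5);    //  5.5 rounds to  5
  assert(midpointTowardZero(-8, -3) == -5); // -5.5 rounds to -5, not -6
  assert(midpointTowardZero(INT64_MAX - 2, INT64_MAX) == INT64_MAX - 1);
  return 0;
}
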
+
+/// Generate formulae for each use.
+void
+LSRInstance::GenerateAllReuseFormulae() {
+ // This is split into multiple loops so that hasRegsUsedByUsesOtherThan
+ // queries are more precise.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateReassociations(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateCombinations(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateSymbolicOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateConstantOffsets(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateICmpZeroScales(LU, LUIdx, LU.Formulae[i]);
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateScales(LU, LUIdx, LU.Formulae[i]);
+ }
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i)
+ GenerateTruncates(LU, LUIdx, LU.Formulae[i]);
+ }
+
+ GenerateCrossUseConstantOffsets();
+
+ LLVM_DEBUG(dbgs() << "\n"
+ "After generating reuse formulae:\n";
+ print_uses(dbgs()));
+}
+
+/// If there are multiple formulae with the same set of registers used
+/// by other uses, pick the best one and delete the others.
+void LSRInstance::FilterOutUndesirableDedicatedRegisters() {
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+ SmallPtrSet<const SCEV *, 16> LoserRegs;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+
+ // Collect the best formula for each unique set of shared registers. This
+ // is reset for each use.
+ using BestFormulaeTy =
+ DenseMap<SmallVector<const SCEV *, 4>, size_t, UniquifierDenseMapInfo>;
+
+ BestFormulaeTy BestFormulae;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size();
+ FIdx != NumForms; ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+
+ // Some formulas are instant losers. For example, they may depend on
+ // nonexistent AddRecs from other loops. These need to be filtered
+ // immediately, otherwise heuristics could choose them over others leading
+ // to an unsatisfactory solution. Passing LoserRegs into RateFormula here
+ // avoids the need to recompute this information across formulae using the
+ // same bad AddRec. Passing LoserRegs is also essential unless we remove
+ // the corresponding bad register from the Regs set.
+ Cost CostF(L, SE, TTI);
+ Regs.clear();
+ CostF.RateFormula(F, Regs, VisitedRegs, LU, &LoserRegs);
+ if (CostF.isLoser()) {
+ // During initial formula generation, undesirable formulae are generated
+ // by uses within other loops that have some non-trivial address mode or
+ // use the postinc form of the IV. LSR needs to provide these formulae
+ // as the basis of rediscovering the desired formula that uses an AddRec
+ // corresponding to the existing phi. Once all formulae have been
+ // generated, these initial losers may be pruned.
+ LLVM_DEBUG(dbgs() << " Filtering loser "; F.print(dbgs());
+ dbgs() << "\n");
+ }
+ else {
+ SmallVector<const SCEV *, 4> Key;
+ for (const SCEV *Reg : F.BaseRegs) {
+ if (RegUses.isRegUsedByUsesOtherThan(Reg, LUIdx))
+ Key.push_back(Reg);
+ }
+ if (F.ScaledReg &&
+ RegUses.isRegUsedByUsesOtherThan(F.ScaledReg, LUIdx))
+ Key.push_back(F.ScaledReg);
+ // Unstable sort by host order ok, because this is only used for
+ // uniquifying.
+ llvm::sort(Key);
+
+ std::pair<BestFormulaeTy::const_iterator, bool> P =
+ BestFormulae.insert(std::make_pair(Key, FIdx));
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+
+ Cost CostBest(L, SE, TTI);
+ Regs.clear();
+ CostBest.RateFormula(Best, Regs, VisitedRegs, LU);
+ if (CostF.isLess(CostBest))
+ std::swap(F, Best);
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+ }
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+
+ // Now that we've filtered out some formulae, recompute the Regs set.
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
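
The filtering above keys each formula by the sorted list of registers that other
uses also need and keeps only the best-rated formula per key. A standalone sketch
of that uniquifying pattern with ordinary containers (not LLVM code; SimpleFormula
and its scalar Cost are stand-ins for Formula and the LSR cost model):

#include <algorithm>
#include <map>
#include <string>
#include <vector>

struct SimpleFormula {
  std::vector<std::string> SharedRegs; // registers that other uses need too
  unsigned Cost;                       // stand-in for the LSR cost model
};

// Keep one formula per unique set of shared registers: the cheapest one.
static void filterByKey(std::vector<SimpleFormula> &Formulae) {
  std::map<std::vector<std::string>, size_t> BestByKey;
  std::vector<SimpleFormula> Kept;
  for (const SimpleFormula &F : Formulae) {
    std::vector<std::string> Key = F.SharedRegs;
    std::sort(Key.begin(), Key.end()); // order is irrelevant, only membership
    auto It = BestByKey.find(Key);
    if (It == BestByKey.end()) {
      BestByKey.emplace(std::move(Key), Kept.size());
      Kept.push_back(F);
    } else if (F.Cost < Kept[It->second].Cost) {
      Kept[It->second] = F;            // a cheaper formula with the same key
    }
  }
  Formulae = Kept;
}
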
+
+/// Estimate the worst-case number of solutions the solver might have to
+/// consider. It almost never considers this many solutions because it prunes the
+/// search space, but the pruning isn't always sufficient.
+size_t LSRInstance::EstimateSearchSpaceComplexity() const {
+ size_t Power = 1;
+ for (const LSRUse &LU : Uses) {
+ size_t FSize = LU.Formulae.size();
+ if (FSize >= ComplexityLimit) {
+ Power = ComplexityLimit;
+ break;
+ }
+ Power *= FSize;
+ if (Power >= ComplexityLimit)
+ break;
+ }
+ return Power;
+}
+
+/// When one formula uses a superset of the registers of another formula, it
+/// won't help reduce register pressure (though it may not necessarily hurt
+/// register pressure); remove it to simplify the system.
+void LSRInstance::NarrowSearchSpaceByDetectingSupersets() {
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by eliminating formulae "
+ "which use a superset of registers used by other "
+ "formulae.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ // Look for a formula with a constant or GV in a register. If the use
+ // also has a formula with that same value in an immediate field,
+ // delete the one that uses a register.
+ for (SmallVectorImpl<const SCEV *>::const_iterator
+ I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) {
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) {
+ Formula NewF = F;
+ //FIXME: Formulas should store bitwidth to do wrapping properly.
+ // See PR41034.
+ NewF.BaseOffset += (uint64_t)C->getValue()->getSExtValue();
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ } else if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(*I)) {
+ if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue()))
+ if (!F.BaseGV) {
+ Formula NewF = F;
+ NewF.BaseGV = GV;
+ NewF.BaseRegs.erase(NewF.BaseRegs.begin() +
+ (I - F.BaseRegs.begin()));
+ if (LU.HasFormulaWithSameRegs(NewF)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs());
+ dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// When there are many registers for expressions like A, A+1, A+2, etc.,
+/// allocate a single register for them.
+void LSRInstance::NarrowSearchSpaceByCollapsingUnrolledCode() {
if (EstimateSearchSpaceComplexity() < ComplexityLimit)
- return;
-  // Ok, we have too many formulae on our hands to conveniently handle.
- // Use a rough heuristic to thin out the list.
-
-  // Set of Regs which will be 100% used in the final solution.
-  // Used in each formula of a solution (in the example above this is reg(c)).
- // We can skip them in calculations.
- SmallPtrSet<const SCEV *, 4> UniqRegs;
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
-  // Map each register to its probability of not being selected.
- DenseMap <const SCEV *, float> RegNumMap;
- for (const SCEV *Reg : RegUses) {
- if (UniqRegs.count(Reg))
- continue;
- float PNotSel = 1;
- for (const LSRUse &LU : Uses) {
- if (!LU.Regs.count(Reg))
- continue;
- float P = LU.getNotSelectedProbability(Reg);
- if (P != 0.0)
- PNotSel *= P;
- else
- UniqRegs.insert(Reg);
- }
- RegNumMap.insert(std::make_pair(Reg, PNotSel));
- }
-
- LLVM_DEBUG(
- dbgs() << "Narrowing the search space by deleting costly formulas\n");
-
-  // Delete formulas whose expected number of registers is high.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- // If nothing to delete - continue.
- if (LU.Formulae.size() < 2)
- continue;
-    // This is a temporary solution to test performance. Float should be
-    // replaced with a rounding-independent type (based on integers) to avoid
- // different results for different target builds.
- float FMinRegNum = LU.Formulae[0].getNumRegs();
- float FMinARegNum = LU.Formulae[0].getNumRegs();
- size_t MinIdx = 0;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- float FRegNum = 0;
- float FARegNum = 0;
- for (const SCEV *BaseReg : F.BaseRegs) {
- if (UniqRegs.count(BaseReg))
- continue;
- FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
- if (isa<SCEVAddRecExpr>(BaseReg))
- FARegNum +=
- RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
- }
- if (const SCEV *ScaledReg = F.ScaledReg) {
- if (!UniqRegs.count(ScaledReg)) {
- FRegNum +=
- RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
- if (isa<SCEVAddRecExpr>(ScaledReg))
- FARegNum +=
- RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
- }
- }
- if (FMinRegNum > FRegNum ||
- (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
- FMinRegNum = FRegNum;
- FMinARegNum = FARegNum;
- MinIdx = i;
- }
- }
- LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
- dbgs() << " with min reg num " << FMinRegNum << '\n');
- if (MinIdx != 0)
- std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
- while (LU.Formulae.size() != 1) {
- LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
- dbgs() << '\n');
- LU.Formulae.pop_back();
- }
- LU.RecomputeRegs(LUIdx, RegUses);
- assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
- Formula &F = LU.Formulae[0];
- LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
- // When we choose the formula, the regs become unique.
- UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
- if (F.ScaledReg)
- UniqRegs.insert(F.ScaledReg);
- }
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
-}
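
The Use1/Use2/Use3 numbers in the NarrowSearchSpaceByDeletingCostlyFormulas doc
comment can be reproduced with a few lines of arithmetic. A standalone sketch
(not LLVM code, which uses float inside the pass; the register names are the
strings from the comment) that checks the documented score of 1 + 1/3 for Use1's
first formula:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <string>
#include <vector>

using Formula = std::vector<std::string>; // registers used by one formula
using Use = std::vector<Formula>;         // candidate formulae of one use

// P(Reg is not selected for U) = fraction of U's formulae that do not use Reg.
static double notSelectedProb(const Use &U, const std::string &Reg) {
  double NotUsing = 0;
  for (const Formula &F : U)
    if (std::find(F.begin(), F.end(), Reg) == F.end())
      ++NotUsing;
  return NotUsing / U.size();
}

int main() {
  Use Use1 = {{"a", "{0,+,1}"}, {"a", "{-1,+,1}"}, {"{a,+,1}"}};
  Use Use2 = {{"b", "{0,+,1}"}, {"b", "{-1,+,1}"}, {"{b,+,1}"}};
  Use Use3 = {{"c", "b", "{0,+,1}"}, {"c", "{b,+,1}"}};
  std::vector<Use> Uses = {Use1, Use2, Use3};

  // Probability of never selecting Reg, over all uses (RegNumMap above).
  auto PNotSel = [&](const std::string &Reg) {
    double P = 1;
    for (const Use &U : Uses)
      P *= notSelectedProb(U, Reg);
    return P;
  };

  // Expected register count of Use1's first formula, reg(a) + reg({0,+,1}),
  // excluding Use1's own not-selected probability for each of its registers.
  double Score = 0;
  for (const std::string &Reg : Use1[0])
    Score += PNotSel(Reg) / notSelectedProb(Use1, Reg);
  assert(std::fabs(Score - (1.0 + 1.0 / 3.0)) < 1e-9); // 1 + 1/3, as documented
  return 0;
}
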
-
-/// Pick a register which seems likely to be profitable, and then in any use
-/// which has any reference to that register, delete all formulae which do not
-/// reference that register.
-void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
- // With all other options exhausted, loop until the system is simple
- // enough to handle.
- SmallPtrSet<const SCEV *, 4> Taken;
- while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
-    // Ok, we have too many formulae on our hands to conveniently handle.
- // Use a rough heuristic to thin out the list.
- LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
-
- // Pick the register which is used by the most LSRUses, which is likely
- // to be a good reuse register candidate.
- const SCEV *Best = nullptr;
- unsigned BestNum = 0;
- for (const SCEV *Reg : RegUses) {
- if (Taken.count(Reg))
- continue;
- if (!Best) {
- Best = Reg;
- BestNum = RegUses.getUsedByIndices(Reg).count();
- } else {
- unsigned Count = RegUses.getUsedByIndices(Reg).count();
- if (Count > BestNum) {
- Best = Reg;
- BestNum = Count;
- }
- }
- }
- assert(Best && "Failed to find best LSRUse candidate");
-
- LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
- << " will yield profitable reuse.\n");
- Taken.insert(Best);
-
-    // In any use with formulae which reference this register, delete formulae
- // which don't reference it.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
- LSRUse &LU = Uses[LUIdx];
- if (!LU.Regs.count(Best)) continue;
-
- bool Any = false;
- for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
- Formula &F = LU.Formulae[i];
- if (!F.referencesReg(Best)) {
- LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
- LU.DeleteFormula(F);
- --e;
- --i;
- Any = true;
- assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
- continue;
- }
- }
-
- if (Any)
- LU.RecomputeRegs(LUIdx, RegUses);
- }
-
- LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
- }
-}
-
-/// If there are an extraordinary number of formulae to choose from, use some
-/// rough heuristics to prune down the number of formulae. This keeps the main
-/// solver from taking an extraordinary amount of time in some worst-case
-/// scenarios.
-void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
- NarrowSearchSpaceByDetectingSupersets();
- NarrowSearchSpaceByCollapsingUnrolledCode();
- NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
- if (FilterSameScaledReg)
- NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
- NarrowSearchSpaceByFilterPostInc();
- if (LSRExpNarrow)
- NarrowSearchSpaceByDeletingCostlyFormulas();
- else
- NarrowSearchSpaceByPickingWinnerRegs();
-}
-
-/// This is the recursive solver.
-void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
- Cost &SolutionCost,
- SmallVectorImpl<const Formula *> &Workspace,
- const Cost &CurCost,
- const SmallPtrSet<const SCEV *, 16> &CurRegs,
- DenseSet<const SCEV *> &VisitedRegs) const {
- // Some ideas:
- // - prune more:
- // - use more aggressive filtering
- // - sort the formula so that the most profitable solutions are found first
- // - sort the uses too
- // - search faster:
- // - don't compute a cost, and then compare. compare while computing a cost
- // and bail early.
- // - track register sets with SmallBitVector
-
- const LSRUse &LU = Uses[Workspace.size()];
-
- // If this use references any register that's already a part of the
- // in-progress solution, consider it a requirement that a formula must
- // reference that register in order to be considered. This prunes out
- // unprofitable searching.
- SmallSetVector<const SCEV *, 4> ReqRegs;
- for (const SCEV *S : CurRegs)
- if (LU.Regs.count(S))
- ReqRegs.insert(S);
-
- SmallPtrSet<const SCEV *, 16> NewRegs;
- Cost NewCost(L, SE, TTI);
- for (const Formula &F : LU.Formulae) {
- // Ignore formulae which may not be ideal in terms of register reuse of
- // ReqRegs. The formula should use all required registers before
- // introducing new ones.
- // This can sometimes (notably when trying to favour postinc) lead to
- // sub-optimal decisions. In those cases it is best left to the cost
- // modelling to get right.
- if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
- int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
- for (const SCEV *Reg : ReqRegs) {
- if ((F.ScaledReg && F.ScaledReg == Reg) ||
- is_contained(F.BaseRegs, Reg)) {
- --NumReqRegsToFind;
- if (NumReqRegsToFind == 0)
- break;
- }
- }
- if (NumReqRegsToFind != 0) {
- // If none of the formulae satisfied the required registers, then we could
- // clear ReqRegs and try again. Currently, we simply give up in this case.
- continue;
- }
- }
-
- // Evaluate the cost of the current formula. If it's already worse than
- // the current best, prune the search at that point.
- NewCost = CurCost;
- NewRegs = CurRegs;
- NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
- if (NewCost.isLess(SolutionCost)) {
- Workspace.push_back(&F);
- if (Workspace.size() != Uses.size()) {
- SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
- NewRegs, VisitedRegs);
- if (F.getNumRegs() == 1 && Workspace.size() == 1)
- VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
- } else {
- LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
- dbgs() << ".\nRegs:\n";
- for (const SCEV *S : NewRegs) dbgs()
- << "- " << *S << "\n";
- dbgs() << '\n');
-
- SolutionCost = NewCost;
- Solution = Workspace;
- }
- Workspace.pop_back();
- }
- }
-}
-
-/// Choose one formula from each use. Return the results in the given Solution
-/// vector.
-void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
- SmallVector<const Formula *, 8> Workspace;
- Cost SolutionCost(L, SE, TTI);
- SolutionCost.Lose();
- Cost CurCost(L, SE, TTI);
- SmallPtrSet<const SCEV *, 16> CurRegs;
- DenseSet<const SCEV *> VisitedRegs;
- Workspace.reserve(Uses.size());
-
- // SolveRecurse does all the work.
- SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
- CurRegs, VisitedRegs);
- if (Solution.empty()) {
- LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
- return;
- }
-
- // Ok, we've now made all our decisions.
- LLVM_DEBUG(dbgs() << "\n"
- "The chosen solution requires ";
- SolutionCost.print(dbgs()); dbgs() << ":\n";
- for (size_t i = 0, e = Uses.size(); i != e; ++i) {
- dbgs() << " ";
- Uses[i].print(dbgs());
- dbgs() << "\n"
- " ";
- Solution[i]->print(dbgs());
- dbgs() << '\n';
- });
-
- assert(Solution.size() == Uses.size() && "Malformed solution!");
-}
-
-/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
-/// far as we can go while still being dominated by the input positions. This
-/// helps canonicalize the insert position, which encourages sharing.
-BasicBlock::iterator
-LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
- const SmallVectorImpl<Instruction *> &Inputs)
- const {
- Instruction *Tentative = &*IP;
- while (true) {
- bool AllDominate = true;
- Instruction *BetterPos = nullptr;
- // Don't bother attempting to insert before a catchswitch; its basic block
- // cannot have other non-PHI instructions.
- if (isa<CatchSwitchInst>(Tentative))
- return IP;
-
- for (Instruction *Inst : Inputs) {
- if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
- AllDominate = false;
- break;
- }
- // Attempt to find an insert position in the middle of the block,
- // instead of at the end, so that it can be used for other expansions.
- if (Tentative->getParent() == Inst->getParent() &&
- (!BetterPos || !DT.dominates(Inst, BetterPos)))
- BetterPos = &*std::next(BasicBlock::iterator(Inst));
- }
- if (!AllDominate)
- break;
- if (BetterPos)
- IP = BetterPos->getIterator();
- else
- IP = Tentative->getIterator();
-
- const Loop *IPLoop = LI.getLoopFor(IP->getParent());
- unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
-
- BasicBlock *IDom;
- for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
- if (!Rung) return IP;
- Rung = Rung->getIDom();
- if (!Rung) return IP;
- IDom = Rung->getBlock();
-
- // Don't climb into a loop though.
- const Loop *IDomLoop = LI.getLoopFor(IDom);
- unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
- if (IDomDepth <= IPLoopDepth &&
- (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
- break;
- }
-
- Tentative = IDom->getTerminator();
- }
-
- return IP;
-}
-
-/// Determine an input position which will be dominated by the operands and
-/// which will dominate the result.
-BasicBlock::iterator
-LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
- const LSRFixup &LF,
- const LSRUse &LU,
- SCEVExpander &Rewriter) const {
- // Collect some instructions which must be dominated by the
- // expanding replacement. These must be dominated by any operands that
- // will be required in the expansion.
- SmallVector<Instruction *, 4> Inputs;
- if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
- Inputs.push_back(I);
- if (LU.Kind == LSRUse::ICmpZero)
- if (Instruction *I =
- dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
- Inputs.push_back(I);
- if (LF.PostIncLoops.count(L)) {
- if (LF.isUseFullyOutsideLoop(L))
- Inputs.push_back(L->getLoopLatch()->getTerminator());
- else
- Inputs.push_back(IVIncInsertPos);
- }
- // The expansion must also be dominated by the increment positions of any
- // loops for which it is using post-inc mode.
- for (const Loop *PIL : LF.PostIncLoops) {
- if (PIL == L) continue;
-
- // Be dominated by the loop exit.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- PIL->getExitingBlocks(ExitingBlocks);
- if (!ExitingBlocks.empty()) {
- BasicBlock *BB = ExitingBlocks[0];
- for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
- BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
- Inputs.push_back(BB->getTerminator());
- }
- }
-
- assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
- && !isa<DbgInfoIntrinsic>(LowestIP) &&
- "Insertion point must be a normal instruction");
-
- // Then, climb up the immediate dominator tree as far as we can go while
- // still being dominated by the input positions.
- BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
-
- // Don't insert instructions before PHI nodes.
- while (isa<PHINode>(IP)) ++IP;
-
- // Ignore landingpad instructions.
- while (IP->isEHPad()) ++IP;
-
- // Ignore debug intrinsics.
- while (isa<DbgInfoIntrinsic>(IP)) ++IP;
-
- // Set IP below instructions recently inserted by SCEVExpander. This keeps the
- // IP consistent across expansions and allows the previously inserted
- // instructions to be reused by subsequent expansion.
- while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
- ++IP;
-
- return IP;
-}
-
-/// Emit instructions for the leading candidate expression for this LSRUse (this
-/// is called "expanding").
-Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, BasicBlock::iterator IP,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- if (LU.RigidFormula)
- return LF.OperandValToReplace;
-
- // Determine an input position which will be dominated by the operands and
- // which will dominate the result.
- IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
- Rewriter.setInsertPoint(&*IP);
-
- // Inform the Rewriter if we have a post-increment use, so that it can
- // perform an advantageous expansion.
- Rewriter.setPostInc(LF.PostIncLoops);
-
- // This is the type that the user actually needs.
- Type *OpTy = LF.OperandValToReplace->getType();
- // This will be the type that we'll initially expand to.
- Type *Ty = F.getType();
- if (!Ty)
- // No type known; just expand directly to the ultimate type.
- Ty = OpTy;
- else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
- // Expand directly to the ultimate type if it's the right size.
- Ty = OpTy;
- // This is the type to do integer arithmetic in.
- Type *IntTy = SE.getEffectiveSCEVType(Ty);
-
- // Build up a list of operands to add together to form the full base.
- SmallVector<const SCEV *, 8> Ops;
-
- // Expand the BaseRegs portion.
- for (const SCEV *Reg : F.BaseRegs) {
- assert(!Reg->isZero() && "Zero allocated in a base register!");
-
- // If we're expanding for a post-inc user, make the post-inc adjustment.
- Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
- Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
- }
-
- // Expand the ScaledReg portion.
- Value *ICmpScaledV = nullptr;
- if (F.Scale != 0) {
- const SCEV *ScaledS = F.ScaledReg;
-
- // If we're expanding for a post-inc user, make the post-inc adjustment.
- PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
- ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
-
- if (LU.Kind == LSRUse::ICmpZero) {
- // Expand ScaledReg as if it were part of the base regs.
- if (F.Scale == 1)
- Ops.push_back(
- SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
- else {
- // An interesting way of "folding" with an icmp is to use a negated
- // scale, which we'll implement by inserting it into the other operand
- // of the icmp.
- assert(F.Scale == -1 &&
- "The only scale supported by ICmpZero uses is -1!");
- ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
- }
- } else {
- // Otherwise just expand the scaled register and an explicit scale,
- // which is expected to be matched as part of the address.
-
- // Flush the operand list to suppress SCEVExpander hoisting address modes,
- // unless the addressing mode will not be folded.
- if (!Ops.empty() && LU.Kind == LSRUse::Address &&
- isAMCompletelyFolded(TTI, LU, F)) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
- ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
- if (F.Scale != 1)
- ScaledS =
- SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
- Ops.push_back(ScaledS);
- }
- }
-
- // Expand the GV portion.
- if (F.BaseGV) {
- // Flush the operand list to suppress SCEVExpander hoisting.
- if (!Ops.empty()) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
- Ops.push_back(SE.getUnknown(F.BaseGV));
- }
-
- // Flush the operand list to suppress SCEVExpander hoisting of both folded and
- // unfolded offsets. LSR assumes they both live next to their uses.
- if (!Ops.empty()) {
- Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
- Ops.clear();
- Ops.push_back(SE.getUnknown(FullV));
- }
-
- // Expand the immediate portion.
- int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
- if (Offset != 0) {
- if (LU.Kind == LSRUse::ICmpZero) {
- // The other interesting way of "folding" with an ICmpZero is to use a
- // negated immediate.
- if (!ICmpScaledV)
- ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
- else {
- Ops.push_back(SE.getUnknown(ICmpScaledV));
- ICmpScaledV = ConstantInt::get(IntTy, Offset);
- }
- } else {
- // Just add the immediate values. These again are expected to be matched
- // as part of the address.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
- }
- }
-
- // Expand the unfolded offset portion.
- int64_t UnfoldedOffset = F.UnfoldedOffset;
- if (UnfoldedOffset != 0) {
- // Just add the immediate values.
- Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
- UnfoldedOffset)));
- }
-
- // Emit instructions summing all the operands.
- const SCEV *FullS = Ops.empty() ?
- SE.getConstant(IntTy, 0) :
- SE.getAddExpr(Ops);
- Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
-
- // We're done expanding now, so reset the rewriter.
- Rewriter.clearPostInc();
-
- // An ICmpZero Formula represents an ICmp which we're handling as a
- // comparison against zero. Now that we've expanded an expression for that
- // form, update the ICmp's other operand.
- if (LU.Kind == LSRUse::ICmpZero) {
- ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
- if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
- DeadInsts.emplace_back(OperandIsInstr);
- assert(!F.BaseGV && "ICmp does not support folding a global value and "
- "a scale at the same time!");
- if (F.Scale == -1) {
- if (ICmpScaledV->getType() != OpTy) {
- Instruction *Cast =
- CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
- OpTy, false),
- ICmpScaledV, OpTy, "tmp", CI);
- ICmpScaledV = Cast;
- }
- CI->setOperand(1, ICmpScaledV);
- } else {
- // A scale of 1 means that the scale has been expanded as part of the
- // base regs.
- assert((F.Scale == 0 || F.Scale == 1) &&
- "ICmp does not support folding a global value and "
- "a scale at the same time!");
- Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
- -(uint64_t)Offset);
- if (C->getType() != OpTy)
- C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
- OpTy, false),
- C, OpTy);
-
- CI->setOperand(1, C);
- }
- }
-
- return FullV;
-}
-
-/// Helper for Rewrite. PHI nodes are special because the use of their operands
-/// effectively happens in their predecessor blocks, so the expression may need
-/// to be expanded in multiple places.
-void LSRInstance::RewriteForPHI(
- PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
- SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- DenseMap<BasicBlock *, Value *> Inserted;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
- bool needUpdateFixups = false;
- BasicBlock *BB = PN->getIncomingBlock(i);
-
- // If this is a critical edge, split the edge so that we do not insert
- // the code on all predecessor/successor paths. We do this unless this
- // is the canonical backedge for this loop, which complicates post-inc
- // users.
- if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
- !isa<IndirectBrInst>(BB->getTerminator()) &&
- !isa<CatchSwitchInst>(BB->getTerminator())) {
- BasicBlock *Parent = PN->getParent();
- Loop *PNLoop = LI.getLoopFor(Parent);
- if (!PNLoop || Parent != PNLoop->getHeader()) {
- // Split the critical edge.
- BasicBlock *NewBB = nullptr;
- if (!Parent->isLandingPad()) {
+ return;
+
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by assuming that uses separated "
+ "by a constant offset will use the same registers.\n");
+
+ // This is especially useful for unrolled loops.
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ for (const Formula &F : LU.Formulae) {
+ if (F.BaseOffset == 0 || (F.Scale != 0 && F.Scale != 1))
+ continue;
+
+ LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU);
+ if (!LUThatHas)
+ continue;
+
+ if (!reconcileNewOffset(*LUThatHas, F.BaseOffset, /*HasBaseReg=*/ false,
+ LU.Kind, LU.AccessTy))
+ continue;
+
+ LLVM_DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); dbgs() << '\n');
+
+ LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop;
+
+ // Transfer the fixups of LU to LUThatHas.
+ for (LSRFixup &Fixup : LU.Fixups) {
+ Fixup.Offset += F.BaseOffset;
+ LUThatHas->pushFixup(Fixup);
+ LLVM_DEBUG(dbgs() << "New fixup has offset " << Fixup.Offset << '\n');
+ }
+
+ // Delete formulae from the new use which are no longer legal.
+ bool Any = false;
+ for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) {
+ Formula &F = LUThatHas->Formulae[i];
+ if (!isLegalUse(TTI, LUThatHas->MinOffset, LUThatHas->MaxOffset,
+ LUThatHas->Kind, LUThatHas->AccessTy, F)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LUThatHas->DeleteFormula(F);
+ --i;
+ --e;
+ Any = true;
+ }
+ }
+
+ if (Any)
+ LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses);
+
+ // Delete the old use.
+ DeleteUse(LU, LUIdx);
+ --LUIdx;
+ --NumUses;
+ break;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Call FilterOutUndesirableDedicatedRegisters again, if necessary, now that
+/// we've done more filtering, as it may be able to find more formulae to
+/// eliminate.
+void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
+ if (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by re-filtering out "
+ "undesirable dedicated registers.\n");
+
+ FilterOutUndesirableDedicatedRegisters();
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// If an LSRUse has multiple formulae with the same ScaledReg and Scale,
+/// pick the best one and delete the others.
+/// This narrowing heuristic keeps as many formulae with different
+/// Scale and ScaledReg pairs as possible while narrowing the search space.
+/// The benefit is that a formulae set with more Scale and ScaledReg
+/// variations is more likely to yield a good solution than a set where
+/// they are all the same. The winner-register-picking heuristic often
+/// keeps the formulae with the same Scale and ScaledReg and filters out
+/// the others, and we want to avoid that if possible.
+void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ LLVM_DEBUG(
+ dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the best Formula "
+ "from the Formulae with the same Scale and ScaledReg.\n");
+
+ // Map the "Scale * ScaledReg" pair to the best formula of current LSRUse.
+ using BestFormulaeTy = DenseMap<std::pair<const SCEV *, int64_t>, size_t>;
+
+ BestFormulaeTy BestFormulae;
+#ifndef NDEBUG
+ bool ChangedFormulae = false;
+#endif
+ DenseSet<const SCEV *> VisitedRegs;
+ SmallPtrSet<const SCEV *, 16> Regs;
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ LLVM_DEBUG(dbgs() << "Filtering for use "; LU.print(dbgs());
+ dbgs() << '\n');
+
+ // Return true if Formula FA is better than Formula FB.
+ auto IsBetterThan = [&](Formula &FA, Formula &FB) {
+ // First we will try to choose the Formula with fewer new registers.
+ // For a register used by the current Formula, the more the register is
+ // shared among LSRUses, the less we increase the register number
+ // counter of the formula.
+ size_t FARegNum = 0;
+ for (const SCEV *Reg : FA.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FARegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ size_t FBRegNum = 0;
+ for (const SCEV *Reg : FB.BaseRegs) {
+ const SmallBitVector &UsedByIndices = RegUses.getUsedByIndices(Reg);
+ FBRegNum += (NumUses - UsedByIndices.count() + 1);
+ }
+ if (FARegNum != FBRegNum)
+ return FARegNum < FBRegNum;
+
+ // If the new register numbers are the same, choose the Formula with
+ // less Cost.
+ Cost CostFA(L, SE, TTI);
+ Cost CostFB(L, SE, TTI);
+ Regs.clear();
+ CostFA.RateFormula(FA, Regs, VisitedRegs, LU);
+ Regs.clear();
+ CostFB.RateFormula(FB, Regs, VisitedRegs, LU);
+ return CostFA.isLess(CostFB);
+ };
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (!F.ScaledReg)
+ continue;
+ auto P = BestFormulae.insert({{F.ScaledReg, F.Scale}, FIdx});
+ if (P.second)
+ continue;
+
+ Formula &Best = LU.Formulae[P.first->second];
+ if (IsBetterThan(F, Best))
+ std::swap(F, Best);
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n"
+ " in favor of formula ";
+ Best.print(dbgs()); dbgs() << '\n');
+#ifndef NDEBUG
+ ChangedFormulae = true;
+#endif
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ // Reset this to prepare for the next use.
+ BestFormulae.clear();
+ }
+
+ LLVM_DEBUG(if (ChangedFormulae) {
+ dbgs() << "\n"
+ "After filtering out undesirable candidates:\n";
+ print_uses(dbgs());
+ });
+}
+
+/// If we are over the complexity limit, filter out any post-inc preferring
+/// variables to only post-inc values.
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+ if (!TTI.shouldFavorPostInc())
+ return;
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the lowest "
+ "register Formula for PostInc Uses.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+
+ if (LU.Kind != LSRUse::Address)
+ continue;
+ if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+ !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+ continue;
+
+ size_t MinRegs = std::numeric_limits<size_t>::max();
+ for (const Formula &F : LU.Formulae)
+ MinRegs = std::min(F.getNumRegs(), MinRegs);
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (F.getNumRegs() > MinRegs) {
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n");
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// This function deletes formulas with a high expected number of registers.
+/// Assuming we don't know the value of each formula (all inefficient ones
+/// have already been deleted), it generates, for each register, the
+/// probability of it not being selected.
+/// For example,
+/// Use1:
+/// reg(a) + reg({0,+,1})
+/// reg(a) + reg({-1,+,1}) + 1
+/// reg({a,+,1})
+/// Use2:
+/// reg(b) + reg({0,+,1})
+/// reg(b) + reg({-1,+,1}) + 1
+/// reg({b,+,1})
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1})
+/// reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+/// Use1 Use2 Use3
+/// reg(a) (1/3) * 1 * 1
+/// reg(b) 1 * (1/3) * (1/2)
+/// reg({0,+,1}) (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1}) (2/3) * (2/3) * 1
+/// reg({a,+,1}) (2/3) * 1 * 1
+/// reg({b,+,1}) 1 * (2/3) * (2/3)
+/// reg(c) 1 * 1 * 0
+///
+/// Now compute the expected number of registers for each formula:
+/// Note that for each use we exclude the probability of not selecting for
+/// that use. For example, for Use1 the probability for reg(a) would be just
+/// 1 * 1 (excluding the probability 1/3 of not selecting for Use1).
+/// Use1:
+/// reg(a) + reg({0,+,1}) 1 + 1/3 -- to be deleted
+/// reg(a) + reg({-1,+,1}) + 1 1 + 4/9 -- to be deleted
+/// reg({a,+,1}) 1
+/// Use2:
+/// reg(b) + reg({0,+,1}) 1/2 + 1/3 -- to be deleted
+/// reg(b) + reg({-1,+,1}) + 1 1/2 + 2/3 -- to be deleted
+/// reg({b,+,1}) 2/3
+/// Use3:
+/// reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+/// reg(c) + reg({b,+,1}) 1 + 2/3
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+
+ // Set of Regs which will definitely be used in the final solution, i.e.
+ // used in each formula of a solution (in the example above this is reg(c)).
+ // We can skip them in the calculations.
+ SmallPtrSet<const SCEV *, 4> UniqRegs;
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Map each register to the probability of it not being selected.
+ DenseMap <const SCEV *, float> RegNumMap;
+ for (const SCEV *Reg : RegUses) {
+ if (UniqRegs.count(Reg))
+ continue;
+ float PNotSel = 1;
+ for (const LSRUse &LU : Uses) {
+ if (!LU.Regs.count(Reg))
+ continue;
+ float P = LU.getNotSelectedProbability(Reg);
+ if (P != 0.0)
+ PNotSel *= P;
+ else
+ UniqRegs.insert(Reg);
+ }
+ RegNumMap.insert(std::make_pair(Reg, PNotSel));
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+ // Delete formulas whose expected number of registers is high.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ // If nothing to delete - continue.
+ if (LU.Formulae.size() < 2)
+ continue;
+ // This is a temporary solution to test performance. Float should be
+ // replaced with a rounding-independent type (based on integers) to avoid
+ // different results for different target builds.
+ float FMinRegNum = LU.Formulae[0].getNumRegs();
+ float FMinARegNum = LU.Formulae[0].getNumRegs();
+ size_t MinIdx = 0;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ float FRegNum = 0;
+ float FARegNum = 0;
+ for (const SCEV *BaseReg : F.BaseRegs) {
+ if (UniqRegs.count(BaseReg))
+ continue;
+ FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ if (isa<SCEVAddRecExpr>(BaseReg))
+ FARegNum +=
+ RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+ }
+ if (const SCEV *ScaledReg = F.ScaledReg) {
+ if (!UniqRegs.count(ScaledReg)) {
+ FRegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ if (isa<SCEVAddRecExpr>(ScaledReg))
+ FARegNum +=
+ RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+ }
+ }
+ if (FMinRegNum > FRegNum ||
+ (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+ FMinRegNum = FRegNum;
+ FMinARegNum = FARegNum;
+ MinIdx = i;
+ }
+ }
+ LLVM_DEBUG(dbgs() << " The formula "; LU.Formulae[MinIdx].print(dbgs());
+ dbgs() << " with min reg num " << FMinRegNum << '\n');
+ if (MinIdx != 0)
+ std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+ while (LU.Formulae.size() != 1) {
+ LLVM_DEBUG(dbgs() << " Deleting "; LU.Formulae.back().print(dbgs());
+ dbgs() << '\n');
+ LU.Formulae.pop_back();
+ }
+ LU.RecomputeRegs(LUIdx, RegUses);
+ assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+ Formula &F = LU.Formulae[0];
+ LLVM_DEBUG(dbgs() << " Leaving only "; F.print(dbgs()); dbgs() << '\n');
+ // When we choose the formula, the regs become unique.
+ UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+ if (F.ScaledReg)
+ UniqRegs.insert(F.ScaledReg);
+ }
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
+/// Pick a register which seems likely to be profitable, and then in any use
+/// which has any reference to that register, delete all formulae which do not
+/// reference that register.
+void LSRInstance::NarrowSearchSpaceByPickingWinnerRegs() {
+ // With all other options exhausted, loop until the system is simple
+ // enough to handle.
+ SmallPtrSet<const SCEV *, 4> Taken;
+ while (EstimateSearchSpaceComplexity() >= ComplexityLimit) {
+ // Ok, we have too many formulae on our hands to conveniently handle.
+ // Use a rough heuristic to thin out the list.
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n");
+
+ // Pick the register which is used by the most LSRUses, which is likely
+ // to be a good reuse register candidate.
+ const SCEV *Best = nullptr;
+ unsigned BestNum = 0;
+ for (const SCEV *Reg : RegUses) {
+ if (Taken.count(Reg))
+ continue;
+ if (!Best) {
+ Best = Reg;
+ BestNum = RegUses.getUsedByIndices(Reg).count();
+ } else {
+ unsigned Count = RegUses.getUsedByIndices(Reg).count();
+ if (Count > BestNum) {
+ Best = Reg;
+ BestNum = Count;
+ }
+ }
+ }
+ assert(Best && "Failed to find best LSRUse candidate");
+
+ LLVM_DEBUG(dbgs() << "Narrowing the search space by assuming " << *Best
+ << " will yield profitable reuse.\n");
+ Taken.insert(Best);
+
+ // In any use that references this register, delete the formulae which
+ // don't reference it.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+ if (!LU.Regs.count(Best)) continue;
+
+ bool Any = false;
+ for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+ Formula &F = LU.Formulae[i];
+ if (!F.referencesReg(Best)) {
+ LLVM_DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n');
+ LU.DeleteFormula(F);
+ --e;
+ --i;
+ Any = true;
+ assert(e != 0 && "Use has no formulae left! Is Regs inconsistent?");
+ continue;
+ }
+ }
+
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+ }
+}
+
+/// If there are an extraordinary number of formulae to choose from, use some
+/// rough heuristics to prune down the number of formulae. This keeps the main
+/// solver from taking an extraordinary amount of time in some worst-case
+/// scenarios.
+void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
+ NarrowSearchSpaceByDetectingSupersets();
+ NarrowSearchSpaceByCollapsingUnrolledCode();
+ NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+ if (FilterSameScaledReg)
+ NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ NarrowSearchSpaceByFilterPostInc();
+ if (LSRExpNarrow)
+ NarrowSearchSpaceByDeletingCostlyFormulas();
+ else
+ NarrowSearchSpaceByPickingWinnerRegs();
+}
+
+/// This is the recursive solver.
+void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
+ Cost &SolutionCost,
+ SmallVectorImpl<const Formula *> &Workspace,
+ const Cost &CurCost,
+ const SmallPtrSet<const SCEV *, 16> &CurRegs,
+ DenseSet<const SCEV *> &VisitedRegs) const {
+ // Some ideas:
+ // - prune more:
+ // - use more aggressive filtering
+ // - sort the formula so that the most profitable solutions are found first
+ // - sort the uses too
+ // - search faster:
+ // - don't compute a cost, and then compare. compare while computing a cost
+ // and bail early.
+ // - track register sets with SmallBitVector
+
+ const LSRUse &LU = Uses[Workspace.size()];
+
+ // If this use references any register that's already a part of the
+ // in-progress solution, consider it a requirement that a formula must
+ // reference that register in order to be considered. This prunes out
+ // unprofitable searching.
+ SmallSetVector<const SCEV *, 4> ReqRegs;
+ for (const SCEV *S : CurRegs)
+ if (LU.Regs.count(S))
+ ReqRegs.insert(S);
+
+ SmallPtrSet<const SCEV *, 16> NewRegs;
+ Cost NewCost(L, SE, TTI);
+ for (const Formula &F : LU.Formulae) {
+ // Ignore formulae which may not be ideal in terms of register reuse of
+ // ReqRegs. The formula should use all required registers before
+ // introducing new ones.
+ // This can sometimes (notably when trying to favour postinc) lead to
+ // sub-optimal decisions. In those cases it is best left to the cost
+ // modelling to get right.
+ if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
+ }
+ }
+
+ // Evaluate the cost of the current formula. If it's already worse than
+ // the current best, prune the search at that point.
+ NewCost = CurCost;
+ NewRegs = CurRegs;
+ NewCost.RateFormula(F, NewRegs, VisitedRegs, LU);
+ if (NewCost.isLess(SolutionCost)) {
+ Workspace.push_back(&F);
+ if (Workspace.size() != Uses.size()) {
+ SolveRecurse(Solution, SolutionCost, Workspace, NewCost,
+ NewRegs, VisitedRegs);
+ if (F.getNumRegs() == 1 && Workspace.size() == 1)
+ VisitedRegs.insert(F.ScaledReg ? F.ScaledReg : F.BaseRegs[0]);
+ } else {
+ LLVM_DEBUG(dbgs() << "New best at "; NewCost.print(dbgs());
+ dbgs() << ".\nRegs:\n";
+ for (const SCEV *S : NewRegs) dbgs()
+ << "- " << *S << "\n";
+ dbgs() << '\n');
+
+ SolutionCost = NewCost;
+ Solution = Workspace;
+ }
+ Workspace.pop_back();
+ }
+ }
+}
+
+/// Choose one formula from each use. Return the results in the given Solution
+/// vector.
+void LSRInstance::Solve(SmallVectorImpl<const Formula *> &Solution) const {
+ SmallVector<const Formula *, 8> Workspace;
+ Cost SolutionCost(L, SE, TTI);
+ SolutionCost.Lose();
+ Cost CurCost(L, SE, TTI);
+ SmallPtrSet<const SCEV *, 16> CurRegs;
+ DenseSet<const SCEV *> VisitedRegs;
+ Workspace.reserve(Uses.size());
+
+ // SolveRecurse does all the work.
+ SolveRecurse(Solution, SolutionCost, Workspace, CurCost,
+ CurRegs, VisitedRegs);
+ if (Solution.empty()) {
+ LLVM_DEBUG(dbgs() << "\nNo Satisfactory Solution\n");
+ return;
+ }
+
+ // Ok, we've now made all our decisions.
+ LLVM_DEBUG(dbgs() << "\n"
+ "The chosen solution requires ";
+ SolutionCost.print(dbgs()); dbgs() << ":\n";
+ for (size_t i = 0, e = Uses.size(); i != e; ++i) {
+ dbgs() << " ";
+ Uses[i].print(dbgs());
+ dbgs() << "\n"
+ " ";
+ Solution[i]->print(dbgs());
+ dbgs() << '\n';
+ });
+
+ assert(Solution.size() == Uses.size() && "Malformed solution!");
+}
+
+/// Helper for AdjustInsertPositionForExpand. Climb up the dominator tree as
+/// far as we can go while still being dominated by the input positions. This
+/// helps canonicalize the insert position, which encourages sharing.
+BasicBlock::iterator
+LSRInstance::HoistInsertPosition(BasicBlock::iterator IP,
+ const SmallVectorImpl<Instruction *> &Inputs)
+ const {
+ Instruction *Tentative = &*IP;
+ while (true) {
+ bool AllDominate = true;
+ Instruction *BetterPos = nullptr;
+ // Don't bother attempting to insert before a catchswitch; its basic block
+ // cannot have other non-PHI instructions.
+ if (isa<CatchSwitchInst>(Tentative))
+ return IP;
+
+ for (Instruction *Inst : Inputs) {
+ if (Inst == Tentative || !DT.dominates(Inst, Tentative)) {
+ AllDominate = false;
+ break;
+ }
+ // Attempt to find an insert position in the middle of the block,
+ // instead of at the end, so that it can be used for other expansions.
+ if (Tentative->getParent() == Inst->getParent() &&
+ (!BetterPos || !DT.dominates(Inst, BetterPos)))
+ BetterPos = &*std::next(BasicBlock::iterator(Inst));
+ }
+ if (!AllDominate)
+ break;
+ if (BetterPos)
+ IP = BetterPos->getIterator();
+ else
+ IP = Tentative->getIterator();
+
+ const Loop *IPLoop = LI.getLoopFor(IP->getParent());
+ unsigned IPLoopDepth = IPLoop ? IPLoop->getLoopDepth() : 0;
+
+ BasicBlock *IDom;
+ for (DomTreeNode *Rung = DT.getNode(IP->getParent()); ; ) {
+ if (!Rung) return IP;
+ Rung = Rung->getIDom();
+ if (!Rung) return IP;
+ IDom = Rung->getBlock();
+
+ // Don't climb into a loop though.
+ const Loop *IDomLoop = LI.getLoopFor(IDom);
+ unsigned IDomDepth = IDomLoop ? IDomLoop->getLoopDepth() : 0;
+ if (IDomDepth <= IPLoopDepth &&
+ (IDomDepth != IPLoopDepth || IDomLoop == IPLoop))
+ break;
+ }
+
+ Tentative = IDom->getTerminator();
+ }
+
+ return IP;
+}
+
+/// Determine an input position which will be dominated by the operands and
+/// which will dominate the result.
+BasicBlock::iterator
+LSRInstance::AdjustInsertPositionForExpand(BasicBlock::iterator LowestIP,
+ const LSRFixup &LF,
+ const LSRUse &LU,
+ SCEVExpander &Rewriter) const {
+ // Collect some instructions which must be dominated by the
+ // expanding replacement. These must be dominated by any operands that
+ // will be required in the expansion.
+ SmallVector<Instruction *, 4> Inputs;
+ if (Instruction *I = dyn_cast<Instruction>(LF.OperandValToReplace))
+ Inputs.push_back(I);
+ if (LU.Kind == LSRUse::ICmpZero)
+ if (Instruction *I =
+ dyn_cast<Instruction>(cast<ICmpInst>(LF.UserInst)->getOperand(1)))
+ Inputs.push_back(I);
+ if (LF.PostIncLoops.count(L)) {
+ if (LF.isUseFullyOutsideLoop(L))
+ Inputs.push_back(L->getLoopLatch()->getTerminator());
+ else
+ Inputs.push_back(IVIncInsertPos);
+ }
+ // The expansion must also be dominated by the increment positions of any
+ // loops for which it is using post-inc mode.
+ for (const Loop *PIL : LF.PostIncLoops) {
+ if (PIL == L) continue;
+
+ // Be dominated by the loop exit.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ PIL->getExitingBlocks(ExitingBlocks);
+ if (!ExitingBlocks.empty()) {
+ BasicBlock *BB = ExitingBlocks[0];
+ for (unsigned i = 1, e = ExitingBlocks.size(); i != e; ++i)
+ BB = DT.findNearestCommonDominator(BB, ExitingBlocks[i]);
+ Inputs.push_back(BB->getTerminator());
+ }
+ }
+
+ assert(!isa<PHINode>(LowestIP) && !LowestIP->isEHPad()
+ && !isa<DbgInfoIntrinsic>(LowestIP) &&
+ "Insertion point must be a normal instruction");
+
+ // Then, climb up the immediate dominator tree as far as we can go while
+ // still being dominated by the input positions.
+ BasicBlock::iterator IP = HoistInsertPosition(LowestIP, Inputs);
+
+ // Don't insert instructions before PHI nodes.
+ while (isa<PHINode>(IP)) ++IP;
+
+ // Ignore landingpad instructions.
+ while (IP->isEHPad()) ++IP;
+
+ // Ignore debug intrinsics.
+ while (isa<DbgInfoIntrinsic>(IP)) ++IP;
+
+ // Set IP below instructions recently inserted by SCEVExpander. This keeps the
+ // IP consistent across expansions and allows the previously inserted
+ // instructions to be reused by subsequent expansion.
+ while (Rewriter.isInsertedInstruction(&*IP) && IP != LowestIP)
+ ++IP;
+
+ return IP;
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding").
+Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, BasicBlock::iterator IP,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ if (LU.RigidFormula)
+ return LF.OperandValToReplace;
+
+ // Determine an input position which will be dominated by the operands and
+ // which will dominate the result.
+ IP = AdjustInsertPositionForExpand(IP, LF, LU, Rewriter);
+ Rewriter.setInsertPoint(&*IP);
+
+ // Inform the Rewriter if we have a post-increment use, so that it can
+ // perform an advantageous expansion.
+ Rewriter.setPostInc(LF.PostIncLoops);
+
+ // This is the type that the user actually needs.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ // This will be the type that we'll initially expand to.
+ Type *Ty = F.getType();
+ if (!Ty)
+ // No type known; just expand directly to the ultimate type.
+ Ty = OpTy;
+ else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy))
+ // Expand directly to the ultimate type if it's the right size.
+ Ty = OpTy;
+ // This is the type to do integer arithmetic in.
+ Type *IntTy = SE.getEffectiveSCEVType(Ty);
+
+ // Build up a list of operands to add together to form the full base.
+ SmallVector<const SCEV *, 8> Ops;
+
+ // Expand the BaseRegs portion.
+ for (const SCEV *Reg : F.BaseRegs) {
+ assert(!Reg->isZero() && "Zero allocated in a base register!");
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ Reg = denormalizeForPostIncUse(Reg, LF.PostIncLoops, SE);
+ Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, nullptr)));
+ }
+
+ // Expand the ScaledReg portion.
+ Value *ICmpScaledV = nullptr;
+ if (F.Scale != 0) {
+ const SCEV *ScaledS = F.ScaledReg;
+
+ // If we're expanding for a post-inc user, make the post-inc adjustment.
+ PostIncLoopSet &Loops = const_cast<PostIncLoopSet &>(LF.PostIncLoops);
+ ScaledS = denormalizeForPostIncUse(ScaledS, Loops, SE);
+
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // Expand ScaledReg as if it were part of the base regs.
+ if (F.Scale == 1)
+ Ops.push_back(
+ SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr)));
+ else {
+ // An interesting way of "folding" with an icmp is to use a negated
+ // scale, which we'll implement by inserting it into the other operand
+ // of the icmp.
+ assert(F.Scale == -1 &&
+ "The only scale supported by ICmpZero uses is -1!");
+ ICmpScaledV = Rewriter.expandCodeFor(ScaledS, nullptr);
+ }
+ } else {
+ // Otherwise just expand the scaled register and an explicit scale,
+ // which is expected to be matched as part of the address.
+
+ // Flush the operand list to suppress SCEVExpander hoisting address modes,
+ // unless the addressing mode will not be folded.
+ if (!Ops.empty() && LU.Kind == LSRUse::Address &&
+ isAMCompletelyFolded(TTI, LU, F)) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), nullptr);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, nullptr));
+ if (F.Scale != 1)
+ ScaledS =
+ SE.getMulExpr(ScaledS, SE.getConstant(ScaledS->getType(), F.Scale));
+ Ops.push_back(ScaledS);
+ }
+ }
+
+ // Expand the GV portion.
+ if (F.BaseGV) {
+ // Flush the operand list to suppress SCEVExpander hoisting.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+ Ops.push_back(SE.getUnknown(F.BaseGV));
+ }
+
+ // Flush the operand list to suppress SCEVExpander hoisting of both folded and
+ // unfolded offsets. LSR assumes they both live next to their uses.
+ if (!Ops.empty()) {
+ Value *FullV = Rewriter.expandCodeFor(SE.getAddExpr(Ops), Ty);
+ Ops.clear();
+ Ops.push_back(SE.getUnknown(FullV));
+ }
+
+ // Expand the immediate portion.
+ int64_t Offset = (uint64_t)F.BaseOffset + LF.Offset;
+ if (Offset != 0) {
+ if (LU.Kind == LSRUse::ICmpZero) {
+ // The other interesting way of "folding" with an ICmpZero is to use a
+ // negated immediate.
+ if (!ICmpScaledV)
+ ICmpScaledV = ConstantInt::get(IntTy, -(uint64_t)Offset);
+ else {
+ Ops.push_back(SE.getUnknown(ICmpScaledV));
+ ICmpScaledV = ConstantInt::get(IntTy, Offset);
+ }
+ } else {
+ // Just add the immediate values. These again are expected to be matched
+ // as part of the address.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy, Offset)));
+ }
+ }
+
+ // Expand the unfolded offset portion.
+ int64_t UnfoldedOffset = F.UnfoldedOffset;
+ if (UnfoldedOffset != 0) {
+ // Just add the immediate values.
+ Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+ UnfoldedOffset)));
+ }
+
+ // Emit instructions summing all the operands.
+ const SCEV *FullS = Ops.empty() ?
+ SE.getConstant(IntTy, 0) :
+ SE.getAddExpr(Ops);
+ Value *FullV = Rewriter.expandCodeFor(FullS, Ty);
+
+ // We're done expanding now, so reset the rewriter.
+ Rewriter.clearPostInc();
+
+ // An ICmpZero Formula represents an ICmp which we're handling as a
+ // comparison against zero. Now that we've expanded an expression for that
+ // form, update the ICmp's other operand.
+ if (LU.Kind == LSRUse::ICmpZero) {
+ ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
+ DeadInsts.emplace_back(OperandIsInstr);
+ assert(!F.BaseGV && "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ if (F.Scale == -1) {
+ if (ICmpScaledV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false,
+ OpTy, false),
+ ICmpScaledV, OpTy, "tmp", CI);
+ ICmpScaledV = Cast;
+ }
+ CI->setOperand(1, ICmpScaledV);
+ } else {
+ // A scale of 1 means that the scale has been expanded as part of the
+ // base regs.
+ assert((F.Scale == 0 || F.Scale == 1) &&
+ "ICmp does not support folding a global value and "
+ "a scale at the same time!");
+ Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy),
+ -(uint64_t)Offset);
+ if (C->getType() != OpTy)
+ C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false,
+ OpTy, false),
+ C, OpTy);
+
+ CI->setOperand(1, C);
+ }
+ }
+
+ return FullV;
+}
+
+/// Helper for Rewrite. PHI nodes are special because the use of their operands
+/// effectively happens in their predecessor blocks, so the expression may need
+/// to be expanded in multiple places.
+void LSRInstance::RewriteForPHI(
+ PHINode *PN, const LSRUse &LU, const LSRFixup &LF, const Formula &F,
+ SCEVExpander &Rewriter, SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ DenseMap<BasicBlock *, Value *> Inserted;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == LF.OperandValToReplace) {
+ bool needUpdateFixups = false;
+ BasicBlock *BB = PN->getIncomingBlock(i);
+
+ // If this is a critical edge, split the edge so that we do not insert
+ // the code on all predecessor/successor paths. We do this unless this
+ // is the canonical backedge for this loop, which complicates post-inc
+ // users.
+ if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 &&
+ !isa<IndirectBrInst>(BB->getTerminator()) &&
+ !isa<CatchSwitchInst>(BB->getTerminator())) {
+ BasicBlock *Parent = PN->getParent();
+ Loop *PNLoop = LI.getLoopFor(Parent);
+ if (!PNLoop || Parent != PNLoop->getHeader()) {
+ // Split the critical edge.
+ BasicBlock *NewBB = nullptr;
+ if (!Parent->isLandingPad()) {
NewBB =
SplitCriticalEdge(BB, Parent,
CriticalEdgeSplittingOptions(&DT, &LI, MSSAU)
.setMergeIdenticalEdges()
.setKeepOneInputPHIs());
- } else {
- SmallVector<BasicBlock*, 2> NewBBs;
- SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
- NewBB = NewBBs[0];
- }
- // If NewBB==NULL, then SplitCriticalEdge refused to split because all
- // phi predecessors are identical. The simple thing to do is skip
- // splitting in this case rather than complicate the API.
- if (NewBB) {
- // If PN is outside of the loop and BB is in the loop, we want to
- // move the block to be immediately before the PHI block, not
- // immediately after BB.
- if (L->contains(BB) && !L->contains(PN))
- NewBB->moveBefore(PN->getParent());
-
- // Splitting the edge can reduce the number of PHI entries we have.
- e = PN->getNumIncomingValues();
- BB = NewBB;
- i = PN->getBasicBlockIndex(BB);
-
- needUpdateFixups = true;
- }
- }
- }
-
- std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
- Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
- if (!Pair.second)
- PN->setIncomingValue(i, Pair.first->second);
- else {
- Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
- Rewriter, DeadInsts);
-
- // If this is reuse-by-noop-cast, insert the noop cast.
- Type *OpTy = LF.OperandValToReplace->getType();
- if (FullV->getType() != OpTy)
- FullV =
- CastInst::Create(CastInst::getCastOpcode(FullV, false,
- OpTy, false),
- FullV, LF.OperandValToReplace->getType(),
- "tmp", BB->getTerminator());
-
- PN->setIncomingValue(i, FullV);
- Pair.first->second = FullV;
- }
-
- // If LSR splits a critical edge and the phi node has other pending
- // fixup operands, we need to update those pending fixups. Otherwise
- // formulae will not be implemented completely and some instructions
- // will not be eliminated.
- if (needUpdateFixups) {
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
- for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
- // If a fixup is supposed to rewrite some operand in the phi
- // that was just updated, it may already have been moved to
- // another phi node. Such a fixup requires an update.
- if (Fixup.UserInst == PN) {
- // Check if the operand we try to replace still exists in the
- // original phi.
- bool foundInOriginalPHI = false;
- for (const auto &val : PN->incoming_values())
- if (val == Fixup.OperandValToReplace) {
- foundInOriginalPHI = true;
- break;
- }
-
- // If the fixup operand was found in the original PHI, there is nothing to do.
- if (foundInOriginalPHI)
- continue;
-
- // Otherwise it might have been moved to another PHI and requires an update.
- // If the fixup operand is not found in any of the incoming blocks, that
- // means we have already rewritten it, and there is nothing to do.
- for (const auto &Block : PN->blocks())
- for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
- ++I) {
- PHINode *NewPN = cast<PHINode>(I);
- for (const auto &val : NewPN->incoming_values())
- if (val == Fixup.OperandValToReplace)
- Fixup.UserInst = NewPN;
- }
- }
- }
- }
-}
-
-/// Emit instructions for the leading candidate expression for this LSRUse (this
-/// is called "expanding"), and update the UserInst to reference the newly
-/// expanded value.
-void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
- const Formula &F, SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
- // First, find an insertion point that dominates UserInst. For PHI nodes,
- // find the nearest block which dominates all the relevant uses.
- if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
- RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
- } else {
- Value *FullV =
- Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
-
- // If this is reuse-by-noop-cast, insert the noop cast.
- Type *OpTy = LF.OperandValToReplace->getType();
- if (FullV->getType() != OpTy) {
- Instruction *Cast =
- CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
- FullV, OpTy, "tmp", LF.UserInst);
- FullV = Cast;
- }
-
- // Update the user. ICmpZero is handled specially here (for now) because
- // Expand may have updated one of the operands of the icmp already, and
- // its new value may happen to be equal to LF.OperandValToReplace, in
- // which case doing replaceUsesOfWith leads to replacing both operands
- // with the same value. TODO: Reorganize this.
- if (LU.Kind == LSRUse::ICmpZero)
- LF.UserInst->setOperand(0, FullV);
- else
- LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
- }
-
- if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
- DeadInsts.emplace_back(OperandIsInstr);
-}
-
-/// Rewrite all the fixup locations with new values, following the chosen
-/// solution.
-void LSRInstance::ImplementSolution(
- const SmallVectorImpl<const Formula *> &Solution) {
- // Keep track of instructions we may have made dead, so that
- // we can remove them after we are done working.
- SmallVector<WeakTrackingVH, 16> DeadInsts;
-
+ } else {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ SplitLandingPadPredecessors(Parent, BB, "", "", NewBBs, &DT, &LI);
+ NewBB = NewBBs[0];
+ }
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+
+ needUpdateFixups = true;
+ }
+ }
+ }
+
+ std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair =
+ Inserted.insert(std::make_pair(BB, static_cast<Value *>(nullptr)));
+ if (!Pair.second)
+ PN->setIncomingValue(i, Pair.first->second);
+ else {
+ Value *FullV = Expand(LU, LF, F, BB->getTerminator()->getIterator(),
+ Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy)
+ FullV =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false,
+ OpTy, false),
+ FullV, LF.OperandValToReplace->getType(),
+ "tmp", BB->getTerminator());
+
+ PN->setIncomingValue(i, FullV);
+ Pair.first->second = FullV;
+ }
+
+ // If LSR splits a critical edge and the phi node has other pending
+ // fixup operands, we need to update those pending fixups. Otherwise
+ // formulae will not be implemented completely and some instructions
+ // will not be eliminated.
+ if (needUpdateFixups) {
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (LSRFixup &Fixup : Uses[LUIdx].Fixups)
+ // If a fixup is supposed to rewrite some operand in the phi
+ // that was just updated, it may already have been moved to
+ // another phi node. Such a fixup requires an update.
+ if (Fixup.UserInst == PN) {
+ // Check if the operand we try to replace still exists in the
+ // original phi.
+ bool foundInOriginalPHI = false;
+ for (const auto &val : PN->incoming_values())
+ if (val == Fixup.OperandValToReplace) {
+ foundInOriginalPHI = true;
+ break;
+ }
+
+ // If the fixup operand was found in the original PHI, there is nothing to do.
+ if (foundInOriginalPHI)
+ continue;
+
+ // Otherwise it might have been moved to another PHI and requires an update.
+ // If the fixup operand is not found in any of the incoming blocks, that
+ // means we have already rewritten it, and there is nothing to do.
+ for (const auto &Block : PN->blocks())
+ for (BasicBlock::iterator I = Block->begin(); isa<PHINode>(I);
+ ++I) {
+ PHINode *NewPN = cast<PHINode>(I);
+ for (const auto &val : NewPN->incoming_values())
+ if (val == Fixup.OperandValToReplace)
+ Fixup.UserInst = NewPN;
+ }
+ }
+ }
+ }
+}
+
+/// Emit instructions for the leading candidate expression for this LSRUse (this
+/// is called "expanding"), and update the UserInst to reference the newly
+/// expanded value.
+void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
+ const Formula &F, SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts) const {
+ // First, find an insertion point that dominates UserInst. For PHI nodes,
+ // find the nearest block which dominates all the relevant uses.
+ if (PHINode *PN = dyn_cast<PHINode>(LF.UserInst)) {
+ RewriteForPHI(PN, LU, LF, F, Rewriter, DeadInsts);
+ } else {
+ Value *FullV =
+ Expand(LU, LF, F, LF.UserInst->getIterator(), Rewriter, DeadInsts);
+
+ // If this is reuse-by-noop-cast, insert the noop cast.
+ Type *OpTy = LF.OperandValToReplace->getType();
+ if (FullV->getType() != OpTy) {
+ Instruction *Cast =
+ CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false),
+ FullV, OpTy, "tmp", LF.UserInst);
+ FullV = Cast;
+ }
+
+ // Update the user. ICmpZero is handled specially here (for now) because
+ // Expand may have updated one of the operands of the icmp already, and
+ // its new value may happen to be equal to LF.OperandValToReplace, in
+ // which case doing replaceUsesOfWith leads to replacing both operands
+ // with the same value. TODO: Reorganize this.
+ if (LU.Kind == LSRUse::ICmpZero)
+ LF.UserInst->setOperand(0, FullV);
+ else
+ LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
+ }
+
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
+ DeadInsts.emplace_back(OperandIsInstr);
+}
+
+/// Rewrite all the fixup locations with new values, following the chosen
+/// solution.
+void LSRInstance::ImplementSolution(
+ const SmallVectorImpl<const Formula *> &Solution) {
+ // Keep track of instructions we may have made dead, so that
+ // we can remove them after we are done working.
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+
SCEVExpander Rewriter(SE, L->getHeader()->getModule()->getDataLayout(), "lsr",
false);
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- Rewriter.disableCanonicalMode();
- Rewriter.enableLSRMode();
- Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
-
- // Mark phi nodes that terminate chains so the expander tries to reuse them.
- for (const IVChain &Chain : IVChainVec) {
- if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
- Rewriter.setChainedPhi(PN);
- }
-
- // Expand the new value definitions and update the users.
- for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
- for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
- Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
- Changed = true;
- }
-
- for (const IVChain &Chain : IVChainVec) {
- GenerateIVChain(Chain, Rewriter, DeadInsts);
- Changed = true;
- }
- // Clean up after ourselves. This must be done before deleting any
- // instructions.
- Rewriter.clear();
-
- Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
- &TLI, MSSAU);
-}
-
-LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
- DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
- : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
- MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
- TTI.shouldFavorBackedgeIndex(L)) {
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->isLoopSimplifyForm())
- return;
-
- // If there's no interesting work to be done, bail early.
- if (IU.empty()) return;
-
- // If there's too much analysis to be done, bail early. We won't be able to
- // model the problem anyway.
- unsigned NumUsers = 0;
- for (const IVStrideUse &U : IU) {
- if (++NumUsers > MaxIVUsers) {
- (void)U;
- LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
- << "\n");
- return;
- }
- // Bail out if we have a PHI on an EHPad that gets a value from a
- // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
- // no good place to stick any instructions.
- if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
- auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
- if (isa<FuncletPadInst>(FirstNonPHI) ||
- isa<CatchSwitchInst>(FirstNonPHI))
- for (BasicBlock *PredBB : PN->blocks())
- if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
- return;
- }
- }
-
-#ifndef NDEBUG
- // All dominating loops must have preheaders, or SCEVExpander may not be able
- // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
- //
- // IVUsers analysis should only create users that are dominated by simple loop
- // headers. Since this loop should dominate all of its users, its user list
- // should be empty if this loop itself is not within a simple loop nest.
- for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
- Rung; Rung = Rung->getIDom()) {
- BasicBlock *BB = Rung->getBlock();
- const Loop *DomLoop = LI.getLoopFor(BB);
- if (DomLoop && DomLoop->getHeader() == BB) {
- assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
- }
- }
-#endif // NDEBUG
-
- LLVM_DEBUG(dbgs() << "\nLSR on loop ";
- L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
- dbgs() << ":\n");
-
- // First, perform some low-level loop optimizations.
- OptimizeShadowIV();
- OptimizeLoopTermCond();
-
- // If loop preparation eliminates all interesting IV users, bail.
- if (IU.empty()) return;
-
- // Skip nested loops until we can model them better with formulae.
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ Rewriter.disableCanonicalMode();
+ Rewriter.enableLSRMode();
+ Rewriter.setIVIncInsertPos(L, IVIncInsertPos);
+
+ // Mark phi nodes that terminate chains so the expander tries to reuse them.
+ for (const IVChain &Chain : IVChainVec) {
+ if (PHINode *PN = dyn_cast<PHINode>(Chain.tailUserInst()))
+ Rewriter.setChainedPhi(PN);
+ }
+
+ // Expand the new value definitions and update the users.
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx)
+ for (const LSRFixup &Fixup : Uses[LUIdx].Fixups) {
+ Rewrite(Uses[LUIdx], Fixup, *Solution[LUIdx], Rewriter, DeadInsts);
+ Changed = true;
+ }
+
+ for (const IVChain &Chain : IVChainVec) {
+ GenerateIVChain(Chain, Rewriter, DeadInsts);
+ Changed = true;
+ }
+ // Clean up after ourselves. This must be done before deleting any
+ // instructions.
+ Rewriter.clear();
+
+ Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
+ &TLI, MSSAU);
+}
+
+LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
+ : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
+ MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
+ TTI.shouldFavorBackedgeIndex(L)) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ return;
+
+ // If there's no interesting work to be done, bail early.
+ if (IU.empty()) return;
+
+ // If there's too much analysis to be done, bail early. We won't be able to
+ // model the problem anyway.
+ unsigned NumUsers = 0;
+ for (const IVStrideUse &U : IU) {
+ if (++NumUsers > MaxIVUsers) {
+ (void)U;
+ LLVM_DEBUG(dbgs() << "LSR skipping loop, too many IV Users in " << U
+ << "\n");
+ return;
+ }
+ // Bail out if we have a PHI on an EHPad that gets a value from a
+ // CatchSwitchInst. Because the CatchSwitchInst cannot be split, there is
+ // no good place to stick any instructions.
+ if (auto *PN = dyn_cast<PHINode>(U.getUser())) {
+ auto *FirstNonPHI = PN->getParent()->getFirstNonPHI();
+ if (isa<FuncletPadInst>(FirstNonPHI) ||
+ isa<CatchSwitchInst>(FirstNonPHI))
+ for (BasicBlock *PredBB : PN->blocks())
+ if (isa<CatchSwitchInst>(PredBB->getFirstNonPHI()))
+ return;
+ }
+ }
+
+#ifndef NDEBUG
+ // All dominating loops must have preheaders, or SCEVExpander may not be able
+ // to materialize an AddRecExpr whose Start is an outer AddRecExpr.
+ //
+ // IVUsers analysis should only create users that are dominated by simple loop
+ // headers. Since this loop should dominate all of its users, its user list
+ // should be empty if this loop itself is not within a simple loop nest.
+ for (DomTreeNode *Rung = DT.getNode(L->getLoopPreheader());
+ Rung; Rung = Rung->getIDom()) {
+ BasicBlock *BB = Rung->getBlock();
+ const Loop *DomLoop = LI.getLoopFor(BB);
+ if (DomLoop && DomLoop->getHeader() == BB) {
+ assert(DomLoop->getLoopPreheader() && "LSR needs a simplified loop nest");
+ }
+ }
+#endif // NDEBUG
+
+ LLVM_DEBUG(dbgs() << "\nLSR on loop ";
+ L->getHeader()->printAsOperand(dbgs(), /*PrintType=*/false);
+ dbgs() << ":\n");
+
+ // First, perform some low-level loop optimizations.
+ OptimizeShadowIV();
+ OptimizeLoopTermCond();
+
+ // If loop preparation eliminates all interesting IV users, bail.
+ if (IU.empty()) return;
+
+ // Skip nested loops until we can model them better with formulae.
if (!L->isInnermost()) {
- LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
- return;
- }
-
- // Start collecting data and preparing for the solver.
+ LLVM_DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n");
+ return;
+ }
+
+ // Start collecting data and preparing for the solver.
// If number of registers is not the major cost, we cannot benefit from the
// current profitable chain optimization which is based on number of
// registers.
@@ -5633,145 +5633,145 @@ LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
// example number of instructions.
if (TTI.isNumRegsMajorCostOfLSR() || StressIVChain)
CollectChains();
- CollectInterestingTypesAndFactors();
- CollectFixupsAndInitialFormulae();
- CollectLoopInvariantFixupsAndFormulae();
-
- if (Uses.empty())
- return;
-
- LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
- print_uses(dbgs()));
-
- // Now use the reuse data to generate a bunch of interesting ways
- // to formulate the values needed for the uses.
- GenerateAllReuseFormulae();
-
- FilterOutUndesirableDedicatedRegisters();
- NarrowSearchSpaceUsingHeuristics();
-
- SmallVector<const Formula *, 8> Solution;
- Solve(Solution);
-
- // Release memory that is no longer needed.
- Factors.clear();
- Types.clear();
- RegUses.clear();
-
- if (Solution.empty())
- return;
-
-#ifndef NDEBUG
- // Formulae should be legal.
- for (const LSRUse &LU : Uses) {
- for (const Formula &F : LU.Formulae)
- assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
- F) && "Illegal formula generated!");
- }
-#endif
-
- // Now that we've decided what we want, make it so.
- ImplementSolution(Solution);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
- if (Factors.empty() && Types.empty()) return;
-
- OS << "LSR has identified the following interesting factors and types: ";
- bool First = true;
-
- for (int64_t Factor : Factors) {
- if (!First) OS << ", ";
- First = false;
- OS << '*' << Factor;
- }
-
- for (Type *Ty : Types) {
- if (!First) OS << ", ";
- First = false;
- OS << '(' << *Ty << ')';
- }
- OS << '\n';
-}
-
-void LSRInstance::print_fixups(raw_ostream &OS) const {
- OS << "LSR is examining the following fixup sites:\n";
- for (const LSRUse &LU : Uses)
- for (const LSRFixup &LF : LU.Fixups) {
- dbgs() << " ";
- LF.print(OS);
- OS << '\n';
- }
-}
-
-void LSRInstance::print_uses(raw_ostream &OS) const {
- OS << "LSR is examining the following uses:\n";
- for (const LSRUse &LU : Uses) {
- dbgs() << " ";
- LU.print(OS);
- OS << '\n';
- for (const Formula &F : LU.Formulae) {
- OS << " ";
- F.print(OS);
- OS << '\n';
- }
- }
-}
-
-void LSRInstance::print(raw_ostream &OS) const {
- print_factors_and_types(OS);
- print_fixups(OS);
- print_uses(OS);
-}
-
-LLVM_DUMP_METHOD void LSRInstance::dump() const {
- print(errs()); errs() << '\n';
-}
-#endif
-
-namespace {
-
-class LoopStrengthReduce : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- LoopStrengthReduce();
-
-private:
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-};
-
-} // end anonymous namespace
-
-LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
- initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
-}
-
-void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
- // We split critical edges, so we change the CFG. However, we do update
- // many analyses if they are around.
- AU.addPreservedID(LoopSimplifyID);
-
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- // Requiring LoopSimplify a second time here prevents IVUsers from running
- // twice, since LoopSimplify was invalidated by running ScalarEvolution.
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<IVUsersWrapperPass>();
- AU.addPreserved<IVUsersWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
-}
-
+ CollectInterestingTypesAndFactors();
+ CollectFixupsAndInitialFormulae();
+ CollectLoopInvariantFixupsAndFormulae();
+
+ if (Uses.empty())
+ return;
+
+ LLVM_DEBUG(dbgs() << "LSR found " << Uses.size() << " uses:\n";
+ print_uses(dbgs()));
+
+ // Now use the reuse data to generate a bunch of interesting ways
+ // to formulate the values needed for the uses.
+ GenerateAllReuseFormulae();
+
+ FilterOutUndesirableDedicatedRegisters();
+ NarrowSearchSpaceUsingHeuristics();
+
+ SmallVector<const Formula *, 8> Solution;
+ Solve(Solution);
+
+ // Release memory that is no longer needed.
+ Factors.clear();
+ Types.clear();
+ RegUses.clear();
+
+ if (Solution.empty())
+ return;
+
+#ifndef NDEBUG
+ // Formulae should be legal.
+ for (const LSRUse &LU : Uses) {
+ for (const Formula &F : LU.Formulae)
+ assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy,
+ F) && "Illegal formula generated!");
+ }
+#endif
+
+ // Now that we've decided what we want, make it so.
+ ImplementSolution(Solution);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LSRInstance::print_factors_and_types(raw_ostream &OS) const {
+ if (Factors.empty() && Types.empty()) return;
+
+ OS << "LSR has identified the following interesting factors and types: ";
+ bool First = true;
+
+ for (int64_t Factor : Factors) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '*' << Factor;
+ }
+
+ for (Type *Ty : Types) {
+ if (!First) OS << ", ";
+ First = false;
+ OS << '(' << *Ty << ')';
+ }
+ OS << '\n';
+}
+
+void LSRInstance::print_fixups(raw_ostream &OS) const {
+ OS << "LSR is examining the following fixup sites:\n";
+ for (const LSRUse &LU : Uses)
+ for (const LSRFixup &LF : LU.Fixups) {
+ dbgs() << " ";
+ LF.print(OS);
+ OS << '\n';
+ }
+}
+
+void LSRInstance::print_uses(raw_ostream &OS) const {
+ OS << "LSR is examining the following uses:\n";
+ for (const LSRUse &LU : Uses) {
+ dbgs() << " ";
+ LU.print(OS);
+ OS << '\n';
+ for (const Formula &F : LU.Formulae) {
+ OS << " ";
+ F.print(OS);
+ OS << '\n';
+ }
+ }
+}
+
+void LSRInstance::print(raw_ostream &OS) const {
+ print_factors_and_types(OS);
+ print_fixups(OS);
+ print_uses(OS);
+}
+
+LLVM_DUMP_METHOD void LSRInstance::dump() const {
+ print(errs()); errs() << '\n';
+}
+#endif
+
+namespace {
+
+class LoopStrengthReduce : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ LoopStrengthReduce();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+} // end anonymous namespace
+
+LoopStrengthReduce::LoopStrengthReduce() : LoopPass(ID) {
+ initializeLoopStrengthReducePass(*PassRegistry::getPassRegistry());
+}
+
+void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We split critical edges, so we change the CFG. However, we do update
+ // many analyses if they are around.
+ AU.addPreservedID(LoopSimplifyID);
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // Requiring LoopSimplify a second time here prevents IVUsers from running
+ // twice, since LoopSimplify was invalidated by running ScalarEvolution.
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<IVUsersWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+}
+
using EqualValues = SmallVector<std::tuple<WeakVH, int64_t, DIExpression *>, 4>;
using EqualValuesMap = DenseMap<DbgValueInst *, EqualValues>;
@@ -5829,94 +5829,94 @@ static void DbgApplyEqualValues(EqualValuesMap &DbgValueToEqualSet) {
}
}
-static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
- DominatorTree &DT, LoopInfo &LI,
- const TargetTransformInfo &TTI,
- AssumptionCache &AC, TargetLibraryInfo &TLI,
- MemorySSA *MSSA) {
-
- bool Changed = false;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-
- // Run the main LSR transformation.
- Changed |=
- LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
-
+static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
+ DominatorTree &DT, LoopInfo &LI,
+ const TargetTransformInfo &TTI,
+ AssumptionCache &AC, TargetLibraryInfo &TLI,
+ MemorySSA *MSSA) {
+
+ bool Changed = false;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+
+ // Run the main LSR transformation.
+ Changed |=
+ LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
+
// Debug preservation - before we start removing anything create equivalence
// sets for the llvm.dbg.value intrinsics.
EqualValuesMap DbgValueToEqualSet;
DbgGatherEqualValues(L, SE, DbgValueToEqualSet);
- // Remove any extra phis created by processing inner loops.
- Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
- if (EnablePhiElim && L->isLoopSimplifyForm()) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ // Remove any extra phis created by processing inner loops.
+ Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
+ if (EnablePhiElim && L->isLoopSimplifyForm()) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
SCEVExpander Rewriter(SE, DL, "lsr", false);
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
- if (numFolded) {
- Changed = true;
- RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
- MSSAU.get());
- DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
- }
- }
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
+ if (numFolded) {
+ Changed = true;
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
+ MSSAU.get());
+ DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
+ }
+ }
DbgApplyEqualValues(DbgValueToEqualSet);
- return Changed;
-}
-
-bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
- if (skipLoop(L))
- return false;
-
- auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *L->getHeader()->getParent());
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
- *L->getHeader()->getParent());
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- MemorySSA *MSSA = nullptr;
- if (MSSAAnalysis)
- MSSA = &MSSAAnalysis->getMSSA();
- return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
-}
-
-PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &) {
- if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
- AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
- return PreservedAnalyses::all();
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-char LoopStrengthReduce::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
- "Loop Strength Reduction", false, false)
-
-Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
+ return Changed;
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
+ if (skipLoop(L))
+ return false;
+
+ auto &IU = getAnalysis<IVUsersWrapperPass>().getIU();
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *L->getHeader()->getParent());
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ *L->getHeader()->getParent());
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
+}
+
+PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &) {
+ if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
+ AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
+ return PreservedAnalyses::all();
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+char LoopStrengthReduce::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(IVUsersWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(LoopStrengthReduce, "loop-reduce",
+ "Loop Strength Reduction", false, false)
+
+Pass *llvm::createLoopStrengthReducePass() { return new LoopStrengthReduce(); }
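For orientation before the next file's diff: the sketch below is ordinary C++ written purely as an illustration of the kind of rewrite loop strength reduction aims for; it is not code from this patch or from LLVM, and the function names and the stride of 4 are invented for the example.

// Before: the address of a[i * 4] needs a fresh scaled-index computation
// (effectively a multiply) on every iteration.
#include <cstddef>

void zero_every_fourth(int *a, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    a[i * 4] = 0;
}

// After (conceptually what LSR produces): the induction variable becomes the
// address itself and advances by a constant step, so the multiply disappears.
void zero_every_fourth_reduced(int *a, std::size_t n) {
  int *p = a;
  for (std::size_t i = 0; i < n; ++i, p += 4)
    *p = 0;
}

The Rewrite/Expand machinery in the diff above is the part that materializes such replacement expressions and patches each fixup site, inserting a no-op cast when only the type differs.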
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 766b313f4f..495906e1a7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -1,293 +1,293 @@
-//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the loop unroll-and-jam pass. Most of the work is done by
-// Utils/UnrollLoopAndJam.cpp.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/PassRegistry.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+//===- LoopUnrollAndJam.cpp - Loop unroll and jam pass --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the loop unroll-and-jam pass. Most of the work is done by
+// Utils/UnrollLoopAndJam.cpp.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <cassert>
-#include <cstdint>
-#include <vector>
-
-namespace llvm {
-class Instruction;
-class Value;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll-and-jam"
-
-/// @{
-/// Metadata attribute names
-static const char *const LLVMLoopUnrollAndJamFollowupAll =
- "llvm.loop.unroll_and_jam.followup_all";
-static const char *const LLVMLoopUnrollAndJamFollowupInner =
- "llvm.loop.unroll_and_jam.followup_inner";
-static const char *const LLVMLoopUnrollAndJamFollowupOuter =
- "llvm.loop.unroll_and_jam.followup_outer";
-static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
- "llvm.loop.unroll_and_jam.followup_remainder_inner";
-static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
- "llvm.loop.unroll_and_jam.followup_remainder_outer";
-/// @}
-
-static cl::opt<bool>
- AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
- cl::desc("Allows loops to be unroll-and-jammed."));
-
-static cl::opt<unsigned> UnrollAndJamCount(
- "unroll-and-jam-count", cl::Hidden,
- cl::desc("Use this unroll count for all loops including those with "
- "unroll_and_jam_count pragma values, for testing purposes"));
-
-static cl::opt<unsigned> UnrollAndJamThreshold(
- "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
- cl::desc("Threshold to use for inner loop when doing unroll and jam."));
-
-static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
- "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
- "unroll_count pragma."));
-
-// Returns the loop hint metadata node with the given name (for example,
-// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
-// returned.
-static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
- if (MDNode *LoopID = L->getLoopID())
- return GetUnrollMetadata(LoopID, Name);
- return nullptr;
-}
-
-// Returns true if the loop has any metadata starting with Prefix. For example, a
-// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
-static bool hasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
- if (MDNode *LoopID = L->getLoopID()) {
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
- if (!MD)
- continue;
-
- MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (S->getString().startswith(Prefix))
- return true;
- }
- }
- return false;
-}
-
-// Returns true if the loop has an unroll_and_jam(enable) pragma.
-static bool hasUnrollAndJamEnablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
-}
-
-// If the loop has an unroll_and_jam_count pragma, return the (necessarily
-// positive) value from the pragma. Otherwise return 0.
-static unsigned unrollAndJamCountPragmaValue(const Loop *L) {
- MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
- if (MD) {
- assert(MD->getNumOperands() == 2 &&
- "Unroll count hint metadata should have two operands.");
- unsigned Count =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- assert(Count >= 1 && "Unroll count must be positive.");
- return Count;
- }
- return 0;
-}
-
-// Returns the estimated size of the unrolled loop.
-static uint64_t
-getUnrollAndJammedLoopSize(unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
- assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
- return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
-
-// Calculates unroll and jam count and writes it to UP.Count. Returns true if
-// unroll count was set explicitly.
-static bool computeUnrollAndJamCount(
- Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
- LoopInfo *LI, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
- unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
- unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP) {
- // First up use computeUnrollCount from the loop unroller to get a count
- // for unrolling the outer loop, plus any loops requiring explicit
- // unrolling we leave to the unroller. This uses UP.Threshold /
- // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
- // We have already checked that the loop has no unroll.* pragmas.
- unsigned MaxTripCount = 0;
- bool UseUpperBound = false;
- bool ExplicitUnroll = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
- /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, PP,
- UseUpperBound);
- if (ExplicitUnroll || UseUpperBound) {
- // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
- // for the unroller instead.
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
- "computeUnrollCount\n");
- UP.Count = 0;
- return false;
- }
-
- // Override with any explicit Count from the "unroll-and-jam-count" option.
- bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
- if (UserUnrollCount) {
- UP.Count = UnrollAndJamCount;
- UP.Force = true;
- if (UP.AllowRemainder &&
- getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
- getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
- UP.UnrollAndJamInnerLoopThreshold)
- return true;
- }
-
- // Check for unroll_and_jam pragmas
- unsigned PragmaCount = unrollAndJamCountPragmaValue(L);
- if (PragmaCount > 0) {
- UP.Count = PragmaCount;
- UP.Runtime = true;
- UP.Force = true;
- if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
- getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
- getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
- UP.UnrollAndJamInnerLoopThreshold)
- return true;
- }
-
- bool PragmaEnableUnroll = hasUnrollAndJamEnablePragma(L);
- bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
- bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
-
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits.
- if (ExplicitUnrollAndJam)
- UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
-
- if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
- UP.UnrollAndJamInnerLoopThreshold) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
- "inner loop too large\n");
- UP.Count = 0;
- return false;
- }
-
- // We have a sensible limit for the outer loop, now adjust it for the inner
- // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
- // explicitly, we want to stick to it.
- if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
- while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
- UP.UnrollAndJamInnerLoopThreshold)
- UP.Count--;
- }
-
- // If we are explicitly unroll and jamming, we are done. Otherwise there are a
- // number of extra performance heuristics to check.
- if (ExplicitUnrollAndJam)
- return true;
-
- // If the inner loop count is known and small, leave the entire loop nest to
- // the unroller.
- if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
- "being left for the unroller\n");
- UP.Count = 0;
- return false;
- }
-
- // Check for situations where UnJ is likely to be unprofitable, including
- // subloops with more than one block.
- if (SubLoop->getBlocks().size() != 1) {
- LLVM_DEBUG(
- dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
- UP.Count = 0;
- return false;
- }
-
- // Limit to loops where there is something to gain from unrolling and
- // jamming the loop. In this case, look for loads that are invariant in the
- // outer loop and can become shared.
- unsigned NumInvariant = 0;
- for (BasicBlock *BB : SubLoop->getBlocks()) {
- for (Instruction &I : *BB) {
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- Value *V = Ld->getPointerOperand();
- const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
- if (SE.isLoopInvariant(LSCEV, L))
- NumInvariant++;
- }
- }
- }
- if (NumInvariant == 0) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
- UP.Count = 0;
- return false;
- }
-
- return false;
-}
-
-static LoopUnrollResult
-tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, const TargetTransformInfo &TTI,
- AssumptionCache &AC, DependenceInfo &DI,
- OptimizationRemarkEmitter &ORE, int OptLevel) {
- TargetTransformInfo::UnrollingPreferences UP =
- gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
- None, None, None, None, None);
- TargetTransformInfo::PeelingPreferences PP =
- gatherPeelingPreferences(L, SE, TTI, None, None);
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <cassert>
+#include <cstdint>
+#include <vector>
+
+namespace llvm {
+class Instruction;
+class Value;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+/// @{
+/// Metadata attribute names
+static const char *const LLVMLoopUnrollAndJamFollowupAll =
+ "llvm.loop.unroll_and_jam.followup_all";
+static const char *const LLVMLoopUnrollAndJamFollowupInner =
+ "llvm.loop.unroll_and_jam.followup_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupOuter =
+ "llvm.loop.unroll_and_jam.followup_outer";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderInner =
+ "llvm.loop.unroll_and_jam.followup_remainder_inner";
+static const char *const LLVMLoopUnrollAndJamFollowupRemainderOuter =
+ "llvm.loop.unroll_and_jam.followup_remainder_outer";
+/// @}
+
+static cl::opt<bool>
+ AllowUnrollAndJam("allow-unroll-and-jam", cl::Hidden,
+ cl::desc("Allows loops to be unroll-and-jammed."));
+
+static cl::opt<unsigned> UnrollAndJamCount(
+ "unroll-and-jam-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_and_jam_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollAndJamThreshold(
+ "unroll-and-jam-threshold", cl::init(60), cl::Hidden,
+ cl::desc("Threshold to use for inner loop when doing unroll and jam."));
+
+static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
+ "pragma-unroll-and-jam-threshold", cl::init(1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll_and_jam(full) or "
+ "unroll_count pragma."));
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has any metadata starting with Prefix. For example, a
+// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
+static bool hasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+ if (MDNode *LoopID = L->getLoopID()) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (S->getString().startswith(Prefix))
+ return true;
+ }
+ }
+ return false;
+}
+
+// Returns true if the loop has an unroll_and_jam(enable) pragma.
+static bool hasUnrollAndJamEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+}
+
+// If the loop has an unroll_and_jam_count pragma, return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned unrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
+
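As a usage note (my recollection rather than something this patch states): the "llvm.loop.unroll_and_jam.enable" and "llvm.loop.unroll_and_jam.count" metadata strings checked above are what clang's loop-hint pragmas emit. The snippet below is hypothetical input written for illustration; the pragma spelling is assumed from clang's loop-hint syntax and the names are invented.

// Hypothetical C++ input; not code from this patch.
void scale_rows(float *a, const float *b, int n, int m) {
#pragma clang loop unroll_and_jam_count(4)
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i * m + j] *= b[j];
}

Assuming the front end attaches the count metadata as described, unrollAndJamCountPragmaValue() should then report 4 for the outer loop.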
+// Returns the estimated size of the unrolled loop.
+static uint64_t
+getUnrollAndJammedLoopSize(unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return static_cast<uint64_t>(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
+
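A worked instance of the estimate above may help; only the formula comes from the patch, the numbers are made up for illustration. With UP.BEInsns = 2, a body of 10 instructions unrolled 4 times is estimated at (10 - 2) * 4 + 2 = 34 instructions rather than 40, because the backedge bookkeeping is paid once, not once per copy.

// Standalone sketch of the same arithmetic; names are local to this example.
#include <cstdint>

constexpr uint64_t unrolledSizeEstimate(unsigned LoopSize, unsigned BEInsns,
                                        unsigned Count) {
  return static_cast<uint64_t>(LoopSize - BEInsns) * Count + BEInsns;
}

static_assert(unrolledSizeEstimate(10, 2, 4) == 34,
              "backedge instructions are counted once, not per unrolled copy");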
+// Calculates unroll and jam count and writes it to UP.Count. Returns true if
+// unroll count was set explicitly.
+static bool computeUnrollAndJamCount(
+ Loop *L, Loop *SubLoop, const TargetTransformInfo &TTI, DominatorTree &DT,
+ LoopInfo *LI, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
+ unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP) {
+ // First up use computeUnrollCount from the loop unroller to get a count
+ // for unrolling the outer loop, plus any loops requiring explicit
+ // unrolling we leave to the unroller. This uses UP.Threshold /
+ // UP.PartialThreshold / UP.MaxCount to come up with sensible loop values.
+ // We have already checked that the loop has no unroll.* pragmas.
+ unsigned MaxTripCount = 0;
+ bool UseUpperBound = false;
+ bool ExplicitUnroll = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
+ /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, PP,
+ UseUpperBound);
+ if (ExplicitUnroll || UseUpperBound) {
+ // If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
+ // for the unroller instead.
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; explicit count set by "
+ "computeUnrollCount\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Override with any explicit Count from the "unroll-and-jam-count" option.
+ bool UserUnrollCount = UnrollAndJamCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollAndJamCount;
+ UP.Force = true;
+ if (UP.AllowRemainder &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ // Check for unroll_and_jam pragmas
+ unsigned PragmaCount = unrollAndJamCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (OuterTripMultiple % PragmaCount == 0)) &&
+ getUnrollAndJammedLoopSize(OuterLoopSize, UP) < UP.Threshold &&
+ getUnrollAndJammedLoopSize(InnerLoopSize, UP) <
+ UP.UnrollAndJamInnerLoopThreshold)
+ return true;
+ }
+
+ bool PragmaEnableUnroll = hasUnrollAndJamEnablePragma(L);
+ bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
+ bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
+
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits.
+ if (ExplicitUnrollAndJam)
+ UP.UnrollAndJamInnerLoopThreshold = PragmaUnrollAndJamThreshold;
+
+ if (!UP.AllowRemainder && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't create remainder and "
+ "inner loop too large\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // We have a sensible limit for the outer loop, now adjust it for the inner
+ // loop and UP.UnrollAndJamInnerLoopThreshold. If the outer limit was set
+ // explicitly, we want to stick to it.
+ if (!ExplicitUnrollAndJamCount && UP.AllowRemainder) {
+ while (UP.Count != 0 && getUnrollAndJammedLoopSize(InnerLoopSize, UP) >=
+ UP.UnrollAndJamInnerLoopThreshold)
+ UP.Count--;
+ }
+
+ // If we are explicitly unroll and jamming, we are done. Otherwise there are a
+ // number of extra performance heuristics to check.
+ if (ExplicitUnrollAndJam)
+ return true;
+
+ // If the inner loop count is known and small, leave the entire loop nest to
+ // the unroller.
+ if (InnerTripCount && InnerLoopSize * InnerTripCount < UP.Threshold) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; small inner loop count is "
+ "being left for the unroller\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check for situations where UnJ is likely to be unprofitable, including
+ // subloops with more than one block.
+ if (SubLoop->getBlocks().size() != 1) {
+ LLVM_DEBUG(
+ dbgs() << "Won't unroll-and-jam; More than one inner loop block\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ // Limit to loops where there is something to gain from unrolling and
+ // jamming the loop. In this case, look for loads that are invariant in the
+ // outer loop and can become shared.
+ unsigned NumInvariant = 0;
+ for (BasicBlock *BB : SubLoop->getBlocks()) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ Value *V = Ld->getPointerOperand();
+ const SCEV *LSCEV = SE.getSCEVAtScope(V, L);
+ if (SE.isLoopInvariant(LSCEV, L))
+ NumInvariant++;
+ }
+ }
+ }
+ if (NumInvariant == 0) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; No loop invariant loads\n");
+ UP.Count = 0;
+ return false;
+ }
+
+ return false;
+}
+
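Since the heuristic above keys on outer-loop-invariant loads that can become shared, a minimal before/after sketch of the transformation itself may be useful. This is ordinary C++ written for illustration (names invented, n assumed even so no remainder loop is shown), not code produced by this pass.

// Before: every outer iteration re-loads b[j] across the whole inner loop.
void accumulate(float *a, const float *b, int n, int m) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j)
      a[i] += b[j];
}

// After unroll-and-jam by 2: the outer loop is unrolled and the two copies of
// the inner body are fused ("jammed"), so each b[j] load is shared by both.
void accumulate_unroll_and_jam(float *a, const float *b, int n, int m) {
  for (int i = 0; i < n; i += 2)
    for (int j = 0; j < m; ++j) {
      float bj = b[j];
      a[i] += bj;
      a[i + 1] += bj;
    }
}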
+static LoopUnrollResult
+tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE, int OptLevel) {
+ TargetTransformInfo::UnrollingPreferences UP =
+ gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
+ None, None, None, None, None);
+ TargetTransformInfo::PeelingPreferences PP =
+ gatherPeelingPreferences(L, SE, TTI, None, None);
TransformationMode EnableMode = hasUnrollAndJamTransformation(L);
if (EnableMode & TM_Disable)
@@ -295,242 +295,242 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
if (EnableMode & TM_ForcedByUser)
UP.UnrollAndJam = true;
- if (AllowUnrollAndJam.getNumOccurrences() > 0)
- UP.UnrollAndJam = AllowUnrollAndJam;
- if (UnrollAndJamThreshold.getNumOccurrences() > 0)
- UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
- // Exit early if unrolling is disabled.
- if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
- return LoopUnrollResult::Unmodified;
-
- LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
- << L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
-
- // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
- // the unroller, so long as it does not explicitly have unroll_and_jam
- // metadata. This means #pragma nounroll will disable unroll and jam as well
- // as unrolling
- if (hasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
- !hasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
- LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
- LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Approximate the loop size and collect useful info
- unsigned NumInlineCandidates;
- bool NotDuplicatable;
- bool Convergent;
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
- Loop *SubLoop = L->getSubLoops()[0];
- unsigned InnerLoopSize =
- ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
- Convergent, TTI, EphValues, UP.BEInsns);
- unsigned OuterLoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
- TTI, EphValues, UP.BEInsns);
- LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
- LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
- if (NotDuplicatable) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
- "instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
- if (NumInlineCandidates != 0) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return LoopUnrollResult::Unmodified;
- }
- if (Convergent) {
- LLVM_DEBUG(
- dbgs() << " Not unrolling loop with convergent instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Save original loop IDs for after the transformation.
- MDNode *OrigOuterLoopID = L->getLoopID();
- MDNode *OrigSubLoopID = SubLoop->getLoopID();
-
- // To assign the loop id of the epilogue, assign it before unrolling it so it
- // is applied to every inner loop of the epilogue. We later apply the loop ID
- // for the jammed inner loop.
- Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID(
- OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupRemainderInner});
- if (NewInnerEpilogueLoopID.hasValue())
- SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
-
- // Find trip count and trip multiple
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
- unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
- unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
-
- // Decide if, and by how much, to unroll
- bool IsCountSetExplicitly = computeUnrollAndJamCount(
- L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
- OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP, PP);
- if (UP.Count <= 1)
- return LoopUnrollResult::Unmodified;
- // Unroll factor (Count) must be less than or equal to TripCount.
- if (OuterTripCount && UP.Count > OuterTripCount)
- UP.Count = OuterTripCount;
-
- Loop *EpilogueOuterLoop = nullptr;
- LoopUnrollResult UnrollResult = UnrollAndJamLoop(
- L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
- &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
-
- // Assign new loop attributes.
- if (EpilogueOuterLoop) {
- Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID(
- OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupRemainderOuter});
- if (NewOuterEpilogueLoopID.hasValue())
- EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue());
- }
-
- Optional<MDNode *> NewInnerLoopID =
- makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
- LLVMLoopUnrollAndJamFollowupInner});
- if (NewInnerLoopID.hasValue())
- SubLoop->setLoopID(NewInnerLoopID.getValue());
- else
- SubLoop->setLoopID(OrigSubLoopID);
-
- if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) {
- Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID(
- OrigOuterLoopID,
- {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter});
- if (NewOuterLoopID.hasValue()) {
- L->setLoopID(NewOuterLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if a followup was given.
- return UnrollResult;
- }
- }
-
- // If the loop has an unroll count pragma or was unrolled by an explicitly set
- // count, mark it as unrolled to prevent unrolling beyond the requested count.
- if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
- L->setLoopAlreadyUnrolled();
-
- return UnrollResult;
-}
-
-static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI,
- ScalarEvolution &SE,
- const TargetTransformInfo &TTI,
- AssumptionCache &AC, DependenceInfo &DI,
- OptimizationRemarkEmitter &ORE,
- int OptLevel) {
- bool DidSomething = false;
-
- // The loop unroll and jam pass requires loops to be in simplified form, and
- // also needs LCSSA. Since simplification may add new inner loops, it has to
- // run before the legality and profitability checks. This means running the
- // loop unroll and jam pass will simplify all loops, regardless of whether
- // anything ends up being unrolled and jammed.
- for (auto &L : LI) {
- DidSomething |=
- simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
- DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- // Add the loop nests in the reverse order of LoopInfo. See method
- // declaration.
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
- LoopUnrollResult Result =
- tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel);
- if (Result != LoopUnrollResult::Unmodified)
- DidSomething = true;
- }
-
- return DidSomething;
-}
-
-namespace {
-
-class LoopUnrollAndJam : public FunctionPass {
-public:
- static char ID; // Pass ID, replacement for typeid
- unsigned OptLevel;
-
- LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) {
- initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DependenceAnalysisWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char LoopUnrollAndJam::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
- "Unroll and Jam loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
- "Unroll and Jam loops", false, false)
-
-Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
- return new LoopUnrollAndJam(OptLevel);
-}
-
-PreservedAnalyses LoopUnrollAndJamPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
- AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F);
- OptimizationRemarkEmitter &ORE =
- AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel))
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
+ if (AllowUnrollAndJam.getNumOccurrences() > 0)
+ UP.UnrollAndJam = AllowUnrollAndJam;
+ if (UnrollAndJamThreshold.getNumOccurrences() > 0)
+ UP.UnrollAndJamInnerLoopThreshold = UnrollAndJamThreshold;
+ // Exit early if unrolling is disabled.
+ if (!UP.UnrollAndJam || UP.UnrollAndJamInnerLoopThreshold == 0)
+ return LoopUnrollResult::Unmodified;
+
+ LLVM_DEBUG(dbgs() << "Loop Unroll and Jam: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+
+ // A loop with any unroll pragma (enabling/disabling/count/etc) is left for
+ // the unroller, so long as it does not explicitly have unroll_and_jam
+ // metadata. This means #pragma nounroll will disable unroll and jam as well
+ // as unrolling
+ if (hasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !hasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
+ LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
+ LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Approximate the loop size and collect useful info
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ Loop *SubLoop = L->getSubLoops()[0];
+ unsigned InnerLoopSize =
+ ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
+ Convergent, TTI, EphValues, UP.BEInsns);
+ unsigned OuterLoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Outer Loop Size: " << OuterLoopSize << "\n");
+ LLVM_DEBUG(dbgs() << " Inner Loop Size: " << InnerLoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable "
+ "instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ if (Convergent) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop with convergent instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Save original loop IDs for after the transformation.
+ MDNode *OrigOuterLoopID = L->getLoopID();
+ MDNode *OrigSubLoopID = SubLoop->getLoopID();
+
+ // To assign the loop id of the epilogue, assign it before unrolling it so it
+ // is applied to every inner loop of the epilogue. We later apply the loop ID
+ // for the jammed inner loop.
+ Optional<MDNode *> NewInnerEpilogueLoopID = makeFollowupLoopID(
+ OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupRemainderInner});
+ if (NewInnerEpilogueLoopID.hasValue())
+ SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
+
+ // Find trip count and trip multiple
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+ unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
+ unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
+ unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
+
+ // Decide if, and by how much, to unroll
+ bool IsCountSetExplicitly = computeUnrollAndJamCount(
+ L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP, PP);
+ if (UP.Count <= 1)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less than or equal to TripCount.
+ if (OuterTripCount && UP.Count > OuterTripCount)
+ UP.Count = OuterTripCount;
+
+ Loop *EpilogueOuterLoop = nullptr;
+ LoopUnrollResult UnrollResult = UnrollAndJamLoop(
+ L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
+ &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
+
+ // Assign new loop attributes.
+ if (EpilogueOuterLoop) {
+ Optional<MDNode *> NewOuterEpilogueLoopID = makeFollowupLoopID(
+ OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupRemainderOuter});
+ if (NewOuterEpilogueLoopID.hasValue())
+ EpilogueOuterLoop->setLoopID(NewOuterEpilogueLoopID.getValue());
+ }
+
+ Optional<MDNode *> NewInnerLoopID =
+ makeFollowupLoopID(OrigOuterLoopID, {LLVMLoopUnrollAndJamFollowupAll,
+ LLVMLoopUnrollAndJamFollowupInner});
+ if (NewInnerLoopID.hasValue())
+ SubLoop->setLoopID(NewInnerLoopID.getValue());
+ else
+ SubLoop->setLoopID(OrigSubLoopID);
+
+ if (UnrollResult == LoopUnrollResult::PartiallyUnrolled) {
+ Optional<MDNode *> NewOuterLoopID = makeFollowupLoopID(
+ OrigOuterLoopID,
+ {LLVMLoopUnrollAndJamFollowupAll, LLVMLoopUnrollAndJamFollowupOuter});
+ if (NewOuterLoopID.hasValue()) {
+ L->setLoopID(NewOuterLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if a followup was given.
+ return UnrollResult;
+ }
+ }
+
+  // If the loop has an unroll count pragma or was unrolled by an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled && IsCountSetExplicitly)
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
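For orientation, a minimal source-level sketch of the shape of transformation that UnrollAndJamLoop produces; the loops and arrays below are hypothetical and assume an outer trip count divisible by the unroll factor, so no epilogue is shown:

// Before: a two-deep nest over hypothetical arrays A, B and C.
for (int i = 0; i < N; ++i)
  for (int j = 0; j < M; ++j)
    C[i] += A[i][j] * B[j];

// After unroll-and-jam with UP.Count == 2: two copies of the outer body are
// made and their inner loops are fused ("jammed"), so B[j] is reused across
// both copies of the outer iteration.
for (int i = 0; i < N; i += 2)
  for (int j = 0; j < M; ++j) {
    C[i]     += A[i][j]     * B[j];
    C[i + 1] += A[i + 1][j] * B[j];
  }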
+
+static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI,
+ ScalarEvolution &SE,
+ const TargetTransformInfo &TTI,
+ AssumptionCache &AC, DependenceInfo &DI,
+ OptimizationRemarkEmitter &ORE,
+ int OptLevel) {
+ bool DidSomething = false;
+
+ // The loop unroll and jam pass requires loops to be in simplified form, and
+ // also needs LCSSA. Since simplification may add new inner loops, it has to
+ // run before the legality and profitability checks. This means running the
+ // loop unroll and jam pass will simplify all loops, regardless of whether
+  // anything ends up being unrolled and jammed.
+ for (auto &L : LI) {
+ DidSomething |=
+ simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
+ DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ LoopUnrollResult Result =
+ tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel);
+ if (Result != LoopUnrollResult::Unmodified)
+ DidSomething = true;
+ }
+
+ return DidSomething;
+}
+
+namespace {
+
+class LoopUnrollAndJam : public FunctionPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+ unsigned OptLevel;
+
+ LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) {
+ initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &DI = getAnalysis<DependenceAnalysisWrapperPass>().getDI();
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel);
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DependenceAnalysisWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnrollAndJam::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam",
+ "Unroll and Jam loops", false, false)
+
+Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) {
+ return new LoopUnrollAndJam(OptLevel);
+}
+
+PreservedAnalyses LoopUnrollAndJamPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ DependenceInfo &DI = AM.getResult<DependenceAnalysis>(F);
+ OptimizationRemarkEmitter &ORE =
+ AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel))
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
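A short sketch of driving the factory defined above from client code; the include paths and the surrounding Module/Function plumbing are assumptions for illustration, not something this patch provides:

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h" // assumed declaration site of the factory

// Runs unroll-and-jam on a single function via the legacy pass manager.
void runUnrollAndJamLegacy(llvm::Module &M, llvm::Function &F) {
  llvm::legacy::FunctionPassManager FPM(&M);
  FPM.add(llvm::createLoopUnrollAndJamPass(/*OptLevel=*/3));
  FPM.doInitialization();
  FPM.run(F);
  FPM.doFinalization();
}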
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 02e1f82b54..1b974576a3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1,1472 +1,1472 @@
-//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements a simple loop unroller. It works best when loops have
-// been canonicalized by the -indvars pass, allowing it to determine the trip
-// counts of loops easily.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/LoopUnrollAnalyzer.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils.h"
+//===- LoopUnroll.cpp - Loop unroller pass --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller. It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LoopUnrollPass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <string>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
- "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
- cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+cl::opt<bool> llvm::ForgetSCEVInLoopUnroll(
+ "forget-scev-loop-unroll", cl::init(false), cl::Hidden,
+ cl::desc("Forget everything in SCEV when doing LoopUnroll, instead of just"
" the current top-most loop. This is sometimes preferred to reduce"
- " compile time."));
-
-static cl::opt<unsigned>
- UnrollThreshold("unroll-threshold", cl::Hidden,
- cl::desc("The cost threshold for loop unrolling"));
-
+ " compile time."));
+
+static cl::opt<unsigned>
+ UnrollThreshold("unroll-threshold", cl::Hidden,
+ cl::desc("The cost threshold for loop unrolling"));
+
static cl::opt<unsigned>
UnrollOptSizeThreshold(
"unroll-optsize-threshold", cl::init(0), cl::Hidden,
cl::desc("The cost threshold for loop unrolling when optimizing for "
"size"));
-static cl::opt<unsigned> UnrollPartialThreshold(
- "unroll-partial-threshold", cl::Hidden,
- cl::desc("The cost threshold for partial loop unrolling"));
-
-static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
- "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
- cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
- "to the threshold when aggressively unrolling a loop due to the "
- "dynamic cost savings. If completely unrolling a loop will reduce "
- "the total runtime from X to Y, we boost the loop unroll "
- "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
- "X/Y). This limit avoids excessive code bloat."));
-
-static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
- "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
-    cl::desc("Don't allow loop unrolling to simulate more than this number of"
-             " iterations when checking full unroll profitability"));
-
-static cl::opt<unsigned> UnrollCount(
- "unroll-count", cl::Hidden,
- cl::desc("Use this unroll count for all loops including those with "
- "unroll_count pragma values, for testing purposes"));
-
-static cl::opt<unsigned> UnrollMaxCount(
- "unroll-max-count", cl::Hidden,
-    cl::desc("Set the max unroll count for partial and runtime unrolling, for"
-             " testing purposes"));
-
-static cl::opt<unsigned> UnrollFullMaxCount(
- "unroll-full-max-count", cl::Hidden,
- cl::desc(
- "Set the max unroll count for full unrolling, for testing purposes"));
-
-static cl::opt<bool>
- UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
- cl::desc("Allows loops to be partially unrolled until "
- "-unroll-threshold loop size is reached."));
-
-static cl::opt<bool> UnrollAllowRemainder(
- "unroll-allow-remainder", cl::Hidden,
- cl::desc("Allow generation of a loop remainder (extra iterations) "
- "when unrolling a loop."));
-
-static cl::opt<bool>
- UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
- cl::desc("Unroll loops with run-time trip counts"));
-
-static cl::opt<unsigned> UnrollMaxUpperBound(
- "unroll-max-upperbound", cl::init(8), cl::Hidden,
- cl::desc(
- "The max of trip count upper bound that is considered in unrolling"));
-
-static cl::opt<unsigned> PragmaUnrollThreshold(
- "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
- cl::desc("Unrolled size limit for loops with an unroll(full) or "
- "unroll_count pragma."));
-
-static cl::opt<unsigned> FlatLoopTripCountThreshold(
- "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
- cl::desc("If the runtime tripcount for the loop is lower than the "
- "threshold, the loop is considered as flat and will be less "
- "aggressively unrolled."));
-
-static cl::opt<bool> UnrollUnrollRemainder(
- "unroll-remainder", cl::Hidden,
- cl::desc("Allow the loop remainder to be unrolled."));
-
-// This option isn't ever intended to be enabled, it serves to allow
-// experiments to check the assumptions about when this kind of revisit is
-// necessary.
-static cl::opt<bool> UnrollRevisitChildLoops(
- "unroll-revisit-child-loops", cl::Hidden,
- cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
- "This shouldn't typically be needed as child loops (or their "
- "clones) were already visited."));
-
-static cl::opt<unsigned> UnrollThresholdAggressive(
- "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
- cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
- "optimizations"));
-static cl::opt<unsigned>
- UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
- cl::Hidden,
- cl::desc("Default threshold (max size of unrolled "
- "loop), used in all but O3 optimizations"));
-
-/// A magic value for use with the Threshold parameter to indicate
-/// that the loop unroll should be performed regardless of how much
-/// code expansion would result.
-static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
-
-/// Gather the various unrolling parameters based on the defaults, compiler
-/// flags, TTI overrides and user specified parameters.
-TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
- Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
- Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
- Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
- Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
- TargetTransformInfo::UnrollingPreferences UP;
-
- // Set up the defaults
- UP.Threshold =
- OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
- UP.MaxPercentThresholdBoost = 400;
+static cl::opt<unsigned> UnrollPartialThreshold(
+ "unroll-partial-threshold", cl::Hidden,
+ cl::desc("The cost threshold for partial loop unrolling"));
+
+static cl::opt<unsigned> UnrollMaxPercentThresholdBoost(
+ "unroll-max-percent-threshold-boost", cl::init(400), cl::Hidden,
+ cl::desc("The maximum 'boost' (represented as a percentage >= 100) applied "
+ "to the threshold when aggressively unrolling a loop due to the "
+ "dynamic cost savings. If completely unrolling a loop will reduce "
+ "the total runtime from X to Y, we boost the loop unroll "
+ "threshold to DefaultThreshold*std::min(MaxPercentThresholdBoost, "
+ "X/Y). This limit avoids excessive code bloat."));
+
+static cl::opt<unsigned> UnrollMaxIterationsCountToAnalyze(
+ "unroll-max-iteration-count-to-analyze", cl::init(10), cl::Hidden,
+    cl::desc("Don't allow loop unrolling to simulate more than this number of"
+             " iterations when checking full unroll profitability"));
+
+static cl::opt<unsigned> UnrollCount(
+ "unroll-count", cl::Hidden,
+ cl::desc("Use this unroll count for all loops including those with "
+ "unroll_count pragma values, for testing purposes"));
+
+static cl::opt<unsigned> UnrollMaxCount(
+ "unroll-max-count", cl::Hidden,
+    cl::desc("Set the max unroll count for partial and runtime unrolling, for"
+             " testing purposes"));
+
+static cl::opt<unsigned> UnrollFullMaxCount(
+ "unroll-full-max-count", cl::Hidden,
+ cl::desc(
+ "Set the max unroll count for full unrolling, for testing purposes"));
+
+static cl::opt<bool>
+ UnrollAllowPartial("unroll-allow-partial", cl::Hidden,
+ cl::desc("Allows loops to be partially unrolled until "
+ "-unroll-threshold loop size is reached."));
+
+static cl::opt<bool> UnrollAllowRemainder(
+ "unroll-allow-remainder", cl::Hidden,
+ cl::desc("Allow generation of a loop remainder (extra iterations) "
+ "when unrolling a loop."));
+
+static cl::opt<bool>
+ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden,
+ cl::desc("Unroll loops with run-time trip counts"));
+
+static cl::opt<unsigned> UnrollMaxUpperBound(
+ "unroll-max-upperbound", cl::init(8), cl::Hidden,
+ cl::desc(
+ "The max of trip count upper bound that is considered in unrolling"));
+
+static cl::opt<unsigned> PragmaUnrollThreshold(
+ "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden,
+ cl::desc("Unrolled size limit for loops with an unroll(full) or "
+ "unroll_count pragma."));
+
+static cl::opt<unsigned> FlatLoopTripCountThreshold(
+ "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden,
+ cl::desc("If the runtime tripcount for the loop is lower than the "
+ "threshold, the loop is considered as flat and will be less "
+ "aggressively unrolled."));
+
+static cl::opt<bool> UnrollUnrollRemainder(
+ "unroll-remainder", cl::Hidden,
+ cl::desc("Allow the loop remainder to be unrolled."));
+
+// This option isn't ever intended to be enabled, it serves to allow
+// experiments to check the assumptions about when this kind of revisit is
+// necessary.
+static cl::opt<bool> UnrollRevisitChildLoops(
+ "unroll-revisit-child-loops", cl::Hidden,
+ cl::desc("Enqueue and re-visit child loops in the loop PM after unrolling. "
+ "This shouldn't typically be needed as child loops (or their "
+ "clones) were already visited."));
+
+static cl::opt<unsigned> UnrollThresholdAggressive(
+ "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
+ cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
+ "optimizations"));
+static cl::opt<unsigned>
+ UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
+ cl::Hidden,
+ cl::desc("Default threshold (max size of unrolled "
+ "loop), used in all but O3 optimizations"));
+
+/// A magic value for use with the Threshold parameter to indicate
+/// that the loop unroll should be performed regardless of how much
+/// code expansion would result.
+static const unsigned NoThreshold = std::numeric_limits<unsigned>::max();
+
+/// Gather the various unrolling parameters based on the defaults, compiler
+/// flags, TTI overrides and user specified parameters.
+TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
+ Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
+ Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+ Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+ Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
+ TargetTransformInfo::UnrollingPreferences UP;
+
+ // Set up the defaults
+ UP.Threshold =
+ OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
+ UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = UnrollOptSizeThreshold;
- UP.PartialThreshold = 150;
+ UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = UnrollOptSizeThreshold;
- UP.Count = 0;
- UP.DefaultUnrollRuntimeCount = 8;
- UP.MaxCount = std::numeric_limits<unsigned>::max();
- UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
- UP.BEInsns = 2;
- UP.Partial = false;
- UP.Runtime = false;
- UP.AllowRemainder = true;
- UP.UnrollRemainder = false;
- UP.AllowExpensiveTripCount = false;
- UP.Force = false;
- UP.UpperBound = false;
- UP.UnrollAndJam = false;
- UP.UnrollAndJamInnerLoopThreshold = 60;
- UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
-
- // Override with any target specific settings
- TTI.getUnrollingPreferences(L, SE, UP);
-
- // Apply size attributes
- bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
+ UP.Count = 0;
+ UP.DefaultUnrollRuntimeCount = 8;
+ UP.MaxCount = std::numeric_limits<unsigned>::max();
+ UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
+ UP.BEInsns = 2;
+ UP.Partial = false;
+ UP.Runtime = false;
+ UP.AllowRemainder = true;
+ UP.UnrollRemainder = false;
+ UP.AllowExpensiveTripCount = false;
+ UP.Force = false;
+ UP.UpperBound = false;
+ UP.UnrollAndJam = false;
+ UP.UnrollAndJamInnerLoopThreshold = 60;
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
+
+ // Override with any target specific settings
+ TTI.getUnrollingPreferences(L, SE, UP);
+
+ // Apply size attributes
+ bool OptForSize = L->getHeader()->getParent()->hasOptSize() ||
// Let unroll hints / pragmas take precedence over PGSO.
(hasUnrollTransformation(L) != TM_ForcedByUser &&
llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass));
- if (OptForSize) {
- UP.Threshold = UP.OptSizeThreshold;
- UP.PartialThreshold = UP.PartialOptSizeThreshold;
- UP.MaxPercentThresholdBoost = 100;
- }
-
- // Apply any user values specified by cl::opt
- if (UnrollThreshold.getNumOccurrences() > 0)
- UP.Threshold = UnrollThreshold;
- if (UnrollPartialThreshold.getNumOccurrences() > 0)
- UP.PartialThreshold = UnrollPartialThreshold;
- if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
- UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
- if (UnrollMaxCount.getNumOccurrences() > 0)
- UP.MaxCount = UnrollMaxCount;
- if (UnrollFullMaxCount.getNumOccurrences() > 0)
- UP.FullUnrollMaxCount = UnrollFullMaxCount;
- if (UnrollAllowPartial.getNumOccurrences() > 0)
- UP.Partial = UnrollAllowPartial;
- if (UnrollAllowRemainder.getNumOccurrences() > 0)
- UP.AllowRemainder = UnrollAllowRemainder;
- if (UnrollRuntime.getNumOccurrences() > 0)
- UP.Runtime = UnrollRuntime;
- if (UnrollMaxUpperBound == 0)
- UP.UpperBound = false;
- if (UnrollUnrollRemainder.getNumOccurrences() > 0)
- UP.UnrollRemainder = UnrollUnrollRemainder;
- if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
- UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
-
- // Apply user values provided by argument
- if (UserThreshold.hasValue()) {
- UP.Threshold = *UserThreshold;
- UP.PartialThreshold = *UserThreshold;
- }
- if (UserCount.hasValue())
- UP.Count = *UserCount;
- if (UserAllowPartial.hasValue())
- UP.Partial = *UserAllowPartial;
- if (UserRuntime.hasValue())
- UP.Runtime = *UserRuntime;
- if (UserUpperBound.hasValue())
- UP.UpperBound = *UserUpperBound;
- if (UserFullUnrollMaxCount.hasValue())
- UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
-
- return UP;
-}
-
-namespace {
-
-/// A struct to densely store the state of an instruction after unrolling at
-/// each iteration.
-///
-/// This is designed to work like a tuple of <Instruction *, int> for the
-/// purposes of hashing and lookup, but to be able to associate two boolean
-/// states with each key.
-struct UnrolledInstState {
- Instruction *I;
- int Iteration : 30;
- unsigned IsFree : 1;
- unsigned IsCounted : 1;
-};
-
-/// Hashing and equality testing for a set of the instruction states.
-struct UnrolledInstStateKeyInfo {
- using PtrInfo = DenseMapInfo<Instruction *>;
- using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
-
- static inline UnrolledInstState getEmptyKey() {
- return {PtrInfo::getEmptyKey(), 0, 0, 0};
- }
-
- static inline UnrolledInstState getTombstoneKey() {
- return {PtrInfo::getTombstoneKey(), 0, 0, 0};
- }
-
- static inline unsigned getHashValue(const UnrolledInstState &S) {
- return PairInfo::getHashValue({S.I, S.Iteration});
- }
-
- static inline bool isEqual(const UnrolledInstState &LHS,
- const UnrolledInstState &RHS) {
- return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
- }
-};
-
-struct EstimatedUnrollCost {
- /// The estimated cost after unrolling.
- unsigned UnrolledCost;
-
- /// The estimated dynamic cost of executing the instructions in the
- /// rolled form.
- unsigned RolledDynamicCost;
-};
-
-} // end anonymous namespace
-
-/// Figure out if the loop is worth full unrolling.
-///
-/// Complete loop unrolling can make some loads constant, and we need to know
-/// if that would expose any further optimization opportunities. This routine
-/// estimates this optimization. It computes cost of unrolled loop
-/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
-/// dynamic cost we mean that we won't count costs of blocks that are known not
-/// to be executed (i.e. if we have a branch in the loop and we know that at the
-/// given iteration its condition would be resolved to true, we won't add up the
-/// cost of the 'false'-block).
-/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
-/// the analysis failed (no benefits expected from the unrolling, or the loop is
-/// too big to analyze), the returned value is None.
-static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
- const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
- const SmallPtrSetImpl<const Value *> &EphValues,
- const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
- unsigned MaxIterationsCountToAnalyze) {
- // We want to be able to scale offsets by the trip count and add more offsets
- // to them without checking for overflows, and we already don't want to
- // analyze *massive* trip counts, so we force the max to be reasonably small.
- assert(MaxIterationsCountToAnalyze <
- (unsigned)(std::numeric_limits<int>::max() / 2) &&
- "The unroll iterations max is too large!");
-
- // Only analyze inner loops. We can't properly estimate cost of nested loops
- // and we won't visit inner loops again anyway.
+ if (OptForSize) {
+ UP.Threshold = UP.OptSizeThreshold;
+ UP.PartialThreshold = UP.PartialOptSizeThreshold;
+ UP.MaxPercentThresholdBoost = 100;
+ }
+
+ // Apply any user values specified by cl::opt
+ if (UnrollThreshold.getNumOccurrences() > 0)
+ UP.Threshold = UnrollThreshold;
+ if (UnrollPartialThreshold.getNumOccurrences() > 0)
+ UP.PartialThreshold = UnrollPartialThreshold;
+ if (UnrollMaxPercentThresholdBoost.getNumOccurrences() > 0)
+ UP.MaxPercentThresholdBoost = UnrollMaxPercentThresholdBoost;
+ if (UnrollMaxCount.getNumOccurrences() > 0)
+ UP.MaxCount = UnrollMaxCount;
+ if (UnrollFullMaxCount.getNumOccurrences() > 0)
+ UP.FullUnrollMaxCount = UnrollFullMaxCount;
+ if (UnrollAllowPartial.getNumOccurrences() > 0)
+ UP.Partial = UnrollAllowPartial;
+ if (UnrollAllowRemainder.getNumOccurrences() > 0)
+ UP.AllowRemainder = UnrollAllowRemainder;
+ if (UnrollRuntime.getNumOccurrences() > 0)
+ UP.Runtime = UnrollRuntime;
+ if (UnrollMaxUpperBound == 0)
+ UP.UpperBound = false;
+ if (UnrollUnrollRemainder.getNumOccurrences() > 0)
+ UP.UnrollRemainder = UnrollUnrollRemainder;
+ if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
+
+ // Apply user values provided by argument
+ if (UserThreshold.hasValue()) {
+ UP.Threshold = *UserThreshold;
+ UP.PartialThreshold = *UserThreshold;
+ }
+ if (UserCount.hasValue())
+ UP.Count = *UserCount;
+ if (UserAllowPartial.hasValue())
+ UP.Partial = *UserAllowPartial;
+ if (UserRuntime.hasValue())
+ UP.Runtime = *UserRuntime;
+ if (UserUpperBound.hasValue())
+ UP.UpperBound = *UserUpperBound;
+ if (UserFullUnrollMaxCount.hasValue())
+ UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
+
+ return UP;
+}
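A worked example of how the layers above combine; the call itself is hypothetical and only the numeric defaults are taken from this function:

//   auto UP = gatherUnrollingPreferences(L, SE, TTI, /*BFI=*/nullptr,
//                                        /*PSI=*/nullptr, /*OptLevel=*/2,
//                                        None, None, None, None, None, None);
// With no cl::opt overrides this yields UP.Threshold == 150
// (UnrollThresholdDefault) and UP.PartialThreshold == 150; at OptLevel 3 the
// starting Threshold is 300 (UnrollThresholdAggressive). A -unroll-threshold=N
// flag then overrides the default, and a caller-supplied UserThreshold
// overrides both, because the user arguments are applied last.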
+
+namespace {
+
+/// A struct to densely store the state of an instruction after unrolling at
+/// each iteration.
+///
+/// This is designed to work like a tuple of <Instruction *, int> for the
+/// purposes of hashing and lookup, but to be able to associate two boolean
+/// states with each key.
+struct UnrolledInstState {
+ Instruction *I;
+ int Iteration : 30;
+ unsigned IsFree : 1;
+ unsigned IsCounted : 1;
+};
+
+/// Hashing and equality testing for a set of the instruction states.
+struct UnrolledInstStateKeyInfo {
+ using PtrInfo = DenseMapInfo<Instruction *>;
+ using PairInfo = DenseMapInfo<std::pair<Instruction *, int>>;
+
+ static inline UnrolledInstState getEmptyKey() {
+ return {PtrInfo::getEmptyKey(), 0, 0, 0};
+ }
+
+ static inline UnrolledInstState getTombstoneKey() {
+ return {PtrInfo::getTombstoneKey(), 0, 0, 0};
+ }
+
+ static inline unsigned getHashValue(const UnrolledInstState &S) {
+ return PairInfo::getHashValue({S.I, S.Iteration});
+ }
+
+ static inline bool isEqual(const UnrolledInstState &LHS,
+ const UnrolledInstState &RHS) {
+ return PairInfo::isEqual({LHS.I, LHS.Iteration}, {RHS.I, RHS.Iteration});
+ }
+};
+
+struct EstimatedUnrollCost {
+ /// The estimated cost after unrolling.
+ unsigned UnrolledCost;
+
+ /// The estimated dynamic cost of executing the instructions in the
+ /// rolled form.
+ unsigned RolledDynamicCost;
+};
+
+} // end anonymous namespace
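A minimal usage sketch of the key traits above, mirroring the InstCostMap container used later in this file; the instruction pointer is a placeholder:

llvm::DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
llvm::Instruction *I = /*some in-loop instruction*/ nullptr;
InstCostMap.insert({I, /*Iteration=*/3, /*IsFree=*/1, /*IsCounted=*/0});
// Lookup hashes and compares only {I, Iteration}; the two flag bits in the
// probe key are ignored, so zeros are fine here.
auto It = InstCostMap.find({I, 3, 0, 0});
bool WasFree = It != InstCostMap.end() && It->IsFree;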
+
+/// Figure out if the loop is worth full unrolling.
+///
+/// Complete loop unrolling can make some loads constant, and we need to know
+/// if that would expose any further optimization opportunities. This routine
+/// estimates this optimization. It computes cost of unrolled loop
+/// (UnrolledCost) and dynamic cost of the original loop (RolledDynamicCost). By
+/// dynamic cost we mean that we won't count costs of blocks that are known not
+/// to be executed (i.e. if we have a branch in the loop and we know that at the
+/// given iteration its condition would be resolved to true, we won't add up the
+/// cost of the 'false'-block).
+/// \returns Optional value, holding the RolledDynamicCost and UnrolledCost. If
+/// the analysis failed (no benefits expected from the unrolling, or the loop is
+/// too big to analyze), the returned value is None.
+static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
+ const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
+ const SmallPtrSetImpl<const Value *> &EphValues,
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
+ unsigned MaxIterationsCountToAnalyze) {
+ // We want to be able to scale offsets by the trip count and add more offsets
+ // to them without checking for overflows, and we already don't want to
+ // analyze *massive* trip counts, so we force the max to be reasonably small.
+ assert(MaxIterationsCountToAnalyze <
+ (unsigned)(std::numeric_limits<int>::max() / 2) &&
+ "The unroll iterations max is too large!");
+
+ // Only analyze inner loops. We can't properly estimate cost of nested loops
+ // and we won't visit inner loops again anyway.
if (!L->isInnermost())
- return None;
-
- // Don't simulate loops with a big or unknown tripcount
- if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
- return None;
-
- SmallSetVector<BasicBlock *, 16> BBWorklist;
- SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
- DenseMap<Value *, Constant *> SimplifiedValues;
- SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
-
- // The estimated cost of the unrolled form of the loop. We try to estimate
- // this by simplifying as much as we can while computing the estimate.
- unsigned UnrolledCost = 0;
-
- // We also track the estimated dynamic (that is, actually executed) cost in
- // the rolled form. This helps identify cases when the savings from unrolling
- // aren't just exposing dead control flows, but actual reduced dynamic
- // instructions due to the simplifications which we expect to occur after
- // unrolling.
- unsigned RolledDynamicCost = 0;
-
- // We track the simplification of each instruction in each iteration. We use
- // this to recursively merge costs into the unrolled cost on-demand so that
- // we don't count the cost of any dead code. This is essentially a map from
- // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
- DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
-
- // A small worklist used to accumulate cost of instructions from each
- // observable and reached root in the loop.
- SmallVector<Instruction *, 16> CostWorklist;
-
- // PHI-used worklist used between iterations while accumulating cost.
- SmallVector<Instruction *, 4> PHIUsedList;
-
- // Helper function to accumulate cost for instructions in the loop.
- auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
- assert(Iteration >= 0 && "Cannot have a negative iteration!");
- assert(CostWorklist.empty() && "Must start with an empty cost list");
- assert(PHIUsedList.empty() && "Must start with an empty phi used list");
- CostWorklist.push_back(&RootI);
+ return None;
+
+ // Don't simulate loops with a big or unknown tripcount
+ if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
+ return None;
+
+ SmallSetVector<BasicBlock *, 16> BBWorklist;
+ SmallSetVector<std::pair<BasicBlock *, BasicBlock *>, 4> ExitWorklist;
+ DenseMap<Value *, Constant *> SimplifiedValues;
+ SmallVector<std::pair<Value *, Constant *>, 4> SimplifiedInputValues;
+
+ // The estimated cost of the unrolled form of the loop. We try to estimate
+ // this by simplifying as much as we can while computing the estimate.
+ unsigned UnrolledCost = 0;
+
+ // We also track the estimated dynamic (that is, actually executed) cost in
+ // the rolled form. This helps identify cases when the savings from unrolling
+ // aren't just exposing dead control flows, but actual reduced dynamic
+ // instructions due to the simplifications which we expect to occur after
+ // unrolling.
+ unsigned RolledDynamicCost = 0;
+
+ // We track the simplification of each instruction in each iteration. We use
+ // this to recursively merge costs into the unrolled cost on-demand so that
+ // we don't count the cost of any dead code. This is essentially a map from
+ // <instruction, int> to <bool, bool>, but stored as a densely packed struct.
+ DenseSet<UnrolledInstState, UnrolledInstStateKeyInfo> InstCostMap;
+
+ // A small worklist used to accumulate cost of instructions from each
+ // observable and reached root in the loop.
+ SmallVector<Instruction *, 16> CostWorklist;
+
+ // PHI-used worklist used between iterations while accumulating cost.
+ SmallVector<Instruction *, 4> PHIUsedList;
+
+ // Helper function to accumulate cost for instructions in the loop.
+ auto AddCostRecursively = [&](Instruction &RootI, int Iteration) {
+ assert(Iteration >= 0 && "Cannot have a negative iteration!");
+ assert(CostWorklist.empty() && "Must start with an empty cost list");
+ assert(PHIUsedList.empty() && "Must start with an empty phi used list");
+ CostWorklist.push_back(&RootI);
TargetTransformInfo::TargetCostKind CostKind =
RootI.getFunction()->hasMinSize() ?
TargetTransformInfo::TCK_CodeSize :
TargetTransformInfo::TCK_SizeAndLatency;
- for (;; --Iteration) {
- do {
- Instruction *I = CostWorklist.pop_back_val();
-
- // InstCostMap only uses I and Iteration as a key, the other two values
- // don't matter here.
- auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
- if (CostIter == InstCostMap.end())
- // If an input to a PHI node comes from a dead path through the loop
- // we may have no cost data for it here. What that actually means is
- // that it is free.
- continue;
- auto &Cost = *CostIter;
- if (Cost.IsCounted)
- // Already counted this instruction.
- continue;
-
- // Mark that we are counting the cost of this instruction now.
- Cost.IsCounted = true;
-
- // If this is a PHI node in the loop header, just add it to the PHI set.
- if (auto *PhiI = dyn_cast<PHINode>(I))
- if (PhiI->getParent() == L->getHeader()) {
- assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
- "inherently simplify during unrolling.");
- if (Iteration == 0)
- continue;
-
- // Push the incoming value from the backedge into the PHI used list
- // if it is an in-loop instruction. We'll use this to populate the
- // cost worklist for the next iteration (as we count backwards).
- if (auto *OpI = dyn_cast<Instruction>(
- PhiI->getIncomingValueForBlock(L->getLoopLatch())))
- if (L->contains(OpI))
- PHIUsedList.push_back(OpI);
- continue;
- }
-
- // First accumulate the cost of this instruction.
- if (!Cost.IsFree) {
+ for (;; --Iteration) {
+ do {
+ Instruction *I = CostWorklist.pop_back_val();
+
+ // InstCostMap only uses I and Iteration as a key, the other two values
+ // don't matter here.
+ auto CostIter = InstCostMap.find({I, Iteration, 0, 0});
+ if (CostIter == InstCostMap.end())
+ // If an input to a PHI node comes from a dead path through the loop
+ // we may have no cost data for it here. What that actually means is
+ // that it is free.
+ continue;
+ auto &Cost = *CostIter;
+ if (Cost.IsCounted)
+ // Already counted this instruction.
+ continue;
+
+ // Mark that we are counting the cost of this instruction now.
+ Cost.IsCounted = true;
+
+ // If this is a PHI node in the loop header, just add it to the PHI set.
+ if (auto *PhiI = dyn_cast<PHINode>(I))
+ if (PhiI->getParent() == L->getHeader()) {
+ assert(Cost.IsFree && "Loop PHIs shouldn't be evaluated as they "
+ "inherently simplify during unrolling.");
+ if (Iteration == 0)
+ continue;
+
+ // Push the incoming value from the backedge into the PHI used list
+ // if it is an in-loop instruction. We'll use this to populate the
+ // cost worklist for the next iteration (as we count backwards).
+ if (auto *OpI = dyn_cast<Instruction>(
+ PhiI->getIncomingValueForBlock(L->getLoopLatch())))
+ if (L->contains(OpI))
+ PHIUsedList.push_back(OpI);
+ continue;
+ }
+
+ // First accumulate the cost of this instruction.
+ if (!Cost.IsFree) {
UnrolledCost += TTI.getUserCost(I, CostKind);
- LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
- << Iteration << "): ");
- LLVM_DEBUG(I->dump());
- }
-
- // We must count the cost of every operand which is not free,
- // recursively. If we reach a loop PHI node, simply add it to the set
- // to be considered on the next iteration (backwards!).
- for (Value *Op : I->operands()) {
- // Check whether this operand is free due to being a constant or
- // outside the loop.
- auto *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || !L->contains(OpI))
- continue;
-
- // Otherwise accumulate its cost.
- CostWorklist.push_back(OpI);
- }
- } while (!CostWorklist.empty());
-
- if (PHIUsedList.empty())
- // We've exhausted the search.
- break;
-
- assert(Iteration > 0 &&
- "Cannot track PHI-used values past the first iteration!");
- CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
- PHIUsedList.clear();
- }
- };
-
- // Ensure that we don't violate the loop structure invariants relied on by
- // this analysis.
- assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
- assert(L->isLCSSAForm(DT) &&
- "Must have loops in LCSSA form to track live-out values.");
-
- LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
-
+ LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
+ << Iteration << "): ");
+ LLVM_DEBUG(I->dump());
+ }
+
+ // We must count the cost of every operand which is not free,
+ // recursively. If we reach a loop PHI node, simply add it to the set
+ // to be considered on the next iteration (backwards!).
+ for (Value *Op : I->operands()) {
+ // Check whether this operand is free due to being a constant or
+ // outside the loop.
+ auto *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || !L->contains(OpI))
+ continue;
+
+ // Otherwise accumulate its cost.
+ CostWorklist.push_back(OpI);
+ }
+ } while (!CostWorklist.empty());
+
+ if (PHIUsedList.empty())
+ // We've exhausted the search.
+ break;
+
+ assert(Iteration > 0 &&
+ "Cannot track PHI-used values past the first iteration!");
+ CostWorklist.append(PHIUsedList.begin(), PHIUsedList.end());
+ PHIUsedList.clear();
+ }
+ };
+
+ // Ensure that we don't violate the loop structure invariants relied on by
+ // this analysis.
+ assert(L->isLoopSimplifyForm() && "Must put loop into normal form first.");
+ assert(L->isLCSSAForm(DT) &&
+ "Must have loops in LCSSA form to track live-out values.");
+
+ LLVM_DEBUG(dbgs() << "Starting LoopUnroll profitability analysis...\n");
+
TargetTransformInfo::TargetCostKind CostKind =
L->getHeader()->getParent()->hasMinSize() ?
TargetTransformInfo::TCK_CodeSize : TargetTransformInfo::TCK_SizeAndLatency;
- // Simulate execution of each iteration of the loop counting instructions,
- // which would be simplified.
- // Since the same load will take different values on different iterations,
-  // we literally have to go through all of the loop's iterations.
- for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
- LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
-
- // Prepare for the iteration by collecting any simplified entry or backedge
- // inputs.
- for (Instruction &I : *L->getHeader()) {
- auto *PHI = dyn_cast<PHINode>(&I);
- if (!PHI)
- break;
-
-      // The loop header PHI nodes must have exactly two inputs: one from the
- // loop preheader and one from the loop latch.
- assert(
- PHI->getNumIncomingValues() == 2 &&
- "Must have an incoming value only for the preheader and the latch.");
-
- Value *V = PHI->getIncomingValueForBlock(
- Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
- Constant *C = dyn_cast<Constant>(V);
- if (Iteration != 0 && !C)
- C = SimplifiedValues.lookup(V);
- if (C)
- SimplifiedInputValues.push_back({PHI, C});
- }
-
- // Now clear and re-populate the map for the next iteration.
- SimplifiedValues.clear();
- while (!SimplifiedInputValues.empty())
- SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
-
- UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
-
- BBWorklist.clear();
- BBWorklist.insert(L->getHeader());
- // Note that we *must not* cache the size, this loop grows the worklist.
- for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
- BasicBlock *BB = BBWorklist[Idx];
-
- // Visit all instructions in the given basic block and try to simplify
- // it. We don't change the actual IR, just count optimization
- // opportunities.
- for (Instruction &I : *BB) {
- // These won't get into the final code - don't even try calculating the
- // cost for them.
- if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
- continue;
-
- // Track this instruction's expected baseline cost when executing the
- // rolled loop form.
+ // Simulate execution of each iteration of the loop counting instructions,
+ // which would be simplified.
+ // Since the same load will take different values on different iterations,
+  // we literally have to go through all of the loop's iterations.
+ for (unsigned Iteration = 0; Iteration < TripCount; ++Iteration) {
+ LLVM_DEBUG(dbgs() << " Analyzing iteration " << Iteration << "\n");
+
+ // Prepare for the iteration by collecting any simplified entry or backedge
+ // inputs.
+ for (Instruction &I : *L->getHeader()) {
+ auto *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+
+      // The loop header PHI nodes must have exactly two inputs: one from the
+ // loop preheader and one from the loop latch.
+ assert(
+ PHI->getNumIncomingValues() == 2 &&
+ "Must have an incoming value only for the preheader and the latch.");
+
+ Value *V = PHI->getIncomingValueForBlock(
+ Iteration == 0 ? L->getLoopPreheader() : L->getLoopLatch());
+ Constant *C = dyn_cast<Constant>(V);
+ if (Iteration != 0 && !C)
+ C = SimplifiedValues.lookup(V);
+ if (C)
+ SimplifiedInputValues.push_back({PHI, C});
+ }
+
+ // Now clear and re-populate the map for the next iteration.
+ SimplifiedValues.clear();
+ while (!SimplifiedInputValues.empty())
+ SimplifiedValues.insert(SimplifiedInputValues.pop_back_val());
+
+ UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L);
+
+ BBWorklist.clear();
+ BBWorklist.insert(L->getHeader());
+ // Note that we *must not* cache the size, this loop grows the worklist.
+ for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
+ BasicBlock *BB = BBWorklist[Idx];
+
+ // Visit all instructions in the given basic block and try to simplify
+ // it. We don't change the actual IR, just count optimization
+ // opportunities.
+ for (Instruction &I : *BB) {
+ // These won't get into the final code - don't even try calculating the
+ // cost for them.
+ if (isa<DbgInfoIntrinsic>(I) || EphValues.count(&I))
+ continue;
+
+ // Track this instruction's expected baseline cost when executing the
+ // rolled loop form.
RolledDynamicCost += TTI.getUserCost(&I, CostKind);
-
- // Visit the instruction to analyze its loop cost after unrolling,
- // and if the visitor returns true, mark the instruction as free after
- // unrolling and continue.
- bool IsFree = Analyzer.visit(I);
- bool Inserted = InstCostMap.insert({&I, (int)Iteration,
- (unsigned)IsFree,
- /*IsCounted*/ false}).second;
- (void)Inserted;
- assert(Inserted && "Cannot have a state for an unvisited instruction!");
-
- if (IsFree)
- continue;
-
- // Can't properly model a cost of a call.
- // FIXME: With a proper cost model we should be able to do it.
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- const Function *Callee = CI->getCalledFunction();
- if (!Callee || TTI.isLoweredToCall(Callee)) {
- LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
- return None;
- }
- }
-
- // If the instruction might have a side-effect recursively account for
- // the cost of it and all the instructions leading up to it.
- if (I.mayHaveSideEffects())
- AddCostRecursively(I, Iteration);
-
- // If unrolled body turns out to be too big, bail out.
- if (UnrolledCost > MaxUnrolledLoopSize) {
- LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost
- << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
- << "\n");
- return None;
- }
- }
-
- Instruction *TI = BB->getTerminator();
-
-      // Add in the live successors by first checking whether we have a terminator
- // that may be simplified based on the values simplified by this call.
- BasicBlock *KnownSucc = nullptr;
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional()) {
- if (Constant *SimpleCond =
- SimplifiedValues.lookup(BI->getCondition())) {
- // Just take the first successor if condition is undef
- if (isa<UndefValue>(SimpleCond))
- KnownSucc = BI->getSuccessor(0);
- else if (ConstantInt *SimpleCondVal =
- dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
- }
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- if (Constant *SimpleCond =
- SimplifiedValues.lookup(SI->getCondition())) {
- // Just take the first successor if condition is undef
- if (isa<UndefValue>(SimpleCond))
- KnownSucc = SI->getSuccessor(0);
- else if (ConstantInt *SimpleCondVal =
- dyn_cast<ConstantInt>(SimpleCond))
- KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
- }
- }
- if (KnownSucc) {
- if (L->contains(KnownSucc))
- BBWorklist.insert(KnownSucc);
- else
- ExitWorklist.insert({BB, KnownSucc});
- continue;
- }
-
- // Add BB's successors to the worklist.
- for (BasicBlock *Succ : successors(BB))
- if (L->contains(Succ))
- BBWorklist.insert(Succ);
- else
- ExitWorklist.insert({BB, Succ});
- AddCostRecursively(*TI, Iteration);
- }
-
- // If we found no optimization opportunities on the first iteration, we
- // won't find them on later ones too.
- if (UnrolledCost == RolledDynamicCost) {
- LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
- << " UnrolledCost: " << UnrolledCost << "\n");
- return None;
- }
- }
-
- while (!ExitWorklist.empty()) {
- BasicBlock *ExitingBB, *ExitBB;
- std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
-
- for (Instruction &I : *ExitBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- Value *Op = PN->getIncomingValueForBlock(ExitingBB);
- if (auto *OpI = dyn_cast<Instruction>(Op))
- if (L->contains(OpI))
- AddCostRecursively(*OpI, TripCount - 1);
- }
- }
-
- LLVM_DEBUG(dbgs() << "Analysis finished:\n"
- << "UnrolledCost: " << UnrolledCost << ", "
- << "RolledDynamicCost: " << RolledDynamicCost << "\n");
- return {{UnrolledCost, RolledDynamicCost}};
-}
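For intuition about the savings this simulation looks for, a hypothetical source loop (not from this patch) whose control flow folds away once the trip count is known:

static const int Mask[4] = {1, 0, 0, 1};
int SumMasked(const int *Values, int Scale) {
  int Total = 0;
  for (int i = 0; i < 4; ++i)
    if (Mask[i])                  // folds to a constant on each iteration
      Total += Values[i] * Scale;
  return Total;
}
// Simulating iterations 0..3, the load of Mask[i] and the compare simplify to
// constants and are treated as free, and the bodies of iterations 1 and 2 are
// never visited at all, so UnrolledCost stays small while RolledDynamicCost
// still charges the load/compare/branch on every iteration -- exactly the gap
// that the boosting-factor logic below turns into a higher unroll threshold.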
-
-/// ApproximateLoopSize - Approximate the size of the loop.
-unsigned llvm::ApproximateLoopSize(
- const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
- const TargetTransformInfo &TTI,
- const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
- CodeMetrics Metrics;
- for (BasicBlock *BB : L->blocks())
- Metrics.analyzeBasicBlock(BB, TTI, EphValues);
- NumCalls = Metrics.NumInlineCandidates;
- NotDuplicatable = Metrics.notDuplicatable;
- Convergent = Metrics.convergent;
-
- unsigned LoopSize = Metrics.NumInsts;
-
-  // Don't allow an estimate of size zero. This would allow unrolling of loops
- // with huge iteration counts, which is a compile time problem even if it's
- // not a problem for code quality. Also, the code using this size may assume
- // that each loop has at least three instructions (likely a conditional
- // branch, a comparison feeding that branch, and some kind of loop increment
- // feeding that comparison instruction).
- LoopSize = std::max(LoopSize, BEInsns + 1);
-
- return LoopSize;
-}
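A small worked instance of the size floor, with illustrative numbers:

// If Metrics.NumInsts is 2 and BEInsns is 2 (the default picked in
// gatherUnrollingPreferences above), the result is std::max(2u, 2u + 1) == 3,
// so even a near-empty loop is never reported as size zero and cannot be
// unrolled an effectively unbounded number of times.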
-
-// Returns the loop hint metadata node with the given name (for example,
-// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
-// returned.
-static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
- if (MDNode *LoopID = L->getLoopID())
- return GetUnrollMetadata(LoopID, Name);
- return nullptr;
-}
-
-// Returns true if the loop has an unroll(full) pragma.
-static bool hasUnrollFullPragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
-}
-
-// Returns true if the loop has an unroll(enable) pragma. This metadata is used
-// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
-static bool hasUnrollEnablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
-}
-
-// Returns true if the loop has a runtime unroll(disable) pragma.
-static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
- return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
-}
-
-// If the loop has an unroll_count pragma, return the (necessarily
-// positive) value from the pragma. Otherwise return 0.
-static unsigned unrollCountPragmaValue(const Loop *L) {
- MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
- if (MD) {
- assert(MD->getNumOperands() == 2 &&
- "Unroll count hint metadata should have two operands.");
- unsigned Count =
- mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
- assert(Count >= 1 && "Unroll count must be positive.");
- return Count;
- }
- return 0;
-}
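The metadata shape these helpers look for, sketched as LLVM IR inside a comment; the node numbers are illustrative:

//   br i1 %exitcond, label %exit, label %for.body, !llvm.loop !0
//   ...
//   !0 = distinct !{!0, !1}
//   !1 = !{!"llvm.loop.unroll.count", i32 4}
// For a loop whose latch carries !0, unrollCountPragmaValue() returns 4, while
// hasUnrollFullPragma() and hasUnrollEnablePragma() key off the
// "llvm.loop.unroll.full" and "llvm.loop.unroll.enable" strings shown above.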
-
-// Computes the boosting factor for complete unrolling.
-// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
-// be beneficial to fully unroll the loop even if unrolledcost is large. We
-// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
-// the unroll threshold.
-static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
- unsigned MaxPercentThresholdBoost) {
- if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
- return 100;
- else if (Cost.UnrolledCost != 0)
- // The boosting factor is RolledDynamicCost / UnrolledCost
- return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
- MaxPercentThresholdBoost);
- else
- return MaxPercentThresholdBoost;
-}
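A worked numeric example of the boost computation, using the default cap:

// RolledDynamicCost = 1000 and UnrolledCost = 250 give 100 * 1000 / 250 = 400,
// which equals the default MaxPercentThresholdBoost, so the caller ends up
// comparing UnrolledCost against Threshold * 400 / 100, i.e. four times the
// normal limit. An UnrolledCost of 0 returns the cap directly, and a
// RolledDynamicCost large enough to risk overflow degrades to 100 (no boost).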
-
-// Returns loop size estimation for unrolled loop.
-static uint64_t getUnrolledLoopSize(
- unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP) {
- assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
- return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
-}
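And a quick numeric instance of this size estimate:

// LoopSize = 10, UP.BEInsns = 2, UP.Count = 4 gives (10 - 2) * 4 + 2 = 34:
// the backedge bookkeeping is charged once, not once per unrolled iteration.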
-
-// Returns true if unroll count was set explicitly.
-// Calculates unroll count and writes it to UP.Count.
-// Unless IgnoreUser is true, will also use metadata and command-line options
-// that are specific to to the LoopUnroll pass (which, for instance, are
-// irrelevant for the LoopUnrollAndJam pass).
-// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
-// many LoopUnroll-specific options. The shared functionality should be
-// refactored into its own function.
-bool llvm::computeUnrollCount(
- Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
- OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
- bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP,
- TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
-
- // Check for explicit Count.
- // 1st priority is unroll count set by "unroll-count" option.
- bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
- if (UserUnrollCount) {
- UP.Count = UnrollCount;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
- return true;
- }
-
- // 2nd priority is unroll count set by pragma.
- unsigned PragmaCount = unrollCountPragmaValue(L);
- if (PragmaCount > 0) {
- UP.Count = PragmaCount;
- UP.Runtime = true;
- UP.AllowExpensiveTripCount = true;
- UP.Force = true;
- if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
- getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
- return true;
- }
- bool PragmaFullUnroll = hasUnrollFullPragma(L);
- if (PragmaFullUnroll && TripCount != 0) {
- UP.Count = TripCount;
- if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
- return false;
- }
-
- bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
- bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
- PragmaEnableUnroll || UserUnrollCount;
-
- if (ExplicitUnroll && TripCount != 0) {
- // If the loop has an unrolling pragma, we want to be more aggressive with
- // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
- // value which is larger than the default limits.
- UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
- UP.PartialThreshold =
- std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
- }
-
- // 3rd priority is full unroll count.
- // Full unroll makes sense only when TripCount or its upper bound could be
- // statically calculated.
- // Also we need to check if we exceed FullUnrollMaxCount.
- // If using the upper bound to unroll, TripMultiple should be set to 1 because
-  // we do not know when the loop may exit.
-
- // We can unroll by the upper bound amount if it's generally allowed or if
- // we know that the loop is executed either the upper bound or zero times.
- // (MaxOrZero unrolling keeps only the first loop test, so the number of
- // loop tests remains the same compared to the non-unrolled version, whereas
- // the generic upper bound unrolling keeps all but the last loop test so the
- // number of loop tests goes up which may end up being worse on targets with
- // constrained branch predictor resources so is controlled by an option.)
- // In addition we only unroll small upper bounds.
- unsigned FullUnrollMaxTripCount = MaxTripCount;
- if (!(UP.UpperBound || MaxOrZero) ||
- FullUnrollMaxTripCount > UnrollMaxUpperBound)
- FullUnrollMaxTripCount = 0;
-
- // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
- // compute the former when the latter is zero.
- unsigned ExactTripCount = TripCount;
- assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
-         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
-
- unsigned FullUnrollTripCount =
- ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
- UP.Count = FullUnrollTripCount;
- if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
- // When computing the unrolled size, note that BEInsns are not replicated
- // like the rest of the loop body.
- if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- TripCount = FullUnrollTripCount;
- TripMultiple = UP.UpperBound ? 1 : TripMultiple;
- return ExplicitUnroll;
- } else {
- // The loop isn't that small, but we still can fully unroll it if that
- // helps to remove a significant number of instructions.
- // To check that, run additional analysis on the loop.
- if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
- L, FullUnrollTripCount, DT, SE, EphValues, TTI,
- UP.Threshold * UP.MaxPercentThresholdBoost / 100,
- UP.MaxIterationsCountToAnalyze)) {
- unsigned Boost =
- getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
- if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
- UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
- TripCount = FullUnrollTripCount;
- TripMultiple = UP.UpperBound ? 1 : TripMultiple;
- return ExplicitUnroll;
- }
- }
- }
- }
-
- // 4th priority is loop peeling.
+
+ // Visit the instruction to analyze its loop cost after unrolling,
+ // and if the visitor returns true, mark the instruction as free after
+ // unrolling and continue.
+ bool IsFree = Analyzer.visit(I);
+ bool Inserted = InstCostMap.insert({&I, (int)Iteration,
+ (unsigned)IsFree,
+ /*IsCounted*/ false}).second;
+ (void)Inserted;
+ assert(Inserted && "Cannot have a state for an unvisited instruction!");
+
+ if (IsFree)
+ continue;
+
+      // Can't properly model the cost of a call.
+ // FIXME: With a proper cost model we should be able to do it.
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ const Function *Callee = CI->getCalledFunction();
+ if (!Callee || TTI.isLoweredToCall(Callee)) {
+ LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+ return None;
+ }
+ }
+
+      // If the instruction might have a side-effect, recursively account for
+ // the cost of it and all the instructions leading up to it.
+ if (I.mayHaveSideEffects())
+ AddCostRecursively(I, Iteration);
+
+      // If the unrolled body turns out to be too big, bail out.
+ if (UnrolledCost > MaxUnrolledLoopSize) {
+ LLVM_DEBUG(dbgs() << " Exceeded threshold.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost
+ << ", MaxUnrolledLoopSize: " << MaxUnrolledLoopSize
+ << "\n");
+ return None;
+ }
+ }
+
+ Instruction *TI = BB->getTerminator();
+
+    // Add in the live successors by first checking whether we have a terminator
+ // that may be simplified based on the values simplified by this call.
+ BasicBlock *KnownSucc = nullptr;
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional()) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(BI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = BI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = BI->getSuccessor(SimpleCondVal->isZero() ? 1 : 0);
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ if (Constant *SimpleCond =
+ SimplifiedValues.lookup(SI->getCondition())) {
+ // Just take the first successor if condition is undef
+ if (isa<UndefValue>(SimpleCond))
+ KnownSucc = SI->getSuccessor(0);
+ else if (ConstantInt *SimpleCondVal =
+ dyn_cast<ConstantInt>(SimpleCond))
+ KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
+ }
+ }
+ if (KnownSucc) {
+ if (L->contains(KnownSucc))
+ BBWorklist.insert(KnownSucc);
+ else
+ ExitWorklist.insert({BB, KnownSucc});
+ continue;
+ }
+
+ // Add BB's successors to the worklist.
+ for (BasicBlock *Succ : successors(BB))
+ if (L->contains(Succ))
+ BBWorklist.insert(Succ);
+ else
+ ExitWorklist.insert({BB, Succ});
+ AddCostRecursively(*TI, Iteration);
+ }
+
+ // If we found no optimization opportunities on the first iteration, we
+    // won't find them on later ones either.
+ if (UnrolledCost == RolledDynamicCost) {
+ LLVM_DEBUG(dbgs() << " No opportunities found.. exiting.\n"
+ << " UnrolledCost: " << UnrolledCost << "\n");
+ return None;
+ }
+ }
+
+ while (!ExitWorklist.empty()) {
+ BasicBlock *ExitingBB, *ExitBB;
+ std::tie(ExitingBB, ExitBB) = ExitWorklist.pop_back_val();
+
+ for (Instruction &I : *ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *Op = PN->getIncomingValueForBlock(ExitingBB);
+ if (auto *OpI = dyn_cast<Instruction>(Op))
+ if (L->contains(OpI))
+ AddCostRecursively(*OpI, TripCount - 1);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Analysis finished:\n"
+ << "UnrolledCost: " << UnrolledCost << ", "
+ << "RolledDynamicCost: " << RolledDynamicCost << "\n");
+ return {{UnrolledCost, RolledDynamicCost}};
+}
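+
+// Editorial illustration (hypothetical numbers, not part of the pass): if the
+// analysis above returns {UnrolledCost: 120, RolledDynamicCost: 400}, the
+// fully unrolled body is estimated at 120 cost units while the rolled loop,
+// executed dynamically over all iterations, is estimated at 400, so full
+// unrolling looks attractive; computeUnrollCount feeds this pair into
+// getFullUnrollBoostingFactor below.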
+
+/// ApproximateLoopSize - Approximate the size of the loop.
+unsigned llvm::ApproximateLoopSize(
+ const Loop *L, unsigned &NumCalls, bool &NotDuplicatable, bool &Convergent,
+ const TargetTransformInfo &TTI,
+ const SmallPtrSetImpl<const Value *> &EphValues, unsigned BEInsns) {
+ CodeMetrics Metrics;
+ for (BasicBlock *BB : L->blocks())
+ Metrics.analyzeBasicBlock(BB, TTI, EphValues);
+ NumCalls = Metrics.NumInlineCandidates;
+ NotDuplicatable = Metrics.notDuplicatable;
+ Convergent = Metrics.convergent;
+
+ unsigned LoopSize = Metrics.NumInsts;
+
+  // Don't allow an estimate of size zero. This would allow unrolling of loops
+ // with huge iteration counts, which is a compile time problem even if it's
+ // not a problem for code quality. Also, the code using this size may assume
+ // that each loop has at least three instructions (likely a conditional
+ // branch, a comparison feeding that branch, and some kind of loop increment
+ // feeding that comparison instruction).
+ LoopSize = std::max(LoopSize, BEInsns + 1);
+
+ return LoopSize;
+}
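+
+// Editorial illustration (hypothetical numbers): with BEInsns == 2, a loop
+// whose blocks contain only two countable instructions is still reported as
+// size 3 by the clamp above, so the unrolled-size estimates below can never
+// degenerate to zero.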
+
+// Returns the loop hint metadata node with the given name (for example,
+// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
+// returned.
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+ if (MDNode *LoopID = L->getLoopID())
+ return GetUnrollMetadata(LoopID, Name);
+ return nullptr;
+}
+
+// Returns true if the loop has an unroll(full) pragma.
+static bool hasUnrollFullPragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+}
+
+// Returns true if the loop has an unroll(enable) pragma. This metadata is used
+// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
+static bool hasUnrollEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+}
+
+// Returns true if the loop has a runtime unroll(disable) pragma.
+static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+}
+
+// If loop has an unroll_count pragma return the (necessarily
+// positive) value from the pragma. Otherwise return 0.
+static unsigned unrollCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+ if (MD) {
+ assert(MD->getNumOperands() == 2 &&
+ "Unroll count hint metadata should have two operands.");
+ unsigned Count =
+ mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();
+ assert(Count >= 1 && "Unroll count must be positive.");
+ return Count;
+ }
+ return 0;
+}
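+
+// Editorial illustration (assumed IR shape, not taken from this file): a loop
+// annotated with "#pragma clang loop unroll_count(4)" typically carries loop
+// metadata of the form
+//   br i1 %cond, label %body, label %exit, !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.count", i32 4}
+// and unrollCountPragmaValue above extracts the i32 operand (here 4) from the
+// second node.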
+
+// Computes the boosting factor for complete unrolling.
+// If fully unrolling the loop would save a lot of RolledDynamicCost, it would
+// be beneficial to fully unroll the loop even if unrolledcost is large. We
+// use (RolledDynamicCost / UnrolledCost) to model the unroll benefits to adjust
+// the unroll threshold.
+static unsigned getFullUnrollBoostingFactor(const EstimatedUnrollCost &Cost,
+ unsigned MaxPercentThresholdBoost) {
+ if (Cost.RolledDynamicCost >= std::numeric_limits<unsigned>::max() / 100)
+ return 100;
+ else if (Cost.UnrolledCost != 0)
+ // The boosting factor is RolledDynamicCost / UnrolledCost
+ return std::min(100 * Cost.RolledDynamicCost / Cost.UnrolledCost,
+ MaxPercentThresholdBoost);
+ else
+ return MaxPercentThresholdBoost;
+}
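+
+// Editorial illustration (hypothetical numbers): with RolledDynamicCost == 300
+// and UnrolledCost == 100, the raw ratio gives 100 * 300 / 100 == 300, which
+// is then capped at MaxPercentThresholdBoost; computeUnrollCount later accepts
+// a full unroll when Cost->UnrolledCost < UP.Threshold * Boost / 100.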
+
+// Returns loop size estimation for unrolled loop.
+static uint64_t getUnrolledLoopSize(
+ unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP) {
+ assert(LoopSize >= UP.BEInsns && "LoopSize should not be less than BEInsns!");
+ return (uint64_t)(LoopSize - UP.BEInsns) * UP.Count + UP.BEInsns;
+}
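+
+// Editorial illustration (hypothetical numbers): for LoopSize == 12,
+// UP.BEInsns == 2 and UP.Count == 4, the estimate is (12 - 2) * 4 + 2 == 42,
+// reflecting that the backedge bookkeeping instructions are not replicated
+// with each unrolled iteration.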
+
+// Returns true if unroll count was set explicitly.
+// Calculates unroll count and writes it to UP.Count.
+// Also uses metadata and command-line options that are specific to the
+// LoopUnroll pass (which, for instance, are irrelevant for the
+// LoopUnrollAndJam pass).
+// FIXME: This function is used by LoopUnroll and LoopUnrollAndJam, but consumes
+// many LoopUnroll-specific options. The shared functionality should be
+// refactored into its own function.
+bool llvm::computeUnrollCount(
+ Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI,
+ ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
+ OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
+ bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
+ TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
+
+ // Check for explicit Count.
+ // 1st priority is unroll count set by "unroll-count" option.
+ bool UserUnrollCount = UnrollCount.getNumOccurrences() > 0;
+ if (UserUnrollCount) {
+ UP.Count = UnrollCount;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if (UP.AllowRemainder && getUnrolledLoopSize(LoopSize, UP) < UP.Threshold)
+ return true;
+ }
+
+ // 2nd priority is unroll count set by pragma.
+ unsigned PragmaCount = unrollCountPragmaValue(L);
+ if (PragmaCount > 0) {
+ UP.Count = PragmaCount;
+ UP.Runtime = true;
+ UP.AllowExpensiveTripCount = true;
+ UP.Force = true;
+ if ((UP.AllowRemainder || (TripMultiple % PragmaCount == 0)) &&
+ getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return true;
+ }
+ bool PragmaFullUnroll = hasUnrollFullPragma(L);
+ if (PragmaFullUnroll && TripCount != 0) {
+ UP.Count = TripCount;
+ if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
+ return false;
+ }
+
+ bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
+ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
+ PragmaEnableUnroll || UserUnrollCount;
+
+ if (ExplicitUnroll && TripCount != 0) {
+ // If the loop has an unrolling pragma, we want to be more aggressive with
+ // unrolling limits. Set thresholds to at least the PragmaUnrollThreshold
+ // value which is larger than the default limits.
+ UP.Threshold = std::max<unsigned>(UP.Threshold, PragmaUnrollThreshold);
+ UP.PartialThreshold =
+ std::max<unsigned>(UP.PartialThreshold, PragmaUnrollThreshold);
+ }
+
+ // 3rd priority is full unroll count.
+ // Full unroll makes sense only when TripCount or its upper bound could be
+ // statically calculated.
+  // We also need to check that we do not exceed FullUnrollMaxCount.
+  // If using the upper bound to unroll, TripMultiple should be set to 1 because
+  // we do not know when the loop may exit.
+
+ // We can unroll by the upper bound amount if it's generally allowed or if
+ // we know that the loop is executed either the upper bound or zero times.
+ // (MaxOrZero unrolling keeps only the first loop test, so the number of
+ // loop tests remains the same compared to the non-unrolled version, whereas
+  // the generic upper bound unrolling keeps all but the last loop test, so the
+  // number of loop tests goes up, which may end up being worse on targets with
+  // constrained branch predictor resources, and so is controlled by an option.)
+ // In addition we only unroll small upper bounds.
+ unsigned FullUnrollMaxTripCount = MaxTripCount;
+ if (!(UP.UpperBound || MaxOrZero) ||
+ FullUnrollMaxTripCount > UnrollMaxUpperBound)
+ FullUnrollMaxTripCount = 0;
+
+ // UnrollByMaxCount and ExactTripCount cannot both be non zero since we only
+ // compute the former when the latter is zero.
+ unsigned ExactTripCount = TripCount;
+ assert((ExactTripCount == 0 || FullUnrollMaxTripCount == 0) &&
+         "ExactTripCount and UnrollByMaxCount cannot both be non zero.");
+
+ unsigned FullUnrollTripCount =
+ ExactTripCount ? ExactTripCount : FullUnrollMaxTripCount;
+ UP.Count = FullUnrollTripCount;
+ if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) {
+ // When computing the unrolled size, note that BEInsns are not replicated
+ // like the rest of the loop body.
+ if (getUnrolledLoopSize(LoopSize, UP) < UP.Threshold) {
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ } else {
+ // The loop isn't that small, but we still can fully unroll it if that
+ // helps to remove a significant number of instructions.
+ // To check that, run additional analysis on the loop.
+ if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
+ L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100,
+ UP.MaxIterationsCountToAnalyze)) {
+ unsigned Boost =
+ getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+ if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
+ UseUpperBound = (FullUnrollMaxTripCount == FullUnrollTripCount);
+ TripCount = FullUnrollTripCount;
+ TripMultiple = UP.UpperBound ? 1 : TripMultiple;
+ return ExplicitUnroll;
+ }
+ }
+ }
+ }
+
+ // 4th priority is loop peeling.
computePeelCount(L, LoopSize, PP, TripCount, SE, UP.Threshold);
- if (PP.PeelCount) {
- UP.Runtime = false;
- UP.Count = 1;
- return ExplicitUnroll;
- }
-
- // 5th priority is partial unrolling.
- // Try partial unroll only when TripCount could be statically calculated.
- if (TripCount) {
- UP.Partial |= ExplicitUnroll;
- if (!UP.Partial) {
- LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
- << "-unroll-allow-partial not given\n");
- UP.Count = 0;
- return false;
- }
- if (UP.Count == 0)
- UP.Count = TripCount;
- if (UP.PartialThreshold != NoThreshold) {
- // Reduce unroll count to be modulo of TripCount for partial unrolling.
- if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count =
- (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
- (LoopSize - UP.BEInsns);
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- while (UP.Count != 0 && TripCount % UP.Count != 0)
- UP.Count--;
- if (UP.AllowRemainder && UP.Count <= 1) {
- // If there is no Count that is modulo of TripCount, set Count to
- // largest power-of-two factor that satisfies the threshold limit.
- // As we'll create fixup loop, do the type of unrolling only if
- // remainder loop is allowed.
- UP.Count = UP.DefaultUnrollRuntimeCount;
- while (UP.Count != 0 &&
- getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count >>= 1;
- }
- if (UP.Count < 2) {
- if (PragmaEnableUnroll)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "UnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop as directed by unroll(enable) "
- "pragma "
- "because unrolled size is too large.";
- });
- UP.Count = 0;
- }
- } else {
- UP.Count = TripCount;
- }
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
- if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
- UP.Count != TripCount)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "FullUnrollAsDirectedTooLarge",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll pragma "
- "because "
- "unrolled size is too large.";
- });
- LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
- << "\n");
- return ExplicitUnroll;
- }
- assert(TripCount == 0 &&
- "All cases when TripCount is constant should be covered here.");
- if (PragmaFullUnroll)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(
- DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
- L->getStartLoc(), L->getHeader())
- << "Unable to fully unroll loop as directed by unroll(full) "
- "pragma "
- "because loop has a runtime trip count.";
- });
-
- // 6th priority is runtime unrolling.
- // Don't unroll a runtime trip count loop when it is disabled.
- if (hasRuntimeUnrollDisablePragma(L)) {
- UP.Count = 0;
- return false;
- }
-
- // Don't unroll a small upper bound loop unless user or TTI asked to do so.
- if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
- UP.Count = 0;
- return false;
- }
-
- // Check if the runtime trip count is too small when profile is available.
- if (L->getHeader()->getParent()->hasProfileData()) {
- if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
- if (*ProfileTripCount < FlatLoopTripCountThreshold)
- return false;
- else
- UP.AllowExpensiveTripCount = true;
- }
- }
-
- // Reduce count based on the type of unrolling and the threshold values.
- UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
- if (!UP.Runtime) {
- LLVM_DEBUG(
- dbgs() << " will not try to unroll loop with runtime trip count "
- << "-unroll-runtime not given\n");
- UP.Count = 0;
- return false;
- }
- if (UP.Count == 0)
- UP.Count = UP.DefaultUnrollRuntimeCount;
-
- // Reduce unroll count to be the largest power-of-two factor of
- // the original count which satisfies the threshold limit.
- while (UP.Count != 0 &&
- getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
- UP.Count >>= 1;
-
-#ifndef NDEBUG
- unsigned OrigCount = UP.Count;
-#endif
-
- if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
- while (UP.Count != 0 && TripMultiple % UP.Count != 0)
- UP.Count >>= 1;
- LLVM_DEBUG(
-        dbgs() << "Remainder loop is restricted (that could be architecture "
- "specific or because the loop contains a convergent "
- "instruction), so unroll count must divide the trip "
- "multiple, "
- << TripMultiple << ". Reducing unroll count from " << OrigCount
- << " to " << UP.Count << ".\n");
-
- using namespace ore;
-
- if (PragmaCount > 0 && !UP.AllowRemainder)
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE,
- "DifferentUnrollCountFromDirected",
- L->getStartLoc(), L->getHeader())
- << "Unable to unroll loop the number of times directed by "
- "unroll_count pragma because remainder loop is restricted "
-                 "(that could be architecture specific or because the loop "
- "contains a convergent instruction) and so must have an "
- "unroll "
- "count that divides the loop trip multiple of "
- << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
- << NV("UnrollCount", UP.Count) << " time(s).";
- });
- }
-
- if (UP.Count > UP.MaxCount)
- UP.Count = UP.MaxCount;
-
- if (MaxTripCount && UP.Count > MaxTripCount)
- UP.Count = MaxTripCount;
-
- LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
- << "\n");
- if (UP.Count < 2)
- UP.Count = 0;
- return ExplicitUnroll;
-}
-
-static LoopUnrollResult tryToUnrollLoop(
- Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
- const TargetTransformInfo &TTI, AssumptionCache &AC,
- OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
- bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
- Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
- Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
- Optional<bool> ProvidedAllowPeeling,
- Optional<bool> ProvidedAllowProfileBasedPeeling,
- Optional<unsigned> ProvidedFullUnrollMaxCount) {
- LLVM_DEBUG(dbgs() << "Loop Unroll: F["
- << L->getHeader()->getParent()->getName() << "] Loop %"
- << L->getHeader()->getName() << "\n");
- TransformationMode TM = hasUnrollTransformation(L);
- if (TM & TM_Disable)
- return LoopUnrollResult::Unmodified;
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(
- dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ if (PP.PeelCount) {
+ UP.Runtime = false;
+ UP.Count = 1;
+ return ExplicitUnroll;
+ }
+
+ // 5th priority is partial unrolling.
+ // Try partial unroll only when TripCount could be statically calculated.
+ if (TripCount) {
+ UP.Partial |= ExplicitUnroll;
+ if (!UP.Partial) {
+ LLVM_DEBUG(dbgs() << " will not try to unroll partially because "
+ << "-unroll-allow-partial not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = TripCount;
+ if (UP.PartialThreshold != NoThreshold) {
+ // Reduce unroll count to be modulo of TripCount for partial unrolling.
+ if (getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count =
+ (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
+ (LoopSize - UP.BEInsns);
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ while (UP.Count != 0 && TripCount % UP.Count != 0)
+ UP.Count--;
+ if (UP.AllowRemainder && UP.Count <= 1) {
+ // If there is no Count that is modulo of TripCount, set Count to
+ // largest power-of-two factor that satisfies the threshold limit.
+ // As we'll create fixup loop, do the type of unrolling only if
+ // remainder loop is allowed.
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+ }
+ if (UP.Count < 2) {
+ if (PragmaEnableUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "UnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop as directed by unroll(enable) "
+ "pragma "
+ "because unrolled size is too large.";
+ });
+ UP.Count = 0;
+ }
+ } else {
+ UP.Count = TripCount;
+ }
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+ if ((PragmaFullUnroll || PragmaEnableUnroll) && TripCount &&
+ UP.Count != TripCount)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "FullUnrollAsDirectedTooLarge",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll pragma "
+ "because "
+ "unrolled size is too large.";
+ });
+ LLVM_DEBUG(dbgs() << " partially unrolling with count: " << UP.Count
+ << "\n");
+ return ExplicitUnroll;
+ }
+ assert(TripCount == 0 &&
+ "All cases when TripCount is constant should be covered here.");
+ if (PragmaFullUnroll)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(
+ DEBUG_TYPE, "CantFullUnrollAsDirectedRuntimeTripCount",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to fully unroll loop as directed by unroll(full) "
+ "pragma "
+ "because loop has a runtime trip count.";
+ });
+
+ // 6th priority is runtime unrolling.
+ // Don't unroll a runtime trip count loop when it is disabled.
+ if (hasRuntimeUnrollDisablePragma(L)) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Don't unroll a small upper bound loop unless user or TTI asked to do so.
+ if (MaxTripCount && !UP.Force && MaxTripCount < UnrollMaxUpperBound) {
+ UP.Count = 0;
+ return false;
+ }
+
+ // Check if the runtime trip count is too small when profile is available.
+ if (L->getHeader()->getParent()->hasProfileData()) {
+ if (auto ProfileTripCount = getLoopEstimatedTripCount(L)) {
+ if (*ProfileTripCount < FlatLoopTripCountThreshold)
+ return false;
+ else
+ UP.AllowExpensiveTripCount = true;
+ }
+ }
+
+ // Reduce count based on the type of unrolling and the threshold values.
+ UP.Runtime |= PragmaEnableUnroll || PragmaCount > 0 || UserUnrollCount;
+ if (!UP.Runtime) {
+ LLVM_DEBUG(
+ dbgs() << " will not try to unroll loop with runtime trip count "
+ << "-unroll-runtime not given\n");
+ UP.Count = 0;
+ return false;
+ }
+ if (UP.Count == 0)
+ UP.Count = UP.DefaultUnrollRuntimeCount;
+
+ // Reduce unroll count to be the largest power-of-two factor of
+ // the original count which satisfies the threshold limit.
+ while (UP.Count != 0 &&
+ getUnrolledLoopSize(LoopSize, UP) > UP.PartialThreshold)
+ UP.Count >>= 1;
+
+#ifndef NDEBUG
+ unsigned OrigCount = UP.Count;
+#endif
+
+ if (!UP.AllowRemainder && UP.Count != 0 && (TripMultiple % UP.Count) != 0) {
+ while (UP.Count != 0 && TripMultiple % UP.Count != 0)
+ UP.Count >>= 1;
+ LLVM_DEBUG(
+        dbgs() << "Remainder loop is restricted (that could be architecture "
+ "specific or because the loop contains a convergent "
+ "instruction), so unroll count must divide the trip "
+ "multiple, "
+ << TripMultiple << ". Reducing unroll count from " << OrigCount
+ << " to " << UP.Count << ".\n");
+
+ using namespace ore;
+
+ if (PragmaCount > 0 && !UP.AllowRemainder)
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE,
+ "DifferentUnrollCountFromDirected",
+ L->getStartLoc(), L->getHeader())
+ << "Unable to unroll loop the number of times directed by "
+ "unroll_count pragma because remainder loop is restricted "
+                 "(that could be architecture specific or because the loop "
+ "contains a convergent instruction) and so must have an "
+ "unroll "
+ "count that divides the loop trip multiple of "
+ << NV("TripMultiple", TripMultiple) << ". Unrolling instead "
+ << NV("UnrollCount", UP.Count) << " time(s).";
+ });
+ }
+
+ if (UP.Count > UP.MaxCount)
+ UP.Count = UP.MaxCount;
+
+ if (MaxTripCount && UP.Count > MaxTripCount)
+ UP.Count = MaxTripCount;
+
+ LLVM_DEBUG(dbgs() << " runtime unrolling with count: " << UP.Count
+ << "\n");
+ if (UP.Count < 2)
+ UP.Count = 0;
+ return ExplicitUnroll;
+}
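+
+// Editorial illustration of the runtime-unroll sizing above (hypothetical
+// numbers): with LoopSize == 40, UP.BEInsns == 2, UP.PartialThreshold == 150
+// and UP.DefaultUnrollRuntimeCount == 8, the power-of-two reduction tries
+// (40 - 2) * 8 + 2 == 306, then (40 - 2) * 4 + 2 == 154, then
+// (40 - 2) * 2 + 2 == 78, so UP.Count settles at 2 before the TripMultiple
+// and MaxCount adjustments.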
+
+static LoopUnrollResult tryToUnrollLoop(
+ Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI, AssumptionCache &AC,
+ OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, bool PreserveLCSSA, int OptLevel,
+ bool OnlyWhenForced, bool ForgetAllSCEV, Optional<unsigned> ProvidedCount,
+ Optional<unsigned> ProvidedThreshold, Optional<bool> ProvidedAllowPartial,
+ Optional<bool> ProvidedRuntime, Optional<bool> ProvidedUpperBound,
+ Optional<bool> ProvidedAllowPeeling,
+ Optional<bool> ProvidedAllowProfileBasedPeeling,
+ Optional<unsigned> ProvidedFullUnrollMaxCount) {
+ LLVM_DEBUG(dbgs() << "Loop Unroll: F["
+ << L->getHeader()->getParent()->getName() << "] Loop %"
+ << L->getHeader()->getName() << "\n");
+ TransformationMode TM = hasUnrollTransformation(L);
+ if (TM & TM_Disable)
+ return LoopUnrollResult::Unmodified;
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(
+ dbgs() << " Not unrolling loop which is not in loop-simplify form.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
// When automatic unrolling is disabled, do not unroll unless overridden for
- // this loop.
- if (OnlyWhenForced && !(TM & TM_Enable))
- return LoopUnrollResult::Unmodified;
-
- bool OptForSize = L->getHeader()->getParent()->hasOptSize();
- unsigned NumInlineCandidates;
- bool NotDuplicatable;
- bool Convergent;
- TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
- L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
- ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedFullUnrollMaxCount);
- TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
+ // this loop.
+ if (OnlyWhenForced && !(TM & TM_Enable))
+ return LoopUnrollResult::Unmodified;
+
+ bool OptForSize = L->getHeader()->getParent()->hasOptSize();
+ unsigned NumInlineCandidates;
+ bool NotDuplicatable;
+ bool Convergent;
+ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
+ L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedFullUnrollMaxCount);
+ TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling, true);
-
- // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
- // as threshold later on.
- if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
- !OptForSize)
- return LoopUnrollResult::Unmodified;
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
-
- unsigned LoopSize =
- ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
- TTI, EphValues, UP.BEInsns);
- LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
- if (NotDuplicatable) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
- << " instructions.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
- // later), to (fully) unroll loops, if it does not increase code size.
- if (OptForSize)
- UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
-
- if (NumInlineCandidates != 0) {
- LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Find trip count and trip multiple if count is not available
- unsigned TripCount = 0;
- unsigned TripMultiple = 1;
- // If there are multiple exiting blocks but one of them is the latch, use the
- // latch for the trip count estimation. Otherwise insist on a single exiting
- // block for the trip count estimation.
- BasicBlock *ExitingBlock = L->getLoopLatch();
- if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
- ExitingBlock = L->getExitingBlock();
- if (ExitingBlock) {
- TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
- TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
- }
-
- // If the loop contains a convergent operation, the prelude we'd add
- // to do the first few instructions before we hit the unrolled loop
- // is unsafe -- it adds a control-flow dependency to the convergent
+
+ // Exit early if unrolling is disabled. For OptForSize, we pick the loop size
+ // as threshold later on.
+ if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0) &&
+ !OptForSize)
+ return LoopUnrollResult::Unmodified;
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+
+ unsigned LoopSize =
+ ApproximateLoopSize(L, NumInlineCandidates, NotDuplicatable, Convergent,
+ TTI, EphValues, UP.BEInsns);
+ LLVM_DEBUG(dbgs() << " Loop Size = " << LoopSize << "\n");
+ if (NotDuplicatable) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop which contains non-duplicatable"
+ << " instructions.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // When optimizing for size, use LoopSize + 1 as threshold (we use < Threshold
+ // later), to (fully) unroll loops, if it does not increase code size.
+ if (OptForSize)
+ UP.Threshold = std::max(UP.Threshold, LoopSize + 1);
+
+ if (NumInlineCandidates != 0) {
+ LLVM_DEBUG(dbgs() << " Not unrolling loop with inlinable calls.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Find trip count and trip multiple if count is not available
+ unsigned TripCount = 0;
+ unsigned TripMultiple = 1;
+ // If there are multiple exiting blocks but one of them is the latch, use the
+ // latch for the trip count estimation. Otherwise insist on a single exiting
+ // block for the trip count estimation.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ if (!ExitingBlock || !L->isLoopExiting(ExitingBlock))
+ ExitingBlock = L->getExitingBlock();
+ if (ExitingBlock) {
+ TripCount = SE.getSmallConstantTripCount(L, ExitingBlock);
+ TripMultiple = SE.getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+
+ // If the loop contains a convergent operation, the prelude we'd add
+ // to do the first few instructions before we hit the unrolled loop
+ // is unsafe -- it adds a control-flow dependency to the convergent
// operation. Therefore restrict remainder loop (try unrolling without).
- //
- // TODO: This is quite conservative. In practice, convergent_op()
- // is likely to be called unconditionally in the loop. In this
- // case, the program would be ill-formed (on most architectures)
- // unless n were the same on all threads in a thread group.
- // Assuming n is the same on all threads, any kind of unrolling is
- // safe. But currently llvm's notion of convergence isn't powerful
- // enough to express this.
- if (Convergent)
- UP.AllowRemainder = false;
-
- // Try to find the trip count upper bound if we cannot find the exact trip
- // count.
- unsigned MaxTripCount = 0;
- bool MaxOrZero = false;
- if (!TripCount) {
- MaxTripCount = SE.getSmallConstantMaxTripCount(L);
- MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
- }
-
- // computeUnrollCount() decides whether it is beneficial to use upper bound to
- // fully unroll the loop.
- bool UseUpperBound = false;
- bool IsCountSetExplicitly = computeUnrollCount(
- L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
- TripMultiple, LoopSize, UP, PP, UseUpperBound);
- if (!UP.Count)
- return LoopUnrollResult::Unmodified;
- // Unroll factor (Count) must be less or equal to TripCount.
- if (TripCount && UP.Count > TripCount)
- UP.Count = TripCount;
-
- // Save loop properties before it is transformed.
- MDNode *OrigLoopID = L->getLoopID();
-
- // Unroll the loop.
- Loop *RemainderLoop = nullptr;
- LoopUnrollResult UnrollResult = UnrollLoop(
- L,
- {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
- UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
- ForgetAllSCEV},
- LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
- if (UnrollResult == LoopUnrollResult::Unmodified)
- return LoopUnrollResult::Unmodified;
-
- if (RemainderLoop) {
- Optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
- LLVMLoopUnrollFollowupRemainder});
- if (RemainderLoopID.hasValue())
- RemainderLoop->setLoopID(RemainderLoopID.getValue());
- }
-
- if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
- Optional<MDNode *> NewLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
- LLVMLoopUnrollFollowupUnrolled});
- if (NewLoopID.hasValue()) {
- L->setLoopID(NewLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been specified
- // explicitly.
- return UnrollResult;
- }
- }
-
- // If loop has an unroll count pragma or unrolled by explicitly set count
- // mark loop as unrolled to prevent unrolling beyond that requested.
- // If the loop was peeled, we already "used up" the profile information
- // we had, so we don't want to unroll or peel again.
- if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
- (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
- L->setLoopAlreadyUnrolled();
-
- return UnrollResult;
-}
-
-namespace {
-
-class LoopUnroll : public LoopPass {
-public:
- static char ID; // Pass ID, replacement for typeid
-
- int OptLevel;
-
- /// If false, use a cost model to determine whether unrolling of a loop is
- /// profitable. If true, only loops that explicitly request unrolling via
- /// metadata are considered. All other loops are skipped.
- bool OnlyWhenForced;
-
- /// If false, when SCEV is invalidated, only forget everything in the
- /// top-most loop (call forgetTopMostLoop), of the loop being processed.
- /// Otherwise, forgetAllLoops and rebuild when needed next.
- bool ForgetAllSCEV;
-
- Optional<unsigned> ProvidedCount;
- Optional<unsigned> ProvidedThreshold;
- Optional<bool> ProvidedAllowPartial;
- Optional<bool> ProvidedRuntime;
- Optional<bool> ProvidedUpperBound;
- Optional<bool> ProvidedAllowPeeling;
- Optional<bool> ProvidedAllowProfileBasedPeeling;
- Optional<unsigned> ProvidedFullUnrollMaxCount;
-
- LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
- bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
- Optional<unsigned> Count = None,
- Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
- Optional<bool> UpperBound = None,
- Optional<bool> AllowPeeling = None,
- Optional<bool> AllowProfileBasedPeeling = None,
- Optional<unsigned> ProvidedFullUnrollMaxCount = None)
- : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
- ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
- ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
- ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
- ProvidedAllowPeeling(AllowPeeling),
- ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
- ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
- initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L))
- return false;
-
- Function &F = *L->getHeader()->getParent();
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- const TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(&F);
- bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
- LoopUnrollResult Result = tryToUnrollLoop(
- L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
- OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
- ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
- ProvidedFullUnrollMaxCount);
-
- if (Result == LoopUnrollResult::FullyUnrolled)
- LPM.markLoopAsDeleted(*L);
-
- return Result != LoopUnrollResult::Unmodified;
- }
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG...
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- // FIXME: Loop passes are required to preserve domtree, and for now we just
- // recreate dom info if anything gets unrolled.
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-char LoopUnroll::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
-
-Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
- bool ForgetAllSCEV, int Threshold, int Count,
- int AllowPartial, int Runtime, int UpperBound,
- int AllowPeeling) {
- // TODO: It would make more sense for this function to take the optionals
- // directly, but that's dangerous since it would silently break out of tree
- // callers.
- return new LoopUnroll(
- OptLevel, OnlyWhenForced, ForgetAllSCEV,
- Threshold == -1 ? None : Optional<unsigned>(Threshold),
- Count == -1 ? None : Optional<unsigned>(Count),
- AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
- Runtime == -1 ? None : Optional<bool>(Runtime),
- UpperBound == -1 ? None : Optional<bool>(UpperBound),
- AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
-}
-
-Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
- bool ForgetAllSCEV) {
- return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
+ //
+ // TODO: This is quite conservative. In practice, convergent_op()
+ // is likely to be called unconditionally in the loop. In this
+ // case, the program would be ill-formed (on most architectures)
+ // unless n were the same on all threads in a thread group.
+ // Assuming n is the same on all threads, any kind of unrolling is
+ // safe. But currently llvm's notion of convergence isn't powerful
+ // enough to express this.
+ if (Convergent)
+ UP.AllowRemainder = false;
+
+ // Try to find the trip count upper bound if we cannot find the exact trip
+ // count.
+ unsigned MaxTripCount = 0;
+ bool MaxOrZero = false;
+ if (!TripCount) {
+ MaxTripCount = SE.getSmallConstantMaxTripCount(L);
+ MaxOrZero = SE.isBackedgeTakenCountMaxOrZero(L);
+ }
+
+ // computeUnrollCount() decides whether it is beneficial to use upper bound to
+ // fully unroll the loop.
+ bool UseUpperBound = false;
+ bool IsCountSetExplicitly = computeUnrollCount(
+ L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
+ TripMultiple, LoopSize, UP, PP, UseUpperBound);
+ if (!UP.Count)
+ return LoopUnrollResult::Unmodified;
+ // Unroll factor (Count) must be less or equal to TripCount.
+ if (TripCount && UP.Count > TripCount)
+ UP.Count = TripCount;
+
+ // Save loop properties before it is transformed.
+ MDNode *OrigLoopID = L->getLoopID();
+
+ // Unroll the loop.
+ Loop *RemainderLoop = nullptr;
+ LoopUnrollResult UnrollResult = UnrollLoop(
+ L,
+ {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
+ UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
+ ForgetAllSCEV},
+ LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
+ if (UnrollResult == LoopUnrollResult::Unmodified)
+ return LoopUnrollResult::Unmodified;
+
+ if (RemainderLoop) {
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+ LLVMLoopUnrollFollowupRemainder});
+ if (RemainderLoopID.hasValue())
+ RemainderLoop->setLoopID(RemainderLoopID.getValue());
+ }
+
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled) {
+ Optional<MDNode *> NewLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopUnrollFollowupAll,
+ LLVMLoopUnrollFollowupUnrolled});
+ if (NewLoopID.hasValue()) {
+ L->setLoopID(NewLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if loop attributes have been specified
+ // explicitly.
+ return UnrollResult;
+ }
+ }
+
+  // If the loop has an unroll count pragma, or was unrolled with an explicitly
+  // set count, mark it as unrolled to prevent unrolling beyond what was
+  // requested.
+ // If the loop was peeled, we already "used up" the profile information
+ // we had, so we don't want to unroll or peel again.
+ if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
+ (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
+ L->setLoopAlreadyUnrolled();
+
+ return UnrollResult;
+}
+
+namespace {
+
+class LoopUnroll : public LoopPass {
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ int OptLevel;
+
+ /// If false, use a cost model to determine whether unrolling of a loop is
+ /// profitable. If true, only loops that explicitly request unrolling via
+ /// metadata are considered. All other loops are skipped.
+ bool OnlyWhenForced;
+
+ /// If false, when SCEV is invalidated, only forget everything in the
+  /// top-most loop (call forgetTopMostLoop) of the loop being processed.
+ /// Otherwise, forgetAllLoops and rebuild when needed next.
+ bool ForgetAllSCEV;
+
+ Optional<unsigned> ProvidedCount;
+ Optional<unsigned> ProvidedThreshold;
+ Optional<bool> ProvidedAllowPartial;
+ Optional<bool> ProvidedRuntime;
+ Optional<bool> ProvidedUpperBound;
+ Optional<bool> ProvidedAllowPeeling;
+ Optional<bool> ProvidedAllowProfileBasedPeeling;
+ Optional<unsigned> ProvidedFullUnrollMaxCount;
+
+ LoopUnroll(int OptLevel = 2, bool OnlyWhenForced = false,
+ bool ForgetAllSCEV = false, Optional<unsigned> Threshold = None,
+ Optional<unsigned> Count = None,
+ Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
+ Optional<bool> UpperBound = None,
+ Optional<bool> AllowPeeling = None,
+ Optional<bool> AllowProfileBasedPeeling = None,
+ Optional<unsigned> ProvidedFullUnrollMaxCount = None)
+ : LoopPass(ID), OptLevel(OptLevel), OnlyWhenForced(OnlyWhenForced),
+ ForgetAllSCEV(ForgetAllSCEV), ProvidedCount(std::move(Count)),
+ ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
+ ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound),
+ ProvidedAllowPeeling(AllowPeeling),
+ ProvidedAllowProfileBasedPeeling(AllowProfileBasedPeeling),
+ ProvidedFullUnrollMaxCount(ProvidedFullUnrollMaxCount) {
+ initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(&F);
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ LoopUnrollResult Result = tryToUnrollLoop(
+ L, DT, LI, SE, TTI, AC, ORE, nullptr, nullptr, PreserveLCSSA, OptLevel,
+ OnlyWhenForced, ForgetAllSCEV, ProvidedCount, ProvidedThreshold,
+ ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
+ ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
+ ProvidedFullUnrollMaxCount);
+
+ if (Result == LoopUnrollResult::FullyUnrolled)
+ LPM.markLoopAsDeleted(*L);
+
+ return Result != LoopUnrollResult::Unmodified;
+ }
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG...
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ // FIXME: Loop passes are required to preserve domtree, and for now we just
+ // recreate dom info if anything gets unrolled.
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+char LoopUnroll::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
+
+Pass *llvm::createLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
+ bool ForgetAllSCEV, int Threshold, int Count,
+ int AllowPartial, int Runtime, int UpperBound,
+ int AllowPeeling) {
+ // TODO: It would make more sense for this function to take the optionals
+ // directly, but that's dangerous since it would silently break out of tree
+ // callers.
+ return new LoopUnroll(
+ OptLevel, OnlyWhenForced, ForgetAllSCEV,
+ Threshold == -1 ? None : Optional<unsigned>(Threshold),
+ Count == -1 ? None : Optional<unsigned>(Count),
+ AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+ Runtime == -1 ? None : Optional<bool>(Runtime),
+ UpperBound == -1 ? None : Optional<bool>(UpperBound),
+ AllowPeeling == -1 ? None : Optional<bool>(AllowPeeling));
+}
+
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
+ bool ForgetAllSCEV) {
+ return createLoopUnrollPass(OptLevel, OnlyWhenForced, ForgetAllSCEV, -1, -1,
0, 0, 0, 1);
-}
-
-PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &Updater) {
- // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
- // pass. Function analyses need to be preserved across loop transformations
- // but ORE cannot be preserved (see comment before the pass definition).
- OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
-
- // Keep track of the previous loop structure so we can identify new loops
- // created by unrolling.
- Loop *ParentL = L.getParentLoop();
- SmallPtrSet<Loop *, 4> OldLoops;
- if (ParentL)
- OldLoops.insert(ParentL->begin(), ParentL->end());
- else
- OldLoops.insert(AR.LI.begin(), AR.LI.end());
-
- std::string LoopName = std::string(L.getName());
-
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
- /*BFI*/ nullptr, /*PSI*/ nullptr,
- /*PreserveLCSSA*/ true, OptLevel,
- OnlyWhenForced, ForgetSCEV, /*Count*/ None,
- /*Threshold*/ None, /*AllowPartial*/ false,
- /*Runtime*/ false, /*UpperBound*/ false,
+}
+
+PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &Updater) {
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+
+ // Keep track of the previous loop structure so we can identify new loops
+ // created by unrolling.
+ Loop *ParentL = L.getParentLoop();
+ SmallPtrSet<Loop *, 4> OldLoops;
+ if (ParentL)
+ OldLoops.insert(ParentL->begin(), ParentL->end());
+ else
+ OldLoops.insert(AR.LI.begin(), AR.LI.end());
+
+ std::string LoopName = std::string(L.getName());
+
+ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
+ /*BFI*/ nullptr, /*PSI*/ nullptr,
+ /*PreserveLCSSA*/ true, OptLevel,
+ OnlyWhenForced, ForgetSCEV, /*Count*/ None,
+ /*Threshold*/ None, /*AllowPartial*/ false,
+ /*Runtime*/ false, /*UpperBound*/ false,
/*AllowPeeling*/ true,
- /*AllowProfileBasedPeeling*/ false,
- /*FullUnrollMaxCount*/ None) !=
- LoopUnrollResult::Unmodified;
- if (!Changed)
- return PreservedAnalyses::all();
-
- // The parent must not be damaged by unrolling!
-#ifndef NDEBUG
- if (ParentL)
- ParentL->verifyLoop();
-#endif
-
- // Unrolling can do several things to introduce new loops into a loop nest:
- // - Full unrolling clones child loops within the current loop but then
- // removes the current loop making all of the children appear to be new
- // sibling loops.
- //
- // When a new loop appears as a sibling loop after fully unrolling,
- // its nesting structure has fundamentally changed and we want to revisit
- // it to reflect that.
- //
- // When unrolling has removed the current loop, we need to tell the
- // infrastructure that it is gone.
- //
- // Finally, we support a debugging/testing mode where we revisit child loops
- // as well. These are not expected to require further optimizations as either
- // they or the loop they were cloned from have been directly visited already.
- // But the debugging mode allows us to check this assumption.
- bool IsCurrentLoopValid = false;
- SmallVector<Loop *, 4> SibLoops;
- if (ParentL)
- SibLoops.append(ParentL->begin(), ParentL->end());
- else
- SibLoops.append(AR.LI.begin(), AR.LI.end());
- erase_if(SibLoops, [&](Loop *SibLoop) {
- if (SibLoop == &L) {
- IsCurrentLoopValid = true;
- return true;
- }
-
- // Otherwise erase the loop from the list if it was in the old loops.
+ /*AllowProfileBasedPeeling*/ false,
+ /*FullUnrollMaxCount*/ None) !=
+ LoopUnrollResult::Unmodified;
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Unrolling can do several things to introduce new loops into a loop nest:
+ // - Full unrolling clones child loops within the current loop but then
+ // removes the current loop making all of the children appear to be new
+ // sibling loops.
+ //
+ // When a new loop appears as a sibling loop after fully unrolling,
+ // its nesting structure has fundamentally changed and we want to revisit
+ // it to reflect that.
+ //
+ // When unrolling has removed the current loop, we need to tell the
+ // infrastructure that it is gone.
+ //
+ // Finally, we support a debugging/testing mode where we revisit child loops
+ // as well. These are not expected to require further optimizations as either
+ // they or the loop they were cloned from have been directly visited already.
+ // But the debugging mode allows us to check this assumption.
+ bool IsCurrentLoopValid = false;
+ SmallVector<Loop *, 4> SibLoops;
+ if (ParentL)
+ SibLoops.append(ParentL->begin(), ParentL->end());
+ else
+ SibLoops.append(AR.LI.begin(), AR.LI.end());
+ erase_if(SibLoops, [&](Loop *SibLoop) {
+ if (SibLoop == &L) {
+ IsCurrentLoopValid = true;
+ return true;
+ }
+
+ // Otherwise erase the loop from the list if it was in the old loops.
return OldLoops.contains(SibLoop);
- });
- Updater.addSiblingLoops(SibLoops);
-
- if (!IsCurrentLoopValid) {
- Updater.markLoopAsDeleted(L, LoopName);
- } else {
- // We can only walk child loops if the current loop remained valid.
- if (UnrollRevisitChildLoops) {
- // Walk *all* of the child loops.
- SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
- Updater.addChildLoops(ChildLoops);
- }
- }
-
- return getLoopPassPreservedAnalyses();
-}
-
-PreservedAnalyses LoopUnrollPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- LoopAnalysisManager *LAM = nullptr;
- if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
- LAM = &LAMProxy->getManager();
-
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
-
- bool Changed = false;
-
- // The unroller requires loops to be in simplified form, and also needs LCSSA.
- // Since simplification may add new inner loops, it has to run before the
- // legality and profitability checks. This means running the loop unroller
-  // will simplify all loops, regardless of whether anything ends up being
- // unrolled.
- for (auto &L : LI) {
- Changed |=
- simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
- Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
- }
-
- // Add the loop nests in the reverse order of LoopInfo. See method
- // declaration.
- SmallPriorityWorklist<Loop *, 4> Worklist;
- appendLoopsToWorklist(LI, Worklist);
-
- while (!Worklist.empty()) {
- // Because the LoopInfo stores the loops in RPO, we walk the worklist
- // from back to front so that we work forward across the CFG, which
- // for unrolling is only needed to get optimization remarks emitted in
- // a forward order.
- Loop &L = *Worklist.pop_back_val();
-#ifndef NDEBUG
- Loop *ParentL = L.getParentLoop();
-#endif
-
- // Check if the profile summary indicates that the profiled application
- // has a huge working set size, in which case we disable peeling to avoid
- // bloating it further.
- Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
- if (PSI && PSI->hasHugeWorkingSetSize())
- LocalAllowPeeling = false;
- std::string LoopName = std::string(L.getName());
-    // The API here is quite complex to call, and we allow selecting some
-    // flavors of unrolling at construction time (by setting UnrollOpts).
- LoopUnrollResult Result = tryToUnrollLoop(
- &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
- /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
- UnrollOpts.ForgetSCEV, /*Count*/ None,
- /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
- UnrollOpts.AllowUpperBound, LocalAllowPeeling,
- UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
- Changed |= Result != LoopUnrollResult::Unmodified;
-
- // The parent must not be damaged by unrolling!
-#ifndef NDEBUG
- if (Result != LoopUnrollResult::Unmodified && ParentL)
- ParentL->verifyLoop();
-#endif
-
- // Clear any cached analysis results for L if we removed it completely.
- if (LAM && Result == LoopUnrollResult::FullyUnrolled)
- LAM->clear(L, LoopName);
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
+ });
+ Updater.addSiblingLoops(SibLoops);
+
+ if (!IsCurrentLoopValid) {
+ Updater.markLoopAsDeleted(L, LoopName);
+ } else {
+ // We can only walk child loops if the current loop remained valid.
+ if (UnrollRevisitChildLoops) {
+ // Walk *all* of the child loops.
+ SmallVector<Loop *, 4> ChildLoops(L.begin(), L.end());
+ Updater.addChildLoops(ChildLoops);
+ }
+ }
+
+ return getLoopPassPreservedAnalyses();
+}
+
+PreservedAnalyses LoopUnrollPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ LoopAnalysisManager *LAM = nullptr;
+ if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
+ LAM = &LAMProxy->getManager();
+
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto *BFI = (PSI && PSI->hasProfileSummary()) ?
+ &AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
+
+ bool Changed = false;
+
+ // The unroller requires loops to be in simplified form, and also needs LCSSA.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop unroller
+ // will simplify all loops, regardless of whether anything ends up being
+ // unrolled.
+ for (auto &L : LI) {
+ Changed |=
+ simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+
+ while (!Worklist.empty()) {
+ // Because the LoopInfo stores the loops in RPO, we walk the worklist
+ // from back to front so that we work forward across the CFG, which
+ // for unrolling is only needed to get optimization remarks emitted in
+ // a forward order.
+ Loop &L = *Worklist.pop_back_val();
+#ifndef NDEBUG
+ Loop *ParentL = L.getParentLoop();
+#endif
+
+ // Check if the profile summary indicates that the profiled application
+ // has a huge working set size, in which case we disable peeling to avoid
+ // bloating it further.
+ Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
+ if (PSI && PSI->hasHugeWorkingSetSize())
+ LocalAllowPeeling = false;
+ std::string LoopName = std::string(L.getName());
+ // The API here is quite complex to call, and we allow selecting some
+ // flavors of unrolling at construction time (by setting UnrollOpts).
+ LoopUnrollResult Result = tryToUnrollLoop(
+ &L, DT, &LI, SE, TTI, AC, ORE, BFI, PSI,
+ /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, UnrollOpts.OnlyWhenForced,
+ UnrollOpts.ForgetSCEV, /*Count*/ None,
+ /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
+ UnrollOpts.AllowUpperBound, LocalAllowPeeling,
+ UnrollOpts.AllowProfileBasedPeeling, UnrollOpts.FullUnrollMaxCount);
+ Changed |= Result != LoopUnrollResult::Unmodified;
+
+ // The parent must not be damaged by unrolling!
+#ifndef NDEBUG
+ if (Result != LoopUnrollResult::Unmodified && ParentL)
+ ParentL->verifyLoop();
+#endif
+
+ // Clear any cached analysis results for L if we removed it completely.
+ if (LAM && Result == LoopUnrollResult::FullyUnrolled)
+ LAM->clear(L, LoopName);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
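For readers unfamiliar with how this new-pass-manager entry point is driven, here is a minimal, hypothetical sketch (not part of the diff above) that builds the analysis managers with PassBuilder and runs LoopUnrollPass over every function in a module. The wrapper function and its name are assumptions made purely for illustration.

// Hypothetical driver sketch: run LoopUnrollPass via the new pass manager.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LoopUnrollPass.h"

static void runUnrollSketch(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  // Register the analyses and wire up the cross-manager proxies so that
  // LoopUnrollPass::run can query ScalarEvolution, LoopInfo, TTI, etc.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  // OptLevel 2 with partial and runtime unrolling enabled; these are the
  // same knobs that end up forwarded to tryToUnrollLoop above.
  FPM.addPass(llvm::LoopUnrollPass(
      llvm::LoopUnrollOptions(2).setPartial(true).setRuntime(true)));

  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}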
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
index a4f67ba667..822a786fc7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -1,645 +1,645 @@
-//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms loops that contain branches on loop-invariant conditions
-// to multiple loops. For example, it turns the left into the right code:
-//
-// for (...) if (lic)
-// A for (...)
-// if (lic) A; B; C
-// B else
-// C for (...)
-// A; C
-//
-// This can increase the size of the code exponentially (doubling it every time
-// a loop is unswitched) so we only unswitch if the resultant code will be
-// smaller than a threshold.
-//
-// This pass expects LICM to be run before it to hoist invariant conditions out
-// of the loop, to make the unswitching opportunity obvious.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
+//===- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to multiple loops. For example, it turns the left into the right code:
+//
+// for (...) if (lic)
+// A for (...)
+// if (lic) A; B; C
+// B else
+// C for (...)
+// A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
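As a concrete, source-level companion to the A/B/C sketch in the header comment above, the following hypothetical C++ example (not taken from this diff) shows what the transform does to a loop whose branch condition `lic` is loop-invariant.

// Before unswitching: the invariant condition is re-tested every iteration.
void before(bool lic, int n, int *a) {
  for (int i = 0; i < n; ++i) {
    a[i] += 1;       // A
    if (lic)
      a[i] *= 2;     // B
    a[i] -= 3;       // C
  }
}

// After unswitching: the test is hoisted and the loop body is duplicated, so
// each copy is straight-line code (at the cost of roughly doubling the size).
void after(bool lic, int n, int *a) {
  if (lic) {
    for (int i = 0; i < n; ++i) {
      a[i] += 1;     // A
      a[i] *= 2;     // B
      a[i] -= 3;     // C
    }
  } else {
    for (int i = 0; i < n; ++i) {
      a[i] += 1;     // A
      a[i] -= 3;     // C
    }
  }
}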
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unswitch"
-
-STATISTIC(NumBranches, "Number of branches unswitched");
-STATISTIC(NumSwitches, "Number of switches unswitched");
-STATISTIC(NumGuards, "Number of guards unswitched");
-STATISTIC(NumSelects , "Number of selects unswitched");
-STATISTIC(NumTrivial , "Number of unswitches that are trivial");
-STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
-STATISTIC(TotalInsts, "Total number of instructions analyzed");
-
-// The specific value of 100 here was chosen based only on intuition and a
-// few specific examples.
-static cl::opt<unsigned>
-Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
- cl::init(100), cl::Hidden);
-
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unswitch"
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+STATISTIC(TotalInsts, "Total number of instructions analyzed");
+
+// The specific value of 100 here was chosen based only on intuition and a
+// few specific examples.
+static cl::opt<unsigned>
+Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+ cl::init(100), cl::Hidden);
+
static cl::opt<unsigned>
MSSAThreshold("loop-unswitch-memoryssa-threshold",
cl::desc("Max number of memory uses to explore during "
"partial unswitching analysis"),
cl::init(100), cl::Hidden);
-namespace {
-
- class LUAnalysisCache {
- using UnswitchedValsMap =
- DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
- using UnswitchedValsIt = UnswitchedValsMap::iterator;
-
- struct LoopProperties {
- unsigned CanBeUnswitchedCount;
- unsigned WasUnswitchedCount;
- unsigned SizeEstimation;
- UnswitchedValsMap UnswitchedVals;
- };
-
- // Here we use std::map instead of DenseMap, since we need to keep a valid
- // LoopProperties pointer for the current loop for better performance.
- using LoopPropsMap = std::map<const Loop *, LoopProperties>;
- using LoopPropsMapIt = LoopPropsMap::iterator;
-
- LoopPropsMap LoopsProperties;
- UnswitchedValsMap *CurLoopInstructions = nullptr;
- LoopProperties *CurrentLoopProperties = nullptr;
-
- // A loop unswitching with an estimated cost above this threshold
- // is not performed. MaxSize is turned into unswitching quota for
- // the current loop, and reduced correspondingly, though note that
- // the quota is returned by releaseMemory() when the loop has been
- // processed, so that MaxSize will return to its previous
- // value. So in most cases MaxSize will equal the Threshold flag
- // when a new loop is processed. An exception to that is that
- // MaxSize will have a smaller value while processing nested loops
- // that were introduced due to loop unswitching of an outer loop.
- //
- // FIXME: The way that MaxSize works is subtle and depends on the
- // pass manager processing loops and calling releaseMemory() in a
- // specific order. It would be good to find a more straightforward
- // way of doing what MaxSize does.
- unsigned MaxSize;
-
- public:
- LUAnalysisCache() : MaxSize(Threshold) {}
-
- // Analyze the loop: check its size and determine whether it is possible
- // to unswitch it. Returns true if we can unswitch this loop.
- bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionCache *AC);
-
- // Clean all data related to given loop.
- void forgetLoop(const Loop *L);
-
- // Mark a case value as unswitched.
- // Since a SwitchInst can be partly unswitched, keep track of all
- // unswitched values to avoid extra unswitching in cloned loops.
- void setUnswitched(const SwitchInst *SI, const Value *V);
-
- // Check whether this case value has already been unswitched.
- bool isUnswitched(const SwitchInst *SI, const Value *V);
-
- // Returns true if another unswitching could be done within the cost
- // threshold.
- bool costAllowsUnswitching();
-
- // Clone all loop-unswitch related loop properties.
- // Redistribute unswitching quotas.
- // Note that the new loop data is stored inside the VMap.
- void cloneData(const Loop *NewLoop, const Loop *OldLoop,
- const ValueToValueMapTy &VMap);
- };
-
- class LoopUnswitch : public LoopPass {
- LoopInfo *LI; // Loop information
- LPPassManager *LPM;
- AssumptionCache *AC;
-
- // Used to check if second loop needs processing after
- // rewriteLoopBodyWithConditionConstant rewrites first loop.
- std::vector<Loop*> LoopProcessWorklist;
-
- LUAnalysisCache BranchesInfo;
-
- bool OptimizeForSize;
- bool RedoLoop = false;
-
- Loop *CurrentLoop = nullptr;
- DominatorTree *DT = nullptr;
- MemorySSA *MSSA = nullptr;
+namespace {
+
+ class LUAnalysisCache {
+ using UnswitchedValsMap =
+ DenseMap<const SwitchInst *, SmallPtrSet<const Value *, 8>>;
+ using UnswitchedValsIt = UnswitchedValsMap::iterator;
+
+ struct LoopProperties {
+ unsigned CanBeUnswitchedCount;
+ unsigned WasUnswitchedCount;
+ unsigned SizeEstimation;
+ UnswitchedValsMap UnswitchedVals;
+ };
+
+ // Here we use std::map instead of DenseMap, since we need to keep a valid
+ // LoopProperties pointer for the current loop for better performance.
+ using LoopPropsMap = std::map<const Loop *, LoopProperties>;
+ using LoopPropsMapIt = LoopPropsMap::iterator;
+
+ LoopPropsMap LoopsProperties;
+ UnswitchedValsMap *CurLoopInstructions = nullptr;
+ LoopProperties *CurrentLoopProperties = nullptr;
+
+ // A loop unswitching with an estimated cost above this threshold
+ // is not performed. MaxSize is turned into unswitching quota for
+ // the current loop, and reduced correspondingly, though note that
+ // the quota is returned by releaseMemory() when the loop has been
+ // processed, so that MaxSize will return to its previous
+ // value. So in most cases MaxSize will equal the Threshold flag
+ // when a new loop is processed. An exception to that is that
+ // MaxSize will have a smaller value while processing nested loops
+ // that were introduced due to loop unswitching of an outer loop.
+ //
+ // FIXME: The way that MaxSize works is subtle and depends on the
+ // pass manager processing loops and calling releaseMemory() in a
+ // specific order. It would be good to find a more straightforward
+ // way of doing what MaxSize does.
+ unsigned MaxSize;
+
+ public:
+ LUAnalysisCache() : MaxSize(Threshold) {}
+
+ // Analyze the loop: check its size and determine whether it is possible
+ // to unswitch it. Returns true if we can unswitch this loop.
+ bool countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC);
+
+ // Clean all data related to given loop.
+ void forgetLoop(const Loop *L);
+
+ // Mark a case value as unswitched.
+ // Since a SwitchInst can be partly unswitched, keep track of all
+ // unswitched values to avoid extra unswitching in cloned loops.
+ void setUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Check whether this case value has already been unswitched.
+ bool isUnswitched(const SwitchInst *SI, const Value *V);
+
+ // Returns true if another unswitching could be done within the cost
+ // threshold.
+ bool costAllowsUnswitching();
+
+ // Clone all loop-unswitch related loop properties.
+ // Redistribute unswitching quotas.
+ // Note that the new loop data is stored inside the VMap.
+ void cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap);
+ };
+
+ class LoopUnswitch : public LoopPass {
+ LoopInfo *LI; // Loop information
+ LPPassManager *LPM;
+ AssumptionCache *AC;
+
+ // Used to check if second loop needs processing after
+ // rewriteLoopBodyWithConditionConstant rewrites first loop.
+ std::vector<Loop*> LoopProcessWorklist;
+
+ LUAnalysisCache BranchesInfo;
+
+ bool OptimizeForSize;
+ bool RedoLoop = false;
+
+ Loop *CurrentLoop = nullptr;
+ DominatorTree *DT = nullptr;
+ MemorySSA *MSSA = nullptr;
AAResults *AA = nullptr;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- BasicBlock *LoopHeader = nullptr;
- BasicBlock *LoopPreheader = nullptr;
-
- bool SanitizeMemory;
- SimpleLoopSafetyInfo SafetyInfo;
-
- // LoopBlocks contains all of the basic blocks of the loop, including the
- // preheader of the loop, the body of the loop, and the exit blocks of the
- // loop, in that order.
- std::vector<BasicBlock*> LoopBlocks;
- // NewBlocks contains cloned copies of the basic blocks from LoopBlocks.
- std::vector<BasicBlock*> NewBlocks;
-
- bool HasBranchDivergence;
-
- public:
- static char ID; // Pass ID, replacement for typeid
-
- explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false)
- : LoopPass(ID), OptimizeForSize(Os),
- HasBranchDivergence(HasBranchDivergence) {
- initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- bool processCurrentLoop();
- bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG.
- ///
- void getAnalysisUsage(AnalysisUsage &AU) const override {
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ BasicBlock *LoopHeader = nullptr;
+ BasicBlock *LoopPreheader = nullptr;
+
+ bool SanitizeMemory;
+ SimpleLoopSafetyInfo SafetyInfo;
+
+ // LoopBlocks contains all of the basic blocks of the loop, including the
+ // preheader of the loop, the body of the loop, and the exit blocks of the
+ // loop, in that order.
+ std::vector<BasicBlock*> LoopBlocks;
+ // NewBlocks contains cloned copies of the basic blocks from LoopBlocks.
+ std::vector<BasicBlock*> NewBlocks;
+
+ bool HasBranchDivergence;
+
+ public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false)
+ : LoopPass(ID), OptimizeForSize(Os),
+ HasBranchDivergence(HasBranchDivergence) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool processCurrentLoop();
+ bool isUnreachableDueToPreviousUnswitching(BasicBlock *);
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG.
+ ///
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
// Lazy BFI and BPI are marked as preserved here so Loop Unswitching
// can remain part of the same loop pass as LICM
AU.addPreserved<LazyBlockFrequencyInfoPass>();
AU.addPreserved<LazyBranchProbabilityInfoPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- if (HasBranchDivergence)
- AU.addRequired<LegacyDivergenceAnalysis>();
- getLoopAnalysisUsage(AU);
- }
-
- private:
- void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); }
-
- void initLoopData() {
- LoopHeader = CurrentLoop->getHeader();
- LoopPreheader = CurrentLoop->getLoopPreheader();
- }
-
- /// Split all of the edges from inside the loop to their exit blocks.
- /// Update the appropriate Phi nodes as we do so.
- void splitExitEdges(Loop *L,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks);
-
- bool tryTrivialLoopUnswitch(bool &Changed);
-
- bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ if (HasBranchDivergence)
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ getLoopAnalysisUsage(AU);
+ }
+
+ private:
+ void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); }
+
+ void initLoopData() {
+ LoopHeader = CurrentLoop->getHeader();
+ LoopPreheader = CurrentLoop->getLoopPreheader();
+ }
+
+ /// Split all of the edges from inside the loop to their exit blocks.
+ /// Update the appropriate Phi nodes as we do so.
+ void splitExitEdges(Loop *L,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks);
+
+ bool tryTrivialLoopUnswitch(bool &Changed);
+
+ bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI = nullptr,
ArrayRef<Instruction *> ToDuplicate = {});
- void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
- BasicBlock *ExitBlock, Instruction *TI);
- void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock, Instruction *TI);
+ void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
Instruction *TI,
ArrayRef<Instruction *> ToDuplicate = {});
-
- void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
- Constant *Val, bool IsEqual);
-
+
+ void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool IsEqual);
+
void
emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest, BasicBlock *FalseDest,
BranchInst *OldBranch, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate = {});
-
- void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
-
- /// Given that the Invariant is not equal to Val, simplify instructions
- /// in the loop.
- Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
- Constant *Val);
- };
-
-} // end anonymous namespace
-
-// Analyze the loop: check its size and determine whether it is possible
-// to unswitch it. Returns true if we can unswitch this loop.
-bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
- AssumptionCache *AC) {
- LoopPropsMapIt PropsIt;
- bool Inserted;
- std::tie(PropsIt, Inserted) =
- LoopsProperties.insert(std::make_pair(L, LoopProperties()));
-
- LoopProperties &Props = PropsIt->second;
-
- if (Inserted) {
- // New loop.
-
- // Limit the number of instructions to avoid causing significant code
- // expansion, and the number of basic blocks, to avoid loops with
- // large numbers of branches which cause loop unswitching to go crazy.
- // This is a very ad-hoc heuristic.
-
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- // FIXME: This is overly conservative because it does not take into
- // consideration code simplification opportunities and code that can
- // be shared by the resultant unswitched loops.
- CodeMetrics Metrics;
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
- ++I)
- Metrics.analyzeBasicBlock(*I, TTI, EphValues);
-
- Props.SizeEstimation = Metrics.NumInsts;
- Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
- Props.WasUnswitchedCount = 0;
- MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
-
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
- << ", contents cannot be "
- << "duplicated!\n");
- return false;
- }
- }
-
- // Be careful: these links are valid only until a new loop is added.
- CurrentLoopProperties = &Props;
- CurLoopInstructions = &Props.UnswitchedVals;
-
- return true;
-}
-
-// Clean all data related to given loop.
-void LUAnalysisCache::forgetLoop(const Loop *L) {
- LoopPropsMapIt LIt = LoopsProperties.find(L);
-
- if (LIt != LoopsProperties.end()) {
- LoopProperties &Props = LIt->second;
- MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
- Props.SizeEstimation;
- LoopsProperties.erase(LIt);
- }
-
- CurrentLoopProperties = nullptr;
- CurLoopInstructions = nullptr;
-}
-
-// Mark a case value as unswitched.
-// Since a SwitchInst can be partly unswitched, keep track of all
-// unswitched values to avoid extra unswitching in cloned loops.
-void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
- (*CurLoopInstructions)[SI].insert(V);
-}
-
-// Check whether this case value has already been unswitched.
-bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
- return (*CurLoopInstructions)[SI].count(V);
-}
-
-bool LUAnalysisCache::costAllowsUnswitching() {
- return CurrentLoopProperties->CanBeUnswitchedCount > 0;
-}
-
-// Clone all loop-unswitch related loop properties.
-// Redistribute unswitching quotas.
-// Note that the new loop data is stored inside the VMap.
-void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
- const ValueToValueMapTy &VMap) {
- LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
- LoopProperties &OldLoopProps = *CurrentLoopProperties;
- UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
-
- // Reallocate "can-be-unswitched quota"
-
- --OldLoopProps.CanBeUnswitchedCount;
- ++OldLoopProps.WasUnswitchedCount;
- NewLoopProps.WasUnswitchedCount = 0;
- unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
- NewLoopProps.CanBeUnswitchedCount = Quota / 2;
- OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
-
- NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
-
- // Clone unswitched values info:
- // for the new loop's switches we clone info about values that were
- // already unswitched and have redundant successors.
- for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
- const SwitchInst *OldInst = I->first;
- Value *NewI = VMap.lookup(OldInst);
- const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
- assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
-
- NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
- }
-}
-
-char LoopUnswitch::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
- false, false)
-
-Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) {
- return new LoopUnswitch(Os, HasBranchDivergence);
-}
-
-/// Operator chain lattice.
-enum OperatorChain {
- OC_OpChainNone, ///< There is no operator.
- OC_OpChainOr, ///< There are only ORs.
- OC_OpChainAnd, ///< There are only ANDs.
- OC_OpChainMixed ///< There are ANDs and ORs.
-};
-
-/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
-/// an invariant piece, return the invariant. Otherwise, return null.
-//
-/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a
-/// mixed operator chain, as we can not reliably find a value which will
-/// simplify the operator chain. If the chain is AND-only or OR-only, we can use
-/// 0 or ~0 to simplify the chain.
-///
-/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to
-/// simplify the condition itself to a loop variant condition, but at the
-/// cost of creating an entirely new loop.
-static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
- OperatorChain &ParentChain,
- DenseMap<Value *, Value *> &Cache,
- MemorySSAUpdater *MSSAU) {
- auto CacheIt = Cache.find(Cond);
- if (CacheIt != Cache.end())
- return CacheIt->second;
-
- // We started analyzing a new instruction; increment the scanned-instruction counter.
- ++TotalInsts;
-
- // We can never unswitch on vector conditions.
- if (Cond->getType()->isVectorTy())
- return nullptr;
-
- // Constants should be folded, not unswitched on!
- if (isa<Constant>(Cond)) return nullptr;
-
- // TODO: Handle: br (VARIANT|INVARIANT).
-
- // Hoist simple values out.
- if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
- Cache[Cond] = Cond;
- return Cond;
- }
-
- // Walk up the operator chain to find partial invariant conditions.
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
- if (BO->getOpcode() == Instruction::And ||
- BO->getOpcode() == Instruction::Or) {
- // Given the previous operator, compute the current operator chain status.
- OperatorChain NewChain;
- switch (ParentChain) {
- case OC_OpChainNone:
- NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
- OC_OpChainOr;
- break;
- case OC_OpChainOr:
- NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
- OC_OpChainMixed;
- break;
- case OC_OpChainAnd:
- NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
- OC_OpChainMixed;
- break;
- case OC_OpChainMixed:
- NewChain = OC_OpChainMixed;
- break;
- }
-
- // If we reach a Mixed state, we do not want to keep walking up as we can not
- // reliably find a value that will simplify the chain. With this check, we
- // will return null on the first sight of mixed chain and the caller will
- // either backtrack to find partial LIV in other operand or return null.
- if (NewChain != OC_OpChainMixed) {
- // Update the current operator chain type before we search up the chain.
- ParentChain = NewChain;
- // If either the left or right side is invariant, we can unswitch on this,
- // which will cause the branch to go away in one loop and the condition to
- // simplify in the other one.
- if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
- ParentChain, Cache, MSSAU)) {
- Cache[Cond] = LHS;
- return LHS;
- }
- // We did not manage to find a partial LIV in operand(0). Backtrack and try
- // operand(1).
- ParentChain = NewChain;
- if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
- ParentChain, Cache, MSSAU)) {
- Cache[Cond] = RHS;
- return RHS;
- }
- }
- }
-
- Cache[Cond] = nullptr;
- return nullptr;
-}
-
-/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
-/// an invariant piece, return the invariant along with the operator chain type.
-/// Otherwise, return null.
-static std::pair<Value *, OperatorChain>
-findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
- MemorySSAUpdater *MSSAU) {
- DenseMap<Value *, Value *> Cache;
- OperatorChain OpChain = OC_OpChainNone;
- Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
-
- // In case we do find a LIV, it can not be obtained by walking up a mixed
- // operator chain.
- assert((!FCond || OpChain != OC_OpChainMixed) &&
- "Do not expect a partial LIV with mixed operator chain");
- return {FCond, OpChain};
-}
-
-bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
- if (skipLoop(L))
- return false;
-
- AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
- *L->getHeader()->getParent());
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LPM = &LPMRef;
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
+
+ /// Given that the Invariant is not equal to Val, simplify instructions
+ /// in the loop.
+ Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Constant *Val);
+ };
+
+} // end anonymous namespace
+
+// Analyze the loop: check its size and determine whether it is possible
+// to unswitch it. Returns true if we can unswitch this loop.
+bool LUAnalysisCache::countLoop(const Loop *L, const TargetTransformInfo &TTI,
+ AssumptionCache *AC) {
+ LoopPropsMapIt PropsIt;
+ bool Inserted;
+ std::tie(PropsIt, Inserted) =
+ LoopsProperties.insert(std::make_pair(L, LoopProperties()));
+
+ LoopProperties &Props = PropsIt->second;
+
+ if (Inserted) {
+ // New loop.
+
+ // Limit the number of instructions to avoid causing significant code
+ // expansion, and the number of basic blocks, to avoid loops with
+ // large numbers of branches which cause loop unswitching to go crazy.
+ // This is a very ad-hoc heuristic.
+
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ // FIXME: This is overly conservative because it does not take into
+ // consideration code simplification opportunities and code that can
+ // be shared by the resultant unswitched loops.
+ CodeMetrics Metrics;
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end(); I != E;
+ ++I)
+ Metrics.analyzeBasicBlock(*I, TTI, EphValues);
+
+ Props.SizeEstimation = Metrics.NumInsts;
+ Props.CanBeUnswitchedCount = MaxSize / (Props.SizeEstimation);
+ Props.WasUnswitchedCount = 0;
+ MaxSize -= Props.SizeEstimation * Props.CanBeUnswitchedCount;
+
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %" << L->getHeader()->getName()
+ << ", contents cannot be "
+ << "duplicated!\n");
+ return false;
+ }
+ }
+
+ // Be careful: these links are valid only until a new loop is added.
+ CurrentLoopProperties = &Props;
+ CurLoopInstructions = &Props.UnswitchedVals;
+
+ return true;
+}
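To make the quota arithmetic above concrete, here is a small, self-contained worked example with hypothetical numbers (the default -loop-unswitch-threshold of 100 and a loop whose Metrics.NumInsts is 20):

#include <cassert>

int main() {
  unsigned MaxSize = 100;        // starts at the Threshold flag's value
  unsigned SizeEstimation = 20;  // Metrics.NumInsts for the new loop
  unsigned CanBeUnswitchedCount = MaxSize / SizeEstimation;  // 100 / 20 = 5
  MaxSize -= SizeEstimation * CanBeUnswitchedCount;          // 100 - 100 = 0
  // The loop may be unswitched up to 5 times; until forgetLoop() returns the
  // quota, no budget remains for other loops processed in the meantime.
  assert(CanBeUnswitchedCount == 5 && MaxSize == 0);
  return 0;
}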
+
+// Clean all data related to given loop.
+void LUAnalysisCache::forgetLoop(const Loop *L) {
+ LoopPropsMapIt LIt = LoopsProperties.find(L);
+
+ if (LIt != LoopsProperties.end()) {
+ LoopProperties &Props = LIt->second;
+ MaxSize += (Props.CanBeUnswitchedCount + Props.WasUnswitchedCount) *
+ Props.SizeEstimation;
+ LoopsProperties.erase(LIt);
+ }
+
+ CurrentLoopProperties = nullptr;
+ CurLoopInstructions = nullptr;
+}
+
+// Mark a case value as unswitched.
+// Since a SwitchInst can be partly unswitched, keep track of all
+// unswitched values to avoid extra unswitching in cloned loops.
+void LUAnalysisCache::setUnswitched(const SwitchInst *SI, const Value *V) {
+ (*CurLoopInstructions)[SI].insert(V);
+}
+
+// Check whether this case value has already been unswitched.
+bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
+ return (*CurLoopInstructions)[SI].count(V);
+}
+
+bool LUAnalysisCache::costAllowsUnswitching() {
+ return CurrentLoopProperties->CanBeUnswitchedCount > 0;
+}
+
+// Clone all loop-unswitch related loop properties.
+// Redistribute unswitching quotas.
+// Note that the new loop data is stored inside the VMap.
+void LUAnalysisCache::cloneData(const Loop *NewLoop, const Loop *OldLoop,
+ const ValueToValueMapTy &VMap) {
+ LoopProperties &NewLoopProps = LoopsProperties[NewLoop];
+ LoopProperties &OldLoopProps = *CurrentLoopProperties;
+ UnswitchedValsMap &Insts = OldLoopProps.UnswitchedVals;
+
+ // Reallocate "can-be-unswitched quota"
+
+ --OldLoopProps.CanBeUnswitchedCount;
+ ++OldLoopProps.WasUnswitchedCount;
+ NewLoopProps.WasUnswitchedCount = 0;
+ unsigned Quota = OldLoopProps.CanBeUnswitchedCount;
+ NewLoopProps.CanBeUnswitchedCount = Quota / 2;
+ OldLoopProps.CanBeUnswitchedCount = Quota - Quota / 2;
+
+ NewLoopProps.SizeEstimation = OldLoopProps.SizeEstimation;
+
+ // Clone unswitched values info:
+ // for the new loop's switches we clone info about values that were
+ // already unswitched and have redundant successors.
+ for (UnswitchedValsIt I = Insts.begin(); I != Insts.end(); ++I) {
+ const SwitchInst *OldInst = I->first;
+ Value *NewI = VMap.lookup(OldInst);
+ const SwitchInst *NewInst = cast_or_null<SwitchInst>(NewI);
+ assert(NewInst && "All instructions that are in SrcBB must be in VMap.");
+
+ NewLoopProps.UnswitchedVals[NewInst] = OldLoopProps.UnswitchedVals[OldInst];
+ }
+}
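Continuing the hypothetical numbers from the countLoop example, this sketch reproduces cloneData's bookkeeping when the original loop had a quota of 5 and one unswitch has just produced a clone:

#include <cassert>

int main() {
  unsigned OldCanBe = 5, OldWas = 0;
  --OldCanBe;                        // the unswitch that triggered the clone
  ++OldWas;
  unsigned Quota = OldCanBe;         // 4
  unsigned NewCanBe = Quota / 2;     // 2 for the cloned loop
  OldCanBe = Quota - Quota / 2;      // 2 remain for the original loop
  // Both loops keep the same SizeEstimation, so forgetLoop() later returns
  // (CanBeUnswitchedCount + WasUnswitchedCount) * SizeEstimation to MaxSize.
  assert(NewCanBe == 2 && OldCanBe == 2 && OldWas == 1);
  return 0;
}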
+
+char LoopUnswitch::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
+ false, false)
+
+Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) {
+ return new LoopUnswitch(Os, HasBranchDivergence);
+}
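A minimal, hypothetical sketch of how this factory is typically consumed with the legacy pass manager; the wrapper function below is an assumption for illustration, not code from this tree.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"  // declares createLoopUnswitchPass

static void runLegacyUnswitchSketch(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  // Os = false: permit non-trivial unswitching even when not optimizing for
  // size. HasBranchDivergence = false: skip the divergence analysis that
  // GPU-like targets would request.
  PM.add(llvm::createLoopUnswitchPass(/*Os=*/false,
                                      /*HasBranchDivergence=*/false));
  PM.run(M);
}

The legacy pass manager schedules the analyses declared in getAnalysisUsage (LoopInfo, DominatorTree, AssumptionCache, TTI, and optionally MemorySSA) automatically, so the sketch only needs to add the transform itself.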
+
+/// Operator chain lattice.
+enum OperatorChain {
+ OC_OpChainNone, ///< There is no operator.
+ OC_OpChainOr, ///< There are only ORs.
+ OC_OpChainAnd, ///< There are only ANDs.
+ OC_OpChainMixed ///< There are ANDs and ORs.
+};
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant. Otherwise, return null.
+//
+/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we can not reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can use
+/// 0 or ~0 to simplify the chain.
+///
+/// NOTE: In case a partial LIV and a mixed operator chain, we may be able to
+/// simplify the condition itself to a loop variant condition, but at the
+/// cost of creating an entirely new loop.
+static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ OperatorChain &ParentChain,
+ DenseMap<Value *, Value *> &Cache,
+ MemorySSAUpdater *MSSAU) {
+ auto CacheIt = Cache.find(Cond);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+ // We started analyzing a new instruction; increment the scanned-instruction counter.
+ ++TotalInsts;
+
+ // We can never unswitch on vector conditions.
+ if (Cond->getType()->isVectorTy())
+ return nullptr;
+
+ // Constants should be folded, not unswitched on!
+ if (isa<Constant>(Cond)) return nullptr;
+
+ // TODO: Handle: br (VARIANT|INVARIANT).
+
+ // Hoist simple values out.
+ if (L->makeLoopInvariant(Cond, Changed, nullptr, MSSAU)) {
+ Cache[Cond] = Cond;
+ return Cond;
+ }
+
+ // Walk up the operator chain to find partial invariant conditions.
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+ if (BO->getOpcode() == Instruction::And ||
+ BO->getOpcode() == Instruction::Or) {
+ // Given the previous operator, compute the current operator chain status.
+ OperatorChain NewChain;
+ switch (ParentChain) {
+ case OC_OpChainNone:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainOr;
+ break;
+ case OC_OpChainOr:
+ NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainAnd:
+ NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+ OC_OpChainMixed;
+ break;
+ case OC_OpChainMixed:
+ NewChain = OC_OpChainMixed;
+ break;
+ }
+
+ // If we reach a Mixed state, we do not want to keep walking up as we can not
+ // reliably find a value that will simplify the chain. With this check, we
+ // will return null on the first sight of mixed chain and the caller will
+ // either backtrack to find partial LIV in other operand or return null.
+ if (NewChain != OC_OpChainMixed) {
+ // Update the current operator chain type before we search up the chain.
+ ParentChain = NewChain;
+ // If either the left or right side is invariant, we can unswitch on this,
+ // which will cause the branch to go away in one loop and the condition to
+ // simplify in the other one.
+ if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
+ ParentChain, Cache, MSSAU)) {
+ Cache[Cond] = LHS;
+ return LHS;
+ }
+ // We did not manage to find a partial LIV in operand(0). Backtrack and try
+ // operand(1).
+ ParentChain = NewChain;
+ if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
+ ParentChain, Cache, MSSAU)) {
+ Cache[Cond] = RHS;
+ return RHS;
+ }
+ }
+ }
+
+ Cache[Cond] = nullptr;
+ return nullptr;
+}
+
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain>
+findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+ MemorySSAUpdater *MSSAU) {
+ DenseMap<Value *, Value *> Cache;
+ OperatorChain OpChain = OC_OpChainNone;
+ Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
+
+ // In case we do find a LIV, it can not be obtained by walking up a mixed
+ // operator chain.
+ assert((!FCond || OpChain != OC_OpChainMixed) &&
+ "Do not expect a partial LIV with mixed operator chain");
+ return {FCond, OpChain};
+}
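As a hypothetical source-level illustration of what the AND-chain walk above can recover: when only one operand of the branch condition is loop-invariant, findLIVLoopCondition still returns that operand as a partial invariant, and unswitching on it lets one loop copy fold the whole chain away.

// Hypothetical example of a partial loop-invariant condition (an AND-chain
// of an invariant piece and a variant piece). `inv` never changes inside the
// loop, but `a[i] > 0` does, so only `inv` is a candidate for unswitching.
void partialLIV(bool inv, int n, int *a) {
  for (int i = 0; i < n; ++i) {
    if (inv && a[i] > 0)
      a[i] = 0;
  }
  // In the loop copy where `inv` is assumed false, the whole chain folds to
  // false and the branch disappears; the other copy keeps only the variant
  // test `a[i] > 0`.
}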
+
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
+ if (skipLoop(L))
+ return false;
+
+ AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
+ *L->getHeader()->getParent());
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LPM = &LPMRef;
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- assert(DT && "Cannot update MemorySSA without a valid DomTree.");
- }
- CurrentLoop = L;
- Function *F = CurrentLoop->getHeader()->getParent();
-
- SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
- if (SanitizeMemory)
- SafetyInfo.computeLoopSafetyInfo(L);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- bool Changed = false;
- do {
- assert(CurrentLoop->isLCSSAForm(*DT));
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- RedoLoop = false;
- Changed |= processCurrentLoop();
- } while (RedoLoop);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- return Changed;
-}
-
-// Return true if the BasicBlock BB is unreachable from the loop header.
-// Return false, otherwise.
-bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
- auto *Node = DT->getNode(BB)->getIDom();
- BasicBlock *DomBB = Node->getBlock();
- while (CurrentLoop->contains(DomBB)) {
- BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
-
- Node = DT->getNode(DomBB)->getIDom();
- DomBB = Node->getBlock();
-
- if (!BInst || !BInst->isConditional())
- continue;
-
- Value *Cond = BInst->getCondition();
- if (!isa<ConstantInt>(Cond))
- continue;
-
- BasicBlock *UnreachableSucc =
- Cond == ConstantInt::getTrue(Cond->getContext())
- ? BInst->getSuccessor(1)
- : BInst->getSuccessor(0);
-
- if (DT->dominates(UnreachableSucc, BB))
- return true;
- }
- return false;
-}
-
-/// FIXME: Remove this workaround when freeze related patches are done.
-/// LoopUnswitch and equality propagation in GVN disagree about whether a
-/// branch on undef/poison has undefined behavior. This is here to rule out
-/// some common cases where we found that discrepancy already causing
-/// problems. Details can be found in PR31652. Note that if this function
-/// returns true, the transform is unsafe; if it returns false, it is not
-/// necessarily safe.
-static bool equalityPropUnSafe(Value &LoopCond) {
- ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
- if (!CI || !CI->isEquality())
- return false;
-
- Value *LHS = CI->getOperand(0);
- Value *RHS = CI->getOperand(1);
- if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
- return true;
-
- auto HasUndefInPHI = [](PHINode &PN) {
- for (Value *Opd : PN.incoming_values()) {
- if (isa<UndefValue>(Opd))
- return true;
- }
- return false;
- };
- PHINode *LPHI = dyn_cast<PHINode>(LHS);
- PHINode *RPHI = dyn_cast<PHINode>(RHS);
- if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
- return true;
-
- auto HasUndefInSelect = [](SelectInst &SI) {
- if (isa<UndefValue>(SI.getTrueValue()) ||
- isa<UndefValue>(SI.getFalseValue()))
- return true;
- return false;
- };
- SelectInst *LSI = dyn_cast<SelectInst>(LHS);
- SelectInst *RSI = dyn_cast<SelectInst>(RHS);
- if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
- return true;
- return false;
-}
-
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ assert(DT && "Cannot update MemorySSA without a valid DomTree.");
+ }
+ CurrentLoop = L;
+ Function *F = CurrentLoop->getHeader()->getParent();
+
+ SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
+ if (SanitizeMemory)
+ SafetyInfo.computeLoopSafetyInfo(L);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ bool Changed = false;
+ do {
+ assert(CurrentLoop->isLCSSAForm(*DT));
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ RedoLoop = false;
+ Changed |= processCurrentLoop();
+ } while (RedoLoop);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ return Changed;
+}
+
+// Return true if the BasicBlock BB is unreachable from the loop header.
+// Return false, otherwise.
+bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
+ auto *Node = DT->getNode(BB)->getIDom();
+ BasicBlock *DomBB = Node->getBlock();
+ while (CurrentLoop->contains(DomBB)) {
+ BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
+
+ Node = DT->getNode(DomBB)->getIDom();
+ DomBB = Node->getBlock();
+
+ if (!BInst || !BInst->isConditional())
+ continue;
+
+ Value *Cond = BInst->getCondition();
+ if (!isa<ConstantInt>(Cond))
+ continue;
+
+ BasicBlock *UnreachableSucc =
+ Cond == ConstantInt::getTrue(Cond->getContext())
+ ? BInst->getSuccessor(1)
+ : BInst->getSuccessor(0);
+
+ if (DT->dominates(UnreachableSucc, BB))
+ return true;
+ }
+ return false;
+}
+
+/// FIXME: Remove this workaround when freeze related patches are done.
+/// LoopUnswitch and equality propagation in GVN disagree about whether a
+/// branch on undef/poison has undefined behavior. This is here to rule out
+/// some common cases where we found that discrepancy already causing
+/// problems. Details can be found in PR31652. Note that if this function
+/// returns true, the transform is unsafe; if it returns false, it is not
+/// necessarily safe.
+static bool equalityPropUnSafe(Value &LoopCond) {
+ ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
+ if (!CI || !CI->isEquality())
+ return false;
+
+ Value *LHS = CI->getOperand(0);
+ Value *RHS = CI->getOperand(1);
+ if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
+ return true;
+
+ auto HasUndefInPHI = [](PHINode &PN) {
+ for (Value *Opd : PN.incoming_values()) {
+ if (isa<UndefValue>(Opd))
+ return true;
+ }
+ return false;
+ };
+ PHINode *LPHI = dyn_cast<PHINode>(LHS);
+ PHINode *RPHI = dyn_cast<PHINode>(RHS);
+ if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
+ return true;
+
+ auto HasUndefInSelect = [](SelectInst &SI) {
+ if (isa<UndefValue>(SI.getTrueValue()) ||
+ isa<UndefValue>(SI.getFalseValue()))
+ return true;
+ return false;
+ };
+ SelectInst *LSI = dyn_cast<SelectInst>(LHS);
+ SelectInst *RSI = dyn_cast<SelectInst>(RHS);
+ if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
+ return true;
+ return false;
+}
+
/// Check if the loop header has a conditional branch that is not
/// loop-invariant, because it involves load instructions. If all paths from
/// either the true or false successor to the header or loop exists do not
@@ -779,205 +779,205 @@ hasPartialIVCondition(Loop *L, MemorySSA &MSSA, AAResults *AA) {
return {};
}
-/// Do actual work and unswitch loop if possible and profitable.
-bool LoopUnswitch::processCurrentLoop() {
- bool Changed = false;
-
- initLoopData();
-
- // If LoopSimplify was unable to form a preheader, don't do any unswitching.
- if (!LoopPreheader)
- return false;
-
- // Loops with indirectbr cannot be cloned.
- if (!CurrentLoop->isSafeToClone())
- return false;
-
- // Without dedicated exits, splitting the exit edge may fail.
- if (!CurrentLoop->hasDedicatedExits())
- return false;
-
- LLVMContext &Context = LoopHeader->getContext();
-
- // Analyze loop cost, and stop unswitching if loop content can not be duplicated.
- if (!BranchesInfo.countLoop(
- CurrentLoop,
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *CurrentLoop->getHeader()->getParent()),
- AC))
- return false;
-
- // Try trivial unswitching first, before looping over the loop's other basic blocks.
- if (tryTrivialLoopUnswitch(Changed)) {
- return true;
- }
-
- // Do not do non-trivial unswitch while optimizing for size.
- // FIXME: Use Function::hasOptSize().
- if (OptimizeForSize ||
- LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+/// Do actual work and unswitch loop if possible and profitable.
+bool LoopUnswitch::processCurrentLoop() {
+ bool Changed = false;
+
+ initLoopData();
+
+ // If LoopSimplify was unable to form a preheader, don't do any unswitching.
+ if (!LoopPreheader)
+ return false;
+
+ // Loops with indirectbr cannot be cloned.
+ if (!CurrentLoop->isSafeToClone())
+ return false;
+
+ // Without dedicated exits, splitting the exit edge may fail.
+ if (!CurrentLoop->hasDedicatedExits())
+ return false;
+
+ LLVMContext &Context = LoopHeader->getContext();
+
+ // Analyze loop cost, and stop unswitching if loop content can not be duplicated.
+ if (!BranchesInfo.countLoop(
+ CurrentLoop,
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *CurrentLoop->getHeader()->getParent()),
+ AC))
+ return false;
+
+ // Try trivial unswitching first, before looping over the loop's other basic blocks.
+ if (tryTrivialLoopUnswitch(Changed)) {
+ return true;
+ }
+
+ // Do not do non-trivial unswitch while optimizing for size.
+ // FIXME: Use Function::hasOptSize().
+ if (OptimizeForSize ||
+ LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return Changed;
-
- // Run through the instructions in the loop, keeping track of three things:
- //
- // - That we do not unswitch loops containing convergent operations, as we
- // might be making them control dependent on the unswitch value when they
- // were not before.
- // FIXME: This could be refined to only bail if the convergent operation is
- // not already control-dependent on the unswitch value.
- //
- // - That basic blocks in the loop contain invokes whose predecessor edges we
- // cannot split.
- //
- // - The set of guard intrinsics encountered (these are non terminator
- // instructions that are also profitable to be unswitched).
-
- SmallVector<IntrinsicInst *, 4> Guards;
-
- for (const auto BB : CurrentLoop->blocks()) {
- for (auto &I : *BB) {
- auto *CB = dyn_cast<CallBase>(&I);
- if (!CB)
- continue;
- if (CB->isConvergent())
+
+ // Run through the instructions in the loop, keeping track of three things:
+ //
+ // - That we do not unswitch loops containing convergent operations, as we
+ // might be making them control dependent on the unswitch value when they
+ // were not before.
+ // FIXME: This could be refined to only bail if the convergent operation is
+ // not already control-dependent on the unswitch value.
+ //
+ // - That basic blocks in the loop contain invokes whose predecessor edges we
+ // cannot split.
+ //
+ // - The set of guard intrinsics encountered (these are non terminator
+ // instructions that are also profitable to be unswitched).
+
+ SmallVector<IntrinsicInst *, 4> Guards;
+
+ for (const auto BB : CurrentLoop->blocks()) {
+ for (auto &I : *BB) {
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (CB->isConvergent())
return Changed;
- if (auto *II = dyn_cast<InvokeInst>(&I))
- if (!II->getUnwindDest()->canSplitPredecessors())
+ if (auto *II = dyn_cast<InvokeInst>(&I))
+ if (!II->getUnwindDest()->canSplitPredecessors())
return Changed;
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::experimental_guard)
- Guards.push_back(II);
- }
- }
-
- for (IntrinsicInst *Guard : Guards) {
- Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
- // NB! Unswitching (if successful) could have erased some of the
- // instructions in Guards leaving dangling pointers there. This is fine
- // because we're returning now, and won't look at Guards again.
- ++NumGuards;
- return true;
- }
- }
-
- // Loop over all of the basic blocks in the loop. If we find an interior
- // block that is branching on a loop-invariant condition, we can unswitch this
- // loop.
- for (Loop::block_iterator I = CurrentLoop->block_begin(),
- E = CurrentLoop->block_end();
- I != E; ++I) {
- Instruction *TI = (*I)->getTerminator();
-
- // Unswitching on a potentially uninitialized predicate is not
- // MSan-friendly. Limit this to the cases when the original predicate is
- // guaranteed to execute, to avoid creating a use-of-uninitialized-value
- // in the code that did not have one.
- // This is a workaround for the discrepancy between LLVM IR and MSan
- // semantics. See PR28054 for more details.
- if (SanitizeMemory &&
- !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
- continue;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- // Some branches may be rendered unreachable because of previous
- // unswitching.
- // Unswitch only those branches that are reachable.
- if (isUnreachableDueToPreviousUnswitching(*I))
- continue;
-
- // If this isn't branching on an invariant condition, we can't unswitch
- // it.
- if (BI->isConditional()) {
- // See if this, or some part of it, is loop invariant. If so, we can
- // unswitch on it if we desire.
- Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
- ++NumBranches;
- return true;
- }
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Value *SC = SI->getCondition();
- Value *LoopCond;
- OperatorChain OpChain;
- std::tie(LoopCond, OpChain) =
- findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
-
- unsigned NumCases = SI->getNumCases();
- if (LoopCond && NumCases) {
- // Find a value to unswitch on:
- // FIXME: this should choose the most expensive case!
- // FIXME: scan for a case with a non-critical edge?
- Constant *UnswitchVal = nullptr;
- // Find a case value such that at least one case value is unswitched
- // out.
- if (OpChain == OC_OpChainAnd) {
- // If the chain only has ANDs and the switch has a case value of 0,
- // dropping a 0 into the chain will unswitch out the 0 case value.
- auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
- if (BranchesInfo.isUnswitched(SI, AllZero))
- continue;
- // We are unswitching 0 out.
- UnswitchVal = AllZero;
- } else if (OpChain == OC_OpChainOr) {
- // If the chain only has ORs and the switch has a case value of ~0.
- // Dropping in a ~0 to the chain will unswitch out the ~0-casevalue.
- auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
- if (BranchesInfo.isUnswitched(SI, AllOne))
- continue;
- // We are unswitching ~0 out.
- UnswitchVal = AllOne;
- } else {
- assert(OpChain == OC_OpChainNone &&
- "Expect to unswitch on trivial chain");
- // Do not process same value again and again.
- // At this point we have some cases already unswitched and
- // some not yet unswitched. Let's find the first not yet unswitched one.
- for (auto Case : SI->cases()) {
- Constant *UnswitchValCandidate = Case.getCaseValue();
- if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
- UnswitchVal = UnswitchValCandidate;
- break;
- }
- }
- }
-
- if (!UnswitchVal)
- continue;
-
- if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
- ++NumSwitches;
- // In case of a full LIV, UnswitchVal is the value we unswitched out.
- // In case of a partial LIV, we only unswitch when its an AND-chain
- // or OR-chain. In both cases switch input value simplifies to
- // UnswitchVal.
- BranchesInfo.setUnswitched(SI, UnswitchVal);
- return true;
- }
- }
- }
-
- // Scan the instructions to check for unswitchable values.
- for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
- BBI != E; ++BBI)
- if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
- Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
- if (LoopCond &&
- unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
- ++NumSelects;
- return true;
- }
- }
- }
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::experimental_guard)
+ Guards.push_back(II);
+ }
+ }
+
+ for (IntrinsicInst *Guard : Guards) {
+ Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ // NB! Unswitching (if successful) could have erased some of the
+ // instructions in Guards leaving dangling pointers there. This is fine
+ // because we're returning now, and won't look at Guards again.
+ ++NumGuards;
+ return true;
+ }
+ }
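// The loop above unswitches on conditions of llvm.experimental.guard calls
// that are loop invariant. A rough source-level analog (illustrative only,
// not code from this pass; 'deoptimize' is a made-up stand-in for the guard's
// deoptimization exit, and guards may legally be evaluated speculatively,
// which is what allows hoisting the check in front of the loop):
static void deoptimize() {}

static void guardedBefore(int *a, int n, bool inv) {
  for (int i = 0; i < n; ++i) {
    if (!inv) {            // guard on a loop-invariant condition
      deoptimize();
      return;
    }
    a[i] = i;
  }
}

static void guardedAfter(int *a, int n, bool inv) {
  if (!inv) {              // checked once, in the preheader
    deoptimize();
    return;
  }
  for (int i = 0; i < n; ++i)
    a[i] = i;              // the hot loop no longer re-tests the condition
}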
+
+ // Loop over all of the basic blocks in the loop. If we find an interior
+ // block that is branching on a loop-invariant condition, we can unswitch this
+ // loop.
+ for (Loop::block_iterator I = CurrentLoop->block_begin(),
+ E = CurrentLoop->block_end();
+ I != E; ++I) {
+ Instruction *TI = (*I)->getTerminator();
+
+ // Unswitching on a potentially uninitialized predicate is not
+ // MSan-friendly. Limit this to the cases when the original predicate is
+ // guaranteed to execute, to avoid creating a use-of-uninitialized-value
+ // in the code that did not have one.
+ // This is a workaround for the discrepancy between LLVM IR and MSan
+ // semantics. See PR28054 for more details.
+ if (SanitizeMemory &&
+ !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
+ continue;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ // Some branches may be rendered unreachable because of previous
+ // unswitching.
+ // Unswitch only those branches that are reachable.
+ if (isUnreachableDueToPreviousUnswitching(*I))
+ continue;
+
+ // If this isn't branching on an invariant condition, we can't unswitch
+ // it.
+ if (BI->isConditional()) {
+ // See if this, or some part of it, is loop invariant. If so, we can
+ // unswitch on it if we desire.
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+ ++NumBranches;
+ return true;
+ }
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Value *SC = SI->getCondition();
+ Value *LoopCond;
+ OperatorChain OpChain;
+ std::tie(LoopCond, OpChain) =
+ findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
+
+ unsigned NumCases = SI->getNumCases();
+ if (LoopCond && NumCases) {
+ // Find a value to unswitch on:
+        // FIXME: this should choose the most expensive case!
+ // FIXME: scan for a case with a non-critical edge?
+ Constant *UnswitchVal = nullptr;
+ // Find a case value such that at least one case value is unswitched
+ // out.
+ if (OpChain == OC_OpChainAnd) {
+          // If the chain only has ANDs and the switch has a case value of 0,
+          // dropping a 0 into the chain will unswitch out the 0 case value.
+ auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllZero))
+ continue;
+ // We are unswitching 0 out.
+ UnswitchVal = AllZero;
+ } else if (OpChain == OC_OpChainOr) {
+          // If the chain only has ORs and the switch has a case value of ~0,
+          // dropping a ~0 into the chain will unswitch out the ~0 case value.
+ auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+ if (BranchesInfo.isUnswitched(SI, AllOne))
+ continue;
+ // We are unswitching ~0 out.
+ UnswitchVal = AllOne;
+ } else {
+ assert(OpChain == OC_OpChainNone &&
+ "Expect to unswitch on trivial chain");
+ // Do not process same value again and again.
+ // At this point we have some cases already unswitched and
+ // some not yet unswitched. Let's find the first not yet unswitched one.
+ for (auto Case : SI->cases()) {
+ Constant *UnswitchValCandidate = Case.getCaseValue();
+ if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+ UnswitchVal = UnswitchValCandidate;
+ break;
+ }
+ }
+ }
+
+ if (!UnswitchVal)
+ continue;
+
+ if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
+ ++NumSwitches;
+ // In case of a full LIV, UnswitchVal is the value we unswitched out.
+          // In case of a partial LIV, we only unswitch when it's an AND-chain
+          // or OR-chain. In both cases the switch input value simplifies to
+ // UnswitchVal.
+ BranchesInfo.setUnswitched(SI, UnswitchVal);
+ return true;
+ }
+ }
+ }
+
+ // Scan the instructions to check for unswitchable values.
+ for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+ BBI != E; ++BBI)
+ if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+ if (LoopCond &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ ++NumSelects;
+ return true;
+ }
+ }
+ }
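// Illustrative sketch of the AND-chain case handled above (not code from this
// pass; all names are made up): when the switch condition is a chain of ANDs
// that includes the loop-invariant value Inv, the clone of the loop built for
// Inv == 0 folds the whole chain to 0, so its 0 case is statically taken.
// Dually, an OR chain folds to all-ones in the Inv == ~0 clone.
static int andChainDemo(int Var, int Inv) {
  switch (Var & Inv) { // an OC_OpChainAnd-style condition
  case 0:
    return 1;          // taken unconditionally in the Inv == 0 loop version
  default:
    return 2;
  }
}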
  // Check if there is a header condition that is invariant along the path from
// either the true or false successors to the header. This allows unswitching
@@ -1000,102 +1000,102 @@ bool LoopUnswitch::processCurrentLoop() {
}
}
- return Changed;
-}
-
-/// Check to see if all paths from BB exit the loop with no side effects
-/// (including infinite loops).
-///
-/// If true, we return true and set ExitBB to the block we
-/// exit through.
-///
-static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
- BasicBlock *&ExitBB,
- std::set<BasicBlock*> &Visited) {
- if (!Visited.insert(BB).second) {
- // Already visited. Without more analysis, this could indicate an infinite
- // loop.
- return false;
- }
- if (!L->contains(BB)) {
- // Otherwise, this is a loop exit, this is fine so long as this is the
- // first exit.
- if (ExitBB) return false;
- ExitBB = BB;
- return true;
- }
-
- // Otherwise, this is an unvisited intra-loop node. Check all successors.
- for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
- // Check to see if the successor is a trivial loop exit.
- if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
- return false;
- }
-
- // Okay, everything after this looks good, check to make sure that this block
- // doesn't include any side effects.
- for (Instruction &I : *BB)
- if (I.mayHaveSideEffects())
- return false;
-
- return true;
-}
-
-/// Return true if the specified block unconditionally leads to an exit from
-/// the specified loop, and has no side-effects in the process. If so, return
-/// the block that is exited to, otherwise return null.
-static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
- std::set<BasicBlock*> Visited;
- Visited.insert(L->getHeader()); // Branches to header make infinite loops.
- BasicBlock *ExitBB = nullptr;
- if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
- return ExitBB;
- return nullptr;
-}
-
-/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
-/// simplify the loop. If we decide that this is profitable,
-/// unswitch the loop, reprocess the pieces, then return true.
-bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
+ return Changed;
+}
+
+/// Check to see if all paths from BB exit the loop with no side effects
+/// (including infinite loops).
+///
+/// If true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+ BasicBlock *&ExitBB,
+ std::set<BasicBlock*> &Visited) {
+ if (!Visited.insert(BB).second) {
+ // Already visited. Without more analysis, this could indicate an infinite
+ // loop.
+ return false;
+ }
+ if (!L->contains(BB)) {
+ // Otherwise, this is a loop exit, this is fine so long as this is the
+ // first exit.
+ if (ExitBB) return false;
+ ExitBB = BB;
+ return true;
+ }
+
+ // Otherwise, this is an unvisited intra-loop node. Check all successors.
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI) {
+ // Check to see if the successor is a trivial loop exit.
+ if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+ return false;
+ }
+
+ // Okay, everything after this looks good, check to make sure that this block
+ // doesn't include any side effects.
+ for (Instruction &I : *BB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ return true;
+}
+
+/// Return true if the specified block unconditionally leads to an exit from
+/// the specified loop, and has no side-effects in the process. If so, return
+/// the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+ std::set<BasicBlock*> Visited;
+ Visited.insert(L->getHeader()); // Branches to header make infinite loops.
+ BasicBlock *ExitBB = nullptr;
+ if (isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+ return ExitBB;
+ return nullptr;
+}
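// Illustrative sketch of the "trivial loop exit" shape the helpers above look
// for (not code from this pass): once the invariant test is taken, control
// leaves the loop through a single exit block and executes no side effects.
static void trivialExitShape(int *a, int n, bool stop) {
  for (int i = 0; i < n; ++i) {
    if (stop)    // loop-invariant; the exiting path performs no side effects
      return;    // ...and funnels into one exit block
    a[i] = 0;    // the store (a side effect) stays on the non-exiting path
  }
}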
+
+/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
+/// simplify the loop. If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- // Check to see if it would be profitable to unswitch current loop.
- if (!BranchesInfo.costAllowsUnswitching()) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << CurrentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Cost too high.\n");
- return false;
- }
- if (HasBranchDivergence &&
- getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
- LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << CurrentLoop->getHeader()->getName()
- << " at non-trivial condition '" << *Val
- << "' == " << *LoopCond << "\n"
- << ". Condition is divergent.\n");
- return false;
- }
-
+ // Check to see if it would be profitable to unswitch current loop.
+ if (!BranchesInfo.costAllowsUnswitching()) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << CurrentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Cost too high.\n");
+ return false;
+ }
+ if (HasBranchDivergence &&
+ getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
+ LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
+ << CurrentLoop->getHeader()->getName()
+ << " at non-trivial condition '" << *Val
+ << "' == " << *LoopCond << "\n"
+ << ". Condition is divergent.\n");
+ return false;
+ }
+
unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI, ToDuplicate);
- return true;
-}
-
-/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
-/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
-/// and remove (but not erase!) it from the function.
+ return true;
+}
+
+/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
+/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
+/// and remove (but not erase!) it from the function.
void LoopUnswitch::emitPreheaderBranchOnCondition(
Value *LIC, Constant *Val, BasicBlock *TrueDest, BasicBlock *FalseDest,
BranchInst *OldBranch, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
- assert(TrueDest != FalseDest && "Branch targets should be different");
+ assert(OldBranch->isUnconditional() && "Preheader is not split correctly");
+ assert(TrueDest != FalseDest && "Branch targets should be different");
- // Insert a conditional branch on LIC to the two preheaders. The original
- // code is the true version and the new code is the false version.
- Value *BranchVal = LIC;
- bool Swapped = false;
+ // Insert a conditional branch on LIC to the two preheaders. The original
+ // code is the true version and the new code is the false version.
+ Value *BranchVal = LIC;
+ bool Swapped = false;
if (!ToDuplicate.empty()) {
ValueToValueMapTy Old2New;
@@ -1141,450 +1141,450 @@ void LoopUnswitch::emitPreheaderBranchOnCondition(
std::swap(TrueDest, FalseDest);
Swapped = true;
}
- }
-
- // Old branch will be removed, so save its parent and successor to update the
- // DomTree.
- auto *OldBranchSucc = OldBranch->getSuccessor(0);
- auto *OldBranchParent = OldBranch->getParent();
-
- // Insert the new branch.
- BranchInst *BI =
- IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
- if (Swapped)
- BI->swapProfMetadata();
-
- // Remove the old branch so there is only one branch at the end. This is
- // needed to perform DomTree's internal DFS walk on the function's CFG.
- OldBranch->removeFromParent();
-
- // Inform the DT about the new branch.
- if (DT) {
- // First, add both successors.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- if (TrueDest != OldBranchSucc)
- Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
- if (FalseDest != OldBranchSucc)
- Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
- // If both of the new successors are different from the old one, inform the
- // DT that the edge was deleted.
- if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
- Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
- }
-
- if (MSSAU)
+ }
+
+ // Old branch will be removed, so save its parent and successor to update the
+ // DomTree.
+ auto *OldBranchSucc = OldBranch->getSuccessor(0);
+ auto *OldBranchParent = OldBranch->getParent();
+
+ // Insert the new branch.
+ BranchInst *BI =
+ IRBuilder<>(OldBranch).CreateCondBr(BranchVal, TrueDest, FalseDest, TI);
+ if (Swapped)
+ BI->swapProfMetadata();
+
+ // Remove the old branch so there is only one branch at the end. This is
+ // needed to perform DomTree's internal DFS walk on the function's CFG.
+ OldBranch->removeFromParent();
+
+ // Inform the DT about the new branch.
+ if (DT) {
+ // First, add both successors.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ if (TrueDest != OldBranchSucc)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, TrueDest});
+ if (FalseDest != OldBranchSucc)
+ Updates.push_back({DominatorTree::Insert, OldBranchParent, FalseDest});
+ // If both of the new successors are different from the old one, inform the
+ // DT that the edge was deleted.
+ if (OldBranchSucc != TrueDest && OldBranchSucc != FalseDest) {
+ Updates.push_back({DominatorTree::Delete, OldBranchParent, OldBranchSucc});
+ }
+
+ if (MSSAU)
MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
else
DT->applyUpdates(Updates);
- }
-
- // If either edge is critical, split it. This helps preserve LoopSimplify
- // form for enclosing loops.
- auto Options =
- CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
- SplitCriticalEdge(BI, 0, Options);
- SplitCriticalEdge(BI, 1, Options);
-}
-
-/// Given a loop that has a trivial unswitchable condition in it (a cond branch
-/// from its header block to its latch block, where the path through the loop
-/// that doesn't execute its body has no side-effects), unswitch it. This
-/// doesn't involve any code duplication, just moving the conditional branch
-/// outside of the loop and updating loop info.
-void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
- BasicBlock *ExitBlock,
- Instruction *TI) {
- LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << LoopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function "
- << L->getHeader()->getParent()->getName()
- << " on cond: " << *Val << " == " << *Cond << "\n");
- // We are going to make essential changes to CFG. This may invalidate cached
- // information for L or one of its parent loops in SCEV.
- if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetTopmostLoop(L);
-
- // First step, split the preheader, so that we know that there is a safe place
- // to insert the conditional branch. We will change LoopPreheader to have a
- // conditional branch on Cond.
- BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
-
- // Now that we have a place to insert the conditional branch, create a place
- // to branch to: this is the exit block out of the loop that we should
- // short-circuit to.
-
- // Split this block now, so that the loop maintains its exit block, and so
- // that the jump from the preheader can execute the contents of the exit block
- // without actually branching to it (the exit block should be dominated by the
- // loop header, not the preheader).
- assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
- BasicBlock *NewExit =
- SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());
-
- // Okay, now we have a position to branch from and a position to branch to,
- // insert the new conditional branch.
- auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
- assert(OldBranch && "Failed to split the preheader");
- emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
-
- // emitPreheaderBranchOnCondition removed the OldBranch from the function.
- // Delete it, as it is no longer needed.
- delete OldBranch;
-
- // We need to reprocess this loop, it could be unswitched again.
- RedoLoop = true;
-
- // Now that we know that the loop is never entered when this condition is a
- // particular value, rewrite the loop with this info. We know that this will
- // at least eliminate the old branch.
- rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
-
- ++NumTrivial;
-}
-
-/// Check if the first non-constant condition starting from the loop header is
-/// a trivial unswitch condition: that is, a condition controls whether or not
-/// the loop does anything at all. If it is a trivial condition, unswitching
-/// produces no code duplications (equivalently, it produces a simpler loop and
-/// a new empty loop, which gets deleted). Therefore always unswitch trivial
-/// condition.
-bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
- BasicBlock *CurrentBB = CurrentLoop->getHeader();
- Instruction *CurrentTerm = CurrentBB->getTerminator();
- LLVMContext &Context = CurrentBB->getContext();
-
- // If loop header has only one reachable successor (currently via an
- // unconditional branch or constant foldable conditional branch, but
- // should also consider adding constant foldable switch instruction in
- // future), we should keep looking for trivial condition candidates in
- // the successor as well. An alternative is to constant fold conditions
- // and merge successors into loop header (then we only need to check header's
- // terminator). The reason for not doing this in LoopUnswitch pass is that
- // it could potentially break LoopPassManager's invariants. Folding dead
- // branches could either eliminate the current loop or make other loops
- // unreachable. LCSSA form might also not be preserved after deleting
- // branches. The following code keeps traversing loop header's successors
- // until it finds the trivial condition candidate (condition that is not a
- // constant). Since unswitching generates branches with constant conditions,
- // this scenario could be very common in practice.
- SmallPtrSet<BasicBlock*, 8> Visited;
-
- while (true) {
-    // If we exit the loop or reach a previously visited block, then
-    // we cannot reach any trivial condition candidates (unfoldable
- // branch instructions or switch instructions) and no unswitch
- // can happen. Exit and return false.
- if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
- return false;
-
- // Check if this loop will execute any side-effecting instructions (e.g.
- // stores, calls, volatile loads) in the part of the loop that the code
- // *would* execute. Check the header first.
- for (Instruction &I : *CurrentBB)
- if (I.mayHaveSideEffects())
- return false;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
- if (BI->isUnconditional()) {
- CurrentBB = BI->getSuccessor(0);
- } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
- CurrentBB = BI->getSuccessor(0);
- } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
- CurrentBB = BI->getSuccessor(1);
- } else {
- // Found a trivial condition candidate: non-foldable conditional branch.
- break;
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // At this point, any constant-foldable instructions should have probably
- // been folded.
- ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
- if (!Cond)
- break;
- // Find the target block we are definitely going to.
- CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
- } else {
- // We do not understand these terminator instructions.
- break;
- }
-
- CurrentTerm = CurrentBB->getTerminator();
- }
-
-  // CondVal is the condition value that triggers the trivial exit.
-  // LoopExitBB is the BasicBlock the loop exits to when the trivial
-  // condition is met.
- Constant *CondVal = nullptr;
- BasicBlock *LoopExitBB = nullptr;
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
- // If this isn't branching on an invariant condition, we can't unswitch it.
- if (!BI->isConditional())
- return false;
-
- Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
-
- // Unswitch only if the trivial condition itself is an LIV (not
- // partial LIV which could occur in and/or)
- if (!LoopCond || LoopCond != BI->getCondition())
- return false;
-
- // Check to see if a successor of the branch is guaranteed to
- // exit through a unique exit block without having any
- // side-effects. If so, determine the value of Cond that causes
- // it to do this.
- if ((LoopExitBB =
- isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
- CondVal = ConstantInt::getTrue(Context);
- } else if ((LoopExitBB =
- isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
- CondVal = ConstantInt::getFalse(Context);
- }
-
- // If we didn't find a single unique LoopExit block, or if the loop exit
- // block contains phi nodes, this isn't trivial.
- if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
- return false; // Can't handle this.
-
- if (equalityPropUnSafe(*LoopCond))
- return false;
-
- unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
- CurrentTerm);
- ++NumBranches;
- return true;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // If this isn't switching on an invariant condition, we can't unswitch it.
- Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
- Changed, MSSAU.get())
- .first;
-
- // Unswitch only if the trivial condition itself is an LIV (not
- // partial LIV which could occur in and/or)
- if (!LoopCond || LoopCond != SI->getCondition())
- return false;
-
- // Check to see if a successor of the switch is guaranteed to go to the
-    // latch block or exit through a single exit block without having any
- // side-effects. If so, determine the value of Cond that causes it to do
- // this.
- // Note that we can't trivially unswitch on the default case or
- // on already unswitched cases.
- for (auto Case : SI->cases()) {
- BasicBlock *LoopExitCandidate;
- if ((LoopExitCandidate =
- isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
- // Okay, we found a trivial case, remember the value that is trivial.
- ConstantInt *CaseVal = Case.getCaseValue();
-
-        // Check that it was not unswitched before, since already-unswitched
-        // trivial values look trivial too.
- if (BranchesInfo.isUnswitched(SI, CaseVal))
- continue;
- LoopExitBB = LoopExitCandidate;
- CondVal = CaseVal;
- break;
- }
- }
-
- // If we didn't find a single unique LoopExit block, or if the loop exit
- // block contains phi nodes, this isn't trivial.
- if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
- return false; // Can't handle this.
-
- unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
- nullptr);
-
- // We are only unswitching full LIV.
- BranchesInfo.setUnswitched(SI, CondVal);
- ++NumSwitches;
- return true;
- }
- return false;
-}
-
-/// Split all of the edges from inside the loop to their exit blocks.
-/// Update the appropriate Phi nodes as we do so.
-void LoopUnswitch::splitExitEdges(
- Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
-
- for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
- BasicBlock *ExitBlock = ExitBlocks[I];
- SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
- pred_end(ExitBlock));
-
- // Although SplitBlockPredecessors doesn't preserve loop-simplify in
- // general, if we call it on all predecessors of all exits then it does.
- SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
- /*PreserveLCSSA*/ true);
- }
-}
-
-/// We determined that the loop is profitable to unswitch when LIC equals Val.
-/// Split it into loop versions and test the condition outside of either loop.
-/// Return the loops created as Out1/Out2.
+ }
+
+ // If either edge is critical, split it. This helps preserve LoopSimplify
+ // form for enclosing loops.
+ auto Options =
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU.get()).setPreserveLCSSA();
+ SplitCriticalEdge(BI, 0, Options);
+ SplitCriticalEdge(BI, 1, Options);
+}
+
+/// Given a loop that has a trivial unswitchable condition in it (a cond branch
+/// from its header block to its latch block, where the path through the loop
+/// that doesn't execute its body has no side-effects), unswitch it. This
+/// doesn't involve any code duplication, just moving the conditional branch
+/// outside of the loop and updating loop info.
+void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ BasicBlock *ExitBlock,
+ Instruction *TI) {
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function "
+ << L->getHeader()->getParent()->getName()
+ << " on cond: " << *Val << " == " << *Cond << "\n");
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
+
+ // First step, split the preheader, so that we know that there is a safe place
+ // to insert the conditional branch. We will change LoopPreheader to have a
+ // conditional branch on Cond.
+ BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we should
+ // short-circuit to.
+
+ // Split this block now, so that the loop maintains its exit block, and so
+ // that the jump from the preheader can execute the contents of the exit block
+ // without actually branching to it (the exit block should be dominated by the
+ // loop header, not the preheader).
+ assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+ BasicBlock *NewExit =
+ SplitBlock(ExitBlock, &ExitBlock->front(), DT, LI, MSSAU.get());
+
+ // Okay, now we have a position to branch from and a position to branch to,
+ // insert the new conditional branch.
+ auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
+ assert(OldBranch && "Failed to split the preheader");
+ emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
+
+ // emitPreheaderBranchOnCondition removed the OldBranch from the function.
+ // Delete it, as it is no longer needed.
+ delete OldBranch;
+
+ // We need to reprocess this loop, it could be unswitched again.
+ RedoLoop = true;
+
+ // Now that we know that the loop is never entered when this condition is a
+ // particular value, rewrite the loop with this info. We know that this will
+ // at least eliminate the old branch.
+ rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
+
+ ++NumTrivial;
+}
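// Minimal source-level sketch of the trivial case handled above (illustrative
// only, not code from this pass): the invariant condition decides whether the
// loop does any work at all, so the test moves to the preheader and no code
// is duplicated.
static void trivialBefore(int *a, int n, bool skip) {
  for (int i = 0; i < n; ++i) {
    if (skip)   // loop-invariant; the exiting path has no side effects
      return;
    a[i] = 0;
  }
}

static void trivialAfter(int *a, int n, bool skip) {
  if (skip)     // tested once, in front of the loop
    return;
  for (int i = 0; i < n; ++i)
    a[i] = 0;
}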
+
+/// Check if the first non-constant condition starting from the loop header is
+/// a trivial unswitch condition: that is, a condition controls whether or not
+/// the loop does anything at all. If it is a trivial condition, unswitching
+/// produces no code duplications (equivalently, it produces a simpler loop and
+/// a new empty loop, which gets deleted). Therefore always unswitch trivial
+/// condition.
+bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = CurrentLoop->getHeader();
+ Instruction *CurrentTerm = CurrentBB->getTerminator();
+ LLVMContext &Context = CurrentBB->getContext();
+
+ // If loop header has only one reachable successor (currently via an
+ // unconditional branch or constant foldable conditional branch, but
+ // should also consider adding constant foldable switch instruction in
+ // future), we should keep looking for trivial condition candidates in
+ // the successor as well. An alternative is to constant fold conditions
+ // and merge successors into loop header (then we only need to check header's
+ // terminator). The reason for not doing this in LoopUnswitch pass is that
+ // it could potentially break LoopPassManager's invariants. Folding dead
+ // branches could either eliminate the current loop or make other loops
+ // unreachable. LCSSA form might also not be preserved after deleting
+ // branches. The following code keeps traversing loop header's successors
+ // until it finds the trivial condition candidate (condition that is not a
+ // constant). Since unswitching generates branches with constant conditions,
+ // this scenario could be very common in practice.
+ SmallPtrSet<BasicBlock*, 8> Visited;
+
+ while (true) {
+    // If we exit the loop or reach a previously visited block, then
+    // we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch
+ // can happen. Exit and return false.
+ if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ return false;
+
+ // Check if this loop will execute any side-effecting instructions (e.g.
+ // stores, calls, volatile loads) in the part of the loop that the code
+ // *would* execute. Check the header first.
+ for (Instruction &I : *CurrentBB)
+ if (I.mayHaveSideEffects())
+ return false;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ if (BI->isUnconditional()) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getTrue(Context)) {
+ CurrentBB = BI->getSuccessor(0);
+ } else if (BI->getCondition() == ConstantInt::getFalse(Context)) {
+ CurrentBB = BI->getSuccessor(1);
+ } else {
+ // Found a trivial condition candidate: non-foldable conditional branch.
+ break;
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // At this point, any constant-foldable instructions should have probably
+ // been folded.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond)
+ break;
+ // Find the target block we are definitely going to.
+ CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
+ } else {
+ // We do not understand these terminator instructions.
+ break;
+ }
+
+ CurrentTerm = CurrentBB->getTerminator();
+ }
+
+  // CondVal is the condition value that triggers the trivial exit.
+  // LoopExitBB is the BasicBlock the loop exits to when the trivial
+  // condition is met.
+ Constant *CondVal = nullptr;
+ BasicBlock *LoopExitBB = nullptr;
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurrentTerm)) {
+ // If this isn't branching on an invariant condition, we can't unswitch it.
+ if (!BI->isConditional())
+ return false;
+
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != BI->getCondition())
+ return false;
+
+ // Check to see if a successor of the branch is guaranteed to
+ // exit through a unique exit block without having any
+ // side-effects. If so, determine the value of Cond that causes
+ // it to do this.
+ if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
+ CondVal = ConstantInt::getTrue(Context);
+ } else if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
+ CondVal = ConstantInt::getFalse(Context);
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ if (equalityPropUnSafe(*LoopCond))
+ return false;
+
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
+ CurrentTerm);
+ ++NumBranches;
+ return true;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
+ Changed, MSSAU.get())
+ .first;
+
+ // Unswitch only if the trivial condition itself is an LIV (not
+ // partial LIV which could occur in and/or)
+ if (!LoopCond || LoopCond != SI->getCondition())
+ return false;
+
+ // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a single exit block without having any
+ // side-effects. If so, determine the value of Cond that causes it to do
+ // this.
+ // Note that we can't trivially unswitch on the default case or
+ // on already unswitched cases.
+ for (auto Case : SI->cases()) {
+ BasicBlock *LoopExitCandidate;
+ if ((LoopExitCandidate =
+ isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
+ // Okay, we found a trivial case, remember the value that is trivial.
+ ConstantInt *CaseVal = Case.getCaseValue();
+
+        // Check that it was not unswitched before, since already-unswitched
+        // trivial values look trivial too.
+ if (BranchesInfo.isUnswitched(SI, CaseVal))
+ continue;
+ LoopExitBB = LoopExitCandidate;
+ CondVal = CaseVal;
+ break;
+ }
+ }
+
+ // If we didn't find a single unique LoopExit block, or if the loop exit
+ // block contains phi nodes, this isn't trivial.
+ if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+ return false; // Can't handle this.
+
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
+ nullptr);
+
+ // We are only unswitching full LIV.
+ BranchesInfo.setUnswitched(SI, CondVal);
+ ++NumSwitches;
+ return true;
+ }
+ return false;
+}
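// Sketch of the switch variant of a trivial unswitch (illustrative only, not
// code from this pass): one case of a loop-invariant switch leaves the loop
// immediately and side-effect free, so that case can be tested once up front.
static int switchTrivialBefore(const int *a, int n, int mode /* invariant */) {
  int sum = 0;
  for (int i = 0; i < n; ++i) {
    switch (mode) {
    case 0:
      return sum; // trivial exit: no side effects on this path
    default:
      sum += a[i];
    }
  }
  return sum;
}

static int switchTrivialAfter(const int *a, int n, int mode) {
  if (mode == 0)  // hoisted test for the trivial case; sum is still 0 here
    return 0;
  int sum = 0;
  for (int i = 0; i < n; ++i)
    sum += a[i];
  return sum;
}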
+
+/// Split all of the edges from inside the loop to their exit blocks.
+/// Update the appropriate Phi nodes as we do so.
+void LoopUnswitch::splitExitEdges(
+ Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+
+ for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
+ BasicBlock *ExitBlock = ExitBlocks[I];
+ SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+ pred_end(ExitBlock));
+
+ // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+ // general, if we call it on all predecessors of all exits then it does.
+ SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", DT, LI, MSSAU.get(),
+ /*PreserveLCSSA*/ true);
+ }
+}
+
+/// We determined that the loop is profitable to unswitch when LIC equals Val.
+/// Split it into loop versions and test the condition outside of either loop.
+/// Return the loops created as Out1/Out2.
void LoopUnswitch::unswitchNontrivialCondition(
Value *LIC, Constant *Val, Loop *L, Instruction *TI,
ArrayRef<Instruction *> ToDuplicate) {
- Function *F = LoopHeader->getParent();
- LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << LoopHeader->getName() << " [" << L->getBlocks().size()
- << " blocks] in Function " << F->getName() << " when '"
- << *Val << "' == " << *LIC << "\n");
-
- // We are going to make essential changes to CFG. This may invalidate cached
- // information for L or one of its parent loops in SCEV.
- if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
- SEWP->getSE().forgetTopmostLoop(L);
-
- LoopBlocks.clear();
- NewBlocks.clear();
-
- if (MSSAU && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // First step, split the preheader and exit blocks, and add these blocks to
- // the LoopBlocks list.
- BasicBlock *NewPreheader =
- SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
- LoopBlocks.push_back(NewPreheader);
-
- // We want the loop to come after the preheader, but before the exit blocks.
+ Function *F = LoopHeader->getParent();
+ LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
+ << " blocks] in Function " << F->getName() << " when '"
+ << *Val << "' == " << *LIC << "\n");
+
+ // We are going to make essential changes to CFG. This may invalidate cached
+ // information for L or one of its parent loops in SCEV.
+ if (auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>())
+ SEWP->getSE().forgetTopmostLoop(L);
+
+ LoopBlocks.clear();
+ NewBlocks.clear();
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // First step, split the preheader and exit blocks, and add these blocks to
+ // the LoopBlocks list.
+ BasicBlock *NewPreheader =
+ SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
+ LoopBlocks.push_back(NewPreheader);
+
+ // We want the loop to come after the preheader, but before the exit blocks.
llvm::append_range(LoopBlocks, L->blocks());
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- // Split all of the edges from inside the loop to their exit blocks. Update
- // the appropriate Phi nodes as we do so.
- splitExitEdges(L, ExitBlocks);
-
- // The exit blocks may have been changed due to edge splitting, recompute.
- ExitBlocks.clear();
- L->getUniqueExitBlocks(ExitBlocks);
-
- // Add exit blocks to the loop blocks.
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Split all of the edges from inside the loop to their exit blocks. Update
+ // the appropriate Phi nodes as we do so.
+ splitExitEdges(L, ExitBlocks);
+
+ // The exit blocks may have been changed due to edge splitting, recompute.
+ ExitBlocks.clear();
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ // Add exit blocks to the loop blocks.
llvm::append_range(LoopBlocks, ExitBlocks);
-
- // Next step, clone all of the basic blocks that make up the loop (including
- // the loop preheader and exit blocks), keeping track of the mapping between
- // the instructions and blocks.
- NewBlocks.reserve(LoopBlocks.size());
- ValueToValueMapTy VMap;
- for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
- BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
-
- NewBlocks.push_back(NewBB);
- VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
- }
-
- // Splice the newly inserted blocks into the function right before the
- // original preheader.
- F->getBasicBlockList().splice(NewPreheader->getIterator(),
- F->getBasicBlockList(),
- NewBlocks[0]->getIterator(), F->end());
-
- // Now we create the new Loop object for the versioned loop.
- Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
-
- // Recalculate unswitching quota, inherit simplified switches info for NewBB,
- // Probably clone more loop-unswitch related loop properties.
- BranchesInfo.cloneData(NewLoop, L, VMap);
-
- Loop *ParentLoop = L->getParentLoop();
- if (ParentLoop) {
- // Make sure to add the cloned preheader and exit blocks to the parent loop
- // as well.
- ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
- }
-
- for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
- BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
- // The new exit block should be in the same loop as the old one.
- if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
- ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
-
- assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
- "Exit block should have been split to have one successor!");
- BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
-
- // If the successor of the exit block had PHI nodes, add an entry for
- // NewExit.
- for (PHINode &PN : ExitSucc->phis()) {
- Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
- ValueToValueMapTy::iterator It = VMap.find(V);
- if (It != VMap.end()) V = It->second;
- PN.addIncoming(V, NewExit);
- }
-
- if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
- PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
- &*ExitSucc->getFirstInsertionPt());
-
- for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
- I != E; ++I) {
- BasicBlock *BB = *I;
- LandingPadInst *LPI = BB->getLandingPadInst();
- LPI->replaceAllUsesWith(PN);
- PN->addIncoming(LPI, BB);
- }
- }
- }
-
- // Rewrite the code to refer to itself.
- for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
- for (Instruction &I : *NewBlocks[NBI]) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Rewrite the original preheader to select between versions of the loop.
- BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
- assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
- "Preheader splitting did not work correctly!");
-
- if (MSSAU) {
- // Update MemorySSA after cloning, and before splitting to unreachables,
- // since that invalidates the 1:1 mapping of clones in VMap.
- LoopBlocksRPO LBRPO(L);
- LBRPO.perform(LI);
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap);
- }
-
- // Emit the new branch that selects between the two versions of this loop.
- emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+
+ // Next step, clone all of the basic blocks that make up the loop (including
+ // the loop preheader and exit blocks), keeping track of the mapping between
+ // the instructions and blocks.
+ NewBlocks.reserve(LoopBlocks.size());
+ ValueToValueMapTy VMap;
+ for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
+
+ NewBlocks.push_back(NewBB);
+ VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
+ }
+
+ // Splice the newly inserted blocks into the function right before the
+ // original preheader.
+ F->getBasicBlockList().splice(NewPreheader->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(), F->end());
+
+ // Now we create the new Loop object for the versioned loop.
+ Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+
+ // Recalculate unswitching quota, inherit simplified switches info for NewBB,
+ // Probably clone more loop-unswitch related loop properties.
+ BranchesInfo.cloneData(NewLoop, L, VMap);
+
+ Loop *ParentLoop = L->getParentLoop();
+ if (ParentLoop) {
+ // Make sure to add the cloned preheader and exit blocks to the parent loop
+ // as well.
+ ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+ }
+
+ for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
+ // The new exit block should be in the same loop as the old one.
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
+ ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+ assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+ // If the successor of the exit block had PHI nodes, add an entry for
+ // NewExit.
+ for (PHINode &PN : ExitSucc->phis()) {
+ Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
+ ValueToValueMapTy::iterator It = VMap.find(V);
+ if (It != VMap.end()) V = It->second;
+ PN.addIncoming(V, NewExit);
+ }
+
+ if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+ PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+ &*ExitSucc->getFirstInsertionPt());
+
+ for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+ I != E; ++I) {
+ BasicBlock *BB = *I;
+ LandingPadInst *LPI = BB->getLandingPadInst();
+ LPI->replaceAllUsesWith(PN);
+ PN->addIncoming(LPI, BB);
+ }
+ }
+ }
+
+ // Rewrite the code to refer to itself.
+ for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
+ for (Instruction &I : *NewBlocks[NBI]) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Rewrite the original preheader to select between versions of the loop.
+ BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
+ assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+ "Preheader splitting did not work correctly!");
+
+ if (MSSAU) {
+ // Update MemorySSA after cloning, and before splitting to unreachables,
+ // since that invalidates the 1:1 mapping of clones in VMap.
+ LoopBlocksRPO LBRPO(L);
+ LBRPO.perform(LI);
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, VMap);
+ }
+
+ // Emit the new branch that selects between the two versions of this loop.
+ emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
TI, ToDuplicate);
- if (MSSAU) {
- // Update MemoryPhis in Exit blocks.
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
-
- // The OldBr was replaced by a new one and removed (but not erased) by
- // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
- delete OldBR;
-
- LoopProcessWorklist.push_back(NewLoop);
- RedoLoop = true;
-
-  // Keep a WeakTrackingVH holding onto LIC. If the first call to
-  // RewriteLoopBody deletes the instruction (for example by simplifying a PHI
-  // that feeds into the condition that we're unswitching on), we don't
-  // rewrite the second iteration.
- WeakTrackingVH LICHandle(LIC);
-
+ if (MSSAU) {
+ // Update MemoryPhis in Exit blocks.
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+
+ // The OldBr was replaced by a new one and removed (but not erased) by
+ // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
+ delete OldBR;
+
+ LoopProcessWorklist.push_back(NewLoop);
+ RedoLoop = true;
+
+  // Keep a WeakTrackingVH holding onto LIC. If the first call to
+  // RewriteLoopBody deletes the instruction (for example by simplifying a PHI
+  // that feeds into the condition that we're unswitching on), we don't
+  // rewrite the second iteration.
+ WeakTrackingVH LICHandle(LIC);
+
if (ToDuplicate.empty()) {
// Now we rewrite the original code to know that the condition is true and
// the new code to know that the condition is false.
rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
-
+
// It's possible that simplifying one loop could cause the other to be
    // changed to another value or a constant. If it's a constant, don't
// simplify it.
@@ -1601,7 +1601,7 @@ void LoopUnswitch::unswitchNontrivialCondition(
/*IsEqual=*/true);
} else
rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/true);
-
+
// Mark the new loop as partially unswitched, to avoid unswitching on the
// same condition again.
auto &Context = NewLoop->getHeader()->getContext();
@@ -1613,270 +1613,270 @@ void LoopUnswitch::unswitchNontrivialCondition(
NewLoop->setLoopID(NewLoopID);
}
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-}
-
-/// Remove all instances of I from the worklist vector specified.
-static void removeFromWorklist(Instruction *I,
- std::vector<Instruction *> &Worklist) {
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+}
+
+/// Remove all instances of I from the worklist vector specified.
+static void removeFromWorklist(Instruction *I,
+ std::vector<Instruction *> &Worklist) {
llvm::erase_value(Worklist, I);
-}
-
-/// When we find that I really equals V, remove I from the
-/// program, replacing all uses with V and update the worklist.
-static void replaceUsesOfWith(Instruction *I, Value *V,
- std::vector<Instruction *> &Worklist, Loop *L,
- LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
- LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
-
- // Add uses to the worklist, which may be dead now.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
- Worklist.push_back(Use);
-
- // Add users to the worklist which may be simplified now.
- for (User *U : I->users())
- Worklist.push_back(cast<Instruction>(U));
- removeFromWorklist(I, Worklist);
- I->replaceAllUsesWith(V);
- if (!I->mayHaveSideEffects()) {
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- I->eraseFromParent();
- }
- ++NumSimplify;
-}
-
-/// We know either that the value LIC has the value specified by Val in the
-/// specified loop, or we know it does NOT have that value.
-/// Rewrite any uses of LIC or of properties correlated to it.
-void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
- Constant *Val,
- bool IsEqual) {
- assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
-
- // FIXME: Support correlated properties, like:
- // for (...)
- // if (li1 < li2)
- // ...
- // if (li1 > li2)
- // ...
-
- // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
- // selects, switches.
- std::vector<Instruction*> Worklist;
- LLVMContext &Context = Val->getContext();
-
- // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
- // in the loop with the appropriate one directly.
- if (IsEqual || (isa<ConstantInt>(Val) &&
- Val->getType()->isIntegerTy(1))) {
- Value *Replacement;
- if (IsEqual)
- Replacement = Val;
- else
- Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
- !cast<ConstantInt>(Val)->getZExtValue());
-
- for (User *U : LIC->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !L->contains(UI))
- continue;
- Worklist.push_back(UI);
- }
-
- for (Instruction *UI : Worklist)
- UI->replaceUsesOfWith(LIC, Replacement);
-
- simplifyCode(Worklist, L);
- return;
- }
-
- // Otherwise, we don't know the precise value of LIC, but we do know that it
- // is certainly NOT "Val". As such, simplify any uses in the loop that we
- // can. This case occurs when we unswitch switch statements.
- for (User *U : LIC->users()) {
- Instruction *UI = dyn_cast<Instruction>(U);
- if (!UI || !L->contains(UI))
- continue;
-
- // At this point, we know LIC is definitely not Val. Try to use some simple
- // logic to simplify the user w.r.t. to the context.
- if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
- if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
- // This in-loop instruction has been simplified w.r.t. its context,
- // i.e. LIC != Val, make sure we propagate its replacement value to
- // all its users.
- //
-        // We cannot delete UI, the LIC user, yet, because that would
-        // invalidate the LIC->users() iterator. However, we can make this
-        // instruction dead by replacing all its users and pushing it onto
-        // the worklist so that it can be properly deleted and its operands
-        // simplified.
- UI->replaceAllUsesWith(Replacement);
- }
- }
-
- // This is a LIC user, push it into the worklist so that simplifyCode can
- // attempt to simplify it.
- Worklist.push_back(UI);
-
- // If we know that LIC is not Val, use this info to simplify code.
- SwitchInst *SI = dyn_cast<SwitchInst>(UI);
- if (!SI || !isa<ConstantInt>(Val)) continue;
-
- // NOTE: if a case value for the switch is unswitched out, we record it
- // after the unswitch finishes. We can not record it here as the switch
- // is not a direct user of the partial LIV.
- SwitchInst::CaseHandle DeadCase =
- *SI->findCaseValue(cast<ConstantInt>(Val));
- // Default case is live for multiple values.
- if (DeadCase == *SI->case_default())
- continue;
-
- // Found a dead case value. Don't remove PHI nodes in the
- // successor if they become single-entry, those PHI nodes may
- // be in the Users list.
-
- BasicBlock *Switch = SI->getParent();
- BasicBlock *SISucc = DeadCase.getCaseSuccessor();
- BasicBlock *Latch = L->getLoopLatch();
-
- if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
- // If the DeadCase successor dominates the loop latch, then the
- // transformation isn't safe since it will delete the sole predecessor edge
- // to the latch.
- if (Latch && DT->dominates(SISucc, Latch))
- continue;
-
- // FIXME: This is a hack. We need to keep the successor around
- // and hooked up so as to preserve the loop structure, because
- // trying to update it is complicated. So instead we preserve the
- // loop structure and put the block on a dead code path.
- SplitEdge(Switch, SISucc, DT, LI, MSSAU.get());
- // Compute the successors instead of relying on the return value
- // of SplitEdge, since it may have split the switch successor
- // after PHI nodes.
- BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
- BasicBlock *OldSISucc = *succ_begin(NewSISucc);
- // Create an "unreachable" destination.
- BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
- Switch->getParent(),
- OldSISucc);
- new UnreachableInst(Context, Abort);
- // Force the new case destination to branch to the "unreachable"
- // block while maintaining a (dead) CFG edge to the old block.
- NewSISucc->getTerminator()->eraseFromParent();
- BranchInst::Create(Abort, OldSISucc,
- ConstantInt::getTrue(Context), NewSISucc);
- // Release the PHI operands for this edge.
- for (PHINode &PN : NewSISucc->phis())
- PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType()));
- // Tell the domtree about the new block. We don't fully update the
- // domtree here -- instead we force it to do a full recomputation
- // after the pass is complete -- but we do need to inform it of
- // new blocks.
- DT->addNewBlock(Abort, NewSISucc);
- }
-
- simplifyCode(Worklist, L);
-}
-
-/// Now that we have simplified some instructions in the loop, walk over it and
-/// constant prop, dce, and fold control flow where possible. Note that this is
-/// effectively a very simple loop-structure-aware optimizer. During processing
-/// of this loop, L could very well be deleted, so it must not be used.
-///
-/// FIXME: When the loop optimizer is more mature, separate this out to a new
-/// pass.
-///
-void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- while (!Worklist.empty()) {
- Instruction *I = Worklist.back();
- Worklist.pop_back();
-
- // Simple DCE.
- if (isInstructionTriviallyDead(I)) {
- LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
-
- // Add uses to the worklist, which may be dead now.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
- Worklist.push_back(Use);
- removeFromWorklist(I, Worklist);
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
- I->eraseFromParent();
- ++NumSimplify;
- continue;
- }
-
- // See if instruction simplification can hack this up. This is common for
- // things like "select false, X, Y" after unswitching made the condition be
- // 'false'. TODO: update the domtree properly so we can pass it here.
- if (Value *V = SimplifyInstruction(I, DL))
- if (LI->replacementPreservesLCSSAForm(I, V)) {
- replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
- continue;
- }
-
- // Special case hacks that appear commonly in unswitched code.
- if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
- if (BI->isUnconditional()) {
- // If BI's parent is the only pred of the successor, fold the two blocks
- // together.
- BasicBlock *Pred = BI->getParent();
- (void)Pred;
- BasicBlock *Succ = BI->getSuccessor(0);
- BasicBlock *SinglePred = Succ->getSinglePredecessor();
- if (!SinglePred) continue; // Nothing to do.
- assert(SinglePred == Pred && "CFG broken");
-
- // Make the LPM and Worklist updates specific to LoopUnswitch.
- removeFromWorklist(BI, Worklist);
- auto SuccIt = Succ->begin();
- while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
- for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
- if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
- Worklist.push_back(Use);
- for (User *U : PN->users())
- Worklist.push_back(cast<Instruction>(U));
- removeFromWorklist(PN, Worklist);
- ++NumSimplify;
- }
- // Merge the block and make the remaining analyses updates.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
- ++NumSimplify;
- continue;
- }
-
- continue;
- }
- }
-}
-
-/// Simple simplifications we can do given the information that Cond is
-/// definitely not equal to Val.
-Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
- Value *Invariant,
- Constant *Val) {
- // icmp eq cond, val -> false
- ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
- if (CI && CI->isEquality()) {
- Value *Op0 = CI->getOperand(0);
- Value *Op1 = CI->getOperand(1);
- if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
- LLVMContext &Ctx = Inst->getContext();
- if (CI->getPredicate() == CmpInst::ICMP_EQ)
- return ConstantInt::getFalse(Ctx);
- else
- return ConstantInt::getTrue(Ctx);
- }
- }
-
- // FIXME: there may be other opportunities, e.g. comparison with floating
- // point, or Invariant - Val != 0, etc.
- return nullptr;
-}
+}
+
+/// When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void replaceUsesOfWith(Instruction *I, Value *V,
+ std::vector<Instruction *> &Worklist, Loop *L,
+ LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
+ LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+
+ // Add users to the worklist which may be simplified now.
+ for (User *U : I->users())
+ Worklist.push_back(cast<Instruction>(U));
+ removeFromWorklist(I, Worklist);
+ I->replaceAllUsesWith(V);
+ if (!I->mayHaveSideEffects()) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ I->eraseFromParent();
+ }
+ ++NumSimplify;
+}
+
+/// We know either that the value LIC has the value specified by Val in the
+/// specified loop, or we know it does NOT have that value.
+/// Rewrite any uses of LIC or of properties correlated to it.
+void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val,
+ bool IsEqual) {
+ assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
+
+ // FIXME: Support correlated properties, like:
+ // for (...)
+ // if (li1 < li2)
+ // ...
+ // if (li1 > li2)
+ // ...
+
+ // FOLD boolean conditions (X|LIC), (X&LIC). Fold conditional branches,
+ // selects, switches.
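+  // Illustrative sketch, not part of the original source: suppose we
+  // unswitched on a boolean %lic and IsEqual is true with Val == i1 true.
+  // Then an in-loop use such as
+  //   %c = and i1 %x, %lic
+  //   br i1 %c, label %then, label %else
+  // becomes "and i1 %x, true" after the replacement below, and
+  // simplifyCode() folds the branch condition down to %x.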
+ std::vector<Instruction*> Worklist;
+ LLVMContext &Context = Val->getContext();
+
+ // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
+ // in the loop with the appropriate one directly.
+ if (IsEqual || (isa<ConstantInt>(Val) &&
+ Val->getType()->isIntegerTy(1))) {
+ Value *Replacement;
+ if (IsEqual)
+ Replacement = Val;
+ else
+ Replacement = ConstantInt::get(Type::getInt1Ty(Val->getContext()),
+ !cast<ConstantInt>(Val)->getZExtValue());
+
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+ Worklist.push_back(UI);
+ }
+
+ for (Instruction *UI : Worklist)
+ UI->replaceUsesOfWith(LIC, Replacement);
+
+ simplifyCode(Worklist, L);
+ return;
+ }
+
+ // Otherwise, we don't know the precise value of LIC, but we do know that it
+ // is certainly NOT "Val". As such, simplify any uses in the loop that we
+ // can. This case occurs when we unswitch switch statements.
+ for (User *U : LIC->users()) {
+ Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI || !L->contains(UI))
+ continue;
+
+ // At this point, we know LIC is definitely not Val. Try to use some simple
+    // logic to simplify the user w.r.t. the context.
+ if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+ // This in-loop instruction has been simplified w.r.t. its context,
+        // i.e. LIC != Val; make sure we propagate its replacement value to
+ // all its users.
+ //
+        // We cannot delete UI, the LIC user, yet, because that would
+        // invalidate the LIC->users() iterator. However, we can make this
+        // instruction dead by replacing all of its uses and pushing it onto
+        // the worklist so that it can be properly deleted and its operands
+        // simplified.
+ UI->replaceAllUsesWith(Replacement);
+ }
+ }
+
+ // This is a LIC user, push it into the worklist so that simplifyCode can
+ // attempt to simplify it.
+ Worklist.push_back(UI);
+
+ // If we know that LIC is not Val, use this info to simplify code.
+ SwitchInst *SI = dyn_cast<SwitchInst>(UI);
+ if (!SI || !isa<ConstantInt>(Val)) continue;
+
+ // NOTE: if a case value for the switch is unswitched out, we record it
+    // after the unswitch finishes. We cannot record it here as the switch
+ // is not a direct user of the partial LIV.
+ SwitchInst::CaseHandle DeadCase =
+ *SI->findCaseValue(cast<ConstantInt>(Val));
+ // Default case is live for multiple values.
+ if (DeadCase == *SI->case_default())
+ continue;
+
+ // Found a dead case value. Don't remove PHI nodes in the
+    // successor if they become single-entry; those PHI nodes may
+ // be in the Users list.
+
+ BasicBlock *Switch = SI->getParent();
+ BasicBlock *SISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *Latch = L->getLoopLatch();
+
+ if (!SI->findCaseDest(SISucc)) continue; // Edge is critical.
+ // If the DeadCase successor dominates the loop latch, then the
+ // transformation isn't safe since it will delete the sole predecessor edge
+ // to the latch.
+ if (Latch && DT->dominates(SISucc, Latch))
+ continue;
+
+ // FIXME: This is a hack. We need to keep the successor around
+ // and hooked up so as to preserve the loop structure, because
+ // trying to update it is complicated. So instead we preserve the
+ // loop structure and put the block on a dead code path.
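+    // Illustrative sketch, not part of the original source: after the
+    // SplitEdge and rewiring below, the dead case roughly becomes
+    //   NewSISucc: br i1 true, label %us-unreachable, label %OldSISucc
+    // so the old successor is kept reachable only through a never-taken
+    // branch edge, preserving the loop structure for later passes.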
+ SplitEdge(Switch, SISucc, DT, LI, MSSAU.get());
+ // Compute the successors instead of relying on the return value
+ // of SplitEdge, since it may have split the switch successor
+ // after PHI nodes.
+ BasicBlock *NewSISucc = DeadCase.getCaseSuccessor();
+ BasicBlock *OldSISucc = *succ_begin(NewSISucc);
+ // Create an "unreachable" destination.
+ BasicBlock *Abort = BasicBlock::Create(Context, "us-unreachable",
+ Switch->getParent(),
+ OldSISucc);
+ new UnreachableInst(Context, Abort);
+ // Force the new case destination to branch to the "unreachable"
+ // block while maintaining a (dead) CFG edge to the old block.
+ NewSISucc->getTerminator()->eraseFromParent();
+ BranchInst::Create(Abort, OldSISucc,
+ ConstantInt::getTrue(Context), NewSISucc);
+ // Release the PHI operands for this edge.
+ for (PHINode &PN : NewSISucc->phis())
+ PN.setIncomingValueForBlock(Switch, UndefValue::get(PN.getType()));
+ // Tell the domtree about the new block. We don't fully update the
+ // domtree here -- instead we force it to do a full recomputation
+ // after the pass is complete -- but we do need to inform it of
+ // new blocks.
+ DT->addNewBlock(Abort, NewSISucc);
+ }
+
+ simplifyCode(Worklist, L);
+}
+
+/// Now that we have simplified some instructions in the loop, walk over it and
+/// constant prop, dce, and fold control flow where possible. Note that this is
+/// effectively a very simple loop-structure-aware optimizer. During processing
+/// of this loop, L could very well be deleted, so it must not be used.
+///
+/// FIXME: When the loop optimizer is more mature, separate this out to a new
+/// pass.
+///
+void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.back();
+ Worklist.pop_back();
+
+ // Simple DCE.
+ if (isInstructionTriviallyDead(I)) {
+ LLVM_DEBUG(dbgs() << "Remove dead instruction '" << *I << "\n");
+
+ // Add uses to the worklist, which may be dead now.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+ Worklist.push_back(Use);
+ removeFromWorklist(I, Worklist);
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+ I->eraseFromParent();
+ ++NumSimplify;
+ continue;
+ }
+
+ // See if instruction simplification can hack this up. This is common for
+ // things like "select false, X, Y" after unswitching made the condition be
+ // 'false'. TODO: update the domtree properly so we can pass it here.
+ if (Value *V = SimplifyInstruction(I, DL))
+ if (LI->replacementPreservesLCSSAForm(I, V)) {
+ replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
+ continue;
+ }
+
+ // Special case hacks that appear commonly in unswitched code.
+ if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+ if (BI->isUnconditional()) {
+ // If BI's parent is the only pred of the successor, fold the two blocks
+ // together.
+ BasicBlock *Pred = BI->getParent();
+ (void)Pred;
+ BasicBlock *Succ = BI->getSuccessor(0);
+ BasicBlock *SinglePred = Succ->getSinglePredecessor();
+ if (!SinglePred) continue; // Nothing to do.
+ assert(SinglePred == Pred && "CFG broken");
+
+ // Make the LPM and Worklist updates specific to LoopUnswitch.
+ removeFromWorklist(BI, Worklist);
+ auto SuccIt = Succ->begin();
+ while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
+ for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
+ if (Instruction *Use = dyn_cast<Instruction>(PN->getOperand(It)))
+ Worklist.push_back(Use);
+ for (User *U : PN->users())
+ Worklist.push_back(cast<Instruction>(U));
+ removeFromWorklist(PN, Worklist);
+ ++NumSimplify;
+ }
+ // Merge the block and make the remaining analyses updates.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(Succ, &DTU, LI, MSSAU.get());
+ ++NumSimplify;
+ continue;
+ }
+
+ continue;
+ }
+ }
+}
+
+/// Simple simplifications we can do given the information that Cond is
+/// definitely not equal to Val.
+Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
+ Value *Invariant,
+ Constant *Val) {
+ // icmp eq cond, val -> false
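+  // Illustrative example, not part of the original source: knowing that
+  // %cond != 7,
+  //   icmp eq i32 %cond, 7  -->  i1 false
+  //   icmp ne i32 %cond, 7  -->  i1 true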
+ ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+ if (CI && CI->isEquality()) {
+ Value *Op0 = CI->getOperand(0);
+ Value *Op1 = CI->getOperand(1);
+ if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+ LLVMContext &Ctx = Inst->getContext();
+ if (CI->getPredicate() == CmpInst::ICMP_EQ)
+ return ConstantInt::getFalse(Ctx);
+ else
+ return ConstantInt::getTrue(Ctx);
+ }
+ }
+
+ // FIXME: there may be other opportunities, e.g. comparison with floating
+ // point, or Invariant - Val != 0, etc.
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index c8cd007438..2ff1e84807 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -1,147 +1,147 @@
-//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// When alias analysis is uncertain about the aliasing between any two accesses,
-// it will return MayAlias. This uncertainty from alias analysis restricts LICM
-// from proceeding further. In cases where alias analysis is uncertain we might
-// use loop versioning as an alternative.
-//
-// Loop Versioning will create a version of the loop with aggressive aliasing
-// assumptions in addition to the original with conservative (default) aliasing
-// assumptions. The version of the loop making aggressive aliasing assumptions
-// will have all the memory accesses marked as no-alias. These two versions of
-// loop will be preceded by a memory runtime check. This runtime check consists
-// of bound checks for all unique memory accessed in loop, and it ensures the
-// lack of memory aliasing. The result of the runtime check determines which of
-// the loop versions is executed: If the runtime check detects any memory
-// aliasing, then the original loop is executed. Otherwise, the version with
-// aggressive aliasing assumptions is used.
-//
-// Following are the top level steps:
-//
-// a) Perform LoopVersioningLICM's feasibility check.
-// b) If loop is a candidate for versioning then create a memory bound check,
-// by considering all the memory accesses in loop body.
-// c) Clone original loop and set all memory accesses as no-alias in new loop.
-// d) Set original loop & versioned loop as a branch target of the runtime check
-// result.
-//
-// It transforms loop as shown below:
-//
-// +----------------+
-// |Runtime Memcheck|
-// +----------------+
-// |
-// +----------+----------------+----------+
-// | |
-// +---------+----------+ +-----------+----------+
-// |Orig Loop Preheader | |Cloned Loop Preheader |
-// +--------------------+ +----------------------+
-// | |
-// +--------------------+ +----------------------+
-// |Orig Loop Body | |Cloned Loop Body |
-// +--------------------+ +----------------------+
-// | |
-// +--------------------+ +----------------------+
-// |Orig Loop Exit Block| |Cloned Loop Exit Block|
-// +--------------------+ +-----------+----------+
-// | |
-// +----------+--------------+-----------+
-// |
-// +-----+----+
-// |Join Block|
-// +----------+
-//
-//===----------------------------------------------------------------------===//
-
+//===- LoopVersioningLICM.cpp - LICM Loop Versioning ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// When alias analysis is uncertain about the aliasing between any two accesses,
+// it will return MayAlias. This uncertainty from alias analysis restricts LICM
+// from proceeding further. In cases where alias analysis is uncertain we might
+// use loop versioning as an alternative.
+//
+// Loop Versioning will create a version of the loop with aggressive aliasing
+// assumptions in addition to the original with conservative (default) aliasing
+// assumptions. The version of the loop making aggressive aliasing assumptions
+// will have all the memory accesses marked as no-alias. These two versions of
+// loop will be preceded by a memory runtime check. This runtime check consists
+// of bound checks for all unique memory accessed in loop, and it ensures the
+// lack of memory aliasing. The result of the runtime check determines which of
+// the loop versions is executed: If the runtime check detects any memory
+// aliasing, then the original loop is executed. Otherwise, the version with
+// aggressive aliasing assumptions is used.
+//
+// Following are the top level steps:
+//
+// a) Perform LoopVersioningLICM's feasibility check.
+// b) If loop is a candidate for versioning then create a memory bound check,
+// by considering all the memory accesses in loop body.
+// c) Clone original loop and set all memory accesses as no-alias in new loop.
+// d) Set original loop & versioned loop as a branch target of the runtime check
+// result.
+//
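+// As an illustrative example (not part of the original header), for a loop
+// such as
+//
+//   for (int i = 0; i < n; ++i)
+//     a[i] += *b;            // static AA only knows 'a[i]' and '*b' MayAlias
+//
+// the pass emits a runtime check that the accessed ranges of 'a' and 'b' do
+// not overlap; in the no-alias version a later LICM run can hoist the load
+// of '*b' out of the loop.
+//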
+// It transforms loop as shown below:
+//
+// +----------------+
+// |Runtime Memcheck|
+// +----------------+
+// |
+// +----------+----------------+----------+
+// | |
+// +---------+----------+ +-----------+----------+
+// |Orig Loop Preheader | |Cloned Loop Preheader |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Body | |Cloned Loop Body |
+// +--------------------+ +----------------------+
+// | |
+// +--------------------+ +----------------------+
+// |Orig Loop Exit Block| |Cloned Loop Exit Block|
+// +--------------------+ +-----------+----------+
+// | |
+// +----------+--------------+-----------+
+// |
+// +-----+----+
+// |Join Block|
+// +----------+
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/LoopVersioningLICM.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include <cassert>
-#include <memory>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-versioning-licm"
-
-static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
-
-/// Threshold minimum allowed percentage for possible
-/// invariant instructions in a loop.
-static cl::opt<float>
- LVInvarThreshold("licm-versioning-invariant-threshold",
- cl::desc("LoopVersioningLICM's minimum allowed percentage"
- "of possible invariant instructions per loop"),
- cl::init(25), cl::Hidden);
-
-/// Threshold for maximum allowed loop nest/depth
-static cl::opt<unsigned> LVLoopDepthThreshold(
- "licm-versioning-max-depth-threshold",
- cl::desc(
- "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
- cl::init(2), cl::Hidden);
-
-namespace {
-
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include <cassert>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-versioning-licm"
+
+static const char *LICMVersioningMetaData = "llvm.loop.licm_versioning.disable";
+
+/// Threshold minimum allowed percentage for possible
+/// invariant instructions in a loop.
+static cl::opt<float>
+ LVInvarThreshold("licm-versioning-invariant-threshold",
+ cl::desc("LoopVersioningLICM's minimum allowed percentage"
+ "of possible invariant instructions per loop"),
+ cl::init(25), cl::Hidden);
+
+/// Threshold for maximum allowed loop nest/depth
+static cl::opt<unsigned> LVLoopDepthThreshold(
+ "licm-versioning-max-depth-threshold",
+ cl::desc(
+ "LoopVersioningLICM's threshold for maximum allowed loop nest/depth"),
+ cl::init(2), cl::Hidden);
+
+namespace {
+
struct LoopVersioningLICMLegacyPass : public LoopPass {
- static char ID;
-
+ static char ID;
+
LoopVersioningLICMLegacyPass() : LoopPass(ID) {
initializeLoopVersioningLICMLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
StringRef getPassName() const override { return "Loop Versioning for LICM"; }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequiredID(LCSSAID);
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequiredID(LCSSAID);
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ }
};
-
+
struct LoopVersioningLICM {
// We don't explicitly pass in LoopAccessInfo to the constructor since the
// loop versioning might return early due to instructions that are not safe
@@ -153,429 +153,429 @@ struct LoopVersioningLICM {
: AA(AA), SE(SE), GetLAI(GetLAI),
LoopDepthThreshold(LVLoopDepthThreshold),
InvariantThreshold(LVInvarThreshold), ORE(ORE) {}
-
+
bool runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT);
- void reset() {
- AA = nullptr;
- SE = nullptr;
- CurLoop = nullptr;
- LoadAndStoreCounter = 0;
- InvariantCounter = 0;
- IsReadOnlyLoop = true;
- ORE = nullptr;
- CurAST.reset();
- }
-
- class AutoResetter {
- public:
- AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
- ~AutoResetter() { LVLICM.reset(); }
-
- private:
- LoopVersioningLICM &LVLICM;
- };
-
-private:
- // Current AliasAnalysis information
- AliasAnalysis *AA = nullptr;
-
- // Current ScalarEvolution
- ScalarEvolution *SE = nullptr;
-
- // Current Loop's LoopAccessInfo
- const LoopAccessInfo *LAI = nullptr;
-
+ void reset() {
+ AA = nullptr;
+ SE = nullptr;
+ CurLoop = nullptr;
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ ORE = nullptr;
+ CurAST.reset();
+ }
+
+ class AutoResetter {
+ public:
+ AutoResetter(LoopVersioningLICM &LVLICM) : LVLICM(LVLICM) {}
+ ~AutoResetter() { LVLICM.reset(); }
+
+ private:
+ LoopVersioningLICM &LVLICM;
+ };
+
+private:
+ // Current AliasAnalysis information
+ AliasAnalysis *AA = nullptr;
+
+ // Current ScalarEvolution
+ ScalarEvolution *SE = nullptr;
+
+ // Current Loop's LoopAccessInfo
+ const LoopAccessInfo *LAI = nullptr;
+
// Proxy for retrieving LoopAccessInfo.
function_ref<const LoopAccessInfo &(Loop *)> GetLAI;
- // The current loop we are working on.
- Loop *CurLoop = nullptr;
-
- // AliasSet information for the current loop.
- std::unique_ptr<AliasSetTracker> CurAST;
-
- // Maximum loop nest threshold
- unsigned LoopDepthThreshold;
-
- // Minimum invariant threshold
- float InvariantThreshold;
-
- // Counter to track num of load & store
- unsigned LoadAndStoreCounter = 0;
-
- // Counter to track num of invariant
- unsigned InvariantCounter = 0;
-
- // Read only loop marker.
- bool IsReadOnlyLoop = true;
-
- // OptimizationRemarkEmitter
- OptimizationRemarkEmitter *ORE;
-
- bool isLegalForVersioning();
- bool legalLoopStructure();
- bool legalLoopInstructions();
- bool legalLoopMemoryAccesses();
- bool isLoopAlreadyVisited();
- void setNoAliasToLoop(Loop *VerLoop);
- bool instructionSafeForVersioning(Instruction *I);
-};
-
-} // end anonymous namespace
-
-/// Check loop structure and confirms it's good for LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopStructure() {
- // Loop must be in loop simplify form.
- if (!CurLoop->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
- return false;
- }
- // Loop should be innermost loop, if not return false.
- if (!CurLoop->getSubLoops().empty()) {
- LLVM_DEBUG(dbgs() << " loop is not innermost\n");
- return false;
- }
- // Loop should have a single backedge, if not return false.
- if (CurLoop->getNumBackEdges() != 1) {
- LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
- return false;
- }
- // Loop must have a single exiting block, if not return false.
- if (!CurLoop->getExitingBlock()) {
- LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n");
- return false;
- }
- // We only handle bottom-tested loop, i.e. loop in which the condition is
- // checked at the end of each iteration. With that we can assume that all
- // instructions in the loop are executed the same number of times.
- if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
- return false;
- }
- // Parallel loops must not have aliasing loop-invariant memory accesses.
- // Hence we don't need to version anything in this case.
- if (CurLoop->isAnnotatedParallel()) {
- LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
- return false;
- }
- // Loop depth more then LoopDepthThreshold are not allowed
- if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
- LLVM_DEBUG(dbgs() << " loop depth is more then threshold\n");
- return false;
- }
- // We need to be able to compute the loop trip count in order
- // to generate the bound checks.
- const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+ // The current loop we are working on.
+ Loop *CurLoop = nullptr;
+
+ // AliasSet information for the current loop.
+ std::unique_ptr<AliasSetTracker> CurAST;
+
+ // Maximum loop nest threshold
+ unsigned LoopDepthThreshold;
+
+ // Minimum invariant threshold
+ float InvariantThreshold;
+
+ // Counter to track num of load & store
+ unsigned LoadAndStoreCounter = 0;
+
+ // Counter to track num of invariant
+ unsigned InvariantCounter = 0;
+
+ // Read only loop marker.
+ bool IsReadOnlyLoop = true;
+
+ // OptimizationRemarkEmitter
+ OptimizationRemarkEmitter *ORE;
+
+ bool isLegalForVersioning();
+ bool legalLoopStructure();
+ bool legalLoopInstructions();
+ bool legalLoopMemoryAccesses();
+ bool isLoopAlreadyVisited();
+ void setNoAliasToLoop(Loop *VerLoop);
+ bool instructionSafeForVersioning(Instruction *I);
+};
+
+} // end anonymous namespace
+
+/// Check the loop structure and confirm it's good for LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopStructure() {
+ // Loop must be in loop simplify form.
+ if (!CurLoop->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << " loop is not in loop-simplify form.\n");
+ return false;
+ }
+ // Loop should be innermost loop, if not return false.
+ if (!CurLoop->getSubLoops().empty()) {
+ LLVM_DEBUG(dbgs() << " loop is not innermost\n");
+ return false;
+ }
+ // Loop should have a single backedge, if not return false.
+ if (CurLoop->getNumBackEdges() != 1) {
+ LLVM_DEBUG(dbgs() << " loop has multiple backedges\n");
+ return false;
+ }
+ // Loop must have a single exiting block, if not return false.
+ if (!CurLoop->getExitingBlock()) {
+ LLVM_DEBUG(dbgs() << " loop has multiple exiting block\n");
+ return false;
+ }
+ // We only handle bottom-tested loop, i.e. loop in which the condition is
+ // checked at the end of each iteration. With that we can assume that all
+ // instructions in the loop are executed the same number of times.
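+  // For example (illustrative, not from the original source): a rotated,
+  // do-while style loop whose latch block both tests the condition and
+  // exits satisfies this; a loop that can exit from its header before the
+  // body runs does not.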
+ if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+ LLVM_DEBUG(dbgs() << " loop is not bottom tested\n");
+ return false;
+ }
+ // Parallel loops must not have aliasing loop-invariant memory accesses.
+ // Hence we don't need to version anything in this case.
+ if (CurLoop->isAnnotatedParallel()) {
+ LLVM_DEBUG(dbgs() << " Parallel loop is not worth versioning\n");
+ return false;
+ }
+  // Loop depth greater than LoopDepthThreshold is not allowed.
+  if (CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    LLVM_DEBUG(dbgs() << "    loop depth is more than the threshold\n");
+ return false;
+ }
+ // We need to be able to compute the loop trip count in order
+ // to generate the bound checks.
+ const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
if (isa<SCEVCouldNotCompute>(ExitCount)) {
- LLVM_DEBUG(dbgs() << " loop does not has trip count\n");
- return false;
- }
- return true;
-}
-
-/// Check memory accesses in loop and confirms it's good for
-/// LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopMemoryAccesses() {
- bool HasMayAlias = false;
- bool TypeSafety = false;
- bool HasMod = false;
- // Memory check:
- // Transform phase will generate a versioned loop and also a runtime check to
- // ensure the pointers are independent and they don’t alias.
- // In version variant of loop, alias meta data asserts that all access are
- // mutually independent.
- //
- // Pointers aliasing in alias domain are avoided because with multiple
- // aliasing domains we may not be able to hoist potential loop invariant
- // access out of the loop.
- //
- // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
- // must alias set.
- for (const auto &I : *CurAST) {
- const AliasSet &AS = I;
- // Skip Forward Alias Sets, as this should be ignored as part of
- // the AliasSetTracker object.
- if (AS.isForwardingAliasSet())
- continue;
- // With MustAlias its not worth adding runtime bound check.
- if (AS.isMustAlias())
- return false;
- Value *SomePtr = AS.begin()->getValue();
- bool TypeCheck = true;
- // Check for Mod & MayAlias
- HasMayAlias |= AS.isMayAlias();
- HasMod |= AS.isMod();
- for (const auto &A : AS) {
- Value *Ptr = A.getValue();
- // Alias tracker should have pointers of same data type.
- TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
- }
- // At least one alias tracker should have pointers of same data type.
- TypeSafety |= TypeCheck;
- }
- // Ensure types should be of same type.
- if (!TypeSafety) {
- LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
- return false;
- }
- // Ensure loop body shouldn't be read only.
- if (!HasMod) {
- LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
- return false;
- }
- // Make sure alias set has may alias case.
- // If there no alias memory ambiguity, return false.
- if (!HasMayAlias) {
- LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
- return false;
- }
- return true;
-}
-
-/// Check loop instructions safe for Loop versioning.
-/// It returns true if it's safe else returns false.
-/// Consider following:
-/// 1) Check all load store in loop body are non atomic & non volatile.
-/// 2) Check function call safety, by ensuring its not accessing memory.
-/// 3) Loop body shouldn't have any may throw instruction.
-/// 4) Loop body shouldn't have any convergent or noduplicate instructions.
-bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
- assert(I != nullptr && "Null instruction found!");
- // Check function call safety
- if (auto *Call = dyn_cast<CallBase>(I)) {
- if (Call->isConvergent() || Call->cannotDuplicate()) {
- LLVM_DEBUG(dbgs() << " Convergent call site found.\n");
- return false;
- }
-
- if (!AA->doesNotAccessMemory(Call)) {
- LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
- return false;
- }
- }
-
- // Avoid loops with possiblity of throw
- if (I->mayThrow()) {
- LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
- return false;
- }
- // If current instruction is load instructions
- // make sure it's a simple load (non atomic & non volatile)
- if (I->mayReadFromMemory()) {
- LoadInst *Ld = dyn_cast<LoadInst>(I);
- if (!Ld || !Ld->isSimple()) {
- LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
- return false;
- }
- LoadAndStoreCounter++;
- Value *Ptr = Ld->getPointerOperand();
- // Check loop invariant.
- if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
- InvariantCounter++;
- }
- // If current instruction is store instruction
- // make sure it's a simple store (non atomic & non volatile)
- else if (I->mayWriteToMemory()) {
- StoreInst *St = dyn_cast<StoreInst>(I);
- if (!St || !St->isSimple()) {
- LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
- return false;
- }
- LoadAndStoreCounter++;
- Value *Ptr = St->getPointerOperand();
- // Check loop invariant.
- if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
- InvariantCounter++;
-
- IsReadOnlyLoop = false;
- }
- return true;
-}
-
-/// Check loop instructions and confirms it's good for
-/// LoopVersioningLICM.
-bool LoopVersioningLICM::legalLoopInstructions() {
- // Resetting counters.
- LoadAndStoreCounter = 0;
- InvariantCounter = 0;
- IsReadOnlyLoop = true;
- using namespace ore;
- // Iterate over loop blocks and instructions of each block and check
- // instruction safety.
- for (auto *Block : CurLoop->getBlocks())
- for (auto &Inst : *Block) {
- // If instruction is unsafe just return false.
- if (!instructionSafeForVersioning(&Inst)) {
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
- << " Unsafe Loop Instruction";
- });
- return false;
- }
- }
+ LLVM_DEBUG(dbgs() << " loop does not has trip count\n");
+ return false;
+ }
+ return true;
+}
+
+/// Check the loop's memory accesses and confirm they are good for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopMemoryAccesses() {
+ bool HasMayAlias = false;
+ bool TypeSafety = false;
+ bool HasMod = false;
+ // Memory check:
+ // Transform phase will generate a versioned loop and also a runtime check to
+  // ensure the pointers are independent and don't alias.
+  // In the versioned variant of the loop, alias metadata asserts that all
+  // accesses are mutually independent.
+ //
+ // Pointers aliasing in alias domain are avoided because with multiple
+ // aliasing domains we may not be able to hoist potential loop invariant
+ // access out of the loop.
+ //
+ // Iterate over alias tracker sets, and confirm AliasSets doesn't have any
+ // must alias set.
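+  // Illustrative sketch, not part of the original source: two pointer
+  // arguments 'p' and 'q' that static AA can only classify as MayAlias end
+  // up in a single may-alias AliasSet here; that ambiguity is what the
+  // runtime overlap check later resolves. A MustAlias set (e.g. two uses of
+  // the same pointer) makes versioning pointless, so we bail out below.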
+ for (const auto &I : *CurAST) {
+ const AliasSet &AS = I;
+ // Skip Forward Alias Sets, as this should be ignored as part of
+ // the AliasSetTracker object.
+ if (AS.isForwardingAliasSet())
+ continue;
+    // With MustAlias it's not worth adding a runtime bound check.
+ if (AS.isMustAlias())
+ return false;
+ Value *SomePtr = AS.begin()->getValue();
+ bool TypeCheck = true;
+ // Check for Mod & MayAlias
+ HasMayAlias |= AS.isMayAlias();
+ HasMod |= AS.isMod();
+ for (const auto &A : AS) {
+ Value *Ptr = A.getValue();
+ // Alias tracker should have pointers of same data type.
+ TypeCheck = (TypeCheck && (SomePtr->getType() == Ptr->getType()));
+ }
+ // At least one alias tracker should have pointers of same data type.
+ TypeSafety |= TypeCheck;
+ }
+  // Ensure at least one alias set had pointers of the same type.
+ if (!TypeSafety) {
+ LLVM_DEBUG(dbgs() << " Alias tracker type safety failed!\n");
+ return false;
+ }
+  // Ensure the loop body is not read-only.
+ if (!HasMod) {
+ LLVM_DEBUG(dbgs() << " No memory modified in loop body\n");
+ return false;
+ }
+  // Make sure at least one alias set has a may-alias case.
+  // If there is no memory-aliasing ambiguity, return false.
+ if (!HasMayAlias) {
+ LLVM_DEBUG(dbgs() << " No ambiguity in memory access.\n");
+ return false;
+ }
+ return true;
+}
+
+/// Check whether an instruction is safe for loop versioning.
+/// Returns true if it is safe, false otherwise.
+/// The following is considered:
+/// 1) All loads and stores in the loop body must be non-atomic and
+///    non-volatile.
+/// 2) Function calls must not access memory.
+/// 3) The loop body must not contain any instruction that may throw.
+/// 4) The loop body must not contain any convergent or noduplicate
+///    instructions.
+bool LoopVersioningLICM::instructionSafeForVersioning(Instruction *I) {
+ assert(I != nullptr && "Null instruction found!");
+ // Check function call safety
+ if (auto *Call = dyn_cast<CallBase>(I)) {
+ if (Call->isConvergent() || Call->cannotDuplicate()) {
+ LLVM_DEBUG(dbgs() << " Convergent call site found.\n");
+ return false;
+ }
+
+ if (!AA->doesNotAccessMemory(Call)) {
+ LLVM_DEBUG(dbgs() << " Unsafe call site found.\n");
+ return false;
+ }
+ }
+
+  // Avoid loops that may throw.
+ if (I->mayThrow()) {
+ LLVM_DEBUG(dbgs() << " May throw instruction found in loop body\n");
+ return false;
+ }
+  // If the current instruction is a load, make sure it is a simple load
+  // (non-atomic and non-volatile).
+ if (I->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(I);
+ if (!Ld || !Ld->isSimple()) {
+ LLVM_DEBUG(dbgs() << " Found a non-simple load.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = Ld->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+ }
+  // If the current instruction is a store, make sure it is a simple store
+  // (non-atomic and non-volatile).
+ else if (I->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(I);
+ if (!St || !St->isSimple()) {
+ LLVM_DEBUG(dbgs() << " Found a non-simple store.\n");
+ return false;
+ }
+ LoadAndStoreCounter++;
+ Value *Ptr = St->getPointerOperand();
+ // Check loop invariant.
+ if (SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop))
+ InvariantCounter++;
+
+ IsReadOnlyLoop = false;
+ }
+ return true;
+}
+
+/// Check the loop's instructions and confirm they are good for
+/// LoopVersioningLICM.
+bool LoopVersioningLICM::legalLoopInstructions() {
+ // Resetting counters.
+ LoadAndStoreCounter = 0;
+ InvariantCounter = 0;
+ IsReadOnlyLoop = true;
+ using namespace ore;
+ // Iterate over loop blocks and instructions of each block and check
+ // instruction safety.
+ for (auto *Block : CurLoop->getBlocks())
+ for (auto &Inst : *Block) {
+ // If instruction is unsafe just return false.
+ if (!instructionSafeForVersioning(&Inst)) {
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopInst", &Inst)
+ << " Unsafe Loop Instruction";
+ });
+ return false;
+ }
+ }
// Get LoopAccessInfo from current loop via the proxy.
LAI = &GetLAI(CurLoop);
- // Check LoopAccessInfo for need of runtime check.
- if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
- LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
- return false;
- }
- // Number of runtime-checks should be less then RuntimeMemoryCheckThreshold
- if (LAI->getNumRuntimePointerChecks() >
- VectorizerParams::RuntimeMemoryCheckThreshold) {
- LLVM_DEBUG(
- dbgs() << " LAA: Runtime checks are more than threshold !!\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << "Number of runtime checks "
- << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
- << " exceeds threshold "
- << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
- });
- return false;
- }
- // Loop should have at least one invariant load or store instruction.
- if (!InvariantCounter) {
- LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
- return false;
- }
- // Read only loop not allowed.
- if (IsReadOnlyLoop) {
- LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
- return false;
- }
- // Profitablity check:
- // Check invariant threshold, should be in limit.
- if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
- LLVM_DEBUG(
- dbgs()
- << " Invariant load & store are less then defined threshold\n");
- LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
- << ((InvariantCounter * 100) / LoadAndStoreCounter)
- << "%\n");
- LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
- << InvariantThreshold << "%\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << "Invariant load & store "
- << NV("LoadAndStoreCounter",
- ((InvariantCounter * 100) / LoadAndStoreCounter))
- << " are less then defined threshold "
- << NV("Threshold", InvariantThreshold);
- });
- return false;
- }
- return true;
-}
-
-/// It checks loop is already visited or not.
-/// check loop meta data, if loop revisited return true
-/// else false.
-bool LoopVersioningLICM::isLoopAlreadyVisited() {
- // Check LoopVersioningLICM metadata into loop
- if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
- return true;
- }
- return false;
-}
-
-/// Checks legality for LoopVersioningLICM by considering following:
-/// a) loop structure legality b) loop instruction legality
-/// c) loop memory access legality.
-/// Return true if legal else returns false.
-bool LoopVersioningLICM::isLegalForVersioning() {
- using namespace ore;
- LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
- // Make sure not re-visiting same loop again.
- if (isLoopAlreadyVisited()) {
- LLVM_DEBUG(
- dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
- return false;
- }
- // Check loop structure leagality.
- if (!legalLoopStructure()) {
- LLVM_DEBUG(
- dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << " Unsafe Loop structure";
- });
- return false;
- }
- // Check loop instruction leagality.
- if (!legalLoopInstructions()) {
- LLVM_DEBUG(
- dbgs()
- << " Loop instructions not suitable for LoopVersioningLICM\n\n");
- return false;
- }
- // Check loop memory access leagality.
- if (!legalLoopMemoryAccesses()) {
- LLVM_DEBUG(
- dbgs()
- << " Loop memory access not suitable for LoopVersioningLICM\n\n");
- ORE->emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
- CurLoop->getStartLoc(),
- CurLoop->getHeader())
- << " Unsafe Loop memory access";
- });
- return false;
- }
- // Loop versioning is feasible, return true.
- LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
- CurLoop->getStartLoc(), CurLoop->getHeader())
- << " Versioned loop for LICM."
- << " Number of runtime checks we had to insert "
- << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
- });
- return true;
-}
-
-/// Update loop with aggressive aliasing assumptions.
-/// It marks no-alias to any pairs of memory operations by assuming
-/// loop should not have any must-alias memory accesses pairs.
-/// During LoopVersioningLICM legality we ignore loops having must
-/// aliasing memory accesses.
-void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
- // Get latch terminator instruction.
- Instruction *I = VerLoop->getLoopLatch()->getTerminator();
- // Create alias scope domain.
- MDBuilder MDB(I->getContext());
- MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
- StringRef Name = "LVAliasScope";
- MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ // Check LoopAccessInfo for need of runtime check.
+ if (LAI->getRuntimePointerChecking()->getChecks().empty()) {
+ LLVM_DEBUG(dbgs() << " LAA: Runtime check not found !!\n");
+ return false;
+ }
+  // Number of runtime checks should be less than RuntimeMemoryCheckThreshold.
+ if (LAI->getNumRuntimePointerChecks() >
+ VectorizerParams::RuntimeMemoryCheckThreshold) {
+ LLVM_DEBUG(
+ dbgs() << " LAA: Runtime checks are more than threshold !!\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "RuntimeCheck",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Number of runtime checks "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks())
+ << " exceeds threshold "
+ << NV("Threshold", VectorizerParams::RuntimeMemoryCheckThreshold);
+ });
+ return false;
+ }
+ // Loop should have at least one invariant load or store instruction.
+ if (!InvariantCounter) {
+ LLVM_DEBUG(dbgs() << " Invariant not found !!\n");
+ return false;
+ }
+ // Read only loop not allowed.
+ if (IsReadOnlyLoop) {
+ LLVM_DEBUG(dbgs() << " Found a read-only loop!\n");
+ return false;
+ }
+  // Profitability check:
+  // Check that the invariant ratio meets the threshold.
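+  // Worked example (illustrative, assuming the default 25% threshold): with
+  // 8 loads/stores, 2 invariant accesses pass, since 2 * 100 = 200 is not
+  // less than 25 * 8 = 200, while a single invariant access fails because
+  // 1 * 100 = 100 < 200.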
+ if (InvariantCounter * 100 < InvariantThreshold * LoadAndStoreCounter) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Invariant load & store are less then defined threshold\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & stores: "
+ << ((InvariantCounter * 100) / LoadAndStoreCounter)
+ << "%\n");
+ LLVM_DEBUG(dbgs() << " Invariant loads & store threshold: "
+ << InvariantThreshold << "%\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "InvariantThreshold",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << "Invariant load & store "
+ << NV("LoadAndStoreCounter",
+ ((InvariantCounter * 100) / LoadAndStoreCounter))
+ << " are less then defined threshold "
+ << NV("Threshold", InvariantThreshold);
+ });
+ return false;
+ }
+ return true;
+}
+
+/// Check whether the loop has already been visited, based on its metadata;
+/// return true if it has been, false otherwise.
+bool LoopVersioningLICM::isLoopAlreadyVisited() {
+  // Check for LoopVersioningLICM metadata on the loop.
+ if (findStringMetadataForLoop(CurLoop, LICMVersioningMetaData)) {
+ return true;
+ }
+ return false;
+}
+
+/// Checks legality for LoopVersioningLICM by considering following:
+/// a) loop structure legality b) loop instruction legality
+/// c) loop memory access legality.
+/// Return true if legal else returns false.
+bool LoopVersioningLICM::isLegalForVersioning() {
+ using namespace ore;
+ LLVM_DEBUG(dbgs() << "Loop: " << *CurLoop);
+ // Make sure not re-visiting same loop again.
+ if (isLoopAlreadyVisited()) {
+ LLVM_DEBUG(
+ dbgs() << " Revisiting loop in LoopVersioningLICM not allowed.\n\n");
+ return false;
+ }
+  // Check loop structure legality.
+ if (!legalLoopStructure()) {
+ LLVM_DEBUG(
+ dbgs() << " Loop structure not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopStruct",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop structure";
+ });
+ return false;
+ }
+  // Check loop instruction legality.
+ if (!legalLoopInstructions()) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop instructions not suitable for LoopVersioningLICM\n\n");
+ return false;
+ }
+  // Check loop memory access legality.
+ if (!legalLoopMemoryAccesses()) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Loop memory access not suitable for LoopVersioningLICM\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IllegalLoopMemoryAccess",
+ CurLoop->getStartLoc(),
+ CurLoop->getHeader())
+ << " Unsafe Loop memory access";
+ });
+ return false;
+ }
+ // Loop versioning is feasible, return true.
+ LLVM_DEBUG(dbgs() << " Loop Versioning found to be beneficial\n\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "IsLegalForVersioning",
+ CurLoop->getStartLoc(), CurLoop->getHeader())
+ << " Versioned loop for LICM."
+ << " Number of runtime checks we had to insert "
+ << NV("RuntimeChecks", LAI->getNumRuntimePointerChecks());
+ });
+ return true;
+}
+
+/// Update the loop with aggressive aliasing assumptions.
+/// It marks all pairs of memory operations as no-alias, relying on the
+/// assumption that the loop has no must-alias memory access pairs;
+/// loops with must-aliasing accesses are already rejected during the
+/// LoopVersioningLICM legality check.
+void LoopVersioningLICM::setNoAliasToLoop(Loop *VerLoop) {
+ // Get latch terminator instruction.
+ Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+ // Create alias scope domain.
+ MDBuilder MDB(I->getContext());
+ MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+ StringRef Name = "LVAliasScope";
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
SmallVector<Metadata *, 4> Scopes{NewScope}, NoAliases{NewScope};
- // Iterate over each instruction of loop.
- // set no-alias for all load & store instructions.
- for (auto *Block : CurLoop->getBlocks()) {
- for (auto &Inst : *Block) {
- // Only interested in instruction that may modify or read memory.
- if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
- continue;
- // Set no-alias for current instruction.
- Inst.setMetadata(
- LLVMContext::MD_noalias,
- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
- MDNode::get(Inst.getContext(), NoAliases)));
- // set alias-scope for current instruction.
- Inst.setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(Inst.getContext(), Scopes)));
- }
- }
-}
-
+  // Iterate over each instruction of the loop and set no-alias metadata
+  // on all load and store instructions.
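+  // Illustrative sketch, not part of the original source: after this runs, a
+  // load in the versioned loop carries metadata roughly of the form
+  //   %v = load i32, i32* %p, !alias.scope !1, !noalias !1
+  // where !1 refers to the single anonymous scope created above, so alias
+  // analysis treats the annotated accesses in this loop as pairwise
+  // no-alias.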
+ for (auto *Block : CurLoop->getBlocks()) {
+ for (auto &Inst : *Block) {
+ // Only interested in instruction that may modify or read memory.
+ if (!Inst.mayReadFromMemory() && !Inst.mayWriteToMemory())
+ continue;
+ // Set no-alias for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(Inst.getContext(), NoAliases)));
+ // set alias-scope for current instruction.
+ Inst.setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(Inst.getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Inst.getContext(), Scopes)));
+ }
+ }
+}
+
bool LoopVersioningLICMLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (skipLoop(L))
return false;
@@ -595,68 +595,68 @@ bool LoopVersioningLICMLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
}
bool LoopVersioningLICM::runOnLoop(Loop *L, LoopInfo *LI, DominatorTree *DT) {
- // This will automatically release all resources hold by the current
- // LoopVersioningLICM object.
- AutoResetter Resetter(*this);
-
- // Do not do the transformation if disabled by metadata.
- if (hasLICMVersioningTransformation(L) & TM_Disable)
- return false;
-
- // Set Current Loop
- CurLoop = L;
- CurAST.reset(new AliasSetTracker(*AA));
-
- // Loop over the body of this loop, construct AST.
- for (auto *Block : L->getBlocks()) {
- if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
- CurAST->add(*Block); // Incorporate the specified basic block
- }
-
- bool Changed = false;
-
- // Check feasiblity of LoopVersioningLICM.
- // If versioning found to be feasible and beneficial then proceed
- // else simply return, by cleaning up memory.
- if (isLegalForVersioning()) {
- // Do loop versioning.
- // Create memcheck for memory accessed inside loop.
- // Clone original loop, and set blocks properly.
+  // This will automatically release all resources held by the current
+ // LoopVersioningLICM object.
+ AutoResetter Resetter(*this);
+
+ // Do not do the transformation if disabled by metadata.
+ if (hasLICMVersioningTransformation(L) & TM_Disable)
+ return false;
+
+ // Set Current Loop
+ CurLoop = L;
+ CurAST.reset(new AliasSetTracker(*AA));
+
+ // Loop over the body of this loop, construct AST.
+ for (auto *Block : L->getBlocks()) {
+ if (LI->getLoopFor(Block) == L) // Ignore blocks in subloop.
+ CurAST->add(*Block); // Incorporate the specified basic block
+ }
+
+ bool Changed = false;
+
+  // Check the feasibility of LoopVersioningLICM.
+  // If versioning is found to be feasible and beneficial, proceed;
+  // otherwise simply return, cleaning up memory.
+ if (isLegalForVersioning()) {
+ // Do loop versioning.
+ // Create memcheck for memory accessed inside loop.
+ // Clone original loop, and set blocks properly.
LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
CurLoop, LI, DT, SE);
- LVer.versionLoop();
- // Set Loop Versioning metaData for original loop.
- addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
- // Set Loop Versioning metaData for version loop.
- addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
- // Set "llvm.mem.parallel_loop_access" metaData to versioned loop.
- // FIXME: "llvm.mem.parallel_loop_access" annotates memory access
- // instructions, not loops.
- addStringMetadataToLoop(LVer.getVersionedLoop(),
- "llvm.mem.parallel_loop_access");
- // Update version loop with aggressive aliasing assumption.
- setNoAliasToLoop(LVer.getVersionedLoop());
- Changed = true;
- }
- return Changed;
-}
-
+ LVer.versionLoop();
+ // Set Loop Versioning metaData for original loop.
+ addStringMetadataToLoop(LVer.getNonVersionedLoop(), LICMVersioningMetaData);
+ // Set Loop Versioning metaData for version loop.
+ addStringMetadataToLoop(LVer.getVersionedLoop(), LICMVersioningMetaData);
+ // Set "llvm.mem.parallel_loop_access" metaData to versioned loop.
+ // FIXME: "llvm.mem.parallel_loop_access" annotates memory access
+ // instructions, not loops.
+ addStringMetadataToLoop(LVer.getVersionedLoop(),
+ "llvm.mem.parallel_loop_access");
+ // Update version loop with aggressive aliasing assumption.
+ setNoAliasToLoop(LVer.getVersionedLoop());
+ Changed = true;
+ }
+ return Changed;
+}
+
char LoopVersioningLICMLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
- "Loop Versioning For LICM", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+ "Loop Versioning For LICM", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLICMLegacyPass, "loop-versioning-licm",
- "Loop Versioning For LICM", false, false)
-
+ "Loop Versioning For LICM", false, false)
+
Pass *llvm::createLoopVersioningLICMPass() {
return new LoopVersioningLICMLegacyPass();
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
index d9904a58a0..d1f67b355b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -1,177 +1,177 @@
-//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers atomic intrinsics to non-atomic form for use in a known
-// non-preemptible environment.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerAtomic.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loweratomic"
-
-static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
- IRBuilder<> Builder(CXI);
- Value *Ptr = CXI->getPointerOperand();
- Value *Cmp = CXI->getCompareOperand();
- Value *Val = CXI->getNewValOperand();
-
- LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
- Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
- Value *Res = Builder.CreateSelect(Equal, Val, Orig);
- Builder.CreateStore(Res, Ptr);
-
- Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
- Res = Builder.CreateInsertValue(Res, Equal, 1);
-
- CXI->replaceAllUsesWith(Res);
- CXI->eraseFromParent();
- return true;
-}
-
-static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
- IRBuilder<> Builder(RMWI);
- Value *Ptr = RMWI->getPointerOperand();
- Value *Val = RMWI->getValOperand();
-
- LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
- Value *Res = nullptr;
-
- switch (RMWI->getOperation()) {
- default: llvm_unreachable("Unexpected RMW operation");
- case AtomicRMWInst::Xchg:
- Res = Val;
- break;
- case AtomicRMWInst::Add:
- Res = Builder.CreateAdd(Orig, Val);
- break;
- case AtomicRMWInst::Sub:
- Res = Builder.CreateSub(Orig, Val);
- break;
- case AtomicRMWInst::And:
- Res = Builder.CreateAnd(Orig, Val);
- break;
- case AtomicRMWInst::Nand:
- Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
- break;
- case AtomicRMWInst::Or:
- Res = Builder.CreateOr(Orig, Val);
- break;
- case AtomicRMWInst::Xor:
- Res = Builder.CreateXor(Orig, Val);
- break;
- case AtomicRMWInst::Max:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
- Val, Orig);
- break;
- case AtomicRMWInst::Min:
- Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
- Orig, Val);
- break;
- case AtomicRMWInst::UMax:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
- Val, Orig);
- break;
- case AtomicRMWInst::UMin:
- Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
- Orig, Val);
- break;
- case AtomicRMWInst::FAdd:
- Res = Builder.CreateFAdd(Orig, Val);
- break;
- case AtomicRMWInst::FSub:
- Res = Builder.CreateFSub(Orig, Val);
- break;
- }
- Builder.CreateStore(Res, Ptr);
- RMWI->replaceAllUsesWith(Orig);
- RMWI->eraseFromParent();
- return true;
-}
-
-static bool LowerFenceInst(FenceInst *FI) {
- FI->eraseFromParent();
- return true;
-}
-
-static bool LowerLoadInst(LoadInst *LI) {
- LI->setAtomic(AtomicOrdering::NotAtomic);
- return true;
-}
-
-static bool LowerStoreInst(StoreInst *SI) {
- SI->setAtomic(AtomicOrdering::NotAtomic);
- return true;
-}
-
-static bool runOnBasicBlock(BasicBlock &BB) {
- bool Changed = false;
- for (Instruction &Inst : make_early_inc_range(BB)) {
- if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
- Changed |= LowerFenceInst(FI);
- else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
- Changed |= LowerAtomicCmpXchgInst(CXI);
- else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
- Changed |= LowerAtomicRMWInst(RMWI);
- else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
- if (LI->isAtomic())
- LowerLoadInst(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
- if (SI->isAtomic())
- LowerStoreInst(SI);
- }
- }
- return Changed;
-}
-
-static bool lowerAtomics(Function &F) {
- bool Changed = false;
- for (BasicBlock &BB : F) {
- Changed |= runOnBasicBlock(BB);
- }
- return Changed;
-}
-
-PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
- if (lowerAtomics(F))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
-
-namespace {
-class LowerAtomicLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LowerAtomicLegacyPass() : FunctionPass(ID) {
- initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- // Don't skip optnone functions; atomics still need to be lowered.
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
-
-private:
- LowerAtomicPass Impl;
- };
-}
-
-char LowerAtomicLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
- "Lower atomic intrinsics to non-atomic form", false, false)
-
-Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
+//===- LowerAtomic.cpp - Lower atomic intrinsics --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers atomic intrinsics to non-atomic form for use in a known
+// non-preemptible environment.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerAtomic.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loweratomic"
+
+static bool LowerAtomicCmpXchgInst(AtomicCmpXchgInst *CXI) {
+ IRBuilder<> Builder(CXI);
+ Value *Ptr = CXI->getPointerOperand();
+ Value *Cmp = CXI->getCompareOperand();
+ Value *Val = CXI->getNewValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
+ Value *Equal = Builder.CreateICmpEQ(Orig, Cmp);
+ Value *Res = Builder.CreateSelect(Equal, Val, Orig);
+ Builder.CreateStore(Res, Ptr);
+
+ Res = Builder.CreateInsertValue(UndefValue::get(CXI->getType()), Orig, 0);
+ Res = Builder.CreateInsertValue(Res, Equal, 1);
+
+ CXI->replaceAllUsesWith(Res);
+ CXI->eraseFromParent();
+ return true;
+}
+
+static bool LowerAtomicRMWInst(AtomicRMWInst *RMWI) {
+ IRBuilder<> Builder(RMWI);
+ Value *Ptr = RMWI->getPointerOperand();
+ Value *Val = RMWI->getValOperand();
+
+ LoadInst *Orig = Builder.CreateLoad(Val->getType(), Ptr);
+ Value *Res = nullptr;
+
+ switch (RMWI->getOperation()) {
+ default: llvm_unreachable("Unexpected RMW operation");
+ case AtomicRMWInst::Xchg:
+ Res = Val;
+ break;
+ case AtomicRMWInst::Add:
+ Res = Builder.CreateAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::Sub:
+ Res = Builder.CreateSub(Orig, Val);
+ break;
+ case AtomicRMWInst::And:
+ Res = Builder.CreateAnd(Orig, Val);
+ break;
+ case AtomicRMWInst::Nand:
+ Res = Builder.CreateNot(Builder.CreateAnd(Orig, Val));
+ break;
+ case AtomicRMWInst::Or:
+ Res = Builder.CreateOr(Orig, Val);
+ break;
+ case AtomicRMWInst::Xor:
+ Res = Builder.CreateXor(Orig, Val);
+ break;
+ case AtomicRMWInst::Max:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::Min:
+ Res = Builder.CreateSelect(Builder.CreateICmpSLT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::UMax:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Val, Orig);
+ break;
+ case AtomicRMWInst::UMin:
+ Res = Builder.CreateSelect(Builder.CreateICmpULT(Orig, Val),
+ Orig, Val);
+ break;
+ case AtomicRMWInst::FAdd:
+ Res = Builder.CreateFAdd(Orig, Val);
+ break;
+ case AtomicRMWInst::FSub:
+ Res = Builder.CreateFSub(Orig, Val);
+ break;
+ }
+ Builder.CreateStore(Res, Ptr);
+ RMWI->replaceAllUsesWith(Orig);
+ RMWI->eraseFromParent();
+ return true;
+}
+
+static bool LowerFenceInst(FenceInst *FI) {
+ FI->eraseFromParent();
+ return true;
+}
+
+static bool LowerLoadInst(LoadInst *LI) {
+ LI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool LowerStoreInst(StoreInst *SI) {
+ SI->setAtomic(AtomicOrdering::NotAtomic);
+ return true;
+}
+
+static bool runOnBasicBlock(BasicBlock &BB) {
+ bool Changed = false;
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
+ Changed |= LowerFenceInst(FI);
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
+ Changed |= LowerAtomicCmpXchgInst(CXI);
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
+ Changed |= LowerAtomicRMWInst(RMWI);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
+ if (LI->isAtomic())
+ LowerLoadInst(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
+ if (SI->isAtomic())
+ LowerStoreInst(SI);
+ }
+ }
+ return Changed;
+}
+
+static bool lowerAtomics(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F) {
+ Changed |= runOnBasicBlock(BB);
+ }
+ return Changed;
+}
+
+PreservedAnalyses LowerAtomicPass::run(Function &F, FunctionAnalysisManager &) {
+ if (lowerAtomics(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class LowerAtomicLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerAtomicLegacyPass() : FunctionPass(ID) {
+ initializeLowerAtomicLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ // Don't skip optnone functions; atomics still need to be lowered.
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+private:
+ LowerAtomicPass Impl;
+ };
+}
+
+char LowerAtomicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerAtomicLegacyPass, "loweratomic",
+ "Lower atomic intrinsics to non-atomic form", false, false)
+
+Pass *llvm::createLowerAtomicPass() { return new LowerAtomicLegacyPass(); }
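The file above defines both a new-pass-manager LowerAtomicPass and a legacy wrapper. As orientation for readers of this diff, here is a minimal sketch of how the new-PM pass could be scheduled over a whole module; it assumes LLVM 12 headers, and the driver name runLowerAtomicOnModule is illustrative, not part of this tree.

#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/LowerAtomic.h"

using namespace llvm;

// Illustrative driver: schedule LowerAtomicPass on every function in M.
static void runLowerAtomicOnModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  // Wrap the function pass so the module pass manager can run it per function.
  MPM.addPass(createModuleToFunctionPassAdaptor(LowerAtomicPass()));
  MPM.run(M, MAM);
}

Legacy-pass-manager clients instead go through createLowerAtomicPass(), shown at the end of the file.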
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 4ca96ec1f6..bb30c48127 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -1,181 +1,181 @@
-//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
-// and provides constant propagation and basic CFG cleanup on the result.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "lower-is-constant-intrinsic"
-
-STATISTIC(IsConstantIntrinsicsHandled,
- "Number of 'is.constant' intrinsic calls handled");
-STATISTIC(ObjectSizeIntrinsicsHandled,
- "Number of 'objectsize' intrinsic calls handled");
-
-static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
+//===- LowerConstantIntrinsics.cpp - Lower constant intrinsic calls -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers all remaining 'objectsize' and 'is.constant' intrinsic calls
+// and provides constant propagation and basic CFG cleanup on the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "lower-is-constant-intrinsic"
+
+STATISTIC(IsConstantIntrinsicsHandled,
+ "Number of 'is.constant' intrinsic calls handled");
+STATISTIC(ObjectSizeIntrinsicsHandled,
+ "Number of 'objectsize' intrinsic calls handled");
+
+static Value *lowerIsConstantIntrinsic(IntrinsicInst *II) {
if (auto *C = dyn_cast<Constant>(II->getOperand(0)))
if (C->isManifestConstant())
return ConstantInt::getTrue(II->getType());
return ConstantInt::getFalse(II->getType());
-}
-
-static bool replaceConditionalBranchesOnConstant(Instruction *II,
- Value *NewValue) {
- bool HasDeadBlocks = false;
- SmallSetVector<Instruction *, 8> Worklist;
- replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
- &Worklist);
- for (auto I : Worklist) {
- BranchInst *BI = dyn_cast<BranchInst>(I);
- if (!BI)
- continue;
- if (BI->isUnconditional())
- continue;
-
- BasicBlock *Target, *Other;
- if (match(BI->getOperand(0), m_Zero())) {
- Target = BI->getSuccessor(1);
- Other = BI->getSuccessor(0);
- } else if (match(BI->getOperand(0), m_One())) {
- Target = BI->getSuccessor(0);
- Other = BI->getSuccessor(1);
- } else {
- Target = nullptr;
- Other = nullptr;
- }
- if (Target && Target != Other) {
- BasicBlock *Source = BI->getParent();
- Other->removePredecessor(Source);
- BI->eraseFromParent();
- BranchInst::Create(Target, Source);
+}
+
+static bool replaceConditionalBranchesOnConstant(Instruction *II,
+ Value *NewValue) {
+ bool HasDeadBlocks = false;
+ SmallSetVector<Instruction *, 8> Worklist;
+ replaceAndRecursivelySimplify(II, NewValue, nullptr, nullptr, nullptr,
+ &Worklist);
+ for (auto I : Worklist) {
+ BranchInst *BI = dyn_cast<BranchInst>(I);
+ if (!BI)
+ continue;
+ if (BI->isUnconditional())
+ continue;
+
+ BasicBlock *Target, *Other;
+ if (match(BI->getOperand(0), m_Zero())) {
+ Target = BI->getSuccessor(1);
+ Other = BI->getSuccessor(0);
+ } else if (match(BI->getOperand(0), m_One())) {
+ Target = BI->getSuccessor(0);
+ Other = BI->getSuccessor(1);
+ } else {
+ Target = nullptr;
+ Other = nullptr;
+ }
+ if (Target && Target != Other) {
+ BasicBlock *Source = BI->getParent();
+ Other->removePredecessor(Source);
+ BI->eraseFromParent();
+ BranchInst::Create(Target, Source);
if (pred_empty(Other))
- HasDeadBlocks = true;
- }
- }
- return HasDeadBlocks;
-}
-
-static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
- bool HasDeadBlocks = false;
- const auto &DL = F.getParent()->getDataLayout();
- SmallVector<WeakTrackingVH, 8> Worklist;
-
- ReversePostOrderTraversal<Function *> RPOT(&F);
- for (BasicBlock *BB : RPOT) {
- for (Instruction &I: *BB) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (!II)
- continue;
- switch (II->getIntrinsicID()) {
- default:
- break;
- case Intrinsic::is_constant:
- case Intrinsic::objectsize:
- Worklist.push_back(WeakTrackingVH(&I));
- break;
- }
- }
- }
- for (WeakTrackingVH &VH: Worklist) {
- // Items on the worklist can be mutated by earlier recursive replaces.
- // This can remove the intrinsic as dead (VH == null), but also replace
- // the intrinsic in place.
- if (!VH)
- continue;
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
- if (!II)
- continue;
- Value *NewValue;
- switch (II->getIntrinsicID()) {
- default:
- continue;
- case Intrinsic::is_constant:
- NewValue = lowerIsConstantIntrinsic(II);
- IsConstantIntrinsicsHandled++;
- break;
- case Intrinsic::objectsize:
- NewValue = lowerObjectSizeCall(II, DL, TLI, true);
- ObjectSizeIntrinsicsHandled++;
- break;
- }
- HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
- }
- if (HasDeadBlocks)
- removeUnreachableBlocks(F);
- return !Worklist.empty();
-}
-
-PreservedAnalyses
-LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (lowerConstantIntrinsics(F,
- AM.getCachedResult<TargetLibraryAnalysis>(F))) {
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-/// Legacy pass for lowering is.constant intrinsics out of the IR.
-///
-/// When this pass is run over a function it converts is.constant intrinsics
-/// into 'true' or 'false'. This complements the normal constant folding
-/// to 'true' as part of Instruction Simplify passes.
-class LowerConstantIntrinsics : public FunctionPass {
-public:
- static char ID;
- LowerConstantIntrinsics() : FunctionPass(ID) {
- initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- return lowerConstantIntrinsics(F, TLI);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-} // namespace
-
-char LowerConstantIntrinsics::ID = 0;
-INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
- "Lower constant intrinsics", false, false)
-
-FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
- return new LowerConstantIntrinsics();
-}
+ HasDeadBlocks = true;
+ }
+ }
+ return HasDeadBlocks;
+}
+
+static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
+ bool HasDeadBlocks = false;
+ const auto &DL = F.getParent()->getDataLayout();
+ SmallVector<WeakTrackingVH, 8> Worklist;
+
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ for (BasicBlock *BB : RPOT) {
+ for (Instruction &I: *BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::is_constant:
+ case Intrinsic::objectsize:
+ Worklist.push_back(WeakTrackingVH(&I));
+ break;
+ }
+ }
+ }
+ for (WeakTrackingVH &VH: Worklist) {
+ // Items on the worklist can be mutated by earlier recursive replaces.
+ // This can remove the intrinsic as dead (VH == null), but also replace
+ // the intrinsic in place.
+ if (!VH)
+ continue;
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&*VH);
+ if (!II)
+ continue;
+ Value *NewValue;
+ switch (II->getIntrinsicID()) {
+ default:
+ continue;
+ case Intrinsic::is_constant:
+ NewValue = lowerIsConstantIntrinsic(II);
+ IsConstantIntrinsicsHandled++;
+ break;
+ case Intrinsic::objectsize:
+ NewValue = lowerObjectSizeCall(II, DL, TLI, true);
+ ObjectSizeIntrinsicsHandled++;
+ break;
+ }
+ HasDeadBlocks |= replaceConditionalBranchesOnConstant(II, NewValue);
+ }
+ if (HasDeadBlocks)
+ removeUnreachableBlocks(F);
+ return !Worklist.empty();
+}
+
+PreservedAnalyses
+LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ if (lowerConstantIntrinsics(F,
+ AM.getCachedResult<TargetLibraryAnalysis>(F))) {
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering is.constant intrinsics out of the IR.
+///
+/// When this pass is run over a function it converts is.constant intrinsics
+/// into 'true' or 'false'. This complements the normal constant folding
+/// to 'true' as part of Instruction Simplify passes.
+class LowerConstantIntrinsics : public FunctionPass {
+public:
+ static char ID;
+ LowerConstantIntrinsics() : FunctionPass(ID) {
+ initializeLowerConstantIntrinsicsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ return lowerConstantIntrinsics(F, TLI);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+} // namespace
+
+char LowerConstantIntrinsics::ID = 0;
+INITIALIZE_PASS(LowerConstantIntrinsics, "lower-constant-intrinsics",
+ "Lower constant intrinsics", false, false)
+
+FunctionPass *llvm::createLowerConstantIntrinsicsPass() {
+ return new LowerConstantIntrinsics();
+}
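For completeness, the factory createLowerConstantIntrinsicsPass() defined above is what legacy-pass-manager clients consume. A hypothetical usage sketch follows; the helper name lowerConstantIntrinsicsLegacy is not part of this tree.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

using namespace llvm;

// Illustrative legacy-PM driver: folds llvm.is.constant and llvm.objectsize
// in every defined function of M using the wrapper pass above.
static void lowerConstantIntrinsicsLegacy(Module &M) {
  legacy::FunctionPassManager FPM(&M);
  FPM.add(createLowerConstantIntrinsicsPass());
  FPM.doInitialization();
  for (Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F);
  FPM.doFinalization();
}

New-pass-manager users reach the same lowering through LowerConstantIntrinsicsPass::run, shown earlier in the file.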
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 7911d1db70..da13075dfe 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -1,420 +1,420 @@
-//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the 'expect' intrinsic to LLVM metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "lower-expect-intrinsic"
-
-STATISTIC(ExpectIntrinsicsHandled,
- "Number of 'expect' intrinsic instructions handled");
-
-// These default values are chosen to represent an extremely skewed outcome for
-// a condition, but they leave some room for interpretation by later passes.
-//
-// If the documentation for __builtin_expect() made it explicit that it should
-// only be used in extreme cases, we could make this ratio higher. As it stands,
-// programmers may be using __builtin_expect() / llvm.expect to annotate that a
-// branch is likely or unlikely to be taken.
-//
-// There is a known dependency on this ratio in CodeGenPrepare when transforming
-// 'select' instructions. It may be worthwhile to hoist these values to some
-// shared space, so they can be used directly by other passes.
-
+//===- LowerExpectIntrinsic.cpp - Lower expect intrinsic ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the 'expect' intrinsic to LLVM metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerExpectIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-expect-intrinsic"
+
+STATISTIC(ExpectIntrinsicsHandled,
+ "Number of 'expect' intrinsic instructions handled");
+
+// These default values are chosen to represent an extremely skewed outcome for
+// a condition, but they leave some room for interpretation by later passes.
+//
+// If the documentation for __builtin_expect() made it explicit that it should
+// only be used in extreme cases, we could make this ratio higher. As it stands,
+// programmers may be using __builtin_expect() / llvm.expect to annotate that a
+// branch is likely or unlikely to be taken.
+//
+// There is a known dependency on this ratio in CodeGenPrepare when transforming
+// 'select' instructions. It may be worthwhile to hoist these values to some
+// shared space, so they can be used directly by other passes.
+
cl::opt<uint32_t> llvm::LikelyBranchWeight(
- "likely-branch-weight", cl::Hidden, cl::init(2000),
- cl::desc("Weight of the branch likely to be taken (default = 2000)"));
+ "likely-branch-weight", cl::Hidden, cl::init(2000),
+ cl::desc("Weight of the branch likely to be taken (default = 2000)"));
cl::opt<uint32_t> llvm::UnlikelyBranchWeight(
- "unlikely-branch-weight", cl::Hidden, cl::init(1),
- cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
-
-static std::tuple<uint32_t, uint32_t>
-getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
- if (IntrinsicID == Intrinsic::expect) {
- // __builtin_expect
- return std::make_tuple(LikelyBranchWeight.getValue(),
- UnlikelyBranchWeight.getValue());
- } else {
- // __builtin_expect_with_probability
- assert(CI->getNumOperands() >= 3 &&
- "expect with probability must have 3 arguments");
- ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
- double TrueProb = Confidence->getValueAPF().convertToDouble();
- assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
- "probability value must be in the range [0.0, 1.0]");
- double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
- uint32_t LikelyBW = ceil((TrueProb * (double)(INT32_MAX - 1)) + 1.0);
- uint32_t UnlikelyBW = ceil((FalseProb * (double)(INT32_MAX - 1)) + 1.0);
- return std::make_tuple(LikelyBW, UnlikelyBW);
- }
-}
-
-static bool handleSwitchExpect(SwitchInst &SI) {
- CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
- if (!CI)
- return false;
-
- Function *Fn = CI->getCalledFunction();
- if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
- Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
- return false;
-
- Value *ArgValue = CI->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!ExpectedValue)
- return false;
-
- SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
- unsigned n = SI.getNumCases(); // +1 for default case.
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
- getBranchWeight(Fn->getIntrinsicID(), CI, n + 1);
-
- SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeightVal);
-
- uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
- Weights[Index] = LikelyBranchWeightVal;
-
- SI.setCondition(ArgValue);
-
- SI.setMetadata(LLVMContext::MD_prof,
- MDBuilder(CI->getContext()).createBranchWeights(Weights));
-
- return true;
-}
-
-/// Handler for PHINodes that define the value argument to an
-/// @llvm.expect call.
-///
-/// If the operand of the phi has a constant value and it 'contradicts'
-/// the expected value of the phi def, then the corresponding incoming
-/// edge of the phi is unlikely to be taken. Using that information,
-/// the branch probability info for the originating branch can be inferred.
-static void handlePhiDef(CallInst *Expect) {
- Value &Arg = *Expect->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
- if (!ExpectedValue)
- return;
- const APInt &ExpectedPhiValue = ExpectedValue->getValue();
-
- // Walk backward up the list of instructions that
- // have 'copy' semantics by 'stripping' the copies
- // until a PHI node or an instruction of unknown kind
- // is reached. Negation via xor is also handled.
- //
- // C = PHI(...);
- // B = C;
- // A = B;
- // D = __builtin_expect(A, 0);
- //
- Value *V = &Arg;
- SmallVector<Instruction *, 4> Operations;
- while (!isa<PHINode>(V)) {
- if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
- V = ZExt->getOperand(0);
- Operations.push_back(ZExt);
- continue;
- }
-
- if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
- V = SExt->getOperand(0);
- Operations.push_back(SExt);
- continue;
- }
-
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
- if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
- return;
-
- ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (!CInt)
- return;
-
- V = BinOp->getOperand(0);
- Operations.push_back(BinOp);
- }
-
- // Executes the recorded operations on input 'Value'.
- auto ApplyOperations = [&](const APInt &Value) {
- APInt Result = Value;
- for (auto Op : llvm::reverse(Operations)) {
- switch (Op->getOpcode()) {
- case Instruction::Xor:
- Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
- break;
- case Instruction::ZExt:
- Result = Result.zext(Op->getType()->getIntegerBitWidth());
- break;
- case Instruction::SExt:
- Result = Result.sext(Op->getType()->getIntegerBitWidth());
- break;
- default:
- llvm_unreachable("Unexpected operation");
- }
- }
- return Result;
- };
-
- auto *PhiDef = cast<PHINode>(V);
-
- // Get the first dominating conditional branch of the operand
- // i's incoming block.
- auto GetDomConditional = [&](unsigned i) -> BranchInst * {
- BasicBlock *BB = PhiDef->getIncomingBlock(i);
- BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (BI && BI->isConditional())
- return BI;
- BB = BB->getSinglePredecessor();
- if (!BB)
- return nullptr;
- BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || BI->isUnconditional())
- return nullptr;
- return BI;
- };
-
- // Now walk through all Phi operands to find the operands with values
- // conflicting with the expected phi output value. Any such operand
- // indicates the incoming edge to that operand is unlikely.
- for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
-
- Value *PhiOpnd = PhiDef->getIncomingValue(i);
- ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
- if (!CI)
- continue;
-
- // Not an interesting case -- we cannot infer anything useful when the
- // operand value matches the expected phi output; skip this incoming
- // value.
- if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
- continue;
-
- BranchInst *BI = GetDomConditional(i);
- if (!BI)
- continue;
-
- MDBuilder MDB(PhiDef->getContext());
-
- // There are two situations in which an operand of the PhiDef comes
- // from a given successor of a branch instruction BI.
- // 1) When the incoming block of the operand is the successor block;
- // 2) When the incoming block is BI's enclosing block and the
- // successor is the PhiDef's enclosing block.
- //
- // Returns true if the operand which comes from OpndIncomingBB
- // comes from outgoing edge of BI that leads to Succ block.
- auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
- auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
- if (OpndIncomingBB == Succ)
- // If this successor is the incoming block for this
- // Phi operand, then this successor does lead to the Phi.
- return true;
- if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
- // Otherwise, if the edge is directly from the branch
- // to the Phi, this successor is the one feeding this
- // Phi operand.
- return true;
- return false;
- };
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(
- Expect->getCalledFunction()->getIntrinsicID(), Expect, 2);
-
- if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
- BI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(LikelyBranchWeightVal,
- UnlikelyBranchWeightVal));
- else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
- BI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(UnlikelyBranchWeightVal,
- LikelyBranchWeightVal));
- }
-}
-
-// Handle both BranchInst and SelectInst.
-template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
-
- // Handle non-optimized IR code like:
- // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
- // %tobool = icmp ne i64 %expval, 0
- // br i1 %tobool, label %if.then, label %if.end
- //
- // Or the following simpler case:
- // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
- // br i1 %expval, label %if.then, label %if.end
-
- CallInst *CI;
-
- ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
- CmpInst::Predicate Predicate;
- ConstantInt *CmpConstOperand = nullptr;
- if (!CmpI) {
- CI = dyn_cast<CallInst>(BSI.getCondition());
- Predicate = CmpInst::ICMP_NE;
- } else {
- Predicate = CmpI->getPredicate();
- if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
- return false;
-
- CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
- if (!CmpConstOperand)
- return false;
- CI = dyn_cast<CallInst>(CmpI->getOperand(0));
- }
-
- if (!CI)
- return false;
-
- uint64_t ValueComparedTo = 0;
- if (CmpConstOperand) {
- if (CmpConstOperand->getBitWidth() > 64)
- return false;
- ValueComparedTo = CmpConstOperand->getZExtValue();
- }
-
- Function *Fn = CI->getCalledFunction();
- if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
- Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
- return false;
-
- Value *ArgValue = CI->getArgOperand(0);
- ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!ExpectedValue)
- return false;
-
- MDBuilder MDB(CI->getContext());
- MDNode *Node;
-
- uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
- std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
- getBranchWeight(Fn->getIntrinsicID(), CI, 2);
-
- if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
- (Predicate == CmpInst::ICMP_EQ)) {
- Node =
- MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
- } else {
- Node =
- MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
- }
-
- if (CmpI)
- CmpI->setOperand(0, ArgValue);
- else
- BSI.setCondition(ArgValue);
-
- BSI.setMetadata(LLVMContext::MD_prof, Node);
-
- return true;
-}
-
-static bool handleBranchExpect(BranchInst &BI) {
- if (BI.isUnconditional())
- return false;
-
- return handleBrSelExpect<BranchInst>(BI);
-}
-
-static bool lowerExpectIntrinsic(Function &F) {
- bool Changed = false;
-
- for (BasicBlock &BB : F) {
- // Create "block_weights" metadata.
- if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
- if (handleBranchExpect(*BI))
- ExpectIntrinsicsHandled++;
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
- if (handleSwitchExpect(*SI))
- ExpectIntrinsicsHandled++;
- }
-
- // Remove llvm.expect intrinsics. Iterate backwards in order
- // to process select instructions before the intrinsic gets
- // removed.
- for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
- Instruction *Inst = &*BI++;
- CallInst *CI = dyn_cast<CallInst>(Inst);
- if (!CI) {
- if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
- if (handleBrSelExpect(*SI))
- ExpectIntrinsicsHandled++;
- }
- continue;
- }
-
- Function *Fn = CI->getCalledFunction();
- if (Fn && (Fn->getIntrinsicID() == Intrinsic::expect ||
- Fn->getIntrinsicID() == Intrinsic::expect_with_probability)) {
- // Before erasing the llvm.expect, walk backward to find the
- // phi that defines llvm.expect's first arg, and
- // infer branch probability:
- handlePhiDef(CI);
- Value *Exp = CI->getArgOperand(0);
- CI->replaceAllUsesWith(Exp);
- CI->eraseFromParent();
- Changed = true;
- }
- }
- }
-
- return Changed;
-}
-
-PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
- FunctionAnalysisManager &) {
- if (lowerExpectIntrinsic(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-/// Legacy pass for lowering expect intrinsics out of the IR.
-///
-/// When this pass is run over a function it uses expect intrinsics which feed
-/// branches and switches to provide branch weight metadata for those
-/// terminators. It then removes the expect intrinsics from the IR so the rest
-/// of the optimizer can ignore them.
-class LowerExpectIntrinsic : public FunctionPass {
-public:
- static char ID;
- LowerExpectIntrinsic() : FunctionPass(ID) {
- initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
-};
-}
-
-char LowerExpectIntrinsic::ID = 0;
-INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
- "Lower 'expect' Intrinsics", false, false)
-
-FunctionPass *llvm::createLowerExpectIntrinsicPass() {
- return new LowerExpectIntrinsic();
-}
+ "unlikely-branch-weight", cl::Hidden, cl::init(1),
+ cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+
+static std::tuple<uint32_t, uint32_t>
+getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
+ if (IntrinsicID == Intrinsic::expect) {
+ // __builtin_expect
+ return std::make_tuple(LikelyBranchWeight.getValue(),
+ UnlikelyBranchWeight.getValue());
+ } else {
+ // __builtin_expect_with_probability
+ assert(CI->getNumOperands() >= 3 &&
+ "expect with probability must have 3 arguments");
+ ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
+ double TrueProb = Confidence->getValueAPF().convertToDouble();
+ assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
+ "probability value must be in the range [0.0, 1.0]");
+ double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
+ uint32_t LikelyBW = ceil((TrueProb * (double)(INT32_MAX - 1)) + 1.0);
+ uint32_t UnlikelyBW = ceil((FalseProb * (double)(INT32_MAX - 1)) + 1.0);
+ return std::make_tuple(LikelyBW, UnlikelyBW);
+ }
+}
+
+static bool handleSwitchExpect(SwitchInst &SI) {
+ CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
+ if (!CI)
+ return false;
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
+ unsigned n = SI.getNumCases(); // +1 for default case.
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, n + 1);
+
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeightVal);
+
+ uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
+ Weights[Index] = LikelyBranchWeightVal;
+
+ SI.setCondition(ArgValue);
+
+ SI.setMetadata(LLVMContext::MD_prof,
+ MDBuilder(CI->getContext()).createBranchWeights(Weights));
+
+ return true;
+}
+
+/// Handler for PHINodes that define the value argument to an
+/// @llvm.expect call.
+///
+/// If the operand of the phi has a constant value and it 'contradicts'
+/// the expected value of the phi def, then the corresponding incoming
+/// edge of the phi is unlikely to be taken. Using that information,
+/// the branch probability info for the originating branch can be inferred.
+static void handlePhiDef(CallInst *Expect) {
+ Value &Arg = *Expect->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(Expect->getArgOperand(1));
+ if (!ExpectedValue)
+ return;
+ const APInt &ExpectedPhiValue = ExpectedValue->getValue();
+
+ // Walk backward up the list of instructions that
+ // have 'copy' semantics by 'stripping' the copies
+ // until a PHI node or an instruction of unknown kind
+ // is reached. Negation via xor is also handled.
+ //
+ // C = PHI(...);
+ // B = C;
+ // A = B;
+ // D = __builtin_expect(A, 0);
+ //
+ Value *V = &Arg;
+ SmallVector<Instruction *, 4> Operations;
+ while (!isa<PHINode>(V)) {
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(V)) {
+ V = ZExt->getOperand(0);
+ Operations.push_back(ZExt);
+ continue;
+ }
+
+ if (SExtInst *SExt = dyn_cast<SExtInst>(V)) {
+ V = SExt->getOperand(0);
+ Operations.push_back(SExt);
+ continue;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
+ if (!BinOp || BinOp->getOpcode() != Instruction::Xor)
+ return;
+
+ ConstantInt *CInt = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (!CInt)
+ return;
+
+ V = BinOp->getOperand(0);
+ Operations.push_back(BinOp);
+ }
+
+ // Executes the recorded operations on input 'Value'.
+ auto ApplyOperations = [&](const APInt &Value) {
+ APInt Result = Value;
+ for (auto Op : llvm::reverse(Operations)) {
+ switch (Op->getOpcode()) {
+ case Instruction::Xor:
+ Result ^= cast<ConstantInt>(Op->getOperand(1))->getValue();
+ break;
+ case Instruction::ZExt:
+ Result = Result.zext(Op->getType()->getIntegerBitWidth());
+ break;
+ case Instruction::SExt:
+ Result = Result.sext(Op->getType()->getIntegerBitWidth());
+ break;
+ default:
+ llvm_unreachable("Unexpected operation");
+ }
+ }
+ return Result;
+ };
+
+ auto *PhiDef = cast<PHINode>(V);
+
+ // Get the first dominating conditional branch of the operand
+ // i's incoming block.
+ auto GetDomConditional = [&](unsigned i) -> BranchInst * {
+ BasicBlock *BB = PhiDef->getIncomingBlock(i);
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (BI && BI->isConditional())
+ return BI;
+ BB = BB->getSinglePredecessor();
+ if (!BB)
+ return nullptr;
+ BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return nullptr;
+ return BI;
+ };
+
+ // Now walk through all Phi operands to find the operands with values
+ // conflicting with the expected phi output value. Any such operand
+ // indicates the incoming edge to that operand is unlikely.
+ for (unsigned i = 0, e = PhiDef->getNumIncomingValues(); i != e; ++i) {
+
+ Value *PhiOpnd = PhiDef->getIncomingValue(i);
+ ConstantInt *CI = dyn_cast<ConstantInt>(PhiOpnd);
+ if (!CI)
+ continue;
+
+ // Not an interesting case -- we cannot infer anything useful when the
+ // operand value matches the expected phi output; skip this incoming
+ // value.
+ if (ExpectedPhiValue == ApplyOperations(CI->getValue()))
+ continue;
+
+ BranchInst *BI = GetDomConditional(i);
+ if (!BI)
+ continue;
+
+ MDBuilder MDB(PhiDef->getContext());
+
+ // There are two situations in which an operand of the PhiDef comes
+ // from a given successor of a branch instruction BI.
+ // 1) When the incoming block of the operand is the successor block;
+ // 2) When the incoming block is BI's enclosing block and the
+ // successor is the PhiDef's enclosing block.
+ //
+ // Returns true if the operand which comes from OpndIncomingBB
+ // comes from outgoing edge of BI that leads to Succ block.
+ auto *OpndIncomingBB = PhiDef->getIncomingBlock(i);
+ auto IsOpndComingFromSuccessor = [&](BasicBlock *Succ) {
+ if (OpndIncomingBB == Succ)
+ // If this successor is the incoming block for this
+ // Phi operand, then this successor does lead to the Phi.
+ return true;
+ if (OpndIncomingBB == BI->getParent() && Succ == PhiDef->getParent())
+ // Otherwise, if the edge is directly from the branch
+ // to the Phi, this successor is the one feeding this
+ // Phi operand.
+ return true;
+ return false;
+ };
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(
+ Expect->getCalledFunction()->getIntrinsicID(), Expect, 2);
+
+ if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeightVal,
+ UnlikelyBranchWeightVal));
+ else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeightVal,
+ LikelyBranchWeightVal));
+ }
+}
+
+// Handle both BranchInst and SelectInst.
+template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
+
+ // Handle non-optimized IR code like:
+ // %expval = call i64 @llvm.expect.i64(i64 %conv1, i64 1)
+ // %tobool = icmp ne i64 %expval, 0
+ // br i1 %tobool, label %if.then, label %if.end
+ //
+ // Or the following simpler case:
+ // %expval = call i1 @llvm.expect.i1(i1 %cmp, i1 1)
+ // br i1 %expval, label %if.then, label %if.end
+
+ CallInst *CI;
+
+ ICmpInst *CmpI = dyn_cast<ICmpInst>(BSI.getCondition());
+ CmpInst::Predicate Predicate;
+ ConstantInt *CmpConstOperand = nullptr;
+ if (!CmpI) {
+ CI = dyn_cast<CallInst>(BSI.getCondition());
+ Predicate = CmpInst::ICMP_NE;
+ } else {
+ Predicate = CmpI->getPredicate();
+ if (Predicate != CmpInst::ICMP_NE && Predicate != CmpInst::ICMP_EQ)
+ return false;
+
+ CmpConstOperand = dyn_cast<ConstantInt>(CmpI->getOperand(1));
+ if (!CmpConstOperand)
+ return false;
+ CI = dyn_cast<CallInst>(CmpI->getOperand(0));
+ }
+
+ if (!CI)
+ return false;
+
+ uint64_t ValueComparedTo = 0;
+ if (CmpConstOperand) {
+ if (CmpConstOperand->getBitWidth() > 64)
+ return false;
+ ValueComparedTo = CmpConstOperand->getZExtValue();
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
+ return false;
+
+ Value *ArgValue = CI->getArgOperand(0);
+ ConstantInt *ExpectedValue = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!ExpectedValue)
+ return false;
+
+ MDBuilder MDB(CI->getContext());
+ MDNode *Node;
+
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, 2);
+
+ if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
+ (Predicate == CmpInst::ICMP_EQ)) {
+ Node =
+ MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
+ } else {
+ Node =
+ MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
+ }
+
+ if (CmpI)
+ CmpI->setOperand(0, ArgValue);
+ else
+ BSI.setCondition(ArgValue);
+
+ BSI.setMetadata(LLVMContext::MD_prof, Node);
+
+ return true;
+}
+
+static bool handleBranchExpect(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return false;
+
+ return handleBrSelExpect<BranchInst>(BI);
+}
+
+static bool lowerExpectIntrinsic(Function &F) {
+ bool Changed = false;
+
+ for (BasicBlock &BB : F) {
+ // Create "block_weights" metadata.
+ if (BranchInst *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (handleBranchExpect(*BI))
+ ExpectIntrinsicsHandled++;
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB.getTerminator())) {
+ if (handleSwitchExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+
+ // Remove llvm.expect intrinsics. Iterate backwards in order
+ // to process select instructions before the intrinsic gets
+ // removed.
+ for (auto BI = BB.rbegin(), BE = BB.rend(); BI != BE;) {
+ Instruction *Inst = &*BI++;
+ CallInst *CI = dyn_cast<CallInst>(Inst);
+ if (!CI) {
+ if (SelectInst *SI = dyn_cast<SelectInst>(Inst)) {
+ if (handleBrSelExpect(*SI))
+ ExpectIntrinsicsHandled++;
+ }
+ continue;
+ }
+
+ Function *Fn = CI->getCalledFunction();
+ if (Fn && (Fn->getIntrinsicID() == Intrinsic::expect ||
+ Fn->getIntrinsicID() == Intrinsic::expect_with_probability)) {
+ // Before erasing the llvm.expect, walk backward to find the
+ // phi that defines llvm.expect's first arg, and
+ // infer branch probability:
+ handlePhiDef(CI);
+ Value *Exp = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(Exp);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses LowerExpectIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (lowerExpectIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+/// Legacy pass for lowering expect intrinsics out of the IR.
+///
+/// When this pass is run over a function it uses expect intrinsics which feed
+/// branches and switches to provide branch weight metadata for those
+/// terminators. It then removes the expect intrinsics from the IR so the rest
+/// of the optimizer can ignore them.
+class LowerExpectIntrinsic : public FunctionPass {
+public:
+ static char ID;
+ LowerExpectIntrinsic() : FunctionPass(ID) {
+ initializeLowerExpectIntrinsicPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override { return lowerExpectIntrinsic(F); }
+};
+}
+
+char LowerExpectIntrinsic::ID = 0;
+INITIALIZE_PASS(LowerExpectIntrinsic, "lower-expect",
+ "Lower 'expect' Intrinsics", false, false)
+
+FunctionPass *llvm::createLowerExpectIntrinsicPass() {
+ return new LowerExpectIntrinsic();
+}
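The weight selection in getBranchWeight above has two modes: plain llvm.expect uses the likely-branch-weight / unlikely-branch-weight options (2000 and 1 by default), while llvm.expect.with.probability derives the weights from the supplied probability. Below is a standalone restatement of that arithmetic with a worked example; the helper name weightsForProbability is illustrative only.

#include <cmath>
#include <cstdint>
#include <utility>

// For a two-way branch with TrueProb = 0.9, this yields roughly
// LikelyBW = 1932735283 and UnlikelyBW = 214748366, i.e. about a 9:1 ratio
// scaled into the 32-bit branch-weight range.
static std::pair<uint32_t, uint32_t> weightsForProbability(double TrueProb,
                                                           int BranchCount) {
  double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
  uint32_t LikelyBW =
      (uint32_t)std::ceil(TrueProb * (double)(INT32_MAX - 1) + 1.0);
  uint32_t UnlikelyBW =
      (uint32_t)std::ceil(FalseProb * (double)(INT32_MAX - 1) + 1.0);
  return {LikelyBW, UnlikelyBW};
}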
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
index a431205777..45f5929e3b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp
@@ -1,90 +1,90 @@
-//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
-// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
-// be widened.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
- static char ID;
- LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
- initializeLowerGuardIntrinsicLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static bool lowerGuardIntrinsic(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (!GuardDecl || GuardDecl->use_empty())
- return false;
-
- SmallVector<CallInst *, 8> ToLower;
- for (auto &I : instructions(F))
- if (isGuard(&I))
- ToLower.push_back(cast<CallInst>(&I));
-
- if (ToLower.empty())
- return false;
-
- auto *DeoptIntrinsic = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
- DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
-
- for (auto *CI : ToLower) {
- makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false);
- CI->eraseFromParent();
- }
-
- return true;
-}
-
-bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
- return lowerGuardIntrinsic(F);
-}
-
-char LowerGuardIntrinsicLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
- "Lower the guard intrinsic to normal control flow", false,
- false)
-
-Pass *llvm::createLowerGuardIntrinsicPass() {
- return new LowerGuardIntrinsicLegacyPass();
-}
-
-PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (lowerGuardIntrinsic(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
+//===- LowerGuardIntrinsic.cpp - Lower the guard intrinsic ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.experimental.guard intrinsic to a conditional call
+// to @llvm.experimental.deoptimize. Once this happens, the guard can no longer
+// be widened.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct LowerGuardIntrinsicLegacyPass : public FunctionPass {
+ static char ID;
+ LowerGuardIntrinsicLegacyPass() : FunctionPass(ID) {
+ initializeLowerGuardIntrinsicLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static bool lowerGuardIntrinsic(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (isGuard(&I))
+ ToLower.push_back(cast<CallInst>(&I));
+
+ if (ToLower.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *CI : ToLower) {
+ makeGuardControlFlowExplicit(DeoptIntrinsic, CI, false);
+ CI->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool LowerGuardIntrinsicLegacyPass::runOnFunction(Function &F) {
+ return lowerGuardIntrinsic(F);
+}
+
+char LowerGuardIntrinsicLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerGuardIntrinsicLegacyPass, "lower-guard-intrinsic",
+ "Lower the guard intrinsic to normal control flow", false,
+ false)
+
+Pass *llvm::createLowerGuardIntrinsicPass() {
+ return new LowerGuardIntrinsicLegacyPass();
+}
+
+PreservedAnalyses LowerGuardIntrinsicPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerGuardIntrinsic(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
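LowerGuardIntrinsicPass::run above never queries the analysis manager it is handed, so it can also be invoked directly on a single function, mirroring the DummyFAM pattern used by LowerAtomicLegacyPass earlier in this diff. A hypothetical convenience helper in that spirit; the name lowerGuardsIn is illustrative, not part of this tree.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Scalar/LowerGuardIntrinsic.h"

using namespace llvm;

// Illustrative helper: lower all llvm.experimental.guard calls in F to
// explicit branches to llvm.experimental.deoptimize.
static bool lowerGuardsIn(Function &F) {
  // Safe only because LowerGuardIntrinsicPass::run does not use the
  // analysis manager argument.
  FunctionAnalysisManager DummyFAM;
  auto PA = LowerGuardIntrinsicPass().run(F, DummyFAM);
  return !PA.areAllPreserved();
}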
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 78e926254e..8e251ca940 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1,1214 +1,1214 @@
-//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Lower matrix intrinsics to vector operations.
-//
-// TODO:
-// * Improve fusion:
-// * Support more cases, e.g. multiply-add, multiply-sub, operands/results
-// transposed.
-// * Improve cost-modeling, e.g. choose a different number of rows/columns
-// for tiles, consider cost of copies on alias.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Alignment.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+//===- LowerMatrixIntrinsics.cpp - Lower matrix intrinsics -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower matrix intrinsics to vector operations.
+//
+// TODO:
+// * Improve fusion:
+// * Support more cases, e.g. multiply-add, multiply-sub, operands/results
+// transposed.
+// * Improve cost-modeling, e.g. choose a different number of rows/columns
+// for tiles, consider cost of copies on alias.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerMatrixIntrinsics.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/MatrixUtils.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "lower-matrix-intrinsics"
-
-static cl::opt<bool> EnableShapePropagation(
- "matrix-propagate-shape", cl::init(true), cl::Hidden,
- cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
- "instructions."));
-
-static cl::opt<bool>
- FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
- cl::desc("Enable/disable fusing matrix instructions."));
-// TODO: Allow and use non-square tiles.
-static cl::opt<unsigned> TileSize(
- "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
- cl::desc(
- "Tile size for matrix instruction fusion using square-shaped tiles."));
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "lower-matrix-intrinsics"
+
+static cl::opt<bool> EnableShapePropagation(
+ "matrix-propagate-shape", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
+ "instructions."));
+
+static cl::opt<bool>
+ FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable fusing matrix instructions."));
+// TODO: Allow and use non-square tiles.
+static cl::opt<unsigned> TileSize(
+ "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
+ cl::desc(
+ "Tile size for matrix instruction fusion using square-shaped tiles."));
static cl::opt<bool> TileUseLoops("fuse-matrix-use-loops", cl::init(false),
cl::Hidden,
cl::desc("Generate loop nest for tiling."));
-static cl::opt<bool> ForceFusion(
- "force-fuse-matrix", cl::init(false), cl::Hidden,
- cl::desc("Force matrix instruction fusion even if not profitable."));
-static cl::opt<bool> AllowContractEnabled(
- "matrix-allow-contract", cl::init(false), cl::Hidden,
- cl::desc("Allow the use of FMAs if available and profitable. This may "
- "result in different results, due to less rounding error."));
-
-enum class MatrixLayoutTy { ColumnMajor, RowMajor };
-
-static cl::opt<MatrixLayoutTy> MatrixLayout(
- "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
- cl::desc("Sets the default matrix layout"),
- cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
- "Use column-major layout"),
- clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
- "Use row-major layout")));
-
-/// Helper function to either return Scope, if it is a subprogram, or the
-/// attached subprogram for a local scope.
-static DISubprogram *getSubprogram(DIScope *Scope) {
- if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
- return Subprogram;
- return cast<DILocalScope>(Scope)->getSubprogram();
-}
-
-namespace {
-
-// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
-// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
-// assuming \p Stride elements between the starts of two consecutive vectors.
-// \p Stride must be >= \p NumElements.
-// For column-major matrixes, the function computes the address of a column
-// vector and \p NumElements must be set to the number of elements in a column
-// (= number of rows of the matrix). For row-major matrixes, the function
-// computes the address of a row vector and \p NumElements must be set to the
-// number of elements in a row (= number of columns of the matrix).
-//
-// Consider a 4x4 matrix in column-major layout like below
-//
-// 0 1 2 3
-// 0 v_0_0 v_0_1 v_0_2 v_0_3
-// 1 v_1_0 v_1_1 v_1_2 v_1_3
-// 2 v_2_0 v_2_1 v_2_2 v_2_3
-// 3 v_3_0 v_3_1 v_3_2 v_3_3
-
-// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
-// we need a pointer to the first element of the submatrix as base pointer.
-// Then we can use computeVectorAddr to compute the addresses for the columns
-// of the sub-matrix.
-//
-// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
-// -> just returns Base
-// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
-// -> returns Base + (1 * 4)
-// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
-// -> returns Base + (2 * 4)
-//
-// The graphic below illustrates the number of elements in a column (marked
-// with |) and the number of skipped elements (marked with {).
-//
-// v_0_0 v_0_1 {v_0_2 {v_0_3
-// Base Col 1 Col 2
-// | | |
-// v_1_0 |v_1_1 |v_1_2 |v_1_3
-// v_2_0 |v_2_1 |v_2_2 |v_2_3
-// v_3_0 {v_3_1 {v_3_2 v_3_3
-//
-Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
- unsigned NumElements, Type *EltType,
- IRBuilder<> &Builder) {
-
- assert((!isa<ConstantInt>(Stride) ||
- cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
- "Stride must be >= the number of elements in the result vector.");
- unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
-
- // Compute the start of the vector with index VecIdx as VecIdx * Stride.
- Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
-
- // Get pointer to the start of the selected vector. Skip GEP creation,
- // if we select vector 0.
- if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
- VecStart = BasePtr;
- else
- VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
-
- // Cast elementwise vector start pointer to a pointer to a vector
- // (EltType x NumElements)*.
- auto *VecType = FixedVectorType::get(EltType, NumElements);
- Type *VecPtrType = PointerType::get(VecType, AS);
- return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
-}
-
-/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
-///
-/// Currently, the lowering for each matrix intrinsic is done as follows:
-/// 1. Propagate the shape information from intrinsics to connected
-/// instructions.
-/// 2. Lower instructions with shape information (assuming column-major layout).
-/// The lowering works similarly using row-major layout.
-/// 2.1. Get column vectors for each argument. If we already lowered the
-/// definition of an argument, use the produced column vectors directly.
-///      If not, split the operand vector containing an embedded matrix into
-///      a set of column vectors.
-/// 2.2. Lower the instruction in terms of column major operations, which
-///      yields a set of column vectors containing the result matrix. Note
-///      that we lower all instructions that have shape information. Besides
-///      the intrinsics, this includes stores for example.
-/// 2.3. Update uses of the lowered instruction. If we have shape information
-/// for a user, there is nothing to do, as we will look up the result
-/// column matrix when lowering the user. For other uses, we embed the
-/// result matrix in a flat vector and update the use.
-/// 2.4. Cache the result column matrix for the instruction we lowered.
-/// 3. After we lowered all instructions in a function, remove the now
-/// obsolete instructions.
-///
-class LowerMatrixIntrinsics {
- Function &Func;
- const DataLayout &DL;
- const TargetTransformInfo &TTI;
+static cl::opt<bool> ForceFusion(
+ "force-fuse-matrix", cl::init(false), cl::Hidden,
+ cl::desc("Force matrix instruction fusion even if not profitable."));
+static cl::opt<bool> AllowContractEnabled(
+ "matrix-allow-contract", cl::init(false), cl::Hidden,
+ cl::desc("Allow the use of FMAs if available and profitable. This may "
+ "result in different results, due to less rounding error."));
+
+enum class MatrixLayoutTy { ColumnMajor, RowMajor };
+
+static cl::opt<MatrixLayoutTy> MatrixLayout(
+ "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
+ cl::desc("Sets the default matrix layout"),
+ cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
+ "Use column-major layout"),
+ clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
+ "Use row-major layout")));
+
+/// Helper function to either return Scope, if it is a subprogram, or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+ if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+ return Subprogram;
+ return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
+namespace {
+
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
+// assuming \p Stride elements between the starts of two consecutive vectors.
+// \p Stride must be >= \p NumElements.
+// For column-major matrixes, the function computes the address of a column
+// vector and \p NumElements must be set to the number of elements in a column
+// (= number of rows of the matrix). For row-major matrixes, the function
+// computes the address of a row vector and \p NumElements must be set to the
+// number of elements in a row (= number of columns of the matrix).
+//
+// Consider a 4x4 matrix in column-major layout like below
+//
+// 0 1 2 3
+// 0 v_0_0 v_0_1 v_0_2 v_0_3
+// 1 v_1_0 v_1_1 v_1_2 v_1_3
+// 2 v_2_0 v_2_1 v_2_2 v_2_3
+// 3 v_3_0 v_3_1 v_3_2 v_3_3
+
+// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
+// we need a pointer to the first element of the submatrix as base pointer.
+// Then we can use computeVectorAddr to compute the addresses for the columns
+// of the sub-matrix.
+//
+// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+// -> just returns Base
+// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (1 * 4)
+// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+// -> returns Base + (2 * 4)
+//
+// The graphic below illustrates the number of elements in a column (marked
+// with |) and the number of skipped elements (marked with {).
+//
+// v_0_0 v_0_1 {v_0_2 {v_0_3
+// Base Col 1 Col 2
+// | | |
+// v_1_0 |v_1_1 |v_1_2 |v_1_3
+// v_2_0 |v_2_1 |v_2_2 |v_2_3
+// v_3_0 {v_3_1 {v_3_2 v_3_3
+//
+Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+ unsigned NumElements, Type *EltType,
+ IRBuilder<> &Builder) {
+
+ assert((!isa<ConstantInt>(Stride) ||
+ cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
+ "Stride must be >= the number of elements in the result vector.");
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+
+ // Compute the start of the vector with index VecIdx as VecIdx * Stride.
+ Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
+
+ // Get pointer to the start of the selected vector. Skip GEP creation,
+ // if we select vector 0.
+ if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
+ VecStart = BasePtr;
+ else
+ VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
+
+ // Cast elementwise vector start pointer to a pointer to a vector
+ // (EltType x NumElements)*.
+ auto *VecType = FixedVectorType::get(EltType, NumElements);
+ Type *VecPtrType = PointerType::get(VecType, AS);
+ return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
+}
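// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A minimal standalone sketch of the index arithmetic behind
// computeVectorAddr for the column-major 4x4 example above. All names are
// invented for illustration; only the C++ standard library is used.
#include <cstdio>

// Element offset (relative to the base pointer) of column vector VecIdx when
// consecutive vectors start Stride elements apart.
static unsigned vectorStartIndex(unsigned VecIdx, unsigned Stride) {
  return VecIdx * Stride; // mirrors VecStart = VecIdx * Stride above
}

int main() {
  const unsigned Stride = 4; // rows of the enclosing 4x4 matrix
  // Columns of a 2x3 sub-matrix start 0, 4 and 8 elements past the base,
  // matching the "Column 0/1/2" walk-through in the comment above.
  for (unsigned Col = 0; Col < 3; ++Col)
    std::printf("column %u starts at Base + %u\n", Col,
                vectorStartIndex(Col, Stride));
  return 0;
}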
+
+/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
+///
+/// Currently, the lowering for each matrix intrinsic is done as follows:
+/// 1. Propagate the shape information from intrinsics to connected
+/// instructions.
+/// 2. Lower instructions with shape information (assuming column-major layout).
+/// The lowering works similarly using row-major layout.
+/// 2.1. Get column vectors for each argument. If we already lowered the
+/// definition of an argument, use the produced column vectors directly.
+///      If not, split the operand vector containing an embedded matrix into
+///      a set of column vectors.
+/// 2.2. Lower the instruction in terms of column major operations, which
+///      yields a set of column vectors containing the result matrix. Note
+///      that we lower all instructions that have shape information. Besides
+///      the intrinsics, this includes stores for example.
+/// 2.3. Update uses of the lowered instruction. If we have shape information
+/// for a user, there is nothing to do, as we will look up the result
+/// column matrix when lowering the user. For other uses, we embed the
+/// result matrix in a flat vector and update the use.
+/// 2.4. Cache the result column matrix for the instruction we lowered.
+/// 3. After we lowered all instructions in a function, remove the now
+/// obsolete instructions.
+///
+class LowerMatrixIntrinsics {
+ Function &Func;
+ const DataLayout &DL;
+ const TargetTransformInfo &TTI;
AliasAnalysis *AA;
DominatorTree *DT;
LoopInfo *LI;
OptimizationRemarkEmitter *ORE;
-
-  /// Contains estimates of the number of operations (loads, stores, compute)
-  /// required to lower a matrix operation.
- struct OpInfoTy {
- /// Number of stores emitted to generate this matrix.
- unsigned NumStores = 0;
- /// Number of loads emitted to generate this matrix.
- unsigned NumLoads = 0;
- /// Number of compute operations emitted to generate this matrix.
- unsigned NumComputeOps = 0;
-
- OpInfoTy &operator+=(const OpInfoTy &RHS) {
- NumStores += RHS.NumStores;
- NumLoads += RHS.NumLoads;
- NumComputeOps += RHS.NumComputeOps;
- return *this;
- }
- };
-
- /// Wrapper class representing a matrix as a set of vectors, either in row or
- /// column major layout. All vectors must have the same vector type.
- class MatrixTy {
- SmallVector<Value *, 16> Vectors;
-
- OpInfoTy OpInfo;
-
- bool IsColumnMajor = true;
-
- public:
- MatrixTy()
- : Vectors(),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
- MatrixTy(ArrayRef<Value *> Vectors)
- : Vectors(Vectors.begin(), Vectors.end()),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
- MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
- : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
-
- unsigned D = isColumnMajor() ? NumColumns : NumRows;
- for (unsigned J = 0; J < D; ++J)
- addVector(UndefValue::get(FixedVectorType::get(
- EltTy, isColumnMajor() ? NumRows : NumColumns)));
- }
-
- Value *getVector(unsigned i) const { return Vectors[i]; }
- Value *getColumn(unsigned i) const {
- assert(isColumnMajor() && "only supported for column-major matrixes");
- return Vectors[i];
- }
- Value *getRow(unsigned i) const {
- assert(!isColumnMajor() && "only supported for row-major matrixes");
- return Vectors[i];
- }
-
- void setVector(unsigned i, Value *V) { Vectors[i] = V; }
-
+
+  /// Contains estimates of the number of operations (loads, stores, compute)
+  /// required to lower a matrix operation.
+ struct OpInfoTy {
+ /// Number of stores emitted to generate this matrix.
+ unsigned NumStores = 0;
+ /// Number of loads emitted to generate this matrix.
+ unsigned NumLoads = 0;
+ /// Number of compute operations emitted to generate this matrix.
+ unsigned NumComputeOps = 0;
+
+ OpInfoTy &operator+=(const OpInfoTy &RHS) {
+ NumStores += RHS.NumStores;
+ NumLoads += RHS.NumLoads;
+ NumComputeOps += RHS.NumComputeOps;
+ return *this;
+ }
+ };
+
+ /// Wrapper class representing a matrix as a set of vectors, either in row or
+ /// column major layout. All vectors must have the same vector type.
+ class MatrixTy {
+ SmallVector<Value *, 16> Vectors;
+
+ OpInfoTy OpInfo;
+
+ bool IsColumnMajor = true;
+
+ public:
+ MatrixTy()
+ : Vectors(),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(ArrayRef<Value *> Vectors)
+ : Vectors(Vectors.begin(), Vectors.end()),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
+ : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
+
+ unsigned D = isColumnMajor() ? NumColumns : NumRows;
+ for (unsigned J = 0; J < D; ++J)
+ addVector(UndefValue::get(FixedVectorType::get(
+ EltTy, isColumnMajor() ? NumRows : NumColumns)));
+ }
+
+ Value *getVector(unsigned i) const { return Vectors[i]; }
+ Value *getColumn(unsigned i) const {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return Vectors[i];
+ }
+ Value *getRow(unsigned i) const {
+ assert(!isColumnMajor() && "only supported for row-major matrixes");
+ return Vectors[i];
+ }
+
+ void setVector(unsigned i, Value *V) { Vectors[i] = V; }
+
Type *getElementType() const { return getVectorTy()->getElementType(); }
-
- unsigned getNumVectors() const {
- if (isColumnMajor())
- return getNumColumns();
- return getNumRows();
- }
-
- unsigned getNumColumns() const {
- if (isColumnMajor())
- return Vectors.size();
- else {
-      assert(Vectors.size() > 0 && "Cannot call getNumColumns without vectors");
- return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
- }
- }
- unsigned getNumRows() const {
- if (isColumnMajor()) {
- assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
- return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
- } else
- return Vectors.size();
- }
-
- void addVector(Value *V) { Vectors.push_back(V); }
- VectorType *getColumnTy() {
- assert(isColumnMajor() && "only supported for column-major matrixes");
- return getVectorTy();
- }
-
+
+ unsigned getNumVectors() const {
+ if (isColumnMajor())
+ return getNumColumns();
+ return getNumRows();
+ }
+
+ unsigned getNumColumns() const {
+ if (isColumnMajor())
+ return Vectors.size();
+ else {
+      assert(Vectors.size() > 0 && "Cannot call getNumColumns without vectors");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ }
+ }
+ unsigned getNumRows() const {
+ if (isColumnMajor()) {
+ assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ } else
+ return Vectors.size();
+ }
+
+ void addVector(Value *V) { Vectors.push_back(V); }
+ VectorType *getColumnTy() {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return getVectorTy();
+ }
+
VectorType *getVectorTy() const {
- return cast<VectorType>(Vectors[0]->getType());
- }
-
- iterator_range<SmallVector<Value *, 8>::iterator> columns() {
- assert(isColumnMajor() &&
- "columns() only supported for column-major matrixes");
- return make_range(Vectors.begin(), Vectors.end());
- }
-
- iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
- return make_range(Vectors.begin(), Vectors.end());
- }
-
- /// Embed the vectors of the matrix into a flat vector by concatenating
- /// them.
- Value *embedInVector(IRBuilder<> &Builder) const {
- return Vectors.size() == 1 ? Vectors[0]
- : concatenateVectors(Builder, Vectors);
- }
-
- MatrixTy &addNumLoads(unsigned N) {
- OpInfo.NumLoads += N;
- return *this;
- }
-
- void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
-
- MatrixTy &addNumStores(unsigned N) {
- OpInfo.NumStores += N;
- return *this;
- }
-
- MatrixTy &addNumComputeOps(unsigned N) {
- OpInfo.NumComputeOps += N;
- return *this;
- }
-
- unsigned getNumStores() const { return OpInfo.NumStores; }
- unsigned getNumLoads() const { return OpInfo.NumLoads; }
- unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
-
- const OpInfoTy &getOpInfo() const { return OpInfo; }
-
- bool isColumnMajor() const { return IsColumnMajor; }
-
- unsigned getStride() const {
- if (isColumnMajor())
- return getNumRows();
- return getNumColumns();
- }
-
- /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
- /// matrix is column-major, the result vector is extracted from a column
- /// vector, otherwise from a row vector.
- Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
- IRBuilder<> &Builder) const {
- Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
- return Builder.CreateShuffleVector(
+ return cast<VectorType>(Vectors[0]->getType());
+ }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> columns() {
+ assert(isColumnMajor() &&
+ "columns() only supported for column-major matrixes");
+ return make_range(Vectors.begin(), Vectors.end());
+ }
+
+ iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
+ return make_range(Vectors.begin(), Vectors.end());
+ }
+
+ /// Embed the vectors of the matrix into a flat vector by concatenating
+ /// them.
+ Value *embedInVector(IRBuilder<> &Builder) const {
+ return Vectors.size() == 1 ? Vectors[0]
+ : concatenateVectors(Builder, Vectors);
+ }
+
+ MatrixTy &addNumLoads(unsigned N) {
+ OpInfo.NumLoads += N;
+ return *this;
+ }
+
+ void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
+
+ MatrixTy &addNumStores(unsigned N) {
+ OpInfo.NumStores += N;
+ return *this;
+ }
+
+ MatrixTy &addNumComputeOps(unsigned N) {
+ OpInfo.NumComputeOps += N;
+ return *this;
+ }
+
+ unsigned getNumStores() const { return OpInfo.NumStores; }
+ unsigned getNumLoads() const { return OpInfo.NumLoads; }
+ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
+
+ const OpInfoTy &getOpInfo() const { return OpInfo; }
+
+ bool isColumnMajor() const { return IsColumnMajor; }
+
+ unsigned getStride() const {
+ if (isColumnMajor())
+ return getNumRows();
+ return getNumColumns();
+ }
+
+ /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
+ /// matrix is column-major, the result vector is extracted from a column
+ /// vector, otherwise from a row vector.
+ Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
+ IRBuilder<> &Builder) const {
+ Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
+ return Builder.CreateShuffleVector(
Vec, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
- "block");
- }
- };
-
- struct ShapeInfo {
- unsigned NumRows;
- unsigned NumColumns;
-
- bool IsColumnMajor;
-
- ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
- : NumRows(NumRows), NumColumns(NumColumns),
- IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
-
- ShapeInfo(Value *NumRows, Value *NumColumns)
- : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
- cast<ConstantInt>(NumColumns)->getZExtValue()) {}
-
- bool operator==(const ShapeInfo &other) {
- return NumRows == other.NumRows && NumColumns == other.NumColumns;
- }
- bool operator!=(const ShapeInfo &other) { return !(*this == other); }
-
- /// Returns true if shape-information is defined, meaning both dimensions
- /// are != 0.
- operator bool() const {
- assert(NumRows == 0 || NumColumns != 0);
- return NumRows != 0;
- }
-
- unsigned getStride() const {
- if (IsColumnMajor)
- return NumRows;
- return NumColumns;
- }
-
- unsigned getNumVectors() const {
- if (IsColumnMajor)
- return NumColumns;
- return NumRows;
- }
- };
-
-  /// Maps instructions to their shape information. The shape information
-  /// describes the shape to be used while lowering. This matches the shape of
-  /// the result value of the instruction, with the only exceptions being store
-  /// instructions and the matrix_column_major_store intrinsics. For those, the
-  /// shape information describes the matrix value being stored and indicates
-  /// that those instructions should be lowered using shape information as well.
- DenseMap<Value *, ShapeInfo> ShapeMap;
-
-  /// List of instructions to remove. While lowering, we do not replace all
-  /// users of a lowered instruction if shape information is available; those
-  /// instructions need to be removed after lowering has finished.
- SmallVector<Instruction *, 16> ToRemove;
-
- /// Map from instructions to their produced column matrix.
- MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
-
-public:
- LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+ "block");
+ }
+ };
+
+ struct ShapeInfo {
+ unsigned NumRows;
+ unsigned NumColumns;
+
+ bool IsColumnMajor;
+
+ ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
+ : NumRows(NumRows), NumColumns(NumColumns),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+
+ ShapeInfo(Value *NumRows, Value *NumColumns)
+ : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
+ cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+
+ bool operator==(const ShapeInfo &other) {
+ return NumRows == other.NumRows && NumColumns == other.NumColumns;
+ }
+ bool operator!=(const ShapeInfo &other) { return !(*this == other); }
+
+ /// Returns true if shape-information is defined, meaning both dimensions
+ /// are != 0.
+ operator bool() const {
+ assert(NumRows == 0 || NumColumns != 0);
+ return NumRows != 0;
+ }
+
+ unsigned getStride() const {
+ if (IsColumnMajor)
+ return NumRows;
+ return NumColumns;
+ }
+
+ unsigned getNumVectors() const {
+ if (IsColumnMajor)
+ return NumColumns;
+ return NumRows;
+ }
+ };
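// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the shape rules the propagation code below relies
// on, using a trimmed-down stand-in for ShapeInfo. Names are invented for
// illustration.
#include <cassert>
#include <cstdio>

struct Shape { unsigned Rows = 0, Cols = 0; };

// matrix_multiply: an M x N matrix times an N x K matrix yields M x K.
static Shape multiplyShape(Shape A, Shape B) {
  assert(A.Cols == B.Rows && "inner dimensions must match");
  return {A.Rows, B.Cols};
}

// matrix_transpose: an M x N matrix becomes N x M.
static Shape transposeShape(Shape A) { return {A.Cols, A.Rows}; }

int main() {
  Shape C = multiplyShape({2, 8}, {8, 4}); // 2x8 * 8x4 -> 2x4
  Shape T = transposeShape(C);             // 2x4 -> 4x2
  std::printf("%ux%u %ux%u\n", C.Rows, C.Cols, T.Rows, T.Cols);
  return 0;
}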
+
+  /// Maps instructions to their shape information. The shape information
+  /// describes the shape to be used while lowering. This matches the shape of
+  /// the result value of the instruction, with the only exceptions being store
+  /// instructions and the matrix_column_major_store intrinsics. For those, the
+  /// shape information describes the matrix value being stored and indicates
+  /// that those instructions should be lowered using shape information as well.
+ DenseMap<Value *, ShapeInfo> ShapeMap;
+
+  /// List of instructions to remove. While lowering, we do not replace all
+  /// users of a lowered instruction if shape information is available; those
+  /// instructions need to be removed after lowering has finished.
+ SmallVector<Instruction *, 16> ToRemove;
+
+ /// Map from instructions to their produced column matrix.
+ MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
+
+public:
+ LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
AliasAnalysis *AA, DominatorTree *DT, LoopInfo *LI,
OptimizationRemarkEmitter *ORE)
- : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
- LI(LI), ORE(ORE) {}
-
- unsigned getNumOps(Type *VT) {
- assert(isa<VectorType>(VT) && "Expected vector type");
- return getNumOps(VT->getScalarType(),
- cast<FixedVectorType>(VT)->getNumElements());
- }
-
- //
- /// Return the estimated number of vector ops required for an operation on
- /// \p VT * N.
- unsigned getNumOps(Type *ST, unsigned N) {
- return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
- double(TTI.getRegisterBitWidth(true)));
- }
-
- /// Return the set of vectors that a matrix value is lowered to.
- ///
-  /// If we lowered \p MatrixVal, just return the cached result matrix. Otherwise
- /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
- /// into vectors.
- MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
- IRBuilder<> &Builder) {
- VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
- assert(VType && "MatrixVal must be a vector type");
- assert(cast<FixedVectorType>(VType)->getNumElements() ==
- SI.NumRows * SI.NumColumns &&
- "The vector size must match the number of matrix elements");
-
- // Check if we lowered MatrixVal using shape information. In that case,
- // return the existing matrix, if it matches the requested shape
- // information. If there is a mis-match, embed the result in a flat
- // vector and split it later.
- auto Found = Inst2ColumnMatrix.find(MatrixVal);
- if (Found != Inst2ColumnMatrix.end()) {
- MatrixTy &M = Found->second;
- // Return the found matrix, if its shape matches the requested shape
- // information
- if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
- return M;
-
- MatrixVal = M.embedInVector(Builder);
- }
-
- // Otherwise split MatrixVal.
- SmallVector<Value *, 16> SplitVecs;
- for (unsigned MaskStart = 0;
- MaskStart < cast<FixedVectorType>(VType)->getNumElements();
- MaskStart += SI.getStride()) {
- Value *V = Builder.CreateShuffleVector(
+ : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
+ LI(LI), ORE(ORE) {}
+
+ unsigned getNumOps(Type *VT) {
+ assert(isa<VectorType>(VT) && "Expected vector type");
+ return getNumOps(VT->getScalarType(),
+ cast<FixedVectorType>(VT)->getNumElements());
+ }
+
+ //
+ /// Return the estimated number of vector ops required for an operation on
+ /// \p VT * N.
+ unsigned getNumOps(Type *ST, unsigned N) {
+ return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
+ double(TTI.getRegisterBitWidth(true)));
+ }
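// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the op-count estimate used by getNumOps above,
// with the register width passed in explicitly instead of being queried
// from TargetTransformInfo. Names are invented for illustration.
#include <cmath>
#include <cstdio>

static unsigned estimateNumOps(unsigned ElementBits, unsigned NumElements,
                               unsigned RegisterBits) {
  return (unsigned)std::ceil(double(ElementBits) * NumElements /
                             double(RegisterBits));
}

int main() {
  // 16 doubles (64 bits each) on a target with 256-bit vector registers
  // need an estimated 4 vector operations.
  std::printf("%u\n", estimateNumOps(64, 16, 256));
  return 0;
}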
+
+ /// Return the set of vectors that a matrix value is lowered to.
+ ///
+  /// If we lowered \p MatrixVal, just return the cached result matrix. Otherwise
+ /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
+ /// into vectors.
+ MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+ IRBuilder<> &Builder) {
+ VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
+ assert(VType && "MatrixVal must be a vector type");
+ assert(cast<FixedVectorType>(VType)->getNumElements() ==
+ SI.NumRows * SI.NumColumns &&
+ "The vector size must match the number of matrix elements");
+
+ // Check if we lowered MatrixVal using shape information. In that case,
+ // return the existing matrix, if it matches the requested shape
+ // information. If there is a mis-match, embed the result in a flat
+ // vector and split it later.
+ auto Found = Inst2ColumnMatrix.find(MatrixVal);
+ if (Found != Inst2ColumnMatrix.end()) {
+ MatrixTy &M = Found->second;
+ // Return the found matrix, if its shape matches the requested shape
+ // information
+ if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
+ return M;
+
+ MatrixVal = M.embedInVector(Builder);
+ }
+
+ // Otherwise split MatrixVal.
+ SmallVector<Value *, 16> SplitVecs;
+ for (unsigned MaskStart = 0;
+ MaskStart < cast<FixedVectorType>(VType)->getNumElements();
+ MaskStart += SI.getStride()) {
+ Value *V = Builder.CreateShuffleVector(
MatrixVal, createSequentialMask(MaskStart, SI.getStride(), 0),
- "split");
- SplitVecs.push_back(V);
- }
-
- return {SplitVecs};
- }
-
- /// If \p V already has a known shape return false. Otherwise set the shape
- /// for instructions that support it.
- bool setShapeInfo(Value *V, ShapeInfo Shape) {
- assert(Shape && "Shape not set");
- if (isa<UndefValue>(V) || !supportsShapeInfo(V))
- return false;
-
- auto SIter = ShapeMap.find(V);
- if (SIter != ShapeMap.end()) {
- LLVM_DEBUG(dbgs() << " not overriding existing shape: "
- << SIter->second.NumRows << " "
- << SIter->second.NumColumns << " for " << *V << "\n");
- return false;
- }
-
- ShapeMap.insert({V, Shape});
- LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns
- << " for " << *V << "\n");
- return true;
- }
-
- bool isUniformShape(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return true;
-
- switch (I->getOpcode()) {
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul: // Scalar multiply.
+ "split");
+ SplitVecs.push_back(V);
+ }
+
+ return {SplitVecs};
+ }
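// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of what getMatrix's "split the flat vector" step
// means for a column-major matrix, using std::vector instead of IR values.
// Names are invented for illustration.
#include <cstdio>
#include <vector>

static std::vector<std::vector<double>>
splitColumns(const std::vector<double> &Flat, unsigned Rows, unsigned Cols) {
  std::vector<std::vector<double>> Columns(Cols);
  for (unsigned C = 0; C < Cols; ++C)
    Columns[C].assign(Flat.begin() + C * Rows, Flat.begin() + (C + 1) * Rows);
  return Columns;
}

int main() {
  // A 2x3 matrix embedded column-major in a flat vector of 6 elements.
  std::vector<double> Flat = {1, 2, 3, 4, 5, 6};
  for (const auto &Col : splitColumns(Flat, 2, 3))
    std::printf("[%g %g] ", Col[0], Col[1]); // [1 2] [3 4] [5 6]
  std::printf("\n");
  return 0;
}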
+
+ /// If \p V already has a known shape return false. Otherwise set the shape
+ /// for instructions that support it.
+ bool setShapeInfo(Value *V, ShapeInfo Shape) {
+ assert(Shape && "Shape not set");
+ if (isa<UndefValue>(V) || !supportsShapeInfo(V))
+ return false;
+
+ auto SIter = ShapeMap.find(V);
+ if (SIter != ShapeMap.end()) {
+ LLVM_DEBUG(dbgs() << " not overriding existing shape: "
+ << SIter->second.NumRows << " "
+ << SIter->second.NumColumns << " for " << *V << "\n");
+ return false;
+ }
+
+ ShapeMap.insert({V, Shape});
+ LLVM_DEBUG(dbgs() << " " << Shape.NumRows << " x " << Shape.NumColumns
+ << " for " << *V << "\n");
+ return true;
+ }
+
+ bool isUniformShape(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return true;
+
+ switch (I->getOpcode()) {
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul: // Scalar multiply.
case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::Sub:
- return true;
- default:
- return false;
- }
- }
-
- /// Returns true if shape information can be used for \p V. The supported
- /// instructions must match the instructions that can be lowered by this pass.
- bool supportsShapeInfo(Value *V) {
- Instruction *Inst = dyn_cast<Instruction>(V);
- if (!Inst)
- return false;
-
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (II)
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- return true;
- default:
- return false;
- }
- return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
- }
-
- /// Propagate the shape information of instructions to their users.
- /// The work list contains instructions for which we can compute the shape,
- /// either based on the information provided by matrix intrinsics or known
- /// shapes of operands.
- SmallVector<Instruction *, 32>
- propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
- SmallVector<Instruction *, 32> NewWorkList;
-    // Pop an element for which we are guaranteed to have at least one of the
-    // operand shapes. Add the shape for this instruction and then add its
-    // users to the work list.
- LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
- while (!WorkList.empty()) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::Sub:
+ return true;
+ default:
+ return false;
+ }
+ }
+
+ /// Returns true if shape information can be used for \p V. The supported
+ /// instructions must match the instructions that can be lowered by this pass.
+ bool supportsShapeInfo(Value *V) {
+ Instruction *Inst = dyn_cast<Instruction>(V);
+ if (!Inst)
+ return false;
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+ if (II)
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ case Intrinsic::matrix_transpose:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ return true;
+ default:
+ return false;
+ }
+ return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
+ }
+
+ /// Propagate the shape information of instructions to their users.
+ /// The work list contains instructions for which we can compute the shape,
+ /// either based on the information provided by matrix intrinsics or known
+ /// shapes of operands.
+ SmallVector<Instruction *, 32>
+ propagateShapeForward(SmallVectorImpl<Instruction *> &WorkList) {
+ SmallVector<Instruction *, 32> NewWorkList;
+    // Pop an element for which we are guaranteed to have at least one of the
+    // operand shapes. Add the shape for this instruction and then add its
+    // users to the work list.
+ LLVM_DEBUG(dbgs() << "Forward-propagate shapes:\n");
+ while (!WorkList.empty()) {
Instruction *Inst = WorkList.pop_back_val();
-
- // New entry, set the value and insert operands
- bool Propagate = false;
-
- Value *MatrixA;
- Value *MatrixB;
- Value *M;
- Value *N;
- Value *K;
- if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
- m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
- m_Value(N), m_Value(K)))) {
- Propagate = setShapeInfo(Inst, {M, K});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(MatrixA), m_Value(M), m_Value(N)))) {
- // Flip dimensions.
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
- m_Value(MatrixA), m_Value(), m_Value(),
- m_Value(), m_Value(M), m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
- m_Value(), m_Value(), m_Value(), m_Value(M),
- m_Value(N)))) {
- Propagate = setShapeInfo(Inst, {M, N});
- } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
- auto OpShape = ShapeMap.find(MatrixA);
- if (OpShape != ShapeMap.end())
- setShapeInfo(Inst, OpShape->second);
- continue;
- } else if (isUniformShape(Inst)) {
- // Find the first operand that has a known shape and use that.
- for (auto &Op : Inst->operands()) {
- auto OpShape = ShapeMap.find(Op.get());
- if (OpShape != ShapeMap.end()) {
- Propagate |= setShapeInfo(Inst, OpShape->second);
- break;
- }
- }
- }
-
- if (Propagate) {
- NewWorkList.push_back(Inst);
- for (auto *User : Inst->users())
- if (ShapeMap.count(User) == 0)
- WorkList.push_back(cast<Instruction>(User));
- }
- }
-
- return NewWorkList;
- }
-
-  /// Propagate the shape to operands of instructions with shape information.
-  /// \p WorkList contains the instructions for which we already know the shape.
- SmallVector<Instruction *, 32>
- propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
- SmallVector<Instruction *, 32> NewWorkList;
-
- auto pushInstruction = [](Value *V,
- SmallVectorImpl<Instruction *> &WorkList) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (I)
- WorkList.push_back(I);
- };
-    // Pop an element with known shape. Traverse the operands; if an operand's
-    // shape derives from the result shape and is unknown, set it and add the
-    // operand to the worklist.
- LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
- while (!WorkList.empty()) {
+
+ // New entry, set the value and insert operands
+ bool Propagate = false;
+
+ Value *MatrixA;
+ Value *MatrixB;
+ Value *M;
+ Value *N;
+ Value *K;
+ if (match(Inst, m_Intrinsic<Intrinsic::matrix_multiply>(
+ m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+ m_Value(N), m_Value(K)))) {
+ Propagate = setShapeInfo(Inst, {M, K});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_transpose>(
+ m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+ // Flip dimensions.
+ Propagate = setShapeInfo(Inst, {N, M});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(MatrixA), m_Value(), m_Value(),
+ m_Value(), m_Value(M), m_Value(N)))) {
+ Propagate = setShapeInfo(Inst, {N, M});
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+ m_Value(), m_Value(), m_Value(), m_Value(M),
+ m_Value(N)))) {
+ Propagate = setShapeInfo(Inst, {M, N});
+ } else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
+ auto OpShape = ShapeMap.find(MatrixA);
+ if (OpShape != ShapeMap.end())
+ setShapeInfo(Inst, OpShape->second);
+ continue;
+ } else if (isUniformShape(Inst)) {
+ // Find the first operand that has a known shape and use that.
+ for (auto &Op : Inst->operands()) {
+ auto OpShape = ShapeMap.find(Op.get());
+ if (OpShape != ShapeMap.end()) {
+ Propagate |= setShapeInfo(Inst, OpShape->second);
+ break;
+ }
+ }
+ }
+
+ if (Propagate) {
+ NewWorkList.push_back(Inst);
+ for (auto *User : Inst->users())
+ if (ShapeMap.count(User) == 0)
+ WorkList.push_back(cast<Instruction>(User));
+ }
+ }
+
+ return NewWorkList;
+ }
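// --- Illustrative aside, not part of this diff or of LLVM's sources ---
// A standalone sketch of the worklist pattern used by propagateShapeForward
// above, reduced to a toy def-use graph in which each user adopts the shape
// of the value it was reached from; as in setShapeInfo, an existing shape is
// never overridden. Names are invented for illustration.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Shape { unsigned Rows, Cols; };

int main() {
  // "mul" is seeded (like a matrix intrinsic); "add" and "store" only learn
  // their shape through forward propagation.
  std::map<std::string, std::vector<std::string>> Users = {
      {"mul", {"add"}}, {"add", {"store"}}, {"store", {}}};
  std::map<std::string, Shape> ShapeMap = {{"mul", {2, 4}}};

  std::vector<std::string> WorkList = {"mul"};
  while (!WorkList.empty()) {
    std::string V = WorkList.back();
    WorkList.pop_back();
    for (const std::string &User : Users[V])
      if (ShapeMap.insert({User, ShapeMap[V]}).second) // do not override
        WorkList.push_back(User);
  }
  for (const auto &Entry : ShapeMap)
    std::printf("%s: %ux%u\n", Entry.first.c_str(), Entry.second.Rows,
                Entry.second.Cols);
  return 0;
}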
+
+  /// Propagate the shape to operands of instructions with shape information.
+  /// \p WorkList contains the instructions for which we already know the shape.
+ SmallVector<Instruction *, 32>
+ propagateShapeBackward(SmallVectorImpl<Instruction *> &WorkList) {
+ SmallVector<Instruction *, 32> NewWorkList;
+
+ auto pushInstruction = [](Value *V,
+ SmallVectorImpl<Instruction *> &WorkList) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I)
+ WorkList.push_back(I);
+ };
+    // Pop an element with known shape. Traverse the operands; if an operand's
+    // shape derives from the result shape and is unknown, set it and add the
+    // operand to the worklist.
+ LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n");
+ while (!WorkList.empty()) {
Value *V = WorkList.pop_back_val();
-
- size_t BeforeProcessingV = WorkList.size();
- if (!isa<Instruction>(V))
- continue;
-
- Value *MatrixA;
- Value *MatrixB;
- Value *M;
- Value *N;
- Value *K;
- if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
- m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
- m_Value(N), m_Value(K)))) {
- if (setShapeInfo(MatrixA, {M, N}))
- pushInstruction(MatrixA, WorkList);
-
- if (setShapeInfo(MatrixB, {N, K}))
- pushInstruction(MatrixB, WorkList);
-
- } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
- m_Value(MatrixA), m_Value(M), m_Value(N)))) {
- // Flip dimensions.
- if (setShapeInfo(MatrixA, {M, N}))
- pushInstruction(MatrixA, WorkList);
- } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
- m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
- m_Value(M), m_Value(N)))) {
- if (setShapeInfo(MatrixA, {M, N})) {
- pushInstruction(MatrixA, WorkList);
- }
- } else if (isa<LoadInst>(V) ||
- match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
- // Nothing to do, no matrix input.
- } else if (isa<StoreInst>(V)) {
- // Nothing to do. We forward-propagated to this so we would just
- // backward propagate to an instruction with an already known shape.
- } else if (isUniformShape(V)) {
- // Propagate to all operands.
- ShapeInfo Shape = ShapeMap[V];
- for (Use &U : cast<Instruction>(V)->operands()) {
- if (setShapeInfo(U.get(), Shape))
- pushInstruction(U.get(), WorkList);
- }
- }
- // After we discovered new shape info for new instructions in the
- // worklist, we use their users as seeds for the next round of forward
- // propagation.
- for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
- for (User *U : WorkList[I]->users())
- if (isa<Instruction>(U) && V != U)
- NewWorkList.push_back(cast<Instruction>(U));
- }
- return NewWorkList;
- }
-
- bool Visit() {
- if (EnableShapePropagation) {
- SmallVector<Instruction *, 32> WorkList;
-
- // Initially only the shape of matrix intrinsics is known.
- // Initialize the work list with ops carrying shape information.
- for (BasicBlock &BB : Func)
- for (Instruction &Inst : BB) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
- if (!II)
- continue;
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- WorkList.push_back(&Inst);
- break;
- default:
- break;
- }
- }
- // Propagate shapes until nothing changes any longer.
- while (!WorkList.empty()) {
- WorkList = propagateShapeForward(WorkList);
- WorkList = propagateShapeBackward(WorkList);
- }
- }
-
- bool Changed = false;
- SmallVector<CallInst *, 16> MaybeFusableInsts;
- SmallVector<Instruction *, 16> MatrixInsts;
-
- // First, collect all instructions with shape information and candidates for
- // fusion (currently only matrix multiplies).
- ReversePostOrderTraversal<Function *> RPOT(&Func);
- for (auto *BB : RPOT)
- for (Instruction &I : *BB) {
- if (ShapeMap.find(&I) == ShapeMap.end())
- continue;
- if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
- MaybeFusableInsts.push_back(cast<CallInst>(&I));
- MatrixInsts.push_back(&I);
- }
-
- // Second, try to fuse candidates.
- SmallPtrSet<Instruction *, 16> FusedInsts;
- for (CallInst *CI : MaybeFusableInsts)
- LowerMatrixMultiplyFused(CI, FusedInsts);
- Changed = !FusedInsts.empty();
-
- // Third, lower remaining instructions with shape information.
- for (Instruction *Inst : MatrixInsts) {
- if (FusedInsts.count(Inst))
- continue;
-
- IRBuilder<> Builder(Inst);
-
- if (CallInst *CInst = dyn_cast<CallInst>(Inst))
- Changed |= VisitCallInst(CInst);
-
- Value *Op1;
- Value *Op2;
- if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
- Changed |= VisitBinaryOperator(BinOp);
+
+ size_t BeforeProcessingV = WorkList.size();
+ if (!isa<Instruction>(V))
+ continue;
+
+ Value *MatrixA;
+ Value *MatrixB;
+ Value *M;
+ Value *N;
+ Value *K;
+ if (match(V, m_Intrinsic<Intrinsic::matrix_multiply>(
+ m_Value(MatrixA), m_Value(MatrixB), m_Value(M),
+ m_Value(N), m_Value(K)))) {
+ if (setShapeInfo(MatrixA, {M, N}))
+ pushInstruction(MatrixA, WorkList);
+
+ if (setShapeInfo(MatrixB, {N, K}))
+ pushInstruction(MatrixB, WorkList);
+
+ } else if (match(V, m_Intrinsic<Intrinsic::matrix_transpose>(
+ m_Value(MatrixA), m_Value(M), m_Value(N)))) {
+ // Flip dimensions.
+ if (setShapeInfo(MatrixA, {M, N}))
+ pushInstruction(MatrixA, WorkList);
+ } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
+ m_Value(M), m_Value(N)))) {
+ if (setShapeInfo(MatrixA, {M, N})) {
+ pushInstruction(MatrixA, WorkList);
+ }
+ } else if (isa<LoadInst>(V) ||
+ match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
+ // Nothing to do, no matrix input.
+ } else if (isa<StoreInst>(V)) {
+ // Nothing to do. We forward-propagated to this so we would just
+ // backward propagate to an instruction with an already known shape.
+ } else if (isUniformShape(V)) {
+ // Propagate to all operands.
+ ShapeInfo Shape = ShapeMap[V];
+ for (Use &U : cast<Instruction>(V)->operands()) {
+ if (setShapeInfo(U.get(), Shape))
+ pushInstruction(U.get(), WorkList);
+ }
+ }
+ // After we discovered new shape info for new instructions in the
+ // worklist, we use their users as seeds for the next round of forward
+ // propagation.
+ for (size_t I = BeforeProcessingV; I != WorkList.size(); I++)
+ for (User *U : WorkList[I]->users())
+ if (isa<Instruction>(U) && V != U)
+ NewWorkList.push_back(cast<Instruction>(U));
+ }
+ return NewWorkList;
+ }
+
+ bool Visit() {
+ if (EnableShapePropagation) {
+ SmallVector<Instruction *, 32> WorkList;
+
+ // Initially only the shape of matrix intrinsics is known.
+ // Initialize the work list with ops carrying shape information.
+ for (BasicBlock &BB : Func)
+ for (Instruction &Inst : BB) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(&Inst);
+ if (!II)
+ continue;
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ case Intrinsic::matrix_transpose:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ WorkList.push_back(&Inst);
+ break;
+ default:
+ break;
+ }
+ }
+ // Propagate shapes until nothing changes any longer.
+ while (!WorkList.empty()) {
+ WorkList = propagateShapeForward(WorkList);
+ WorkList = propagateShapeBackward(WorkList);
+ }
+ }
+
+ bool Changed = false;
+ SmallVector<CallInst *, 16> MaybeFusableInsts;
+ SmallVector<Instruction *, 16> MatrixInsts;
+
+ // First, collect all instructions with shape information and candidates for
+ // fusion (currently only matrix multiplies).
+ ReversePostOrderTraversal<Function *> RPOT(&Func);
+ for (auto *BB : RPOT)
+ for (Instruction &I : *BB) {
+ if (ShapeMap.find(&I) == ShapeMap.end())
+ continue;
+ if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
+ MaybeFusableInsts.push_back(cast<CallInst>(&I));
+ MatrixInsts.push_back(&I);
+ }
+
+ // Second, try to fuse candidates.
+ SmallPtrSet<Instruction *, 16> FusedInsts;
+ for (CallInst *CI : MaybeFusableInsts)
+ LowerMatrixMultiplyFused(CI, FusedInsts);
+ Changed = !FusedInsts.empty();
+
+ // Third, lower remaining instructions with shape information.
+ for (Instruction *Inst : MatrixInsts) {
+ if (FusedInsts.count(Inst))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ if (CallInst *CInst = dyn_cast<CallInst>(Inst))
+ Changed |= VisitCallInst(CInst);
+
+ Value *Op1;
+ Value *Op2;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+ Changed |= VisitBinaryOperator(BinOp);
if (auto *UnOp = dyn_cast<UnaryOperator>(Inst))
Changed |= VisitUnaryOperator(UnOp);
- if (match(Inst, m_Load(m_Value(Op1))))
- Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
- else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
- Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
- }
-
+ if (match(Inst, m_Load(m_Value(Op1))))
+ Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
+ else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
+ Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
+ }
+
if (ORE) {
RemarkGenerator RemarkGen(Inst2ColumnMatrix, *ORE, Func);
RemarkGen.emitRemarks();
}
-
- for (Instruction *Inst : reverse(ToRemove))
- Inst->eraseFromParent();
-
- return Changed;
- }
-
- /// Turns \p BasePtr into an elementwise pointer to \p EltType.
- Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
- unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
- Type *EltPtrType = PointerType::get(EltType, AS);
- return Builder.CreatePointerCast(BasePtr, EltPtrType);
- }
-
- /// Replace intrinsic calls
- bool VisitCallInst(CallInst *Inst) {
- if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
- return false;
-
- switch (Inst->getCalledFunction()->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- LowerMultiply(Inst);
- break;
- case Intrinsic::matrix_transpose:
- LowerTranspose(Inst);
- break;
- case Intrinsic::matrix_column_major_load:
- LowerColumnMajorLoad(Inst);
- break;
- case Intrinsic::matrix_column_major_store:
- LowerColumnMajorStore(Inst);
- break;
- default:
- return false;
- }
- return true;
- }
-
- /// Compute the alignment for a column/row \p Idx with \p Stride between them.
- /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
- /// ConstantInt, reduce the initial alignment based on the byte offset. For
- /// non-ConstantInt strides, return the common alignment of the initial
- /// alignment and the element size in bytes.
- Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
- MaybeAlign A) const {
- Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
- if (Idx == 0)
- return InitialAlign;
-
- TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
- if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
- uint64_t StrideInBytes =
- ConstStride->getZExtValue() * ElementSizeInBits / 8;
- return commonAlignment(InitialAlign, Idx * StrideInBytes);
- }
- return commonAlignment(InitialAlign, ElementSizeInBits / 8);
- }
-
- /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
- /// vectors.
- MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
- bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
- auto VType = cast<VectorType>(Ty);
- Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- MatrixTy Result;
- for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
- Shape.getStride(), VType->getElementType(),
- Builder);
- Value *Vector = Builder.CreateAlignedLoad(
- GEP, getAlignForIndex(I, Stride, VType->getElementType(), MAlign),
- IsVolatile, "col.load");
-
- Result.addVector(Vector);
- }
- return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
- Result.getNumVectors());
- }
-
- /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
- /// starting at \p MatrixPtr[I][J].
- MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
- ShapeInfo MatrixShape, Value *I, Value *J,
- ShapeInfo ResultShape, Type *EltTy,
- IRBuilder<> &Builder) {
-
- Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
-
- unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
- Value *EltPtr =
- Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
- Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
- auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
- ResultShape.NumColumns);
- Type *TilePtrTy = PointerType::get(TileTy, AS);
- Value *TilePtr =
- Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
-
- return loadMatrix(TileTy, TilePtr, Align,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile,
- ResultShape, Builder);
- }
-
- /// Lower a load instruction with shape information.
- void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
- bool IsVolatile, ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- finalizeLowering(Inst,
- loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
- Shape, Builder),
- Builder);
- }
-
- /// Lowers llvm.matrix.column.major.load.
- ///
- /// The intrinsic loads a matrix from memory using a stride between columns.
- void LowerColumnMajorLoad(CallInst *Inst) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Intrinsic only supports column-major layout!");
- Value *Ptr = Inst->getArgOperand(0);
- Value *Stride = Inst->getArgOperand(1);
- LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
- cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
- {Inst->getArgOperand(3), Inst->getArgOperand(4)});
- }
-
- /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
- /// MatrixPtr[I][J].
- void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
- MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
- Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
- Value *Offset = Builder.CreateAdd(
- Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
-
- unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
- Value *EltPtr =
- Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
- Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
- auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
- StoreVal.getNumColumns());
- Type *TilePtrTy = PointerType::get(TileTy, AS);
- Value *TilePtr =
- Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
-
- storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
- Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
- }
-
- /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
- /// vectors.
- MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
- MaybeAlign MAlign, Value *Stride, bool IsVolatile,
- IRBuilder<> &Builder) {
- auto VType = cast<VectorType>(Ty);
- Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- for (auto Vec : enumerate(StoreVal.vectors())) {
- Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
- Stride, StoreVal.getStride(),
- VType->getElementType(), Builder);
- Builder.CreateAlignedStore(Vec.value(), GEP,
- getAlignForIndex(Vec.index(), Stride,
- VType->getElementType(),
- MAlign),
- IsVolatile);
- }
- return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
- StoreVal.getNumVectors());
- }
-
- /// Lower a store instruction with shape information.
- void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
- Value *Stride, bool IsVolatile, ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- auto StoreVal = getMatrix(Matrix, Shape, Builder);
- finalizeLowering(Inst,
- storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
- IsVolatile, Builder),
- Builder);
- }
-
- /// Lowers llvm.matrix.column.major.store.
- ///
-  /// The intrinsic stores a matrix back to memory, using a stride between
-  /// columns.
- void LowerColumnMajorStore(CallInst *Inst) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Intrinsic only supports column-major layout!");
- Value *Matrix = Inst->getArgOperand(0);
- Value *Ptr = Inst->getArgOperand(1);
- Value *Stride = Inst->getArgOperand(2);
- LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
- cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
- {Inst->getArgOperand(4), Inst->getArgOperand(5)});
- }
-
- // Set elements I..I+NumElts-1 to Block
- Value *insertVector(Value *Col, unsigned I, Value *Block,
- IRBuilder<> &Builder) {
-
- // First, bring Block to the same size as Col
- unsigned BlockNumElts =
- cast<FixedVectorType>(Block->getType())->getNumElements();
- unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
- assert(NumElts >= BlockNumElts && "Too few elements for current block");
-
- Block = Builder.CreateShuffleVector(
+
+ for (Instruction *Inst : reverse(ToRemove))
+ Inst->eraseFromParent();
+
+ return Changed;
+ }
+
+ /// Turns \p BasePtr into an elementwise pointer to \p EltType.
+ Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
+ unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
+ Type *EltPtrType = PointerType::get(EltType, AS);
+ return Builder.CreatePointerCast(BasePtr, EltPtrType);
+ }
+
+ /// Replace intrinsic calls
+ bool VisitCallInst(CallInst *Inst) {
+ if (!Inst->getCalledFunction() || !Inst->getCalledFunction()->isIntrinsic())
+ return false;
+
+ switch (Inst->getCalledFunction()->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ LowerMultiply(Inst);
+ break;
+ case Intrinsic::matrix_transpose:
+ LowerTranspose(Inst);
+ break;
+ case Intrinsic::matrix_column_major_load:
+ LowerColumnMajorLoad(Inst);
+ break;
+ case Intrinsic::matrix_column_major_store:
+ LowerColumnMajorStore(Inst);
+ break;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+ /// Compute the alignment for a column/row \p Idx with \p Stride between them.
+ /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
+ /// ConstantInt, reduce the initial alignment based on the byte offset. For
+ /// non-ConstantInt strides, return the common alignment of the initial
+ /// alignment and the element size in bytes.
+ Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
+ MaybeAlign A) const {
+ Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
+ if (Idx == 0)
+ return InitialAlign;
+
+ TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
+ if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
+ uint64_t StrideInBytes =
+ ConstStride->getZExtValue() * ElementSizeInBits / 8;
+ return commonAlignment(InitialAlign, Idx * StrideInBytes);
+ }
+ return commonAlignment(InitialAlign, ElementSizeInBits / 8);
+ }
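+  // Worked example (illustrative): with double elements (8 bytes), an initial
+  // alignment of 16 and a constant stride of 5 elements, the byte stride is
+  // 5 * 8 == 40, so the vector at Idx == 1 starts at byte offset 40 and
+  // commonAlignment(Align(16), 40) reduces the alignment to 8.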
+
+ /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ MatrixTy Result;
+ for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
+ Shape.getStride(), VType->getElementType(),
+ Builder);
+ Value *Vector = Builder.CreateAlignedLoad(
+ GEP, getAlignForIndex(I, Stride, VType->getElementType(), MAlign),
+ IsVolatile, "col.load");
+
+ Result.addVector(Vector);
+ }
+ return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors());
+ }
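+  // Example (illustrative): a column-major 4 x 3 load produces three "col.load"
+  // vector loads of four elements each, with consecutive loads starting Stride
+  // elements apart in memory.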
+
+ /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
+ /// starting at \p MatrixPtr[I][J].
+ MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
+ ShapeInfo MatrixShape, Value *I, Value *J,
+ ShapeInfo ResultShape, Type *EltTy,
+ IRBuilder<> &Builder) {
+
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
+ ResultShape.NumColumns);
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ return loadMatrix(TileTy, TilePtr, Align,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ ResultShape, Builder);
+ }
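+  // Example (illustrative): for a column-major 8 x 8 matrix, the tile anchored
+  // at I == 2, J == 3 starts at linear element offset J * Stride + I ==
+  // 3 * 8 + 2 == 26, which is the Offset computed above before the tile is
+  // loaded with the outer matrix stride.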
+
+ /// Lower a load instruction with shape information.
+ void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ finalizeLowering(Inst,
+ loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
+ Shape, Builder),
+ Builder);
+ }
+
+ /// Lowers llvm.matrix.column.major.load.
+ ///
+ /// The intrinsic loads a matrix from memory using a stride between columns.
+ void LowerColumnMajorLoad(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
+ Value *Ptr = Inst->getArgOperand(0);
+ Value *Stride = Inst->getArgOperand(1);
+ LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
+ {Inst->getArgOperand(3), Inst->getArgOperand(4)});
+ }
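+  // For reference, the operand order unpacked above is (illustrative):
+  //   llvm.matrix.column.major.load(Ptr, Stride, IsVolatile, Rows, Columns)
+  // e.g. a 4 x 4 load with stride 6 reads four columns of four elements that
+  // start six elements apart in memory.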
+
+ /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
+ /// MatrixPtr[I][J].
+ void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
+ MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
+ Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
+ StoreVal.getNumColumns());
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ }
+
+ /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
+ MaybeAlign MAlign, Value *Stride, bool IsVolatile,
+ IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
+ Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
+ for (auto Vec : enumerate(StoreVal.vectors())) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
+ Stride, StoreVal.getStride(),
+ VType->getElementType(), Builder);
+ Builder.CreateAlignedStore(Vec.value(), GEP,
+ getAlignForIndex(Vec.index(), Stride,
+ VType->getElementType(),
+ MAlign),
+ IsVolatile);
+ }
+ return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
+ StoreVal.getNumVectors());
+ }
+
+ /// Lower a store instruction with shape information.
+ void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
+ Value *Stride, bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ auto StoreVal = getMatrix(Matrix, Shape, Builder);
+ finalizeLowering(Inst,
+ storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
+ IsVolatile, Builder),
+ Builder);
+ }
+
+ /// Lowers llvm.matrix.column.major.store.
+ ///
+  /// The intrinsic stores a matrix to memory using a stride between columns.
+ void LowerColumnMajorStore(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
+ Value *Matrix = Inst->getArgOperand(0);
+ Value *Ptr = Inst->getArgOperand(1);
+ Value *Stride = Inst->getArgOperand(2);
+ LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
+ {Inst->getArgOperand(4), Inst->getArgOperand(5)});
+ }
+
+ // Set elements I..I+NumElts-1 to Block
+ Value *insertVector(Value *Col, unsigned I, Value *Block,
+ IRBuilder<> &Builder) {
+
+ // First, bring Block to the same size as Col
+ unsigned BlockNumElts =
+ cast<FixedVectorType>(Block->getType())->getNumElements();
+ unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
+ assert(NumElts >= BlockNumElts && "Too few elements for current block");
+
+ Block = Builder.CreateShuffleVector(
Block, createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));
-
- // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
- // 8, 4, 5, 6
- SmallVector<int, 16> Mask;
- unsigned i;
- for (i = 0; i < I; i++)
- Mask.push_back(i);
-
- unsigned VecNumElts =
- cast<FixedVectorType>(Col->getType())->getNumElements();
- for (; i < I + BlockNumElts; i++)
- Mask.push_back(i - I + VecNumElts);
-
- for (; i < VecNumElts; i++)
- Mask.push_back(i);
-
- return Builder.CreateShuffleVector(Col, Block, Mask);
- }
-
- Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
- IRBuilder<> &Builder, bool AllowContraction,
- unsigned &NumComputeOps) {
- NumComputeOps += getNumOps(A->getType());
- if (!Sum)
- return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
-
- if (UseFPOp) {
- if (AllowContraction) {
- // Use fmuladd for floating point operations and let the backend decide
- // if that's profitable.
- Function *FMulAdd = Intrinsic::getDeclaration(
- Func.getParent(), Intrinsic::fmuladd, A->getType());
- return Builder.CreateCall(FMulAdd, {A, B, Sum});
- }
- NumComputeOps += getNumOps(A->getType());
- Value *Mul = Builder.CreateFMul(A, B);
- return Builder.CreateFAdd(Sum, Mul);
- }
-
- NumComputeOps += getNumOps(A->getType());
- Value *Mul = Builder.CreateMul(A, B);
- return Builder.CreateAdd(Sum, Mul);
- }
-
- /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
-  /// users with shape information, there's nothing to do: they will use the
- /// cached value when they are lowered. For other users, \p Matrix is
- /// flattened and the uses are updated to use it. Also marks \p Inst for
- /// deletion.
- void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
- IRBuilder<> &Builder) {
- Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
-
- ToRemove.push_back(Inst);
- Value *Flattened = nullptr;
- for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
- Use &U = *I++;
- if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
- if (!Flattened)
- Flattened = Matrix.embedInVector(Builder);
- U.set(Flattened);
- }
- }
- }
-
- /// Compute \p Result += \p A * \p B for input matrices with left-associating
- /// addition.
- void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
- const MatrixTy &B, bool AllowContraction,
- IRBuilder<> &Builder, bool isTiled) {
- const unsigned VF = std::max<unsigned>(
- TTI.getRegisterBitWidth(true) /
- Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
- 1U);
- unsigned R = Result.getNumRows();
- unsigned C = Result.getNumColumns();
- unsigned M = A.getNumColumns();
-
- bool IsFP = Result.getElementType()->isFloatingPointTy();
- assert(A.isColumnMajor() == B.isColumnMajor() &&
- Result.isColumnMajor() == A.isColumnMajor() &&
- "operands must agree on matrix layout");
- unsigned NumComputeOps = 0;
- if (A.isColumnMajor()) {
- // Multiply columns from the first operand with scalars from the second
-      // operand. Then move along the K axis and accumulate the columns. With
- // this the adds can be vectorized without reassociation.
- for (unsigned J = 0; J < C; ++J) {
- unsigned BlockSize = VF;
- // If Result is zero, we don't need to accumulate in the K==0 iteration.
- bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
-
- for (unsigned I = 0; I < R; I += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (I + BlockSize > R)
- BlockSize /= 2;
-
- Value *Sum = isTiled ? Result.extractVector(I, J, BlockSize, Builder)
- : nullptr;
- for (unsigned K = 0; K < M; ++K) {
- Value *L = A.extractVector(I, K, BlockSize, Builder);
- Value *RH = Builder.CreateExtractElement(B.getColumn(J), K);
- Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
- Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
- Result.getElementType()->isFloatingPointTy(),
- Builder, AllowContraction, NumComputeOps);
- }
- Result.setVector(J,
- insertVector(Result.getVector(J), I, Sum, Builder));
- }
- }
- } else {
- // Multiply rows from the second operand with scalars from the first
-      // operand. Then move along the K axis and accumulate the rows. With this
- // the adds can be vectorized without reassociation.
- for (unsigned I = 0; I < R; ++I) {
- unsigned BlockSize = VF;
- bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
- for (unsigned J = 0; J < C; J += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (J + BlockSize > C)
- BlockSize /= 2;
-
- Value *Sum = nullptr;
- for (unsigned K = 0; K < M; ++K) {
- Value *R = B.extractVector(K, J, BlockSize, Builder);
- Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
- Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
- Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
- IsFP, Builder, AllowContraction, NumComputeOps);
- }
- Result.setVector(I,
- insertVector(Result.getVector(I), J, Sum, Builder));
- }
- }
- }
- Result.addNumComputeOps(NumComputeOps);
- }
-
- /// Ensure that the memory in \p Load does not alias \p Store by potentially
-  /// copying it to a new location. The new location, or otherwise the original
-  /// one, is returned.
- Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
- CallInst *MatMul) {
- MemoryLocation StoreLoc = MemoryLocation::get(Store);
- MemoryLocation LoadLoc = MemoryLocation::get(Load);
-
+
+ // If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
+ // 8, 4, 5, 6
+ SmallVector<int, 16> Mask;
+ unsigned i;
+ for (i = 0; i < I; i++)
+ Mask.push_back(i);
+
+ unsigned VecNumElts =
+ cast<FixedVectorType>(Col->getType())->getNumElements();
+ for (; i < I + BlockNumElts; i++)
+ Mask.push_back(i - I + VecNumElts);
+
+ for (; i < VecNumElts; i++)
+ Mask.push_back(i);
+
+ return Builder.CreateShuffleVector(Col, Block, Mask);
+ }
+
+ Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
+ IRBuilder<> &Builder, bool AllowContraction,
+ unsigned &NumComputeOps) {
+ NumComputeOps += getNumOps(A->getType());
+ if (!Sum)
+ return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
+
+ if (UseFPOp) {
+ if (AllowContraction) {
+ // Use fmuladd for floating point operations and let the backend decide
+ // if that's profitable.
+ Function *FMulAdd = Intrinsic::getDeclaration(
+ Func.getParent(), Intrinsic::fmuladd, A->getType());
+ return Builder.CreateCall(FMulAdd, {A, B, Sum});
+ }
+ NumComputeOps += getNumOps(A->getType());
+ Value *Mul = Builder.CreateFMul(A, B);
+ return Builder.CreateFAdd(Sum, Mul);
+ }
+
+ NumComputeOps += getNumOps(A->getType());
+ Value *Mul = Builder.CreateMul(A, B);
+ return Builder.CreateAdd(Sum, Mul);
+ }
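+  // Note (illustrative): with contraction allowed, Sum + A * B becomes a single
+  // llvm.fmuladd call that the backend may lower to a fused multiply-add;
+  // otherwise the separate fmul/fadd (or mul/add) pair above is emitted.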
+
+ /// Cache \p Matrix as result of \p Inst and update the uses of \p Inst. For
+ /// users with shape information, there's nothing to do: the will use the
+ /// cached value when they are lowered. For other users, \p Matrix is
+ /// flattened and the uses are updated to use it. Also marks \p Inst for
+ /// deletion.
+ void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
+ IRBuilder<> &Builder) {
+ Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
+
+ ToRemove.push_back(Inst);
+ Value *Flattened = nullptr;
+ for (auto I = Inst->use_begin(), E = Inst->use_end(); I != E;) {
+ Use &U = *I++;
+ if (ShapeMap.find(U.getUser()) == ShapeMap.end()) {
+ if (!Flattened)
+ Flattened = Matrix.embedInVector(Builder);
+ U.set(Flattened);
+ }
+ }
+ }
+
+ /// Compute \p Result += \p A * \p B for input matrices with left-associating
+ /// addition.
+ void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
+ const MatrixTy &B, bool AllowContraction,
+ IRBuilder<> &Builder, bool isTiled) {
+ const unsigned VF = std::max<unsigned>(
+ TTI.getRegisterBitWidth(true) /
+ Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+ unsigned R = Result.getNumRows();
+ unsigned C = Result.getNumColumns();
+ unsigned M = A.getNumColumns();
+
+ bool IsFP = Result.getElementType()->isFloatingPointTy();
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
+ unsigned NumComputeOps = 0;
+ if (A.isColumnMajor()) {
+ // Multiply columns from the first operand with scalars from the second
+      // operand. Then move along the K axis and accumulate the columns. With
+ // this the adds can be vectorized without reassociation.
+ for (unsigned J = 0; J < C; ++J) {
+ unsigned BlockSize = VF;
+ // If Result is zero, we don't need to accumulate in the K==0 iteration.
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
+
+ for (unsigned I = 0; I < R; I += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (I + BlockSize > R)
+ BlockSize /= 2;
+
+ Value *Sum = isTiled ? Result.extractVector(I, J, BlockSize, Builder)
+ : nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *L = A.extractVector(I, K, BlockSize, Builder);
+ Value *RH = Builder.CreateExtractElement(B.getColumn(J), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
+ Result.getElementType()->isFloatingPointTy(),
+ Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(J,
+ insertVector(Result.getVector(J), I, Sum, Builder));
+ }
+ }
+ } else {
+ // Multiply rows from the second operand with scalars from the first
+      // operand. Then move along the K axis and accumulate the rows. With this
+ // the adds can be vectorized without reassociation.
+ for (unsigned I = 0; I < R; ++I) {
+ unsigned BlockSize = VF;
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
+ for (unsigned J = 0; J < C; J += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (J + BlockSize > C)
+ BlockSize /= 2;
+
+ Value *Sum = nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *R = B.extractVector(K, J, BlockSize, Builder);
+ Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
+ IsFP, Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(I,
+ insertVector(Result.getVector(I), J, Sum, Builder));
+ }
+ }
+ }
+ Result.addNumComputeOps(NumComputeOps);
+ }
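+  // Sketch (illustrative) of the column-major path above for a 2 x 2 product:
+  //   Result.col(j) = A.col(0) * splat(B[0][j]) + A.col(1) * splat(B[1][j])
+  // so every add combines whole vectors and no reassociation is needed.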
+
+ /// Ensure that the memory in \p Load does not alias \p Store by potentially
+  /// copying it to a new location. The new location, or otherwise the original
+  /// one, is returned.
+ Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
+ CallInst *MatMul) {
+ MemoryLocation StoreLoc = MemoryLocation::get(Store);
+ MemoryLocation LoadLoc = MemoryLocation::get(Load);
+
AliasResult LdAliased = AA->alias(LoadLoc, StoreLoc);
-
- // If we can statically determine noalias we're good.
- if (!LdAliased)
- return Load->getPointerOperand();
-
- // Create code to check if the memory locations of the Load and Store
- // overlap and if they do, copy Load's operand to a new buffer.
-
-    // First, create new blocks for the 2nd part of the check and the copy.
- BasicBlock *Check0 = MatMul->getParent();
- // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
- // DT. Manually collect dominator tree updates, to avoid unnecessary work,
- // as we adjust Check0 and Check1's branches.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- for (BasicBlock *Succ : successors(Check0))
+
+ // If we can statically determine noalias we're good.
+ if (!LdAliased)
+ return Load->getPointerOperand();
+
+ // Create code to check if the memory locations of the Load and Store
+ // overlap and if they do, copy Load's operand to a new buffer.
+
+    // First, create new blocks for the 2nd part of the check and the copy.
+ BasicBlock *Check0 = MatMul->getParent();
+ // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
+ // DT. Manually collect dominator tree updates, to avoid unnecessary work,
+ // as we adjust Check0 and Check1's branches.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (BasicBlock *Succ : successors(Check0))
DTUpdates.push_back({DT->Delete, Check0, Succ});
-
+
BasicBlock *Check1 =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "alias_cont");
- BasicBlock *Copy =
+ BasicBlock *Copy =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "copy");
BasicBlock *Fusion =
SplitBlock(MatMul->getParent(), MatMul, (DomTreeUpdater *)nullptr, LI,
nullptr, "no_alias");
-
- // Check if the loaded memory location begins before the end of the store
- // location. If the condition holds, they might overlap, otherwise they are
- // guaranteed to not overlap.
- IRBuilder<> Builder(MatMul);
- Check0->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(Check0);
- Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
- Value *StoreBegin = Builder.CreatePtrToInt(
- const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
- Value *StoreEnd = Builder.CreateAdd(
- StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
- "store.end", true, true);
- Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
- IntPtrTy, "load.begin");
- Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
- Fusion);
-
- // Check if the store begins before the end of the load location. If the
- // condition holds, they alias, otherwise they are guaranteed to not
- // overlap.
- Check1->getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(Check1, Check1->begin());
- Value *LoadEnd = Builder.CreateAdd(
- LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
- "load.end", true, true);
- Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
- Fusion);
-
- // Copy load operand to new alloca.
- Builder.SetInsertPoint(Copy, Copy->begin());
- AllocaInst *NewLd =
- Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
- Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
- Load->getPointerOperand(), Load->getAlign(),
- LoadLoc.Size.getValue());
- Builder.SetInsertPoint(Fusion, Fusion->begin());
- PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
- PHI->addIncoming(Load->getPointerOperand(), Check0);
- PHI->addIncoming(Load->getPointerOperand(), Check1);
- PHI->addIncoming(NewLd, Copy);
-
- // Adjust DT.
+
+ // Check if the loaded memory location begins before the end of the store
+ // location. If the condition holds, they might overlap, otherwise they are
+ // guaranteed to not overlap.
+ IRBuilder<> Builder(MatMul);
+ Check0->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check0);
+ Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
+ Value *StoreBegin = Builder.CreatePtrToInt(
+ const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
+ Value *StoreEnd = Builder.CreateAdd(
+ StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
+ "store.end", true, true);
+ Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
+ IntPtrTy, "load.begin");
+ Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
+ Fusion);
+
+ // Check if the store begins before the end of the load location. If the
+ // condition holds, they alias, otherwise they are guaranteed to not
+ // overlap.
+ Check1->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check1, Check1->begin());
+ Value *LoadEnd = Builder.CreateAdd(
+ LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
+ "load.end", true, true);
+ Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
+ Fusion);
+
+ // Copy load operand to new alloca.
+ Builder.SetInsertPoint(Copy, Copy->begin());
+ AllocaInst *NewLd =
+ Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
+ Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
+ Load->getPointerOperand(), Load->getAlign(),
+ LoadLoc.Size.getValue());
+ Builder.SetInsertPoint(Fusion, Fusion->begin());
+ PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
+ PHI->addIncoming(Load->getPointerOperand(), Check0);
+ PHI->addIncoming(Load->getPointerOperand(), Check1);
+ PHI->addIncoming(NewLd, Copy);
+
+ // Adjust DT.
DTUpdates.push_back({DT->Insert, Check0, Check1});
DTUpdates.push_back({DT->Insert, Check0, Fusion});
DTUpdates.push_back({DT->Insert, Check1, Copy});
DTUpdates.push_back({DT->Insert, Check1, Fusion});
DT->applyUpdates(DTUpdates);
- return PHI;
- }
-
- bool isFusionProfitable(CallInst *MatMul) {
- if (ForceFusion)
- return true;
-
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- const unsigned M = LShape.NumColumns;
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
-
- const unsigned VF =
- std::max<unsigned>(TTI.getRegisterBitWidth(true) /
- EltType->getPrimitiveSizeInBits().getFixedSize(),
- 1U);
-
- // Cost model for tiling
- //
- // For tiling to be beneficial, we need reuse either along the R or
- // the C axis. We vectorize along the R axis so that means at least
- // 3 elements.
- // TODO: Also consider cost of copying if operands alias.
- if (R <= VF && C == 1)
- return false;
- // Then we need enough elements to exceed the number of vector
- // registers we have. Note that this is an oversimplification since
- // fusing also takes some extra loads which may exceed the number of
- // reloads necessary.
- unsigned Op0Regs = (R + VF - 1) / VF * M;
- unsigned Op1Regs = (M + VF - 1) / VF * C;
- return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
- }
-
- MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
- MatrixTy Res;
- auto *ColumType = FixedVectorType::get(EltType, R);
- for (unsigned I = 0; I < C; ++I)
- Res.addVector(ConstantAggregateZero::get(ColumType));
- return Res;
- }
-
+ return PHI;
+ }
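+  // The runtime check emitted above is the standard interval-overlap test
+  // (illustrative): [LoadBegin, LoadEnd) and [StoreBegin, StoreEnd) overlap iff
+  // LoadBegin < StoreEnd && StoreBegin < LoadEnd; only when both comparisons
+  // hold is the loaded region copied into a fresh alloca.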
+
+ bool isFusionProfitable(CallInst *MatMul) {
+ if (ForceFusion)
+ return true;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ const unsigned VF =
+ std::max<unsigned>(TTI.getRegisterBitWidth(true) /
+ EltType->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+
+ // Cost model for tiling
+ //
+ // For tiling to be beneficial, we need reuse either along the R or
+ // the C axis. We vectorize along the R axis so that means at least
+ // 3 elements.
+ // TODO: Also consider cost of copying if operands alias.
+ if (R <= VF && C == 1)
+ return false;
+ // Then we need enough elements to exceed the number of vector
+ // registers we have. Note that this is an oversimplification since
+ // fusing also takes some extra loads which may exceed the number of
+ // reloads necessary.
+ unsigned Op0Regs = (R + VF - 1) / VF * M;
+ unsigned Op1Regs = (M + VF - 1) / VF * C;
+ return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+ }
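+  // Worked example (illustrative) of the estimate above: for R == C == M == 8
+  // with 256-bit vectors of double (VF == 4), Op0Regs == Op1Regs ==
+  // (8 + 3) / 4 * 8 == 16, so 32 vector registers in total, which exceeds a
+  // typical 16-register file and makes fusion look profitable.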
+
+ MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
+ MatrixTy Res;
+ auto *ColumType = FixedVectorType::get(EltType, R);
+ for (unsigned I = 0; I < C; ++I)
+ Res.addVector(ConstantAggregateZero::get(ColumType));
+ return Res;
+ }
+
void createTiledLoops(CallInst *MatMul, Value *LPtr, ShapeInfo LShape,
Value *RPtr, ShapeInfo RShape, StoreInst *Store,
bool AllowContract) {
@@ -1266,28 +1266,28 @@ public:
"llvm.loop.unroll.count", InnerLoopUnrollCount);
}
- void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
- StoreInst *Store,
- SmallPtrSetImpl<Instruction *> &FusedInsts) {
- assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
- "Tiling only supported for column-major matrixes at the moment!");
- if (!isFusionProfitable(MatMul))
- return;
-
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- const unsigned M = LShape.NumColumns;
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
-
- Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
- Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
- Value *CPtr = Store->getPointerOperand();
-
- bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
- MatMul->hasAllowContract());
+ void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
+ StoreInst *Store,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Tiling only supported for column-major matrixes at the moment!");
+ if (!isFusionProfitable(MatMul))
+ return;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
+ Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+ Value *CPtr = Store->getPointerOperand();
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
if (TileUseLoops && (R % TileSize == 0 && C % TileSize == 0))
createTiledLoops(MatMul, APtr, LShape, BPtr, RShape, Store,
AllowContract);
@@ -1298,7 +1298,7 @@ public:
const unsigned TileR = std::min(R - I, unsigned(TileSize));
const unsigned TileC = std::min(C - J, unsigned(TileSize));
MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
-
+
for (unsigned K = 0; K < M; K += TileSize) {
const unsigned TileM = std::min(M - K, unsigned(TileSize));
MatrixTy A =
@@ -1314,192 +1314,192 @@ public:
storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
Builder.getInt64(I), Builder.getInt64(J), EltType,
Builder);
- }
+ }
+ }
+
+ // Mark eliminated instructions as fused and remove them.
+ FusedInsts.insert(Store);
+ FusedInsts.insert(MatMul);
+ Store->eraseFromParent();
+ MatMul->eraseFromParent();
+ if (LoadOp0->hasNUses(0)) {
+ FusedInsts.insert(LoadOp0);
+ LoadOp0->eraseFromParent();
}
-
- // Mark eliminated instructions as fused and remove them.
- FusedInsts.insert(Store);
- FusedInsts.insert(MatMul);
- Store->eraseFromParent();
- MatMul->eraseFromParent();
- if (LoadOp0->hasNUses(0)) {
- FusedInsts.insert(LoadOp0);
- LoadOp0->eraseFromParent();
- }
- if (LoadOp1->hasNUses(0)) {
- FusedInsts.insert(LoadOp1);
- LoadOp1->eraseFromParent();
- }
- }
-
- /// Try to lower matrix multiply chains by fusing operations.
- ///
- /// Currently we only lower {ld, ld} -> matmul -> st chains.
-  ///
- /// No need to return a MatrixTy object for the result of the operation, since
- /// the single store user will be lowered as part of this. Instructions that
- /// are completely eliminated by fusion are added to \p FusedInsts.
- void LowerMatrixMultiplyFused(CallInst *MatMul,
- SmallPtrSetImpl<Instruction *> &FusedInsts) {
- if (!FuseMatrix || !MatMul->hasOneUse() ||
+ if (LoadOp1->hasNUses(0)) {
+ FusedInsts.insert(LoadOp1);
+ LoadOp1->eraseFromParent();
+ }
+ }
+
+ /// Try to lower matrix multiply chains by fusing operations.
+ ///
+ /// Currently we only lower {ld, ld} -> matmul -> st chains.
+  ///
+ /// No need to return a MatrixTy object for the result of the operation, since
+ /// the single store user will be lowered as part of this. Instructions that
+ /// are completely eliminated by fusion are added to \p FusedInsts.
+ void LowerMatrixMultiplyFused(CallInst *MatMul,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ if (!FuseMatrix || !MatMul->hasOneUse() ||
MatrixLayout != MatrixLayoutTy::ColumnMajor || !DT)
- return;
-
+ return;
+
assert(AA && LI && "Analyses should be available");
- auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
- auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
- auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
- if (LoadOp0 && LoadOp1 && Store) {
- // The store address must dominate the MatMul instruction, otherwise
- // we create invalid IR.
- // FIXME: See if we can hoist the store address computation.
- auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
+ auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
+ auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
+ auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
+ if (LoadOp0 && LoadOp1 && Store) {
+ // The store address must dominate the MatMul instruction, otherwise
+ // we create invalid IR.
+ // FIXME: See if we can hoist the store address computation.
+ auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
if (AddrI && (!DT->dominates(AddrI, MatMul)))
- return;
-
- emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
- return;
- }
- }
-
- /// Lowers llvm.matrix.multiply.
- void LowerMultiply(CallInst *MatMul) {
- IRBuilder<> Builder(MatMul);
- auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
- ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
- ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
-
- const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
- const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+ return;
+
+ emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
+ return;
+ }
+ }
+
+ /// Lowers llvm.matrix.multiply.
+ void LowerMultiply(CallInst *MatMul) {
+ IRBuilder<> Builder(MatMul);
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+ const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
assert(Lhs.getElementType() == Rhs.getElementType() &&
"Matrix multiply argument element types do not match.");
-
- const unsigned R = LShape.NumRows;
- const unsigned C = RShape.NumColumns;
- assert(LShape.NumColumns == RShape.NumRows);
-
- // Initialize the output
- MatrixTy Result(R, C, EltType);
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ assert(LShape.NumColumns == RShape.NumRows);
+
+ // Initialize the output
+ MatrixTy Result(R, C, EltType);
assert(Lhs.getElementType() == Result.getElementType() &&
"Matrix multiply result element type does not match arguments.");
-
- bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
- MatMul->hasAllowContract());
- emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
- finalizeLowering(MatMul, Result, Builder);
- }
-
- /// Lowers llvm.matrix.transpose.
- void LowerTranspose(CallInst *Inst) {
- MatrixTy Result;
- IRBuilder<> Builder(Inst);
- Value *InputVal = Inst->getArgOperand(0);
- VectorType *VectorTy = cast<VectorType>(InputVal->getType());
- ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
- MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
-
- const unsigned NewNumVecs =
- InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
- const unsigned NewNumElts =
- InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
-
- for (unsigned I = 0; I < NewNumVecs; ++I) {
- // Build a single result vector. First initialize it.
- Value *ResultVector = UndefValue::get(
- FixedVectorType::get(VectorTy->getElementType(), NewNumElts));
-      // Go through the old elements and insert them into the resulting vector.
- for (auto J : enumerate(InputMatrix.vectors())) {
- Value *Elt = Builder.CreateExtractElement(J.value(), I);
- // Row and column indices are transposed.
- ResultVector =
- Builder.CreateInsertElement(ResultVector, Elt, J.index());
- }
- Result.addVector(ResultVector);
- }
-
- // TODO: Improve estimate of operations needed for transposes. Currently we
- // just count the insertelement/extractelement instructions, but do not
- // account for later simplifications/combines.
- finalizeLowering(
- Inst,
- Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
- Builder);
- }
-
- /// Lower load instructions, if shape information is available.
- bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
- auto I = ShapeMap.find(Inst);
- if (I == ShapeMap.end())
- return false;
-
- LowerLoad(Inst, Ptr, Inst->getAlign(),
- Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
- I->second);
- return true;
- }
-
- bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
- IRBuilder<> &Builder) {
- auto I = ShapeMap.find(StoredVal);
- if (I == ShapeMap.end())
- return false;
-
- LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
- Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
- I->second);
- return true;
- }
-
- /// Lower binary operators, if shape information is available.
- bool VisitBinaryOperator(BinaryOperator *Inst) {
- auto I = ShapeMap.find(Inst);
- if (I == ShapeMap.end())
- return false;
-
- Value *Lhs = Inst->getOperand(0);
- Value *Rhs = Inst->getOperand(1);
-
- IRBuilder<> Builder(Inst);
- ShapeInfo &Shape = I->second;
-
- MatrixTy Result;
- MatrixTy A = getMatrix(Lhs, Shape, Builder);
- MatrixTy B = getMatrix(Rhs, Shape, Builder);
- assert(A.isColumnMajor() == B.isColumnMajor() &&
- Result.isColumnMajor() == A.isColumnMajor() &&
- "operands must agree on matrix layout");
-
- // Helper to perform binary op on vectors.
- auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
- switch (Inst->getOpcode()) {
- case Instruction::Add:
- return Builder.CreateAdd(LHS, RHS);
- case Instruction::Mul:
- return Builder.CreateMul(LHS, RHS);
- case Instruction::Sub:
- return Builder.CreateSub(LHS, RHS);
- case Instruction::FAdd:
- return Builder.CreateFAdd(LHS, RHS);
- case Instruction::FMul:
- return Builder.CreateFMul(LHS, RHS);
- case Instruction::FSub:
- return Builder.CreateFSub(LHS, RHS);
- default:
- llvm_unreachable("Unsupported binary operator for matrix");
- }
- };
-
- for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
- Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));
-
- finalizeLowering(Inst,
- Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
- Result.getNumVectors()),
- Builder);
- return true;
- }
-
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
+ emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
+ finalizeLowering(MatMul, Result, Builder);
+ }
+
+ /// Lowers llvm.matrix.transpose.
+ void LowerTranspose(CallInst *Inst) {
+ MatrixTy Result;
+ IRBuilder<> Builder(Inst);
+ Value *InputVal = Inst->getArgOperand(0);
+ VectorType *VectorTy = cast<VectorType>(InputVal->getType());
+ ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
+ MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+ const unsigned NewNumVecs =
+ InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
+ const unsigned NewNumElts =
+ InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
+
+ for (unsigned I = 0; I < NewNumVecs; ++I) {
+ // Build a single result vector. First initialize it.
+ Value *ResultVector = UndefValue::get(
+ FixedVectorType::get(VectorTy->getElementType(), NewNumElts));
+      // Go through the old elements and insert them into the resulting vector.
+ for (auto J : enumerate(InputMatrix.vectors())) {
+ Value *Elt = Builder.CreateExtractElement(J.value(), I);
+ // Row and column indices are transposed.
+ ResultVector =
+ Builder.CreateInsertElement(ResultVector, Elt, J.index());
+ }
+ Result.addVector(ResultVector);
+ }
+
+ // TODO: Improve estimate of operations needed for transposes. Currently we
+ // just count the insertelement/extractelement instructions, but do not
+ // account for later simplifications/combines.
+ finalizeLowering(
+ Inst,
+ Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
+ Builder);
+ }
+
+ /// Lower load instructions, if shape information is available.
+ bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerLoad(Inst, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
+ return true;
+ }
+
+ bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
+ IRBuilder<> &Builder) {
+ auto I = ShapeMap.find(StoredVal);
+ if (I == ShapeMap.end())
+ return false;
+
+ LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
+ return true;
+ }
+
+ /// Lower binary operators, if shape information is available.
+ bool VisitBinaryOperator(BinaryOperator *Inst) {
+ auto I = ShapeMap.find(Inst);
+ if (I == ShapeMap.end())
+ return false;
+
+ Value *Lhs = Inst->getOperand(0);
+ Value *Rhs = Inst->getOperand(1);
+
+ IRBuilder<> Builder(Inst);
+ ShapeInfo &Shape = I->second;
+
+ MatrixTy Result;
+ MatrixTy A = getMatrix(Lhs, Shape, Builder);
+ MatrixTy B = getMatrix(Rhs, Shape, Builder);
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
+
+ // Helper to perform binary op on vectors.
+ auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Add:
+ return Builder.CreateAdd(LHS, RHS);
+ case Instruction::Mul:
+ return Builder.CreateMul(LHS, RHS);
+ case Instruction::Sub:
+ return Builder.CreateSub(LHS, RHS);
+ case Instruction::FAdd:
+ return Builder.CreateFAdd(LHS, RHS);
+ case Instruction::FMul:
+ return Builder.CreateFMul(LHS, RHS);
+ case Instruction::FSub:
+ return Builder.CreateFSub(LHS, RHS);
+ default:
+ llvm_unreachable("Unsupported binary operator for matrix");
+ }
+ };
+
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+ Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
+ return true;
+ }
+
/// Lower unary operators, if shape information is available.
bool VisitUnaryOperator(UnaryOperator *Inst) {
auto I = ShapeMap.find(Inst);
@@ -1534,449 +1534,449 @@ public:
return true;
}
- /// Helper to linearize a matrix expression tree into a string. Currently
-  /// matrix expressions are linearized by starting at an expression leaf and
- /// linearizing bottom up.
- struct ExprLinearizer {
- unsigned LengthToBreak = 100;
- std::string Str;
- raw_string_ostream Stream;
- unsigned LineLength = 0;
- const DataLayout &DL;
-
- /// Mapping from instructions to matrixes. It is used to identify
- /// matrix instructions.
- const MapVector<Value *, MatrixTy> &Inst2Matrix;
-
- /// Mapping from values to the leaves of all expressions that the value is
- /// part of.
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
-
- /// Set of matrix expressions in the scope of a given DISubprogram.
- const SmallSetVector<Value *, 32> &ExprsInSubprogram;
-
- /// Leaf node of the expression to linearize.
- Value *Leaf;
-
- /// Used to keep track of sub-expressions that get reused while linearizing
- /// the expression. Re-used sub-expressions are marked as (reused).
- SmallPtrSet<Value *, 8> ReusedExprs;
-
- ExprLinearizer(const DataLayout &DL,
- const MapVector<Value *, MatrixTy> &Inst2Matrix,
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- Value *Leaf)
- : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
- ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
-
- void indent(unsigned N) {
- LineLength += N;
- for (unsigned i = 0; i < N; i++)
- Stream << " ";
- }
-
- void lineBreak() {
- Stream << "\n";
- LineLength = 0;
- }
-
- void maybeIndent(unsigned Indent) {
- if (LineLength >= LengthToBreak)
- lineBreak();
-
- if (LineLength == 0)
- indent(Indent);
- }
-
- void write(StringRef S) {
- LineLength += S.size();
- Stream << S;
- }
-
- Value *getUnderlyingObjectThroughLoads(Value *V) {
- if (Value *Ptr = getPointerOperand(V))
- return getUnderlyingObjectThroughLoads(Ptr);
- else if (V->getType()->isPointerTy())
+ /// Helper to linearize a matrix expression tree into a string. Currently
+  /// matrix expressions are linearized by starting at an expression leaf and
+ /// linearizing bottom up.
+ struct ExprLinearizer {
+ unsigned LengthToBreak = 100;
+ std::string Str;
+ raw_string_ostream Stream;
+ unsigned LineLength = 0;
+ const DataLayout &DL;
+
+ /// Mapping from instructions to matrixes. It is used to identify
+ /// matrix instructions.
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+
+ /// Mapping from values to the leaves of all expressions that the value is
+ /// part of.
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
+
+ /// Set of matrix expressions in the scope of a given DISubprogram.
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
+ /// Leaf node of the expression to linearize.
+ Value *Leaf;
+
+ /// Used to keep track of sub-expressions that get reused while linearizing
+ /// the expression. Re-used sub-expressions are marked as (reused).
+ SmallPtrSet<Value *, 8> ReusedExprs;
+
+ ExprLinearizer(const DataLayout &DL,
+ const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ Value *Leaf)
+ : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
+ ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
+
+ void indent(unsigned N) {
+ LineLength += N;
+ for (unsigned i = 0; i < N; i++)
+ Stream << " ";
+ }
+
+ void lineBreak() {
+ Stream << "\n";
+ LineLength = 0;
+ }
+
+ void maybeIndent(unsigned Indent) {
+ if (LineLength >= LengthToBreak)
+ lineBreak();
+
+ if (LineLength == 0)
+ indent(Indent);
+ }
+
+ void write(StringRef S) {
+ LineLength += S.size();
+ Stream << S;
+ }
+
+ Value *getUnderlyingObjectThroughLoads(Value *V) {
+ if (Value *Ptr = getPointerOperand(V))
+ return getUnderlyingObjectThroughLoads(Ptr);
+ else if (V->getType()->isPointerTy())
return getUnderlyingObject(V);
- return V;
- }
-
- /// Returns true if \p V is a matrix value in the given subprogram.
- bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
-
-    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
- /// \p SS.
- void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
- auto M = Inst2Matrix.find(V);
- if (M == Inst2Matrix.end())
- SS << "unknown";
- else {
- SS << M->second.getNumRows();
- SS << "x";
- SS << M->second.getNumColumns();
- }
- }
-
- /// Write the called function name. Handles calls to llvm.matrix.*
- /// specially: we write the name, followed by the dimensions of the input
- /// matrixes, followed by the scalar type name.
- void writeFnName(CallInst *CI) {
- if (!CI->getCalledFunction())
- write("<no called fn>");
- else {
- StringRef Name = CI->getCalledFunction()->getName();
- if (!Name.startswith("llvm.matrix")) {
- write(Name);
- return;
- }
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
- write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
- .drop_front(StringRef("llvm.matrix.").size()));
- write(".");
+ return V;
+ }
+
+ /// Returns true if \p V is a matrix value in the given subprogram.
+ bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
+
+    /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
+ /// \p SS.
+ void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
+ auto M = Inst2Matrix.find(V);
+ if (M == Inst2Matrix.end())
+ SS << "unknown";
+ else {
+ SS << M->second.getNumRows();
+ SS << "x";
+ SS << M->second.getNumColumns();
+ }
+ }
+
+ /// Write the called function name. Handles calls to llvm.matrix.*
+ /// specially: we write the name, followed by the dimensions of the input
+ /// matrixes, followed by the scalar type name.
+ void writeFnName(CallInst *CI) {
+ if (!CI->getCalledFunction())
+ write("<no called fn>");
+ else {
+ StringRef Name = CI->getCalledFunction()->getName();
+ if (!Name.startswith("llvm.matrix")) {
+ write(Name);
+ return;
+ }
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
+ .drop_front(StringRef("llvm.matrix.").size()));
+ write(".");
std::string Tmp;
- raw_string_ostream SS(Tmp);
-
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << ".";
- prettyPrintMatrixType(II->getOperand(1), SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_transpose:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_column_major_load:
- prettyPrintMatrixType(II, SS);
- SS << "." << *II->getType()->getScalarType();
- break;
- case Intrinsic::matrix_column_major_store:
- prettyPrintMatrixType(II->getOperand(0), SS);
- SS << "." << *II->getOperand(0)->getType()->getScalarType();
- break;
- default:
- llvm_unreachable("Unhandled case");
- }
- SS.flush();
- write(Tmp);
- }
- }
-
- unsigned getNumShapeArgs(CallInst *CI) const {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::matrix_multiply:
- return 3;
- case Intrinsic::matrix_transpose:
- return 2;
- case Intrinsic::matrix_column_major_load:
- case Intrinsic::matrix_column_major_store:
- return 3;
- default:
- return 0;
- }
- }
- return 0;
- }
-
-    /// Special printing for values: for pointers, we print whether they refer
-    /// to an (function) external address or a stack address; for other values
-    /// we either print the constant or "scalar"/"matrix".
- void write(Value *V) {
- V = getUnderlyingObjectThroughLoads(V);
- if (V->getType()->isPointerTy()) {
- if (isa<AllocaInst>(V)) {
- Stream << "stack addr";
- LineLength += StringRef("stack addr").size();
- } else {
- Stream << "addr";
- LineLength += StringRef("addr").size();
- }
- if (!V->getName().empty()) {
- Stream << " %" << V->getName() << "";
- LineLength += V->getName().size() + 2;
- }
- return;
- }
-
- std::string Tmp;
- raw_string_ostream TmpStream(Tmp);
-
- if (auto *CI = dyn_cast<ConstantInt>(V))
- TmpStream << CI->getValue();
- else if (isa<Constant>(V))
- TmpStream << "constant";
- else {
- if (isMatrix(V))
- TmpStream << "matrix";
- else
- TmpStream << "scalar";
- }
- TmpStream.flush();
- Tmp = std::string(StringRef(Tmp).trim());
- LineLength += Tmp.size();
- Stream << Tmp;
- }
-
- /// Linearize expression \p Expr starting at an indentation of \p Indent.
- /// Expressions that are re-used multiple times are prefixed with (reused)
- /// at the re-used root instruction.
- void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
- bool ParentShared) {
- auto *I = cast<Instruction>(Expr);
- maybeIndent(Indent);
- SmallVector<Value *, 8> Ops;
-
- // Is Expr shared with other expression leaves?
- bool ExprShared = false;
-
- // Deal with shared subtrees. Mark them as shared, if required.
- if (!ParentShared) {
- auto SI = Shared.find(Expr);
- assert(SI != Shared.end() && SI->second.count(Leaf));
-
- for (Value *S : SI->second) {
- if (S == Leaf)
- continue;
- DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
- write("shared with remark at line " + std::to_string(DL.getLine()) +
- " column " + std::to_string(DL.getCol()) + " (");
- }
- ExprShared = SI->second.size() > 1;
- }
-
- bool Reused = !ReusedExprs.insert(Expr).second;
- if (Reused && !ParentReused)
- write("(reused) ");
-
- if (auto *CI = dyn_cast<CallInst>(I)) {
- writeFnName(CI);
-
- Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
- } else if (isa<BitCastInst>(Expr)) {
- // Special case bitcasts, which are used to materialize matrixes from
- // non-matrix ops.
- write("matrix");
- return;
- } else {
- Ops.append(I->value_op_begin(), I->value_op_end());
- write(std::string(I->getOpcodeName()));
- }
-
- write(std::string("("));
-
- unsigned NumOpsToBreak = 1;
- if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
- NumOpsToBreak = 2;
-
- for (Value *Op : Ops) {
- if (Ops.size() > NumOpsToBreak)
- lineBreak();
-
- maybeIndent(Indent + 1);
- if (isMatrix(Op))
- linearizeExpr(Op, Indent + 1, Reused, ExprShared);
- else
- write(Op);
- if (Op != Ops.back())
- write(", ");
- }
-
- write(")");
- }
-
- const std::string &getResult() {
- Stream.flush();
- return Str;
- }
- };
-
- /// Generate remarks for matrix operations in a function. To generate remarks
- /// for matrix expressions, the following approach is used:
- /// 1. Use the inlined-at debug information to group matrix operations to the
- /// DISubprograms they are contained in.
- /// 2. Collect leaves of matrix expressions (done in
- /// RemarkGenerator::getExpressionLeaves) for each subprogram - expression
-  ///    mapping. Leaves are lowered matrix instructions without other matrix
-  ///    users (like stores) in the current subprogram.
-  /// 3. For each leaf, create a remark containing a linearized version of the
- /// matrix expression. The expression is linearized by a recursive
- /// bottom-up traversal of the matrix operands, starting at a leaf. Note
- /// that multiple leaves can share sub-expressions. Shared subexpressions
- /// are explicitly marked as shared().
- struct RemarkGenerator {
- const MapVector<Value *, MatrixTy> &Inst2Matrix;
- OptimizationRemarkEmitter &ORE;
- Function &Func;
- const DataLayout &DL;
-
- RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
- OptimizationRemarkEmitter &ORE, Function &Func)
- : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
- DL(Func.getParent()->getDataLayout()) {}
-
- /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
- /// instructions in Inst2Matrix returning void or without any users in
- /// \p ExprsInSubprogram. Currently that should only include stores.
- SmallVector<Value *, 4>
- getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
- SmallVector<Value *, 4> Leaves;
- for (auto *Expr : ExprsInSubprogram)
- if (Expr->getType()->isVoidTy() ||
- !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
- return ExprsInSubprogram.count(U);
- }))
- Leaves.push_back(Expr);
- return Leaves;
- }
-
- /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
- /// to all visited expressions in \p Shared. Limit the matrix operations to
- /// the ones in \p ExprsInSubprogram.
- void collectSharedInfo(Value *Leaf, Value *V,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
-
- if (!ExprsInSubprogram.count(V))
- return;
-
- auto I = Shared.insert({V, {}});
- I.first->second.insert(Leaf);
-
- for (Value *Op : cast<Instruction>(V)->operand_values())
- collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
- }
-
- /// Calculate the number of exclusive and shared op counts for expression
- /// starting at \p V. Expressions used multiple times are counted once.
- /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
- std::pair<OpInfoTy, OpInfoTy>
- sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
- if (!ExprsInSubprogram.count(Root))
- return {};
-
- // Already counted this expression. Stop.
- if (!ReusedExprs.insert(Root).second)
- return {};
-
- OpInfoTy SharedCount;
- OpInfoTy Count;
-
- auto I = Shared.find(Root);
- auto CM = Inst2Matrix.find(Root);
- if (I->second.size() == 1)
- Count = CM->second.getOpInfo();
- else
- SharedCount = CM->second.getOpInfo();
-
- for (Value *Op : cast<Instruction>(Root)->operand_values()) {
- auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
- Count += C.first;
- SharedCount += C.second;
- }
- return {Count, SharedCount};
- }
-
- void emitRemarks() {
- if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
- return;
-
-      // Map matrix operations to their containing subprograms, by traversing
- // the inlinedAt chain. If the function does not have a DISubprogram, we
- // only map them to the containing function.
- MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
- for (auto &KV : Inst2Matrix) {
- if (Func.getSubprogram()) {
- auto *I = cast<Instruction>(KV.first);
- DILocation *Context = I->getDebugLoc();
- while (Context) {
- auto I =
- Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
- I.first->second.push_back(KV.first);
- Context = DebugLoc(Context).getInlinedAt();
- }
- } else {
- auto I = Subprog2Exprs.insert({nullptr, {}});
- I.first->second.push_back(KV.first);
- }
- }
- for (auto &KV : Subprog2Exprs) {
- SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
- KV.second.end());
- auto Leaves = getExpressionLeaves(ExprsInSubprogram);
-
- DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
- for (Value *Leaf : Leaves)
- collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
-
- // Generate remarks for each leaf.
- for (auto *L : Leaves) {
-
- DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
- DILocation *Context = cast<Instruction>(L)->getDebugLoc();
- while (Context) {
- if (getSubprogram(Context->getScope()) == KV.first) {
- Loc = Context;
- break;
- }
- Context = DebugLoc(Context).getInlinedAt();
- }
-
- SmallPtrSet<Value *, 8> ReusedExprs;
- OpInfoTy Counts, SharedCounts;
- std::tie(Counts, SharedCounts) =
- sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
-
- OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
- cast<Instruction>(L)->getParent());
-
- Rem << "Lowered with ";
- Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
- << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
- << ore::NV("NumComputeOps", Counts.NumComputeOps)
- << " compute ops";
-
- if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
- SharedCounts.NumComputeOps > 0) {
- Rem << ",\nadditionally "
- << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
- << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
- << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
- << " compute ops"
- << " are shared with other expressions";
- }
-
- Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
- ORE.emit(Rem);
- }
- }
- }
-
- std::string
- linearize(Value *L,
- const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
- const SmallSetVector<Value *, 32> &ExprsInSubprogram,
- const DataLayout &DL) {
- ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
- Lin.linearizeExpr(L, 0, false, false);
- return Lin.getResult();
- }
- };
-};
-} // namespace
-
-PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ raw_string_ostream SS(Tmp);
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << ".";
+ prettyPrintMatrixType(II->getOperand(1), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_transpose:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_load:
+ prettyPrintMatrixType(II, SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_store:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getOperand(0)->getType()->getScalarType();
+ break;
+ default:
+ llvm_unreachable("Unhandled case");
+ }
+ SS.flush();
+ write(Tmp);
+ }
+ }
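+ // Illustrative sketch (shapes made up, not taken from the source): for a
+ // multiply of a 2x6 by a 6x2 matrix of doubles, the suffix built above is
+ // "2x6.6x2.double", yielding a printed name along the lines of
+ // multiply.2x6.6x2.double.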
+
+ unsigned getNumShapeArgs(CallInst *CI) const {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ return 3;
+ case Intrinsic::matrix_transpose:
+ return 2;
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ return 3;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+ }
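+ // For example, llvm.matrix.multiply takes its two operands followed by
+ // three i32 shape arguments (rows, inner dimension, columns), and
+ // llvm.matrix.transpose takes two (rows, columns); these trailing shape
+ // arguments are dropped when the call's operands are linearized below.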
+
+ /// Special printing for values: for pointers, we print whether they refer
+ /// to a function-external address or a stack address; for other values we
+ /// print either the constant or "scalar"/"matrix".
+ void write(Value *V) {
+ V = getUnderlyingObjectThroughLoads(V);
+ if (V->getType()->isPointerTy()) {
+ if (isa<AllocaInst>(V)) {
+ Stream << "stack addr";
+ LineLength += StringRef("stack addr").size();
+ } else {
+ Stream << "addr";
+ LineLength += StringRef("addr").size();
+ }
+ if (!V->getName().empty()) {
+ Stream << " %" << V->getName() << "";
+ LineLength += V->getName().size() + 2;
+ }
+ return;
+ }
+
+ std::string Tmp;
+ raw_string_ostream TmpStream(Tmp);
+
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ TmpStream << CI->getValue();
+ else if (isa<Constant>(V))
+ TmpStream << "constant";
+ else {
+ if (isMatrix(V))
+ TmpStream << "matrix";
+ else
+ TmpStream << "scalar";
+ }
+ TmpStream.flush();
+ Tmp = std::string(StringRef(Tmp).trim());
+ LineLength += Tmp.size();
+ Stream << Tmp;
+ }
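+ // Hypothetical outputs of the cases above: a named alloca %A prints as
+ // "stack addr %A", any other pointer as "addr %p", a ConstantInt as its
+ // value (e.g. "8"), other constants as "constant", and remaining values as
+ // "matrix" or "scalar" depending on isMatrix().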
+
+ /// Linearize expression \p Expr starting at an indentation of \p Indent.
+ /// Expressions that are re-used multiple times are prefixed with (reused)
+ /// at the re-used root instruction.
+ void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
+ bool ParentShared) {
+ auto *I = cast<Instruction>(Expr);
+ maybeIndent(Indent);
+ SmallVector<Value *, 8> Ops;
+
+ // Is Expr shared with other expression leaves?
+ bool ExprShared = false;
+
+ // Deal with shared subtrees. Mark them as shared, if required.
+ if (!ParentShared) {
+ auto SI = Shared.find(Expr);
+ assert(SI != Shared.end() && SI->second.count(Leaf));
+
+ for (Value *S : SI->second) {
+ if (S == Leaf)
+ continue;
+ DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
+ write("shared with remark at line " + std::to_string(DL.getLine()) +
+ " column " + std::to_string(DL.getCol()) + " (");
+ }
+ ExprShared = SI->second.size() > 1;
+ }
+
+ bool Reused = !ReusedExprs.insert(Expr).second;
+ if (Reused && !ParentReused)
+ write("(reused) ");
+
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ writeFnName(CI);
+
+ Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
+ } else if (isa<BitCastInst>(Expr)) {
+ // Special case bitcasts, which are used to materialize matrices from
+ // non-matrix ops.
+ write("matrix");
+ return;
+ } else {
+ Ops.append(I->value_op_begin(), I->value_op_end());
+ write(std::string(I->getOpcodeName()));
+ }
+
+ write(std::string("("));
+
+ unsigned NumOpsToBreak = 1;
+ if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
+ NumOpsToBreak = 2;
+
+ for (Value *Op : Ops) {
+ if (Ops.size() > NumOpsToBreak)
+ lineBreak();
+
+ maybeIndent(Indent + 1);
+ if (isMatrix(Op))
+ linearizeExpr(Op, Indent + 1, Reused, ExprShared);
+ else
+ write(Op);
+ if (Op != Ops.back())
+ write(", ");
+ }
+
+ write(")");
+ }
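+ // An illustrative rendering (shapes and value names are made up): a leaf
+ // store of a 2x6 * 6x2 double multiply might linearize to
+ //   store(
+ //    multiply.2x6.6x2.double(
+ //     load(addr %A),
+ //     load(addr %B)),
+ //    addr %C)
+ // with "(reused)" and "shared with remark at line ... column ..." markers
+ // prepended where sub-expressions are re-used or shared between leaves.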
+
+ const std::string &getResult() {
+ Stream.flush();
+ return Str;
+ }
+ };
+
+ /// Generate remarks for matrix operations in a function. To generate remarks
+ /// for matrix expressions, the following approach is used:
+ /// 1. Use the inlined-at debug information to group matrix operations to the
+ /// DISubprograms they are contained in.
+ /// 2. Collect leaves of matrix expressions (done in
+ /// RemarkGenerator::getExpressionLeaves) for each subprogram-to-expressions
+ /// mapping. Leaves are lowered matrix instructions without other matrix
+ /// users (like stores) in the current subprogram.
+ /// 3. For each leaf, create a remark containing a linearized version of the
+ /// matrix expression. The expression is linearized by a recursive
+ /// bottom-up traversal of the matrix operands, starting at a leaf. Note
+ /// that multiple leaves can share sub-expressions. Shared subexpressions
+ /// are explicitly marked as shared().
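+ /// For illustration (not taken from the source), an emitted remark looks
+ /// roughly like
+ ///   remark: matmul.cpp:35:42: Lowered with 6 stores, 6 loads, 24 compute ops
+ /// followed by the linearized expression; counts for sub-expressions shared
+ /// with other leaves are reported separately in an "additionally ... are
+ /// shared with other expressions" suffix.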
+ struct RemarkGenerator {
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+ OptimizationRemarkEmitter &ORE;
+ Function &Func;
+ const DataLayout &DL;
+
+ RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ OptimizationRemarkEmitter &ORE, Function &Func)
+ : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
+ DL(Func.getParent()->getDataLayout()) {}
+
+ /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
+ /// instructions in Inst2Matrix returning void or without any users in
+ /// \p ExprsInSubprogram. Currently that should only include stores.
+ SmallVector<Value *, 4>
+ getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
+ SmallVector<Value *, 4> Leaves;
+ for (auto *Expr : ExprsInSubprogram)
+ if (Expr->getType()->isVoidTy() ||
+ !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
+ return ExprsInSubprogram.count(U);
+ }))
+ Leaves.push_back(Expr);
+ return Leaves;
+ }
+
+ /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
+ /// to all visited expressions in \p Shared. Limit the matrix operations to
+ /// the ones in \p ExprsInSubprogram.
+ void collectSharedInfo(Value *Leaf, Value *V,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
+
+ if (!ExprsInSubprogram.count(V))
+ return;
+
+ auto I = Shared.insert({V, {}});
+ I.first->second.insert(Leaf);
+
+ for (Value *Op : cast<Instruction>(V)->operand_values())
+ collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
+ }
+
+ /// Calculate the number of exclusive and shared op counts for expression
+ /// starting at \p Root. Expressions used multiple times are counted once.
+ /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
+ std::pair<OpInfoTy, OpInfoTy>
+ sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+ if (!ExprsInSubprogram.count(Root))
+ return {};
+
+ // Already counted this expression. Stop.
+ if (!ReusedExprs.insert(Root).second)
+ return {};
+
+ OpInfoTy SharedCount;
+ OpInfoTy Count;
+
+ auto I = Shared.find(Root);
+ auto CM = Inst2Matrix.find(Root);
+ if (I->second.size() == 1)
+ Count = CM->second.getOpInfo();
+ else
+ SharedCount = CM->second.getOpInfo();
+
+ for (Value *Op : cast<Instruction>(Root)->operand_values()) {
+ auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
+ Count += C.first;
+ SharedCount += C.second;
+ }
+ return {Count, SharedCount};
+ }
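+ // Worked example (hypothetical): a transpose whose result feeds multiplies
+ // belonging to two different leaves has two entries in Shared, so its op
+ // counts are attributed to SharedCount (reported as shared with other
+ // expressions) rather than to the exclusive Count of either leaf.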
+
+ void emitRemarks() {
+ if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
+ return;
+
+ // Map matrix operations to their containing subprograms, by traversing
+ // the inlinedAt chain. If the function does not have a DISubprogram, we
+ // only map them to the containing function.
+ MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
+ for (auto &KV : Inst2Matrix) {
+ if (Func.getSubprogram()) {
+ auto *I = cast<Instruction>(KV.first);
+ DILocation *Context = I->getDebugLoc();
+ while (Context) {
+ auto I =
+ Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
+ I.first->second.push_back(KV.first);
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+ } else {
+ auto I = Subprog2Exprs.insert({nullptr, {}});
+ I.first->second.push_back(KV.first);
+ }
+ }
+ for (auto &KV : Subprog2Exprs) {
+ SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
+ KV.second.end());
+ auto Leaves = getExpressionLeaves(ExprsInSubprogram);
+
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+ for (Value *Leaf : Leaves)
+ collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
+
+ // Generate remarks for each leaf.
+ for (auto *L : Leaves) {
+
+ DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+ DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+ while (Context) {
+ if (getSubprogram(Context->getScope()) == KV.first) {
+ Loc = Context;
+ break;
+ }
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+
+ SmallPtrSet<Value *, 8> ReusedExprs;
+ OpInfoTy Counts, SharedCounts;
+ std::tie(Counts, SharedCounts) =
+ sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
+
+ OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+ cast<Instruction>(L)->getParent());
+
+ Rem << "Lowered with ";
+ Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+ << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+ << ore::NV("NumComputeOps", Counts.NumComputeOps)
+ << " compute ops";
+
+ if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+ SharedCounts.NumComputeOps > 0) {
+ Rem << ",\nadditionally "
+ << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+ << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+ << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+ << " compute ops"
+ << " are shared with other expressions";
+ }
+
+ Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
+ ORE.emit(Rem);
+ }
+ }
+ }
+
+ std::string
+ linearize(Value *L,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ const DataLayout &DL) {
+ ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
+ Lin.linearizeExpr(L, 0, false, false);
+ return Lin.getResult();
+ }
+ };
+};
+} // namespace
+
+PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
OptimizationRemarkEmitter *ORE = nullptr;
AAResults *AA = nullptr;
DominatorTree *DT = nullptr;
LoopInfo *LI = nullptr;
-
+
if (!Minimal) {
ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
AA = &AM.getResult<AAManager>(F);
@@ -1984,66 +1984,66 @@ PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
LI = &AM.getResult<LoopAnalysis>(F);
}
- LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
- if (LMT.Visit()) {
- PreservedAnalyses PA;
+ LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
+ if (LMT.Visit()) {
+ PreservedAnalyses PA;
if (!Minimal) {
PA.preserve<LoopAnalysis>();
PA.preserve<DominatorTreeAnalysis>();
}
- return PA;
- }
- return PreservedAnalyses::all();
-}
-
-namespace {
-
-class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
- initializeLowerMatrixIntrinsicsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ return PA;
+ }
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+class LowerMatrixIntrinsicsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LowerMatrixIntrinsicsLegacyPass() : FunctionPass(ID) {
+ initializeLowerMatrixIntrinsicsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
LowerMatrixIntrinsics LMT(F, TTI, &AA, &DT, &LI, &ORE);
- bool C = LMT.Visit();
- return C;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
-};
-} // namespace
-
-static const char pass_name[] = "Lower the matrix intrinsics";
-char LowerMatrixIntrinsicsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
- false, false)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
- false, false)
-
-Pass *llvm::createLowerMatrixIntrinsicsPass() {
- return new LowerMatrixIntrinsicsLegacyPass();
-}
+ bool C = LMT.Visit();
+ return C;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+};
+} // namespace
+
+static const char pass_name[] = "Lower the matrix intrinsics";
+char LowerMatrixIntrinsicsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
+ false, false)
+
+Pass *llvm::createLowerMatrixIntrinsicsPass() {
+ return new LowerMatrixIntrinsicsLegacyPass();
+}
namespace {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
index c1cc1c28b9..73b2cd06fa 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/LowerWidenableCondition.cpp
@@ -1,86 +1,86 @@
-//===- LowerWidenableCondition.cpp - Lower the guard intrinsic ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the llvm.widenable.condition intrinsic to default value
-// which is i1 true.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct LowerWidenableConditionLegacyPass : public FunctionPass {
- static char ID;
- LowerWidenableConditionLegacyPass() : FunctionPass(ID) {
- initializeLowerWidenableConditionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static bool lowerWidenableCondition(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *WCDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_widenable_condition));
- if (!WCDecl || WCDecl->use_empty())
- return false;
-
- using namespace llvm::PatternMatch;
- SmallVector<CallInst *, 8> ToLower;
- for (auto &I : instructions(F))
- if (match(&I, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
- ToLower.push_back(cast<CallInst>(&I));
-
- if (ToLower.empty())
- return false;
-
- for (auto *CI : ToLower) {
- CI->replaceAllUsesWith(ConstantInt::getTrue(CI->getContext()));
- CI->eraseFromParent();
- }
- return true;
-}
-
-bool LowerWidenableConditionLegacyPass::runOnFunction(Function &F) {
- return lowerWidenableCondition(F);
-}
-
-char LowerWidenableConditionLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerWidenableConditionLegacyPass, "lower-widenable-condition",
- "Lower the widenable condition to default true value", false,
- false)
-
-Pass *llvm::createLowerWidenableConditionPass() {
- return new LowerWidenableConditionLegacyPass();
-}
-
-PreservedAnalyses LowerWidenableConditionPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (lowerWidenableCondition(F))
- return PreservedAnalyses::none();
-
- return PreservedAnalyses::all();
-}
+//===- LowerWidenableCondition.cpp - Lower widenable conditions ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the llvm.widenable.condition intrinsic to its default
+// value, which is i1 true.
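+//
+// For example (an illustrative sketch, not taken from a test):
+//
+//   %wc = call i1 @llvm.experimental.widenable.condition()
+//   br i1 %wc, label %guarded, label %deopt
+//
+// becomes, once all uses of %wc are replaced with the default value,
+//
+//   br i1 true, label %guarded, label %deopt
+//
+// and later simplification folds away the now-trivial branch.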
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/LowerWidenableCondition.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct LowerWidenableConditionLegacyPass : public FunctionPass {
+ static char ID;
+ LowerWidenableConditionLegacyPass() : FunctionPass(ID) {
+ initializeLowerWidenableConditionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static bool lowerWidenableCondition(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *WCDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_widenable_condition));
+ if (!WCDecl || WCDecl->use_empty())
+ return false;
+
+ using namespace llvm::PatternMatch;
+ SmallVector<CallInst *, 8> ToLower;
+ for (auto &I : instructions(F))
+ if (match(&I, m_Intrinsic<Intrinsic::experimental_widenable_condition>()))
+ ToLower.push_back(cast<CallInst>(&I));
+
+ if (ToLower.empty())
+ return false;
+
+ for (auto *CI : ToLower) {
+ CI->replaceAllUsesWith(ConstantInt::getTrue(CI->getContext()));
+ CI->eraseFromParent();
+ }
+ return true;
+}
+
+bool LowerWidenableConditionLegacyPass::runOnFunction(Function &F) {
+ return lowerWidenableCondition(F);
+}
+
+char LowerWidenableConditionLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerWidenableConditionLegacyPass, "lower-widenable-condition",
+ "Lower the widenable condition to default true value", false,
+ false)
+
+Pass *llvm::createLowerWidenableConditionPass() {
+ return new LowerWidenableConditionLegacyPass();
+}
+
+PreservedAnalyses LowerWidenableConditionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (lowerWidenableCondition(F))
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
index 760d6b198b..5ffae128f5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MakeGuardsExplicit.cpp
@@ -1,108 +1,108 @@
-//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
-// guard represented as widenable explicit branch to the deopt block. The
-// difference between this pass and LowerGuardIntrinsic is that after this pass
-// the guard represented as intrinsic:
-//
-// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
-//
-// transforms to a guard represented as widenable explicit branch:
-//
-// %widenable_cond = call i1 @llvm.experimental.widenable.condition()
-// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
-//
-// Here:
-// - The semantics of @llvm.experimental.widenable.condition allows to replace
-// %widenable_cond with the construction (%widenable_cond & %any_other_cond)
-// without loss of correctness;
-// - %guarded is the lower part of old guard intrinsic's parent block split by
-// the intrinsic call;
-// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
-// intrinsic.
-//
-// Therefore, this branch preserves the property of widenability.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/GuardUtils.h"
-
-using namespace llvm;
-
-namespace {
-struct MakeGuardsExplicitLegacyPass : public FunctionPass {
- static char ID;
- MakeGuardsExplicitLegacyPass() : FunctionPass(ID) {
- initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-}
-
-static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
- // Replace the guard with an explicit branch (just like in GuardWidening).
- BasicBlock *OriginalBB = Guard->getParent();
- (void)OriginalBB;
- makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true);
- assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold");
-
- Guard->eraseFromParent();
-}
-
-static bool explicifyGuards(Function &F) {
- // Check if we can cheaply rule out the possibility of not having any work to
- // do.
- auto *GuardDecl = F.getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (!GuardDecl || GuardDecl->use_empty())
- return false;
-
- SmallVector<CallInst *, 8> GuardIntrinsics;
- for (auto &I : instructions(F))
- if (isGuard(&I))
- GuardIntrinsics.push_back(cast<CallInst>(&I));
-
- if (GuardIntrinsics.empty())
- return false;
-
- auto *DeoptIntrinsic = Intrinsic::getDeclaration(
- F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
- DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
-
- for (auto *Guard : GuardIntrinsics)
- turnToExplicitForm(Guard, DeoptIntrinsic);
-
- return true;
-}
-
-bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) {
- return explicifyGuards(F);
-}
-
-char MakeGuardsExplicitLegacyPass::ID = 0;
-INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit",
- "Lower the guard intrinsic to explicit control flow form",
- false, false)
-
-PreservedAnalyses MakeGuardsExplicitPass::run(Function &F,
- FunctionAnalysisManager &) {
- if (explicifyGuards(F))
- return PreservedAnalyses::none();
- return PreservedAnalyses::all();
-}
+//===- MakeGuardsExplicit.cpp - Turn guard intrinsics into guard branches -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers the @llvm.experimental.guard intrinsic to the new form of
+// guard: a widenable explicit branch to the deopt block. The difference
+// between this pass and LowerGuardIntrinsic is that after this pass the guard
+// represented as the intrinsic:
+//
+// call void(i1, ...) @llvm.experimental.guard(i1 %old_cond) [ "deopt"() ]
+//
+// is transformed into a guard represented as a widenable explicit branch:
+//
+// %widenable_cond = call i1 @llvm.experimental.widenable.condition()
+// br i1 (%old_cond & %widenable_cond), label %guarded, label %deopt
+//
+// Here:
+// - The semantics of @llvm.experimental.widenable.condition allow replacing
+// %widenable_cond with the conjunction (%widenable_cond & %any_other_cond)
+// without loss of correctness;
+// - %guarded is the lower part of old guard intrinsic's parent block split by
+// the intrinsic call;
+// - %deopt is a block containing a sole call to @llvm.experimental.deoptimize
+// intrinsic.
+//
+// Therefore, this branch preserves the property of widenability.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MakeGuardsExplicit.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/GuardUtils.h"
+
+using namespace llvm;
+
+namespace {
+struct MakeGuardsExplicitLegacyPass : public FunctionPass {
+ static char ID;
+ MakeGuardsExplicitLegacyPass() : FunctionPass(ID) {
+ initializeMakeGuardsExplicitLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+}
+
+static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) {
+ // Replace the guard with an explicit branch (just like in GuardWidening).
+ BasicBlock *OriginalBB = Guard->getParent();
+ (void)OriginalBB;
+ makeGuardControlFlowExplicit(DeoptIntrinsic, Guard, true);
+ assert(isWidenableBranch(OriginalBB->getTerminator()) && "should hold");
+
+ Guard->eraseFromParent();
+}
+
+static bool explicifyGuards(Function &F) {
+ // Check if we can cheaply rule out the possibility of not having any work to
+ // do.
+ auto *GuardDecl = F.getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (!GuardDecl || GuardDecl->use_empty())
+ return false;
+
+ SmallVector<CallInst *, 8> GuardIntrinsics;
+ for (auto &I : instructions(F))
+ if (isGuard(&I))
+ GuardIntrinsics.push_back(cast<CallInst>(&I));
+
+ if (GuardIntrinsics.empty())
+ return false;
+
+ auto *DeoptIntrinsic = Intrinsic::getDeclaration(
+ F.getParent(), Intrinsic::experimental_deoptimize, {F.getReturnType()});
+ DeoptIntrinsic->setCallingConv(GuardDecl->getCallingConv());
+
+ for (auto *Guard : GuardIntrinsics)
+ turnToExplicitForm(Guard, DeoptIntrinsic);
+
+ return true;
+}
+
+bool MakeGuardsExplicitLegacyPass::runOnFunction(Function &F) {
+ return explicifyGuards(F);
+}
+
+char MakeGuardsExplicitLegacyPass::ID = 0;
+INITIALIZE_PASS(MakeGuardsExplicitLegacyPass, "make-guards-explicit",
+ "Lower the guard intrinsic to explicit control flow form",
+ false, false)
+
+PreservedAnalyses MakeGuardsExplicitPass::run(Function &F,
+ FunctionAnalysisManager &) {
+ if (explicifyGuards(F))
+ return PreservedAnalyses::none();
+ return PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 0583e27906..a4e695497f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1,316 +1,316 @@
-//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs various transformations related to eliminating memcpy
-// calls, or transforming sets of stores into memset's.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+//===- MemCpyOptimizer.cpp - Optimize use of memcpy and friends -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs various transformations related to eliminating memcpy
+// calls, or transforming sets of stores into memsets.
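+//
+// For example (illustrative only), a run of adjacent zero stores such as
+//
+//   for (int i = 0; i != 16; ++i) A[i] = 0;   // unrolled into 16 i8 stores
+//
+// can be collapsed into a single memset(A, 0, 16), and a memcpy reading from
+// a buffer that was itself filled by an earlier memcpy can be redirected to
+// copy from the original source.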
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MemCpyOptimizer.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "memcpyopt"
-
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "memcpyopt"
+
static cl::opt<bool>
EnableMemorySSA("enable-memcpyopt-memoryssa", cl::init(false), cl::Hidden,
cl::desc("Use MemorySSA-backed MemCpyOpt."));
-STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
-STATISTIC(NumMemSetInfer, "Number of memsets inferred");
-STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
-STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
+STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemSetInfer, "Number of memsets inferred");
+STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
+STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
STATISTIC(NumCallSlot, "Number of call slot optimizations performed");
-
-namespace {
-
-/// Represents a range of memset'd bytes with the ByteVal value.
-/// This allows us to analyze stores like:
-/// store 0 -> P+1
-/// store 0 -> P+0
-/// store 0 -> P+3
-/// store 0 -> P+2
-/// which sometimes happens with stores to arrays of structs etc. When we see
-/// the first store, we make a range [1, 2). The second store extends the range
-/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
-/// two ranges into [0, 3) which is memset'able.
-struct MemsetRange {
- // Start/End - A semi range that describes the span that this range covers.
- // The range is closed at the start and open at the end: [Start, End).
- int64_t Start, End;
-
- /// StartPtr - The getelementptr instruction that points to the start of the
- /// range.
- Value *StartPtr;
-
- /// Alignment - The known alignment of the first store.
- unsigned Alignment;
-
- /// TheStores - The actual stores that make up this range.
- SmallVector<Instruction*, 16> TheStores;
-
- bool isProfitableToUseMemset(const DataLayout &DL) const;
-};
-
-} // end anonymous namespace
-
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
- // If we found more than 4 stores to merge or 16 bytes, use memset.
- if (TheStores.size() >= 4 || End-Start >= 16) return true;
-
- // If there is nothing to merge, don't do anything.
- if (TheStores.size() < 2) return false;
-
- // If any of the stores are a memset, then it is always good to extend the
- // memset.
- for (Instruction *SI : TheStores)
- if (!isa<StoreInst>(SI))
- return true;
-
- // Assume that the code generator is capable of merging pairs of stores
- // together if it wants to.
- if (TheStores.size() == 2) return false;
-
- // If we have fewer than 8 stores, it can still be worthwhile to do this.
- // For example, merging 4 i8 stores into an i32 store is useful almost always.
- // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
- // memset will be split into 2 32-bit stores anyway) and doing so can
- // pessimize the llvm optimizer.
- //
- // Since we don't have perfect knowledge here, make some assumptions: assume
- // the maximum GPR width is the same size as the largest legal integer
- // size. If so, check to see whether we will end up actually reducing the
- // number of stores used.
- unsigned Bytes = unsigned(End-Start);
- unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
- if (MaxIntSize == 0)
- MaxIntSize = 1;
- unsigned NumPointerStores = Bytes / MaxIntSize;
-
- // Assume the remaining bytes if any are done a byte at a time.
- unsigned NumByteStores = Bytes % MaxIntSize;
-
- // If we will reduce the # stores (according to this heuristic), do the
- // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
- // etc.
- return TheStores.size() > NumPointerStores+NumByteStores;
-}
-
-namespace {
-
-class MemsetRanges {
- using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
-
- /// A sorted list of the memset ranges.
- SmallVector<MemsetRange, 8> Ranges;
-
- const DataLayout &DL;
-
-public:
- MemsetRanges(const DataLayout &DL) : DL(DL) {}
-
- using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
-
- const_iterator begin() const { return Ranges.begin(); }
- const_iterator end() const { return Ranges.end(); }
- bool empty() const { return Ranges.empty(); }
-
- void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
- if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
- addStore(OffsetFromFirst, SI);
- else
- addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
- }
-
- void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
- int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
-
- addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
- SI->getAlign().value(), SI);
- }
-
- void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
- int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
- addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
- }
-
- void addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst);
-};
-
-} // end anonymous namespace
-
-/// Add a new store to the MemsetRanges data structure. This adds a
-/// new range for the specified store at the specified offset, merging into
-/// existing ranges as appropriate.
-void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
- unsigned Alignment, Instruction *Inst) {
- int64_t End = Start+Size;
-
- range_iterator I = partition_point(
- Ranges, [=](const MemsetRange &O) { return O.End < Start; });
-
- // We now know that I == E, in which case we didn't find anything to merge
- // with, or that Start <= I->End. If End < I->Start or I == E, then we need
- // to insert a new range. Handle this now.
- if (I == Ranges.end() || End < I->Start) {
- MemsetRange &R = *Ranges.insert(I, MemsetRange());
- R.Start = Start;
- R.End = End;
- R.StartPtr = Ptr;
- R.Alignment = Alignment;
- R.TheStores.push_back(Inst);
- return;
- }
-
- // This store overlaps with I, add it.
- I->TheStores.push_back(Inst);
-
- // At this point, we may have an interval that completely contains our store.
- // If so, just add it to the interval and return.
- if (I->Start <= Start && I->End >= End)
- return;
-
- // Now we know that Start <= I->End and End >= I->Start so the range overlaps
- // but is not entirely contained within the range.
-
- // See if the range extends the start of the range. In this case, it couldn't
- // possibly cause it to join the prior range, because otherwise we would have
- // stopped on *it*.
- if (Start < I->Start) {
- I->Start = Start;
- I->StartPtr = Ptr;
- I->Alignment = Alignment;
- }
-
- // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
- // is in or right at the end of I), and that End >= I->Start. Extend I out to
- // End.
- if (End > I->End) {
- I->End = End;
- range_iterator NextI = I;
- while (++NextI != Ranges.end() && End >= NextI->Start) {
- // Merge the range in.
- I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
- if (NextI->End > I->End)
- I->End = NextI->End;
- Ranges.erase(NextI);
- NextI = I;
- }
- }
-}
-
-//===----------------------------------------------------------------------===//
-// MemCpyOptLegacyPass Pass
-//===----------------------------------------------------------------------===//
-
-namespace {
-
-class MemCpyOptLegacyPass : public FunctionPass {
- MemCpyOptPass Impl;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- MemCpyOptLegacyPass() : FunctionPass(ID) {
- initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
- // This transformation requires dominator postdominator info
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
+
+namespace {
+
+/// Represents a range of memset'd bytes with the ByteVal value.
+/// This allows us to analyze stores like:
+/// store 0 -> P+1
+/// store 0 -> P+0
+/// store 0 -> P+3
+/// store 0 -> P+2
+/// which sometimes happens with stores to arrays of structs etc. When we see
+/// the first store, we make a range [1, 2). The second store extends the range
+/// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
+/// two ranges into [0, 3) which is memset'able.
+struct MemsetRange {
+ // Start/End - A semi range that describes the span that this range covers.
+ // The range is closed at the start and open at the end: [Start, End).
+ int64_t Start, End;
+
+ /// StartPtr - The getelementptr instruction that points to the start of the
+ /// range.
+ Value *StartPtr;
+
+ /// Alignment - The known alignment of the first store.
+ unsigned Alignment;
+
+ /// TheStores - The actual stores that make up this range.
+ SmallVector<Instruction*, 16> TheStores;
+
+ bool isProfitableToUseMemset(const DataLayout &DL) const;
+};
+
+} // end anonymous namespace
+
+bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
+ // If we found more than 4 stores to merge or 16 bytes, use memset.
+ if (TheStores.size() >= 4 || End-Start >= 16) return true;
+
+ // If there is nothing to merge, don't do anything.
+ if (TheStores.size() < 2) return false;
+
+ // If any of the stores are a memset, then it is always good to extend the
+ // memset.
+ for (Instruction *SI : TheStores)
+ if (!isa<StoreInst>(SI))
+ return true;
+
+ // Assume that the code generator is capable of merging pairs of stores
+ // together if it wants to.
+ if (TheStores.size() == 2) return false;
+
+ // If we have fewer than 8 stores, it can still be worthwhile to do this.
+ // For example, merging 4 i8 stores into an i32 store is useful almost always.
+ // However, merging 2 32-bit stores isn't useful on a 32-bit architecture (the
+ // memset will be split into 2 32-bit stores anyway) and doing so can
+ // pessimize the llvm optimizer.
+ //
+ // Since we don't have perfect knowledge here, make some assumptions: assume
+ // the maximum GPR width is the same size as the largest legal integer
+ // size. If so, check to see whether we will end up actually reducing the
+ // number of stores used.
+ unsigned Bytes = unsigned(End-Start);
+ unsigned MaxIntSize = DL.getLargestLegalIntTypeSizeInBits() / 8;
+ if (MaxIntSize == 0)
+ MaxIntSize = 1;
+ unsigned NumPointerStores = Bytes / MaxIntSize;
+
+ // Assume the remaining bytes if any are done a byte at a time.
+ unsigned NumByteStores = Bytes % MaxIntSize;
+
+ // If we will reduce the # stores (according to this heuristic), do the
+ // transformation. This encourages merging 4 x i8 -> i32 and 2 x i16 -> i32
+ // etc.
+ return TheStores.size() > NumPointerStores+NumByteStores;
+}
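+// Worked example, assuming a target whose largest legal integer type is 64
+// bits (MaxIntSize == 8): three stores covering exactly 8 contiguous bytes
+// (say an i32 and two i16s) give NumPointerStores = 1 and NumByteStores = 0,
+// so 3 > 1 holds and the range is turned into a memset. Three i8 stores
+// covering 3 bytes give 0 + 3 = 3, the check 3 > 3 fails, and they are left
+// as individual stores.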
+
+namespace {
+
+class MemsetRanges {
+ using range_iterator = SmallVectorImpl<MemsetRange>::iterator;
+
+ /// A sorted list of the memset ranges.
+ SmallVector<MemsetRange, 8> Ranges;
+
+ const DataLayout &DL;
+
+public:
+ MemsetRanges(const DataLayout &DL) : DL(DL) {}
+
+ using const_iterator = SmallVectorImpl<MemsetRange>::const_iterator;
+
+ const_iterator begin() const { return Ranges.begin(); }
+ const_iterator end() const { return Ranges.end(); }
+ bool empty() const { return Ranges.empty(); }
+
+ void addInst(int64_t OffsetFromFirst, Instruction *Inst) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ addStore(OffsetFromFirst, SI);
+ else
+ addMemSet(OffsetFromFirst, cast<MemSetInst>(Inst));
+ }
+
+ void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
+ int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+ addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+ SI->getAlign().value(), SI);
+ }
+
+ void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
+ int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
+ addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getDestAlignment(), MSI);
+ }
+
+ void addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst);
+};
+
+} // end anonymous namespace
+
+/// Add a new store to the MemsetRanges data structure. This adds a
+/// new range for the specified store at the specified offset, merging into
+/// existing ranges as appropriate.
+void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
+ unsigned Alignment, Instruction *Inst) {
+ int64_t End = Start+Size;
+
+ range_iterator I = partition_point(
+ Ranges, [=](const MemsetRange &O) { return O.End < Start; });
+
+ // We now know that I == E, in which case we didn't find anything to merge
+ // with, or that Start <= I->End. If End < I->Start or I == E, then we need
+ // to insert a new range. Handle this now.
+ if (I == Ranges.end() || End < I->Start) {
+ MemsetRange &R = *Ranges.insert(I, MemsetRange());
+ R.Start = Start;
+ R.End = End;
+ R.StartPtr = Ptr;
+ R.Alignment = Alignment;
+ R.TheStores.push_back(Inst);
+ return;
+ }
+
+ // This store overlaps with I, add it.
+ I->TheStores.push_back(Inst);
+
+ // At this point, we may have an interval that completely contains our store.
+ // If so, just add it to the interval and return.
+ if (I->Start <= Start && I->End >= End)
+ return;
+
+ // Now we know that Start <= I->End and End >= I->Start so the range overlaps
+ // but is not entirely contained within the range.
+
+ // See if the new range extends the start of the existing range. In that
+ // case it couldn't possibly cause it to join the prior range, because
+ // otherwise we would have stopped on *it*.
+ if (Start < I->Start) {
+ I->Start = Start;
+ I->StartPtr = Ptr;
+ I->Alignment = Alignment;
+ }
+
+ // Now we know that Start <= I->End and Start >= I->Start (so the startpoint
+ // is in or right at the end of I), and that End >= I->Start. Extend I out to
+ // End.
+ if (End > I->End) {
+ I->End = End;
+ range_iterator NextI = I;
+ while (++NextI != Ranges.end() && End >= NextI->Start) {
+ // Merge the range in.
+ I->TheStores.append(NextI->TheStores.begin(), NextI->TheStores.end());
+ if (NextI->End > I->End)
+ I->End = NextI->End;
+ Ranges.erase(NextI);
+ NextI = I;
+ }
+ }
+}
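+// Worked example (hypothetical offsets): with existing ranges [0, 2) and
+// [4, 8), adding a 2-byte store at offset 2 selects [0, 2) via
+// partition_point, extends it to [0, 4), and the loop above then folds the
+// neighbouring [4, 8) in as well, leaving a single [0, 8) range whose
+// TheStores list contains every contributing instruction.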
+
+//===----------------------------------------------------------------------===//
+// MemCpyOptLegacyPass Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+class MemCpyOptLegacyPass : public FunctionPass {
+ MemCpyOptPass Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ MemCpyOptLegacyPass() : FunctionPass(ID) {
+ initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ // This transformation requires dominator postdominator info
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
if (!EnableMemorySSA)
AU.addRequired<MemoryDependenceWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
if (EnableMemorySSA)
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char MemCpyOptLegacyPass::ID = 0;
-
-/// The public interface to this file...
-FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
- false, false)
-
+ }
+};
+
+} // end anonymous namespace
+
+char MemCpyOptLegacyPass::ID = 0;
+
+/// The public interface to this file...
+FunctionPass *llvm::createMemCpyOptPass() { return new MemCpyOptLegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(MemCpyOptLegacyPass, "memcpyopt", "MemCpy Optimization",
+ false, false)
+
// Check that V is either not accessible by the caller, or unwinding cannot
// occur between Start and End.
static bool mayBeVisibleThroughUnwinding(Value *V, Instruction *Start,
@@ -361,22 +361,22 @@ static bool writtenBetween(MemorySSA *MSSA, MemoryLocation Loc,
return !MSSA->dominates(Clobber, Start);
}
-/// When scanning forward over instructions, we look for some other patterns to
-/// fold away. In particular, this looks for stores to neighboring locations of
-/// memory. If it sees enough consecutive ones, it attempts to merge them
-/// together into a memcpy/memset.
-Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
- Value *StartPtr,
- Value *ByteVal) {
- const DataLayout &DL = StartInst->getModule()->getDataLayout();
-
- // Okay, so we now have a single store that can be splatable. Scan to find
- // all subsequent stores of the same value to offset from the same pointer.
- // Join these together into ranges, so we can decide whether contiguous blocks
- // are stored.
- MemsetRanges Ranges(DL);
-
- BasicBlock::iterator BI(StartInst);
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for stores to neighboring locations of
+/// memory. If it sees enough consecutive ones, it attempts to merge them
+/// together into a memcpy/memset.
+Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
+ Value *StartPtr,
+ Value *ByteVal) {
+ const DataLayout &DL = StartInst->getModule()->getDataLayout();
+
+ // Okay, so we now have a single store that can be splatable. Scan to find
+ // all subsequent stores of the same value to offset from the same pointer.
+ // Join these together into ranges, so we can decide whether contiguous blocks
+ // are stored.
+ MemsetRanges Ranges(DL);
+
+ BasicBlock::iterator BI(StartInst);
// Keeps track of the last memory use or def before the insertion point for
// the new memset. The new MemoryDef for the inserted memsets will be inserted
@@ -387,7 +387,7 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// for the new memset. This will become the defining access of the inserted
// memsets.
MemoryDef *LastMemDef = nullptr;
- for (++BI; !BI->isTerminator(); ++BI) {
+ for (++BI; !BI->isTerminator(); ++BI) {
if (MSSAU) {
auto *CurrentAcc = cast_or_null<MemoryUseOrDef>(
MSSAU->getMemorySSA()->getMemoryAccess(&*BI));
@@ -398,19 +398,19 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
}
}
- if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
- // If the instruction is readnone, ignore it, otherwise bail out. We
- // don't even allow readonly here because we don't want something like:
- // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
- if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
- break;
- continue;
- }
-
- if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
- // If this is a store, see if we can merge it in.
- if (!NextStore->isSimple()) break;
-
+ if (!isa<StoreInst>(BI) && !isa<MemSetInst>(BI)) {
+ // If the instruction is readnone, ignore it, otherwise bail out. We
+ // don't even allow readonly here because we don't want something like:
+ // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+ if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+ break;
+ continue;
+ }
+
+ if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+ // If this is a store, see if we can merge it in.
+ if (!NextStore->isSimple()) break;
+
Value *StoredVal = NextStore->getValueOperand();
// Don't convert stores of non-integral pointer types to memsets (which
@@ -418,74 +418,74 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
break;
- // Check to see if this stored value is of the same byte-splattable value.
+ // Check to see if this stored value is of the same byte-splattable value.
Value *StoredByte = isBytewiseValue(StoredVal, DL);
- if (isa<UndefValue>(ByteVal) && StoredByte)
- ByteVal = StoredByte;
- if (ByteVal != StoredByte)
- break;
-
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset =
- isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
- if (!Offset)
- break;
-
- Ranges.addStore(*Offset, NextStore);
- } else {
- MemSetInst *MSI = cast<MemSetInst>(BI);
-
- if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
- !isa<ConstantInt>(MSI->getLength()))
- break;
-
- // Check to see if this store is to a constant offset from the start ptr.
- Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
- if (!Offset)
- break;
-
- Ranges.addMemSet(*Offset, MSI);
- }
- }
-
- // If we have no ranges, then we just had a single store with nothing that
- // could be merged in. This is a very common case of course.
- if (Ranges.empty())
- return nullptr;
-
- // If we had at least one store that could be merged in, add the starting
- // store as well. We try to avoid this unless there is at least something
- // interesting as a small compile-time optimization.
- Ranges.addInst(0, StartInst);
-
- // If we create any memsets, we put it right before the first instruction that
- // isn't part of the memset block. This ensure that the memset is dominated
- // by any addressing instruction needed by the start of the block.
- IRBuilder<> Builder(&*BI);
-
- // Now that we have full information about ranges, loop over the ranges and
- // emit memset's for anything big enough to be worthwhile.
- Instruction *AMemSet = nullptr;
- for (const MemsetRange &Range : Ranges) {
- if (Range.TheStores.size() == 1) continue;
-
- // If it is profitable to lower this range to memset, do so now.
- if (!Range.isProfitableToUseMemset(DL))
- continue;
-
- // Otherwise, we do want to transform this! Create a new memset.
- // Get the starting pointer of the block.
- StartPtr = Range.StartPtr;
-
- AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
- MaybeAlign(Range.Alignment));
- LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
- : Range.TheStores) dbgs()
- << *SI << '\n';
- dbgs() << "With: " << *AMemSet << '\n');
- if (!Range.TheStores.empty())
- AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
-
+ if (isa<UndefValue>(ByteVal) && StoredByte)
+ ByteVal = StoredByte;
+ if (ByteVal != StoredByte)
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset =
+ isPointerOffset(StartPtr, NextStore->getPointerOperand(), DL);
+ if (!Offset)
+ break;
+
+ Ranges.addStore(*Offset, NextStore);
+ } else {
+ MemSetInst *MSI = cast<MemSetInst>(BI);
+
+ if (MSI->isVolatile() || ByteVal != MSI->getValue() ||
+ !isa<ConstantInt>(MSI->getLength()))
+ break;
+
+ // Check to see if this store is to a constant offset from the start ptr.
+ Optional<int64_t> Offset = isPointerOffset(StartPtr, MSI->getDest(), DL);
+ if (!Offset)
+ break;
+
+ Ranges.addMemSet(*Offset, MSI);
+ }
+ }
+
+ // If we have no ranges, then we just had a single store with nothing that
+ // could be merged in. This is a very common case of course.
+ if (Ranges.empty())
+ return nullptr;
+
+ // If we had at least one store that could be merged in, add the starting
+ // store as well. We try to avoid this unless there is at least something
+ // interesting as a small compile-time optimization.
+ Ranges.addInst(0, StartInst);
+
+  // If we create any memsets, we put them right before the first instruction
+  // that isn't part of the memset block. This ensures that the memset is
+  // dominated by any addressing instruction needed by the start of the block.
+ IRBuilder<> Builder(&*BI);
+
+ // Now that we have full information about ranges, loop over the ranges and
+  // emit memsets for anything big enough to be worthwhile.
+ Instruction *AMemSet = nullptr;
+ for (const MemsetRange &Range : Ranges) {
+ if (Range.TheStores.size() == 1) continue;
+
+ // If it is profitable to lower this range to memset, do so now.
+ if (!Range.isProfitableToUseMemset(DL))
+ continue;
+
+ // Otherwise, we do want to transform this! Create a new memset.
+ // Get the starting pointer of the block.
+ StartPtr = Range.StartPtr;
+
+ AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
+ MaybeAlign(Range.Alignment));
+ LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
+ : Range.TheStores) dbgs()
+ << *SI << '\n';
+ dbgs() << "With: " << *AMemSet << '\n');
+ if (!Range.TheStores.empty())
+ AMemSet->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
if (MSSAU) {
assert(LastMemDef && MemInsertPoint &&
"Both LastMemDef and MemInsertPoint need to be set");
@@ -500,105 +500,105 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
MemInsertPoint = NewDef;
}
- // Zap all the stores.
+ // Zap all the stores.
for (Instruction *SI : Range.TheStores)
eraseInstruction(SI);
- ++NumMemSetInfer;
- }
-
- return AMemSet;
-}
-
-// This method tries to lift a store instruction before position P.
-// It will lift the store and its argument, plus anything that
-// may alias with these.
-// The method returns true if it was successful.
+ ++NumMemSetInfer;
+ }
+
+ return AMemSet;
+}
+
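To make the merging loop above concrete, here is a rough C++ source-level sketch of the pattern tryMergingIntoMemset targets; the function names and sizes below are invented for illustration and are not taken from the LLVM tree.

#include <cstdint>
#include <cstring>

// Adjacent byte-splattable stores at constant offsets from the same base...
void zero_header(uint8_t *Buf) {
  Buf[0] = 0;
  Buf[1] = 0;
  Buf[2] = 0;
  Buf[3] = 0;
}

// ...are collected into one contiguous MemsetRange and, when profitable,
// emitted as a single memset, conceptually:
void zero_header_merged(uint8_t *Buf) {
  std::memset(Buf, 0, 4);
}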
+// This method tries to lift a store instruction before position P.
+// It will lift the store and its argument, plus anything that
+// may alias with these.
+// The method returns true if it was successful.
bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
-  // If the store aliases this position, bail out early.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
+  // If the store aliases this position, bail out early.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
if (isModOrRefSet(AA->getModRefInfo(P, StoreLoc)))
- return false;
-
-  // Keep track of the arguments of all instructions we plan to lift
- // so we can make sure to lift them as well if appropriate.
- DenseSet<Instruction*> Args;
- if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
- if (Ptr->getParent() == SI->getParent())
- Args.insert(Ptr);
-
-  // Instructions to lift before P.
+ return false;
+
+  // Keep track of the arguments of all instructions we plan to lift
+ // so we can make sure to lift them as well if appropriate.
+ DenseSet<Instruction*> Args;
+ if (auto *Ptr = dyn_cast<Instruction>(SI->getPointerOperand()))
+ if (Ptr->getParent() == SI->getParent())
+ Args.insert(Ptr);
+
+  // Instructions to lift before P.
SmallVector<Instruction *, 8> ToLift{SI};
-
- // Memory locations of lifted instructions.
- SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
-
- // Lifted calls.
- SmallVector<const CallBase *, 8> Calls;
-
- const MemoryLocation LoadLoc = MemoryLocation::get(LI);
-
- for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
- auto *C = &*I;
-
+
+ // Memory locations of lifted instructions.
+ SmallVector<MemoryLocation, 8> MemLocs{StoreLoc};
+
+ // Lifted calls.
+ SmallVector<const CallBase *, 8> Calls;
+
+ const MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ for (auto I = --SI->getIterator(), E = P->getIterator(); I != E; --I) {
+ auto *C = &*I;
+
// Make sure hoisting does not perform a store that was not guaranteed to
// happen.
if (!isGuaranteedToTransferExecutionToSuccessor(C))
return false;
-
+
bool MayAlias = isModOrRefSet(AA->getModRefInfo(C, None));
- bool NeedLift = false;
- if (Args.erase(C))
- NeedLift = true;
- else if (MayAlias) {
+ bool NeedLift = false;
+ if (Args.erase(C))
+ NeedLift = true;
+ else if (MayAlias) {
NeedLift = llvm::any_of(MemLocs, [C, this](const MemoryLocation &ML) {
return isModOrRefSet(AA->getModRefInfo(C, ML));
- });
-
- if (!NeedLift)
+ });
+
+ if (!NeedLift)
NeedLift = llvm::any_of(Calls, [C, this](const CallBase *Call) {
return isModOrRefSet(AA->getModRefInfo(C, Call));
- });
- }
-
- if (!NeedLift)
- continue;
-
- if (MayAlias) {
- // Since LI is implicitly moved downwards past the lifted instructions,
- // none of them may modify its source.
+ });
+ }
+
+ if (!NeedLift)
+ continue;
+
+ if (MayAlias) {
+ // Since LI is implicitly moved downwards past the lifted instructions,
+ // none of them may modify its source.
if (isModSet(AA->getModRefInfo(C, LoadLoc)))
- return false;
- else if (const auto *Call = dyn_cast<CallBase>(C)) {
- // If we can't lift this before P, it's game over.
+ return false;
+ else if (const auto *Call = dyn_cast<CallBase>(C)) {
+ // If we can't lift this before P, it's game over.
if (isModOrRefSet(AA->getModRefInfo(P, Call)))
- return false;
-
- Calls.push_back(Call);
- } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
- // If we can't lift this before P, it's game over.
- auto ML = MemoryLocation::get(C);
+ return false;
+
+ Calls.push_back(Call);
+ } else if (isa<LoadInst>(C) || isa<StoreInst>(C) || isa<VAArgInst>(C)) {
+ // If we can't lift this before P, it's game over.
+ auto ML = MemoryLocation::get(C);
if (isModOrRefSet(AA->getModRefInfo(P, ML)))
- return false;
-
- MemLocs.push_back(ML);
- } else
- // We don't know how to lift this instruction.
- return false;
- }
-
- ToLift.push_back(C);
- for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
- if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
- if (A->getParent() == SI->getParent()) {
- // Cannot hoist user of P above P
- if(A == P) return false;
- Args.insert(A);
- }
- }
- }
-
+ return false;
+
+ MemLocs.push_back(ML);
+ } else
+ // We don't know how to lift this instruction.
+ return false;
+ }
+
+ ToLift.push_back(C);
+ for (unsigned k = 0, e = C->getNumOperands(); k != e; ++k)
+ if (auto *A = dyn_cast<Instruction>(C->getOperand(k))) {
+ if (A->getParent() == SI->getParent()) {
+ // Cannot hoist user of P above P
+ if(A == P) return false;
+ Args.insert(A);
+ }
+ }
+ }
+
// Find MSSA insertion point. Normally P will always have a corresponding
// memory access before which we can insert. However, with non-standard AA
// pipelines, there may be a mismatch between AA and MSSA, in which case we
@@ -623,9 +623,9 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
}
// We made it, we need to lift.
- for (auto *I : llvm::reverse(ToLift)) {
- LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
- I->moveBefore(P);
+ for (auto *I : llvm::reverse(ToLift)) {
+ LLVM_DEBUG(dbgs() << "Lifting " << *I << " before " << *P << "\n");
+ I->moveBefore(P);
if (MSSAU) {
assert(MemInsertPoint && "Must have found insert point");
if (MemoryUseOrDef *MA = MSSAU->getMemorySSA()->getMemoryAccess(I)) {
@@ -633,25 +633,25 @@ bool MemCpyOptPass::moveUp(StoreInst *SI, Instruction *P, const LoadInst *LI) {
MemInsertPoint = MA;
}
}
- }
-
- return true;
-}
-
-bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
- if (!SI->isSimple()) return false;
-
- // Avoid merging nontemporal stores since the resulting
- // memcpy/memset would not be able to preserve the nontemporal hint.
- // In theory we could teach how to propagate the !nontemporal metadata to
- // memset calls. However, that change would force the backend to
- // conservatively expand !nontemporal memset calls back to sequences of
- // store instructions (effectively undoing the merging).
- if (SI->getMetadata(LLVMContext::MD_nontemporal))
- return false;
-
- const DataLayout &DL = SI->getModule()->getDataLayout();
-
+ }
+
+ return true;
+}
+
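As a loose illustration of why moveUp exists (names invented, not from the LLVM sources): to promote an aggregate load/store pair at an earlier position P, the store and its address computation must be hoisted above P, which is only legal if none of the hoisted instructions read or write memory that P touches.

#include <cstdint>

struct Pair { uint64_t A, B; };

void copy_with_clobber(Pair *Dst, const Pair *Src, uint64_t *Unrelated) {
  Pair Tmp = *Src;  // aggregate load
  *Unrelated = 1;   // P: a clobber between the load and the store
  *Dst = Tmp;       // store that moveUp would hoist above P, so the pair can
                    // be promoted to a memcpy at P instead of at the store
}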
+bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
+ if (!SI->isSimple()) return false;
+
+ // Avoid merging nontemporal stores since the resulting
+ // memcpy/memset would not be able to preserve the nontemporal hint.
+ // In theory we could teach how to propagate the !nontemporal metadata to
+ // memset calls. However, that change would force the backend to
+ // conservatively expand !nontemporal memset calls back to sequences of
+ // store instructions (effectively undoing the merging).
+ if (SI->getMetadata(LLVMContext::MD_nontemporal))
+ return false;
+
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+
Value *StoredVal = SI->getValueOperand();
// Not all the transforms below are correct for non-integral pointers, bail
@@ -659,63 +659,63 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (DL.isNonIntegralPointerType(StoredVal->getType()->getScalarType()))
return false;
- // Load to store forwarding can be interpreted as memcpy.
+ // Load to store forwarding can be interpreted as memcpy.
if (LoadInst *LI = dyn_cast<LoadInst>(StoredVal)) {
- if (LI->isSimple() && LI->hasOneUse() &&
- LI->getParent() == SI->getParent()) {
-
- auto *T = LI->getType();
- if (T->isAggregateType()) {
- MemoryLocation LoadLoc = MemoryLocation::get(LI);
-
- // We use alias analysis to check if an instruction may store to
- // the memory we load from in between the load and the store. If
- // such an instruction is found, we try to promote there instead
- // of at the store position.
+ if (LI->isSimple() && LI->hasOneUse() &&
+ LI->getParent() == SI->getParent()) {
+
+ auto *T = LI->getType();
+ if (T->isAggregateType()) {
+ MemoryLocation LoadLoc = MemoryLocation::get(LI);
+
+ // We use alias analysis to check if an instruction may store to
+ // the memory we load from in between the load and the store. If
+ // such an instruction is found, we try to promote there instead
+ // of at the store position.
// TODO: Can use MSSA for this.
- Instruction *P = SI;
- for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
+ Instruction *P = SI;
+ for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) {
if (isModSet(AA->getModRefInfo(&I, LoadLoc))) {
- P = &I;
- break;
- }
- }
-
- // We found an instruction that may write to the loaded memory.
- // We can try to promote at this position instead of the store
-        // position if nothing aliases the store memory after this and the store
- // destination is not in the range.
- if (P && P != SI) {
+ P = &I;
+ break;
+ }
+ }
+
+ // We found an instruction that may write to the loaded memory.
+ // We can try to promote at this position instead of the store
+        // position if nothing aliases the store memory after this and the store
+ // destination is not in the range.
+ if (P && P != SI) {
if (!moveUp(SI, P, LI))
- P = nullptr;
- }
-
- // If a valid insertion position is found, then we can promote
- // the load/store pair to a memcpy.
- if (P) {
- // If we load from memory that may alias the memory we store to,
-          // memmove must be used to preserve semantics. If not, memcpy can
- // be used.
- bool UseMemMove = false;
+ P = nullptr;
+ }
+
+ // If a valid insertion position is found, then we can promote
+ // the load/store pair to a memcpy.
+ if (P) {
+ // If we load from memory that may alias the memory we store to,
+          // memmove must be used to preserve semantics. If not, memcpy can
+ // be used.
+ bool UseMemMove = false;
if (!AA->isNoAlias(MemoryLocation::get(SI), LoadLoc))
- UseMemMove = true;
-
- uint64_t Size = DL.getTypeStoreSize(T);
-
- IRBuilder<> Builder(P);
- Instruction *M;
- if (UseMemMove)
- M = Builder.CreateMemMove(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
- else
- M = Builder.CreateMemCpy(
- SI->getPointerOperand(), SI->getAlign(),
- LI->getPointerOperand(), LI->getAlign(), Size);
-
- LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
- << *M << "\n");
-
+ UseMemMove = true;
+
+ uint64_t Size = DL.getTypeStoreSize(T);
+
+ IRBuilder<> Builder(P);
+ Instruction *M;
+ if (UseMemMove)
+ M = Builder.CreateMemMove(
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
+ else
+ M = Builder.CreateMemCpy(
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
+
+ LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
+ << *M << "\n");
+
if (MSSAU) {
auto *LastDef =
cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI));
@@ -726,18 +726,18 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
eraseInstruction(SI);
eraseInstruction(LI);
- ++NumMemCpyInstr;
-
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
-
- // Detect cases where we're performing call slot forwarding, but
- // happen to be using a load-store pair to implement it, rather than
- // a memcpy.
- CallInst *C = nullptr;
+ ++NumMemCpyInstr;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ // Detect cases where we're performing call slot forwarding, but
+ // happen to be using a load-store pair to implement it, rather than
+ // a memcpy.
+ CallInst *C = nullptr;
if (EnableMemorySSA) {
if (auto *LoadClobber = dyn_cast<MemoryUseOrDef>(
MSSA->getWalker()->getClobberingMemoryAccess(LI))) {
@@ -751,15 +751,15 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
if (ldep.isClobber() && !isa<MemCpyInst>(ldep.getInst()))
C = dyn_cast<CallInst>(ldep.getInst());
}
-
- if (C) {
- // Check that nothing touches the dest of the "copy" between
- // the call and the store.
- MemoryLocation StoreLoc = MemoryLocation::get(SI);
+
+ if (C) {
+ // Check that nothing touches the dest of the "copy" between
+ // the call and the store.
+ MemoryLocation StoreLoc = MemoryLocation::get(SI);
if (EnableMemorySSA) {
if (accessedBetween(*AA, StoreLoc, MSSA->getMemoryAccess(C),
MSSA->getMemoryAccess(SI)))
- C = nullptr;
+ C = nullptr;
} else {
for (BasicBlock::iterator I = --SI->getIterator(),
E = C->getIterator();
@@ -768,52 +768,52 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
C = nullptr;
break;
}
- }
- }
- }
-
- if (C) {
- bool changed = performCallSlotOptzn(
+ }
+ }
+ }
+
+ if (C) {
+ bool changed = performCallSlotOptzn(
LI, SI, SI->getPointerOperand()->stripPointerCasts(),
- LI->getPointerOperand()->stripPointerCasts(),
- DL.getTypeStoreSize(SI->getOperand(0)->getType()),
- commonAlignment(SI->getAlign(), LI->getAlign()), C);
- if (changed) {
+ LI->getPointerOperand()->stripPointerCasts(),
+ DL.getTypeStoreSize(SI->getOperand(0)->getType()),
+ commonAlignment(SI->getAlign(), LI->getAlign()), C);
+ if (changed) {
eraseInstruction(SI);
eraseInstruction(LI);
- ++NumMemCpyInstr;
- return true;
- }
- }
- }
- }
-
- // There are two cases that are interesting for this code to handle: memcpy
- // and memset. Right now we only handle memset.
-
-  // Ensure that the value being stored is something that can be memset a
-  // byte at a time, like "0" or "-1" of any width, as well as things like
-  // 0xA0A0A0A0 and 0.0.
- auto *V = SI->getOperand(0);
- if (Value *ByteVal = isBytewiseValue(V, DL)) {
- if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
- ByteVal)) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
-
- // If we have an aggregate, we try to promote it to memset regardless
- // of opportunity for merging as it can expose optimization opportunities
- // in subsequent passes.
- auto *T = V->getType();
- if (T->isAggregateType()) {
- uint64_t Size = DL.getTypeStoreSize(T);
- IRBuilder<> Builder(SI);
- auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
- SI->getAlign());
-
- LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
-
+ ++NumMemCpyInstr;
+ return true;
+ }
+ }
+ }
+ }
+
+ // There are two cases that are interesting for this code to handle: memcpy
+ // and memset. Right now we only handle memset.
+
+  // Ensure that the value being stored is something that can be memset a
+  // byte at a time, like "0" or "-1" of any width, as well as things like
+  // 0xA0A0A0A0 and 0.0.
+ auto *V = SI->getOperand(0);
+ if (Value *ByteVal = isBytewiseValue(V, DL)) {
+ if (Instruction *I = tryMergingIntoMemset(SI, SI->getPointerOperand(),
+ ByteVal)) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+
+ // If we have an aggregate, we try to promote it to memset regardless
+ // of opportunity for merging as it can expose optimization opportunities
+ // in subsequent passes.
+ auto *T = V->getType();
+ if (T->isAggregateType()) {
+ uint64_t Size = DL.getTypeStoreSize(T);
+ IRBuilder<> Builder(SI);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
+ SI->getAlign());
+
+ LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(SI)));
auto *LastDef =
@@ -823,78 +823,78 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
}
eraseInstruction(SI);
- NumMemSetInfer++;
-
- // Make sure we do not invalidate the iterator.
- BBI = M->getIterator();
- return true;
- }
- }
-
- return false;
-}
-
-bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
- // See if there is another memset or store neighboring this memset which
- // allows us to widen out the memset to do a single larger store.
- if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
- if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
- MSI->getValue())) {
- BBI = I->getIterator(); // Don't invalidate iterator.
- return true;
- }
- return false;
-}
-
-/// Takes a memcpy and a call that it depends on,
-/// and checks for the possibility of a call slot optimization by having
-/// the call write its result directly into the destination of the memcpy.
+ NumMemSetInfer++;
+
+ // Make sure we do not invalidate the iterator.
+ BBI = M->getIterator();
+ return true;
+ }
+ }
+
+ return false;
+}
+
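A minimal C++-level sketch of the two store patterns processStore handles above; the struct name and size are illustrative only.

#include <cstring>

struct Big { char Data[64]; };

// An aggregate load/store pair is promoted to a memcpy (or a memmove when the
// source and destination may alias):
void copy_big(Big *Dst, const Big *Src) {
  *Dst = *Src;        // becomes memcpy(Dst, Src, 64)
}

// A store of a byte-splattable aggregate value is promoted to a memset:
void clear_big(Big *Dst) {
  *Dst = Big{};       // becomes memset(Dst, 0, 64)
}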
+bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
+ // See if there is another memset or store neighboring this memset which
+ // allows us to widen out the memset to do a single larger store.
+ if (isa<ConstantInt>(MSI->getLength()) && !MSI->isVolatile())
+ if (Instruction *I = tryMergingIntoMemset(MSI, MSI->getDest(),
+ MSI->getValue())) {
+ BBI = I->getIterator(); // Don't invalidate iterator.
+ return true;
+ }
+ return false;
+}
+
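A small sketch of the widening that processMemSet enables through tryMergingIntoMemset; the sizes are invented for illustration.

#include <cstdint>
#include <cstring>

// A memset followed by neighbouring stores of the same splat byte...
void clear_with_tail(uint8_t *Buf) {
  std::memset(Buf, 0, 14);
  Buf[14] = 0;
  Buf[15] = 0;
}

// ...is widened into one larger memset:
void clear_with_tail_widened(uint8_t *Buf) {
  std::memset(Buf, 0, 16);
}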
+/// Takes a memcpy and a call that it depends on,
+/// and checks for the possibility of a call slot optimization by having
+/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
Instruction *cpyStore, Value *cpyDest,
- Value *cpySrc, uint64_t cpyLen,
- Align cpyAlign, CallInst *C) {
- // The general transformation to keep in mind is
- //
- // call @func(..., src, ...)
- // memcpy(dest, src, ...)
- //
- // ->
- //
- // memcpy(dest, src, ...)
- // call @func(..., dest, ...)
- //
- // Since moving the memcpy is technically awkward, we additionally check that
- // src only holds uninitialized values at the moment of the call, meaning that
- // the memcpy can be discarded rather than moved.
-
- // Lifetime marks shouldn't be operated on.
- if (Function *F = C->getCalledFunction())
- if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
- return false;
-
- // Require that src be an alloca. This simplifies the reasoning considerably.
- AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
- if (!srcAlloca)
- return false;
-
- ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
- if (!srcArraySize)
- return false;
-
+ Value *cpySrc, uint64_t cpyLen,
+ Align cpyAlign, CallInst *C) {
+ // The general transformation to keep in mind is
+ //
+ // call @func(..., src, ...)
+ // memcpy(dest, src, ...)
+ //
+ // ->
+ //
+ // memcpy(dest, src, ...)
+ // call @func(..., dest, ...)
+ //
+ // Since moving the memcpy is technically awkward, we additionally check that
+ // src only holds uninitialized values at the moment of the call, meaning that
+ // the memcpy can be discarded rather than moved.
+
+ // Lifetime marks shouldn't be operated on.
+ if (Function *F = C->getCalledFunction())
+ if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
+ return false;
+
+ // Require that src be an alloca. This simplifies the reasoning considerably.
+ AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
+ if (!srcAlloca)
+ return false;
+
+ ConstantInt *srcArraySize = dyn_cast<ConstantInt>(srcAlloca->getArraySize());
+ if (!srcArraySize)
+ return false;
+
const DataLayout &DL = cpyLoad->getModule()->getDataLayout();
- uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
- srcArraySize->getZExtValue();
-
- if (cpyLen < srcSize)
- return false;
-
- // Check that accessing the first srcSize bytes of dest will not cause a
- // trap. Otherwise the transform is invalid since it might cause a trap
- // to occur earlier than it otherwise would.
+ uint64_t srcSize = DL.getTypeAllocSize(srcAlloca->getAllocatedType()) *
+ srcArraySize->getZExtValue();
+
+ if (cpyLen < srcSize)
+ return false;
+
+ // Check that accessing the first srcSize bytes of dest will not cause a
+ // trap. Otherwise the transform is invalid since it might cause a trap
+ // to occur earlier than it otherwise would.
if (!isDereferenceableAndAlignedPointer(cpyDest, Align(1), APInt(64, cpyLen),
DL, C, DT))
return false;
-
+
// Make sure that nothing can observe cpyDest being written early. There are
// a number of cases to consider:
// 1. cpyDest cannot be accessed between C and cpyStore as a precondition of
@@ -910,51 +910,51 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
// renders accesses from other threads undefined.
// TODO: This is currently not checked.
if (mayBeVisibleThroughUnwinding(cpyDest, C, cpyStore))
- return false;
-
- // Check that dest points to memory that is at least as aligned as src.
- Align srcAlign = srcAlloca->getAlign();
- bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
- // If dest is not aligned enough and we can't increase its alignment then
- // bail out.
- if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
- return false;
-
- // Check that src is not accessed except via the call and the memcpy. This
- // guarantees that it holds only undefined values when passed in (so the final
- // memcpy can be dropped), that it is not read or written between the call and
- // the memcpy, and that writing beyond the end of it is undefined.
+ return false;
+
+ // Check that dest points to memory that is at least as aligned as src.
+ Align srcAlign = srcAlloca->getAlign();
+ bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
+ // If dest is not aligned enough and we can't increase its alignment then
+ // bail out.
+ if (!isDestSufficientlyAligned && !isa<AllocaInst>(cpyDest))
+ return false;
+
+ // Check that src is not accessed except via the call and the memcpy. This
+ // guarantees that it holds only undefined values when passed in (so the final
+ // memcpy can be dropped), that it is not read or written between the call and
+ // the memcpy, and that writing beyond the end of it is undefined.
SmallVector<User *, 8> srcUseList(srcAlloca->users());
- while (!srcUseList.empty()) {
- User *U = srcUseList.pop_back_val();
-
- if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
+ while (!srcUseList.empty()) {
+ User *U = srcUseList.pop_back_val();
+
+ if (isa<BitCastInst>(U) || isa<AddrSpaceCastInst>(U)) {
append_range(srcUseList, U->users());
- continue;
- }
- if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
- if (!G->hasAllZeroIndices())
- return false;
-
+ continue;
+ }
+ if (GetElementPtrInst *G = dyn_cast<GetElementPtrInst>(U)) {
+ if (!G->hasAllZeroIndices())
+ return false;
+
append_range(srcUseList, U->users());
- continue;
- }
- if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
- if (IT->isLifetimeStartOrEnd())
- continue;
-
+ continue;
+ }
+ if (const IntrinsicInst *IT = dyn_cast<IntrinsicInst>(U))
+ if (IT->isLifetimeStartOrEnd())
+ continue;
+
if (U != C && U != cpyLoad)
- return false;
- }
-
- // Check that src isn't captured by the called function since the
- // transformation can cause aliasing issues in that case.
- for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
- if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
- return false;
-
- // Since we're changing the parameter to the callsite, we need to make sure
- // that what would be the new parameter dominates the callsite.
+ return false;
+ }
+
+ // Check that src isn't captured by the called function since the
+ // transformation can cause aliasing issues in that case.
+ for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
+ if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
+ return false;
+
+ // Since we're changing the parameter to the callsite, we need to make sure
+ // that what would be the new parameter dominates the callsite.
if (!DT->dominates(cpyDest, C)) {
// Support moving a constant index GEP before the call.
auto *GEP = dyn_cast<GetElementPtrInst>(cpyDest);
@@ -962,107 +962,107 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpyLoad,
DT->dominates(GEP->getPointerOperand(), C))
GEP->moveBefore(C);
else
- return false;
+ return false;
}
-
- // In addition to knowing that the call does not access src in some
- // unexpected manner, for example via a global, which we deduce from
- // the use analysis, we also need to know that it does not sneakily
- // access dest. We rely on AA to figure this out for us.
+
+ // In addition to knowing that the call does not access src in some
+ // unexpected manner, for example via a global, which we deduce from
+ // the use analysis, we also need to know that it does not sneakily
+ // access dest. We rely on AA to figure this out for us.
ModRefInfo MR = AA->getModRefInfo(C, cpyDest, LocationSize::precise(srcSize));
- // If necessary, perform additional analysis.
- if (isModOrRefSet(MR))
+ // If necessary, perform additional analysis.
+ if (isModOrRefSet(MR))
MR = AA->callCapturesBefore(C, cpyDest, LocationSize::precise(srcSize), DT);
- if (isModOrRefSet(MR))
- return false;
-
- // We can't create address space casts here because we don't know if they're
- // safe for the target.
- if (cpySrc->getType()->getPointerAddressSpace() !=
- cpyDest->getType()->getPointerAddressSpace())
- return false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
- cpySrc->getType()->getPointerAddressSpace() !=
- C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
- return false;
-
- // All the checks have passed, so do the transformation.
- bool changedArgument = false;
- for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
- if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
- Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
- : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
- cpyDest->getName(), C);
- changedArgument = true;
- if (C->getArgOperand(ArgI)->getType() == Dest->getType())
- C->setArgOperand(ArgI, Dest);
- else
- C->setArgOperand(ArgI, CastInst::CreatePointerCast(
- Dest, C->getArgOperand(ArgI)->getType(),
- Dest->getName(), C));
- }
-
- if (!changedArgument)
- return false;
-
- // If the destination wasn't sufficiently aligned then increase its alignment.
- if (!isDestSufficientlyAligned) {
- assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
- cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
- }
-
- // Drop any cached information about the call, because we may have changed
- // its dependence information by changing its parameter.
+ if (isModOrRefSet(MR))
+ return false;
+
+ // We can't create address space casts here because we don't know if they're
+ // safe for the target.
+ if (cpySrc->getType()->getPointerAddressSpace() !=
+ cpyDest->getType()->getPointerAddressSpace())
+ return false;
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
+ cpySrc->getType()->getPointerAddressSpace() !=
+ C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
+ return false;
+
+ // All the checks have passed, so do the transformation.
+ bool changedArgument = false;
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
+ Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
+ : CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
+ cpyDest->getName(), C);
+ changedArgument = true;
+ if (C->getArgOperand(ArgI)->getType() == Dest->getType())
+ C->setArgOperand(ArgI, Dest);
+ else
+ C->setArgOperand(ArgI, CastInst::CreatePointerCast(
+ Dest, C->getArgOperand(ArgI)->getType(),
+ Dest->getName(), C));
+ }
+
+ if (!changedArgument)
+ return false;
+
+ // If the destination wasn't sufficiently aligned then increase its alignment.
+ if (!isDestSufficientlyAligned) {
+ assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
+ }
+
+ // Drop any cached information about the call, because we may have changed
+ // its dependence information by changing its parameter.
if (MD)
MD->removeInstruction(C);
-
- // Update AA metadata
- // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
- // handled here, but combineMetadata doesn't support them yet
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_access_group};
+
+ // Update AA metadata
+ // FIXME: MD_tbaa_struct and MD_mem_parallel_loop_access should also be
+ // handled here, but combineMetadata doesn't support them yet
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_access_group};
combineMetadata(C, cpyLoad, KnownIDs, true);
-
+
++NumCallSlot;
- return true;
-}
-
-/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
-/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
- MemCpyInst *MDep) {
-  // We can only transform memcpys where the dest of one is the source of the
-  // other.
- if (M->getSource() != MDep->getDest() || MDep->isVolatile())
- return false;
-
-  // If the dep instruction is reading from our current input, then it is a noop
- // transfer and substituting the input won't change this instruction. Just
- // ignore the input and let someone else zap MDep. This handles cases like:
- // memcpy(a <- a)
- // memcpy(b <- a)
- if (M->getSource() == MDep->getSource())
- return false;
-
-  // Second, the lengths of the memcpys must be the same, or the preceding one
- // must be larger than the following one.
- ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
- ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
- if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
- return false;
-
- // Verify that the copied-from memory doesn't change in between the two
- // transfers. For example, in:
- // memcpy(a <- b)
- // *b = 42;
- // memcpy(c <- a)
- // It would be invalid to transform the second memcpy into memcpy(c <- b).
- //
- // TODO: If the code between M and MDep is transparent to the destination "c",
- // then we could still perform the xform by moving M up to the first memcpy.
+ return true;
+}
+
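A rough source-level picture of the call slot optimization whose legality checks the function above implements; produce() is a hypothetical callee that is assumed not to capture or otherwise publish its argument.

#include <cstring>

struct Result { char Bytes[32]; };

void produce(Result *Out);  // hypothetical; assumed not to capture Out

// The callee fills a local temporary that is then copied into Dest...
void call_then_copy(Result *Dest) {
  Result Tmp;
  produce(&Tmp);
  std::memcpy(Dest, &Tmp, sizeof(Result));
}

// ...once all the checks pass, the callee can write straight into Dest and
// both the temporary and the memcpy disappear:
void call_into_dest(Result *Dest) {
  produce(Dest);
}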
+/// We've found that the (upward scanning) memory dependence of memcpy 'M' is
+/// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
+bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
+ MemCpyInst *MDep) {
+  // We can only transform memcpys where the dest of one is the source of the
+  // other.
+ if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+ return false;
+
+  // If the dep instruction is reading from our current input, then it is a noop
+ // transfer and substituting the input won't change this instruction. Just
+ // ignore the input and let someone else zap MDep. This handles cases like:
+ // memcpy(a <- a)
+ // memcpy(b <- a)
+ if (M->getSource() == MDep->getSource())
+ return false;
+
+  // Second, the lengths of the memcpys must be the same, or the preceding one
+ // must be larger than the following one.
+ ConstantInt *MDepLen = dyn_cast<ConstantInt>(MDep->getLength());
+ ConstantInt *MLen = dyn_cast<ConstantInt>(M->getLength());
+ if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the two
+ // transfers. For example, in:
+ // memcpy(a <- b)
+ // *b = 42;
+ // memcpy(c <- a)
+ // It would be invalid to transform the second memcpy into memcpy(c <- b).
+ //
+ // TODO: If the code between M and MDep is transparent to the destination "c",
+ // then we could still perform the xform by moving M up to the first memcpy.
if (EnableMemorySSA) {
// TODO: It would be sufficient to check the MDep source up to the memcpy
// size of M, rather than MDep.
@@ -1078,32 +1078,32 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
-
- // If the dest of the second might alias the source of the first, then the
- // source and dest might overlap. We still want to eliminate the intermediate
- // value, but we have to generate a memmove instead of memcpy.
- bool UseMemMove = false;
+
+ // If the dest of the second might alias the source of the first, then the
+ // source and dest might overlap. We still want to eliminate the intermediate
+ // value, but we have to generate a memmove instead of memcpy.
+ bool UseMemMove = false;
if (!AA->isNoAlias(MemoryLocation::getForDest(M),
MemoryLocation::getForSource(MDep)))
- UseMemMove = true;
-
- // If all checks passed, then we can transform M.
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
- << *MDep << '\n' << *M << '\n');
-
- // TODO: Is this worth it if we're creating a less aligned memcpy? For
- // example we could be moving from movaps -> movq on x86.
- IRBuilder<> Builder(M);
+ UseMemMove = true;
+
+ // If all checks passed, then we can transform M.
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy->memcpy src:\n"
+ << *MDep << '\n' << *M << '\n');
+
+ // TODO: Is this worth it if we're creating a less aligned memcpy? For
+ // example we could be moving from movaps -> movq on x86.
+ IRBuilder<> Builder(M);
Instruction *NewM;
- if (UseMemMove)
+ if (UseMemMove)
NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
- else
+ else
NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(),
MDep->getRawSource(), MDep->getSourceAlign(),
M->getLength(), M->isVolatile());
-
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M)));
auto *LastDef = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(M));
@@ -1111,40 +1111,40 @@ bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
}
- // Remove the instruction we're replacing.
+ // Remove the instruction we're replacing.
eraseInstruction(M);
- ++NumMemCpyInstr;
- return true;
-}
-
-/// We've found that the (upward scanning) memory dependence of \p MemCpy is
-/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
-/// weren't copied over by \p MemCpy.
-///
-/// In other words, transform:
-/// \code
-/// memset(dst, c, dst_size);
-/// memcpy(dst, src, src_size);
-/// \endcode
-/// into:
-/// \code
-/// memcpy(dst, src, src_size);
-/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
-/// \endcode
-bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
- // We can only transform memset/memcpy with the same destination.
- if (MemSet->getDest() != MemCpy->getDest())
- return false;
-
+ ++NumMemCpyInstr;
+ return true;
+}
+
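For intuition, here is the memcpy-memcpy forwarding performed above, shown at the C++ source level with invented names; it assumes B is not modified between the two copies, mirroring the clobber check in the code.

#include <cstddef>
#include <cstring>

// memcpy(b <- a); memcpy(c <- b), with b unchanged in between...
void chain(char *C, char *B, const char *A, size_t N) {
  std::memcpy(B, A, N);
  std::memcpy(C, B, N);
}

// ...the second copy can read from the original source instead (a memmove is
// emitted if C might overlap A):
void chain_forwarded(char *C, char *B, const char *A, size_t N) {
  std::memcpy(B, A, N);
  std::memcpy(C, A, N);
}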
+/// We've found that the (upward scanning) memory dependence of \p MemCpy is
+/// \p MemSet. Try to simplify \p MemSet to only set the trailing bytes that
+/// weren't copied over by \p MemCpy.
+///
+/// In other words, transform:
+/// \code
+/// memset(dst, c, dst_size);
+/// memcpy(dst, src, src_size);
+/// \endcode
+/// into:
+/// \code
+/// memcpy(dst, src, src_size);
+/// memset(dst + src_size, c, dst_size <= src_size ? 0 : dst_size - src_size);
+/// \endcode
+bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+ // We can only transform memset/memcpy with the same destination.
+ if (MemSet->getDest() != MemCpy->getDest())
+ return false;
+
// Check that src and dst of the memcpy aren't the same. While memcpy
// operands cannot partially overlap, exact equality is allowed.
if (!AA->isNoAlias(MemoryLocation(MemCpy->getSource(),
LocationSize::precise(1)),
MemoryLocation(MemCpy->getDest(),
LocationSize::precise(1))))
- return false;
-
+ return false;
+
if (EnableMemorySSA) {
// We know that dst up to src_size is not written. We now need to make sure
// that dst up to dst_size is not accessed. (If we did not move the memset,
@@ -1164,44 +1164,44 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
return false;
}
- // Use the same i8* dest as the memcpy, killing the memset dest if different.
- Value *Dest = MemCpy->getRawDest();
- Value *DestSize = MemSet->getLength();
- Value *SrcSize = MemCpy->getLength();
-
+ // Use the same i8* dest as the memcpy, killing the memset dest if different.
+ Value *Dest = MemCpy->getRawDest();
+ Value *DestSize = MemSet->getLength();
+ Value *SrcSize = MemCpy->getLength();
+
if (mayBeVisibleThroughUnwinding(Dest, MemSet, MemCpy))
return false;
- // By default, create an unaligned memset.
- unsigned Align = 1;
- // If Dest is aligned, and SrcSize is constant, use the minimum alignment
- // of the sum.
- const unsigned DestAlign =
- std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
- if (DestAlign > 1)
- if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
- Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
-
- IRBuilder<> Builder(MemCpy);
-
- // If the sizes have different types, zext the smaller one.
- if (DestSize->getType() != SrcSize->getType()) {
- if (DestSize->getType()->getIntegerBitWidth() >
- SrcSize->getType()->getIntegerBitWidth())
- SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
- else
- DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
- }
-
- Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
- Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
- Value *MemsetLen = Builder.CreateSelect(
- Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
+ // By default, create an unaligned memset.
+ unsigned Align = 1;
+ // If Dest is aligned, and SrcSize is constant, use the minimum alignment
+ // of the sum.
+ const unsigned DestAlign =
+ std::max(MemSet->getDestAlignment(), MemCpy->getDestAlignment());
+ if (DestAlign > 1)
+ if (ConstantInt *SrcSizeC = dyn_cast<ConstantInt>(SrcSize))
+ Align = MinAlign(SrcSizeC->getZExtValue(), DestAlign);
+
+ IRBuilder<> Builder(MemCpy);
+
+ // If the sizes have different types, zext the smaller one.
+ if (DestSize->getType() != SrcSize->getType()) {
+ if (DestSize->getType()->getIntegerBitWidth() >
+ SrcSize->getType()->getIntegerBitWidth())
+ SrcSize = Builder.CreateZExt(SrcSize, DestSize->getType());
+ else
+ DestSize = Builder.CreateZExt(DestSize, SrcSize->getType());
+ }
+
+ Value *Ule = Builder.CreateICmpULE(DestSize, SrcSize);
+ Value *SizeDiff = Builder.CreateSub(DestSize, SrcSize);
+ Value *MemsetLen = Builder.CreateSelect(
+ Ule, ConstantInt::getNullValue(DestSize->getType()), SizeDiff);
Instruction *NewMemSet = Builder.CreateMemSet(
- Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
- SrcSize),
- MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
-
+ Builder.CreateGEP(Dest->getType()->getPointerElementType(), Dest,
+ SrcSize),
+ MemSet->getOperand(1), MemsetLen, MaybeAlign(Align));
+
if (MSSAU) {
assert(isa<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(MemCpy)) &&
"MemCpy must be a MemoryDef");
@@ -1216,24 +1216,24 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy,
}
eraseInstruction(MemSet);
- return true;
-}
-
-/// Determine whether the instruction has undefined content for the given Size,
-/// either because it was freshly alloca'd or started its lifetime.
-static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
- if (isa<AllocaInst>(I))
- return true;
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
- if (LTSize->getZExtValue() >= Size->getZExtValue())
- return true;
-
- return false;
-}
-
+ return true;
+}
+
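A minimal sketch of the memset trimming described in the doc comment above, with invented sizes and assuming the buffers satisfy the overlap checks the code performs.

#include <cstring>

// memset(dst, c, dst_size) followed by memcpy(dst, src, src_size)...
void set_then_copy(char *Dst, const char *Src) {
  std::memset(Dst, 0, 64);
  std::memcpy(Dst, Src, 16);
}

// ...only the trailing dst_size - src_size bytes still need the memset:
void set_then_copy_trimmed(char *Dst, const char *Src) {
  std::memcpy(Dst, Src, 16);
  std::memset(Dst + 16, 0, 64 - 16);
}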
+/// Determine whether the instruction has undefined content for the given Size,
+/// either because it was freshly alloca'd or started its lifetime.
+static bool hasUndefContents(Instruction *I, ConstantInt *Size) {
+ if (isa<AllocaInst>(I))
+ return true;
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ if (ConstantInt *LTSize = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ if (LTSize->getZExtValue() >= Size->getZExtValue())
+ return true;
+
+ return false;
+}
+
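For intuition only (not from the LLVM sources): a copy whose source is a freshly created local that was never written carries nothing but undefined bytes, which is what lets the caller delete such a memcpy outright.

#include <cstring>

void copy_from_undef(char *Dst) {
  char Tmp[32];               // freshly alloca'd, never initialized
  std::memcpy(Dst, Tmp, 32);  // reads only undefined content; removable
}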
static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
MemoryDef *Def, ConstantInt *Size) {
if (MSSA->isLiveOnEntryDef(Def))
@@ -1252,41 +1252,41 @@ static bool hasUndefContentsMSSA(MemorySSA *MSSA, AliasAnalysis *AA, Value *V,
return false;
}
-/// Transform memcpy to memset when its source was just memset.
-/// In other words, turn:
-/// \code
-/// memset(dst1, c, dst1_size);
-/// memcpy(dst2, dst1, dst2_size);
-/// \endcode
-/// into:
-/// \code
-/// memset(dst1, c, dst1_size);
-/// memset(dst2, c, dst2_size);
-/// \endcode
-/// When dst2_size <= dst1_size.
-///
-/// The \p MemCpy must have a Constant length.
-bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
- MemSetInst *MemSet) {
-  // Make sure this is memcpy(..., memset(...), ...), that is, we are memsetting
-  // and memcpying from the same address. Otherwise it is hard to reason about.
+/// Transform memcpy to memset when its source was just memset.
+/// In other words, turn:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memcpy(dst2, dst1, dst2_size);
+/// \endcode
+/// into:
+/// \code
+/// memset(dst1, c, dst1_size);
+/// memset(dst2, c, dst2_size);
+/// \endcode
+/// When dst2_size <= dst1_size.
+///
+/// The \p MemCpy must have a Constant length.
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
+ MemSetInst *MemSet) {
+  // Make sure this is memcpy(..., memset(...), ...), that is, we are memsetting
+  // and memcpying from the same address. Otherwise it is hard to reason about.
if (!AA->isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource()))
- return false;
-
- // A known memset size is required.
- ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
- if (!MemSetSize)
- return false;
-
- // Make sure the memcpy doesn't read any more than what the memset wrote.
- // Don't worry about sizes larger than i64.
- ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
- if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
- // If the memcpy is larger than the memset, but the memory was undef prior
- // to the memset, we can just ignore the tail. Technically we're only
- // interested in the bytes from MemSetSize..CopySize here, but as we can't
- // easily represent this location, we use the full 0..CopySize range.
- MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
+ return false;
+
+ // A known memset size is required.
+ ConstantInt *MemSetSize = dyn_cast<ConstantInt>(MemSet->getLength());
+ if (!MemSetSize)
+ return false;
+
+ // Make sure the memcpy doesn't read any more than what the memset wrote.
+ // Don't worry about sizes larger than i64.
+ ConstantInt *CopySize = cast<ConstantInt>(MemCpy->getLength());
+ if (CopySize->getZExtValue() > MemSetSize->getZExtValue()) {
+ // If the memcpy is larger than the memset, but the memory was undef prior
+ // to the memset, we can just ignore the tail. Technically we're only
+ // interested in the bytes from MemSetSize..CopySize here, but as we can't
+ // easily represent this location, we use the full 0..CopySize range.
+ MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy);
bool CanReduceSize = false;
if (EnableMemorySSA) {
MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet);
@@ -1303,11 +1303,11 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
}
if (!CanReduceSize)
- return false;
+ return false;
CopySize = MemSetSize;
- }
-
- IRBuilder<> Builder(MemCpy);
+ }
+
+ IRBuilder<> Builder(MemCpy);
Instruction *NewM =
Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
CopySize, MaybeAlign(MemCpy->getDestAlignment()));
@@ -1318,31 +1318,31 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
MSSAU->insertDef(cast<MemoryDef>(NewAccess), /*RenameUses=*/true);
}
- return true;
-}
-
-/// Perform simplification of memcpys. If we have memcpy A
-/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
-/// B to be a memcpy from X to Z (or potentially a memmove, depending on
-/// circumstances). This allows later passes to remove the first memcpy
-/// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
- // We can only optimize non-volatile memcpy's.
- if (M->isVolatile()) return false;
-
- // If the source and destination of the memcpy are the same, then zap it.
- if (M->getSource() == M->getDest()) {
- ++BBI;
+ return true;
+}
+
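A brief source-level sketch of the transform above, with invented buffer names and sizes, assuming Dst1 and Dst2 do not overlap:

#include <cstring>

// A memcpy that reads only bytes written by a preceding memset...
void memset_then_copy(char *Dst2, char *Dst1) {
  std::memset(Dst1, 0xAA, 64);
  std::memcpy(Dst2, Dst1, 32);
}

// ...can store the splat byte directly:
void memset_then_copy_folded(char *Dst2, char *Dst1) {
  std::memset(Dst1, 0xAA, 64);
  std::memset(Dst2, 0xAA, 32);
}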
+/// Perform simplification of memcpys. If we have memcpy A
+/// which copies X to Y, and memcpy B which copies Y to Z, then we can rewrite
+/// B to be a memcpy from X to Z (or potentially a memmove, depending on
+/// circumstances). This allows later passes to remove the first memcpy
+/// altogether.
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
+ // We can only optimize non-volatile memcpy's.
+ if (M->isVolatile()) return false;
+
+ // If the source and destination of the memcpy are the same, then zap it.
+ if (M->getSource() == M->getDest()) {
+ ++BBI;
eraseInstruction(M);
- return true;
- }
-
- // If copying from a constant, try to turn the memcpy into a memset.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
- if (GV->isConstant() && GV->hasDefinitiveInitializer())
- if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
- M->getModule()->getDataLayout())) {
- IRBuilder<> Builder(M);
+ return true;
+ }
+
+ // If copying from a constant, try to turn the memcpy into a memset.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(M->getSource()))
+ if (GV->isConstant() && GV->hasDefinitiveInitializer())
+ if (Value *ByteVal = isBytewiseValue(GV->getInitializer(),
+ M->getModule()->getDataLayout())) {
+ IRBuilder<> Builder(M);
Instruction *NewM =
Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
MaybeAlign(M->getDestAlignment()), false);
@@ -1355,17 +1355,17 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
eraseInstruction(M);
- ++NumCpyToSet;
- return true;
- }
-
+ ++NumCpyToSet;
+ return true;
+ }
+
if (EnableMemorySSA) {
MemoryUseOrDef *MA = MSSA->getMemoryAccess(M);
MemoryAccess *AnyClobber = MSSA->getWalker()->getClobberingMemoryAccess(MA);
MemoryLocation DestLoc = MemoryLocation::getForDest(M);
const MemoryAccess *DestClobber =
MSSA->getWalker()->getClobberingMemoryAccess(AnyClobber, DestLoc);
-
+
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
     // The memcpy must post-dominate the memset, so limit this to the same basic
@@ -1375,11 +1375,11 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (DestClobber->getBlock() == M->getParent())
if (processMemSetMemCpyDependence(M, MDep))
return true;
-
+
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
-
+
MemoryAccess *SrcClobber = MSSA->getWalker()->getClobberingMemoryAccess(
AnyClobber, MemoryLocation::getForSource(M));
@@ -1431,19 +1431,19 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
LLVM_DEBUG(dbgs() << "Removed memcpy from undef\n");
eraseInstruction(M);
++NumMemCpyInstr;
- return true;
- }
- }
+ return true;
+ }
+ }
} else {
MemDepResult DepInfo = MD->getDependency(M);
-
+
// Try to turn a partially redundant memset + memcpy into
// memcpy + smaller memset. We don't need the memcpy size for this.
if (DepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
if (processMemSetMemCpyDependence(M, MDep))
return true;
-
+
// The optimizations after this point require the memcpy size.
ConstantInt *CopySize = dyn_cast<ConstantInt>(M->getLength());
if (!CopySize) return false;
@@ -1468,8 +1468,8 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
return true;
}
}
- }
-
+ }
+
MemoryLocation SrcLoc = MemoryLocation::getForSource(M);
MemDepResult SrcDepInfo = MD->getPointerDependencyFrom(
SrcLoc, true, M->getIterator(), M->getParent());
@@ -1481,10 +1481,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
if (hasUndefContents(SrcDepInfo.getInst(), CopySize)) {
eraseInstruction(M);
++NumMemCpyInstr;
- return true;
- }
+ return true;
+ }
}
-
+
if (SrcDepInfo.isClobber())
if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
if (performMemCpyToMemSetOptzn(M, MDep)) {
@@ -1494,49 +1494,49 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
}
}
- return false;
-}
-
-/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
-/// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
- if (!TLI->has(LibFunc_memmove))
- return false;
-
- // See if the pointers alias.
+ return false;
+}
+
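One of the simpler cases handled by processMemCpy above is copying from a constant global whose initializer splats to a single byte; a rough illustration with invented names:

#include <cstring>

static const char Zeros[32] = {};   // constant, all-zero initializer

void init_from_global(char *Dst) {
  std::memcpy(Dst, Zeros, sizeof(Zeros));
}

// ...is rewritten as a memset of the splat byte:
void init_from_global_folded(char *Dst) {
  std::memset(Dst, 0, sizeof(Zeros));
}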
+/// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
+/// not to alias.
+bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+ if (!TLI->has(LibFunc_memmove))
+ return false;
+
+ // See if the pointers alias.
if (!AA->isNoAlias(MemoryLocation::getForDest(M),
MemoryLocation::getForSource(M)))
- return false;
-
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
- << "\n");
-
- // If not, then we know we can transform this.
- Type *ArgTys[3] = { M->getRawDest()->getType(),
- M->getRawSource()->getType(),
- M->getLength()->getType() };
- M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
- Intrinsic::memcpy, ArgTys));
-
+ return false;
+
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Optimizing memmove -> memcpy: " << *M
+ << "\n");
+
+ // If not, then we know we can transform this.
+ Type *ArgTys[3] = { M->getRawDest()->getType(),
+ M->getRawSource()->getType(),
+ M->getLength()->getType() };
+ M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
+ Intrinsic::memcpy, ArgTys));
+
// For MemorySSA nothing really changes (except that memcpy may imply stricter
// aliasing guarantees).
-  // MemDep may have overly conservative information about this instruction;
-  // just conservatively flush it from the cache.
+  // MemDep may have overly conservative information about this instruction;
+  // just conservatively flush it from the cache.
if (MD)
MD->removeInstruction(M);
-
- ++NumMoveToCpy;
- return true;
-}
-
-/// This is called on every byval argument in call sites.
-bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
- const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
- // Find out what feeds this byval argument.
- Value *ByValArg = CB.getArgOperand(ArgNo);
- Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
- uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
+
+ ++NumMoveToCpy;
+ return true;
+}
+
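A tiny sketch of the memmove relaxation above (illustrative names only): two distinct local buffers provably cannot overlap, so alias analysis lets the memmove become the cheaper memcpy intrinsic.

#include <cstring>

void move_disjoint(char *Out) {
  char A[16] = {1};
  char B[16];
  std::memmove(B, A, sizeof(A));   // -> memcpy(B, A, 16)
  std::memcpy(Out, B, sizeof(B));  // keep B observable
}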
+/// This is called on every byval argument in call sites.
+bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
+ const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
+ // Find out what feeds this byval argument.
+ Value *ByValArg = CB.getArgOperand(ArgNo);
+ Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
+ uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
MemoryLocation Loc(ByValArg, LocationSize::precise(ByValSize));
MemCpyInst *MDep = nullptr;
if (EnableMemorySSA) {
@@ -1552,43 +1552,43 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
return false;
MDep = dyn_cast<MemCpyInst>(DepInfo.getInst());
}
-
- // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
- // a memcpy, see if we can byval from the source of the memcpy instead of the
- // result.
- if (!MDep || MDep->isVolatile() ||
- ByValArg->stripPointerCasts() != MDep->getDest())
- return false;
-
-  // The length of the memcpy must be larger than or equal to the size of the byval.
- ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
- if (!C1 || C1->getValue().getZExtValue() < ByValSize)
- return false;
-
- // Get the alignment of the byval. If the call doesn't specify the alignment,
- // then it is some target specific value that we can't know.
- MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
- if (!ByValAlign) return false;
-
- // If it is greater than the memcpy, then we check to see if we can force the
- // source of the memcpy to the alignment we need. If we fail, we bail out.
- MaybeAlign MemDepAlign = MDep->getSourceAlign();
- if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
+
+ // If the byval argument isn't fed by a memcpy, ignore it. If it is fed by
+ // a memcpy, see if we can byval from the source of the memcpy instead of the
+ // result.
+ if (!MDep || MDep->isVolatile() ||
+ ByValArg->stripPointerCasts() != MDep->getDest())
+ return false;
+
+  // The length of the memcpy must be larger than or equal to the size of the byval.
+ ConstantInt *C1 = dyn_cast<ConstantInt>(MDep->getLength());
+ if (!C1 || C1->getValue().getZExtValue() < ByValSize)
+ return false;
+
+ // Get the alignment of the byval. If the call doesn't specify the alignment,
+ // then it is some target specific value that we can't know.
+ MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
+ if (!ByValAlign) return false;
+
+ // If it is greater than the memcpy, then we check to see if we can force the
+ // source of the memcpy to the alignment we need. If we fail, we bail out.
+ MaybeAlign MemDepAlign = MDep->getSourceAlign();
+ if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, AC,
DT) < *ByValAlign)
- return false;
-
- // The address space of the memcpy source must match the byval argument
- if (MDep->getSource()->getType()->getPointerAddressSpace() !=
- ByValArg->getType()->getPointerAddressSpace())
- return false;
-
- // Verify that the copied-from memory doesn't change in between the memcpy and
- // the byval call.
- // memcpy(a <- b)
- // *b = 42;
- // foo(*a)
- // It would be invalid to transform the second memcpy into foo(*b).
+ return false;
+
+ // The address space of the memcpy source must match the byval argument
+ if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+ ByValArg->getType()->getPointerAddressSpace())
+ return false;
+
+ // Verify that the copied-from memory doesn't change in between the memcpy and
+ // the byval call.
+ // memcpy(a <- b)
+ // *b = 42;
+ // foo(*a)
+ // It would be invalid to transform the second memcpy into foo(*b).
if (EnableMemorySSA) {
if (writtenBetween(MSSA, MemoryLocation::getForSource(MDep),
MSSA->getMemoryAccess(MDep), MSSA->getMemoryAccess(&CB)))
@@ -1602,144 +1602,144 @@ bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
}
-
- Value *TmpCast = MDep->getSource();
- if (MDep->getSource()->getType() != ByValArg->getType()) {
- BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
- "tmpcast", &CB);
- // Set the tmpcast's DebugLoc to MDep's
- TmpBitCast->setDebugLoc(MDep->getDebugLoc());
- TmpCast = TmpBitCast;
- }
-
- LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
- << " " << *MDep << "\n"
- << " " << CB << "\n");
-
- // Otherwise we're good! Update the byval argument.
- CB.setArgOperand(ArgNo, TmpCast);
- ++NumMemCpyInstr;
- return true;
-}
-
-/// Executes one iteration of MemCpyOptPass.
-bool MemCpyOptPass::iterateOnFunction(Function &F) {
- bool MadeChange = false;
-
- // Walk all instruction in the function.
- for (BasicBlock &BB : F) {
- // Skip unreachable blocks. For example processStore assumes that an
- // instruction in a BB can't be dominated by a later instruction in the
- // same BB (which is a scenario that can happen for an unreachable BB that
- // has itself as a predecessor).
+
+ Value *TmpCast = MDep->getSource();
+ if (MDep->getSource()->getType() != ByValArg->getType()) {
+    BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(),
+                                              ByValArg->getType(),
+                                              "tmpcast", &CB);
+ // Set the tmpcast's DebugLoc to MDep's
+ TmpBitCast->setDebugLoc(MDep->getDebugLoc());
+ TmpCast = TmpBitCast;
+ }
+
+ LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
+ << " " << *MDep << "\n"
+ << " " << CB << "\n");
+
+ // Otherwise we're good! Update the byval argument.
+ CB.setArgOperand(ArgNo, TmpCast);
+ ++NumMemCpyInstr;
+ return true;
+}
+
+/// Executes one iteration of MemCpyOptPass.
+bool MemCpyOptPass::iterateOnFunction(Function &F) {
+ bool MadeChange = false;
+
+  // Walk all instructions in the function.
+ for (BasicBlock &BB : F) {
+ // Skip unreachable blocks. For example processStore assumes that an
+ // instruction in a BB can't be dominated by a later instruction in the
+ // same BB (which is a scenario that can happen for an unreachable BB that
+ // has itself as a predecessor).
if (!DT->isReachableFromEntry(&BB))
- continue;
-
- for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
- // Avoid invalidating the iterator.
- Instruction *I = &*BI++;
-
- bool RepeatInstruction = false;
-
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- MadeChange |= processStore(SI, BI);
- else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
- RepeatInstruction = processMemSet(M, BI);
- else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
- RepeatInstruction = processMemCpy(M, BI);
- else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
- RepeatInstruction = processMemMove(M);
- else if (auto *CB = dyn_cast<CallBase>(I)) {
- for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
- if (CB->isByValArgument(i))
- MadeChange |= processByValArgument(*CB, i);
- }
-
- // Reprocess the instruction if desired.
- if (RepeatInstruction) {
- if (BI != BB.begin())
- --BI;
- MadeChange = true;
- }
- }
- }
-
- return MadeChange;
-}
-
-PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
+ continue;
+
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ // Avoid invalidating the iterator.
+ Instruction *I = &*BI++;
+
+ bool RepeatInstruction = false;
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ MadeChange |= processStore(SI, BI);
+ else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
+ RepeatInstruction = processMemSet(M, BI);
+ else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
+ RepeatInstruction = processMemCpy(M, BI);
+ else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
+ RepeatInstruction = processMemMove(M);
+ else if (auto *CB = dyn_cast<CallBase>(I)) {
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->isByValArgument(i))
+ MadeChange |= processByValArgument(*CB, i);
+ }
+
+ // Reprocess the instruction if desired.
+ if (RepeatInstruction) {
+ if (BI != BB.begin())
+ --BI;
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+PreservedAnalyses MemCpyOptPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *MD = !EnableMemorySSA ? &AM.getResult<MemoryDependenceAnalysis>(F)
: AM.getCachedResult<MemoryDependenceAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
auto *AC = &AM.getResult<AssumptionAnalysis>(F);
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *MSSA = EnableMemorySSA ? &AM.getResult<MemorySSAAnalysis>(F)
: AM.getCachedResult<MemorySSAAnalysis>(F);
-
+
bool MadeChange =
runImpl(F, MD, &TLI, AA, AC, DT, MSSA ? &MSSA->getMSSA() : nullptr);
- if (!MadeChange)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
+ if (!MadeChange)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
if (MD)
PA.preserve<MemoryDependenceAnalysis>();
if (MSSA)
PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
+ return PA;
+}
+
bool MemCpyOptPass::runImpl(Function &F, MemoryDependenceResults *MD_,
TargetLibraryInfo *TLI_, AliasAnalysis *AA_,
AssumptionCache *AC_, DominatorTree *DT_,
MemorySSA *MSSA_) {
- bool MadeChange = false;
- MD = MD_;
- TLI = TLI_;
+ bool MadeChange = false;
+ MD = MD_;
+ TLI = TLI_;
AA = AA_;
AC = AC_;
DT = DT_;
MSSA = MSSA_;
MemorySSAUpdater MSSAU_(MSSA_);
MSSAU = MSSA_ ? &MSSAU_ : nullptr;
- // If we don't have at least memset and memcpy, there is little point of doing
- // anything here. These are required by a freestanding implementation, so if
- // even they are disabled, there is no point in trying hard.
- if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
- return false;
-
- while (true) {
- if (!iterateOnFunction(F))
- break;
- MadeChange = true;
- }
-
+  // If we don't have at least memset and memcpy, there is little point in doing
+ // anything here. These are required by a freestanding implementation, so if
+ // even they are disabled, there is no point in trying hard.
+ if (!TLI->has(LibFunc_memset) || !TLI->has(LibFunc_memcpy))
+ return false;
+
+ while (true) {
+ if (!iterateOnFunction(F))
+ break;
+ MadeChange = true;
+ }
+
if (MSSA_ && VerifyMemorySSA)
MSSA_->verifyMemorySSA();
- MD = nullptr;
- return MadeChange;
-}
-
-/// This is the main transformation entry point for a function.
-bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
+ MD = nullptr;
+ return MadeChange;
+}
+
+/// This is the main transformation entry point for a function.
+bool MemCpyOptLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
auto *MDWP = !EnableMemorySSA
? &getAnalysis<MemoryDependenceWrapperPass>()
: getAnalysisIfAvailable<MemoryDependenceWrapperPass>();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *MSSAWP = EnableMemorySSA
? &getAnalysis<MemorySSAWrapperPass>()
: getAnalysisIfAvailable<MemorySSAWrapperPass>();
-
+
return Impl.runImpl(F, MDWP ? & MDWP->getMemDep() : nullptr, TLI, AA, AC, DT,
MSSAWP ? &MSSAWP->getMSSA() : nullptr);
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
index 2d9c612494..7f8b75ac88 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MergeICmps.cpp
@@ -1,629 +1,629 @@
-//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass turns chains of integer comparisons into memcmp (the memcmp is
-// later typically inlined as a chain of efficient hardware comparisons). This
-// typically benefits c++ member or nonmember operator==().
-//
-// The basic idea is to replace a longer chain of integer comparisons loaded
-// from contiguous memory locations into a shorter chain of larger integer
-// comparisons. Benefits are double:
-// - There are less jumps, and therefore less opportunities for mispredictions
-// and I-cache misses.
-// - Code size is smaller, both because jumps are removed and because the
-// encoding of a 2*n byte compare is smaller than that of two n-byte
-// compares.
-//
-// Example:
-//
-// struct S {
-// int a;
-// char b;
-// char c;
-// uint16_t d;
-// bool operator==(const S& o) const {
-// return a == o.a && b == o.b && c == o.c && d == o.d;
-// }
-// };
-//
-// Is optimized as :
-//
-// bool S::operator==(const S& o) const {
-// return memcmp(this, &o, 8) == 0;
-// }
-//
-// Which will later be expanded (ExpandMemCmp) as a single 8-bytes icmp.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MergeICmps.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include <algorithm>
-#include <numeric>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-namespace {
-
-#define DEBUG_TYPE "mergeicmps"
-
-// Returns true if the instruction is a simple load or a simple store
-static bool isSimpleLoadOrStore(const Instruction *I) {
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
- if (const StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
- return false;
-}
-
-// A BCE atom "Binary Compare Expression Atom" represents an integer load
-// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
-// at the top.
-struct BCEAtom {
- BCEAtom() = default;
- BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
- : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
-
- BCEAtom(const BCEAtom &) = delete;
- BCEAtom &operator=(const BCEAtom &) = delete;
-
- BCEAtom(BCEAtom &&that) = default;
- BCEAtom &operator=(BCEAtom &&that) {
- if (this == &that)
- return *this;
- GEP = that.GEP;
- LoadI = that.LoadI;
- BaseId = that.BaseId;
- Offset = std::move(that.Offset);
- return *this;
- }
-
- // We want to order BCEAtoms by (Base, Offset). However we cannot use
- // the pointer values for Base because these are non-deterministic.
- // To make sure that the sort order is stable, we first assign to each atom
- // base value an index based on its order of appearance in the chain of
- // comparisons. We call this index `BaseOrdering`. For example, for:
- // b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
- // | block 1 | | block 2 | | block 3 |
- // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
- // which is before block 2.
- // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
- bool operator<(const BCEAtom &O) const {
- return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
- }
-
- GetElementPtrInst *GEP = nullptr;
- LoadInst *LoadI = nullptr;
- unsigned BaseId = 0;
- APInt Offset;
-};
-
-// A class that assigns increasing ids to values in the order in which they are
-// seen. See comment in `BCEAtom::operator<()``.
-class BaseIdentifier {
-public:
- // Returns the id for value `Base`, after assigning one if `Base` has not been
- // seen before.
- int getBaseId(const Value *Base) {
- assert(Base && "invalid base");
- const auto Insertion = BaseToIndex.try_emplace(Base, Order);
- if (Insertion.second)
- ++Order;
- return Insertion.first->second;
- }
-
-private:
- unsigned Order = 1;
- DenseMap<const Value*, int> BaseToIndex;
-};
-
-// If this value is a load from a constant offset w.r.t. a base address, and
-// there are no other users of the load or address, returns the base address and
-// the offset.
-BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
- auto *const LoadI = dyn_cast<LoadInst>(Val);
- if (!LoadI)
- return {};
- LLVM_DEBUG(dbgs() << "load\n");
- if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
- LLVM_DEBUG(dbgs() << "used outside of block\n");
- return {};
- }
- // Do not optimize atomic loads to non-atomic memcmp
- if (!LoadI->isSimple()) {
- LLVM_DEBUG(dbgs() << "volatile or atomic\n");
- return {};
- }
- Value *const Addr = LoadI->getOperand(0);
- auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
- if (!GEP)
- return {};
- LLVM_DEBUG(dbgs() << "GEP\n");
- if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
- LLVM_DEBUG(dbgs() << "used outside of block\n");
- return {};
- }
- const auto &DL = GEP->getModule()->getDataLayout();
- if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) {
- LLVM_DEBUG(dbgs() << "not dereferenceable\n");
- // We need to make sure that we can do comparison in any order, so we
- // require memory to be unconditionnally dereferencable.
- return {};
- }
- APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
- if (!GEP->accumulateConstantOffset(DL, Offset))
- return {};
- return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
- Offset);
-}
-
-// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
-// example at the top.
-// The block might do extra work besides the atom comparison, in which case
-// doesOtherWork() returns true. Under some conditions, the block can be
-// split into the atom comparison part and the "other work" part
-// (see canSplit()).
-// Note: the terminology is misleading: the comparison is symmetric, so there
-// is no real {l/r}hs. What we want though is to have the same base on the
-// left (resp. right), so that we can detect consecutive loads. To ensure this
-// we put the smallest atom on the left.
-class BCECmpBlock {
- public:
- BCECmpBlock() {}
-
- BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
- : Lhs_(std::move(L)), Rhs_(std::move(R)), SizeBits_(SizeBits) {
- if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
- }
-
- bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }
-
- // Assert the block is consistent: If valid, it should also have
- // non-null members besides Lhs_ and Rhs_.
- void AssertConsistent() const {
- if (IsValid()) {
- assert(BB);
- assert(CmpI);
- assert(BranchI);
- }
- }
-
- const BCEAtom &Lhs() const { return Lhs_; }
- const BCEAtom &Rhs() const { return Rhs_; }
- int SizeBits() const { return SizeBits_; }
-
- // Returns true if the block does other works besides comparison.
- bool doesOtherWork() const;
-
- // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
- // instructions in the block.
- bool canSplit(AliasAnalysis &AA) const;
-
- // Return true if this all the relevant instructions in the BCE-cmp-block can
- // be sunk below this instruction. By doing this, we know we can separate the
- // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
- // block.
- bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
- AliasAnalysis &AA) const;
-
- // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
- // instructions. Split the old block and move all non-BCE-cmp-insts into the
- // new parent block.
- void split(BasicBlock *NewParent, AliasAnalysis &AA) const;
-
- // The basic block where this comparison happens.
- BasicBlock *BB = nullptr;
- // The ICMP for this comparison.
- ICmpInst *CmpI = nullptr;
- // The terminating branch.
- BranchInst *BranchI = nullptr;
- // The block requires splitting.
- bool RequireSplit = false;
-
-private:
- BCEAtom Lhs_;
- BCEAtom Rhs_;
- int SizeBits_ = 0;
-};
-
-bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
- DenseSet<Instruction *> &BlockInsts,
- AliasAnalysis &AA) const {
- // If this instruction has side effects and its in middle of the BCE cmp block
- // instructions, then bail for now.
- if (Inst->mayHaveSideEffects()) {
- // Bail if this is not a simple load or store
- if (!isSimpleLoadOrStore(Inst))
- return false;
- // Disallow stores that might alias the BCE operands
- MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
- MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
- if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
- isModSet(AA.getModRefInfo(Inst, RLoc)))
- return false;
- }
- // Make sure this instruction does not use any of the BCE cmp block
- // instructions as operand.
- for (auto BI : BlockInsts) {
- if (is_contained(Inst->operands(), BI))
- return false;
- }
- return true;
-}
-
-void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const {
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- llvm::SmallVector<Instruction *, 4> OtherInsts;
- for (Instruction &Inst : *BB) {
- if (BlockInsts.count(&Inst))
- continue;
- assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
- "Split unsplittable block");
- // This is a non-BCE-cmp-block instruction. And it can be separated
- // from the BCE-cmp-block instruction.
- OtherInsts.push_back(&Inst);
- }
-
- // Do the actual spliting.
- for (Instruction *Inst : reverse(OtherInsts)) {
- Inst->moveBefore(&*NewParent->begin());
- }
-}
-
-bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- for (Instruction &Inst : *BB) {
- if (!BlockInsts.count(&Inst)) {
- if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
- return false;
- }
- }
- return true;
-}
-
-bool BCECmpBlock::doesOtherWork() const {
- AssertConsistent();
- // All the instructions we care about in the BCE cmp block.
- DenseSet<Instruction *> BlockInsts(
- {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
- // TODO(courbet): Can we allow some other things ? This is very conservative.
- // We might be able to get away with anything does not have any side
- // effects outside of the basic block.
- // Note: The GEPs and/or loads are not necessarily in the same block.
- for (const Instruction &Inst : *BB) {
- if (!BlockInsts.count(&Inst))
- return true;
- }
- return false;
-}
-
-// Visit the given comparison. If this is a comparison between two valid
-// BCE atoms, returns the comparison.
-BCECmpBlock visitICmp(const ICmpInst *const CmpI,
- const ICmpInst::Predicate ExpectedPredicate,
- BaseIdentifier &BaseId) {
- // The comparison can only be used once:
- // - For intermediate blocks, as a branch condition.
- // - For the final block, as an incoming value for the Phi.
- // If there are any other uses of the comparison, we cannot merge it with
- // other comparisons as we would create an orphan use of the value.
- if (!CmpI->hasOneUse()) {
- LLVM_DEBUG(dbgs() << "cmp has several uses\n");
- return {};
- }
- if (CmpI->getPredicate() != ExpectedPredicate)
- return {};
- LLVM_DEBUG(dbgs() << "cmp "
- << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
- << "\n");
- auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
- if (!Lhs.BaseId)
- return {};
- auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
- if (!Rhs.BaseId)
- return {};
- const auto &DL = CmpI->getModule()->getDataLayout();
- return BCECmpBlock(std::move(Lhs), std::move(Rhs),
- DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
-}
-
-// Visit the given comparison block. If this is a comparison between two valid
-// BCE atoms, returns the comparison.
-BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
- const BasicBlock *const PhiBlock,
- BaseIdentifier &BaseId) {
- if (Block->empty()) return {};
- auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
- if (!BranchI) return {};
- LLVM_DEBUG(dbgs() << "branch\n");
- if (BranchI->isUnconditional()) {
- // In this case, we expect an incoming value which is the result of the
- // comparison. This is the last link in the chain of comparisons (note
- // that this does not mean that this is the last incoming value, blocks
- // can be reordered).
- auto *const CmpI = dyn_cast<ICmpInst>(Val);
- if (!CmpI) return {};
- LLVM_DEBUG(dbgs() << "icmp\n");
- auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
- Result.CmpI = CmpI;
- Result.BranchI = BranchI;
- return Result;
- } else {
- // In this case, we expect a constant incoming value (the comparison is
- // chained).
+//===- MergeICmps.cpp - Optimize chains of integer comparisons ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns chains of integer comparisons into memcmp (the memcmp is
+// later typically inlined as a chain of efficient hardware comparisons). This
+// typically benefits C++ member or nonmember operator==().
+//
+// The basic idea is to replace a longer chain of integer comparisons loaded
+// from contiguous memory locations with a shorter chain of larger integer
+// comparisons. The benefits are twofold:
+//  - There are fewer jumps, and therefore fewer opportunities for
+//    mispredictions and I-cache misses.
+//  - Code size is smaller, both because jumps are removed and because the
+//    encoding of a 2*n byte compare is smaller than that of two n-byte
+//    compares.
+//
+// Example:
+//
+// struct S {
+// int a;
+// char b;
+// char c;
+// uint16_t d;
+// bool operator==(const S& o) const {
+// return a == o.a && b == o.b && c == o.c && d == o.d;
+// }
+// };
+//
+// Is optimized as:
+//
+// bool S::operator==(const S& o) const {
+// return memcmp(this, &o, 8) == 0;
+// }
+//
+// Which will later be expanded (ExpandMemCmp) as a single 8-byte icmp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/MergeICmps.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include <algorithm>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+namespace {
+
+#define DEBUG_TYPE "mergeicmps"
+
+// Returns true if the instruction is a simple load or a simple store
+static bool isSimpleLoadOrStore(const Instruction *I) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ return false;
+}
+
+// A BCE atom "Binary Compare Expression Atom" represents an integer load
+// that is a constant offset from a base value, e.g. `a` or `o.c` in the example
+// at the top.
+struct BCEAtom {
+ BCEAtom() = default;
+ BCEAtom(GetElementPtrInst *GEP, LoadInst *LoadI, int BaseId, APInt Offset)
+ : GEP(GEP), LoadI(LoadI), BaseId(BaseId), Offset(Offset) {}
+
+ BCEAtom(const BCEAtom &) = delete;
+ BCEAtom &operator=(const BCEAtom &) = delete;
+
+ BCEAtom(BCEAtom &&that) = default;
+ BCEAtom &operator=(BCEAtom &&that) {
+ if (this == &that)
+ return *this;
+ GEP = that.GEP;
+ LoadI = that.LoadI;
+ BaseId = that.BaseId;
+ Offset = std::move(that.Offset);
+ return *this;
+ }
+
+ // We want to order BCEAtoms by (Base, Offset). However we cannot use
+ // the pointer values for Base because these are non-deterministic.
+ // To make sure that the sort order is stable, we first assign to each atom
+ // base value an index based on its order of appearance in the chain of
+ // comparisons. We call this index `BaseOrdering`. For example, for:
+ // b[3] == c[2] && a[1] == d[1] && b[4] == c[3]
+ // | block 1 | | block 2 | | block 3 |
+ // b gets assigned index 0 and a index 1, because b appears as LHS in block 1,
+ // which is before block 2.
+ // We then sort by (BaseOrdering[LHS.Base()], LHS.Offset), which is stable.
+ bool operator<(const BCEAtom &O) const {
+ return BaseId != O.BaseId ? BaseId < O.BaseId : Offset.slt(O.Offset);
+ }
+
+ GetElementPtrInst *GEP = nullptr;
+ LoadInst *LoadI = nullptr;
+ unsigned BaseId = 0;
+ APInt Offset;
+};
+
+// A class that assigns increasing ids to values in the order in which they are
+// seen. See comment in `BCEAtom::operator<()`.
+class BaseIdentifier {
+public:
+ // Returns the id for value `Base`, after assigning one if `Base` has not been
+ // seen before.
+ int getBaseId(const Value *Base) {
+ assert(Base && "invalid base");
+ const auto Insertion = BaseToIndex.try_emplace(Base, Order);
+ if (Insertion.second)
+ ++Order;
+ return Insertion.first->second;
+ }
+
+private:
+ unsigned Order = 1;
+ DenseMap<const Value*, int> BaseToIndex;
+};
+
+// If this value is a load from a constant offset w.r.t. a base address, and
+// there are no other users of the load or address, returns the base address and
+// the offset.
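+//
+// Illustrative sketch (assumed IR shape, not from the original source):
+//   %gep  = getelementptr inbounds %struct.S, %struct.S* %base, i64 0, i32 2
+//   %load = load i8, i8* %gep
+// yields a BCEAtom whose base is %base and whose Offset is the constant byte
+// offset of field 2 within %struct.S.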
+BCEAtom visitICmpLoadOperand(Value *const Val, BaseIdentifier &BaseId) {
+ auto *const LoadI = dyn_cast<LoadInst>(Val);
+ if (!LoadI)
+ return {};
+ LLVM_DEBUG(dbgs() << "load\n");
+ if (LoadI->isUsedOutsideOfBlock(LoadI->getParent())) {
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ // Do not optimize atomic loads to non-atomic memcmp
+ if (!LoadI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "volatile or atomic\n");
+ return {};
+ }
+ Value *const Addr = LoadI->getOperand(0);
+ auto *const GEP = dyn_cast<GetElementPtrInst>(Addr);
+ if (!GEP)
+ return {};
+ LLVM_DEBUG(dbgs() << "GEP\n");
+ if (GEP->isUsedOutsideOfBlock(LoadI->getParent())) {
+ LLVM_DEBUG(dbgs() << "used outside of block\n");
+ return {};
+ }
+ const auto &DL = GEP->getModule()->getDataLayout();
+ if (!isDereferenceablePointer(GEP, LoadI->getType(), DL)) {
+ LLVM_DEBUG(dbgs() << "not dereferenceable\n");
+    // We need to make sure that we can do the comparison in any order, so we
+    // require the memory to be unconditionally dereferenceable.
+ return {};
+ }
+ APInt Offset = APInt(DL.getPointerTypeSizeInBits(GEP->getType()), 0);
+ if (!GEP->accumulateConstantOffset(DL, Offset))
+ return {};
+ return BCEAtom(GEP, LoadI, BaseId.getBaseId(GEP->getPointerOperand()),
+ Offset);
+}
+
+// A basic block with a comparison between two BCE atoms, e.g. `a == o.a` in the
+// example at the top.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
+// Note: the terminology is misleading: the comparison is symmetric, so there
+// is no real {l/r}hs. What we want though is to have the same base on the
+// left (resp. right), so that we can detect consecutive loads. To ensure this
+// we put the smallest atom on the left.
+class BCECmpBlock {
+ public:
+ BCECmpBlock() {}
+
+ BCECmpBlock(BCEAtom L, BCEAtom R, int SizeBits)
+ : Lhs_(std::move(L)), Rhs_(std::move(R)), SizeBits_(SizeBits) {
+ if (Rhs_ < Lhs_) std::swap(Rhs_, Lhs_);
+ }
+
+ bool IsValid() const { return Lhs_.BaseId != 0 && Rhs_.BaseId != 0; }
+
+ // Assert the block is consistent: If valid, it should also have
+ // non-null members besides Lhs_ and Rhs_.
+ void AssertConsistent() const {
+ if (IsValid()) {
+ assert(BB);
+ assert(CmpI);
+ assert(BranchI);
+ }
+ }
+
+ const BCEAtom &Lhs() const { return Lhs_; }
+ const BCEAtom &Rhs() const { return Rhs_; }
+ int SizeBits() const { return SizeBits_; }
+
+  // Returns true if the block does other work besides the comparison.
+ bool doesOtherWork() const;
+
+ // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+ // instructions in the block.
+ bool canSplit(AliasAnalysis &AA) const;
+
+  // Return true if all the relevant instructions in the BCE-cmp-block can
+ // be sunk below this instruction. By doing this, we know we can separate the
+ // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+ // block.
+ bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
+ AliasAnalysis &AA) const;
+
+ // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+ // instructions. Split the old block and move all non-BCE-cmp-insts into the
+ // new parent block.
+ void split(BasicBlock *NewParent, AliasAnalysis &AA) const;
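+  //
+  // Illustrative sketch (not from the original source): if the block also
+  // contains
+  //   store i32 3, i32* @some_value   ; "other work" that aliases neither load
+  // split() moves that store into NewParent, leaving only the GEPs, loads,
+  // compare and branch behind.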
+
+ // The basic block where this comparison happens.
+ BasicBlock *BB = nullptr;
+ // The ICMP for this comparison.
+ ICmpInst *CmpI = nullptr;
+ // The terminating branch.
+ BranchInst *BranchI = nullptr;
+ // The block requires splitting.
+ bool RequireSplit = false;
+
+private:
+ BCEAtom Lhs_;
+ BCEAtom Rhs_;
+ int SizeBits_ = 0;
+};
+
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+ DenseSet<Instruction *> &BlockInsts,
+ AliasAnalysis &AA) const {
+  // If this instruction has side effects and it's in the middle of the BCE
+  // cmp block instructions, then bail for now.
+ if (Inst->mayHaveSideEffects()) {
+ // Bail if this is not a simple load or store
+ if (!isSimpleLoadOrStore(Inst))
+ return false;
+ // Disallow stores that might alias the BCE operands
+ MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
+ MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
+ if (isModSet(AA.getModRefInfo(Inst, LLoc)) ||
+ isModSet(AA.getModRefInfo(Inst, RLoc)))
+ return false;
+ }
+ // Make sure this instruction does not use any of the BCE cmp block
+  // instructions as an operand.
+ for (auto BI : BlockInsts) {
+ if (is_contained(Inst->operands(), BI))
+ return false;
+ }
+ return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis &AA) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ llvm::SmallVector<Instruction *, 4> OtherInsts;
+ for (Instruction &Inst : *BB) {
+ if (BlockInsts.count(&Inst))
+ continue;
+ assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
+ "Split unsplittable block");
+    // This is a non-BCE-cmp-block instruction, and it can be separated
+    // from the BCE-cmp-block instructions.
+ OtherInsts.push_back(&Inst);
+ }
+
+  // Do the actual splitting.
+ for (Instruction *Inst : reverse(OtherInsts)) {
+ Inst->moveBefore(&*NewParent->begin());
+ }
+}
+
+bool BCECmpBlock::canSplit(AliasAnalysis &AA) const {
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+ for (Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst)) {
+ if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
+ return false;
+ }
+ }
+ return true;
+}
+
+bool BCECmpBlock::doesOtherWork() const {
+ AssertConsistent();
+ // All the instructions we care about in the BCE cmp block.
+ DenseSet<Instruction *> BlockInsts(
+ {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+  // TODO(courbet): Can we allow some other things? This is very conservative.
+  // We might be able to get away with anything that does not have any side
+ // effects outside of the basic block.
+ // Note: The GEPs and/or loads are not necessarily in the same block.
+ for (const Instruction &Inst : *BB) {
+ if (!BlockInsts.count(&Inst))
+ return true;
+ }
+ return false;
+}
+
+// Visit the given comparison. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
+BCECmpBlock visitICmp(const ICmpInst *const CmpI,
+ const ICmpInst::Predicate ExpectedPredicate,
+ BaseIdentifier &BaseId) {
+ // The comparison can only be used once:
+ // - For intermediate blocks, as a branch condition.
+ // - For the final block, as an incoming value for the Phi.
+ // If there are any other uses of the comparison, we cannot merge it with
+ // other comparisons as we would create an orphan use of the value.
+ if (!CmpI->hasOneUse()) {
+ LLVM_DEBUG(dbgs() << "cmp has several uses\n");
+ return {};
+ }
+ if (CmpI->getPredicate() != ExpectedPredicate)
+ return {};
+ LLVM_DEBUG(dbgs() << "cmp "
+ << (ExpectedPredicate == ICmpInst::ICMP_EQ ? "eq" : "ne")
+ << "\n");
+ auto Lhs = visitICmpLoadOperand(CmpI->getOperand(0), BaseId);
+ if (!Lhs.BaseId)
+ return {};
+ auto Rhs = visitICmpLoadOperand(CmpI->getOperand(1), BaseId);
+ if (!Rhs.BaseId)
+ return {};
+ const auto &DL = CmpI->getModule()->getDataLayout();
+ return BCECmpBlock(std::move(Lhs), std::move(Rhs),
+ DL.getTypeSizeInBits(CmpI->getOperand(0)->getType()));
+}
+
+// Visit the given comparison block. If this is a comparison between two valid
+// BCE atoms, returns the comparison.
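+//
+// Illustrative sketch (assumed IR shape, not from the original source):
+//   ; intermediate block: the compare feeds a conditional branch, and the phi
+//   ; receives the constant 'false' from this block
+//   %cmp = icmp eq i32 %lhs, %rhs
+//   br i1 %cmp, label %next_cmp_block, label %phi_block
+//   ; last block: the compare itself is the phi's incoming value
+//   %cmp.last = icmp eq i32 %lhs.last, %rhs.last
+//   br label %phi_block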
+BCECmpBlock visitCmpBlock(Value *const Val, BasicBlock *const Block,
+ const BasicBlock *const PhiBlock,
+ BaseIdentifier &BaseId) {
+ if (Block->empty()) return {};
+ auto *const BranchI = dyn_cast<BranchInst>(Block->getTerminator());
+ if (!BranchI) return {};
+ LLVM_DEBUG(dbgs() << "branch\n");
+ if (BranchI->isUnconditional()) {
+ // In this case, we expect an incoming value which is the result of the
+ // comparison. This is the last link in the chain of comparisons (note
+ // that this does not mean that this is the last incoming value, blocks
+ // can be reordered).
+ auto *const CmpI = dyn_cast<ICmpInst>(Val);
+ if (!CmpI) return {};
+ LLVM_DEBUG(dbgs() << "icmp\n");
+ auto Result = visitICmp(CmpI, ICmpInst::ICMP_EQ, BaseId);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ } else {
+ // In this case, we expect a constant incoming value (the comparison is
+ // chained).
const auto *const Const = cast<ConstantInt>(Val);
- LLVM_DEBUG(dbgs() << "const\n");
- if (!Const->isZero()) return {};
- LLVM_DEBUG(dbgs() << "false\n");
- auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
- if (!CmpI) return {};
- LLVM_DEBUG(dbgs() << "icmp\n");
- assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
- BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
- auto Result = visitICmp(
- CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- BaseId);
- Result.CmpI = CmpI;
- Result.BranchI = BranchI;
- return Result;
- }
- return {};
-}
-
-static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
- BCECmpBlock &&Comparison) {
- LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
- << "': Found cmp of " << Comparison.SizeBits()
- << " bits between " << Comparison.Lhs().BaseId << " + "
- << Comparison.Lhs().Offset << " and "
- << Comparison.Rhs().BaseId << " + "
- << Comparison.Rhs().Offset << "\n");
- LLVM_DEBUG(dbgs() << "\n");
- Comparisons.push_back(std::move(Comparison));
-}
-
-// A chain of comparisons.
-class BCECmpChain {
- public:
- BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
- AliasAnalysis &AA);
-
- int size() const { return Comparisons_.size(); }
-
-#ifdef MERGEICMPS_DOT_ON
- void dump() const;
-#endif // MERGEICMPS_DOT_ON
-
- bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU);
-
-private:
- static bool IsContiguous(const BCECmpBlock &First,
- const BCECmpBlock &Second) {
- return First.Lhs().BaseId == Second.Lhs().BaseId &&
- First.Rhs().BaseId == Second.Rhs().BaseId &&
- First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
- First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
- }
-
- PHINode &Phi_;
- std::vector<BCECmpBlock> Comparisons_;
- // The original entry block (before sorting);
- BasicBlock *EntryBlock_;
-};
-
-BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
- AliasAnalysis &AA)
- : Phi_(Phi) {
- assert(!Blocks.empty() && "a chain should have at least one block");
- // Now look inside blocks to check for BCE comparisons.
- std::vector<BCECmpBlock> Comparisons;
- BaseIdentifier BaseId;
- for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
- BasicBlock *const Block = Blocks[BlockIdx];
- assert(Block && "invalid block");
- BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
- Block, Phi.getParent(), BaseId);
- Comparison.BB = Block;
- if (!Comparison.IsValid()) {
- LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
- return;
- }
- if (Comparison.doesOtherWork()) {
- LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
- << "' does extra work besides compare\n");
- if (Comparisons.empty()) {
- // This is the initial block in the chain, in case this block does other
- // work, we can try to split the block and move the irrelevant
- // instructions to the predecessor.
- //
- // If this is not the initial block in the chain, splitting it wont
- // work.
- //
- // As once split, there will still be instructions before the BCE cmp
- // instructions that do other work in program order, i.e. within the
- // chain before sorting. Unless we can abort the chain at this point
- // and start anew.
- //
- // NOTE: we only handle blocks a with single predecessor for now.
- if (Comparison.canSplit(AA)) {
- LLVM_DEBUG(dbgs()
- << "Split initial block '" << Comparison.BB->getName()
- << "' that does extra work besides compare\n");
- Comparison.RequireSplit = true;
- enqueueBlock(Comparisons, std::move(Comparison));
- } else {
- LLVM_DEBUG(dbgs()
- << "ignoring initial block '" << Comparison.BB->getName()
- << "' that does extra work besides compare\n");
- }
- continue;
- }
- // TODO(courbet): Right now we abort the whole chain. We could be
- // merging only the blocks that don't do other work and resume the
- // chain from there. For example:
- // if (a[0] == b[0]) { // bb1
- // if (a[1] == b[1]) { // bb2
- // some_value = 3; //bb3
- // if (a[2] == b[2]) { //bb3
- // do a ton of stuff //bb4
- // }
- // }
- // }
- //
- // This is:
- //
- // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
- // \ \ \ \
- // ne ne ne \
- // \ \ \ v
- // +------------+-----------+----------> bb_phi
- //
- // We can only merge the first two comparisons, because bb3* does
- // "other work" (setting some_value to 3).
- // We could still merge bb1 and bb2 though.
- return;
- }
- enqueueBlock(Comparisons, std::move(Comparison));
- }
-
- // It is possible we have no suitable comparison to merge.
- if (Comparisons.empty()) {
- LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
- return;
- }
- EntryBlock_ = Comparisons[0].BB;
- Comparisons_ = std::move(Comparisons);
-#ifdef MERGEICMPS_DOT_ON
- errs() << "BEFORE REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
- // Reorder blocks by LHS. We can do that without changing the
- // semantics because we are only accessing dereferencable memory.
- llvm::sort(Comparisons_,
- [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
- return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
- std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
- });
-#ifdef MERGEICMPS_DOT_ON
- errs() << "AFTER REORDERING:\n\n";
- dump();
-#endif // MERGEICMPS_DOT_ON
-}
-
-#ifdef MERGEICMPS_DOT_ON
-void BCECmpChain::dump() const {
- errs() << "digraph dag {\n";
- errs() << " graph [bgcolor=transparent];\n";
- errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
- errs() << " edge [color=black];\n";
- for (size_t I = 0; I < Comparisons_.size(); ++I) {
- const auto &Comparison = Comparisons_[I];
- errs() << " \"" << I << "\" [label=\"%"
- << Comparison.Lhs().Base()->getName() << " + "
- << Comparison.Lhs().Offset << " == %"
- << Comparison.Rhs().Base()->getName() << " + "
- << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
- << " bytes)\"];\n";
- const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
- if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
- errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
- }
- errs() << " \"Phi\" [label=\"Phi\"];\n";
- errs() << "}\n\n";
-}
-#endif // MERGEICMPS_DOT_ON
-
-namespace {
-
-// A class to compute the name of a set of merged basic blocks.
-// This is optimized for the common case of no block names.
-class MergedBlockName {
- // Storage for the uncommon case of several named blocks.
- SmallString<16> Scratch;
-
-public:
- explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
- : Name(makeName(Comparisons)) {}
- const StringRef Name;
-
-private:
- StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
- assert(!Comparisons.empty() && "no basic block");
- // Fast path: only one block, or no names at all.
- if (Comparisons.size() == 1)
- return Comparisons[0].BB->getName();
- const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
- [](int i, const BCECmpBlock &Cmp) {
- return i + Cmp.BB->getName().size();
- });
- if (size == 0)
- return StringRef("", 0);
-
- // Slow path: at least two blocks, at least one block with a name.
- Scratch.clear();
- // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
- // separators.
- Scratch.reserve(size + Comparisons.size() - 1);
- const auto append = [this](StringRef str) {
- Scratch.append(str.begin(), str.end());
- };
- append(Comparisons[0].BB->getName());
- for (int I = 1, E = Comparisons.size(); I < E; ++I) {
- const BasicBlock *const BB = Comparisons[I].BB;
- if (!BB->getName().empty()) {
- append("+");
- append(BB->getName());
- }
- }
- return StringRef(Scratch);
- }
-};
-} // namespace
-
-// Merges the given contiguous comparison blocks into one memcmp block.
-static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
- BasicBlock *const InsertBefore,
- BasicBlock *const NextCmpBlock,
- PHINode &Phi, const TargetLibraryInfo &TLI,
- AliasAnalysis &AA, DomTreeUpdater &DTU) {
- assert(!Comparisons.empty() && "merging zero comparisons");
- LLVMContext &Context = NextCmpBlock->getContext();
- const BCECmpBlock &FirstCmp = Comparisons[0];
-
- // Create a new cmp block before next cmp block.
- BasicBlock *const BB =
- BasicBlock::Create(Context, MergedBlockName(Comparisons).Name,
- NextCmpBlock->getParent(), InsertBefore);
- IRBuilder<> Builder(BB);
- // Add the GEPs from the first BCECmpBlock.
- Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone());
- Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone());
-
- Value *IsEqual = nullptr;
- LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
- << BB->getName() << "\n");
+ LLVM_DEBUG(dbgs() << "const\n");
+ if (!Const->isZero()) return {};
+ LLVM_DEBUG(dbgs() << "false\n");
+ auto *const CmpI = dyn_cast<ICmpInst>(BranchI->getCondition());
+ if (!CmpI) return {};
+ LLVM_DEBUG(dbgs() << "icmp\n");
+ assert(BranchI->getNumSuccessors() == 2 && "expecting a cond branch");
+ BasicBlock *const FalseBlock = BranchI->getSuccessor(1);
+ auto Result = visitICmp(
+ CmpI, FalseBlock == PhiBlock ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+ BaseId);
+ Result.CmpI = CmpI;
+ Result.BranchI = BranchI;
+ return Result;
+ }
+ return {};
+}
+
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+ BCECmpBlock &&Comparison) {
+ LLVM_DEBUG(dbgs() << "Block '" << Comparison.BB->getName()
+ << "': Found cmp of " << Comparison.SizeBits()
+ << " bits between " << Comparison.Lhs().BaseId << " + "
+ << Comparison.Lhs().Offset << " and "
+ << Comparison.Rhs().BaseId << " + "
+ << Comparison.Rhs().Offset << "\n");
+ LLVM_DEBUG(dbgs() << "\n");
+ Comparisons.push_back(std::move(Comparison));
+}
+
+// A chain of comparisons.
+class BCECmpChain {
+ public:
+ BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+ AliasAnalysis &AA);
+
+ int size() const { return Comparisons_.size(); }
+
+#ifdef MERGEICMPS_DOT_ON
+ void dump() const;
+#endif // MERGEICMPS_DOT_ON
+
+ bool simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU);
+
+private:
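+  // Two comparisons are contiguous when the second one starts exactly where
+  // the first one ends, on both sides. Illustrative sketch (not from the
+  // original source): a 32-bit compare of (a+0, b+0) followed by a 16-bit
+  // compare of (a+4, b+4) is contiguous, because 0 + 32/8 == 4 on both sides;
+  // a follow-up at offset 6 would not be.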
+ static bool IsContiguous(const BCECmpBlock &First,
+ const BCECmpBlock &Second) {
+ return First.Lhs().BaseId == Second.Lhs().BaseId &&
+ First.Rhs().BaseId == Second.Rhs().BaseId &&
+ First.Lhs().Offset + First.SizeBits() / 8 == Second.Lhs().Offset &&
+ First.Rhs().Offset + First.SizeBits() / 8 == Second.Rhs().Offset;
+ }
+
+ PHINode &Phi_;
+ std::vector<BCECmpBlock> Comparisons_;
+  // The original entry block (before sorting).
+ BasicBlock *EntryBlock_;
+};
+
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+ AliasAnalysis &AA)
+ : Phi_(Phi) {
+ assert(!Blocks.empty() && "a chain should have at least one block");
+ // Now look inside blocks to check for BCE comparisons.
+ std::vector<BCECmpBlock> Comparisons;
+ BaseIdentifier BaseId;
+ for (size_t BlockIdx = 0; BlockIdx < Blocks.size(); ++BlockIdx) {
+ BasicBlock *const Block = Blocks[BlockIdx];
+ assert(Block && "invalid block");
+ BCECmpBlock Comparison = visitCmpBlock(Phi.getIncomingValueForBlock(Block),
+ Block, Phi.getParent(), BaseId);
+ Comparison.BB = Block;
+ if (!Comparison.IsValid()) {
+ LLVM_DEBUG(dbgs() << "chain with invalid BCECmpBlock, no merge.\n");
+ return;
+ }
+ if (Comparison.doesOtherWork()) {
+ LLVM_DEBUG(dbgs() << "block '" << Comparison.BB->getName()
+ << "' does extra work besides compare\n");
+ if (Comparisons.empty()) {
+        // This is the initial block in the chain; in case this block does
+        // other work, we can try to split the block and move the irrelevant
+        // instructions to the predecessor.
+        //
+        // If this is not the initial block in the chain, splitting it won't
+        // work: once split, there would still be instructions that do other
+        // work before the BCE cmp instructions in program order, i.e. within
+        // the chain before sorting, unless we could abort the chain at this
+        // point and start anew.
+        //
+        // NOTE: we only handle blocks with a single predecessor for now.
+ if (Comparison.canSplit(AA)) {
+ LLVM_DEBUG(dbgs()
+ << "Split initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ Comparison.RequireSplit = true;
+ enqueueBlock(Comparisons, std::move(Comparison));
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "ignoring initial block '" << Comparison.BB->getName()
+ << "' that does extra work besides compare\n");
+ }
+ continue;
+ }
+ // TODO(courbet): Right now we abort the whole chain. We could be
+ // merging only the blocks that don't do other work and resume the
+ // chain from there. For example:
+ // if (a[0] == b[0]) { // bb1
+ // if (a[1] == b[1]) { // bb2
+ // some_value = 3; //bb3
+ // if (a[2] == b[2]) { //bb3
+ // do a ton of stuff //bb4
+ // }
+ // }
+ // }
+ //
+ // This is:
+ //
+ // bb1 --eq--> bb2 --eq--> bb3* -eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // We can only merge the first two comparisons, because bb3* does
+ // "other work" (setting some_value to 3).
+ // We could still merge bb1 and bb2 though.
+ return;
+ }
+ enqueueBlock(Comparisons, std::move(Comparison));
+ }
+
+ // It is possible we have no suitable comparison to merge.
+ if (Comparisons.empty()) {
+ LLVM_DEBUG(dbgs() << "chain with no BCE basic blocks, no merge\n");
+ return;
+ }
+ EntryBlock_ = Comparisons[0].BB;
+ Comparisons_ = std::move(Comparisons);
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "BEFORE REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+ // Reorder blocks by LHS. We can do that without changing the
+  // semantics because we are only accessing dereferenceable memory.
+ llvm::sort(Comparisons_,
+ [](const BCECmpBlock &LhsBlock, const BCECmpBlock &RhsBlock) {
+ return std::tie(LhsBlock.Lhs(), LhsBlock.Rhs()) <
+ std::tie(RhsBlock.Lhs(), RhsBlock.Rhs());
+ });
+#ifdef MERGEICMPS_DOT_ON
+ errs() << "AFTER REORDERING:\n\n";
+ dump();
+#endif // MERGEICMPS_DOT_ON
+}
+
+#ifdef MERGEICMPS_DOT_ON
+void BCECmpChain::dump() const {
+ errs() << "digraph dag {\n";
+ errs() << " graph [bgcolor=transparent];\n";
+ errs() << " node [color=black,style=filled,fillcolor=lightyellow];\n";
+ errs() << " edge [color=black];\n";
+ for (size_t I = 0; I < Comparisons_.size(); ++I) {
+ const auto &Comparison = Comparisons_[I];
+ errs() << " \"" << I << "\" [label=\"%"
+ << Comparison.Lhs().Base()->getName() << " + "
+ << Comparison.Lhs().Offset << " == %"
+ << Comparison.Rhs().Base()->getName() << " + "
+ << Comparison.Rhs().Offset << " (" << (Comparison.SizeBits() / 8)
+ << " bytes)\"];\n";
+ const Value *const Val = Phi_.getIncomingValueForBlock(Comparison.BB);
+ if (I > 0) errs() << " \"" << (I - 1) << "\" -> \"" << I << "\";\n";
+ errs() << " \"" << I << "\" -> \"Phi\" [label=\"" << *Val << "\"];\n";
+ }
+ errs() << " \"Phi\" [label=\"Phi\"];\n";
+ errs() << "}\n\n";
+}
+#endif // MERGEICMPS_DOT_ON
+
+namespace {
+
+// A class to compute the name of a set of merged basic blocks.
+// This is optimized for the common case of no block names.
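+// Illustrative sketch (not from the original source): merging blocks named
+// "land.lhs.true", "" and "land.rhs" yields "land.lhs.true+land.rhs"; unnamed
+// blocks contribute nothing to the merged name.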
+class MergedBlockName {
+ // Storage for the uncommon case of several named blocks.
+ SmallString<16> Scratch;
+
+public:
+ explicit MergedBlockName(ArrayRef<BCECmpBlock> Comparisons)
+ : Name(makeName(Comparisons)) {}
+ const StringRef Name;
+
+private:
+ StringRef makeName(ArrayRef<BCECmpBlock> Comparisons) {
+ assert(!Comparisons.empty() && "no basic block");
+ // Fast path: only one block, or no names at all.
+ if (Comparisons.size() == 1)
+ return Comparisons[0].BB->getName();
+ const int size = std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
+ [](int i, const BCECmpBlock &Cmp) {
+ return i + Cmp.BB->getName().size();
+ });
+ if (size == 0)
+ return StringRef("", 0);
+
+ // Slow path: at least two blocks, at least one block with a name.
+ Scratch.clear();
+ // We'll have `size` bytes for name and `Comparisons.size() - 1` bytes for
+ // separators.
+ Scratch.reserve(size + Comparisons.size() - 1);
+ const auto append = [this](StringRef str) {
+ Scratch.append(str.begin(), str.end());
+ };
+ append(Comparisons[0].BB->getName());
+ for (int I = 1, E = Comparisons.size(); I < E; ++I) {
+ const BasicBlock *const BB = Comparisons[I].BB;
+ if (!BB->getName().empty()) {
+ append("+");
+ append(BB->getName());
+ }
+ }
+ return StringRef(Scratch);
+ }
+};
+} // namespace
+
+// Merges the given contiguous comparison blocks into one memcmp block.
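+// Illustrative sketch (not from the original source): two contiguous 32-bit
+// compares of (a+0, b+0) and (a+4, b+4) become one block that computes
+//   memcmp(a, b, 8) == 0
+// which ExpandMemCmp can later lower to a single wide integer compare.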
+static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+ BasicBlock *const InsertBefore,
+ BasicBlock *const NextCmpBlock,
+ PHINode &Phi, const TargetLibraryInfo &TLI,
+ AliasAnalysis &AA, DomTreeUpdater &DTU) {
+ assert(!Comparisons.empty() && "merging zero comparisons");
+ LLVMContext &Context = NextCmpBlock->getContext();
+ const BCECmpBlock &FirstCmp = Comparisons[0];
+
+ // Create a new cmp block before next cmp block.
+ BasicBlock *const BB =
+ BasicBlock::Create(Context, MergedBlockName(Comparisons).Name,
+ NextCmpBlock->getParent(), InsertBefore);
+ IRBuilder<> Builder(BB);
+ // Add the GEPs from the first BCECmpBlock.
+ Value *const Lhs = Builder.Insert(FirstCmp.Lhs().GEP->clone());
+ Value *const Rhs = Builder.Insert(FirstCmp.Rhs().GEP->clone());
+
+ Value *IsEqual = nullptr;
+ LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons -> "
+ << BB->getName() << "\n");
// If there is one block that requires splitting, we do it now, i.e.
// just before we know we will collapse the chain. The instructions
@@ -635,312 +635,312 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
ToSplit->split(BB, AA);
}
- if (Comparisons.size() == 1) {
- LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
- Value *const LhsLoad =
- Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs);
- Value *const RhsLoad =
- Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs);
- // There are no blocks to merge, just do the comparison.
- IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
- } else {
- const unsigned TotalSizeBits = std::accumulate(
- Comparisons.begin(), Comparisons.end(), 0u,
- [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
-
- // Create memcmp() == 0.
- const auto &DL = Phi.getModule()->getDataLayout();
- Value *const MemCmpCall = emitMemCmp(
- Lhs, Rhs,
- ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder,
- DL, &TLI);
- IsEqual = Builder.CreateICmpEQ(
- MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
- }
-
- BasicBlock *const PhiBB = Phi.getParent();
- // Add a branch to the next basic block in the chain.
- if (NextCmpBlock == PhiBB) {
- // Continue to phi, passing it the comparison result.
- Builder.CreateBr(PhiBB);
- Phi.addIncoming(IsEqual, BB);
- DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
- } else {
- // Continue to next block if equal, exit to phi else.
- Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
- Phi.addIncoming(ConstantInt::getFalse(Context), BB);
- DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
- {DominatorTree::Insert, BB, PhiBB}});
- }
- return BB;
-}
-
-bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
- assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
- // First pass to check if there is at least one merge. If not, we don't do
- // anything and we keep analysis passes intact.
- const auto AtLeastOneMerged = [this]() {
- for (size_t I = 1; I < Comparisons_.size(); ++I) {
- if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
- return true;
- }
- return false;
- };
- if (!AtLeastOneMerged())
- return false;
-
- LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
- << EntryBlock_->getName() << "\n");
-
- // Effectively merge blocks. We go in the reverse direction from the phi block
- // so that the next block is always available to branch to.
- const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
- BasicBlock *InsertBefore,
- BasicBlock *Next) {
- return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
- InsertBefore, Next, Phi_, TLI, AA, DTU);
- };
- int NumMerged = 1;
- BasicBlock *NextCmpBlock = Phi_.getParent();
- for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) {
- if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
- LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
- << " into " << Comparisons_[I + 1].BB->getName()
- << "\n");
- ++NumMerged;
- } else {
- NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
- NumMerged = 1;
- }
- }
- // Insert the entry block for the new chain before the old entry block.
- // If the old entry block was the function entry, this ensures that the new
- // entry can become the function entry.
- NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
-
- // Replace the original cmp chain with the new cmp chain by pointing all
- // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
- // blocks in the old chain unreachable.
- while (!pred_empty(EntryBlock_)) {
- BasicBlock* const Pred = *pred_begin(EntryBlock_);
- LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
- << "\n");
- Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock);
- DTU.applyUpdates({{DominatorTree::Delete, Pred, EntryBlock_},
- {DominatorTree::Insert, Pred, NextCmpBlock}});
- }
-
- // If the old cmp chain was the function entry, we need to update the function
- // entry.
- const bool ChainEntryIsFnEntry =
- (EntryBlock_ == &EntryBlock_->getParent()->getEntryBlock());
- if (ChainEntryIsFnEntry && DTU.hasDomTree()) {
- LLVM_DEBUG(dbgs() << "Changing function entry from "
- << EntryBlock_->getName() << " to "
- << NextCmpBlock->getName() << "\n");
- DTU.getDomTree().setNewRoot(NextCmpBlock);
- DTU.applyUpdates({{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
- }
- EntryBlock_ = nullptr;
-
- // Delete merged blocks. This also removes incoming values in phi.
- SmallVector<BasicBlock *, 16> DeadBlocks;
- for (auto &Cmp : Comparisons_) {
- LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
- DeadBlocks.push_back(Cmp.BB);
- }
- DeleteDeadBlocks(DeadBlocks, &DTU);
-
- Comparisons_.clear();
- return true;
-}
-
-std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
- BasicBlock *const LastBlock,
- int NumBlocks) {
- // Walk up from the last block to find other blocks.
- std::vector<BasicBlock *> Blocks(NumBlocks);
- assert(LastBlock && "invalid last block");
- BasicBlock *CurBlock = LastBlock;
- for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
- if (CurBlock->hasAddressTaken()) {
- // Somebody is jumping to the block through an address, all bets are
- // off.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has its address taken\n");
- return {};
- }
- Blocks[BlockIndex] = CurBlock;
- auto *SinglePredecessor = CurBlock->getSinglePredecessor();
- if (!SinglePredecessor) {
- // The block has two or more predecessors.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " has two or more predecessors\n");
- return {};
- }
- if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
- // The block does not link back to the phi.
- LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
- << " does not link back to the phi\n");
- return {};
- }
- CurBlock = SinglePredecessor;
- }
- Blocks[0] = CurBlock;
- return Blocks;
-}
-
-bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
- DomTreeUpdater &DTU) {
- LLVM_DEBUG(dbgs() << "processPhi()\n");
- if (Phi.getNumIncomingValues() <= 1) {
- LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
- return false;
- }
- // We are looking for something that has the following structure:
- // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
- // \ \ \ \
- // ne ne ne \
- // \ \ \ v
- // +------------+-----------+----------> bb_phi
- //
- // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
- // It's the only block that contributes a non-constant value to the Phi.
- // - All other blocks (b1, b2, b3) must have exactly two successors, one of
- // them being the phi block.
- // - All intermediate blocks (bb2, bb3) must have only one predecessor.
- // - Blocks cannot do other work besides the comparison, see doesOtherWork()
-
- // The blocks are not necessarily ordered in the phi, so we start from the
- // last block and reconstruct the order.
- BasicBlock *LastBlock = nullptr;
- for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
- if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
- if (LastBlock) {
- // There are several non-constant values.
- LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
- return false;
- }
- if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
- cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
- Phi.getIncomingBlock(I)) {
- // Non-constant incoming value is not from a cmp instruction or not
- // produced by the last block. We could end up processing the value
- // producing block more than once.
- //
- // This is an uncommon case, so we bail.
- LLVM_DEBUG(
- dbgs()
- << "skip: non-constant value not from cmp or not from last block.\n");
- return false;
- }
- LastBlock = Phi.getIncomingBlock(I);
- }
- if (!LastBlock) {
- // There is no non-constant block.
- LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
- return false;
- }
- if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
- LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
- return false;
- }
-
- const auto Blocks =
- getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
- if (Blocks.empty()) return false;
- BCECmpChain CmpChain(Blocks, Phi, AA);
-
- if (CmpChain.size() < 2) {
- LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
- return false;
- }
-
- return CmpChain.simplify(TLI, AA, DTU);
-}
-
-static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
- const TargetTransformInfo &TTI, AliasAnalysis &AA,
- DominatorTree *DT) {
- LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
-
- // We only try merging comparisons if the target wants to expand memcmp later.
- // The rationale is to avoid turning small chains into memcmp calls.
- if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
- return false;
-
-  // If we don't have memcmp available, we can't emit calls to it.
- if (!TLI.has(LibFunc_memcmp))
- return false;
-
- DomTreeUpdater DTU(DT, /*PostDominatorTree*/ nullptr,
- DomTreeUpdater::UpdateStrategy::Eager);
-
- bool MadeChange = false;
-
- for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
- // A Phi operation is always first in a basic block.
- if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
- MadeChange |= processPhi(*Phi, TLI, AA, DTU);
- }
-
- return MadeChange;
-}
-
-class MergeICmpsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- MergeICmpsLegacyPass() : FunctionPass(ID) {
- initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F)) return false;
- const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- // MergeICmps does not need the DominatorTree, but we update it if it's
- // already available.
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- return runImpl(F, TLI, TTI, AA, DTWP ? &DTWP->getDomTree() : nullptr);
- }
-
- private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-};
-
-} // namespace
-
-char MergeICmpsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
- "Merge contiguous icmps into a memcmp", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
- "Merge contiguous icmps into a memcmp", false, false)
-
-Pass *llvm::createMergeICmpsLegacyPass() { return new MergeICmpsLegacyPass(); }
-
-PreservedAnalyses MergeICmpsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
- if (!MadeChanges)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
+ if (Comparisons.size() == 1) {
+ LLVM_DEBUG(dbgs() << "Only one comparison, updating branches\n");
+ Value *const LhsLoad =
+ Builder.CreateLoad(FirstCmp.Lhs().LoadI->getType(), Lhs);
+ Value *const RhsLoad =
+ Builder.CreateLoad(FirstCmp.Rhs().LoadI->getType(), Rhs);
+ // There are no blocks to merge, just do the comparison.
+ IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
+ } else {
+ const unsigned TotalSizeBits = std::accumulate(
+ Comparisons.begin(), Comparisons.end(), 0u,
+ [](int Size, const BCECmpBlock &C) { return Size + C.SizeBits(); });
+
+ // Create memcmp() == 0.
+ const auto &DL = Phi.getModule()->getDataLayout();
+ Value *const MemCmpCall = emitMemCmp(
+ Lhs, Rhs,
+ ConstantInt::get(DL.getIntPtrType(Context), TotalSizeBits / 8), Builder,
+ DL, &TLI);
+ IsEqual = Builder.CreateICmpEQ(
+ MemCmpCall, ConstantInt::get(Type::getInt32Ty(Context), 0));
+ }
+
+ BasicBlock *const PhiBB = Phi.getParent();
+ // Add a branch to the next basic block in the chain.
+ if (NextCmpBlock == PhiBB) {
+ // Continue to phi, passing it the comparison result.
+ Builder.CreateBr(PhiBB);
+ Phi.addIncoming(IsEqual, BB);
+ DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
+ } else {
+ // Continue to next block if equal, exit to phi else.
+ Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
+ Phi.addIncoming(ConstantInt::getFalse(Context), BB);
+ DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
+ {DominatorTree::Insert, BB, PhiBB}});
+ }
+ return BB;
+}
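For reference, the merged form produced here is equivalent to the following plain C++ sketch (not taken from this file; the function name is made up). The chain's total width in bits is divided by 8 and passed to memcmp, and the call's result is compared against zero, mirroring the emitMemCmp plus CreateICmpEQ sequence above.

#include <cstring>

// Equivalent of the merged comparison: one memcmp over the contiguous
// region covered by the chain, then an equality test against 0.
bool mergedEqual(const void *Lhs, const void *Rhs, unsigned TotalSizeBits) {
  return std::memcmp(Lhs, Rhs, TotalSizeBits / 8) == 0;
}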
+
+bool BCECmpChain::simplify(const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU) {
+ assert(Comparisons_.size() >= 2 && "simplifying trivial BCECmpChain");
+ // First pass to check if there is at least one merge. If not, we don't do
+ // anything and we keep analysis passes intact.
+ const auto AtLeastOneMerged = [this]() {
+ for (size_t I = 1; I < Comparisons_.size(); ++I) {
+ if (IsContiguous(Comparisons_[I - 1], Comparisons_[I]))
+ return true;
+ }
+ return false;
+ };
+ if (!AtLeastOneMerged())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Simplifying comparison chain starting at block "
+ << EntryBlock_->getName() << "\n");
+
+ // Effectively merge blocks. We go in the reverse direction from the phi block
+ // so that the next block is always available to branch to.
+ const auto mergeRange = [this, &TLI, &AA, &DTU](int I, int Num,
+ BasicBlock *InsertBefore,
+ BasicBlock *Next) {
+ return mergeComparisons(makeArrayRef(Comparisons_).slice(I, Num),
+ InsertBefore, Next, Phi_, TLI, AA, DTU);
+ };
+ int NumMerged = 1;
+ BasicBlock *NextCmpBlock = Phi_.getParent();
+ for (int I = static_cast<int>(Comparisons_.size()) - 2; I >= 0; --I) {
+ if (IsContiguous(Comparisons_[I], Comparisons_[I + 1])) {
+ LLVM_DEBUG(dbgs() << "Merging block " << Comparisons_[I].BB->getName()
+ << " into " << Comparisons_[I + 1].BB->getName()
+ << "\n");
+ ++NumMerged;
+ } else {
+ NextCmpBlock = mergeRange(I + 1, NumMerged, NextCmpBlock, NextCmpBlock);
+ NumMerged = 1;
+ }
+ }
+ // Insert the entry block for the new chain before the old entry block.
+ // If the old entry block was the function entry, this ensures that the new
+ // entry can become the function entry.
+ NextCmpBlock = mergeRange(0, NumMerged, EntryBlock_, NextCmpBlock);
+
+ // Replace the original cmp chain with the new cmp chain by pointing all
+ // predecessors of EntryBlock_ to NextCmpBlock instead. This makes all cmp
+ // blocks in the old chain unreachable.
+ while (!pred_empty(EntryBlock_)) {
+ BasicBlock* const Pred = *pred_begin(EntryBlock_);
+ LLVM_DEBUG(dbgs() << "Updating jump into old chain from " << Pred->getName()
+ << "\n");
+ Pred->getTerminator()->replaceUsesOfWith(EntryBlock_, NextCmpBlock);
+ DTU.applyUpdates({{DominatorTree::Delete, Pred, EntryBlock_},
+ {DominatorTree::Insert, Pred, NextCmpBlock}});
+ }
+
+ // If the old cmp chain was the function entry, we need to update the function
+ // entry.
+ const bool ChainEntryIsFnEntry =
+ (EntryBlock_ == &EntryBlock_->getParent()->getEntryBlock());
+ if (ChainEntryIsFnEntry && DTU.hasDomTree()) {
+ LLVM_DEBUG(dbgs() << "Changing function entry from "
+ << EntryBlock_->getName() << " to "
+ << NextCmpBlock->getName() << "\n");
+ DTU.getDomTree().setNewRoot(NextCmpBlock);
+ DTU.applyUpdates({{DominatorTree::Delete, NextCmpBlock, EntryBlock_}});
+ }
+ EntryBlock_ = nullptr;
+
+ // Delete merged blocks. This also removes incoming values in phi.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (auto &Cmp : Comparisons_) {
+ LLVM_DEBUG(dbgs() << "Deleting merged block " << Cmp.BB->getName() << "\n");
+ DeadBlocks.push_back(Cmp.BB);
+ }
+ DeleteDeadBlocks(DeadBlocks, &DTU);
+
+ Comparisons_.clear();
+ return true;
+}
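The scheduling in the loop above can be read on its own; the standalone sketch below (illustrative only, with invented names) mirrors it: walking the comparisons backwards, contiguous neighbours accumulate into the current range, a range is emitted whenever contiguity breaks, and a final range covering the chain's head is emitted at the end, in the same back-to-front order in which mergeRange is invoked.

#include <vector>

struct MergeRange {
  int First; // index of the first comparison in the range
  int Num;   // number of comparisons folded into it
};

// ContiguousWithNext[I] is true when comparison I is contiguous with I + 1.
std::vector<MergeRange>
planMerges(const std::vector<bool> &ContiguousWithNext) {
  const int NumComparisons = static_cast<int>(ContiguousWithNext.size()) + 1;
  std::vector<MergeRange> Ranges;
  int NumMerged = 1;
  for (int I = NumComparisons - 2; I >= 0; --I) {
    if (ContiguousWithNext[I]) {
      ++NumMerged; // keep growing the current range
    } else {
      Ranges.push_back({I + 1, NumMerged}); // contiguity broke: emit the range
      NumMerged = 1;
    }
  }
  Ranges.push_back({0, NumMerged}); // the range that contains the entry block
  return Ranges;
}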
+
+std::vector<BasicBlock *> getOrderedBlocks(PHINode &Phi,
+ BasicBlock *const LastBlock,
+ int NumBlocks) {
+ // Walk up from the last block to find other blocks.
+ std::vector<BasicBlock *> Blocks(NumBlocks);
+ assert(LastBlock && "invalid last block");
+ BasicBlock *CurBlock = LastBlock;
+ for (int BlockIndex = NumBlocks - 1; BlockIndex > 0; --BlockIndex) {
+ if (CurBlock->hasAddressTaken()) {
+ // Somebody is jumping to the block through an address, all bets are
+ // off.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has its address taken\n");
+ return {};
+ }
+ Blocks[BlockIndex] = CurBlock;
+ auto *SinglePredecessor = CurBlock->getSinglePredecessor();
+ if (!SinglePredecessor) {
+ // The block has two or more predecessors.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " has two or more predecessors\n");
+ return {};
+ }
+ if (Phi.getBasicBlockIndex(SinglePredecessor) < 0) {
+ // The block does not link back to the phi.
+ LLVM_DEBUG(dbgs() << "skip: block " << BlockIndex
+ << " does not link back to the phi\n");
+ return {};
+ }
+ CurBlock = SinglePredecessor;
+ }
+ Blocks[0] = CurBlock;
+ return Blocks;
+}
+
+bool processPhi(PHINode &Phi, const TargetLibraryInfo &TLI, AliasAnalysis &AA,
+ DomTreeUpdater &DTU) {
+ LLVM_DEBUG(dbgs() << "processPhi()\n");
+ if (Phi.getNumIncomingValues() <= 1) {
+ LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
+ return false;
+ }
+ // We are looking for something that has the following structure:
+ // bb1 --eq--> bb2 --eq--> bb3 --eq--> bb4 --+
+ // \ \ \ \
+ // ne ne ne \
+ // \ \ \ v
+ // +------------+-----------+----------> bb_phi
+ //
+ // - The last basic block (bb4 here) must branch unconditionally to bb_phi.
+ // It's the only block that contributes a non-constant value to the Phi.
+ // - All other blocks (b1, b2, b3) must have exactly two successors, one of
+ // them being the phi block.
+ // - All intermediate blocks (bb2, bb3) must have only one predecessor.
+ // - Blocks cannot do other work besides the comparison, see doesOtherWork()
+
+ // The blocks are not necessarily ordered in the phi, so we start from the
+ // last block and reconstruct the order.
+ BasicBlock *LastBlock = nullptr;
+ for (unsigned I = 0; I < Phi.getNumIncomingValues(); ++I) {
+ if (isa<ConstantInt>(Phi.getIncomingValue(I))) continue;
+ if (LastBlock) {
+ // There are several non-constant values.
+ LLVM_DEBUG(dbgs() << "skip: several non-constant values\n");
+ return false;
+ }
+ if (!isa<ICmpInst>(Phi.getIncomingValue(I)) ||
+ cast<ICmpInst>(Phi.getIncomingValue(I))->getParent() !=
+ Phi.getIncomingBlock(I)) {
+ // Non-constant incoming value is not from a cmp instruction or not
+ // produced by the last block. We could end up processing the value
+ // producing block more than once.
+ //
+ // This is an uncommon case, so we bail.
+ LLVM_DEBUG(
+ dbgs()
+ << "skip: non-constant value not from cmp or not from last block.\n");
+ return false;
+ }
+ LastBlock = Phi.getIncomingBlock(I);
+ }
+ if (!LastBlock) {
+ // There is no non-constant block.
+ LLVM_DEBUG(dbgs() << "skip: no non-constant block\n");
+ return false;
+ }
+ if (LastBlock->getSingleSuccessor() != Phi.getParent()) {
+ LLVM_DEBUG(dbgs() << "skip: last block non-phi successor\n");
+ return false;
+ }
+
+ const auto Blocks =
+ getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
+ if (Blocks.empty()) return false;
+ BCECmpChain CmpChain(Blocks, Phi, AA);
+
+ if (CmpChain.size() < 2) {
+ LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
+ return false;
+ }
+
+ return CmpChain.simplify(TLI, AA, DTU);
+}
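A typical source pattern that produces this exact shape is a field-by-field equality test over a struct, as in the minimal sketch below (names are made up, and whether the chain actually forms depends on the frontend and earlier passes). Each && clause usually lowers to its own block that loads one field from each side and feeds an icmp eq into the phi; when the fields are contiguous in memory, the whole chain can collapse into a single memcmp of the compared bytes (12 here, assuming a 4-byte int).

struct Point {
  int X, Y, Z;
};

// Each comparison typically becomes load/load/icmp in its own block,
// chained by conditional branches into one phi -- the shape described above.
bool pointsEqual(const Point &A, const Point &B) {
  return A.X == B.X && A.Y == B.Y && A.Z == B.Z;
}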
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ const TargetTransformInfo &TTI, AliasAnalysis &AA,
+ DominatorTree *DT) {
+ LLVM_DEBUG(dbgs() << "MergeICmpsLegacyPass: " << F.getName() << "\n");
+
+ // We only try merging comparisons if the target wants to expand memcmp later.
+ // The rationale is to avoid turning small chains into memcmp calls.
+ if (!TTI.enableMemCmpExpansion(F.hasOptSize(), true))
+ return false;
+
+  // If we don't have memcmp available, we can't emit calls to it.
+ if (!TLI.has(LibFunc_memcmp))
+ return false;
+
+ DomTreeUpdater DTU(DT, /*PostDominatorTree*/ nullptr,
+ DomTreeUpdater::UpdateStrategy::Eager);
+
+ bool MadeChange = false;
+
+ for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
+ // A Phi operation is always first in a basic block.
+ if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
+ MadeChange |= processPhi(*Phi, TLI, AA, DTU);
+ }
+
+ return MadeChange;
+}
+
+class MergeICmpsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ MergeICmpsLegacyPass() : FunctionPass(ID) {
+ initializeMergeICmpsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F)) return false;
+ const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ // MergeICmps does not need the DominatorTree, but we update it if it's
+ // already available.
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ return runImpl(F, TLI, TTI, AA, DTWP ? &DTWP->getDomTree() : nullptr);
+ }
+
+ private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+
+} // namespace
+
+char MergeICmpsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(MergeICmpsLegacyPass, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergeICmpsLegacyPass, "mergeicmps",
+ "Merge contiguous icmps into a memcmp", false, false)
+
+Pass *llvm::createMergeICmpsLegacyPass() { return new MergeICmpsLegacyPass(); }
+
+PreservedAnalyses MergeICmpsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ const bool MadeChanges = runImpl(F, TLI, TTI, AA, DT);
+ if (!MadeChanges)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index ba6dac8ae8..69aa0cebe1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -1,423 +1,423 @@
-//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//! \file
-//! This pass performs merges of loads and stores on both sides of a
-// diamond (hammock). It hoists the loads and sinks the stores.
-//
-// The algorithm iteratively hoists two loads to the same address out of a
-// diamond (hammock) and merges them into a single load in the header.
-// Similarly, it sinks and merges two stores to the tail block (footer). The
-// algorithm iterates over the instructions of one side of the diamond and
-// attempts to find a matching load/store on the other side. A new tail/footer
-// block may be inserted if the tail/footer block has more predecessors (not
-// only the two predecessors that form the diamond). It hoists / sinks when it
-// thinks it is safe to do so. This optimization helps with e.g. hiding load
-// latencies, triggering if-conversion, and reducing static code size.
-//
-// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-// Example:
-// Diamond shaped code before merge:
-//
-// header:
-// br %cond, label %if.then, label %if.else
-// + +
-// + +
-// + +
-// if.then: if.else:
-// %lt = load %addr_l %le = load %addr_l
-// <use %lt> <use %le>
-// <...> <...>
-// store %st, %addr_s store %se, %addr_s
-// br label %if.end br label %if.end
-// + +
-// + +
-// + +
-// if.end ("footer"):
-// <...>
-//
-// Diamond shaped code after merge:
-//
-// header:
-// %l = load %addr_l
-// br %cond, label %if.then, label %if.else
-// + +
-// + +
-// + +
-// if.then: if.else:
-// <use %l> <use %l>
-// <...> <...>
-// br label %if.end br label %if.end
-// + +
-// + +
-// + +
-// if.end ("footer"):
-// %s.sink = phi [%st, if.then], [%se, if.else]
-// <...>
-// store %s.sink, %addr_s
-// <...>
-//
-//
-//===----------------------- TODO -----------------------------------------===//
-//
-// 1) Generalize to regions other than diamonds
-// 2) Be more aggressive merging memory operations
-// Note that both changes require register pressure control
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mldst-motion"
-
-namespace {
-//===----------------------------------------------------------------------===//
-// MergedLoadStoreMotion Pass
-//===----------------------------------------------------------------------===//
-class MergedLoadStoreMotion {
- AliasAnalysis *AA = nullptr;
-
- // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
- // where Size0 and Size1 are the #instructions on the two sides of
- // the diamond. The constant chosen here is arbitrary. Compiler Time
- // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
- const int MagicCompileTimeControl = 250;
-
- const bool SplitFooterBB;
-public:
- MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
- bool run(Function &F, AliasAnalysis &AA);
-
-private:
- BasicBlock *getDiamondTail(BasicBlock *BB);
- bool isDiamondHead(BasicBlock *BB);
- // Routines for sinking stores
- StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
- PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
- bool isStoreSinkBarrierInRange(const Instruction &Start,
- const Instruction &End, MemoryLocation Loc);
- bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
- void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
- StoreInst *ElseInst);
- bool mergeStores(BasicBlock *BB);
-};
-} // end anonymous namespace
-
-///
-/// Return tail block of a diamond.
-///
-BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
- assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
- return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
-}
-
-///
-/// True when BB is the head of a diamond (hammock)
-///
-bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
- if (!BB)
- return false;
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Succ0 = BI->getSuccessor(0);
- BasicBlock *Succ1 = BI->getSuccessor(1);
-
- if (!Succ0->getSinglePredecessor())
- return false;
- if (!Succ1->getSinglePredecessor())
- return false;
-
- BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
- BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
- // Ignore triangles.
- if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
- return false;
- return true;
-}
-
-
-///
-/// True when instruction is a sink barrier for a store
-/// located in Loc
-///
-/// Whenever an instruction could possibly read or modify the
-/// value being stored or protect against the store from
-/// happening it is considered a sink barrier.
-///
-bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
- const Instruction &End,
- MemoryLocation Loc) {
- for (const Instruction &Inst :
- make_range(Start.getIterator(), End.getIterator()))
- if (Inst.mayThrow())
- return true;
- return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
-}
-
-///
-/// Check if \p BB1 contains a store to the same address as \p Store0
-///
-/// \return The store in \p BB1 when it is safe to sink. Otherwise return nullptr.
-///
-StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
- StoreInst *Store0) {
- LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
- BasicBlock *BB0 = Store0->getParent();
- for (Instruction &Inst : reverse(*BB1)) {
- auto *Store1 = dyn_cast<StoreInst>(&Inst);
- if (!Store1)
- continue;
-
- MemoryLocation Loc0 = MemoryLocation::get(Store0);
- MemoryLocation Loc1 = MemoryLocation::get(Store1);
- if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
- !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
- !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
- return Store1;
- }
- }
- return nullptr;
-}
-
-///
-/// Create a PHI node in BB for the operands of S0 and S1
-///
-PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
- // Create a phi if the values mismatch.
- Value *Opd1 = S0->getValueOperand();
- Value *Opd2 = S1->getValueOperand();
- if (Opd1 == Opd2)
- return nullptr;
-
- auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
- &BB->front());
- NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
- NewPN->addIncoming(Opd1, S0->getParent());
- NewPN->addIncoming(Opd2, S1->getParent());
- return NewPN;
-}
-
-///
-/// Check if 2 stores can be sunk together with corresponding GEPs
-///
-bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
- StoreInst *S1) const {
- auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
- (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
- (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
-}
-
-///
-/// Merge two stores to same address and sink into \p BB
-///
-/// Also sinks GEP instruction computing the store address
-///
-void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
- StoreInst *S1) {
- // Only one definition?
- auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
- auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
- LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
- dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
- dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
- // Hoist the instruction.
- BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
- // Intersect optional metadata.
- S0->andIRFlags(S1);
- S0->dropUnknownNonDebugMetadata();
-
- // Create the new store to be inserted at the join point.
- StoreInst *SNew = cast<StoreInst>(S0->clone());
- Instruction *ANew = A0->clone();
- SNew->insertBefore(&*InsertPt);
- ANew->insertBefore(SNew);
-
- assert(S0->getParent() == A0->getParent());
- assert(S1->getParent() == A1->getParent());
-
- // New PHI operand? Use it.
- if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
- SNew->setOperand(0, NewPN);
- S0->eraseFromParent();
- S1->eraseFromParent();
- A0->replaceAllUsesWith(ANew);
- A0->eraseFromParent();
- A1->replaceAllUsesWith(ANew);
- A1->eraseFromParent();
-}
-
-///
-/// True when two stores are equivalent and can sink into the footer
-///
-/// Starting from a diamond head block, iterate over the instructions in one
-/// successor block and try to match a store in the second successor.
-///
-bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) {
-
- bool MergedStores = false;
- BasicBlock *TailBB = getDiamondTail(HeadBB);
- BasicBlock *SinkBB = TailBB;
- assert(SinkBB && "Footer of a diamond cannot be empty");
-
- succ_iterator SI = succ_begin(HeadBB);
- assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors");
- BasicBlock *Pred0 = *SI;
- ++SI;
- assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor");
- BasicBlock *Pred1 = *SI;
- // tail block of a diamond/hammock?
- if (Pred0 == Pred1)
- return false; // No.
- // bail out early if we can not merge into the footer BB
- if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3))
- return false;
- // #Instructions in Pred1 for Compile Time Control
- auto InstsNoDbg = Pred1->instructionsWithoutDebug();
- int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
- int NStores = 0;
-
- for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
- RBI != RBE;) {
-
- Instruction *I = &*RBI;
- ++RBI;
-
- // Don't sink non-simple (atomic, volatile) stores.
- auto *S0 = dyn_cast<StoreInst>(I);
- if (!S0 || !S0->isSimple())
- continue;
-
- ++NStores;
- if (NStores * Size1 >= MagicCompileTimeControl)
- break;
- if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
- if (!canSinkStoresAndGEPs(S0, S1))
-        // Don't attempt to sink below stores that had to stick around.
-        // But after removal of a store and some of its feeding
-        // instructions, search again from the beginning since the iterator
-        // is likely stale at this point.
- break;
-
- if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) {
- // We have more than 2 predecessors. Insert a new block
- // postdominating 2 predecessors we're going to sink from.
- SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split");
- if (!SinkBB)
- break;
- }
-
- MergedStores = true;
- sinkStoresAndGEPs(SinkBB, S0, S1);
- RBI = Pred0->rbegin();
- RBE = Pred0->rend();
- LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
- }
- }
- return MergedStores;
-}
-
-bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
- this->AA = &AA;
-
- bool Changed = false;
- LLVM_DEBUG(dbgs() << "Instruction Merger\n");
-
- // Merge unconditional branches, allowing PRE to catch more
- // optimization opportunities.
- // This loop doesn't care about newly inserted/split blocks
- // since they never will be diamond heads.
- for (BasicBlock &BB : make_early_inc_range(F))
- // Hoist equivalent loads and sink stores
- // outside diamonds when possible
- if (isDiamondHead(&BB))
- Changed |= mergeStores(&BB);
- return Changed;
-}
-
-namespace {
-class MergedLoadStoreMotionLegacyPass : public FunctionPass {
- const bool SplitFooterBB;
-public:
- static char ID; // Pass identification, replacement for typeid
- MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false)
- : FunctionPass(ID), SplitFooterBB(SplitFooterBB) {
- initializeMergedLoadStoreMotionLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- ///
- /// Run the transformation for each function
- ///
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- MergedLoadStoreMotion Impl(SplitFooterBB);
- return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
- }
-
-private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (!SplitFooterBB)
- AU.setPreservesCFG();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-char MergedLoadStoreMotionLegacyPass::ID = 0;
-} // anonymous namespace
-
-///
-/// createMergedLoadStoreMotionPass - The public interface to this file.
-///
-FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) {
- return new MergedLoadStoreMotionLegacyPass(SplitFooterBB);
-}
-
-INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
- "MergedLoadStoreMotion", false, false)
-
-PreservedAnalyses
-MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
- MergedLoadStoreMotion Impl(Options.SplitFooterBB);
- auto &AA = AM.getResult<AAManager>(F);
- if (!Impl.run(F, AA))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- if (!Options.SplitFooterBB)
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+//===- MergedLoadStoreMotion.cpp - merge and hoist/sink load/stores -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//! \file
+//! This pass performs merges of loads and stores on both sides of a
+// diamond (hammock). It hoists the loads and sinks the stores.
+//
+// The algorithm iteratively hoists two loads to the same address out of a
+// diamond (hammock) and merges them into a single load in the header.
+// Similarly, it sinks and merges two stores to the tail block (footer). The
+// algorithm iterates over the instructions of one side of the diamond and
+// attempts to find a matching load/store on the other side. A new tail/footer
+// block may be inserted if the tail/footer block has more predecessors (not
+// only the two predecessors that form the diamond). It hoists / sinks when it
+// thinks it is safe to do so. This optimization helps with e.g. hiding load
+// latencies, triggering if-conversion, and reducing static code size.
+//
+// NOTE: This code no longer performs load hoisting, it is subsumed by GVNHoist.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+// Example:
+// Diamond shaped code before merge:
+//
+// header:
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// %lt = load %addr_l %le = load %addr_l
+// <use %lt> <use %le>
+// <...> <...>
+// store %st, %addr_s store %se, %addr_s
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// <...>
+//
+// Diamond shaped code after merge:
+//
+// header:
+// %l = load %addr_l
+// br %cond, label %if.then, label %if.else
+// + +
+// + +
+// + +
+// if.then: if.else:
+// <use %l> <use %l>
+// <...> <...>
+// br label %if.end br label %if.end
+// + +
+// + +
+// + +
+// if.end ("footer"):
+// %s.sink = phi [%st, if.then], [%se, if.else]
+// <...>
+// store %s.sink, %addr_s
+// <...>
+//
+//
+//===----------------------- TODO -----------------------------------------===//
+//
+// 1) Generalize to regions other than diamonds
+// 2) Be more aggressive merging memory operations
+// Note that both changes require register pressure control
+//
+//===----------------------------------------------------------------------===//
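A minimal C++ shape that lowers to the diamond sketched above (illustrative only; whether the sink actually happens also depends on alias analysis and the legality checks implemented below): both branches store to the same address, so a single store can be kept in the footer, fed by a phi that selects between the two values.

// Both sides of the if/else store to *Out; the stores can be merged into a
// single store in the join block, with a phi choosing the stored value.
void selectStore(bool Cond, int *Out, int A, int B) {
  if (Cond)
    *Out = A + 1;
  else
    *Out = B - 1;
}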
+
+#include "llvm/Transforms/Scalar/MergedLoadStoreMotion.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mldst-motion"
+
+namespace {
+//===----------------------------------------------------------------------===//
+// MergedLoadStoreMotion Pass
+//===----------------------------------------------------------------------===//
+class MergedLoadStoreMotion {
+ AliasAnalysis *AA = nullptr;
+
+ // The mergeLoad/Store algorithms could have Size0 * Size1 complexity,
+ // where Size0 and Size1 are the #instructions on the two sides of
+ // the diamond. The constant chosen here is arbitrary. Compiler Time
+ // Control is enforced by the check Size0 * Size1 < MagicCompileTimeControl.
+ const int MagicCompileTimeControl = 250;
+
+ const bool SplitFooterBB;
+public:
+ MergedLoadStoreMotion(bool SplitFooterBB) : SplitFooterBB(SplitFooterBB) {}
+ bool run(Function &F, AliasAnalysis &AA);
+
+private:
+ BasicBlock *getDiamondTail(BasicBlock *BB);
+ bool isDiamondHead(BasicBlock *BB);
+ // Routines for sinking stores
+ StoreInst *canSinkFromBlock(BasicBlock *BB, StoreInst *SI);
+ PHINode *getPHIOperand(BasicBlock *BB, StoreInst *S0, StoreInst *S1);
+ bool isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End, MemoryLocation Loc);
+ bool canSinkStoresAndGEPs(StoreInst *S0, StoreInst *S1) const;
+ void sinkStoresAndGEPs(BasicBlock *BB, StoreInst *SinkCand,
+ StoreInst *ElseInst);
+ bool mergeStores(BasicBlock *BB);
+};
+} // end anonymous namespace
+
+///
+/// Return tail block of a diamond.
+///
+BasicBlock *MergedLoadStoreMotion::getDiamondTail(BasicBlock *BB) {
+ assert(isDiamondHead(BB) && "Basic block is not head of a diamond");
+ return BB->getTerminator()->getSuccessor(0)->getSingleSuccessor();
+}
+
+///
+/// True when BB is the head of a diamond (hammock)
+///
+bool MergedLoadStoreMotion::isDiamondHead(BasicBlock *BB) {
+ if (!BB)
+ return false;
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Succ0 = BI->getSuccessor(0);
+ BasicBlock *Succ1 = BI->getSuccessor(1);
+
+ if (!Succ0->getSinglePredecessor())
+ return false;
+ if (!Succ1->getSinglePredecessor())
+ return false;
+
+ BasicBlock *Succ0Succ = Succ0->getSingleSuccessor();
+ BasicBlock *Succ1Succ = Succ1->getSingleSuccessor();
+ // Ignore triangles.
+ if (!Succ0Succ || !Succ1Succ || Succ0Succ != Succ1Succ)
+ return false;
+ return true;
+}
+
+
+///
+/// True when instruction is a sink barrier for a store
+/// located in Loc
+///
+/// Whenever an instruction could possibly read or modify the
+/// value being stored or protect against the store from
+/// happening it is considered a sink barrier.
+///
+bool MergedLoadStoreMotion::isStoreSinkBarrierInRange(const Instruction &Start,
+ const Instruction &End,
+ MemoryLocation Loc) {
+ for (const Instruction &Inst :
+ make_range(Start.getIterator(), End.getIterator()))
+ if (Inst.mayThrow())
+ return true;
+ return AA->canInstructionRangeModRef(Start, End, Loc, ModRefInfo::ModRef);
+}
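A concrete illustration of a barrier, assuming nothing is known about aliasing between the two pointers (the sketch is not from this file; the names are invented): the read through Q after the store in the then-branch may touch the stored location, so the range query above reports a possible reference and the store cannot be sunk past it into the footer.

// The load through Q between the store and the end of the block is a sink
// barrier whenever P and Q may alias; the store to *P has to stay put.
int barrierExample(bool Cond, int *P, int *Q, int A) {
  int R = 0;
  if (Cond) {
    *P = A;
    R = *Q; // may observe the value just stored through P
  } else {
    *P = A + 1;
  }
  return R;
}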
+
+///
+/// Check if \p BB1 contains a store to the same address as \p Store0
+///
+/// \return The store in \p BB1 when it is safe to sink. Otherwise return nullptr.
+///
+StoreInst *MergedLoadStoreMotion::canSinkFromBlock(BasicBlock *BB1,
+ StoreInst *Store0) {
+ LLVM_DEBUG(dbgs() << "can Sink? : "; Store0->dump(); dbgs() << "\n");
+ BasicBlock *BB0 = Store0->getParent();
+ for (Instruction &Inst : reverse(*BB1)) {
+ auto *Store1 = dyn_cast<StoreInst>(&Inst);
+ if (!Store1)
+ continue;
+
+ MemoryLocation Loc0 = MemoryLocation::get(Store0);
+ MemoryLocation Loc1 = MemoryLocation::get(Store1);
+ if (AA->isMustAlias(Loc0, Loc1) && Store0->isSameOperationAs(Store1) &&
+ !isStoreSinkBarrierInRange(*Store1->getNextNode(), BB1->back(), Loc1) &&
+ !isStoreSinkBarrierInRange(*Store0->getNextNode(), BB0->back(), Loc0)) {
+ return Store1;
+ }
+ }
+ return nullptr;
+}
+
+///
+/// Create a PHI node in BB for the operands of S0 and S1
+///
+PHINode *MergedLoadStoreMotion::getPHIOperand(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Create a phi if the values mismatch.
+ Value *Opd1 = S0->getValueOperand();
+ Value *Opd2 = S1->getValueOperand();
+ if (Opd1 == Opd2)
+ return nullptr;
+
+ auto *NewPN = PHINode::Create(Opd1->getType(), 2, Opd2->getName() + ".sink",
+ &BB->front());
+ NewPN->applyMergedLocation(S0->getDebugLoc(), S1->getDebugLoc());
+ NewPN->addIncoming(Opd1, S0->getParent());
+ NewPN->addIncoming(Opd2, S1->getParent());
+ return NewPN;
+}
+
+///
+/// Check if 2 stores can be sunk together with corresponding GEPs
+///
+bool MergedLoadStoreMotion::canSinkStoresAndGEPs(StoreInst *S0,
+ StoreInst *S1) const {
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ return A0 && A1 && A0->isIdenticalTo(A1) && A0->hasOneUse() &&
+ (A0->getParent() == S0->getParent()) && A1->hasOneUse() &&
+ (A1->getParent() == S1->getParent()) && isa<GetElementPtrInst>(A0);
+}
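The single-use, identical-GEP requirement corresponds to source like the following sketch (hypothetical names; actual applicability depends on how the frontend emits the address computation): both branches compute the same field address and store through it, so the address instruction and the store can be sunk into the footer together.

struct Pair {
  int First;
  int Second;
};

// The address of P->Second is the same GEP on both sides and is used only by
// its store, so the checks above are typically satisfied.
void setSecond(bool Cond, Pair *P, int A, int B) {
  if (Cond)
    P->Second = A;
  else
    P->Second = B;
}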
+
+///
+/// Merge two stores to same address and sink into \p BB
+///
+/// Also sinks GEP instruction computing the store address
+///
+void MergedLoadStoreMotion::sinkStoresAndGEPs(BasicBlock *BB, StoreInst *S0,
+ StoreInst *S1) {
+ // Only one definition?
+ auto *A0 = dyn_cast<Instruction>(S0->getPointerOperand());
+ auto *A1 = dyn_cast<Instruction>(S1->getPointerOperand());
+ LLVM_DEBUG(dbgs() << "Sink Instruction into BB \n"; BB->dump();
+ dbgs() << "Instruction Left\n"; S0->dump(); dbgs() << "\n";
+ dbgs() << "Instruction Right\n"; S1->dump(); dbgs() << "\n");
+ // Hoist the instruction.
+ BasicBlock::iterator InsertPt = BB->getFirstInsertionPt();
+ // Intersect optional metadata.
+ S0->andIRFlags(S1);
+ S0->dropUnknownNonDebugMetadata();
+
+ // Create the new store to be inserted at the join point.
+ StoreInst *SNew = cast<StoreInst>(S0->clone());
+ Instruction *ANew = A0->clone();
+ SNew->insertBefore(&*InsertPt);
+ ANew->insertBefore(SNew);
+
+ assert(S0->getParent() == A0->getParent());
+ assert(S1->getParent() == A1->getParent());
+
+ // New PHI operand? Use it.
+ if (PHINode *NewPN = getPHIOperand(BB, S0, S1))
+ SNew->setOperand(0, NewPN);
+ S0->eraseFromParent();
+ S1->eraseFromParent();
+ A0->replaceAllUsesWith(ANew);
+ A0->eraseFromParent();
+ A1->replaceAllUsesWith(ANew);
+ A1->eraseFromParent();
+}
+
+///
+/// True when two stores are equivalent and can sink into the footer
+///
+/// Starting from a diamond head block, iterate over the instructions in one
+/// successor block and try to match a store in the second successor.
+///
+bool MergedLoadStoreMotion::mergeStores(BasicBlock *HeadBB) {
+
+ bool MergedStores = false;
+ BasicBlock *TailBB = getDiamondTail(HeadBB);
+ BasicBlock *SinkBB = TailBB;
+ assert(SinkBB && "Footer of a diamond cannot be empty");
+
+ succ_iterator SI = succ_begin(HeadBB);
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have zero successors");
+ BasicBlock *Pred0 = *SI;
+ ++SI;
+ assert(SI != succ_end(HeadBB) && "Diamond head cannot have single successor");
+ BasicBlock *Pred1 = *SI;
+ // tail block of a diamond/hammock?
+ if (Pred0 == Pred1)
+ return false; // No.
+ // bail out early if we can not merge into the footer BB
+ if (!SplitFooterBB && TailBB->hasNPredecessorsOrMore(3))
+ return false;
+ // #Instructions in Pred1 for Compile Time Control
+ auto InstsNoDbg = Pred1->instructionsWithoutDebug();
+ int Size1 = std::distance(InstsNoDbg.begin(), InstsNoDbg.end());
+ int NStores = 0;
+
+ for (BasicBlock::reverse_iterator RBI = Pred0->rbegin(), RBE = Pred0->rend();
+ RBI != RBE;) {
+
+ Instruction *I = &*RBI;
+ ++RBI;
+
+ // Don't sink non-simple (atomic, volatile) stores.
+ auto *S0 = dyn_cast<StoreInst>(I);
+ if (!S0 || !S0->isSimple())
+ continue;
+
+ ++NStores;
+ if (NStores * Size1 >= MagicCompileTimeControl)
+ break;
+ if (StoreInst *S1 = canSinkFromBlock(Pred1, S0)) {
+ if (!canSinkStoresAndGEPs(S0, S1))
+        // Don't attempt to sink below stores that had to stick around.
+        // But after removal of a store and some of its feeding
+        // instructions, search again from the beginning since the iterator
+        // is likely stale at this point.
+ break;
+
+ if (SinkBB == TailBB && TailBB->hasNPredecessorsOrMore(3)) {
+ // We have more than 2 predecessors. Insert a new block
+ // postdominating 2 predecessors we're going to sink from.
+ SinkBB = SplitBlockPredecessors(TailBB, {Pred0, Pred1}, ".sink.split");
+ if (!SinkBB)
+ break;
+ }
+
+ MergedStores = true;
+ sinkStoresAndGEPs(SinkBB, S0, S1);
+ RBI = Pred0->rbegin();
+ RBE = Pred0->rend();
+ LLVM_DEBUG(dbgs() << "Search again\n"; Instruction *I = &*RBI; I->dump());
+ }
+ }
+ return MergedStores;
+}
+
+bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
+ this->AA = &AA;
+
+ bool Changed = false;
+ LLVM_DEBUG(dbgs() << "Instruction Merger\n");
+
+ // Merge unconditional branches, allowing PRE to catch more
+ // optimization opportunities.
+ // This loop doesn't care about newly inserted/split blocks
+ // since they never will be diamond heads.
+ for (BasicBlock &BB : make_early_inc_range(F))
+ // Hoist equivalent loads and sink stores
+ // outside diamonds when possible
+ if (isDiamondHead(&BB))
+ Changed |= mergeStores(&BB);
+ return Changed;
+}
+
+namespace {
+class MergedLoadStoreMotionLegacyPass : public FunctionPass {
+ const bool SplitFooterBB;
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MergedLoadStoreMotionLegacyPass(bool SplitFooterBB = false)
+ : FunctionPass(ID), SplitFooterBB(SplitFooterBB) {
+ initializeMergedLoadStoreMotionLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ ///
+ /// Run the transformation for each function
+ ///
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ MergedLoadStoreMotion Impl(SplitFooterBB);
+ return Impl.run(F, getAnalysis<AAResultsWrapperPass>().getAAResults());
+ }
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (!SplitFooterBB)
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+char MergedLoadStoreMotionLegacyPass::ID = 0;
+} // anonymous namespace
+
+///
+/// createMergedLoadStoreMotionPass - The public interface to this file.
+///
+FunctionPass *llvm::createMergedLoadStoreMotionPass(bool SplitFooterBB) {
+ return new MergedLoadStoreMotionLegacyPass(SplitFooterBB);
+}
+
+INITIALIZE_PASS_BEGIN(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MergedLoadStoreMotionLegacyPass, "mldst-motion",
+ "MergedLoadStoreMotion", false, false)
+
+PreservedAnalyses
+MergedLoadStoreMotionPass::run(Function &F, FunctionAnalysisManager &AM) {
+ MergedLoadStoreMotion Impl(Options.SplitFooterBB);
+ auto &AA = AM.getResult<AAManager>(F);
+ if (!Impl.run(F, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ if (!Options.SplitFooterBB)
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
index bb49b06b35..32bb62129e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -1,228 +1,228 @@
-//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass reassociates n-ary add expressions and eliminates the redundancy
-// exposed by the reassociation.
-//
-// A motivating example:
-//
-// void foo(int a, int b) {
-// bar(a + b);
-// bar((a + 2) + b);
-// }
-//
-// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
-// the above code to
-//
-// int t = a + b;
-// bar(t);
-// bar(t + 2);
-//
-// However, the Reassociate pass is unable to do that because it processes each
-// instruction individually and believes (a + 2) + b is the best form according
-// to its rank system.
-//
-// To address this limitation, NaryReassociate reassociates an expression in a
-// form that reuses existing instructions. As a result, NaryReassociate can
-// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
-// (a + b) is computed before.
-//
-// NaryReassociate works as follows. For every instruction in the form of (a +
-// b) + c, it checks whether a + c or b + c is already computed by a dominating
-// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
-// c) + a and removes the redundancy accordingly. To efficiently look up whether
-// an expression is computed before, we store each instruction seen and its SCEV
-// into an SCEV-to-instruction map.
-//
-// Although the algorithm pattern-matches only ternary additions, it
-// automatically handles many >3-ary expressions by walking through the function
-// in the depth-first order. For example, given
-//
-// (a + c) + d
-// ((a + b) + c) + d
-//
-// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
-// ((a + c) + b) + d into ((a + c) + d) + b.
-//
-// Finally, the above dominator-based algorithm may need to be run multiple
-// iterations before emitting optimal code. One source of this need is that we
-// only split an operand when it is used only once. The above algorithm can
-// eliminate an instruction and decrease the usage count of its operands. As a
-// result, an instruction that previously had multiple uses may become a
-// single-use instruction and thus eligible for split consideration. For
-// example,
-//
-// ac = a + c
-// ab = a + b
-// abc = ab + c
-// ab2 = ab + b
-// ab2c = ab2 + c
-//
-// In the first iteration, we cannot reassociate abc to ac+b because ab is used
-// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
-// result, ab2 becomes dead and ab will be used only once in the second
-// iteration.
-//
-// Limitations and TODO items:
-//
-// 1) We only consider n-ary adds and muls for now. This should be extended
-// and generalized.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/NaryReassociate.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "nary-reassociate"
-
-namespace {
-
-class NaryReassociateLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- NaryReassociateLegacyPass() : FunctionPass(ID) {
- initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override {
- return false;
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-
-private:
- NaryReassociatePass Impl;
-};
-
-} // end anonymous namespace
-
-char NaryReassociateLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
- "Nary reassociation", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
- "Nary reassociation", false, false)
-
-FunctionPass *llvm::createNaryReassociatePass() {
- return new NaryReassociateLegacyPass();
-}
-
-bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
-}
-
-PreservedAnalyses NaryReassociatePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- if (!runImpl(F, AC, DT, SE, TLI, TTI))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<ScalarEvolutionAnalysis>();
- return PA;
-}
-
-bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
- DominatorTree *DT_, ScalarEvolution *SE_,
- TargetLibraryInfo *TLI_,
- TargetTransformInfo *TTI_) {
- AC = AC_;
- DT = DT_;
- SE = SE_;
- TLI = TLI_;
- TTI = TTI_;
- DL = &F.getParent()->getDataLayout();
-
- bool Changed = false, ChangedInThisIteration;
- do {
- ChangedInThisIteration = doOneIteration(F);
- Changed |= ChangedInThisIteration;
- } while (ChangedInThisIteration);
- return Changed;
-}
-
-bool NaryReassociatePass::doOneIteration(Function &F) {
- bool Changed = false;
- SeenExprs.clear();
- // Process the basic blocks in a depth first traversal of the dominator
- // tree. This order ensures that all bases of a candidate are in Candidates
- // when we process it.
+//===- NaryReassociate.cpp - Reassociate n-ary expressions ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates n-ary add expressions and eliminates the redundancy
+// exposed by the reassociation.
+//
+// A motivating example:
+//
+// void foo(int a, int b) {
+// bar(a + b);
+// bar((a + 2) + b);
+// }
+//
+// An ideal compiler should reassociate (a + 2) + b to (a + b) + 2 and simplify
+// the above code to
+//
+// int t = a + b;
+// bar(t);
+// bar(t + 2);
+//
+// However, the Reassociate pass is unable to do that because it processes each
+// instruction individually and believes (a + 2) + b is the best form according
+// to its rank system.
+//
+// To address this limitation, NaryReassociate reassociates an expression in a
+// form that reuses existing instructions. As a result, NaryReassociate can
+// reassociate (a + 2) + b in the example to (a + b) + 2 because it detects that
+// (a + b) is computed before.
+//
+// NaryReassociate works as follows. For every instruction in the form of (a +
+// b) + c, it checks whether a + c or b + c is already computed by a dominating
+// instruction. If so, it then reassociates (a + b) + c into (a + c) + b or (b +
+// c) + a and removes the redundancy accordingly. To efficiently look up whether
+// an expression is computed before, we store each instruction seen and its SCEV
+// into an SCEV-to-instruction map.
+//
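+// A rough sketch of that lookup, in hypothetical C-like pseudocode (the value
+// names are illustrative and not taken from the pass):
+//
+//   t = a + c;        // seen earlier; SeenExprs[SCEV(a + c)] records t
+//   ...
+//   x = (a + b) + c;  // ask SeenExprs for SCEV(a + c) and SCEV(b + c);
+//                     // the hit on t lets x be rewritten to t + b
+//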
+// Although the algorithm pattern-matches only ternary additions, it
+// automatically handles many >3-ary expressions by walking through the function
+// in depth-first order. For example, given
+//
+// (a + c) + d
+// ((a + b) + c) + d
+//
+// NaryReassociate first rewrites (a + b) + c to (a + c) + b, and then rewrites
+// ((a + c) + b) + d into ((a + c) + d) + b.
+//
+// Finally, the above dominator-based algorithm may need to be run for multiple
+// iterations before emitting optimal code. One source of this need is that we
+// only split an operand when it is used only once. The above algorithm can
+// eliminate an instruction and decrease the usage count of its operands. As a
+// result, an instruction that previously had multiple uses may become a
+// single-use instruction and thus eligible for split consideration. For
+// example,
+//
+// ac = a + c
+// ab = a + b
+// abc = ab + c
+// ab2 = ab + b
+// ab2c = ab2 + c
+//
+// In the first iteration, we cannot reassociate abc to ac+b because ab is used
+// twice. However, we can reassociate ab2c to abc+b in the first iteration. As a
+// result, ab2 becomes dead and ab will be used only once in the second
+// iteration.
+//
+// Limitations and TODO items:
+//
+// 1) We only consider n-ary adds and muls for now. This should be extended
+// and generalized.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/NaryReassociate.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "nary-reassociate"
+
+namespace {
+
+class NaryReassociateLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ NaryReassociateLegacyPass() : FunctionPass(ID) {
+ initializeNaryReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+private:
+ NaryReassociatePass Impl;
+};
+
+} // end anonymous namespace
+
+char NaryReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(NaryReassociateLegacyPass, "nary-reassociate",
+ "Nary reassociation", false, false)
+
+FunctionPass *llvm::createNaryReassociatePass() {
+ return new NaryReassociateLegacyPass();
+}
+
+bool NaryReassociateLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ return Impl.runImpl(F, AC, DT, SE, TLI, TTI);
+}
+
+PreservedAnalyses NaryReassociatePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ if (!runImpl(F, AC, DT, SE, TLI, TTI))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ return PA;
+}
+
+bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
+ DominatorTree *DT_, ScalarEvolution *SE_,
+ TargetLibraryInfo *TLI_,
+ TargetTransformInfo *TTI_) {
+ AC = AC_;
+ DT = DT_;
+ SE = SE_;
+ TLI = TLI_;
+ TTI = TTI_;
+ DL = &F.getParent()->getDataLayout();
+
+ bool Changed = false, ChangedInThisIteration;
+ do {
+ ChangedInThisIteration = doOneIteration(F);
+ Changed |= ChangedInThisIteration;
+ } while (ChangedInThisIteration);
+ return Changed;
+}
+
+bool NaryReassociatePass::doOneIteration(Function &F) {
+ bool Changed = false;
+ SeenExprs.clear();
+ // Process the basic blocks in a depth first traversal of the dominator
+ // tree. This order ensures that all bases of a candidate are in Candidates
+ // when we process it.
SmallVector<WeakTrackingVH, 16> DeadInsts;
- for (const auto Node : depth_first(DT)) {
- BasicBlock *BB = Node->getBlock();
- for (auto I = BB->begin(); I != BB->end(); ++I) {
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ++I) {
Instruction *OrigI = &*I;
const SCEV *OrigSCEV = nullptr;
if (Instruction *NewI = tryReassociate(OrigI, OrigSCEV)) {
@@ -236,307 +236,307 @@ bool NaryReassociatePass::doOneIteration(Function &F) {
const SCEV *NewSCEV = SE->getSCEV(NewI);
SeenExprs[NewSCEV].push_back(WeakTrackingVH(NewI));
- // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
- // is equivalent to I. However, ScalarEvolution::getSCEV may
+ // Ideally, NewSCEV should equal OldSCEV because tryReassociate(I)
+ // is equivalent to I. However, ScalarEvolution::getSCEV may
// weaken nsw causing NewSCEV not to equal OldSCEV. For example,
// suppose we reassociate
- // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
- // to
- // NewI = &a[sext(i)] + sext(j).
- //
- // ScalarEvolution computes
- // getSCEV(I) = a + 4 * sext(i + j)
- // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
- // which are different SCEVs.
- //
- // To alleviate this issue of ScalarEvolution not always capturing
- // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
- // map both SCEV before and after tryReassociate(I) to I.
- //
+ // I = &a[sext(i +nsw j)] // assuming sizeof(a[0]) = 4
+ // to
+ // NewI = &a[sext(i)] + sext(j).
+ //
+ // ScalarEvolution computes
+ // getSCEV(I) = a + 4 * sext(i + j)
+ // getSCEV(newI) = a + 4 * sext(i) + 4 * sext(j)
+ // which are different SCEVs.
+ //
+ // To alleviate this issue of ScalarEvolution not always capturing
+ // equivalence, we add I to SeenExprs[OldSCEV] as well so that we can
+ // map both SCEV before and after tryReassociate(I) to I.
+ //
// This improvement is exercised in @reassociate_gep_nsw in
// nary-gep.ll.
if (NewSCEV != OrigSCEV)
SeenExprs[OrigSCEV].push_back(WeakTrackingVH(NewI));
} else if (OrigSCEV)
SeenExprs[OrigSCEV].push_back(WeakTrackingVH(OrigI));
- }
- }
+ }
+ }
// Delete all dead instructions from 'DeadInsts'.
// Please note ScalarEvolution is updated along the way.
RecursivelyDeleteTriviallyDeadInstructionsPermissive(
DeadInsts, TLI, nullptr, [this](Value *V) { SE->forgetValue(V); });
- return Changed;
-}
-
+ return Changed;
+}
+
Instruction *NaryReassociatePass::tryReassociate(Instruction * I,
const SCEV *&OrigSCEV) {
if (!SE->isSCEVable(I->getType()))
return nullptr;
- switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Mul:
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ case Instruction::Mul:
OrigSCEV = SE->getSCEV(I);
- return tryReassociateBinaryOp(cast<BinaryOperator>(I));
- case Instruction::GetElementPtr:
+ return tryReassociateBinaryOp(cast<BinaryOperator>(I));
+ case Instruction::GetElementPtr:
OrigSCEV = SE->getSCEV(I);
- return tryReassociateGEP(cast<GetElementPtrInst>(I));
- default:
+ return tryReassociateGEP(cast<GetElementPtrInst>(I));
+ default:
return nullptr;
- }
+ }
llvm_unreachable("should not be reached");
return nullptr;
-}
-
-static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI) {
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
SmallVector<const Value *, 4> Indices(GEP->indices());
- return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
- Indices) == TargetTransformInfo::TCC_Free;
-}
-
-Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
- // Not worth reassociating GEP if it is foldable.
- if (isGEPFoldable(GEP, TTI))
- return nullptr;
-
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
- GTI.getIndexedType())) {
- return NewGEP;
- }
- }
- }
- return nullptr;
-}
-
-bool NaryReassociatePass::requiresSignExtension(Value *Index,
- GetElementPtrInst *GEP) {
- unsigned PointerSizeInBits =
- DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
- return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
-}
-
-GetElementPtrInst *
-NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Type *IndexedType) {
- Value *IndexToSplit = GEP->getOperand(I + 1);
- if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
- IndexToSplit = SExt->getOperand(0);
- } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
- // zext can be treated as sext if the source is non-negative.
- if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
- IndexToSplit = ZExt->getOperand(0);
- }
-
- if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
- // If the I-th index needs sext and the underlying add is not equipped with
- // nsw, we cannot split the add because
- // sext(LHS + RHS) != sext(LHS) + sext(RHS).
- if (requiresSignExtension(IndexToSplit, GEP) &&
- computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
- OverflowResult::NeverOverflows)
- return nullptr;
-
- Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
- // IndexToSplit = LHS + RHS.
- if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
- return NewGEP;
- // Symmetrically, try IndexToSplit = RHS + LHS.
- if (LHS != RHS) {
- if (auto *NewGEP =
- tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
- return NewGEP;
- }
- }
- return nullptr;
-}
-
-GetElementPtrInst *
-NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
- unsigned I, Value *LHS,
- Value *RHS, Type *IndexedType) {
- // Look for GEP's closest dominator that has the same SCEV as GEP except that
- // the I-th index is replaced with LHS.
- SmallVector<const SCEV *, 4> IndexExprs;
- for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
- IndexExprs.push_back(SE->getSCEV(*Index));
- // Replace the I-th index with LHS.
- IndexExprs[I] = SE->getSCEV(LHS);
- if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+Instruction *NaryReassociatePass::tryReassociateGEP(GetElementPtrInst *GEP) {
+ // Not worth reassociating GEP if it is foldable.
+ if (isGEPFoldable(GEP, TTI))
+ return nullptr;
+
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I - 1,
+ GTI.getIndexedType())) {
+ return NewGEP;
+ }
+ }
+ }
+ return nullptr;
+}
+
+bool NaryReassociatePass::requiresSignExtension(Value *Index,
+ GetElementPtrInst *GEP) {
+ unsigned PointerSizeInBits =
+ DL->getPointerSizeInBits(GEP->getType()->getPointerAddressSpace());
+ return cast<IntegerType>(Index->getType())->getBitWidth() < PointerSizeInBits;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Type *IndexedType) {
+ Value *IndexToSplit = GEP->getOperand(I + 1);
+ if (SExtInst *SExt = dyn_cast<SExtInst>(IndexToSplit)) {
+ IndexToSplit = SExt->getOperand(0);
+ } else if (ZExtInst *ZExt = dyn_cast<ZExtInst>(IndexToSplit)) {
+ // zext can be treated as sext if the source is non-negative.
+ if (isKnownNonNegative(ZExt->getOperand(0), *DL, 0, AC, GEP, DT))
+ IndexToSplit = ZExt->getOperand(0);
+ }
+
+ if (AddOperator *AO = dyn_cast<AddOperator>(IndexToSplit)) {
+ // If the I-th index needs sext and the underlying add is not equipped with
+ // nsw, we cannot split the add because
+ // sext(LHS + RHS) != sext(LHS) + sext(RHS).
+ if (requiresSignExtension(IndexToSplit, GEP) &&
+ computeOverflowForSignedAdd(AO, *DL, AC, GEP, DT) !=
+ OverflowResult::NeverOverflows)
+ return nullptr;
+
+ Value *LHS = AO->getOperand(0), *RHS = AO->getOperand(1);
+ // IndexToSplit = LHS + RHS.
+ if (auto *NewGEP = tryReassociateGEPAtIndex(GEP, I, LHS, RHS, IndexedType))
+ return NewGEP;
+ // Symmetrically, try IndexToSplit = RHS + LHS.
+ if (LHS != RHS) {
+ if (auto *NewGEP =
+ tryReassociateGEPAtIndex(GEP, I, RHS, LHS, IndexedType))
+ return NewGEP;
+ }
+ }
+ return nullptr;
+}
+
+GetElementPtrInst *
+NaryReassociatePass::tryReassociateGEPAtIndex(GetElementPtrInst *GEP,
+ unsigned I, Value *LHS,
+ Value *RHS, Type *IndexedType) {
+ // Look for GEP's closest dominator that has the same SCEV as GEP except that
+ // the I-th index is replaced with LHS.
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto Index = GEP->idx_begin(); Index != GEP->idx_end(); ++Index)
+ IndexExprs.push_back(SE->getSCEV(*Index));
+ // Replace the I-th index with LHS.
+ IndexExprs[I] = SE->getSCEV(LHS);
+ if (isKnownNonNegative(LHS, *DL, 0, AC, GEP, DT) &&
DL->getTypeSizeInBits(LHS->getType()).getFixedSize() <
DL->getTypeSizeInBits(GEP->getOperand(I)->getType()).getFixedSize()) {
- // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
- // zext if the source operand is proved non-negative. We should do that
- // consistently so that CandidateExpr is more likely to appear before. See
- // @reassociate_gep_assume for an example of this canonicalization.
- IndexExprs[I] =
- SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
- }
- const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
- IndexExprs);
-
- Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
- if (Candidate == nullptr)
- return nullptr;
-
- IRBuilder<> Builder(GEP);
- // Candidate does not necessarily have the same pointer type as GEP. Use
- // bitcast or pointer cast to make sure they have the same type, so that the
- // later RAUW doesn't complain.
- Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
- assert(Candidate->getType() == GEP->getType());
-
- // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
- uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
- Type *ElementType = GEP->getResultElementType();
- uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
- // Another less rare case: because I is not necessarily the last index of the
- // GEP, the size of the type at the I-th index (IndexedSize) is not
- // necessarily divisible by ElementSize. For example,
- //
- // #pragma pack(1)
- // struct S {
- // int a[3];
- // int64 b[8];
- // };
- // #pragma pack()
- //
- // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
- //
- // TODO: bail out on this case for now. We could emit uglygep.
- if (IndexedSize % ElementSize != 0)
- return nullptr;
-
- // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- if (RHS->getType() != IntPtrTy)
- RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
- if (IndexedSize != ElementSize) {
- RHS = Builder.CreateMul(
- RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
- }
- GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(
- Builder.CreateGEP(GEP->getResultElementType(), Candidate, RHS));
- NewGEP->setIsInBounds(GEP->isInBounds());
- NewGEP->takeName(GEP);
- return NewGEP;
-}
-
-Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- // There is no need to reassociate 0.
- if (SE->getSCEV(I)->isZero())
- return nullptr;
- if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
- return NewI;
- if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
- return NewI;
- return nullptr;
-}
-
-Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
- BinaryOperator *I) {
- Value *A = nullptr, *B = nullptr;
- // To be conservative, we reassociate I only when it is the only user of (A op
- // B).
- if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
- // I = (A op B) op RHS
- // = (A op RHS) op B or (B op RHS) op A
- const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
- const SCEV *RHSExpr = SE->getSCEV(RHS);
- if (BExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
- return NewI;
- }
- if (AExpr != RHSExpr) {
- if (auto *NewI =
- tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
- return NewI;
- }
- }
- return nullptr;
-}
-
-Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
- Value *RHS,
- BinaryOperator *I) {
- // Look for the closest dominator LHS of I that computes LHSExpr, and replace
- // I with LHS op RHS.
- auto *LHS = findClosestMatchingDominator(LHSExpr, I);
- if (LHS == nullptr)
- return nullptr;
-
- Instruction *NewI = nullptr;
- switch (I->getOpcode()) {
- case Instruction::Add:
- NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
- break;
- case Instruction::Mul:
- NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
- break;
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- NewI->takeName(I);
- return NewI;
-}
-
-bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
- Value *&Op1, Value *&Op2) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
- case Instruction::Mul:
- return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- return false;
-}
-
-const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
- const SCEV *LHS,
- const SCEV *RHS) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- return SE->getAddExpr(LHS, RHS);
- case Instruction::Mul:
- return SE->getMulExpr(LHS, RHS);
- default:
- llvm_unreachable("Unexpected instruction.");
- }
- return nullptr;
-}
-
-Instruction *
-NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
- Instruction *Dominatee) {
- auto Pos = SeenExprs.find(CandidateExpr);
- if (Pos == SeenExprs.end())
- return nullptr;
-
- auto &Candidates = Pos->second;
- // Because we process the basic blocks in pre-order of the dominator tree, a
- // candidate that doesn't dominate the current instruction won't dominate any
- // future instruction either. Therefore, we pop it out of the stack. This
- // optimization makes the algorithm O(n).
- while (!Candidates.empty()) {
- // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
- // removed during rewriting.
- if (Value *Candidate = Candidates.back()) {
- Instruction *CandidateInstruction = cast<Instruction>(Candidate);
- if (DT->dominates(CandidateInstruction, Dominatee))
- return CandidateInstruction;
- }
- Candidates.pop_back();
- }
- return nullptr;
-}
+ // Zero-extend LHS if it is non-negative. InstCombine canonicalizes sext to
+ // zext if the source operand is proved non-negative. We should do that
+ // consistently so that CandidateExpr is more likely to appear before. See
+ // @reassociate_gep_assume for an example of this canonicalization.
+ IndexExprs[I] =
+ SE->getZeroExtendExpr(IndexExprs[I], GEP->getOperand(I)->getType());
+ }
+ const SCEV *CandidateExpr = SE->getGEPExpr(cast<GEPOperator>(GEP),
+ IndexExprs);
+
+ Value *Candidate = findClosestMatchingDominator(CandidateExpr, GEP);
+ if (Candidate == nullptr)
+ return nullptr;
+
+ IRBuilder<> Builder(GEP);
+ // Candidate does not necessarily have the same pointer type as GEP. Use
+ // bitcast or pointer cast to make sure they have the same type, so that the
+ // later RAUW doesn't complain.
+ Candidate = Builder.CreateBitOrPointerCast(Candidate, GEP->getType());
+ assert(Candidate->getType() == GEP->getType());
+
+ // NewGEP = (char *)Candidate + RHS * sizeof(IndexedType)
+ uint64_t IndexedSize = DL->getTypeAllocSize(IndexedType);
+ Type *ElementType = GEP->getResultElementType();
+ uint64_t ElementSize = DL->getTypeAllocSize(ElementType);
+ // Another less rare case: because I is not necessarily the last index of the
+ // GEP, the size of the type at the I-th index (IndexedSize) is not
+ // necessarily divisible by ElementSize. For example,
+ //
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // sizeof(S) = 100 is indivisible by sizeof(int64) = 8.
+ //
+ // TODO: bail out on this case for now. We could emit uglygep.
+ if (IndexedSize % ElementSize != 0)
+ return nullptr;
+
+ // NewGEP = &Candidate[RHS * (sizeof(IndexedType) / sizeof(Candidate[0]))];
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (RHS->getType() != IntPtrTy)
+ RHS = Builder.CreateSExtOrTrunc(RHS, IntPtrTy);
+ if (IndexedSize != ElementSize) {
+ RHS = Builder.CreateMul(
+ RHS, ConstantInt::get(IntPtrTy, IndexedSize / ElementSize));
+ }
+ GetElementPtrInst *NewGEP = cast<GetElementPtrInst>(
+ Builder.CreateGEP(GEP->getResultElementType(), Candidate, RHS));
+ NewGEP->setIsInBounds(GEP->isInBounds());
+ NewGEP->takeName(GEP);
+ return NewGEP;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(BinaryOperator *I) {
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ // There is no need to reassociate 0.
+ if (SE->getSCEV(I)->isZero())
+ return nullptr;
+ if (auto *NewI = tryReassociateBinaryOp(LHS, RHS, I))
+ return NewI;
+ if (auto *NewI = tryReassociateBinaryOp(RHS, LHS, I))
+ return NewI;
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociateBinaryOp(Value *LHS, Value *RHS,
+ BinaryOperator *I) {
+ Value *A = nullptr, *B = nullptr;
+ // To be conservative, we reassociate I only when it is the only user of (A op
+ // B).
+ if (LHS->hasOneUse() && matchTernaryOp(I, LHS, A, B)) {
+ // I = (A op B) op RHS
+ // = (A op RHS) op B or (B op RHS) op A
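+ // A concrete (illustrative) case: for I = (a + b) + c, if some dominating
+ // instruction t = a + c is already recorded in SeenExprs, I is rewritten
+ // to t + b below.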
+ const SCEV *AExpr = SE->getSCEV(A), *BExpr = SE->getSCEV(B);
+ const SCEV *RHSExpr = SE->getSCEV(RHS);
+ if (BExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, AExpr, RHSExpr), B, I))
+ return NewI;
+ }
+ if (AExpr != RHSExpr) {
+ if (auto *NewI =
+ tryReassociatedBinaryOp(getBinarySCEV(I, BExpr, RHSExpr), A, I))
+ return NewI;
+ }
+ }
+ return nullptr;
+}
+
+Instruction *NaryReassociatePass::tryReassociatedBinaryOp(const SCEV *LHSExpr,
+ Value *RHS,
+ BinaryOperator *I) {
+ // Look for the closest dominator LHS of I that computes LHSExpr, and replace
+ // I with LHS op RHS.
+ auto *LHS = findClosestMatchingDominator(LHSExpr, I);
+ if (LHS == nullptr)
+ return nullptr;
+
+ Instruction *NewI = nullptr;
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ NewI = BinaryOperator::CreateAdd(LHS, RHS, "", I);
+ break;
+ case Instruction::Mul:
+ NewI = BinaryOperator::CreateMul(LHS, RHS, "", I);
+ break;
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ NewI->takeName(I);
+ return NewI;
+}
+
+bool NaryReassociatePass::matchTernaryOp(BinaryOperator *I, Value *V,
+ Value *&Op1, Value *&Op2) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return match(V, m_Add(m_Value(Op1), m_Value(Op2)));
+ case Instruction::Mul:
+ return match(V, m_Mul(m_Value(Op1), m_Value(Op2)));
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return false;
+}
+
+const SCEV *NaryReassociatePass::getBinarySCEV(BinaryOperator *I,
+ const SCEV *LHS,
+ const SCEV *RHS) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ return SE->getAddExpr(LHS, RHS);
+ case Instruction::Mul:
+ return SE->getMulExpr(LHS, RHS);
+ default:
+ llvm_unreachable("Unexpected instruction.");
+ }
+ return nullptr;
+}
+
+Instruction *
+NaryReassociatePass::findClosestMatchingDominator(const SCEV *CandidateExpr,
+ Instruction *Dominatee) {
+ auto Pos = SeenExprs.find(CandidateExpr);
+ if (Pos == SeenExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ // Candidates stores WeakTrackingVHs, so a candidate can be nullptr if it's
+ // removed during rewriting.
+ if (Value *Candidate = Candidates.back()) {
+ Instruction *CandidateInstruction = cast<Instruction>(Candidate);
+ if (DT->dominates(CandidateInstruction, Dominatee))
+ return CandidateInstruction;
+ }
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
index 330f3e9509..281d47c862 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/NewGVN.cpp
@@ -1,1564 +1,1564 @@
-//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file implements LLVM's new Global Value Numbering pass.
-/// GVN partitions values computed by a function into congruence classes.
-/// Values ending up in the same congruence class are guaranteed to be the same
-/// for every execution of the program. In that respect, congruency is a
-/// compile-time approximation of equivalence of values at runtime.
-/// The algorithm implemented here uses a sparse formulation and it's based
-/// on the ideas described in the paper:
-/// "A Sparse Algorithm for Predicated Global Value Numbering" from
-/// Karthik Gargi.
-///
-/// A brief overview of the algorithm: The algorithm is essentially the same as
-/// the standard RPO value numbering algorithm (a good reference is the paper
-/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
-/// The RPO algorithm proceeds, on every iteration, to process every reachable
-/// block and every instruction in that block. This is because the standard RPO
-/// algorithm does not track what things have the same value number, it only
-/// tracks what the value number of a given operation is (the mapping is
-/// operation -> value number). Thus, when a value number of an operation
-/// changes, it must reprocess everything to ensure all uses of a value number
-/// get updated properly. In contrast, the sparse algorithm we use *also*
-/// tracks what operations have a given value number (IE it also tracks the
-/// reverse mapping from value number -> operations with that value number), so
-/// that it only needs to reprocess the instructions that are affected when
-/// something's value number changes. The vast majority of complexity and code
-/// in this file is devoted to tracking what value numbers could change for what
-/// instructions when various things happen. The rest of the algorithm is
-/// devoted to performing symbolic evaluation, forward propagation, and
-/// simplification of operations based on the value numbers deduced so far.
-///
-/// In order to make the GVN mostly-complete, we use a technique derived from
-/// "Detection of Redundant Expressions: A Complete and Polynomial-time
-/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
-/// based GVN algorithms is related to their inability to detect equivalence
-/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
-/// We resolve this issue by generating the equivalent "phi of ops" form for
-/// each op of phis we see, in a way that only takes polynomial time to resolve.
-///
-/// We also do not perform elimination by using any published algorithm. All
-/// published algorithms are O(Instructions). Instead, we use a technique that
-/// is O(number of operations with the same value number), enabling us to skip
-/// trying to eliminate things that have unique value numbers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/NewGVN.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFGPrinter.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/ArrayRecycler.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/PointerLikeTypeTraits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/GVNExpression.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <memory>
-#include <set>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::GVNExpression;
-using namespace llvm::VNCoercion;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "newgvn"
-
-STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
-STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
-STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
-STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
-STATISTIC(NumGVNMaxIterations,
- "Maximum Number of iterations it took to converge GVN");
-STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
-STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
-STATISTIC(NumGVNAvoidedSortedLeaderChanges,
- "Number of avoided sorted leader changes");
-STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
-STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
-STATISTIC(NumGVNPHIOfOpsEliminations,
- "Number of things eliminated using PHI of ops");
-DEBUG_COUNTER(VNCounter, "newgvn-vn",
- "Controls which instructions are value numbered");
-DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
- "Controls which instructions we create phi of ops for");
-// Currently store defining access refinement is too slow due to basicaa being
-// egregiously slow. This flag lets us keep it working while we work on this
-// issue.
-static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
- cl::init(false), cl::Hidden);
-
-/// Currently, the generation of "phi of ops" can result in correctness issues.
-static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
- cl::Hidden);
-
-//===----------------------------------------------------------------------===//
-// GVN Pass
-//===----------------------------------------------------------------------===//
-
-// Anchor methods.
-namespace llvm {
-namespace GVNExpression {
-
-Expression::~Expression() = default;
-BasicExpression::~BasicExpression() = default;
-CallExpression::~CallExpression() = default;
-LoadExpression::~LoadExpression() = default;
-StoreExpression::~StoreExpression() = default;
-AggregateValueExpression::~AggregateValueExpression() = default;
-PHIExpression::~PHIExpression() = default;
-
-} // end namespace GVNExpression
-} // end namespace llvm
-
-namespace {
-
-// Tarjan's SCC finding algorithm with Nuutila's improvements
-// SCCIterator is actually fairly complex for the simple thing we want.
-// It also wants to hand us SCC's that are unrelated to the phi node we ask
-// about, and have us process them there or risk redoing work.
-// Graph traits over a filter iterator also doesn't work that well here.
-// This SCC finder is specialized to walk use-def chains, and only follows
-// instructions, not generic values (arguments, etc).
-struct TarjanSCC {
- TarjanSCC() : Components(1) {}
-
- void Start(const Instruction *Start) {
- if (Root.lookup(Start) == 0)
- FindSCC(Start);
- }
-
- const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
- unsigned ComponentID = ValueToComponent.lookup(V);
-
- assert(ComponentID > 0 &&
- "Asking for a component for a value we never processed");
- return Components[ComponentID];
- }
-
-private:
- void FindSCC(const Instruction *I) {
- Root[I] = ++DFSNum;
- // Store the DFS Number we had before it possibly gets incremented.
- unsigned int OurDFS = DFSNum;
- for (auto &Op : I->operands()) {
- if (auto *InstOp = dyn_cast<Instruction>(Op)) {
- if (Root.lookup(Op) == 0)
- FindSCC(InstOp);
- if (!InComponent.count(Op))
- Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
- }
- }
- // See if we really were the root of a component, by seeing if we still have
- // our DFSNumber. If we do, we are the root of the component, and we have
- // completed a component. If we do not, we are not the root of a component,
- // and belong on the component stack.
- if (Root.lookup(I) == OurDFS) {
- unsigned ComponentID = Components.size();
- Components.resize(Components.size() + 1);
- auto &Component = Components.back();
- Component.insert(I);
- LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
- InComponent.insert(I);
- ValueToComponent[I] = ComponentID;
- // Pop a component off the stack and label it.
- while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
- auto *Member = Stack.back();
- LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
- Component.insert(Member);
- InComponent.insert(Member);
- ValueToComponent[Member] = ComponentID;
- Stack.pop_back();
- }
- } else {
- // Part of a component, push to stack
- Stack.push_back(I);
- }
- }
-
- unsigned int DFSNum = 1;
- SmallPtrSet<const Value *, 8> InComponent;
- DenseMap<const Value *, unsigned int> Root;
- SmallVector<const Value *, 8> Stack;
-
- // Store the components as vector of ptr sets, because we need the topo order
- // of SCC's, but not individual member order
- SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
-
- DenseMap<const Value *, unsigned> ValueToComponent;
-};
-
-// Congruence classes represent the set of expressions/instructions
-// that are all the same *during some scope in the function*.
-// That is, because of the way we perform equality propagation, and
-// because of memory value numbering, it is not correct to assume
-// you can willy-nilly replace any member with any other at any
-// point in the function.
-//
-// For any Value in the Member set, it is valid to replace any dominated member
-// with that Value.
-//
-// Every congruence class has a leader, and the leader is used to symbolize
-// instructions in a canonical way (IE every operand of an instruction that is a
-// member of the same congruence class will always be replaced with leader
-// during symbolization). To simplify symbolization, we keep the leader as a
-// constant if the class can be proved to be a constant value. Otherwise, the
-// leader is the member of the value set with the smallest DFS number. Each
-// congruence class also has a defining expression, though the expression may be
-// null. If it exists, it can be used for forward propagation and reassociation
-// of values.
-
-// For memory, we also track a representative MemoryAccess, and a set of memory
-// members for MemoryPhis (which have no real instructions). Note that for
-// memory, it seems tempting to try to split the memory members into a
-// MemoryCongruenceClass or something. Unfortunately, this does not work
-// easily. The value numbering of a given memory expression depends on the
-// leader of the memory congruence class, and the leader of memory congruence
-// class depends on the value numbering of a given memory expression. This
-// leads to wasted propagation, and in some cases, missed optimization. For
-// example: If we had value numbered two stores together before, but now do not,
-// we move them to a new value congruence class. This in turn will move at one
-// of the memorydefs to a new memory congruence class. Which in turn, affects
-// the value numbering of the stores we just value numbered (because the memory
-// congruence class is part of the value number). So while theoretically
-// possible to split them up, it turns out to be *incredibly* complicated to get
-// it to work right, because of the interdependency. While structurally
-// slightly messier, it is algorithmically much simpler and faster to do what we
-// do here, and track them both at once in the same class.
-// Note: The default iterators for this class iterate over values
-class CongruenceClass {
-public:
- using MemberType = Value;
- using MemberSet = SmallPtrSet<MemberType *, 4>;
- using MemoryMemberType = MemoryPhi;
- using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
-
- explicit CongruenceClass(unsigned ID) : ID(ID) {}
- CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
- : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
-
- unsigned getID() const { return ID; }
-
- // True if this class has no members left. This is mainly used for assertion
- // purposes, and for skipping empty classes.
- bool isDead() const {
- // If it's both dead from a value perspective, and dead from a memory
- // perspective, it's really dead.
- return empty() && memory_empty();
- }
-
- // Leader functions
- Value *getLeader() const { return RepLeader; }
- void setLeader(Value *Leader) { RepLeader = Leader; }
- const std::pair<Value *, unsigned int> &getNextLeader() const {
- return NextLeader;
- }
- void resetNextLeader() { NextLeader = {nullptr, ~0}; }
- void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
- if (LeaderPair.second < NextLeader.second)
- NextLeader = LeaderPair;
- }
-
- Value *getStoredValue() const { return RepStoredValue; }
- void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
- const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
- void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
-
- // Forward propagation info
- const Expression *getDefiningExpr() const { return DefiningExpr; }
-
- // Value member set
- bool empty() const { return Members.empty(); }
- unsigned size() const { return Members.size(); }
- MemberSet::const_iterator begin() const { return Members.begin(); }
- MemberSet::const_iterator end() const { return Members.end(); }
- void insert(MemberType *M) { Members.insert(M); }
- void erase(MemberType *M) { Members.erase(M); }
- void swap(MemberSet &Other) { Members.swap(Other); }
-
- // Memory member set
- bool memory_empty() const { return MemoryMembers.empty(); }
- unsigned memory_size() const { return MemoryMembers.size(); }
- MemoryMemberSet::const_iterator memory_begin() const {
- return MemoryMembers.begin();
- }
- MemoryMemberSet::const_iterator memory_end() const {
- return MemoryMembers.end();
- }
- iterator_range<MemoryMemberSet::const_iterator> memory() const {
- return make_range(memory_begin(), memory_end());
- }
-
- void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
- void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
-
- // Store count
- unsigned getStoreCount() const { return StoreCount; }
- void incStoreCount() { ++StoreCount; }
- void decStoreCount() {
- assert(StoreCount != 0 && "Store count went negative");
- --StoreCount;
- }
-
- // True if this class has no memory members.
- bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
-
- // Return true if two congruence classes are equivalent to each other. This
- // means that every field but the ID number and the dead field are equivalent.
- bool isEquivalentTo(const CongruenceClass *Other) const {
- if (!Other)
- return false;
- if (this == Other)
- return true;
-
- if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
- std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
- Other->RepMemoryAccess))
- return false;
- if (DefiningExpr != Other->DefiningExpr)
- if (!DefiningExpr || !Other->DefiningExpr ||
- *DefiningExpr != *Other->DefiningExpr)
- return false;
-
- if (Members.size() != Other->Members.size())
- return false;
-
- return all_of(Members,
- [&](const Value *V) { return Other->Members.count(V); });
- }
-
-private:
- unsigned ID;
-
- // Representative leader.
- Value *RepLeader = nullptr;
-
- // The most dominating leader after our current leader, because the member set
- // is not sorted and is expensive to keep sorted all the time.
- std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
- // If this is represented by a store, the value of the store.
- Value *RepStoredValue = nullptr;
-
- // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
- // access.
- const MemoryAccess *RepMemoryAccess = nullptr;
-
- // Defining Expression.
- const Expression *DefiningExpr = nullptr;
-
- // Actual members of this class.
- MemberSet Members;
-
- // This is the set of MemoryPhis that exist in the class. MemoryDefs and
- // MemoryUses have real instructions representing them, so we only need to
- // track MemoryPhis here.
- MemoryMemberSet MemoryMembers;
-
- // Number of stores in this congruence class.
- // This is used so we can detect store equivalence changes properly.
- int StoreCount = 0;
-};
-
-} // end anonymous namespace
-
-namespace llvm {
-
-struct ExactEqualsExpression {
- const Expression &E;
-
- explicit ExactEqualsExpression(const Expression &E) : E(E) {}
-
- hash_code getComputedHash() const { return E.getComputedHash(); }
-
- bool operator==(const Expression &Other) const {
- return E.exactlyEquals(Other);
- }
-};
-
-template <> struct DenseMapInfo<const Expression *> {
- static const Expression *getEmptyKey() {
- auto Val = static_cast<uintptr_t>(-1);
- Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
- return reinterpret_cast<const Expression *>(Val);
- }
-
- static const Expression *getTombstoneKey() {
- auto Val = static_cast<uintptr_t>(~1U);
- Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
- return reinterpret_cast<const Expression *>(Val);
- }
-
- static unsigned getHashValue(const Expression *E) {
- return E->getComputedHash();
- }
-
- static unsigned getHashValue(const ExactEqualsExpression &E) {
- return E.getComputedHash();
- }
-
- static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
- if (RHS == getTombstoneKey() || RHS == getEmptyKey())
- return false;
- return LHS == *RHS;
- }
-
- static bool isEqual(const Expression *LHS, const Expression *RHS) {
- if (LHS == RHS)
- return true;
- if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
- LHS == getEmptyKey() || RHS == getEmptyKey())
- return false;
- // Compare hashes before equality. This is *not* what the hashtable does,
- // since it is computing it modulo the number of buckets, whereas we are
- // using the full hash keyspace. Since the hashes are precomputed, this
- // check is *much* faster than equality.
- if (LHS->getComputedHash() != RHS->getComputedHash())
- return false;
- return *LHS == *RHS;
- }
-};
-
-} // end namespace llvm
-
-namespace {
-
-class NewGVN {
- Function &F;
- DominatorTree *DT = nullptr;
- const TargetLibraryInfo *TLI = nullptr;
- AliasAnalysis *AA = nullptr;
- MemorySSA *MSSA = nullptr;
- MemorySSAWalker *MSSAWalker = nullptr;
- AssumptionCache *AC = nullptr;
- const DataLayout &DL;
- std::unique_ptr<PredicateInfo> PredInfo;
-
- // These are the only two things the create* functions should have
- // side-effects on due to allocating memory.
- mutable BumpPtrAllocator ExpressionAllocator;
- mutable ArrayRecycler<Value *> ArgRecycler;
- mutable TarjanSCC SCCFinder;
- const SimplifyQuery SQ;
-
- // Number of function arguments, used by ranking
- unsigned int NumFuncArgs = 0;
-
- // RPOOrdering of basic blocks
- DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
-
- // Congruence class info.
-
- // This class is called INITIAL in the paper. It is the class everything
- // starts out in, and represents any value. Being an optimistic analysis,
- // anything in the TOP class has the value TOP, which is indeterminate and
- // equivalent to everything.
- CongruenceClass *TOPClass = nullptr;
- std::vector<CongruenceClass *> CongruenceClasses;
- unsigned NextCongruenceNum = 0;
-
- // Value Mappings.
- DenseMap<Value *, CongruenceClass *> ValueToClass;
- DenseMap<Value *, const Expression *> ValueToExpression;
-
- // Value PHI handling, used to make equivalence between phi(op, op) and
- // op(phi, phi).
- // These mappings just store various data that would normally be part of the
- // IR.
- SmallPtrSet<const Instruction *, 8> PHINodeUses;
-
- DenseMap<const Value *, bool> OpSafeForPHIOfOps;
-
- // Map a temporary instruction we created to a parent block.
- DenseMap<const Value *, BasicBlock *> TempToBlock;
-
- // Map between the already in-program instructions and the temporary phis we
- // created that they are known equivalent to.
- DenseMap<const Value *, PHINode *> RealToTemp;
-
- // In order to know when we should re-process instructions that have
- // phi-of-ops, we track the set of expressions that they needed as
- // leaders. When we discover new leaders for those expressions, we process the
- // associated phi-of-op instructions again in case they have changed. The
- // other way they may change is if they had leaders, and those leaders
- // disappear. However, at the point they have leaders, there are uses of the
- // relevant operands in the created phi node, and so they will get reprocessed
- // through the normal user marking we perform.
- mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
- DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
- ExpressionToPhiOfOps;
-
- // Map from temporary operation to MemoryAccess.
- DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
-
- // Set of all temporary instructions we created.
- // Note: This will include instructions that were just created during value
- // numbering. The way to test if something is using them is to check
- // RealToTemp.
- DenseSet<Instruction *> AllTempInstructions;
-
- // This is the set of instructions to revisit on a reachability change. At
- // the end of the main iteration loop it will contain at least all the phi of
- // ops instructions that will be changed to phis, as well as regular phis.
- // During the iteration loop, it may contain other things, such as phi of ops
- // instructions that used edge reachability to reach a result, and so need to
- // be revisited when the edge changes, independent of whether the phi they
- // depended on changes.
- DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
-
- // Mapping from predicate info we used to the instructions we used it with.
- // In order to correctly ensure propagation, we must keep track of what
- // comparisons we used, so that when the values of the comparisons change, we
- // propagate the information to the places we used the comparison.
- mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
- PredicateToUsers;
-
- // The same reasoning as PredicateToUsers applies: when we skip MemoryAccesses
- // for stores, we can no longer rely solely on the def-use chains of MemorySSA.
- mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
- MemoryToUsers;
-
- // A table storing which memorydefs/phis represent a memory state provably
- // equivalent to another memory state.
- // We could use the congruence class machinery, but the MemoryAccess's are
- // abstract memory states, so they can only ever be equivalent to each other,
- // and not to constants, etc.
- DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
-
- // We could, if we wanted, build MemoryPhiExpressions and
- // MemoryVariableExpressions, etc, and value number them the same way we value
- // number phi expressions. For the moment, this seems like overkill. They
- // can only exist in one of three states: they can be TOP (equal to
- // everything), Equivalent to something else, or unique. Because we do not
- // create expressions for them, we need to simulate leader change not just
- // when they change class, but when they change state. Note: We can do the
- // same thing for phis, and avoid having phi expressions if we wanted. We
- // should eventually unify in one direction or the other, so this is a little
- // bit of an experiment to see which turns out easier to maintain.
- enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
- DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
-
- enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
- mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
-
- // Expression to class mapping.
- using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
- ExpressionClassMap ExpressionToClass;
-
- // We have a single expression that represents currently DeadExpressions.
- // For dead expressions we can prove will stay dead, we mark them with
- // DFS number zero. However, it's possible in the case of phi nodes
- // for us to assume/prove all arguments are dead during fixpointing.
- // We use DeadExpression for that case.
- DeadExpression *SingletonDeadExpression = nullptr;
-
- // Which values have changed as a result of leader changes.
- SmallPtrSet<Value *, 8> LeaderChanges;
-
- // Reachability info.
- using BlockEdge = BasicBlockEdge;
- DenseSet<BlockEdge> ReachableEdges;
- SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
-
- // This is a bitvector because, on larger functions, we may have
- // thousands of touched instructions at once (entire blocks,
- // instructions with hundreds of uses, etc). Even with optimization
- // for when we mark whole blocks as touched, when this was a
- // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
- // the time in GVN just managing this list. The bitvector, on the
- // other hand, efficiently supports test/set/clear of both
- // individual bits and ranges, as well as "find next element". This
- // enables us to use it as a worklist with essentially 0 cost.
- BitVector TouchedInstructions;
-
- DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
-
-#ifndef NDEBUG
- // Debugging for how many times each block and instruction got processed.
- DenseMap<const Value *, unsigned> ProcessedCount;
-#endif
-
- // DFS info.
- // This contains a mapping from Instructions to DFS numbers.
- // The numbering starts at 1. An instruction with DFS number zero
- // means that the instruction is dead.
- DenseMap<const Value *, unsigned> InstrDFS;
-
- // This contains the mapping DFS numbers to instructions.
- SmallVector<Value *, 32> DFSToInstr;
-
- // Deletion info.
- SmallPtrSet<Instruction *, 8> InstructionsToErase;
-
-public:
- NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
- TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
- const DataLayout &DL)
- : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
- PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
+//===- NewGVN.cpp - Global Value Numbering Pass ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements LLVM's new Global Value Numbering pass.
+/// GVN partitions values computed by a function into congruence classes.
+/// Values ending up in the same congruence class are guaranteed to be the same
+/// for every execution of the program. In that respect, congruency is a
+/// compile-time approximation of equivalence of values at runtime.
+/// The algorithm implemented here uses a sparse formulation and it's based
+/// on the ideas described in the paper:
+/// "A Sparse Algorithm for Predicated Global Value Numbering" from
+/// Karthik Gargi.
+///
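+/// As a small illustration (hypothetical C-like source, not taken from this
+/// pass), given
+///
+///   u = a + b;
+///   v = a + b;
+///   w = u * 2;
+///   x = v * 2;
+///
+/// u and v land in one congruence class and, once that is known, w and x land
+/// in another, so a dominated x can be replaced by w (and v by u).
+///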
+/// A brief overview of the algorithm: The algorithm is essentially the same as
+/// the standard RPO value numbering algorithm (a good reference is the paper
+/// "SCC based value numbering" by L. Taylor Simpson) with one major difference:
+/// The RPO algorithm proceeds, on every iteration, to process every reachable
+/// block and every instruction in that block. This is because the standard RPO
+/// algorithm does not track what things have the same value number, it only
+/// tracks what the value number of a given operation is (the mapping is
+/// operation -> value number). Thus, when a value number of an operation
+/// changes, it must reprocess everything to ensure all uses of a value number
+/// get updated properly. In contrast, the sparse algorithm we use *also*
+/// tracks what operations have a given value number (IE it also tracks the
+/// reverse mapping from value number -> operations with that value number), so
+/// that it only needs to reprocess the instructions that are affected when
+/// something's value number changes. The vast majority of complexity and code
+/// in this file is devoted to tracking what value numbers could change for what
+/// instructions when various things happen. The rest of the algorithm is
+/// devoted to performing symbolic evaluation, forward propagation, and
+/// simplification of operations based on the value numbers deduced so far.
+///
+/// In order to make the GVN mostly-complete, we use a technique derived from
+/// "Detection of Redundant Expressions: A Complete and Polynomial-time
+/// Algorithm in SSA" by R.R. Pai. The source of incompleteness in most SSA
+/// based GVN algorithms is related to their inability to detect equivalence
+/// between phi of ops (IE phi(a+b, c+d)) and op of phis (phi(a,c) + phi(b, d)).
+/// We resolve this issue by generating the equivalent "phi of ops" form for
+/// each op of phis we see, in a way that only takes polynomial time to resolve.
+///
+/// We also do not perform elimination by using any published algorithm. All
+/// published algorithms are O(Instructions). Instead, we use a technique that
+/// is O(number of operations with the same value number), enabling us to skip
+/// trying to eliminate things that have unique value numbers.
+//
+//===----------------------------------------------------------------------===//
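+//
+// As a concrete illustration of the op-of-phis / phi-of-ops equivalence
+// described above, consider a minimal source-level sketch (the variable names
+// here are hypothetical and used only for illustration):
+//
+//   int t;
+//   if (c) t = a + b; else t = x + y; // t is phi(a+b, x+y)       ("phi of ops")
+//   int p = c ? a : x;                // p is phi(a, x)
+//   int q = c ? b : y;                // q is phi(b, y)
+//   int u = p + q;                    // u is phi(a,x) + phi(b,y) ("op of phis")
+//
+// Generating the phi-of-ops form for u lets the pass prove t == u and remove
+// the redundant computation.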
+
+#include "llvm/Transforms/Scalar/NewGVN.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/ArrayRecycler.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::GVNExpression;
+using namespace llvm::VNCoercion;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "newgvn"
+
+STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
+STATISTIC(NumGVNBlocksDeleted, "Number of blocks deleted");
+STATISTIC(NumGVNOpsSimplified, "Number of Expressions simplified");
+STATISTIC(NumGVNPhisAllSame, "Number of PHIs whose arguments are all the same");
+STATISTIC(NumGVNMaxIterations,
+ "Maximum Number of iterations it took to converge GVN");
+STATISTIC(NumGVNLeaderChanges, "Number of leader changes");
+STATISTIC(NumGVNSortedLeaderChanges, "Number of sorted leader changes");
+STATISTIC(NumGVNAvoidedSortedLeaderChanges,
+ "Number of avoided sorted leader changes");
+STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+STATISTIC(NumGVNPHIOfOpsCreated, "Number of PHI of ops created");
+STATISTIC(NumGVNPHIOfOpsEliminations,
+ "Number of things eliminated using PHI of ops");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+ "Controls which instructions are value numbered");
+DEBUG_COUNTER(PHIOfOpsCounter, "newgvn-phi",
+ "Controls which instructions we create phi of ops for");
+// Currently, store defining access refinement is too slow due to basicaa being
+// egregiously slow. This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+ cl::init(false), cl::Hidden);
+
+/// Currently, the generation of "phi of ops" can result in correctness issues.
+static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
+ cl::Hidden);
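+
+// Both flags above are ordinary cl::opt switches, so they can be toggled from
+// the opt driver when this pass is run explicitly. A minimal sketch of such an
+// invocation (the file names are hypothetical):
+//
+//   opt -passes=newgvn -enable-phi-of-ops=false input.ll -S -o output.ll
+//   opt -passes=newgvn -enable-store-refinement input.ll -S -o output.ll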
+
+//===----------------------------------------------------------------------===//
+// GVN Pass
+//===----------------------------------------------------------------------===//
+
+// Anchor methods.
+namespace llvm {
+namespace GVNExpression {
+
+Expression::~Expression() = default;
+BasicExpression::~BasicExpression() = default;
+CallExpression::~CallExpression() = default;
+LoadExpression::~LoadExpression() = default;
+StoreExpression::~StoreExpression() = default;
+AggregateValueExpression::~AggregateValueExpression() = default;
+PHIExpression::~PHIExpression() = default;
+
+} // end namespace GVNExpression
+} // end namespace llvm
+
+namespace {
+
+// Tarjan's SCC finding algorithm with Nuutila's improvements
+// SCCIterator is actually fairly complex for the simple thing we want.
+// It also wants to hand us SCC's that are unrelated to the phi node we ask
+// about, and have us process them there or risk redoing work.
+// Graph traits over a filter iterator also doesn't work that well here.
+// This SCC finder is specialized to walk use-def chains, and only follows
+// instructions, not generic values (arguments, etc).
+struct TarjanSCC {
+ TarjanSCC() : Components(1) {}
+
+ void Start(const Instruction *Start) {
+ if (Root.lookup(Start) == 0)
+ FindSCC(Start);
+ }
+
+ const SmallPtrSetImpl<const Value *> &getComponentFor(const Value *V) const {
+ unsigned ComponentID = ValueToComponent.lookup(V);
+
+ assert(ComponentID > 0 &&
+ "Asking for a component for a value we never processed");
+ return Components[ComponentID];
+ }
+
+private:
+ void FindSCC(const Instruction *I) {
+ Root[I] = ++DFSNum;
+ // Store the DFS Number we had before it possibly gets incremented.
+ unsigned int OurDFS = DFSNum;
+ for (auto &Op : I->operands()) {
+ if (auto *InstOp = dyn_cast<Instruction>(Op)) {
+ if (Root.lookup(Op) == 0)
+ FindSCC(InstOp);
+ if (!InComponent.count(Op))
+ Root[I] = std::min(Root.lookup(I), Root.lookup(Op));
+ }
+ }
+ // See if we really were the root of a component, by seeing if we still have
+ // our DFSNumber. If we do, we are the root of the component, and we have
+ // completed a component. If we do not, we are not the root of a component,
+ // and belong on the component stack.
+ if (Root.lookup(I) == OurDFS) {
+ unsigned ComponentID = Components.size();
+ Components.resize(Components.size() + 1);
+ auto &Component = Components.back();
+ Component.insert(I);
+ LLVM_DEBUG(dbgs() << "Component root is " << *I << "\n");
+ InComponent.insert(I);
+ ValueToComponent[I] = ComponentID;
+ // Pop a component off the stack and label it.
+ while (!Stack.empty() && Root.lookup(Stack.back()) >= OurDFS) {
+ auto *Member = Stack.back();
+ LLVM_DEBUG(dbgs() << "Component member is " << *Member << "\n");
+ Component.insert(Member);
+ InComponent.insert(Member);
+ ValueToComponent[Member] = ComponentID;
+ Stack.pop_back();
+ }
+ } else {
+ // Part of a component, push to stack
+ Stack.push_back(I);
+ }
+ }
+
+ unsigned int DFSNum = 1;
+ SmallPtrSet<const Value *, 8> InComponent;
+ DenseMap<const Value *, unsigned int> Root;
+ SmallVector<const Value *, 8> Stack;
+
+ // Store the components as a vector of ptr sets, because we need the topo
+ // order of SCC's, but not individual member order.
+ SmallVector<SmallPtrSet<const Value *, 8>, 8> Components;
+
+ DenseMap<const Value *, unsigned> ValueToComponent;
+};
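+
+// A minimal usage sketch for the SCC finder above (here 'I' stands for some
+// hypothetical instruction reached through phi use-def chains):
+//
+//   TarjanSCC SCCFinder;
+//   SCCFinder.Start(I);                               // compute components
+//   const auto &Comp = SCCFinder.getComponentFor(I);  // I's component
+//   bool PartOfCycle = Comp.size() > 1;               // non-trivial SCC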
+
+// Congruence classes represent the set of expressions/instructions
+// that are all the same *during some scope in the function*.
+// That is, because of the way we perform equality propagation, and
+// because of memory value numbering, it is not correct to assume
+// you can willy-nilly replace any member with any other at any
+// point in the function.
+//
+// For any Value in the Member set, it is valid to replace any dominated member
+// with that Value.
+//
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with the leader
+// during symbolization). To simplify symbolization, we keep the leader as a
+// constant if the class can be proved to be a constant value. Otherwise, the
+// leader is the member of the value set with the smallest DFS number. Each
+// congruence class also has a defining expression, though the expression may be
+// null. If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something. Unfortunately, this does not work
+// easily. The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression. This
+// leads to wasted propagation, and in some cases, missed optimization. For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class. This in turn will move at least
+// one of the memorydefs to a new memory congruence class, which in turn affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number). So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency. While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values.
+class CongruenceClass {
+public:
+ using MemberType = Value;
+ using MemberSet = SmallPtrSet<MemberType *, 4>;
+ using MemoryMemberType = MemoryPhi;
+ using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+ explicit CongruenceClass(unsigned ID) : ID(ID) {}
+ CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+ : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+
+ unsigned getID() const { return ID; }
+
+ // True if this class has no members left. This is mainly used for assertion
+ // purposes, and for skipping empty classes.
+ bool isDead() const {
+ // If it's both dead from a value perspective, and dead from a memory
+ // perspective, it's really dead.
+ return empty() && memory_empty();
+ }
+
+ // Leader functions
+ Value *getLeader() const { return RepLeader; }
+ void setLeader(Value *Leader) { RepLeader = Leader; }
+ const std::pair<Value *, unsigned int> &getNextLeader() const {
+ return NextLeader;
+ }
+ void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+ void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+ if (LeaderPair.second < NextLeader.second)
+ NextLeader = LeaderPair;
+ }
+
+ Value *getStoredValue() const { return RepStoredValue; }
+ void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+ const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+ void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+ // Forward propagation info
+ const Expression *getDefiningExpr() const { return DefiningExpr; }
+
+ // Value member set
+ bool empty() const { return Members.empty(); }
+ unsigned size() const { return Members.size(); }
+ MemberSet::const_iterator begin() const { return Members.begin(); }
+ MemberSet::const_iterator end() const { return Members.end(); }
+ void insert(MemberType *M) { Members.insert(M); }
+ void erase(MemberType *M) { Members.erase(M); }
+ void swap(MemberSet &Other) { Members.swap(Other); }
+
+ // Memory member set
+ bool memory_empty() const { return MemoryMembers.empty(); }
+ unsigned memory_size() const { return MemoryMembers.size(); }
+ MemoryMemberSet::const_iterator memory_begin() const {
+ return MemoryMembers.begin();
+ }
+ MemoryMemberSet::const_iterator memory_end() const {
+ return MemoryMembers.end();
+ }
+ iterator_range<MemoryMemberSet::const_iterator> memory() const {
+ return make_range(memory_begin(), memory_end());
+ }
+
+ void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+ void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+ // Store count
+ unsigned getStoreCount() const { return StoreCount; }
+ void incStoreCount() { ++StoreCount; }
+ void decStoreCount() {
+ assert(StoreCount != 0 && "Store count went negative");
+ --StoreCount;
+ }
+
+ // True if this class has no memory members.
+ bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); }
+
+ // Return true if two congruence classes are equivalent to each other. This
+ // means that every field but the ID number and the dead field is equivalent.
+ bool isEquivalentTo(const CongruenceClass *Other) const {
+ if (!Other)
+ return false;
+ if (this == Other)
+ return true;
+
+ if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+ std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+ Other->RepMemoryAccess))
+ return false;
+ if (DefiningExpr != Other->DefiningExpr)
+ if (!DefiningExpr || !Other->DefiningExpr ||
+ *DefiningExpr != *Other->DefiningExpr)
+ return false;
+
+ if (Members.size() != Other->Members.size())
+ return false;
+
+ return all_of(Members,
+ [&](const Value *V) { return Other->Members.count(V); });
+ }
+
+private:
+ unsigned ID;
+
+ // Representative leader.
+ Value *RepLeader = nullptr;
+
+ // The most dominating leader after our current leader, because the member set
+ // is not sorted and is expensive to keep sorted all the time.
+ std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+
+ // If this is represented by a store, the value of the store.
+ Value *RepStoredValue = nullptr;
+
+ // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+ // access.
+ const MemoryAccess *RepMemoryAccess = nullptr;
+
+ // Defining Expression.
+ const Expression *DefiningExpr = nullptr;
+
+ // Actual members of this class.
+ MemberSet Members;
+
+ // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+ // MemoryUses have real instructions representing them, so we only need to
+ // track MemoryPhis here.
+ MemoryMemberSet MemoryMembers;
+
+ // Number of stores in this congruence class.
+ // This is used so we can detect store equivalence changes properly.
+ int StoreCount = 0;
+};
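+
+// A minimal sketch of how the pass drives this class (Leader, E, I and DFSNum
+// are hypothetical stand-ins for a leader value, a defining expression, a
+// member instruction and its DFS number):
+//
+//   auto *CC = new CongruenceClass(/*ID=*/1, Leader, E);
+//   CC->insert(I);                          // I now shares Leader's value
+//   CC->addPossibleNextLeader({I, DFSNum}); // fallback if Leader disappears
+//   if (CC->isDead())
+//     ; // empty classes are skipped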
+
+} // end anonymous namespace
+
+namespace llvm {
+
+struct ExactEqualsExpression {
+ const Expression &E;
+
+ explicit ExactEqualsExpression(const Expression &E) : E(E) {}
+
+ hash_code getComputedHash() const { return E.getComputedHash(); }
+
+ bool operator==(const Expression &Other) const {
+ return E.exactlyEquals(Other);
+ }
+};
+
+template <> struct DenseMapInfo<const Expression *> {
+ static const Expression *getEmptyKey() {
+ auto Val = static_cast<uintptr_t>(-1);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static const Expression *getTombstoneKey() {
+ auto Val = static_cast<uintptr_t>(~1U);
+ Val <<= PointerLikeTypeTraits<const Expression *>::NumLowBitsAvailable;
+ return reinterpret_cast<const Expression *>(Val);
+ }
+
+ static unsigned getHashValue(const Expression *E) {
+ return E->getComputedHash();
+ }
+
+ static unsigned getHashValue(const ExactEqualsExpression &E) {
+ return E.getComputedHash();
+ }
+
+ static bool isEqual(const ExactEqualsExpression &LHS, const Expression *RHS) {
+ if (RHS == getTombstoneKey() || RHS == getEmptyKey())
+ return false;
+ return LHS == *RHS;
+ }
+
+ static bool isEqual(const Expression *LHS, const Expression *RHS) {
+ if (LHS == RHS)
+ return true;
+ if (LHS == getTombstoneKey() || RHS == getTombstoneKey() ||
+ LHS == getEmptyKey() || RHS == getEmptyKey())
+ return false;
+ // Compare hashes before equality. This is *not* what the hashtable does,
+ // since it is computing it modulo the number of buckets, whereas we are
+ // using the full hash keyspace. Since the hashes are precomputed, this
+ // check is *much* faster than equality.
+ if (LHS->getComputedHash() != RHS->getComputedHash())
+ return false;
+ return *LHS == *RHS;
+ }
+};
+
+} // end namespace llvm
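+
+// With the DenseMapInfo specialization above, tables keyed on
+// 'const Expression *' also support heterogeneous lookup via
+// ExactEqualsExpression, which demands exact rather than semantic equality.
+// A sketch (the names 'Table' and 'E' are hypothetical):
+//
+//   DenseMap<const Expression *, CongruenceClass *> Table;
+//   auto It = Table.find_as(ExactEqualsExpression(*E));
+//   if (It != Table.end()) { /* exact structural match found */ }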
+
+namespace {
+
+class NewGVN {
+ Function &F;
+ DominatorTree *DT = nullptr;
+ const TargetLibraryInfo *TLI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ MemorySSA *MSSA = nullptr;
+ MemorySSAWalker *MSSAWalker = nullptr;
+ AssumptionCache *AC = nullptr;
+ const DataLayout &DL;
+ std::unique_ptr<PredicateInfo> PredInfo;
+
+ // These are the only two things the create* functions should have
+ // side-effects on due to allocating memory.
+ mutable BumpPtrAllocator ExpressionAllocator;
+ mutable ArrayRecycler<Value *> ArgRecycler;
+ mutable TarjanSCC SCCFinder;
+ const SimplifyQuery SQ;
+
+ // Number of function arguments, used by ranking
+ unsigned int NumFuncArgs = 0;
+
+ // RPOOrdering of basic blocks
+ DenseMap<const DomTreeNode *, unsigned> RPOOrdering;
+
+ // Congruence class info.
+
+ // This class is called INITIAL in the paper. It is the class everything
+ // starts out in, and represents any value. Being an optimistic analysis,
+ // anything in the TOP class has the value TOP, which is indeterminate and
+ // equivalent to everything.
+ CongruenceClass *TOPClass = nullptr;
+ std::vector<CongruenceClass *> CongruenceClasses;
+ unsigned NextCongruenceNum = 0;
+
+ // Value Mappings.
+ DenseMap<Value *, CongruenceClass *> ValueToClass;
+ DenseMap<Value *, const Expression *> ValueToExpression;
+
+ // Value PHI handling, used to make equivalence between phi(op, op) and
+ // op(phi, phi).
+ // These mappings just store various data that would normally be part of the
+ // IR.
+ SmallPtrSet<const Instruction *, 8> PHINodeUses;
+
+ DenseMap<const Value *, bool> OpSafeForPHIOfOps;
+
+ // Map a temporary instruction we created to a parent block.
+ DenseMap<const Value *, BasicBlock *> TempToBlock;
+
+ // Map between the already in-program instructions and the temporary phis we
+ // created that they are known equivalent to.
+ DenseMap<const Value *, PHINode *> RealToTemp;
+
+ // In order to know when we should re-process instructions that have
+ // phi-of-ops, we track the set of expressions that they needed as
+ // leaders. When we discover new leaders for those expressions, we process the
+ // associated phi-of-op instructions again in case they have changed. The
+ // other way they may change is if they had leaders, and those leaders
+ // disappear. However, at the point they have leaders, there are uses of the
+ // relevant operands in the created phi node, and so they will get reprocessed
+ // through the normal user marking we perform.
+ mutable DenseMap<const Value *, SmallPtrSet<Value *, 2>> AdditionalUsers;
+ DenseMap<const Expression *, SmallPtrSet<Instruction *, 2>>
+ ExpressionToPhiOfOps;
+
+ // Map from temporary operation to MemoryAccess.
+ DenseMap<const Instruction *, MemoryUseOrDef *> TempToMemory;
+
+ // Set of all temporary instructions we created.
+ // Note: This will include instructions that were just created during value
+ // numbering. The way to test if something is using them is to check
+ // RealToTemp.
+ DenseSet<Instruction *> AllTempInstructions;
+
+ // This is the set of instructions to revisit on a reachability change. At
+ // the end of the main iteration loop it will contain at least all the phi of
+ // ops instructions that will be changed to phis, as well as regular phis.
+ // During the iteration loop, it may contain other things, such as phi of ops
+ // instructions that used edge reachability to reach a result, and so need to
+ // be revisited when the edge changes, independent of whether the phi they
+ // depended on changes.
+ DenseMap<BasicBlock *, SparseBitVector<>> RevisitOnReachabilityChange;
+
+ // Mapping from predicate info we used to the instructions we used it with.
+ // In order to correctly ensure propagation, we must keep track of what
+ // comparisons we used, so that when the values of the comparisons change, we
+ // propagate the information to the places we used the comparison.
+ mutable DenseMap<const Value *, SmallPtrSet<Instruction *, 2>>
+ PredicateToUsers;
+
+ // For the same reasons as PredicateToUsers: when we skip MemoryAccesses for
+ // stores, we can no longer rely solely on the def-use chains of MemorySSA.
+ mutable DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>>
+ MemoryToUsers;
+
+ // A table storing which memorydefs/phis represent a memory state provably
+ // equivalent to another memory state.
+ // We could use the congruence class machinery, but the MemoryAccess's are
+ // abstract memory states, so they can only ever be equivalent to each other,
+ // and not to constants, etc.
+ DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
+
+ // We could, if we wanted, build MemoryPhiExpressions and
+ // MemoryVariableExpressions, etc, and value number them the same way we value
+ // number phi expressions. For the moment, this seems like overkill. They
+ // can only exist in one of three states: they can be TOP (equal to
+ // everything), Equivalent to something else, or unique. Because we do not
+ // create expressions for them, we need to simulate leader change not just
+ // when they change class, but when they change state. Note: We can do the
+ // same thing for phis, and avoid having phi expressions if we wanted. We
+ // should eventually unify in one direction or the other, so this is a little
+ // bit of an experiment in which approach turns out easier to maintain.
+ enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+ DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
+ enum InstCycleState { ICS_Unknown, ICS_CycleFree, ICS_Cycle };
+ mutable DenseMap<const Instruction *, InstCycleState> InstCycleState;
+
+ // Expression to class mapping.
+ using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
+ ExpressionClassMap ExpressionToClass;
+
+ // We have a single expression that currently represents dead expressions.
+ // For dead expressions we can prove will stay dead, we mark them with
+ // DFS number zero. However, it's possible in the case of phi nodes
+ // for us to assume/prove all arguments are dead during fixpointing.
+ // We use DeadExpression for that case.
+ DeadExpression *SingletonDeadExpression = nullptr;
+
+ // Which values have changed as a result of leader changes.
+ SmallPtrSet<Value *, 8> LeaderChanges;
+
+ // Reachability info.
+ using BlockEdge = BasicBlockEdge;
+ DenseSet<BlockEdge> ReachableEdges;
+ SmallPtrSet<const BasicBlock *, 8> ReachableBlocks;
+
+ // This is a bitvector because, on larger functions, we may have
+ // thousands of touched instructions at once (entire blocks,
+ // instructions with hundreds of uses, etc). Even with optimization
+ // for when we mark whole blocks as touched, when this was a
+ // SmallPtrSet or DenseSet, for some functions, we spent >20% of all
+ // the time in GVN just managing this list. The bitvector, on the
+ // other hand, efficiently supports test/set/clear of both
+ // individual bits and ranges, as well as "find next element". This
+ // enables us to use it as a worklist with essentially 0 cost.
+ BitVector TouchedInstructions;
+
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+#ifndef NDEBUG
+ // Debugging for how many times each block and instruction got processed.
+ DenseMap<const Value *, unsigned> ProcessedCount;
+#endif
+
+ // DFS info.
+ // This contains a mapping from Instructions to DFS numbers.
+ // The numbering starts at 1. An instruction with DFS number zero
+ // means that the instruction is dead.
+ DenseMap<const Value *, unsigned> InstrDFS;
+
+ // This contains the mapping from DFS numbers to instructions.
+ SmallVector<Value *, 32> DFSToInstr;
+
+ // Deletion info.
+ SmallPtrSet<Instruction *, 8> InstructionsToErase;
+
+public:
+ NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+ TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+ const DataLayout &DL)
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
+ PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false,
/*CanUseUndef=*/false) {}
-
- bool runGVN();
-
-private:
- // Expression handling.
- const Expression *createExpression(Instruction *) const;
- const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
- Instruction *) const;
-
- // Our canonical form for phi arguments is a pair of incoming value, incoming
- // basic block.
- using ValPair = std::pair<Value *, BasicBlock *>;
-
- PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
- BasicBlock *, bool &HasBackEdge,
- bool &OriginalOpsConstant) const;
- const DeadExpression *createDeadExpression() const;
- const VariableExpression *createVariableExpression(Value *) const;
- const ConstantExpression *createConstantExpression(Constant *) const;
- const Expression *createVariableOrConstant(Value *V) const;
- const UnknownExpression *createUnknownExpression(Instruction *) const;
- const StoreExpression *createStoreExpression(StoreInst *,
- const MemoryAccess *) const;
- LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
- const MemoryAccess *) const;
- const CallExpression *createCallExpression(CallInst *,
- const MemoryAccess *) const;
- const AggregateValueExpression *
- createAggregateValueExpression(Instruction *) const;
- bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
-
- // Congruence class handling.
- CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
- auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
- CongruenceClasses.emplace_back(result);
- return result;
- }
-
- CongruenceClass *createMemoryClass(MemoryAccess *MA) {
- auto *CC = createCongruenceClass(nullptr, nullptr);
- CC->setMemoryLeader(MA);
- return CC;
- }
-
- CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
- auto *CC = getMemoryClass(MA);
- if (CC->getMemoryLeader() != MA)
- CC = createMemoryClass(MA);
- return CC;
- }
-
- CongruenceClass *createSingletonCongruenceClass(Value *Member) {
- CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
- CClass->insert(Member);
- ValueToClass[Member] = CClass;
- return CClass;
- }
-
- void initializeCongruenceClasses(Function &F);
- const Expression *makePossiblePHIOfOps(Instruction *,
- SmallPtrSetImpl<Value *> &);
- Value *findLeaderForInst(Instruction *ValueOp,
- SmallPtrSetImpl<Value *> &Visited,
- MemoryAccess *MemAccess, Instruction *OrigInst,
- BasicBlock *PredBB);
- bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited,
- SmallVectorImpl<Instruction *> &Worklist);
- bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &);
- void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
- void removePhiOfOps(Instruction *I, PHINode *PHITemp);
-
- // Value number an Instruction or MemoryPhi.
- void valueNumberMemoryPhi(MemoryPhi *);
- void valueNumberInstruction(Instruction *);
-
- // Symbolic evaluation.
- const Expression *checkSimplificationResults(Expression *, Instruction *,
- Value *) const;
- const Expression *performSymbolicEvaluation(Value *,
- SmallPtrSetImpl<Value *> &) const;
- const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
- Instruction *,
- MemoryAccess *) const;
- const Expression *performSymbolicLoadEvaluation(Instruction *) const;
- const Expression *performSymbolicStoreEvaluation(Instruction *) const;
- const Expression *performSymbolicCallEvaluation(Instruction *) const;
- void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
- const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
- Instruction *I,
- BasicBlock *PHIBlock) const;
- const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
- const Expression *performSymbolicCmpEvaluation(Instruction *) const;
- const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
-
- // Congruence finding.
- bool someEquivalentDominates(const Instruction *, const Instruction *) const;
- Value *lookupOperandLeader(Value *) const;
- CongruenceClass *getClassForExpression(const Expression *E) const;
- void performCongruenceFinding(Instruction *, const Expression *);
- void moveValueToNewCongruenceClass(Instruction *, const Expression *,
- CongruenceClass *, CongruenceClass *);
- void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
- CongruenceClass *, CongruenceClass *);
- Value *getNextValueLeader(CongruenceClass *) const;
- const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
- bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
- CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
- const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
- bool isMemoryAccessTOP(const MemoryAccess *) const;
-
- // Ranking
- unsigned int getRank(const Value *) const;
- bool shouldSwapOperands(const Value *, const Value *) const;
-
- // Reachability handling.
- void updateReachableEdge(BasicBlock *, BasicBlock *);
- void processOutgoingEdges(Instruction *, BasicBlock *);
- Value *findConditionEquivalence(Value *) const;
-
- // Elimination.
- struct ValueDFS;
- void convertClassToDFSOrdered(const CongruenceClass &,
- SmallVectorImpl<ValueDFS> &,
- DenseMap<const Value *, unsigned int> &,
- SmallPtrSetImpl<Instruction *> &) const;
- void convertClassToLoadsAndStores(const CongruenceClass &,
- SmallVectorImpl<ValueDFS> &) const;
-
- bool eliminateInstructions(Function &);
- void replaceInstruction(Instruction *, Value *);
- void markInstructionForDeletion(Instruction *);
- void deleteInstructionsInBlock(BasicBlock *);
- Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
- const BasicBlock *) const;
-
- // Various instruction touch utilities
- template <typename Map, typename KeyType>
- void touchAndErase(Map &, const KeyType &);
- void markUsersTouched(Value *);
- void markMemoryUsersTouched(const MemoryAccess *);
- void markMemoryDefTouched(const MemoryAccess *);
- void markPredicateUsersTouched(Instruction *);
- void markValueLeaderChangeTouched(CongruenceClass *CC);
- void markMemoryLeaderChangeTouched(CongruenceClass *CC);
- void markPhiOfOpsChanged(const Expression *E);
- void addPredicateUsers(const PredicateBase *, Instruction *) const;
- void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
- void addAdditionalUsers(Value *To, Value *User) const;
-
- // Main loop of value numbering
- void iterateTouchedInstructions();
-
- // Utilities.
- void cleanupTables();
- std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
- void updateProcessedCount(const Value *V);
- void verifyMemoryCongruency() const;
- void verifyIterationSettled(Function &F);
- void verifyStoreExpressions() const;
- bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
- const MemoryAccess *, const MemoryAccess *) const;
- BasicBlock *getBlockForValue(Value *V) const;
- void deleteExpression(const Expression *E) const;
- MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
- MemoryPhi *getMemoryAccess(const BasicBlock *) const;
- template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
-
- unsigned InstrToDFSNum(const Value *V) const {
- assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
- return InstrDFS.lookup(V);
- }
-
- unsigned InstrToDFSNum(const MemoryAccess *MA) const {
- return MemoryToDFSNum(MA);
- }
-
- Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
-
- // Given a MemoryAccess, return the relevant instruction DFS number. Note:
- // This deliberately takes a value so it can be used with Use's, which will
- // auto-convert to Value's but not to MemoryAccess's.
- unsigned MemoryToDFSNum(const Value *MA) const {
- assert(isa<MemoryAccess>(MA) &&
- "This should not be used with instructions");
- return isa<MemoryUseOrDef>(MA)
- ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
- : InstrDFS.lookup(MA);
- }
-
- bool isCycleFree(const Instruction *) const;
- bool isBackedge(BasicBlock *From, BasicBlock *To) const;
-
- // Debug counter info. When verifying, we have to reset the value numbering
- // debug counter to the same state it started in to get the same results.
- int64_t StartingVNCounter = 0;
-};
-
-} // end anonymous namespace
-
-template <typename T>
-static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
- if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
- return false;
- return LHS.MemoryExpression::equals(RHS);
-}
-
-bool LoadExpression::equals(const Expression &Other) const {
- return equalsLoadStoreHelper(*this, Other);
-}
-
-bool StoreExpression::equals(const Expression &Other) const {
- if (!equalsLoadStoreHelper(*this, Other))
- return false;
- // Make sure that store vs store includes the value operand.
- if (const auto *S = dyn_cast<StoreExpression>(&Other))
- if (getStoredValue() != S->getStoredValue())
- return false;
- return true;
-}
-
-// Determine if the edge From->To is a backedge
-bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
- return From == To ||
- RPOOrdering.lookup(DT->getNode(From)) >=
- RPOOrdering.lookup(DT->getNode(To));
-}
-
-#ifndef NDEBUG
-static std::string getBlockName(const BasicBlock *B) {
- return DOTGraphTraits<DOTFuncInfo *>::getSimpleNodeLabel(B, nullptr);
-}
-#endif
-
-// Get a MemoryAccess for an instruction, fake or real.
-MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
- auto *Result = MSSA->getMemoryAccess(I);
- return Result ? Result : TempToMemory.lookup(I);
-}
-
-// Get a MemoryPhi for a basic block. These are all real.
-MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
- return MSSA->getMemoryAccess(BB);
-}
-
-// Get the basic block from an instruction/memory value.
-BasicBlock *NewGVN::getBlockForValue(Value *V) const {
- if (auto *I = dyn_cast<Instruction>(V)) {
- auto *Parent = I->getParent();
- if (Parent)
- return Parent;
- Parent = TempToBlock.lookup(V);
- assert(Parent && "Every fake instruction should have a block");
- return Parent;
- }
-
- auto *MP = dyn_cast<MemoryPhi>(V);
- assert(MP && "Should have been an instruction or a MemoryPhi");
- return MP->getBlock();
-}
-
-// Delete a definitely dead expression, so it can be reused by the expression
-// allocator. Some of these are not in creation functions, so we have to accept
-// const versions.
-void NewGVN::deleteExpression(const Expression *E) const {
- assert(isa<BasicExpression>(E));
- auto *BE = cast<BasicExpression>(E);
- const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
- ExpressionAllocator.Deallocate(E);
-}
-
-// If V is a predicateinfo copy, get the thing it is a copy of.
-static Value *getCopyOf(const Value *V) {
- if (auto *II = dyn_cast<IntrinsicInst>(V))
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- return II->getOperand(0);
- return nullptr;
-}
-
-// Return true if V is really PN, even accounting for predicateinfo copies.
-static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
- return V == PN || getCopyOf(V) == PN;
-}
-
-static bool isCopyOfAPHI(const Value *V) {
- auto *CO = getCopyOf(V);
- return CO && isa<PHINode>(CO);
-}
-
-// Sort PHI Operands into a canonical order. What we use here is an RPO
-// order. The BlockInstRange numbers are generated in an RPO walk of the basic
-// blocks.
-void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
- llvm::sort(Ops, [&](const ValPair &P1, const ValPair &P2) {
- return BlockInstRange.lookup(P1.second).first <
- BlockInstRange.lookup(P2.second).first;
- });
-}
-
-// Return true if V is a value that will always be available (IE can
-// be placed anywhere) in the function. We don't do globals here
-// because they are often worse to put in place.
-static bool alwaysAvailable(Value *V) {
- return isa<Constant>(V) || isa<Argument>(V);
-}
-
-// Create a PHIExpression from an array of {incoming value, incoming block}
-// pairs. I is the original instruction we are creating a PHIExpression for
-// (but may not be a phi node). We require, as an invariant, that all the
-// PHIOperands in the same block are sorted the same way. sortPHIOps will sort
-// them into a canonical order.
-PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
- const Instruction *I,
- BasicBlock *PHIBlock,
- bool &HasBackedge,
- bool &OriginalOpsConstant) const {
- unsigned NumOps = PHIOperands.size();
- auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
-
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(PHIOperands.begin()->first->getType());
- E->setOpcode(Instruction::PHI);
-
- // Filter out unreachable phi operands.
- auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
- auto *BB = P.second;
- if (auto *PHIOp = dyn_cast<PHINode>(I))
- if (isCopyOfPHI(P.first, PHIOp))
- return false;
- if (!ReachableEdges.count({BB, PHIBlock}))
- return false;
- // Things in TOPClass are equivalent to everything.
- if (ValueToClass.lookup(P.first) == TOPClass)
- return false;
- OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
- HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
- return lookupOperandLeader(P.first) != I;
- });
- std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
- [&](const ValPair &P) -> Value * {
- return lookupOperandLeader(P.first);
- });
- return E;
-}
-
-// Set basic expression info (Arguments, type, opcode) for Expression
-// E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
- bool AllConstant = true;
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- E->setType(GEP->getSourceElementType());
- else
- E->setType(I->getType());
- E->setOpcode(I->getOpcode());
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
-
- // Transform the operand array into an operand leader array, and keep track of
- // whether all members are constant.
- std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
- auto Operand = lookupOperandLeader(O);
- AllConstant = AllConstant && isa<Constant>(Operand);
- return Operand;
- });
-
- return AllConstant;
-}
-
-const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
- Value *Arg1, Value *Arg2,
- Instruction *I) const {
- auto *E = new (ExpressionAllocator) BasicExpression(2);
-
- E->setType(T);
- E->setOpcode(Opcode);
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- if (Instruction::isCommutative(Opcode)) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
- // numbers. Since all commutative instructions have two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
- if (shouldSwapOperands(Arg1, Arg2))
- std::swap(Arg1, Arg2);
- }
- E->op_push_back(lookupOperandLeader(Arg1));
- E->op_push_back(lookupOperandLeader(Arg2));
-
- Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- return E;
-}
-
-// Take a Value returned by simplification of Expression E/Instruction
-// I, and see if it resulted in a simpler expression. If so, return
-// that expression.
-const Expression *NewGVN::checkSimplificationResults(Expression *E,
- Instruction *I,
- Value *V) const {
- if (!V)
- return nullptr;
- if (auto *C = dyn_cast<Constant>(V)) {
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " constant " << *C << "\n");
- NumGVNOpsSimplified++;
- assert(isa<BasicExpression>(E) &&
- "We should always have had a basic expression here");
- deleteExpression(E);
- return createConstantExpression(C);
- } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " variable " << *V << "\n");
- deleteExpression(E);
- return createVariableExpression(V);
- }
-
- CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC) {
- if (CC->getLeader() && CC->getLeader() != I) {
- // If we simplified to something else, we need to communicate
- // that we're users of the value we simplified to.
- if (I != V) {
- // Don't add temporary instructions to the user lists.
- if (!AllTempInstructions.count(I))
- addAdditionalUsers(V, I);
- }
- return createVariableOrConstant(CC->getLeader());
- }
- if (CC->getDefiningExpr()) {
- // If we simplified to something else, we need to communicate
- // that we're users of the value we simplified to.
- if (I != V) {
- // Don't add temporary instructions to the user lists.
- if (!AllTempInstructions.count(I))
- addAdditionalUsers(V, I);
- }
-
- if (I)
- LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
- << " expression " << *CC->getDefiningExpr() << "\n");
- NumGVNOpsSimplified++;
- deleteExpression(E);
- return CC->getDefiningExpr();
- }
- }
-
- return nullptr;
-}
-
-// Create a value expression from the instruction I, replacing operands with
-// their leaders.
-
-const Expression *NewGVN::createExpression(Instruction *I) const {
- auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
-
- bool AllConstant = setBasicExpressionInfo(I, E);
-
- if (I->isCommutative()) {
- // Ensure that commutative instructions that only differ by a permutation
- // of their operands get the same value number by sorting the operand value
- // numbers. Since all commutative instructions have two operands it is more
- // efficient to sort by hand rather than using, say, std::sort.
- assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
- if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
- E->swapOperands(0, 1);
- }
- // Perform simplification.
- if (auto *CI = dyn_cast<CmpInst>(I)) {
- // Sort the operand value numbers so x<y and y>x get the same value
- // number.
- CmpInst::Predicate Predicate = CI->getPredicate();
- if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
- E->swapOperands(0, 1);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
- }
- E->setOpcode((CI->getOpcode() << 8) | Predicate);
- // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
- assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
- "Wrong types on cmp instruction");
- assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
- E->getOperand(1)->getType() == I->getOperand(1)->getType()));
- Value *V =
- SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (isa<SelectInst>(I)) {
- if (isa<Constant>(E->getOperand(0)) ||
- E->getOperand(1) == E->getOperand(2)) {
- assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
- E->getOperand(2)->getType() == I->getOperand(2)->getType());
- Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
- E->getOperand(2), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
- } else if (I->isBinaryOp()) {
- Value *V =
- SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- Value *V =
- SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (isa<GetElementPtrInst>(I)) {
- Value *V = SimplifyGEPInst(
- E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- } else if (AllConstant) {
- // We don't bother trying to simplify unless all of the operands
- // were constant.
- // TODO: There are a lot of Simplify*'s we could call here, if we
- // wanted to. The original motivating case for this code was a
- // zext i1 false to i8, which we don't have an interface to
- // simplify (IE there is no SimplifyZExt).
-
- SmallVector<Constant *, 8> C;
- for (Value *Arg : E->operands())
- C.emplace_back(cast<Constant>(Arg));
-
- if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
- if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
- return SimplifiedE;
- }
- return E;
-}
-
-const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I) const {
- if (auto *II = dyn_cast<InsertValueInst>(I)) {
- auto *E = new (ExpressionAllocator)
- AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
- setBasicExpressionInfo(I, E);
- E->allocateIntOperands(ExpressionAllocator);
- std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
- return E;
- } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
- auto *E = new (ExpressionAllocator)
- AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
- setBasicExpressionInfo(EI, E);
- E->allocateIntOperands(ExpressionAllocator);
- std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
- return E;
- }
- llvm_unreachable("Unhandled type of aggregate value operation");
-}
-
-const DeadExpression *NewGVN::createDeadExpression() const {
- // DeadExpression has no arguments and all DeadExpression's are the same,
- // so we only need one of them.
- return SingletonDeadExpression;
-}
-
-const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
- auto *E = new (ExpressionAllocator) VariableExpression(V);
- E->setOpcode(V->getValueID());
- return E;
-}
-
-const Expression *NewGVN::createVariableOrConstant(Value *V) const {
- if (auto *C = dyn_cast<Constant>(V))
- return createConstantExpression(C);
- return createVariableExpression(V);
-}
-
-const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
- auto *E = new (ExpressionAllocator) ConstantExpression(C);
- E->setOpcode(C->getValueID());
- return E;
-}
-
-const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
- auto *E = new (ExpressionAllocator) UnknownExpression(I);
- E->setOpcode(I->getOpcode());
- return E;
-}
-
-const CallExpression *
-NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
- // FIXME: Add operand bundles for calls.
+
+ bool runGVN();
+
+private:
+ // Expression handling.
+ const Expression *createExpression(Instruction *) const;
+ const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
+ Instruction *) const;
+
+ // Our canonical form for phi arguments is a pair of incoming value, incoming
+ // basic block.
+ using ValPair = std::pair<Value *, BasicBlock *>;
+
+ PHIExpression *createPHIExpression(ArrayRef<ValPair>, const Instruction *,
+ BasicBlock *, bool &HasBackEdge,
+ bool &OriginalOpsConstant) const;
+ const DeadExpression *createDeadExpression() const;
+ const VariableExpression *createVariableExpression(Value *) const;
+ const ConstantExpression *createConstantExpression(Constant *) const;
+ const Expression *createVariableOrConstant(Value *V) const;
+ const UnknownExpression *createUnknownExpression(Instruction *) const;
+ const StoreExpression *createStoreExpression(StoreInst *,
+ const MemoryAccess *) const;
+ LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
+ const MemoryAccess *) const;
+ const CallExpression *createCallExpression(CallInst *,
+ const MemoryAccess *) const;
+ const AggregateValueExpression *
+ createAggregateValueExpression(Instruction *) const;
+ bool setBasicExpressionInfo(Instruction *, BasicExpression *) const;
+
+ // Congruence class handling.
+ CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
+ auto *result = new CongruenceClass(NextCongruenceNum++, Leader, E);
+ CongruenceClasses.emplace_back(result);
+ return result;
+ }
+
+ CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+ auto *CC = createCongruenceClass(nullptr, nullptr);
+ CC->setMemoryLeader(MA);
+ return CC;
+ }
+
+ CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+ auto *CC = getMemoryClass(MA);
+ if (CC->getMemoryLeader() != MA)
+ CC = createMemoryClass(MA);
+ return CC;
+ }
+
+ CongruenceClass *createSingletonCongruenceClass(Value *Member) {
+ CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
+ CClass->insert(Member);
+ ValueToClass[Member] = CClass;
+ return CClass;
+ }
+
+ void initializeCongruenceClasses(Function &F);
+ const Expression *makePossiblePHIOfOps(Instruction *,
+ SmallPtrSetImpl<Value *> &);
+ Value *findLeaderForInst(Instruction *ValueOp,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB);
+ bool OpIsSafeForPHIOfOpsHelper(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist);
+ bool OpIsSafeForPHIOfOps(Value *Op, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &);
+ void addPhiOfOps(PHINode *Op, BasicBlock *BB, Instruction *ExistingValue);
+ void removePhiOfOps(Instruction *I, PHINode *PHITemp);
+
+ // Value number an Instruction or MemoryPhi.
+ void valueNumberMemoryPhi(MemoryPhi *);
+ void valueNumberInstruction(Instruction *);
+
+ // Symbolic evaluation.
+ const Expression *checkSimplificationResults(Expression *, Instruction *,
+ Value *) const;
+ const Expression *performSymbolicEvaluation(Value *,
+ SmallPtrSetImpl<Value *> &) const;
+ const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+ Instruction *,
+ MemoryAccess *) const;
+ const Expression *performSymbolicLoadEvaluation(Instruction *) const;
+ const Expression *performSymbolicStoreEvaluation(Instruction *) const;
+ const Expression *performSymbolicCallEvaluation(Instruction *) const;
+ void sortPHIOps(MutableArrayRef<ValPair> Ops) const;
+ const Expression *performSymbolicPHIEvaluation(ArrayRef<ValPair>,
+ Instruction *I,
+ BasicBlock *PHIBlock) const;
+ const Expression *performSymbolicAggrValueEvaluation(Instruction *) const;
+ const Expression *performSymbolicCmpEvaluation(Instruction *) const;
+ const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const;
+
+ // Congruence finding.
+ bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+ Value *lookupOperandLeader(Value *) const;
+ CongruenceClass *getClassForExpression(const Expression *E) const;
+ void performCongruenceFinding(Instruction *, const Expression *);
+ void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+ CongruenceClass *, CongruenceClass *);
+ void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+ CongruenceClass *, CongruenceClass *);
+ Value *getNextValueLeader(CongruenceClass *) const;
+ const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+ bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+ CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+ const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
+ bool isMemoryAccessTOP(const MemoryAccess *) const;
+
+ // Ranking
+ unsigned int getRank(const Value *) const;
+ bool shouldSwapOperands(const Value *, const Value *) const;
+
+ // Reachability handling.
+ void updateReachableEdge(BasicBlock *, BasicBlock *);
+ void processOutgoingEdges(Instruction *, BasicBlock *);
+ Value *findConditionEquivalence(Value *) const;
+
+ // Elimination.
+ struct ValueDFS;
+ void convertClassToDFSOrdered(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &,
+ DenseMap<const Value *, unsigned int> &,
+ SmallPtrSetImpl<Instruction *> &) const;
+ void convertClassToLoadsAndStores(const CongruenceClass &,
+ SmallVectorImpl<ValueDFS> &) const;
+
+ bool eliminateInstructions(Function &);
+ void replaceInstruction(Instruction *, Value *);
+ void markInstructionForDeletion(Instruction *);
+ void deleteInstructionsInBlock(BasicBlock *);
+ Value *findPHIOfOpsLeader(const Expression *, const Instruction *,
+ const BasicBlock *) const;
+
+ // Various instruction touch utilities
+ template <typename Map, typename KeyType>
+ void touchAndErase(Map &, const KeyType &);
+ void markUsersTouched(Value *);
+ void markMemoryUsersTouched(const MemoryAccess *);
+ void markMemoryDefTouched(const MemoryAccess *);
+ void markPredicateUsersTouched(Instruction *);
+ void markValueLeaderChangeTouched(CongruenceClass *CC);
+ void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+ void markPhiOfOpsChanged(const Expression *E);
+ void addPredicateUsers(const PredicateBase *, Instruction *) const;
+ void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const;
+ void addAdditionalUsers(Value *To, Value *User) const;
+
+ // Main loop of value numbering
+ void iterateTouchedInstructions();
+
+ // Utilities.
+ void cleanupTables();
+ std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+ void updateProcessedCount(const Value *V);
+ void verifyMemoryCongruency() const;
+ void verifyIterationSettled(Function &F);
+ void verifyStoreExpressions() const;
+ bool singleReachablePHIPath(SmallPtrSet<const MemoryAccess *, 8> &,
+ const MemoryAccess *, const MemoryAccess *) const;
+ BasicBlock *getBlockForValue(Value *V) const;
+ void deleteExpression(const Expression *E) const;
+ MemoryUseOrDef *getMemoryAccess(const Instruction *) const;
+ MemoryPhi *getMemoryAccess(const BasicBlock *) const;
+ template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+
+ unsigned InstrToDFSNum(const Value *V) const {
+ assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+ return InstrDFS.lookup(V);
+ }
+
+ unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+ return MemoryToDFSNum(MA);
+ }
+
+ Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+
+ // Given a MemoryAccess, return the relevant instruction DFS number. Note:
+ // This deliberately takes a value so it can be used with Use's, which will
+ // auto-convert to Value's but not to MemoryAccess's.
+ unsigned MemoryToDFSNum(const Value *MA) const {
+ assert(isa<MemoryAccess>(MA) &&
+ "This should not be used with instructions");
+ return isa<MemoryUseOrDef>(MA)
+ ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+ : InstrDFS.lookup(MA);
+ }
+
+ bool isCycleFree(const Instruction *) const;
+ bool isBackedge(BasicBlock *From, BasicBlock *To) const;
+
+ // Debug counter info. When verifying, we have to reset the value numbering
+ // debug counter to the same state it started in to get the same results.
+ int64_t StartingVNCounter = 0;
+};
+
+} // end anonymous namespace
+
+template <typename T>
+static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
+ if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
+ return false;
+ return LHS.MemoryExpression::equals(RHS);
+}
+
+bool LoadExpression::equals(const Expression &Other) const {
+ return equalsLoadStoreHelper(*this, Other);
+}
+
+bool StoreExpression::equals(const Expression &Other) const {
+ if (!equalsLoadStoreHelper(*this, Other))
+ return false;
+ // Make sure that store vs store includes the value operand.
+ if (const auto *S = dyn_cast<StoreExpression>(&Other))
+ if (getStoredValue() != S->getStoredValue())
+ return false;
+ return true;
+}
+
+// Determine if the edge From->To is a backedge
+bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
+ return From == To ||
+ RPOOrdering.lookup(DT->getNode(From)) >=
+ RPOOrdering.lookup(DT->getNode(To));
+}
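The check above never consults dominance directly: an edge is treated as a back edge exactly when its target does not come strictly later in the RPO numbering. The following standalone sketch (not LLVM code; block names and RPO numbers are invented) shows the same comparison on a tiny loop:

#include <string>
#include <unordered_map>

// Hypothetical RPO numbers for a tiny loop: entry(0) -> header(1) -> body(2),
// with body branching back to header.
static const std::unordered_map<std::string, unsigned> RPONumber = {
    {"entry", 0}, {"header", 1}, {"body", 2}};

// Same test as the isBackedge above: self edges and edges that do not move
// strictly forward in RPO count as back edges.
static bool isBackedgeSketch(const std::string &From, const std::string &To) {
  return From == To || RPONumber.at(From) >= RPONumber.at(To);
}

int main() {
  bool BodyToHeader = isBackedgeSketch("body", "header");   // true: 2 >= 1
  bool EntryToHeader = isBackedgeSketch("entry", "header"); // false: 0 < 1
  return (BodyToHeader && !EntryToHeader) ? 0 : 1;
}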
+
+#ifndef NDEBUG
+static std::string getBlockName(const BasicBlock *B) {
+ return DOTGraphTraits<DOTFuncInfo *>::getSimpleNodeLabel(B, nullptr);
+}
+#endif
+
+// Get a MemoryAccess for an instruction, fake or real.
+MemoryUseOrDef *NewGVN::getMemoryAccess(const Instruction *I) const {
+ auto *Result = MSSA->getMemoryAccess(I);
+ return Result ? Result : TempToMemory.lookup(I);
+}
+
+// Get a MemoryPhi for a basic block. These are all real.
+MemoryPhi *NewGVN::getMemoryAccess(const BasicBlock *BB) const {
+ return MSSA->getMemoryAccess(BB);
+}
+
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ auto *Parent = I->getParent();
+ if (Parent)
+ return Parent;
+ Parent = TempToBlock.lookup(V);
+ assert(Parent && "Every fake instruction should have a block");
+ return Parent;
+ }
+
+ auto *MP = dyn_cast<MemoryPhi>(V);
+ assert(MP && "Should have been an instruction or a MemoryPhi");
+ return MP->getBlock();
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator. Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) const {
+ assert(isa<BasicExpression>(E));
+ auto *BE = cast<BasicExpression>(E);
+ const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+ ExpressionAllocator.Deallocate(E);
+}
+
+// If V is a predicateinfo copy, get the thing it is a copy of.
+static Value *getCopyOf(const Value *V) {
+ if (auto *II = dyn_cast<IntrinsicInst>(V))
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ return II->getOperand(0);
+ return nullptr;
+}
+
+// Return true if V is really PN, even accounting for predicateinfo copies.
+static bool isCopyOfPHI(const Value *V, const PHINode *PN) {
+ return V == PN || getCopyOf(V) == PN;
+}
+
+static bool isCopyOfAPHI(const Value *V) {
+ auto *CO = getCopyOf(V);
+ return CO && isa<PHINode>(CO);
+}
+
+// Sort PHI Operands into a canonical order. What we use here is an RPO
+// order. The BlockInstRange numbers are generated in an RPO walk of the basic
+// blocks.
+void NewGVN::sortPHIOps(MutableArrayRef<ValPair> Ops) const {
+ llvm::sort(Ops, [&](const ValPair &P1, const ValPair &P2) {
+ return BlockInstRange.lookup(P1.second).first <
+ BlockInstRange.lookup(P2.second).first;
+ });
+}
+
+// Return true if V is a value that will always be available (IE can
+// be placed anywhere) in the function. We don't do globals here
+// because they are often worse to put in place.
+static bool alwaysAvailable(Value *V) {
+ return isa<Constant>(V) || isa<Argument>(V);
+}
+
+// Create a PHIExpression from an array of {incoming edge, value} pairs. I is
+// the original instruction we are creating a PHIExpression for (but may not be
+// a phi node). We require, as an invariant, that all the PHIOperands in the
+// same block are sorted the same way. sortPHIOps will sort them into a
+// canonical order.
+PHIExpression *NewGVN::createPHIExpression(ArrayRef<ValPair> PHIOperands,
+ const Instruction *I,
+ BasicBlock *PHIBlock,
+ bool &HasBackedge,
+ bool &OriginalOpsConstant) const {
+ unsigned NumOps = PHIOperands.size();
+ auto *E = new (ExpressionAllocator) PHIExpression(NumOps, PHIBlock);
+
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(PHIOperands.begin()->first->getType());
+ E->setOpcode(Instruction::PHI);
+
+ // Filter out unreachable phi operands.
+ auto Filtered = make_filter_range(PHIOperands, [&](const ValPair &P) {
+ auto *BB = P.second;
+ if (auto *PHIOp = dyn_cast<PHINode>(I))
+ if (isCopyOfPHI(P.first, PHIOp))
+ return false;
+ if (!ReachableEdges.count({BB, PHIBlock}))
+ return false;
+ // Things in TOPClass are equivalent to everything.
+ if (ValueToClass.lookup(P.first) == TOPClass)
+ return false;
+ OriginalOpsConstant = OriginalOpsConstant && isa<Constant>(P.first);
+ HasBackedge = HasBackedge || isBackedge(BB, PHIBlock);
+ return lookupOperandLeader(P.first) != I;
+ });
+ std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
+ [&](const ValPair &P) -> Value * {
+ return lookupOperandLeader(P.first);
+ });
+ return E;
+}
+
+// Set basic expression info (Arguments, type, opcode) for Expression
+// E from Instruction I in block B.
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const {
+ bool AllConstant = true;
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ E->setType(GEP->getSourceElementType());
+ else
+ E->setType(I->getType());
+ E->setOpcode(I->getOpcode());
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+
+ // Transform the operand array into an operand leader array, and keep track of
+ // whether all members are constant.
+ std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
+ auto Operand = lookupOperandLeader(O);
+ AllConstant = AllConstant && isa<Constant>(Operand);
+ return Operand;
+ });
+
+ return AllConstant;
+}
+
+const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
+ Value *Arg1, Value *Arg2,
+ Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(2);
+
+ E->setType(T);
+ E->setOpcode(Opcode);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ if (Instruction::isCommutative(Opcode)) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ if (shouldSwapOperands(Arg1, Arg2))
+ std::swap(Arg1, Arg2);
+ }
+ E->op_push_back(lookupOperandLeader(Arg1));
+ E->op_push_back(lookupOperandLeader(Arg2));
+
+ Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ return E;
+}
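The operand swap above is what makes a+b and b+a land in the same congruence class: after canonical ordering, both produce an identical (opcode, operand-leader) key. A minimal self-contained sketch of that idea, with invented integer value numbers standing in for operand leaders and the expression allocator:

#include <cassert>
#include <tuple>
#include <utility>

// Toy expression key: (opcode, value number of op0, value number of op1).
using ExprKey = std::tuple<unsigned, unsigned, unsigned>;

// Stand-in for shouldSwapOperands: order commutative operands by value number
// so permutations of the same operands build the same key.
static ExprKey makeCommutativeKey(unsigned Opcode, unsigned VN0, unsigned VN1) {
  if (VN0 > VN1)
    std::swap(VN0, VN1);
  return {Opcode, VN0, VN1};
}

int main() {
  const unsigned Add = 13; // arbitrary opcode id for this sketch
  // a+b and b+a produce the same key once operands are canonically ordered.
  assert(makeCommutativeKey(Add, /*a=*/1, /*b=*/2) ==
         makeCommutativeKey(Add, /*b=*/2, /*a=*/1));
  return 0;
}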
+
+// Take a Value returned by simplification of Expression E/Instruction
+// I, and see if it resulted in a simpler expression. If so, return
+// that expression.
+const Expression *NewGVN::checkSimplificationResults(Expression *E,
+ Instruction *I,
+ Value *V) const {
+ if (!V)
+ return nullptr;
+ if (auto *C = dyn_cast<Constant>(V)) {
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " constant " << *C << "\n");
+ NumGVNOpsSimplified++;
+ assert(isa<BasicExpression>(E) &&
+ "We should always have had a basic expression here");
+ deleteExpression(E);
+ return createConstantExpression(C);
+ } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " variable " << *V << "\n");
+ deleteExpression(E);
+ return createVariableExpression(V);
+ }
+
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ if (CC->getLeader() && CC->getLeader() != I) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+ return createVariableOrConstant(CC->getLeader());
+ }
+ if (CC->getDefiningExpr()) {
+ // If we simplified to something else, we need to communicate
+ // that we're users of the value we simplified to.
+ if (I != V) {
+ // Don't add temporary instructions to the user lists.
+ if (!AllTempInstructions.count(I))
+ addAdditionalUsers(V, I);
+ }
+
+ if (I)
+ LLVM_DEBUG(dbgs() << "Simplified " << *I << " to "
+ << " expression " << *CC->getDefiningExpr() << "\n");
+ NumGVNOpsSimplified++;
+ deleteExpression(E);
+ return CC->getDefiningExpr();
+ }
+ }
+
+ return nullptr;
+}
+
+// Create a value expression from the instruction I, replacing operands with
+// their leaders.
+
+const Expression *NewGVN::createExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
+
+ bool AllConstant = setBasicExpressionInfo(I, E);
+
+ if (I->isCommutative()) {
+ // Ensure that commutative instructions that only differ by a permutation
+ // of their operands get the same value number by sorting the operand value
+ // numbers. Since all commutative instructions have two operands it is more
+ // efficient to sort by hand rather than using, say, std::sort.
+ assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
+ E->swapOperands(0, 1);
+ }
+ // Perform simplification.
+ if (auto *CI = dyn_cast<CmpInst>(I)) {
+ // Sort the operand value numbers so x<y and y>x get the same value
+ // number.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
+ E->swapOperands(0, 1);
+ Predicate = CmpInst::getSwappedPredicate(Predicate);
+ }
+ E->setOpcode((CI->getOpcode() << 8) | Predicate);
+ // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
+ assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
+ "Wrong types on cmp instruction");
+ assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+ E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+ Value *V =
+ SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<SelectInst>(I)) {
+ if (isa<Constant>(E->getOperand(0)) ||
+ E->getOperand(1) == E->getOperand(2)) {
+ assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+ E->getOperand(2)->getType() == I->getOperand(2)->getType());
+ Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
+ E->getOperand(2), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ } else if (I->isBinaryOp()) {
+ Value *V =
+ SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ Value *V =
+ SimplifyCastInst(CI->getOpcode(), E->getOperand(0), CI->getType(), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (isa<GetElementPtrInst>(I)) {
+ Value *V = SimplifyGEPInst(
+ E->getType(), ArrayRef<Value *>(E->op_begin(), E->op_end()), SQ);
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ } else if (AllConstant) {
+ // We don't bother trying to simplify unless all of the operands
+ // were constant.
+ // TODO: There are a lot of Simplify*'s we could call here, if we
+ // wanted to. The original motivating case for this code was a
+ // zext i1 false to i8, which we don't have an interface to
+ // simplify (IE there is no SimplifyZExt).
+
+ SmallVector<Constant *, 8> C;
+ for (Value *Arg : E->operands())
+ C.emplace_back(cast<Constant>(Arg));
+
+ if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
+ if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+ return SimplifiedE;
+ }
+ return E;
+}
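Comparisons get the same canonicalization, but the predicate has to be flipped along with the operands so that x < y and y > x share one expression, and the packed (opcode << 8) | predicate field keeps different predicates distinct. A rough standalone sketch with an invented predicate enum (not LLVM's CmpInst::Predicate) and an arbitrary opcode constant:

#include <cassert>
#include <tuple>
#include <utility>

// Invented predicate encoding, only for illustration.
enum Pred : unsigned { LT, GT, LE, GE, EQ, NE };

static Pred swappedPredicate(Pred P) {
  switch (P) {
  case LT: return GT;
  case GT: return LT;
  case LE: return GE;
  case GE: return LE;
  default: return P; // EQ and NE are symmetric.
  }
}

// Mirrors the (CmpOpcode << 8) | Predicate packing used above; 53 is an
// arbitrary stand-in for the compare opcode.
static std::tuple<unsigned, unsigned, unsigned>
makeCmpKey(Pred P, unsigned VN0, unsigned VN1) {
  if (VN0 > VN1) { // stand-in for shouldSwapOperands
    std::swap(VN0, VN1);
    P = swappedPredicate(P);
  }
  return {(53u << 8) | P, VN0, VN1};
}

int main() {
  // "x < y" (value numbers 1, 2) and "y > x" (2, 1) get identical keys.
  assert(makeCmpKey(LT, 1, 2) == makeCmpKey(GT, 2, 1));
  return 0;
}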
+
+const AggregateValueExpression *
+NewGVN::createAggregateValueExpression(Instruction *I) const {
+ if (auto *II = dyn_cast<InsertValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
+ setBasicExpressionInfo(I, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
+ return E;
+ } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *E = new (ExpressionAllocator)
+ AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
+ setBasicExpressionInfo(EI, E);
+ E->allocateIntOperands(ExpressionAllocator);
+ std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
+ return E;
+ }
+ llvm_unreachable("Unhandled type of aggregate value operation");
+}
+
+const DeadExpression *NewGVN::createDeadExpression() const {
+ // DeadExpression has no arguments and all DeadExpression's are the same,
+ // so we only need one of them.
+ return SingletonDeadExpression;
+}
+
+const VariableExpression *NewGVN::createVariableExpression(Value *V) const {
+ auto *E = new (ExpressionAllocator) VariableExpression(V);
+ E->setOpcode(V->getValueID());
+ return E;
+}
+
+const Expression *NewGVN::createVariableOrConstant(Value *V) const {
+ if (auto *C = dyn_cast<Constant>(V))
+ return createConstantExpression(C);
+ return createVariableExpression(V);
+}
+
+const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const {
+ auto *E = new (ExpressionAllocator) ConstantExpression(C);
+ E->setOpcode(C->getValueID());
+ return E;
+}
+
+const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const {
+ auto *E = new (ExpressionAllocator) UnknownExpression(I);
+ E->setOpcode(I->getOpcode());
+ return E;
+}
+
+const CallExpression *
+NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const {
+ // FIXME: Add operand bundles for calls.
// FIXME: Allow commutative matching for intrinsics.
- auto *E =
- new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
- setBasicExpressionInfo(CI, E);
- return E;
-}
-
-// Return true if some equivalent of instruction Inst dominates instruction U.
-bool NewGVN::someEquivalentDominates(const Instruction *Inst,
- const Instruction *U) const {
- auto *CC = ValueToClass.lookup(Inst);
- // This must be an instruction because we are only called from phi nodes
- // in the case that the value it needs to check against is an instruction.
-
- // The most likely candidates for dominance are the leader and the next leader.
- // The leader or nextleader will dominate in all cases where there is an
- // equivalent that is higher up in the dom tree.
- // We can't *only* check them, however, because the
- // dominator tree could have an infinite number of non-dominating siblings
- // with instructions that are in the right congruence class.
- // A
- // B C D E F G
- // |
- // H
- // Instruction U could be in H, with equivalents in every other sibling.
- // Depending on the rpo order picked, the leader could be the equivalent in
- // any of these siblings.
- if (!CC)
- return false;
- if (alwaysAvailable(CC->getLeader()))
- return true;
- if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
- return true;
- if (CC->getNextLeader().first &&
- DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
- return true;
- return llvm::any_of(*CC, [&](const Value *Member) {
- return Member != CC->getLeader() &&
- DT->dominates(cast<Instruction>(Member), U);
- });
-}
-
-// See if we have a congruence class and leader for this operand, and if so,
-// return it. Otherwise, return the operand itself.
-Value *NewGVN::lookupOperandLeader(Value *V) const {
- CongruenceClass *CC = ValueToClass.lookup(V);
- if (CC) {
- // Everything in TOP is represented by undef, as it can be any value.
- // We do have to make sure we get the type right though, so we can't set the
- // RepLeader to undef.
- if (CC == TOPClass)
- return UndefValue::get(V->getType());
- return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
- }
-
- return V;
-}
-
-const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
- auto *CC = getMemoryClass(MA);
- assert(CC->getMemoryLeader() &&
- "Every MemoryAccess should be mapped to a congruence class with a "
- "representative memory access");
- return CC->getMemoryLeader();
-}
-
-// Return true if the MemoryAccess is really equivalent to everything. This is
-// equivalent to the lattice value "TOP" in most lattices. This is the initial
-// state of all MemoryAccesses.
-bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
- return getMemoryClass(MA) == TOPClass;
-}
-
-LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
- LoadInst *LI,
- const MemoryAccess *MA) const {
- auto *E =
- new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(LoadType);
-
- // Give stores and loads the same opcode so they value number together.
- E->setOpcode(0);
- E->op_push_back(PointerOp);
-
- // TODO: Value number heap versions. We may be able to discover
- // things alias analysis can't on its own (IE that a store and a
- // load have the same value, and thus, it isn't clobbering the load).
- return E;
-}
-
-const StoreExpression *
-NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
- auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
- auto *E = new (ExpressionAllocator)
- StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
- E->allocateOperands(ArgRecycler, ExpressionAllocator);
- E->setType(SI->getValueOperand()->getType());
-
- // Give stores and loads the same opcode so they value number together.
- E->setOpcode(0);
- E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
-
- // TODO: Value number heap versions. We may be able to discover
- // things alias analysis can't on its own (IE that a store and a
- // load have the same value, and thus, it isn't clobbering the load).
- return E;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
- // Unlike loads, we never try to eliminate stores, so we do not check if they
- // are simple and avoid value numbering them.
- auto *SI = cast<StoreInst>(I);
- auto *StoreAccess = getMemoryAccess(SI);
- // Get the expression, if any, for the RHS of the MemoryDef.
- const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
- if (EnableStoreRefinement)
- StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
- // If we bypassed the use-def chains, make sure we add a use.
- StoreRHS = lookupMemoryLeader(StoreRHS);
- if (StoreRHS != StoreAccess->getDefiningAccess())
- addMemoryUsers(StoreRHS, StoreAccess);
- // If we are defined by ourselves, use the live on entry def.
- if (StoreRHS == StoreAccess)
- StoreRHS = MSSA->getLiveOnEntryDef();
-
- if (SI->isSimple()) {
- // See if we are defined by a previous store expression, it already has a
- // value, and it's the same value as our current store. FIXME: Right now, we
- // only do this for simple stores, we should expand to cover memcpys, etc.
- const auto *LastStore = createStoreExpression(SI, StoreRHS);
- const auto *LastCC = ExpressionToClass.lookup(LastStore);
- // We really want to check whether the expression we matched was a store. No
- // easy way to do that. However, we can check that the class we found has a
- // store, which, assuming the value numbering state is not corrupt, is
- // sufficient, because we must also be equivalent to that store's expression
- // for it to be in the same class as the load.
- if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
- return LastStore;
- // Also check if our value operand is defined by a load of the same memory
- // location, and the memory state is the same as it was then (otherwise, it
- // could have been overwritten later. See test32 in
- // transforms/DeadStoreElimination/simple.ll).
- if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
- if ((lookupOperandLeader(LI->getPointerOperand()) ==
- LastStore->getOperand(0)) &&
- (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
- StoreRHS))
- return LastStore;
- deleteExpression(LastStore);
- }
-
- // If the store is not equivalent to anything, value number it as a store that
- // produces a unique memory state (instead of using its MemoryUse, we use
- // its MemoryDef).
- return createStoreExpression(SI, StoreAccess);
-}
-
-// See if we can extract the value of a loaded pointer from a load, a store, or
-// a memory instruction.
-const Expression *
-NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
- LoadInst *LI, Instruction *DepInst,
- MemoryAccess *DefiningAccess) const {
- assert((!LI || LI->isSimple()) && "Not a simple load");
- if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- // Also don't need to coerce if they are the same type, we will just
- // propagate.
- if (LI->isAtomic() > DepSI->isAtomic() ||
- LoadType == DepSI->getValueOperand()->getType())
- return nullptr;
- int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
- if (Offset >= 0) {
- if (auto *C = dyn_cast<Constant>(
- lookupOperandLeader(DepSI->getValueOperand()))) {
- LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
- << " to constant " << *C << "\n");
- return createConstantExpression(
- getConstantStoreValueForLoad(C, Offset, LoadType, DL));
- }
- }
- } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
- // Can't forward from non-atomic to atomic without violating memory model.
- if (LI->isAtomic() > DepLI->isAtomic())
- return nullptr;
- int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
- if (Offset >= 0) {
- // We can coerce a constant load into a load.
- if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
- if (auto *PossibleConstant =
- getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
- LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
- << " to constant " << *PossibleConstant << "\n");
- return createConstantExpression(PossibleConstant);
- }
- }
- } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
- int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
- if (Offset >= 0) {
- if (auto *PossibleConstant =
- getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
- LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
- << " to constant " << *PossibleConstant << "\n");
- return createConstantExpression(PossibleConstant);
- }
- }
- }
-
- // All of the below are only true if the loaded pointer is produced
- // by the dependent instruction.
- if (LoadPtr != lookupOperandLeader(DepInst) &&
- !AA->isMustAlias(LoadPtr, DepInst))
- return nullptr;
- // If this load really doesn't depend on anything, then we must be loading an
- // undef value. This can happen when loading for a fresh allocation with no
- // intervening stores, for example. Note that this is only true in the case
- // that the result of the allocation is pointer equal to the load ptr.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
- isAlignedAllocLikeFn(DepInst, TLI)) {
- return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load occurs right after a lifetime begin,
- // then the loaded value is undefined.
- else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- return createConstantExpression(UndefValue::get(LoadType));
- }
- // If this load follows a calloc (which zero-initializes memory),
- // then the loaded value is zero.
- else if (isCallocLikeFn(DepInst, TLI)) {
- return createConstantExpression(Constant::getNullValue(LoadType));
- }
-
- return nullptr;
-}
-
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
- auto *LI = cast<LoadInst>(I);
-
- // We can eliminate in favor of non-simple loads, but we won't be able to
- // eliminate the loads themselves.
- if (!LI->isSimple())
- return nullptr;
-
- Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
- // Load of undef is undef.
- if (isa<UndefValue>(LoadAddressLeader))
- return createConstantExpression(UndefValue::get(LI->getType()));
- MemoryAccess *OriginalAccess = getMemoryAccess(I);
- MemoryAccess *DefiningAccess =
- MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
-
- if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
- if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
- Instruction *DefiningInst = MD->getMemoryInst();
- // If the defining instruction is not reachable, replace with undef.
- if (!ReachableBlocks.count(DefiningInst->getParent()))
- return createConstantExpression(UndefValue::get(LI->getType()));
- // This will handle stores and memory insts. We only do this if the
- // defining access has a different type, or it is a pointer produced by
- // certain memory operations that cause the memory to have a fixed value
- // (IE things like calloc).
- if (const auto *CoercionResult =
- performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
- DefiningInst, DefiningAccess))
- return CoercionResult;
- }
- }
-
- const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
- DefiningAccess);
- // If our MemoryLeader is not our defining access, add a use to the
- // MemoryLeader, so that we get reprocessed when it changes.
- if (LE->getMemoryLeader() != DefiningAccess)
- addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
- return LE;
-}
-
-const Expression *
-NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
- auto *PI = PredInfo->getPredicateInfoFor(I);
- if (!PI)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
-
+ auto *E =
+ new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+ setBasicExpressionInfo(CI, E);
+ return E;
+}
+
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+ const Instruction *U) const {
+ auto *CC = ValueToClass.lookup(Inst);
+ // This must be an instruction because we are only called from phi nodes
+ // in the case that the value it needs to check against is an instruction.
+
+ // The most likely candidates for dominance are the leader and the next leader.
+ // The leader or nextleader will dominate in all cases where there is an
+ // equivalent that is higher up in the dom tree.
+ // We can't *only* check them, however, because the
+ // dominator tree could have an infinite number of non-dominating siblings
+ // with instructions that are in the right congruence class.
+ // A
+ // B C D E F G
+ // |
+ // H
+ // Instruction U could be in H, with equivalents in every other sibling.
+ // Depending on the rpo order picked, the leader could be the equivalent in
+ // any of these siblings.
+ if (!CC)
+ return false;
+ if (alwaysAvailable(CC->getLeader()))
+ return true;
+ if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+ return true;
+ if (CC->getNextLeader().first &&
+ DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+ return true;
+ return llvm::any_of(*CC, [&](const Value *Member) {
+ return Member != CC->getLeader() &&
+ DT->dominates(cast<Instruction>(Member), U);
+ });
+}
+
+// See if we have a congruence class and leader for this operand, and if so,
+// return it. Otherwise, return the operand itself.
+Value *NewGVN::lookupOperandLeader(Value *V) const {
+ CongruenceClass *CC = ValueToClass.lookup(V);
+ if (CC) {
+ // Everything in TOP is represented by undef, as it can be any value.
+ // We do have to make sure we get the type right though, so we can't set the
+ // RepLeader to undef.
+ if (CC == TOPClass)
+ return UndefValue::get(V->getType());
+ return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ }
+
+ return V;
+}
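Leader lookup is the workhorse of the whole pass: every operand is replaced by the leader of its congruence class before an expression is built, so congruent operands produce identical expressions. A toy version of the lookup, ignoring the TOP/undef and stored-value special cases above (names here are illustrative, not NewGVN's):

#include <string>
#include <unordered_map>

// Minimal stand-in for a congruence class: just a representative leader.
struct ToyClass {
  std::string Leader;
};

static std::unordered_map<std::string, const ToyClass *> ValueToClassSketch;

// Same shape as lookupOperandLeader: fall back to the value itself when it
// has not been placed in a class yet.
static std::string lookupLeaderSketch(const std::string &V) {
  auto It = ValueToClassSketch.find(V);
  return It != ValueToClassSketch.end() ? It->second->Leader : V;
}

int main() {
  static const ToyClass C{"a"}; // class whose leader is "a"
  ValueToClassSketch["b"] = &C; // "b" was proven congruent to "a"
  bool OK = lookupLeaderSketch("b") == "a" && lookupLeaderSketch("c") == "c";
  return OK ? 0 : 1;
}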
+
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+ auto *CC = getMemoryClass(MA);
+ assert(CC->getMemoryLeader() &&
+ "Every MemoryAccess should be mapped to a congruence class with a "
+ "representative memory access");
+ return CC->getMemoryLeader();
+}
+
+// Return true if the MemoryAccess is really equivalent to everything. This is
+// equivalent to the lattice value "TOP" in most lattices. This is the initial
+// state of all MemoryAccesses.
+bool NewGVN::isMemoryAccessTOP(const MemoryAccess *MA) const {
+ return getMemoryClass(MA) == TOPClass;
+}
+
+LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
+ LoadInst *LI,
+ const MemoryAccess *MA) const {
+ auto *E =
+ new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(LoadType);
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(PointerOp);
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const StoreExpression *
+NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const {
+ auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
+ auto *E = new (ExpressionAllocator)
+ StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
+ E->allocateOperands(ArgRecycler, ExpressionAllocator);
+ E->setType(SI->getValueOperand()->getType());
+
+ // Give stores and loads the same opcode so they value number together.
+ E->setOpcode(0);
+ E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
+
+ // TODO: Value number heap versions. We may be able to discover
+ // things alias analysis can't on its own (IE that a store and a
+ // load have the same value, and thus, it isn't clobbering the load).
+ return E;
+}
+
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
+ // Unlike loads, we never try to eliminate stores, so we do not check if they
+ // are simple and avoid value numbering them.
+ auto *SI = cast<StoreInst>(I);
+ auto *StoreAccess = getMemoryAccess(SI);
+ // Get the expression, if any, for the RHS of the MemoryDef.
+ const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+ if (EnableStoreRefinement)
+ StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+ // If we bypassed the use-def chains, make sure we add a use.
+ StoreRHS = lookupMemoryLeader(StoreRHS);
+ if (StoreRHS != StoreAccess->getDefiningAccess())
+ addMemoryUsers(StoreRHS, StoreAccess);
+ // If we are defined by ourselves, use the live on entry def.
+ if (StoreRHS == StoreAccess)
+ StoreRHS = MSSA->getLiveOnEntryDef();
+
+ if (SI->isSimple()) {
+ // See if we are defined by a previous store expression, it already has a
+ // value, and it's the same value as our current store. FIXME: Right now, we
+ // only do this for simple stores, we should expand to cover memcpys, etc.
+ const auto *LastStore = createStoreExpression(SI, StoreRHS);
+ const auto *LastCC = ExpressionToClass.lookup(LastStore);
+ // We really want to check whether the expression we matched was a store. No
+ // easy way to do that. However, we can check that the class we found has a
+ // store, which, assuming the value numbering state is not corrupt, is
+ // sufficient, because we must also be equivalent to that store's expression
+ // for it to be in the same class as the load.
+ if (LastCC && LastCC->getStoredValue() == LastStore->getStoredValue())
+ return LastStore;
+ // Also check if our value operand is defined by a load of the same memory
+ // location, and the memory state is the same as it was then (otherwise, it
+ // could have been overwritten later. See test32 in
+ // transforms/DeadStoreElimination/simple.ll).
+ if (auto *LI = dyn_cast<LoadInst>(LastStore->getStoredValue()))
+ if ((lookupOperandLeader(LI->getPointerOperand()) ==
+ LastStore->getOperand(0)) &&
+ (lookupMemoryLeader(getMemoryAccess(LI)->getDefiningAccess()) ==
+ StoreRHS))
+ return LastStore;
+ deleteExpression(LastStore);
+ }
+
+ // If the store is not equivalent to anything, value number it as a store that
+ // produces a unique memory state (instead of using its MemoryUse, we use
+ // its MemoryDef).
+ return createStoreExpression(SI, StoreAccess);
+}
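In effect, the code above makes a store congruent to an earlier store when the pointer leader, the stored-value leader, and the incoming memory state all match; otherwise the store defines a fresh memory state. The sketch below models just that keying decision with plain strings and an integer memory-state version, a deliberate simplification of MemorySSA:

#include <map>
#include <string>
#include <tuple>

// Key: (pointer leader, memory-state version, stored-value leader).
using StoreKey = std::tuple<std::string, unsigned, std::string>;

static std::map<StoreKey, int> KnownStores;

// Returns the id of an equivalent earlier store, or records NewId when this
// store defines a genuinely new memory state (the fallback to the store's
// own MemoryDef above).
static int valueNumberStoreSketch(const std::string &PtrLeader,
                                  unsigned MemVersion,
                                  const std::string &ValLeader, int NewId) {
  StoreKey K{PtrLeader, MemVersion, ValLeader};
  auto It = KnownStores.find(K);
  if (It != KnownStores.end())
    return It->second; // congruent to an earlier store
  KnownStores[K] = NewId;
  return NewId;
}

int main() {
  int S1 = valueNumberStoreSketch("p", 0, "x", 1); // first store of x to p
  int S2 = valueNumberStoreSketch("p", 0, "x", 2); // same value, same memory
  int S3 = valueNumberStoreSketch("p", 1, "x", 3); // memory changed in between
  return (S1 == 1 && S2 == 1 && S3 == 3) ? 0 : 1;
}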
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+ LoadInst *LI, Instruction *DepInst,
+ MemoryAccess *DefiningAccess) const {
+ assert((!LI || LI->isSimple()) && "Not a simple load");
+ if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ // Also don't need to coerce if they are the same type, we will just
+ // propagate.
+ if (LI->isAtomic() > DepSI->isAtomic() ||
+ LoadType == DepSI->getValueOperand()->getType())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+ if (Offset >= 0) {
+ if (auto *C = dyn_cast<Constant>(
+ lookupOperandLeader(DepSI->getValueOperand()))) {
+ LLVM_DEBUG(dbgs() << "Coercing load from store " << *DepSI
+ << " to constant " << *C << "\n");
+ return createConstantExpression(
+ getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+ }
+ }
+ } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
+ // Can't forward from non-atomic to atomic without violating memory model.
+ if (LI->isAtomic() > DepLI->isAtomic())
+ return nullptr;
+ int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+ if (Offset >= 0) {
+ // We can coerce a constant load into a load.
+ if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+ if (auto *PossibleConstant =
+ getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+ LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+ int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+ if (Offset >= 0) {
+ if (auto *PossibleConstant =
+ getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+ LLVM_DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+ << " to constant " << *PossibleConstant << "\n");
+ return createConstantExpression(PossibleConstant);
+ }
+ }
+ }
+
+ // All of the below are only true if the loaded pointer is produced
+ // by the dependent instruction.
+ if (LoadPtr != lookupOperandLeader(DepInst) &&
+ !AA->isMustAlias(LoadPtr, DepInst))
+ return nullptr;
+ // If this load really doesn't depend on anything, then we must be loading an
+ // undef value. This can happen when loading for a fresh allocation with no
+ // intervening stores, for example. Note that this is only true in the case
+ // that the result of the allocation is pointer equal to the load ptr.
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load occurs right after a lifetime begin,
+ // then the loaded value is undefined.
+ else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ return createConstantExpression(UndefValue::get(LoadType));
+ }
+ // If this load follows a calloc (which zero-initializes memory),
+ // then the loaded value is zero.
+ else if (isCallocLikeFn(DepInst, TLI)) {
+ return createConstantExpression(Constant::getNullValue(LoadType));
+ }
+
+ return nullptr;
+}
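The coercion path answers a load from a clobbering store of a constant even when the types differ, by slicing the stored bits at the offset computed by analyzeLoadFromClobberingStore. A tiny model of that slicing, assuming a little-endian host and a 32-bit stored constant (the real getConstantStoreValueForLoad handles far more cases):

#include <cstdint>
#include <cstring>

// Extract LoadBytes bytes at ByteOffset from a 32-bit stored constant,
// assuming a little-endian host (an explicit simplification).
static uint32_t sliceStoredConstant(uint32_t Stored, unsigned ByteOffset,
                                    unsigned LoadBytes) {
  unsigned char Bytes[4];
  std::memcpy(Bytes, &Stored, sizeof(Bytes));
  uint32_t Result = 0;
  std::memcpy(&Result, Bytes + ByteOffset, LoadBytes);
  return Result;
}

int main() {
  // store i32 0x11223344 to %p; an i8 load at %p+1 sees 0x33 on little-endian.
  return sliceStoredConstant(0x11223344u, 1, 1) == 0x33 ? 0 : 1;
}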
+
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
+ auto *LI = cast<LoadInst>(I);
+
+ // We can eliminate in favor of non-simple loads, but we won't be able to
+ // eliminate the loads themselves.
+ if (!LI->isSimple())
+ return nullptr;
+
+ Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
+ // Load of undef is undef.
+ if (isa<UndefValue>(LoadAddressLeader))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ MemoryAccess *OriginalAccess = getMemoryAccess(I);
+ MemoryAccess *DefiningAccess =
+ MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
+
+ if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
+ if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
+ Instruction *DefiningInst = MD->getMemoryInst();
+ // If the defining instruction is not reachable, replace with undef.
+ if (!ReachableBlocks.count(DefiningInst->getParent()))
+ return createConstantExpression(UndefValue::get(LI->getType()));
+ // This will handle stores and memory insts. We only do this if the
+ // defining access has a different type, or it is a pointer produced by
+ // certain memory operations that cause the memory to have a fixed value
+ // (IE things like calloc).
+ if (const auto *CoercionResult =
+ performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+ DefiningInst, DefiningAccess))
+ return CoercionResult;
+ }
+ }
+
+ const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
+ DefiningAccess);
+ // If our MemoryLeader is not our defining access, add a use to the
+ // MemoryLeader, so that we get reprocessed when it changes.
+ if (LE->getMemoryLeader() != DefiningAccess)
+ addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
+ return LE;
+}
+
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
+ auto *PI = PredInfo->getPredicateInfoFor(I);
+ if (!PI)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
if (!Constraint)
- return nullptr;
-
+ return nullptr;
+
CmpInst::Predicate Predicate = Constraint->Predicate;
Value *CmpOp0 = I->getOperand(0);
Value *CmpOp1 = Constraint->OtherOp;
-
+
Value *FirstOp = lookupOperandLeader(CmpOp0);
Value *SecondOp = lookupOperandLeader(CmpOp1);
Value *AdditionallyUsedValue = CmpOp0;
-
- // Sort the ops.
- if (shouldSwapOperands(FirstOp, SecondOp)) {
- std::swap(FirstOp, SecondOp);
+
+ // Sort the ops.
+ if (shouldSwapOperands(FirstOp, SecondOp)) {
+ std::swap(FirstOp, SecondOp);
Predicate = CmpInst::getSwappedPredicate(Predicate);
AdditionallyUsedValue = CmpOp1;
- }
-
+ }
+
if (Predicate == CmpInst::ICMP_EQ) {
addPredicateUsers(PI, I);
addAdditionalUsers(AdditionallyUsedValue, I);
return createVariableOrConstant(FirstOp);
- }
+ }
// Handle the special case of floating point.
if (Predicate == CmpInst::FCMP_OEQ && isa<ConstantFP>(FirstOp) &&
@@ -1566,2616 +1566,2616 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const {
addPredicateUsers(PI, I);
addAdditionalUsers(AdditionallyUsedValue, I);
return createConstantExpression(cast<Constant>(FirstOp));
- }
-
- return nullptr;
-}
-
-// Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
- auto *CI = cast<CallInst>(I);
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- // Intrinsics with the returned attribute are copies of arguments.
- if (auto *ReturnedValue = II->getReturnedArgOperand()) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy)
- if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
- return Result;
- return createVariableOrConstant(ReturnedValue);
- }
- }
- if (AA->doesNotAccessMemory(CI)) {
- return createCallExpression(CI, TOPClass->getMemoryLeader());
- } else if (AA->onlyReadsMemory(CI)) {
- if (auto *MA = MSSA->getMemoryAccess(CI)) {
- auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA);
- return createCallExpression(CI, DefiningAccess);
- } else // MSSA determined that CI does not access memory.
- return createCallExpression(CI, TOPClass->getMemoryLeader());
- }
- return nullptr;
-}
-
-// Retrieve the memory class for a given MemoryAccess.
-CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
- auto *Result = MemoryAccessToClass.lookup(MA);
- assert(Result && "Should have found memory class");
- return Result;
-}
-
-// Update the MemoryAccess equivalence table to say that From is equal to To,
-// and return true if this is different from what already existed in the table.
-bool NewGVN::setMemoryClass(const MemoryAccess *From,
- CongruenceClass *NewClass) {
- assert(NewClass &&
- "Every MemoryAccess should be getting mapped to a non-null class");
- LLVM_DEBUG(dbgs() << "Setting " << *From);
- LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
- LLVM_DEBUG(dbgs() << NewClass->getID()
- << " with current MemoryAccess leader ");
- LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
-
- auto LookupResult = MemoryAccessToClass.find(From);
- bool Changed = false;
- // If it's already in the table, see if the value changed.
- if (LookupResult != MemoryAccessToClass.end()) {
- auto *OldClass = LookupResult->second;
- if (OldClass != NewClass) {
- // If this is a phi, we have to handle memory member updates.
- if (auto *MP = dyn_cast<MemoryPhi>(From)) {
- OldClass->memory_erase(MP);
- NewClass->memory_insert(MP);
- // This may have killed the class if it had no non-memory members
- if (OldClass->getMemoryLeader() == From) {
- if (OldClass->definesNoMemory()) {
- OldClass->setMemoryLeader(nullptr);
- } else {
- OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of a memory member " << *From
- << "\n");
- markMemoryLeaderChangeTouched(OldClass);
- }
- }
- }
- // It wasn't equivalent before, and now it is.
- LookupResult->second = NewClass;
- Changed = true;
- }
- }
-
- return Changed;
-}
-
- // Determine if an instruction is cycle-free. That means the values in the
-// instruction don't depend on any expressions that can change value as a result
-// of the instruction. For example, a non-cycle free instruction would be v =
-// phi(0, v+1).
-bool NewGVN::isCycleFree(const Instruction *I) const {
- // In order to compute cycle-freeness, we do SCC finding on the instruction,
- // and see what kind of SCC it ends up in. If it is a singleton, it is
- // cycle-free. If it is not in a singleton, it is only cycle free if the
- // other members are all phi nodes (as they do not compute anything, they are
- // copies).
- auto ICS = InstCycleState.lookup(I);
- if (ICS == ICS_Unknown) {
- SCCFinder.Start(I);
- auto &SCC = SCCFinder.getComponentFor(I);
- // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
- if (SCC.size() == 1)
- InstCycleState.insert({I, ICS_CycleFree});
- else {
- bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
- return isa<PHINode>(V) || isCopyOfAPHI(V);
- });
- ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
- for (auto *Member : SCC)
- if (auto *MemberPhi = dyn_cast<PHINode>(Member))
- InstCycleState.insert({MemberPhi, ICS});
- }
- }
- if (ICS == ICS_Cycle)
- return false;
- return true;
-}
-
-// Evaluate PHI nodes symbolically and create an expression result.
-const Expression *
-NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
- Instruction *I,
- BasicBlock *PHIBlock) const {
- // True if one of the incoming phi edges is a backedge.
- bool HasBackedge = false;
- // OriginalOpsConstant tracks whether all the *original* phi operands were
- // constant. This is really shorthand for "this phi cannot cycle due to
- // forward propagation", as any change in value of the phi is guaranteed not
- // to later change the value of the phi. IE it can't be v = phi(undef, v+1)
- bool OriginalOpsConstant = true;
- auto *E = cast<PHIExpression>(createPHIExpression(
- PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
- // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
- // See if all arguments are the same.
- // We track if any were undef because they need special handling.
- bool HasUndef = false;
- auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
- if (isa<UndefValue>(Arg)) {
- HasUndef = true;
- return false;
- }
- return true;
- });
- // If we are left with no operands, it's dead.
- if (Filtered.empty()) {
- // If it has undef at this point, it means there are no non-undef arguments,
- // and thus, the value of the phi node must be undef.
- if (HasUndef) {
- LLVM_DEBUG(
- dbgs() << "PHI Node " << *I
- << " has no non-undef arguments, valuing it as undef\n");
- return createConstantExpression(UndefValue::get(I->getType()));
- }
-
- LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
- deleteExpression(E);
- return createDeadExpression();
- }
- Value *AllSameValue = *(Filtered.begin());
- ++Filtered.begin();
- // Can't use std::equal here, sadly, because filter.begin moves.
- if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
- // In LLVM's non-standard representation of phi nodes, it's possible to have
- // phi nodes with cycles (IE dependent on other phis that are .... dependent
- // on the original phi node), especially in weird CFG's where some arguments
- // are unreachable, or uninitialized along certain paths. This can cause
- // infinite loops during evaluation. We work around this by not trying to
- // really evaluate them independently, but instead using a variable
- // expression to say if one is equivalent to the other.
- // We also special case undef, so that if we have an undef, we can't use the
- // common value unless it dominates the phi block.
- if (HasUndef) {
- // If we have undef and at least one other value, this is really a
- // multivalued phi, and we need to know if it's cycle free in order to
- // evaluate whether we can ignore the undef. The other parts of this are
- // just shortcuts. If there is no backedge, or all operands are
- // constants, it also must be cycle free.
- if (HasBackedge && !OriginalOpsConstant &&
- !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
- return E;
-
- // Only have to check for instructions
- if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
- if (!someEquivalentDominates(AllSameInst, I))
- return E;
- }
- // Can't simplify to something that comes later in the iteration.
- // Otherwise, when and if it changes congruence class, we will never catch
- // up. We will always be a class behind it.
- if (isa<Instruction>(AllSameValue) &&
- InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
- return E;
- NumGVNPhisAllSame++;
- LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
- << "\n");
- deleteExpression(E);
- return createVariableOrConstant(AllSameValue);
- }
- return E;
-}
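Once the unreachable, self, and TOP operands have been filtered out, the rule above reduces to: if every remaining non-undef incoming leader is the same value, the phi is that value. A compact sketch of that filter-and-compare step, using an empty string as a stand-in for undef and leaving out the cycle-freeness and dominance safeguards:

#include <optional>
#include <string>
#include <vector>

// Returns the single value all non-"undef" incoming leaders agree on, if any.
// The empty string plays the role of undef here; the real code additionally
// checks cycle-freeness and dominance before trusting this answer.
static std::optional<std::string>
simplifyPhiSketch(const std::vector<std::string> &IncomingLeaders) {
  std::optional<std::string> Common;
  for (const std::string &V : IncomingLeaders) {
    if (V.empty())
      continue; // skip undef operands
    if (!Common)
      Common = V;
    else if (*Common != V)
      return std::nullopt; // genuinely multivalued phi
  }
  return Common;
}

int main() {
  bool AllSame = simplifyPhiSketch({"x", "", "x"}) ==
                 std::optional<std::string>("x");
  bool Differ = simplifyPhiSketch({"x", "y"}) == std::nullopt;
  return (AllSame && Differ) ? 0 : 1;
}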
-
-const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
- if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
- auto *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
- if (WO && EI->getNumIndices() == 1 && *EI->idx_begin() == 0)
- // EI is an extract from one of our with.overflow intrinsics. Synthesize
- // a semantically equivalent expression instead of an extract value
- // expression.
- return createBinaryExpression(WO->getBinaryOp(), EI->getType(),
- WO->getLHS(), WO->getRHS(), I);
- }
-
- return createAggregateValueExpression(I);
-}
-
-const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
- assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
-
- auto *CI = cast<CmpInst>(I);
- // See if our operands are equal to those of a previous predicate, and if so,
- // if it implies true or false.
- auto Op0 = lookupOperandLeader(CI->getOperand(0));
- auto Op1 = lookupOperandLeader(CI->getOperand(1));
- auto OurPredicate = CI->getPredicate();
- if (shouldSwapOperands(Op0, Op1)) {
- std::swap(Op0, Op1);
- OurPredicate = CI->getSwappedPredicate();
- }
-
- // Avoid processing the same info twice.
- const PredicateBase *LastPredInfo = nullptr;
- // See if we know something about the comparison itself, like it is the target
- // of an assume.
- auto *CmpPI = PredInfo->getPredicateInfoFor(I);
- if (dyn_cast_or_null<PredicateAssume>(CmpPI))
- return createConstantExpression(ConstantInt::getTrue(CI->getType()));
-
- if (Op0 == Op1) {
- // This condition does not depend on predicates, no need to add users
- if (CI->isTrueWhenEqual())
- return createConstantExpression(ConstantInt::getTrue(CI->getType()));
- else if (CI->isFalseWhenEqual())
- return createConstantExpression(ConstantInt::getFalse(CI->getType()));
- }
-
- // NOTE: Because we are comparing both operands here and below, and using
- // previous comparisons, we rely on fact that predicateinfo knows to mark
- // comparisons that use renamed operands as users of the earlier comparisons.
- // It is *not* enough to just mark predicateinfo renamed operands as users of
- // the earlier comparisons, because the *other* operand may have changed in a
- // previous iteration.
- // Example:
- // icmp slt %a, %b
- // %b.0 = ssa.copy(%b)
- // false branch:
- // icmp slt %c, %b.0
-
- // %c and %a may start out equal, and thus, the code below will say the second
- // icmp is false. %c may become equal to something else, and in that case the
- // second icmp *must* be reexamined, but would not be if only the renamed
- // operands are considered users of the icmp.
-
- // *Currently* we only check one level of comparisons back, and only mark one
- // level back as touched when changes happen. If you modify this code to look
- // back farther through comparisons, you *must* mark the appropriate
- // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
- // we know something just from the operands themselves
-
- // See if our operands have predicate info, so that we may be able to derive
- // something from a previous comparison.
- for (const auto &Op : CI->operands()) {
- auto *PI = PredInfo->getPredicateInfoFor(Op);
- if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
- if (PI == LastPredInfo)
- continue;
- LastPredInfo = PI;
- // In phi of ops cases, we may have predicate info that we are evaluating
- // in a different context.
- if (!DT->dominates(PBranch->To, getBlockForValue(I)))
- continue;
- // TODO: Along the false edge, we may know more things too, like icmp of
- // same operands is false.
- // TODO: We only handle actual comparison conditions below, not
- // and/or.
- auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
- if (!BranchCond)
- continue;
- auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
- auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
- auto BranchPredicate = BranchCond->getPredicate();
- if (shouldSwapOperands(BranchOp0, BranchOp1)) {
- std::swap(BranchOp0, BranchOp1);
- BranchPredicate = BranchCond->getSwappedPredicate();
- }
- if (BranchOp0 == Op0 && BranchOp1 == Op1) {
- if (PBranch->TrueEdge) {
- // If we know the previous predicate is true and we are in the true
- // edge then we may be implied true or false.
- if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
- OurPredicate)) {
- addPredicateUsers(PI, I);
- return createConstantExpression(
- ConstantInt::getTrue(CI->getType()));
- }
-
- if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
- OurPredicate)) {
- addPredicateUsers(PI, I);
- return createConstantExpression(
- ConstantInt::getFalse(CI->getType()));
- }
- } else {
- // Just handle the ne and eq cases, where if we have the same
- // operands, we may know something.
- if (BranchPredicate == OurPredicate) {
- addPredicateUsers(PI, I);
- // Same predicate, same ops, we know it was false, so this is false.
- return createConstantExpression(
- ConstantInt::getFalse(CI->getType()));
- } else if (BranchPredicate ==
- CmpInst::getInversePredicate(OurPredicate)) {
- addPredicateUsers(PI, I);
- // Inverse predicate, we know the other was false, so this is true.
- return createConstantExpression(
- ConstantInt::getTrue(CI->getType()));
- }
- }
- }
- }
- }
- // createExpression will take care of SimplifyCmpInst.
- return createExpression(I);
-}
-
-// Substitute and symbolize the value before value numbering.
-const Expression *
-NewGVN::performSymbolicEvaluation(Value *V,
- SmallPtrSetImpl<Value *> &Visited) const {
- const Expression *E = nullptr;
- if (auto *C = dyn_cast<Constant>(V))
- E = createConstantExpression(C);
- else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
- E = createVariableExpression(V);
- } else {
- // TODO: memory intrinsics.
- // TODO: Some day, we should do the forward propagation and reassociation
- // parts of the algorithm.
- auto *I = cast<Instruction>(V);
- switch (I->getOpcode()) {
- case Instruction::ExtractValue:
- case Instruction::InsertValue:
- E = performSymbolicAggrValueEvaluation(I);
- break;
- case Instruction::PHI: {
- SmallVector<ValPair, 3> Ops;
- auto *PN = cast<PHINode>(I);
- for (unsigned i = 0; i < PN->getNumOperands(); ++i)
- Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
- // Sort to ensure the invariant createPHIExpression requires is met.
- sortPHIOps(Ops);
- E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
- } break;
- case Instruction::Call:
- E = performSymbolicCallEvaluation(I);
- break;
- case Instruction::Store:
- E = performSymbolicStoreEvaluation(I);
- break;
- case Instruction::Load:
- E = performSymbolicLoadEvaluation(I);
- break;
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- E = createExpression(I);
- break;
- case Instruction::ICmp:
- case Instruction::FCmp:
- E = performSymbolicCmpEvaluation(I);
- break;
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPTrunc:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::Select:
- case Instruction::ExtractElement:
- case Instruction::InsertElement:
- case Instruction::GetElementPtr:
- E = createExpression(I);
- break;
- case Instruction::ShuffleVector:
- // FIXME: Add support for shufflevector to createExpression.
- return nullptr;
- default:
- return nullptr;
- }
- }
- return E;
-}
-
-// Look up a container of values/instructions in a map, and touch all the
-// instructions in the container. Then erase value from the map.
-template <typename Map, typename KeyType>
-void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
- const auto Result = M.find_as(Key);
- if (Result != M.end()) {
- for (const typename Map::mapped_type::value_type Mapped : Result->second)
- TouchedInstructions.set(InstrToDFSNum(Mapped));
- M.erase(Result);
- }
-}
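touchAndErase is the re-propagation hook: when a value changes congruence class, every instruction recorded as depending on it has its DFS slot set in TouchedInstructions so the main loop revisits it, and the dependency entry is then dropped. A small sketch of that pattern with a plain user map and a boolean touched vector (names are illustrative, not NewGVN's):

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

static std::unordered_map<std::string, std::unordered_set<unsigned>> UsersOf;
static std::vector<bool> TouchedSketch(16, false); // indexed by DFS number

// Record that the instruction with DFS number UserDFS depends on value V.
static void addUserSketch(const std::string &V, unsigned UserDFS) {
  UsersOf[V].insert(UserDFS);
}

// When V changes congruence class, mark every recorded user for revisiting
// and drop the entry, the same shape as touchAndErase above.
static void touchUsersAndEraseSketch(const std::string &V) {
  auto It = UsersOf.find(V);
  if (It == UsersOf.end())
    return;
  for (unsigned DFS : It->second)
    TouchedSketch[DFS] = true;
  UsersOf.erase(It);
}

int main() {
  addUserSketch("a", 3);
  addUserSketch("a", 7);
  touchUsersAndEraseSketch("a");
  return (TouchedSketch[3] && TouchedSketch[7] && !UsersOf.count("a")) ? 0 : 1;
}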
-
-void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
- assert(User && To != User);
- if (isa<Instruction>(To))
- AdditionalUsers[To].insert(User);
-}
-
-void NewGVN::markUsersTouched(Value *V) {
- // Now mark the users as touched.
- for (auto *User : V->users()) {
- assert(isa<Instruction>(User) && "Use of value not within an instruction?");
- TouchedInstructions.set(InstrToDFSNum(User));
- }
- touchAndErase(AdditionalUsers, V);
-}
-
-void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
- LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
- MemoryToUsers[To].insert(U);
-}
-
-void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
- TouchedInstructions.set(MemoryToDFSNum(MA));
-}
-
-void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
- if (isa<MemoryUse>(MA))
- return;
- for (auto U : MA->users())
- TouchedInstructions.set(MemoryToDFSNum(U));
- touchAndErase(MemoryToUsers, MA);
-}
-
-// Add I to the set of users of a given predicate.
-void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
- // Don't add temporary instructions to the user lists.
- if (AllTempInstructions.count(I))
- return;
-
- if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
- PredicateToUsers[PBranch->Condition].insert(I);
- else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
- PredicateToUsers[PAssume->Condition].insert(I);
-}
-
-// Touch all the predicates that depend on this instruction.
-void NewGVN::markPredicateUsersTouched(Instruction *I) {
- touchAndErase(PredicateToUsers, I);
-}
-
-// Mark users affected by a memory leader change.
-void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : CC->memory())
- markMemoryDefTouched(M);
-}
-
-// Touch the instructions that need to be updated after a congruence class has a
-// leader change, and mark changed values.
-void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
- for (auto M : *CC) {
- if (auto *I = dyn_cast<Instruction>(M))
- TouchedInstructions.set(InstrToDFSNum(I));
- LeaderChanges.insert(M);
- }
-}
-
-// Given a range of things that have instruction DFS numbers, this will return
-// the member of the range with the smallest DFS number.
-template <class T, class Range>
-T *NewGVN::getMinDFSOfRange(const Range &R) const {
- std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
- for (const auto X : R) {
- auto DFSNum = InstrToDFSNum(X);
- if (DFSNum < MinDFS.second)
- MinDFS = {X, DFSNum};
- }
- return MinDFS.first;
-}
-
-// This function returns the MemoryAccess that should be the next leader of
-// congruence class CC, under the assumption that the current leader is going to
-// disappear.
-const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
- // TODO: If this ends up too slow, we can maintain a next memory leader like we
- // do for regular leaders.
- // Make sure there will be a leader to find.
- assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
- if (CC->getStoreCount() > 0) {
- if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
- return getMemoryAccess(NL);
- // Find the store with the minimum DFS number.
- auto *V = getMinDFSOfRange<Value>(make_filter_range(
- *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
- return getMemoryAccess(cast<StoreInst>(V));
- }
- assert(CC->getStoreCount() == 0);
-
- // Given our assertion, hitting this part must mean
- // !OldClass->memory_empty()
- if (CC->memory_size() == 1)
- return *CC->memory_begin();
- return getMinDFSOfRange<const MemoryPhi>(CC->memory());
-}
-
-// This function returns the next value leader of a congruence class, under the
-// assumption that the current leader is going away. This should end up being
-// the next most dominating member.
-Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
- // We don't need to sort members if there is only 1, and we don't care about
- // sorting the TOP class because everything either gets out of it or is
- // unreachable.
-
- if (CC->size() == 1 || CC == TOPClass) {
- return *(CC->begin());
- } else if (CC->getNextLeader().first) {
- ++NumGVNAvoidedSortedLeaderChanges;
- return CC->getNextLeader().first;
- } else {
- ++NumGVNSortedLeaderChanges;
- // NOTE: If this ends up too slow, we can maintain a dual structure for
- // member testing/insertion, or keep things mostly sorted, and sort only
- // here, or use SparseBitVector or ....
- return getMinDFSOfRange<Value>(*CC);
- }
-}
-
-// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
-// the memory members, etc for the move.
-//
-// The invariants of this function are:
-//
-// - I must be moving to NewClass from OldClass
-// - The StoreCount of OldClass and NewClass is expected to have been updated
-// for I already if it is a store.
-// - The OldClass memory leader has not been updated yet if I was the leader.
-void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
- MemoryAccess *InstMA,
- CongruenceClass *OldClass,
- CongruenceClass *NewClass) {
- // If the leader is I, and we had a representative MemoryAccess, it should
- // be the MemoryAccess of OldClass.
- assert((!InstMA || !OldClass->getMemoryLeader() ||
- OldClass->getLeader() != I ||
- MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
- MemoryAccessToClass.lookup(InstMA)) &&
- "Representative MemoryAccess mismatch");
- // First, see what happens to the new class
- if (!NewClass->getMemoryLeader()) {
- // Should be a new class, or a store becoming a leader of a new class.
- assert(NewClass->size() == 1 ||
- (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
- NewClass->setMemoryLeader(InstMA);
- // Mark it touched if we didn't just create a singleton
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << NewClass->getID()
- << " due to new memory instruction becoming leader\n");
- markMemoryLeaderChangeTouched(NewClass);
- }
- setMemoryClass(InstMA, NewClass);
- // Now, fixup the old class if necessary
- if (OldClass->getMemoryLeader() == InstMA) {
- if (!OldClass->definesNoMemory()) {
- OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
- LLVM_DEBUG(dbgs() << "Memory class leader change for class "
- << OldClass->getID() << " to "
- << *OldClass->getMemoryLeader()
- << " due to removal of old leader " << *InstMA << "\n");
- markMemoryLeaderChangeTouched(OldClass);
- } else
- OldClass->setMemoryLeader(nullptr);
- }
-}
-
-// Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass and NewClass for the move (including changing leaders, etc).
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
- CongruenceClass *OldClass,
- CongruenceClass *NewClass) {
- if (I == OldClass->getNextLeader().first)
- OldClass->resetNextLeader();
-
- OldClass->erase(I);
- NewClass->insert(I);
-
- if (NewClass->getLeader() != I)
- NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
- // Handle our special casing of stores.
- if (auto *SI = dyn_cast<StoreInst>(I)) {
- OldClass->decStoreCount();
- // Okay, so when do we want to make a store a leader of a class?
- // If we have a store defined by an earlier load, we want the earlier load
- // to lead the class.
- // If we have a store defined by something else, we want the store to lead
- // the class so everything else gets the "something else" as a value.
- // If we have a store as the single member of the class, we want the store
- // as the leader
- if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
- // If it's a store expression we are using, it means we are not equivalent
- // to something earlier.
- if (auto *SE = dyn_cast<StoreExpression>(E)) {
- NewClass->setStoredValue(SE->getStoredValue());
- markValueLeaderChangeTouched(NewClass);
- // Shift the new class leader to be the store
- LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
- << NewClass->getID() << " from "
- << *NewClass->getLeader() << " to " << *SI
- << " because store joined class\n");
- // If we changed the leader, we have to mark it changed because we don't
- // know what it will do to symbolic evaluation.
- NewClass->setLeader(SI);
- }
- // We rely on the code below handling the MemoryAccess change.
- }
- NewClass->incStoreCount();
- }
- // True if there are no memory instructions left in a class that had memory
- // instructions before.
-
- // If it's not a memory use, set the MemoryAccess equivalence
- auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
- if (InstMA)
- moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
- ValueToClass[I] = NewClass;
- // See if we destroyed the class or need to swap leaders.
- if (OldClass->empty() && OldClass != TOPClass) {
- if (OldClass->getDefiningExpr()) {
- LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
- << " from table\n");
- // We erase it as an exact expression to make sure we don't just erase an
- // equivalent one.
- auto Iter = ExpressionToClass.find_as(
- ExactEqualsExpression(*OldClass->getDefiningExpr()));
- if (Iter != ExpressionToClass.end())
- ExpressionToClass.erase(Iter);
-#ifdef EXPENSIVE_CHECKS
- assert(
- (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
- "We erased the expression we just inserted, which should not happen");
-#endif
- }
- } else if (OldClass->getLeader() == I) {
- // When the leader changes, the value numbering of
- // everything may change due to symbolization changes, so we need to
- // reprocess.
- LLVM_DEBUG(dbgs() << "Value class leader change for class "
- << OldClass->getID() << "\n");
- ++NumGVNLeaderChanges;
- // Destroy the stored value if there are no more stores to represent it.
- // Note that this is basically clean up for the expression removal that
- // happens below. If we remove stores from a class, we may leave it as a
- // class of equivalent memory phis.
- if (OldClass->getStoreCount() == 0) {
- if (OldClass->getStoredValue())
- OldClass->setStoredValue(nullptr);
- }
- OldClass->setLeader(getNextValueLeader(OldClass));
- OldClass->resetNextLeader();
- markValueLeaderChangeTouched(OldClass);
- }
-}
-
-// For a given expression, mark the phi of ops instructions that could have
-// changed as a result.
-void NewGVN::markPhiOfOpsChanged(const Expression *E) {
- touchAndErase(ExpressionToPhiOfOps, E);
-}
-
-// Perform congruence finding on a given value numbering expression.
-void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
- // This is guaranteed to return something, since it will at least find
- // TOP.
-
- CongruenceClass *IClass = ValueToClass.lookup(I);
- assert(IClass && "Should have found an IClass");
- // Dead classes should have been eliminated from the mapping.
- assert(!IClass->isDead() && "Found a dead class");
-
- CongruenceClass *EClass = nullptr;
- if (const auto *VE = dyn_cast<VariableExpression>(E)) {
- EClass = ValueToClass.lookup(VE->getVariableValue());
- } else if (isa<DeadExpression>(E)) {
- EClass = TOPClass;
- }
- if (!EClass) {
- auto lookupResult = ExpressionToClass.insert({E, nullptr});
-
- // If it's not in the value table, create a new congruence class.
- if (lookupResult.second) {
- CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
- auto place = lookupResult.first;
- place->second = NewClass;
-
- // Constants and variables should always be made the leader.
- if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- NewClass->setLeader(CE->getConstantValue());
- } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
- StoreInst *SI = SE->getStoreInst();
- NewClass->setLeader(SI);
- NewClass->setStoredValue(SE->getStoredValue());
- // The RepMemoryAccess field will be filled in properly by the
- // moveValueToNewCongruenceClass call.
- } else {
- NewClass->setLeader(I);
- }
- assert(!isa<VariableExpression>(E) &&
- "VariableExpression should have been handled already");
-
- EClass = NewClass;
- LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
- << " using expression " << *E << " at "
- << NewClass->getID() << " and leader "
- << *(NewClass->getLeader()));
- if (NewClass->getStoredValue())
- LLVM_DEBUG(dbgs() << " and stored value "
- << *(NewClass->getStoredValue()));
- LLVM_DEBUG(dbgs() << "\n");
- } else {
- EClass = lookupResult.first->second;
- if (isa<ConstantExpression>(E))
- assert((isa<Constant>(EClass->getLeader()) ||
- (EClass->getStoredValue() &&
- isa<Constant>(EClass->getStoredValue()))) &&
- "Any class with a constant expression should have a "
- "constant leader");
-
- assert(EClass && "Somehow don't have an eclass");
-
- assert(!EClass->isDead() && "We accidentally looked up a dead class");
- }
- }
- bool ClassChanged = IClass != EClass;
- bool LeaderChanged = LeaderChanges.erase(I);
- if (ClassChanged || LeaderChanged) {
- LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
- << *E << "\n");
- if (ClassChanged) {
- moveValueToNewCongruenceClass(I, E, IClass, EClass);
- markPhiOfOpsChanged(E);
- }
-
- markUsersTouched(I);
- if (MemoryAccess *MA = getMemoryAccess(I))
- markMemoryUsersTouched(MA);
- if (auto *CI = dyn_cast<CmpInst>(I))
- markPredicateUsersTouched(CI);
- }
- // If we changed the class of the store, we want to ensure nothing finds the
- // old store expression. In particular, loads do not compare against stored
- // value, so they will find old store expressions (and associated class
- // mappings) if we leave them in the table.
- if (ClassChanged && isa<StoreInst>(I)) {
- auto *OldE = ValueToExpression.lookup(I);
- // It could just be that the old class died. We don't want to erase it if we
- // just moved classes.
- if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
- // Erase this as an exact expression to ensure we don't erase expressions
- // equivalent to it.
- auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
- if (Iter != ExpressionToClass.end())
- ExpressionToClass.erase(Iter);
- }
- }
- ValueToExpression[I] = E;
-}
-
-// Process the fact that Edge (from, to) is reachable, including marking
-// any newly reachable blocks and instructions for processing.
-void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
- // Check if the Edge was reachable before.
- if (ReachableEdges.insert({From, To}).second) {
- // If this block wasn't reachable before, all instructions are touched.
- if (ReachableBlocks.insert(To).second) {
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
- << " marked reachable\n");
- const auto &InstRange = BlockInstRange.lookup(To);
- TouchedInstructions.set(InstRange.first, InstRange.second);
- } else {
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
- << " was reachable, but new edge {"
- << getBlockName(From) << "," << getBlockName(To)
- << "} to it found\n");
-
- // We've made an edge reachable to an existing block, which may
- // impact predicates. Otherwise, only mark the phi nodes as touched, as
- // they are the only things that depend on new edges. Anything using their
- // values will get propagated to if necessary.
- if (MemoryAccess *MemPhi = getMemoryAccess(To))
- TouchedInstructions.set(InstrToDFSNum(MemPhi));
-
- // FIXME: We should just add a union op on a Bitvector and
- // SparseBitVector. We can do it word by word faster than we are doing it
- // here.
- for (auto InstNum : RevisitOnReachabilityChange[To])
- TouchedInstructions.set(InstNum);
- }
- }
-}
-
-// Given a predicate condition (from a switch, cmp, or whatever) and a block,
-// see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond) const {
- auto Result = lookupOperandLeader(Cond);
- return isa<Constant>(Result) ? Result : nullptr;
-}
-
-// Process the outgoing edges of a block for reachability.
-void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
- // Evaluate reachability of terminator instruction.
- Value *Cond;
- BasicBlock *TrueSucc, *FalseSucc;
- if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) {
- Value *CondEvaluated = findConditionEquivalence(Cond);
- if (!CondEvaluated) {
- if (auto *I = dyn_cast<Instruction>(Cond)) {
- const Expression *E = createExpression(I);
- if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
- CondEvaluated = CE->getConstantValue();
- }
- } else if (isa<ConstantInt>(Cond)) {
- CondEvaluated = Cond;
- }
- }
- ConstantInt *CI;
- if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
- if (CI->isOne()) {
- LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to true\n");
- updateReachableEdge(B, TrueSucc);
- } else if (CI->isZero()) {
- LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
- << " evaluated to false\n");
- updateReachableEdge(B, FalseSucc);
- }
- } else {
- updateReachableEdge(B, TrueSucc);
- updateReachableEdge(B, FalseSucc);
- }
- } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- // For switches, propagate the case values into the case
- // destinations.
-
- Value *SwitchCond = SI->getCondition();
- Value *CondEvaluated = findConditionEquivalence(SwitchCond);
- // See if we were able to turn this switch statement into a constant.
- if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
- auto *CondVal = cast<ConstantInt>(CondEvaluated);
- // We should be able to get the case value for this.
- auto Case = *SI->findCaseValue(CondVal);
- if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
- // We proved the value is outside of the range of the case.
- // We can't do anything other than mark the default dest as reachable,
- // and go home.
- updateReachableEdge(B, SI->getDefaultDest());
- return;
- }
- // Now get where it goes and mark it reachable.
- BasicBlock *TargetBlock = Case.getCaseSuccessor();
- updateReachableEdge(B, TargetBlock);
- } else {
- for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = SI->getSuccessor(i);
- updateReachableEdge(B, TargetBlock);
- }
- }
- } else {
- // Otherwise this is either unconditional, or a type we have no
- // idea about. Just mark successors as reachable.
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = TI->getSuccessor(i);
- updateReachableEdge(B, TargetBlock);
- }
-
- // This also may be a memory defining terminator, in which case, set it
- // equivalent only to itself.
- //
- auto *MA = getMemoryAccess(TI);
- if (MA && !isa<MemoryUse>(MA)) {
- auto *CC = ensureLeaderOfMemoryClass(MA);
- if (setMemoryClass(MA, CC))
- markMemoryUsersTouched(MA);
- }
- }
-}
-
-// Remove the PHI of Ops PHI for I
-void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
- InstrDFS.erase(PHITemp);
- // It's still a temp instruction. We keep it in the array so it gets erased.
- // However, it's no longer used by I, or in the block
- TempToBlock.erase(PHITemp);
- RealToTemp.erase(I);
- // We don't remove the users from the phi node uses. This wastes a little
- // time, but such is life. We could use two sets to track which were there
- // at the start of NewGVN and which were added, but right now the cost of
- // tracking is more than the cost of checking for more phi of ops.
-}
-
-// Add PHI Op in BB as a PHI of operations version of ExistingValue.
-void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
- Instruction *ExistingValue) {
- InstrDFS[Op] = InstrToDFSNum(ExistingValue);
- AllTempInstructions.insert(Op);
- TempToBlock[Op] = BB;
- RealToTemp[ExistingValue] = Op;
- // Add all users to phi node use, as they are now uses of the phi of ops phis
- // and may themselves be phi of ops.
- for (auto *U : ExistingValue->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- PHINodeUses.insert(UI);
-}
-
-static bool okayForPHIOfOps(const Instruction *I) {
- if (!EnablePhiOfOps)
- return false;
- return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
- isa<LoadInst>(I);
-}
-
-bool NewGVN::OpIsSafeForPHIOfOpsHelper(
- Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited,
- SmallVectorImpl<Instruction *> &Worklist) {
-
- if (!isa<Instruction>(V))
- return true;
- auto OISIt = OpSafeForPHIOfOps.find(V);
- if (OISIt != OpSafeForPHIOfOps.end())
- return OISIt->second;
-
- // Keep walking until we either dominate the phi block, or hit a phi, or run
- // out of things to check.
- if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
- OpSafeForPHIOfOps.insert({V, true});
- return true;
- }
- // PHI in the same block.
- if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
- OpSafeForPHIOfOps.insert({V, false});
- return false;
- }
-
- auto *OrigI = cast<Instruction>(V);
- for (auto *Op : OrigI->operand_values()) {
- if (!isa<Instruction>(Op))
- continue;
- // Stop now if we find an unsafe operand.
- auto OISIt = OpSafeForPHIOfOps.find(OrigI);
- if (OISIt != OpSafeForPHIOfOps.end()) {
- if (!OISIt->second) {
- OpSafeForPHIOfOps.insert({V, false});
- return false;
- }
- continue;
- }
- if (!Visited.insert(Op).second)
- continue;
- Worklist.push_back(cast<Instruction>(Op));
- }
- return true;
-}
-
-// Return true if this operand will be safe to use for phi of ops.
-//
-// The reason some operands are unsafe is that we are not trying to recursively
-// translate everything back through phi nodes. We actually expect some lookups
-// of expressions to fail. In particular, a lookup may fail when the expression
-// cannot exist in the predecessor. This is true even if the expression, as
-// shown, can be determined to be constant.
-bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
- SmallPtrSetImpl<const Value *> &Visited) {
- SmallVector<Instruction *, 4> Worklist;
- if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
- return false;
- while (!Worklist.empty()) {
- auto *I = Worklist.pop_back_val();
- if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
- return false;
- }
- OpSafeForPHIOfOps.insert({V, true});
- return true;
-}
-
-// Try to find a leader for instruction TransInst, which is a phi translated
-// version of something in our original program. Visited is used to ensure we
-// don't infinite loop during translations of cycles. OrigInst is the
-// instruction in the original program, and PredBB is the predecessor we
-// translated it through.
-Value *NewGVN::findLeaderForInst(Instruction *TransInst,
- SmallPtrSetImpl<Value *> &Visited,
- MemoryAccess *MemAccess, Instruction *OrigInst,
- BasicBlock *PredBB) {
- unsigned IDFSNum = InstrToDFSNum(OrigInst);
- // Make sure it's marked as a temporary instruction.
- AllTempInstructions.insert(TransInst);
- // and make sure anything that tries to add its DFS number is
- // redirected to the instruction we are making a phi of ops
- // for.
- TempToBlock.insert({TransInst, PredBB});
- InstrDFS.insert({TransInst, IDFSNum});
-
- const Expression *E = performSymbolicEvaluation(TransInst, Visited);
- InstrDFS.erase(TransInst);
- AllTempInstructions.erase(TransInst);
- TempToBlock.erase(TransInst);
- if (MemAccess)
- TempToMemory.erase(TransInst);
- if (!E)
- return nullptr;
- auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
- if (!FoundVal) {
- ExpressionToPhiOfOps[E].insert(OrigInst);
- LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
- << " in block " << getBlockName(PredBB) << "\n");
- return nullptr;
- }
- if (auto *SI = dyn_cast<StoreInst>(FoundVal))
- FoundVal = SI->getValueOperand();
- return FoundVal;
-}
-
-// When we see an instruction that is an op of phis, generate the equivalent phi
-// of ops form.
-const Expression *
-NewGVN::makePossiblePHIOfOps(Instruction *I,
- SmallPtrSetImpl<Value *> &Visited) {
- if (!okayForPHIOfOps(I))
- return nullptr;
-
- if (!Visited.insert(I).second)
- return nullptr;
- // For now, we require the instruction be cycle free because we don't
- // *always* create a phi of ops for instructions that could be done as phi
- // of ops, we only do it if we think it is useful. If we did do it all the
- // time, we could remove the cycle free check.
- if (!isCycleFree(I))
- return nullptr;
-
- SmallPtrSet<const Value *, 8> ProcessedPHIs;
- // TODO: We don't do phi translation on memory accesses because it's
- // complicated. For a load, we'd need to be able to simulate a new memoryuse,
- // which we don't have a good way of doing ATM.
- auto *MemAccess = getMemoryAccess(I);
- // If the memory operation is defined by a memory operation in this block that
- // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
- // can't help, as it would still be killed by that memory operation.
- if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
- MemAccess->getDefiningAccess()->getBlock() == I->getParent())
- return nullptr;
-
- // Convert op of phis to phi of ops
- SmallPtrSet<const Value *, 10> VisitedOps;
- SmallVector<Value *, 4> Ops(I->operand_values());
- BasicBlock *SamePHIBlock = nullptr;
- PHINode *OpPHI = nullptr;
- if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
- return nullptr;
- for (auto *Op : Ops) {
- if (!isa<PHINode>(Op)) {
- auto *ValuePHI = RealToTemp.lookup(Op);
- if (!ValuePHI)
- continue;
- LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
- Op = ValuePHI;
- }
- OpPHI = cast<PHINode>(Op);
- if (!SamePHIBlock) {
- SamePHIBlock = getBlockForValue(OpPHI);
- } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
- LLVM_DEBUG(
- dbgs()
- << "PHIs for operands are not all in the same block, aborting\n");
- return nullptr;
- }
- // No point in doing this for one-operand phis.
- if (OpPHI->getNumOperands() == 1) {
- OpPHI = nullptr;
- continue;
- }
- }
-
- if (!OpPHI)
- return nullptr;
-
- SmallVector<ValPair, 4> PHIOps;
- SmallPtrSet<Value *, 4> Deps;
- auto *PHIBlock = getBlockForValue(OpPHI);
- RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
- for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
- auto *PredBB = OpPHI->getIncomingBlock(PredNum);
- Value *FoundVal = nullptr;
- SmallPtrSet<Value *, 4> CurrentDeps;
- // We could just skip unreachable edges entirely but it's tricky to do
- // with rewriting existing phi nodes.
- if (ReachableEdges.count({PredBB, PHIBlock})) {
- // Clone the instruction, create an expression from it that is
- // translated back into the predecessor, and see if we have a leader.
- Instruction *ValueOp = I->clone();
- if (MemAccess)
- TempToMemory.insert({ValueOp, MemAccess});
- bool SafeForPHIOfOps = true;
- VisitedOps.clear();
- for (auto &Op : ValueOp->operands()) {
- auto *OrigOp = &*Op;
- // When these operands change, that could change whether there is a
- // leader for us or not, so we have to add additional users.
- if (isa<PHINode>(Op)) {
- Op = Op->DoPHITranslation(PHIBlock, PredBB);
- if (Op != OrigOp && Op != I)
- CurrentDeps.insert(Op);
- } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
- if (getBlockForValue(ValuePHI) == PHIBlock)
- Op = ValuePHI->getIncomingValueForBlock(PredBB);
- }
- // If we phi-translated the op, it must be safe.
- SafeForPHIOfOps =
- SafeForPHIOfOps &&
- (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
- }
- // FIXME: For those things that are not safe we could generate
- // expressions all the way down, and see if this comes out to a
- // constant. For anything where that is true, and unsafe, we should
- // have made a phi-of-ops (or value numbered it equivalent to something)
- // for the pieces already.
- FoundVal = !SafeForPHIOfOps ? nullptr
- : findLeaderForInst(ValueOp, Visited,
- MemAccess, I, PredBB);
- ValueOp->deleteValue();
- if (!FoundVal) {
- // We failed to find a leader for the current ValueOp, but this might
- // change in case the translated operands change.
- if (SafeForPHIOfOps)
- for (auto Dep : CurrentDeps)
- addAdditionalUsers(Dep, I);
-
- return nullptr;
- }
- Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
- } else {
- LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
- << getBlockName(PredBB)
- << " because the block is unreachable\n");
- FoundVal = UndefValue::get(I->getType());
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- }
-
- PHIOps.push_back({FoundVal, PredBB});
- LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
- << getBlockName(PredBB) << "\n");
- }
- for (auto Dep : Deps)
- addAdditionalUsers(Dep, I);
- sortPHIOps(PHIOps);
- auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
- if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
- LLVM_DEBUG(
- dbgs()
- << "Not creating real PHI of ops because it simplified to existing "
- "value or constant\n");
- return E;
- }
- auto *ValuePHI = RealToTemp.lookup(I);
- bool NewPHI = false;
- if (!ValuePHI) {
- ValuePHI =
- PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
- addPhiOfOps(ValuePHI, PHIBlock, I);
- NewPHI = true;
- NumGVNPHIOfOpsCreated++;
- }
- if (NewPHI) {
- for (auto PHIOp : PHIOps)
- ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
- } else {
- TempToBlock[ValuePHI] = PHIBlock;
- unsigned int i = 0;
- for (auto PHIOp : PHIOps) {
- ValuePHI->setIncomingValue(i, PHIOp.first);
- ValuePHI->setIncomingBlock(i, PHIOp.second);
- ++i;
- }
- }
- RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
- LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
- << "\n");
-
- return E;
-}
-
-// The algorithm initially places the values of the routine in the TOP
-// congruence class. The leader of TOP is the undetermined value `undef`.
-// When the algorithm has finished, values still in TOP are unreachable.
-void NewGVN::initializeCongruenceClasses(Function &F) {
- NextCongruenceNum = 0;
-
- // Note that even though we use the live on entry def as a representative
- // MemoryAccess, it is *not* the same as the actual live on entry def. We
- // have no real equivalent to undef for MemoryAccesses, and so we really
- // should be checking whether the MemoryAccess is top if we want to know if it
- // is equivalent to everything. Otherwise, what this really signifies is that
- // the access reaches all the way back to the beginning of the function.
-
- // Initialize all other instructions to be in TOP class.
- TOPClass = createCongruenceClass(nullptr, nullptr);
- TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
- // The live on entry def gets put into its own class
- MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
- createMemoryClass(MSSA->getLiveOnEntryDef());
-
- for (auto DTN : nodes(DT)) {
- BasicBlock *BB = DTN->getBlock();
- // All MemoryAccesses are equivalent to live on entry to start. They must
- // be initialized to something so that initial changes are noticed. For
- // the maximal answer, we initialize them all to be the same as
- // liveOnEntry.
- auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
- if (MemoryBlockDefs)
- for (const auto &Def : *MemoryBlockDefs) {
- MemoryAccessToClass[&Def] = TOPClass;
- auto *MD = dyn_cast<MemoryDef>(&Def);
- // Insert the memory phis into the member list.
- if (!MD) {
- const MemoryPhi *MP = cast<MemoryPhi>(&Def);
- TOPClass->memory_insert(MP);
- MemoryPhiState.insert({MP, MPS_TOP});
- }
-
- if (MD && isa<StoreInst>(MD->getMemoryInst()))
- TOPClass->incStoreCount();
- }
-
- // FIXME: This is trying to discover which instructions are uses of phi
- // nodes. We should move this into one of the myriad of places that walk
- // all the operands already.
- for (auto &I : *BB) {
- if (isa<PHINode>(&I))
- for (auto *U : I.users())
- if (auto *UInst = dyn_cast<Instruction>(U))
- if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
- PHINodeUses.insert(UInst);
- // Don't insert void terminators into the class. We don't value number
- // them, and they just end up sitting in TOP.
- if (I.isTerminator() && I.getType()->isVoidTy())
- continue;
- TOPClass->insert(&I);
- ValueToClass[&I] = TOPClass;
- }
- }
-
- // Initialize arguments to be in their own unique congruence classes
- for (auto &FA : F.args())
- createSingletonCongruenceClass(&FA);
-}
-
-void NewGVN::cleanupTables() {
- for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
- LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
- << " has " << CongruenceClasses[i]->size()
- << " members\n");
- // Make sure we delete the congruence class (probably worth switching to
- // a unique_ptr at some point).
- delete CongruenceClasses[i];
- CongruenceClasses[i] = nullptr;
- }
-
- // Destroy the value expressions
- SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
- AllTempInstructions.end());
- AllTempInstructions.clear();
-
- // We have to drop all references for everything first, so there are no uses
- // left as we delete them.
- for (auto *I : TempInst) {
- I->dropAllReferences();
- }
-
- while (!TempInst.empty()) {
+ }
+
+ return nullptr;
+}
+
+// Evaluate read only and pure calls, and create an expression result.
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const {
+ auto *CI = cast<CallInst>(I);
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ // Intrinsics with the returned attribute are copies of arguments.
+ if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+ if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+ return Result;
+ return createVariableOrConstant(ReturnedValue);
+ }
+ }
+ if (AA->doesNotAccessMemory(CI)) {
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ } else if (AA->onlyReadsMemory(CI)) {
+ if (auto *MA = MSSA->getMemoryAccess(CI)) {
+ auto *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(MA);
+ return createCallExpression(CI, DefiningAccess);
+ } else // MSSA determined that CI does not access memory.
+ return createCallExpression(CI, TOPClass->getMemoryLeader());
+ }
+ return nullptr;
+}
+
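+// [Editorial sketch, not part of the NewGVN sources] A minimal illustration of
+// why the reaching memory state belongs in a call's value number only when the
+// call reads memory, as in performSymbolicCallEvaluation above. The ToyCallKey
+// tuple and the "memory version" counter are invented for this example; the
+// real pass uses CallExpression and MemorySSA instead.
+#include <string>
+#include <tuple>
+#include <vector>
+
+using ToyCallKey = std::tuple<std::string, std::vector<int>, unsigned>;
+
+static ToyCallKey toyCallKey(const std::string &Callee,
+                             const std::vector<int> &ArgLeaders,
+                             bool ReadsMemory, unsigned MemoryVersion) {
+  // Calls that never touch memory all share one "don't care" memory state,
+  // mirroring the use of the TOP memory leader above; read-only calls must
+  // also agree on the memory state that reaches them to share a key.
+  return {Callee, ArgLeaders, ReadsMemory ? MemoryVersion : 0u};
+}
+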
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+ auto *Result = MemoryAccessToClass.lookup(MA);
+ assert(Result && "Should have found memory class");
+ return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
+// and return true if this is different from what already existed in the table.
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+ CongruenceClass *NewClass) {
+ assert(NewClass &&
+ "Every MemoryAccess should be getting mapped to a non-null class");
+ LLVM_DEBUG(dbgs() << "Setting " << *From);
+ LLVM_DEBUG(dbgs() << " equivalent to congruence class ");
+ LLVM_DEBUG(dbgs() << NewClass->getID()
+ << " with current MemoryAccess leader ");
+ LLVM_DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n");
+
+ auto LookupResult = MemoryAccessToClass.find(From);
+ bool Changed = false;
+ // If it's already in the table, see if the value changed.
+ if (LookupResult != MemoryAccessToClass.end()) {
+ auto *OldClass = LookupResult->second;
+ if (OldClass != NewClass) {
+ // If this is a phi, we have to handle memory member updates.
+ if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+ OldClass->memory_erase(MP);
+ NewClass->memory_insert(MP);
+ // This may have killed the class if it had no non-memory members
+ if (OldClass->getMemoryLeader() == From) {
+ if (OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(nullptr);
+ } else {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of a memory member " << *From
+ << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ }
+ }
+ }
+ // It wasn't equivalent before, and now it is.
+ LookupResult->second = NewClass;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+// Determine if an instruction is cycle-free. That means the values in the
+// instruction don't depend on any expressions that can change value as a result
+// of the instruction. For example, a non-cycle free instruction would be v =
+// phi(0, v+1).
+bool NewGVN::isCycleFree(const Instruction *I) const {
+ // In order to compute cycle-freeness, we do SCC finding on the instruction,
+ // and see what kind of SCC it ends up in. If it is a singleton, it is
+ // cycle-free. If it is not in a singleton, it is only cycle free if the
+ // other members are all phi nodes (as they do not compute anything, they are
+ // copies).
+ auto ICS = InstCycleState.lookup(I);
+ if (ICS == ICS_Unknown) {
+ SCCFinder.Start(I);
+ auto &SCC = SCCFinder.getComponentFor(I);
+ // It's cycle free if it's size 1 or the SCC is *only* phi nodes.
+ if (SCC.size() == 1)
+ InstCycleState.insert({I, ICS_CycleFree});
+ else {
+ bool AllPhis = llvm::all_of(SCC, [](const Value *V) {
+ return isa<PHINode>(V) || isCopyOfAPHI(V);
+ });
+ ICS = AllPhis ? ICS_CycleFree : ICS_Cycle;
+ for (auto *Member : SCC)
+ if (auto *MemberPhi = dyn_cast<PHINode>(Member))
+ InstCycleState.insert({MemberPhi, ICS});
+ }
+ }
+ if (ICS == ICS_Cycle)
+ return false;
+ return true;
+}
+
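+// [Editorial sketch, not part of the NewGVN sources] The cycle-freeness notion
+// above, restated over a toy def-use graph of integer value ids: a value is
+// cycle-free if it cannot reach itself through its operands. The ToyGraph
+// encoding and names are invented; the real pass uses an SCC finder over
+// instructions and also treats all-phi SCCs as cycle-free.
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using ToyGraph = std::unordered_map<int, std::vector<int>>; // value -> operands
+
+static bool toyReaches(const ToyGraph &G, int From, int Target,
+                       std::unordered_set<int> &Seen) {
+  if (!Seen.insert(From).second)
+    return false;                      // already explored this value
+  auto It = G.find(From);
+  if (It == G.end())
+    return false;                      // no operands (constant or argument)
+  for (int Op : It->second)
+    if (Op == Target || toyReaches(G, Op, Target, Seen))
+      return true;
+  return false;
+}
+
+// v = phi(0, v + 1) is *not* cycle-free: a graph {1 -> 2, 2 -> 1} models v
+// feeding the add that feeds v. A straight-line chain would return true.
+static bool toyIsCycleFree(const ToyGraph &G, int V) {
+  std::unordered_set<int> Seen;
+  return !toyReaches(G, V, V, Seen);
+}
+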
+// Evaluate PHI nodes symbolically and create an expression result.
+const Expression *
+NewGVN::performSymbolicPHIEvaluation(ArrayRef<ValPair> PHIOps,
+ Instruction *I,
+ BasicBlock *PHIBlock) const {
+ // True if one of the incoming phi edges is a backedge.
+ bool HasBackedge = false;
+ // All constant tracks the state of whether all the *original* phi operands
+ // were constant. This is really shorthand for "this phi cannot cycle due to
+ // forward propagation", since then any change in the value of the phi is
+ // guaranteed not to later change the value of the phi.
+ // IE it can't be v = phi(undef, v+1)
+ bool OriginalOpsConstant = true;
+ auto *E = cast<PHIExpression>(createPHIExpression(
+ PHIOps, I, PHIBlock, HasBackedge, OriginalOpsConstant));
+ // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
+ // See if all arguments are the same.
+ // We track if any were undef because they need special handling.
+ bool HasUndef = false;
+ auto Filtered = make_filter_range(E->operands(), [&](Value *Arg) {
+ if (isa<UndefValue>(Arg)) {
+ HasUndef = true;
+ return false;
+ }
+ return true;
+ });
+ // If we are left with no operands, it's dead.
+ if (Filtered.empty()) {
+ // If it has undef at this point, it means there are no non-undef arguments,
+ // and thus, the value of the phi node must be undef.
+ if (HasUndef) {
+ LLVM_DEBUG(
+ dbgs() << "PHI Node " << *I
+ << " has no non-undef arguments, valuing it as undef\n");
+ return createConstantExpression(UndefValue::get(I->getType()));
+ }
+
+ LLVM_DEBUG(dbgs() << "No arguments of PHI node " << *I << " are live\n");
+ deleteExpression(E);
+ return createDeadExpression();
+ }
+ Value *AllSameValue = *(Filtered.begin());
+ ++Filtered.begin();
+ // Can't use std::equal here, sadly, because filter.begin moves.
+ if (llvm::all_of(Filtered, [&](Value *Arg) { return Arg == AllSameValue; })) {
+ // In LLVM's non-standard representation of phi nodes, it's possible to have
+ // phi nodes with cycles (IE dependent on other phis that are .... dependent
+ // on the original phi node), especially in weird CFGs where some arguments
+ // are unreachable, or uninitialized along certain paths. This can cause
+ // infinite loops during evaluation. We work around this by not trying to
+ // really evaluate them independently, but instead using a variable
+ // expression to say if one is equivalent to the other.
+ // We also special case undef, so that if we have an undef, we can't use the
+ // common value unless it dominates the phi block.
+ if (HasUndef) {
+ // If we have undef and at least one other value, this is really a
+ // multivalued phi, and we need to know if it's cycle free in order to
+ // evaluate whether we can ignore the undef. The other parts of this are
+ // just shortcuts. If there is no backedge, or all operands are
+ // constants, it also must be cycle free.
+ if (HasBackedge && !OriginalOpsConstant &&
+ !isa<UndefValue>(AllSameValue) && !isCycleFree(I))
+ return E;
+
+ // Only have to check for instructions
+ if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
+ if (!someEquivalentDominates(AllSameInst, I))
+ return E;
+ }
+ // Can't simplify to something that comes later in the iteration.
+ // Otherwise, when and if it changes congruence class, we will never catch
+ // up. We will always be a class behind it.
+ if (isa<Instruction>(AllSameValue) &&
+ InstrToDFSNum(AllSameValue) > InstrToDFSNum(I))
+ return E;
+ NumGVNPhisAllSame++;
+ LLVM_DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
+ << "\n");
+ deleteExpression(E);
+ return createVariableOrConstant(AllSameValue);
+ }
+ return E;
+}
+
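+// [Editorial sketch, not part of the NewGVN sources] The core of the phi
+// simplification above on toy data: drop undef incoming values, and if every
+// remaining operand is the same value, the phi can be valued as that value.
+// std::optional<int> stands in for a possibly-undef incoming value; the
+// dominance and cycle-freeness checks the real code performs are omitted.
+#include <optional>
+#include <vector>
+
+enum class ToyPhiFold { AllUndef, SingleValue, NoFold };
+
+static ToyPhiFold foldToyPhi(const std::vector<std::optional<int>> &Incoming,
+                             int &FoldedValue) {
+  bool SawValue = false;
+  for (const std::optional<int> &Op : Incoming) {
+    if (!Op)
+      continue;                        // undef operand: filtered out, as above
+    if (!SawValue) {
+      SawValue = true;
+      FoldedValue = *Op;               // first real value becomes the candidate
+    } else if (*Op != FoldedValue) {
+      return ToyPhiFold::NoFold;       // two distinct real values: keep the phi
+    }
+  }
+  return SawValue ? ToyPhiFold::SingleValue : ToyPhiFold::AllUndef;
+}
+
+// foldToyPhi({7, std::nullopt, 7}, V) yields SingleValue with V == 7,
+// mirroring "phi(x, undef, x) == x" subject to the caveats above.
+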
+const Expression *
+NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const {
+ if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
+ auto *WO = dyn_cast<WithOverflowInst>(EI->getAggregateOperand());
+ if (WO && EI->getNumIndices() == 1 && *EI->idx_begin() == 0)
+ // EI is an extract from one of our with.overflow intrinsics. Synthesize
+ // a semantically equivalent expression instead of an extract value
+ // expression.
+ return createBinaryExpression(WO->getBinaryOp(), EI->getType(),
+ WO->getLHS(), WO->getRHS(), I);
+ }
+
+ return createAggregateValueExpression(I);
+}
+
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const {
+ assert(isa<CmpInst>(I) && "Expected a cmp instruction.");
+
+ auto *CI = cast<CmpInst>(I);
+ // See if our operands are equal to those of a previous predicate, and if so,
+ // if it implies true or false.
+ auto Op0 = lookupOperandLeader(CI->getOperand(0));
+ auto Op1 = lookupOperandLeader(CI->getOperand(1));
+ auto OurPredicate = CI->getPredicate();
+ if (shouldSwapOperands(Op0, Op1)) {
+ std::swap(Op0, Op1);
+ OurPredicate = CI->getSwappedPredicate();
+ }
+
+ // Avoid processing the same info twice.
+ const PredicateBase *LastPredInfo = nullptr;
+ // See if we know something about the comparison itself, like it is the target
+ // of an assume.
+ auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+ if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+ if (Op0 == Op1) {
+ // This condition does not depend on predicates, no need to add users
+ if (CI->isTrueWhenEqual())
+ return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+ else if (CI->isFalseWhenEqual())
+ return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+ }
+
+ // NOTE: Because we are comparing both operands here and below, and using
+ // previous comparisons, we rely on the fact that predicateinfo knows to mark
+ // comparisons that use renamed operands as users of the earlier comparisons.
+ // It is *not* enough to just mark predicateinfo renamed operands as users of
+ // the earlier comparisons, because the *other* operand may have changed in a
+ // previous iteration.
+ // Example:
+ // icmp slt %a, %b
+ // %b.0 = ssa.copy(%b)
+ // false branch:
+ // icmp slt %c, %b.0
+
+ // %c and %a may start out equal, and thus, the code below will say the second
+ // %icmp is false. %c may become equal to something else, and in that case the
+ // second icmp *must* be reexamined, but would not be if only the renamed
+ // operands are considered users of the icmp.
+
+ // *Currently* we only check one level of comparisons back, and only mark one
+ // level back as touched when changes happen. If you modify this code to look
+ // back farther through comparisons, you *must* mark the appropriate
+ // comparisons as users in PredicateInfo.cpp, or you will cause bugs. See if
+ // we know something just from the operands themselves
+
+ // See if our operands have predicate info, so that we may be able to derive
+ // something from a previous comparison.
+ for (const auto &Op : CI->operands()) {
+ auto *PI = PredInfo->getPredicateInfoFor(Op);
+ if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+ if (PI == LastPredInfo)
+ continue;
+ LastPredInfo = PI;
+ // In phi of ops cases, we may have predicate info that we are evaluating
+ // in a different context.
+ if (!DT->dominates(PBranch->To, getBlockForValue(I)))
+ continue;
+ // TODO: Along the false edge, we may know more things too, like
+ // icmp of same operands is false.
+ // TODO: We only handle actual comparison conditions below, not and/or.
+ auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+ if (!BranchCond)
+ continue;
+ auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+ auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+ auto BranchPredicate = BranchCond->getPredicate();
+ if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+ std::swap(BranchOp0, BranchOp1);
+ BranchPredicate = BranchCond->getSwappedPredicate();
+ }
+ if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+ if (PBranch->TrueEdge) {
+ // If we know the previous predicate is true and we are in the true
+ // edge then we may be implied true or false.
+ if (CmpInst::isImpliedTrueByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+
+ if (CmpInst::isImpliedFalseByMatchingCmp(BranchPredicate,
+ OurPredicate)) {
+ addPredicateUsers(PI, I);
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ }
+ } else {
+ // Just handle the ne and eq cases, where if we have the same
+ // operands, we may know something.
+ if (BranchPredicate == OurPredicate) {
+ addPredicateUsers(PI, I);
+ // Same predicate, same ops, we know it was false, so this is false.
+ return createConstantExpression(
+ ConstantInt::getFalse(CI->getType()));
+ } else if (BranchPredicate ==
+ CmpInst::getInversePredicate(OurPredicate)) {
+ addPredicateUsers(PI, I);
+ // Inverse predicate, we know the other was false, so this is true.
+ return createConstantExpression(
+ ConstantInt::getTrue(CI->getType()));
+ }
+ }
+ }
+ }
+ }
+ // createExpression will take care of simplifyCmpInst.
+ return createExpression(I);
+}
+
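+// [Editorial sketch, not part of the NewGVN sources] The false-edge rule used
+// above, restated over a toy predicate enum: if we reached this block on the
+// *false* edge of "x BranchPred y", a compare of the same operands with the
+// same predicate is false and one with the inverse predicate is true. The
+// enum and inverse table are invented; CmpInst supplies the real predicates.
+#include <optional>
+
+enum class ToyPred { EQ, NE, SLT, SGE };
+
+static ToyPred toyInverse(ToyPred P) {
+  switch (P) {
+  case ToyPred::EQ:  return ToyPred::NE;
+  case ToyPred::NE:  return ToyPred::EQ;
+  case ToyPred::SLT: return ToyPred::SGE;
+  case ToyPred::SGE: return ToyPred::SLT;
+  }
+  return P; // unreachable, keeps compilers quiet
+}
+
+static std::optional<bool> toyFoldOnFalseEdge(ToyPred BranchPred,
+                                              ToyPred OurPred) {
+  if (OurPred == BranchPred)
+    return false;                      // same compare already known false
+  if (OurPred == toyInverse(BranchPred))
+    return true;                       // inverse of a false compare is true
+  return std::nullopt;                 // nothing is implied
+}
+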
+// Substitute and symbolize the value before value numbering.
+const Expression *
+NewGVN::performSymbolicEvaluation(Value *V,
+ SmallPtrSetImpl<Value *> &Visited) const {
+ const Expression *E = nullptr;
+ if (auto *C = dyn_cast<Constant>(V))
+ E = createConstantExpression(C);
+ else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
+ E = createVariableExpression(V);
+ } else {
+ // TODO: memory intrinsics.
+ // TODO: Some day, we should do the forward propagation and reassociation
+ // parts of the algorithm.
+ auto *I = cast<Instruction>(V);
+ switch (I->getOpcode()) {
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ E = performSymbolicAggrValueEvaluation(I);
+ break;
+ case Instruction::PHI: {
+ SmallVector<ValPair, 3> Ops;
+ auto *PN = cast<PHINode>(I);
+ for (unsigned i = 0; i < PN->getNumOperands(); ++i)
+ Ops.push_back({PN->getIncomingValue(i), PN->getIncomingBlock(i)});
+ // Sort to ensure the invariant createPHIExpression requires is met.
+ sortPHIOps(Ops);
+ E = performSymbolicPHIEvaluation(Ops, I, getBlockForValue(I));
+ } break;
+ case Instruction::Call:
+ E = performSymbolicCallEvaluation(I);
+ break;
+ case Instruction::Store:
+ E = performSymbolicStoreEvaluation(I);
+ break;
+ case Instruction::Load:
+ E = performSymbolicLoadEvaluation(I);
+ break;
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ E = createExpression(I);
+ break;
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ E = performSymbolicCmpEvaluation(I);
+ break;
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPTrunc:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::Select:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
+ case Instruction::GetElementPtr:
+ E = createExpression(I);
+ break;
+ case Instruction::ShuffleVector:
+ // FIXME: Add support for shufflevector to createExpression.
+ return nullptr;
+ default:
+ return nullptr;
+ }
+ }
+ return E;
+}
+
+// Look up a container of values/instructions in a map, and touch all the
+// instructions in the container. Then erase the value from the map.
+template <typename Map, typename KeyType>
+void NewGVN::touchAndErase(Map &M, const KeyType &Key) {
+ const auto Result = M.find_as(Key);
+ if (Result != M.end()) {
+ for (const typename Map::mapped_type::value_type Mapped : Result->second)
+ TouchedInstructions.set(InstrToDFSNum(Mapped));
+ M.erase(Result);
+ }
+}
+
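+// [Editorial sketch, not part of the NewGVN sources] touchAndErase in
+// miniature: mark every DFS number recorded under a key as touched, then drop
+// the entry, since the dependency list has been consumed. std::map and
+// std::vector<bool> stand in for the pass's DenseMap and SparseBitVector.
+#include <map>
+#include <vector>
+
+static void toyTouchAndErase(std::map<int, std::vector<unsigned>> &M, int Key,
+                             std::vector<bool> &Touched) {
+  auto It = M.find(Key);
+  if (It == M.end())
+    return;
+  for (unsigned DFSNum : It->second) {
+    if (DFSNum >= Touched.size())
+      Touched.resize(DFSNum + 1, false);
+    Touched[DFSNum] = true;            // schedule that instruction for a revisit
+  }
+  M.erase(It);
+}
+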
+void NewGVN::addAdditionalUsers(Value *To, Value *User) const {
+ assert(User && To != User);
+ if (isa<Instruction>(To))
+ AdditionalUsers[To].insert(User);
+}
+
+void NewGVN::markUsersTouched(Value *V) {
+ // Now mark the users as touched.
+ for (auto *User : V->users()) {
+ assert(isa<Instruction>(User) && "Use of value not within an instruction?");
+ TouchedInstructions.set(InstrToDFSNum(User));
+ }
+ touchAndErase(AdditionalUsers, V);
+}
+
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const {
+ LLVM_DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+ MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+ TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+ if (isa<MemoryUse>(MA))
+ return;
+ for (auto U : MA->users())
+ TouchedInstructions.set(MemoryToDFSNum(U));
+ touchAndErase(MemoryToUsers, MA);
+}
+
+// Add I to the set of users of a given predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const {
+ // Don't add temporary instructions to the user lists.
+ if (AllTempInstructions.count(I))
+ return;
+
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+ PredicateToUsers[PBranch->Condition].insert(I);
+ else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+ PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+ touchAndErase(PredicateToUsers, I);
+}
+
+// Mark users affected by a memory leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : CC->memory())
+ markMemoryDefTouched(M);
+}
+
+// Touch the instructions that need to be updated after a congruence class has a
+// leader change, and mark changed values.
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+ for (auto M : *CC) {
+ if (auto *I = dyn_cast<Instruction>(M))
+ TouchedInstructions.set(InstrToDFSNum(I));
+ LeaderChanges.insert(M);
+ }
+}
+
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number.
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+ std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+ for (const auto X : R) {
+ auto DFSNum = InstrToDFSNum(X);
+ if (DFSNum < MinDFS.second)
+ MinDFS = {X, DFSNum};
+ }
+ return MinDFS.first;
+}
+
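+// [Editorial sketch, not part of the NewGVN sources] The same "member with the
+// smallest DFS number" selection via std::min_element. The DFS numbering map
+// is an assumption standing in for InstrToDFSNum, and Members is assumed to be
+// non-empty with every member numbered, matching the helper's preconditions.
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+static int toyGetMinDFS(const std::vector<int> &Members,
+                        const std::unordered_map<int, unsigned> &DFSNum) {
+  return *std::min_element(Members.begin(), Members.end(),
+                           [&](int A, int B) {
+                             return DFSNum.at(A) < DFSNum.at(B);
+                           });
+}
+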
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+ // TODO: If this ends up too slow, we can maintain a next memory leader like we
+ // do for regular leaders.
+ // Make sure there will be a leader to find.
+ assert(!CC->definesNoMemory() && "Can't get next leader if there is none");
+ if (CC->getStoreCount() > 0) {
+ if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+ return getMemoryAccess(NL);
+ // Find the store with the minimum DFS number.
+ auto *V = getMinDFSOfRange<Value>(make_filter_range(
+ *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+ return getMemoryAccess(cast<StoreInst>(V));
+ }
+ assert(CC->getStoreCount() == 0);
+
+ // Given our assertion, hitting this part must mean
+ // !OldClass->memory_empty()
+ if (CC->memory_size() == 1)
+ return *CC->memory_begin();
+ return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away. This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+ // We don't need to sort members if there is only 1, and we don't care about
+ // sorting the TOP class because everything either gets out of it or is
+ // unreachable.
+
+ if (CC->size() == 1 || CC == TOPClass) {
+ return *(CC->begin());
+ } else if (CC->getNextLeader().first) {
+ ++NumGVNAvoidedSortedLeaderChanges;
+ return CC->getNextLeader().first;
+ } else {
+ ++NumGVNSortedLeaderChanges;
+ // NOTE: If this ends up too slow, we can maintain a dual structure for
+ // member testing/insertion, or keep things mostly sorted, and sort only
+ // here, or use SparseBitVector or ....
+ return getMinDFSOfRange<Value>(*CC);
+ }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// - I must be moving to NewClass from OldClass
+// - The StoreCount of OldClass and NewClass is expected to have been updated
+// for I already if it is a store.
+// - The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+ MemoryAccess *InstMA,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ // If the leader is I, and we had a representative MemoryAccess, it should
+ // be the MemoryAccess of OldClass.
+ assert((!InstMA || !OldClass->getMemoryLeader() ||
+ OldClass->getLeader() != I ||
+ MemoryAccessToClass.lookup(OldClass->getMemoryLeader()) ==
+ MemoryAccessToClass.lookup(InstMA)) &&
+ "Representative MemoryAccess mismatch");
+ // First, see what happens to the new class
+ if (!NewClass->getMemoryLeader()) {
+ // Should be a new class, or a store becoming a leader of a new class.
+ assert(NewClass->size() == 1 ||
+ (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+ NewClass->setMemoryLeader(InstMA);
+ // Mark it touched if we didn't just create a singleton
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << NewClass->getID()
+ << " due to new memory instruction becoming leader\n");
+ markMemoryLeaderChangeTouched(NewClass);
+ }
+ setMemoryClass(InstMA, NewClass);
+ // Now, fixup the old class if necessary
+ if (OldClass->getMemoryLeader() == InstMA) {
+ if (!OldClass->definesNoMemory()) {
+ OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+ LLVM_DEBUG(dbgs() << "Memory class leader change for class "
+ << OldClass->getID() << " to "
+ << *OldClass->getMemoryLeader()
+ << " due to removal of old leader " << *InstMA << "\n");
+ markMemoryLeaderChangeTouched(OldClass);
+ } else
+ OldClass->setMemoryLeader(nullptr);
+ }
+}
+
+// Move a value, currently in OldClass, to be part of NewClass
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
+ CongruenceClass *OldClass,
+ CongruenceClass *NewClass) {
+ if (I == OldClass->getNextLeader().first)
+ OldClass->resetNextLeader();
+
+ OldClass->erase(I);
+ NewClass->insert(I);
+
+ if (NewClass->getLeader() != I)
+ NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
+ // Handle our special casing of stores.
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ OldClass->decStoreCount();
+ // Okay, so when do we want to make a store a leader of a class?
+ // If we have a store defined by an earlier load, we want the earlier load
+ // to lead the class.
+ // If we have a store defined by something else, we want the store to lead
+ // the class so everything else gets the "something else" as a value.
+ // If we have a store as the single member of the class, we want the store
+ // as the leader
+ if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+ // If it's a store expression we are using, it means we are not equivalent
+ // to something earlier.
+ if (auto *SE = dyn_cast<StoreExpression>(E)) {
+ NewClass->setStoredValue(SE->getStoredValue());
+ markValueLeaderChangeTouched(NewClass);
+ // Shift the new class leader to be the store
+ LLVM_DEBUG(dbgs() << "Changing leader of congruence class "
+ << NewClass->getID() << " from "
+ << *NewClass->getLeader() << " to " << *SI
+ << " because store joined class\n");
+ // If we changed the leader, we have to mark it changed because we don't
+ // know what it will do to symbolic evaluation.
+ NewClass->setLeader(SI);
+ }
+ // We rely on the code below handling the MemoryAccess change.
+ }
+ NewClass->incStoreCount();
+ }
+ // True if there are no memory instructions left in a class that had memory
+ // instructions before.
+
+ // If it's not a memory use, set the MemoryAccess equivalence
+ auto *InstMA = dyn_cast_or_null<MemoryDef>(getMemoryAccess(I));
+ if (InstMA)
+ moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
+ ValueToClass[I] = NewClass;
+ // See if we destroyed the class or need to swap leaders.
+ if (OldClass->empty() && OldClass != TOPClass) {
+ if (OldClass->getDefiningExpr()) {
+ LLVM_DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr()
+ << " from table\n");
+ // We erase it as an exact expression to make sure we don't just erase an
+ // equivalent one.
+ auto Iter = ExpressionToClass.find_as(
+ ExactEqualsExpression(*OldClass->getDefiningExpr()));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+#ifdef EXPENSIVE_CHECKS
+ assert(
+ (*OldClass->getDefiningExpr() != *E || ExpressionToClass.lookup(E)) &&
+ "We erased the expression we just inserted, which should not happen");
+#endif
+ }
+ } else if (OldClass->getLeader() == I) {
+ // When the leader changes, the value numbering of
+ // everything may change due to symbolization changes, so we need to
+ // reprocess.
+ LLVM_DEBUG(dbgs() << "Value class leader change for class "
+ << OldClass->getID() << "\n");
+ ++NumGVNLeaderChanges;
+ // Destroy the stored value if there are no more stores to represent it.
+ // Note that this is basically clean up for the expression removal that
+ // happens below. If we remove stores from a class, we may leave it as a
+ // class of equivalent memory phis.
+ if (OldClass->getStoreCount() == 0) {
+ if (OldClass->getStoredValue())
+ OldClass->setStoredValue(nullptr);
+ }
+ OldClass->setLeader(getNextValueLeader(OldClass));
+ OldClass->resetNextLeader();
+ markValueLeaderChangeTouched(OldClass);
+ }
+}
+
+// For a given expression, mark the phi of ops instructions that could have
+// changed as a result.
+void NewGVN::markPhiOfOpsChanged(const Expression *E) {
+ touchAndErase(ExpressionToPhiOfOps, E);
+}
+
+// Perform congruence finding on a given value numbering expression.
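+// The flow is: map E to a congruence class (creating a new class keyed by E if
+// none exists), move I into that class if it differs from I's current one, and
+// touch I's users (plus memory and predicate users) when the class or leader
+// changed.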
+void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
+ // This is guaranteed to return something, since it will at least find
+ // TOP.
+
+ CongruenceClass *IClass = ValueToClass.lookup(I);
+ assert(IClass && "Should have found a IClass");
+ // Dead classes should have been eliminated from the mapping.
+ assert(!IClass->isDead() && "Found a dead class");
+
+ CongruenceClass *EClass = nullptr;
+ if (const auto *VE = dyn_cast<VariableExpression>(E)) {
+ EClass = ValueToClass.lookup(VE->getVariableValue());
+ } else if (isa<DeadExpression>(E)) {
+ EClass = TOPClass;
+ }
+ if (!EClass) {
+ auto lookupResult = ExpressionToClass.insert({E, nullptr});
+
+ // If it's not in the value table, create a new congruence class.
+ if (lookupResult.second) {
+ CongruenceClass *NewClass = createCongruenceClass(nullptr, E);
+ auto place = lookupResult.first;
+ place->second = NewClass;
+
+ // Constants and variables should always be made the leader.
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ NewClass->setLeader(CE->getConstantValue());
+ } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
+ StoreInst *SI = SE->getStoreInst();
+ NewClass->setLeader(SI);
+ NewClass->setStoredValue(SE->getStoredValue());
+ // The RepMemoryAccess field will be filled in properly by the
+ // moveValueToNewCongruenceClass call.
+ } else {
+ NewClass->setLeader(I);
+ }
+ assert(!isa<VariableExpression>(E) &&
+ "VariableExpression should have been handled already");
+
+ EClass = NewClass;
+ LLVM_DEBUG(dbgs() << "Created new congruence class for " << *I
+ << " using expression " << *E << " at "
+ << NewClass->getID() << " and leader "
+ << *(NewClass->getLeader()));
+ if (NewClass->getStoredValue())
+ LLVM_DEBUG(dbgs() << " and stored value "
+ << *(NewClass->getStoredValue()));
+ LLVM_DEBUG(dbgs() << "\n");
+ } else {
+ EClass = lookupResult.first->second;
+ if (isa<ConstantExpression>(E))
+ assert((isa<Constant>(EClass->getLeader()) ||
+ (EClass->getStoredValue() &&
+ isa<Constant>(EClass->getStoredValue()))) &&
+ "Any class with a constant expression should have a "
+ "constant leader");
+
+ assert(EClass && "Somehow don't have an eclass");
+
+ assert(!EClass->isDead() && "We accidentally looked up a dead class");
+ }
+ }
+ bool ClassChanged = IClass != EClass;
+ bool LeaderChanged = LeaderChanges.erase(I);
+ if (ClassChanged || LeaderChanged) {
+ LLVM_DEBUG(dbgs() << "New class " << EClass->getID() << " for expression "
+ << *E << "\n");
+ if (ClassChanged) {
+ moveValueToNewCongruenceClass(I, E, IClass, EClass);
+ markPhiOfOpsChanged(E);
+ }
+
+ markUsersTouched(I);
+ if (MemoryAccess *MA = getMemoryAccess(I))
+ markMemoryUsersTouched(MA);
+ if (auto *CI = dyn_cast<CmpInst>(I))
+ markPredicateUsersTouched(CI);
+ }
+ // If we changed the class of the store, we want to ensure nothing finds the
+ // old store expression. In particular, loads do not compare against the stored
+ // value, so they will find old store expressions (and associated class
+ // mappings) if we leave them in the table.
+ if (ClassChanged && isa<StoreInst>(I)) {
+ auto *OldE = ValueToExpression.lookup(I);
+ // It could just be that the old class died. We don't want to erase it if we
+ // just moved classes.
+ if (OldE && isa<StoreExpression>(OldE) && *E != *OldE) {
+ // Erase this as an exact expression to ensure we don't erase expressions
+ // equivalent to it.
+ auto Iter = ExpressionToClass.find_as(ExactEqualsExpression(*OldE));
+ if (Iter != ExpressionToClass.end())
+ ExpressionToClass.erase(Iter);
+ }
+ }
+ ValueToExpression[I] = E;
+}
+
+// Process the fact that Edge (from, to) is reachable, including marking
+// any newly reachable blocks and instructions for processing.
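+// If the destination block becomes reachable for the first time, every
+// instruction in it is touched; if only the edge is new, just the MemoryPhi
+// (if any) and the instructions registered in RevisitOnReachabilityChange are
+// touched.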
+void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
+ // Check if the Edge was reachable before.
+ if (ReachableEdges.insert({From, To}).second) {
+ // If this block wasn't reachable before, all instructions are touched.
+ if (ReachableBlocks.insert(To).second) {
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " marked reachable\n");
+ const auto &InstRange = BlockInstRange.lookup(To);
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ } else {
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(To)
+ << " was reachable, but new edge {"
+ << getBlockName(From) << "," << getBlockName(To)
+ << "} to it found\n");
+
+ // We've made an edge reachable to an existing block, which may
+ // impact predicates. Otherwise, only mark the phi nodes as touched, as
+ // they are the only things that depend on new edges. Anything using their
+ // values will have changes propagated to it if necessary.
+ if (MemoryAccess *MemPhi = getMemoryAccess(To))
+ TouchedInstructions.set(InstrToDFSNum(MemPhi));
+
+ // FIXME: We should just add a union op on a Bitvector and
+ // SparseBitVector. We can do it word by word faster than we are doing it
+ // here.
+ for (auto InstNum : RevisitOnReachabilityChange[To])
+ TouchedInstructions.set(InstNum);
+ }
+ }
+}
+
+// Given a predicate condition (from a switch, cmp, or whatever) and a block,
+// see if we know some constant value for it already.
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+ auto Result = lookupOperandLeader(Cond);
+ return isa<Constant>(Result) ? Result : nullptr;
+}
+
+// Process the outgoing edges of a block for reachability.
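+// Branches and switches whose condition value-numbers to a ConstantInt only
+// get the taken edge marked reachable; everything else conservatively marks
+// every successor edge reachable.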
+void NewGVN::processOutgoingEdges(Instruction *TI, BasicBlock *B) {
+ // Evaluate reachability of terminator instruction.
+ Value *Cond;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(TI, m_Br(m_Value(Cond), TrueSucc, FalseSucc))) {
+ Value *CondEvaluated = findConditionEquivalence(Cond);
+ if (!CondEvaluated) {
+ if (auto *I = dyn_cast<Instruction>(Cond)) {
+ const Expression *E = createExpression(I);
+ if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
+ CondEvaluated = CE->getConstantValue();
+ }
+ } else if (isa<ConstantInt>(Cond)) {
+ CondEvaluated = Cond;
+ }
+ }
+ ConstantInt *CI;
+ if (CondEvaluated && (CI = dyn_cast<ConstantInt>(CondEvaluated))) {
+ if (CI->isOne()) {
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to true\n");
+ updateReachableEdge(B, TrueSucc);
+ } else if (CI->isZero()) {
+ LLVM_DEBUG(dbgs() << "Condition for Terminator " << *TI
+ << " evaluated to false\n");
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else {
+ updateReachableEdge(B, TrueSucc);
+ updateReachableEdge(B, FalseSucc);
+ }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ // For switches, propagate the case values into the case
+ // destinations.
+
+ Value *SwitchCond = SI->getCondition();
+ Value *CondEvaluated = findConditionEquivalence(SwitchCond);
+ // See if we were able to turn this switch statement into a constant.
+ if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
+ auto *CondVal = cast<ConstantInt>(CondEvaluated);
+ // We should be able to get the case value for this.
+ auto Case = *SI->findCaseValue(CondVal);
+ if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
+ // We proved the value is outside of the range of the case.
+ // We can't do anything other than mark the default dest as reachable,
+ // and go home.
+ updateReachableEdge(B, SI->getDefaultDest());
+ return;
+ }
+ // Now get where it goes and mark it reachable.
+ BasicBlock *TargetBlock = Case.getCaseSuccessor();
+ updateReachableEdge(B, TargetBlock);
+ } else {
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+ }
+ } else {
+ // Otherwise this is either unconditional, or a type we have no
+ // idea about. Just mark successors as reachable.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = TI->getSuccessor(i);
+ updateReachableEdge(B, TargetBlock);
+ }
+
+ // This also may be a memory defining terminator, in which case, set it
+ // equivalent only to itself.
+ //
+ auto *MA = getMemoryAccess(TI);
+ if (MA && !isa<MemoryUse>(MA)) {
+ auto *CC = ensureLeaderOfMemoryClass(MA);
+ if (setMemoryClass(MA, CC))
+ markMemoryUsersTouched(MA);
+ }
+ }
+}
+
+// Remove the PHI of Ops PHI for I
+void NewGVN::removePhiOfOps(Instruction *I, PHINode *PHITemp) {
+ InstrDFS.erase(PHITemp);
+ // It's still a temp instruction. We keep it in the array so it gets erased.
+ // However, it's no longer used by I, or in the block
+ TempToBlock.erase(PHITemp);
+ RealToTemp.erase(I);
+ // We don't remove the users from the phi node uses. This wastes a little
+ // time, but such is life. We could use two sets to track which were there
+ // at the start of NewGVN, and which were added, but right now the cost of
+ // tracking is more than the cost of checking for more phi of ops.
+}
+
+// Add PHI Op in BB as a PHI of operations version of ExistingValue.
+void NewGVN::addPhiOfOps(PHINode *Op, BasicBlock *BB,
+ Instruction *ExistingValue) {
+ InstrDFS[Op] = InstrToDFSNum(ExistingValue);
+ AllTempInstructions.insert(Op);
+ TempToBlock[Op] = BB;
+ RealToTemp[ExistingValue] = Op;
+ // Add all users to phi node use, as they are now uses of the phi of ops phis
+ // and may themselves be phi of ops.
+ for (auto *U : ExistingValue->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ PHINodeUses.insert(UI);
+}
+
+static bool okayForPHIOfOps(const Instruction *I) {
+ if (!EnablePhiOfOps)
+ return false;
+ return isa<BinaryOperator>(I) || isa<SelectInst>(I) || isa<CmpInst>(I) ||
+ isa<LoadInst>(I);
+}
+
+bool NewGVN::OpIsSafeForPHIOfOpsHelper(
+ Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited,
+ SmallVectorImpl<Instruction *> &Worklist) {
+
+ if (!isa<Instruction>(V))
+ return true;
+ auto OISIt = OpSafeForPHIOfOps.find(V);
+ if (OISIt != OpSafeForPHIOfOps.end())
+ return OISIt->second;
+
+ // Keep walking until we either dominate the phi block, or hit a phi, or run
+ // out of things to check.
+ if (DT->properlyDominates(getBlockForValue(V), PHIBlock)) {
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+ }
+ // PHI in the same block.
+ if (isa<PHINode>(V) && getBlockForValue(V) == PHIBlock) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+
+ auto *OrigI = cast<Instruction>(V);
+ for (auto *Op : OrigI->operand_values()) {
+ if (!isa<Instruction>(Op))
+ continue;
+ // Stop now if we find an unsafe operand.
+ auto OISIt = OpSafeForPHIOfOps.find(OrigI);
+ if (OISIt != OpSafeForPHIOfOps.end()) {
+ if (!OISIt->second) {
+ OpSafeForPHIOfOps.insert({V, false});
+ return false;
+ }
+ continue;
+ }
+ if (!Visited.insert(Op).second)
+ continue;
+ Worklist.push_back(cast<Instruction>(Op));
+ }
+ return true;
+}
+
+// Return true if this operand will be safe to use for phi of ops.
+//
+// The reason some operands are unsafe is that we are not trying to recursively
+// translate everything back through phi nodes. We actually expect some lookups
+// of expressions to fail. In particular, a lookup can fail where the expression
+// cannot exist in the predecessor. This is true even if the expression, as shown, can
+// be determined to be constant.
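+// Concretely, an operand is unsafe when it (or something it transitively
+// depends on) is a PHI in PHIBlock itself, since such a value cannot be
+// translated into the predecessors; operands defined in blocks that properly
+// dominate PHIBlock are always safe.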
+bool NewGVN::OpIsSafeForPHIOfOps(Value *V, const BasicBlock *PHIBlock,
+ SmallPtrSetImpl<const Value *> &Visited) {
+ SmallVector<Instruction *, 4> Worklist;
+ if (!OpIsSafeForPHIOfOpsHelper(V, PHIBlock, Visited, Worklist))
+ return false;
+ while (!Worklist.empty()) {
+ auto *I = Worklist.pop_back_val();
+ if (!OpIsSafeForPHIOfOpsHelper(I, PHIBlock, Visited, Worklist))
+ return false;
+ }
+ OpSafeForPHIOfOps.insert({V, true});
+ return true;
+}
+
+// Try to find a leader for instruction TransInst, which is a phi translated
+// version of something in our original program. Visited is used to ensure we
+// don't infinite loop during translations of cycles. OrigInst is the
+// instruction in the original program, and PredBB is the predecessor we
+// translated it through.
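+// The translated clone is temporarily registered in AllTempInstructions,
+// TempToBlock and InstrDFS so symbolic evaluation treats it like a real
+// instruction; those entries (and any TempToMemory entry) are removed again
+// before returning.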
+Value *NewGVN::findLeaderForInst(Instruction *TransInst,
+ SmallPtrSetImpl<Value *> &Visited,
+ MemoryAccess *MemAccess, Instruction *OrigInst,
+ BasicBlock *PredBB) {
+ unsigned IDFSNum = InstrToDFSNum(OrigInst);
+ // Make sure it's marked as a temporary instruction.
+ AllTempInstructions.insert(TransInst);
+ // and make sure anything that tries to add its DFS number is
+ // redirected to the instruction we are making a phi of ops
+ // for.
+ TempToBlock.insert({TransInst, PredBB});
+ InstrDFS.insert({TransInst, IDFSNum});
+
+ const Expression *E = performSymbolicEvaluation(TransInst, Visited);
+ InstrDFS.erase(TransInst);
+ AllTempInstructions.erase(TransInst);
+ TempToBlock.erase(TransInst);
+ if (MemAccess)
+ TempToMemory.erase(TransInst);
+ if (!E)
+ return nullptr;
+ auto *FoundVal = findPHIOfOpsLeader(E, OrigInst, PredBB);
+ if (!FoundVal) {
+ ExpressionToPhiOfOps[E].insert(OrigInst);
+ LLVM_DEBUG(dbgs() << "Cannot find phi of ops operand for " << *TransInst
+ << " in block " << getBlockName(PredBB) << "\n");
+ return nullptr;
+ }
+ if (auto *SI = dyn_cast<StoreInst>(FoundVal))
+ FoundVal = SI->getValueOperand();
+ return FoundVal;
+}
+
+// When we see an instruction that is an op of phis, generate the equivalent phi
+// of ops form.
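+// That is, for an instruction whose operands are PHIs in some block, we clone
+// it into each reachable predecessor (phi-translating the operands), look up a
+// leader for each clone, and build a new PHI over those leaders.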
+const Expression *
+NewGVN::makePossiblePHIOfOps(Instruction *I,
+ SmallPtrSetImpl<Value *> &Visited) {
+ if (!okayForPHIOfOps(I))
+ return nullptr;
+
+ if (!Visited.insert(I).second)
+ return nullptr;
+ // For now, we require the instruction be cycle free because we don't
+ // *always* create a phi of ops for instructions that could be done as phi
+ // of ops; we only do it if we think it is useful. If we did do it all the
+ // time, we could remove the cycle free check.
+ if (!isCycleFree(I))
+ return nullptr;
+
+ SmallPtrSet<const Value *, 8> ProcessedPHIs;
+ // TODO: We don't do phi translation on memory accesses because it's
+ // complicated. For a load, we'd need to be able to simulate a new memoryuse,
+ // which we don't have a good way of doing ATM.
+ auto *MemAccess = getMemoryAccess(I);
+ // If the memory operation is defined by a memory operation in this block that
+ // isn't a MemoryPhi, transforming the pointer backwards through a scalar phi
+ // can't help, as it would still be killed by that memory operation.
+ if (MemAccess && !isa<MemoryPhi>(MemAccess->getDefiningAccess()) &&
+ MemAccess->getDefiningAccess()->getBlock() == I->getParent())
+ return nullptr;
+
+ // Convert op of phis to phi of ops
+ SmallPtrSet<const Value *, 10> VisitedOps;
+ SmallVector<Value *, 4> Ops(I->operand_values());
+ BasicBlock *SamePHIBlock = nullptr;
+ PHINode *OpPHI = nullptr;
+ if (!DebugCounter::shouldExecute(PHIOfOpsCounter))
+ return nullptr;
+ for (auto *Op : Ops) {
+ if (!isa<PHINode>(Op)) {
+ auto *ValuePHI = RealToTemp.lookup(Op);
+ if (!ValuePHI)
+ continue;
+ LLVM_DEBUG(dbgs() << "Found possible dependent phi of ops\n");
+ Op = ValuePHI;
+ }
+ OpPHI = cast<PHINode>(Op);
+ if (!SamePHIBlock) {
+ SamePHIBlock = getBlockForValue(OpPHI);
+ } else if (SamePHIBlock != getBlockForValue(OpPHI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "PHIs for operands are not all in the same block, aborting\n");
+ return nullptr;
+ }
+ // No point in doing this for one-operand phis.
+ if (OpPHI->getNumOperands() == 1) {
+ OpPHI = nullptr;
+ continue;
+ }
+ }
+
+ if (!OpPHI)
+ return nullptr;
+
+ SmallVector<ValPair, 4> PHIOps;
+ SmallPtrSet<Value *, 4> Deps;
+ auto *PHIBlock = getBlockForValue(OpPHI);
+ RevisitOnReachabilityChange[PHIBlock].reset(InstrToDFSNum(I));
+ for (unsigned PredNum = 0; PredNum < OpPHI->getNumOperands(); ++PredNum) {
+ auto *PredBB = OpPHI->getIncomingBlock(PredNum);
+ Value *FoundVal = nullptr;
+ SmallPtrSet<Value *, 4> CurrentDeps;
+ // We could just skip unreachable edges entirely but it's tricky to do
+ // with rewriting existing phi nodes.
+ if (ReachableEdges.count({PredBB, PHIBlock})) {
+ // Clone the instruction, create an expression from it that is
+ // translated back into the predecessor, and see if we have a leader.
+ Instruction *ValueOp = I->clone();
+ if (MemAccess)
+ TempToMemory.insert({ValueOp, MemAccess});
+ bool SafeForPHIOfOps = true;
+ VisitedOps.clear();
+ for (auto &Op : ValueOp->operands()) {
+ auto *OrigOp = &*Op;
+ // When these operands change, it could change whether there is a
+ // leader for us or not, so we have to add additional users.
+ if (isa<PHINode>(Op)) {
+ Op = Op->DoPHITranslation(PHIBlock, PredBB);
+ if (Op != OrigOp && Op != I)
+ CurrentDeps.insert(Op);
+ } else if (auto *ValuePHI = RealToTemp.lookup(Op)) {
+ if (getBlockForValue(ValuePHI) == PHIBlock)
+ Op = ValuePHI->getIncomingValueForBlock(PredBB);
+ }
+ // If we phi-translated the op, it must be safe.
+ SafeForPHIOfOps =
+ SafeForPHIOfOps &&
+ (Op != OrigOp || OpIsSafeForPHIOfOps(Op, PHIBlock, VisitedOps));
+ }
+ // FIXME: For those things that are not safe we could generate
+ // expressions all the way down, and see if this comes out to a
+ // constant. For anything where that is true, and unsafe, we should
+ // have made a phi-of-ops (or value numbered it equivalent to something)
+ // for the pieces already.
+ FoundVal = !SafeForPHIOfOps ? nullptr
+ : findLeaderForInst(ValueOp, Visited,
+ MemAccess, I, PredBB);
+ ValueOp->deleteValue();
+ if (!FoundVal) {
+ // We failed to find a leader for the current ValueOp, but this might
+ // change if the translated operands change.
+ if (SafeForPHIOfOps)
+ for (auto Dep : CurrentDeps)
+ addAdditionalUsers(Dep, I);
+
+ return nullptr;
+ }
+ Deps.insert(CurrentDeps.begin(), CurrentDeps.end());
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping phi of ops operand for incoming block "
+ << getBlockName(PredBB)
+ << " because the block is unreachable\n");
+ FoundVal = UndefValue::get(I->getType());
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ }
+
+ PHIOps.push_back({FoundVal, PredBB});
+ LLVM_DEBUG(dbgs() << "Found phi of ops operand " << *FoundVal << " in "
+ << getBlockName(PredBB) << "\n");
+ }
+ for (auto Dep : Deps)
+ addAdditionalUsers(Dep, I);
+ sortPHIOps(PHIOps);
+ auto *E = performSymbolicPHIEvaluation(PHIOps, I, PHIBlock);
+ if (isa<ConstantExpression>(E) || isa<VariableExpression>(E)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Not creating real PHI of ops because it simplified to existing "
+ "value or constant\n");
+ return E;
+ }
+ auto *ValuePHI = RealToTemp.lookup(I);
+ bool NewPHI = false;
+ if (!ValuePHI) {
+ ValuePHI =
+ PHINode::Create(I->getType(), OpPHI->getNumOperands(), "phiofops");
+ addPhiOfOps(ValuePHI, PHIBlock, I);
+ NewPHI = true;
+ NumGVNPHIOfOpsCreated++;
+ }
+ if (NewPHI) {
+ for (auto PHIOp : PHIOps)
+ ValuePHI->addIncoming(PHIOp.first, PHIOp.second);
+ } else {
+ TempToBlock[ValuePHI] = PHIBlock;
+ unsigned int i = 0;
+ for (auto PHIOp : PHIOps) {
+ ValuePHI->setIncomingValue(i, PHIOp.first);
+ ValuePHI->setIncomingBlock(i, PHIOp.second);
+ ++i;
+ }
+ }
+ RevisitOnReachabilityChange[PHIBlock].set(InstrToDFSNum(I));
+ LLVM_DEBUG(dbgs() << "Created phi of ops " << *ValuePHI << " for " << *I
+ << "\n");
+
+ return E;
+}
+
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
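+// Function arguments are the exception: each one starts in its own singleton
+// congruence class, since nothing can be assumed about their equivalence.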
+void NewGVN::initializeCongruenceClasses(Function &F) {
+ NextCongruenceNum = 0;
+
+ // Note that even though we use the live on entry def as a representative
+ // MemoryAccess, it is *not* the same as the actual live on entry def. We
+ // have no real equivalent to undef for MemoryAccesses, and so we really
+ // should be checking whether the MemoryAccess is top if we want to know if it
+ // is equivalent to everything. Otherwise, what this really signifies is that
+ // the access reaches all the way back to the beginning of the function.
+
+ // Initialize all other instructions to be in TOP class.
+ TOPClass = createCongruenceClass(nullptr, nullptr);
+ TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+ // The live on entry def gets put into its own class.
+ MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+ createMemoryClass(MSSA->getLiveOnEntryDef());
+
+ for (auto DTN : nodes(DT)) {
+ BasicBlock *BB = DTN->getBlock();
+ // All MemoryAccesses are equivalent to live on entry to start. They must
+ // be initialized to something so that initial changes are noticed. For
+ // the maximal answer, we initialize them all to be the same as
+ // liveOnEntry.
+ auto *MemoryBlockDefs = MSSA->getBlockDefs(BB);
+ if (MemoryBlockDefs)
+ for (const auto &Def : *MemoryBlockDefs) {
+ MemoryAccessToClass[&Def] = TOPClass;
+ auto *MD = dyn_cast<MemoryDef>(&Def);
+ // Insert the memory phis into the member list.
+ if (!MD) {
+ const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+ TOPClass->memory_insert(MP);
+ MemoryPhiState.insert({MP, MPS_TOP});
+ }
+
+ if (MD && isa<StoreInst>(MD->getMemoryInst()))
+ TOPClass->incStoreCount();
+ }
+
+ // FIXME: This is trying to discover which instructions are uses of phi
+ // nodes. We should move this into one of the myriad of places that walk
+ // all the operands already.
+ for (auto &I : *BB) {
+ if (isa<PHINode>(&I))
+ for (auto *U : I.users())
+ if (auto *UInst = dyn_cast<Instruction>(U))
+ if (InstrToDFSNum(UInst) != 0 && okayForPHIOfOps(UInst))
+ PHINodeUses.insert(UInst);
+ // Don't insert void terminators into the class. We don't value number
+ // them, and they just end up sitting in TOP.
+ if (I.isTerminator() && I.getType()->isVoidTy())
+ continue;
+ TOPClass->insert(&I);
+ ValueToClass[&I] = TOPClass;
+ }
+ }
+
+ // Initialize arguments to be in their own unique congruence classes
+ for (auto &FA : F.args())
+ createSingletonCongruenceClass(&FA);
+}
+
+void NewGVN::cleanupTables() {
+ for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
+ LLVM_DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+ << " has " << CongruenceClasses[i]->size()
+ << " members\n");
+ // Make sure we delete the congruence class (probably worth switching to
+ // a unique_ptr at some point).
+ delete CongruenceClasses[i];
+ CongruenceClasses[i] = nullptr;
+ }
+
+ // Destroy the value expressions
+ SmallVector<Instruction *, 8> TempInst(AllTempInstructions.begin(),
+ AllTempInstructions.end());
+ AllTempInstructions.clear();
+
+ // We have to drop all references for everything first, so there are no uses
+ // left as we delete them.
+ for (auto *I : TempInst) {
+ I->dropAllReferences();
+ }
+
+ while (!TempInst.empty()) {
auto *I = TempInst.pop_back_val();
- I->deleteValue();
- }
-
- ValueToClass.clear();
- ArgRecycler.clear(ExpressionAllocator);
- ExpressionAllocator.Reset();
- CongruenceClasses.clear();
- ExpressionToClass.clear();
- ValueToExpression.clear();
- RealToTemp.clear();
- AdditionalUsers.clear();
- ExpressionToPhiOfOps.clear();
- TempToBlock.clear();
- TempToMemory.clear();
- PHINodeUses.clear();
- OpSafeForPHIOfOps.clear();
- ReachableBlocks.clear();
- ReachableEdges.clear();
-#ifndef NDEBUG
- ProcessedCount.clear();
-#endif
- InstrDFS.clear();
- InstructionsToErase.clear();
- DFSToInstr.clear();
- BlockInstRange.clear();
- TouchedInstructions.clear();
- MemoryAccessToClass.clear();
- PredicateToUsers.clear();
- MemoryToUsers.clear();
- RevisitOnReachabilityChange.clear();
-}
-
-// Assign local DFS number mapping to instructions, and leave space for Value
-// PHI's.
-std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
- unsigned Start) {
- unsigned End = Start;
- if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
- InstrDFS[MemPhi] = End++;
- DFSToInstr.emplace_back(MemPhi);
- }
-
- // Then the real block goes next.
- for (auto &I : *B) {
- // There's no need to call isInstructionTriviallyDead more than once on
- // an instruction. Therefore, once we know that an instruction is dead
- // we change its DFS number so that it doesn't get value numbered.
- if (isInstructionTriviallyDead(&I, TLI)) {
- InstrDFS[&I] = 0;
- LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
- markInstructionForDeletion(&I);
- continue;
- }
- if (isa<PHINode>(&I))
- RevisitOnReachabilityChange[B].set(End);
- InstrDFS[&I] = End++;
- DFSToInstr.emplace_back(&I);
- }
-
- // All of the range functions take half-open ranges (open on the end side).
- // So we do not subtract one from count, because at this point it is one
- // greater than the last instruction.
- return std::make_pair(Start, End);
-}
-
-void NewGVN::updateProcessedCount(const Value *V) {
-#ifndef NDEBUG
- if (ProcessedCount.count(V) == 0) {
- ProcessedCount.insert({V, 1});
- } else {
- ++ProcessedCount[V];
- assert(ProcessedCount[V] < 100 &&
- "Seem to have processed the same Value a lot");
- }
-#endif
-}
-
-// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
-void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
- // If all the arguments are the same, the MemoryPhi has the same value as the
- // argument. Filter out unreachable blocks and self phis from our operands.
- // TODO: We could do cycle-checking on the memory phis to allow valueizing for
- // self-phi checking.
- const BasicBlock *PHIBlock = MP->getBlock();
- auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
- return cast<MemoryAccess>(U) != MP &&
- !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
- ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
- });
- // If all that is left is nothing, our memoryphi is undef. We keep it as
- // TOPClass. Note: The only case this should happen is if we have at
- // least one self-argument.
- if (Filtered.begin() == Filtered.end()) {
- if (setMemoryClass(MP, TOPClass))
- markMemoryUsersTouched(MP);
- return;
- }
-
- // Transform the remaining operands into operand leaders.
- // FIXME: mapped_iterator should have a range version.
- auto LookupFunc = [&](const Use &U) {
- return lookupMemoryLeader(cast<MemoryAccess>(U));
- };
- auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
- auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
-
- // and now check if all the elements are equal.
- // Sadly, we can't use std::equals since these are random access iterators.
- const auto *AllSameValue = *MappedBegin;
- ++MappedBegin;
- bool AllEqual = std::all_of(
- MappedBegin, MappedEnd,
- [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
-
- if (AllEqual)
- LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
- << "\n");
- else
- LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
- // If it's equal to something, it's in that class. Otherwise, it has to be in
- // a class where it is the leader (other things may be equivalent to it, but
- // it needs to start off in its own class, which means it must have been the
- // leader, and it can't have stopped being the leader because it was never
- // removed).
- CongruenceClass *CC =
- AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
- auto OldState = MemoryPhiState.lookup(MP);
- assert(OldState != MPS_Invalid && "Invalid memory phi state");
- auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
- MemoryPhiState[MP] = NewState;
- if (setMemoryClass(MP, CC) || OldState != NewState)
- markMemoryUsersTouched(MP);
-}
-
-// Value number a single instruction, symbolically evaluating, performing
-// congruence finding, and updating mappings.
-void NewGVN::valueNumberInstruction(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
- if (!I->isTerminator()) {
- const Expression *Symbolized = nullptr;
- SmallPtrSet<Value *, 2> Visited;
- if (DebugCounter::shouldExecute(VNCounter)) {
- Symbolized = performSymbolicEvaluation(I, Visited);
- // Make a phi of ops if necessary
- if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
- !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
- auto *PHIE = makePossiblePHIOfOps(I, Visited);
- // If we created a phi of ops, use it.
- // If we couldn't create one, make sure we don't leave one lying around
- if (PHIE) {
- Symbolized = PHIE;
- } else if (auto *Op = RealToTemp.lookup(I)) {
- removePhiOfOps(I, Op);
- }
- }
- } else {
- // Mark the instruction as unused so we don't value number it again.
- InstrDFS[I] = 0;
- }
- // If we couldn't come up with a symbolic expression, use the unknown
- // expression
- if (Symbolized == nullptr)
- Symbolized = createUnknownExpression(I);
- performCongruenceFinding(I, Symbolized);
- } else {
- // Handle terminators that return values. All of them produce values we
- // don't currently understand. We don't place non-value producing
- // terminators in a class.
- if (!I->getType()->isVoidTy()) {
- auto *Symbolized = createUnknownExpression(I);
- performCongruenceFinding(I, Symbolized);
- }
- processOutgoingEdges(I, I->getParent());
- }
-}
-
-// Check if there is a path, using single or equal argument phi nodes, from
-// First to Second.
-bool NewGVN::singleReachablePHIPath(
- SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
- const MemoryAccess *Second) const {
- if (First == Second)
- return true;
- if (MSSA->isLiveOnEntryDef(First))
- return false;
-
- // This is not perfect, but as we're just verifying here, we can live with
- // the loss of precision. The real solution would be that of doing strongly
- // connected component finding in this routine, and it's probably not worth
- // the complexity for the time being. So, we just keep a set of visited
- // MemoryAccess and return true when we hit a cycle.
- if (Visited.count(First))
- return true;
- Visited.insert(First);
-
- const auto *EndDef = First;
- for (auto *ChainDef : optimized_def_chain(First)) {
- if (ChainDef == Second)
- return true;
- if (MSSA->isLiveOnEntryDef(ChainDef))
- return false;
- EndDef = ChainDef;
- }
- auto *MP = cast<MemoryPhi>(EndDef);
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
- };
- auto FilteredPhiArgs =
- make_filter_range(MP->operands(), ReachableOperandPred);
- SmallVector<const Value *, 32> OperandList;
- llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList));
- bool Okay = is_splat(OperandList);
- if (Okay)
- return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
- Second);
- return false;
-}
-
-// Verify that the memory equivalence table makes sense relative to the
-// congruence classes. Note that this checking is not perfect, and is currently
-// subject to very rare false negatives. It is only useful for
-// testing/debugging.
-void NewGVN::verifyMemoryCongruency() const {
-#ifndef NDEBUG
- // Verify that the memory table equivalence and memory member set match
- for (const auto *CC : CongruenceClasses) {
- if (CC == TOPClass || CC->isDead())
- continue;
- if (CC->getStoreCount() != 0) {
- assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
- "Any class with a store as a leader should have a "
- "representative stored value");
- assert(CC->getMemoryLeader() &&
- "Any congruence class with a store should have a "
- "representative access");
- }
-
- if (CC->getMemoryLeader())
- assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
- "Representative MemoryAccess does not appear to be reverse "
- "mapped properly");
- for (auto M : CC->memory())
- assert(MemoryAccessToClass.lookup(M) == CC &&
- "Memory member does not appear to be reverse mapped properly");
- }
-
- // Anything equivalent in the MemoryAccess table should be in the same
- // congruence class.
-
- // Filter out the unreachable and trivially dead entries, because they may
- // never have been updated if the instructions were not processed.
- auto ReachableAccessPred =
- [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
- bool Result = ReachableBlocks.count(Pair.first->getBlock());
- if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
- MemoryToDFSNum(Pair.first) == 0)
- return false;
- if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
- return !isInstructionTriviallyDead(MemDef->getMemoryInst());
-
- // We could have phi nodes whose operands are all trivially dead,
- // so we don't process them.
- if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
- for (auto &U : MemPHI->incoming_values()) {
- if (auto *I = dyn_cast<Instruction>(&*U)) {
- if (!isInstructionTriviallyDead(I))
- return true;
- }
- }
- return false;
- }
-
- return true;
- };
-
- auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
- for (auto KV : Filtered) {
- if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
- auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
- if (FirstMUD && SecondMUD) {
- SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
- assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
- ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
- ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
- "The instructions for these memory operations should have "
- "been in the same congruence class or reachable through"
- "a single argument phi");
- }
- } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
- // We can only sanely verify that MemoryDefs in the operand list all have
- // the same class.
- auto ReachableOperandPred = [&](const Use &U) {
- return ReachableEdges.count(
- {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
- isa<MemoryDef>(U);
-
- };
- // All arguments should be in the same class, ignoring unreachable arguments
- auto FilteredPhiArgs =
- make_filter_range(FirstMP->operands(), ReachableOperandPred);
- SmallVector<const CongruenceClass *, 16> PhiOpClasses;
- std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
- std::back_inserter(PhiOpClasses), [&](const Use &U) {
- const MemoryDef *MD = cast<MemoryDef>(U);
- return ValueToClass.lookup(MD->getMemoryInst());
- });
- assert(is_splat(PhiOpClasses) &&
- "All MemoryPhi arguments should be in the same class");
- }
- }
-#endif
-}
-
-// Verify that the sparse propagation we did actually found the maximal fixpoint
-// We do this by storing the value to class mapping, touching all instructions,
-// and redoing the iteration to see if anything changed.
-void NewGVN::verifyIterationSettled(Function &F) {
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
- if (DebugCounter::isCounterSet(VNCounter))
- DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
-
- // Note that we have to store the actual classes, as we may change existing
- // classes during iteration. This is because our memory iteration propagation
- // is not perfect, and so may waste a little work. But it should generate
- // exactly the same congruence classes we have now, with different IDs.
- std::map<const Value *, CongruenceClass> BeforeIteration;
-
- for (auto &KV : ValueToClass) {
- if (auto *I = dyn_cast<Instruction>(KV.first))
- // Skip unused/dead instructions.
- if (InstrToDFSNum(I) == 0)
- continue;
- BeforeIteration.insert({KV.first, *KV.second});
- }
-
- TouchedInstructions.set();
- TouchedInstructions.reset(0);
- iterateTouchedInstructions();
- DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
- EqualClasses;
- for (const auto &KV : ValueToClass) {
- if (auto *I = dyn_cast<Instruction>(KV.first))
- // Skip unused/dead instructions.
- if (InstrToDFSNum(I) == 0)
- continue;
- // We could sink these uses, but I think this adds a bit of clarity here as
- // to what we are comparing.
- auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
- auto *AfterCC = KV.second;
- // Note that the classes can't change at this point, so we memoize the set
- // that are equal.
- if (!EqualClasses.count({BeforeCC, AfterCC})) {
- assert(BeforeCC->isEquivalentTo(AfterCC) &&
- "Value number changed after main loop completed!");
- EqualClasses.insert({BeforeCC, AfterCC});
- }
- }
-#endif
-}
-
-// Verify that for each store expression in the expression to class mapping,
-// only the latest appears, and multiple ones do not appear.
-// Because loads do not use the stored value when doing equality with stores,
-// if we don't erase the old store expressions from the table, a load can find
-// a no-longer valid StoreExpression.
-void NewGVN::verifyStoreExpressions() const {
-#ifndef NDEBUG
- // This is the only use of this, and it's not worth defining a complicated
- // DenseMapInfo hash/equality function for it.
- std::set<
- std::pair<const Value *,
- std::tuple<const Value *, const CongruenceClass *, Value *>>>
- StoreExpressionSet;
- for (const auto &KV : ExpressionToClass) {
- if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
- // Make sure a version that will conflict with loads is not already there
- auto Res = StoreExpressionSet.insert(
- {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
- SE->getStoredValue())});
- bool Okay = Res.second;
- // It's okay to have the same expression already in there if it is
- // identical in nature.
- // This can happen when the leader of the stored value changes over time.
- if (!Okay)
- Okay = (std::get<1>(Res.first->second) == KV.second) &&
- (lookupOperandLeader(std::get<2>(Res.first->second)) ==
- lookupOperandLeader(SE->getStoredValue()));
- assert(Okay && "Stored expression conflict exists in expression table");
- auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
- assert(ValueExpr && ValueExpr->equals(*SE) &&
- "StoreExpression in ExpressionToClass is not latest "
- "StoreExpression for value");
- }
- }
-#endif
-}
-
-// This is the main value numbering loop, it iterates over the initial touched
-// instruction set, propagating value numbers, marking things touched, etc,
-// until the set of touched instructions is completely empty.
-void NewGVN::iterateTouchedInstructions() {
- unsigned int Iterations = 0;
- // Figure out where TouchedInstructions starts
- int FirstInstr = TouchedInstructions.find_first();
- // Nothing set, nothing to iterate, just return.
- if (FirstInstr == -1)
- return;
- const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
- while (TouchedInstructions.any()) {
- ++Iterations;
- // Walk through all the instructions in all the blocks in RPO.
- // TODO: As we hit a new block, we should push and pop equalities into a
- // table lookupOperandLeader can use, to catch things PredicateInfo
- // might miss, like edge-only equivalences.
- for (unsigned InstrNum : TouchedInstructions.set_bits()) {
-
- // This instruction was found to be dead. We don't bother looking
- // at it again.
- if (InstrNum == 0) {
- TouchedInstructions.reset(InstrNum);
- continue;
- }
-
- Value *V = InstrFromDFSNum(InstrNum);
- const BasicBlock *CurrBlock = getBlockForValue(V);
-
- // If we hit a new block, do reachability processing.
- if (CurrBlock != LastBlock) {
- LastBlock = CurrBlock;
- bool BlockReachable = ReachableBlocks.count(CurrBlock);
- const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
- // If it's not reachable, erase any touched instructions and move on.
- if (!BlockReachable) {
- TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
- LLVM_DEBUG(dbgs() << "Skipping instructions in block "
- << getBlockName(CurrBlock)
- << " because it is unreachable\n");
- continue;
- }
- updateProcessedCount(CurrBlock);
- }
- // Reset after processing (because we may mark ourselves as touched when
- // we propagate equalities).
- TouchedInstructions.reset(InstrNum);
-
- if (auto *MP = dyn_cast<MemoryPhi>(V)) {
- LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
- valueNumberMemoryPhi(MP);
- } else if (auto *I = dyn_cast<Instruction>(V)) {
- valueNumberInstruction(I);
- } else {
- llvm_unreachable("Should have been a MemoryPhi or Instruction");
- }
- updateProcessedCount(V);
- }
- }
- NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-}
-
-// This is the main transformation entry point.
-bool NewGVN::runGVN() {
- if (DebugCounter::isCounterSet(VNCounter))
- StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
- bool Changed = false;
- NumFuncArgs = F.arg_size();
- MSSAWalker = MSSA->getWalker();
- SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
-
- // Count number of instructions for sizing of hash tables, and come
- // up with a global dfs numbering for instructions.
- unsigned ICount = 1;
- // Add an empty instruction to account for the fact that we start at 1
- DFSToInstr.emplace_back(nullptr);
- // Note: We want ideal RPO traversal of the blocks, which is not quite the
- // same as dominator tree order, particularly with regard to whether backedges
- // get visited first or second, given a block with multiple successors.
- // If we visit in the wrong order, we will end up performing N times as many
- // iterations.
- // The dominator tree does guarantee that, for a given dom tree node, its
- // parent must occur before it in the RPO ordering. Thus, we only need to sort
- // the siblings.
- ReversePostOrderTraversal<Function *> RPOT(&F);
- unsigned Counter = 0;
- for (auto &B : RPOT) {
- auto *Node = DT->getNode(B);
- assert(Node && "RPO and Dominator tree should have same reachability");
- RPOOrdering[Node] = ++Counter;
- }
- // Sort dominator tree children arrays into RPO.
- for (auto &B : RPOT) {
- auto *Node = DT->getNode(B);
- if (Node->getNumChildren() > 1)
+ I->deleteValue();
+ }
+
+ ValueToClass.clear();
+ ArgRecycler.clear(ExpressionAllocator);
+ ExpressionAllocator.Reset();
+ CongruenceClasses.clear();
+ ExpressionToClass.clear();
+ ValueToExpression.clear();
+ RealToTemp.clear();
+ AdditionalUsers.clear();
+ ExpressionToPhiOfOps.clear();
+ TempToBlock.clear();
+ TempToMemory.clear();
+ PHINodeUses.clear();
+ OpSafeForPHIOfOps.clear();
+ ReachableBlocks.clear();
+ ReachableEdges.clear();
+#ifndef NDEBUG
+ ProcessedCount.clear();
+#endif
+ InstrDFS.clear();
+ InstructionsToErase.clear();
+ DFSToInstr.clear();
+ BlockInstRange.clear();
+ TouchedInstructions.clear();
+ MemoryAccessToClass.clear();
+ PredicateToUsers.clear();
+ MemoryToUsers.clear();
+ RevisitOnReachabilityChange.clear();
+}
+
+// Assign local DFS number mapping to instructions, and leave space for Value
+// PHI's.
+std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
+ unsigned Start) {
+ unsigned End = Start;
+ if (MemoryAccess *MemPhi = getMemoryAccess(B)) {
+ InstrDFS[MemPhi] = End++;
+ DFSToInstr.emplace_back(MemPhi);
+ }
+
+ // Then the real block goes next.
+ for (auto &I : *B) {
+ // There's no need to call isInstructionTriviallyDead more than once on
+ // an instruction. Therefore, once we know that an instruction is dead
+ // we change its DFS number so that it doesn't get value numbered.
+ if (isInstructionTriviallyDead(&I, TLI)) {
+ InstrDFS[&I] = 0;
+ LLVM_DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+ markInstructionForDeletion(&I);
+ continue;
+ }
+ if (isa<PHINode>(&I))
+ RevisitOnReachabilityChange[B].set(End);
+ InstrDFS[&I] = End++;
+ DFSToInstr.emplace_back(&I);
+ }
+
+ // All of the range functions take half-open ranges (open on the end side).
+ // So we do not subtract one from count, because at this point it is one
+ // greater than the last instruction.
+ return std::make_pair(Start, End);
+}
+
+void NewGVN::updateProcessedCount(const Value *V) {
+#ifndef NDEBUG
+ if (ProcessedCount.count(V) == 0) {
+ ProcessedCount.insert({V, 1});
+ } else {
+ ++ProcessedCount[V];
+ assert(ProcessedCount[V] < 100 &&
+ "Seem to have processed the same Value a lot");
+ }
+#endif
+}
+
+// Evaluate MemoryPhi nodes symbolically, just like PHI nodes
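+// A MemoryPhi whose reachable, non-self, non-TOP operands all share the same
+// leader is congruent to that leader; otherwise it becomes the leader of its
+// own memory congruence class.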
+void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
+ // If all the arguments are the same, the MemoryPhi has the same value as the
+ // argument. Filter out unreachable blocks and self phis from our operands.
+ // TODO: We could do cycle-checking on the memory phis to allow valueizing for
+ // self-phi checking.
+ const BasicBlock *PHIBlock = MP->getBlock();
+ auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
+ return cast<MemoryAccess>(U) != MP &&
+ !isMemoryAccessTOP(cast<MemoryAccess>(U)) &&
+ ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
+ });
+ // If all that is left is nothing, our memoryphi is undef. We keep it as
+ // TOPClass. Note: The only case this should happen is if we have at
+ // least one self-argument.
+ if (Filtered.begin() == Filtered.end()) {
+ if (setMemoryClass(MP, TOPClass))
+ markMemoryUsersTouched(MP);
+ return;
+ }
+
+ // Transform the remaining operands into operand leaders.
+ // FIXME: mapped_iterator should have a range version.
+ auto LookupFunc = [&](const Use &U) {
+ return lookupMemoryLeader(cast<MemoryAccess>(U));
+ };
+ auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
+ auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
+
+ // and now check if all the elements are equal.
+ // Sadly, we can't use std::equals since these are random access iterators.
+ const auto *AllSameValue = *MappedBegin;
+ ++MappedBegin;
+ bool AllEqual = std::all_of(
+ MappedBegin, MappedEnd,
+ [&AllSameValue](const MemoryAccess *V) { return V == AllSameValue; });
+
+ if (AllEqual)
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue
+ << "\n");
+ else
+ LLVM_DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
+ // If it's equal to something, it's in that class. Otherwise, it has to be in
+ // a class where it is the leader (other things may be equivalent to it, but
+ // it needs to start off in its own class, which means it must have been the
+ // leader, and it can't have stopped being the leader because it was never
+ // removed).
+ CongruenceClass *CC =
+ AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+ auto OldState = MemoryPhiState.lookup(MP);
+ assert(OldState != MPS_Invalid && "Invalid memory phi state");
+ auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+ MemoryPhiState[MP] = NewState;
+ if (setMemoryClass(MP, CC) || OldState != NewState)
+ markMemoryUsersTouched(MP);
+}
+
+// Value number a single instruction, symbolically evaluating, performing
+// congruence finding, and updating mappings.
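+// Non-terminators are symbolically evaluated (possibly via a phi of ops);
+// terminators that produce a value get an unknown expression, and terminator
+// outgoing edges are always processed for reachability.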
+void NewGVN::valueNumberInstruction(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Processing instruction " << *I << "\n");
+ if (!I->isTerminator()) {
+ const Expression *Symbolized = nullptr;
+ SmallPtrSet<Value *, 2> Visited;
+ if (DebugCounter::shouldExecute(VNCounter)) {
+ Symbolized = performSymbolicEvaluation(I, Visited);
+ // Make a phi of ops if necessary
+ if (Symbolized && !isa<ConstantExpression>(Symbolized) &&
+ !isa<VariableExpression>(Symbolized) && PHINodeUses.count(I)) {
+ auto *PHIE = makePossiblePHIOfOps(I, Visited);
+ // If we created a phi of ops, use it.
+ // If we couldn't create one, make sure we don't leave one lying around
+ if (PHIE) {
+ Symbolized = PHIE;
+ } else if (auto *Op = RealToTemp.lookup(I)) {
+ removePhiOfOps(I, Op);
+ }
+ }
+ } else {
+ // Mark the instruction as unused so we don't value number it again.
+ InstrDFS[I] = 0;
+ }
+ // If we couldn't come up with a symbolic expression, use the unknown
+ // expression
+ if (Symbolized == nullptr)
+ Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ } else {
+ // Handle terminators that return values. All of them produce values we
+ // don't currently understand. We don't place non-value producing
+ // terminators in a class.
+ if (!I->getType()->isVoidTy()) {
+ auto *Symbolized = createUnknownExpression(I);
+ performCongruenceFinding(I, Symbolized);
+ }
+ processOutgoingEdges(I, I->getParent());
+ }
+}
+
+// Check if there is a path, using single or equal argument phi nodes, from
+// First to Second.
+bool NewGVN::singleReachablePHIPath(
+ SmallPtrSet<const MemoryAccess *, 8> &Visited, const MemoryAccess *First,
+ const MemoryAccess *Second) const {
+ if (First == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(First))
+ return false;
+
+ // This is not perfect, but as we're just verifying here, we can live with
+ // the loss of precision. The real solution would be that of doing strongly
+ // connected component finding in this routine, and it's probably not worth
+ // the complexity for the time being. So, we just keep a set of visited
+ // MemoryAccess and return true when we hit a cycle.
+ if (Visited.count(First))
+ return true;
+ Visited.insert(First);
+
+ const auto *EndDef = First;
+ for (auto *ChainDef : optimized_def_chain(First)) {
+ if (ChainDef == Second)
+ return true;
+ if (MSSA->isLiveOnEntryDef(ChainDef))
+ return false;
+ EndDef = ChainDef;
+ }
+ auto *MP = cast<MemoryPhi>(EndDef);
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+ };
+ auto FilteredPhiArgs =
+ make_filter_range(MP->operands(), ReachableOperandPred);
+ SmallVector<const Value *, 32> OperandList;
+ llvm::copy(FilteredPhiArgs, std::back_inserter(OperandList));
+ bool Okay = is_splat(OperandList);
+ if (Okay)
+ return singleReachablePHIPath(Visited, cast<MemoryAccess>(OperandList[0]),
+ Second);
+ return false;
+}
+
+// Verify that the memory equivalence table makes sense relative to the
+// congruence classes. Note that this checking is not perfect, and is currently
+// subject to very rare false negatives. It is only useful for
+// testing/debugging.
+void NewGVN::verifyMemoryCongruency() const {
+#ifndef NDEBUG
+ // Verify that the memory table equivalence and memory member set match
+ for (const auto *CC : CongruenceClasses) {
+ if (CC == TOPClass || CC->isDead())
+ continue;
+ if (CC->getStoreCount() != 0) {
+ assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+ "Any class with a store as a leader should have a "
+ "representative stored value");
+ assert(CC->getMemoryLeader() &&
+ "Any congruence class with a store should have a "
+ "representative access");
+ }
+
+ if (CC->getMemoryLeader())
+ assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+ "Representative MemoryAccess does not appear to be reverse "
+ "mapped properly");
+ for (auto M : CC->memory())
+ assert(MemoryAccessToClass.lookup(M) == CC &&
+ "Memory member does not appear to be reverse mapped properly");
+ }
+
+ // Anything equivalent in the MemoryAccess table should be in the same
+ // congruence class.
+
+ // Filter out the unreachable and trivially dead entries, because they may
+ // never have been updated if the instructions were not processed.
+ auto ReachableAccessPred =
+ [&](const std::pair<const MemoryAccess *, CongruenceClass *> Pair) {
+ bool Result = ReachableBlocks.count(Pair.first->getBlock());
+ if (!Result || MSSA->isLiveOnEntryDef(Pair.first) ||
+ MemoryToDFSNum(Pair.first) == 0)
+ return false;
+ if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
+ return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+
+ // We could have phi nodes whose operands are all trivially dead,
+ // so we don't process them.
+ if (auto *MemPHI = dyn_cast<MemoryPhi>(Pair.first)) {
+ for (auto &U : MemPHI->incoming_values()) {
+ if (auto *I = dyn_cast<Instruction>(&*U)) {
+ if (!isInstructionTriviallyDead(I))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ return true;
+ };
+
+ auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
+ for (auto KV : Filtered) {
+ if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
+ auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
+ if (FirstMUD && SecondMUD) {
+ SmallPtrSet<const MemoryAccess *, 8> VisitedMAS;
+ assert((singleReachablePHIPath(VisitedMAS, FirstMUD, SecondMUD) ||
+ ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
+ ValueToClass.lookup(SecondMUD->getMemoryInst())) &&
+ "The instructions for these memory operations should have "
+ "been in the same congruence class or reachable through"
+ "a single argument phi");
+ }
+ } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
+ // We can only sanely verify that MemoryDefs in the operand list all have
+ // the same class.
+ auto ReachableOperandPred = [&](const Use &U) {
+ return ReachableEdges.count(
+ {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
+ isa<MemoryDef>(U);
+
+ };
+ // All arguments should be in the same class, ignoring unreachable arguments
+ auto FilteredPhiArgs =
+ make_filter_range(FirstMP->operands(), ReachableOperandPred);
+ SmallVector<const CongruenceClass *, 16> PhiOpClasses;
+ std::transform(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+ std::back_inserter(PhiOpClasses), [&](const Use &U) {
+ const MemoryDef *MD = cast<MemoryDef>(U);
+ return ValueToClass.lookup(MD->getMemoryInst());
+ });
+ assert(is_splat(PhiOpClasses) &&
+ "All MemoryPhi arguments should be in the same class");
+ }
+ }
+#endif
+}
+
+// Verify that the sparse propagation we did actually found the maximal fixpoint
+// We do this by storing the value to class mapping, touching all instructions,
+// and redoing the iteration to see if anything changed.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "Beginning iteration verification\n");
+ if (DebugCounter::isCounterSet(VNCounter))
+ DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+ // Note that we have to store the actual classes, as we may change existing
+ // classes during iteration. This is because our memory iteration propagation
+ // is not perfect, and so may waste a little work. But it should generate
+ // exactly the same congruence classes we have now, with different IDs.
+ std::map<const Value *, CongruenceClass> BeforeIteration;
+
+ for (auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ BeforeIteration.insert({KV.first, *KV.second});
+ }
+
+ TouchedInstructions.set();
+ TouchedInstructions.reset(0);
+ iterateTouchedInstructions();
+ DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+ EqualClasses;
+ for (const auto &KV : ValueToClass) {
+ if (auto *I = dyn_cast<Instruction>(KV.first))
+ // Skip unused/dead instructions.
+ if (InstrToDFSNum(I) == 0)
+ continue;
+ // We could sink these uses, but I think this adds a bit of clarity here as
+ // to what we are comparing.
+ auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+ auto *AfterCC = KV.second;
+ // Note that the classes can't change at this point, so we memoize the set
+ // that are equal.
+ if (!EqualClasses.count({BeforeCC, AfterCC})) {
+ assert(BeforeCC->isEquivalentTo(AfterCC) &&
+ "Value number changed after main loop completed!");
+ EqualClasses.insert({BeforeCC, AfterCC});
+ }
+ }
+#endif
+}
+
+// Verify that for each store expression in the expression to class mapping,
+// only the latest appears, and multiple ones do not appear.
+// Because loads do not use the stored value when doing equality with stores,
+// if we don't erase the old store expressions from the table, a load can find
+// a no-longer valid StoreExpression.
+void NewGVN::verifyStoreExpressions() const {
+#ifndef NDEBUG
+ // This is the only use of this, and it's not worth defining a complicated
+ // DenseMapInfo hash/equality function for it.
+ std::set<
+ std::pair<const Value *,
+ std::tuple<const Value *, const CongruenceClass *, Value *>>>
+ StoreExpressionSet;
+ for (const auto &KV : ExpressionToClass) {
+ if (auto *SE = dyn_cast<StoreExpression>(KV.first)) {
+ // Make sure a version that will conflict with loads is not already there
+ auto Res = StoreExpressionSet.insert(
+ {SE->getOperand(0), std::make_tuple(SE->getMemoryLeader(), KV.second,
+ SE->getStoredValue())});
+ bool Okay = Res.second;
+ // It's okay to have the same expression already in there if it is
+ // identical in nature.
+ // This can happen when the leader of the stored value changes over time.
+ if (!Okay)
+ Okay = (std::get<1>(Res.first->second) == KV.second) &&
+ (lookupOperandLeader(std::get<2>(Res.first->second)) ==
+ lookupOperandLeader(SE->getStoredValue()));
+ assert(Okay && "Stored expression conflict exists in expression table");
+ auto *ValueExpr = ValueToExpression.lookup(SE->getStoreInst());
+ assert(ValueExpr && ValueExpr->equals(*SE) &&
+ "StoreExpression in ExpressionToClass is not latest "
+ "StoreExpression for value");
+ }
+ }
+#endif
+}
+
+// This is the main value numbering loop, it iterates over the initial touched
+// instruction set, propagating value numbers, marking things touched, etc,
+// until the set of touched instructions is completely empty.
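+// Instructions are visited in DFS-number order (which follows RPO over the
+// blocks), and processing one instruction may touch more instructions, so this
+// loops until a fixpoint is reached.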
+void NewGVN::iterateTouchedInstructions() {
+ unsigned int Iterations = 0;
+ // Figure out where TouchedInstructions starts
+ int FirstInstr = TouchedInstructions.find_first();
+ // Nothing set, nothing to iterate, just return.
+ if (FirstInstr == -1)
+ return;
+ const BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+ while (TouchedInstructions.any()) {
+ ++Iterations;
+ // Walk through all the instructions in all the blocks in RPO.
+ // TODO: As we hit a new block, we should push and pop equalities into a
+ // table lookupOperandLeader can use, to catch things PredicateInfo
+ // might miss, like edge-only equivalences.
+ for (unsigned InstrNum : TouchedInstructions.set_bits()) {
+
+ // This instruction was found to be dead. We don't bother looking
+ // at it again.
+ if (InstrNum == 0) {
+ TouchedInstructions.reset(InstrNum);
+ continue;
+ }
+
+ Value *V = InstrFromDFSNum(InstrNum);
+ const BasicBlock *CurrBlock = getBlockForValue(V);
+
+ // If we hit a new block, do reachability processing.
+ if (CurrBlock != LastBlock) {
+ LastBlock = CurrBlock;
+ bool BlockReachable = ReachableBlocks.count(CurrBlock);
+ const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+ // If it's not reachable, erase any touched instructions and move on.
+ if (!BlockReachable) {
+ TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+ LLVM_DEBUG(dbgs() << "Skipping instructions in block "
+ << getBlockName(CurrBlock)
+ << " because it is unreachable\n");
+ continue;
+ }
+ updateProcessedCount(CurrBlock);
+ }
+ // Reset after processing (because we may mark ourselves as touched when
+ // we propagate equalities).
+ TouchedInstructions.reset(InstrNum);
+
+ if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+ LLVM_DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+ valueNumberMemoryPhi(MP);
+ } else if (auto *I = dyn_cast<Instruction>(V)) {
+ valueNumberInstruction(I);
+ } else {
+ llvm_unreachable("Should have been a MemoryPhi or Instruction");
+ }
+ updateProcessedCount(V);
+ }
+ }
+ NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
+}
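
A stripped-down sketch of the fixpoint shape of iterateTouchedInstructions, using std::set in place of LLVM's bitvector and a made-up user graph; processing an element may re-touch others, and the loop runs until nothing is touched:

#include <cstdio>
#include <set>
#include <vector>

int main() {
  // Toy, acyclic "users" relation: processing node I re-touches every node in
  // Users[I], the way a changed value number re-touches its users.
  std::vector<std::vector<int>> Users = {{1, 2}, {2}, {}};
  std::set<int> Touched = {0};

  unsigned Iterations = 0;
  while (!Touched.empty()) {
    ++Iterations;
    // Drain a snapshot so anything re-touched is handled on a later pass,
    // mirroring the fixed (RPO) order in which the bitvector is walked.
    std::set<int> Current;
    Current.swap(Touched);
    for (int I : Current)
      for (int J : Users[I])
        Touched.insert(J);
  }
  std::printf("settled after %u iterations\n", Iterations);
  return 0;
}
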
+
+// This is the main transformation entry point.
+bool NewGVN::runGVN() {
+ if (DebugCounter::isCounterSet(VNCounter))
+ StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
+ bool Changed = false;
+ NumFuncArgs = F.arg_size();
+ MSSAWalker = MSSA->getWalker();
+ SingletonDeadExpression = new (ExpressionAllocator) DeadExpression();
+
+ // Count number of instructions for sizing of hash tables, and come
+ // up with a global dfs numbering for instructions.
+ unsigned ICount = 1;
+ // Add an empty instruction to account for the fact that we start at 1
+ DFSToInstr.emplace_back(nullptr);
+ // Note: We want ideal RPO traversal of the blocks, which is not quite the
+  // same as dominator tree order, particularly with regard to whether backedges
+ // get visited first or second, given a block with multiple successors.
+ // If we visit in the wrong order, we will end up performing N times as many
+ // iterations.
+  // The dominator tree does guarantee that, for a given dom tree node, its
+ // parent must occur before it in the RPO ordering. Thus, we only need to sort
+ // the siblings.
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+ unsigned Counter = 0;
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ assert(Node && "RPO and Dominator tree should have same reachability");
+ RPOOrdering[Node] = ++Counter;
+ }
+ // Sort dominator tree children arrays into RPO.
+ for (auto &B : RPOT) {
+ auto *Node = DT->getNode(B);
+ if (Node->getNumChildren() > 1)
llvm::sort(*Node, [&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
});
- }
-
- // Now a standard depth first ordering of the domtree is equivalent to RPO.
- for (auto DTN : depth_first(DT->getRootNode())) {
- BasicBlock *B = DTN->getBlock();
- const auto &BlockRange = assignDFSNumbers(B, ICount);
- BlockInstRange.insert({B, BlockRange});
- ICount += BlockRange.second - BlockRange.first;
- }
- initializeCongruenceClasses(F);
-
- TouchedInstructions.resize(ICount);
- // Ensure we don't end up resizing the expressionToClass map, as
- // that can be quite expensive. At most, we have one expression per
- // instruction.
- ExpressionToClass.reserve(ICount);
-
- // Initialize the touched instructions to include the entry block.
- const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
- TouchedInstructions.set(InstRange.first, InstRange.second);
- LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
- << " marked reachable\n");
- ReachableBlocks.insert(&F.getEntryBlock());
-
- iterateTouchedInstructions();
- verifyMemoryCongruency();
- verifyIterationSettled(F);
- verifyStoreExpressions();
-
- Changed |= eliminateInstructions(F);
-
- // Delete all instructions marked for deletion.
- for (Instruction *ToErase : InstructionsToErase) {
- if (!ToErase->use_empty())
- ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
-
- assert(ToErase->getParent() &&
- "BB containing ToErase deleted unexpectedly!");
- ToErase->eraseFromParent();
- }
- Changed |= !InstructionsToErase.empty();
-
- // Delete all unreachable blocks.
- auto UnreachableBlockPred = [&](const BasicBlock &BB) {
- return !ReachableBlocks.count(&BB);
- };
-
- for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
- LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
- << " is unreachable\n");
- deleteInstructionsInBlock(&BB);
- Changed = true;
- }
-
- cleanupTables();
- return Changed;
-}
-
-struct NewGVN::ValueDFS {
- int DFSIn = 0;
- int DFSOut = 0;
- int LocalNum = 0;
-
- // Only one of Def and U will be set.
- // The bool in the Def tells us whether the Def is the stored value of a
- // store.
- PointerIntPair<Value *, 1, bool> Def;
- Use *U = nullptr;
-
- bool operator<(const ValueDFS &Other) const {
- // It's not enough that any given field be less than - we have sets
- // of fields that need to be evaluated together to give a proper ordering.
- // For example, if you have;
- // DFS (1, 3)
- // Val 0
- // DFS (1, 2)
- // Val 50
- // We want the second to be less than the first, but if we just go field
- // by field, we will get to Val 0 < Val 50 and say the first is less than
- // the second. We only want it to be less than if the DFS orders are equal.
- //
- // Each LLVM instruction only produces one value, and thus the lowest-level
-    // differentiator that really matters for the stack (and what we use as a
- // replacement) is the local dfs number.
- // Everything else in the structure is instruction level, and only affects
- // the order in which we will replace operands of a given instruction.
- //
- // For a given instruction (IE things with equal dfsin, dfsout, localnum),
- // the order of replacement of uses does not matter.
- // IE given,
- // a = 5
- // b = a + a
- // When you hit b, you will have two valuedfs with the same dfsin, out, and
- // localnum.
- // The .val will be the same as well.
- // The .u's will be different.
- // You will replace both, and it does not matter what order you replace them
- // in (IE whether you replace operand 2, then operand 1, or operand 1, then
- // operand 2).
- // Similarly for the case of same dfsin, dfsout, localnum, but different
- // .val's
- // a = 5
- // b = 6
- // c = a + b
-    // in c, we will have a valuedfs for a, and one for b, with everything the same
- // but .val and .u.
- // It does not matter what order we replace these operands in.
- // You will always end up with the same IR, and this is guaranteed.
- return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
- std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
- Other.U);
- }
-};
-
-// This function converts the set of members for a congruence class from values,
-// to sets of defs and uses with associated DFS info. The total number of
-// reachable uses for each value is stored in UseCounts, and instructions that
-// seem dead (have no non-dead uses) are stored in ProbablyDead.
-void NewGVN::convertClassToDFSOrdered(
- const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
- DenseMap<const Value *, unsigned int> &UseCounts,
- SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
- for (auto D : Dense) {
- // First add the value.
- BasicBlock *BB = getBlockForValue(D);
- // Constants are handled prior to ever calling this function, so
- // we should only be left with instructions as members.
- assert(BB && "Should have figured out a basic block for value");
- ValueDFS VDDef;
- DomTreeNode *DomNode = DT->getNode(BB);
- VDDef.DFSIn = DomNode->getDFSNumIn();
- VDDef.DFSOut = DomNode->getDFSNumOut();
- // If it's a store, use the leader of the value operand, if it's always
- // available, or the value operand. TODO: We could do dominance checks to
- // find a dominating leader, but not worth it ATM.
- if (auto *SI = dyn_cast<StoreInst>(D)) {
- auto Leader = lookupOperandLeader(SI->getValueOperand());
- if (alwaysAvailable(Leader)) {
- VDDef.Def.setPointer(Leader);
- } else {
- VDDef.Def.setPointer(SI->getValueOperand());
- VDDef.Def.setInt(true);
- }
- } else {
- VDDef.Def.setPointer(D);
- }
- assert(isa<Instruction>(D) &&
- "The dense set member should always be an instruction");
- Instruction *Def = cast<Instruction>(D);
- VDDef.LocalNum = InstrToDFSNum(D);
- DFSOrderedSet.push_back(VDDef);
- // If there is a phi node equivalent, add it
- if (auto *PN = RealToTemp.lookup(Def)) {
- auto *PHIE =
- dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
- if (PHIE) {
- VDDef.Def.setInt(false);
- VDDef.Def.setPointer(PN);
- VDDef.LocalNum = 0;
- DFSOrderedSet.push_back(VDDef);
- }
- }
-
- unsigned int UseCount = 0;
- // Now add the uses.
- for (auto &U : Def->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- // Don't try to replace into dead uses
- if (InstructionsToErase.count(I))
- continue;
- ValueDFS VDUse;
- // Put the phi node uses in the incoming block.
- BasicBlock *IBlock;
- if (auto *P = dyn_cast<PHINode>(I)) {
- IBlock = P->getIncomingBlock(U);
- // Make phi node users appear last in the incoming block
- // they are from.
- VDUse.LocalNum = InstrDFS.size() + 1;
- } else {
- IBlock = getBlockForValue(I);
- VDUse.LocalNum = InstrToDFSNum(I);
- }
-
- // Skip uses in unreachable blocks, as we're going
- // to delete them.
- if (ReachableBlocks.count(IBlock) == 0)
- continue;
-
- DomTreeNode *DomNode = DT->getNode(IBlock);
- VDUse.DFSIn = DomNode->getDFSNumIn();
- VDUse.DFSOut = DomNode->getDFSNumOut();
- VDUse.U = &U;
- ++UseCount;
- DFSOrderedSet.emplace_back(VDUse);
- }
- }
-
-    // If there are no uses, it's probably dead (but it may have side-effects,
-    // so it is not definitely dead). Otherwise, store the number of uses so we
-    // can track whether it becomes dead later.
- if (UseCount == 0)
- ProbablyDead.insert(Def);
- else
- UseCounts[Def] = UseCount;
- }
-}
-
-// This function converts the set of members for a congruence class from values,
-// to the set of defs for loads and stores, with associated DFS info.
-void NewGVN::convertClassToLoadsAndStores(
- const CongruenceClass &Dense,
- SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
- for (auto D : Dense) {
- if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
- continue;
-
- BasicBlock *BB = getBlockForValue(D);
- ValueDFS VD;
- DomTreeNode *DomNode = DT->getNode(BB);
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.Def.setPointer(D);
-
- // If it's an instruction, use the real local dfs number.
- if (auto *I = dyn_cast<Instruction>(D))
- VD.LocalNum = InstrToDFSNum(I);
- else
- llvm_unreachable("Should have been an instruction");
-
- LoadsAndStores.emplace_back(VD);
- }
-}
-
-static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
- patchReplacementInstruction(I, Repl);
- I->replaceAllUsesWith(Repl);
-}
-
-void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
- ++NumGVNBlocksDeleted;
-
-  // Delete the instructions backwards, as doing so reduces the number of
-  // def-use and use-def chains we have to update. Start after the terminator.
- auto StartPoint = BB->rbegin();
- ++StartPoint;
- // Note that we explicitly recalculate BB->rend() on each iteration,
- // as it may change when we remove the first instruction.
- for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
- Instruction &Inst = *I++;
- if (!Inst.use_empty())
- Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
- if (isa<LandingPadInst>(Inst))
- continue;
- salvageKnowledge(&Inst, AC);
-
- Inst.eraseFromParent();
- ++NumGVNInstrDeleted;
- }
- // Now insert something that simplifycfg will turn into an unreachable.
- Type *Int8Ty = Type::getInt8Ty(BB->getContext());
- new StoreInst(UndefValue::get(Int8Ty),
- Constant::getNullValue(Int8Ty->getPointerTo()),
- BB->getTerminator());
-}
-
-void NewGVN::markInstructionForDeletion(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
- InstructionsToErase.insert(I);
-}
-
-void NewGVN::replaceInstruction(Instruction *I, Value *V) {
- LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
- patchAndReplaceAllUsesWith(I, V);
- // We save the actual erasing to avoid invalidating memory
- // dependencies until we are done with everything.
- markInstructionForDeletion(I);
-}
-
-namespace {
-
-// This is a stack that contains both the value and dfs info of where
-// that value is valid.
-class ValueDFSStack {
-public:
- Value *back() const { return ValueStack.back(); }
- std::pair<int, int> dfs_back() const { return DFSStack.back(); }
-
- void push_back(Value *V, int DFSIn, int DFSOut) {
- ValueStack.emplace_back(V);
- DFSStack.emplace_back(DFSIn, DFSOut);
- }
-
- bool empty() const { return DFSStack.empty(); }
-
- bool isInScope(int DFSIn, int DFSOut) const {
- if (empty())
- return false;
- return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
- }
-
- void popUntilDFSScope(int DFSIn, int DFSOut) {
-
- // These two should always be in sync at this point.
- assert(ValueStack.size() == DFSStack.size() &&
- "Mismatch between ValueStack and DFSStack");
- while (
- !DFSStack.empty() &&
- !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
- DFSStack.pop_back();
- ValueStack.pop_back();
- }
- }
-
-private:
- SmallVector<Value *, 8> ValueStack;
- SmallVector<std::pair<int, int>, 8> DFSStack;
-};
-
-} // end anonymous namespace
-
-// Given an expression, get the congruence class for it.
-CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
- if (auto *VE = dyn_cast<VariableExpression>(E))
- return ValueToClass.lookup(VE->getVariableValue());
- else if (isa<DeadExpression>(E))
- return TOPClass;
- return ExpressionToClass.lookup(E);
-}
-
-// Given a value and a basic block we are trying to see if it is available in,
-// see if the value has a leader available in that block.
-Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
- const Instruction *OrigInst,
- const BasicBlock *BB) const {
- // It would already be constant if we could make it constant
- if (auto *CE = dyn_cast<ConstantExpression>(E))
- return CE->getConstantValue();
- if (auto *VE = dyn_cast<VariableExpression>(E)) {
- auto *V = VE->getVariableValue();
- if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
- return VE->getVariableValue();
- }
-
- auto *CC = getClassForExpression(E);
- if (!CC)
- return nullptr;
- if (alwaysAvailable(CC->getLeader()))
- return CC->getLeader();
-
- for (auto Member : *CC) {
- auto *MemberInst = dyn_cast<Instruction>(Member);
- if (MemberInst == OrigInst)
- continue;
- // Anything that isn't an instruction is always available.
- if (!MemberInst)
- return Member;
- if (DT->dominates(getBlockForValue(MemberInst), BB))
- return Member;
- }
- return nullptr;
-}
-
-bool NewGVN::eliminateInstructions(Function &F) {
- // This is a non-standard eliminator. The normal way to eliminate is
- // to walk the dominator tree in order, keeping track of available
- // values, and eliminating them. However, this is mildly
- // pointless. It requires doing lookups on every instruction,
-  // regardless of whether we will ever eliminate it. For instructions in
-  // singleton congruence classes (which is most of them), we know we will
-  // never eliminate them.
-
- // Instead, this eliminator looks at the congruence classes directly, sorts
- // them into a DFS ordering of the dominator tree, and then we just
- // perform elimination straight on the sets by walking the congruence
- // class member uses in order, and eliminate the ones dominated by the
- // last member. This is worst case O(E log E) where E = number of
- // instructions in a single congruence class. In theory, this is all
- // instructions. In practice, it is much faster, as most instructions are
- // either in singleton congruence classes or can't possibly be eliminated
- // anyway (if there are no overlapping DFS ranges in class).
- // When we find something not dominated, it becomes the new leader
- // for elimination purposes.
- // TODO: If we wanted to be faster, We could remove any members with no
- // overlapping ranges while sorting, as we will never eliminate anything
- // with those members, as they don't dominate anything else in our set.
-
- bool AnythingReplaced = false;
-
- // Since we are going to walk the domtree anyway, and we can't guarantee the
- // DFS numbers are updated, we compute some ourselves.
- DT->updateDFSNumbers();
-
- // Go through all of our phi nodes, and kill the arguments associated with
- // unreachable edges.
- auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
- for (auto &Operand : PHI->incoming_values())
- if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
- LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
- << " for block "
- << getBlockName(PHI->getIncomingBlock(Operand))
- << " with undef due to it being unreachable\n");
- Operand.set(UndefValue::get(PHI->getType()));
- }
- };
- // Replace unreachable phi arguments.
- // At this point, RevisitOnReachabilityChange only contains:
- //
- // 1. PHIs
- // 2. Temporaries that will convert to PHIs
- // 3. Operations that are affected by an unreachable edge but do not fit into
- // 1 or 2 (rare).
- // So it is a slight overshoot of what we want. We could make it exact by
- // using two SparseBitVectors per block.
- DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
- for (auto &KV : ReachableEdges)
- ReachablePredCount[KV.getEnd()]++;
- for (auto &BBPair : RevisitOnReachabilityChange) {
- for (auto InstNum : BBPair.second) {
- auto *Inst = InstrFromDFSNum(InstNum);
- auto *PHI = dyn_cast<PHINode>(Inst);
- PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
- if (!PHI)
- continue;
- auto *BB = BBPair.first;
- if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
- ReplaceUnreachablePHIArgs(PHI, BB);
- }
- }
-
- // Map to store the use counts
- DenseMap<const Value *, unsigned int> UseCounts;
- for (auto *CC : reverse(CongruenceClasses)) {
- LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
- << "\n");
- // Track the equivalent store info so we can decide whether to try
- // dead store elimination.
- SmallVector<ValueDFS, 8> PossibleDeadStores;
- SmallPtrSet<Instruction *, 8> ProbablyDead;
- if (CC->isDead() || CC->empty())
- continue;
- // Everything still in the TOP class is unreachable or dead.
- if (CC == TOPClass) {
- for (auto M : *CC) {
- auto *VTE = ValueToExpression.lookup(M);
- if (VTE && isa<DeadExpression>(VTE))
- markInstructionForDeletion(cast<Instruction>(M));
- assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
- InstructionsToErase.count(cast<Instruction>(M))) &&
- "Everything in TOP should be unreachable or dead at this "
- "point");
- }
- continue;
- }
-
- assert(CC->getLeader() && "We should have had a leader");
- // If this is a leader that is always available, and it's a
- // constant or has no equivalences, just replace everything with
- // it. We then update the congruence class with whatever members
- // are left.
- Value *Leader =
- CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
- if (alwaysAvailable(Leader)) {
- CongruenceClass::MemberSet MembersLeft;
- for (auto M : *CC) {
- Value *Member = M;
- // Void things have no uses we can replace.
- if (Member == Leader || !isa<Instruction>(Member) ||
- Member->getType()->isVoidTy()) {
- MembersLeft.insert(Member);
- continue;
- }
- LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
- << *Member << "\n");
- auto *I = cast<Instruction>(Member);
- assert(Leader != I && "About to accidentally remove our leader");
- replaceInstruction(I, Leader);
- AnythingReplaced = true;
- }
- CC->swap(MembersLeft);
- } else {
- // If this is a singleton, we can skip it.
- if (CC->size() != 1 || RealToTemp.count(Leader)) {
- // This is a stack because equality replacement/etc may place
- // constants in the middle of the member list, and we want to use
- // those constant values in preference to the current leader, over
- // the scope of those constants.
- ValueDFSStack EliminationStack;
-
- // Convert the members to DFS ordered sets and then merge them.
- SmallVector<ValueDFS, 8> DFSOrderedSet;
- convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
-
- // Sort the whole thing.
- llvm::sort(DFSOrderedSet);
- for (auto &VD : DFSOrderedSet) {
- int MemberDFSIn = VD.DFSIn;
- int MemberDFSOut = VD.DFSOut;
- Value *Def = VD.Def.getPointer();
- bool FromStore = VD.Def.getInt();
- Use *U = VD.U;
- // We ignore void things because we can't get a value from them.
- if (Def && Def->getType()->isVoidTy())
- continue;
- auto *DefInst = dyn_cast_or_null<Instruction>(Def);
- if (DefInst && AllTempInstructions.count(DefInst)) {
- auto *PN = cast<PHINode>(DefInst);
-
-            // If this is a value phi and that's the expression we used, insert
-            // it into the program and remove it from the temp instruction list.
- AllTempInstructions.erase(PN);
- auto *DefBlock = getBlockForValue(Def);
- LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
- << " into block "
- << getBlockName(getBlockForValue(Def)) << "\n");
- PN->insertBefore(&DefBlock->front());
- Def = PN;
- NumGVNPHIOfOpsEliminations++;
- }
-
- if (EliminationStack.empty()) {
- LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
- } else {
- LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
- << EliminationStack.dfs_back().first << ","
- << EliminationStack.dfs_back().second << ")\n");
- }
-
- LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
- << MemberDFSOut << ")\n");
- // First, we see if we are out of scope or empty. If so,
-          // and there are equivalences, we try to replace the top of
- // stack with equivalences (if it's on the stack, it must
- // not have been eliminated yet).
- // Then we synchronize to our current scope, by
- // popping until we are back within a DFS scope that
- // dominates the current member.
- // Then, what happens depends on a few factors
- // If the stack is now empty, we need to push
- // If we have a constant or a local equivalence we want to
- // start using, we also push.
- // Otherwise, we walk along, processing members who are
- // dominated by this scope, and eliminate them.
- bool ShouldPush = Def && EliminationStack.empty();
- bool OutOfScope =
- !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
-
- if (OutOfScope || ShouldPush) {
- // Sync to our current scope.
- EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- bool ShouldPush = Def && EliminationStack.empty();
- if (ShouldPush) {
- EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
- }
- }
-
- // Skip the Def's, we only want to eliminate on their uses. But mark
- // dominated defs as dead.
- if (Def) {
- // For anything in this case, what and how we value number
-            // guarantees that any side-effects that would have occurred (ie
- // throwing, etc) can be proven to either still occur (because it's
- // dominated by something that has the same side-effects), or never
- // occur. Otherwise, we would not have been able to prove it value
- // equivalent to something else. For these things, we can just mark
- // it all dead. Note that this is different from the "ProbablyDead"
- // set, which may not be dominated by anything, and thus, are only
- // easy to prove dead if they are also side-effect free. Note that
- // because stores are put in terms of the stored value, we skip
- // stored values here. If the stored value is really dead, it will
- // still be marked for deletion when we process it in its own class.
- if (!EliminationStack.empty() && Def != EliminationStack.back() &&
- isa<Instruction>(Def) && !FromStore)
- markInstructionForDeletion(cast<Instruction>(Def));
- continue;
- }
- // At this point, we know it is a Use we are trying to possibly
- // replace.
-
- assert(isa<Instruction>(U->get()) &&
- "Current def should have been an instruction");
- assert(isa<Instruction>(U->getUser()) &&
- "Current user should have been an instruction");
-
- // If the thing we are replacing into is already marked to be dead,
- // this use is dead. Note that this is true regardless of whether
- // we have anything dominating the use or not. We do this here
- // because we are already walking all the uses anyway.
- Instruction *InstUse = cast<Instruction>(U->getUser());
- if (InstructionsToErase.count(InstUse)) {
- auto &UseCount = UseCounts[U->get()];
- if (--UseCount == 0) {
- ProbablyDead.insert(cast<Instruction>(U->get()));
- }
- }
-
- // If we get to this point, and the stack is empty we must have a use
- // with nothing we can use to eliminate this use, so just skip it.
- if (EliminationStack.empty())
- continue;
-
- Value *DominatingLeader = EliminationStack.back();
-
- auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
- bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
- if (isSSACopy)
- DominatingLeader = II->getOperand(0);
-
- // Don't replace our existing users with ourselves.
- if (U->get() == DominatingLeader)
- continue;
- LLVM_DEBUG(dbgs()
- << "Found replacement " << *DominatingLeader << " for "
- << *U->get() << " in " << *(U->getUser()) << "\n");
-
- // If we replaced something in an instruction, handle the patching of
- // metadata. Skip this if we are replacing predicateinfo with its
- // original operand, as we already know we can just drop it.
- auto *ReplacedInst = cast<Instruction>(U->get());
- auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
- if (!PI || DominatingLeader != PI->OriginalOp)
- patchReplacementInstruction(ReplacedInst, DominatingLeader);
- U->set(DominatingLeader);
- // This is now a use of the dominating leader, which means if the
- // dominating leader was dead, it's now live!
- auto &LeaderUseCount = UseCounts[DominatingLeader];
- // It's about to be alive again.
- if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
- ProbablyDead.erase(cast<Instruction>(DominatingLeader));
- // For copy instructions, we use their operand as a leader,
- // which means we remove a user of the copy and it may become dead.
- if (isSSACopy) {
- unsigned &IIUseCount = UseCounts[II];
- if (--IIUseCount == 0)
- ProbablyDead.insert(II);
- }
- ++LeaderUseCount;
- AnythingReplaced = true;
- }
- }
- }
-
-    // At this point, anything still in the ProbablyDead set is actually dead
-    // if it would be trivially dead.
- for (auto *I : ProbablyDead)
- if (wouldInstructionBeTriviallyDead(I))
- markInstructionForDeletion(I);
-
- // Cleanup the congruence class.
- CongruenceClass::MemberSet MembersLeft;
- for (auto *Member : *CC)
- if (!isa<Instruction>(Member) ||
- !InstructionsToErase.count(cast<Instruction>(Member)))
- MembersLeft.insert(Member);
- CC->swap(MembersLeft);
-
- // If we have possible dead stores to look at, try to eliminate them.
- if (CC->getStoreCount() > 0) {
- convertClassToLoadsAndStores(*CC, PossibleDeadStores);
- llvm::sort(PossibleDeadStores);
- ValueDFSStack EliminationStack;
- for (auto &VD : PossibleDeadStores) {
- int MemberDFSIn = VD.DFSIn;
- int MemberDFSOut = VD.DFSOut;
- Instruction *Member = cast<Instruction>(VD.Def.getPointer());
- if (EliminationStack.empty() ||
- !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
- // Sync to our current scope.
- EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
- if (EliminationStack.empty()) {
- EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
- continue;
- }
- }
- // We already did load elimination, so nothing to do here.
- if (isa<LoadInst>(Member))
- continue;
- assert(!EliminationStack.empty());
- Instruction *Leader = cast<Instruction>(EliminationStack.back());
- (void)Leader;
- assert(DT->dominates(Leader->getParent(), Member->getParent()));
-        // Member is dominated by Leader, and thus dead.
- LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
- << " that is dominated by " << *Leader << "\n");
- markInstructionForDeletion(Member);
- CC->erase(Member);
- ++NumGVNDeadStores;
- }
- }
- }
- return AnythingReplaced;
-}
-
-// This function provides global ranking of operations so that we can place them
-// in a canonical order. Note that rank alone is not necessarily enough for a
-// complete ordering, as constants all have the same rank. However, generally,
-// we will simplify an operation with all constants so that it doesn't matter
-// what order they appear in.
-unsigned int NewGVN::getRank(const Value *V) const {
- // Prefer constants to undef to anything else
- // Undef is a constant, have to check it first.
- // Prefer smaller constants to constantexprs
- if (isa<ConstantExpr>(V))
- return 2;
- if (isa<UndefValue>(V))
- return 1;
- if (isa<Constant>(V))
- return 0;
- else if (auto *A = dyn_cast<Argument>(V))
- return 3 + A->getArgNo();
-
- // Need to shift the instruction DFS by number of arguments + 3 to account for
- // the constant and argument ranking above.
- unsigned Result = InstrToDFSNum(V);
- if (Result > 0)
- return 4 + NumFuncArgs + Result;
- // Unreachable or something else, just return a really large number.
- return ~0;
-}
-
-// This is a function that says whether two commutative operations should
-// have their order swapped when canonicalizing.
-bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
- // Because we only care about a total ordering, and don't rewrite expressions
- // in this order, we order by rank, which will give a strict weak ordering to
- // everything but constants, and then we order by pointer address.
- return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
-}
-
-namespace {
-
-class NewGVNLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid.
- static char ID;
-
- NewGVNLegacyPass() : FunctionPass(ID) {
- initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-bool NewGVNLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
- return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
- F.getParent()->getDataLayout())
- .runGVN();
-}
-
-char NewGVNLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
- false)
-
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
-  // Apparently the order in which we get these results matters for
- // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
- // the same order here, just in case.
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
- bool Changed =
- NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
- .runGVN();
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
+ }
+
+ // Now a standard depth first ordering of the domtree is equivalent to RPO.
+ for (auto DTN : depth_first(DT->getRootNode())) {
+ BasicBlock *B = DTN->getBlock();
+ const auto &BlockRange = assignDFSNumbers(B, ICount);
+ BlockInstRange.insert({B, BlockRange});
+ ICount += BlockRange.second - BlockRange.first;
+ }
+ initializeCongruenceClasses(F);
+
+ TouchedInstructions.resize(ICount);
+ // Ensure we don't end up resizing the expressionToClass map, as
+ // that can be quite expensive. At most, we have one expression per
+ // instruction.
+ ExpressionToClass.reserve(ICount);
+
+ // Initialize the touched instructions to include the entry block.
+ const auto &InstRange = BlockInstRange.lookup(&F.getEntryBlock());
+ TouchedInstructions.set(InstRange.first, InstRange.second);
+ LLVM_DEBUG(dbgs() << "Block " << getBlockName(&F.getEntryBlock())
+ << " marked reachable\n");
+ ReachableBlocks.insert(&F.getEntryBlock());
+
+ iterateTouchedInstructions();
+ verifyMemoryCongruency();
+ verifyIterationSettled(F);
+ verifyStoreExpressions();
+
+ Changed |= eliminateInstructions(F);
+
+ // Delete all instructions marked for deletion.
+ for (Instruction *ToErase : InstructionsToErase) {
+ if (!ToErase->use_empty())
+ ToErase->replaceAllUsesWith(UndefValue::get(ToErase->getType()));
+
+ assert(ToErase->getParent() &&
+ "BB containing ToErase deleted unexpectedly!");
+ ToErase->eraseFromParent();
+ }
+ Changed |= !InstructionsToErase.empty();
+
+ // Delete all unreachable blocks.
+ auto UnreachableBlockPred = [&](const BasicBlock &BB) {
+ return !ReachableBlocks.count(&BB);
+ };
+
+ for (auto &BB : make_filter_range(F, UnreachableBlockPred)) {
+ LLVM_DEBUG(dbgs() << "We believe block " << getBlockName(&BB)
+ << " is unreachable\n");
+ deleteInstructionsInBlock(&BB);
+ Changed = true;
+ }
+
+ cleanupTables();
+ return Changed;
+}
+
+struct NewGVN::ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ int LocalNum = 0;
+
+ // Only one of Def and U will be set.
+ // The bool in the Def tells us whether the Def is the stored value of a
+ // store.
+ PointerIntPair<Value *, 1, bool> Def;
+ Use *U = nullptr;
+
+ bool operator<(const ValueDFS &Other) const {
+ // It's not enough that any given field be less than - we have sets
+ // of fields that need to be evaluated together to give a proper ordering.
+ // For example, if you have;
+ // DFS (1, 3)
+ // Val 0
+ // DFS (1, 2)
+ // Val 50
+ // We want the second to be less than the first, but if we just go field
+ // by field, we will get to Val 0 < Val 50 and say the first is less than
+ // the second. We only want it to be less than if the DFS orders are equal.
+ //
+ // Each LLVM instruction only produces one value, and thus the lowest-level
+    // differentiator that really matters for the stack (and what we use as a
+ // replacement) is the local dfs number.
+ // Everything else in the structure is instruction level, and only affects
+ // the order in which we will replace operands of a given instruction.
+ //
+ // For a given instruction (IE things with equal dfsin, dfsout, localnum),
+ // the order of replacement of uses does not matter.
+ // IE given,
+ // a = 5
+ // b = a + a
+ // When you hit b, you will have two valuedfs with the same dfsin, out, and
+ // localnum.
+ // The .val will be the same as well.
+ // The .u's will be different.
+ // You will replace both, and it does not matter what order you replace them
+ // in (IE whether you replace operand 2, then operand 1, or operand 1, then
+ // operand 2).
+ // Similarly for the case of same dfsin, dfsout, localnum, but different
+ // .val's
+ // a = 5
+ // b = 6
+ // c = a + b
+    // in c, we will have a valuedfs for a, and one for b, with everything the same
+ // but .val and .u.
+ // It does not matter what order we replace these operands in.
+ // You will always end up with the same IR, and this is guaranteed.
+ return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+ std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
+ Other.U);
+ }
+};
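
A stand-alone illustration of why the comparison uses std::tie: it yields a lexicographic order, so LocalNum only breaks ties once both DFS numbers are equal. The struct below is a cut-down stand-in, not the LLVM one:

#include <cassert>
#include <tuple>

struct Key {
  int DFSIn, DFSOut, LocalNum;
  bool operator<(const Key &O) const {
    return std::tie(DFSIn, DFSOut, LocalNum) <
           std::tie(O.DFSIn, O.DFSOut, O.LocalNum);
  }
};

int main() {
  Key A{1, 3, 0}, B{1, 2, 50};
  // Comparing LocalNum alone would put A first, but the lexicographic rule
  // reaches DFSOut before LocalNum, so B (DFS (1,2)) sorts before A (DFS (1,3)),
  // exactly the ordering the comment above asks for.
  assert(B < A && !(A < B));
  return 0;
}
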
+
+// This function converts the set of members for a congruence class from values,
+// to sets of defs and uses with associated DFS info. The total number of
+// reachable uses for each value is stored in UseCounts, and instructions that
+// seem dead (have no non-dead uses) are stored in ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+ const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+ DenseMap<const Value *, unsigned int> &UseCounts,
+ SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
+ for (auto D : Dense) {
+ // First add the value.
+ BasicBlock *BB = getBlockForValue(D);
+ // Constants are handled prior to ever calling this function, so
+ // we should only be left with instructions as members.
+ assert(BB && "Should have figured out a basic block for value");
+ ValueDFS VDDef;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VDDef.DFSIn = DomNode->getDFSNumIn();
+ VDDef.DFSOut = DomNode->getDFSNumOut();
+ // If it's a store, use the leader of the value operand, if it's always
+ // available, or the value operand. TODO: We could do dominance checks to
+ // find a dominating leader, but not worth it ATM.
+ if (auto *SI = dyn_cast<StoreInst>(D)) {
+ auto Leader = lookupOperandLeader(SI->getValueOperand());
+ if (alwaysAvailable(Leader)) {
+ VDDef.Def.setPointer(Leader);
+ } else {
+ VDDef.Def.setPointer(SI->getValueOperand());
+ VDDef.Def.setInt(true);
+ }
+ } else {
+ VDDef.Def.setPointer(D);
+ }
+ assert(isa<Instruction>(D) &&
+ "The dense set member should always be an instruction");
+ Instruction *Def = cast<Instruction>(D);
+ VDDef.LocalNum = InstrToDFSNum(D);
+ DFSOrderedSet.push_back(VDDef);
+ // If there is a phi node equivalent, add it
+ if (auto *PN = RealToTemp.lookup(Def)) {
+ auto *PHIE =
+ dyn_cast_or_null<PHIExpression>(ValueToExpression.lookup(Def));
+ if (PHIE) {
+ VDDef.Def.setInt(false);
+ VDDef.Def.setPointer(PN);
+ VDDef.LocalNum = 0;
+ DFSOrderedSet.push_back(VDDef);
+ }
+ }
+
+ unsigned int UseCount = 0;
+ // Now add the uses.
+ for (auto &U : Def->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ // Don't try to replace into dead uses
+ if (InstructionsToErase.count(I))
+ continue;
+ ValueDFS VDUse;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *P = dyn_cast<PHINode>(I)) {
+ IBlock = P->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VDUse.LocalNum = InstrDFS.size() + 1;
+ } else {
+ IBlock = getBlockForValue(I);
+ VDUse.LocalNum = InstrToDFSNum(I);
+ }
+
+ // Skip uses in unreachable blocks, as we're going
+ // to delete them.
+ if (ReachableBlocks.count(IBlock) == 0)
+ continue;
+
+ DomTreeNode *DomNode = DT->getNode(IBlock);
+ VDUse.DFSIn = DomNode->getDFSNumIn();
+ VDUse.DFSOut = DomNode->getDFSNumOut();
+ VDUse.U = &U;
+ ++UseCount;
+ DFSOrderedSet.emplace_back(VDUse);
+ }
+ }
+
+    // If there are no uses, it's probably dead (but it may have side-effects,
+    // so it is not definitely dead). Otherwise, store the number of uses so we
+    // can track whether it becomes dead later.
+ if (UseCount == 0)
+ ProbablyDead.insert(Def);
+ else
+ UseCounts[Def] = UseCount;
+ }
+}
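
The DFSIn/DFSOut numbers collected here encode dominance as interval nesting, which is what the elimination walk later relies on. A small sketch of that property with hand-picked numbers rather than a real dominator tree:

#include <cassert>

struct Interval { int In, Out; };

// A dominates B exactly when B's interval nests inside A's.
static bool dominates(Interval A, Interval B) {
  return A.In <= B.In && B.Out <= A.Out;
}

int main() {
  // Entry covers both children; the two siblings do not dominate each other.
  Interval Entry{1, 8}, Left{2, 5}, Right{6, 7};
  assert(dominates(Entry, Left) && dominates(Entry, Right));
  assert(!dominates(Left, Right) && !dominates(Right, Left));
  return 0;
}
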
+
+// This function converts the set of members for a congruence class from values,
+// to the set of defs for loads and stores, with associated DFS info.
+void NewGVN::convertClassToLoadsAndStores(
+ const CongruenceClass &Dense,
+ SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
+ for (auto D : Dense) {
+ if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
+ continue;
+
+ BasicBlock *BB = getBlockForValue(D);
+ ValueDFS VD;
+ DomTreeNode *DomNode = DT->getNode(BB);
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.Def.setPointer(D);
+
+ // If it's an instruction, use the real local dfs number.
+ if (auto *I = dyn_cast<Instruction>(D))
+ VD.LocalNum = InstrToDFSNum(I);
+ else
+ llvm_unreachable("Should have been an instruction");
+
+ LoadsAndStores.emplace_back(VD);
+ }
+}
+
+static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
+ patchReplacementInstruction(I, Repl);
+ I->replaceAllUsesWith(Repl);
+}
+
+void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ ++NumGVNBlocksDeleted;
+
+  // Delete the instructions backwards, as doing so reduces the number of
+  // def-use and use-def chains we have to update. Start after the terminator.
+ auto StartPoint = BB->rbegin();
+ ++StartPoint;
+ // Note that we explicitly recalculate BB->rend() on each iteration,
+ // as it may change when we remove the first instruction.
+ for (BasicBlock::reverse_iterator I(StartPoint); I != BB->rend();) {
+ Instruction &Inst = *I++;
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+ if (isa<LandingPadInst>(Inst))
+ continue;
+ salvageKnowledge(&Inst, AC);
+
+ Inst.eraseFromParent();
+ ++NumGVNInstrDeleted;
+ }
+ // Now insert something that simplifycfg will turn into an unreachable.
+ Type *Int8Ty = Type::getInt8Ty(BB->getContext());
+ new StoreInst(UndefValue::get(Int8Ty),
+ Constant::getNullValue(Int8Ty->getPointerTo()),
+ BB->getTerminator());
+}
+
+void NewGVN::markInstructionForDeletion(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Marking " << *I << " for deletion\n");
+ InstructionsToErase.insert(I);
+}
+
+void NewGVN::replaceInstruction(Instruction *I, Value *V) {
+ LLVM_DEBUG(dbgs() << "Replacing " << *I << " with " << *V << "\n");
+ patchAndReplaceAllUsesWith(I, V);
+ // We save the actual erasing to avoid invalidating memory
+ // dependencies until we are done with everything.
+ markInstructionForDeletion(I);
+}
+
+namespace {
+
+// This is a stack that contains both the value and dfs info of where
+// that value is valid.
+class ValueDFSStack {
+public:
+ Value *back() const { return ValueStack.back(); }
+ std::pair<int, int> dfs_back() const { return DFSStack.back(); }
+
+ void push_back(Value *V, int DFSIn, int DFSOut) {
+ ValueStack.emplace_back(V);
+ DFSStack.emplace_back(DFSIn, DFSOut);
+ }
+
+ bool empty() const { return DFSStack.empty(); }
+
+ bool isInScope(int DFSIn, int DFSOut) const {
+ if (empty())
+ return false;
+ return DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second;
+ }
+
+ void popUntilDFSScope(int DFSIn, int DFSOut) {
+
+ // These two should always be in sync at this point.
+ assert(ValueStack.size() == DFSStack.size() &&
+ "Mismatch between ValueStack and DFSStack");
+ while (
+ !DFSStack.empty() &&
+ !(DFSIn >= DFSStack.back().first && DFSOut <= DFSStack.back().second)) {
+ DFSStack.pop_back();
+ ValueStack.pop_back();
+ }
+ }
+
+private:
+ SmallVector<Value *, 8> ValueStack;
+ SmallVector<std::pair<int, int>, 8> DFSStack;
+};
+
+} // end anonymous namespace
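
A stand-alone sketch of the same scope-stack idea using std:: containers instead of SmallVector; the leader names and DFS numbers below are made up for illustration:

#include <cassert>
#include <string>
#include <utility>
#include <vector>

class ScopeStack {
  std::vector<std::string> Values;
  std::vector<std::pair<int, int>> Scopes;

public:
  void push(std::string V, int In, int Out) {
    Values.push_back(std::move(V));
    Scopes.emplace_back(In, Out);
  }
  bool empty() const { return Scopes.empty(); }
  const std::string &back() const { return Values.back(); }
  // Pop every entry whose DFS interval does not contain (In, Out).
  void popUntilScope(int In, int Out) {
    while (!Scopes.empty() &&
           !(In >= Scopes.back().first && Out <= Scopes.back().second)) {
      Scopes.pop_back();
      Values.pop_back();
    }
  }
};

int main() {
  ScopeStack S;
  S.push("%leader.entry", 1, 8); // valid over the whole function
  S.push("%leader.left", 2, 5);  // valid only in the left subtree
  S.popUntilScope(6, 7);         // the walk moves to the right subtree
  // The left-subtree leader is popped; the function-wide leader survives.
  assert(!S.empty() && S.back() == "%leader.entry");
  return 0;
}
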
+
+// Given an expression, get the congruence class for it.
+CongruenceClass *NewGVN::getClassForExpression(const Expression *E) const {
+ if (auto *VE = dyn_cast<VariableExpression>(E))
+ return ValueToClass.lookup(VE->getVariableValue());
+ else if (isa<DeadExpression>(E))
+ return TOPClass;
+ return ExpressionToClass.lookup(E);
+}
+
+// Given a value and a basic block we are trying to see if it is available in,
+// see if the value has a leader available in that block.
+Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
+ const Instruction *OrigInst,
+ const BasicBlock *BB) const {
+ // It would already be constant if we could make it constant
+ if (auto *CE = dyn_cast<ConstantExpression>(E))
+ return CE->getConstantValue();
+ if (auto *VE = dyn_cast<VariableExpression>(E)) {
+ auto *V = VE->getVariableValue();
+ if (alwaysAvailable(V) || DT->dominates(getBlockForValue(V), BB))
+ return VE->getVariableValue();
+ }
+
+ auto *CC = getClassForExpression(E);
+ if (!CC)
+ return nullptr;
+ if (alwaysAvailable(CC->getLeader()))
+ return CC->getLeader();
+
+ for (auto Member : *CC) {
+ auto *MemberInst = dyn_cast<Instruction>(Member);
+ if (MemberInst == OrigInst)
+ continue;
+ // Anything that isn't an instruction is always available.
+ if (!MemberInst)
+ return Member;
+ if (DT->dominates(getBlockForValue(MemberInst), BB))
+ return Member;
+ }
+ return nullptr;
+}
+
+bool NewGVN::eliminateInstructions(Function &F) {
+ // This is a non-standard eliminator. The normal way to eliminate is
+ // to walk the dominator tree in order, keeping track of available
+ // values, and eliminating them. However, this is mildly
+ // pointless. It requires doing lookups on every instruction,
+  // regardless of whether we will ever eliminate it. For instructions in
+  // singleton congruence classes (which is most of them), we know we will
+  // never eliminate them.
+
+ // Instead, this eliminator looks at the congruence classes directly, sorts
+ // them into a DFS ordering of the dominator tree, and then we just
+ // perform elimination straight on the sets by walking the congruence
+ // class member uses in order, and eliminate the ones dominated by the
+ // last member. This is worst case O(E log E) where E = number of
+ // instructions in a single congruence class. In theory, this is all
+ // instructions. In practice, it is much faster, as most instructions are
+ // either in singleton congruence classes or can't possibly be eliminated
+ // anyway (if there are no overlapping DFS ranges in class).
+ // When we find something not dominated, it becomes the new leader
+ // for elimination purposes.
+ // TODO: If we wanted to be faster, We could remove any members with no
+ // overlapping ranges while sorting, as we will never eliminate anything
+ // with those members, as they don't dominate anything else in our set.
+
+ bool AnythingReplaced = false;
+
+ // Since we are going to walk the domtree anyway, and we can't guarantee the
+ // DFS numbers are updated, we compute some ourselves.
+ DT->updateDFSNumbers();
+
+ // Go through all of our phi nodes, and kill the arguments associated with
+ // unreachable edges.
+ auto ReplaceUnreachablePHIArgs = [&](PHINode *PHI, BasicBlock *BB) {
+ for (auto &Operand : PHI->incoming_values())
+ if (!ReachableEdges.count({PHI->getIncomingBlock(Operand), BB})) {
+ LLVM_DEBUG(dbgs() << "Replacing incoming value of " << PHI
+ << " for block "
+ << getBlockName(PHI->getIncomingBlock(Operand))
+ << " with undef due to it being unreachable\n");
+ Operand.set(UndefValue::get(PHI->getType()));
+ }
+ };
+ // Replace unreachable phi arguments.
+ // At this point, RevisitOnReachabilityChange only contains:
+ //
+ // 1. PHIs
+ // 2. Temporaries that will convert to PHIs
+ // 3. Operations that are affected by an unreachable edge but do not fit into
+ // 1 or 2 (rare).
+ // So it is a slight overshoot of what we want. We could make it exact by
+ // using two SparseBitVectors per block.
+ DenseMap<const BasicBlock *, unsigned> ReachablePredCount;
+ for (auto &KV : ReachableEdges)
+ ReachablePredCount[KV.getEnd()]++;
+ for (auto &BBPair : RevisitOnReachabilityChange) {
+ for (auto InstNum : BBPair.second) {
+ auto *Inst = InstrFromDFSNum(InstNum);
+ auto *PHI = dyn_cast<PHINode>(Inst);
+ PHI = PHI ? PHI : dyn_cast_or_null<PHINode>(RealToTemp.lookup(Inst));
+ if (!PHI)
+ continue;
+ auto *BB = BBPair.first;
+ if (ReachablePredCount.lookup(BB) != PHI->getNumIncomingValues())
+ ReplaceUnreachablePHIArgs(PHI, BB);
+ }
+ }
+
+ // Map to store the use counts
+ DenseMap<const Value *, unsigned int> UseCounts;
+ for (auto *CC : reverse(CongruenceClasses)) {
+ LLVM_DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+ << "\n");
+ // Track the equivalent store info so we can decide whether to try
+ // dead store elimination.
+ SmallVector<ValueDFS, 8> PossibleDeadStores;
+ SmallPtrSet<Instruction *, 8> ProbablyDead;
+ if (CC->isDead() || CC->empty())
+ continue;
+ // Everything still in the TOP class is unreachable or dead.
+ if (CC == TOPClass) {
+ for (auto M : *CC) {
+ auto *VTE = ValueToExpression.lookup(M);
+ if (VTE && isa<DeadExpression>(VTE))
+ markInstructionForDeletion(cast<Instruction>(M));
+ assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+ InstructionsToErase.count(cast<Instruction>(M))) &&
+ "Everything in TOP should be unreachable or dead at this "
+ "point");
+ }
+ continue;
+ }
+
+ assert(CC->getLeader() && "We should have had a leader");
+ // If this is a leader that is always available, and it's a
+ // constant or has no equivalences, just replace everything with
+ // it. We then update the congruence class with whatever members
+ // are left.
+ Value *Leader =
+ CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+ if (alwaysAvailable(Leader)) {
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto M : *CC) {
+ Value *Member = M;
+ // Void things have no uses we can replace.
+ if (Member == Leader || !isa<Instruction>(Member) ||
+ Member->getType()->isVoidTy()) {
+ MembersLeft.insert(Member);
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Found replacement " << *(Leader) << " for "
+ << *Member << "\n");
+ auto *I = cast<Instruction>(Member);
+ assert(Leader != I && "About to accidentally remove our leader");
+ replaceInstruction(I, Leader);
+ AnythingReplaced = true;
+ }
+ CC->swap(MembersLeft);
+ } else {
+ // If this is a singleton, we can skip it.
+ if (CC->size() != 1 || RealToTemp.count(Leader)) {
+ // This is a stack because equality replacement/etc may place
+ // constants in the middle of the member list, and we want to use
+ // those constant values in preference to the current leader, over
+ // the scope of those constants.
+ ValueDFSStack EliminationStack;
+
+ // Convert the members to DFS ordered sets and then merge them.
+ SmallVector<ValueDFS, 8> DFSOrderedSet;
+ convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
+
+ // Sort the whole thing.
+ llvm::sort(DFSOrderedSet);
+ for (auto &VD : DFSOrderedSet) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Value *Def = VD.Def.getPointer();
+ bool FromStore = VD.Def.getInt();
+ Use *U = VD.U;
+ // We ignore void things because we can't get a value from them.
+ if (Def && Def->getType()->isVoidTy())
+ continue;
+ auto *DefInst = dyn_cast_or_null<Instruction>(Def);
+ if (DefInst && AllTempInstructions.count(DefInst)) {
+ auto *PN = cast<PHINode>(DefInst);
+
+            // If this is a value phi and that's the expression we used, insert
+            // it into the program and remove it from the temp instruction list.
+ AllTempInstructions.erase(PN);
+ auto *DefBlock = getBlockForValue(Def);
+ LLVM_DEBUG(dbgs() << "Inserting fully real phi of ops" << *Def
+ << " into block "
+ << getBlockName(getBlockForValue(Def)) << "\n");
+ PN->insertBefore(&DefBlock->front());
+ Def = PN;
+ NumGVNPHIOfOpsEliminations++;
+ }
+
+ if (EliminationStack.empty()) {
+ LLVM_DEBUG(dbgs() << "Elimination Stack is empty\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Elimination Stack Top DFS numbers are ("
+ << EliminationStack.dfs_back().first << ","
+ << EliminationStack.dfs_back().second << ")\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << MemberDFSIn << ","
+ << MemberDFSOut << ")\n");
+ // First, we see if we are out of scope or empty. If so,
+          // and there are equivalences, we try to replace the top of
+ // stack with equivalences (if it's on the stack, it must
+ // not have been eliminated yet).
+ // Then we synchronize to our current scope, by
+ // popping until we are back within a DFS scope that
+ // dominates the current member.
+ // Then, what happens depends on a few factors
+ // If the stack is now empty, we need to push
+ // If we have a constant or a local equivalence we want to
+ // start using, we also push.
+ // Otherwise, we walk along, processing members who are
+ // dominated by this scope, and eliminate them.
+ bool ShouldPush = Def && EliminationStack.empty();
+ bool OutOfScope =
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
+
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ bool ShouldPush = Def && EliminationStack.empty();
+ if (ShouldPush) {
+ EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+ }
+ }
+
+ // Skip the Def's, we only want to eliminate on their uses. But mark
+ // dominated defs as dead.
+ if (Def) {
+ // For anything in this case, what and how we value number
+            // guarantees that any side-effects that would have occurred (ie
+ // throwing, etc) can be proven to either still occur (because it's
+ // dominated by something that has the same side-effects), or never
+ // occur. Otherwise, we would not have been able to prove it value
+ // equivalent to something else. For these things, we can just mark
+ // it all dead. Note that this is different from the "ProbablyDead"
+ // set, which may not be dominated by anything, and thus, are only
+ // easy to prove dead if they are also side-effect free. Note that
+ // because stores are put in terms of the stored value, we skip
+ // stored values here. If the stored value is really dead, it will
+ // still be marked for deletion when we process it in its own class.
+ if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+ isa<Instruction>(Def) && !FromStore)
+ markInstructionForDeletion(cast<Instruction>(Def));
+ continue;
+ }
+ // At this point, we know it is a Use we are trying to possibly
+ // replace.
+
+ assert(isa<Instruction>(U->get()) &&
+ "Current def should have been an instruction");
+ assert(isa<Instruction>(U->getUser()) &&
+ "Current user should have been an instruction");
+
+ // If the thing we are replacing into is already marked to be dead,
+ // this use is dead. Note that this is true regardless of whether
+ // we have anything dominating the use or not. We do this here
+ // because we are already walking all the uses anyway.
+ Instruction *InstUse = cast<Instruction>(U->getUser());
+ if (InstructionsToErase.count(InstUse)) {
+ auto &UseCount = UseCounts[U->get()];
+ if (--UseCount == 0) {
+ ProbablyDead.insert(cast<Instruction>(U->get()));
+ }
+ }
+
+ // If we get to this point, and the stack is empty we must have a use
+ // with nothing we can use to eliminate this use, so just skip it.
+ if (EliminationStack.empty())
+ continue;
+
+ Value *DominatingLeader = EliminationStack.back();
+
+ auto *II = dyn_cast<IntrinsicInst>(DominatingLeader);
+ bool isSSACopy = II && II->getIntrinsicID() == Intrinsic::ssa_copy;
+ if (isSSACopy)
+ DominatingLeader = II->getOperand(0);
+
+ // Don't replace our existing users with ourselves.
+ if (U->get() == DominatingLeader)
+ continue;
+ LLVM_DEBUG(dbgs()
+ << "Found replacement " << *DominatingLeader << " for "
+ << *U->get() << " in " << *(U->getUser()) << "\n");
+
+ // If we replaced something in an instruction, handle the patching of
+ // metadata. Skip this if we are replacing predicateinfo with its
+ // original operand, as we already know we can just drop it.
+ auto *ReplacedInst = cast<Instruction>(U->get());
+ auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+ if (!PI || DominatingLeader != PI->OriginalOp)
+ patchReplacementInstruction(ReplacedInst, DominatingLeader);
+ U->set(DominatingLeader);
+ // This is now a use of the dominating leader, which means if the
+ // dominating leader was dead, it's now live!
+ auto &LeaderUseCount = UseCounts[DominatingLeader];
+ // It's about to be alive again.
+ if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+ ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+ // For copy instructions, we use their operand as a leader,
+ // which means we remove a user of the copy and it may become dead.
+ if (isSSACopy) {
+ unsigned &IIUseCount = UseCounts[II];
+ if (--IIUseCount == 0)
+ ProbablyDead.insert(II);
+ }
+ ++LeaderUseCount;
+ AnythingReplaced = true;
+ }
+ }
+ }
+
+    // At this point, anything still in the ProbablyDead set is actually dead
+    // if it would be trivially dead.
+ for (auto *I : ProbablyDead)
+ if (wouldInstructionBeTriviallyDead(I))
+ markInstructionForDeletion(I);
+
+ // Cleanup the congruence class.
+ CongruenceClass::MemberSet MembersLeft;
+ for (auto *Member : *CC)
+ if (!isa<Instruction>(Member) ||
+ !InstructionsToErase.count(cast<Instruction>(Member)))
+ MembersLeft.insert(Member);
+ CC->swap(MembersLeft);
+
+ // If we have possible dead stores to look at, try to eliminate them.
+ if (CC->getStoreCount() > 0) {
+ convertClassToLoadsAndStores(*CC, PossibleDeadStores);
+ llvm::sort(PossibleDeadStores);
+ ValueDFSStack EliminationStack;
+ for (auto &VD : PossibleDeadStores) {
+ int MemberDFSIn = VD.DFSIn;
+ int MemberDFSOut = VD.DFSOut;
+ Instruction *Member = cast<Instruction>(VD.Def.getPointer());
+ if (EliminationStack.empty() ||
+ !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
+ // Sync to our current scope.
+ EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
+ if (EliminationStack.empty()) {
+ EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+ continue;
+ }
+ }
+ // We already did load elimination, so nothing to do here.
+ if (isa<LoadInst>(Member))
+ continue;
+ assert(!EliminationStack.empty());
+ Instruction *Leader = cast<Instruction>(EliminationStack.back());
+ (void)Leader;
+ assert(DT->dominates(Leader->getParent(), Member->getParent()));
+        // Member is dominated by Leader, and thus dead.
+ LLVM_DEBUG(dbgs() << "Marking dead store " << *Member
+ << " that is dominated by " << *Leader << "\n");
+ markInstructionForDeletion(Member);
+ CC->erase(Member);
+ ++NumGVNDeadStores;
+ }
+ }
+ }
+ return AnythingReplaced;
+}
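The use-count bookkeeping above is subtle, so here is a minimal self-contained C++ sketch of the same idea using toy types (nothing here comes from the pass itself): a value whose use count drops to zero is only a candidate, and the final sweep keeps just the side-effect-free ones, mirroring the wouldInstructionBeTriviallyDead() check.

#include <unordered_map>
#include <unordered_set>
#include <vector>

// Toy stand-in for an instruction; HasSideEffects models "not trivially dead".
struct ToyInst { bool HasSideEffects; };

int main() {
  std::unordered_map<ToyInst *, unsigned> UseCounts;
  std::unordered_set<ToyInst *> ProbablyDead;
  ToyInst Pure{false}, Store{true};
  UseCounts[&Pure] = 1;
  UseCounts[&Store] = 1;

  // Pretend each value just lost its last user (e.g. that user was erased).
  for (ToyInst *I : {&Pure, &Store})
    if (--UseCounts[I] == 0)
      ProbablyDead.insert(I);

  // Final sweep: only side-effect-free candidates are actually deleted.
  std::vector<ToyInst *> ToErase;
  for (ToyInst *I : ProbablyDead)
    if (!I->HasSideEffects)
      ToErase.push_back(I);

  return static_cast<int>(ToErase.size()); // 1: only Pure is erased.
}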
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order. Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank. However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
+unsigned int NewGVN::getRank(const Value *V) const {
+  // Prefer constants to undef to anything else.
+  // Undef is a constant, so we have to check for it first.
+  // Prefer smaller constants to constantexprs.
+ if (isa<ConstantExpr>(V))
+ return 2;
+ if (isa<UndefValue>(V))
+ return 1;
+ if (isa<Constant>(V))
+ return 0;
+ else if (auto *A = dyn_cast<Argument>(V))
+ return 3 + A->getArgNo();
+
+  // Need to shift the instruction DFS number past the constant and argument
+  // ranks assigned above.
+ unsigned Result = InstrToDFSNum(V);
+ if (Result > 0)
+ return 4 + NumFuncArgs + Result;
+ // Unreachable or something else, just return a really large number.
+ return ~0;
+}
+
+// This is a function that says whether the two operands of a commutative
+// operation should have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+ // Because we only care about a total ordering, and don't rewrite expressions
+ // in this order, we order by rank, which will give a strict weak ordering to
+ // everything but constants, and then we order by pointer address.
+ return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
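As a rough, standalone illustration of the ordering that getRank() and shouldSwapOperands() impose (toy ranks only, not the pass's real DFS numbering): the comparison key is (rank, pointer), so a pair of commutative operands always ends up in the same relative order no matter how the expression was originally written.

#include <utility>

// Toy ranks in the spirit of getRank(): constants lowest, then undef and
// constant expressions; "instructions" get larger ranks from a DFS-like number.
enum ToyRank : unsigned { RankConstant = 0, RankUndef = 1, RankConstExpr = 2 };

struct ToyValue { unsigned Rank; };

static bool shouldSwap(const ToyValue *A, const ToyValue *B) {
  // Same scheme as shouldSwapOperands(): order by rank, break ties by address.
  return std::make_pair(A->Rank, A) > std::make_pair(B->Rank, B);
}

int main() {
  ToyValue C{RankConstant}; // e.g. a ConstantInt: lowest rank
  ToyValue I{42};           // e.g. an instruction: rank derived from its DFS number
  // Exactly one presentation of the pair asks for a swap, so the canonical
  // order is the same either way the operands are handed in.
  bool SwapCI = shouldSwap(&C, &I);
  bool SwapIC = shouldSwap(&I, &C);
  return (SwapCI != SwapIC) ? 0 : 1;
}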
+
+namespace {
+
+class NewGVNLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid.
+ static char ID;
+
+ NewGVNLegacyPass() : FunctionPass(ID) {
+ initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+ return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+ F.getParent()->getDataLayout())
+ .runGVN();
+}
+
+char NewGVNLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+ false)
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+  // Apparently the order in which we get these results matters for
+ // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+ // the same order here, just in case.
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ bool Changed =
+ NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+ .runGVN();
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
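For context, a hedged sketch of how NewGVNPass might be driven under the new pass manager; the helper name runNewGVNOn and the bare-bones analysis registration are illustrative, not part of this file.

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/NewGVN.h"

using namespace llvm;

// Hypothetical helper: run NewGVN on one function and report whether it
// changed anything (i.e. whether any analyses were invalidated).
static bool runNewGVNOn(Function &F) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(NewGVNPass());
  PreservedAnalyses PA = FPM.run(F, FAM);
  return !PA.areAllPreserved();
}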
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 9ae47d54e8..58763ec72e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -1,186 +1,186 @@
-//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass tries to partially inline the fast path of well-known library
-// functions, such as using square-root instructions for cases where sqrt()
-// does not need to set errno.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "partially-inline-libcalls"
-
-DEBUG_COUNTER(PILCounter, "partially-inline-libcalls-transform",
- "Controls transformations in partially-inline-libcalls");
-
-static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
- BasicBlock &CurrBB, Function::iterator &BB,
- const TargetTransformInfo *TTI) {
- // There is no need to change the IR, since backend will emit sqrt
- // instruction if the call has already been marked read-only.
- if (Call->onlyReadsMemory())
- return false;
-
- if (!DebugCounter::shouldExecute(PILCounter))
- return false;
-
- // Do the following transformation:
- //
- // (before)
- // dst = sqrt(src)
- //
- // (after)
- // v0 = sqrt_noreadmem(src) # native sqrt instruction.
- // [if (v0 is a NaN) || if (src < 0)]
- // v1 = sqrt(src) # library call.
- // dst = phi(v0, v1)
- //
-
- // Move all instructions following Call to newly created block JoinBB.
- // Create phi and replace all uses.
- BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
- IRBuilder<> Builder(JoinBB, JoinBB->begin());
- Type *Ty = Call->getType();
- PHINode *Phi = Builder.CreatePHI(Ty, 2);
- Call->replaceAllUsesWith(Phi);
-
- // Create basic block LibCallBB and insert a call to library function sqrt.
- BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
- CurrBB.getParent(), JoinBB);
- Builder.SetInsertPoint(LibCallBB);
- Instruction *LibCall = Call->clone();
- Builder.Insert(LibCall);
- Builder.CreateBr(JoinBB);
-
- // Add attribute "readnone" so that backend can use a native sqrt instruction
- // for this call. Insert a FP compare instruction and a conditional branch
- // at the end of CurrBB.
- Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
- CurrBB.getTerminator()->eraseFromParent();
- Builder.SetInsertPoint(&CurrBB);
- Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
- ? Builder.CreateFCmpORD(Call, Call)
- : Builder.CreateFCmpOGE(Call->getOperand(0),
- ConstantFP::get(Ty, 0.0));
- Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
-
- // Add phi operands.
- Phi->addIncoming(Call, &CurrBB);
- Phi->addIncoming(LibCall, LibCallBB);
-
- BB = JoinBB->getIterator();
- return true;
-}
-
-static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI) {
- bool Changed = false;
-
- Function::iterator CurrBB;
- for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
- CurrBB = BB++;
-
- for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
- II != IE; ++II) {
- CallInst *Call = dyn_cast<CallInst>(&*II);
- Function *CalledFunc;
-
- if (!Call || !(CalledFunc = Call->getCalledFunction()))
- continue;
-
- if (Call->isNoBuiltin())
- continue;
-
- // Skip if function either has local linkage or is not a known library
- // function.
- LibFunc LF;
- if (CalledFunc->hasLocalLinkage() ||
- !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
- continue;
-
- switch (LF) {
- case LibFunc_sqrtf:
- case LibFunc_sqrt:
- if (TTI->haveFastSqrt(Call->getType()) &&
- optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
- break;
- continue;
- default:
- continue;
- }
-
- Changed = true;
- break;
- }
- }
-
- return Changed;
-}
-
-PreservedAnalyses
-PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
- return PreservedAnalyses::all();
- return PreservedAnalyses::none();
-}
-
-namespace {
-class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
- initializePartiallyInlineLibCallsLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- const TargetTransformInfo *TTI =
- &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return runPartiallyInlineLibCalls(F, TLI, TTI);
- }
-};
-}
-
-char PartiallyInlineLibCallsLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
- "partially-inline-libcalls",
- "Partially inline calls to library functions", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
- "partially-inline-libcalls",
- "Partially inline calls to library functions", false, false)
-
-FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
- return new PartiallyInlineLibCallsLegacyPass();
-}
+//===--- PartiallyInlineLibCalls.cpp - Partially inline libcalls ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to partially inline the fast path of well-known library
+// functions, such as using square-root instructions for cases where sqrt()
+// does not need to set errno.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/PartiallyInlineLibCalls.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "partially-inline-libcalls"
+
+DEBUG_COUNTER(PILCounter, "partially-inline-libcalls-transform",
+ "Controls transformations in partially-inline-libcalls");
+
+static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
+ BasicBlock &CurrBB, Function::iterator &BB,
+ const TargetTransformInfo *TTI) {
+  // There is no need to change the IR, since the backend will emit a sqrt
+  // instruction if the call has already been marked read-only.
+ if (Call->onlyReadsMemory())
+ return false;
+
+ if (!DebugCounter::shouldExecute(PILCounter))
+ return false;
+
+ // Do the following transformation:
+ //
+ // (before)
+ // dst = sqrt(src)
+ //
+ // (after)
+ // v0 = sqrt_noreadmem(src) # native sqrt instruction.
+ // [if (v0 is a NaN) || if (src < 0)]
+ // v1 = sqrt(src) # library call.
+ // dst = phi(v0, v1)
+ //
+
+ // Move all instructions following Call to newly created block JoinBB.
+ // Create phi and replace all uses.
+ BasicBlock *JoinBB = llvm::SplitBlock(&CurrBB, Call->getNextNode());
+ IRBuilder<> Builder(JoinBB, JoinBB->begin());
+ Type *Ty = Call->getType();
+ PHINode *Phi = Builder.CreatePHI(Ty, 2);
+ Call->replaceAllUsesWith(Phi);
+
+ // Create basic block LibCallBB and insert a call to library function sqrt.
+ BasicBlock *LibCallBB = BasicBlock::Create(CurrBB.getContext(), "call.sqrt",
+ CurrBB.getParent(), JoinBB);
+ Builder.SetInsertPoint(LibCallBB);
+ Instruction *LibCall = Call->clone();
+ Builder.Insert(LibCall);
+ Builder.CreateBr(JoinBB);
+
+  // Add attribute "readnone" so that the backend can use a native sqrt
+  // instruction for this call. Insert an FP compare instruction and a
+  // conditional branch at the end of CurrBB.
+ Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+ CurrBB.getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(&CurrBB);
+ Value *FCmp = TTI->isFCmpOrdCheaperThanFCmpZero(Ty)
+ ? Builder.CreateFCmpORD(Call, Call)
+ : Builder.CreateFCmpOGE(Call->getOperand(0),
+ ConstantFP::get(Ty, 0.0));
+ Builder.CreateCondBr(FCmp, JoinBB, LibCallBB);
+
+ // Add phi operands.
+ Phi->addIncoming(Call, &CurrBB);
+ Phi->addIncoming(LibCall, LibCallBB);
+
+ BB = JoinBB->getIterator();
+ return true;
+}
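At the source level, the control flow optimizeSQRT() builds looks roughly like the sketch below; native_sqrt and libm_sqrt are placeholders for the readnone call (lowered by the backend to the hardware instruction) and the original errno-setting library call, which have no separate spelling in C++.

#include <cmath>

// Placeholder for the call marked readnone, which the backend lowers to a
// native sqrt instruction.
static double native_sqrt(double x) { return std::sqrt(x); }
// Placeholder for the original library call, which may set errno.
static double libm_sqrt(double x) { return std::sqrt(x); }

double partially_inlined_sqrt(double src) {
  double v0 = native_sqrt(src); // fast path, executed unconditionally
  if (v0 != v0)                 // NaN check (the FCmpORD form of the guard)
    return libm_sqrt(src);      // slow path: the "call.sqrt" block
  return v0;                    // join block: phi(v0, v1)
}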
+
+static bool runPartiallyInlineLibCalls(Function &F, TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI) {
+ bool Changed = false;
+
+ Function::iterator CurrBB;
+ for (Function::iterator BB = F.begin(), BE = F.end(); BB != BE;) {
+ CurrBB = BB++;
+
+ for (BasicBlock::iterator II = CurrBB->begin(), IE = CurrBB->end();
+ II != IE; ++II) {
+ CallInst *Call = dyn_cast<CallInst>(&*II);
+ Function *CalledFunc;
+
+ if (!Call || !(CalledFunc = Call->getCalledFunction()))
+ continue;
+
+ if (Call->isNoBuiltin())
+ continue;
+
+ // Skip if function either has local linkage or is not a known library
+ // function.
+ LibFunc LF;
+ if (CalledFunc->hasLocalLinkage() ||
+ !TLI->getLibFunc(*CalledFunc, LF) || !TLI->has(LF))
+ continue;
+
+ switch (LF) {
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ if (TTI->haveFastSqrt(Call->getType()) &&
+ optimizeSQRT(Call, CalledFunc, *CurrBB, BB, TTI))
+ break;
+ continue;
+ default:
+ continue;
+ }
+
+ Changed = true;
+ break;
+ }
+ }
+
+ return Changed;
+}
+
+PreservedAnalyses
+PartiallyInlineLibCallsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ if (!runPartiallyInlineLibCalls(F, &TLI, &TTI))
+ return PreservedAnalyses::all();
+ return PreservedAnalyses::none();
+}
+
+namespace {
+class PartiallyInlineLibCallsLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ PartiallyInlineLibCallsLegacyPass() : FunctionPass(ID) {
+ initializePartiallyInlineLibCallsLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ const TargetTransformInfo *TTI =
+ &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return runPartiallyInlineLibCalls(F, TLI, TTI);
+ }
+};
+}
+
+char PartiallyInlineLibCallsLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(PartiallyInlineLibCallsLegacyPass,
+ "partially-inline-libcalls",
+ "Partially inline calls to library functions", false, false)
+
+FunctionPass *llvm::createPartiallyInlineLibCallsPass() {
+ return new PartiallyInlineLibCallsLegacyPass();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 5d91a49723..a110f7d5c2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -1,690 +1,690 @@
-//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Place garbage collection safepoints at appropriate locations in the IR. This
-// does not make relocation semantics or variable liveness explicit. That's
-// done by RewriteStatepointsForGC.
-//
-// Terminology:
-// - A call is said to be "parseable" if there is a stack map generated for the
-// return PC of the call. A runtime can determine where values listed in the
-// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
-// on the stack when the code is suspended inside such a call. Every parse
-// point is represented by a call wrapped in an gc.statepoint intrinsic.
-// - A "poll" is an explicit check in the generated code to determine if the
-// runtime needs the generated code to cooperate by calling a helper routine
-// and thus suspending its execution at a known state. The call to the helper
-// routine will be parseable. The (gc & runtime specific) logic of a poll is
-// assumed to be provided in a function of the name "gc.safepoint_poll".
-//
-// We aim to insert polls such that running code can quickly be brought to a
-// well defined state for inspection by the collector. In the current
-// implementation, this is done via the insertion of poll sites at method entry
-// and the backedge of most loops. We try to avoid inserting more polls than
-// are necessary to ensure a finite period between poll sites. This is not
-// because the poll itself is expensive in the generated code; it's not. Polls
-// do tend to impact the optimizer itself in negative ways; we'd like to avoid
-// perturbing the optimization of the method as much as we can.
-//
-// We also need to make most call sites parseable. The callee might execute a
-// poll (or otherwise be inspected by the GC). If so, the entire stack
-// (including the suspended frame of the current method) must be parseable.
-//
-// This pass will insert:
-// - Call parse points ("call safepoints") for any call which may need to
-// reach a safepoint during the execution of the callee function.
-// - Backedge safepoint polls and entry safepoint polls to ensure that
-// executing code reaches a safepoint poll in a finite amount of time.
-//
-// We do not currently support return statepoints, but adding them would not
-// be hard. They are not required for correctness - entry safepoints are an
-// alternative - but some GCs may prefer them. Patches welcome.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-
-#define DEBUG_TYPE "safepoint-placement"
-
-STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
-STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
-
-STATISTIC(CallInLoop,
- "Number of loops without safepoints due to calls in loop");
-STATISTIC(FiniteExecution,
- "Number of loops without safepoints finite execution");
-
-using namespace llvm;
-
-// Ignore opportunities to avoid placing safepoints on backedges, useful for
-// validation
-static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
- cl::init(false));
-
-/// How narrow does the trip count of a loop have to be to have to be considered
-/// "counted"? Counted loops do not get safepoints at backedges.
-static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
- cl::Hidden, cl::init(32));
-
-// If true, split the backedge of a loop when placing the safepoint, otherwise
-// split the latch block itself. Both are useful to support for
-// experimentation, but in practice, it looks like splitting the backedge
-// optimizes better.
-static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
- cl::init(false));
-
-namespace {
-
-/// An analysis pass whose purpose is to identify each of the backedges in
-/// the function which require a safepoint poll to be inserted.
-struct PlaceBackedgeSafepointsImpl : public FunctionPass {
- static char ID;
-
- /// The output of the pass - gives a list of each backedge (described by
- /// pointing at the branch) which need a poll inserted.
- std::vector<Instruction *> PollLocations;
-
- /// True unless we're running spp-no-calls in which case we need to disable
- /// the call-dependent placement opts.
- bool CallSafepointsEnabled;
-
- ScalarEvolution *SE = nullptr;
- DominatorTree *DT = nullptr;
- LoopInfo *LI = nullptr;
- TargetLibraryInfo *TLI = nullptr;
-
- PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
- : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
- initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *);
- void runOnLoopAndSubLoops(Loop *L) {
- // Visit all the subloops
- for (Loop *I : *L)
- runOnLoopAndSubLoops(I);
- runOnLoop(L);
- }
-
- bool runOnFunction(Function &F) override {
- SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- for (Loop *I : *LI) {
- runOnLoopAndSubLoops(I);
- }
- return false;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- // We no longer modify the IR at all in this pass. Thus all
- // analysis are preserved.
- AU.setPreservesAll();
- }
-};
-}
-
-static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
-static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
-static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
-
-namespace {
-struct PlaceSafepoints : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- PlaceSafepoints() : FunctionPass(ID) {
- initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // We modify the graph wholesale (inlining, block insertion, etc). We
- // preserve nothing at the moment. We could potentially preserve dom tree
- // if that was worth doing
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-}
-
-// Insert a safepoint poll immediately before the given instruction. Does
-// not handle the parsability of state at the runtime call, that's the
-// callers job.
-static void
-InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
- const TargetLibraryInfo &TLI);
-
-static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
- if (callsGCLeafFunction(Call, TLI))
- return false;
- if (auto *CI = dyn_cast<CallInst>(Call)) {
- if (CI->isInlineAsm())
- return false;
- }
-
- return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
- isa<GCResultInst>(Call));
-}
-
-/// Returns true if this loop is known to contain a call safepoint which
-/// must unconditionally execute on any iteration of the loop which returns
-/// to the loop header via an edge from Pred. Returns a conservative correct
-/// answer; i.e. false is always valid.
-static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
- BasicBlock *Pred,
- DominatorTree &DT,
- const TargetLibraryInfo &TLI) {
- // In general, we're looking for any cut of the graph which ensures
- // there's a call safepoint along every edge between Header and Pred.
- // For the moment, we look only for the 'cuts' that consist of a single call
- // instruction in a block which is dominated by the Header and dominates the
- // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
- // of such dominating blocks gets substantially more occurrences than just
- // checking the Pred and Header blocks themselves. This may be due to the
- // density of loop exit conditions caused by range and null checks.
- // TODO: structure this as an analysis pass, cache the result for subloops,
- // avoid dom tree recalculations
- assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
-
- BasicBlock *Current = Pred;
- while (true) {
- for (Instruction &I : *Current) {
- if (auto *Call = dyn_cast<CallBase>(&I))
- // Note: Technically, needing a safepoint isn't quite the right
- // condition here. We should instead be checking if the target method
- // has an
- // unconditional poll. In practice, this is only a theoretical concern
- // since we don't have any methods with conditional-only safepoint
- // polls.
- if (needsStatepoint(Call, TLI))
- return true;
- }
-
- if (Current == Header)
- break;
- Current = DT.getNode(Current)->getIDom()->getBlock();
- }
-
- return false;
-}
-
-/// Returns true if this loop is known to terminate in a finite number of
-/// iterations. Note that this function may return false for a loop which
-/// does actual terminate in a finite constant number of iterations due to
-/// conservatism in the analysis.
-static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
- BasicBlock *Pred) {
- // A conservative bound on the loop as a whole.
- const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
+//===- PlaceSafepoints.cpp - Place GC Safepoints --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Place garbage collection safepoints at appropriate locations in the IR. This
+// does not make relocation semantics or variable liveness explicit. That's
+// done by RewriteStatepointsForGC.
+//
+// Terminology:
+// - A call is said to be "parseable" if there is a stack map generated for the
+// return PC of the call. A runtime can determine where values listed in the
+// deopt arguments and (after RewriteStatepointsForGC) gc arguments are located
+// on the stack when the code is suspended inside such a call. Every parse
+// point is represented by a call wrapped in a gc.statepoint intrinsic.
+// - A "poll" is an explicit check in the generated code to determine if the
+// runtime needs the generated code to cooperate by calling a helper routine
+// and thus suspending its execution at a known state. The call to the helper
+// routine will be parseable. The (gc & runtime specific) logic of a poll is
+// assumed to be provided in a function of the name "gc.safepoint_poll".
+//
+// We aim to insert polls such that running code can quickly be brought to a
+// well defined state for inspection by the collector. In the current
+// implementation, this is done via the insertion of poll sites at method entry
+// and the backedge of most loops. We try to avoid inserting more polls than
+// are necessary to ensure a finite period between poll sites. This is not
+// because the poll itself is expensive in the generated code; it's not. Polls
+// do tend to impact the optimizer itself in negative ways; we'd like to avoid
+// perturbing the optimization of the method as much as we can.
+//
+// We also need to make most call sites parseable. The callee might execute a
+// poll (or otherwise be inspected by the GC). If so, the entire stack
+// (including the suspended frame of the current method) must be parseable.
+//
+// This pass will insert:
+// - Call parse points ("call safepoints") for any call which may need to
+// reach a safepoint during the execution of the callee function.
+// - Backedge safepoint polls and entry safepoint polls to ensure that
+// executing code reaches a safepoint poll in a finite amount of time.
+//
+// We do not currently support return statepoints, but adding them would not
+// be hard. They are not required for correctness - entry safepoints are an
+// alternative - but some GCs may prefer them. Patches welcome.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+
+#define DEBUG_TYPE "safepoint-placement"
+
+STATISTIC(NumEntrySafepoints, "Number of entry safepoints inserted");
+STATISTIC(NumBackedgeSafepoints, "Number of backedge safepoints inserted");
+
+STATISTIC(CallInLoop,
+ "Number of loops without safepoints due to calls in loop");
+STATISTIC(FiniteExecution,
+ "Number of loops without safepoints finite execution");
+
+using namespace llvm;
+
+// Ignore opportunities to avoid placing safepoints on backedges, useful for
+// validation
+static cl::opt<bool> AllBackedges("spp-all-backedges", cl::Hidden,
+ cl::init(false));
+
+/// How narrow does the trip count of a loop have to be for the loop to be
+/// considered "counted"? Counted loops do not get safepoints at backedges.
+static cl::opt<int> CountedLoopTripWidth("spp-counted-loop-trip-width",
+ cl::Hidden, cl::init(32));
+
+// If true, split the backedge of a loop when placing the safepoint, otherwise
+// split the latch block itself. Both are useful to support for
+// experimentation, but in practice, it looks like splitting the backedge
+// optimizes better.
+static cl::opt<bool> SplitBackedge("spp-split-backedge", cl::Hidden,
+ cl::init(false));
+
+namespace {
+
+/// An analysis pass whose purpose is to identify each of the backedges in
+/// the function which require a safepoint poll to be inserted.
+struct PlaceBackedgeSafepointsImpl : public FunctionPass {
+ static char ID;
+
+ /// The output of the pass - gives a list of each backedge (described by
+ /// pointing at the branch) which need a poll inserted.
+ std::vector<Instruction *> PollLocations;
+
+  /// True unless we're running spp-no-call, in which case we need to disable
+ /// the call-dependent placement opts.
+ bool CallSafepointsEnabled;
+
+ ScalarEvolution *SE = nullptr;
+ DominatorTree *DT = nullptr;
+ LoopInfo *LI = nullptr;
+ TargetLibraryInfo *TLI = nullptr;
+
+ PlaceBackedgeSafepointsImpl(bool CallSafepoints = false)
+ : FunctionPass(ID), CallSafepointsEnabled(CallSafepoints) {
+ initializePlaceBackedgeSafepointsImplPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *);
+ void runOnLoopAndSubLoops(Loop *L) {
+ // Visit all the subloops
+ for (Loop *I : *L)
+ runOnLoopAndSubLoops(I);
+ runOnLoop(L);
+ }
+
+ bool runOnFunction(Function &F) override {
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ for (Loop *I : *LI) {
+ runOnLoopAndSubLoops(I);
+ }
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ // We no longer modify the IR at all in this pass. Thus all
+    // analyses are preserved.
+ AU.setPreservesAll();
+ }
+};
+}
+
+static cl::opt<bool> NoEntry("spp-no-entry", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoCall("spp-no-call", cl::Hidden, cl::init(false));
+static cl::opt<bool> NoBackedge("spp-no-backedge", cl::Hidden, cl::init(false));
+
+namespace {
+struct PlaceSafepoints : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ PlaceSafepoints() : FunctionPass(ID) {
+ initializePlaceSafepointsPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We modify the graph wholesale (inlining, block insertion, etc). We
+ // preserve nothing at the moment. We could potentially preserve dom tree
+ // if that was worth doing
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+}
+
+// Insert a safepoint poll immediately before the given instruction. Does
+// not handle the parsability of state at the runtime call; that's the
+// caller's job.
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI);
+
+static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
+ if (callsGCLeafFunction(Call, TLI))
+ return false;
+ if (auto *CI = dyn_cast<CallInst>(Call)) {
+ if (CI->isInlineAsm())
+ return false;
+ }
+
+ return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call));
+}
+
+/// Returns true if this loop is known to contain a call safepoint which
+/// must unconditionally execute on any iteration of the loop which returns
+/// to the loop header via an edge from Pred. Returns a conservatively correct
+/// answer; i.e. false is always valid.
+static bool containsUnconditionalCallSafepoint(Loop *L, BasicBlock *Header,
+ BasicBlock *Pred,
+ DominatorTree &DT,
+ const TargetLibraryInfo &TLI) {
+ // In general, we're looking for any cut of the graph which ensures
+ // there's a call safepoint along every edge between Header and Pred.
+ // For the moment, we look only for the 'cuts' that consist of a single call
+ // instruction in a block which is dominated by the Header and dominates the
+ // loop latch (Pred) block. Somewhat surprisingly, walking the entire chain
+ // of such dominating blocks gets substantially more occurrences than just
+ // checking the Pred and Header blocks themselves. This may be due to the
+ // density of loop exit conditions caused by range and null checks.
+ // TODO: structure this as an analysis pass, cache the result for subloops,
+ // avoid dom tree recalculations
+ assert(DT.dominates(Header, Pred) && "loop latch not dominated by header?");
+
+ BasicBlock *Current = Pred;
+ while (true) {
+ for (Instruction &I : *Current) {
+ if (auto *Call = dyn_cast<CallBase>(&I))
+        // Note: Technically, needing a safepoint isn't quite the right
+        // condition here. We should instead be checking if the target method
+        // has an unconditional poll. In practice, this is only a theoretical
+        // concern since we don't have any methods with conditional-only
+        // safepoint polls.
+ if (needsStatepoint(Call, TLI))
+ return true;
+ }
+
+ if (Current == Header)
+ break;
+ Current = DT.getNode(Current)->getIDom()->getBlock();
+ }
+
+ return false;
+}
+
+/// Returns true if this loop is known to terminate in a finite number of
+/// iterations. Note that this function may return false for a loop which
+/// does actually terminate in a finite constant number of iterations due to
+/// conservatism in the analysis.
+static bool mustBeFiniteCountedLoop(Loop *L, ScalarEvolution *SE,
+ BasicBlock *Pred) {
+ // A conservative bound on the loop as a whole.
+ const SCEV *MaxTrips = SE->getConstantMaxBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(MaxTrips) &&
- SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
- CountedLoopTripWidth))
- return true;
-
- // If this is a conditional branch to the header with the alternate path
- // being outside the loop, we can ask questions about the execution frequency
- // of the exit block.
- if (L->isLoopExiting(Pred)) {
- // This returns an exact expression only. TODO: We really only need an
- // upper bound here, but SE doesn't expose that.
- const SCEV *MaxExec = SE->getExitCount(L, Pred);
+ SE->getUnsignedRange(MaxTrips).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+
+ // If this is a conditional branch to the header with the alternate path
+ // being outside the loop, we can ask questions about the execution frequency
+ // of the exit block.
+ if (L->isLoopExiting(Pred)) {
+ // This returns an exact expression only. TODO: We really only need an
+ // upper bound here, but SE doesn't expose that.
+ const SCEV *MaxExec = SE->getExitCount(L, Pred);
if (!isa<SCEVCouldNotCompute>(MaxExec) &&
- SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
- CountedLoopTripWidth))
- return true;
- }
-
- return /* not finite */ false;
-}
-
-static void scanOneBB(Instruction *Start, Instruction *End,
- std::vector<CallInst *> &Calls,
- DenseSet<BasicBlock *> &Seen,
- std::vector<BasicBlock *> &Worklist) {
- for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
- BBE1 = BasicBlock::iterator(End);
- BBI != BBE0 && BBI != BBE1; BBI++) {
- if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
- Calls.push_back(CI);
-
- // FIXME: This code does not handle invokes
- assert(!isa<InvokeInst>(&*BBI) &&
- "support for invokes in poll code needed");
-
- // Only add the successor blocks if we reach the terminator instruction
- // without encountering end first
- if (BBI->isTerminator()) {
- BasicBlock *BB = BBI->getParent();
- for (BasicBlock *Succ : successors(BB)) {
- if (Seen.insert(Succ).second) {
- Worklist.push_back(Succ);
- }
- }
- }
- }
-}
-
-static void scanInlinedCode(Instruction *Start, Instruction *End,
- std::vector<CallInst *> &Calls,
- DenseSet<BasicBlock *> &Seen) {
- Calls.clear();
- std::vector<BasicBlock *> Worklist;
- Seen.insert(Start->getParent());
- scanOneBB(Start, End, Calls, Seen, Worklist);
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.back();
- Worklist.pop_back();
- scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
- }
-}
-
-bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
- // Loop through all loop latches (branches controlling backedges). We need
- // to place a safepoint on every backedge (potentially).
- // Note: In common usage, there will be only one edge due to LoopSimplify
- // having run sometime earlier in the pipeline, but this code must be correct
- // w.r.t. loops with multiple backedges.
- BasicBlock *Header = L->getHeader();
- SmallVector<BasicBlock*, 16> LoopLatches;
- L->getLoopLatches(LoopLatches);
- for (BasicBlock *Pred : LoopLatches) {
- assert(L->contains(Pred));
-
- // Make a policy decision about whether this loop needs a safepoint or
- // not. Note that this is about unburdening the optimizer in loops, not
- // avoiding the runtime cost of the actual safepoint.
- if (!AllBackedges) {
- if (mustBeFiniteCountedLoop(L, SE, Pred)) {
- LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
- FiniteExecution++;
- continue;
- }
- if (CallSafepointsEnabled &&
- containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
- // Note: This is only semantically legal since we won't do any further
- // IPO or inlining before the actual call insertion.. If we hadn't, we
- // might latter loose this call safepoint.
- LLVM_DEBUG(
- dbgs()
- << "skipping safepoint placement due to unconditional call\n");
- CallInLoop++;
- continue;
- }
- }
-
- // TODO: We can create an inner loop which runs a finite number of
- // iterations with an outer loop which contains a safepoint. This would
- // not help runtime performance that much, but it might help our ability to
- // optimize the inner loop.
-
- // Safepoint insertion would involve creating a new basic block (as the
- // target of the current backedge) which does the safepoint (of all live
- // variables) and branches to the true header
- Instruction *Term = Pred->getTerminator();
-
- LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
-
- PollLocations.push_back(Term);
- }
-
- return false;
-}
-
-/// Returns true if an entry safepoint is not required before this callsite in
-/// the caller function.
-static bool doesNotRequireEntrySafepointBefore(CallBase *Call) {
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call)) {
- switch (II->getIntrinsicID()) {
- case Intrinsic::experimental_gc_statepoint:
- case Intrinsic::experimental_patchpoint_void:
- case Intrinsic::experimental_patchpoint_i64:
- // The can wrap an actual call which may grow the stack by an unbounded
- // amount or run forever.
- return false;
- default:
- // Most LLVM intrinsics are things which do not expand to actual calls, or
- // at least if they do, are leaf functions that cause only finite stack
- // growth. In particular, the optimizer likes to form things like memsets
- // out of stores in the original IR. Another important example is
- // llvm.localescape which must occur in the entry block. Inserting a
- // safepoint before it is not legal since it could push the localescape
- // out of the entry block.
- return true;
- }
- }
- return false;
-}
-
-static Instruction *findLocationForEntrySafepoint(Function &F,
- DominatorTree &DT) {
-
- // Conceptually, this poll needs to be on method entry, but in
- // practice, we place it as late in the entry block as possible. We
- // can place it as late as we want as long as it dominates all calls
- // that can grow the stack. This, combined with backedge polls,
- // give us all the progress guarantees we need.
-
- // hasNextInstruction and nextInstruction are used to iterate
- // through a "straight line" execution sequence.
-
- auto HasNextInstruction = [](Instruction *I) {
- if (!I->isTerminator())
- return true;
-
- BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
- return nextBB && (nextBB->getUniquePredecessor() != nullptr);
- };
-
- auto NextInstruction = [&](Instruction *I) {
- assert(HasNextInstruction(I) &&
- "first check if there is a next instruction!");
-
- if (I->isTerminator())
- return &I->getParent()->getUniqueSuccessor()->front();
- return &*++I->getIterator();
- };
-
- Instruction *Cursor = nullptr;
- for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
- Cursor = NextInstruction(Cursor)) {
-
- // We need to ensure a safepoint poll occurs before any 'real' call. The
- // easiest way to ensure finite execution between safepoints in the face of
- // recursive and mutually recursive functions is to enforce that each take
- // a safepoint. Additionally, we need to ensure a poll before any call
- // which can grow the stack by an unbounded amount. This isn't required
- // for GC semantics per se, but is a common requirement for languages
- // which detect stack overflow via guard pages and then throw exceptions.
- if (auto *Call = dyn_cast<CallBase>(Cursor)) {
- if (doesNotRequireEntrySafepointBefore(Call))
- continue;
- break;
- }
- }
-
- assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
- "either we stopped because of a call, or because of terminator");
-
- return Cursor;
-}
-
+ SE->getUnsignedRange(MaxExec).getUnsignedMax().isIntN(
+ CountedLoopTripWidth))
+ return true;
+ }
+
+ return /* not finite */ false;
+}
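Two illustrative source-level loops, assuming the default spp-counted-loop-trip-width of 32: the first is "counted" and needs no backedge poll, while the second cannot be bounded and does.

// Counted: ScalarEvolution can bound the backedge-taken count (1023) well
// within 32 bits, so no backedge safepoint poll is required for this loop.
void counted(int *a) {
  for (int i = 0; i < 1024; ++i)
    a[i] = 0;
}

// Not provably finite: the bound is unknown, so a backedge poll is needed to
// keep the time between safepoints finite.
void uncounted(int *a, volatile bool *stop) {
  unsigned i = 0;
  while (!*stop)
    a[i++ % 16] = 0;
}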
+
+static void scanOneBB(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen,
+ std::vector<BasicBlock *> &Worklist) {
+ for (BasicBlock::iterator BBI(Start), BBE0 = Start->getParent()->end(),
+ BBE1 = BasicBlock::iterator(End);
+ BBI != BBE0 && BBI != BBE1; BBI++) {
+ if (CallInst *CI = dyn_cast<CallInst>(&*BBI))
+ Calls.push_back(CI);
+
+ // FIXME: This code does not handle invokes
+ assert(!isa<InvokeInst>(&*BBI) &&
+ "support for invokes in poll code needed");
+
+ // Only add the successor blocks if we reach the terminator instruction
+ // without encountering end first
+ if (BBI->isTerminator()) {
+ BasicBlock *BB = BBI->getParent();
+ for (BasicBlock *Succ : successors(BB)) {
+ if (Seen.insert(Succ).second) {
+ Worklist.push_back(Succ);
+ }
+ }
+ }
+ }
+}
+
+static void scanInlinedCode(Instruction *Start, Instruction *End,
+ std::vector<CallInst *> &Calls,
+ DenseSet<BasicBlock *> &Seen) {
+ Calls.clear();
+ std::vector<BasicBlock *> Worklist;
+ Seen.insert(Start->getParent());
+ scanOneBB(Start, End, Calls, Seen, Worklist);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.back();
+ Worklist.pop_back();
+ scanOneBB(&*BB->begin(), End, Calls, Seen, Worklist);
+ }
+}
+
+bool PlaceBackedgeSafepointsImpl::runOnLoop(Loop *L) {
+ // Loop through all loop latches (branches controlling backedges). We need
+ // to place a safepoint on every backedge (potentially).
+ // Note: In common usage, there will be only one edge due to LoopSimplify
+ // having run sometime earlier in the pipeline, but this code must be correct
+ // w.r.t. loops with multiple backedges.
+ BasicBlock *Header = L->getHeader();
+ SmallVector<BasicBlock*, 16> LoopLatches;
+ L->getLoopLatches(LoopLatches);
+ for (BasicBlock *Pred : LoopLatches) {
+ assert(L->contains(Pred));
+
+ // Make a policy decision about whether this loop needs a safepoint or
+ // not. Note that this is about unburdening the optimizer in loops, not
+ // avoiding the runtime cost of the actual safepoint.
+ if (!AllBackedges) {
+ if (mustBeFiniteCountedLoop(L, SE, Pred)) {
+ LLVM_DEBUG(dbgs() << "skipping safepoint placement in finite loop\n");
+ FiniteExecution++;
+ continue;
+ }
+ if (CallSafepointsEnabled &&
+ containsUnconditionalCallSafepoint(L, Header, Pred, *DT, *TLI)) {
+        // Note: This is only semantically legal since we won't do any further
+        // IPO or inlining before the actual call insertion. If we did, we
+        // might later lose this call safepoint.
+ LLVM_DEBUG(
+ dbgs()
+ << "skipping safepoint placement due to unconditional call\n");
+ CallInLoop++;
+ continue;
+ }
+ }
+
+ // TODO: We can create an inner loop which runs a finite number of
+ // iterations with an outer loop which contains a safepoint. This would
+ // not help runtime performance that much, but it might help our ability to
+ // optimize the inner loop.
+
+ // Safepoint insertion would involve creating a new basic block (as the
+ // target of the current backedge) which does the safepoint (of all live
+ // variables) and branches to the true header
+ Instruction *Term = Pred->getTerminator();
+
+ LLVM_DEBUG(dbgs() << "[LSP] terminator instruction: " << *Term);
+
+ PollLocations.push_back(Term);
+ }
+
+ return false;
+}
+
+/// Returns true if an entry safepoint is not required before this callsite in
+/// the caller function.
+static bool doesNotRequireEntrySafepointBefore(CallBase *Call) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Call)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_gc_statepoint:
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+      // These can wrap an actual call which may grow the stack by an unbounded
+ // amount or run forever.
+ return false;
+ default:
+ // Most LLVM intrinsics are things which do not expand to actual calls, or
+ // at least if they do, are leaf functions that cause only finite stack
+ // growth. In particular, the optimizer likes to form things like memsets
+ // out of stores in the original IR. Another important example is
+ // llvm.localescape which must occur in the entry block. Inserting a
+ // safepoint before it is not legal since it could push the localescape
+ // out of the entry block.
+ return true;
+ }
+ }
+ return false;
+}
+
+static Instruction *findLocationForEntrySafepoint(Function &F,
+ DominatorTree &DT) {
+
+ // Conceptually, this poll needs to be on method entry, but in
+ // practice, we place it as late in the entry block as possible. We
+ // can place it as late as we want as long as it dominates all calls
+ // that can grow the stack. This, combined with backedge polls,
+  // gives us all the progress guarantees we need.
+
+  // HasNextInstruction and NextInstruction are used to iterate
+ // through a "straight line" execution sequence.
+
+ auto HasNextInstruction = [](Instruction *I) {
+ if (!I->isTerminator())
+ return true;
+
+ BasicBlock *nextBB = I->getParent()->getUniqueSuccessor();
+ return nextBB && (nextBB->getUniquePredecessor() != nullptr);
+ };
+
+ auto NextInstruction = [&](Instruction *I) {
+ assert(HasNextInstruction(I) &&
+ "first check if there is a next instruction!");
+
+ if (I->isTerminator())
+ return &I->getParent()->getUniqueSuccessor()->front();
+ return &*++I->getIterator();
+ };
+
+ Instruction *Cursor = nullptr;
+ for (Cursor = &F.getEntryBlock().front(); HasNextInstruction(Cursor);
+ Cursor = NextInstruction(Cursor)) {
+
+ // We need to ensure a safepoint poll occurs before any 'real' call. The
+ // easiest way to ensure finite execution between safepoints in the face of
+ // recursive and mutually recursive functions is to enforce that each take
+ // a safepoint. Additionally, we need to ensure a poll before any call
+ // which can grow the stack by an unbounded amount. This isn't required
+ // for GC semantics per se, but is a common requirement for languages
+ // which detect stack overflow via guard pages and then throw exceptions.
+ if (auto *Call = dyn_cast<CallBase>(Cursor)) {
+ if (doesNotRequireEntrySafepointBefore(Call))
+ continue;
+ break;
+ }
+ }
+
+ assert((HasNextInstruction(Cursor) || Cursor->isTerminator()) &&
+ "either we stopped because of a call, or because of terminator");
+
+ return Cursor;
+}
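A hedged source-level picture of where findLocationForEntrySafepoint() ends up; may_grow_stack is a hypothetical call that stops the straight-line walk.

// Hypothetical "real" call: imagine it can recurse or allocate arbitrarily.
void may_grow_stack() {}

int entry_poll_example(int x) {
  int a = x * 2;  // plain straight-line code: the cursor walks past it
  int b = a + 7;
  // <- the entry safepoint poll goes here: as late as possible in the
  //    straight-line entry sequence, yet still before the first call that
  //    could grow the stack.
  may_grow_stack();
  return a + b;
}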
+
const char GCSafepointPollName[] = "gc.safepoint_poll";
-
-static bool isGCSafepointPoll(Function &F) {
- return F.getName().equals(GCSafepointPollName);
-}
-
-/// Returns true if this function should be rewritten to include safepoint
-/// polls and parseable call sites. The main point of this function is to be
-/// an extension point for custom logic.
-static bool shouldRewriteFunction(Function &F) {
- // TODO: This should check the GCStrategy
- if (F.hasGC()) {
- const auto &FunctionGCName = F.getGC();
- const StringRef StatepointExampleName("statepoint-example");
- const StringRef CoreCLRName("coreclr");
- return (StatepointExampleName == FunctionGCName) ||
- (CoreCLRName == FunctionGCName);
- } else
- return false;
-}
-
-// TODO: These should become properties of the GCStrategy, possibly with
-// command line overrides.
-static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
-static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
-static bool enableCallSafepoints(Function &F) { return !NoCall; }
-
-bool PlaceSafepoints::runOnFunction(Function &F) {
- if (F.isDeclaration() || F.empty()) {
- // This is a declaration, nothing to do. Must exit early to avoid crash in
- // dom tree calculation
- return false;
- }
-
- if (isGCSafepointPoll(F)) {
- // Given we're inlining this inside of safepoint poll insertion, this
- // doesn't make any sense. Note that we do make any contained calls
- // parseable after we inline a poll.
- return false;
- }
-
- if (!shouldRewriteFunction(F))
- return false;
-
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
-
- bool Modified = false;
-
- // In various bits below, we rely on the fact that uses are reachable from
- // defs. When there are basic blocks unreachable from the entry, dominance
- // and reachablity queries return non-sensical results. Thus, we preprocess
- // the function to ensure these properties hold.
- Modified |= removeUnreachableBlocks(F);
-
- // STEP 1 - Insert the safepoint polling locations. We do not need to
- // actually insert parse points yet. That will be done for all polls and
- // calls in a single pass.
-
- DominatorTree DT;
- DT.recalculate(F);
-
- SmallVector<Instruction *, 16> PollsNeeded;
- std::vector<CallBase *> ParsePointNeeded;
-
- if (enableBackedgeSafepoints(F)) {
- // Construct a pass manager to run the LoopPass backedge logic. We
- // need the pass manager to handle scheduling all the loop passes
- // appropriately. Doing this by hand is painful and just not worth messing
- // with for the moment.
- legacy::FunctionPassManager FPM(F.getParent());
- bool CanAssumeCallSafepoints = enableCallSafepoints(F);
- auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
- FPM.add(PBS);
- FPM.run(F);
-
- // We preserve dominance information when inserting the poll, otherwise
- // we'd have to recalculate this on every insert
- DT.recalculate(F);
-
- auto &PollLocations = PBS->PollLocations;
-
- auto OrderByBBName = [](Instruction *a, Instruction *b) {
- return a->getParent()->getName() < b->getParent()->getName();
- };
- // We need the order of list to be stable so that naming ends up stable
- // when we split edges. This makes test cases much easier to write.
- llvm::sort(PollLocations, OrderByBBName);
-
- // We can sometimes end up with duplicate poll locations. This happens if
- // a single loop is visited more than once. The fact this happens seems
- // wrong, but it does happen for the split-backedge.ll test case.
- PollLocations.erase(std::unique(PollLocations.begin(),
- PollLocations.end()),
- PollLocations.end());
-
- // Insert a poll at each point the analysis pass identified
- // The poll location must be the terminator of a loop latch block.
- for (Instruction *Term : PollLocations) {
- // We are inserting a poll, the function is modified
- Modified = true;
-
- if (SplitBackedge) {
- // Split the backedge of the loop and insert the poll within that new
- // basic block. This creates a loop with two latches per original
- // latch (which is non-ideal), but this appears to be easier to
- // optimize in practice than inserting the poll immediately before the
- // latch test.
-
- // Since this is a latch, at least one of the successors must dominate
- // it. It's possible that we have a) duplicate edges to the same header
- // and b) edges to distinct loop headers. We need to insert polls on
- // each.
- SetVector<BasicBlock *> Headers;
- for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
- BasicBlock *Succ = Term->getSuccessor(i);
- if (DT.dominates(Succ, Term->getParent())) {
- Headers.insert(Succ);
- }
- }
- assert(!Headers.empty() && "poll location is not a loop latch?");
-
- // The split loop structure here is so that we only need to recalculate
- // the dominator tree once. Alternatively, we could just keep it up to
- // date and use a more natural merged loop.
- SetVector<BasicBlock *> SplitBackedges;
- for (BasicBlock *Header : Headers) {
- BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
- PollsNeeded.push_back(NewBB->getTerminator());
- NumBackedgeSafepoints++;
- }
- } else {
- // Split the latch block itself, right before the terminator.
- PollsNeeded.push_back(Term);
- NumBackedgeSafepoints++;
- }
- }
- }
-
- if (enableEntrySafepoints(F)) {
- if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
- PollsNeeded.push_back(Location);
- Modified = true;
- NumEntrySafepoints++;
- }
- // TODO: else we should assert that there was, in fact, a policy choice to
- // not insert an entry safepoint poll.
- }
-
- // Now that we've identified all the needed safepoint poll locations, insert
- // safepoint polls themselves.
- for (Instruction *PollLocation : PollsNeeded) {
- std::vector<CallBase *> RuntimeCalls;
- InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
+
+static bool isGCSafepointPoll(Function &F) {
+ return F.getName().equals(GCSafepointPollName);
+}
+
+/// Returns true if this function should be rewritten to include safepoint
+/// polls and parseable call sites. The main point of this function is to be
+/// an extension point for custom logic.
+static bool shouldRewriteFunction(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
+
+// TODO: These should become properties of the GCStrategy, possibly with
+// command line overrides.
+static bool enableEntrySafepoints(Function &F) { return !NoEntry; }
+static bool enableBackedgeSafepoints(Function &F) { return !NoBackedge; }
+static bool enableCallSafepoints(Function &F) { return !NoCall; }
+
+bool PlaceSafepoints::runOnFunction(Function &F) {
+ if (F.isDeclaration() || F.empty()) {
+ // This is a declaration, nothing to do. Must exit early to avoid crash in
+ // dom tree calculation
+ return false;
+ }
+
+ if (isGCSafepointPoll(F)) {
+ // Given we're inlining this inside of safepoint poll insertion, this
+ // doesn't make any sense. Note that we do make any contained calls
+ // parseable after we inline a poll.
+ return false;
+ }
+
+ if (!shouldRewriteFunction(F))
+ return false;
+
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+ bool Modified = false;
+
+ // In various bits below, we rely on the fact that uses are reachable from
+ // defs. When there are basic blocks unreachable from the entry, dominance
+ // and reachability queries return nonsensical results. Thus, we preprocess
+ // the function to ensure these properties hold.
+ Modified |= removeUnreachableBlocks(F);
+
+ // STEP 1 - Insert the safepoint polling locations. We do not need to
+ // actually insert parse points yet. That will be done for all polls and
+ // calls in a single pass.
+
+ DominatorTree DT;
+ DT.recalculate(F);
+
+ SmallVector<Instruction *, 16> PollsNeeded;
+ std::vector<CallBase *> ParsePointNeeded;
+
+ if (enableBackedgeSafepoints(F)) {
+ // Construct a pass manager to run the LoopPass backedge logic. We
+ // need the pass manager to handle scheduling all the loop passes
+ // appropriately. Doing this by hand is painful and just not worth messing
+ // with for the moment.
+ legacy::FunctionPassManager FPM(F.getParent());
+ bool CanAssumeCallSafepoints = enableCallSafepoints(F);
+ auto *PBS = new PlaceBackedgeSafepointsImpl(CanAssumeCallSafepoints);
+ FPM.add(PBS);
+ FPM.run(F);
+
+ // We preserve dominance information when inserting the poll, otherwise
+ // we'd have to recalculate this on every insert
+ DT.recalculate(F);
+
+ auto &PollLocations = PBS->PollLocations;
+
+ auto OrderByBBName = [](Instruction *a, Instruction *b) {
+ return a->getParent()->getName() < b->getParent()->getName();
+ };
+ // We need the order of the list to be stable so that naming ends up stable
+ // when we split edges. This makes test cases much easier to write.
+ llvm::sort(PollLocations, OrderByBBName);
+
+ // We can sometimes end up with duplicate poll locations. This happens if
+ // a single loop is visited more than once. The fact this happens seems
+ // wrong, but it does happen for the split-backedge.ll test case.
+ PollLocations.erase(std::unique(PollLocations.begin(),
+ PollLocations.end()),
+ PollLocations.end());
+
+ // Insert a poll at each point the analysis pass identified
+ // The poll location must be the terminator of a loop latch block.
+ for (Instruction *Term : PollLocations) {
+ // We are inserting a poll, the function is modified
+ Modified = true;
+
+ if (SplitBackedge) {
+ // Split the backedge of the loop and insert the poll within that new
+ // basic block. This creates a loop with two latches per original
+ // latch (which is non-ideal), but this appears to be easier to
+ // optimize in practice than inserting the poll immediately before the
+ // latch test.
+
+ // Since this is a latch, at least one of the successors must dominate
+ // it. It's possible that we have a) duplicate edges to the same header
+ // and b) edges to distinct loop headers. We need to insert polls on
+ // each.
+ SetVector<BasicBlock *> Headers;
+ for (unsigned i = 0; i < Term->getNumSuccessors(); i++) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (DT.dominates(Succ, Term->getParent())) {
+ Headers.insert(Succ);
+ }
+ }
+ assert(!Headers.empty() && "poll location is not a loop latch?");
+
+ // The split loop structure here is so that we only need to recalculate
+ // the dominator tree once. Alternatively, we could just keep it up to
+ // date and use a more natural merged loop.
+ SetVector<BasicBlock *> SplitBackedges;
+ for (BasicBlock *Header : Headers) {
+ BasicBlock *NewBB = SplitEdge(Term->getParent(), Header, &DT);
+ PollsNeeded.push_back(NewBB->getTerminator());
+ NumBackedgeSafepoints++;
+ }
+ } else {
+ // Split the latch block itself, right before the terminator.
+ PollsNeeded.push_back(Term);
+ NumBackedgeSafepoints++;
+ }
+ }
+ }
+
+ if (enableEntrySafepoints(F)) {
+ if (Instruction *Location = findLocationForEntrySafepoint(F, DT)) {
+ PollsNeeded.push_back(Location);
+ Modified = true;
+ NumEntrySafepoints++;
+ }
+ // TODO: else we should assert that there was, in fact, a policy choice to
+ // not insert an entry safepoint poll.
+ }
+
+ // Now that we've identified all the needed safepoint poll locations, insert
+ // safepoint polls themselves.
+ for (Instruction *PollLocation : PollsNeeded) {
+ std::vector<CallBase *> RuntimeCalls;
+ InsertSafepointPoll(PollLocation, RuntimeCalls, TLI);
llvm::append_range(ParsePointNeeded, RuntimeCalls);
- }
-
- return Modified;
-}
-
-char PlaceBackedgeSafepointsImpl::ID = 0;
-char PlaceSafepoints::ID = 0;
-
-FunctionPass *llvm::createPlaceSafepointsPass() {
- return new PlaceSafepoints();
-}
-
-INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
- "place-backedge-safepoints-impl",
- "Place Backedge Safepoints", false, false)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
- "place-backedge-safepoints-impl",
- "Place Backedge Safepoints", false, false)
-
-INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
- false, false)
-INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
- false, false)
-
-static void
-InsertSafepointPoll(Instruction *InsertBefore,
- std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
- const TargetLibraryInfo &TLI) {
- BasicBlock *OrigBB = InsertBefore->getParent();
- Module *M = InsertBefore->getModule();
- assert(M && "must be part of a module");
-
- // Inline the safepoint poll implementation - this will get all the branch,
- // control flow, etc.. Most importantly, it will introduce the actual slow
- // path call - where we need to insert a safepoint (parsepoint).
-
- auto *F = M->getFunction(GCSafepointPollName);
- assert(F && "gc.safepoint_poll function is missing");
- assert(F->getValueType() ==
- FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
- "gc.safepoint_poll declared with wrong type");
- assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
- CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
-
- // Record some information about the call site we're replacing
- BasicBlock::iterator Before(PollCall), After(PollCall);
- bool IsBegin = false;
- if (Before == OrigBB->begin())
- IsBegin = true;
- else
- Before--;
-
- After++;
- assert(After != OrigBB->end() && "must have successor");
-
- // Do the actual inlining
- InlineFunctionInfo IFI;
- bool InlineStatus = InlineFunction(*PollCall, IFI).isSuccess();
- assert(InlineStatus && "inline must succeed");
- (void)InlineStatus; // suppress warning in release-asserts
-
- // Check post-conditions
- assert(IFI.StaticAllocas.empty() && "can't have allocs");
-
- std::vector<CallInst *> Calls; // new calls
- DenseSet<BasicBlock *> BBs; // new BBs + insertee
-
- // Include only the newly inserted instructions. Note: begin may not be valid
- // if we inserted at the beginning of the basic block
- BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
-
- // If your poll function includes an unreachable at the end, that's not
- // valid. Bugpoint likes to create this, so check for it.
- assert(isPotentiallyReachable(&*Start, &*After) &&
- "malformed poll function");
-
- scanInlinedCode(&*Start, &*After, Calls, BBs);
- assert(!Calls.empty() && "slow path not found for safepoint poll");
-
- // Record the fact we need a parsable state at the runtime call contained in
- // the poll function. This is required so that the runtime knows how to
- // parse the last frame when we actually take the safepoint (i.e. execute
- // the slow path)
- assert(ParsePointsNeeded.empty());
- for (auto *CI : Calls) {
- // No safepoint needed or wanted
- if (!needsStatepoint(CI, TLI))
- continue;
-
- // These are likely runtime calls. Should we assert that via calling
- // convention or something?
- ParsePointsNeeded.push_back(CI);
- }
- assert(ParsePointsNeeded.size() <= Calls.size());
-}
+ }
+
+ return Modified;
+}
+
+char PlaceBackedgeSafepointsImpl::ID = 0;
+char PlaceSafepoints::ID = 0;
+
+FunctionPass *llvm::createPlaceSafepointsPass() {
+ return new PlaceSafepoints();
+}
+
+INITIALIZE_PASS_BEGIN(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(PlaceBackedgeSafepointsImpl,
+ "place-backedge-safepoints-impl",
+ "Place Backedge Safepoints", false, false)
+
+INITIALIZE_PASS_BEGIN(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+INITIALIZE_PASS_END(PlaceSafepoints, "place-safepoints", "Place Safepoints",
+ false, false)
+
+static void
+InsertSafepointPoll(Instruction *InsertBefore,
+ std::vector<CallBase *> &ParsePointsNeeded /*rval*/,
+ const TargetLibraryInfo &TLI) {
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Module *M = InsertBefore->getModule();
+ assert(M && "must be part of a module");
+
+ // Inline the safepoint poll implementation - this will get all the branch,
+ // control flow, etc.. Most importantly, it will introduce the actual slow
+ // path call - where we need to insert a safepoint (parsepoint).
+
+ auto *F = M->getFunction(GCSafepointPollName);
+ assert(F && "gc.safepoint_poll function is missing");
+ assert(F->getValueType() ==
+ FunctionType::get(Type::getVoidTy(M->getContext()), false) &&
+ "gc.safepoint_poll declared with wrong type");
+ assert(!F->empty() && "gc.safepoint_poll must be a non-empty function");
+ CallInst *PollCall = CallInst::Create(F, "", InsertBefore);
+
+ // Record some information about the call site we're replacing
+ BasicBlock::iterator Before(PollCall), After(PollCall);
+ bool IsBegin = false;
+ if (Before == OrigBB->begin())
+ IsBegin = true;
+ else
+ Before--;
+
+ After++;
+ assert(After != OrigBB->end() && "must have successor");
+
+ // Do the actual inlining
+ InlineFunctionInfo IFI;
+ bool InlineStatus = InlineFunction(*PollCall, IFI).isSuccess();
+ assert(InlineStatus && "inline must succeed");
+ (void)InlineStatus; // suppress warning in release-asserts
+
+ // Check post-conditions
+ assert(IFI.StaticAllocas.empty() && "can't have allocs");
+
+ std::vector<CallInst *> Calls; // new calls
+ DenseSet<BasicBlock *> BBs; // new BBs + insertee
+
+ // Include only the newly inserted instructions. Note: begin may not be valid
+ // if we inserted at the beginning of the basic block
+ BasicBlock::iterator Start = IsBegin ? OrigBB->begin() : std::next(Before);
+
+ // If your poll function includes an unreachable at the end, that's not
+ // valid. Bugpoint likes to create this, so check for it.
+ assert(isPotentiallyReachable(&*Start, &*After) &&
+ "malformed poll function");
+
+ scanInlinedCode(&*Start, &*After, Calls, BBs);
+ assert(!Calls.empty() && "slow path not found for safepoint poll");
+
+ // Record the fact we need a parsable state at the runtime call contained in
+ // the poll function. This is required so that the runtime knows how to
+ // parse the last frame when we actually take the safepoint (i.e. execute
+ // the slow path)
+ assert(ParsePointsNeeded.empty());
+ for (auto *CI : Calls) {
+ // No safepoint needed or wanted
+ if (!needsStatepoint(CI, TLI))
+ continue;
+
+ // These are likely runtime calls. Should we assert that via calling
+ // convention or something?
+ ParsePointsNeeded.push_back(CI);
+ }
+ assert(ParsePointsNeeded.size() <= Calls.size());
+}
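A minimal sketch, assuming the standard LLVM 12 C++ API: PlaceSafepoints expects the module to already define the void() function gc.safepoint_poll, whose body InsertSafepointPoll above inlines at each poll site before turning the contained slow-path call into a parse point. The snippet below shows how a frontend might provide such a poll body; the runtime symbol do_safepoint is a hypothetical placeholder, and a production poll would normally guard the slow-path call behind a cheap flag check.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"

// Emit "void @gc.safepoint_poll()" whose body calls a runtime slow path. That
// call is what the pass later rewrites into a statepoint after inlining this
// body at each poll location.
static void emitSafepointPoll(llvm::Module &M) {
  llvm::LLVMContext &Ctx = M.getContext();
  auto *VoidFnTy = llvm::FunctionType::get(llvm::Type::getVoidTy(Ctx), false);

  // Hypothetical runtime entry point; substitute the GC runtime's real hook.
  llvm::FunctionCallee SlowPath = M.getOrInsertFunction("do_safepoint", VoidFnTy);

  auto *Poll = llvm::Function::Create(VoidFnTy, llvm::Function::ExternalLinkage,
                                      "gc.safepoint_poll", &M);
  llvm::IRBuilder<> B(llvm::BasicBlock::Create(Ctx, "entry", Poll));
  B.CreateCall(SlowPath);
  B.CreateRetVoid();
}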
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
index 569b4b260e..dffeb7cc22 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Reassociate.cpp
@@ -1,925 +1,925 @@
-//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass reassociates commutative expressions in an order that is designed
-// to promote better constant propagation, GCSE, LICM, PRE, etc.
-//
-// For example: 4 + (x + 5) -> x + (4 + 5)
-//
-// In the implementation of this algorithm, constants are assigned rank = 0,
-// function arguments are rank = 1, and other values are assigned ranks
-// corresponding to the reverse post order traversal of the current function
-// (starting at 2), which effectively gives values in deep loops higher rank
-// than values not in loops.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Reassociate.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace reassociate;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "reassociate"
-
-STATISTIC(NumChanged, "Number of insts reassociated");
-STATISTIC(NumAnnihil, "Number of expr tree annihilated");
-STATISTIC(NumFactor , "Number of multiplies factored");
-
-#ifndef NDEBUG
-/// Print out the expression identified in the Ops list.
-static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
- Module *M = I->getModule();
- dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
- << *Ops[0].Op->getType() << '\t';
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- dbgs() << "[ ";
- Ops[i].Op->printAsOperand(dbgs(), false, M);
- dbgs() << ", #" << Ops[i].Rank << "] ";
- }
-}
-#endif
-
-/// Utility class representing a non-constant Xor-operand. We classify
-/// non-constant Xor-Operands into two categories:
-/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
-/// C2)
-/// C2.1) The operand is in the form of "X | C", where C is a non-zero
-/// constant.
-/// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
-/// operand as "E | 0"
-class llvm::reassociate::XorOpnd {
-public:
- XorOpnd(Value *V);
-
- bool isInvalid() const { return SymbolicPart == nullptr; }
- bool isOrExpr() const { return isOr; }
- Value *getValue() const { return OrigVal; }
- Value *getSymbolicPart() const { return SymbolicPart; }
- unsigned getSymbolicRank() const { return SymbolicRank; }
- const APInt &getConstPart() const { return ConstPart; }
-
- void Invalidate() { SymbolicPart = OrigVal = nullptr; }
- void setSymbolicRank(unsigned R) { SymbolicRank = R; }
-
-private:
- Value *OrigVal;
- Value *SymbolicPart;
- APInt ConstPart;
- unsigned SymbolicRank;
- bool isOr;
-};
-
-XorOpnd::XorOpnd(Value *V) {
- assert(!isa<ConstantInt>(V) && "No ConstantInt");
- OrigVal = V;
- Instruction *I = dyn_cast<Instruction>(V);
- SymbolicRank = 0;
-
- if (I && (I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::And)) {
- Value *V0 = I->getOperand(0);
- Value *V1 = I->getOperand(1);
- const APInt *C;
- if (match(V0, m_APInt(C)))
- std::swap(V0, V1);
-
- if (match(V1, m_APInt(C))) {
- ConstPart = *C;
- SymbolicPart = V0;
- isOr = (I->getOpcode() == Instruction::Or);
- return;
- }
- }
-
- // view the operand as "V | 0"
- SymbolicPart = V;
- ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
- isOr = true;
-}
-
-/// Return true if V is an instruction of the specified opcode and if it
-/// only has one use.
-static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && I->hasOneUse() && I->getOpcode() == Opcode)
- if (!isa<FPMathOperator>(I) || I->isFast())
- return cast<BinaryOperator>(I);
- return nullptr;
-}
-
-static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
- unsigned Opcode2) {
- auto *I = dyn_cast<Instruction>(V);
- if (I && I->hasOneUse() &&
- (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
- if (!isa<FPMathOperator>(I) || I->isFast())
- return cast<BinaryOperator>(I);
- return nullptr;
-}
-
-void ReassociatePass::BuildRankMap(Function &F,
- ReversePostOrderTraversal<Function*> &RPOT) {
- unsigned Rank = 2;
-
- // Assign distinct ranks to function arguments.
- for (auto &Arg : F.args()) {
- ValueRankMap[&Arg] = ++Rank;
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
- << "\n");
- }
-
- // Traverse basic blocks in ReversePostOrder.
- for (BasicBlock *BB : RPOT) {
- unsigned BBRank = RankMap[BB] = ++Rank << 16;
-
- // Walk the basic block, adding precomputed ranks for any instructions that
- // we cannot move. This ensures that the ranks for these instructions are
- // all different in the block.
- for (Instruction &I : *BB)
- if (mayBeMemoryDependent(I))
- ValueRankMap[&I] = ++BBRank;
- }
-}
-
-unsigned ReassociatePass::getRank(Value *V) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
- return 0; // Otherwise it's a global or constant, rank 0.
- }
-
- if (unsigned Rank = ValueRankMap[I])
- return Rank; // Rank already known?
-
- // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
- // we can reassociate expressions for code motion! Since we do not recurse
- // for PHI nodes, we cannot have infinite recursion here, because there
- // cannot be loops in the value graph that do not go through PHI nodes.
- unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
- for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
- Rank = std::max(Rank, getRank(I->getOperand(i)));
-
- // If this is a 'not' or 'neg' instruction, do not count it for rank. This
- // assures us that X and ~X will have the same rank.
- if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
- !match(I, m_FNeg(m_Value())))
- ++Rank;
-
- LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
- << "\n");
-
- return ValueRankMap[I] = Rank;
-}
-
-// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
-void ReassociatePass::canonicalizeOperands(Instruction *I) {
- assert(isa<BinaryOperator>(I) && "Expected binary operator.");
- assert(I->isCommutative() && "Expected commutative operator.");
-
- Value *LHS = I->getOperand(0);
- Value *RHS = I->getOperand(1);
- if (LHS == RHS || isa<Constant>(RHS))
- return;
- if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
- cast<BinaryOperator>(I)->swapOperands();
-}
-
-static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
- else {
- BinaryOperator *Res =
- BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
- Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
- return Res;
- }
-}
-
-static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
- else {
- BinaryOperator *Res =
- BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
- Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
- return Res;
- }
-}
-
-static Instruction *CreateNeg(Value *S1, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
- if (S1->getType()->isIntOrIntVectorTy())
- return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
-
- if (auto *FMFSource = dyn_cast<Instruction>(FlagsOp))
- return UnaryOperator::CreateFNegFMF(S1, FMFSource, Name, InsertBefore);
-
- return UnaryOperator::CreateFNeg(S1, Name, InsertBefore);
-}
-
-/// Replace 0-X with X*-1.
-static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
- assert((isa<UnaryOperator>(Neg) || isa<BinaryOperator>(Neg)) &&
- "Expected a Negate!");
- // FIXME: It's not safe to lower a unary FNeg into a FMul by -1.0.
- unsigned OpNo = isa<BinaryOperator>(Neg) ? 1 : 0;
- Type *Ty = Neg->getType();
- Constant *NegOne = Ty->isIntOrIntVectorTy() ?
- ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
-
- BinaryOperator *Res = CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg, Neg);
- Neg->setOperand(OpNo, Constant::getNullValue(Ty)); // Drop use of op.
- Res->takeName(Neg);
- Neg->replaceAllUsesWith(Res);
- Res->setDebugLoc(Neg->getDebugLoc());
- return Res;
-}
-
-/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
-/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
-/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
-/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
-/// even x in Bitwidth-bit arithmetic.
-static unsigned CarmichaelShift(unsigned Bitwidth) {
- if (Bitwidth < 3)
- return Bitwidth - 1;
- return Bitwidth - 2;
-}
-
-/// Add the extra weight 'RHS' to the existing weight 'LHS',
-/// reducing the combined weight using any special properties of the operation.
-/// The existing weight LHS represents the computation X op X op ... op X where
-/// X occurs LHS times. The combined weight represents X op X op ... op X with
-/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
-/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
-/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
-static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
- // If we were working with infinite precision arithmetic then the combined
- // weight would be LHS + RHS. But we are using finite precision arithmetic,
- // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
- // for nilpotent operations and addition, but not for idempotent operations
- // and multiplication), so it is important to correctly reduce the combined
- // weight back into range if wrapping would be wrong.
-
- // If RHS is zero then the weight didn't change.
- if (RHS.isMinValue())
- return;
- // If LHS is zero then the combined weight is RHS.
- if (LHS.isMinValue()) {
- LHS = RHS;
- return;
- }
- // From this point on we know that neither LHS nor RHS is zero.
-
- if (Instruction::isIdempotent(Opcode)) {
- // Idempotent means X op X === X, so any non-zero weight is equivalent to a
- // weight of 1. Keeping weights at zero or one also means that wrapping is
- // not a problem.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- return; // Return a weight of 1.
- }
- if (Instruction::isNilpotent(Opcode)) {
- // Nilpotent means X op X === 0, so reduce weights modulo 2.
- assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
- LHS = 0; // 1 + 1 === 0 modulo 2.
- return;
- }
- if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
- // TODO: Reduce the weight by exploiting nsw/nuw?
- LHS += RHS;
- return;
- }
-
- assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
- "Unknown associative operation!");
- unsigned Bitwidth = LHS.getBitWidth();
- // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
- // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
- // bit number x, since either x is odd in which case x^CM = 1, or x is even in
- // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
- // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
- // which by a happy accident means that they can always be represented using
- // Bitwidth bits.
- // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
- // the Carmichael number).
- if (Bitwidth > 3) {
- /// CM - The value of Carmichael's lambda function.
- APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
- // Any weight W >= Threshold can be replaced with W - CM.
- APInt Threshold = CM + Bitwidth;
- assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
- // For Bitwidth 4 or more the following sum does not overflow.
- LHS += RHS;
- while (LHS.uge(Threshold))
- LHS -= CM;
- } else {
- // To avoid problems with overflow do everything the same as above but using
- // a larger type.
- unsigned CM = 1U << CarmichaelShift(Bitwidth);
- unsigned Threshold = CM + Bitwidth;
- assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
- "Weights not reduced!");
- unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
- while (Total >= Threshold)
- Total -= CM;
- LHS = Total;
- }
-}
-
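A small, self-contained sketch (plain C++, no LLVM dependencies) of the Mul/FMul weight reduction performed above; both branches of IncorporateWeight compute the same thing, the APInt branch merely avoids overflow. At Bitwidth 4 the Carmichael value is CM = 2^CarmichaelShift(4) = 4, so combining weights 5 and 3 gives 8, which reduces to 4; this is sound because x^8 == x^4 for every 4-bit x (odd x has x^4 == 1 mod 16, even x has x^4 == 0).

#include <cassert>

// Keep a combined multiplication weight in [0, CM + Bitwidth), mirroring
// IncorporateWeight: repeatedly subtract CM = 2^CarmichaelShift(Bitwidth).
static unsigned reduceMulWeight(unsigned LHS, unsigned RHS, unsigned Bitwidth) {
  unsigned Shift = Bitwidth < 3 ? Bitwidth - 1 : Bitwidth - 2;
  unsigned CM = 1u << Shift;
  unsigned Threshold = CM + Bitwidth;
  unsigned Total = LHS + RHS;
  while (Total >= Threshold)
    Total -= CM;
  return Total;
}

int main() {
  // Weights 5 and 3 at bitwidth 4: 5 + 3 = 8 >= CM + Bitwidth = 8, so one
  // subtraction of CM = 4 leaves a reduced weight of 4.
  assert(reduceMulWeight(5, 3, 4) == 4);
  return 0;
}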
-using RepeatedValue = std::pair<Value*, APInt>;
-
-/// Given an associative binary expression, return the leaf
-/// nodes in Ops along with their weights (how many times the leaf occurs). The
-/// original expression is the same as
-/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
-/// op
-/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
-/// op
-/// ...
-/// op
-/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
-///
-/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
-///
-/// This routine may modify the function, in which case it returns 'true'. The
-/// changes it makes may well be destructive, changing the value computed by 'I'
-/// to something completely different. Thus if the routine returns 'true' then
-/// you MUST either replace I with a new expression computed from the Ops array,
-/// or use RewriteExprTree to put the values back in.
-///
-/// A leaf node is either not a binary operation of the same kind as the root
-/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
-/// opcode), or is the same kind of binary operator but has a use which either
-/// does not belong to the expression, or does belong to the expression but is
-/// a leaf node. Every leaf node has at least one use that is a non-leaf node
-/// of the expression, while for non-leaf nodes (except for the root 'I') every
-/// use is a non-leaf node of the expression.
-///
-/// For example:
-/// expression graph node names
-///
-/// + | I
-/// / \ |
-/// + + | A, B
-/// / \ / \ |
-/// * + * | C, D, E
-/// / \ / \ / \ |
-/// + * | F, G
-///
-/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
-/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
-///
-/// The expression is maximal: if some instruction is a binary operator of the
-/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
-/// then the instruction also belongs to the expression, is not a leaf node of
-/// it, and its operands also belong to the expression (but may be leaf nodes).
-///
-/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
-/// order to ensure that every non-root node in the expression has *exactly one*
-/// use by a non-leaf node of the expression. This destruction means that the
-/// caller MUST either replace 'I' with a new expression or use something like
-/// RewriteExprTree to put the values back in if the routine indicates that it
-/// made a change by returning 'true'.
-///
-/// In the above example either the right operand of A or the left operand of B
-/// will be replaced by undef. If it is B's operand then this gives:
-///
-/// + | I
-/// / \ |
-/// + + | A, B - operand of B replaced with undef
-/// / \ \ |
-/// * + * | C, D, E
-/// / \ / \ / \ |
-/// + * | F, G
-///
-/// Note that such undef operands can only be reached by passing through 'I'.
-/// For example, if you visit operands recursively starting from a leaf node
-/// then you will never see such an undef operand unless you get back to 'I',
-/// which requires passing through a phi node.
-///
-/// Note that this routine may also mutate binary operators of the wrong type
-/// that have all uses inside the expression (i.e. only used by non-leaf nodes
-/// of the expression) if it can turn them into binary operators of the right
-/// type and thus make the expression bigger.
-static bool LinearizeExprTree(Instruction *I,
- SmallVectorImpl<RepeatedValue> &Ops) {
- assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
- "Expected a UnaryOperator or BinaryOperator!");
- LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
- unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
- unsigned Opcode = I->getOpcode();
- assert(I->isAssociative() && I->isCommutative() &&
- "Expected an associative and commutative operation!");
-
- // Visit all operands of the expression, keeping track of their weight (the
- // number of paths from the expression root to the operand, or if you like
- // the number of times that operand occurs in the linearized expression).
- // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
- // while A has weight two.
-
- // Worklist of non-leaf nodes (their operands are in the expression too) along
- // with their weights, representing a certain number of paths to the operator.
- // If an operator occurs in the worklist multiple times then we found multiple
- // ways to get to it.
- SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight)
- Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
- bool Changed = false;
-
- // Leaves of the expression are values that either aren't the right kind of
- // operation (eg: a constant, or a multiply in an add tree), or are, but have
- // some uses that are not inside the expression. For example, in I = X + X,
- // X = A + B, the value X has two uses (by I) that are in the expression. If
- // X has any other uses, for example in a return instruction, then we consider
- // X to be a leaf, and won't analyze it further. When we first visit a value,
- // if it has more than one use then at first we conservatively consider it to
- // be a leaf. Later, as the expression is explored, we may discover some more
- // uses of the value from inside the expression. If all uses turn out to be
- // from within the expression (and the value is a binary operator of the right
- // kind) then the value is no longer considered to be a leaf, and its operands
- // are explored.
-
- // Leaves - Keeps track of the set of putative leaves as well as the number of
- // paths to each leaf seen so far.
- using LeafMap = DenseMap<Value *, APInt>;
- LeafMap Leaves; // Leaf -> Total weight so far.
- SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
-
-#ifndef NDEBUG
- SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
-#endif
- while (!Worklist.empty()) {
- std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
- I = P.first; // We examine the operands of this binary operator.
-
- for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
- Value *Op = I->getOperand(OpIdx);
- APInt Weight = P.second; // Number of paths to this operand.
- LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
- assert(!Op->use_empty() && "No uses, so how did we get to it?!");
-
- // If this is a binary operation of the right kind with only one use then
- // add its operands to the expression.
- if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- assert(Visited.insert(Op).second && "Not first visit!");
- LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
- Worklist.push_back(std::make_pair(BO, Weight));
- continue;
- }
-
- // Appears to be a leaf. Is the operand already in the set of leaves?
- LeafMap::iterator It = Leaves.find(Op);
- if (It == Leaves.end()) {
- // Not in the leaf map. Must be the first time we saw this operand.
- assert(Visited.insert(Op).second && "Not first visit!");
- if (!Op->hasOneUse()) {
- // This value has uses not accounted for by the expression, so it is
- // not safe to modify. Mark it as being a leaf.
- LLVM_DEBUG(dbgs()
- << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
- LeafOrder.push_back(Op);
- Leaves[Op] = Weight;
- continue;
- }
- // No uses outside the expression, try morphing it.
- } else {
- // Already in the leaf map.
- assert(It != Leaves.end() && Visited.count(Op) &&
- "In leaf map but not visited!");
-
- // Update the number of paths to the leaf.
- IncorporateWeight(It->second, Weight, Opcode);
-
-#if 0 // TODO: Re-enable once PR13021 is fixed.
- // The leaf already has one use from inside the expression. As we want
- // exactly one such use, drop this new use of the leaf.
- assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
- I->setOperand(OpIdx, UndefValue::get(I->getType()));
- Changed = true;
-
- // If the leaf is a binary operation of the right kind and we now see
- // that its multiple original uses were in fact all by nodes belonging
- // to the expression, then no longer consider it to be a leaf and add
- // its operands to the expression.
- if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
- LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
- Worklist.push_back(std::make_pair(BO, It->second));
- Leaves.erase(It);
- continue;
- }
-#endif
-
- // If we still have uses that are not accounted for by the expression
- // then it is not safe to modify the value.
- if (!Op->hasOneUse())
- continue;
-
- // No uses outside the expression, try morphing it.
- Weight = It->second;
- Leaves.erase(It); // Since the value may be morphed below.
- }
-
- // At this point we have a value which, first of all, is not a binary
- // expression of the right kind, and secondly, is only used inside the
- // expression. This means that it can safely be modified. See if we
- // can usefully morph it into an expression of the right kind.
- assert((!isa<Instruction>(Op) ||
- cast<Instruction>(Op)->getOpcode() != Opcode
- || (isa<FPMathOperator>(Op) &&
- !cast<Instruction>(Op)->isFast())) &&
- "Should have been handled above!");
- assert(Op->hasOneUse() && "Has uses outside the expression tree!");
-
- // If this is a multiply expression, turn any internal negations into
- // multiplies by -1 so they can be reassociated.
- if (Instruction *Tmp = dyn_cast<Instruction>(Op))
- if ((Opcode == Instruction::Mul && match(Tmp, m_Neg(m_Value()))) ||
- (Opcode == Instruction::FMul && match(Tmp, m_FNeg(m_Value())))) {
- LLVM_DEBUG(dbgs()
- << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
- Tmp = LowerNegateToMultiply(Tmp);
- LLVM_DEBUG(dbgs() << *Tmp << '\n');
- Worklist.push_back(std::make_pair(Tmp, Weight));
- Changed = true;
- continue;
- }
-
- // Failed to morph into an expression of the right type. This really is
- // a leaf.
- LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
- assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
- LeafOrder.push_back(Op);
- Leaves[Op] = Weight;
- }
- }
-
- // The leaves, repeated according to their weights, represent the linearized
- // form of the expression.
- for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
- Value *V = LeafOrder[i];
- LeafMap::iterator It = Leaves.find(V);
- if (It == Leaves.end())
- // Node initially thought to be a leaf wasn't.
- continue;
- assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
- APInt Weight = It->second;
- if (Weight.isMinValue())
- // Leaf already output or weight reduction eliminated it.
- continue;
- // Ensure the leaf is only output once.
- It->second = 0;
- Ops.push_back(std::make_pair(V, Weight));
- }
-
- // For nilpotent operations or addition there may be no operands, for example
- // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
- // in both cases the weight reduces to 0 causing the value to be skipped.
- if (Ops.empty()) {
- Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
- assert(Identity && "Associative operation without identity!");
- Ops.emplace_back(Identity, APInt(Bitwidth, 1));
- }
-
- return Changed;
-}
-
-/// Now that the operands for this expression tree are
-/// linearized and optimized, emit them in-order.
-void ReassociatePass::RewriteExprTree(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- assert(Ops.size() > 1 && "Single values should be used directly!");
-
- // Since our optimizations should never increase the number of operations, the
- // new expression can usually be written reusing the existing binary operators
- // from the original expression tree, without creating any new instructions,
- // though the rewritten expression may have a completely different topology.
- // We take care to not change anything if the new expression will be the same
- // as the original. If more than trivial changes (like commuting operands)
- // were made then we are obliged to clear out any optional subclass data like
- // nsw flags.
-
- /// NodesToRewrite - Nodes from the original expression available for writing
- /// the new expression into.
- SmallVector<BinaryOperator*, 8> NodesToRewrite;
- unsigned Opcode = I->getOpcode();
- BinaryOperator *Op = I;
-
- /// NotRewritable - The operands being written will be the leaves of the new
- /// expression and must not be used as inner nodes (via NodesToRewrite) by
- /// mistake. Inner nodes are always reassociable, and usually leaves are not
- /// (if they were they would have been incorporated into the expression and so
- /// would not be leaves), so most of the time there is no danger of this. But
- /// in rare cases a leaf may become reassociable if an optimization kills uses
- /// of it, or it may momentarily become reassociable during rewriting (below)
-/// due to it being removed as an operand of one of its uses. Ensure that misuse
- /// of leaf nodes as inner nodes cannot occur by remembering all of the future
- /// leaves and refusing to reuse any of them as inner nodes.
- SmallPtrSet<Value*, 8> NotRewritable;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- NotRewritable.insert(Ops[i].Op);
-
- // ExpressionChanged - Non-null if the rewritten expression differs from the
- // original in some non-trivial way, requiring the clearing of optional flags.
- // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
- BinaryOperator *ExpressionChanged = nullptr;
- for (unsigned i = 0; ; ++i) {
- // The last operation (which comes earliest in the IR) is special as both
- // operands will come from Ops, rather than just one with the other being
- // a subexpression.
- if (i+2 == Ops.size()) {
- Value *NewLHS = Ops[i].Op;
- Value *NewRHS = Ops[i+1].Op;
- Value *OldLHS = Op->getOperand(0);
- Value *OldRHS = Op->getOperand(1);
-
- if (NewLHS == OldLHS && NewRHS == OldRHS)
- // Nothing changed, leave it alone.
- break;
-
- if (NewLHS == OldRHS && NewRHS == OldLHS) {
- // The order of the operands was reversed. Swap them.
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- Op->swapOperands();
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- MadeChange = true;
- ++NumChanged;
- break;
- }
-
- // The new operation differs non-trivially from the original. Overwrite
- // the old operands with the new ones.
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- if (NewLHS != OldLHS) {
- BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(0, NewLHS);
- }
- if (NewRHS != OldRHS) {
- BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(1, NewRHS);
- }
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
-
- ExpressionChanged = Op;
- MadeChange = true;
- ++NumChanged;
-
- break;
- }
-
- // Not the last operation. The left-hand side will be a sub-expression
- // while the right-hand side will be the current element of Ops.
- Value *NewRHS = Ops[i].Op;
- if (NewRHS != Op->getOperand(1)) {
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- if (NewRHS == Op->getOperand(0)) {
- // The new right-hand side was already present as the left operand. If
- // we are lucky then swapping the operands will sort out both of them.
- Op->swapOperands();
- } else {
- // Overwrite with the new right-hand side.
- BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
- if (BO && !NotRewritable.count(BO))
- NodesToRewrite.push_back(BO);
- Op->setOperand(1, NewRHS);
- ExpressionChanged = Op;
- }
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- MadeChange = true;
- ++NumChanged;
- }
-
- // Now deal with the left-hand side. If this is already an operation node
- // from the original expression then just rewrite the rest of the expression
- // into it.
- BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
- if (BO && !NotRewritable.count(BO)) {
- Op = BO;
- continue;
- }
-
- // Otherwise, grab a spare node from the original expression and use that as
- // the left-hand side. If there are no nodes left then the optimizers made
- // an expression with more nodes than the original! This usually means that
- // they did something stupid but it might mean that the problem was just too
- // hard (finding the minimal number of multiplications needed to realize a
- // multiplication expression is NP-complete). Whatever the reason, smart or
- // stupid, create a new node if there are none left.
- BinaryOperator *NewOp;
- if (NodesToRewrite.empty()) {
- Constant *Undef = UndefValue::get(I->getType());
- NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
- Undef, Undef, "", I);
- if (NewOp->getType()->isFPOrFPVectorTy())
- NewOp->setFastMathFlags(I->getFastMathFlags());
- } else {
- NewOp = NodesToRewrite.pop_back_val();
- }
-
- LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
- Op->setOperand(0, NewOp);
- LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
- ExpressionChanged = Op;
- MadeChange = true;
- ++NumChanged;
- Op = NewOp;
- }
-
- // If the expression changed non-trivially then clear out all subclass data
- // starting from the operator specified in ExpressionChanged, and compactify
- // the operators to just before the expression root to guarantee that the
- // expression tree is dominated by all of Ops.
- if (ExpressionChanged)
- do {
- // Preserve FastMathFlags.
- if (isa<FPMathOperator>(I)) {
- FastMathFlags Flags = I->getFastMathFlags();
- ExpressionChanged->clearSubclassOptionalData();
- ExpressionChanged->setFastMathFlags(Flags);
- } else
- ExpressionChanged->clearSubclassOptionalData();
-
- if (ExpressionChanged == I)
- break;
-
- // Discard any debug info related to the expressions that have changed (we
- // can leave debug info related to the root, since the result of the
- // expression tree should be the same even after reassociation).
- replaceDbgUsesWithUndef(ExpressionChanged);
-
- ExpressionChanged->moveBefore(I);
- ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
- } while (true);
-
- // Throw away any left over nodes from the original expression.
- for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
- RedoInsts.insert(NodesToRewrite[i]);
-}
-
-/// Insert instructions before the instruction pointed to by BI,
-/// that computes the negative version of the value specified. The negative
-/// version of the value is returned, and BI is left pointing at the instruction
-/// that should be processed next by the reassociation pass.
-/// Also add intermediate instructions to the redo list that are modified while
-/// pushing the negates through adds. These will be revisited to see if
-/// additional opportunities have been exposed.
-static Value *NegateValue(Value *V, Instruction *BI,
- ReassociatePass::OrderedSet &ToRedo) {
- if (auto *C = dyn_cast<Constant>(V))
- return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
- ConstantExpr::getNeg(C);
-
- // We are trying to expose opportunity for reassociation. One of the things
- // that we want to do to achieve this is to push a negation as deep into an
- // expression chain as possible, to expose the add instructions. In practice,
- // this means that we turn this:
- // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
- // so that a later expression Y = 12+X could get reassociated with the -12 to eliminate
- // the constants. We assume that instcombine will clean up the mess later if
- // we introduce tons of unnecessary negation instructions.
- //
- if (BinaryOperator *I =
- isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
- // Push the negates through the add.
- I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
- I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
- if (I->getOpcode() == Instruction::Add) {
- I->setHasNoUnsignedWrap(false);
- I->setHasNoSignedWrap(false);
- }
-
- // We must move the add instruction here, because the neg instructions do
- // not dominate the old add instruction in general. By moving it, we are
- // assured that the neg instructions we just inserted dominate the
- // instruction we are about to insert after them.
- //
- I->moveBefore(BI);
- I->setName(I->getName()+".neg");
-
- // Add the intermediate negates to the redo list as processing them later
- // could expose more reassociating opportunities.
- ToRedo.insert(I);
- return I;
- }
-
- // Okay, we need to materialize a negated version of V with an instruction.
- // Scan the use lists of V to see if we have one already.
- for (User *U : V->users()) {
- if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
- continue;
-
- // We found one! Now we have to make sure that the definition dominates
- // this use. We do this by moving it to the entry block (if it is a
- // non-instruction value) or right after the definition. These negates will
- // be zapped by reassociate later, so we don't need much finesse here.
- Instruction *TheNeg = cast<Instruction>(U);
-
- // Verify that the negate is in this function, V might be a constant expr.
- if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
- continue;
-
- bool FoundCatchSwitch = false;
-
- BasicBlock::iterator InsertPt;
- if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
- InsertPt = II->getNormalDest()->begin();
- } else {
- InsertPt = ++InstInput->getIterator();
- }
-
- const BasicBlock *BB = InsertPt->getParent();
-
- // Make sure we don't move anything before PHIs or exception
- // handling pads.
- while (InsertPt != BB->end() && (isa<PHINode>(InsertPt) ||
- InsertPt->isEHPad())) {
- if (isa<CatchSwitchInst>(InsertPt))
- // A catchswitch cannot have anything in the block except
- // itself and PHIs. We'll bail out below.
- FoundCatchSwitch = true;
- ++InsertPt;
- }
- } else {
- InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
- }
-
- // We found a catchswitch in the block where we want to move the
- // neg. We cannot move anything into that block. Bail and just
- // create the neg before BI, as if we hadn't found an existing
- // neg.
- if (FoundCatchSwitch)
- break;
-
- TheNeg->moveBefore(&*InsertPt);
- if (TheNeg->getOpcode() == Instruction::Sub) {
- TheNeg->setHasNoUnsignedWrap(false);
- TheNeg->setHasNoSignedWrap(false);
- } else {
- TheNeg->andIRFlags(BI);
- }
- ToRedo.insert(TheNeg);
- return TheNeg;
- }
-
- // Insert a 'neg' instruction that subtracts the value from zero to get the
- // negation.
- Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
- ToRedo.insert(NewNeg);
- return NewNeg;
-}
-
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE, etc.
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of the current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
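As a usage note for the pass described in the header above, reassociation can be scheduled on its own through the new pass manager. The following is a minimal sketch assuming the standard PassBuilder setup from this LLVM tree; it runs ReassociatePass over a single function so that an expression such as 4 + (x + 5) can be rewritten with the constants adjacent and folded.

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Reassociate.h"

// Minimal driver: register the usual analyses and run reassociation once.
static void reassociateFunction(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::ReassociatePass());
  FPM.run(F, FAM);
}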
+#include "llvm/Transforms/Scalar/Reassociate.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace reassociate;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "reassociate"
+
+STATISTIC(NumChanged, "Number of insts reassociated");
+STATISTIC(NumAnnihil, "Number of expr tree annihilated");
+STATISTIC(NumFactor , "Number of multiplies factored");
+
+#ifndef NDEBUG
+/// Print out the expression identified in the Ops list.
+static void PrintOps(Instruction *I, const SmallVectorImpl<ValueEntry> &Ops) {
+ Module *M = I->getModule();
+ dbgs() << Instruction::getOpcodeName(I->getOpcode()) << " "
+ << *Ops[0].Op->getType() << '\t';
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ dbgs() << "[ ";
+ Ops[i].Op->printAsOperand(dbgs(), false, M);
+ dbgs() << ", #" << Ops[i].Rank << "] ";
+ }
+}
+#endif
+
+/// Utility class representing a non-constant Xor-operand. We classify
+/// non-constant Xor-Operands into two categories:
+/// C1) The operand is in the form "X & C", where C is a constant and C != ~0
+/// C2)
+/// C2.1) The operand is in the form of "X | C", where C is a non-zero
+/// constant.
+/// C2.2) Any operand E which doesn't fall into C1 or C2.1; we view such an
+/// operand as "E | 0"
+class llvm::reassociate::XorOpnd {
+public:
+ XorOpnd(Value *V);
+
+ bool isInvalid() const { return SymbolicPart == nullptr; }
+ bool isOrExpr() const { return isOr; }
+ Value *getValue() const { return OrigVal; }
+ Value *getSymbolicPart() const { return SymbolicPart; }
+ unsigned getSymbolicRank() const { return SymbolicRank; }
+ const APInt &getConstPart() const { return ConstPart; }
+
+ void Invalidate() { SymbolicPart = OrigVal = nullptr; }
+ void setSymbolicRank(unsigned R) { SymbolicRank = R; }
+
+private:
+ Value *OrigVal;
+ Value *SymbolicPart;
+ APInt ConstPart;
+ unsigned SymbolicRank;
+ bool isOr;
+};
+
+XorOpnd::XorOpnd(Value *V) {
+ assert(!isa<ConstantInt>(V) && "No ConstantInt");
+ OrigVal = V;
+ Instruction *I = dyn_cast<Instruction>(V);
+ SymbolicRank = 0;
+
+ if (I && (I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::And)) {
+ Value *V0 = I->getOperand(0);
+ Value *V1 = I->getOperand(1);
+ const APInt *C;
+ if (match(V0, m_APInt(C)))
+ std::swap(V0, V1);
+
+ if (match(V1, m_APInt(C))) {
+ ConstPart = *C;
+ SymbolicPart = V0;
+ isOr = (I->getOpcode() == Instruction::Or);
+ return;
+ }
+ }
+
+ // view the operand as "V | 0"
+ SymbolicPart = V;
+ ConstPart = APInt::getNullValue(V->getType()->getScalarSizeInBits());
+ isOr = true;
+}
+
+/// Return true if V is an instruction of the specified opcode and if it
+/// only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() && I->getOpcode() == Opcode)
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode1,
+ unsigned Opcode2) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (I && I->hasOneUse() &&
+ (I->getOpcode() == Opcode1 || I->getOpcode() == Opcode2))
+ if (!isa<FPMathOperator>(I) || I->isFast())
+ return cast<BinaryOperator>(I);
+ return nullptr;
+}
+
+void ReassociatePass::BuildRankMap(Function &F,
+ ReversePostOrderTraversal<Function*> &RPOT) {
+ unsigned Rank = 2;
+
+ // Assign distinct ranks to function arguments.
+ for (auto &Arg : F.args()) {
+ ValueRankMap[&Arg] = ++Rank;
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << Arg.getName() << "] = " << Rank
+ << "\n");
+ }
+
+ // Traverse basic blocks in ReversePostOrder.
+ for (BasicBlock *BB : RPOT) {
+ unsigned BBRank = RankMap[BB] = ++Rank << 16;
+
+ // Walk the basic block, adding precomputed ranks for any instructions that
+ // we cannot move. This ensures that the ranks for these instructions are
+ // all different in the block.
+ for (Instruction &I : *BB)
+ if (mayBeMemoryDependent(I))
+ ValueRankMap[&I] = ++BBRank;
+ }
+}
+
+unsigned ReassociatePass::getRank(Value *V) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ if (isa<Argument>(V)) return ValueRankMap[V]; // Function argument.
+ return 0; // Otherwise it's a global or constant, rank 0.
+ }
+
+ if (unsigned Rank = ValueRankMap[I])
+ return Rank; // Rank already known?
+
+ // If this is an expression, return 1+MAX(rank(LHS), rank(RHS)) so that
+ // we can reassociate expressions for code motion! Since we do not recurse
+ // for PHI nodes, we cannot have infinite recursion here, because there
+ // cannot be loops in the value graph that do not go through PHI nodes.
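+ // For example, if arguments A and B have ranks 3 and 4, then "A + B" gets
+ // rank 5: one more than its highest-ranked operand.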
+ unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
+ for (unsigned i = 0, e = I->getNumOperands(); i != e && Rank != MaxRank; ++i)
+ Rank = std::max(Rank, getRank(I->getOperand(i)));
+
+ // If this is a 'not' or 'neg' instruction, do not count it for rank. This
+ // assures us that X and ~X will have the same rank.
+ if (!match(I, m_Not(m_Value())) && !match(I, m_Neg(m_Value())) &&
+ !match(I, m_FNeg(m_Value())))
+ ++Rank;
+
+ LLVM_DEBUG(dbgs() << "Calculated Rank[" << V->getName() << "] = " << Rank
+ << "\n");
+
+ return ValueRankMap[I] = Rank;
+}
+
+// Canonicalize constants to RHS. Otherwise, sort the operands by rank.
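+// For example, "add i32 7, %x" is rewritten as "add i32 %x, 7".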
+void ReassociatePass::canonicalizeOperands(Instruction *I) {
+ assert(isa<BinaryOperator>(I) && "Expected binary operator.");
+ assert(I->isCommutative() && "Expected commutative operator.");
+
+ Value *LHS = I->getOperand(0);
+ Value *RHS = I->getOperand(1);
+ if (LHS == RHS || isa<Constant>(RHS))
+ return;
+ if (isa<Constant>(LHS) || getRank(RHS) < getRank(LHS))
+ cast<BinaryOperator>(I)->swapOperands();
+}
+
+static BinaryOperator *CreateAdd(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateAdd(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFAdd(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateMul(S1, S2, Name, InsertBefore);
+ else {
+ BinaryOperator *Res =
+ BinaryOperator::CreateFMul(S1, S2, Name, InsertBefore);
+ Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
+ return Res;
+ }
+}
+
+static Instruction *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
+ if (S1->getType()->isIntOrIntVectorTy())
+ return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
+
+ if (auto *FMFSource = dyn_cast<Instruction>(FlagsOp))
+ return UnaryOperator::CreateFNegFMF(S1, FMFSource, Name, InsertBefore);
+
+ return UnaryOperator::CreateFNeg(S1, Name, InsertBefore);
+}
+
+/// Replace 0-X with X*-1.
+static BinaryOperator *LowerNegateToMultiply(Instruction *Neg) {
+ assert((isa<UnaryOperator>(Neg) || isa<BinaryOperator>(Neg)) &&
+ "Expected a Negate!");
+ // FIXME: It's not safe to lower a unary FNeg into a FMul by -1.0.
+ unsigned OpNo = isa<BinaryOperator>(Neg) ? 1 : 0;
+ Type *Ty = Neg->getType();
+ Constant *NegOne = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::getAllOnesValue(Ty) : ConstantFP::get(Ty, -1.0);
+
+ BinaryOperator *Res = CreateMul(Neg->getOperand(OpNo), NegOne, "", Neg, Neg);
+ Neg->setOperand(OpNo, Constant::getNullValue(Ty)); // Drop use of op.
+ Res->takeName(Neg);
+ Neg->replaceAllUsesWith(Res);
+ Res->setDebugLoc(Neg->getDebugLoc());
+ return Res;
+}
+
+/// Returns k such that lambda(2^Bitwidth) = 2^k, where lambda is the Carmichael
+/// function. This means that x^(2^k) === 1 mod 2^Bitwidth for
+/// every odd x, i.e. x^(2^k) = 1 for every odd x in Bitwidth-bit arithmetic.
+/// Note that 0 <= k < Bitwidth, and if Bitwidth > 3 then x^(2^k) = 0 for every
+/// even x in Bitwidth-bit arithmetic.
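+/// For example, CarmichaelShift(32) == 30, i.e. x^(2^30) == 1 (mod 2^32) for
+/// every odd 32-bit value x.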
+static unsigned CarmichaelShift(unsigned Bitwidth) {
+ if (Bitwidth < 3)
+ return Bitwidth - 1;
+ return Bitwidth - 2;
+}
+
+/// Add the extra weight 'RHS' to the existing weight 'LHS',
+/// reducing the combined weight using any special properties of the operation.
+/// The existing weight LHS represents the computation X op X op ... op X where
+/// X occurs LHS times. The combined weight represents X op X op ... op X with
+/// X occurring LHS + RHS times. If op is "Xor" for example then the combined
+/// operation is equivalent to X if LHS + RHS is odd, or 0 if LHS + RHS is even;
+/// the routine returns 1 in LHS in the first case, and 0 in LHS in the second.
+static void IncorporateWeight(APInt &LHS, const APInt &RHS, unsigned Opcode) {
+ // If we were working with infinite precision arithmetic then the combined
+ // weight would be LHS + RHS. But we are using finite precision arithmetic,
+ // and the APInt sum LHS + RHS may not be correct if it wraps (it is correct
+ // for nilpotent operations and addition, but not for idempotent operations
+ // and multiplication), so it is important to correctly reduce the combined
+ // weight back into range if wrapping would be wrong.
+
+ // If RHS is zero then the weight didn't change.
+ if (RHS.isMinValue())
+ return;
+ // If LHS is zero then the combined weight is RHS.
+ if (LHS.isMinValue()) {
+ LHS = RHS;
+ return;
+ }
+ // From this point on we know that neither LHS nor RHS is zero.
+
+ if (Instruction::isIdempotent(Opcode)) {
+ // Idempotent means X op X === X, so any non-zero weight is equivalent to a
+ // weight of 1. Keeping weights at zero or one also means that wrapping is
+ // not a problem.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ return; // Return a weight of 1.
+ }
+ if (Instruction::isNilpotent(Opcode)) {
+ // Nilpotent means X op X === 0, so reduce weights modulo 2.
+ assert(LHS == 1 && RHS == 1 && "Weights not reduced!");
+ LHS = 0; // 1 + 1 === 0 modulo 2.
+ return;
+ }
+ if (Opcode == Instruction::Add || Opcode == Instruction::FAdd) {
+ // TODO: Reduce the weight by exploiting nsw/nuw?
+ LHS += RHS;
+ return;
+ }
+
+ assert((Opcode == Instruction::Mul || Opcode == Instruction::FMul) &&
+ "Unknown associative operation!");
+ unsigned Bitwidth = LHS.getBitWidth();
+ // If CM is the Carmichael number then a weight W satisfying W >= CM+Bitwidth
+ // can be replaced with W-CM. That's because x^W=x^(W-CM) for every Bitwidth
+ // bit number x, since either x is odd in which case x^CM = 1, or x is even in
+ // which case both x^W and x^(W - CM) are zero. By subtracting off multiples
+ // of CM like this weights can always be reduced to the range [0, CM+Bitwidth)
+ // which by a happy accident means that they can always be represented using
+ // Bitwidth bits.
+ // TODO: Reduce the weight by exploiting nsw/nuw? (Could do much better than
+ // the Carmichael number).
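+ // For example, with Bitwidth == 8 we have CM == 64 and Threshold == 72, so a
+ // combined weight of 100 reduces to 36: x^100 == x^36 for every 8-bit x.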
+ if (Bitwidth > 3) {
+ /// CM - The value of Carmichael's lambda function.
+ APInt CM = APInt::getOneBitSet(Bitwidth, CarmichaelShift(Bitwidth));
+ // Any weight W >= Threshold can be replaced with W - CM.
+ APInt Threshold = CM + Bitwidth;
+ assert(LHS.ult(Threshold) && RHS.ult(Threshold) && "Weights not reduced!");
+ // For Bitwidth 4 or more the following sum does not overflow.
+ LHS += RHS;
+ while (LHS.uge(Threshold))
+ LHS -= CM;
+ } else {
+ // To avoid problems with overflow do everything the same as above but using
+ // a larger type.
+ unsigned CM = 1U << CarmichaelShift(Bitwidth);
+ unsigned Threshold = CM + Bitwidth;
+ assert(LHS.getZExtValue() < Threshold && RHS.getZExtValue() < Threshold &&
+ "Weights not reduced!");
+ unsigned Total = LHS.getZExtValue() + RHS.getZExtValue();
+ while (Total >= Threshold)
+ Total -= CM;
+ LHS = Total;
+ }
+}
+
+using RepeatedValue = std::pair<Value*, APInt>;
+
+/// Given an associative binary expression, return the leaf
+/// nodes in Ops along with their weights (how many times the leaf occurs). The
+/// original expression is the same as
+/// (Ops[0].first op Ops[0].first op ... Ops[0].first) <- Ops[0].second times
+/// op
+/// (Ops[1].first op Ops[1].first op ... Ops[1].first) <- Ops[1].second times
+/// op
+/// ...
+/// op
+/// (Ops[N].first op Ops[N].first op ... Ops[N].first) <- Ops[N].second times
+///
+/// Note that the values Ops[0].first, ..., Ops[N].first are all distinct.
+///
+/// This routine may modify the function, in which case it returns 'true'. The
+/// changes it makes may well be destructive, changing the value computed by 'I'
+/// to something completely different. Thus if the routine returns 'true' then
+/// you MUST either replace I with a new expression computed from the Ops array,
+/// or use RewriteExprTree to put the values back in.
+///
+/// A leaf node is either not a binary operation of the same kind as the root
+/// node 'I' (i.e. is not a binary operator at all, or is, but with a different
+/// opcode), or is the same kind of binary operator but has a use which either
+/// does not belong to the expression, or does belong to the expression but is
+/// a leaf node. Every leaf node has at least one use that is a non-leaf node
+/// of the expression, while for non-leaf nodes (except for the root 'I') every
+/// use is a non-leaf node of the expression.
+///
+/// For example:
+/// expression graph node names
+///
+/// + | I
+/// / \ |
+/// + + | A, B
+/// / \ / \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// The leaf nodes are C, E, F and G. The Ops array will contain (maybe not in
+/// that order) (C, 1), (E, 1), (F, 2), (G, 2).
+///
+/// The expression is maximal: if some instruction is a binary operator of the
+/// same kind as 'I', and all of its uses are non-leaf nodes of the expression,
+/// then the instruction also belongs to the expression, is not a leaf node of
+/// it, and its operands also belong to the expression (but may be leaf nodes).
+///
+/// NOTE: This routine will set operands of non-leaf non-root nodes to undef in
+/// order to ensure that every non-root node in the expression has *exactly one*
+/// use by a non-leaf node of the expression. This destruction means that the
+/// caller MUST either replace 'I' with a new expression or use something like
+/// RewriteExprTree to put the values back in if the routine indicates that it
+/// made a change by returning 'true'.
+///
+/// In the above example either the right operand of A or the left operand of B
+/// will be replaced by undef. If it is B's operand then this gives:
+///
+/// + | I
+/// / \ |
+/// + + | A, B - operand of B replaced with undef
+/// / \ \ |
+/// * + * | C, D, E
+/// / \ / \ / \ |
+/// + * | F, G
+///
+/// Note that such undef operands can only be reached by passing through 'I'.
+/// For example, if you visit operands recursively starting from a leaf node
+/// then you will never see such an undef operand unless you get back to 'I',
+/// which requires passing through a phi node.
+///
+/// Note that this routine may also mutate binary operators of the wrong type
+/// that have all uses inside the expression (i.e. only used by non-leaf nodes
+/// of the expression) if it can turn them into binary operators of the right
+/// type and thus make the expression bigger.
+static bool LinearizeExprTree(Instruction *I,
+ SmallVectorImpl<RepeatedValue> &Ops) {
+ assert((isa<UnaryOperator>(I) || isa<BinaryOperator>(I)) &&
+ "Expected a UnaryOperator or BinaryOperator!");
+ LLVM_DEBUG(dbgs() << "LINEARIZE: " << *I << '\n');
+ unsigned Bitwidth = I->getType()->getScalarType()->getPrimitiveSizeInBits();
+ unsigned Opcode = I->getOpcode();
+ assert(I->isAssociative() && I->isCommutative() &&
+ "Expected an associative and commutative operation!");
+
+ // Visit all operands of the expression, keeping track of their weight (the
+ // number of paths from the expression root to the operand, or if you like
+ // the number of times that operand occurs in the linearized expression).
+ // For example, if I = X + A, where X = A + B, then I, X and B have weight 1
+ // while A has weight two.
+
+ // Worklist of non-leaf nodes (their operands are in the expression too) along
+ // with their weights, representing a certain number of paths to the operator.
+ // If an operator occurs in the worklist multiple times then we found multiple
+ // ways to get to it.
+ SmallVector<std::pair<Instruction*, APInt>, 8> Worklist; // (Op, Weight)
+ Worklist.push_back(std::make_pair(I, APInt(Bitwidth, 1)));
+ bool Changed = false;
+
+ // Leaves of the expression are values that either aren't the right kind of
+ // operation (e.g. a constant, or a multiply in an add tree), or are, but have
+ // some uses that are not inside the expression. For example, in I = X + X,
+ // X = A + B, the value X has two uses (by I) that are in the expression. If
+ // X has any other uses, for example in a return instruction, then we consider
+ // X to be a leaf, and won't analyze it further. When we first visit a value,
+ // if it has more than one use then at first we conservatively consider it to
+ // be a leaf. Later, as the expression is explored, we may discover some more
+ // uses of the value from inside the expression. If all uses turn out to be
+ // from within the expression (and the value is a binary operator of the right
+ // kind) then the value is no longer considered to be a leaf, and its operands
+ // are explored.
+
+ // Leaves - Keeps track of the set of putative leaves as well as the number of
+ // paths to each leaf seen so far.
+ using LeafMap = DenseMap<Value *, APInt>;
+ LeafMap Leaves; // Leaf -> Total weight so far.
+ SmallVector<Value *, 8> LeafOrder; // Ensure deterministic leaf output order.
+
+#ifndef NDEBUG
+ SmallPtrSet<Value *, 8> Visited; // For sanity checking the iteration scheme.
+#endif
+ while (!Worklist.empty()) {
+ std::pair<Instruction*, APInt> P = Worklist.pop_back_val();
+ I = P.first; // We examine the operands of this binary operator.
+
+ for (unsigned OpIdx = 0; OpIdx < I->getNumOperands(); ++OpIdx) { // Visit operands.
+ Value *Op = I->getOperand(OpIdx);
+ APInt Weight = P.second; // Number of paths to this operand.
+ LLVM_DEBUG(dbgs() << "OPERAND: " << *Op << " (" << Weight << ")\n");
+ assert(!Op->use_empty() && "No uses, so how did we get to it?!");
+
+ // If this is a binary operation of the right kind with only one use then
+ // add its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ assert(Visited.insert(Op).second && "Not first visit!");
+ LLVM_DEBUG(dbgs() << "DIRECT ADD: " << *Op << " (" << Weight << ")\n");
+ Worklist.push_back(std::make_pair(BO, Weight));
+ continue;
+ }
+
+ // Appears to be a leaf. Is the operand already in the set of leaves?
+ LeafMap::iterator It = Leaves.find(Op);
+ if (It == Leaves.end()) {
+ // Not in the leaf map. Must be the first time we saw this operand.
+ assert(Visited.insert(Op).second && "Not first visit!");
+ if (!Op->hasOneUse()) {
+ // This value has uses not accounted for by the expression, so it is
+ // not safe to modify. Mark it as being a leaf.
+ LLVM_DEBUG(dbgs()
+ << "ADD USES LEAF: " << *Op << " (" << Weight << ")\n");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ continue;
+ }
+ // No uses outside the expression, try morphing it.
+ } else {
+ // Already in the leaf map.
+ assert(It != Leaves.end() && Visited.count(Op) &&
+ "In leaf map but not visited!");
+
+ // Update the number of paths to the leaf.
+ IncorporateWeight(It->second, Weight, Opcode);
+
+#if 0 // TODO: Re-enable once PR13021 is fixed.
+ // The leaf already has one use from inside the expression. As we want
+ // exactly one such use, drop this new use of the leaf.
+ assert(!Op->hasOneUse() && "Only one use, but we got here twice!");
+ I->setOperand(OpIdx, UndefValue::get(I->getType()));
+ Changed = true;
+
+ // If the leaf is a binary operation of the right kind and we now see
+ // that its multiple original uses were in fact all by nodes belonging
+ // to the expression, then no longer consider it to be a leaf and add
+ // its operands to the expression.
+ if (BinaryOperator *BO = isReassociableOp(Op, Opcode)) {
+ LLVM_DEBUG(dbgs() << "UNLEAF: " << *Op << " (" << It->second << ")\n");
+ Worklist.push_back(std::make_pair(BO, It->second));
+ Leaves.erase(It);
+ continue;
+ }
+#endif
+
+ // If we still have uses that are not accounted for by the expression
+ // then it is not safe to modify the value.
+ if (!Op->hasOneUse())
+ continue;
+
+ // No uses outside the expression, try morphing it.
+ Weight = It->second;
+ Leaves.erase(It); // Since the value may be morphed below.
+ }
+
+ // At this point we have a value which, first of all, is not a binary
+ // expression of the right kind, and secondly, is only used inside the
+ // expression. This means that it can safely be modified. See if we
+ // can usefully morph it into an expression of the right kind.
+ assert((!isa<Instruction>(Op) ||
+ cast<Instruction>(Op)->getOpcode() != Opcode
+ || (isa<FPMathOperator>(Op) &&
+ !cast<Instruction>(Op)->isFast())) &&
+ "Should have been handled above!");
+ assert(Op->hasOneUse() && "Has uses outside the expression tree!");
+
+ // If this is a multiply expression, turn any internal negations into
+ // multiplies by -1 so they can be reassociated.
+ if (Instruction *Tmp = dyn_cast<Instruction>(Op))
+ if ((Opcode == Instruction::Mul && match(Tmp, m_Neg(m_Value()))) ||
+ (Opcode == Instruction::FMul && match(Tmp, m_FNeg(m_Value())))) {
+ LLVM_DEBUG(dbgs()
+ << "MORPH LEAF: " << *Op << " (" << Weight << ") TO ");
+ Tmp = LowerNegateToMultiply(Tmp);
+ LLVM_DEBUG(dbgs() << *Tmp << '\n');
+ Worklist.push_back(std::make_pair(Tmp, Weight));
+ Changed = true;
+ continue;
+ }
+
+ // Failed to morph into an expression of the right type. This really is
+ // a leaf.
+ LLVM_DEBUG(dbgs() << "ADD LEAF: " << *Op << " (" << Weight << ")\n");
+ assert(!isReassociableOp(Op, Opcode) && "Value was morphed?");
+ LeafOrder.push_back(Op);
+ Leaves[Op] = Weight;
+ }
+ }
+
+ // The leaves, repeated according to their weights, represent the linearized
+ // form of the expression.
+ for (unsigned i = 0, e = LeafOrder.size(); i != e; ++i) {
+ Value *V = LeafOrder[i];
+ LeafMap::iterator It = Leaves.find(V);
+ if (It == Leaves.end())
+ // Node initially thought to be a leaf wasn't.
+ continue;
+ assert(!isReassociableOp(V, Opcode) && "Shouldn't be a leaf!");
+ APInt Weight = It->second;
+ if (Weight.isMinValue())
+ // Leaf already output or weight reduction eliminated it.
+ continue;
+ // Ensure the leaf is only output once.
+ It->second = 0;
+ Ops.push_back(std::make_pair(V, Weight));
+ }
+
+ // For nilpotent operations or addition there may be no operands, for example
+ // because the expression was "X xor X" or consisted of 2^Bitwidth additions:
+ // in both cases the weight reduces to 0 causing the value to be skipped.
+ if (Ops.empty()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(Opcode, I->getType());
+ assert(Identity && "Associative operation without identity!");
+ Ops.emplace_back(Identity, APInt(Bitwidth, 1));
+ }
+
+ return Changed;
+}
+
+/// Now that the operands for this expression tree are
+/// linearized and optimized, emit them in-order.
+void ReassociatePass::RewriteExprTree(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ assert(Ops.size() > 1 && "Single values should be used directly!");
+
+ // Since our optimizations should never increase the number of operations, the
+ // new expression can usually be written reusing the existing binary operators
+ // from the original expression tree, without creating any new instructions,
+ // though the rewritten expression may have a completely different topology.
+ // We take care to not change anything if the new expression will be the same
+ // as the original. If more than trivial changes (like commuting operands)
+ // were made then we are obliged to clear out any optional subclass data like
+ // nsw flags.
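+ // For example, merely swapping the two operands of an add keeps flags such
+ // as nsw intact, whereas overwriting an operand with a different value
+ // clears the optional flags such as nsw on that operator and on every
+ // operator from it up to the expression root (fast-math flags are preserved).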
+
+ /// NodesToRewrite - Nodes from the original expression available for writing
+ /// the new expression into.
+ SmallVector<BinaryOperator*, 8> NodesToRewrite;
+ unsigned Opcode = I->getOpcode();
+ BinaryOperator *Op = I;
+
+ /// NotRewritable - The operands being written will be the leaves of the new
+ /// expression and must not be used as inner nodes (via NodesToRewrite) by
+ /// mistake. Inner nodes are always reassociable, and usually leaves are not
+ /// (if they were they would have been incorporated into the expression and so
+ /// would not be leaves), so most of the time there is no danger of this. But
+ /// in rare cases a leaf may become reassociable if an optimization kills uses
+ /// of it, or it may momentarily become reassociable during rewriting (below)
+ /// due to it being removed as an operand of one of its uses. Ensure that misuse
+ /// of leaf nodes as inner nodes cannot occur by remembering all of the future
+ /// leaves and refusing to reuse any of them as inner nodes.
+ SmallPtrSet<Value*, 8> NotRewritable;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ NotRewritable.insert(Ops[i].Op);
+
+ // ExpressionChanged - Non-null if the rewritten expression differs from the
+ // original in some non-trivial way, requiring the clearing of optional flags.
+ // Flags are cleared from the operator in ExpressionChanged up to I inclusive.
+ BinaryOperator *ExpressionChanged = nullptr;
+ for (unsigned i = 0; ; ++i) {
+ // The last operation (which comes earliest in the IR) is special as both
+ // operands will come from Ops, rather than just one with the other being
+ // a subexpression.
+ if (i+2 == Ops.size()) {
+ Value *NewLHS = Ops[i].Op;
+ Value *NewRHS = Ops[i+1].Op;
+ Value *OldLHS = Op->getOperand(0);
+ Value *OldRHS = Op->getOperand(1);
+
+ if (NewLHS == OldLHS && NewRHS == OldRHS)
+ // Nothing changed, leave it alone.
+ break;
+
+ if (NewLHS == OldRHS && NewRHS == OldLHS) {
+ // The order of the operands was reversed. Swap them.
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->swapOperands();
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ break;
+ }
+
+ // The new operation differs non-trivially from the original. Overwrite
+ // the old operands with the new ones.
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewLHS != OldLHS) {
+ BinaryOperator *BO = isReassociableOp(OldLHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(0, NewLHS);
+ }
+ if (NewRHS != OldRHS) {
+ BinaryOperator *BO = isReassociableOp(OldRHS, Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ }
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+
+ break;
+ }
+
+ // Not the last operation. The left-hand side will be a sub-expression
+ // while the right-hand side will be the current element of Ops.
+ Value *NewRHS = Ops[i].Op;
+ if (NewRHS != Op->getOperand(1)) {
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ if (NewRHS == Op->getOperand(0)) {
+ // The new right-hand side was already present as the left operand. If
+ // we are lucky then swapping the operands will sort out both of them.
+ Op->swapOperands();
+ } else {
+ // Overwrite with the new right-hand side.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(1), Opcode);
+ if (BO && !NotRewritable.count(BO))
+ NodesToRewrite.push_back(BO);
+ Op->setOperand(1, NewRHS);
+ ExpressionChanged = Op;
+ }
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ MadeChange = true;
+ ++NumChanged;
+ }
+
+ // Now deal with the left-hand side. If this is already an operation node
+ // from the original expression then just rewrite the rest of the expression
+ // into it.
+ BinaryOperator *BO = isReassociableOp(Op->getOperand(0), Opcode);
+ if (BO && !NotRewritable.count(BO)) {
+ Op = BO;
+ continue;
+ }
+
+ // Otherwise, grab a spare node from the original expression and use that as
+ // the left-hand side. If there are no nodes left then the optimizers made
+ // an expression with more nodes than the original! This usually means that
+ // they did something stupid but it might mean that the problem was just too
+ // hard (finding the minimal number of multiplications needed to realize a
+ // multiplication expression is NP-complete). Whatever the reason, smart or
+ // stupid, create a new node if there are none left.
+ BinaryOperator *NewOp;
+ if (NodesToRewrite.empty()) {
+ Constant *Undef = UndefValue::get(I->getType());
+ NewOp = BinaryOperator::Create(Instruction::BinaryOps(Opcode),
+ Undef, Undef, "", I);
+ if (NewOp->getType()->isFPOrFPVectorTy())
+ NewOp->setFastMathFlags(I->getFastMathFlags());
+ } else {
+ NewOp = NodesToRewrite.pop_back_val();
+ }
+
+ LLVM_DEBUG(dbgs() << "RA: " << *Op << '\n');
+ Op->setOperand(0, NewOp);
+ LLVM_DEBUG(dbgs() << "TO: " << *Op << '\n');
+ ExpressionChanged = Op;
+ MadeChange = true;
+ ++NumChanged;
+ Op = NewOp;
+ }
+
+ // If the expression changed non-trivially then clear out all subclass data
+ // starting from the operator specified in ExpressionChanged, and compactify
+ // the operators to just before the expression root to guarantee that the
+ // expression tree is dominated by all of Ops.
+ if (ExpressionChanged)
+ do {
+ // Preserve FastMathFlags.
+ if (isa<FPMathOperator>(I)) {
+ FastMathFlags Flags = I->getFastMathFlags();
+ ExpressionChanged->clearSubclassOptionalData();
+ ExpressionChanged->setFastMathFlags(Flags);
+ } else
+ ExpressionChanged->clearSubclassOptionalData();
+
+ if (ExpressionChanged == I)
+ break;
+
+ // Discard any debug info related to the expressions that have changed (we
+ // can leave debug info related to the root, since the result of the
+ // expression tree should be the same even after reassociation).
+ replaceDbgUsesWithUndef(ExpressionChanged);
+
+ ExpressionChanged->moveBefore(I);
+ ExpressionChanged = cast<BinaryOperator>(*ExpressionChanged->user_begin());
+ } while (true);
+
+ // Throw away any left over nodes from the original expression.
+ for (unsigned i = 0, e = NodesToRewrite.size(); i != e; ++i)
+ RedoInsts.insert(NodesToRewrite[i]);
+}
+
+/// Insert instructions before the instruction pointed to by BI that compute
+/// the negative version of the value specified. The negative
+/// version of the value is returned, and BI is left pointing at the instruction
+/// that should be processed next by the reassociation pass.
+/// Also add intermediate instructions to the redo list that are modified while
+/// pushing the negates through adds. These will be revisited to see if
+/// additional opportunities have been exposed.
+static Value *NegateValue(Value *V, Instruction *BI,
+ ReassociatePass::OrderedSet &ToRedo) {
+ if (auto *C = dyn_cast<Constant>(V))
+ return C->getType()->isFPOrFPVectorTy() ? ConstantExpr::getFNeg(C) :
+ ConstantExpr::getNeg(C);
+
+ // We are trying to expose opportunity for reassociation. One of the things
+ // that we want to do to achieve this is to push a negation as deep into an
+ // expression chain as possible, to expose the add instructions. In practice,
+ // this means that we turn this:
+ // X = -(A+12+C+D) into X = -A + -12 + -C + -D = -12 + -A + -C + -D
+ // so that later an expression such as Y = 12+X could get reassociated with the -12 to eliminate
+ // the constants. We assume that instcombine will clean up the mess later if
+ // we introduce tons of unnecessary negation instructions.
+ //
+ if (BinaryOperator *I =
+ isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
+ // Push the negates through the add.
+ I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+ I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
+ if (I->getOpcode() == Instruction::Add) {
+ I->setHasNoUnsignedWrap(false);
+ I->setHasNoSignedWrap(false);
+ }
+
+ // We must move the add instruction here, because the neg instructions do
+ // not dominate the old add instruction in general. By moving it, we are
+ // assured that the neg instructions we just inserted dominate the
+ // instruction we are about to insert after them.
+ //
+ I->moveBefore(BI);
+ I->setName(I->getName()+".neg");
+
+ // Add the intermediate negates to the redo list as processing them later
+ // could expose more reassociating opportunities.
+ ToRedo.insert(I);
+ return I;
+ }
+
+ // Okay, we need to materialize a negated version of V with an instruction.
+ // Scan the use lists of V to see if we have one already.
+ for (User *U : V->users()) {
+ if (!match(U, m_Neg(m_Value())) && !match(U, m_FNeg(m_Value())))
+ continue;
+
+ // We found one! Now we have to make sure that the definition dominates
+ // this use. We do this by moving it to the entry block (if it is a
+ // non-instruction value) or right after the definition. These negates will
+ // be zapped by reassociate later, so we don't need much finesse here.
+ Instruction *TheNeg = cast<Instruction>(U);
+
+ // Verify that the negate is in this function, V might be a constant expr.
+ if (TheNeg->getParent()->getParent() != BI->getParent()->getParent())
+ continue;
+
+ bool FoundCatchSwitch = false;
+
+ BasicBlock::iterator InsertPt;
+ if (Instruction *InstInput = dyn_cast<Instruction>(V)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(InstInput)) {
+ InsertPt = II->getNormalDest()->begin();
+ } else {
+ InsertPt = ++InstInput->getIterator();
+ }
+
+ const BasicBlock *BB = InsertPt->getParent();
+
+ // Make sure we don't move anything before PHIs or exception
+ // handling pads.
+ while (InsertPt != BB->end() && (isa<PHINode>(InsertPt) ||
+ InsertPt->isEHPad())) {
+ if (isa<CatchSwitchInst>(InsertPt))
+ // A catchswitch cannot have anything in the block except
+ // itself and PHIs. We'll bail out below.
+ FoundCatchSwitch = true;
+ ++InsertPt;
+ }
+ } else {
+ InsertPt = TheNeg->getParent()->getParent()->getEntryBlock().begin();
+ }
+
+ // We found a catchswitch in the block where we want to move the
+ // neg. We cannot move anything into that block. Bail and just
+ // create the neg before BI, as if we hadn't found an existing
+ // neg.
+ if (FoundCatchSwitch)
+ break;
+
+ TheNeg->moveBefore(&*InsertPt);
+ if (TheNeg->getOpcode() == Instruction::Sub) {
+ TheNeg->setHasNoUnsignedWrap(false);
+ TheNeg->setHasNoSignedWrap(false);
+ } else {
+ TheNeg->andIRFlags(BI);
+ }
+ ToRedo.insert(TheNeg);
+ return TheNeg;
+ }
+
+ // Insert a 'neg' instruction that subtracts the value from zero to get the
+ // negation.
+ Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ ToRedo.insert(NewNeg);
+ return NewNeg;
+}
+
// See if this `or` looks like a load widening reduction, i.e. that it
// consists only of `or`/`shl`/`zext`/`load` nodes. Note that we don't
// ensure that the pattern is *really* a load widening reduction,
@@ -1014,1201 +1014,1201 @@ static BinaryOperator *ConvertOrWithNoCommonBitsToAdd(Instruction *Or) {
return New;
}
-/// Return true if we should break up this subtract of X-Y into (X + -Y).
-static bool ShouldBreakUpSubtract(Instruction *Sub) {
- // If this is a negation, we can't split it up!
- if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value())))
- return false;
-
- // Don't breakup X - undef.
- if (isa<UndefValue>(Sub->getOperand(1)))
- return false;
-
- // Don't bother to break this up unless either the LHS is an associable add or
- // subtract or if this is only used by one.
- Value *V0 = Sub->getOperand(0);
- if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
- return true;
- Value *V1 = Sub->getOperand(1);
- if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
- return true;
- Value *VB = Sub->user_back();
- if (Sub->hasOneUse() &&
- (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
- isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
- return true;
-
- return false;
-}
-
-/// If we have (X-Y), and if either X is an add, or if this is only used by an
-/// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *BreakUpSubtract(Instruction *Sub,
- ReassociatePass::OrderedSet &ToRedo) {
- // Convert a subtract into an add and a neg instruction. This allows sub
- // instructions to be commuted with other add instructions.
- //
- // Calculate the negative value of Operand 1 of the sub instruction,
- // and set it as the RHS of the add instruction we just made.
- Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
- BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
- Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
- Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
- New->takeName(Sub);
-
- // Everyone now refers to the add instruction.
- Sub->replaceAllUsesWith(New);
- New->setDebugLoc(Sub->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
- return New;
-}
-
-/// If this is a shift of a reassociable multiply or is used by one, change
-/// this into a multiply by a constant to assist with further reassociation.
-static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
- Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
- auto *SA = cast<ConstantInt>(Shl->getOperand(1));
- MulCst = ConstantExpr::getShl(MulCst, SA);
-
- BinaryOperator *Mul =
- BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
- Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
- Mul->takeName(Shl);
-
- // Everyone now refers to the mul instruction.
- Shl->replaceAllUsesWith(Mul);
- Mul->setDebugLoc(Shl->getDebugLoc());
-
- // We can safely preserve the nuw flag in all cases. It's also safe to turn a
- // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
- // handling. It can be preserved as long as we're not left shifting by
- // bitwidth - 1.
- bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
- bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
- unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
- if (NSW && (NUW || SA->getValue().ult(BitWidth - 1)))
- Mul->setHasNoSignedWrap(true);
- Mul->setHasNoUnsignedWrap(NUW);
- return Mul;
-}
-
-/// Scan backwards and forwards among values with the same rank as element i
-/// to see if X exists. If X does not exist, return i. This is useful when
-/// scanning for 'x' when we see '-x' because they both get the same rank.
-static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
- unsigned i, Value *X) {
- unsigned XRank = Ops[i].Rank;
- unsigned e = Ops.size();
- for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
- if (Ops[j].Op == X)
- return j;
- if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
- if (Instruction *I2 = dyn_cast<Instruction>(X))
- if (I1->isIdenticalTo(I2))
- return j;
- }
- // Scan backwards.
- for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
- if (Ops[j].Op == X)
- return j;
- if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
- if (Instruction *I2 = dyn_cast<Instruction>(X))
- if (I1->isIdenticalTo(I2))
- return j;
- }
- return i;
-}
-
-/// Emit a tree of add instructions, summing Ops together
-/// and returning the result. Insert the tree before I.
-static Value *EmitAddTreeOfValues(Instruction *I,
- SmallVectorImpl<WeakTrackingVH> &Ops) {
- if (Ops.size() == 1) return Ops.back();
-
+/// Return true if we should break up this subtract of X-Y into (X + -Y).
+static bool ShouldBreakUpSubtract(Instruction *Sub) {
+ // If this is a negation, we can't split it up!
+ if (match(Sub, m_Neg(m_Value())) || match(Sub, m_FNeg(m_Value())))
+ return false;
+
+ // Don't break up X - undef.
+ if (isa<UndefValue>(Sub->getOperand(1)))
+ return false;
+
+ // Don't bother to break this up unless either operand is a reassociable add
+ // or subtract, or the sub's only user is a reassociable add or subtract.
+ Value *V0 = Sub->getOperand(0);
+ if (isReassociableOp(V0, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V0, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *V1 = Sub->getOperand(1);
+ if (isReassociableOp(V1, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(V1, Instruction::Sub, Instruction::FSub))
+ return true;
+ Value *VB = Sub->user_back();
+ if (Sub->hasOneUse() &&
+ (isReassociableOp(VB, Instruction::Add, Instruction::FAdd) ||
+ isReassociableOp(VB, Instruction::Sub, Instruction::FSub)))
+ return true;
+
+ return false;
+}
+
+/// If we have (X-Y), and if either X is an add, or if this is only used by an
+/// add, transform this into (X+(0-Y)) to promote better reassociation.
+static BinaryOperator *BreakUpSubtract(Instruction *Sub,
+ ReassociatePass::OrderedSet &ToRedo) {
+ // Convert a subtract into an add and a neg instruction. This allows sub
+ // instructions to be commuted with other add instructions.
+ //
+ // Calculate the negative value of Operand 1 of the sub instruction,
+ // and set it as the RHS of the add instruction we just made.
+ Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
+ BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
+ Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
+ New->takeName(Sub);
+
+ // Everyone now refers to the add instruction.
+ Sub->replaceAllUsesWith(New);
+ New->setDebugLoc(Sub->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "Negated: " << *New << '\n');
+ return New;
+}
+
+/// If this is a shift of a reassociable multiply or is used by one, change
+/// this into a multiply by a constant to assist with further reassociation.
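+/// For example, "shl i32 %x, 3" becomes "mul i32 %x, 8".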
+static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
+ Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+ auto *SA = cast<ConstantInt>(Shl->getOperand(1));
+ MulCst = ConstantExpr::getShl(MulCst, SA);
+
+ BinaryOperator *Mul =
+ BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
+ Shl->setOperand(0, UndefValue::get(Shl->getType())); // Drop use of op.
+ Mul->takeName(Shl);
+
+ // Everyone now refers to the mul instruction.
+ Shl->replaceAllUsesWith(Mul);
+ Mul->setDebugLoc(Shl->getDebugLoc());
+
+ // We can safely preserve the nuw flag in all cases. It's also safe to turn a
+ // nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
+ // handling. It can be preserved as long as we're not left shifting by
+ // bitwidth - 1.
+ bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
+ bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
+ unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
+ if (NSW && (NUW || SA->getValue().ult(BitWidth - 1)))
+ Mul->setHasNoSignedWrap(true);
+ Mul->setHasNoUnsignedWrap(NUW);
+ return Mul;
+}
+
+/// Scan backwards and forwards among values with the same rank as element i
+/// to see if X exists. If X does not exist, return i. This is useful when
+/// scanning for 'x' when we see '-x' because they both get the same rank.
+static unsigned FindInOperandList(const SmallVectorImpl<ValueEntry> &Ops,
+ unsigned i, Value *X) {
+ unsigned XRank = Ops[i].Rank;
+ unsigned e = Ops.size();
+ for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ // Scan backwards.
+ for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j) {
+ if (Ops[j].Op == X)
+ return j;
+ if (Instruction *I1 = dyn_cast<Instruction>(Ops[j].Op))
+ if (Instruction *I2 = dyn_cast<Instruction>(X))
+ if (I1->isIdenticalTo(I2))
+ return j;
+ }
+ return i;
+}
+
+/// Emit a tree of add instructions, summing Ops together
+/// and returning the result. Insert the tree before I.
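+/// For example, for Ops == {A, B, C} the emitted tree is "(A + B) + C".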
+static Value *EmitAddTreeOfValues(Instruction *I,
+ SmallVectorImpl<WeakTrackingVH> &Ops) {
+ if (Ops.size() == 1) return Ops.back();
+
Value *V1 = Ops.pop_back_val();
- Value *V2 = EmitAddTreeOfValues(I, Ops);
- return CreateAdd(V2, V1, "reass.add", I, I);
-}
-
-/// If V is an expression tree that is a multiplication sequence,
-/// and if this sequence contains a multiply by Factor,
-/// remove Factor from the tree and return the new tree.
-Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
- if (!BO)
- return nullptr;
-
- SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(BO, Tree);
- SmallVector<ValueEntry, 8> Factors;
- Factors.reserve(Tree.size());
- for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
- RepeatedValue E = Tree[i];
- Factors.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
- }
-
- bool FoundFactor = false;
- bool NeedsNegate = false;
- for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
- if (Factors[i].Op == Factor) {
- FoundFactor = true;
- Factors.erase(Factors.begin()+i);
- break;
- }
-
- // If this is a negative version of this factor, remove it.
- if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
- if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
- if (FC1->getValue() == -FC2->getValue()) {
- FoundFactor = NeedsNegate = true;
- Factors.erase(Factors.begin()+i);
- break;
- }
- } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
- if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
- const APFloat &F1 = FC1->getValueAPF();
- APFloat F2(FC2->getValueAPF());
- F2.changeSign();
- if (F1 == F2) {
- FoundFactor = NeedsNegate = true;
- Factors.erase(Factors.begin() + i);
- break;
- }
- }
- }
- }
-
- if (!FoundFactor) {
- // Make sure to restore the operands to the expression tree.
- RewriteExprTree(BO, Factors);
- return nullptr;
- }
-
- BasicBlock::iterator InsertPt = ++BO->getIterator();
-
- // If this was just a single multiply, remove the multiply and return the only
- // remaining operand.
- if (Factors.size() == 1) {
- RedoInsts.insert(BO);
- V = Factors[0].Op;
- } else {
- RewriteExprTree(BO, Factors);
- V = BO;
- }
-
- if (NeedsNegate)
- V = CreateNeg(V, "neg", &*InsertPt, BO);
-
- return V;
-}
-
-/// If V is a single-use multiply, recursively add its operands as factors,
-/// otherwise add V to the list of factors.
-///
-/// Ops is the top-level list of add operands we're trying to factor.
-static void FindSingleUseMultiplyFactors(Value *V,
- SmallVectorImpl<Value*> &Factors) {
- BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
- if (!BO) {
- Factors.push_back(V);
- return;
- }
-
- // Otherwise, add the LHS and RHS to the list of factors.
- FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
- FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
-}
-
-/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
-/// This optimizes based on identities. If it can be reduced to a single Value,
-/// it is returned, otherwise the Ops list is mutated as necessary.
-static Value *OptimizeAndOrXor(unsigned Opcode,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
- // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- // First, check for X and ~X in the operand list.
- assert(i < Ops.size());
- Value *X;
- if (match(Ops[i].Op, m_Not(m_Value(X)))) { // Cannot occur for ^.
- unsigned FoundX = FindInOperandList(Ops, i, X);
- if (FoundX != i) {
- if (Opcode == Instruction::And) // ...&X&~X = 0
- return Constant::getNullValue(X->getType());
-
- if (Opcode == Instruction::Or) // ...|X|~X = -1
- return Constant::getAllOnesValue(X->getType());
- }
- }
-
- // Next, check for duplicate pairs of values, which we assume are next to
- // each other, due to our sorting criteria.
- assert(i < Ops.size());
- if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
- if (Opcode == Instruction::And || Opcode == Instruction::Or) {
- // Drop duplicate values for And and Or.
- Ops.erase(Ops.begin()+i);
- --i; --e;
- ++NumAnnihil;
- continue;
- }
-
- // Drop pairs of values for Xor.
- assert(Opcode == Instruction::Xor);
- if (e == 2)
- return Constant::getNullValue(Ops[0].Op->getType());
-
- // Y ^ X^X -> Y
- Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
- i -= 1; e -= 2;
- ++NumAnnihil;
- }
- }
- return nullptr;
-}
-
-/// Helper function of CombineXorOpnd(). It creates a bitwise-and
-/// instruction with the given two operands, and return the resulting
-/// instruction. There are two special cases: 1) if the constant operand is 0,
-/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
-/// be returned.
-static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
- const APInt &ConstOpnd) {
- if (ConstOpnd.isNullValue())
- return nullptr;
-
- if (ConstOpnd.isAllOnesValue())
- return Opnd;
-
- Instruction *I = BinaryOperator::CreateAnd(
- Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
- InsertBefore);
- I->setDebugLoc(InsertBefore->getDebugLoc());
- return I;
-}
-
-// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
-// into "R ^ C", where C would be 0, and R is a symbolic value.
-//
-// If it was successful, true is returned, and the "R" and "C" is returned
-// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
-// and both "Res" and "ConstOpnd" remain unchanged.
-bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
- APInt &ConstOpnd, Value *&Res) {
- // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
- // = ((x | c1) ^ c1) ^ (c1 ^ c2)
- // = (x & ~c1) ^ (c1 ^ c2)
- // It is useful only when c1 == c2.
- if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
- return false;
-
- if (!Opnd1->getValue()->hasOneUse())
- return false;
-
- const APInt &C1 = Opnd1->getConstPart();
- if (C1 != ConstOpnd)
- return false;
-
- Value *X = Opnd1->getSymbolicPart();
- Res = createAndInstr(I, X, ~C1);
- // ConstOpnd was C2, now C1 ^ C2.
- ConstOpnd ^= C1;
-
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- return true;
-}
-
-// Helper function of OptimizeXor(). It tries to simplify
-// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
-// symbolic value.
-//
-// If it was successful, true is returned, and the "R" and "C" is returned
-// via "Res" and "ConstOpnd", respectively (If the entire expression is
-// evaluated to a constant, the Res is set to NULL); otherwise, false is
-// returned, and both "Res" and "ConstOpnd" remain unchanged.
-bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
- XorOpnd *Opnd2, APInt &ConstOpnd,
- Value *&Res) {
- Value *X = Opnd1->getSymbolicPart();
- if (X != Opnd2->getSymbolicPart())
- return false;
-
- // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.)
- int DeadInstNum = 1;
- if (Opnd1->getValue()->hasOneUse())
- DeadInstNum++;
- if (Opnd2->getValue()->hasOneUse())
- DeadInstNum++;
-
- // Xor-Rule 2:
- // (x | c1) ^ (x & c2)
- // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
- // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
- // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
- //
- if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
- if (Opnd2->isOrExpr())
- std::swap(Opnd1, Opnd2);
-
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3((~C1) ^ C2);
-
- // Do not increase code size!
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
- if (NewInstNum > DeadInstNum)
- return false;
- }
-
- Res = createAndInstr(I, X, C3);
- ConstOpnd ^= C1;
- } else if (Opnd1->isOrExpr()) {
- // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
- //
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3 = C1 ^ C2;
-
- // Do not increase code size
- if (!C3.isNullValue() && !C3.isAllOnesValue()) {
- int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
- if (NewInstNum > DeadInstNum)
- return false;
- }
-
- Res = createAndInstr(I, X, C3);
- ConstOpnd ^= C3;
- } else {
- // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
- //
- const APInt &C1 = Opnd1->getConstPart();
- const APInt &C2 = Opnd2->getConstPart();
- APInt C3 = C1 ^ C2;
- Res = createAndInstr(I, X, C3);
- }
-
- // Put the original operands in the Redo list; hope they will be deleted
- // as dead code.
- if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
- RedoInsts.insert(T);
- if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
- RedoInsts.insert(T);
-
- return true;
-}
-
-/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
-/// to a single Value, it is returned, otherwise the Ops list is mutated as
-/// necessary.
-Value *ReassociatePass::OptimizeXor(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
- return V;
-
- if (Ops.size() == 1)
- return nullptr;
-
- SmallVector<XorOpnd, 8> Opnds;
- SmallVector<XorOpnd*, 8> OpndPtrs;
- Type *Ty = Ops[0].Op->getType();
- APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
-
- // Step 1: Convert ValueEntry to XorOpnd
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- Value *V = Ops[i].Op;
- const APInt *C;
- // TODO: Support non-splat vectors.
- if (match(V, m_APInt(C))) {
- ConstOpnd ^= *C;
- } else {
- XorOpnd O(V);
- O.setSymbolicRank(getRank(O.getSymbolicPart()));
- Opnds.push_back(O);
- }
- }
-
- // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
- // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
- // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
- // with the previous loop --- the iterator of the "Opnds" may be invalidated
- // when new elements are added to the vector.
- for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
- OpndPtrs.push_back(&Opnds[i]);
-
- // Step 2: Sort the Xor-Operands in a way such that the operands containing
- // the same symbolic value cluster together. For instance, the input operand
- // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
- // ("x | 123", "x & 789", "y & 456").
- //
- // The purpose is twofold:
- // 1) Cluster together the operands sharing the same symbolic-value.
- // 2) Operand having smaller symbolic-value-rank is permuted earlier, which
- // could potentially shorten crital path, and expose more loop-invariants.
- // Note that values' rank are basically defined in RPO order (FIXME).
- // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier
- // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2",
- // "z" in the order of X-Y-Z is better than any other orders.
- llvm::stable_sort(OpndPtrs, [](XorOpnd *LHS, XorOpnd *RHS) {
- return LHS->getSymbolicRank() < RHS->getSymbolicRank();
- });
-
- // Step 3: Combine adjacent operands
- XorOpnd *PrevOpnd = nullptr;
- bool Changed = false;
- for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
- XorOpnd *CurrOpnd = OpndPtrs[i];
- // The combined value
- Value *CV;
-
- // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
- if (!ConstOpnd.isNullValue() &&
- CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
- Changed = true;
- if (CV)
- *CurrOpnd = XorOpnd(CV);
- else {
- CurrOpnd->Invalidate();
- continue;
- }
- }
-
- if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
- PrevOpnd = CurrOpnd;
- continue;
- }
-
- // step 3.2: When previous and current operands share the same symbolic
- // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
- if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
- // Remove previous operand
- PrevOpnd->Invalidate();
- if (CV) {
- *CurrOpnd = XorOpnd(CV);
- PrevOpnd = CurrOpnd;
- } else {
- CurrOpnd->Invalidate();
- PrevOpnd = nullptr;
- }
- Changed = true;
- }
- }
-
- // Step 4: Reassemble the Ops
- if (Changed) {
- Ops.clear();
- for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
- XorOpnd &O = Opnds[i];
- if (O.isInvalid())
- continue;
- ValueEntry VE(getRank(O.getValue()), O.getValue());
- Ops.push_back(VE);
- }
- if (!ConstOpnd.isNullValue()) {
- Value *C = ConstantInt::get(Ty, ConstOpnd);
- ValueEntry VE(getRank(C), C);
- Ops.push_back(VE);
- }
- unsigned Sz = Ops.size();
- if (Sz == 1)
- return Ops.back().Op;
- if (Sz == 0) {
- assert(ConstOpnd.isNullValue());
- return ConstantInt::get(Ty, ConstOpnd);
- }
- }
-
- return nullptr;
-}
-
-/// Optimize a series of operands to an 'add' instruction. This
-/// optimizes based on identities. If it can be reduced to a single Value, it
-/// is returned, otherwise the Ops list is mutated as necessary.
-Value *ReassociatePass::OptimizeAdd(Instruction *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Scan the operand lists looking for X and -X pairs. If we find any, we
- // can simplify expressions like X+-X == 0 and X+~X ==-1. While we're at it,
- // scan for any
- // duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
-
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- Value *TheOp = Ops[i].Op;
- // Check to see if we've seen this operand before. If so, we factor all
- // instances of the operand together. Due to our sorting criteria, we know
- // that these need to be next to each other in the vector.
- if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
- // Rescan the list, remove all instances of this operand from the expr.
- unsigned NumFound = 0;
- do {
- Ops.erase(Ops.begin()+i);
- ++NumFound;
- } while (i != Ops.size() && Ops[i].Op == TheOp);
-
- LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
- << '\n');
- ++NumFactor;
-
- // Insert a new multiply.
- Type *Ty = TheOp->getType();
- Constant *C = Ty->isIntOrIntVectorTy() ?
- ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
- Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
-
- // Now that we have inserted a multiply, optimize it. This allows us to
- // handle cases that require multiple factoring steps, such as this:
- // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
- RedoInsts.insert(Mul);
-
- // If every add operand was a duplicate, return the multiply.
- if (Ops.empty())
- return Mul;
-
- // Otherwise, we had some input that didn't have the dupe, such as
- // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
- // things being added by this operation.
- Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
-
- --i;
- e = Ops.size();
- continue;
- }
-
- // Check for X and -X or X and ~X in the operand list.
- Value *X;
- if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
- !match(TheOp, m_FNeg(m_Value(X))))
- continue;
-
- unsigned FoundX = FindInOperandList(Ops, i, X);
- if (FoundX == i)
- continue;
-
- // Remove X and -X from the operand list.
- if (Ops.size() == 2 &&
- (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
- return Constant::getNullValue(X->getType());
-
- // Remove X and ~X from the operand list.
- if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
- return Constant::getAllOnesValue(X->getType());
-
- Ops.erase(Ops.begin()+i);
- if (i < FoundX)
- --FoundX;
- else
- --i; // Need to back up an extra one.
- Ops.erase(Ops.begin()+FoundX);
- ++NumAnnihil;
- --i; // Revisit element.
- e -= 2; // Removed two elements.
-
- // if X and ~X we append -1 to the operand list.
- if (match(TheOp, m_Not(m_Value()))) {
- Value *V = Constant::getAllOnesValue(X->getType());
- Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
- e += 1;
- }
- }
-
- // Scan the operand list, checking to see if there are any common factors
- // between operands. Consider something like A*A+A*B*C+D. We would like to
- // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
- // To efficiently find this, we count the number of times a factor occurs
- // for any ADD operands that are MULs.
- DenseMap<Value*, unsigned> FactorOccurrences;
-
- // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
- // where they are actually the same multiply.
- unsigned MaxOcc = 0;
- Value *MaxOccVal = nullptr;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
- BinaryOperator *BOp =
- isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
- if (!BOp)
- continue;
-
- // Compute all of the factors of this added value.
- SmallVector<Value*, 8> Factors;
- FindSingleUseMultiplyFactors(BOp, Factors);
- assert(Factors.size() > 1 && "Bad linearize!");
-
- // Add one to FactorOccurrences for each unique factor in this op.
- SmallPtrSet<Value*, 8> Duplicates;
- for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
- Value *Factor = Factors[i];
- if (!Duplicates.insert(Factor).second)
- continue;
-
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
-
- // If Factor is a negative constant, add the negated value as a factor
- // because we can percolate the negate out. Watch for minint, which
- // cannot be positivified.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
- if (CI->isNegative() && !CI->isMinValue(true)) {
- Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
- if (!Duplicates.insert(Factor).second)
- continue;
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
- }
- } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
- if (CF->isNegative()) {
- APFloat F(CF->getValueAPF());
- F.changeSign();
- Factor = ConstantFP::get(CF->getContext(), F);
- if (!Duplicates.insert(Factor).second)
- continue;
- unsigned Occ = ++FactorOccurrences[Factor];
- if (Occ > MaxOcc) {
- MaxOcc = Occ;
- MaxOccVal = Factor;
- }
- }
- }
- }
- }
-
- // If any factor occurred more than one time, we can pull it out.
- if (MaxOcc > 1) {
- LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
- << '\n');
- ++NumFactor;
-
- // Create a new instruction that uses the MaxOccVal twice. If we don't do
- // this, we could otherwise run into situations where removing a factor
- // from an expression will drop a use of maxocc, and this can cause
- // RemoveFactorFromExpression on successive values to behave differently.
- Instruction *DummyInst =
- I->getType()->isIntOrIntVectorTy()
- ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
- : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
-
- SmallVector<WeakTrackingVH, 4> NewMulOps;
- for (unsigned i = 0; i != Ops.size(); ++i) {
- // Only try to remove factors from expressions we're allowed to.
- BinaryOperator *BOp =
- isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
- if (!BOp)
- continue;
-
- if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
- // The factorized operand may occur several times. Convert them all in
- // one fell swoop.
- for (unsigned j = Ops.size(); j != i;) {
- --j;
- if (Ops[j].Op == Ops[i].Op) {
- NewMulOps.push_back(V);
- Ops.erase(Ops.begin()+j);
- }
- }
- --i;
- }
- }
-
- // No need for extra uses anymore.
- DummyInst->deleteValue();
-
- unsigned NumAddedValues = NewMulOps.size();
- Value *V = EmitAddTreeOfValues(I, NewMulOps);
-
- // Now that we have inserted the add tree, optimize it. This allows us to
- // handle cases that require multiple factoring steps, such as this:
- // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
- assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
- (void)NumAddedValues;
- if (Instruction *VI = dyn_cast<Instruction>(V))
- RedoInsts.insert(VI);
-
- // Create the multiply.
- Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
-
- // Rerun associate on the multiply in case the inner expression turned into
- // a multiply. We want to make sure that we keep things in canonical form.
- RedoInsts.insert(V2);
-
- // If every add operand included the factor (e.g. "A*B + A*C"), then the
- // entire result expression is just the multiply "A*(B+C)".
- if (Ops.empty())
- return V2;
-
- // Otherwise, we had some input that didn't have the factor, such as
- // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
- // things being added by this operation.
- Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
- }
-
- return nullptr;
-}
-
-/// Build up a vector of value/power pairs factoring a product.
-///
-/// Given a series of multiplication operands, build a vector of factors and
-/// the powers each is raised to when forming the final product. Sort them in
-/// the order of descending power.
-///
-/// (x*x) -> [(x, 2)]
-/// ((x*x)*x) -> [(x, 3)]
-/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
-///
-/// \returns Whether any factors have a power greater than one.
-static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
- SmallVectorImpl<Factor> &Factors) {
- // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
- // Compute the sum of powers of simplifiable factors.
- unsigned FactorPowerSum = 0;
- for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
- Value *Op = Ops[Idx-1].Op;
-
- // Count the number of occurrences of this value.
- unsigned Count = 1;
- for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
- ++Count;
- // Track for simplification all factors which occur 2 or more times.
- if (Count > 1)
- FactorPowerSum += Count;
- }
-
- // We can only simplify factors if the sum of the powers of our simplifiable
- // factors is 4 or higher. When that is the case, we will *always* have
- // a simplification. This is an important invariant to prevent cyclically
- // trying to simplify already minimal formations.
- if (FactorPowerSum < 4)
- return false;
-
- // Now gather the simplifiable factors, removing them from Ops.
- FactorPowerSum = 0;
- for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
- Value *Op = Ops[Idx-1].Op;
-
- // Count the number of occurrences of this value.
- unsigned Count = 1;
- for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
- ++Count;
- if (Count == 1)
- continue;
- // Move an even number of occurrences to Factors.
- Count &= ~1U;
- Idx -= Count;
- FactorPowerSum += Count;
- Factors.push_back(Factor(Op, Count));
- Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
- }
-
- // None of the adjustments above should have reduced the sum of factor powers
- // below our minimum of '4'.
- assert(FactorPowerSum >= 4);
-
- llvm::stable_sort(Factors, [](const Factor &LHS, const Factor &RHS) {
- return LHS.Power > RHS.Power;
- });
- return true;
-}
-
-/// Build a tree of multiplies, computing the product of Ops.
-static Value *buildMultiplyTree(IRBuilderBase &Builder,
- SmallVectorImpl<Value*> &Ops) {
- if (Ops.size() == 1)
- return Ops.back();
-
- Value *LHS = Ops.pop_back_val();
- do {
- if (LHS->getType()->isIntOrIntVectorTy())
- LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
- else
- LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
- } while (!Ops.empty());
-
- return LHS;
-}
-
-/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
-///
-/// Given a vector of values raised to various powers, where no two values are
-/// equal and the powers are sorted in decreasing order, compute the minimal
-/// DAG of multiplies to compute the final product, and return that product
-/// value.
-Value *
-ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder,
- SmallVectorImpl<Factor> &Factors) {
- assert(Factors[0].Power);
- SmallVector<Value *, 4> OuterProduct;
- for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
- Idx < Size && Factors[Idx].Power > 0; ++Idx) {
- if (Factors[Idx].Power != Factors[LastIdx].Power) {
- LastIdx = Idx;
- continue;
- }
-
- // We want to multiply across all the factors with the same power so that
- // we can raise them to that power as a single entity. Build a mini tree
- // for that.
- SmallVector<Value *, 4> InnerProduct;
- InnerProduct.push_back(Factors[LastIdx].Base);
- do {
- InnerProduct.push_back(Factors[Idx].Base);
- ++Idx;
- } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
-
- // Reset the base value of the first factor to the new expression tree.
- // We'll remove all the factors with the same power in a second pass.
- Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
- if (Instruction *MI = dyn_cast<Instruction>(M))
- RedoInsts.insert(MI);
-
- LastIdx = Idx;
- }
- // Unique factors with equal powers -- we've folded them into the first one's
- // base.
- Factors.erase(std::unique(Factors.begin(), Factors.end(),
- [](const Factor &LHS, const Factor &RHS) {
- return LHS.Power == RHS.Power;
- }),
- Factors.end());
-
- // Iteratively collect the base of each factor with an odd power into the
- // outer product, and halve each power in preparation for squaring the
- // expression.
- for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
- if (Factors[Idx].Power & 1)
- OuterProduct.push_back(Factors[Idx].Base);
- Factors[Idx].Power >>= 1;
- }
- if (Factors[0].Power) {
- Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
- OuterProduct.push_back(SquareRoot);
- OuterProduct.push_back(SquareRoot);
- }
- if (OuterProduct.size() == 1)
- return OuterProduct.front();
-
- Value *V = buildMultiplyTree(Builder, OuterProduct);
- return V;
-}
-
-Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // We can only optimize the multiplies when there is a chain of more than
- // three, such that a balanced tree might require fewer total multiplies.
- if (Ops.size() < 4)
- return nullptr;
-
- // Try to turn linear trees of multiplies without other uses of the
- // intermediate stages into minimal multiply DAGs with perfect sub-expression
- // re-use.
- SmallVector<Factor, 4> Factors;
- if (!collectMultiplyFactors(Ops, Factors))
- return nullptr; // All distinct factors, so nothing left for us to do.
-
- IRBuilder<> Builder(I);
- // The reassociate transformation for FP operations is performed only
- // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
- // to the newly generated operations.
- if (auto FPI = dyn_cast<FPMathOperator>(I))
- Builder.setFastMathFlags(FPI->getFastMathFlags());
-
- Value *V = buildMinimalMultiplyDAG(Builder, Factors);
- if (Ops.empty())
- return V;
-
- ValueEntry NewEntry = ValueEntry(getRank(V), V);
- Ops.insert(llvm::lower_bound(Ops, NewEntry), NewEntry);
- return nullptr;
-}
-
-Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
- SmallVectorImpl<ValueEntry> &Ops) {
- // Now that we have the linearized expression tree, try to optimize it.
- // Start by folding any constants that we found.
- Constant *Cst = nullptr;
- unsigned Opcode = I->getOpcode();
- while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
- Constant *C = cast<Constant>(Ops.pop_back_val().Op);
- Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
- }
- // If there was nothing but constants then we are done.
- if (Ops.empty())
- return Cst;
-
- // Put the combined constant back at the end of the operand list, except if
- // there is no point. For example, an add of 0 gets dropped here, while a
- // multiplication by zero turns the whole expression into zero.
- if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
- if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
- return Cst;
- Ops.push_back(ValueEntry(0, Cst));
- }
-
- if (Ops.size() == 1) return Ops[0].Op;
-
- // Handle destructive annihilation due to identities between elements in the
- // argument list here.
- unsigned NumOps = Ops.size();
- switch (Opcode) {
- default: break;
- case Instruction::And:
- case Instruction::Or:
- if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
- return Result;
- break;
-
- case Instruction::Xor:
- if (Value *Result = OptimizeXor(I, Ops))
- return Result;
- break;
-
- case Instruction::Add:
- case Instruction::FAdd:
- if (Value *Result = OptimizeAdd(I, Ops))
- return Result;
- break;
-
- case Instruction::Mul:
- case Instruction::FMul:
- if (Value *Result = OptimizeMul(I, Ops))
- return Result;
- break;
- }
-
- if (Ops.size() != NumOps)
- return OptimizeExpression(I, Ops);
- return nullptr;
-}
-
-// Remove dead instructions and if any operands are trivially dead add them to
-// Insts so they will be removed as well.
-void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
- OrderedSet &Insts) {
- assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ Value *V2 = EmitAddTreeOfValues(I, Ops);
+ return CreateAdd(V2, V1, "reass.add", I, I);
+}
+
+/// If V is an expression tree that is a multiplication sequence,
+/// and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.
+Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO)
+ return nullptr;
+
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(BO, Tree);
+ SmallVector<ValueEntry, 8> Factors;
+ Factors.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Factors.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ bool FoundFactor = false;
+ bool NeedsNegate = false;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ if (Factors[i].Op == Factor) {
+ FoundFactor = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+
+ // If this is a negative version of this factor, remove it.
+ if (ConstantInt *FC1 = dyn_cast<ConstantInt>(Factor)) {
+ if (ConstantInt *FC2 = dyn_cast<ConstantInt>(Factors[i].Op))
+ if (FC1->getValue() == -FC2->getValue()) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin()+i);
+ break;
+ }
+ } else if (ConstantFP *FC1 = dyn_cast<ConstantFP>(Factor)) {
+ if (ConstantFP *FC2 = dyn_cast<ConstantFP>(Factors[i].Op)) {
+ const APFloat &F1 = FC1->getValueAPF();
+ APFloat F2(FC2->getValueAPF());
+ F2.changeSign();
+ if (F1 == F2) {
+ FoundFactor = NeedsNegate = true;
+ Factors.erase(Factors.begin() + i);
+ break;
+ }
+ }
+ }
+ }
+
+ if (!FoundFactor) {
+ // Make sure to restore the operands to the expression tree.
+ RewriteExprTree(BO, Factors);
+ return nullptr;
+ }
+
+ BasicBlock::iterator InsertPt = ++BO->getIterator();
+
+ // If this was just a single multiply, remove the multiply and return the only
+ // remaining operand.
+ if (Factors.size() == 1) {
+ RedoInsts.insert(BO);
+ V = Factors[0].Op;
+ } else {
+ RewriteExprTree(BO, Factors);
+ V = BO;
+ }
+
+ if (NeedsNegate)
+ V = CreateNeg(V, "neg", &*InsertPt, BO);
+
+ return V;
+}
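As a rough standalone sketch (not part of this patch; the names are made up), the factor-removal step above boils down to deleting one occurrence of the requested factor from the linearized product, accepting a sign-flipped constant as a match and remembering that the remaining product then needs a negate:

#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: remove one occurrence of `factor` from a linearized product of
// integer factors. Like RemoveFactorFromExpression, a sign-flipped constant
// also counts as a match, but then the remaining product must be negated.
static bool removeFactor(std::vector<long> &factors, long factor,
                         bool &needsNegate) {
  for (std::size_t i = 0; i < factors.size(); ++i) {
    if (factors[i] == factor || factors[i] == -factor) {
      needsNegate = (factors[i] != factor);
      factors.erase(factors.begin() + i);
      return true;
    }
  }
  return false;
}

int main() {
  std::vector<long> f = {3, -5, 7};   // stands for 3 * -5 * 7
  bool neg = false;
  assert(removeFactor(f, 5, neg));    // asked for 5, matched -5
  assert(neg && f.size() == 2);       // remaining 3 * 7 must be negated
}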
+
+/// If V is a single-use multiply, recursively add its operands as factors,
+/// otherwise add V to the list of factors.
+///
+/// Ops is the top-level list of add operands we're trying to factor.
+static void FindSingleUseMultiplyFactors(Value *V,
+ SmallVectorImpl<Value*> &Factors) {
+ BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
+ if (!BO) {
+ Factors.push_back(V);
+ return;
+ }
+
+ // Otherwise, add the LHS and RHS to the list of factors.
+ FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+ FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
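A toy model of the recursion above (all types and names here are invented for illustration, not part of the patch): flatten a multiply tree into its leaf factors, visiting the right-hand side first just as the code does.

#include <cassert>
#include <vector>

// Toy expression node: either a leaf value or a multiply of two sub-trees.
struct Expr {
  int leaf = 0;
  const Expr *lhs = nullptr, *rhs = nullptr;
  bool isMul() const { return lhs && rhs; }
};

// Recursively flatten a multiply tree into its leaf factors,
// mirroring FindSingleUseMultiplyFactors' traversal order (RHS first).
static void collectFactors(const Expr *e, std::vector<int> &factors) {
  if (!e->isMul()) {
    factors.push_back(e->leaf);
    return;
  }
  collectFactors(e->rhs, factors);
  collectFactors(e->lhs, factors);
}

int main() {
  Expr a{2}, b{3}, c{5};
  Expr ab{0, &a, &b};          // (a * b)
  Expr abc{0, &ab, &c};        // (a * b) * c
  std::vector<int> factors;
  collectFactors(&abc, factors);
  assert((factors == std::vector<int>{5, 3, 2}));  // c, b, a: RHS first
}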
+
+/// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
+/// This optimizes based on identities. If it can be reduced to a single Value,
+/// it is returned, otherwise the Ops list is mutated as necessary.
+static Value *OptimizeAndOrXor(unsigned Opcode,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+ // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ // First, check for X and ~X in the operand list.
+ assert(i < Ops.size());
+ Value *X;
+ if (match(Ops[i].Op, m_Not(m_Value(X)))) { // Cannot occur for ^.
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX != i) {
+ if (Opcode == Instruction::And) // ...&X&~X = 0
+ return Constant::getNullValue(X->getType());
+
+ if (Opcode == Instruction::Or) // ...|X|~X = -1
+ return Constant::getAllOnesValue(X->getType());
+ }
+ }
+
+ // Next, check for duplicate pairs of values, which we assume are next to
+ // each other, due to our sorting criteria.
+ assert(i < Ops.size());
+ if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+ if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+ // Drop duplicate values for And and Or.
+ Ops.erase(Ops.begin()+i);
+ --i; --e;
+ ++NumAnnihil;
+ continue;
+ }
+
+ // Drop pairs of values for Xor.
+ assert(Opcode == Instruction::Xor);
+ if (e == 2)
+ return Constant::getNullValue(Ops[0].Op->getType());
+
+ // Y ^ X^X -> Y
+ Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+ i -= 1; e -= 2;
+ ++NumAnnihil;
+ }
+ }
+ return nullptr;
+}
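The identities this routine relies on (X & ~X == 0, X | ~X == all ones, X ^ X == 0, and duplicates being redundant for and/or) can be sanity-checked with ordinary unsigned arithmetic; a minimal standalone check, included here only as an illustration:

#include <cassert>
#include <cstdint>

int main() {
  for (uint32_t x : {0u, 1u, 0x1234u, 0xFFFFFFFFu}) {
    assert((x & ~x) == 0u);                  // ...&X&~X == 0
    assert((x | ~x) == 0xFFFFFFFFu);         // ...|X|~X == -1 (all ones)
    assert((x ^ x) == 0u);                   // X^X pairs cancel under xor
    assert((x & x) == x && (x | x) == x);    // duplicates are redundant
  }
}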
+
+/// Helper function of CombineXorOpnd(). It creates a bitwise-and
+/// instruction with the given two operands, and returns the resulting
+/// instruction. There are two special cases: 1) if the constant operand is 0,
+/// it will return NULL. 2) if the constant is ~0, the symbolic operand will
+/// be returned.
+static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd,
+ const APInt &ConstOpnd) {
+ if (ConstOpnd.isNullValue())
+ return nullptr;
+
+ if (ConstOpnd.isAllOnesValue())
+ return Opnd;
+
+ Instruction *I = BinaryOperator::CreateAnd(
+ Opnd, ConstantInt::get(Opnd->getType(), ConstOpnd), "and.ra",
+ InsertBefore);
+ I->setDebugLoc(InsertBefore->getDebugLoc());
+ return I;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd"
+// into "R ^ C", where C would be 0, and R is a symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively; otherwise, false is returned,
+// and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ APInt &ConstOpnd, Value *&Res) {
+ // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2
+ // = ((x | c1) ^ c1) ^ (c1 ^ c2)
+ // = (x & ~c1) ^ (c1 ^ c2)
+ // It is useful only when c1 == c2.
+ if (!Opnd1->isOrExpr() || Opnd1->getConstPart().isNullValue())
+ return false;
+
+ if (!Opnd1->getValue()->hasOneUse())
+ return false;
+
+ const APInt &C1 = Opnd1->getConstPart();
+ if (C1 != ConstOpnd)
+ return false;
+
+ Value *X = Opnd1->getSymbolicPart();
+ Res = createAndInstr(I, X, ~C1);
+ // ConstOpnd was C2, now C1 ^ C2.
+ ConstOpnd ^= C1;
+
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ return true;
+}
+
+// Helper function of OptimizeXor(). It tries to simplify
+// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a
+// symbolic value.
+//
+// If it was successful, true is returned, and the "R" and "C" are returned
+// via "Res" and "ConstOpnd", respectively (if the entire expression
+// evaluates to a constant, Res is set to NULL); otherwise, false is
+// returned, and both "Res" and "ConstOpnd" remain unchanged.
+bool ReassociatePass::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1,
+ XorOpnd *Opnd2, APInt &ConstOpnd,
+ Value *&Res) {
+ Value *X = Opnd1->getSymbolicPart();
+ if (X != Opnd2->getSymbolicPart())
+ return false;
+
+ // This many instructions become dead. (At least "Opnd1 ^ Opnd2" will die.)
+ int DeadInstNum = 1;
+ if (Opnd1->getValue()->hasOneUse())
+ DeadInstNum++;
+ if (Opnd2->getValue()->hasOneUse())
+ DeadInstNum++;
+
+ // Xor-Rule 2:
+ // (x | c1) ^ (x & c2)
+ // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1
+ // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1
+ // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3
+ //
+ if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) {
+ if (Opnd2->isOrExpr())
+ std::swap(Opnd1, Opnd2);
+
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3((~C1) ^ C2);
+
+ // Do not increase code size!
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C1;
+ } else if (Opnd1->isOrExpr()) {
+ // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+
+ // Do not increase code size
+ if (!C3.isNullValue() && !C3.isAllOnesValue()) {
+ int NewInstNum = ConstOpnd.getBoolValue() ? 1 : 2;
+ if (NewInstNum > DeadInstNum)
+ return false;
+ }
+
+ Res = createAndInstr(I, X, C3);
+ ConstOpnd ^= C3;
+ } else {
+ // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2))
+ //
+ const APInt &C1 = Opnd1->getConstPart();
+ const APInt &C2 = Opnd2->getConstPart();
+ APInt C3 = C1 ^ C2;
+ Res = createAndInstr(I, X, C3);
+ }
+
+ // Put the original operands in the Redo list; hope they will be deleted
+ // as dead code.
+ if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue()))
+ RedoInsts.insert(T);
+ if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue()))
+ RedoInsts.insert(T);
+
+ return true;
+}
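The four Xor-Rules quoted in the comments above are ordinary Boolean-algebra identities; the following standalone brute-force check over 8-bit values (illustration only, not part of the patch) confirms each of them:

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned a = 0; a < 256; ++a)
      for (unsigned b = 0; b < 256; ++b) {
        uint8_t X = (uint8_t)x, C1 = (uint8_t)a, C2 = (uint8_t)b;
        uint8_t NotC1 = (uint8_t)~C1;
        // Xor-Rule 1: (x | c1) ^ c2 == (x & ~c1) ^ (c1 ^ c2)
        assert((uint8_t)((X | C1) ^ C2) == (uint8_t)((X & NotC1) ^ (C1 ^ C2)));
        // Xor-Rule 2: (x | c1) ^ (x & c2) == (x & (~c1 ^ c2)) ^ c1
        assert((uint8_t)((X | C1) ^ (X & C2)) ==
               (uint8_t)((X & (uint8_t)(NotC1 ^ C2)) ^ C1));
        // Xor-Rule 3: (x | c1) ^ (x | c2) == (x & c3) ^ c3, where c3 = c1 ^ c2
        assert((uint8_t)((X | C1) ^ (X | C2)) ==
               (uint8_t)((X & (C1 ^ C2)) ^ (C1 ^ C2)));
        // Xor-Rule 4: (x & c1) ^ (x & c2) == x & (c1 ^ c2)
        assert((uint8_t)((X & C1) ^ (X & C2)) == (uint8_t)(X & (C1 ^ C2)));
      }
}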
+
+/// Optimize a series of operands to an 'xor' instruction. If it can be reduced
+/// to a single Value, it is returned, otherwise the Ops list is mutated as
+/// necessary.
+Value *ReassociatePass::OptimizeXor(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops))
+ return V;
+
+ if (Ops.size() == 1)
+ return nullptr;
+
+ SmallVector<XorOpnd, 8> Opnds;
+ SmallVector<XorOpnd*, 8> OpndPtrs;
+ Type *Ty = Ops[0].Op->getType();
+ APInt ConstOpnd(Ty->getScalarSizeInBits(), 0);
+
+ // Step 1: Convert ValueEntry to XorOpnd
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *V = Ops[i].Op;
+ const APInt *C;
+ // TODO: Support non-splat vectors.
+ if (match(V, m_APInt(C))) {
+ ConstOpnd ^= *C;
+ } else {
+ XorOpnd O(V);
+ O.setSymbolicRank(getRank(O.getSymbolicPart()));
+ Opnds.push_back(O);
+ }
+ }
+
+ // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds".
+ // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate
+ // the "OpndPtrs" as well. For the similar reason, do not fuse this loop
+ // with the previous loop --- the iterator of the "Opnds" may be invalidated
+ // when new elements are added to the vector.
+ for (unsigned i = 0, e = Opnds.size(); i != e; ++i)
+ OpndPtrs.push_back(&Opnds[i]);
+
+ // Step 2: Sort the Xor-Operands in a way such that the operands containing
+ // the same symbolic value cluster together. For instance, the input operand
+ // sequence ("x | 123", "y & 456", "x & 789") will be sorted into:
+ // ("x | 123", "x & 789", "y & 456").
+ //
+ // The purpose is twofold:
+ // 1) Cluster together the operands sharing the same symbolic-value.
+ // 2) Operands with a smaller symbolic-value rank are permuted earlier, which
+ //    could potentially shorten the critical path and expose more
+ //    loop-invariants. Note that values' ranks are basically defined in RPO
+ //    order (FIXME). So, if Rank(X) < Rank(Y) < Rank(Z), X is defined earlier
+ //    than Y, which is defined earlier than Z. Permuting "x | 1", "y & 2", "z"
+ //    in the order X-Y-Z is better than any other order.
+ llvm::stable_sort(OpndPtrs, [](XorOpnd *LHS, XorOpnd *RHS) {
+ return LHS->getSymbolicRank() < RHS->getSymbolicRank();
+ });
+
+ // Step 3: Combine adjacent operands
+ XorOpnd *PrevOpnd = nullptr;
+ bool Changed = false;
+ for (unsigned i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd *CurrOpnd = OpndPtrs[i];
+ // The combined value
+ Value *CV;
+
+ // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd"
+ if (!ConstOpnd.isNullValue() &&
+ CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) {
+ Changed = true;
+ if (CV)
+ *CurrOpnd = XorOpnd(CV);
+ else {
+ CurrOpnd->Invalidate();
+ continue;
+ }
+ }
+
+ if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) {
+ PrevOpnd = CurrOpnd;
+ continue;
+ }
+
+ // Step 3.2: When previous and current operands share the same symbolic
+ // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd"
+ if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) {
+ // Remove previous operand
+ PrevOpnd->Invalidate();
+ if (CV) {
+ *CurrOpnd = XorOpnd(CV);
+ PrevOpnd = CurrOpnd;
+ } else {
+ CurrOpnd->Invalidate();
+ PrevOpnd = nullptr;
+ }
+ Changed = true;
+ }
+ }
+
+ // Step 4: Reassemble the Ops
+ if (Changed) {
+ Ops.clear();
+ for (unsigned int i = 0, e = Opnds.size(); i < e; i++) {
+ XorOpnd &O = Opnds[i];
+ if (O.isInvalid())
+ continue;
+ ValueEntry VE(getRank(O.getValue()), O.getValue());
+ Ops.push_back(VE);
+ }
+ if (!ConstOpnd.isNullValue()) {
+ Value *C = ConstantInt::get(Ty, ConstOpnd);
+ ValueEntry VE(getRank(C), C);
+ Ops.push_back(VE);
+ }
+ unsigned Sz = Ops.size();
+ if (Sz == 1)
+ return Ops.back().Op;
+ if (Sz == 0) {
+ assert(ConstOpnd.isNullValue());
+ return ConstantInt::get(Ty, ConstOpnd);
+ }
+ }
+
+ return nullptr;
+}
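Step 2 above only needs a stable sort by symbolic rank so that operands sharing a symbolic value end up adjacent for Step 3; a minimal model of that clustering (invented names, not part of the patch):

#include <algorithm>
#include <cassert>
#include <string>
#include <utility>
#include <vector>

int main() {
  // (symbolic value, rank) pairs standing in for XorOpnds such as "x | 123".
  std::vector<std::pair<std::string, unsigned>> opnds = {
      {"x", 1}, {"y", 2}, {"x", 1}, {"z", 3}};
  // A stable sort by symbolic rank clusters equal symbolic values together,
  // which is all Step 3 needs in order to combine adjacent operands.
  std::stable_sort(opnds.begin(), opnds.end(),
                   [](const auto &l, const auto &r) { return l.second < r.second; });
  assert(opnds[0].first == "x" && opnds[1].first == "x");
  assert(opnds[2].first == "y" && opnds[3].first == "z");
}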
+
+/// Optimize a series of operands to an 'add' instruction. This
+/// optimizes based on identities. If it can be reduced to a single Value, it
+/// is returned, otherwise the Ops list is mutated as necessary.
+Value *ReassociatePass::OptimizeAdd(Instruction *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Scan the operand lists looking for X and -X pairs. If we find any, we
+ // can simplify expressions like X+-X == 0 and X+~X == -1. While we're at it,
+ // scan for any duplicates. We want to canonicalize Y+Y+Y+Z -> 3*Y+Z.
+
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ Value *TheOp = Ops[i].Op;
+ // Check to see if we've seen this operand before. If so, we factor all
+ // instances of the operand together. Due to our sorting criteria, we know
+ // that these need to be next to each other in the vector.
+ if (i+1 != Ops.size() && Ops[i+1].Op == TheOp) {
+ // Rescan the list, remove all instances of this operand from the expr.
+ unsigned NumFound = 0;
+ do {
+ Ops.erase(Ops.begin()+i);
+ ++NumFound;
+ } while (i != Ops.size() && Ops[i].Op == TheOp);
+
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << NumFound << "]: " << *TheOp
+ << '\n');
+ ++NumFactor;
+
+ // Insert a new multiply.
+ Type *Ty = TheOp->getType();
+ Constant *C = Ty->isIntOrIntVectorTy() ?
+ ConstantInt::get(Ty, NumFound) : ConstantFP::get(Ty, NumFound);
+ Instruction *Mul = CreateMul(TheOp, C, "factor", I, I);
+
+ // Now that we have inserted a multiply, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // (X*2) + (X*2) + (X*2) -> (X*2)*3 -> X*6
+ RedoInsts.insert(Mul);
+
+ // If every add operand was a duplicate, return the multiply.
+ if (Ops.empty())
+ return Mul;
+
+ // Otherwise, we had some input that didn't have the dupe, such as
+ // "A + A + B" -> "A*2 + B". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(Mul), Mul));
+
+ --i;
+ e = Ops.size();
+ continue;
+ }
+
+ // Check for X and -X or X and ~X in the operand list.
+ Value *X;
+ if (!match(TheOp, m_Neg(m_Value(X))) && !match(TheOp, m_Not(m_Value(X))) &&
+ !match(TheOp, m_FNeg(m_Value(X))))
+ continue;
+
+ unsigned FoundX = FindInOperandList(Ops, i, X);
+ if (FoundX == i)
+ continue;
+
+ // Remove X and -X from the operand list.
+ if (Ops.size() == 2 &&
+ (match(TheOp, m_Neg(m_Value())) || match(TheOp, m_FNeg(m_Value()))))
+ return Constant::getNullValue(X->getType());
+
+ // Remove X and ~X from the operand list.
+ if (Ops.size() == 2 && match(TheOp, m_Not(m_Value())))
+ return Constant::getAllOnesValue(X->getType());
+
+ Ops.erase(Ops.begin()+i);
+ if (i < FoundX)
+ --FoundX;
+ else
+ --i; // Need to back up an extra one.
+ Ops.erase(Ops.begin()+FoundX);
+ ++NumAnnihil;
+ --i; // Revisit element.
+ e -= 2; // Removed two elements.
+
+ // If we found X and ~X, append -1 to the operand list.
+ if (match(TheOp, m_Not(m_Value()))) {
+ Value *V = Constant::getAllOnesValue(X->getType());
+ Ops.insert(Ops.end(), ValueEntry(getRank(V), V));
+ e += 1;
+ }
+ }
+
+ // Scan the operand list, checking to see if there are any common factors
+ // between operands. Consider something like A*A+A*B*C+D. We would like to
+ // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+ // To efficiently find this, we count the number of times a factor occurs
+ // for any ADD operands that are MULs.
+ DenseMap<Value*, unsigned> FactorOccurrences;
+
+ // Keep track of each multiply we see, to avoid triggering on (X*4)+(X*4)
+ // where they are actually the same multiply.
+ unsigned MaxOcc = 0;
+ Value *MaxOccVal = nullptr;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ // Compute all of the factors of this added value.
+ SmallVector<Value*, 8> Factors;
+ FindSingleUseMultiplyFactors(BOp, Factors);
+ assert(Factors.size() > 1 && "Bad linearize!");
+
+ // Add one to FactorOccurrences for each unique factor in this op.
+ SmallPtrSet<Value*, 8> Duplicates;
+ for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+ Value *Factor = Factors[i];
+ if (!Duplicates.insert(Factor).second)
+ continue;
+
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+
+ // If Factor is a negative constant, add the negated value as a factor
+ // because we can percolate the negate out. Watch for minint, which
+ // cannot be positivified.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
+ if (CI->isNegative() && !CI->isMinValue(true)) {
+ Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ } else if (ConstantFP *CF = dyn_cast<ConstantFP>(Factor)) {
+ if (CF->isNegative()) {
+ APFloat F(CF->getValueAPF());
+ F.changeSign();
+ Factor = ConstantFP::get(CF->getContext(), F);
+ if (!Duplicates.insert(Factor).second)
+ continue;
+ unsigned Occ = ++FactorOccurrences[Factor];
+ if (Occ > MaxOcc) {
+ MaxOcc = Occ;
+ MaxOccVal = Factor;
+ }
+ }
+ }
+ }
+ }
+
+ // If any factor occurred more than one time, we can pull it out.
+ if (MaxOcc > 1) {
+ LLVM_DEBUG(dbgs() << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal
+ << '\n');
+ ++NumFactor;
+
+ // Create a new instruction that uses the MaxOccVal twice. If we don't do
+ // this, we could otherwise run into situations where removing a factor
+ // from an expression will drop a use of maxocc, and this can cause
+ // RemoveFactorFromExpression on successive values to behave differently.
+ Instruction *DummyInst =
+ I->getType()->isIntOrIntVectorTy()
+ ? BinaryOperator::CreateAdd(MaxOccVal, MaxOccVal)
+ : BinaryOperator::CreateFAdd(MaxOccVal, MaxOccVal);
+
+ SmallVector<WeakTrackingVH, 4> NewMulOps;
+ for (unsigned i = 0; i != Ops.size(); ++i) {
+ // Only try to remove factors from expressions we're allowed to.
+ BinaryOperator *BOp =
+ isReassociableOp(Ops[i].Op, Instruction::Mul, Instruction::FMul);
+ if (!BOp)
+ continue;
+
+ if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+ // The factorized operand may occur several times. Convert them all in
+ // one fell swoop.
+ for (unsigned j = Ops.size(); j != i;) {
+ --j;
+ if (Ops[j].Op == Ops[i].Op) {
+ NewMulOps.push_back(V);
+ Ops.erase(Ops.begin()+j);
+ }
+ }
+ --i;
+ }
+ }
+
+ // No need for extra uses anymore.
+ DummyInst->deleteValue();
+
+ unsigned NumAddedValues = NewMulOps.size();
+ Value *V = EmitAddTreeOfValues(I, NewMulOps);
+
+ // Now that we have inserted the add tree, optimize it. This allows us to
+ // handle cases that require multiple factoring steps, such as this:
+ // A*A*B + A*A*C --> A*(A*B+A*C) --> A*(A*(B+C))
+ assert(NumAddedValues > 1 && "Each occurrence should contribute a value");
+ (void)NumAddedValues;
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ RedoInsts.insert(VI);
+
+ // Create the multiply.
+ Instruction *V2 = CreateMul(V, MaxOccVal, "reass.mul", I, I);
+
+ // Rerun associate on the multiply in case the inner expression turned into
+ // a multiply. We want to make sure that we keep things in canonical form.
+ RedoInsts.insert(V2);
+
+ // If every add operand included the factor (e.g. "A*B + A*C"), then the
+ // entire result expression is just the multiply "A*(B+C)".
+ if (Ops.empty())
+ return V2;
+
+ // Otherwise, we had some input that didn't have the factor, such as
+ // "A*B + A*C + D" -> "A*(B+C) + D". Add the new multiply to the list of
+ // things being added by this operation.
+ Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+ }
+
+ return nullptr;
+}
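Because the operand list is sorted, the Y+Y+Y+Z -> 3*Y+Z step reduces to a run-length count over adjacent equal entries; a small integer model of just that step (illustrative names, not part of the patch):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Collapse runs of equal operands in a sorted add-operand list into
// (value, multiplicity) pairs: Y+Y+Y+Z becomes {(Y,3), (Z,1)}, i.e. 3*Y + Z.
static std::vector<std::pair<int, unsigned>>
collapseDuplicates(const std::vector<int> &sortedOps) {
  std::vector<std::pair<int, unsigned>> result;
  for (std::size_t i = 0; i < sortedOps.size();) {
    std::size_t j = i;
    while (j < sortedOps.size() && sortedOps[j] == sortedOps[i])
      ++j;
    result.push_back({sortedOps[i], static_cast<unsigned>(j - i)});
    i = j;
  }
  return result;
}

int main() {
  auto r = collapseDuplicates({7, 7, 7, 9}); // Y=7 three times, Z=9 once
  assert(r.size() == 2 && r[0].second == 3 && r[1].second == 1);
}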
+
+/// Build up a vector of value/power pairs factoring a product.
+///
+/// Given a series of multiplication operands, build a vector of factors and
+/// the powers each is raised to when forming the final product. Sort them in
+/// the order of descending power.
+///
+/// (x*x) -> [(x, 2)]
+/// ((x*x)*x) -> [(x, 3)]
+/// ((((x*y)*x)*y)*x) -> [(x, 3), (y, 2)]
+///
+/// \returns Whether any factors have a power greater than one.
+static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
+ SmallVectorImpl<Factor> &Factors) {
+ // FIXME: Have Ops be (ValueEntry, Multiplicity) pairs, simplifying this.
+ // Compute the sum of powers of simplifiable factors.
+ unsigned FactorPowerSum = 0;
+ for (unsigned Idx = 1, Size = Ops.size(); Idx < Size; ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Size && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ // Track for simplification all factors which occur 2 or more times.
+ if (Count > 1)
+ FactorPowerSum += Count;
+ }
+
+ // We can only simplify factors if the sum of the powers of our simplifiable
+ // factors is 4 or higher. When that is the case, we will *always* have
+ // a simplification. This is an important invariant to prevent cyclically
+ // trying to simplify already minimal formations.
+ if (FactorPowerSum < 4)
+ return false;
+
+ // Now gather the simplifiable factors, removing them from Ops.
+ FactorPowerSum = 0;
+ for (unsigned Idx = 1; Idx < Ops.size(); ++Idx) {
+ Value *Op = Ops[Idx-1].Op;
+
+ // Count the number of occurrences of this value.
+ unsigned Count = 1;
+ for (; Idx < Ops.size() && Ops[Idx].Op == Op; ++Idx)
+ ++Count;
+ if (Count == 1)
+ continue;
+ // Move an even number of occurrences to Factors.
+ Count &= ~1U;
+ Idx -= Count;
+ FactorPowerSum += Count;
+ Factors.push_back(Factor(Op, Count));
+ Ops.erase(Ops.begin()+Idx, Ops.begin()+Idx+Count);
+ }
+
+ // None of the adjustments above should have reduced the sum of factor powers
+ // below our minimum of '4'.
+ assert(FactorPowerSum >= 4);
+
+ llvm::stable_sort(Factors, [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power > RHS.Power;
+ });
+ return true;
+}
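A scalar sketch of the gathering rule above (names invented for illustration, not part of the patch): only an even number of occurrences of each value is moved into the factor list, and the transform is attempted only when the collected powers sum to at least 4.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Model of collectMultiplyFactors over plain ints: pull an even number of
// occurrences of each repeated value out of a sorted multiply-operand list
// into (base, power) factors, leaving any odd remainder behind.
static bool gatherEvenPowers(std::vector<int> &ops,
                             std::vector<std::pair<int, unsigned>> &factors) {
  unsigned powerSum = 0;
  std::vector<int> rest;
  std::vector<std::pair<int, unsigned>> found;
  for (std::size_t i = 0; i < ops.size();) {
    std::size_t j = i;
    while (j < ops.size() && ops[j] == ops[i])
      ++j;
    unsigned count = static_cast<unsigned>(j - i);
    unsigned even = count & ~1u;        // keep only an even number of copies
    if (even >= 2) {
      found.push_back({ops[i], even});
      powerSum += even;
    }
    if (count & 1)                      // odd leftover stays in ops
      rest.push_back(ops[i]);
    i = j;
  }
  if (powerSum < 4)                     // same threshold as the pass
    return false;
  ops = rest;
  factors = found;
  return true;
}

int main() {
  std::vector<int> ops = {2, 2, 2, 5, 5};   // x*x*x*y*y with x=2, y=5
  std::vector<std::pair<int, unsigned>> factors;
  assert(gatherEvenPowers(ops, factors));
  assert(factors.size() == 2 && factors[0].second == 2 && factors[1].second == 2);
  assert(ops.size() == 1 && ops[0] == 2);   // the odd x stays behind
}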
+
+/// Build a tree of multiplies, computing the product of Ops.
+static Value *buildMultiplyTree(IRBuilderBase &Builder,
+ SmallVectorImpl<Value*> &Ops) {
+ if (Ops.size() == 1)
+ return Ops.back();
+
+ Value *LHS = Ops.pop_back_val();
+ do {
+ if (LHS->getType()->isIntOrIntVectorTy())
+ LHS = Builder.CreateMul(LHS, Ops.pop_back_val());
+ else
+ LHS = Builder.CreateFMul(LHS, Ops.pop_back_val());
+ } while (!Ops.empty());
+
+ return LHS;
+}
+
+/// Build a minimal multiplication DAG for (a^x)*(b^y)*(c^z)*...
+///
+/// Given a vector of values raised to various powers, where no two values are
+/// equal and the powers are sorted in decreasing order, compute the minimal
+/// DAG of multiplies to compute the final product, and return that product
+/// value.
+Value *
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder,
+ SmallVectorImpl<Factor> &Factors) {
+ assert(Factors[0].Power);
+ SmallVector<Value *, 4> OuterProduct;
+ for (unsigned LastIdx = 0, Idx = 1, Size = Factors.size();
+ Idx < Size && Factors[Idx].Power > 0; ++Idx) {
+ if (Factors[Idx].Power != Factors[LastIdx].Power) {
+ LastIdx = Idx;
+ continue;
+ }
+
+ // We want to multiply across all the factors with the same power so that
+ // we can raise them to that power as a single entity. Build a mini tree
+ // for that.
+ SmallVector<Value *, 4> InnerProduct;
+ InnerProduct.push_back(Factors[LastIdx].Base);
+ do {
+ InnerProduct.push_back(Factors[Idx].Base);
+ ++Idx;
+ } while (Idx < Size && Factors[Idx].Power == Factors[LastIdx].Power);
+
+ // Reset the base value of the first factor to the new expression tree.
+ // We'll remove all the factors with the same power in a second pass.
+ Value *M = Factors[LastIdx].Base = buildMultiplyTree(Builder, InnerProduct);
+ if (Instruction *MI = dyn_cast<Instruction>(M))
+ RedoInsts.insert(MI);
+
+ LastIdx = Idx;
+ }
+ // Unique factors with equal powers -- we've folded them into the first one's
+ // base.
+ Factors.erase(std::unique(Factors.begin(), Factors.end(),
+ [](const Factor &LHS, const Factor &RHS) {
+ return LHS.Power == RHS.Power;
+ }),
+ Factors.end());
+
+ // Iteratively collect the base of each factor with an odd power into the
+ // outer product, and halve each power in preparation for squaring the
+ // expression.
+ for (unsigned Idx = 0, Size = Factors.size(); Idx != Size; ++Idx) {
+ if (Factors[Idx].Power & 1)
+ OuterProduct.push_back(Factors[Idx].Base);
+ Factors[Idx].Power >>= 1;
+ }
+ if (Factors[0].Power) {
+ Value *SquareRoot = buildMinimalMultiplyDAG(Builder, Factors);
+ OuterProduct.push_back(SquareRoot);
+ OuterProduct.push_back(SquareRoot);
+ }
+ if (OuterProduct.size() == 1)
+ return OuterProduct.front();
+
+ Value *V = buildMultiplyTree(Builder, OuterProduct);
+ return V;
+}
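Numerically, the recursion above is square-and-multiply generalized to several bases: bases with an odd power are peeled into the outer product, every power is halved, and the recursive result is squared. A scalar sketch of that scheme (illustrative only; the real routine additionally merges bases that share a power):

#include <cassert>
#include <utility>
#include <vector>

// Compute prod(base_i ^ power_i) with the same halve-and-square recursion
// that buildMinimalMultiplyDAG uses to keep the multiply count small.
static double powerProduct(std::vector<std::pair<double, unsigned>> factors) {
  double outer = 1.0;
  bool anyPowerLeft = false;
  for (auto &f : factors) {
    if (f.second & 1)          // odd power: one copy goes to the outer product
      outer *= f.first;
    f.second >>= 1;            // halve the power before squaring
    anyPowerLeft |= (f.second != 0);
  }
  if (anyPowerLeft) {
    double sqrtPart = powerProduct(factors);
    outer *= sqrtPart * sqrtPart;   // square the recursively built product
  }
  return outer;
}

int main() {
  // x^3 * y^2 with x=2, y=3  ->  8 * 9 = 72
  assert(powerProduct({{2.0, 3}, {3.0, 2}}) == 72.0);
  assert(powerProduct({{5.0, 1}}) == 5.0);
}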
+
+Value *ReassociatePass::OptimizeMul(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // We can only optimize the multiplies when there is a chain of more than
+ // three, such that a balanced tree might require fewer total multiplies.
+ if (Ops.size() < 4)
+ return nullptr;
+
+ // Try to turn linear trees of multiplies without other uses of the
+ // intermediate stages into minimal multiply DAGs with perfect sub-expression
+ // re-use.
+ SmallVector<Factor, 4> Factors;
+ if (!collectMultiplyFactors(Ops, Factors))
+ return nullptr; // All distinct factors, so nothing left for us to do.
+
+ IRBuilder<> Builder(I);
+ // The reassociate transformation for FP operations is performed only
+ // if unsafe algebra is permitted by FastMathFlags. Propagate those flags
+ // to the newly generated operations.
+ if (auto FPI = dyn_cast<FPMathOperator>(I))
+ Builder.setFastMathFlags(FPI->getFastMathFlags());
+
+ Value *V = buildMinimalMultiplyDAG(Builder, Factors);
+ if (Ops.empty())
+ return V;
+
+ ValueEntry NewEntry = ValueEntry(getRank(V), V);
+ Ops.insert(llvm::lower_bound(Ops, NewEntry), NewEntry);
+ return nullptr;
+}
+
+Value *ReassociatePass::OptimizeExpression(BinaryOperator *I,
+ SmallVectorImpl<ValueEntry> &Ops) {
+ // Now that we have the linearized expression tree, try to optimize it.
+ // Start by folding any constants that we found.
+ Constant *Cst = nullptr;
+ unsigned Opcode = I->getOpcode();
+ while (!Ops.empty() && isa<Constant>(Ops.back().Op)) {
+ Constant *C = cast<Constant>(Ops.pop_back_val().Op);
+ Cst = Cst ? ConstantExpr::get(Opcode, C, Cst) : C;
+ }
+ // If there was nothing but constants then we are done.
+ if (Ops.empty())
+ return Cst;
+
+ // Put the combined constant back at the end of the operand list, except if
+ // there is no point. For example, an add of 0 gets dropped here, while a
+ // multiplication by zero turns the whole expression into zero.
+ if (Cst && Cst != ConstantExpr::getBinOpIdentity(Opcode, I->getType())) {
+ if (Cst == ConstantExpr::getBinOpAbsorber(Opcode, I->getType()))
+ return Cst;
+ Ops.push_back(ValueEntry(0, Cst));
+ }
+
+ if (Ops.size() == 1) return Ops[0].Op;
+
+ // Handle destructive annihilation due to identities between elements in the
+ // argument list here.
+ unsigned NumOps = Ops.size();
+ switch (Opcode) {
+ default: break;
+ case Instruction::And:
+ case Instruction::Or:
+ if (Value *Result = OptimizeAndOrXor(Opcode, Ops))
+ return Result;
+ break;
+
+ case Instruction::Xor:
+ if (Value *Result = OptimizeXor(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ if (Value *Result = OptimizeAdd(I, Ops))
+ return Result;
+ break;
+
+ case Instruction::Mul:
+ case Instruction::FMul:
+ if (Value *Result = OptimizeMul(I, Ops))
+ return Result;
+ break;
+ }
+
+ if (Ops.size() != NumOps)
+ return OptimizeExpression(I, Ops);
+ return nullptr;
+}
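The constant pre-folding at the top of OptimizeExpression folds trailing constants, drops the identity element, and short-circuits on the absorbing element; a compact integer model for the 'mul' case (invented helper, not part of the patch):

#include <cassert>
#include <optional>
#include <vector>

// Fold trailing constants of a linearized operand list (nullopt = symbolic
// value, a number = constant), dropping the multiplicative identity and
// short-circuiting on the absorber, as OptimizeExpression does for 'mul'.
static std::optional<long>
foldMulConstants(std::vector<std::optional<long>> &ops) {
  const long Identity = 1, Absorber = 0;
  std::optional<long> cst;
  while (!ops.empty() && ops.back()) {
    long c = *ops.back();
    ops.pop_back();
    cst = cst ? c * *cst : c;
  }
  if (ops.empty())
    return cst;                 // expression was nothing but constants
  if (cst && *cst != Identity) {
    if (*cst == Absorber)
      return Absorber;          // anything * 0 == 0
    ops.push_back(cst);         // put the combined constant back at the end
  }
  return std::nullopt;          // caller keeps optimizing the mutated list
}

int main() {
  std::vector<std::optional<long>> ops = {std::nullopt, std::nullopt, 3, 4};
  assert(!foldMulConstants(ops));                 // x * y * 3 * 4
  assert(ops.size() == 3 && ops.back() == 12);    // -> x * y * 12

  std::vector<std::optional<long>> zero = {std::nullopt, 0};
  assert(foldMulConstants(zero) == 0);            // x * 0 -> 0
}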
+
+// Remove dead instructions and if any operands are trivially dead add them to
+// Insts so they will be removed as well.
+void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
+ OrderedSet &Insts) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
SmallVector<Value *, 4> Ops(I->operands());
- ValueRankMap.erase(I);
- Insts.remove(I);
- RedoInsts.remove(I);
- llvm::salvageDebugInfo(*I);
- I->eraseFromParent();
- for (auto Op : Ops)
- if (Instruction *OpInst = dyn_cast<Instruction>(Op))
- if (OpInst->use_empty())
- Insts.insert(OpInst);
-}
-
-/// Zap the given instruction, adding interesting operands to the work list.
-void ReassociatePass::EraseInst(Instruction *I) {
- assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
- LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
-
+ ValueRankMap.erase(I);
+ Insts.remove(I);
+ RedoInsts.remove(I);
+ llvm::salvageDebugInfo(*I);
+ I->eraseFromParent();
+ for (auto Op : Ops)
+ if (Instruction *OpInst = dyn_cast<Instruction>(Op))
+ if (OpInst->use_empty())
+ Insts.insert(OpInst);
+}
+
+/// Zap the given instruction, adding interesting operands to the work list.
+void ReassociatePass::EraseInst(Instruction *I) {
+ assert(isInstructionTriviallyDead(I) && "Trivially dead instructions only!");
+ LLVM_DEBUG(dbgs() << "Erasing dead inst: "; I->dump());
+
SmallVector<Value *, 8> Ops(I->operands());
- // Erase the dead instruction.
- ValueRankMap.erase(I);
- RedoInsts.remove(I);
- llvm::salvageDebugInfo(*I);
- I->eraseFromParent();
- // Optimize its operands.
- SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
- // If this is a node in an expression tree, climb to the expression root
- // and add that since that's where optimization actually happens.
- unsigned Opcode = Op->getOpcode();
- while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
- Visited.insert(Op).second)
- Op = Op->user_back();
-
- // The instruction we're going to push may be coming from a
- // dead block, and Reassociate skips the processing of unreachable
- // blocks because it's a waste of time and also because it can
- // lead to infinite loop due to LLVM's non-standard definition
- // of dominance.
- if (ValueRankMap.find(Op) != ValueRankMap.end())
- RedoInsts.insert(Op);
- }
-
- MadeChange = true;
-}
-
-/// Recursively analyze an expression to build a list of instructions that have
-/// negative floating-point constant operands. The caller can then transform
-/// the list to create positive constants for better reassociation and CSE.
-static void getNegatibleInsts(Value *V,
- SmallVectorImpl<Instruction *> &Candidates) {
- // Handle only one-use instructions. Combining negations does not justify
- // replicating instructions.
- Instruction *I;
- if (!match(V, m_OneUse(m_Instruction(I))))
- return;
-
- // Handle expressions of multiplications and divisions.
- // TODO: This could look through floating-point casts.
- const APFloat *C;
- switch (I->getOpcode()) {
- case Instruction::FMul:
- // Not expecting non-canonical code here. Bail out and wait.
- if (match(I->getOperand(0), m_Constant()))
- break;
-
- if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) {
- Candidates.push_back(I);
- LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n');
- }
- getNegatibleInsts(I->getOperand(0), Candidates);
- getNegatibleInsts(I->getOperand(1), Candidates);
- break;
- case Instruction::FDiv:
- // Not expecting non-canonical code here. Bail out and wait.
- if (match(I->getOperand(0), m_Constant()) &&
- match(I->getOperand(1), m_Constant()))
- break;
-
- if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) ||
- (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) {
- Candidates.push_back(I);
- LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n');
- }
- getNegatibleInsts(I->getOperand(0), Candidates);
- getNegatibleInsts(I->getOperand(1), Candidates);
- break;
- default:
- break;
- }
-}
-
-/// Given an fadd/fsub with an operand that is a one-use instruction
-/// (the fadd/fsub), try to change negative floating-point constants into
-/// positive constants to increase potential for reassociation and CSE.
-Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I,
- Instruction *Op,
- Value *OtherOp) {
- assert((I->getOpcode() == Instruction::FAdd ||
- I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub");
-
- // Collect instructions with negative FP constants from the subtree that ends
- // in Op.
- SmallVector<Instruction *, 4> Candidates;
- getNegatibleInsts(Op, Candidates);
- if (Candidates.empty())
- return nullptr;
-
- // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
- // resulting subtract will be broken up later. This can get us into an
- // infinite loop during reassociation.
- bool IsFSub = I->getOpcode() == Instruction::FSub;
- bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1;
- if (NeedsSubtract && ShouldBreakUpSubtract(I))
- return nullptr;
-
- for (Instruction *Negatible : Candidates) {
- const APFloat *C;
- if (match(Negatible->getOperand(0), m_APFloat(C))) {
- assert(!match(Negatible->getOperand(1), m_Constant()) &&
- "Expecting only 1 constant operand");
- assert(C->isNegative() && "Expected negative FP constant");
- Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C)));
- MadeChange = true;
- }
- if (match(Negatible->getOperand(1), m_APFloat(C))) {
- assert(!match(Negatible->getOperand(0), m_Constant()) &&
- "Expecting only 1 constant operand");
- assert(C->isNegative() && "Expected negative FP constant");
- Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C)));
- MadeChange = true;
- }
- }
- assert(MadeChange == true && "Negative constant candidate was not changed");
-
- // Negations cancelled out.
- if (Candidates.size() % 2 == 0)
- return I;
-
- // Negate the final operand in the expression by flipping the opcode of this
- // fadd/fsub.
- assert(Candidates.size() % 2 == 1 && "Expected odd number");
- IRBuilder<> Builder(I);
- Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I)
- : Builder.CreateFSubFMF(OtherOp, Op, I);
- I->replaceAllUsesWith(NewInst);
- RedoInsts.insert(I);
- return dyn_cast<Instruction>(NewInst);
-}
-
-/// Canonicalize expressions that contain a negative floating-point constant
-/// of the following form:
-/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree)
-/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree)
-/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree)
-///
-/// The fadd/fsub opcode may be switched to allow folding a negation into the
-/// input instruction.
-Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n');
- Value *X;
- Instruction *Op;
- if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op)))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op)))))
- if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
- I = R;
- return I;
-}
-
-/// Inspect and optimize the given instruction. Note that erasing
-/// instructions is not allowed.
-void ReassociatePass::OptimizeInst(Instruction *I) {
- // Only consider operations that we understand.
- if (!isa<UnaryOperator>(I) && !isa<BinaryOperator>(I))
- return;
-
- if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
- // If an operand of this shift is a reassociable multiply, or if the shift
- // is used by a reassociable multiply or add, turn into a multiply.
- if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
- (I->hasOneUse() &&
- (isReassociableOp(I->user_back(), Instruction::Mul) ||
- isReassociableOp(I->user_back(), Instruction::Add)))) {
- Instruction *NI = ConvertShiftToMul(I);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
-
- // Commute binary operators, to canonicalize the order of their operands.
- // This can potentially expose more CSE opportunities, and makes writing other
- // transformations simpler.
- if (I->isCommutative())
- canonicalizeOperands(I);
-
- // Canonicalize negative constants out of expressions.
- if (Instruction *Res = canonicalizeNegFPConstants(I))
- I = Res;
-
- // Don't optimize floating-point instructions unless they are 'fast'.
- if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
- return;
-
- // Do not reassociate boolean (i1) expressions. We want to preserve the
- // original order of evaluation for short-circuited comparisons that
- // SimplifyCFG has folded to AND/OR expressions. If the expression
- // is not further optimized, it is likely to be transformed back to a
- // short-circuited form for code gen, and the source order may have been
- // optimized for the most likely conditions.
- if (I->getType()->isIntegerTy(1))
- return;
-
+ // Erase the dead instruction.
+ ValueRankMap.erase(I);
+ RedoInsts.remove(I);
+ llvm::salvageDebugInfo(*I);
+ I->eraseFromParent();
+ // Optimize its operands.
+ SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (Instruction *Op = dyn_cast<Instruction>(Ops[i])) {
+ // If this is a node in an expression tree, climb to the expression root
+ // and add that since that's where optimization actually happens.
+ unsigned Opcode = Op->getOpcode();
+ while (Op->hasOneUse() && Op->user_back()->getOpcode() == Opcode &&
+ Visited.insert(Op).second)
+ Op = Op->user_back();
+
+ // The instruction we're going to push may be coming from a
+ // dead block, and Reassociate skips the processing of unreachable
+ // blocks because it's a waste of time and also because it can
+ // lead to an infinite loop due to LLVM's non-standard definition
+ // of dominance.
+ if (ValueRankMap.find(Op) != ValueRankMap.end())
+ RedoInsts.insert(Op);
+ }
+
+ MadeChange = true;
+}
+
+/// Recursively analyze an expression to build a list of instructions that have
+/// negative floating-point constant operands. The caller can then transform
+/// the list to create positive constants for better reassociation and CSE.
+static void getNegatibleInsts(Value *V,
+ SmallVectorImpl<Instruction *> &Candidates) {
+ // Handle only one-use instructions. Combining negations does not justify
+ // replicating instructions.
+ Instruction *I;
+ if (!match(V, m_OneUse(m_Instruction(I))))
+ return;
+
+ // Handle expressions of multiplications and divisions.
+ // TODO: This could look through floating-point casts.
+ const APFloat *C;
+ switch (I->getOpcode()) {
+ case Instruction::FMul:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()))
+ break;
+
+ if (match(I->getOperand(1), m_APFloat(C)) && C->isNegative()) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FMul with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ case Instruction::FDiv:
+ // Not expecting non-canonical code here. Bail out and wait.
+ if (match(I->getOperand(0), m_Constant()) &&
+ match(I->getOperand(1), m_Constant()))
+ break;
+
+ if ((match(I->getOperand(0), m_APFloat(C)) && C->isNegative()) ||
+ (match(I->getOperand(1), m_APFloat(C)) && C->isNegative())) {
+ Candidates.push_back(I);
+ LLVM_DEBUG(dbgs() << "FDiv with negative constant: " << *I << '\n');
+ }
+ getNegatibleInsts(I->getOperand(0), Candidates);
+ getNegatibleInsts(I->getOperand(1), Candidates);
+ break;
+ default:
+ break;
+ }
+}
+
+/// Given an fadd/fsub with an operand that is a one-use instruction
+/// (the fadd/fsub), try to change negative floating-point constants into
+/// positive constants to increase potential for reassociation and CSE.
+Instruction *ReassociatePass::canonicalizeNegFPConstantsForOp(Instruction *I,
+ Instruction *Op,
+ Value *OtherOp) {
+ assert((I->getOpcode() == Instruction::FAdd ||
+ I->getOpcode() == Instruction::FSub) && "Expected fadd/fsub");
+
+ // Collect instructions with negative FP constants from the subtree that ends
+ // in Op.
+ SmallVector<Instruction *, 4> Candidates;
+ getNegatibleInsts(Op, Candidates);
+ if (Candidates.empty())
+ return nullptr;
+
+ // Don't canonicalize x + (-Constant * y) -> x - (Constant * y), if the
+ // resulting subtract will be broken up later. This can get us into an
+ // infinite loop during reassociation.
+ bool IsFSub = I->getOpcode() == Instruction::FSub;
+ bool NeedsSubtract = !IsFSub && Candidates.size() % 2 == 1;
+ if (NeedsSubtract && ShouldBreakUpSubtract(I))
+ return nullptr;
+
+ for (Instruction *Negatible : Candidates) {
+ const APFloat *C;
+ if (match(Negatible->getOperand(0), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(1), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(0, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
+ if (match(Negatible->getOperand(1), m_APFloat(C))) {
+ assert(!match(Negatible->getOperand(0), m_Constant()) &&
+ "Expecting only 1 constant operand");
+ assert(C->isNegative() && "Expected negative FP constant");
+ Negatible->setOperand(1, ConstantFP::get(Negatible->getType(), abs(*C)));
+ MadeChange = true;
+ }
+ }
+ assert(MadeChange == true && "Negative constant candidate was not changed");
+
+ // Negations cancelled out.
+ if (Candidates.size() % 2 == 0)
+ return I;
+
+ // Negate the final operand in the expression by flipping the opcode of this
+ // fadd/fsub.
+ assert(Candidates.size() % 2 == 1 && "Expected odd number");
+ IRBuilder<> Builder(I);
+ Value *NewInst = IsFSub ? Builder.CreateFAddFMF(OtherOp, Op, I)
+ : Builder.CreateFSubFMF(OtherOp, Op, I);
+ I->replaceAllUsesWith(NewInst);
+ RedoInsts.insert(I);
+ return dyn_cast<Instruction>(NewInst);
+}
+
+/// Canonicalize expressions that contain a negative floating-point constant
+/// of the following form:
+/// OtherOp + (subtree) -> OtherOp {+/-} (canonical subtree)
+/// (subtree) + OtherOp -> OtherOp {+/-} (canonical subtree)
+/// OtherOp - (subtree) -> OtherOp {+/-} (canonical subtree)
+///
+/// The fadd/fsub opcode may be switched to allow folding a negation into the
+/// input instruction.
+Instruction *ReassociatePass::canonicalizeNegFPConstants(Instruction *I) {
+ LLVM_DEBUG(dbgs() << "Combine negations for: " << *I << '\n');
+ Value *X;
+ Instruction *Op;
+ if (match(I, m_FAdd(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FAdd(m_OneUse(m_Instruction(Op)), m_Value(X))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ if (match(I, m_FSub(m_Value(X), m_OneUse(m_Instruction(Op)))))
+ if (Instruction *R = canonicalizeNegFPConstantsForOp(I, Op, X))
+ I = R;
+ return I;
+}
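At the value level the canonicalization relies on x + (-c * y) == x - (c * y) (and the analogous fdiv form); a two-assert sanity check with exactly representable doubles, included only as an illustration:

#include <cassert>

int main() {
  double x = 5.0, y = 3.0, c = 2.0;
  // x + (-c * y) is rewritten as x - (c * y): the negation moves into the
  // fadd/fsub opcode so the constant itself becomes positive.
  assert(x + (-c * y) == x - (c * y));
  assert(x - (y / -c) == x + (y / c));
}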
+
+/// Inspect and optimize the given instruction. Note that erasing
+/// instructions is not allowed.
+void ReassociatePass::OptimizeInst(Instruction *I) {
+ // Only consider operations that we understand.
+ if (!isa<UnaryOperator>(I) && !isa<BinaryOperator>(I))
+ return;
+
+ if (I->getOpcode() == Instruction::Shl && isa<ConstantInt>(I->getOperand(1)))
+ // If an operand of this shift is a reassociable multiply, or if the shift
+ // is used by a reassociable multiply or add, turn into a multiply.
+ if (isReassociableOp(I->getOperand(0), Instruction::Mul) ||
+ (I->hasOneUse() &&
+ (isReassociableOp(I->user_back(), Instruction::Mul) ||
+ isReassociableOp(I->user_back(), Instruction::Add)))) {
+ Instruction *NI = ConvertShiftToMul(I);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+
+ // Commute binary operators, to canonicalize the order of their operands.
+ // This can potentially expose more CSE opportunities, and makes writing other
+ // transformations simpler.
+ if (I->isCommutative())
+ canonicalizeOperands(I);
+
+ // Canonicalize negative constants out of expressions.
+ if (Instruction *Res = canonicalizeNegFPConstants(I))
+ I = Res;
+
+ // Don't optimize floating-point instructions unless they are 'fast'.
+ if (I->getType()->isFPOrFPVectorTy() && !I->isFast())
+ return;
+
+ // Do not reassociate boolean (i1) expressions. We want to preserve the
+ // original order of evaluation for short-circuited comparisons that
+ // SimplifyCFG has folded to AND/OR expressions. If the expression
+ // is not further optimized, it is likely to be transformed back to a
+ // short-circuited form for code gen, and the source order may have been
+ // optimized for the most likely conditions.
+ if (I->getType()->isIntegerTy(1))
+ return;
+
// If this is a bitwise or instruction of operands
// with no common bits set, convert it to X+Y.
if (I->getOpcode() == Instruction::Or &&
@@ -2222,397 +2222,397 @@ void ReassociatePass::OptimizeInst(Instruction *I) {
I = NI;
}
- // If this is a subtract instruction which is not already in negate form,
- // see if we can convert it to X+-Y.
- if (I->getOpcode() == Instruction::Sub) {
- if (ShouldBreakUpSubtract(I)) {
- Instruction *NI = BreakUpSubtract(I, RedoInsts);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- } else if (match(I, m_Neg(m_Value()))) {
- // Otherwise, this is a negation. See if the operand is a multiply tree
- // and if this is not an inner node of a multiply tree.
- if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
- (!I->hasOneUse() ||
- !isReassociableOp(I->user_back(), Instruction::Mul))) {
- Instruction *NI = LowerNegateToMultiply(I);
- // If the negate was simplified, revisit the users to see if we can
- // reassociate further.
- for (User *U : NI->users()) {
- if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
- RedoInsts.insert(Tmp);
- }
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
- }
- } else if (I->getOpcode() == Instruction::FNeg ||
- I->getOpcode() == Instruction::FSub) {
- if (ShouldBreakUpSubtract(I)) {
- Instruction *NI = BreakUpSubtract(I, RedoInsts);
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- } else if (match(I, m_FNeg(m_Value()))) {
- // Otherwise, this is a negation. See if the operand is a multiply tree
- // and if this is not an inner node of a multiply tree.
- Value *Op = isa<BinaryOperator>(I) ? I->getOperand(1) :
- I->getOperand(0);
- if (isReassociableOp(Op, Instruction::FMul) &&
- (!I->hasOneUse() ||
- !isReassociableOp(I->user_back(), Instruction::FMul))) {
- // If the negate was simplified, revisit the users to see if we can
- // reassociate further.
- Instruction *NI = LowerNegateToMultiply(I);
- for (User *U : NI->users()) {
- if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
- RedoInsts.insert(Tmp);
- }
- RedoInsts.insert(I);
- MadeChange = true;
- I = NI;
- }
- }
- }
-
- // If this instruction is an associative binary operator, process it.
- if (!I->isAssociative()) return;
- BinaryOperator *BO = cast<BinaryOperator>(I);
-
- // If this is an interior node of a reassociable tree, ignore it until we
- // get to the root of the tree, to avoid N^2 analysis.
- unsigned Opcode = BO->getOpcode();
- if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
- // During the initial run we will get to the root of the tree.
- // But if we get here while we are redoing instructions, there is no
- // guarantee that the root will be visited. So redo it later.
- if (BO->user_back() != BO &&
- BO->getParent() == BO->user_back()->getParent())
- RedoInsts.insert(BO->user_back());
- return;
- }
-
- // If this is an add tree that is used by a sub instruction, ignore it
- // until we process the subtract.
- if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
- cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
- return;
- if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
- cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
- return;
-
- ReassociateExpression(BO);
-}
-
-void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
- // First, walk the expression tree, linearizing the tree, collecting the
- // operand information.
- SmallVector<RepeatedValue, 8> Tree;
- MadeChange |= LinearizeExprTree(I, Tree);
- SmallVector<ValueEntry, 8> Ops;
- Ops.reserve(Tree.size());
- for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
- RepeatedValue E = Tree[i];
- Ops.append(E.second.getZExtValue(),
- ValueEntry(getRank(E.first), E.first));
- }
-
- LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
- // Now that we have linearized the tree to a list and have gathered all of
- // the operands and their ranks, sort the operands by their rank. Use a
- // stable_sort so that values with equal ranks will have their relative
- // positions maintained (and so the compiler is deterministic). Note that
- // this sorts so that the highest ranking values end up at the beginning of
- // the vector.
- llvm::stable_sort(Ops);
-
- // Now that we have the expression tree in a convenient
- // sorted form, optimize it globally if possible.
- if (Value *V = OptimizeExpression(I, Ops)) {
- if (V == I)
- // Self-referential expression in unreachable code.
- return;
- // This expression tree simplified to something that isn't a tree,
- // eliminate it.
- LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
- I->replaceAllUsesWith(V);
- if (Instruction *VI = dyn_cast<Instruction>(V))
- if (I->getDebugLoc())
- VI->setDebugLoc(I->getDebugLoc());
- RedoInsts.insert(I);
- ++NumAnnihil;
- return;
- }
-
- // We want to sink immediates as deeply as possible except in the case where
- // this is a multiply tree used only by an add, and the immediate is a -1.
- // In this case we reassociate to put the negation on the outside so that we
- // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
- if (I->hasOneUse()) {
- if (I->getOpcode() == Instruction::Mul &&
- cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
- isa<ConstantInt>(Ops.back().Op) &&
- cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
- ValueEntry Tmp = Ops.pop_back_val();
- Ops.insert(Ops.begin(), Tmp);
- } else if (I->getOpcode() == Instruction::FMul &&
- cast<Instruction>(I->user_back())->getOpcode() ==
- Instruction::FAdd &&
- isa<ConstantFP>(Ops.back().Op) &&
- cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
- ValueEntry Tmp = Ops.pop_back_val();
- Ops.insert(Ops.begin(), Tmp);
- }
- }
-
- LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
-
- if (Ops.size() == 1) {
- if (Ops[0].Op == I)
- // Self-referential expression in unreachable code.
- return;
-
- // This expression tree simplified to something that isn't a tree,
- // eliminate it.
- I->replaceAllUsesWith(Ops[0].Op);
- if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
- OI->setDebugLoc(I->getDebugLoc());
- RedoInsts.insert(I);
- return;
- }
-
- if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
- // Find the pair with the highest count in the pairmap and move it to the
- // back of the list so that it can later be CSE'd.
- // example:
- // a*b*c*d*e
- // if c*e is the most "popular" pair, we can express this as
- // (((c*e)*d)*b)*a
- unsigned Max = 1;
- unsigned BestRank = 0;
- std::pair<unsigned, unsigned> BestPair;
- unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
- for (unsigned i = 0; i < Ops.size() - 1; ++i)
- for (unsigned j = i + 1; j < Ops.size(); ++j) {
- unsigned Score = 0;
- Value *Op0 = Ops[i].Op;
- Value *Op1 = Ops[j].Op;
- if (std::less<Value *>()(Op1, Op0))
- std::swap(Op0, Op1);
- auto it = PairMap[Idx].find({Op0, Op1});
- if (it != PairMap[Idx].end()) {
- // Functions like BreakUpSubtract() can erase the Values we're using
- // as keys and create new Values after we built the PairMap. There's a
- // small chance that the new nodes can have the same address as
- // something already in the table. We shouldn't accumulate the stored
- // score in that case as it refers to the wrong Value.
- if (it->second.isValid())
- Score += it->second.Score;
- }
-
- unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
- if (Score > Max || (Score == Max && MaxRank < BestRank)) {
- BestPair = {i, j};
- Max = Score;
- BestRank = MaxRank;
- }
- }
- if (Max > 1) {
- auto Op0 = Ops[BestPair.first];
- auto Op1 = Ops[BestPair.second];
- Ops.erase(&Ops[BestPair.second]);
- Ops.erase(&Ops[BestPair.first]);
- Ops.push_back(Op0);
- Ops.push_back(Op1);
- }
- }
- // Now that we ordered and optimized the expressions, splat them back into
- // the expression tree, removing any unneeded nodes.
- RewriteExprTree(I, Ops);
-}
-
-void
-ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
- // Make a "pairmap" of how often each operand pair occurs.
- for (BasicBlock *BI : RPOT) {
- for (Instruction &I : *BI) {
- if (!I.isAssociative())
- continue;
-
- // Ignore nodes that aren't at the root of trees.
- if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
- continue;
-
- // Collect all operands in a single reassociable expression.
- // Since Reassociate has already been run once, we can assume things
- // are already canonical according to Reassociation's regime.
- SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
- SmallVector<Value *, 8> Ops;
- while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
- Value *Op = Worklist.pop_back_val();
- Instruction *OpI = dyn_cast<Instruction>(Op);
- if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
- Ops.push_back(Op);
- continue;
- }
- // Be paranoid about self-referencing expressions in unreachable code.
- if (OpI->getOperand(0) != OpI)
- Worklist.push_back(OpI->getOperand(0));
- if (OpI->getOperand(1) != OpI)
- Worklist.push_back(OpI->getOperand(1));
- }
- // Skip extremely long expressions.
- if (Ops.size() > GlobalReassociateLimit)
- continue;
-
- // Add all pairwise combinations of operands to the pair map.
- unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
- SmallSet<std::pair<Value *, Value*>, 32> Visited;
- for (unsigned i = 0; i < Ops.size() - 1; ++i) {
- for (unsigned j = i + 1; j < Ops.size(); ++j) {
- // Canonicalize operand orderings.
- Value *Op0 = Ops[i];
- Value *Op1 = Ops[j];
- if (std::less<Value *>()(Op1, Op0))
- std::swap(Op0, Op1);
- if (!Visited.insert({Op0, Op1}).second)
- continue;
- auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, {Op0, Op1, 1}});
- if (!res.second) {
- // If either key value has been erased then we've got the same
- // address by coincidence. That can't happen here because nothing is
- // erasing values but it can happen by the time we're querying the
- // map.
- assert(res.first->second.isValid() && "WeakVH invalidated");
- ++res.first->second.Score;
- }
- }
- }
- }
- }
-}
-
-PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
- // Get the function's basic blocks in reverse post order. This order is used by
- // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic
- // blocks (it has been seen that the analysis in this pass could hang when
- // analysing dead basic blocks).
- ReversePostOrderTraversal<Function *> RPOT(&F);
-
- // Calculate the rank map for F.
- BuildRankMap(F, RPOT);
-
- // Build the pair map before running reassociate.
- // Technically this would be more accurate if we did it after one round
- // of reassociation, but in practice it doesn't seem to help much on
- // real-world code, so don't waste the compile time running reassociate
- // twice.
- // If a user wants, they could explicitly run reassociate twice in their
- // pass pipeline for further potential gains.
- // It might also be possible to update the pair map at runtime, but the
- // overhead of that may be large if there are many reassociable chains.
- BuildPairMap(RPOT);
-
- MadeChange = false;
-
- // Traverse the same blocks that were analysed by BuildRankMap.
- for (BasicBlock *BI : RPOT) {
- assert(RankMap.count(&*BI) && "BB should be ranked.");
- // Optimize every instruction in the basic block.
- for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
- if (isInstructionTriviallyDead(&*II)) {
- EraseInst(&*II++);
- } else {
- OptimizeInst(&*II);
- assert(II->getParent() == &*BI && "Moved to a different block!");
- ++II;
- }
-
- // Make a copy of all the instructions to be redone so we can remove dead
- // instructions.
- OrderedSet ToRedo(RedoInsts);
- // Iterate over all instructions to be reevaluated and remove trivially dead
- // instructions. If any operand of the trivially dead instruction becomes
- // dead mark it for deletion as well. Continue this process until all
- // trivially dead instructions have been removed.
- while (!ToRedo.empty()) {
- Instruction *I = ToRedo.pop_back_val();
- if (isInstructionTriviallyDead(I)) {
- RecursivelyEraseDeadInsts(I, ToRedo);
- MadeChange = true;
- }
- }
-
- // Now that we have removed dead instructions, we can reoptimize the
- // remaining instructions.
- while (!RedoInsts.empty()) {
- Instruction *I = RedoInsts.front();
- RedoInsts.erase(RedoInsts.begin());
- if (isInstructionTriviallyDead(I))
- EraseInst(I);
- else
- OptimizeInst(I);
- }
- }
-
- // We are done with the rank map and pair map.
- RankMap.clear();
- ValueRankMap.clear();
- for (auto &Entry : PairMap)
- Entry.clear();
-
- if (MadeChange) {
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- return PA;
- }
-
- return PreservedAnalyses::all();
-}
-
-namespace {
-
- class ReassociateLegacyPass : public FunctionPass {
- ReassociatePass Impl;
-
- public:
- static char ID; // Pass identification, replacement for typeid
-
- ReassociateLegacyPass() : FunctionPass(ID) {
- initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- FunctionAnalysisManager DummyFAM;
- auto PA = Impl.run(F, DummyFAM);
- return !PA.areAllPreserved();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- };
-
-} // end anonymous namespace
-
-char ReassociateLegacyPass::ID = 0;
-
-INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
- "Reassociate expressions", false, false)
-
-// Public interface to the Reassociate pass
-FunctionPass *llvm::createReassociatePass() {
- return new ReassociateLegacyPass();
-}
+ // If this is a subtract instruction which is not already in negate form,
+ // see if we can convert it to X+-Y.
+ if (I->getOpcode() == Instruction::Sub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (match(I, m_Neg(m_Value()))) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ if (isReassociableOp(I->getOperand(1), Instruction::Mul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::Mul))) {
+ Instruction *NI = LowerNegateToMultiply(I);
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ } else if (I->getOpcode() == Instruction::FNeg ||
+ I->getOpcode() == Instruction::FSub) {
+ if (ShouldBreakUpSubtract(I)) {
+ Instruction *NI = BreakUpSubtract(I, RedoInsts);
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ } else if (match(I, m_FNeg(m_Value()))) {
+ // Otherwise, this is a negation. See if the operand is a multiply tree
+ // and if this is not an inner node of a multiply tree.
+ Value *Op = isa<BinaryOperator>(I) ? I->getOperand(1) :
+ I->getOperand(0);
+ if (isReassociableOp(Op, Instruction::FMul) &&
+ (!I->hasOneUse() ||
+ !isReassociableOp(I->user_back(), Instruction::FMul))) {
+ // If the negate was simplified, revisit the users to see if we can
+ // reassociate further.
+ Instruction *NI = LowerNegateToMultiply(I);
+ for (User *U : NI->users()) {
+ if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+ RedoInsts.insert(Tmp);
+ }
+ RedoInsts.insert(I);
+ MadeChange = true;
+ I = NI;
+ }
+ }
+ }
+
+ // If this instruction is an associative binary operator, process it.
+ if (!I->isAssociative()) return;
+ BinaryOperator *BO = cast<BinaryOperator>(I);
+
+ // If this is an interior node of a reassociable tree, ignore it until we
+ // get to the root of the tree, to avoid N^2 analysis.
+ unsigned Opcode = BO->getOpcode();
+ if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+ // During the initial run we will get to the root of the tree.
+ // But if we get here while we are redoing instructions, there is no
+ // guarantee that the root will be visited. So redo it later.
+ if (BO->user_back() != BO &&
+ BO->getParent() == BO->user_back()->getParent())
+ RedoInsts.insert(BO->user_back());
+ return;
+ }
+
+ // If this is an add tree that is used by a sub instruction, ignore it
+ // until we process the subtract.
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::Add &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::Sub)
+ return;
+ if (BO->hasOneUse() && BO->getOpcode() == Instruction::FAdd &&
+ cast<Instruction>(BO->user_back())->getOpcode() == Instruction::FSub)
+ return;
+
+ ReassociateExpression(BO);
+}
+
+void ReassociatePass::ReassociateExpression(BinaryOperator *I) {
+ // First, walk the expression tree, linearizing the tree, collecting the
+ // operand information.
+ SmallVector<RepeatedValue, 8> Tree;
+ MadeChange |= LinearizeExprTree(I, Tree);
+ SmallVector<ValueEntry, 8> Ops;
+ Ops.reserve(Tree.size());
+ for (unsigned i = 0, e = Tree.size(); i != e; ++i) {
+ RepeatedValue E = Tree[i];
+ Ops.append(E.second.getZExtValue(),
+ ValueEntry(getRank(E.first), E.first));
+ }
+
+ LLVM_DEBUG(dbgs() << "RAIn:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ // Now that we have linearized the tree to a list and have gathered all of
+ // the operands and their ranks, sort the operands by their rank. Use a
+ // stable_sort so that values with equal ranks will have their relative
+ // positions maintained (and so the compiler is deterministic). Note that
+ // this sorts so that the highest ranking values end up at the beginning of
+ // the vector.
+ llvm::stable_sort(Ops);
+
+ // Now that we have the expression tree in a convenient
+ // sorted form, optimize it globally if possible.
+ if (Value *V = OptimizeExpression(I, Ops)) {
+ if (V == I)
+ // Self-referential expression in unreachable code.
+ return;
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ LLVM_DEBUG(dbgs() << "Reassoc to scalar: " << *V << '\n');
+ I->replaceAllUsesWith(V);
+ if (Instruction *VI = dyn_cast<Instruction>(V))
+ if (I->getDebugLoc())
+ VI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ ++NumAnnihil;
+ return;
+ }
+
+ // We want to sink immediates as deeply as possible except in the case where
+ // this is a multiply tree used only by an add, and the immediate is a -1.
+ // In this case we reassociate to put the negation on the outside so that we
+ // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+ if (I->hasOneUse()) {
+ if (I->getOpcode() == Instruction::Mul &&
+ cast<Instruction>(I->user_back())->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(Ops.back().Op) &&
+ cast<ConstantInt>(Ops.back().Op)->isMinusOne()) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ } else if (I->getOpcode() == Instruction::FMul &&
+ cast<Instruction>(I->user_back())->getOpcode() ==
+ Instruction::FAdd &&
+ isa<ConstantFP>(Ops.back().Op) &&
+ cast<ConstantFP>(Ops.back().Op)->isExactlyValue(-1.0)) {
+ ValueEntry Tmp = Ops.pop_back_val();
+ Ops.insert(Ops.begin(), Tmp);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "RAOut:\t"; PrintOps(I, Ops); dbgs() << '\n');
+
+ if (Ops.size() == 1) {
+ if (Ops[0].Op == I)
+ // Self-referential expression in unreachable code.
+ return;
+
+ // This expression tree simplified to something that isn't a tree,
+ // eliminate it.
+ I->replaceAllUsesWith(Ops[0].Op);
+ if (Instruction *OI = dyn_cast<Instruction>(Ops[0].Op))
+ OI->setDebugLoc(I->getDebugLoc());
+ RedoInsts.insert(I);
+ return;
+ }
+
+ if (Ops.size() > 2 && Ops.size() <= GlobalReassociateLimit) {
+ // Find the pair with the highest count in the pairmap and move it to the
+ // back of the list so that it can later be CSE'd.
+ // example:
+ // a*b*c*d*e
+ // if c*e is the most "popular" pair, we can express this as
+ // (((c*e)*d)*b)*a
+ unsigned Max = 1;
+ unsigned BestRank = 0;
+ std::pair<unsigned, unsigned> BestPair;
+ unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i)
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ unsigned Score = 0;
+ Value *Op0 = Ops[i].Op;
+ Value *Op1 = Ops[j].Op;
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ auto it = PairMap[Idx].find({Op0, Op1});
+ if (it != PairMap[Idx].end()) {
+ // Functions like BreakUpSubtract() can erase the Values we're using
+ // as keys and create new Values after we built the PairMap. There's a
+ // small chance that the new nodes can have the same address as
+ // something already in the table. We shouldn't accumulate the stored
+ // score in that case as it refers to the wrong Value.
+ if (it->second.isValid())
+ Score += it->second.Score;
+ }
+
+ unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
+ if (Score > Max || (Score == Max && MaxRank < BestRank)) {
+ BestPair = {i, j};
+ Max = Score;
+ BestRank = MaxRank;
+ }
+ }
+ if (Max > 1) {
+ auto Op0 = Ops[BestPair.first];
+ auto Op1 = Ops[BestPair.second];
+ Ops.erase(&Ops[BestPair.second]);
+ Ops.erase(&Ops[BestPair.first]);
+ Ops.push_back(Op0);
+ Ops.push_back(Op1);
+ }
+ }
+ // Now that we ordered and optimized the expressions, splat them back into
+ // the expression tree, removing any unneeded nodes.
+ RewriteExprTree(I, Ops);
+}
+
+void
+ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
+ // Make a "pairmap" of how often each operand pair occurs.
+ for (BasicBlock *BI : RPOT) {
+ for (Instruction &I : *BI) {
+ if (!I.isAssociative())
+ continue;
+
+ // Ignore nodes that aren't at the root of trees.
+ if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
+ continue;
+
+ // Collect all operands in a single reassociable expression.
+ // Since Reassociate has already been run once, we can assume things
+ // are already canonical according to Reassociation's regime.
+ SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
+ SmallVector<Value *, 8> Ops;
+ while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
+ Value *Op = Worklist.pop_back_val();
+ Instruction *OpI = dyn_cast<Instruction>(Op);
+ if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
+ Ops.push_back(Op);
+ continue;
+ }
+ // Be paranoid about self-referencing expressions in unreachable code.
+ if (OpI->getOperand(0) != OpI)
+ Worklist.push_back(OpI->getOperand(0));
+ if (OpI->getOperand(1) != OpI)
+ Worklist.push_back(OpI->getOperand(1));
+ }
+ // Skip extremely long expressions.
+ if (Ops.size() > GlobalReassociateLimit)
+ continue;
+
+ // Add all pairwise combinations of operands to the pair map.
+ unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
+ SmallSet<std::pair<Value *, Value*>, 32> Visited;
+ for (unsigned i = 0; i < Ops.size() - 1; ++i) {
+ for (unsigned j = i + 1; j < Ops.size(); ++j) {
+ // Canonicalize operand orderings.
+ Value *Op0 = Ops[i];
+ Value *Op1 = Ops[j];
+ if (std::less<Value *>()(Op1, Op0))
+ std::swap(Op0, Op1);
+ if (!Visited.insert({Op0, Op1}).second)
+ continue;
+ auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, {Op0, Op1, 1}});
+ if (!res.second) {
+ // If either key value has been erased then we've got the same
+ // address by coincidence. That can't happen here because nothing is
+ // erasing values but it can happen by the time we're querying the
+ // map.
+ assert(res.first->second.isValid() && "WeakVH invalidated");
+ ++res.first->second.Score;
+ }
+ }
+ }
+ }
+ }
+}
+
+PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
+ // Get the function's basic blocks in reverse post order. This order is used by
+ // BuildRankMap to pre-calculate ranks correctly. It also excludes dead basic
+ // blocks (it has been seen that the analysis in this pass could hang when
+ // analysing dead basic blocks).
+ ReversePostOrderTraversal<Function *> RPOT(&F);
+
+ // Calculate the rank map for F.
+ BuildRankMap(F, RPOT);
+
+ // Build the pair map before running reassociate.
+ // Technically this would be more accurate if we did it after one round
+ // of reassociation, but in practice it doesn't seem to help much on
+ // real-world code, so don't waste the compile time running reassociate
+ // twice.
+ // If a user wants, they could explicitly run reassociate twice in their
+ // pass pipeline for further potential gains.
+ // It might also be possible to update the pair map at runtime, but the
+ // overhead of that may be large if there are many reassociable chains.
+ BuildPairMap(RPOT);
+
+ MadeChange = false;
+
+ // Traverse the same blocks that were analysed by BuildRankMap.
+ for (BasicBlock *BI : RPOT) {
+ assert(RankMap.count(&*BI) && "BB should be ranked.");
+ // Optimize every instruction in the basic block.
+ for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
+ if (isInstructionTriviallyDead(&*II)) {
+ EraseInst(&*II++);
+ } else {
+ OptimizeInst(&*II);
+ assert(II->getParent() == &*BI && "Moved to a different block!");
+ ++II;
+ }
+
+ // Make a copy of all the instructions to be redone so we can remove dead
+ // instructions.
+ OrderedSet ToRedo(RedoInsts);
+ // Iterate over all instructions to be reevaluated and remove trivially dead
+ // instructions. If any operand of the trivially dead instruction becomes
+ // dead mark it for deletion as well. Continue this process until all
+ // trivially dead instructions have been removed.
+ while (!ToRedo.empty()) {
+ Instruction *I = ToRedo.pop_back_val();
+ if (isInstructionTriviallyDead(I)) {
+ RecursivelyEraseDeadInsts(I, ToRedo);
+ MadeChange = true;
+ }
+ }
+
+ // Now that we have removed dead instructions, we can reoptimize the
+ // remaining instructions.
+ while (!RedoInsts.empty()) {
+ Instruction *I = RedoInsts.front();
+ RedoInsts.erase(RedoInsts.begin());
+ if (isInstructionTriviallyDead(I))
+ EraseInst(I);
+ else
+ OptimizeInst(I);
+ }
+ }
+
+ // We are done with the rank map and pair map.
+ RankMap.clear();
+ ValueRankMap.clear();
+ for (auto &Entry : PairMap)
+ Entry.clear();
+
+ if (MadeChange) {
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+ class ReassociateLegacyPass : public FunctionPass {
+ ReassociatePass Impl;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+
+ ReassociateLegacyPass() : FunctionPass(ID) {
+ initializeReassociateLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ FunctionAnalysisManager DummyFAM;
+ auto PA = Impl.run(F, DummyFAM);
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ };
+
+} // end anonymous namespace
+
+char ReassociateLegacyPass::ID = 0;
+
+INITIALIZE_PASS(ReassociateLegacyPass, "reassociate",
+ "Reassociate expressions", false, false)
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() {
+ return new ReassociateLegacyPass();
+}
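As a worked illustration of the pair-map heuristic in ReassociateExpression above (a hypothetical C++ sketch, not part of the diff): if c*e is the most frequently occurring pair across the function, regrouping each product so that c*e is innermost turns it into a common subexpression that later CSE can reuse.

#include <cassert>

// The "popular" pair c*e is computed once and shared by both products.
int sharedPair(int a, int b, int c, int d, int e) {
  int ce = c * e;               // CSE'd pair
  int p1 = ((ce * d) * b) * a;  // regrouped form of a*b*c*d*e
  int p2 = ce * b;              // regrouped form of b*c*e
  return p1 + p2;
}

int main() {
  int a = 2, b = 3, c = 5, d = 7, e = 11;
  assert(sharedPair(a, b, c, d, e) == a * b * c * d * e + b * c * e);
  return 0;
}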
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
index 6d7adb2e07..a49b9ad3f6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -1,46 +1,46 @@
-//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file demotes all registers to memory references. It is intended to be
-// the inverse of PromoteMemoryToRegister. By converting to loads, the only
-// values live across basic blocks are allocas and loads before phi nodes.
-// It is intended that this should make CFG hacking much easier.
-// To make later hacking easier, the entry block is split into two, such that
-// all introduced allocas and nothing else are in the entry block.
-//
-//===----------------------------------------------------------------------===//
-
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references. It is intended to be
+// the inverse of PromoteMemoryToRegister. By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/Reg2Mem.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <list>
-using namespace llvm;
-
-#define DEBUG_TYPE "reg2mem"
-
-STATISTIC(NumRegsDemoted, "Number of registers demoted");
-STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
-
+#include "llvm/Transforms/Utils/Local.h"
+#include <list>
+using namespace llvm;
+
+#define DEBUG_TYPE "reg2mem"
+
+STATISTIC(NumRegsDemoted, "Number of registers demoted");
+STATISTIC(NumPhisDemoted, "Number of phi-nodes demoted");
+
static bool valueEscapes(const Instruction &Inst) {
const BasicBlock *BB = Inst.getParent();
for (const User *U : Inst.users()) {
@@ -49,51 +49,51 @@ static bool valueEscapes(const Instruction &Inst) {
return true;
}
return false;
-}
-
+}
+
static bool runPass(Function &F) {
- // Insert all new allocas into entry block.
- BasicBlock *BBEntry = &F.getEntryBlock();
- assert(pred_empty(BBEntry) &&
- "Entry block to function must not have predecessors!");
-
- // Find the first non-alloca instruction and create an insertion point. This
- // is safe if the block is well-formed: it always has a terminator; otherwise
- // we'll hit an assertion.
- BasicBlock::iterator I = BBEntry->begin();
- while (isa<AllocaInst>(I)) ++I;
-
- CastInst *AllocaInsertionPoint = new BitCastInst(
- Constant::getNullValue(Type::getInt32Ty(F.getContext())),
- Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
-
- // Find the escaped instructions. But don't create stack slots for
- // allocas in entry block.
- std::list<Instruction*> WorkList;
+ // Insert all new allocas into entry block.
+ BasicBlock *BBEntry = &F.getEntryBlock();
+ assert(pred_empty(BBEntry) &&
+ "Entry block to function must not have predecessors!");
+
+ // Find the first non-alloca instruction and create an insertion point. This
+ // is safe if the block is well-formed: it always has a terminator; otherwise
+ // we'll hit an assertion.
+ BasicBlock::iterator I = BBEntry->begin();
+ while (isa<AllocaInst>(I)) ++I;
+
+ CastInst *AllocaInsertionPoint = new BitCastInst(
+ Constant::getNullValue(Type::getInt32Ty(F.getContext())),
+ Type::getInt32Ty(F.getContext()), "reg2mem alloca point", &*I);
+
+ // Find the escaped instructions. But don't create stack slots for
+ // allocas in entry block.
+ std::list<Instruction*> WorkList;
for (Instruction &I : instructions(F))
if (!(isa<AllocaInst>(I) && I.getParent() == BBEntry) && valueEscapes(I))
WorkList.push_front(&I);
-
- // Demote escaped instructions
- NumRegsDemoted += WorkList.size();
+
+ // Demote escaped instructions
+ NumRegsDemoted += WorkList.size();
for (Instruction *I : WorkList)
DemoteRegToStack(*I, false, AllocaInsertionPoint);
-
- WorkList.clear();
-
- // Find all phi's
+
+ WorkList.clear();
+
+ // Find all phi's
for (BasicBlock &BB : F)
for (auto &Phi : BB.phis())
WorkList.push_front(&Phi);
-
- // Demote phi nodes
- NumPhisDemoted += WorkList.size();
+
+ // Demote phi nodes
+ NumPhisDemoted += WorkList.size();
for (Instruction *I : WorkList)
DemotePHIToStack(cast<PHINode>(I), AllocaInsertionPoint);
-
- return true;
-}
-
+
+ return true;
+}
+
PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
auto *LI = &AM.getResult<LoopAnalysis>(F);
@@ -106,7 +106,7 @@ PreservedAnalyses RegToMemPass::run(Function &F, FunctionAnalysisManager &AM) {
PA.preserve<LoopAnalysis>();
return PA;
}
-
+
namespace {
struct RegToMemLegacy : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
@@ -134,8 +134,8 @@ INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
INITIALIZE_PASS_END(RegToMemLegacy, "reg2mem",
"Demote all values to stack slots", false, false)
-// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
char &llvm::DemoteRegisterToMemoryID = RegToMemLegacy::ID;
-FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
return new RegToMemLegacy();
-}
+}
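For orientation, a minimal before/after sketch of the demotion Reg2Mem performs, written in plain C++ as an analogy only; the names are hypothetical, and the real pass of course rewrites IR values into an entry-block alloca with stores at definitions and loads at uses rather than touching source code.

#include <cassert>

// Before: the temporary "t" is live across the branch.
int beforeDemotion(int a, int b, bool p) {
  int t = a * b;
  return p ? t + 1 : t - 1;
}

// After: "t" lives in a stack slot ("slot" stands in for the alloca
// inserted in the entry block); it is stored once and reloaded at each use.
int afterDemotion(int a, int b, bool p) {
  int slot;
  slot = a * b;          // store at the definition
  if (p)
    return slot + 1;     // reload at this use
  return slot - 1;       // reload at this use
}

int main() {
  assert(beforeDemotion(6, 7, true) == afterDemotion(6, 7, true));
  assert(beforeDemotion(6, 7, false) == afterDemotion(6, 7, false));
  return 0;
}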
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index b245d1e9d1..b7830555bf 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1,1546 +1,1546 @@
-//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Rewrite call/invoke instructions so as to make potential relocations
-// performed by the garbage collector explicit in the IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <set>
-#include <string>
-#include <utility>
-#include <vector>
-
-#define DEBUG_TYPE "rewrite-statepoints-for-gc"
-
-using namespace llvm;
-
-// Print the liveset found at the insert location
-static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
- cl::init(false));
-static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
- cl::init(false));
-
-// Print out the base pointers for debugging
-static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
- cl::init(false));
-
-// Cost threshold measuring when it is profitable to rematerialize value instead
-// of relocating it
-static cl::opt<unsigned>
-RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
- cl::init(6));
-
-#ifdef EXPENSIVE_CHECKS
-static bool ClobberNonLive = true;
-#else
-static bool ClobberNonLive = false;
-#endif
-
-static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
- cl::location(ClobberNonLive),
- cl::Hidden);
-
-static cl::opt<bool>
- AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
- cl::Hidden, cl::init(true));
-
-/// The IR fed into RewriteStatepointsForGC may have had attributes and
-/// metadata implying dereferenceability that are no longer valid/correct after
-/// RewriteStatepointsForGC has run. This is because semantically, after
-/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
-/// heap. stripNonValidData (conservatively) restores
-/// correctness by erasing all attributes in the module that externally imply
-/// dereferenceability. Similar reasoning also applies to the noalias
-/// attributes and metadata. gc.statepoint can touch the entire heap including
-/// noalias objects.
-/// Apart from attributes and metadata, we also remove instructions that imply
-/// constant physical memory: llvm.invariant.start.
-static void stripNonValidData(Module &M);
-
-static bool shouldRewriteStatepointsIn(Function &F);
-
-PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
- ModuleAnalysisManager &AM) {
- bool Changed = false;
- auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- for (Function &F : M) {
- // Nothing to do for declarations.
- if (F.isDeclaration() || F.empty())
- continue;
-
- // Policy choice says not to rewrite - the most common reason is that we're
- // compiling code without a GCStrategy.
- if (!shouldRewriteStatepointsIn(F))
- continue;
-
- auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- Changed |= runOnFunction(F, DT, TTI, TLI);
- }
- if (!Changed)
- return PreservedAnalyses::all();
-
- // stripNonValidData asserts that shouldRewriteStatepointsIn
- // returns true for at least one function in the module. Since at least
- // one function changed, we know that the precondition is satisfied.
- stripNonValidData(M);
-
- PreservedAnalyses PA;
- PA.preserve<TargetIRAnalysis>();
- PA.preserve<TargetLibraryAnalysis>();
- return PA;
-}
-
-namespace {
-
-class RewriteStatepointsForGCLegacyPass : public ModulePass {
- RewriteStatepointsForGC Impl;
-
-public:
- static char ID; // Pass identification, replacement for typeid
-
- RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
- initializeRewriteStatepointsForGCLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- bool Changed = false;
- for (Function &F : M) {
- // Nothing to do for declarations.
- if (F.isDeclaration() || F.empty())
- continue;
-
- // Policy choice says not to rewrite - the most common reason is that
- // we're compiling code without a GCStrategy.
- if (!shouldRewriteStatepointsIn(F))
- continue;
-
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
-
- Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
- }
-
- if (!Changed)
- return false;
-
- // stripNonValidData asserts that shouldRewriteStatepointsIn
- // returns true for at least one function in the module. Since at least
- // one function changed, we know that the precondition is satisfied.
- stripNonValidData(M);
- return true;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- // We add and rewrite a bunch of instructions, but don't really do much
- // else. We could in theory preserve a lot more analyses here.
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char RewriteStatepointsForGCLegacyPass::ID = 0;
-
-ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
- return new RewriteStatepointsForGCLegacyPass();
-}
-
-INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
- "rewrite-statepoints-for-gc",
- "Make relocations explicit at statepoints", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
- "rewrite-statepoints-for-gc",
- "Make relocations explicit at statepoints", false, false)
-
-namespace {
-
-struct GCPtrLivenessData {
- /// Values defined in this block.
- MapVector<BasicBlock *, SetVector<Value *>> KillSet;
-
- /// Values used in this block (and thus live); does not include values
- /// killed within this block.
- MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
-
- /// Values live into this basic block (i.e. used by any
- /// instruction in this basic block or ones reachable from here)
- MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
-
- /// Values live out of this basic block (i.e. live into
- /// any successor block)
- MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
-};
-
-// The type of the internal cache used inside the findBasePointers family
- // of functions. From the caller's perspective, this is an opaque type and
-// should not be inspected.
-//
-// In the actual implementation this caches two relations:
-// - The base relation itself (i.e. this pointer is based on that one)
-// - The base defining value relation (i.e. before base_phi insertion)
-// Generally, after the execution of a full findBasePointer call, only the
-// base relation will remain. Internally, we add a mixture of the two
- // types, then update all entries of the second type to the first type.
-using DefiningValueMapTy = MapVector<Value *, Value *>;
-using StatepointLiveSetTy = SetVector<Value *>;
-using RematerializedValueMapTy =
- MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
-
-struct PartiallyConstructedSafepointRecord {
- /// The set of values known to be live across this safepoint
- StatepointLiveSetTy LiveSet;
-
- /// Mapping from live pointers to a base-defining-value
- MapVector<Value *, Value *> PointerToBase;
-
- /// The *new* gc.statepoint instruction itself. This produces the token
- /// that normal path gc.relocates and the gc.result are tied to.
- GCStatepointInst *StatepointToken;
-
- /// Instruction to which exceptional gc relocates are attached
- /// Makes it easier to iterate through them during relocationViaAlloca.
- Instruction *UnwindToken;
-
- /// Record live values we rematerialize instead of relocating.
- /// They are not included in the 'LiveSet' field.
- /// Maps each rematerialized copy to its original value.
- RematerializedValueMapTy RematerializedValues;
-};
-
-} // end anonymous namespace
-
-static ArrayRef<Use> GetDeoptBundleOperands(const CallBase *Call) {
- Optional<OperandBundleUse> DeoptBundle =
- Call->getOperandBundle(LLVMContext::OB_deopt);
-
- if (!DeoptBundle.hasValue()) {
- assert(AllowStatepointWithNoDeoptInfo &&
- "Found non-leaf call without deopt info!");
- return None;
- }
-
- return DeoptBundle.getValue().Inputs;
-}
-
-/// Compute the live-in set for every basic block in the function
-static void computeLiveInValues(DominatorTree &DT, Function &F,
- GCPtrLivenessData &Data);
-
-/// Given results from the dataflow liveness computation, find the set of live
-/// Values at a particular instruction.
-static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
- StatepointLiveSetTy &out);
-
-// TODO: Once we can get to the GCStrategy, this becomes
-// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
-
-static bool isGCPointerType(Type *T) {
- if (auto *PT = dyn_cast<PointerType>(T))
- // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
- // GC managed heap. We know that a pointer into this heap needs to be
- // updated and that no other pointer does.
- return PT->getAddressSpace() == 1;
- return false;
-}
-
-// Return true if this type is one which a) is a gc pointer or contains a GC
-// pointer and b) is of a type this code expects to encounter as a live value.
-// (The insertion code will assert that a type which matches (a) and not (b)
-// is not encountered.)
-static bool isHandledGCPointerType(Type *T) {
- // We fully support gc pointers
- if (isGCPointerType(T))
- return true;
- // We partially support vectors of gc pointers. The code will assert if it
- // can't handle something.
- if (auto VT = dyn_cast<VectorType>(T))
- if (isGCPointerType(VT->getElementType()))
- return true;
- return false;
-}
-
-#ifndef NDEBUG
-/// Returns true if this type contains a gc pointer whether we know how to
-/// handle that type or not.
-static bool containsGCPtrType(Type *Ty) {
- if (isGCPointerType(Ty))
- return true;
- if (VectorType *VT = dyn_cast<VectorType>(Ty))
- return isGCPointerType(VT->getScalarType());
- if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
- return containsGCPtrType(AT->getElementType());
- if (StructType *ST = dyn_cast<StructType>(Ty))
- return llvm::any_of(ST->elements(), containsGCPtrType);
- return false;
-}
-
-// Returns true if this is a type which a) is a gc pointer or contains a GC
-// pointer and b) is of a type which the code doesn't expect (i.e. first class
-// aggregates). Used to trip assertions.
-static bool isUnhandledGCPointerType(Type *Ty) {
- return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
-}
-#endif
-
- // Return the name of the value suffixed with the provided suffix, or, if the
- // value didn't have a name, the specified default name.
-static std::string suffixed_name_or(Value *V, StringRef Suffix,
- StringRef DefaultName) {
- return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
-}
-
-// Conservatively identifies any definitions which might be live at the
-// given instruction. The analysis is performed immediately before the
-// given instruction. Values defined by that instruction are not considered
-// live. Values used by that instruction are considered live.
-static void analyzeParsePointLiveness(
- DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData, CallBase *Call,
- PartiallyConstructedSafepointRecord &Result) {
- StatepointLiveSetTy LiveSet;
- findLiveSetAtInst(Call, OriginalLivenessData, LiveSet);
-
- if (PrintLiveSet) {
- dbgs() << "Live Variables:\n";
- for (Value *V : LiveSet)
- dbgs() << " " << V->getName() << " " << *V << "\n";
- }
- if (PrintLiveSetSize) {
- dbgs() << "Safepoint For: " << Call->getCalledOperand()->getName() << "\n";
- dbgs() << "Number live values: " << LiveSet.size() << "\n";
- }
- Result.LiveSet = LiveSet;
-}
-
- // Returns true if V is a knownBaseResult.
-static bool isKnownBaseResult(Value *V);
-
-// Returns true if V is a BaseResult that already exists in the IR, i.e. it is
-// not created by the findBasePointers algorithm.
-static bool isOriginalBaseResult(Value *V);
-
-namespace {
-
-/// A single base defining value - An immediate base defining value for an
-/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
-/// For instructions which have multiple pointer [vector] inputs or that
-/// transition between vector and scalar types, there is no immediate base
-/// defining value. The 'base defining value' for 'Def' is the transitive
-/// closure of this relation stopping at the first instruction which has no
-/// immediate base defining value. The b.d.v. might itself be a base pointer,
-/// but it can also be an arbitrary derived pointer.
-struct BaseDefiningValueResult {
- /// Contains the value which is the base defining value.
- Value * const BDV;
-
- /// True if the base defining value is also known to be an actual base
- /// pointer.
- const bool IsKnownBase;
-
- BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
- : BDV(BDV), IsKnownBase(IsKnownBase) {
-#ifndef NDEBUG
- // Check consistency between new and old means of checking whether a BDV is
- // a base.
- bool MustBeBase = isKnownBaseResult(BDV);
- assert(!MustBeBase || MustBeBase == IsKnownBase);
-#endif
- }
-};
-
-} // end anonymous namespace
-
-static BaseDefiningValueResult findBaseDefiningValue(Value *I);
-
-/// Return a base defining value for the 'Index' element of the given vector
-/// instruction 'I'. If Index is null, returns a BDV for the entire vector
-/// 'I'. As an optimization, this method will try to determine when the
-/// element is known to already be a base pointer. If this can be established,
-/// the second value in the returned pair will be true. Note that either a
-/// vector or a pointer typed value can be returned. For the former, the
-/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
- /// If the latter, the returned pointer is a BDV (or possibly a base) for the
-/// particular element in 'I'.
-static BaseDefiningValueResult
-findBaseDefiningValueOfVector(Value *I) {
- // Each case parallels findBaseDefiningValue below, see that code for
- // detailed motivation.
-
- if (isa<Argument>(I))
- // An incoming argument to the function is a base pointer
- return BaseDefiningValueResult(I, true);
-
- if (isa<Constant>(I))
- // Base of constant vector consists only of constant null pointers.
- // For reasoning see similar case inside 'findBaseDefiningValue' function.
- return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
- true);
-
- if (isa<LoadInst>(I))
- return BaseDefiningValueResult(I, true);
-
- if (isa<InsertElementInst>(I))
- // We don't know whether this vector contains entirely base pointers or
- // not. To be conservatively correct, we treat it as a BDV and will
- // duplicate code as needed to construct a parallel vector of bases.
- return BaseDefiningValueResult(I, false);
-
- if (isa<ShuffleVectorInst>(I))
- // We don't know whether this vector contains entirely base pointers or
- // not. To be conservatively correct, we treat it as a BDV and will
- // duplicate code as needed to construct a parallel vector of bases.
- // TODO: There are a number of local optimizations which could be applied here
- // for particular shufflevector patterns.
- return BaseDefiningValueResult(I, false);
-
- // The behavior of getelementptr instructions is the same for vector and
- // non-vector data types.
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- return findBaseDefiningValue(GEP->getPointerOperand());
-
- // If the pointer comes through a bitcast of a vector of pointers to
- // a vector of another type of pointer, then look through the bitcast
- if (auto *BC = dyn_cast<BitCastInst>(I))
- return findBaseDefiningValue(BC->getOperand(0));
-
- // We assume that functions in the source language only return base
- // pointers. This should probably be generalized via attributes to support
- // both source language and internal functions.
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // A PHI or Select is a base defining value. The outer findBasePointer
- // algorithm is responsible for constructing a base value for this BDV.
- assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
- "unknown vector instruction - no base found for vector element");
- return BaseDefiningValueResult(I, false);
-}
-
-/// Helper function for findBasePointer - Will return a value which either a)
-/// defines the base pointer for the input, b) blocks the simple search
-/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
-/// from pointer to vector type or back.
-static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
- assert(I->getType()->isPtrOrPtrVectorTy() &&
- "Illegal to ask for the base pointer of a non-pointer type");
-
- if (I->getType()->isVectorTy())
- return findBaseDefiningValueOfVector(I);
-
- if (isa<Argument>(I))
- // An incoming argument to the function is a base pointer
- // We should never have reached here if this argument isn't a gc value.
- return BaseDefiningValueResult(I, true);
-
- if (isa<Constant>(I)) {
- // We assume that objects with a constant base (e.g. a global) can't move
- // and don't need to be reported to the collector because they are always
- // live. Besides global references, all kinds of constants (e.g. undef,
- // constant expressions, null pointers) can be introduced by the inliner or
- // the optimizer, especially on dynamically dead paths.
- // Here we treat all of them as having a single null base. By doing this we
- // are trying to avoid problems reporting various conflicts in the form of
- // "phi (const1, const2)" or "phi (const, regular gc ptr)".
- // See constant.ll file for relevant test cases.
-
- return BaseDefiningValueResult(
- ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(I)) {
- Value *Def = CI->stripPointerCasts();
- // If stripping pointer casts changes the address space there is an
- // addrspacecast in between.
- assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
- cast<PointerType>(CI->getType())->getAddressSpace() &&
- "unsupported addrspacecast");
- // If we find a cast instruction here, it means we've found a cast which is
- // not simply a pointer cast (i.e. an inttoptr). We don't know how to
- // handle int->ptr conversion.
- assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
- return findBaseDefiningValue(Def);
- }
-
- if (isa<LoadInst>(I))
- // The value loaded is a gc base itself.
- return BaseDefiningValueResult(I, true);
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
- // The base of this GEP is the base
- return findBaseDefiningValue(GEP->getPointerOperand());
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- // fall through to general call handling
- break;
- case Intrinsic::experimental_gc_statepoint:
- llvm_unreachable("statepoints don't produce pointers");
- case Intrinsic::experimental_gc_relocate:
- // Rerunning safepoint insertion after safepoints are already
- // inserted is not supported. It could probably be made to work,
- // but why are you doing this? There's no good reason.
- llvm_unreachable("repeat safepoint insertion is not supported");
- case Intrinsic::gcroot:
- // Currently, this mechanism hasn't been extended to work with gcroot.
- // There's no reason it couldn't be, but I haven't thought about the
- // implications much.
- llvm_unreachable(
- "interaction with the gcroot mechanism is not supported");
- }
- }
- // We assume that functions in the source language only return base
- // pointers. This should probably be generalized via attributes to support
- // both source language and internal functions.
- if (isa<CallInst>(I) || isa<InvokeInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // TODO: I have absolutely no idea how to implement this part yet. It's not
- // necessarily hard, I just haven't really looked at it yet.
- assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
-
- if (isa<AtomicCmpXchgInst>(I))
- // A CAS is effectively an atomic store and load combined under a
- // predicate. From the perspective of base pointers, we just treat it
- // like a load.
- return BaseDefiningValueResult(I, true);
-
- assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
- "binary ops which don't apply to pointers");
-
- // The aggregate ops. Aggregates can either be in the heap or on the
- // stack, but in either case, this is simply a field load. As a result,
- // this is a definition of the base just like a load is.
- if (isa<ExtractValueInst>(I))
- return BaseDefiningValueResult(I, true);
-
- // We should never see an insert vector since that would require we be
- // tracing back a struct value not a pointer value.
- assert(!isa<InsertValueInst>(I) &&
- "Base pointer for a struct is meaningless");
-
- // An extractelement produces a base result exactly when its input does.
- // We may need to insert a parallel instruction to extract the appropriate
- // element out of the base vector corresponding to the input. Given this,
- // it's analogous to the phi and select case even though it's not a merge.
- if (isa<ExtractElementInst>(I))
- // Note: There are a lot of obvious peephole cases here. These are deliberately
- // handled after the main base pointer inference algorithm to make writing
- // test cases to exercise that code easier.
- return BaseDefiningValueResult(I, false);
-
- // The last two cases here don't return a base pointer. Instead, they
- // return a value which dynamically selects from among several base
- // derived pointers (each with its own base potentially). It's the job of
- // the caller to resolve these.
- assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
- "missing instruction case in findBaseDefiningValing");
- return BaseDefiningValueResult(I, false);
-}
-
-/// Returns the base defining value for this value.
-static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
- Value *&Cached = Cache[I];
- if (!Cached) {
- Cached = findBaseDefiningValue(I).BDV;
- LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
- << Cached->getName() << "\n");
- }
- assert(Cache[I] != nullptr);
- return Cached;
-}
-
-/// Return a base pointer for this value if known. Otherwise, return its
-/// base defining value.
-static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
- Value *Def = findBaseDefiningValueCached(I, Cache);
- auto Found = Cache.find(Def);
- if (Found != Cache.end()) {
- // Either a base-of relation, or a self reference. Caller must check.
- return Found->second;
- }
- // Only a BDV available
- return Def;
-}
-
-/// This value is a base pointer that is not generated by RS4GC, i.e. it already
-/// exists in the code.
-static bool isOriginalBaseResult(Value *V) {
- // no recursion possible
- return !isa<PHINode>(V) && !isa<SelectInst>(V) &&
- !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
- !isa<ShuffleVectorInst>(V);
-}
-
-/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
-/// is it known to be a base pointer? Or do we need to continue searching.
-static bool isKnownBaseResult(Value *V) {
- if (isOriginalBaseResult(V))
- return true;
- if (isa<Instruction>(V) &&
- cast<Instruction>(V)->getMetadata("is_base_value")) {
- // This is a previously inserted base phi or select. We know
- // that this is a base value.
- return true;
- }
-
- // We need to keep searching
- return false;
-}
-
-// Returns true if First and Second values are both scalar or both vector.
-static bool areBothVectorOrScalar(Value *First, Value *Second) {
- return isa<VectorType>(First->getType()) ==
- isa<VectorType>(Second->getType());
-}
-
-namespace {
-
-/// Models the state of a single base defining value in the findBasePointer
-/// algorithm for determining where a new instruction is needed to propagate
-/// the base of this BDV.
-class BDVState {
-public:
- enum Status { Unknown, Base, Conflict };
-
- BDVState() : BaseValue(nullptr) {}
-
- explicit BDVState(Status Status, Value *BaseValue = nullptr)
- : Status(Status), BaseValue(BaseValue) {
- assert(Status != Base || BaseValue);
- }
-
- explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
-
- Status getStatus() const { return Status; }
- Value *getBaseValue() const { return BaseValue; }
-
- bool isBase() const { return getStatus() == Base; }
- bool isUnknown() const { return getStatus() == Unknown; }
- bool isConflict() const { return getStatus() == Conflict; }
-
- bool operator==(const BDVState &Other) const {
- return BaseValue == Other.BaseValue && Status == Other.Status;
- }
-
- bool operator!=(const BDVState &other) const { return !(*this == other); }
-
- LLVM_DUMP_METHOD
- void dump() const {
- print(dbgs());
- dbgs() << '\n';
- }
-
- void print(raw_ostream &OS) const {
- switch (getStatus()) {
- case Unknown:
- OS << "U";
- break;
- case Base:
- OS << "B";
- break;
- case Conflict:
- OS << "C";
- break;
- }
- OS << " (" << getBaseValue() << " - "
- << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
- }
-
-private:
- Status Status = Unknown;
- AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
-};
-
-} // end anonymous namespace
-
-#ifndef NDEBUG
-static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
- State.print(OS);
- return OS;
-}
-#endif
-
-static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
- switch (LHS.getStatus()) {
- case BDVState::Unknown:
- return RHS;
-
- case BDVState::Base:
- assert(LHS.getBaseValue() && "can't be null");
- if (RHS.isUnknown())
- return LHS;
-
- if (RHS.isBase()) {
- if (LHS.getBaseValue() == RHS.getBaseValue()) {
- assert(LHS == RHS && "equality broken!");
- return LHS;
- }
- return BDVState(BDVState::Conflict);
- }
- assert(RHS.isConflict() && "only three states!");
- return BDVState(BDVState::Conflict);
-
- case BDVState::Conflict:
- return LHS;
- }
- llvm_unreachable("only three states!");
-}
-
-// Values of type BDVState form a lattice, and this function implements the meet
-// operation.
-static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
- BDVState Result = meetBDVStateImpl(LHS, RHS);
- assert(Result == meetBDVStateImpl(RHS, LHS) &&
- "Math is wrong: meet does not commute!");
- return Result;
-}
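-
-// A condensed sketch of the meet implemented above, for reference
-// (U = Unknown, B(x) = Base with value x, C = Conflict, S = any state):
-//   meet(U, S)       = S
-//   meet(B(x), B(x)) = B(x)
-//   meet(B(x), B(y)) = C      when x != y
-//   meet(C, S)       = C
-// This is the three-level lattice sketched in findBasePointer below.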
-
-/// For a given value or instruction, figure out what base ptr it's derived from.
-/// For gc objects, this is simply itself. On success, returns a value which is
-/// the base pointer. (This is reliable and can be used for relocation.) On
-/// failure, returns nullptr.
-static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
- Value *Def = findBaseOrBDV(I, Cache);
-
- if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I))
- return Def;
-
- // Here's the rough algorithm:
- // - For every SSA value, construct a mapping to either an actual base
- // pointer or a PHI which obscures the base pointer.
- // - Construct a mapping from PHI to unknown TOP state. Use an
- // optimistic algorithm to propagate base pointer information. Lattice
- // looks like:
- // UNKNOWN
- // b1 b2 b3 b4
- // CONFLICT
- // When algorithm terminates, all PHIs will either have a single concrete
- // base or be in a conflict state.
- // - For every conflict, insert a dummy PHI node without arguments. Add
- // these to the base[Instruction] = BasePtr mapping. For every
- // non-conflict, add the actual base.
- // - For every conflict, add arguments for the base[a] of each input
- // argument.
- //
- // Note: A simpler form of this would be to add the conflict form of all
- // PHIs without running the optimistic algorithm. This would be
- // analogous to pessimistic data flow and would likely lead to an
- // overall worse solution.
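- //
- // As a rough, hypothetical illustration (the IR and names below are
- // invented purely for exposition):
- //   entry:
- //     %d.a = getelementptr i8, i8 addrspace(1)* %a, i64 4
- //     br label %loop
- //   loop:
- //     %d = phi i8 addrspace(1)* [ %d.a, %entry ], [ %d.next, %loop ]
- //     %d.next = getelementptr i8, i8 addrspace(1)* %d, i64 8
- //     br i1 %cond, label %loop, label %exit
- // Neither %d nor %d.next is known to be a base, so the algorithm inserts
- //   %d.base = phi i8 addrspace(1)* [ %a, %entry ], [ %d.base, %loop ]
- // (tagged with !is_base_value metadata) and records base[%d] = %d.base.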
-
-#ifndef NDEBUG
- auto isExpectedBDVType = [](Value *BDV) {
- return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
- isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
- isa<ShuffleVectorInst>(BDV);
- };
-#endif
-
- // Once populated, will contain a mapping from each potentially non-base BDV
- // to a lattice value (described above) which corresponds to that BDV.
- // We use the order of insertion (DFS over the def/use graph) to provide a
- // stable deterministic ordering for visiting DenseMaps (which are unordered)
- // below. This is important for deterministic compilation.
- MapVector<Value *, BDVState> States;
-
- // Recursively fill in all base defining values reachable from the initial
- // one for which we don't already know a definite base value
- /* scope */ {
- SmallVector<Value*, 16> Worklist;
- Worklist.push_back(Def);
- States.insert({Def, BDVState()});
- while (!Worklist.empty()) {
- Value *Current = Worklist.pop_back_val();
- assert(!isOriginalBaseResult(Current) && "why did it get added?");
-
- auto visitIncomingValue = [&](Value *InVal) {
- Value *Base = findBaseOrBDV(InVal, Cache);
- if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal))
- // Known bases won't need new instructions introduced and can be
- // ignored safely. However, this can only be done when InVal and Base
- // are both scalar or both vector. Otherwise, we need to find a
- // correct BDV for InVal, by creating an entry in the lattice
- // (States).
- return;
- assert(isExpectedBDVType(Base) && "the only non-base values "
- "we see should be base defining values");
- if (States.insert(std::make_pair(Base, BDVState())).second)
- Worklist.push_back(Base);
- };
- if (PHINode *PN = dyn_cast<PHINode>(Current)) {
- for (Value *InVal : PN->incoming_values())
- visitIncomingValue(InVal);
- } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
- visitIncomingValue(SI->getTrueValue());
- visitIncomingValue(SI->getFalseValue());
- } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
- visitIncomingValue(EE->getVectorOperand());
- } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
- visitIncomingValue(IE->getOperand(0)); // vector operand
- visitIncomingValue(IE->getOperand(1)); // scalar operand
- } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
- visitIncomingValue(SV->getOperand(0));
- visitIncomingValue(SV->getOperand(1));
- } else {
- llvm_unreachable("Unimplemented instruction case");
- }
- }
- }
-
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "States after initialization:\n");
- for (auto Pair : States) {
- LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
- }
-#endif
-
- // Return a phi state for a base defining value. We'll generate a new
- // base state for known bases and expect to find a cached state otherwise.
- auto GetStateForBDV = [&](Value *BaseValue, Value *Input) {
- if (isKnownBaseResult(BaseValue) && areBothVectorOrScalar(BaseValue, Input))
- return BDVState(BaseValue);
- auto I = States.find(BaseValue);
- assert(I != States.end() && "lookup failed!");
- return I->second;
- };
-
- bool Progress = true;
- while (Progress) {
-#ifndef NDEBUG
- const size_t OldSize = States.size();
-#endif
- Progress = false;
- // We're only changing values in this loop, thus safe to keep iterators.
- // Since this is computing a fixed point, the order of visit does not
- // affect the result. TODO: We could use a worklist here and make this run
- // much faster.
- for (auto Pair : States) {
- Value *BDV = Pair.first;
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) ||
- !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) &&
- "why did it get added?");
-
- // Given an input value for the current instruction, return a BDVState
- // instance which represents the BDV of that value.
- auto getStateForInput = [&](Value *V) mutable {
- Value *BDV = findBaseOrBDV(V, Cache);
- return GetStateForBDV(BDV, V);
- };
-
- BDVState NewState;
- if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
- NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
- NewState =
- meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
- } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
- for (Value *Val : PN->incoming_values())
- NewState = meetBDVState(NewState, getStateForInput(Val));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
- // The 'meet' for an extractelement is essentially trivial, but it's still
- // useful in that it drives us to conflict if our input is.
- NewState =
- meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
- } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)) {
- // Given there's an inherent type mismatch between the operands, this will
- // *always* produce Conflict.
- NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
- NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
- } else {
- // The only instance this does not return a Conflict is when both the
- // vector operands are the same vector.
- auto *SV = cast<ShuffleVectorInst>(BDV);
- NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
- NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
- }
-
- BDVState OldState = States[BDV];
- if (OldState != NewState) {
- Progress = true;
- States[BDV] = NewState;
- }
- }
-
- assert(OldSize == States.size() &&
- "fixed point shouldn't be adding any new nodes to state");
- }
-
-#ifndef NDEBUG
- LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
- for (auto Pair : States) {
- LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
- }
-#endif
-
- // Handle all instructions that have a vector BDV, but the instruction itself
- // is of scalar type.
- for (auto Pair : States) {
- Instruction *I = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
- auto *BaseValue = State.getBaseValue();
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
-
- if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
- continue;
- // extractelement instructions are a bit special in that we may need to
- // insert an extract even when we know an exact base for the instruction.
- // The problem is that we need to convert from a vector base to a scalar
- // base for the particular index we're interested in.
- if (isa<ExtractElementInst>(I)) {
- auto *EE = cast<ExtractElementInst>(I);
- // TODO: In many cases, the new instruction is just EE itself. We should
- // exploit this, but can't do it here since it would break the invariant
- // about the BDV not being known to be a base.
- auto *BaseInst = ExtractElementInst::Create(
- State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
- BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
- States[I] = BDVState(BDVState::Base, BaseInst);
- } else if (!isa<VectorType>(I->getType())) {
- // We need to handle cases that have a vector base but the instruction is
- // a scalar type (these could be phis or selects or any instruction of
- // scalar type, but the base can be a vector type). We
- // conservatively set this as conflict. Setting the base value for these
- // conflicts is handled in the next loop which traverses States.
- States[I] = BDVState(BDVState::Conflict);
- }
- }
-
- // Insert Phis for all conflicts
- // TODO: adjust naming patterns to avoid this order of iteration dependency
- for (auto Pair : States) {
- Instruction *I = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
-
- // Since we're joining a vector and scalar base, they can never be the
- // same. As a result, we should always see insert element having reached
- // the conflict state.
- assert(!isa<InsertElementInst>(I) || State.isConflict());
-
- if (!State.isConflict())
- continue;
-
- /// Create and insert a new instruction which will represent the base of
- /// the given instruction 'I'.
- auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
- if (isa<PHINode>(I)) {
- BasicBlock *BB = I->getParent();
- int NumPreds = pred_size(BB);
- assert(NumPreds > 0 && "how did we reach here");
- std::string Name = suffixed_name_or(I, ".base", "base_phi");
- return PHINode::Create(I->getType(), NumPreds, Name, I);
- } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
- // The undef will be replaced later
- UndefValue *Undef = UndefValue::get(SI->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_select");
- return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_ee");
- return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
- EE);
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
- UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_ie");
- return InsertElementInst::Create(VecUndef, ScalarUndef,
- IE->getOperand(2), Name, IE);
- } else {
- auto *SV = cast<ShuffleVectorInst>(I);
- UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
- std::string Name = suffixed_name_or(I, ".base", "base_sv");
- return new ShuffleVectorInst(VecUndef, VecUndef, SV->getShuffleMask(),
- Name, SV);
- }
- };
- Instruction *BaseInst = MakeBaseInstPlaceholder(I);
- // Add metadata marking this as a base value
- BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
- States[I] = BDVState(BDVState::Conflict, BaseInst);
- }
-
- // Returns an instruction which produces the base pointer for a given
- // instruction. The instruction is assumed to be an input to one of the BDVs
- // seen in the inference algorithm above. As such, we must either already
- // know its base defining value is a base, or have inserted a new
- // instruction to propagate the base of its BDV and have entered that newly
- // introduced instruction into the state table. In either case, we are
- // assured to be able to determine an instruction which produces its base
- // pointer.
- auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
- Value *BDV = findBaseOrBDV(Input, Cache);
- Value *Base = nullptr;
- if (isKnownBaseResult(BDV) && areBothVectorOrScalar(BDV, Input)) {
- Base = BDV;
- } else {
- // Either conflict or base.
- assert(States.count(BDV));
- Base = States[BDV].getBaseValue();
- }
- assert(Base && "Can't be null");
- // The cast is needed since base traversal may strip away bitcasts
- if (Base->getType() != Input->getType() && InsertPt)
- Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
- return Base;
- };
-
- // Fix up all the inputs of the new PHIs. Visit order needs to be
- // deterministic and predictable because we're naming newly created
- // instructions.
- for (auto Pair : States) {
- Instruction *BDV = cast<Instruction>(Pair.first);
- BDVState State = Pair.second;
-
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) ||
- !areBothVectorOrScalar(BDV, State.getBaseValue())) &&
- "why did it get added?");
- assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
- if (!State.isConflict())
- continue;
-
- if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
- PHINode *PN = cast<PHINode>(BDV);
- unsigned NumPHIValues = PN->getNumIncomingValues();
- for (unsigned i = 0; i < NumPHIValues; i++) {
- Value *InVal = PN->getIncomingValue(i);
- BasicBlock *InBB = PN->getIncomingBlock(i);
-
- // If we've already seen InBB, add the same incoming value
- // we added for it earlier. The IR verifier requires phi
- // nodes with multiple entries from the same basic block
- // to have the same incoming value for each of those
- // entries. If we don't do this check here and basephi
- // has a different type than base, we'll end up adding two
- // bitcasts (and hence two distinct values) as incoming
- // values for the same basic block.
-
- int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
- if (BlockIndex != -1) {
- Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
- BasePHI->addIncoming(OldBase, InBB);
-
-#ifndef NDEBUG
- Value *Base = getBaseForInput(InVal, nullptr);
- // In essence this assert states: the only way two values
- // incoming from the same basic block may be different is by
- // being different bitcasts of the same value. A cleanup
- // that remains TODO is changing findBaseOrBDV to return an
- // llvm::Value of the correct type (and still remain pure).
- // This will remove the need to add bitcasts.
- assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
- "Sanity -- findBaseOrBDV should be pure!");
-#endif
- continue;
- }
-
- // Find the instruction which produces the base for each input. We may
- // need to insert a bitcast in the incoming block.
- // TODO: Need to split critical edges if insertion is needed
- Value *Base = getBaseForInput(InVal, InBB->getTerminator());
- BasePHI->addIncoming(Base, InBB);
- }
- assert(BasePHI->getNumIncomingValues() == NumPHIValues);
- } else if (SelectInst *BaseSI =
- dyn_cast<SelectInst>(State.getBaseValue())) {
- SelectInst *SI = cast<SelectInst>(BDV);
-
- // Find the instruction which produces the base for each input.
- // We may need to insert a bitcast.
- BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
- BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
- } else if (auto *BaseEE =
- dyn_cast<ExtractElementInst>(State.getBaseValue())) {
- Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
- // Find the instruction which produces the base for each input. We may
- // need to insert a bitcast.
- BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
- } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())) {
- auto *BdvIE = cast<InsertElementInst>(BDV);
- auto UpdateOperand = [&](int OperandIdx) {
- Value *InVal = BdvIE->getOperand(OperandIdx);
- Value *Base = getBaseForInput(InVal, BaseIE);
- BaseIE->setOperand(OperandIdx, Base);
- };
- UpdateOperand(0); // vector operand
- UpdateOperand(1); // scalar operand
- } else {
- auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
- auto *BdvSV = cast<ShuffleVectorInst>(BDV);
- auto UpdateOperand = [&](int OperandIdx) {
- Value *InVal = BdvSV->getOperand(OperandIdx);
- Value *Base = getBaseForInput(InVal, BaseSV);
- BaseSV->setOperand(OperandIdx, Base);
- };
- UpdateOperand(0); // vector operand
- UpdateOperand(1); // vector operand
- }
- }
-
- // Cache all of our results so we can cheaply reuse them
- // NOTE: This is actually two caches: one of the base defining value
- // relation and one of the base pointer relation! FIXME
- for (auto Pair : States) {
- auto *BDV = Pair.first;
- Value *Base = Pair.second.getBaseValue();
- assert(BDV && Base);
- // Only values that do not have known bases or those that have differing
- // type (scalar versus vector) from a possible known base should be in the
- // lattice.
- assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) &&
- "why did it get added?");
-
- LLVM_DEBUG(
- dbgs() << "Updating base value cache"
- << " for: " << BDV->getName() << " from: "
- << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
- << " to: " << Base->getName() << "\n");
-
- if (Cache.count(BDV)) {
- assert(isKnownBaseResult(Base) &&
- "must be something we 'know' is a base pointer");
- // Once we transition from the BDV relation being stored in the Cache to
- // the base relation being stored, it must be stable
- assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
- "base relation should be stable");
- }
- Cache[BDV] = Base;
- }
- assert(Cache.count(Def));
- return Cache[Def];
-}
-
-// For a set of live pointers (base and/or derived), identify the base
-// pointer of the object which they are derived from. This routine will
-// mutate the IR graph as needed to make the 'base' pointer live at the
-// definition site of 'derived'. This ensures that any use of 'derived' can
-// also use 'base'. This may involve the insertion of a number of
-// additional PHI nodes.
-//
-// preconditions: live is a set of pointer type Values
-//
-// side effects: may insert PHI nodes into the existing CFG, will preserve
-// CFG, will not remove or mutate any existing nodes
-//
-// post condition: PointerToBase contains one (derived, base) pair for every
-// pointer in live. Note that derived can be equal to base if the original
-// pointer was a base pointer.
-static void
-findBasePointers(const StatepointLiveSetTy &live,
- MapVector<Value *, Value *> &PointerToBase,
- DominatorTree *DT, DefiningValueMapTy &DVCache) {
- for (Value *ptr : live) {
- Value *base = findBasePointer(ptr, DVCache);
- assert(base && "failed to find base pointer");
- PointerToBase[ptr] = base;
- assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
- DT->dominates(cast<Instruction>(base)->getParent(),
- cast<Instruction>(ptr)->getParent())) &&
- "The base we found better dominate the derived pointer");
- }
-}
-
-/// Find the required base pointers (and adjust the live set) for the given
-/// parse point.
-static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &result) {
- MapVector<Value *, Value *> PointerToBase;
- findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
-
- if (PrintBasePointers) {
- errs() << "Base Pairs (w/o Relocation):\n";
- for (auto &Pair : PointerToBase) {
- errs() << " derived ";
- Pair.first->printAsOperand(errs(), false);
- errs() << " base ";
- Pair.second->printAsOperand(errs(), false);
- errs() << "\n";
- }
- }
-
- result.PointerToBase = PointerToBase;
-}
-
-/// Given an updated version of the dataflow liveness results, update the
-/// liveset and base pointer maps for the call site CS.
-static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &result);
-
-static void recomputeLiveInValues(
- Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
- MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
- // TODO-PERF: reuse the original liveness, then simply run the dataflow
- // again. The old values are still live and will help it stabilize quickly.
- GCPtrLivenessData RevisedLivenessData;
- computeLiveInValues(DT, F, RevisedLivenessData);
- for (size_t i = 0; i < records.size(); i++) {
- struct PartiallyConstructedSafepointRecord &info = records[i];
- recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
- }
-}
-
-// When inserting gc.relocate and gc.result calls, we need to ensure there are
-// no uses of the original value / return value between the gc.statepoint and
-// the gc.relocate / gc.result call. One case which can arise is a phi node
-// starting one of the successor blocks. We also need to be able to insert the
-// gc.relocates only on the path which goes through the statepoint. We might
-// need to split an edge to make this possible.
-static BasicBlock *
-normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
- DominatorTree &DT) {
- BasicBlock *Ret = BB;
- if (!BB->getUniquePredecessor())
- Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
-
- // Now that 'Ret' has a unique predecessor we can safely remove all phi nodes
- // from it
- FoldSingleEntryPHINodes(Ret);
- assert(!isa<PHINode>(Ret->begin()) &&
- "All PHI nodes should have been removed!");
-
- // At this point, we can safely insert a gc.relocate or gc.result as the first
- // instruction in Ret if needed.
- return Ret;
-}
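-
-// As a rough sketch (block names invented): if the invoke's normal dest
-// %normal has other predecessors besides the invoke block, the CFG becomes
-//   invoke ... to label %normal.split unwind label %lpad
-//   normal.split:                ; sole predecessor is the invoke block
-//     ; gc.relocates / gc.result are inserted here
-//     br label %normal
-// so the relocations only exist on the path which goes through the statepoint.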
-
-// Create a new attribute set containing only attributes which can be
-// transferred from the original call to the safepoint.
-static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
- AttributeList AL) {
- if (AL.isEmpty())
- return AL;
-
- // Remove the readonly, readnone, and statepoint function attributes.
- AttrBuilder FnAttrs = AL.getFnAttributes();
- FnAttrs.removeAttribute(Attribute::ReadNone);
- FnAttrs.removeAttribute(Attribute::ReadOnly);
- for (Attribute A : AL.getFnAttributes()) {
- if (isStatepointDirectiveAttr(A))
- FnAttrs.remove(A);
- }
-
- // Just skip parameter and return attributes for now
- return AttributeList::get(Ctx, AttributeList::FunctionIndex,
- AttributeSet::get(Ctx, FnAttrs));
-}
-
-/// Helper function to place all gc relocates necessary for the given
-/// statepoint.
-/// Inputs:
-/// liveVariables - list of variables to be relocated.
-/// basePtrs - base pointers.
-/// statepointToken - statepoint instruction to which relocates should be
-/// bound.
-/// Builder - LLVM IR builder to be used to construct new calls.
-static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
- ArrayRef<Value *> BasePtrs,
- Instruction *StatepointToken,
- IRBuilder<> &Builder) {
- if (LiveVariables.empty())
- return;
-
- auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
- auto ValIt = llvm::find(LiveVec, Val);
- assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
- size_t Index = std::distance(LiveVec.begin(), ValIt);
- assert(Index < LiveVec.size() && "Bug in std::find?");
- return Index;
- };
- Module *M = StatepointToken->getModule();
-
- // All gc_relocate calls are generated as i8 addrspace(1)* (or a vector type whose
- // element type is i8 addrspace(1)*). We originally generated unique
- // declarations for each pointer type, but this proved problematic because
- // the intrinsic mangling code is incomplete and fragile. Since we're moving
- // towards a single unified pointer type anyways, we can just cast everything
- // to an i8* of the right address space. A bitcast is added later to convert
- // gc_relocate to the actual value's type.
- auto getGCRelocateDecl = [&] (Type *Ty) {
- assert(isHandledGCPointerType(Ty));
- auto AS = Ty->getScalarType()->getPointerAddressSpace();
- Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
- if (auto *VT = dyn_cast<VectorType>(Ty))
- NewTy = FixedVectorType::get(NewTy,
- cast<FixedVectorType>(VT)->getNumElements());
- return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
- {NewTy});
- };
-
- // Lazily populated map from input types to the canonicalized form mentioned
- // in the comment above. This should probably be cached somewhere more
- // broadly.
- DenseMap<Type *, Function *> TypeToDeclMap;
-
- for (unsigned i = 0; i < LiveVariables.size(); i++) {
- // Generate the gc.relocate call and save the result
- Value *BaseIdx = Builder.getInt32(FindIndex(LiveVariables, BasePtrs[i]));
- Value *LiveIdx = Builder.getInt32(i);
-
- Type *Ty = LiveVariables[i]->getType();
- if (!TypeToDeclMap.count(Ty))
- TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
- Function *GCRelocateDecl = TypeToDeclMap[Ty];
-
- // only specify a debug name if we can give a useful one
- CallInst *Reloc = Builder.CreateCall(
- GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
- suffixed_name_or(LiveVariables[i], ".relocated", ""));
- // Trick CodeGen into thinking there are lots of free registers at this
- // fake call.
- Reloc->setCallingConv(CallingConv::Cold);
- }
-}
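-
-// For reference, each call emitted above looks roughly like (names invented,
-// exact mangling comes from getGCRelocateDecl):
-//   %p.relocated = call coldcc i8 addrspace(1)*
-//       @llvm.experimental.gc.relocate.p1i8(token %statepoint_token, i32 7, i32 9)
-// where the two i32 constants are the indices of the base and of the live
-// (derived) value, and a bitcast back to the original pointer type is added
-// later as described above.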
-
-namespace {
-
-/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
-/// avoids having to worry about keeping around dangling pointers to Values.
-class DeferredReplacement {
- AssertingVH<Instruction> Old;
- AssertingVH<Instruction> New;
- bool IsDeoptimize = false;
-
- DeferredReplacement() = default;
-
-public:
- static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
- assert(Old != New && Old && New &&
- "Cannot RAUW equal values or to / from null!");
-
- DeferredReplacement D;
- D.Old = Old;
- D.New = New;
- return D;
- }
-
- static DeferredReplacement createDelete(Instruction *ToErase) {
- DeferredReplacement D;
- D.Old = ToErase;
- return D;
- }
-
- static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
-#ifndef NDEBUG
- auto *F = cast<CallInst>(Old)->getCalledFunction();
- assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
- "Only way to construct a deoptimize deferred replacement");
-#endif
- DeferredReplacement D;
- D.Old = Old;
- D.IsDeoptimize = true;
- return D;
- }
-
- /// Does the task represented by this instance.
- void doReplacement() {
- Instruction *OldI = Old;
- Instruction *NewI = New;
-
- assert(OldI != NewI && "Disallowed at construction?!");
- assert((!IsDeoptimize || !New) &&
- "Deoptimize intrinsics are not replaced!");
-
- Old = nullptr;
- New = nullptr;
-
- if (NewI)
- OldI->replaceAllUsesWith(NewI);
-
- if (IsDeoptimize) {
- // Note: we've inserted instructions, so the call to llvm.deoptimize may
- // not necessarily be followed by the matching return.
- auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
- new UnreachableInst(RI->getContext(), RI);
- RI->eraseFromParent();
- }
-
- OldI->eraseFromParent();
- }
-};
-
-} // end anonymous namespace
-
-static StringRef getDeoptLowering(CallBase *Call) {
- const char *DeoptLowering = "deopt-lowering";
- if (Call->hasFnAttr(DeoptLowering)) {
- // FIXME: Calls have a *really* confusing interface around attributes
- // with values.
- const AttributeList &CSAS = Call->getAttributes();
- if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
- return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
- .getValueAsString();
- Function *F = Call->getCalledFunction();
- assert(F && F->hasFnAttribute(DeoptLowering));
- return F->getFnAttribute(DeoptLowering).getValueAsString();
- }
- return "live-through";
-}
-
-static void
-makeStatepointExplicitImpl(CallBase *Call, /* to replace */
- const SmallVectorImpl<Value *> &BasePtrs,
- const SmallVectorImpl<Value *> &LiveVariables,
- PartiallyConstructedSafepointRecord &Result,
- std::vector<DeferredReplacement> &Replacements) {
- assert(BasePtrs.size() == LiveVariables.size());
-
- // Then go ahead and use the builder to actually do the inserts. We insert
- // immediately before the previous instruction under the assumption that all
- // arguments will be available here. We can't insert afterwards since we may
- // be replacing a terminator.
- IRBuilder<> Builder(Call);
-
- ArrayRef<Value *> GCArgs(LiveVariables);
- uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
- uint32_t NumPatchBytes = 0;
- uint32_t Flags = uint32_t(StatepointFlags::None);
-
+//===- RewriteStatepointsForGC.cpp - Make GC relocations explicit ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Rewrite call/invoke instructions so as to make potential relocations
+// performed by the garbage collector explicit in the IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/RewriteStatepointsForGC.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+#define DEBUG_TYPE "rewrite-statepoints-for-gc"
+
+using namespace llvm;
+
+// Print the liveset found at the insert location
+static cl::opt<bool> PrintLiveSet("spp-print-liveset", cl::Hidden,
+ cl::init(false));
+static cl::opt<bool> PrintLiveSetSize("spp-print-liveset-size", cl::Hidden,
+ cl::init(false));
+
+// Print out the base pointers for debugging
+static cl::opt<bool> PrintBasePointers("spp-print-base-pointers", cl::Hidden,
+ cl::init(false));
+
+// Cost threshold measuring when it is profitable to rematerialize a value
+// instead
+// of relocating it
+static cl::opt<unsigned>
+RematerializationThreshold("spp-rematerialization-threshold", cl::Hidden,
+ cl::init(6));
+
+#ifdef EXPENSIVE_CHECKS
+static bool ClobberNonLive = true;
+#else
+static bool ClobberNonLive = false;
+#endif
+
+static cl::opt<bool, true> ClobberNonLiveOverride("rs4gc-clobber-non-live",
+ cl::location(ClobberNonLive),
+ cl::Hidden);
+
+static cl::opt<bool>
+ AllowStatepointWithNoDeoptInfo("rs4gc-allow-statepoint-with-no-deopt-info",
+ cl::Hidden, cl::init(true));
+
+/// The IR fed into RewriteStatepointsForGC may have had attributes and
+/// metadata implying dereferenceability that are no longer valid/correct after
+/// RewriteStatepointsForGC has run. This is because semantically, after
+/// RewriteStatepointsForGC runs, all calls to gc.statepoint "free" the entire
+/// heap. stripNonValidData (conservatively) restores
+/// correctness by erasing all attributes in the module that externally imply
+/// dereferenceability. Similar reasoning also applies to the noalias
+/// attributes and metadata. gc.statepoint can touch the entire heap including
+/// noalias objects.
+/// Apart from attributes and metadata, we also remove instructions that imply
+/// constant physical memory: llvm.invariant.start.
+static void stripNonValidData(Module &M);
+
+static bool shouldRewriteStatepointsIn(Function &F);
+
+PreservedAnalyses RewriteStatepointsForGC::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ bool Changed = false;
+ auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that we're
+ // compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ Changed |= runOnFunction(F, DT, TTI, TLI);
+ }
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+
+ PreservedAnalyses PA;
+ PA.preserve<TargetIRAnalysis>();
+ PA.preserve<TargetLibraryAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class RewriteStatepointsForGCLegacyPass : public ModulePass {
+ RewriteStatepointsForGC Impl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteStatepointsForGCLegacyPass() : ModulePass(ID), Impl() {
+ initializeRewriteStatepointsForGCLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ bool Changed = false;
+ for (Function &F : M) {
+ // Nothing to do for declarations.
+ if (F.isDeclaration() || F.empty())
+ continue;
+
+ // Policy choice says not to rewrite - the most common reason is that
+ // we're compiling code without a GCStrategy.
+ if (!shouldRewriteStatepointsIn(F))
+ continue;
+
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
+
+ Changed |= Impl.runOnFunction(F, DT, TTI, TLI);
+ }
+
+ if (!Changed)
+ return false;
+
+ // stripNonValidData asserts that shouldRewriteStatepointsIn
+ // returns true for at least one function in the module. Since at least
+ // one function changed, we know that the precondition is satisfied.
+ stripNonValidData(M);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // We add and rewrite a bunch of instructions, but don't really do much
+ // else. We could in theory preserve a lot more analyses here.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char RewriteStatepointsForGCLegacyPass::ID = 0;
+
+ModulePass *llvm::createRewriteStatepointsForGCLegacyPass() {
+ return new RewriteStatepointsForGCLegacyPass();
+}
+
+INITIALIZE_PASS_BEGIN(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(RewriteStatepointsForGCLegacyPass,
+ "rewrite-statepoints-for-gc",
+ "Make relocations explicit at statepoints", false, false)
+
+namespace {
+
+struct GCPtrLivenessData {
+ /// Values defined in this block.
+ MapVector<BasicBlock *, SetVector<Value *>> KillSet;
+
+ /// Values used in this block (and thus live); does not include values
+ /// killed within this block.
+ MapVector<BasicBlock *, SetVector<Value *>> LiveSet;
+
+ /// Values live into this basic block (i.e. used by any
+ /// instruction in this basic block or ones reachable from here)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveIn;
+
+ /// Values live out of this basic block (i.e. live into
+ /// any successor block)
+ MapVector<BasicBlock *, SetVector<Value *>> LiveOut;
+};
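+
+// Roughly, computeLiveInValues solves the usual backward dataflow equations
+// over these sets (a sketch, not a restatement of the exact implementation):
+//   LiveOut(BB) = union over successors S of LiveIn(S)
+//   LiveIn(BB)  = LiveSet(BB) | (LiveOut(BB) - KillSet(BB))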
+
+// The type of the internal cache used inside the findBasePointers family
+// of functions. From the caller's perspective, this is an opaque type and
+// should not be inspected.
+//
+// In the actual implementation this caches two relations:
+// - The base relation itself (i.e. this pointer is based on that one)
+// - The base defining value relation (i.e. before base_phi insertion)
+// Generally, after the execution of a full findBasePointer call, only the
+// base relation will remain. Internally, we add a mixture of the two
+// types, then update all entries of the second type to the first type
+using DefiningValueMapTy = MapVector<Value *, Value *>;
+using StatepointLiveSetTy = SetVector<Value *>;
+using RematerializedValueMapTy =
+ MapVector<AssertingVH<Instruction>, AssertingVH<Value>>;
+
+struct PartiallyConstructedSafepointRecord {
+ /// The set of values known to be live across this safepoint
+ StatepointLiveSetTy LiveSet;
+
+ /// Mapping from live pointers to a base-defining-value
+ MapVector<Value *, Value *> PointerToBase;
+
+ /// The *new* gc.statepoint instruction itself. This produces the token
+ /// that normal path gc.relocates and the gc.result are tied to.
+ GCStatepointInst *StatepointToken;
+
+ /// Instruction to which exceptional gc relocates are attached
+ /// Makes it easier to iterate through them during relocationViaAlloca.
+ Instruction *UnwindToken;
+
+ /// Record live values we rematerialize instead of relocating.
+ /// They are not included in the 'LiveSet' field.
+ /// Maps a rematerialized copy to its original value.
+ RematerializedValueMapTy RematerializedValues;
+};
+
+} // end anonymous namespace
+
+static ArrayRef<Use> GetDeoptBundleOperands(const CallBase *Call) {
+ Optional<OperandBundleUse> DeoptBundle =
+ Call->getOperandBundle(LLVMContext::OB_deopt);
+
+ if (!DeoptBundle.hasValue()) {
+ assert(AllowStatepointWithNoDeoptInfo &&
+ "Found non-leaf call without deopt info!");
+ return None;
+ }
+
+ return DeoptBundle.getValue().Inputs;
+}
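+
+// For reference, a call carrying deopt state looks roughly like
+//   call void @foo() [ "deopt"(i32 0, i64 %frame_slot) ]
+// and the Inputs returned above are the operands of that bundle (the operand
+// values here are made up for illustration).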
+
+/// Compute the live-in set for every basic block in the function
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data);
+
+/// Given results from the dataflow liveness computation, find the set of live
+/// Values at a particular instruction.
+static void findLiveSetAtInst(Instruction *inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &out);
+
+// TODO: Once we can get to the GCStrategy, this becomes
+// Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+
+static bool isGCPointerType(Type *T) {
+ if (auto *PT = dyn_cast<PointerType>(T))
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does.
+ return PT->getAddressSpace() == 1;
+ return false;
+}
+
+// Return true if this type is one which a) is a gc pointer or contains a GC
+// pointer and b) is of a type this code expects to encounter as a live value.
+// (The insertion code will assert that a type which matches (a) and not (b)
+// is not encountered.)
+static bool isHandledGCPointerType(Type *T) {
+ // We fully support gc pointers
+ if (isGCPointerType(T))
+ return true;
+ // We partially support vectors of gc pointers. The code will assert if it
+ // can't handle something.
+ if (auto VT = dyn_cast<VectorType>(T))
+ if (isGCPointerType(VT->getElementType()))
+ return true;
+ return false;
+}
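+
+// For example, i8 addrspace(1)* and <4 x i8 addrspace(1)*> are handled here,
+// while a first class aggregate such as { i8 addrspace(1)*, i64 } is not and
+// would trip the isUnhandledGCPointerType assertions below.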
+
+#ifndef NDEBUG
+/// Returns true if this type contains a gc pointer whether we know how to
+/// handle that type or not.
+static bool containsGCPtrType(Type *Ty) {
+ if (isGCPointerType(Ty))
+ return true;
+ if (VectorType *VT = dyn_cast<VectorType>(Ty))
+ return isGCPointerType(VT->getScalarType());
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty))
+ return containsGCPtrType(AT->getElementType());
+ if (StructType *ST = dyn_cast<StructType>(Ty))
+ return llvm::any_of(ST->elements(), containsGCPtrType);
+ return false;
+}
+
+// Returns true if this is a type which a) is a gc pointer or contains a GC
+// pointer and b) is of a type which the code doesn't expect (i.e. first class
+// aggregates). Used to trip assertions.
+static bool isUnhandledGCPointerType(Type *Ty) {
+ return containsGCPtrType(Ty) && !isHandledGCPointerType(Ty);
+}
+#endif
+
+// Return the name of the value suffixed with the provided suffix, or, if the
+// value didn't have a name, the specified default.
+static std::string suffixed_name_or(Value *V, StringRef Suffix,
+ StringRef DefaultName) {
+ return V->hasName() ? (V->getName() + Suffix).str() : DefaultName.str();
+}
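+
+// E.g. for a value named "obj" and Suffix ".relocated" this yields
+// "obj.relocated"; for an unnamed value it yields DefaultName.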
+
+// Conservatively identifies any definitions which might be live at the
+// given instruction. The analysis is performed immediately before the
+// given instruction. Values defined by that instruction are not considered
+// live. Values used by that instruction are considered live.
+static void analyzeParsePointLiveness(
+ DominatorTree &DT, GCPtrLivenessData &OriginalLivenessData, CallBase *Call,
+ PartiallyConstructedSafepointRecord &Result) {
+ StatepointLiveSetTy LiveSet;
+ findLiveSetAtInst(Call, OriginalLivenessData, LiveSet);
+
+ if (PrintLiveSet) {
+ dbgs() << "Live Variables:\n";
+ for (Value *V : LiveSet)
+ dbgs() << " " << V->getName() << " " << *V << "\n";
+ }
+ if (PrintLiveSetSize) {
+ dbgs() << "Safepoint For: " << Call->getCalledOperand()->getName() << "\n";
+ dbgs() << "Number live values: " << LiveSet.size() << "\n";
+ }
+ Result.LiveSet = LiveSet;
+}
+
+// Returns true if V is a knownBaseResult.
+static bool isKnownBaseResult(Value *V);
+
+// Returns true if V is a BaseResult that already exists in the IR, i.e. it is
+// not created by the findBasePointers algorithm.
+static bool isOriginalBaseResult(Value *V);
+
+namespace {
+
+/// A single base defining value - An immediate base defining value for an
+/// instruction 'Def' is an input to 'Def' whose base is also a base of 'Def'.
+/// For instructions which have multiple pointer [vector] inputs or that
+/// transition between vector and scalar types, there is no immediate base
+/// defining value. The 'base defining value' for 'Def' is the transitive
+/// closure of this relation stopping at the first instruction which has no
+/// immediate base defining value. The b.d.v. might itself be a base pointer,
+/// but it can also be an arbitrary derived pointer.
+struct BaseDefiningValueResult {
+ /// Contains the value which is the base defining value.
+ Value * const BDV;
+
+ /// True if the base defining value is also known to be an actual base
+ /// pointer.
+ const bool IsKnownBase;
+
+ BaseDefiningValueResult(Value *BDV, bool IsKnownBase)
+ : BDV(BDV), IsKnownBase(IsKnownBase) {
+#ifndef NDEBUG
+ // Check consistency between new and old means of checking whether a BDV is
+ // a base.
+ bool MustBeBase = isKnownBaseResult(BDV);
+ assert(!MustBeBase || MustBeBase == IsKnownBase);
+#endif
+ }
+};
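+
+// As a hypothetical example: for
+//   %d = getelementptr i8, i8 addrspace(1)* %p, i64 16
+// the BDV of %d is %p, and IsKnownBase is true when %p is e.g. a function
+// argument; for a gep off a phi of two such pointers, the BDV is the phi
+// itself and IsKnownBase is false.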
+
+} // end anonymous namespace
+
+static BaseDefiningValueResult findBaseDefiningValue(Value *I);
+
+/// Return a base defining value for the 'Index' element of the given vector
+/// instruction 'I'. If Index is null, returns a BDV for the entire vector
+/// 'I'. As an optimization, this method will try to determine when the
+/// element is known to already be a base pointer. If this can be established,
+/// the second value in the returned pair will be true. Note that either a
+/// vector or a pointer typed value can be returned. For the former, the
+/// vector returned is a BDV (and possibly a base) of the entire vector 'I'.
+/// If the latter, the returned pointer is a BDV (or possibly a base) for the
+/// particular element in 'I'.
+static BaseDefiningValueResult
+findBaseDefiningValueOfVector(Value *I) {
+ // Each case parallels findBaseDefiningValue below, see that code for
+ // detailed motivation.
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I))
+ // Base of constant vector consists only of constant null pointers.
+ // For reasoning see similar case inside 'findBaseDefiningValue' function.
+ return BaseDefiningValueResult(ConstantAggregateZero::get(I->getType()),
+ true);
+
+ if (isa<LoadInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<InsertElementInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ return BaseDefiningValueResult(I, false);
+
+ if (isa<ShuffleVectorInst>(I))
+ // We don't know whether this vector contains entirely base pointers or
+ // not. To be conservatively correct, we treat it as a BDV and will
+ // duplicate code as needed to construct a parallel vector of bases.
+ // TODO: There are a number of local optimizations which could be applied here
+ // for particular shufflevector patterns.
+ return BaseDefiningValueResult(I, false);
+
+ // The behavior of getelementptr instructions is the same for vector and
+ // non-vector data types.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ // If the pointer comes through a bitcast of a vector of pointers to
+ // a vector of another type of pointer, then look through the bitcast
+ if (auto *BC = dyn_cast<BitCastInst>(I))
+ return findBaseDefiningValue(BC->getOperand(0));
+
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // A PHI or Select is a base defining value. The outer findBasePointer
+ // algorithm is responsible for constructing a base value for this BDV.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "unknown vector instruction - no base found for vector element");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Helper function for findBasePointer - Will return a value which either a)
+/// defines the base pointer for the input, b) blocks the simple search
+/// (i.e. a PHI or Select of two derived pointers), or c) involves a change
+/// from pointer to vector type or back.
+static BaseDefiningValueResult findBaseDefiningValue(Value *I) {
+ assert(I->getType()->isPtrOrPtrVectorTy() &&
+ "Illegal to ask for the base pointer of a non-pointer type");
+
+ if (I->getType()->isVectorTy())
+ return findBaseDefiningValueOfVector(I);
+
+ if (isa<Argument>(I))
+ // An incoming argument to the function is a base pointer
+ // We should never have reached here if this argument isn't a gc value
+ return BaseDefiningValueResult(I, true);
+
+ if (isa<Constant>(I)) {
+ // We assume that objects with a constant base (e.g. a global) can't move
+ // and don't need to be reported to the collector because they are always
+ // live. Besides global references, all kinds of constants (e.g. undef,
+ // constant expressions, null pointers) can be introduced by the inliner or
+ // the optimizer, especially on dynamically dead paths.
+ // Here we treat all of them as having a single null base. By doing this we
+ // try to avoid problems reporting various conflicts in the form of
+ // "phi (const1, const2)" or "phi (const, regular gc ptr)".
+ // See constant.ll file for relevant test cases.
+
+ return BaseDefiningValueResult(
+ ConstantPointerNull::get(cast<PointerType>(I->getType())), true);
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ Value *Def = CI->stripPointerCasts();
+ // If stripping pointer casts changes the address space there is an
+ // addrspacecast in between.
+ assert(cast<PointerType>(Def->getType())->getAddressSpace() ==
+ cast<PointerType>(CI->getType())->getAddressSpace() &&
+ "unsupported addrspacecast");
+ // If we find a cast instruction here, it means we've found a cast which is
+ // not simply a pointer cast (i.e. an inttoptr). We don't know how to
+ // handle int->ptr conversion.
+ assert(!isa<CastInst>(Def) && "shouldn't find another cast here");
+ return findBaseDefiningValue(Def);
+ }
+
+ if (isa<LoadInst>(I))
+ // The value loaded is a gc base itself
+ return BaseDefiningValueResult(I, true);
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I))
+ // The base of this GEP is the base
+ return findBaseDefiningValue(GEP->getPointerOperand());
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ // fall through to general call handling
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ llvm_unreachable("statepoints don't produce pointers");
+ case Intrinsic::experimental_gc_relocate:
+ // Rerunning safepoint insertion after safepoints are already
+ // inserted is not supported. It could probably be made to work,
+ // but why are you doing this? There's no good reason.
+ llvm_unreachable("repeat safepoint insertion is not supported");
+ case Intrinsic::gcroot:
+ // Currently, this mechanism hasn't been extended to work with gcroot.
+ // There's no reason it couldn't be, but I haven't thought about the
+ // implications much.
+ llvm_unreachable(
+ "interaction with the gcroot mechanism is not supported");
+ }
+ }
+ // We assume that functions in the source language only return base
+ // pointers. This should probably be generalized via attributes to support
+ // both source language and internal functions.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // TODO: I have absolutely no idea how to implement this part yet. It's not
+ // necessarily hard, I just haven't really looked at it yet.
+ assert(!isa<LandingPadInst>(I) && "Landing Pad is unimplemented");
+
+ if (isa<AtomicCmpXchgInst>(I))
+ // A CAS is effectively an atomic store and load combined under a
+ // predicate. From the perspective of base pointers, we just treat it
+ // like a load.
+ return BaseDefiningValueResult(I, true);
+
+ assert(!isa<AtomicRMWInst>(I) && "Xchg handled above, all others are "
+ "binary ops which don't apply to pointers");
+
+ // The aggregate ops. Aggregates can either be in the heap or on the
+ // stack, but in either case, this is simply a field load. As a result,
+ // this defines the base just like a load does.
+ if (isa<ExtractValueInst>(I))
+ return BaseDefiningValueResult(I, true);
+
+ // We should never see an insert vector since that would require we be
+ // tracing back a struct value not a pointer value.
+ assert(!isa<InsertValueInst>(I) &&
+ "Base pointer for a struct is meaningless");
+
+ // An extractelement produces a base result exactly when its input does.
+ // We may need to insert a parallel instruction to extract the appropriate
+ // element out of the base vector corresponding to the input. Given this,
+ // it's analogous to the phi and select case even though it's not a merge.
+ if (isa<ExtractElementInst>(I))
+ // Note: There are a lot of obvious peephole cases here. These are deliberately
+ // handled after the main base pointer inference algorithm to make writing
+ // test cases to exercise that code easier.
+ return BaseDefiningValueResult(I, false);
+
+ // The last two cases here don't return a base pointer. Instead, they
+ // return a value which dynamically selects from among several base
+ // derived pointers (each potentially with its own base). It's the job of
+ // the caller to resolve these.
+ assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
+ "missing instruction case in findBaseDefiningValing");
+ return BaseDefiningValueResult(I, false);
+}
+
+/// Returns the base defining value for this value.
+static Value *findBaseDefiningValueCached(Value *I, DefiningValueMapTy &Cache) {
+ Value *&Cached = Cache[I];
+ if (!Cached) {
+ Cached = findBaseDefiningValue(I).BDV;
+ LLVM_DEBUG(dbgs() << "fBDV-cached: " << I->getName() << " -> "
+ << Cached->getName() << "\n");
+ }
+ assert(Cache[I] != nullptr);
+ return Cached;
+}
+
+/// Return a base pointer for this value if known. Otherwise, return its
+/// base defining value.
+static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseDefiningValueCached(I, Cache);
+ auto Found = Cache.find(Def);
+ if (Found != Cache.end()) {
+ // Either a base-of relation, or a self reference. Caller must check.
+ return Found->second;
+ }
+ // Only a BDV available
+ return Def;
+}
+
+/// This value is a base pointer that is not generated by RS4GC, i.e. it already
+/// exists in the code.
+static bool isOriginalBaseResult(Value *V) {
+ // no recursion possible
+ return !isa<PHINode>(V) && !isa<SelectInst>(V) &&
+ !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+ !isa<ShuffleVectorInst>(V);
+}
+
+/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
+/// is it known to be a base pointer? Or do we need to continue searching.
+static bool isKnownBaseResult(Value *V) {
+ if (isOriginalBaseResult(V))
+ return true;
+ if (isa<Instruction>(V) &&
+ cast<Instruction>(V)->getMetadata("is_base_value")) {
+ // This is a previously inserted base phi or select. We know
+ // that this is a base value.
+ return true;
+ }
+
+ // We need to keep searching
+ return false;
+}
+
+// Returns true if First and Second values are both scalar or both vector.
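+// (Used below when deciding whether a known base can be taken as-is: e.g. a
+// vector BDV paired with a scalar derived value must still get a lattice
+// entry even if the vector itself is already a known base.)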
+static bool areBothVectorOrScalar(Value *First, Value *Second) {
+ return isa<VectorType>(First->getType()) ==
+ isa<VectorType>(Second->getType());
+}
+
+namespace {
+
+/// Models the state of a single base defining value in the findBasePointer
+/// algorithm for determining where a new instruction is needed to propagate
+/// the base of this BDV.
+class BDVState {
+public:
+ enum Status { Unknown, Base, Conflict };
+
+ BDVState() : BaseValue(nullptr) {}
+
+ explicit BDVState(Status Status, Value *BaseValue = nullptr)
+ : Status(Status), BaseValue(BaseValue) {
+ assert(Status != Base || BaseValue);
+ }
+
+ explicit BDVState(Value *BaseValue) : Status(Base), BaseValue(BaseValue) {}
+
+ Status getStatus() const { return Status; }
+ Value *getBaseValue() const { return BaseValue; }
+
+ bool isBase() const { return getStatus() == Base; }
+ bool isUnknown() const { return getStatus() == Unknown; }
+ bool isConflict() const { return getStatus() == Conflict; }
+
+ bool operator==(const BDVState &Other) const {
+ return BaseValue == Other.BaseValue && Status == Other.Status;
+ }
+
+ bool operator!=(const BDVState &other) const { return !(*this == other); }
+
+ LLVM_DUMP_METHOD
+ void dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+ }
+
+ void print(raw_ostream &OS) const {
+ switch (getStatus()) {
+ case Unknown:
+ OS << "U";
+ break;
+ case Base:
+ OS << "B";
+ break;
+ case Conflict:
+ OS << "C";
+ break;
+ }
+ OS << " (" << getBaseValue() << " - "
+ << (getBaseValue() ? getBaseValue()->getName() : "nullptr") << "): ";
+ }
+
+private:
+ Status Status = Unknown;
+ AssertingVH<Value> BaseValue; // Non-null only if Status == Base.
+};
+
+} // end anonymous namespace
+
+#ifndef NDEBUG
+static raw_ostream &operator<<(raw_ostream &OS, const BDVState &State) {
+ State.print(OS);
+ return OS;
+}
+#endif
+
+static BDVState meetBDVStateImpl(const BDVState &LHS, const BDVState &RHS) {
+ switch (LHS.getStatus()) {
+ case BDVState::Unknown:
+ return RHS;
+
+ case BDVState::Base:
+ assert(LHS.getBaseValue() && "can't be null");
+ if (RHS.isUnknown())
+ return LHS;
+
+ if (RHS.isBase()) {
+ if (LHS.getBaseValue() == RHS.getBaseValue()) {
+ assert(LHS == RHS && "equality broken!");
+ return LHS;
+ }
+ return BDVState(BDVState::Conflict);
+ }
+ assert(RHS.isConflict() && "only three states!");
+ return BDVState(BDVState::Conflict);
+
+ case BDVState::Conflict:
+ return LHS;
+ }
+ llvm_unreachable("only three states!");
+}
+
+// Values of type BDVState form a lattice, and this function implements the meet
+// operation.
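+// As a rough illustration: meet(Unknown, X) == X, meet(Base(b), Base(b)) ==
+// Base(b), meet(Base(b1), Base(b2)) == Conflict for b1 != b2, and Conflict
+// absorbs everything else.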
+static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
+ BDVState Result = meetBDVStateImpl(LHS, RHS);
+ assert(Result == meetBDVStateImpl(RHS, LHS) &&
+ "Math is wrong: meet does not commute!");
+ return Result;
+}
+
+/// For a given value or instruction, figure out what base ptr its derived from.
+/// For gc objects, this is simply itself. On success, returns a value which is
+/// the base pointer. (This is reliable and can be used for relocation.) On
+/// failure, returns nullptr.
+static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
+ Value *Def = findBaseOrBDV(I, Cache);
+
+ if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I))
+ return Def;
+
+ // Here's the rough algorithm:
+ // - For every SSA value, construct a mapping to either an actual base
+ // pointer or a PHI which obscures the base pointer.
+ // - Construct a mapping from PHI to unknown TOP state. Use an
+ // optimistic algorithm to propagate base pointer information. Lattice
+ // looks like:
+ // UNKNOWN
+ // b1 b2 b3 b4
+ // CONFLICT
+ // When algorithm terminates, all PHIs will either have a single concrete
+ // base or be in a conflict state.
+ // - For every conflict, insert a dummy PHI node without arguments. Add
+ // these to the base[Instruction] = BasePtr mapping. For every
+ // non-conflict, add the actual base.
+ // - For every conflict, add arguments for the base[a] of each input
+ // arguments.
+ //
+ // Note: A simpler form of this would be to add the conflict form of all
+ // PHIs without running the optimistic algorithm. This would be
+ // analogous to pessimistic data flow and would likely lead to an
+ // overall worse solution.
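+ //
+ // As a small illustrative example: for
+ //   %merge = phi i8 addrspace(1)* [ %gep1, %left ], [ %gep2, %right ]
+ // where %gep1 and %gep2 derive from distinct bases %b1 and %b2, the lattice
+ // drives %merge to Conflict and we materialize a parallel
+ //   %merge.base = phi i8 addrspace(1)* [ %b1, %left ], [ %b2, %right ]
+ // tagged with !is_base_value metadata.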
+
+#ifndef NDEBUG
+ auto isExpectedBDVType = [](Value *BDV) {
+ return isa<PHINode>(BDV) || isa<SelectInst>(BDV) ||
+ isa<ExtractElementInst>(BDV) || isa<InsertElementInst>(BDV) ||
+ isa<ShuffleVectorInst>(BDV);
+ };
+#endif
+
+ // Once populated, will contain a mapping from each potentially non-base BDV
+ // to a lattice value (described above) which corresponds to that BDV.
+ // We use the order of insertion (DFS over the def/use graph) to provide a
+ // stable deterministic ordering for visiting DenseMaps (which are unordered)
+ // below. This is important for deterministic compilation.
+ MapVector<Value *, BDVState> States;
+
+ // Recursively fill in all base defining values reachable from the initial
+ // one for which we don't already know a definite base value for
+ /* scope */ {
+ SmallVector<Value*, 16> Worklist;
+ Worklist.push_back(Def);
+ States.insert({Def, BDVState()});
+ while (!Worklist.empty()) {
+ Value *Current = Worklist.pop_back_val();
+ assert(!isOriginalBaseResult(Current) && "why did it get added?");
+
+ auto visitIncomingValue = [&](Value *InVal) {
+ Value *Base = findBaseOrBDV(InVal, Cache);
+ if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal))
+ // Known bases won't need new instructions introduced and can be
+ // ignored safely. However, this can only be done when InVal and Base
+ // are both scalar or both vector. Otherwise, we need to find a
+ // correct BDV for InVal, by creating an entry in the lattice
+ // (States).
+ return;
+ assert(isExpectedBDVType(Base) && "the only non-base values "
+ "we see should be base defining values");
+ if (States.insert(std::make_pair(Base, BDVState())).second)
+ Worklist.push_back(Base);
+ };
+ if (PHINode *PN = dyn_cast<PHINode>(Current)) {
+ for (Value *InVal : PN->incoming_values())
+ visitIncomingValue(InVal);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(Current)) {
+ visitIncomingValue(SI->getTrueValue());
+ visitIncomingValue(SI->getFalseValue());
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(Current)) {
+ visitIncomingValue(EE->getVectorOperand());
+ } else if (auto *IE = dyn_cast<InsertElementInst>(Current)) {
+ visitIncomingValue(IE->getOperand(0)); // vector operand
+ visitIncomingValue(IE->getOperand(1)); // scalar operand
+ } else if (auto *SV = dyn_cast<ShuffleVectorInst>(Current)) {
+ visitIncomingValue(SV->getOperand(0));
+ visitIncomingValue(SV->getOperand(1));
+ } else {
+ llvm_unreachable("Unimplemented instruction case");
+ }
+ }
+ }
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "States after initialization:\n");
+ for (auto Pair : States) {
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Return a phi state for a base defining value. We'll generate a new
+ // base state for known bases and expect to find a cached state otherwise.
+ auto GetStateForBDV = [&](Value *BaseValue, Value *Input) {
+ if (isKnownBaseResult(BaseValue) && areBothVectorOrScalar(BaseValue, Input))
+ return BDVState(BaseValue);
+ auto I = States.find(BaseValue);
+ assert(I != States.end() && "lookup failed!");
+ return I->second;
+ };
+
+ bool Progress = true;
+ while (Progress) {
+#ifndef NDEBUG
+ const size_t OldSize = States.size();
+#endif
+ Progress = false;
+ // We're only changing values in this loop, thus safe to keep iterators.
+ // Since this is computing a fixed point, the order of visit does not
+ // effect the result. TODO: We could use a worklist here and make this run
+ // much faster.
+ for (auto Pair : States) {
+ Value *BDV = Pair.first;
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) &&
+ "why did it get added?");
+
+ // Given an input value for the current instruction, return a BDVState
+ // instance which represents the BDV of that value.
+ auto getStateForInput = [&](Value *V) mutable {
+ Value *BDV = findBaseOrBDV(V, Cache);
+ return GetStateForBDV(BDV, V);
+ };
+
+ BDVState NewState;
+ if (SelectInst *SI = dyn_cast<SelectInst>(BDV)) {
+ NewState = meetBDVState(NewState, getStateForInput(SI->getTrueValue()));
+ NewState =
+ meetBDVState(NewState, getStateForInput(SI->getFalseValue()));
+ } else if (PHINode *PN = dyn_cast<PHINode>(BDV)) {
+ for (Value *Val : PN->incoming_values())
+ NewState = meetBDVState(NewState, getStateForInput(Val));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(BDV)) {
+ // The 'meet' for an extractelement is slightly trivial, but it's still
+ // useful in that it drives us to conflict if our input is.
+ NewState =
+ meetBDVState(NewState, getStateForInput(EE->getVectorOperand()));
+ } else if (auto *IE = dyn_cast<InsertElementInst>(BDV)) {
+ // Given there's an inherent type mismatch between the operands, this will
+ // *always* produce Conflict.
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(IE->getOperand(1)));
+ } else {
+ // The only instance this does not return a Conflict is when both the
+ // vector operands are the same vector.
+ auto *SV = cast<ShuffleVectorInst>(BDV);
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(0)));
+ NewState = meetBDVState(NewState, getStateForInput(SV->getOperand(1)));
+ }
+
+ BDVState OldState = States[BDV];
+ if (OldState != NewState) {
+ Progress = true;
+ States[BDV] = NewState;
+ }
+ }
+
+ assert(OldSize == States.size() &&
+ "fixed point shouldn't be adding any new nodes to state");
+ }
+
+#ifndef NDEBUG
+ LLVM_DEBUG(dbgs() << "States after meet iteration:\n");
+ for (auto Pair : States) {
+ LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
+ }
+#endif
+
+ // Handle all instructions that have a vector BDV, but the instruction itself
+ // is of scalar type.
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ auto *BaseValue = State.getBaseValue();
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
+ continue;
+ // extractelement instructions are a bit special in that we may need to
+ // insert an extract even when we know an exact base for the instruction.
+ // The problem is that we need to convert from a vector base to a scalar
+ // base for the particular index we're interested in.
+ if (isa<ExtractElementInst>(I)) {
+ auto *EE = cast<ExtractElementInst>(I);
+ // TODO: In many cases, the new instruction is just EE itself. We should
+ // exploit this, but can't do it here since it would break the invariant
+ // about the BDV not being known to be a base.
+ auto *BaseInst = ExtractElementInst::Create(
+ State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Base, BaseInst);
+ } else if (!isa<VectorType>(I->getType())) {
+ // We need to handle cases that have a vector base but the instruction is
+ // of scalar type (these could be phis or selects or any instruction of
+ // scalar type whose base can be a vector type). We
+ // conservatively set this as conflict. Setting the base value for these
+ // conflicts is handled in the next loop which traverses States.
+ States[I] = BDVState(BDVState::Conflict);
+ }
+ }
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+
+ // Since we're joining a vector and scalar base, they can never be the
+ // same. As a result, we should always see insert element having reached
+ // the conflict state.
+ assert(!isa<InsertElementInst>(I) || State.isConflict());
+
+ if (!State.isConflict())
+ continue;
+
+ /// Create and insert a new instruction which will represent the base of
+ /// the given instruction 'I'.
+ auto MakeBaseInstPlaceholder = [](Instruction *I) -> Instruction* {
+ if (isa<PHINode>(I)) {
+ BasicBlock *BB = I->getParent();
+ int NumPreds = pred_size(BB);
+ assert(NumPreds > 0 && "how did we reach here");
+ std::string Name = suffixed_name_or(I, ".base", "base_phi");
+ return PHINode::Create(I->getType(), NumPreds, Name, I);
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+ // The undef will be replaced later
+ UndefValue *Undef = UndefValue::get(SI->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_select");
+ return SelectInst::Create(SI->getCondition(), Undef, Undef, Name, SI);
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ UndefValue *Undef = UndefValue::get(EE->getVectorOperand()->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ee");
+ return ExtractElementInst::Create(Undef, EE->getIndexOperand(), Name,
+ EE);
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ UndefValue *VecUndef = UndefValue::get(IE->getOperand(0)->getType());
+ UndefValue *ScalarUndef = UndefValue::get(IE->getOperand(1)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_ie");
+ return InsertElementInst::Create(VecUndef, ScalarUndef,
+ IE->getOperand(2), Name, IE);
+ } else {
+ auto *SV = cast<ShuffleVectorInst>(I);
+ UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
+ std::string Name = suffixed_name_or(I, ".base", "base_sv");
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getShuffleMask(),
+ Name, SV);
+ }
+ };
+ Instruction *BaseInst = MakeBaseInstPlaceholder(I);
+ // Add metadata marking this as a base value
+ BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
+ States[I] = BDVState(BDVState::Conflict, BaseInst);
+ }
+
+ // Returns an instruction which produces the base pointer for a given
+ // instruction. The instruction is assumed to be an input to one of the BDVs
+ // seen in the inference algorithm above. As such, we must either already
+ // know its base defining value is a base, or have inserted a new
+ // instruction to propagate the base of its BDV and have entered that newly
+ // introduced instruction into the state table. In either case, we are
+ // assured to be able to determine an instruction which produces its base
+ // pointer.
+ auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
+ Value *BDV = findBaseOrBDV(Input, Cache);
+ Value *Base = nullptr;
+ if (isKnownBaseResult(BDV) && areBothVectorOrScalar(BDV, Input)) {
+ Base = BDV;
+ } else {
+ // Either conflict or base.
+ assert(States.count(BDV));
+ Base = States[BDV].getBaseValue();
+ }
+ assert(Base && "Can't be null");
+ // The cast is needed since base traversal may strip away bitcasts
+ if (Base->getType() != Input->getType() && InsertPt)
+ Base = new BitCastInst(Base, Input->getType(), "cast", InsertPt);
+ return Base;
+ };
+
+ // Fixup all the inputs of the new PHIs. Visit order needs to be
+ // deterministic and predictable because we're naming newly created
+ // instructions.
+ for (auto Pair : States) {
+ Instruction *BDV = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, State.getBaseValue())) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isConflict())
+ continue;
+
+ if (PHINode *BasePHI = dyn_cast<PHINode>(State.getBaseValue())) {
+ PHINode *PN = cast<PHINode>(BDV);
+ unsigned NumPHIValues = PN->getNumIncomingValues();
+ for (unsigned i = 0; i < NumPHIValues; i++) {
+ Value *InVal = PN->getIncomingValue(i);
+ BasicBlock *InBB = PN->getIncomingBlock(i);
+
+ // If we've already seen InBB, add the same incoming value
+ // we added for it earlier. The IR verifier requires phi
+ // nodes with multiple entries from the same basic block
+ // to have the same incoming value for each of those
+ // entries. If we don't do this check here and basephi
+ // has a different type than base, we'll end up adding two
+ // bitcasts (and hence two distinct values) as incoming
+ // values for the same basic block.
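+ // (E.g. a switch with two cases branching to the same successor yields a
+ // phi with two entries for that block; the base phi must then reuse the
+ // exact same incoming value for both entries.)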
+
+ int BlockIndex = BasePHI->getBasicBlockIndex(InBB);
+ if (BlockIndex != -1) {
+ Value *OldBase = BasePHI->getIncomingValue(BlockIndex);
+ BasePHI->addIncoming(OldBase, InBB);
+
+#ifndef NDEBUG
+ Value *Base = getBaseForInput(InVal, nullptr);
+ // In essence this assert states: the only way two values
+ // incoming from the same basic block may be different is by
+ // being different bitcasts of the same value. A cleanup
+ // that remains TODO is changing findBaseOrBDV to return an
+ // llvm::Value of the correct type (and still remain pure).
+ // This will remove the need to add bitcasts.
+ assert(Base->stripPointerCasts() == OldBase->stripPointerCasts() &&
+ "Sanity -- findBaseOrBDV should be pure!");
+#endif
+ continue;
+ }
+
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast in the incoming block.
+ // TODO: Need to split critical edges if insertion is needed
+ Value *Base = getBaseForInput(InVal, InBB->getTerminator());
+ BasePHI->addIncoming(Base, InBB);
+ }
+ assert(BasePHI->getNumIncomingValues() == NumPHIValues);
+ } else if (SelectInst *BaseSI =
+ dyn_cast<SelectInst>(State.getBaseValue())) {
+ SelectInst *SI = cast<SelectInst>(BDV);
+
+ // Find the instruction which produces the base for each input.
+ // We may need to insert a bitcast.
+ BaseSI->setTrueValue(getBaseForInput(SI->getTrueValue(), BaseSI));
+ BaseSI->setFalseValue(getBaseForInput(SI->getFalseValue(), BaseSI));
+ } else if (auto *BaseEE =
+ dyn_cast<ExtractElementInst>(State.getBaseValue())) {
+ Value *InVal = cast<ExtractElementInst>(BDV)->getVectorOperand();
+ // Find the instruction which produces the base for each input. We may
+ // need to insert a bitcast.
+ BaseEE->setOperand(0, getBaseForInput(InVal, BaseEE));
+ } else if (auto *BaseIE = dyn_cast<InsertElementInst>(State.getBaseValue())) {
+ auto *BdvIE = cast<InsertElementInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvIE->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseIE);
+ BaseIE->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // scalar operand
+ } else {
+ auto *BaseSV = cast<ShuffleVectorInst>(State.getBaseValue());
+ auto *BdvSV = cast<ShuffleVectorInst>(BDV);
+ auto UpdateOperand = [&](int OperandIdx) {
+ Value *InVal = BdvSV->getOperand(OperandIdx);
+ Value *Base = getBaseForInput(InVal, BaseSV);
+ BaseSV->setOperand(OperandIdx, Base);
+ };
+ UpdateOperand(0); // vector operand
+ UpdateOperand(1); // vector operand
+ }
+ }
+
+ // Cache all of our results so we can cheaply reuse them
+ // NOTE: This is actually two caches: one of the base defining value
+ // relation and one of the base pointer relation! FIXME
+ for (auto Pair : States) {
+ auto *BDV = Pair.first;
+ Value *Base = Pair.second.getBaseValue();
+ assert(BDV && Base);
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) &&
+ "why did it get added?");
+
+ LLVM_DEBUG(
+ dbgs() << "Updating base value cache"
+ << " for: " << BDV->getName() << " from: "
+ << (Cache.count(BDV) ? Cache[BDV]->getName().str() : "none")
+ << " to: " << Base->getName() << "\n");
+
+ if (Cache.count(BDV)) {
+ assert(isKnownBaseResult(Base) &&
+ "must be something we 'know' is a base pointer");
+ // Once we transition from the BDV relation being stored in the Cache to
+ // the base relation being stored, it must be stable
+ assert((!isKnownBaseResult(Cache[BDV]) || Cache[BDV] == Base) &&
+ "base relation should be stable");
+ }
+ Cache[BDV] = Base;
+ }
+ assert(Cache.count(Def));
+ return Cache[Def];
+}
+
+// For a set of live pointers (base and/or derived), identify the base
+// pointer of the object which they are derived from. This routine will
+// mutate the IR graph as needed to make the 'base' pointer live at the
+// definition site of 'derived'. This ensures that any use of 'derived' can
+// also use 'base'. This may involve the insertion of a number of
+// additional PHI nodes.
+//
+// preconditions: live is a set of pointer type Values
+//
+// side effects: may insert PHI nodes into the existing CFG, will preserve
+// CFG, will not remove or mutate any existing nodes
+//
+// post condition: PointerToBase contains one (derived, base) pair for every
+// pointer in live. Note that derived can be equal to base if the original
+// pointer was a base pointer.
+static void
+findBasePointers(const StatepointLiveSetTy &live,
+ MapVector<Value *, Value *> &PointerToBase,
+ DominatorTree *DT, DefiningValueMapTy &DVCache) {
+ for (Value *ptr : live) {
+ Value *base = findBasePointer(ptr, DVCache);
+ assert(base && "failed to find base pointer");
+ PointerToBase[ptr] = base;
+ assert((!isa<Instruction>(base) || !isa<Instruction>(ptr) ||
+ DT->dominates(cast<Instruction>(base)->getParent(),
+ cast<Instruction>(ptr)->getParent())) &&
+ "The base we found better dominate the derived pointer");
+ }
+}
+
+/// Find the required base pointers (and adjust the live set) for the given
+/// parse point.
+static void findBasePointers(DominatorTree &DT, DefiningValueMapTy &DVCache,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &result) {
+ MapVector<Value *, Value *> PointerToBase;
+ findBasePointers(result.LiveSet, PointerToBase, &DT, DVCache);
+
+ if (PrintBasePointers) {
+ errs() << "Base Pairs (w/o Relocation):\n";
+ for (auto &Pair : PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+
+ result.PointerToBase = PointerToBase;
+}
+
+/// Given an updated version of the dataflow liveness results, update the
+/// liveset and base pointer maps for the call site CS.
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &result);
+
+static void recomputeLiveInValues(
+ Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ // TODO-PERF: reuse the original liveness, then simply run the dataflow
+ // again. The old values are still live and will help it stabilize quickly.
+ GCPtrLivenessData RevisedLivenessData;
+ computeLiveInValues(DT, F, RevisedLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ recomputeLiveInValues(RevisedLivenessData, toUpdate[i], info);
+ }
+}
+
+// When inserting gc.relocate and gc.result calls, we need to ensure there are
+// no uses of the original value / return value between the gc.statepoint and
+// the gc.relocate / gc.result call. One case which can arise is a phi node at
+// the start of one of the successor blocks. We also need to be able to insert the
+// gc.relocates only on the path which goes through the statepoint. We might
+// need to split an edge to make this possible.
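+// For example (sketch): if the invoke's normal destination has other
+// predecessors as well, SplitBlockPredecessors gives the invoke a dedicated
+// successor block, and the resulting single-entry phis are folded away so a
+// gc.result / gc.relocate can safely become the first instruction there.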
+static BasicBlock *
+normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
+ DominatorTree &DT) {
+ BasicBlock *Ret = BB;
+ if (!BB->getUniquePredecessor())
+ Ret = SplitBlockPredecessors(BB, InvokeParent, "", &DT);
+
+ // Now that 'Ret' has unique predecessor we can safely remove all phi nodes
+ // from it
+ FoldSingleEntryPHINodes(Ret);
+ assert(!isa<PHINode>(Ret->begin()) &&
+ "All PHI nodes should have been removed!");
+
+ // At this point, we can safely insert a gc.relocate or gc.result as the first
+ // instruction in Ret if needed.
+ return Ret;
+}
+
+// Create new attribute set containing only attributes which can be transferred
+// from original call to the safepoint.
+static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
+ AttributeList AL) {
+ if (AL.isEmpty())
+ return AL;
+
+ // Remove the readonly, readnone, and statepoint function attributes.
+ AttrBuilder FnAttrs = AL.getFnAttributes();
+ FnAttrs.removeAttribute(Attribute::ReadNone);
+ FnAttrs.removeAttribute(Attribute::ReadOnly);
+ for (Attribute A : AL.getFnAttributes()) {
+ if (isStatepointDirectiveAttr(A))
+ FnAttrs.remove(A);
+ }
+
+ // Just skip parameter and return attributes for now
+ return AttributeList::get(Ctx, AttributeList::FunctionIndex,
+ AttributeSet::get(Ctx, FnAttrs));
+}
+
+/// Helper function to place all gc relocates necessary for the given
+/// statepoint.
+/// Inputs:
+/// liveVariables - list of variables to be relocated.
+/// basePtrs - base pointers.
+/// statepointToken - statepoint instruction to which relocates should be
+/// bound.
+/// Builder - LLVM IR builder to be used to construct new calls.
+static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
+ ArrayRef<Value *> BasePtrs,
+ Instruction *StatepointToken,
+ IRBuilder<> &Builder) {
+ if (LiveVariables.empty())
+ return;
+
+ auto FindIndex = [](ArrayRef<Value *> LiveVec, Value *Val) {
+ auto ValIt = llvm::find(LiveVec, Val);
+ assert(ValIt != LiveVec.end() && "Val not found in LiveVec!");
+ size_t Index = std::distance(LiveVec.begin(), ValIt);
+ assert(Index < LiveVec.size() && "Bug in std::find?");
+ return Index;
+ };
+ Module *M = StatepointToken->getModule();
+
+ // All gc_relocate are generated as i8 addrspace(1)* (or a vector type whose
+ // element type is i8 addrspace(1)*). We originally generated unique
+ // declarations for each pointer type, but this proved problematic because
+ // the intrinsic mangling code is incomplete and fragile. Since we're moving
+ // towards a single unified pointer type anyways, we can just cast everything
+ // to an i8* of the right address space. A bitcast is added later to convert
+ // gc_relocate to the actual value's type.
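+ // Roughly, each live value V ends up with a relocation shaped like
+ //   %V.relocated = call coldcc i8 addrspace(1)*
+ //       @llvm.experimental.gc.relocate.p1i8(token %tok, i32 base, i32 derived)
+ // (illustrative only; the suffix depends on the address space and on whether
+ // the live value is a vector).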
+ auto getGCRelocateDecl = [&] (Type *Ty) {
+ assert(isHandledGCPointerType(Ty));
+ auto AS = Ty->getScalarType()->getPointerAddressSpace();
+ Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
+ if (auto *VT = dyn_cast<VectorType>(Ty))
+ NewTy = FixedVectorType::get(NewTy,
+ cast<FixedVectorType>(VT)->getNumElements());
+ return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
+ {NewTy});
+ };
+
+ // Lazily populated map from input types to the canonicalized form mentioned
+ // in the comment above. This should probably be cached somewhere more
+ // broadly.
+ DenseMap<Type *, Function *> TypeToDeclMap;
+
+ for (unsigned i = 0; i < LiveVariables.size(); i++) {
+ // Generate the gc.relocate call and save the result
+ Value *BaseIdx = Builder.getInt32(FindIndex(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = Builder.getInt32(i);
+
+ Type *Ty = LiveVariables[i]->getType();
+ if (!TypeToDeclMap.count(Ty))
+ TypeToDeclMap[Ty] = getGCRelocateDecl(Ty);
+ Function *GCRelocateDecl = TypeToDeclMap[Ty];
+
+ // only specify a debug name if we can give a useful one
+ CallInst *Reloc = Builder.CreateCall(
+ GCRelocateDecl, {StatepointToken, BaseIdx, LiveIdx},
+ suffixed_name_or(LiveVariables[i], ".relocated", ""));
+ // Trick CodeGen into thinking there are lots of free registers at this
+ // fake call.
+ Reloc->setCallingConv(CallingConv::Cold);
+ }
+}
+
+namespace {
+
+/// This struct is used to defer RAUWs and `eraseFromParent`s. Using this
+/// avoids having to worry about keeping around dangling pointers to Values.
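+/// A typical pattern in this file (sketch): collect entries while rewriting,
+/// e.g. via DeferredReplacement::createRAUW(Call, GCResult), and only perform
+/// doReplacement() on each of them once the live sets have been made explicit
+/// in the IR and no raw pointers to the old instructions remain.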
+class DeferredReplacement {
+ AssertingVH<Instruction> Old;
+ AssertingVH<Instruction> New;
+ bool IsDeoptimize = false;
+
+ DeferredReplacement() = default;
+
+public:
+ static DeferredReplacement createRAUW(Instruction *Old, Instruction *New) {
+ assert(Old != New && Old && New &&
+ "Cannot RAUW equal values or to / from null!");
+
+ DeferredReplacement D;
+ D.Old = Old;
+ D.New = New;
+ return D;
+ }
+
+ static DeferredReplacement createDelete(Instruction *ToErase) {
+ DeferredReplacement D;
+ D.Old = ToErase;
+ return D;
+ }
+
+ static DeferredReplacement createDeoptimizeReplacement(Instruction *Old) {
+#ifndef NDEBUG
+ auto *F = cast<CallInst>(Old)->getCalledFunction();
+ assert(F && F->getIntrinsicID() == Intrinsic::experimental_deoptimize &&
+ "Only way to construct a deoptimize deferred replacement");
+#endif
+ DeferredReplacement D;
+ D.Old = Old;
+ D.IsDeoptimize = true;
+ return D;
+ }
+
+ /// Does the task represented by this instance.
+ void doReplacement() {
+ Instruction *OldI = Old;
+ Instruction *NewI = New;
+
+ assert(OldI != NewI && "Disallowed at construction?!");
+ assert((!IsDeoptimize || !New) &&
+ "Deoptimize intrinsics are not replaced!");
+
+ Old = nullptr;
+ New = nullptr;
+
+ if (NewI)
+ OldI->replaceAllUsesWith(NewI);
+
+ if (IsDeoptimize) {
+ // Note: we've inserted instructions, so the call to llvm.deoptimize may
+ // not necessarily be followed by the matching return.
+ auto *RI = cast<ReturnInst>(OldI->getParent()->getTerminator());
+ new UnreachableInst(RI->getContext(), RI);
+ RI->eraseFromParent();
+ }
+
+ OldI->eraseFromParent();
+ }
+};
+
+} // end anonymous namespace
+
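+// Reads the "deopt-lowering" attribute from the call site or the callee; only
+// the values "live-in" and "live-through" are expected (see the check in
+// makeStatepointExplicitImpl below), with "live-through" as the default.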
+static StringRef getDeoptLowering(CallBase *Call) {
+ const char *DeoptLowering = "deopt-lowering";
+ if (Call->hasFnAttr(DeoptLowering)) {
+ // FIXME: Calls have a *really* confusing interface around attributes
+ // with values.
+ const AttributeList &CSAS = Call->getAttributes();
+ if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+ return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+ .getValueAsString();
+ Function *F = Call->getCalledFunction();
+ assert(F && F->hasFnAttribute(DeoptLowering));
+ return F->getFnAttribute(DeoptLowering).getValueAsString();
+ }
+ return "live-through";
+}
+
+static void
+makeStatepointExplicitImpl(CallBase *Call, /* to replace */
+ const SmallVectorImpl<Value *> &BasePtrs,
+ const SmallVectorImpl<Value *> &LiveVariables,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ assert(BasePtrs.size() == LiveVariables.size());
+
+ // Then go ahead and use the builder to actually do the inserts. We insert
+ // immediately before the previous instruction under the assumption that all
+ // arguments will be available here. We can't insert afterwards since we may
+ // be replacing a terminator.
+ IRBuilder<> Builder(Call);
+
+ ArrayRef<Value *> GCArgs(LiveVariables);
+ uint64_t StatepointID = StatepointDirectives::DefaultStatepointID;
+ uint32_t NumPatchBytes = 0;
+ uint32_t Flags = uint32_t(StatepointFlags::None);
+
SmallVector<Value *, 8> CallArgs(Call->args());
- Optional<ArrayRef<Use>> DeoptArgs;
- if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
- DeoptArgs = Bundle->Inputs;
- Optional<ArrayRef<Use>> TransitionArgs;
- if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
- TransitionArgs = Bundle->Inputs;
- // TODO: This flag no longer serves a purpose and can be removed later
- Flags |= uint32_t(StatepointFlags::GCTransition);
- }
-
- // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
- // with a return value, we lower them as never returning calls to
- // __llvm_deoptimize that are followed by unreachable to get better codegen.
- bool IsDeoptimize = false;
-
- StatepointDirectives SD =
- parseStatepointDirectivesFromAttrs(Call->getAttributes());
- if (SD.NumPatchBytes)
- NumPatchBytes = *SD.NumPatchBytes;
- if (SD.StatepointID)
- StatepointID = *SD.StatepointID;
-
- // Pass through the requested lowering if any. The default is live-through.
- StringRef DeoptLowering = getDeoptLowering(Call);
- if (DeoptLowering.equals("live-in"))
- Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
- else {
- assert(DeoptLowering.equals("live-through") && "Unsupported value!");
- }
-
- Value *CallTarget = Call->getCalledOperand();
- if (Function *F = dyn_cast<Function>(CallTarget)) {
+ Optional<ArrayRef<Use>> DeoptArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
+ DeoptArgs = Bundle->Inputs;
+ Optional<ArrayRef<Use>> TransitionArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+ TransitionArgs = Bundle->Inputs;
+ // TODO: This flag no longer serves a purpose and can be removed later
+ Flags |= uint32_t(StatepointFlags::GCTransition);
+ }
+
+ // Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
+ // with a return value, we lower them as never returning calls to
+ // __llvm_deoptimize that are followed by unreachable to get better codegen.
+ bool IsDeoptimize = false;
+
+ StatepointDirectives SD =
+ parseStatepointDirectivesFromAttrs(Call->getAttributes());
+ if (SD.NumPatchBytes)
+ NumPatchBytes = *SD.NumPatchBytes;
+ if (SD.StatepointID)
+ StatepointID = *SD.StatepointID;
+
+ // Pass through the requested lowering if any. The default is live-through.
+ StringRef DeoptLowering = getDeoptLowering(Call);
+ if (DeoptLowering.equals("live-in"))
+ Flags |= uint32_t(StatepointFlags::DeoptLiveIn);
+ else {
+ assert(DeoptLowering.equals("live-through") && "Unsupported value!");
+ }
+
+ Value *CallTarget = Call->getCalledOperand();
+ if (Function *F = dyn_cast<Function>(CallTarget)) {
auto IID = F->getIntrinsicID();
if (IID == Intrinsic::experimental_deoptimize) {
- // Calls to llvm.experimental.deoptimize are lowered to calls to the
- // __llvm_deoptimize symbol. We want to resolve this now, since the
- // verifier does not allow taking the address of an intrinsic function.
-
- SmallVector<Type *, 8> DomainTy;
- for (Value *Arg : CallArgs)
- DomainTy.push_back(Arg->getType());
- auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
- /* isVarArg = */ false);
-
- // Note: CallTarget can be a bitcast instruction of a symbol if there are
- // calls to @llvm.experimental.deoptimize with different argument types in
- // the same module. This is fine -- we assume the frontend knew what it
- // was doing when generating this kind of IR.
- CallTarget = F->getParent()
- ->getOrInsertFunction("__llvm_deoptimize", FTy)
- .getCallee();
-
- IsDeoptimize = true;
+ // Calls to llvm.experimental.deoptimize are lowered to calls to the
+ // __llvm_deoptimize symbol. We want to resolve this now, since the
+ // verifier does not allow taking the address of an intrinsic function.
+
+ SmallVector<Type *, 8> DomainTy;
+ for (Value *Arg : CallArgs)
+ DomainTy.push_back(Arg->getType());
+ auto *FTy = FunctionType::get(Type::getVoidTy(F->getContext()), DomainTy,
+ /* isVarArg = */ false);
+
+ // Note: CallTarget can be a bitcast instruction of a symbol if there are
+ // calls to @llvm.experimental.deoptimize with different argument types in
+ // the same module. This is fine -- we assume the frontend knew what it
+ // was doing when generating this kind of IR.
+ CallTarget = F->getParent()
+ ->getOrInsertFunction("__llvm_deoptimize", FTy)
+ .getCallee();
+
+ IsDeoptimize = true;
} else if (IID == Intrinsic::memcpy_element_unordered_atomic ||
IID == Intrinsic::memmove_element_unordered_atomic) {
// Unordered atomic memcpy and memmove intrinsics which are not explicitly
@@ -1636,1045 +1636,1045 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
F->getParent()
->getOrInsertFunction(GetFunctionName(IID, ElementSizeCI), FTy)
.getCallee();
- }
- }
-
- // Create the statepoint given all the arguments
- GCStatepointInst *Token = nullptr;
- if (auto *CI = dyn_cast<CallInst>(Call)) {
- CallInst *SPCall = Builder.CreateGCStatepointCall(
- StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
- TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
-
- SPCall->setTailCallKind(CI->getTailCallKind());
- SPCall->setCallingConv(CI->getCallingConv());
-
- // Currently we will fail on parameter attributes and on certain
- // function attributes. If we can handle this set of attributes, set up
- // function attrs directly on the statepoint and return attrs later for the
- // gc_result intrinsic.
- SPCall->setAttributes(
- legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
-
- Token = cast<GCStatepointInst>(SPCall);
-
- // Put the following gc_result and gc_relocate calls immediately after
- // the old call (which we're about to delete).
- assert(CI->getNextNode() && "Not a terminator, must have next!");
- Builder.SetInsertPoint(CI->getNextNode());
- Builder.SetCurrentDebugLocation(CI->getNextNode()->getDebugLoc());
- } else {
- auto *II = cast<InvokeInst>(Call);
-
- // Insert the new invoke into the old block. We'll remove the old one in a
- // moment at which point this will become the new terminator for the
- // original block.
- InvokeInst *SPInvoke = Builder.CreateGCStatepointInvoke(
- StatepointID, NumPatchBytes, CallTarget, II->getNormalDest(),
- II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs,
- "statepoint_token");
-
- SPInvoke->setCallingConv(II->getCallingConv());
-
- // Currently we will fail on parameter attributes and on certain
- // function attributes. If we can handle this set of attributes, set up
- // function attrs directly on the statepoint and return attrs later for the
- // gc_result intrinsic.
- SPInvoke->setAttributes(
- legalizeCallAttributes(II->getContext(), II->getAttributes()));
-
- Token = cast<GCStatepointInst>(SPInvoke);
-
- // Generate gc relocates in exceptional path
- BasicBlock *UnwindBlock = II->getUnwindDest();
- assert(!isa<PHINode>(UnwindBlock->begin()) &&
- UnwindBlock->getUniquePredecessor() &&
- "can't safely insert in this block!");
-
- Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
- Builder.SetCurrentDebugLocation(II->getDebugLoc());
-
- // Attach exceptional gc relocates to the landingpad.
- Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
- Result.UnwindToken = ExceptionalToken;
-
- CreateGCRelocates(LiveVariables, BasePtrs, ExceptionalToken, Builder);
-
- // Generate gc relocates and returns for normal block
- BasicBlock *NormalDest = II->getNormalDest();
- assert(!isa<PHINode>(NormalDest->begin()) &&
- NormalDest->getUniquePredecessor() &&
- "can't safely insert in this block!");
-
- Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
-
- // gc relocates will be generated later as if it were regular call
- // statepoint
- }
- assert(Token && "Should be set in one of the above branches!");
-
- if (IsDeoptimize) {
- // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
- // transform the tail-call like structure to a call to a void function
- // followed by unreachable to get better codegen.
- Replacements.push_back(
- DeferredReplacement::createDeoptimizeReplacement(Call));
- } else {
- Token->setName("statepoint_token");
- if (!Call->getType()->isVoidTy() && !Call->use_empty()) {
- StringRef Name = Call->hasName() ? Call->getName() : "";
- CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name);
- GCResult->setAttributes(
- AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
- Call->getAttributes().getRetAttributes()));
-
- // We cannot RAUW or delete CS.getInstruction() because it could be in the
- // live set of some other safepoint, in which case that safepoint's
- // PartiallyConstructedSafepointRecord will hold a raw pointer to this
- // llvm::Instruction. Instead, we defer the replacement and deletion to
- // after the live sets have been made explicit in the IR, and we no longer
- // have raw pointers to worry about.
- Replacements.emplace_back(
- DeferredReplacement::createRAUW(Call, GCResult));
- } else {
- Replacements.emplace_back(DeferredReplacement::createDelete(Call));
- }
- }
-
- Result.StatepointToken = Token;
-
- // Second, create a gc.relocate for every live variable
- CreateGCRelocates(LiveVariables, BasePtrs, Token, Builder);
-}
-
-// Replace an existing gc.statepoint with a new one and a set of gc.relocates
-// which make the relocations happening at this safepoint explicit.
-//
-// WARNING: Does not do any fixup to adjust users of the original live
-// values. That's the caller's responsibility.
-static void
-makeStatepointExplicit(DominatorTree &DT, CallBase *Call,
- PartiallyConstructedSafepointRecord &Result,
- std::vector<DeferredReplacement> &Replacements) {
- const auto &LiveSet = Result.LiveSet;
- const auto &PointerToBase = Result.PointerToBase;
-
- // Convert to vector for efficient cross referencing.
- SmallVector<Value *, 64> BaseVec, LiveVec;
- LiveVec.reserve(LiveSet.size());
- BaseVec.reserve(LiveSet.size());
- for (Value *L : LiveSet) {
- LiveVec.push_back(L);
- assert(PointerToBase.count(L));
- Value *Base = PointerToBase.find(L)->second;
- BaseVec.push_back(Base);
- }
- assert(LiveVec.size() == BaseVec.size());
-
- // Do the actual rewriting and delete the old statepoint
- makeStatepointExplicitImpl(Call, BaseVec, LiveVec, Result, Replacements);
-}
-
-// Helper function for the relocationViaAlloca.
-//
-// It receives an iterator to the statepoint gc relocates and emits a store to
-// the assigned location (via allocaMap) for each one of them. It adds the
-// visited values into the visitedLiveValues set, which we will later use
-// for sanity checking.
-static void
-insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
- DenseMap<Value *, AllocaInst *> &AllocaMap,
- DenseSet<Value *> &VisitedLiveValues) {
- for (User *U : GCRelocs) {
- GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
- if (!Relocate)
- continue;
-
- Value *OriginalValue = Relocate->getDerivedPtr();
- assert(AllocaMap.count(OriginalValue));
- Value *Alloca = AllocaMap[OriginalValue];
-
- // Emit store into the related alloca
- // All gc_relocates are i8 addrspace(1)* typed, so they must be bitcast to
- // the correct type according to the alloca.
- assert(Relocate->getNextNode() &&
- "Should always have one since it's not a terminator");
- IRBuilder<> Builder(Relocate->getNextNode());
- Value *CastedRelocatedValue =
- Builder.CreateBitCast(Relocate,
- cast<AllocaInst>(Alloca)->getAllocatedType(),
- suffixed_name_or(Relocate, ".casted", ""));
-
- new StoreInst(CastedRelocatedValue, Alloca,
- cast<Instruction>(CastedRelocatedValue)->getNextNode());
-
-#ifndef NDEBUG
- VisitedLiveValues.insert(OriginalValue);
-#endif
- }
-}
-
-// Helper function for the "relocationViaAlloca". Similar to the
-// "insertRelocationStores" but works for rematerialized values.
-static void insertRematerializationStores(
- const RematerializedValueMapTy &RematerializedValues,
- DenseMap<Value *, AllocaInst *> &AllocaMap,
- DenseSet<Value *> &VisitedLiveValues) {
- for (auto RematerializedValuePair: RematerializedValues) {
- Instruction *RematerializedValue = RematerializedValuePair.first;
- Value *OriginalValue = RematerializedValuePair.second;
-
- assert(AllocaMap.count(OriginalValue) &&
- "Can not find alloca for rematerialized value");
- Value *Alloca = AllocaMap[OriginalValue];
-
- new StoreInst(RematerializedValue, Alloca,
- RematerializedValue->getNextNode());
-
-#ifndef NDEBUG
- VisitedLiveValues.insert(OriginalValue);
-#endif
- }
-}
-
-/// Do all the relocation update via allocas and mem2reg
-static void relocationViaAlloca(
- Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
- ArrayRef<PartiallyConstructedSafepointRecord> Records) {
-#ifndef NDEBUG
- // record initial number of (static) allocas; we'll check we have the same
- // number when we get done.
- int InitialAllocaNum = 0;
- for (Instruction &I : F.getEntryBlock())
- if (isa<AllocaInst>(I))
- InitialAllocaNum++;
-#endif
-
- // TODO-PERF: change data structures, reserve
- DenseMap<Value *, AllocaInst *> AllocaMap;
- SmallVector<AllocaInst *, 200> PromotableAllocas;
- // Used later to check that we have enough allocas to store all values
- std::size_t NumRematerializedValues = 0;
- PromotableAllocas.reserve(Live.size());
-
- // Emit alloca for "LiveValue" and record it in "allocaMap" and
- // "PromotableAllocas"
- const DataLayout &DL = F.getParent()->getDataLayout();
- auto emitAllocaFor = [&](Value *LiveValue) {
- AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
- DL.getAllocaAddrSpace(), "",
- F.getEntryBlock().getFirstNonPHI());
- AllocaMap[LiveValue] = Alloca;
- PromotableAllocas.push_back(Alloca);
- };
-
- // Emit alloca for each live gc pointer
- for (Value *V : Live)
- emitAllocaFor(V);
-
- // Emit allocas for rematerialized values
- for (const auto &Info : Records)
- for (auto RematerializedValuePair : Info.RematerializedValues) {
- Value *OriginalValue = RematerializedValuePair.second;
- if (AllocaMap.count(OriginalValue) != 0)
- continue;
-
- emitAllocaFor(OriginalValue);
- ++NumRematerializedValues;
- }
-
- // The next two loops are part of the same conceptual operation. We need to
- // insert a store to the alloca after the original def and at each
- // redefinition. We need to insert a load before each use. These are split
- // into distinct loops for performance reasons.
-
- // Update gc pointer after each statepoint: either store a relocated value or
- // null (if no relocated value was found for this gc pointer and it is not a
- // gc_result). This must happen before we update the statepoint with load of
- // alloca otherwise we lose the link between statepoint and old def.
- for (const auto &Info : Records) {
- Value *Statepoint = Info.StatepointToken;
-
- // This will be used for consistency check
- DenseSet<Value *> VisitedLiveValues;
-
- // Insert stores for normal statepoint gc relocates
- insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
-
- // If it was an invoke statepoint, we will insert stores for the
- // exceptional path gc relocates.
- if (isa<InvokeInst>(Statepoint)) {
- insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
- VisitedLiveValues);
- }
-
- // Do similar thing with rematerialized values
- insertRematerializationStores(Info.RematerializedValues, AllocaMap,
- VisitedLiveValues);
-
- if (ClobberNonLive) {
- // As a debugging aid, pretend that an unrelocated pointer becomes null at
- // the gc.statepoint. This will turn some subtle GC problems into
- // slightly easier to debug SEGVs. Note that on large IR files with
- // lots of gc.statepoints this is extremely costly both memory and time
- // wise.
- SmallVector<AllocaInst *, 64> ToClobber;
- for (auto Pair : AllocaMap) {
- Value *Def = Pair.first;
- AllocaInst *Alloca = Pair.second;
-
- // This value was relocated
- if (VisitedLiveValues.count(Def)) {
- continue;
- }
- ToClobber.push_back(Alloca);
- }
-
- auto InsertClobbersAt = [&](Instruction *IP) {
- for (auto *AI : ToClobber) {
- auto PT = cast<PointerType>(AI->getAllocatedType());
- Constant *CPN = ConstantPointerNull::get(PT);
- new StoreInst(CPN, AI, IP);
- }
- };
-
- // Insert the clobbering stores. These may get intermixed with the
- // gc.results and gc.relocates, but that's fine.
- if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
- InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
- InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
- } else {
- InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
- }
- }
- }
-
- // Update use with load allocas and add store for gc_relocated.
- for (auto Pair : AllocaMap) {
- Value *Def = Pair.first;
- AllocaInst *Alloca = Pair.second;
-
- // We pre-record the uses of allocas so that we don't have to worry about
- // a later update that changes the user information.
-
- SmallVector<Instruction *, 20> Uses;
- // PERF: trade a linear scan for repeated reallocation
- Uses.reserve(Def->getNumUses());
- for (User *U : Def->users()) {
- if (!isa<ConstantExpr>(U)) {
- // If the def has a ConstantExpr use, then the def is either a
- // ConstantExpr use itself or null. In either case
- // (recursively in the first, directly in the second), the oop
- // it is ultimately dependent on is null and this particular
- // use does not need to be fixed up.
- Uses.push_back(cast<Instruction>(U));
- }
- }
-
- llvm::sort(Uses);
- auto Last = std::unique(Uses.begin(), Uses.end());
- Uses.erase(Last, Uses.end());
-
- for (Instruction *Use : Uses) {
- if (isa<PHINode>(Use)) {
- PHINode *Phi = cast<PHINode>(Use);
- for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
- if (Def == Phi->getIncomingValue(i)) {
- LoadInst *Load =
- new LoadInst(Alloca->getAllocatedType(), Alloca, "",
- Phi->getIncomingBlock(i)->getTerminator());
- Phi->setIncomingValue(i, Load);
- }
- }
- } else {
- LoadInst *Load =
- new LoadInst(Alloca->getAllocatedType(), Alloca, "", Use);
- Use->replaceUsesOfWith(Def, Load);
- }
- }
-
- // Emit store for the initial gc value. Store must be inserted after load,
- // otherwise store will be in alloca's use list and an extra load will be
- // inserted before it.
- StoreInst *Store = new StoreInst(Def, Alloca, /*volatile*/ false,
- DL.getABITypeAlign(Def->getType()));
- if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
- if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
- // InvokeInst is a terminator so the store needs to be inserted into its
- // normal destination block.
- BasicBlock *NormalDest = Invoke->getNormalDest();
- Store->insertBefore(NormalDest->getFirstNonPHI());
- } else {
- assert(!Inst->isTerminator() &&
- "The only terminator that can produce a value is "
- "InvokeInst which is handled above.");
- Store->insertAfter(Inst);
- }
- } else {
- assert(isa<Argument>(Def));
- Store->insertAfter(cast<Instruction>(Alloca));
- }
- }
-
- assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
- "we must have the same allocas with lives");
- if (!PromotableAllocas.empty()) {
- // Apply mem2reg to promote alloca to SSA
- PromoteMemToReg(PromotableAllocas, DT);
- }
-
-#ifndef NDEBUG
- for (auto &I : F.getEntryBlock())
- if (isa<AllocaInst>(I))
- InitialAllocaNum--;
- assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
-#endif
-}
-
-/// Implement a unique function which doesn't require we sort the input
-/// vector. Doing so has the effect of changing the output of a couple of
-/// tests in ways which make them less useful in testing fused safepoints.
-template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
- SmallSet<T, 8> Seen;
+ }
+ }
+
+ // Create the statepoint given all the arguments
+ GCStatepointInst *Token = nullptr;
+ if (auto *CI = dyn_cast<CallInst>(Call)) {
+ CallInst *SPCall = Builder.CreateGCStatepointCall(
+ StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
+ TransitionArgs, DeoptArgs, GCArgs, "safepoint_token");
+
+ SPCall->setTailCallKind(CI->getTailCallKind());
+ SPCall->setCallingConv(CI->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. If we can handle this set of attributes, set up the
+ // function attrs directly on the statepoint and return the attrs later for
+ // the gc_result intrinsic.
+ SPCall->setAttributes(
+ legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
+
+ Token = cast<GCStatepointInst>(SPCall);
+
+ // Put the following gc_result and gc_relocate calls immediately after the
+ // old call (which we're about to delete).
+ assert(CI->getNextNode() && "Not a terminator, must have next!");
+ Builder.SetInsertPoint(CI->getNextNode());
+ Builder.SetCurrentDebugLocation(CI->getNextNode()->getDebugLoc());
+ } else {
+ auto *II = cast<InvokeInst>(Call);
+
+ // Insert the new invoke into the old block. We'll remove the old one in a
+ // moment at which point this will become the new terminator for the
+ // original block.
+ InvokeInst *SPInvoke = Builder.CreateGCStatepointInvoke(
+ StatepointID, NumPatchBytes, CallTarget, II->getNormalDest(),
+ II->getUnwindDest(), Flags, CallArgs, TransitionArgs, DeoptArgs, GCArgs,
+ "statepoint_token");
+
+ SPInvoke->setCallingConv(II->getCallingConv());
+
+ // Currently we will fail on parameter attributes and on certain
+ // function attributes. If we can handle this set of attributes, set up the
+ // function attrs directly on the statepoint and return the attrs later for
+ // the gc_result intrinsic.
+ SPInvoke->setAttributes(
+ legalizeCallAttributes(II->getContext(), II->getAttributes()));
+
+ Token = cast<GCStatepointInst>(SPInvoke);
+
+ // Generate gc relocates in exceptional path
+ BasicBlock *UnwindBlock = II->getUnwindDest();
+ assert(!isa<PHINode>(UnwindBlock->begin()) &&
+ UnwindBlock->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*UnwindBlock->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(II->getDebugLoc());
+
+ // Attach exceptional gc relocates to the landingpad.
+ Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
+ Result.UnwindToken = ExceptionalToken;
+
+ CreateGCRelocates(LiveVariables, BasePtrs, ExceptionalToken, Builder);
+
+ // Generate gc relocates and returns for normal block
+ BasicBlock *NormalDest = II->getNormalDest();
+ assert(!isa<PHINode>(NormalDest->begin()) &&
+ NormalDest->getUniquePredecessor() &&
+ "can't safely insert in this block!");
+
+ Builder.SetInsertPoint(&*NormalDest->getFirstInsertionPt());
+
+ // gc relocates will be generated later as if it were a regular call
+ // statepoint.
+ }
+ assert(Token && "Should be set in one of the above branches!");
+
+ if (IsDeoptimize) {
+ // If we're wrapping an @llvm.experimental.deoptimize in a statepoint, we
+ // transform the tail-call like structure to a call to a void function
+ // followed by unreachable to get better codegen.
+ Replacements.push_back(
+ DeferredReplacement::createDeoptimizeReplacement(Call));
+ } else {
+ Token->setName("statepoint_token");
+ if (!Call->getType()->isVoidTy() && !Call->use_empty()) {
+ StringRef Name = Call->hasName() ? Call->getName() : "";
+ CallInst *GCResult = Builder.CreateGCResult(Token, Call->getType(), Name);
+ GCResult->setAttributes(
+ AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+ Call->getAttributes().getRetAttributes()));
+
+ // We cannot RAUW or delete the original call because it could be in the
+ // live set of some other safepoint, in which case that safepoint's
+ // PartiallyConstructedSafepointRecord will hold a raw pointer to this
+ // llvm::Instruction. Instead, we defer the replacement and deletion to
+ // after the live sets have been made explicit in the IR, and we no longer
+ // have raw pointers to worry about.
+ Replacements.emplace_back(
+ DeferredReplacement::createRAUW(Call, GCResult));
+ } else {
+ Replacements.emplace_back(DeferredReplacement::createDelete(Call));
+ }
+ }
+
+ Result.StatepointToken = Token;
+
+ // Second, create a gc.relocate for every live variable
+ CreateGCRelocates(LiveVariables, BasePtrs, Token, Builder);
+}
+
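+// Illustrative sketch (made-up names; exact operand lists elided): assuming an
+// original call
+//   %ret = call i8 addrspace(1)* @foo(i8 addrspace(1)* %obj)
+// with %obj live across it, the rewrite above roughly produces
+//   %tok = call token @llvm.experimental.gc.statepoint...(..., @foo, ...)
+//   %ret = call i8 addrspace(1)* @llvm.experimental.gc.result...(token %tok)
+// and the relocations for %obj are attached to %tok by CreateGCRelocates.
+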
+// Replace an existing gc.statepoint with a new one and a set of gc.relocates
+// which make the relocations happening at this safepoint explicit.
+//
+// WARNING: Does not do any fixup to adjust users of the original live
+// values. That's the caller's responsibility.
+static void
+makeStatepointExplicit(DominatorTree &DT, CallBase *Call,
+ PartiallyConstructedSafepointRecord &Result,
+ std::vector<DeferredReplacement> &Replacements) {
+ const auto &LiveSet = Result.LiveSet;
+ const auto &PointerToBase = Result.PointerToBase;
+
+ // Convert to vector for efficient cross referencing.
+ SmallVector<Value *, 64> BaseVec, LiveVec;
+ LiveVec.reserve(LiveSet.size());
+ BaseVec.reserve(LiveSet.size());
+ for (Value *L : LiveSet) {
+ LiveVec.push_back(L);
+ assert(PointerToBase.count(L));
+ Value *Base = PointerToBase.find(L)->second;
+ BaseVec.push_back(Base);
+ }
+ assert(LiveVec.size() == BaseVec.size());
+
+ // Do the actual rewriting and delete the old statepoint
+ makeStatepointExplicitImpl(Call, BaseVec, LiveVec, Result, Replacements);
+}
+
+// Helper function for the relocationViaAlloca.
+//
+// It receives an iterator range over the statepoint's gc relocates and emits
+// a store to the assigned location (via AllocaMap) for each one of them. It
+// adds the visited values into the VisitedLiveValues set, which we will later
+// use for sanity checking.
+static void
+insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
+ DenseMap<Value *, AllocaInst *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (User *U : GCRelocs) {
+ GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U);
+ if (!Relocate)
+ continue;
+
+ Value *OriginalValue = Relocate->getDerivedPtr();
+ assert(AllocaMap.count(OriginalValue));
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ // Emit store into the related alloca
+ // All gc_relocates are i8 addrspace(1)* typed, so each must be bitcast to
+ // the correct type according to its alloca.
+ assert(Relocate->getNextNode() &&
+ "Should always have one since it's not a terminator");
+ IRBuilder<> Builder(Relocate->getNextNode());
+ Value *CastedRelocatedValue =
+ Builder.CreateBitCast(Relocate,
+ cast<AllocaInst>(Alloca)->getAllocatedType(),
+ suffixed_name_or(Relocate, ".casted", ""));
+
+ new StoreInst(CastedRelocatedValue, Alloca,
+ cast<Instruction>(CastedRelocatedValue)->getNextNode());
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
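+// Illustrative sketch (made-up names): for a gc.relocate %p.rel of a live
+// value %p whose assigned slot is %p.alloca, the helper above emits roughly
+//   %p.rel.casted = bitcast i8 addrspace(1)* %p.rel to <alloca type>
+//   store <alloca type> %p.rel.casted, <alloca type>* %p.alloca
+// immediately after the gc.relocate call.
+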
+// Helper function for the "relocationViaAlloca". Similar to the
+// "insertRelocationStores" but works for rematerialized values.
+static void insertRematerializationStores(
+ const RematerializedValueMapTy &RematerializedValues,
+ DenseMap<Value *, AllocaInst *> &AllocaMap,
+ DenseSet<Value *> &VisitedLiveValues) {
+ for (auto RematerializedValuePair: RematerializedValues) {
+ Instruction *RematerializedValue = RematerializedValuePair.first;
+ Value *OriginalValue = RematerializedValuePair.second;
+
+ assert(AllocaMap.count(OriginalValue) &&
+ "Can not find alloca for rematerialized value");
+ Value *Alloca = AllocaMap[OriginalValue];
+
+ new StoreInst(RematerializedValue, Alloca,
+ RematerializedValue->getNextNode());
+
+#ifndef NDEBUG
+ VisitedLiveValues.insert(OriginalValue);
+#endif
+ }
+}
+
+/// Do all the relocation update via allocas and mem2reg
+static void relocationViaAlloca(
+ Function &F, DominatorTree &DT, ArrayRef<Value *> Live,
+ ArrayRef<PartiallyConstructedSafepointRecord> Records) {
+#ifndef NDEBUG
+ // record initial number of (static) allocas; we'll check we have the same
+ // number when we get done.
+ int InitialAllocaNum = 0;
+ for (Instruction &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum++;
+#endif
+
+ // TODO-PERF: change data structures, reserve
+ DenseMap<Value *, AllocaInst *> AllocaMap;
+ SmallVector<AllocaInst *, 200> PromotableAllocas;
+ // Used later to check that we have enough allocas to store all values
+ std::size_t NumRematerializedValues = 0;
+ PromotableAllocas.reserve(Live.size());
+
+ // Emit alloca for "LiveValue" and record it in "allocaMap" and
+ // "PromotableAllocas"
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto emitAllocaFor = [&](Value *LiveValue) {
+ AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+ DL.getAllocaAddrSpace(), "",
+ F.getEntryBlock().getFirstNonPHI());
+ AllocaMap[LiveValue] = Alloca;
+ PromotableAllocas.push_back(Alloca);
+ };
+
+ // Emit alloca for each live gc pointer
+ for (Value *V : Live)
+ emitAllocaFor(V);
+
+ // Emit allocas for rematerialized values
+ for (const auto &Info : Records)
+ for (auto RematerializedValuePair : Info.RematerializedValues) {
+ Value *OriginalValue = RematerializedValuePair.second;
+ if (AllocaMap.count(OriginalValue) != 0)
+ continue;
+
+ emitAllocaFor(OriginalValue);
+ ++NumRematerializedValues;
+ }
+
+ // The next two loops are part of the same conceptual operation. We need to
+ // insert a store to the alloca after the original def and at each
+ // redefinition. We need to insert a load before each use. These are split
+ // into distinct loops for performance reasons.
+
+ // Update gc pointer after each statepoint: either store a relocated value or
+ // null (if no relocated value was found for this gc pointer and it is not a
+ // gc_result). This must happen before we update the statepoint with a load of
+ // the alloca, otherwise we lose the link between the statepoint and the old def.
+ for (const auto &Info : Records) {
+ Value *Statepoint = Info.StatepointToken;
+
+ // This will be used for consistency check
+ DenseSet<Value *> VisitedLiveValues;
+
+ // Insert stores for normal statepoint gc relocates
+ insertRelocationStores(Statepoint->users(), AllocaMap, VisitedLiveValues);
+
+ // If it was an invoke statepoint, we will also insert
+ // stores for the exceptional-path gc relocates.
+ if (isa<InvokeInst>(Statepoint)) {
+ insertRelocationStores(Info.UnwindToken->users(), AllocaMap,
+ VisitedLiveValues);
+ }
+
+ // Do the same thing for the rematerialized values
+ insertRematerializationStores(Info.RematerializedValues, AllocaMap,
+ VisitedLiveValues);
+
+ if (ClobberNonLive) {
+ // As a debugging aid, pretend that an unrelocated pointer becomes null at
+ // the gc.statepoint. This will turn some subtle GC problems into
+ // slightly easier-to-debug SEGVs. Note that on large IR files with
+ // lots of gc.statepoints this is extremely costly in both memory and
+ // time.
+ SmallVector<AllocaInst *, 64> ToClobber;
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = Pair.second;
+
+ // This value was relocated
+ if (VisitedLiveValues.count(Def)) {
+ continue;
+ }
+ ToClobber.push_back(Alloca);
+ }
+
+ auto InsertClobbersAt = [&](Instruction *IP) {
+ for (auto *AI : ToClobber) {
+ auto PT = cast<PointerType>(AI->getAllocatedType());
+ Constant *CPN = ConstantPointerNull::get(PT);
+ new StoreInst(CPN, AI, IP);
+ }
+ };
+
+ // Insert the clobbering stores. These may get intermixed with the
+ // gc.results and gc.relocates, but that's fine.
+ if (auto II = dyn_cast<InvokeInst>(Statepoint)) {
+ InsertClobbersAt(&*II->getNormalDest()->getFirstInsertionPt());
+ InsertClobbersAt(&*II->getUnwindDest()->getFirstInsertionPt());
+ } else {
+ InsertClobbersAt(cast<Instruction>(Statepoint)->getNextNode());
+ }
+ }
+ }
+
+ // Update use with load allocas and add store for gc_relocated.
+ for (auto Pair : AllocaMap) {
+ Value *Def = Pair.first;
+ AllocaInst *Alloca = Pair.second;
+
+ // We pre-record the uses of allocas so that we don't have to worry about
+ // a later update that changes the user information.
+
+ SmallVector<Instruction *, 20> Uses;
+ // PERF: trade a linear scan for repeated reallocation
+ Uses.reserve(Def->getNumUses());
+ for (User *U : Def->users()) {
+ if (!isa<ConstantExpr>(U)) {
+ // If the def has a ConstantExpr use, then the def is either a
+ // ConstantExpr use itself or null. In either case
+ // (recursively in the first, directly in the second), the oop
+ // it is ultimately dependent on is null and this particular
+ // use does not need to be fixed up.
+ Uses.push_back(cast<Instruction>(U));
+ }
+ }
+
+ llvm::sort(Uses);
+ auto Last = std::unique(Uses.begin(), Uses.end());
+ Uses.erase(Last, Uses.end());
+
+ for (Instruction *Use : Uses) {
+ if (isa<PHINode>(Use)) {
+ PHINode *Phi = cast<PHINode>(Use);
+ for (unsigned i = 0; i < Phi->getNumIncomingValues(); i++) {
+ if (Def == Phi->getIncomingValue(i)) {
+ LoadInst *Load =
+ new LoadInst(Alloca->getAllocatedType(), Alloca, "",
+ Phi->getIncomingBlock(i)->getTerminator());
+ Phi->setIncomingValue(i, Load);
+ }
+ }
+ } else {
+ LoadInst *Load =
+ new LoadInst(Alloca->getAllocatedType(), Alloca, "", Use);
+ Use->replaceUsesOfWith(Def, Load);
+ }
+ }
+
+ // Emit store for the initial gc value. Store must be inserted after load,
+ // otherwise store will be in alloca's use list and an extra load will be
+ // inserted before it.
+ StoreInst *Store = new StoreInst(Def, Alloca, /*volatile*/ false,
+ DL.getABITypeAlign(Def->getType()));
+ if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
+ if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
+ // InvokeInst is a terminator so the store needs to be inserted into its
+ // normal destination block.
+ BasicBlock *NormalDest = Invoke->getNormalDest();
+ Store->insertBefore(NormalDest->getFirstNonPHI());
+ } else {
+ assert(!Inst->isTerminator() &&
+ "The only terminator that can produce a value is "
+ "InvokeInst which is handled above.");
+ Store->insertAfter(Inst);
+ }
+ } else {
+ assert(isa<Argument>(Def));
+ Store->insertAfter(cast<Instruction>(Alloca));
+ }
+ }
+
+ assert(PromotableAllocas.size() == Live.size() + NumRematerializedValues &&
+ "we must have the same allocas with lives");
+ if (!PromotableAllocas.empty()) {
+ // Apply mem2reg to promote alloca to SSA
+ PromoteMemToReg(PromotableAllocas, DT);
+ }
+
+#ifndef NDEBUG
+ for (auto &I : F.getEntryBlock())
+ if (isa<AllocaInst>(I))
+ InitialAllocaNum--;
+ assert(InitialAllocaNum == 0 && "We must not introduce any extra allocas");
+#endif
+}
+
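+// Illustrative sketch of the round trip performed above (made-up names): a
+// live pointer %p that survives a statepoint is handled roughly as
+//   entry:            %p.alloca = alloca i8 addrspace(1)*
+//   after def of %p:  store i8 addrspace(1)* %p, i8 addrspace(1)** %p.alloca
+//   after statepoint: store i8 addrspace(1)* %p.relocated, i8 addrspace(1)** %p.alloca
+//   before each use:  %p.reload = load i8 addrspace(1)*, i8 addrspace(1)** %p.alloca
+// PromoteMemToReg then folds the allocas back into SSA form, leaving the
+// relocations explicit.
+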
+/// Implement a unique function which doesn't require that we sort the input
+/// vector. Sorting has the effect of changing the output of a couple of
+/// tests in ways which make them less useful for testing fused safepoints.
+template <typename T> static void unique_unsorted(SmallVectorImpl<T> &Vec) {
+ SmallSet<T, 8> Seen;
erase_if(Vec, [&](const T &V) { return !Seen.insert(V).second; });
-}
-
-/// Insert holders so that each Value is obviously live through the entire
-/// lifetime of the call.
-static void insertUseHolderAfter(CallBase *Call, const ArrayRef<Value *> Values,
- SmallVectorImpl<CallInst *> &Holders) {
- if (Values.empty())
- // No values to hold live, might as well not insert the empty holder
- return;
-
- Module *M = Call->getModule();
- // Use a dummy vararg function to actually hold the values live
- FunctionCallee Func = M->getOrInsertFunction(
- "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true));
- if (isa<CallInst>(Call)) {
- // For call safepoints insert dummy calls right after safepoint
- Holders.push_back(
- CallInst::Create(Func, Values, "", &*++Call->getIterator()));
- return;
- }
- // For invoke safepoints insert dummy calls in both the normal and
- // exceptional destination blocks.
- auto *II = cast<InvokeInst>(Call);
- Holders.push_back(CallInst::Create(
- Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
- Holders.push_back(CallInst::Create(
- Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
-}
-
-static void findLiveReferences(
- Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
- MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
- GCPtrLivenessData OriginalLivenessData;
- computeLiveInValues(DT, F, OriginalLivenessData);
- for (size_t i = 0; i < records.size(); i++) {
- struct PartiallyConstructedSafepointRecord &info = records[i];
- analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
- }
-}
-
-// Helper function for the "rematerializeLiveValues". It walks the use chain
-// starting from "CurrentValue" until it reaches the root of the chain, i.e.
-// the base or a value it cannot process. Only "simple" values are processed
-// (currently GEPs and no-op casts). The returned root is examined by the
-// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
-// array with all visited values.
-static Value* findRematerializableChainToBasePointer(
- SmallVectorImpl<Instruction*> &ChainToBase,
- Value *CurrentValue) {
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
- ChainToBase.push_back(GEP);
- return findRematerializableChainToBasePointer(ChainToBase,
- GEP->getPointerOperand());
- }
-
- if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
- if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
- return CI;
-
- ChainToBase.push_back(CI);
- return findRematerializableChainToBasePointer(ChainToBase,
- CI->getOperand(0));
- }
-
- // We have reached the root of the chain, which is either equal to the base or
- // is the first unsupported value along the use chain.
- return CurrentValue;
-}
-
-// Helper function for the "rematerializeLiveValues". Compute cost of the use
-// chain we are going to rematerialize.
+}
+
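+// For example, applied to {b, a, b, c, a} the function above leaves {b, a, c}:
+// duplicates are dropped but the first-seen order is preserved, unlike a
+// sort-then-unique approach.
+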
+/// Insert holders so that each Value is obviously live through the entire
+/// lifetime of the call.
+static void insertUseHolderAfter(CallBase *Call, const ArrayRef<Value *> Values,
+ SmallVectorImpl<CallInst *> &Holders) {
+ if (Values.empty())
+ // No values to hold live, might as well not insert the empty holder
+ return;
+
+ Module *M = Call->getModule();
+ // Use a dummy vararg function to actually hold the values live
+ FunctionCallee Func = M->getOrInsertFunction(
+ "__tmp_use", FunctionType::get(Type::getVoidTy(M->getContext()), true));
+ if (isa<CallInst>(Call)) {
+ // For call safepoints insert dummy calls right after safepoint
+ Holders.push_back(
+ CallInst::Create(Func, Values, "", &*++Call->getIterator()));
+ return;
+ }
+ // For invoke safepoints insert dummy calls in both the normal and
+ // exceptional destination blocks.
+ auto *II = cast<InvokeInst>(Call);
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getNormalDest()->getFirstInsertionPt()));
+ Holders.push_back(CallInst::Create(
+ Func, Values, "", &*II->getUnwindDest()->getFirstInsertionPt()));
+}
+
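+// Illustrative sketch (made-up values): for a call safepoint whose deopt state
+// references %a and %b, the helper above inserts, right after the call,
+//   call void (...) @__tmp_use(i8 addrspace(1)* %a, i8 addrspace(1)* %b)
+// The holder is deleted again once liveness has been recomputed.
+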
+static void findLiveReferences(
+ Function &F, DominatorTree &DT, ArrayRef<CallBase *> toUpdate,
+ MutableArrayRef<struct PartiallyConstructedSafepointRecord> records) {
+ GCPtrLivenessData OriginalLivenessData;
+ computeLiveInValues(DT, F, OriginalLivenessData);
+ for (size_t i = 0; i < records.size(); i++) {
+ struct PartiallyConstructedSafepointRecord &info = records[i];
+ analyzeParsePointLiveness(DT, OriginalLivenessData, toUpdate[i], info);
+ }
+}
+
+// Helper function for the "rematerializeLiveValues". It walks the use chain
+// starting from "CurrentValue" until it reaches the root of the chain, i.e.
+// the base or a value it cannot process. Only "simple" values are processed
+// (currently GEPs and no-op casts). The returned root is examined by the
+// callers of findRematerializableChainToBasePointer. Fills the "ChainToBase"
+// array with all visited values.
+static Value* findRematerializableChainToBasePointer(
+ SmallVectorImpl<Instruction*> &ChainToBase,
+ Value *CurrentValue) {
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurrentValue)) {
+ ChainToBase.push_back(GEP);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ GEP->getPointerOperand());
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(CurrentValue)) {
+ if (!CI->isNoopCast(CI->getModule()->getDataLayout()))
+ return CI;
+
+ ChainToBase.push_back(CI);
+ return findRematerializableChainToBasePointer(ChainToBase,
+ CI->getOperand(0));
+ }
+
+ // We have reached the root of the chain, which is either equal to the base or
+ // is the first unsupported value along the use chain.
+ return CurrentValue;
+}
+
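+// Illustrative example (made-up names): given
+//   %c = bitcast i8 addrspace(1)* %base to i32 addrspace(1)*
+//   %d = getelementptr i32, i32 addrspace(1)* %c, i64 4
+// a walk starting from the live value %d fills ChainToBase with {%d, %c} and
+// returns %base as the root of the chain.
+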
+// Helper function for the "rematerializeLiveValues". Compute cost of the use
+// chain we are going to rematerialize.
static InstructionCost
chainToBasePointerCost(SmallVectorImpl<Instruction *> &Chain,
- TargetTransformInfo &TTI) {
+ TargetTransformInfo &TTI) {
InstructionCost Cost = 0;
-
- for (Instruction *Instr : Chain) {
- if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
- assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
- "non noop cast is found during rematerialization");
-
- Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
+
+ for (Instruction *Instr : Chain) {
+ if (CastInst *CI = dyn_cast<CastInst>(Instr)) {
+ assert(CI->isNoopCast(CI->getModule()->getDataLayout()) &&
+ "non noop cast is found during rematerialization");
+
+ Type *SrcTy = CI->getOperand(0)->getType();
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
TTI::getCastContextHint(CI),
TargetTransformInfo::TCK_SizeAndLatency, CI);
-
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
- // Cost of the address calculation
- Type *ValTy = GEP->getSourceElementType();
- Cost += TTI.getAddressComputationCost(ValTy);
-
- // And cost of the GEP itself
- // TODO: Use TTI->getGEPCost here (it exists, but appears to not be
- // allowed for external usage)
- if (!GEP->hasAllConstantIndices())
- Cost += 2;
-
- } else {
- llvm_unreachable("unsupported instruction type during rematerialization");
- }
- }
-
- return Cost;
-}
-
-static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
- unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
- if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
- OrigRootPhi.getParent() != AlternateRootPhi.getParent())
- return false;
- // Map of incoming values and their corresponding basic blocks of
- // OrigRootPhi.
- SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
- for (unsigned i = 0; i < PhiNum; i++)
- CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
- OrigRootPhi.getIncomingBlock(i);
-
- // Both current and base PHIs should have same incoming values and
- // the same basic blocks corresponding to the incoming values.
- for (unsigned i = 0; i < PhiNum; i++) {
- auto CIVI =
- CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
- if (CIVI == CurrentIncomingValues.end())
- return false;
- BasicBlock *CurrentIncomingBB = CIVI->second;
- if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
- return false;
- }
- return true;
-}
-
-// From the statepoint live set pick values that are cheaper to recompute than
-// to relocate. Remove these values from the live set, rematerialize them after
-// the statepoint, and record them in the "Info" structure. Note that, as with
-// relocated values, we don't do any user adjustments here.
-static void rematerializeLiveValues(CallBase *Call,
- PartiallyConstructedSafepointRecord &Info,
- TargetTransformInfo &TTI) {
- const unsigned int ChainLengthThreshold = 10;
-
- // Record values we are going to delete from this statepoint live set.
- // We cannot do this in the following loop due to iterator invalidation.
- SmallVector<Value *, 32> LiveValuesToBeDeleted;
-
- for (Value *LiveValue: Info.LiveSet) {
- // For each live pointer find its defining chain
- SmallVector<Instruction *, 3> ChainToBase;
- assert(Info.PointerToBase.count(LiveValue));
- Value *RootOfChain =
- findRematerializableChainToBasePointer(ChainToBase,
- LiveValue);
-
- // Nothing to do, or chain is too long
- if ( ChainToBase.size() == 0 ||
- ChainToBase.size() > ChainLengthThreshold)
- continue;
-
- // Handle the scenario where the RootOfChain is not equal to the
- // Base Value, but they are essentially the same phi values.
- if (RootOfChain != Info.PointerToBase[LiveValue]) {
- PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
- PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
- if (!OrigRootPhi || !AlternateRootPhi)
- continue;
- // PHI nodes that have the same incoming values and belong to the same
- // basic block are essentially the same SSA value. When the original phi
- // has incoming values with different base pointers, the original phi is
- // marked as a conflict, and an additional `AlternateRootPhi` with the same
- // incoming values gets generated by the findBasePointer function. We need
- // to check that the newly generated AlternateRootPhi (the .base version of
- // the phi) and RootOfChain (the original phi node itself) are the same, so
- // that we can rematerialize the gep and casts. This is a workaround for a
- // deficiency in the findBasePointer algorithm.
- if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
- continue;
- // Now that the phi nodes are proved to be the same, assert that
- // findBasePointer's newly generated AlternateRootPhi is present in the
- // liveset of the call.
- assert(Info.LiveSet.count(AlternateRootPhi));
- }
- // Compute cost of this chain
+
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
+ // Cost of the address calculation
+ Type *ValTy = GEP->getSourceElementType();
+ Cost += TTI.getAddressComputationCost(ValTy);
+
+ // And cost of the GEP itself
+ // TODO: Use TTI->getGEPCost here (it exists, but appears to not be
+ // allowed for external usage)
+ if (!GEP->hasAllConstantIndices())
+ Cost += 2;
+
+ } else {
+ llvm_unreachable("unsupported instruction type during rematerialization");
+ }
+ }
+
+ return Cost;
+}
+
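+// Illustrative example: for a chain {no-op bitcast, GEP with a non-constant
+// index}, the cost computed above is the target's cast cost (typically free)
+// plus the address computation cost plus 2 for the GEP itself; the exact
+// numbers are target-dependent.
+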
+static bool AreEquivalentPhiNodes(PHINode &OrigRootPhi, PHINode &AlternateRootPhi) {
+ unsigned PhiNum = OrigRootPhi.getNumIncomingValues();
+ if (PhiNum != AlternateRootPhi.getNumIncomingValues() ||
+ OrigRootPhi.getParent() != AlternateRootPhi.getParent())
+ return false;
+ // Map of incoming values and their corresponding basic blocks of
+ // OrigRootPhi.
+ SmallDenseMap<Value *, BasicBlock *, 8> CurrentIncomingValues;
+ for (unsigned i = 0; i < PhiNum; i++)
+ CurrentIncomingValues[OrigRootPhi.getIncomingValue(i)] =
+ OrigRootPhi.getIncomingBlock(i);
+
+ // Both current and base PHIs should have same incoming values and
+ // the same basic blocks corresponding to the incoming values.
+ for (unsigned i = 0; i < PhiNum; i++) {
+ auto CIVI =
+ CurrentIncomingValues.find(AlternateRootPhi.getIncomingValue(i));
+ if (CIVI == CurrentIncomingValues.end())
+ return false;
+ BasicBlock *CurrentIncomingBB = CIVI->second;
+ if (CurrentIncomingBB != AlternateRootPhi.getIncomingBlock(i))
+ return false;
+ }
+ return true;
+}
+
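+// Illustrative example (made-up names): the check above accepts the pair
+//   %p      = phi i8 addrspace(1)* [ %a, %left ], [ %b, %right ]
+//   %p.base = phi i8 addrspace(1)* [ %a, %left ], [ %b, %right ]
+// since both phis live in the same block and carry identical incoming
+// value/block pairs.
+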
+// From the statepoint live set pick values that are cheaper to recompute than
+// to relocate. Remove these values from the live set, rematerialize them after
+// the statepoint, and record them in the "Info" structure. Note that, as with
+// relocated values, we don't do any user adjustments here.
+static void rematerializeLiveValues(CallBase *Call,
+ PartiallyConstructedSafepointRecord &Info,
+ TargetTransformInfo &TTI) {
+ const unsigned int ChainLengthThreshold = 10;
+
+ // Record values we are going to delete from this statepoint live set.
+ // We cannot do this in the following loop due to iterator invalidation.
+ SmallVector<Value *, 32> LiveValuesToBeDeleted;
+
+ for (Value *LiveValue: Info.LiveSet) {
+ // For each live pointer find its defining chain
+ SmallVector<Instruction *, 3> ChainToBase;
+ assert(Info.PointerToBase.count(LiveValue));
+ Value *RootOfChain =
+ findRematerializableChainToBasePointer(ChainToBase,
+ LiveValue);
+
+ // Nothing to do, or chain is too long
+ if ( ChainToBase.size() == 0 ||
+ ChainToBase.size() > ChainLengthThreshold)
+ continue;
+
+ // Handle the scenario where the RootOfChain is not equal to the
+ // Base Value, but they are essentially the same phi values.
+ if (RootOfChain != Info.PointerToBase[LiveValue]) {
+ PHINode *OrigRootPhi = dyn_cast<PHINode>(RootOfChain);
+ PHINode *AlternateRootPhi = dyn_cast<PHINode>(Info.PointerToBase[LiveValue]);
+ if (!OrigRootPhi || !AlternateRootPhi)
+ continue;
+ // PHI nodes that have the same incoming values and belong to the same
+ // basic block are essentially the same SSA value. When the original phi
+ // has incoming values with different base pointers, the original phi is
+ // marked as a conflict, and an additional `AlternateRootPhi` with the same
+ // incoming values gets generated by the findBasePointer function. We need
+ // to check that the newly generated AlternateRootPhi (the .base version of
+ // the phi) and RootOfChain (the original phi node itself) are the same, so
+ // that we can rematerialize the gep and casts. This is a workaround for a
+ // deficiency in the findBasePointer algorithm.
+ if (!AreEquivalentPhiNodes(*OrigRootPhi, *AlternateRootPhi))
+ continue;
+ // Now that the phi nodes are proved to be the same, assert that
+ // findBasePointer's newly generated AlternateRootPhi is present in the
+ // liveset of the call.
+ assert(Info.LiveSet.count(AlternateRootPhi));
+ }
+ // Compute cost of this chain
InstructionCost Cost = chainToBasePointerCost(ChainToBase, TTI);
- // TODO: We can also account for cases when we will be able to remove some
- // of the rematerialized values by later optimization passes. I.e if
- // we rematerialized several intersecting chains. Or if original values
- // don't have any uses besides this statepoint.
-
- // For invokes we need to rematerialize each chain twice - for normal and
- // for unwind basic blocks. Model this by multiplying cost by two.
- if (isa<InvokeInst>(Call)) {
- Cost *= 2;
- }
- // If it's too expensive - skip it
- if (Cost >= RematerializationThreshold)
- continue;
-
- // Remove value from the live set
- LiveValuesToBeDeleted.push_back(LiveValue);
-
- // Clone instructions and record them inside "Info" structure
-
- // Walk backwards to visit top-most instructions first
- std::reverse(ChainToBase.begin(), ChainToBase.end());
-
- // Utility function which clones all instructions from "ChainToBase"
- // and inserts them before "InsertBefore". Returns rematerialized value
- // which should be used after statepoint.
- auto rematerializeChain = [&ChainToBase](
- Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
- Instruction *LastClonedValue = nullptr;
- Instruction *LastValue = nullptr;
- for (Instruction *Instr: ChainToBase) {
- // Only GEP's and casts are supported as we need to be careful to not
- // introduce any new uses of pointers not in the liveset.
- // Note that it's fine to introduce new uses of pointers which were
- // otherwise not used after this statepoint.
- assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
-
- Instruction *ClonedValue = Instr->clone();
- ClonedValue->insertBefore(InsertBefore);
- ClonedValue->setName(Instr->getName() + ".remat");
-
- // If it is not first instruction in the chain then it uses previously
- // cloned value. We should update it to use cloned value.
- if (LastClonedValue) {
- assert(LastValue);
- ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
-#ifndef NDEBUG
- for (auto OpValue : ClonedValue->operand_values()) {
- // Assert that cloned instruction does not use any instructions from
- // this chain other than LastClonedValue
- assert(!is_contained(ChainToBase, OpValue) &&
- "incorrect use in rematerialization chain");
- // Assert that the cloned instruction does not use the RootOfChain
- // or the AlternateLiveBase.
- assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
- }
-#endif
- } else {
- // For the first instruction, replace the use of unrelocated base i.e.
- // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
- // live set. They have been proved to be the same PHI nodes. Note
- // that the *only* use of the RootOfChain in the ChainToBase list is
- // the first Value in the list.
- if (RootOfChain != AlternateLiveBase)
- ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
- }
-
- LastClonedValue = ClonedValue;
- LastValue = Instr;
- }
- assert(LastClonedValue);
- return LastClonedValue;
- };
-
- // Different cases for calls and invokes. For invokes we need to clone
- // instructions both on normal and unwind path.
- if (isa<CallInst>(Call)) {
- Instruction *InsertBefore = Call->getNextNode();
- assert(InsertBefore);
- Instruction *RematerializedValue = rematerializeChain(
- InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
- Info.RematerializedValues[RematerializedValue] = LiveValue;
- } else {
- auto *Invoke = cast<InvokeInst>(Call);
-
- Instruction *NormalInsertBefore =
- &*Invoke->getNormalDest()->getFirstInsertionPt();
- Instruction *UnwindInsertBefore =
- &*Invoke->getUnwindDest()->getFirstInsertionPt();
-
- Instruction *NormalRematerializedValue = rematerializeChain(
- NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
- Instruction *UnwindRematerializedValue = rematerializeChain(
- UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
-
- Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
- Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
- }
- }
-
- // Remove rematerialized values from the live set
- for (auto LiveValue: LiveValuesToBeDeleted) {
- Info.LiveSet.remove(LiveValue);
- }
-}
-
-static bool insertParsePoints(Function &F, DominatorTree &DT,
- TargetTransformInfo &TTI,
- SmallVectorImpl<CallBase *> &ToUpdate) {
-#ifndef NDEBUG
- // sanity check the input
- std::set<CallBase *> Uniqued;
- Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
- assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
-
- for (CallBase *Call : ToUpdate)
- assert(Call->getFunction() == &F);
-#endif
-
- // When inserting gc.relocates for invokes, we need to be able to insert at
- // the top of the successor blocks. See the comment on
- // normalizeForInvokeSafepoint for exactly what is needed. Note that this step
- // may restructure the CFG.
- for (CallBase *Call : ToUpdate) {
- auto *II = dyn_cast<InvokeInst>(Call);
- if (!II)
- continue;
- normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
- normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
- }
-
- // A list of dummy calls added to the IR to keep various values obviously
- // live in the IR. We'll remove all of these when done.
- SmallVector<CallInst *, 64> Holders;
-
- // Insert a dummy call with all of the deopt operands we'll need for the
- // actual safepoint insertion as arguments. This ensures reference operands
- // in the deopt argument list are considered live through the safepoint (and
- // thus makes sure they get relocated.)
- for (CallBase *Call : ToUpdate) {
- SmallVector<Value *, 64> DeoptValues;
-
- for (Value *Arg : GetDeoptBundleOperands(Call)) {
- assert(!isUnhandledGCPointerType(Arg->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(Arg->getType()))
- DeoptValues.push_back(Arg);
- }
-
- insertUseHolderAfter(Call, DeoptValues, Holders);
- }
-
- SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
-
- // A) Identify all gc pointers which are statically live at the given call
- // site.
- findLiveReferences(F, DT, ToUpdate, Records);
-
- // B) Find the base pointers for each live pointer
- /* scope for caching */ {
- // Cache the 'defining value' relation used in the computation and
- // insertion of base phis and selects. This ensures that we don't insert
- // large numbers of duplicate base_phis.
- DefiningValueMapTy DVCache;
-
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &info = Records[i];
- findBasePointers(DT, DVCache, ToUpdate[i], info);
- }
- } // end of cache scope
-
- // The base phi insertion logic (for any safepoint) may have inserted new
- // instructions which are now live at some safepoint. The simplest such
- // example is:
- // loop:
- // phi a <-- will be a new base_phi here
- // safepoint 1 <-- that needs to be live here
- // gep a + 1
- // safepoint 2
- // br loop
- // We insert some dummy calls after each safepoint to definitely hold live
- // the base pointers which were identified for that safepoint. We'll then
- // ask liveness for _every_ base inserted to see what is now live. Then we
- // remove the dummy calls.
- Holders.reserve(Holders.size() + Records.size());
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &Info = Records[i];
-
- SmallVector<Value *, 128> Bases;
- for (auto Pair : Info.PointerToBase)
- Bases.push_back(Pair.second);
-
- insertUseHolderAfter(ToUpdate[i], Bases, Holders);
- }
-
- // By selecting base pointers, we've effectively inserted new uses. Thus, we
- // need to rerun liveness. We may *also* have inserted new defs, but that's
- // not the key issue.
- recomputeLiveInValues(F, DT, ToUpdate, Records);
-
- if (PrintBasePointers) {
- for (auto &Info : Records) {
- errs() << "Base Pairs: (w/Relocation)\n";
- for (auto Pair : Info.PointerToBase) {
- errs() << " derived ";
- Pair.first->printAsOperand(errs(), false);
- errs() << " base ";
- Pair.second->printAsOperand(errs(), false);
- errs() << "\n";
- }
- }
- }
-
- // It is possible that non-constant live variables have a constant base. For
- // example, a GEP with a variable offset from a global. In this case we can
- // remove it from the liveset. We already don't add constants to the liveset
- // because we assume they won't move at runtime and the GC doesn't need to be
- // informed about them. The same reasoning applies if the base is constant.
- // Note that the relocation placement code relies on this filtering for
- // correctness as it expects the base to be in the liveset, which isn't true
- // if the base is constant.
- for (auto &Info : Records)
- for (auto &BasePair : Info.PointerToBase)
- if (isa<Constant>(BasePair.second))
- Info.LiveSet.remove(BasePair.first);
-
- for (CallInst *CI : Holders)
- CI->eraseFromParent();
-
- Holders.clear();
-
- // In order to reduce the live set of a statepoint we might choose to rematerialize
- // some values instead of relocating them. This is purely an optimization and
- // does not influence correctness.
- for (size_t i = 0; i < Records.size(); i++)
- rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
-
- // We need this to safely RAUW and delete call or invoke return values that
- // may themselves be live over a statepoint. For details, please see usage in
- // makeStatepointExplicitImpl.
- std::vector<DeferredReplacement> Replacements;
-
- // Now run through and replace the existing statepoints with new ones with
- // the live variables listed. We do not yet update uses of the values being
- // relocated. We have references to live variables that need to
- // survive to the last iteration of this loop. (By construction, the
- // previous statepoint cannot be a live variable, thus we can remove
- // the old statepoint calls as we go.)
- for (size_t i = 0; i < Records.size(); i++)
- makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
-
- ToUpdate.clear(); // prevent accidental use of invalid calls.
-
- for (auto &PR : Replacements)
- PR.doReplacement();
-
- Replacements.clear();
-
- for (auto &Info : Records) {
- // These live sets may contain stale Value pointers, since we replaced calls
- // with operand bundles with calls wrapped in gc.statepoint, and some of
- // those calls may have been def'ing live gc pointers. Clear these out to
- // avoid accidentally using them.
- //
- // TODO: We should create a separate data structure that does not contain
- // these live sets, and migrate to using that data structure from this point
- // onward.
- Info.LiveSet.clear();
- Info.PointerToBase.clear();
- }
-
- // Do all the fixups of the original live variables to their relocated selves
- SmallVector<Value *, 128> Live;
- for (size_t i = 0; i < Records.size(); i++) {
- PartiallyConstructedSafepointRecord &Info = Records[i];
-
- // We can't simply save the live set from the original insertion. One of
- // the live values might be the result of a call which needs a safepoint.
- // That Value* no longer exists and we need to use the new gc_result.
- // Thankfully, the live set is embedded in the statepoint (and updated), so
- // we just grab that.
+ // TODO: We can also account for cases when we will be able to remove some
+ // of the rematerialized values by later optimization passes. I.e if
+ // we rematerialized several intersecting chains. Or if original values
+ // don't have any uses besides this statepoint.
+
+ // For invokes we need to rematerialize each chain twice - for normal and
+ // for unwind basic blocks. Model this by multiplying cost by two.
+ if (isa<InvokeInst>(Call)) {
+ Cost *= 2;
+ }
+ // If it's too expensive - skip it
+ if (Cost >= RematerializationThreshold)
+ continue;
+
+ // Remove value from the live set
+ LiveValuesToBeDeleted.push_back(LiveValue);
+
+ // Clone instructions and record them inside "Info" structure
+
+ // Walk backwards to visit top-most instructions first
+ std::reverse(ChainToBase.begin(), ChainToBase.end());
+
+ // Utility function which clones all instructions from "ChainToBase"
+ // and inserts them before "InsertBefore". Returns rematerialized value
+ // which should be used after statepoint.
+ auto rematerializeChain = [&ChainToBase](
+ Instruction *InsertBefore, Value *RootOfChain, Value *AlternateLiveBase) {
+ Instruction *LastClonedValue = nullptr;
+ Instruction *LastValue = nullptr;
+ for (Instruction *Instr: ChainToBase) {
+ // Only GEP's and casts are supported as we need to be careful to not
+ // introduce any new uses of pointers not in the liveset.
+ // Note that it's fine to introduce new uses of pointers which were
+ // otherwise not used after this statepoint.
+ assert(isa<GetElementPtrInst>(Instr) || isa<CastInst>(Instr));
+
+ Instruction *ClonedValue = Instr->clone();
+ ClonedValue->insertBefore(InsertBefore);
+ ClonedValue->setName(Instr->getName() + ".remat");
+
+ // If it is not first instruction in the chain then it uses previously
+ // cloned value. We should update it to use cloned value.
+ if (LastClonedValue) {
+ assert(LastValue);
+ ClonedValue->replaceUsesOfWith(LastValue, LastClonedValue);
+#ifndef NDEBUG
+ for (auto OpValue : ClonedValue->operand_values()) {
+ // Assert that cloned instruction does not use any instructions from
+ // this chain other than LastClonedValue
+ assert(!is_contained(ChainToBase, OpValue) &&
+ "incorrect use in rematerialization chain");
+ // Assert that the cloned instruction does not use the RootOfChain
+ // or the AlternateLiveBase.
+ assert(OpValue != RootOfChain && OpValue != AlternateLiveBase);
+ }
+#endif
+ } else {
+ // For the first instruction, replace the use of unrelocated base i.e.
+ // RootOfChain/OrigRootPhi, with the corresponding PHI present in the
+ // live set. They have been proved to be the same PHI nodes. Note
+ // that the *only* use of the RootOfChain in the ChainToBase list is
+ // the first Value in the list.
+ if (RootOfChain != AlternateLiveBase)
+ ClonedValue->replaceUsesOfWith(RootOfChain, AlternateLiveBase);
+ }
+
+ LastClonedValue = ClonedValue;
+ LastValue = Instr;
+ }
+ assert(LastClonedValue);
+ return LastClonedValue;
+ };
+
+ // Different cases for calls and invokes. For invokes we need to clone
+ // instructions both on normal and unwind path.
+ if (isa<CallInst>(Call)) {
+ Instruction *InsertBefore = Call->getNextNode();
+ assert(InsertBefore);
+ Instruction *RematerializedValue = rematerializeChain(
+ InsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Info.RematerializedValues[RematerializedValue] = LiveValue;
+ } else {
+ auto *Invoke = cast<InvokeInst>(Call);
+
+ Instruction *NormalInsertBefore =
+ &*Invoke->getNormalDest()->getFirstInsertionPt();
+ Instruction *UnwindInsertBefore =
+ &*Invoke->getUnwindDest()->getFirstInsertionPt();
+
+ Instruction *NormalRematerializedValue = rematerializeChain(
+ NormalInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+ Instruction *UnwindRematerializedValue = rematerializeChain(
+ UnwindInsertBefore, RootOfChain, Info.PointerToBase[LiveValue]);
+
+ Info.RematerializedValues[NormalRematerializedValue] = LiveValue;
+ Info.RematerializedValues[UnwindRematerializedValue] = LiveValue;
+ }
+ }
+
+ // Remove rematerialized values from the live set
+ for (auto LiveValue: LiveValuesToBeDeleted) {
+ Info.LiveSet.remove(LiveValue);
+ }
+}
+
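+// Illustrative sketch (made-up names): if %d = getelementptr ... %base, i64 4
+// is live across a call-form statepoint, the logic above clones it right after
+// the statepoint as %d.remat, records %d.remat -> %d in RematerializedValues,
+// and drops %d from the live set; the later alloca rewrite then feeds the
+// clone the relocated base.
+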
+static bool insertParsePoints(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ SmallVectorImpl<CallBase *> &ToUpdate) {
+#ifndef NDEBUG
+ // sanity check the input
+ std::set<CallBase *> Uniqued;
+ Uniqued.insert(ToUpdate.begin(), ToUpdate.end());
+ assert(Uniqued.size() == ToUpdate.size() && "no duplicates please!");
+
+ for (CallBase *Call : ToUpdate)
+ assert(Call->getFunction() == &F);
+#endif
+
+ // When inserting gc.relocates for invokes, we need to be able to insert at
+ // the top of the successor blocks. See the comment on
+ // normalizeForInvokeSafepoint for exactly what is needed. Note that this step
+ // may restructure the CFG.
+ for (CallBase *Call : ToUpdate) {
+ auto *II = dyn_cast<InvokeInst>(Call);
+ if (!II)
+ continue;
+ normalizeForInvokeSafepoint(II->getNormalDest(), II->getParent(), DT);
+ normalizeForInvokeSafepoint(II->getUnwindDest(), II->getParent(), DT);
+ }
+
+ // A list of dummy calls added to the IR to keep various values obviously
+ // live in the IR. We'll remove all of these when done.
+ SmallVector<CallInst *, 64> Holders;
+
+ // Insert a dummy call with all of the deopt operands we'll need for the
+ // actual safepoint insertion as arguments. This ensures reference operands
+ // in the deopt argument list are considered live through the safepoint (and
+ // thus makes sure they get relocated.)
+ for (CallBase *Call : ToUpdate) {
+ SmallVector<Value *, 64> DeoptValues;
+
+ for (Value *Arg : GetDeoptBundleOperands(Call)) {
+ assert(!isUnhandledGCPointerType(Arg->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(Arg->getType()))
+ DeoptValues.push_back(Arg);
+ }
+
+ insertUseHolderAfter(Call, DeoptValues, Holders);
+ }
+
+ SmallVector<PartiallyConstructedSafepointRecord, 64> Records(ToUpdate.size());
+
+ // A) Identify all gc pointers which are statically live at the given call
+ // site.
+ findLiveReferences(F, DT, ToUpdate, Records);
+
+ // B) Find the base pointers for each live pointer
+ /* scope for caching */ {
+ // Cache the 'defining value' relation used in the computation and
+ // insertion of base phis and selects. This ensures that we don't insert
+ // large numbers of duplicate base_phis.
+ DefiningValueMapTy DVCache;
+
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &info = Records[i];
+ findBasePointers(DT, DVCache, ToUpdate[i], info);
+ }
+ } // end of cache scope
+
+ // The base phi insertion logic (for any safepoint) may have inserted new
+ // instructions which are now live at some safepoint. The simplest such
+ // example is:
+ // loop:
+ // phi a <-- will be a new base_phi here
+ // safepoint 1 <-- that needs to be live here
+ // gep a + 1
+ // safepoint 2
+ // br loop
+ // We insert some dummy calls after each safepoint to definitely hold live
+ // the base pointers which were identified for that safepoint. We'll then
+ // ask liveness for _every_ base inserted to see what is now live. Then we
+ // remove the dummy calls.
+ Holders.reserve(Holders.size() + Records.size());
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ SmallVector<Value *, 128> Bases;
+ for (auto Pair : Info.PointerToBase)
+ Bases.push_back(Pair.second);
+
+ insertUseHolderAfter(ToUpdate[i], Bases, Holders);
+ }
+
+ // By selecting base pointers, we've effectively inserted new uses. Thus, we
+ // need to rerun liveness. We may *also* have inserted new defs, but that's
+ // not the key issue.
+ recomputeLiveInValues(F, DT, ToUpdate, Records);
+
+ if (PrintBasePointers) {
+ for (auto &Info : Records) {
+ errs() << "Base Pairs: (w/Relocation)\n";
+ for (auto Pair : Info.PointerToBase) {
+ errs() << " derived ";
+ Pair.first->printAsOperand(errs(), false);
+ errs() << " base ";
+ Pair.second->printAsOperand(errs(), false);
+ errs() << "\n";
+ }
+ }
+ }
+
+ // It is possible that non-constant live variables have a constant base. For
+ // example, a GEP with a variable offset from a global. In this case we can
+ // remove it from the liveset. We already don't add constants to the liveset
+ // because we assume they won't move at runtime and the GC doesn't need to be
+ // informed about them. The same reasoning applies if the base is constant.
+ // Note that the relocation placement code relies on this filtering for
+ // correctness as it expects the base to be in the liveset, which isn't true
+ // if the base is constant.
+ for (auto &Info : Records)
+ for (auto &BasePair : Info.PointerToBase)
+ if (isa<Constant>(BasePair.second))
+ Info.LiveSet.remove(BasePair.first);
+
+ for (CallInst *CI : Holders)
+ CI->eraseFromParent();
+
+ Holders.clear();
+
+ // In order to reduce the live set of a statepoint we might choose to rematerialize
+ // some values instead of relocating them. This is purely an optimization and
+ // does not influence correctness.
+ for (size_t i = 0; i < Records.size(); i++)
+ rematerializeLiveValues(ToUpdate[i], Records[i], TTI);
+
+ // We need this to safely RAUW and delete call or invoke return values that
+ // may themselves be live over a statepoint. For details, please see usage in
+ // makeStatepointExplicitImpl.
+ std::vector<DeferredReplacement> Replacements;
+
+ // Now run through and replace the existing statepoints with new ones with
+ // the live variables listed. We do not yet update uses of the values being
+ // relocated. We have references to live variables that need to
+ // survive to the last iteration of this loop. (By construction, the
+ // previous statepoint cannot be a live variable, thus we can remove
+ // the old statepoint calls as we go.)
+ for (size_t i = 0; i < Records.size(); i++)
+ makeStatepointExplicit(DT, ToUpdate[i], Records[i], Replacements);
+
+ ToUpdate.clear(); // prevent accidental use of invalid calls.
+
+ for (auto &PR : Replacements)
+ PR.doReplacement();
+
+ Replacements.clear();
+
+ for (auto &Info : Records) {
+ // These live sets may contain stale Value pointers, since we replaced calls
+ // with operand bundles with calls wrapped in gc.statepoint, and some of
+ // those calls may have been def'ing live gc pointers. Clear these out to
+ // avoid accidentally using them.
+ //
+ // TODO: We should create a separate data structure that does not contain
+ // these live sets, and migrate to using that data structure from this point
+ // onward.
+ Info.LiveSet.clear();
+ Info.PointerToBase.clear();
+ }
+
+ // Do all the fixups of the original live variables to their relocated selves
+ SmallVector<Value *, 128> Live;
+ for (size_t i = 0; i < Records.size(); i++) {
+ PartiallyConstructedSafepointRecord &Info = Records[i];
+
+ // We can't simply save the live set from the original insertion. One of
+ // the live values might be the result of a call which needs a safepoint.
+ // That Value* no longer exists and we need to use the new gc_result.
+ // Thankfully, the live set is embedded in the statepoint (and updated), so
+ // we just grab that.
llvm::append_range(Live, Info.StatepointToken->gc_args());
-#ifndef NDEBUG
- // Do some basic sanity checks on our liveness results before performing
- // relocation. Relocation can and will turn mistakes in liveness results
- // into nonsensical code which is much harder to debug.
- // TODO: It would be nice to test consistency as well
- assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
- "statepoint must be reachable or liveness is meaningless");
- for (Value *V : Info.StatepointToken->gc_args()) {
- if (!isa<Instruction>(V))
- // Non-instruction values trivially dominate all possible uses
- continue;
- auto *LiveInst = cast<Instruction>(V);
- assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
- "unreachable values should never be live");
- assert(DT.dominates(LiveInst, Info.StatepointToken) &&
- "basic SSA liveness expectation violated by liveness analysis");
- }
-#endif
- }
- unique_unsorted(Live);
-
-#ifndef NDEBUG
- // sanity check
- for (auto *Ptr : Live)
- assert(isHandledGCPointerType(Ptr->getType()) &&
- "must be a gc pointer type");
-#endif
-
- relocationViaAlloca(F, DT, Live, Records);
- return !Records.empty();
-}
-
-// Handles both return values and arguments for Functions and calls.
-template <typename AttrHolder>
-static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
- unsigned Index) {
- AttrBuilder R;
- if (AH.getDereferenceableBytes(Index))
- R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
- AH.getDereferenceableBytes(Index)));
- if (AH.getDereferenceableOrNullBytes(Index))
- R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
- AH.getDereferenceableOrNullBytes(Index)));
- if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
- R.addAttribute(Attribute::NoAlias);
-
- if (!R.empty())
- AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
-}
-
-static void stripNonValidAttributesFromPrototype(Function &F) {
- LLVMContext &Ctx = F.getContext();
-
- for (Argument &A : F.args())
- if (isa<PointerType>(A.getType()))
- RemoveNonValidAttrAtIndex(Ctx, F,
- A.getArgNo() + AttributeList::FirstArgIndex);
-
- if (isa<PointerType>(F.getReturnType()))
- RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
-}
-
-/// Certain metadata on instructions are invalid after running RS4GC.
-/// Optimizations that run after RS4GC can incorrectly use this metadata to
-/// optimize functions. We drop such metadata on the instruction.
-static void stripInvalidMetadataFromInstruction(Instruction &I) {
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- return;
- // These are the metadata kinds that are still valid on loads and stores
- // after RS4GC.
- // The metadata implying dereferenceability and noalias are (conservatively)
- // dropped. This is because semantically, after RewriteStatepointsForGC runs,
- // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
- // touch the entire heap including noalias objects. Note: The reasoning is
- // same as stripping the dereferenceability and noalias attributes that are
- // analogous to the metadata counterparts.
- // We also drop the invariant.load metadata on the load because that metadata
- // implies the address operand to the load points to memory that is never
- // changed once it became dereferenceable. This is no longer true after RS4GC.
- // Similar reasoning applies to invariant.group metadata, which applies to
- // loads within a group.
- unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_alias_scope,
- LLVMContext::MD_nontemporal,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_align,
- LLVMContext::MD_type};
-
- // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
- I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
-}
-
-static void stripNonValidDataFromBody(Function &F) {
- if (F.empty())
- return;
-
- LLVMContext &Ctx = F.getContext();
- MDBuilder Builder(Ctx);
-
- // Set of invariant.start instructions that we need to remove.
- // Use this to avoid invalidating the instruction iterator.
- SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
-
- for (Instruction &I : instructions(F)) {
- // invariant.start on memory location implies that the referenced memory
- // location is constant and unchanging. This is no longer true after
- // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
- // which frees the entire heap and the presence of invariant.start allows
- // the optimizer to sink the load of a memory location past a statepoint,
- // which is incorrect.
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- InvariantStartInstructions.push_back(II);
- continue;
- }
-
- if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
- MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
- I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
- }
-
- stripInvalidMetadataFromInstruction(I);
-
- if (auto *Call = dyn_cast<CallBase>(&I)) {
- for (int i = 0, e = Call->arg_size(); i != e; i++)
- if (isa<PointerType>(Call->getArgOperand(i)->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call,
- i + AttributeList::FirstArgIndex);
- if (isa<PointerType>(Call->getType()))
- RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex);
- }
- }
-
- // Delete the invariant.start instructions and RAUW undef.
- for (auto *II : InvariantStartInstructions) {
- II->replaceAllUsesWith(UndefValue::get(II->getType()));
- II->eraseFromParent();
- }
-}
-
-/// Returns true if this function should be rewritten by this pass. The main
-/// point of this function is as an extension point for custom logic.
-static bool shouldRewriteStatepointsIn(Function &F) {
- // TODO: This should check the GCStrategy
- if (F.hasGC()) {
- const auto &FunctionGCName = F.getGC();
- const StringRef StatepointExampleName("statepoint-example");
- const StringRef CoreCLRName("coreclr");
- return (StatepointExampleName == FunctionGCName) ||
- (CoreCLRName == FunctionGCName);
- } else
- return false;
-}
-
-static void stripNonValidData(Module &M) {
-#ifndef NDEBUG
- assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
-#endif
-
- for (Function &F : M)
- stripNonValidAttributesFromPrototype(F);
-
- for (Function &F : M)
- stripNonValidDataFromBody(F);
-}
-
-bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
- TargetTransformInfo &TTI,
- const TargetLibraryInfo &TLI) {
- assert(!F.isDeclaration() && !F.empty() &&
- "need function body to rewrite statepoints in");
- assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
-
- auto NeedsRewrite = [&TLI](Instruction &I) {
+#ifndef NDEBUG
+ // Do some basic sanity checks on our liveness results before performing
+ // relocation. Relocation can and will turn mistakes in liveness results
+ // into nonsensical code which is much harder to debug.
+ // TODO: It would be nice to test consistency as well
+ assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
+ "statepoint must be reachable or liveness is meaningless");
+ for (Value *V : Info.StatepointToken->gc_args()) {
+ if (!isa<Instruction>(V))
+ // Non-instruction values trivially dominate all possible uses
+ continue;
+ auto *LiveInst = cast<Instruction>(V);
+ assert(DT.isReachableFromEntry(LiveInst->getParent()) &&
+ "unreachable values should never be live");
+ assert(DT.dominates(LiveInst, Info.StatepointToken) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+#endif
+ }
+ unique_unsorted(Live);
+
+#ifndef NDEBUG
+ // sanity check
+ for (auto *Ptr : Live)
+ assert(isHandledGCPointerType(Ptr->getType()) &&
+ "must be a gc pointer type");
+#endif
+
+ relocationViaAlloca(F, DT, Live, Records);
+ return !Records.empty();
+}
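
The unique_unsorted call above deduplicates the Live vector while keeping the original order of the entries. A minimal standalone sketch of that pattern (an illustrative helper, not the one used by the pass):

#include <unordered_set>
#include <vector>

// Drop duplicate elements from Vec, keeping the first occurrence of each and
// preserving the original relative order.
template <typename T>
void uniqueUnsorted(std::vector<T> &Vec) {
  std::unordered_set<T> Seen;
  std::vector<T> Result;
  Result.reserve(Vec.size());
  for (const T &V : Vec)
    if (Seen.insert(V).second) // insert() reports whether V was new
      Result.push_back(V);
  Vec = std::move(Result);
}
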
+
+// Handles both return values and arguments for Functions and calls.
+template <typename AttrHolder>
+static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
+ unsigned Index) {
+ AttrBuilder R;
+ if (AH.getDereferenceableBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::Dereferenceable,
+ AH.getDereferenceableBytes(Index)));
+ if (AH.getDereferenceableOrNullBytes(Index))
+ R.addAttribute(Attribute::get(Ctx, Attribute::DereferenceableOrNull,
+ AH.getDereferenceableOrNullBytes(Index)));
+ if (AH.getAttributes().hasAttribute(Index, Attribute::NoAlias))
+ R.addAttribute(Attribute::NoAlias);
+
+ if (!R.empty())
+ AH.setAttributes(AH.getAttributes().removeAttributes(Ctx, Index, R));
+}
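
The helper above first gathers everything to drop into an AttrBuilder and rewrites the attribute list once, only if something was collected. A simplified standalone sketch of the same build-then-apply pattern over a plain bitmask (the Attr enum and AttrSet alias are illustrative, not LLVM's attribute API):

#include <cstdint>

// Illustrative attribute kinds, mirroring the three the pass strips.
enum Attr : uint32_t {
  Dereferenceable       = 1u << 0,
  DereferenceableOrNull = 1u << 1,
  NoAlias               = 1u << 2,
};

using AttrSet = uint32_t; // bitmask of Attr values

// Drop the GC-invalid attributes from AS, rewriting it only if needed.
inline void removeNonValidAttrs(AttrSet &AS) {
  AttrSet ToRemove = 0;
  if (AS & Dereferenceable)
    ToRemove |= Dereferenceable;
  if (AS & DereferenceableOrNull)
    ToRemove |= DereferenceableOrNull;
  if (AS & NoAlias)
    ToRemove |= NoAlias;

  if (ToRemove) // mirrors the "if (!R.empty())" guard above
    AS &= ~ToRemove;
}
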
+
+static void stripNonValidAttributesFromPrototype(Function &F) {
+ LLVMContext &Ctx = F.getContext();
+
+ for (Argument &A : F.args())
+ if (isa<PointerType>(A.getType()))
+ RemoveNonValidAttrAtIndex(Ctx, F,
+ A.getArgNo() + AttributeList::FirstArgIndex);
+
+ if (isa<PointerType>(F.getReturnType()))
+ RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
+}
+
+/// Certain metadata on instructions are invalid after running RS4GC.
+/// Optimizations that run after RS4GC can incorrectly use this metadata to
+/// optimize functions. We drop such metadata on the instruction.
+static void stripInvalidMetadataFromInstruction(Instruction &I) {
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
+ return;
+ // These are the metadata kinds that are still valid on loads and stores after
+ // RS4GC.
+ // The metadata implying dereferenceability and noalias are (conservatively)
+ // dropped. This is because semantically, after RewriteStatepointsForGC runs,
+ // all calls to gc.statepoint "free" the entire heap. Also, gc.statepoint can
+ // touch the entire heap including noalias objects. Note: The reasoning is
+ // the same as for stripping the dereferenceability and noalias attributes
+ // that are analogous to the metadata counterparts.
+ // We also drop the invariant.load metadata on the load because that metadata
+ // implies the address operand to the load points to memory that is never
+ // changed once it became dereferenceable. This is no longer true after RS4GC.
+ // Similar reasoning applies to invariant.group metadata, which applies to
+ // loads within a group.
+ unsigned ValidMetadataAfterRS4GC[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_alias_scope,
+ LLVMContext::MD_nontemporal,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_align,
+ LLVMContext::MD_type};
+
+ // Drops all metadata on the instruction other than ValidMetadataAfterRS4GC.
+ I.dropUnknownNonDebugMetadata(ValidMetadataAfterRS4GC);
+}
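
The function above works from a small keep-list of metadata kinds and drops everything else. A standalone sketch of that keep-list filtering over an ordinary map keyed by metadata kind ID (the MetadataMap alias is illustrative only):

#include <algorithm>
#include <array>
#include <map>
#include <string>

using MetadataMap = std::map<unsigned, std::string>; // kind ID -> payload

// Erase every metadata entry whose kind ID is not in the keep list.
inline void dropUnknownMetadata(MetadataMap &MD,
                                const std::array<unsigned, 7> &Keep) {
  for (auto It = MD.begin(); It != MD.end();) {
    bool Kept = std::find(Keep.begin(), Keep.end(), It->first) != Keep.end();
    It = Kept ? std::next(It) : MD.erase(It); // erase() returns the next entry
  }
}
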
+
+static void stripNonValidDataFromBody(Function &F) {
+ if (F.empty())
+ return;
+
+ LLVMContext &Ctx = F.getContext();
+ MDBuilder Builder(Ctx);
+
+ // Set of invariant.start instructions that we need to remove.
+ // Use this to avoid invalidating the instruction iterator.
+ SmallVector<IntrinsicInst*, 12> InvariantStartInstructions;
+
+ for (Instruction &I : instructions(F)) {
+ // invariant.start on memory location implies that the referenced memory
+ // location is constant and unchanging. This is no longer true after
+ // RewriteStatepointsForGC runs because there can be calls to gc.statepoint
+ // which frees the entire heap and the presence of invariant.start allows
+ // the optimizer to sink the load of a memory location past a statepoint,
+ // which is incorrect.
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ InvariantStartInstructions.push_back(II);
+ continue;
+ }
+
+ if (MDNode *Tag = I.getMetadata(LLVMContext::MD_tbaa)) {
+ MDNode *MutableTBAA = Builder.createMutableTBAAAccessTag(Tag);
+ I.setMetadata(LLVMContext::MD_tbaa, MutableTBAA);
+ }
+
+ stripInvalidMetadataFromInstruction(I);
+
+ if (auto *Call = dyn_cast<CallBase>(&I)) {
+ for (int i = 0, e = Call->arg_size(); i != e; i++)
+ if (isa<PointerType>(Call->getArgOperand(i)->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, *Call,
+ i + AttributeList::FirstArgIndex);
+ if (isa<PointerType>(Call->getType()))
+ RemoveNonValidAttrAtIndex(Ctx, *Call, AttributeList::ReturnIndex);
+ }
+ }
+
+ // Delete the invariant.start instructions and RAUW undef.
+ for (auto *II : InvariantStartInstructions) {
+ II->replaceAllUsesWith(UndefValue::get(II->getType()));
+ II->eraseFromParent();
+ }
+}
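
Note that the loop above only records the invariant.start calls and erases them after the walk over the instructions has finished, so the traversal never deletes the element it is standing on. A minimal standalone sketch of that collect-then-erase pattern:

#include <list>
#include <vector>

// Remove every negative value from L. As in the pass above, the elements to
// delete are collected during the walk and erased in a second pass.
inline void eraseNegatives(std::list<int> &L) {
  std::vector<std::list<int>::iterator> ToErase;
  for (auto It = L.begin(); It != L.end(); ++It)
    if (*It < 0)
      ToErase.push_back(It);

  for (auto It : ToErase)
    L.erase(It); // std::list::erase invalidates only the erased node
}
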
+
+/// Returns true if this function should be rewritten by this pass. The main
+/// point of this function is as an extension point for custom logic.
+static bool shouldRewriteStatepointsIn(Function &F) {
+ // TODO: This should check the GCStrategy
+ if (F.hasGC()) {
+ const auto &FunctionGCName = F.getGC();
+ const StringRef StatepointExampleName("statepoint-example");
+ const StringRef CoreCLRName("coreclr");
+ return (StatepointExampleName == FunctionGCName) ||
+ (CoreCLRName == FunctionGCName);
+ } else
+ return false;
+}
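
The check above keys purely off the function's GC strategy name, currently "statepoint-example" and "coreclr". A hypothetical standalone helper with the same shape, written over an explicit allow-list so additional strategy names are easy to add:

#include <string>
#include <unordered_set>

// Returns true if the named GC strategy is one this rewrite understands.
inline bool usesSupportedGC(const std::string &GCName) {
  static const std::unordered_set<std::string> Supported = {
      "statepoint-example", "coreclr"};
  return Supported.count(GCName) != 0;
}
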
+
+static void stripNonValidData(Module &M) {
+#ifndef NDEBUG
+ assert(llvm::any_of(M, shouldRewriteStatepointsIn) && "precondition!");
+#endif
+
+ for (Function &F : M)
+ stripNonValidAttributesFromPrototype(F);
+
+ for (Function &F : M)
+ stripNonValidDataFromBody(F);
+}
+
+bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
+ TargetTransformInfo &TTI,
+ const TargetLibraryInfo &TLI) {
+ assert(!F.isDeclaration() && !F.empty() &&
+ "need function body to rewrite statepoints in");
+ assert(shouldRewriteStatepointsIn(F) && "mismatch in rewrite decision");
+
+ auto NeedsRewrite = [&TLI](Instruction &I) {
if (const auto *Call = dyn_cast<CallBase>(&I)) {
if (isa<GCStatepointInst>(Call))
return false;
@@ -2696,322 +2696,322 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
}
return true;
}
- return false;
- };
-
- // Delete any unreachable statepoints so that we don't have unrewritten
- // statepoints surviving this pass. This makes testing easier and the
- // resulting IR less confusing to human readers.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- bool MadeChange = removeUnreachableBlocks(F, &DTU);
- // Flush the Dominator Tree.
- DTU.getDomTree();
-
- // Gather all the statepoints which need to be rewritten. Be careful to only
- // consider those in reachable code since we need to ask dominance queries
- // when rewriting. We'll delete the unreachable ones in a moment.
- SmallVector<CallBase *, 64> ParsePointNeeded;
- for (Instruction &I : instructions(F)) {
- // TODO: only the ones with the flag set!
- if (NeedsRewrite(I)) {
- // NOTE removeUnreachableBlocks() is stronger than
- // DominatorTree::isReachableFromEntry(). In other words
- // removeUnreachableBlocks can remove some blocks for which
- // isReachableFromEntry() returns true.
- assert(DT.isReachableFromEntry(I.getParent()) &&
- "no unreachable blocks expected");
- ParsePointNeeded.push_back(cast<CallBase>(&I));
- }
- }
-
- // Return early if no work to do.
- if (ParsePointNeeded.empty())
- return MadeChange;
-
- // As a prepass, go ahead and aggressively destroy single entry phi nodes.
- // These are created by LCSSA. They have the effect of increasing the size
- // of liveness sets for no good reason. It may be harder to do this post
- // insertion since relocations and base phis can confuse things.
- for (BasicBlock &BB : F)
+ return false;
+ };
+
+ // Delete any unreachable statepoints so that we don't have unrewritten
+ // statepoints surviving this pass. This makes testing easier and the
+ // resulting IR less confusing to human readers.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ bool MadeChange = removeUnreachableBlocks(F, &DTU);
+ // Flush the Dominator Tree.
+ DTU.getDomTree();
+
+ // Gather all the statepoints which need to be rewritten. Be careful to only
+ // consider those in reachable code since we need to ask dominance queries
+ // when rewriting. We'll delete the unreachable ones in a moment.
+ SmallVector<CallBase *, 64> ParsePointNeeded;
+ for (Instruction &I : instructions(F)) {
+ // TODO: only the ones with the flag set!
+ if (NeedsRewrite(I)) {
+ // NOTE removeUnreachableBlocks() is stronger than
+ // DominatorTree::isReachableFromEntry(). In other words
+ // removeUnreachableBlocks can remove some blocks for which
+ // isReachableFromEntry() returns true.
+ assert(DT.isReachableFromEntry(I.getParent()) &&
+ "no unreachable blocks expected");
+ ParsePointNeeded.push_back(cast<CallBase>(&I));
+ }
+ }
+
+ // Return early if no work to do.
+ if (ParsePointNeeded.empty())
+ return MadeChange;
+
+ // As a prepass, go ahead and aggressively destroy single entry phi nodes.
+ // These are created by LCSSA. They have the effect of increasing the size
+ // of liveness sets for no good reason. It may be harder to do this post
+ // insertion since relocations and base phis can confuse things.
+ for (BasicBlock &BB : F)
if (BB.getUniquePredecessor())
MadeChange |= FoldSingleEntryPHINodes(&BB);
-
- // Before we start introducing relocations, we want to tweak the IR a bit to
- // avoid unfortunate code generation effects. The main example is that we
- // want to try to make sure the comparison feeding a branch is after any
- // safepoints. Otherwise, we end up with a comparison of pre-relocation
- // values feeding a branch after relocation. This is semantically correct,
- // but results in extra register pressure since both the pre-relocation and
- // post-relocation copies must be available in registers. For code without
- // relocations this is handled elsewhere, but teaching the scheduler to
- // reverse the transform we're about to do would be slightly complex.
- // Note: This may extend the live range of the inputs to the icmp and thus
- // increase the liveset of any statepoint we move over. This is profitable
- // as long as all statepoints are in rare blocks. If we had in-register
- // lowering for live values this would be a much safer transform.
- auto getConditionInst = [](Instruction *TI) -> Instruction * {
- if (auto *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional())
- return dyn_cast<Instruction>(BI->getCondition());
- // TODO: Extend this to handle switches
- return nullptr;
- };
- for (BasicBlock &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (auto *Cond = getConditionInst(TI))
- // TODO: Handle more than just ICmps here. We should be able to move
- // most instructions without side effects or memory access.
- if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
- MadeChange = true;
- Cond->moveBefore(TI);
- }
- }
-
- // Nasty workaround - The base computation code in the main algorithm doesn't
- // consider the fact that a GEP can be used to convert a scalar to a vector.
- // The right fix for this is to integrate GEPs into the base rewriting
- // algorithm properly; this is just a short-term workaround to prevent
- // crashes by canonicalizing such GEPs into fully vector GEPs.
- for (Instruction &I : instructions(F)) {
- if (!isa<GetElementPtrInst>(I))
- continue;
-
- unsigned VF = 0;
- for (unsigned i = 0; i < I.getNumOperands(); i++)
- if (auto *OpndVTy = dyn_cast<VectorType>(I.getOperand(i)->getType())) {
- assert(VF == 0 ||
- VF == cast<FixedVectorType>(OpndVTy)->getNumElements());
- VF = cast<FixedVectorType>(OpndVTy)->getNumElements();
- }
-
- // It's the vector to scalar traversal through the pointer operand which
- // confuses base pointer rewriting, so limit ourselves to that case.
- if (!I.getOperand(0)->getType()->isVectorTy() && VF != 0) {
- IRBuilder<> B(&I);
- auto *Splat = B.CreateVectorSplat(VF, I.getOperand(0));
- I.setOperand(0, Splat);
- MadeChange = true;
- }
- }
-
- MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
- return MadeChange;
-}
-
-// liveness computation via standard dataflow
-// -------------------------------------------------------------------
-
-// TODO: Consider using bitvectors for liveness, the set of potentially
-// interesting values should be small and easy to pre-compute.
-
-/// Compute the live-in set for the location rbegin starting from
-/// the live-out set of the basic block
-static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
- BasicBlock::reverse_iterator End,
- SetVector<Value *> &LiveTmp) {
- for (auto &I : make_range(Begin, End)) {
- // KILL/Def - Remove this definition from LiveIn
- LiveTmp.remove(&I);
-
- // Don't consider *uses* in PHI nodes; we handle their contribution to
- // predecessor blocks when we seed the LiveOut sets
- if (isa<PHINode>(I))
- continue;
-
- // USE - Add to the LiveIn set for this instruction
- for (Value *V : I.operands()) {
- assert(!isUnhandledGCPointerType(V->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
- // The choice to exclude all things constant here is slightly subtle.
- // There are two independent reasons:
- // - We assume that things which are constant (from LLVM's definition)
- // do not move at runtime. For example, the address of a global
- // variable is fixed, even though its contents may not be.
- // - Second, we can't disallow arbitrary inttoptr constants even
- // if the language frontend does. Optimization passes are free to
- // locally exploit facts without respect to global reachability. This
- // can create sections of code which are dynamically unreachable and
- // contain just about anything. (see constants.ll in tests)
- LiveTmp.insert(V);
- }
- }
- }
-}
-
-static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
- for (BasicBlock *Succ : successors(BB)) {
- for (auto &I : *Succ) {
- PHINode *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- break;
-
- Value *V = PN->getIncomingValueForBlock(BB);
- assert(!isUnhandledGCPointerType(V->getType()) &&
- "support for FCA unimplemented");
- if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
- LiveTmp.insert(V);
- }
- }
-}
-
-static SetVector<Value *> computeKillSet(BasicBlock *BB) {
- SetVector<Value *> KillSet;
- for (Instruction &I : *BB)
- if (isHandledGCPointerType(I.getType()))
- KillSet.insert(&I);
- return KillSet;
-}
-
-#ifndef NDEBUG
-/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
-/// sanity check for the liveness computation.
-static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
- Instruction *TI, bool TermOkay = false) {
- for (Value *V : Live) {
- if (auto *I = dyn_cast<Instruction>(V)) {
- // The terminator can be a member of the LiveOut set. LLVM's definition
- // of instruction dominance states that V does not dominate itself. As
- // such, we need to special case this to allow it.
- if (TermOkay && TI == I)
- continue;
- assert(DT.dominates(I, TI) &&
- "basic SSA liveness expectation violated by liveness analysis");
- }
- }
-}
-
-/// Check that all the liveness sets used during the computation of liveness
-/// obey basic SSA properties. This is useful for finding cases where we miss
-/// a def.
-static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
- BasicBlock &BB) {
- checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
- checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
- checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
-}
-#endif
-
-static void computeLiveInValues(DominatorTree &DT, Function &F,
- GCPtrLivenessData &Data) {
- SmallSetVector<BasicBlock *, 32> Worklist;
-
- // Seed the liveness for each individual block
- for (BasicBlock &BB : F) {
- Data.KillSet[&BB] = computeKillSet(&BB);
- Data.LiveSet[&BB].clear();
- computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
-
-#ifndef NDEBUG
- for (Value *Kill : Data.KillSet[&BB])
- assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
-#endif
-
- Data.LiveOut[&BB] = SetVector<Value *>();
- computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
- Data.LiveIn[&BB] = Data.LiveSet[&BB];
- Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
- Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
- if (!Data.LiveIn[&BB].empty())
- Worklist.insert(pred_begin(&BB), pred_end(&BB));
- }
-
- // Propagate that liveness until stable
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
-
- // Compute our new liveout set, then exit early if it hasn't changed despite
- // the contribution of our successor.
- SetVector<Value *> LiveOut = Data.LiveOut[BB];
- const auto OldLiveOutSize = LiveOut.size();
- for (BasicBlock *Succ : successors(BB)) {
- assert(Data.LiveIn.count(Succ));
- LiveOut.set_union(Data.LiveIn[Succ]);
- }
- // assert OldLiveOut is a subset of LiveOut
- if (OldLiveOutSize == LiveOut.size()) {
- // If the sets are the same size, then we didn't actually add anything
- // when unioning our successors LiveIn. Thus, the LiveIn of this block
- // hasn't changed.
- continue;
- }
- Data.LiveOut[BB] = LiveOut;
-
- // Apply the effects of this basic block
- SetVector<Value *> LiveTmp = LiveOut;
- LiveTmp.set_union(Data.LiveSet[BB]);
- LiveTmp.set_subtract(Data.KillSet[BB]);
-
- assert(Data.LiveIn.count(BB));
- const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
- // assert: OldLiveIn is a subset of LiveTmp
- if (OldLiveIn.size() != LiveTmp.size()) {
- Data.LiveIn[BB] = LiveTmp;
- Worklist.insert(pred_begin(BB), pred_end(BB));
- }
- } // while (!Worklist.empty())
-
-#ifndef NDEBUG
- // Sanity check our output against SSA properties. This helps catch any
- // missing kills during the above iteration.
- for (BasicBlock &BB : F)
- checkBasicSSA(DT, Data, BB);
-#endif
-}
-
-static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
- StatepointLiveSetTy &Out) {
- BasicBlock *BB = Inst->getParent();
-
- // Note: The copy is intentional and required
- assert(Data.LiveOut.count(BB));
- SetVector<Value *> LiveOut = Data.LiveOut[BB];
-
- // We want to handle the statepoint itself oddly. Its
- // call result is not live (normal), nor are its arguments
- // (unless they're used again later). This adjustment is
- // specifically what we need to relocate
- computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
- LiveOut);
- LiveOut.remove(Inst);
- Out.insert(LiveOut.begin(), LiveOut.end());
-}
-
-static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
- CallBase *Call,
- PartiallyConstructedSafepointRecord &Info) {
- StatepointLiveSetTy Updated;
- findLiveSetAtInst(Call, RevisedLivenessData, Updated);
-
- // We may have base pointers which are now live that weren't before. We need
- // to update the PointerToBase structure to reflect this.
- for (auto V : Updated)
- if (Info.PointerToBase.insert({V, V}).second) {
- assert(isKnownBaseResult(V) &&
- "Can't find base for unexpected live value!");
- continue;
- }
-
-#ifndef NDEBUG
- for (auto V : Updated)
- assert(Info.PointerToBase.count(V) &&
- "Must be able to find base for live value!");
-#endif
-
- // Remove any stale base mappings - this can happen since our liveness is
- // more precise than the one inherent in the base pointer analysis.
- DenseSet<Value *> ToErase;
- for (auto KVPair : Info.PointerToBase)
- if (!Updated.count(KVPair.first))
- ToErase.insert(KVPair.first);
-
- for (auto *V : ToErase)
- Info.PointerToBase.erase(V);
-
-#ifndef NDEBUG
- for (auto KVPair : Info.PointerToBase)
- assert(Updated.count(KVPair.first) && "record for non-live value");
-#endif
-
- Info.LiveSet = Updated;
-}
+
+ // Before we start introducing relocations, we want to tweak the IR a bit to
+ // avoid unfortunate code generation effects. The main example is that we
+ // want to try to make sure the comparison feeding a branch is after any
+ // safepoints. Otherwise, we end up with a comparison of pre-relocation
+ // values feeding a branch after relocation. This is semantically correct,
+ // but results in extra register pressure since both the pre-relocation and
+ // post-relocation copies must be available in registers. For code without
+ // relocations this is handled elsewhere, but teaching the scheduler to
+ // reverse the transform we're about to do would be slightly complex.
+ // Note: This may extend the live range of the inputs to the icmp and thus
+ // increase the liveset of any statepoint we move over. This is profitable
+ // as long as all statepoints are in rare blocks. If we had in-register
+ // lowering for live values this would be a much safer transform.
+ auto getConditionInst = [](Instruction *TI) -> Instruction * {
+ if (auto *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional())
+ return dyn_cast<Instruction>(BI->getCondition());
+ // TODO: Extend this to handle switches
+ return nullptr;
+ };
+ for (BasicBlock &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (auto *Cond = getConditionInst(TI))
+ // TODO: Handle more than just ICmps here. We should be able to move
+ // most instructions without side effects or memory access.
+ if (isa<ICmpInst>(Cond) && Cond->hasOneUse()) {
+ MadeChange = true;
+ Cond->moveBefore(TI);
+ }
+ }
+
+ // Nasty workaround - The base computation code in the main algorithm doesn't
+ // consider the fact that a GEP can be used to convert a scalar to a vector.
+ // The right fix for this is to integrate GEPs into the base rewriting
+ // algorithm properly; this is just a short-term workaround to prevent
+ // crashes by canonicalizing such GEPs into fully vector GEPs.
+ for (Instruction &I : instructions(F)) {
+ if (!isa<GetElementPtrInst>(I))
+ continue;
+
+ unsigned VF = 0;
+ for (unsigned i = 0; i < I.getNumOperands(); i++)
+ if (auto *OpndVTy = dyn_cast<VectorType>(I.getOperand(i)->getType())) {
+ assert(VF == 0 ||
+ VF == cast<FixedVectorType>(OpndVTy)->getNumElements());
+ VF = cast<FixedVectorType>(OpndVTy)->getNumElements();
+ }
+
+ // It's the vector to scalar traversal through the pointer operand which
+ // confuses base pointer rewriting, so limit ourselves to that case.
+ if (!I.getOperand(0)->getType()->isVectorTy() && VF != 0) {
+ IRBuilder<> B(&I);
+ auto *Splat = B.CreateVectorSplat(VF, I.getOperand(0));
+ I.setOperand(0, Splat);
+ MadeChange = true;
+ }
+ }
+
+ MadeChange |= insertParsePoints(F, DT, TTI, ParsePointNeeded);
+ return MadeChange;
+}
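
The GEP canonicalization above splats a scalar pointer operand out to the vector factor VF so that base pointer rewriting only ever sees fully vector GEPs. A toy standalone illustration of what a splat produces, over plain values instead of IR:

#include <vector>

// Replicate one scalar VF times, mirroring what CreateVectorSplat does for
// the scalar operand in the canonicalization above.
template <typename T>
std::vector<T> splat(const T &Scalar, unsigned VF) {
  return std::vector<T>(VF, Scalar);
}

// Example: splat(42, 4) yields {42, 42, 42, 42}.
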
+
+// liveness computation via standard dataflow
+// -------------------------------------------------------------------
+
+// TODO: Consider using bitvectors for liveness, the set of potentially
+// interesting values should be small and easy to pre-compute.
+
+/// Compute the live-in set for the location rbegin starting from
+/// the live-out set of the basic block
+static void computeLiveInValues(BasicBlock::reverse_iterator Begin,
+ BasicBlock::reverse_iterator End,
+ SetVector<Value *> &LiveTmp) {
+ for (auto &I : make_range(Begin, End)) {
+ // KILL/Def - Remove this definition from LiveIn
+ LiveTmp.remove(&I);
+
+ // Don't consider *uses* in PHI nodes; we handle their contribution to
+ // predecessor blocks when we seed the LiveOut sets
+ if (isa<PHINode>(I))
+ continue;
+
+ // USE - Add to the LiveIn set for this instruction
+ for (Value *V : I.operands()) {
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V)) {
+ // The choice to exclude all things constant here is slightly subtle.
+ // There are two independent reasons:
+ // - We assume that things which are constant (from LLVM's definition)
+ // do not move at runtime. For example, the address of a global
+ // variable is fixed, even though its contents may not be.
+ // - Second, we can't disallow arbitrary inttoptr constants even
+ // if the language frontend does. Optimization passes are free to
+ // locally exploit facts without respect to global reachability. This
+ // can create sections of code which are dynamically unreachable and
+ // contain just about anything. (see constants.ll in tests)
+ LiveTmp.insert(V);
+ }
+ }
+ }
+}
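
The routine above walks a block backwards, first killing the value defined by each instruction and then adding the tracked values it uses. A standalone sketch of that kill-then-use backward scan over a toy instruction record (the Inst struct is illustrative, not LLVM IR):

#include <set>
#include <string>
#include <vector>

// A toy instruction: one optional definition plus the values it reads.
struct Inst {
  std::string Def;               // value defined here ("" if none)
  std::vector<std::string> Uses; // values read here
};

// Given the live-out set of a block, compute its live-in set by scanning the
// instructions in reverse: kill the definition, then add the uses.
std::set<std::string> liveIn(const std::vector<Inst> &Block,
                             std::set<std::string> Live /* live-out */) {
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    if (!It->Def.empty())
      Live.erase(It->Def);       // KILL/Def
    for (const std::string &U : It->Uses)
      Live.insert(U);            // USE
  }
  return Live;                   // live-in of the block
}
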
+
+static void computeLiveOutSeed(BasicBlock *BB, SetVector<Value *> &LiveTmp) {
+ for (BasicBlock *Succ : successors(BB)) {
+ for (auto &I : *Succ) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+
+ Value *V = PN->getIncomingValueForBlock(BB);
+ assert(!isUnhandledGCPointerType(V->getType()) &&
+ "support for FCA unimplemented");
+ if (isHandledGCPointerType(V->getType()) && !isa<Constant>(V))
+ LiveTmp.insert(V);
+ }
+ }
+}
+
+static SetVector<Value *> computeKillSet(BasicBlock *BB) {
+ SetVector<Value *> KillSet;
+ for (Instruction &I : *BB)
+ if (isHandledGCPointerType(I.getType()))
+ KillSet.insert(&I);
+ return KillSet;
+}
+
+#ifndef NDEBUG
+/// Check that the items in 'Live' dominate 'TI'. This is used as a basic
+/// sanity check for the liveness computation.
+static void checkBasicSSA(DominatorTree &DT, SetVector<Value *> &Live,
+ Instruction *TI, bool TermOkay = false) {
+ for (Value *V : Live) {
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ // The terminator can be a member of the LiveOut set. LLVM's definition
+ // of instruction dominance states that V does not dominate itself. As
+ // such, we need to special case this to allow it.
+ if (TermOkay && TI == I)
+ continue;
+ assert(DT.dominates(I, TI) &&
+ "basic SSA liveness expectation violated by liveness analysis");
+ }
+ }
+}
+
+/// Check that all the liveness sets used during the computation of liveness
+/// obey basic SSA properties. This is useful for finding cases where we miss
+/// a def.
+static void checkBasicSSA(DominatorTree &DT, GCPtrLivenessData &Data,
+ BasicBlock &BB) {
+ checkBasicSSA(DT, Data.LiveSet[&BB], BB.getTerminator());
+ checkBasicSSA(DT, Data.LiveOut[&BB], BB.getTerminator(), true);
+ checkBasicSSA(DT, Data.LiveIn[&BB], BB.getTerminator());
+}
+#endif
+
+static void computeLiveInValues(DominatorTree &DT, Function &F,
+ GCPtrLivenessData &Data) {
+ SmallSetVector<BasicBlock *, 32> Worklist;
+
+ // Seed the liveness for each individual block
+ for (BasicBlock &BB : F) {
+ Data.KillSet[&BB] = computeKillSet(&BB);
+ Data.LiveSet[&BB].clear();
+ computeLiveInValues(BB.rbegin(), BB.rend(), Data.LiveSet[&BB]);
+
+#ifndef NDEBUG
+ for (Value *Kill : Data.KillSet[&BB])
+ assert(!Data.LiveSet[&BB].count(Kill) && "live set contains kill");
+#endif
+
+ Data.LiveOut[&BB] = SetVector<Value *>();
+ computeLiveOutSeed(&BB, Data.LiveOut[&BB]);
+ Data.LiveIn[&BB] = Data.LiveSet[&BB];
+ Data.LiveIn[&BB].set_union(Data.LiveOut[&BB]);
+ Data.LiveIn[&BB].set_subtract(Data.KillSet[&BB]);
+ if (!Data.LiveIn[&BB].empty())
+ Worklist.insert(pred_begin(&BB), pred_end(&BB));
+ }
+
+ // Propagate that liveness until stable
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+
+ // Compute our new liveout set, then exit early if it hasn't changed despite
+ // the contribution of our successor.
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+ const auto OldLiveOutSize = LiveOut.size();
+ for (BasicBlock *Succ : successors(BB)) {
+ assert(Data.LiveIn.count(Succ));
+ LiveOut.set_union(Data.LiveIn[Succ]);
+ }
+ // assert OldLiveOut is a subset of LiveOut
+ if (OldLiveOutSize == LiveOut.size()) {
+ // If the sets are the same size, then we didn't actually add anything
+ // when unioning our successors LiveIn. Thus, the LiveIn of this block
+ // hasn't changed.
+ continue;
+ }
+ Data.LiveOut[BB] = LiveOut;
+
+ // Apply the effects of this basic block
+ SetVector<Value *> LiveTmp = LiveOut;
+ LiveTmp.set_union(Data.LiveSet[BB]);
+ LiveTmp.set_subtract(Data.KillSet[BB]);
+
+ assert(Data.LiveIn.count(BB));
+ const SetVector<Value *> &OldLiveIn = Data.LiveIn[BB];
+ // assert: OldLiveIn is a subset of LiveTmp
+ if (OldLiveIn.size() != LiveTmp.size()) {
+ Data.LiveIn[BB] = LiveTmp;
+ Worklist.insert(pred_begin(BB), pred_end(BB));
+ }
+ } // while (!Worklist.empty())
+
+#ifndef NDEBUG
+ // Sanity check our output against SSA properties. This helps catch any
+ // missing kills during the above iteration.
+ for (BasicBlock &BB : F)
+ checkBasicSSA(DT, Data, BB);
+#endif
+}
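
The propagation loop above is a standard backward dataflow fixed point: LiveOut of a block is the union of its successors' LiveIn, LiveIn is the block's own upward-exposed uses plus everything in LiveOut that the block does not define, and a block whose LiveIn grows pushes its predecessors back onto the worklist. A compact standalone sketch of the same iteration over a toy CFG keyed by integer block IDs (assuming every block has an entry in each of the maps):

#include <map>
#include <set>
#include <string>
#include <vector>

using Block = int;
using ValueSet = std::set<std::string>;

struct CFG {
  std::map<Block, std::vector<Block>> Succs, Preds;
  std::map<Block, ValueSet> Gen, Kill; // per-block use/def summaries
};

// Iterate LiveIn/LiveOut to a fixed point with a simple worklist.
void computeLiveness(const CFG &G, std::map<Block, ValueSet> &LiveIn,
                     std::map<Block, ValueSet> &LiveOut) {
  std::vector<Block> Worklist;
  for (const auto &KV : G.Succs)
    Worklist.push_back(KV.first);

  while (!Worklist.empty()) {
    Block B = Worklist.back();
    Worklist.pop_back();

    ValueSet Out; // union of the successors' live-in sets
    for (Block S : G.Succs.at(B))
      Out.insert(LiveIn[S].begin(), LiveIn[S].end());

    ValueSet In = Out;                 // apply this block's effects:
    for (const std::string &V : G.Kill.at(B))
      In.erase(V);                     // remove values the block defines
    In.insert(G.Gen.at(B).begin(), G.Gen.at(B).end()); // add upward-exposed uses

    LiveOut[B] = Out;
    if (In != LiveIn[B]) {             // grew: revisit the predecessors
      LiveIn[B] = std::move(In);
      for (Block P : G.Preds.at(B))
        Worklist.push_back(P);
    }
  }
}
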
+
+static void findLiveSetAtInst(Instruction *Inst, GCPtrLivenessData &Data,
+ StatepointLiveSetTy &Out) {
+ BasicBlock *BB = Inst->getParent();
+
+ // Note: The copy is intentional and required
+ assert(Data.LiveOut.count(BB));
+ SetVector<Value *> LiveOut = Data.LiveOut[BB];
+
+ // We want to handle the statepoint itself oddly. Its
+ // call result is not live (normal), nor are its arguments
+ // (unless they're used again later). This adjustment is
+ // specifically what we need to relocate
+ computeLiveInValues(BB->rbegin(), ++Inst->getIterator().getReverse(),
+ LiveOut);
+ LiveOut.remove(Inst);
+ Out.insert(LiveOut.begin(), LiveOut.end());
+}
+
+static void recomputeLiveInValues(GCPtrLivenessData &RevisedLivenessData,
+ CallBase *Call,
+ PartiallyConstructedSafepointRecord &Info) {
+ StatepointLiveSetTy Updated;
+ findLiveSetAtInst(Call, RevisedLivenessData, Updated);
+
+ // We may have base pointers which are now live that weren't before. We need
+ // to update the PointerToBase structure to reflect this.
+ for (auto V : Updated)
+ if (Info.PointerToBase.insert({V, V}).second) {
+ assert(isKnownBaseResult(V) &&
+ "Can't find base for unexpected live value!");
+ continue;
+ }
+
+#ifndef NDEBUG
+ for (auto V : Updated)
+ assert(Info.PointerToBase.count(V) &&
+ "Must be able to find base for live value!");
+#endif
+
+ // Remove any stale base mappings - this can happen since our liveness is
+ // more precise than the one inherent in the base pointer analysis.
+ DenseSet<Value *> ToErase;
+ for (auto KVPair : Info.PointerToBase)
+ if (!Updated.count(KVPair.first))
+ ToErase.insert(KVPair.first);
+
+ for (auto *V : ToErase)
+ Info.PointerToBase.erase(V);
+
+#ifndef NDEBUG
+ for (auto KVPair : Info.PointerToBase)
+ assert(Updated.count(KVPair.first) && "record for non-live value");
+#endif
+
+ Info.LiveSet = Updated;
+}
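
The cleanup above collects the stale PointerToBase keys into ToErase first and only then erases them, rather than mutating the map while walking it. A standalone sketch of pruning a map down to an allowed key set in the same two steps:

#include <map>
#include <set>
#include <string>
#include <vector>

// Erase every entry of M whose key is not in Keep. The stale keys are
// collected first so the map is never modified while being iterated.
inline void pruneToKeys(std::map<std::string, std::string> &M,
                        const std::set<std::string> &Keep) {
  std::vector<std::string> ToErase;
  for (const auto &KV : M)
    if (!Keep.count(KV.first))
      ToErase.push_back(KV.first);

  for (const std::string &K : ToErase)
    M.erase(K);
}
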
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
index 8dba00e11b..8feed9e9eb 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SCCP.cpp
@@ -1,665 +1,665 @@
-//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements sparse conditional constant propagation and merging:
-//
-// Specifically, this:
-// * Assumes values are constant unless proven otherwise
-// * Assumes BasicBlocks are dead unless proven otherwise
-// * Proves values to be constant, and replaces them with constants
-// * Proves conditional branches to be unconditional
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SCCP.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+// * Assumes values are constant unless proven otherwise
+// * Assumes BasicBlocks are dead unless proven otherwise
+// * Proves values to be constant, and replaces them with constants
+// * Proves conditional branches to be unconditional
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SCCP.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueLattice.h"
-#include "llvm/Analysis/ValueLatticeUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueLattice.h"
+#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include <cassert>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "sccp"
-
-STATISTIC(NumInstRemoved, "Number of instructions removed");
-STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
-STATISTIC(NumInstReplaced,
- "Number of instructions replaced with (simpler) instruction");
-
-STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
-STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
-STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-STATISTIC(
- IPNumInstReplaced,
- "Number of instructions replaced with (simpler) instruction by IPSCCP");
-
-// The maximum number of range extensions allowed for operations requiring
-// widening.
-static const unsigned MaxNumRangeExtensions = 10;
-
-/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
-static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
- return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- MaxNumRangeExtensions);
-}
-namespace {
-
-// Helper to check if \p LV is either a constant or a constant
-// range with a single element. This should cover exactly the same cases as the
-// old ValueLatticeElement::isConstant() and is intended to be used in the
-// transition to ValueLatticeElement.
-bool isConstant(const ValueLatticeElement &LV) {
- return LV.isConstant() ||
- (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
-}
-
-// Helper to check if \p LV is either overdefined or a constant range with more
-// than a single element. This should cover exactly the same cases as the old
-// ValueLatticeElement::isOverdefined() and is intended to be used in the
-// transition to ValueLatticeElement.
-bool isOverdefined(const ValueLatticeElement &LV) {
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include <cassert>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "sccp"
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+STATISTIC(NumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction");
+
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+STATISTIC(
+ IPNumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction by IPSCCP");
+
+// The maximum number of range extensions allowed for operations requiring
+// widening.
+static const unsigned MaxNumRangeExtensions = 10;
+
+/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
+static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
+ return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ MaxNumRangeExtensions);
+}
+namespace {
+
+// Helper to check if \p LV is either a constant or a constant
+// range with a single element. This should cover exactly the same cases as the
+// old ValueLatticeElement::isConstant() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isConstant(const ValueLatticeElement &LV) {
+ return LV.isConstant() ||
+ (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
+}
+
+// Helper to check if \p LV is either overdefined or a constant range with more
+// than a single element. This should cover exactly the same cases as the old
+// ValueLatticeElement::isOverdefined() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isOverdefined(const ValueLatticeElement &LV) {
return !LV.isUnknownOrUndef() && !isConstant(LV);
-}
-
-//===----------------------------------------------------------------------===//
-//
-/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
-/// Constant Propagation.
-///
-class SCCPSolver : public InstVisitor<SCCPSolver> {
- const DataLayout &DL;
- std::function<const TargetLibraryInfo &(Function &)> GetTLI;
- SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
- DenseMap<Value *, ValueLatticeElement>
- ValueState; // The state each value is in.
-
- /// StructValueState - This maintains ValueState for values that have
- /// StructType, for example for formal arguments, calls, insertelement, etc.
- DenseMap<std::pair<Value *, unsigned>, ValueLatticeElement> StructValueState;
-
- /// GlobalValue - If we are tracking any values for the contents of a global
- /// variable, we keep a mapping from the constant accessor to the element of
- /// the global, to the currently known value. If the value becomes
- /// overdefined, its entry is simply removed from this map.
- DenseMap<GlobalVariable *, ValueLatticeElement> TrackedGlobals;
-
- /// TrackedRetVals - If we are tracking arguments into and the return
- /// value out of a function, it will have an entry in this map, indicating
- /// what the known return value for the function is.
- MapVector<Function *, ValueLatticeElement> TrackedRetVals;
-
- /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
- /// that return multiple values.
- MapVector<std::pair<Function *, unsigned>, ValueLatticeElement>
- TrackedMultipleRetVals;
-
- /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
- /// represented here for efficient lookup.
- SmallPtrSet<Function *, 16> MRVFunctionsTracked;
-
- /// MustTailFunctions - Each function here is a callee of non-removable
- /// musttail call site.
- SmallPtrSet<Function *, 16> MustTailCallees;
-
- /// TrackingIncomingArguments - This is the set of functions for whose
- /// arguments we make optimistic assumptions about and try to prove as
- /// constants.
- SmallPtrSet<Function *, 16> TrackingIncomingArguments;
-
- /// The reason for two worklists is that overdefined is the lowest state
- /// on the lattice, and moving things to overdefined as fast as possible
- /// makes SCCP converge much faster.
- ///
- /// By having a separate worklist, we accomplish this because everything
- /// possibly overdefined will become overdefined at the soonest possible
- /// point.
- SmallVector<Value *, 64> OverdefinedInstWorkList;
- SmallVector<Value *, 64> InstWorkList;
-
- // The BasicBlock work list
- SmallVector<BasicBlock *, 64> BBWorkList;
-
- /// KnownFeasibleEdges - Entries in this set are edges which have already had
- /// PHI nodes retriggered.
- using Edge = std::pair<BasicBlock *, BasicBlock *>;
- DenseSet<Edge> KnownFeasibleEdges;
-
- DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
- DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
-
- LLVMContext &Ctx;
-
-public:
- void addAnalysis(Function &F, AnalysisResultsForFn A) {
- AnalysisResults.insert({&F, std::move(A)});
- }
-
- const PredicateBase *getPredicateInfoFor(Instruction *I) {
- auto A = AnalysisResults.find(I->getParent()->getParent());
- if (A == AnalysisResults.end())
- return nullptr;
- return A->second.PredInfo->getPredicateInfoFor(I);
- }
-
- DomTreeUpdater getDTU(Function &F) {
- auto A = AnalysisResults.find(&F);
- assert(A != AnalysisResults.end() && "Need analysis results for function.");
- return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
- }
-
- SCCPSolver(const DataLayout &DL,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- LLVMContext &Ctx)
- : DL(DL), GetTLI(std::move(GetTLI)), Ctx(Ctx) {}
-
- /// MarkBlockExecutable - This method can be used by clients to mark all of
- /// the blocks that are known to be intrinsically live in the processed unit.
- ///
- /// This returns true if the block was not considered live before.
- bool MarkBlockExecutable(BasicBlock *BB) {
- if (!BBExecutable.insert(BB).second)
- return false;
- LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
- BBWorkList.push_back(BB); // Add the block to the work list!
- return true;
- }
-
- /// TrackValueOfGlobalVariable - Clients can use this method to
- /// inform the SCCPSolver that it should track loads and stores to the
- /// specified global variable if it can. This is only legal to call if
- /// performing Interprocedural SCCP.
- void TrackValueOfGlobalVariable(GlobalVariable *GV) {
- // We only track the contents of scalar globals.
- if (GV->getValueType()->isSingleValueType()) {
- ValueLatticeElement &IV = TrackedGlobals[GV];
- if (!isa<UndefValue>(GV->getInitializer()))
- IV.markConstant(GV->getInitializer());
- }
- }
-
- /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
- /// and out of the specified function (which cannot have its address taken),
- /// this method must be called.
- void AddTrackedFunction(Function *F) {
- // Add an entry, F -> undef.
- if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
- MRVFunctionsTracked.insert(F);
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- TrackedMultipleRetVals.insert(
- std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
+}
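
The two helpers above describe positions on the SCCP value lattice: a value is unknown, a constant (possibly a single-element range), or overdefined. A deliberately simplified standalone model of such a three-level lattice and its merge rule (a toy over plain ints, not ValueLatticeElement):

// A toy SCCP-style lattice over int constants:
//   Unknown  <  Constant(c)  <  Overdefined
struct ToyLattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  int C = 0; // meaningful only when K == Constant

  // Merge another element into this one; returns true if this element changed.
  bool mergeIn(const ToyLattice &Other) {
    if (Other.K == Unknown || K == Overdefined)
      return false;                 // nothing new to learn
    if (K == Unknown) {             // adopt Other wholesale
      *this = Other;
      return true;
    }
    // Here K == Constant and Other is Constant or Overdefined.
    if (Other.K == Constant && Other.C == C)
      return false;                 // agreeing constants stay constant
    K = Overdefined;                // disagreement falls to Overdefined
    return true;
  }
};
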
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+ const DataLayout &DL;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
+ SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
+ DenseMap<Value *, ValueLatticeElement>
+ ValueState; // The state each value is in.
+
+ /// StructValueState - This maintains ValueState for values that have
+ /// StructType, for example for formal arguments, calls, insertelement, etc.
+ DenseMap<std::pair<Value *, unsigned>, ValueLatticeElement> StructValueState;
+
+ /// GlobalValue - If we are tracking any values for the contents of a global
+ /// variable, we keep a mapping from the constant accessor to the element of
+ /// the global, to the currently known value. If the value becomes
+ /// overdefined, its entry is simply removed from this map.
+ DenseMap<GlobalVariable *, ValueLatticeElement> TrackedGlobals;
+
+ /// TrackedRetVals - If we are tracking arguments into and the return
+ /// value out of a function, it will have an entry in this map, indicating
+ /// what the known return value for the function is.
+ MapVector<Function *, ValueLatticeElement> TrackedRetVals;
+
+ /// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
+ /// that return multiple values.
+ MapVector<std::pair<Function *, unsigned>, ValueLatticeElement>
+ TrackedMultipleRetVals;
+
+ /// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
+ /// represented here for efficient lookup.
+ SmallPtrSet<Function *, 16> MRVFunctionsTracked;
+
+ /// MustTailFunctions - Each function here is a callee of non-removable
+ /// musttail call site.
+ SmallPtrSet<Function *, 16> MustTailCallees;
+
+ /// TrackingIncomingArguments - This is the set of functions for whose
+ /// arguments we make optimistic assumptions about and try to prove as
+ /// constants.
+ SmallPtrSet<Function *, 16> TrackingIncomingArguments;
+
+ /// The reason for two worklists is that overdefined is the lowest state
+ /// on the lattice, and moving things to overdefined as fast as possible
+ /// makes SCCP converge much faster.
+ ///
+ /// By having a separate worklist, we accomplish this because everything
+ /// possibly overdefined will become overdefined at the soonest possible
+ /// point.
+ SmallVector<Value *, 64> OverdefinedInstWorkList;
+ SmallVector<Value *, 64> InstWorkList;
+
+ // The BasicBlock work list
+ SmallVector<BasicBlock *, 64> BBWorkList;
+
+ /// KnownFeasibleEdges - Entries in this set are edges which have already had
+ /// PHI nodes retriggered.
+ using Edge = std::pair<BasicBlock *, BasicBlock *>;
+ DenseSet<Edge> KnownFeasibleEdges;
+
+ DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
+ DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
+
+ LLVMContext &Ctx;
+
+public:
+ void addAnalysis(Function &F, AnalysisResultsForFn A) {
+ AnalysisResults.insert({&F, std::move(A)});
+ }
+
+ const PredicateBase *getPredicateInfoFor(Instruction *I) {
+ auto A = AnalysisResults.find(I->getParent()->getParent());
+ if (A == AnalysisResults.end())
+ return nullptr;
+ return A->second.PredInfo->getPredicateInfoFor(I);
+ }
+
+ DomTreeUpdater getDTU(Function &F) {
+ auto A = AnalysisResults.find(&F);
+ assert(A != AnalysisResults.end() && "Need analysis results for function.");
+ return {A->second.DT, A->second.PDT, DomTreeUpdater::UpdateStrategy::Lazy};
+ }
+
+ SCCPSolver(const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ LLVMContext &Ctx)
+ : DL(DL), GetTLI(std::move(GetTLI)), Ctx(Ctx) {}
+
+ /// MarkBlockExecutable - This method can be used by clients to mark all of
+ /// the blocks that are known to be intrinsically live in the processed unit.
+ ///
+ /// This returns true if the block was not considered live before.
+ bool MarkBlockExecutable(BasicBlock *BB) {
+ if (!BBExecutable.insert(BB).second)
+ return false;
+ LLVM_DEBUG(dbgs() << "Marking Block Executable: " << BB->getName() << '\n');
+ BBWorkList.push_back(BB); // Add the block to the work list!
+ return true;
+ }
+
+ /// TrackValueOfGlobalVariable - Clients can use this method to
+ /// inform the SCCPSolver that it should track loads and stores to the
+ /// specified global variable if it can. This is only legal to call if
+ /// performing Interprocedural SCCP.
+ void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+ // We only track the contents of scalar globals.
+ if (GV->getValueType()->isSingleValueType()) {
+ ValueLatticeElement &IV = TrackedGlobals[GV];
+ if (!isa<UndefValue>(GV->getInitializer()))
+ IV.markConstant(GV->getInitializer());
+ }
+ }
+
+ /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+ /// and out of the specified function (which cannot have its address taken),
+ /// this method must be called.
+ void AddTrackedFunction(Function *F) {
+ // Add an entry, F -> undef.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ MRVFunctionsTracked.insert(F);
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ TrackedMultipleRetVals.insert(
+ std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
} else if (!F->getReturnType()->isVoidTy())
- TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
- }
-
- /// AddMustTailCallee - If the SCCP solver finds that this function is called
- /// from non-removable musttail call site.
- void AddMustTailCallee(Function *F) {
- MustTailCallees.insert(F);
- }
-
- /// Returns true if the given function is called from non-removable musttail
- /// call site.
- bool isMustTailCallee(Function *F) {
- return MustTailCallees.count(F);
- }
-
- void AddArgumentTrackedFunction(Function *F) {
- TrackingIncomingArguments.insert(F);
- }
-
- /// Returns true if the given function is in the solver's set of
- /// argument-tracked functions.
- bool isArgumentTrackedFunction(Function *F) {
- return TrackingIncomingArguments.count(F);
- }
-
- /// Solve - Solve for constants and executable blocks.
- void Solve();
-
- /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
- /// that branches on undef values cannot reach any of their successors.
- /// However, this is not a safe assumption. After we solve dataflow, this
- /// method should be used to handle this. If this returns true, the solver
- /// should be rerun.
- bool ResolvedUndefsIn(Function &F);
-
- bool isBlockExecutable(BasicBlock *BB) const {
- return BBExecutable.count(BB);
- }
-
- // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
- // block to the 'To' basic block is currently feasible.
+ TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
+ }
+
+ /// AddMustTailCallee - If the SCCP solver finds that this function is called
+ /// from non-removable musttail call site.
+ void AddMustTailCallee(Function *F) {
+ MustTailCallees.insert(F);
+ }
+
+ /// Returns true if the given function is called from non-removable musttail
+ /// call site.
+ bool isMustTailCallee(Function *F) {
+ return MustTailCallees.count(F);
+ }
+
+ void AddArgumentTrackedFunction(Function *F) {
+ TrackingIncomingArguments.insert(F);
+ }
+
+ /// Returns true if the given function is in the solver's set of
+ /// argument-tracked functions.
+ bool isArgumentTrackedFunction(Function *F) {
+ return TrackingIncomingArguments.count(F);
+ }
+
+ /// Solve - Solve for constants and executable blocks.
+ void Solve();
+
+ /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+ /// that branches on undef values cannot reach any of their successors.
+ /// However, this is not a safe assumption. After we solve dataflow, this
+ /// method should be used to handle this. If this returns true, the solver
+ /// should be rerun.
+ bool ResolvedUndefsIn(Function &F);
+
+ bool isBlockExecutable(BasicBlock *BB) const {
+ return BBExecutable.count(BB);
+ }
+
+ // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+ // block to the 'To' basic block is currently feasible.
bool isEdgeFeasible(BasicBlock *From, BasicBlock *To) const;
-
- std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
- std::vector<ValueLatticeElement> StructValues;
- auto *STy = dyn_cast<StructType>(V->getType());
- assert(STy && "getStructLatticeValueFor() can be called only on structs");
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- auto I = StructValueState.find(std::make_pair(V, i));
- assert(I != StructValueState.end() && "Value not in valuemap!");
- StructValues.push_back(I->second);
- }
- return StructValues;
- }
-
- void removeLatticeValueFor(Value *V) { ValueState.erase(V); }
-
- const ValueLatticeElement &getLatticeValueFor(Value *V) const {
- assert(!V->getType()->isStructTy() &&
- "Should use getStructLatticeValueFor");
- DenseMap<Value *, ValueLatticeElement>::const_iterator I =
- ValueState.find(V);
- assert(I != ValueState.end() &&
- "V not found in ValueState nor Paramstate map!");
- return I->second;
- }
-
- /// getTrackedRetVals - Get the inferred return value map.
- const MapVector<Function *, ValueLatticeElement> &getTrackedRetVals() {
- return TrackedRetVals;
- }
-
- /// getTrackedGlobals - Get and return the set of inferred initializers for
- /// global variables.
- const DenseMap<GlobalVariable *, ValueLatticeElement> &getTrackedGlobals() {
- return TrackedGlobals;
- }
-
- /// getMRVFunctionsTracked - Get the set of functions which return multiple
- /// values tracked by the pass.
- const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
- return MRVFunctionsTracked;
- }
-
- /// getMustTailCallees - Get the set of functions which are called
- /// from non-removable musttail call sites.
- const SmallPtrSet<Function *, 16> getMustTailCallees() {
- return MustTailCallees;
- }
-
- /// markOverdefined - Mark the specified value overdefined. This
- /// works with both scalars and structs.
- void markOverdefined(Value *V) {
- if (auto *STy = dyn_cast<StructType>(V->getType()))
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- markOverdefined(getStructValueState(V, i), V);
- else
- markOverdefined(ValueState[V], V);
- }
-
- // isStructLatticeConstant - Return true if all the lattice values
- // corresponding to elements of the structure are constants,
- // false otherwise.
- bool isStructLatticeConstant(Function *F, StructType *STy) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
- assert(It != TrackedMultipleRetVals.end());
- ValueLatticeElement LV = It->second;
- if (!isConstant(LV))
- return false;
- }
- return true;
- }
-
- /// Helper to return a Constant if \p LV is either a constant or a constant
- /// range with a single element.
- Constant *getConstant(const ValueLatticeElement &LV) const {
- if (LV.isConstant())
- return LV.getConstant();
-
- if (LV.isConstantRange()) {
- auto &CR = LV.getConstantRange();
- if (CR.getSingleElement())
- return ConstantInt::get(Ctx, *CR.getSingleElement());
- }
- return nullptr;
- }
-
-private:
- ConstantInt *getConstantInt(const ValueLatticeElement &IV) const {
- return dyn_cast_or_null<ConstantInt>(getConstant(IV));
- }
-
- // pushToWorkList - Helper for markConstant/markOverdefined
- void pushToWorkList(ValueLatticeElement &IV, Value *V) {
- if (IV.isOverdefined())
- return OverdefinedInstWorkList.push_back(V);
- InstWorkList.push_back(V);
- }
-
- // Helper to push \p V to the worklist, after updating it to \p IV. Also
- // prints a debug message with the updated value.
- void pushToWorkListMsg(ValueLatticeElement &IV, Value *V) {
- LLVM_DEBUG(dbgs() << "updated " << IV << ": " << *V << '\n');
- pushToWorkList(IV, V);
- }
-
- // markConstant - Make a value be marked as "constant". If the value
- // is not already a constant, add it to the instruction work list so that
- // the users of the instruction are updated later.
- bool markConstant(ValueLatticeElement &IV, Value *V, Constant *C,
- bool MayIncludeUndef = false) {
- if (!IV.markConstant(C, MayIncludeUndef))
- return false;
- LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
- pushToWorkList(IV, V);
- return true;
- }
-
- bool markConstant(Value *V, Constant *C) {
- assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- return markConstant(ValueState[V], V, C);
- }
-
- // markOverdefined - Make a value be marked as "overdefined". If the
- // value is not already overdefined, add it to the overdefined instruction
- // work list so that the users of the instruction are updated later.
- bool markOverdefined(ValueLatticeElement &IV, Value *V) {
- if (!IV.markOverdefined()) return false;
-
- LLVM_DEBUG(dbgs() << "markOverdefined: ";
- if (auto *F = dyn_cast<Function>(V)) dbgs()
- << "Function '" << F->getName() << "'\n";
- else dbgs() << *V << '\n');
- // Only instructions go on the work list
- pushToWorkList(IV, V);
- return true;
- }
-
- /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
- /// changes.
- bool mergeInValue(ValueLatticeElement &IV, Value *V,
- ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- if (IV.mergeIn(MergeWithV, Opts)) {
- pushToWorkList(IV, V);
- LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
- << IV << "\n");
- return true;
- }
- return false;
- }
-
- bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
- ValueLatticeElement::MergeOptions Opts = {
- /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
- assert(!V->getType()->isStructTy() &&
- "non-structs should use markConstant");
- return mergeInValue(ValueState[V], V, MergeWithV, Opts);
- }
-
- /// getValueState - Return the ValueLatticeElement object that corresponds to
- /// the value. This function handles the case when the value hasn't been seen
- /// yet by properly seeding constants etc.
- ValueLatticeElement &getValueState(Value *V) {
- assert(!V->getType()->isStructTy() && "Should use getStructValueState");
-
- auto I = ValueState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = I.first->second;
-
- if (!I.second)
- return LV; // Common case, already in the map.
-
- if (auto *C = dyn_cast<Constant>(V))
- LV.markConstant(C); // Constants are constant
-
- // All others are unknown by default.
- return LV;
- }
-
- /// getStructValueState - Return the ValueLatticeElement object that
- /// corresponds to the value/field pair. This function handles the case when
- /// the value hasn't been seen yet by properly seeding constants etc.
- ValueLatticeElement &getStructValueState(Value *V, unsigned i) {
- assert(V->getType()->isStructTy() && "Should use getValueState");
- assert(i < cast<StructType>(V->getType())->getNumElements() &&
- "Invalid element #");
-
- auto I = StructValueState.insert(
- std::make_pair(std::make_pair(V, i), ValueLatticeElement()));
- ValueLatticeElement &LV = I.first->second;
-
- if (!I.second)
- return LV; // Common case, already in the map.
-
- if (auto *C = dyn_cast<Constant>(V)) {
- Constant *Elt = C->getAggregateElement(i);
-
- if (!Elt)
- LV.markOverdefined(); // Unknown sort of constant.
- else if (isa<UndefValue>(Elt))
- ; // Undef values remain unknown.
- else
- LV.markConstant(Elt); // Constants are constant.
- }
-
- // All others are underdefined by default.
- return LV;
- }
-
- /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
- /// work list if it is not already executable.
- bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
- if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
- return false; // This edge is already known to be executable!
-
- if (!MarkBlockExecutable(Dest)) {
- // If the destination is already executable, we just made an *edge*
- // feasible that wasn't before. Revisit the PHI nodes in the block
- // because they have potentially new operands.
- LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
- << " -> " << Dest->getName() << '\n');
-
- for (PHINode &PN : Dest->phis())
- visitPHINode(PN);
- }
- return true;
- }
-
- // getFeasibleSuccessors - Return a vector of booleans to indicate which
- // successors are reachable from a given terminator instruction.
- void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
-
- // OperandChangedState - This method is invoked on all of the users of an
- // instruction that was just changed state somehow. Based on this
- // information, we need to update the specified user of this instruction.
- void OperandChangedState(Instruction *I) {
- if (BBExecutable.count(I->getParent())) // Inst is executable?
- visit(*I);
- }
-
- // Add U as additional user of V.
- void addAdditionalUser(Value *V, User *U) {
- auto Iter = AdditionalUsers.insert({V, {}});
- Iter.first->second.insert(U);
- }
-
- // Mark I's users as changed, including AdditionalUsers.
- void markUsersAsChanged(Value *I) {
- // Functions include their arguments in the use-list. Changed function
- // values mean that the result of the function changed. We only need to
- // update the call sites with the new function result and do not have to
- // propagate the call arguments.
- if (isa<Function>(I)) {
- for (User *U : I->users()) {
- if (auto *CB = dyn_cast<CallBase>(U))
- handleCallResult(*CB);
- }
- } else {
- for (User *U : I->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- OperandChangedState(UI);
- }
-
- auto Iter = AdditionalUsers.find(I);
- if (Iter != AdditionalUsers.end()) {
+
+ std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
+ std::vector<ValueLatticeElement> StructValues;
+ auto *STy = dyn_cast<StructType>(V->getType());
+ assert(STy && "getStructLatticeValueFor() can be called only on structs");
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ auto I = StructValueState.find(std::make_pair(V, i));
+ assert(I != StructValueState.end() && "Value not in valuemap!");
+ StructValues.push_back(I->second);
+ }
+ return StructValues;
+ }
+
+ void removeLatticeValueFor(Value *V) { ValueState.erase(V); }
+
+ const ValueLatticeElement &getLatticeValueFor(Value *V) const {
+ assert(!V->getType()->isStructTy() &&
+ "Should use getStructLatticeValueFor");
+ DenseMap<Value *, ValueLatticeElement>::const_iterator I =
+ ValueState.find(V);
+ assert(I != ValueState.end() &&
+ "V not found in ValueState nor Paramstate map!");
+ return I->second;
+ }
+
+ /// getTrackedRetVals - Get the inferred return value map.
+ const MapVector<Function *, ValueLatticeElement> &getTrackedRetVals() {
+ return TrackedRetVals;
+ }
+
+ /// getTrackedGlobals - Get and return the set of inferred initializers for
+ /// global variables.
+ const DenseMap<GlobalVariable *, ValueLatticeElement> &getTrackedGlobals() {
+ return TrackedGlobals;
+ }
+
+ /// getMRVFunctionsTracked - Get the set of functions which return multiple
+ /// values tracked by the pass.
+ const SmallPtrSet<Function *, 16> getMRVFunctionsTracked() {
+ return MRVFunctionsTracked;
+ }
+
+ /// getMustTailCallees - Get the set of functions which are called
+ /// from non-removable musttail call sites.
+ const SmallPtrSet<Function *, 16> getMustTailCallees() {
+ return MustTailCallees;
+ }
+
+ /// markOverdefined - Mark the specified value overdefined. This
+ /// works with both scalars and structs.
+ void markOverdefined(Value *V) {
+ if (auto *STy = dyn_cast<StructType>(V->getType()))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ markOverdefined(getStructValueState(V, i), V);
+ else
+ markOverdefined(ValueState[V], V);
+ }
+
+ // isStructLatticeConstant - Return true if all the lattice values
+ // corresponding to elements of the structure are constants,
+ // false otherwise.
+ bool isStructLatticeConstant(Function *F, StructType *STy) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
+ assert(It != TrackedMultipleRetVals.end());
+ ValueLatticeElement LV = It->second;
+ if (!isConstant(LV))
+ return false;
+ }
+ return true;
+ }
+
+ /// Helper to return a Constant if \p LV is either a constant or a constant
+ /// range with a single element.
+ Constant *getConstant(const ValueLatticeElement &LV) const {
+ if (LV.isConstant())
+ return LV.getConstant();
+
+ if (LV.isConstantRange()) {
+ auto &CR = LV.getConstantRange();
+ if (CR.getSingleElement())
+ return ConstantInt::get(Ctx, *CR.getSingleElement());
+ }
+ return nullptr;
+ }
+
+private:
+ ConstantInt *getConstantInt(const ValueLatticeElement &IV) const {
+ return dyn_cast_or_null<ConstantInt>(getConstant(IV));
+ }
+
+ // pushToWorkList - Helper for markConstant/markOverdefined
+ void pushToWorkList(ValueLatticeElement &IV, Value *V) {
+ if (IV.isOverdefined())
+ return OverdefinedInstWorkList.push_back(V);
+ InstWorkList.push_back(V);
+ }
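
pushToWorkList above keeps two queues so that values which have already hit overdefined are separated from values that might still improve, and a solve loop can drain the overdefined work first instead of repeatedly revisiting users with weaker information. A simplified sketch of that scheme with toy types (the names are illustrative, not the solver's):

#include <deque>
#include <iostream>
#include <string>

// Toy "value" whose lattice state is reduced to a single flag.
struct Value { std::string Name; bool Overdefined = false; };

std::deque<Value *> InstWork;        // values that may still improve
std::deque<Value *> OverdefinedWork; // values whose state is final

void pushToWorkList(Value *V) {
  (V->Overdefined ? OverdefinedWork : InstWork).push_back(V);
}

void drain() {
  // Prefer overdefined work: those states cannot change any more, so
  // propagating them early saves revisiting their users later.
  while (!OverdefinedWork.empty() || !InstWork.empty()) {
    std::deque<Value *> &Q =
        !OverdefinedWork.empty() ? OverdefinedWork : InstWork;
    Value *V = Q.front();
    Q.pop_front();
    std::cout << "visit users of " << V->Name << "\n";
  }
}

int main() {
  Value A{"a"}, B{"b", true};
  pushToWorkList(&A);
  pushToWorkList(&B);
  drain(); // processes b (overdefined) before a
}
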
+
+ // Helper to push \p V to the worklist, after updating it to \p IV. Also
+ // prints a debug message with the updated value.
+ void pushToWorkListMsg(ValueLatticeElement &IV, Value *V) {
+ LLVM_DEBUG(dbgs() << "updated " << IV << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
+ // markConstant - Make a value be marked as "constant". If the value
+ // is not already a constant, add it to the instruction work list so that
+ // the users of the instruction are updated later.
+ bool markConstant(ValueLatticeElement &IV, Value *V, Constant *C,
+ bool MayIncludeUndef = false) {
+ if (!IV.markConstant(C, MayIncludeUndef))
+ return false;
+ LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ return true;
+ }
+
+ bool markConstant(Value *V, Constant *C) {
+ assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
+ return markConstant(ValueState[V], V, C);
+ }
+
+ // markOverdefined - Make a value be marked as "overdefined". If the
+ // value is not already overdefined, add it to the overdefined instruction
+ // work list so that the users of the instruction are updated later.
+ bool markOverdefined(ValueLatticeElement &IV, Value *V) {
+ if (!IV.markOverdefined()) return false;
+
+ LLVM_DEBUG(dbgs() << "markOverdefined: ";
+ if (auto *F = dyn_cast<Function>(V)) dbgs()
+ << "Function '" << F->getName() << "'\n";
+ else dbgs() << *V << '\n');
+ // Only instructions go on the work list
+ pushToWorkList(IV, V);
+ return true;
+ }
+
+ /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
+ /// changes.
+ bool mergeInValue(ValueLatticeElement &IV, Value *V,
+ ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
+ if (IV.mergeIn(MergeWithV, Opts)) {
+ pushToWorkList(IV, V);
+ LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
+ << IV << "\n");
+ return true;
+ }
+ return false;
+ }
+
+ bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
+ assert(!V->getType()->isStructTy() &&
+ "non-structs should use markConstant");
+ return mergeInValue(ValueState[V], V, MergeWithV, Opts);
+ }
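
markConstant, markOverdefined and mergeInValue above only ever move a value up a small lattice (unknown, then constant, then overdefined) and report whether the state actually changed, which is the caller's cue to push the value back onto the worklist. A toy three-state lattice with that monotonic mergeIn behaviour, ignoring constant ranges and undef:

#include <cassert>
#include <optional>

// Three-state toy lattice: Unknown < Constant(c) < Overdefined.
struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;

  bool markConstant(int V) {
    if (K == Overdefined) return false;
    if (K == Constant) {
      if (*C == V) return false;                 // same constant: no change
      K = Overdefined; C.reset(); return true;   // conflicting constants
    }
    K = Constant; C = V; return true;
  }
  bool markOverdefined() {
    if (K == Overdefined) return false;
    K = Overdefined; C.reset(); return true;
  }
  // Merge another lattice value in; returning true is the caller's cue to
  // push the value back onto the worklist.
  bool mergeIn(const Lattice &O) {
    if (O.K == Unknown) return false;
    if (O.K == Overdefined) return markOverdefined();
    return markConstant(*O.C);
  }
};

int main() {
  Lattice X, Five, Seven;
  Five.markConstant(5);
  Seven.markConstant(7);
  bool Changed = X.mergeIn(Five);   // unknown -> constant 5: changed
  Changed = X.mergeIn(Five);        // same constant again: no change
  Changed = X.mergeIn(Seven);       // conflicting constants -> overdefined
  assert(X.K == Lattice::Overdefined);
  (void)Changed;
}
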
+
+ /// getValueState - Return the ValueLatticeElement object that corresponds to
+ /// the value. This function handles the case when the value hasn't been seen
+ /// yet by properly seeding constants etc.
+ ValueLatticeElement &getValueState(Value *V) {
+ assert(!V->getType()->isStructTy() && "Should use getStructValueState");
+
+ auto I = ValueState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V))
+ LV.markConstant(C); // Constants are constant
+
+ // All others are unknown by default.
+ return LV;
+ }
+
+ /// getStructValueState - Return the ValueLatticeElement object that
+ /// corresponds to the value/field pair. This function handles the case when
+ /// the value hasn't been seen yet by properly seeding constants etc.
+ ValueLatticeElement &getStructValueState(Value *V, unsigned i) {
+ assert(V->getType()->isStructTy() && "Should use getValueState");
+ assert(i < cast<StructType>(V->getType())->getNumElements() &&
+ "Invalid element #");
+
+ auto I = StructValueState.insert(
+ std::make_pair(std::make_pair(V, i), ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
+
+ if (!I.second)
+ return LV; // Common case, already in the map.
+
+ if (auto *C = dyn_cast<Constant>(V)) {
+ Constant *Elt = C->getAggregateElement(i);
+
+ if (!Elt)
+ LV.markOverdefined(); // Unknown sort of constant.
+ else if (isa<UndefValue>(Elt))
+ ; // Undef values remain unknown.
+ else
+ LV.markConstant(Elt); // Constants are constant.
+ }
+
+ // All others are underdefined by default.
+ return LV;
+ }
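
getValueState and getStructValueState above create a lattice entry lazily on first lookup and seed it from the IR, so plain constants start out in the Constant state rather than Unknown. A condensed sketch of that lazy seeding with an ordinary std::map and made-up Value/Lattice types:

#include <map>
#include <optional>
#include <string>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
};

// Toy "IR value": it may or may not be a literal constant.
struct Value { std::string Name; std::optional<int> Literal; };

std::map<const Value *, Lattice> ValueState;

Lattice &getValueState(const Value *V) {
  auto [It, Inserted] = ValueState.try_emplace(V);
  Lattice &LV = It->second;
  if (!Inserted)
    return LV;                       // common case: already in the map
  if (V->Literal) {                  // seed literals as Constant on first use
    LV.K = Lattice::Constant;
    LV.C = *V->Literal;
  }
  return LV;                         // everything else starts out Unknown
}

int main() {
  Value C42{"c42", 42}, X{"x", std::nullopt};
  Lattice &A = getValueState(&C42);  // seeded as the constant 42
  Lattice &B = getValueState(&X);    // Unknown until something merges in
  (void)A; (void)B;
}
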
+
+ /// markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+ /// work list if it is not already executable.
+ bool markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+ if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+ return false; // This edge is already known to be executable!
+
+ if (!MarkBlockExecutable(Dest)) {
+ // If the destination is already executable, we just made an *edge*
+ // feasible that wasn't before. Revisit the PHI nodes in the block
+ // because they have potentially new operands.
+ LLVM_DEBUG(dbgs() << "Marking Edge Executable: " << Source->getName()
+ << " -> " << Dest->getName() << '\n');
+
+ for (PHINode &PN : Dest->phis())
+ visitPHINode(PN);
+ }
+ return true;
+ }
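
markEdgeExecutable above records each CFG edge as feasible at most once; when the destination block was already executable, only its PHI nodes are revisited, because a newly feasible edge can change nothing else in that block. A compact sketch of the same bookkeeping with plain containers and hypothetical block types:

#include <iostream>
#include <set>
#include <utility>
#include <vector>

struct Block { const char *Name; std::vector<int> PhiIds; };
using Edge = std::pair<const Block *, const Block *>;

std::set<Edge> KnownFeasibleEdges;
std::set<const Block *> Executable;

void revisitPhi(int Id) { std::cout << "revisit phi #" << Id << "\n"; }

bool markBlockExecutable(const Block *B) { return Executable.insert(B).second; }

bool markEdgeExecutable(const Block *Src, const Block *Dst) {
  if (!KnownFeasibleEdges.insert({Src, Dst}).second)
    return false;                 // edge already known feasible
  if (!markBlockExecutable(Dst))  // block was live already: only a new *edge*
    for (int Id : Dst->PhiIds)    // became feasible, so re-evaluate its phis
      revisitPhi(Id);
  return true;
}

int main() {
  Block A{"a", {}}, B{"b", {1, 2}};
  markBlockExecutable(&B);        // pretend b was reached some other way
  markEdgeExecutable(&A, &B);     // revisits phi #1 and #2
}
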
+
+ // getFeasibleSuccessors - Return a vector of booleans to indicate which
+ // successors are reachable from a given terminator instruction.
+ void getFeasibleSuccessors(Instruction &TI, SmallVectorImpl<bool> &Succs);
+
+ // OperandChangedState - This method is invoked on all of the users of an
+ // instruction that was just changed state somehow. Based on this
+ // information, we need to update the specified user of this instruction.
+ void OperandChangedState(Instruction *I) {
+ if (BBExecutable.count(I->getParent())) // Inst is executable?
+ visit(*I);
+ }
+
+ // Add U as additional user of V.
+ void addAdditionalUser(Value *V, User *U) {
+ auto Iter = AdditionalUsers.insert({V, {}});
+ Iter.first->second.insert(U);
+ }
+
+ // Mark I's users as changed, including AdditionalUsers.
+ void markUsersAsChanged(Value *I) {
+ // Functions include their arguments in the use-list. Changed function
+ // values mean that the result of the function changed. We only need to
+ // update the call sites with the new function result and do not have to
+ // propagate the call arguments.
+ if (isa<Function>(I)) {
+ for (User *U : I->users()) {
+ if (auto *CB = dyn_cast<CallBase>(U))
+ handleCallResult(*CB);
+ }
+ } else {
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
+
+ auto Iter = AdditionalUsers.find(I);
+ if (Iter != AdditionalUsers.end()) {
// Copy additional users before notifying them of changes, because new
// users may be added, potentially invalidating the iterator.
SmallVector<Instruction *, 2> ToNotify;
- for (User *U : Iter->second)
- if (auto *UI = dyn_cast<Instruction>(U))
+ for (User *U : Iter->second)
+ if (auto *UI = dyn_cast<Instruction>(U))
ToNotify.push_back(UI);
for (Instruction *UI : ToNotify)
OperandChangedState(UI);
- }
- }
- void handleCallOverdefined(CallBase &CB);
- void handleCallResult(CallBase &CB);
- void handleCallArguments(CallBase &CB);
-
-private:
- friend class InstVisitor<SCCPSolver>;
-
- // visit implementations - Something changed in this instruction. Either an
- // operand made a transition, or the instruction is newly executable. Change
- // the value type of I to reflect these changes if appropriate.
- void visitPHINode(PHINode &I);
-
- // Terminators
-
- void visitReturnInst(ReturnInst &I);
- void visitTerminator(Instruction &TI);
-
- void visitCastInst(CastInst &I);
- void visitSelectInst(SelectInst &I);
- void visitUnaryOperator(Instruction &I);
- void visitBinaryOperator(Instruction &I);
- void visitCmpInst(CmpInst &I);
- void visitExtractValueInst(ExtractValueInst &EVI);
- void visitInsertValueInst(InsertValueInst &IVI);
-
- void visitCatchSwitchInst(CatchSwitchInst &CPI) {
- markOverdefined(&CPI);
- visitTerminator(CPI);
- }
-
- // Instructions that cannot be folded away.
-
- void visitStoreInst (StoreInst &I);
- void visitLoadInst (LoadInst &I);
- void visitGetElementPtrInst(GetElementPtrInst &I);
-
- void visitCallInst (CallInst &I) {
- visitCallBase(I);
- }
-
- void visitInvokeInst (InvokeInst &II) {
- visitCallBase(II);
- visitTerminator(II);
- }
-
- void visitCallBrInst (CallBrInst &CBI) {
- visitCallBase(CBI);
- visitTerminator(CBI);
- }
-
- void visitCallBase (CallBase &CB);
- void visitResumeInst (ResumeInst &I) { /*returns void*/ }
- void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
- void visitFenceInst (FenceInst &I) { /*returns void*/ }
-
- void visitInstruction(Instruction &I) {
- // All the instructions we don't do any special handling for just
- // go to overdefined.
- LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
- markOverdefined(&I);
- }
-};
-
-} // end anonymous namespace
-
-// getFeasibleSuccessors - Return a vector of booleans to indicate which
-// successors are reachable from a given terminator instruction.
-void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
- SmallVectorImpl<bool> &Succs) {
- Succs.resize(TI.getNumSuccessors());
- if (auto *BI = dyn_cast<BranchInst>(&TI)) {
- if (BI->isUnconditional()) {
- Succs[0] = true;
- return;
- }
-
- ValueLatticeElement BCValue = getValueState(BI->getCondition());
- ConstantInt *CI = getConstantInt(BCValue);
- if (!CI) {
- // Overdefined condition variables, and branches on unfoldable constant
- // conditions, mean the branch could go either way.
- if (!BCValue.isUnknownOrUndef())
- Succs[0] = Succs[1] = true;
- return;
- }
-
- // Constant condition variables mean the branch can only go a single way.
- Succs[CI->isZero()] = true;
- return;
- }
-
-  // Unwinding instructions' successors are always executable.
- if (TI.isExceptionalTerminator()) {
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
- if (!SI->getNumCases()) {
- Succs[0] = true;
- return;
- }
+ }
+ }
+ void handleCallOverdefined(CallBase &CB);
+ void handleCallResult(CallBase &CB);
+ void handleCallArguments(CallBase &CB);
+
+private:
+ friend class InstVisitor<SCCPSolver>;
+
+ // visit implementations - Something changed in this instruction. Either an
+ // operand made a transition, or the instruction is newly executable. Change
+ // the value type of I to reflect these changes if appropriate.
+ void visitPHINode(PHINode &I);
+
+ // Terminators
+
+ void visitReturnInst(ReturnInst &I);
+ void visitTerminator(Instruction &TI);
+
+ void visitCastInst(CastInst &I);
+ void visitSelectInst(SelectInst &I);
+ void visitUnaryOperator(Instruction &I);
+ void visitBinaryOperator(Instruction &I);
+ void visitCmpInst(CmpInst &I);
+ void visitExtractValueInst(ExtractValueInst &EVI);
+ void visitInsertValueInst(InsertValueInst &IVI);
+
+ void visitCatchSwitchInst(CatchSwitchInst &CPI) {
+ markOverdefined(&CPI);
+ visitTerminator(CPI);
+ }
+
+ // Instructions that cannot be folded away.
+
+ void visitStoreInst (StoreInst &I);
+ void visitLoadInst (LoadInst &I);
+ void visitGetElementPtrInst(GetElementPtrInst &I);
+
+ void visitCallInst (CallInst &I) {
+ visitCallBase(I);
+ }
+
+ void visitInvokeInst (InvokeInst &II) {
+ visitCallBase(II);
+ visitTerminator(II);
+ }
+
+ void visitCallBrInst (CallBrInst &CBI) {
+ visitCallBase(CBI);
+ visitTerminator(CBI);
+ }
+
+ void visitCallBase (CallBase &CB);
+ void visitResumeInst (ResumeInst &I) { /*returns void*/ }
+ void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
+ void visitFenceInst (FenceInst &I) { /*returns void*/ }
+
+ void visitInstruction(Instruction &I) {
+ // All the instructions we don't do any special handling for just
+ // go to overdefined.
+ LLVM_DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
+ markOverdefined(&I);
+ }
+};
+
+} // end anonymous namespace
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.
+void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
+ SmallVectorImpl<bool> &Succs) {
+ Succs.resize(TI.getNumSuccessors());
+ if (auto *BI = dyn_cast<BranchInst>(&TI)) {
+ if (BI->isUnconditional()) {
+ Succs[0] = true;
+ return;
+ }
+
+ ValueLatticeElement BCValue = getValueState(BI->getCondition());
+ ConstantInt *CI = getConstantInt(BCValue);
+ if (!CI) {
+ // Overdefined condition variables, and branches on unfoldable constant
+ // conditions, mean the branch could go either way.
+ if (!BCValue.isUnknownOrUndef())
+ Succs[0] = Succs[1] = true;
+ return;
+ }
+
+ // Constant condition variables mean the branch can only go a single way.
+ Succs[CI->isZero()] = true;
+ return;
+ }
+
+  // Unwinding instructions' successors are always executable.
+ if (TI.isExceptionalTerminator()) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(&TI)) {
+ if (!SI->getNumCases()) {
+ Succs[0] = true;
+ return;
+ }
const ValueLatticeElement &SCValue = getValueState(SI->getCondition());
if (ConstantInt *CI = getConstantInt(SCValue)) {
Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
return;
}
-
+
// TODO: Switch on undef is UB. Stop passing false once the rest of LLVM
// is ready.
if (SCValue.isConstantRange(/*UndefAllowed=*/false)) {
@@ -672,182 +672,182 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
// TODO: Determine whether default case is reachable.
Succs[SI->case_default()->getSuccessorIndex()] = true;
- return;
- }
-
+ return;
+ }
+
// Overdefined or unknown condition? All destinations are executable!
if (!SCValue.isUnknownOrUndef())
Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
-  // If this is an indirect branch whose address is a blockaddress, we mark
-  // the target as executable.
- if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
- // Casts are folded by visitCastInst.
- ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(getConstant(IBRValue));
- if (!Addr) { // Overdefined or unknown condition?
- // All destinations are executable!
- if (!IBRValue.isUnknownOrUndef())
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- BasicBlock* T = Addr->getBasicBlock();
- assert(Addr->getFunction() == T->getParent() &&
- "Block address of a different function ?");
- for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
- // This is the target.
- if (IBR->getDestination(i) == T) {
- Succs[i] = true;
- return;
- }
- }
-
- // If we didn't find our destination in the IBR successor list, then we
-    // have undefined behavior. It's ok to assume no successor is executable.
- return;
- }
-
- // In case of callbr, we pessimistically assume that all successors are
- // feasible.
- if (isa<CallBrInst>(&TI)) {
- Succs.assign(TI.getNumSuccessors(), true);
- return;
- }
-
- LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
- llvm_unreachable("SCCP: Don't know how to handle this terminator!");
-}
-
-// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
-// block to the 'To' basic block is currently feasible.
+ return;
+ }
+
+  // If this is an indirect branch whose address is a blockaddress, we mark
+  // the target as executable.
+ if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+ // Casts are folded by visitCastInst.
+ ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(getConstant(IBRValue));
+ if (!Addr) { // Overdefined or unknown condition?
+ // All destinations are executable!
+ if (!IBRValue.isUnknownOrUndef())
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ BasicBlock* T = Addr->getBasicBlock();
+ assert(Addr->getFunction() == T->getParent() &&
+ "Block address of a different function ?");
+ for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+ // This is the target.
+ if (IBR->getDestination(i) == T) {
+ Succs[i] = true;
+ return;
+ }
+ }
+
+ // If we didn't find our destination in the IBR successor list, then we
+    // have undefined behavior. It's ok to assume no successor is executable.
+ return;
+ }
+
+ // In case of callbr, we pessimistically assume that all successors are
+ // feasible.
+ if (isa<CallBrInst>(&TI)) {
+ Succs.assign(TI.getNumSuccessors(), true);
+ return;
+ }
+
+ LLVM_DEBUG(dbgs() << "Unknown terminator instruction: " << TI << '\n');
+ llvm_unreachable("SCCP: Don't know how to handle this terminator!");
+}
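
getFeasibleSuccessors above turns the lattice state of a terminator's condition into a per-successor feasibility bitmap: an unknown condition marks nothing yet, a known constant selects exactly one edge, and anything else makes every successor reachable. A reduced sketch of just the two-way branch case, with a toy condition lattice in place of the LLVM types:

#include <iostream>
#include <optional>
#include <vector>

enum class State { Unknown, Constant, Overdefined };
struct Cond { State S; std::optional<bool> Value; }; // lattice of an i1 condition

// Succs[0] is the "true" successor and Succs[1] the "false" successor,
// mirroring how a constant condition above selects Succs[CI->isZero()].
std::vector<bool> feasibleSuccessors(const Cond &C) {
  std::vector<bool> Succs(2, false);
  if (C.S == State::Unknown)
    return Succs;                       // don't mark anything yet
  if (C.S == State::Constant && C.Value) {
    Succs[*C.Value ? 0 : 1] = true;     // exactly one arm is reachable
    return Succs;
  }
  Succs[0] = Succs[1] = true;           // overdefined: either way is possible
  return Succs;
}

int main() {
  auto S = feasibleSuccessors({State::Constant, false});
  std::cout << S[0] << " " << S[1] << "\n"; // prints "0 1": only the false arm
}
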
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible.
bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) const {
- // Check if we've called markEdgeExecutable on the edge yet. (We could
- // be more aggressive and try to consider edges which haven't been marked
- // yet, but there isn't any need.)
- return KnownFeasibleEdges.count(Edge(From, To));
-}
-
-// visit Implementations - Something changed in this instruction, either an
-// operand made a transition, or the instruction is newly executable. Change
-// the value type of I to reflect these changes if appropriate. This method
-// makes sure to do the following actions:
-//
-// 1. If a phi node merges two constants in, and has conflicting values coming
-// from different branches, or if the PHI node merges in an overdefined
-// value, then the PHI node becomes overdefined.
-// 2. If a phi node merges only constants in, and they all agree on value, the
-// PHI node becomes a constant value equal to that.
-// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
-// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
-// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
-// 6. If a conditional branch has a value that is constant, make the selected
-// destination executable
-// 7. If a conditional branch has a value that is overdefined, make all
-// successors executable.
-void SCCPSolver::visitPHINode(PHINode &PN) {
- // If this PN returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (PN.getType()->isStructTy())
- return (void)markOverdefined(&PN);
-
- if (getValueState(&PN).isOverdefined())
- return; // Quick exit
-
- // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
- // and slow us down a lot. Just mark them overdefined.
- if (PN.getNumIncomingValues() > 64)
- return (void)markOverdefined(&PN);
-
- unsigned NumActiveIncoming = 0;
-
- // Look at all of the executable operands of the PHI node. If any of them
- // are overdefined, the PHI becomes overdefined as well. If they are all
- // constant, and they agree with each other, the PHI becomes the identical
- // constant. If they are constant and don't agree, the PHI is a constant
- // range. If there are no executable operands, the PHI remains unknown.
- ValueLatticeElement PhiState = getValueState(&PN);
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
- continue;
-
- ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
- PhiState.mergeIn(IV);
- NumActiveIncoming++;
- if (PhiState.isOverdefined())
- break;
- }
-
- // We allow up to 1 range extension per active incoming value and one
- // additional extension. Note that we manually adjust the number of range
- // extensions to match the number of active incoming values. This helps to
- // limit multiple extensions caused by the same incoming value, if other
- // incoming values are equal.
- mergeInValue(&PN, PhiState,
- ValueLatticeElement::MergeOptions().setMaxWidenSteps(
- NumActiveIncoming + 1));
- ValueLatticeElement &PhiStateRef = getValueState(&PN);
- PhiStateRef.setNumRangeExtensions(
- std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
-}
-
-void SCCPSolver::visitReturnInst(ReturnInst &I) {
- if (I.getNumOperands() == 0) return; // ret void
-
- Function *F = I.getParent()->getParent();
- Value *ResultOp = I.getOperand(0);
-
- // If we are tracking the return value of this function, merge it in.
- if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
- auto TFRVI = TrackedRetVals.find(F);
- if (TFRVI != TrackedRetVals.end()) {
- mergeInValue(TFRVI->second, F, getValueState(ResultOp));
- return;
- }
- }
-
- // Handle functions that return multiple values.
- if (!TrackedMultipleRetVals.empty()) {
- if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
- if (MRVFunctionsTracked.count(F))
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
- getStructValueState(ResultOp, i));
- }
-}
-
-void SCCPSolver::visitTerminator(Instruction &TI) {
- SmallVector<bool, 16> SuccFeasible;
- getFeasibleSuccessors(TI, SuccFeasible);
-
- BasicBlock *BB = TI.getParent();
-
- // Mark all feasible successors executable.
- for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
- if (SuccFeasible[i])
- markEdgeExecutable(BB, TI.getSuccessor(i));
-}
-
-void SCCPSolver::visitCastInst(CastInst &I) {
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return;
-
- ValueLatticeElement OpSt = getValueState(I.getOperand(0));
- if (Constant *OpC = getConstant(OpSt)) {
- // Fold the constant as we build.
- Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
- if (isa<UndefValue>(C))
- return;
- // Propagate constant value
- markConstant(&I, C);
- } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
- auto &LV = getValueState(&I);
- ConstantRange OpRange = OpSt.getConstantRange();
- Type *DestTy = I.getDestTy();
+ // Check if we've called markEdgeExecutable on the edge yet. (We could
+ // be more aggressive and try to consider edges which haven't been marked
+ // yet, but there isn't any need.)
+ return KnownFeasibleEdges.count(Edge(From, To));
+}
+
+// visit Implementations - Something changed in this instruction, either an
+// operand made a transition, or the instruction is newly executable. Change
+// the value type of I to reflect these changes if appropriate. This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting values coming
+// from different branches, or if the PHI node merges in an overdefined
+// value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+// PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+// destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+// successors executable.
+void SCCPSolver::visitPHINode(PHINode &PN) {
+ // If this PN returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (PN.getType()->isStructTy())
+ return (void)markOverdefined(&PN);
+
+ if (getValueState(&PN).isOverdefined())
+ return; // Quick exit
+
+ // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+ // and slow us down a lot. Just mark them overdefined.
+ if (PN.getNumIncomingValues() > 64)
+ return (void)markOverdefined(&PN);
+
+ unsigned NumActiveIncoming = 0;
+
+ // Look at all of the executable operands of the PHI node. If any of them
+ // are overdefined, the PHI becomes overdefined as well. If they are all
+ // constant, and they agree with each other, the PHI becomes the identical
+ // constant. If they are constant and don't agree, the PHI is a constant
+ // range. If there are no executable operands, the PHI remains unknown.
+ ValueLatticeElement PhiState = getValueState(&PN);
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
+ continue;
+
+ ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ NumActiveIncoming++;
+ if (PhiState.isOverdefined())
+ break;
+ }
+
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ mergeInValue(&PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ NumActiveIncoming + 1));
+ ValueLatticeElement &PhiStateRef = getValueState(&PN);
+ PhiStateRef.setNumRangeExtensions(
+ std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
+}
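
visitPHINode above merges only those incoming values whose edges are currently feasible and stops as soon as the result is overdefined; the per-incoming widening limit on ranges is omitted here. A stripped-down sketch of that merge with a self-contained toy lattice:

#include <optional>
#include <vector>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
  bool mergeIn(const Lattice &O) {
    if (O.K == Unknown || K == Overdefined) return false;
    if (O.K == Overdefined || (K == Constant && O.C != C)) {
      K = Overdefined; C.reset(); return true;
    }
    if (K == Unknown) { K = Constant; C = O.C; return true; }
    return false;                    // merging in the same constant
  }
};

struct Incoming { Lattice Val; bool EdgeFeasible; };

// Merge the feasible incoming values of a phi, bailing out early once the
// result can no longer change, just as the code above does.
Lattice mergePhi(const std::vector<Incoming> &Ins) {
  Lattice Phi;
  for (const Incoming &In : Ins) {
    if (!In.EdgeFeasible)
      continue;                      // dead predecessors contribute nothing
    Phi.mergeIn(In.Val);
    if (Phi.K == Lattice::Overdefined)
      break;
  }
  return Phi;                        // stays Unknown if no predecessor is live
}

int main() {
  Lattice Five{Lattice::Constant, 5}, Seven{Lattice::Constant, 7};
  Lattice A = mergePhi({{Five, true}, {Seven, false}}); // constant 5
  Lattice B = mergePhi({{Five, true}, {Seven, true}});  // overdefined
  (void)A; (void)B;
}
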
+
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+ if (I.getNumOperands() == 0) return; // ret void
+
+ Function *F = I.getParent()->getParent();
+ Value *ResultOp = I.getOperand(0);
+
+ // If we are tracking the return value of this function, merge it in.
+ if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
+ auto TFRVI = TrackedRetVals.find(F);
+ if (TFRVI != TrackedRetVals.end()) {
+ mergeInValue(TFRVI->second, F, getValueState(ResultOp));
+ return;
+ }
+ }
+
+ // Handle functions that return multiple values.
+ if (!TrackedMultipleRetVals.empty()) {
+ if (auto *STy = dyn_cast<StructType>(ResultOp->getType()))
+ if (MRVFunctionsTracked.count(F))
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(TrackedMultipleRetVals[std::make_pair(F, i)], F,
+ getStructValueState(ResultOp, i));
+ }
+}
+
+void SCCPSolver::visitTerminator(Instruction &TI) {
+ SmallVector<bool, 16> SuccFeasible;
+ getFeasibleSuccessors(TI, SuccFeasible);
+
+ BasicBlock *BB = TI.getParent();
+
+ // Mark all feasible successors executable.
+ for (unsigned i = 0, e = SuccFeasible.size(); i != e; ++i)
+ if (SuccFeasible[i])
+ markEdgeExecutable(BB, TI.getSuccessor(i));
+}
+
+void SCCPSolver::visitCastInst(CastInst &I) {
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return;
+
+ ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ if (Constant *OpC = getConstant(OpSt)) {
+ // Fold the constant as we build.
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
+ if (isa<UndefValue>(C))
+ return;
+ // Propagate constant value
+ markConstant(&I, C);
+ } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
+ auto &LV = getValueState(&I);
+ ConstantRange OpRange = OpSt.getConstantRange();
+ Type *DestTy = I.getDestTy();
// Vectors where all elements have the same known constant range are treated
// as a single constant range in the lattice. When bitcasting such vectors,
// there is a mis-match between the width of the lattice value (single
@@ -858,456 +858,456 @@ void SCCPSolver::visitCastInst(CastInst &I) {
OpRange.getBitWidth() < DL.getTypeSizeInBits(DestTy))
return (void)markOverdefined(&I);
- ConstantRange Res =
- OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
- mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
- } else if (!OpSt.isUnknownOrUndef())
- markOverdefined(&I);
-}
-
-void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
- // If this returns a struct, mark all elements over defined, we don't track
- // structs in structs.
- if (EVI.getType()->isStructTy())
- return (void)markOverdefined(&EVI);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&EVI].isOverdefined())
- return (void)markOverdefined(&EVI);
-
- // If this is extracting from more than one level of struct, we don't know.
- if (EVI.getNumIndices() != 1)
- return (void)markOverdefined(&EVI);
-
- Value *AggVal = EVI.getAggregateOperand();
- if (AggVal->getType()->isStructTy()) {
- unsigned i = *EVI.idx_begin();
- ValueLatticeElement EltVal = getStructValueState(AggVal, i);
- mergeInValue(getValueState(&EVI), &EVI, EltVal);
- } else {
- // Otherwise, must be extracting from an array.
- return (void)markOverdefined(&EVI);
- }
-}
-
-void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
- auto *STy = dyn_cast<StructType>(IVI.getType());
- if (!STy)
- return (void)markOverdefined(&IVI);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (isOverdefined(ValueState[&IVI]))
- return (void)markOverdefined(&IVI);
-
- // If this has more than one index, we can't handle it, drive all results to
- // undef.
- if (IVI.getNumIndices() != 1)
- return (void)markOverdefined(&IVI);
-
- Value *Aggr = IVI.getAggregateOperand();
- unsigned Idx = *IVI.idx_begin();
-
- // Compute the result based on what we're inserting.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- // This passes through all values that aren't the inserted element.
- if (i != Idx) {
- ValueLatticeElement EltVal = getStructValueState(Aggr, i);
- mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
- continue;
- }
-
- Value *Val = IVI.getInsertedValueOperand();
- if (Val->getType()->isStructTy())
- // We don't track structs in structs.
- markOverdefined(getStructValueState(&IVI, i), &IVI);
- else {
- ValueLatticeElement InVal = getValueState(Val);
- mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
- }
- }
-}
-
-void SCCPSolver::visitSelectInst(SelectInst &I) {
- // If this select returns a struct, just mark the result overdefined.
- // TODO: We could do a lot better than this if code actually uses this.
- if (I.getType()->isStructTy())
- return (void)markOverdefined(&I);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return (void)markOverdefined(&I);
-
- ValueLatticeElement CondValue = getValueState(I.getCondition());
- if (CondValue.isUnknownOrUndef())
- return;
-
- if (ConstantInt *CondCB = getConstantInt(CondValue)) {
- Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
- mergeInValue(&I, getValueState(OpVal));
- return;
- }
-
- // Otherwise, the condition is overdefined or a constant we can't evaluate.
- // See if we can produce something better than overdefined based on the T/F
- // value.
- ValueLatticeElement TVal = getValueState(I.getTrueValue());
- ValueLatticeElement FVal = getValueState(I.getFalseValue());
-
- bool Changed = ValueState[&I].mergeIn(TVal);
- Changed |= ValueState[&I].mergeIn(FVal);
- if (Changed)
- pushToWorkListMsg(ValueState[&I], &I);
-}
-
-// Handle Unary Operators.
-void SCCPSolver::visitUnaryOperator(Instruction &I) {
- ValueLatticeElement V0State = getValueState(I.getOperand(0));
-
- ValueLatticeElement &IV = ValueState[&I];
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (isOverdefined(IV))
- return (void)markOverdefined(&I);
-
- if (isConstant(V0State)) {
- Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State));
-
- // op Y -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
- }
-
- // If something is undef, wait for it to resolve.
- if (!isOverdefined(V0State))
- return;
-
- markOverdefined(&I);
-}
-
-// Handle Binary Operators.
-void SCCPSolver::visitBinaryOperator(Instruction &I) {
- ValueLatticeElement V1State = getValueState(I.getOperand(0));
- ValueLatticeElement V2State = getValueState(I.getOperand(1));
-
- ValueLatticeElement &IV = ValueState[&I];
- if (IV.isOverdefined())
- return;
-
- // If something is undef, wait for it to resolve.
- if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef())
- return;
-
- if (V1State.isOverdefined() && V2State.isOverdefined())
- return (void)markOverdefined(&I);
-
- // If either of the operands is a constant, try to fold it to a constant.
- // TODO: Use information from notconstant better.
- if ((V1State.isConstant() || V2State.isConstant())) {
- Value *V1 = isConstant(V1State) ? getConstant(V1State) : I.getOperand(0);
- Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1);
- Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
- auto *C = dyn_cast_or_null<Constant>(R);
- if (C) {
- // X op Y -> undef.
- if (isa<UndefValue>(C))
- return;
- // Conservatively assume that the result may be based on operands that may
- // be undef. Note that we use mergeInValue to combine the constant with
- // the existing lattice value for I, as different constants might be found
-      // after one of the operands goes to overdefined, e.g. due to one operand
- // being a special floating value.
- ValueLatticeElement NewV;
- NewV.markConstant(C, /*MayIncludeUndef=*/true);
- return (void)mergeInValue(&I, NewV);
- }
- }
-
- // Only use ranges for binary operators on integers.
- if (!I.getType()->isIntegerTy())
- return markOverdefined(&I);
-
- // Try to simplify to a constant range.
- ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
- ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
- if (V1State.isConstantRange())
- A = V1State.getConstantRange();
- if (V2State.isConstantRange())
- B = V2State.getConstantRange();
-
- ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
- mergeInValue(&I, ValueLatticeElement::getRange(R));
-
- // TODO: Currently we do not exploit special values that produce something
- // better than overdefined with an overdefined operand for vector or floating
- // point types, like and <4 x i32> overdefined, zeroinitializer.
-}
-
-// Handle ICmpInst instruction.
-void SCCPSolver::visitCmpInst(CmpInst &I) {
- // Do not cache this lookup, getValueState calls later in the function might
- // invalidate the reference.
- if (isOverdefined(ValueState[&I]))
- return (void)markOverdefined(&I);
-
- Value *Op1 = I.getOperand(0);
- Value *Op2 = I.getOperand(1);
-
- // For parameters, use ParamState which includes constant range info if
- // available.
- auto V1State = getValueState(Op1);
- auto V2State = getValueState(Op2);
-
- Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
- if (C) {
- if (isa<UndefValue>(C))
- return;
- ValueLatticeElement CV;
- CV.markConstant(C);
- mergeInValue(&I, CV);
- return;
- }
-
-  // If operands are still unknown, wait for them to resolve.
- if ((V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) &&
- !isConstant(ValueState[&I]))
- return;
-
- markOverdefined(&I);
-}
-
-// Handle getelementptr instructions. If all operands are constants then we
-// can turn this into a getelementptr ConstantExpr.
-void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
- if (isOverdefined(ValueState[&I]))
- return (void)markOverdefined(&I);
-
- SmallVector<Constant*, 8> Operands;
- Operands.reserve(I.getNumOperands());
-
- for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
- ValueLatticeElement State = getValueState(I.getOperand(i));
- if (State.isUnknownOrUndef())
- return; // Operands are not resolved yet.
-
- if (isOverdefined(State))
- return (void)markOverdefined(&I);
-
- if (Constant *C = getConstant(State)) {
- Operands.push_back(C);
- continue;
- }
-
- return (void)markOverdefined(&I);
- }
-
- Constant *Ptr = Operands[0];
- auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
- Constant *C =
- ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
- if (isa<UndefValue>(C))
- return;
- markConstant(&I, C);
-}
-
-void SCCPSolver::visitStoreInst(StoreInst &SI) {
- // If this store is of a struct, ignore it.
- if (SI.getOperand(0)->getType()->isStructTy())
- return;
-
- if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
- return;
-
- GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
- auto I = TrackedGlobals.find(GV);
- if (I == TrackedGlobals.end())
- return;
-
- // Get the value we are storing into the global, then merge it.
- mergeInValue(I->second, GV, getValueState(SI.getOperand(0)),
- ValueLatticeElement::MergeOptions().setCheckWiden(false));
- if (I->second.isOverdefined())
- TrackedGlobals.erase(I); // No need to keep tracking this!
-}
-
-static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
- if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range))
- if (I->getType()->isIntegerTy())
- return ValueLatticeElement::getRange(
- getConstantRangeFromMetadata(*Ranges));
+ ConstantRange Res =
+ OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
+ mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
+ } else if (!OpSt.isUnknownOrUndef())
+ markOverdefined(&I);
+}
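
The range path of visitCastInst above defers to ConstantRange::castOp; the intuition for the simplest cases is that a zero-extension keeps the same unsigned bounds at a wider width, while a truncation is only exact when the range already fits. A toy unsigned-interval version of those two casts (an illustration only, not LLVM's ConstantRange):

#include <cassert>
#include <cstdint>

// Toy unsigned interval [Lo, Hi] tracked together with its bit width.
struct Range { unsigned Bits; uint64_t Lo, Hi; };

uint64_t maxValue(unsigned Bits) {
  return Bits >= 64 ? ~0ULL : (1ULL << Bits) - 1;
}

// zext: the numeric bounds are unchanged, only the width grows.
Range zext(const Range &R, unsigned NewBits) {
  assert(NewBits > R.Bits);
  return {NewBits, R.Lo, R.Hi};
}

// trunc: keep the bounds if they still fit, otherwise fall back to the
// full (conservative) range at the narrower width.
Range trunc(const Range &R, unsigned NewBits) {
  assert(NewBits < R.Bits);
  if (R.Hi <= maxValue(NewBits))
    return {NewBits, R.Lo, R.Hi};
  return {NewBits, 0, maxValue(NewBits)};
}

int main() {
  Range R{8, 10, 20};                     // an i8 known to be in [10, 20]
  Range W = zext(R, 32);                  // still [10, 20], now as an i32
  Range N = trunc(Range{32, 0, 300}, 8);  // does not fit: becomes [0, 255]
  (void)W; (void)N;
}
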
+
+void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
+ // If this returns a struct, mark all elements over defined, we don't track
+ // structs in structs.
+ if (EVI.getType()->isStructTy())
+ return (void)markOverdefined(&EVI);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&EVI].isOverdefined())
+ return (void)markOverdefined(&EVI);
+
+ // If this is extracting from more than one level of struct, we don't know.
+ if (EVI.getNumIndices() != 1)
+ return (void)markOverdefined(&EVI);
+
+ Value *AggVal = EVI.getAggregateOperand();
+ if (AggVal->getType()->isStructTy()) {
+ unsigned i = *EVI.idx_begin();
+ ValueLatticeElement EltVal = getStructValueState(AggVal, i);
+ mergeInValue(getValueState(&EVI), &EVI, EltVal);
+ } else {
+ // Otherwise, must be extracting from an array.
+ return (void)markOverdefined(&EVI);
+ }
+}
+
+void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
+ auto *STy = dyn_cast<StructType>(IVI.getType());
+ if (!STy)
+ return (void)markOverdefined(&IVI);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(ValueState[&IVI]))
+ return (void)markOverdefined(&IVI);
+
+ // If this has more than one index, we can't handle it, drive all results to
+ // undef.
+ if (IVI.getNumIndices() != 1)
+ return (void)markOverdefined(&IVI);
+
+ Value *Aggr = IVI.getAggregateOperand();
+ unsigned Idx = *IVI.idx_begin();
+
+ // Compute the result based on what we're inserting.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ // This passes through all values that aren't the inserted element.
+ if (i != Idx) {
+ ValueLatticeElement EltVal = getStructValueState(Aggr, i);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
+ continue;
+ }
+
+ Value *Val = IVI.getInsertedValueOperand();
+ if (Val->getType()->isStructTy())
+ // We don't track structs in structs.
+ markOverdefined(getStructValueState(&IVI, i), &IVI);
+ else {
+ ValueLatticeElement InVal = getValueState(Val);
+ mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
+ }
+ }
+}
+
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+ // If this select returns a struct, just mark the result overdefined.
+ // TODO: We could do a lot better than this if code actually uses this.
+ if (I.getType()->isStructTy())
+ return (void)markOverdefined(&I);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
+
+ ValueLatticeElement CondValue = getValueState(I.getCondition());
+ if (CondValue.isUnknownOrUndef())
+ return;
+
+ if (ConstantInt *CondCB = getConstantInt(CondValue)) {
+ Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
+ mergeInValue(&I, getValueState(OpVal));
+ return;
+ }
+
+ // Otherwise, the condition is overdefined or a constant we can't evaluate.
+ // See if we can produce something better than overdefined based on the T/F
+ // value.
+ ValueLatticeElement TVal = getValueState(I.getTrueValue());
+ ValueLatticeElement FVal = getValueState(I.getFalseValue());
+
+ bool Changed = ValueState[&I].mergeIn(TVal);
+ Changed |= ValueState[&I].mergeIn(FVal);
+ if (Changed)
+ pushToWorkListMsg(ValueState[&I], &I);
+}
+
+// Handle Unary Operators.
+void SCCPSolver::visitUnaryOperator(Instruction &I) {
+ ValueLatticeElement V0State = getValueState(I.getOperand(0));
+
+ ValueLatticeElement &IV = ValueState[&I];
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(IV))
+ return (void)markOverdefined(&I);
+
+ if (isConstant(V0State)) {
+ Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State));
+
+ // op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(IV, &I, C);
+ }
+
+ // If something is undef, wait for it to resolve.
+ if (!isOverdefined(V0State))
+ return;
+
+ markOverdefined(&I);
+}
+
+// Handle Binary Operators.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+ ValueLatticeElement V1State = getValueState(I.getOperand(0));
+ ValueLatticeElement V2State = getValueState(I.getOperand(1));
+
+ ValueLatticeElement &IV = ValueState[&I];
+ if (IV.isOverdefined())
+ return;
+
+ // If something is undef, wait for it to resolve.
+ if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef())
+ return;
+
+ if (V1State.isOverdefined() && V2State.isOverdefined())
+ return (void)markOverdefined(&I);
+
+ // If either of the operands is a constant, try to fold it to a constant.
+ // TODO: Use information from notconstant better.
+ if ((V1State.isConstant() || V2State.isConstant())) {
+ Value *V1 = isConstant(V1State) ? getConstant(V1State) : I.getOperand(0);
+ Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1);
+ Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
+ auto *C = dyn_cast_or_null<Constant>(R);
+ if (C) {
+ // X op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ // Conservatively assume that the result may be based on operands that may
+ // be undef. Note that we use mergeInValue to combine the constant with
+ // the existing lattice value for I, as different constants might be found
+      // after one of the operands goes to overdefined, e.g. due to one operand
+ // being a special floating value.
+ ValueLatticeElement NewV;
+ NewV.markConstant(C, /*MayIncludeUndef=*/true);
+ return (void)mergeInValue(&I, NewV);
+ }
+ }
+
+ // Only use ranges for binary operators on integers.
+ if (!I.getType()->isIntegerTy())
+ return markOverdefined(&I);
+
+ // Try to simplify to a constant range.
+ ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ if (V1State.isConstantRange())
+ A = V1State.getConstantRange();
+ if (V2State.isConstantRange())
+ B = V2State.getConstantRange();
+
+ ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
+ mergeInValue(&I, ValueLatticeElement::getRange(R));
+
+ // TODO: Currently we do not exploit special values that produce something
+ // better than overdefined with an overdefined operand for vector or floating
+ // point types, like and <4 x i32> overdefined, zeroinitializer.
+}
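
The fallback path of visitBinaryOperator above combines the operand ranges with ConstantRange::binaryOp. For an unsigned add the underlying idea is plain interval arithmetic that gives up and returns the full range whenever the sum might wrap, which is always a sound (if imprecise) answer. A toy version under those assumptions, with both operands at the same width:

#include <cstdint>
#include <iostream>

// Toy unsigned interval [Lo, Hi] at a fixed width.
struct Range { unsigned Bits; uint64_t Lo, Hi; };

uint64_t maxValue(unsigned Bits) {
  return Bits >= 64 ? ~0ULL : (1ULL << Bits) - 1;
}

// Unsigned add of two intervals: exact when the upper bounds cannot wrap,
// otherwise return the full range, which is always sound.
Range addRanges(const Range &A, const Range &B) {
  uint64_t Max = maxValue(A.Bits);
  if (A.Hi <= Max - B.Hi)                      // Hi + Hi stays in range
    return {A.Bits, A.Lo + B.Lo, A.Hi + B.Hi};
  return {A.Bits, 0, Max};                     // possible wrap: give up
}

int main() {
  Range R = addRanges({8, 1, 10}, {8, 2, 20}); // -> [3, 30]
  std::cout << "[" << R.Lo << ", " << R.Hi << "]\n";
}
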
+
+// Handle ICmpInst instruction.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+ // Do not cache this lookup, getValueState calls later in the function might
+ // invalidate the reference.
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
+
+ Value *Op1 = I.getOperand(0);
+ Value *Op2 = I.getOperand(1);
+
+ // For parameters, use ParamState which includes constant range info if
+ // available.
+ auto V1State = getValueState(Op1);
+ auto V2State = getValueState(Op2);
+
+ Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
+ if (C) {
+ if (isa<UndefValue>(C))
+ return;
+ ValueLatticeElement CV;
+ CV.markConstant(C);
+ mergeInValue(&I, CV);
+ return;
+ }
+
+  // If operands are still unknown, wait for them to resolve.
+ if ((V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) &&
+ !isConstant(ValueState[&I]))
+ return;
+
+ markOverdefined(&I);
+}
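
getCompare in visitCmpInst above can fold a comparison outright when the operand lattice values are precise enough; with unsigned ranges the interesting outcomes are "always true", "always false", or no answer. A toy fold of ult over such intervals (illustrative names and types):

#include <cstdint>
#include <iostream>
#include <optional>

struct Range { uint64_t Lo, Hi; };   // toy unsigned interval [Lo, Hi]

// Fold "A ult B": definitely true if every value of A is below every value
// of B, definitely false if no value of A can be, otherwise no constant.
std::optional<bool> foldULT(const Range &A, const Range &B) {
  if (A.Hi < B.Lo) return true;
  if (A.Lo >= B.Hi) return false;
  return std::nullopt;
}

int main() {
  auto T = foldULT({0, 9}, {10, 20});   // true: 9 is still below 10
  auto F = foldULT({10, 20}, {0, 10});  // false: lhs >= 10, rhs <= 10
  auto U = foldULT({0, 15}, {10, 20});  // overlapping ranges: unknown
  std::cout << T.has_value() << F.has_value() << U.has_value() << "\n";
}
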
+
+// Handle getelementptr instructions. If all operands are constants then we
+// can turn this into a getelementptr ConstantExpr.
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
+
+ SmallVector<Constant*, 8> Operands;
+ Operands.reserve(I.getNumOperands());
+
+ for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
+ ValueLatticeElement State = getValueState(I.getOperand(i));
+ if (State.isUnknownOrUndef())
+ return; // Operands are not resolved yet.
+
+ if (isOverdefined(State))
+ return (void)markOverdefined(&I);
+
+ if (Constant *C = getConstant(State)) {
+ Operands.push_back(C);
+ continue;
+ }
+
+ return (void)markOverdefined(&I);
+ }
+
+ Constant *Ptr = Operands[0];
+ auto Indices = makeArrayRef(Operands.begin() + 1, Operands.end());
+ Constant *C =
+ ConstantExpr::getGetElementPtr(I.getSourceElementType(), Ptr, Indices);
+ if (isa<UndefValue>(C))
+ return;
+ markConstant(&I, C);
+}
+
+void SCCPSolver::visitStoreInst(StoreInst &SI) {
+ // If this store is of a struct, ignore it.
+ if (SI.getOperand(0)->getType()->isStructTy())
+ return;
+
+ if (TrackedGlobals.empty() || !isa<GlobalVariable>(SI.getOperand(1)))
+ return;
+
+ GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
+ auto I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end())
+ return;
+
+ // Get the value we are storing into the global, then merge it.
+ mergeInValue(I->second, GV, getValueState(SI.getOperand(0)),
+ ValueLatticeElement::MergeOptions().setCheckWiden(false));
+ if (I->second.isOverdefined())
+ TrackedGlobals.erase(I); // No need to keep tracking this!
+}
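
visitStoreInst above folds each store into the lattice value kept for a tracked global and drops the global from the map once it becomes overdefined, since no further store can improve the result. A condensed sketch of that bookkeeping with a toy lattice keyed by name:

#include <map>
#include <optional>
#include <string>

struct Lattice {
  enum Kind { Unknown, Constant, Overdefined } K = Unknown;
  std::optional<int> C;
  void mergeIn(int V) {
    if (K == Overdefined) return;
    if (K == Constant && *C != V) { K = Overdefined; C.reset(); return; }
    K = Constant; C = V;
  }
};

// Globals for which we are still trying to prove a single constant value.
std::map<std::string, Lattice> TrackedGlobals;

void visitStore(const std::string &Global, int StoredValue) {
  auto It = TrackedGlobals.find(Global);
  if (It == TrackedGlobals.end())
    return;                         // not a global we track
  It->second.mergeIn(StoredValue);
  if (It->second.K == Lattice::Overdefined)
    TrackedGlobals.erase(It);       // conflicting stores: stop tracking
}

int main() {
  TrackedGlobals["g"] = {};
  visitStore("g", 1);               // g is provisionally the constant 1
  visitStore("g", 2);               // conflict: g is dropped from the map
  return TrackedGlobals.count("g"); // 0
}
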
+
+static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
+ if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range))
+ if (I->getType()->isIntegerTy())
+ return ValueLatticeElement::getRange(
+ getConstantRangeFromMetadata(*Ranges));
if (I->hasMetadata(LLVMContext::MD_nonnull))
return ValueLatticeElement::getNot(
ConstantPointerNull::get(cast<PointerType>(I->getType())));
- return ValueLatticeElement::getOverdefined();
-}
-
-// Handle load instructions. If the operand is a constant pointer to a constant
-// global, we can replace the load with the loaded constant value!
-void SCCPSolver::visitLoadInst(LoadInst &I) {
- // If this load is of a struct or the load is volatile, just mark the result
- // as overdefined.
- if (I.getType()->isStructTy() || I.isVolatile())
- return (void)markOverdefined(&I);
-
- // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
- // discover a concrete value later.
- if (ValueState[&I].isOverdefined())
- return (void)markOverdefined(&I);
-
- ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
- if (PtrVal.isUnknownOrUndef())
- return; // The pointer is not resolved yet!
-
- ValueLatticeElement &IV = ValueState[&I];
-
- if (isConstant(PtrVal)) {
- Constant *Ptr = getConstant(PtrVal);
-
- // load null is undefined.
- if (isa<ConstantPointerNull>(Ptr)) {
- if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
- return (void)markOverdefined(IV, &I);
- else
- return;
- }
-
- // Transform load (constant global) into the value loaded.
- if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
- if (!TrackedGlobals.empty()) {
- // If we are tracking this global, merge in the known value for it.
- auto It = TrackedGlobals.find(GV);
- if (It != TrackedGlobals.end()) {
- mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
- return;
- }
- }
- }
-
- // Transform load from a constant into a constant if possible.
- if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
- }
- }
-
- // Fall back to metadata.
- mergeInValue(&I, getValueFromMetadata(&I));
-}
-
-void SCCPSolver::visitCallBase(CallBase &CB) {
- handleCallResult(CB);
- handleCallArguments(CB);
-}
-
-void SCCPSolver::handleCallOverdefined(CallBase &CB) {
- Function *F = CB.getCalledFunction();
-
- // Void return and not tracking callee, just bail.
- if (CB.getType()->isVoidTy())
- return;
-
- // Always mark struct return as overdefined.
- if (CB.getType()->isStructTy())
- return (void)markOverdefined(&CB);
-
- // Otherwise, if we have a single return value case, and if the function is
- // a declaration, maybe we can constant fold it.
- if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
- SmallVector<Constant *, 8> Operands;
- for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
- if (AI->get()->getType()->isStructTy())
- return markOverdefined(&CB); // Can't handle struct args.
- ValueLatticeElement State = getValueState(*AI);
-
- if (State.isUnknownOrUndef())
- return; // Operands are not resolved yet.
- if (isOverdefined(State))
- return (void)markOverdefined(&CB);
- assert(isConstant(State) && "Unknown state!");
- Operands.push_back(getConstant(State));
- }
-
- if (isOverdefined(getValueState(&CB)))
- return (void)markOverdefined(&CB);
-
- // If we can constant fold this, mark the result of the call as a
- // constant.
- if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) {
- // call -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(&CB, C);
- }
- }
-
- // Fall back to metadata.
- mergeInValue(&CB, getValueFromMetadata(&CB));
-}
-
-void SCCPSolver::handleCallArguments(CallBase &CB) {
- Function *F = CB.getCalledFunction();
- // If this is a local function that doesn't have its address taken, mark its
- // entry block executable and merge in the actual arguments to the call into
- // the formal arguments of the function.
- if (!TrackingIncomingArguments.empty() &&
- TrackingIncomingArguments.count(F)) {
- MarkBlockExecutable(&F->front());
-
- // Propagate information from this call site into the callee.
- auto CAI = CB.arg_begin();
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
- ++AI, ++CAI) {
- // If this argument is byval, and if the function is not readonly, there
- // will be an implicit copy formed of the input aggregate.
- if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
- markOverdefined(&*AI);
- continue;
- }
-
- if (auto *STy = dyn_cast<StructType>(AI->getType())) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- ValueLatticeElement CallArg = getStructValueState(*CAI, i);
- mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
- getMaxWidenStepsOpts());
- }
- } else
- mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
- }
- }
-}
-
-void SCCPSolver::handleCallResult(CallBase &CB) {
- Function *F = CB.getCalledFunction();
-
- if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- if (ValueState[&CB].isOverdefined())
- return;
-
- Value *CopyOf = CB.getOperand(0);
- ValueLatticeElement CopyOfVal = getValueState(CopyOf);
- auto *PI = getPredicateInfoFor(&CB);
- assert(PI && "Missing predicate info for ssa.copy");
-
+ return ValueLatticeElement::getOverdefined();
+}
+
+// Handle load instructions. If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+ // If this load is of a struct or the load is volatile, just mark the result
+ // as overdefined.
+ if (I.getType()->isStructTy() || I.isVolatile())
+ return (void)markOverdefined(&I);
+
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
+
+ ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUnknownOrUndef())
+ return; // The pointer is not resolved yet!
+
+ ValueLatticeElement &IV = ValueState[&I];
+
+ if (isConstant(PtrVal)) {
+ Constant *Ptr = getConstant(PtrVal);
+
+ // load null is undefined.
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
+ return;
+ }
+
+ // Transform load (constant global) into the value loaded.
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ auto It = TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
+ return;
+ }
+ }
+ }
+
+ // Transform load from a constant into a constant if possible.
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(IV, &I, C);
+ }
+ }
+
+ // Fall back to metadata.
+ mergeInValue(&I, getValueFromMetadata(&I));
+}
+
+void SCCPSolver::visitCallBase(CallBase &CB) {
+ handleCallResult(CB);
+ handleCallArguments(CB);
+}
+
+void SCCPSolver::handleCallOverdefined(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+
+ // Void return and not tracking callee, just bail.
+ if (CB.getType()->isVoidTy())
+ return;
+
+ // Always mark struct return as overdefined.
+ if (CB.getType()->isStructTy())
+ return (void)markOverdefined(&CB);
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
+ SmallVector<Constant *, 8> Operands;
+ for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
+ if (AI->get()->getType()->isStructTy())
+ return markOverdefined(&CB); // Can't handle struct args.
+ ValueLatticeElement State = getValueState(*AI);
+
+ if (State.isUnknownOrUndef())
+ return; // Operands are not resolved yet.
+ if (isOverdefined(State))
+ return (void)markOverdefined(&CB);
+ assert(isConstant(State) && "Unknown state!");
+ Operands.push_back(getConstant(State));
+ }
+
+ if (isOverdefined(getValueState(&CB)))
+ return (void)markOverdefined(&CB);
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) {
+ // call -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(&CB, C);
+ }
+ }
+
+ // Fall back to metadata.
+ mergeInValue(&CB, getValueFromMetadata(&CB));
+}
+
+void SCCPSolver::handleCallArguments(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ // If this is a local function that doesn't have its address taken, mark its
+ // entry block executable and merge in the actual arguments to the call into
+ // the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() &&
+ TrackingIncomingArguments.count(F)) {
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ auto CAI = CB.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
+ getMaxWidenStepsOpts());
+ }
+ } else
+ mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
+ }
+ }
+}
+
+void SCCPSolver::handleCallResult(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+
+ if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ if (ValueState[&CB].isOverdefined())
+ return;
+
+ Value *CopyOf = CB.getOperand(0);
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ auto *PI = getPredicateInfoFor(&CB);
+ assert(PI && "Missing predicate info for ssa.copy");
+
const Optional<PredicateConstraint> &Constraint = PI->getConstraint();
if (!Constraint) {
- mergeInValue(ValueState[&CB], &CB, CopyOfVal);
- return;
- }
-
+ mergeInValue(ValueState[&CB], &CB, CopyOfVal);
+ return;
+ }
+
CmpInst::Predicate Pred = Constraint->Predicate;
Value *OtherOp = Constraint->OtherOp;
-
+
// Wait until OtherOp is resolved.
if (getValueState(OtherOp).isUnknown()) {
addAdditionalUser(OtherOp, &CB);
- return;
- }
-
+ return;
+ }
+
       // TODO: Actually flip MayIncludeUndef for the created range to false,
       // once most places in the optimizer respect the rule that branches on
       // undef/poison are UB. The reason why the new range cannot be
@@ -1318,42 +1318,42 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
// i32, %a, i32_max). For the latter overdefined/empty range will be
// inferred, but the branch will get folded accordingly anyways.
bool MayIncludeUndef = !isa<PredicateAssume>(PI);
-
+
ValueLatticeElement CondVal = getValueState(OtherOp);
- ValueLatticeElement &IV = ValueState[&CB];
- if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
- auto ImposedCR =
- ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
-
- // Get the range imposed by the condition.
- if (CondVal.isConstantRange())
- ImposedCR = ConstantRange::makeAllowedICmpRegion(
- Pred, CondVal.getConstantRange());
-
- // Combine range info for the original value with the new range from the
- // condition.
- auto CopyOfCR = CopyOfVal.isConstantRange()
- ? CopyOfVal.getConstantRange()
- : ConstantRange::getFull(
- DL.getTypeSizeInBits(CopyOf->getType()));
- auto NewCR = ImposedCR.intersectWith(CopyOfCR);
- // If the existing information is != x, do not use the information from
- // a chained predicate, as the != x information is more likely to be
- // helpful in practice.
- if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
- NewCR = CopyOfCR;
-
+ ValueLatticeElement &IV = ValueState[&CB];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.isConstantRange()
+ ? CopyOfVal.getConstantRange()
+ : ConstantRange::getFull(
+ DL.getTypeSizeInBits(CopyOf->getType()));
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
addAdditionalUser(OtherOp, &CB);
- mergeInValue(
- IV, &CB,
+ mergeInValue(
+ IV, &CB,
ValueLatticeElement::getRange(NewCR, MayIncludeUndef));
- return;
- } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
- // For non-integer values or integer constant expressions, only
- // propagate equal constants.
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants.
addAdditionalUser(OtherOp, &CB);
- mergeInValue(IV, &CB, CondVal);
- return;
+ mergeInValue(IV, &CB, CondVal);
+ return;
} else if (Pred == CmpInst::ICMP_NE && CondVal.isConstant() &&
!MayIncludeUndef) {
// Propagate inequalities.
@@ -1361,10 +1361,10 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
mergeInValue(IV, &CB,
ValueLatticeElement::getNot(CondVal.getConstant()));
return;
- }
-
- return (void)mergeInValue(IV, &CB, CopyOfVal);
- }
+ }
+
+ return (void)mergeInValue(IV, &CB, CopyOfVal);
+ }
if (ConstantRange::isIntrinsicSupported(II->getIntrinsicID())) {
// Compute result range for intrinsics supported by ConstantRange.
@@ -1384,492 +1384,492 @@ void SCCPSolver::handleCallResult(CallBase &CB) {
ConstantRange::intrinsic(II->getIntrinsicID(), OpRanges);
return (void)mergeInValue(II, ValueLatticeElement::getRange(Result));
}
- }
-
- // The common case is that we aren't tracking the callee, either because we
- // are not doing interprocedural analysis or the callee is indirect, or is
- // external. Handle these cases first.
- if (!F || F->isDeclaration())
- return handleCallOverdefined(CB);
-
- // If this is a single/zero retval case, see if we're tracking the function.
- if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
- if (!MRVFunctionsTracked.count(F))
- return handleCallOverdefined(CB); // Not tracking this callee.
-
- // If we are tracking this callee, propagate the result of the function
- // into this call site.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- mergeInValue(getStructValueState(&CB, i), &CB,
- TrackedMultipleRetVals[std::make_pair(F, i)],
- getMaxWidenStepsOpts());
- } else {
- auto TFRVI = TrackedRetVals.find(F);
- if (TFRVI == TrackedRetVals.end())
- return handleCallOverdefined(CB); // Not tracking this callee.
-
- // If so, propagate the return value of the callee into this call result.
- mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
- }
-}
-
-void SCCPSolver::Solve() {
- // Process the work lists until they are empty!
- while (!BBWorkList.empty() || !InstWorkList.empty() ||
- !OverdefinedInstWorkList.empty()) {
- // Process the overdefined instruction's work list first, which drives other
- // things to overdefined more quickly.
- while (!OverdefinedInstWorkList.empty()) {
- Value *I = OverdefinedInstWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
-
- // "I" got into the work list because it either made the transition from
- // bottom to constant, or to overdefined.
- //
- // Anything on this worklist that is overdefined need not be visited
-      // since all of its users will have already been marked as overdefined.
- // Update all of the users of this instruction's value.
- //
- markUsersAsChanged(I);
- }
-
- // Process the instruction work list.
- while (!InstWorkList.empty()) {
- Value *I = InstWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
-
- // "I" got into the work list because it made the transition from undef to
- // constant.
- //
- // Anything on this worklist that is overdefined need not be visited
- // since all of its users will have already been marked as overdefined.
- // Update all of the users of this instruction's value.
- //
- if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
- markUsersAsChanged(I);
- }
-
- // Process the basic block work list.
- while (!BBWorkList.empty()) {
+ }
+
+ // The common case is that we aren't tracking the callee, either because we
+ // are not doing interprocedural analysis or the callee is indirect, or is
+ // external. Handle these cases first.
+ if (!F || F->isDeclaration())
+ return handleCallOverdefined(CB);
+
+ // If this is a single/zero retval case, see if we're tracking the function.
+ if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
+ if (!MRVFunctionsTracked.count(F))
+ return handleCallOverdefined(CB); // Not tracking this callee.
+
+ // If we are tracking this callee, propagate the result of the function
+ // into this call site.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
+ mergeInValue(getStructValueState(&CB, i), &CB,
+ TrackedMultipleRetVals[std::make_pair(F, i)],
+ getMaxWidenStepsOpts());
+ } else {
+ auto TFRVI = TrackedRetVals.find(F);
+ if (TFRVI == TrackedRetVals.end())
+ return handleCallOverdefined(CB); // Not tracking this callee.
+
+ // If so, propagate the return value of the callee into this call result.
+ mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
+ }
+}
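
handleCallResult's ssa.copy branch refines a value's known range by intersecting it with the region implied by the dominating predicate. A minimal standalone version of that intersection step, using a hypothetical closed-interval Interval type instead of LLVM's ConstantRange, might look like this:

    // Intersect the range already known for a value with the range implied by
    // a dominating predicate, e.g. "x < 50" on a value known to lie in [0, 100].
    // Closed intervals; Lo > Hi means the interval is empty.
    #include <algorithm>
    #include <climits>
    #include <cstdio>

    struct Interval {
      long Lo, Hi; // inclusive bounds
      bool empty() const { return Lo > Hi; }
      Interval intersectWith(Interval O) const {
        return {std::max(Lo, O.Lo), std::min(Hi, O.Hi)};
      }
    };

    // Region allowed by a signed "v < C" comparison (rough analogue of
    // makeAllowedICmpRegion for ICMP_SLT; assumes C > LONG_MIN).
    static Interval allowedSlt(long C) { return {LONG_MIN, C - 1}; }

    int main() {
      Interval Known{0, 100};            // range known before the predicate
      Interval Imposed = allowedSlt(50); // branch condition: v < 50
      Interval Refined = Known.intersectWith(Imposed);
      std::printf("[%ld, %ld]\n", Refined.Lo, Refined.Hi); // prints [0, 49]
    }
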
+
+void SCCPSolver::Solve() {
+ // Process the work lists until they are empty!
+ while (!BBWorkList.empty() || !InstWorkList.empty() ||
+ !OverdefinedInstWorkList.empty()) {
+ // Process the overdefined instruction's work list first, which drives other
+ // things to overdefined more quickly.
+ while (!OverdefinedInstWorkList.empty()) {
+ Value *I = OverdefinedInstWorkList.pop_back_val();
+
+ LLVM_DEBUG(dbgs() << "\nPopped off OI-WL: " << *I << '\n');
+
+ // "I" got into the work list because it either made the transition from
+ // bottom to constant, or to overdefined.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ markUsersAsChanged(I);
+ }
+
+ // Process the instruction work list.
+ while (!InstWorkList.empty()) {
+ Value *I = InstWorkList.pop_back_val();
+
+ LLVM_DEBUG(dbgs() << "\nPopped off I-WL: " << *I << '\n');
+
+ // "I" got into the work list because it made the transition from undef to
+ // constant.
+ //
+ // Anything on this worklist that is overdefined need not be visited
+ // since all of its users will have already been marked as overdefined.
+ // Update all of the users of this instruction's value.
+ //
+ if (I->getType()->isStructTy() || !getValueState(I).isOverdefined())
+ markUsersAsChanged(I);
+ }
+
+ // Process the basic block work list.
+ while (!BBWorkList.empty()) {
BasicBlock *BB = BBWorkList.pop_back_val();
-
- LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
-
- // Notify all instructions in this basic block that they are newly
- // executable.
- visit(BB);
- }
- }
-}
-
-/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
-/// that branches on undef values cannot reach any of their successors.
-/// However, this is not a safe assumption. After we solve dataflow, this
-/// method should be used to handle this. If this returns true, the solver
-/// should be rerun.
-///
-/// This method handles this by finding an unresolved branch and marking one
-/// of the edges from the block as feasible, even though the condition
-/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
-/// CFG and only slightly pessimizes the analysis results (by marking one,
-/// potentially infeasible, edge feasible). This cannot usefully modify the
-/// constraints on the condition of the branch, as that would impact other users
-/// of the value.
-///
-/// This scan also checks for values that use undefs. It conservatively marks
-/// them as overdefined.
-bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+
+ LLVM_DEBUG(dbgs() << "\nPopped off BBWL: " << *BB << '\n');
+
+ // Notify all instructions in this basic block that they are newly
+ // executable.
+ visit(BB);
+ }
+ }
+}
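
Solve() is a classic worklist fixpoint: pop a value whose lattice state changed, revisit its users, and repeat until all three worklists drain. Reduced to a toy graph of "copy" nodes (hypothetical Node layout, no LLVM types), the shape of that loop is roughly:

    // Toy worklist solver: every node copies the value of its single input.
    // When a node's state changes, its users are pushed back on the worklist,
    // mirroring what markUsersAsChanged() does for the real solver.
    #include <cstdio>
    #include <optional>
    #include <vector>

    struct Node {
      std::optional<int> Val;  // nullopt == value not known yet
      int Input = -1;          // index of the node we copy from; -1 = source
      std::vector<int> Users;  // nodes that read this one
    };

    int main() {
      // n0 := 42; n1 := copy n0; n2 := copy n1
      std::vector<Node> G(3);
      G[0].Val = 42;  G[0].Users = {1};
      G[1].Input = 0; G[1].Users = {2};
      G[2].Input = 1;

      std::vector<int> Work = {0}; // seed with the node whose state is known
      while (!Work.empty()) {
        int N = Work.back();
        Work.pop_back();
        for (int U : G[N].Users) {
          std::optional<int> New = G[G[U].Input].Val; // re-evaluate the user
          if (New && New != G[U].Val) {               // state changed
            G[U].Val = New;
            Work.push_back(U);                        // requeue its users next
          }
        }
      }
      std::printf("n2 = %d\n", *G[2].Val); // prints n2 = 42
    }
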
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption. After we solve dataflow, this
+/// method should be used to handle this. If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one
+/// of the edges from the block as feasible, even though the condition
+/// doesn't say it would otherwise be. This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible). This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs. It conservatively marks
+/// them as overdefined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
bool MadeChange = false;
- for (BasicBlock &BB : F) {
- if (!BBExecutable.count(&BB))
- continue;
-
- for (Instruction &I : BB) {
- // Look for instructions which produce undef values.
- if (I.getType()->isVoidTy()) continue;
-
- if (auto *STy = dyn_cast<StructType>(I.getType())) {
- // Only a few things that can be structs matter for undef.
-
- // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *F = CB->getCalledFunction())
- if (MRVFunctionsTracked.count(F))
- continue;
-
- // extractvalue and insertvalue don't need to be marked; they are
- // tracked as precisely as their operands.
- if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
- continue;
- // Send the results of everything else to overdefined. We could be
- // more precise than this but it isn't worth bothering.
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- ValueLatticeElement &LV = getStructValueState(&I, i);
+ for (BasicBlock &BB : F) {
+ if (!BBExecutable.count(&BB))
+ continue;
+
+ for (Instruction &I : BB) {
+ // Look for instructions which produce undef values.
+ if (I.getType()->isVoidTy()) continue;
+
+ if (auto *STy = dyn_cast<StructType>(I.getType())) {
+ // Only a few things that can be structs matter for undef.
+
+ // Tracked calls must never be marked overdefined in ResolvedUndefsIn.
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
+ if (MRVFunctionsTracked.count(F))
+ continue;
+
+ // extractvalue and insertvalue don't need to be marked; they are
+ // tracked as precisely as their operands.
+ if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
+ continue;
+ // Send the results of everything else to overdefined. We could be
+ // more precise than this but it isn't worth bothering.
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement &LV = getStructValueState(&I, i);
if (LV.isUnknownOrUndef()) {
- markOverdefined(LV, &I);
+ markOverdefined(LV, &I);
MadeChange = true;
}
- }
- continue;
- }
-
- ValueLatticeElement &LV = getValueState(&I);
- if (!LV.isUnknownOrUndef())
- continue;
-
- // There are two reasons a call can have an undef result
- // 1. It could be tracked.
- // 2. It could be constant-foldable.
- // Because of the way we solve return values, tracked calls must
- // never be marked overdefined in ResolvedUndefsIn.
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (Function *F = CB->getCalledFunction())
- if (TrackedRetVals.count(F))
- continue;
-
- if (isa<LoadInst>(I)) {
- // A load here means one of two things: a load of undef from a global,
-        // or a load from an unknown pointer. Either way, having it return undef
- // is okay.
- continue;
- }
-
- markOverdefined(&I);
+ }
+ continue;
+ }
+
+ ValueLatticeElement &LV = getValueState(&I);
+ if (!LV.isUnknownOrUndef())
+ continue;
+
+ // There are two reasons a call can have an undef result
+ // 1. It could be tracked.
+ // 2. It could be constant-foldable.
+ // Because of the way we solve return values, tracked calls must
+ // never be marked overdefined in ResolvedUndefsIn.
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
+ if (TrackedRetVals.count(F))
+ continue;
+
+ if (isa<LoadInst>(I)) {
+ // A load here means one of two things: a load of undef from a global,
+        // or a load from an unknown pointer. Either way, having it return undef
+ // is okay.
+ continue;
+ }
+
+ markOverdefined(&I);
MadeChange = true;
- }
-
- // Check to see if we have a branch or switch on an undefined value. If so
- // we force the branch to go one way or the other to make the successor
- // values live. It doesn't really matter which way we force it.
- Instruction *TI = BB.getTerminator();
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
- if (!BI->isConditional()) continue;
- if (!getValueState(BI->getCondition()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually branch on undef, fix the undef to
- // false.
- if (isa<UndefValue>(BI->getCondition())) {
- BI->setCondition(ConstantInt::getFalse(BI->getContext()));
- markEdgeExecutable(&BB, TI->getSuccessor(1));
+ }
+
+ // Check to see if we have a branch or switch on an undefined value. If so
+ // we force the branch to go one way or the other to make the successor
+ // values live. It doesn't really matter which way we force it.
+ Instruction *TI = BB.getTerminator();
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ if (!BI->isConditional()) continue;
+ if (!getValueState(BI->getCondition()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // false.
+ if (isa<UndefValue>(BI->getCondition())) {
+ BI->setCondition(ConstantInt::getFalse(BI->getContext()));
+ markEdgeExecutable(&BB, TI->getSuccessor(1));
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: Distinguish between dead code and an LLVM "undef" value.
- BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = TI->getSuccessor(1);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
-
- if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
-      // Indirect branch with no successors? It's ok to assume it branches
- // to no target.
- if (IBR->getNumSuccessors() < 1)
- continue;
-
- if (!getValueState(IBR->getAddress()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually branch on undef, fix the undef to
- // the first successor of the indirect branch.
- if (isa<UndefValue>(IBR->getAddress())) {
- IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
- markEdgeExecutable(&BB, IBR->getSuccessor(0));
+
+ continue;
+ }
+
+ if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+      // Indirect branch with no successors? It's ok to assume it branches
+ // to no target.
+ if (IBR->getNumSuccessors() < 1)
+ continue;
+
+ if (!getValueState(IBR->getAddress()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually branch on undef, fix the undef to
+ // the first successor of the indirect branch.
+ if (isa<UndefValue>(IBR->getAddress())) {
+ IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+ markEdgeExecutable(&BB, IBR->getSuccessor(0));
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
- // we can assume the branch has undefined behavior instead.
- BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: IndirectBr on "undef" doesn't actually need to go anywhere:
+ // we can assume the branch has undefined behavior instead.
+ BasicBlock *DefaultSuccessor = IBR->getSuccessor(0);
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (!SI->getNumCases() ||
- !getValueState(SI->getCondition()).isUnknownOrUndef())
- continue;
-
- // If the input to SCCP is actually switch on undef, fix the undef to
- // the first constant.
- if (isa<UndefValue>(SI->getCondition())) {
- SI->setCondition(SI->case_begin()->getCaseValue());
- markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
+
+ continue;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ if (!SI->getNumCases() ||
+ !getValueState(SI->getCondition()).isUnknownOrUndef())
+ continue;
+
+ // If the input to SCCP is actually switch on undef, fix the undef to
+ // the first constant.
+ if (isa<UndefValue>(SI->getCondition())) {
+ SI->setCondition(SI->case_begin()->getCaseValue());
+ markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
MadeChange = true;
continue;
- }
-
- // Otherwise, it is a branch on a symbolic value which is currently
- // considered to be undef. Make sure some edge is executable, so a
- // branch on "undef" always flows somewhere.
- // FIXME: Distinguish between dead code and an LLVM "undef" value.
- BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
- if (markEdgeExecutable(&BB, DefaultSuccessor))
+ }
+
+ // Otherwise, it is a branch on a symbolic value which is currently
+ // considered to be undef. Make sure some edge is executable, so a
+ // branch on "undef" always flows somewhere.
+ // FIXME: Distinguish between dead code and an LLVM "undef" value.
+ BasicBlock *DefaultSuccessor = SI->case_begin()->getCaseSuccessor();
+ if (markEdgeExecutable(&BB, DefaultSuccessor))
MadeChange = true;
-
- continue;
- }
- }
-
+
+ continue;
+ }
+ }
+
return MadeChange;
-}
-
-static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
- Constant *Const = nullptr;
- if (V->getType()->isStructTy()) {
- std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
- if (any_of(IVs,
- [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
- return false;
- std::vector<Constant *> ConstVals;
- auto *ST = cast<StructType>(V->getType());
- for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- ValueLatticeElement V = IVs[i];
- ConstVals.push_back(isConstant(V)
- ? Solver.getConstant(V)
- : UndefValue::get(ST->getElementType(i)));
- }
- Const = ConstantStruct::get(ST, ConstVals);
- } else {
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
- if (isOverdefined(IV))
- return false;
-
- Const =
- isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
- }
- assert(Const && "Constant is nullptr here!");
-
-  // Replacing `musttail` instructions with a constant breaks the `musttail`
-  // invariant unless the call itself can be removed.
- CallInst *CI = dyn_cast<CallInst>(V);
- if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
- Function *F = CI->getCalledFunction();
-
- // Don't zap returns of the callee
- if (F)
- Solver.AddMustTailCallee(F);
-
- LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
- << " as a constant\n");
- return false;
- }
-
- LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
-
- // Replaces all of the uses of a variable with uses of the constant.
- V->replaceAllUsesWith(Const);
- return true;
-}
-
-static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB,
- SmallPtrSetImpl<Value *> &InsertedValues,
- Statistic &InstRemovedStat,
- Statistic &InstReplacedStat) {
- bool MadeChanges = false;
- for (Instruction &Inst : make_early_inc_range(BB)) {
- if (Inst.getType()->isVoidTy())
- continue;
- if (tryToReplaceWithConstant(Solver, &Inst)) {
- if (Inst.isSafeToRemove())
- Inst.eraseFromParent();
- // Hey, we just changed something!
- MadeChanges = true;
- ++InstRemovedStat;
- } else if (isa<SExtInst>(&Inst)) {
- Value *ExtOp = Inst.getOperand(0);
- if (isa<Constant>(ExtOp) || InsertedValues.count(ExtOp))
- continue;
- const ValueLatticeElement &IV = Solver.getLatticeValueFor(ExtOp);
- if (!IV.isConstantRange(/*UndefAllowed=*/false))
- continue;
- if (IV.getConstantRange().isAllNonNegative()) {
- auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst);
- InsertedValues.insert(ZExt);
- Inst.replaceAllUsesWith(ZExt);
- Solver.removeLatticeValueFor(&Inst);
- Inst.eraseFromParent();
- InstReplacedStat++;
- MadeChanges = true;
- }
- }
- }
- return MadeChanges;
-}
-
-// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
-// and return true if the function was modified.
-static bool runSCCP(Function &F, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
- SCCPSolver Solver(
- DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
- F.getContext());
-
- // Mark the first block of the function as being executable.
- Solver.MarkBlockExecutable(&F.front());
-
- // Mark all arguments to the function as being overdefined.
- for (Argument &AI : F.args())
- Solver.markOverdefined(&AI);
-
- // Solve for constants.
- bool ResolvedUndefs = true;
- while (ResolvedUndefs) {
- Solver.Solve();
- LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
- ResolvedUndefs = Solver.ResolvedUndefsIn(F);
- }
-
- bool MadeChanges = false;
-
- // If we decided that there are basic blocks that are dead in this function,
- // delete their contents now. Note that we cannot actually delete the blocks,
- // as we cannot modify the CFG of the function.
-
- SmallPtrSet<Value *, 32> InsertedValues;
- for (BasicBlock &BB : F) {
- if (!Solver.isBlockExecutable(&BB)) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
-
- ++NumDeadBlocks;
+}
+
+static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
+ Constant *Const = nullptr;
+ if (V->getType()->isStructTy()) {
+ std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
+ if (any_of(IVs,
+ [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
+ return false;
+ std::vector<Constant *> ConstVals;
+ auto *ST = cast<StructType>(V->getType());
+ for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
+ ValueLatticeElement V = IVs[i];
+ ConstVals.push_back(isConstant(V)
+ ? Solver.getConstant(V)
+ : UndefValue::get(ST->getElementType(i)));
+ }
+ Const = ConstantStruct::get(ST, ConstVals);
+ } else {
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (isOverdefined(IV))
+ return false;
+
+ Const =
+ isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
+ }
+ assert(Const && "Constant is nullptr here!");
+
+  // Replacing `musttail` instructions with a constant breaks the `musttail`
+  // invariant unless the call itself can be removed.
+ CallInst *CI = dyn_cast<CallInst>(V);
+ if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
+ Function *F = CI->getCalledFunction();
+
+ // Don't zap returns of the callee
+ if (F)
+ Solver.AddMustTailCallee(F);
+
+ LLVM_DEBUG(dbgs() << " Can\'t treat the result of musttail call : " << *CI
+ << " as a constant\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << " Constant: " << *Const << " = " << *V << '\n');
+
+ // Replaces all of the uses of a variable with uses of the constant.
+ V->replaceAllUsesWith(Const);
+ return true;
+}
+
+static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB,
+ SmallPtrSetImpl<Value *> &InsertedValues,
+ Statistic &InstRemovedStat,
+ Statistic &InstReplacedStat) {
+ bool MadeChanges = false;
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (Inst.getType()->isVoidTy())
+ continue;
+ if (tryToReplaceWithConstant(Solver, &Inst)) {
+ if (Inst.isSafeToRemove())
+ Inst.eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++InstRemovedStat;
+ } else if (isa<SExtInst>(&Inst)) {
+ Value *ExtOp = Inst.getOperand(0);
+ if (isa<Constant>(ExtOp) || InsertedValues.count(ExtOp))
+ continue;
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(ExtOp);
+ if (!IV.isConstantRange(/*UndefAllowed=*/false))
+ continue;
+ if (IV.getConstantRange().isAllNonNegative()) {
+ auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst);
+ InsertedValues.insert(ZExt);
+ Inst.replaceAllUsesWith(ZExt);
+ Solver.removeLatticeValueFor(&Inst);
+ Inst.eraseFromParent();
+ InstReplacedStat++;
+ MadeChanges = true;
+ }
+ }
+ }
+ return MadeChanges;
+}
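
The second branch of simplifyInstsInBlock rewrites a sext whose operand has a provably non-negative range into a zext; that is sound because sign- and zero-extension agree whenever the top bit of the source is clear. A quick standalone check of the equivalence over all non-negative 8-bit values:

    // For a non-negative 8-bit value, sign extension and zero extension to
    // 32 bits agree, which is what justifies rewriting sext -> zext when the
    // operand's range is known to be non-negative.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>

    int main() {
      for (int v = 0; v <= 127; ++v) { // every non-negative int8_t value
        int8_t Narrow = static_cast<int8_t>(v);
        int32_t SExt = static_cast<int32_t>(Narrow);                        // sign-extend
        int32_t ZExt = static_cast<int32_t>(static_cast<uint8_t>(Narrow));  // zero-extend
        assert(SExt == ZExt && SExt == v);
      }
      std::puts("sext == zext for all non-negative 8-bit values");
    }
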
+
+// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+static bool runSCCP(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
+ SCCPSolver Solver(
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
+ F.getContext());
+
+ // Mark the first block of the function as being executable.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Mark all arguments to the function as being overdefined.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ while (ResolvedUndefs) {
+ Solver.Solve();
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFs\n");
+ ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+ }
+
+ bool MadeChanges = false;
+
+ // If we decided that there are basic blocks that are dead in this function,
+ // delete their contents now. Note that we cannot actually delete the blocks,
+ // as we cannot modify the CFG of the function.
+
+ SmallPtrSet<Value *, 32> InsertedValues;
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+
+ ++NumDeadBlocks;
NumInstRemoved += removeAllNonTerminatorAndEHPadInstructions(&BB).first;
-
- MadeChanges = true;
- continue;
- }
-
- MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
- NumInstRemoved, NumInstReplaced);
- }
-
- return MadeChanges;
-}
-
-PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
- const DataLayout &DL = F.getParent()->getDataLayout();
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- if (!runSCCP(F, DL, &TLI))
- return PreservedAnalyses::all();
-
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-
-//===--------------------------------------------------------------------===//
-//
-/// SCCP Class - This class uses the SCCPSolver to implement a per-function
-/// Sparse Conditional Constant Propagator.
-///
-class SCCPLegacyPass : public FunctionPass {
-public:
- // Pass identification, replacement for typeid
- static char ID;
-
- SCCPLegacyPass() : FunctionPass(ID) {
- initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- // runOnFunction - Run the Sparse Conditional Constant Propagation
- // algorithm, and return true if the function was modified.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- const DataLayout &DL = F.getParent()->getDataLayout();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return runSCCP(F, DL, TLI);
- }
-};
-
-} // end anonymous namespace
-
-char SCCPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
- "Sparse Conditional Constant Propagation", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
- "Sparse Conditional Constant Propagation", false, false)
-
-// createSCCPPass - This is the public interface to this file.
-FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
-
-static void findReturnsToZap(Function &F,
- SmallVector<ReturnInst *, 8> &ReturnsToZap,
- SCCPSolver &Solver) {
- // We can only do this if we know that nothing else can call the function.
- if (!Solver.isArgumentTrackedFunction(&F))
- return;
-
- // There is a non-removable musttail call site of this function. Zapping
- // returns is not allowed.
- if (Solver.isMustTailCallee(&F)) {
- LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
- << " due to present musttail call of it\n");
- return;
- }
-
- assert(
- all_of(F.users(),
- [&Solver](User *U) {
- if (isa<Instruction>(U) &&
- !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
- return true;
- // Non-callsite uses are not impacted by zapping. Also, constant
-               // uses (like blockaddresses) could stick around, without being
- // used in the underlying IR, meaning we do not have lattice
- // values for them.
- if (!isa<CallBase>(U))
- return true;
- if (U->getType()->isStructTy()) {
- return all_of(Solver.getStructLatticeValueFor(U),
- [](const ValueLatticeElement &LV) {
- return !isOverdefined(LV);
- });
- }
- return !isOverdefined(Solver.getLatticeValueFor(U));
- }) &&
- "We can only zap functions where all live users have a concrete value");
-
- for (BasicBlock &BB : F) {
- if (CallInst *CI = BB.getTerminatingMustTailCall()) {
- LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
- << "musttail call : " << *CI << "\n");
- (void)CI;
- return;
- }
-
- if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
- if (!isa<UndefValue>(RI->getOperand(0)))
- ReturnsToZap.push_back(RI);
- }
-}
-
+
+ MadeChanges = true;
+ continue;
+ }
+
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ NumInstRemoved, NumInstReplaced);
+ }
+
+ return MadeChanges;
+}
+
+PreservedAnalyses SCCPPass::run(Function &F, FunctionAnalysisManager &AM) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ if (!runSCCP(F, DL, &TLI))
+ return PreservedAnalyses::all();
+
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+//===--------------------------------------------------------------------===//
+//
+/// SCCP Class - This class uses the SCCPSolver to implement a per-function
+/// Sparse Conditional Constant Propagator.
+///
+class SCCPLegacyPass : public FunctionPass {
+public:
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ SCCPLegacyPass() : FunctionPass(ID) {
+ initializeSCCPLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ // runOnFunction - Run the Sparse Conditional Constant Propagation
+ // algorithm, and return true if the function was modified.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetLibraryInfo *TLI =
+ &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return runSCCP(F, DL, TLI);
+ }
+};
+
+} // end anonymous namespace
+
+char SCCPLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(SCCPLegacyPass, "sccp",
+ "Sparse Conditional Constant Propagation", false, false)
+
+// createSCCPPass - This is the public interface to this file.
+FunctionPass *llvm::createSCCPPass() { return new SCCPLegacyPass(); }
+
+static void findReturnsToZap(Function &F,
+ SmallVector<ReturnInst *, 8> &ReturnsToZap,
+ SCCPSolver &Solver) {
+ // We can only do this if we know that nothing else can call the function.
+ if (!Solver.isArgumentTrackedFunction(&F))
+ return;
+
+ // There is a non-removable musttail call site of this function. Zapping
+ // returns is not allowed.
+ if (Solver.isMustTailCallee(&F)) {
+ LLVM_DEBUG(dbgs() << "Can't zap returns of the function : " << F.getName()
+ << " due to present musttail call of it\n");
+ return;
+ }
+
+ assert(
+ all_of(F.users(),
+ [&Solver](User *U) {
+ if (isa<Instruction>(U) &&
+ !Solver.isBlockExecutable(cast<Instruction>(U)->getParent()))
+ return true;
+ // Non-callsite uses are not impacted by zapping. Also, constant
+               // uses (like blockaddresses) could stick around, without being
+ // used in the underlying IR, meaning we do not have lattice
+ // values for them.
+ if (!isa<CallBase>(U))
+ return true;
+ if (U->getType()->isStructTy()) {
+ return all_of(Solver.getStructLatticeValueFor(U),
+ [](const ValueLatticeElement &LV) {
+ return !isOverdefined(LV);
+ });
+ }
+ return !isOverdefined(Solver.getLatticeValueFor(U));
+ }) &&
+ "We can only zap functions where all live users have a concrete value");
+
+ for (BasicBlock &BB : F) {
+ if (CallInst *CI = BB.getTerminatingMustTailCall()) {
+ LLVM_DEBUG(dbgs() << "Can't zap return of the block due to present "
+ << "musttail call : " << *CI << "\n");
+ (void)CI;
+ return;
+ }
+
+ if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator()))
+ if (!isa<UndefValue>(RI->getOperand(0)))
+ ReturnsToZap.push_back(RI);
+ }
+}
+
static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
DomTreeUpdater &DTU) {
SmallPtrSet<BasicBlock *, 8> FeasibleSuccessors;
@@ -1906,7 +1906,7 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
Succ->removePredecessor(BB);
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
+ }
BranchInst::Create(OnlyFeasibleSuccessor, BB);
TI->eraseFromParent();
@@ -1925,92 +1925,92 @@ static bool removeNonFeasibleEdges(const SCCPSolver &Solver, BasicBlock *BB,
Updates.push_back({DominatorTree::Delete, BB, Succ});
SI.removeCase(CI);
// Don't increment CI, as we removed a case.
- }
+ }
DTU.applyUpdatesPermissive(Updates);
- } else {
+ } else {
llvm_unreachable("Must have at least one feasible successor");
- }
+ }
return true;
-}
-
-bool llvm::runIPSCCP(
- Module &M, const DataLayout &DL,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI,
- function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
- SCCPSolver Solver(DL, GetTLI, M.getContext());
-
- // Loop over all functions, marking arguments to those with their addresses
- // taken or that are external as overdefined.
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
-
- Solver.addAnalysis(F, getAnalysis(F));
-
- // Determine if we can track the function's return values. If so, add the
- // function to the solver's set of return-tracked functions.
- if (canTrackReturnsInterprocedurally(&F))
- Solver.AddTrackedFunction(&F);
-
- // Determine if we can track the function's arguments. If so, add the
- // function to the solver's set of argument-tracked functions.
- if (canTrackArgumentsInterprocedurally(&F)) {
- Solver.AddArgumentTrackedFunction(&F);
- continue;
- }
-
- // Assume the function is called.
- Solver.MarkBlockExecutable(&F.front());
-
- // Assume nothing about the incoming arguments.
- for (Argument &AI : F.args())
- Solver.markOverdefined(&AI);
- }
-
- // Determine if we can track any of the module's global variables. If so, add
- // the global variables we can track to the solver's set of tracked global
- // variables.
- for (GlobalVariable &G : M.globals()) {
- G.removeDeadConstantUsers();
- if (canTrackGlobalVariableInterprocedurally(&G))
- Solver.TrackValueOfGlobalVariable(&G);
- }
-
- // Solve for constants.
- bool ResolvedUndefs = true;
- Solver.Solve();
- while (ResolvedUndefs) {
- LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
- ResolvedUndefs = false;
+}
+
+bool llvm::runIPSCCP(
+ Module &M, const DataLayout &DL,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
+ SCCPSolver Solver(DL, GetTLI, M.getContext());
+
+ // Loop over all functions, marking arguments to those with their addresses
+ // taken or that are external as overdefined.
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ Solver.addAnalysis(F, getAnalysis(F));
+
+ // Determine if we can track the function's return values. If so, add the
+ // function to the solver's set of return-tracked functions.
+ if (canTrackReturnsInterprocedurally(&F))
+ Solver.AddTrackedFunction(&F);
+
+ // Determine if we can track the function's arguments. If so, add the
+ // function to the solver's set of argument-tracked functions.
+ if (canTrackArgumentsInterprocedurally(&F)) {
+ Solver.AddArgumentTrackedFunction(&F);
+ continue;
+ }
+
+ // Assume the function is called.
+ Solver.MarkBlockExecutable(&F.front());
+
+ // Assume nothing about the incoming arguments.
+ for (Argument &AI : F.args())
+ Solver.markOverdefined(&AI);
+ }
+
+ // Determine if we can track any of the module's global variables. If so, add
+ // the global variables we can track to the solver's set of tracked global
+ // variables.
+ for (GlobalVariable &G : M.globals()) {
+ G.removeDeadConstantUsers();
+ if (canTrackGlobalVariableInterprocedurally(&G))
+ Solver.TrackValueOfGlobalVariable(&G);
+ }
+
+ // Solve for constants.
+ bool ResolvedUndefs = true;
+ Solver.Solve();
+ while (ResolvedUndefs) {
+ LLVM_DEBUG(dbgs() << "RESOLVING UNDEFS\n");
+ ResolvedUndefs = false;
for (Function &F : M) {
if (Solver.ResolvedUndefsIn(F))
- ResolvedUndefs = true;
+ ResolvedUndefs = true;
}
if (ResolvedUndefs)
Solver.Solve();
- }
-
- bool MadeChanges = false;
-
- // Iterate over all of the instructions in the module, replacing them with
- // constants if we have found them to be of constant values.
-
- for (Function &F : M) {
- if (F.isDeclaration())
- continue;
-
- SmallVector<BasicBlock *, 512> BlocksToErase;
-
+ }
+
+ bool MadeChanges = false;
+
+ // Iterate over all of the instructions in the module, replacing them with
+ // constants if we have found them to be of constant values.
+
+ for (Function &F : M) {
+ if (F.isDeclaration())
+ continue;
+
+ SmallVector<BasicBlock *, 512> BlocksToErase;
+
if (Solver.isBlockExecutable(&F.front())) {
bool ReplacedPointerArg = false;
for (Argument &Arg : F.args()) {
if (!Arg.use_empty() && tryToReplaceWithConstant(Solver, &Arg)) {
ReplacedPointerArg |= Arg.getType()->isPointerTy();
- ++IPNumArgsElimed;
- }
- }
-
+ ++IPNumArgsElimed;
+ }
+ }
+
// If we replaced an argument, the argmemonly and
// inaccessiblemem_or_argmemonly attributes do not hold any longer. Remove
// them from both the function and callsites.
@@ -2031,74 +2031,74 @@ bool llvm::runIPSCCP(
}
}
- SmallPtrSet<Value *, 32> InsertedValues;
- for (BasicBlock &BB : F) {
- if (!Solver.isBlockExecutable(&BB)) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
- ++NumDeadBlocks;
-
- MadeChanges = true;
-
- if (&BB != &F.front())
- BlocksToErase.push_back(&BB);
- continue;
- }
-
- MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
- IPNumInstRemoved, IPNumInstReplaced);
- }
-
- DomTreeUpdater DTU = Solver.getDTU(F);
- // Change dead blocks to unreachable. We do it after replacing constants
- // in all executable blocks, because changeToUnreachable may remove PHI
- // nodes in executable blocks we found values for. The function's entry
- // block is not part of BlocksToErase, so we have to handle it separately.
- for (BasicBlock *BB : BlocksToErase) {
- NumInstRemoved +=
- changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false,
- /*PreserveLCSSA=*/false, &DTU);
- }
- if (!Solver.isBlockExecutable(&F.front()))
- NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
- /*UseLLVMTrap=*/false,
- /*PreserveLCSSA=*/false, &DTU);
-
+ SmallPtrSet<Value *, 32> InsertedValues;
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
+ ++NumDeadBlocks;
+
+ MadeChanges = true;
+
+ if (&BB != &F.front())
+ BlocksToErase.push_back(&BB);
+ continue;
+ }
+
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ IPNumInstRemoved, IPNumInstReplaced);
+ }
+
+ DomTreeUpdater DTU = Solver.getDTU(F);
+ // Change dead blocks to unreachable. We do it after replacing constants
+ // in all executable blocks, because changeToUnreachable may remove PHI
+ // nodes in executable blocks we found values for. The function's entry
+ // block is not part of BlocksToErase, so we have to handle it separately.
+ for (BasicBlock *BB : BlocksToErase) {
+ NumInstRemoved +=
+ changeToUnreachable(BB->getFirstNonPHI(), /*UseLLVMTrap=*/false,
+ /*PreserveLCSSA=*/false, &DTU);
+ }
+ if (!Solver.isBlockExecutable(&F.front()))
+ NumInstRemoved += changeToUnreachable(F.front().getFirstNonPHI(),
+ /*UseLLVMTrap=*/false,
+ /*PreserveLCSSA=*/false, &DTU);
+
for (BasicBlock &BB : F)
MadeChanges |= removeNonFeasibleEdges(Solver, &BB, DTU);
-
+
for (BasicBlock *DeadBB : BlocksToErase)
- DTU.deleteBB(DeadBB);
-
- for (BasicBlock &BB : F) {
- for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
- Instruction *Inst = &*BI++;
- if (Solver.getPredicateInfoFor(Inst)) {
- if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- Value *Op = II->getOperand(0);
- Inst->replaceAllUsesWith(Op);
- Inst->eraseFromParent();
- }
- }
- }
- }
- }
- }
-
- // If we inferred constant or undef return values for a function, we replaced
- // all call uses with the inferred value. This means we don't need to bother
- // actually returning anything from the function. Replace all return
- // instructions with return undef.
- //
- // Do this in two stages: first identify the functions we should process, then
- // actually zap their returns. This is important because we can only do this
- // if the address of the function isn't taken. In cases where a return is the
- // last use of a function, the order of processing functions would affect
- // whether other functions are optimizable.
- SmallVector<ReturnInst*, 8> ReturnsToZap;
-
- for (const auto &I : Solver.getTrackedRetVals()) {
- Function *F = I.first;
+ DTU.deleteBB(DeadBB);
+
+ for (BasicBlock &BB : F) {
+ for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
+ Instruction *Inst = &*BI++;
+ if (Solver.getPredicateInfoFor(Inst)) {
+ if (auto *II = dyn_cast<IntrinsicInst>(Inst)) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ Value *Op = II->getOperand(0);
+ Inst->replaceAllUsesWith(Op);
+ Inst->eraseFromParent();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // If we inferred constant or undef return values for a function, we replaced
+ // all call uses with the inferred value. This means we don't need to bother
+ // actually returning anything from the function. Replace all return
+ // instructions with return undef.
+ //
+ // Do this in two stages: first identify the functions we should process, then
+ // actually zap their returns. This is important because we can only do this
+ // if the address of the function isn't taken. In cases where a return is the
+ // last use of a function, the order of processing functions would affect
+ // whether other functions are optimizable.
+ SmallVector<ReturnInst*, 8> ReturnsToZap;
+
+ for (const auto &I : Solver.getTrackedRetVals()) {
+ Function *F = I.first;
const ValueLatticeElement &ReturnValue = I.second;
// If there is a known constant range for the return value, add !range
@@ -2134,31 +2134,31 @@ bool llvm::runIPSCCP(
ConstantAsMetadata::get(ConstantInt::get(Context, CR.getUpper()))};
CB->setMetadata(LLVMContext::MD_range, MDNode::get(Context, RangeMD));
}
- continue;
+ continue;
}
if (F->getReturnType()->isVoidTy())
continue;
if (isConstant(ReturnValue) || ReturnValue.isUnknownOrUndef())
findReturnsToZap(*F, ReturnsToZap, Solver);
- }
-
- for (auto F : Solver.getMRVFunctionsTracked()) {
- assert(F->getReturnType()->isStructTy() &&
- "The return type should be a struct");
- StructType *STy = cast<StructType>(F->getReturnType());
- if (Solver.isStructLatticeConstant(F, STy))
- findReturnsToZap(*F, ReturnsToZap, Solver);
- }
-
- // Zap all returns which we've identified as zap to change.
+ }
+
+ for (auto F : Solver.getMRVFunctionsTracked()) {
+ assert(F->getReturnType()->isStructTy() &&
+ "The return type should be a struct");
+ StructType *STy = cast<StructType>(F->getReturnType());
+ if (Solver.isStructLatticeConstant(F, STy))
+ findReturnsToZap(*F, ReturnsToZap, Solver);
+ }
+
+ // Zap all returns which we've identified as zap to change.
SmallSetVector<Function *, 8> FuncZappedReturn;
- for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
- Function *F = ReturnsToZap[i]->getParent()->getParent();
- ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
+ for (unsigned i = 0, e = ReturnsToZap.size(); i != e; ++i) {
+ Function *F = ReturnsToZap[i]->getParent()->getParent();
+ ReturnsToZap[i]->setOperand(0, UndefValue::get(F->getReturnType()));
// Record all functions that are zapped.
FuncZappedReturn.insert(F);
- }
-
+ }
+
// Remove the returned attribute for zapped functions and the
// corresponding call sites.
for (Function *F : FuncZappedReturn) {
@@ -2174,22 +2174,22 @@ bool llvm::runIPSCCP(
}
}
- // If we inferred constant or undef values for global variables, we can
- // delete the global and any stores that remain to it.
- for (auto &I : make_early_inc_range(Solver.getTrackedGlobals())) {
- GlobalVariable *GV = I.first;
- if (isOverdefined(I.second))
- continue;
- LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
- << "' is constant!\n");
- while (!GV->use_empty()) {
- StoreInst *SI = cast<StoreInst>(GV->user_back());
- SI->eraseFromParent();
- MadeChanges = true;
- }
- M.getGlobalList().erase(GV);
- ++IPNumGlobalConst;
- }
-
- return MadeChanges;
-}
+ // If we inferred constant or undef values for global variables, we can
+ // delete the global and any stores that remain to it.
+ for (auto &I : make_early_inc_range(Solver.getTrackedGlobals())) {
+ GlobalVariable *GV = I.first;
+ if (isOverdefined(I.second))
+ continue;
+ LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
+ << "' is constant!\n");
+ while (!GV->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(GV->user_back());
+ SI->eraseFromParent();
+ MadeChanges = true;
+ }
+ M.getGlobalList().erase(GV);
+ ++IPNumGlobalConst;
+ }
+
+ return MadeChanges;
+}
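The comment above describes a two-stage "collect candidates first, mutate afterwards" pattern so that the outcome does not depend on the order in which functions are visited. A minimal standalone sketch of that pattern follows; it is plain C++ with hypothetical names (Fn, Ret, collectReturnsToZap), not the LLVM API used in the diff.

// Standalone sketch: identify in stage 1, mutate in stage 2, so visiting
// order cannot affect which returns end up being zapped.
#include <string>
#include <vector>

struct Ret { bool Zapped = false; };
struct Fn {
  std::string Name;
  bool AddressTaken = false;
  std::vector<Ret> Returns;
};

// Stage 1: record candidate returns only; nothing is modified yet.
static void collectReturnsToZap(Fn &F, std::vector<Ret *> &Out) {
  if (F.AddressTaken)
    return; // returns of address-taken functions must be left alone
  for (Ret &R : F.Returns)
    Out.push_back(&R);
}

int main() {
  std::vector<Fn> Fns = {{"f", false, {Ret{}}}, {"g", true, {Ret{}}}};

  std::vector<Ret *> ReturnsToZap;
  for (Fn &F : Fns)
    collectReturnsToZap(F, ReturnsToZap);

  // Stage 2: apply the change in one sweep over the recorded candidates.
  for (Ret *R : ReturnsToZap)
    R->Zapped = true;
}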
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
index 6a43dd3b17..af510f1a84 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SROA.cpp
@@ -1,3098 +1,3098 @@
-//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This transformation implements the well known scalar replacement of
-/// aggregates transformation. It tries to identify promotable elements of an
-/// aggregate alloca, and promote them to registers. It will also try to
-/// convert uses of an element (or set of elements) of an alloca into a vector
-/// or bitfield-style integer scalar if appropriate.
-///
-/// It works to do this with minimal slicing of the alloca so that regions
-/// which are merely transferred in and out of external memory remain unchanged
-/// and are not decomposed to scalar code.
-///
-/// Because this also performs alloca promotion, it can be thought of as also
-/// serving the purpose of SSA formation. The algorithm iterates on the
-/// function until all opportunities for promotion have been realized.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SROA.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/PtrUseVisitor.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantFolder.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <chrono>
-#include <cstddef>
-#include <cstdint>
-#include <cstring>
-#include <iterator>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::sroa;
-
-#define DEBUG_TYPE "sroa"
-
-STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
-STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
-STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
-STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
-STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
-STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
-STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
-STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
-STATISTIC(NumDeleted, "Number of instructions deleted");
-STATISTIC(NumVectorized, "Number of vectorized aggregates");
-
-/// Hidden option to experiment with completely strict handling of inbounds
-/// GEPs.
-static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
- cl::Hidden);
-
-namespace {
-
-/// A custom IRBuilder inserter which prefixes all names, but only in
-/// Assert builds.
-class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
- std::string Prefix;
-
- const Twine getNameWithPrefix(const Twine &Name) const {
- return Name.isTriviallyEmpty() ? Name : Prefix + Name;
- }
-
-public:
- void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
-
- void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
- BasicBlock::iterator InsertPt) const override {
- IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
- InsertPt);
- }
-};
-
-/// Provide a type for IRBuilder that drops names in release builds.
-using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
-
-/// A used slice of an alloca.
-///
-/// This structure represents a slice of an alloca used by some instruction. It
-/// stores both the begin and end offsets of this use, a pointer to the use
-/// itself, and a flag indicating whether we can classify the use as splittable
-/// or not when forming partitions of the alloca.
-class Slice {
- /// The beginning offset of the range.
- uint64_t BeginOffset = 0;
-
- /// The ending offset, not included in the range.
- uint64_t EndOffset = 0;
-
- /// Storage for both the use of this slice and whether it can be
- /// split.
- PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
-
-public:
- Slice() = default;
-
- Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
- : BeginOffset(BeginOffset), EndOffset(EndOffset),
- UseAndIsSplittable(U, IsSplittable) {}
-
- uint64_t beginOffset() const { return BeginOffset; }
- uint64_t endOffset() const { return EndOffset; }
-
- bool isSplittable() const { return UseAndIsSplittable.getInt(); }
- void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
-
- Use *getUse() const { return UseAndIsSplittable.getPointer(); }
-
- bool isDead() const { return getUse() == nullptr; }
- void kill() { UseAndIsSplittable.setPointer(nullptr); }
-
- /// Support for ordering ranges.
- ///
- /// This provides an ordering over ranges such that start offsets are
- /// always increasing, and within equal start offsets, the end offsets are
- /// decreasing. Thus the spanning range comes first in a cluster with the
- /// same start position.
- bool operator<(const Slice &RHS) const {
- if (beginOffset() < RHS.beginOffset())
- return true;
- if (beginOffset() > RHS.beginOffset())
- return false;
- if (isSplittable() != RHS.isSplittable())
- return !isSplittable();
- if (endOffset() > RHS.endOffset())
- return true;
- return false;
- }
-
- /// Support comparison with a single offset to allow binary searches.
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
- uint64_t RHSOffset) {
- return LHS.beginOffset() < RHSOffset;
- }
- friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
- const Slice &RHS) {
- return LHSOffset < RHS.beginOffset();
- }
-
- bool operator==(const Slice &RHS) const {
- return isSplittable() == RHS.isSplittable() &&
- beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
- }
- bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
-};
-
-} // end anonymous namespace
-
-/// Representation of the alloca slices.
-///
-/// This class represents the slices of an alloca which are formed by its
-/// various uses. If a pointer escapes, we can't fully build a representation
-/// for the slices used and we reflect that in this structure. The uses are
-/// stored, sorted by increasing beginning offset and with unsplittable slices
-/// starting at a particular offset before splittable slices.
-class llvm::sroa::AllocaSlices {
-public:
- /// Construct the slices of a particular alloca.
- AllocaSlices(const DataLayout &DL, AllocaInst &AI);
-
- /// Test whether a pointer to the allocation escapes our analysis.
- ///
- /// If this is true, the slices are never fully built and should be
- /// ignored.
- bool isEscaped() const { return PointerEscapingInstr; }
-
- /// Support for iterating over the slices.
- /// @{
- using iterator = SmallVectorImpl<Slice>::iterator;
- using range = iterator_range<iterator>;
-
- iterator begin() { return Slices.begin(); }
- iterator end() { return Slices.end(); }
-
- using const_iterator = SmallVectorImpl<Slice>::const_iterator;
- using const_range = iterator_range<const_iterator>;
-
- const_iterator begin() const { return Slices.begin(); }
- const_iterator end() const { return Slices.end(); }
- /// @}
-
- /// Erase a range of slices.
- void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
-
- /// Insert new slices for this alloca.
- ///
- /// This moves the slices into the alloca's slices collection, and re-sorts
- /// everything so that the usual ordering properties of the alloca's slices
- /// hold.
- void insert(ArrayRef<Slice> NewSlices) {
- int OldSize = Slices.size();
- Slices.append(NewSlices.begin(), NewSlices.end());
- auto SliceI = Slices.begin() + OldSize;
- llvm::sort(SliceI, Slices.end());
- std::inplace_merge(Slices.begin(), SliceI, Slices.end());
- }
-
- // Forward declare the iterator and range accessor for walking the
- // partitions.
- class partition_iterator;
- iterator_range<partition_iterator> partitions();
-
- /// Access the dead users for this alloca.
- ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
-
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It works to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SROA.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantFolder.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::sroa;
+
+#define DEBUG_TYPE "sroa"
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed");
+STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca");
+STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten");
+STATISTIC(MaxUsesPerAllocaPartition, "Maximum number of uses of a partition");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to experiment with completely strict handling of inbounds
+/// GEPs.
+static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
+ cl::Hidden);
+
+namespace {
+
+/// A custom IRBuilder inserter which prefixes all names, but only in
+/// Assert builds.
+class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
+ std::string Prefix;
+
+ const Twine getNameWithPrefix(const Twine &Name) const {
+ return Name.isTriviallyEmpty() ? Name : Prefix + Name;
+ }
+
+public:
+ void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
+
+ void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
+ BasicBlock::iterator InsertPt) const override {
+ IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
+ InsertPt);
+ }
+};
+
+/// Provide a type for IRBuilder that drops names in release builds.
+using IRBuilderTy = IRBuilder<ConstantFolder, IRBuilderPrefixedInserter>;
+
+/// A used slice of an alloca.
+///
+/// This structure represents a slice of an alloca used by some instruction. It
+/// stores both the begin and end offsets of this use, a pointer to the use
+/// itself, and a flag indicating whether we can classify the use as splittable
+/// or not when forming partitions of the alloca.
+class Slice {
+ /// The beginning offset of the range.
+ uint64_t BeginOffset = 0;
+
+ /// The ending offset, not included in the range.
+ uint64_t EndOffset = 0;
+
+ /// Storage for both the use of this slice and whether it can be
+ /// split.
+ PointerIntPair<Use *, 1, bool> UseAndIsSplittable;
+
+public:
+ Slice() = default;
+
+ Slice(uint64_t BeginOffset, uint64_t EndOffset, Use *U, bool IsSplittable)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset),
+ UseAndIsSplittable(U, IsSplittable) {}
+
+ uint64_t beginOffset() const { return BeginOffset; }
+ uint64_t endOffset() const { return EndOffset; }
+
+ bool isSplittable() const { return UseAndIsSplittable.getInt(); }
+ void makeUnsplittable() { UseAndIsSplittable.setInt(false); }
+
+ Use *getUse() const { return UseAndIsSplittable.getPointer(); }
+
+ bool isDead() const { return getUse() == nullptr; }
+ void kill() { UseAndIsSplittable.setPointer(nullptr); }
+
+ /// Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const Slice &RHS) const {
+ if (beginOffset() < RHS.beginOffset())
+ return true;
+ if (beginOffset() > RHS.beginOffset())
+ return false;
+ if (isSplittable() != RHS.isSplittable())
+ return !isSplittable();
+ if (endOffset() > RHS.endOffset())
+ return true;
+ return false;
+ }
+
+ /// Support comparison with a single offset to allow binary searches.
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(const Slice &LHS,
+ uint64_t RHSOffset) {
+ return LHS.beginOffset() < RHSOffset;
+ }
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const Slice &RHS) {
+ return LHSOffset < RHS.beginOffset();
+ }
+
+ bool operator==(const Slice &RHS) const {
+ return isSplittable() == RHS.isSplittable() &&
+ beginOffset() == RHS.beginOffset() && endOffset() == RHS.endOffset();
+ }
+ bool operator!=(const Slice &RHS) const { return !operator==(RHS); }
+};
+
+} // end anonymous namespace
+
+/// Representation of the alloca slices.
+///
+/// This class represents the slices of an alloca which are formed by its
+/// various uses. If a pointer escapes, we can't fully build a representation
+/// for the slices used and we reflect that in this structure. The uses are
+/// stored, sorted by increasing beginning offset and with unsplittable slices
+/// starting at a particular offset before splittable slices.
+class llvm::sroa::AllocaSlices {
+public:
+ /// Construct the slices of a particular alloca.
+ AllocaSlices(const DataLayout &DL, AllocaInst &AI);
+
+ /// Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the slices are never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// Support for iterating over the slices.
+ /// @{
+ using iterator = SmallVectorImpl<Slice>::iterator;
+ using range = iterator_range<iterator>;
+
+ iterator begin() { return Slices.begin(); }
+ iterator end() { return Slices.end(); }
+
+ using const_iterator = SmallVectorImpl<Slice>::const_iterator;
+ using const_range = iterator_range<const_iterator>;
+
+ const_iterator begin() const { return Slices.begin(); }
+ const_iterator end() const { return Slices.end(); }
+ /// @}
+
+ /// Erase a range of slices.
+ void erase(iterator Start, iterator Stop) { Slices.erase(Start, Stop); }
+
+ /// Insert new slices for this alloca.
+ ///
+ /// This moves the slices into the alloca's slices collection, and re-sorts
+ /// everything so that the usual ordering properties of the alloca's slices
+ /// hold.
+ void insert(ArrayRef<Slice> NewSlices) {
+ int OldSize = Slices.size();
+ Slices.append(NewSlices.begin(), NewSlices.end());
+ auto SliceI = Slices.begin() + OldSize;
+ llvm::sort(SliceI, Slices.end());
+ std::inplace_merge(Slices.begin(), SliceI, Slices.end());
+ }
+
+ // Forward declare the iterator and range accessor for walking the
+ // partitions.
+ class partition_iterator;
+ iterator_range<partition_iterator> partitions();
+
+ /// Access the dead users for this alloca.
+ ArrayRef<Instruction *> getDeadUsers() const { return DeadUsers; }
+
/// Access Uses that should be dropped if the alloca is promotable.
ArrayRef<Use *> getDeadUsesIfPromotable() const {
return DeadUseIfPromotable;
}
- /// Access the dead operands referring to this alloca.
- ///
- /// These are operands which cannot actually be used to refer to the
- /// alloca as they are outside its range and the user doesn't correct for
- /// that. These mostly consist of PHI node inputs and the like which we just
- /// need to replace with undef.
- ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
- void printSlice(raw_ostream &OS, const_iterator I,
- StringRef Indent = " ") const;
- void printUse(raw_ostream &OS, const_iterator I,
- StringRef Indent = " ") const;
- void print(raw_ostream &OS) const;
- void dump(const_iterator I) const;
- void dump() const;
-#endif
-
-private:
- template <typename DerivedT, typename RetT = void> class BuilderBase;
- class SliceBuilder;
-
- friend class AllocaSlices::SliceBuilder;
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- /// Handle to alloca instruction to simplify method interfaces.
- AllocaInst &AI;
-#endif
-
- /// The instruction responsible for this alloca not having a known set
- /// of slices.
- ///
- /// When an instruction (potentially) escapes the pointer to the alloca, we
- /// store a pointer to that here and abort trying to form slices of the
- /// alloca. This will be null if the alloca slices are analyzed successfully.
- Instruction *PointerEscapingInstr;
-
- /// The slices of the alloca.
- ///
- /// We store a vector of the slices formed by uses of the alloca here. This
- /// vector is sorted by increasing begin offset, and then the unsplittable
- /// slices before the splittable ones. See the Slice inner class for more
- /// details.
- SmallVector<Slice, 8> Slices;
-
- /// Instructions which will become dead if we rewrite the alloca.
- ///
- /// Note that these are not separated by slice. This is because we expect an
- /// alloca to be completely rewritten or not rewritten at all. If rewritten,
- /// all these instructions can simply be removed and replaced with undef as
- /// they come from outside of the allocated space.
- SmallVector<Instruction *, 8> DeadUsers;
-
+ /// Access the dead operands referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ ArrayRef<Use *> getDeadOperands() const { return DeadOperands; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void dump(const_iterator I) const;
+ void dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class SliceBuilder;
+
+ friend class AllocaSlices::SliceBuilder;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ /// Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// The instruction responsible for this alloca not having a known set
+ /// of slices.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to form slices of the
+ /// alloca. This will be null if the alloca slices are analyzed successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// The slices of the alloca.
+ ///
+ /// We store a vector of the slices formed by uses of the alloca here. This
+ /// vector is sorted by increasing begin offset, and then the unsplittable
+ /// slices before the splittable ones. See the Slice inner class for more
+ /// details.
+ SmallVector<Slice, 8> Slices;
+
+ /// Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by slice. This is because we expect an
+ /// alloca to be completely rewritten or not rewritten at all. If rewritten,
+ /// all these instructions can simply be removed and replaced with undef as
+ /// they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
 /// Uses which will become dead if we can promote the alloca.
SmallVector<Use *, 8> DeadUseIfPromotable;
- /// Operands which will become dead if we rewrite the alloca.
- ///
- /// These are operands that in their particular use can be replaced with
- /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
- /// to PHI nodes and the like. They aren't entirely dead (there might be
- /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
- /// want to swap this particular input for undef to simplify the use lists of
- /// the alloca.
- SmallVector<Use *, 8> DeadOperands;
-};
-
-/// A partition of the slices.
-///
-/// An ephemeral representation for a range of slices which can be viewed as
-/// a partition of the alloca. This range represents a span of the alloca's
-/// memory which cannot be split, and provides access to all of the slices
-/// overlapping some part of the partition.
-///
-/// Objects of this type are produced by traversing the alloca's slices, but
-/// are only ephemeral and not persistent.
-class llvm::sroa::Partition {
-private:
- friend class AllocaSlices;
- friend class AllocaSlices::partition_iterator;
-
- using iterator = AllocaSlices::iterator;
-
- /// The beginning and ending offsets of the alloca for this
- /// partition.
- uint64_t BeginOffset = 0, EndOffset = 0;
-
- /// The start and end iterators of this partition.
- iterator SI, SJ;
-
- /// A collection of split slice tails overlapping the partition.
- SmallVector<Slice *, 4> SplitTails;
-
- /// Raw constructor builds an empty partition starting and ending at
- /// the given iterator.
- Partition(iterator SI) : SI(SI), SJ(SI) {}
-
-public:
- /// The start offset of this partition.
- ///
- /// All of the contained slices start at or after this offset.
- uint64_t beginOffset() const { return BeginOffset; }
-
- /// The end offset of this partition.
- ///
- /// All of the contained slices end at or before this offset.
- uint64_t endOffset() const { return EndOffset; }
-
- /// The size of the partition.
- ///
- /// Note that this can never be zero.
- uint64_t size() const {
- assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
- return EndOffset - BeginOffset;
- }
-
- /// Test whether this partition contains no slices, and merely spans
- /// a region occupied by split slices.
- bool empty() const { return SI == SJ; }
-
- /// \name Iterate slices that start within the partition.
- /// These may be splittable or unsplittable. They have a begin offset >= the
- /// partition begin offset.
- /// @{
- // FIXME: We should probably define a "concat_iterator" helper and use that
- // to stitch together pointee_iterators over the split tails and the
- // contiguous iterators of the partition. That would give a much nicer
- // interface here. We could then additionally expose filtered iterators for
- // split, unsplit, and unsplittable slices based on the usage patterns.
- iterator begin() const { return SI; }
- iterator end() const { return SJ; }
- /// @}
-
- /// Get the sequence of split slice tails.
- ///
- /// These tails are of slices which start before this partition but are
- /// split and overlap into the partition. We accumulate these while forming
- /// partitions.
- ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
-};
-
-/// An iterator over partitions of the alloca's slices.
-///
-/// This iterator implements the core algorithm for partitioning the alloca's
-/// slices. It is a forward iterator as we don't support backtracking for
-/// efficiency reasons, and re-use a single storage area to maintain the
-/// current set of split slices.
-///
-/// It is templated on the slice iterator type to use so that it can operate
-/// with either const or non-const slice iterators.
-class AllocaSlices::partition_iterator
- : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
- Partition> {
- friend class AllocaSlices;
-
- /// Most of the state for walking the partitions is held in a class
- /// with a nice interface for examining them.
- Partition P;
-
- /// We need to keep the end of the slices to know when to stop.
- AllocaSlices::iterator SE;
-
- /// We also need to keep track of the maximum split end offset seen.
- /// FIXME: Do we really?
- uint64_t MaxSplitSliceEndOffset = 0;
-
- /// Sets the partition to be empty at given iterator, and sets the
- /// end iterator.
- partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
- : P(SI), SE(SE) {
- // If not already at the end, advance our state to form the initial
- // partition.
- if (SI != SE)
- advance();
- }
-
- /// Advance the iterator to the next partition.
- ///
- /// Requires that the iterator not be at the end of the slices.
- void advance() {
- assert((P.SI != SE || !P.SplitTails.empty()) &&
- "Cannot advance past the end of the slices!");
-
- // Clear out any split uses which have ended.
- if (!P.SplitTails.empty()) {
- if (P.EndOffset >= MaxSplitSliceEndOffset) {
- // If we've finished all splits, this is easy.
- P.SplitTails.clear();
- MaxSplitSliceEndOffset = 0;
- } else {
- // Remove the uses which have ended in the prior partition. This
- // cannot change the max split slice end because we just checked that
- // the prior partition ended prior to that max.
+ /// Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere) and nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+};
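The insert() member above keeps the slice vector sorted by sorting only the freshly appended tail and then merging it into the already-sorted prefix. A compact standalone illustration of that sort-then-inplace_merge idiom, using plain integers rather than slices:

// Sort the appended tail, then merge it into the sorted prefix in place.
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Slices = {1, 4, 9};      // already sorted
  std::vector<int> NewSlices = {7, 2};
  auto OldSize = Slices.size();
  Slices.insert(Slices.end(), NewSlices.begin(), NewSlices.end());
  auto SliceI = Slices.begin() + OldSize;
  std::sort(SliceI, Slices.end());          // sort just the new tail
  std::inplace_merge(Slices.begin(), SliceI, Slices.end());
  assert(std::is_sorted(Slices.begin(), Slices.end()));
}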
+
+/// A partition of the slices.
+///
+/// An ephemeral representation for a range of slices which can be viewed as
+/// a partition of the alloca. This range represents a span of the alloca's
+/// memory which cannot be split, and provides access to all of the slices
+/// overlapping some part of the partition.
+///
+/// Objects of this type are produced by traversing the alloca's slices, but
+/// are only ephemeral and not persistent.
+class llvm::sroa::Partition {
+private:
+ friend class AllocaSlices;
+ friend class AllocaSlices::partition_iterator;
+
+ using iterator = AllocaSlices::iterator;
+
+ /// The beginning and ending offsets of the alloca for this
+ /// partition.
+ uint64_t BeginOffset = 0, EndOffset = 0;
+
+ /// The start and end iterators of this partition.
+ iterator SI, SJ;
+
+ /// A collection of split slice tails overlapping the partition.
+ SmallVector<Slice *, 4> SplitTails;
+
+ /// Raw constructor builds an empty partition starting and ending at
+ /// the given iterator.
+ Partition(iterator SI) : SI(SI), SJ(SI) {}
+
+public:
+ /// The start offset of this partition.
+ ///
+ /// All of the contained slices start at or after this offset.
+ uint64_t beginOffset() const { return BeginOffset; }
+
+ /// The end offset of this partition.
+ ///
+ /// All of the contained slices end at or before this offset.
+ uint64_t endOffset() const { return EndOffset; }
+
+ /// The size of the partition.
+ ///
+ /// Note that this can never be zero.
+ uint64_t size() const {
+ assert(BeginOffset < EndOffset && "Partitions must span some bytes!");
+ return EndOffset - BeginOffset;
+ }
+
+ /// Test whether this partition contains no slices, and merely spans
+ /// a region occupied by split slices.
+ bool empty() const { return SI == SJ; }
+
+ /// \name Iterate slices that start within the partition.
+ /// These may be splittable or unsplittable. They have a begin offset >= the
+ /// partition begin offset.
+ /// @{
+ // FIXME: We should probably define a "concat_iterator" helper and use that
+ // to stitch together pointee_iterators over the split tails and the
+ // contiguous iterators of the partition. That would give a much nicer
+ // interface here. We could then additionally expose filtered iterators for
+ // split, unsplit, and unsplittable slices based on the usage patterns.
+ iterator begin() const { return SI; }
+ iterator end() const { return SJ; }
+ /// @}
+
+ /// Get the sequence of split slice tails.
+ ///
+ /// These tails are of slices which start before this partition but are
+ /// split and overlap into the partition. We accumulate these while forming
+ /// partitions.
+ ArrayRef<Slice *> splitSliceTails() const { return SplitTails; }
+};
+
+/// An iterator over partitions of the alloca's slices.
+///
+/// This iterator implements the core algorithm for partitioning the alloca's
+/// slices. It is a forward iterator as we don't support backtracking for
+/// efficiency reasons, and re-use a single storage area to maintain the
+/// current set of split slices.
+///
+/// It is templated on the slice iterator type to use so that it can operate
+/// with either const or non-const slice iterators.
+class AllocaSlices::partition_iterator
+ : public iterator_facade_base<partition_iterator, std::forward_iterator_tag,
+ Partition> {
+ friend class AllocaSlices;
+
+ /// Most of the state for walking the partitions is held in a class
+ /// with a nice interface for examining them.
+ Partition P;
+
+ /// We need to keep the end of the slices to know when to stop.
+ AllocaSlices::iterator SE;
+
+ /// We also need to keep track of the maximum split end offset seen.
+ /// FIXME: Do we really?
+ uint64_t MaxSplitSliceEndOffset = 0;
+
+ /// Sets the partition to be empty at given iterator, and sets the
+ /// end iterator.
+ partition_iterator(AllocaSlices::iterator SI, AllocaSlices::iterator SE)
+ : P(SI), SE(SE) {
+ // If not already at the end, advance our state to form the initial
+ // partition.
+ if (SI != SE)
+ advance();
+ }
+
+ /// Advance the iterator to the next partition.
+ ///
+ /// Requires that the iterator not be at the end of the slices.
+ void advance() {
+ assert((P.SI != SE || !P.SplitTails.empty()) &&
+ "Cannot advance past the end of the slices!");
+
+ // Clear out any split uses which have ended.
+ if (!P.SplitTails.empty()) {
+ if (P.EndOffset >= MaxSplitSliceEndOffset) {
+ // If we've finished all splits, this is easy.
+ P.SplitTails.clear();
+ MaxSplitSliceEndOffset = 0;
+ } else {
+ // Remove the uses which have ended in the prior partition. This
+ // cannot change the max split slice end because we just checked that
+ // the prior partition ended prior to that max.
llvm::erase_if(P.SplitTails,
[&](Slice *S) { return S->endOffset() <= P.EndOffset; });
- assert(llvm::any_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() == MaxSplitSliceEndOffset;
- }) &&
- "Could not find the current max split slice offset!");
- assert(llvm::all_of(P.SplitTails,
- [&](Slice *S) {
- return S->endOffset() <= MaxSplitSliceEndOffset;
- }) &&
- "Max split slice end offset is not actually the max!");
- }
- }
-
- // If P.SI is already at the end, then we've cleared the split tail and
- // now have an end iterator.
- if (P.SI == SE) {
- assert(P.SplitTails.empty() && "Failed to clear the split slices!");
- return;
- }
-
- // If we had a non-empty partition previously, set up the state for
- // subsequent partitions.
- if (P.SI != P.SJ) {
- // Accumulate all the splittable slices which started in the old
- // partition into the split list.
- for (Slice &S : P)
- if (S.isSplittable() && S.endOffset() > P.EndOffset) {
- P.SplitTails.push_back(&S);
- MaxSplitSliceEndOffset =
- std::max(S.endOffset(), MaxSplitSliceEndOffset);
- }
-
- // Start from the end of the previous partition.
- P.SI = P.SJ;
-
- // If P.SI is now at the end, we at most have a tail of split slices.
- if (P.SI == SE) {
- P.BeginOffset = P.EndOffset;
- P.EndOffset = MaxSplitSliceEndOffset;
- return;
- }
-
- // If we have split slices and the next slice is after a gap and is
- // not splittable immediately form an empty partition for the split
- // slices up until the next slice begins.
- if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
- !P.SI->isSplittable()) {
- P.BeginOffset = P.EndOffset;
- P.EndOffset = P.SI->beginOffset();
- return;
- }
- }
-
- // OK, we need to consume new slices. Set the end offset based on the
- // current slice, and step SJ past it. The beginning offset of the
- // partition is the beginning offset of the next slice unless we have
- // pre-existing split slices that are continuing, in which case we begin
- // at the prior end offset.
- P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
- P.EndOffset = P.SI->endOffset();
- ++P.SJ;
-
- // There are two strategies to form a partition based on whether the
- // partition starts with an unsplittable slice or a splittable slice.
- if (!P.SI->isSplittable()) {
- // When we're forming an unsplittable region, it must always start at
- // the first slice and will extend through its end.
- assert(P.BeginOffset == P.SI->beginOffset());
-
- // Form a partition including all of the overlapping slices with this
- // unsplittable slice.
- while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
- if (!P.SJ->isSplittable())
- P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
- ++P.SJ;
- }
-
- // We have a partition across a set of overlapping unsplittable
- // partitions.
- return;
- }
-
- // If we're starting with a splittable slice, then we need to form
- // a synthetic partition spanning it and any other overlapping splittable
- // slices.
- assert(P.SI->isSplittable() && "Forming a splittable partition!");
-
- // Collect all of the overlapping splittable slices.
- while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
- P.SJ->isSplittable()) {
- P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
- ++P.SJ;
- }
-
- // Back up P.EndOffset if we ended the span early when encountering an
- // unsplittable slice. This synthesizes the early end offset of
- // a partition spanning only splittable slices.
- if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
- assert(!P.SJ->isSplittable());
- P.EndOffset = P.SJ->beginOffset();
- }
- }
-
-public:
- bool operator==(const partition_iterator &RHS) const {
- assert(SE == RHS.SE &&
- "End iterators don't match between compared partition iterators!");
-
- // The observed positions of partitions are marked by the P.SI iterator and
- // the emptiness of the split slices. The latter is only relevant when
- // P.SI == SE, as the end iterator will additionally have an empty split
- // slices list, but the prior may have the same P.SI and a tail of split
- // slices.
- if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
- assert(P.SJ == RHS.P.SJ &&
- "Same set of slices formed two different sized partitions!");
- assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
- "Same slice position with differently sized non-empty split "
- "slice tails!");
- return true;
- }
- return false;
- }
-
- partition_iterator &operator++() {
- advance();
- return *this;
- }
-
- Partition &operator*() { return P; }
-};
-
-/// A forward range over the partitions of the alloca's slices.
-///
-/// This accesses an iterator range over the partitions of the alloca's
-/// slices. It computes these partitions on the fly based on the overlapping
-/// offsets of the slices and the ability to split them. It will visit "empty"
-/// partitions to cover regions of the alloca only accessed via split
-/// slices.
-iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
- return make_range(partition_iterator(begin(), end()),
- partition_iterator(end(), end()));
-}
-
-static Value *foldSelectInst(SelectInst &SI) {
- // If the condition being selected on is a constant or the same value is
- // being selected between, fold the select. Yes this does (rarely) happen
- // early on.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
- return SI.getOperand(1 + CI->isZero());
- if (SI.getOperand(1) == SI.getOperand(2))
- return SI.getOperand(1);
-
- return nullptr;
-}
-
-/// A helper that folds a PHI node or a select.
-static Value *foldPHINodeOrSelectInst(Instruction &I) {
- if (PHINode *PN = dyn_cast<PHINode>(&I)) {
- // If PN merges together the same value, return that value.
- return PN->hasConstantValue();
- }
- return foldSelectInst(cast<SelectInst>(I));
-}
-
-/// Builder for the alloca slices.
-///
-/// This class builds a set of alloca slices by recursively visiting the uses
-/// of an alloca and making a slice for each load and store at each offset.
-class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
- friend class PtrUseVisitor<SliceBuilder>;
- friend class InstVisitor<SliceBuilder>;
-
- using Base = PtrUseVisitor<SliceBuilder>;
-
- const uint64_t AllocSize;
- AllocaSlices &AS;
-
- SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
- SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
-
- /// Set to de-duplicate dead instructions found in the use walk.
- SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
-
-public:
- SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
- : PtrUseVisitor<SliceBuilder>(DL),
- AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
- AS(AS) {}
-
-private:
- void markAsDead(Instruction &I) {
- if (VisitedDeadInsts.insert(&I).second)
- AS.DeadUsers.push_back(&I);
- }
-
- void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
- bool IsSplittable = false) {
- // Completely skip uses which have a zero size or start either before or
- // past the end of the allocation.
- if (Size == 0 || Offset.uge(AllocSize)) {
- LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
- << Offset
- << " which has zero size or starts outside of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
- return markAsDead(I);
- }
-
- uint64_t BeginOffset = Offset.getZExtValue();
- uint64_t EndOffset = BeginOffset + Size;
-
- // Clamp the end offset to the end of the allocation. Note that this is
- // formulated to handle even the case where "BeginOffset + Size" overflows.
- // This may appear superficially to be something we could ignore entirely,
- // but that is not so! There may be widened loads or PHI-node uses where
- // some instructions are dead but not others. We can't completely ignore
- // them, and so have to record at least the information here.
- assert(AllocSize >= BeginOffset); // Established above.
- if (Size > AllocSize - BeginOffset) {
- LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
- << Offset << " to remain within the " << AllocSize
- << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << I << "\n");
- EndOffset = AllocSize;
- }
-
- AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
- }
-
- void visitBitCastInst(BitCastInst &BC) {
- if (BC.use_empty())
- return markAsDead(BC);
-
- return Base::visitBitCastInst(BC);
- }
-
- void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
- if (ASC.use_empty())
- return markAsDead(ASC);
-
- return Base::visitAddrSpaceCastInst(ASC);
- }
-
- void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (GEPI.use_empty())
- return markAsDead(GEPI);
-
- if (SROAStrictInbounds && GEPI.isInBounds()) {
- // FIXME: This is a manually un-factored variant of the basic code inside
- // of GEPs with checking of the inbounds invariant specified in the
- // langref in a very strict sense. If we ever want to enable
- // SROAStrictInbounds, this code should be factored cleanly into
- // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
- // by writing out the code here where we have the underlying allocation
- // size readily available.
- APInt GEPOffset = Offset;
- const DataLayout &DL = GEPI.getModule()->getDataLayout();
- for (gep_type_iterator GTI = gep_type_begin(GEPI),
- GTE = gep_type_end(GEPI);
- GTI != GTE; ++GTI) {
- ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
- if (!OpC)
- break;
-
- // Handle a struct index, which adds its field offset to the pointer.
- if (StructType *STy = GTI.getStructTypeOrNull()) {
- unsigned ElementIdx = OpC->getZExtValue();
- const StructLayout *SL = DL.getStructLayout(STy);
- GEPOffset +=
- APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
- } else {
- // For array or vector indices, scale the index by the size of the
- // type.
- APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
- GEPOffset +=
- Index *
- APInt(Offset.getBitWidth(),
- DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
- }
-
- // If this index has computed an intermediate pointer which is not
- // inbounds, then the result of the GEP is a poison value and we can
- // delete it and all uses.
- if (GEPOffset.ugt(AllocSize))
- return markAsDead(GEPI);
- }
- }
-
- return Base::visitGetElementPtrInst(GEPI);
- }
-
- void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
- uint64_t Size, bool IsVolatile) {
- // We allow splitting of non-volatile loads and stores where the type is an
- // integer type. These may be used to implement 'memcpy' or other "transfer
- // of bits" patterns.
- bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
-
- insertUse(I, Offset, Size, IsSplittable);
- }
-
- void visitLoadInst(LoadInst &LI) {
- assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
- "All simple FCA loads should have been pre-split");
-
- if (!IsOffsetKnown)
- return PI.setAborted(&LI);
-
- if (LI.isVolatile() &&
- LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&LI);
-
+ assert(llvm::any_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() == MaxSplitSliceEndOffset;
+ }) &&
+ "Could not find the current max split slice offset!");
+ assert(llvm::all_of(P.SplitTails,
+ [&](Slice *S) {
+ return S->endOffset() <= MaxSplitSliceEndOffset;
+ }) &&
+ "Max split slice end offset is not actually the max!");
+ }
+ }
+
+ // If P.SI is already at the end, then we've cleared the split tail and
+ // now have an end iterator.
+ if (P.SI == SE) {
+ assert(P.SplitTails.empty() && "Failed to clear the split slices!");
+ return;
+ }
+
+ // If we had a non-empty partition previously, set up the state for
+ // subsequent partitions.
+ if (P.SI != P.SJ) {
+ // Accumulate all the splittable slices which started in the old
+ // partition into the split list.
+ for (Slice &S : P)
+ if (S.isSplittable() && S.endOffset() > P.EndOffset) {
+ P.SplitTails.push_back(&S);
+ MaxSplitSliceEndOffset =
+ std::max(S.endOffset(), MaxSplitSliceEndOffset);
+ }
+
+ // Start from the end of the previous partition.
+ P.SI = P.SJ;
+
+ // If P.SI is now at the end, we at most have a tail of split slices.
+ if (P.SI == SE) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = MaxSplitSliceEndOffset;
+ return;
+ }
+
+ // If we have split slices and the next slice is after a gap and is
+ // not splittable immediately form an empty partition for the split
+ // slices up until the next slice begins.
+ if (!P.SplitTails.empty() && P.SI->beginOffset() != P.EndOffset &&
+ !P.SI->isSplittable()) {
+ P.BeginOffset = P.EndOffset;
+ P.EndOffset = P.SI->beginOffset();
+ return;
+ }
+ }
+
+ // OK, we need to consume new slices. Set the end offset based on the
+ // current slice, and step SJ past it. The beginning offset of the
+ // partition is the beginning offset of the next slice unless we have
+ // pre-existing split slices that are continuing, in which case we begin
+ // at the prior end offset.
+ P.BeginOffset = P.SplitTails.empty() ? P.SI->beginOffset() : P.EndOffset;
+ P.EndOffset = P.SI->endOffset();
+ ++P.SJ;
+
+ // There are two strategies to form a partition based on whether the
+ // partition starts with an unsplittable slice or a splittable slice.
+ if (!P.SI->isSplittable()) {
+ // When we're forming an unsplittable region, it must always start at
+ // the first slice and will extend through its end.
+ assert(P.BeginOffset == P.SI->beginOffset());
+
+ // Form a partition including all of the overlapping slices with this
+ // unsplittable slice.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ if (!P.SJ->isSplittable())
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // We have a partition across a set of overlapping unsplittable
+ // partitions.
+ return;
+ }
+
+ // If we're starting with a splittable slice, then we need to form
+ // a synthetic partition spanning it and any other overlapping splittable
+ // slices.
+ assert(P.SI->isSplittable() && "Forming a splittable partition!");
+
+ // Collect all of the overlapping splittable slices.
+ while (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset &&
+ P.SJ->isSplittable()) {
+ P.EndOffset = std::max(P.EndOffset, P.SJ->endOffset());
+ ++P.SJ;
+ }
+
+ // Back up P.EndOffset if we ended the span early when encountering an
+ // unsplittable slice. This synthesizes the early end offset of
+ // a partition spanning only splittable slices.
+ if (P.SJ != SE && P.SJ->beginOffset() < P.EndOffset) {
+ assert(!P.SJ->isSplittable());
+ P.EndOffset = P.SJ->beginOffset();
+ }
+ }
+
+public:
+ bool operator==(const partition_iterator &RHS) const {
+ assert(SE == RHS.SE &&
+ "End iterators don't match between compared partition iterators!");
+
+ // The observed positions of partitions are marked by the P.SI iterator and
+ // the emptiness of the split slices. The latter is only relevant when
+ // P.SI == SE, as the end iterator will additionally have an empty split
+ // slices list, but the prior may have the same P.SI and a tail of split
+ // slices.
+ if (P.SI == RHS.P.SI && P.SplitTails.empty() == RHS.P.SplitTails.empty()) {
+ assert(P.SJ == RHS.P.SJ &&
+ "Same set of slices formed two different sized partitions!");
+ assert(P.SplitTails.size() == RHS.P.SplitTails.size() &&
+ "Same slice position with differently sized non-empty split "
+ "slice tails!");
+ return true;
+ }
+ return false;
+ }
+
+ partition_iterator &operator++() {
+ advance();
+ return *this;
+ }
+
+ Partition &operator*() { return P; }
+};
+
+/// A forward range over the partitions of the alloca's slices.
+///
+/// This accesses an iterator range over the partitions of the alloca's
+/// slices. It computes these partitions on the fly based on the overlapping
+/// offsets of the slices and the ability to split them. It will visit "empty"
+/// partitions to cover regions of the alloca only accessed via split
+/// slices.
+iterator_range<AllocaSlices::partition_iterator> AllocaSlices::partitions() {
+ return make_range(partition_iterator(begin(), end()),
+ partition_iterator(end(), end()));
+}
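As a rough standalone model of what the partition iterator computes, the sketch below coalesces overlapping [begin, end) intervals (assumed sorted by begin) into maximal partitions. It deliberately ignores splittable slices, split tails, and the "empty" partitions documented above; it only illustrates the basic interval-merging idea.

// Greatly simplified model of partition formation over sorted slices.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main() {
  std::vector<std::pair<uint64_t, uint64_t>> Slices = {{0, 4}, {2, 8}, {10, 12}};
  std::vector<std::pair<uint64_t, uint64_t>> Partitions;
  for (auto [B, E] : Slices) {
    if (!Partitions.empty() && B < Partitions.back().second)
      Partitions.back().second = std::max(Partitions.back().second, E); // extend
    else
      Partitions.push_back({B, E});                                     // new partition
  }
  for (auto [B, E] : Partitions)
    std::printf("[%llu, %llu)\n", (unsigned long long)B, (unsigned long long)E);
  // Prints [0, 8) and [10, 12).
}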
+
+static Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1 + CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2))
+ return SI.getOperand(1);
+
+ return nullptr;
+}
+
+/// A helper that folds a PHI node or a select.
+static Value *foldPHINodeOrSelectInst(Instruction &I) {
+ if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+ // If PN merges together the same value, return that value.
+ return PN->hasConstantValue();
+ }
+ return foldSelectInst(cast<SelectInst>(I));
+}
+
+/// Builder for the alloca slices.
+///
+/// This class builds a set of alloca slices by recursively visiting the uses
+/// of an alloca and making a slice for each load and store at each offset.
+class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
+ friend class PtrUseVisitor<SliceBuilder>;
+ friend class InstVisitor<SliceBuilder>;
+
+ using Base = PtrUseVisitor<SliceBuilder>;
+
+ const uint64_t AllocSize;
+ AllocaSlices &AS;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferSliceMap;
+ SmallDenseMap<Instruction *, uint64_t> PHIOrSelectSizes;
+
+ /// Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
+ : PtrUseVisitor<SliceBuilder>(DL),
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
+ AS(AS) {}
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I).second)
+ AS.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which have a zero size or start either before or
+ // past the end of the allocation.
+ if (Size == 0 || Offset.uge(AllocSize)) {
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @"
+ << Offset
+ << " which has zero size or starts outside of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ return markAsDead(I);
+ }
+
+ uint64_t BeginOffset = Offset.getZExtValue();
+ uint64_t EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ // This may appear superficially to be something we could ignore entirely,
+ // but that is not so! There may be widened loads or PHI-node uses where
+ // some instructions are dead but not others. We can't completely ignore
+ // them, and so have to record at least the information here.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ LLVM_DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @"
+ << Offset << " to remain within the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ AS.Slices.push_back(Slice(BeginOffset, EndOffset, U, IsSplittable));
+ }
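The clamping in insertUse above is phrased as "Size > AllocSize - BeginOffset" precisely so that it stays correct even when BeginOffset + Size would wrap around. A small standalone check of that overflow-safe formulation (clampedEnd is a hypothetical helper written for this sketch only):

// Overflow-safe clamp of a use's end offset to the allocation size.
#include <cassert>
#include <cstdint>

static uint64_t clampedEnd(uint64_t AllocSize, uint64_t Begin, uint64_t Size) {
  assert(Begin < AllocSize && "caller already rejected out-of-range begins");
  // Comparing against AllocSize - Begin avoids computing Begin + Size,
  // which could wrap around for huge Size values.
  if (Size > AllocSize - Begin)
    return AllocSize;
  return Begin + Size;
}

int main() {
  assert(clampedEnd(128, 16, 32) == 48);           // fits: end left untouched
  assert(clampedEnd(128, 16, UINT64_MAX) == 128);  // would overflow: clamped
}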
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ return Base::visitBitCastInst(BC);
+ }
+
+ void visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ if (ASC.use_empty())
+ return markAsDead(ASC);
+
+ return Base::visitAddrSpaceCastInst(ASC);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ if (SROAStrictInbounds && GEPI.isInBounds()) {
+ // FIXME: This is a manually un-factored variant of the basic code inside
+ // of GEPs with checking of the inbounds invariant specified in the
+ // langref in a very strict sense. If we ever want to enable
+ // SROAStrictInbounds, this code should be factored cleanly into
+ // PtrUseVisitor, but it is easier to experiment with SROAStrictInbounds
+ // by writing out the code here where we have the underlying allocation
+ // size readily available.
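+ // For example (illustrative): with this option enabled, an inbounds GEP
+ // indexing element 5 of a [4 x i32] alloca accumulates a 20 byte offset,
+ // which exceeds the 16 byte allocation, so the GEP is marked dead.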
+ APInt GEPOffset = Offset;
+ const DataLayout &DL = GEPI.getModule()->getDataLayout();
+ for (gep_type_iterator GTI = gep_type_begin(GEPI),
+ GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ break;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = GTI.getStructTypeOrNull()) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ GEPOffset +=
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
+ } else {
+ // For array or vector indices, scale the index by the size of the
+ // type.
+ APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
+ GEPOffset +=
+ Index *
+ APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
+ }
+
+ // If this index has computed an intermediate pointer which is not
+ // inbounds, then the result of the GEP is a poison value and we can
+ // delete it and all uses.
+ if (GEPOffset.ugt(AllocSize))
+ return markAsDead(GEPI);
+ }
+ }
+
+ return Base::visitGetElementPtrInst(GEPI);
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
+ uint64_t Size, bool IsVolatile) {
+ // We allow splitting of non-volatile loads and stores where the type is an
+ // integer type. These may be used to implement 'memcpy' or other "transfer
+ // of bits" patterns.
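+ // For example (illustrative): an i64 load shuttling 8 bytes of the alloca
+ // may later be split apart, whereas a volatile load or a double load is
+ // recorded as unsplittable.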
+ bool IsSplittable = Ty->isIntegerTy() && !IsVolatile;
+
+ insertUse(I, Offset, Size, IsSplittable);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&LI);
+
+ if (LI.isVolatile() &&
+ LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&LI);
+
if (isa<ScalableVectorType>(LI.getType()))
return PI.setAborted(&LI);
- uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
- return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
- }
-
- void visitStoreInst(StoreInst &SI) {
- Value *ValOp = SI.getValueOperand();
- if (ValOp == *U)
- return PI.setEscapedAndAborted(&SI);
- if (!IsOffsetKnown)
- return PI.setAborted(&SI);
-
- if (SI.isVolatile() &&
- SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&SI);
-
+ uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
+ return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return PI.setEscapedAndAborted(&SI);
+ if (!IsOffsetKnown)
+ return PI.setAborted(&SI);
+
+ if (SI.isVolatile() &&
+ SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&SI);
+
if (isa<ScalableVectorType>(ValOp->getType()))
return PI.setAborted(&SI);
- uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
-
- // If this memory access can be shown to *statically* extend outside the
- // bounds of the allocation, its behavior is undefined, so simply
- // ignore it. Note that this is more strict than the generic clamping
- // behavior of insertUse. We also try to handle cases which might run the
- // risk of overflow.
- // FIXME: We should instead consider the pointer to have escaped if this
- // function is being instrumented for addressing bugs or race conditions.
- if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
- LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
- << Offset << " which extends past the end of the "
- << AllocSize << " byte alloca:\n"
- << " alloca: " << AS.AI << "\n"
- << " use: " << SI << "\n");
- return markAsDead(SI);
- }
-
- assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
- "All simple FCA stores should have been pre-split");
- handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
- }
-
- void visitMemSetInst(MemSetInst &II) {
- assert(II.getRawDest() == *U && "Pointer use is not the destination?");
- ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- if ((Length && Length->getValue() == 0) ||
- (IsOffsetKnown && Offset.uge(AllocSize)))
- // Zero-length mem transfer intrinsics can be ignored entirely.
- return markAsDead(II);
-
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- // Don't replace this with a store with a different address space. TODO:
- // Use a store with the casted new alloca?
- if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
- return PI.setAborted(&II);
-
- insertUse(II, Offset, Length ? Length->getLimitedValue()
- : AllocSize - Offset.getLimitedValue(),
- (bool)Length);
- }
-
- void visitMemTransferInst(MemTransferInst &II) {
- ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
- if (Length && Length->getValue() == 0)
- // Zero-length mem transfer intrinsics can be ignored entirely.
- return markAsDead(II);
-
- // Because we can visit these intrinsics twice, also check whether the
- // first visit already marked this instruction as dead. If so, skip it.
- if (VisitedDeadInsts.count(&II))
- return;
-
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- // Don't replace this with a load/store with a different address space.
- // TODO: Use a store with the casted new alloca?
- if (II.isVolatile() &&
- (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
- II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
- return PI.setAborted(&II);
-
- // This side of the transfer is completely out-of-bounds, and so we can
- // nuke the entire transfer. However, we also need to nuke the other side
- // if already added to our partitions.
- // FIXME: Yet another place we really should bypass this when
- // instrumenting for ASan.
- if (Offset.uge(AllocSize)) {
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
- MemTransferSliceMap.find(&II);
- if (MTPI != MemTransferSliceMap.end())
- AS.Slices[MTPI->second].kill();
- return markAsDead(II);
- }
-
- uint64_t RawOffset = Offset.getLimitedValue();
- uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
-
- // Check for the special case where the same exact value is used for both
- // source and dest.
- if (*U == II.getRawDest() && *U == II.getRawSource()) {
- // For non-volatile transfers this is a no-op.
- if (!II.isVolatile())
- return markAsDead(II);
-
- return insertUse(II, Offset, Size, /*IsSplittable=*/false);
- }
-
- // If we have seen both source and destination for a mem transfer, then
- // they both point to the same alloca.
- bool Inserted;
- SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
- std::tie(MTPI, Inserted) =
- MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
- unsigned PrevIdx = MTPI->second;
- if (!Inserted) {
- Slice &PrevP = AS.Slices[PrevIdx];
-
- // Check if the begin offsets match and this is a non-volatile transfer.
- // In that case, we can completely elide the transfer.
- if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
- PrevP.kill();
- return markAsDead(II);
- }
-
- // Otherwise we have an offset transfer within the same alloca. We can't
- // split those.
- PrevP.makeUnsplittable();
- }
-
- // Insert the use now that we've fixed up the splittable nature.
- insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
-
- // Check that we ended up with a valid index in the map.
- assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
- "Map index doesn't point back to a slice with this user.");
- }
-
- // Disable SRoA for any intrinsics except for lifetime invariants.
- // FIXME: What about debug intrinsics? This matches old behavior, but
- // doesn't make sense.
- void visitIntrinsicInst(IntrinsicInst &II) {
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Size > AllocSize || Offset.ugt(AllocSize - Size)) {
+ LLVM_DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @"
+ << Offset << " which extends past the end of the "
+ << AllocSize << " byte alloca:\n"
+ << " alloca: " << AS.AI << "\n"
+ << " use: " << SI << "\n");
+ return markAsDead(SI);
+ }
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile());
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if ((Length && Length->getValue() == 0) ||
+ (IsOffsetKnown && Offset.uge(AllocSize)))
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // Don't replace this with a store with a different address space. TODO:
+ // Use a store with the casted new alloca?
+ if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
+ return PI.setAborted(&II);
+
+ insertUse(II, Offset, Length ? Length->getLimitedValue()
+ : AllocSize - Offset.getLimitedValue(),
+ (bool)Length);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ if (Length && Length->getValue() == 0)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return markAsDead(II);
+
+ // Because we can visit these intrinsics twice, also check whether the
+ // first visit already marked this instruction as dead. If so, skip it.
+ if (VisitedDeadInsts.count(&II))
+ return;
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ // Don't replace this with a load/store with a different address space.
+ // TODO: Use a store with the casted new alloca?
+ if (II.isVolatile() &&
+ (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
+ II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
+ return PI.setAborted(&II);
+
+ // This side of the transfer is completely out-of-bounds, and so we can
+ // nuke the entire transfer. However, we also need to nuke the other side
+ // if already added to our partitions.
+ // FIXME: Yet another place we really should bypass this when
+ // instrumenting for ASan.
+ if (Offset.uge(AllocSize)) {
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI =
+ MemTransferSliceMap.find(&II);
+ if (MTPI != MemTransferSliceMap.end())
+ AS.Slices[MTPI->second].kill();
+ return markAsDead(II);
+ }
+
+ uint64_t RawOffset = Offset.getLimitedValue();
+ uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - RawOffset;
+
+ // Check for the special case where the same exact value is used for both
+ // source and dest.
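+ // (Illustrative: a non-volatile memcpy from the alloca onto itself copies
+ // bytes in place, so it is simply dropped; a volatile copy is kept as a
+ // single unsplittable slice.)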
+ if (*U == II.getRawDest() && *U == II.getRawSource()) {
+ // For non-volatile transfers this is a no-op.
+ if (!II.isVolatile())
+ return markAsDead(II);
+
+ return insertUse(II, Offset, Size, /*IsSplittable=*/false);
+ }
+
+ // If we have seen both source and destination for a mem transfer, then
+ // they both point to the same alloca.
+ bool Inserted;
+ SmallDenseMap<Instruction *, unsigned>::iterator MTPI;
+ std::tie(MTPI, Inserted) =
+ MemTransferSliceMap.insert(std::make_pair(&II, AS.Slices.size()));
+ unsigned PrevIdx = MTPI->second;
+ if (!Inserted) {
+ Slice &PrevP = AS.Slices[PrevIdx];
+
+ // Check if the begin offsets match and this is a non-volatile transfer.
+ // In that case, we can completely elide the transfer.
+ if (!II.isVolatile() && PrevP.beginOffset() == RawOffset) {
+ PrevP.kill();
+ return markAsDead(II);
+ }
+
+ // Otherwise we have an offset transfer within the same alloca. We can't
+ // split those.
+ PrevP.makeUnsplittable();
+ }
+
+ // Insert the use now that we've fixed up the splittable nature.
+ insertUse(II, Offset, Size, /*IsSplittable=*/Inserted && Length);
+
+ // Check that we ended up with a valid index in the map.
+ assert(AS.Slices[PrevIdx].getUse()->getUser() == &II &&
+ "Map index doesn't point back to a slice with this user.");
+ }
+
+ // Disable SRoA for any intrinsics except for lifetime invariants.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ void visitIntrinsicInst(IntrinsicInst &II) {
if (II.isDroppable()) {
AS.DeadUseIfPromotable.push_back(U);
return;
}
- if (!IsOffsetKnown)
- return PI.setAborted(&II);
-
- if (II.isLifetimeStartOrEnd()) {
- ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
- uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
- Length->getLimitedValue());
- insertUse(II, Offset, Size, true);
- return;
- }
-
- Base::visitIntrinsicInst(II);
- }
-
- Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
- // We consider any PHI or select that results in a direct load or store of
- // the same offset to be a viable use for slicing purposes. These uses
- // are considered unsplittable and the size is the maximum loaded or stored
- // size.
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
- Visited.insert(Root);
- Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
- const DataLayout &DL = Root->getModule()->getDataLayout();
- // If there are no loads or stores, the access is dead. We mark that as
- // a size zero access.
- Size = 0;
- do {
- Instruction *I, *UsedI;
- std::tie(UsedI, I) = Uses.pop_back_val();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- Size = std::max(Size,
- DL.getTypeStoreSize(LI->getType()).getFixedSize());
- continue;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- Value *Op = SI->getOperand(0);
- if (Op == UsedI)
- return SI;
- Size = std::max(Size,
- DL.getTypeStoreSize(Op->getType()).getFixedSize());
- continue;
- }
-
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (!GEP->hasAllZeroIndices())
- return GEP;
- } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
- !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
- return I;
- }
-
- for (User *U : I->users())
- if (Visited.insert(cast<Instruction>(U)).second)
- Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
- } while (!Uses.empty());
-
- return nullptr;
- }
-
- void visitPHINodeOrSelectInst(Instruction &I) {
- assert(isa<PHINode>(I) || isa<SelectInst>(I));
- if (I.use_empty())
- return markAsDead(I);
-
- // TODO: We could use SimplifyInstruction here to fold PHINodes and
- // SelectInsts. However, doing so requires changing the current
- // dead-operand-tracking mechanism. For instance, suppose neither loading
- // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
- // trap either. However, if we simply replace %U with undef using the
- // current dead-operand-tracking mechanism, "load (select undef, undef,
- // %other)" may trap because the select may return the first operand
- // "undef".
- if (Value *Result = foldPHINodeOrSelectInst(I)) {
- if (Result == *U)
- // If the result of the constant fold will be the pointer, recurse
- // through the PHI/select as if we had RAUW'ed it.
- enqueueUsers(I);
- else
- // Otherwise the operand to the PHI/select is dead, and we can replace
- // it with undef.
- AS.DeadOperands.push_back(U);
-
- return;
- }
-
- if (!IsOffsetKnown)
- return PI.setAborted(&I);
-
- // See if we already have computed info on this node.
- uint64_t &Size = PHIOrSelectSizes[&I];
- if (!Size) {
- // This is a new PHI/Select, check for an unsafe use of it.
- if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
- return PI.setAborted(UnsafeI);
- }
-
- // For PHI and select operands outside the alloca, we can't nuke the entire
- // phi or select -- the other side might still be relevant, so we special
- // case them here and use a separate structure to track the operands
- // themselves which should be replaced with undef.
- // FIXME: This should instead be escaped in the event we're instrumenting
- // for address sanitization.
- if (Offset.uge(AllocSize)) {
- AS.DeadOperands.push_back(U);
- return;
- }
-
- insertUse(I, Offset, Size);
- }
-
- void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
-
- void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
-
- /// Disable SROA entirely if there are unhandled users of the alloca.
- void visitInstruction(Instruction &I) { PI.setAborted(&I); }
-};
-
-AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
- :
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- AI(AI),
-#endif
- PointerEscapingInstr(nullptr) {
- SliceBuilder PB(DL, AI, *this);
- SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
- if (PtrI.isEscaped() || PtrI.isAborted()) {
- // FIXME: We should sink the escape vs. abort info into the caller nicely,
- // possibly by just storing the PtrInfo in the AllocaSlices.
- PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
- : PtrI.getAbortingInst();
- assert(PointerEscapingInstr && "Did not track a bad instruction");
- return;
- }
-
+ if (!IsOffsetKnown)
+ return PI.setAborted(&II);
+
+ if (II.isLifetimeStartOrEnd()) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+ Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return;
+ }
+
+ Base::visitIntrinsicInst(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for slicing purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
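+ // For example (illustrative): if %p = select i1 %c, i32* %a, i32* %b is
+ // only ever used by i32 loads, Size becomes 4; storing %p itself or
+ // indexing it with a non-zero GEP makes the use unsafe.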
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ const DataLayout &DL = Root->getModule()->getDataLayout();
+ // If there are no loads or stores, the access is dead. We mark that as
+ // a size zero access.
+ Size = 0;
+ do {
+ Instruction *I, *UsedI;
+ std::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size,
+ DL.getTypeStoreSize(LI->getType()).getFixedSize());
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size,
+ DL.getTypeStoreSize(Op->getType()).getFixedSize());
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I) && !isa<AddrSpaceCastInst>(I)) {
+ return I;
+ }
+
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(std::make_pair(I, cast<Instruction>(U)));
+ } while (!Uses.empty());
+
+ return nullptr;
+ }
+
+ void visitPHINodeOrSelectInst(Instruction &I) {
+ assert(isa<PHINode>(I) || isa<SelectInst>(I));
+ if (I.use_empty())
+ return markAsDead(I);
+
+ // TODO: We could use SimplifyInstruction here to fold PHINodes and
+ // SelectInsts. However, doing so requires changing the current
+ // dead-operand-tracking mechanism. For instance, suppose neither loading
+ // from %U nor %other traps. Then "load (select undef, %U, %other)" does not
+ // trap either. However, if we simply replace %U with undef using the
+ // current dead-operand-tracking mechanism, "load (select undef, undef,
+ // %other)" may trap because the select may return the first operand
+ // "undef".
+ if (Value *Result = foldPHINodeOrSelectInst(I)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the PHI/select as if we had RAUW'ed it.
+ enqueueUsers(I);
+ else
+ // Otherwise the operand to the PHI/select is dead, and we can replace
+ // it with undef.
+ AS.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ if (!IsOffsetKnown)
+ return PI.setAborted(&I);
+
+ // See if we already have computed info on this node.
+ uint64_t &Size = PHIOrSelectSizes[&I];
+ if (!Size) {
+ // This is a new PHI/Select, check for an unsafe use of it.
+ if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&I, Size))
+ return PI.setAborted(UnsafeI);
+ }
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ // FIXME: This should instead be escaped in the event we're instrumenting
+ // for address sanitization.
+ if (Offset.uge(AllocSize)) {
+ AS.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitPHINode(PHINode &PN) { visitPHINodeOrSelectInst(PN); }
+
+ void visitSelectInst(SelectInst &SI) { visitPHINodeOrSelectInst(SI); }
+
+ /// Disable SROA entirely if there are unhandled users of the alloca.
+ void visitInstruction(Instruction &I) { PI.setAborted(&I); }
+};
+
+AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
+ :
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ AI(AI),
+#endif
+ PointerEscapingInstr(nullptr) {
+ SliceBuilder PB(DL, AI, *this);
+ SliceBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+ if (PtrI.isEscaped() || PtrI.isAborted()) {
+ // FIXME: We should sink the escape vs. abort info into the caller nicely,
+ // possibly by just storing the PtrInfo in the AllocaSlices.
+ PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+ : PtrI.getAbortingInst();
+ assert(PointerEscapingInstr && "Did not track a bad instruction");
+ return;
+ }
+
llvm::erase_if(Slices, [](const Slice &S) { return S.isDead(); });
-
- // Sort the uses. This arranges for the offsets to be in ascending order,
- // and the sizes to be in descending order.
+
+ // Sort the uses. This arranges for the offsets to be in ascending order,
+ // and the sizes to be in descending order.
llvm::stable_sort(Slices);
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-
-void AllocaSlices::print(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- printSlice(OS, I, Indent);
- OS << "\n";
- printUse(OS, I, Indent);
-}
-
-void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
- << " slice #" << (I - begin())
- << (I->isSplittable() ? " (splittable)" : "");
-}
-
-void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
- StringRef Indent) const {
- OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
-}
-
-void AllocaSlices::print(raw_ostream &OS) const {
- if (PointerEscapingInstr) {
- OS << "Can't analyze slices for alloca: " << AI << "\n"
- << " A pointer to this alloca escaped by:\n"
- << " " << *PointerEscapingInstr << "\n";
- return;
- }
-
- OS << "Slices of alloca: " << AI << "\n";
- for (const_iterator I = begin(), E = end(); I != E; ++I)
- print(OS, I);
-}
-
-LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
- print(dbgs(), I);
-}
-LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
-
-#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-
-/// Walk the range of a partitioning looking for a common type to cover this
-/// sequence of slices.
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaSlices::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ printSlice(OS, I, Indent);
+ OS << "\n";
+ printUse(OS, I, Indent);
+}
+
+void AllocaSlices::printSlice(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "[" << I->beginOffset() << "," << I->endOffset() << ")"
+ << " slice #" << (I - begin())
+ << (I->isSplittable() ? " (splittable)" : "");
+}
+
+void AllocaSlices::printUse(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << " used by: " << *I->getUse()->getUser() << "\n";
+}
+
+void AllocaSlices::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "Can't analyze slices for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Slices of alloca: " << AI << "\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ print(OS, I);
+}
+
+LLVM_DUMP_METHOD void AllocaSlices::dump(const_iterator I) const {
+ print(dbgs(), I);
+}
+LLVM_DUMP_METHOD void AllocaSlices::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+/// Walk the range of a partitioning looking for a common type to cover this
+/// sequence of slices.
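+/// For example (illustrative): if every slice spanning the whole partition is
+/// accessed as float, float is the common type; if one such slice is used as
+/// i32 and another as float, no common type exists and only the integer
+/// fallback (i32) is reported.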
static std::pair<Type *, IntegerType *>
findCommonType(AllocaSlices::const_iterator B, AllocaSlices::const_iterator E,
uint64_t EndOffset) {
- Type *Ty = nullptr;
- bool TyIsCommon = true;
- IntegerType *ITy = nullptr;
-
- // Note that we need to look at *every* alloca slice's Use to ensure we
- // always get consistent results regardless of the order of slices.
- for (AllocaSlices::const_iterator I = B; I != E; ++I) {
- Use *U = I->getUse();
- if (isa<IntrinsicInst>(*U->getUser()))
- continue;
- if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
- continue;
-
- Type *UserTy = nullptr;
- if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- UserTy = LI->getType();
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- UserTy = SI->getValueOperand()->getType();
- }
-
- if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
- // If the type is larger than the partition, skip it. We only encounter
- // this for split integer operations where we want to use the type of the
- // entity causing the split. Also skip if the type is not a byte width
- // multiple.
- if (UserITy->getBitWidth() % 8 != 0 ||
- UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
- continue;
-
- // Track the largest bitwidth integer type used in this way in case there
- // is no common type.
- if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
- ITy = UserITy;
- }
-
- // To avoid depending on the order of slices, Ty and TyIsCommon must not
- // depend on types skipped above.
- if (!UserTy || (Ty && Ty != UserTy))
- TyIsCommon = false; // Give up on anything but an iN type.
- else
- Ty = UserTy;
- }
-
+ Type *Ty = nullptr;
+ bool TyIsCommon = true;
+ IntegerType *ITy = nullptr;
+
+ // Note that we need to look at *every* alloca slice's Use to ensure we
+ // always get consistent results regardless of the order of slices.
+ for (AllocaSlices::const_iterator I = B; I != E; ++I) {
+ Use *U = I->getUse();
+ if (isa<IntrinsicInst>(*U->getUser()))
+ continue;
+ if (I->beginOffset() != B->beginOffset() || I->endOffset() != EndOffset)
+ continue;
+
+ Type *UserTy = nullptr;
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ UserTy = SI->getValueOperand()->getType();
+ }
+
+ if (IntegerType *UserITy = dyn_cast_or_null<IntegerType>(UserTy)) {
+ // If the type is larger than the partition, skip it. We only encounter
+ // this for split integer operations where we want to use the type of the
+ // entity causing the split. Also skip if the type is not a byte width
+ // multiple.
+ if (UserITy->getBitWidth() % 8 != 0 ||
+ UserITy->getBitWidth() / 8 > (EndOffset - B->beginOffset()))
+ continue;
+
+ // Track the largest bitwidth integer type used in this way in case there
+ // is no common type.
+ if (!ITy || ITy->getBitWidth() < UserITy->getBitWidth())
+ ITy = UserITy;
+ }
+
+ // To avoid depending on the order of slices, Ty and TyIsCommon must not
+ // depend on types skipped above.
+ if (!UserTy || (Ty && Ty != UserTy))
+ TyIsCommon = false; // Give up on anything but an iN type.
+ else
+ Ty = UserTy;
+ }
+
return {TyIsCommon ? Ty : nullptr, ITy};
-}
-
-/// PHI instructions that use an alloca and are subsequently loaded can be
-/// rewritten to load both input pointers in the pred blocks and then PHI the
-/// results, allowing the load of the alloca to be promoted.
-/// From this:
-/// %P2 = phi [i32* %Alloca, i32* %Other]
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// ...
-/// %V2 = load i32* %Other
-/// ...
-/// %V = phi [i32 %V1, i32 %V2]
-///
-/// We can do this to a select if its only uses are loads and if the operands
-/// to the select can be loaded unconditionally.
-///
-/// FIXME: This should be hoisted into a generic utility, likely in
-/// Transforms/Util/Local.h
-static bool isSafePHIToSpeculate(PHINode &PN) {
- const DataLayout &DL = PN.getModule()->getDataLayout();
-
- // For now, we can only do this promotion if the load is in the same block
- // as the PHI, and if there are no stores between the phi and load.
- // TODO: Allow recursive phi users.
- // TODO: Allow stores.
- BasicBlock *BB = PN.getParent();
- Align MaxAlign;
- uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
- APInt MaxSize(APWidth, 0);
- bool HaveLoad = false;
- for (User *U : PN.users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple())
- return false;
-
- // For now we only allow loads in the same block as the PHI. This is
- // a common case that happens when instcombine merges two loads through
- // a PHI.
- if (LI->getParent() != BB)
- return false;
-
- // Ensure that there are no instructions between the PHI and the load that
- // could store.
- for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
- if (BBI->mayWriteToMemory())
- return false;
-
- uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
- MaxAlign = std::max(MaxAlign, LI->getAlign());
- MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
- HaveLoad = true;
- }
-
- if (!HaveLoad)
- return false;
-
- // We can only transform this if it is safe to push the loads into the
- // predecessor blocks. The only thing to watch out for is that we can't put
- // a possibly trapping load in the predecessor if it is a critical edge.
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
- Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
- Value *InVal = PN.getIncomingValue(Idx);
-
- // If the value is produced by the terminator of the predecessor (an
- // invoke) or it has side-effects, there is no valid place to put a load
- // in the predecessor.
- if (TI == InVal || TI->mayHaveSideEffects())
- return false;
-
- // If the predecessor has a single successor, then the edge isn't
- // critical.
- if (TI->getNumSuccessors() == 1)
- continue;
-
- // If this pointer is always safe to load, or if we can prove that there
- // is already a load in the block, then we can move the load to the pred
- // block.
- if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
- continue;
-
- return false;
- }
-
- return true;
-}
-
-static void speculatePHINodeLoads(PHINode &PN) {
- LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
-
- LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
- Type *LoadTy = SomeLoad->getType();
- IRBuilderTy PHIBuilder(&PN);
- PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
- PN.getName() + ".sroa.speculated");
-
- // Get the AA tags and alignment to use from one of the loads. It does not
- // matter which one we pick or whether any of them differ.
- AAMDNodes AATags;
- SomeLoad->getAAMetadata(AATags);
- Align Alignment = SomeLoad->getAlign();
-
- // Rewrite all loads of the PN to use the new PHI.
- while (!PN.use_empty()) {
- LoadInst *LI = cast<LoadInst>(PN.user_back());
- LI->replaceAllUsesWith(NewPN);
- LI->eraseFromParent();
- }
-
- // Inject loads into all of the pred blocks.
- DenseMap<BasicBlock*, Value*> InjectedLoads;
- for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
- BasicBlock *Pred = PN.getIncomingBlock(Idx);
- Value *InVal = PN.getIncomingValue(Idx);
-
- // A PHI node is allowed to have multiple (duplicated) entries for the same
- // basic block, as long as the value is the same. So if we already injected
- // a load in the predecessor, then we should reuse the same load for all
- // duplicated entries.
- if (Value* V = InjectedLoads.lookup(Pred)) {
- NewPN->addIncoming(V, Pred);
- continue;
- }
-
- Instruction *TI = Pred->getTerminator();
- IRBuilderTy PredBuilder(TI);
-
- LoadInst *Load = PredBuilder.CreateAlignedLoad(
- LoadTy, InVal, Alignment,
- (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
- ++NumLoadsSpeculated;
- if (AATags)
- Load->setAAMetadata(AATags);
- NewPN->addIncoming(Load, Pred);
- InjectedLoads[Pred] = Load;
- }
-
- LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
- PN.eraseFromParent();
-}
-
-/// Select instructions that use an alloca and are subsequently loaded can be
-/// rewritten to load both input pointers and then select between the result,
-/// allowing the load of the alloca to be promoted.
-/// From this:
-/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
-/// %V = load i32* %P2
-/// to:
-/// %V1 = load i32* %Alloca -> will be mem2reg'd
-/// %V2 = load i32* %Other
-/// %V = select i1 %cond, i32 %V1, i32 %V2
-///
-/// We can do this to a select if its only uses are loads and if the operand
-/// to the select can be loaded unconditionally.
-static bool isSafeSelectToSpeculate(SelectInst &SI) {
- Value *TValue = SI.getTrueValue();
- Value *FValue = SI.getFalseValue();
- const DataLayout &DL = SI.getModule()->getDataLayout();
-
- for (User *U : SI.users()) {
- LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI || !LI->isSimple())
- return false;
-
- // Both operands to the select need to be dereferenceable, either
- // absolutely (e.g. allocas) or at this point because we can see other
- // accesses to them.
- if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
- LI->getAlign(), DL, LI))
- return false;
- if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
- LI->getAlign(), DL, LI))
- return false;
- }
-
- return true;
-}
-
-static void speculateSelectInstLoads(SelectInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
-
- IRBuilderTy IRB(&SI);
- Value *TV = SI.getTrueValue();
- Value *FV = SI.getFalseValue();
- // Replace the loads of the select with a select of two loads.
- while (!SI.use_empty()) {
- LoadInst *LI = cast<LoadInst>(SI.user_back());
- assert(LI->isSimple() && "We only speculate simple loads");
-
- IRB.SetInsertPoint(LI);
- LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
- LI->getName() + ".sroa.speculate.load.true");
- LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
- LI->getName() + ".sroa.speculate.load.false");
- NumLoadsSpeculated += 2;
-
- // Transfer alignment and AA info if present.
- TL->setAlignment(LI->getAlign());
- FL->setAlignment(LI->getAlign());
-
- AAMDNodes Tags;
- LI->getAAMetadata(Tags);
- if (Tags) {
- TL->setAAMetadata(Tags);
- FL->setAAMetadata(Tags);
- }
-
- Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
- LI->getName() + ".sroa.speculated");
-
- LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
- LI->replaceAllUsesWith(V);
- LI->eraseFromParent();
- }
- SI.eraseFromParent();
-}
-
-/// Build a GEP out of a base pointer and indices.
-///
-/// This will return the BasePtr if that is valid, or build a new GEP
-/// instruction using the IRBuilder if GEP-ing is needed.
-static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
+}
+
+/// PHI instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers in the pred blocks and then PHI the
+/// results, allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = phi [i32* %Alloca, i32* %Other]
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// ...
+/// %V2 = load i32* %Other
+/// ...
+/// %V = phi [i32 %V1, i32 %V2]
+///
+/// We can do this to a select if its only uses are loads and if the operands
+/// to the select can be loaded unconditionally.
+///
+/// FIXME: This should be hoisted into a generic utility, likely in
+/// Transforms/Util/Local.h
+static bool isSafePHIToSpeculate(PHINode &PN) {
+ const DataLayout &DL = PN.getModule()->getDataLayout();
+
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ Align MaxAlign;
+ uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
+ APInt MaxSize(APWidth, 0);
+ bool HaveLoad = false;
+ for (User *U : PN.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB)
+ return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI(PN); &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
+ MaxAlign = std::max(MaxAlign, LI->getAlign());
+ MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
+ HaveLoad = true;
+ }
+
+ if (!HaveLoad)
+ return false;
+
+ // We can only transform this if it is safe to push the loads into the
+ // predecessor blocks. The only thing to watch out for is that we can't put
+ // a possibly trapping load in the predecessor if it is a critical edge.
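+ // For example (illustrative): if a predecessor's conditional branch also
+ // targets a block that never touches %Other, hoisting the load of %Other
+ // into that predecessor would execute a load the original program never
+ // performed on that path.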
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ Instruction *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (isSafeToLoadUnconditionally(InVal, MaxAlign, MaxSize, DL, TI))
+ continue;
+
+ return false;
+ }
+
+ return true;
+}
+
+static void speculatePHINodeLoads(PHINode &PN) {
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
+
+ LoadInst *SomeLoad = cast<LoadInst>(PN.user_back());
+ Type *LoadTy = SomeLoad->getType();
+ IRBuilderTy PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(),
+ PN.getName() + ".sroa.speculated");
+
+ // Get the AA tags and alignment to use from one of the loads. It does not
+ // matter which one we pick or whether any of them differ.
+ AAMDNodes AATags;
+ SomeLoad->getAAMetadata(AATags);
+ Align Alignment = SomeLoad->getAlign();
+
+ // Rewrite all loads of the PN to use the new PHI.
+ while (!PN.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(PN.user_back());
+ LI->replaceAllUsesWith(NewPN);
+ LI->eraseFromParent();
+ }
+
+ // Inject loads into all of the pred blocks.
+ DenseMap<BasicBlock*, Value*> InjectedLoads;
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // A PHI node is allowed to have multiple (duplicated) entries for the same
+ // basic block, as long as the value is the same. So if we already injected
+ // a load in the predecessor, then we should reuse the same load for all
+ // duplicated entries.
+ if (Value* V = InjectedLoads.lookup(Pred)) {
+ NewPN->addIncoming(V, Pred);
+ continue;
+ }
+
+ Instruction *TI = Pred->getTerminator();
+ IRBuilderTy PredBuilder(TI);
+
+ LoadInst *Load = PredBuilder.CreateAlignedLoad(
+ LoadTy, InVal, Alignment,
+ (PN.getName() + ".sroa.speculate.load." + Pred->getName()));
+ ++NumLoadsSpeculated;
+ if (AATags)
+ Load->setAAMetadata(AATags);
+ NewPN->addIncoming(Load, Pred);
+ InjectedLoads[Pred] = Load;
+ }
+
+ LLVM_DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ PN.eraseFromParent();
+}
+
+/// Select instructions that use an alloca and are subsequently loaded can be
+/// rewritten to load both input pointers and then select between the result,
+/// allowing the load of the alloca to be promoted.
+/// From this:
+/// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+/// %V = load i32* %P2
+/// to:
+/// %V1 = load i32* %Alloca -> will be mem2reg'd
+/// %V2 = load i32* %Other
+/// %V = select i1 %cond, i32 %V1, i32 %V2
+///
+/// We can do this to a select if its only uses are loads and if the operand
+/// to the select can be loaded unconditionally.
+static bool isSafeSelectToSpeculate(SelectInst &SI) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ const DataLayout &DL = SI.getModule()->getDataLayout();
+
+ for (User *U : SI.users()) {
+ LoadInst *LI = dyn_cast<LoadInst>(U);
+ if (!LI || !LI->isSimple())
+ return false;
+
+ // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to them.
+ if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
+ LI->getAlign(), DL, LI))
+ return false;
+ if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
+ LI->getAlign(), DL, LI))
+ return false;
+ }
+
+ return true;
+}
+
+static void speculateSelectInstLoads(SelectInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+
+ IRBuilderTy IRB(&SI);
+ Value *TV = SI.getTrueValue();
+ Value *FV = SI.getFalseValue();
+ // Replace the loads of the select with a select of two loads.
+ while (!SI.use_empty()) {
+ LoadInst *LI = cast<LoadInst>(SI.user_back());
+ assert(LI->isSimple() && "We only speculate simple loads");
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL = IRB.CreateLoad(LI->getType(), TV,
+ LI->getName() + ".sroa.speculate.load.true");
+ LoadInst *FL = IRB.CreateLoad(LI->getType(), FV,
+ LI->getName() + ".sroa.speculate.load.false");
+ NumLoadsSpeculated += 2;
+
+ // Transfer alignment and AA info if present.
+ TL->setAlignment(LI->getAlign());
+ FL->setAlignment(LI->getAlign());
+
+ AAMDNodes Tags;
+ LI->getAAMetadata(Tags);
+ if (Tags) {
+ TL->setAAMetadata(Tags);
+ FL->setAAMetadata(Tags);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL,
+ LI->getName() + ".sroa.speculated");
+
+ LLVM_DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ LI->eraseFromParent();
+ }
+ SI.eraseFromParent();
+}
+
+/// Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr,
SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Indices.empty())
- return BasePtr;
-
- // A single zero index is a no-op, so check for this and avoid building a GEP
- // in that case.
- if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
- return BasePtr;
-
- return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
- BasePtr, Indices, NamePrefix + "sroa_idx");
-}
-
-/// Get a natural GEP off of the BasePtr walking through Ty toward
-/// TargetTy without changing the offset of the pointer.
-///
-/// This routine assumes we've already established a properly offset GEP with
-/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
-/// zero-indices down through type layers until we find one the same as
-/// TargetTy. If we can't find one with the same type, we at least try to use
-/// one with the same size. If none of that works, we just produce the GEP as
-/// indicated by Indices to have the correct offset.
-static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
- Value *BasePtr, Type *Ty, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(BasePtr->getType()->getPointerElementType(),
+ BasePtr, Indices, NamePrefix + "sroa_idx");
+}
+
+/// Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero-indices down through type layers until we find one the same as
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
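+/// For example (illustrative): descending from { { i32, i32 } } toward an i32
+/// TargetTy appends two zero indices; if no nested type ever matches TargetTy,
+/// the speculatively appended indices are erased again.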
+static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Ty == TargetTy)
- return buildGEP(IRB, BasePtr, Indices, NamePrefix);
-
- // Offset size to use for the indices.
- unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
-
- // See if we can descend into a struct and locate a field with the correct
- // type.
- unsigned NumLayers = 0;
- Type *ElementTy = Ty;
- do {
- if (ElementTy->isPointerTy())
- break;
-
- if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
- ElementTy = ArrayTy->getElementType();
- Indices.push_back(IRB.getIntN(OffsetSize, 0));
- } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
- ElementTy = VectorTy->getElementType();
- Indices.push_back(IRB.getInt32(0));
- } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
- if (STy->element_begin() == STy->element_end())
- break; // Nothing left to descend into.
- ElementTy = *STy->element_begin();
- Indices.push_back(IRB.getInt32(0));
- } else {
- break;
- }
- ++NumLayers;
- } while (ElementTy != TargetTy);
- if (ElementTy != TargetTy)
- Indices.erase(Indices.end() - NumLayers, Indices.end());
-
- return buildGEP(IRB, BasePtr, Indices, NamePrefix);
-}
-
-/// Recursively compute indices for a natural GEP.
-///
-/// This is the recursive step for getNaturalGEPWithOffset that walks down the
-/// element types adding appropriate indices for the GEP.
-static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
- Value *Ptr, Type *Ty, APInt &Offset,
- Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+
+ // Offset size to use for the indices.
+ unsigned OffsetSize = DL.getIndexTypeSizeInBits(BasePtr->getType());
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+
+ if (ArrayType *ArrayTy = dyn_cast<ArrayType>(ElementTy)) {
+ ElementTy = ArrayTy->getElementType();
+ Indices.push_back(IRB.getIntN(OffsetSize, 0));
+ } else if (VectorType *VectorTy = dyn_cast<VectorType>(ElementTy)) {
+ ElementTy = VectorTy->getElementType();
+ Indices.push_back(IRB.getInt32(0));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ if (STy->element_begin() == STy->element_end())
+ break; // Nothing left to descend into.
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, NamePrefix);
+}
+
+/// Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
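+/// For example (illustrative): a 12 byte offset into a [4 x i32] array skips
+/// three 4 byte elements, so index 3 is appended and recursion continues with
+/// a remaining offset of 0.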
+static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- if (Offset == 0)
- return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
- NamePrefix);
-
- // We can't recurse through pointer types.
- if (Ty->isPointerTy())
- return nullptr;
-
- // We try to analyze GEPs over vectors here, but note that these GEPs are
- // extremely poorly defined currently. The long-term goal is to remove GEPing
- // over a vector from the IR completely.
- if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
- unsigned ElementSizeInBits =
- DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
- if (ElementSizeInBits % 8 != 0) {
- // GEPs over non-multiple of 8 size vector elements are invalid.
- return nullptr;
- }
- APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
- return nullptr;
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
- Offset, TargetTy, Indices, NamePrefix);
- }
-
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
- Type *ElementTy = ArrTy->getElementType();
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(ArrTy->getNumElements()))
- return nullptr;
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
- }
-
- StructType *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- return nullptr;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- uint64_t StructOffset = Offset.getZExtValue();
- if (StructOffset >= SL->getSizeInBytes())
- return nullptr;
- unsigned Index = SL->getElementContainingOffset(StructOffset);
- Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
- Type *ElementTy = STy->getElementType(Index);
- if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
- return nullptr; // The offset points into alignment padding.
-
- Indices.push_back(IRB.getInt32(Index));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
-}
-
-/// Get a natural GEP from a base pointer to a particular offset and
-/// resulting in a particular type.
-///
-/// The goal is to produce a "natural" looking GEP that works with the existing
-/// composite types to arrive at the appropriate offset and element type for
-/// a pointer. TargetTy is the element type the returned GEP should point-to if
-/// possible. We recurse by decreasing Offset, adding the appropriate index to
-/// Indices, and setting Ty to the result subtype.
-///
-/// If no natural GEP can be constructed, this function returns null.
-static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
- Value *Ptr, APInt Offset, Type *TargetTy,
- SmallVectorImpl<Value *> &Indices,
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, DL, Ptr, Ty, TargetTy, Indices,
+ NamePrefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return nullptr;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits =
+ DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
+ if (ElementSizeInBits % 8 != 0) {
+ // GEPs over non-multiple of 8 size vector elements are invalid.
+ return nullptr;
+ }
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, NamePrefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return nullptr;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return nullptr;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
+ return nullptr; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
+
+/// Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
+static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
const Twine &NamePrefix) {
- PointerType *Ty = cast<PointerType>(Ptr->getType());
-
- // Don't consider any GEPs through an i8* as natural unless the TargetTy is
- // an i8.
- if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
- return nullptr;
-
- Type *ElementTy = Ty->getElementType();
- if (!ElementTy->isSized())
- return nullptr; // We can't GEP through an unsized element.
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+ // Don't consider any GEPs through an i8* as natural unless the TargetTy is
+ // an i8.
+ if (Ty == IRB.getInt8PtrTy(Ty->getAddressSpace()) && TargetTy->isIntegerTy(8))
+ return nullptr;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return nullptr; // We can't GEP through an unsized element.
if (isa<ScalableVectorType>(ElementTy))
return nullptr;
- APInt ElementSize(Offset.getBitWidth(),
- DL.getTypeAllocSize(ElementTy).getFixedSize());
- if (ElementSize == 0)
- return nullptr; // Zero-length arrays can't help us build a natural GEP.
- APInt NumSkippedElements = Offset.sdiv(ElementSize);
-
- Offset -= NumSkippedElements * ElementSize;
- Indices.push_back(IRB.getInt(NumSkippedElements));
- return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
- Indices, NamePrefix);
-}
-
-/// Compute an adjusted pointer from Ptr by Offset bytes where the
-/// resulting pointer has PointerTy.
-///
-/// This tries very hard to compute a "natural" GEP which arrives at the offset
-/// and produces the pointer type desired. Where it cannot, it will try to use
-/// the natural GEP to arrive at the offset and bitcast to the type. Where that
-/// fails, it will try to use an existing i8* and GEP to the byte offset and
-/// bitcast to the type.
-///
-/// The strategy for finding the more natural GEPs is to peel off layers of the
-/// pointer, walking back through bit casts and GEPs, searching for a base
-/// pointer from which we can compute a natural GEP with the desired
-/// properties. The algorithm tries to fold as many constant indices into
-/// a single GEP as possible, thus making each GEP more independent of the
-/// surrounding code.
-static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
+ if (ElementSize == 0)
+ return nullptr; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.sdiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, DL, Ptr, ElementTy, Offset, TargetTy,
+ Indices, NamePrefix);
+}
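// A minimal sketch (plain C++, hypothetical helper, not LLVM code) of the
// array-stepping arithmetic above: peel off whole elements of a known size
// from a byte offset and keep the remainder for the recursive call. The real
// code performs the division on APInt values; unsigned 64-bit math is used
// here for brevity, and ElementSize is assumed non-zero, matching the
// zero-size bail-out above.
#include <cstdint>
#include <utility>

// Returns {number of whole elements skipped, remaining byte offset}.
std::pair<uint64_t, uint64_t> peelArrayElements(uint64_t Offset,
                                                uint64_t ElementSize) {
  uint64_t NumSkipped = Offset / ElementSize;
  return {NumSkipped, Offset - NumSkipped * ElementSize};
}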
+
+/// Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
+static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
APInt Offset, Type *PointerTy,
const Twine &NamePrefix) {
- // Even though we don't look through PHI nodes, we could be called on an
- // instruction in an unreachable block, which may be on a cycle.
- SmallPtrSet<Value *, 4> Visited;
- Visited.insert(Ptr);
- SmallVector<Value *, 4> Indices;
-
- // We may end up computing an offset pointer that has the wrong type. If we
- // never are able to compute one directly that has the correct type, we'll
- // fall back to it, so keep it and the base it was computed from around here.
- Value *OffsetPtr = nullptr;
- Value *OffsetBasePtr;
-
- // Remember any i8 pointer we come across to re-use if we need to do a raw
- // byte offset.
- Value *Int8Ptr = nullptr;
- APInt Int8PtrOffset(Offset.getBitWidth(), 0);
-
- PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
- Type *TargetTy = TargetPtrTy->getElementType();
-
- // As `addrspacecast` is not always no-op casting, `Ptr` (the storage
- // pointer) may have a different address space from the expected `PointerTy`
- // (the pointer to be used). Adjust the pointer type based on the original
- // storage pointer.
- auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
- PointerTy = TargetTy->getPointerTo(AS);
-
- do {
- // First fold any existing GEPs into the offset.
- while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
- APInt GEPOffset(Offset.getBitWidth(), 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset))
- break;
- Offset += GEPOffset;
- Ptr = GEP->getPointerOperand();
- if (!Visited.insert(Ptr).second)
- break;
- }
-
- // See if we can perform a natural GEP here.
- Indices.clear();
- if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
- Indices, NamePrefix)) {
- // If we have a new natural pointer at the offset, clear out any old
- // offset pointer we computed. Unless it is the base pointer or
- // a non-instruction, we built a GEP we don't need. Zap it.
- if (OffsetPtr && OffsetPtr != OffsetBasePtr)
- if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
- assert(I->use_empty() && "Built a GEP with uses somehow!");
- I->eraseFromParent();
- }
- OffsetPtr = P;
- OffsetBasePtr = Ptr;
- // If we also found a pointer of the right type, we're done.
- if (P->getType() == PointerTy)
- break;
- }
-
- // Stash this pointer if we've found an i8*.
- if (Ptr->getType()->isIntegerTy(8)) {
- Int8Ptr = Ptr;
- Int8PtrOffset = Offset;
- }
-
- // Peel off a layer of the pointer and update the offset appropriately.
- if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
- Ptr = cast<Operator>(Ptr)->getOperand(0);
- } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
- if (GA->isInterposable())
- break;
- Ptr = GA->getAliasee();
- } else {
- break;
- }
- assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
- } while (Visited.insert(Ptr).second);
-
- if (!OffsetPtr) {
- if (!Int8Ptr) {
- Int8Ptr = IRB.CreateBitCast(
- Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
- NamePrefix + "sroa_raw_cast");
- Int8PtrOffset = Offset;
- }
-
- OffsetPtr = Int8PtrOffset == 0
- ? Int8Ptr
- : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
- IRB.getInt(Int8PtrOffset),
- NamePrefix + "sroa_raw_idx");
- }
- Ptr = OffsetPtr;
-
- // On the off chance we were targeting i8*, guard the bitcast here.
- if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
- Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
- TargetPtrTy,
- NamePrefix + "sroa_cast");
- }
-
- return Ptr;
-}
-
-/// Compute the adjusted alignment for a load or store from an offset.
-static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
- return commonAlignment(getLoadStoreAlignment(I), Offset);
-}
-
-/// Test whether we can convert a value from the old to the new type.
-///
-/// This predicate should be used to guard calls to convertValue in order to
-/// ensure that we only try to convert viable values. The strategy is that we
-/// will peel off single element struct and array wrappings to get to an
-/// underlying value, and convert that value.
-static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
- if (OldTy == NewTy)
- return true;
-
- // For integer types, we can't handle any bit-width differences. This would
- // break both vector conversions with extension and introduce endianness
- // issues when in conjunction with loads and stores.
- if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
- assert(cast<IntegerType>(OldTy)->getBitWidth() !=
- cast<IntegerType>(NewTy)->getBitWidth() &&
- "We can't have the same bitwidth for different int types");
- return false;
- }
-
- if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
- DL.getTypeSizeInBits(OldTy).getFixedSize())
- return false;
- if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
- return false;
-
- // We can convert pointers to integers and vice-versa. Same for vectors
- // of pointers and integers.
- OldTy = OldTy->getScalarType();
- NewTy = NewTy->getScalarType();
- if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
- if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
- unsigned OldAS = OldTy->getPointerAddressSpace();
- unsigned NewAS = NewTy->getPointerAddressSpace();
- // Convert pointers if they are pointers from the same address space or
- // different integral (not non-integral) address spaces with the same
- // pointer size.
- return OldAS == NewAS ||
- (!DL.isNonIntegralAddressSpace(OldAS) &&
- !DL.isNonIntegralAddressSpace(NewAS) &&
- DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
- }
-
- // We can convert integers to integral pointers, but not to non-integral
- // pointers.
- if (OldTy->isIntegerTy())
- return !DL.isNonIntegralPointerType(NewTy);
-
- // We can convert integral pointers to integers, but non-integral pointers
- // need to remain pointers.
- if (!DL.isNonIntegralPointerType(OldTy))
- return NewTy->isIntegerTy();
-
- return false;
- }
-
- return true;
-}
-
-/// Generic routine to convert an SSA value to a value of a different
-/// type.
-///
-/// This will try various different casting techniques, such as bitcasts,
-/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
-/// two types for viability with this routine.
-static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- Type *NewTy) {
- Type *OldTy = V->getType();
- assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
-
- if (OldTy == NewTy)
- return V;
-
- assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
- "Integer types must be the exact same to convert.");
-
- // See if we need inttoptr for this type pair. May require additional bitcast.
- if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
- // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
- // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
- // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
- // Directly handle i64 to i8*
- return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
- NewTy);
- }
-
- // See if we need ptrtoint for this type pair. May require additional bitcast.
- if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
- // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
- // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
- // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
- // Expand i8* to i64 --> i8* to i64 to i64
- return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
- }
-
- if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
- unsigned OldAS = OldTy->getPointerAddressSpace();
- unsigned NewAS = NewTy->getPointerAddressSpace();
- // To convert pointers with different address spaces (they are already
- // checked convertible, i.e. they have the same pointer size), so far we
- // cannot use `bitcast` (which has restrict on the same address space) or
- // `addrspacecast` (which is not always no-op casting). Instead, use a pair
- // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
- // size.
- if (OldAS != NewAS) {
- assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
- return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
- }
- }
-
- return IRB.CreateBitCast(V, NewTy);
-}
-
-/// Test whether the given slice use can be promoted to a vector.
-///
-/// This function is called to test each entry in a partition which is slated
-/// for a single slice.
-static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
- VectorType *Ty,
- uint64_t ElementSize,
- const DataLayout &DL) {
- // First validate the slice offsets.
- uint64_t BeginOffset =
- std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
- uint64_t BeginIndex = BeginOffset / ElementSize;
- if (BeginIndex * ElementSize != BeginOffset ||
- BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
- return false;
- uint64_t EndOffset =
- std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
- uint64_t EndIndex = EndOffset / ElementSize;
- if (EndIndex * ElementSize != EndOffset ||
- EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
- return false;
-
- assert(EndIndex > BeginIndex && "Empty vector!");
- uint64_t NumElements = EndIndex - BeginIndex;
- Type *SliceTy = (NumElements == 1)
- ? Ty->getElementType()
- : FixedVectorType::get(Ty->getElementType(), NumElements);
-
- Type *SplitIntTy =
- Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
-
- Use *U = S.getUse();
-
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
- if (MI->isVolatile())
- return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+ // never are able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it and the base it was computed from around here.
+ Value *OffsetPtr = nullptr;
+ Value *OffsetBasePtr;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = nullptr;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ PointerType *TargetPtrTy = cast<PointerType>(PointerTy);
+ Type *TargetTy = TargetPtrTy->getElementType();
+
+ // As `addrspacecast` is not always no-op casting, `Ptr` (the storage
+ // pointer) may have a different address space from the expected `PointerTy`
+ // (the pointer to be used). Adjust the pointer type based on the original
+ // storage pointer.
+ auto AS = cast<PointerType>(Ptr->getType())->getAddressSpace();
+ PointerTy = TargetTy->getPointerTo(AS);
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr).second)
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, DL, Ptr, Offset, TargetTy,
+ Indices, NamePrefix)) {
+ // If we have a new natural pointer at the offset, clear out any old
+ // offset pointer we computed. Unless it is the base pointer or
+ // a non-instruction, we built a GEP we don't need. Zap it.
+ if (OffsetPtr && OffsetPtr != OffsetBasePtr)
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr)) {
+ assert(I->use_empty() && "Built a GEP with uses somehow!");
+ I->eraseFromParent();
+ }
+ OffsetPtr = P;
+ OffsetBasePtr = Ptr;
+ // If we also found a pointer of the right type, we're done.
+ if (P->getType() == PointerTy)
+ break;
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->isInterposable())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr).second);
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(
+ Ptr, IRB.getInt8PtrTy(PointerTy->getPointerAddressSpace()),
+ NamePrefix + "sroa_raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0
+ ? Int8Ptr
+ : IRB.CreateInBoundsGEP(IRB.getInt8Ty(), Int8Ptr,
+ IRB.getInt(Int8PtrOffset),
+ NamePrefix + "sroa_raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (cast<PointerType>(Ptr->getType()) != TargetPtrTy) {
+ Ptr = IRB.CreatePointerBitCastOrAddrSpaceCast(Ptr,
+ TargetPtrTy,
+ NamePrefix + "sroa_cast");
+ }
+
+ return Ptr;
+}
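// A simplified, standalone sketch (plain C++, hypothetical node type, not
// LLVM code) of the pointer-peeling loop above: walk back through a chain of
// constant-offset GEPs and bitcasts, folding every constant offset into one
// running total, so a single GEP from the underlying base can be emitted.
#include <cstdint>

enum class Kind { Base, ConstGEP, BitCast };

struct PtrNode {
  Kind K;
  const PtrNode *Operand = nullptr; // pointer this one was derived from
  int64_t ByteOffset = 0;           // only meaningful for ConstGEP
};

// Returns the underlying base node; accumulates the total byte offset.
// Assumes every non-Base node has a valid Operand.
const PtrNode *foldToBase(const PtrNode *Ptr, int64_t &Offset) {
  while (Ptr->K != Kind::Base) {
    if (Ptr->K == Kind::ConstGEP)
      Offset += Ptr->ByteOffset; // fold the constant index into the offset
    Ptr = Ptr->Operand;          // peel one layer (GEP or bitcast)
  }
  return Ptr;
}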
+
+/// Compute the adjusted alignment for a load or store from an offset.
+static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
+ return commonAlignment(getLoadStoreAlignment(I), Offset);
+}
+
+/// Test whether we can convert a value from the old to the new type.
+///
+/// This predicate should be used to guard calls to convertValue in order to
+/// ensure that we only try to convert viable values. The strategy is that we
+/// will peel off single element struct and array wrappings to get to an
+/// underlying value, and convert that value.
+static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
+ if (OldTy == NewTy)
+ return true;
+
+ // For integer types, we can't handle any bit-width differences. This would
+ // break both vector conversions with extension and introduce endianness
+ // issues when in conjunction with loads and stores.
+ if (isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) {
+ assert(cast<IntegerType>(OldTy)->getBitWidth() !=
+ cast<IntegerType>(NewTy)->getBitWidth() &&
+ "We can't have the same bitwidth for different int types");
+ return false;
+ }
+
+ if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
+ DL.getTypeSizeInBits(OldTy).getFixedSize())
+ return false;
+ if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
+ return false;
+
+ // We can convert pointers to integers and vice-versa. Same for vectors
+ // of pointers and integers.
+ OldTy = OldTy->getScalarType();
+ NewTy = NewTy->getScalarType();
+ if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
+ if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // Convert pointers if they are pointers from the same address space or
+ // different integral (not non-integral) address spaces with the same
+ // pointer size.
+ return OldAS == NewAS ||
+ (!DL.isNonIntegralAddressSpace(OldAS) &&
+ !DL.isNonIntegralAddressSpace(NewAS) &&
+ DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+ }
+
+ // We can convert integers to integral pointers, but not to non-integral
+ // pointers.
+ if (OldTy->isIntegerTy())
+ return !DL.isNonIntegralPointerType(NewTy);
+
+ // We can convert integral pointers to integers, but non-integral pointers
+ // need to remain pointers.
+ if (!DL.isNonIntegralPointerType(OldTy))
+ return NewTy->isIntegerTy();
+
+ return false;
+ }
+
+ return true;
+}
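// A rough sketch (plain C++, hypothetical type descriptor, not LLVM code) of
// the shape of the convertibility predicate above. It is deliberately
// simplified: vectors-of-pointers and the non-integral address-space rules
// handled via DataLayout above are omitted, and cross-address-space pointer
// conversion is disallowed here even though the real check permits it for
// integral address spaces of equal pointer size.
#include <cstdint>

struct TyDesc {
  uint64_t SizeInBits;
  bool IsInt;
  bool IsPtr;
  unsigned AddrSpace; // only meaningful when IsPtr
};

bool canConvertSketch(const TyDesc &Old, const TyDesc &New) {
  // Integer-to-integer conversions must not change the bit width.
  if (Old.IsInt && New.IsInt)
    return Old.SizeInBits == New.SizeInBits;
  // Everything else must at least preserve the size in bits.
  if (Old.SizeInBits != New.SizeInBits)
    return false;
  // Pointers convert to pointers in the same address space, or to/from
  // same-sized integers.
  if (Old.IsPtr && New.IsPtr)
    return Old.AddrSpace == New.AddrSpace;
  return true;
}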
+
+/// Generic routine to convert an SSA value to a value of a different
+/// type.
+///
+/// This will try various different casting techniques, such as bitcasts,
+/// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test
+/// two types for viability with this routine.
+static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ Type *NewTy) {
+ Type *OldTy = V->getType();
+ assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type");
+
+ if (OldTy == NewTy)
+ return V;
+
+ assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
+ "Integer types must be the exact same to convert.");
+
+ // See if we need inttoptr for this type pair. May require additional bitcast.
+ if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
+ // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
+ // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
+ // Directly handle i64 to i8*
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
+ }
+
+ // See if we need ptrtoint for this type pair. May require additional bitcast.
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
+ // Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
+ // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
+ // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
+ // Expand i8* to i64 --> i8* to i64 to i64
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
+
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // To convert pointers with different address spaces (they are already
+ // checked convertible, i.e. they have the same pointer size), so far we
+ // cannot use `bitcast` (which has restrict on the same address space) or
+ // `addrspacecast` (which is not always no-op casting). Instead, use a pair
+ // of no-op `ptrtoint`/`inttoptr` casts through an integer with the same bit
+ // size.
+ if (OldAS != NewAS) {
+ assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+ return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
+ }
+
+ return IRB.CreateBitCast(V, NewTy);
+}
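// A small decision-table sketch (plain C++, hypothetical enum, not LLVM code)
// of which cast sequence convertValue emits for each category of type pair,
// assuming canConvertValue has already accepted the pair.
#include <string>

enum class TyKind { Int, Ptr };

std::string pickCastSequence(TyKind Old, TyKind New, bool SameAddrSpace) {
  if (Old == TyKind::Int && New == TyKind::Ptr)
    return "bitcast to pointer-sized int, then inttoptr";
  if (Old == TyKind::Ptr && New == TyKind::Int)
    return "ptrtoint to pointer-sized int, then bitcast";
  if (Old == TyKind::Ptr && New == TyKind::Ptr && !SameAddrSpace)
    return "ptrtoint + inttoptr pair (same pointer size, different AS)";
  return "plain bitcast";
}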
+
+/// Test whether the given slice use can be promoted to a vector.
+///
+/// This function is called to test each entry in a partition which is slated
+/// for a single slice.
+static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
+ VectorType *Ty,
+ uint64_t ElementSize,
+ const DataLayout &DL) {
+ // First validate the slice offsets.
+ uint64_t BeginOffset =
+ std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
+ return false;
+ uint64_t EndOffset =
+ std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset ||
+ EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
+ return false;
+
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ uint64_t NumElements = EndIndex - BeginIndex;
+ Type *SliceTy = (NumElements == 1)
+ ? Ty->getElementType()
+ : FixedVectorType::get(Ty->getElementType(), NumElements);
+
+ Type *SplitIntTy =
+ Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
+
+ Use *U = S.getUse();
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile())
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
- // Disable vector promotion when there are loads or stores of an FCA.
- return false;
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- if (LI->isVolatile())
- return false;
- Type *LTy = LI->getType();
- if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
- assert(LTy->isIntegerTy());
- LTy = SplitIntTy;
- }
- if (!canConvertValue(DL, SliceTy, LTy))
- return false;
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- if (SI->isVolatile())
- return false;
- Type *STy = SI->getValueOperand()->getType();
- if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
- assert(STy->isIntegerTy());
- STy = SplitIntTy;
- }
- if (!canConvertValue(DL, STy, SliceTy))
- return false;
- } else {
- return false;
- }
-
- return true;
-}
-
-/// Test whether the given alloca partitioning and range of slices can be
-/// promoted to a vector.
-///
-/// This is a quick test to check whether we can rewrite a particular alloca
-/// partition (and its newly formed alloca) into a vector alloca with only
-/// whole-vector loads and stores such that it could be promoted to a vector
-/// SSA value. We only can ensure this for a limited set of operations, and we
-/// don't want to do the rewrites unless we are confident that the result will
-/// be promotable, so we have an early test here.
-static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
- // Collect the candidate types for vector-based promotion. Also track whether
- // we have different element types.
- SmallVector<VectorType *, 4> CandidateTys;
- Type *CommonEltTy = nullptr;
- bool HaveCommonEltTy = true;
- auto CheckCandidateType = [&](Type *Ty) {
- if (auto *VTy = dyn_cast<VectorType>(Ty)) {
- // Return if bitcast to vectors is different for total size in bits.
- if (!CandidateTys.empty()) {
- VectorType *V = CandidateTys[0];
- if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
- DL.getTypeSizeInBits(V).getFixedSize()) {
- CandidateTys.clear();
- return;
- }
- }
- CandidateTys.push_back(VTy);
- if (!CommonEltTy)
- CommonEltTy = VTy->getElementType();
- else if (CommonEltTy != VTy->getElementType())
- HaveCommonEltTy = false;
- }
- };
- // Consider any loads or stores that are the exact size of the slice.
- for (const Slice &S : P)
- if (S.beginOffset() == P.beginOffset() &&
- S.endOffset() == P.endOffset()) {
- if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
- CheckCandidateType(LI->getType());
- else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
- CheckCandidateType(SI->getValueOperand()->getType());
- }
-
- // If we didn't find a vector type, nothing to do here.
- if (CandidateTys.empty())
- return nullptr;
-
- // Remove non-integer vector types if we had multiple common element types.
- // FIXME: It'd be nice to replace them with integer vector types, but we can't
- // do that until all the backends are known to produce good code for all
- // integer vector types.
- if (!HaveCommonEltTy) {
+ return false;
+ } else if (U->get()->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ Type *LTy = LI->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(LTy->isIntegerTy());
+ LTy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, SliceTy, LTy))
+ return false;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ if (SI->isVolatile())
+ return false;
+ Type *STy = SI->getValueOperand()->getType();
+ if (P.beginOffset() > S.beginOffset() || P.endOffset() < S.endOffset()) {
+ assert(STy->isIntegerTy());
+ STy = SplitIntTy;
+ }
+ if (!canConvertValue(DL, STy, SliceTy))
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
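// A standalone sketch (plain C++, hypothetical helper, not LLVM code) of the
// index computation above: a slice is only viable for vector promotion if its
// byte range lines up exactly with whole vector elements.
#include <cstdint>
#include <optional>
#include <utility>

// Returns {BeginIndex, EndIndex} when [BeginOffset, EndOffset) covers a whole
// number of elements of size ElementSize inside a vector of NumElements, and
// std::nullopt otherwise.
std::optional<std::pair<uint64_t, uint64_t>>
sliceToElementRange(uint64_t BeginOffset, uint64_t EndOffset,
                    uint64_t ElementSize, uint64_t NumElements) {
  uint64_t BeginIndex = BeginOffset / ElementSize;
  if (BeginIndex * ElementSize != BeginOffset || BeginIndex >= NumElements)
    return std::nullopt;
  uint64_t EndIndex = EndOffset / ElementSize;
  if (EndIndex * ElementSize != EndOffset || EndIndex > NumElements)
    return std::nullopt;
  return std::make_pair(BeginIndex, EndIndex);
}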
+
+/// Test whether the given alloca partitioning and range of slices can be
+/// promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
+static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
+ // Collect the candidate types for vector-based promotion. Also track whether
+ // we have different element types.
+ SmallVector<VectorType *, 4> CandidateTys;
+ Type *CommonEltTy = nullptr;
+ bool HaveCommonEltTy = true;
+ auto CheckCandidateType = [&](Type *Ty) {
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ // Return if bitcast to vectors is different for total size in bits.
+ if (!CandidateTys.empty()) {
+ VectorType *V = CandidateTys[0];
+ if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
+ DL.getTypeSizeInBits(V).getFixedSize()) {
+ CandidateTys.clear();
+ return;
+ }
+ }
+ CandidateTys.push_back(VTy);
+ if (!CommonEltTy)
+ CommonEltTy = VTy->getElementType();
+ else if (CommonEltTy != VTy->getElementType())
+ HaveCommonEltTy = false;
+ }
+ };
+ // Consider any loads or stores that are the exact size of the slice.
+ for (const Slice &S : P)
+ if (S.beginOffset() == P.beginOffset() &&
+ S.endOffset() == P.endOffset()) {
+ if (auto *LI = dyn_cast<LoadInst>(S.getUse()->getUser()))
+ CheckCandidateType(LI->getType());
+ else if (auto *SI = dyn_cast<StoreInst>(S.getUse()->getUser()))
+ CheckCandidateType(SI->getValueOperand()->getType());
+ }
+
+ // If we didn't find a vector type, nothing to do here.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Remove non-integer vector types if we had multiple common element types.
+ // FIXME: It'd be nice to replace them with integer vector types, but we can't
+ // do that until all the backends are known to produce good code for all
+ // integer vector types.
+ if (!HaveCommonEltTy) {
llvm::erase_if(CandidateTys, [](VectorType *VTy) {
return !VTy->getElementType()->isIntegerTy();
});
-
- // If there were no integer vector types, give up.
- if (CandidateTys.empty())
- return nullptr;
-
- // Rank the remaining candidate vector types. This is easy because we know
- // they're all integer vectors. We sort by ascending number of elements.
- auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
- (void)DL;
- assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
- DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
- "Cannot have vector types of different sizes!");
- assert(RHSTy->getElementType()->isIntegerTy() &&
- "All non-integer types eliminated!");
- assert(LHSTy->getElementType()->isIntegerTy() &&
- "All non-integer types eliminated!");
- return cast<FixedVectorType>(RHSTy)->getNumElements() <
- cast<FixedVectorType>(LHSTy)->getNumElements();
- };
- llvm::sort(CandidateTys, RankVectorTypes);
- CandidateTys.erase(
- std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
- CandidateTys.end());
- } else {
-// The only way to have the same element type in every vector type is to
-// have the same vector type. Check that and remove all but one.
-#ifndef NDEBUG
- for (VectorType *VTy : CandidateTys) {
- assert(VTy->getElementType() == CommonEltTy &&
- "Unaccounted for element type!");
- assert(VTy == CandidateTys[0] &&
- "Different vector types with the same element type!");
- }
-#endif
- CandidateTys.resize(1);
- }
-
- // Try each vector type, and return the one which works.
- auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
- uint64_t ElementSize =
- DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
-
- // While the definition of LLVM vectors is bitpacked, we don't support sizes
- // that aren't byte sized.
- if (ElementSize % 8)
- return false;
- assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
- "vector size not a multiple of element size?");
- ElementSize /= 8;
-
- for (const Slice &S : P)
- if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
- return false;
-
- for (const Slice *S : P.splitSliceTails())
- if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
- return false;
-
- return true;
- };
- for (VectorType *VTy : CandidateTys)
- if (CheckVectorTypeForPromotion(VTy))
- return VTy;
-
- return nullptr;
-}
-
-/// Test whether a slice of an alloca is valid for integer widening.
-///
-/// This implements the necessary checking for the \c isIntegerWideningViable
-/// test below on a single slice of the alloca.
-static bool isIntegerWideningViableForSlice(const Slice &S,
- uint64_t AllocBeginOffset,
- Type *AllocaTy,
- const DataLayout &DL,
- bool &WholeAllocaOp) {
- uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
-
- uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
- uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
-
- // We can't reasonably handle cases where the load or store extends past
- // the end of the alloca's type and into its padding.
- if (RelEnd > Size)
- return false;
-
- Use *U = S.getUse();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
- if (LI->isVolatile())
- return false;
- // We can't handle loads that extend past the allocated memory.
- if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
- return false;
- // So far, AllocaSliceRewriter does not support widening split slice tails
- // in rewriteIntegerLoad.
- if (S.beginOffset() < AllocBeginOffset)
- return false;
- // Note that we don't count vector loads or stores as whole-alloca
- // operations which enable integer widening because we would prefer to use
- // vector widening instead.
- if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
- WholeAllocaOp = true;
- if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
- return false;
- } else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, AllocaTy, LI->getType())) {
- // Non-integer loads need to be convertible from the alloca type so that
- // they are promotable.
- return false;
- }
- } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
- Type *ValueTy = SI->getValueOperand()->getType();
- if (SI->isVolatile())
- return false;
- // We can't handle stores that extend past the allocated memory.
- if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
- return false;
- // So far, AllocaSliceRewriter does not support widening split slice tails
- // in rewriteIntegerStore.
- if (S.beginOffset() < AllocBeginOffset)
- return false;
- // Note that we don't count vector loads or stores as whole-alloca
- // operations which enable integer widening because we would prefer to use
- // vector widening instead.
- if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
- WholeAllocaOp = true;
- if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
- return false;
- } else if (RelBegin != 0 || RelEnd != Size ||
- !canConvertValue(DL, ValueTy, AllocaTy)) {
- // Non-integer stores need to be convertible to the alloca type so that
- // they are promotable.
- return false;
- }
- } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
- if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
- return false;
- if (!S.isSplittable())
- return false; // Skip any unsplittable intrinsics.
- } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
+
+ // If there were no integer vector types, give up.
+ if (CandidateTys.empty())
+ return nullptr;
+
+ // Rank the remaining candidate vector types. This is easy because we know
+ // they're all integer vectors. We sort by ascending number of elements.
+ auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
+ (void)DL;
+ assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
+ DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
+ "Cannot have vector types of different sizes!");
+ assert(RHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ assert(LHSTy->getElementType()->isIntegerTy() &&
+ "All non-integer types eliminated!");
+ return cast<FixedVectorType>(RHSTy)->getNumElements() <
+ cast<FixedVectorType>(LHSTy)->getNumElements();
+ };
+ llvm::sort(CandidateTys, RankVectorTypes);
+ CandidateTys.erase(
+ std::unique(CandidateTys.begin(), CandidateTys.end(), RankVectorTypes),
+ CandidateTys.end());
+ } else {
+// The only way to have the same element type in every vector type is to
+// have the same vector type. Check that and remove all but one.
+#ifndef NDEBUG
+ for (VectorType *VTy : CandidateTys) {
+ assert(VTy->getElementType() == CommonEltTy &&
+ "Unaccounted for element type!");
+ assert(VTy == CandidateTys[0] &&
+ "Different vector types with the same element type!");
+ }
+#endif
+ CandidateTys.resize(1);
+ }
+
+ // Try each vector type, and return the one which works.
+ auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
+ uint64_t ElementSize =
+ DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
+ "vector size not a multiple of element size?");
+ ElementSize /= 8;
+
+ for (const Slice &S : P)
+ if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL))
+ return false;
+
+ return true;
+ };
+ for (VectorType *VTy : CandidateTys)
+ if (CheckVectorTypeForPromotion(VTy))
+ return VTy;
+
+ return nullptr;
+}
+
+/// Test whether a slice of an alloca is valid for integer widening.
+///
+/// This implements the necessary checking for the \c isIntegerWideningViable
+/// test below on a single slice of the alloca.
+static bool isIntegerWideningViableForSlice(const Slice &S,
+ uint64_t AllocBeginOffset,
+ Type *AllocaTy,
+ const DataLayout &DL,
+ bool &WholeAllocaOp) {
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
+
+ uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
+ uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
+
+ // We can't reasonably handle cases where the load or store extends past
+ // the end of the alloca's type and into its padding.
+ if (RelEnd > Size)
+ return false;
+
+ Use *U = S.getUse();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) {
+ if (LI->isVolatile())
+ return false;
+ // We can't handle loads that extend past the allocated memory.
+ if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
+ return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerLoad.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, AllocaTy, LI->getType())) {
+ // Non-integer loads need to be convertible from the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) {
+ Type *ValueTy = SI->getValueOperand()->getType();
+ if (SI->isVolatile())
+ return false;
+ // We can't handle stores that extend past the allocated memory.
+ if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
+ return false;
+ // So far, AllocaSliceRewriter does not support widening split slice tails
+ // in rewriteIntegerStore.
+ if (S.beginOffset() < AllocBeginOffset)
+ return false;
+ // Note that we don't count vector loads or stores as whole-alloca
+ // operations which enable integer widening because we would prefer to use
+ // vector widening instead.
+ if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
+ WholeAllocaOp = true;
+ if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
+ return false;
+ } else if (RelBegin != 0 || RelEnd != Size ||
+ !canConvertValue(DL, ValueTy, AllocaTy)) {
+ // Non-integer stores need to be convertible to the alloca type so that
+ // they are promotable.
+ return false;
+ }
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) {
+ if (MI->isVolatile() || !isa<Constant>(MI->getLength()))
+ return false;
+ if (!S.isSplittable())
+ return false; // Skip any unsplittable intrinsics.
+ } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else {
- return false;
- }
-
- return true;
-}
-
-/// Test whether the given alloca partition's integer operations can be
-/// widened to promotable ones.
-///
-/// This is a quick test to check whether we can rewrite the integer loads and
-/// stores to a particular alloca into wider loads and stores and be able to
-/// promote the resulting alloca.
-static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
- const DataLayout &DL) {
- uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
- // Don't create integer types larger than the maximum bitwidth.
- if (SizeInBits > IntegerType::MAX_INT_BITS)
- return false;
-
- // Don't try to handle allocas with bit-padding.
- if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
- return false;
-
- // We need to ensure that an integer type with the appropriate bitwidth can
- // be converted to the alloca type, whatever that is. We don't want to force
- // the alloca itself to have an integer type if there is a more suitable one.
- Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
- if (!canConvertValue(DL, AllocaTy, IntTy) ||
- !canConvertValue(DL, IntTy, AllocaTy))
- return false;
-
- // While examining uses, we ensure that the alloca has a covering load or
- // store. We don't want to widen the integer operations only to fail to
- // promote due to some other unsplittable entry (which we may make splittable
- // later). However, if there are only splittable uses, go ahead and assume
- // that we cover the alloca.
- // FIXME: We shouldn't consider split slices that happen to start in the
- // partition here...
+ return false;
+ } else {
+ return false;
+ }
+
+ return true;
+}
+
+/// Test whether the given alloca partition's integer operations can be
+/// widened to promotable ones.
+///
+/// This is a quick test to check whether we can rewrite the integer loads and
+/// stores to a particular alloca into wider loads and stores and be able to
+/// promote the resulting alloca.
+static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
+ const DataLayout &DL) {
+ uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
+ // Don't create integer types larger than the maximum bitwidth.
+ if (SizeInBits > IntegerType::MAX_INT_BITS)
+ return false;
+
+ // Don't try to handle allocas with bit-padding.
+ if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
+ return false;
+
+ // We need to ensure that an integer type with the appropriate bitwidth can
+ // be converted to the alloca type, whatever that is. We don't want to force
+ // the alloca itself to have an integer type if there is a more suitable one.
+ Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits);
+ if (!canConvertValue(DL, AllocaTy, IntTy) ||
+ !canConvertValue(DL, IntTy, AllocaTy))
+ return false;
+
+ // While examining uses, we ensure that the alloca has a covering load or
+ // store. We don't want to widen the integer operations only to fail to
+ // promote due to some other unsplittable entry (which we may make splittable
+ // later). However, if there are only splittable uses, go ahead and assume
+ // that we cover the alloca.
+ // FIXME: We shouldn't consider split slices that happen to start in the
+ // partition here...
bool WholeAllocaOp = P.empty() && DL.isLegalInteger(SizeInBits);
-
- for (const Slice &S : P)
- if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
- WholeAllocaOp))
- return false;
-
- for (const Slice *S : P.splitSliceTails())
- if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
- WholeAllocaOp))
- return false;
-
- return WholeAllocaOp;
-}
-
-static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
- IntegerType *Ty, uint64_t Offset,
- const Twine &Name) {
- LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
- IntegerType *IntTy = cast<IntegerType>(V->getType());
- assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
- DL.getTypeStoreSize(IntTy).getFixedSize() &&
- "Element extends past full value");
- uint64_t ShAmt = 8 * Offset;
- if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
- DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
- if (ShAmt) {
- V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
- LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
- }
- assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
- "Cannot extract to a larger integer!");
- if (Ty != IntTy) {
- V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
- LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
- }
- return V;
-}
-
-static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
- Value *V, uint64_t Offset, const Twine &Name) {
- IntegerType *IntTy = cast<IntegerType>(Old->getType());
- IntegerType *Ty = cast<IntegerType>(V->getType());
- assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
- "Cannot insert a larger integer!");
- LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
- if (Ty != IntTy) {
- V = IRB.CreateZExt(V, IntTy, Name + ".ext");
- LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
- }
- assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
- DL.getTypeStoreSize(IntTy).getFixedSize() &&
- "Element store outside of alloca store");
- uint64_t ShAmt = 8 * Offset;
- if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
- DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
- if (ShAmt) {
- V = IRB.CreateShl(V, ShAmt, Name + ".shift");
- LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
- }
-
- if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
- APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
- Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
- LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
- V = IRB.CreateOr(Old, V, Name + ".insert");
- LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
- }
- return V;
-}
-
-static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
- unsigned EndIndex, const Twine &Name) {
- auto *VecTy = cast<FixedVectorType>(V->getType());
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
-
- if (NumElements == VecTy->getNumElements())
- return V;
-
- if (NumElements == 1) {
- V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
- Name + ".extract");
- LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
- return V;
- }
-
- SmallVector<int, 8> Mask;
- Mask.reserve(NumElements);
- for (unsigned i = BeginIndex; i != EndIndex; ++i)
- Mask.push_back(i);
+
+ for (const Slice &S : P)
+ if (!isIntegerWideningViableForSlice(S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ for (const Slice *S : P.splitSliceTails())
+ if (!isIntegerWideningViableForSlice(*S, P.beginOffset(), AllocaTy, DL,
+ WholeAllocaOp))
+ return false;
+
+ return WholeAllocaOp;
+}
+
+static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
+ IntegerType *Ty, uint64_t Offset,
+ const Twine &Name) {
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
+ IntegerType *IntTy = cast<IntegerType>(V->getType());
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
+ if (ShAmt) {
+ V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (Ty != IntTy) {
+ V = IRB.CreateTrunc(V, Ty, Name + ".trunc");
+ LLVM_DEBUG(dbgs() << " trunced: " << *V << "\n");
+ }
+ return V;
+}
+
+static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
+ Value *V, uint64_t Offset, const Twine &Name) {
+ IntegerType *IntTy = cast<IntegerType>(Old->getType());
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ assert(Ty->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
+ if (Ty != IntTy) {
+ V = IRB.CreateZExt(V, IntTy, Name + ".ext");
+ LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
+ }
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
+ "Element store outside of alloca store");
+ uint64_t ShAmt = 8 * Offset;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
+ if (ShAmt) {
+ V = IRB.CreateShl(V, ShAmt, Name + ".shift");
+ LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
+ }
+
+ if (ShAmt || Ty->getBitWidth() < IntTy->getBitWidth()) {
+ APInt Mask = ~Ty->getMask().zext(IntTy->getBitWidth()).shl(ShAmt);
+ Old = IRB.CreateAnd(Old, Mask, Name + ".mask");
+ LLVM_DEBUG(dbgs() << " masked: " << *Old << "\n");
+ V = IRB.CreateOr(Old, V, Name + ".insert");
+ LLVM_DEBUG(dbgs() << " inserted: " << *V << "\n");
+ }
+ return V;
+}
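// A minimal sketch (plain C++ on uint64_t, not LLVM IR) of the shift-and-mask
// arithmetic behind extractInteger/insertInteger above, including the
// big-endian shift-amount computation. Sizes are in bytes; it assumes
// WideBytes <= 8 and Offset + PartBytes <= WideBytes.
#include <cstdint>

static uint64_t lowBytesMask(unsigned Bytes) {
  return Bytes >= 8 ? ~0ULL : ((1ULL << (8 * Bytes)) - 1);
}

uint64_t extractBytes(uint64_t Wide, unsigned WideBytes, unsigned PartBytes,
                      unsigned Offset, bool BigEndian) {
  unsigned ShAmt = 8 * (BigEndian ? WideBytes - PartBytes - Offset : Offset);
  return (Wide >> ShAmt) & lowBytesMask(PartBytes); // shift down, truncate
}

uint64_t insertBytes(uint64_t Wide, unsigned WideBytes, uint64_t Part,
                     unsigned PartBytes, unsigned Offset, bool BigEndian) {
  unsigned ShAmt = 8 * (BigEndian ? WideBytes - PartBytes - Offset : Offset);
  uint64_t Mask = lowBytesMask(PartBytes);
  // Clear the destination bytes, then OR in the shifted new value.
  return (Wide & ~(Mask << ShAmt)) | ((Part & Mask) << ShAmt);
}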
+
+static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
+ unsigned EndIndex, const Twine &Name) {
+ auto *VecTy = cast<FixedVectorType>(V->getType());
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+
+ if (NumElements == VecTy->getNumElements())
+ return V;
+
+ if (NumElements == 1) {
+ V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex),
+ Name + ".extract");
+ LLVM_DEBUG(dbgs() << " extract: " << *V << "\n");
+ return V;
+ }
+
+ SmallVector<int, 8> Mask;
+ Mask.reserve(NumElements);
+ for (unsigned i = BeginIndex; i != EndIndex; ++i)
+ Mask.push_back(i);
V = IRB.CreateShuffleVector(V, Mask, Name + ".extract");
- LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
- return V;
-}
-
-static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
- unsigned BeginIndex, const Twine &Name) {
- VectorType *VecTy = cast<VectorType>(Old->getType());
- assert(VecTy && "Can only insert a vector into a vector");
-
- VectorType *Ty = dyn_cast<VectorType>(V->getType());
- if (!Ty) {
- // Single element to insert.
- V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
- Name + ".insert");
- LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
- return V;
- }
-
- assert(cast<FixedVectorType>(Ty)->getNumElements() <=
- cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
- if (cast<FixedVectorType>(Ty)->getNumElements() ==
- cast<FixedVectorType>(VecTy)->getNumElements()) {
- assert(V->getType() == VecTy && "Vector type mismatch");
- return V;
- }
- unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
-
- // When inserting a smaller vector into the larger to store, we first
- // use a shuffle vector to widen it with undef elements, and then
- // a second shuffle vector to select between the loaded vector and the
- // incoming vector.
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
+ return V;
+}
+
+static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
+ unsigned BeginIndex, const Twine &Name) {
+ VectorType *VecTy = cast<VectorType>(Old->getType());
+ assert(VecTy && "Can only insert a vector into a vector");
+
+ VectorType *Ty = dyn_cast<VectorType>(V->getType());
+ if (!Ty) {
+ // Single element to insert.
+ V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex),
+ Name + ".insert");
+ LLVM_DEBUG(dbgs() << " insert: " << *V << "\n");
+ return V;
+ }
+
+ assert(cast<FixedVectorType>(Ty)->getNumElements() <=
+ cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+ if (cast<FixedVectorType>(Ty)->getNumElements() ==
+ cast<FixedVectorType>(VecTy)->getNumElements()) {
+ assert(V->getType() == VecTy && "Vector type mismatch");
+ return V;
+ }
+ unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
+
+ // When inserting a smaller vector into the larger to store, we first
+ // use a shuffle vector to widen it with undef elements, and then
+ // a second shuffle vector to select between the loaded vector and the
+ // incoming vector.
SmallVector<int, 8> Mask;
- Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
- for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
- if (i >= BeginIndex && i < EndIndex)
+ Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
+ if (i >= BeginIndex && i < EndIndex)
Mask.push_back(i - BeginIndex);
- else
+ else
Mask.push_back(-1);
V = IRB.CreateShuffleVector(V, Mask, Name + ".expand");
- LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
-
+ LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
+
SmallVector<Constant *, 8> Mask2;
Mask2.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
- for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
Mask2.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
-
+
V = IRB.CreateSelect(ConstantVector::get(Mask2), V, Old, Name + "blend");
-
- LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
- return V;
-}
-
-/// Visitor to rewrite instructions using a particular slice of an alloca
-/// to use a new alloca.
-///
-/// Also implements the rewriting to vector-based accesses when the partition
-/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
-/// lives here.
-class llvm::sroa::AllocaSliceRewriter
- : public InstVisitor<AllocaSliceRewriter, bool> {
- // Befriend the base class so it can delegate to private visit methods.
- friend class InstVisitor<AllocaSliceRewriter, bool>;
-
- using Base = InstVisitor<AllocaSliceRewriter, bool>;
-
- const DataLayout &DL;
- AllocaSlices &AS;
- SROA &Pass;
- AllocaInst &OldAI, &NewAI;
- const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
- Type *NewAllocaTy;
-
- // This is a convenience and flag variable that will be null unless the new
- // alloca's integer operations should be widened to this integer type due to
- // passing isIntegerWideningViable above. If it is non-null, the desired
- // integer type will be stored here for easy access during rewriting.
- IntegerType *IntTy;
-
- // If we are rewriting an alloca partition which can be written as pure
- // vector operations, we stash extra information here. When VecTy is
- // non-null, we have some strict guarantees about the rewritten alloca:
- // - The new alloca is exactly the size of the vector type here.
- // - The accesses all either map to the entire vector or to a single
- // element.
- // - The set of accessing instructions is only one of those handled above
- // in isVectorPromotionViable. Generally these are the same access kinds
- // which are promotable via mem2reg.
- VectorType *VecTy;
- Type *ElementTy;
- uint64_t ElementSize;
-
- // The original offset of the slice currently being rewritten relative to
- // the original alloca.
- uint64_t BeginOffset = 0;
- uint64_t EndOffset = 0;
-
- // The new offsets of the slice currently being rewritten relative to the
- // original alloca.
- uint64_t NewBeginOffset = 0, NewEndOffset = 0;
-
- uint64_t SliceSize = 0;
- bool IsSplittable = false;
- bool IsSplit = false;
- Use *OldUse = nullptr;
- Instruction *OldPtr = nullptr;
-
- // Track post-rewrite users which are PHI nodes and Selects.
- SmallSetVector<PHINode *, 8> &PHIUsers;
- SmallSetVector<SelectInst *, 8> &SelectUsers;
-
- // Utility IR builder, whose name prefix is setup for each visited use, and
- // the insertion point is set to point to the user.
- IRBuilderTy IRB;
-
-public:
- AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
- AllocaInst &OldAI, AllocaInst &NewAI,
- uint64_t NewAllocaBeginOffset,
- uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
- VectorType *PromotableVecTy,
- SmallSetVector<PHINode *, 8> &PHIUsers,
- SmallSetVector<SelectInst *, 8> &SelectUsers)
- : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
- NewAllocaBeginOffset(NewAllocaBeginOffset),
- NewAllocaEndOffset(NewAllocaEndOffset),
- NewAllocaTy(NewAI.getAllocatedType()),
- IntTy(
- IsIntegerPromotable
- ? Type::getIntNTy(NewAI.getContext(),
- DL.getTypeSizeInBits(NewAI.getAllocatedType())
- .getFixedSize())
- : nullptr),
- VecTy(PromotableVecTy),
- ElementTy(VecTy ? VecTy->getElementType() : nullptr),
- ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
- : 0),
- PHIUsers(PHIUsers), SelectUsers(SelectUsers),
- IRB(NewAI.getContext(), ConstantFolder()) {
- if (VecTy) {
- assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
- "Only multiple-of-8 sized vector elements are viable");
- ++NumVectorized;
- }
- assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
- }
-
- bool visit(AllocaSlices::const_iterator I) {
- bool CanSROA = true;
- BeginOffset = I->beginOffset();
- EndOffset = I->endOffset();
- IsSplittable = I->isSplittable();
- IsSplit =
- BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
- LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
- LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
- LLVM_DEBUG(dbgs() << "\n");
-
- // Compute the intersecting offset range.
- assert(BeginOffset < NewAllocaEndOffset);
- assert(EndOffset > NewAllocaBeginOffset);
- NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
- NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
-
- SliceSize = NewEndOffset - NewBeginOffset;
-
- OldUse = I->getUse();
- OldPtr = cast<Instruction>(OldUse->get());
-
- Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
- IRB.SetInsertPoint(OldUserI);
- IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
- IRB.getInserter().SetNamePrefix(
- Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
-
- CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
- if (VecTy || IntTy)
- assert(CanSROA);
- return CanSROA;
- }
-
-private:
- // Make sure the other visit overloads are visible.
- using Base::visit;
-
- // Every instruction which can end up as a user must have a rewrite rule.
- bool visitInstruction(Instruction &I) {
- LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
- llvm_unreachable("No rewrite rule for this instruction!");
- }
-
- Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
- // Note that the offset computation can use BeginOffset or NewBeginOffset
- // interchangeably for unsplit slices.
- assert(IsSplit || BeginOffset == NewBeginOffset);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
-
-#ifndef NDEBUG
- StringRef OldName = OldPtr->getName();
- // Skip through the last '.sroa.' component of the name.
- size_t LastSROAPrefix = OldName.rfind(".sroa.");
- if (LastSROAPrefix != StringRef::npos) {
- OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
- // Look for an SROA slice index.
- size_t IndexEnd = OldName.find_first_not_of("0123456789");
- if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
- // Strip the index and look for the offset.
- OldName = OldName.substr(IndexEnd + 1);
- size_t OffsetEnd = OldName.find_first_not_of("0123456789");
- if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
- // Strip the offset.
- OldName = OldName.substr(OffsetEnd + 1);
- }
- }
- // Strip any SROA suffixes as well.
- OldName = OldName.substr(0, OldName.find(".sroa_"));
-#endif
-
- return getAdjustedPtr(IRB, DL, &NewAI,
- APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
- PointerTy,
-#ifndef NDEBUG
- Twine(OldName) + "."
-#else
- Twine()
-#endif
- );
- }
-
-  /// Compute a suitable alignment for accessing this slice of the *new*
-  /// alloca, based on the new alloca's alignment and the slice's byte offset
-  /// into it.
- Align getSliceAlign() {
- return commonAlignment(NewAI.getAlign(),
- NewBeginOffset - NewAllocaBeginOffset);
- }
-
- unsigned getIndex(uint64_t Offset) {
- assert(VecTy && "Can only call getIndex when rewriting a vector");
- uint64_t RelOffset = Offset - NewAllocaBeginOffset;
- assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
- uint32_t Index = RelOffset / ElementSize;
- assert(Index * ElementSize == RelOffset);
- return Index;
- }
-
- void deleteIfTriviallyDead(Value *V) {
- Instruction *I = cast<Instruction>(V);
- if (isInstructionTriviallyDead(I))
+
+ LLVM_DEBUG(dbgs() << " blend: " << *V << "\n");
+ return V;
+}
+
+/// Visitor to rewrite instructions using a particular slice of an alloca
+/// to use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class llvm::sroa::AllocaSliceRewriter
+ : public InstVisitor<AllocaSliceRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AllocaSliceRewriter, bool>;
+
+ using Base = InstVisitor<AllocaSliceRewriter, bool>;
+
+ const DataLayout &DL;
+ AllocaSlices &AS;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+ Type *NewAllocaTy;
+
+ // This is a convenience and flag variable that will be null unless the new
+ // alloca's integer operations should be widened to this integer type due to
+ // passing isIntegerWideningViable above. If it is non-null, the desired
+ // integer type will be stored here for easy access during rewriting.
+ IntegerType *IntTy;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+ // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+ // - The set of accessing instructions is only one of those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+ // The original offset of the slice currently being rewritten relative to
+ // the original alloca.
+ uint64_t BeginOffset = 0;
+ uint64_t EndOffset = 0;
+
+ // The new offsets of the slice currently being rewritten relative to the
+ // original alloca.
+ uint64_t NewBeginOffset = 0, NewEndOffset = 0;
+
+ uint64_t SliceSize = 0;
+ bool IsSplittable = false;
+ bool IsSplit = false;
+ Use *OldUse = nullptr;
+ Instruction *OldPtr = nullptr;
+
+ // Track post-rewrite users which are PHI nodes and Selects.
+ SmallSetVector<PHINode *, 8> &PHIUsers;
+ SmallSetVector<SelectInst *, 8> &SelectUsers;
+
+  // Utility IR builder, whose name prefix is set up for each visited use, and
+ // the insertion point is set to point to the user.
+ IRBuilderTy IRB;
+
+public:
+ AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROA &Pass,
+ AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewAllocaBeginOffset,
+ uint64_t NewAllocaEndOffset, bool IsIntegerPromotable,
+ VectorType *PromotableVecTy,
+ SmallSetVector<PHINode *, 8> &PHIUsers,
+ SmallSetVector<SelectInst *, 8> &SelectUsers)
+ : DL(DL), AS(AS), Pass(Pass), OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewAllocaBeginOffset),
+ NewAllocaEndOffset(NewAllocaEndOffset),
+ NewAllocaTy(NewAI.getAllocatedType()),
+ IntTy(
+ IsIntegerPromotable
+ ? Type::getIntNTy(NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType())
+ .getFixedSize())
+ : nullptr),
+ VecTy(PromotableVecTy),
+ ElementTy(VecTy ? VecTy->getElementType() : nullptr),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
+ : 0),
+ PHIUsers(PHIUsers), SelectUsers(SelectUsers),
+ IRB(NewAI.getContext(), ConstantFolder()) {
+ if (VecTy) {
+ assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ++NumVectorized;
+ }
+ assert((!IntTy && !VecTy) || (IntTy && !VecTy) || (!IntTy && VecTy));
+ }
+
+ bool visit(AllocaSlices::const_iterator I) {
+ bool CanSROA = true;
+ BeginOffset = I->beginOffset();
+ EndOffset = I->endOffset();
+ IsSplittable = I->isSplittable();
+ IsSplit =
+ BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
+ LLVM_DEBUG(dbgs() << " rewriting " << (IsSplit ? "split " : ""));
+ LLVM_DEBUG(AS.printSlice(dbgs(), I, ""));
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Compute the intersecting offset range.
+ assert(BeginOffset < NewAllocaEndOffset);
+ assert(EndOffset > NewAllocaBeginOffset);
+ NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset);
+ NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);
+
+ SliceSize = NewEndOffset - NewBeginOffset;
+
+ OldUse = I->getUse();
+ OldPtr = cast<Instruction>(OldUse->get());
+
+ Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
+ IRB.SetInsertPoint(OldUserI);
+ IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
+ IRB.getInserter().SetNamePrefix(
+ Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+
+ CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
+ if (VecTy || IntTy)
+ assert(CanSROA);
+ return CanSROA;
+ }
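
The offset bookkeeping in visit() above reduces to clamping the slice's range into the new partition. A minimal standalone C++ sketch with invented offsets (illustration only, not part of the diff) reproduces the same arithmetic:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical partition [32, 48) of the original alloca and a slice
      // [40, 64) that extends past the partition's end.
      uint64_t NewAllocaBeginOffset = 32, NewAllocaEndOffset = 48;
      uint64_t BeginOffset = 40, EndOffset = 64;

      bool IsSplit =
          BeginOffset < NewAllocaBeginOffset || EndOffset > NewAllocaEndOffset;
      uint64_t NewBeginOffset = std::max(BeginOffset, NewAllocaBeginOffset); // 40
      uint64_t NewEndOffset = std::min(EndOffset, NewAllocaEndOffset);       // 48
      uint64_t SliceSize = NewEndOffset - NewBeginOffset;                    // 8

      assert(IsSplit && SliceSize == 8);
      return 0;
    }
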
+
+private:
+ // Make sure the other visit overloads are visible.
+ using Base::visit;
+
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ LLVM_DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Value *getNewAllocaSlicePtr(IRBuilderTy &IRB, Type *PointerTy) {
+ // Note that the offset computation can use BeginOffset or NewBeginOffset
+ // interchangeably for unsplit slices.
+ assert(IsSplit || BeginOffset == NewBeginOffset);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+
+#ifndef NDEBUG
+ StringRef OldName = OldPtr->getName();
+ // Skip through the last '.sroa.' component of the name.
+ size_t LastSROAPrefix = OldName.rfind(".sroa.");
+ if (LastSROAPrefix != StringRef::npos) {
+ OldName = OldName.substr(LastSROAPrefix + strlen(".sroa."));
+ // Look for an SROA slice index.
+ size_t IndexEnd = OldName.find_first_not_of("0123456789");
+ if (IndexEnd != StringRef::npos && OldName[IndexEnd] == '.') {
+ // Strip the index and look for the offset.
+ OldName = OldName.substr(IndexEnd + 1);
+ size_t OffsetEnd = OldName.find_first_not_of("0123456789");
+ if (OffsetEnd != StringRef::npos && OldName[OffsetEnd] == '.')
+ // Strip the offset.
+ OldName = OldName.substr(OffsetEnd + 1);
+ }
+ }
+ // Strip any SROA suffixes as well.
+ OldName = OldName.substr(0, OldName.find(".sroa_"));
+#endif
+
+ return getAdjustedPtr(IRB, DL, &NewAI,
+ APInt(DL.getIndexTypeSizeInBits(PointerTy), Offset),
+ PointerTy,
+#ifndef NDEBUG
+ Twine(OldName) + "."
+#else
+ Twine()
+#endif
+ );
+ }
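
The NDEBUG-only stripping above only shapes debug value names. A rough std::string model (hypothetical input name, not the real StringRef code) shows how a name such as "x.sroa.0.16.copyload" collapses to "copyload" before the rewritten pointer is renamed:

    #include <cassert>
    #include <string>

    // Approximate model of the debug-name stripping performed above.
    static std::string stripSROAName(std::string Name) {
      size_t LastSROAPrefix = Name.rfind(".sroa.");
      if (LastSROAPrefix != std::string::npos) {
        Name = Name.substr(LastSROAPrefix + 6); // skip ".sroa."
        size_t IndexEnd = Name.find_first_not_of("0123456789");
        if (IndexEnd != std::string::npos && Name[IndexEnd] == '.') {
          Name = Name.substr(IndexEnd + 1);     // drop the slice index
          size_t OffsetEnd = Name.find_first_not_of("0123456789");
          if (OffsetEnd != std::string::npos && Name[OffsetEnd] == '.')
            Name = Name.substr(OffsetEnd + 1);  // drop the offset
        }
      }
      return Name.substr(0, Name.find(".sroa_")); // drop trailing SROA suffixes
    }

    int main() {
      assert(stripSROAName("x.sroa.0.16.copyload") == "copyload");
      assert(stripSROAName("plain_name") == "plain_name");
      return 0;
    }
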
+
+  /// Compute a suitable alignment for accessing this slice of the *new*
+  /// alloca, based on the new alloca's alignment and the slice's byte offset
+  /// into it.
+ Align getSliceAlign() {
+ return commonAlignment(NewAI.getAlign(),
+ NewBeginOffset - NewAllocaBeginOffset);
+ }
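
commonAlignment here acts as a greatest common power of two: the usable alignment is the largest power of two dividing both the new alloca's alignment and the slice's byte offset into it. A small arithmetic sketch (plain integers with sample values, not the LLVM Align type):

    #include <cassert>
    #include <cstdint>

    // Largest power of two dividing both A (itself a power of two) and Offset.
    // Offset == 0 keeps the full alignment A.
    static uint64_t commonAlign(uint64_t A, uint64_t Offset) {
      if (Offset == 0)
        return A;
      uint64_t LowBit = Offset & (~Offset + 1); // lowest set bit of Offset
      return A < LowBit ? A : LowBit;
    }

    int main() {
      assert(commonAlign(16, 0) == 16); // slice at the start keeps align 16
      assert(commonAlign(16, 4) == 4);  // offset 4 limits the access to align 4
      assert(commonAlign(8, 48) == 8);  // 48 is 16-aligned, so the alloca's 8 wins
      return 0;
    }
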
+
+ unsigned getIndex(uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return Index;
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
Pass.DeadInsts.push_back(I);
- }
-
- Value *rewriteVectorizedLoadInst() {
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
-
- Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
- }
-
- Value *rewriteIntegerLoad(LoadInst &LI) {
- assert(IntTy && "We cannot insert an integer to the alloca");
- assert(!LI.isVolatile());
- Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- V = convertValue(DL, IRB, V, IntTy);
- assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
- IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
- }
- // It is possible that the extracted type is not the load type. This
- // happens if there is a load past the end of the alloca, and as
- // a consequence the slice is narrower but still a candidate for integer
- // lowering. To handle this case, we just zero extend the extracted
- // integer.
- assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
- "Can only handle an extract for an overly wide load");
- if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
- V = IRB.CreateZExt(V, LI.getType());
- return V;
- }
-
- bool visitLoadInst(LoadInst &LI) {
- LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
- Value *OldOp = LI.getOperand(0);
- assert(OldOp == OldPtr);
-
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
-
- unsigned AS = LI.getPointerAddressSpace();
-
- Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
- : LI.getType();
- const bool IsLoadPastEnd =
- DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
- bool IsPtrAdjusted = false;
- Value *V;
- if (VecTy) {
- V = rewriteVectorizedLoadInst();
- } else if (IntTy && LI.getType()->isIntegerTy()) {
- V = rewriteIntegerLoad(LI);
- } else if (NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, NewAllocaTy, TargetTy) ||
- (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
- TargetTy->isIntegerTy()))) {
- LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), LI.isVolatile(),
- LI.getName());
- if (AATags)
+ }
+
+ Value *rewriteVectorizedLoadInst() {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+
+ Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
+ }
+
+ Value *rewriteIntegerLoad(LoadInst &LI) {
+ assert(IntTy && "We cannot insert an integer to the alloca");
+ assert(!LI.isVolatile());
+ Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ V = convertValue(DL, IRB, V, IntTy);
+ assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ if (Offset > 0 || NewEndOffset < NewAllocaEndOffset) {
+ IntegerType *ExtractTy = Type::getIntNTy(LI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, ExtractTy, Offset, "extract");
+ }
+ // It is possible that the extracted type is not the load type. This
+ // happens if there is a load past the end of the alloca, and as
+ // a consequence the slice is narrower but still a candidate for integer
+ // lowering. To handle this case, we just zero extend the extracted
+ // integer.
+ assert(cast<IntegerType>(LI.getType())->getBitWidth() >= SliceSize * 8 &&
+ "Can only handle an extract for an overly wide load");
+ if (cast<IntegerType>(LI.getType())->getBitWidth() > SliceSize * 8)
+ V = IRB.CreateZExt(V, LI.getType());
+ return V;
+ }
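
Assuming a little-endian layout, pulling a narrower slice out of the widened integer is a shift-and-mask, followed by a zero extension when the original load was wider than the slice. A standalone sketch with made-up values (this models the intent, not the exact extractInteger helper, and ignores the big-endian case):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Widened alloca value as an i64; a 4-byte load at byte offset 6 runs
      // past the end, so the slice is only the i16 covering bytes [6, 8).
      uint64_t AllocaVal = 0x1122334455667788ULL;
      unsigned ByteOffset = 6;

      // Little-endian: byte offset N lives at bit offset 8 * N.
      uint64_t Slice = (AllocaVal >> (8 * ByteOffset)) & 0xFFFFULL; // 0x1122

      // The original load wanted an i32, so the extracted i16 is zero-extended.
      uint32_t LoadResult = (uint32_t)Slice; // 0x00001122

      assert(Slice == 0x1122 && LoadResult == 0x00001122u);
      return 0;
    }
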
+
+ bool visitLoadInst(LoadInst &LI) {
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+
+ unsigned AS = LI.getPointerAddressSpace();
+
+ Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
+ : LI.getType();
+ const bool IsLoadPastEnd =
+ DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
+ bool IsPtrAdjusted = false;
+ Value *V;
+ if (VecTy) {
+ V = rewriteVectorizedLoadInst();
+ } else if (IntTy && LI.getType()->isIntegerTy()) {
+ V = rewriteIntegerLoad(LI);
+ } else if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, NewAllocaTy, TargetTy) ||
+ (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
+ TargetTy->isIntegerTy()))) {
+ LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), LI.isVolatile(),
+ LI.getName());
+ if (AATags)
NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
- if (NewLI->isAtomic())
- NewLI->setAlignment(LI.getAlign());
-
- // Any !nonnull metadata or !range metadata on the old load is also valid
-      // on the new load. This is true in some cases even when the loads
- // are different types, for example by mapping !nonnull metadata to
- // !range metadata by modeling the null pointer constant converted to the
- // integer type.
- // FIXME: Add support for range metadata here. Currently the utilities
- // for this don't propagate range metadata in trivial cases from one
- // integer load to another, don't handle non-addrspace-0 null pointers
- // correctly, and don't have any support for mapping ranges as the
-      // integer type becomes wider or narrower.
- if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
- copyNonnullMetadata(LI, N, *NewLI);
-
- // Try to preserve nonnull metadata
- V = NewLI;
-
- // If this is an integer load past the end of the slice (which means the
- // bytes outside the slice are undef or this load is dead) just forcibly
- // fix the integer size with correct handling of endianness.
- if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
- if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
- if (AITy->getBitWidth() < TITy->getBitWidth()) {
- V = IRB.CreateZExt(V, TITy, "load.ext");
- if (DL.isBigEndian())
- V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
- "endian_shift");
- }
- } else {
- Type *LTy = TargetTy->getPointerTo(AS);
- LoadInst *NewLI =
- IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
- getSliceAlign(), LI.isVolatile(), LI.getName());
- if (AATags)
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ if (NewLI->isAtomic())
+ NewLI->setAlignment(LI.getAlign());
+
+ // Any !nonnull metadata or !range metadata on the old load is also valid
+      // on the new load. This is true in some cases even when the loads
+ // are different types, for example by mapping !nonnull metadata to
+ // !range metadata by modeling the null pointer constant converted to the
+ // integer type.
+ // FIXME: Add support for range metadata here. Currently the utilities
+ // for this don't propagate range metadata in trivial cases from one
+ // integer load to another, don't handle non-addrspace-0 null pointers
+ // correctly, and don't have any support for mapping ranges as the
+      // integer type becomes wider or narrower.
+ if (MDNode *N = LI.getMetadata(LLVMContext::MD_nonnull))
+ copyNonnullMetadata(LI, N, *NewLI);
+
+ // Try to preserve nonnull metadata
+ V = NewLI;
+
+ // If this is an integer load past the end of the slice (which means the
+ // bytes outside the slice are undef or this load is dead) just forcibly
+ // fix the integer size with correct handling of endianness.
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (auto *TITy = dyn_cast<IntegerType>(TargetTy))
+ if (AITy->getBitWidth() < TITy->getBitWidth()) {
+ V = IRB.CreateZExt(V, TITy, "load.ext");
+ if (DL.isBigEndian())
+ V = IRB.CreateShl(V, TITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ }
+ } else {
+ Type *LTy = TargetTy->getPointerTo(AS);
+ LoadInst *NewLI =
+ IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(), LI.isVolatile(), LI.getName());
+ if (AATags)
NewLI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (LI.isVolatile())
- NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
-
- V = NewLI;
- IsPtrAdjusted = true;
- }
- V = convertValue(DL, IRB, V, TargetTy);
-
- if (IsSplit) {
- assert(!LI.isVolatile());
- assert(LI.getType()->isIntegerTy() &&
- "Only integer type loads and stores are split");
- assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
- "Split load isn't smaller than original load");
- assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
- "Non-byte-multiple bit width");
- // Move the insertion point just past the load so that we can refer to it.
- IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
- // Create a placeholder value with the same type as LI to use as the
- // basis for the new value. This allows us to replace the uses of LI with
- // the computed value, and then replace the placeholder with LI, leaving
- // LI only used for this computation.
- Value *Placeholder = new LoadInst(
- LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
- false, Align(1));
- V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
- "insert");
- LI.replaceAllUsesWith(V);
- Placeholder->replaceAllUsesWith(&LI);
- Placeholder->deleteValue();
- } else {
- LI.replaceAllUsesWith(V);
- }
-
+ if (LI.isVolatile())
+ NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+
+ V = NewLI;
+ IsPtrAdjusted = true;
+ }
+ V = convertValue(DL, IRB, V, TargetTy);
+
+ if (IsSplit) {
+ assert(!LI.isVolatile());
+ assert(LI.getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
+ "Split load isn't smaller than original load");
+ assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
+ "Non-byte-multiple bit width");
+ // Move the insertion point just past the load so that we can refer to it.
+ IRB.SetInsertPoint(&*std::next(BasicBlock::iterator(&LI)));
+ // Create a placeholder value with the same type as LI to use as the
+ // basis for the new value. This allows us to replace the uses of LI with
+ // the computed value, and then replace the placeholder with LI, leaving
+ // LI only used for this computation.
+ Value *Placeholder = new LoadInst(
+ LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
+ false, Align(1));
+ V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
+ "insert");
+ LI.replaceAllUsesWith(V);
+ Placeholder->replaceAllUsesWith(&LI);
+ Placeholder->deleteValue();
+ } else {
+ LI.replaceAllUsesWith(V);
+ }
+
Pass.DeadInsts.push_back(&LI);
- deleteIfTriviallyDead(OldOp);
- LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
- return !LI.isVolatile() && !IsPtrAdjusted;
- }
-
- bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
- AAMDNodes AATags) {
- if (V->getType() != VecTy) {
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
- Type *SliceTy = (NumElements == 1)
- ? ElementTy
- : FixedVectorType::get(ElementTy, NumElements);
- if (V->getType() != SliceTy)
- V = convertValue(DL, IRB, V, SliceTy);
-
- // Mix in the existing elements.
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- V = insertVector(IRB, Old, V, BeginIndex, "vec");
- }
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
- if (AATags)
+ deleteIfTriviallyDead(OldOp);
+ LLVM_DEBUG(dbgs() << " to: " << *V << "\n");
+ return !LI.isVolatile() && !IsPtrAdjusted;
+ }
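
The endian_shift above only matters on big-endian targets, where the alloca's defined bytes must land in the most-significant end of the wider load. A worked example with assumed values (standalone C++, not LLVM IR):

    #include <cassert>
    #include <cstdint>

    int main() {
      // The new alloca holds an i16 (0xBEEF); the original load wanted an i32
      // that reads two bytes past the end of the slice.
      uint16_t AllocaVal = 0xBEEF;

      uint32_t LittleEndian = (uint32_t)AllocaVal;    // memory EF BE 00 00 -> 0x0000BEEF
      uint32_t BigEndian = (uint32_t)AllocaVal << 16; // memory BE EF 00 00 -> 0xBEEF0000

      // In both cases the two defined bytes keep the positions they had in
      // memory; only the undefined tail differs (zero-filled here).
      assert(LittleEndian == 0x0000BEEFu && BigEndian == 0xBEEF0000u);
      return 0;
    }
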
+
+ bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp,
+ AAMDNodes AATags) {
+ if (V->getType() != VecTy) {
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+ Type *SliceTy = (NumElements == 1)
+ ? ElementTy
+ : FixedVectorType::get(ElementTy, NumElements);
+ if (V->getType() != SliceTy)
+ V = convertValue(DL, IRB, V, SliceTy);
+
+ // Mix in the existing elements.
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ V = insertVector(IRB, Old, V, BeginIndex, "vec");
+ }
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
-
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
-
- bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
- assert(IntTy && "We cannot extract an integer from the alloca");
- assert(!SI.isVolatile());
- if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
- IntTy->getBitWidth()) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
- uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
- }
- V = convertValue(DL, IRB, V, NewAllocaTy);
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
- Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- if (AATags)
+
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
+ assert(IntTy && "We cannot extract an integer from the alloca");
+ assert(!SI.isVolatile());
+ if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
+ IntTy->getBitWidth()) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
+ }
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
+ Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
Pass.DeadInsts.push_back(&SI);
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return true;
- }
-
- bool visitStoreInst(StoreInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- Value *OldOp = SI.getOperand(1);
- assert(OldOp == OldPtr);
-
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
-
- Value *V = SI.getValueOperand();
-
- // Strip all inbounds GEPs and pointer casts to try to dig out any root
- // alloca that should be re-examined after promoting this alloca.
- if (V->getType()->isPointerTy())
- if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
- Pass.PostPromotionWorklist.insert(AI);
-
- if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
- assert(!SI.isVolatile());
- assert(V->getType()->isIntegerTy() &&
- "Only integer type loads and stores are split");
- assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
- "Non-byte-multiple bit width");
- IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
- V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
- "extract");
- }
-
- if (VecTy)
- return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
- if (IntTy && V->getType()->isIntegerTy())
- return rewriteIntegerStore(V, SI, AATags);
-
- const bool IsStorePastEnd =
- DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
- StoreInst *NewSI;
- if (NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset &&
- (canConvertValue(DL, V->getType(), NewAllocaTy) ||
- (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
- V->getType()->isIntegerTy()))) {
- // If this is an integer store past the end of slice (and thus the bytes
- // past that point are irrelevant or this is unreachable), truncate the
- // value prior to storing.
- if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
- if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
- if (VITy->getBitWidth() > AITy->getBitWidth()) {
- if (DL.isBigEndian())
- V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
- "endian_shift");
- V = IRB.CreateTrunc(V, AITy, "load.trunc");
- }
-
- V = convertValue(DL, IRB, V, NewAllocaTy);
- NewSI =
- IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
- } else {
- unsigned AS = SI.getPointerAddressSpace();
- Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
- NewSI =
- IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
- }
- NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
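
When the store only covers part of the widened integer, the rewrite is a read-modify-write: load the old wide value, splice the narrow value in at the slice's bit offset, and store the whole thing back. A little-endian arithmetic sketch with invented values (insertInteger is the LLVM helper; this only models the masking it performs):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t OldAlloca = 0xAABBCCDD; // current widened alloca contents (i32)
      uint16_t StoredVal = 0x1234;     // i16 store into bytes [1, 3) of the alloca
      unsigned ByteOffset = 1;

      uint32_t Mask = 0xFFFFu << (8 * ByteOffset); // bits being replaced
      uint32_t NewAlloca = (OldAlloca & ~Mask) |
                           ((uint32_t)StoredVal << (8 * ByteOffset));

      assert(NewAlloca == 0xAA1234DD); // only the middle two bytes changed
      return 0;
    }
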
+
+ bool visitStoreInst(StoreInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+
+ Value *V = SI.getValueOperand();
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after promoting this alloca.
+ if (V->getType()->isPointerTy())
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
+ Pass.PostPromotionWorklist.insert(AI);
+
+ if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
+ assert(!SI.isVolatile());
+ assert(V->getType()->isIntegerTy() &&
+ "Only integer type loads and stores are split");
+ assert(DL.typeSizeEqualsStoreSize(V->getType()) &&
+ "Non-byte-multiple bit width");
+ IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), SliceSize * 8);
+ V = extractInteger(DL, IRB, V, NarrowTy, NewBeginOffset - BeginOffset,
+ "extract");
+ }
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(V, SI, OldOp, AATags);
+ if (IntTy && V->getType()->isIntegerTy())
+ return rewriteIntegerStore(V, SI, AATags);
+
+ const bool IsStorePastEnd =
+ DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
+ StoreInst *NewSI;
+ if (NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset &&
+ (canConvertValue(DL, V->getType(), NewAllocaTy) ||
+ (IsStorePastEnd && NewAllocaTy->isIntegerTy() &&
+ V->getType()->isIntegerTy()))) {
+ // If this is an integer store past the end of slice (and thus the bytes
+ // past that point are irrelevant or this is unreachable), truncate the
+ // value prior to storing.
+ if (auto *VITy = dyn_cast<IntegerType>(V->getType()))
+ if (auto *AITy = dyn_cast<IntegerType>(NewAllocaTy))
+ if (VITy->getBitWidth() > AITy->getBitWidth()) {
+ if (DL.isBigEndian())
+ V = IRB.CreateLShr(V, VITy->getBitWidth() - AITy->getBitWidth(),
+ "endian_shift");
+ V = IRB.CreateTrunc(V, AITy, "load.trunc");
+ }
+
+ V = convertValue(DL, IRB, V, NewAllocaTy);
+ NewSI =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
+ } else {
+ unsigned AS = SI.getPointerAddressSpace();
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
+ NewSI =
+ IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
+ }
+ NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ if (AATags)
NewSI->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- if (SI.isVolatile())
- NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
- if (NewSI->isAtomic())
- NewSI->setAlignment(SI.getAlign());
+ if (SI.isVolatile())
+ NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ if (NewSI->isAtomic())
+ NewSI->setAlignment(SI.getAlign());
Pass.DeadInsts.push_back(&SI);
- deleteIfTriviallyDead(OldOp);
-
- LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
- return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
- }
-
- /// Compute an integer value from splatting an i8 across the given
- /// number of bytes.
- ///
- /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
- /// call this routine.
- /// FIXME: Heed the advice above.
- ///
- /// \param V The i8 value to splat.
- /// \param Size The number of bytes in the output (assuming i8 is one byte)
- Value *getIntegerSplat(Value *V, unsigned Size) {
- assert(Size > 0 && "Expected a positive number of bytes.");
- IntegerType *VTy = cast<IntegerType>(V->getType());
- assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
- if (Size == 1)
- return V;
-
- Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
- V = IRB.CreateMul(
- IRB.CreateZExt(V, SplatIntTy, "zext"),
- ConstantExpr::getUDiv(
- Constant::getAllOnesValue(SplatIntTy),
- ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
- SplatIntTy)),
- "isplat");
- return V;
- }
-
- /// Compute a vector splat for a given element value.
- Value *getVectorSplat(Value *V, unsigned NumElements) {
- V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
- LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
- return V;
- }
-
- bool visitMemSetInst(MemSetInst &II) {
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
- assert(II.getRawDest() == OldPtr);
-
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
-
-    // If the memset has a variable size, it cannot be split; just adjust the
- // pointer to the new alloca.
- if (!isa<Constant>(II.getLength())) {
- assert(!IsSplit);
- assert(NewBeginOffset == BeginOffset);
- II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
- II.setDestAlignment(getSliceAlign());
-
- deleteIfTriviallyDead(OldPtr);
- return false;
- }
-
- // Record this instruction for deletion.
+ deleteIfTriviallyDead(OldOp);
+
+ LLVM_DEBUG(dbgs() << " to: " << *NewSI << "\n");
+ return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile();
+ }
+
+ /// Compute an integer value from splatting an i8 across the given
+ /// number of bytes.
+ ///
+ /// Note that this routine assumes an i8 is a byte. If that isn't true, don't
+ /// call this routine.
+ /// FIXME: Heed the advice above.
+ ///
+ /// \param V The i8 value to splat.
+ /// \param Size The number of bytes in the output (assuming i8 is one byte)
+ Value *getIntegerSplat(Value *V, unsigned Size) {
+ assert(Size > 0 && "Expected a positive number of bytes.");
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte");
+ if (Size == 1)
+ return V;
+
+ Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size * 8);
+ V = IRB.CreateMul(
+ IRB.CreateZExt(V, SplatIntTy, "zext"),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(SplatIntTy),
+ ConstantExpr::getZExt(Constant::getAllOnesValue(V->getType()),
+ SplatIntTy)),
+ "isplat");
+ return V;
+ }
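
The multiply trick above works because all-ones divided by 0xFF is the constant 0x0101...01, and multiplying a zero-extended byte by that constant replicates it into every byte. A standalone check with a sample byte:

    #include <assert.h>
    #include <stdint.h>

    // Splat one byte across Size bytes (Size <= 8), mirroring the IR above:
    // zext(V) * (allOnes(Size * 8) / 0xFF).
    static uint64_t splatByte(uint8_t V, unsigned Size) {
      uint64_t AllOnes = Size == 8 ? ~0ULL : (1ULL << (8 * Size)) - 1;
      uint64_t Replicator = AllOnes / 0xFF; // 0x0101...01 with Size bytes
      return (uint64_t)V * Replicator;
    }

    int main() {
      assert(splatByte(0xAB, 4) == 0xABABABABULL);
      assert(splatByte(0x00, 8) == 0);
      assert(splatByte(0xFF, 2) == 0xFFFF);
      return 0;
    }
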
+
+ /// Compute a vector splat for a given element value.
+ Value *getVectorSplat(Value *V, unsigned NumElements) {
+ V = IRB.CreateVectorSplat(NumElements, V, "vsplat");
+ LLVM_DEBUG(dbgs() << " splat: " << *V << "\n");
+ return V;
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+ assert(II.getRawDest() == OldPtr);
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
+    // If the memset has a variable size, it cannot be split; just adjust the
+ // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ assert(!IsSplit);
+ assert(NewBeginOffset == BeginOffset);
+ II.setDest(getNewAllocaSlicePtr(IRB, OldPtr->getType()));
+ II.setDestAlignment(getSliceAlign());
+
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
- Type *AllocaTy = NewAI.getAllocatedType();
- Type *ScalarTy = AllocaTy->getScalarType();
-
- const bool CanContinue = [&]() {
- if (VecTy || IntTy)
- return true;
- if (BeginOffset > NewAllocaBeginOffset ||
- EndOffset < NewAllocaEndOffset)
- return false;
- auto *C = cast<ConstantInt>(II.getLength());
- if (C->getBitWidth() > 64)
- return false;
- const auto Len = C->getZExtValue();
- auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
- auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
- return canConvertValue(DL, SrcTy, AllocaTy) &&
- DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
- }();
-
- // If this doesn't map cleanly onto the alloca type, and that type isn't
- // a single value type, just emit a memset.
- if (!CanContinue) {
- Type *SizeTy = II.getLength()->getType();
- Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
- CallInst *New = IRB.CreateMemSet(
- getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
- MaybeAlign(getSliceAlign()), II.isVolatile());
- if (AATags)
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ const bool CanContinue = [&]() {
+ if (VecTy || IntTy)
+ return true;
+ if (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset)
+ return false;
+ auto *C = cast<ConstantInt>(II.getLength());
+ if (C->getBitWidth() > 64)
+ return false;
+ const auto Len = C->getZExtValue();
+ auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
+ auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
+ return canConvertValue(DL, SrcTy, AllocaTy) &&
+ DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
+ }();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!CanContinue) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+ CallInst *New = IRB.CreateMemSet(
+ getNewAllocaSlicePtr(IRB, OldPtr->getType()), II.getValue(), Size,
+ MaybeAlign(getSliceAlign()), II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return false;
- }
-
- // If we can represent this as a simple value, we have to build the actual
- // value to store, which requires expanding the byte present in memset to
- // a sensible representation for the alloca type. This is essentially
- // splatting the byte to a sufficiently wide integer, splatting it across
- // any desired vector width, and bitcasting to the final type.
- Value *V;
-
- if (VecTy) {
- // If this is a memset of a vectorized alloca, insert it.
- assert(ElementTy == ScalarTy);
-
- unsigned BeginIndex = getIndex(NewBeginOffset);
- unsigned EndIndex = getIndex(NewEndOffset);
- assert(EndIndex > BeginIndex && "Empty vector!");
- unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
- "Too many elements!");
-
- Value *Splat = getIntegerSplat(
- II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
- Splat = convertValue(DL, IRB, Splat, ElementTy);
- if (NumElements > 1)
- Splat = getVectorSplat(Splat, NumElements);
-
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
- } else if (IntTy) {
- // If this is a memset on an alloca where we can widen stores, insert the
- // set integer.
- assert(!II.isVolatile());
-
- uint64_t Size = NewEndOffset - NewBeginOffset;
- V = getIntegerSplat(II.getValue(), Size);
-
- if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
- EndOffset != NewAllocaBeginOffset)) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- V = insertInteger(DL, IRB, Old, V, Offset, "insert");
- } else {
- assert(V->getType() == IntTy &&
- "Wrong type for an alloca wide integer!");
- }
- V = convertValue(DL, IRB, V, AllocaTy);
- } else {
- // Established these invariants above.
- assert(NewBeginOffset == NewAllocaBeginOffset);
- assert(NewEndOffset == NewAllocaEndOffset);
-
- V = getIntegerSplat(II.getValue(),
- DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
- if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
- V = getVectorSplat(
- V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
-
- V = convertValue(DL, IRB, V, AllocaTy);
- }
-
- StoreInst *New =
- IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, splatting it across
+ // any desired vector width, and bitcasting to the final type.
+ Value *V;
+
+ if (VecTy) {
+ // If this is a memset of a vectorized alloca, insert it.
+ assert(ElementTy == ScalarTy);
+
+ unsigned BeginIndex = getIndex(NewBeginOffset);
+ unsigned EndIndex = getIndex(NewEndOffset);
+ assert(EndIndex > BeginIndex && "Empty vector!");
+ unsigned NumElements = EndIndex - BeginIndex;
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
+
+ Value *Splat = getIntegerSplat(
+ II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
+ Splat = convertValue(DL, IRB, Splat, ElementTy);
+ if (NumElements > 1)
+ Splat = getVectorSplat(Splat, NumElements);
+
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
+ } else if (IntTy) {
+ // If this is a memset on an alloca where we can widen stores, insert the
+ // set integer.
+ assert(!II.isVolatile());
+
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ V = getIntegerSplat(II.getValue(), Size);
+
+ if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaBeginOffset)) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ V = insertInteger(DL, IRB, Old, V, Offset, "insert");
+ } else {
+ assert(V->getType() == IntTy &&
+ "Wrong type for an alloca wide integer!");
+ }
+ V = convertValue(DL, IRB, V, AllocaTy);
+ } else {
+ // Established these invariants above.
+ assert(NewBeginOffset == NewAllocaBeginOffset);
+ assert(NewEndOffset == NewAllocaEndOffset);
+
+ V = getIntegerSplat(II.getValue(),
+ DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
+ if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
+ V = getVectorSplat(
+ V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
+
+ V = convertValue(DL, IRB, V, AllocaTy);
+ }
+
+ StoreInst *New =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return !II.isVolatile();
- }
-
- bool visitMemTransferInst(MemTransferInst &II) {
- // Rewriting of memory transfer instructions can be a bit tricky. We break
- // them into two categories: split intrinsics and unsplit intrinsics.
-
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
-
- AAMDNodes AATags;
- II.getAAMetadata(AATags);
-
- bool IsDest = &II.getRawDestUse() == OldUse;
- assert((IsDest && II.getRawDest() == OldPtr) ||
- (!IsDest && II.getRawSource() == OldPtr));
-
- MaybeAlign SliceAlign = getSliceAlign();
-
- // For unsplit intrinsics, we simply modify the source and destination
- // pointers in place. This isn't just an optimization, it is a matter of
- // correctness. With unsplit intrinsics we may be dealing with transfers
- // within a single alloca before SROA ran, or with transfers that have
- // a variable length. We may also be dealing with memmove instead of
-    // memcpy, and so simply updating the pointers is necessary for us to
- // update both source and dest of a single call.
- if (!IsSplittable) {
- Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- if (IsDest) {
- II.setDest(AdjustedPtr);
- II.setDestAlignment(SliceAlign);
- }
- else {
- II.setSource(AdjustedPtr);
- II.setSourceAlignment(SliceAlign);
- }
-
- LLVM_DEBUG(dbgs() << " to: " << II << "\n");
- deleteIfTriviallyDead(OldPtr);
- return false;
- }
- // For split transfer intrinsics we have an incredibly useful assurance:
- // the source and destination do not reside within the same alloca, and at
- // least one of them does not escape. This means that we can replace
- // memmove with memcpy, and we don't need to worry about all manner of
- // downsides to splitting and transforming the operations.
-
- // If this doesn't map cleanly onto the alloca type, and that type isn't
- // a single value type, just emit a memcpy.
- bool EmitMemCpy =
- !VecTy && !IntTy &&
- (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
- SliceSize !=
- DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
- !NewAI.getAllocatedType()->isSingleValueType());
-
- // If we're just going to emit a memcpy, the alloca hasn't changed, and the
- // size hasn't been shrunk based on analysis of the viable range, this is
- // a no-op.
- if (EmitMemCpy && &OldAI == &NewAI) {
- // Ensure the start lines up.
- assert(NewBeginOffset == BeginOffset);
-
- // Rewrite the size as needed.
- if (NewEndOffset != EndOffset)
- II.setLength(ConstantInt::get(II.getLength()->getType(),
- NewEndOffset - NewBeginOffset));
- return false;
- }
- // Record this instruction for deletion.
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
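
For the vector path, the memset becomes a read-modify-write over vector lanes: splat the byte to one element, splat that across the touched lanes, and insert those lanes into the loaded vector. A rough standalone model over a plain array (element type and lane range are invented):

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      // Model a <4 x i32> alloca; the memset covers lanes [1, 3).
      std::array<uint32_t, 4> Old = {0x11111111, 0x22222222, 0x33333333,
                                     0x44444444};
      uint8_t Byte = 0xCD;
      unsigned BeginIndex = 1, EndIndex = 3;

      uint32_t ElementSplat = 0xCDCDCDCD; // the byte splatted to one i32 element
      assert(ElementSplat == Byte * 0x01010101u);

      std::array<uint32_t, 4> New = Old;
      for (unsigned I = BeginIndex; I != EndIndex; ++I)
        New[I] = ElementSplat; // models insertVector over the touched lanes

      assert(New[0] == 0x11111111 && New[1] == 0xCDCDCDCD &&
             New[2] == 0xCDCDCDCD && New[3] == 0x44444444);
      return 0;
    }
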
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ AAMDNodes AATags;
+ II.getAAMetadata(AATags);
+
+ bool IsDest = &II.getRawDestUse() == OldUse;
+ assert((IsDest && II.getRawDest() == OldPtr) ||
+ (!IsDest && II.getRawSource() == OldPtr));
+
+ MaybeAlign SliceAlign = getSliceAlign();
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+    // memcpy, and so simply updating the pointers is necessary for us to
+ // update both source and dest of a single call.
+ if (!IsSplittable) {
+ Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ if (IsDest) {
+ II.setDest(AdjustedPtr);
+ II.setDestAlignment(SliceAlign);
+ }
+ else {
+ II.setSource(AdjustedPtr);
+ II.setSourceAlignment(SliceAlign);
+ }
+
+ LLVM_DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy =
+ !VecTy && !IntTy &&
+ (BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
+ SliceSize !=
+ DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+ // If we're just going to emit a memcpy, the alloca hasn't changed, and the
+ // size hasn't been shrunk based on analysis of the viable range, this is
+ // a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ // Ensure the start lines up.
+ assert(NewBeginOffset == BeginOffset);
+
+ // Rewrite the size as needed.
+ if (NewEndOffset != EndOffset)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ NewEndOffset - NewBeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
- // Strip all inbounds GEPs and pointer casts to try to dig out any root
- // alloca that should be re-examined after rewriting this instruction.
- Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
- if (AllocaInst *AI =
- dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
- assert(AI != &OldAI && AI != &NewAI &&
- "Splittable transfers cannot reach the same alloca on both ends.");
- Pass.Worklist.insert(AI);
- }
-
- Type *OtherPtrTy = OtherPtr->getType();
- unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
-
- // Compute the relative offset for the other pointer within the transfer.
- unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
- APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
- Align OtherAlign =
- (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
- OtherAlign =
- commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
-
- if (EmitMemCpy) {
- // Compute the other pointer, folding as much as possible to produce
- // a single, simple GEP in most cases.
- OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
- OtherPtr->getName() + ".");
-
- Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- Type *SizeTy = II.getLength()->getType();
- Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
-
- Value *DestPtr, *SrcPtr;
- MaybeAlign DestAlign, SrcAlign;
- // Note: IsDest is true iff we're copying into the new alloca slice
- if (IsDest) {
- DestPtr = OurPtr;
- DestAlign = SliceAlign;
- SrcPtr = OtherPtr;
- SrcAlign = OtherAlign;
- } else {
- DestPtr = OtherPtr;
- DestAlign = OtherAlign;
- SrcPtr = OurPtr;
- SrcAlign = SliceAlign;
- }
- CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
- Size, II.isVolatile());
- if (AATags)
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) {
+ assert(AI != &OldAI && AI != &NewAI &&
+ "Splittable transfers cannot reach the same alloca on both ends.");
+ Pass.Worklist.insert(AI);
+ }
+
+ Type *OtherPtrTy = OtherPtr->getType();
+ unsigned OtherAS = OtherPtrTy->getPointerAddressSpace();
+
+ // Compute the relative offset for the other pointer within the transfer.
+ unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
+ APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
+ Align OtherAlign =
+ (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
+ OtherAlign =
+ commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
+
+ if (EmitMemCpy) {
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ OtherPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+
+ Value *OurPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, NewEndOffset - NewBeginOffset);
+
+ Value *DestPtr, *SrcPtr;
+ MaybeAlign DestAlign, SrcAlign;
+ // Note: IsDest is true iff we're copying into the new alloca slice
+ if (IsDest) {
+ DestPtr = OurPtr;
+ DestAlign = SliceAlign;
+ SrcPtr = OtherPtr;
+ SrcAlign = OtherAlign;
+ } else {
+ DestPtr = OtherPtr;
+ DestAlign = OtherAlign;
+ SrcPtr = OurPtr;
+ SrcAlign = SliceAlign;
+ }
+ CallInst *New = IRB.CreateMemCpy(DestPtr, DestAlign, SrcPtr, SrcAlign,
+ Size, II.isVolatile());
+ if (AATags)
New->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
- return false;
- }
-
- bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
- NewEndOffset == NewAllocaEndOffset;
- uint64_t Size = NewEndOffset - NewBeginOffset;
- unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
- unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
- unsigned NumElements = EndIndex - BeginIndex;
- IntegerType *SubIntTy =
- IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
-
- // Reset the other pointer type to match the register type we're going to
- // use, but using the address space of the original other pointer.
- Type *OtherTy;
- if (VecTy && !IsWholeAlloca) {
- if (NumElements == 1)
- OtherTy = VecTy->getElementType();
- else
- OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
- } else if (IntTy && !IsWholeAlloca) {
- OtherTy = SubIntTy;
- } else {
- OtherTy = NewAllocaTy;
- }
- OtherPtrTy = OtherTy->getPointerTo(OtherAS);
-
- Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
- OtherPtr->getName() + ".");
- MaybeAlign SrcAlign = OtherAlign;
- Value *DstPtr = &NewAI;
- MaybeAlign DstAlign = SliceAlign;
- if (!IsDest) {
- std::swap(SrcPtr, DstPtr);
- std::swap(SrcAlign, DstAlign);
- }
-
- Value *Src;
- if (VecTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
- } else if (IntTy && !IsWholeAlloca && !IsDest) {
- Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "load");
- Src = convertValue(DL, IRB, Src, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
- } else {
- LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
- II.isVolatile(), "copyload");
- if (AATags)
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ bool IsWholeAlloca = NewBeginOffset == NewAllocaBeginOffset &&
+ NewEndOffset == NewAllocaEndOffset;
+ uint64_t Size = NewEndOffset - NewBeginOffset;
+ unsigned BeginIndex = VecTy ? getIndex(NewBeginOffset) : 0;
+ unsigned EndIndex = VecTy ? getIndex(NewEndOffset) : 0;
+ unsigned NumElements = EndIndex - BeginIndex;
+ IntegerType *SubIntTy =
+ IntTy ? Type::getIntNTy(IntTy->getContext(), Size * 8) : nullptr;
+
+ // Reset the other pointer type to match the register type we're going to
+ // use, but using the address space of the original other pointer.
+ Type *OtherTy;
+ if (VecTy && !IsWholeAlloca) {
+ if (NumElements == 1)
+ OtherTy = VecTy->getElementType();
+ else
+ OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
+ } else if (IntTy && !IsWholeAlloca) {
+ OtherTy = SubIntTy;
+ } else {
+ OtherTy = NewAllocaTy;
+ }
+ OtherPtrTy = OtherTy->getPointerTo(OtherAS);
+
+ Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+ OtherPtr->getName() + ".");
+ MaybeAlign SrcAlign = OtherAlign;
+ Value *DstPtr = &NewAI;
+ MaybeAlign DstAlign = SliceAlign;
+ if (!IsDest) {
+ std::swap(SrcPtr, DstPtr);
+ std::swap(SrcAlign, DstAlign);
+ }
+
+ Value *Src;
+ if (VecTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && !IsDest) {
+ Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "load");
+ Src = convertValue(DL, IRB, Src, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
+ } else {
+ LoadInst *Load = IRB.CreateAlignedLoad(OtherTy, SrcPtr, SrcAlign,
+ II.isVolatile(), "copyload");
+ if (AATags)
Load->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- Src = Load;
- }
-
- if (VecTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
- } else if (IntTy && !IsWholeAlloca && IsDest) {
- Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlign(), "oldload");
- Old = convertValue(DL, IRB, Old, IntTy);
- uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
- Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
- Src = convertValue(DL, IRB, Src, NewAllocaTy);
- }
-
- StoreInst *Store = cast<StoreInst>(
- IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
- if (AATags)
+ Src = Load;
+ }
+
+ if (VecTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
+ } else if (IntTy && !IsWholeAlloca && IsDest) {
+ Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+ NewAI.getAlign(), "oldload");
+ Old = convertValue(DL, IRB, Old, IntTy);
+ uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
+ Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
+ Src = convertValue(DL, IRB, Src, NewAllocaTy);
+ }
+
+ StoreInst *Store = cast<StoreInst>(
+ IRB.CreateAlignedStore(Src, DstPtr, DstAlign, II.isVolatile()));
+ if (AATags)
Store->setAAMetadata(AATags.shift(NewBeginOffset - BeginOffset));
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- return !II.isVolatile();
- }
-
- bool visitIntrinsicInst(IntrinsicInst &II) {
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
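
Each split piece of the transfer accesses the other pointer at OtherOffset = NewBeginOffset - BeginOffset, so a copy spanning several new partitions turns into one call per partition, each advanced into the other buffer by that amount. A tiny arithmetic sketch with assumed sizes (illustration only):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Assumed: a 32-byte memcpy whose alloca side is split into two 16-byte
      // partitions, so the single slice [0, 32) is rewritten twice.
      uint64_t BeginOffset = 0;
      uint64_t PieceBegin[2] = {0, 16}; // NewBeginOffset for each partition
      uint64_t PieceEnd[2] = {16, 32};  // NewEndOffset for each partition

      for (int I = 0; I < 2; ++I) {
        uint64_t OtherOffset = PieceBegin[I] - BeginOffset; // offset into the
                                                            // other buffer
        uint64_t PieceLen = PieceEnd[I] - PieceBegin[I];    // length per call
        assert(PieceLen == 16 && OtherOffset == PieceBegin[I]);
      }
      return 0;
    }
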
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
assert((II.isLifetimeStartOrEnd() || II.isDroppable()) &&
"Unexpected intrinsic!");
- LLVM_DEBUG(dbgs() << " original: " << II << "\n");
-
- // Record this instruction for deletion.
+ LLVM_DEBUG(dbgs() << " original: " << II << "\n");
+
+ // Record this instruction for deletion.
Pass.DeadInsts.push_back(&II);
-
+
if (II.isDroppable()) {
assert(II.getIntrinsicID() == Intrinsic::assume && "Expected assume");
// TODO For now we forget assumed information, this can be improved.
@@ -3101,286 +3101,286 @@ private:
}
assert(II.getArgOperand(1) == OldPtr);
- // Lifetime intrinsics are only promotable if they cover the whole alloca.
- // Therefore, we drop lifetime intrinsics which don't cover the whole
- // alloca.
- // (In theory, intrinsics which partially cover an alloca could be
- // promoted, but PromoteMemToReg doesn't handle that case.)
- // FIXME: Check whether the alloca is promotable before dropping the
- // lifetime intrinsics?
- if (NewBeginOffset != NewAllocaBeginOffset ||
- NewEndOffset != NewAllocaEndOffset)
- return true;
-
- ConstantInt *Size =
- ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
- NewEndOffset - NewBeginOffset);
- // Lifetime intrinsics always expect an i8* so directly get such a pointer
- // for the new alloca slice.
- Type *PointerTy = IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
- Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
- Value *New;
- if (II.getIntrinsicID() == Intrinsic::lifetime_start)
- New = IRB.CreateLifetimeStart(Ptr, Size);
- else
- New = IRB.CreateLifetimeEnd(Ptr, Size);
-
- (void)New;
- LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
-
- return true;
- }
-
- void fixLoadStoreAlign(Instruction &Root) {
- // This algorithm implements the same visitor loop as
- // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
- // or store found.
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<Instruction *, 4> Uses;
- Visited.insert(&Root);
- Uses.push_back(&Root);
- do {
- Instruction *I = Uses.pop_back_val();
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
- continue;
- }
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
- continue;
- }
-
- assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
- isa<PHINode>(I) || isa<SelectInst>(I) ||
- isa<GetElementPtrInst>(I));
- for (User *U : I->users())
- if (Visited.insert(cast<Instruction>(U)).second)
- Uses.push_back(cast<Instruction>(U));
- } while (!Uses.empty());
- }
-
- bool visitPHINode(PHINode &PN) {
- LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
- assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
- assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
-
- // We would like to compute a new pointer in only one place, but have it be
- // as local as possible to the PHI. To do that, we re-use the location of
- // the old pointer, which necessarily must be in the right position to
- // dominate the PHI.
- IRBuilderBase::InsertPointGuard Guard(IRB);
- if (isa<PHINode>(OldPtr))
- IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
- else
- IRB.SetInsertPoint(OldPtr);
- IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
-
- Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- // Replace the operands which were using the old pointer.
- std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
-
- LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
- deleteIfTriviallyDead(OldPtr);
-
- // Fix the alignment of any loads or stores using this PHI node.
- fixLoadStoreAlign(PN);
-
- // PHIs can't be promoted on their own, but often can be speculated. We
- // check the speculation outside of the rewriter so that we see the
- // fully-rewritten alloca.
- PHIUsers.insert(&PN);
- return true;
- }
-
- bool visitSelectInst(SelectInst &SI) {
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
- "Pointer isn't an operand!");
- assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
- assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
-
- Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
- // Replace the operands which were using the old pointer.
- if (SI.getOperand(1) == OldPtr)
- SI.setOperand(1, NewPtr);
- if (SI.getOperand(2) == OldPtr)
- SI.setOperand(2, NewPtr);
-
- LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
- deleteIfTriviallyDead(OldPtr);
-
- // Fix the alignment of any loads or stores using this select.
- fixLoadStoreAlign(SI);
-
- // Selects can't be promoted on their own, but often can be speculated. We
- // check the speculation outside of the rewriter so that we see the
- // fully-rewritten alloca.
- SelectUsers.insert(&SI);
- return true;
- }
-};
-
-namespace {
-
-/// Visitor to rewrite aggregate loads and stores as scalar.
-///
-/// This pass aggressively rewrites all aggregate loads and stores on
-/// a particular pointer (or any pointer derived from it which we can identify)
-/// with scalar loads and stores.
-class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
- // Befriend the base class so it can delegate to private visit methods.
- friend class InstVisitor<AggLoadStoreRewriter, bool>;
-
- /// Queue of pointer uses to analyze and potentially rewrite.
- SmallVector<Use *, 8> Queue;
-
- /// Set to prevent us from cycling with phi nodes and loops.
- SmallPtrSet<User *, 8> Visited;
-
- /// The current pointer use being rewritten. This is used to dig up the used
- /// value (as opposed to the user).
- Use *U = nullptr;
-
- /// Used to calculate offsets, and hence alignment, of subobjects.
- const DataLayout &DL;
-
-public:
- AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
-
- /// Rewrite loads and stores through a pointer and all pointers derived from
- /// it.
- bool rewrite(Instruction &I) {
- LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
- enqueueUsers(I);
- bool Changed = false;
- while (!Queue.empty()) {
- U = Queue.pop_back_val();
- Changed |= visit(cast<Instruction>(U->getUser()));
- }
- return Changed;
- }
-
-private:
- /// Enqueue all the users of the given instruction for further processing.
- /// This uses a set to de-duplicate users.
- void enqueueUsers(Instruction &I) {
- for (Use &U : I.uses())
- if (Visited.insert(U.getUser()).second)
- Queue.push_back(&U);
- }
-
- // Conservative default is to not rewrite anything.
- bool visitInstruction(Instruction &I) { return false; }
-
- /// Generic recursive split emission class.
- template <typename Derived> class OpSplitter {
- protected:
- /// The builder used to form new instructions.
- IRBuilderTy IRB;
-
-    /// The indices to be used with insertvalue or extractvalue to select the
-    /// appropriate value within the aggregate.
- SmallVector<unsigned, 4> Indices;
-
- /// The indices to a GEP instruction which will move Ptr to the correct slot
- /// within the aggregate.
- SmallVector<Value *, 4> GEPIndices;
-
- /// The base pointer of the original op, used as a base for GEPing the
- /// split operations.
- Value *Ptr;
-
- /// The base pointee type being GEPed into.
- Type *BaseTy;
-
- /// Known alignment of the base pointer.
- Align BaseAlign;
-
-    /// Used to calculate the offset of each component so we can correctly
-    /// deduce alignments.
- const DataLayout &DL;
-
-    /// Initialize the splitter with an insertion point and Ptr, and start
-    /// with a single zero GEP index.
- OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- Align BaseAlign, const DataLayout &DL)
- : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
- BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
-
- public:
- /// Generic recursive split emission routine.
- ///
- /// This method recursively splits an aggregate op (load or store) into
- /// scalar or vector ops. It splits recursively until it hits a single value
- /// and emits that single value operation via the template argument.
- ///
- /// The logic of this routine relies on GEPs and insertvalue and
- /// extractvalue all operating with the same fundamental index list, merely
- /// formatted differently (GEPs need actual values).
- ///
- /// \param Ty The type being split recursively into smaller ops.
- /// \param Agg The aggregate value being built up or stored, depending on
- /// whether this is splitting a load or a store respectively.
- void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
- if (Ty->isSingleValueType()) {
- unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
- return static_cast<Derived *>(this)->emitFunc(
- Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
- }
-
- if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned OldSize = Indices.size();
- (void)OldSize;
- for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
- ++Idx) {
- assert(Indices.size() == OldSize && "Did not return to the old size");
- Indices.push_back(Idx);
- GEPIndices.push_back(IRB.getInt32(Idx));
- emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
- GEPIndices.pop_back();
- Indices.pop_back();
- }
- return;
- }
-
- if (StructType *STy = dyn_cast<StructType>(Ty)) {
- unsigned OldSize = Indices.size();
- (void)OldSize;
- for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
- ++Idx) {
- assert(Indices.size() == OldSize && "Did not return to the old size");
- Indices.push_back(Idx);
- GEPIndices.push_back(IRB.getInt32(Idx));
- emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
- GEPIndices.pop_back();
- Indices.pop_back();
- }
- return;
- }
-
- llvm_unreachable("Only arrays and structs are aggregate loadable types");
- }
- };
-
- struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
- AAMDNodes AATags;
-
- LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
- AATags(AATags) {}
-
- /// Emit a leaf load of a single value. This is called at the leaves of the
- /// recursive emission to actually load values.
- void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
- assert(Ty->isSingleValueType());
- // Load the single value and insert it using the indices.
- Value *GEP =
- IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
- LoadInst *Load =
- IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
+ // Lifetime intrinsics are only promotable if they cover the whole alloca.
+ // Therefore, we drop lifetime intrinsics which don't cover the whole
+ // alloca.
+ // (In theory, intrinsics which partially cover an alloca could be
+ // promoted, but PromoteMemToReg doesn't handle that case.)
+ // FIXME: Check whether the alloca is promotable before dropping the
+ // lifetime intrinsics?
+ if (NewBeginOffset != NewAllocaBeginOffset ||
+ NewEndOffset != NewAllocaEndOffset)
+ return true;
+
+ ConstantInt *Size =
+ ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ NewEndOffset - NewBeginOffset);
+    // Lifetime intrinsics always expect an i8*, so directly get such a pointer
+    // for the new alloca slice.
+    Type *PointerTy =
+        IRB.getInt8PtrTy(OldPtr->getType()->getPointerAddressSpace());
+ Value *Ptr = getNewAllocaSlicePtr(IRB, PointerTy);
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ (void)New;
+ LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
+
+ return true;
+ }
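+
+  // For example (hypothetical IR): if a 16-byte alloca is split and this
+  // slice becomes its own 8-byte alloca, a marker that covers the whole new
+  // slice, such as
+  //   call void @llvm.lifetime.start.p0i8(i64 16, i8* %old.ptr)
+  // is rebuilt above against an i8* of the new alloca with the slice's size:
+  //   call void @llvm.lifetime.start.p0i8(i64 8, i8* %new.slice.ptr)
+  // Markers that only partially cover the new alloca are dropped entirely.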
+
+ void fixLoadStoreAlign(Instruction &Root) {
+ // This algorithm implements the same visitor loop as
+ // hasUnsafePHIOrSelectUse, and fixes the alignment of each load
+ // or store found.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<Instruction *, 4> Uses;
+ Visited.insert(&Root);
+ Uses.push_back(&Root);
+ do {
+ Instruction *I = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
+ continue;
+ }
+
+ assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
+ isa<PHINode>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I));
+ for (User *U : I->users())
+ if (Visited.insert(cast<Instruction>(U)).second)
+ Uses.push_back(cast<Instruction>(U));
+ } while (!Uses.empty());
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ LLVM_DEBUG(dbgs() << " original: " << PN << "\n");
+ assert(BeginOffset >= NewAllocaBeginOffset && "PHIs are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "PHIs are unsplittable");
+
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilderBase::InsertPointGuard Guard(IRB);
+ if (isa<PHINode>(OldPtr))
+ IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+ else
+ IRB.SetInsertPoint(OldPtr);
+ IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
+
+ LLVM_DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Fix the alignment of any loads or stores using this PHI node.
+ fixLoadStoreAlign(PN);
+
+ // PHIs can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ PHIUsers.insert(&PN);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) &&
+ "Pointer isn't an operand!");
+ assert(BeginOffset >= NewAllocaBeginOffset && "Selects are unsplittable");
+ assert(EndOffset <= NewAllocaEndOffset && "Selects are unsplittable");
+
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ if (SI.getOperand(1) == OldPtr)
+ SI.setOperand(1, NewPtr);
+ if (SI.getOperand(2) == OldPtr)
+ SI.setOperand(2, NewPtr);
+
+ LLVM_DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+
+ // Fix the alignment of any loads or stores using this select.
+ fixLoadStoreAlign(SI);
+
+ // Selects can't be promoted on their own, but often can be speculated. We
+ // check the speculation outside of the rewriter so that we see the
+ // fully-rewritten alloca.
+ SelectUsers.insert(&SI);
+ return true;
+ }
+};
+
+namespace {
+
+/// Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class InstVisitor<AggLoadStoreRewriter, bool>;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U = nullptr;
+
+ /// Used to calculate offsets, and hence alignment, of subobjects.
+ const DataLayout &DL;
+
+public:
+ AggLoadStoreRewriter(const DataLayout &DL) : DL(DL) {}
+
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ LLVM_DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Use &U : I.uses())
+ if (Visited.insert(U.getUser()).second)
+ Queue.push_back(&U);
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// Generic recursive split emission class.
+ template <typename Derived> class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilderTy IRB;
+
+    /// The indices to be used with insertvalue or extractvalue to select the
+    /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+ /// The base pointee type being GEPed into.
+ Type *BaseTy;
+
+ /// Known alignment of the base pointer.
+ Align BaseAlign;
+
+    /// Used to calculate the offset of each component so we can correctly
+    /// deduce alignments.
+ const DataLayout &DL;
+
+    /// Initialize the splitter with an insertion point and Ptr, and start
+    /// with a single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ Align BaseAlign, const DataLayout &DL)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr),
+ BaseTy(BaseTy), BaseAlign(BaseAlign), DL(DL) {}
+
+ public:
+ /// Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType()) {
+ unsigned Offset = DL.getIndexedOffsetInType(BaseTy, GEPIndices);
+ return static_cast<Derived *>(this)->emitFunc(
+ Ty, Agg, commonAlignment(BaseAlign, Offset), Name);
+ }
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
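+
+  // For illustration of emitSplitOps above (hypothetical type): splitting an
+  // op on { i32, [2 x float] } visits three leaves with
+  //   Indices = {0},    GEPIndices = {0, 0}     -> the i32
+  //   Indices = {1, 0}, GEPIndices = {0, 1, 0}  -> the first float
+  //   Indices = {1, 1}, GEPIndices = {0, 1, 1}  -> the second float
+  // so the same index list drives both the GEPs and the insertvalue or
+  // extractvalue instructions emitted by the derived splitters below.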
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ AAMDNodes AATags;
+
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+ DL),
+ AATags(AATags) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *GEP =
+ IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
+ LoadInst *Load =
+ IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
APInt Offset(
DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
@@ -3388,51 +3388,51 @@ private:
GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
Load->setAAMetadata(AATags.shift(Offset.getZExtValue()));
- Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
- LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
- }
- };
-
- bool visitLoadInst(LoadInst &LI) {
- assert(LI.getPointerOperand() == *U);
- if (!LI.isSimple() || LI.getType()->isSingleValueType())
- return false;
-
- // We have an aggregate being loaded, split it apart.
- LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
- AAMDNodes AATags;
- LI.getAAMetadata(AATags);
- LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
- getAdjustedAlignment(&LI, 0), DL);
- Value *V = UndefValue::get(LI.getType());
- Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
- Visited.erase(&LI);
- LI.replaceAllUsesWith(V);
- LI.eraseFromParent();
- return true;
- }
-
- struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
- StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
- AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
- : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
- DL),
- AATags(AATags) {}
- AAMDNodes AATags;
- /// Emit a leaf store of a single value. This is called at the leaves of the
- /// recursive emission to actually produce stores.
- void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
- assert(Ty->isSingleValueType());
- // Extract the single value and store it using the indices.
- //
- // The gep and extractvalue values are factored out of the CreateStore
- // call to make the output independent of the argument evaluation order.
- Value *ExtractValue =
- IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
- Value *InBoundsGEP =
- IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
- StoreInst *Store =
- IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ LLVM_DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ LLVM_DEBUG(dbgs() << " original: " << LI << "\n");
+ AAMDNodes AATags;
+ LI.getAAMetadata(AATags);
+ LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
+ getAdjustedAlignment(&LI, 0), DL);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ Visited.erase(&LI);
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
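+
+  // For example (hypothetical IR): given %T = type { i32, float }, a simple
+  // aggregate load
+  //   %v = load %T, %T* %p
+  // is rewritten by the splitter above into per-field operations, roughly
+  //   %v.fca.0.gep = getelementptr inbounds %T, %T* %p, i32 0, i32 0
+  //   %v.fca.0.load = load i32, i32* %v.fca.0.gep
+  //   %v.fca.0.insert = insertvalue %T undef, i32 %v.fca.0.load, 0
+  //   %v.fca.1.gep = getelementptr inbounds %T, %T* %p, i32 0, i32 1
+  //   %v.fca.1.load = load float, float* %v.fca.1.gep
+  //   %v.fca.1.insert = insertvalue %T %v.fca.0.insert, float %v.fca.1.load, 1
+  // after which all uses of %v are replaced with the final insertvalue.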
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr, Type *BaseTy,
+ AAMDNodes AATags, Align BaseAlign, const DataLayout &DL)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr, BaseTy, BaseAlign,
+ DL),
+ AATags(AATags) {}
+ AAMDNodes AATags;
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, Align Alignment, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ //
+ // The gep and extractvalue values are factored out of the CreateStore
+ // call to make the output independent of the argument evaluation order.
+ Value *ExtractValue =
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract");
+ Value *InBoundsGEP =
+ IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
+ StoreInst *Store =
+ IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
APInt Offset(
DL.getIndexSizeInBits(Ptr->getType()->getPointerAddressSpace()), 0);
@@ -3440,112 +3440,112 @@ private:
GEPOperator::accumulateConstantOffset(BaseTy, GEPIndices, DL, Offset))
Store->setAAMetadata(AATags.shift(Offset.getZExtValue()));
- LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
- }
- };
-
- bool visitStoreInst(StoreInst &SI) {
- if (!SI.isSimple() || SI.getPointerOperand() != *U)
- return false;
- Value *V = SI.getValueOperand();
- if (V->getType()->isSingleValueType())
- return false;
-
- // We have an aggregate being stored, split it apart.
- LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
- AAMDNodes AATags;
- SI.getAAMetadata(AATags);
- StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
- getAdjustedAlignment(&SI, 0), DL);
- Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
- Visited.erase(&SI);
- SI.eraseFromParent();
- return true;
- }
-
- bool visitBitCastInst(BitCastInst &BC) {
- enqueueUsers(BC);
- return false;
- }
-
- bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
- enqueueUsers(ASC);
- return false;
- }
-
- // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
- bool foldGEPSelect(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
-
- SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
-
- LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
- << "\n original: " << *Sel
- << "\n " << GEPI);
-
- IRBuilderTy Builder(&GEPI);
+ LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ LLVM_DEBUG(dbgs() << " original: " << SI << "\n");
+ AAMDNodes AATags;
+ SI.getAAMetadata(AATags);
+ StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
+ getAdjustedAlignment(&SI, 0), DL);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ Visited.erase(&SI);
+ SI.eraseFromParent();
+ return true;
+ }
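+
+  // The store path above mirrors the load path: for a hypothetical
+  //   store %T %agg, %T* %p
+  // with %T = type { i32, float }, each field is pulled out with an
+  // extractvalue (named "*.fca.N.extract") and written through a field GEP
+  // (named "*.fca.N.gep") as an ordinary scalar store.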
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitAddrSpaceCastInst(AddrSpaceCastInst &ASC) {
+ enqueueUsers(ASC);
+ return false;
+ }
+
+ // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
+ bool foldGEPSelect(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
+ << "\n original: " << *Sel
+ << "\n " << GEPI);
+
+ IRBuilderTy Builder(&GEPI);
SmallVector<Value *, 4> Index(GEPI.indices());
- bool IsInBounds = GEPI.isInBounds();
-
- Value *True = Sel->getTrueValue();
- Value *NTrue =
- IsInBounds
- ? Builder.CreateInBoundsGEP(True, Index,
- True->getName() + ".sroa.gep")
- : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
-
- Value *False = Sel->getFalseValue();
-
- Value *NFalse =
- IsInBounds
- ? Builder.CreateInBoundsGEP(False, Index,
- False->getName() + ".sroa.gep")
- : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
-
- Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
- Sel->getName() + ".sroa.sel");
- Visited.erase(&GEPI);
- GEPI.replaceAllUsesWith(NSel);
- GEPI.eraseFromParent();
- Instruction *NSelI = cast<Instruction>(NSel);
- Visited.insert(NSelI);
- enqueueUsers(*NSelI);
-
- LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
- << "\n " << *NFalse
- << "\n " << *NSel << '\n');
-
- return true;
- }
-
- // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
- bool foldGEPPhi(GetElementPtrInst &GEPI) {
- if (!GEPI.hasAllConstantIndices())
- return false;
-
- PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
- if (GEPI.getParent() != PHI->getParent() ||
- llvm::any_of(PHI->incoming_values(), [](Value *In)
- { Instruction *I = dyn_cast<Instruction>(In);
- return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
- succ_empty(I->getParent()) ||
- !I->getParent()->isLegalToHoistInto();
- }))
- return false;
-
- LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
- << "\n original: " << *PHI
- << "\n " << GEPI
- << "\n to: ");
-
+ bool IsInBounds = GEPI.isInBounds();
+
+ Value *True = Sel->getTrueValue();
+ Value *NTrue =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(True, Index,
+ True->getName() + ".sroa.gep")
+ : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
+
+ Value *False = Sel->getFalseValue();
+
+ Value *NFalse =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(False, Index,
+ False->getName() + ".sroa.gep")
+ : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
+
+ Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+ Sel->getName() + ".sroa.sel");
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NSel);
+ GEPI.eraseFromParent();
+ Instruction *NSelI = cast<Instruction>(NSel);
+ Visited.insert(NSelI);
+ enqueueUsers(*NSelI);
+
+ LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
+ << "\n " << *NFalse
+ << "\n " << *NSel << '\n');
+
+ return true;
+ }
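+
+  // For example (hypothetical IR): with all-constant indices, a pattern like
+  //   %sel = select i1 %c, i64* %a, i64* %b
+  //   %gep = getelementptr inbounds i64, i64* %sel, i64 1
+  // is rewritten above into
+  //   %a.sroa.gep = getelementptr inbounds i64, i64* %a, i64 1
+  //   %b.sroa.gep = getelementptr inbounds i64, i64* %b, i64 1
+  //   %sel.sroa.sel = select i1 %c, i64* %a.sroa.gep, i64* %b.sroa.gep
+  // so the select no longer stands between the alloca and its GEP users.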
+
+ // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
+ bool foldGEPPhi(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
+ if (GEPI.getParent() != PHI->getParent() ||
+ llvm::any_of(PHI->incoming_values(), [](Value *In)
+ { Instruction *I = dyn_cast<Instruction>(In);
+ return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
+ succ_empty(I->getParent()) ||
+ !I->getParent()->isLegalToHoistInto();
+ }))
+ return false;
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
+ << "\n original: " << *PHI
+ << "\n " << GEPI
+ << "\n to: ");
+
SmallVector<Value *, 4> Index(GEPI.indices());
- bool IsInBounds = GEPI.isInBounds();
- IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
- PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
- PHI->getNumIncomingValues(),
- PHI->getName() + ".sroa.phi");
- for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+ bool IsInBounds = GEPI.isInBounds();
+ IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
+ PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
+ PHI->getNumIncomingValues(),
+ PHI->getName() + ".sroa.phi");
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
BasicBlock *B = PHI->getIncomingBlock(I);
Value *NewVal = nullptr;
int Idx = NewPN->getBasicBlockIndex(B);
@@ -3553,354 +3553,354 @@ private:
NewVal = NewPN->getIncomingValue(Idx);
} else {
Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
-
+
IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
NewVal = IsInBounds
? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
: B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
}
NewPN->addIncoming(NewVal, B);
- }
-
- Visited.erase(&GEPI);
- GEPI.replaceAllUsesWith(NewPN);
- GEPI.eraseFromParent();
- Visited.insert(NewPN);
- enqueueUsers(*NewPN);
-
- LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
- dbgs() << "\n " << *In;
- dbgs() << "\n " << *NewPN << '\n');
-
- return true;
- }
-
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- if (isa<SelectInst>(GEPI.getPointerOperand()) &&
- foldGEPSelect(GEPI))
- return true;
-
- if (isa<PHINode>(GEPI.getPointerOperand()) &&
- foldGEPPhi(GEPI))
- return true;
-
- enqueueUsers(GEPI);
- return false;
- }
-
- bool visitPHINode(PHINode &PN) {
- enqueueUsers(PN);
- return false;
- }
-
- bool visitSelectInst(SelectInst &SI) {
- enqueueUsers(SI);
- return false;
- }
-};
-
-} // end anonymous namespace
-
-/// Strip aggregate type wrapping.
-///
-/// This removes no-op aggregate types wrapping an underlying type. It will
-/// strip as many layers of types as it can without changing either the type
-/// size or the allocated size.
-static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
- if (Ty->isSingleValueType())
- return Ty;
-
- uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
- uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
-
- Type *InnerTy;
- if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
- InnerTy = ArrTy->getElementType();
- } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
- const StructLayout *SL = DL.getStructLayout(STy);
- unsigned Index = SL->getElementContainingOffset(0);
- InnerTy = STy->getElementType(Index);
- } else {
- return Ty;
- }
-
- if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
- TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
- return Ty;
-
- return stripAggregateTypeWrapping(DL, InnerTy);
-}
-
-/// Try to find a partition of the aggregate type passed in for a given
-/// offset and size.
-///
-/// This recurses through the aggregate type and tries to compute a subtype
-/// based on the offset and size. When the offset and size span a sub-section
-/// of an array, it will even compute a new array type for that sub-section,
-/// and the same for structs.
-///
-/// Note that this routine is very strict and tries to find a partition of the
-/// type which produces the *exact* right offset and size. It is not forgiving
-/// when the size or offset causes either end of the type-based partition to
-/// be off.
-/// Also, this is a best-effort routine. It is reasonable to give up and not
-/// return a type if necessary.
-static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
- uint64_t Size) {
- if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
- return stripAggregateTypeWrapping(DL, Ty);
- if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
- (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
- return nullptr;
-
- if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
- Type *ElementTy;
- uint64_t TyNumElements;
- if (auto *AT = dyn_cast<ArrayType>(Ty)) {
- ElementTy = AT->getElementType();
- TyNumElements = AT->getNumElements();
- } else {
- // FIXME: This isn't right for vectors with non-byte-sized or
- // non-power-of-two sized elements.
- auto *VT = cast<FixedVectorType>(Ty);
- ElementTy = VT->getElementType();
- TyNumElements = VT->getNumElements();
- }
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
- uint64_t NumSkippedElements = Offset / ElementSize;
- if (NumSkippedElements >= TyNumElements)
- return nullptr;
- Offset -= NumSkippedElements * ElementSize;
-
- // First check if we need to recurse.
- if (Offset > 0 || Size < ElementSize) {
- // Bail if the partition ends in a different array element.
- if ((Offset + Size) > ElementSize)
- return nullptr;
- // Recurse through the element type trying to peel off offset bytes.
- return getTypePartition(DL, ElementTy, Offset, Size);
- }
- assert(Offset == 0);
-
- if (Size == ElementSize)
- return stripAggregateTypeWrapping(DL, ElementTy);
- assert(Size > ElementSize);
- uint64_t NumElements = Size / ElementSize;
- if (NumElements * ElementSize != Size)
- return nullptr;
- return ArrayType::get(ElementTy, NumElements);
- }
-
- StructType *STy = dyn_cast<StructType>(Ty);
- if (!STy)
- return nullptr;
-
- const StructLayout *SL = DL.getStructLayout(STy);
- if (Offset >= SL->getSizeInBytes())
- return nullptr;
- uint64_t EndOffset = Offset + Size;
- if (EndOffset > SL->getSizeInBytes())
- return nullptr;
-
- unsigned Index = SL->getElementContainingOffset(Offset);
- Offset -= SL->getElementOffset(Index);
-
- Type *ElementTy = STy->getElementType(Index);
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
- if (Offset >= ElementSize)
- return nullptr; // The offset points into alignment padding.
-
- // See if any partition must be contained by the element.
- if (Offset > 0 || Size < ElementSize) {
- if ((Offset + Size) > ElementSize)
- return nullptr;
- return getTypePartition(DL, ElementTy, Offset, Size);
- }
- assert(Offset == 0);
-
- if (Size == ElementSize)
- return stripAggregateTypeWrapping(DL, ElementTy);
-
- StructType::element_iterator EI = STy->element_begin() + Index,
- EE = STy->element_end();
- if (EndOffset < SL->getSizeInBytes()) {
- unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
- if (Index == EndIndex)
- return nullptr; // Within a single element and its padding.
-
- // Don't try to form "natural" types if the elements don't line up with the
- // expected size.
- // FIXME: We could potentially recurse down through the last element in the
- // sub-struct to find a natural end point.
- if (SL->getElementOffset(EndIndex) != EndOffset)
- return nullptr;
-
- assert(Index < EndIndex);
- EE = STy->element_begin() + EndIndex;
- }
-
- // Try to build up a sub-structure.
- StructType *SubTy =
- StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
- const StructLayout *SubSL = DL.getStructLayout(SubTy);
- if (Size != SubSL->getSizeInBytes())
- return nullptr; // The sub-struct doesn't have quite the size needed.
-
- return SubTy;
-}
-
-/// Pre-split loads and stores to simplify rewriting.
-///
-/// We want to break up the splittable load+store pairs as much as
-/// possible. This is important to do as a preprocessing step, as once we
-/// start rewriting the accesses to partitions of the alloca we lose the
-/// necessary information to correctly split apart paired loads and stores
-/// which both point into this alloca. The case to consider is something like
-/// the following:
-///
-/// %a = alloca [12 x i8]
-/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
-/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
-/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
-/// %iptr1 = bitcast i8* %gep1 to i64*
-/// %iptr2 = bitcast i8* %gep2 to i64*
-/// %fptr1 = bitcast i8* %gep1 to float*
-/// %fptr2 = bitcast i8* %gep2 to float*
-/// %fptr3 = bitcast i8* %gep3 to float*
-/// store float 0.0, float* %fptr1
-/// store float 1.0, float* %fptr2
-/// %v = load i64* %iptr1
-/// store i64 %v, i64* %iptr2
-/// %f1 = load float* %fptr2
-/// %f2 = load float* %fptr3
-///
-/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
-/// promote everything so we recover the 2 SSA values that should have been
-/// there all along.
-///
-/// \returns true if any changes are made.
-bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
- LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
-
- // Track the loads and stores which are candidates for pre-splitting here, in
- // the order they first appear during the partition scan. These give stable
- // iteration order and a basis for tracking which loads and stores we
- // actually split.
- SmallVector<LoadInst *, 4> Loads;
- SmallVector<StoreInst *, 4> Stores;
-
- // We need to accumulate the splits required of each load or store where we
- // can find them via a direct lookup. This is important to cross-check loads
- // and stores against each other. We also track the slice so that we can kill
- // all the slices that end up split.
- struct SplitOffsets {
- Slice *S;
- std::vector<uint64_t> Splits;
- };
- SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
-
- // Track loads out of this alloca which cannot, for any reason, be pre-split.
- // This is important as we also cannot pre-split stores of those loads!
- // FIXME: This is all pretty gross. It means that we can be more aggressive
- // in pre-splitting when the load feeding the store happens to come from
- // a separate alloca. Put another way, the effectiveness of SROA would be
- // decreased by a frontend which just concatenated all of its local allocas
- // into one big flat alloca. But defeating such patterns is exactly the job
-  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
-  // change store pre-splitting to actually force pre-splitting of the load
- // that feeds it *and all stores*. That makes pre-splitting much harder, but
- // maybe it would make it more principled?
- SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
-
- LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
- for (auto &P : AS.partitions()) {
- for (Slice &S : P) {
- Instruction *I = cast<Instruction>(S.getUse()->getUser());
- if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
- // If this is a load we have to track that it can't participate in any
- // pre-splitting. If this is a store of a load we have to track that
- // that load also can't participate in any pre-splitting.
- if (auto *LI = dyn_cast<LoadInst>(I))
- UnsplittableLoads.insert(LI);
- else if (auto *SI = dyn_cast<StoreInst>(I))
- if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
- UnsplittableLoads.insert(LI);
- continue;
- }
- assert(P.endOffset() > S.beginOffset() &&
- "Empty or backwards partition!");
-
- // Determine if this is a pre-splittable slice.
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- assert(!LI->isVolatile() && "Cannot split volatile loads!");
-
- // The load must be used exclusively to store into other pointers for
- // us to be able to arbitrarily pre-split it. The stores must also be
- // simple to avoid changing semantics.
- auto IsLoadSimplyStored = [](LoadInst *LI) {
- for (User *LU : LI->users()) {
- auto *SI = dyn_cast<StoreInst>(LU);
- if (!SI || !SI->isSimple())
- return false;
- }
- return true;
- };
- if (!IsLoadSimplyStored(LI)) {
- UnsplittableLoads.insert(LI);
- continue;
- }
-
- Loads.push_back(LI);
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
- // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
- continue;
- auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
- if (!StoredLoad || !StoredLoad->isSimple())
- continue;
- assert(!SI->isVolatile() && "Cannot split volatile stores!");
-
- Stores.push_back(SI);
- } else {
- // Other uses cannot be pre-split.
- continue;
- }
-
- // Record the initial split.
- LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
- auto &Offsets = SplitOffsetsMap[I];
- assert(Offsets.Splits.empty() &&
- "Should not have splits the first time we see an instruction!");
- Offsets.S = &S;
- Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
- }
-
- // Now scan the already split slices, and add a split for any of them which
- // we're going to pre-split.
- for (Slice *S : P.splitSliceTails()) {
- auto SplitOffsetsMapI =
- SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
- if (SplitOffsetsMapI == SplitOffsetsMap.end())
- continue;
- auto &Offsets = SplitOffsetsMapI->second;
-
- assert(Offsets.S == S && "Found a mismatched slice!");
- assert(!Offsets.Splits.empty() &&
- "Cannot have an empty set of splits on the second partition!");
- assert(Offsets.Splits.back() ==
- P.beginOffset() - Offsets.S->beginOffset() &&
- "Previous split does not end where this one begins!");
-
- // Record each split. The last partition's end isn't needed as the size
- // of the slice dictates that.
- if (S->endOffset() > P.endOffset())
- Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
- }
- }
-
- // We may have split loads where some of their stores are split stores. For
- // such loads and stores, we can only pre-split them if their splits exactly
- // match relative to their starting offset. We have to verify this prior to
- // any rewriting.
+ }
+
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NewPN);
+ GEPI.eraseFromParent();
+ Visited.insert(NewPN);
+ enqueueUsers(*NewPN);
+
+ LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
+ dbgs() << "\n " << *In;
+ dbgs() << "\n " << *NewPN << '\n');
+
+ return true;
+ }
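+
+  // For example (hypothetical IR): a GEP whose pointer operand is a phi in
+  // the same block, such as
+  //   %phi = phi i64* [ %a, %bb1 ], [ %b, %bb2 ]
+  //   %gep = getelementptr inbounds i64, i64* %phi, i64 1
+  // is rewritten above into per-incoming GEPs feeding a new phi, roughly
+  //   %phi.sroa.phi = phi i64* [ %a.sroa.gep, %bb1 ], [ %b.sroa.gep, %bb2 ]
+  // with each new GEP emitted immediately after its incoming value's
+  // definition.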
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (isa<SelectInst>(GEPI.getPointerOperand()) &&
+ foldGEPSelect(GEPI))
+ return true;
+
+ if (isa<PHINode>(GEPI.getPointerOperand()) &&
+ foldGEPPhi(GEPI))
+ return true;
+
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
+/// Strip aggregate type wrapping.
+///
+/// This removes no-op aggregate types wrapping an underlying type. It will
+/// strip as many layers of types as it can without changing either the type
+/// size or the allocated size.
+static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
+ if (Ty->isSingleValueType())
+ return Ty;
+
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
+
+ Type *InnerTy;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ InnerTy = ArrTy->getElementType();
+ } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Index = SL->getElementContainingOffset(0);
+ InnerTy = STy->getElementType(Index);
+ } else {
+ return Ty;
+ }
+
+ if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
+ return Ty;
+
+ return stripAggregateTypeWrapping(DL, InnerTy);
+}
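+
+// For example (hypothetical types): { { i32 } } and [1 x i32] both strip down
+// to i32, because removing the wrapper changes neither the allocated size nor
+// the bit size, whereas { i32, i32 } is returned unchanged since its first
+// element is strictly smaller than the whole struct.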
+
+/// Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset causes either end of the type-based partition to
+/// be off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
+static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
+ uint64_t Size) {
+ if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
+ return stripAggregateTypeWrapping(DL, Ty);
+ if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
+ (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
+ return nullptr;
+
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+ Type *ElementTy;
+ uint64_t TyNumElements;
+ if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+ ElementTy = AT->getElementType();
+ TyNumElements = AT->getNumElements();
+ } else {
+ // FIXME: This isn't right for vectors with non-byte-sized or
+ // non-power-of-two sized elements.
+ auto *VT = cast<FixedVectorType>(Ty);
+ ElementTy = VT->getElementType();
+ TyNumElements = VT->getNumElements();
+ }
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (NumSkippedElements >= TyNumElements)
+ return nullptr;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return nullptr;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return nullptr;
+
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return nullptr;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return nullptr;
+
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
+ if (Offset >= ElementSize)
+ return nullptr; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return nullptr;
+ return getTypePartition(DL, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return stripAggregateTypeWrapping(DL, ElementTy);
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return nullptr; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return nullptr;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ StructType *SubTy =
+ StructType::get(STy->getContext(), makeArrayRef(EI, EE), STy->isPacked());
+ const StructLayout *SubSL = DL.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return nullptr; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
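+
+// A worked example (hypothetical struct, typical data layout): for the type
+// { i32, i32, i64 } with fields at byte offsets 0, 4 and 8 and an alloc size
+// of 16, an (Offset, Size) of (8, 8) yields i64, (0, 8) rebuilds the leading
+// sub-struct { i32, i32 }, and (2, 4) yields nullptr because no type-based
+// boundary produces exactly that range.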
+
+/// Pre-split loads and stores to simplify rewriting.
+///
+/// We want to break up the splittable load+store pairs as much as
+/// possible. This is important to do as a preprocessing step, as once we
+/// start rewriting the accesses to partitions of the alloca we lose the
+/// necessary information to correctly split apart paired loads and stores
+/// which both point into this alloca. The case to consider is something like
+/// the following:
+///
+/// %a = alloca [12 x i8]
+/// %gep1 = getelementptr [12 x i8]* %a, i32 0, i32 0
+/// %gep2 = getelementptr [12 x i8]* %a, i32 0, i32 4
+/// %gep3 = getelementptr [12 x i8]* %a, i32 0, i32 8
+/// %iptr1 = bitcast i8* %gep1 to i64*
+/// %iptr2 = bitcast i8* %gep2 to i64*
+/// %fptr1 = bitcast i8* %gep1 to float*
+/// %fptr2 = bitcast i8* %gep2 to float*
+/// %fptr3 = bitcast i8* %gep3 to float*
+/// store float 0.0, float* %fptr1
+/// store float 1.0, float* %fptr2
+/// %v = load i64* %iptr1
+/// store i64 %v, i64* %iptr2
+/// %f1 = load float* %fptr2
+/// %f2 = load float* %fptr3
+///
+/// Here we want to form 3 partitions of the alloca, each 4 bytes large, and
+/// promote everything so we recover the 2 SSA values that should have been
+/// there all along.
+///
+/// \returns true if any changes are made.
+bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
+ LLVM_DEBUG(dbgs() << "Pre-splitting loads and stores\n");
+
+ // Track the loads and stores which are candidates for pre-splitting here, in
+ // the order they first appear during the partition scan. These give stable
+ // iteration order and a basis for tracking which loads and stores we
+ // actually split.
+ SmallVector<LoadInst *, 4> Loads;
+ SmallVector<StoreInst *, 4> Stores;
+
+ // We need to accumulate the splits required of each load or store where we
+ // can find them via a direct lookup. This is important to cross-check loads
+ // and stores against each other. We also track the slice so that we can kill
+ // all the slices that end up split.
+ struct SplitOffsets {
+ Slice *S;
+ std::vector<uint64_t> Splits;
+ };
+ SmallDenseMap<Instruction *, SplitOffsets, 8> SplitOffsetsMap;
+
+ // Track loads out of this alloca which cannot, for any reason, be pre-split.
+ // This is important as we also cannot pre-split stores of those loads!
+ // FIXME: This is all pretty gross. It means that we can be more aggressive
+ // in pre-splitting when the load feeding the store happens to come from
+ // a separate alloca. Put another way, the effectiveness of SROA would be
+ // decreased by a frontend which just concatenated all of its local allocas
+ // into one big flat alloca. But defeating such patterns is exactly the job
+  // SROA is tasked with! Sadly, to not have this discrepancy we would have to
+  // change store pre-splitting to actually force pre-splitting of the load
+ // that feeds it *and all stores*. That makes pre-splitting much harder, but
+ // maybe it would make it more principled?
+ SmallPtrSet<LoadInst *, 8> UnsplittableLoads;
+
+ LLVM_DEBUG(dbgs() << " Searching for candidate loads and stores\n");
+ for (auto &P : AS.partitions()) {
+ for (Slice &S : P) {
+ Instruction *I = cast<Instruction>(S.getUse()->getUser());
+ if (!S.isSplittable() || S.endOffset() <= P.endOffset()) {
+ // If this is a load we have to track that it can't participate in any
+ // pre-splitting. If this is a store of a load we have to track that
+ // that load also can't participate in any pre-splitting.
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ UnsplittableLoads.insert(LI);
+ else if (auto *SI = dyn_cast<StoreInst>(I))
+ if (auto *LI = dyn_cast<LoadInst>(SI->getValueOperand()))
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+ assert(P.endOffset() > S.beginOffset() &&
+ "Empty or backwards partition!");
+
+ // Determine if this is a pre-splittable slice.
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ assert(!LI->isVolatile() && "Cannot split volatile loads!");
+
+ // The load must be used exclusively to store into other pointers for
+ // us to be able to arbitrarily pre-split it. The stores must also be
+ // simple to avoid changing semantics.
+ auto IsLoadSimplyStored = [](LoadInst *LI) {
+ for (User *LU : LI->users()) {
+ auto *SI = dyn_cast<StoreInst>(LU);
+ if (!SI || !SI->isSimple())
+ return false;
+ }
+ return true;
+ };
+ if (!IsLoadSimplyStored(LI)) {
+ UnsplittableLoads.insert(LI);
+ continue;
+ }
+
+ Loads.push_back(LI);
+ } else if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (S.getUse() != &SI->getOperandUse(SI->getPointerOperandIndex()))
+ // Skip stores *of* pointers. FIXME: This shouldn't even be possible!
+ continue;
+ auto *StoredLoad = dyn_cast<LoadInst>(SI->getValueOperand());
+ if (!StoredLoad || !StoredLoad->isSimple())
+ continue;
+ assert(!SI->isVolatile() && "Cannot split volatile stores!");
+
+ Stores.push_back(SI);
+ } else {
+ // Other uses cannot be pre-split.
+ continue;
+ }
+
+ // Record the initial split.
+ LLVM_DEBUG(dbgs() << " Candidate: " << *I << "\n");
+ auto &Offsets = SplitOffsetsMap[I];
+ assert(Offsets.Splits.empty() &&
+ "Should not have splits the first time we see an instruction!");
+ Offsets.S = &S;
+ Offsets.Splits.push_back(P.endOffset() - S.beginOffset());
+ }
+
+ // Now scan the already split slices, and add a split for any of them which
+ // we're going to pre-split.
+ for (Slice *S : P.splitSliceTails()) {
+ auto SplitOffsetsMapI =
+ SplitOffsetsMap.find(cast<Instruction>(S->getUse()->getUser()));
+ if (SplitOffsetsMapI == SplitOffsetsMap.end())
+ continue;
+ auto &Offsets = SplitOffsetsMapI->second;
+
+ assert(Offsets.S == S && "Found a mismatched slice!");
+ assert(!Offsets.Splits.empty() &&
+ "Cannot have an empty set of splits on the second partition!");
+ assert(Offsets.Splits.back() ==
+ P.beginOffset() - Offsets.S->beginOffset() &&
+ "Previous split does not end where this one begins!");
+
+ // Record each split. The last partition's end isn't needed as the size
+ // of the slice dictates that.
+ if (S->endOffset() > P.endOffset())
+ Offsets.Splits.push_back(P.endOffset() - Offsets.S->beginOffset());
+ }
+ }
+
+ // We may have split loads where some of their stores are split stores. For
+ // such loads and stores, we can only pre-split them if their splits exactly
+ // match relative to their starting offset. We have to verify this prior to
+ // any rewriting.
llvm::erase_if(Stores, [&UnsplittableLoads, &SplitOffsetsMap](StoreInst *SI) {
// Lookup the load we are storing in our map of split
// offsets.
@@ -3909,25 +3909,25 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
// and this store can't be pre-split.
if (UnsplittableLoads.count(LI))
return true;
-
+
auto LoadOffsetsI = SplitOffsetsMap.find(LI);
if (LoadOffsetsI == SplitOffsetsMap.end())
return false; // Unrelated loads are definitely safe.
auto &LoadOffsets = LoadOffsetsI->second;
-
+
// Now lookup the store's offsets.
auto &StoreOffsets = SplitOffsetsMap[SI];
-
+
// If the relative offsets of each split in the load and
// store match exactly, then we can split them and we
// don't need to remove them here.
if (LoadOffsets.Splits == StoreOffsets.Splits)
return false;
-
+
LLVM_DEBUG(dbgs() << " Mismatched splits for load and store:\n"
<< " " << *LI << "\n"
<< " " << *SI << "\n");
-
+
// We've found a store and load that we need to split
// with mismatched relative splits. Just give up on them
// and remove both instructions from our list of
@@ -3935,330 +3935,330 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
UnsplittableLoads.insert(LI);
return true;
});
- // Now we have to go *back* through all the stores, because a later store may
- // have caused an earlier store's load to become unsplittable and if it is
- // unsplittable for the later store, then we can't rely on it being split in
- // the earlier store either.
+ // Now we have to go *back* through all the stores, because a later store may
+ // have caused an earlier store's load to become unsplittable and if it is
+ // unsplittable for the later store, then we can't rely on it being split in
+ // the earlier store either.
llvm::erase_if(Stores, [&UnsplittableLoads](StoreInst *SI) {
auto *LI = cast<LoadInst>(SI->getValueOperand());
return UnsplittableLoads.count(LI);
});
- // Once we've established all the loads that can't be split for some reason,
- // filter any that made it into our list out.
+ // Once we've established all the loads that can't be split for some reason,
+ // filter any that made it into our list out.
llvm::erase_if(Loads, [&UnsplittableLoads](LoadInst *LI) {
return UnsplittableLoads.count(LI);
});
-
- // If no loads or stores are left, there is no pre-splitting to be done for
- // this alloca.
- if (Loads.empty() && Stores.empty())
- return false;
-
- // From here on, we can't fail and will be building new accesses, so rig up
- // an IR builder.
- IRBuilderTy IRB(&AI);
-
- // Collect the new slices which we will merge into the alloca slices.
- SmallVector<Slice, 4> NewSlices;
-
- // Track any allocas we end up splitting loads and stores for so we iterate
- // on them.
- SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
-
- // At this point, we have collected all of the loads and stores we can
- // pre-split, and the specific splits needed for them. We actually do the
-  // splitting in a specific order so that we handle the case where one of
-  // the loads is the value operand to one of the stores.
- //
- // First, we rewrite all of the split loads, and just accumulate each split
- // load in a parallel structure. We also build the slices for them and append
- // them to the alloca slices.
- SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
- std::vector<LoadInst *> SplitLoads;
- const DataLayout &DL = AI.getModule()->getDataLayout();
- for (LoadInst *LI : Loads) {
- SplitLoads.clear();
-
- IntegerType *Ty = cast<IntegerType>(LI->getType());
- uint64_t LoadSize = Ty->getBitWidth() / 8;
- assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
-
- auto &Offsets = SplitOffsetsMap[LI];
- assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
- "Slice size should always match load size exactly!");
- uint64_t BaseOffset = Offsets.S->beginOffset();
- assert(BaseOffset + LoadSize > BaseOffset &&
- "Cannot represent alloca access size using 64-bit integers!");
-
- Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
- IRB.SetInsertPoint(LI);
-
- LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
-
- uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
- int Idx = 0, Size = Offsets.Splits.size();
- for (;;) {
- auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto AS = LI->getPointerAddressSpace();
- auto *PartPtrTy = PartTy->getPointerTo(AS);
- LoadInst *PLoad = IRB.CreateAlignedLoad(
- PartTy,
- getAdjustedPtr(IRB, DL, BasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- PartPtrTy, BasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset),
- /*IsVolatile*/ false, LI->getName());
- PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
-
- // Append this load onto the list of split loads so we can find it later
- // to rewrite the stores.
- SplitLoads.push_back(PLoad);
-
- // Now build a new slice for the alloca.
- NewSlices.push_back(
- Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
- &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
- /*IsSplittable*/ false));
- LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset()
- << "): " << *PLoad << "\n");
-
- // See if we've handled all the splits.
- if (Idx >= Size)
- break;
-
- // Setup the next partition.
- PartOffset = Offsets.Splits[Idx];
- ++Idx;
- PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
- }
-
- // Now that we have the split loads, do the slow walk over all uses of the
- // load and rewrite them as split stores, or save the split loads to use
- // below if the store is going to be split there anyways.
- bool DeferredStores = false;
- for (User *LU : LI->users()) {
- StoreInst *SI = cast<StoreInst>(LU);
- if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
- DeferredStores = true;
- LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
- << "\n");
- continue;
- }
-
- Value *StoreBasePtr = SI->getPointerOperand();
- IRB.SetInsertPoint(SI);
-
- LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
-
- for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
- LoadInst *PLoad = SplitLoads[Idx];
- uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
- auto *PartPtrTy =
- PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
-
- auto AS = SI->getPointerAddressSpace();
- StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad,
- getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- PartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset),
- /*IsVolatile*/ false);
- PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group});
- LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
- }
-
- // We want to immediately iterate on any allocas impacted by splitting
- // this store, and we have to track any promotable alloca (indicated by
- // a direct store) as needing to be resplit because it is no longer
- // promotable.
- if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
- ResplitPromotableAllocas.insert(OtherAI);
- Worklist.insert(OtherAI);
- } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
- StoreBasePtr->stripInBoundsOffsets())) {
- Worklist.insert(OtherAI);
- }
-
- // Mark the original store as dead.
+
+ // If no loads or stores are left, there is no pre-splitting to be done for
+ // this alloca.
+ if (Loads.empty() && Stores.empty())
+ return false;
+
+ // From here on, we can't fail and will be building new accesses, so rig up
+ // an IR builder.
+ IRBuilderTy IRB(&AI);
+
+ // Collect the new slices which we will merge into the alloca slices.
+ SmallVector<Slice, 4> NewSlices;
+
+ // Track any allocas we end up splitting loads and stores for so we iterate
+ // on them.
+ SmallPtrSet<AllocaInst *, 4> ResplitPromotableAllocas;
+
+ // At this point, we have collected all of the loads and stores we can
+ // pre-split, and the specific splits needed for them. We actually do the
+  // splitting in a specific order so that we handle the case where one of
+  // the loads is the value operand to one of the stores.
+ //
+ // First, we rewrite all of the split loads, and just accumulate each split
+ // load in a parallel structure. We also build the slices for them and append
+ // them to the alloca slices.
+ SmallDenseMap<LoadInst *, std::vector<LoadInst *>, 1> SplitLoadsMap;
+ std::vector<LoadInst *> SplitLoads;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ for (LoadInst *LI : Loads) {
+ SplitLoads.clear();
+
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t LoadSize = Ty->getBitWidth() / 8;
+ assert(LoadSize > 0 && "Cannot have a zero-sized integer load!");
+
+ auto &Offsets = SplitOffsetsMap[LI];
+ assert(LoadSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + LoadSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Instruction *BasePtr = cast<Instruction>(LI->getPointerOperand());
+ IRB.SetInsertPoint(LI);
+
+ LLVM_DEBUG(dbgs() << " Splitting load: " << *LI << "\n");
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
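+    // Worked example (hypothetical sizes): for LoadSize == 16 and
+    // Offsets.Splits == {4, 8}, the loop below emits an i32 load at +0, an
+    // i32 load at +4 and an i64 load at +8; each PartSize is the distance to
+    // the next split, or to LoadSize for the final part.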
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto AS = LI->getPointerAddressSpace();
+ auto *PartPtrTy = PartTy->getPointerTo(AS);
+ LoadInst *PLoad = IRB.CreateAlignedLoad(
+ PartTy,
+ getAdjustedPtr(IRB, DL, BasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ PartPtrTy, BasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset),
+ /*IsVolatile*/ false, LI->getName());
+ PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+
+ // Append this load onto the list of split loads so we can find it later
+ // to rewrite the stores.
+ SplitLoads.push_back(PLoad);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PLoad->getOperandUse(PLoad->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PLoad << "\n");
+
+ // See if we've handled all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : LoadSize) - PartOffset;
+ }
+
+ // Now that we have the split loads, do the slow walk over all uses of the
+ // load and rewrite them as split stores, or save the split loads to use
+ // below if the store is going to be split there anyways.
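+    // Illustration (hypothetical case): a store with no recorded splits of
+    // its own gets one partial store per partial load at the same relative
+    // offsets, while a store that appears in SplitOffsetsMap is deferred to
+    // the dedicated store-splitting loop further down.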
+ bool DeferredStores = false;
+ for (User *LU : LI->users()) {
+ StoreInst *SI = cast<StoreInst>(LU);
+ if (!Stores.empty() && SplitOffsetsMap.count(SI)) {
+ DeferredStores = true;
+ LLVM_DEBUG(dbgs() << " Deferred splitting of store: " << *SI
+ << "\n");
+ continue;
+ }
+
+ Value *StoreBasePtr = SI->getPointerOperand();
+ IRB.SetInsertPoint(SI);
+
+ LLVM_DEBUG(dbgs() << " Splitting store of load: " << *SI << "\n");
+
+ for (int Idx = 0, Size = SplitLoads.size(); Idx < Size; ++Idx) {
+ LoadInst *PLoad = SplitLoads[Idx];
+ uint64_t PartOffset = Idx == 0 ? 0 : Offsets.Splits[Idx - 1];
+ auto *PartPtrTy =
+ PLoad->getType()->getPointerTo(SI->getPointerAddressSpace());
+
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ PartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset),
+ /*IsVolatile*/ false);
+ PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group});
+ LLVM_DEBUG(dbgs() << " +" << PartOffset << ":" << *PStore << "\n");
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this store, and we have to track any promotable alloca (indicated by
+ // a direct store) as needing to be resplit because it is no longer
+ // promotable.
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(StoreBasePtr)) {
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ StoreBasePtr->stripInBoundsOffsets())) {
+ Worklist.insert(OtherAI);
+ }
+
+ // Mark the original store as dead.
DeadInsts.push_back(SI);
- }
-
- // Save the split loads if there are deferred stores among the users.
- if (DeferredStores)
- SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
-
- // Mark the original load as dead and kill the original slice.
+ }
+
+ // Save the split loads if there are deferred stores among the users.
+ if (DeferredStores)
+ SplitLoadsMap.insert(std::make_pair(LI, std::move(SplitLoads)));
+
+ // Mark the original load as dead and kill the original slice.
DeadInsts.push_back(LI);
- Offsets.S->kill();
- }
-
- // Second, we rewrite all of the split stores. At this point, we know that
- // all loads from this alloca have been split already. For stores of such
- // loads, we can simply look up the pre-existing split loads. For stores of
- // other loads, we split those loads first and then write split stores of
- // them.
- for (StoreInst *SI : Stores) {
- auto *LI = cast<LoadInst>(SI->getValueOperand());
- IntegerType *Ty = cast<IntegerType>(LI->getType());
- uint64_t StoreSize = Ty->getBitWidth() / 8;
- assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
-
- auto &Offsets = SplitOffsetsMap[SI];
- assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
- "Slice size should always match load size exactly!");
- uint64_t BaseOffset = Offsets.S->beginOffset();
- assert(BaseOffset + StoreSize > BaseOffset &&
- "Cannot represent alloca access size using 64-bit integers!");
-
- Value *LoadBasePtr = LI->getPointerOperand();
- Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
-
- LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
-
- // Check whether we have an already split load.
- auto SplitLoadsMapI = SplitLoadsMap.find(LI);
- std::vector<LoadInst *> *SplitLoads = nullptr;
- if (SplitLoadsMapI != SplitLoadsMap.end()) {
- SplitLoads = &SplitLoadsMapI->second;
- assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
- "Too few split loads for the number of splits in the store!");
- } else {
- LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
- }
-
- uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
- int Idx = 0, Size = Offsets.Splits.size();
- for (;;) {
- auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
- auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
- auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
-
- // Either lookup a split load or create one.
- LoadInst *PLoad;
- if (SplitLoads) {
- PLoad = (*SplitLoads)[Idx];
- } else {
- IRB.SetInsertPoint(LI);
- auto AS = LI->getPointerAddressSpace();
- PLoad = IRB.CreateAlignedLoad(
- PartTy,
- getAdjustedPtr(IRB, DL, LoadBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- LoadPartPtrTy, LoadBasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset),
- /*IsVolatile*/ false, LI->getName());
- }
-
- // And store this partition.
- IRB.SetInsertPoint(SI);
- auto AS = SI->getPointerAddressSpace();
- StoreInst *PStore = IRB.CreateAlignedStore(
- PLoad,
- getAdjustedPtr(IRB, DL, StoreBasePtr,
- APInt(DL.getIndexSizeInBits(AS), PartOffset),
- StorePartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset),
- /*IsVolatile*/ false);
-
- // Now build a new slice for the alloca.
- NewSlices.push_back(
- Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
- &PStore->getOperandUse(PStore->getPointerOperandIndex()),
- /*IsSplittable*/ false));
- LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
- << ", " << NewSlices.back().endOffset()
- << "): " << *PStore << "\n");
- if (!SplitLoads) {
- LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
- }
-
- // See if we've finished all the splits.
- if (Idx >= Size)
- break;
-
- // Setup the next partition.
- PartOffset = Offsets.Splits[Idx];
- ++Idx;
- PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
- }
-
- // We want to immediately iterate on any allocas impacted by splitting
- // this load, which is only relevant if it isn't a load of this alloca and
- // thus we didn't already split the loads above. We also have to keep track
- // of any promotable allocas we split loads on as they can no longer be
- // promoted.
- if (!SplitLoads) {
- if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
- assert(OtherAI != &AI && "We can't re-split our own alloca!");
- ResplitPromotableAllocas.insert(OtherAI);
- Worklist.insert(OtherAI);
- } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
- LoadBasePtr->stripInBoundsOffsets())) {
- assert(OtherAI != &AI && "We can't re-split our own alloca!");
- Worklist.insert(OtherAI);
- }
- }
-
- // Mark the original store as dead now that we've split it up and kill its
- // slice. Note that we leave the original load in place unless this store
- // was its only use. It may in turn be split up if it is an alloca load
- // for some other alloca, but it may be a normal load. This may introduce
- // redundant loads, but where those can be merged the rest of the optimizer
- // should handle the merging, and this uncovers SSA splits which is more
- // important. In practice, the original loads will almost always be fully
- // split and removed eventually, and the splits will be merged by any
- // trivial CSE, including instcombine.
- if (LI->hasOneUse()) {
- assert(*LI->user_begin() == SI && "Single use isn't this store!");
+ Offsets.S->kill();
+ }
+
+ // Second, we rewrite all of the split stores. At this point, we know that
+ // all loads from this alloca have been split already. For stores of such
+ // loads, we can simply look up the pre-existing split loads. For stores of
+ // other loads, we split those loads first and then write split stores of
+ // them.
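+  // Illustration (hypothetical case): a split store whose value comes from a
+  // load split above simply reuses the entries saved in SplitLoadsMap, while
+  // a store of some other (unsplit) load gets fresh partial loads created
+  // from that load's pointer before each partial store is emitted.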
+ for (StoreInst *SI : Stores) {
+ auto *LI = cast<LoadInst>(SI->getValueOperand());
+ IntegerType *Ty = cast<IntegerType>(LI->getType());
+ uint64_t StoreSize = Ty->getBitWidth() / 8;
+ assert(StoreSize > 0 && "Cannot have a zero-sized integer store!");
+
+ auto &Offsets = SplitOffsetsMap[SI];
+ assert(StoreSize == Offsets.S->endOffset() - Offsets.S->beginOffset() &&
+ "Slice size should always match load size exactly!");
+ uint64_t BaseOffset = Offsets.S->beginOffset();
+ assert(BaseOffset + StoreSize > BaseOffset &&
+ "Cannot represent alloca access size using 64-bit integers!");
+
+ Value *LoadBasePtr = LI->getPointerOperand();
+ Instruction *StoreBasePtr = cast<Instruction>(SI->getPointerOperand());
+
+ LLVM_DEBUG(dbgs() << " Splitting store: " << *SI << "\n");
+
+ // Check whether we have an already split load.
+ auto SplitLoadsMapI = SplitLoadsMap.find(LI);
+ std::vector<LoadInst *> *SplitLoads = nullptr;
+ if (SplitLoadsMapI != SplitLoadsMap.end()) {
+ SplitLoads = &SplitLoadsMapI->second;
+ assert(SplitLoads->size() == Offsets.Splits.size() + 1 &&
+ "Too few split loads for the number of splits in the store!");
+ } else {
+ LLVM_DEBUG(dbgs() << " of load: " << *LI << "\n");
+ }
+
+ uint64_t PartOffset = 0, PartSize = Offsets.Splits.front();
+ int Idx = 0, Size = Offsets.Splits.size();
+ for (;;) {
+ auto *PartTy = Type::getIntNTy(Ty->getContext(), PartSize * 8);
+ auto *LoadPartPtrTy = PartTy->getPointerTo(LI->getPointerAddressSpace());
+ auto *StorePartPtrTy = PartTy->getPointerTo(SI->getPointerAddressSpace());
+
+ // Either lookup a split load or create one.
+ LoadInst *PLoad;
+ if (SplitLoads) {
+ PLoad = (*SplitLoads)[Idx];
+ } else {
+ IRB.SetInsertPoint(LI);
+ auto AS = LI->getPointerAddressSpace();
+ PLoad = IRB.CreateAlignedLoad(
+ PartTy,
+ getAdjustedPtr(IRB, DL, LoadBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ LoadPartPtrTy, LoadBasePtr->getName() + "."),
+ getAdjustedAlignment(LI, PartOffset),
+ /*IsVolatile*/ false, LI->getName());
+ }
+
+ // And store this partition.
+ IRB.SetInsertPoint(SI);
+ auto AS = SI->getPointerAddressSpace();
+ StoreInst *PStore = IRB.CreateAlignedStore(
+ PLoad,
+ getAdjustedPtr(IRB, DL, StoreBasePtr,
+ APInt(DL.getIndexSizeInBits(AS), PartOffset),
+ StorePartPtrTy, StoreBasePtr->getName() + "."),
+ getAdjustedAlignment(SI, PartOffset),
+ /*IsVolatile*/ false);
+
+ // Now build a new slice for the alloca.
+ NewSlices.push_back(
+ Slice(BaseOffset + PartOffset, BaseOffset + PartOffset + PartSize,
+ &PStore->getOperandUse(PStore->getPointerOperandIndex()),
+ /*IsSplittable*/ false));
+ LLVM_DEBUG(dbgs() << " new slice [" << NewSlices.back().beginOffset()
+ << ", " << NewSlices.back().endOffset()
+ << "): " << *PStore << "\n");
+ if (!SplitLoads) {
+ LLVM_DEBUG(dbgs() << " of split load: " << *PLoad << "\n");
+ }
+
+ // See if we've finished all the splits.
+ if (Idx >= Size)
+ break;
+
+ // Setup the next partition.
+ PartOffset = Offsets.Splits[Idx];
+ ++Idx;
+ PartSize = (Idx < Size ? Offsets.Splits[Idx] : StoreSize) - PartOffset;
+ }
+
+ // We want to immediately iterate on any allocas impacted by splitting
+ // this load, which is only relevant if it isn't a load of this alloca and
+ // thus we didn't already split the loads above. We also have to keep track
+ // of any promotable allocas we split loads on as they can no longer be
+ // promoted.
+ if (!SplitLoads) {
+ if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(LoadBasePtr)) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ ResplitPromotableAllocas.insert(OtherAI);
+ Worklist.insert(OtherAI);
+ } else if (AllocaInst *OtherAI = dyn_cast<AllocaInst>(
+ LoadBasePtr->stripInBoundsOffsets())) {
+ assert(OtherAI != &AI && "We can't re-split our own alloca!");
+ Worklist.insert(OtherAI);
+ }
+ }
+
+ // Mark the original store as dead now that we've split it up and kill its
+ // slice. Note that we leave the original load in place unless this store
+ // was its only use. It may in turn be split up if it is an alloca load
+ // for some other alloca, but it may be a normal load. This may introduce
+ // redundant loads, but where those can be merged the rest of the optimizer
+ // should handle the merging, and this uncovers SSA splits which is more
+ // important. In practice, the original loads will almost always be fully
+ // split and removed eventually, and the splits will be merged by any
+ // trivial CSE, including instcombine.
+ if (LI->hasOneUse()) {
+ assert(*LI->user_begin() == SI && "Single use isn't this store!");
DeadInsts.push_back(LI);
- }
+ }
DeadInsts.push_back(SI);
- Offsets.S->kill();
- }
-
-  // Remove the killed slices that have been pre-split.
+ Offsets.S->kill();
+ }
+
+  // Remove the killed slices that have been pre-split.
llvm::erase_if(AS, [](const Slice &S) { return S.isDead(); });
-
- // Insert our new slices. This will sort and merge them into the sorted
- // sequence.
- AS.insert(NewSlices);
-
- LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
-#ifndef NDEBUG
- for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
- LLVM_DEBUG(AS.print(dbgs(), I, " "));
-#endif
-
-  // Finally, don't try to promote any allocas that now require re-splitting.
- // They have already been added to the worklist above.
+
+ // Insert our new slices. This will sort and merge them into the sorted
+ // sequence.
+ AS.insert(NewSlices);
+
+ LLVM_DEBUG(dbgs() << " Pre-split slices:\n");
+#ifndef NDEBUG
+ for (auto I = AS.begin(), E = AS.end(); I != E; ++I)
+ LLVM_DEBUG(AS.print(dbgs(), I, " "));
+#endif
+
+  // Finally, don't try to promote any allocas that now require re-splitting.
+ // They have already been added to the worklist above.
llvm::erase_if(PromotableAllocas, [&](AllocaInst *AI) {
return ResplitPromotableAllocas.count(AI);
});
-
- return true;
-}
-
-/// Rewrite an alloca partition's users.
-///
-/// This routine drives both of the rewriting goals of the SROA pass. It tries
-/// to rewrite uses of an alloca partition to be conducive for SSA value
-/// promotion. If the partition needs a new, more refined alloca, this will
-/// build that new alloca, preserving as much type information as possible, and
-/// rewrite the uses of the old alloca to point at the new one and have the
-/// appropriate new offsets. It also evaluates how successful the rewrite was
-/// at enabling promotion and if it was successful queues the alloca to be
-/// promoted.
-AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
- Partition &P) {
- // Try to compute a friendly type for this partition of the alloca. This
- // won't always succeed, in which case we fall back to a legal integer type
- // or an i8 array of an appropriate size.
- Type *SliceTy = nullptr;
- const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ return true;
+}
+
+/// Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive for SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
+ Partition &P) {
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *SliceTy = nullptr;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
std::pair<Type *, IntegerType *> CommonUseTy =
findCommonType(P.begin(), P.end(), P.endOffset());
// Do all uses operate on the same type?
@@ -4266,103 +4266,103 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (DL.getTypeAllocSize(CommonUseTy.first).getFixedSize() >= P.size())
SliceTy = CommonUseTy.first;
// If not, can we find an appropriate subtype in the original allocated type?
- if (!SliceTy)
- if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
- P.beginOffset(), P.size()))
- SliceTy = TypePartitionTy;
+ if (!SliceTy)
+ if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
+ P.beginOffset(), P.size()))
+ SliceTy = TypePartitionTy;
// If still not, can we use the largest bitwidth integer type used?
if (!SliceTy && CommonUseTy.second)
if (DL.getTypeAllocSize(CommonUseTy.second).getFixedSize() >= P.size())
SliceTy = CommonUseTy.second;
- if ((!SliceTy || (SliceTy->isArrayTy() &&
- SliceTy->getArrayElementType()->isIntegerTy())) &&
- DL.isLegalInteger(P.size() * 8))
- SliceTy = Type::getIntNTy(*C, P.size() * 8);
- if (!SliceTy)
- SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
- assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
-
- bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
-
- VectorType *VecTy =
- IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
- if (VecTy)
- SliceTy = VecTy;
-
- // Check for the case where we're going to rewrite to a new alloca of the
- // exact same type as the original, and with the same access offsets. In that
- // case, re-use the existing alloca, but still run through the rewriter to
- // perform phi and select speculation.
- // P.beginOffset() can be non-zero even with the same type in a case with
- // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
- AllocaInst *NewAI;
- if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
- NewAI = &AI;
- // FIXME: We should be able to bail at this point with "nothing changed".
- // FIXME: We might want to defer PHI speculation until after here.
- // FIXME: return nullptr;
- } else {
- // Make sure the alignment is compatible with P.beginOffset().
- const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
- // If we will get at least this much alignment from the type alone, leave
- // the alloca's alignment unconstrained.
- const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
- NewAI = new AllocaInst(
- SliceTy, AI.getType()->getAddressSpace(), nullptr,
- IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
- AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
- // Copy the old AI debug location over to the new one.
- NewAI->setDebugLoc(AI.getDebugLoc());
- ++NumNewAllocas;
- }
-
- LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
- << "[" << P.beginOffset() << "," << P.endOffset()
- << ") to: " << *NewAI << "\n");
-
- // Track the high watermark on the worklist as it is only relevant for
- // promoted allocas. We will reset it to this point if the alloca is not in
- // fact scheduled for promotion.
- unsigned PPWOldSize = PostPromotionWorklist.size();
- unsigned NumUses = 0;
- SmallSetVector<PHINode *, 8> PHIUsers;
- SmallSetVector<SelectInst *, 8> SelectUsers;
-
- AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
- P.endOffset(), IsIntegerPromotable, VecTy,
- PHIUsers, SelectUsers);
- bool Promotable = true;
- for (Slice *S : P.splitSliceTails()) {
- Promotable &= Rewriter.visit(S);
- ++NumUses;
- }
- for (Slice &S : P) {
- Promotable &= Rewriter.visit(&S);
- ++NumUses;
- }
-
- NumAllocaPartitionUses += NumUses;
- MaxUsesPerAllocaPartition.updateMax(NumUses);
-
- // Now that we've processed all the slices in the new partition, check if any
- // PHIs or Selects would block promotion.
- for (PHINode *PHI : PHIUsers)
- if (!isSafePHIToSpeculate(*PHI)) {
- Promotable = false;
- PHIUsers.clear();
- SelectUsers.clear();
- break;
- }
-
- for (SelectInst *Sel : SelectUsers)
- if (!isSafeSelectToSpeculate(*Sel)) {
- Promotable = false;
- PHIUsers.clear();
- SelectUsers.clear();
- break;
- }
-
- if (Promotable) {
+ if ((!SliceTy || (SliceTy->isArrayTy() &&
+ SliceTy->getArrayElementType()->isIntegerTy())) &&
+ DL.isLegalInteger(P.size() * 8))
+ SliceTy = Type::getIntNTy(*C, P.size() * 8);
+ if (!SliceTy)
+ SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
+ assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
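+  // Roughly, the selection above tries: a type common to all uses that
+  // covers the partition, then a matching subtype of the original allocated
+  // type, then the widest integer type in use, then (also replacing an
+  // integer-array choice) a legal integer of the partition's size, and
+  // finally a [P.size() x i8] array.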
+
+ bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
+
+ VectorType *VecTy =
+ IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL);
+ if (VecTy)
+ SliceTy = VecTy;
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ // P.beginOffset() can be non-zero even with the same type in a case with
+ // out-of-bounds access (e.g. @PR35657 function in SROA/basictest.ll).
+ AllocaInst *NewAI;
+ if (SliceTy == AI.getAllocatedType() && P.beginOffset() == 0) {
+ NewAI = &AI;
+ // FIXME: We should be able to bail at this point with "nothing changed".
+ // FIXME: We might want to defer PHI speculation until after here.
+ // FIXME: return nullptr;
+ } else {
+ // Make sure the alignment is compatible with P.beginOffset().
+ const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
+ // If we will get at least this much alignment from the type alone, leave
+ // the alloca's alignment unconstrained.
+ const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
+ NewAI = new AllocaInst(
+ SliceTy, AI.getType()->getAddressSpace(), nullptr,
+ IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
+ AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
+ // Copy the old AI debug location over to the new one.
+ NewAI->setDebugLoc(AI.getDebugLoc());
+ ++NumNewAllocas;
+ }
+
+ LLVM_DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << P.beginOffset() << "," << P.endOffset()
+ << ") to: " << *NewAI << "\n");
+
+ // Track the high watermark on the worklist as it is only relevant for
+ // promoted allocas. We will reset it to this point if the alloca is not in
+ // fact scheduled for promotion.
+ unsigned PPWOldSize = PostPromotionWorklist.size();
+ unsigned NumUses = 0;
+ SmallSetVector<PHINode *, 8> PHIUsers;
+ SmallSetVector<SelectInst *, 8> SelectUsers;
+
+ AllocaSliceRewriter Rewriter(DL, AS, *this, AI, *NewAI, P.beginOffset(),
+ P.endOffset(), IsIntegerPromotable, VecTy,
+ PHIUsers, SelectUsers);
+ bool Promotable = true;
+ for (Slice *S : P.splitSliceTails()) {
+ Promotable &= Rewriter.visit(S);
+ ++NumUses;
+ }
+ for (Slice &S : P) {
+ Promotable &= Rewriter.visit(&S);
+ ++NumUses;
+ }
+
+ NumAllocaPartitionUses += NumUses;
+ MaxUsesPerAllocaPartition.updateMax(NumUses);
+
+ // Now that we've processed all the slices in the new partition, check if any
+ // PHIs or Selects would block promotion.
+ for (PHINode *PHI : PHIUsers)
+ if (!isSafePHIToSpeculate(*PHI)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ for (SelectInst *Sel : SelectUsers)
+ if (!isSafeSelectToSpeculate(*Sel)) {
+ Promotable = false;
+ PHIUsers.clear();
+ SelectUsers.clear();
+ break;
+ }
+
+ if (Promotable) {
for (Use *U : AS.getDeadUsesIfPromotable()) {
auto *OldInst = dyn_cast<Instruction>(U->get());
Value::dropDroppableUse(*U);
@@ -4370,190 +4370,190 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
if (isInstructionTriviallyDead(OldInst))
DeadInsts.push_back(OldInst);
}
- if (PHIUsers.empty() && SelectUsers.empty()) {
- // Promote the alloca.
- PromotableAllocas.push_back(NewAI);
- } else {
- // If we have either PHIs or Selects to speculate, add them to those
-      // worklists and re-queue the new alloca so that we promote it on the
- // next iteration.
- for (PHINode *PHIUser : PHIUsers)
- SpeculatablePHIs.insert(PHIUser);
- for (SelectInst *SelectUser : SelectUsers)
- SpeculatableSelects.insert(SelectUser);
- Worklist.insert(NewAI);
- }
- } else {
- // Drop any post-promotion work items if promotion didn't happen.
- while (PostPromotionWorklist.size() > PPWOldSize)
- PostPromotionWorklist.pop_back();
-
-    // We couldn't promote and we didn't create a new partition, so nothing
-    // happened.
- if (NewAI == &AI)
- return nullptr;
-
- // If we can't promote the alloca, iterate on it to check for new
- // refinements exposed by splitting the current alloca. Don't iterate on an
- // alloca which didn't actually change and didn't get promoted.
- Worklist.insert(NewAI);
- }
-
- return NewAI;
-}
-
-/// Walks the slices of an alloca and forms partitions based on them,
-/// rewriting each of their uses.
-bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
- if (AS.begin() == AS.end())
- return false;
-
- unsigned NumPartitions = 0;
- bool Changed = false;
- const DataLayout &DL = AI.getModule()->getDataLayout();
-
- // First try to pre-split loads and stores.
- Changed |= presplitLoadsAndStores(AI, AS);
-
- // Now that we have identified any pre-splitting opportunities,
- // mark loads and stores unsplittable except for the following case.
- // We leave a slice splittable if all other slices are disjoint or fully
- // included in the slice, such as whole-alloca loads and stores.
- // If we fail to split these during pre-splitting, we want to force them
- // to be rewritten into a partition.
- bool IsSorted = true;
-
- uint64_t AllocaSize =
- DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
- const uint64_t MaxBitVectorSize = 1024;
- if (AllocaSize <= MaxBitVectorSize) {
- // If a byte boundary is included in any load or store, a slice starting or
- // ending at the boundary is not splittable.
- SmallBitVector SplittableOffset(AllocaSize + 1, true);
- for (Slice &S : AS)
- for (unsigned O = S.beginOffset() + 1;
- O < S.endOffset() && O < AllocaSize; O++)
- SplittableOffset.reset(O);
-
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
-
- if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
- (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
- continue;
-
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
- }
- }
-  } else {
- // We only allow whole-alloca splittable loads and stores
-    // for a large alloca to avoid creating too large a BitVector.
- for (Slice &S : AS) {
- if (!S.isSplittable())
- continue;
-
- if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
- continue;
-
- if (isa<LoadInst>(S.getUse()->getUser()) ||
- isa<StoreInst>(S.getUse()->getUser())) {
- S.makeUnsplittable();
- IsSorted = false;
- }
- }
- }
-
- if (!IsSorted)
- llvm::sort(AS);
-
- /// Describes the allocas introduced by rewritePartition in order to migrate
- /// the debug info.
- struct Fragment {
- AllocaInst *Alloca;
- uint64_t Offset;
- uint64_t Size;
- Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
- : Alloca(AI), Offset(O), Size(S) {}
- };
- SmallVector<Fragment, 4> Fragments;
-
- // Rewrite each partition.
- for (auto &P : AS.partitions()) {
- if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
- Changed = true;
- if (NewAI != &AI) {
- uint64_t SizeOfByte = 8;
- uint64_t AllocaSize =
- DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
- // Don't include any padding.
- uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
- Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
- }
- }
- ++NumPartitions;
- }
-
- NumAllocaPartitions += NumPartitions;
- MaxPartitionsPerAlloca.updateMax(NumPartitions);
-
- // Migrate debug information from the old alloca to the new alloca(s)
- // and the individual partitions.
- TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
+ if (PHIUsers.empty() && SelectUsers.empty()) {
+ // Promote the alloca.
+ PromotableAllocas.push_back(NewAI);
+ } else {
+ // If we have either PHIs or Selects to speculate, add them to those
+      // worklists and re-queue the new alloca so that we promote it on the
+ // next iteration.
+ for (PHINode *PHIUser : PHIUsers)
+ SpeculatablePHIs.insert(PHIUser);
+ for (SelectInst *SelectUser : SelectUsers)
+ SpeculatableSelects.insert(SelectUser);
+ Worklist.insert(NewAI);
+ }
+ } else {
+ // Drop any post-promotion work items if promotion didn't happen.
+ while (PostPromotionWorklist.size() > PPWOldSize)
+ PostPromotionWorklist.pop_back();
+
+    // We couldn't promote and we didn't create a new partition, so nothing
+    // happened.
+ if (NewAI == &AI)
+ return nullptr;
+
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+
+ return NewAI;
+}
+
+/// Walks the slices of an alloca and forms partitions based on them,
+/// rewriting each of their uses.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
+ if (AS.begin() == AS.end())
+ return false;
+
+ unsigned NumPartitions = 0;
+ bool Changed = false;
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // First try to pre-split loads and stores.
+ Changed |= presplitLoadsAndStores(AI, AS);
+
+ // Now that we have identified any pre-splitting opportunities,
+ // mark loads and stores unsplittable except for the following case.
+ // We leave a slice splittable if all other slices are disjoint or fully
+ // included in the slice, such as whole-alloca loads and stores.
+ // If we fail to split these during pre-splitting, we want to force them
+ // to be rewritten into a partition.
+ bool IsSorted = true;
+
+ uint64_t AllocaSize =
+ DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
+ const uint64_t MaxBitVectorSize = 1024;
+ if (AllocaSize <= MaxBitVectorSize) {
+ // If a byte boundary is included in any load or store, a slice starting or
+ // ending at the boundary is not splittable.
+ SmallBitVector SplittableOffset(AllocaSize + 1, true);
+ for (Slice &S : AS)
+ for (unsigned O = S.beginOffset() + 1;
+ O < S.endOffset() && O < AllocaSize; O++)
+ SplittableOffset.reset(O);
+
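+    // Worked example (hypothetical layout): with AllocaSize == 8 and an i32
+    // access covering [0, 4), offsets 1-3 are cleared above, so a splittable
+    // load or store slice beginning or ending at one of those offsets is
+    // forced unsplittable by the loop below.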
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if ((S.beginOffset() > AllocaSize || SplittableOffset[S.beginOffset()]) &&
+ (S.endOffset() > AllocaSize || SplittableOffset[S.endOffset()]))
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+  } else {
+ // We only allow whole-alloca splittable loads and stores
+    // for a large alloca to avoid creating too large a BitVector.
+ for (Slice &S : AS) {
+ if (!S.isSplittable())
+ continue;
+
+ if (S.beginOffset() == 0 && S.endOffset() >= AllocaSize)
+ continue;
+
+ if (isa<LoadInst>(S.getUse()->getUser()) ||
+ isa<StoreInst>(S.getUse()->getUser())) {
+ S.makeUnsplittable();
+ IsSorted = false;
+ }
+ }
+ }
+
+ if (!IsSorted)
+ llvm::sort(AS);
+
+ /// Describes the allocas introduced by rewritePartition in order to migrate
+ /// the debug info.
+ struct Fragment {
+ AllocaInst *Alloca;
+ uint64_t Offset;
+ uint64_t Size;
+ Fragment(AllocaInst *AI, uint64_t O, uint64_t S)
+ : Alloca(AI), Offset(O), Size(S) {}
+ };
+ SmallVector<Fragment, 4> Fragments;
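+  // Illustration (hypothetical split): rewriting a 16-byte alloca into two
+  // 8-byte partitions records fragments (Offset 0, Size 64) and (Offset 64,
+  // Size 64); both fields are in bits, which is what the debug-info loop
+  // below expects.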
+
+ // Rewrite each partition.
+ for (auto &P : AS.partitions()) {
+ if (AllocaInst *NewAI = rewritePartition(AI, AS, P)) {
+ Changed = true;
+ if (NewAI != &AI) {
+ uint64_t SizeOfByte = 8;
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
+ // Don't include any padding.
+ uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
+ Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
+ }
+ }
+ ++NumPartitions;
+ }
+
+ NumAllocaPartitions += NumPartitions;
+ MaxPartitionsPerAlloca.updateMax(NumPartitions);
+
+ // Migrate debug information from the old alloca to the new alloca(s)
+ // and the individual partitions.
+ TinyPtrVector<DbgVariableIntrinsic *> DbgDeclares = FindDbgAddrUses(&AI);
for (DbgVariableIntrinsic *DbgDeclare : DbgDeclares) {
auto *Expr = DbgDeclare->getExpression();
- DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
- uint64_t AllocaSize =
- DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
- for (auto Fragment : Fragments) {
- // Create a fragment expression describing the new partition or reuse AI's
- // expression if there is only one partition.
- auto *FragmentExpr = Expr;
- if (Fragment.Size < AllocaSize || Expr->isFragment()) {
- // If this alloca is already a scalar replacement of a larger aggregate,
- // Fragment.Offset describes the offset inside the scalar.
- auto ExprFragment = Expr->getFragmentInfo();
- uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
- uint64_t Start = Offset + Fragment.Offset;
- uint64_t Size = Fragment.Size;
- if (ExprFragment) {
- uint64_t AbsEnd =
- ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
- if (Start >= AbsEnd)
- // No need to describe a SROAed padding.
- continue;
- Size = std::min(Size, AbsEnd - Start);
- }
- // The new, smaller fragment is stenciled out from the old fragment.
- if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
- assert(Start >= OrigFragment->OffsetInBits &&
- "new fragment is outside of original fragment");
- Start -= OrigFragment->OffsetInBits;
- }
-
- // The alloca may be larger than the variable.
+ DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
+ for (auto Fragment : Fragments) {
+ // Create a fragment expression describing the new partition or reuse AI's
+ // expression if there is only one partition.
+ auto *FragmentExpr = Expr;
+ if (Fragment.Size < AllocaSize || Expr->isFragment()) {
+ // If this alloca is already a scalar replacement of a larger aggregate,
+ // Fragment.Offset describes the offset inside the scalar.
+ auto ExprFragment = Expr->getFragmentInfo();
+ uint64_t Offset = ExprFragment ? ExprFragment->OffsetInBits : 0;
+ uint64_t Start = Offset + Fragment.Offset;
+ uint64_t Size = Fragment.Size;
+ if (ExprFragment) {
+ uint64_t AbsEnd =
+ ExprFragment->OffsetInBits + ExprFragment->SizeInBits;
+ if (Start >= AbsEnd)
+ // No need to describe a SROAed padding.
+ continue;
+ Size = std::min(Size, AbsEnd - Start);
+ }
+ // The new, smaller fragment is stenciled out from the old fragment.
+ if (auto OrigFragment = FragmentExpr->getFragmentInfo()) {
+ assert(Start >= OrigFragment->OffsetInBits &&
+ "new fragment is outside of original fragment");
+ Start -= OrigFragment->OffsetInBits;
+ }
+
+ // The alloca may be larger than the variable.
auto VarSize = DbgDeclare->getVariable()->getSizeInBits();
- if (VarSize) {
- if (Size > *VarSize)
- Size = *VarSize;
- if (Size == 0 || Start + Size > *VarSize)
- continue;
- }
-
- // Avoid creating a fragment expression that covers the entire variable.
- if (!VarSize || *VarSize != Size) {
- if (auto E =
- DIExpression::createFragmentExpression(Expr, Start, Size))
- FragmentExpr = *E;
- else
- continue;
- }
- }
-
+ if (VarSize) {
+ if (Size > *VarSize)
+ Size = *VarSize;
+ if (Size == 0 || Start + Size > *VarSize)
+ continue;
+ }
+
+ // Avoid creating a fragment expression that covers the entire variable.
+ if (!VarSize || *VarSize != Size) {
+ if (auto E =
+ DIExpression::createFragmentExpression(Expr, Start, Size))
+ FragmentExpr = *E;
+ else
+ continue;
+ }
+ }
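+      // Worked example (hypothetical values): for a 128-bit variable, a
+      // non-fragment Expr and Fragment == (Offset 64, Size 64), the block
+      // above computes Start == 64 and Size == 64 and builds a fragment
+      // expression covering bits [64, 128) of the variable.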
+
// Remove any existing intrinsics on the new alloca describing
// the variable fragment.
for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(Fragment.Alloca)) {
@@ -4566,262 +4566,262 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
if (SameVariableFragment(OldDII, DbgDeclare))
OldDII->eraseFromParent();
}
-
+
DIB.insertDeclare(Fragment.Alloca, DbgDeclare->getVariable(), FragmentExpr,
DbgDeclare->getDebugLoc(), &AI);
- }
- }
- return Changed;
-}
-
-/// Clobber a use with undef, deleting the used value if it becomes dead.
-void SROA::clobberUse(Use &U) {
- Value *OldV = U;
- // Replace the use with an undef value.
- U = UndefValue::get(OldV->getType());
-
- // Check for this making an instruction dead. We have to garbage collect
- // all the dead instructions to ensure the uses of any alloca end up being
- // minimal.
- if (Instruction *OldI = dyn_cast<Instruction>(OldV))
- if (isInstructionTriviallyDead(OldI)) {
+ }
+ }
+ return Changed;
+}
+
+/// Clobber a use with undef, deleting the used value if it becomes dead.
+void SROA::clobberUse(Use &U) {
+ Value *OldV = U;
+ // Replace the use with an undef value.
+ U = UndefValue::get(OldV->getType());
+
+ // Check for this making an instruction dead. We have to garbage collect
+ // all the dead instructions to ensure the uses of any alloca end up being
+ // minimal.
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
DeadInsts.push_back(OldI);
- }
-}
-
-/// Analyze an alloca for SROA.
-///
-/// This analyzes the alloca to ensure we can reason about it, builds
-/// the slices of the alloca, and then hands it off to be split and
-/// rewritten as needed.
-bool SROA::runOnAlloca(AllocaInst &AI) {
- LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
- ++NumAllocasAnalyzed;
-
- // Special case dead allocas, as they're trivial.
- if (AI.use_empty()) {
- AI.eraseFromParent();
- return true;
- }
- const DataLayout &DL = AI.getModule()->getDataLayout();
-
- // Skip alloca forms that this analysis can't handle.
- auto *AT = AI.getAllocatedType();
- if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
- DL.getTypeAllocSize(AT).getFixedSize() == 0)
- return false;
-
- bool Changed = false;
-
- // First, split any FCA loads and stores touching this alloca to promote
- // better splitting and promotion opportunities.
- AggLoadStoreRewriter AggRewriter(DL);
- Changed |= AggRewriter.rewrite(AI);
-
- // Build the slices using a recursive instruction-visiting builder.
- AllocaSlices AS(DL, AI);
- LLVM_DEBUG(AS.print(dbgs()));
- if (AS.isEscaped())
- return Changed;
-
- // Delete all the dead users of this alloca before splitting and rewriting it.
- for (Instruction *DeadUser : AS.getDeadUsers()) {
- // Free up everything used by this instruction.
- for (Use &DeadOp : DeadUser->operands())
- clobberUse(DeadOp);
-
- // Now replace the uses of this instruction.
- DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
-
- // And mark it for deletion.
+ }
+}
+
+/// Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// the slices of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ LLVM_DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+
+ // Skip alloca forms that this analysis can't handle.
+ auto *AT = AI.getAllocatedType();
+ if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
+ DL.getTypeAllocSize(AT).getFixedSize() == 0)
+ return false;
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to promote
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter(DL);
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the slices using a recursive instruction-visiting builder.
+ AllocaSlices AS(DL, AI);
+ LLVM_DEBUG(AS.print(dbgs()));
+ if (AS.isEscaped())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (Instruction *DeadUser : AS.getDeadUsers()) {
+ // Free up everything used by this instruction.
+ for (Use &DeadOp : DeadUser->operands())
+ clobberUse(DeadOp);
+
+ // Now replace the uses of this instruction.
+ DeadUser->replaceAllUsesWith(UndefValue::get(DeadUser->getType()));
+
+ // And mark it for deletion.
DeadInsts.push_back(DeadUser);
- Changed = true;
- }
- for (Use *DeadOp : AS.getDeadOperands()) {
- clobberUse(*DeadOp);
- Changed = true;
- }
-
- // No slices to split. Leave the dead alloca for a later pass to clean up.
- if (AS.begin() == AS.end())
- return Changed;
-
- Changed |= splitAlloca(AI, AS);
-
- LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
- while (!SpeculatablePHIs.empty())
- speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
-
- LLVM_DEBUG(dbgs() << " Speculating Selects\n");
- while (!SpeculatableSelects.empty())
- speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
-
- return Changed;
-}
-
-/// Delete the dead instructions accumulated in this run.
-///
-/// Recursively deletes the dead instructions we've accumulated. This is done
-/// at the very end to maximize locality of the recursive delete and to
-/// minimize the problems of invalidated instruction pointers as such pointers
-/// are used heavily in the intermediate stages of the algorithm.
-///
-/// We also record the alloca instructions deleted here so that they aren't
-/// subsequently handed to mem2reg to promote.
-bool SROA::deleteDeadInstructions(
- SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
- bool Changed = false;
- while (!DeadInsts.empty()) {
+ Changed = true;
+ }
+ for (Use *DeadOp : AS.getDeadOperands()) {
+ clobberUse(*DeadOp);
+ Changed = true;
+ }
+
+ // No slices to split. Leave the dead alloca for a later pass to clean up.
+ if (AS.begin() == AS.end())
+ return Changed;
+
+ Changed |= splitAlloca(AI, AS);
+
+ LLVM_DEBUG(dbgs() << " Speculating PHIs\n");
+ while (!SpeculatablePHIs.empty())
+ speculatePHINodeLoads(*SpeculatablePHIs.pop_back_val());
+
+ LLVM_DEBUG(dbgs() << " Speculating Selects\n");
+ while (!SpeculatableSelects.empty())
+ speculateSelectInstLoads(*SpeculatableSelects.pop_back_val());
+
+ return Changed;
+}
+
+/// Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+bool SROA::deleteDeadInstructions(
+ SmallPtrSetImpl<AllocaInst *> &DeletedAllocas) {
+ bool Changed = false;
+ while (!DeadInsts.empty()) {
Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
if (!I) continue;
- LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
-
- // If the instruction is an alloca, find the possible dbg.declare connected
- // to it, and remove it too. We must do this before calling RAUW or we will
- // not be able to find it.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- DeletedAllocas.insert(AI);
- for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
- OldDII->eraseFromParent();
- }
-
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
-
- for (Use &Operand : I->operands())
- if (Instruction *U = dyn_cast<Instruction>(Operand)) {
- // Zero out the operand and see if it becomes trivially dead.
- Operand = nullptr;
- if (isInstructionTriviallyDead(U))
+ LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ // If the instruction is an alloca, find the possible dbg.declare connected
+ // to it, and remove it too. We must do this before calling RAUW or we will
+ // not be able to find it.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ DeletedAllocas.insert(AI);
+ for (DbgVariableIntrinsic *OldDII : FindDbgAddrUses(AI))
+ OldDII->eraseFromParent();
+ }
+
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+ for (Use &Operand : I->operands())
+ if (Instruction *U = dyn_cast<Instruction>(Operand)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ Operand = nullptr;
+ if (isInstructionTriviallyDead(U))
DeadInsts.push_back(U);
- }
-
- ++NumDeleted;
- I->eraseFromParent();
- Changed = true;
- }
- return Changed;
-}
-
-/// Promote the allocas, using the best available technique.
-///
-/// This attempts to promote whatever allocas have been identified as viable in
-/// the PromotableAllocas list. If that list is empty, there is nothing to do.
-/// This function returns whether any promotion occurred.
-bool SROA::promoteAllocas(Function &F) {
- if (PromotableAllocas.empty())
- return false;
-
- NumPromoted += PromotableAllocas.size();
-
- LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
- PromoteMemToReg(PromotableAllocas, *DT, AC);
- PromotableAllocas.clear();
- return true;
-}
-
-PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
- AssumptionCache &RunAC) {
- LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
- C = &F.getContext();
- DT = &RunDT;
- AC = &RunAC;
-
- BasicBlock &EntryBB = F.getEntryBlock();
- for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
- I != E; ++I) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
- if (isa<ScalableVectorType>(AI->getAllocatedType())) {
- if (isAllocaPromotable(AI))
- PromotableAllocas.push_back(AI);
- } else {
- Worklist.insert(AI);
- }
- }
- }
-
- bool Changed = false;
- // A set of deleted alloca instruction pointers which should be removed from
- // the list of promotable allocas.
- SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
-
- do {
- while (!Worklist.empty()) {
- Changed |= runOnAlloca(*Worklist.pop_back_val());
- Changed |= deleteDeadInstructions(DeletedAllocas);
-
- // Remove the deleted allocas from various lists so that we don't try to
- // continue processing them.
- if (!DeletedAllocas.empty()) {
- auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
- Worklist.remove_if(IsInSet);
- PostPromotionWorklist.remove_if(IsInSet);
+ }
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+}
+
+/// Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// This function returns whether any promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ LLVM_DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT, AC);
+ PromotableAllocas.clear();
+ return true;
+}
+
+PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
+ AssumptionCache &RunAC) {
+ LLVM_DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ DT = &RunDT;
+ AC = &RunAC;
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
+ I != E; ++I) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ if (isa<ScalableVectorType>(AI->getAllocatedType())) {
+ if (isAllocaPromotable(AI))
+ PromotableAllocas.push_back(AI);
+ } else {
+ Worklist.insert(AI);
+ }
+ }
+ }
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ do {
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ Changed |= deleteDeadInstructions(DeletedAllocas);
+
+ // Remove the deleted allocas from various lists so that we don't try to
+ // continue processing them.
+ if (!DeletedAllocas.empty()) {
+ auto IsInSet = [&](AllocaInst *AI) { return DeletedAllocas.count(AI); };
+ Worklist.remove_if(IsInSet);
+ PostPromotionWorklist.remove_if(IsInSet);
llvm::erase_if(PromotableAllocas, IsInSet);
- DeletedAllocas.clear();
- }
- }
-
- Changed |= promoteAllocas(F);
-
- Worklist = PostPromotionWorklist;
- PostPromotionWorklist.clear();
- } while (!Worklist.empty());
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
- return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
- AM.getResult<AssumptionAnalysis>(F));
-}
-
-/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
-///
-/// This is in the llvm namespace purely to allow it to be a friend of the \c
-/// SROA pass.
-class llvm::sroa::SROALegacyPass : public FunctionPass {
- /// The SROA implementation.
- SROA Impl;
-
-public:
- static char ID;
-
- SROALegacyPass() : FunctionPass(ID) {
- initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto PA = Impl.runImpl(
- F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
- return !PA.areAllPreserved();
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-
- StringRef getPassName() const override { return "SROA"; }
-};
-
-char SROALegacyPass::ID = 0;
-
-FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
-
-INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
- "Scalar Replacement Of Aggregates", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
- false, false)
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ Worklist = PostPromotionWorklist;
+ PostPromotionWorklist.clear();
+ } while (!Worklist.empty());
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+PreservedAnalyses SROA::run(Function &F, FunctionAnalysisManager &AM) {
+ return runImpl(F, AM.getResult<DominatorTreeAnalysis>(F),
+ AM.getResult<AssumptionAnalysis>(F));
+}
+
+/// A legacy pass for the legacy pass manager that wraps the \c SROA pass.
+///
+/// This is in the llvm namespace purely to allow it to be a friend of the \c
+/// SROA pass.
+class llvm::sroa::SROALegacyPass : public FunctionPass {
+ /// The SROA implementation.
+ SROA Impl;
+
+public:
+ static char ID;
+
+ SROALegacyPass() : FunctionPass(ID) {
+ initializeSROALegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto PA = Impl.runImpl(
+ F, getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F));
+ return !PA.areAllPreserved();
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override { return "SROA"; }
+};
+
+char SROALegacyPass::ID = 0;
+
+FunctionPass *llvm::createSROAPass() { return new SROALegacyPass(); }
+
+INITIALIZE_PASS_BEGIN(SROALegacyPass, "sroa",
+ "Scalar Replacement Of Aggregates", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(SROALegacyPass, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
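For reference, a minimal sketch of how the SROA pass restored above can be driven from client code under LLVM 12's new pass manager. This is illustrative only and not part of the commit; runSROAOn is a hypothetical helper, and the equivalent command-line pipeline is opt -passes=sroa.

  #include "llvm/IR/Function.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"
  #include "llvm/Transforms/Scalar/SROA.h"
  using namespace llvm;

  // Hypothetical helper: run SROA on a single function and report whether
  // it changed anything (mirrors what SROALegacyPass::runOnFunction does).
  static bool runSROAOn(Function &F) {
    FunctionAnalysisManager FAM;
    PassBuilder PB;
    // Registers the standard function analyses, including
    // DominatorTreeAnalysis and AssumptionAnalysis that SROA::run
    // fetches before delegating to runImpl.
    PB.registerFunctionAnalyses(FAM);
    FunctionPassManager FPM;
    FPM.addPass(SROA());
    PreservedAnalyses PA = FPM.run(F, FAM);
    return !PA.areAllPreserved();
  }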
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
index 1a19157cdb..dba3dba24e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalar.cpp
@@ -1,307 +1,307 @@
-//===-- Scalar.cpp --------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMScalarOpts.a, which
-// implements several scalar transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Scalar.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/Analysis/ScopedNoAliasAA.h"
-#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Scalar/GVN.h"
-#include "llvm/Transforms/Scalar/Scalarizer.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-
-using namespace llvm;
-
-/// initializeScalarOpts - Initialize all passes linked into the
-/// ScalarOpts library.
-void llvm::initializeScalarOpts(PassRegistry &Registry) {
- initializeADCELegacyPassPass(Registry);
+//===-- Scalar.cpp --------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMScalarOpts.a, which
+// implements several scalar transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Scalar/GVN.h"
+#include "llvm/Transforms/Scalar/Scalarizer.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+
+using namespace llvm;
+
+/// initializeScalarOpts - Initialize all passes linked into the
+/// ScalarOpts library.
+void llvm::initializeScalarOpts(PassRegistry &Registry) {
+ initializeADCELegacyPassPass(Registry);
initializeAnnotationRemarksLegacyPass(Registry);
- initializeBDCELegacyPassPass(Registry);
- initializeAlignmentFromAssumptionsPass(Registry);
- initializeCallSiteSplittingLegacyPassPass(Registry);
- initializeConstantHoistingLegacyPassPass(Registry);
+ initializeBDCELegacyPassPass(Registry);
+ initializeAlignmentFromAssumptionsPass(Registry);
+ initializeCallSiteSplittingLegacyPassPass(Registry);
+ initializeConstantHoistingLegacyPassPass(Registry);
initializeConstraintEliminationPass(Registry);
- initializeCorrelatedValuePropagationPass(Registry);
- initializeDCELegacyPassPass(Registry);
- initializeDivRemPairsLegacyPassPass(Registry);
- initializeScalarizerLegacyPassPass(Registry);
- initializeDSELegacyPassPass(Registry);
- initializeGuardWideningLegacyPassPass(Registry);
- initializeLoopGuardWideningLegacyPassPass(Registry);
- initializeGVNLegacyPassPass(Registry);
- initializeNewGVNLegacyPassPass(Registry);
- initializeEarlyCSELegacyPassPass(Registry);
- initializeEarlyCSEMemSSALegacyPassPass(Registry);
- initializeMakeGuardsExplicitLegacyPassPass(Registry);
- initializeGVNHoistLegacyPassPass(Registry);
- initializeGVNSinkLegacyPassPass(Registry);
- initializeFlattenCFGPassPass(Registry);
- initializeIRCELegacyPassPass(Registry);
- initializeIndVarSimplifyLegacyPassPass(Registry);
- initializeInferAddressSpacesPass(Registry);
- initializeInstSimplifyLegacyPassPass(Registry);
- initializeJumpThreadingPass(Registry);
- initializeLegacyLICMPassPass(Registry);
- initializeLegacyLoopSinkPassPass(Registry);
- initializeLoopFuseLegacyPass(Registry);
- initializeLoopDataPrefetchLegacyPassPass(Registry);
- initializeLoopDeletionLegacyPassPass(Registry);
- initializeLoopAccessLegacyAnalysisPass(Registry);
- initializeLoopInstSimplifyLegacyPassPass(Registry);
+ initializeCorrelatedValuePropagationPass(Registry);
+ initializeDCELegacyPassPass(Registry);
+ initializeDivRemPairsLegacyPassPass(Registry);
+ initializeScalarizerLegacyPassPass(Registry);
+ initializeDSELegacyPassPass(Registry);
+ initializeGuardWideningLegacyPassPass(Registry);
+ initializeLoopGuardWideningLegacyPassPass(Registry);
+ initializeGVNLegacyPassPass(Registry);
+ initializeNewGVNLegacyPassPass(Registry);
+ initializeEarlyCSELegacyPassPass(Registry);
+ initializeEarlyCSEMemSSALegacyPassPass(Registry);
+ initializeMakeGuardsExplicitLegacyPassPass(Registry);
+ initializeGVNHoistLegacyPassPass(Registry);
+ initializeGVNSinkLegacyPassPass(Registry);
+ initializeFlattenCFGPassPass(Registry);
+ initializeIRCELegacyPassPass(Registry);
+ initializeIndVarSimplifyLegacyPassPass(Registry);
+ initializeInferAddressSpacesPass(Registry);
+ initializeInstSimplifyLegacyPassPass(Registry);
+ initializeJumpThreadingPass(Registry);
+ initializeLegacyLICMPassPass(Registry);
+ initializeLegacyLoopSinkPassPass(Registry);
+ initializeLoopFuseLegacyPass(Registry);
+ initializeLoopDataPrefetchLegacyPassPass(Registry);
+ initializeLoopDeletionLegacyPassPass(Registry);
+ initializeLoopAccessLegacyAnalysisPass(Registry);
+ initializeLoopInstSimplifyLegacyPassPass(Registry);
initializeLoopInterchangeLegacyPassPass(Registry);
initializeLoopFlattenLegacyPassPass(Registry);
- initializeLoopPredicationLegacyPassPass(Registry);
- initializeLoopRotateLegacyPassPass(Registry);
- initializeLoopStrengthReducePass(Registry);
+ initializeLoopPredicationLegacyPassPass(Registry);
+ initializeLoopRotateLegacyPassPass(Registry);
+ initializeLoopStrengthReducePass(Registry);
initializeLoopRerollLegacyPassPass(Registry);
- initializeLoopUnrollPass(Registry);
- initializeLoopUnrollAndJamPass(Registry);
- initializeLoopUnswitchPass(Registry);
- initializeWarnMissedTransformationsLegacyPass(Registry);
+ initializeLoopUnrollPass(Registry);
+ initializeLoopUnrollAndJamPass(Registry);
+ initializeLoopUnswitchPass(Registry);
+ initializeWarnMissedTransformationsLegacyPass(Registry);
initializeLoopVersioningLICMLegacyPassPass(Registry);
- initializeLoopIdiomRecognizeLegacyPassPass(Registry);
- initializeLowerAtomicLegacyPassPass(Registry);
- initializeLowerConstantIntrinsicsPass(Registry);
- initializeLowerExpectIntrinsicPass(Registry);
- initializeLowerGuardIntrinsicLegacyPassPass(Registry);
- initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
+ initializeLoopIdiomRecognizeLegacyPassPass(Registry);
+ initializeLowerAtomicLegacyPassPass(Registry);
+ initializeLowerConstantIntrinsicsPass(Registry);
+ initializeLowerExpectIntrinsicPass(Registry);
+ initializeLowerGuardIntrinsicLegacyPassPass(Registry);
+ initializeLowerMatrixIntrinsicsLegacyPassPass(Registry);
initializeLowerMatrixIntrinsicsMinimalLegacyPassPass(Registry);
- initializeLowerWidenableConditionLegacyPassPass(Registry);
- initializeMemCpyOptLegacyPassPass(Registry);
- initializeMergeICmpsLegacyPassPass(Registry);
- initializeMergedLoadStoreMotionLegacyPassPass(Registry);
- initializeNaryReassociateLegacyPassPass(Registry);
- initializePartiallyInlineLibCallsLegacyPassPass(Registry);
- initializeReassociateLegacyPassPass(Registry);
- initializeRedundantDbgInstEliminationPass(Registry);
+ initializeLowerWidenableConditionLegacyPassPass(Registry);
+ initializeMemCpyOptLegacyPassPass(Registry);
+ initializeMergeICmpsLegacyPassPass(Registry);
+ initializeMergedLoadStoreMotionLegacyPassPass(Registry);
+ initializeNaryReassociateLegacyPassPass(Registry);
+ initializePartiallyInlineLibCallsLegacyPassPass(Registry);
+ initializeReassociateLegacyPassPass(Registry);
+ initializeRedundantDbgInstEliminationPass(Registry);
initializeRegToMemLegacyPass(Registry);
- initializeRewriteStatepointsForGCLegacyPassPass(Registry);
+ initializeRewriteStatepointsForGCLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry);
- initializeSCCPLegacyPassPass(Registry);
- initializeSROALegacyPassPass(Registry);
- initializeCFGSimplifyPassPass(Registry);
+ initializeSCCPLegacyPassPass(Registry);
+ initializeSROALegacyPassPass(Registry);
+ initializeCFGSimplifyPassPass(Registry);
initializeStructurizeCFGLegacyPassPass(Registry);
- initializeSimpleLoopUnswitchLegacyPassPass(Registry);
- initializeSinkingLegacyPassPass(Registry);
- initializeTailCallElimPass(Registry);
+ initializeSimpleLoopUnswitchLegacyPassPass(Registry);
+ initializeSinkingLegacyPassPass(Registry);
+ initializeTailCallElimPass(Registry);
initializeSeparateConstOffsetFromGEPLegacyPassPass(Registry);
- initializeSpeculativeExecutionLegacyPassPass(Registry);
+ initializeSpeculativeExecutionLegacyPassPass(Registry);
initializeStraightLineStrengthReduceLegacyPassPass(Registry);
- initializePlaceBackedgeSafepointsImplPass(Registry);
- initializePlaceSafepointsPass(Registry);
- initializeFloat2IntLegacyPassPass(Registry);
- initializeLoopDistributeLegacyPass(Registry);
- initializeLoopLoadEliminationPass(Registry);
- initializeLoopSimplifyCFGLegacyPassPass(Registry);
+ initializePlaceBackedgeSafepointsImplPass(Registry);
+ initializePlaceSafepointsPass(Registry);
+ initializeFloat2IntLegacyPassPass(Registry);
+ initializeLoopDistributeLegacyPass(Registry);
+ initializeLoopLoadEliminationPass(Registry);
+ initializeLoopSimplifyCFGLegacyPassPass(Registry);
initializeLoopVersioningLegacyPassPass(Registry);
- initializeEntryExitInstrumenterPass(Registry);
- initializePostInlineEntryExitInstrumenterPass(Registry);
-}
-
-void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSimplifyCFGPass());
-}
-
-void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
- initializeScalarOpts(*unwrap(R));
-}
-
-void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAggressiveDCEPass());
-}
-
-void LLVMAddDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadCodeEliminationPass());
-}
-
-void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBitTrackingDCEPass());
-}
-
-void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAlignmentFromAssumptionsPass());
-}
-
-void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
+ initializeEntryExitInstrumenterPass(Registry);
+ initializePostInlineEntryExitInstrumenterPass(Registry);
+}
+
+void LLVMAddLoopSimplifyCFGPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSimplifyCFGPass());
+}
+
+void LLVMInitializeScalarOpts(LLVMPassRegistryRef R) {
+ initializeScalarOpts(*unwrap(R));
+}
+
+void LLVMAddAggressiveDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAggressiveDCEPass());
+}
+
+void LLVMAddDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadCodeEliminationPass());
+}
+
+void LLVMAddBitTrackingDCEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBitTrackingDCEPass());
+}
+
+void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAlignmentFromAssumptionsPass());
+}
+
+void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCFGSimplificationPass());
-}
-
-void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDeadStoreEliminationPass());
-}
-
-void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScalarizerPass());
-}
-
-void LLVMAddGVNPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGVNPass());
-}
-
-void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createNewGVNPass());
-}
-
-void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMergedLoadStoreMotionPass());
-}
-
-void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createIndVarSimplifyPass());
-}
-
+}
+
+void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDeadStoreEliminationPass());
+}
+
+void LLVMAddScalarizerPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScalarizerPass());
+}
+
+void LLVMAddGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNPass());
+}
+
+void LLVMAddNewGVNPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createNewGVNPass());
+}
+
+void LLVMAddMergedLoadStoreMotionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMergedLoadStoreMotionPass());
+}
+
+void LLVMAddIndVarSimplifyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createIndVarSimplifyPass());
+}
+
void LLVMAddInstructionSimplifyPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createInstSimplifyLegacyPass());
}
-void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createJumpThreadingPass());
-}
-
-void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopSinkPass());
-}
-
-void LLVMAddLICMPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLICMPass());
-}
-
-void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopDeletionPass());
-}
-
+void LLVMAddJumpThreadingPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createJumpThreadingPass());
+}
+
+void LLVMAddLoopSinkPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopSinkPass());
+}
+
+void LLVMAddLICMPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLICMPass());
+}
+
+void LLVMAddLoopDeletionPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopDeletionPass());
+}
+
void LLVMAddLoopFlattenPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createLoopFlattenPass());
}
-void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopIdiomPass());
-}
-
-void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopRotatePass());
-}
-
-void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopRerollPass());
-}
-
-void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnrollPass());
-}
-
-void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnrollAndJamPass());
-}
-
-void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopUnswitchPass());
-}
-
-void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerAtomicPass());
-}
-
-void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createMemCpyOptPass());
-}
-
-void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPartiallyInlineLibCallsPass());
-}
-
-void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createReassociatePass());
-}
-
-void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSCCPPass());
-}
-
-void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
- int Threshold) {
- unwrap(PM)->add(createSROAPass());
-}
-
-void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
- // NOTE: The simplify-libcalls pass has been removed.
-}
-
-void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createTailCallEliminationPass());
-}
-
-void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createDemoteRegisterToMemoryPass());
-}
-
-void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createVerifierPass());
-}
-
-void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createCorrelatedValuePropagationPass());
-}
-
-void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
-}
-
-void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
-}
-
-void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createGVNHoistPass());
-}
-
-void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createTypeBasedAAWrapperPass());
-}
-
-void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
-}
-
-void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createBasicAAWrapperPass());
-}
-
-void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerConstantIntrinsicsPass());
-}
-
-void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerExpectIntrinsicPass());
-}
-
-void LLVMAddUnifyFunctionExitNodesPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createUnifyFunctionExitNodesPass());
-}
+void LLVMAddLoopIdiomPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopIdiomPass());
+}
+
+void LLVMAddLoopRotatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRotatePass());
+}
+
+void LLVMAddLoopRerollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopRerollPass());
+}
+
+void LLVMAddLoopUnrollPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollPass());
+}
+
+void LLVMAddLoopUnrollAndJamPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnrollAndJamPass());
+}
+
+void LLVMAddLoopUnswitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopUnswitchPass());
+}
+
+void LLVMAddLowerAtomicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerAtomicPass());
+}
+
+void LLVMAddMemCpyOptPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createMemCpyOptPass());
+}
+
+void LLVMAddPartiallyInlineLibCallsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPartiallyInlineLibCallsPass());
+}
+
+void LLVMAddReassociatePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createReassociatePass());
+}
+
+void LLVMAddSCCPPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSCCPPass());
+}
+
+void LLVMAddScalarReplAggregatesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassSSA(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddScalarReplAggregatesPassWithThreshold(LLVMPassManagerRef PM,
+ int Threshold) {
+ unwrap(PM)->add(createSROAPass());
+}
+
+void LLVMAddSimplifyLibCallsPass(LLVMPassManagerRef PM) {
+ // NOTE: The simplify-libcalls pass has been removed.
+}
+
+void LLVMAddTailCallEliminationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTailCallEliminationPass());
+}
+
+void LLVMAddDemoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createDemoteRegisterToMemoryPass());
+}
+
+void LLVMAddVerifierPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createVerifierPass());
+}
+
+void LLVMAddCorrelatedValuePropagationPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createCorrelatedValuePropagationPass());
+}
+
+void LLVMAddEarlyCSEPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(false/*=UseMemorySSA*/));
+}
+
+void LLVMAddEarlyCSEMemSSAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createEarlyCSEPass(true/*=UseMemorySSA*/));
+}
+
+void LLVMAddGVNHoistLegacyPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createGVNHoistPass());
+}
+
+void LLVMAddTypeBasedAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createTypeBasedAAWrapperPass());
+}
+
+void LLVMAddScopedNoAliasAAPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createScopedNoAliasAAWrapperPass());
+}
+
+void LLVMAddBasicAliasAnalysisPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createBasicAAWrapperPass());
+}
+
+void LLVMAddLowerConstantIntrinsicsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerConstantIntrinsicsPass());
+}
+
+void LLVMAddLowerExpectIntrinsicPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerExpectIntrinsicPass());
+}
+
+void LLVMAddUnifyFunctionExitNodesPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createUnifyFunctionExitNodesPass());
+}
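For reference, a minimal sketch of how the C bindings restored above are consumed from client code. This is illustrative only and not part of the commit; runScalarPasses is a hypothetical helper, and M and Fn stand for an existing LLVMModuleRef and a function within it.

  #include "llvm-c/Core.h"
  #include "llvm-c/Transforms/Scalar.h"

  static void runScalarPasses(LLVMModuleRef M, LLVMValueRef Fn) {
    // Function-level legacy pass manager tied to the module that owns Fn.
    LLVMPassManagerRef FPM = LLVMCreateFunctionPassManagerForModule(M);
    LLVMAddScalarReplAggregatesPass(FPM); // wraps createSROAPass()
    LLVMAddEarlyCSEPass(FPM);             // wraps createEarlyCSEPass(false)
    LLVMAddCFGSimplificationPass(FPM);
    LLVMInitializeFunctionPassManager(FPM);
    LLVMRunFunctionPassManager(FPM, Fn);
    LLVMFinalizeFunctionPassManager(FPM);
    LLVMDisposePassManager(FPM);
  }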
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
index 45af72520f..c95984fe19 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Scalarizer.cpp
@@ -1,974 +1,974 @@
-//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass converts vector operations into scalar operations, in order
-// to expose optimization opportunities on the individual scalar operations.
-// It is mainly intended for targets that do not have vector units, but it
-// may also be useful for revectorizing code to different vector widths.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Scalarizer.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "scalarizer"
-
-static cl::opt<bool> ScalarizeVariableInsertExtract(
- "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
- cl::desc("Allow the scalarizer pass to scalarize "
- "insertelement/extractelement with variable index"));
-
-// This is disabled by default because having separate loads and stores
-// makes it more likely that the -combiner-alias-analysis limits will be
-// reached.
-static cl::opt<bool>
- ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden,
-                       cl::desc("Allow the scalarizer pass to scalarize loads and stores"));
-
-namespace {
-
-// Used to store the scattered form of a vector.
-using ValueVector = SmallVector<Value *, 8>;
-
-// Used to map a vector Value to its scattered form. We use std::map
-// because we want iterators to persist across insertion and because the
-// values are relatively large.
-using ScatterMap = std::map<Value *, ValueVector>;
-
-// Lists Instructions that have been replaced with scalar implementations,
-// along with a pointer to their scattered forms.
-using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
-
-// Provides a very limited vector-like interface for lazily accessing one
-// component of a scattered vector or vector pointer.
-class Scatterer {
-public:
- Scatterer() = default;
-
- // Scatter V into Size components. If new instructions are needed,
- // insert them before BBI in BB. If Cache is nonnull, use it to cache
- // the results.
- Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
- ValueVector *cachePtr = nullptr);
-
- // Return component I, creating a new Value for it if necessary.
- Value *operator[](unsigned I);
-
- // Return the number of components.
- unsigned size() const { return Size; }
-
-private:
- BasicBlock *BB;
- BasicBlock::iterator BBI;
- Value *V;
- ValueVector *CachePtr;
- PointerType *PtrTy;
- ValueVector Tmp;
- unsigned Size;
-};
-
-// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
-// called Name that compares X and Y in the same way as FCI.
-struct FCmpSplitter {
- FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
- }
-
- FCmpInst &FCI;
-};
-
-// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
-// called Name that compares X and Y in the same way as ICI.
-struct ICmpSplitter {
- ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
- }
-
- ICmpInst &ICI;
-};
-
-// UnarySplitter(UO)(Builder, X, Name) uses Builder to create
-// a unary operator like UO called Name with operand X.
-struct UnarySplitter {
- UnarySplitter(UnaryOperator &uo) : UO(uo) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op, const Twine &Name) const {
- return Builder.CreateUnOp(UO.getOpcode(), Op, Name);
- }
-
- UnaryOperator &UO;
-};
-
-// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
-// a binary operator like BO called Name with operands X and Y.
-struct BinarySplitter {
- BinarySplitter(BinaryOperator &bo) : BO(bo) {}
-
- Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
- const Twine &Name) const {
- return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
- }
-
- BinaryOperator &BO;
-};
-
-// Information about a load or store that we're scalarizing.
-struct VectorLayout {
- VectorLayout() = default;
-
- // Return the alignment of element I.
- Align getElemAlign(unsigned I) {
- return commonAlignment(VecAlign, I * ElemSize);
- }
-
- // The type of the vector.
- VectorType *VecTy = nullptr;
-
- // The type of each element.
- Type *ElemTy = nullptr;
-
- // The alignment of the vector.
- Align VecAlign;
-
- // The size of each element.
- uint64_t ElemSize = 0;
-};
-
-class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
-public:
- ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT)
- : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) {
- }
-
- bool visit(Function &F);
-
- // InstVisitor methods. They return true if the instruction was scalarized,
- // false if nothing changed.
- bool visitInstruction(Instruction &I) { return false; }
- bool visitSelectInst(SelectInst &SI);
- bool visitICmpInst(ICmpInst &ICI);
- bool visitFCmpInst(FCmpInst &FCI);
- bool visitUnaryOperator(UnaryOperator &UO);
- bool visitBinaryOperator(BinaryOperator &BO);
- bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
- bool visitCastInst(CastInst &CI);
- bool visitBitCastInst(BitCastInst &BCI);
- bool visitInsertElementInst(InsertElementInst &IEI);
- bool visitExtractElementInst(ExtractElementInst &EEI);
- bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
- bool visitPHINode(PHINode &PHI);
- bool visitLoadInst(LoadInst &LI);
- bool visitStoreInst(StoreInst &SI);
- bool visitCallInst(CallInst &ICI);
-
-private:
- Scatterer scatter(Instruction *Point, Value *V);
- void gather(Instruction *Op, const ValueVector &CV);
- bool canTransferMetadata(unsigned Kind);
- void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
- Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
- const DataLayout &DL);
- bool finish();
-
- template<typename T> bool splitUnary(Instruction &, const T &);
- template<typename T> bool splitBinary(Instruction &, const T &);
-
- bool splitCall(CallInst &CI);
-
- ScatterMap Scattered;
- GatherList Gathered;
-
- SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
-
- unsigned ParallelLoopAccessMDKind;
-
- DominatorTree *DT;
-};
-
-class ScalarizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- ScalarizerLegacyPass() : FunctionPass(ID) {
- initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage& AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-char ScalarizerLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
- "Scalarize vector operations", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
- "Scalarize vector operations", false, false)
-
-Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
- ValueVector *cachePtr)
- : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
- Type *Ty = V->getType();
- PtrTy = dyn_cast<PointerType>(Ty);
- if (PtrTy)
- Ty = PtrTy->getElementType();
- Size = cast<FixedVectorType>(Ty)->getNumElements();
- if (!CachePtr)
- Tmp.resize(Size, nullptr);
- else if (CachePtr->empty())
- CachePtr->resize(Size, nullptr);
- else
- assert(Size == CachePtr->size() && "Inconsistent vector sizes");
-}
-
-// Return component I, creating a new Value for it if necessary.
-Value *Scatterer::operator[](unsigned I) {
- ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
- // Try to reuse a previous value.
- if (CV[I])
- return CV[I];
- IRBuilder<> Builder(BB, BBI);
- if (PtrTy) {
- Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
- if (!CV[0]) {
- Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
- CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
- }
- if (I != 0)
- CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I,
- V->getName() + ".i" + Twine(I));
- } else {
- // Search through a chain of InsertElementInsts looking for element I.
- // Record other elements in the cache. The new V is still suitable
- // for all uncached indices.
- while (true) {
- InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
- if (!Insert)
- break;
- ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
- if (!Idx)
- break;
- unsigned J = Idx->getZExtValue();
- V = Insert->getOperand(0);
- if (I == J) {
- CV[J] = Insert->getOperand(1);
- return CV[J];
- } else if (!CV[J]) {
- // Only cache the first entry we find for each index we're not actively
- // searching for. This prevents us from going too far up the chain and
- // caching incorrect entries.
- CV[J] = Insert->getOperand(1);
- }
- }
- CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
- V->getName() + ".i" + Twine(I));
- }
- return CV[I];
-}
-
-bool ScalarizerLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- Module &M = *F.getParent();
- unsigned ParallelLoopAccessMDKind =
- M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
- return Impl.visit(F);
-}
-
-FunctionPass *llvm::createScalarizerPass() {
- return new ScalarizerLegacyPass();
-}
-
-bool ScalarizerVisitor::visit(Function &F) {
- assert(Gathered.empty() && Scattered.empty());
-
- // To ensure we replace gathered components correctly we need to do an ordered
- // traversal of the basic blocks in the function.
- ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock());
- for (BasicBlock *BB : RPOT) {
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
- Instruction *I = &*II;
- bool Done = InstVisitor::visit(I);
- ++II;
- if (Done && I->getType()->isVoidTy())
- I->eraseFromParent();
- }
- }
- return finish();
-}
-
-// Return a scattered form of V that can be accessed by Point. V must be a
-// vector or a pointer to a vector.
-Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
- if (Argument *VArg = dyn_cast<Argument>(V)) {
- // Put the scattered form of arguments in the entry block,
- // so that it can be used everywhere.
- Function *F = VArg->getParent();
- BasicBlock *BB = &F->getEntryBlock();
- return Scatterer(BB, BB->begin(), V, &Scattered[V]);
- }
- if (Instruction *VOp = dyn_cast<Instruction>(V)) {
- // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
- // nodes in predecessors. If those predecessors are unreachable from entry,
- // then the IR in those blocks could have unexpected properties resulting in
- // infinite loops in Scatterer::operator[]. By simply treating values
- // originating from instructions in unreachable blocks as undef we do not
- // need to analyse them further.
- if (!DT->isReachableFromEntry(VOp->getParent()))
- return Scatterer(Point->getParent(), Point->getIterator(),
- UndefValue::get(V->getType()));
- // Put the scattered form of an instruction directly after the
- // instruction.
- BasicBlock *BB = VOp->getParent();
- return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
- V, &Scattered[V]);
- }
- // In the fallback case, just put the scattered before Point and
- // keep the result local to Point.
- return Scatterer(Point->getParent(), Point->getIterator(), V);
-}
-
-// Replace Op with the gathered form of the components in CV. Defer the
-// deletion of Op and creation of the gathered form to the end of the pass,
-// so that we can avoid creating the gathered form if all uses of Op are
-// replaced with uses of CV.
-void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
- transferMetadataAndIRFlags(Op, CV);
-
- // If we already have a scattered form of Op (created from ExtractElements
- // of Op itself), replace them with the new form.
- ValueVector &SV = Scattered[Op];
- if (!SV.empty()) {
- for (unsigned I = 0, E = SV.size(); I != E; ++I) {
- Value *V = SV[I];
- if (V == nullptr || SV[I] == CV[I])
- continue;
-
- Instruction *Old = cast<Instruction>(V);
+//===- Scalarizer.cpp - Scalarize vector operations -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass converts vector operations into scalar operations, in order
+// to expose optimization opportunities on the individual scalar operations.
+// It is mainly intended for targets that do not have vector units, but it
+// may also be useful for revectorizing code to different vector widths.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Scalarizer.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarizer"
+
+static cl::opt<bool> ScalarizeVariableInsertExtract(
+ "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
+ cl::desc("Allow the scalarizer pass to scalarize "
+ "insertelement/extractelement with variable index"));
+
+// This is disabled by default because having separate loads and stores
+// makes it more likely that the -combiner-alias-analysis limits will be
+// reached.
+static cl::opt<bool>
+ ScalarizeLoadStore("scalarize-load-store", cl::init(false), cl::Hidden,
+                       cl::desc("Allow the scalarizer pass to scalarize loads and stores"));
+
+namespace {
+
+// Used to store the scattered form of a vector.
+using ValueVector = SmallVector<Value *, 8>;
+
+// Used to map a vector Value to its scattered form. We use std::map
+// because we want iterators to persist across insertion and because the
+// values are relatively large.
+using ScatterMap = std::map<Value *, ValueVector>;
+
+// Lists Instructions that have been replaced with scalar implementations,
+// along with a pointer to their scattered forms.
+using GatherList = SmallVector<std::pair<Instruction *, ValueVector *>, 16>;
+
+// Provides a very limited vector-like interface for lazily accessing one
+// component of a scattered vector or vector pointer.
+class Scatterer {
+public:
+ Scatterer() = default;
+
+ // Scatter V into Size components. If new instructions are needed,
+ // insert them before BBI in BB. If Cache is nonnull, use it to cache
+ // the results.
+ Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr = nullptr);
+
+ // Return component I, creating a new Value for it if necessary.
+ Value *operator[](unsigned I);
+
+ // Return the number of components.
+ unsigned size() const { return Size; }
+
+private:
+ BasicBlock *BB;
+ BasicBlock::iterator BBI;
+ Value *V;
+ ValueVector *CachePtr;
+ PointerType *PtrTy;
+ ValueVector Tmp;
+ unsigned Size;
+};
+
+// FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp
+// called Name that compares X and Y in the same way as FCI.
+struct FCmpSplitter {
+ FCmpSplitter(FCmpInst &fci) : FCI(fci) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateFCmp(FCI.getPredicate(), Op0, Op1, Name);
+ }
+
+ FCmpInst &FCI;
+};
+
+// ICmpSplitter(ICI)(Builder, X, Y, Name) uses Builder to create an ICmp
+// called Name that compares X and Y in the same way as ICI.
+struct ICmpSplitter {
+ ICmpSplitter(ICmpInst &ici) : ICI(ici) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateICmp(ICI.getPredicate(), Op0, Op1, Name);
+ }
+
+ ICmpInst &ICI;
+};
+
+// UnarySplitter(UO)(Builder, X, Name) uses Builder to create
+// a unary operator like UO called Name with operand X.
+struct UnarySplitter {
+ UnarySplitter(UnaryOperator &uo) : UO(uo) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op, const Twine &Name) const {
+ return Builder.CreateUnOp(UO.getOpcode(), Op, Name);
+ }
+
+ UnaryOperator &UO;
+};
+
+// BinarySplitter(BO)(Builder, X, Y, Name) uses Builder to create
+// a binary operator like BO called Name with operands X and Y.
+struct BinarySplitter {
+ BinarySplitter(BinaryOperator &bo) : BO(bo) {}
+
+ Value *operator()(IRBuilder<> &Builder, Value *Op0, Value *Op1,
+ const Twine &Name) const {
+ return Builder.CreateBinOp(BO.getOpcode(), Op0, Op1, Name);
+ }
+
+ BinaryOperator &BO;
+};
+
+// Information about a load or store that we're scalarizing.
+struct VectorLayout {
+ VectorLayout() = default;
+
+ // Return the alignment of element I.
+ Align getElemAlign(unsigned I) {
+ return commonAlignment(VecAlign, I * ElemSize);
+ }
+
+ // The type of the vector.
+ VectorType *VecTy = nullptr;
+
+ // The type of each element.
+ Type *ElemTy = nullptr;
+
+ // The alignment of the vector.
+ Align VecAlign;
+
+ // The size of each element.
+ uint64_t ElemSize = 0;
+};
+
+class ScalarizerVisitor : public InstVisitor<ScalarizerVisitor, bool> {
+public:
+ ScalarizerVisitor(unsigned ParallelLoopAccessMDKind, DominatorTree *DT)
+ : ParallelLoopAccessMDKind(ParallelLoopAccessMDKind), DT(DT) {
+ }
+
+ bool visit(Function &F);
+
+ // InstVisitor methods. They return true if the instruction was scalarized,
+ // false if nothing changed.
+ bool visitInstruction(Instruction &I) { return false; }
+ bool visitSelectInst(SelectInst &SI);
+ bool visitICmpInst(ICmpInst &ICI);
+ bool visitFCmpInst(FCmpInst &FCI);
+ bool visitUnaryOperator(UnaryOperator &UO);
+ bool visitBinaryOperator(BinaryOperator &BO);
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
+ bool visitCastInst(CastInst &CI);
+ bool visitBitCastInst(BitCastInst &BCI);
+ bool visitInsertElementInst(InsertElementInst &IEI);
+ bool visitExtractElementInst(ExtractElementInst &EEI);
+ bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
+ bool visitPHINode(PHINode &PHI);
+ bool visitLoadInst(LoadInst &LI);
+ bool visitStoreInst(StoreInst &SI);
+ bool visitCallInst(CallInst &ICI);
+
+private:
+ Scatterer scatter(Instruction *Point, Value *V);
+ void gather(Instruction *Op, const ValueVector &CV);
+ bool canTransferMetadata(unsigned Kind);
+ void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
+ Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL);
+ bool finish();
+
+ template<typename T> bool splitUnary(Instruction &, const T &);
+ template<typename T> bool splitBinary(Instruction &, const T &);
+
+ bool splitCall(CallInst &CI);
+
+ ScatterMap Scattered;
+ GatherList Gathered;
+
+ SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
+
+ unsigned ParallelLoopAccessMDKind;
+
+ DominatorTree *DT;
+};
+
+class ScalarizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ ScalarizerLegacyPass() : FunctionPass(ID) {
+ initializeScalarizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage& AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+char ScalarizerLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizerLegacyPass, "scalarizer",
+ "Scalarize vector operations", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer",
+ "Scalarize vector operations", false, false)
+
+Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
+ ValueVector *cachePtr)
+ : BB(bb), BBI(bbi), V(v), CachePtr(cachePtr) {
+ Type *Ty = V->getType();
+ PtrTy = dyn_cast<PointerType>(Ty);
+ if (PtrTy)
+ Ty = PtrTy->getElementType();
+ Size = cast<FixedVectorType>(Ty)->getNumElements();
+ if (!CachePtr)
+ Tmp.resize(Size, nullptr);
+ else if (CachePtr->empty())
+ CachePtr->resize(Size, nullptr);
+ else
+ assert(Size == CachePtr->size() && "Inconsistent vector sizes");
+}
+
+// Return component I, creating a new Value for it if necessary.
+Value *Scatterer::operator[](unsigned I) {
+ ValueVector &CV = (CachePtr ? *CachePtr : Tmp);
+ // Try to reuse a previous value.
+ if (CV[I])
+ return CV[I];
+ IRBuilder<> Builder(BB, BBI);
+ if (PtrTy) {
+ Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
+ if (!CV[0]) {
+ Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
+ CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
+ }
+ if (I != 0)
+ CV[I] = Builder.CreateConstGEP1_32(ElTy, CV[0], I,
+ V->getName() + ".i" + Twine(I));
+ } else {
+ // Search through a chain of InsertElementInsts looking for element I.
+ // Record other elements in the cache. The new V is still suitable
+ // for all uncached indices.
+ while (true) {
+ InsertElementInst *Insert = dyn_cast<InsertElementInst>(V);
+ if (!Insert)
+ break;
+ ConstantInt *Idx = dyn_cast<ConstantInt>(Insert->getOperand(2));
+ if (!Idx)
+ break;
+ unsigned J = Idx->getZExtValue();
+ V = Insert->getOperand(0);
+ if (I == J) {
+ CV[J] = Insert->getOperand(1);
+ return CV[J];
+ } else if (!CV[J]) {
+ // Only cache the first entry we find for each index we're not actively
+ // searching for. This prevents us from going too far up the chain and
+ // caching incorrect entries.
+ CV[J] = Insert->getOperand(1);
+ }
+ }
+ CV[I] = Builder.CreateExtractElement(V, Builder.getInt32(I),
+ V->getName() + ".i" + Twine(I));
+ }
+ return CV[I];
+}
+
+bool ScalarizerLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ Module &M = *F.getParent();
+ unsigned ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
+ return Impl.visit(F);
+}
+
+FunctionPass *llvm::createScalarizerPass() {
+ return new ScalarizerLegacyPass();
+}
+
+bool ScalarizerVisitor::visit(Function &F) {
+ assert(Gathered.empty() && Scattered.empty());
+
+ // To ensure we replace gathered components correctly we need to do an ordered
+ // traversal of the basic blocks in the function.
+ ReversePostOrderTraversal<BasicBlock *> RPOT(&F.getEntryBlock());
+ for (BasicBlock *BB : RPOT) {
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+ Instruction *I = &*II;
+ bool Done = InstVisitor::visit(I);
+ ++II;
+ if (Done && I->getType()->isVoidTy())
+ I->eraseFromParent();
+ }
+ }
+ return finish();
+}
+
+// Return a scattered form of V that can be accessed by Point. V must be a
+// vector or a pointer to a vector.
+Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
+ if (Argument *VArg = dyn_cast<Argument>(V)) {
+ // Put the scattered form of arguments in the entry block,
+ // so that it can be used everywhere.
+ Function *F = VArg->getParent();
+ BasicBlock *BB = &F->getEntryBlock();
+ return Scatterer(BB, BB->begin(), V, &Scattered[V]);
+ }
+ if (Instruction *VOp = dyn_cast<Instruction>(V)) {
+ // When scalarizing PHI nodes we might try to examine/rewrite InsertElement
+ // nodes in predecessors. If those predecessors are unreachable from entry,
+ // then the IR in those blocks could have unexpected properties resulting in
+ // infinite loops in Scatterer::operator[]. By simply treating values
+ // originating from instructions in unreachable blocks as undef we do not
+ // need to analyse them further.
+ if (!DT->isReachableFromEntry(VOp->getParent()))
+ return Scatterer(Point->getParent(), Point->getIterator(),
+ UndefValue::get(V->getType()));
+ // Put the scattered form of an instruction directly after the
+ // instruction.
+ BasicBlock *BB = VOp->getParent();
+ return Scatterer(BB, std::next(BasicBlock::iterator(VOp)),
+ V, &Scattered[V]);
+ }
+ // In the fallback case, just put the scattered before Point and
+ // keep the result local to Point.
+ return Scatterer(Point->getParent(), Point->getIterator(), V);
+}
+
+// Replace Op with the gathered form of the components in CV. Defer the
+// deletion of Op and creation of the gathered form to the end of the pass,
+// so that we can avoid creating the gathered form if all uses of Op are
+// replaced with uses of CV.
+void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
+ transferMetadataAndIRFlags(Op, CV);
+
+ // If we already have a scattered form of Op (created from ExtractElements
+ // of Op itself), replace them with the new form.
+ ValueVector &SV = Scattered[Op];
+ if (!SV.empty()) {
+ for (unsigned I = 0, E = SV.size(); I != E; ++I) {
+ Value *V = SV[I];
+ if (V == nullptr || SV[I] == CV[I])
+ continue;
+
+ Instruction *Old = cast<Instruction>(V);
if (isa<Instruction>(CV[I]))
CV[I]->takeName(Old);
- Old->replaceAllUsesWith(CV[I]);
- PotentiallyDeadInstrs.emplace_back(Old);
- }
- }
- SV = CV;
- Gathered.push_back(GatherList::value_type(Op, &SV));
-}
-
-// Return true if it is safe to transfer the given metadata tag from
-// vector to scalar instructions.
-bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
- return (Tag == LLVMContext::MD_tbaa
- || Tag == LLVMContext::MD_fpmath
- || Tag == LLVMContext::MD_tbaa_struct
- || Tag == LLVMContext::MD_invariant_load
- || Tag == LLVMContext::MD_alias_scope
- || Tag == LLVMContext::MD_noalias
- || Tag == ParallelLoopAccessMDKind
- || Tag == LLVMContext::MD_access_group);
-}
-
-// Transfer metadata from Op to the instructions in CV if it is known
-// to be safe to do so.
-void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
- const ValueVector &CV) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- Op->getAllMetadataOtherThanDebugLoc(MDs);
- for (unsigned I = 0, E = CV.size(); I != E; ++I) {
- if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
- for (const auto &MD : MDs)
- if (canTransferMetadata(MD.first))
- New->setMetadata(MD.first, MD.second);
- New->copyIRFlags(Op);
- if (Op->getDebugLoc() && !New->getDebugLoc())
- New->setDebugLoc(Op->getDebugLoc());
- }
- }
-}
-
-// Try to fill in Layout from Ty, returning true on success. Alignment is
-// the alignment of the vector, or None if the ABI default should be used.
-Optional<VectorLayout>
-ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
- const DataLayout &DL) {
- VectorLayout Layout;
- // Make sure we're dealing with a vector.
- Layout.VecTy = dyn_cast<VectorType>(Ty);
- if (!Layout.VecTy)
- return None;
- // Check that we're dealing with full-byte elements.
- Layout.ElemTy = Layout.VecTy->getElementType();
- if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy))
- return None;
- Layout.VecAlign = Alignment;
- Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
- return Layout;
-}
-
-// Scalarize one-operand instruction I, using Split(Builder, X, Name)
-// to create an instruction like I with operand X and name Name.
-template<typename Splitter>
-bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
- VectorType *VT = dyn_cast<VectorType>(I.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&I);
- Scatterer Op = scatter(&I, I.getOperand(0));
- assert(Op.size() == NumElems && "Mismatched unary operation");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned Elem = 0; Elem < NumElems; ++Elem)
- Res[Elem] = Split(Builder, Op[Elem], I.getName() + ".i" + Twine(Elem));
- gather(&I, Res);
- return true;
-}
-
-// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
-// to create an instruction like I with operands X and Y and name Name.
-template<typename Splitter>
-bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
- VectorType *VT = dyn_cast<VectorType>(I.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&I);
- Scatterer VOp0 = scatter(&I, I.getOperand(0));
- Scatterer VOp1 = scatter(&I, I.getOperand(1));
- assert(VOp0.size() == NumElems && "Mismatched binary operation");
- assert(VOp1.size() == NumElems && "Mismatched binary operation");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
- Value *Op0 = VOp0[Elem];
- Value *Op1 = VOp1[Elem];
- Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem));
- }
- gather(&I, Res);
- return true;
-}
-
-static bool isTriviallyScalariable(Intrinsic::ID ID) {
- return isTriviallyVectorizable(ID);
-}
-
-// All of the current scalarizable intrinsics only have one mangled type.
-static Function *getScalarIntrinsicDeclaration(Module *M,
- Intrinsic::ID ID,
- VectorType *Ty) {
- return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
-}
-
-/// If a call to a vector typed intrinsic function, split into a scalar call per
-/// element if possible for the intrinsic.
-bool ScalarizerVisitor::splitCall(CallInst &CI) {
- VectorType *VT = dyn_cast<VectorType>(CI.getType());
- if (!VT)
- return false;
-
- Function *F = CI.getCalledFunction();
- if (!F)
- return false;
-
- Intrinsic::ID ID = F->getIntrinsicID();
- if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- unsigned NumArgs = CI.getNumArgOperands();
-
- ValueVector ScalarOperands(NumArgs);
- SmallVector<Scatterer, 8> Scattered(NumArgs);
-
- Scattered.resize(NumArgs);
-
- // Assumes that any vector type has the same number of elements as the return
- // vector type, which is true for all current intrinsics.
- for (unsigned I = 0; I != NumArgs; ++I) {
- Value *OpI = CI.getOperand(I);
- if (OpI->getType()->isVectorTy()) {
- Scattered[I] = scatter(&CI, OpI);
- assert(Scattered[I].size() == NumElems && "mismatched call operands");
- } else {
- ScalarOperands[I] = OpI;
- }
- }
-
- ValueVector Res(NumElems);
- ValueVector ScalarCallOps(NumArgs);
-
- Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
- IRBuilder<> Builder(&CI);
-
- // Perform actual scalarization, taking care to preserve any scalar operands.
- for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
- ScalarCallOps.clear();
-
- for (unsigned J = 0; J != NumArgs; ++J) {
- if (hasVectorInstrinsicScalarOpd(ID, J))
- ScalarCallOps.push_back(ScalarOperands[J]);
- else
- ScalarCallOps.push_back(Scattered[J][Elem]);
- }
-
- Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
- CI.getName() + ".i" + Twine(Elem));
- }
-
- gather(&CI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
- VectorType *VT = dyn_cast<VectorType>(SI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&SI);
- Scatterer VOp1 = scatter(&SI, SI.getOperand(1));
- Scatterer VOp2 = scatter(&SI, SI.getOperand(2));
- assert(VOp1.size() == NumElems && "Mismatched select");
- assert(VOp2.size() == NumElems && "Mismatched select");
- ValueVector Res;
- Res.resize(NumElems);
-
- if (SI.getOperand(0)->getType()->isVectorTy()) {
- Scatterer VOp0 = scatter(&SI, SI.getOperand(0));
- assert(VOp0.size() == NumElems && "Mismatched select");
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Op0 = VOp0[I];
- Value *Op1 = VOp1[I];
- Value *Op2 = VOp2[I];
- Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
- SI.getName() + ".i" + Twine(I));
- }
- } else {
- Value *Op0 = SI.getOperand(0);
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Op1 = VOp1[I];
- Value *Op2 = VOp2[I];
- Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
- SI.getName() + ".i" + Twine(I));
- }
- }
- gather(&SI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
- return splitBinary(ICI, ICmpSplitter(ICI));
-}
-
-bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
- return splitBinary(FCI, FCmpSplitter(FCI));
-}
-
-bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) {
- return splitUnary(UO, UnarySplitter(UO));
-}
-
-bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
- return splitBinary(BO, BinarySplitter(BO));
-}
-
-bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
- VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
- if (!VT)
- return false;
-
- IRBuilder<> Builder(&GEPI);
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- unsigned NumIndices = GEPI.getNumIndices();
-
- // The base pointer might be scalar even if it's a vector GEP. In those cases,
- // splat the pointer into a vector value, and scatter that vector.
- Value *Op0 = GEPI.getOperand(0);
- if (!Op0->getType()->isVectorTy())
- Op0 = Builder.CreateVectorSplat(NumElems, Op0);
- Scatterer Base = scatter(&GEPI, Op0);
-
- SmallVector<Scatterer, 8> Ops;
- Ops.resize(NumIndices);
- for (unsigned I = 0; I < NumIndices; ++I) {
- Value *Op = GEPI.getOperand(I + 1);
-
- // The indices might be scalars even if it's a vector GEP. In those cases,
- // splat the scalar into a vector value, and scatter that vector.
- if (!Op->getType()->isVectorTy())
- Op = Builder.CreateVectorSplat(NumElems, Op);
-
- Ops[I] = scatter(&GEPI, Op);
- }
-
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I) {
- SmallVector<Value *, 8> Indices;
- Indices.resize(NumIndices);
- for (unsigned J = 0; J < NumIndices; ++J)
- Indices[J] = Ops[J][I];
- Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
- GEPI.getName() + ".i" + Twine(I));
- if (GEPI.isInBounds())
- if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
- NewGEPI->setIsInBounds();
- }
- gather(&GEPI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
- VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&CI);
- Scatterer Op0 = scatter(&CI, CI.getOperand(0));
- assert(Op0.size() == NumElems && "Mismatched cast");
- ValueVector Res;
- Res.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
- CI.getName() + ".i" + Twine(I));
- gather(&CI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
- VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
- VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
- if (!DstVT || !SrcVT)
- return false;
-
- unsigned DstNumElems = cast<FixedVectorType>(DstVT)->getNumElements();
- unsigned SrcNumElems = cast<FixedVectorType>(SrcVT)->getNumElements();
- IRBuilder<> Builder(&BCI);
- Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
- ValueVector Res;
- Res.resize(DstNumElems);
-
- if (DstNumElems == SrcNumElems) {
- for (unsigned I = 0; I < DstNumElems; ++I)
- Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
- BCI.getName() + ".i" + Twine(I));
- } else if (DstNumElems > SrcNumElems) {
- // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
- // individual elements to the destination.
- unsigned FanOut = DstNumElems / SrcNumElems;
- auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut);
- unsigned ResI = 0;
- for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
- Value *V = Op0[Op0I];
- Instruction *VI;
- // Look through any existing bitcasts before converting to <N x t2>.
- // In the best case, the resulting conversion might be a no-op.
- while ((VI = dyn_cast<Instruction>(V)) &&
- VI->getOpcode() == Instruction::BitCast)
- V = VI->getOperand(0);
- V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
- Scatterer Mid = scatter(&BCI, V);
- for (unsigned MidI = 0; MidI < FanOut; ++MidI)
- Res[ResI++] = Mid[MidI];
- }
- } else {
- // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
- unsigned FanIn = SrcNumElems / DstNumElems;
- auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
- unsigned Op0I = 0;
- for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
+ Old->replaceAllUsesWith(CV[I]);
+ PotentiallyDeadInstrs.emplace_back(Old);
+ }
+ }
+ SV = CV;
+ Gathered.push_back(GatherList::value_type(Op, &SV));
+}
+
+// Return true if it is safe to transfer the given metadata tag from
+// vector to scalar instructions.
+bool ScalarizerVisitor::canTransferMetadata(unsigned Tag) {
+ return (Tag == LLVMContext::MD_tbaa
+ || Tag == LLVMContext::MD_fpmath
+ || Tag == LLVMContext::MD_tbaa_struct
+ || Tag == LLVMContext::MD_invariant_load
+ || Tag == LLVMContext::MD_alias_scope
+ || Tag == LLVMContext::MD_noalias
+ || Tag == ParallelLoopAccessMDKind
+ || Tag == LLVMContext::MD_access_group);
+}
+
+// Transfer metadata from Op to the instructions in CV if it is known
+// to be safe to do so.
+void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
+ const ValueVector &CV) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ Op->getAllMetadataOtherThanDebugLoc(MDs);
+ for (unsigned I = 0, E = CV.size(); I != E; ++I) {
+ if (Instruction *New = dyn_cast<Instruction>(CV[I])) {
+ for (const auto &MD : MDs)
+ if (canTransferMetadata(MD.first))
+ New->setMetadata(MD.first, MD.second);
+ New->copyIRFlags(Op);
+ if (Op->getDebugLoc() && !New->getDebugLoc())
+ New->setDebugLoc(Op->getDebugLoc());
+ }
+ }
+}
+
+// Try to compute the VectorLayout of Ty, returning None on failure. Alignment
+// is the alignment of the vector, or None if the ABI default should be used.
+Optional<VectorLayout>
+ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL) {
+ VectorLayout Layout;
+ // Make sure we're dealing with a vector.
+ Layout.VecTy = dyn_cast<VectorType>(Ty);
+ if (!Layout.VecTy)
+ return None;
+ // Check that we're dealing with full-byte elements.
+ Layout.ElemTy = Layout.VecTy->getElementType();
+ if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy))
+ return None;
+ Layout.VecAlign = Alignment;
+ Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
+ return Layout;
+}
+
+// Scalarize one-operand instruction I, using Split(Builder, X, Name)
+// to create an instruction like I with operand X and name Name.
+template<typename Splitter>
+bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer Op = scatter(&I, I.getOperand(0));
+ assert(Op.size() == NumElems && "Mismatched unary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem)
+ Res[Elem] = Split(Builder, Op[Elem], I.getName() + ".i" + Twine(Elem));
+ gather(&I, Res);
+ return true;
+}
+
+// Scalarize two-operand instruction I, using Split(Builder, X, Y, Name)
+// to create an instruction like I with operands X and Y and name Name.
+template<typename Splitter>
+bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
+ VectorType *VT = dyn_cast<VectorType>(I.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&I);
+ Scatterer VOp0 = scatter(&I, I.getOperand(0));
+ Scatterer VOp1 = scatter(&I, I.getOperand(1));
+ assert(VOp0.size() == NumElems && "Mismatched binary operation");
+ assert(VOp1.size() == NumElems && "Mismatched binary operation");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ Value *Op0 = VOp0[Elem];
+ Value *Op1 = VOp1[Elem];
+ Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem));
+ }
+ gather(&I, Res);
+ return true;
+}
+
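For orientation, here is a minimal sketch of the rewrite that splitUnary/splitBinary drive, assuming a plain fixed-width vector add; the function and the lane names (%a.i0, %sum.upto0, ...) are illustrative and merely follow the ".i"/".upto" naming conventions used in this file:

  ; Hypothetical input:
  define <4 x i32> @add4(<4 x i32> %a, <4 x i32> %b) {
    %sum = add <4 x i32> %a, %b
    ret <4 x i32> %sum
  }

  ; Roughly what splitBinary plus the later gather/finish steps produce:
  ;   %a.i0 = extractelement <4 x i32> %a, i32 0
  ;   %b.i0 = extractelement <4 x i32> %b, i32 0
  ;   %sum.i0 = add i32 %a.i0, %b.i0
  ;   ... same for lanes 1 to 3 ...
  ;   %sum.upto0 = insertelement <4 x i32> poison, i32 %sum.i0, i32 0
  ;   ... remaining insertelements rebuild the <4 x i32> return value ...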
+static bool isTriviallyScalariable(Intrinsic::ID ID) {
+ return isTriviallyVectorizable(ID);
+}
+
+// All of the current scalarizable intrinsics only have one mangled type.
+static Function *getScalarIntrinsicDeclaration(Module *M,
+ Intrinsic::ID ID,
+ VectorType *Ty) {
+ return Intrinsic::getDeclaration(M, ID, { Ty->getScalarType() });
+}
+
+/// If CI is a call to a vector-typed intrinsic function, split it into one
+/// scalar call per element when the intrinsic supports that.
+bool ScalarizerVisitor::splitCall(CallInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getType());
+ if (!VT)
+ return false;
+
+ Function *F = CI.getCalledFunction();
+ if (!F)
+ return false;
+
+ Intrinsic::ID ID = F->getIntrinsicID();
+ if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ unsigned NumArgs = CI.getNumArgOperands();
+
+ ValueVector ScalarOperands(NumArgs);
+ SmallVector<Scatterer, 8> Scattered(NumArgs);
+
+ Scattered.resize(NumArgs);
+
+ // Assumes that any vector type has the same number of elements as the return
+ // vector type, which is true for all current intrinsics.
+ for (unsigned I = 0; I != NumArgs; ++I) {
+ Value *OpI = CI.getOperand(I);
+ if (OpI->getType()->isVectorTy()) {
+ Scattered[I] = scatter(&CI, OpI);
+ assert(Scattered[I].size() == NumElems && "mismatched call operands");
+ } else {
+ ScalarOperands[I] = OpI;
+ }
+ }
+
+ ValueVector Res(NumElems);
+ ValueVector ScalarCallOps(NumArgs);
+
+ Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, VT);
+ IRBuilder<> Builder(&CI);
+
+ // Perform actual scalarization, taking care to preserve any scalar operands.
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ ScalarCallOps.clear();
+
+ for (unsigned J = 0; J != NumArgs; ++J) {
+ if (hasVectorInstrinsicScalarOpd(ID, J))
+ ScalarCallOps.push_back(ScalarOperands[J]);
+ else
+ ScalarCallOps.push_back(Scattered[J][Elem]);
+ }
+
+ Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps,
+ CI.getName() + ".i" + Twine(Elem));
+ }
+
+ gather(&CI, Res);
+ return true;
+}
+
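As a hedged illustration of splitCall, consider a trivially vectorizable intrinsic such as llvm.fabs; the scalar declaration llvm.fabs.f64 is what getScalarIntrinsicDeclaration resolves to here, while the value names are only indicative:

  ; Hypothetical input:
  declare <2 x double> @llvm.fabs.v2f64(<2 x double>)
  define <2 x double> @abs2(<2 x double> %x) {
    %r = call <2 x double> @llvm.fabs.v2f64(<2 x double> %x)
    ret <2 x double> %r
  }

  ; Each lane is retargeted at the scalar declaration:
  ;   %x.i0 = extractelement <2 x double> %x, i32 0
  ;   %r.i0 = call double @llvm.fabs.f64(double %x.i0)
  ;   %x.i1 = extractelement <2 x double> %x, i32 1
  ;   %r.i1 = call double @llvm.fabs.f64(double %x.i1)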
+bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
+ VectorType *VT = dyn_cast<VectorType>(SI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer VOp1 = scatter(&SI, SI.getOperand(1));
+ Scatterer VOp2 = scatter(&SI, SI.getOperand(2));
+ assert(VOp1.size() == NumElems && "Mismatched select");
+ assert(VOp2.size() == NumElems && "Mismatched select");
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (SI.getOperand(0)->getType()->isVectorTy()) {
+ Scatterer VOp0 = scatter(&SI, SI.getOperand(0));
+ assert(VOp0.size() == NumElems && "Mismatched select");
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op0 = VOp0[I];
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
+ SI.getName() + ".i" + Twine(I));
+ }
+ } else {
+ Value *Op0 = SI.getOperand(0);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
+ SI.getName() + ".i" + Twine(I));
+ }
+ }
+ gather(&SI, Res);
+ return true;
+}
+
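For visitSelectInst with a scalar condition, the single i1 is simply reused for every lane; a short sketch with illustrative names:

  ; Hypothetical input:
  define <2 x i32> @pick(i1 %c, <2 x i32> %a, <2 x i32> %b) {
    %r = select i1 %c, <2 x i32> %a, <2 x i32> %b
    ret <2 x i32> %r
  }

  ; Per-lane selects, all keyed off the same scalar condition:
  ;   %r.i0 = select i1 %c, i32 %a.i0, i32 %b.i0
  ;   %r.i1 = select i1 %c, i32 %a.i1, i32 %b.i1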
+bool ScalarizerVisitor::visitICmpInst(ICmpInst &ICI) {
+ return splitBinary(ICI, ICmpSplitter(ICI));
+}
+
+bool ScalarizerVisitor::visitFCmpInst(FCmpInst &FCI) {
+ return splitBinary(FCI, FCmpSplitter(FCI));
+}
+
+bool ScalarizerVisitor::visitUnaryOperator(UnaryOperator &UO) {
+ return splitUnary(UO, UnarySplitter(UO));
+}
+
+bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) {
+ return splitBinary(BO, BinarySplitter(BO));
+}
+
+bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ VectorType *VT = dyn_cast<VectorType>(GEPI.getType());
+ if (!VT)
+ return false;
+
+ IRBuilder<> Builder(&GEPI);
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ unsigned NumIndices = GEPI.getNumIndices();
+
+ // The base pointer might be scalar even if it's a vector GEP. In those cases,
+ // splat the pointer into a vector value, and scatter that vector.
+ Value *Op0 = GEPI.getOperand(0);
+ if (!Op0->getType()->isVectorTy())
+ Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+ Scatterer Base = scatter(&GEPI, Op0);
+
+ SmallVector<Scatterer, 8> Ops;
+ Ops.resize(NumIndices);
+ for (unsigned I = 0; I < NumIndices; ++I) {
+ Value *Op = GEPI.getOperand(I + 1);
+
+ // The indices might be scalars even if it's a vector GEP. In those cases,
+ // splat the scalar into a vector value, and scatter that vector.
+ if (!Op->getType()->isVectorTy())
+ Op = Builder.CreateVectorSplat(NumElems, Op);
+
+ Ops[I] = scatter(&GEPI, Op);
+ }
+
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ SmallVector<Value *, 8> Indices;
+ Indices.resize(NumIndices);
+ for (unsigned J = 0; J < NumIndices; ++J)
+ Indices[J] = Ops[J][I];
+ Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices,
+ GEPI.getName() + ".i" + Twine(I));
+ if (GEPI.isInBounds())
+ if (GetElementPtrInst *NewGEPI = dyn_cast<GetElementPtrInst>(Res[I]))
+ NewGEPI->setIsInBounds();
+ }
+ gather(&GEPI, Res);
+ return true;
+}
+
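The splat-then-scatter handling of scalar bases and indices in visitGetElementPtrInst can be pictured with a small vector GEP; this is a hypothetical example, and the splat/extract plumbing for %p is omitted from the sketch:

  ; Hypothetical input: a vector GEP with a scalar base pointer.
  define <2 x i32*> @gep2(i32* %p, <2 x i64> %idx) {
    %g = getelementptr inbounds i32, i32* %p, <2 x i64> %idx
    ret <2 x i32*> %g
  }

  ; After splatting the base and scattering everything, each lane becomes an
  ; ordinary scalar GEP with inbounds preserved:
  ;   %g.i0 = getelementptr inbounds i32, i32* %p, i64 %idx.i0
  ;   %g.i1 = getelementptr inbounds i32, i32* %p, i64 %idx.i1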
+bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
+ VectorType *VT = dyn_cast<VectorType>(CI.getDestTy());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&CI);
+ Scatterer Op0 = scatter(&CI, CI.getOperand(0));
+ assert(Op0.size() == NumElems && "Mismatched cast");
+ ValueVector Res;
+ Res.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(),
+ CI.getName() + ".i" + Twine(I));
+ gather(&CI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
+ VectorType *DstVT = dyn_cast<VectorType>(BCI.getDestTy());
+ VectorType *SrcVT = dyn_cast<VectorType>(BCI.getSrcTy());
+ if (!DstVT || !SrcVT)
+ return false;
+
+ unsigned DstNumElems = cast<FixedVectorType>(DstVT)->getNumElements();
+ unsigned SrcNumElems = cast<FixedVectorType>(SrcVT)->getNumElements();
+ IRBuilder<> Builder(&BCI);
+ Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
+ ValueVector Res;
+ Res.resize(DstNumElems);
+
+ if (DstNumElems == SrcNumElems) {
+ for (unsigned I = 0; I < DstNumElems; ++I)
+ Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(I));
+ } else if (DstNumElems > SrcNumElems) {
+ // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
+ // individual elements to the destination.
+ unsigned FanOut = DstNumElems / SrcNumElems;
+ auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut);
+ unsigned ResI = 0;
+ for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
+ Value *V = Op0[Op0I];
+ Instruction *VI;
+ // Look through any existing bitcasts before converting to <N x t2>.
+ // In the best case, the resulting conversion might be a no-op.
+ while ((VI = dyn_cast<Instruction>(V)) &&
+ VI->getOpcode() == Instruction::BitCast)
+ V = VI->getOperand(0);
+ V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast");
+ Scatterer Mid = scatter(&BCI, V);
+ for (unsigned MidI = 0; MidI < FanOut; ++MidI)
+ Res[ResI++] = Mid[MidI];
+ }
+ } else {
+ // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
+ unsigned FanIn = SrcNumElems / DstNumElems;
+ auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
+ unsigned Op0I = 0;
+ for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
Value *V = PoisonValue::get(MidTy);
- for (unsigned MidI = 0; MidI < FanIn; ++MidI)
- V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
- BCI.getName() + ".i" + Twine(ResI)
- + ".upto" + Twine(MidI));
- Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
- BCI.getName() + ".i" + Twine(ResI));
- }
- }
- gather(&BCI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
- VectorType *VT = dyn_cast<VectorType>(IEI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&IEI);
- Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
- Value *NewElt = IEI.getOperand(1);
- Value *InsIdx = IEI.getOperand(2);
-
- ValueVector Res;
- Res.resize(NumElems);
-
- if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = CI->getValue().getZExtValue() == I ? NewElt : Op0[I];
- } else {
- if (!ScalarizeVariableInsertExtract)
- return false;
-
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *ShouldReplace =
- Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
- InsIdx->getName() + ".is." + Twine(I));
- Value *OldElt = Op0[I];
- Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
- IEI.getName() + ".i" + Twine(I));
- }
- }
-
- gather(&IEI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
- VectorType *VT = dyn_cast<VectorType>(EEI.getOperand(0)->getType());
- if (!VT)
- return false;
-
- unsigned NumSrcElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&EEI);
- Scatterer Op0 = scatter(&EEI, EEI.getOperand(0));
- Value *ExtIdx = EEI.getOperand(1);
-
- if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
- Value *Res = Op0[CI->getValue().getZExtValue()];
- gather(&EEI, {Res});
- return true;
- }
-
- if (!ScalarizeVariableInsertExtract)
- return false;
-
- Value *Res = UndefValue::get(VT->getElementType());
- for (unsigned I = 0; I < NumSrcElems; ++I) {
- Value *ShouldExtract =
- Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
- ExtIdx->getName() + ".is." + Twine(I));
- Value *Elt = Op0[I];
- Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
- EEI.getName() + ".upto" + Twine(I));
- }
- gather(&EEI, {Res});
- return true;
-}
-
-bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
- VectorType *VT = dyn_cast<VectorType>(SVI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
- Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
- ValueVector Res;
- Res.resize(NumElems);
-
- for (unsigned I = 0; I < NumElems; ++I) {
- int Selector = SVI.getMaskValue(I);
- if (Selector < 0)
- Res[I] = UndefValue::get(VT->getElementType());
- else if (unsigned(Selector) < Op0.size())
- Res[I] = Op0[Selector];
- else
- Res[I] = Op1[Selector - Op0.size()];
- }
- gather(&SVI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
- VectorType *VT = dyn_cast<VectorType>(PHI.getType());
- if (!VT)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
- IRBuilder<> Builder(&PHI);
- ValueVector Res;
- Res.resize(NumElems);
-
- unsigned NumOps = PHI.getNumOperands();
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
- PHI.getName() + ".i" + Twine(I));
-
- for (unsigned I = 0; I < NumOps; ++I) {
- Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
- BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
- for (unsigned J = 0; J < NumElems; ++J)
- cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
- }
- gather(&PHI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
- if (!ScalarizeLoadStore)
- return false;
- if (!LI.isSimple())
- return false;
-
- Optional<VectorLayout> Layout = getVectorLayout(
- LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
- if (!Layout)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
- IRBuilder<> Builder(&LI);
- Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
- ValueVector Res;
- Res.resize(NumElems);
-
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I],
- Align(Layout->getElemAlign(I)),
- LI.getName() + ".i" + Twine(I));
- gather(&LI, Res);
- return true;
-}
-
-bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
- if (!ScalarizeLoadStore)
- return false;
- if (!SI.isSimple())
- return false;
-
- Value *FullValue = SI.getValueOperand();
- Optional<VectorLayout> Layout = getVectorLayout(
- FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
- if (!Layout)
- return false;
-
- unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
- IRBuilder<> Builder(&SI);
- Scatterer VPtr = scatter(&SI, SI.getPointerOperand());
- Scatterer VVal = scatter(&SI, FullValue);
-
- ValueVector Stores;
- Stores.resize(NumElems);
- for (unsigned I = 0; I < NumElems; ++I) {
- Value *Val = VVal[I];
- Value *Ptr = VPtr[I];
- Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I));
- }
- transferMetadataAndIRFlags(&SI, Stores);
- return true;
-}
-
-bool ScalarizerVisitor::visitCallInst(CallInst &CI) {
- return splitCall(CI);
-}
-
-// Delete the instructions that we scalarized. If a full vector result
-// is still needed, recreate it using InsertElements.
-bool ScalarizerVisitor::finish() {
- // The presence of data in Gathered or Scattered indicates changes
- // made to the Function.
- if (Gathered.empty() && Scattered.empty())
- return false;
- for (const auto &GMI : Gathered) {
- Instruction *Op = GMI.first;
- ValueVector &CV = *GMI.second;
- if (!Op->use_empty()) {
- // The value is still needed, so recreate it using a series of
- // InsertElements.
+ for (unsigned MidI = 0; MidI < FanIn; ++MidI)
+ V = Builder.CreateInsertElement(V, Op0[Op0I++], Builder.getInt32(MidI),
+ BCI.getName() + ".i" + Twine(ResI)
+ + ".upto" + Twine(MidI));
+ Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(),
+ BCI.getName() + ".i" + Twine(ResI));
+ }
+ }
+ gather(&BCI, Res);
+ return true;
+}
+
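The fan-in arm of visitBitCastInst (SrcNumElems > DstNumElems) groups source lanes into the intermediate MidTy before one final per-lane bitcast; a sketch under the assumption of a <4 x i16> to <2 x i32> cast, with illustrative names:

  ; Hypothetical input: two source lanes per destination lane.
  define <2 x i32> @narrow(<4 x i16> %v) {
    %c = bitcast <4 x i16> %v to <2 x i32>
    ret <2 x i32> %c
  }

  ; Each destination element is rebuilt from a <2 x i16> group, then bitcast:
  ;   %c.i0.upto0 = insertelement <2 x i16> poison, i16 %v.i0, i32 0
  ;   %c.i0.upto1 = insertelement <2 x i16> %c.i0.upto0, i16 %v.i1, i32 1
  ;   %c.i0 = bitcast <2 x i16> %c.i0.upto1 to i32
  ;   ... and likewise %c.i1 from %v.i2 and %v.i3 ...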
+bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+ VectorType *VT = dyn_cast<VectorType>(IEI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&IEI);
+ Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
+ Value *NewElt = IEI.getOperand(1);
+ Value *InsIdx = IEI.getOperand(2);
+
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = CI->getValue().getZExtValue() == I ? NewElt : Op0[I];
+ } else {
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *ShouldReplace =
+ Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
+ InsIdx->getName() + ".is." + Twine(I));
+ Value *OldElt = Op0[I];
+ Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
+ IEI.getName() + ".i" + Twine(I));
+ }
+ }
+
+ gather(&IEI, Res);
+ return true;
+}
+
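When the insertion index is not a constant, the branch above (guarded by ScalarizeVariableInsertExtract) turns the insert into one compare-and-select per lane; a sketch with hypothetical names:

  ; Hypothetical input with a variable insertion index:
  define <2 x float> @ins(<2 x float> %v, float %e, i32 %i) {
    %r = insertelement <2 x float> %v, float %e, i32 %i
    ret <2 x float> %r
  }

  ; Each lane keeps its old value unless the index matches:
  ;   %i.is.0 = icmp eq i32 %i, 0
  ;   %r.i0 = select i1 %i.is.0, float %e, float %v.i0
  ;   %i.is.1 = icmp eq i32 %i, 1
  ;   %r.i1 = select i1 %i.is.1, float %e, float %v.i1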
+bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+ VectorType *VT = dyn_cast<VectorType>(EEI.getOperand(0)->getType());
+ if (!VT)
+ return false;
+
+ unsigned NumSrcElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&EEI);
+ Scatterer Op0 = scatter(&EEI, EEI.getOperand(0));
+ Value *ExtIdx = EEI.getOperand(1);
+
+ if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
+ Value *Res = Op0[CI->getValue().getZExtValue()];
+ gather(&EEI, {Res});
+ return true;
+ }
+
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
+ Value *Res = UndefValue::get(VT->getElementType());
+ for (unsigned I = 0; I < NumSrcElems; ++I) {
+ Value *ShouldExtract =
+ Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
+ ExtIdx->getName() + ".is." + Twine(I));
+ Value *Elt = Op0[I];
+ Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
+ EEI.getName() + ".upto" + Twine(I));
+ }
+ gather(&EEI, {Res});
+ return true;
+}
+
+bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+ VectorType *VT = dyn_cast<VectorType>(SVI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
+ Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I) {
+ int Selector = SVI.getMaskValue(I);
+ if (Selector < 0)
+ Res[I] = UndefValue::get(VT->getElementType());
+ else if (unsigned(Selector) < Op0.size())
+ Res[I] = Op0[Selector];
+ else
+ Res[I] = Op1[Selector - Op0.size()];
+ }
+ gather(&SVI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
+ VectorType *VT = dyn_cast<VectorType>(PHI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&PHI);
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ unsigned NumOps = PHI.getNumOperands();
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps,
+ PHI.getName() + ".i" + Twine(I));
+
+ for (unsigned I = 0; I < NumOps; ++I) {
+ Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I));
+ BasicBlock *IncomingBlock = PHI.getIncomingBlock(I);
+ for (unsigned J = 0; J < NumElems; ++J)
+ cast<PHINode>(Res[J])->addIncoming(Op[J], IncomingBlock);
+ }
+ gather(&PHI, Res);
+ return true;
+}
+
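visitPHINode creates the scalar PHIs first and wires up incoming values afterwards, so scattering an incoming value may insert extracts into predecessor or entry blocks; a hypothetical example, with only the resulting PHIs shown:

  ; Hypothetical input:
  define i32 @merge(i1 %c, <2 x i32> %a, <2 x i32> %b) {
  entry:
    br i1 %c, label %then, label %join
  then:
    br label %join
  join:
    %v = phi <2 x i32> [ %a, %entry ], [ %b, %then ]
    %e = extractelement <2 x i32> %v, i32 0
    ret i32 %e
  }

  ; The vector PHI becomes one scalar PHI per lane:
  ;   %v.i0 = phi i32 [ %a.i0, %entry ], [ %b.i0, %then ]
  ;   %v.i1 = phi i32 [ %a.i1, %entry ], [ %b.i1, %then ]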
+bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!LI.isSimple())
+ return false;
+
+ Optional<VectorLayout> Layout = getVectorLayout(
+ LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
+ if (!Layout)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
+ IRBuilder<> Builder(&LI);
+ Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I],
+ Align(Layout->getElemAlign(I)),
+ LI.getName() + ".i" + Twine(I));
+ gather(&LI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
+ if (!ScalarizeLoadStore)
+ return false;
+ if (!SI.isSimple())
+ return false;
+
+ Value *FullValue = SI.getValueOperand();
+ Optional<VectorLayout> Layout = getVectorLayout(
+ FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
+ if (!Layout)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
+ IRBuilder<> Builder(&SI);
+ Scatterer VPtr = scatter(&SI, SI.getPointerOperand());
+ Scatterer VVal = scatter(&SI, FullValue);
+
+ ValueVector Stores;
+ Stores.resize(NumElems);
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Val = VVal[I];
+ Value *Ptr = VPtr[I];
+ Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I));
+ }
+ transferMetadataAndIRFlags(&SI, Stores);
+ return true;
+}
+
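Load/store scalarization is opt-in via the ScalarizeLoadStore option and also needs getVectorLayout to succeed (full-byte elements); under those assumptions a 16-byte-aligned <4 x float> access splits roughly as sketched, with per-element alignments from VectorLayout::getElemAlign and whitelisted metadata copied by transferMetadataAndIRFlags:

  ; Hypothetical input (assuming ScalarizeLoadStore is enabled):
  define void @copy(<4 x float>* %src, <4 x float>* %dst) {
    %v = load <4 x float>, <4 x float>* %src, align 16
    store <4 x float> %v, <4 x float>* %dst, align 16
    ret void
  }

  ; Both accesses become four float-sized accesses at byte offsets 0, 4, 8 and
  ; 12, with alignments 16, 4, 8 and 4; the scattered pointer operand supplies
  ; the per-element addresses.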
+bool ScalarizerVisitor::visitCallInst(CallInst &CI) {
+ return splitCall(CI);
+}
+
+// Delete the instructions that we scalarized. If a full vector result
+// is still needed, recreate it using InsertElements.
+bool ScalarizerVisitor::finish() {
+ // The presence of data in Gathered or Scattered indicates changes
+ // made to the Function.
+ if (Gathered.empty() && Scattered.empty())
+ return false;
+ for (const auto &GMI : Gathered) {
+ Instruction *Op = GMI.first;
+ ValueVector &CV = *GMI.second;
+ if (!Op->use_empty()) {
+ // The value is still needed, so recreate it using a series of
+ // InsertElements.
Value *Res = PoisonValue::get(Op->getType());
- if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
- BasicBlock *BB = Op->getParent();
- unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
- IRBuilder<> Builder(Op);
- if (isa<PHINode>(Op))
- Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
- for (unsigned I = 0; I < Count; ++I)
- Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
- Op->getName() + ".upto" + Twine(I));
+ if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
+ BasicBlock *BB = Op->getParent();
+ unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
+ IRBuilder<> Builder(Op);
+ if (isa<PHINode>(Op))
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ for (unsigned I = 0; I < Count; ++I)
+ Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+ Op->getName() + ".upto" + Twine(I));
Res->takeName(Op);
- } else {
- assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
- Res = CV[0];
- if (Op == Res)
- continue;
- }
- Op->replaceAllUsesWith(Res);
- }
- PotentiallyDeadInstrs.emplace_back(Op);
- }
- Gathered.clear();
- Scattered.clear();
-
- RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
-
- return true;
-}
-
-PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- Module &M = *F.getParent();
- unsigned ParallelLoopAccessMDKind =
- M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
- bool Changed = Impl.visit(F);
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- return Changed ? PA : PreservedAnalyses::all();
-}
+ } else {
+ assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
+ Res = CV[0];
+ if (Op == Res)
+ continue;
+ }
+ Op->replaceAllUsesWith(Res);
+ }
+ PotentiallyDeadInstrs.emplace_back(Op);
+ }
+ Gathered.clear();
+ Scattered.clear();
+
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
+
+ return true;
+}
+
+PreservedAnalyses ScalarizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ Module &M = *F.getParent();
+ unsigned ParallelLoopAccessMDKind =
+ M.getContext().getMDKindID("llvm.mem.parallel_loop_access");
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarizerVisitor Impl(ParallelLoopAccessMDKind, DT);
+ bool Changed = Impl.visit(F);
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return Changed ? PA : PreservedAnalyses::all();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 64bdd151fb..f216956406 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -1,371 +1,371 @@
-//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Loop unrolling may create many similar GEPs for array accesses.
-// e.g., a 2-level loop
-//
-// float a[32][32]; // global variable
-//
-// for (int i = 0; i < 2; ++i) {
-// for (int j = 0; j < 2; ++j) {
-// ...
-// ... = a[x + i][y + j];
-// ...
-// }
-// }
-//
-// will probably be unrolled to:
-//
-// gep %a, 0, %x, %y; load
-// gep %a, 0, %x, %y + 1; load
-// gep %a, 0, %x + 1, %y; load
-// gep %a, 0, %x + 1, %y + 1; load
-//
-// LLVM's GVN does not use partial redundancy elimination yet, and is thus
-// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
-// significant slowdown in targets with limited addressing modes. For instance,
-// because the PTX target does not support the reg+reg addressing mode, the
-// NVPTX backend emits PTX code that literally computes the pointer address of
-// each GEP, wasting tons of registers. It emits the following PTX for the
-// first load and similar PTX for other loads.
-//
-// mov.u32 %r1, %x;
-// mov.u32 %r2, %y;
-// mul.wide.u32 %rl2, %r1, 128;
-// mov.u64 %rl3, a;
-// add.s64 %rl4, %rl3, %rl2;
-// mul.wide.u32 %rl5, %r2, 4;
-// add.s64 %rl6, %rl4, %rl5;
-// ld.global.f32 %f1, [%rl6];
-//
-// To reduce the register pressure, the optimization implemented in this file
-// merges the common part of a group of GEPs, so we can compute each pointer
-// address by adding a simple offset to the common part, saving many registers.
-//
-// It works by splitting each GEP into a variadic base and a constant offset.
-// The variadic base can be computed once and reused by multiple GEPs, and the
-// constant offsets can be nicely folded into the reg+immediate addressing mode
-// (supported by most targets) without using any extra register.
-//
-// For instance, we transform the four GEPs and four loads in the above example
-// into:
-//
-// base = gep a, 0, x, y
-// load base
-// load base + 1 * sizeof(float)
-// load base + 32 * sizeof(float)
-// load base + 33 * sizeof(float)
-//
-// Given the transformed IR, a backend that supports the reg+immediate
-// addressing mode can easily fold the pointer arithmetics into the loads. For
-// example, the NVPTX backend can easily fold the pointer arithmetics into the
-// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
-//
-// mov.u32 %r1, %tid.x;
-// mov.u32 %r2, %tid.y;
-// mul.wide.u32 %rl2, %r1, 128;
-// mov.u64 %rl3, a;
-// add.s64 %rl4, %rl3, %rl2;
-// mul.wide.u32 %rl5, %r2, 4;
-// add.s64 %rl6, %rl4, %rl5;
-// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
-// ld.global.f32 %f2, [%rl6+4]; // much better
-// ld.global.f32 %f3, [%rl6+128]; // much better
-// ld.global.f32 %f4, [%rl6+132]; // much better
-//
-// Another improvement enabled by the LowerGEP flag is to lower a GEP with
-// multiple indices to either multiple GEPs with a single index or arithmetic
-// operations (depending on whether the target uses alias analysis in codegen).
-// Such a transformation can have the following benefits:
-// (1) It can always extract constants in the indices of structure type.
-// (2) After such Lowering, there are more optimization opportunities such as
-// CSE, LICM and CGP.
-//
-// E.g. The following GEPs have multiple indices:
-// BB1:
-// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
-// load %p
-// ...
-// BB2:
-// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
-// load %p2
-// ...
-//
-// We cannot CSE the common part related to index "i64 %i" in this form;
-// lowering the GEPs makes that possible.
-// If the target does not use alias analysis in codegen, this pass will
-// lower a GEP with multiple indices into arithmetic operations:
-// BB1:
-// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = add i64 %1, %2 ; CSE opportunity
-// %4 = mul i64 %j1, length_of_struct
-// %5 = add i64 %3, %4
-// %6 = add i64 %3, struct_field_3 ; Constant offset
-// %p = inttoptr i64 %6 to i32*
-// load %p
-// ...
-// BB2:
-// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = add i64 %7, %8 ; CSE opportunity
-// %10 = mul i64 %j2, length_of_struct
-// %11 = add i64 %9, %10
-// %12 = add i64 %11, struct_field_2 ; Constant offset
-// %p = inttoptr i64 %12 to i32*
-// load %p2
-// ...
-//
-// If the target uses alias analysis in codegen, this pass will lower a GEP
-// with multiple indices into multiple GEPs with a single index:
-// BB1:
-// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
-// %4 = mul i64 %j1, length_of_struct
-// %5 = getelementptr i8* %3, i64 %4
-// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
-// %p = bitcast i8* %6 to i32*
-// load %p
-// ...
-// BB2:
-// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
-// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
-// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
-// %10 = mul i64 %j2, length_of_struct
-// %11 = getelementptr i8* %9, i64 %10
-// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
-// %p2 = bitcast i8* %12 to i32*
-// load %p2
-// ...
-//
-// Lowering GEPs can also benefit other passes such as LICM and CGP.
-// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
-// indices if one of the indices is variant. If we lower such GEPs into invariant
-// parts and variant parts, LICM can hoist/sink those invariant parts.
-// CGP (CodeGen Prepare) tries to sink address calculations that match the
-// target's addressing modes. A GEP with multiple indices may not match and will
-// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
-// them. So we end up with a better addressing mode.
-//
-//===----------------------------------------------------------------------===//
-
+//===- SeparateConstOffsetFromGEP.cpp -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Loop unrolling may create many similar GEPs for array accesses.
+// e.g., a 2-level loop
+//
+// float a[32][32]; // global variable
+//
+// for (int i = 0; i < 2; ++i) {
+// for (int j = 0; j < 2; ++j) {
+// ...
+// ... = a[x + i][y + j];
+// ...
+// }
+// }
+//
+// will probably be unrolled to:
+//
+// gep %a, 0, %x, %y; load
+// gep %a, 0, %x, %y + 1; load
+// gep %a, 0, %x + 1, %y; load
+// gep %a, 0, %x + 1, %y + 1; load
+//
+// LLVM's GVN does not use partial redundancy elimination yet, and is thus
+// unable to reuse (gep %a, 0, %x, %y). As a result, this misoptimization incurs
+// significant slowdown in targets with limited addressing modes. For instance,
+// because the PTX target does not support the reg+reg addressing mode, the
+// NVPTX backend emits PTX code that literally computes the pointer address of
+// each GEP, wasting tons of registers. It emits the following PTX for the
+// first load and similar PTX for other loads.
+//
+// mov.u32 %r1, %x;
+// mov.u32 %r2, %y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6];
+//
+// To reduce the register pressure, the optimization implemented in this file
+// merges the common part of a group of GEPs, so we can compute each pointer
+// address by adding a simple offset to the common part, saving many registers.
+//
+// It works by splitting each GEP into a variadic base and a constant offset.
+// The variadic base can be computed once and reused by multiple GEPs, and the
+// constant offsets can be nicely folded into the reg+immediate addressing mode
+// (supported by most targets) without using any extra register.
+//
+// For instance, we transform the four GEPs and four loads in the above example
+// into:
+//
+// base = gep a, 0, x, y
+// load base
+// load base + 1 * sizeof(float)
+// load base + 32 * sizeof(float)
+// load base + 33 * sizeof(float)
+//
+// Given the transformed IR, a backend that supports the reg+immediate
+// addressing mode can easily fold the pointer arithmetics into the loads. For
+// example, the NVPTX backend can easily fold the pointer arithmetics into the
+// ld.global.f32 instructions, and the resultant PTX uses much fewer registers.
+//
+// mov.u32 %r1, %tid.x;
+// mov.u32 %r2, %tid.y;
+// mul.wide.u32 %rl2, %r1, 128;
+// mov.u64 %rl3, a;
+// add.s64 %rl4, %rl3, %rl2;
+// mul.wide.u32 %rl5, %r2, 4;
+// add.s64 %rl6, %rl4, %rl5;
+// ld.global.f32 %f1, [%rl6]; // so far the same as unoptimized PTX
+// ld.global.f32 %f2, [%rl6+4]; // much better
+// ld.global.f32 %f3, [%rl6+128]; // much better
+// ld.global.f32 %f4, [%rl6+132]; // much better
+//
+// Another improvement enabled by the LowerGEP flag is to lower a GEP with
+// multiple indices to either multiple GEPs with a single index or arithmetic
+// operations (depending on whether the target uses alias analysis in codegen).
+// Such a transformation can have the following benefits:
+// (1) It can always extract constants in the indices of structure type.
+// (2) After such Lowering, there are more optimization opportunities such as
+// CSE, LICM and CGP.
+//
+// E.g. The following GEPs have multiple indices:
+// BB1:
+// %p = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 3
+// load %p
+// ...
+// BB2:
+// %p2 = getelementptr [10 x %struct]* %ptr, i64 %i, i64 %j1, i32 2
+// load %p2
+// ...
+//
+// We cannot CSE the common part related to index "i64 %i" in this form;
+// lowering the GEPs makes that possible.
+// If the target does not use alias analysis in codegen, this pass will
+// lower a GEP with multiple indices into arithmetic operations:
+// BB1:
+// %1 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = add i64 %1, %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = add i64 %3, %4
+// %6 = add i64 %3, struct_field_3 ; Constant offset
+// %p = inttoptr i64 %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = ptrtoint [10 x %struct]* %ptr to i64 ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = add i64 %7, %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = add i64 %9, %10
+// %12 = add i64 %11, struct_field_2 ; Constant offset
+// %p = inttoptr i64 %12 to i32*
+// load %p2
+// ...
+//
+// If the target uses alias analysis in codegen, this pass will lower a GEP
+// with multiple indices into multiple GEPs with a single index:
+// BB1:
+// %1 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %2 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %3 = getelementptr i8* %1, i64 %2 ; CSE opportunity
+// %4 = mul i64 %j1, length_of_struct
+// %5 = getelementptr i8* %3, i64 %4
+// %6 = getelementptr i8* %5, struct_field_3 ; Constant offset
+// %p = bitcast i8* %6 to i32*
+// load %p
+// ...
+// BB2:
+// %7 = bitcast [10 x %struct]* %ptr to i8* ; CSE opportunity
+// %8 = mul i64 %i, length_of_10xstruct ; CSE opportunity
+// %9 = getelementptr i8* %7, i64 %8 ; CSE opportunity
+// %10 = mul i64 %j2, length_of_struct
+// %11 = getelementptr i8* %9, i64 %10
+// %12 = getelementptr i8* %11, struct_field_2 ; Constant offset
+// %p2 = bitcast i8* %12 to i32*
+// load %p2
+// ...
+//
+// Lowering GEPs can also benefit other passes such as LICM and CGP.
+// LICM (Loop Invariant Code Motion) cannot hoist/sink a GEP with multiple
+// indices if one of the indices is variant. If we lower such GEPs into invariant
+// parts and variant parts, LICM can hoist/sink those invariant parts.
+// CGP (CodeGen Prepare) tries to sink address calculations that match the
+// target's addressing modes. A GEP with multiple indices may not match and will
+// not be sunk. If we lower such GEP into smaller parts, CGP may sink some of
+// them. So we end up with a better addressing mode.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/SeparateConstOffsetFromGEP.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <string>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
- "disable-separate-const-offset-from-gep", cl::init(false),
- cl::desc("Do not separate the constant offset from a GEP instruction"),
- cl::Hidden);
-
-// Setting this flag may emit false positives when the input module already
-// contains dead instructions. Therefore, we set it only in unit tests that are
-// free of dead code.
-static cl::opt<bool>
- VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
- cl::desc("Verify this pass produces no dead code"),
- cl::Hidden);
-
-namespace {
-
-/// A helper class for separating a constant offset from a GEP index.
-///
-/// In real programs, a GEP index may be more complicated than a simple addition
-/// of something and a constant integer which can be trivially split. For
-/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
-/// constant offset, so that we can separate the index to (a << 3) + b and 5.
-///
-/// Therefore, this class looks into the expression that computes a given GEP
-/// index, and tries to find a constant integer that can be hoisted to the
-/// outermost level of the expression as an addition. Not every constant in an
-/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
-/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
-/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
-class ConstantOffsetExtractor {
-public:
- /// Extracts a constant offset from the given GEP index. It returns the
- /// new index representing the remainder (equal to the original index minus
- /// the constant offset), or nullptr if we cannot extract a constant offset.
- /// \p Idx The given GEP index
- /// \p GEP The given GEP
- /// \p UserChainTail Outputs the tail of UserChain so that we can
- /// garbage-collect unused instructions in UserChain.
- static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail, const DominatorTree *DT);
-
- /// Looks for a constant offset from the given GEP index without extracting
- /// it. It returns the numeric value of the extracted constant offset (0 if
- /// failed). The meaning of the arguments is the same as for Extract.
- static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT);
-
-private:
- ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
- : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
- }
-
- /// Searches the expression that computes V for a non-zero constant C s.t.
- /// V can be reassociated into the form V' + C. If the search is
- /// successful, returns C and updates UserChain as a def-use chain from C to V;
- /// otherwise, UserChain is empty.
- ///
- /// \p V The given expression
- /// \p SignExtended Whether V will be sign-extended in the computation of the
- /// GEP index
- /// \p ZeroExtended Whether V will be zero-extended in the computation of the
- /// GEP index
- /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
- /// an index of an inbounds GEP is guaranteed to be
- /// non-negative. Leveraging this, we can better split
- /// inbounds GEPs.
- APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
-
- /// A helper function to look into both operands of a binary operator.
- APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
- bool ZeroExtended);
-
- /// After finding the constant offset C from the GEP index I, we build a new
- /// index I' s.t. I' + C = I. This function builds and returns the new
- /// index I' according to UserChain produced by function "find".
- ///
- /// The building conceptually takes two steps:
- /// 1) iteratively distribute s/zext towards the leaves of the expression tree
- /// that computes I
- /// 2) reassociate the expression tree to the form I' + C.
- ///
- /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
- /// sext to a, b and 5 so that we have
- /// sext(a) + (sext(b) + 5).
- /// Then, we reassociate it to
- /// (sext(a) + sext(b)) + 5.
- /// Given this form, we know I' is sext(a) + sext(b).
- Value *rebuildWithoutConstOffset();
-
- /// After the first step of rebuilding the GEP index without the constant
- /// offset, distribute s/zext to the operands of all operators in UserChain.
- /// e.g., zext(sext(a + (b + 5)) (assuming no overflow) =>
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
- ///
- /// The function also updates UserChain to point to new subexpressions after
- /// distributing s/zext. e.g., the old UserChain of the above example is
- /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
- /// and the new UserChain is
- /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
- /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))
- ///
- /// \p ChainIndex The index to UserChain. ChainIndex is initially
- /// UserChain.size() - 1, and is decremented during
- /// the recursion.
- Value *distributeExtsAndCloneChain(unsigned ChainIndex);
-
- /// Reassociates the GEP index to the form I' + C and returns I'.
- Value *removeConstOffset(unsigned ChainIndex);
-
- /// A helper function to apply ExtInsts, a list of s/zext, to value V.
- /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
- /// returns "sext i32 (zext i16 V to i32) to i64".
- Value *applyExts(Value *V);
-
- /// A helper function that returns whether we can trace into the operands
- /// of binary operator BO for a constant offset.
- ///
- /// \p SignExtended Whether BO is surrounded by sext
- /// \p ZeroExtended Whether BO is surrounded by zext
- /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
- /// array index.
- bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
- bool NonNegative);
-
- /// The path from the constant offset to the old GEP index. e.g., if the GEP
- /// index is "a * b + (c + 5)". After running function find, UserChain[0] will
- /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
- /// UserChain[2] will be the entire expression "a * b + (c + 5)".
- ///
- /// This path helps to rebuild the new GEP index.
- SmallVector<User *, 8> UserChain;
-
- /// A data structure used in rebuildWithoutConstOffset. Contains all
- /// sext/zext instructions along UserChain.
- SmallVector<CastInst *, 16> ExtInsts;
-
- /// Insertion position of cloned instructions.
- Instruction *IP;
-
- const DataLayout &DL;
- const DominatorTree *DT;
-};
-
-/// A pass that tries to split every GEP in the function into a variadic
-/// base and a constant offset. It is a FunctionPass because searching for the
-/// constant offset may inspect other basic blocks.
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> DisableSeparateConstOffsetFromGEP(
+ "disable-separate-const-offset-from-gep", cl::init(false),
+ cl::desc("Do not separate the constant offset from a GEP instruction"),
+ cl::Hidden);
+
+// Setting this flag may emit false positives when the input module already
+// contains dead instructions. Therefore, we set it only in unit tests that are
+// free of dead code.
+static cl::opt<bool>
+ VerifyNoDeadCode("reassociate-geps-verify-no-dead-code", cl::init(false),
+ cl::desc("Verify this pass produces no dead code"),
+ cl::Hidden);
+
+namespace {
+
+/// A helper class for separating a constant offset from a GEP index.
+///
+/// In real programs, a GEP index may be more complicated than a simple addition
+/// of something and a constant integer which can be trivially split. For
+/// example, to split ((a << 3) | 5) + b, we need to search deeper for the
+/// constant offset, so that we can separate the index to (a << 3) + b and 5.
+///
+/// Therefore, this class looks into the expression that computes a given GEP
+/// index, and tries to find a constant integer that can be hoisted to the
+/// outermost level of the expression as an addition. Not every constant in an
+/// expression can jump out. e.g., we cannot transform (b * (a + 5)) to (b * a +
+/// 5); nor can we transform (3 * (a + 5)) to (3 * a + 5), however in this case,
+/// -instcombine probably already optimized (3 * (a + 5)) to (3 * a + 15).
+class ConstantOffsetExtractor {
+public:
+ /// Extracts a constant offset from the given GEP index. It returns the
+ /// new index representing the remainder (equal to the original index minus
+ /// the constant offset), or nullptr if we cannot extract a constant offset.
+ /// \p Idx The given GEP index
+ /// \p GEP The given GEP
+ /// \p UserChainTail Outputs the tail of UserChain so that we can
+ /// garbage-collect unused instructions in UserChain.
+ static Value *Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail, const DominatorTree *DT);
+
+ /// Looks for a constant offset from the given GEP index without extracting
+ /// it. It returns the numeric value of the extracted constant offset (0 if
+ /// failed). The meaning of the arguments is the same as for Extract.
+ static int64_t Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT);
+
+private:
+ ConstantOffsetExtractor(Instruction *InsertionPt, const DominatorTree *DT)
+ : IP(InsertionPt), DL(InsertionPt->getModule()->getDataLayout()), DT(DT) {
+ }
+
+ /// Searches the expression that computes V for a non-zero constant C s.t.
+ /// V can be reassociated into the form V' + C. If the search is
+ /// successful, returns C and updates UserChain as a def-use chain from C to V;
+ /// otherwise, UserChain is empty.
+ ///
+ /// \p V The given expression
+ /// \p SignExtended Whether V will be sign-extended in the computation of the
+ /// GEP index
+ /// \p ZeroExtended Whether V will be zero-extended in the computation of the
+ /// GEP index
+ /// \p NonNegative Whether V is guaranteed to be non-negative. For example,
+ /// an index of an inbounds GEP is guaranteed to be
+ /// non-negative. Leveraging this, we can better split
+ /// inbounds GEPs.
+ APInt find(Value *V, bool SignExtended, bool ZeroExtended, bool NonNegative);
+
+ /// A helper function to look into both operands of a binary operator.
+ APInt findInEitherOperand(BinaryOperator *BO, bool SignExtended,
+ bool ZeroExtended);
+
+ /// After finding the constant offset C from the GEP index I, we build a new
+ /// index I' s.t. I' + C = I. This function builds and returns the new
+ /// index I' according to UserChain produced by function "find".
+ ///
+ /// The building conceptually takes two steps:
+ /// 1) iteratively distribute s/zext towards the leaves of the expression tree
+ /// that computes I
+ /// 2) reassociate the expression tree to the form I' + C.
+ ///
+ /// For example, to extract the 5 from sext(a + (b + 5)), we first distribute
+ /// sext to a, b and 5 so that we have
+ /// sext(a) + (sext(b) + 5).
+ /// Then, we reassociate it to
+ /// (sext(a) + sext(b)) + 5.
+ /// Given this form, we know I' is sext(a) + sext(b).
+ Value *rebuildWithoutConstOffset();
+
+ /// After the first step of rebuilding the GEP index without the constant
+ /// offset, distribute s/zext to the operands of all operators in UserChain.
+ /// e.g., zext(sext(a + (b + 5))) (assuming no overflow) =>
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5))).
+ ///
+ /// The function also updates UserChain to point to new subexpressions after
+ /// distributing s/zext. e.g., the old UserChain of the above example is
+ /// 5 -> b + 5 -> a + (b + 5) -> sext(...) -> zext(sext(...)),
+ /// and the new UserChain is
+ /// zext(sext(5)) -> zext(sext(b)) + zext(sext(5)) ->
+ /// zext(sext(a)) + (zext(sext(b)) + zext(sext(5)))
+ ///
+ /// \p ChainIndex The index to UserChain. ChainIndex is initially
+ /// UserChain.size() - 1, and is decremented during
+ /// the recursion.
+ Value *distributeExtsAndCloneChain(unsigned ChainIndex);
+
+ /// Reassociates the GEP index to the form I' + C and returns I'.
+ Value *removeConstOffset(unsigned ChainIndex);
+
+ /// A helper function to apply ExtInsts, a list of s/zext, to value V.
+ /// e.g., if ExtInsts = [sext i32 to i64, zext i16 to i32], this function
+ /// returns "sext i32 (zext i16 V to i32) to i64".
+ Value *applyExts(Value *V);
+
+ /// A helper function that returns whether we can trace into the operands
+ /// of binary operator BO for a constant offset.
+ ///
+ /// \p SignExtended Whether BO is surrounded by sext
+ /// \p ZeroExtended Whether BO is surrounded by zext
+ /// \p NonNegative Whether BO is known to be non-negative, e.g., an in-bound
+ /// array index.
+ bool CanTraceInto(bool SignExtended, bool ZeroExtended, BinaryOperator *BO,
+ bool NonNegative);
+
+ /// The path from the constant offset to the old GEP index. e.g., if the GEP
+ /// index is "a * b + (c + 5)", then after running find, UserChain[0] will
+ /// be the constant 5, UserChain[1] will be the subexpression "c + 5", and
+ /// UserChain[2] will be the entire expression "a * b + (c + 5)".
+ ///
+ /// This path helps to rebuild the new GEP index.
+ SmallVector<User *, 8> UserChain;
+
+ /// A data structure used in rebuildWithoutConstOffset. Contains all
+ /// sext/zext instructions along UserChain.
+ SmallVector<CastInst *, 16> ExtInsts;
+
+ /// Insertion position of cloned instructions.
+ Instruction *IP;
+
+ const DataLayout &DL;
+ const DominatorTree *DT;
+};
+
+/// A pass that tries to split every GEP in the function into a variadic
+/// base and a constant offset. It is a FunctionPass because searching for the
+/// constant offset may inspect other basic blocks.
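+///
+/// Illustrative sketch (added commentary, not from the original author):
+///   %b    = add i64 %a, 5
+///   %addr = getelementptr inbounds float, float* %p, i64 %b
+/// is split (see splitGEP below) into
+///   %addr2 = getelementptr float, float* %p, i64 %a
+///   %addr  = getelementptr inbounds float, float* %addr2, i64 5
+/// so that GEPs that differ only in the constant can share %addr2 via CSE.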
class SeparateConstOffsetFromGEPLegacyPass : public FunctionPass {
-public:
- static char ID;
-
+public:
+ static char ID;
+
SeparateConstOffsetFromGEPLegacyPass(bool LowerGEP = false)
- : FunctionPass(ID), LowerGEP(LowerGEP) {
+ : FunctionPass(ID), LowerGEP(LowerGEP) {
initializeSeparateConstOffsetFromGEPLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-
-private:
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+
+private:
bool LowerGEP;
};
@@ -383,769 +383,769 @@ public:
bool run(Function &F);
private:
- /// Tries to split the given GEP into a variadic base and a constant offset,
- /// and returns true if the splitting succeeds.
- bool splitGEP(GetElementPtrInst *GEP);
-
- /// Lower a GEP with multiple indices into multiple GEPs with a single index.
- /// Function splitGEP already split the original GEP into a variadic part and
- /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
- /// variadic part into a set of GEPs with a single index and applies
- /// AccumulativeByteOffset to it.
- /// \p Variadic The variadic part of the original GEP.
- /// \p AccumulativeByteOffset The constant offset.
- void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset);
-
- /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
- /// Function splitGEP already split the original GEP into a variadic part and
- /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
- /// variadic part into a set of arithmetic operations and applies
- /// AccumulativeByteOffset to it.
- /// \p Variadic The variadic part of the original GEP.
- /// \p AccumulativeByteOffset The constant offset.
- void lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset);
-
- /// Finds the constant offset within each index and accumulates them. If
- /// LowerGEP is true, it finds in indices of both sequential and structure
- /// types, otherwise it only finds in sequential indices. The output
- /// NeedsExtraction indicates whether we successfully find a non-zero constant
- /// offset.
- int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
-
- /// Canonicalize array indices to pointer-size integers. This helps to
- /// simplify the logic of splitting a GEP. For example, if a + b is a
- /// pointer-size integer, we have
- /// gep base, a + b = gep (gep base, a), b
- /// However, this equality may not hold if the size of a + b is smaller than
- /// the pointer size, because LLVM conceptually sign-extends GEP indices to
- /// pointer size before computing the address
- /// (http://llvm.org/docs/LangRef.html#id181).
- ///
- /// This canonicalization is very likely already done in clang and
- /// instcombine. Therefore, the program will probably remain the same.
- ///
- /// Returns true if the module changes.
- ///
- /// Verified in @i32_add in split-gep.ll
- bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
-
- /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
- /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
- /// the constant offset. After extraction, it becomes desirable to reunion the
- /// distributed sexts. For example,
- ///
- /// &a[sext(i +nsw (j +nsw 5)]
- /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
- /// => constant extraction &a[sext(i) + sext(j)] + 5
- /// => reunion &a[sext(i +nsw j)] + 5
- bool reuniteExts(Function &F);
-
- /// A helper that reunites sexts in an instruction.
- bool reuniteExts(Instruction *I);
-
- /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
- Instruction *findClosestMatchingDominator(
- const SCEV *Key, Instruction *Dominatee,
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
-
- /// Verify F is free of dead code.
- void verifyNoDeadCode(Function &F);
-
- bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
-
- // Swap the index operand of two GEP.
- void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
-
- // Check if it is safe to swap operand of two GEP.
- bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
- Loop *CurLoop);
-
- const DataLayout *DL = nullptr;
- DominatorTree *DT = nullptr;
- ScalarEvolution *SE;
- LoopInfo *LI;
- TargetLibraryInfo *TLI;
+ /// Tries to split the given GEP into a variadic base and a constant offset,
+ /// and returns true if the splitting succeeds.
+ bool splitGEP(GetElementPtrInst *GEP);
+
+ /// Lower a GEP with multiple indices into multiple GEPs with a single index.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of GEPs with a single index and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToSingleIndexGEPs(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Lower a GEP with multiple indices into ptrtoint+arithmetics+inttoptr form.
+ /// Function splitGEP already split the original GEP into a variadic part and
+ /// a constant offset (i.e., AccumulativeByteOffset). This function lowers the
+ /// variadic part into a set of arithmetic operations and applies
+ /// AccumulativeByteOffset to it.
+ /// \p Variadic The variadic part of the original GEP.
+ /// \p AccumulativeByteOffset The constant offset.
+ void lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset);
+
+ /// Finds the constant offset within each index and accumulates them. If
+ /// LowerGEP is true, it searches indices of both sequential and structure
+ /// types; otherwise it only searches sequential indices. The output
+ /// NeedsExtraction indicates whether we successfully found a non-zero constant
+ /// offset.
+ int64_t accumulateByteOffset(GetElementPtrInst *GEP, bool &NeedsExtraction);
+
+ /// Canonicalize array indices to pointer-size integers. This helps to
+ /// simplify the logic of splitting a GEP. For example, if a + b is a
+ /// pointer-size integer, we have
+ /// gep base, a + b = gep (gep base, a), b
+ /// However, this equality may not hold if the size of a + b is smaller than
+ /// the pointer size, because LLVM conceptually sign-extends GEP indices to
+ /// pointer size before computing the address
+ /// (http://llvm.org/docs/LangRef.html#id181).
+ ///
+ /// This canonicalization is very likely already done in clang and
+ /// instcombine. Therefore, the program will probably remain the same.
+ ///
+ /// Returns true if the module changes.
+ ///
+ /// Verified in @i32_add in split-gep.ll
+ bool canonicalizeArrayIndicesToPointerSize(GetElementPtrInst *GEP);
+
+ /// Optimize sext(a)+sext(b) to sext(a+b) when a+b can't sign overflow.
+ /// SeparateConstOffsetFromGEP distributes a sext to leaves before extracting
+ /// the constant offset. After extraction, it becomes desirable to reunite the
+ /// distributed sexts. For example,
+ ///
+ /// &a[sext(i +nsw (j +nsw 5))]
+ /// => distribute &a[sext(i) +nsw (sext(j) +nsw 5)]
+ /// => constant extraction &a[sext(i) + sext(j)] + 5
+ /// => reunion &a[sext(i +nsw j)] + 5
+ bool reuniteExts(Function &F);
+
+ /// A helper that reunites sexts in an instruction.
+ bool reuniteExts(Instruction *I);
+
+ /// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
+ Instruction *findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
+
+ /// Verify F is free of dead code.
+ void verifyNoDeadCode(Function &F);
+
+ bool hasMoreThanOneUseInLoop(Value *v, Loop *L);
+
+ // Swap the index operands of two GEPs.
+ void swapGEPOperand(GetElementPtrInst *First, GetElementPtrInst *Second);
+
+ // Check if it is safe to swap the operands of two GEPs.
+ bool isLegalToSwapOperand(GetElementPtrInst *First, GetElementPtrInst *Second,
+ Loop *CurLoop);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE;
+ LoopInfo *LI;
+ TargetLibraryInfo *TLI;
// Retrieved lazily since not always used.
function_ref<TargetTransformInfo &(Function &)> GetTTI;
-
- /// Whether to lower a GEP with multiple indices into arithmetic operations or
- /// multiple GEPs with a single index.
- bool LowerGEP;
-
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
-};
-
-} // end anonymous namespace
-
+
+ /// Whether to lower a GEP with multiple indices into arithmetic operations or
+ /// multiple GEPs with a single index.
+ bool LowerGEP;
+
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
+};
+
+} // end anonymous namespace
+
char SeparateConstOffsetFromGEPLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(
+
+INITIALIZE_PASS_BEGIN(
SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
- "Split GEPs to a variadic base and a constant offset for better CSE", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(
SeparateConstOffsetFromGEPLegacyPass, "separate-const-offset-from-gep",
- "Split GEPs to a variadic base and a constant offset for better CSE", false,
- false)
-
-FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
+ "Split GEPs to a variadic base and a constant offset for better CSE", false,
+ false)
+
+FunctionPass *llvm::createSeparateConstOffsetFromGEPPass(bool LowerGEP) {
return new SeparateConstOffsetFromGEPLegacyPass(LowerGEP);
-}
-
-bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
- bool ZeroExtended,
- BinaryOperator *BO,
- bool NonNegative) {
- // We only consider ADD, SUB and OR, because a non-zero constant found in
- // expressions composed of these operations can be easily hoisted as a
- // constant offset by reassociation.
- if (BO->getOpcode() != Instruction::Add &&
- BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Or) {
- return false;
- }
-
- Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
- // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
- // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
- // FIXME: this does not appear to be covered by any tests
- // (with x86/aarch64 backends at least)
- if (BO->getOpcode() == Instruction::Or &&
- !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
- return false;
-
- // In addition, tracing into BO requires that its surrounding s/zext (if
- // any) is distributable to both operands.
- //
- // Suppose BO = A op B.
- // SignExtended | ZeroExtended | Distributable?
- // --------------+--------------+----------------------------------
- // 0 | 0 | true because no s/zext exists
- // 0 | 1 | zext(BO) == zext(A) op zext(B)
- // 1 | 0 | sext(BO) == sext(A) op sext(B)
- // 1 | 1 | zext(sext(BO)) ==
- // | | zext(sext(A)) op zext(sext(B))
- if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
- // If a + b >= 0 and (a >= 0 or b >= 0), then
- // sext(a + b) = sext(a) + sext(b)
- // even if the addition is not marked nsw.
- //
- // Leveraging this invariant, we can trace into an sext'ed inbound GEP
- // index if the constant offset is non-negative.
- //
- // Verified in @sext_add in split-gep.ll.
- if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
- if (!ConstLHS->isNegative())
- return true;
- }
- if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
- if (!ConstRHS->isNegative())
- return true;
- }
- }
-
- // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
- // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
- if (BO->getOpcode() == Instruction::Add ||
- BO->getOpcode() == Instruction::Sub) {
- if (SignExtended && !BO->hasNoSignedWrap())
- return false;
- if (ZeroExtended && !BO->hasNoUnsignedWrap())
- return false;
- }
-
- return true;
-}
-
-APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
- bool SignExtended,
- bool ZeroExtended) {
- // Save off the current height of the chain, in case we need to restore it.
- size_t ChainLength = UserChain.size();
-
- // BO being non-negative does not shed light on whether its operands are
- // non-negative. Clear the NonNegative flag here.
- APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
- /* NonNegative */ false);
- // If we found a constant offset in the left operand, stop and return that.
- // This shortcut might cause us to miss opportunities of combining the
- // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
- // However, such cases are probably already handled by -instcombine,
- // given this pass runs after the standard optimizations.
- if (ConstantOffset != 0) return ConstantOffset;
-
- // Reset the chain back to where it was when we started exploring this node,
- // since visiting the LHS didn't pan out.
- UserChain.resize(ChainLength);
-
- ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
- /* NonNegative */ false);
- // If U is a sub operator, negate the constant offset found in the right
- // operand.
- if (BO->getOpcode() == Instruction::Sub)
- ConstantOffset = -ConstantOffset;
-
- // If RHS wasn't a suitable candidate either, reset the chain again.
- if (ConstantOffset == 0)
- UserChain.resize(ChainLength);
-
- return ConstantOffset;
-}
-
-APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
- bool ZeroExtended, bool NonNegative) {
- // TODO(jingyue): We could trace into integer/pointer casts, such as
- // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
- // integers because it gives good enough results for our benchmarks.
- unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
-
- // We cannot do much with Values that are not a User, such as an Argument.
- User *U = dyn_cast<User>(V);
- if (U == nullptr) return APInt(BitWidth, 0);
-
- APInt ConstantOffset(BitWidth, 0);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
- // Hooray, we found it!
- ConstantOffset = CI->getValue();
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
- // Trace into subexpressions for more hoisting opportunities.
- if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
- ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
- } else if (isa<TruncInst>(V)) {
- ConstantOffset =
- find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
- .trunc(BitWidth);
- } else if (isa<SExtInst>(V)) {
- ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
- ZeroExtended, NonNegative).sext(BitWidth);
- } else if (isa<ZExtInst>(V)) {
- // As an optimization, we can clear the SignExtended flag because
- // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
- //
- // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
- ConstantOffset =
- find(U->getOperand(0), /* SignExtended */ false,
- /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
- }
-
- // If we found a non-zero constant offset, add it to the path for
- // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
- // help this optimization.
- if (ConstantOffset != 0)
- UserChain.push_back(U);
- return ConstantOffset;
-}
-
-Value *ConstantOffsetExtractor::applyExts(Value *V) {
- Value *Current = V;
- // ExtInsts is built in the use-def order. Therefore, we apply them to V
- // in the reversed order.
- for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
- if (Constant *C = dyn_cast<Constant>(Current)) {
- // If Current is a constant, apply s/zext using ConstantExpr::getCast.
- // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
- Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
- } else {
- Instruction *Ext = (*I)->clone();
- Ext->setOperand(0, Current);
- Ext->insertBefore(IP);
- Current = Ext;
- }
- }
- return Current;
-}
-
-Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
- distributeExtsAndCloneChain(UserChain.size() - 1);
- // Remove all nullptrs (used to be s/zext) from UserChain.
- unsigned NewSize = 0;
- for (User *I : UserChain) {
- if (I != nullptr) {
- UserChain[NewSize] = I;
- NewSize++;
- }
- }
- UserChain.resize(NewSize);
- return removeConstOffset(UserChain.size() - 1);
-}
-
-Value *
-ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
- User *U = UserChain[ChainIndex];
- if (ChainIndex == 0) {
- assert(isa<ConstantInt>(U));
- // If U is a ConstantInt, applyExts will return a ConstantInt as well.
- return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
- }
-
- if (CastInst *Cast = dyn_cast<CastInst>(U)) {
- assert(
- (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
- "Only following instructions can be traced: sext, zext & trunc");
- ExtInsts.push_back(Cast);
- UserChain[ChainIndex] = nullptr;
- return distributeExtsAndCloneChain(ChainIndex - 1);
- }
-
- // Function find only trace into BinaryOperator and CastInst.
- BinaryOperator *BO = cast<BinaryOperator>(U);
- // OpNo = which operand of BO is UserChain[ChainIndex - 1]
- unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
- Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
-
- BinaryOperator *NewBO = nullptr;
- if (OpNo == 0) {
- NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
- BO->getName(), IP);
- } else {
- NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
- BO->getName(), IP);
- }
- return UserChain[ChainIndex] = NewBO;
-}
-
-Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
- if (ChainIndex == 0) {
- assert(isa<ConstantInt>(UserChain[ChainIndex]));
- return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
- }
-
- BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
- assert((BO->use_empty() || BO->hasOneUse()) &&
- "distributeExtsAndCloneChain clones each BinaryOperator in "
- "UserChain, so no one should be used more than "
- "once");
-
- unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
- assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
- Value *NextInChain = removeConstOffset(ChainIndex - 1);
- Value *TheOther = BO->getOperand(1 - OpNo);
-
- // If NextInChain is 0 and not the LHS of a sub, we can simplify the
- // sub-expression to be just TheOther.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
- if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
- return TheOther;
- }
-
- BinaryOperator::BinaryOps NewOp = BO->getOpcode();
- if (BO->getOpcode() == Instruction::Or) {
- // Rebuild "or" as "add", because "or" may be invalid for the new
- // expression.
- //
- // For instance, given
- // a | (b + 5) where a and b + 5 have no common bits,
- // we can extract 5 as the constant offset.
- //
- // However, reusing the "or" in the new index would give us
- // (a | b) + 5
- // which does not equal a | (b + 5).
- //
- // Replacing the "or" with "add" is fine, because
- // a | (b + 5) = a + (b + 5) = (a + b) + 5
- NewOp = Instruction::Add;
- }
-
- BinaryOperator *NewBO;
- if (OpNo == 0) {
- NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
- } else {
- NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
- }
- NewBO->takeName(BO);
- return NewBO;
-}
-
-Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
- User *&UserChainTail,
- const DominatorTree *DT) {
- ConstantOffsetExtractor Extractor(GEP, DT);
- // Find a non-zero constant offset first.
- APInt ConstantOffset =
- Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
- GEP->isInBounds());
- if (ConstantOffset == 0) {
- UserChainTail = nullptr;
- return nullptr;
- }
- // Separates the constant offset from the GEP index.
- Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
- UserChainTail = Extractor.UserChain.back();
- return IdxWithoutConstOffset;
-}
-
-int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
- const DominatorTree *DT) {
- // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
- return ConstantOffsetExtractor(GEP, DT)
- .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
- GEP->isInBounds())
- .getSExtValue();
-}
-
-bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
- GetElementPtrInst *GEP) {
- bool Changed = false;
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
- I != E; ++I, ++GTI) {
- // Skip struct member indices which must be i32.
- if (GTI.isSequential()) {
- if ((*I)->getType() != IntPtrTy) {
- *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
- Changed = true;
- }
- }
- }
- return Changed;
-}
-
-int64_t
-SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
- bool &NeedsExtraction) {
- NeedsExtraction = false;
- int64_t AccumulativeByteOffset = 0;
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- // Tries to extract a constant offset from this GEP index.
- int64_t ConstantOffset =
- ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
- if (ConstantOffset != 0) {
- NeedsExtraction = true;
- // A GEP may have multiple indices. We accumulate the extracted
- // constant offset to a byte offset, and later offset the remainder of
- // the original GEP with this byte offset.
- AccumulativeByteOffset +=
- ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
- }
- } else if (LowerGEP) {
- StructType *StTy = GTI.getStructType();
- uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
- // Skip field 0 as the offset is always 0.
- if (Field != 0) {
- NeedsExtraction = true;
- AccumulativeByteOffset +=
- DL->getStructLayout(StTy)->getElementOffset(Field);
- }
- }
- }
- return AccumulativeByteOffset;
-}
-
-void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
- GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
- IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
-
- Type *I8PtrTy =
- Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
- Value *ResultPtr = Variadic->getOperand(0);
- Loop *L = LI->getLoopFor(Variadic->getParent());
- // Check if the base is not loop invariant or used more than once.
- bool isSwapCandidate =
- L && L->isLoopInvariant(ResultPtr) &&
- !hasMoreThanOneUseInLoop(ResultPtr, L);
- Value *FirstResult = nullptr;
-
- if (ResultPtr->getType() != I8PtrTy)
- ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
-
- gep_type_iterator GTI = gep_type_begin(*Variadic);
- // Create an ugly GEP for each sequential index. We don't create GEPs for
- // structure indices, as they are accumulated in the constant offset index.
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- Value *Idx = Variadic->getOperand(I);
- // Skip zero indices.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
- if (CI->isZero())
- continue;
-
- APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
- // Scale the index by element size.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- Idx = Builder.CreateShl(
- Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
- } else {
- Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
- }
- }
- // Create an ugly GEP with a single index for each index.
- ResultPtr =
- Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
- if (FirstResult == nullptr)
- FirstResult = ResultPtr;
- }
- }
-
- // Create a GEP with the constant offset index.
- if (AccumulativeByteOffset != 0) {
- Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
- ResultPtr =
- Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
- } else
- isSwapCandidate = false;
-
- // If we created a GEP with constant index, and the base is loop invariant,
- // then we swap the first one with it, so LICM can move constant GEP out
- // later.
+}
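+
+// Illustrative usage sketch (added commentary, not from the original source):
+// a backend would typically schedule the legacy pass from its pass config,
+// e.g. in a hypothetical override such as
+//
+//   void MyTargetPassConfig::addIRPasses() {
+//     // LowerGEP = true also lowers the remaining multi-index GEPs.
+//     addPass(createSeparateConstOffsetFromGEPPass(/*LowerGEP=*/true));
+//     TargetPassConfig::addIRPasses();
+//   }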
+
+bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
+ bool ZeroExtended,
+ BinaryOperator *BO,
+ bool NonNegative) {
+ // We only consider ADD, SUB and OR, because a non-zero constant found in
+ // expressions composed of these operations can be easily hoisted as a
+ // constant offset by reassociation.
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Or) {
+ return false;
+ }
+
+ Value *LHS = BO->getOperand(0), *RHS = BO->getOperand(1);
+ // Do not trace into "or" unless it is equivalent to "add". If LHS and RHS
+ // don't have common bits, (LHS | RHS) is equivalent to (LHS + RHS).
+ // FIXME: this does not appear to be covered by any tests
+ // (with x86/aarch64 backends at least)
+ if (BO->getOpcode() == Instruction::Or &&
+ !haveNoCommonBitsSet(LHS, RHS, DL, nullptr, BO, DT))
+ return false;
+
+ // In addition, tracing into BO requires that its surrounding s/zext (if
+ // any) is distributable to both operands.
+ //
+ // Suppose BO = A op B.
+ // SignExtended | ZeroExtended | Distributable?
+ // --------------+--------------+----------------------------------
+ // 0 | 0 | true because no s/zext exists
+ // 0 | 1 | zext(BO) == zext(A) op zext(B)
+ // 1 | 0 | sext(BO) == sext(A) op sext(B)
+ // 1 | 1 | zext(sext(BO)) ==
+ // | | zext(sext(A)) op zext(sext(B))
+ if (BO->getOpcode() == Instruction::Add && !ZeroExtended && NonNegative) {
+ // If a + b >= 0 and (a >= 0 or b >= 0), then
+ // sext(a + b) = sext(a) + sext(b)
+ // even if the addition is not marked nsw.
+ //
+ // Leveraging this invariant, we can trace into an sext'ed inbound GEP
+ // index if the constant offset is non-negative.
+ //
+ // Verified in @sext_add in split-gep.ll.
+ if (ConstantInt *ConstLHS = dyn_cast<ConstantInt>(LHS)) {
+ if (!ConstLHS->isNegative())
+ return true;
+ }
+ if (ConstantInt *ConstRHS = dyn_cast<ConstantInt>(RHS)) {
+ if (!ConstRHS->isNegative())
+ return true;
+ }
+ }
+
+ // sext (add/sub nsw A, B) == add/sub nsw (sext A), (sext B)
+ // zext (add/sub nuw A, B) == add/sub nuw (zext A), (zext B)
+ if (BO->getOpcode() == Instruction::Add ||
+ BO->getOpcode() == Instruction::Sub) {
+ if (SignExtended && !BO->hasNoSignedWrap())
+ return false;
+ if (ZeroExtended && !BO->hasNoUnsignedWrap())
+ return false;
+ }
+
+ return true;
+}
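+
+// Illustrative sketch (added commentary, not from the original source): with
+// SignExtended == true, CanTraceInto accepts
+//   %i = add nsw i32 %a, 5      ; later widened by "sext i32 %i to i64"
+// because sext (add nsw A, B) == add nsw (sext A), (sext B); without nsw it
+// only succeeds through the non-negative special case above.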
+
+APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
+ bool SignExtended,
+ bool ZeroExtended) {
+ // Save off the current height of the chain, in case we need to restore it.
+ size_t ChainLength = UserChain.size();
+
+ // BO being non-negative does not shed light on whether its operands are
+ // non-negative. Clear the NonNegative flag here.
+ APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If we found a constant offset in the left operand, stop and return that.
+ // This shortcut might cause us to miss opportunities of combining the
+ // constant offsets in both operands, e.g., (a + 4) + (b + 5) => (a + b) + 9.
+ // However, such cases are probably already handled by -instcombine,
+ // given this pass runs after the standard optimizations.
+ if (ConstantOffset != 0) return ConstantOffset;
+
+ // Reset the chain back to where it was when we started exploring this node,
+ // since visiting the LHS didn't pan out.
+ UserChain.resize(ChainLength);
+
+ ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
+ /* NonNegative */ false);
+ // If U is a sub operator, negate the constant offset found in the right
+ // operand.
+ if (BO->getOpcode() == Instruction::Sub)
+ ConstantOffset = -ConstantOffset;
+
+ // If RHS wasn't a suitable candidate either, reset the chain again.
+ if (ConstantOffset == 0)
+ UserChain.resize(ChainLength);
+
+ return ConstantOffset;
+}
+
+APInt ConstantOffsetExtractor::find(Value *V, bool SignExtended,
+ bool ZeroExtended, bool NonNegative) {
+ // TODO(jingyue): We could trace into integer/pointer casts, such as
+ // inttoptr, ptrtoint, bitcast, and addrspacecast. We choose to handle only
+ // integers because it gives good enough results for our benchmarks.
+ unsigned BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+
+ // We cannot do much with Values that are not a User, such as an Argument.
+ User *U = dyn_cast<User>(V);
+ if (U == nullptr) return APInt(BitWidth, 0);
+
+ APInt ConstantOffset(BitWidth, 0);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ // Hooray, we found it!
+ ConstantOffset = CI->getValue();
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(V)) {
+ // Trace into subexpressions for more hoisting opportunities.
+ if (CanTraceInto(SignExtended, ZeroExtended, BO, NonNegative))
+ ConstantOffset = findInEitherOperand(BO, SignExtended, ZeroExtended);
+ } else if (isa<TruncInst>(V)) {
+ ConstantOffset =
+ find(U->getOperand(0), SignExtended, ZeroExtended, NonNegative)
+ .trunc(BitWidth);
+ } else if (isa<SExtInst>(V)) {
+ ConstantOffset = find(U->getOperand(0), /* SignExtended */ true,
+ ZeroExtended, NonNegative).sext(BitWidth);
+ } else if (isa<ZExtInst>(V)) {
+ // As an optimization, we can clear the SignExtended flag because
+ // sext(zext(a)) = zext(a). Verified in @sext_zext in split-gep.ll.
+ //
+ // Clear the NonNegative flag, because zext(a) >= 0 does not imply a >= 0.
+ ConstantOffset =
+ find(U->getOperand(0), /* SignExtended */ false,
+ /* ZeroExtended */ true, /* NonNegative */ false).zext(BitWidth);
+ }
+
+ // If we found a non-zero constant offset, add it to the path for
+ // rebuildWithoutConstOffset. Zero is a valid constant offset, but doesn't
+ // help this optimization.
+ if (ConstantOffset != 0)
+ UserChain.push_back(U);
+ return ConstantOffset;
+}
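+
+// Illustrative sketch (added commentary, not from the original source): for an
+// index defined as
+//   %t = add i64 %c, 5
+// find(%t, ...) returns 5 and leaves UserChain == [5, %t], the path that
+// rebuildWithoutConstOffset later walks to produce the new index %c.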
+
+Value *ConstantOffsetExtractor::applyExts(Value *V) {
+ Value *Current = V;
+ // ExtInsts is built in use-def order. Therefore, we apply them to V
+ // in reverse order.
+ for (auto I = ExtInsts.rbegin(), E = ExtInsts.rend(); I != E; ++I) {
+ if (Constant *C = dyn_cast<Constant>(Current)) {
+ // If Current is a constant, apply s/zext using ConstantExpr::getCast.
+ // ConstantExpr::getCast emits a ConstantInt if C is a ConstantInt.
+ Current = ConstantExpr::getCast((*I)->getOpcode(), C, (*I)->getType());
+ } else {
+ Instruction *Ext = (*I)->clone();
+ Ext->setOperand(0, Current);
+ Ext->insertBefore(IP);
+ Current = Ext;
+ }
+ }
+ return Current;
+}
+
+Value *ConstantOffsetExtractor::rebuildWithoutConstOffset() {
+ distributeExtsAndCloneChain(UserChain.size() - 1);
+ // Remove all nullptrs (which used to be s/zext) from UserChain.
+ unsigned NewSize = 0;
+ for (User *I : UserChain) {
+ if (I != nullptr) {
+ UserChain[NewSize] = I;
+ NewSize++;
+ }
+ }
+ UserChain.resize(NewSize);
+ return removeConstOffset(UserChain.size() - 1);
+}
+
+Value *
+ConstantOffsetExtractor::distributeExtsAndCloneChain(unsigned ChainIndex) {
+ User *U = UserChain[ChainIndex];
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(U));
+ // If U is a ConstantInt, applyExts will return a ConstantInt as well.
+ return UserChain[ChainIndex] = cast<ConstantInt>(applyExts(U));
+ }
+
+ if (CastInst *Cast = dyn_cast<CastInst>(U)) {
+ assert(
+ (isa<SExtInst>(Cast) || isa<ZExtInst>(Cast) || isa<TruncInst>(Cast)) &&
+ "Only following instructions can be traced: sext, zext & trunc");
+ ExtInsts.push_back(Cast);
+ UserChain[ChainIndex] = nullptr;
+ return distributeExtsAndCloneChain(ChainIndex - 1);
+ }
+
+ // Function find only traces into BinaryOperator and CastInst.
+ BinaryOperator *BO = cast<BinaryOperator>(U);
+ // OpNo = which operand of BO is UserChain[ChainIndex - 1]
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ Value *TheOther = applyExts(BO->getOperand(1 - OpNo));
+ Value *NextInChain = distributeExtsAndCloneChain(ChainIndex - 1);
+
+ BinaryOperator *NewBO = nullptr;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), NextInChain, TheOther,
+ BO->getName(), IP);
+ } else {
+ NewBO = BinaryOperator::Create(BO->getOpcode(), TheOther, NextInChain,
+ BO->getName(), IP);
+ }
+ return UserChain[ChainIndex] = NewBO;
+}
+
+Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
+ if (ChainIndex == 0) {
+ assert(isa<ConstantInt>(UserChain[ChainIndex]));
+ return ConstantInt::getNullValue(UserChain[ChainIndex]->getType());
+ }
+
+ BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
+ assert((BO->use_empty() || BO->hasOneUse()) &&
+ "distributeExtsAndCloneChain clones each BinaryOperator in "
+ "UserChain, so no one should be used more than "
+ "once");
+
+ unsigned OpNo = (BO->getOperand(0) == UserChain[ChainIndex - 1] ? 0 : 1);
+ assert(BO->getOperand(OpNo) == UserChain[ChainIndex - 1]);
+ Value *NextInChain = removeConstOffset(ChainIndex - 1);
+ Value *TheOther = BO->getOperand(1 - OpNo);
+
+ // If NextInChain is 0 and not the LHS of a sub, we can simplify the
+ // sub-expression to be just TheOther.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(NextInChain)) {
+ if (CI->isZero() && !(BO->getOpcode() == Instruction::Sub && OpNo == 0))
+ return TheOther;
+ }
+
+ BinaryOperator::BinaryOps NewOp = BO->getOpcode();
+ if (BO->getOpcode() == Instruction::Or) {
+ // Rebuild "or" as "add", because "or" may be invalid for the new
+ // expression.
+ //
+ // For instance, given
+ // a | (b + 5) where a and b + 5 have no common bits,
+ // we can extract 5 as the constant offset.
+ //
+ // However, reusing the "or" in the new index would give us
+ // (a | b) + 5
+ // which does not equal a | (b + 5).
+ //
+ // Replacing the "or" with "add" is fine, because
+ // a | (b + 5) = a + (b + 5) = (a + b) + 5
+ NewOp = Instruction::Add;
+ }
+
+ BinaryOperator *NewBO;
+ if (OpNo == 0) {
+ NewBO = BinaryOperator::Create(NewOp, NextInChain, TheOther, "", IP);
+ } else {
+ NewBO = BinaryOperator::Create(NewOp, TheOther, NextInChain, "", IP);
+ }
+ NewBO->takeName(BO);
+ return NewBO;
+}
+
+Value *ConstantOffsetExtractor::Extract(Value *Idx, GetElementPtrInst *GEP,
+ User *&UserChainTail,
+ const DominatorTree *DT) {
+ ConstantOffsetExtractor Extractor(GEP, DT);
+ // Find a non-zero constant offset first.
+ APInt ConstantOffset =
+ Extractor.find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds());
+ if (ConstantOffset == 0) {
+ UserChainTail = nullptr;
+ return nullptr;
+ }
+ // Separates the constant offset from the GEP index.
+ Value *IdxWithoutConstOffset = Extractor.rebuildWithoutConstOffset();
+ UserChainTail = Extractor.UserChain.back();
+ return IdxWithoutConstOffset;
+}
+
+int64_t ConstantOffsetExtractor::Find(Value *Idx, GetElementPtrInst *GEP,
+ const DominatorTree *DT) {
+ // If Idx is an index of an inbound GEP, Idx is guaranteed to be non-negative.
+ return ConstantOffsetExtractor(GEP, DT)
+ .find(Idx, /* SignExtended */ false, /* ZeroExtended */ false,
+ GEP->isInBounds())
+ .getSExtValue();
+}
+
+bool SeparateConstOffsetFromGEP::canonicalizeArrayIndicesToPointerSize(
+ GetElementPtrInst *GEP) {
+ bool Changed = false;
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end();
+ I != E; ++I, ++GTI) {
+ // Skip struct member indices which must be i32.
+ if (GTI.isSequential()) {
+ if ((*I)->getType() != IntPtrTy) {
+ *I = CastInst::CreateIntegerCast(*I, IntPtrTy, true, "idxprom", GEP);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+}
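+
+// Illustrative sketch (added commentary, not from the original source): on a
+// target with 64-bit pointers,
+//   %p = getelementptr inbounds float, float* %base, i32 %i
+// becomes
+//   %idxprom = sext i32 %i to i64
+//   %p = getelementptr inbounds float, float* %base, i64 %idxprom
+// via the "idxprom" integer cast created above.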
+
+int64_t
+SeparateConstOffsetFromGEP::accumulateByteOffset(GetElementPtrInst *GEP,
+ bool &NeedsExtraction) {
+ NeedsExtraction = false;
+ int64_t AccumulativeByteOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Tries to extract a constant offset from this GEP index.
+ int64_t ConstantOffset =
+ ConstantOffsetExtractor::Find(GEP->getOperand(I), GEP, DT);
+ if (ConstantOffset != 0) {
+ NeedsExtraction = true;
+ // A GEP may have multiple indices. We accumulate the extracted
+ // constant offset to a byte offset, and later offset the remainder of
+ // the original GEP with this byte offset.
+ AccumulativeByteOffset +=
+ ConstantOffset * DL->getTypeAllocSize(GTI.getIndexedType());
+ }
+ } else if (LowerGEP) {
+ StructType *StTy = GTI.getStructType();
+ uint64_t Field = cast<ConstantInt>(GEP->getOperand(I))->getZExtValue();
+ // Skip field 0 as the offset is always 0.
+ if (Field != 0) {
+ NeedsExtraction = true;
+ AccumulativeByteOffset +=
+ DL->getStructLayout(StTy)->getElementOffset(Field);
+ }
+ }
+ }
+ return AccumulativeByteOffset;
+}
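+
+// Illustrative sketch (added commentary, not from the original source): for
+//   %add = add i64 %j, 2
+//   %p   = getelementptr inbounds [4 x i32], [4 x i32]* %a, i64 %i, i64 %add
+// the second index contributes ConstantOffset == 2, so AccumulativeByteOffset
+// becomes 2 * sizeof(i32) == 8 and NeedsExtraction is set.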
+
+void SeparateConstOffsetFromGEP::lowerToSingleIndexGEPs(
+ GetElementPtrInst *Variadic, int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Variadic->getType()->getPointerAddressSpace());
+ Value *ResultPtr = Variadic->getOperand(0);
+ Loop *L = LI->getLoopFor(Variadic->getParent());
+ // The base is a swap candidate only if it is loop invariant and not used
+ // more than once within the loop.
+ bool isSwapCandidate =
+ L && L->isLoopInvariant(ResultPtr) &&
+ !hasMoreThanOneUseInLoop(ResultPtr, L);
+ Value *FirstResult = nullptr;
+
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create an ugly GEP for each sequential index. We don't create GEPs for
+ // structure indices, as they are accumulated in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ugly GEP with a single index for each index.
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Idx, "uglygep");
+ if (FirstResult == nullptr)
+ FirstResult = ResultPtr;
+ }
+ }
+
+ // Create a GEP with the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ Value *Offset = ConstantInt::get(IntPtrTy, AccumulativeByteOffset);
+ ResultPtr =
+ Builder.CreateGEP(Builder.getInt8Ty(), ResultPtr, Offset, "uglygep");
+ } else
+ isSwapCandidate = false;
+
+ // If we created a GEP with constant index, and the base is loop invariant,
+ // then we swap the first one with it, so LICM can move constant GEP out
+ // later.
auto *FirstGEP = dyn_cast_or_null<GetElementPtrInst>(FirstResult);
auto *SecondGEP = dyn_cast<GetElementPtrInst>(ResultPtr);
- if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
- swapGEPOperand(FirstGEP, SecondGEP);
-
- if (ResultPtr->getType() != Variadic->getType())
- ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
-
- Variadic->replaceAllUsesWith(ResultPtr);
- Variadic->eraseFromParent();
-}
-
-void
-SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
- int64_t AccumulativeByteOffset) {
- IRBuilder<> Builder(Variadic);
- Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
-
- Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
- gep_type_iterator GTI = gep_type_begin(*Variadic);
- // Create ADD/SHL/MUL arithmetic operations for each sequential indices. We
- // don't create arithmetics for structure indices, as they are accumulated
- // in the constant offset index.
- for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- Value *Idx = Variadic->getOperand(I);
- // Skip zero indices.
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
- if (CI->isZero())
- continue;
-
- APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
- DL->getTypeAllocSize(GTI.getIndexedType()));
- // Scale the index by element size.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- Idx = Builder.CreateShl(
- Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
- } else {
- Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
- }
- }
- // Create an ADD for each index.
- ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
- }
- }
-
- // Create an ADD for the constant offset index.
- if (AccumulativeByteOffset != 0) {
- ResultPtr = Builder.CreateAdd(
- ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
- }
-
- ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
- Variadic->replaceAllUsesWith(ResultPtr);
- Variadic->eraseFromParent();
-}
-
-bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
- // Skip vector GEPs.
- if (GEP->getType()->isVectorTy())
- return false;
-
- // The backend can already nicely handle the case where all indices are
- // constant.
- if (GEP->hasAllConstantIndices())
- return false;
-
- bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
-
- bool NeedsExtraction;
- int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
-
- if (!NeedsExtraction)
- return Changed;
-
+ if (isSwapCandidate && isLegalToSwapOperand(FirstGEP, SecondGEP, L))
+ swapGEPOperand(FirstGEP, SecondGEP);
+
+ if (ResultPtr->getType() != Variadic->getType())
+ ResultPtr = Builder.CreateBitCast(ResultPtr, Variadic->getType());
+
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
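+
+// Illustrative sketch (added commentary, not from the original source): once
+// extraction has turned the index "%add = add i64 %i, 2" into %i with
+// AccumulativeByteOffset == 8, lowering
+//   %p = getelementptr float, float* %base, i64 %i
+// to single-index form yields roughly
+//   %0        = bitcast float* %base to i8*
+//   %idx      = shl i64 %i, 2                         ; scale by sizeof(float)
+//   %uglygep  = getelementptr i8, i8* %0, i64 %idx
+//   %uglygep1 = getelementptr i8, i8* %uglygep, i64 8 ; the constant offset
+//   %p        = bitcast i8* %uglygep1 to float*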
+
+void
+SeparateConstOffsetFromGEP::lowerToArithmetics(GetElementPtrInst *Variadic,
+ int64_t AccumulativeByteOffset) {
+ IRBuilder<> Builder(Variadic);
+ Type *IntPtrTy = DL->getIntPtrType(Variadic->getType());
+
+ Value *ResultPtr = Builder.CreatePtrToInt(Variadic->getOperand(0), IntPtrTy);
+ gep_type_iterator GTI = gep_type_begin(*Variadic);
+ // Create ADD/SHL/MUL arithmetic operations for each sequential index. We
+ // don't create arithmetic for structure indices, as they are accumulated
+ // in the constant offset index.
+ for (unsigned I = 1, E = Variadic->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ Value *Idx = Variadic->getOperand(I);
+ // Skip zero indices.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Idx))
+ if (CI->isZero())
+ continue;
+
+ APInt ElementSize = APInt(IntPtrTy->getIntegerBitWidth(),
+ DL->getTypeAllocSize(GTI.getIndexedType()));
+ // Scale the index by element size.
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ Idx = Builder.CreateShl(
+ Idx, ConstantInt::get(IntPtrTy, ElementSize.logBase2()));
+ } else {
+ Idx = Builder.CreateMul(Idx, ConstantInt::get(IntPtrTy, ElementSize));
+ }
+ }
+ // Create an ADD for each index.
+ ResultPtr = Builder.CreateAdd(ResultPtr, Idx);
+ }
+ }
+
+ // Create an ADD for the constant offset index.
+ if (AccumulativeByteOffset != 0) {
+ ResultPtr = Builder.CreateAdd(
+ ResultPtr, ConstantInt::get(IntPtrTy, AccumulativeByteOffset));
+ }
+
+ ResultPtr = Builder.CreateIntToPtr(ResultPtr, Variadic->getType());
+ Variadic->replaceAllUsesWith(ResultPtr);
+ Variadic->eraseFromParent();
+}
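+
+// Illustrative sketch (added commentary, not from the original source): the
+// same %i / offset-8 example lowered to arithmetic instead becomes roughly
+//   %0 = ptrtoint float* %base to i64
+//   %1 = shl i64 %i, 2
+//   %2 = add i64 %0, %1
+//   %3 = add i64 %2, 8
+//   %p = inttoptr i64 %3 to float*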
+
+bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) {
+ // Skip vector GEPs.
+ if (GEP->getType()->isVectorTy())
+ return false;
+
+ // The backend can already nicely handle the case where all indices are
+ // constant.
+ if (GEP->hasAllConstantIndices())
+ return false;
+
+ bool Changed = canonicalizeArrayIndicesToPointerSize(GEP);
+
+ bool NeedsExtraction;
+ int64_t AccumulativeByteOffset = accumulateByteOffset(GEP, NeedsExtraction);
+
+ if (!NeedsExtraction)
+ return Changed;
+
TargetTransformInfo &TTI = GetTTI(*GEP->getFunction());
-
- // If LowerGEP is disabled, before really splitting the GEP, check whether the
- // backend supports the addressing mode we are about to produce. If no, this
- // splitting probably won't be beneficial.
- // If LowerGEP is enabled, even the extracted constant offset can not match
- // the addressing mode, we can still do optimizations to other lowered parts
- // of variable indices. Therefore, we don't check for addressing modes in that
- // case.
- if (!LowerGEP) {
- unsigned AddrSpace = GEP->getPointerAddressSpace();
- if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
- /*BaseGV=*/nullptr, AccumulativeByteOffset,
- /*HasBaseReg=*/true, /*Scale=*/0,
- AddrSpace)) {
- return Changed;
- }
- }
-
- // Remove the constant offset in each sequential index. The resultant GEP
- // computes the variadic base.
- // Notice that we don't remove struct field indices here. If LowerGEP is
- // disabled, a structure index is not accumulated and we still use the old
- // one. If LowerGEP is enabled, a structure index is accumulated in the
- // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
- // handle the constant offset and won't need a new structure index.
- gep_type_iterator GTI = gep_type_begin(*GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isSequential()) {
- // Splits this GEP index into a variadic part and a constant offset, and
- // uses the variadic part as the new index.
- Value *OldIdx = GEP->getOperand(I);
- User *UserChainTail;
- Value *NewIdx =
- ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
- if (NewIdx != nullptr) {
- // Switches to the index with the constant offset removed.
- GEP->setOperand(I, NewIdx);
- // After switching to the new index, we can garbage-collect UserChain
- // and the old index if they are not used.
- RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
- RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
- }
- }
- }
-
- // Clear the inbounds attribute because the new index may be off-bound.
- // e.g.,
- //
- // b = add i64 a, 5
- // addr = gep inbounds float, float* p, i64 b
- //
- // is transformed to:
- //
- // addr2 = gep float, float* p, i64 a ; inbounds removed
- // addr = gep inbounds float, float* addr2, i64 5
- //
- // If a is -4, although the old index b is in bounds, the new index a is
- // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
- // inbounds keyword is not present, the offsets are added to the base
- // address with silently-wrapping two's complement arithmetic".
- // Therefore, the final code will be a semantically equivalent.
- //
- // TODO(jingyue): do some range analysis to keep as many inbounds as
- // possible. GEPs with inbounds are more friendly to alias analysis.
- bool GEPWasInBounds = GEP->isInBounds();
- GEP->setIsInBounds(false);
-
- // Lowers a GEP to either GEPs with a single index or arithmetic operations.
- if (LowerGEP) {
- // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
- // arithmetic operations if the target uses alias analysis in codegen.
- if (TTI.useAA())
- lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
- else
- lowerToArithmetics(GEP, AccumulativeByteOffset);
- return true;
- }
-
- // No need to create another GEP if the accumulative byte offset is 0.
- if (AccumulativeByteOffset == 0)
- return true;
-
- // Offsets the base with the accumulative byte offset.
- //
- // %gep ; the base
- // ... %gep ...
- //
- // => add the offset
- //
- // %gep2 ; clone of %gep
- // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
- // %gep ; will be removed
- // ... %gep ...
- //
- // => replace all uses of %gep with %new.gep and remove %gep
- //
- // %gep2 ; clone of %gep
- // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
- // ... %new.gep ...
- //
- // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
- // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
- // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
- // type of %gep.
- //
- // %gep2 ; clone of %gep
- // %0 = bitcast %gep2 to i8*
- // %uglygep = gep %0, <offset>
- // %new.gep = bitcast %uglygep to <type of %gep>
- // ... %new.gep ...
- Instruction *NewGEP = GEP->clone();
- NewGEP->insertBefore(GEP);
-
- // Per ANSI C standard, signed / unsigned = unsigned and signed % unsigned =
- // unsigned.. Therefore, we cast ElementTypeSizeOfGEP to signed because it is
- // used with unsigned integers later.
- int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
- DL->getTypeAllocSize(GEP->getResultElementType()));
- Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
- if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
- // Very likely. As long as %gep is naturally aligned, the byte offset we
- // extracted should be a multiple of sizeof(*%gep).
- int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
- NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
- ConstantInt::get(IntPtrTy, Index, true),
- GEP->getName(), GEP);
- NewGEP->copyMetadata(*GEP);
- // Inherit the inbounds attribute of the original GEP.
- cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
- } else {
- // Unlikely but possible. For example,
- // #pragma pack(1)
- // struct S {
- // int a[3];
- // int64 b[8];
- // };
- // #pragma pack()
- //
- // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
- // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
- // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
- // sizeof(int64).
- //
- // Emit an uglygep in this case.
- Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
- GEP->getPointerAddressSpace());
- NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
- NewGEP = GetElementPtrInst::Create(
- Type::getInt8Ty(GEP->getContext()), NewGEP,
- ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
- GEP);
- NewGEP->copyMetadata(*GEP);
- // Inherit the inbounds attribute of the original GEP.
- cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
- if (GEP->getType() != I8PtrTy)
- NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
- }
-
- GEP->replaceAllUsesWith(NewGEP);
- GEP->eraseFromParent();
-
- return true;
-}
-
+
+ // If LowerGEP is disabled, before really splitting the GEP, check whether the
+ // backend supports the addressing mode we are about to produce. If not, this
+ // splitting probably won't be beneficial.
+ // If LowerGEP is enabled, even if the extracted constant offset cannot match
+ // the addressing mode, we can still optimize the other lowered parts of the
+ // variable indices. Therefore, we don't check for addressing modes in that
+ // case.
+ if (!LowerGEP) {
+ unsigned AddrSpace = GEP->getPointerAddressSpace();
+ if (!TTI.isLegalAddressingMode(GEP->getResultElementType(),
+ /*BaseGV=*/nullptr, AccumulativeByteOffset,
+ /*HasBaseReg=*/true, /*Scale=*/0,
+ AddrSpace)) {
+ return Changed;
+ }
+ }
+
+ // Remove the constant offset in each sequential index. The resultant GEP
+ // computes the variadic base.
+ // Notice that we don't remove struct field indices here. If LowerGEP is
+ // disabled, a structure index is not accumulated and we still use the old
+ // one. If LowerGEP is enabled, a structure index is accumulated in the
+ // constant offset. LowerToSingleIndexGEPs or lowerToArithmetics will later
+ // handle the constant offset and won't need a new structure index.
+ gep_type_iterator GTI = gep_type_begin(*GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isSequential()) {
+ // Splits this GEP index into a variadic part and a constant offset, and
+ // uses the variadic part as the new index.
+ Value *OldIdx = GEP->getOperand(I);
+ User *UserChainTail;
+ Value *NewIdx =
+ ConstantOffsetExtractor::Extract(OldIdx, GEP, UserChainTail, DT);
+ if (NewIdx != nullptr) {
+ // Switches to the index with the constant offset removed.
+ GEP->setOperand(I, NewIdx);
+ // After switching to the new index, we can garbage-collect UserChain
+ // and the old index if they are not used.
+ RecursivelyDeleteTriviallyDeadInstructions(UserChainTail);
+ RecursivelyDeleteTriviallyDeadInstructions(OldIdx);
+ }
+ }
+ }
+
+ // Clear the inbounds attribute because the new index may be off-bound.
+ // e.g.,
+ //
+ // b = add i64 a, 5
+ // addr = gep inbounds float, float* p, i64 b
+ //
+ // is transformed to:
+ //
+ // addr2 = gep float, float* p, i64 a ; inbounds removed
+ // addr = gep inbounds float, float* addr2, i64 5
+ //
+ // If a is -4, although the old index b is in bounds, the new index a is
+ // off-bound. http://llvm.org/docs/LangRef.html#id181 says "if the
+ // inbounds keyword is not present, the offsets are added to the base
+ // address with silently-wrapping two's complement arithmetic".
+  // Therefore, the final code will be semantically equivalent.
+ //
+ // TODO(jingyue): do some range analysis to keep as many inbounds as
+ // possible. GEPs with inbounds are more friendly to alias analysis.
+ bool GEPWasInBounds = GEP->isInBounds();
+ GEP->setIsInBounds(false);
+
+ // Lowers a GEP to either GEPs with a single index or arithmetic operations.
+ if (LowerGEP) {
+ // As currently BasicAA does not analyze ptrtoint/inttoptr, do not lower to
+ // arithmetic operations if the target uses alias analysis in codegen.
+ if (TTI.useAA())
+ lowerToSingleIndexGEPs(GEP, AccumulativeByteOffset);
+ else
+ lowerToArithmetics(GEP, AccumulativeByteOffset);
+ return true;
+ }
+
+ // No need to create another GEP if the accumulative byte offset is 0.
+ if (AccumulativeByteOffset == 0)
+ return true;
+
+ // Offsets the base with the accumulative byte offset.
+ //
+ // %gep ; the base
+ // ... %gep ...
+ //
+ // => add the offset
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // %gep ; will be removed
+ // ... %gep ...
+ //
+ // => replace all uses of %gep with %new.gep and remove %gep
+ //
+ // %gep2 ; clone of %gep
+ // %new.gep = gep %gep2, <offset / sizeof(*%gep)>
+ // ... %new.gep ...
+ //
+ // If AccumulativeByteOffset is not a multiple of sizeof(*%gep), we emit an
+ // uglygep (http://llvm.org/docs/GetElementPtr.html#what-s-an-uglygep):
+ // bitcast %gep2 to i8*, add the offset, and bitcast the result back to the
+ // type of %gep.
+ //
+ // %gep2 ; clone of %gep
+ // %0 = bitcast %gep2 to i8*
+ // %uglygep = gep %0, <offset>
+ // %new.gep = bitcast %uglygep to <type of %gep>
+ // ... %new.gep ...
+ Instruction *NewGEP = GEP->clone();
+ NewGEP->insertBefore(GEP);
+
+  // Per the C standard, mixing a signed and an unsigned operand in / or %
+  // yields an unsigned result. Therefore, we cast ElementTypeSizeOfGEP to
+  // signed so that the signed AccumulativeByteOffset is not implicitly
+  // converted to unsigned in the division and modulo below.
+ int64_t ElementTypeSizeOfGEP = static_cast<int64_t>(
+ DL->getTypeAllocSize(GEP->getResultElementType()));
+ Type *IntPtrTy = DL->getIntPtrType(GEP->getType());
+ if (AccumulativeByteOffset % ElementTypeSizeOfGEP == 0) {
+ // Very likely. As long as %gep is naturally aligned, the byte offset we
+ // extracted should be a multiple of sizeof(*%gep).
+ int64_t Index = AccumulativeByteOffset / ElementTypeSizeOfGEP;
+ NewGEP = GetElementPtrInst::Create(GEP->getResultElementType(), NewGEP,
+ ConstantInt::get(IntPtrTy, Index, true),
+ GEP->getName(), GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ } else {
+ // Unlikely but possible. For example,
+ // #pragma pack(1)
+ // struct S {
+ // int a[3];
+ // int64 b[8];
+ // };
+ // #pragma pack()
+ //
+ // Suppose the gep before extraction is &s[i + 1].b[j + 3]. After
+ // extraction, it becomes &s[i].b[j] and AccumulativeByteOffset is
+ // sizeof(S) + 3 * sizeof(int64) = 100, which is not a multiple of
+ // sizeof(int64).
+ //
+ // Emit an uglygep in this case.
+ Type *I8PtrTy = Type::getInt8PtrTy(GEP->getContext(),
+ GEP->getPointerAddressSpace());
+ NewGEP = new BitCastInst(NewGEP, I8PtrTy, "", GEP);
+ NewGEP = GetElementPtrInst::Create(
+ Type::getInt8Ty(GEP->getContext()), NewGEP,
+ ConstantInt::get(IntPtrTy, AccumulativeByteOffset, true), "uglygep",
+ GEP);
+ NewGEP->copyMetadata(*GEP);
+ // Inherit the inbounds attribute of the original GEP.
+ cast<GetElementPtrInst>(NewGEP)->setIsInBounds(GEPWasInBounds);
+ if (GEP->getType() != I8PtrTy)
+ NewGEP = new BitCastInst(NewGEP, GEP->getType(), GEP->getName(), GEP);
+ }
+
+ GEP->replaceAllUsesWith(NewGEP);
+ GEP->eraseFromParent();
+
+ return true;
+}
+
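
The divisibility check above (AccumulativeByteOffset % ElementTypeSizeOfGEP) decides between an element-typed index GEP and the byte-addressed "uglygep" fallback. As a rough standalone illustration of just that arithmetic (not part of the pass; OffsetPlan and planOffset are made-up names), a minimal C++ sketch:

    // Illustrative only: mirrors the "is the byte offset a multiple of the
    // element size" decision made at the end of splitGEP above.
    #include <cstdint>
    #include <cstdio>

    struct OffsetPlan {
      bool UseElementIndex; // true: emit gep <elem ty>, base, Index
      int64_t Index;        // element index, valid when UseElementIndex
      int64_t ByteOffset;   // raw byte offset, used for the i8 fallback
    };

    static OffsetPlan planOffset(int64_t AccumulativeByteOffset,
                                 int64_t ElementSize) {
      if (AccumulativeByteOffset % ElementSize == 0)
        return {true, AccumulativeByteOffset / ElementSize,
                AccumulativeByteOffset};
      return {false, 0, AccumulativeByteOffset};
    }

    int main() {
      // 100 bytes against an 8-byte element is not divisible, matching the
      // packed-struct example in the comment above, so the byte-wise form
      // would be chosen.
      OffsetPlan P = planOffset(100, 8);
      std::printf("element index form: %d, byte offset: %lld\n",
                  P.UseElementIndex, (long long)P.ByteOffset);
    }
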
bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
+ if (skipFunction(F))
+ return false;
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -1156,218 +1156,218 @@ bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) {
SeparateConstOffsetFromGEP Impl(DT, SE, LI, TLI, GetTTI, LowerGEP);
return Impl.run(F);
}
-
+
bool SeparateConstOffsetFromGEP::run(Function &F) {
- if (DisableSeparateConstOffsetFromGEP)
- return false;
-
+ if (DisableSeparateConstOffsetFromGEP)
+ return false;
+
DL = &F.getParent()->getDataLayout();
- bool Changed = false;
- for (BasicBlock &B : F) {
- for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
- Changed |= splitGEP(GEP);
- // No need to split GEP ConstantExprs because all its indices are constant
- // already.
- }
-
- Changed |= reuniteExts(F);
-
- if (VerifyNoDeadCode)
- verifyNoDeadCode(F);
-
- return Changed;
-}
-
-Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
- const SCEV *Key, Instruction *Dominatee,
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
- auto Pos = DominatingExprs.find(Key);
- if (Pos == DominatingExprs.end())
- return nullptr;
-
- auto &Candidates = Pos->second;
- // Because we process the basic blocks in pre-order of the dominator tree, a
- // candidate that doesn't dominate the current instruction won't dominate any
- // future instruction either. Therefore, we pop it out of the stack. This
- // optimization makes the algorithm O(n).
- while (!Candidates.empty()) {
- Instruction *Candidate = Candidates.back();
- if (DT->dominates(Candidate, Dominatee))
- return Candidate;
- Candidates.pop_back();
- }
- return nullptr;
-}
-
-bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Dom: LHS+RHS
- // I: sext(LHS)+sext(RHS)
- // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
- // TODO: handle zext
- Value *LHS = nullptr, *RHS = nullptr;
- if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
- if (LHS->getType() == RHS->getType()) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
- Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
- NewSExt->takeName(I);
- I->replaceAllUsesWith(NewSExt);
- RecursivelyDeleteTriviallyDeadInstructions(I);
- return true;
- }
- }
- } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
- if (LHS->getType() == RHS->getType()) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
- Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
- NewSExt->takeName(I);
- I->replaceAllUsesWith(NewSExt);
- RecursivelyDeleteTriviallyDeadInstructions(I);
- return true;
- }
- }
- }
-
- // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
- if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
- if (programUndefinedIfPoison(I)) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- DominatingAdds[Key].push_back(I);
- }
- } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
- if (programUndefinedIfPoison(I)) {
- const SCEV *Key =
- SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- DominatingSubs[Key].push_back(I);
- }
- }
- return false;
-}
-
-bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
- bool Changed = false;
- DominatingAdds.clear();
- DominatingSubs.clear();
- for (const auto Node : depth_first(DT)) {
- BasicBlock *BB = Node->getBlock();
- for (auto I = BB->begin(); I != BB->end(); ) {
- Instruction *Cur = &*I++;
- Changed |= reuniteExts(Cur);
- }
- }
- return Changed;
-}
-
-void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
- for (BasicBlock &B : F) {
- for (Instruction &I : B) {
- if (isInstructionTriviallyDead(&I)) {
- std::string ErrMessage;
- raw_string_ostream RSO(ErrMessage);
- RSO << "Dead instruction detected!\n" << I << "\n";
- llvm_unreachable(RSO.str().c_str());
- }
- }
- }
-}
-
-bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
- GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
- if (!FirstGEP || !FirstGEP->hasOneUse())
- return false;
-
- if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
- return false;
-
- if (FirstGEP == SecondGEP)
- return false;
-
- unsigned FirstNum = FirstGEP->getNumOperands();
- unsigned SecondNum = SecondGEP->getNumOperands();
- // Give up if the number of operands are not 2.
- if (FirstNum != SecondNum || FirstNum != 2)
- return false;
-
- Value *FirstBase = FirstGEP->getOperand(0);
- Value *SecondBase = SecondGEP->getOperand(0);
- Value *FirstOffset = FirstGEP->getOperand(1);
- // Give up if the index of the first GEP is loop invariant.
- if (CurLoop->isLoopInvariant(FirstOffset))
- return false;
-
- // Give up if base doesn't have same type.
- if (FirstBase->getType() != SecondBase->getType())
- return false;
-
- Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
-
- // Check if the second operand of first GEP has constant coefficient.
- // For an example, for the following code, we won't gain anything by
- // hoisting the second GEP out because the second GEP can be folded away.
- // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
- // %67 = shl i64 %scevgep.sum.ur159, 2
- // %uglygep160 = getelementptr i8* %65, i64 %67
- // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
-
- // Skip constant shift instruction which may be generated by Splitting GEPs.
- if (FirstOffsetDef && FirstOffsetDef->isShift() &&
- isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
- FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
-
- // Give up if FirstOffsetDef is an Add or Sub with constant.
- // Because it may not profitable at all due to constant folding.
- if (FirstOffsetDef)
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
- unsigned opc = BO->getOpcode();
- if ((opc == Instruction::Add || opc == Instruction::Sub) &&
- (isa<ConstantInt>(BO->getOperand(0)) ||
- isa<ConstantInt>(BO->getOperand(1))))
- return false;
- }
- return true;
-}
-
-bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
- int UsesInLoop = 0;
- for (User *U : V->users()) {
- if (Instruction *User = dyn_cast<Instruction>(U))
- if (L->contains(User))
- if (++UsesInLoop > 1)
- return true;
- }
- return false;
-}
-
-void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
- GetElementPtrInst *Second) {
- Value *Offset1 = First->getOperand(1);
- Value *Offset2 = Second->getOperand(1);
- First->setOperand(1, Offset2);
- Second->setOperand(1, Offset1);
-
- // We changed p+o+c to p+c+o, p+c may not be inbound anymore.
- const DataLayout &DAL = First->getModule()->getDataLayout();
- APInt Offset(DAL.getIndexSizeInBits(
- cast<PointerType>(First->getType())->getAddressSpace()),
- 0);
- Value *NewBase =
- First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
- uint64_t ObjectSize;
- if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
- Offset.ugt(ObjectSize)) {
- First->setIsInBounds(false);
- Second->setIsInBounds(false);
- } else
- First->setIsInBounds(true);
-}
+ bool Changed = false;
+ for (BasicBlock &B : F) {
+ for (BasicBlock::iterator I = B.begin(), IE = B.end(); I != IE;)
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I++))
+ Changed |= splitGEP(GEP);
+    // No need to split GEP ConstantExprs because all their indices are
+    // already constant.
+ }
+
+ Changed |= reuniteExts(F);
+
+ if (VerifyNoDeadCode)
+ verifyNoDeadCode(F);
+
+ return Changed;
+}
+
+Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
+ auto Pos = DominatingExprs.find(Key);
+ if (Pos == DominatingExprs.end())
+ return nullptr;
+
+ auto &Candidates = Pos->second;
+ // Because we process the basic blocks in pre-order of the dominator tree, a
+ // candidate that doesn't dominate the current instruction won't dominate any
+ // future instruction either. Therefore, we pop it out of the stack. This
+ // optimization makes the algorithm O(n).
+ while (!Candidates.empty()) {
+ Instruction *Candidate = Candidates.back();
+ if (DT->dominates(Candidate, Dominatee))
+ return Candidate;
+ Candidates.pop_back();
+ }
+ return nullptr;
+}
+
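
findClosestMatchingDominator above keeps, per SCEV key, a vector used as a stack of candidates and pops entries that no longer dominate the query; because blocks are visited in dominator-tree pre-order, a popped candidate can never be needed again. A small self-contained sketch of that pattern (Expr, Point and dominates are stand-ins invented for the example, not LLVM types):

    #include <cstdio>
    #include <map>
    #include <vector>

    struct Expr { int Point; }; // hypothetical instruction with a program point

    // Stand-in for DT->dominates: in a straight-line walk, earlier points
    // dominate later ones.
    static bool dominates(const Expr &A, const Expr &B) {
      return A.Point <= B.Point;
    }

    static const Expr *findClosestDominator(
        int Key, const Expr &Query, std::map<int, std::vector<Expr>> &Map) {
      auto It = Map.find(Key);
      if (It == Map.end())
        return nullptr;
      auto &Candidates = It->second;
      // Once a candidate stops dominating, it will not dominate any later
      // query either, so it is safe to discard it for good.
      while (!Candidates.empty()) {
        if (dominates(Candidates.back(), Query))
          return &Candidates.back();
        Candidates.pop_back();
      }
      return nullptr;
    }

    int main() {
      std::map<int, std::vector<Expr>> Map;
      Map[42] = {{1}, {5}, {9}};
      Expr Query{7};
      if (const Expr *Dom = findClosestDominator(42, Query, Map))
        std::printf("closest dominating candidate at point %d\n", Dom->Point);
    }
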
+bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Dom: LHS+RHS
+ // I: sext(LHS)+sext(RHS)
+ // If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
+ // TODO: handle zext
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ }
+
+ // Add I to DominatingExprs if it's an add/sub that can't sign overflow.
+ if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingAdds[Key].push_back(I);
+ }
+ } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingSubs[Key].push_back(I);
+ }
+ }
+ return false;
+}
+
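
The rewrite in reuniteExts above is only sound because a dominating nsw add/sub guarantees the narrow addition cannot sign-overflow, in which case sext(a) + sext(b) equals sext(a + b). A small numeric illustration in plain C++ (not pass code), using i32 values widened to i64:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int32_t A = 100000, B = 23456;            // a + b still fits in i32
      int64_t Wide = int64_t(A) + int64_t(B);   // sext(a) + sext(b)
      int64_t Narrow = int64_t(int32_t(A + B)); // sext(a + b)
      std::printf("%lld %lld equal=%d\n", (long long)Wide, (long long)Narrow,
                  int(Wide == Narrow));
    }
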
+bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
+ bool Changed = false;
+ DominatingAdds.clear();
+ DominatingSubs.clear();
+ for (const auto Node : depth_first(DT)) {
+ BasicBlock *BB = Node->getBlock();
+ for (auto I = BB->begin(); I != BB->end(); ) {
+ Instruction *Cur = &*I++;
+ Changed |= reuniteExts(Cur);
+ }
+ }
+ return Changed;
+}
+
+void SeparateConstOffsetFromGEP::verifyNoDeadCode(Function &F) {
+ for (BasicBlock &B : F) {
+ for (Instruction &I : B) {
+ if (isInstructionTriviallyDead(&I)) {
+ std::string ErrMessage;
+ raw_string_ostream RSO(ErrMessage);
+ RSO << "Dead instruction detected!\n" << I << "\n";
+ llvm_unreachable(RSO.str().c_str());
+ }
+ }
+ }
+}
+
+bool SeparateConstOffsetFromGEP::isLegalToSwapOperand(
+ GetElementPtrInst *FirstGEP, GetElementPtrInst *SecondGEP, Loop *CurLoop) {
+ if (!FirstGEP || !FirstGEP->hasOneUse())
+ return false;
+
+ if (!SecondGEP || FirstGEP->getParent() != SecondGEP->getParent())
+ return false;
+
+ if (FirstGEP == SecondGEP)
+ return false;
+
+ unsigned FirstNum = FirstGEP->getNumOperands();
+ unsigned SecondNum = SecondGEP->getNumOperands();
+  // Give up if the number of operands is not 2.
+ if (FirstNum != SecondNum || FirstNum != 2)
+ return false;
+
+ Value *FirstBase = FirstGEP->getOperand(0);
+ Value *SecondBase = SecondGEP->getOperand(0);
+ Value *FirstOffset = FirstGEP->getOperand(1);
+ // Give up if the index of the first GEP is loop invariant.
+ if (CurLoop->isLoopInvariant(FirstOffset))
+ return false;
+
+  // Give up if the bases don't have the same type.
+ if (FirstBase->getType() != SecondBase->getType())
+ return false;
+
+ Instruction *FirstOffsetDef = dyn_cast<Instruction>(FirstOffset);
+
+  // Check if the second operand of the first GEP has a constant coefficient.
+  // For example, for the following code we won't gain anything by hoisting
+  // the second GEP out because the second GEP can be folded away.
+ // %scevgep.sum.ur159 = add i64 %idxprom48.ur, 256
+ // %67 = shl i64 %scevgep.sum.ur159, 2
+ // %uglygep160 = getelementptr i8* %65, i64 %67
+ // %uglygep161 = getelementptr i8* %uglygep160, i64 -1024
+
+  // Skip a constant shift instruction, which may be generated by splitting GEPs.
+ if (FirstOffsetDef && FirstOffsetDef->isShift() &&
+ isa<ConstantInt>(FirstOffsetDef->getOperand(1)))
+ FirstOffsetDef = dyn_cast<Instruction>(FirstOffsetDef->getOperand(0));
+
+  // Give up if FirstOffsetDef is an Add or Sub with a constant operand,
+  // because it may not be profitable at all due to constant folding.
+ if (FirstOffsetDef)
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FirstOffsetDef)) {
+ unsigned opc = BO->getOpcode();
+ if ((opc == Instruction::Add || opc == Instruction::Sub) &&
+ (isa<ConstantInt>(BO->getOperand(0)) ||
+ isa<ConstantInt>(BO->getOperand(1))))
+ return false;
+ }
+ return true;
+}
+
+bool SeparateConstOffsetFromGEP::hasMoreThanOneUseInLoop(Value *V, Loop *L) {
+ int UsesInLoop = 0;
+ for (User *U : V->users()) {
+ if (Instruction *User = dyn_cast<Instruction>(U))
+ if (L->contains(User))
+ if (++UsesInLoop > 1)
+ return true;
+ }
+ return false;
+}
+
+void SeparateConstOffsetFromGEP::swapGEPOperand(GetElementPtrInst *First,
+ GetElementPtrInst *Second) {
+ Value *Offset1 = First->getOperand(1);
+ Value *Offset2 = Second->getOperand(1);
+ First->setOperand(1, Offset2);
+ Second->setOperand(1, Offset1);
+
+  // We changed p+o+c to p+c+o; p+c may not be inbounds anymore.
+ const DataLayout &DAL = First->getModule()->getDataLayout();
+ APInt Offset(DAL.getIndexSizeInBits(
+ cast<PointerType>(First->getType())->getAddressSpace()),
+ 0);
+ Value *NewBase =
+ First->stripAndAccumulateInBoundsConstantOffsets(DAL, Offset);
+ uint64_t ObjectSize;
+ if (!getObjectSize(NewBase, ObjectSize, DAL, TLI) ||
+ Offset.ugt(ObjectSize)) {
+ First->setIsInBounds(false);
+ Second->setIsInBounds(false);
+ } else
+ First->setIsInBounds(true);
+}
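
swapGEPOperand above keeps the inbounds flags only when the accumulated constant offset provably stays inside the underlying object (getObjectSize succeeds and the offset does not pass the end); otherwise both GEPs become non-inbounds. A minimal sketch of just that decision, assuming C++17; canKeepInBounds is a made-up helper and std::optional stands in for "object size known":

    #include <cstdint>
    #include <cstdio>
    #include <optional>

    static bool canKeepInBounds(std::optional<uint64_t> ObjectSize,
                                uint64_t ConstOffset) {
      // Unknown object size, or an offset past the end: be conservative.
      return ObjectSize && ConstOffset <= *ObjectSize;
    }

    int main() {
      std::printf("%d\n", canKeepInBounds(uint64_t{64}, 16));  // 1: keep
      std::printf("%d\n", canKeepInBounds(std::nullopt, 16));  // 0: drop
      std::printf("%d\n", canKeepInBounds(uint64_t{8}, 1024)); // 0: drop
    }
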
PreservedAnalyses
SeparateConstOffsetFromGEPPass::run(Function &F, FunctionAnalysisManager &AM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index 43ed0957ed..9d3c8d0f37 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -1,1145 +1,1145 @@
-///===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
+//===- SimpleLoopUnswitch.cpp - Hoist loop-invariant control flow ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar/SimpleLoopUnswitch.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <numeric>
-#include <utility>
-
-#define DEBUG_TYPE "simple-loop-unswitch"
-
-using namespace llvm;
-
-STATISTIC(NumBranches, "Number of branches unswitched");
-STATISTIC(NumSwitches, "Number of switches unswitched");
-STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
-STATISTIC(NumTrivial, "Number of unswitches that are trivial");
-STATISTIC(
- NumCostMultiplierSkipped,
- "Number of unswitch candidates that had their cost multiplier skipped");
-
-static cl::opt<bool> EnableNonTrivialUnswitch(
- "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
- cl::desc("Forcibly enables non-trivial loop unswitching rather than "
- "following the configuration passed into the pass."));
-
-static cl::opt<int>
- UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
- cl::desc("The cost threshold for unswitching a loop."));
-
-static cl::opt<bool> EnableUnswitchCostMultiplier(
- "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
- cl::desc("Enable unswitch cost multiplier that prohibits exponential "
- "explosion in nontrivial unswitch."));
-static cl::opt<int> UnswitchSiblingsToplevelDiv(
- "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
- cl::desc("Toplevel siblings divisor for cost multiplier."));
-static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
- "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
- cl::desc("Number of unswitch candidates that are ignored when calculating "
- "cost multiplier."));
-static cl::opt<bool> UnswitchGuards(
- "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
- cl::desc("If enabled, simple loop unswitching will also consider "
- "llvm.experimental.guard intrinsics as unswitch candidates."));
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <numeric>
+#include <utility>
+
+#define DEBUG_TYPE "simple-loop-unswitch"
+
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumGuards, "Number of guards turned into branches for unswitching");
+STATISTIC(NumTrivial, "Number of unswitches that are trivial");
+STATISTIC(
+ NumCostMultiplierSkipped,
+ "Number of unswitch candidates that had their cost multiplier skipped");
+
+static cl::opt<bool> EnableNonTrivialUnswitch(
+ "enable-nontrivial-unswitch", cl::init(false), cl::Hidden,
+ cl::desc("Forcibly enables non-trivial loop unswitching rather than "
+ "following the configuration passed into the pass."));
+
+static cl::opt<int>
+ UnswitchThreshold("unswitch-threshold", cl::init(50), cl::Hidden,
+ cl::desc("The cost threshold for unswitching a loop."));
+
+static cl::opt<bool> EnableUnswitchCostMultiplier(
+ "enable-unswitch-cost-multiplier", cl::init(true), cl::Hidden,
+ cl::desc("Enable unswitch cost multiplier that prohibits exponential "
+ "explosion in nontrivial unswitch."));
+static cl::opt<int> UnswitchSiblingsToplevelDiv(
+ "unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
+ cl::desc("Toplevel siblings divisor for cost multiplier."));
+static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
+ "unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
+ cl::desc("Number of unswitch candidates that are ignored when calculating "
+ "cost multiplier."));
+static cl::opt<bool> UnswitchGuards(
+ "simple-loop-unswitch-guards", cl::init(true), cl::Hidden,
+ cl::desc("If enabled, simple loop unswitching will also consider "
+ "llvm.experimental.guard intrinsics as unswitch candidates."));
static cl::opt<bool> DropNonTrivialImplicitNullChecks(
"simple-loop-unswitch-drop-non-trivial-implicit-null-checks",
cl::init(false), cl::Hidden,
cl::desc("If enabled, drop make.implicit metadata in unswitched implicit "
"null checks to save time analyzing if we can keep it."));
-
-/// Collect all of the loop invariant input values transitively used by the
-/// homogeneous instruction graph from a given root.
-///
-/// This essentially walks from a root recursively through loop variant operands
-/// which have the exact same opcode and finds all inputs which are loop
-/// invariant. For some operations these can be re-associated and unswitched out
-/// of the loop entirely.
-static TinyPtrVector<Value *>
-collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
- LoopInfo &LI) {
- assert(!L.isLoopInvariant(&Root) &&
- "Only need to walk the graph if root itself is not invariant.");
- TinyPtrVector<Value *> Invariants;
-
- // Build a worklist and recurse through operators collecting invariants.
- SmallVector<Instruction *, 4> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(&Root);
- Visited.insert(&Root);
- do {
- Instruction &I = *Worklist.pop_back_val();
- for (Value *OpV : I.operand_values()) {
- // Skip constants as unswitching isn't interesting for them.
- if (isa<Constant>(OpV))
- continue;
-
- // Add it to our result if loop invariant.
- if (L.isLoopInvariant(OpV)) {
- Invariants.push_back(OpV);
- continue;
- }
-
- // If not an instruction with the same opcode, nothing we can do.
- Instruction *OpI = dyn_cast<Instruction>(OpV);
- if (!OpI || OpI->getOpcode() != Root.getOpcode())
- continue;
-
- // Visit this operand.
- if (Visited.insert(OpI).second)
- Worklist.push_back(OpI);
- }
- } while (!Worklist.empty());
-
- return Invariants;
-}
-
-static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
- Constant &Replacement) {
- assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
-
- // Replace uses of LIC in the loop with the given constant.
- for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
-
- // Replace this use within the loop body.
- if (UserI && L.contains(UserI))
- U->set(&Replacement);
- }
-}
-
-/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
-/// incoming values along this edge.
-static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
- BasicBlock &ExitBB) {
- for (Instruction &I : ExitBB) {
- auto *PN = dyn_cast<PHINode>(&I);
- if (!PN)
- // No more PHIs to check.
- return true;
-
- // If the incoming value for this edge isn't loop invariant the unswitch
- // won't be trivial.
- if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
- return false;
- }
- llvm_unreachable("Basic blocks should never be empty!");
-}
-
-/// Insert code to test a set of loop invariant values, and conditionally branch
-/// on them.
-static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
- ArrayRef<Value *> Invariants,
- bool Direction,
- BasicBlock &UnswitchedSucc,
- BasicBlock &NormalSucc) {
- IRBuilder<> IRB(&BB);
-
- Value *Cond = Direction ? IRB.CreateOr(Invariants) :
- IRB.CreateAnd(Invariants);
- IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
- Direction ? &NormalSucc : &UnswitchedSucc);
-}
-
-/// Rewrite the PHI nodes in an unswitched loop exit basic block.
-///
-/// Requires that the loop exit and unswitched basic block are the same, and
-/// that the exiting block was a unique predecessor of that block. Rewrites the
-/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
-/// PHI nodes from the old preheader that now contains the unswitched
-/// terminator.
-static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
- BasicBlock &OldExitingBB,
- BasicBlock &OldPH) {
- for (PHINode &PN : UnswitchedBB.phis()) {
- // When the loop exit is directly unswitched we just need to update the
- // incoming basic block. We loop to handle weird cases with repeated
- // incoming blocks, but expect to typically only have one operand here.
- for (auto i : seq<int>(0, PN.getNumOperands())) {
- assert(PN.getIncomingBlock(i) == &OldExitingBB &&
- "Found incoming block different from unique predecessor!");
- PN.setIncomingBlock(i, &OldPH);
- }
- }
-}
-
-/// Rewrite the PHI nodes in the loop exit basic block and the split off
-/// unswitched block.
-///
-/// Because the exit block remains an exit from the loop, this rewrites the
-/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
-/// nodes into the unswitched basic block to select between the value in the
-/// old preheader and the loop exit.
-static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
- BasicBlock &UnswitchedBB,
- BasicBlock &OldExitingBB,
- BasicBlock &OldPH,
- bool FullUnswitch) {
- assert(&ExitBB != &UnswitchedBB &&
- "Must have different loop exit and unswitched blocks!");
- Instruction *InsertPt = &*UnswitchedBB.begin();
- for (PHINode &PN : ExitBB.phis()) {
- auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
- PN.getName() + ".split", InsertPt);
-
- // Walk backwards over the old PHI node's inputs to minimize the cost of
- // removing each one. We have to do this weird loop manually so that we
- // create the same number of new incoming edges in the new PHI as we expect
- // each case-based edge to be included in the unswitched switch in some
- // cases.
- // FIXME: This is really, really gross. It would be much cleaner if LLVM
- // allowed us to create a single entry for a predecessor block without
- // having separate entries for each "edge" even though these edges are
- // required to produce identical results.
- for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
- if (PN.getIncomingBlock(i) != &OldExitingBB)
- continue;
-
- Value *Incoming = PN.getIncomingValue(i);
- if (FullUnswitch)
- // No more edge from the old exiting block to the exit block.
- PN.removeIncomingValue(i);
-
- NewPN->addIncoming(Incoming, &OldPH);
- }
-
- // Now replace the old PHI with the new one and wire the old one in as an
- // input to the new one.
- PN.replaceAllUsesWith(NewPN);
- NewPN->addIncoming(&PN, &ExitBB);
- }
-}
-
-/// Hoist the current loop up to the innermost loop containing a remaining exit.
-///
-/// Because we've removed an exit from the loop, we may have changed the set of
-/// loops reachable and need to move the current loop up the loop nest or even
-/// to an entirely separate nest.
-static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
- DominatorTree &DT, LoopInfo &LI,
- MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
- // If the loop is already at the top level, we can't hoist it anywhere.
- Loop *OldParentL = L.getParentLoop();
- if (!OldParentL)
- return;
-
- SmallVector<BasicBlock *, 4> Exits;
- L.getExitBlocks(Exits);
- Loop *NewParentL = nullptr;
- for (auto *ExitBB : Exits)
- if (Loop *ExitL = LI.getLoopFor(ExitBB))
- if (!NewParentL || NewParentL->contains(ExitL))
- NewParentL = ExitL;
-
- if (NewParentL == OldParentL)
- return;
-
- // The new parent loop (if different) should always contain the old one.
- if (NewParentL)
- assert(NewParentL->contains(OldParentL) &&
- "Can only hoist this loop up the nest!");
-
- // The preheader will need to move with the body of this loop. However,
- // because it isn't in this loop we also need to update the primary loop map.
- assert(OldParentL == LI.getLoopFor(&Preheader) &&
- "Parent loop of this loop should contain this loop's preheader!");
- LI.changeLoopFor(&Preheader, NewParentL);
-
- // Remove this loop from its old parent.
- OldParentL->removeChildLoop(&L);
-
- // Add the loop either to the new parent or as a top-level loop.
- if (NewParentL)
- NewParentL->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
-
- // Remove this loops blocks from the old parent and every other loop up the
- // nest until reaching the new parent. Also update all of these
- // no-longer-containing loops to reflect the nesting change.
- for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
- OldContainingL = OldContainingL->getParentLoop()) {
- llvm::erase_if(OldContainingL->getBlocksVector(),
- [&](const BasicBlock *BB) {
- return BB == &Preheader || L.contains(BB);
- });
-
- OldContainingL->getBlocksSet().erase(&Preheader);
- for (BasicBlock *BB : L.blocks())
- OldContainingL->getBlocksSet().erase(BB);
-
- // Because we just hoisted a loop out of this one, we have essentially
- // created new exit paths from it. That means we need to form LCSSA PHI
- // nodes for values used in the no-longer-nested loop.
- formLCSSA(*OldContainingL, DT, &LI, SE);
-
- // We shouldn't need to form dedicated exits because the exit introduced
- // here is the (just split by unswitching) preheader. However, after trivial
- // unswitching it is possible to get new non-dedicated exits out of parent
- // loop so let's conservatively form dedicated exit blocks and figure out
- // if we can optimize later.
- formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU,
- /*PreserveLCSSA*/ true);
- }
-}
-
-// Return the top-most loop containing ExitBB and having ExitBB as exiting block
-// or the loop containing ExitBB, if there is no parent loop containing ExitBB
-// as exiting block.
-static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
- Loop *TopMost = LI.getLoopFor(ExitBB);
- Loop *Current = TopMost;
- while (Current) {
- if (Current->isLoopExiting(ExitBB))
- TopMost = Current;
- Current = Current->getParentLoop();
- }
- return TopMost;
-}
-
-/// Unswitch a trivial branch if the condition is loop invariant.
-///
-/// This routine should only be called when loop code leading to the branch has
-/// been validated as trivial (no side effects). This routine checks if the
-/// condition is invariant and one of the successors is a loop exit. This
-/// allows us to unswitch without duplicating the loop, making it trivial.
-///
-/// If this routine fails to unswitch the branch it returns false.
-///
-/// If the branch can be unswitched, this routine splits the preheader and
-/// hoists the branch above that split. Preserves loop simplified form
-/// (splitting the exit block as necessary). It simplifies the branch within
-/// the loop to an unconditional branch but doesn't remove it entirely. Further
-/// cleanup can be done with some simplify-cfg like pass.
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- assert(BI.isConditional() && "Can only unswitch a conditional branch!");
- LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
-
- // The loop invariant values that we want to unswitch.
- TinyPtrVector<Value *> Invariants;
-
- // When true, we're fully unswitching the branch rather than just unswitching
- // some input conditions to the branch.
- bool FullUnswitch = false;
-
- if (L.isLoopInvariant(BI.getCondition())) {
- Invariants.push_back(BI.getCondition());
- FullUnswitch = true;
- } else {
- if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
- Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
- if (Invariants.empty())
- // Couldn't find invariant inputs!
- return false;
- }
-
- // Check that one of the branch's successors exits, and which one.
- bool ExitDirection = true;
- int LoopExitSuccIdx = 0;
- auto *LoopExitBB = BI.getSuccessor(0);
- if (L.contains(LoopExitBB)) {
- ExitDirection = false;
- LoopExitSuccIdx = 1;
- LoopExitBB = BI.getSuccessor(1);
- if (L.contains(LoopExitBB))
- return false;
- }
- auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
- auto *ParentBB = BI.getParent();
- if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
- return false;
-
- // When unswitching only part of the branch's condition, we need the exit
- // block to be reached directly from the partially unswitched input. This can
- // be done when the exit block is along the true edge and the branch condition
- // is a graph of `or` operations, or the exit block is along the false edge
- // and the condition is a graph of `and` operations.
- if (!FullUnswitch) {
- if (ExitDirection) {
- if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
- return false;
- } else {
- if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
- return false;
- }
- }
-
- LLVM_DEBUG({
- dbgs() << " unswitching trivial invariant conditions for: " << BI
- << "\n";
- for (Value *Invariant : Invariants) {
- dbgs() << " " << *Invariant << " == true";
- if (Invariant != Invariants.back())
- dbgs() << " ||";
- dbgs() << "\n";
- }
- });
-
- // If we have scalar evolutions, we need to invalidate them including this
- // loop, the loop containing the exit block and the topmost parent loop
- // exiting via LoopExitBB.
- if (SE) {
- if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
- SE->forgetLoop(ExitL);
- else
- // Forget the entire nest as this exits the entire nest.
- SE->forgetTopmostLoop(&L);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the conditional branch. We will change the preheader to have a conditional
- // branch on LoopCond.
- BasicBlock *OldPH = L.getLoopPreheader();
- BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
-
- // Now that we have a place to insert the conditional branch, create a place
- // to branch to: this is the exit block out of the loop that we are
- // unswitching. We need to split this if there are other loop predecessors.
- // Because the loop is in simplified form, *any* other predecessor is enough.
- BasicBlock *UnswitchedBB;
- if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
- assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
- "A branch's parent isn't a predecessor!");
- UnswitchedBB = LoopExitBB;
- } else {
- UnswitchedBB =
- SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Actually move the invariant uses into the unswitched position. If possible,
- // we do this by moving the instructions, but when doing partial unswitching
- // we do it by building a new merge of the values in the unswitched position.
- OldPH->getTerminator()->eraseFromParent();
- if (FullUnswitch) {
- // If fully unswitching, we can use the existing branch instruction.
- // Splice it into the old PH to gate reaching the new preheader and re-point
- // its successors.
- OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
- BI);
- if (MSSAU) {
- // Temporarily clone the terminator, to make MSSA update cheaper by
- // separating "insert edge" updates from "remove edge" ones.
- ParentBB->getInstList().push_back(BI.clone());
- } else {
- // Create a new unconditional branch that will continue the loop as a new
- // terminator.
- BranchInst::Create(ContinueBB, ParentBB);
- }
- BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
- BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
- } else {
- // Only unswitching a subset of inputs to the condition, so we will need to
- // build a new branch that merges the invariant inputs.
- if (ExitDirection)
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::Or &&
- "Must have an `or` of `i1`s for the condition!");
- else
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::And &&
- "Must have an `and` of `i1`s for the condition!");
- buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
- *UnswitchedBB, *NewPH);
- }
-
- // Update the dominator tree with the added edge.
- DT.insertEdge(OldPH, UnswitchedBB);
-
- // After the dominator tree was updated with the added edge, update MemorySSA
- // if available.
- if (MSSAU) {
- SmallVector<CFGUpdate, 1> Updates;
- Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
- MSSAU->applyInsertUpdates(Updates, DT);
- }
-
- // Finish updating dominator tree and memory ssa for full unswitch.
- if (FullUnswitch) {
- if (MSSAU) {
- // Remove the cloned branch instruction.
- ParentBB->getTerminator()->eraseFromParent();
- // Create unconditional branch now.
- BranchInst::Create(ContinueBB, ParentBB);
- MSSAU->removeEdge(ParentBB, LoopExitBB);
- }
- DT.deleteEdge(ParentBB, LoopExitBB);
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Rewrite the relevant PHI nodes.
- if (UnswitchedBB == LoopExitBB)
- rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
- else
- rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
- *ParentBB, *OldPH, FullUnswitch);
-
- // The constant we can replace all of our invariants with inside the loop
- // body. If any of the invariants have a value other than this the loop won't
- // be entered.
- ConstantInt *Replacement = ExitDirection
- ? ConstantInt::getFalse(BI.getContext())
- : ConstantInt::getTrue(BI.getContext());
-
- // Since this is an i1 condition we can also trivially replace uses of it
- // within the loop with a constant.
- for (Value *Invariant : Invariants)
- replaceLoopInvariantUses(L, Invariant, *Replacement);
-
- // If this was full unswitching, we may have changed the nesting relationship
- // for this loop so hoist it to its correct parent if needed.
- if (FullUnswitch)
- hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n");
- ++NumTrivial;
- ++NumBranches;
- return true;
-}
-
-/// Unswitch a trivial switch if the condition is loop invariant.
-///
-/// This routine should only be called when loop code leading to the switch has
-/// been validated as trivial (no side effects). This routine checks if the
-/// condition is invariant and that at least one of the successors is a loop
-/// exit. This allows us to unswitch without duplicating the loop, making it
-/// trivial.
-///
-/// If this routine fails to unswitch the switch it returns false.
-///
-/// If the switch can be unswitched, this routine splits the preheader and
-/// copies the switch above that split. If the default case is one of the
-/// exiting cases, it copies the non-exiting cases and points them at the new
-/// preheader. If the default case is not exiting, it copies the exiting cases
-/// and points the default at the preheader. It preserves loop simplified form
-/// (splitting the exit blocks as necessary). It simplifies the switch within
-/// the loop by removing now-dead cases. If the default case is one of those
-/// unswitched, it replaces its destination with a new basic block containing
-/// only unreachable. Such basic blocks, while technically loop exits, are not
-/// considered for unswitching so this is a stable transform and the same
-/// switch will not be revisited. If after unswitching there is only a single
-/// in-loop successor, the switch is further simplified to an unconditional
-/// branch. Still more cleanup can be done with some simplify-cfg like pass.
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
- Value *LoopCond = SI.getCondition();
-
- // If this isn't switching on an invariant condition, we can't unswitch it.
- if (!L.isLoopInvariant(LoopCond))
- return false;
-
- auto *ParentBB = SI.getParent();
-
- // The same check must be used both for the default and the exit cases. We
- // should never leave edges from the switch instruction to a basic block that
- // we are unswitching, hence the condition used to determine the default case
- // needs to also be used to populate ExitCaseIndices, which is then used to
- // remove cases from the switch.
- auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
- // BBToCheck is not an exit block if it is inside loop L.
- if (L.contains(&BBToCheck))
- return false;
- // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
- if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
- return false;
- // We do not unswitch a block that only has an unreachable statement, as
- // it's possible this is a previously unswitched block. Only unswitch if
- // either the terminator is not unreachable, or, if it is, it's not the only
- // instruction in the block.
- auto *TI = BBToCheck.getTerminator();
- bool isUnreachable = isa<UnreachableInst>(TI);
- return !isUnreachable ||
- (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
- };
-
- SmallVector<int, 4> ExitCaseIndices;
- for (auto Case : SI.cases())
- if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
- ExitCaseIndices.push_back(Case.getCaseIndex());
- BasicBlock *DefaultExitBB = nullptr;
- SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
- SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
- if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
- DefaultExitBB = SI.getDefaultDest();
- } else if (ExitCaseIndices.empty())
- return false;
-
- LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // We may need to invalidate SCEVs for the outermost loop reached by any of
- // the exits.
- Loop *OuterL = &L;
-
- if (DefaultExitBB) {
- // Clear out the default destination temporarily to allow accurate
- // predecessor lists to be examined below.
- SI.setDefaultDest(nullptr);
- // Check the loop containing this exit.
- Loop *ExitL = LI.getLoopFor(DefaultExitBB);
- if (!ExitL || ExitL->contains(OuterL))
- OuterL = ExitL;
- }
-
- // Store the exit cases into a separate data structure and remove them from
- // the switch.
- SmallVector<std::tuple<ConstantInt *, BasicBlock *,
- SwitchInstProfUpdateWrapper::CaseWeightOpt>,
- 4> ExitCases;
- ExitCases.reserve(ExitCaseIndices.size());
- SwitchInstProfUpdateWrapper SIW(SI);
- // We walk the case indices backwards so that we remove the last case first
- // and don't disrupt the earlier indices.
- for (unsigned Index : reverse(ExitCaseIndices)) {
- auto CaseI = SI.case_begin() + Index;
- // Compute the outer loop from this exit.
- Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
- if (!ExitL || ExitL->contains(OuterL))
- OuterL = ExitL;
- // Save the value of this case.
- auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
- ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
- // Delete the unswitched cases.
- SIW.removeCase(CaseI);
- }
-
- if (SE) {
- if (OuterL)
- SE->forgetLoop(OuterL);
- else
- SE->forgetTopmostLoop(&L);
- }
-
- // Check if after this all of the remaining cases point at the same
- // successor.
- BasicBlock *CommonSuccBB = nullptr;
- if (SI.getNumCases() > 0 &&
+
+/// Collect all of the loop invariant input values transitively used by the
+/// homogeneous instruction graph from a given root.
+///
+/// This essentially walks from a root recursively through loop variant operands
+/// which have the exact same opcode and finds all inputs which are loop
+/// invariant. For some operations these can be re-associated and unswitched out
+/// of the loop entirely.
+static TinyPtrVector<Value *>
+collectHomogenousInstGraphLoopInvariants(Loop &L, Instruction &Root,
+ LoopInfo &LI) {
+ assert(!L.isLoopInvariant(&Root) &&
+ "Only need to walk the graph if root itself is not invariant.");
+ TinyPtrVector<Value *> Invariants;
+
+ // Build a worklist and recurse through operators collecting invariants.
+ SmallVector<Instruction *, 4> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(&Root);
+ Visited.insert(&Root);
+ do {
+ Instruction &I = *Worklist.pop_back_val();
+ for (Value *OpV : I.operand_values()) {
+ // Skip constants as unswitching isn't interesting for them.
+ if (isa<Constant>(OpV))
+ continue;
+
+ // Add it to our result if loop invariant.
+ if (L.isLoopInvariant(OpV)) {
+ Invariants.push_back(OpV);
+ continue;
+ }
+
+ // If not an instruction with the same opcode, nothing we can do.
+ Instruction *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI || OpI->getOpcode() != Root.getOpcode())
+ continue;
+
+ // Visit this operand.
+ if (Visited.insert(OpI).second)
+ Worklist.push_back(OpI);
+ }
+ } while (!Worklist.empty());
+
+ return Invariants;
+}
+
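
collectHomogenousInstGraphLoopInvariants above is a plain worklist walk: it descends only through operands that share the root's opcode and records the loop-invariant inputs it reaches (the real code also skips constants; that check is omitted here for brevity). A toy standalone version of the same walk; Node, Opcode and Invariant are invented for the example:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Node {
      char Opcode;            // e.g. '&' for an and-chain
      bool Invariant;         // stand-in for L.isLoopInvariant(...)
      std::vector<Node *> Ops;
    };

    static std::vector<Node *> collectInvariants(Node &Root) {
      std::vector<Node *> Invariants;
      std::vector<Node *> Worklist{&Root};
      std::set<Node *> Visited{&Root};
      while (!Worklist.empty()) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        for (Node *Op : N->Ops) {
          if (Op->Invariant) {           // invariant input: record it
            Invariants.push_back(Op);
            continue;
          }
          if (Op->Opcode != Root.Opcode) // different opcode: stop descending
            continue;
          if (Visited.insert(Op).second) // same opcode: keep walking
            Worklist.push_back(Op);
        }
      }
      return Invariants;
    }

    int main() {
      Node A{'x', true, {}}, B{'x', true, {}}, V{'x', false, {}};
      Node Inner{'&', false, {&A, &V}};  // variant '&' feeding the root
      Node Root{'&', false, {&Inner, &B}};
      std::printf("collected %zu invariant inputs\n",
                  collectInvariants(Root).size());
    }
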
+static void replaceLoopInvariantUses(Loop &L, Value *Invariant,
+ Constant &Replacement) {
+ assert(!isa<Constant>(Invariant) && "Why are we unswitching on a constant?");
+
+ // Replace uses of LIC in the loop with the given constant.
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end(); UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+
+ // Replace this use within the loop body.
+ if (UserI && L.contains(UserI))
+ U->set(&Replacement);
+ }
+}
+
+/// Check that all the LCSSA PHI nodes in the loop exit block have trivial
+/// incoming values along this edge.
+static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB,
+ BasicBlock &ExitBB) {
+ for (Instruction &I : ExitBB) {
+ auto *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ // No more PHIs to check.
+ return true;
+
+ // If the incoming value for this edge isn't loop invariant the unswitch
+ // won't be trivial.
+ if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB)))
+ return false;
+ }
+ llvm_unreachable("Basic blocks should never be empty!");
+}
+
+/// Insert code to test a set of loop invariant values, and conditionally branch
+/// on them.
+static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
+ ArrayRef<Value *> Invariants,
+ bool Direction,
+ BasicBlock &UnswitchedSucc,
+ BasicBlock &NormalSucc) {
+ IRBuilder<> IRB(&BB);
+
+ Value *Cond = Direction ? IRB.CreateOr(Invariants) :
+ IRB.CreateAnd(Invariants);
+ IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
+ Direction ? &NormalSucc : &UnswitchedSucc);
+}
+
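
buildPartialUnswitchConditionalBranch above ORs the invariants when the exit lies on the true edge and ANDs them when it lies on the false edge, then branches to the unswitched or normal successor accordingly. A tiny boolean model of just that combination (plain C++, nothing IR-specific):

    #include <cstdio>
    #include <vector>

    static bool combineInvariants(const std::vector<bool> &Invariants,
                                  bool Direction) {
      bool Cond = Direction ? false : true; // identity of OR vs. AND
      for (bool V : Invariants)
        Cond = Direction ? (Cond || V) : (Cond && V);
      return Cond;
    }

    int main() {
      std::vector<bool> Inv{false, true, false};
      // Exit on the true edge: any true invariant takes the unswitched edge.
      std::printf("or-combined: %d\n", int(combineInvariants(Inv, true)));
      // Exit on the false edge: the unswitched exit is taken when the AND is
      // false.
      std::printf("and-combined: %d\n", int(combineInvariants(Inv, false)));
    }
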
+/// Rewrite the PHI nodes in an unswitched loop exit basic block.
+///
+/// Requires that the loop exit and unswitched basic block are the same, and
+/// that the exiting block was a unique predecessor of that block. Rewrites the
+/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial
+/// PHI nodes from the old preheader that now contains the unswitched
+/// terminator.
+static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH) {
+ for (PHINode &PN : UnswitchedBB.phis()) {
+ // When the loop exit is directly unswitched we just need to update the
+ // incoming basic block. We loop to handle weird cases with repeated
+ // incoming blocks, but expect to typically only have one operand here.
+ for (auto i : seq<int>(0, PN.getNumOperands())) {
+ assert(PN.getIncomingBlock(i) == &OldExitingBB &&
+ "Found incoming block different from unique predecessor!");
+ PN.setIncomingBlock(i, &OldPH);
+ }
+ }
+}
+
+/// Rewrite the PHI nodes in the loop exit basic block and the split off
+/// unswitched block.
+///
+/// Because the exit block remains an exit from the loop, this rewrites the
+/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI
+/// nodes into the unswitched basic block to select between the value in the
+/// old preheader and the loop exit.
+static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB,
+ BasicBlock &UnswitchedBB,
+ BasicBlock &OldExitingBB,
+ BasicBlock &OldPH,
+ bool FullUnswitch) {
+ assert(&ExitBB != &UnswitchedBB &&
+ "Must have different loop exit and unswitched blocks!");
+ Instruction *InsertPt = &*UnswitchedBB.begin();
+ for (PHINode &PN : ExitBB.phis()) {
+ auto *NewPN = PHINode::Create(PN.getType(), /*NumReservedValues*/ 2,
+ PN.getName() + ".split", InsertPt);
+
+ // Walk backwards over the old PHI node's inputs to minimize the cost of
+ // removing each one. We have to do this weird loop manually so that we
+ // create the same number of new incoming edges in the new PHI as we expect
+ // each case-based edge to be included in the unswitched switch in some
+ // cases.
+ // FIXME: This is really, really gross. It would be much cleaner if LLVM
+ // allowed us to create a single entry for a predecessor block without
+ // having separate entries for each "edge" even though these edges are
+ // required to produce identical results.
+ for (int i = PN.getNumIncomingValues() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != &OldExitingBB)
+ continue;
+
+ Value *Incoming = PN.getIncomingValue(i);
+ if (FullUnswitch)
+ // No more edge from the old exiting block to the exit block.
+ PN.removeIncomingValue(i);
+
+ NewPN->addIncoming(Incoming, &OldPH);
+ }
+
+ // Now replace the old PHI with the new one and wire the old one in as an
+ // input to the new one.
+ PN.replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(&PN, &ExitBB);
+ }
+}
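+
+// Hand-written example (names invented) of the rewrite above for a full
+// unswitch. Given an exit-block PHI
+//
+//   exit:
+//     %x = phi i32 [ %in, %old.exiting ], [ %y, %other.pred ]
+//
+// the split-off unswitched block receives
+//
+//   unswitched:
+//     %x.split = phi i32 [ %in, %old.ph ], [ %x, %exit ]
+//
+// while the original PHI keeps only its remaining edge
+//
+//   exit:
+//     %x = phi i32 [ %y, %other.pred ]
+//
+// and every former user of %x now uses %x.split instead.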
+
+/// Hoist the current loop up to the innermost loop containing a remaining exit.
+///
+/// Because we've removed an exit from the loop, we may have changed the set of
+/// loops reachable and need to move the current loop up the loop nest or even
+/// to an entirely separate nest.
+static void hoistLoopToNewParent(Loop &L, BasicBlock &Preheader,
+ DominatorTree &DT, LoopInfo &LI,
+ MemorySSAUpdater *MSSAU, ScalarEvolution *SE) {
+ // If the loop is already at the top level, we can't hoist it anywhere.
+ Loop *OldParentL = L.getParentLoop();
+ if (!OldParentL)
+ return;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L.getExitBlocks(Exits);
+ Loop *NewParentL = nullptr;
+ for (auto *ExitBB : Exits)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB))
+ if (!NewParentL || NewParentL->contains(ExitL))
+ NewParentL = ExitL;
+
+ if (NewParentL == OldParentL)
+ return;
+
+ // The new parent loop (if different) should always contain the old one.
+ if (NewParentL)
+ assert(NewParentL->contains(OldParentL) &&
+ "Can only hoist this loop up the nest!");
+
+ // The preheader will need to move with the body of this loop. However,
+ // because it isn't in this loop we also need to update the primary loop map.
+ assert(OldParentL == LI.getLoopFor(&Preheader) &&
+ "Parent loop of this loop should contain this loop's preheader!");
+ LI.changeLoopFor(&Preheader, NewParentL);
+
+ // Remove this loop from its old parent.
+ OldParentL->removeChildLoop(&L);
+
+ // Add the loop either to the new parent or as a top-level loop.
+ if (NewParentL)
+ NewParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+
+ // Remove this loop's blocks from the old parent and every other loop up the
+ // nest until reaching the new parent. Also update all of these
+ // no-longer-containing loops to reflect the nesting change.
+ for (Loop *OldContainingL = OldParentL; OldContainingL != NewParentL;
+ OldContainingL = OldContainingL->getParentLoop()) {
+ llvm::erase_if(OldContainingL->getBlocksVector(),
+ [&](const BasicBlock *BB) {
+ return BB == &Preheader || L.contains(BB);
+ });
+
+ OldContainingL->getBlocksSet().erase(&Preheader);
+ for (BasicBlock *BB : L.blocks())
+ OldContainingL->getBlocksSet().erase(BB);
+
+ // Because we just hoisted a loop out of this one, we have essentially
+ // created new exit paths from it. That means we need to form LCSSA PHI
+ // nodes for values used in the no-longer-nested loop.
+ formLCSSA(*OldContainingL, DT, &LI, SE);
+
+ // We shouldn't need to form dedicated exits because the exit introduced
+ // here is the (just split by unswitching) preheader. However, after trivial
+ // unswitching it is possible to get new non-dedicated exits out of the parent
+ // loop, so let's conservatively form dedicated exit blocks and figure out
+ // if we can optimize later.
+ formDedicatedExitBlocks(OldContainingL, &DT, &LI, MSSAU,
+ /*PreserveLCSSA*/ true);
+ }
+}
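+
+// A small invented example of the hoisting above: if L is nested in P and
+// unswitching removed the only exit of L that landed inside P, every remaining
+// exit of L now leaves P as well. L (and its preheader) are then re-parented
+// to the innermost loop still containing an exit, or become top-level, and P
+// gets fresh LCSSA PHIs and dedicated exit blocks for the values and edges
+// that now escape it through L.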
+
+// Return the topmost loop that contains ExitBB and has ExitBB as an exiting
+// block, or the loop containing ExitBB if no enclosing loop has ExitBB as an
+// exiting block.
+static Loop *getTopMostExitingLoop(BasicBlock *ExitBB, LoopInfo &LI) {
+ Loop *TopMost = LI.getLoopFor(ExitBB);
+ Loop *Current = TopMost;
+ while (Current) {
+ if (Current->isLoopExiting(ExitBB))
+ TopMost = Current;
+ Current = Current->getParentLoop();
+ }
+ return TopMost;
+}
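+
+// Invented example: with a nest L0 > L1 and ExitBB placed in L1, the walk
+// above returns L1 when ExitBB itself branches out of L1 but not out of L0,
+// returns L0 when ExitBB also leaves L0, and otherwise returns the loop that
+// merely contains ExitBB.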
+
+/// Unswitch a trivial branch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the branch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and one of the successors is a loop exit. This
+/// allows us to unswitch without duplicating the loop, making it trivial.
+///
+/// If this routine fails to unswitch the branch it returns false.
+///
+/// If the branch can be unswitched, this routine splits the preheader and
+/// hoists the branch above that split. Preserves loop simplified form
+/// (splitting the exit block as necessary). It simplifies the branch within
+/// the loop to an unconditional branch but doesn't remove it entirely. Further
+/// cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ assert(BI.isConditional() && "Can only unswitch a conditional branch!");
+ LLVM_DEBUG(dbgs() << " Trying to unswitch branch: " << BI << "\n");
+
+ // The loop invariant values that we want to unswitch.
+ TinyPtrVector<Value *> Invariants;
+
+ // When true, we're fully unswitching the branch rather than just unswitching
+ // some input conditions to the branch.
+ bool FullUnswitch = false;
+
+ if (L.isLoopInvariant(BI.getCondition())) {
+ Invariants.push_back(BI.getCondition());
+ FullUnswitch = true;
+ } else {
+ if (auto *CondInst = dyn_cast<Instruction>(BI.getCondition()))
+ Invariants = collectHomogenousInstGraphLoopInvariants(L, *CondInst, LI);
+ if (Invariants.empty())
+ // Couldn't find invariant inputs!
+ return false;
+ }
+
+ // Check that one of the branch's successors exits, and which one.
+ bool ExitDirection = true;
+ int LoopExitSuccIdx = 0;
+ auto *LoopExitBB = BI.getSuccessor(0);
+ if (L.contains(LoopExitBB)) {
+ ExitDirection = false;
+ LoopExitSuccIdx = 1;
+ LoopExitBB = BI.getSuccessor(1);
+ if (L.contains(LoopExitBB))
+ return false;
+ }
+ auto *ContinueBB = BI.getSuccessor(1 - LoopExitSuccIdx);
+ auto *ParentBB = BI.getParent();
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB))
+ return false;
+
+ // When unswitching only part of the branch's condition, we need the exit
+ // block to be reached directly from the partially unswitched input. This can
+ // be done when the exit block is along the true edge and the branch condition
+ // is a graph of `or` operations, or the exit block is along the false edge
+ // and the condition is a graph of `and` operations.
+ if (!FullUnswitch) {
+ if (ExitDirection) {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::Or)
+ return false;
+ } else {
+ if (cast<Instruction>(BI.getCondition())->getOpcode() != Instruction::And)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << " unswitching trivial invariant conditions for: " << BI
+ << "\n";
+ for (Value *Invariant : Invariants) {
+ dbgs() << " " << *Invariant << " == true";
+ if (Invariant != Invariants.back())
+ dbgs() << " ||";
+ dbgs() << "\n";
+ }
+ });
+
+ // If we have scalar evolutions, we need to invalidate them including this
+ // loop, the loop containing the exit block and the topmost parent loop
+ // exiting via LoopExitBB.
+ if (SE) {
+ if (Loop *ExitL = getTopMostExitingLoop(LoopExitBB, LI))
+ SE->forgetLoop(ExitL);
+ else
+ // Forget the entire nest as this exits the entire nest.
+ SE->forgetTopmostLoop(&L);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
+
+ // Now that we have a place to insert the conditional branch, create a place
+ // to branch to: this is the exit block out of the loop that we are
+ // unswitching. We need to split this if there are other loop predecessors.
+ // Because the loop is in simplified form, *any* other predecessor is enough.
+ BasicBlock *UnswitchedBB;
+ if (FullUnswitch && LoopExitBB->getUniquePredecessor()) {
+ assert(LoopExitBB->getUniquePredecessor() == BI.getParent() &&
+ "A branch's parent isn't a predecessor!");
+ UnswitchedBB = LoopExitBB;
+ } else {
+ UnswitchedBB =
+ SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI, MSSAU);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Actually move the invariant uses into the unswitched position. If possible,
+ // we do this by moving the instructions, but when doing partial unswitching
+ // we do it by building a new merge of the values in the unswitched position.
+ OldPH->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // If fully unswitching, we can use the existing branch instruction.
+ // Splice it into the old PH to gate reaching the new preheader and re-point
+ // its successors.
+ OldPH->getInstList().splice(OldPH->end(), BI.getParent()->getInstList(),
+ BI);
+ if (MSSAU) {
+ // Temporarily clone the terminator, to make MSSA update cheaper by
+ // separating "insert edge" updates from "remove edge" ones.
+ ParentBB->getInstList().push_back(BI.clone());
+ } else {
+ // Create a new unconditional branch that will continue the loop as a new
+ // terminator.
+ BranchInst::Create(ContinueBB, ParentBB);
+ }
+ BI.setSuccessor(LoopExitSuccIdx, UnswitchedBB);
+ BI.setSuccessor(1 - LoopExitSuccIdx, NewPH);
+ } else {
+ // Only unswitching a subset of inputs to the condition, so we will need to
+ // build a new branch that merges the invariant inputs.
+ if (ExitDirection)
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Must have an `or` of `i1`s for the condition!");
+ else
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Must have an `and` of `i1`s for the condition!");
+ buildPartialUnswitchConditionalBranch(*OldPH, Invariants, ExitDirection,
+ *UnswitchedBB, *NewPH);
+ }
+
+ // Update the dominator tree with the added edge.
+ DT.insertEdge(OldPH, UnswitchedBB);
+
+ // After the dominator tree was updated with the added edge, update MemorySSA
+ // if available.
+ if (MSSAU) {
+ SmallVector<CFGUpdate, 1> Updates;
+ Updates.push_back({cfg::UpdateKind::Insert, OldPH, UnswitchedBB});
+ MSSAU->applyInsertUpdates(Updates, DT);
+ }
+
+ // Finish updating dominator tree and memory ssa for full unswitch.
+ if (FullUnswitch) {
+ if (MSSAU) {
+ // Remove the cloned branch instruction.
+ ParentBB->getTerminator()->eraseFromParent();
+ // Create unconditional branch now.
+ BranchInst::Create(ContinueBB, ParentBB);
+ MSSAU->removeEdge(ParentBB, LoopExitBB);
+ }
+ DT.deleteEdge(ParentBB, LoopExitBB);
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Rewrite the relevant PHI nodes.
+ if (UnswitchedBB == LoopExitBB)
+ rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH);
+ else
+ rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB,
+ *ParentBB, *OldPH, FullUnswitch);
+
+ // The constant we can replace all of our invariants with inside the loop
+ // body. If any of the invariants have a value other than this the loop won't
+ // be entered.
+ ConstantInt *Replacement = ExitDirection
+ ? ConstantInt::getFalse(BI.getContext())
+ : ConstantInt::getTrue(BI.getContext());
+
+ // Since this is an i1 condition we can also trivially replace uses of it
+ // within the loop with a constant.
+ for (Value *Invariant : Invariants)
+ replaceLoopInvariantUses(L, Invariant, *Replacement);
+
+ // If this was full unswitching, we may have changed the nesting relationship
+ // for this loop so hoist it to its correct parent if needed.
+ if (FullUnswitch)
+ hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ LLVM_DEBUG(dbgs() << " done: unswitching trivial branch...\n");
+ ++NumTrivial;
+ ++NumBranches;
+ return true;
+}
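+
+// Hand-written before/after sketch (all names invented) of the full trivial
+// unswitch performed above, with %inv loop invariant and the exit on the true
+// edge:
+//
+//   ; Before
+//   preheader:
+//     br label %header
+//   header:
+//     ...
+//     br i1 %inv, label %exit, label %latch
+//
+//   ; After
+//   preheader:                       ; old preheader now holds the branch
+//     br i1 %inv, label %exit, label %header.ph
+//   header.ph:                       ; new preheader created by SplitEdge
+//     br label %header
+//   header:
+//     ...
+//     br label %latch                ; in-loop branch made unconditional
+//
+// (If %exit had other predecessors it would first be split so the unswitched
+// edge gets its own block.) Every use of %inv inside the loop is then replaced
+// with false, the only value it can have once the loop is entered.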
+
+/// Unswitch a trivial switch if the condition is loop invariant.
+///
+/// This routine should only be called when loop code leading to the switch has
+/// been validated as trivial (no side effects). This routine checks if the
+/// condition is invariant and that at least one of the successors is a loop
+/// exit. This allows us to unswitch without duplicating the loop, making it
+/// trivial.
+///
+/// If this routine fails to unswitch the switch it returns false.
+///
+/// If the switch can be unswitched, this routine splits the preheader and
+/// copies the switch above that split. If the default case is one of the
+/// exiting cases, it copies the non-exiting cases and points them at the new
+/// preheader. If the default case is not exiting, it copies the exiting cases
+/// and points the default at the preheader. It preserves loop simplified form
+/// (splitting the exit blocks as necessary). It simplifies the switch within
+/// the loop by removing now-dead cases. If the default case is one of those
+/// unswitched, it replaces its destination with a new basic block containing
+/// only unreachable. Such basic blocks, while technically loop exits, are not
+/// considered for unswitching so this is a stable transform and the same
+/// switch will not be revisited. If after unswitching there is only a single
+/// in-loop successor, the switch is further simplified to an unconditional
+/// branch. Still more cleanup can be done with some simplify-cfg like pass.
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ LLVM_DEBUG(dbgs() << " Trying to unswitch switch: " << SI << "\n");
+ Value *LoopCond = SI.getCondition();
+
+ // If this isn't switching on an invariant condition, we can't unswitch it.
+ if (!L.isLoopInvariant(LoopCond))
+ return false;
+
+ auto *ParentBB = SI.getParent();
+
+ // The same check must be used both for the default and the exit cases. We
+ // should never leave edges from the switch instruction to a basic block that
+ // we are unswitching, hence the condition used to determine the default case
+ // needs to also be used to populate ExitCaseIndices, which is then used to
+ // remove cases from the switch.
+ auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
+ // BBToCheck is not an exit block if it is inside loop L.
+ if (L.contains(&BBToCheck))
+ return false;
+ // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
+ return false;
+ // We do not unswitch a block that only has an unreachable statement, as
+ // it's possible this is a previously unswitched block. Only unswitch if
+ // either the terminator is not unreachable, or, if it is, it's not the only
+ // instruction in the block.
+ auto *TI = BBToCheck.getTerminator();
+ bool isUnreachable = isa<UnreachableInst>(TI);
+ return !isUnreachable ||
+ (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
+ };
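+
+  // For example (invented IR), a destination consisting solely of
+  // `unreachable`, the marker left behind for a previously unswitched default
+  // case, is rejected here, while an exit block such as
+  //
+  //   exit:
+  //     call void @trap()
+  //     unreachable
+  //
+  // is still a candidate because the terminator is not its only instruction.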
+
+ SmallVector<int, 4> ExitCaseIndices;
+ for (auto Case : SI.cases())
+ if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
+ ExitCaseIndices.push_back(Case.getCaseIndex());
+ BasicBlock *DefaultExitBB = nullptr;
+ SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
+ SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
+ if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
+ DefaultExitBB = SI.getDefaultDest();
+ } else if (ExitCaseIndices.empty())
+ return false;
+
+ LLVM_DEBUG(dbgs() << " unswitching trivial switch...\n");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // We may need to invalidate SCEVs for the outermost loop reached by any of
+ // the exits.
+ Loop *OuterL = &L;
+
+ if (DefaultExitBB) {
+ // Clear out the default destination temporarily to allow accurate
+ // predecessor lists to be examined below.
+ SI.setDefaultDest(nullptr);
+ // Check the loop containing this exit.
+ Loop *ExitL = LI.getLoopFor(DefaultExitBB);
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ }
+
+ // Store the exit cases into a separate data structure and remove them from
+ // the switch.
+ SmallVector<std::tuple<ConstantInt *, BasicBlock *,
+ SwitchInstProfUpdateWrapper::CaseWeightOpt>,
+ 4> ExitCases;
+ ExitCases.reserve(ExitCaseIndices.size());
+ SwitchInstProfUpdateWrapper SIW(SI);
+ // We walk the case indices backwards so that we remove the last case first
+ // and don't disrupt the earlier indices.
+ for (unsigned Index : reverse(ExitCaseIndices)) {
+ auto CaseI = SI.case_begin() + Index;
+ // Compute the outer loop from this exit.
+ Loop *ExitL = LI.getLoopFor(CaseI->getCaseSuccessor());
+ if (!ExitL || ExitL->contains(OuterL))
+ OuterL = ExitL;
+ // Save the value of this case.
+ auto W = SIW.getSuccessorWeight(CaseI->getSuccessorIndex());
+ ExitCases.emplace_back(CaseI->getCaseValue(), CaseI->getCaseSuccessor(), W);
+ // Delete the unswitched cases.
+ SIW.removeCase(CaseI);
+ }
+
+ if (SE) {
+ if (OuterL)
+ SE->forgetLoop(OuterL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
+ // Check if after this all of the remaining cases point at the same
+ // successor.
+ BasicBlock *CommonSuccBB = nullptr;
+ if (SI.getNumCases() > 0 &&
all_of(drop_begin(SI.cases()), [&SI](const SwitchInst::CaseHandle &Case) {
return Case.getCaseSuccessor() == SI.case_begin()->getCaseSuccessor();
}))
- CommonSuccBB = SI.case_begin()->getCaseSuccessor();
- if (!DefaultExitBB) {
- // If we're not unswitching the default, it must also branch to the common
- // successor of the cases for there to be one; if there are no cases at all,
- // the default destination itself is the common successor.
- if (SI.getNumCases() == 0)
- CommonSuccBB = SI.getDefaultDest();
- else if (SI.getDefaultDest() != CommonSuccBB)
- CommonSuccBB = nullptr;
- }
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the switch.
- BasicBlock *OldPH = L.getLoopPreheader();
- BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
- OldPH->getTerminator()->eraseFromParent();
-
- // Now add the unswitched switch.
- auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
- SwitchInstProfUpdateWrapper NewSIW(*NewSI);
-
- // Rewrite the IR for the unswitched basic blocks. This requires two steps.
- // First, we split any exit blocks with remaining in-loop predecessors. Then
- // we update the PHIs in one of two ways depending on if there was a split.
- // We walk in reverse so that we split in the same order as the cases
- // appeared. This is purely for convenience of reading the resulting IR, but
- // it doesn't cost anything really.
- SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
- SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
- // Handle the default exit if necessary.
- // FIXME: It'd be great if we could merge this with the loop below but LLVM's
- // ranges aren't quite powerful enough yet.
- if (DefaultExitBB) {
- if (pred_empty(DefaultExitBB)) {
- UnswitchedExitBBs.insert(DefaultExitBB);
- rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
- } else {
- auto *SplitBB =
- SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
- rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
- *ParentBB, *OldPH,
- /*FullUnswitch*/ true);
- DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
- }
- }
- // Note that we must use a reference in the for loop so that we update the
- // container.
- for (auto &ExitCase : reverse(ExitCases)) {
- // Grab a reference to the exit block in the pair so that we can update it.
- BasicBlock *ExitBB = std::get<1>(ExitCase);
-
- // If this case is the last edge into the exit block, we can simply reuse it
- // as it will no longer be a loop exit. No mapping necessary.
- if (pred_empty(ExitBB)) {
- // Only rewrite once.
- if (UnswitchedExitBBs.insert(ExitBB).second)
- rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
- continue;
- }
-
- // Otherwise we need to split the exit block so that we retain an exit
- // block from the loop and a target for the unswitched condition.
- BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
- if (!SplitExitBB) {
- // If this is the first time we see this, do the split and remember it.
- SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
- rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
- *ParentBB, *OldPH,
- /*FullUnswitch*/ true);
- }
- // Update the case pair to point to the split block.
- std::get<1>(ExitCase) = SplitExitBB;
- }
-
- // Now add the unswitched cases. We do this in reverse order as we built them
- // in reverse order.
- for (auto &ExitCase : reverse(ExitCases)) {
- ConstantInt *CaseVal = std::get<0>(ExitCase);
- BasicBlock *UnswitchedBB = std::get<1>(ExitCase);
-
- NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase));
- }
-
- // If the default was unswitched, re-point it and add explicit cases for
- // entering the loop.
- if (DefaultExitBB) {
- NewSIW->setDefaultDest(DefaultExitBB);
- NewSIW.setSuccessorWeight(0, DefaultCaseWeight);
-
- // We removed all the exit cases, so we just copy the cases to the
- // unswitched switch.
- for (const auto &Case : SI.cases())
- NewSIW.addCase(Case.getCaseValue(), NewPH,
- SIW.getSuccessorWeight(Case.getSuccessorIndex()));
- } else if (DefaultCaseWeight) {
- // We have to set branch weight of the default case.
- uint64_t SW = *DefaultCaseWeight;
- for (const auto &Case : SI.cases()) {
- auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex());
- assert(W &&
- "case weight must be defined as default case weight is defined");
- SW += *W;
- }
- NewSIW.setSuccessorWeight(0, SW);
- }
-
- // If we ended up with a common successor for every path through the switch
- // after unswitching, rewrite it to an unconditional branch to make it easy
- // to recognize. Otherwise we potentially have to recognize the default case
- // pointing at unreachable and other complexity.
- if (CommonSuccBB) {
- BasicBlock *BB = SI.getParent();
- // We may have had multiple edges to this common successor block, so remove
- // them as predecessors. We skip the first one, either the default or the
- // actual first case.
- bool SkippedFirst = DefaultExitBB == nullptr;
- for (auto Case : SI.cases()) {
- assert(Case.getCaseSuccessor() == CommonSuccBB &&
- "Non-common successor!");
- (void)Case;
- if (!SkippedFirst) {
- SkippedFirst = true;
- continue;
- }
- CommonSuccBB->removePredecessor(BB,
- /*KeepOneInputPHIs*/ true);
- }
- // Now nuke the switch and replace it with a direct branch.
- SIW.eraseFromParent();
- BranchInst::Create(CommonSuccBB, BB);
- } else if (DefaultExitBB) {
- assert(SI.getNumCases() > 0 &&
- "If we had no cases we'd have a common successor!");
- // Move the last case to the default successor. This is valid because, if
- // the default got unswitched, it cannot be reached. This has the advantage of
- // being simple and keeping the number of edges from this switch to
- // successors the same, and avoiding any PHI update complexity.
- auto LastCaseI = std::prev(SI.case_end());
-
- SI.setDefaultDest(LastCaseI->getCaseSuccessor());
- SIW.setSuccessorWeight(
- 0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex()));
- SIW.removeCase(LastCaseI);
- }
-
- // Walk the unswitched exit blocks and the unswitched split blocks and update
- // the dominator tree based on the CFG edits. While we are walking unordered
- // containers here, the API for applyUpdates takes an unordered list of
- // updates and requires them to not contain duplicates.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
- DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
- DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
- }
- for (auto SplitUnswitchedPair : SplitExitBBMap) {
- DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
- DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
- }
-
- if (MSSAU) {
+ CommonSuccBB = SI.case_begin()->getCaseSuccessor();
+ if (!DefaultExitBB) {
+ // If we're not unswitching the default, it must also branch to the common
+ // successor of the cases for there to be one; if there are no cases at all,
+ // the default destination itself is the common successor.
+ if (SI.getNumCases() == 0)
+ CommonSuccBB = SI.getDefaultDest();
+ else if (SI.getDefaultDest() != CommonSuccBB)
+ CommonSuccBB = nullptr;
+ }
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the switch.
+ BasicBlock *OldPH = L.getLoopPreheader();
+ BasicBlock *NewPH = SplitEdge(OldPH, L.getHeader(), &DT, &LI, MSSAU);
+ OldPH->getTerminator()->eraseFromParent();
+
+ // Now add the unswitched switch.
+ auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH);
+ SwitchInstProfUpdateWrapper NewSIW(*NewSI);
+
+ // Rewrite the IR for the unswitched basic blocks. This requires two steps.
+ // First, we split any exit blocks with remaining in-loop predecessors. Then
+ // we update the PHIs in one of two ways depending on if there was a split.
+ // We walk in reverse so that we split in the same order as the cases
+ // appeared. This is purely for convenience of reading the resulting IR, but
+ // it doesn't cost anything really.
+ SmallPtrSet<BasicBlock *, 2> UnswitchedExitBBs;
+ SmallDenseMap<BasicBlock *, BasicBlock *, 2> SplitExitBBMap;
+ // Handle the default exit if necessary.
+ // FIXME: It'd be great if we could merge this with the loop below but LLVM's
+ // ranges aren't quite powerful enough yet.
+ if (DefaultExitBB) {
+ if (pred_empty(DefaultExitBB)) {
+ UnswitchedExitBBs.insert(DefaultExitBB);
+ rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH);
+ } else {
+ auto *SplitBB =
+ SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI, MSSAU);
+ rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB,
+ *ParentBB, *OldPH,
+ /*FullUnswitch*/ true);
+ DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB;
+ }
+ }
+ // Note that we must use a reference in the for loop so that we update the
+ // container.
+ for (auto &ExitCase : reverse(ExitCases)) {
+ // Grab a reference to the exit block in the pair so that we can update it.
+ BasicBlock *ExitBB = std::get<1>(ExitCase);
+
+ // If this case is the last edge into the exit block, we can simply reuse it
+ // as it will no longer be a loop exit. No mapping necessary.
+ if (pred_empty(ExitBB)) {
+ // Only rewrite once.
+ if (UnswitchedExitBBs.insert(ExitBB).second)
+ rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH);
+ continue;
+ }
+
+ // Otherwise we need to split the exit block so that we retain an exit
+ // block from the loop and a target for the unswitched condition.
+ BasicBlock *&SplitExitBB = SplitExitBBMap[ExitBB];
+ if (!SplitExitBB) {
+ // If this is the first time we see this, do the split and remember it.
+ SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
+ rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB,
+ *ParentBB, *OldPH,
+ /*FullUnswitch*/ true);
+ }
+ // Update the case pair to point to the split block.
+ std::get<1>(ExitCase) = SplitExitBB;
+ }
+
+ // Now add the unswitched cases. We do this in reverse order as we built them
+ // in reverse order.
+ for (auto &ExitCase : reverse(ExitCases)) {
+ ConstantInt *CaseVal = std::get<0>(ExitCase);
+ BasicBlock *UnswitchedBB = std::get<1>(ExitCase);
+
+ NewSIW.addCase(CaseVal, UnswitchedBB, std::get<2>(ExitCase));
+ }
+
+ // If the default was unswitched, re-point it and add explicit cases for
+ // entering the loop.
+ if (DefaultExitBB) {
+ NewSIW->setDefaultDest(DefaultExitBB);
+ NewSIW.setSuccessorWeight(0, DefaultCaseWeight);
+
+ // We removed all the exit cases, so we just copy the cases to the
+ // unswitched switch.
+ for (const auto &Case : SI.cases())
+ NewSIW.addCase(Case.getCaseValue(), NewPH,
+ SIW.getSuccessorWeight(Case.getSuccessorIndex()));
+ } else if (DefaultCaseWeight) {
+ // We have to set branch weight of the default case.
+ uint64_t SW = *DefaultCaseWeight;
+ for (const auto &Case : SI.cases()) {
+ auto W = SIW.getSuccessorWeight(Case.getSuccessorIndex());
+ assert(W &&
+ "case weight must be defined as default case weight is defined");
+ SW += *W;
+ }
+ NewSIW.setSuccessorWeight(0, SW);
+ }
+
+ // If we ended up with a common successor for every path through the switch
+ // after unswitching, rewrite it to an unconditional branch to make it easy
+ // to recognize. Otherwise we potentially have to recognize the default case
+ // pointing at unreachable and other complexity.
+ if (CommonSuccBB) {
+ BasicBlock *BB = SI.getParent();
+ // We may have had multiple edges to this common successor block, so remove
+ // them as predecessors. We skip the first one, either the default or the
+ // actual first case.
+ bool SkippedFirst = DefaultExitBB == nullptr;
+ for (auto Case : SI.cases()) {
+ assert(Case.getCaseSuccessor() == CommonSuccBB &&
+ "Non-common successor!");
+ (void)Case;
+ if (!SkippedFirst) {
+ SkippedFirst = true;
+ continue;
+ }
+ CommonSuccBB->removePredecessor(BB,
+ /*KeepOneInputPHIs*/ true);
+ }
+ // Now nuke the switch and replace it with a direct branch.
+ SIW.eraseFromParent();
+ BranchInst::Create(CommonSuccBB, BB);
+ } else if (DefaultExitBB) {
+ assert(SI.getNumCases() > 0 &&
+ "If we had no cases we'd have a common successor!");
+ // Move the last case to the default successor. This is valid because, if
+ // the default got unswitched, it cannot be reached. This has the advantage of
+ // being simple and keeping the number of edges from this switch to
+ // successors the same, and avoiding any PHI update complexity.
+ auto LastCaseI = std::prev(SI.case_end());
+
+ SI.setDefaultDest(LastCaseI->getCaseSuccessor());
+ SIW.setSuccessorWeight(
+ 0, SIW.getSuccessorWeight(LastCaseI->getSuccessorIndex()));
+ SIW.removeCase(LastCaseI);
+ }
+
+ // Walk the unswitched exit blocks and the unswitched split blocks and update
+ // the dominator tree based on the CFG edits. While we are walking unordered
+ // containers here, the API for applyUpdates takes an unordered list of
+ // updates and requires them to not contain duplicates.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (auto *UnswitchedExitBB : UnswitchedExitBBs) {
+ DTUpdates.push_back({DT.Delete, ParentBB, UnswitchedExitBB});
+ DTUpdates.push_back({DT.Insert, OldPH, UnswitchedExitBB});
+ }
+ for (auto SplitUnswitchedPair : SplitExitBBMap) {
+ DTUpdates.push_back({DT.Delete, ParentBB, SplitUnswitchedPair.first});
+ DTUpdates.push_back({DT.Insert, OldPH, SplitUnswitchedPair.second});
+ }
+
+ if (MSSAU) {
MSSAU->applyUpdates(DTUpdates, DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
} else {
DT.applyUpdates(DTUpdates);
- }
-
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- // We may have changed the nesting relationship for this loop so hoist it to
- // its correct parent if needed.
- hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- ++NumTrivial;
- ++NumSwitches;
- LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n");
- return true;
-}
-
-/// This routine scans the loop to find a branch or switch which occurs before
-/// any side effects occur. These can potentially be unswitched without
-/// duplicating the loop. If a branch or switch is successfully unswitched the
-/// scanning continues to see if subsequent branches or switches have become
-/// trivial. Once all trivial candidates have been unswitched, this routine
-/// returns.
-///
-/// The return value indicates whether anything was unswitched (and therefore
-/// changed).
-///
-/// If `SE` is not null, it will be updated based on the potential loop SCEVs
-/// invalidated by this.
-static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
- LoopInfo &LI, ScalarEvolution *SE,
- MemorySSAUpdater *MSSAU) {
- bool Changed = false;
-
- // If the loop header has only one reachable successor, we should keep looking
- // for trivial condition candidates in the successor as well. An alternative is
- // to constant fold conditions and merge successors into loop header (then we
- // only need to check header's terminator). The reason for not doing this in
- // LoopUnswitch pass is that it could potentially break LoopPassManager's
- // invariants. Folding dead branches could either eliminate the current loop
- // or make other loops unreachable. LCSSA form might also not be preserved
- // after deleting branches. The following code keeps traversing loop header's
- // successors until it finds the trivial condition candidate (condition that
- // is not a constant). Since unswitching generates branches with constant
- // conditions, this scenario could be very common in practice.
- BasicBlock *CurrentBB = L.getHeader();
- SmallPtrSet<BasicBlock *, 8> Visited;
- Visited.insert(CurrentBB);
- do {
- // Check if there are any side-effecting instructions (e.g. stores, calls,
- // volatile loads) in the part of the loop that the code *would* execute
- // without unswitching.
- if (MSSAU) // Possible early exit with MSSA
- if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB))
- if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end()))
- return Changed;
- if (llvm::any_of(*CurrentBB,
- [](Instruction &I) { return I.mayHaveSideEffects(); }))
- return Changed;
-
- Instruction *CurrentTerm = CurrentBB->getTerminator();
-
- if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
- // Don't bother trying to unswitch past a switch with a constant
- // condition. This should be removed prior to running this pass by
- // simplify-cfg.
- if (isa<Constant>(SI->getCondition()))
- return Changed;
-
- if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
- // Couldn't unswitch this one so we're done.
- return Changed;
-
- // Mark that we managed to unswitch something.
- Changed = true;
-
- // If unswitching turned the terminator into an unconditional branch then
- // we can continue. The unswitching logic specifically works to fold any
- // cases it can into an unconditional branch to make it easier to
- // recognize here.
- auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
- if (!BI || BI->isConditional())
- return Changed;
-
- CurrentBB = BI->getSuccessor(0);
- continue;
- }
-
- auto *BI = dyn_cast<BranchInst>(CurrentTerm);
- if (!BI)
- // We do not understand other terminator instructions.
- return Changed;
-
- // Don't bother trying to unswitch past an unconditional branch or a branch
- // with a constant value. These should be removed by simplify-cfg prior to
- // running this pass.
- if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
- return Changed;
-
- // Found a trivial condition candidate: non-foldable conditional branch. If
- // we fail to unswitch this, we can't do anything else that is trivial.
- if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
- return Changed;
-
- // Mark that we managed to unswitch something.
- Changed = true;
-
- // If we only unswitched some of the conditions feeding the branch, we won't
- // have collapsed it to a single successor.
- BI = cast<BranchInst>(CurrentBB->getTerminator());
- if (BI->isConditional())
- return Changed;
-
- // Follow the newly unconditional branch into its successor.
- CurrentBB = BI->getSuccessor(0);
-
- // When continuing, if we exit the loop or reach a previously visited block,
- // then we cannot reach any trivial condition candidates (unfoldable
- // branch instructions or switch instructions) and no unswitch can happen.
- } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
-
- return Changed;
-}
-
-/// Build the cloned blocks for an unswitched copy of the given loop.
-///
-/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
-/// after the split block (`SplitBB`) that will be used to select between the
-/// cloned and original loop.
-///
-/// This routine handles cloning all of the necessary loop blocks and exit
-/// blocks including rewriting their instructions and the relevant PHI nodes.
-/// Any loop blocks or exit blocks which are dominated by a different successor
-/// than the one for this clone of the loop blocks can be trivially skipped. We
-/// use the `DominatingSucc` map to determine whether a block satisfies that
-/// property with a simple map lookup.
-///
-/// It also correctly creates the unconditional branch in the cloned
-/// unswitched parent block to only point at the unswitched successor.
-///
-/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
-/// block splitting is correctly reflected in `LoopInfo`; essentially all of
-/// the cloned blocks (and their loops) are left without full `LoopInfo`
-/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
-/// blocks to it but doesn't create the cloned `DominatorTree` structure and
-/// instead the caller must recompute an accurate DT. It *does* correctly
-/// update the `AssumptionCache` provided in `AC`.
-static BasicBlock *buildClonedLoopBlocks(
- Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
- ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
- BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
- const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
- ValueToValueMapTy &VMap,
- SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
- DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- SmallVector<BasicBlock *, 4> NewBlocks;
- NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
-
- // We will need to clone a bunch of blocks, wrap up the clone operation in
- // a helper.
- auto CloneBlock = [&](BasicBlock *OldBB) {
- // Clone the basic block and insert it before the new preheader.
- BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
- NewBB->moveBefore(LoopPH);
-
- // Record this block and the mapping.
- NewBlocks.push_back(NewBB);
- VMap[OldBB] = NewBB;
-
- return NewBB;
- };
-
- // We skip cloning blocks when they have a dominating succ that is not the
- // succ we are cloning for.
- auto SkipBlock = [&](BasicBlock *BB) {
- auto It = DominatingSucc.find(BB);
- return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
- };
-
- // First, clone the preheader.
- auto *ClonedPH = CloneBlock(LoopPH);
-
- // Then clone all the loop blocks, skipping the ones that aren't necessary.
- for (auto *LoopBB : L.blocks())
- if (!SkipBlock(LoopBB))
- CloneBlock(LoopBB);
-
- // Split all the loop exit edges so that when we clone the exit blocks, if
- // any of the exit blocks are *also* a preheader for some other loop, we
- // don't create multiple predecessors entering the loop header.
- for (auto *ExitBB : ExitBlocks) {
- if (SkipBlock(ExitBB))
- continue;
-
- // When we are going to clone an exit, we don't need to clone all the
- // instructions in the exit block and we want to ensure we have an easy
- // place to merge the CFG, so split the exit first. This is always safe to
- // do because there cannot be any non-loop predecessors of a loop exit in
- // loop simplified form.
- auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
-
- // Rearrange the names to make it easier to write test cases by having the
- // exit block carry the suffix rather than the merge block carrying the
- // suffix.
- MergeBB->takeName(ExitBB);
- ExitBB->setName(Twine(MergeBB->getName()) + ".split");
-
- // Now clone the original exit block.
- auto *ClonedExitBB = CloneBlock(ExitBB);
- assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
- "Exit block should have been split to have one successor!");
- assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
- "Cloned exit block has the wrong successor!");
-
- // Remap any cloned instructions and create a merge phi node for them.
- for (auto ZippedInsts : llvm::zip_first(
- llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
- llvm::make_range(ClonedExitBB->begin(),
- std::prev(ClonedExitBB->end())))) {
- Instruction &I = std::get<0>(ZippedInsts);
- Instruction &ClonedI = std::get<1>(ZippedInsts);
-
- // The only instructions in the exit block should be PHI nodes and
- // potentially a landing pad.
- assert(
- (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
- "Bad instruction in exit block!");
- // We should have a value map between the instruction and its clone.
- assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
-
- auto *MergePN =
- PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
- &*MergeBB->getFirstInsertionPt());
- I.replaceAllUsesWith(MergePN);
- MergePN->addIncoming(&I, ExitBB);
- MergePN->addIncoming(&ClonedI, ClonedExitBB);
- }
- }
-
- // Rewrite the instructions in the cloned blocks to refer to the instructions
- // in the cloned blocks. We have to do this as a second pass so that we have
- // everything available. Also, we have inserted new instructions which may
- // include assume intrinsics, so we update the assumption cache while
- // processing this.
- for (auto *ClonedBB : NewBlocks)
- for (Instruction &I : *ClonedBB) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC.registerAssumption(II);
- }
-
- // Update any PHI nodes in the cloned successors of the skipped blocks to not
- // have spurious incoming values.
- for (auto *LoopBB : L.blocks())
- if (SkipBlock(LoopBB))
- for (auto *SuccBB : successors(LoopBB))
- if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
- for (PHINode &PN : ClonedSuccBB->phis())
- PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
-
- // Remove the cloned parent as a predecessor of any successor we ended up
- // cloning other than the unswitched one.
- auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
- for (auto *SuccBB : successors(ParentBB)) {
- if (SuccBB == UnswitchedSuccBB)
- continue;
-
- auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
- if (!ClonedSuccBB)
- continue;
-
- ClonedSuccBB->removePredecessor(ClonedParentBB,
- /*KeepOneInputPHIs*/ true);
- }
-
- // Replace the cloned branch with an unconditional branch to the cloned
- // unswitched successor.
- auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
+ }
+
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ // We may have changed the nesting relationship for this loop so hoist it to
+ // its correct parent if needed.
+ hoistLoopToNewParent(L, *NewPH, DT, LI, MSSAU, SE);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ ++NumTrivial;
+ ++NumSwitches;
+ LLVM_DEBUG(dbgs() << " done: unswitching trivial switch...\n");
+ return true;
+}
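+
+// Hand-written sketch (names invented) of the trivial switch unswitch above,
+// for an invariant %inv where only case 1 exits the loop and the default stays
+// inside it:
+//
+//   ; Before, inside the loop
+//   header:
+//     switch i32 %inv, label %latch [ i32 1, label %exit ]
+//
+//   ; After
+//   preheader:
+//     switch i32 %inv, label %header.ph [ i32 1, label %exit ]
+//   header.ph:
+//     br label %header
+//   header:
+//     br label %latch              ; only the non-exiting default remained
+//
+// When it is the default that exits, the new switch instead keeps the default
+// pointing at the exit and re-adds the remaining cases with %header.ph as
+// their destination, as the code above describes. Exit blocks with other
+// predecessors are split first so each unswitched edge has a dedicated target.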
+
+/// This routine scans the loop to find a branch or switch which occurs before
+/// any side effects occur. These can potentially be unswitched without
+/// duplicating the loop. If a branch or switch is successfully unswitched the
+/// scanning continues to see if subsequent branches or switches have become
+/// trivial. Once all trivial candidates have been unswitched, this routine
+/// returns.
+///
+/// The return value indicates whether anything was unswitched (and therefore
+/// changed).
+///
+/// If `SE` is not null, it will be updated based on the potential loop SCEVs
+/// invalidated by this.
+static bool unswitchAllTrivialConditions(Loop &L, DominatorTree &DT,
+ LoopInfo &LI, ScalarEvolution *SE,
+ MemorySSAUpdater *MSSAU) {
+ bool Changed = false;
+
+ // If the loop header has only one reachable successor, we should keep looking
+ // for trivial condition candidates in the successor as well. An alternative is
+ // to constant fold conditions and merge successors into loop header (then we
+ // only need to check header's terminator). The reason for not doing this in
+ // LoopUnswitch pass is that it could potentially break LoopPassManager's
+ // invariants. Folding dead branches could either eliminate the current loop
+ // or make other loops unreachable. LCSSA form might also not be preserved
+ // after deleting branches. The following code keeps traversing loop header's
+ // successors until it finds the trivial condition candidate (condition that
+ // is not a constant). Since unswitching generates branches with constant
+ // conditions, this scenario could be very common in practice.
+ BasicBlock *CurrentBB = L.getHeader();
+ SmallPtrSet<BasicBlock *, 8> Visited;
+ Visited.insert(CurrentBB);
+ do {
+ // Check if there are any side-effecting instructions (e.g. stores, calls,
+ // volatile loads) in the part of the loop that the code *would* execute
+ // without unswitching.
+ if (MSSAU) // Possible early exit with MSSA
+ if (auto *Defs = MSSAU->getMemorySSA()->getBlockDefs(CurrentBB))
+ if (!isa<MemoryPhi>(*Defs->begin()) || (++Defs->begin() != Defs->end()))
+ return Changed;
+ if (llvm::any_of(*CurrentBB,
+ [](Instruction &I) { return I.mayHaveSideEffects(); }))
+ return Changed;
+
+ Instruction *CurrentTerm = CurrentBB->getTerminator();
+
+ if (auto *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
+ // Don't bother trying to unswitch past a switch with a constant
+ // condition. This should be removed prior to running this pass by
+ // simplify-cfg.
+ if (isa<Constant>(SI->getCondition()))
+ return Changed;
+
+ if (!unswitchTrivialSwitch(L, *SI, DT, LI, SE, MSSAU))
+ // Couldn't unswitch this one so we're done.
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If unswitching turned the terminator into an unconditional branch then
+ // we can continue. The unswitching logic specifically works to fold any
+ // cases it can into an unconditional branch to make it easier to
+ // recognize here.
+ auto *BI = dyn_cast<BranchInst>(CurrentBB->getTerminator());
+ if (!BI || BI->isConditional())
+ return Changed;
+
+ CurrentBB = BI->getSuccessor(0);
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(CurrentTerm);
+ if (!BI)
+ // We do not understand other terminator instructions.
+ return Changed;
+
+ // Don't bother trying to unswitch past an unconditional branch or a branch
+ // with a constant value. These should be removed by simplify-cfg prior to
+ // running this pass.
+ if (!BI->isConditional() || isa<Constant>(BI->getCondition()))
+ return Changed;
+
+ // Found a trivial condition candidate: non-foldable conditional branch. If
+ // we fail to unswitch this, we can't do anything else that is trivial.
+ if (!unswitchTrivialBranch(L, *BI, DT, LI, SE, MSSAU))
+ return Changed;
+
+ // Mark that we managed to unswitch something.
+ Changed = true;
+
+ // If we only unswitched some of the conditions feeding the branch, we won't
+ // have collapsed it to a single successor.
+ BI = cast<BranchInst>(CurrentBB->getTerminator());
+ if (BI->isConditional())
+ return Changed;
+
+ // Follow the newly unconditional branch into its successor.
+ CurrentBB = BI->getSuccessor(0);
+
+ // When continuing, if we exit the loop or reach a previously visited block,
+ // then we cannot reach any trivial condition candidates (unfoldable
+ // branch instructions or switch instructions) and no unswitch can happen.
+ } while (L.contains(CurrentBB) && Visited.insert(CurrentBB).second);
+
+ return Changed;
+}
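+
+// Invented example of the walk above: once a header branch has been trivially
+// unswitched it is left as `br label %bb1`, so the do/while loop follows that
+// edge and, provided %bb1 is still inside the loop, has not been visited, and
+// contains no side-effecting instructions or extra MemorySSA defs, considers
+// %bb1's own terminator as the next trivial unswitching candidate.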
+
+/// Build the cloned blocks for an unswitched copy of the given loop.
+///
+/// The cloned blocks are inserted before the loop preheader (`LoopPH`) and
+/// after the split block (`SplitBB`) that will be used to select between the
+/// cloned and original loop.
+///
+/// This routine handles cloning all of the necessary loop blocks and exit
+/// blocks including rewriting their instructions and the relevant PHI nodes.
+/// Any loop blocks or exit blocks which are dominated by a different successor
+/// than the one for this clone of the loop blocks can be trivially skipped. We
+/// use the `DominatingSucc` map to determine whether a block satisfies that
+/// property with a simple map lookup.
+///
+/// It also correctly creates the unconditional branch in the cloned
+/// unswitched parent block to only point at the unswitched successor.
+///
+/// This does not handle most of the necessary updates to `LoopInfo`. Only exit
+/// block splitting is correctly reflected in `LoopInfo`; essentially all of
+/// the cloned blocks (and their loops) are left without full `LoopInfo`
+/// updates. This also doesn't fully update `DominatorTree`. It adds the cloned
+/// blocks to it but doesn't create the cloned `DominatorTree` structure and
+/// instead the caller must recompute an accurate DT. It *does* correctly
+/// update the `AssumptionCache` provided in `AC`.
+static BasicBlock *buildClonedLoopBlocks(
+ Loop &L, BasicBlock *LoopPH, BasicBlock *SplitBB,
+ ArrayRef<BasicBlock *> ExitBlocks, BasicBlock *ParentBB,
+ BasicBlock *UnswitchedSuccBB, BasicBlock *ContinueSuccBB,
+ const SmallDenseMap<BasicBlock *, BasicBlock *, 16> &DominatingSucc,
+ ValueToValueMapTy &VMap,
+ SmallVectorImpl<DominatorTree::UpdateType> &DTUpdates, AssumptionCache &AC,
+ DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ SmallVector<BasicBlock *, 4> NewBlocks;
+ NewBlocks.reserve(L.getNumBlocks() + ExitBlocks.size());
+
+ // We will need to clone a bunch of blocks, wrap up the clone operation in
+ // a helper.
+ auto CloneBlock = [&](BasicBlock *OldBB) {
+ // Clone the basic block and insert it before the new preheader.
+ BasicBlock *NewBB = CloneBasicBlock(OldBB, VMap, ".us", OldBB->getParent());
+ NewBB->moveBefore(LoopPH);
+
+ // Record this block and the mapping.
+ NewBlocks.push_back(NewBB);
+ VMap[OldBB] = NewBB;
+
+ return NewBB;
+ };
+
+ // We skip cloning blocks when they have a dominating succ that is not the
+ // succ we are cloning for.
+ auto SkipBlock = [&](BasicBlock *BB) {
+ auto It = DominatingSucc.find(BB);
+ return It != DominatingSucc.end() && It->second != UnswitchedSuccBB;
+ };
+
+ // First, clone the preheader.
+ auto *ClonedPH = CloneBlock(LoopPH);
+
+ // Then clone all the loop blocks, skipping the ones that aren't necessary.
+ for (auto *LoopBB : L.blocks())
+ if (!SkipBlock(LoopBB))
+ CloneBlock(LoopBB);
+
+ // Split all the loop exit edges so that when we clone the exit blocks, if
+ // any of the exit blocks are *also* a preheader for some other loop, we
+ // don't create multiple predecessors entering the loop header.
+ for (auto *ExitBB : ExitBlocks) {
+ if (SkipBlock(ExitBB))
+ continue;
+
+ // When we are going to clone an exit, we don't need to clone all the
+ // instructions in the exit block and we want to ensure we have an easy
+ // place to merge the CFG, so split the exit first. This is always safe to
+ // do because there cannot be any non-loop predecessors of a loop exit in
+ // loop simplified form.
+ auto *MergeBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI, MSSAU);
+
+ // Rearrange the names to make it easier to write test cases by having the
+ // exit block carry the suffix rather than the merge block carrying the
+ // suffix.
+ MergeBB->takeName(ExitBB);
+ ExitBB->setName(Twine(MergeBB->getName()) + ".split");
+
+ // Now clone the original exit block.
+ auto *ClonedExitBB = CloneBlock(ExitBB);
+ assert(ClonedExitBB->getTerminator()->getNumSuccessors() == 1 &&
+ "Exit block should have been split to have one successor!");
+ assert(ClonedExitBB->getTerminator()->getSuccessor(0) == MergeBB &&
+ "Cloned exit block has the wrong successor!");
+
+ // Remap any cloned instructions and create a merge phi node for them.
+ for (auto ZippedInsts : llvm::zip_first(
+ llvm::make_range(ExitBB->begin(), std::prev(ExitBB->end())),
+ llvm::make_range(ClonedExitBB->begin(),
+ std::prev(ClonedExitBB->end())))) {
+ Instruction &I = std::get<0>(ZippedInsts);
+ Instruction &ClonedI = std::get<1>(ZippedInsts);
+
+ // The only instructions in the exit block should be PHI nodes and
+ // potentially a landing pad.
+ assert(
+ (isa<PHINode>(I) || isa<LandingPadInst>(I) || isa<CatchPadInst>(I)) &&
+ "Bad instruction in exit block!");
+ // We should have a value map between the instruction and its clone.
+ assert(VMap.lookup(&I) == &ClonedI && "Mismatch in the value map!");
+
+ auto *MergePN =
+ PHINode::Create(I.getType(), /*NumReservedValues*/ 2, ".us-phi",
+ &*MergeBB->getFirstInsertionPt());
+ I.replaceAllUsesWith(MergePN);
+ MergePN->addIncoming(&I, ExitBB);
+ MergePN->addIncoming(&ClonedI, ClonedExitBB);
+ }
+ }
+
+ // Rewrite the instructions in the cloned blocks to refer to the instructions
+ // in the cloned blocks. We have to do this as a second pass so that we have
+ // everything available. Also, we have inserted new instructions which may
+ // include assume intrinsics, so we update the assumption cache while
+ // processing this.
+ for (auto *ClonedBB : NewBlocks)
+ for (Instruction &I : *ClonedBB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC.registerAssumption(II);
+ }
+
+ // Update any PHI nodes in the cloned successors of the skipped blocks to not
+ // have spurious incoming values.
+ for (auto *LoopBB : L.blocks())
+ if (SkipBlock(LoopBB))
+ for (auto *SuccBB : successors(LoopBB))
+ if (auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB)))
+ for (PHINode &PN : ClonedSuccBB->phis())
+ PN.removeIncomingValue(LoopBB, /*DeletePHIIfEmpty*/ false);
+
+ // Remove the cloned parent as a predecessor of any successor we ended up
+ // cloning other than the unswitched one.
+ auto *ClonedParentBB = cast<BasicBlock>(VMap.lookup(ParentBB));
+ for (auto *SuccBB : successors(ParentBB)) {
+ if (SuccBB == UnswitchedSuccBB)
+ continue;
+
+ auto *ClonedSuccBB = cast_or_null<BasicBlock>(VMap.lookup(SuccBB));
+ if (!ClonedSuccBB)
+ continue;
+
+ ClonedSuccBB->removePredecessor(ClonedParentBB,
+ /*KeepOneInputPHIs*/ true);
+ }
+
+ // Replace the cloned branch with an unconditional branch to the cloned
+ // unswitched successor.
+ auto *ClonedSuccBB = cast<BasicBlock>(VMap.lookup(UnswitchedSuccBB));
Instruction *ClonedTerminator = ClonedParentBB->getTerminator();
// Trivial Simplification. If Terminator is a conditional branch and
// condition becomes dead - erase it.
@@ -1150,946 +1150,946 @@ static BasicBlock *buildClonedLoopBlocks(
ClonedConditionToErase = SI->getCondition();
ClonedTerminator->eraseFromParent();
- BranchInst::Create(ClonedSuccBB, ClonedParentBB);
-
+ BranchInst::Create(ClonedSuccBB, ClonedParentBB);
+
if (ClonedConditionToErase)
RecursivelyDeleteTriviallyDeadInstructions(ClonedConditionToErase, nullptr,
MSSAU);
- // If there are duplicate entries in the PHI nodes because of multiple edges
- // to the unswitched successor, we need to nuke all but one as we replaced it
- // with a direct branch.
- for (PHINode &PN : ClonedSuccBB->phis()) {
- bool Found = false;
- // Loop over the incoming operands backwards so we can easily delete as we
- // go without invalidating the index.
- for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
- if (PN.getIncomingBlock(i) != ClonedParentBB)
- continue;
- if (!Found) {
- Found = true;
- continue;
- }
- PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
- }
- }
-
- // Record the domtree updates for the new blocks.
- SmallPtrSet<BasicBlock *, 4> SuccSet;
- for (auto *ClonedBB : NewBlocks) {
- for (auto *SuccBB : successors(ClonedBB))
- if (SuccSet.insert(SuccBB).second)
- DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
- SuccSet.clear();
- }
-
- return ClonedPH;
-}
-
-/// Recursively clone the specified loop and all of its children.
-///
-/// The target parent loop for the clone should be provided, or can be null if
-/// the clone is a top-level loop. While cloning, all the blocks are mapped
-/// with the provided value map. The entire original loop must be present in
-/// the value map. The cloned loop is returned.
-static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
- const ValueToValueMapTy &VMap, LoopInfo &LI) {
- auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
- assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
- ClonedL.reserveBlocks(OrigL.getNumBlocks());
- for (auto *BB : OrigL.blocks()) {
- auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
- ClonedL.addBlockEntry(ClonedBB);
- if (LI.getLoopFor(BB) == &OrigL)
- LI.changeLoopFor(ClonedBB, &ClonedL);
- }
- };
-
- // We specially handle the first loop because it may get cloned into
- // a different parent and because we most commonly are cloning leaf loops.
- Loop *ClonedRootL = LI.AllocateLoop();
- if (RootParentL)
- RootParentL->addChildLoop(ClonedRootL);
- else
- LI.addTopLevelLoop(ClonedRootL);
- AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
-
+ // If there are duplicate entries in the PHI nodes because of multiple edges
+ // to the unswitched successor, we need to nuke all but one as we replaced it
+ // with a direct branch.
+ for (PHINode &PN : ClonedSuccBB->phis()) {
+ bool Found = false;
+ // Loop over the incoming operands backwards so we can easily delete as we
+ // go without invalidating the index.
+ for (int i = PN.getNumOperands() - 1; i >= 0; --i) {
+ if (PN.getIncomingBlock(i) != ClonedParentBB)
+ continue;
+ if (!Found) {
+ Found = true;
+ continue;
+ }
+ PN.removeIncomingValue(i, /*DeletePHIIfEmpty*/ false);
+ }
+ }
+
+ // Record the domtree updates for the new blocks.
+ SmallPtrSet<BasicBlock *, 4> SuccSet;
+ for (auto *ClonedBB : NewBlocks) {
+ for (auto *SuccBB : successors(ClonedBB))
+ if (SuccSet.insert(SuccBB).second)
+ DTUpdates.push_back({DominatorTree::Insert, ClonedBB, SuccBB});
+ SuccSet.clear();
+ }
+
+ return ClonedPH;
+}
+
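// Illustrative only: a minimal, self-contained sketch (not LLVM code) of the
// "walk the incoming list backwards while erasing" pattern the PHI
// de-duplication loop above relies on. The PhiLike/Incoming names and the
// keepOneIncomingFrom helper are assumptions invented for this sketch.
#include <string>
#include <utility>
#include <vector>

struct PhiLike {
  // Each incoming entry pairs a predecessor name with an incoming value.
  std::vector<std::pair<std::string, int>> Incoming;
};

// Remove all but one incoming entry coming from Pred. Iterating from the back
// means erasing an element never shifts an index we still have to visit.
inline void keepOneIncomingFrom(PhiLike &PN, const std::string &Pred) {
  bool Found = false;
  for (int i = static_cast<int>(PN.Incoming.size()) - 1; i >= 0; --i) {
    if (PN.Incoming[i].first != Pred)
      continue;
    if (!Found) {
      Found = true; // Keep the first matching entry we encounter.
      continue;
    }
    PN.Incoming.erase(PN.Incoming.begin() + i);
  }
}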
+/// Recursively clone the specified loop and all of its children.
+///
+/// The target parent loop for the clone should be provided, or can be null if
+/// the clone is a top-level loop. While cloning, all the blocks are mapped
+/// with the provided value map. The entire original loop must be present in
+/// the value map. The cloned loop is returned.
+static Loop *cloneLoopNest(Loop &OrigRootL, Loop *RootParentL,
+ const ValueToValueMapTy &VMap, LoopInfo &LI) {
+ auto AddClonedBlocksToLoop = [&](Loop &OrigL, Loop &ClonedL) {
+ assert(ClonedL.getBlocks().empty() && "Must start with an empty loop!");
+ ClonedL.reserveBlocks(OrigL.getNumBlocks());
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast<BasicBlock>(VMap.lookup(BB));
+ ClonedL.addBlockEntry(ClonedBB);
+ if (LI.getLoopFor(BB) == &OrigL)
+ LI.changeLoopFor(ClonedBB, &ClonedL);
+ }
+ };
+
+ // We specially handle the first loop because it may get cloned into
+ // a different parent and because we most commonly are cloning leaf loops.
+ Loop *ClonedRootL = LI.AllocateLoop();
+ if (RootParentL)
+ RootParentL->addChildLoop(ClonedRootL);
+ else
+ LI.addTopLevelLoop(ClonedRootL);
+ AddClonedBlocksToLoop(OrigRootL, *ClonedRootL);
+
if (OrigRootL.isInnermost())
- return ClonedRootL;
-
- // If we have a nest, we can quickly clone the entire loop nest using an
- // iterative approach because it is a tree. We keep the cloned parent in the
- // data structure to avoid repeatedly querying through a map to find it.
- SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
- // Build up the loops to clone in reverse order as we'll clone them from the
- // back.
- for (Loop *ChildL : llvm::reverse(OrigRootL))
- LoopsToClone.push_back({ClonedRootL, ChildL});
- do {
- Loop *ClonedParentL, *L;
- std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
- Loop *ClonedL = LI.AllocateLoop();
- ClonedParentL->addChildLoop(ClonedL);
- AddClonedBlocksToLoop(*L, *ClonedL);
- for (Loop *ChildL : llvm::reverse(*L))
- LoopsToClone.push_back({ClonedL, ChildL});
- } while (!LoopsToClone.empty());
-
- return ClonedRootL;
-}
-
-/// Build the cloned loops of an original loop from unswitching.
-///
-/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
-/// operation. We need to re-verify that there even is a loop (as the backedge
-/// may not have been cloned), and even if there are remaining backedges the
-/// backedge set may be different. However, we know that each child loop is
-/// undisturbed; we only need to find where to place each child loop within
-/// either any parent loop or within a cloned version of the original loop.
-///
-/// Because child loops may end up cloned outside of any cloned version of the
-/// original loop, multiple cloned sibling loops may be created. All of them
-/// are returned so that the newly introduced loop nest roots can be
-/// identified.
-static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
- const ValueToValueMapTy &VMap, LoopInfo &LI,
- SmallVectorImpl<Loop *> &NonChildClonedLoops) {
- Loop *ClonedL = nullptr;
-
- auto *OrigPH = OrigL.getLoopPreheader();
- auto *OrigHeader = OrigL.getHeader();
-
- auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
- auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
-
- // We need to know the loops of the cloned exit blocks to even compute the
- // accurate parent loop. If we only clone exits to some parent of the
- // original parent, we want to clone into that outer loop. We also keep track
- // of the loops that our cloned exit blocks participate in.
- Loop *ParentL = nullptr;
- SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
- SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
- ClonedExitsInLoops.reserve(ExitBlocks.size());
- for (auto *ExitBB : ExitBlocks)
- if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
- if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
- ExitLoopMap[ClonedExitBB] = ExitL;
- ClonedExitsInLoops.push_back(ClonedExitBB);
- if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
- ParentL = ExitL;
- }
- assert((!ParentL || ParentL == OrigL.getParentLoop() ||
- ParentL->contains(OrigL.getParentLoop())) &&
- "The computed parent loop should always contain (or be) the parent of "
- "the original loop.");
-
- // We build the set of blocks dominated by the cloned header from the set of
- // cloned blocks out of the original loop. While not all of these will
- // necessarily be in the cloned loop, it is enough to establish that they
- // aren't in unreachable cycles, etc.
- SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
- for (auto *BB : OrigL.blocks())
- if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
- ClonedLoopBlocks.insert(ClonedBB);
-
- // Rebuild the set of blocks that will end up in the cloned loop. We may have
- // skipped cloning some region of this loop which can in turn skip some of
- // the backedges so we have to rebuild the blocks in the loop based on the
- // backedges that remain after cloning.
- SmallVector<BasicBlock *, 16> Worklist;
- SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
- for (auto *Pred : predecessors(ClonedHeader)) {
- // The only possible non-loop header predecessor is the preheader because
- // we know we cloned the loop in simplified form.
- if (Pred == ClonedPH)
- continue;
-
- // Because the loop was in simplified form, the only non-loop predecessor
- // should be the preheader.
- assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
- "header other than the preheader "
- "that is not part of the loop!");
-
- // Insert this block into the loop set and on the first visit (and if it
- // isn't the header we're currently walking) put it into the worklist to
- // recurse through.
- if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
- Worklist.push_back(Pred);
- }
-
- // If we had any backedges then there *is* a cloned loop. Put the header into
- // the loop set and then walk the worklist backwards to find all the blocks
- // that remain within the loop after cloning.
- if (!BlocksInClonedLoop.empty()) {
- BlocksInClonedLoop.insert(ClonedHeader);
-
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
- assert(BlocksInClonedLoop.count(BB) &&
- "Didn't put block into the loop set!");
-
- // Insert any predecessors that are in the possible set into the cloned
- // set, and if the insert is successful, add them to the worklist. Note
- // that we filter on the blocks that are definitely reachable via the
- // backedge to the loop header so we may prune out dead code within the
- // cloned loop.
- for (auto *Pred : predecessors(BB))
- if (ClonedLoopBlocks.count(Pred) &&
- BlocksInClonedLoop.insert(Pred).second)
- Worklist.push_back(Pred);
- }
-
- ClonedL = LI.AllocateLoop();
- if (ParentL) {
- ParentL->addBasicBlockToLoop(ClonedPH, LI);
- ParentL->addChildLoop(ClonedL);
- } else {
- LI.addTopLevelLoop(ClonedL);
- }
- NonChildClonedLoops.push_back(ClonedL);
-
- ClonedL->reserveBlocks(BlocksInClonedLoop.size());
- // We don't want to just add the cloned loop blocks based on how we
- // discovered them. The original order of blocks was carefully built in
- // a way that doesn't rely on predecessor ordering. Rather than re-invent
- // that logic, we just re-walk the original blocks (and those of the child
- // loops) and filter them as we add them into the cloned loop.
- for (auto *BB : OrigL.blocks()) {
- auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
- if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
- continue;
-
- // Directly add the blocks that are only in this loop.
- if (LI.getLoopFor(BB) == &OrigL) {
- ClonedL->addBasicBlockToLoop(ClonedBB, LI);
- continue;
- }
-
- // We want to manually add it to this loop and parents.
- // Registering it with LoopInfo will happen when we clone the top
- // loop for this block.
- for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
- PL->addBlockEntry(ClonedBB);
- }
-
- // Now add each child loop whose header remains within the cloned loop. All
- // of the blocks within the loop must satisfy the same constraints as the
- // header so once we pass the header checks we can just clone the entire
- // child loop nest.
- for (Loop *ChildL : OrigL) {
- auto *ClonedChildHeader =
- cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
- if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
- continue;
-
-#ifndef NDEBUG
- // We should never have a cloned child loop header but fail to have
- // all of the blocks for that child loop.
- for (auto *ChildLoopBB : ChildL->blocks())
- assert(BlocksInClonedLoop.count(
- cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
- "Child cloned loop has a header within the cloned outer "
- "loop but not all of its blocks!");
-#endif
-
- cloneLoopNest(*ChildL, ClonedL, VMap, LI);
- }
- }
-
- // Now that we've handled all the components of the original loop that were
- // cloned into a new loop, we still need to handle anything from the original
- // loop that wasn't in a cloned loop.
-
- // Figure out what blocks are left to place within any loop nest containing
- // the unswitched loop. If we never formed a loop, the cloned PH is one of
- // them.
- SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
- if (BlocksInClonedLoop.empty())
- UnloopedBlockSet.insert(ClonedPH);
- for (auto *ClonedBB : ClonedLoopBlocks)
- if (!BlocksInClonedLoop.count(ClonedBB))
- UnloopedBlockSet.insert(ClonedBB);
-
- // Copy the cloned exits and sort them in ascending loop depth, we'll work
- // backwards across these to process them inside out. The order shouldn't
- // matter as we're just trying to build up the map from inside-out; we use
- // the map in a more stably ordered way below.
- auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
- llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
- return ExitLoopMap.lookup(LHS)->getLoopDepth() <
- ExitLoopMap.lookup(RHS)->getLoopDepth();
- });
-
- // Populate the existing ExitLoopMap with everything reachable from each
- // exit, starting from the inner most exit.
- while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
- assert(Worklist.empty() && "Didn't clear worklist!");
-
- BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
- Loop *ExitL = ExitLoopMap.lookup(ExitBB);
-
- // Walk the CFG back until we hit the cloned PH adding everything reachable
- // and in the unlooped set to this exit block's loop.
- Worklist.push_back(ExitBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- // We can stop recursing at the cloned preheader (if we get there).
- if (BB == ClonedPH)
- continue;
-
- for (BasicBlock *PredBB : predecessors(BB)) {
- // If this pred has already been moved to our set or is part of some
- // (inner) loop, no update needed.
- if (!UnloopedBlockSet.erase(PredBB)) {
- assert(
- (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
- "Predecessor not mapped to a loop!");
- continue;
- }
-
- // We just insert into the loop set here. We'll add these blocks to the
- // exit loop after we build up the set in an order that doesn't rely on
- // predecessor order (which in turn relies on use list order).
- bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
- (void)Inserted;
- assert(Inserted && "Should only visit an unlooped block once!");
-
- // And recurse through to its predecessors.
- Worklist.push_back(PredBB);
- }
- } while (!Worklist.empty());
- }
-
- // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
- // blocks to their outer loops, walk the cloned blocks and the cloned exits
- // in their original order adding them to the correct loop.
-
- // We need a stable insertion order. We use the order of the original loop
- // blocks and map each into the correct parent loop.
- for (auto *BB : llvm::concat<BasicBlock *const>(
- makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
- if (Loop *OuterL = ExitLoopMap.lookup(BB))
- OuterL->addBasicBlockToLoop(BB, LI);
-
-#ifndef NDEBUG
- for (auto &BBAndL : ExitLoopMap) {
- auto *BB = BBAndL.first;
- auto *OuterL = BBAndL.second;
- assert(LI.getLoopFor(BB) == OuterL &&
- "Failed to put all blocks into outer loops!");
- }
-#endif
-
- // Now that all the blocks are placed into the correct containing loop in the
- // absence of child loops, find all the potentially cloned child loops and
- // clone them into whatever outer loop we placed their header into.
- for (Loop *ChildL : OrigL) {
- auto *ClonedChildHeader =
- cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
- if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
- continue;
-
-#ifndef NDEBUG
- for (auto *ChildLoopBB : ChildL->blocks())
- assert(VMap.count(ChildLoopBB) &&
- "Cloned a child loop header but not all of that loops blocks!");
-#endif
-
- NonChildClonedLoops.push_back(cloneLoopNest(
- *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
- }
-}
-
-static void
-deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
- ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
- DominatorTree &DT, MemorySSAUpdater *MSSAU) {
- // Find all the dead clones, and remove them from their successors.
- SmallVector<BasicBlock *, 16> DeadBlocks;
- for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
- for (auto &VMap : VMaps)
- if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
- if (!DT.isReachableFromEntry(ClonedBB)) {
- for (BasicBlock *SuccBB : successors(ClonedBB))
- SuccBB->removePredecessor(ClonedBB);
- DeadBlocks.push_back(ClonedBB);
- }
-
- // Remove all MemorySSA in the dead blocks
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(),
- DeadBlocks.end());
- MSSAU->removeBlocks(DeadBlockSet);
- }
-
- // Drop any remaining references to break cycles.
- for (BasicBlock *BB : DeadBlocks)
- BB->dropAllReferences();
- // Erase them from the IR.
- for (BasicBlock *BB : DeadBlocks)
- BB->eraseFromParent();
-}
-
-static void deleteDeadBlocksFromLoop(Loop &L,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI,
- MemorySSAUpdater *MSSAU) {
- // Find all the dead blocks tied to this loop, and remove them from their
- // successors.
- SmallSetVector<BasicBlock *, 8> DeadBlockSet;
-
- // Start with loop/exit blocks and get a transitive closure of reachable dead
- // blocks.
- SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(),
- ExitBlocks.end());
- DeathCandidates.append(L.blocks().begin(), L.blocks().end());
- while (!DeathCandidates.empty()) {
- auto *BB = DeathCandidates.pop_back_val();
- if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) {
- for (BasicBlock *SuccBB : successors(BB)) {
- SuccBB->removePredecessor(BB);
- DeathCandidates.push_back(SuccBB);
- }
- DeadBlockSet.insert(BB);
- }
- }
-
- // Remove all MemorySSA in the dead blocks
- if (MSSAU)
- MSSAU->removeBlocks(DeadBlockSet);
-
- // Filter out the dead blocks from the exit blocks list so that it can be
- // used in the caller.
- llvm::erase_if(ExitBlocks,
- [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
-
- // Walk from this loop up through its parents removing all of the dead blocks.
- for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
- for (auto *BB : DeadBlockSet)
- ParentL->getBlocksSet().erase(BB);
- llvm::erase_if(ParentL->getBlocksVector(),
- [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
- }
-
- // Now delete the dead child loops. This raw delete will clear them
- // recursively.
- llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
- if (!DeadBlockSet.count(ChildL->getHeader()))
- return false;
-
- assert(llvm::all_of(ChildL->blocks(),
- [&](BasicBlock *ChildBB) {
- return DeadBlockSet.count(ChildBB);
- }) &&
- "If the child loop header is dead all blocks in the child loop must "
- "be dead as well!");
- LI.destroy(ChildL);
- return true;
- });
-
- // Remove the loop mappings for the dead blocks and drop all the references
- // from these blocks to others to handle cyclic references as we start
- // deleting the blocks themselves.
- for (auto *BB : DeadBlockSet) {
- // Check that the dominator tree has already been updated.
- assert(!DT.getNode(BB) && "Should already have cleared domtree!");
- LI.changeLoopFor(BB, nullptr);
- // Drop all uses of the instructions to make sure we won't have dangling
- // uses in other blocks.
- for (auto &I : *BB)
- if (!I.use_empty())
- I.replaceAllUsesWith(UndefValue::get(I.getType()));
- BB->dropAllReferences();
- }
-
- // Actually delete the blocks now that they've been fully unhooked from the
- // IR.
- for (auto *BB : DeadBlockSet)
- BB->eraseFromParent();
-}
-
-/// Recompute the set of blocks in a loop after unswitching.
-///
-/// This walks from the original header's predecessors to rebuild the loop. We
-/// take advantage of the fact that new blocks can't have been added, and so we
-/// filter by the original loop's blocks. This also handles potentially
-/// unreachable code that we don't want to explore but might be found examining
-/// the predecessors of the header.
-///
-/// If the original loop is no longer a loop, this will return an empty set. If
-/// it remains a loop, all the blocks within it will be added to the set
-/// (including those blocks in inner loops).
-static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
- LoopInfo &LI) {
- SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
-
- auto *PH = L.getLoopPreheader();
- auto *Header = L.getHeader();
-
- // A worklist to use while walking backwards from the header.
- SmallVector<BasicBlock *, 16> Worklist;
-
- // First walk the predecessors of the header to find the backedges. This will
- // form the basis of our walk.
- for (auto *Pred : predecessors(Header)) {
- // Skip the preheader.
- if (Pred == PH)
- continue;
-
- // Because the loop was in simplified form, the only non-loop predecessor
- // is the preheader.
- assert(L.contains(Pred) && "Found a predecessor of the loop header other "
- "than the preheader that is not part of the "
- "loop!");
-
- // Insert this block into the loop set and, on the first visit (and if it
- // isn't the header we're currently walking), put it into the worklist to
- // recurse through.
- if (LoopBlockSet.insert(Pred).second && Pred != Header)
- Worklist.push_back(Pred);
- }
-
- // If no backedges were found, we're done.
- if (LoopBlockSet.empty())
- return LoopBlockSet;
-
- // We found backedges, recurse through them to identify the loop blocks.
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
- assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
-
- // No need to walk past the header.
- if (BB == Header)
- continue;
-
- // Because we know the inner loop structure remains valid we can use the
- // loop structure to jump immediately across the entire nested loop.
- // Further, because it is in loop simplified form, we can directly jump
- // to its preheader afterward.
- if (Loop *InnerL = LI.getLoopFor(BB))
- if (InnerL != &L) {
- assert(L.contains(InnerL) &&
- "Should not reach a loop *outside* this loop!");
- // The preheader is the only possible predecessor of the loop so
- // insert it into the set and check whether it was already handled.
- auto *InnerPH = InnerL->getLoopPreheader();
- assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
- "but not contain the inner loop "
- "preheader!");
- if (!LoopBlockSet.insert(InnerPH).second)
- // The only way to reach the preheader is through the loop body
- // itself so if it has been visited the loop is already handled.
- continue;
-
- // Insert all of the blocks (other than those already present) into
- // the loop set. We expect at least the block that led us to find the
- // inner loop to be in the block set, but we may also have other loop
- // blocks if they were already enqueued as predecessors of some other
- // outer loop block.
- for (auto *InnerBB : InnerL->blocks()) {
- if (InnerBB == BB) {
- assert(LoopBlockSet.count(InnerBB) &&
- "Block should already be in the set!");
- continue;
- }
-
- LoopBlockSet.insert(InnerBB);
- }
-
- // Add the preheader to the worklist so we will continue past the
- // loop body.
- Worklist.push_back(InnerPH);
- continue;
- }
-
- // Insert any predecessors that were in the original loop into the new
- // set, and if the insert is successful, add them to the worklist.
- for (auto *Pred : predecessors(BB))
- if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
- Worklist.push_back(Pred);
- }
-
- assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
-
- // We've found all the blocks participating in the loop, return our completed
- // set.
- return LoopBlockSet;
-}
-
-/// Rebuild a loop after unswitching removes some subset of blocks and edges.
-///
-/// The removal may have removed some child loops entirely but cannot have
-/// disturbed any remaining child loops. However, they may need to be hoisted
-/// to the parent loop (or to be top-level loops). The original loop may be
-/// completely removed.
-///
-/// The sibling loops resulting from this update are returned. If the original
-/// loop remains a valid loop, it will be the first entry in this list with all
-/// of the newly sibling loops following it.
-///
-/// Returns true if the loop remains a loop after unswitching, and false if it
-/// is no longer a loop after unswitching (and should not continue to be
-/// referenced).
-static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
- LoopInfo &LI,
- SmallVectorImpl<Loop *> &HoistedLoops) {
- auto *PH = L.getLoopPreheader();
-
- // Compute the actual parent loop from the exit blocks. Because we may have
- // pruned some exits the loop may be different from the original parent.
- Loop *ParentL = nullptr;
- SmallVector<Loop *, 4> ExitLoops;
- SmallVector<BasicBlock *, 4> ExitsInLoops;
- ExitsInLoops.reserve(ExitBlocks.size());
- for (auto *ExitBB : ExitBlocks)
- if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
- ExitLoops.push_back(ExitL);
- ExitsInLoops.push_back(ExitBB);
- if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
- ParentL = ExitL;
- }
-
- // Recompute the blocks participating in this loop. This may be empty if it
- // is no longer a loop.
- auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
-
- // If we still have a loop, we need to re-set the loop's parent as the exit
- // block set changing may have moved it within the loop nest. Note that this
- // can only happen when this loop has a parent as it can only hoist the loop
- // *up* the nest.
- if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
- // Remove this loop's (original) blocks from all of the intervening loops.
- for (Loop *IL = L.getParentLoop(); IL != ParentL;
- IL = IL->getParentLoop()) {
- IL->getBlocksSet().erase(PH);
- for (auto *BB : L.blocks())
- IL->getBlocksSet().erase(BB);
- llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
- return BB == PH || L.contains(BB);
- });
- }
-
- LI.changeLoopFor(PH, ParentL);
- L.getParentLoop()->removeChildLoop(&L);
- if (ParentL)
- ParentL->addChildLoop(&L);
- else
- LI.addTopLevelLoop(&L);
- }
-
- // Now we update all the blocks which are no longer within the loop.
- auto &Blocks = L.getBlocksVector();
- auto BlocksSplitI =
- LoopBlockSet.empty()
- ? Blocks.begin()
- : std::stable_partition(
- Blocks.begin(), Blocks.end(),
- [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
-
- // Before we erase the list of unlooped blocks, build a set of them.
- SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
- if (LoopBlockSet.empty())
- UnloopedBlocks.insert(PH);
-
- // Now erase these blocks from the loop.
- for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
- L.getBlocksSet().erase(BB);
- Blocks.erase(BlocksSplitI, Blocks.end());
-
- // Sort the exits in ascending loop depth, we'll work backwards across these
- // to process them inside out.
- llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
- return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
- });
-
- // We'll build up a set for each exit loop.
- SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
- Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
-
- auto RemoveUnloopedBlocksFromLoop =
- [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
- for (auto *BB : UnloopedBlocks)
- L.getBlocksSet().erase(BB);
- llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
- return UnloopedBlocks.count(BB);
- });
- };
-
- SmallVector<BasicBlock *, 16> Worklist;
- while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
- assert(Worklist.empty() && "Didn't clear worklist!");
- assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
-
- // Grab the next exit block, in decreasing loop depth order.
- BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
- Loop &ExitL = *LI.getLoopFor(ExitBB);
- assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
-
- // Erase all of the unlooped blocks from the loops between the previous
- // exit loop and this exit loop. This works because the ExitsInLoops list is
- // sorted in increasing order of loop depth and thus we visit loops in
- // decreasing order of loop depth.
- for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
- RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
-
- // Walk the CFG back until we hit the cloned PH adding everything reachable
- // and in the unlooped set to this exit block's loop.
- Worklist.push_back(ExitBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- // We can stop recursing at the cloned preheader (if we get there).
- if (BB == PH)
- continue;
-
- for (BasicBlock *PredBB : predecessors(BB)) {
- // If this pred has already been moved to our set or is part of some
- // (inner) loop, no update needed.
- if (!UnloopedBlocks.erase(PredBB)) {
- assert((NewExitLoopBlocks.count(PredBB) ||
- ExitL.contains(LI.getLoopFor(PredBB))) &&
- "Predecessor not in a nested loop (or already visited)!");
- continue;
- }
-
- // We just insert into the loop set here. We'll add these blocks to the
- // exit loop after we build up the set in a deterministic order rather
- // than the predecessor-influenced visit order.
- bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
- (void)Inserted;
- assert(Inserted && "Should only visit an unlooped block once!");
-
- // And recurse through to its predecessors.
- Worklist.push_back(PredBB);
- }
- } while (!Worklist.empty());
-
- // If blocks in this exit loop were directly part of the original loop (as
- // opposed to a child loop) update the map to point to this exit loop. This
- // just updates a map and so the fact that the order is unstable is fine.
- for (auto *BB : NewExitLoopBlocks)
- if (Loop *BBL = LI.getLoopFor(BB))
- if (BBL == &L || !L.contains(BBL))
- LI.changeLoopFor(BB, &ExitL);
-
- // We will remove the remaining unlooped blocks from this loop in the next
- // iteration or below.
- NewExitLoopBlocks.clear();
- }
-
- // Any remaining unlooped blocks are no longer part of any loop unless they
- // are part of some child loop.
- for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
- RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
- for (auto *BB : UnloopedBlocks)
- if (Loop *BBL = LI.getLoopFor(BB))
- if (BBL == &L || !L.contains(BBL))
- LI.changeLoopFor(BB, nullptr);
-
- // Sink all the child loops whose headers are no longer in the loop set to
- // the parent (or to be top level loops). We reach into the loop and directly
- // update its subloop vector to make this batch update efficient.
- auto &SubLoops = L.getSubLoopsVector();
- auto SubLoopsSplitI =
- LoopBlockSet.empty()
- ? SubLoops.begin()
- : std::stable_partition(
- SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
- return LoopBlockSet.count(SubL->getHeader());
- });
- for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
- HoistedLoops.push_back(HoistedL);
- HoistedL->setParentLoop(nullptr);
-
- // To compute the new parent of this hoisted loop we look at where we
- // placed the preheader above. We can't lookup the header itself because we
- // retained the mapping from the header to the hoisted loop. But the
- // preheader and header should have the exact same new parent computed
- // based on the set of exit blocks from the original loop as the preheader
- // is a predecessor of the header and so reached in the reverse walk. And
- // because the loops were all in simplified form the preheader of the
- // hoisted loop can't be part of some *other* loop.
- if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
- NewParentL->addChildLoop(HoistedL);
- else
- LI.addTopLevelLoop(HoistedL);
- }
- SubLoops.erase(SubLoopsSplitI, SubLoops.end());
-
- // Actually delete the loop if nothing remained within it.
- if (Blocks.empty()) {
- assert(SubLoops.empty() &&
- "Failed to remove all subloops from the original loop!");
- if (Loop *ParentL = L.getParentLoop())
- ParentL->removeChildLoop(llvm::find(*ParentL, &L));
- else
- LI.removeLoop(llvm::find(LI, &L));
- LI.destroy(&L);
- return false;
- }
-
- return true;
-}
-
-/// Helper to visit a dominator subtree, invoking a callable on each node.
-///
-/// Returning false at any point will stop walking past that node of the tree.
-template <typename CallableT>
-void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
- SmallVector<DomTreeNode *, 4> DomWorklist;
- DomWorklist.push_back(DT[BB]);
-#ifndef NDEBUG
- SmallPtrSet<DomTreeNode *, 4> Visited;
- Visited.insert(DT[BB]);
-#endif
- do {
- DomTreeNode *N = DomWorklist.pop_back_val();
-
- // Visit this node.
- if (!Callable(N->getBlock()))
- continue;
-
- // Accumulate the child nodes.
- for (DomTreeNode *ChildN : *N) {
- assert(Visited.insert(ChildN).second &&
- "Cannot visit a node twice when walking a tree!");
- DomWorklist.push_back(ChildN);
- }
- } while (!DomWorklist.empty());
-}
-
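// Illustrative only: a self-contained sketch (not LLVM code) of the pruned
// preorder walk that visitDomSubTree above performs over the dominator
// subtree: a callback returning false stops descent below that node. The Node
// type and visitSubTree helper are assumptions invented for this sketch.
#include <functional>
#include <memory>
#include <vector>

struct Node {
  int Value = 0;
  std::vector<std::unique_ptr<Node>> Children;
};

inline void visitSubTree(Node &Root,
                         const std::function<bool(Node &)> &Callable) {
  std::vector<Node *> Worklist{&Root};
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Callable(*N))
      continue; // Prune: do not descend into this node's children.
    for (auto &Child : N->Children)
      Worklist.push_back(Child.get());
  }
}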
-static void unswitchNontrivialInvariants(
- Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
- SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- auto *ParentBB = TI.getParent();
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
- SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
-
- // We can only unswitch switches, conditional branches with an invariant
- // condition, or combining invariant conditions with an instruction.
- assert((SI || (BI && BI->isConditional())) &&
- "Can only unswitch switches and conditional branch!");
- bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
- if (FullUnswitch)
- assert(Invariants.size() == 1 &&
- "Cannot have other invariants with full unswitching!");
- else
- assert(isa<Instruction>(BI->getCondition()) &&
- "Partial unswitching requires an instruction as the condition!");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Constant and BBs tracking the cloned and continuing successor. When we are
- // unswitching the entire condition, this can just be trivially chosen to
- // unswitch towards `true`. However, when we are unswitching a set of
- // invariants combined with `and` or `or`, the combining operation determines
- // the best direction to unswitch: we want to unswitch the direction that will
- // collapse the branch.
- bool Direction = true;
- int ClonedSucc = 0;
- if (!FullUnswitch) {
- if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
- assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
- Instruction::And &&
- "Only `or` and `and` instructions can combine invariants being "
- "unswitched.");
- Direction = false;
- ClonedSucc = 1;
- }
- }
-
- BasicBlock *RetainedSuccBB =
- BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
- SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
- if (BI)
- UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
- else
- for (auto Case : SI->cases())
- if (Case.getCaseSuccessor() != RetainedSuccBB)
- UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
-
- assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
- "Should not unswitch the same successor we are retaining!");
-
- // The branch should be in this exact loop. Any inner loop's invariant branch
- // should be handled by unswitching that inner loop. The caller of this
- // routine should filter out any candidates that remain (but were skipped for
- // whatever reason).
- assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
-
- // Compute the parent loop now before we start hacking on things.
- Loop *ParentL = L.getParentLoop();
- // Get blocks in RPO order for MSSA update, before changing the CFG.
- LoopBlocksRPO LBRPO(&L);
- if (MSSAU)
- LBRPO.perform(&LI);
-
- // Compute the outer-most loop containing one of our exit blocks. This is the
- // furthest up our loopnest which can be mutated, which we will use below to
- // update things.
- Loop *OuterExitL = &L;
- for (auto *ExitBB : ExitBlocks) {
- Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
- if (!NewOuterExitL) {
- // We exited the entire nest with this block, so we're done.
- OuterExitL = nullptr;
- break;
- }
- if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
- OuterExitL = NewOuterExitL;
- }
-
- // At this point, we're definitely going to unswitch something so invalidate
- // any cached information in ScalarEvolution for the outer most loop
- // containing an exit block and all nested loops.
- if (SE) {
- if (OuterExitL)
- SE->forgetLoop(OuterExitL);
- else
- SE->forgetTopmostLoop(&L);
- }
-
- // If the edge from this terminator to a successor dominates that successor,
- // store a map from each block in its dominator subtree to it. This lets us
- // tell when cloning for a particular successor if a block is dominated by
- // some *other* successor with a single data structure. We use this to
- // significantly reduce cloning.
- SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
- for (auto *SuccBB : llvm::concat<BasicBlock *const>(
- makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
- if (SuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
- return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
- }))
- visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
- DominatingSucc[BB] = SuccBB;
- return true;
- });
-
- // Split the preheader, so that we know that there is a safe place to insert
- // the conditional branch. We will change the preheader to have a conditional
- // branch on LoopCond. The original preheader will become the split point
- // between the unswitched versions, and we will have a new preheader for the
- // original loop.
- BasicBlock *SplitBB = L.getLoopPreheader();
- BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);
-
- // Keep track of the dominator tree updates needed.
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
-
- // Clone the loop for each unswitched successor.
- SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
- VMaps.reserve(UnswitchedSuccBBs.size());
- SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
- for (auto *SuccBB : UnswitchedSuccBBs) {
- VMaps.emplace_back(new ValueToValueMapTy());
- ClonedPHs[SuccBB] = buildClonedLoopBlocks(
- L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
- DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
- }
-
+ return ClonedRootL;
+
+ // If we have a nest, we can quickly clone the entire loop nest using an
+ // iterative approach because it is a tree. We keep the cloned parent in the
+ // data structure to avoid repeatedly querying through a map to find it.
+ SmallVector<std::pair<Loop *, Loop *>, 16> LoopsToClone;
+ // Build up the loops to clone in reverse order as we'll clone them from the
+ // back.
+ for (Loop *ChildL : llvm::reverse(OrigRootL))
+ LoopsToClone.push_back({ClonedRootL, ChildL});
+ do {
+ Loop *ClonedParentL, *L;
+ std::tie(ClonedParentL, L) = LoopsToClone.pop_back_val();
+ Loop *ClonedL = LI.AllocateLoop();
+ ClonedParentL->addChildLoop(ClonedL);
+ AddClonedBlocksToLoop(*L, *ClonedL);
+ for (Loop *ChildL : llvm::reverse(*L))
+ LoopsToClone.push_back({ClonedL, ChildL});
+ } while (!LoopsToClone.empty());
+
+ return ClonedRootL;
+}
+
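// Illustrative only: a self-contained sketch (not LLVM code) of the iterative
// tree-cloning scheme cloneLoopNest uses above: pair each original child with
// its already-cloned parent on an explicit worklist instead of recursing.
// The Node type and cloneTree helper are assumptions invented for this sketch.
#include <memory>
#include <utility>
#include <vector>

struct Node {
  std::vector<std::unique_ptr<Node>> Children;
};

inline std::unique_ptr<Node> cloneTree(const Node &Root) {
  auto ClonedRoot = std::make_unique<Node>();
  // Worklist of (cloned parent, original child) pairs, pushed in reverse so
  // children are popped (and therefore cloned) in their original order.
  std::vector<std::pair<Node *, const Node *>> Worklist;
  for (auto It = Root.Children.rbegin(); It != Root.Children.rend(); ++It)
    Worklist.push_back({ClonedRoot.get(), It->get()});
  while (!Worklist.empty()) {
    auto [ClonedParent, Orig] = Worklist.back();
    Worklist.pop_back();
    ClonedParent->Children.push_back(std::make_unique<Node>());
    Node *Cloned = ClonedParent->Children.back().get();
    for (auto It = Orig->Children.rbegin(); It != Orig->Children.rend(); ++It)
      Worklist.push_back({Cloned, It->get()});
  }
  return ClonedRoot;
}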
+/// Build the cloned loops of an original loop from unswitching.
+///
+/// Because unswitching simplifies the CFG of the loop, this isn't a trivial
+/// operation. We need to re-verify that there even is a loop (as the backedge
+/// may not have been cloned), and even if there are remaining backedges the
+/// backedge set may be different. However, we know that each child loop is
+/// undisturbed; we only need to find where to place each child loop within
+/// either any parent loop or within a cloned version of the original loop.
+///
+/// Because child loops may end up cloned outside of any cloned version of the
+/// original loop, multiple cloned sibling loops may be created. All of them
+/// are returned so that the newly introduced loop nest roots can be
+/// identified.
+static void buildClonedLoops(Loop &OrigL, ArrayRef<BasicBlock *> ExitBlocks,
+ const ValueToValueMapTy &VMap, LoopInfo &LI,
+ SmallVectorImpl<Loop *> &NonChildClonedLoops) {
+ Loop *ClonedL = nullptr;
+
+ auto *OrigPH = OrigL.getLoopPreheader();
+ auto *OrigHeader = OrigL.getHeader();
+
+ auto *ClonedPH = cast<BasicBlock>(VMap.lookup(OrigPH));
+ auto *ClonedHeader = cast<BasicBlock>(VMap.lookup(OrigHeader));
+
+ // We need to know the loops of the cloned exit blocks to even compute the
+ // accurate parent loop. If we only clone exits to some parent of the
+ // original parent, we want to clone into that outer loop. We also keep track
+ // of the loops that our cloned exit blocks participate in.
+ Loop *ParentL = nullptr;
+ SmallVector<BasicBlock *, 4> ClonedExitsInLoops;
+ SmallDenseMap<BasicBlock *, Loop *, 16> ExitLoopMap;
+ ClonedExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (auto *ClonedExitBB = cast_or_null<BasicBlock>(VMap.lookup(ExitBB)))
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoopMap[ClonedExitBB] = ExitL;
+ ClonedExitsInLoops.push_back(ClonedExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+ assert((!ParentL || ParentL == OrigL.getParentLoop() ||
+ ParentL->contains(OrigL.getParentLoop())) &&
+ "The computed parent loop should always contain (or be) the parent of "
+ "the original loop.");
+
+ // We build the set of blocks dominated by the cloned header from the set of
+ // cloned blocks out of the original loop. While not all of these will
+ // necessarily be in the cloned loop, it is enough to establish that they
+ // aren't in unreachable cycles, etc.
+ SmallSetVector<BasicBlock *, 16> ClonedLoopBlocks;
+ for (auto *BB : OrigL.blocks())
+ if (auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB)))
+ ClonedLoopBlocks.insert(ClonedBB);
+
+ // Rebuild the set of blocks that will end up in the cloned loop. We may have
+ // skipped cloning some region of this loop which can in turn skip some of
+ // the backedges so we have to rebuild the blocks in the loop based on the
+ // backedges that remain after cloning.
+ SmallVector<BasicBlock *, 16> Worklist;
+ SmallPtrSet<BasicBlock *, 16> BlocksInClonedLoop;
+ for (auto *Pred : predecessors(ClonedHeader)) {
+ // The only possible non-loop header predecessor is the preheader because
+ // we know we cloned the loop in simplified form.
+ if (Pred == ClonedPH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // should be the preheader.
+ assert(ClonedLoopBlocks.count(Pred) && "Found a predecessor of the loop "
+ "header other than the preheader "
+ "that is not part of the loop!");
+
+ // Insert this block into the loop set and on the first visit (and if it
+ // isn't the header we're currently walking) put it into the worklist to
+ // recurse through.
+ if (BlocksInClonedLoop.insert(Pred).second && Pred != ClonedHeader)
+ Worklist.push_back(Pred);
+ }
+
+ // If we had any backedges then there *is* a cloned loop. Put the header into
+ // the loop set and then walk the worklist backwards to find all the blocks
+ // that remain within the loop after cloning.
+ if (!BlocksInClonedLoop.empty()) {
+ BlocksInClonedLoop.insert(ClonedHeader);
+
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(BlocksInClonedLoop.count(BB) &&
+ "Didn't put block into the loop set!");
+
+ // Insert any predecessors that are in the possible set into the cloned
+ // set, and if the insert is successful, add them to the worklist. Note
+ // that we filter on the blocks that are definitely reachable via the
+ // backedge to the loop header so we may prune out dead code within the
+ // cloned loop.
+ for (auto *Pred : predecessors(BB))
+ if (ClonedLoopBlocks.count(Pred) &&
+ BlocksInClonedLoop.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ ClonedL = LI.AllocateLoop();
+ if (ParentL) {
+ ParentL->addBasicBlockToLoop(ClonedPH, LI);
+ ParentL->addChildLoop(ClonedL);
+ } else {
+ LI.addTopLevelLoop(ClonedL);
+ }
+ NonChildClonedLoops.push_back(ClonedL);
+
+ ClonedL->reserveBlocks(BlocksInClonedLoop.size());
+ // We don't want to just add the cloned loop blocks based on how we
+ // discovered them. The original order of blocks was carefully built in
+ // a way that doesn't rely on predecessor ordering. Rather than re-invent
+ // that logic, we just re-walk the original blocks (and those of the child
+ // loops) and filter them as we add them into the cloned loop.
+ for (auto *BB : OrigL.blocks()) {
+ auto *ClonedBB = cast_or_null<BasicBlock>(VMap.lookup(BB));
+ if (!ClonedBB || !BlocksInClonedLoop.count(ClonedBB))
+ continue;
+
+ // Directly add the blocks that are only in this loop.
+ if (LI.getLoopFor(BB) == &OrigL) {
+ ClonedL->addBasicBlockToLoop(ClonedBB, LI);
+ continue;
+ }
+
+ // We want to manually add it to this loop and parents.
+ // Registering it with LoopInfo will happen when we clone the top
+ // loop for this block.
+ for (Loop *PL = ClonedL; PL; PL = PL->getParentLoop())
+ PL->addBlockEntry(ClonedBB);
+ }
+
+ // Now add each child loop whose header remains within the cloned loop. All
+ // of the blocks within the loop must satisfy the same constraints as the
+ // header so once we pass the header checks we can just clone the entire
+ // child loop nest.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || !BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ // We should never have a cloned child loop header but fail to have
+ // all of the blocks for that child loop.
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(BlocksInClonedLoop.count(
+ cast<BasicBlock>(VMap.lookup(ChildLoopBB))) &&
+ "Child cloned loop has a header within the cloned outer "
+ "loop but not all of its blocks!");
+#endif
+
+ cloneLoopNest(*ChildL, ClonedL, VMap, LI);
+ }
+ }
+
+ // Now that we've handled all the components of the original loop that were
+ // cloned into a new loop, we still need to handle anything from the original
+ // loop that wasn't in a cloned loop.
+
+ // Figure out what blocks are left to place within any loop nest containing
+ // the unswitched loop. If we never formed a loop, the cloned PH is one of
+ // them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlockSet;
+ if (BlocksInClonedLoop.empty())
+ UnloopedBlockSet.insert(ClonedPH);
+ for (auto *ClonedBB : ClonedLoopBlocks)
+ if (!BlocksInClonedLoop.count(ClonedBB))
+ UnloopedBlockSet.insert(ClonedBB);
+
+ // Copy the cloned exits and sort them in ascending loop depth, we'll work
+ // backwards across these to process them inside out. The order shouldn't
+ // matter as we're just trying to build up the map from inside-out; we use
+ // the map in a more stably ordered way below.
+ auto OrderedClonedExitsInLoops = ClonedExitsInLoops;
+ llvm::sort(OrderedClonedExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return ExitLoopMap.lookup(LHS)->getLoopDepth() <
+ ExitLoopMap.lookup(RHS)->getLoopDepth();
+ });
+
+ // Populate the existing ExitLoopMap with everything reachable from each
+ // exit, starting from the inner most exit.
+ while (!UnloopedBlockSet.empty() && !OrderedClonedExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+
+ BasicBlock *ExitBB = OrderedClonedExitsInLoops.pop_back_val();
+ Loop *ExitL = ExitLoopMap.lookup(ExitBB);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == ClonedPH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlockSet.erase(PredBB)) {
+ assert(
+ (BlocksInClonedLoop.count(PredBB) || ExitLoopMap.count(PredBB)) &&
+ "Predecessor not mapped to a loop!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in an order that doesn't rely on
+ // predecessor order (which in turn relies on use list order).
+ bool Inserted = ExitLoopMap.insert({PredBB, ExitL}).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+ }
+
+ // Now that the ExitLoopMap gives us a mapping for all the non-looping cloned
+ // blocks to their outer loops, walk the cloned blocks and the cloned exits
+ // in their original order adding them to the correct loop.
+
+ // We need a stable insertion order. We use the order of the original loop
+ // blocks and map each into the correct parent loop.
+ for (auto *BB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(ClonedPH), ClonedLoopBlocks, ClonedExitsInLoops))
+ if (Loop *OuterL = ExitLoopMap.lookup(BB))
+ OuterL->addBasicBlockToLoop(BB, LI);
+
+#ifndef NDEBUG
+ for (auto &BBAndL : ExitLoopMap) {
+ auto *BB = BBAndL.first;
+ auto *OuterL = BBAndL.second;
+ assert(LI.getLoopFor(BB) == OuterL &&
+ "Failed to put all blocks into outer loops!");
+ }
+#endif
+
+ // Now that all the blocks are placed into the correct containing loop in the
+ // absence of child loops, find all the potentially cloned child loops and
+ // clone them into whatever outer loop we placed their header into.
+ for (Loop *ChildL : OrigL) {
+ auto *ClonedChildHeader =
+ cast_or_null<BasicBlock>(VMap.lookup(ChildL->getHeader()));
+ if (!ClonedChildHeader || BlocksInClonedLoop.count(ClonedChildHeader))
+ continue;
+
+#ifndef NDEBUG
+ for (auto *ChildLoopBB : ChildL->blocks())
+ assert(VMap.count(ChildLoopBB) &&
+ "Cloned a child loop header but not all of that loops blocks!");
+#endif
+
+ NonChildClonedLoops.push_back(cloneLoopNest(
+ *ChildL, ExitLoopMap.lookup(ClonedChildHeader), VMap, LI));
+ }
+}
+
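// Illustrative only: a self-contained sketch (not LLVM code) of the inside-out
// exit walk above: exits are processed from deepest loop to shallowest, and a
// backward CFG walk claims any still-unassigned block for that exit's loop.
// Block, Preds, and assignUnloopedBlocks are assumptions for this sketch.
#include <algorithm>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

using Block = std::string;

inline std::map<Block, int>
assignUnloopedBlocks(const std::map<Block, std::vector<Block>> &Preds,
                     std::set<Block> Unassigned, const Block &Preheader,
                     std::vector<std::pair<Block, int>> ExitsWithDepth) {
  // Sort exits by loop depth ascending; pop from the back to go inside-out.
  std::sort(ExitsWithDepth.begin(), ExitsWithDepth.end(),
            [](const auto &L, const auto &R) { return L.second < R.second; });
  std::map<Block, int> BlockDepth;
  while (!Unassigned.empty() && !ExitsWithDepth.empty()) {
    auto [ExitBB, Depth] = ExitsWithDepth.back();
    ExitsWithDepth.pop_back();
    std::vector<Block> Worklist{ExitBB};
    while (!Worklist.empty()) {
      Block BB = Worklist.back();
      Worklist.pop_back();
      if (BB == Preheader)
        continue; // Stop at the preheader; nothing above it belongs here.
      auto It = Preds.find(BB);
      if (It == Preds.end())
        continue;
      for (const Block &Pred : It->second)
        if (Unassigned.erase(Pred)) { // First claim wins (innermost exit).
          BlockDepth[Pred] = Depth;
          Worklist.push_back(Pred);
        }
    }
  }
  return BlockDepth;
}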
+static void
+deleteDeadClonedBlocks(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ ArrayRef<std::unique_ptr<ValueToValueMapTy>> VMaps,
+ DominatorTree &DT, MemorySSAUpdater *MSSAU) {
+ // Find all the dead clones, and remove them from their successors.
+ SmallVector<BasicBlock *, 16> DeadBlocks;
+ for (BasicBlock *BB : llvm::concat<BasicBlock *const>(L.blocks(), ExitBlocks))
+ for (auto &VMap : VMaps)
+ if (BasicBlock *ClonedBB = cast_or_null<BasicBlock>(VMap->lookup(BB)))
+ if (!DT.isReachableFromEntry(ClonedBB)) {
+ for (BasicBlock *SuccBB : successors(ClonedBB))
+ SuccBB->removePredecessor(ClonedBB);
+ DeadBlocks.push_back(ClonedBB);
+ }
+
+ // Remove all MemorySSA in the dead blocks
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet(DeadBlocks.begin(),
+ DeadBlocks.end());
+ MSSAU->removeBlocks(DeadBlockSet);
+ }
+
+ // Drop any remaining references to break cycles.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->dropAllReferences();
+ // Erase them from the IR.
+ for (BasicBlock *BB : DeadBlocks)
+ BB->eraseFromParent();
+}
+
+static void deleteDeadBlocksFromLoop(Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI,
+ MemorySSAUpdater *MSSAU) {
+ // Find all the dead blocks tied to this loop, and remove them from their
+ // successors.
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet;
+
+ // Start with loop/exit blocks and get a transitive closure of reachable dead
+ // blocks.
+ SmallVector<BasicBlock *, 16> DeathCandidates(ExitBlocks.begin(),
+ ExitBlocks.end());
+ DeathCandidates.append(L.blocks().begin(), L.blocks().end());
+ while (!DeathCandidates.empty()) {
+ auto *BB = DeathCandidates.pop_back_val();
+ if (!DeadBlockSet.count(BB) && !DT.isReachableFromEntry(BB)) {
+ for (BasicBlock *SuccBB : successors(BB)) {
+ SuccBB->removePredecessor(BB);
+ DeathCandidates.push_back(SuccBB);
+ }
+ DeadBlockSet.insert(BB);
+ }
+ }
+
+ // Remove all MemorySSA in the dead blocks
+ if (MSSAU)
+ MSSAU->removeBlocks(DeadBlockSet);
+
+ // Filter out the dead blocks from the exit blocks list so that it can be
+ // used in the caller.
+ llvm::erase_if(ExitBlocks,
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
+
+ // Walk from this loop up through its parents removing all of the dead blocks.
+ for (Loop *ParentL = &L; ParentL; ParentL = ParentL->getParentLoop()) {
+ for (auto *BB : DeadBlockSet)
+ ParentL->getBlocksSet().erase(BB);
+ llvm::erase_if(ParentL->getBlocksVector(),
+ [&](BasicBlock *BB) { return DeadBlockSet.count(BB); });
+ }
+
+ // Now delete the dead child loops. This raw delete will clear them
+ // recursively.
+ llvm::erase_if(L.getSubLoopsVector(), [&](Loop *ChildL) {
+ if (!DeadBlockSet.count(ChildL->getHeader()))
+ return false;
+
+ assert(llvm::all_of(ChildL->blocks(),
+ [&](BasicBlock *ChildBB) {
+ return DeadBlockSet.count(ChildBB);
+ }) &&
+ "If the child loop header is dead all blocks in the child loop must "
+ "be dead as well!");
+ LI.destroy(ChildL);
+ return true;
+ });
+
+ // Remove the loop mappings for the dead blocks and drop all the references
+ // from these blocks to others to handle cyclic references as we start
+ // deleting the blocks themselves.
+ for (auto *BB : DeadBlockSet) {
+ // Check that the dominator tree has already been updated.
+ assert(!DT.getNode(BB) && "Should already have cleared domtree!");
+ LI.changeLoopFor(BB, nullptr);
+ // Drop all uses of the instructions to make sure we won't have dangling
+ // uses in other blocks.
+ for (auto &I : *BB)
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->dropAllReferences();
+ }
+
+ // Actually delete the blocks now that they've been fully unhooked from the
+ // IR.
+ for (auto *BB : DeadBlockSet)
+ BB->eraseFromParent();
+}
+
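// Illustrative only: a self-contained sketch (not LLVM code) of the worklist
// closure above: seed with candidate blocks, and whenever a candidate turns
// out to be unreachable, enqueue its successors as new candidates. The Block
// alias and collectDeadBlocks helper are assumptions invented for this sketch.
#include <functional>
#include <map>
#include <set>
#include <string>
#include <vector>

using Block = std::string;

inline std::set<Block>
collectDeadBlocks(std::vector<Block> Candidates,
                  const std::map<Block, std::vector<Block>> &Succs,
                  const std::function<bool(const Block &)> &IsReachable) {
  std::set<Block> Dead;
  while (!Candidates.empty()) {
    Block BB = Candidates.back();
    Candidates.pop_back();
    if (Dead.count(BB) || IsReachable(BB))
      continue;
    Dead.insert(BB);
    auto It = Succs.find(BB);
    if (It != Succs.end())
      for (const Block &Succ : It->second)
        Candidates.push_back(Succ); // A successor may now be dead too.
  }
  return Dead;
}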
+/// Recompute the set of blocks in a loop after unswitching.
+///
+/// This walks from the original header's predecessors to rebuild the loop. We
+/// take advantage of the fact that new blocks can't have been added, and so we
+/// filter by the original loop's blocks. This also handles potentially
+/// unreachable code that we don't want to explore but might be found examining
+/// the predecessors of the header.
+///
+/// If the original loop is no longer a loop, this will return an empty set. If
+/// it remains a loop, all the blocks within it will be added to the set
+/// (including those blocks in inner loops).
+static SmallPtrSet<const BasicBlock *, 16> recomputeLoopBlockSet(Loop &L,
+ LoopInfo &LI) {
+ SmallPtrSet<const BasicBlock *, 16> LoopBlockSet;
+
+ auto *PH = L.getLoopPreheader();
+ auto *Header = L.getHeader();
+
+ // A worklist to use while walking backwards from the header.
+ SmallVector<BasicBlock *, 16> Worklist;
+
+ // First walk the predecessors of the header to find the backedges. This will
+ // form the basis of our walk.
+ for (auto *Pred : predecessors(Header)) {
+ // Skip the preheader.
+ if (Pred == PH)
+ continue;
+
+ // Because the loop was in simplified form, the only non-loop predecessor
+ // is the preheader.
+ assert(L.contains(Pred) && "Found a predecessor of the loop header other "
+ "than the preheader that is not part of the "
+ "loop!");
+
+ // Insert this block into the loop set and, on the first visit (and if it
+ // isn't the header we're currently walking), put it into the worklist to
+ // recurse through.
+ if (LoopBlockSet.insert(Pred).second && Pred != Header)
+ Worklist.push_back(Pred);
+ }
+
+ // If no backedges were found, we're done.
+ if (LoopBlockSet.empty())
+ return LoopBlockSet;
+
+ // We found backedges, recurse through them to identify the loop blocks.
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ assert(LoopBlockSet.count(BB) && "Didn't put block into the loop set!");
+
+ // No need to walk past the header.
+ if (BB == Header)
+ continue;
+
+ // Because we know the inner loop structure remains valid we can use the
+ // loop structure to jump immediately across the entire nested loop.
+ // Further, because it is in loop simplified form, we can directly jump
+ // to its preheader afterward.
+ if (Loop *InnerL = LI.getLoopFor(BB))
+ if (InnerL != &L) {
+ assert(L.contains(InnerL) &&
+ "Should not reach a loop *outside* this loop!");
+ // The preheader is the only possible predecessor of the loop so
+ // insert it into the set and check whether it was already handled.
+ auto *InnerPH = InnerL->getLoopPreheader();
+ assert(L.contains(InnerPH) && "Cannot contain an inner loop block "
+ "but not contain the inner loop "
+ "preheader!");
+ if (!LoopBlockSet.insert(InnerPH).second)
+ // The only way to reach the preheader is through the loop body
+ // itself so if it has been visited the loop is already handled.
+ continue;
+
+ // Insert all of the blocks (other than those already present) into
+ // the loop set. We expect at least the block that led us to find the
+ // inner loop to be in the block set, but we may also have other loop
+ // blocks if they were already enqueued as predecessors of some other
+ // outer loop block.
+ for (auto *InnerBB : InnerL->blocks()) {
+ if (InnerBB == BB) {
+ assert(LoopBlockSet.count(InnerBB) &&
+ "Block should already be in the set!");
+ continue;
+ }
+
+ LoopBlockSet.insert(InnerBB);
+ }
+
+ // Add the preheader to the worklist so we will continue past the
+ // loop body.
+ Worklist.push_back(InnerPH);
+ continue;
+ }
+
+ // Insert any predecessors that were in the original loop into the new
+ // set, and if the insert is successful, add them to the worklist.
+ for (auto *Pred : predecessors(BB))
+ if (L.contains(Pred) && LoopBlockSet.insert(Pred).second)
+ Worklist.push_back(Pred);
+ }
+
+ assert(LoopBlockSet.count(Header) && "Cannot fail to add the header!");
+
+ // We've found all the blocks participating in the loop, return our completed
+ // set.
+ return LoopBlockSet;
+}
+
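// Illustrative only: a self-contained sketch (not LLVM code) of the backedge
// walk that recomputeLoopBlockSet above performs: start from the header's
// non-preheader predecessors and walk predecessor edges backwards, never
// leaving the original block set and never walking past the header. The Block
// alias and recomputeBlocks helper are assumptions invented for this sketch;
// the header is assumed to have an entry in Preds.
#include <map>
#include <set>
#include <string>
#include <vector>

using Block = std::string;

inline std::set<Block>
recomputeBlocks(const Block &Header, const Block &Preheader,
                const std::set<Block> &OriginalBlocks,
                const std::map<Block, std::vector<Block>> &Preds) {
  std::set<Block> InLoop;
  std::vector<Block> Worklist;
  // Seed with the backedge sources that survived.
  for (const Block &Pred : Preds.at(Header))
    if (Pred != Preheader && OriginalBlocks.count(Pred) &&
        InLoop.insert(Pred).second && Pred != Header)
      Worklist.push_back(Pred);
  if (InLoop.empty())
    return InLoop; // No backedges survived: this is no longer a loop.
  InLoop.insert(Header);
  while (!Worklist.empty()) {
    Block BB = Worklist.back();
    Worklist.pop_back();
    if (BB == Header)
      continue; // No need to walk past the header.
    auto It = Preds.find(BB);
    if (It == Preds.end())
      continue;
    for (const Block &Pred : It->second)
      if (OriginalBlocks.count(Pred) && InLoop.insert(Pred).second)
        Worklist.push_back(Pred);
  }
  return InLoop;
}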
+/// Rebuild a loop after unswitching removes some subset of blocks and edges.
+///
+/// The removal may have removed some child loops entirely but cannot have
+/// disturbed any remaining child loops. However, they may need to be hoisted
+/// to the parent loop (or to be top-level loops). The original loop may be
+/// completely removed.
+///
+/// The sibling loops resulting from this update are returned. If the original
+/// loop remains a valid loop, it will be the first entry in this list with all
+/// of the newly sibling loops following it.
+///
+/// Returns true if the loop remains a loop after unswitching, and false if it
+/// is no longer a loop after unswitching (and should not continue to be
+/// referenced).
+static bool rebuildLoopAfterUnswitch(Loop &L, ArrayRef<BasicBlock *> ExitBlocks,
+ LoopInfo &LI,
+ SmallVectorImpl<Loop *> &HoistedLoops) {
+ auto *PH = L.getLoopPreheader();
+
+ // Compute the actual parent loop from the exit blocks. Because we may have
+ // pruned some exits the loop may be different from the original parent.
+ Loop *ParentL = nullptr;
+ SmallVector<Loop *, 4> ExitLoops;
+ SmallVector<BasicBlock *, 4> ExitsInLoops;
+ ExitsInLoops.reserve(ExitBlocks.size());
+ for (auto *ExitBB : ExitBlocks)
+ if (Loop *ExitL = LI.getLoopFor(ExitBB)) {
+ ExitLoops.push_back(ExitL);
+ ExitsInLoops.push_back(ExitBB);
+ if (!ParentL || (ParentL != ExitL && ParentL->contains(ExitL)))
+ ParentL = ExitL;
+ }
+
+ // Recompute the blocks participating in this loop. This may be empty if it
+ // is no longer a loop.
+ auto LoopBlockSet = recomputeLoopBlockSet(L, LI);
+
+ // If we still have a loop, we need to re-set the loop's parent as the exit
+ // block set changing may have moved it within the loop nest. Note that this
+ // can only happen when this loop has a parent as it can only hoist the loop
+ // *up* the nest.
+ if (!LoopBlockSet.empty() && L.getParentLoop() != ParentL) {
+ // Remove this loop's (original) blocks from all of the intervening loops.
+ for (Loop *IL = L.getParentLoop(); IL != ParentL;
+ IL = IL->getParentLoop()) {
+ IL->getBlocksSet().erase(PH);
+ for (auto *BB : L.blocks())
+ IL->getBlocksSet().erase(BB);
+ llvm::erase_if(IL->getBlocksVector(), [&](BasicBlock *BB) {
+ return BB == PH || L.contains(BB);
+ });
+ }
+
+ LI.changeLoopFor(PH, ParentL);
+ L.getParentLoop()->removeChildLoop(&L);
+ if (ParentL)
+ ParentL->addChildLoop(&L);
+ else
+ LI.addTopLevelLoop(&L);
+ }
+
+ // Now we update all the blocks which are no longer within the loop.
+ auto &Blocks = L.getBlocksVector();
+ auto BlocksSplitI =
+ LoopBlockSet.empty()
+ ? Blocks.begin()
+ : std::stable_partition(
+ Blocks.begin(), Blocks.end(),
+ [&](BasicBlock *BB) { return LoopBlockSet.count(BB); });
+
+ // Before we erase the list of unlooped blocks, build a set of them.
+ SmallPtrSet<BasicBlock *, 16> UnloopedBlocks(BlocksSplitI, Blocks.end());
+ if (LoopBlockSet.empty())
+ UnloopedBlocks.insert(PH);
+
+ // Now erase these blocks from the loop.
+ for (auto *BB : make_range(BlocksSplitI, Blocks.end()))
+ L.getBlocksSet().erase(BB);
+ Blocks.erase(BlocksSplitI, Blocks.end());
+
+ // Sort the exits in ascending loop depth; we'll work backwards across these
+ // to process them inside out.
+ llvm::stable_sort(ExitsInLoops, [&](BasicBlock *LHS, BasicBlock *RHS) {
+ return LI.getLoopDepth(LHS) < LI.getLoopDepth(RHS);
+ });
+
+ // We'll build up a set for each exit loop.
+ SmallPtrSet<BasicBlock *, 16> NewExitLoopBlocks;
+ Loop *PrevExitL = L.getParentLoop(); // The deepest possible exit loop.
+
+ auto RemoveUnloopedBlocksFromLoop =
+ [](Loop &L, SmallPtrSetImpl<BasicBlock *> &UnloopedBlocks) {
+ for (auto *BB : UnloopedBlocks)
+ L.getBlocksSet().erase(BB);
+ llvm::erase_if(L.getBlocksVector(), [&](BasicBlock *BB) {
+ return UnloopedBlocks.count(BB);
+ });
+ };
+
+ SmallVector<BasicBlock *, 16> Worklist;
+ while (!UnloopedBlocks.empty() && !ExitsInLoops.empty()) {
+ assert(Worklist.empty() && "Didn't clear worklist!");
+ assert(NewExitLoopBlocks.empty() && "Didn't clear loop set!");
+
+ // Grab the next exit block, in decreasing loop depth order.
+ BasicBlock *ExitBB = ExitsInLoops.pop_back_val();
+ Loop &ExitL = *LI.getLoopFor(ExitBB);
+ assert(ExitL.contains(&L) && "Exit loop must contain the inner loop!");
+
+ // Erase all of the unlooped blocks from the loops between the previous
+ // exit loop and this exit loop. This works because the ExitsInLoops list is
+ // sorted in increasing order of loop depth and thus we visit loops in
+ // decreasing order of loop depth.
+ for (; PrevExitL != &ExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+
+ // Walk the CFG back until we hit the cloned PH adding everything reachable
+ // and in the unlooped set to this exit block's loop.
+ Worklist.push_back(ExitBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ // We can stop recursing at the cloned preheader (if we get there).
+ if (BB == PH)
+ continue;
+
+ for (BasicBlock *PredBB : predecessors(BB)) {
+ // If this pred has already been moved to our set or is part of some
+ // (inner) loop, no update needed.
+ if (!UnloopedBlocks.erase(PredBB)) {
+ assert((NewExitLoopBlocks.count(PredBB) ||
+ ExitL.contains(LI.getLoopFor(PredBB))) &&
+ "Predecessor not in a nested loop (or already visited)!");
+ continue;
+ }
+
+ // We just insert into the loop set here. We'll add these blocks to the
+ // exit loop after we build up the set in a deterministic order rather
+ // than the predecessor-influenced visit order.
+ bool Inserted = NewExitLoopBlocks.insert(PredBB).second;
+ (void)Inserted;
+ assert(Inserted && "Should only visit an unlooped block once!");
+
+ // And recurse through to its predecessors.
+ Worklist.push_back(PredBB);
+ }
+ } while (!Worklist.empty());
+
+ // If blocks in this exit loop were directly part of the original loop (as
+ // opposed to a child loop) update the map to point to this exit loop. This
+ // just updates a map and so the fact that the order is unstable is fine.
+ for (auto *BB : NewExitLoopBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, &ExitL);
+
+ // We will remove the remaining unlooped blocks from this loop in the next
+ // iteration or below.
+ NewExitLoopBlocks.clear();
+ }
+
+ // Any remaining unlooped blocks are no longer part of any loop unless they
+ // are part of some child loop.
+ for (; PrevExitL; PrevExitL = PrevExitL->getParentLoop())
+ RemoveUnloopedBlocksFromLoop(*PrevExitL, UnloopedBlocks);
+ for (auto *BB : UnloopedBlocks)
+ if (Loop *BBL = LI.getLoopFor(BB))
+ if (BBL == &L || !L.contains(BBL))
+ LI.changeLoopFor(BB, nullptr);
+
+ // Sink all the child loops whose headers are no longer in the loop set to
+ // the parent (or to be top level loops). We reach into the loop and directly
+ // update its subloop vector to make this batch update efficient.
+ auto &SubLoops = L.getSubLoopsVector();
+ auto SubLoopsSplitI =
+ LoopBlockSet.empty()
+ ? SubLoops.begin()
+ : std::stable_partition(
+ SubLoops.begin(), SubLoops.end(), [&](Loop *SubL) {
+ return LoopBlockSet.count(SubL->getHeader());
+ });
+ for (auto *HoistedL : make_range(SubLoopsSplitI, SubLoops.end())) {
+ HoistedLoops.push_back(HoistedL);
+ HoistedL->setParentLoop(nullptr);
+
+ // To compute the new parent of this hoisted loop we look at where we
+ // placed the preheader above. We can't look up the header itself because we
+ // retained the mapping from the header to the hoisted loop. But the
+ // preheader and header should have the exact same new parent computed
+ // based on the set of exit blocks from the original loop as the preheader
+ // is a predecessor of the header and so reached in the reverse walk. And
+ // because the loops were all in simplified form the preheader of the
+ // hoisted loop can't be part of some *other* loop.
+ if (auto *NewParentL = LI.getLoopFor(HoistedL->getLoopPreheader()))
+ NewParentL->addChildLoop(HoistedL);
+ else
+ LI.addTopLevelLoop(HoistedL);
+ }
+ SubLoops.erase(SubLoopsSplitI, SubLoops.end());
+
+ // Actually delete the loop if nothing remained within it.
+ if (Blocks.empty()) {
+ assert(SubLoops.empty() &&
+ "Failed to remove all subloops from the original loop!");
+ if (Loop *ParentL = L.getParentLoop())
+ ParentL->removeChildLoop(llvm::find(*ParentL, &L));
+ else
+ LI.removeLoop(llvm::find(LI, &L));
+ LI.destroy(&L);
+ return false;
+ }
+
+ return true;
+}
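+
+// A short worked example of the hoisting step above (the loop names C1 and C2
+// are hypothetical): suppose the original loop L had child loops C1 and C2,
+// and after unswitching only C1's header is still in the recomputed block
+// set. The stable_partition over the subloop vector keeps C1 as a child of L,
+// while C2 is appended to HoistedLoops, detached from L, and re-parented to
+// whatever loop now contains its preheader (or registered as a top-level loop
+// if the preheader is no longer inside any loop).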
+
+/// Helper to visit a dominator subtree, invoking a callable on each node.
+///
+/// Returning false at any point will stop walking past that node of the tree.
+template <typename CallableT>
+void visitDomSubTree(DominatorTree &DT, BasicBlock *BB, CallableT Callable) {
+ SmallVector<DomTreeNode *, 4> DomWorklist;
+ DomWorklist.push_back(DT[BB]);
+#ifndef NDEBUG
+ SmallPtrSet<DomTreeNode *, 4> Visited;
+ Visited.insert(DT[BB]);
+#endif
+ do {
+ DomTreeNode *N = DomWorklist.pop_back_val();
+
+ // Visit this node.
+ if (!Callable(N->getBlock()))
+ continue;
+
+ // Accumulate the child nodes.
+ for (DomTreeNode *ChildN : *N) {
+ assert(Visited.insert(ChildN).second &&
+ "Cannot visit a node twice when walking a tree!");
+ DomWorklist.push_back(ChildN);
+ }
+ } while (!DomWorklist.empty());
+}
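+
+// A minimal usage sketch for the helper above (DT, BB and L stand in for
+// caller-provided state; `Dominated` is a hypothetical result set):
+//
+//   SmallPtrSet<BasicBlock *, 8> Dominated;
+//   visitDomSubTree(DT, BB, [&](BasicBlock *DomBB) {
+//     if (!L.contains(DomBB))
+//       return false; // Prune the walk below blocks outside the loop.
+//     Dominated.insert(DomBB);
+//     return true;
+//   });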
+
+static void unswitchNontrivialInvariants(
+ Loop &L, Instruction &TI, ArrayRef<Value *> Invariants,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ auto *ParentBB = TI.getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ SwitchInst *SI = BI ? nullptr : cast<SwitchInst>(&TI);
+
+ // We can only unswitch switches, conditional branches with an invariant
+ // condition, or combining invariant conditions with an instruction.
+ assert((SI || (BI && BI->isConditional())) &&
+ "Can only unswitch switches and conditional branch!");
+ bool FullUnswitch = SI || BI->getCondition() == Invariants[0];
+ if (FullUnswitch)
+ assert(Invariants.size() == 1 &&
+ "Cannot have other invariants with full unswitching!");
+ else
+ assert(isa<Instruction>(BI->getCondition()) &&
+ "Partial unswitching requires an instruction as the condition!");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Constant and BBs tracking the cloned and continuing successor. When we are
+ // unswitching the entire condition, this can just be trivially chosen to
+ // unswitch towards `true`. However, when we are unswitching a set of
+ // invariants combined with `and` or `or`, the combining operation determines
+ // the best direction to unswitch: we want to unswitch the direction that will
+ // collapse the branch.
+ bool Direction = true;
+ int ClonedSucc = 0;
+ if (!FullUnswitch) {
+ if (cast<Instruction>(BI->getCondition())->getOpcode() != Instruction::Or) {
+ assert(cast<Instruction>(BI->getCondition())->getOpcode() ==
+ Instruction::And &&
+ "Only `or` and `and` instructions can combine invariants being "
+ "unswitched.");
+ Direction = false;
+ ClonedSucc = 1;
+ }
+ }
+
+ BasicBlock *RetainedSuccBB =
+ BI ? BI->getSuccessor(1 - ClonedSucc) : SI->getDefaultDest();
+ SmallSetVector<BasicBlock *, 4> UnswitchedSuccBBs;
+ if (BI)
+ UnswitchedSuccBBs.insert(BI->getSuccessor(ClonedSucc));
+ else
+ for (auto Case : SI->cases())
+ if (Case.getCaseSuccessor() != RetainedSuccBB)
+ UnswitchedSuccBBs.insert(Case.getCaseSuccessor());
+
+ assert(!UnswitchedSuccBBs.count(RetainedSuccBB) &&
+ "Should not unswitch the same successor we are retaining!");
+
+ // The branch should be in this exact loop. Any inner loop's invariant branch
+ // should be handled by unswitching that inner loop. The caller of this
+ // routine should filter out any candidates that remain (but were skipped for
+ // whatever reason).
+ assert(LI.getLoopFor(ParentBB) == &L && "Branch in an inner loop!");
+
+ // Compute the parent loop now before we start hacking on things.
+ Loop *ParentL = L.getParentLoop();
+ // Get blocks in RPO order for MSSA update, before changing the CFG.
+ LoopBlocksRPO LBRPO(&L);
+ if (MSSAU)
+ LBRPO.perform(&LI);
+
+ // Compute the outer-most loop containing one of our exit blocks. This is the
+ // furthest up our loop nest which can be mutated, which we will use below to
+ // update things.
+ Loop *OuterExitL = &L;
+ for (auto *ExitBB : ExitBlocks) {
+ Loop *NewOuterExitL = LI.getLoopFor(ExitBB);
+ if (!NewOuterExitL) {
+ // We exited the entire nest with this block, so we're done.
+ OuterExitL = nullptr;
+ break;
+ }
+ if (NewOuterExitL != OuterExitL && NewOuterExitL->contains(OuterExitL))
+ OuterExitL = NewOuterExitL;
+ }
+
+ // At this point, we're definitely going to unswitch something so invalidate
+ // any cached information in ScalarEvolution for the outermost loop
+ // containing an exit block and all nested loops.
+ if (SE) {
+ if (OuterExitL)
+ SE->forgetLoop(OuterExitL);
+ else
+ SE->forgetTopmostLoop(&L);
+ }
+
+ // If the edge from this terminator to a successor dominates that successor,
+ // store a map from each block in its dominator subtree to it. This lets us
+ // tell when cloning for a particular successor if a block is dominated by
+ // some *other* successor with a single data structure. We use this to
+ // significantly reduce cloning.
+ SmallDenseMap<BasicBlock *, BasicBlock *, 16> DominatingSucc;
+ for (auto *SuccBB : llvm::concat<BasicBlock *const>(
+ makeArrayRef(RetainedSuccBB), UnswitchedSuccBBs))
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == ParentBB || DT.dominates(SuccBB, PredBB);
+ }))
+ visitDomSubTree(DT, SuccBB, [&](BasicBlock *BB) {
+ DominatingSucc[BB] = SuccBB;
+ return true;
+ });
+
+ // Split the preheader, so that we know that there is a safe place to insert
+ // the conditional branch. We will change the preheader to have a conditional
+ // branch on LoopCond. The original preheader will become the split point
+ // between the unswitched versions, and we will have a new preheader for the
+ // original loop.
+ BasicBlock *SplitBB = L.getLoopPreheader();
+ BasicBlock *LoopPH = SplitEdge(SplitBB, L.getHeader(), &DT, &LI, MSSAU);
+
+ // Keep track of the dominator tree updates needed.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+
+ // Clone the loop for each unswitched successor.
+ SmallVector<std::unique_ptr<ValueToValueMapTy>, 4> VMaps;
+ VMaps.reserve(UnswitchedSuccBBs.size());
+ SmallDenseMap<BasicBlock *, BasicBlock *, 4> ClonedPHs;
+ for (auto *SuccBB : UnswitchedSuccBBs) {
+ VMaps.emplace_back(new ValueToValueMapTy());
+ ClonedPHs[SuccBB] = buildClonedLoopBlocks(
+ L, LoopPH, SplitBB, ExitBlocks, ParentBB, SuccBB, RetainedSuccBB,
+ DominatingSucc, *VMaps.back(), DTUpdates, AC, DT, LI, MSSAU);
+ }
+
// Drop metadata if we may break its semantics by moving this instr into the
// split block.
if (TI.getMetadata(LLVMContext::MD_make_implicit)) {
@@ -2107,967 +2107,967 @@ static void unswitchNontrivialInvariants(
}
}
- // The stitching of the branched code back together depends on whether we're
- // doing full unswitching or not with the exception that we always want to
- // nuke the initial terminator placed in the split block.
- SplitBB->getTerminator()->eraseFromParent();
- if (FullUnswitch) {
- // Splice the terminator from the original loop and rewrite its
- // successors.
- SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
-
- // Keep a clone of the terminator for MSSA updates.
- Instruction *NewTI = TI.clone();
- ParentBB->getInstList().push_back(NewTI);
-
- // First wire up the moved terminator to the preheaders.
- if (BI) {
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
- BI->setSuccessor(ClonedSucc, ClonedPH);
- BI->setSuccessor(1 - ClonedSucc, LoopPH);
- DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
- } else {
- assert(SI && "Must either be a branch or switch!");
-
- // Walk the cases and directly update their successors.
- assert(SI->getDefaultDest() == RetainedSuccBB &&
- "Not retaining default successor!");
- SI->setDefaultDest(LoopPH);
- for (auto &Case : SI->cases())
- if (Case.getCaseSuccessor() == RetainedSuccBB)
- Case.setSuccessor(LoopPH);
- else
- Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
-
- // We need to use the set to populate domtree updates as even when there
- // are multiple cases pointing at the same successor we only want to
- // remove and insert one edge in the domtree.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- DTUpdates.push_back(
- {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
- }
-
- if (MSSAU) {
- DT.applyUpdates(DTUpdates);
- DTUpdates.clear();
-
- // Remove all but one edge to the retained block and all unswitched
- // blocks. This is to avoid having duplicate entries in the cloned Phis,
- // when we know we only keep a single edge for each case.
- MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);
-
- for (auto &VMap : VMaps)
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
- /*IgnoreIncomingWithNoClones=*/true);
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
-
- // Remove all edges to unswitched blocks.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- MSSAU->removeEdge(ParentBB, SuccBB);
- }
-
- // Now unhook the successor relationship as we'll be replacing
- // the terminator with a direct branch. This is much simpler for branches
- // than switches so we handle those first.
- if (BI) {
- // Remove the parent as a predecessor of the unswitched successor.
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
- UnswitchedSuccBB->removePredecessor(ParentBB,
- /*KeepOneInputPHIs*/ true);
- DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
- } else {
- // Note that we actually want to remove the parent block as a predecessor
- // of *every* case successor. The case successor is either unswitched,
- // completely eliminating an edge from the parent to that successor, or it
- // is a duplicate edge to the retained successor as the retained successor
- // is always the default successor and as we'll replace this with a direct
- // branch we no longer need the duplicate entries in the PHI nodes.
- SwitchInst *NewSI = cast<SwitchInst>(NewTI);
- assert(NewSI->getDefaultDest() == RetainedSuccBB &&
- "Not retaining default successor!");
- for (auto &Case : NewSI->cases())
- Case.getCaseSuccessor()->removePredecessor(
- ParentBB,
- /*KeepOneInputPHIs*/ true);
-
- // We need to use the set to populate domtree updates as even when there
- // are multiple cases pointing at the same successor we only want to
- // remove and insert one edge in the domtree.
- for (BasicBlock *SuccBB : UnswitchedSuccBBs)
- DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
- }
-
- // After MSSAU update, remove the cloned terminator instruction NewTI.
- ParentBB->getTerminator()->eraseFromParent();
-
- // Create a new unconditional branch to the continuing block (as opposed to
- // the one cloned).
- BranchInst::Create(RetainedSuccBB, ParentBB);
- } else {
- assert(BI && "Only branches have partial unswitching.");
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
- // When doing a partial unswitch, we have to do a bit more work to build up
- // the branch in the split block.
- buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
- *ClonedPH, *LoopPH);
- DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
-
- if (MSSAU) {
- DT.applyUpdates(DTUpdates);
- DTUpdates.clear();
-
- // Perform MSSA cloning updates.
- for (auto &VMap : VMaps)
- MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
- /*IgnoreIncomingWithNoClones=*/true);
- MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
- }
- }
-
- // Apply the updates accumulated above to get an up-to-date dominator tree.
- DT.applyUpdates(DTUpdates);
-
- // Now that we have an accurate dominator tree, first delete the dead cloned
- // blocks so that we can accurately build any cloned loops. It is important to
- // not delete the blocks from the original loop yet because we still want to
- // reference the original loop to understand the cloned loop's structure.
- deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);
-
- // Build the cloned loop structure itself. This may be substantially
- // different from the original structure due to the simplified CFG. This also
- // handles inserting all the cloned blocks into the correct loops.
- SmallVector<Loop *, 4> NonChildClonedLoops;
- for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
- buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
-
- // Now that our cloned loops have been built, we can update the original loop.
- // First we delete the dead blocks from it and then we rebuild the loop
- // structure taking these deletions into account.
- deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- SmallVector<Loop *, 4> HoistedLoops;
- bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // This transformation has a high risk of corrupting the dominator tree, and
- // the below steps to rebuild loop structures will result in hard to debug
- // errors in that case so verify that the dominator tree is sane first.
- // FIXME: Remove this when the bugs stop showing up and rely on existing
- // verification steps.
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- if (BI) {
- // If we unswitched a branch which collapses the condition to a known
- // constant we want to replace all the uses of the invariants within both
- // the original and cloned blocks. We do this here so that we can use the
- // now updated dominator tree to identify which side the users are on.
- assert(UnswitchedSuccBBs.size() == 1 &&
- "Only one possible unswitched block for a branch!");
- BasicBlock *ClonedPH = ClonedPHs.begin()->second;
-
- // When considering multiple partially-unswitched invariants
- // we can't just replace them with constants in both branches.
- //
- // For 'AND' we infer that true branch ("continue") means true
- // for each invariant operand.
- // For 'OR' we can infer that false branch ("continue") means false
- // for each invariant operand.
- // So it happens that in the multiple-partial case we don't replace
- // in the unswitched branch.
- bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
-
- ConstantInt *UnswitchedReplacement =
- Direction ? ConstantInt::getTrue(BI->getContext())
- : ConstantInt::getFalse(BI->getContext());
- ConstantInt *ContinueReplacement =
- Direction ? ConstantInt::getFalse(BI->getContext())
- : ConstantInt::getTrue(BI->getContext());
- for (Value *Invariant : Invariants)
- for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
- UI != UE;) {
- // Grab the use and walk past it so we can clobber it in the use list.
- Use *U = &*UI++;
- Instruction *UserI = dyn_cast<Instruction>(U->getUser());
- if (!UserI)
- continue;
-
- // Replace it with the 'continue' side if in the main loop body, and the
- // unswitched if in the cloned blocks.
- if (DT.dominates(LoopPH, UserI->getParent()))
- U->set(ContinueReplacement);
- else if (ReplaceUnswitched &&
- DT.dominates(ClonedPH, UserI->getParent()))
- U->set(UnswitchedReplacement);
- }
- }
-
- // We can change which blocks are exit blocks of all the cloned sibling
- // loops, the current loop, and any parent loops which shared exit blocks
- // with the current loop. As a consequence, we need to re-form LCSSA for
- // them. But we shouldn't need to re-form LCSSA for any child loops.
- // FIXME: This could be made more efficient by tracking which exit blocks are
- // new, and focusing on them, but that isn't likely to be necessary.
- //
- // In order to reasonably rebuild LCSSA we need to walk inside-out across the
- // loop nest and update every loop that could have had its exits changed. We
- // also need to cover any intervening loops. We add all of these loops to
- // a list and sort them by loop depth to achieve this without updating
- // unnecessary loops.
- auto UpdateLoop = [&](Loop &UpdateL) {
-#ifndef NDEBUG
- UpdateL.verifyLoop();
- for (Loop *ChildL : UpdateL) {
- ChildL->verifyLoop();
- assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
- "Perturbed a child loop's LCSSA form!");
- }
-#endif
- // First build LCSSA for this loop so that we can preserve it when
- // forming dedicated exits. We don't want to perturb some other loop's
- // LCSSA while doing that CFG edit.
- formLCSSA(UpdateL, DT, &LI, SE);
-
- // For loops reached by this loop's original exit blocks we may have
- // introduced new, non-dedicated exits. At least try to re-form dedicated
- // exits for these loops. This may fail if they couldn't have dedicated
- // exits to start with.
- formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true);
- };
-
- // For non-child cloned loops and hoisted loops, we just need to update LCSSA
- // and we can do it in any order as they don't nest relative to each other.
- //
- // Also check if any of the loops we have updated have become top-level loops
- // as that will necessitate widening the outer loop scope.
- for (Loop *UpdatedL :
- llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
- UpdateLoop(*UpdatedL);
+ // The stitching of the branched code back together depends on whether we're
+ // doing full unswitching or not with the exception that we always want to
+ // nuke the initial terminator placed in the split block.
+ SplitBB->getTerminator()->eraseFromParent();
+ if (FullUnswitch) {
+ // Splice the terminator from the original loop and rewrite its
+ // successors.
+ SplitBB->getInstList().splice(SplitBB->end(), ParentBB->getInstList(), TI);
+
+ // Keep a clone of the terminator for MSSA updates.
+ Instruction *NewTI = TI.clone();
+ ParentBB->getInstList().push_back(NewTI);
+
+ // First wire up the moved terminator to the preheaders.
+ if (BI) {
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ BI->setSuccessor(ClonedSucc, ClonedPH);
+ BI->setSuccessor(1 - ClonedSucc, LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+ } else {
+ assert(SI && "Must either be a branch or switch!");
+
+ // Walk the cases and directly update their successors.
+ assert(SI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ SI->setDefaultDest(LoopPH);
+ for (auto &Case : SI->cases())
+ if (Case.getCaseSuccessor() == RetainedSuccBB)
+ Case.setSuccessor(LoopPH);
+ else
+ Case.setSuccessor(ClonedPHs.find(Case.getCaseSuccessor())->second);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back(
+ {DominatorTree::Insert, SplitBB, ClonedPHs.find(SuccBB)->second});
+ }
+
+ if (MSSAU) {
+ DT.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // Remove all but one edge to the retained block and all unswitched
+ // blocks. This is to avoid having duplicate entries in the cloned Phis,
+ // when we know we only keep a single edge for each case.
+ MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, RetainedSuccBB);
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ MSSAU->removeDuplicatePhiEdgesBetween(ParentBB, SuccBB);
+
+ for (auto &VMap : VMaps)
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+ /*IgnoreIncomingWithNoClones=*/true);
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+
+ // Remove all edges to unswitched blocks.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ MSSAU->removeEdge(ParentBB, SuccBB);
+ }
+
+ // Now unhook the successor relationship as we'll be replacing
+ // the terminator with a direct branch. This is much simpler for branches
+ // than switches so we handle those first.
+ if (BI) {
+ // Remove the parent as a predecessor of the unswitched successor.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *UnswitchedSuccBB = *UnswitchedSuccBBs.begin();
+ UnswitchedSuccBB->removePredecessor(ParentBB,
+ /*KeepOneInputPHIs*/ true);
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, UnswitchedSuccBB});
+ } else {
+ // Note that we actually want to remove the parent block as a predecessor
+ // of *every* case successor. The case successor is either unswitched,
+ // completely eliminating an edge from the parent to that successor, or it
+ // is a duplicate edge to the retained successor as the retained successor
+ // is always the default successor and as we'll replace this with a direct
+ // branch we no longer need the duplicate entries in the PHI nodes.
+ SwitchInst *NewSI = cast<SwitchInst>(NewTI);
+ assert(NewSI->getDefaultDest() == RetainedSuccBB &&
+ "Not retaining default successor!");
+ for (auto &Case : NewSI->cases())
+ Case.getCaseSuccessor()->removePredecessor(
+ ParentBB,
+ /*KeepOneInputPHIs*/ true);
+
+ // We need to use the set to populate domtree updates as even when there
+ // are multiple cases pointing at the same successor we only want to
+ // remove and insert one edge in the domtree.
+ for (BasicBlock *SuccBB : UnswitchedSuccBBs)
+ DTUpdates.push_back({DominatorTree::Delete, ParentBB, SuccBB});
+ }
+
+ // After MSSAU update, remove the cloned terminator instruction NewTI.
+ ParentBB->getTerminator()->eraseFromParent();
+
+ // Create a new unconditional branch to the continuing block (as opposed to
+ // the one cloned).
+ BranchInst::Create(RetainedSuccBB, ParentBB);
+ } else {
+ assert(BI && "Only branches have partial unswitching.");
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+ // When doing a partial unswitch, we have to do a bit more work to build up
+ // the branch in the split block.
+ buildPartialUnswitchConditionalBranch(*SplitBB, Invariants, Direction,
+ *ClonedPH, *LoopPH);
+ DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH});
+
+ if (MSSAU) {
+ DT.applyUpdates(DTUpdates);
+ DTUpdates.clear();
+
+ // Perform MSSA cloning updates.
+ for (auto &VMap : VMaps)
+ MSSAU->updateForClonedLoop(LBRPO, ExitBlocks, *VMap,
+ /*IgnoreIncomingWithNoClones=*/true);
+ MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMaps, DT);
+ }
+ }
+
+ // Apply the updates accumulated above to get an up-to-date dominator tree.
+ DT.applyUpdates(DTUpdates);
+
+ // Now that we have an accurate dominator tree, first delete the dead cloned
+ // blocks so that we can accurately build any cloned loops. It is important to
+ // not delete the blocks from the original loop yet because we still want to
+ // reference the original loop to understand the cloned loop's structure.
+ deleteDeadClonedBlocks(L, ExitBlocks, VMaps, DT, MSSAU);
+
+ // Build the cloned loop structure itself. This may be substantially
+ // different from the original structure due to the simplified CFG. This also
+ // handles inserting all the cloned blocks into the correct loops.
+ SmallVector<Loop *, 4> NonChildClonedLoops;
+ for (std::unique_ptr<ValueToValueMapTy> &VMap : VMaps)
+ buildClonedLoops(L, ExitBlocks, *VMap, LI, NonChildClonedLoops);
+
+ // Now that our cloned loops have been built, we can update the original loop.
+ // First we delete the dead blocks from it and then we rebuild the loop
+ // structure taking these deletions into account.
+ deleteDeadBlocksFromLoop(L, ExitBlocks, DT, LI, MSSAU);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ SmallVector<Loop *, 4> HoistedLoops;
+ bool IsStillLoop = rebuildLoopAfterUnswitch(L, ExitBlocks, LI, HoistedLoops);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // This transformation has a high risk of corrupting the dominator tree, and
+ // the below steps to rebuild loop structures will result in hard to debug
+ // errors in that case so verify that the dominator tree is sane first.
+ // FIXME: Remove this when the bugs stop showing up and rely on existing
+ // verification steps.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ if (BI) {
+ // If we unswitched a branch which collapses the condition to a known
+ // constant we want to replace all the uses of the invariants within both
+ // the original and cloned blocks. We do this here so that we can use the
+ // now updated dominator tree to identify which side the users are on.
+ assert(UnswitchedSuccBBs.size() == 1 &&
+ "Only one possible unswitched block for a branch!");
+ BasicBlock *ClonedPH = ClonedPHs.begin()->second;
+
+ // When considering multiple partially-unswitched invariants
+ // we can't just replace them with constants in both branches.
+ //
+ // For 'AND' we infer that true branch ("continue") means true
+ // for each invariant operand.
+ // For 'OR' we can infer that false branch ("continue") means false
+ // for each invariant operand.
+ // So it happens that in the multiple-partial case we don't replace
+ // in the unswitched branch.
+ bool ReplaceUnswitched = FullUnswitch || (Invariants.size() == 1);
+
+ ConstantInt *UnswitchedReplacement =
+ Direction ? ConstantInt::getTrue(BI->getContext())
+ : ConstantInt::getFalse(BI->getContext());
+ ConstantInt *ContinueReplacement =
+ Direction ? ConstantInt::getFalse(BI->getContext())
+ : ConstantInt::getTrue(BI->getContext());
+ for (Value *Invariant : Invariants)
+ for (auto UI = Invariant->use_begin(), UE = Invariant->use_end();
+ UI != UE;) {
+ // Grab the use and walk past it so we can clobber it in the use list.
+ Use *U = &*UI++;
+ Instruction *UserI = dyn_cast<Instruction>(U->getUser());
+ if (!UserI)
+ continue;
+
+ // Replace it with the 'continue' side if in the main loop body, and the
+ // unswitched if in the cloned blocks.
+ if (DT.dominates(LoopPH, UserI->getParent()))
+ U->set(ContinueReplacement);
+ else if (ReplaceUnswitched &&
+ DT.dominates(ClonedPH, UserI->getParent()))
+ U->set(UnswitchedReplacement);
+ }
+ }
+
+ // We can change which blocks are exit blocks of all the cloned sibling
+ // loops, the current loop, and any parent loops which shared exit blocks
+ // with the current loop. As a consequence, we need to re-form LCSSA for
+ // them. But we shouldn't need to re-form LCSSA for any child loops.
+ // FIXME: This could be made more efficient by tracking which exit blocks are
+ // new, and focusing on them, but that isn't likely to be necessary.
+ //
+ // In order to reasonably rebuild LCSSA we need to walk inside-out across the
+ // loop nest and update every loop that could have had its exits changed. We
+ // also need to cover any intervening loops. We add all of these loops to
+ // a list and sort them by loop depth to achieve this without updating
+ // unnecessary loops.
+ auto UpdateLoop = [&](Loop &UpdateL) {
+#ifndef NDEBUG
+ UpdateL.verifyLoop();
+ for (Loop *ChildL : UpdateL) {
+ ChildL->verifyLoop();
+ assert(ChildL->isRecursivelyLCSSAForm(DT, LI) &&
+ "Perturbed a child loop's LCSSA form!");
+ }
+#endif
+ // First build LCSSA for this loop so that we can preserve it when
+ // forming dedicated exits. We don't want to perturb some other loop's
+ // LCSSA while doing that CFG edit.
+ formLCSSA(UpdateL, DT, &LI, SE);
+
+ // For loops reached by this loop's original exit blocks we may have
+ // introduced new, non-dedicated exits. At least try to re-form dedicated
+ // exits for these loops. This may fail if they couldn't have dedicated
+ // exits to start with.
+ formDedicatedExitBlocks(&UpdateL, &DT, &LI, MSSAU, /*PreserveLCSSA*/ true);
+ };
+
+ // For non-child cloned loops and hoisted loops, we just need to update LCSSA
+ // and we can do it in any order as they don't nest relative to each other.
+ //
+ // Also check if any of the loops we have updated have become top-level loops
+ // as that will necessitate widening the outer loop scope.
+ for (Loop *UpdatedL :
+ llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops)) {
+ UpdateLoop(*UpdatedL);
if (UpdatedL->isOutermost())
- OuterExitL = nullptr;
- }
- if (IsStillLoop) {
- UpdateLoop(L);
+ OuterExitL = nullptr;
+ }
+ if (IsStillLoop) {
+ UpdateLoop(L);
if (L.isOutermost())
- OuterExitL = nullptr;
- }
-
- // If the original loop had exit blocks, walk up through the outermost loop
- // of those exit blocks to update LCSSA and form updated dedicated exits.
- if (OuterExitL != &L)
- for (Loop *OuterL = ParentL; OuterL != OuterExitL;
- OuterL = OuterL->getParentLoop())
- UpdateLoop(*OuterL);
-
-#ifndef NDEBUG
- // Verify the entire loop structure to catch any incorrect updates before we
- // progress in the pass pipeline.
- LI.verify(DT);
-#endif
-
- // Now that we've unswitched something, make callbacks to report the changes.
- // For that we need to merge together the updated loops and the cloned loops
- // and check whether the original loop survived.
- SmallVector<Loop *, 4> SibLoops;
- for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
- if (UpdatedL->getParentLoop() == ParentL)
- SibLoops.push_back(UpdatedL);
- UnswitchCB(IsStillLoop, SibLoops);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- if (BI)
- ++NumBranches;
- else
- ++NumSwitches;
-}
-
-/// Recursively compute the cost of a dominator subtree based on the per-block
-/// cost map provided.
-///
-/// The recursive computation is memoized into the provided DT-indexed cost map
-/// to allow querying it for most nodes in the domtree without it becoming
-/// quadratic.
-static int
-computeDomSubtreeCost(DomTreeNode &N,
- const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
- SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
- // Don't accumulate cost (or recurse through) blocks not in our block cost
- // map and thus not part of the duplication cost being considered.
- auto BBCostIt = BBCostMap.find(N.getBlock());
- if (BBCostIt == BBCostMap.end())
- return 0;
-
- // Lookup this node to see if we already computed its cost.
- auto DTCostIt = DTCostMap.find(&N);
- if (DTCostIt != DTCostMap.end())
- return DTCostIt->second;
-
- // If not, we have to compute it. We can't use insert above and update
- // because computing the cost may insert more things into the map.
- int Cost = std::accumulate(
- N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
- return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
- });
- bool Inserted = DTCostMap.insert({&N, Cost}).second;
- (void)Inserted;
- assert(Inserted && "Should not insert a node while visiting children!");
- return Cost;
-}
-
-/// Turns a llvm.experimental.guard intrinsic into an implicit control flow branch,
-/// making the following replacement:
-///
-/// --code before guard--
-/// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
-/// --code after guard--
-///
-/// into
-///
-/// --code before guard--
-/// br i1 %cond, label %guarded, label %deopt
-///
-/// guarded:
-/// --code after guard--
-///
-/// deopt:
-/// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
-/// unreachable
-///
-/// It also makes all relevant DT and LI updates, so that all structures are in
-/// valid state after this transform.
-static BranchInst *
-turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
- SmallVectorImpl<BasicBlock *> &ExitBlocks,
- DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
- BasicBlock *CheckBB = GI->getParent();
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Remove all CheckBB's successors from DomTree. A block can be seen among
- // successors more than once, but for DomTree it should be added only once.
- SmallPtrSet<BasicBlock *, 4> Successors;
- for (auto *Succ : successors(CheckBB))
- if (Successors.insert(Succ).second)
- DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
-
- Instruction *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
- BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
- // SplitBlockAndInsertIfThen inserts control flow that branches to
- // DeoptBlockTerm if the condition is true. We want the opposite.
- CheckBI->swapSuccessors();
-
- BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
- GuardedBlock->setName("guarded");
- CheckBI->getSuccessor(1)->setName("deopt");
- BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
-
- // We now have a new exit block.
- ExitBlocks.push_back(CheckBI->getSuccessor(1));
-
- if (MSSAU)
- MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
-
- GI->moveBefore(DeoptBlockTerm);
- GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
-
- // Add new successors of CheckBB into DomTree.
- for (auto *Succ : successors(CheckBB))
- DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
-
- // Now the blocks that used to be CheckBB's successors are GuardedBlock's
- // successors.
- for (auto *Succ : Successors)
- DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
-
- // Make proper changes to DT.
- DT.applyUpdates(DTUpdates);
- // Inform LI of a new loop block.
- L.addBasicBlockToLoop(GuardedBlock, LI);
-
- if (MSSAU) {
- MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
- MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- }
-
- ++NumGuards;
- return CheckBI;
-}
-
-/// The cost multiplier is a way to limit the potentially exponential behavior
-/// of loop-unswitch. The cost is multiplied in proportion to 2^(number of
-/// unswitch candidates available). It also accounts for the number of
-/// "sibling" loops, with the idea of accounting for previous unswitches that
-/// already happened on this cluster of loops. There was an attempt to keep
-/// this formula simple, just enough to limit the worst-case behavior. Even if
-/// it is not that simple now, it is still not an attempt to provide a detailed
-/// heuristic size prediction.
-///
-/// TODO: Make a proper accounting of "explosion" effect for all kinds of
-/// unswitch candidates, making adequate predictions instead of wild guesses.
-/// That requires knowing not just the number of "remaining" candidates but
-/// also costs of unswitching for each of these candidates.
-static int CalculateUnswitchCostMultiplier(
- Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
- ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
- UnswitchCandidates) {
-
- // Guards and other exiting conditions do not contribute to exponential
- // explosion as soon as they dominate the latch (otherwise there might be
- // another path to the latch remaining that does not allow eliminating the
- // loop copy on unswitch).
- BasicBlock *Latch = L.getLoopLatch();
- BasicBlock *CondBlock = TI.getParent();
- if (DT.dominates(CondBlock, Latch) &&
- (isGuard(&TI) ||
- llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
- return L.contains(SuccBB);
- }) <= 1)) {
- NumCostMultiplierSkipped++;
- return 1;
- }
-
- auto *ParentL = L.getParentLoop();
- int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
- : std::distance(LI.begin(), LI.end()));
- // Count the number of clones that all the candidates might cause during
- // unswitching. A branch/guard counts as 1; a switch as log2 of its cases.
- int UnswitchedClones = 0;
- for (auto Candidate : UnswitchCandidates) {
- Instruction *CI = Candidate.first;
- BasicBlock *CondBlock = CI->getParent();
- bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
- if (isGuard(CI)) {
- if (!SkipExitingSuccessors)
- UnswitchedClones++;
- continue;
- }
- int NonExitingSuccessors = llvm::count_if(
- successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
- return !SkipExitingSuccessors || L.contains(SuccBB);
- });
- UnswitchedClones += Log2_32(NonExitingSuccessors);
- }
-
- // Ignore up to the "unscaled candidates" number of unswitch candidates
- // when calculating the power-of-two scaling of the cost. The main idea
- // with this control is to allow a small number of unswitches to happen
- // and rely more on siblings multiplier (see below) when the number
- // of candidates is small.
- unsigned ClonesPower =
- std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
-
- // Allowing top-level loops to spread a bit more than nested ones.
- int SiblingsMultiplier =
- std::max((ParentL ? SiblingsCount
- : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
- 1);
- // Compute the cost multiplier in a way that won't overflow by saturating
- // at an upper bound.
- int CostMultiplier;
- if (ClonesPower > Log2_32(UnswitchThreshold) ||
- SiblingsMultiplier > UnswitchThreshold)
- CostMultiplier = UnswitchThreshold;
- else
- CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
- (int)UnswitchThreshold);
-
- LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
- << " (siblings " << SiblingsMultiplier << " * clones "
- << (1 << ClonesPower) << ")"
- << " for unswitch candidate: " << TI << "\n");
- return CostMultiplier;
-}
-
-static bool
-unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, TargetTransformInfo &TTI,
- function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- // Collect all invariant conditions within this loop (as opposed to an inner
- // loop which would be handled when visiting that inner loop).
- SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
- UnswitchCandidates;
-
- // Whether or not we should also collect guards in the loop.
- bool CollectGuards = false;
- if (UnswitchGuards) {
- auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
- if (GuardDecl && !GuardDecl->use_empty())
- CollectGuards = true;
- }
-
- for (auto *BB : L.blocks()) {
- if (LI.getLoopFor(BB) != &L)
- continue;
-
- if (CollectGuards)
- for (auto &I : *BB)
- if (isGuard(&I)) {
- auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
- // TODO: Support AND, OR conditions and partial unswitching.
- if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
- UnswitchCandidates.push_back({&I, {Cond}});
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- // We can only consider fully loop-invariant switch conditions as we need
- // to completely eliminate the switch after unswitching.
- if (!isa<Constant>(SI->getCondition()) &&
- L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor())
- UnswitchCandidates.push_back({SI, {SI->getCondition()}});
- continue;
- }
-
- auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
- BI->getSuccessor(0) == BI->getSuccessor(1))
- continue;
-
- if (L.isLoopInvariant(BI->getCondition())) {
- UnswitchCandidates.push_back({BI, {BI->getCondition()}});
- continue;
- }
-
- Instruction &CondI = *cast<Instruction>(BI->getCondition());
- if (CondI.getOpcode() != Instruction::And &&
- CondI.getOpcode() != Instruction::Or)
- continue;
-
- TinyPtrVector<Value *> Invariants =
- collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
- if (Invariants.empty())
- continue;
-
- UnswitchCandidates.push_back({BI, std::move(Invariants)});
- }
-
- // If we didn't find any candidates, we're done.
- if (UnswitchCandidates.empty())
- return false;
-
- // Check if there are irreducible CFG cycles in this loop. If so, we cannot
- // easily unswitch non-trivial edges out of the loop. Doing so might turn the
- // irreducible control flow into reducible control flow and introduce new
- // loops "out of thin air". If we ever discover important use cases for doing
- // this, we can add support to loop unswitch, but it is a lot of complexity
- // for what seems little or no real world benefit.
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(&LI);
- if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
- return false;
-
- SmallVector<BasicBlock *, 4> ExitBlocks;
- L.getUniqueExitBlocks(ExitBlocks);
-
- // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
- // don't know how to split those exit blocks.
- // FIXME: We should teach SplitBlock to handle this and remove this
- // restriction.
- for (auto *ExitBB : ExitBlocks)
- if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
- dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
- return false;
- }
-
- LLVM_DEBUG(
- dbgs() << "Considering " << UnswitchCandidates.size()
- << " non-trivial loop invariant conditions for unswitching.\n");
-
- // Given that unswitching these terminators will require duplicating parts of
- // the loop, we need to be able to model that cost. Compute the ephemeral
- // values and set up a data structure to hold per-BB costs. We cache each
- // block's cost so that we don't recompute this when considering different
- // subsets of the loop for duplication during unswitching.
- SmallPtrSet<const Value *, 4> EphValues;
- CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
- SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
-
- // Compute the cost of each block, as well as the total loop cost. Also, bail
- // out if we see instructions which are incompatible with loop unswitching
- // (convergent, noduplicate, or cross-basic-block tokens).
- // FIXME: We might be able to safely handle some of these in non-duplicated
- // regions.
+ OuterExitL = nullptr;
+ }
+
+ // If the original loop had exit blocks, walk up through the outermost loop
+ // of those exit blocks to update LCSSA and form updated dedicated exits.
+ if (OuterExitL != &L)
+ for (Loop *OuterL = ParentL; OuterL != OuterExitL;
+ OuterL = OuterL->getParentLoop())
+ UpdateLoop(*OuterL);
+
+#ifndef NDEBUG
+ // Verify the entire loop structure to catch any incorrect updates before we
+ // progress in the pass pipeline.
+ LI.verify(DT);
+#endif
+
+ // Now that we've unswitched something, make callbacks to report the changes.
+ // For that we need to merge together the updated loops and the cloned loops
+ // and check whether the original loop survived.
+ SmallVector<Loop *, 4> SibLoops;
+ for (Loop *UpdatedL : llvm::concat<Loop *>(NonChildClonedLoops, HoistedLoops))
+ if (UpdatedL->getParentLoop() == ParentL)
+ SibLoops.push_back(UpdatedL);
+ UnswitchCB(IsStillLoop, SibLoops);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ if (BI)
+ ++NumBranches;
+ else
+ ++NumSwitches;
+}
+
+/// Recursively compute the cost of a dominator subtree based on the per-block
+/// cost map provided.
+///
+/// The recursive computation is memoized into the provided DT-indexed cost map
+/// to allow querying it for most nodes in the domtree without it becoming
+/// quadratic.
+static int
+computeDomSubtreeCost(DomTreeNode &N,
+ const SmallDenseMap<BasicBlock *, int, 4> &BBCostMap,
+ SmallDenseMap<DomTreeNode *, int, 4> &DTCostMap) {
+ // Don't accumulate cost (or recurse through) blocks not in our block cost
+ // map and thus not part of the duplication cost being considered.
+ auto BBCostIt = BBCostMap.find(N.getBlock());
+ if (BBCostIt == BBCostMap.end())
+ return 0;
+
+ // Lookup this node to see if we already computed its cost.
+ auto DTCostIt = DTCostMap.find(&N);
+ if (DTCostIt != DTCostMap.end())
+ return DTCostIt->second;
+
+ // If not, we have to compute it. We can't use insert above and update
+ // because computing the cost may insert more things into the map.
+ int Cost = std::accumulate(
+ N.begin(), N.end(), BBCostIt->second, [&](int Sum, DomTreeNode *ChildN) {
+ return Sum + computeDomSubtreeCost(*ChildN, BBCostMap, DTCostMap);
+ });
+ bool Inserted = DTCostMap.insert({&N, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Should not insert a node while visiting children!");
+ return Cost;
+}
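+
+// A tiny worked example (hypothetical blocks and costs): if BBCostMap holds
+// {A: 5, B: 3, C: 2} and A's domtree node has B and C as leaf children, then
+// computeDomSubtreeCost on A's node returns 5 + 3 + 2 = 10 and memoizes 10 in
+// DTCostMap, so a later query for the same subtree is a single map lookup
+// rather than another walk over B and C. Blocks missing from BBCostMap
+// contribute 0 and are not recursed into.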
+
+/// Turns a llvm.experimental.guard intrinsic into an implicit control flow branch,
+/// making the following replacement:
+///
+/// --code before guard--
+/// call void (i1, ...) @llvm.experimental.guard(i1 %cond) [ "deopt"() ]
+/// --code after guard--
+///
+/// into
+///
+/// --code before guard--
+/// br i1 %cond, label %guarded, label %deopt
+///
+/// guarded:
+/// --code after guard--
+///
+/// deopt:
+/// call void (i1, ...) @llvm.experimental.guard(i1 false) [ "deopt"() ]
+/// unreachable
+///
+/// It also makes all relevant DT and LI updates, so that all structures are in
+/// valid state after this transform.
+static BranchInst *
+turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
+ SmallVectorImpl<BasicBlock *> &ExitBlocks,
+ DominatorTree &DT, LoopInfo &LI, MemorySSAUpdater *MSSAU) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ LLVM_DEBUG(dbgs() << "Turning " << *GI << " into a branch.\n");
+ BasicBlock *CheckBB = GI->getParent();
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Remove all CheckBB's successors from DomTree. A block can be seen among
+ // successors more than once, but for DomTree it should be added only once.
+ SmallPtrSet<BasicBlock *, 4> Successors;
+ for (auto *Succ : successors(CheckBB))
+ if (Successors.insert(Succ).second)
+ DTUpdates.push_back({DominatorTree::Delete, CheckBB, Succ});
+
+ Instruction *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(GI->getArgOperand(0), GI, true);
+ BranchInst *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ BasicBlock *GuardedBlock = CheckBI->getSuccessor(0);
+ GuardedBlock->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+ BasicBlock *DeoptBlock = CheckBI->getSuccessor(1);
+
+ // We now have a new exit block.
+ ExitBlocks.push_back(CheckBI->getSuccessor(1));
+
+ if (MSSAU)
+ MSSAU->moveAllAfterSpliceBlocks(CheckBB, GuardedBlock, GI);
+
+ GI->moveBefore(DeoptBlockTerm);
+ GI->setArgOperand(0, ConstantInt::getFalse(GI->getContext()));
+
+ // Add new successors of CheckBB into DomTree.
+ for (auto *Succ : successors(CheckBB))
+ DTUpdates.push_back({DominatorTree::Insert, CheckBB, Succ});
+
+ // Now the blocks that used to be CheckBB's successors are GuardedBlock's
+ // successors.
+ for (auto *Succ : Successors)
+ DTUpdates.push_back({DominatorTree::Insert, GuardedBlock, Succ});
+
+ // Make proper changes to DT.
+ DT.applyUpdates(DTUpdates);
+ // Inform LI of a new loop block.
+ L.addBasicBlockToLoop(GuardedBlock, LI);
+
+ if (MSSAU) {
+ MemoryDef *MD = cast<MemoryDef>(MSSAU->getMemorySSA()->getMemoryAccess(GI));
+ MSSAU->moveToPlace(MD, DeoptBlock, MemorySSA::BeforeTerminator);
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+ }
+
+ ++NumGuards;
+ return CheckBI;
+}
+
+/// The cost multiplier is a way to limit the potentially exponential behavior
+/// of loop-unswitch. The cost is multiplied in proportion to 2^(number of
+/// unswitch candidates available). It also accounts for the number of
+/// "sibling" loops, with the idea of accounting for previous unswitches that
+/// already happened on this cluster of loops. There was an attempt to keep
+/// this formula simple, just enough to limit the worst-case behavior. Even if
+/// it is not that simple now, it is still not an attempt to provide a detailed
+/// heuristic size prediction.
+///
+/// TODO: Make a proper accounting of "explosion" effect for all kinds of
+/// unswitch candidates, making adequate predictions instead of wild guesses.
+/// That requires knowing not just the number of "remaining" candidates but
+/// also costs of unswitching for each of these candidates.
+static int CalculateUnswitchCostMultiplier(
+ Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
+ ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
+ UnswitchCandidates) {
+
+ // Guards and other exiting conditions do not contribute to exponential
+ // explosion as soon as they dominate the latch (otherwise there might be
+ // another path to the latch remaining that does not allow eliminating the
+ // loop copy on unswitch).
+ BasicBlock *Latch = L.getLoopLatch();
+ BasicBlock *CondBlock = TI.getParent();
+ if (DT.dominates(CondBlock, Latch) &&
+ (isGuard(&TI) ||
+ llvm::count_if(successors(&TI), [&L](BasicBlock *SuccBB) {
+ return L.contains(SuccBB);
+ }) <= 1)) {
+ NumCostMultiplierSkipped++;
+ return 1;
+ }
+
+ auto *ParentL = L.getParentLoop();
+ int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
+ : std::distance(LI.begin(), LI.end()));
+ // Count the number of clones that all the candidates might cause during
+ // unswitching. A branch/guard counts as 1; a switch as log2 of its cases.
+ int UnswitchedClones = 0;
+ for (auto Candidate : UnswitchCandidates) {
+ Instruction *CI = Candidate.first;
+ BasicBlock *CondBlock = CI->getParent();
+ bool SkipExitingSuccessors = DT.dominates(CondBlock, Latch);
+ if (isGuard(CI)) {
+ if (!SkipExitingSuccessors)
+ UnswitchedClones++;
+ continue;
+ }
+ int NonExitingSuccessors = llvm::count_if(
+ successors(CondBlock), [SkipExitingSuccessors, &L](BasicBlock *SuccBB) {
+ return !SkipExitingSuccessors || L.contains(SuccBB);
+ });
+ UnswitchedClones += Log2_32(NonExitingSuccessors);
+ }
+
+ // Ignore up to the "unscaled candidates" number of unswitch candidates
+ // when calculating the power-of-two scaling of the cost. The main idea
+ // with this control is to allow a small number of unswitches to happen
+ // and rely more on the siblings multiplier (see below) when the number
+ // of candidates is small.
+ unsigned ClonesPower =
+ std::max(UnswitchedClones - (int)UnswitchNumInitialUnscaledCandidates, 0);
+
+ // Allow top-level loops to spread a bit more than nested ones.
+ int SiblingsMultiplier =
+ std::max((ParentL ? SiblingsCount
+ : SiblingsCount / (int)UnswitchSiblingsToplevelDiv),
+ 1);
+ // Compute the cost multiplier in a way that won't overflow by saturating
+ // at an upper bound.
+ int CostMultiplier;
+ if (ClonesPower > Log2_32(UnswitchThreshold) ||
+ SiblingsMultiplier > UnswitchThreshold)
+ CostMultiplier = UnswitchThreshold;
+ else
+ CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
+ (int)UnswitchThreshold);
+
+ LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
+ << " (siblings " << SiblingsMultiplier << " * clones "
+ << (1 << ClonesPower) << ")"
+ << " for unswitch candidate: " << TI << "\n");
+ return CostMultiplier;
+}
+
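A hedged, standalone sketch of the multiplier arithmetic above in plain C++; the input values and the Log2 helper are illustrative (Log2 mirrors the floor-log2 behavior of Log2_32), not the pass's real option plumbing.

#include <algorithm>

static int Log2(int V) { // floor(log2(V)) for V >= 1
  int R = 0;
  while (V >>= 1)
    ++R;
  return R;
}

static int multiplierSketch(int UnswitchedClones, int SiblingsCount,
                            bool HasParentLoop, int UnscaledCandidates,
                            int SiblingsToplevelDiv, int Threshold) {
  // Power-of-two scaling by the clones the remaining candidates could create,
  // ignoring the first few "unscaled" candidates.
  int ClonesPower = std::max(UnswitchedClones - UnscaledCandidates, 0);
  // Top-level loops divide their sibling count, so they may spread a bit more.
  int SiblingsMultiplier = std::max(
      HasParentLoop ? SiblingsCount : SiblingsCount / SiblingsToplevelDiv, 1);
  // Saturate at the threshold so the multiplication cannot overflow.
  if (ClonesPower > Log2(Threshold) || SiblingsMultiplier > Threshold)
    return Threshold;
  return std::min(SiblingsMultiplier * (1 << ClonesPower), Threshold);
}

// e.g. multiplierSketch(/*UnswitchedClones=*/3, /*SiblingsCount=*/4,
//                       /*HasParentLoop=*/true, /*UnscaledCandidates=*/2,
//                       /*SiblingsToplevelDiv=*/2, /*Threshold=*/100) == 8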
+static bool
+unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ // Collect all invariant conditions within this loop (as opposed to an inner
+ // loop which would be handled when visiting that inner loop).
+ SmallVector<std::pair<Instruction *, TinyPtrVector<Value *>>, 4>
+ UnswitchCandidates;
+
+ // Whether or not we should also collect guards in the loop.
+ bool CollectGuards = false;
+ if (UnswitchGuards) {
+ auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ if (GuardDecl && !GuardDecl->use_empty())
+ CollectGuards = true;
+ }
+
+ for (auto *BB : L.blocks()) {
+ if (LI.getLoopFor(BB) != &L)
+ continue;
+
+ if (CollectGuards)
+ for (auto &I : *BB)
+ if (isGuard(&I)) {
+ auto *Cond = cast<IntrinsicInst>(&I)->getArgOperand(0);
+ // TODO: Support AND, OR conditions and partial unswitching.
+ if (!isa<Constant>(Cond) && L.isLoopInvariant(Cond))
+ UnswitchCandidates.push_back({&I, {Cond}});
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+ // We can only consider fully loop-invariant switch conditions as we need
+ // to completely eliminate the switch after unswitching.
+ if (!isa<Constant>(SI->getCondition()) &&
+ L.isLoopInvariant(SI->getCondition()) && !BB->getUniqueSuccessor())
+ UnswitchCandidates.push_back({SI, {SI->getCondition()}});
+ continue;
+ }
+
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional() || isa<Constant>(BI->getCondition()) ||
+ BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+
+ if (L.isLoopInvariant(BI->getCondition())) {
+ UnswitchCandidates.push_back({BI, {BI->getCondition()}});
+ continue;
+ }
+
+ Instruction &CondI = *cast<Instruction>(BI->getCondition());
+ if (CondI.getOpcode() != Instruction::And &&
+ CondI.getOpcode() != Instruction::Or)
+ continue;
+
+ TinyPtrVector<Value *> Invariants =
+ collectHomogenousInstGraphLoopInvariants(L, CondI, LI);
+ if (Invariants.empty())
+ continue;
+
+ UnswitchCandidates.push_back({BI, std::move(Invariants)});
+ }
+
+ // If we didn't find any candidates, we're done.
+ if (UnswitchCandidates.empty())
+ return false;
+
+ // Check if there are irreducible CFG cycles in this loop. If so, we cannot
+ // easily unswitch non-trivial edges out of the loop. Doing so might turn the
+ // irreducible control flow into reducible control flow and introduce new
+ // loops "out of thin air". If we ever discover important use cases for doing
+ // this, we can add support to loop unswitch, but it is a lot of complexity
+ // for what seems little or no real world benefit.
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(&LI);
+ if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
+ return false;
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L.getUniqueExitBlocks(ExitBlocks);
+
+ // We cannot unswitch if exit blocks contain a cleanuppad instruction as we
+ // don't know how to split those exit blocks.
+ // FIXME: We should teach SplitBlock to handle this and remove this
+ // restriction.
+ for (auto *ExitBB : ExitBlocks)
+ if (isa<CleanupPadInst>(ExitBB->getFirstNonPHI())) {
+ dbgs() << "Cannot unswitch because of cleanuppad in exit block\n";
+ return false;
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Considering " << UnswitchCandidates.size()
+ << " non-trivial loop invariant conditions for unswitching.\n");
+
+ // Given that unswitching these terminators will require duplicating parts of
+ // the loop, we need to be able to model that cost. Compute the ephemeral
+ // values and set up a data structure to hold per-BB costs. We cache each
+ // block's cost so that we don't recompute this when considering different
+ // subsets of the loop for duplication during unswitching.
+ SmallPtrSet<const Value *, 4> EphValues;
+ CodeMetrics::collectEphemeralValues(&L, &AC, EphValues);
+ SmallDenseMap<BasicBlock *, int, 4> BBCostMap;
+
+ // Compute the cost of each block, as well as the total loop cost. Also, bail
+ // out if we see instructions which are incompatible with loop unswitching
+ // (convergent, noduplicate, or cross-basic-block tokens).
+ // FIXME: We might be able to safely handle some of these in non-duplicated
+ // regions.
TargetTransformInfo::TargetCostKind CostKind =
L.getHeader()->getParent()->hasMinSize()
? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_SizeAndLatency;
- int LoopCost = 0;
- for (auto *BB : L.blocks()) {
- int Cost = 0;
- for (auto &I : *BB) {
- if (EphValues.count(&I))
- continue;
-
- if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
- return false;
- if (auto *CB = dyn_cast<CallBase>(&I))
- if (CB->isConvergent() || CB->cannotDuplicate())
- return false;
-
+ int LoopCost = 0;
+ for (auto *BB : L.blocks()) {
+ int Cost = 0;
+ for (auto &I : *BB) {
+ if (EphValues.count(&I))
+ continue;
+
+ if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
+ return false;
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->isConvergent() || CB->cannotDuplicate())
+ return false;
+
Cost += TTI.getUserCost(&I, CostKind);
- }
- assert(Cost >= 0 && "Must not have negative costs!");
- LoopCost += Cost;
- assert(LoopCost >= 0 && "Must not have negative loop costs!");
- BBCostMap[BB] = Cost;
- }
- LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
-
- // Now we find the best candidate by searching for the one with the following
- // properties in order:
- //
- // 1) An unswitching cost below the threshold
- // 2) The smallest number of duplicated unswitch candidates (to avoid
- // creating redundant subsequent unswitching)
- // 3) The smallest cost after unswitching.
- //
- // We prioritize reducing fanout of unswitch candidates provided the cost
- // remains below the threshold because this has a multiplicative effect.
- //
- // This requires memoizing each dominator subtree to avoid redundant work.
- //
- // FIXME: Need to actually do the number of candidates part above.
- SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
- // Given a terminator which might be unswitched, computes the non-duplicated
- // cost for that terminator.
- auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
- BasicBlock &BB = *TI.getParent();
- SmallPtrSet<BasicBlock *, 4> Visited;
-
- int Cost = LoopCost;
- for (BasicBlock *SuccBB : successors(&BB)) {
- // Don't count successors more than once.
- if (!Visited.insert(SuccBB).second)
- continue;
-
- // If this is a partial unswitch candidate, then it must be a conditional
- // branch with a condition of either `or` or `and`. In that case, one of
- // the successors is necessarily duplicated, so don't even try to remove
- // its cost.
- if (!FullUnswitch) {
- auto &BI = cast<BranchInst>(TI);
- if (cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::And) {
- if (SuccBB == BI.getSuccessor(1))
- continue;
- } else {
- assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
- Instruction::Or &&
- "Only `and` and `or` conditions can result in a partial "
- "unswitch!");
- if (SuccBB == BI.getSuccessor(0))
- continue;
- }
- }
-
- // This successor's domtree will not need to be duplicated after
- // unswitching if the edge to the successor dominates it (and thus the
- // entire tree). This essentially means there is no other path into this
- // subtree and so it will end up live in only one clone of the loop.
- if (SuccBB->getUniquePredecessor() ||
- llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
- return PredBB == &BB || DT.dominates(SuccBB, PredBB);
- })) {
- Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
- assert(Cost >= 0 &&
- "Non-duplicated cost should never exceed total loop cost!");
- }
- }
-
- // Now scale the cost by the number of unique successors minus one. We
- // subtract one because there is already at least one copy of the entire
- // loop. This is computing the new cost of unswitching a condition.
- // Note that guards always have 2 unique successors that are implicit and
- // will be materialized if we decide to unswitch it.
- int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
- assert(SuccessorsCount > 1 &&
- "Cannot unswitch a condition without multiple distinct successors!");
- return Cost * (SuccessorsCount - 1);
- };
- Instruction *BestUnswitchTI = nullptr;
- int BestUnswitchCost = 0;
- ArrayRef<Value *> BestUnswitchInvariants;
- for (auto &TerminatorAndInvariants : UnswitchCandidates) {
- Instruction &TI = *TerminatorAndInvariants.first;
- ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
- BranchInst *BI = dyn_cast<BranchInst>(&TI);
- int CandidateCost = ComputeUnswitchedCost(
- TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
- Invariants[0] == BI->getCondition()));
- // Calculate the cost multiplier, which is a tool to limit potentially
- // exponential behavior of loop-unswitch.
- if (EnableUnswitchCostMultiplier) {
- int CostMultiplier =
- CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
- assert(
- (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
- "cost multiplier needs to be in the range of 1..UnswitchThreshold");
- CandidateCost *= CostMultiplier;
- LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " (multiplier: " << CostMultiplier << ")"
- << " for unswitch candidate: " << TI << "\n");
- } else {
- LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
- << " for unswitch candidate: " << TI << "\n");
- }
-
- if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
- BestUnswitchTI = &TI;
- BestUnswitchCost = CandidateCost;
- BestUnswitchInvariants = Invariants;
- }
- }
- assert(BestUnswitchTI && "Failed to find loop unswitch candidate");
-
- if (BestUnswitchCost >= UnswitchThreshold) {
- LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
- << BestUnswitchCost << "\n");
- return false;
- }
-
- // If the best candidate is a guard, turn it into a branch.
- if (isGuard(BestUnswitchTI))
- BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
- ExitBlocks, DT, LI, MSSAU);
-
- LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = "
- << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
- << "\n");
- unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
- ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU);
- return true;
-}
-
-/// Unswitch control flow predicated on loop invariant conditions.
-///
-/// This first hoists all branches or switches which are trivial (i.e., do not
-/// require duplicating any part of the loop) out of the loop body. It then
-/// looks at other loop invariant control flows and tries to unswitch those as
-/// well by cloning the loop if the result is small enough.
-///
-/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
-/// updated based on the unswitch.
-/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled).
-///
-/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
-/// true, we will attempt to do non-trivial unswitching as well as trivial
-/// unswitching.
-///
-/// The `UnswitchCB` callback provided will be run after unswitching is
-/// complete, with the first parameter set to `true` if the provided loop
-/// remains a loop, and a list of new sibling loops created.
-///
-/// If `SE` is non-null, we will update that analysis based on the unswitching
-/// done.
-static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
- AssumptionCache &AC, TargetTransformInfo &TTI,
- bool NonTrivial,
- function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
- assert(L.isRecursivelyLCSSAForm(DT, LI) &&
- "Loops must be in LCSSA form before unswitching.");
-
- // Must be in loop simplified form: we need a preheader and dedicated exits.
- if (!L.isLoopSimplifyForm())
- return false;
-
- // Try trivial unswitch first before looping over other basic blocks in the loop.
- if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
- // If we unswitched successfully we will want to clean up the loop before
- // processing it further so just mark it as unswitched and return.
- UnswitchCB(/*CurrentLoopValid*/ true, {});
- return true;
- }
-
- // If we're not doing non-trivial unswitching, we're done. We both accept
- // a parameter and check a local flag that can be used for testing
- // and debugging.
- if (!NonTrivial && !EnableNonTrivialUnswitch)
- return false;
-
+ }
+ assert(Cost >= 0 && "Must not have negative costs!");
+ LoopCost += Cost;
+ assert(LoopCost >= 0 && "Must not have negative loop costs!");
+ BBCostMap[BB] = Cost;
+ }
+ LLVM_DEBUG(dbgs() << " Total loop cost: " << LoopCost << "\n");
+
+ // Now we find the best candidate by searching for the one with the following
+ // properties in order:
+ //
+ // 1) An unswitching cost below the threshold
+ // 2) The smallest number of duplicated unswitch candidates (to avoid
+ // creating redundant subsequent unswitching)
+ // 3) The smallest cost after unswitching.
+ //
+ // We prioritize reducing fanout of unswitch candidates provided the cost
+ // remains below the threshold because this has a multiplicative effect.
+ //
+ // This requires memoizing each dominator subtree to avoid redundant work.
+ //
+ // FIXME: Need to actually do the number of candidates part above.
+ SmallDenseMap<DomTreeNode *, int, 4> DTCostMap;
+ // Given a terminator which might be unswitched, computes the non-duplicated
+ // cost for that terminator.
+ auto ComputeUnswitchedCost = [&](Instruction &TI, bool FullUnswitch) {
+ BasicBlock &BB = *TI.getParent();
+ SmallPtrSet<BasicBlock *, 4> Visited;
+
+ int Cost = LoopCost;
+ for (BasicBlock *SuccBB : successors(&BB)) {
+ // Don't count successors more than once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ // If this is a partial unswitch candidate, then it must be a conditional
+ // branch with a condition of either `or` or `and`. In that case, one of
+ // the successors is necessarily duplicated, so don't even try to remove
+ // its cost.
+ if (!FullUnswitch) {
+ auto &BI = cast<BranchInst>(TI);
+ if (cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::And) {
+ if (SuccBB == BI.getSuccessor(1))
+ continue;
+ } else {
+ assert(cast<Instruction>(BI.getCondition())->getOpcode() ==
+ Instruction::Or &&
+ "Only `and` and `or` conditions can result in a partial "
+ "unswitch!");
+ if (SuccBB == BI.getSuccessor(0))
+ continue;
+ }
+ }
+
+ // This successor's domtree will not need to be duplicated after
+ // unswitching if the edge to the successor dominates it (and thus the
+ // entire tree). This essentially means there is no other path into this
+ // subtree and so it will end up live in only one clone of the loop.
+ if (SuccBB->getUniquePredecessor() ||
+ llvm::all_of(predecessors(SuccBB), [&](BasicBlock *PredBB) {
+ return PredBB == &BB || DT.dominates(SuccBB, PredBB);
+ })) {
+ Cost -= computeDomSubtreeCost(*DT[SuccBB], BBCostMap, DTCostMap);
+ assert(Cost >= 0 &&
+ "Non-duplicated cost should never exceed total loop cost!");
+ }
+ }
+
+ // Now scale the cost by the number of unique successors minus one. We
+ // subtract one because there is already at least one copy of the entire
+ // loop. This is computing the new cost of unswitching a condition.
+ // Note that guards always have 2 unique successors that are implicit and
+ // will be materialized if we decide to unswitch it.
+ int SuccessorsCount = isGuard(&TI) ? 2 : Visited.size();
+ assert(SuccessorsCount > 1 &&
+ "Cannot unswitch a condition without multiple distinct successors!");
+ return Cost * (SuccessorsCount - 1);
+ };
+ Instruction *BestUnswitchTI = nullptr;
+ int BestUnswitchCost = 0;
+ ArrayRef<Value *> BestUnswitchInvariants;
+ for (auto &TerminatorAndInvariants : UnswitchCandidates) {
+ Instruction &TI = *TerminatorAndInvariants.first;
+ ArrayRef<Value *> Invariants = TerminatorAndInvariants.second;
+ BranchInst *BI = dyn_cast<BranchInst>(&TI);
+ int CandidateCost = ComputeUnswitchedCost(
+ TI, /*FullUnswitch*/ !BI || (Invariants.size() == 1 &&
+ Invariants[0] == BI->getCondition()));
+ // Calculate the cost multiplier, which is a tool to limit potentially
+ // exponential behavior of loop-unswitch.
+ if (EnableUnswitchCostMultiplier) {
+ int CostMultiplier =
+ CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
+ assert(
+ (CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
+ "cost multiplier needs to be in the range of 1..UnswitchThreshold");
+ CandidateCost *= CostMultiplier;
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " (multiplier: " << CostMultiplier << ")"
+ << " for unswitch candidate: " << TI << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << " Computed cost of " << CandidateCost
+ << " for unswitch candidate: " << TI << "\n");
+ }
+
+ if (!BestUnswitchTI || CandidateCost < BestUnswitchCost) {
+ BestUnswitchTI = &TI;
+ BestUnswitchCost = CandidateCost;
+ BestUnswitchInvariants = Invariants;
+ }
+ }
+ assert(BestUnswitchTI && "Failed to find loop unswitch candidate");
+
+ if (BestUnswitchCost >= UnswitchThreshold) {
+ LLVM_DEBUG(dbgs() << "Cannot unswitch, lowest cost found: "
+ << BestUnswitchCost << "\n");
+ return false;
+ }
+
+ // If the best candidate is a guard, turn it into a branch.
+ if (isGuard(BestUnswitchTI))
+ BestUnswitchTI = turnGuardIntoBranch(cast<IntrinsicInst>(BestUnswitchTI), L,
+ ExitBlocks, DT, LI, MSSAU);
+
+ LLVM_DEBUG(dbgs() << " Unswitching non-trivial (cost = "
+ << BestUnswitchCost << ") terminator: " << *BestUnswitchTI
+ << "\n");
+ unswitchNontrivialInvariants(L, *BestUnswitchTI, BestUnswitchInvariants,
+ ExitBlocks, DT, LI, AC, UnswitchCB, SE, MSSAU);
+ return true;
+}
+
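A hedged arithmetic sketch of the ComputeUnswitchedCost scaling used in unswitchBestCondition above; the numbers are made up rather than real TTI costs.

// Made-up inputs: a loop whose blocks cost 100 in total, and a candidate
// switch with 3 distinct successors where one successor's dominator subtree
// (cost 40) is reached only through that edge and therefore ends up in a
// single clone of the loop.
static int unswitchedCostSketch() {
  int LoopCost = 100;
  int NonDuplicatedSubtreeCost = 40; // subtracted once, as in the lambda
  int Cost = LoopCost - NonDuplicatedSubtreeCost;
  int SuccessorsCount = 3;           // unique successors of the candidate
  // One copy of the loop already exists, so the added cost is scaled by
  // (SuccessorsCount - 1): here 60 * 2 = 120, compared against the threshold.
  return Cost * (SuccessorsCount - 1);
}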
+/// Unswitch control flow predicated on loop invariant conditions.
+///
+/// This first hoists all branches or switches which are trivial (i.e., do not
+/// require duplicating any part of the loop) out of the loop body. It then
+/// looks at other loop invariant control flows and tries to unswitch those as
+/// well by cloning the loop if the result is small enough.
+///
+/// The `DT`, `LI`, `AC`, `TTI` parameters are required analyses that are also
+/// updated based on the unswitch.
+/// The `MSSA` analysis is also updated if valid (i.e. its use is enabled).
+///
+/// If either `NonTrivial` is true or the flag `EnableNonTrivialUnswitch` is
+/// true, we will attempt to do non-trivial unswitching as well as trivial
+/// unswitching.
+///
+/// The `UnswitchCB` callback provided will be run after unswitching is
+/// complete, with the first parameter set to `true` if the provided loop
+/// remains a loop, and a list of new sibling loops created.
+///
+/// If `SE` is non-null, we will update that analysis based on the unswitching
+/// done.
+static bool unswitchLoop(Loop &L, DominatorTree &DT, LoopInfo &LI,
+ AssumptionCache &AC, TargetTransformInfo &TTI,
+ bool NonTrivial,
+ function_ref<void(bool, ArrayRef<Loop *>)> UnswitchCB,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU) {
+ assert(L.isRecursivelyLCSSAForm(DT, LI) &&
+ "Loops must be in LCSSA form before unswitching.");
+
+ // Must be in loop simplified form: we need a preheader and dedicated exits.
+ if (!L.isLoopSimplifyForm())
+ return false;
+
+ // Try trivial unswitch first before looping over other basic blocks in the loop.
+ if (unswitchAllTrivialConditions(L, DT, LI, SE, MSSAU)) {
+ // If we unswitched successfully we will want to clean up the loop before
+ // processing it further so just mark it as unswitched and return.
+ UnswitchCB(/*CurrentLoopValid*/ true, {});
+ return true;
+ }
+
+ // If we're not doing non-trivial unswitching, we're done. We both accept
+ // a parameter and check a local flag that can be used for testing
+ // and debugging.
+ if (!NonTrivial && !EnableNonTrivialUnswitch)
+ return false;
+
// Skip non-trivial unswitching for optsize functions.
if (L.getHeader()->getParent()->hasOptSize())
return false;
- // For non-trivial unswitching, because it often creates new loops, we rely on
- // the pass manager to iterate on the loops rather than trying to immediately
- // reach a fixed point. There is no substantial advantage to iterating
- // internally, and if any of the new loops are simplified enough to contain
- // trivial unswitching we want to prefer those.
-
- // Try to unswitch the best invariant condition. We prefer a full unswitch to
- // a partial unswitch whenever that is possible within the cost threshold.
- if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU))
- return true;
-
- // No other opportunities to unswitch.
+ // For non-trivial unswitching, because it often creates new loops, we rely on
+ // the pass manager to iterate on the loops rather than trying to immediately
+ // reach a fixed point. There is no substantial advantage to iterating
+ // internally, and if any of the new loops are simplified enough to contain
+ // trivial unswitching we want to prefer those.
+
+ // Try to unswitch the best invariant condition. We prefer a full unswitch to
+ // a partial unswitch whenever that is possible within the cost threshold.
+ if (unswitchBestCondition(L, DT, LI, AC, TTI, UnswitchCB, SE, MSSAU))
+ return true;
+
+ // No other opportunities to unswitch.
return false;
-}
-
-PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- Function &F = *L.getHeader()->getParent();
- (void)F;
-
- LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
- << "\n");
-
- // Save the current loop name in a variable so that we can report it even
- // after it has been deleted.
- std::string LoopName = std::string(L.getName());
-
- auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- if (!NewLoops.empty())
- U.addSiblingLoops(NewLoops);
-
- // If the current loop remains valid, we should revisit it to catch any
- // other unswitch opportunities. Otherwise, we need to mark it as deleted.
- if (CurrentLoopValid)
- U.revisitCurrentLoop();
- else
- U.markLoopAsDeleted(L, LoopName);
- };
-
- Optional<MemorySSAUpdater> MSSAU;
- if (AR.MSSA) {
- MSSAU = MemorySSAUpdater(AR.MSSA);
- if (VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
- }
- if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
- &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
- return PreservedAnalyses::all();
-
- if (AR.MSSA && VerifyMemorySSA)
- AR.MSSA->verifyMemorySSA();
-
- // Historically this pass has had issues with the dominator tree so verify it
- // in asserts builds.
- assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
-
- auto PA = getLoopPassPreservedAnalyses();
- if (AR.MSSA)
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
-
-namespace {
-
-class SimpleLoopUnswitchLegacyPass : public LoopPass {
- bool NonTrivial;
-
-public:
- static char ID; // Pass ID, replacement for typeid
-
- explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
- : LoopPass(ID), NonTrivial(NonTrivial) {
- initializeSimpleLoopUnswitchLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
- }
- getLoopAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
-bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
- return false;
-
- Function &F = *L->getHeader()->getParent();
-
- LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
- << "\n");
-
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- MemorySSA *MSSA = nullptr;
- Optional<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
- }
-
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-
- auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
- ArrayRef<Loop *> NewLoops) {
- // If we did a non-trivial unswitch, we have added new (cloned) loops.
- for (auto *NewL : NewLoops)
- LPM.addLoop(*NewL);
-
- // If the current loop remains valid, re-add it to the queue. This is
- // a little wasteful as we'll finish processing the current loop as well,
- // but it is the best we can do in the old PM.
- if (CurrentLoopValid)
- LPM.addLoop(*L);
- else
- LPM.markLoopAsDeleted(*L);
- };
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE,
- MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- // Historically this pass has had issues with the dominator tree so verify it
- // in asserts builds.
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-
- return Changed;
-}
-
-char SimpleLoopUnswitchLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
- "Simple unswitch loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
- "Simple unswitch loops", false, false)
-
-Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
- return new SimpleLoopUnswitchLegacyPass(NonTrivial);
-}
+}
+
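For readers new to the transform documented above, a hedged source-level picture of what unswitching does; the pass works on LLVM IR, so this C++ is only an analogy.

static int sumSketch(const int *a, int n, bool flag) {
  int s = 0;
  for (int i = 0; i < n; ++i) {
    if (flag)          // loop-invariant condition tested every iteration
      s += a[i] * 2;
    else
      s += a[i];
  }
  return s;
}

// After unswitching on `flag`, the invariant test is hoisted out of the loop
// and the body is cloned once per branch direction:
static int sumUnswitched(const int *a, int n, bool flag) {
  int s = 0;
  if (flag) {
    for (int i = 0; i < n; ++i)
      s += a[i] * 2;
  } else {
    for (int i = 0; i < n; ++i)
      s += a[i];
  }
  return s;
}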
+PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ Function &F = *L.getHeader()->getParent();
+ (void)F;
+
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << L
+ << "\n");
+
+ // Save the current loop name in a variable so that we can report it even
+ // after it has been deleted.
+ std::string LoopName = std::string(L.getName());
+
+ auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ if (!NewLoops.empty())
+ U.addSiblingLoops(NewLoops);
+
+ // If the current loop remains valid, we should revisit it to catch any
+ // other unswitch opportunities. Otherwise, we need to mark it as deleted.
+ if (CurrentLoopValid)
+ U.revisitCurrentLoop();
+ else
+ U.markLoopAsDeleted(L, LoopName);
+ };
+
+ Optional<MemorySSAUpdater> MSSAU;
+ if (AR.MSSA) {
+ MSSAU = MemorySSAUpdater(AR.MSSA);
+ if (VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+ }
+ if (!unswitchLoop(L, AR.DT, AR.LI, AR.AC, AR.TTI, NonTrivial, UnswitchCB,
+ &AR.SE, MSSAU.hasValue() ? MSSAU.getPointer() : nullptr))
+ return PreservedAnalyses::all();
+
+ if (AR.MSSA && VerifyMemorySSA)
+ AR.MSSA->verifyMemorySSA();
+
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ assert(AR.DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
+
+namespace {
+
+class SimpleLoopUnswitchLegacyPass : public LoopPass {
+ bool NonTrivial;
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+
+ explicit SimpleLoopUnswitchLegacyPass(bool NonTrivial = false)
+ : LoopPass(ID), NonTrivial(NonTrivial) {
+ initializeSimpleLoopUnswitchLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ if (EnableMSSALoopDependency) {
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+ getLoopAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
+bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
+ if (skipLoop(L))
+ return false;
+
+ Function &F = *L->getHeader()->getParent();
+
+ LLVM_DEBUG(dbgs() << "Unswitching loop in " << F.getName() << ": " << *L
+ << "\n");
+
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ MemorySSA *MSSA = nullptr;
+ Optional<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ MSSAU = MemorySSAUpdater(MSSA);
+ }
+
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ auto *SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ auto UnswitchCB = [&L, &LPM](bool CurrentLoopValid,
+ ArrayRef<Loop *> NewLoops) {
+ // If we did a non-trivial unswitch, we have added new (cloned) loops.
+ for (auto *NewL : NewLoops)
+ LPM.addLoop(*NewL);
+
+ // If the current loop remains valid, re-add it to the queue. This is
+ // a little wasteful as we'll finish processing the current loop as well,
+ // but it is the best we can do in the old PM.
+ if (CurrentLoopValid)
+ LPM.addLoop(*L);
+ else
+ LPM.markLoopAsDeleted(*L);
+ };
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ bool Changed = unswitchLoop(*L, DT, LI, AC, TTI, NonTrivial, UnswitchCB, SE,
+ MSSAU.hasValue() ? MSSAU.getPointer() : nullptr);
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ // Historically this pass has had issues with the dominator tree so verify it
+ // in asserts builds.
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+
+ return Changed;
+}
+
+char SimpleLoopUnswitchLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SimpleLoopUnswitchLegacyPass, "simple-loop-unswitch",
+ "Simple unswitch loops", false, false)
+
+Pass *llvm::createSimpleLoopUnswitchLegacyPass(bool NonTrivial) {
+ return new SimpleLoopUnswitchLegacyPass(NonTrivial);
+}
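A hedged usage sketch for the legacy entry point defined above; it assumes the usual declaration of createSimpleLoopUnswitchLegacyPass in llvm/Transforms/Scalar.h and is not taken from this file.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Scalar.h"

void addSimpleLoopUnswitch(llvm::legacy::PassManager &PM) {
  // Request non-trivial unswitching as well; the default is trivial-only.
  PM.add(llvm::createSimpleLoopUnswitchLegacyPass(/*NonTrivial=*/true));
}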
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f06efd7f85..38e7109ead 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -1,145 +1,145 @@
-//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements dead code elimination and basic block merging, along
-// with a collection of other peephole control flow optimizations. For example:
-//
-// * Removes basic blocks with no predecessors.
-// * Merges a basic block into its predecessor if there is only one and the
-// predecessor only has one successor.
-// * Eliminates PHI nodes for basic blocks with a single predecessor.
-// * Eliminates a basic block that only contains an unconditional branch.
-// * Changes invoke instructions to nounwind functions to be calls.
-// * Change things like "if (x) if (y)" into "if (x&y)".
-// * etc.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
+//===- SimplifyCFGPass.cpp - CFG Simplification Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging, along
+// with a collection of other peephole control flow optimizations. For example:
+//
+// * Removes basic blocks with no predecessors.
+// * Merges a basic block into its predecessor if there is only one and the
+// predecessor only has one successor.
+// * Eliminates PHI nodes for basic blocks with a single predecessor.
+// * Eliminates a basic block that only contains an unconditional branch.
+// * Changes invoke instructions to nounwind functions to be calls.
+// * Change things like "if (x) if (y)" into "if (x&y)" (see the sketch
+//   after this header).
+// * etc.
+//
+//===----------------------------------------------------------------------===//
+
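A hedged source-level illustration of the last peephole listed in the header comment above; this is an analogy, not output of the pass.

static int nestedBranches(bool x, bool y) {
  if (x)
    if (y)
      return 1;
  return 0;
}

// After SimplifyCFG the two conditional branches are typically folded into a
// single branch on the combined predicate, roughly:
static int foldedBranches(bool x, bool y) {
  return (x & y) ? 1 : 0;
}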
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/SimplifyCFG.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SimplifyCFGOptions.h"
-#include <utility>
-using namespace llvm;
-
-#define DEBUG_TYPE "simplifycfg"
-
-static cl::opt<unsigned> UserBonusInstThreshold(
- "bonus-inst-threshold", cl::Hidden, cl::init(1),
- cl::desc("Control the number of bonus instructions (default = 1)"));
-
-static cl::opt<bool> UserKeepLoops(
- "keep-loops", cl::Hidden, cl::init(true),
- cl::desc("Preserve canonical loop structure (default = true)"));
-
-static cl::opt<bool> UserSwitchToLookup(
- "switch-to-lookup", cl::Hidden, cl::init(false),
- cl::desc("Convert switches to lookup tables (default = false)"));
-
-static cl::opt<bool> UserForwardSwitchCond(
- "forward-switch-cond", cl::Hidden, cl::init(false),
- cl::desc("Forward switch condition to phi ops (default = false)"));
-
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "simplifycfg"
+
+static cl::opt<unsigned> UserBonusInstThreshold(
+ "bonus-inst-threshold", cl::Hidden, cl::init(1),
+ cl::desc("Control the number of bonus instructions (default = 1)"));
+
+static cl::opt<bool> UserKeepLoops(
+ "keep-loops", cl::Hidden, cl::init(true),
+ cl::desc("Preserve canonical loop structure (default = true)"));
+
+static cl::opt<bool> UserSwitchToLookup(
+ "switch-to-lookup", cl::Hidden, cl::init(false),
+ cl::desc("Convert switches to lookup tables (default = false)"));
+
+static cl::opt<bool> UserForwardSwitchCond(
+ "forward-switch-cond", cl::Hidden, cl::init(false),
+ cl::desc("Forward switch condition to phi ops (default = false)"));
+
static cl::opt<bool> UserHoistCommonInsts(
"hoist-common-insts", cl::Hidden, cl::init(false),
cl::desc("hoist common instructions (default = false)"));
-static cl::opt<bool> UserSinkCommonInsts(
- "sink-common-insts", cl::Hidden, cl::init(false),
- cl::desc("Sink common instructions (default = false)"));
-
-
-STATISTIC(NumSimpl, "Number of blocks simplified");
-
-/// If we have more than one empty (other than phi node) return block,
-/// merge them together to promote recursive block merging.
+static cl::opt<bool> UserSinkCommonInsts(
+ "sink-common-insts", cl::Hidden, cl::init(false),
+ cl::desc("Sink common instructions (default = false)"));
+
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
+/// If we have more than one empty (other than phi node) return block,
+/// merge them together to promote recursive block merging.
static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
- bool Changed = false;
-
+ bool Changed = false;
+
std::vector<DominatorTree::UpdateType> Updates;
SmallVector<BasicBlock *, 8> DeadBlocks;
- BasicBlock *RetBlock = nullptr;
-
- // Scan all the blocks in the function, looking for empty return blocks.
+ BasicBlock *RetBlock = nullptr;
+
+ // Scan all the blocks in the function, looking for empty return blocks.
for (BasicBlock &BB : make_early_inc_range(F)) {
if (DTU && DTU->isBBPendingDeletion(&BB))
continue;
-
- // Only look at return blocks.
- ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!Ret) continue;
-
- // Only look at the block if it is empty or the only other thing in it is a
- // single PHI node that is the operand to the return.
- if (Ret != &BB.front()) {
- // Check for something else in the block.
- BasicBlock::iterator I(Ret);
- --I;
- // Skip over debug info.
- while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
- --I;
- if (!isa<DbgInfoIntrinsic>(I) &&
- (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
- Ret->getOperand(0) != &*I))
- continue;
- }
-
- // If this is the first returning block, remember it and keep going.
- if (!RetBlock) {
- RetBlock = &BB;
- continue;
- }
-
- // Skip merging if this would result in a CallBr instruction with a
- // duplicate destination. FIXME: See note in CodeGenPrepare.cpp.
- bool SkipCallBr = false;
- for (pred_iterator PI = pred_begin(&BB), E = pred_end(&BB);
- PI != E && !SkipCallBr; ++PI) {
- if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
- for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
- if (RetBlock == CBI->getSuccessor(i)) {
- SkipCallBr = true;
- break;
- }
- }
- if (SkipCallBr)
- continue;
-
- // Otherwise, we found a duplicate return block. Merge the two.
- Changed = true;
-
- // The case when there is no input to the return or when the returned values
- // agree is trivial. Note that they can't agree if there are phis in the
- // blocks.
- if (Ret->getNumOperands() == 0 ||
- Ret->getOperand(0) ==
- cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
+
+ // Only look at return blocks.
+ ReturnInst *Ret = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!Ret) continue;
+
+ // Only look at the block if it is empty or the only other thing in it is a
+ // single PHI node that is the operand to the return.
+ if (Ret != &BB.front()) {
+ // Check for something else in the block.
+ BasicBlock::iterator I(Ret);
+ --I;
+ // Skip over debug info.
+ while (isa<DbgInfoIntrinsic>(I) && I != BB.begin())
+ --I;
+ if (!isa<DbgInfoIntrinsic>(I) &&
+ (!isa<PHINode>(I) || I != BB.begin() || Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) != &*I))
+ continue;
+ }
+
+ // If this is the first returning block, remember it and keep going.
+ if (!RetBlock) {
+ RetBlock = &BB;
+ continue;
+ }
+
+ // Skip merging if this would result in a CallBr instruction with a
+ // duplicate destination. FIXME: See note in CodeGenPrepare.cpp.
+ bool SkipCallBr = false;
+ for (pred_iterator PI = pred_begin(&BB), E = pred_end(&BB);
+ PI != E && !SkipCallBr; ++PI) {
+ if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
+ for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
+ if (RetBlock == CBI->getSuccessor(i)) {
+ SkipCallBr = true;
+ break;
+ }
+ }
+ if (SkipCallBr)
+ continue;
+
+ // Otherwise, we found a duplicate return block. Merge the two.
+ Changed = true;
+
+ // The case when there is no input to the return or when the returned values
+ // agree is trivial. Note that they can't agree if there are phis in the
+ // blocks.
+ if (Ret->getNumOperands() == 0 ||
+ Ret->getOperand(0) ==
+ cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0)) {
// All predecessors of BB should now branch to RetBlock instead.
if (DTU) {
for (auto *Predecessor : predecessors(&BB)) {
@@ -150,35 +150,35 @@ static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
Updates.push_back({DominatorTree::Delete, Predecessor, &BB});
}
}
- BB.replaceAllUsesWith(RetBlock);
+ BB.replaceAllUsesWith(RetBlock);
DeadBlocks.emplace_back(&BB);
- continue;
- }
-
- // If the canonical return block has no PHI node, create one now.
- PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
- if (!RetBlockPHI) {
- Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
- pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
- RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
- std::distance(PB, PE), "merge",
- &RetBlock->front());
-
- for (pred_iterator PI = PB; PI != PE; ++PI)
- RetBlockPHI->addIncoming(InVal, *PI);
- RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
- }
-
- // Turn BB into a block that just unconditionally branches to the return
- // block. This handles the case when the two return blocks have a common
- // predecessor but return different things.
- RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
- BB.getTerminator()->eraseFromParent();
- BranchInst::Create(RetBlock, &BB);
+ continue;
+ }
+
+ // If the canonical return block has no PHI node, create one now.
+ PHINode *RetBlockPHI = dyn_cast<PHINode>(RetBlock->begin());
+ if (!RetBlockPHI) {
+ Value *InVal = cast<ReturnInst>(RetBlock->getTerminator())->getOperand(0);
+ pred_iterator PB = pred_begin(RetBlock), PE = pred_end(RetBlock);
+ RetBlockPHI = PHINode::Create(Ret->getOperand(0)->getType(),
+ std::distance(PB, PE), "merge",
+ &RetBlock->front());
+
+ for (pred_iterator PI = PB; PI != PE; ++PI)
+ RetBlockPHI->addIncoming(InVal, *PI);
+ RetBlock->getTerminator()->setOperand(0, RetBlockPHI);
+ }
+
+ // Turn BB into a block that just unconditionally branches to the return
+ // block. This handles the case when the two return blocks have a common
+ // predecessor but return different things.
+ RetBlockPHI->addIncoming(Ret->getOperand(0), &BB);
+ BB.getTerminator()->eraseFromParent();
+ BranchInst::Create(RetBlock, &BB);
if (DTU)
Updates.push_back({DominatorTree::Insert, &BB, RetBlock});
- }
-
+ }
+
if (DTU) {
DTU->applyUpdates(Updates);
for (auto *BB : DeadBlocks)
@@ -188,31 +188,31 @@ static bool mergeEmptyReturnBlocks(Function &F, DomTreeUpdater *DTU) {
BB->eraseFromParent();
}
- return Changed;
-}
-
-/// Call SimplifyCFG on all the blocks in the function,
-/// iterating until no more changes are made.
-static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
+ return Changed;
+}
+
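A hedged source-level picture of the transformation implemented above; at the IR level both returns are funneled through one canonical return block with a "merge" PHI, with no change in observable behavior.

static int twoReturnBlocks(bool c, int a, int b) {
  if (c)
    return a; // return block #1
  return b;   // return block #2
}
// After merging, the IR for this function has a single return block that
// returns phi(a, b) fed from the two predecessors.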
+/// Call SimplifyCFG on all the blocks in the function,
+/// iterating until no more changes are made.
+static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU,
- const SimplifyCFGOptions &Options) {
- bool Changed = false;
- bool LocalChange = true;
-
- SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
- FindFunctionBackedges(F, Edges);
+ const SimplifyCFGOptions &Options) {
+ bool Changed = false;
+ bool LocalChange = true;
+
+ SmallVector<std::pair<const BasicBlock *, const BasicBlock *>, 32> Edges;
+ FindFunctionBackedges(F, Edges);
SmallPtrSet<BasicBlock *, 16> UniqueLoopHeaders;
- for (unsigned i = 0, e = Edges.size(); i != e; ++i)
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i)
UniqueLoopHeaders.insert(const_cast<BasicBlock *>(Edges[i].second));
-
+
SmallVector<WeakVH, 16> LoopHeaders(UniqueLoopHeaders.begin(),
UniqueLoopHeaders.end());
- while (LocalChange) {
- LocalChange = false;
-
- // Loop over all of the basic blocks and remove them if they are unneeded.
- for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
+ while (LocalChange) {
+ LocalChange = false;
+
+ // Loop over all of the basic blocks and remove them if they are unneeded.
+ for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
BasicBlock &BB = *BBIt++;
if (DTU) {
assert(
@@ -224,43 +224,43 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
++BBIt;
}
if (simplifyCFG(&BB, TTI, DTU, Options, LoopHeaders)) {
- LocalChange = true;
- ++NumSimpl;
- }
- }
- Changed |= LocalChange;
- }
- return Changed;
-}
-
+ LocalChange = true;
+ ++NumSimpl;
+ }
+ }
+ Changed |= LocalChange;
+ }
+ return Changed;
+}
+
static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
DominatorTree *DT,
const SimplifyCFGOptions &Options) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
-
+
bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
EverChanged |= mergeEmptyReturnBlocks(F, DT ? &DTU : nullptr);
EverChanged |= iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
- // If neither pass changed anything, we're done.
- if (!EverChanged) return false;
-
- // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
- // removeUnreachableBlocks is needed to nuke them, which means we should
- // iterate between the two optimizations. We structure the code like this to
- // avoid rerunning iterativelySimplifyCFG if the second pass of
- // removeUnreachableBlocks doesn't do anything.
+ // If neither pass changed anything, we're done.
+ if (!EverChanged) return false;
+
+ // iterativelySimplifyCFG can (rarely) make some loops dead. If this happens,
+ // removeUnreachableBlocks is needed to nuke them, which means we should
+ // iterate between the two optimizations. We structure the code like this to
+ // avoid rerunning iterativelySimplifyCFG if the second pass of
+ // removeUnreachableBlocks doesn't do anything.
if (!removeUnreachableBlocks(F, DT ? &DTU : nullptr))
- return true;
-
- do {
+ return true;
+
+ do {
EverChanged = iterativelySimplifyCFG(F, TTI, DT ? &DTU : nullptr, Options);
EverChanged |= removeUnreachableBlocks(F, DT ? &DTU : nullptr);
- } while (EverChanged);
-
- return true;
-}
-
+ } while (EverChanged);
+
+ return true;
+}
+
static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
DominatorTree *DT,
const SimplifyCFGOptions &Options) {
@@ -277,7 +277,7 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
return Changed;
}
-// Command-line settings override compile-time settings.
+// Command-line settings override compile-time settings.
static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
if (UserBonusInstThreshold.getNumOccurrences())
Options.BonusInstThreshold = UserBonusInstThreshold;
@@ -291,8 +291,8 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.HoistCommonInsts = UserHoistCommonInsts;
if (UserSinkCommonInsts.getNumOccurrences())
Options.SinkCommonInsts = UserSinkCommonInsts;
-}
-
+}
+
SimplifyCFGPass::SimplifyCFGPass() : Options() {
applyCommandLineOverridesToOptions(Options);
}
@@ -302,10 +302,10 @@ SimplifyCFGPass::SimplifyCFGPass(const SimplifyCFGOptions &Opts)
applyCommandLineOverridesToOptions(Options);
}
-PreservedAnalyses SimplifyCFGPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- Options.AC = &AM.getResult<AssumptionAnalysis>(F);
+PreservedAnalyses SimplifyCFGPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ Options.AC = &AM.getResult<AssumptionAnalysis>(F);
DominatorTree *DT = nullptr;
if (RequireAndPreserveDomTree)
DT = &AM.getResult<DominatorTreeAnalysis>(F);
@@ -315,73 +315,73 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
Options.setSimplifyCondBranch(true).setFoldTwoEntryPHINode(true);
}
if (!simplifyFunctionCFG(F, TTI, DT, Options))
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
if (RequireAndPreserveDomTree)
PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-namespace {
-struct CFGSimplifyPass : public FunctionPass {
- static char ID;
- SimplifyCFGOptions Options;
- std::function<bool(const Function &)> PredicateFtor;
-
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+namespace {
+struct CFGSimplifyPass : public FunctionPass {
+ static char ID;
+ SimplifyCFGOptions Options;
+ std::function<bool(const Function &)> PredicateFtor;
+
CFGSimplifyPass(SimplifyCFGOptions Options_ = SimplifyCFGOptions(),
- std::function<bool(const Function &)> Ftor = nullptr)
+ std::function<bool(const Function &)> Ftor = nullptr)
: FunctionPass(ID), Options(Options_), PredicateFtor(std::move(Ftor)) {
-
- initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
-
- // Check for command-line overrides of options for debug/customization.
+
+ initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+
+ // Check for command-line overrides of options for debug/customization.
applyCommandLineOverridesToOptions(Options);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
- return false;
-
- Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
+ return false;
+
+ Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DominatorTree *DT = nullptr;
if (RequireAndPreserveDomTree)
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
- Options.setSimplifyCondBranch(false)
- .setFoldTwoEntryPHINode(false);
- } else {
- Options.setSimplifyCondBranch(true)
- .setFoldTwoEntryPHINode(true);
- }
-
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
+ Options.setSimplifyCondBranch(false)
+ .setFoldTwoEntryPHINode(false);
+ } else {
+ Options.setSimplifyCondBranch(true)
+ .setFoldTwoEntryPHINode(true);
+ }
+
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return simplifyFunctionCFG(F, TTI, DT, Options);
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
if (RequireAndPreserveDomTree)
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
if (RequireAndPreserveDomTree)
AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
-};
-}
-
-char CFGSimplifyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+};
+}
+
+char CFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
- false)
-
-// Public interface to the CFGSimplification pass
-FunctionPass *
+INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
+ false)
+
+// Public interface to the CFGSimplification pass
+FunctionPass *
llvm::createCFGSimplificationPass(SimplifyCFGOptions Options,
- std::function<bool(const Function &)> Ftor) {
+ std::function<bool(const Function &)> Ftor) {
return new CFGSimplifyPass(Options, std::move(Ftor));
-}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
index 152614695d..89cfbe384b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/Sink.cpp
@@ -1,135 +1,135 @@
-//===-- Sink.cpp - Code Sinking -------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass moves instructions into successor blocks, when possible, so that
-// they aren't executed on paths where their results aren't needed.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/Sink.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "sink"
-
-STATISTIC(NumSunk, "Number of instructions sunk");
-STATISTIC(NumSinkIter, "Number of sinking iterations");
-
-static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
- SmallPtrSetImpl<Instruction *> &Stores) {
-
- if (Inst->mayWriteToMemory()) {
- Stores.insert(Inst);
- return false;
- }
-
- if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
- MemoryLocation Loc = MemoryLocation::get(L);
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Loc)))
- return false;
- }
-
- if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
- Inst->mayThrow())
- return false;
-
- if (auto *Call = dyn_cast<CallBase>(Inst)) {
- // Convergent operations cannot be made control-dependent on additional
- // values.
- if (Call->isConvergent())
- return false;
-
- for (Instruction *S : Stores)
- if (isModSet(AA.getModRefInfo(S, Call)))
- return false;
- }
-
- return true;
-}
-
-/// IsAcceptableTarget - Return true if it is possible to sink the instruction
-/// in the specified basic block.
-static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
- DominatorTree &DT, LoopInfo &LI) {
- assert(Inst && "Instruction to be sunk is null");
- assert(SuccToSinkTo && "Candidate sink target is null");
-
- // It's never legal to sink an instruction into a block which terminates in an
- // EH-pad.
- if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
- return false;
-
- // If the block has multiple predecessors, this would introduce computation
- // on different code paths. We could split the critical edge, but for now we
- // just punt.
- // FIXME: Split critical edges if not backedges.
- if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
- // We cannot sink a load across a critical edge - there may be stores in
- // other code paths.
- if (Inst->mayReadFromMemory())
- return false;
-
- // We don't want to sink across a critical edge if we don't dominate the
- // successor. We could be introducing calculations to new code paths.
- if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
- return false;
-
- // Don't sink instructions into a loop.
- Loop *succ = LI.getLoopFor(SuccToSinkTo);
- Loop *cur = LI.getLoopFor(Inst->getParent());
- if (succ != nullptr && succ != cur)
- return false;
- }
-
+//===-- Sink.cpp - Code Sinking -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks, when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/Sink.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sink"
+
+STATISTIC(NumSunk, "Number of instructions sunk");
+STATISTIC(NumSinkIter, "Number of sinking iterations");
+
+static bool isSafeToMove(Instruction *Inst, AliasAnalysis &AA,
+ SmallPtrSetImpl<Instruction *> &Stores) {
+
+ if (Inst->mayWriteToMemory()) {
+ Stores.insert(Inst);
+ return false;
+ }
+
+ if (LoadInst *L = dyn_cast<LoadInst>(Inst)) {
+ MemoryLocation Loc = MemoryLocation::get(L);
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Loc)))
+ return false;
+ }
+
+ if (Inst->isTerminator() || isa<PHINode>(Inst) || Inst->isEHPad() ||
+ Inst->mayThrow())
+ return false;
+
+ if (auto *Call = dyn_cast<CallBase>(Inst)) {
+ // Convergent operations cannot be made control-dependent on additional
+ // values.
+ if (Call->isConvergent())
+ return false;
+
+ for (Instruction *S : Stores)
+ if (isModSet(AA.getModRefInfo(S, Call)))
+ return false;
+ }
+
+ return true;
+}
+
+/// IsAcceptableTarget - Return true if it is possible to sink the instruction
+/// in the specified basic block.
+static bool IsAcceptableTarget(Instruction *Inst, BasicBlock *SuccToSinkTo,
+ DominatorTree &DT, LoopInfo &LI) {
+ assert(Inst && "Instruction to be sunk is null");
+ assert(SuccToSinkTo && "Candidate sink target is null");
+
+ // It's never legal to sink an instruction into a block which terminates in an
+ // EH-pad.
+ if (SuccToSinkTo->getTerminator()->isExceptionalTerminator())
+ return false;
+
+ // If the block has multiple predecessors, this would introduce computation
+ // on different code paths. We could split the critical edge, but for now we
+ // just punt.
+ // FIXME: Split critical edges if not backedges.
+ if (SuccToSinkTo->getUniquePredecessor() != Inst->getParent()) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ if (Inst->mayReadFromMemory())
+ return false;
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!DT.dominates(Inst->getParent(), SuccToSinkTo))
+ return false;
+
+ // Don't sink instructions into a loop.
+ Loop *succ = LI.getLoopFor(SuccToSinkTo);
+ Loop *cur = LI.getLoopFor(Inst->getParent());
+ if (succ != nullptr && succ != cur)
+ return false;
+ }
+
return true;
-}
-
-/// SinkInstruction - Determine whether it is safe to sink the specified
-/// instruction out of its current block into a successor.
-static bool SinkInstruction(Instruction *Inst,
- SmallPtrSetImpl<Instruction *> &Stores,
- DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
-
- // Don't sink static alloca instructions. CodeGen assumes allocas outside the
- // entry block are dynamically sized stack objects.
- if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
- if (AI->isStaticAlloca())
- return false;
-
- // Check if it's safe to move the instruction.
- if (!isSafeToMove(Inst, AA, Stores))
- return false;
-
- // FIXME: This should include support for sinking instructions within the
- // block they are currently in to shorten the live ranges. We often get
- // instructions sunk into the top of a large block, but it would be better to
- // also sink them down before their first use in the block. This xform has to
- // be careful not to *increase* register pressure though, e.g. sinking
- // "x = y + z" down if it kills y and z would increase the live ranges of y
- // and z and only shrink the live range of x.
-
- // SuccToSinkTo - This is the successor to sink this instruction to, once we
- // decide.
- BasicBlock *SuccToSinkTo = nullptr;
-
+}
+
+/// SinkInstruction - Determine whether it is safe to sink the specified
+/// instruction out of its current block into a successor.
+static bool SinkInstruction(Instruction *Inst,
+ SmallPtrSetImpl<Instruction *> &Stores,
+ DominatorTree &DT, LoopInfo &LI, AAResults &AA) {
+
+ // Don't sink static alloca instructions. CodeGen assumes allocas outside the
+ // entry block are dynamically sized stack objects.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Inst))
+ if (AI->isStaticAlloca())
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!isSafeToMove(Inst, AA, Stores))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ BasicBlock *SuccToSinkTo = nullptr;
+
// Find the nearest common dominator of all users as the candidate.
BasicBlock *BB = Inst->getParent();
for (Use &U : Inst->uses()) {
@@ -151,8 +151,8 @@ static bool SinkInstruction(Instruction *Inst,
// The current basic block needs to dominate the candidate.
if (!DT.dominates(BB, SuccToSinkTo))
return false;
- }
-
+ }
+
if (SuccToSinkTo) {
// The nearest common dominator may be in a parent loop of BB, which may not
// be beneficial. Find an ancestor.
@@ -161,124 +161,124 @@ static bool SinkInstruction(Instruction *Inst,
SuccToSinkTo = DT.getNode(SuccToSinkTo)->getIDom()->getBlock();
if (SuccToSinkTo == BB)
SuccToSinkTo = nullptr;
- }
-
- // If we couldn't find a block to sink to, ignore this instruction.
- if (!SuccToSinkTo)
- return false;
-
- LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
- Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
- SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
-
- // Move the instruction.
- Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
- return true;
-}
-
-static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
- AAResults &AA) {
- // Can't sink anything out of a block that has less than two successors.
- if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
-
- // Don't bother sinking code out of unreachable blocks. In addition to being
- // unprofitable, it can also lead to infinite looping, because in an
- // unreachable loop there may be nowhere to stop.
- if (!DT.isReachableFromEntry(&BB)) return false;
-
- bool MadeChange = false;
-
- // Walk the basic block bottom-up. Remember if we saw a store.
- BasicBlock::iterator I = BB.end();
- --I;
- bool ProcessedBegin = false;
- SmallPtrSet<Instruction *, 8> Stores;
- do {
- Instruction *Inst = &*I; // The instruction to sink.
-
- // Predecrement I (if it's not begin) so that it isn't invalidated by
- // sinking.
- ProcessedBegin = I == BB.begin();
- if (!ProcessedBegin)
- --I;
-
- if (isa<DbgInfoIntrinsic>(Inst))
- continue;
-
- if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
- ++NumSunk;
- MadeChange = true;
- }
-
- // If we just processed the first instruction in the block, we're done.
- } while (!ProcessedBegin);
-
- return MadeChange;
-}
-
-static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
- LoopInfo &LI, AAResults &AA) {
- bool MadeChange, EverMadeChange = false;
-
- do {
- MadeChange = false;
- LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
- // Process all basic blocks.
- for (BasicBlock &I : F)
- MadeChange |= ProcessBlock(I, DT, LI, AA);
- EverMadeChange |= MadeChange;
- NumSinkIter++;
- } while (MadeChange);
-
- return EverMadeChange;
-}
-
-PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
-
- if (!iterativelySinkInstructions(F, DT, LI, AA))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
- class SinkingLegacyPass : public FunctionPass {
- public:
- static char ID; // Pass identification
- SinkingLegacyPass() : FunctionPass(ID) {
- initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
-
- return iterativelySinkInstructions(F, DT, LI, AA);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
- };
-} // end anonymous namespace
-
-char SinkingLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
-
-FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Sink" << *Inst << " (";
+ Inst->getParent()->printAsOperand(dbgs(), false); dbgs() << " -> ";
+ SuccToSinkTo->printAsOperand(dbgs(), false); dbgs() << ")\n");
+
+ // Move the instruction.
+ Inst->moveBefore(&*SuccToSinkTo->getFirstInsertionPt());
+ return true;
+}
+
+static bool ProcessBlock(BasicBlock &BB, DominatorTree &DT, LoopInfo &LI,
+ AAResults &AA) {
+ // Can't sink anything out of a block that has less than two successors.
+ if (BB.getTerminator()->getNumSuccessors() <= 1) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT.isReachableFromEntry(&BB)) return false;
+
+ bool MadeChange = false;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ BasicBlock::iterator I = BB.end();
+ --I;
+ bool ProcessedBegin = false;
+ SmallPtrSet<Instruction *, 8> Stores;
+ do {
+ Instruction *Inst = &*I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == BB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (isa<DbgInfoIntrinsic>(Inst))
+ continue;
+
+ if (SinkInstruction(Inst, Stores, DT, LI, AA)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+static bool iterativelySinkInstructions(Function &F, DominatorTree &DT,
+ LoopInfo &LI, AAResults &AA) {
+ bool MadeChange, EverMadeChange = false;
+
+ do {
+ MadeChange = false;
+ LLVM_DEBUG(dbgs() << "Sinking iteration " << NumSinkIter << "\n");
+ // Process all basic blocks.
+ for (BasicBlock &I : F)
+ MadeChange |= ProcessBlock(I, DT, LI, AA);
+ EverMadeChange |= MadeChange;
+ NumSinkIter++;
+ } while (MadeChange);
+
+ return EverMadeChange;
+}
+
+PreservedAnalyses SinkingPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+
+ if (!iterativelySinkInstructions(F, DT, LI, AA))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+ class SinkingLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification
+ SinkingLegacyPass() : FunctionPass(ID) {
+ initializeSinkingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ return iterativelySinkInstructions(F, DT, LI, AA);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+ };
+} // end anonymous namespace
+
+char SinkingLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SinkingLegacyPass, "sink", "Code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(SinkingLegacyPass, "sink", "Code sinking", false, false)
+
+FunctionPass *llvm::createSinkingPass() { return new SinkingLegacyPass(); }
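
As a usage note for the two pass interfaces restored above (the new-PM SinkingPass and the legacy SinkingLegacyPass), here is a minimal, hypothetical sketch of driving SinkingPass through the new pass manager. The helper name and the PassBuilder boilerplate are assumptions for illustration; SinkingPass::run itself pulls DominatorTree, LoopInfo, and alias analysis results from the FunctionAnalysisManager exactly as shown in the diff.

// Illustrative sketch (not part of this commit): run the sinking pass
// defined above on every function in a module via the new pass manager.
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Scalar/Sink.h"

using namespace llvm;

static void runSinkOnModule(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the standard analyses (DominatorTreeAnalysis, LoopAnalysis,
  // AAManager, ...) that SinkingPass::run requests from the FAM.
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(SinkingPass());
  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM)));
  MPM.run(M, MAM);
}

For the legacy pipeline, createSinkingPass() (last line of the diff above) returns the SinkingLegacyPass wrapper, which the INITIALIZE_PASS macros register under the name "sink".
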
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index f5529f9e4f..9b18c945d9 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -1,830 +1,830 @@
-//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "spec-phis"
-
-STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
-STATISTIC(NumEdgesSplit,
- "Number of critical edges which were split for speculation");
-STATISTIC(NumSpeculatedInstructions,
- "Number of instructions we speculated around the PHI nodes");
-STATISTIC(NumNewRedundantInstructions,
- "Number of new, redundant instructions inserted");
-
-/// Check whether speculating the users of a PHI node around the PHI
-/// will be safe.
-///
-/// This checks both that all of the users are safe and also that all of their
-/// operands are either recursively safe or already available along an incoming
-/// edge to the PHI.
-///
-/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
-/// and the chain of nodes that definitively reach any unsafe node in
-/// `UnsafeSet`. By preserving these between repeated calls to this routine for
-/// PHIs in the same basic block, the exploration here can be reused. However,
-/// these caches must not be reused for PHIs in a different basic block as they
-/// reflect what is available along incoming edges.
-static bool
-isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallPtrSetImpl<Instruction *> &UnsafeSet) {
- auto *PhiBB = PN.getParent();
- SmallPtrSet<Instruction *, 4> Visited;
- SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
-
- // Walk each user of the PHI node.
- for (Use &U : PN.uses()) {
- auto *UI = cast<Instruction>(U.getUser());
-
- // Ensure the use post-dominates the PHI node. This ensures that, in the
- // absence of unwinding, the use will actually be reached.
- // FIXME: We use a blunt hammer of requiring them to be in the same basic
- // block. We should consider using actual post-dominance here in the
- // future.
- if (UI->getParent() != PhiBB) {
- LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
- return false;
- }
-
- if (const auto *CS = dyn_cast<CallBase>(UI)) {
- if (CS->isConvergent() || CS->cannotDuplicate()) {
- LLVM_DEBUG(dbgs() << " Unsafe: convergent "
- "callsite cannot be duplicated: " << *UI << '\n');
- return false;
- }
- }
-
- // FIXME: This check is much too conservative. We're not going to move these
- // instructions onto new dynamic paths through the program unless there is
- // a call instruction between the use and the PHI node. And memory isn't
- // changing unless there is a store in that same sequence. We should
- // probably change this to do at least a limited scan of the intervening
- // instructions and allow handling stores in easily proven safe cases.
- if (mayBeMemoryDependent(*UI)) {
- LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
- return false;
- }
-
- // Now do a depth-first search of everything these users depend on to make
- // sure they are transitively safe. This is a depth-first search, but we
- // check nodes in preorder to minimize the amount of checking.
- Visited.insert(UI);
- DFSStack.push_back({UI, UI->value_op_begin()});
- do {
- User::value_op_iterator OpIt;
- std::tie(UI, OpIt) = DFSStack.pop_back_val();
-
- while (OpIt != UI->value_op_end()) {
- auto *OpI = dyn_cast<Instruction>(*OpIt);
- // Increment to the next operand for whenever we continue.
- ++OpIt;
- // No need to visit non-instructions, which can't form dependencies.
- if (!OpI)
- continue;
-
- // Now do the main pre-order checks that this operand is a viable
- // dependency of something we want to speculate.
-
- // First do a few checks for instructions that won't require
- // speculation at all because they are trivially available on the
- // incoming edge (either through dominance or through an incoming value
- // to a PHI).
- //
- // The cases in the current block will be trivially dominated by the
- // edge.
- auto *ParentBB = OpI->getParent();
- if (ParentBB == PhiBB) {
- if (isa<PHINode>(OpI)) {
- // We can trivially map through phi nodes in the same block.
- continue;
- }
- } else if (DT.dominates(ParentBB, PhiBB)) {
- // Instructions from dominating blocks are already available.
- continue;
- }
-
- // Once we know that we're considering speculating the operand, check
- // if we've already explored this subgraph and found it to be safe.
- if (PotentialSpecSet.count(OpI))
- continue;
-
- // If we've already explored this subgraph and found it unsafe, bail.
- // If when we directly test whether this is safe it fails, bail.
- if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
- mayBeMemoryDependent(*OpI)) {
- LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
- << *OpI << "\n");
- // Record the stack of instructions which reach this node as unsafe
- // so we prune subsequent searches.
- UnsafeSet.insert(OpI);
- for (auto &StackPair : DFSStack) {
- Instruction *I = StackPair.first;
- UnsafeSet.insert(I);
- }
- return false;
- }
-
- // Skip any operands we're already recursively checking.
- if (!Visited.insert(OpI).second)
- continue;
-
- // Push onto the stack and descend. We can directly continue this
- // loop when ascending.
- DFSStack.push_back({UI, OpIt});
- UI = OpI;
- OpIt = OpI->value_op_begin();
- }
-
- // This node and all its operands are safe. Go ahead and cache that for
- // reuse later.
- PotentialSpecSet.insert(UI);
-
- // Continue with the next node on the stack.
- } while (!DFSStack.empty());
- }
-
-#ifndef NDEBUG
- // Every visited operand should have been marked as safe for speculation at
- // this point. Verify this and return success.
- for (auto *I : Visited)
- assert(PotentialSpecSet.count(I) &&
- "Failed to mark a visited instruction as safe!");
-#endif
- return true;
-}
-
-/// Check whether, in isolation, a given PHI node is both safe and profitable
-/// to speculate users around.
-///
-/// This handles checking whether there are any constant operands to a PHI
-/// which could represent a useful speculation candidate, whether the users of
-/// the PHI are safe to speculate including all their transitive dependencies,
-/// and whether after speculation there will be some cost savings (profit) to
-/// folding the operands into the users of the PHI node. Returns true if both
-/// safe and profitable with relevant cost savings updated in the map and with
-/// an update to the `PotentialSpecSet`. Returns false if either safety or
-/// profitability are absent. Some new entries may be made to the
-/// `PotentialSpecSet` even when this routine returns false, but they remain
-/// conservatively correct.
-///
-/// The profitability check here is a local one, but it checks this in an
-/// interesting way. Beyond checking that the total cost of materializing the
-/// constants will be less than the cost of folding them into their users, it
-/// also checks that no one incoming constant will have a higher cost when
-/// folded into its users rather than materialized. This higher cost could
-/// result in a dynamic *path* that is more expensive even when the total cost
-/// is lower. Currently, all of the interesting cases where this optimization
-/// should fire are ones where it is a no-loss operation in this sense. If we
-/// ever want to be more aggressive here, we would need to balance the
-/// different incoming edges' cost by looking at their respective
-/// probabilities.
-static bool isSafeAndProfitableToSpeculateAroundPHI(
- PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
- TargetTransformInfo &TTI) {
- // First see whether there is any cost savings to speculating around this
- // PHI, and build up a map of the constant inputs to how many times they
- // occur.
- bool NonFreeMat = false;
- struct CostsAndCount {
- int MatCost = TargetTransformInfo::TCC_Free;
- int FoldedCost = TargetTransformInfo::TCC_Free;
- int Count = 0;
- };
- SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
- SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
- for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
- auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
- if (!IncomingC)
- continue;
-
- // Only visit each incoming edge with a constant input once.
- if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
- continue;
-
- auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
- // Count how many edges share a given incoming constant.
- ++InsertResult.first->second.Count;
- // Only compute the cost the first time we see a particular constant.
- if (!InsertResult.second)
- continue;
-
- int &MatCost = InsertResult.first->second.MatCost;
- MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- NonFreeMat |= MatCost != TTI.TCC_Free;
- }
- if (!NonFreeMat) {
- LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
- // No profit in free materialization.
- return false;
- }
-
- // Now check that the uses of this PHI can actually be speculated,
- // otherwise we'll still have to materialize the PHI value.
- if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
- LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
- return false;
- }
-
- // Compute how much (if any) savings are available by speculating around this
- // PHI.
- for (Use &U : PN.uses()) {
- auto *UserI = cast<Instruction>(U.getUser());
- // Now check whether there is any savings to folding the incoming constants
- // into this use.
- unsigned Idx = U.getOperandNo();
-
- // If we have a binary operator that is commutative, an actual constant
- // operand would end up on the RHS, so pretend the use of the PHI is on the
- // RHS.
- //
- // Technically, this is a bit weird if *both* operands are PHIs we're
- // speculating. But if that is the case, giving an "optimistic" cost isn't
- // a bad thing because after speculation it will constant fold. And
- // moreover, such cases should likely have been constant folded already by
- // some other pass, so we shouldn't worry about "modeling" them terribly
- // accurately here. Similarly, if the other operand is a constant, it still
- // seems fine to be "optimistic" in our cost modeling, because when the
- // incoming operand from the PHI node is also a constant, we will end up
- // constant folding.
- if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
- // Assume we will commute the constant to the RHS to be canonical.
- Idx = 1;
-
- // Get the intrinsic ID if this user is an intrinsic.
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
- if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
- IID = UserII->getIntrinsicID();
-
- for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
- ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
- int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
- int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
- if (IID)
- FoldedCost +=
- TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
- IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
- else
- FoldedCost +=
- TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
- IncomingC->getValue(), IncomingC->getType(),
- TargetTransformInfo::TCK_SizeAndLatency);
-
- // If we accumulate more folded cost for this incoming constant than
- // materialized cost, then we'll regress any edge with this constant so
- // just bail. We're only interested in cases where folding the incoming
- // constants is at least break-even on all paths.
- if (FoldedCost > MatCost) {
- LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
- << "\n"
- " Materializing cost: "
- << MatCost
- << "\n"
- " Accumulated folded cost: "
- << FoldedCost << "\n");
- return false;
- }
- }
- }
-
- // Compute the total cost savings afforded by this PHI node.
- int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
- for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
- int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
- int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
- int Count = IncomingConstantAndCostsAndCount.second.Count;
-
- TotalMatCost += MatCost * Count;
- TotalFoldedCost += FoldedCost * Count;
- }
- assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
- "less than its materialized cost, "
- "the sum must be as well.");
-
- LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
- << ": " << PN << "\n");
- CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
- return true;
-}
-
-/// Simple helper to walk all the users of a list of phis depth first, and call
-/// a visit function on each one in post-order.
-///
-/// All of the PHIs should be in the same basic block, and this is primarily
-/// used to make a single depth-first walk across their collective users
-/// without revisiting any subgraphs. Callers should provide a fast, idempotent
-/// callable to test whether a node has been visited and the more important
-/// callable to actually visit a particular node.
-///
-/// Depth-first and postorder here refer to the *operand* graph -- we start
-/// from a collection of users of PHI nodes and walk "up" the operands
-/// depth-first.
-template <typename IsVisitedT, typename VisitT>
-static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
- IsVisitedT IsVisited,
- VisitT Visit) {
- SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
- for (auto *PN : PNs)
- for (Use &U : PN->uses()) {
- auto *UI = cast<Instruction>(U.getUser());
- if (IsVisited(UI))
- // Already visited this user, continue across the roots.
- continue;
-
- // Otherwise, walk the operand graph depth-first and visit each
- // dependency in postorder.
- DFSStack.push_back({UI, UI->value_op_begin()});
- do {
- User::value_op_iterator OpIt;
- std::tie(UI, OpIt) = DFSStack.pop_back_val();
- while (OpIt != UI->value_op_end()) {
- auto *OpI = dyn_cast<Instruction>(*OpIt);
- // Increment to the next operand for whenever we continue.
- ++OpIt;
- // No need to visit non-instructions, which can't form dependencies,
- // or instructions outside of our potential dependency set that we
- // were given. Finally, if we've already visited the node, continue
- // to the next.
- if (!OpI || IsVisited(OpI))
- continue;
-
- // Push onto the stack and descend. We can directly continue this
- // loop when ascending.
- DFSStack.push_back({UI, OpIt});
- UI = OpI;
- OpIt = OpI->value_op_begin();
- }
-
- // Finished visiting children, visit this node.
- assert(!IsVisited(UI) && "Should not have already visited a node!");
- Visit(UI);
- } while (!DFSStack.empty());
- }
-}
-
-/// Find profitable PHIs to speculate.
-///
-/// For a PHI node to be profitable, we need the cost of speculating its users
-/// (and their dependencies) to not exceed the savings of folding the PHI's
-/// constant operands into the speculated users.
-///
-/// Computing this is surprisingly challenging. Because users of two different
-/// PHI nodes can depend on each other or on common other instructions, it may
-/// be profitable to speculate two PHI nodes together even though neither one
-/// in isolation is profitable. The straightforward way to find all the
-/// profitable PHIs would be to check each combination of PHIs' cost, but this
-/// is exponential in complexity.
-///
-/// Even if we assume that we only care about cases where we can consider each
-/// PHI node in isolation (rather than considering cases where none are
-/// profitable in isolation but some subset are profitable as a set), we still
-/// have a challenge. The obvious way to find all individually profitable PHIs
-/// is to iterate until reaching a fixed point, but this will be quadratic in
-/// complexity. =/
-///
-/// This code currently uses a linear-to-compute order for a greedy approach.
-/// It won't find cases where a set of PHIs must be considered together, but it
-/// handles most cases of order dependence without quadratic iteration. The
-/// specific order used is the post-order across the operand DAG. When the last
-/// user of a PHI is visited in this postorder walk, we check it for
-/// profitability.
-///
-/// There is an orthogonal extra complexity to all of this: computing the cost
-/// itself can easily become a linear computation making everything again (at
-/// best) quadratic. Using a postorder over the operand graph makes it
-/// particularly easy to avoid this through dynamic programming. As we do the
-/// postorder walk, we build the transitive cost of that subgraph. It is also
-/// straightforward to then update these costs when we mark a PHI for
-/// speculation so that subsequent PHIs don't re-pay the cost of already
-/// speculated instructions.
-static SmallVector<PHINode *, 16>
-findProfitablePHIs(ArrayRef<PHINode *> PNs,
- const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
- const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
- SmallVector<PHINode *, 16> SpecPNs;
-
- // First, establish a reverse mapping from immediate users of the PHI nodes
- // to the nodes themselves, and count how many users each PHI node has in
- // a way we can update while processing them.
- SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
- SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
- SmallPtrSet<Instruction *, 16> UserSet;
- for (auto *PN : PNs) {
- assert(UserSet.empty() && "Must start with an empty user set!");
- for (Use &U : PN->uses())
- UserSet.insert(cast<Instruction>(U.getUser()));
- PNUserCountMap[PN] = UserSet.size();
- for (auto *UI : UserSet)
- UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
- UserSet.clear();
- }
-
- // Now do a DFS across the operand graph of the users, computing cost as we
- // go and when all costs for a given PHI are known, checking that PHI for
- // profitability.
- SmallDenseMap<Instruction *, int, 16> SpecCostMap;
- visitPHIUsersAndDepsInPostOrder(
- PNs,
- /*IsVisited*/
- [&](Instruction *I) {
- // We consider anything that isn't potentially speculated to be
- // "visited" as it is already handled. Similarly, anything that *is*
- // potentially speculated but for which we have an entry in our cost
- // map, we're done.
- return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
- },
- /*Visit*/
- [&](Instruction *I) {
- // We've fully visited the operands, so sum their cost with this node
- // and update the cost map.
- int Cost = TTI.TCC_Free;
- for (Value *OpV : I->operand_values())
- if (auto *OpI = dyn_cast<Instruction>(OpV)) {
- auto CostMapIt = SpecCostMap.find(OpI);
- if (CostMapIt != SpecCostMap.end())
- Cost += CostMapIt->second;
- }
- Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
- bool Inserted = SpecCostMap.insert({I, Cost}).second;
- (void)Inserted;
- assert(Inserted && "Must not re-insert a cost during the DFS!");
-
- // Now check if this node had a corresponding PHI node using it. If so,
- // we need to decrement the outstanding user count for it.
- auto UserPNsIt = UserToPNMap.find(I);
- if (UserPNsIt == UserToPNMap.end())
- return;
- auto &UserPNs = UserPNsIt->second;
- auto UserPNsSplitIt = std::stable_partition(
- UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
- int &PNUserCount = PNUserCountMap.find(UserPN)->second;
- assert(
- PNUserCount > 0 &&
- "Should never re-visit a PN after its user count hits zero!");
- --PNUserCount;
- return PNUserCount != 0;
- });
-
- // FIXME: Rather than one at a time, we should sum the savings as the
- // cost will be completely shared.
- SmallVector<Instruction *, 16> SpecWorklist;
- for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
- int SpecCost = TTI.TCC_Free;
- for (Use &U : PN->uses())
- SpecCost +=
- SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
- SpecCost *= (NumPreds - 1);
- // When the user count of a PHI node hits zero, we should check its
- // profitability. If profitable, we should mark it for speculation
- // and zero out the cost of everything it depends on.
- int CostSavings = CostSavingsMap.find(PN)->second;
- if (SpecCost > CostSavings) {
- LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
- << "\n"
- " Cost savings: "
- << CostSavings
- << "\n"
- " Speculation cost: "
- << SpecCost << "\n");
- continue;
- }
-
- // We're going to speculate this user-associated PHI. Copy it out and
- // add its users to the worklist to update their cost.
- SpecPNs.push_back(PN);
- for (Use &U : PN->uses()) {
- auto *UI = cast<Instruction>(U.getUser());
- auto CostMapIt = SpecCostMap.find(UI);
- if (CostMapIt->second == 0)
- continue;
- // Zero out this cost entry to avoid duplicates.
- CostMapIt->second = 0;
- SpecWorklist.push_back(UI);
- }
- }
-
- // Now walk all the operands of the users in the worklist transitively
- // to zero out all the memoized costs.
- while (!SpecWorklist.empty()) {
- Instruction *SpecI = SpecWorklist.pop_back_val();
- assert(SpecCostMap.find(SpecI)->second == 0 &&
- "Didn't zero out a cost!");
-
- // Walk the operands recursively to zero out their cost as well.
- for (auto *OpV : SpecI->operand_values()) {
- auto *OpI = dyn_cast<Instruction>(OpV);
- if (!OpI)
- continue;
- auto CostMapIt = SpecCostMap.find(OpI);
- if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
- continue;
- CostMapIt->second = 0;
- SpecWorklist.push_back(OpI);
- }
- }
- });
-
- return SpecPNs;
-}
-
-/// Speculate users around a set of PHI nodes.
-///
-/// This routine does the actual speculation around a set of PHI nodes where we
-/// have determined this to be both safe and profitable.
-///
-/// This routine handles any splitting of critical edges necessary to create
-/// a safe block to speculate into as well as cloning the instructions and
-/// rewriting all uses.
-static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
- SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
- SmallSetVector<BasicBlock *, 16> &PredSet,
- DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
- NumPHIsSpeculated += SpecPNs.size();
-
- // Split any critical edges so that we have a block to hoist into.
- auto *ParentBB = SpecPNs[0]->getParent();
- SmallVector<BasicBlock *, 16> SpecPreds;
- SpecPreds.reserve(PredSet.size());
- for (auto *PredBB : PredSet) {
- auto *NewPredBB = SplitCriticalEdge(
- PredBB, ParentBB,
- CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
- if (NewPredBB) {
- ++NumEdgesSplit;
- LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
- << "\n");
- SpecPreds.push_back(NewPredBB);
- } else {
- assert(PredBB->getSingleSuccessor() == ParentBB &&
- "We need a non-critical predecessor to speculate into.");
- assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
- "Cannot have a non-critical invoke!");
-
- // Already non-critical, use existing pred.
- SpecPreds.push_back(PredBB);
- }
- }
-
- SmallPtrSet<Instruction *, 16> SpecSet;
- SmallVector<Instruction *, 16> SpecList;
- visitPHIUsersAndDepsInPostOrder(SpecPNs,
- /*IsVisited*/
- [&](Instruction *I) {
- // This is visited if we don't need to
- // speculate it or we already have
- // speculated it.
- return !PotentialSpecSet.count(I) ||
- SpecSet.count(I);
- },
- /*Visit*/
- [&](Instruction *I) {
- // All operands scheduled, schedule this
- // node.
- SpecSet.insert(I);
- SpecList.push_back(I);
- });
-
- int NumSpecInsts = SpecList.size() * SpecPreds.size();
- int NumRedundantInsts = NumSpecInsts - SpecList.size();
- LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
- << " speculated instructions, " << NumRedundantInsts
- << " redundancies\n");
- NumSpeculatedInstructions += NumSpecInsts;
- NumNewRedundantInstructions += NumRedundantInsts;
-
- // Each predecessor is numbered by its index in `SpecPreds`, so for each
- // instruction we speculate, the speculated instruction is stored in that
- // index of the vector associated with the original instruction. We also
- // store the incoming values for each predecessor from any PHIs used.
- SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
-
- // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
- // value. This handles both the PHIs we are speculating around and any other
- // PHIs that happen to be used.
- for (auto *OrigI : SpecList)
- for (auto *OpV : OrigI->operand_values()) {
- auto *OpPN = dyn_cast<PHINode>(OpV);
- if (!OpPN || OpPN->getParent() != ParentBB)
- continue;
-
- auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
- if (!InsertResult.second)
- continue;
-
- auto &SpeculatedVals = InsertResult.first->second;
-
- // Populating our structure for mapping is particularly annoying because
- // finding an incoming value for a particular predecessor block in a PHI
- // node is a linear time operation! To avoid quadratic behavior, we build
- // a map for this PHI node's incoming values and then translate it into
- // the more compact representation used below.
- SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
- for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
- IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
-
- for (auto *PredBB : SpecPreds)
- SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
- }
-
- // Speculate into each predecessor.
- for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
- auto *PredBB = SpecPreds[PredIdx];
- assert(PredBB->getSingleSuccessor() == ParentBB &&
- "We need a non-critical predecessor to speculate into.");
-
- for (auto *OrigI : SpecList) {
- auto *NewI = OrigI->clone();
- NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
- NewI->insertBefore(PredBB->getTerminator());
-
- // Rewrite all the operands to the previously speculated instructions.
- // Because we're walking in-order, the defs must precede the uses and we
- // should already have these mappings.
- for (Use &U : NewI->operands()) {
- auto *OpI = dyn_cast<Instruction>(U.get());
- if (!OpI)
- continue;
- auto MapIt = SpeculatedValueMap.find(OpI);
- if (MapIt == SpeculatedValueMap.end())
- continue;
- const auto &SpeculatedVals = MapIt->second;
- assert(SpeculatedVals[PredIdx] &&
- "Must have a speculated value for this predecessor!");
- assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
- "Speculated value has the wrong type!");
-
- // Rewrite the use to this predecessor's speculated instruction.
- U.set(SpeculatedVals[PredIdx]);
- }
-
- // Commute instructions which now have a constant in the LHS but not the
- // RHS.
- if (NewI->isBinaryOp() && NewI->isCommutative() &&
- isa<Constant>(NewI->getOperand(0)) &&
- !isa<Constant>(NewI->getOperand(1)))
- NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
-
- SpeculatedValueMap[OrigI].push_back(NewI);
- assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
- "Mismatched speculated instruction index!");
- }
- }
-
- // Walk the speculated instruction list and if they have uses, insert a PHI
- // for them from the speculated versions, and replace the uses with the PHI.
- // Then erase the instructions as they have been fully speculated. The walk
- // needs to be in reverse so that we don't think there are users when we'll
- // actually eventually remove them later.
- IRBuilder<> IRB(SpecPNs[0]);
- for (auto *OrigI : llvm::reverse(SpecList)) {
- // Check if we need a PHI for any remaining users and if so, insert it.
- if (!OrigI->use_empty()) {
- auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
- Twine(OrigI->getName()) + ".phi");
- // Add the incoming values we speculated.
- auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
- for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
- SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
-
- // And replace the uses with the PHI node.
- OrigI->replaceAllUsesWith(SpecIPN);
- }
-
- // It is important to immediately erase this so that it stops using other
- // instructions. This avoids inserting needless PHIs of them.
- OrigI->eraseFromParent();
- }
-
- // All of the uses of the speculated phi nodes should be removed at this
- // point, so erase them.
- for (auto *SpecPN : SpecPNs) {
- assert(SpecPN->use_empty() && "All users should have been speculated!");
- SpecPN->eraseFromParent();
- }
-}
-
-/// Try to speculate around a series of PHIs from a single basic block.
-///
-/// This routine checks whether any of these PHIs are profitable to speculate
-/// users around. If safe and profitable, it does the speculation. It returns
-/// true when at least some speculation occurs.
-static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
- DominatorTree &DT, TargetTransformInfo &TTI) {
- LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
-
- // Savings in cost from speculating around a PHI node.
- SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
-
- // Remember the set of instructions that are candidates for speculation so
- // that we can quickly walk things within that space. This prunes out
- // instructions already available along edges, etc.
- SmallPtrSet<Instruction *, 16> PotentialSpecSet;
-
- // Remember the set of instructions that are (transitively) unsafe to
- // speculate into the incoming edges of this basic block. This avoids
- // recomputing them for each PHI node we check. This set is specific to this
- // block though as things are pruned out of it based on what is available
- // along incoming edges.
- SmallPtrSet<Instruction *, 16> UnsafeSet;
-
- // For each PHI node in this block, check whether there are immediate folding
- // opportunities from speculation, and whether that speculation will be
- // valid. This determines the set of safe PHIs to speculate.
+//===- SpeculateAroundPHIs.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculateAroundPHIs.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spec-phis"
+
+STATISTIC(NumPHIsSpeculated, "Number of PHI nodes we speculated around");
+STATISTIC(NumEdgesSplit,
+ "Number of critical edges which were split for speculation");
+STATISTIC(NumSpeculatedInstructions,
+ "Number of instructions we speculated around the PHI nodes");
+STATISTIC(NumNewRedundantInstructions,
+ "Number of new, redundant instructions inserted");
+
+/// Check whether speculating the users of a PHI node around the PHI
+/// will be safe.
+///
+/// This checks both that all of the users are safe and also that all of their
+/// operands are either recursively safe or already available along an incoming
+/// edge to the PHI.
+///
+/// This routine caches both all the safe nodes explored in `PotentialSpecSet`
+/// and the chain of nodes that definitively reach any unsafe node in
+/// `UnsafeSet`. By preserving these between repeated calls to this routine for
+/// PHIs in the same basic block, the exploration here can be reused. However,
+/// these caches must not be reused for PHIs in a different basic block as they
+/// reflect what is available along incoming edges.
+static bool
+isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet) {
+ auto *PhiBB = PN.getParent();
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+
+ // Walk each user of the PHI node.
+ for (Use &U : PN.uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+
+ // Ensure the use post-dominates the PHI node. This ensures that, in the
+ // absence of unwinding, the use will actually be reached.
+ // FIXME: We use a blunt hammer of requiring them to be in the same basic
+ // block. We should consider using actual post-dominance here in the
+ // future.
+ if (UI->getParent() != PhiBB) {
+ LLVM_DEBUG(dbgs() << " Unsafe: use in a different BB: " << *UI << "\n");
+ return false;
+ }
+
+ if (const auto *CS = dyn_cast<CallBase>(UI)) {
+ if (CS->isConvergent() || CS->cannotDuplicate()) {
+ LLVM_DEBUG(dbgs() << " Unsafe: convergent "
+ "callsite cannot be duplicated: " << *UI << '\n');
+ return false;
+ }
+ }
+
+ // FIXME: This check is much too conservative. We're not going to move these
+ // instructions onto new dynamic paths through the program unless there is
+ // a call instruction between the use and the PHI node. And memory isn't
+ // changing unless there is a store in that same sequence. We should
+ // probably change this to do at least a limited scan of the intervening
+ // instructions and allow handling stores in easily proven safe cases.
+ if (mayBeMemoryDependent(*UI)) {
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate use: " << *UI << "\n");
+ return false;
+ }
+
+ // Now do a depth-first search of everything these users depend on to make
+ // sure they are transitively safe. This is a depth-first search, but we
+ // check nodes in preorder to minimize the amount of checking.
+ Visited.insert(UI);
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies.
+ if (!OpI)
+ continue;
+
+ // Now do the main pre-order checks that this operand is a viable
+ // dependency of something we want to speculate.
+
+ // First do a few checks for instructions that won't require
+ // speculation at all because they are trivially available on the
+ // incoming edge (either through dominance or through an incoming value
+ // to a PHI).
+ //
+ // The cases in the current block will be trivially dominated by the
+ // edge.
+ auto *ParentBB = OpI->getParent();
+ if (ParentBB == PhiBB) {
+ if (isa<PHINode>(OpI)) {
+ // We can trivially map through phi nodes in the same block.
+ continue;
+ }
+ } else if (DT.dominates(ParentBB, PhiBB)) {
+ // Instructions from dominating blocks are already available.
+ continue;
+ }
+
+ // Once we know that we're considering speculating the operand, check
+ // if we've already explored this subgraph and found it to be safe.
+ if (PotentialSpecSet.count(OpI))
+ continue;
+
+ // If we've already explored this subgraph and found it unsafe, bail.
+ // If when we directly test whether this is safe it fails, bail.
+ if (UnsafeSet.count(OpI) || ParentBB != PhiBB ||
+ mayBeMemoryDependent(*OpI)) {
+ LLVM_DEBUG(dbgs() << " Unsafe: can't speculate transitive use: "
+ << *OpI << "\n");
+ // Record the stack of instructions which reach this node as unsafe
+ // so we prune subsequent searches.
+ UnsafeSet.insert(OpI);
+ for (auto &StackPair : DFSStack) {
+ Instruction *I = StackPair.first;
+ UnsafeSet.insert(I);
+ }
+ return false;
+ }
+
+ // Skip any operands we're already recursively checking.
+ if (!Visited.insert(OpI).second)
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // This node and all its operands are safe. Go ahead and cache that for
+ // reuse later.
+ PotentialSpecSet.insert(UI);
+
+ // Continue with the next node on the stack.
+ } while (!DFSStack.empty());
+ }
+
+#ifndef NDEBUG
+ // Every visited operand should have been marked as safe for speculation at
+ // this point. Verify this and return success.
+ for (auto *I : Visited)
+ assert(PotentialSpecSet.count(I) &&
+ "Failed to mark a visited instruction as safe!");
+#endif
+ return true;
+}
+
+/// Check whether, in isolation, a given PHI node is both safe and profitable
+/// to speculate users around.
+///
+/// This handles checking whether there are any constant operands to a PHI
+/// which could represent a useful speculation candidate, whether the users of
+/// the PHI are safe to speculate including all their transitive dependencies,
+/// and whether after speculation there will be some cost savings (profit) to
+/// folding the operands into the users of the PHI node. Returns true if both
+/// safe and profitable with relevant cost savings updated in the map and with
+/// an update to the `PotentialSpecSet`. Returns false if either safety or
+/// profitability are absent. Some new entries may be made to the
+/// `PotentialSpecSet` even when this routine returns false, but they remain
+/// conservatively correct.
+///
+/// The profitability check here is a local one, but it checks this in an
+/// interesting way. Beyond checking that the total cost of materializing the
+/// constants will be less than the cost of folding them into their users, it
+/// also checks that no one incoming constant will have a higher cost when
+/// folded into its users rather than materialized. This higher cost could
+/// result in a dynamic *path* that is more expensive even when the total cost
+/// is lower. Currently, all of the interesting cases where this optimization
+/// should fire are ones where it is a no-loss operation in this sense. If we
+/// ever want to be more aggressive here, we would need to balance the
+/// different incoming edges' cost by looking at their respective
+/// probabilities.
+static bool isSafeAndProfitableToSpeculateAroundPHI(
+ PHINode &PN, SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallPtrSetImpl<Instruction *> &UnsafeSet, DominatorTree &DT,
+ TargetTransformInfo &TTI) {
+ // First see whether there is any cost savings to speculating around this
+ // PHI, and build up a map of the constant inputs to how many times they
+ // occur.
+ bool NonFreeMat = false;
+ struct CostsAndCount {
+ int MatCost = TargetTransformInfo::TCC_Free;
+ int FoldedCost = TargetTransformInfo::TCC_Free;
+ int Count = 0;
+ };
+ SmallDenseMap<ConstantInt *, CostsAndCount, 16> CostsAndCounts;
+ SmallPtrSet<BasicBlock *, 16> IncomingConstantBlocks;
+ for (int i : llvm::seq<int>(0, PN.getNumIncomingValues())) {
+ auto *IncomingC = dyn_cast<ConstantInt>(PN.getIncomingValue(i));
+ if (!IncomingC)
+ continue;
+
+ // Only visit each incoming edge with a constant input once.
+ if (!IncomingConstantBlocks.insert(PN.getIncomingBlock(i)).second)
+ continue;
+
+ auto InsertResult = CostsAndCounts.insert({IncomingC, {}});
+ // Count how many edges share a given incoming constant.
+ ++InsertResult.first->second.Count;
+ // Only compute the cost the first time we see a particular constant.
+ if (!InsertResult.second)
+ continue;
+
+ int &MatCost = InsertResult.first->second.MatCost;
+ MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ NonFreeMat |= MatCost != TTI.TCC_Free;
+ }
+ if (!NonFreeMat) {
+ LLVM_DEBUG(dbgs() << " Free: " << PN << "\n");
+ // No profit in free materialization.
+ return false;
+ }
+
+ // Now check that the uses of this PHI can actually be speculated,
+ // otherwise we'll still have to materialize the PHI value.
+ if (!isSafeToSpeculatePHIUsers(PN, DT, PotentialSpecSet, UnsafeSet)) {
+ LLVM_DEBUG(dbgs() << " Unsafe PHI: " << PN << "\n");
+ return false;
+ }
+
+ // Compute how much (if any) savings are available by speculating around this
+ // PHI.
+ for (Use &U : PN.uses()) {
+ auto *UserI = cast<Instruction>(U.getUser());
+ // Now check whether there is any savings to folding the incoming constants
+ // into this use.
+ unsigned Idx = U.getOperandNo();
+
+ // If we have a binary operator that is commutative, an actual constant
+ // operand would end up on the RHS, so pretend the use of the PHI is on the
+ // RHS.
+ //
+ // Technically, this is a bit weird if *both* operands are PHIs we're
+ // speculating. But if that is the case, giving an "optimistic" cost isn't
+ // a bad thing because after speculation it will constant fold. And
+ // moreover, such cases should likely have been constant folded already by
+ // some other pass, so we shouldn't worry about "modeling" them terribly
+ // accurately here. Similarly, if the other operand is a constant, it still
+ // seems fine to be "optimistic" in our cost modeling, because when the
+ // incoming operand from the PHI node is also a constant, we will end up
+ // constant folding.
+ if (UserI->isBinaryOp() && UserI->isCommutative() && Idx != 1)
+ // Assume we will commute the constant to the RHS to be canonical.
+ Idx = 1;
+
+ // Get the intrinsic ID if this user is an intrinsic.
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (auto *UserII = dyn_cast<IntrinsicInst>(UserI))
+ IID = UserII->getIntrinsicID();
+
+ for (auto &IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ ConstantInt *IncomingC = IncomingConstantAndCostsAndCount.first;
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ if (IID)
+ FoldedCost +=
+ TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
+ IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+ else
+ FoldedCost +=
+ TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
+ IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
+
+ // If we accumulate more folded cost for this incoming constant than
+ // materialized cost, then we'll regress any edge with this constant so
+ // just bail. We're only interested in cases where folding the incoming
+ // constants is at least break-even on all paths.
+ if (FoldedCost > MatCost) {
+ LLVM_DEBUG(dbgs() << " Not profitable to fold imm: " << *IncomingC
+ << "\n"
+ " Materializing cost: "
+ << MatCost
+ << "\n"
+ " Accumulated folded cost: "
+ << FoldedCost << "\n");
+ return false;
+ }
+ }
+ }
+
+ // Compute the total cost savings afforded by this PHI node.
+ int TotalMatCost = TTI.TCC_Free, TotalFoldedCost = TTI.TCC_Free;
+ for (auto IncomingConstantAndCostsAndCount : CostsAndCounts) {
+ int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
+ int FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
+ int Count = IncomingConstantAndCostsAndCount.second.Count;
+
+ TotalMatCost += MatCost * Count;
+ TotalFoldedCost += FoldedCost * Count;
+ }
+ assert(TotalFoldedCost <= TotalMatCost && "If each constant's folded cost is "
+ "no more than its materialized cost, "
+ "the sum must be as well.");
+
+ LLVM_DEBUG(dbgs() << " Cost savings " << (TotalMatCost - TotalFoldedCost)
+ << ": " << PN << "\n");
+ CostSavingsMap[&PN] = TotalMatCost - TotalFoldedCost;
+ return true;
+}
+
+/// Simple helper to walk all the users of a list of phis depth first, and call
+/// a visit function on each one in post-order.
+///
+/// All of the PHIs should be in the same basic block, and this is primarily
+/// used to make a single depth-first walk across their collective users
+/// without revisiting any subgraphs. Callers should provide a fast, idempotent
+/// callable to test whether a node has been visited and the more important
+/// callable to actually visit a particular node.
+///
+/// Depth-first and postorder here refer to the *operand* graph -- we start
+/// from a collection of users of PHI nodes and walk "up" the operands
+/// depth-first.
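+///
+/// A tiny illustration (hypothetical IR, assuming both users are in the
+/// potential speculation set):
+///
+///   %q = add i32 %p, 1
+///   %m = mul i32 %p, %q
+///
+/// Starting from the uses of the PHI %p, Visit(%q) is always called before
+/// Visit(%m): either %q is reached first as a root, or the DFS from %m
+/// descends through its %q operand before %m itself is visited.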
+template <typename IsVisitedT, typename VisitT>
+static void visitPHIUsersAndDepsInPostOrder(ArrayRef<PHINode *> PNs,
+ IsVisitedT IsVisited,
+ VisitT Visit) {
+ SmallVector<std::pair<Instruction *, User::value_op_iterator>, 16> DFSStack;
+ for (auto *PN : PNs)
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ if (IsVisited(UI))
+ // Already visited this user, continue across the roots.
+ continue;
+
+ // Otherwise, walk the operand graph depth-first and visit each
+ // dependency in postorder.
+ DFSStack.push_back({UI, UI->value_op_begin()});
+ do {
+ User::value_op_iterator OpIt;
+ std::tie(UI, OpIt) = DFSStack.pop_back_val();
+ while (OpIt != UI->value_op_end()) {
+ auto *OpI = dyn_cast<Instruction>(*OpIt);
+ // Increment to the next operand for whenever we continue.
+ ++OpIt;
+ // No need to visit non-instructions, which can't form dependencies,
+ // or instructions outside of our potential dependency set that we
+ // were given. Finally, if we've already visited the node, continue
+ // to the next.
+ if (!OpI || IsVisited(OpI))
+ continue;
+
+ // Push onto the stack and descend. We can directly continue this
+ // loop when ascending.
+ DFSStack.push_back({UI, OpIt});
+ UI = OpI;
+ OpIt = OpI->value_op_begin();
+ }
+
+ // Finished visiting children, visit this node.
+ assert(!IsVisited(UI) && "Should not have already visited a node!");
+ Visit(UI);
+ } while (!DFSStack.empty());
+ }
+}
+
+/// Find profitable PHIs to speculate.
+///
+/// For a PHI node to be profitable, we need the cost of speculating its users
+/// (and their dependencies) to not exceed the savings of folding the PHI's
+/// constant operands into the speculated users.
+///
+/// Computing this is surprisingly challenging. Because users of two different
+/// PHI nodes can depend on each other or on common other instructions, it may
+/// be profitable to speculate two PHI nodes together even though neither one
+/// in isolation is profitable. The straightforward way to find all the
+/// profitable PHIs would be to check each combination of PHIs' cost, but this
+/// is exponential in complexity.
+///
+/// Even if we assume that we only care about cases where we can consider each
+/// PHI node in isolation (rather than considering cases where none are
+/// profitable in isolation but some subset are profitable as a set), we still
+/// have a challenge. The obvious way to find all individually profitable PHIs
+/// is to iterate until reaching a fixed point, but this will be quadratic in
+/// complexity. =/
+///
+/// This code currently uses a linear-to-compute order for a greedy approach.
+/// It won't find cases where a set of PHIs must be considered together, but it
+/// handles most cases of order dependence without quadratic iteration. The
+/// specific order used is the post-order across the operand DAG. When the last
+/// user of a PHI is visited in this postorder walk, we check it for
+/// profitability.
+///
+/// There is an orthogonal extra complexity to all of this: computing the cost
+/// itself can easily become a linear computation making everything again (at
+/// best) quadratic. Using a postorder over the operand graph makes it
+/// particularly easy to avoid this through dynamic programming. As we do the
+/// postorder walk, we build the transitive cost of that subgraph. It is also
+/// straightforward to then update these costs when we mark a PHI for
+/// speculation so that subsequent PHIs don't re-pay the cost of already
+/// speculated instructions.
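+///
+/// A rough sketch of the memoization (hypothetical IR, assuming both users
+/// are safe to speculate):
+///
+///   %x = add i32 %p1, 1
+///   %y = mul i32 %x, %p2
+///
+/// The postorder walk records the cost of %x when it is first visited. If the
+/// PHI %p1 is accepted for speculation at that point, the memoized cost of %x
+/// is zeroed, so when %y is later costed on behalf of %p2 it only pays for
+/// the `mul`, not for the `add` that will already be speculated.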
+static SmallVector<PHINode *, 16>
+findProfitablePHIs(ArrayRef<PHINode *> PNs,
+ const SmallDenseMap<PHINode *, int, 16> &CostSavingsMap,
+ const SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ int NumPreds, DominatorTree &DT, TargetTransformInfo &TTI) {
+ SmallVector<PHINode *, 16> SpecPNs;
+
+ // First, establish a reverse mapping from immediate users of the PHI nodes
+ // to the nodes themselves, and count how many users each PHI node has in
+ // a way we can update while processing them.
+ SmallDenseMap<Instruction *, TinyPtrVector<PHINode *>, 16> UserToPNMap;
+ SmallDenseMap<PHINode *, int, 16> PNUserCountMap;
+ SmallPtrSet<Instruction *, 16> UserSet;
+ for (auto *PN : PNs) {
+ assert(UserSet.empty() && "Must start with an empty user set!");
+ for (Use &U : PN->uses())
+ UserSet.insert(cast<Instruction>(U.getUser()));
+ PNUserCountMap[PN] = UserSet.size();
+ for (auto *UI : UserSet)
+ UserToPNMap.insert({UI, {}}).first->second.push_back(PN);
+ UserSet.clear();
+ }
+
+ // Now do a DFS across the operand graph of the users, computing cost as we
+ // go and when all costs for a given PHI are known, checking that PHI for
+ // profitability.
+ SmallDenseMap<Instruction *, int, 16> SpecCostMap;
+ visitPHIUsersAndDepsInPostOrder(
+ PNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // We consider anything that isn't potentially speculated to be
+ // "visited" as it is already handled. Similarly, anything that *is*
+ // potentially speculated but for which we have an entry in our cost
+ // map, we're done.
+ return !PotentialSpecSet.count(I) || SpecCostMap.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // We've fully visited the operands, so sum their cost with this node
+ // and update the cost map.
+ int Cost = TTI.TCC_Free;
+ for (Value *OpV : I->operand_values())
+ if (auto *OpI = dyn_cast<Instruction>(OpV)) {
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt != SpecCostMap.end())
+ Cost += CostMapIt->second;
+ }
+ Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+ bool Inserted = SpecCostMap.insert({I, Cost}).second;
+ (void)Inserted;
+ assert(Inserted && "Must not re-insert a cost during the DFS!");
+
+ // Now check if this node had a corresponding PHI node using it. If so,
+ // we need to decrement the outstanding user count for it.
+ auto UserPNsIt = UserToPNMap.find(I);
+ if (UserPNsIt == UserToPNMap.end())
+ return;
+ auto &UserPNs = UserPNsIt->second;
+ auto UserPNsSplitIt = std::stable_partition(
+ UserPNs.begin(), UserPNs.end(), [&](PHINode *UserPN) {
+ int &PNUserCount = PNUserCountMap.find(UserPN)->second;
+ assert(
+ PNUserCount > 0 &&
+ "Should never re-visit a PN after its user count hits zero!");
+ --PNUserCount;
+ return PNUserCount != 0;
+ });
+
+ // FIXME: Rather than one at a time, we should sum the savings as the
+ // cost will be completely shared.
+ SmallVector<Instruction *, 16> SpecWorklist;
+ for (auto *PN : llvm::make_range(UserPNsSplitIt, UserPNs.end())) {
+ int SpecCost = TTI.TCC_Free;
+ for (Use &U : PN->uses())
+ SpecCost +=
+ SpecCostMap.find(cast<Instruction>(U.getUser()))->second;
+ SpecCost *= (NumPreds - 1);
+ // When the user count of a PHI node hits zero, we should check its
+ // profitability. If profitable, we should mark it for speculation
+ // and zero out the cost of everything it depends on.
+ int CostSavings = CostSavingsMap.find(PN)->second;
+ if (SpecCost > CostSavings) {
+ LLVM_DEBUG(dbgs() << " Not profitable, speculation cost: " << *PN
+ << "\n"
+ " Cost savings: "
+ << CostSavings
+ << "\n"
+ " Speculation cost: "
+ << SpecCost << "\n");
+ continue;
+ }
+
+ // We're going to speculate this user-associated PHI. Copy it out and
+ // add its users to the worklist to update their cost.
+ SpecPNs.push_back(PN);
+ for (Use &U : PN->uses()) {
+ auto *UI = cast<Instruction>(U.getUser());
+ auto CostMapIt = SpecCostMap.find(UI);
+ if (CostMapIt->second == 0)
+ continue;
+ // Zero out this cost entry to avoid duplicates.
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(UI);
+ }
+ }
+
+ // Now walk all the operands of the users in the worklist transitively
+ // to zero out all the memoized costs.
+ while (!SpecWorklist.empty()) {
+ Instruction *SpecI = SpecWorklist.pop_back_val();
+ assert(SpecCostMap.find(SpecI)->second == 0 &&
+ "Didn't zero out a cost!");
+
+ // Walk the operands recursively to zero out their cost as well.
+ for (auto *OpV : SpecI->operand_values()) {
+ auto *OpI = dyn_cast<Instruction>(OpV);
+ if (!OpI)
+ continue;
+ auto CostMapIt = SpecCostMap.find(OpI);
+ if (CostMapIt == SpecCostMap.end() || CostMapIt->second == 0)
+ continue;
+ CostMapIt->second = 0;
+ SpecWorklist.push_back(OpI);
+ }
+ }
+ });
+
+ return SpecPNs;
+}
+
+/// Speculate users around a set of PHI nodes.
+///
+/// This routine does the actual speculation around a set of PHI nodes where we
+/// have determined this to be both safe and profitable.
+///
+/// This routine handles any splitting of critical edges necessary to create
+/// a safe block to speculate into as well as cloning the instructions and
+/// rewriting all uses.
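+///
+/// An illustrative before/after (hypothetical IR, single speculated user, no
+/// critical edges to split):
+///
+///   a:
+///     br label %merge
+///   b:
+///     br label %merge
+///   merge:
+///     %p = phi i32 [ 7, %a ], [ 42, %b ]
+///     %sum = add i32 %x, %p
+///
+/// becomes
+///
+///   a:
+///     %sum.0 = add i32 %x, 7
+///     br label %merge
+///   b:
+///     %sum.1 = add i32 %x, 42
+///     br label %merge
+///   merge:
+///     %sum.phi = phi i32 [ %sum.0, %a ], [ %sum.1, %b ]
+///
+/// Any remaining users of %sum are rewritten to use %sum.phi, after which
+/// %sum and %p are erased.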
+static void speculatePHIs(ArrayRef<PHINode *> SpecPNs,
+ SmallPtrSetImpl<Instruction *> &PotentialSpecSet,
+ SmallSetVector<BasicBlock *, 16> &PredSet,
+ DominatorTree &DT) {
+ LLVM_DEBUG(dbgs() << " Speculating around " << SpecPNs.size() << " PHIs!\n");
+ NumPHIsSpeculated += SpecPNs.size();
+
+ // Split any critical edges so that we have a block to hoist into.
+ auto *ParentBB = SpecPNs[0]->getParent();
+ SmallVector<BasicBlock *, 16> SpecPreds;
+ SpecPreds.reserve(PredSet.size());
+ for (auto *PredBB : PredSet) {
+ auto *NewPredBB = SplitCriticalEdge(
+ PredBB, ParentBB,
+ CriticalEdgeSplittingOptions(&DT).setMergeIdenticalEdges());
+ if (NewPredBB) {
+ ++NumEdgesSplit;
+ LLVM_DEBUG(dbgs() << " Split critical edge from: " << PredBB->getName()
+ << "\n");
+ SpecPreds.push_back(NewPredBB);
+ } else {
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+ assert(!isa<InvokeInst>(PredBB->getTerminator()) &&
+ "Cannot have a non-critical invoke!");
+
+ // Already non-critical, use existing pred.
+ SpecPreds.push_back(PredBB);
+ }
+ }
+
+ SmallPtrSet<Instruction *, 16> SpecSet;
+ SmallVector<Instruction *, 16> SpecList;
+ visitPHIUsersAndDepsInPostOrder(SpecPNs,
+ /*IsVisited*/
+ [&](Instruction *I) {
+ // This is visited if we don't need to
+ // speculate it or we already have
+ // speculated it.
+ return !PotentialSpecSet.count(I) ||
+ SpecSet.count(I);
+ },
+ /*Visit*/
+ [&](Instruction *I) {
+ // All operands scheduled, schedule this
+ // node.
+ SpecSet.insert(I);
+ SpecList.push_back(I);
+ });
+
+ int NumSpecInsts = SpecList.size() * SpecPreds.size();
+ int NumRedundantInsts = NumSpecInsts - SpecList.size();
+ LLVM_DEBUG(dbgs() << " Inserting " << NumSpecInsts
+ << " speculated instructions, " << NumRedundantInsts
+ << " redundancies\n");
+ NumSpeculatedInstructions += NumSpecInsts;
+ NumNewRedundantInstructions += NumRedundantInsts;
+
+ // Each predecessor is numbered by its index in `SpecPreds`, so for each
+ // instruction we speculate, the speculated instruction is stored in that
+ // index of the vector associated with the original instruction. We also
+ // store the incoming values for each predecessor from any PHIs used.
+ SmallDenseMap<Instruction *, SmallVector<Value *, 2>, 16> SpeculatedValueMap;
+
+ // Inject the synthetic mappings to rewrite PHIs to the appropriate incoming
+ // value. This handles both the PHIs we are speculating around and any other
+ // PHIs that happen to be used.
+ for (auto *OrigI : SpecList)
+ for (auto *OpV : OrigI->operand_values()) {
+ auto *OpPN = dyn_cast<PHINode>(OpV);
+ if (!OpPN || OpPN->getParent() != ParentBB)
+ continue;
+
+ auto InsertResult = SpeculatedValueMap.insert({OpPN, {}});
+ if (!InsertResult.second)
+ continue;
+
+ auto &SpeculatedVals = InsertResult.first->second;
+
+ // Populating our structure for mapping is particularly annoying because
+ // finding an incoming value for a particular predecessor block in a PHI
+ // node is a linear time operation! To avoid quadratic behavior, we build
+ // a map for this PHI node's incoming values and then translate it into
+ // the more compact representation used below.
+ SmallDenseMap<BasicBlock *, Value *, 16> IncomingValueMap;
+ for (int i : llvm::seq<int>(0, OpPN->getNumIncomingValues()))
+ IncomingValueMap[OpPN->getIncomingBlock(i)] = OpPN->getIncomingValue(i);
+
+ for (auto *PredBB : SpecPreds)
+ SpeculatedVals.push_back(IncomingValueMap.find(PredBB)->second);
+ }
+
+ // Speculate into each predecessor.
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size())) {
+ auto *PredBB = SpecPreds[PredIdx];
+ assert(PredBB->getSingleSuccessor() == ParentBB &&
+ "We need a non-critical predecessor to speculate into.");
+
+ for (auto *OrigI : SpecList) {
+ auto *NewI = OrigI->clone();
+ NewI->setName(Twine(OrigI->getName()) + "." + Twine(PredIdx));
+ NewI->insertBefore(PredBB->getTerminator());
+
+ // Rewrite all the operands to the previously speculated instructions.
+ // Because we're walking in-order, the defs must precede the uses and we
+ // should already have these mappings.
+ for (Use &U : NewI->operands()) {
+ auto *OpI = dyn_cast<Instruction>(U.get());
+ if (!OpI)
+ continue;
+ auto MapIt = SpeculatedValueMap.find(OpI);
+ if (MapIt == SpeculatedValueMap.end())
+ continue;
+ const auto &SpeculatedVals = MapIt->second;
+ assert(SpeculatedVals[PredIdx] &&
+ "Must have a speculated value for this predecessor!");
+ assert(SpeculatedVals[PredIdx]->getType() == OpI->getType() &&
+ "Speculated value has the wrong type!");
+
+ // Rewrite the use to this predecessor's speculated instruction.
+ U.set(SpeculatedVals[PredIdx]);
+ }
+
+ // Commute instructions which now have a constant in the LHS but not the
+ // RHS.
+ if (NewI->isBinaryOp() && NewI->isCommutative() &&
+ isa<Constant>(NewI->getOperand(0)) &&
+ !isa<Constant>(NewI->getOperand(1)))
+ NewI->getOperandUse(0).swap(NewI->getOperandUse(1));
+
+ SpeculatedValueMap[OrigI].push_back(NewI);
+ assert(SpeculatedValueMap[OrigI][PredIdx] == NewI &&
+ "Mismatched speculated instruction index!");
+ }
+ }
+
+ // Walk the speculated instruction list and if they have uses, insert a PHI
+ // for them from the speculated versions, and replace the uses with the PHI.
+ // Then erase the instructions as they have been fully speculated. The walk
+ // needs to be in reverse so that uses coming from instructions that we will
+ // erase later in this walk are already gone when we check for users.
+ IRBuilder<> IRB(SpecPNs[0]);
+ for (auto *OrigI : llvm::reverse(SpecList)) {
+ // Check if we need a PHI for any remaining users and if so, insert it.
+ if (!OrigI->use_empty()) {
+ auto *SpecIPN = IRB.CreatePHI(OrigI->getType(), SpecPreds.size(),
+ Twine(OrigI->getName()) + ".phi");
+ // Add the incoming values we speculated.
+ auto &SpeculatedVals = SpeculatedValueMap.find(OrigI)->second;
+ for (int PredIdx : llvm::seq<int>(0, SpecPreds.size()))
+ SpecIPN->addIncoming(SpeculatedVals[PredIdx], SpecPreds[PredIdx]);
+
+ // And replace the uses with the PHI node.
+ OrigI->replaceAllUsesWith(SpecIPN);
+ }
+
+ // It is important to immediately erase this so that it stops using other
+ // instructions. This avoids inserting needless PHIs of them.
+ OrigI->eraseFromParent();
+ }
+
+ // All of the uses of the speculated phi nodes should be removed at this
+ // point, so erase them.
+ for (auto *SpecPN : SpecPNs) {
+ assert(SpecPN->use_empty() && "All users should have been speculated!");
+ SpecPN->eraseFromParent();
+ }
+}
+
+/// Try to speculate around a series of PHIs from a single basic block.
+///
+/// This routine checks whether any of these PHIs are profitable to speculate
+/// users around. If safe and profitable, it does the speculation. It returns
+/// true when at least some speculation occurs.
+static bool tryToSpeculatePHIs(SmallVectorImpl<PHINode *> &PNs,
+ DominatorTree &DT, TargetTransformInfo &TTI) {
+ LLVM_DEBUG(dbgs() << "Evaluating phi nodes for speculation:\n");
+
+ // Savings in cost from speculating around a PHI node.
+ SmallDenseMap<PHINode *, int, 16> CostSavingsMap;
+
+ // Remember the set of instructions that are candidates for speculation so
+ // that we can quickly walk things within that space. This prunes out
+ // instructions already available along edges, etc.
+ SmallPtrSet<Instruction *, 16> PotentialSpecSet;
+
+ // Remember the set of instructions that are (transitively) unsafe to
+ // speculate into the incoming edges of this basic block. This avoids
+ // recomputing them for each PHI node we check. This set is specific to this
+ // block though as things are pruned out of it based on what is available
+ // along incoming edges.
+ SmallPtrSet<Instruction *, 16> UnsafeSet;
+
+ // For each PHI node in this block, check whether there are immediate folding
+ // opportunities from speculation, and whether that speculation will be
+ // valid. This determines the set of safe PHIs to speculate.
llvm::erase_if(PNs, [&](PHINode *PN) {
return !isSafeAndProfitableToSpeculateAroundPHI(
*PN, CostSavingsMap, PotentialSpecSet, UnsafeSet, DT, TTI);
});
- // If no PHIs were profitable, skip.
- if (PNs.empty()) {
- LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
- return false;
- }
-
- // We need to know how much speculation will cost which is determined by how
- // many incoming edges will need a copy of each speculated instruction.
- SmallSetVector<BasicBlock *, 16> PredSet;
- for (auto *PredBB : PNs[0]->blocks()) {
- if (!PredSet.insert(PredBB))
- continue;
-
- // We cannot speculate when a predecessor is an indirect branch.
- // FIXME: We also can't reliably create a non-critical edge block for
- // speculation if the predecessor is an invoke. This doesn't seem
- // fundamental and we should probably be splitting critical edges
- // differently.
- const auto *TermInst = PredBB->getTerminator();
- if (isa<IndirectBrInst>(TermInst) ||
- isa<InvokeInst>(TermInst) ||
- isa<CallBrInst>(TermInst)) {
- LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
- << PredBB->getName() << "\n");
- return false;
- }
- }
- if (PredSet.size() < 2) {
- LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
- return false;
- }
-
- SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
- PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
- if (SpecPNs.empty())
- // Nothing to do.
- return false;
-
- speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
- return true;
-}
-
-PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- bool Changed = false;
- for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
- SmallVector<PHINode *, 16> PNs;
- auto BBI = BB->begin();
- while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
- PNs.push_back(PN);
- ++BBI;
- }
-
- if (PNs.empty())
- continue;
-
- Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
- }
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- return PA;
-}
+ // If no PHIs were profitable, skip.
+ if (PNs.empty()) {
+ LLVM_DEBUG(dbgs() << " No safe and profitable PHIs found!\n");
+ return false;
+ }
+
+ // We need to know how much speculation will cost which is determined by how
+ // many incoming edges will need a copy of each speculated instruction.
+ SmallSetVector<BasicBlock *, 16> PredSet;
+ for (auto *PredBB : PNs[0]->blocks()) {
+ if (!PredSet.insert(PredBB))
+ continue;
+
+ // We cannot speculate when a predecessor is an indirect branch.
+ // FIXME: We also can't reliably create a non-critical edge block for
+ // speculation if the predecessor is an invoke. This doesn't seem
+ // fundamental and we should probably be splitting critical edges
+ // differently.
+ const auto *TermInst = PredBB->getTerminator();
+ if (isa<IndirectBrInst>(TermInst) ||
+ isa<InvokeInst>(TermInst) ||
+ isa<CallBrInst>(TermInst)) {
+ LLVM_DEBUG(dbgs() << " Invalid: predecessor terminator: "
+ << PredBB->getName() << "\n");
+ return false;
+ }
+ }
+ if (PredSet.size() < 2) {
+ LLVM_DEBUG(dbgs() << " Unimportant: phi with only one predecessor\n");
+ return false;
+ }
+
+ SmallVector<PHINode *, 16> SpecPNs = findProfitablePHIs(
+ PNs, CostSavingsMap, PotentialSpecSet, PredSet.size(), DT, TTI);
+ if (SpecPNs.empty())
+ // Nothing to do.
+ return false;
+
+ speculatePHIs(SpecPNs, PotentialSpecSet, PredSet, DT);
+ return true;
+}
+
+PreservedAnalyses SpeculateAroundPHIsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = false;
+ for (auto *BB : ReversePostOrderTraversal<Function *>(&F)) {
+ SmallVector<PHINode *, 16> PNs;
+ auto BBI = BB->begin();
+ while (auto *PN = dyn_cast<PHINode>(&*BBI)) {
+ PNs.push_back(PN);
+ ++BBI;
+ }
+
+ if (PNs.empty())
+ continue;
+
+ Changed |= tryToSpeculatePHIs(PNs, DT, TTI);
+ }
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ return PA;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
index a1fc58d8f3..c78185f2a6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -1,250 +1,250 @@
-//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass hoists instructions to enable speculative execution on
-// targets where branches are expensive. This is aimed at GPUs. It
-// currently works on simple if-then and if-then-else
-// patterns.
-//
-// Removing branches is not the only motivation for this
-// pass. E.g. consider this code and assume that there is no
-// addressing mode for multiplying by sizeof(*a):
-//
-// if (b > 0)
-// c = a[i + 1]
-// if (d > 0)
-// e = a[i + 2]
-//
-// turns into
-//
-// p = &a[i + 1];
-// if (b > 0)
-// c = *p;
-// q = &a[i + 2];
-// if (d > 0)
-// e = *q;
-//
-// which could later be optimized to
-//
-// r = &a[i];
-// if (b > 0)
-// c = r[1];
-// if (d > 0)
-// e = r[2];
-//
-// Later passes sink back much of the speculated code that did not enable
-// further optimization.
-//
-// This pass is more aggressive than the function SpeculativeyExecuteBB in
-// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
-// it will speculate at most one instruction. It also will not speculate if
-// there is a value defined in the if-block that is only used in the then-block.
-// These restrictions make sense since the speculation in SimplifyCFG seems
-// aimed at introducing cheap selects, while this pass is intended to do more
-// aggressive speculation while counting on later passes to either capitalize on
-// that or clean it up.
-//
-// If the pass was created by calling
-// createSpeculativeExecutionIfHasBranchDivergencePass or the
-// -spec-exec-only-if-divergent-target option is present, this pass only has an
-// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
-// on other targets, it is a nop.
-//
-// This lets you include this pass unconditionally in the IR pass pipeline, but
-// only enable it for relevant targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "speculative-execution"
-
-// The risk that speculation will not pay off increases with the
-// number of instructions speculated, so we put a limit on that.
-static cl::opt<unsigned> SpecExecMaxSpeculationCost(
- "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
- cl::desc("Speculative execution is not applied to basic blocks where "
- "the cost of the instructions to speculatively execute "
- "exceeds this limit."));
-
-// Speculating just a few instructions from a larger block tends not
-// to be profitable and this limit prevents that. A reason for that is
-// that small basic blocks are more likely to be candidates for
-// further optimization.
-static cl::opt<unsigned> SpecExecMaxNotHoisted(
- "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
- cl::desc("Speculative execution is not applied to basic blocks where the "
- "number of instructions that would not be speculatively executed "
- "exceeds this limit."));
-
-static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
- "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
- cl::desc("Speculative execution is applied only to targets with divergent "
- "branches, even if the pass was configured to apply only to all "
- "targets."));
-
-namespace {
-
-class SpeculativeExecutionLegacyPass : public FunctionPass {
-public:
- static char ID;
- explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
- : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
- SpecExecOnlyIfDivergentTarget),
- Impl(OnlyIfDivergentTarget) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override {
- if (OnlyIfDivergentTarget)
- return "Speculatively execute instructions if target has divergent "
- "branches";
- return "Speculatively execute instructions";
- }
-
-private:
- // Variable preserved purely for correct name printing.
- const bool OnlyIfDivergentTarget;
-
- SpeculativeExecutionPass Impl;
-};
-} // namespace
-
-char SpeculativeExecutionLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
- "Speculatively execute instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
- "Speculatively execute instructions", false, false)
-
-void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
-}
-
-bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- return Impl.runImpl(F, TTI);
-}
-
-namespace llvm {
-
-bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
- if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
- LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
- "TTI->hasBranchDivergence() is false.\n");
- return false;
- }
-
- this->TTI = TTI;
- bool Changed = false;
- for (auto& B : F) {
- Changed |= runOnBasicBlock(B);
- }
- return Changed;
-}
-
-bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
- BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
- if (BI == nullptr)
- return false;
-
- if (BI->getNumSuccessors() != 2)
- return false;
- BasicBlock &Succ0 = *BI->getSuccessor(0);
- BasicBlock &Succ1 = *BI->getSuccessor(1);
-
- if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
- return false;
- }
-
- // Hoist from if-then (triangle).
- if (Succ0.getSinglePredecessor() != nullptr &&
- Succ0.getSingleSuccessor() == &Succ1) {
- return considerHoistingFromTo(Succ0, B);
- }
-
- // Hoist from if-else (triangle).
- if (Succ1.getSinglePredecessor() != nullptr &&
- Succ1.getSingleSuccessor() == &Succ0) {
- return considerHoistingFromTo(Succ1, B);
- }
-
- // Hoist from if-then-else (diamond), but only if it is equivalent to
- // an if-else or if-then due to one of the branches doing nothing.
- if (Succ0.getSinglePredecessor() != nullptr &&
- Succ1.getSinglePredecessor() != nullptr &&
- Succ1.getSingleSuccessor() != nullptr &&
- Succ1.getSingleSuccessor() != &B &&
- Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
- // If a block has only one instruction, then that is a terminator
- // instruction so that the block does nothing. This does happen.
- if (Succ1.size() == 1) // equivalent to if-then
- return considerHoistingFromTo(Succ0, B);
- if (Succ0.size() == 1) // equivalent to if-else
- return considerHoistingFromTo(Succ1, B);
- }
-
- return false;
-}
-
-static unsigned ComputeSpeculationCost(const Instruction *I,
- const TargetTransformInfo &TTI) {
- switch (Operator::getOpcode(I)) {
- case Instruction::GetElementPtr:
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Select:
- case Instruction::Shl:
- case Instruction::Sub:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::Xor:
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::Call:
- case Instruction::BitCast:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::AddrSpaceCast:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::UIToFP:
- case Instruction::SIToFP:
- case Instruction::FPExt:
- case Instruction::FPTrunc:
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::FNeg:
- case Instruction::ICmp:
- case Instruction::FCmp:
+//===- SpeculativeExecution.cpp ---------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass hoists instructions to enable speculative execution on
+// targets where branches are expensive. This is aimed at GPUs. It
+// currently works on simple if-then and if-then-else
+// patterns.
+//
+// Removing branches is not the only motivation for this
+// pass. E.g. consider this code and assume that there is no
+// addressing mode for multiplying by sizeof(*a):
+//
+// if (b > 0)
+// c = a[i + 1]
+// if (d > 0)
+// e = a[i + 2]
+//
+// turns into
+//
+// p = &a[i + 1];
+// if (b > 0)
+// c = *p;
+// q = &a[i + 2];
+// if (d > 0)
+// e = *q;
+//
+// which could later be optimized to
+//
+// r = &a[i];
+// if (b > 0)
+// c = r[1];
+// if (d > 0)
+// e = r[2];
+//
+// Later passes sink back much of the speculated code that did not enable
+// further optimization.
+//
+// This pass is more aggressive than the function SpeculativelyExecuteBB in
+// SimplifyCFG. SimplifyCFG will not speculate if no selects are introduced and
+// it will speculate at most one instruction. It also will not speculate if
+// there is a value defined in the if-block that is only used in the then-block.
+// These restrictions make sense since the speculation in SimplifyCFG seems
+// aimed at introducing cheap selects, while this pass is intended to do more
+// aggressive speculation while counting on later passes to either capitalize on
+// that or clean it up.
+//
+// If the pass was created by calling
+// createSpeculativeExecutionIfHasBranchDivergencePass or the
+// -spec-exec-only-if-divergent-target option is present, this pass only has an
+// effect on targets where TargetTransformInfo::hasBranchDivergence() is true;
+// on other targets, it is a nop.
+//
+// This lets you include this pass unconditionally in the IR pass pipeline, but
+// only enable it for relevant targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/SpeculativeExecution.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "speculative-execution"
+
+// The risk that speculation will not pay off increases with the
+// number of instructions speculated, so we put a limit on that.
+static cl::opt<unsigned> SpecExecMaxSpeculationCost(
+ "spec-exec-max-speculation-cost", cl::init(7), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where "
+ "the cost of the instructions to speculatively execute "
+ "exceeds this limit."));
+
+// Speculating just a few instructions from a larger block tends not
+// to be profitable and this limit prevents that. A reason for that is
+// that small basic blocks are more likely to be candidates for
+// further optimization.
+static cl::opt<unsigned> SpecExecMaxNotHoisted(
+ "spec-exec-max-not-hoisted", cl::init(5), cl::Hidden,
+ cl::desc("Speculative execution is not applied to basic blocks where the "
+ "number of instructions that would not be speculatively executed "
+ "exceeds this limit."));
+
+static cl::opt<bool> SpecExecOnlyIfDivergentTarget(
+ "spec-exec-only-if-divergent-target", cl::init(false), cl::Hidden,
+ cl::desc("Speculative execution is applied only to targets with divergent "
+ "branches, even if the pass was configured to apply to all "
+ "targets."));
+
+namespace {
+
+class SpeculativeExecutionLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ explicit SpeculativeExecutionLegacyPass(bool OnlyIfDivergentTarget = false)
+ : FunctionPass(ID), OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget),
+ Impl(OnlyIfDivergentTarget) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ if (OnlyIfDivergentTarget)
+ return "Speculatively execute instructions if target has divergent "
+ "branches";
+ return "Speculatively execute instructions";
+ }
+
+private:
+ // Variable preserved purely for correct name printing.
+ const bool OnlyIfDivergentTarget;
+
+ SpeculativeExecutionPass Impl;
+};
+} // namespace
+
+char SpeculativeExecutionLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(SpeculativeExecutionLegacyPass, "speculative-execution",
+ "Speculatively execute instructions", false, false)
+
+void SpeculativeExecutionLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+bool SpeculativeExecutionLegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ return Impl.runImpl(F, TTI);
+}
+
+namespace llvm {
+
+bool SpeculativeExecutionPass::runImpl(Function &F, TargetTransformInfo *TTI) {
+ if (OnlyIfDivergentTarget && !TTI->hasBranchDivergence()) {
+ LLVM_DEBUG(dbgs() << "Not running SpeculativeExecution because "
+ "TTI->hasBranchDivergence() is false.\n");
+ return false;
+ }
+
+ this->TTI = TTI;
+ bool Changed = false;
+ for (auto& B : F) {
+ Changed |= runOnBasicBlock(B);
+ }
+ return Changed;
+}
+
+bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) {
+ BranchInst *BI = dyn_cast<BranchInst>(B.getTerminator());
+ if (BI == nullptr)
+ return false;
+
+ if (BI->getNumSuccessors() != 2)
+ return false;
+ BasicBlock &Succ0 = *BI->getSuccessor(0);
+ BasicBlock &Succ1 = *BI->getSuccessor(1);
+
+ if (&B == &Succ0 || &B == &Succ1 || &Succ0 == &Succ1) {
+ return false;
+ }
+
+ // Hoist from if-then (triangle).
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ0.getSingleSuccessor() == &Succ1) {
+ return considerHoistingFromTo(Succ0, B);
+ }
+
+ // Hoist from if-else (triangle).
+ if (Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() == &Succ0) {
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ // Hoist from if-then-else (diamond), but only if it is equivalent to
+ // an if-else or if-then due to one of the branches doing nothing.
+ if (Succ0.getSinglePredecessor() != nullptr &&
+ Succ1.getSinglePredecessor() != nullptr &&
+ Succ1.getSingleSuccessor() != nullptr &&
+ Succ1.getSingleSuccessor() != &B &&
+ Succ1.getSingleSuccessor() == Succ0.getSingleSuccessor()) {
+ // If a block has only one instruction, then that is a terminator
+ // instruction, so the block does nothing. This does happen.
+ if (Succ1.size() == 1) // equivalent to if-then
+ return considerHoistingFromTo(Succ0, B);
+ if (Succ0.size() == 1) // equivalent to if-else
+ return considerHoistingFromTo(Succ1, B);
+ }
+
+ return false;
+}
+
+static unsigned ComputeSpeculationCost(const Instruction *I,
+ const TargetTransformInfo &TTI) {
+ switch (Operator::getOpcode(I)) {
+ case Instruction::GetElementPtr:
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Select:
+ case Instruction::Shl:
+ case Instruction::Sub:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::Call:
+ case Instruction::BitCast:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::AddrSpaceCast:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::UIToFP:
+ case Instruction::SIToFP:
+ case Instruction::FPExt:
+ case Instruction::FPTrunc:
+ case Instruction::FAdd:
+ case Instruction::FSub:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::FNeg:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
case Instruction::Trunc:
case Instruction::Freeze:
case Instruction::ExtractElement:
@@ -252,96 +252,96 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
case Instruction::ShuffleVector:
case Instruction::ExtractValue:
case Instruction::InsertValue:
- return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
-
- default:
- return UINT_MAX; // Disallow anything not explicitly listed.
- }
-}
-
-bool SpeculativeExecutionPass::considerHoistingFromTo(
- BasicBlock &FromBlock, BasicBlock &ToBlock) {
- SmallPtrSet<const Instruction *, 8> NotHoisted;
- const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) {
- // Debug variable has special operand to check it's not hoisted.
- if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
- if (const auto *I =
- dyn_cast_or_null<Instruction>(DVI->getVariableLocation()))
- if (NotHoisted.count(I) == 0)
- return true;
- return false;
- }
-
- // Usially debug label instrinsic corresponds to label in LLVM IR. In these
- // cases we should not move it here.
- // TODO: Possible special processing needed to detect it is related to a
- // hoisted instruction.
- if (isa<DbgLabelInst>(U))
- return false;
-
- for (const Value *V : U->operand_values()) {
- if (const Instruction *I = dyn_cast<Instruction>(V)) {
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+
+ default:
+ return UINT_MAX; // Disallow anything not explicitly listed.
+ }
+}
+
+bool SpeculativeExecutionPass::considerHoistingFromTo(
+ BasicBlock &FromBlock, BasicBlock &ToBlock) {
+ SmallPtrSet<const Instruction *, 8> NotHoisted;
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) {
+ // Debug variable intrinsics have a special location operand; only hoist
+ // them when that operand is an instruction that is not in NotHoisted.
+ if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
+ if (const auto *I =
+ dyn_cast_or_null<Instruction>(DVI->getVariableLocation()))
+ if (NotHoisted.count(I) == 0)
+ return true;
+ return false;
+ }
+
+ // Usually a debug label intrinsic corresponds to a label in LLVM IR. In
+ // these cases we should not move it here.
+ // TODO: Possible special processing needed to detect it is related to a
+ // hoisted instruction.
+ if (isa<DbgLabelInst>(U))
+ return false;
+
+ for (const Value *V : U->operand_values()) {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
if (NotHoisted.contains(I))
- return false;
- }
- }
- return true;
- };
-
- unsigned TotalSpeculationCost = 0;
- unsigned NotHoistedInstCount = 0;
- for (const auto &I : FromBlock) {
- const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
- if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
- AllPrecedingUsesFromBlockHoisted(&I)) {
- TotalSpeculationCost += Cost;
- if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
- return false; // too much to hoist
- } else {
- // Debug info instrinsics should not be counted for threshold.
- if (!isa<DbgInfoIntrinsic>(I))
- NotHoistedInstCount++;
- if (NotHoistedInstCount > SpecExecMaxNotHoisted)
- return false; // too much left behind
- NotHoisted.insert(&I);
- }
- }
-
- for (auto I = FromBlock.begin(); I != FromBlock.end();) {
- // We have to increment I before moving Current as moving Current
- // changes the list that I is iterating through.
- auto Current = I;
- ++I;
- if (!NotHoisted.count(&*Current)) {
- Current->moveBefore(ToBlock.getTerminator());
- }
- }
- return true;
-}
-
-FunctionPass *createSpeculativeExecutionPass() {
- return new SpeculativeExecutionLegacyPass();
-}
-
-FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
- return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
-}
-
-SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
- : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
- SpecExecOnlyIfDivergentTarget) {}
-
-PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
-
- bool Changed = runImpl(F, TTI);
-
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-} // namespace llvm
+ return false;
+ }
+ }
+ return true;
+ };
+
+ unsigned TotalSpeculationCost = 0;
+ unsigned NotHoistedInstCount = 0;
+ for (const auto &I : FromBlock) {
+ const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
+ if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
+ AllPrecedingUsesFromBlockHoisted(&I)) {
+ TotalSpeculationCost += Cost;
+ if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
+ return false; // too much to hoist
+ } else {
+ // Debug info intrinsics should not be counted towards the threshold.
+ if (!isa<DbgInfoIntrinsic>(I))
+ NotHoistedInstCount++;
+ if (NotHoistedInstCount > SpecExecMaxNotHoisted)
+ return false; // too much left behind
+ NotHoisted.insert(&I);
+ }
+ }
+
+ for (auto I = FromBlock.begin(); I != FromBlock.end();) {
+ // We have to increment I before moving Current as moving Current
+ // changes the list that I is iterating through.
+ auto Current = I;
+ ++I;
+ if (!NotHoisted.count(&*Current)) {
+ Current->moveBefore(ToBlock.getTerminator());
+ }
+ }
+ return true;
+}
+
+FunctionPass *createSpeculativeExecutionPass() {
+ return new SpeculativeExecutionLegacyPass();
+}
+
+FunctionPass *createSpeculativeExecutionIfHasBranchDivergencePass() {
+ return new SpeculativeExecutionLegacyPass(/* OnlyIfDivergentTarget = */ true);
+}
+
+SpeculativeExecutionPass::SpeculativeExecutionPass(bool OnlyIfDivergentTarget)
+ : OnlyIfDivergentTarget(OnlyIfDivergentTarget ||
+ SpecExecOnlyIfDivergentTarget) {}
+
+PreservedAnalyses SpeculativeExecutionPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+
+ bool Changed = runImpl(F, TTI);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
index 9aa2663941..577992ccb5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp
@@ -1,105 +1,105 @@
-//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements straight-line strength reduction (SLSR). Unlike loop
-// strength reduction, this algorithm is designed to reduce arithmetic
-// redundancy in straight-line code instead of loops. It has proven to be
-// effective in simplifying arithmetic statements derived from an unrolled loop.
-// It can also simplify the logic of SeparateConstOffsetFromGEP.
-//
-// There are many optimizations we can perform in the domain of SLSR. This file
-// for now contains only an initial step. Specifically, we look for strength
-// reduction candidates in the following forms:
-//
-// Form 1: B + i * S
-// Form 2: (B + i) * S
-// Form 3: &B[i * S]
-//
-// where S is an integer variable, and i is a constant integer. If we found two
-// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
-// in a simpler way with respect to S1. For example,
-//
-// S1: X = B + i * S
-// S2: Y = B + i' * S => X + (i' - i) * S
-//
-// S1: X = (B + i) * S
-// S2: Y = (B + i') * S => X + (i' - i) * S
-//
-// S1: X = &B[i * S]
-// S2: Y = &B[i' * S] => &X[(i' - i) * S]
-//
-// Note: (i' - i) * S is folded to the extent possible.
-//
-// This rewriting is in general a good idea. The code patterns we focus on
-// usually come from loop unrolling, so (i' - i) * S is likely the same
-// across iterations and can be reused. When that happens, the optimized form
-// takes only one add starting from the second iteration.
-//
-// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
-// multiple bases, we choose to rewrite S2 with respect to its "immediate"
-// basis, the basis that is the closest ancestor in the dominator tree.
-//
-// TODO:
-//
-// - Floating point arithmetics when fast math is enabled.
-//
-// - SLSR may decrease ILP at the architecture level. Targets that are very
-// sensitive to ILP may want to disable it. Having SLSR to consider ILP is
-// left as future work.
-//
-// - When (i' - i) is constant but i and i' are not, we could still perform
-// SLSR.
-
+//===- StraightLineStrengthReduce.cpp - -----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements straight-line strength reduction (SLSR). Unlike loop
+// strength reduction, this algorithm is designed to reduce arithmetic
+// redundancy in straight-line code instead of loops. It has proven to be
+// effective in simplifying arithmetic statements derived from an unrolled loop.
+// It can also simplify the logic of SeparateConstOffsetFromGEP.
+//
+// There are many optimizations we can perform in the domain of SLSR. This file
+// for now contains only an initial step. Specifically, we look for strength
+// reduction candidates in the following forms:
+//
+// Form 1: B + i * S
+// Form 2: (B + i) * S
+// Form 3: &B[i * S]
+//
+// where S is an integer variable, and i is a constant integer. If we find two
+// candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2
+// in a simpler way with respect to S1. For example,
+//
+// S1: X = B + i * S
+// S2: Y = B + i' * S => X + (i' - i) * S
+//
+// S1: X = (B + i) * S
+// S2: Y = (B + i') * S => X + (i' - i) * S
+//
+// S1: X = &B[i * S]
+// S2: Y = &B[i' * S] => &X[(i' - i) * S]
+//
+// Note: (i' - i) * S is folded to the extent possible.
+//
+// This rewriting is in general a good idea. The code patterns we focus on
+// usually come from loop unrolling, so (i' - i) * S is likely the same
+// across iterations and can be reused. When that happens, the optimized form
+// takes only one add starting from the second iteration.
+//
+// When such rewriting is possible, we call S1 a "basis" of S2. When S2 has
+// multiple bases, we choose to rewrite S2 with respect to its "immediate"
+// basis, the basis that is the closest ancestor in the dominator tree.
+//
+// TODO:
+//
+// - Floating point arithmetic when fast math is enabled.
+//
+// - SLSR may decrease ILP at the architecture level. Targets that are very
+// sensitive to ILP may want to disable it. Having SLSR consider ILP is
+// left as future work.
+//
+// - When (i' - i) is constant but i and i' are not, we could still perform
+// SLSR.
+
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <list>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-static const unsigned UnknownAddressSpace =
- std::numeric_limits<unsigned>::max();
-
-namespace {
-
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <list>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static const unsigned UnknownAddressSpace =
+ std::numeric_limits<unsigned>::max();
+
+namespace {
+
class StraightLineStrengthReduceLegacyPass : public FunctionPass {
const DataLayout *DL = nullptr;
-public:
+public:
static char ID;
StraightLineStrengthReduceLegacyPass() : FunctionPass(ID) {
@@ -129,596 +129,596 @@ public:
ScalarEvolution *SE, TargetTransformInfo *TTI)
: DL(DL), DT(DT), SE(SE), TTI(TTI) {}
- // SLSR candidate. Such a candidate must be in one of the forms described in
- // the header comments.
- struct Candidate {
- enum Kind {
- Invalid, // reserved for the default constructor
- Add, // B + i * S
- Mul, // (B + i) * S
- GEP, // &B[..][i * S][..]
- };
-
- Candidate() = default;
- Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
- Instruction *I)
- : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
-
- Kind CandidateKind = Invalid;
-
- const SCEV *Base = nullptr;
-
- // Note that Index and Stride of a GEP candidate do not necessarily have the
- // same integer type. In that case, during rewriting, Stride will be
- // sign-extended or truncated to Index's type.
- ConstantInt *Index = nullptr;
-
- Value *Stride = nullptr;
-
- // The instruction this candidate corresponds to. It helps us to rewrite a
- // candidate with respect to its immediate basis. Note that one instruction
- // can correspond to multiple candidates depending on how you associate the
- // expression. For instance,
- //
- // (a + 1) * (b + 2)
- //
- // can be treated as
- //
- // <Base: a, Index: 1, Stride: b + 2>
- //
- // or
- //
- // <Base: b, Index: 2, Stride: a + 1>
- Instruction *Ins = nullptr;
-
- // Points to the immediate basis of this candidate, or nullptr if we cannot
- // find any basis for this candidate.
- Candidate *Basis = nullptr;
- };
-
+ // SLSR candidate. Such a candidate must be in one of the forms described in
+ // the header comments.
+ struct Candidate {
+ enum Kind {
+ Invalid, // reserved for the default constructor
+ Add, // B + i * S
+ Mul, // (B + i) * S
+ GEP, // &B[..][i * S][..]
+ };
+
+ Candidate() = default;
+ Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I)
+ : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {}
+
+ Kind CandidateKind = Invalid;
+
+ const SCEV *Base = nullptr;
+
+ // Note that Index and Stride of a GEP candidate do not necessarily have the
+ // same integer type. In that case, during rewriting, Stride will be
+ // sign-extended or truncated to Index's type.
+ ConstantInt *Index = nullptr;
+
+ Value *Stride = nullptr;
+
+ // The instruction this candidate corresponds to. It helps us to rewrite a
+ // candidate with respect to its immediate basis. Note that one instruction
+ // can correspond to multiple candidates depending on how you associate the
+ // expression. For instance,
+ //
+ // (a + 1) * (b + 2)
+ //
+ // can be treated as
+ //
+ // <Base: a, Index: 1, Stride: b + 2>
+ //
+ // or
+ //
+ // <Base: b, Index: 2, Stride: a + 1>
+ Instruction *Ins = nullptr;
+
+ // Points to the immediate basis of this candidate, or nullptr if we cannot
+ // find any basis for this candidate.
+ Candidate *Basis = nullptr;
+ };
+
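For illustration, a standalone C++ sketch (not LLVM code; the ToyCandidate type and its fields are invented here) of how a single multiply yields two Mul candidates, mirroring the (a + 1) * (b + 2) example in the comment above:

#include <cstdint>
#include <iostream>
#include <string>

// Plain-value stand-in for the pass's Candidate: Base, Index and Stride are
// kept as strings and integers instead of SCEVs and ConstantInts.
struct ToyCandidate {
  enum Kind { Add, Mul, GEP } CandidateKind;
  std::string Base;   // symbolic base B
  int64_t Index;      // constant index i
  std::string Stride; // symbolic stride S
};

int main() {
  // (a + 1) * (b + 2) can be associated either way, so the same instruction
  // produces two Mul candidates of the form (B + i) * S.
  ToyCandidate C1{ToyCandidate::Mul, "a", 1, "b + 2"};
  ToyCandidate C2{ToyCandidate::Mul, "b", 2, "a + 1"};
  std::cout << "(" << C1.Base << " + " << C1.Index << ") * (" << C1.Stride << ")\n";
  std::cout << "(" << C2.Base << " + " << C2.Index << ") * (" << C2.Stride << ")\n";
  return 0;
}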
bool runOnFunction(Function &F);
-
-private:
- // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
- // share the same base and stride.
- bool isBasisFor(const Candidate &Basis, const Candidate &C);
-
- // Returns whether the candidate can be folded into an addressing mode.
- bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
- const DataLayout *DL);
-
- // Returns true if C is already in a simplest form and not worth being
- // rewritten.
- bool isSimplestForm(const Candidate &C);
-
- // Checks whether I is in a candidate form. If so, adds all the matching forms
- // to Candidates, and tries to find the immediate basis for each of them.
- void allocateCandidatesAndFindBasis(Instruction *I);
-
- // Allocate candidates and find bases for Add instructions.
- void allocateCandidatesAndFindBasisForAdd(Instruction *I);
-
- // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
- // candidate.
- void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
- Instruction *I);
- // Allocate candidates and find bases for Mul instructions.
- void allocateCandidatesAndFindBasisForMul(Instruction *I);
-
- // Splits LHS into Base + Index and, if it succeeds, calls
- // allocateCandidatesAndFindBasis.
- void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
- Instruction *I);
-
- // Allocate candidates and find bases for GetElementPtr instructions.
- void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
-
- // A helper function that scales Idx with ElementSize before invoking
- // allocateCandidatesAndFindBasis.
- void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
- Value *S, uint64_t ElementSize,
- Instruction *I);
-
- // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
- // basis.
- void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
- ConstantInt *Idx, Value *S,
- Instruction *I);
-
- // Rewrites candidate C with respect to Basis.
- void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
-
- // A helper function that factors ArrayIdx into a product of a stride and a
- // constant index, and invokes allocateCandidatesAndFindBasis with the
- // factorings.
- void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
- GetElementPtrInst *GEP);
-
- // Emit code that computes the "bump" from Basis to C. If the candidate is a
- // GEP and the bump is not divisible by the element size of the GEP, this
- // function sets the BumpWithUglyGEP flag to notify its caller to bump the
- // basis using an ugly GEP.
- static Value *emitBump(const Candidate &Basis, const Candidate &C,
- IRBuilder<> &Builder, const DataLayout *DL,
- bool &BumpWithUglyGEP);
-
- const DataLayout *DL = nullptr;
- DominatorTree *DT = nullptr;
- ScalarEvolution *SE;
- TargetTransformInfo *TTI = nullptr;
- std::list<Candidate> Candidates;
-
- // Temporarily holds all instructions that are unlinked (but not deleted) by
- // rewriteCandidateWithBasis. These instructions will be actually removed
- // rewriteCandidateWithBasis. These instructions will actually be removed
- std::vector<Instruction *> UnlinkedInstructions;
-};
-
-} // end anonymous namespace
-
+
+private:
+ // Returns true if Basis is a basis for C, i.e., Basis dominates C and they
+ // share the same base and stride.
+ bool isBasisFor(const Candidate &Basis, const Candidate &C);
+
+ // Returns whether the candidate can be folded into an addressing mode.
+ bool isFoldable(const Candidate &C, TargetTransformInfo *TTI,
+ const DataLayout *DL);
+
+ // Returns true if C is already in a simplest form and not worth being
+ // rewritten.
+ bool isSimplestForm(const Candidate &C);
+
+ // Checks whether I is in a candidate form. If so, adds all the matching forms
+ // to Candidates, and tries to find the immediate basis for each of them.
+ void allocateCandidatesAndFindBasis(Instruction *I);
+
+ // Allocate candidates and find bases for Add instructions.
+ void allocateCandidatesAndFindBasisForAdd(Instruction *I);
+
+ // Given I = LHS + RHS, factors RHS into i * S and makes (LHS + i * S) a
+ // candidate.
+ void allocateCandidatesAndFindBasisForAdd(Value *LHS, Value *RHS,
+ Instruction *I);
+ // Allocate candidates and find bases for Mul instructions.
+ void allocateCandidatesAndFindBasisForMul(Instruction *I);
+
+ // Splits LHS into Base + Index and, if it succeeds, calls
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForMul(Value *LHS, Value *RHS,
+ Instruction *I);
+
+ // Allocate candidates and find bases for GetElementPtr instructions.
+ void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP);
+
+ // A helper function that scales Idx with ElementSize before invoking
+ // allocateCandidatesAndFindBasis.
+ void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx,
+ Value *S, uint64_t ElementSize,
+ Instruction *I);
+
+ // Adds the given form <CT, B, Idx, S> to Candidates, and finds its immediate
+ // basis.
+ void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B,
+ ConstantInt *Idx, Value *S,
+ Instruction *I);
+
+ // Rewrites candidate C with respect to Basis.
+ void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis);
+
+ // A helper function that factors ArrayIdx into a product of a stride and a
+ // constant index, and invokes allocateCandidatesAndFindBasis with the
+ // factorings.
+ void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize,
+ GetElementPtrInst *GEP);
+
+ // Emit code that computes the "bump" from Basis to C. If the candidate is a
+ // GEP and the bump is not divisible by the element size of the GEP, this
+ // function sets the BumpWithUglyGEP flag to notify its caller to bump the
+ // basis using an ugly GEP.
+ static Value *emitBump(const Candidate &Basis, const Candidate &C,
+ IRBuilder<> &Builder, const DataLayout *DL,
+ bool &BumpWithUglyGEP);
+
+ const DataLayout *DL = nullptr;
+ DominatorTree *DT = nullptr;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI = nullptr;
+ std::list<Candidate> Candidates;
+
+ // Temporarily holds all instructions that are unlinked (but not deleted) by
+ // rewriteCandidateWithBasis. These instructions will be actually removed
+ // rewriteCandidateWithBasis. These instructions will actually be removed
+ std::vector<Instruction *> UnlinkedInstructions;
+};
+
+} // end anonymous namespace
+
char StraightLineStrengthReduceLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(StraightLineStrengthReduceLegacyPass, "slsr",
- "Straight line strength reduction", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+ "Straight line strength reduction", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(StraightLineStrengthReduceLegacyPass, "slsr",
- "Straight line strength reduction", false, false)
-
-FunctionPass *llvm::createStraightLineStrengthReducePass() {
+ "Straight line strength reduction", false, false)
+
+FunctionPass *llvm::createStraightLineStrengthReducePass() {
return new StraightLineStrengthReduceLegacyPass();
-}
-
-bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
- const Candidate &C) {
- return (Basis.Ins != C.Ins && // skip the same instruction
- // They must have the same type too. Basis.Base == C.Base doesn't
- // guarantee their types are the same (PR23975).
- Basis.Ins->getType() == C.Ins->getType() &&
- // Basis must dominate C in order to rewrite C with respect to Basis.
- DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
- // They share the same base, stride, and candidate kind.
- Basis.Base == C.Base && Basis.Stride == C.Stride &&
- Basis.CandidateKind == C.CandidateKind);
-}
-
-static bool isGEPFoldable(GetElementPtrInst *GEP,
- const TargetTransformInfo *TTI) {
+}
+
+bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis,
+ const Candidate &C) {
+ return (Basis.Ins != C.Ins && // skip the same instruction
+ // They must have the same type too. Basis.Base == C.Base doesn't
+ // guarantee their types are the same (PR23975).
+ Basis.Ins->getType() == C.Ins->getType() &&
+ // Basis must dominate C in order to rewrite C with respect to Basis.
+ DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) &&
+ // They share the same base, stride, and candidate kind.
+ Basis.Base == C.Base && Basis.Stride == C.Stride &&
+ Basis.CandidateKind == C.CandidateKind);
+}
+
+static bool isGEPFoldable(GetElementPtrInst *GEP,
+ const TargetTransformInfo *TTI) {
SmallVector<const Value *, 4> Indices(GEP->indices());
- return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
- Indices) == TargetTransformInfo::TCC_Free;
-}
-
-// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
-static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
- TargetTransformInfo *TTI) {
- // Index->getSExtValue() may crash if Index is wider than 64-bit.
- return Index->getBitWidth() <= 64 &&
- TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
- Index->getSExtValue(), UnknownAddressSpace);
-}
-
-bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
- TargetTransformInfo *TTI,
- const DataLayout *DL) {
- if (C.CandidateKind == Candidate::Add)
- return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
- if (C.CandidateKind == Candidate::GEP)
- return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
- return false;
-}
-
-// Returns true if GEP has zero or one non-zero index.
-static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
- unsigned NumNonZeroIndices = 0;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
- ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
- if (ConstIdx == nullptr || !ConstIdx->isZero())
- ++NumNonZeroIndices;
- }
- return NumNonZeroIndices <= 1;
-}
-
-bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
- if (C.CandidateKind == Candidate::Add) {
- // B + 1 * S or B + (-1) * S
- return C.Index->isOne() || C.Index->isMinusOne();
- }
- if (C.CandidateKind == Candidate::Mul) {
- // (B + 0) * S
- return C.Index->isZero();
- }
- if (C.CandidateKind == Candidate::GEP) {
- // (char*)B + S or (char*)B - S
- return ((C.Index->isOne() || C.Index->isMinusOne()) &&
- hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
- }
- return false;
-}
-
-// TODO: We currently implement an algorithm whose time complexity is linear in
-// the number of existing candidates. However, we could do better by using
-// ScopedHashTable. Specifically, while traversing the dominator tree, we could
-// maintain all the candidates that dominate the basic block being traversed in
-// a ScopedHashTable. This hash table is indexed by the base and the stride of
-// a candidate. Therefore, finding the immediate basis of a candidate boils down
-// to one hash-table look up.
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
- Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
- Instruction *I) {
- Candidate C(CT, B, Idx, S, I);
- // SLSR can complicate an instruction in two cases:
- //
- // 1. If we can fold I into an addressing mode, computing I is likely free or
- // takes only one instruction.
- //
- // 2. I is already in a simplest form. For example, when
- // X = B + 8 * S
- // Y = B + S,
- // rewriting Y to X - 7 * S is probably a bad idea.
- //
- // In the above cases, we still add I to the candidate list so that I can be
- // the basis of other candidates, but we leave I's basis blank so that I
- // won't be rewritten.
- if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
- // Try to compute the immediate basis of C.
- unsigned NumIterations = 0;
- // Limit the scan radius to avoid running in quadratic time.
- static const unsigned MaxNumIterations = 50;
- for (auto Basis = Candidates.rbegin();
- Basis != Candidates.rend() && NumIterations < MaxNumIterations;
- ++Basis, ++NumIterations) {
- if (isBasisFor(*Basis, C)) {
- C.Basis = &(*Basis);
- break;
- }
- }
- }
- // Regardless of whether we find a basis for C, we need to push C to the
- // candidate list so that it can be the basis of other candidates.
- Candidates.push_back(C);
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
- Instruction *I) {
- switch (I->getOpcode()) {
- case Instruction::Add:
- allocateCandidatesAndFindBasisForAdd(I);
- break;
- case Instruction::Mul:
- allocateCandidatesAndFindBasisForMul(I);
- break;
- case Instruction::GetElementPtr:
- allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
- break;
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
- Instruction *I) {
- // Try matching B + i * S.
- if (!isa<IntegerType>(I->getType()))
- return;
-
- assert(I->getNumOperands() == 2 && "isn't I an add?");
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
- if (LHS != RHS)
- allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
- Value *LHS, Value *RHS, Instruction *I) {
- Value *S = nullptr;
- ConstantInt *Idx = nullptr;
- if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
- // I = LHS + RHS = LHS + Idx * S
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
- } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
- // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
- APInt One(Idx->getBitWidth(), 1);
- Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
- } else {
- // At least, I = LHS + 1 * RHS
- ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
- allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
- I);
- }
-}
-
-// Returns true if A matches B + C where C is constant.
-static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
- return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
- match(A, m_Add(m_ConstantInt(C), m_Value(B))));
-}
-
-// Returns true if A matches B | C where C is constant.
-static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
- return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
- match(A, m_Or(m_ConstantInt(C), m_Value(B))));
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
- Value *LHS, Value *RHS, Instruction *I) {
- Value *B = nullptr;
- ConstantInt *Idx = nullptr;
- if (matchesAdd(LHS, B, Idx)) {
- // If LHS is in the form of "Base + Index", then I is in the form of
- // "(Base + Index) * RHS".
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
- } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
- // If LHS is in the form of "Base | Index" and Base and Index have no common
- // bits set, then
- // Base | Index = Base + Index
- // and I is thus in the form of "(Base + Index) * RHS".
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
- } else {
- // Otherwise, at least try the form (LHS + 0) * RHS.
- ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
- allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
- I);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
- Instruction *I) {
- // Try matching (B + i) * S.
- // TODO: we could extend SLSR to float and vector types.
- if (!isa<IntegerType>(I->getType()))
- return;
-
- assert(I->getNumOperands() == 2 && "isn't I a mul?");
- Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
- allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
- if (LHS != RHS) {
- // Symmetrically, try to split RHS to Base + Index.
- allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
- const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
- Instruction *I) {
- // I = B + sext(Idx *nsw S) * ElementSize
- // = B + (sext(Idx) * sext(S)) * ElementSize
- // = B + (sext(Idx) * ElementSize) * sext(S)
- // Casting to IntegerType is safe because we skipped vector GEPs.
- IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
- ConstantInt *ScaledIdx = ConstantInt::get(
- IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
- allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
-}
-
-void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
- const SCEV *Base,
- uint64_t ElementSize,
- GetElementPtrInst *GEP) {
- // At least, ArrayIdx = ArrayIdx *nsw 1.
- allocateCandidatesAndFindBasisForGEP(
- Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
- ArrayIdx, ElementSize, GEP);
- Value *LHS = nullptr;
- ConstantInt *RHS = nullptr;
- // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
- // itself. This would allow us to handle the shl case for free. However,
- // matching SCEVs has two issues:
- //
- // 1. this would complicate rewriting because the rewriting procedure
- // would have to translate SCEVs back to IR instructions. This translation
- // is difficult when LHS is further evaluated to a composite SCEV.
- //
- // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
- // to strip nsw/nuw flags which are critical for SLSR to trace into
- // sext'ed multiplication.
- if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
- // SLSR is currently unsafe if i * S may overflow.
- // GEP = Base + sext(LHS *nsw RHS) * ElementSize
- allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
- } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
- // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
- // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
- APInt One(RHS->getBitWidth(), 1);
- ConstantInt *PowerOf2 =
- ConstantInt::get(RHS->getContext(), One << RHS->getValue());
- allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
- }
-}
-
-void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
- GetElementPtrInst *GEP) {
- // TODO: handle vector GEPs
- if (GEP->getType()->isVectorTy())
- return;
-
- SmallVector<const SCEV *, 4> IndexExprs;
- for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
- IndexExprs.push_back(SE->getSCEV(*I));
-
- gep_type_iterator GTI = gep_type_begin(GEP);
- for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
- if (GTI.isStruct())
- continue;
-
- const SCEV *OrigIndexExpr = IndexExprs[I - 1];
- IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
-
- // The base of this candidate is GEP's base plus the offsets of all
- // indices except this current one.
- const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
- Value *ArrayIdx = GEP->getOperand(I);
- uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
- if (ArrayIdx->getType()->getIntegerBitWidth() <=
- DL->getPointerSizeInBits(GEP->getAddressSpace())) {
- // Skip factoring if ArrayIdx is wider than the pointer size, because
- // ArrayIdx is implicitly truncated to the pointer size.
- factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
- }
- // When ArrayIdx is the sext of a value, we try to factor that value as
- // well. Handling this case is important because array indices are
- // typically sign-extended to the pointer size.
- Value *TruncatedArrayIdx = nullptr;
- if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
- TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
- DL->getPointerSizeInBits(GEP->getAddressSpace())) {
- // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
- // because TruncatedArrayIdx is implicitly truncated to the pointer size.
- factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
- }
-
- IndexExprs[I - 1] = OrigIndexExpr;
- }
-}
-
-// A helper function that unifies the bitwidth of A and B.
-static void unifyBitWidth(APInt &A, APInt &B) {
- if (A.getBitWidth() < B.getBitWidth())
- A = A.sext(B.getBitWidth());
- else if (A.getBitWidth() > B.getBitWidth())
- B = B.sext(A.getBitWidth());
-}
-
-Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
- const Candidate &C,
- IRBuilder<> &Builder,
- const DataLayout *DL,
- bool &BumpWithUglyGEP) {
- APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
- unifyBitWidth(Idx, BasisIdx);
- APInt IndexOffset = Idx - BasisIdx;
-
- BumpWithUglyGEP = false;
- if (Basis.CandidateKind == Candidate::GEP) {
- APInt ElementSize(
- IndexOffset.getBitWidth(),
- DL->getTypeAllocSize(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
- APInt Q, R;
- APInt::sdivrem(IndexOffset, ElementSize, Q, R);
- if (R == 0)
- IndexOffset = Q;
- else
- BumpWithUglyGEP = true;
- }
-
- // Compute Bump = C - Basis = (i' - i) * S.
- // Common case 1: if (i' - i) is 1, Bump = S.
- if (IndexOffset == 1)
- return C.Stride;
- // Common case 2: if (i' - i) is -1, Bump = -S.
- if (IndexOffset.isAllOnesValue())
- return Builder.CreateNeg(C.Stride);
-
- // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
- // have different bit widths.
- IntegerType *DeltaType =
- IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
- Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
- if (IndexOffset.isPowerOf2()) {
- // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
- ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
- return Builder.CreateShl(ExtendedStride, Exponent);
- }
- if ((-IndexOffset).isPowerOf2()) {
- // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i - i').
- ConstantInt *Exponent =
- ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
- return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
- }
- Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
- return Builder.CreateMul(ExtendedStride, Delta);
-}
-
-void StraightLineStrengthReduce::rewriteCandidateWithBasis(
- const Candidate &C, const Candidate &Basis) {
- assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
- C.Stride == Basis.Stride);
- // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
- // basis of a candidate cannot be unlinked before the candidate.
- assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
-
- // An instruction can correspond to multiple candidates. Therefore, instead of
- // simply deleting an instruction when we rewrite it, we mark its parent as
- // nullptr (i.e. unlink it) so that we can skip the candidates whose
- // instruction is already rewritten.
- if (!C.Ins->getParent())
- return;
-
- IRBuilder<> Builder(C.Ins);
- bool BumpWithUglyGEP;
- Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
- Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
- switch (C.CandidateKind) {
- case Candidate::Add:
- case Candidate::Mul: {
- // C = Basis + Bump
- Value *NegBump;
- if (match(Bump, m_Neg(m_Value(NegBump)))) {
- // If Bump is a neg instruction, emit C = Basis - (-Bump).
- Reduced = Builder.CreateSub(Basis.Ins, NegBump);
- // We only use the negative argument of Bump, and Bump itself may be
- // trivially dead.
- RecursivelyDeleteTriviallyDeadInstructions(Bump);
- } else {
- // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
- // usually unsound, e.g.,
- //
- // X = (-2 +nsw 1) *nsw INT_MAX
- // Y = (-2 +nsw 3) *nsw INT_MAX
- // =>
- // Y = X + 2 * INT_MAX
- //
- // Neither the + nor the * in the resultant expression is nsw.
- Reduced = Builder.CreateAdd(Basis.Ins, Bump);
- }
- break;
- }
- case Candidate::GEP:
- {
- Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
- bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
- if (BumpWithUglyGEP) {
- // C = (char *)Basis + Bump
- unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
- Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
- Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
- if (InBounds)
- Reduced =
- Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
- else
- Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
- Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
- } else {
- // C = gep Basis, Bump
- // Canonicalize bump to pointer size.
- Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
- if (InBounds)
- Reduced = Builder.CreateInBoundsGEP(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
- Basis.Ins, Bump);
- else
- Reduced = Builder.CreateGEP(
- cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
- Basis.Ins, Bump);
- }
- break;
- }
- default:
- llvm_unreachable("C.CandidateKind is invalid");
- };
- Reduced->takeName(C.Ins);
- C.Ins->replaceAllUsesWith(Reduced);
- // Unlink C.Ins so that we can skip other candidates also corresponding to
- // C.Ins. The actual deletion is postponed to the end of runOnFunction.
- C.Ins->removeFromParent();
- UnlinkedInstructions.push_back(C.Ins);
-}
-
+ return TTI->getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
+ Indices) == TargetTransformInfo::TCC_Free;
+}
+
+// Returns whether (Base + Index * Stride) can be folded to an addressing mode.
+static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride,
+ TargetTransformInfo *TTI) {
+ // Index->getSExtValue() may crash if Index is wider than 64-bit.
+ return Index->getBitWidth() <= 64 &&
+ TTI->isLegalAddressingMode(Base->getType(), nullptr, 0, true,
+ Index->getSExtValue(), UnknownAddressSpace);
+}
+
+bool StraightLineStrengthReduce::isFoldable(const Candidate &C,
+ TargetTransformInfo *TTI,
+ const DataLayout *DL) {
+ if (C.CandidateKind == Candidate::Add)
+ return isAddFoldable(C.Base, C.Index, C.Stride, TTI);
+ if (C.CandidateKind == Candidate::GEP)
+ return isGEPFoldable(cast<GetElementPtrInst>(C.Ins), TTI);
+ return false;
+}
+
+// Returns true if GEP has zero or one non-zero index.
+static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) {
+ unsigned NumNonZeroIndices = 0;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I) {
+ ConstantInt *ConstIdx = dyn_cast<ConstantInt>(*I);
+ if (ConstIdx == nullptr || !ConstIdx->isZero())
+ ++NumNonZeroIndices;
+ }
+ return NumNonZeroIndices <= 1;
+}
+
+bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) {
+ if (C.CandidateKind == Candidate::Add) {
+ // B + 1 * S or B + (-1) * S
+ return C.Index->isOne() || C.Index->isMinusOne();
+ }
+ if (C.CandidateKind == Candidate::Mul) {
+ // (B + 0) * S
+ return C.Index->isZero();
+ }
+ if (C.CandidateKind == Candidate::GEP) {
+ // (char*)B + S or (char*)B - S
+ return ((C.Index->isOne() || C.Index->isMinusOne()) &&
+ hasOnlyOneNonZeroIndex(cast<GetElementPtrInst>(C.Ins)));
+ }
+ return false;
+}
+
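A quick standalone check (plain C++; the loop bounds are arbitrary) of why the simplest forms above are left alone: rewriting Y = B + S against X = B + 8 * S would only replace one addition with a multiply and a subtraction.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t B = -8; B <= 8; ++B) {
    for (int64_t S = -8; S <= 8; ++S) {
      int64_t X = B + 8 * S; // a candidate that may be rewritten later
      int64_t Y = B + S;     // simplest Add form: Index == 1
      // The identity holds, but Y already costs a single addition, so the
      // rewrite Y = X - 7 * S would be a pessimization.
      assert(Y == X - 7 * S);
    }
  }
  return 0;
}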
+// TODO: We currently implement an algorithm whose time complexity is linear in
+// the number of existing candidates. However, we could do better by using
+// ScopedHashTable. Specifically, while traversing the dominator tree, we could
+// maintain all the candidates that dominate the basic block being traversed in
+// a ScopedHashTable. This hash table is indexed by the base and the stride of
+// a candidate. Therefore, finding the immediate basis of a candidate boils down
+// to one hash-table look up.
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S,
+ Instruction *I) {
+ Candidate C(CT, B, Idx, S, I);
+ // SLSR can complicate an instruction in two cases:
+ //
+ // 1. If we can fold I into an addressing mode, computing I is likely free or
+ // takes only one instruction.
+ //
+ // 2. I is already in a simplest form. For example, when
+ // X = B + 8 * S
+ // Y = B + S,
+ // rewriting Y to X - 7 * S is probably a bad idea.
+ //
+ // In the above cases, we still add I to the candidate list so that I can be
+ // the basis of other candidates, but we leave I's basis blank so that I
+ // won't be rewritten.
+ if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) {
+ // Try to compute the immediate basis of C.
+ unsigned NumIterations = 0;
+ // Limit the scan radius to avoid running in quadratic time.
+ static const unsigned MaxNumIterations = 50;
+ for (auto Basis = Candidates.rbegin();
+ Basis != Candidates.rend() && NumIterations < MaxNumIterations;
+ ++Basis, ++NumIterations) {
+ if (isBasisFor(*Basis, C)) {
+ C.Basis = &(*Basis);
+ break;
+ }
+ }
+ }
+ // Regardless of whether we find a basis for C, we need to push C to the
+ // candidate list so that it can be the basis of other candidates.
+ Candidates.push_back(C);
+}
+
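A sketch of the hash-keyed lookup that the TODO above suggests, using std::unordered_map with a string key purely for illustration; the real pass would key on the base SCEV, the stride and the candidate kind, scope the table to the dominator tree, and still verify dominance as isBasisFor does.

#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyCandidate {
  std::string Base;
  std::string Stride;
  int Kind;
  int64_t Index;
  const ToyCandidate *Basis = nullptr;
};

// A basis must share base, stride and kind with the candidate, so those three
// fields form the lookup key; only the index differs.
static std::string keyOf(const ToyCandidate &C) {
  return C.Base + '|' + C.Stride + '|' + std::to_string(C.Kind);
}

// Instead of scanning up to MaxNumIterations previous candidates, remember the
// most recent candidate per key and use it as the immediate basis (dominance
// checks are omitted in this sketch).
static void findBases(std::vector<ToyCandidate> &Candidates) {
  std::unordered_map<std::string, const ToyCandidate *> LastWithKey;
  for (ToyCandidate &C : Candidates) {
    auto It = LastWithKey.find(keyOf(C));
    if (It != LastWithKey.end())
      C.Basis = It->second;
    LastWithKey[keyOf(C)] = &C;
  }
}

int main() {
  std::vector<ToyCandidate> Candidates = {
      {"b", "s", /*Kind=*/0, /*Index=*/4},
      {"b", "s", /*Kind=*/0, /*Index=*/5},
  };
  findBases(Candidates);
  return Candidates[1].Basis == &Candidates[0] ? 0 : 1;
}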
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasis(
+ Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ allocateCandidatesAndFindBasisForAdd(I);
+ break;
+ case Instruction::Mul:
+ allocateCandidatesAndFindBasisForMul(I);
+ break;
+ case Instruction::GetElementPtr:
+ allocateCandidatesAndFindBasisForGEP(cast<GetElementPtrInst>(I));
+ break;
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Instruction *I) {
+ // Try matching B + i * S.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I an add?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForAdd(LHS, RHS, I);
+ if (LHS != RHS)
+ allocateCandidatesAndFindBasisForAdd(RHS, LHS, I);
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForAdd(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *S = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (match(RHS, m_Mul(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + Idx * S
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else if (match(RHS, m_Shl(m_Value(S), m_ConstantInt(Idx)))) {
+ // I = LHS + RHS = LHS + (S << Idx) = LHS + S * (1 << Idx)
+ APInt One(Idx->getBitWidth(), 1);
+ Idx = ConstantInt::get(Idx->getContext(), One << Idx->getValue());
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), Idx, S, I);
+ } else {
+ // At least, I = LHS + 1 * RHS
+ ConstantInt *One = ConstantInt::get(cast<IntegerType>(I->getType()), 1);
+ allocateCandidatesAndFindBasis(Candidate::Add, SE->getSCEV(LHS), One, RHS,
+ I);
+ }
+}
+
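The shl branch above records LHS + (S << c) as the candidate <Base: LHS, Index: 1 << c, Stride: S>; a standalone check of the identity it relies on (plain C++, values chosen arbitrarily):

#include <cassert>
#include <cstdint>

int main() {
  const int64_t LHS = 100, S = 7;
  for (int64_t C = 0; C < 16; ++C) {
    int64_t Index = int64_t(1) << C;
    // LHS + (S << C) == LHS + S * (1 << C), so the shift folds into the index.
    assert(LHS + (S << C) == LHS + Index * S);
  }
  return 0;
}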
+// Returns true if A matches B + C where C is constant.
+static bool matchesAdd(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Add(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Add(m_ConstantInt(C), m_Value(B))));
+}
+
+// Returns true if A matches B | C where C is constant.
+static bool matchesOr(Value *A, Value *&B, ConstantInt *&C) {
+ return (match(A, m_Or(m_Value(B), m_ConstantInt(C))) ||
+ match(A, m_Or(m_ConstantInt(C), m_Value(B))));
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Value *LHS, Value *RHS, Instruction *I) {
+ Value *B = nullptr;
+ ConstantInt *Idx = nullptr;
+ if (matchesAdd(LHS, B, Idx)) {
+ // If LHS is in the form of "Base + Index", then I is in the form of
+ // "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else if (matchesOr(LHS, B, Idx) && haveNoCommonBitsSet(B, Idx, *DL)) {
+ // If LHS is in the form of "Base | Index" and Base and Index have no common
+ // bits set, then
+ // Base | Index = Base + Index
+ // and I is thus in the form of "(Base + Index) * RHS".
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(B), Idx, RHS, I);
+ } else {
+ // Otherwise, at least try the form (LHS + 0) * RHS.
+ ConstantInt *Zero = ConstantInt::get(cast<IntegerType>(I->getType()), 0);
+ allocateCandidatesAndFindBasis(Candidate::Mul, SE->getSCEV(LHS), Zero, RHS,
+ I);
+ }
+}
+
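The matchesOr branch above depends on Base | Index being the same as Base + Index when the two operands share no set bits, which is what haveNoCommonBitsSet verifies; a small standalone check of that identity (plain C++; the multiplier 8 and constant 5 are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  for (uint64_t X = 0; X < 1024; ++X) {
    uint64_t Base = X * 8;  // low three bits are always clear
    uint64_t Index = 5;     // fits entirely in the low three bits
    assert((Base & Index) == 0);            // no common bits set
    assert((Base | Index) == Base + Index); // so OR and ADD coincide
  }
  return 0;
}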
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul(
+ Instruction *I) {
+ // Try matching (B + i) * S.
+ // TODO: we could extend SLSR to float and vector types.
+ if (!isa<IntegerType>(I->getType()))
+ return;
+
+ assert(I->getNumOperands() == 2 && "isn't I a mul?");
+ Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+ allocateCandidatesAndFindBasisForMul(LHS, RHS, I);
+ if (LHS != RHS) {
+ // Symmetrically, try to split RHS to Base + Index.
+ allocateCandidatesAndFindBasisForMul(RHS, LHS, I);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize,
+ Instruction *I) {
+ // I = B + sext(Idx *nsw S) * ElementSize
+ // = B + (sext(Idx) * sext(S)) * ElementSize
+ // = B + (sext(Idx) * ElementSize) * sext(S)
+ // Casting to IntegerType is safe because we skipped vector GEPs.
+ IntegerType *IntPtrTy = cast<IntegerType>(DL->getIntPtrType(I->getType()));
+ ConstantInt *ScaledIdx = ConstantInt::get(
+ IntPtrTy, Idx->getSExtValue() * (int64_t)ElementSize, true);
+ allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I);
+}
+
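The helper above folds the element size into the constant index, so a GEP candidate has the byte-level shape B + (Idx * ElementSize) * S; a standalone pointer-arithmetic check (plain C++; the array and constants are arbitrary):

#include <cassert>
#include <cstdint>

int main() {
  int32_t Arr[64] = {};
  const int64_t Idx = 2, S = 3;                // array index is Idx * S
  const int64_t ElementSize = sizeof(int32_t); // 4 bytes per element

  char *Base = reinterpret_cast<char *>(Arr);
  // &Arr[Idx * S] is Base + (Idx * ElementSize) * S bytes, which is the
  // <B, ScaledIdx, S> form that the GEP candidate records.
  char *ByteAddr = Base + (Idx * ElementSize) * S;
  assert(reinterpret_cast<int32_t *>(ByteAddr) == &Arr[Idx * S]);
  return 0;
}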
+void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx,
+ const SCEV *Base,
+ uint64_t ElementSize,
+ GetElementPtrInst *GEP) {
+ // At least, ArrayIdx = ArrayIdx *nsw 1.
+ allocateCandidatesAndFindBasisForGEP(
+ Base, ConstantInt::get(cast<IntegerType>(ArrayIdx->getType()), 1),
+ ArrayIdx, ElementSize, GEP);
+ Value *LHS = nullptr;
+ ConstantInt *RHS = nullptr;
+ // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx
+ // itself. This would allow us to handle the shl case for free. However,
+ // matching SCEVs has two issues:
+ //
+ // 1. this would complicate rewriting because the rewriting procedure
+ // would have to translate SCEVs back to IR instructions. This translation
+ // is difficult when LHS is further evaluated to a composite SCEV.
+ //
+ // 2. ScalarEvolution is designed to be control-flow oblivious. It tends
+ // to strip nsw/nuw flags which are critical for SLSR to trace into
+ // sext'ed multiplication.
+ if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // SLSR is currently unsafe if i * S may overflow.
+ // GEP = Base + sext(LHS *nsw RHS) * ElementSize
+ allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP);
+ } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) {
+ // GEP = Base + sext(LHS <<nsw RHS) * ElementSize
+ // = Base + sext(LHS *nsw (1 << RHS)) * ElementSize
+ APInt One(RHS->getBitWidth(), 1);
+ ConstantInt *PowerOf2 =
+ ConstantInt::get(RHS->getContext(), One << RHS->getValue());
+ allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP);
+ }
+}
+
+void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP(
+ GetElementPtrInst *GEP) {
+ // TODO: handle vector GEPs
+ if (GEP->getType()->isVectorTy())
+ return;
+
+ SmallVector<const SCEV *, 4> IndexExprs;
+ for (auto I = GEP->idx_begin(); I != GEP->idx_end(); ++I)
+ IndexExprs.push_back(SE->getSCEV(*I));
+
+ gep_type_iterator GTI = gep_type_begin(GEP);
+ for (unsigned I = 1, E = GEP->getNumOperands(); I != E; ++I, ++GTI) {
+ if (GTI.isStruct())
+ continue;
+
+ const SCEV *OrigIndexExpr = IndexExprs[I - 1];
+ IndexExprs[I - 1] = SE->getZero(OrigIndexExpr->getType());
+
+ // The base of this candidate is GEP's base plus the offsets of all
+ // indices except this current one.
+ const SCEV *BaseExpr = SE->getGEPExpr(cast<GEPOperator>(GEP), IndexExprs);
+ Value *ArrayIdx = GEP->getOperand(I);
+ uint64_t ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ if (ArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if ArrayIdx is wider than the pointer size, because
+ // ArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+ // When ArrayIdx is the sext of a value, we try to factor that value as
+ // well. Handling this case is important because array indices are
+ // typically sign-extended to the pointer size.
+ Value *TruncatedArrayIdx = nullptr;
+ if (match(ArrayIdx, m_SExt(m_Value(TruncatedArrayIdx))) &&
+ TruncatedArrayIdx->getType()->getIntegerBitWidth() <=
+ DL->getPointerSizeInBits(GEP->getAddressSpace())) {
+ // Skip factoring if TruncatedArrayIdx is wider than the pointer size,
+ // because TruncatedArrayIdx is implicitly truncated to the pointer size.
+ factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP);
+ }
+
+ IndexExprs[I - 1] = OrigIndexExpr;
+ }
+}
+
+// A helper function that unifies the bitwidth of A and B.
+static void unifyBitWidth(APInt &A, APInt &B) {
+ if (A.getBitWidth() < B.getBitWidth())
+ A = A.sext(B.getBitWidth());
+ else if (A.getBitWidth() > B.getBitWidth())
+ B = B.sext(A.getBitWidth());
+}
+
+Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis,
+ const Candidate &C,
+ IRBuilder<> &Builder,
+ const DataLayout *DL,
+ bool &BumpWithUglyGEP) {
+ APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue();
+ unifyBitWidth(Idx, BasisIdx);
+ APInt IndexOffset = Idx - BasisIdx;
+
+ BumpWithUglyGEP = false;
+ if (Basis.CandidateKind == Candidate::GEP) {
+ APInt ElementSize(
+ IndexOffset.getBitWidth(),
+ DL->getTypeAllocSize(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType()));
+ APInt Q, R;
+ APInt::sdivrem(IndexOffset, ElementSize, Q, R);
+ if (R == 0)
+ IndexOffset = Q;
+ else
+ BumpWithUglyGEP = true;
+ }
+
+ // Compute Bump = C - Basis = (i' - i) * S.
+ // Common case 1: if (i' - i) is 1, Bump = S.
+ if (IndexOffset == 1)
+ return C.Stride;
+ // Common case 2: if (i' - i) is -1, Bump = -S.
+ if (IndexOffset.isAllOnesValue())
+ return Builder.CreateNeg(C.Stride);
+
+ // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may
+ // have different bit widths.
+ IntegerType *DeltaType =
+ IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth());
+ Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType);
+ if (IndexOffset.isPowerOf2()) {
+ // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i).
+ ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2());
+ return Builder.CreateShl(ExtendedStride, Exponent);
+ }
+ if ((-IndexOffset).isPowerOf2()) {
+ // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i - i').
+ ConstantInt *Exponent =
+ ConstantInt::get(DeltaType, (-IndexOffset).logBase2());
+ return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent));
+ }
+ Constant *Delta = ConstantInt::get(DeltaType, IndexOffset);
+ return Builder.CreateMul(ExtendedStride, Delta);
+}
+
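A standalone sketch of emitBump's decision ladder on plain 64-bit integers (toyEmitBump is invented here; it covers non-negative strides only, and the negative power-of-two case is handled analogously in the real code):

#include <cassert>
#include <cstdint>

// Bump = (i' - i) * S, with the cheap special cases picked off first.
static int64_t toyEmitBump(int64_t Idx, int64_t BasisIdx, int64_t S) {
  int64_t IndexOffset = Idx - BasisIdx;
  if (IndexOffset == 1)
    return S;  // common case 1: Bump = S
  if (IndexOffset == -1)
    return -S; // common case 2: Bump = -S
  if (IndexOffset > 0 && (IndexOffset & (IndexOffset - 1)) == 0) {
    int Log = 0;
    while ((int64_t(1) << Log) != IndexOffset)
      ++Log;
    return S << Log; // power-of-two delta: a single shift of the stride
  }
  return IndexOffset * S; // general case: one multiply
}

int main() {
  for (int64_t S = 0; S <= 8; ++S) {
    assert(toyEmitBump(9, 8, S) == S);
    assert(toyEmitBump(8, 9, S) == -S);
    assert(toyEmitBump(12, 4, S) == 8 * S);
    assert(toyEmitBump(10, 3, S) == 7 * S);
  }
  return 0;
}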
+void StraightLineStrengthReduce::rewriteCandidateWithBasis(
+ const Candidate &C, const Candidate &Basis) {
+ assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base &&
+ C.Stride == Basis.Stride);
+ // We run rewriteCandidateWithBasis on all candidates in a post-order, so the
+ // basis of a candidate cannot be unlinked before the candidate.
+ assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked");
+
+ // An instruction can correspond to multiple candidates. Therefore, instead of
+ // simply deleting an instruction when we rewrite it, we mark its parent as
+ // nullptr (i.e. unlink it) so that we can skip the candidates whose
+ // instruction is already rewritten.
+ if (!C.Ins->getParent())
+ return;
+
+ IRBuilder<> Builder(C.Ins);
+ bool BumpWithUglyGEP;
+ Value *Bump = emitBump(Basis, C, Builder, DL, BumpWithUglyGEP);
+ Value *Reduced = nullptr; // equivalent to but weaker than C.Ins
+ switch (C.CandidateKind) {
+ case Candidate::Add:
+ case Candidate::Mul: {
+ // C = Basis + Bump
+ Value *NegBump;
+ if (match(Bump, m_Neg(m_Value(NegBump)))) {
+ // If Bump is a neg instruction, emit C = Basis - (-Bump).
+ Reduced = Builder.CreateSub(Basis.Ins, NegBump);
+ // We only use the negative argument of Bump, and Bump itself may be
+ // trivially dead.
+ RecursivelyDeleteTriviallyDeadInstructions(Bump);
+ } else {
+ // It's tempting to preserve nsw on Bump and/or Reduced. However, it's
+ // usually unsound, e.g.,
+ //
+ // X = (-2 +nsw 1) *nsw INT_MAX
+ // Y = (-2 +nsw 3) *nsw INT_MAX
+ // =>
+ // Y = X + 2 * INT_MAX
+ //
+ // Neither the + nor the * in the resultant expression is nsw.
+ Reduced = Builder.CreateAdd(Basis.Ins, Bump);
+ }
+ break;
+ }
+ case Candidate::GEP:
+ {
+ Type *IntPtrTy = DL->getIntPtrType(C.Ins->getType());
+ bool InBounds = cast<GetElementPtrInst>(C.Ins)->isInBounds();
+ if (BumpWithUglyGEP) {
+ // C = (char *)Basis + Bump
+ unsigned AS = Basis.Ins->getType()->getPointerAddressSpace();
+ Type *CharTy = Type::getInt8PtrTy(Basis.Ins->getContext(), AS);
+ Reduced = Builder.CreateBitCast(Basis.Ins, CharTy);
+ if (InBounds)
+ Reduced =
+ Builder.CreateInBoundsGEP(Builder.getInt8Ty(), Reduced, Bump);
+ else
+ Reduced = Builder.CreateGEP(Builder.getInt8Ty(), Reduced, Bump);
+ Reduced = Builder.CreateBitCast(Reduced, C.Ins->getType());
+ } else {
+ // C = gep Basis, Bump
+ // Canonicalize bump to pointer size.
+ Bump = Builder.CreateSExtOrTrunc(Bump, IntPtrTy);
+ if (InBounds)
+ Reduced = Builder.CreateInBoundsGEP(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
+ Basis.Ins, Bump);
+ else
+ Reduced = Builder.CreateGEP(
+ cast<GetElementPtrInst>(Basis.Ins)->getResultElementType(),
+ Basis.Ins, Bump);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("C.CandidateKind is invalid");
+ };
+ Reduced->takeName(C.Ins);
+ C.Ins->replaceAllUsesWith(Reduced);
+ // Unlink C.Ins so that we can skip other candidates also corresponding to
+ // C.Ins. The actual deletion is postponed to the end of runOnFunction.
+ C.Ins->removeFromParent();
+ UnlinkedInstructions.push_back(C.Ins);
+}
+
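What the rewrite above amounts to for the Add and Mul kinds, shown on plain integers (values arbitrary): the candidate becomes Basis + Bump, or Basis - (-Bump) when the bump is a negation.

#include <cassert>
#include <cstdint>

int main() {
  for (int64_t B = -4; B <= 4; ++B) {
    for (int64_t S = -4; S <= 4; ++S) {
      int64_t X = B + 1 * S; // basis:     <Base: B, Index: 1, Stride: S>
      int64_t Y = B + 3 * S; // candidate: <Base: B, Index: 3, Stride: S>
      // emitBump yields (3 - 1) * S, so Y is re-expressed as X plus the bump.
      assert(Y == X + 2 * S);
      // With the roles swapped the bump is negative, and the rewrite emits a
      // subtraction of the negated bump instead: X == Y - 2 * S.
      assert(X == Y - 2 * S);
    }
  }
  return 0;
}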
bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
- if (skipFunction(F))
- return false;
-
+ if (skipFunction(F))
+ return false;
+
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -726,35 +726,35 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) {
}
bool StraightLineStrengthReduce::runOnFunction(Function &F) {
- // Traverse the dominator tree in the depth-first order. This order makes sure
- // all bases of a candidate are in Candidates when we process it.
- for (const auto Node : depth_first(DT))
- for (auto &I : *(Node->getBlock()))
- allocateCandidatesAndFindBasis(&I);
-
- // Rewrite candidates in the reverse depth-first order. This order makes sure
- // a candidate being rewritten is not a basis for any other candidate.
- while (!Candidates.empty()) {
- const Candidate &C = Candidates.back();
- if (C.Basis != nullptr) {
- rewriteCandidateWithBasis(C, *C.Basis);
- }
- Candidates.pop_back();
- }
-
- // Delete all unlinked instructions.
- for (auto *UnlinkedInst : UnlinkedInstructions) {
- for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
- Value *Op = UnlinkedInst->getOperand(I);
- UnlinkedInst->setOperand(I, nullptr);
- RecursivelyDeleteTriviallyDeadInstructions(Op);
- }
- UnlinkedInst->deleteValue();
- }
- bool Ret = !UnlinkedInstructions.empty();
- UnlinkedInstructions.clear();
- return Ret;
-}
+ // Traverse the dominator tree in the depth-first order. This order makes sure
+ // all bases of a candidate are in Candidates when we process it.
+ for (const auto Node : depth_first(DT))
+ for (auto &I : *(Node->getBlock()))
+ allocateCandidatesAndFindBasis(&I);
+
+ // Rewrite candidates in the reverse depth-first order. This order makes sure
+ // a candidate being rewritten is not a basis for any other candidate.
+ while (!Candidates.empty()) {
+ const Candidate &C = Candidates.back();
+ if (C.Basis != nullptr) {
+ rewriteCandidateWithBasis(C, *C.Basis);
+ }
+ Candidates.pop_back();
+ }
+
+ // Delete all unlinked instructions.
+ for (auto *UnlinkedInst : UnlinkedInstructions) {
+ for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) {
+ Value *Op = UnlinkedInst->getOperand(I);
+ UnlinkedInst->setOperand(I, nullptr);
+ RecursivelyDeleteTriviallyDeadInstructions(Op);
+ }
+ UnlinkedInst->deleteValue();
+ }
+ bool Ret = !UnlinkedInstructions.empty();
+ UnlinkedInstructions.clear();
+ return Ret;
+}
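Taken together, the driver above performs the source-level transformation sketched below (applied by hand, plain C++): later values in a run of B + i * S expressions are rebased on earlier ones, so each costs one addition instead of a multiply plus an addition.

#include <cassert>
#include <cstdint>

// Before SLSR: three independent multiply-and-add computations.
static int64_t before(int64_t B, int64_t S) {
  int64_t X = B + 4 * S;
  int64_t Y = B + 5 * S;
  int64_t Z = B + 6 * S;
  return X ^ Y ^ Z;
}

// After SLSR (hand-applied for illustration): Y and Z are rewritten against
// their immediate bases, leaving a single multiply.
static int64_t after(int64_t B, int64_t S) {
  int64_t X = B + 4 * S;
  int64_t Y = X + S; // basis X, bump 1 * S
  int64_t Z = Y + S; // basis Y, bump 1 * S
  return X ^ Y ^ Z;
}

int main() {
  for (int64_t B = -8; B <= 8; ++B)
    for (int64_t S = -8; S <= 8; ++S)
      assert(before(B, S) == after(B, S));
  return 0;
}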
namespace llvm {
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
index 5fd33b57e3..3e15cad5f3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -1,315 +1,315 @@
-//===- StructurizeCFG.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- StructurizeCFG.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Scalar/StructurizeCFG.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/RegionPass.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include <algorithm>
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "structurizecfg"
-
-// The name for newly created blocks.
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include <algorithm>
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "structurizecfg"
+
+// The name for newly created blocks.
const char FlowBlockName[] = "Flow";
-
-namespace {
-
-static cl::opt<bool> ForceSkipUniformRegions(
- "structurizecfg-skip-uniform-regions",
- cl::Hidden,
- cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
- cl::init(false));
-
-static cl::opt<bool>
- RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden,
- cl::desc("Allow relaxed uniform region checks"),
- cl::init(true));
-
-// Definition of the complex types used in this pass.
-
-using BBValuePair = std::pair<BasicBlock *, Value *>;
-
-using RNVector = SmallVector<RegionNode *, 8>;
-using BBVector = SmallVector<BasicBlock *, 8>;
-using BranchVector = SmallVector<BranchInst *, 8>;
-using BBValueVector = SmallVector<BBValuePair, 2>;
-
-using BBSet = SmallPtrSet<BasicBlock *, 8>;
-
-using PhiMap = MapVector<PHINode *, BBValueVector>;
-using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
-
-using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
-using BBPredicates = DenseMap<BasicBlock *, Value *>;
-using PredMap = DenseMap<BasicBlock *, BBPredicates>;
-using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
-
-// A traits type that is intended to be used in graph algorithms. The graph
-// traits starts at an entry node, and traverses the RegionNodes that are in
-// the Nodes set.
-struct SubGraphTraits {
- using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
- using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
-
- // This wraps a set of Nodes into the iterator, so we know which edges to
- // filter out.
- class WrappedSuccIterator
- : public iterator_adaptor_base<
- WrappedSuccIterator, BaseSuccIterator,
- typename std::iterator_traits<BaseSuccIterator>::iterator_category,
- NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
- SmallDenseSet<RegionNode *> *Nodes;
-
- public:
- WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
- : iterator_adaptor_base(It), Nodes(Nodes) {}
-
- NodeRef operator*() const { return {*I, Nodes}; }
- };
-
- static bool filterAll(const NodeRef &N) { return true; }
- static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
-
- using ChildIteratorType =
- filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
-
- static NodeRef getEntryNode(Region *R) {
- return {GraphTraits<Region *>::getEntryNode(R), nullptr};
- }
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static iterator_range<ChildIteratorType> children(const NodeRef &N) {
- auto *filter = N.second ? &filterSet : &filterAll;
- return make_filter_range(
- make_range<WrappedSuccIterator>(
- {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
- {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
- filter);
- }
-
- static ChildIteratorType child_begin(const NodeRef &N) {
- return children(N).begin();
- }
-
- static ChildIteratorType child_end(const NodeRef &N) {
- return children(N).end();
- }
-};
-
-/// Finds the nearest common dominator of a set of BasicBlocks.
-///
-/// For every BB you add to the set, you can specify whether we "remember" the
-/// block. When you get the common dominator, you can also ask whether it's one
-/// of the blocks we remembered.
-class NearestCommonDominator {
- DominatorTree *DT;
- BasicBlock *Result = nullptr;
- bool ResultIsRemembered = false;
-
- /// Add BB to the resulting dominator.
- void addBlock(BasicBlock *BB, bool Remember) {
- if (!Result) {
- Result = BB;
- ResultIsRemembered = Remember;
- return;
- }
-
- BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
- if (NewResult != Result)
- ResultIsRemembered = false;
- if (NewResult == BB)
- ResultIsRemembered |= Remember;
- Result = NewResult;
- }
-
-public:
- explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
-
- void addBlock(BasicBlock *BB) {
- addBlock(BB, /* Remember = */ false);
- }
-
- void addAndRememberBlock(BasicBlock *BB) {
- addBlock(BB, /* Remember = */ true);
- }
-
- /// Get the nearest common dominator of all the BBs added via addBlock() and
- /// addAndRememberBlock().
- BasicBlock *result() { return Result; }
-
- /// Is the BB returned by getResult() one of the blocks we added to the set
- /// with addAndRememberBlock()?
- bool resultIsRememberedBlock() { return ResultIsRemembered; }
-};
-
-/// Transforms the control flow graph on one single entry/exit region
-/// at a time.
-///
-/// After the transform all "If"/"Then"/"Else" style control flow looks like
-/// this:
-///
-/// \verbatim
-/// 1
-/// ||
-/// | |
-/// 2 |
-/// | /
-/// |/
-/// 3
-/// || Where:
-/// | | 1 = "If" block, calculates the condition
-/// 4 | 2 = "Then" subregion, runs if the condition is true
-/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
-/// |/ 4 = "Else" optional subregion, runs if the condition is false
-/// 5 5 = "End" block, also rejoins the control flow
-/// \endverbatim
-///
-/// Control flow is expressed as a branch where the true exit goes into the
-/// "Then"/"Else" region, while the false exit skips the region
-/// The condition for the optional "Else" region is expressed as a PHI node.
-/// The incoming values of the PHI node are true for the "If" edge and false
-/// for the "Then" edge.
-///
-/// Additionally to that even complicated loops look like this:
-///
-/// \verbatim
-/// 1
-/// ||
-/// | |
-/// 2 ^ Where:
-/// | / 1 = "Entry" block
-/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
-/// 3 3 = "Flow" block, with back edge to entry block
-/// |
-/// \endverbatim
-///
-/// The back edge of the "Flow" block is always on the false side of the branch
-/// while the true side continues the general flow. So the loop condition
-/// consist of a network of PHI nodes where the true incoming values expresses
-/// breaks and the false values expresses continue states.
-
+
+namespace {
+
+static cl::opt<bool> ForceSkipUniformRegions(
+ "structurizecfg-skip-uniform-regions",
+ cl::Hidden,
+ cl::desc("Force whether the StructurizeCFG pass skips uniform regions"),
+ cl::init(false));
+
+static cl::opt<bool>
+ RelaxedUniformRegions("structurizecfg-relaxed-uniform-regions", cl::Hidden,
+ cl::desc("Allow relaxed uniform region checks"),
+ cl::init(true));
+
+// Definition of the complex types used in this pass.
+
+using BBValuePair = std::pair<BasicBlock *, Value *>;
+
+using RNVector = SmallVector<RegionNode *, 8>;
+using BBVector = SmallVector<BasicBlock *, 8>;
+using BranchVector = SmallVector<BranchInst *, 8>;
+using BBValueVector = SmallVector<BBValuePair, 2>;
+
+using BBSet = SmallPtrSet<BasicBlock *, 8>;
+
+using PhiMap = MapVector<PHINode *, BBValueVector>;
+using BB2BBVecMap = MapVector<BasicBlock *, BBVector>;
+
+using BBPhiMap = DenseMap<BasicBlock *, PhiMap>;
+using BBPredicates = DenseMap<BasicBlock *, Value *>;
+using PredMap = DenseMap<BasicBlock *, BBPredicates>;
+using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+
+// A traits type that is intended to be used in graph algorithms. The
+// traversal starts at an entry node and visits only the RegionNodes that are
+// in the Nodes set.
+struct SubGraphTraits {
+ using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
+ using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
+
+ // This wraps a set of Nodes into the iterator, so we know which edges to
+ // filter out.
+ class WrappedSuccIterator
+ : public iterator_adaptor_base<
+ WrappedSuccIterator, BaseSuccIterator,
+ typename std::iterator_traits<BaseSuccIterator>::iterator_category,
+ NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+ SmallDenseSet<RegionNode *> *Nodes;
+
+ public:
+ WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
+ : iterator_adaptor_base(It), Nodes(Nodes) {}
+
+ NodeRef operator*() const { return {*I, Nodes}; }
+ };
+
+ static bool filterAll(const NodeRef &N) { return true; }
+ static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
+
+ using ChildIteratorType =
+ filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
+
+ static NodeRef getEntryNode(Region *R) {
+ return {GraphTraits<Region *>::getEntryNode(R), nullptr};
+ }
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static iterator_range<ChildIteratorType> children(const NodeRef &N) {
+ auto *filter = N.second ? &filterSet : &filterAll;
+ return make_filter_range(
+ make_range<WrappedSuccIterator>(
+ {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
+ {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
+ filter);
+ }
+
+ static ChildIteratorType child_begin(const NodeRef &N) {
+ return children(N).begin();
+ }
+
+ static ChildIteratorType child_end(const NodeRef &N) {
+ return children(N).end();
+ }
+};
+
+/// Finds the nearest common dominator of a set of BasicBlocks.
+///
+/// For every BB you add to the set, you can specify whether we "remember" the
+/// block. When you get the common dominator, you can also ask whether it's one
+/// of the blocks we remembered.
+class NearestCommonDominator {
+ DominatorTree *DT;
+ BasicBlock *Result = nullptr;
+ bool ResultIsRemembered = false;
+
+ /// Add BB to the resulting dominator.
+ void addBlock(BasicBlock *BB, bool Remember) {
+ if (!Result) {
+ Result = BB;
+ ResultIsRemembered = Remember;
+ return;
+ }
+
+ BasicBlock *NewResult = DT->findNearestCommonDominator(Result, BB);
+ if (NewResult != Result)
+ ResultIsRemembered = false;
+ if (NewResult == BB)
+ ResultIsRemembered |= Remember;
+ Result = NewResult;
+ }
+
+public:
+ explicit NearestCommonDominator(DominatorTree *DomTree) : DT(DomTree) {}
+
+ void addBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ false);
+ }
+
+ void addAndRememberBlock(BasicBlock *BB) {
+ addBlock(BB, /* Remember = */ true);
+ }
+
+ /// Get the nearest common dominator of all the BBs added via addBlock() and
+ /// addAndRememberBlock().
+ BasicBlock *result() { return Result; }
+
+  /// Is the BB returned by result() one of the blocks we added to the set
+ /// with addAndRememberBlock()?
+ bool resultIsRememberedBlock() { return ResultIsRemembered; }
+};
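A minimal usage sketch of this helper (hypothetical code, shown only to illustrate the add/remember protocol that insertConditions() and setPhiValues() below rely on; it assumes the snippet sits in this same file so the anonymous-namespace class and LLVM's DominatorTree/ArrayRef types are visible):

// Find a block that dominates the branch's parent and every predicate block,
// and report whether the result is itself one of the predicate blocks.
static BasicBlock *findPredicateDominator(DominatorTree &DT, BasicBlock *Parent,
                                          ArrayRef<BasicBlock *> PredBlocks,
                                          bool &DominatorIsAPred) {
  NearestCommonDominator Dominator(&DT);
  Dominator.addBlock(Parent);               // seed, but do not "remember" it
  for (BasicBlock *BB : PredBlocks)
    Dominator.addAndRememberBlock(BB);      // remember real predicate blocks
  DominatorIsAPred = Dominator.resultIsRememberedBlock();
  return Dominator.result();
}

When DominatorIsAPred comes back false, callers such as insertConditions() still have to seed a default value at the returned block before asking SSAUpdater for a value.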
+
+/// Transforms the control flow graph on one single entry/exit region
+/// at a time.
+///
+/// After the transform all "If"/"Then"/"Else" style control flow looks like
+/// this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 |
+/// | /
+/// |/
+/// 3
+/// || Where:
+/// | | 1 = "If" block, calculates the condition
+/// 4 | 2 = "Then" subregion, runs if the condition is true
+/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow
+/// |/ 4 = "Else" optional subregion, runs if the condition is false
+/// 5 5 = "End" block, also rejoins the control flow
+/// \endverbatim
+///
+/// Control flow is expressed as a branch where the true exit goes into the
+/// "Then"/"Else" region, while the false exit skips the region
+/// The condition for the optional "Else" region is expressed as a PHI node.
+/// The incoming values of the PHI node are true for the "If" edge and false
+/// for the "Then" edge.
+///
+/// In addition, even complicated loops look like this:
+///
+/// \verbatim
+/// 1
+/// ||
+/// | |
+/// 2 ^ Where:
+/// | / 1 = "Entry" block
+/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block
+/// 3 3 = "Flow" block, with back edge to entry block
+/// |
+/// \endverbatim
+///
+/// The back edge of the "Flow" block is always on the false side of the branch
+/// while the true side continues the general flow. So the loop condition
+/// consists of a network of PHI nodes where the true incoming values express
+/// breaks and the false values express continue states.
+
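As a rough source-level analogy (illustrative only; the pass rewrites LLVM IR, and the helper names below are made up), the "If"/"Then"/"Else" shape above corresponds to replacing one two-sided branch with two single-sided ones driven by a single predicate:

void then_part();
void else_part();
void end_part();

// Before structurization: one block ends in a two-way divergent branch.
void before(bool c) {
  if (c)
    then_part();
  else
    else_part();
  end_part();
}

// After structurization: each conditional branch either enters a subregion or
// skips it, and the "Flow" join decides whether the "Else" part still runs.
void after(bool c) {
  bool run_else = true;   // the PHI value on the edge straight from "If"
  if (c) {                // "If" block: true edge enters the "Then" subregion
    then_part();
    run_else = false;     // the PHI value on the edge coming out of "Then"
  }
  if (run_else)           // "Flow" block: true edge enters the "Else" subregion
    else_part();
  end_part();             // "End" block rejoins the control flow
}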
class StructurizeCFG {
- Type *Boolean;
- ConstantInt *BoolTrue;
- ConstantInt *BoolFalse;
- UndefValue *BoolUndef;
-
- Function *Func;
- Region *ParentRegion;
-
+ Type *Boolean;
+ ConstantInt *BoolTrue;
+ ConstantInt *BoolFalse;
+ UndefValue *BoolUndef;
+
+ Function *Func;
+ Region *ParentRegion;
+
LegacyDivergenceAnalysis *DA = nullptr;
- DominatorTree *DT;
-
- SmallVector<RegionNode *, 8> Order;
- BBSet Visited;
-
- SmallVector<WeakVH, 8> AffectedPhis;
- BBPhiMap DeletedPhis;
- BB2BBVecMap AddedPhis;
-
- PredMap Predicates;
- BranchVector Conditions;
-
- BB2BBMap Loops;
- PredMap LoopPreds;
- BranchVector LoopConds;
-
- RegionNode *PrevNode;
-
- void orderNodes();
-
- void analyzeLoops(RegionNode *N);
-
- Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
-
- void gatherPredicates(RegionNode *N);
-
- void collectInfos();
-
- void insertConditions(bool Loops);
-
- void delPhiValues(BasicBlock *From, BasicBlock *To);
-
- void addPhiValues(BasicBlock *From, BasicBlock *To);
-
- void setPhiValues();
-
- void simplifyAffectedPhis();
-
- void killTerminator(BasicBlock *BB);
-
- void changeExit(RegionNode *Node, BasicBlock *NewExit,
- bool IncludeDominator);
-
- BasicBlock *getNextFlow(BasicBlock *Dominator);
-
- BasicBlock *needPrefix(bool NeedEmpty);
-
- BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
-
- void setPrevNode(BasicBlock *BB);
-
- bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
-
- bool isPredictableTrue(RegionNode *Node);
-
- void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
-
- void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
-
- void createFlow();
-
- void rebuildSSA();
-
-public:
+ DominatorTree *DT;
+
+ SmallVector<RegionNode *, 8> Order;
+ BBSet Visited;
+
+ SmallVector<WeakVH, 8> AffectedPhis;
+ BBPhiMap DeletedPhis;
+ BB2BBVecMap AddedPhis;
+
+ PredMap Predicates;
+ BranchVector Conditions;
+
+ BB2BBMap Loops;
+ PredMap LoopPreds;
+ BranchVector LoopConds;
+
+ RegionNode *PrevNode;
+
+ void orderNodes();
+
+ void analyzeLoops(RegionNode *N);
+
+ Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
+
+ void gatherPredicates(RegionNode *N);
+
+ void collectInfos();
+
+ void insertConditions(bool Loops);
+
+ void delPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void addPhiValues(BasicBlock *From, BasicBlock *To);
+
+ void setPhiValues();
+
+ void simplifyAffectedPhis();
+
+ void killTerminator(BasicBlock *BB);
+
+ void changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator);
+
+ BasicBlock *getNextFlow(BasicBlock *Dominator);
+
+ BasicBlock *needPrefix(bool NeedEmpty);
+
+ BasicBlock *needPostfix(BasicBlock *Flow, bool ExitUseAllowed);
+
+ void setPrevNode(BasicBlock *BB);
+
+ bool dominatesPredicates(BasicBlock *BB, RegionNode *Node);
+
+ bool isPredictableTrue(RegionNode *Node);
+
+ void wireFlow(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void handleLoops(bool ExitUseAllowed, BasicBlock *LoopEnd);
+
+ void createFlow();
+
+ void rebuildSSA();
+
+public:
void init(Region *R);
bool run(Region *R, DominatorTree *DT);
bool makeUniformRegion(Region *R, LegacyDivergenceAnalysis *DA);
@@ -319,15 +319,15 @@ class StructurizeCFGLegacyPass : public RegionPass {
bool SkipUniformRegions;
public:
- static char ID;
-
+ static char ID;
+
explicit StructurizeCFGLegacyPass(bool SkipUniformRegions_ = false)
: RegionPass(ID), SkipUniformRegions(SkipUniformRegions_) {
- if (ForceSkipUniformRegions.getNumOccurrences())
- SkipUniformRegions = ForceSkipUniformRegions.getValue();
+ if (ForceSkipUniformRegions.getNumOccurrences())
+ SkipUniformRegions = ForceSkipUniformRegions.getValue();
initializeStructurizeCFGLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
+ }
+
bool runOnRegion(Region *R, RGPassManager &RGM) override {
StructurizeCFG SCFG;
SCFG.init(R);
@@ -339,675 +339,675 @@ public:
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
return SCFG.run(R, DT);
}
-
- StringRef getPassName() const override { return "Structurize control flow"; }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- if (SkipUniformRegions)
- AU.addRequired<LegacyDivergenceAnalysis>();
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<DominatorTreeWrapperPass>();
-
- AU.addPreserved<DominatorTreeWrapperPass>();
- RegionPass::getAnalysisUsage(AU);
- }
-};
-
-} // end anonymous namespace
-
+
+ StringRef getPassName() const override { return "Structurize control flow"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ if (SkipUniformRegions)
+ AU.addRequired<LegacyDivergenceAnalysis>();
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ RegionPass::getAnalysisUsage(AU);
+ }
+};
+
+} // end anonymous namespace
+
char StructurizeCFGLegacyPass::ID = 0;
-
+
INITIALIZE_PASS_BEGIN(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
-INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
+INITIALIZE_PASS_DEPENDENCY(LegacyDivergenceAnalysis)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(RegionInfoPass)
INITIALIZE_PASS_END(StructurizeCFGLegacyPass, "structurizecfg",
"Structurize the CFG", false, false)
-
-/// Build up the general order of nodes by performing a topological sort of the
-/// parent region's nodes, while ensuring that there is no outer cycle node
-/// between any two inner cycle nodes.
-void StructurizeCFG::orderNodes() {
- Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
- GraphTraits<Region *>::nodes_end(ParentRegion)));
- if (Order.empty())
- return;
-
- SmallDenseSet<RegionNode *> Nodes;
- auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
-
- // A list of range indices of SCCs in Order, to be processed.
- SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
- unsigned I = 0, E = Order.size();
- while (true) {
- // Run through all the SCCs in the subgraph starting with Entry.
- for (auto SCCI =
- scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
- EntryNode);
- !SCCI.isAtEnd(); ++SCCI) {
- auto &SCC = *SCCI;
-
-      // An SCC of size up to 2 can be reduced to an entry (the last node) and
-      // possibly one additional node. Therefore it is already in order, and
- // there is no need to add it to the work-list.
- unsigned Size = SCC.size();
- if (Size > 2)
- WorkList.emplace_back(I, I + Size);
-
- // Add the SCC nodes to the Order array.
- for (auto &N : SCC) {
- assert(I < E && "SCC size mismatch!");
- Order[I++] = N.first;
- }
- }
- assert(I == E && "SCC size mismatch!");
-
- // If there are no more SCCs to order, then we are done.
- if (WorkList.empty())
- break;
-
- std::tie(I, E) = WorkList.pop_back_val();
-
- // Collect the set of nodes in the SCC's subgraph. These are only the
- // possible child nodes; we do not add the entry (last node) otherwise we
- // will have the same exact SCC all over again.
- Nodes.clear();
- Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
-
- // Update the entry node.
- EntryNode.first = Order[E - 1];
- EntryNode.second = &Nodes;
- }
-}
-
-/// Determine the end of the loops
-void StructurizeCFG::analyzeLoops(RegionNode *N) {
- if (N->isSubRegion()) {
- // Test for exit as back edge
- BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
- if (Visited.count(Exit))
- Loops[Exit] = N->getEntry();
-
- } else {
- // Test for successors as back edge
- BasicBlock *BB = N->getNodeAs<BasicBlock>();
- BranchInst *Term = cast<BranchInst>(BB->getTerminator());
-
- for (BasicBlock *Succ : Term->successors())
- if (Visited.count(Succ))
- Loops[Succ] = BB;
- }
-}
-
-/// Build the condition for one edge
-Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
- bool Invert) {
- Value *Cond = Invert ? BoolFalse : BoolTrue;
- if (Term->isConditional()) {
- Cond = Term->getCondition();
-
- if (Idx != (unsigned)Invert)
- Cond = invertCondition(Cond);
- }
- return Cond;
-}
-
-/// Analyze the predecessors of each block and build up predicates
-void StructurizeCFG::gatherPredicates(RegionNode *N) {
- RegionInfo *RI = ParentRegion->getRegionInfo();
- BasicBlock *BB = N->getEntry();
- BBPredicates &Pred = Predicates[BB];
- BBPredicates &LPred = LoopPreds[BB];
-
- for (BasicBlock *P : predecessors(BB)) {
- // Ignore it if it's a branch from outside into our region entry
- if (!ParentRegion->contains(P))
- continue;
-
- Region *R = RI->getRegionFor(P);
- if (R == ParentRegion) {
- // It's a top level block in our region
- BranchInst *Term = cast<BranchInst>(P->getTerminator());
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- BasicBlock *Succ = Term->getSuccessor(i);
- if (Succ != BB)
- continue;
-
- if (Visited.count(P)) {
- // Normal forward edge
- if (Term->isConditional()) {
- // Try to treat it like an ELSE block
- BasicBlock *Other = Term->getSuccessor(!i);
- if (Visited.count(Other) && !Loops.count(Other) &&
- !Pred.count(Other) && !Pred.count(P)) {
-
- Pred[Other] = BoolFalse;
- Pred[P] = BoolTrue;
- continue;
- }
- }
- Pred[P] = buildCondition(Term, i, false);
- } else {
- // Back edge
- LPred[P] = buildCondition(Term, i, true);
- }
- }
- } else {
- // It's an exit from a sub region
- while (R->getParent() != ParentRegion)
- R = R->getParent();
-
- // Edge from inside a subregion to its entry, ignore it
- if (*R == *N)
- continue;
-
- BasicBlock *Entry = R->getEntry();
- if (Visited.count(Entry))
- Pred[Entry] = BoolTrue;
- else
- LPred[Entry] = BoolFalse;
- }
- }
-}
-
-/// Collect various loop and predicate infos
-void StructurizeCFG::collectInfos() {
- // Reset predicate
- Predicates.clear();
-
- // and loop infos
- Loops.clear();
- LoopPreds.clear();
-
- // Reset the visited nodes
- Visited.clear();
-
- for (RegionNode *RN : reverse(Order)) {
- LLVM_DEBUG(dbgs() << "Visiting: "
- << (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << "\n");
-
- // Analyze all the conditions leading to a node
- gatherPredicates(RN);
-
- // Remember that we've seen this node
- Visited.insert(RN->getEntry());
-
- // Find the last back edges
- analyzeLoops(RN);
- }
-}
-
-/// Insert the missing branch conditions
-void StructurizeCFG::insertConditions(bool Loops) {
- BranchVector &Conds = Loops ? LoopConds : Conditions;
- Value *Default = Loops ? BoolTrue : BoolFalse;
- SSAUpdater PhiInserter;
-
- for (BranchInst *Term : Conds) {
- assert(Term->isConditional());
-
- BasicBlock *Parent = Term->getParent();
- BasicBlock *SuccTrue = Term->getSuccessor(0);
- BasicBlock *SuccFalse = Term->getSuccessor(1);
-
- PhiInserter.Initialize(Boolean, "");
- PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
- PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
-
- BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
-
- NearestCommonDominator Dominator(DT);
- Dominator.addBlock(Parent);
-
- Value *ParentValue = nullptr;
- for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
- BasicBlock *BB = BBAndPred.first;
- Value *Pred = BBAndPred.second;
-
- if (BB == Parent) {
- ParentValue = Pred;
- break;
- }
- PhiInserter.AddAvailableValue(BB, Pred);
- Dominator.addAndRememberBlock(BB);
- }
-
- if (ParentValue) {
- Term->setCondition(ParentValue);
- } else {
- if (!Dominator.resultIsRememberedBlock())
- PhiInserter.AddAvailableValue(Dominator.result(), Default);
-
- Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
- }
- }
-}
-
-/// Remove all PHI values coming from "From" into "To" and remember
-/// them in DeletedPhis
-void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
- PhiMap &Map = DeletedPhis[To];
- for (PHINode &Phi : To->phis()) {
- bool Recorded = false;
- while (Phi.getBasicBlockIndex(From) != -1) {
- Value *Deleted = Phi.removeIncomingValue(From, false);
- Map[&Phi].push_back(std::make_pair(From, Deleted));
- if (!Recorded) {
- AffectedPhis.push_back(&Phi);
- Recorded = true;
- }
- }
- }
-}
-
-/// Add a dummy PHI value as soon as we know the new predecessor
-void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
- for (PHINode &Phi : To->phis()) {
- Value *Undef = UndefValue::get(Phi.getType());
- Phi.addIncoming(Undef, From);
- }
- AddedPhis[To].push_back(From);
-}
-
-/// Add the real PHI value as soon as everything is set up
-void StructurizeCFG::setPhiValues() {
- SmallVector<PHINode *, 8> InsertedPhis;
- SSAUpdater Updater(&InsertedPhis);
- for (const auto &AddedPhi : AddedPhis) {
- BasicBlock *To = AddedPhi.first;
- const BBVector &From = AddedPhi.second;
-
- if (!DeletedPhis.count(To))
- continue;
-
- PhiMap &Map = DeletedPhis[To];
- for (const auto &PI : Map) {
- PHINode *Phi = PI.first;
- Value *Undef = UndefValue::get(Phi->getType());
- Updater.Initialize(Phi->getType(), "");
- Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
- Updater.AddAvailableValue(To, Undef);
-
- NearestCommonDominator Dominator(DT);
- Dominator.addBlock(To);
- for (const auto &VI : PI.second) {
- Updater.AddAvailableValue(VI.first, VI.second);
- Dominator.addAndRememberBlock(VI.first);
- }
-
- if (!Dominator.resultIsRememberedBlock())
- Updater.AddAvailableValue(Dominator.result(), Undef);
-
- for (BasicBlock *FI : From)
- Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
- AffectedPhis.push_back(Phi);
- }
-
- DeletedPhis.erase(To);
- }
- assert(DeletedPhis.empty());
-
- AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
-}
-
-void StructurizeCFG::simplifyAffectedPhis() {
- bool Changed;
- do {
- Changed = false;
- SimplifyQuery Q(Func->getParent()->getDataLayout());
- Q.DT = DT;
- for (WeakVH VH : AffectedPhis) {
- if (auto Phi = dyn_cast_or_null<PHINode>(VH)) {
- if (auto NewValue = SimplifyInstruction(Phi, Q)) {
- Phi->replaceAllUsesWith(NewValue);
- Phi->eraseFromParent();
- Changed = true;
- }
- }
- }
- } while (Changed);
-}
-
-/// Remove phi values from all successors and then remove the terminator.
-void StructurizeCFG::killTerminator(BasicBlock *BB) {
- Instruction *Term = BB->getTerminator();
- if (!Term)
- return;
-
- for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
- SI != SE; ++SI)
- delPhiValues(BB, *SI);
-
- if (DA)
- DA->removeValue(Term);
- Term->eraseFromParent();
-}
-
-/// Let node exit(s) point to NewExit
-void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
- bool IncludeDominator) {
- if (Node->isSubRegion()) {
- Region *SubRegion = Node->getNodeAs<Region>();
- BasicBlock *OldExit = SubRegion->getExit();
- BasicBlock *Dominator = nullptr;
-
- // Find all the edges from the sub region to the exit
- for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
-      // Increment BBI before mucking with BB's terminator.
- BasicBlock *BB = *BBI++;
-
- if (!SubRegion->contains(BB))
- continue;
-
- // Modify the edges to point to the new exit
- delPhiValues(BB, OldExit);
- BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
- addPhiValues(BB, NewExit);
-
- // Find the new dominator (if requested)
- if (IncludeDominator) {
- if (!Dominator)
- Dominator = BB;
- else
- Dominator = DT->findNearestCommonDominator(Dominator, BB);
- }
- }
-
- // Change the dominator (if requested)
- if (Dominator)
- DT->changeImmediateDominator(NewExit, Dominator);
-
- // Update the region info
- SubRegion->replaceExit(NewExit);
- } else {
- BasicBlock *BB = Node->getNodeAs<BasicBlock>();
- killTerminator(BB);
- BranchInst::Create(NewExit, BB);
- addPhiValues(BB, NewExit);
- if (IncludeDominator)
- DT->changeImmediateDominator(NewExit, BB);
- }
-}
-
-/// Create a new flow node and update dominator tree and region info
-BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
- LLVMContext &Context = Func->getContext();
- BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
- Order.back()->getEntry();
- BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
- Func, Insert);
- DT->addNewBlock(Flow, Dominator);
- ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
- return Flow;
-}
-
-/// Create a new or reuse the previous node as flow node
-BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
- BasicBlock *Entry = PrevNode->getEntry();
-
- if (!PrevNode->isSubRegion()) {
- killTerminator(Entry);
- if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
- return Entry;
- }
-
- // create a new flow node
- BasicBlock *Flow = getNextFlow(Entry);
-
- // and wire it up
- changeExit(PrevNode, Flow, true);
- PrevNode = ParentRegion->getBBNode(Flow);
- return Flow;
-}
-
-/// Returns the region exit if possible, otherwise just a new flow node
-BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
- bool ExitUseAllowed) {
- if (!Order.empty() || !ExitUseAllowed)
- return getNextFlow(Flow);
-
- BasicBlock *Exit = ParentRegion->getExit();
- DT->changeImmediateDominator(Exit, Flow);
- addPhiValues(Flow, Exit);
- return Exit;
-}
-
-/// Set the previous node
-void StructurizeCFG::setPrevNode(BasicBlock *BB) {
- PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
- : nullptr;
-}
-
-/// Does BB dominate all the predicates of Node?
-bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
- BBPredicates &Preds = Predicates[Node->getEntry()];
- return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
- return DT->dominates(BB, Pred.first);
- });
-}
-
-/// Can we predict that this node will always be called?
-bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
- BBPredicates &Preds = Predicates[Node->getEntry()];
- bool Dominated = false;
-
-  // The region entry is always true
- if (!PrevNode)
- return true;
-
- for (std::pair<BasicBlock*, Value*> Pred : Preds) {
- BasicBlock *BB = Pred.first;
- Value *V = Pred.second;
-
- if (V != BoolTrue)
- return false;
-
- if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
- Dominated = true;
- }
-
- // TODO: The dominator check is too strict
- return Dominated;
-}
-
-/// Take one node from the order vector and wire it up
-void StructurizeCFG::wireFlow(bool ExitUseAllowed,
- BasicBlock *LoopEnd) {
- RegionNode *Node = Order.pop_back_val();
- Visited.insert(Node->getEntry());
-
- if (isPredictableTrue(Node)) {
- // Just a linear flow
- if (PrevNode) {
- changeExit(PrevNode, Node->getEntry(), true);
- }
- PrevNode = Node;
- } else {
- // Insert extra prefix node (or reuse last one)
- BasicBlock *Flow = needPrefix(false);
-
- // Insert extra postfix node (or use exit instead)
- BasicBlock *Entry = Node->getEntry();
- BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
-
- // let it point to entry and next block
- Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
- addPhiValues(Flow, Entry);
- DT->changeImmediateDominator(Entry, Flow);
-
- PrevNode = Node;
- while (!Order.empty() && !Visited.count(LoopEnd) &&
- dominatesPredicates(Entry, Order.back())) {
- handleLoops(false, LoopEnd);
- }
-
- changeExit(PrevNode, Next, false);
- setPrevNode(Next);
- }
-}
-
-void StructurizeCFG::handleLoops(bool ExitUseAllowed,
- BasicBlock *LoopEnd) {
- RegionNode *Node = Order.back();
- BasicBlock *LoopStart = Node->getEntry();
-
- if (!Loops.count(LoopStart)) {
- wireFlow(ExitUseAllowed, LoopEnd);
- return;
- }
-
- if (!isPredictableTrue(Node))
- LoopStart = needPrefix(true);
-
- LoopEnd = Loops[Node->getEntry()];
- wireFlow(false, LoopEnd);
- while (!Visited.count(LoopEnd)) {
- handleLoops(false, LoopEnd);
- }
-
- // If the start of the loop is the entry block, we can't branch to it so
- // insert a new dummy entry block.
- Function *LoopFunc = LoopStart->getParent();
- if (LoopStart == &LoopFunc->getEntryBlock()) {
- LoopStart->setName("entry.orig");
-
- BasicBlock *NewEntry =
- BasicBlock::Create(LoopStart->getContext(),
- "entry",
- LoopFunc,
- LoopStart);
- BranchInst::Create(LoopStart, NewEntry);
- DT->setNewRoot(NewEntry);
- }
-
- // Create an extra loop end node
- LoopEnd = needPrefix(false);
- BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
- LoopConds.push_back(BranchInst::Create(Next, LoopStart,
- BoolUndef, LoopEnd));
- addPhiValues(LoopEnd, LoopStart);
- setPrevNode(Next);
-}
-
-/// After this function control flow looks like it should be, but
-/// branches and PHI nodes only have undefined conditions.
-void StructurizeCFG::createFlow() {
- BasicBlock *Exit = ParentRegion->getExit();
- bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
-
- AffectedPhis.clear();
- DeletedPhis.clear();
- AddedPhis.clear();
- Conditions.clear();
- LoopConds.clear();
-
- PrevNode = nullptr;
- Visited.clear();
-
- while (!Order.empty()) {
- handleLoops(EntryDominatesExit, nullptr);
- }
-
- if (PrevNode)
- changeExit(PrevNode, Exit, EntryDominatesExit);
- else
- assert(EntryDominatesExit);
-}
-
-/// Handle a rare case where the disintegrated nodes' instructions
-/// no longer dominate all their uses. Not sure if this is really necessary
-void StructurizeCFG::rebuildSSA() {
- SSAUpdater Updater;
- for (BasicBlock *BB : ParentRegion->blocks())
- for (Instruction &I : *BB) {
- bool Initialized = false;
- // We may modify the use list as we iterate over it, so be careful to
- // compute the next element in the use list at the top of the loop.
- for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
- Use &U = *UI++;
- Instruction *User = cast<Instruction>(U.getUser());
- if (User->getParent() == BB) {
- continue;
- } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
- if (UserPN->getIncomingBlock(U) == BB)
- continue;
- }
-
- if (DT->dominates(&I, User))
- continue;
-
- if (!Initialized) {
- Value *Undef = UndefValue::get(I.getType());
- Updater.Initialize(I.getType(), "");
- Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
- Updater.AddAvailableValue(BB, &I);
- Initialized = true;
- }
- Updater.RewriteUseAfterInsertions(U);
- }
- }
-}
-
-static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
- const LegacyDivergenceAnalysis &DA) {
- // Bool for if all sub-regions are uniform.
- bool SubRegionsAreUniform = true;
- // Count of how many direct children are conditional.
- unsigned ConditionalDirectChildren = 0;
-
- for (auto E : R->elements()) {
- if (!E->isSubRegion()) {
- auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
-
- if (!DA.isUniform(Br))
- return false;
-
- // One of our direct children is conditional.
- ConditionalDirectChildren++;
-
- LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
- << " has uniform terminator\n");
- } else {
- // Explicitly refuse to treat regions as uniform if they have non-uniform
- // subregions. We cannot rely on DivergenceAnalysis for branches in
- // subregions because those branches may have been removed and re-created,
- // so we look for our metadata instead.
- //
- // Warning: It would be nice to treat regions as uniform based only on
- // their direct child basic blocks' terminators, regardless of whether
- // subregions are uniform or not. However, this requires a very careful
- // look at SIAnnotateControlFlow to make sure nothing breaks there.
- for (auto BB : E->getNodeAs<Region>()->blocks()) {
- auto Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br || !Br->isConditional())
- continue;
-
- if (!Br->getMetadata(UniformMDKindID)) {
- // Early exit if we cannot have relaxed uniform regions.
- if (!RelaxedUniformRegions)
- return false;
-
- SubRegionsAreUniform = false;
- break;
- }
- }
- }
- }
-
- // Our region is uniform if:
- // 1. All conditional branches that are direct children are uniform (checked
- // above).
- // 2. And either:
- // a. All sub-regions are uniform.
-  //     b. There is at most one conditional branch among the direct children.
- return SubRegionsAreUniform || (ConditionalDirectChildren <= 1);
-}
-
+
+/// Build up the general order of nodes by performing a topological sort of the
+/// parent region's nodes, while ensuring that there is no outer cycle node
+/// between any two inner cycle nodes.
+void StructurizeCFG::orderNodes() {
+ Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
+ GraphTraits<Region *>::nodes_end(ParentRegion)));
+ if (Order.empty())
+ return;
+
+ SmallDenseSet<RegionNode *> Nodes;
+ auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
+
+ // A list of range indices of SCCs in Order, to be processed.
+ SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
+ unsigned I = 0, E = Order.size();
+ while (true) {
+ // Run through all the SCCs in the subgraph starting with Entry.
+ for (auto SCCI =
+ scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
+ EntryNode);
+ !SCCI.isAtEnd(); ++SCCI) {
+ auto &SCC = *SCCI;
+
+      // An SCC of size up to 2 can be reduced to an entry (the last node) and
+      // possibly one additional node. Therefore it is already in order, and
+ // there is no need to add it to the work-list.
+ unsigned Size = SCC.size();
+ if (Size > 2)
+ WorkList.emplace_back(I, I + Size);
+
+ // Add the SCC nodes to the Order array.
+ for (auto &N : SCC) {
+ assert(I < E && "SCC size mismatch!");
+ Order[I++] = N.first;
+ }
+ }
+ assert(I == E && "SCC size mismatch!");
+
+ // If there are no more SCCs to order, then we are done.
+ if (WorkList.empty())
+ break;
+
+ std::tie(I, E) = WorkList.pop_back_val();
+
+ // Collect the set of nodes in the SCC's subgraph. These are only the
+ // possible child nodes; we do not add the entry (last node) otherwise we
+ // will have the same exact SCC all over again.
+ Nodes.clear();
+ Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
+
+ // Update the entry node.
+ EntryNode.first = Order[E - 1];
+ EntryNode.second = &Nodes;
+ }
+}
+
+/// Determine the end of the loops
+void StructurizeCFG::analyzeLoops(RegionNode *N) {
+ if (N->isSubRegion()) {
+ // Test for exit as back edge
+ BasicBlock *Exit = N->getNodeAs<Region>()->getExit();
+ if (Visited.count(Exit))
+ Loops[Exit] = N->getEntry();
+
+ } else {
+ // Test for successors as back edge
+ BasicBlock *BB = N->getNodeAs<BasicBlock>();
+ BranchInst *Term = cast<BranchInst>(BB->getTerminator());
+
+ for (BasicBlock *Succ : Term->successors())
+ if (Visited.count(Succ))
+ Loops[Succ] = BB;
+ }
+}
+
+/// Build the condition for one edge
+Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
+ bool Invert) {
+ Value *Cond = Invert ? BoolFalse : BoolTrue;
+ if (Term->isConditional()) {
+ Cond = Term->getCondition();
+
+ if (Idx != (unsigned)Invert)
+ Cond = invertCondition(Cond);
+ }
+ return Cond;
+}
+
+/// Analyze the predecessors of each block and build up predicates
+void StructurizeCFG::gatherPredicates(RegionNode *N) {
+ RegionInfo *RI = ParentRegion->getRegionInfo();
+ BasicBlock *BB = N->getEntry();
+ BBPredicates &Pred = Predicates[BB];
+ BBPredicates &LPred = LoopPreds[BB];
+
+ for (BasicBlock *P : predecessors(BB)) {
+ // Ignore it if it's a branch from outside into our region entry
+ if (!ParentRegion->contains(P))
+ continue;
+
+ Region *R = RI->getRegionFor(P);
+ if (R == ParentRegion) {
+ // It's a top level block in our region
+ BranchInst *Term = cast<BranchInst>(P->getTerminator());
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *Succ = Term->getSuccessor(i);
+ if (Succ != BB)
+ continue;
+
+ if (Visited.count(P)) {
+ // Normal forward edge
+ if (Term->isConditional()) {
+ // Try to treat it like an ELSE block
+ BasicBlock *Other = Term->getSuccessor(!i);
+ if (Visited.count(Other) && !Loops.count(Other) &&
+ !Pred.count(Other) && !Pred.count(P)) {
+
+ Pred[Other] = BoolFalse;
+ Pred[P] = BoolTrue;
+ continue;
+ }
+ }
+ Pred[P] = buildCondition(Term, i, false);
+ } else {
+ // Back edge
+ LPred[P] = buildCondition(Term, i, true);
+ }
+ }
+ } else {
+ // It's an exit from a sub region
+ while (R->getParent() != ParentRegion)
+ R = R->getParent();
+
+ // Edge from inside a subregion to its entry, ignore it
+ if (*R == *N)
+ continue;
+
+ BasicBlock *Entry = R->getEntry();
+ if (Visited.count(Entry))
+ Pred[Entry] = BoolTrue;
+ else
+ LPred[Entry] = BoolFalse;
+ }
+ }
+}
+
+/// Collect various loop and predicate infos
+void StructurizeCFG::collectInfos() {
+ // Reset predicate
+ Predicates.clear();
+
+ // and loop infos
+ Loops.clear();
+ LoopPreds.clear();
+
+ // Reset the visited nodes
+ Visited.clear();
+
+ for (RegionNode *RN : reverse(Order)) {
+ LLVM_DEBUG(dbgs() << "Visiting: "
+ << (RN->isSubRegion() ? "SubRegion with entry: " : "")
+ << RN->getEntry()->getName() << "\n");
+
+ // Analyze all the conditions leading to a node
+ gatherPredicates(RN);
+
+ // Remember that we've seen this node
+ Visited.insert(RN->getEntry());
+
+ // Find the last back edges
+ analyzeLoops(RN);
+ }
+}
+
+/// Insert the missing branch conditions
+void StructurizeCFG::insertConditions(bool Loops) {
+ BranchVector &Conds = Loops ? LoopConds : Conditions;
+ Value *Default = Loops ? BoolTrue : BoolFalse;
+ SSAUpdater PhiInserter;
+
+ for (BranchInst *Term : Conds) {
+ assert(Term->isConditional());
+
+ BasicBlock *Parent = Term->getParent();
+ BasicBlock *SuccTrue = Term->getSuccessor(0);
+ BasicBlock *SuccFalse = Term->getSuccessor(1);
+
+ PhiInserter.Initialize(Boolean, "");
+ PhiInserter.AddAvailableValue(&Func->getEntryBlock(), Default);
+ PhiInserter.AddAvailableValue(Loops ? SuccFalse : Parent, Default);
+
+ BBPredicates &Preds = Loops ? LoopPreds[SuccFalse] : Predicates[SuccTrue];
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(Parent);
+
+ Value *ParentValue = nullptr;
+ for (std::pair<BasicBlock *, Value *> BBAndPred : Preds) {
+ BasicBlock *BB = BBAndPred.first;
+ Value *Pred = BBAndPred.second;
+
+ if (BB == Parent) {
+ ParentValue = Pred;
+ break;
+ }
+ PhiInserter.AddAvailableValue(BB, Pred);
+ Dominator.addAndRememberBlock(BB);
+ }
+
+ if (ParentValue) {
+ Term->setCondition(ParentValue);
+ } else {
+ if (!Dominator.resultIsRememberedBlock())
+ PhiInserter.AddAvailableValue(Dominator.result(), Default);
+
+ Term->setCondition(PhiInserter.GetValueInMiddleOfBlock(Parent));
+ }
+ }
+}
+
+/// Remove all PHI values coming from "From" into "To" and remember
+/// them in DeletedPhis
+void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
+ PhiMap &Map = DeletedPhis[To];
+ for (PHINode &Phi : To->phis()) {
+ bool Recorded = false;
+ while (Phi.getBasicBlockIndex(From) != -1) {
+ Value *Deleted = Phi.removeIncomingValue(From, false);
+ Map[&Phi].push_back(std::make_pair(From, Deleted));
+ if (!Recorded) {
+ AffectedPhis.push_back(&Phi);
+ Recorded = true;
+ }
+ }
+ }
+}
+
+/// Add a dummy PHI value as soon as we know the new predecessor
+void StructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) {
+ for (PHINode &Phi : To->phis()) {
+ Value *Undef = UndefValue::get(Phi.getType());
+ Phi.addIncoming(Undef, From);
+ }
+ AddedPhis[To].push_back(From);
+}
+
+/// Add the real PHI value as soon as everything is set up
+void StructurizeCFG::setPhiValues() {
+ SmallVector<PHINode *, 8> InsertedPhis;
+ SSAUpdater Updater(&InsertedPhis);
+ for (const auto &AddedPhi : AddedPhis) {
+ BasicBlock *To = AddedPhi.first;
+ const BBVector &From = AddedPhi.second;
+
+ if (!DeletedPhis.count(To))
+ continue;
+
+ PhiMap &Map = DeletedPhis[To];
+ for (const auto &PI : Map) {
+ PHINode *Phi = PI.first;
+ Value *Undef = UndefValue::get(Phi->getType());
+ Updater.Initialize(Phi->getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(To, Undef);
+
+ NearestCommonDominator Dominator(DT);
+ Dominator.addBlock(To);
+ for (const auto &VI : PI.second) {
+ Updater.AddAvailableValue(VI.first, VI.second);
+ Dominator.addAndRememberBlock(VI.first);
+ }
+
+ if (!Dominator.resultIsRememberedBlock())
+ Updater.AddAvailableValue(Dominator.result(), Undef);
+
+ for (BasicBlock *FI : From)
+ Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
+ AffectedPhis.push_back(Phi);
+ }
+
+ DeletedPhis.erase(To);
+ }
+ assert(DeletedPhis.empty());
+
+ AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
+}
+
+void StructurizeCFG::simplifyAffectedPhis() {
+ bool Changed;
+ do {
+ Changed = false;
+ SimplifyQuery Q(Func->getParent()->getDataLayout());
+ Q.DT = DT;
+ for (WeakVH VH : AffectedPhis) {
+ if (auto Phi = dyn_cast_or_null<PHINode>(VH)) {
+ if (auto NewValue = SimplifyInstruction(Phi, Q)) {
+ Phi->replaceAllUsesWith(NewValue);
+ Phi->eraseFromParent();
+ Changed = true;
+ }
+ }
+ }
+ } while (Changed);
+}
+
+/// Remove phi values from all successors and then remove the terminator.
+void StructurizeCFG::killTerminator(BasicBlock *BB) {
+ Instruction *Term = BB->getTerminator();
+ if (!Term)
+ return;
+
+ for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+ SI != SE; ++SI)
+ delPhiValues(BB, *SI);
+
+ if (DA)
+ DA->removeValue(Term);
+ Term->eraseFromParent();
+}
+
+/// Let node exit(s) point to NewExit
+void StructurizeCFG::changeExit(RegionNode *Node, BasicBlock *NewExit,
+ bool IncludeDominator) {
+ if (Node->isSubRegion()) {
+ Region *SubRegion = Node->getNodeAs<Region>();
+ BasicBlock *OldExit = SubRegion->getExit();
+ BasicBlock *Dominator = nullptr;
+
+ // Find all the edges from the sub region to the exit
+ for (auto BBI = pred_begin(OldExit), E = pred_end(OldExit); BBI != E;) {
+      // Increment BBI before mucking with BB's terminator.
+ BasicBlock *BB = *BBI++;
+
+ if (!SubRegion->contains(BB))
+ continue;
+
+ // Modify the edges to point to the new exit
+ delPhiValues(BB, OldExit);
+ BB->getTerminator()->replaceUsesOfWith(OldExit, NewExit);
+ addPhiValues(BB, NewExit);
+
+ // Find the new dominator (if requested)
+ if (IncludeDominator) {
+ if (!Dominator)
+ Dominator = BB;
+ else
+ Dominator = DT->findNearestCommonDominator(Dominator, BB);
+ }
+ }
+
+ // Change the dominator (if requested)
+ if (Dominator)
+ DT->changeImmediateDominator(NewExit, Dominator);
+
+ // Update the region info
+ SubRegion->replaceExit(NewExit);
+ } else {
+ BasicBlock *BB = Node->getNodeAs<BasicBlock>();
+ killTerminator(BB);
+ BranchInst::Create(NewExit, BB);
+ addPhiValues(BB, NewExit);
+ if (IncludeDominator)
+ DT->changeImmediateDominator(NewExit, BB);
+ }
+}
+
+/// Create a new flow node and update dominator tree and region info
+BasicBlock *StructurizeCFG::getNextFlow(BasicBlock *Dominator) {
+ LLVMContext &Context = Func->getContext();
+ BasicBlock *Insert = Order.empty() ? ParentRegion->getExit() :
+ Order.back()->getEntry();
+ BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName,
+ Func, Insert);
+ DT->addNewBlock(Flow, Dominator);
+ ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion);
+ return Flow;
+}
+
+/// Create a new or reuse the previous node as flow node
+BasicBlock *StructurizeCFG::needPrefix(bool NeedEmpty) {
+ BasicBlock *Entry = PrevNode->getEntry();
+
+ if (!PrevNode->isSubRegion()) {
+ killTerminator(Entry);
+ if (!NeedEmpty || Entry->getFirstInsertionPt() == Entry->end())
+ return Entry;
+ }
+
+ // create a new flow node
+ BasicBlock *Flow = getNextFlow(Entry);
+
+ // and wire it up
+ changeExit(PrevNode, Flow, true);
+ PrevNode = ParentRegion->getBBNode(Flow);
+ return Flow;
+}
+
+/// Returns the region exit if possible, otherwise just a new flow node
+BasicBlock *StructurizeCFG::needPostfix(BasicBlock *Flow,
+ bool ExitUseAllowed) {
+ if (!Order.empty() || !ExitUseAllowed)
+ return getNextFlow(Flow);
+
+ BasicBlock *Exit = ParentRegion->getExit();
+ DT->changeImmediateDominator(Exit, Flow);
+ addPhiValues(Flow, Exit);
+ return Exit;
+}
+
+/// Set the previous node
+void StructurizeCFG::setPrevNode(BasicBlock *BB) {
+ PrevNode = ParentRegion->contains(BB) ? ParentRegion->getBBNode(BB)
+ : nullptr;
+}
+
+/// Does BB dominate all the predicates of Node?
+bool StructurizeCFG::dominatesPredicates(BasicBlock *BB, RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ return llvm::all_of(Preds, [&](std::pair<BasicBlock *, Value *> Pred) {
+ return DT->dominates(BB, Pred.first);
+ });
+}
+
+/// Can we predict that this node will always be called?
+bool StructurizeCFG::isPredictableTrue(RegionNode *Node) {
+ BBPredicates &Preds = Predicates[Node->getEntry()];
+ bool Dominated = false;
+
+  // The region entry is always true
+ if (!PrevNode)
+ return true;
+
+ for (std::pair<BasicBlock*, Value*> Pred : Preds) {
+ BasicBlock *BB = Pred.first;
+ Value *V = Pred.second;
+
+ if (V != BoolTrue)
+ return false;
+
+ if (!Dominated && DT->dominates(BB, PrevNode->getEntry()))
+ Dominated = true;
+ }
+
+ // TODO: The dominator check is too strict
+ return Dominated;
+}
+
+/// Take one node from the order vector and wire it up
+void StructurizeCFG::wireFlow(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.pop_back_val();
+ Visited.insert(Node->getEntry());
+
+ if (isPredictableTrue(Node)) {
+ // Just a linear flow
+ if (PrevNode) {
+ changeExit(PrevNode, Node->getEntry(), true);
+ }
+ PrevNode = Node;
+ } else {
+ // Insert extra prefix node (or reuse last one)
+ BasicBlock *Flow = needPrefix(false);
+
+ // Insert extra postfix node (or use exit instead)
+ BasicBlock *Entry = Node->getEntry();
+ BasicBlock *Next = needPostfix(Flow, ExitUseAllowed);
+
+ // let it point to entry and next block
+ Conditions.push_back(BranchInst::Create(Entry, Next, BoolUndef, Flow));
+ addPhiValues(Flow, Entry);
+ DT->changeImmediateDominator(Entry, Flow);
+
+ PrevNode = Node;
+ while (!Order.empty() && !Visited.count(LoopEnd) &&
+ dominatesPredicates(Entry, Order.back())) {
+ handleLoops(false, LoopEnd);
+ }
+
+ changeExit(PrevNode, Next, false);
+ setPrevNode(Next);
+ }
+}
+
+void StructurizeCFG::handleLoops(bool ExitUseAllowed,
+ BasicBlock *LoopEnd) {
+ RegionNode *Node = Order.back();
+ BasicBlock *LoopStart = Node->getEntry();
+
+ if (!Loops.count(LoopStart)) {
+ wireFlow(ExitUseAllowed, LoopEnd);
+ return;
+ }
+
+ if (!isPredictableTrue(Node))
+ LoopStart = needPrefix(true);
+
+ LoopEnd = Loops[Node->getEntry()];
+ wireFlow(false, LoopEnd);
+ while (!Visited.count(LoopEnd)) {
+ handleLoops(false, LoopEnd);
+ }
+
+ // If the start of the loop is the entry block, we can't branch to it so
+ // insert a new dummy entry block.
+ Function *LoopFunc = LoopStart->getParent();
+ if (LoopStart == &LoopFunc->getEntryBlock()) {
+ LoopStart->setName("entry.orig");
+
+ BasicBlock *NewEntry =
+ BasicBlock::Create(LoopStart->getContext(),
+ "entry",
+ LoopFunc,
+ LoopStart);
+ BranchInst::Create(LoopStart, NewEntry);
+ DT->setNewRoot(NewEntry);
+ }
+
+ // Create an extra loop end node
+ LoopEnd = needPrefix(false);
+ BasicBlock *Next = needPostfix(LoopEnd, ExitUseAllowed);
+ LoopConds.push_back(BranchInst::Create(Next, LoopStart,
+ BoolUndef, LoopEnd));
+ addPhiValues(LoopEnd, LoopStart);
+ setPrevNode(Next);
+}
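Viewed as a source-level analogy (again illustrative only; the helper functions are invented), the loop handling above funnels every divergent exit through a single flow point so the loop keeps exactly one back edge, matching the "true = break, false = continue" predicate network described in the class comment:

void work(int i);
bool want_break(int i);
bool want_continue(int i);

// Before: a loop whose body can leave through several divergent exits.
void before(int n) {
  for (int i = 0; i < n; ++i) {
    if (want_break(i))
      break;
    if (want_continue(i))
      continue;
    work(i);
  }
}

// After: one predicate decides at the single "flow" point whether to take the
// back edge; continues simply skip the remaining work for this iteration.
void after(int n) {
  int i = 0;
  bool leave;
  do {
    leave = !(i < n) || want_break(i);
    if (!leave) {
      if (!want_continue(i))
        work(i);
      ++i;
    }
  } while (!leave); // the loop's only back edge, taken while "leave" is false
}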
+
+/// After this function control flow looks like it should be, but
+/// branches and PHI nodes only have undefined conditions.
+void StructurizeCFG::createFlow() {
+ BasicBlock *Exit = ParentRegion->getExit();
+ bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+
+ AffectedPhis.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Conditions.clear();
+ LoopConds.clear();
+
+ PrevNode = nullptr;
+ Visited.clear();
+
+ while (!Order.empty()) {
+ handleLoops(EntryDominatesExit, nullptr);
+ }
+
+ if (PrevNode)
+ changeExit(PrevNode, Exit, EntryDominatesExit);
+ else
+ assert(EntryDominatesExit);
+}
+
+/// Handle a rare case where the disintegrated nodes' instructions
+/// no longer dominate all their uses. Not sure if this is really necessary
+void StructurizeCFG::rebuildSSA() {
+ SSAUpdater Updater;
+ for (BasicBlock *BB : ParentRegion->blocks())
+ for (Instruction &I : *BB) {
+ bool Initialized = false;
+ // We may modify the use list as we iterate over it, so be careful to
+ // compute the next element in the use list at the top of the loop.
+ for (auto UI = I.use_begin(), E = I.use_end(); UI != E;) {
+ Use &U = *UI++;
+ Instruction *User = cast<Instruction>(U.getUser());
+ if (User->getParent() == BB) {
+ continue;
+ } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) {
+ if (UserPN->getIncomingBlock(U) == BB)
+ continue;
+ }
+
+ if (DT->dominates(&I, User))
+ continue;
+
+ if (!Initialized) {
+ Value *Undef = UndefValue::get(I.getType());
+ Updater.Initialize(I.getType(), "");
+ Updater.AddAvailableValue(&Func->getEntryBlock(), Undef);
+ Updater.AddAvailableValue(BB, &I);
+ Initialized = true;
+ }
+ Updater.RewriteUseAfterInsertions(U);
+ }
+ }
+}
+
+static bool hasOnlyUniformBranches(Region *R, unsigned UniformMDKindID,
+ const LegacyDivergenceAnalysis &DA) {
+ // Bool for if all sub-regions are uniform.
+ bool SubRegionsAreUniform = true;
+ // Count of how many direct children are conditional.
+ unsigned ConditionalDirectChildren = 0;
+
+ for (auto E : R->elements()) {
+ if (!E->isSubRegion()) {
+ auto Br = dyn_cast<BranchInst>(E->getEntry()->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!DA.isUniform(Br))
+ return false;
+
+ // One of our direct children is conditional.
+ ConditionalDirectChildren++;
+
+ LLVM_DEBUG(dbgs() << "BB: " << Br->getParent()->getName()
+ << " has uniform terminator\n");
+ } else {
+ // Explicitly refuse to treat regions as uniform if they have non-uniform
+ // subregions. We cannot rely on DivergenceAnalysis for branches in
+ // subregions because those branches may have been removed and re-created,
+ // so we look for our metadata instead.
+ //
+ // Warning: It would be nice to treat regions as uniform based only on
+ // their direct child basic blocks' terminators, regardless of whether
+ // subregions are uniform or not. However, this requires a very careful
+ // look at SIAnnotateControlFlow to make sure nothing breaks there.
+ for (auto BB : E->getNodeAs<Region>()->blocks()) {
+ auto Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br || !Br->isConditional())
+ continue;
+
+ if (!Br->getMetadata(UniformMDKindID)) {
+ // Early exit if we cannot have relaxed uniform regions.
+ if (!RelaxedUniformRegions)
+ return false;
+
+ SubRegionsAreUniform = false;
+ break;
+ }
+ }
+ }
+ }
+
+ // Our region is uniform if:
+ // 1. All conditional branches that are direct children are uniform (checked
+ // above).
+ // 2. And either:
+ // a. All sub-regions are uniform.
+  //     b. There is at most one conditional branch among the direct children.
+ return SubRegionsAreUniform || (ConditionalDirectChildren <= 1);
+}
+
void StructurizeCFG::init(Region *R) {
LLVMContext &Context = R->getEntry()->getContext();
@@ -1021,9 +1021,9 @@ void StructurizeCFG::init(Region *R) {
bool StructurizeCFG::makeUniformRegion(Region *R,
LegacyDivergenceAnalysis *DA) {
- if (R->isTopLevelRegion())
- return false;
-
+ if (R->isTopLevelRegion())
+ return false;
+
this->DA = DA;
// TODO: We could probably be smarter here with how we handle sub-regions.
// We currently rely on the fact that metadata is set by earlier invocations
@@ -1031,11 +1031,11 @@ bool StructurizeCFG::makeUniformRegion(Region *R,
// but we shouldn't rely on metadata for correctness!
unsigned UniformMDKindID =
R->getEntry()->getContext().getMDKindID("structurizecfg.uniform");
-
+
if (hasOnlyUniformBranches(R, UniformMDKindID, *DA)) {
LLVM_DEBUG(dbgs() << "Skipping region with uniform control flow: " << *R
<< '\n');
-
+
// Mark all direct child block terminators as having been treated as
// uniform. To account for a possible future in which non-uniform
// sub-regions are treated more cleverly, indirect children are not
@@ -1044,16 +1044,16 @@ bool StructurizeCFG::makeUniformRegion(Region *R,
for (RegionNode *E : R->elements()) {
if (E->isSubRegion())
continue;
-
+
if (Instruction *Term = E->getEntry()->getTerminator())
Term->setMetadata(UniformMDKindID, MD);
}
-
+
return true;
- }
+ }
return false;
}
-
+
/// Run the transformation for each region found
bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
if (R->isTopLevelRegion())
@@ -1061,35 +1061,35 @@ bool StructurizeCFG::run(Region *R, DominatorTree *DT) {
this->DT = DT;
- Func = R->getEntry()->getParent();
- ParentRegion = R;
-
- orderNodes();
- collectInfos();
- createFlow();
- insertConditions(false);
- insertConditions(true);
- setPhiValues();
- simplifyAffectedPhis();
- rebuildSSA();
-
- // Cleanup
- Order.clear();
- Visited.clear();
- DeletedPhis.clear();
- AddedPhis.clear();
- Predicates.clear();
- Conditions.clear();
- Loops.clear();
- LoopPreds.clear();
- LoopConds.clear();
-
- return true;
-}
-
-Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
+ Func = R->getEntry()->getParent();
+ ParentRegion = R;
+
+ orderNodes();
+ collectInfos();
+ createFlow();
+ insertConditions(false);
+ insertConditions(true);
+ setPhiValues();
+ simplifyAffectedPhis();
+ rebuildSSA();
+
+ // Cleanup
+ Order.clear();
+ Visited.clear();
+ DeletedPhis.clear();
+ AddedPhis.clear();
+ Predicates.clear();
+ Conditions.clear();
+ Loops.clear();
+ LoopPreds.clear();
+ LoopConds.clear();
+
+ return true;
+}
+
+Pass *llvm::createStructurizeCFGPass(bool SkipUniformRegions) {
return new StructurizeCFGLegacyPass(SkipUniformRegions);
-}
+}
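A small usage sketch for this factory function (a guess at typical driver code, not taken from this repository; it assumes LLVM 12's legacy pass manager headers and an already-built Module):

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Scalar.h"

// Structurize every region of every function in a module. The legacy pass
// manager is expected to pull in the declared prerequisites (LowerSwitch,
// dominator tree, region info) on its own.
static void structurizeModule(llvm::Module &M) {
  llvm::legacy::PassManager PM;
  PM.add(llvm::createStructurizeCFGPass(/*SkipUniformRegions=*/false));
  PM.run(M);
}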
static void addRegionIntoQueue(Region &R, std::vector<Region *> &Regions) {
Regions.push_back(&R);
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
index cfd50023b4..9e7cccc884 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -1,752 +1,752 @@
-//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file transforms calls of the current function (self recursion) followed
-// by a return instruction with a branch to the entry of the function, creating
-// a loop. This pass also implements the following extensions to the basic
-// algorithm:
-//
-// 1. Trivial instructions between the call and return do not prevent the
-// transformation from taking place, though currently the analysis cannot
-// support moving any really useful instructions (only dead ones).
-// 2. This pass transforms functions that are prevented from being tail
-// recursive by an associative and commutative expression to use an
-// accumulator variable, thus compiling the typical naive factorial or
-// 'fib' implementation into efficient code.
-// 3. TRE is performed if the function returns void, if the return
-// returns the result returned by the call, or if the function returns a
-// run-time constant on all exits from the function. It is possible, though
-// unlikely, that the return returns something else (like constant 0), and
-// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
-// the function return the exact same value.
-// 4. If it can prove that callees do not access their caller stack frame,
-// they are marked as eligible for tail call elimination (by the code
-// generator).
-//
-// There are several improvements that could be made:
-//
-// 1. If the function has any alloca instructions, these instructions will be
-// moved out of the entry block of the function, causing them to be
-// evaluated each time through the tail recursion. Safely keeping allocas
-//    in the entry block requires analysis to prove that the tail-called
-// function does not read or write the stack object.
-// 2. Tail recursion is only performed if the call immediately precedes the
-// return instruction. It's possible that there could be a jump between
-// the call and the return.
-// 3. There can be intervening operations between the call and the return that
-// prevent the TRE from occurring. For example, there could be GEP's and
-// stores to memory that will not be read or written by the call. This
-// requires some substantial analysis (such as with DSA) to prove safe to
-// move ahead of the call, but doing so could allow many more TREs to be
-// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
-// 4. The algorithm we use to detect if callees access their caller stack
-// frames is very primitive.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "tailcallelim"
-
-STATISTIC(NumEliminated, "Number of tail calls removed");
-STATISTIC(NumRetDuped, "Number of return duplicated");
-STATISTIC(NumAccumAdded, "Number of accumulators introduced");
-
-/// Scan the specified function for alloca instructions.
-/// If it contains any dynamic allocas, returns false.
-static bool canTRE(Function &F) {
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop. This pass also implements the following extensions to the basic
+// algorithm:
+//
+// 1. Trivial instructions between the call and return do not prevent the
+// transformation from taking place, though currently the analysis cannot
+// support moving any really useful instructions (only dead ones).
+// 2. This pass transforms functions that are prevented from being tail
+// recursive by an associative and commutative expression to use an
+// accumulator variable, thus compiling the typical naive factorial or
+// 'fib' implementation into efficient code.
+// 3. TRE is performed if the function returns void, if the return
+// returns the result returned by the call, or if the function returns a
+// run-time constant on all exits from the function. It is possible, though
+// unlikely, that the return returns something else (like constant 0), and
+// can still be TRE'd. It can be TRE'd if ALL OTHER return instructions in
+// the function return the exact same value.
+// 4. If it can prove that callees do not access their caller stack frame,
+// they are marked as eligible for tail call elimination (by the code
+// generator).
+//
+// There are several improvements that could be made:
+//
+// 1. If the function has any alloca instructions, these instructions will be
+// moved out of the entry block of the function, causing them to be
+// evaluated each time through the tail recursion. Safely keeping allocas
+//    in the entry block requires analysis to prove that the tail-called
+// function does not read or write the stack object.
+// 2. Tail recursion is only performed if the call immediately precedes the
+// return instruction. It's possible that there could be a jump between
+// the call and the return.
+// 3. There can be intervening operations between the call and the return that
+// prevent the TRE from occurring. For example, there could be GEP's and
+// stores to memory that will not be read or written by the call. This
+// requires some substantial analysis (such as with DSA) to prove safe to
+// move ahead of the call, but doing so could allow many more TREs to be
+// performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+// 4. The algorithm we use to detect if callees access their caller stack
+// frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
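+
+// Illustrative sketch (not part of the pass itself): a source-level analogue
+// of extensions 2 and 3 above. The pass rewrites IR, so the functions shown
+// here are hypothetical illustrations only.
+//
+//   // Before: the multiply after the recursive call blocks a plain tail call.
+//   long factorial(long N) { return N <= 1 ? 1 : N * factorial(N - 1); }
+//
+//   // After, conceptually: an accumulator seeded with the identity of '*'
+//   // replaces the recursive call with a branch back to the entry.
+//   long factorial(long N) {
+//     long Acc = 1;
+//     for (; N > 1; --N)
+//       Acc *= N;
+//     return Acc;
+//   }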
+
+#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailcallelim"
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumRetDuped, "Number of return duplicated");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+/// Scan the specified function for alloca instructions.
+/// If it contains any dynamic allocas, returns false.
+static bool canTRE(Function &F) {
// FIXME: The code generator produces really bad code when an 'escaping
// alloca' is changed from being a static alloca to being a dynamic alloca.
// Until this is resolved, disable this transformation if that would ever
// happen. This bug is PR962.
- return llvm::all_of(instructions(F), [](Instruction &I) {
- auto *AI = dyn_cast<AllocaInst>(&I);
- return !AI || AI->isStaticAlloca();
- });
-}
-
-namespace {
-struct AllocaDerivedValueTracker {
- // Start at a root value and walk its use-def chain to mark calls that use the
- // value or a derived value in AllocaUsers, and places where it may escape in
- // EscapePoints.
- void walk(Value *Root) {
- SmallVector<Use *, 32> Worklist;
- SmallPtrSet<Use *, 32> Visited;
-
- auto AddUsesToWorklist = [&](Value *V) {
- for (auto &U : V->uses()) {
- if (!Visited.insert(&U).second)
- continue;
- Worklist.push_back(&U);
- }
- };
-
- AddUsesToWorklist(Root);
-
- while (!Worklist.empty()) {
- Use *U = Worklist.pop_back_val();
- Instruction *I = cast<Instruction>(U->getUser());
-
- switch (I->getOpcode()) {
- case Instruction::Call:
- case Instruction::Invoke: {
- auto &CB = cast<CallBase>(*I);
- // If the alloca-derived argument is passed byval it is not an escape
- // point, or a use of an alloca. Calling with byval copies the contents
- // of the alloca into argument registers or stack slots, which exist
- // beyond the lifetime of the current frame.
- if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
- continue;
- bool IsNocapture =
- CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
- callUsesLocalStack(CB, IsNocapture);
- if (IsNocapture) {
- // If the alloca-derived argument is passed in as nocapture, then it
- // can't propagate to the call's return. That would be capturing.
- continue;
- }
- break;
- }
- case Instruction::Load: {
- // The result of a load is not alloca-derived (unless an alloca has
- // otherwise escaped, but this is a local analysis).
- continue;
- }
- case Instruction::Store: {
- if (U->getOperandNo() == 0)
- EscapePoints.insert(I);
- continue; // Stores have no users to analyze.
- }
- case Instruction::BitCast:
- case Instruction::GetElementPtr:
- case Instruction::PHI:
- case Instruction::Select:
- case Instruction::AddrSpaceCast:
- break;
- default:
- EscapePoints.insert(I);
- break;
- }
-
- AddUsesToWorklist(I);
- }
- }
-
- void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
- // Add it to the list of alloca users.
- AllocaUsers.insert(&CB);
-
- // If it's nocapture then it can't capture this alloca.
- if (IsNocapture)
- return;
-
- // If it can write to memory, it can leak the alloca value.
- if (!CB.onlyReadsMemory())
- EscapePoints.insert(&CB);
- }
-
- SmallPtrSet<Instruction *, 32> AllocaUsers;
- SmallPtrSet<Instruction *, 32> EscapePoints;
-};
-}
-
-static bool markTails(Function &F, bool &AllCallsAreTailCalls,
- OptimizationRemarkEmitter *ORE) {
- if (F.callsFunctionThatReturnsTwice())
- return false;
- AllCallsAreTailCalls = true;
-
- // The local stack holds all alloca instructions and all byval arguments.
- AllocaDerivedValueTracker Tracker;
- for (Argument &Arg : F.args()) {
- if (Arg.hasByValAttr())
- Tracker.walk(&Arg);
- }
- for (auto &BB : F) {
- for (auto &I : BB)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
- Tracker.walk(AI);
- }
-
- bool Modified = false;
-
- // Track whether a block is reachable after an alloca has escaped. Blocks that
- // contain the escaping instruction will be marked as being visited without an
- // escaped alloca, since that is how the block began.
- enum VisitType {
- UNVISITED,
- UNESCAPED,
- ESCAPED
- };
- DenseMap<BasicBlock *, VisitType> Visited;
-
- // We propagate the fact that an alloca has escaped from block to successor.
- // Visit the blocks that are propagating the escapedness first. To do this, we
- // maintain two worklists.
- SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
-
- // We may enter a block and visit it thinking that no alloca has escaped yet,
- // then see an escape point and go back around a loop edge and come back to
- // the same block twice. Because of this, we defer setting tail on calls when
- // we first encounter them in a block. Every entry in this list does not
- // statically use an alloca via use-def chain analysis, but may find an alloca
- // through other means if the block turns out to be reachable after an escape
- // point.
- SmallVector<CallInst *, 32> DeferredTails;
-
- BasicBlock *BB = &F.getEntryBlock();
- VisitType Escaped = UNESCAPED;
- do {
- for (auto &I : *BB) {
- if (Tracker.EscapePoints.count(&I))
- Escaped = ESCAPED;
-
- CallInst *CI = dyn_cast<CallInst>(&I);
+ return llvm::all_of(instructions(F), [](Instruction &I) {
+ auto *AI = dyn_cast<AllocaInst>(&I);
+ return !AI || AI->isStaticAlloca();
+ });
+}
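+
+// Illustrative sketch (not part of the pass itself): a source-level picture of
+// the distinction canTRE() draws. Names are hypothetical.
+namespace tre_example {
+// A fixed-size local lowers to a static alloca in the entry block; canTRE()
+// stays true for the enclosing function.
+static int useStaticAlloca(int N) {
+  int Buf[8] = {0};
+  Buf[N & 7] = N;
+  return Buf[N & 7];
+}
+// A runtime-sized buffer lowers to a dynamic alloca; canTRE() returns false
+// for the whole function (see the PR962 FIXME above).
+static int useDynamicAlloca(unsigned N) {
+  int *Buf = static_cast<int *>(__builtin_alloca((N + 1) * sizeof(int)));
+  Buf[0] = static_cast<int>(N);
+  return Buf[0];
+}
+} // namespace tre_example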
+
+namespace {
+struct AllocaDerivedValueTracker {
+ // Start at a root value and walk its use-def chain to mark calls that use the
+ // value or a derived value in AllocaUsers, and places where it may escape in
+ // EscapePoints.
+ void walk(Value *Root) {
+ SmallVector<Use *, 32> Worklist;
+ SmallPtrSet<Use *, 32> Visited;
+
+ auto AddUsesToWorklist = [&](Value *V) {
+ for (auto &U : V->uses()) {
+ if (!Visited.insert(&U).second)
+ continue;
+ Worklist.push_back(&U);
+ }
+ };
+
+ AddUsesToWorklist(Root);
+
+ while (!Worklist.empty()) {
+ Use *U = Worklist.pop_back_val();
+ Instruction *I = cast<Instruction>(U->getUser());
+
+ switch (I->getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ auto &CB = cast<CallBase>(*I);
+ // If the alloca-derived argument is passed byval it is not an escape
+ // point, or a use of an alloca. Calling with byval copies the contents
+ // of the alloca into argument registers or stack slots, which exist
+ // beyond the lifetime of the current frame.
+ if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
+ continue;
+ bool IsNocapture =
+ CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
+ callUsesLocalStack(CB, IsNocapture);
+ if (IsNocapture) {
+ // If the alloca-derived argument is passed in as nocapture, then it
+ // can't propagate to the call's return. That would be capturing.
+ continue;
+ }
+ break;
+ }
+ case Instruction::Load: {
+ // The result of a load is not alloca-derived (unless an alloca has
+ // otherwise escaped, but this is a local analysis).
+ continue;
+ }
+ case Instruction::Store: {
+ if (U->getOperandNo() == 0)
+ EscapePoints.insert(I);
+ continue; // Stores have no users to analyze.
+ }
+ case Instruction::BitCast:
+ case Instruction::GetElementPtr:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::AddrSpaceCast:
+ break;
+ default:
+ EscapePoints.insert(I);
+ break;
+ }
+
+ AddUsesToWorklist(I);
+ }
+ }
+
+ void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
+ // Add it to the list of alloca users.
+ AllocaUsers.insert(&CB);
+
+ // If it's nocapture then it can't capture this alloca.
+ if (IsNocapture)
+ return;
+
+ // If it can write to memory, it can leak the alloca value.
+ if (!CB.onlyReadsMemory())
+ EscapePoints.insert(&CB);
+ }
+
+ SmallPtrSet<Instruction *, 32> AllocaUsers;
+ SmallPtrSet<Instruction *, 32> EscapePoints;
+};
+}
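+
+// Illustrative sketch (not part of the pass itself): what the tracker above
+// records, expressed at the source level. Names are hypothetical.
+namespace tre_example {
+static int *CapturedPtr; // hypothetical global sink
+static int trackerDemo(int N) {
+  int Local[4] = {0};
+  Local[0] = N;        // writes through the alloca: a use, not an escape
+  CapturedPtr = Local; // stores the pointer itself (store operand 0): escape
+  return Local[0];
+}
+} // namespace tre_example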
+
+static bool markTails(Function &F, bool &AllCallsAreTailCalls,
+ OptimizationRemarkEmitter *ORE) {
+ if (F.callsFunctionThatReturnsTwice())
+ return false;
+ AllCallsAreTailCalls = true;
+
+ // The local stack holds all alloca instructions and all byval arguments.
+ AllocaDerivedValueTracker Tracker;
+ for (Argument &Arg : F.args()) {
+ if (Arg.hasByValAttr())
+ Tracker.walk(&Arg);
+ }
+ for (auto &BB : F) {
+ for (auto &I : BB)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Tracker.walk(AI);
+ }
+
+ bool Modified = false;
+
+ // Track whether a block is reachable after an alloca has escaped. Blocks that
+ // contain the escaping instruction will be marked as being visited without an
+ // escaped alloca, since that is how the block began.
+ enum VisitType {
+ UNVISITED,
+ UNESCAPED,
+ ESCAPED
+ };
+ DenseMap<BasicBlock *, VisitType> Visited;
+
+ // We propagate the fact that an alloca has escaped from block to successor.
+ // Visit the blocks that are propagating the escapedness first. To do this, we
+ // maintain two worklists.
+ SmallVector<BasicBlock *, 32> WorklistUnescaped, WorklistEscaped;
+
+ // We may enter a block and visit it thinking that no alloca has escaped yet,
+ // then see an escape point and go back around a loop edge and come back to
+ // the same block twice. Because of this, we defer setting tail on calls when
+ // we first encounter them in a block. Every entry in this list does not
+ // statically use an alloca via use-def chain analysis, but may find an alloca
+ // through other means if the block turns out to be reachable after an escape
+ // point.
+ SmallVector<CallInst *, 32> DeferredTails;
+
+ BasicBlock *BB = &F.getEntryBlock();
+ VisitType Escaped = UNESCAPED;
+ do {
+ for (auto &I : *BB) {
+ if (Tracker.EscapePoints.count(&I))
+ Escaped = ESCAPED;
+
+ CallInst *CI = dyn_cast<CallInst>(&I);
// A PseudoProbeInst has the IntrInaccessibleMemOnly tag hence it is
// considered accessing memory and will be marked as a tail call if we
// don't bail out here.
if (!CI || CI->isTailCall() || isa<DbgInfoIntrinsic>(&I) ||
isa<PseudoProbeInst>(&I))
- continue;
-
- bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
-
- if (!IsNoTail && CI->doesNotAccessMemory()) {
- // A call to a readnone function whose arguments are all things computed
- // outside this function can be marked tail. Even if you stored the
- // alloca address into a global, a readnone function can't load the
- // global anyhow.
- //
- // Note that this runs whether we know an alloca has escaped or not. If
- // it has, then we can't trust Tracker.AllocaUsers to be accurate.
- bool SafeToTail = true;
- for (auto &Arg : CI->arg_operands()) {
- if (isa<Constant>(Arg.getUser()))
- continue;
- if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
- if (!A->hasByValAttr())
- continue;
- SafeToTail = false;
- break;
- }
- if (SafeToTail) {
- using namespace ore;
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
- << "marked as tail call candidate (readnone)";
- });
- CI->setTailCall();
- Modified = true;
- continue;
- }
- }
-
- if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
- DeferredTails.push_back(CI);
- } else {
- AllCallsAreTailCalls = false;
- }
- }
-
+ continue;
+
+ bool IsNoTail = CI->isNoTailCall() || CI->hasOperandBundles();
+
+ if (!IsNoTail && CI->doesNotAccessMemory()) {
+ // A call to a readnone function whose arguments are all things computed
+ // outside this function can be marked tail. Even if you stored the
+ // alloca address into a global, a readnone function can't load the
+ // global anyhow.
+ //
+ // Note that this runs whether we know an alloca has escaped or not. If
+ // it has, then we can't trust Tracker.AllocaUsers to be accurate.
+ bool SafeToTail = true;
+ for (auto &Arg : CI->arg_operands()) {
+ if (isa<Constant>(Arg.getUser()))
+ continue;
+ if (Argument *A = dyn_cast<Argument>(Arg.getUser()))
+ if (!A->hasByValAttr())
+ continue;
+ SafeToTail = false;
+ break;
+ }
+ if (SafeToTail) {
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-readnone", CI)
+ << "marked as tail call candidate (readnone)";
+ });
+ CI->setTailCall();
+ Modified = true;
+ continue;
+ }
+ }
+
+ if (!IsNoTail && Escaped == UNESCAPED && !Tracker.AllocaUsers.count(CI)) {
+ DeferredTails.push_back(CI);
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
for (auto *SuccBB : successors(BB)) {
- auto &State = Visited[SuccBB];
- if (State < Escaped) {
- State = Escaped;
- if (State == ESCAPED)
- WorklistEscaped.push_back(SuccBB);
- else
- WorklistUnescaped.push_back(SuccBB);
- }
- }
-
- if (!WorklistEscaped.empty()) {
- BB = WorklistEscaped.pop_back_val();
- Escaped = ESCAPED;
- } else {
- BB = nullptr;
- while (!WorklistUnescaped.empty()) {
- auto *NextBB = WorklistUnescaped.pop_back_val();
- if (Visited[NextBB] == UNESCAPED) {
- BB = NextBB;
- Escaped = UNESCAPED;
- break;
- }
- }
- }
- } while (BB);
-
- for (CallInst *CI : DeferredTails) {
- if (Visited[CI->getParent()] != ESCAPED) {
- // If the escape point was part way through the block, calls after the
- // escape point wouldn't have been put into DeferredTails.
- LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
- CI->setTailCall();
- Modified = true;
- } else {
- AllCallsAreTailCalls = false;
- }
- }
-
- return Modified;
-}
-
-/// Return true if it is safe to move the specified
-/// instruction from after the call to before the call, assuming that all
-/// instructions between the call and this instruction are movable.
-///
-static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
- // FIXME: We can move load/store/call/free instructions above the call if the
- // call does not mod/ref the memory location being processed.
- if (I->mayHaveSideEffects()) // This also handles volatile loads.
- return false;
-
- if (LoadInst *L = dyn_cast<LoadInst>(I)) {
- // Loads may always be moved above calls without side effects.
- if (CI->mayHaveSideEffects()) {
- // Non-volatile loads may be moved above a call with side effects if it
- // does not write to memory and the load provably won't trap.
- // Writes to memory only matter if they may alias the pointer
- // being loaded from.
- const DataLayout &DL = L->getModule()->getDataLayout();
- if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
- !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
- L->getAlign(), DL, L))
- return false;
- }
- }
-
- // Otherwise, if this is a side-effect free instruction, check to make sure
- // that it does not use the return value of the call. If it doesn't use the
- // return value of the call, it must only use things that are defined before
- // the call, or movable instructions between the call and the instruction
- // itself.
- return !is_contained(I->operands(), CI);
-}
-
-static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
- if (!I->isAssociative() || !I->isCommutative())
- return false;
-
- assert(I->getNumOperands() == 2 &&
- "Associative/commutative operations should have 2 args!");
-
- // Exactly one operand should be the result of the call instruction.
- if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
- (I->getOperand(0) != CI && I->getOperand(1) != CI))
- return false;
-
- // The only user of this instruction we allow is a single return instruction.
- if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
- return false;
-
- return true;
-}
-
-static Instruction *firstNonDbg(BasicBlock::iterator I) {
- while (isa<DbgInfoIntrinsic>(I))
- ++I;
- return &*I;
-}
-
-namespace {
-class TailRecursionEliminator {
- Function &F;
- const TargetTransformInfo *TTI;
- AliasAnalysis *AA;
- OptimizationRemarkEmitter *ORE;
- DomTreeUpdater &DTU;
-
- // The below are shared state we want to have available when eliminating any
-  // calls in the function. These values should be populated by
- // createTailRecurseLoopHeader the first time we find a call we can eliminate.
- BasicBlock *HeaderBB = nullptr;
- SmallVector<PHINode *, 8> ArgumentPHIs;
- bool RemovableCallsMustBeMarkedTail = false;
-
- // PHI node to store our return value.
- PHINode *RetPN = nullptr;
-
- // i1 PHI node to track if we have a valid return value stored in RetPN.
- PHINode *RetKnownPN = nullptr;
-
-  // Vector of select instructions we inserted. These selects use RetKnownPN
- // to either propagate RetPN or select a new return value.
- SmallVector<SelectInst *, 8> RetSelects;
-
- // The below are shared state needed when performing accumulator recursion.
-  // These values should be populated by insertAccumulator the first time we
- // find an elimination that requires an accumulator.
-
- // PHI node to store our current accumulated value.
- PHINode *AccPN = nullptr;
-
- // The instruction doing the accumulating.
- Instruction *AccumulatorRecursionInstr = nullptr;
-
- TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU)
- : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
-
+ auto &State = Visited[SuccBB];
+ if (State < Escaped) {
+ State = Escaped;
+ if (State == ESCAPED)
+ WorklistEscaped.push_back(SuccBB);
+ else
+ WorklistUnescaped.push_back(SuccBB);
+ }
+ }
+
+ if (!WorklistEscaped.empty()) {
+ BB = WorklistEscaped.pop_back_val();
+ Escaped = ESCAPED;
+ } else {
+ BB = nullptr;
+ while (!WorklistUnescaped.empty()) {
+ auto *NextBB = WorklistUnescaped.pop_back_val();
+ if (Visited[NextBB] == UNESCAPED) {
+ BB = NextBB;
+ Escaped = UNESCAPED;
+ break;
+ }
+ }
+ }
+ } while (BB);
+
+ for (CallInst *CI : DeferredTails) {
+ if (Visited[CI->getParent()] != ESCAPED) {
+ // If the escape point was part way through the block, calls after the
+ // escape point wouldn't have been put into DeferredTails.
+ LLVM_DEBUG(dbgs() << "Marked as tail call candidate: " << *CI << "\n");
+ CI->setTailCall();
+ Modified = true;
+ } else {
+ AllCallsAreTailCalls = false;
+ }
+ }
+
+ return Modified;
+}
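+
+// Illustrative sketch (not part of the pass itself): the split markTails()
+// makes, at the source level. Callee names are hypothetical.
+namespace tre_example {
+static int consumeValue(int X) { return X + 1; }        // never sees caller stack
+static int consumePointer(const int *P) { return *P; }  // may touch caller frame
+static int markTailsDemo(int N) {
+  int Local = N;
+  int A = consumeValue(N);        // no alloca reaches it: may be marked 'tail'
+  int B = consumePointer(&Local); // passes an alloca address: left unmarked
+  return A + B;
+}
+} // namespace tre_example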
+
+/// Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
+ // FIXME: We can move load/store/call/free instructions above the call if the
+ // call does not mod/ref the memory location being processed.
+ if (I->mayHaveSideEffects()) // This also handles volatile loads.
+ return false;
+
+ if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ // Loads may always be moved above calls without side effects.
+ if (CI->mayHaveSideEffects()) {
+ // Non-volatile loads may be moved above a call with side effects if it
+ // does not write to memory and the load provably won't trap.
+ // Writes to memory only matter if they may alias the pointer
+ // being loaded from.
+ const DataLayout &DL = L->getModule()->getDataLayout();
+ if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
+ !isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
+ L->getAlign(), DL, L))
+ return false;
+ }
+ }
+
+ // Otherwise, if this is a side-effect free instruction, check to make sure
+ // that it does not use the return value of the call. If it doesn't use the
+ // return value of the call, it must only use things that are defined before
+ // the call, or movable instructions between the call and the instruction
+ // itself.
+ return !is_contained(I->operands(), CI);
+}
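+
+// Illustrative sketch (not part of the pass itself): the question
+// canMoveAboveCall() answers. Names are hypothetical.
+namespace tre_example {
+static int GlobalBias = 3;
+static int sumWithBias(int N) {
+  if (N == 0)
+    return 0;
+  // The load of GlobalBias sits between the recursive call and the return; it
+  // may only be hoisted above the call if alias analysis shows the callee
+  // cannot modify GlobalBias and the load is safe to execute unconditionally.
+  return sumWithBias(N - 1) + GlobalBias;
+}
+} // namespace tre_example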
+
+static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+ if (!I->isAssociative() || !I->isCommutative())
+ return false;
+
+ assert(I->getNumOperands() == 2 &&
+ "Associative/commutative operations should have 2 args!");
+
+ // Exactly one operand should be the result of the call instruction.
+ if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+ (I->getOperand(0) != CI && I->getOperand(1) != CI))
+ return false;
+
+ // The only user of this instruction we allow is a single return instruction.
+ if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
+ return false;
+
+ return true;
+}
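+
+// Illustrative sketch (not part of the pass itself): which trailing operations
+// the check above accepts. Names are hypothetical.
+namespace tre_example {
+// 'add' is associative and commutative and its only user is the return, so it
+// qualifies as an accumulator recursion instruction.
+static int sumDown(int N) { return N == 0 ? 0 : N + sumDown(N - 1); }
+// 'sub' is neither associative nor commutative, so the check rejects it and
+// this recursion is not turned into a loop by the accumulator extension.
+static int diffDown(int N) { return N == 0 ? 0 : N - diffDown(N - 1); }
+} // namespace tre_example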
+
+static Instruction *firstNonDbg(BasicBlock::iterator I) {
+ while (isa<DbgInfoIntrinsic>(I))
+ ++I;
+ return &*I;
+}
+
+namespace {
+class TailRecursionEliminator {
+ Function &F;
+ const TargetTransformInfo *TTI;
+ AliasAnalysis *AA;
+ OptimizationRemarkEmitter *ORE;
+ DomTreeUpdater &DTU;
+
+ // The below are shared state we want to have available when eliminating any
+  // calls in the function. These values should be populated by
+ // createTailRecurseLoopHeader the first time we find a call we can eliminate.
+ BasicBlock *HeaderBB = nullptr;
+ SmallVector<PHINode *, 8> ArgumentPHIs;
+ bool RemovableCallsMustBeMarkedTail = false;
+
+ // PHI node to store our return value.
+ PHINode *RetPN = nullptr;
+
+ // i1 PHI node to track if we have a valid return value stored in RetPN.
+ PHINode *RetKnownPN = nullptr;
+
+  // Vector of select instructions we inserted. These selects use RetKnownPN
+ // to either propagate RetPN or select a new return value.
+ SmallVector<SelectInst *, 8> RetSelects;
+
+ // The below are shared state needed when performing accumulator recursion.
+  // These values should be populated by insertAccumulator the first time we
+ // find an elimination that requires an accumulator.
+
+ // PHI node to store our current accumulated value.
+ PHINode *AccPN = nullptr;
+
+ // The instruction doing the accumulating.
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU)
+ : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
+
CallInst *findTRECandidate(BasicBlock *BB,
- bool CannotTailCallElimCallsMarkedTail);
-
- void createTailRecurseLoopHeader(CallInst *CI);
-
- void insertAccumulator(Instruction *AccRecInstr);
-
- bool eliminateCall(CallInst *CI);
-
+ bool CannotTailCallElimCallsMarkedTail);
+
+ void createTailRecurseLoopHeader(CallInst *CI);
+
+ void insertAccumulator(Instruction *AccRecInstr);
+
+ bool eliminateCall(CallInst *CI);
+
void cleanupAndFinalize();
-
+
bool processBlock(BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail);
-
-public:
- static bool eliminate(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU);
-};
-} // namespace
-
-CallInst *TailRecursionEliminator::findTRECandidate(
+
+public:
+ static bool eliminate(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU);
+};
+} // namespace
+
+CallInst *TailRecursionEliminator::findTRECandidate(
BasicBlock *BB, bool CannotTailCallElimCallsMarkedTail) {
Instruction *TI = BB->getTerminator();
-
- if (&BB->front() == TI) // Make sure there is something before the terminator.
- return nullptr;
-
- // Scan backwards from the return, checking to see if there is a tail call in
- // this block. If so, set CI to it.
- CallInst *CI = nullptr;
- BasicBlock::iterator BBI(TI);
- while (true) {
- CI = dyn_cast<CallInst>(BBI);
- if (CI && CI->getCalledFunction() == &F)
- break;
-
- if (BBI == BB->begin())
- return nullptr; // Didn't find a potential tail call.
- --BBI;
- }
-
- // If this call is marked as a tail call, and if there are dynamic allocas in
- // the function, we cannot perform this optimization.
- if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
- return nullptr;
-
- // As a special case, detect code like this:
- // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
- // and disable this xform in this case, because the code generator will
- // lower the call to fabs into inline code.
- if (BB == &F.getEntryBlock() &&
- firstNonDbg(BB->front().getIterator()) == CI &&
- firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
- !TTI->isLoweredToCall(CI->getCalledFunction())) {
- // A single-block function with just a call and a return. Check that
- // the arguments match.
- auto I = CI->arg_begin(), E = CI->arg_end();
- Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
- for (; I != E && FI != FE; ++I, ++FI)
- if (*I != &*FI) break;
- if (I == E && FI == FE)
- return nullptr;
- }
-
- return CI;
-}
-
-void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
- HeaderBB = &F.getEntryBlock();
- BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
- NewEntry->takeName(HeaderBB);
- HeaderBB->setName("tailrecurse");
- BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
- BI->setDebugLoc(CI->getDebugLoc());
-
- // If this function has self recursive calls in the tail position where some
- // are marked tail and some are not, only transform one flavor or another.
- // We have to choose whether we move allocas in the entry block to the new
- // entry block or not, so we can't make a good choice for both. We make this
- // decision here based on whether the first call we found to remove is
- // marked tail.
- // NOTE: We could do slightly better here in the case that the function has
- // no entry block allocas.
- RemovableCallsMustBeMarkedTail = CI->isTailCall();
-
- // If this tail call is marked 'tail' and if there are any allocas in the
- // entry block, move them up to the new entry block.
- if (RemovableCallsMustBeMarkedTail)
- // Move all fixed sized allocas from HeaderBB to NewEntry.
- for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
- NEBI = NewEntry->begin();
- OEBI != E;)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
- if (isa<ConstantInt>(AI->getArraySize()))
- AI->moveBefore(&*NEBI);
-
- // Now that we have created a new block, which jumps to the entry
- // block, insert a PHI node for each argument of the function.
- // For now, we initialize each PHI to only have the real arguments
- // which are passed in.
- Instruction *InsertPos = &HeaderBB->front();
- for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
- PHINode *PN =
- PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
- I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
- PN->addIncoming(&*I, NewEntry);
- ArgumentPHIs.push_back(PN);
- }
-
-  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
- // nodes to track our return value. We initialize RetPN with undef and
- // RetKnownPN with false since we can't know our return value at function
- // entry.
- Type *RetType = F.getReturnType();
- if (!RetType->isVoidTy()) {
- Type *BoolType = Type::getInt1Ty(F.getContext());
- RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
- RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
-
- RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
- RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
- }
-
- // The entry block was changed from HeaderBB to NewEntry.
- // The forward DominatorTree needs to be recalculated when the EntryBB is
- // changed. In this corner-case we recalculate the entire tree.
- DTU.recalculate(*NewEntry->getParent());
-}
-
-void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
- assert(!AccPN && "Trying to insert multiple accumulators");
-
- AccumulatorRecursionInstr = AccRecInstr;
-
- // Start by inserting a new PHI node for the accumulator.
- pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
- AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
- "accumulator.tr", &HeaderBB->front());
-
- // Loop over all of the predecessors of the tail recursion block. For the
- // real entry into the function we seed the PHI with the identity constant for
- // the accumulation operation. For any other existing branches to this block
- // (due to other tail recursions eliminated) the accumulator is not modified.
- // Because we haven't added the branch in the current block to HeaderBB yet,
- // it will not show up as a predecessor.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (P == &F.getEntryBlock()) {
- Constant *Identity = ConstantExpr::getBinOpIdentity(
- AccRecInstr->getOpcode(), AccRecInstr->getType());
- AccPN->addIncoming(Identity, P);
- } else {
- AccPN->addIncoming(AccPN, P);
- }
- }
-
- ++NumAccumAdded;
-}
-
-bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
- ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
-
- // Ok, we found a potential tail call. We can currently only transform the
- // tail call if all of the instructions between the call and the return are
- // movable to above the call itself, leaving the call next to the return.
- // Check that this is the case now.
- Instruction *AccRecInstr = nullptr;
- BasicBlock::iterator BBI(CI);
- for (++BBI; &*BBI != Ret; ++BBI) {
- if (canMoveAboveCall(&*BBI, CI, AA))
- continue;
-
- // If we can't move the instruction above the call, it might be because it
- // is an associative and commutative operation that could be transformed
- // using accumulator recursion elimination. Check to see if this is the
- // case, and if so, remember which instruction accumulates for later.
- if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
- return false; // We cannot eliminate the tail recursion!
-
- // Yes, this is accumulator recursion. Remember which instruction
- // accumulates.
- AccRecInstr = &*BBI;
- }
-
- BasicBlock *BB = Ret->getParent();
-
- using namespace ore;
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
- << "transforming tail recursion into loop";
- });
-
- // OK! We can transform this tail call. If this is the first one found,
- // create the new entry block, allowing us to branch back to the old entry.
- if (!HeaderBB)
- createTailRecurseLoopHeader(CI);
-
- if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
- return false;
-
- // Ok, now that we know we have a pseudo-entry block WITH all of the
- // required PHI nodes, add entries into the PHI node for the actual
- // parameters passed into the tail-recursive call.
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
- ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
-
- if (AccRecInstr) {
- insertAccumulator(AccRecInstr);
-
- // Rewrite the accumulator recursion instruction so that it does not use
- // the result of the call anymore, instead, use the PHI node we just
- // inserted.
- AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
- }
-
- // Update our return value tracking
- if (RetPN) {
- if (Ret->getReturnValue() == CI || AccRecInstr) {
- // Defer selecting a return value
- RetPN->addIncoming(RetPN, BB);
- RetKnownPN->addIncoming(RetKnownPN, BB);
- } else {
- // We found a return value we want to use, insert a select instruction to
- // select it if we don't already know what our return value will be and
- // store the result in our return value PHI node.
- SelectInst *SI = SelectInst::Create(
- RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
- RetSelects.push_back(SI);
-
- RetPN->addIncoming(SI, BB);
- RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
- }
-
- if (AccPN)
- AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
- }
-
- // Now that all of the PHI nodes are in place, remove the call and
- // ret instructions, replacing them with an unconditional branch.
- BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
- NewBI->setDebugLoc(CI->getDebugLoc());
-
- BB->getInstList().erase(Ret); // Remove return.
- BB->getInstList().erase(CI); // Remove call.
- DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
- ++NumEliminated;
- return true;
-}
-
-void TailRecursionEliminator::cleanupAndFinalize() {
- // If we eliminated any tail recursions, it's possible that we inserted some
- // silly PHI nodes which just merge an initial value (the incoming operand)
- // with themselves. Check to see if we did and clean up our mess if so. This
- // occurs when a function passes an argument straight through to its tail
- // call.
- for (PHINode *PN : ArgumentPHIs) {
- // If the PHI Node is a dynamic constant, replace it with the value it is.
- if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
- PN->replaceAllUsesWith(PNV);
- PN->eraseFromParent();
- }
- }
-
- if (RetPN) {
- if (RetSelects.empty()) {
- // If we didn't insert any select instructions, then we know we didn't
- // store a return value and we can remove the PHI nodes we inserted.
- RetPN->dropAllReferences();
- RetPN->eraseFromParent();
-
- RetKnownPN->dropAllReferences();
- RetKnownPN->eraseFromParent();
-
- if (AccPN) {
- // We need to insert a copy of our accumulator instruction before any
- // return in the function, and return its result instead.
- Instruction *AccRecInstr = AccumulatorRecursionInstr;
- for (BasicBlock &BB : F) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI)
- continue;
-
- Instruction *AccRecInstrNew = AccRecInstr->clone();
- AccRecInstrNew->setName("accumulator.ret.tr");
- AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
- RI->getOperand(0));
- AccRecInstrNew->insertBefore(RI);
- RI->setOperand(0, AccRecInstrNew);
- }
- }
- } else {
- // We need to insert a select instruction before any return left in the
- // function to select our stored return value if we have one.
- for (BasicBlock &BB : F) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI)
- continue;
-
- SelectInst *SI = SelectInst::Create(
- RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
- RetSelects.push_back(SI);
- RI->setOperand(0, SI);
- }
-
- if (AccPN) {
- // We need to insert a copy of our accumulator instruction before any
- // of the selects we inserted, and select its result instead.
- Instruction *AccRecInstr = AccumulatorRecursionInstr;
- for (SelectInst *SI : RetSelects) {
- Instruction *AccRecInstrNew = AccRecInstr->clone();
- AccRecInstrNew->setName("accumulator.ret.tr");
- AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
- SI->getFalseValue());
- AccRecInstrNew->insertBefore(SI);
- SI->setFalseValue(AccRecInstrNew);
- }
- }
- }
- }
-}
-
+
+ if (&BB->front() == TI) // Make sure there is something before the terminator.
+ return nullptr;
+
+ // Scan backwards from the return, checking to see if there is a tail call in
+ // this block. If so, set CI to it.
+ CallInst *CI = nullptr;
+ BasicBlock::iterator BBI(TI);
+ while (true) {
+ CI = dyn_cast<CallInst>(BBI);
+ if (CI && CI->getCalledFunction() == &F)
+ break;
+
+ if (BBI == BB->begin())
+ return nullptr; // Didn't find a potential tail call.
+ --BBI;
+ }
+
+ // If this call is marked as a tail call, and if there are dynamic allocas in
+ // the function, we cannot perform this optimization.
+ if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+ return nullptr;
+
+ // As a special case, detect code like this:
+ // double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
+ // and disable this xform in this case, because the code generator will
+ // lower the call to fabs into inline code.
+ if (BB == &F.getEntryBlock() &&
+ firstNonDbg(BB->front().getIterator()) == CI &&
+ firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
+ !TTI->isLoweredToCall(CI->getCalledFunction())) {
+ // A single-block function with just a call and a return. Check that
+ // the arguments match.
+ auto I = CI->arg_begin(), E = CI->arg_end();
+ Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
+ for (; I != E && FI != FE; ++I, ++FI)
+ if (*I != &*FI) break;
+ if (I == E && FI == FE)
+ return nullptr;
+ }
+
+ return CI;
+}
+
+void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
+ HeaderBB = &F.getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
+ NewEntry->takeName(HeaderBB);
+ HeaderBB->setName("tailrecurse");
+ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
+ BI->setDebugLoc(CI->getDebugLoc());
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another.
+ // We have to choose whether we move allocas in the entry block to the new
+ // entry block or not, so we can't make a good choice for both. We make this
+ // decision here based on whether the first call we found to remove is
+ // marked tail.
+ // NOTE: We could do slightly better here in the case that the function has
+ // no entry block allocas.
+ RemovableCallsMustBeMarkedTail = CI->isTailCall();
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ if (RemovableCallsMustBeMarkedTail)
+ // Move all fixed sized allocas from HeaderBB to NewEntry.
+ for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+ NEBI = NewEntry->begin();
+ OEBI != E;)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &HeaderBB->front();
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ PHINode *PN =
+ PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+ PN->addIncoming(&*I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+
+  // If the function doesn't return void, create the RetPN and RetKnownPN PHI
+ // nodes to track our return value. We initialize RetPN with undef and
+ // RetKnownPN with false since we can't know our return value at function
+ // entry.
+ Type *RetType = F.getReturnType();
+ if (!RetType->isVoidTy()) {
+ Type *BoolType = Type::getInt1Ty(F.getContext());
+ RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
+ RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
+
+ RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
+ RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
+ }
+
+ // The entry block was changed from HeaderBB to NewEntry.
+ // The forward DominatorTree needs to be recalculated when the EntryBB is
+ // changed. In this corner-case we recalculate the entire tree.
+ DTU.recalculate(*NewEntry->getParent());
+}
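+
+// Illustrative sketch (not part of the pass itself): a source-level picture of
+// the argument PHIs created above. Each formal parameter becomes a loop-carried
+// value that is either the incoming argument (from the new entry block) or the
+// operand of an eliminated recursive call. Names are hypothetical.
+namespace tre_example {
+static int sumHelper(int N, int Acc) {
+  // Roughly: tailrecurse: N.tr = phi(N, N - 1), Acc.tr = phi(Acc, Acc + N.tr)
+  for (;;) {
+    if (N == 0)
+      return Acc;
+    Acc = Acc + N; // value the Acc PHI receives along the loop edge
+    N = N - 1;     // value the N PHI receives along the loop edge
+  }
+}
+} // namespace tre_example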
+
+void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
+ assert(!AccPN && "Trying to insert multiple accumulators");
+
+ AccumulatorRecursionInstr = AccRecInstr;
+
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
+ AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
+ "accumulator.tr", &HeaderBB->front());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the identity constant for
+ // the accumulation operation. For any other existing branches to this block
+ // (due to other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to HeaderBB yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F.getEntryBlock()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(
+ AccRecInstr->getOpcode(), AccRecInstr->getType());
+ AccPN->addIncoming(Identity, P);
+ } else {
+ AccPN->addIncoming(AccPN, P);
+ }
+ }
+
+ ++NumAccumAdded;
+}
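+
+// Illustrative sketch (not part of the pass itself): the identity constant that
+// seeds the accumulator PHI depends on the accumulating opcode, mirroring
+// ConstantExpr::getBinOpIdentity above. Names are hypothetical.
+namespace tre_example {
+static int sumTo(int N) { return N == 0 ? 0 : N + sumTo(N - 1); }   // seed: 0
+static int prodTo(int N) { return N <= 1 ? 1 : N * prodTo(N - 1); } // seed: 1
+} // namespace tre_example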
+
+bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
+ ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
+
+ // Ok, we found a potential tail call. We can currently only transform the
+ // tail call if all of the instructions between the call and the return are
+ // movable to above the call itself, leaving the call next to the return.
+ // Check that this is the case now.
+ Instruction *AccRecInstr = nullptr;
+ BasicBlock::iterator BBI(CI);
+ for (++BBI; &*BBI != Ret; ++BBI) {
+ if (canMoveAboveCall(&*BBI, CI, AA))
+ continue;
+
+ // If we can't move the instruction above the call, it might be because it
+ // is an associative and commutative operation that could be transformed
+ // using accumulator recursion elimination. Check to see if this is the
+ // case, and if so, remember which instruction accumulates for later.
+ if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
+ return false; // We cannot eliminate the tail recursion!
+
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccRecInstr = &*BBI;
+ }
+
+ BasicBlock *BB = Ret->getParent();
+
+ using namespace ore;
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "tailcall-recursion", CI)
+ << "transforming tail recursion into loop";
+ });
+
+ // OK! We can transform this tail call. If this is the first one found,
+ // create the new entry block, allowing us to branch back to the old entry.
+ if (!HeaderBB)
+ createTailRecurseLoopHeader(CI);
+
+ if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
+ return false;
+
+ // Ok, now that we know we have a pseudo-entry block WITH all of the
+ // required PHI nodes, add entries into the PHI node for the actual
+ // parameters passed into the tail-recursive call.
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
+
+ if (AccRecInstr) {
+ insertAccumulator(AccRecInstr);
+
+ // Rewrite the accumulator recursion instruction so that it does not use
+ // the result of the call anymore, instead, use the PHI node we just
+ // inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ }
+
+ // Update our return value tracking
+ if (RetPN) {
+ if (Ret->getReturnValue() == CI || AccRecInstr) {
+ // Defer selecting a return value
+ RetPN->addIncoming(RetPN, BB);
+ RetKnownPN->addIncoming(RetKnownPN, BB);
+ } else {
+ // We found a return value we want to use, insert a select instruction to
+ // select it if we don't already know what our return value will be and
+ // store the result in our return value PHI node.
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
+ RetSelects.push_back(SI);
+
+ RetPN->addIncoming(SI, BB);
+ RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
+ }
+
+ if (AccPN)
+ AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
+ }
+
+ // Now that all of the PHI nodes are in place, remove the call and
+ // ret instructions, replacing them with an unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
+ NewBI->setDebugLoc(CI->getDebugLoc());
+
+ BB->getInstList().erase(Ret); // Remove return.
+ BB->getInstList().erase(CI); // Remove call.
+ DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
+ ++NumEliminated;
+ return true;
+}
+
+void TailRecursionEliminator::cleanupAndFinalize() {
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI Node is a dynamic constant, replace it with the value it is.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ if (RetPN) {
+ if (RetSelects.empty()) {
+ // If we didn't insert any select instructions, then we know we didn't
+ // store a return value and we can remove the PHI nodes we inserted.
+ RetPN->dropAllReferences();
+ RetPN->eraseFromParent();
+
+ RetKnownPN->dropAllReferences();
+ RetKnownPN->eraseFromParent();
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // return in the function, and return its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ RI->getOperand(0));
+ AccRecInstrNew->insertBefore(RI);
+ RI->setOperand(0, AccRecInstrNew);
+ }
+ }
+ } else {
+ // We need to insert a select instruction before any return left in the
+ // function to select our stored return value if we have one.
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
+ RetSelects.push_back(SI);
+ RI->setOperand(0, SI);
+ }
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // of the selects we inserted, and select its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (SelectInst *SI : RetSelects) {
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ SI->getFalseValue());
+ AccRecInstrNew->insertBefore(SI);
+ SI->setFalseValue(AccRecInstrNew);
+ }
+ }
+ }
+ }
+}
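+
+// Illustrative sketch (not part of the pass itself): the PHI cleanup above
+// fires for arguments that are passed straight through the recursive call,
+// such as 'Base' here; its PHI merges the incoming argument with itself and
+// SimplifyInstruction folds it away. Names are hypothetical.
+namespace tre_example {
+static int addNTimes(int N, int Base) {
+  return N == 0 ? Base : 1 + addNTimes(N - 1, Base);
+}
+} // namespace tre_example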
+
bool TailRecursionEliminator::processBlock(
BasicBlock &BB, bool CannotTailCallElimCallsMarkedTail) {
Instruction *TI = BB.getTerminator();
@@ -791,110 +791,110 @@ bool TailRecursionEliminator::processBlock(
return false;
}
-bool TailRecursionEliminator::eliminate(Function &F,
- const TargetTransformInfo *TTI,
- AliasAnalysis *AA,
- OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU) {
- if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
- return false;
-
- bool MadeChange = false;
- bool AllCallsAreTailCalls = false;
- MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
- if (!AllCallsAreTailCalls)
- return MadeChange;
-
- // If this function is a varargs function, we won't be able to PHI the args
- // right, so don't even try to convert it...
- if (F.getFunctionType()->isVarArg())
- return MadeChange;
-
- // If false, we cannot perform TRE on tail calls marked with the 'tail'
- // attribute, because doing so would cause the stack size to increase (real
- // TRE would deallocate variable sized allocas, TRE doesn't).
- bool CanTRETailMarkedCall = canTRE(F);
-
+bool TailRecursionEliminator::eliminate(Function &F,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU) {
+ if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
+ return false;
+
+ bool MadeChange = false;
+ bool AllCallsAreTailCalls = false;
+ MadeChange |= markTails(F, AllCallsAreTailCalls, ORE);
+ if (!AllCallsAreTailCalls)
+ return MadeChange;
+
+ // If this function is a varargs function, we won't be able to PHI the args
+ // right, so don't even try to convert it...
+ if (F.getFunctionType()->isVarArg())
+ return MadeChange;
+
+ // If false, we cannot perform TRE on tail calls marked with the 'tail'
+ // attribute, because doing so would cause the stack size to increase (real
+ // TRE would deallocate variable sized allocas, TRE doesn't).
+ bool CanTRETailMarkedCall = canTRE(F);
+
// Change any tail recursive calls to loops.
- TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
-
+ TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
+
for (BasicBlock &BB : F)
MadeChange |= TRE.processBlock(BB, !CanTRETailMarkedCall);
-
- TRE.cleanupAndFinalize();
-
- return MadeChange;
-}
-
-namespace {
-struct TailCallElim : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- TailCallElim() : FunctionPass(ID) {
- initializeTailCallElimPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<PostDominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
-    // There is no noticeable performance difference here between Lazy and Eager
- // UpdateStrategy based on some test results. It is feasible to switch the
- // UpdateStrategy to Lazy if we find it profitable later.
- DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
-
- return TailRecursionEliminator::eliminate(
- F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
- &getAnalysis<AAResultsWrapperPass>().getAAResults(),
- &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
- }
-};
-}
-
-char TailCallElim::ID = 0;
-INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
- false, false)
-
-// Public interface to the TailCallElimination pass
-FunctionPass *llvm::createTailCallEliminationPass() {
- return new TailCallElim();
-}
-
-PreservedAnalyses TailCallElimPass::run(Function &F,
- FunctionAnalysisManager &AM) {
-
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
-  // There is no noticeable performance difference here between Lazy and Eager
- // UpdateStrategy based on some test results. It is feasible to switch the
- // UpdateStrategy to Lazy if we find it profitable later.
- DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
- bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
-
- if (!Changed)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
- return PA;
-}
+
+ TRE.cleanupAndFinalize();
+
+ return MadeChange;
+}
+
+namespace {
+struct TailCallElim : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ TailCallElim() : FunctionPass(ID) {
+ initializeTailCallElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+    // There is no noticeable performance difference here between Lazy and Eager
+ // UpdateStrategy based on some test results. It is feasible to switch the
+ // UpdateStrategy to Lazy if we find it profitable later.
+ DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+
+ return TailRecursionEliminator::eliminate(
+ F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
+ &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+ &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
+ }
+};
+}
+
+char TailCallElim::ID = 0;
+INITIALIZE_PASS_BEGIN(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(TailCallElim, "tailcallelim", "Tail Call Elimination",
+ false, false)
+
+// Public interface to the TailCallElimination pass
+FunctionPass *llvm::createTailCallEliminationPass() {
+ return new TailCallElim();
+}
+
+PreservedAnalyses TailCallElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *PDT = AM.getCachedResult<PostDominatorTreeAnalysis>(F);
+  // There is no noticeable performance difference here between Lazy and Eager
+ // UpdateStrategy based on some test results. It is feasible to switch the
+ // UpdateStrategy to Lazy if we find it profitable later.
+ DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
+ bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ return PA;
+}
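+
+// Usage note (assuming a stock LLVM 12 build; not part of the upstream
+// sources): the pass is registered under the name "tailcallelim" and can be
+// exercised in isolation with, for example:
+//   opt -passes=tailcallelim input.ll -S   (new pass manager)
+//   opt -tailcallelim input.ll -S          (legacy pass manager)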
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index 464eb20d8f..80a7d3a43a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -1,150 +1,150 @@
-//===- LoopTransformWarning.cpp - ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Emit warnings if forced code transformations have not been performed.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "transform-warning"
-
-/// Emit warnings for forced (i.e. user-defined) loop transformations which have
-/// still not been performed.
-static void warnAboutLeftoverTransformations(Loop *L,
- OptimizationRemarkEmitter *ORE) {
- if (hasUnrollTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedUnrolling",
- L->getStartLoc(), L->getHeader())
- << "loop not unrolled: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled or "
- "specified as part of an unsupported transformation ordering");
- }
-
- if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedUnrollAndJamming",
- L->getStartLoc(), L->getHeader())
- << "loop not unroll-and-jammed: the optimizer was unable to perform "
- "the requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- }
-
- if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
+//===- LoopTransformWarning.cpp - ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Emit warnings if forced code transformations have not been performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "transform-warning"
+
+/// Emit warnings for forced (i.e. user-defined) loop transformations which have
+/// still not been performed.
+static void warnAboutLeftoverTransformations(Loop *L,
+ OptimizationRemarkEmitter *ORE) {
+ if (hasUnrollTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover unroll transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedUnrolling",
+ L->getStartLoc(), L->getHeader())
+ << "loop not unrolled: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled or "
+ "specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasUnrollAndJamTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover unroll-and-jam transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedUnrollAndJamming",
+ L->getStartLoc(), L->getHeader())
+ << "loop not unroll-and-jammed: the optimizer was unable to perform "
+ "the requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasVectorizeTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover vectorization transformation\n");
Optional<ElementCount> VectorizeWidth =
getOptionalElementCountLoopAttribute(L);
- Optional<int> InterleaveCount =
- getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
-
+ Optional<int> InterleaveCount =
+ getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
if (!VectorizeWidth || VectorizeWidth->isVector())
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedVectorization",
- L->getStartLoc(), L->getHeader())
- << "loop not vectorized: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- else if (InterleaveCount.getValueOr(0) != 1)
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedInterleaving",
- L->getStartLoc(), L->getHeader())
- << "loop not interleaved: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled "
- "or specified as part of an unsupported transformation ordering");
- }
-
- if (hasDistributeTransformation(L) == TM_ForcedByUser) {
- LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n");
- ORE->emit(
- DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
- "FailedRequestedDistribution",
- L->getStartLoc(), L->getHeader())
- << "loop not distributed: the optimizer was unable to perform the "
- "requested transformation; the transformation might be disabled or "
- "specified as part of an unsupported transformation ordering");
- }
-}
-
-static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE) {
- for (auto *L : LI->getLoopsInPreorder())
- warnAboutLeftoverTransformations(L, ORE);
-}
-
-// New pass manager boilerplate
-PreservedAnalyses
-WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) {
- // Do not warn about not applied transformations if optimizations are
- // disabled.
- if (F.hasOptNone())
- return PreservedAnalyses::all();
-
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
-
- warnAboutLeftoverTransformations(&F, &LI, &ORE);
-
- return PreservedAnalyses::all();
-}
-
-// Legacy pass manager boilerplate
-namespace {
-class WarnMissedTransformationsLegacy : public FunctionPass {
-public:
- static char ID;
-
- explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) {
- initializeWarnMissedTransformationsLegacyPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-
- warnAboutLeftoverTransformations(&F, &LI, &ORE);
- return false;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
-
- AU.setPreservesAll();
- }
-};
-} // end anonymous namespace
-
-char WarnMissedTransformationsLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning",
- "Warn about non-applied transformations", false, false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning",
- "Warn about non-applied transformations", false, false)
-
-Pass *llvm::createWarnMissedTransformationsPass() {
- return new WarnMissedTransformationsLegacy();
-}
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedVectorization",
+ L->getStartLoc(), L->getHeader())
+ << "loop not vectorized: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ else if (InterleaveCount.getValueOr(0) != 1)
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedInterleaving",
+ L->getStartLoc(), L->getHeader())
+ << "loop not interleaved: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled "
+ "or specified as part of an unsupported transformation ordering");
+ }
+
+ if (hasDistributeTransformation(L) == TM_ForcedByUser) {
+ LLVM_DEBUG(dbgs() << "Leftover distribute transformation\n");
+ ORE->emit(
+ DiagnosticInfoOptimizationFailure(DEBUG_TYPE,
+ "FailedRequestedDistribution",
+ L->getStartLoc(), L->getHeader())
+ << "loop not distributed: the optimizer was unable to perform the "
+ "requested transformation; the transformation might be disabled or "
+ "specified as part of an unsupported transformation ordering");
+ }
+}
+
+static void warnAboutLeftoverTransformations(Function *F, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE) {
+ for (auto *L : LI->getLoopsInPreorder())
+ warnAboutLeftoverTransformations(L, ORE);
+}
+
+// New pass manager boilerplate
+PreservedAnalyses
+WarnMissedTransformationsPass::run(Function &F, FunctionAnalysisManager &AM) {
+ // Do not warn about not applied transformations if optimizations are
+ // disabled.
+ if (F.hasOptNone())
+ return PreservedAnalyses::all();
+
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+
+ warnAboutLeftoverTransformations(&F, &LI, &ORE);
+
+ return PreservedAnalyses::all();
+}
+
+// Legacy pass manager boilerplate
+namespace {
+class WarnMissedTransformationsLegacy : public FunctionPass {
+public:
+ static char ID;
+
+ explicit WarnMissedTransformationsLegacy() : FunctionPass(ID) {
+ initializeWarnMissedTransformationsLegacyPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+
+ warnAboutLeftoverTransformations(&F, &LI, &ORE);
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+
+ AU.setPreservesAll();
+ }
+};
+} // end anonymous namespace
+
+char WarnMissedTransformationsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(WarnMissedTransformationsLegacy, "transform-warning",
+ "Warn about non-applied transformations", false, false)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_END(WarnMissedTransformationsLegacy, "transform-warning",
+ "Warn about non-applied transformations", false, false)
+
+Pass *llvm::createWarnMissedTransformationsPass() {
+ return new WarnMissedTransformationsLegacy();
+}
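The pass above only inspects loop metadata that earlier passes left behind, so the warning is easiest to see from source that forces a transformation the optimizer cannot honour. The function below is my own illustration, not part of the diff: the pragma attaches llvm.loop.vectorize metadata, and if the loop-carried dependence defeats the vectorizer, the leftover metadata makes transform-warning emit the "loop not vectorized" diagnostic (clang normally reports it under -Wpass-failed).

// Sketch only: forced vectorization that is expected to fail.
void scale(float *a, const float *b, int n) {
#pragma clang loop vectorize(enable) vectorize_width(8)
  for (int i = 1; i < n; ++i)
    a[i] = a[i - 1] * b[i];  // a[i] depends on a[i - 1], distance 1
}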
diff --git a/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make b/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
index beb88625e7..75501ae81a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Scalar/ya.make
@@ -1,17 +1,17 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
@@ -20,95 +20,95 @@ PEERDIR(
contrib/libs/llvm12/lib/Transforms/AggressiveInstCombine
contrib/libs/llvm12/lib/Transforms/InstCombine
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Scalar
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- ADCE.cpp
- AlignmentFromAssumptions.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ ADCE.cpp
+ AlignmentFromAssumptions.cpp
AnnotationRemarks.cpp
- BDCE.cpp
- CallSiteSplitting.cpp
- ConstantHoisting.cpp
+ BDCE.cpp
+ CallSiteSplitting.cpp
+ ConstantHoisting.cpp
ConstraintElimination.cpp
- CorrelatedValuePropagation.cpp
- DCE.cpp
- DeadStoreElimination.cpp
- DivRemPairs.cpp
- EarlyCSE.cpp
- FlattenCFGPass.cpp
- Float2Int.cpp
- GVN.cpp
- GVNHoist.cpp
- GVNSink.cpp
- GuardWidening.cpp
- IVUsersPrinter.cpp
- IndVarSimplify.cpp
- InductiveRangeCheckElimination.cpp
- InferAddressSpaces.cpp
- InstSimplifyPass.cpp
- JumpThreading.cpp
- LICM.cpp
- LoopAccessAnalysisPrinter.cpp
- LoopDataPrefetch.cpp
- LoopDeletion.cpp
- LoopDistribute.cpp
+ CorrelatedValuePropagation.cpp
+ DCE.cpp
+ DeadStoreElimination.cpp
+ DivRemPairs.cpp
+ EarlyCSE.cpp
+ FlattenCFGPass.cpp
+ Float2Int.cpp
+ GVN.cpp
+ GVNHoist.cpp
+ GVNSink.cpp
+ GuardWidening.cpp
+ IVUsersPrinter.cpp
+ IndVarSimplify.cpp
+ InductiveRangeCheckElimination.cpp
+ InferAddressSpaces.cpp
+ InstSimplifyPass.cpp
+ JumpThreading.cpp
+ LICM.cpp
+ LoopAccessAnalysisPrinter.cpp
+ LoopDataPrefetch.cpp
+ LoopDeletion.cpp
+ LoopDistribute.cpp
LoopFlatten.cpp
- LoopFuse.cpp
- LoopIdiomRecognize.cpp
- LoopInstSimplify.cpp
- LoopInterchange.cpp
- LoopLoadElimination.cpp
- LoopPassManager.cpp
- LoopPredication.cpp
- LoopRerollPass.cpp
- LoopRotation.cpp
- LoopSimplifyCFG.cpp
- LoopSink.cpp
- LoopStrengthReduce.cpp
- LoopUnrollAndJamPass.cpp
- LoopUnrollPass.cpp
- LoopUnswitch.cpp
- LoopVersioningLICM.cpp
- LowerAtomic.cpp
- LowerConstantIntrinsics.cpp
- LowerExpectIntrinsic.cpp
- LowerGuardIntrinsic.cpp
- LowerMatrixIntrinsics.cpp
- LowerWidenableCondition.cpp
- MakeGuardsExplicit.cpp
- MemCpyOptimizer.cpp
- MergeICmps.cpp
- MergedLoadStoreMotion.cpp
- NaryReassociate.cpp
- NewGVN.cpp
- PartiallyInlineLibCalls.cpp
- PlaceSafepoints.cpp
- Reassociate.cpp
- Reg2Mem.cpp
- RewriteStatepointsForGC.cpp
- SCCP.cpp
- SROA.cpp
- Scalar.cpp
+ LoopFuse.cpp
+ LoopIdiomRecognize.cpp
+ LoopInstSimplify.cpp
+ LoopInterchange.cpp
+ LoopLoadElimination.cpp
+ LoopPassManager.cpp
+ LoopPredication.cpp
+ LoopRerollPass.cpp
+ LoopRotation.cpp
+ LoopSimplifyCFG.cpp
+ LoopSink.cpp
+ LoopStrengthReduce.cpp
+ LoopUnrollAndJamPass.cpp
+ LoopUnrollPass.cpp
+ LoopUnswitch.cpp
+ LoopVersioningLICM.cpp
+ LowerAtomic.cpp
+ LowerConstantIntrinsics.cpp
+ LowerExpectIntrinsic.cpp
+ LowerGuardIntrinsic.cpp
+ LowerMatrixIntrinsics.cpp
+ LowerWidenableCondition.cpp
+ MakeGuardsExplicit.cpp
+ MemCpyOptimizer.cpp
+ MergeICmps.cpp
+ MergedLoadStoreMotion.cpp
+ NaryReassociate.cpp
+ NewGVN.cpp
+ PartiallyInlineLibCalls.cpp
+ PlaceSafepoints.cpp
+ Reassociate.cpp
+ Reg2Mem.cpp
+ RewriteStatepointsForGC.cpp
+ SCCP.cpp
+ SROA.cpp
+ Scalar.cpp
ScalarizeMaskedMemIntrin.cpp
- Scalarizer.cpp
- SeparateConstOffsetFromGEP.cpp
- SimpleLoopUnswitch.cpp
- SimplifyCFGPass.cpp
- Sink.cpp
- SpeculateAroundPHIs.cpp
- SpeculativeExecution.cpp
- StraightLineStrengthReduce.cpp
- StructurizeCFG.cpp
- TailRecursionElimination.cpp
- WarnMissedTransforms.cpp
-)
-
-END()
+ Scalarizer.cpp
+ SeparateConstOffsetFromGEP.cpp
+ SimpleLoopUnswitch.cpp
+ SimplifyCFGPass.cpp
+ Sink.cpp
+ SpeculateAroundPHIs.cpp
+ SpeculativeExecution.cpp
+ StraightLineStrengthReduce.cpp
+ StructurizeCFG.cpp
+ TailRecursionElimination.cpp
+ WarnMissedTransforms.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
index 3692462855..ccdcf7cbce 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -1,243 +1,243 @@
-//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Utility function to lower a printf call into a series of device
-// library calls on the AMDGPU target.
-//
-// WARNING: This file knows about certain library functions. It recognizes them
-// by name, and hardwires knowledge of their semantics.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
-#include "llvm/ADT/SparseBitVector.h"
-#include "llvm/Analysis/ValueTracking.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "amdgpu-emit-printf"
-
-static bool isCString(const Value *Arg) {
- auto Ty = Arg->getType();
- auto PtrTy = dyn_cast<PointerType>(Ty);
- if (!PtrTy)
- return false;
-
- auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
- if (!IntTy)
- return false;
-
- return IntTy->getBitWidth() == 8;
-}
-
-static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
- auto Int64Ty = Builder.getInt64Ty();
- auto Ty = Arg->getType();
-
- if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
- switch (IntTy->getBitWidth()) {
- case 32:
- return Builder.CreateZExt(Arg, Int64Ty);
- case 64:
- return Arg;
- }
- }
-
- if (Ty->getTypeID() == Type::DoubleTyID) {
- return Builder.CreateBitCast(Arg, Int64Ty);
- }
-
- if (isa<PointerType>(Ty)) {
- return Builder.CreatePtrToInt(Arg, Int64Ty);
- }
-
- llvm_unreachable("unexpected type");
-}
-
-static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
- auto Int64Ty = Builder.getInt64Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
- return Builder.CreateCall(Fn, Version);
-}
-
-static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
- Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
- Value *Arg4, Value *Arg5, Value *Arg6,
- bool IsLast) {
- auto Int64Ty = Builder.getInt64Ty();
- auto Int32Ty = Builder.getInt32Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
- Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
- Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
- auto IsLastValue = Builder.getInt32(IsLast);
- auto NumArgsValue = Builder.getInt32(NumArgs);
- return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
- Arg4, Arg5, Arg6, IsLastValue});
-}
-
-static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Arg0 = fitArgInto64Bits(Builder, Arg);
- auto Zero = Builder.getInt64(0);
- return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
- Zero, IsLast);
-}
-
-// The device library does not provide strlen, so we build our own loop
-// here. While we are at it, we also include the terminating null in the length.
-static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
- auto *Prev = Builder.GetInsertBlock();
- Module *M = Prev->getModule();
-
- auto CharZero = Builder.getInt8(0);
- auto One = Builder.getInt64(1);
- auto Zero = Builder.getInt64(0);
- auto Int64Ty = Builder.getInt64Ty();
-
- // The length is either zero for a null pointer, or the computed value for an
- // actual string. We need a join block for a phi that represents the final
- // value.
- //
- // Strictly speaking, the zero does not matter since
- // __ockl_printf_append_string_n ignores the length if the pointer is null.
- BasicBlock *Join = nullptr;
- if (Prev->getTerminator()) {
- Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
- "strlen.join");
- Prev->getTerminator()->eraseFromParent();
- } else {
- Join = BasicBlock::Create(M->getContext(), "strlen.join",
- Prev->getParent());
- }
- BasicBlock *While =
- BasicBlock::Create(M->getContext(), "strlen.while",
- Prev->getParent(), Join);
- BasicBlock *WhileDone = BasicBlock::Create(
- M->getContext(), "strlen.while.done",
- Prev->getParent(), Join);
-
- // Emit an early return for when the pointer is null.
- Builder.SetInsertPoint(Prev);
- auto CmpNull =
- Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
- BranchInst::Create(Join, While, CmpNull, Prev);
-
- // Entry to the while loop.
- Builder.SetInsertPoint(While);
-
- auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
- PtrPhi->addIncoming(Str, Prev);
- auto PtrNext = Builder.CreateGEP(PtrPhi, One);
- PtrPhi->addIncoming(PtrNext, While);
-
- // Condition for the while loop.
- auto Data = Builder.CreateLoad(PtrPhi);
- auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
- Builder.CreateCondBr(Cmp, WhileDone, While);
-
- // Add one to the computed length.
- Builder.SetInsertPoint(WhileDone, WhileDone->begin());
- auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
- auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
- auto Len = Builder.CreateSub(End, Begin);
- Len = Builder.CreateAdd(Len, One);
-
- // Final join.
- BranchInst::Create(Join, WhileDone);
- Builder.SetInsertPoint(Join, Join->begin());
- auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
- LenPhi->addIncoming(Len, WhileDone);
- LenPhi->addIncoming(Zero, Prev);
-
- return LenPhi;
-}
-
-static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
- Value *Length, bool isLast) {
- auto Int64Ty = Builder.getInt64Ty();
- auto CharPtrTy = Builder.getInt8PtrTy();
- auto Int32Ty = Builder.getInt32Ty();
- auto M = Builder.GetInsertBlock()->getModule();
- auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
- Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
- auto IsLastInt32 = Builder.getInt32(isLast);
- return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
-}
-
-static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool IsLast) {
- auto Length = getStrlenWithNull(Builder, Arg);
- return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
-}
-
-static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
- bool SpecIsCString, bool IsLast) {
- if (SpecIsCString && isCString(Arg)) {
- return appendString(Builder, Desc, Arg, IsLast);
- }
- // If the format specifies a string but the argument is not, the frontend will
- // have printed a warning. We just rely on undefined behaviour and send the
- // argument anyway.
- return appendArg(Builder, Desc, Arg, IsLast);
-}
-
-// Scan the format string to locate all specifiers, and mark the ones that
-// specify a string, i.e, the "%s" specifier with optional '*' characters.
-static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
- StringRef Str;
- if (!getConstantStringInfo(Fmt, Str) || Str.empty())
- return;
-
- static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
- size_t SpecPos = 0;
- // Skip the first argument, the format string.
- unsigned ArgIdx = 1;
-
- while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
- if (Str[SpecPos + 1] == '%') {
- SpecPos += 2;
- continue;
- }
- auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
- if (SpecEnd == StringRef::npos)
- return;
- auto Spec = Str.slice(SpecPos, SpecEnd + 1);
- ArgIdx += Spec.count('*');
- if (Str[SpecEnd] == 's') {
- BV.set(ArgIdx);
- }
- SpecPos = SpecEnd + 1;
- ++ArgIdx;
- }
-}
-
-Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
- ArrayRef<Value *> Args) {
- auto NumOps = Args.size();
- assert(NumOps >= 1);
-
- auto Fmt = Args[0];
- SparseBitVector<8> SpecIsCString;
- locateCStrings(SpecIsCString, Fmt);
-
- auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
- Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
-
- // FIXME: This invokes hostcall once for each argument. We can pack up to
- // seven scalar printf arguments in a single hostcall. See the signature of
- // callAppendArgs().
- for (unsigned int i = 1; i != NumOps; ++i) {
- bool IsLast = i == NumOps - 1;
- bool IsCString = SpecIsCString.test(i);
- Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
- }
-
- return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
-}
+//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function to lower a printf call into a series of device
+// library calls on the AMDGPU target.
+//
+// WARNING: This file knows about certain library functions. It recognizes them
+// by name, and hardwires knowledge of their semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-emit-printf"
+
+static bool isCString(const Value *Arg) {
+ auto Ty = Arg->getType();
+ auto PtrTy = dyn_cast<PointerType>(Ty);
+ if (!PtrTy)
+ return false;
+
+ auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
+ if (!IntTy)
+ return false;
+
+ return IntTy->getBitWidth() == 8;
+}
+
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Ty = Arg->getType();
+
+ if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getBitWidth()) {
+ case 32:
+ return Builder.CreateZExt(Arg, Int64Ty);
+ case 64:
+ return Arg;
+ }
+ }
+
+ if (Ty->getTypeID() == Type::DoubleTyID) {
+ return Builder.CreateBitCast(Arg, Int64Ty);
+ }
+
+ if (isa<PointerType>(Ty)) {
+ return Builder.CreatePtrToInt(Arg, Int64Ty);
+ }
+
+ llvm_unreachable("unexpected type");
+}
+
+static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
+ return Builder.CreateCall(Fn, Version);
+}
+
+static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
+ Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
+ Value *Arg4, Value *Arg5, Value *Arg6,
+ bool IsLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
+ Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
+ Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
+ auto IsLastValue = Builder.getInt32(IsLast);
+ auto NumArgsValue = Builder.getInt32(NumArgs);
+ return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
+ Arg4, Arg5, Arg6, IsLastValue});
+}
+
+static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ auto Zero = Builder.getInt64(0);
+ return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
+ Zero, IsLast);
+}
+
+// The device library does not provide strlen, so we build our own loop
+// here. While we are at it, we also include the terminating null in the length.
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
+ auto *Prev = Builder.GetInsertBlock();
+ Module *M = Prev->getModule();
+
+ auto CharZero = Builder.getInt8(0);
+ auto One = Builder.getInt64(1);
+ auto Zero = Builder.getInt64(0);
+ auto Int64Ty = Builder.getInt64Ty();
+
+ // The length is either zero for a null pointer, or the computed value for an
+ // actual string. We need a join block for a phi that represents the final
+ // value.
+ //
+ // Strictly speaking, the zero does not matter since
+ // __ockl_printf_append_string_n ignores the length if the pointer is null.
+ BasicBlock *Join = nullptr;
+ if (Prev->getTerminator()) {
+ Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
+ "strlen.join");
+ Prev->getTerminator()->eraseFromParent();
+ } else {
+ Join = BasicBlock::Create(M->getContext(), "strlen.join",
+ Prev->getParent());
+ }
+ BasicBlock *While =
+ BasicBlock::Create(M->getContext(), "strlen.while",
+ Prev->getParent(), Join);
+ BasicBlock *WhileDone = BasicBlock::Create(
+ M->getContext(), "strlen.while.done",
+ Prev->getParent(), Join);
+
+ // Emit an early return for when the pointer is null.
+ Builder.SetInsertPoint(Prev);
+ auto CmpNull =
+ Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
+ BranchInst::Create(Join, While, CmpNull, Prev);
+
+ // Entry to the while loop.
+ Builder.SetInsertPoint(While);
+
+ auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
+ PtrPhi->addIncoming(Str, Prev);
+ auto PtrNext = Builder.CreateGEP(PtrPhi, One);
+ PtrPhi->addIncoming(PtrNext, While);
+
+ // Condition for the while loop.
+ auto Data = Builder.CreateLoad(PtrPhi);
+ auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
+ Builder.CreateCondBr(Cmp, WhileDone, While);
+
+ // Add one to the computed length.
+ Builder.SetInsertPoint(WhileDone, WhileDone->begin());
+ auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
+ auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
+ auto Len = Builder.CreateSub(End, Begin);
+ Len = Builder.CreateAdd(Len, One);
+
+ // Final join.
+ BranchInst::Create(Join, WhileDone);
+ Builder.SetInsertPoint(Join, Join->begin());
+ auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
+ LenPhi->addIncoming(Len, WhileDone);
+ LenPhi->addIncoming(Zero, Prev);
+
+ return LenPhi;
+}
+
+static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
+ Value *Length, bool isLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto CharPtrTy = Builder.getInt8PtrTy();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
+ Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
+ auto IsLastInt32 = Builder.getInt32(isLast);
+ return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
+}
+
+static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Length = getStrlenWithNull(Builder, Arg);
+ return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
+}
+
+static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool SpecIsCString, bool IsLast) {
+ if (SpecIsCString && isCString(Arg)) {
+ return appendString(Builder, Desc, Arg, IsLast);
+ }
+ // If the format specifies a string but the argument is not, the frontend will
+ // have printed a warning. We just rely on undefined behaviour and send the
+ // argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast);
+}
+
+// Scan the format string to locate all specifiers, and mark the ones that
+// specify a string, i.e., the "%s" specifier with optional '*' characters.
+static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
+ StringRef Str;
+ if (!getConstantStringInfo(Fmt, Str) || Str.empty())
+ return;
+
+ static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
+ size_t SpecPos = 0;
+ // Skip the first argument, the format string.
+ unsigned ArgIdx = 1;
+
+ while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
+ if (Str[SpecPos + 1] == '%') {
+ SpecPos += 2;
+ continue;
+ }
+ auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
+ if (SpecEnd == StringRef::npos)
+ return;
+ auto Spec = Str.slice(SpecPos, SpecEnd + 1);
+ ArgIdx += Spec.count('*');
+ if (Str[SpecEnd] == 's') {
+ BV.set(ArgIdx);
+ }
+ SpecPos = SpecEnd + 1;
+ ++ArgIdx;
+ }
+}
+
+Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
+ ArrayRef<Value *> Args) {
+ auto NumOps = Args.size();
+ assert(NumOps >= 1);
+
+ auto Fmt = Args[0];
+ SparseBitVector<8> SpecIsCString;
+ locateCStrings(SpecIsCString, Fmt);
+
+ auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
+ Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
+
+ // FIXME: This invokes hostcall once for each argument. We can pack up to
+ // seven scalar printf arguments in a single hostcall. See the signature of
+ // callAppendArgs().
+ for (unsigned int i = 1; i != NumOps; ++i) {
+ bool IsLast = i == NumOps - 1;
+ bool IsCString = SpecIsCString.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ }
+
+ return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
+}
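Taken together, the helpers above turn a printf call into a straight-line chain of device-library calls. The pseudo-C below is my own illustration of the shape of the lowered code for a two-argument format string; the __ockl_* names and the length-includes-null convention come from this file, but the expansion itself is a sketch rather than literal output.

// Conceptual expansion (sketch) of: printf("x=%d s=%s\n", x, s);
//
//   uint64_t desc = __ockl_printf_begin(0);
//   desc = __ockl_printf_append_string_n(desc, fmt, strlen(fmt) + 1, /*IsLast=*/0);
//   desc = __ockl_printf_append_args(desc, /*NumArgs=*/1, (uint64_t)x,
//                                    0, 0, 0, 0, 0, 0, /*IsLast=*/0);
//   desc = __ockl_printf_append_string_n(desc, s, strlen(s) + 1, /*IsLast=*/1);
//   return (int)desc;
//
// locateCStrings() marks the second argument because of the "%s" specifier,
// so it is routed through appendString() and getStrlenWithNull() rather than
// appendArg(), and the final descriptor is truncated back to the i32 result.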
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
index 0834298936..0191229732 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ASanStackFrameLayout.cpp
@@ -1,152 +1,152 @@
-//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
-//
-//===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/ScopedPrinter.h"
-#include "llvm/Support/raw_ostream.h"
-#include <algorithm>
-
-namespace llvm {
-
-// We sort the stack variables by alignment (largest first) to minimize
-// unnecessary large gaps due to alignment.
-// It is tempting to also sort variables by size so that larger variables
-// have larger redzones at both ends. But reordering will make report analysis
-// harder, especially when temporary unnamed variables are present.
-// So, until we can provide more information (type, line number, etc)
-// for the stack variables we avoid reordering them too much.
-static inline bool CompareVars(const ASanStackVariableDescription &a,
- const ASanStackVariableDescription &b) {
- return a.Alignment > b.Alignment;
-}
-
-// We also force minimal alignment for all vars to kMinAlignment so that vars
-// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
-static const size_t kMinAlignment = 16;
-
-// We want to add a full redzone after every variable.
-// The larger the variable Size the larger is the redzone.
-// The resulting frame size is a multiple of Alignment.
-static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
- size_t Alignment) {
- size_t Res = 0;
- if (Size <= 4) Res = 16;
- else if (Size <= 16) Res = 32;
- else if (Size <= 128) Res = Size + 32;
- else if (Size <= 512) Res = Size + 64;
- else if (Size <= 4096) Res = Size + 128;
- else Res = Size + 256;
- return alignTo(std::max(Res, 2 * Granularity), Alignment);
-}
-
-ASanStackFrameLayout
-ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
- size_t Granularity, size_t MinHeaderSize) {
- assert(Granularity >= 8 && Granularity <= 64 &&
- (Granularity & (Granularity - 1)) == 0);
- assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
- MinHeaderSize >= Granularity);
- const size_t NumVars = Vars.size();
- assert(NumVars > 0);
- for (size_t i = 0; i < NumVars; i++)
- Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
-
- llvm::stable_sort(Vars, CompareVars);
-
- ASanStackFrameLayout Layout;
- Layout.Granularity = Granularity;
- Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
- size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
- Vars[0].Alignment);
- assert((Offset % Granularity) == 0);
- for (size_t i = 0; i < NumVars; i++) {
- bool IsLast = i == NumVars - 1;
- size_t Alignment = std::max(Granularity, Vars[i].Alignment);
- (void)Alignment; // Used only in asserts.
- size_t Size = Vars[i].Size;
- assert((Alignment & (Alignment - 1)) == 0);
- assert(Layout.FrameAlignment >= Alignment);
- assert((Offset % Alignment) == 0);
- assert(Size > 0);
- size_t NextAlignment = IsLast ? Granularity
- : std::max(Granularity, Vars[i + 1].Alignment);
- size_t SizeWithRedzone = VarAndRedzoneSize(Size, Granularity,
- NextAlignment);
- Vars[i].Offset = Offset;
- Offset += SizeWithRedzone;
- }
- if (Offset % MinHeaderSize) {
- Offset += MinHeaderSize - (Offset % MinHeaderSize);
- }
- Layout.FrameSize = Offset;
- assert((Layout.FrameSize % MinHeaderSize) == 0);
- return Layout;
-}
-
-SmallString<64> ComputeASanStackFrameDescription(
- const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
- SmallString<2048> StackDescriptionStorage;
- raw_svector_ostream StackDescription(StackDescriptionStorage);
- StackDescription << Vars.size();
-
- for (const auto &Var : Vars) {
- std::string Name = Var.Name;
- if (Var.Line) {
- Name += ":";
- Name += to_string(Var.Line);
- }
- StackDescription << " " << Var.Offset << " " << Var.Size << " "
- << Name.size() << " " << Name;
- }
- return StackDescription.str();
-}
-
-SmallVector<uint8_t, 64>
-GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
- const ASanStackFrameLayout &Layout) {
- assert(Vars.size() > 0);
- SmallVector<uint8_t, 64> SB;
- SB.clear();
- const size_t Granularity = Layout.Granularity;
- SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
- for (const auto &Var : Vars) {
- SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
-
- SB.resize(SB.size() + Var.Size / Granularity, 0);
- if (Var.Size % Granularity)
- SB.push_back(Var.Size % Granularity);
- }
- SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
- return SB;
-}
-
-SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
- const SmallVectorImpl<ASanStackVariableDescription> &Vars,
- const ASanStackFrameLayout &Layout) {
- SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
- const size_t Granularity = Layout.Granularity;
-
- for (const auto &Var : Vars) {
- assert(Var.LifetimeSize <= Var.Size);
- const size_t LifetimeShadowSize =
- (Var.LifetimeSize + Granularity - 1) / Granularity;
- const size_t Offset = Var.Offset / Granularity;
- std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
- kAsanStackUseAfterScopeMagic);
- }
-
- return SB;
-}
-
-} // llvm namespace
+//===-- ASanStackFrameLayout.cpp - helper for AddressSanitizer ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Definition of ComputeASanStackFrameLayout (see ASanStackFrameLayout.h).
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+
+namespace llvm {
+
+// We sort the stack variables by alignment (largest first) to minimize
+// unnecessary large gaps due to alignment.
+// It is tempting to also sort variables by size so that larger variables
+// have larger redzones at both ends. But reordering will make report analysis
+// harder, especially when temporary unnamed variables are present.
+// So, until we can provide more information (type, line number, etc)
+// for the stack variables we avoid reordering them too much.
+static inline bool CompareVars(const ASanStackVariableDescription &a,
+ const ASanStackVariableDescription &b) {
+ return a.Alignment > b.Alignment;
+}
+
+// We also force minimal alignment for all vars to kMinAlignment so that vars
+// with e.g. alignment 1 and alignment 16 do not get reordered by CompareVars.
+static const size_t kMinAlignment = 16;
+
+// We want to add a full redzone after every variable.
+// The larger the variable Size the larger is the redzone.
+// The resulting frame size is a multiple of Alignment.
+static size_t VarAndRedzoneSize(size_t Size, size_t Granularity,
+ size_t Alignment) {
+ size_t Res = 0;
+ if (Size <= 4) Res = 16;
+ else if (Size <= 16) Res = 32;
+ else if (Size <= 128) Res = Size + 32;
+ else if (Size <= 512) Res = Size + 64;
+ else if (Size <= 4096) Res = Size + 128;
+ else Res = Size + 256;
+ return alignTo(std::max(Res, 2 * Granularity), Alignment);
+}
+
+ASanStackFrameLayout
+ComputeASanStackFrameLayout(SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ size_t Granularity, size_t MinHeaderSize) {
+ assert(Granularity >= 8 && Granularity <= 64 &&
+ (Granularity & (Granularity - 1)) == 0);
+ assert(MinHeaderSize >= 16 && (MinHeaderSize & (MinHeaderSize - 1)) == 0 &&
+ MinHeaderSize >= Granularity);
+ const size_t NumVars = Vars.size();
+ assert(NumVars > 0);
+ for (size_t i = 0; i < NumVars; i++)
+ Vars[i].Alignment = std::max(Vars[i].Alignment, kMinAlignment);
+
+ llvm::stable_sort(Vars, CompareVars);
+
+ ASanStackFrameLayout Layout;
+ Layout.Granularity = Granularity;
+ Layout.FrameAlignment = std::max(Granularity, Vars[0].Alignment);
+ size_t Offset = std::max(std::max(MinHeaderSize, Granularity),
+ Vars[0].Alignment);
+ assert((Offset % Granularity) == 0);
+ for (size_t i = 0; i < NumVars; i++) {
+ bool IsLast = i == NumVars - 1;
+ size_t Alignment = std::max(Granularity, Vars[i].Alignment);
+ (void)Alignment; // Used only in asserts.
+ size_t Size = Vars[i].Size;
+ assert((Alignment & (Alignment - 1)) == 0);
+ assert(Layout.FrameAlignment >= Alignment);
+ assert((Offset % Alignment) == 0);
+ assert(Size > 0);
+ size_t NextAlignment = IsLast ? Granularity
+ : std::max(Granularity, Vars[i + 1].Alignment);
+ size_t SizeWithRedzone = VarAndRedzoneSize(Size, Granularity,
+ NextAlignment);
+ Vars[i].Offset = Offset;
+ Offset += SizeWithRedzone;
+ }
+ if (Offset % MinHeaderSize) {
+ Offset += MinHeaderSize - (Offset % MinHeaderSize);
+ }
+ Layout.FrameSize = Offset;
+ assert((Layout.FrameSize % MinHeaderSize) == 0);
+ return Layout;
+}
+
+SmallString<64> ComputeASanStackFrameDescription(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars) {
+ SmallString<2048> StackDescriptionStorage;
+ raw_svector_ostream StackDescription(StackDescriptionStorage);
+ StackDescription << Vars.size();
+
+ for (const auto &Var : Vars) {
+ std::string Name = Var.Name;
+ if (Var.Line) {
+ Name += ":";
+ Name += to_string(Var.Line);
+ }
+ StackDescription << " " << Var.Offset << " " << Var.Size << " "
+ << Name.size() << " " << Name;
+ }
+ return StackDescription.str();
+}
+
+SmallVector<uint8_t, 64>
+GetShadowBytes(const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ assert(Vars.size() > 0);
+ SmallVector<uint8_t, 64> SB;
+ SB.clear();
+ const size_t Granularity = Layout.Granularity;
+ SB.resize(Vars[0].Offset / Granularity, kAsanStackLeftRedzoneMagic);
+ for (const auto &Var : Vars) {
+ SB.resize(Var.Offset / Granularity, kAsanStackMidRedzoneMagic);
+
+ SB.resize(SB.size() + Var.Size / Granularity, 0);
+ if (Var.Size % Granularity)
+ SB.push_back(Var.Size % Granularity);
+ }
+ SB.resize(Layout.FrameSize / Granularity, kAsanStackRightRedzoneMagic);
+ return SB;
+}
+
+SmallVector<uint8_t, 64> GetShadowBytesAfterScope(
+ const SmallVectorImpl<ASanStackVariableDescription> &Vars,
+ const ASanStackFrameLayout &Layout) {
+ SmallVector<uint8_t, 64> SB = GetShadowBytes(Vars, Layout);
+ const size_t Granularity = Layout.Granularity;
+
+ for (const auto &Var : Vars) {
+ assert(Var.LifetimeSize <= Var.Size);
+ const size_t LifetimeShadowSize =
+ (Var.LifetimeSize + Granularity - 1) / Granularity;
+ const size_t Offset = Var.Offset / Granularity;
+ std::fill(SB.begin() + Offset, SB.begin() + Offset + LifetimeShadowSize,
+ kAsanStackUseAfterScopeMagic);
+ }
+
+ return SB;
+}
+
+} // llvm namespace
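To make the redzone sizing above concrete, here is a hand-worked run of ComputeASanStackFrameLayout. The parameters Granularity = 8 and MinHeaderSize = 16 are chosen only to satisfy the asserts in the function and are not taken from any particular AddressSanitizer configuration.

// Worked sketch for two variables a (Size = 4) and b (Size = 20):
//   kMinAlignment raises both alignments to 16, so FrameAlignment = 16.
//   Offset starts at max(max(MinHeaderSize, Granularity), Vars[0].Alignment) = 16.
//   a: Offset = 16, VarAndRedzoneSize(4, 8, /*NextAlignment=*/16) = 16, Offset becomes 32.
//   b: Offset = 32, VarAndRedzoneSize(20, 8, /*NextAlignment=*/8) = 56, Offset becomes 88.
//   88 is not a multiple of MinHeaderSize, so FrameSize is rounded up to 96.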
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
index e2ad63143f..0908b361a4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -1,277 +1,277 @@
-//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file adds DWARF discriminators to the IR. Path discriminators are
-// used to decide what CFG path was taken inside sub-graphs whose instructions
-// share the same line and column number information.
-//
-// The main user of this is the sample profiler. Instruction samples are
-// mapped to line number information. Since a single line may be spread
-// out over several basic blocks, discriminators add more precise location
-// for the samples.
-//
-// For example,
-//
-// 1 #define ASSERT(P)
-// 2 if (!(P))
-// 3 abort()
-// ...
-// 100 while (true) {
-// 101 ASSERT (sum < 0);
-// 102 ...
-// 130 }
-//
-// when converted to IR, this snippet looks something like:
-//
-// while.body: ; preds = %entry, %if.end
-// %0 = load i32* %sum, align 4, !dbg !15
-// %cmp = icmp slt i32 %0, 0, !dbg !15
-// br i1 %cmp, label %if.end, label %if.then, !dbg !15
-//
-// if.then: ; preds = %while.body
-// call void @abort(), !dbg !15
-// br label %if.end, !dbg !15
-//
-// Notice that all the instructions in blocks 'while.body' and 'if.then'
-// have exactly the same debug information. When this program is sampled
-// at runtime, the profiler will assume that all these instructions are
-// equally frequent. This, in turn, will consider the edge while.body->if.then
-// to be frequently taken (which is incorrect).
-//
-// By adding a discriminator value to the instructions in block 'if.then',
-// we can distinguish instructions at line 101 with discriminator 0 from
-// the instructions at line 101 with discriminator 1.
-//
-// For more details about DWARF discriminators, please visit
-// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AddDiscriminators.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "add-discriminators"
-
-// Command line option to disable discriminator generation even in the
-// presence of debug information. This is only needed when debugging
-// debug info generation issues.
-static cl::opt<bool> NoDiscriminators(
- "no-discriminators", cl::init(false),
- cl::desc("Disable generation of discriminator information."));
-
-namespace {
-
-// The legacy pass of AddDiscriminators.
-struct AddDiscriminatorsLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
-
- AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
- initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-};
-
-} // end anonymous namespace
-
-char AddDiscriminatorsLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
- "Add DWARF path discriminators", false, false)
-INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
- "Add DWARF path discriminators", false, false)
-
-// Create the legacy AddDiscriminatorsPass.
-FunctionPass *llvm::createAddDiscriminatorsPass() {
- return new AddDiscriminatorsLegacyPass();
-}
-
-static bool shouldHaveDiscriminator(const Instruction *I) {
- return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
-}
-
-/// Assign DWARF discriminators.
-///
-/// To assign discriminators, we examine the boundaries of every
-/// basic block and its successors. Suppose there is a basic block B1
-/// with successor B2. The last instruction I1 in B1 and the first
-/// instruction I2 in B2 are located at the same file and line number.
-/// This situation is illustrated in the following code snippet:
-///
-/// if (i < 10) x = i;
-///
-/// entry:
-/// br i1 %cmp, label %if.then, label %if.end, !dbg !10
-/// if.then:
-/// %1 = load i32* %i.addr, align 4, !dbg !10
-/// store i32 %1, i32* %x, align 4, !dbg !10
-/// br label %if.end, !dbg !10
-/// if.end:
-/// ret void, !dbg !12
-///
-/// Notice how the branch instruction in block 'entry' and all the
-/// instructions in block 'if.then' have the exact same debug location
-/// information (!dbg !10).
-///
-/// To distinguish instructions in block 'entry' from instructions in
-/// block 'if.then', we generate a new lexical block for all the
-/// instruction in block 'if.then' that share the same file and line
-/// location with the last instruction of block 'entry'.
-///
-/// This new lexical block will have the same location information as
-/// the previous one, but with a new DWARF discriminator value.
-///
-/// One of the main uses of this discriminator value is in runtime
-/// sample profilers. It allows the profiler to distinguish instructions
-/// at location !dbg !10 that execute on different basic blocks. This is
-/// important because while the predicate 'if (x < 10)' may have been
-/// executed millions of times, the assignment 'x = i' may have only
-/// executed a handful of times (meaning that the entry->if.then edge is
-/// seldom taken).
-///
-/// If we did not have discriminator information, the profiler would
-/// assign the same weight to both blocks 'entry' and 'if.then', which
-/// in turn will make it conclude that the entry->if.then edge is very
-/// hot.
-///
-/// To decide where to create new discriminator values, this function
-/// traverses the CFG and examines instruction at basic block boundaries.
-/// If the last instruction I1 of a block B1 is at the same file and line
-/// location as instruction I2 of successor B2, then it creates a new
-/// lexical block for I2 and all the instruction in B2 that share the same
-/// file and line location as I2. This new lexical block will have a
-/// different discriminator number than I1.
-static bool addDiscriminators(Function &F) {
- // If the function has debug information, but the user has disabled
- // discriminators, do nothing.
- // Simlarly, if the function has no debug info, do nothing.
- if (NoDiscriminators || !F.getSubprogram())
- return false;
-
- bool Changed = false;
-
- using Location = std::pair<StringRef, unsigned>;
- using BBSet = DenseSet<const BasicBlock *>;
- using LocationBBMap = DenseMap<Location, BBSet>;
- using LocationDiscriminatorMap = DenseMap<Location, unsigned>;
- using LocationSet = DenseSet<Location>;
-
- LocationBBMap LBM;
- LocationDiscriminatorMap LDM;
-
- // Traverse all instructions in the function. If the source line location
- // of the instruction appears in other basic block, assign a new
- // discriminator for this instruction.
- for (BasicBlock &B : F) {
- for (auto &I : B.getInstList()) {
- // Not all intrinsic calls should have a discriminator.
- // We want to avoid a non-deterministic assignment of discriminators at
- // different debug levels. We still allow discriminators on memory
- // intrinsic calls because those can be early expanded by SROA into
- // pairs of loads and stores, and the expanded load/store instructions
- // should have a valid discriminator.
- if (!shouldHaveDiscriminator(&I))
- continue;
- const DILocation *DIL = I.getDebugLoc();
- if (!DIL)
- continue;
- Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
- auto &BBMap = LBM[L];
- auto R = BBMap.insert(&B);
- if (BBMap.size() == 1)
- continue;
- // If we could insert more than one block with the same line+file, a
- // discriminator is needed to distinguish both instructions.
- // Only the lowest 7 bits are used to represent a discriminator to fit
- // it in 1 byte ULEB128 representation.
- unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
- auto NewDIL = DIL->cloneWithBaseDiscriminator(Discriminator);
- if (!NewDIL) {
- LLVM_DEBUG(dbgs() << "Could not encode discriminator: "
- << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " "
- << I << "\n");
- } else {
- I.setDebugLoc(NewDIL.getValue());
- LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
- << DIL->getColumn() << ":" << Discriminator << " " << I
- << "\n");
- }
- Changed = true;
- }
- }
-
- // Traverse all instructions and assign new discriminators to call
- // instructions with the same lineno that are in the same basic block.
- // Sample base profile needs to distinguish different function calls within
- // a same source line for correct profile annotation.
- for (BasicBlock &B : F) {
- LocationSet CallLocations;
- for (auto &I : B.getInstList()) {
- // We bypass intrinsic calls for the following two reasons:
- // 1) We want to avoid a non-deterministic assignment of
- // discriminators.
- // 2) We want to minimize the number of base discriminators used.
- if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I)))
- continue;
-
- DILocation *CurrentDIL = I.getDebugLoc();
- if (!CurrentDIL)
- continue;
- Location L =
- std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
- if (!CallLocations.insert(L).second) {
- unsigned Discriminator = ++LDM[L];
- auto NewDIL = CurrentDIL->cloneWithBaseDiscriminator(Discriminator);
- if (!NewDIL) {
- LLVM_DEBUG(dbgs()
- << "Could not encode discriminator: "
- << CurrentDIL->getFilename() << ":"
- << CurrentDIL->getLine() << ":" << CurrentDIL->getColumn()
- << ":" << Discriminator << " " << I << "\n");
- } else {
- I.setDebugLoc(NewDIL.getValue());
- Changed = true;
- }
- }
- }
- }
- return Changed;
-}
-
-bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
- return addDiscriminators(F);
-}
-
-PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (!addDiscriminators(F))
- return PreservedAnalyses::all();
-
- // FIXME: should be all()
- return PreservedAnalyses::none();
-}
+//===- AddDiscriminators.cpp - Insert DWARF path discriminators -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file adds DWARF discriminators to the IR. Path discriminators are
+// used to decide what CFG path was taken inside sub-graphs whose instructions
+// share the same line and column number information.
+//
+// The main user of this is the sample profiler. Instruction samples are
+// mapped to line number information. Since a single line may be spread
+// out over several basic blocks, discriminators add more precise location
+// for the samples.
+//
+// For example,
+//
+// 1 #define ASSERT(P)
+// 2 if (!(P))
+// 3 abort()
+// ...
+// 100 while (true) {
+// 101 ASSERT (sum < 0);
+// 102 ...
+// 130 }
+//
+// when converted to IR, this snippet looks something like:
+//
+// while.body: ; preds = %entry, %if.end
+// %0 = load i32* %sum, align 4, !dbg !15
+// %cmp = icmp slt i32 %0, 0, !dbg !15
+// br i1 %cmp, label %if.end, label %if.then, !dbg !15
+//
+// if.then: ; preds = %while.body
+// call void @abort(), !dbg !15
+// br label %if.end, !dbg !15
+//
+// Notice that all the instructions in blocks 'while.body' and 'if.then'
+// have exactly the same debug information. When this program is sampled
+// at runtime, the profiler will assume that all these instructions are
+// equally frequent. This, in turn, will consider the edge while.body->if.then
+// to be frequently taken (which is incorrect).
+//
+// By adding a discriminator value to the instructions in block 'if.then',
+// we can distinguish instructions at line 101 with discriminator 0 from
+// the instructions at line 101 with discriminator 1.
+//
+// For more details about DWARF discriminators, please visit
+// http://wiki.dwarfstd.org/index.php?title=Path_Discriminators
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "add-discriminators"
+
+// Command line option to disable discriminator generation even in the
+// presence of debug information. This is only needed when debugging
+// debug info generation issues.
+static cl::opt<bool> NoDiscriminators(
+ "no-discriminators", cl::init(false),
+ cl::desc("Disable generation of discriminator information."));
+
+namespace {
+
+// The legacy pass of AddDiscriminators.
+struct AddDiscriminatorsLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+
+ AddDiscriminatorsLegacyPass() : FunctionPass(ID) {
+ initializeAddDiscriminatorsLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+
+} // end anonymous namespace
+
+char AddDiscriminatorsLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+INITIALIZE_PASS_END(AddDiscriminatorsLegacyPass, "add-discriminators",
+ "Add DWARF path discriminators", false, false)
+
+// Create the legacy AddDiscriminatorsPass.
+FunctionPass *llvm::createAddDiscriminatorsPass() {
+ return new AddDiscriminatorsLegacyPass();
+}
+
+static bool shouldHaveDiscriminator(const Instruction *I) {
+ return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
+/// Assign DWARF discriminators.
+///
+/// To assign discriminators, we examine the boundaries of every
+/// basic block and its successors. Suppose there is a basic block B1
+/// with successor B2. The last instruction I1 in B1 and the first
+/// instruction I2 in B2 are located at the same file and line number.
+/// This situation is illustrated in the following code snippet:
+///
+/// if (i < 10) x = i;
+///
+/// entry:
+/// br i1 %cmp, label %if.then, label %if.end, !dbg !10
+/// if.then:
+/// %1 = load i32* %i.addr, align 4, !dbg !10
+/// store i32 %1, i32* %x, align 4, !dbg !10
+/// br label %if.end, !dbg !10
+/// if.end:
+/// ret void, !dbg !12
+///
+/// Notice how the branch instruction in block 'entry' and all the
+/// instructions in block 'if.then' have the exact same debug location
+/// information (!dbg !10).
+///
+/// To distinguish instructions in block 'entry' from instructions in
+/// block 'if.then', we generate a new lexical block for all the
+/// instructions in block 'if.then' that share the same file and line
+/// location with the last instruction of block 'entry'.
+///
+/// This new lexical block will have the same location information as
+/// the previous one, but with a new DWARF discriminator value.
+///
+/// One of the main uses of this discriminator value is in runtime
+/// sample profilers. It allows the profiler to distinguish instructions
+/// at location !dbg !10 that execute on different basic blocks. This is
+/// important because while the predicate 'if (x < 10)' may have been
+/// executed millions of times, the assignment 'x = i' may have only
+/// executed a handful of times (meaning that the entry->if.then edge is
+/// seldom taken).
+///
+/// If we did not have discriminator information, the profiler would
+/// assign the same weight to both blocks 'entry' and 'if.then', which
+/// in turn will make it conclude that the entry->if.then edge is very
+/// hot.
+///
+/// To decide where to create new discriminator values, this function
+/// traverses the CFG and examines instructions at basic block boundaries.
+/// If the last instruction I1 of a block B1 is at the same file and line
+/// location as instruction I2 of successor B2, then it creates a new
+/// lexical block for I2 and all the instructions in B2 that share the same
+/// file and line location as I2. This new lexical block will have a
+/// different discriminator number than I1.
+static bool addDiscriminators(Function &F) {
+ // If the function has debug information, but the user has disabled
+ // discriminators, do nothing.
+ // Similarly, if the function has no debug info, do nothing.
+ if (NoDiscriminators || !F.getSubprogram())
+ return false;
+
+ bool Changed = false;
+
+ using Location = std::pair<StringRef, unsigned>;
+ using BBSet = DenseSet<const BasicBlock *>;
+ using LocationBBMap = DenseMap<Location, BBSet>;
+ using LocationDiscriminatorMap = DenseMap<Location, unsigned>;
+ using LocationSet = DenseSet<Location>;
+
+ LocationBBMap LBM;
+ LocationDiscriminatorMap LDM;
+
+ // Traverse all instructions in the function. If the source line location
+ // of the instruction appears in another basic block, assign a new
+ // discriminator for this instruction.
+ for (BasicBlock &B : F) {
+ for (auto &I : B.getInstList()) {
+ // Not all intrinsic calls should have a discriminator.
+ // We want to avoid a non-deterministic assignment of discriminators at
+ // different debug levels. We still allow discriminators on memory
+ // intrinsic calls because those can be early expanded by SROA into
+ // pairs of loads and stores, and the expanded load/store instructions
+ // should have a valid discriminator.
+ if (!shouldHaveDiscriminator(&I))
+ continue;
+ const DILocation *DIL = I.getDebugLoc();
+ if (!DIL)
+ continue;
+ Location L = std::make_pair(DIL->getFilename(), DIL->getLine());
+ auto &BBMap = LBM[L];
+ auto R = BBMap.insert(&B);
+ if (BBMap.size() == 1)
+ continue;
+ // If more than one block has been inserted for the same line+file, a
+ // discriminator is needed to distinguish the instructions.
+ // Only the lowest 7 bits are used to represent a discriminator to fit
+ // it in 1 byte ULEB128 representation.
+ unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+ auto NewDIL = DIL->cloneWithBaseDiscriminator(Discriminator);
+ if (!NewDIL) {
+ LLVM_DEBUG(dbgs() << "Could not encode discriminator: "
+ << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " "
+ << I << "\n");
+ } else {
+ I.setDebugLoc(NewDIL.getValue());
+ LLVM_DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
+ << DIL->getColumn() << ":" << Discriminator << " " << I
+ << "\n");
+ }
+ Changed = true;
+ }
+ }
+
+ // Traverse all instructions and assign new discriminators to call
+ // instructions with the same lineno that are in the same basic block.
+ // Sample-based profiles need to distinguish different function calls within
+ // the same source line for correct profile annotation.
+ for (BasicBlock &B : F) {
+ LocationSet CallLocations;
+ for (auto &I : B.getInstList()) {
+ // We bypass intrinsic calls for the following two reasons:
+ // 1) We want to avoid a non-deterministic assignment of
+ // discriminators.
+ // 2) We want to minimize the number of base discriminators used.
+ if (!isa<InvokeInst>(I) && (!isa<CallInst>(I) || isa<IntrinsicInst>(I)))
+ continue;
+
+ DILocation *CurrentDIL = I.getDebugLoc();
+ if (!CurrentDIL)
+ continue;
+ Location L =
+ std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
+ if (!CallLocations.insert(L).second) {
+ unsigned Discriminator = ++LDM[L];
+ auto NewDIL = CurrentDIL->cloneWithBaseDiscriminator(Discriminator);
+ if (!NewDIL) {
+ LLVM_DEBUG(dbgs()
+ << "Could not encode discriminator: "
+ << CurrentDIL->getFilename() << ":"
+ << CurrentDIL->getLine() << ":" << CurrentDIL->getColumn()
+ << ":" << Discriminator << " " << I << "\n");
+ } else {
+ I.setDebugLoc(NewDIL.getValue());
+ Changed = true;
+ }
+ }
+ }
+ }
+ return Changed;
+}
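+
+// Illustrative sketch (not part of the pass): once addDiscriminators has run,
+// a consumer such as the sample profile loader can tell same-line
+// instructions apart by their base discriminator. This is a hedged example;
+// it assumes the caller has llvm/IR/InstIterator.h available.
+//
+//   for (Instruction &I : instructions(F))
+//     if (const DILocation *Loc = I.getDebugLoc())
+//       dbgs() << Loc->getLine() << ":" << Loc->getBaseDiscriminator() << "\n";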
+
+bool AddDiscriminatorsLegacyPass::runOnFunction(Function &F) {
+ return addDiscriminators(F);
+}
+
+PreservedAnalyses AddDiscriminatorsPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!addDiscriminators(F))
+ return PreservedAnalyses::all();
+
+ // FIXME: should be all()
+ return PreservedAnalyses::none();
+}
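+
+// Hedged usage sketch (illustrative, not from this file): with the new pass
+// manager the pass can be added to a function pipeline directly; "FAM" below
+// is assumed to be an already-configured FunctionAnalysisManager.
+//
+//   FunctionPassManager FPM;
+//   FPM.addPass(AddDiscriminatorsPass());
+//   FPM.run(F, FAM);
+//
+// The same pass is also exposed to opt as "add-discriminators".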
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index a9d283aeeb..3daff3b443 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -1,619 +1,619 @@
-//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "assume-builder"
-
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-cl::opt<bool> ShouldPreserveAllAttributes(
- "assume-preserve-all", cl::init(false), cl::Hidden,
- cl::desc("enable preservation of all attrbitues. even those that are "
- "unlikely to be usefull"));
-
-cl::opt<bool> EnableKnowledgeRetention(
- "enable-knowledge-retention", cl::init(false), cl::Hidden,
- cl::desc(
- "enable preservation of attributes throughout code transformation"));
-
-STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder");
-STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built");
-STATISTIC(NumAssumesMerged,
- "Number of assume merged by the assume simplify pass");
-STATISTIC(NumAssumesRemoved,
- "Number of assume removed by the assume simplify pass");
-
-DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter",
- "Controls which assumes gets created");
-
-namespace {
-
-bool isUsefullToPreserve(Attribute::AttrKind Kind) {
- switch (Kind) {
- case Attribute::NonNull:
+//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "assume-builder"
+
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+cl::opt<bool> ShouldPreserveAllAttributes(
+ "assume-preserve-all", cl::init(false), cl::Hidden,
+ cl::desc("enable preservation of all attrbitues. even those that are "
+ "unlikely to be usefull"));
+
+cl::opt<bool> EnableKnowledgeRetention(
+ "enable-knowledge-retention", cl::init(false), cl::Hidden,
+ cl::desc(
+ "enable preservation of attributes throughout code transformation"));
+
+STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder");
+STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built");
+STATISTIC(NumAssumesMerged,
+ "Number of assume merged by the assume simplify pass");
+STATISTIC(NumAssumesRemoved,
+ "Number of assume removed by the assume simplify pass");
+
+DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter",
+ "Controls which assumes gets created");
+
+namespace {
+
+bool isUsefullToPreserve(Attribute::AttrKind Kind) {
+ switch (Kind) {
+ case Attribute::NonNull:
case Attribute::NoUndef:
- case Attribute::Alignment:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::Cold:
- return true;
- default:
- return false;
- }
-}
-
-/// This function will try to transform the given knowledge into a more
-/// canonical one. The canonical knowledge may be the given one.
-RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) {
- switch (RK.AttrKind) {
- default:
- return RK;
- case Attribute::NonNull:
+ case Attribute::Alignment:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::Cold:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// This function will try to transform the given knowledge into a more
+/// canonical one. The canonical knowledge may be the given one.
+RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) {
+ switch (RK.AttrKind) {
+ default:
+ return RK;
+ case Attribute::NonNull:
RK.WasOn = getUnderlyingObject(RK.WasOn);
- return RK;
- case Attribute::Alignment: {
- Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) {
- if (auto *GEP = dyn_cast<GEPOperator>(Strip))
- RK.ArgValue =
- MinAlign(RK.ArgValue,
- GEP->getMaxPreservedAlignment(M->getDataLayout()).value());
- });
- RK.WasOn = V;
- return RK;
- }
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull: {
- int64_t Offset = 0;
- Value *V = GetPointerBaseWithConstantOffset(
- RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false);
- if (Offset < 0)
- return RK;
- RK.ArgValue = RK.ArgValue + Offset;
- RK.WasOn = V;
- }
- }
- return RK;
-}
-
-/// This class contains all knowledge that has been gathered while building an
-/// llvm.assume and the functions to manipulate it.
-struct AssumeBuilderState {
- Module *M;
-
- using MapKey = std::pair<Value *, Attribute::AttrKind>;
- SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
- Instruction *InstBeingRemoved = nullptr;
- AssumptionCache* AC = nullptr;
- DominatorTree* DT = nullptr;
-
- AssumeBuilderState(Module *M, Instruction *I = nullptr,
- AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr)
- : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {}
-
- bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) {
- if (!InstBeingRemoved || !RK.WasOn)
- return false;
- bool HasBeenPreserved = false;
- Use* ToUpdate = nullptr;
- getKnowledgeForValue(
- RK.WasOn, {RK.AttrKind}, AC,
- [&](RetainedKnowledge RKOther, Instruction *Assume,
- const CallInst::BundleOpInfo *Bundle) {
- if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT))
- return false;
- if (RKOther.ArgValue >= RK.ArgValue) {
- HasBeenPreserved = true;
- return true;
- } else if (isValidAssumeForContext(InstBeingRemoved, Assume,
- DT)) {
- HasBeenPreserved = true;
- IntrinsicInst *Intr = cast<IntrinsicInst>(Assume);
- ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument];
- return true;
- }
- return false;
- });
- if (ToUpdate)
- ToUpdate->set(
- ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue));
- return HasBeenPreserved;
- }
-
- bool isKnowledgeWorthPreserving(RetainedKnowledge RK) {
- if (!RK)
- return false;
- if (!RK.WasOn)
- return true;
- if (RK.WasOn->getType()->isPointerTy()) {
+ return RK;
+ case Attribute::Alignment: {
+ Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) {
+ if (auto *GEP = dyn_cast<GEPOperator>(Strip))
+ RK.ArgValue =
+ MinAlign(RK.ArgValue,
+ GEP->getMaxPreservedAlignment(M->getDataLayout()).value());
+ });
+ RK.WasOn = V;
+ return RK;
+ }
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull: {
+ int64_t Offset = 0;
+ Value *V = GetPointerBaseWithConstantOffset(
+ RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false);
+ if (Offset < 0)
+ return RK;
+ RK.ArgValue = RK.ArgValue + Offset;
+ RK.WasOn = V;
+ }
+ }
+ return RK;
+}
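+
+// Worked example (illustrative comment, not from the original source): given
+// the hypothetical knowledge {Dereferenceable, 8, %q} where
+//   %q = getelementptr inbounds i8, i8* %p, i64 4
+// the code above folds the constant offset into the argument and re-attaches
+// the knowledge to the base pointer, yielding {Dereferenceable, 12, %p}.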
+
+/// This class contains all knowledge that has been gathered while building an
+/// llvm.assume and the functions to manipulate it.
+struct AssumeBuilderState {
+ Module *M;
+
+ using MapKey = std::pair<Value *, Attribute::AttrKind>;
+ SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
+ Instruction *InstBeingRemoved = nullptr;
+ AssumptionCache* AC = nullptr;
+ DominatorTree* DT = nullptr;
+
+ AssumeBuilderState(Module *M, Instruction *I = nullptr,
+ AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr)
+ : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {}
+
+ bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) {
+ if (!InstBeingRemoved || !RK.WasOn)
+ return false;
+ bool HasBeenPreserved = false;
+ Use* ToUpdate = nullptr;
+ getKnowledgeForValue(
+ RK.WasOn, {RK.AttrKind}, AC,
+ [&](RetainedKnowledge RKOther, Instruction *Assume,
+ const CallInst::BundleOpInfo *Bundle) {
+ if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT))
+ return false;
+ if (RKOther.ArgValue >= RK.ArgValue) {
+ HasBeenPreserved = true;
+ return true;
+ } else if (isValidAssumeForContext(InstBeingRemoved, Assume,
+ DT)) {
+ HasBeenPreserved = true;
+ IntrinsicInst *Intr = cast<IntrinsicInst>(Assume);
+ ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument];
+ return true;
+ }
+ return false;
+ });
+ if (ToUpdate)
+ ToUpdate->set(
+ ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue));
+ return HasBeenPreserved;
+ }
+
+ bool isKnowledgeWorthPreserving(RetainedKnowledge RK) {
+ if (!RK)
+ return false;
+ if (!RK.WasOn)
+ return true;
+ if (RK.WasOn->getType()->isPointerTy()) {
Value *UnderlyingPtr = getUnderlyingObject(RK.WasOn);
- if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr))
- return false;
- }
- if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) {
- if (Arg->hasAttribute(RK.AttrKind) &&
- (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
- Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue))
- return false;
- return true;
- }
- if (auto *Inst = dyn_cast<Instruction>(RK.WasOn))
- if (wouldInstructionBeTriviallyDead(Inst)) {
- if (RK.WasOn->use_empty())
- return false;
- Use *SingleUse = RK.WasOn->getSingleUndroppableUse();
- if (SingleUse && SingleUse->getUser() == InstBeingRemoved)
- return false;
- }
- return true;
- }
-
- void addKnowledge(RetainedKnowledge RK) {
- RK = canonicalizedKnowledge(RK, M);
-
- if (!isKnowledgeWorthPreserving(RK))
- return;
-
- if (tryToPreserveWithoutAddingAssume(RK))
- return;
- MapKey Key{RK.WasOn, RK.AttrKind};
- auto Lookup = AssumedKnowledgeMap.find(Key);
- if (Lookup == AssumedKnowledgeMap.end()) {
- AssumedKnowledgeMap[Key] = RK.ArgValue;
- return;
- }
- assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
- (Lookup->second != 0 && RK.ArgValue != 0)) &&
- "inconsistent argument value");
-
- /// This is only desirable because for all attributes taking an argument
- /// higher is better.
- Lookup->second = std::max(Lookup->second, RK.ArgValue);
- }
-
- void addAttribute(Attribute Attr, Value *WasOn) {
- if (Attr.isTypeAttribute() || Attr.isStringAttribute() ||
- (!ShouldPreserveAllAttributes &&
- !isUsefullToPreserve(Attr.getKindAsEnum())))
- return;
- unsigned AttrArg = 0;
- if (Attr.isIntAttribute())
- AttrArg = Attr.getValueAsInt();
- addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
- }
-
- void addCall(const CallBase *Call) {
- auto addAttrList = [&](AttributeList AttrList) {
- for (unsigned Idx = AttributeList::FirstArgIndex;
- Idx < AttrList.getNumAttrSets(); Idx++)
- for (Attribute Attr : AttrList.getAttributes(Idx))
- addAttribute(Attr, Call->getArgOperand(Idx - 1));
- for (Attribute Attr : AttrList.getFnAttributes())
- addAttribute(Attr, nullptr);
- };
- addAttrList(Call->getAttributes());
- if (Function *Fn = Call->getCalledFunction())
- addAttrList(Fn->getAttributes());
- }
-
- IntrinsicInst *build() {
- if (AssumedKnowledgeMap.empty())
- return nullptr;
- if (!DebugCounter::shouldExecute(BuildAssumeCounter))
- return nullptr;
- Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
- LLVMContext &C = M->getContext();
- SmallVector<OperandBundleDef, 8> OpBundle;
- for (auto &MapElem : AssumedKnowledgeMap) {
- SmallVector<Value *, 2> Args;
- if (MapElem.first.first)
- Args.push_back(MapElem.first.first);
-
- /// This is only valid because, for all attributes that currently exist, a
- /// value of 0 is useless and should not be preserved.
- if (MapElem.second)
- Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()),
- MapElem.second));
- OpBundle.push_back(OperandBundleDefT<Value *>(
- std::string(Attribute::getNameFromAttrKind(MapElem.first.second)),
- Args));
- NumBundlesInAssumes++;
- }
- NumAssumeBuilt++;
- return cast<IntrinsicInst>(CallInst::Create(
- FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle));
- }
-
- void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType,
- MaybeAlign MA) {
- unsigned DerefSize = MemInst->getModule()
- ->getDataLayout()
- .getTypeStoreSize(AccType)
- .getKnownMinSize();
- if (DerefSize != 0) {
- addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer});
- if (!NullPointerIsDefined(MemInst->getFunction(),
- Pointer->getType()->getPointerAddressSpace()))
- addKnowledge({Attribute::NonNull, 0u, Pointer});
- }
- if (MA.valueOrOne() > 1)
- addKnowledge(
- {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
- }
-
- void addInstruction(Instruction *I) {
- if (auto *Call = dyn_cast<CallBase>(I))
- return addCall(Call);
- if (auto *Load = dyn_cast<LoadInst>(I))
- return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(),
- Load->getAlign());
- if (auto *Store = dyn_cast<StoreInst>(I))
- return addAccessedPtr(I, Store->getPointerOperand(),
- Store->getValueOperand()->getType(),
- Store->getAlign());
- // TODO: Add support for the other Instructions.
- // TODO: Maybe we should look around and merge with other llvm.assume.
- }
-};
-
-} // namespace
-
-IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) {
- if (!EnableKnowledgeRetention)
- return nullptr;
- AssumeBuilderState Builder(I->getModule());
- Builder.addInstruction(I);
- return Builder.build();
-}
-
-void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC,
- DominatorTree *DT) {
- if (!EnableKnowledgeRetention || I->isTerminator())
- return;
- AssumeBuilderState Builder(I->getModule(), I, AC, DT);
- Builder.addInstruction(I);
- if (IntrinsicInst *Intr = Builder.build()) {
- Intr->insertBefore(I);
- if (AC)
- AC->registerAssumption(Intr);
- }
-}
-
-namespace {
-
-struct AssumeSimplify {
- Function &F;
- AssumptionCache &AC;
- DominatorTree *DT;
- LLVMContext &C;
- SmallDenseSet<IntrinsicInst *> CleanupToDo;
- StringMapEntry<uint32_t> *IgnoreTag;
- SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume;
- bool MadeChange = false;
-
- AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT,
- LLVMContext &C)
- : F(F), AC(AC), DT(DT), C(C),
- IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {}
-
- void buildMapping(bool FilterBooleanArgument) {
- BBToAssume.clear();
- for (Value *V : AC.assumptions()) {
- if (!V)
- continue;
- IntrinsicInst *Assume = cast<IntrinsicInst>(V);
- if (FilterBooleanArgument) {
- auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
- if (!Arg || Arg->isZero())
- continue;
- }
- BBToAssume[Assume->getParent()].push_back(Assume);
- }
-
- for (auto &Elem : BBToAssume) {
- llvm::sort(Elem.second,
- [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) {
- return LHS->comesBefore(RHS);
- });
- }
- }
-
- /// Remove all assumes in CleanupToDo if their boolean argument is true and
- /// ForceCleanup is set or the assume doesn't hold valuable knowledge.
- void RunCleanup(bool ForceCleanup) {
- for (IntrinsicInst *Assume : CleanupToDo) {
- auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
- if (!Arg || Arg->isZero() ||
- (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume)))
- continue;
- MadeChange = true;
- if (ForceCleanup)
- NumAssumesMerged++;
- else
- NumAssumesRemoved++;
- Assume->eraseFromParent();
- }
- CleanupToDo.clear();
- }
-
- /// Remove knowledge stored in an assume when it is already known by an
- /// attribute or another assume. When valid, this can update existing
- /// knowledge in an attribute or another assume.
- void dropRedundantKnowledge() {
- struct MapValue {
- IntrinsicInst *Assume;
- unsigned ArgValue;
- CallInst::BundleOpInfo *BOI;
- };
- buildMapping(false);
- SmallDenseMap<std::pair<Value *, Attribute::AttrKind>,
- SmallVector<MapValue, 2>, 16>
- Knowledge;
- for (BasicBlock *BB : depth_first(&F))
- for (Value *V : BBToAssume[BB]) {
- if (!V)
- continue;
- IntrinsicInst *Assume = cast<IntrinsicInst>(V);
- for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) {
- auto RemoveFromAssume = [&]() {
- CleanupToDo.insert(Assume);
- if (BOI.Begin != BOI.End) {
- Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn];
- U->set(UndefValue::get(U->get()->getType()));
- }
- BOI.Tag = IgnoreTag;
- };
- if (BOI.Tag == IgnoreTag) {
- CleanupToDo.insert(Assume);
- continue;
- }
- RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI);
- if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) {
- bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind);
- if (HasSameKindAttr)
- if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
- Arg->getAttribute(RK.AttrKind).getValueAsInt() >=
- RK.ArgValue) {
- RemoveFromAssume();
- continue;
- }
- if (isValidAssumeForContext(
- Assume, &*F.getEntryBlock().getFirstInsertionPt()) ||
- Assume == &*F.getEntryBlock().getFirstInsertionPt()) {
- if (HasSameKindAttr)
- Arg->removeAttr(RK.AttrKind);
- Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue));
- MadeChange = true;
- RemoveFromAssume();
- continue;
- }
- }
- auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}];
- for (MapValue &Elem : Lookup) {
- if (!isValidAssumeForContext(Elem.Assume, Assume, DT))
- continue;
- if (Elem.ArgValue >= RK.ArgValue) {
- RemoveFromAssume();
- continue;
- } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) {
- Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set(
- ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue));
- MadeChange = true;
- RemoveFromAssume();
- continue;
- }
- }
- Lookup.push_back({Assume, RK.ArgValue, &BOI});
- }
- }
- }
-
- using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator;
-
- /// Merge all assumes from Begin to End and insert the resulting assume as
- /// high as possible in the basic block.
- void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) {
- if (Begin == End || std::next(Begin) == End)
- return;
- /// Provide no additional information so that AssumeBuilderState doesn't
- /// try to do any punning since it already has been done better.
- AssumeBuilderState Builder(F.getParent());
-
- /// For now it is initialized to the best value it could have
- Instruction *InsertPt = BB->getFirstNonPHI();
- if (isa<LandingPadInst>(InsertPt))
- InsertPt = InsertPt->getNextNode();
- for (IntrinsicInst *I : make_range(Begin, End)) {
- CleanupToDo.insert(I);
- for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) {
- RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI);
- if (!RK)
- continue;
- Builder.addKnowledge(RK);
- if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn))
- if (I->getParent() == InsertPt->getParent() &&
- (InsertPt->comesBefore(I) || InsertPt == I))
- InsertPt = I->getNextNode();
- }
- }
-
- /// Adjust InsertPt if it is before Begin, since mergeAssumes only
- /// guarantees we can place the resulting assume between Begin and End.
- if (InsertPt->comesBefore(*Begin))
- for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator();
- It != E; --It)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
- InsertPt = It->getNextNode();
- break;
- }
- IntrinsicInst *MergedAssume = Builder.build();
- if (!MergedAssume)
- return;
- MadeChange = true;
- MergedAssume->insertBefore(InsertPt);
- AC.registerAssumption(MergedAssume);
- }
-
- /// Merge assumes when they are in the same BasicBlock and for all instructions
- /// between them isGuaranteedToTransferExecutionToSuccessor returns true.
- void mergeAssumes() {
- buildMapping(true);
-
- SmallVector<MergeIterator, 4> SplitPoints;
- for (auto &Elem : BBToAssume) {
- SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second;
- if (AssumesInBB.size() < 2)
- continue;
- /// AssumesInBB is already sorted by order in the block.
-
- BasicBlock::iterator It = AssumesInBB.front()->getIterator();
- BasicBlock::iterator E = AssumesInBB.back()->getIterator();
- SplitPoints.push_back(AssumesInBB.begin());
- MergeIterator LastSplit = AssumesInBB.begin();
- for (; It != E; ++It)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
- for (; (*LastSplit)->comesBefore(&*It); ++LastSplit)
- ;
- if (SplitPoints.back() != LastSplit)
- SplitPoints.push_back(LastSplit);
- }
- SplitPoints.push_back(AssumesInBB.end());
- for (auto SplitIt = SplitPoints.begin();
- SplitIt != std::prev(SplitPoints.end()); SplitIt++) {
- mergeRange(Elem.first, *SplitIt, *(SplitIt + 1));
- }
- SplitPoints.clear();
- }
- }
-};
-
-bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) {
- AssumeSimplify AS(F, *AC, DT, F.getContext());
-
- /// Remove knowledge that is already known from another dominating assume or
- /// an attribute.
- AS.dropRedundantKnowledge();
-
- /// Remove assumes that are empty.
- AS.RunCleanup(false);
-
- /// Merge assumes in the same basic block when possible.
- AS.mergeAssumes();
-
- /// Remove assumes that were merged.
- AS.RunCleanup(true);
- return AS.MadeChange;
-}
-
-} // namespace
-
-PreservedAnalyses AssumeSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- if (!EnableKnowledgeRetention)
- return PreservedAnalyses::all();
- simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F),
- AM.getCachedResult<DominatorTreeAnalysis>(F));
- return PreservedAnalyses::all();
-}
-
-namespace {
-class AssumeSimplifyPassLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- AssumeSimplifyPassLegacyPass() : FunctionPass(ID) {
- initializeAssumeSimplifyPassLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- if (skipFunction(F) || !EnableKnowledgeRetention)
- return false;
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- return simplifyAssumes(F, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- AU.setPreservesAll();
- }
-};
-} // namespace
-
-char AssumeSimplifyPassLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify",
- "Assume Simplify", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify",
- "Assume Simplify", false, false)
-
-FunctionPass *llvm::createAssumeSimplifyPass() {
- return new AssumeSimplifyPassLegacyPass();
-}
-
-PreservedAnalyses AssumeBuilderPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- for (Instruction &I : instructions(F))
- salvageKnowledge(&I, AC, DT);
- return PreservedAnalyses::all();
-}
-
-namespace {
-class AssumeBuilderPassLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- AssumeBuilderPassLegacyPass() : FunctionPass(ID) {
- initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override {
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- for (Instruction &I : instructions(F))
- salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
- return true;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- AU.setPreservesAll();
- }
-};
-} // namespace
-
-char AssumeBuilderPassLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder",
- "Assume Builder", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder",
- "Assume Builder", false, false)
+ if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr))
+ return false;
+ }
+ if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) {
+ if (Arg->hasAttribute(RK.AttrKind) &&
+ (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue))
+ return false;
+ return true;
+ }
+ if (auto *Inst = dyn_cast<Instruction>(RK.WasOn))
+ if (wouldInstructionBeTriviallyDead(Inst)) {
+ if (RK.WasOn->use_empty())
+ return false;
+ Use *SingleUse = RK.WasOn->getSingleUndroppableUse();
+ if (SingleUse && SingleUse->getUser() == InstBeingRemoved)
+ return false;
+ }
+ return true;
+ }
+
+ void addKnowledge(RetainedKnowledge RK) {
+ RK = canonicalizedKnowledge(RK, M);
+
+ if (!isKnowledgeWorthPreserving(RK))
+ return;
+
+ if (tryToPreserveWithoutAddingAssume(RK))
+ return;
+ MapKey Key{RK.WasOn, RK.AttrKind};
+ auto Lookup = AssumedKnowledgeMap.find(Key);
+ if (Lookup == AssumedKnowledgeMap.end()) {
+ AssumedKnowledgeMap[Key] = RK.ArgValue;
+ return;
+ }
+ assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
+ (Lookup->second != 0 && RK.ArgValue != 0)) &&
+ "inconsistent argument value");
+
+ /// This is only desirable because for all attributes taking an argument
+ /// higher is better.
+ Lookup->second = std::max(Lookup->second, RK.ArgValue);
+ }
+
+ void addAttribute(Attribute Attr, Value *WasOn) {
+ if (Attr.isTypeAttribute() || Attr.isStringAttribute() ||
+ (!ShouldPreserveAllAttributes &&
+ !isUsefullToPreserve(Attr.getKindAsEnum())))
+ return;
+ unsigned AttrArg = 0;
+ if (Attr.isIntAttribute())
+ AttrArg = Attr.getValueAsInt();
+ addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
+ }
+
+ void addCall(const CallBase *Call) {
+ auto addAttrList = [&](AttributeList AttrList) {
+ for (unsigned Idx = AttributeList::FirstArgIndex;
+ Idx < AttrList.getNumAttrSets(); Idx++)
+ for (Attribute Attr : AttrList.getAttributes(Idx))
+ addAttribute(Attr, Call->getArgOperand(Idx - 1));
+ for (Attribute Attr : AttrList.getFnAttributes())
+ addAttribute(Attr, nullptr);
+ };
+ addAttrList(Call->getAttributes());
+ if (Function *Fn = Call->getCalledFunction())
+ addAttrList(Fn->getAttributes());
+ }
+
+ IntrinsicInst *build() {
+ if (AssumedKnowledgeMap.empty())
+ return nullptr;
+ if (!DebugCounter::shouldExecute(BuildAssumeCounter))
+ return nullptr;
+ Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
+ LLVMContext &C = M->getContext();
+ SmallVector<OperandBundleDef, 8> OpBundle;
+ for (auto &MapElem : AssumedKnowledgeMap) {
+ SmallVector<Value *, 2> Args;
+ if (MapElem.first.first)
+ Args.push_back(MapElem.first.first);
+
+ /// This is only valid because, for all attributes that currently exist, a
+ /// value of 0 is useless and should not be preserved.
+ if (MapElem.second)
+ Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()),
+ MapElem.second));
+ OpBundle.push_back(OperandBundleDefT<Value *>(
+ std::string(Attribute::getNameFromAttrKind(MapElem.first.second)),
+ Args));
+ NumBundlesInAssumes++;
+ }
+ NumAssumeBuilt++;
+ return cast<IntrinsicInst>(CallInst::Create(
+ FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle));
+ }
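+
+ // Illustrative result (assumed value names): for the knowledge entries
+ //   {NonNull, 0, %p} and {Alignment, 8, %p}
+ // build() would return an intrinsic of the form
+ //   call void @llvm.assume(i1 true) ["nonnull"(i8* %p), "align"(i8* %p, i64 8)]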
+
+ void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType,
+ MaybeAlign MA) {
+ unsigned DerefSize = MemInst->getModule()
+ ->getDataLayout()
+ .getTypeStoreSize(AccType)
+ .getKnownMinSize();
+ if (DerefSize != 0) {
+ addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer});
+ if (!NullPointerIsDefined(MemInst->getFunction(),
+ Pointer->getType()->getPointerAddressSpace()))
+ addKnowledge({Attribute::NonNull, 0u, Pointer});
+ }
+ if (MA.valueOrOne() > 1)
+ addKnowledge(
+ {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
+ }
+
+ void addInstruction(Instruction *I) {
+ if (auto *Call = dyn_cast<CallBase>(I))
+ return addCall(Call);
+ if (auto *Load = dyn_cast<LoadInst>(I))
+ return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(),
+ Load->getAlign());
+ if (auto *Store = dyn_cast<StoreInst>(I))
+ return addAccessedPtr(I, Store->getPointerOperand(),
+ Store->getValueOperand()->getType(),
+ Store->getAlign());
+ // TODO: Add support for the other Instructions.
+ // TODO: Maybe we should look around and merge with other llvm.assume.
+ }
+};
+
+} // namespace
+
+IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) {
+ if (!EnableKnowledgeRetention)
+ return nullptr;
+ AssumeBuilderState Builder(I->getModule());
+ Builder.addInstruction(I);
+ return Builder.build();
+}
+
+void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!EnableKnowledgeRetention || I->isTerminator())
+ return;
+ AssumeBuilderState Builder(I->getModule(), I, AC, DT);
+ Builder.addInstruction(I);
+ if (IntrinsicInst *Intr = Builder.build()) {
+ Intr->insertBefore(I);
+ if (AC)
+ AC->registerAssumption(Intr);
+ }
+}
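+
+// Hedged usage sketch (illustrative): a transform that is about to erase an
+// instruction can first preserve what that instruction implied. "DeadLoad",
+// "AC" and "DT" are assumed names.
+//
+//   salvageKnowledge(DeadLoad, &AC, &DT); // no-op unless -enable-knowledge-retention
+//   DeadLoad->eraseFromParent();
+//
+// The emitted intrinsic carries the facts as operand bundles, e.g.
+//   call void @llvm.assume(i1 true)
+//       ["dereferenceable"(i32* %p, i64 4), "align"(i32* %p, i64 4)]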
+
+namespace {
+
+struct AssumeSimplify {
+ Function &F;
+ AssumptionCache &AC;
+ DominatorTree *DT;
+ LLVMContext &C;
+ SmallDenseSet<IntrinsicInst *> CleanupToDo;
+ StringMapEntry<uint32_t> *IgnoreTag;
+ SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume;
+ bool MadeChange = false;
+
+ AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT,
+ LLVMContext &C)
+ : F(F), AC(AC), DT(DT), C(C),
+ IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {}
+
+ void buildMapping(bool FilterBooleanArgument) {
+ BBToAssume.clear();
+ for (Value *V : AC.assumptions()) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ if (FilterBooleanArgument) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero())
+ continue;
+ }
+ BBToAssume[Assume->getParent()].push_back(Assume);
+ }
+
+ for (auto &Elem : BBToAssume) {
+ llvm::sort(Elem.second,
+ [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) {
+ return LHS->comesBefore(RHS);
+ });
+ }
+ }
+
+ /// Remove all assumes in CleanupToDo if their boolean argument is true and
+ /// ForceCleanup is set or the assume doesn't hold valuable knowledge.
+ void RunCleanup(bool ForceCleanup) {
+ for (IntrinsicInst *Assume : CleanupToDo) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero() ||
+ (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume)))
+ continue;
+ MadeChange = true;
+ if (ForceCleanup)
+ NumAssumesMerged++;
+ else
+ NumAssumesRemoved++;
+ Assume->eraseFromParent();
+ }
+ CleanupToDo.clear();
+ }
+
+ /// Remove knowledge stored in an assume when it is already known by an
+ /// attribute or another assume. When valid, this can update existing
+ /// knowledge in an attribute or another assume.
+ void dropRedundantKnowledge() {
+ struct MapValue {
+ IntrinsicInst *Assume;
+ unsigned ArgValue;
+ CallInst::BundleOpInfo *BOI;
+ };
+ buildMapping(false);
+ SmallDenseMap<std::pair<Value *, Attribute::AttrKind>,
+ SmallVector<MapValue, 2>, 16>
+ Knowledge;
+ for (BasicBlock *BB : depth_first(&F))
+ for (Value *V : BBToAssume[BB]) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) {
+ auto RemoveFromAssume = [&]() {
+ CleanupToDo.insert(Assume);
+ if (BOI.Begin != BOI.End) {
+ Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn];
+ U->set(UndefValue::get(U->get()->getType()));
+ }
+ BOI.Tag = IgnoreTag;
+ };
+ if (BOI.Tag == IgnoreTag) {
+ CleanupToDo.insert(Assume);
+ continue;
+ }
+ RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI);
+ if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) {
+ bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind);
+ if (HasSameKindAttr)
+ if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >=
+ RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ }
+ if (isValidAssumeForContext(
+ Assume, &*F.getEntryBlock().getFirstInsertionPt()) ||
+ Assume == &*F.getEntryBlock().getFirstInsertionPt()) {
+ if (HasSameKindAttr)
+ Arg->removeAttr(RK.AttrKind);
+ Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}];
+ for (MapValue &Elem : Lookup) {
+ if (!isValidAssumeForContext(Elem.Assume, Assume, DT))
+ continue;
+ if (Elem.ArgValue >= RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) {
+ Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set(
+ ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ Lookup.push_back({Assume, RK.ArgValue, &BOI});
+ }
+ }
+ }
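+
+ // Worked example (illustrative): if a dominating assume already records
+ // "align"(i8* %p, i64 16) and a later assume records "align"(i8* %p, i64 8),
+ // the weaker bundle is tagged with IgnoreBundleTag and the later assume is
+ // queued in CleanupToDo for removal.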
+
+ using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator;
+
+ /// Merge all assumes from Begin to End and insert the resulting assume as
+ /// high as possible in the basic block.
+ void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) {
+ if (Begin == End || std::next(Begin) == End)
+ return;
+ /// Provide no additional information so that AssumeBuilderState doesn't
+ /// try to do any punning since it already has been done better.
+ AssumeBuilderState Builder(F.getParent());
+
+ /// For now it is initialized to the best value it could have
+ Instruction *InsertPt = BB->getFirstNonPHI();
+ if (isa<LandingPadInst>(InsertPt))
+ InsertPt = InsertPt->getNextNode();
+ for (IntrinsicInst *I : make_range(Begin, End)) {
+ CleanupToDo.insert(I);
+ for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) {
+ RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI);
+ if (!RK)
+ continue;
+ Builder.addKnowledge(RK);
+ if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn))
+ if (I->getParent() == InsertPt->getParent() &&
+ (InsertPt->comesBefore(I) || InsertPt == I))
+ InsertPt = I->getNextNode();
+ }
+ }
+
+ /// Adjust InsertPt if it is before Begin, since mergeAssumes only
+ /// guarantees we can place the resulting assume between Begin and End.
+ if (InsertPt->comesBefore(*Begin))
+ for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator();
+ It != E; --It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ InsertPt = It->getNextNode();
+ break;
+ }
+ IntrinsicInst *MergedAssume = Builder.build();
+ if (!MergedAssume)
+ return;
+ MadeChange = true;
+ MergedAssume->insertBefore(InsertPt);
+ AC.registerAssumption(MergedAssume);
+ }
+
+ /// Merge assumes when they are in the same BasicBlock and for all instructions
+ /// between them isGuaranteedToTransferExecutionToSuccessor returns true.
+ void mergeAssumes() {
+ buildMapping(true);
+
+ SmallVector<MergeIterator, 4> SplitPoints;
+ for (auto &Elem : BBToAssume) {
+ SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second;
+ if (AssumesInBB.size() < 2)
+ continue;
+ /// AssumesInBB is already sorted by order in the block.
+
+ BasicBlock::iterator It = AssumesInBB.front()->getIterator();
+ BasicBlock::iterator E = AssumesInBB.back()->getIterator();
+ SplitPoints.push_back(AssumesInBB.begin());
+ MergeIterator LastSplit = AssumesInBB.begin();
+ for (; It != E; ++It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ for (; (*LastSplit)->comesBefore(&*It); ++LastSplit)
+ ;
+ if (SplitPoints.back() != LastSplit)
+ SplitPoints.push_back(LastSplit);
+ }
+ SplitPoints.push_back(AssumesInBB.end());
+ for (auto SplitIt = SplitPoints.begin();
+ SplitIt != std::prev(SplitPoints.end()); SplitIt++) {
+ mergeRange(Elem.first, *SplitIt, *(SplitIt + 1));
+ }
+ SplitPoints.clear();
+ }
+ }
+};
+
+bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) {
+ AssumeSimplify AS(F, *AC, DT, F.getContext());
+
+ /// Remove knowledge that is already known from another dominating assume or
+ /// an attribute.
+ AS.dropRedundantKnowledge();
+
+ /// Remove assumes that are empty.
+ AS.RunCleanup(false);
+
+ /// Merge assumes in the same basic block when possible.
+ AS.mergeAssumes();
+
+ /// Remove assumes that were merged.
+ AS.RunCleanup(true);
+ return AS.MadeChange;
+}
+
+} // namespace
+
+PreservedAnalyses AssumeSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!EnableKnowledgeRetention)
+ return PreservedAnalyses::all();
+ simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F),
+ AM.getCachedResult<DominatorTreeAnalysis>(F));
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeSimplifyPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeSimplifyPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeSimplifyPassLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || !EnableKnowledgeRetention)
+ return false;
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return simplifyAssumes(F, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeSimplifyPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+
+FunctionPass *llvm::createAssumeSimplifyPass() {
+ return new AssumeSimplifyPassLegacyPass();
+}
+
+PreservedAnalyses AssumeBuilderPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, AC, DT);
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeBuilderPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeBuilderPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeBuilderPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
index 414d6044ff..6bcd42c4c6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -1,547 +1,547 @@
-//===- BasicBlockUtils.cpp - BasicBlock Utilities --------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs manipulations on basic blocks, and
-// instructions contained within basic blocks.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "basicblock-utils"
-
-void llvm::DetatchDeadBlocks(
- ArrayRef<BasicBlock *> BBs,
- SmallVectorImpl<DominatorTree::UpdateType> *Updates,
- bool KeepOneInputPHIs) {
- for (auto *BB : BBs) {
- // Loop through all of our successors and make sure they know that one
- // of their predecessors is going away.
- SmallPtrSet<BasicBlock *, 4> UniqueSuccessors;
- for (BasicBlock *Succ : successors(BB)) {
- Succ->removePredecessor(BB, KeepOneInputPHIs);
- if (Updates && UniqueSuccessors.insert(Succ).second)
- Updates->push_back({DominatorTree::Delete, BB, Succ});
- }
-
- // Zap all the instructions in the block.
- while (!BB->empty()) {
- Instruction &I = BB->back();
- // If this instruction is used, replace uses with an arbitrary value.
- // Because control flow can't get here, we don't care what we replace the
- // value with. Note that since this block is unreachable, and all values
- // contained within it must dominate their uses, all uses will
- // eventually be removed (they are themselves dead).
- if (!I.use_empty())
- I.replaceAllUsesWith(UndefValue::get(I.getType()));
- BB->getInstList().pop_back();
- }
- new UnreachableInst(BB->getContext(), BB);
- assert(BB->getInstList().size() == 1 &&
- isa<UnreachableInst>(BB->getTerminator()) &&
- "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
- }
-}
-
-void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
- DeleteDeadBlocks({BB}, DTU, KeepOneInputPHIs);
-}
-
-void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
-#ifndef NDEBUG
- // Make sure that all predecessors of each dead block are also dead.
- SmallPtrSet<BasicBlock *, 4> Dead(BBs.begin(), BBs.end());
- assert(Dead.size() == BBs.size() && "Duplicating blocks?");
- for (auto *BB : Dead)
- for (BasicBlock *Pred : predecessors(BB))
- assert(Dead.count(Pred) && "All predecessors must be dead!");
-#endif
-
- SmallVector<DominatorTree::UpdateType, 4> Updates;
- DetatchDeadBlocks(BBs, DTU ? &Updates : nullptr, KeepOneInputPHIs);
-
- if (DTU)
+//===- BasicBlockUtils.cpp - BasicBlock Utilities --------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on basic blocks, and
+// instructions contained within basic blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "basicblock-utils"
+
+void llvm::DetatchDeadBlocks(
+ ArrayRef<BasicBlock *> BBs,
+ SmallVectorImpl<DominatorTree::UpdateType> *Updates,
+ bool KeepOneInputPHIs) {
+ for (auto *BB : BBs) {
+ // Loop through all of our successors and make sure they know that one
+ // of their predecessors is going away.
+ SmallPtrSet<BasicBlock *, 4> UniqueSuccessors;
+ for (BasicBlock *Succ : successors(BB)) {
+ Succ->removePredecessor(BB, KeepOneInputPHIs);
+ if (Updates && UniqueSuccessors.insert(Succ).second)
+ Updates->push_back({DominatorTree::Delete, BB, Succ});
+ }
+
+ // Zap all the instructions in the block.
+ while (!BB->empty()) {
+ Instruction &I = BB->back();
+ // If this instruction is used, replace uses with an arbitrary value.
+ // Because control flow can't get here, we don't care what we replace the
+ // value with. Note that since this block is unreachable, and all values
+ // contained within it must dominate their uses, all uses will
+ // eventually be removed (they are themselves dead).
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
+ BB->getInstList().pop_back();
+ }
+ new UnreachableInst(BB->getContext(), BB);
+ assert(BB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(BB->getTerminator()) &&
+ "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
+ }
+}
+
+void llvm::DeleteDeadBlock(BasicBlock *BB, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+ DeleteDeadBlocks({BB}, DTU, KeepOneInputPHIs);
+}
+
+void llvm::DeleteDeadBlocks(ArrayRef <BasicBlock *> BBs, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+#ifndef NDEBUG
+ // Make sure that all predecessors of each dead block are also dead.
+ SmallPtrSet<BasicBlock *, 4> Dead(BBs.begin(), BBs.end());
+ assert(Dead.size() == BBs.size() && "Duplicating blocks?");
+ for (auto *BB : Dead)
+ for (BasicBlock *Pred : predecessors(BB))
+ assert(Dead.count(Pred) && "All predecessors must be dead!");
+#endif
+
+ SmallVector<DominatorTree::UpdateType, 4> Updates;
+ DetatchDeadBlocks(BBs, DTU ? &Updates : nullptr, KeepOneInputPHIs);
+
+ if (DTU)
DTU->applyUpdates(Updates);
-
- for (BasicBlock *BB : BBs)
- if (DTU)
- DTU->deleteBB(BB);
- else
- BB->eraseFromParent();
-}
-
-bool llvm::EliminateUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
- bool KeepOneInputPHIs) {
- df_iterator_default_set<BasicBlock*> Reachable;
-
- // Mark all reachable blocks.
- for (BasicBlock *BB : depth_first_ext(&F, Reachable))
- (void)BB/* Mark all reachable blocks */;
-
- // Collect all dead blocks.
- std::vector<BasicBlock*> DeadBlocks;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
- if (!Reachable.count(&*I)) {
- BasicBlock *BB = &*I;
- DeadBlocks.push_back(BB);
- }
-
- // Delete the dead blocks.
- DeleteDeadBlocks(DeadBlocks, DTU, KeepOneInputPHIs);
-
- return !DeadBlocks.empty();
-}
-
+
+ for (BasicBlock *BB : BBs)
+ if (DTU)
+ DTU->deleteBB(BB);
+ else
+ BB->eraseFromParent();
+}
+
+bool llvm::EliminateUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
+ bool KeepOneInputPHIs) {
+ df_iterator_default_set<BasicBlock*> Reachable;
+
+ // Mark all reachable blocks.
+ for (BasicBlock *BB : depth_first_ext(&F, Reachable))
+ (void)BB/* Mark all reachable blocks */;
+
+ // Collect all dead blocks.
+ std::vector<BasicBlock*> DeadBlocks;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ if (!Reachable.count(&*I)) {
+ BasicBlock *BB = &*I;
+ DeadBlocks.push_back(BB);
+ }
+
+ // Delete the dead blocks.
+ DeleteDeadBlocks(DeadBlocks, DTU, KeepOneInputPHIs);
+
+ return !DeadBlocks.empty();
+}
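+
+// Hedged usage sketch (illustrative): callers that hold a DominatorTree
+// usually route the deletions through a lazy DomTreeUpdater so the tree and
+// the IR stay consistent. "DT" is an assumed DominatorTree.
+//
+//   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+//   bool Changed = EliminateUnreachableBlocks(F, &DTU);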
+
bool llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
- MemoryDependenceResults *MemDep) {
+ MemoryDependenceResults *MemDep) {
if (!isa<PHINode>(BB->begin()))
return false;
-
- while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
- if (PN->getIncomingValue(0) != PN)
- PN->replaceAllUsesWith(PN->getIncomingValue(0));
- else
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
-
- if (MemDep)
- MemDep->removeInstruction(PN); // Memdep updates AA itself.
-
- PN->eraseFromParent();
- }
+
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (PN->getIncomingValue(0) != PN)
+ PN->replaceAllUsesWith(PN->getIncomingValue(0));
+ else
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+
+ if (MemDep)
+ MemDep->removeInstruction(PN); // Memdep updates AA itself.
+
+ PN->eraseFromParent();
+ }
return true;
-}
-
-bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI,
- MemorySSAUpdater *MSSAU) {
- // Recursively deleting a PHI may cause multiple PHIs to be deleted
- // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
- SmallVector<WeakTrackingVH, 8> PHIs;
- for (PHINode &PN : BB->phis())
- PHIs.push_back(&PN);
-
- bool Changed = false;
- for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
- if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
- Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU);
-
- return Changed;
-}
-
-bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
- LoopInfo *LI, MemorySSAUpdater *MSSAU,
- MemoryDependenceResults *MemDep,
- bool PredecessorWithTwoSuccessors) {
- if (BB->hasAddressTaken())
- return false;
-
- // Can't merge if there are multiple predecessors, or no predecessors.
- BasicBlock *PredBB = BB->getUniquePredecessor();
- if (!PredBB) return false;
-
- // Don't break self-loops.
- if (PredBB == BB) return false;
- // Don't break unwinding instructions.
- if (PredBB->getTerminator()->isExceptionalTerminator())
- return false;
-
- // Can't merge if there are multiple distinct successors.
- if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB)
- return false;
-
- // Currently only allow PredBB to have two successors, one of them being BB.
- // Update BI to branch to BB's only successor instead of BB.
- BranchInst *PredBB_BI;
- BasicBlock *NewSucc = nullptr;
- unsigned FallThruPath;
- if (PredecessorWithTwoSuccessors) {
- if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator())))
- return false;
- BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
- if (!BB_JmpI || !BB_JmpI->isUnconditional())
- return false;
- NewSucc = BB_JmpI->getSuccessor(0);
- FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1;
- }
-
- // Can't merge if there is a PHI loop.
- for (PHINode &PN : BB->phis())
- for (Value *IncValue : PN.incoming_values())
- if (IncValue == &PN)
- return false;
-
- LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
- << PredBB->getName() << "\n");
-
- // Begin by getting rid of unneeded PHIs.
- SmallVector<AssertingVH<Value>, 4> IncomingValues;
- if (isa<PHINode>(BB->front())) {
- for (PHINode &PN : BB->phis())
- if (!isa<PHINode>(PN.getIncomingValue(0)) ||
- cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
- IncomingValues.push_back(PN.getIncomingValue(0));
- FoldSingleEntryPHINodes(BB, MemDep);
- }
-
- // DTU update: Collect all the edges that exit BB.
- // These dominator edges will be redirected from Pred.
- std::vector<DominatorTree::UpdateType> Updates;
- if (DTU) {
+}
+
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI,
+ MemorySSAUpdater *MSSAU) {
+ // Recursively deleting a PHI may cause multiple PHIs to be deleted
+ // or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
+ SmallVector<WeakTrackingVH, 8> PHIs;
+ for (PHINode &PN : BB->phis())
+ PHIs.push_back(&PN);
+
+ bool Changed = false;
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU);
+
+ return Changed;
+}
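
A hedged usage sketch (names invented here, not from this patch): sweep a whole function with DeleteDeadPHIs after a transformation that may have left PHIs without live users; TargetLibraryInfo is optional.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper; TLI may be nullptr if no TargetLibraryInfo is at hand.
static bool deleteDeadPHIsInFunction(llvm::Function &F,
                                     const llvm::TargetLibraryInfo *TLI) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : F)
    Changed |= llvm::DeleteDeadPHIs(&BB, TLI);
  return Changed;
}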
+
+bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
+ LoopInfo *LI, MemorySSAUpdater *MSSAU,
+ MemoryDependenceResults *MemDep,
+ bool PredecessorWithTwoSuccessors) {
+ if (BB->hasAddressTaken())
+ return false;
+
+ // Can't merge if there are multiple predecessors, or no predecessors.
+ BasicBlock *PredBB = BB->getUniquePredecessor();
+ if (!PredBB) return false;
+
+ // Don't break self-loops.
+ if (PredBB == BB) return false;
+ // Don't break unwinding instructions.
+ if (PredBB->getTerminator()->isExceptionalTerminator())
+ return false;
+
+ // Can't merge if there are multiple distinct successors.
+ if (!PredecessorWithTwoSuccessors && PredBB->getUniqueSuccessor() != BB)
+ return false;
+
+ // Currently only allow PredBB to have two successors, one of them being BB.
+ // Update BI to branch to BB's only successor instead of BB.
+ BranchInst *PredBB_BI;
+ BasicBlock *NewSucc = nullptr;
+ unsigned FallThruPath;
+ if (PredecessorWithTwoSuccessors) {
+ if (!(PredBB_BI = dyn_cast<BranchInst>(PredBB->getTerminator())))
+ return false;
+ BranchInst *BB_JmpI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BB_JmpI || !BB_JmpI->isUnconditional())
+ return false;
+ NewSucc = BB_JmpI->getSuccessor(0);
+ FallThruPath = PredBB_BI->getSuccessor(0) == BB ? 0 : 1;
+ }
+
+ // Can't merge if there is a PHI loop.
+ for (PHINode &PN : BB->phis())
+ for (Value *IncValue : PN.incoming_values())
+ if (IncValue == &PN)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Merging: " << BB->getName() << " into "
+ << PredBB->getName() << "\n");
+
+ // Begin by getting rid of unneeded PHIs.
+ SmallVector<AssertingVH<Value>, 4> IncomingValues;
+ if (isa<PHINode>(BB->front())) {
+ for (PHINode &PN : BB->phis())
+ if (!isa<PHINode>(PN.getIncomingValue(0)) ||
+ cast<PHINode>(PN.getIncomingValue(0))->getParent() != BB)
+ IncomingValues.push_back(PN.getIncomingValue(0));
+ FoldSingleEntryPHINodes(BB, MemDep);
+ }
+
+ // DTU update: Collect all the edges that exit BB.
+ // These dominator edges will be redirected from Pred.
+ std::vector<DominatorTree::UpdateType> Updates;
+ if (DTU) {
SmallSetVector<BasicBlock *, 2> UniqueSuccessors(succ_begin(BB),
succ_end(BB));
Updates.reserve(1 + (2 * UniqueSuccessors.size()));
- // Add insert edges first. Experimentally, for the particular case of two
- // blocks that can be merged, with a single successor and single predecessor
- // respectively, it is beneficial to have all insert updates first. Deleting
- // edges first may lead to unreachable blocks, followed by inserting edges
- // making the blocks reachable again. Such DT updates lead to high compile
- // times. We add inserts before deletes here to reduce compile time.
+ // Add insert edges first. Experimentally, for the particular case of two
+ // blocks that can be merged, with a single successor and single predecessor
+ // respectively, it is beneficial to have all insert updates first. Deleting
+ // edges first may lead to unreachable blocks, followed by inserting edges
+ // making the blocks reachable again. Such DT updates lead to high compile
+ // times. We add inserts before deletes here to reduce compile time.
for (BasicBlock *UniqueSuccessor : UniqueSuccessors)
- // This successor of BB may already have PredBB as a predecessor.
+ // This successor of BB may already have PredBB as a predecessor.
if (!llvm::is_contained(successors(PredBB), UniqueSuccessor))
Updates.push_back({DominatorTree::Insert, PredBB, UniqueSuccessor});
for (BasicBlock *UniqueSuccessor : UniqueSuccessors)
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
- Updates.push_back({DominatorTree::Delete, PredBB, BB});
- }
-
- Instruction *PTI = PredBB->getTerminator();
- Instruction *STI = BB->getTerminator();
- Instruction *Start = &*BB->begin();
- // If there's nothing to move, mark the starting instruction as the last
- // instruction in the block. Terminator instruction is handled separately.
- if (Start == STI)
- Start = PTI;
-
- // Move all definitions in the successor to the predecessor...
- PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(),
- BB->begin(), STI->getIterator());
-
- if (MSSAU)
- MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start);
-
- // Make all PHI nodes that referred to BB now refer to Pred as their
- // source...
- BB->replaceAllUsesWith(PredBB);
-
- if (PredecessorWithTwoSuccessors) {
- // Delete the unconditional branch from BB.
- BB->getInstList().pop_back();
-
- // Update branch in the predecessor.
- PredBB_BI->setSuccessor(FallThruPath, NewSucc);
- } else {
- // Delete the unconditional branch from the predecessor.
- PredBB->getInstList().pop_back();
-
- // Move terminator instruction.
- PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
-
- // Terminator may be a memory accessing instruction too.
- if (MSSAU)
- if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>(
- MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator())))
- MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End);
- }
- // Add unreachable to now empty BB.
- new UnreachableInst(BB->getContext(), BB);
-
- // If the predecessor has no name, inherit BB's name.
- if (!PredBB->hasName())
- PredBB->takeName(BB);
-
- if (LI)
- LI->removeBlock(BB);
-
- if (MemDep)
- MemDep->invalidateCachedPredecessors();
-
- // Finally, erase the old block and update dominator info.
- if (DTU) {
- assert(BB->getInstList().size() == 1 &&
- isa<UnreachableInst>(BB->getTerminator()) &&
- "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
+ Updates.push_back({DominatorTree::Delete, PredBB, BB});
+ }
+
+ Instruction *PTI = PredBB->getTerminator();
+ Instruction *STI = BB->getTerminator();
+ Instruction *Start = &*BB->begin();
+ // If there's nothing to move, mark the starting instruction as the last
+ // instruction in the block. Terminator instruction is handled separately.
+ if (Start == STI)
+ Start = PTI;
+
+ // Move all definitions in the successor to the predecessor...
+ PredBB->getInstList().splice(PTI->getIterator(), BB->getInstList(),
+ BB->begin(), STI->getIterator());
+
+ if (MSSAU)
+ MSSAU->moveAllAfterMergeBlocks(BB, PredBB, Start);
+
+ // Make all PHI nodes that referred to BB now refer to Pred as their
+ // source...
+ BB->replaceAllUsesWith(PredBB);
+
+ if (PredecessorWithTwoSuccessors) {
+ // Delete the unconditional branch from BB.
+ BB->getInstList().pop_back();
+
+ // Update branch in the predecessor.
+ PredBB_BI->setSuccessor(FallThruPath, NewSucc);
+ } else {
+ // Delete the unconditional branch from the predecessor.
+ PredBB->getInstList().pop_back();
+
+ // Move terminator instruction.
+ PredBB->getInstList().splice(PredBB->end(), BB->getInstList());
+
+ // Terminator may be a memory accessing instruction too.
+ if (MSSAU)
+ if (MemoryUseOrDef *MUD = cast_or_null<MemoryUseOrDef>(
+ MSSAU->getMemorySSA()->getMemoryAccess(PredBB->getTerminator())))
+ MSSAU->moveToPlace(MUD, PredBB, MemorySSA::End);
+ }
+ // Add unreachable to now empty BB.
+ new UnreachableInst(BB->getContext(), BB);
+
+ // If the predecessor has no name, inherit BB's name.
+ if (!PredBB->hasName())
+ PredBB->takeName(BB);
+
+ if (LI)
+ LI->removeBlock(BB);
+
+ if (MemDep)
+ MemDep->invalidateCachedPredecessors();
+
+ // Finally, erase the old block and update dominator info.
+ if (DTU) {
+ assert(BB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(BB->getTerminator()) &&
+ "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
DTU->applyUpdates(Updates);
- DTU->deleteBB(BB);
- } else {
- BB->eraseFromParent(); // Nuke BB if DTU is nullptr.
- }
-
- return true;
-}
-
-bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
- SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
- LoopInfo *LI) {
- assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
-
- bool BlocksHaveBeenMerged = false;
- while (!MergeBlocks.empty()) {
- BasicBlock *BB = *MergeBlocks.begin();
- BasicBlock *Dest = BB->getSingleSuccessor();
- if (Dest && (!L || L->contains(Dest))) {
- BasicBlock *Fold = Dest->getUniquePredecessor();
- (void)Fold;
- if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
- assert(Fold == BB &&
- "Expecting BB to be unique predecessor of the Dest block");
- MergeBlocks.erase(Dest);
- BlocksHaveBeenMerged = true;
- } else
- MergeBlocks.erase(BB);
- } else
- MergeBlocks.erase(BB);
- }
- return BlocksHaveBeenMerged;
-}
-
-/// Remove redundant instructions within sequences of consecutive dbg.value
-/// instructions. This is done using a backward scan to keep the last dbg.value
-/// describing a specific variable/fragment.
-///
-/// BackwardScan strategy:
-/// ----------------------
-/// Given a sequence of consecutive DbgValueInst like this
-///
-/// dbg.value ..., "x", FragmentX1 (*)
-/// dbg.value ..., "y", FragmentY1
-/// dbg.value ..., "x", FragmentX2
-/// dbg.value ..., "x", FragmentX1 (**)
-///
-/// then the instruction marked with (*) can be removed (it is guaranteed to be
-/// obsoleted by the instruction marked with (**) as the latter instruction is
-/// describing the same variable using the same fragment info).
-///
-/// Possible improvements:
-/// - Check fully overlapping fragments and not only identical fragments.
-/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta
-/// instructions being part of the sequence of consecutive instructions.
-static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
- SmallVector<DbgValueInst *, 8> ToBeRemoved;
- SmallDenseSet<DebugVariable> VariableSet;
- for (auto &I : reverse(*BB)) {
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
- DebugVariable Key(DVI->getVariable(),
- DVI->getExpression(),
- DVI->getDebugLoc()->getInlinedAt());
- auto R = VariableSet.insert(Key);
- // If the same variable fragment is described more than once it is enough
- // to keep the last one (i.e. the first one found, since we iterate in
- // reverse).
- if (!R.second)
- ToBeRemoved.push_back(DVI);
- continue;
- }
- // Sequence with consecutive dbg.value instrs ended. Clear the set to
- // restart identifying redundant instructions in case we find another
- // dbg.value sequence.
- VariableSet.clear();
- }
-
- for (auto &Instr : ToBeRemoved)
- Instr->eraseFromParent();
-
- return !ToBeRemoved.empty();
-}
-
-/// Remove redundant dbg.value instructions using a forward scan. This can
-/// remove a dbg.value instruction that is redundant due to indicating that a
-/// variable has the same value as already being indicated by an earlier
-/// dbg.value.
-///
-/// ForwardScan strategy:
-/// ---------------------
-/// Given two identical dbg.value instructions, separated by a block of
-/// instructions that isn't describing the same variable, like this
-///
-/// dbg.value X1, "x", FragmentX1 (**)
-/// <block of instructions, none being "dbg.value ..., "x", ...">
-/// dbg.value X1, "x", FragmentX1 (*)
-///
-/// then the instruction marked with (*) can be removed. Variable "x" is already
-/// described as being mapped to the SSA value X1.
-///
-/// Possible improvements:
-/// - Keep track of non-overlapping fragments.
-static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
- SmallVector<DbgValueInst *, 8> ToBeRemoved;
- DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap;
- for (auto &I : *BB) {
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
- DebugVariable Key(DVI->getVariable(),
- NoneType(),
- DVI->getDebugLoc()->getInlinedAt());
- auto VMI = VariableMap.find(Key);
- // Update the map if we found a new value/expression describing the
- // variable, or if the variable wasn't mapped already.
- if (VMI == VariableMap.end() ||
- VMI->second.first != DVI->getValue() ||
- VMI->second.second != DVI->getExpression()) {
- VariableMap[Key] = { DVI->getValue(), DVI->getExpression() };
- continue;
- }
- // Found an identical mapping. Remember the instruction for later removal.
- ToBeRemoved.push_back(DVI);
- }
- }
-
- for (auto &Instr : ToBeRemoved)
- Instr->eraseFromParent();
-
- return !ToBeRemoved.empty();
-}
-
-bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
- bool MadeChanges = false;
- // By using the "backward scan" strategy before the "forward scan" strategy we
- // can remove both dbg.value (2) and (3) in a situation like this:
- //
- // (1) dbg.value V1, "x", DIExpression()
- // ...
- // (2) dbg.value V2, "x", DIExpression()
- // (3) dbg.value V1, "x", DIExpression()
- //
- // The backward scan will remove (2) because it is made obsolete by (3). After
- // getting (2) out of the way, the forward scan will remove (3) since "x"
- // is already described as having the value V1 at (1).
- MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB);
- MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB);
-
- if (MadeChanges)
- LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: "
- << BB->getName() << "\n");
- return MadeChanges;
-}
-
-void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
- BasicBlock::iterator &BI, Value *V) {
- Instruction &I = *BI;
- // Replace all of the uses of the instruction with the value.
- I.replaceAllUsesWith(V);
-
- // Make sure to propagate a name if there is one already.
- if (I.hasName() && !V->hasName())
- V->takeName(&I);
-
- // Delete the unnecessary instruction now...
- BI = BIL.erase(BI);
-}
-
-void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
- BasicBlock::iterator &BI, Instruction *I) {
- assert(I->getParent() == nullptr &&
- "ReplaceInstWithInst: Instruction already inserted into basic block!");
-
- // Copy debug location to newly added instruction, if it wasn't already set
- // by the caller.
- if (!I->getDebugLoc())
- I->setDebugLoc(BI->getDebugLoc());
-
- // Insert the new instruction into the basic block...
- BasicBlock::iterator New = BIL.insert(BI, I);
-
- // Replace all uses of the old instruction, and delete it.
- ReplaceInstWithValue(BIL, BI, I);
-
- // Move BI back to point to the newly inserted instruction
- BI = New;
-}
-
-void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
- BasicBlock::iterator BI(From);
- ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
-}
-
-BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
+ DTU->deleteBB(BB);
+ } else {
+ BB->eraseFromParent(); // Nuke BB if DTU is nullptr.
+ }
+
+ return true;
+}
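
A hedged sketch of the usual calling pattern (the helper name is made up): walk the function once and let MergeBlockIntoPredecessor decide, per block, whether folding into the unique predecessor is legal; make_early_inc_range keeps the iteration valid while merged blocks are erased.

#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper; DTU and LI may be nullptr when those analyses are not
// being preserved.
static bool mergeTrivialBlocks(llvm::Function &F, llvm::DomTreeUpdater *DTU,
                               llvm::LoopInfo *LI) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : llvm::make_early_inc_range(F))
    Changed |= llvm::MergeBlockIntoPredecessor(&BB, DTU, LI);
  return Changed;
}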
+
+bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
+ SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
+ LoopInfo *LI) {
+ assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
+
+ bool BlocksHaveBeenMerged = false;
+ while (!MergeBlocks.empty()) {
+ BasicBlock *BB = *MergeBlocks.begin();
+ BasicBlock *Dest = BB->getSingleSuccessor();
+ if (Dest && (!L || L->contains(Dest))) {
+ BasicBlock *Fold = Dest->getUniquePredecessor();
+ (void)Fold;
+ if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
+ assert(Fold == BB &&
+ "Expecting BB to be unique predecessor of the Dest block");
+ MergeBlocks.erase(Dest);
+ BlocksHaveBeenMerged = true;
+ } else
+ MergeBlocks.erase(BB);
+ } else
+ MergeBlocks.erase(BB);
+ }
+ return BlocksHaveBeenMerged;
+}
+
+/// Remove redundant instructions within sequences of consecutive dbg.value
+/// instructions. This is done using a backward scan to keep the last dbg.value
+/// describing a specific variable/fragment.
+///
+/// BackwardScan strategy:
+/// ----------------------
+/// Given a sequence of consecutive DbgValueInst like this
+///
+/// dbg.value ..., "x", FragmentX1 (*)
+/// dbg.value ..., "y", FragmentY1
+/// dbg.value ..., "x", FragmentX2
+/// dbg.value ..., "x", FragmentX1 (**)
+///
+/// then the instruction marked with (*) can be removed (it is guaranteed to be
+/// obsoleted by the instruction marked with (**) as the latter instruction is
+/// describing the same variable using the same fragment info).
+///
+/// Possible improvements:
+/// - Check fully overlapping fragments and not only identical fragments.
+/// - Support dbg.addr, dbg.declare, dbg.label, and possibly other meta
+/// instructions being part of the sequence of consecutive instructions.
+static bool removeRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) {
+ SmallVector<DbgValueInst *, 8> ToBeRemoved;
+ SmallDenseSet<DebugVariable> VariableSet;
+ for (auto &I : reverse(*BB)) {
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
+ DebugVariable Key(DVI->getVariable(),
+ DVI->getExpression(),
+ DVI->getDebugLoc()->getInlinedAt());
+ auto R = VariableSet.insert(Key);
+ // If the same variable fragment is described more than once it is enough
+ // to keep the last one (i.e. the first one found, since we iterate in
+ // reverse).
+ if (!R.second)
+ ToBeRemoved.push_back(DVI);
+ continue;
+ }
+ // Sequence with consecutive dbg.value instrs ended. Clear the set to
+ // restart identifying redundant instructions in case we find another
+ // dbg.value sequence.
+ VariableSet.clear();
+ }
+
+ for (auto &Instr : ToBeRemoved)
+ Instr->eraseFromParent();
+
+ return !ToBeRemoved.empty();
+}
+
+/// Remove redundant dbg.value instructions using a forward scan. This can
+/// remove a dbg.value instruction that is redundant due to indicating that a
+/// variable has the same value as already being indicated by an earlier
+/// dbg.value.
+///
+/// ForwardScan strategy:
+/// ---------------------
+/// Given two identical dbg.value instructions, separated by a block of
+/// instructions that isn't describing the same variable, like this
+///
+/// dbg.value X1, "x", FragmentX1 (**)
+/// <block of instructions, none being "dbg.value ..., "x", ...">
+/// dbg.value X1, "x", FragmentX1 (*)
+///
+/// then the instruction marked with (*) can be removed. Variable "x" is already
+/// described as being mapped to the SSA value X1.
+///
+/// Possible improvements:
+/// - Keep track of non-overlapping fragments.
+static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) {
+ SmallVector<DbgValueInst *, 8> ToBeRemoved;
+ DenseMap<DebugVariable, std::pair<Value *, DIExpression *> > VariableMap;
+ for (auto &I : *BB) {
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(&I)) {
+ DebugVariable Key(DVI->getVariable(),
+ NoneType(),
+ DVI->getDebugLoc()->getInlinedAt());
+ auto VMI = VariableMap.find(Key);
+ // Update the map if we found a new value/expression describing the
+ // variable, or if the variable wasn't mapped already.
+ if (VMI == VariableMap.end() ||
+ VMI->second.first != DVI->getValue() ||
+ VMI->second.second != DVI->getExpression()) {
+ VariableMap[Key] = { DVI->getValue(), DVI->getExpression() };
+ continue;
+ }
+ // Found an identical mapping. Remember the instruction for later removal.
+ ToBeRemoved.push_back(DVI);
+ }
+ }
+
+ for (auto &Instr : ToBeRemoved)
+ Instr->eraseFromParent();
+
+ return !ToBeRemoved.empty();
+}
+
+bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) {
+ bool MadeChanges = false;
+ // By using the "backward scan" strategy before the "forward scan" strategy we
+ // can remove both dbg.value (2) and (3) in a situation like this:
+ //
+ // (1) dbg.value V1, "x", DIExpression()
+ // ...
+ // (2) dbg.value V2, "x", DIExpression()
+ // (3) dbg.value V1, "x", DIExpression()
+ //
+ // The backward scan will remove (2) because it is made obsolete by (3). After
+ // getting (2) out of the way, the forward scan will remove (3) since "x"
+ // is already described as having the value V1 at (1).
+ MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB);
+ MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB);
+
+ if (MadeChanges)
+ LLVM_DEBUG(dbgs() << "Removed redundant dbg instrs from: "
+ << BB->getName() << "\n");
+ return MadeChanges;
+}
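
A hedged sketch (names invented here) of how a pass that duplicates or sinks many dbg.value intrinsics might use this as a cheap post-pass cleanup, running both scans on every block:

#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: returns true if any redundant debug intrinsic was
// removed anywhere in F.
static bool stripRedundantDbgValues(llvm::Function &F) {
  bool Changed = false;
  for (llvm::BasicBlock &BB : F)
    Changed |= llvm::RemoveRedundantDbgInstrs(&BB);
  return Changed;
}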
+
+void llvm::ReplaceInstWithValue(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Value *V) {
+ Instruction &I = *BI;
+ // Replace all of the uses of the instruction with the value.
+ I.replaceAllUsesWith(V);
+
+ // Make sure to propagate a name if there is one already.
+ if (I.hasName() && !V->hasName())
+ V->takeName(&I);
+
+ // Delete the unnecessary instruction now...
+ BI = BIL.erase(BI);
+}
+
+void llvm::ReplaceInstWithInst(BasicBlock::InstListType &BIL,
+ BasicBlock::iterator &BI, Instruction *I) {
+ assert(I->getParent() == nullptr &&
+ "ReplaceInstWithInst: Instruction already inserted into basic block!");
+
+ // Copy debug location to newly added instruction, if it wasn't already set
+ // by the caller.
+ if (!I->getDebugLoc())
+ I->setDebugLoc(BI->getDebugLoc());
+
+ // Insert the new instruction into the basic block...
+ BasicBlock::iterator New = BIL.insert(BI, I);
+
+ // Replace all uses of the old instruction, and delete it.
+ ReplaceInstWithValue(BIL, BI, I);
+
+ // Move BI back to point to the newly inserted instruction
+ BI = New;
+}
+
+void llvm::ReplaceInstWithInst(Instruction *From, Instruction *To) {
+ BasicBlock::iterator BI(From);
+ ReplaceInstWithInst(From->getParent()->getInstList(), BI, To);
+}
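
A hedged sketch of a common use of the two-argument ReplaceInstWithInst overload: swapping a block's terminator for a freshly created, not-yet-inserted instruction. BB, ThenBB, ElseBB, and Cond are assumed to exist in the caller; the caller also remains responsible for updating PHIs in any successors that change.

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: replace BB's terminator with a conditional branch.
// ReplaceInstWithInst inserts NewBr, copies the old terminator's debug
// location if NewBr has none, and erases the old terminator.
static void replaceTerminatorWithCondBr(llvm::BasicBlock *BB,
                                        llvm::BasicBlock *ThenBB,
                                        llvm::BasicBlock *ElseBB,
                                        llvm::Value *Cond) {
  llvm::Instruction *NewBr = llvm::BranchInst::Create(ThenBB, ElseBB, Cond);
  llvm::ReplaceInstWithInst(BB->getTerminator(), NewBr);
}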
+
+BasicBlock *llvm::SplitEdge(BasicBlock *BB, BasicBlock *Succ, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
const Twine &BBName) {
- unsigned SuccNum = GetSuccessorNumber(BB, Succ);
-
- // If this is a critical edge, let SplitCriticalEdge do it.
- Instruction *LatchTerm = BB->getTerminator();
- if (SplitCriticalEdge(
- LatchTerm, SuccNum,
+ unsigned SuccNum = GetSuccessorNumber(BB, Succ);
+
+ // If this is a critical edge, let SplitCriticalEdge do it.
+ Instruction *LatchTerm = BB->getTerminator();
+ if (SplitCriticalEdge(
+ LatchTerm, SuccNum,
CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA(),
BBName))
- return LatchTerm->getSuccessor(SuccNum);
-
- // If the edge isn't critical, then BB has a single successor or Succ has a
- // single pred. Split the block.
- if (BasicBlock *SP = Succ->getSinglePredecessor()) {
- // If the successor only has a single pred, split the top of the successor
- // block.
- assert(SP == BB && "CFG broken");
- SP = nullptr;
+ return LatchTerm->getSuccessor(SuccNum);
+
+ // If the edge isn't critical, then BB has a single successor or Succ has a
+ // single pred. Split the block.
+ if (BasicBlock *SP = Succ->getSinglePredecessor()) {
+ // If the successor only has a single pred, split the top of the successor
+ // block.
+ assert(SP == BB && "CFG broken");
+ SP = nullptr;
return SplitBlock(Succ, &Succ->front(), DT, LI, MSSAU, BBName,
/*Before=*/true);
- }
-
- // Otherwise, if BB has a single successor, split it at the bottom of the
- // block.
- assert(BB->getTerminator()->getNumSuccessors() == 1 &&
- "Should have a single succ!");
+ }
+
+ // Otherwise, if BB has a single successor, split it at the bottom of the
+ // block.
+ assert(BB->getTerminator()->getNumSuccessors() == 1 &&
+ "Should have a single succ!");
return SplitBlock(BB, BB->getTerminator(), DT, LI, MSSAU, BBName);
-}
-
-unsigned
-llvm::SplitAllCriticalEdges(Function &F,
- const CriticalEdgeSplittingOptions &Options) {
- unsigned NumBroken = 0;
- for (BasicBlock &BB : F) {
- Instruction *TI = BB.getTerminator();
- if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) &&
- !isa<CallBrInst>(TI))
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (SplitCriticalEdge(TI, i, Options))
- ++NumBroken;
- }
- return NumBroken;
-}
-
+}
+
+unsigned
+llvm::SplitAllCriticalEdges(Function &F,
+ const CriticalEdgeSplittingOptions &Options) {
+ unsigned NumBroken = 0;
+ for (BasicBlock &BB : F) {
+ Instruction *TI = BB.getTerminator();
+ if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) &&
+ !isa<CallBrInst>(TI))
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (SplitCriticalEdge(TI, i, Options))
+ ++NumBroken;
+ }
+ return NumBroken;
+}
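
A hedged sketch (the wrapper name is illustrative) of invoking SplitAllCriticalEdges while keeping DominatorTree and LoopInfo up to date via CriticalEdgeSplittingOptions:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

// Illustrative helper: returns the number of critical edges that were split.
static unsigned breakAllCriticalEdges(llvm::Function &F,
                                      llvm::DominatorTree *DT,
                                      llvm::LoopInfo *LI) {
  return llvm::SplitAllCriticalEdges(
      F, llvm::CriticalEdgeSplittingOptions(DT, LI));
}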
+
static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
@@ -552,22 +552,22 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DTU ? DTU : (DT ? &LocalDTU : nullptr), LI, MSSAU,
BBName);
}
- BasicBlock::iterator SplitIt = SplitPt->getIterator();
- while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())
- ++SplitIt;
- std::string Name = BBName.str();
- BasicBlock *New = Old->splitBasicBlock(
- SplitIt, Name.empty() ? Old->getName() + ".split" : Name);
-
- // The new block lives in whichever loop the old one did. This preserves
- // LCSSA as well, because we force the split point to be after any PHI nodes.
- if (LI)
- if (Loop *L = LI->getLoopFor(Old))
- L->addBasicBlockToLoop(New, *LI);
-
+ BasicBlock::iterator SplitIt = SplitPt->getIterator();
+ while (isa<PHINode>(SplitIt) || SplitIt->isEHPad())
+ ++SplitIt;
+ std::string Name = BBName.str();
+ BasicBlock *New = Old->splitBasicBlock(
+ SplitIt, Name.empty() ? Old->getName() + ".split" : Name);
+
+ // The new block lives in whichever loop the old one did. This preserves
+ // LCSSA as well, because we force the split point to be after any PHI nodes.
+ if (LI)
+ if (Loop *L = LI->getLoopFor(Old))
+ L->addBasicBlockToLoop(New, *LI);
+
if (DTU) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
- // Old dominates New. New node dominates all other nodes dominated by Old.
+ // Old dominates New. New node dominates all other nodes dominated by Old.
SmallSetVector<BasicBlock *, 8> UniqueSuccessorsOfOld(succ_begin(New),
succ_end(New));
Updates.push_back({DominatorTree::Insert, Old, New});
@@ -580,22 +580,22 @@ static BasicBlock *SplitBlockImpl(BasicBlock *Old, Instruction *SplitPt,
DTU->applyUpdates(Updates);
} else if (DT)
// Old dominates New. New node dominates all other nodes dominated by Old.
- if (DomTreeNode *OldNode = DT->getNode(Old)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(New, Old);
- for (DomTreeNode *I : Children)
- DT->changeImmediateDominator(I, NewNode);
- }
-
- // Move MemoryAccesses still tracked in Old, but part of New now.
- // Update accesses in successor blocks accordingly.
- if (MSSAU)
- MSSAU->moveAllAfterSpliceBlocks(Old, New, &*(New->begin()));
-
- return New;
-}
-
+ if (DomTreeNode *OldNode = DT->getNode(Old)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Old);
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+
+ // Move MemoryAccesses still tracked in Old, but part of New now.
+ // Update accesses in successor blocks accordingly.
+ if (MSSAU)
+ MSSAU->moveAllAfterSpliceBlocks(Old, New, &*(New->begin()));
+
+ return New;
+}
+
BasicBlock *llvm::SplitBlock(BasicBlock *Old, Instruction *SplitPt,
DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, const Twine &BBName,
@@ -656,13 +656,13 @@ BasicBlock *llvm::splitBlockBefore(BasicBlock *Old, Instruction *SplitPt,
return New;
}
-/// Update DominatorTree, LoopInfo, and LCSSA analysis information.
-static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock *> Preds,
+/// Update DominatorTree, LoopInfo, and LCSSA analysis information.
+static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA, bool &HasLoopExit) {
- // Update dominator tree if available.
+ bool PreserveLCSSA, bool &HasLoopExit) {
+ // Update dominator tree if available.
if (DTU) {
// Recalculation of DomTree is needed when updating a forward DomTree and
// the Entry BB is replaced.
@@ -684,158 +684,158 @@ static void UpdateAnalysisInformation(BasicBlock *OldBB, BasicBlock *NewBB,
DTU->applyUpdates(Updates);
}
} else if (DT) {
- if (OldBB == DT->getRootNode()->getBlock()) {
- assert(NewBB == &NewBB->getParent()->getEntryBlock());
- DT->setNewRoot(NewBB);
- } else {
- // Split block expects NewBB to have a non-empty set of predecessors.
- DT->splitBlock(NewBB);
- }
- }
-
- // Update MemoryPhis after split if MemorySSA is available
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(OldBB, NewBB, Preds);
-
- // The rest of the logic is only relevant for updating the loop structures.
- if (!LI)
- return;
-
+ if (OldBB == DT->getRootNode()->getBlock()) {
+ assert(NewBB == &NewBB->getParent()->getEntryBlock());
+ DT->setNewRoot(NewBB);
+ } else {
+ // Split block expects NewBB to have a non-empty set of predecessors.
+ DT->splitBlock(NewBB);
+ }
+ }
+
+ // Update MemoryPhis after split if MemorySSA is available
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(OldBB, NewBB, Preds);
+
+ // The rest of the logic is only relevant for updating the loop structures.
+ if (!LI)
+ return;
+
if (DTU && DTU->hasDomTree())
DT = &DTU->getDomTree();
- assert(DT && "DT should be available to update LoopInfo!");
- Loop *L = LI->getLoopFor(OldBB);
-
- // If we need to preserve loop analyses, collect some information about how
- // this split will affect loops.
- bool IsLoopEntry = !!L;
- bool SplitMakesNewLoopHeader = false;
- for (BasicBlock *Pred : Preds) {
- // Preds that are not reachable from entry should not be used to identify if
- // OldBB is a loop entry or if SplitMakesNewLoopHeader. Unreachable blocks
- // are not within any loops, so we would otherwise incorrectly mark
- // SplitMakesNewLoopHeader as true and make NewBB the header of some loop,
- // which breaks LI.
- if (!DT->isReachableFromEntry(Pred))
- continue;
- // If we need to preserve LCSSA, determine if any of the preds is a loop
- // exit.
- if (PreserveLCSSA)
- if (Loop *PL = LI->getLoopFor(Pred))
- if (!PL->contains(OldBB))
- HasLoopExit = true;
-
- // If we need to preserve LoopInfo, note whether any of the preds crosses
- // an interesting loop boundary.
- if (!L)
- continue;
- if (L->contains(Pred))
- IsLoopEntry = false;
- else
- SplitMakesNewLoopHeader = true;
- }
-
- // Unless we have a loop for OldBB, nothing else to do here.
- if (!L)
- return;
-
- if (IsLoopEntry) {
- // Add the new block to the nearest enclosing loop (and not an adjacent
- // loop). To find this, examine each of the predecessors and determine which
- // loops enclose them, and select the most-nested loop which contains the
- // loop containing the block being split.
- Loop *InnermostPredLoop = nullptr;
- for (BasicBlock *Pred : Preds) {
- if (Loop *PredLoop = LI->getLoopFor(Pred)) {
- // Seek a loop which actually contains the block being split (to avoid
- // adjacent loops).
- while (PredLoop && !PredLoop->contains(OldBB))
- PredLoop = PredLoop->getParentLoop();
-
- // Select the most-nested of these loops which contains the block.
- if (PredLoop && PredLoop->contains(OldBB) &&
- (!InnermostPredLoop ||
- InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
- InnermostPredLoop = PredLoop;
- }
- }
-
- if (InnermostPredLoop)
- InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
- } else {
- L->addBasicBlockToLoop(NewBB, *LI);
- if (SplitMakesNewLoopHeader)
- L->moveToHeader(NewBB);
- }
-}
-
-/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
-/// This also updates AliasAnalysis, if available.
-static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
- ArrayRef<BasicBlock *> Preds, BranchInst *BI,
- bool HasLoopExit) {
- // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
- SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
- for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
- PHINode *PN = cast<PHINode>(I++);
-
- // Check to see if all of the values coming in are the same. If so, we
- // don't need to create a new PHI node, unless it's needed for LCSSA.
- Value *InVal = nullptr;
- if (!HasLoopExit) {
- InVal = PN->getIncomingValueForBlock(Preds[0]);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- if (!PredSet.count(PN->getIncomingBlock(i)))
- continue;
- if (!InVal)
- InVal = PN->getIncomingValue(i);
- else if (InVal != PN->getIncomingValue(i)) {
- InVal = nullptr;
- break;
- }
- }
- }
-
- if (InVal) {
- // If all incoming values for the new PHI would be the same, just don't
- // make a new PHI. Instead, just remove the incoming values from the old
- // PHI.
-
- // NOTE! This loop walks backwards for a reason! First off, this minimizes
- // the cost of removal if we end up removing a large number of values, and
- // second off, this ensures that the indices for the incoming values
- // aren't invalidated when we remove one.
- for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
- if (PredSet.count(PN->getIncomingBlock(i)))
- PN->removeIncomingValue(i, false);
-
- // Add an incoming value to the PHI node in the loop for the preheader
- // edge.
- PN->addIncoming(InVal, NewBB);
- continue;
- }
-
- // If the values coming into the block are not the same, we need a new
- // PHI.
- // Create the new PHI node, insert it into NewBB at the end of the block
- PHINode *NewPHI =
- PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
-
- // NOTE! This loop walks backwards for a reason! First off, this minimizes
- // the cost of removal if we end up removing a large number of values, and
- // second off, this ensures that the indices for the incoming values aren't
- // invalidated when we remove one.
- for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
- BasicBlock *IncomingBB = PN->getIncomingBlock(i);
- if (PredSet.count(IncomingBB)) {
- Value *V = PN->removeIncomingValue(i, false);
- NewPHI->addIncoming(V, IncomingBB);
- }
- }
-
- PN->addIncoming(NewPHI, NewBB);
- }
-}
-
+ assert(DT && "DT should be available to update LoopInfo!");
+ Loop *L = LI->getLoopFor(OldBB);
+
+ // If we need to preserve loop analyses, collect some information about how
+ // this split will affect loops.
+ bool IsLoopEntry = !!L;
+ bool SplitMakesNewLoopHeader = false;
+ for (BasicBlock *Pred : Preds) {
+ // Preds that are not reachable from entry should not be used to identify if
+ // OldBB is a loop entry or if SplitMakesNewLoopHeader. Unreachable blocks
+ // are not within any loops, so we would otherwise incorrectly mark
+ // SplitMakesNewLoopHeader as true and make NewBB the header of some loop,
+ // which breaks LI.
+ if (!DT->isReachableFromEntry(Pred))
+ continue;
+ // If we need to preserve LCSSA, determine if any of the preds is a loop
+ // exit.
+ if (PreserveLCSSA)
+ if (Loop *PL = LI->getLoopFor(Pred))
+ if (!PL->contains(OldBB))
+ HasLoopExit = true;
+
+ // If we need to preserve LoopInfo, note whether any of the preds crosses
+ // an interesting loop boundary.
+ if (!L)
+ continue;
+ if (L->contains(Pred))
+ IsLoopEntry = false;
+ else
+ SplitMakesNewLoopHeader = true;
+ }
+
+ // Unless we have a loop for OldBB, nothing else to do here.
+ if (!L)
+ return;
+
+ if (IsLoopEntry) {
+ // Add the new block to the nearest enclosing loop (and not an adjacent
+ // loop). To find this, examine each of the predecessors and determine which
+ // loops enclose them, and select the most-nested loop which contains the
+ // loop containing the block being split.
+ Loop *InnermostPredLoop = nullptr;
+ for (BasicBlock *Pred : Preds) {
+ if (Loop *PredLoop = LI->getLoopFor(Pred)) {
+ // Seek a loop which actually contains the block being split (to avoid
+ // adjacent loops).
+ while (PredLoop && !PredLoop->contains(OldBB))
+ PredLoop = PredLoop->getParentLoop();
+
+ // Select the most-nested of these loops which contains the block.
+ if (PredLoop && PredLoop->contains(OldBB) &&
+ (!InnermostPredLoop ||
+ InnermostPredLoop->getLoopDepth() < PredLoop->getLoopDepth()))
+ InnermostPredLoop = PredLoop;
+ }
+ }
+
+ if (InnermostPredLoop)
+ InnermostPredLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ L->addBasicBlockToLoop(NewBB, *LI);
+ if (SplitMakesNewLoopHeader)
+ L->moveToHeader(NewBB);
+ }
+}
+
+/// Update the PHI nodes in OrigBB to include the values coming from NewBB.
+/// This also updates AliasAnalysis, if available.
+static void UpdatePHINodes(BasicBlock *OrigBB, BasicBlock *NewBB,
+ ArrayRef<BasicBlock *> Preds, BranchInst *BI,
+ bool HasLoopExit) {
+ // Otherwise, create a new PHI node in NewBB for each PHI node in OrigBB.
+ SmallPtrSet<BasicBlock *, 16> PredSet(Preds.begin(), Preds.end());
+ for (BasicBlock::iterator I = OrigBB->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I++);
+
+ // Check to see if all of the values coming in are the same. If so, we
+ // don't need to create a new PHI node, unless it's needed for LCSSA.
+ Value *InVal = nullptr;
+ if (!HasLoopExit) {
+ InVal = PN->getIncomingValueForBlock(Preds[0]);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (!PredSet.count(PN->getIncomingBlock(i)))
+ continue;
+ if (!InVal)
+ InVal = PN->getIncomingValue(i);
+ else if (InVal != PN->getIncomingValue(i)) {
+ InVal = nullptr;
+ break;
+ }
+ }
+ }
+
+ if (InVal) {
+ // If all incoming values for the new PHI would be the same, just don't
+ // make a new PHI. Instead, just remove the incoming values from the old
+ // PHI.
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values
+ // aren't invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i)
+ if (PredSet.count(PN->getIncomingBlock(i)))
+ PN->removeIncomingValue(i, false);
+
+ // Add an incoming value to the PHI node in the loop for the preheader
+ // edge.
+ PN->addIncoming(InVal, NewBB);
+ continue;
+ }
+
+ // If the values coming into the block are not the same, we need a new
+ // PHI.
+ // Create the new PHI node, insert it into NewBB at the end of the block
+ PHINode *NewPHI =
+ PHINode::Create(PN->getType(), Preds.size(), PN->getName() + ".ph", BI);
+
+ // NOTE! This loop walks backwards for a reason! First off, this minimizes
+ // the cost of removal if we end up removing a large number of values, and
+ // second off, this ensures that the indices for the incoming values aren't
+ // invalidated when we remove one.
+ for (int64_t i = PN->getNumIncomingValues() - 1; i >= 0; --i) {
+ BasicBlock *IncomingBB = PN->getIncomingBlock(i);
+ if (PredSet.count(IncomingBB)) {
+ Value *V = PN->removeIncomingValue(i, false);
+ NewPHI->addIncoming(V, IncomingBB);
+ }
+ }
+
+ PN->addIncoming(NewPHI, NewBB);
+ }
+}
+
static void SplitLandingPadPredecessorsImpl(
BasicBlock *OrigBB, ArrayRef<BasicBlock *> Preds, const char *Suffix1,
const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs,
@@ -847,35 +847,35 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
const char *Suffix, DomTreeUpdater *DTU,
DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- // Do not attempt to split that which cannot be split.
- if (!BB->canSplitPredecessors())
- return nullptr;
-
- // For the landingpads we need to act a bit differently.
- // Delegate this work to the SplitLandingPadPredecessors.
- if (BB->isLandingPad()) {
- SmallVector<BasicBlock*, 2> NewBBs;
- std::string NewName = std::string(Suffix) + ".split-lp";
-
+ // Do not attempt to split that which cannot be split.
+ if (!BB->canSplitPredecessors())
+ return nullptr;
+
+ // For the landingpads we need to act a bit differently.
+ // Delegate this work to the SplitLandingPadPredecessors.
+ if (BB->isLandingPad()) {
+ SmallVector<BasicBlock*, 2> NewBBs;
+ std::string NewName = std::string(Suffix) + ".split-lp";
+
SplitLandingPadPredecessorsImpl(BB, Preds, Suffix, NewName.c_str(), NewBBs,
DTU, DT, LI, MSSAU, PreserveLCSSA);
- return NewBBs[0];
- }
-
- // Create new basic block, insert right before the original block.
- BasicBlock *NewBB = BasicBlock::Create(
- BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI = BranchInst::Create(BB, NewBB);
+ return NewBBs[0];
+ }
+
+ // Create new basic block, insert right before the original block.
+ BasicBlock *NewBB = BasicBlock::Create(
+ BB->getContext(), BB->getName() + Suffix, BB->getParent(), BB);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI = BranchInst::Create(BB, NewBB);
Loop *L = nullptr;
BasicBlock *OldLatch = nullptr;
- // Splitting the predecessors of a loop header creates a preheader block.
+ // Splitting the predecessors of a loop header creates a preheader block.
if (LI && LI->isLoopHeader(BB)) {
L = LI->getLoopFor(BB);
- // Using the loop start line number prevents debuggers stepping into the
- // loop body for this instruction.
+ // Using the loop start line number prevents debuggers stepping into the
+ // loop body for this instruction.
BI->setDebugLoc(L->getStartLoc());
// If BB is the header of the Loop, it is possible that the loop is
@@ -884,40 +884,40 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
// to be applied to the new latch.
OldLatch = L->getLoopLatch();
} else
- BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
-
- // Move the edges from Preds to point to NewBB instead of BB.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
- // This is slightly more strict than necessary; the minimum requirement
- // is that there be no more than one indirectbr branching to BB. And
- // all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- assert(!isa<CallBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from a CallBrInst");
- Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
- }
-
- // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
- // node becomes an incoming value for BB's phi node. However, if the Preds
- // list is empty, we need to insert dummy entries into the PHI nodes in BB to
- // account for the newly created predecessor.
- if (Preds.empty()) {
- // Insert dummy values as the incoming value.
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
- cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
- }
-
- // Update DominatorTree, LoopInfo, and LCSSA analysis information.
- bool HasLoopExit = false;
+ BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB instead of BB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ assert(!isa<CallBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from a CallBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(BB, NewBB);
+ }
+
+ // Insert a new PHI node into NewBB for every PHI node in BB and that new PHI
+ // node becomes an incoming value for BB's phi node. However, if the Preds
+ // list is empty, we need to insert dummy entries into the PHI nodes in BB to
+ // account for the newly created predecessor.
+ if (Preds.empty()) {
+ // Insert dummy values as the incoming value.
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++I)
+ cast<PHINode>(I)->addIncoming(UndefValue::get(I->getType()), NewBB);
+ }
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ bool HasLoopExit = false;
UpdateAnalysisInformation(BB, NewBB, Preds, DTU, DT, LI, MSSAU, PreserveLCSSA,
- HasLoopExit);
-
- if (!Preds.empty()) {
- // Update the PHI nodes in BB with the values coming from NewBB.
- UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
- }
-
+ HasLoopExit);
+
+ if (!Preds.empty()) {
+ // Update the PHI nodes in BB with the values coming from NewBB.
+ UpdatePHINodes(BB, NewBB, Preds, BI, HasLoopExit);
+ }
+
if (OldLatch) {
BasicBlock *NewLatch = L->getLoopLatch();
if (NewLatch != OldLatch) {
@@ -927,9 +927,9 @@ SplitBlockPredecessorsImpl(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
}
}
- return NewBB;
-}
-
+ return NewBB;
+}
+
BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix, DominatorTree *DT,
@@ -953,103 +953,103 @@ static void SplitLandingPadPredecessorsImpl(
const char *Suffix2, SmallVectorImpl<BasicBlock *> &NewBBs,
DomTreeUpdater *DTU, DominatorTree *DT, LoopInfo *LI,
MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
-
- // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
- // it right before the original block.
- BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
- OrigBB->getName() + Suffix1,
- OrigBB->getParent(), OrigBB);
- NewBBs.push_back(NewBB1);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
- BI1->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
-
- // Move the edges from Preds to point to NewBB1 instead of OrigBB.
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
- // This is slightly more strict than necessary; the minimum requirement
- // is that there be no more than one indirectbr branching to BB. And
- // all BlockAddress uses would need to be updated.
- assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
- }
-
- bool HasLoopExit = false;
+ assert(OrigBB->isLandingPad() && "Trying to split a non-landing pad!");
+
+ // Create a new basic block for OrigBB's predecessors listed in Preds. Insert
+ // it right before the original block.
+ BasicBlock *NewBB1 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix1,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB1);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI1 = BranchInst::Create(OrigBB, NewBB1);
+ BI1->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the edges from Preds to point to NewBB1 instead of OrigBB.
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ // This is slightly more strict than necessary; the minimum requirement
+ // is that there be no more than one indirectbr branching to BB. And
+ // all BlockAddress uses would need to be updated.
+ assert(!isa<IndirectBrInst>(Preds[i]->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ Preds[i]->getTerminator()->replaceUsesOfWith(OrigBB, NewBB1);
+ }
+
+ bool HasLoopExit = false;
UpdateAnalysisInformation(OrigBB, NewBB1, Preds, DTU, DT, LI, MSSAU,
PreserveLCSSA, HasLoopExit);
-
- // Update the PHI nodes in OrigBB with the values coming from NewBB1.
- UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);
-
- // Move the remaining edges from OrigBB to point to NewBB2.
- SmallVector<BasicBlock*, 8> NewBB2Preds;
- for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
- i != e; ) {
- BasicBlock *Pred = *i++;
- if (Pred == NewBB1) continue;
- assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
- "Cannot split an edge from an IndirectBrInst");
- NewBB2Preds.push_back(Pred);
- e = pred_end(OrigBB);
- }
-
- BasicBlock *NewBB2 = nullptr;
- if (!NewBB2Preds.empty()) {
- // Create another basic block for the rest of OrigBB's predecessors.
- NewBB2 = BasicBlock::Create(OrigBB->getContext(),
- OrigBB->getName() + Suffix2,
- OrigBB->getParent(), OrigBB);
- NewBBs.push_back(NewBB2);
-
- // The new block unconditionally branches to the old block.
- BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
- BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
-
- // Move the remaining edges from OrigBB to point to NewBB2.
- for (BasicBlock *NewBB2Pred : NewBB2Preds)
- NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
-
- // Update DominatorTree, LoopInfo, and LCSSA analysis information.
- HasLoopExit = false;
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB1.
+ UpdatePHINodes(OrigBB, NewBB1, Preds, BI1, HasLoopExit);
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ SmallVector<BasicBlock*, 8> NewBB2Preds;
+ for (pred_iterator i = pred_begin(OrigBB), e = pred_end(OrigBB);
+ i != e; ) {
+ BasicBlock *Pred = *i++;
+ if (Pred == NewBB1) continue;
+ assert(!isa<IndirectBrInst>(Pred->getTerminator()) &&
+ "Cannot split an edge from an IndirectBrInst");
+ NewBB2Preds.push_back(Pred);
+ e = pred_end(OrigBB);
+ }
+
+ BasicBlock *NewBB2 = nullptr;
+ if (!NewBB2Preds.empty()) {
+ // Create another basic block for the rest of OrigBB's predecessors.
+ NewBB2 = BasicBlock::Create(OrigBB->getContext(),
+ OrigBB->getName() + Suffix2,
+ OrigBB->getParent(), OrigBB);
+ NewBBs.push_back(NewBB2);
+
+ // The new block unconditionally branches to the old block.
+ BranchInst *BI2 = BranchInst::Create(OrigBB, NewBB2);
+ BI2->setDebugLoc(OrigBB->getFirstNonPHI()->getDebugLoc());
+
+ // Move the remaining edges from OrigBB to point to NewBB2.
+ for (BasicBlock *NewBB2Pred : NewBB2Preds)
+ NewBB2Pred->getTerminator()->replaceUsesOfWith(OrigBB, NewBB2);
+
+ // Update DominatorTree, LoopInfo, and LCSSA analysis information.
+ HasLoopExit = false;
UpdateAnalysisInformation(OrigBB, NewBB2, NewBB2Preds, DTU, DT, LI, MSSAU,
- PreserveLCSSA, HasLoopExit);
-
- // Update the PHI nodes in OrigBB with the values coming from NewBB2.
- UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);
- }
-
- LandingPadInst *LPad = OrigBB->getLandingPadInst();
- Instruction *Clone1 = LPad->clone();
- Clone1->setName(Twine("lpad") + Suffix1);
- NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
-
- if (NewBB2) {
- Instruction *Clone2 = LPad->clone();
- Clone2->setName(Twine("lpad") + Suffix2);
- NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
-
- // Create a PHI node for the two cloned landingpad instructions only
- // if the original landingpad instruction has some uses.
- if (!LPad->use_empty()) {
- assert(!LPad->getType()->isTokenTy() &&
- "Split cannot be applied if LPad is token type. Otherwise an "
- "invalid PHINode of token type would be created.");
- PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
- PN->addIncoming(Clone1, NewBB1);
- PN->addIncoming(Clone2, NewBB2);
- LPad->replaceAllUsesWith(PN);
- }
- LPad->eraseFromParent();
- } else {
- // There is no second clone. Just replace the landing pad with the first
- // clone.
- LPad->replaceAllUsesWith(Clone1);
- LPad->eraseFromParent();
- }
-}
-
+ PreserveLCSSA, HasLoopExit);
+
+ // Update the PHI nodes in OrigBB with the values coming from NewBB2.
+ UpdatePHINodes(OrigBB, NewBB2, NewBB2Preds, BI2, HasLoopExit);
+ }
+
+ LandingPadInst *LPad = OrigBB->getLandingPadInst();
+ Instruction *Clone1 = LPad->clone();
+ Clone1->setName(Twine("lpad") + Suffix1);
+ NewBB1->getInstList().insert(NewBB1->getFirstInsertionPt(), Clone1);
+
+ if (NewBB2) {
+ Instruction *Clone2 = LPad->clone();
+ Clone2->setName(Twine("lpad") + Suffix2);
+ NewBB2->getInstList().insert(NewBB2->getFirstInsertionPt(), Clone2);
+
+ // Create a PHI node for the two cloned landingpad instructions only
+ // if the original landingpad instruction has some uses.
+ if (!LPad->use_empty()) {
+ assert(!LPad->getType()->isTokenTy() &&
+ "Split cannot be applied if LPad is token type. Otherwise an "
+ "invalid PHINode of token type would be created.");
+ PHINode *PN = PHINode::Create(LPad->getType(), 2, "lpad.phi", LPad);
+ PN->addIncoming(Clone1, NewBB1);
+ PN->addIncoming(Clone2, NewBB2);
+ LPad->replaceAllUsesWith(PN);
+ }
+ LPad->eraseFromParent();
+ } else {
+ // There is no second clone. Just replace the landing pad with the first
+ // clone.
+ LPad->replaceAllUsesWith(Clone1);
+ LPad->eraseFromParent();
+ }
+}
+
void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
ArrayRef<BasicBlock *> Preds,
const char *Suffix1, const char *Suffix2,
@@ -1073,73 +1073,73 @@ void llvm::SplitLandingPadPredecessors(BasicBlock *OrigBB,
PreserveLCSSA);
}
-ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
- BasicBlock *Pred,
- DomTreeUpdater *DTU) {
- Instruction *UncondBranch = Pred->getTerminator();
- // Clone the return and add it to the end of the predecessor.
- Instruction *NewRet = RI->clone();
- Pred->getInstList().push_back(NewRet);
-
- // If the return instruction returns a value, and if the value was a
- // PHI node in "BB", propagate the right value into the return.
- for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
- i != e; ++i) {
- Value *V = *i;
- Instruction *NewBC = nullptr;
- if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
- // Return value might be bitcasted. Clone and insert it before the
- // return instruction.
- V = BCI->getOperand(0);
- NewBC = BCI->clone();
- Pred->getInstList().insert(NewRet->getIterator(), NewBC);
- *i = NewBC;
- }
-
- Instruction *NewEV = nullptr;
- if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
- V = EVI->getOperand(0);
- NewEV = EVI->clone();
- if (NewBC) {
- NewBC->setOperand(0, NewEV);
- Pred->getInstList().insert(NewBC->getIterator(), NewEV);
- } else {
- Pred->getInstList().insert(NewRet->getIterator(), NewEV);
- *i = NewEV;
- }
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(V)) {
- if (PN->getParent() == BB) {
- if (NewEV) {
- NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred));
- } else if (NewBC)
- NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
- else
- *i = PN->getIncomingValueForBlock(Pred);
- }
- }
- }
-
- // Update any PHI nodes in the returning block to realize that we no
- // longer branch to them.
- BB->removePredecessor(Pred);
- UncondBranch->eraseFromParent();
-
- if (DTU)
- DTU->applyUpdates({{DominatorTree::Delete, Pred, BB}});
-
- return cast<ReturnInst>(NewRet);
-}
-
+ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
+ BasicBlock *Pred,
+ DomTreeUpdater *DTU) {
+ Instruction *UncondBranch = Pred->getTerminator();
+ // Clone the return and add it to the end of the predecessor.
+ Instruction *NewRet = RI->clone();
+ Pred->getInstList().push_back(NewRet);
+
+ // If the return instruction returns a value, and if the value was a
+ // PHI node in "BB", propagate the right value into the return.
+ for (User::op_iterator i = NewRet->op_begin(), e = NewRet->op_end();
+ i != e; ++i) {
+ Value *V = *i;
+ Instruction *NewBC = nullptr;
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(V)) {
+ // Return value might be bitcasted. Clone and insert it before the
+ // return instruction.
+ V = BCI->getOperand(0);
+ NewBC = BCI->clone();
+ Pred->getInstList().insert(NewRet->getIterator(), NewBC);
+ *i = NewBC;
+ }
+
+ Instruction *NewEV = nullptr;
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ V = EVI->getOperand(0);
+ NewEV = EVI->clone();
+ if (NewBC) {
+ NewBC->setOperand(0, NewEV);
+ Pred->getInstList().insert(NewBC->getIterator(), NewEV);
+ } else {
+ Pred->getInstList().insert(NewRet->getIterator(), NewEV);
+ *i = NewEV;
+ }
+ }
+
+ if (PHINode *PN = dyn_cast<PHINode>(V)) {
+ if (PN->getParent() == BB) {
+ if (NewEV) {
+ NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ } else if (NewBC)
+ NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ else
+ *i = PN->getIncomingValueForBlock(Pred);
+ }
+ }
+ }
+
+ // Update any PHI nodes in the returning block to realize that we no
+ // longer branch to them.
+ BB->removePredecessor(Pred);
+ UncondBranch->eraseFromParent();
+
+ if (DTU)
+ DTU->applyUpdates({{DominatorTree::Delete, Pred, BB}});
+
+ return cast<ReturnInst>(NewRet);
+}
+
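// Illustrative usage sketch (an assumption, not part of the original file):
// fold a return back into a predecessor that unconditionally branches to the
// returning block.  "RetBB" and "Pred" are hypothetical caller-provided
// blocks; Pred must end in an unconditional branch to RetBB, and RetBB's
// terminator must be a ReturnInst.
static ReturnInst *foldReturnExample(BasicBlock *RetBB, BasicBlock *Pred,
                                     DomTreeUpdater *DTU) {
  ReturnInst *RI = cast<ReturnInst>(RetBB->getTerminator());
  // Clones RI into Pred, rewrites any PHI-carried return value and erases the
  // unconditional branch; the Pred->RetBB dominator edge is deleted via DTU.
  return FoldReturnIntoUncondBranch(RI, RetBB, Pred, DTU);
}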
static Instruction *
SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
bool Unreachable, MDNode *BranchWeights,
DomTreeUpdater *DTU, DominatorTree *DT,
LoopInfo *LI, BasicBlock *ThenBlock) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
- BasicBlock *Head = SplitBefore->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
if (DTU) {
SmallSetVector<BasicBlock *, 8> UniqueSuccessorsOfHead(succ_begin(Tail),
succ_end(Tail));
@@ -1150,57 +1150,57 @@ SplitBlockAndInsertIfThenImpl(Value *Cond, Instruction *SplitBefore,
Updates.push_back({DominatorTree::Delete, Head, UniqueSuccessorOfHead});
}
}
- Instruction *HeadOldTerm = Head->getTerminator();
- LLVMContext &C = Head->getContext();
- Instruction *CheckTerm;
- bool CreateThenBlock = (ThenBlock == nullptr);
- if (CreateThenBlock) {
- ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- if (Unreachable)
- CheckTerm = new UnreachableInst(C, ThenBlock);
+ Instruction *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ Instruction *CheckTerm;
+ bool CreateThenBlock = (ThenBlock == nullptr);
+ if (CreateThenBlock) {
+ ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ if (Unreachable)
+ CheckTerm = new UnreachableInst(C, ThenBlock);
else {
- CheckTerm = BranchInst::Create(Tail, ThenBlock);
+ CheckTerm = BranchInst::Create(Tail, ThenBlock);
if (DTU)
Updates.push_back({DominatorTree::Insert, ThenBlock, Tail});
}
- CheckTerm->setDebugLoc(SplitBefore->getDebugLoc());
- } else
- CheckTerm = ThenBlock->getTerminator();
- BranchInst *HeadNewTerm =
+ CheckTerm->setDebugLoc(SplitBefore->getDebugLoc());
+ } else
+ CheckTerm = ThenBlock->getTerminator();
+ BranchInst *HeadNewTerm =
BranchInst::Create(/*ifTrue*/ ThenBlock, /*ifFalse*/ Tail, Cond);
if (DTU)
Updates.push_back({DominatorTree::Insert, Head, ThenBlock});
- HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
- ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
-
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+
if (DTU)
DTU->applyUpdates(Updates);
else if (DT) {
- if (DomTreeNode *OldNode = DT->getNode(Head)) {
- std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
- for (DomTreeNode *Child : Children)
- DT->changeImmediateDominator(Child, NewNode);
-
- // Head dominates ThenBlock.
- if (CreateThenBlock)
- DT->addNewBlock(ThenBlock, Head);
- else
- DT->changeImmediateDominator(ThenBlock, Head);
- }
- }
-
- if (LI) {
- if (Loop *L = LI->getLoopFor(Head)) {
- L->addBasicBlockToLoop(ThenBlock, *LI);
- L->addBasicBlockToLoop(Tail, *LI);
- }
- }
-
- return CheckTerm;
-}
-
+ if (DomTreeNode *OldNode = DT->getNode(Head)) {
+ std::vector<DomTreeNode *> Children(OldNode->begin(), OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(Tail, Head);
+ for (DomTreeNode *Child : Children)
+ DT->changeImmediateDominator(Child, NewNode);
+
+ // Head dominates ThenBlock.
+ if (CreateThenBlock)
+ DT->addNewBlock(ThenBlock, Head);
+ else
+ DT->changeImmediateDominator(ThenBlock, Head);
+ }
+ }
+
+ if (LI) {
+ if (Loop *L = LI->getLoopFor(Head)) {
+ L->addBasicBlockToLoop(ThenBlock, *LI);
+ L->addBasicBlockToLoop(Tail, *LI);
+ }
+ }
+
+ return CheckTerm;
+}
+
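// Illustrative usage sketch (an assumption, not part of the original file):
// a typical client of the SplitBlockAndInsertIfThen wrapper defined below,
// emitting a call to a hypothetical no-argument function "TraceFn" that only
// runs when the caller-provided "Flag" is true.
static void insertGuardedCallExample(Value *Flag, Instruction *InsertPt,
                                     Function *TraceFn) {
  // Splits the block at InsertPt, branches on Flag into a fresh then-block
  // that falls through to the tail, and returns that block's terminator.
  Instruction *CheckTerm =
      SplitBlockAndInsertIfThen(Flag, InsertPt, /*Unreachable=*/false);
  CallInst::Create(TraceFn, "", CheckTerm);
}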
Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
Instruction *SplitBefore,
bool Unreachable,
@@ -1222,358 +1222,358 @@ Instruction *llvm::SplitBlockAndInsertIfThen(Value *Cond,
ThenBlock);
}
-void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
- Instruction **ThenTerm,
- Instruction **ElseTerm,
- MDNode *BranchWeights) {
- BasicBlock *Head = SplitBefore->getParent();
- BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
- Instruction *HeadOldTerm = Head->getTerminator();
- LLVMContext &C = Head->getContext();
- BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
- *ThenTerm = BranchInst::Create(Tail, ThenBlock);
- (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc());
- *ElseTerm = BranchInst::Create(Tail, ElseBlock);
- (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc());
- BranchInst *HeadNewTerm =
- BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond);
- HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
- ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
-}
-
-Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
- BasicBlock *&IfFalse) {
- PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
- BasicBlock *Pred1 = nullptr;
- BasicBlock *Pred2 = nullptr;
-
- if (SomePHI) {
- if (SomePHI->getNumIncomingValues() != 2)
- return nullptr;
- Pred1 = SomePHI->getIncomingBlock(0);
- Pred2 = SomePHI->getIncomingBlock(1);
- } else {
- pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
- if (PI == PE) // No predecessor
- return nullptr;
- Pred1 = *PI++;
- if (PI == PE) // Only one predecessor
- return nullptr;
- Pred2 = *PI++;
- if (PI != PE) // More than two predecessors
- return nullptr;
- }
-
- // We can only handle branches. Other control flow will be lowered to
- // branches if possible anyway.
- BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
- BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
- if (!Pred1Br || !Pred2Br)
- return nullptr;
-
- // Eliminate code duplication by ensuring that Pred1Br is conditional if
- // either is.
- if (Pred2Br->isConditional()) {
- // If both branches are conditional, we don't have an "if statement". In
- // reality, we could transform this case, but since the condition will be
- // required anyway, we stand no chance of eliminating it, so the xform is
- // probably not profitable.
- if (Pred1Br->isConditional())
- return nullptr;
-
- std::swap(Pred1, Pred2);
- std::swap(Pred1Br, Pred2Br);
- }
-
- if (Pred1Br->isConditional()) {
- // The only thing we have to watch out for here is to make sure that Pred2
- // doesn't have incoming edges from other blocks. If it does, the condition
- // doesn't dominate BB.
- if (!Pred2->getSinglePredecessor())
- return nullptr;
-
- // If we found a conditional branch predecessor, make sure that it branches
- // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
- if (Pred1Br->getSuccessor(0) == BB &&
- Pred1Br->getSuccessor(1) == Pred2) {
- IfTrue = Pred1;
- IfFalse = Pred2;
- } else if (Pred1Br->getSuccessor(0) == Pred2 &&
- Pred1Br->getSuccessor(1) == BB) {
- IfTrue = Pred2;
- IfFalse = Pred1;
- } else {
- // We know that one arm of the conditional goes to BB, so the other must
- // go somewhere unrelated, and this must not be an "if statement".
- return nullptr;
- }
-
- return Pred1Br->getCondition();
- }
-
- // Ok, if we got here, both predecessors end with an unconditional branch to
- // BB. Don't panic! If both blocks only have a single (identical)
- // predecessor, and THAT is a conditional branch, then we're all ok!
- BasicBlock *CommonPred = Pred1->getSinglePredecessor();
- if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
- return nullptr;
-
- // Otherwise, if this is a conditional branch, then we can use it!
- BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
- if (!BI) return nullptr;
-
- assert(BI->isConditional() && "Two successors but not conditional?");
- if (BI->getSuccessor(0) == Pred1) {
- IfTrue = Pred1;
- IfFalse = Pred2;
- } else {
- IfTrue = Pred2;
- IfFalse = Pred1;
- }
- return BI->getCondition();
-}
-
-// After creating a control flow hub, the operands of PHINodes in an outgoing
-// block Out no longer match the predecessors of that block. Predecessors of Out
-// that are incoming blocks to the hub are now replaced by just one edge from
-// the hub. To match this new control flow, the corresponding values from each
-// PHINode must now be moved to a new PHINode in the first guard block of the hub.
-//
-// This operation cannot be performed with SSAUpdater, because it involves one
-// new use: If the block Out is in the list of Incoming blocks, then the newly
-// created PHI in the Hub will use itself along that edge from Out to Hub.
-static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
- const SetVector<BasicBlock *> &Incoming,
- BasicBlock *FirstGuardBlock) {
- auto I = Out->begin();
- while (I != Out->end() && isa<PHINode>(I)) {
- auto Phi = cast<PHINode>(I);
- auto NewPhi =
- PHINode::Create(Phi->getType(), Incoming.size(),
- Phi->getName() + ".moved", &FirstGuardBlock->back());
- for (auto In : Incoming) {
- Value *V = UndefValue::get(Phi->getType());
- if (In == Out) {
- V = NewPhi;
- } else if (Phi->getBasicBlockIndex(In) != -1) {
- V = Phi->removeIncomingValue(In, false);
- }
- NewPhi->addIncoming(V, In);
- }
- assert(NewPhi->getNumIncomingValues() == Incoming.size());
- if (Phi->getNumOperands() == 0) {
- Phi->replaceAllUsesWith(NewPhi);
- I = Phi->eraseFromParent();
- continue;
- }
- Phi->addIncoming(NewPhi, GuardBlock);
- ++I;
- }
-}
-
-using BBPredicates = DenseMap<BasicBlock *, PHINode *>;
-using BBSetVector = SetVector<BasicBlock *>;
-
-// Redirects the terminator of the incoming block to the first guard
-// block in the hub. The condition of the original terminator (if it
-// was conditional) and its original successors are returned as a
-// tuple <condition, succ0, succ1>. The function additionally filters
-// out successors that are not in the set of outgoing blocks.
-//
-// - condition is non-null iff the branch is conditional.
-// - Succ0 is non-null iff the sole/taken target is an outgoing block.
-// - Succ1 is non-null iff condition is non-null and the fallthrough
-// target is an outgoing block.
-static std::tuple<Value *, BasicBlock *, BasicBlock *>
-redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock,
- const BBSetVector &Outgoing) {
- auto Branch = cast<BranchInst>(BB->getTerminator());
- auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
-
- BasicBlock *Succ0 = Branch->getSuccessor(0);
- BasicBlock *Succ1 = nullptr;
- Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr;
-
- if (Branch->isUnconditional()) {
- Branch->setSuccessor(0, FirstGuardBlock);
- assert(Succ0);
- } else {
- Succ1 = Branch->getSuccessor(1);
- Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr;
- assert(Succ0 || Succ1);
- if (Succ0 && !Succ1) {
- Branch->setSuccessor(0, FirstGuardBlock);
- } else if (Succ1 && !Succ0) {
- Branch->setSuccessor(1, FirstGuardBlock);
- } else {
- Branch->eraseFromParent();
- BranchInst::Create(FirstGuardBlock, BB);
- }
- }
-
- assert(Succ0 || Succ1);
- return std::make_tuple(Condition, Succ0, Succ1);
-}
-
-// Capture the existing control flow as guard predicates, and redirect
-// control flow from every incoming block to the first guard block in
-// the hub.
-//
-// There is one guard predicate for each outgoing block OutBB. The
-// predicate is a PHINode with one input for each InBB which
-// represents whether the hub should transfer control flow to OutBB if
-// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub
-// evaluates them in the same order as the Outgoing set-vector, and
-// control branches to the first outgoing block whose predicate
-// evaluates to true.
-static void convertToGuardPredicates(
- BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates,
- SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming,
- const BBSetVector &Outgoing) {
- auto &Context = Incoming.front()->getContext();
- auto BoolTrue = ConstantInt::getTrue(Context);
- auto BoolFalse = ConstantInt::getFalse(Context);
-
- // The predicate for the last outgoing is trivially true, and so we
- // process only the first N-1 successors.
- for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n");
- auto Phi =
- PHINode::Create(Type::getInt1Ty(Context), Incoming.size(),
- StringRef("Guard.") + Out->getName(), FirstGuardBlock);
- GuardPredicates[Out] = Phi;
- }
-
- for (auto In : Incoming) {
- Value *Condition;
- BasicBlock *Succ0;
- BasicBlock *Succ1;
- std::tie(Condition, Succ0, Succ1) =
- redirectToHub(In, FirstGuardBlock, Outgoing);
-
- // Optimization: Consider an incoming block A with both successors
- // Succ0 and Succ1 in the set of outgoing blocks. The predicates
- // for Succ0 and Succ1 complement each other. If Succ0 is visited
- // first in the loop below, control will branch to Succ0 using the
- // corresponding predicate. But if that branch is not taken, then
- // control must reach Succ1, which means that the predicate for
- // Succ1 is always true.
- bool OneSuccessorDone = false;
- for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- auto Phi = GuardPredicates[Out];
- if (Out != Succ0 && Out != Succ1) {
- Phi->addIncoming(BoolFalse, In);
- continue;
- }
- // Optimization: When only one successor is an outgoing block,
- // the predicate is always true.
- if (!Succ0 || !Succ1 || OneSuccessorDone) {
- Phi->addIncoming(BoolTrue, In);
- continue;
- }
- assert(Succ0 && Succ1);
- OneSuccessorDone = true;
- if (Out == Succ0) {
- Phi->addIncoming(Condition, In);
- continue;
- }
- auto Inverted = invertCondition(Condition);
- DeletionCandidates.push_back(Condition);
- Phi->addIncoming(Inverted, In);
- }
- }
-}
-
-// For each outgoing block OutBB, create a guard block in the Hub. The
-// first guard block was already created outside, and available as the
-// first element in the vector of guard blocks.
-//
-// Each guard block terminates in a conditional branch that transfers
-// control to the corresponding outgoing block or the next guard
-// block. The last guard block has two outgoing blocks as successors
-// since the condition for the final outgoing block is trivially
-// true. So we create one less block (including the first guard block)
-// than the number of outgoing blocks.
-static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks,
- Function *F, const BBSetVector &Outgoing,
- BBPredicates &GuardPredicates, StringRef Prefix) {
- for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) {
- GuardBlocks.push_back(
- BasicBlock::Create(F->getContext(), Prefix + ".guard", F));
- }
- assert(GuardBlocks.size() == GuardPredicates.size());
-
- // To help keep the loop simple, temporarily append the last
- // outgoing block to the list of guard blocks.
- GuardBlocks.push_back(Outgoing.back());
-
- for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) {
- auto Out = Outgoing[i];
- assert(GuardPredicates.count(Out));
- BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out],
- GuardBlocks[i]);
- }
-
- // Remove the last block from the guard list.
- GuardBlocks.pop_back();
-}
-
-BasicBlock *llvm::CreateControlFlowHub(
- DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
- const BBSetVector &Incoming, const BBSetVector &Outgoing,
- const StringRef Prefix) {
- auto F = Incoming.front()->getParent();
- auto FirstGuardBlock =
- BasicBlock::Create(F->getContext(), Prefix + ".guard", F);
-
- SmallVector<DominatorTree::UpdateType, 16> Updates;
- if (DTU) {
- for (auto In : Incoming) {
+void llvm::SplitBlockAndInsertIfThenElse(Value *Cond, Instruction *SplitBefore,
+ Instruction **ThenTerm,
+ Instruction **ElseTerm,
+ MDNode *BranchWeights) {
+ BasicBlock *Head = SplitBefore->getParent();
+ BasicBlock *Tail = Head->splitBasicBlock(SplitBefore->getIterator());
+ Instruction *HeadOldTerm = Head->getTerminator();
+ LLVMContext &C = Head->getContext();
+ BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ BasicBlock *ElseBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+ *ThenTerm = BranchInst::Create(Tail, ThenBlock);
+ (*ThenTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ *ElseTerm = BranchInst::Create(Tail, ElseBlock);
+ (*ElseTerm)->setDebugLoc(SplitBefore->getDebugLoc());
+ BranchInst *HeadNewTerm =
+ BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/ElseBlock, Cond);
+ HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+ ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+}
+
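// Illustrative usage sketch (an assumption, not part of the original file):
// build a diamond around SplitBefore and merge two caller-provided values,
// "TrueV" and "FalseV", with a PHI in the tail block.
static PHINode *insertDiamondExample(Value *Cond, Instruction *SplitBefore,
                                     Value *TrueV, Value *FalseV) {
  Instruction *ThenTerm = nullptr, *ElseTerm = nullptr;
  SplitBlockAndInsertIfThenElse(Cond, SplitBefore, &ThenTerm, &ElseTerm);
  // SplitBefore is now the first instruction of the tail block, whose only
  // predecessors are the new then- and else-blocks.
  PHINode *PN = PHINode::Create(TrueV->getType(), 2, "merge", SplitBefore);
  PN->addIncoming(TrueV, ThenTerm->getParent());
  PN->addIncoming(FalseV, ElseTerm->getParent());
  return PN;
}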
+Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
+ BasicBlock *&IfFalse) {
+ PHINode *SomePHI = dyn_cast<PHINode>(BB->begin());
+ BasicBlock *Pred1 = nullptr;
+ BasicBlock *Pred2 = nullptr;
+
+ if (SomePHI) {
+ if (SomePHI->getNumIncomingValues() != 2)
+ return nullptr;
+ Pred1 = SomePHI->getIncomingBlock(0);
+ Pred2 = SomePHI->getIncomingBlock(1);
+ } else {
+ pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+ if (PI == PE) // No predecessor
+ return nullptr;
+ Pred1 = *PI++;
+ if (PI == PE) // Only one predecessor
+ return nullptr;
+ Pred2 = *PI++;
+ if (PI != PE) // More than two predecessors
+ return nullptr;
+ }
+
+ // We can only handle branches. Other control flow will be lowered to
+ // branches if possible anyway.
+ BranchInst *Pred1Br = dyn_cast<BranchInst>(Pred1->getTerminator());
+ BranchInst *Pred2Br = dyn_cast<BranchInst>(Pred2->getTerminator());
+ if (!Pred1Br || !Pred2Br)
+ return nullptr;
+
+ // Eliminate code duplication by ensuring that Pred1Br is conditional if
+ // either is.
+ if (Pred2Br->isConditional()) {
+ // If both branches are conditional, we don't have an "if statement". In
+ // reality, we could transform this case, but since the condition will be
+ // required anyway, we stand no chance of eliminating it, so the xform is
+ // probably not profitable.
+ if (Pred1Br->isConditional())
+ return nullptr;
+
+ std::swap(Pred1, Pred2);
+ std::swap(Pred1Br, Pred2Br);
+ }
+
+ if (Pred1Br->isConditional()) {
+ // The only thing we have to watch out for here is to make sure that Pred2
+ // doesn't have incoming edges from other blocks. If it does, the condition
+ // doesn't dominate BB.
+ if (!Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // If we found a conditional branch predecessor, make sure that it branches
+ // to BB and Pred2Br. If it doesn't, this isn't an "if statement".
+ if (Pred1Br->getSuccessor(0) == BB &&
+ Pred1Br->getSuccessor(1) == Pred2) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else if (Pred1Br->getSuccessor(0) == Pred2 &&
+ Pred1Br->getSuccessor(1) == BB) {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ } else {
+ // We know that one arm of the conditional goes to BB, so the other must
+ // go somewhere unrelated, and this must not be an "if statement".
+ return nullptr;
+ }
+
+ return Pred1Br->getCondition();
+ }
+
+ // Ok, if we got here, both predecessors end with an unconditional branch to
+ // BB. Don't panic! If both blocks only have a single (identical)
+ // predecessor, and THAT is a conditional branch, then we're all ok!
+ BasicBlock *CommonPred = Pred1->getSinglePredecessor();
+ if (CommonPred == nullptr || CommonPred != Pred2->getSinglePredecessor())
+ return nullptr;
+
+ // Otherwise, if this is a conditional branch, then we can use it!
+ BranchInst *BI = dyn_cast<BranchInst>(CommonPred->getTerminator());
+ if (!BI) return nullptr;
+
+ assert(BI->isConditional() && "Two successors but not conditional?");
+ if (BI->getSuccessor(0) == Pred1) {
+ IfTrue = Pred1;
+ IfFalse = Pred2;
+ } else {
+ IfTrue = Pred2;
+ IfFalse = Pred1;
+ }
+ return BI->getCondition();
+}
+
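// Illustrative usage sketch (an assumption, not part of the original file):
// detect whether a caller-provided "MergeBB" is the join point of an
// if-diamond and, if so, report the controlling condition and the two
// guarded predecessors.
static Value *matchIfDiamondExample(BasicBlock *MergeBB) {
  BasicBlock *IfTrue = nullptr, *IfFalse = nullptr;
  if (Value *Cond = GetIfCondition(MergeBB, IfTrue, IfFalse)) {
    LLVM_DEBUG(dbgs() << "condition " << *Cond << " selects between "
                      << IfTrue->getName() << " and " << IfFalse->getName()
                      << "\n");
    return Cond;
  }
  return nullptr;
}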
+// After creating a control flow hub, the operands of PHINodes in an outgoing
+// block Out no longer match the predecessors of that block. Predecessors of Out
+// that are incoming blocks to the hub are now replaced by just one edge from
+// the hub. To match this new control flow, the corresponding values from each
+// PHINode must now be moved to a new PHINode in the first guard block of the hub.
+//
+// This operation cannot be performed with SSAUpdater, because it involves one
+// new use: If the block Out is in the list of Incoming blocks, then the newly
+// created PHI in the Hub will use itself along that edge from Out to Hub.
+static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *FirstGuardBlock) {
+ auto I = Out->begin();
+ while (I != Out->end() && isa<PHINode>(I)) {
+ auto Phi = cast<PHINode>(I);
+ auto NewPhi =
+ PHINode::Create(Phi->getType(), Incoming.size(),
+ Phi->getName() + ".moved", &FirstGuardBlock->back());
+ for (auto In : Incoming) {
+ Value *V = UndefValue::get(Phi->getType());
+ if (In == Out) {
+ V = NewPhi;
+ } else if (Phi->getBasicBlockIndex(In) != -1) {
+ V = Phi->removeIncomingValue(In, false);
+ }
+ NewPhi->addIncoming(V, In);
+ }
+ assert(NewPhi->getNumIncomingValues() == Incoming.size());
+ if (Phi->getNumOperands() == 0) {
+ Phi->replaceAllUsesWith(NewPhi);
+ I = Phi->eraseFromParent();
+ continue;
+ }
+ Phi->addIncoming(NewPhi, GuardBlock);
+ ++I;
+ }
+}
+
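// Worked example (illustrative, with assumed names): if Out begins with
//   %p = phi i32 [ %v, %A ], [ %w, %B ]
// and only %A is rerouted through the hub, then %p.moved is created in the
// first guard block with the entry [ %v, %A ] (and undef for any other
// incoming block), %A's entry is removed from %p, and %p gains an entry for
// the guard block: %p = phi i32 [ %w, %B ], [ %p.moved, %GuardBlock ].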
+using BBPredicates = DenseMap<BasicBlock *, PHINode *>;
+using BBSetVector = SetVector<BasicBlock *>;
+
+// Redirects the terminator of the incoming block to the first guard
+// block in the hub. The condition of the original terminator (if it
+// was conditional) and its original successors are returned as a
+// tuple <condition, succ0, succ1>. The function additionally filters
+// out successors that are not in the set of outgoing blocks.
+//
+// - condition is non-null iff the branch is conditional.
+// - Succ0 is non-null iff the sole/taken target is an outgoing block.
+// - Succ1 is non-null iff condition is non-null and the fallthrough
+// target is an outgoing block.
+static std::tuple<Value *, BasicBlock *, BasicBlock *>
+redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock,
+ const BBSetVector &Outgoing) {
+ auto Branch = cast<BranchInst>(BB->getTerminator());
+ auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
+
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ BasicBlock *Succ1 = nullptr;
+ Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr;
+
+ if (Branch->isUnconditional()) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ assert(Succ0);
+ } else {
+ Succ1 = Branch->getSuccessor(1);
+ Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr;
+ assert(Succ0 || Succ1);
+ if (Succ0 && !Succ1) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ } else if (Succ1 && !Succ0) {
+ Branch->setSuccessor(1, FirstGuardBlock);
+ } else {
+ Branch->eraseFromParent();
+ BranchInst::Create(FirstGuardBlock, BB);
+ }
+ }
+
+ assert(Succ0 || Succ1);
+ return std::make_tuple(Condition, Succ0, Succ1);
+}
+
+// Capture the existing control flow as guard predicates, and redirect
+// control flow from every incoming block to the first guard block in
+// the hub.
+//
+// There is one guard predicate for each outgoing block OutBB. The
+// predicate is a PHINode with one input for each InBB which
+// represents whether the hub should transfer control flow to OutBB if
+// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub
+// evaluates them in the same order as the Outgoing set-vector, and
+// control branches to the first outgoing block whose predicate
+// evaluates to true.
+static void convertToGuardPredicates(
+ BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates,
+ SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming,
+ const BBSetVector &Outgoing) {
+ auto &Context = Incoming.front()->getContext();
+ auto BoolTrue = ConstantInt::getTrue(Context);
+ auto BoolFalse = ConstantInt::getFalse(Context);
+
+ // The predicate for the last outgoing is trivially true, and so we
+ // process only the first N-1 successors.
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n");
+ auto Phi =
+ PHINode::Create(Type::getInt1Ty(Context), Incoming.size(),
+ StringRef("Guard.") + Out->getName(), FirstGuardBlock);
+ GuardPredicates[Out] = Phi;
+ }
+
+ for (auto In : Incoming) {
+ Value *Condition;
+ BasicBlock *Succ0;
+ BasicBlock *Succ1;
+ std::tie(Condition, Succ0, Succ1) =
+ redirectToHub(In, FirstGuardBlock, Outgoing);
+
+ // Optimization: Consider an incoming block A with both successors
+ // Succ0 and Succ1 in the set of outgoing blocks. The predicates
+ // for Succ0 and Succ1 complement each other. If Succ0 is visited
+ // first in the loop below, control will branch to Succ0 using the
+ // corresponding predicate. But if that branch is not taken, then
+ // control must reach Succ1, which means that the predicate for
+ // Succ1 is always true.
+ bool OneSuccessorDone = false;
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ auto Phi = GuardPredicates[Out];
+ if (Out != Succ0 && Out != Succ1) {
+ Phi->addIncoming(BoolFalse, In);
+ continue;
+ }
+ // Optimization: When only one successor is an outgoing block,
+ // the predicate is always true.
+ if (!Succ0 || !Succ1 || OneSuccessorDone) {
+ Phi->addIncoming(BoolTrue, In);
+ continue;
+ }
+ assert(Succ0 && Succ1);
+ OneSuccessorDone = true;
+ if (Out == Succ0) {
+ Phi->addIncoming(Condition, In);
+ continue;
+ }
+ auto Inverted = invertCondition(Condition);
+ DeletionCandidates.push_back(Condition);
+ Phi->addIncoming(Inverted, In);
+ }
+ }
+}
+
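// Worked example (illustrative, with assumed names): for Incoming = {A, B}
// and Outgoing = {X, Y, Z}, guard PHIs Guard.X and Guard.Y are created (Z is
// the trivially-true last successor).  If A ends in "br i1 %c, %X, %Y", then
// Guard.X receives %c and Guard.Y receives true along the edge from A (the
// complement optimization above), while an unconditional "br %X" in B
// contributes true to Guard.X and false to Guard.Y.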
+// For each outgoing block OutBB, create a guard block in the Hub. The
+// first guard block was already created outside, and available as the
+// first element in the vector of guard blocks.
+//
+// Each guard block terminates in a conditional branch that transfers
+// control to the corresponding outgoing block or the next guard
+// block. The last guard block has two outgoing blocks as successors
+// since the condition for the final outgoing block is trivially
+// true. So we create one less block (including the first guard block)
+// than the number of outgoing blocks.
+static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ Function *F, const BBSetVector &Outgoing,
+ BBPredicates &GuardPredicates, StringRef Prefix) {
+ for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) {
+ GuardBlocks.push_back(
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F));
+ }
+ assert(GuardBlocks.size() == GuardPredicates.size());
+
+ // To help keep the loop simple, temporarily append the last
+ // outgoing block to the list of guard blocks.
+ GuardBlocks.push_back(Outgoing.back());
+
+ for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ assert(GuardPredicates.count(Out));
+ BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out],
+ GuardBlocks[i]);
+ }
+
+ // Remove the last block from the guard list.
+ GuardBlocks.pop_back();
+}
+
+BasicBlock *llvm::CreateControlFlowHub(
+ DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ const BBSetVector &Incoming, const BBSetVector &Outgoing,
+ const StringRef Prefix) {
+ auto F = Incoming.front()->getParent();
+ auto FirstGuardBlock =
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F);
+
+ SmallVector<DominatorTree::UpdateType, 16> Updates;
+ if (DTU) {
+ for (auto In : Incoming) {
Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock});
- for (auto Succ : successors(In)) {
- if (Outgoing.count(Succ))
- Updates.push_back({DominatorTree::Delete, In, Succ});
- }
- }
- }
-
- BBPredicates GuardPredicates;
- SmallVector<WeakVH, 8> DeletionCandidates;
- convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates,
- Incoming, Outgoing);
-
- GuardBlocks.push_back(FirstGuardBlock);
- createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix);
-
- // Update the PHINodes in each outgoing block to match the new control flow.
- for (int i = 0, e = GuardBlocks.size(); i != e; ++i) {
- reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock);
- }
- reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock);
-
- if (DTU) {
- int NumGuards = GuardBlocks.size();
- assert((int)Outgoing.size() == NumGuards + 1);
- for (int i = 0; i != NumGuards - 1; ++i) {
- Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]});
- Updates.push_back(
- {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]});
- }
- Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
- Outgoing[NumGuards - 1]});
- Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
- Outgoing[NumGuards]});
- DTU->applyUpdates(Updates);
- }
-
- for (auto I : DeletionCandidates) {
- if (I->use_empty())
- if (auto Inst = dyn_cast_or_null<Instruction>(I))
- Inst->eraseFromParent();
- }
-
- return FirstGuardBlock;
-}
+ for (auto Succ : successors(In)) {
+ if (Outgoing.count(Succ))
+ Updates.push_back({DominatorTree::Delete, In, Succ});
+ }
+ }
+ }
+
+ BBPredicates GuardPredicates;
+ SmallVector<WeakVH, 8> DeletionCandidates;
+ convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates,
+ Incoming, Outgoing);
+
+ GuardBlocks.push_back(FirstGuardBlock);
+ createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix);
+
+ // Update the PHINodes in each outgoing block to match the new control flow.
+ for (int i = 0, e = GuardBlocks.size(); i != e; ++i) {
+ reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock);
+ }
+ reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock);
+
+ if (DTU) {
+ int NumGuards = GuardBlocks.size();
+ assert((int)Outgoing.size() == NumGuards + 1);
+ for (int i = 0; i != NumGuards - 1; ++i) {
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]});
+ Updates.push_back(
+ {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]});
+ }
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards - 1]});
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards]});
+ DTU->applyUpdates(Updates);
+ }
+
+ for (auto I : DeletionCandidates) {
+ if (I->use_empty())
+ if (auto Inst = dyn_cast_or_null<Instruction>(I))
+ Inst->eraseFromParent();
+ }
+
+ return FirstGuardBlock;
+}
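// Illustrative usage sketch (an assumption, not part of the original file):
// route two exiting blocks through a single hub, the way a structurizer-style
// client might.  "Exiting1"/"Exiting2" and "ExitA"/"ExitB" are hypothetical
// caller-provided blocks.
static BasicBlock *buildHubExample(DomTreeUpdater *DTU, BasicBlock *Exiting1,
                                   BasicBlock *Exiting2, BasicBlock *ExitA,
                                   BasicBlock *ExitB) {
  BBSetVector Incoming, Outgoing;
  Incoming.insert(Exiting1);
  Incoming.insert(Exiting2);
  Outgoing.insert(ExitA);
  Outgoing.insert(ExitB);
  SmallVector<BasicBlock *, 8> GuardBlocks;
  // Every edge from an incoming to an outgoing block now passes through the
  // returned block, which dispatches onward via the guard predicates.
  return CreateControlFlowHub(DTU, GuardBlocks, Incoming, Outgoing, "hub");
}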
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
index bb1438e94b..939a1a3a86 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -1,203 +1,203 @@
-//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
-// inserting a dummy basic block. This pass may be "required" by passes that
-// cannot deal with critical edges. For this usage, the structure type is
-// forward declared. This pass obviously invalidates the CFG, but can update
-// dominator trees.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "break-crit-edges"
-
-STATISTIC(NumBroken, "Number of blocks inserted");
-
-namespace {
- struct BreakCriticalEdges : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- BreakCriticalEdges() : FunctionPass(ID) {
- initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-
- auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
- auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
-
- auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
- auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
- unsigned N =
- SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
- NumBroken += N;
- return N > 0;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- // No loop canonicalization guarantees are broken by this pass.
- AU.addPreservedID(LoopSimplifyID);
- }
- };
-}
-
-char BreakCriticalEdges::ID = 0;
-INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
- "Break critical edges in CFG", false, false)
-
-// Publicly exposed interface to pass...
-char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
-FunctionPass *llvm::createBreakCriticalEdgesPass() {
- return new BreakCriticalEdges();
-}
-
-PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
- auto *LI = AM.getCachedResult<LoopAnalysis>(F);
- unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
- NumBroken += N;
- if (N == 0)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- return PA;
-}
-
-//===----------------------------------------------------------------------===//
-// Implementation of the external critical edge manipulation functions
-//===----------------------------------------------------------------------===//
-
-/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
-/// exit block. This function inserts the new PHIs, as needed. Preds is a list
-/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
-/// the old loop exit, now the successor of SplitBB.
-static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
- BasicBlock *SplitBB,
- BasicBlock *DestBB) {
- // SplitBB shouldn't have anything non-trivial in it yet.
- assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
- SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
-
- // For each PHI in the destination block.
- for (PHINode &PN : DestBB->phis()) {
- unsigned Idx = PN.getBasicBlockIndex(SplitBB);
- Value *V = PN.getIncomingValue(Idx);
-
- // If the input is a PHI which already satisfies LCSSA, don't create
- // a new one.
- if (const PHINode *VP = dyn_cast<PHINode>(V))
- if (VP->getParent() == SplitBB)
- continue;
-
- // Otherwise a new PHI is needed. Create one and populate it.
- PHINode *NewPN = PHINode::Create(
- PN.getType(), Preds.size(), "split",
- SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
- for (unsigned i = 0, e = Preds.size(); i != e; ++i)
- NewPN->addIncoming(V, Preds[i]);
-
- // Update the original PHI.
- PN.setIncomingValue(Idx, NewPN);
- }
-}
-
+//===- BreakCriticalEdges.cpp - Critical Edge Elimination Pass ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BreakCriticalEdges pass - Break all of the critical edges in the CFG by
+// inserting a dummy basic block. This pass may be "required" by passes that
+// cannot deal with critical edges. For this usage, the structure type is
+// forward declared. This pass obviously invalidates the CFG, but can update
+// dominator trees.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "break-crit-edges"
+
+STATISTIC(NumBroken, "Number of blocks inserted");
+
+namespace {
+ struct BreakCriticalEdges : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ BreakCriticalEdges() : FunctionPass(ID) {
+ initializeBreakCriticalEdgesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
+ auto *PDTWP = getAnalysisIfAvailable<PostDominatorTreeWrapperPass>();
+ auto *PDT = PDTWP ? &PDTWP->getPostDomTree() : nullptr;
+
+ auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
+ auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
+ unsigned N =
+ SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI, nullptr, PDT));
+ NumBroken += N;
+ return N > 0;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // No loop canonicalization guarantees are broken by this pass.
+ AU.addPreservedID(LoopSimplifyID);
+ }
+ };
+}
+
+char BreakCriticalEdges::ID = 0;
+INITIALIZE_PASS(BreakCriticalEdges, "break-crit-edges",
+ "Break critical edges in CFG", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::BreakCriticalEdgesID = BreakCriticalEdges::ID;
+FunctionPass *llvm::createBreakCriticalEdgesPass() {
+ return new BreakCriticalEdges();
+}
+
+PreservedAnalyses BreakCriticalEdgesPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto *DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ auto *LI = AM.getCachedResult<LoopAnalysis>(F);
+ unsigned N = SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions(DT, LI));
+ NumBroken += N;
+ if (N == 0)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ return PA;
+}
+
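// Illustrative usage sketch (an assumption, not part of the original file):
// running the pass in a new-pass-manager function pipeline.  The caller is
// assumed to have registered the function analyses in the usual way (for
// example via PassBuilder::registerFunctionAnalyses).
static PreservedAnalyses runBreakCritEdgesExample(Function &F,
                                                  FunctionAnalysisManager &FAM) {
  FunctionPassManager FPM;
  FPM.addPass(BreakCriticalEdgesPass());
  return FPM.run(F, FAM);
}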
+//===----------------------------------------------------------------------===//
+// Implementation of the external critical edge manipulation functions
+//===----------------------------------------------------------------------===//
+
+/// When a loop exit edge is split, LCSSA form may require new PHIs in the new
+/// exit block. This function inserts the new PHIs, as needed. Preds is a list
+/// of preds inside the loop, SplitBB is the new loop exit block, and DestBB is
+/// the old loop exit, now the successor of SplitBB.
+static void createPHIsForSplitLoopExit(ArrayRef<BasicBlock *> Preds,
+ BasicBlock *SplitBB,
+ BasicBlock *DestBB) {
+ // SplitBB shouldn't have anything non-trivial in it yet.
+ assert((SplitBB->getFirstNonPHI() == SplitBB->getTerminator() ||
+ SplitBB->isLandingPad()) && "SplitBB has non-PHI nodes!");
+
+ // For each PHI in the destination block.
+ for (PHINode &PN : DestBB->phis()) {
+ unsigned Idx = PN.getBasicBlockIndex(SplitBB);
+ Value *V = PN.getIncomingValue(Idx);
+
+ // If the input is a PHI which already satisfies LCSSA, don't create
+ // a new one.
+ if (const PHINode *VP = dyn_cast<PHINode>(V))
+ if (VP->getParent() == SplitBB)
+ continue;
+
+ // Otherwise a new PHI is needed. Create one and populate it.
+ PHINode *NewPN = PHINode::Create(
+ PN.getType(), Preds.size(), "split",
+ SplitBB->isLandingPad() ? &SplitBB->front() : SplitBB->getTerminator());
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i)
+ NewPN->addIncoming(V, Preds[i]);
+
+ // Update the original PHI.
+ PN.setIncomingValue(Idx, NewPN);
+ }
+}
+
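// Illustrative usage sketch (an assumption, not part of the original file):
// a loop pass calling the SplitCriticalEdge implementation below while asking
// it to keep the dominator tree, LoopInfo and LCSSA form up to date.
static BasicBlock *splitEdgeExample(Instruction *TI, unsigned SuccNum,
                                    DominatorTree *DT, LoopInfo *LI) {
  CriticalEdgeSplittingOptions Options(DT, LI);
  Options.setPreserveLCSSA();
  // Returns the inserted block, or nullptr if the edge is not critical or
  // cannot be split here (e.g. the destination is an EH pad).
  return SplitCriticalEdge(TI, SuccNum, Options);
}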
BasicBlock *llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
const CriticalEdgeSplittingOptions &Options,
const Twine &BBName) {
- if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
- return nullptr;
-
- assert(!isa<IndirectBrInst>(TI) &&
- "Cannot split critical edge from IndirectBrInst");
-
- BasicBlock *TIBB = TI->getParent();
- BasicBlock *DestBB = TI->getSuccessor(SuccNum);
-
- // Splitting the critical edge to a pad block is non-trivial. Don't do
- // it in this generic function.
- if (DestBB->isEHPad()) return nullptr;
-
- if (Options.IgnoreUnreachableDests &&
- isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
- return nullptr;
-
- auto *LI = Options.LI;
- SmallVector<BasicBlock *, 4> LoopPreds;
- // Check if extra modifications will be required to preserve loop-simplify
- // form after splitting. If it would require splitting blocks with IndirectBr
+ if (!isCriticalEdge(TI, SuccNum, Options.MergeIdenticalEdges))
+ return nullptr;
+
+ assert(!isa<IndirectBrInst>(TI) &&
+ "Cannot split critical edge from IndirectBrInst");
+
+ BasicBlock *TIBB = TI->getParent();
+ BasicBlock *DestBB = TI->getSuccessor(SuccNum);
+
+ // Splitting the critical edge to a pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (DestBB->isEHPad()) return nullptr;
+
+ if (Options.IgnoreUnreachableDests &&
+ isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
+ return nullptr;
+
+ auto *LI = Options.LI;
+ SmallVector<BasicBlock *, 4> LoopPreds;
+ // Check if extra modifications will be required to preserve loop-simplify
+ // form after splitting. If it would require splitting blocks with IndirectBr
// or CallBr terminators, bail out if preserving loop-simplify form is
// requested.
- if (LI) {
- if (Loop *TIL = LI->getLoopFor(TIBB)) {
-
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+
// The only way that we can break LoopSimplify form by splitting a
// critical edge is if after the split there exists some edge from TIL to
// DestBB *and* the only edge into DestBB from outside of TIL is that of
- // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
- // is the new exit block and it has no non-loop predecessors. If the
- // second isn't true, then DestBB was not in LoopSimplify form prior to
- // the split as it had a non-loop predecessor. In both of these cases,
- // the predecessor must be directly in TIL, not in a subloop, or again
- // LoopSimplify doesn't hold.
+ // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
+ // is the new exit block and it has no non-loop predecessors. If the
+ // second isn't true, then DestBB was not in LoopSimplify form prior to
+ // the split as it had a non-loop predecessor. In both of these cases,
+ // the predecessor must be directly in TIL, not in a subloop, or again
+ // LoopSimplify doesn't hold.
for (BasicBlock *P : predecessors(DestBB)) {
- if (P == TIBB)
- continue; // The new block is known.
- if (LI->getLoopFor(P) != TIL) {
- // No need to re-simplify, it wasn't to start with.
- LoopPreds.clear();
- break;
- }
- LoopPreds.push_back(P);
- }
- // Loop-simplify form can be preserved, if we can split all in-loop
- // predecessors.
- if (any_of(LoopPreds, [](BasicBlock *Pred) {
+ if (P == TIBB)
+ continue; // The new block is known.
+ if (LI->getLoopFor(P) != TIL) {
+ // No need to re-simplify, it wasn't to start with.
+ LoopPreds.clear();
+ break;
+ }
+ LoopPreds.push_back(P);
+ }
+ // Loop-simplify form can be preserved, if we can split all in-loop
+ // predecessors.
+ if (any_of(LoopPreds, [](BasicBlock *Pred) {
const Instruction *T = Pred->getTerminator();
if (const auto *CBR = dyn_cast<CallBrInst>(T))
return CBR->getDefaultDest() != Pred;
return isa<IndirectBrInst>(T);
- })) {
- if (Options.PreserveLoopSimplify)
- return nullptr;
- LoopPreds.clear();
- }
- }
- }
-
- // Create a new basic block, linking it into the CFG.
+ })) {
+ if (Options.PreserveLoopSimplify)
+ return nullptr;
+ LoopPreds.clear();
+ }
+ }
+ }
+
+ // Create a new basic block, linking it into the CFG.
BasicBlock *NewBB = nullptr;
if (BBName.str() != "")
NewBB = BasicBlock::Create(TI->getContext(), BBName);
@@ -205,297 +205,297 @@ BasicBlock *llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
NewBB = BasicBlock::Create(TI->getContext(), TIBB->getName() + "." +
DestBB->getName() +
"_crit_edge");
- // Create our unconditional branch.
- BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
- NewBI->setDebugLoc(TI->getDebugLoc());
-
- // Insert the block into the function... right after the block TI lives in.
- Function &F = *TIBB->getParent();
- Function::iterator FBBI = TIBB->getIterator();
- F.getBasicBlockList().insert(++FBBI, NewBB);
-
- // Branch to the new block, breaking the edge.
- TI->setSuccessor(SuccNum, NewBB);
-
- // If there are any PHI nodes in DestBB, we need to update them so that they
- // merge incoming values from NewBB instead of from TIBB.
- {
- unsigned BBIdx = 0;
- for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
- // We no longer enter through TIBB, now we come in through NewBB.
- // Revector exactly one entry in the PHI node that used to come from
- // TIBB to come from NewBB.
- PHINode *PN = cast<PHINode>(I);
-
- // Reuse the previous value of BBIdx if it lines up. In cases where we
- // have multiple phi nodes with *lots* of predecessors, this is a speed
- // win because we don't have to scan the PHI looking for TIBB. This
- // happens because the BB list of PHI nodes are usually in the same
- // order.
- if (PN->getIncomingBlock(BBIdx) != TIBB)
- BBIdx = PN->getBasicBlockIndex(TIBB);
- PN->setIncomingBlock(BBIdx, NewBB);
- }
- }
-
- // If there are any other edges from TIBB to DestBB, update those to go
- // through the split block, making those edges non-critical as well (and
- // reducing the number of phi entries in the DestBB if relevant).
- if (Options.MergeIdenticalEdges) {
- for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
- if (TI->getSuccessor(i) != DestBB) continue;
-
- // Remove an entry for TIBB from DestBB phi nodes.
- DestBB->removePredecessor(TIBB, Options.KeepOneInputPHIs);
-
- // We found another edge to DestBB, go to NewBB instead.
- TI->setSuccessor(i, NewBB);
- }
- }
-
- // If we have nothing to update, just return.
- auto *DT = Options.DT;
- auto *PDT = Options.PDT;
- auto *MSSAU = Options.MSSAU;
- if (MSSAU)
- MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
- DestBB, NewBB, {TIBB}, Options.MergeIdenticalEdges);
-
- if (!DT && !PDT && !LI)
- return NewBB;
-
- if (DT || PDT) {
- // Update the DominatorTree.
- // ---> NewBB -----\
- // / V
- // TIBB -------\\------> DestBB
- //
- // First, inform the DT about the new path from TIBB to DestBB via NewBB,
- // then delete the old edge from TIBB to DestBB. By doing this in that order
- // DestBB stays reachable in the DT the whole time and its subtree doesn't
- // get disconnected.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, TIBB, NewBB});
- Updates.push_back({DominatorTree::Insert, NewBB, DestBB});
+ // Create our unconditional branch.
+ BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
+ NewBI->setDebugLoc(TI->getDebugLoc());
+
+ // Insert the block into the function... right after the block TI lives in.
+ Function &F = *TIBB->getParent();
+ Function::iterator FBBI = TIBB->getIterator();
+ F.getBasicBlockList().insert(++FBBI, NewBB);
+
+ // Branch to the new block, breaking the edge.
+ TI->setSuccessor(SuccNum, NewBB);
+
+ // If there are any PHI nodes in DestBB, we need to update them so that they
+ // merge incoming values from NewBB instead of from TIBB.
+ {
+ unsigned BBIdx = 0;
+ for (BasicBlock::iterator I = DestBB->begin(); isa<PHINode>(I); ++I) {
+ // We no longer enter through TIBB, now we come in through NewBB.
+ // Revector exactly one entry in the PHI node that used to come from
+ // TIBB to come from NewBB.
+ PHINode *PN = cast<PHINode>(I);
+
+ // Reuse the previous value of BBIdx if it lines up. In cases where we
+ // have multiple phi nodes with *lots* of predecessors, this is a speed
+ // win because we don't have to scan the PHI looking for TIBB. This
+ // happens because the BB list of PHI nodes are usually in the same
+ // order.
+ if (PN->getIncomingBlock(BBIdx) != TIBB)
+ BBIdx = PN->getBasicBlockIndex(TIBB);
+ PN->setIncomingBlock(BBIdx, NewBB);
+ }
+ }
+
+ // If there are any other edges from TIBB to DestBB, update those to go
+ // through the split block, making those edges non-critical as well (and
+ // reducing the number of phi entries in the DestBB if relevant).
+ if (Options.MergeIdenticalEdges) {
+ for (unsigned i = SuccNum+1, e = TI->getNumSuccessors(); i != e; ++i) {
+ if (TI->getSuccessor(i) != DestBB) continue;
+
+ // Remove an entry for TIBB from DestBB phi nodes.
+ DestBB->removePredecessor(TIBB, Options.KeepOneInputPHIs);
+
+ // We found another edge to DestBB, go to NewBB instead.
+ TI->setSuccessor(i, NewBB);
+ }
+ }
+
+ // If we have nothing to update, just return.
+ auto *DT = Options.DT;
+ auto *PDT = Options.PDT;
+ auto *MSSAU = Options.MSSAU;
+ if (MSSAU)
+ MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
+ DestBB, NewBB, {TIBB}, Options.MergeIdenticalEdges);
+
+ if (!DT && !PDT && !LI)
+ return NewBB;
+
+ if (DT || PDT) {
+ // Update the DominatorTree.
+ // ---> NewBB -----\
+ // / V
+ // TIBB -------\\------> DestBB
+ //
+ // First, inform the DT about the new path from TIBB to DestBB via NewBB,
+ // then delete the old edge from TIBB to DestBB. By doing this in that order
+ // DestBB stays reachable in the DT the whole time and its subtree doesn't
+ // get disconnected.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, TIBB, NewBB});
+ Updates.push_back({DominatorTree::Insert, NewBB, DestBB});
if (!llvm::is_contained(successors(TIBB), DestBB))
- Updates.push_back({DominatorTree::Delete, TIBB, DestBB});
-
- if (DT)
- DT->applyUpdates(Updates);
- if (PDT)
- PDT->applyUpdates(Updates);
- }
-
- // Update LoopInfo if it is around.
- if (LI) {
- if (Loop *TIL = LI->getLoopFor(TIBB)) {
- // If one or the other blocks were not in a loop, the new block is not
- // either, and thus LI doesn't need to be updated.
- if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
- if (TIL == DestLoop) {
- // Both in the same loop, the NewBB joins loop.
- DestLoop->addBasicBlockToLoop(NewBB, *LI);
- } else if (TIL->contains(DestLoop)) {
- // Edge from an outer loop to an inner loop. Add to the outer loop.
- TIL->addBasicBlockToLoop(NewBB, *LI);
- } else if (DestLoop->contains(TIL)) {
- // Edge from an inner loop to an outer loop. Add to the outer loop.
- DestLoop->addBasicBlockToLoop(NewBB, *LI);
- } else {
- // Edge from two loops with no containment relation. Because these
- // are natural loops, we know that the destination block must be the
- // header of its loop (adding a branch into a loop elsewhere would
- // create an irreducible loop).
- assert(DestLoop->getHeader() == DestBB &&
- "Should not create irreducible loops!");
- if (Loop *P = DestLoop->getParentLoop())
- P->addBasicBlockToLoop(NewBB, *LI);
- }
- }
-
- // If TIBB is in a loop and DestBB is outside of that loop, we may need
- // to update LoopSimplify form and LCSSA form.
- if (!TIL->contains(DestBB)) {
- assert(!TIL->contains(NewBB) &&
- "Split point for loop exit is contained in loop!");
-
- // Update LCSSA form in the newly created exit block.
- if (Options.PreserveLCSSA) {
- createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
- }
-
- if (!LoopPreds.empty()) {
- assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
- BasicBlock *NewExitBB = SplitBlockPredecessors(
- DestBB, LoopPreds, "split", DT, LI, MSSAU, Options.PreserveLCSSA);
- if (Options.PreserveLCSSA)
- createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
- }
- }
- }
- }
-
- return NewBB;
-}
-
-// Return the unique indirectbr predecessor of a block. This may return null
-// even if such a predecessor exists, if it's not useful for splitting.
-// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
-// predecessors of BB.
-static BasicBlock *
-findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
- // If the block doesn't have any PHIs, we don't care about it, since there's
- // no point in splitting it.
- PHINode *PN = dyn_cast<PHINode>(BB->begin());
- if (!PN)
- return nullptr;
-
- // Verify we have exactly one IBR predecessor.
- // Conservatively bail out if one of the other predecessors is not a "regular"
- // terminator (that is, not a switch or a br).
- BasicBlock *IBB = nullptr;
- for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
- BasicBlock *PredBB = PN->getIncomingBlock(Pred);
- Instruction *PredTerm = PredBB->getTerminator();
- switch (PredTerm->getOpcode()) {
- case Instruction::IndirectBr:
- if (IBB)
- return nullptr;
- IBB = PredBB;
- break;
- case Instruction::Br:
- case Instruction::Switch:
- OtherPreds.push_back(PredBB);
- continue;
- default:
- return nullptr;
- }
- }
-
- return IBB;
-}
-
-bool llvm::SplitIndirectBrCriticalEdges(Function &F,
- BranchProbabilityInfo *BPI,
- BlockFrequencyInfo *BFI) {
- // Check whether the function has any indirectbrs, and collect which blocks
- // they may jump to. Since most functions don't have indirect branches,
- // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
- SmallSetVector<BasicBlock *, 16> Targets;
- for (auto &BB : F) {
- auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
- if (!IBI)
- continue;
-
- for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
- Targets.insert(IBI->getSuccessor(Succ));
- }
-
- if (Targets.empty())
- return false;
-
- bool ShouldUpdateAnalysis = BPI && BFI;
- bool Changed = false;
- for (BasicBlock *Target : Targets) {
- SmallVector<BasicBlock *, 16> OtherPreds;
- BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
-    // If we did not find an indirectbr, or the indirectbr is the only
- // incoming edge, this isn't the kind of edge we're looking for.
- if (!IBRPred || OtherPreds.empty())
- continue;
-
- // Don't even think about ehpads/landingpads.
- Instruction *FirstNonPHI = Target->getFirstNonPHI();
- if (FirstNonPHI->isEHPad() || Target->isLandingPad())
- continue;
-
- // Remember edge probabilities if needed.
- SmallVector<BranchProbability, 4> EdgeProbabilities;
- if (ShouldUpdateAnalysis) {
- EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors());
- for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors();
- I < E; ++I)
- EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I));
- BPI->eraseBlock(Target);
- }
-
- BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
- if (ShouldUpdateAnalysis) {
- // Copy the BFI/BPI from Target to BodyBlock.
- BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
- BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
- }
- // It's possible Target was its own successor through an indirectbr.
- // In this case, the indirectbr now comes from BodyBlock.
- if (IBRPred == Target)
- IBRPred = BodyBlock;
-
- // At this point Target only has PHIs, and BodyBlock has the rest of the
- // block's body. Create a copy of Target that will be used by the "direct"
- // preds.
- ValueToValueMapTy VMap;
- BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
-
- BlockFrequency BlockFreqForDirectSucc;
- for (BasicBlock *Pred : OtherPreds) {
- // If the target is a loop to itself, then the terminator of the split
- // block (BodyBlock) needs to be updated.
- BasicBlock *Src = Pred != Target ? Pred : BodyBlock;
- Src->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
- if (ShouldUpdateAnalysis)
- BlockFreqForDirectSucc += BFI->getBlockFreq(Src) *
- BPI->getEdgeProbability(Src, DirectSucc);
- }
- if (ShouldUpdateAnalysis) {
- BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc.getFrequency());
- BlockFrequency NewBlockFreqForTarget =
- BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
- BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
- }
-
- // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
-    // they are clones, so the number of PHIs is the same.
- // (a) Remove the edge coming from IBRPred from the "Direct" PHI
- // (b) Leave that as the only edge in the "Indirect" PHI.
- // (c) Merge the two in the body block.
- BasicBlock::iterator Indirect = Target->begin(),
- End = Target->getFirstNonPHI()->getIterator();
- BasicBlock::iterator Direct = DirectSucc->begin();
- BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
-
- assert(&*End == Target->getTerminator() &&
- "Block was expected to only contain PHIs");
-
- while (Indirect != End) {
- PHINode *DirPHI = cast<PHINode>(Direct);
- PHINode *IndPHI = cast<PHINode>(Indirect);
-
- // Now, clean up - the direct block shouldn't get the indirect value,
- // and vice versa.
- DirPHI->removeIncomingValue(IBRPred);
- Direct++;
-
- // Advance the pointer here, to avoid invalidation issues when the old
- // PHI is erased.
- Indirect++;
-
- PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
- NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
- IBRPred);
-
- // Create a PHI in the body block, to merge the direct and indirect
- // predecessors.
- PHINode *MergePHI =
- PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
- MergePHI->addIncoming(NewIndPHI, Target);
- MergePHI->addIncoming(DirPHI, DirectSucc);
-
- IndPHI->replaceAllUsesWith(MergePHI);
- IndPHI->eraseFromParent();
- }
-
- Changed = true;
- }
-
- return Changed;
-}
+ Updates.push_back({DominatorTree::Delete, TIBB, DestBB});
+
+ if (DT)
+ DT->applyUpdates(Updates);
+ if (PDT)
+ PDT->applyUpdates(Updates);
+ }
+
+ // Update LoopInfo if it is around.
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+ // If one or the other blocks were not in a loop, the new block is not
+ // either, and thus LI doesn't need to be updated.
+ if (Loop *DestLoop = LI->getLoopFor(DestBB)) {
+ if (TIL == DestLoop) {
+          // Both are in the same loop; NewBB joins that loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NewBB, *LI);
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NewBB, *LI);
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == DestBB &&
+ "Should not create irreducible loops!");
+ if (Loop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NewBB, *LI);
+ }
+ }
+
+ // If TIBB is in a loop and DestBB is outside of that loop, we may need
+ // to update LoopSimplify form and LCSSA form.
+ if (!TIL->contains(DestBB)) {
+ assert(!TIL->contains(NewBB) &&
+ "Split point for loop exit is contained in loop!");
+
+ // Update LCSSA form in the newly created exit block.
+ if (Options.PreserveLCSSA) {
+ createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
+ }
+
+ if (!LoopPreds.empty()) {
+ assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
+ BasicBlock *NewExitBB = SplitBlockPredecessors(
+ DestBB, LoopPreds, "split", DT, LI, MSSAU, Options.PreserveLCSSA);
+ if (Options.PreserveLCSSA)
+ createPHIsForSplitLoopExit(LoopPreds, NewExitBB, DestBB);
+ }
+ }
+ }
+ }
+
+ return NewBB;
+}
+
+// Return the unique indirectbr predecessor of a block. This may return null
+// even if such a predecessor exists, if it's not useful for splitting.
+// If a predecessor is found, OtherPreds will contain all other (non-indirectbr)
+// predecessors of BB.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+ // If the block doesn't have any PHIs, we don't care about it, since there's
+ // no point in splitting it.
+ PHINode *PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return nullptr;
+
+ // Verify we have exactly one IBR predecessor.
+ // Conservatively bail out if one of the other predecessors is not a "regular"
+ // terminator (that is, not a switch or a br).
+ BasicBlock *IBB = nullptr;
+ for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+ BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+ Instruction *PredTerm = PredBB->getTerminator();
+ switch (PredTerm->getOpcode()) {
+ case Instruction::IndirectBr:
+ if (IBB)
+ return nullptr;
+ IBB = PredBB;
+ break;
+ case Instruction::Br:
+ case Instruction::Switch:
+ OtherPreds.push_back(PredBB);
+ continue;
+ default:
+ return nullptr;
+ }
+ }
+
+ return IBB;
+}
+
+bool llvm::SplitIndirectBrCriticalEdges(Function &F,
+ BranchProbabilityInfo *BPI,
+ BlockFrequencyInfo *BFI) {
+ // Check whether the function has any indirectbrs, and collect which blocks
+ // they may jump to. Since most functions don't have indirect branches,
+ // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
+ SmallSetVector<BasicBlock *, 16> Targets;
+ for (auto &BB : F) {
+ auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+ if (!IBI)
+ continue;
+
+ for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+ Targets.insert(IBI->getSuccessor(Succ));
+ }
+
+ if (Targets.empty())
+ return false;
+
+ bool ShouldUpdateAnalysis = BPI && BFI;
+ bool Changed = false;
+ for (BasicBlock *Target : Targets) {
+ SmallVector<BasicBlock *, 16> OtherPreds;
+ BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+    // If we did not find an indirectbr, or the indirectbr is the only
+ // incoming edge, this isn't the kind of edge we're looking for.
+ if (!IBRPred || OtherPreds.empty())
+ continue;
+
+ // Don't even think about ehpads/landingpads.
+ Instruction *FirstNonPHI = Target->getFirstNonPHI();
+ if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+ continue;
+
+ // Remember edge probabilities if needed.
+ SmallVector<BranchProbability, 4> EdgeProbabilities;
+ if (ShouldUpdateAnalysis) {
+ EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors());
+ for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors();
+ I < E; ++I)
+ EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I));
+ BPI->eraseBlock(Target);
+ }
+
+ BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+ if (ShouldUpdateAnalysis) {
+ // Copy the BFI/BPI from Target to BodyBlock.
+ BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
+ BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
+ }
+ // It's possible Target was its own successor through an indirectbr.
+ // In this case, the indirectbr now comes from BodyBlock.
+ if (IBRPred == Target)
+ IBRPred = BodyBlock;
+
+ // At this point Target only has PHIs, and BodyBlock has the rest of the
+ // block's body. Create a copy of Target that will be used by the "direct"
+ // preds.
+ ValueToValueMapTy VMap;
+ BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+ BlockFrequency BlockFreqForDirectSucc;
+ for (BasicBlock *Pred : OtherPreds) {
+ // If the target is a loop to itself, then the terminator of the split
+ // block (BodyBlock) needs to be updated.
+ BasicBlock *Src = Pred != Target ? Pred : BodyBlock;
+ Src->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+ if (ShouldUpdateAnalysis)
+ BlockFreqForDirectSucc += BFI->getBlockFreq(Src) *
+ BPI->getEdgeProbability(Src, DirectSucc);
+ }
+ if (ShouldUpdateAnalysis) {
+ BFI->setBlockFreq(DirectSucc, BlockFreqForDirectSucc.getFrequency());
+ BlockFrequency NewBlockFreqForTarget =
+ BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
+ BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
+ }
+
+ // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+    // they are clones, so the number of PHIs is the same.
+ // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+ // (b) Leave that as the only edge in the "Indirect" PHI.
+ // (c) Merge the two in the body block.
+ BasicBlock::iterator Indirect = Target->begin(),
+ End = Target->getFirstNonPHI()->getIterator();
+ BasicBlock::iterator Direct = DirectSucc->begin();
+ BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+ assert(&*End == Target->getTerminator() &&
+ "Block was expected to only contain PHIs");
+
+ while (Indirect != End) {
+ PHINode *DirPHI = cast<PHINode>(Direct);
+ PHINode *IndPHI = cast<PHINode>(Indirect);
+
+ // Now, clean up - the direct block shouldn't get the indirect value,
+ // and vice versa.
+ DirPHI->removeIncomingValue(IBRPred);
+ Direct++;
+
+ // Advance the pointer here, to avoid invalidation issues when the old
+ // PHI is erased.
+ Indirect++;
+
+ PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+ NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+ IBRPred);
+
+ // Create a PHI in the body block, to merge the direct and indirect
+ // predecessors.
+ PHINode *MergePHI =
+ PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+ MergePHI->addIncoming(NewIndPHI, Target);
+ MergePHI->addIncoming(DirPHI, DirectSucc);
+
+ IndPHI->replaceAllUsesWith(MergePHI);
+ IndPHI->eraseFromParent();
+ }
+
+ Changed = true;
+ }
+
+ return Changed;
+}
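
As a usage sketch (hypothetical, not code from this tree): SplitIndirectBrCriticalEdges above is normally driven from a pass that already has BranchProbabilityInfo and BlockFrequencyInfo at hand, so the blocks produced by the split keep consistent profile data. The pass class below is invented for illustration; only the SplitIndirectBrCriticalEdges(F, &BPI, &BFI) call itself comes from the code above, via its declaration in llvm/Transforms/Utils/BasicBlockUtils.h.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;

namespace {
// Hypothetical driver pass (illustration only): splits every critical edge
// leaving an indirectbr so that later passes can sink or duplicate code into
// the per-predecessor copies created by the utility above.
struct SplitIBRCriticalEdgesSketch : public FunctionPass {
  static char ID;
  SplitIBRCriticalEdgesSketch() : FunctionPass(ID) {}

  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // BPI/BFI are optional arguments; passing them lets the utility keep
    // edge probabilities and block frequencies consistent across the split.
    AU.addRequired<BranchProbabilityInfoWrapperPass>();
    AU.addRequired<BlockFrequencyInfoWrapperPass>();
  }

  bool runOnFunction(Function &F) override {
    auto &BPI = getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
    auto &BFI = getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
    return SplitIndirectBrCriticalEdges(F, &BPI, &BFI);
  }
};
} // end anonymous namespace

char SplitIBRCriticalEdgesSketch::ID = 0;
static RegisterPass<SplitIBRCriticalEdgesSketch>
    X("split-ibr-critical-edges-sketch", "Split indirectbr critical edges (sketch)");

Under the new pass manager the same call would be made with analyses obtained from a FunctionAnalysisManager instead of the legacy wrapper passes.
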
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
index 205ea1b9fd..dba5403f27 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -1,61 +1,61 @@
-//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some functions that will create standard C libcalls.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "build-libcalls"
-
-//- Infer Attributes ---------------------------------------------------------//
-
-STATISTIC(NumReadNone, "Number of functions inferred as readnone");
+//===- BuildLibCalls.cpp - Utility builder for libcalls -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some functions that will create standard C libcalls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "build-libcalls"
+
+//- Infer Attributes ---------------------------------------------------------//
+
+STATISTIC(NumReadNone, "Number of functions inferred as readnone");
STATISTIC(NumInaccessibleMemOnly,
"Number of functions inferred as inaccessiblememonly");
-STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
-STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
+STATISTIC(NumReadOnly, "Number of functions inferred as readonly");
+STATISTIC(NumArgMemOnly, "Number of functions inferred as argmemonly");
STATISTIC(NumInaccessibleMemOrArgMemOnly,
"Number of functions inferred as inaccessiblemem_or_argmemonly");
-STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
-STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
+STATISTIC(NumNoUnwind, "Number of functions inferred as nounwind");
+STATISTIC(NumNoCapture, "Number of arguments inferred as nocapture");
STATISTIC(NumWriteOnlyArg, "Number of arguments inferred as writeonly");
STATISTIC(NumSExtArg, "Number of arguments inferred as signext");
-STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
-STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
+STATISTIC(NumReadOnlyArg, "Number of arguments inferred as readonly");
+STATISTIC(NumNoAlias, "Number of function returns inferred as noalias");
STATISTIC(NumNoUndef, "Number of function returns inferred as noundef returns");
-STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
-STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
+STATISTIC(NumNonNull, "Number of function returns inferred as nonnull returns");
+STATISTIC(NumReturnedArg, "Number of arguments inferred as returned");
STATISTIC(NumWillReturn, "Number of functions inferred as willreturn");
-
-static bool setDoesNotAccessMemory(Function &F) {
- if (F.doesNotAccessMemory())
- return false;
- F.setDoesNotAccessMemory();
- ++NumReadNone;
- return true;
-}
-
+
+static bool setDoesNotAccessMemory(Function &F) {
+ if (F.doesNotAccessMemory())
+ return false;
+ F.setDoesNotAccessMemory();
+ ++NumReadNone;
+ return true;
+}
+
static bool setOnlyAccessesInaccessibleMemory(Function &F) {
if (F.onlyAccessesInaccessibleMemory())
return false;
@@ -64,22 +64,22 @@ static bool setOnlyAccessesInaccessibleMemory(Function &F) {
return true;
}
-static bool setOnlyReadsMemory(Function &F) {
- if (F.onlyReadsMemory())
- return false;
- F.setOnlyReadsMemory();
- ++NumReadOnly;
- return true;
-}
-
-static bool setOnlyAccessesArgMemory(Function &F) {
- if (F.onlyAccessesArgMemory())
- return false;
- F.setOnlyAccessesArgMemory();
- ++NumArgMemOnly;
- return true;
-}
-
+static bool setOnlyReadsMemory(Function &F) {
+ if (F.onlyReadsMemory())
+ return false;
+ F.setOnlyReadsMemory();
+ ++NumReadOnly;
+ return true;
+}
+
+static bool setOnlyAccessesArgMemory(Function &F) {
+ if (F.onlyAccessesArgMemory())
+ return false;
+ F.setOnlyAccessesArgMemory();
+ ++NumArgMemOnly;
+ return true;
+}
+
static bool setOnlyAccessesInaccessibleMemOrArgMem(Function &F) {
if (F.onlyAccessesInaccessibleMemOrArgMem())
return false;
@@ -88,54 +88,54 @@ static bool setOnlyAccessesInaccessibleMemOrArgMem(Function &F) {
return true;
}
-static bool setDoesNotThrow(Function &F) {
- if (F.doesNotThrow())
- return false;
- F.setDoesNotThrow();
- ++NumNoUnwind;
- return true;
-}
-
-static bool setRetDoesNotAlias(Function &F) {
- if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
- return false;
- F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
- ++NumNoAlias;
- return true;
-}
-
-static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
- return false;
- F.addParamAttr(ArgNo, Attribute::NoCapture);
- ++NumNoCapture;
- return true;
-}
-
-static bool setDoesNotAlias(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::NoAlias))
- return false;
- F.addParamAttr(ArgNo, Attribute::NoAlias);
- ++NumNoAlias;
- return true;
-}
-
-static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
- return false;
- F.addParamAttr(ArgNo, Attribute::ReadOnly);
- ++NumReadOnlyArg;
- return true;
-}
-
+static bool setDoesNotThrow(Function &F) {
+ if (F.doesNotThrow())
+ return false;
+ F.setDoesNotThrow();
+ ++NumNoUnwind;
+ return true;
+}
+
+static bool setRetDoesNotAlias(Function &F) {
+ if (F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoAlias))
+ return false;
+ F.addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setDoesNotCapture(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoCapture))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::NoCapture);
+ ++NumNoCapture;
+ return true;
+}
+
+static bool setDoesNotAlias(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::NoAlias))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::NoAlias);
+ ++NumNoAlias;
+ return true;
+}
+
+static bool setOnlyReadsMemory(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::ReadOnly))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::ReadOnly);
+ ++NumReadOnlyArg;
+ return true;
+}
+
static bool setOnlyWritesMemory(Function &F, unsigned ArgNo) {
if (F.hasParamAttribute(ArgNo, Attribute::WriteOnly))
- return false;
+ return false;
F.addParamAttr(ArgNo, Attribute::WriteOnly);
++NumWriteOnlyArg;
- return true;
-}
-
+ return true;
+}
+
static bool setSignExtendedArg(Function &F, unsigned ArgNo) {
if (F.hasParamAttribute(ArgNo, Attribute::SExt))
return false;
@@ -170,28 +170,28 @@ static bool setRetAndArgsNoUndef(Function &F) {
return setRetNoUndef(F) | setArgsNoUndef(F);
}
-static bool setReturnedArg(Function &F, unsigned ArgNo) {
- if (F.hasParamAttribute(ArgNo, Attribute::Returned))
- return false;
- F.addParamAttr(ArgNo, Attribute::Returned);
- ++NumReturnedArg;
- return true;
-}
-
-static bool setNonLazyBind(Function &F) {
- if (F.hasFnAttribute(Attribute::NonLazyBind))
- return false;
- F.addFnAttr(Attribute::NonLazyBind);
- return true;
-}
-
-static bool setDoesNotFreeMemory(Function &F) {
- if (F.hasFnAttribute(Attribute::NoFree))
- return false;
- F.addFnAttr(Attribute::NoFree);
- return true;
-}
-
+static bool setReturnedArg(Function &F, unsigned ArgNo) {
+ if (F.hasParamAttribute(ArgNo, Attribute::Returned))
+ return false;
+ F.addParamAttr(ArgNo, Attribute::Returned);
+ ++NumReturnedArg;
+ return true;
+}
+
+static bool setNonLazyBind(Function &F) {
+ if (F.hasFnAttribute(Attribute::NonLazyBind))
+ return false;
+ F.addFnAttr(Attribute::NonLazyBind);
+ return true;
+}
+
+static bool setDoesNotFreeMemory(Function &F) {
+ if (F.hasFnAttribute(Attribute::NoFree))
+ return false;
+ F.addFnAttr(Attribute::NoFree);
+ return true;
+}
+
static bool setWillReturn(Function &F) {
if (F.hasFnAttribute(Attribute::WillReturn))
return false;
@@ -200,84 +200,84 @@ static bool setWillReturn(Function &F) {
return true;
}
-bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
- const TargetLibraryInfo &TLI) {
- Function *F = M->getFunction(Name);
- if (!F)
- return false;
- return inferLibFuncAttributes(*F, TLI);
-}
-
-bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
- LibFunc TheLibFunc;
- if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
- return false;
-
- bool Changed = false;
-
- if(!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
- Changed |= setDoesNotFreeMemory(F);
-
- if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
- Changed |= setNonLazyBind(F);
-
- switch (TheLibFunc) {
- case LibFunc_strlen:
- case LibFunc_wcslen:
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyAccessesArgMemory(F);
+bool llvm::inferLibFuncAttributes(Module *M, StringRef Name,
+ const TargetLibraryInfo &TLI) {
+ Function *F = M->getFunction(Name);
+ if (!F)
+ return false;
+ return inferLibFuncAttributes(*F, TLI);
+}
+
+bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
+ LibFunc TheLibFunc;
+ if (!(TLI.getLibFunc(F, TheLibFunc) && TLI.has(TheLibFunc)))
+ return false;
+
+ bool Changed = false;
+
+ if(!isLibFreeFunction(&F, TheLibFunc) && !isReallocLikeFn(&F, &TLI))
+ Changed |= setDoesNotFreeMemory(F);
+
+ if (F.getParent() != nullptr && F.getParent()->getRtLibUseGOT())
+ Changed |= setNonLazyBind(F);
+
+ switch (TheLibFunc) {
+ case LibFunc_strlen:
+ case LibFunc_wcslen:
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_strchr:
- case LibFunc_strrchr:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strchr:
+ case LibFunc_strrchr:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_strtol:
- case LibFunc_strtod:
- case LibFunc_strtof:
- case LibFunc_strtoul:
- case LibFunc_strtoll:
- case LibFunc_strtold:
- case LibFunc_strtoull:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
Changed |= setWillReturn(F);
- Changed |= setReturnedArg(F, 0);
- LLVM_FALLTHROUGH;
- case LibFunc_stpcpy:
- case LibFunc_stpncpy:
+ Changed |= setReturnedArg(F, 0);
+ LLVM_FALLTHROUGH;
+ case LibFunc_stpcpy:
+ case LibFunc_stpncpy:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
Changed |= setDoesNotAlias(F, 0);
Changed |= setDoesNotAlias(F, 1);
- return Changed;
- case LibFunc_strxfrm:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_strxfrm:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_strcmp: // 0,1
- case LibFunc_strspn: // 0,1
- case LibFunc_strncmp: // 0,1
- case LibFunc_strcspn: // 0,1
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_strcmp: // 0,1
+ case LibFunc_strspn: // 0,1
+ case LibFunc_strncmp: // 0,1
+ case LibFunc_strcspn: // 0,1
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
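
As a usage sketch (hypothetical, not from this tree): the inferLibFuncAttributes(Function &, const TargetLibraryInfo &) entry point restored in this hunk is intended to be run over library-function prototypes; the helper below is an invented illustration of that loop, and only the inferLibFuncAttributes call itself comes from the llvm/Transforms/Utils/BuildLibCalls.h interface shown above.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/BuildLibCalls.h"

using namespace llvm;

// Hypothetical helper (illustration only): ask inferLibFuncAttributes to
// annotate every declaration in M. It is safe to call on arbitrary
// declarations because the entry point itself bails out unless TLI
// recognizes the function as an available library routine. Functions with
// bodies are left to the regular attribute-inference passes.
static bool annotateKnownLibCalls(Module &M, const TargetLibraryInfo &TLI) {
  bool Changed = false;
  for (Function &F : M)
    if (F.isDeclaration())
      Changed |= inferLibFuncAttributes(F, TLI);
  return Changed;
}
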
@@ -286,325 +286,325 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
return Changed;
case LibFunc_strcoll:
- case LibFunc_strcasecmp: // 0,1
- case LibFunc_strncasecmp: //
+ case LibFunc_strcasecmp: // 0,1
+ case LibFunc_strncasecmp: //
// Those functions may depend on the locale, which may be accessed through
// global memory.
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_strstr:
- case LibFunc_strpbrk:
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strstr:
+ case LibFunc_strpbrk:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_strtok:
- case LibFunc_strtok_r:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_strtok:
+ case LibFunc_strtok_r:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_scanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_setbuf:
- case LibFunc_setvbuf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_strdup:
- case LibFunc_strndup:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_scanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_setbuf:
+ case LibFunc_setvbuf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_strdup:
+ case LibFunc_strndup:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_stat:
- case LibFunc_statvfs:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_sscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_sprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat:
+ case LibFunc_statvfs:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_sscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_sprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_snprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_snprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc_setitimer:
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_setitimer:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_system:
- // May throw; "system" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_malloc:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_system:
+ // May throw; "system" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_malloc:
case LibFunc_vec_malloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_memcmp:
+ return Changed;
+ case LibFunc_memcmp:
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_memchr:
- case LibFunc_memrchr:
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memchr:
+ case LibFunc_memrchr:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_modf:
- case LibFunc_modff:
- case LibFunc_modfl:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_modf:
+ case LibFunc_modff:
+ case LibFunc_modfl:
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_memcpy:
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_memcpy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setReturnedArg(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotAlias(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_memmove:
+ Changed |= setDoesNotAlias(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_memmove:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setReturnedArg(F, 0);
+ Changed |= setReturnedArg(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_mempcpy:
- case LibFunc_memccpy:
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_mempcpy:
+ case LibFunc_memccpy:
Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotAlias(F, 0);
+ Changed |= setDoesNotAlias(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotAlias(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_memcpy_chk:
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_memalign:
+ Changed |= setDoesNotAlias(F, 1);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_memcpy_chk:
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_memalign:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_mkdir:
+ return Changed;
+ case LibFunc_mkdir:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_mktime:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_mktime:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_realloc:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_realloc:
case LibFunc_vec_realloc:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
case LibFunc_reallocf:
Changed |= setRetNoUndef(F);
Changed |= setWillReturn(F);
return Changed;
- case LibFunc_read:
- // May throw; "read" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_rewind:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_rmdir:
- case LibFunc_remove:
- case LibFunc_realpath:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_rename:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_readlink:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_write:
- // May throw; "write" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_aligned_alloc:
+ case LibFunc_read:
+ // May throw; "read" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_rewind:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_rmdir:
+ case LibFunc_remove:
+ case LibFunc_realpath:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_rename:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_readlink:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_write:
+ // May throw; "write" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_aligned_alloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_bcopy:
- Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_bcopy:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyReadsMemory(F, 0);
Changed |= setOnlyWritesMemory(F, 1);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_bcmp:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_bcmp:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setOnlyReadsMemory(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_bzero:
- Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_bzero:
+ Changed |= setDoesNotThrow(F);
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- return Changed;
- case LibFunc_calloc:
+ return Changed;
+ case LibFunc_calloc:
case LibFunc_vec_calloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_chmod:
- case LibFunc_chown:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_ctermid:
- case LibFunc_clearerr:
- case LibFunc_closedir:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_atoi:
- case LibFunc_atol:
- case LibFunc_atof:
- case LibFunc_atoll:
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc_chmod:
+ case LibFunc_chown:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ctermid:
+ case LibFunc_clearerr:
+ case LibFunc_closedir:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atof:
+ case LibFunc_atoll:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_access:
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_access:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_fopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fopen:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fdopen:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fdopen:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_feof:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_feof:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc_free:
+ case LibFunc_free:
case LibFunc_vec_free:
Changed |= setOnlyAccessesInaccessibleMemOrArgMem(F);
Changed |= setArgsNoUndef(F);
@@ -612,411 +612,411 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 0);
return Changed;
- case LibFunc_fseek:
- case LibFunc_ftell:
- case LibFunc_fgetc:
- case LibFunc_fgetc_unlocked:
- case LibFunc_fseeko:
- case LibFunc_ftello:
- case LibFunc_fileno:
- case LibFunc_fflush:
- case LibFunc_fclose:
- case LibFunc_fsetpos:
- case LibFunc_flockfile:
- case LibFunc_funlockfile:
- case LibFunc_ftrylockfile:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_ferror:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F);
- return Changed;
- case LibFunc_fputc:
- case LibFunc_fputc_unlocked:
- case LibFunc_fstat:
+ case LibFunc_fseek:
+ case LibFunc_ftell:
+ case LibFunc_fgetc:
+ case LibFunc_fgetc_unlocked:
+ case LibFunc_fseeko:
+ case LibFunc_ftello:
+ case LibFunc_fileno:
+ case LibFunc_fflush:
+ case LibFunc_fclose:
+ case LibFunc_fsetpos:
+ case LibFunc_flockfile:
+ case LibFunc_funlockfile:
+ case LibFunc_ftrylockfile:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_ferror:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F);
+ return Changed;
+ case LibFunc_fputc:
+ case LibFunc_fputc_unlocked:
+ case LibFunc_fstat:
Changed |= setRetAndArgsNoUndef(F);
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc_frexp:
- case LibFunc_frexpf:
- case LibFunc_frexpl:
+ case LibFunc_frexp:
+ case LibFunc_frexpf:
+ case LibFunc_frexpl:
Changed |= setDoesNotThrow(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotCapture(F, 1);
return Changed;
- case LibFunc_fstatvfs:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_fgets:
- case LibFunc_fgets_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 2);
- return Changed;
- case LibFunc_fread:
- case LibFunc_fread_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 3);
- return Changed;
- case LibFunc_fwrite:
- case LibFunc_fwrite_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 3);
- // FIXME: readonly #1?
- return Changed;
- case LibFunc_fputs:
- case LibFunc_fputs_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_fscanf:
- case LibFunc_fprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fgetpos:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_getc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getlogin_r:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getc_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_getenv:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setOnlyReadsMemory(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_gets:
- case LibFunc_getchar:
- case LibFunc_getchar_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_getitimer:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_getpwnam:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_ungetc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_uname:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_unlink:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_unsetenv:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_utime:
- case LibFunc_utimes:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_putc:
- case LibFunc_putc_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_puts:
- case LibFunc_printf:
- case LibFunc_perror:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_pread:
- // May throw; "pread" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_pwrite:
- // May throw; "pwrite" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_putchar:
- case LibFunc_putchar_unlocked:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
- case LibFunc_popen:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_pclose:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_vscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_vsscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_vfscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_valloc:
+ case LibFunc_fstatvfs:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_fgets:
+ case LibFunc_fgets_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 2);
+ return Changed;
+ case LibFunc_fread:
+ case LibFunc_fread_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_fwrite:
+ case LibFunc_fwrite_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 3);
+ // FIXME: readonly #1?
+ return Changed;
+ case LibFunc_fputs:
+ case LibFunc_fputs_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_fscanf:
+ case LibFunc_fprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fgetpos:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getlogin_r:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getc_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_getenv:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setOnlyReadsMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_gets:
+ case LibFunc_getchar:
+ case LibFunc_getchar_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_getitimer:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_getpwnam:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_ungetc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_uname:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_unlink:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_unsetenv:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_utime:
+ case LibFunc_utimes:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putc:
+ case LibFunc_putc_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_puts:
+ case LibFunc_printf:
+ case LibFunc_perror:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_pread:
+ // May throw; "pread" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_pwrite:
+ // May throw; "pwrite" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_putchar:
+ case LibFunc_putchar_unlocked:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
+ case LibFunc_popen:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_pclose:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_vscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vsscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vfscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_valloc:
Changed |= setOnlyAccessesInaccessibleMemory(F);
Changed |= setRetNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- return Changed;
- case LibFunc_vprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_vfprintf:
- case LibFunc_vsprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_vsnprintf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 2);
- Changed |= setOnlyReadsMemory(F, 2);
- return Changed;
- case LibFunc_open:
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_opendir:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_tmpfile:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- return Changed;
- case LibFunc_times:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_htonl:
- case LibFunc_htons:
- case LibFunc_ntohl:
- case LibFunc_ntohs:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotAccessMemory(F);
- return Changed;
- case LibFunc_lstat:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_lchown:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_qsort:
- // May throw; places call through function pointer.
+ return Changed;
+ case LibFunc_vprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_vfprintf:
+ case LibFunc_vsprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_vsnprintf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 2);
+ Changed |= setOnlyReadsMemory(F, 2);
+ return Changed;
+ case LibFunc_open:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_opendir:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_tmpfile:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_times:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_htonl:
+ case LibFunc_htons:
+ case LibFunc_ntohl:
+ case LibFunc_ntohs:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotAccessMemory(F);
+ return Changed;
+ case LibFunc_lstat:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_lchown:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_qsort:
+ // May throw; places call through function pointer.
// Cannot give undef pointer/size
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 3);
- return Changed;
- case LibFunc_dunder_strdup:
- case LibFunc_dunder_strndup:
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 3);
+ return Changed;
+ case LibFunc_dunder_strdup:
+ case LibFunc_dunder_strndup:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
Changed |= setWillReturn(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_dunder_strtok_r:
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_under_IO_getc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_under_IO_putc:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_dunder_isoc99_scanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_stat64:
- case LibFunc_lstat64:
- case LibFunc_statvfs64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_dunder_isoc99_sscanf:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fopen64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 0);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
- case LibFunc_fseeko64:
- case LibFunc_ftello64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- return Changed;
- case LibFunc_tmpfile64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setRetDoesNotAlias(F);
- return Changed;
- case LibFunc_fstat64:
- case LibFunc_fstatvfs64:
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- case LibFunc_open64:
- // May throw; "open" is a valid pthread cancellation point.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setOnlyReadsMemory(F, 0);
- return Changed;
- case LibFunc_gettimeofday:
- // Currently some platforms have the restrict keyword on the arguments to
- // gettimeofday. To be conservative, do not add noalias to gettimeofday's
- // arguments.
- Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotThrow(F);
- Changed |= setDoesNotCapture(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- return Changed;
- // TODO: add LibFunc entries for:
- // case LibFunc_memset_pattern4:
- // case LibFunc_memset_pattern8:
- case LibFunc_memset_pattern16:
- Changed |= setOnlyAccessesArgMemory(F);
- Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_strtok_r:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_under_IO_getc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_under_IO_putc:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_dunder_isoc99_scanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_stat64:
+ case LibFunc_lstat64:
+ case LibFunc_statvfs64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_dunder_isoc99_sscanf:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fopen64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 0);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
+ case LibFunc_fseeko64:
+ case LibFunc_ftello64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ return Changed;
+ case LibFunc_tmpfile64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
+ case LibFunc_fstat64:
+ case LibFunc_fstatvfs64:
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ case LibFunc_open64:
+ // May throw; "open" is a valid pthread cancellation point.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setOnlyReadsMemory(F, 0);
+ return Changed;
+ case LibFunc_gettimeofday:
+ // Currently some platforms have the restrict keyword on the arguments to
+ // gettimeofday. To be conservative, do not add noalias to gettimeofday's
+ // arguments.
+ Changed |= setRetAndArgsNoUndef(F);
+ Changed |= setDoesNotThrow(F);
+ Changed |= setDoesNotCapture(F, 0);
+ Changed |= setDoesNotCapture(F, 1);
+ return Changed;
+ // TODO: add LibFunc entries for:
+ // case LibFunc_memset_pattern4:
+ // case LibFunc_memset_pattern8:
+ case LibFunc_memset_pattern16:
+ Changed |= setOnlyAccessesArgMemory(F);
+ Changed |= setDoesNotCapture(F, 0);
Changed |= setOnlyWritesMemory(F, 0);
- Changed |= setDoesNotCapture(F, 1);
- Changed |= setOnlyReadsMemory(F, 1);
- return Changed;
+ Changed |= setDoesNotCapture(F, 1);
+ Changed |= setOnlyReadsMemory(F, 1);
+ return Changed;
case LibFunc_memset:
Changed |= setOnlyAccessesArgMemory(F);
Changed |= setWillReturn(F);
Changed |= setDoesNotThrow(F);
Changed |= setOnlyWritesMemory(F, 0);
return Changed;
- // int __nvvm_reflect(const char *)
- case LibFunc_nvvm_reflect:
+ // int __nvvm_reflect(const char *)
+ case LibFunc_nvvm_reflect:
Changed |= setRetAndArgsNoUndef(F);
- Changed |= setDoesNotAccessMemory(F);
- Changed |= setDoesNotThrow(F);
- return Changed;
+ Changed |= setDoesNotAccessMemory(F);
+ Changed |= setDoesNotThrow(F);
+ return Changed;
case LibFunc_ldexp:
case LibFunc_ldexpf:
case LibFunc_ldexpl:
@@ -1150,154 +1150,154 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotFreeMemory(F);
Changed |= setWillReturn(F);
return Changed;
- default:
- // FIXME: It'd be really nice to cover all the library functions we're
- // aware of here.
- return false;
- }
-}
-
-bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) {
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- return false;
- case Type::FloatTyID:
- return TLI->has(FloatFn);
- case Type::DoubleTyID:
- return TLI->has(DoubleFn);
- default:
- return TLI->has(LongDoubleFn);
- }
-}
-
-StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn) {
- assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
- "Cannot get name for unavailable function!");
-
- switch (Ty->getTypeID()) {
- case Type::HalfTyID:
- llvm_unreachable("No name for HalfTy!");
- case Type::FloatTyID:
- return TLI->getName(FloatFn);
- case Type::DoubleTyID:
- return TLI->getName(DoubleFn);
- default:
- return TLI->getName(LongDoubleFn);
- }
-}
-
-//- Emit LibCalls ------------------------------------------------------------//
-
-Value *llvm::castToCStr(Value *V, IRBuilderBase &B) {
- unsigned AS = V->getType()->getPointerAddressSpace();
- return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
-}
-
-static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
- ArrayRef<Type *> ParamTypes,
- ArrayRef<Value *> Operands, IRBuilderBase &B,
- const TargetLibraryInfo *TLI,
- bool IsVaArgs = false) {
- if (!TLI->has(TheLibFunc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FuncName = TLI->getName(TheLibFunc);
- FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs);
- FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType);
- inferLibFuncAttributes(M, FuncName, *TLI);
- CallInst *CI = B.CreateCall(Callee, Operands, FuncName);
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context),
- B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI);
-}
-
-Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(),
- castToCStr(Ptr, B), B, TLI);
-}
-
-Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- Type *I32Ty = B.getInt32Ty();
- return emitLibCall(LibFunc_strchr, I8Ptr, {I8Ptr, I32Ty},
- {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI);
-}
-
-Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_strncmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
- {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr},
- {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
- {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
-}
-
-Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- Type *I8Ptr = B.getInt8PtrTy();
- return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
- {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
-}
-
-Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_memcpy_chk))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- AttributeList AS;
- AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
- Attribute::NoUnwind);
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee MemCpy = M->getOrInsertFunction(
- "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
- B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
- DL.getIntPtrType(Context));
- Dst = castToCStr(Dst, B);
- Src = castToCStr(Src, B);
- CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
- if (const Function *F =
- dyn_cast<Function>(MemCpy.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
+ default:
+ // FIXME: It'd be really nice to cover all the library functions we're
+ // aware of here.
+ return false;
+ }
+}
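// Illustrative sketch of the effect of the inference above: for the unlink()
// case handled earlier in this switch, the declaration ends up annotated
// roughly as follows in LLVM 12 textual IR (the exact attribute spelling may
// vary between releases):
//
//   declare noundef i32 @unlink(i8* nocapture noundef readonly) nounwind
//
// i.e. setRetAndArgsNoUndef adds noundef to the return value and arguments,
// setDoesNotThrow adds nounwind, setDoesNotCapture(F, 0) adds nocapture to
// argument 0, and setOnlyReadsMemory(F, 0) adds readonly to argument 0.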
+
+bool llvm::hasFloatFn(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn, LibFunc LongDoubleFn) {
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ return false;
+ case Type::FloatTyID:
+ return TLI->has(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->has(DoubleFn);
+ default:
+ return TLI->has(LongDoubleFn);
+ }
+}
+
+StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn) {
+ assert(hasFloatFn(TLI, Ty, DoubleFn, FloatFn, LongDoubleFn) &&
+ "Cannot get name for unavailable function!");
+
+ switch (Ty->getTypeID()) {
+ case Type::HalfTyID:
+ llvm_unreachable("No name for HalfTy!");
+ case Type::FloatTyID:
+ return TLI->getName(FloatFn);
+ case Type::DoubleTyID:
+ return TLI->getName(DoubleFn);
+ default:
+ return TLI->getName(LongDoubleFn);
+ }
+}
+
+//- Emit LibCalls ------------------------------------------------------------//
+
+Value *llvm::castToCStr(Value *V, IRBuilderBase &B) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
+}
+
+static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
+ ArrayRef<Type *> ParamTypes,
+ ArrayRef<Value *> Operands, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI,
+ bool IsVaArgs = false) {
+ if (!TLI->has(TheLibFunc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FuncName = TLI->getName(TheLibFunc);
+ FunctionType *FuncType = FunctionType::get(ReturnType, ParamTypes, IsVaArgs);
+ FunctionCallee Callee = M->getOrInsertFunction(FuncName, FuncType);
+ inferLibFuncAttributes(M, FuncName, *TLI);
+ CallInst *CI = B.CreateCall(Callee, Operands, FuncName);
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context),
+ B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI);
+}
+
+Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(),
+ castToCStr(Ptr, B), B, TLI);
+}
+
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ Type *I32Ty = B.getInt32Ty();
+ return emitLibCall(LibFunc_strchr, I8Ptr, {I8Ptr, I32Ty},
+ {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI);
+}
+
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_strncmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
+ {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr},
+ {castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
+ {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
+}
+
+Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ Type *I8Ptr = B.getInt8PtrTy();
+ return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
+ {castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
+}
+
+Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
+ IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_memcpy_chk))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ AttributeList AS;
+ AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+ Attribute::NoUnwind);
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ FunctionCallee MemCpy = M->getOrInsertFunction(
+ "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
+ B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
+ DL.getIntPtrType(Context));
+ Dst = castToCStr(Dst, B);
+ Src = castToCStr(Src, B);
+ CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
+ if (const Function *F =
+ dyn_cast<Function>(MemCpy.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
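// For reference, the call emitted above targets the usual fortify-source
// entry point, commonly declared in C as
//
//   void *__memcpy_chk(void *dest, const void *src, size_t len, size_t destlen);
//
// which returns dest and aborts at runtime if len exceeds destlen.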
+
Value *llvm::emitMemPCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
@@ -1307,351 +1307,351 @@ Value *llvm::emitMemPCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
{Dst, Src, Len}, B, TLI);
}
-Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_memchr, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr, B), Val, Len}, B, TLI);
-}
-
-Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_memcmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- return emitLibCall(
- LibFunc_bcmp, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
- {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
-}
-
-Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(
- LibFunc_memccpy, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()},
- {Ptr1, Ptr2, Val, Len}, B, TLI);
-}
-
-Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)};
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_memchr, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt32Ty(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr, B), Val, Len}, B, TLI);
+}
+
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_memcmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ return emitLibCall(
+ LibFunc_bcmp, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context)},
+ {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
+}
+
+Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(
+ LibFunc_memccpy, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()},
+ {Ptr1, Ptr2, Val, Len}, B, TLI);
+}
+
+Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)};
llvm::append_range(Args, VariadicArgs);
- return emitLibCall(LibFunc_snprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy()},
- Args, B, TLI, /*IsVaArgs=*/true);
-}
-
-Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)};
+ return emitLibCall(LibFunc_snprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy()},
+ Args, B, TLI, /*IsVaArgs=*/true);
+}
+
+Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)};
llvm::append_range(Args, VariadicArgs);
- return emitLibCall(LibFunc_sprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy()}, Args, B, TLI,
- /*IsVaArgs=*/true);
-}
-
-Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy()},
- {castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI);
-}
-
-Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strlcpy, Size->getType(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strlcat, Size->getType(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
- {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
-}
-
-Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(
- LibFunc_vsnprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()},
- {castToCStr(Dest, B), Size, castToCStr(Fmt, B), VAList}, B, TLI);
-}
-
-Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList,
- IRBuilderBase &B, const TargetLibraryInfo *TLI) {
- return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(),
- {B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()},
- {castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI);
-}
-
-/// Append a suffix to the function name according to the type of 'Op'.
-static void appendTypeSuffix(Value *Op, StringRef &Name,
- SmallString<20> &NameBuffer) {
- if (!Op->getType()->isDoubleTy()) {
- NameBuffer += Name;
-
- if (Op->getType()->isFloatTy())
- NameBuffer += 'f';
- else
- NameBuffer += 'l';
-
- Name = NameBuffer;
- }
-}
-
-static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
- IRBuilderBase &B,
- const AttributeList &Attrs) {
- assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
-
- Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, Op->getType(), Op->getType());
- CallInst *CI = B.CreateCall(Callee, Op, Name);
-
- // The incoming attribute set may have come from a speculatable intrinsic, but
- // is being replaced with a library call which is not allowed to be
- // speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
- const AttributeList &Attrs) {
- SmallString<20> NameBuffer;
- appendTypeSuffix(Op, Name, NameBuffer);
-
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
-}
-
-Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilderBase &B,
- const AttributeList &Attrs) {
- // Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
-
- return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
-}
-
-static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
- StringRef Name, IRBuilderBase &B,
+ return emitLibCall(LibFunc_sprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy()}, Args, B, TLI,
+ /*IsVaArgs=*/true);
+}
+
+Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy()},
+ {castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI);
+}
+
+Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strlcpy, Size->getType(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strlcat, Size->getType(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
+ {castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
+}
+
+Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(
+ LibFunc_vsnprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()},
+ {castToCStr(Dest, B), Size, castToCStr(Fmt, B), VAList}, B, TLI);
+}
+
+Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList,
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
+ return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(),
+ {B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()},
+ {castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI);
+}
+
+/// Append a suffix to the function name according to the type of 'Op'.
+static void appendTypeSuffix(Value *Op, StringRef &Name,
+ SmallString<20> &NameBuffer) {
+ if (!Op->getType()->isDoubleTy()) {
+ NameBuffer += Name;
+
+ if (Op->getType()->isFloatTy())
+ NameBuffer += 'f';
+ else
+ NameBuffer += 'l';
+
+ Name = NameBuffer;
+ }
+}
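// Example of the suffixing above: with Name == "tan", a float operand yields
// "tanf", a double operand leaves the name as "tan", and any other type
// (e.g. long double) yields "tanl".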
+
+static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
+ IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
+
+ Module *M = B.GetInsertBlock()->getModule();
+ FunctionCallee Callee =
+ M->getOrInsertFunction(Name, Op->getType(), Op->getType());
+ CallInst *CI = B.CreateCall(Callee, Op, Name);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+ // is being replaced with a library call which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op, Name, NameBuffer);
+
+ return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ // Get the name of the function according to TLI.
+ StringRef Name = getFloatFnName(TLI, Op->getType(),
+ DoubleFn, FloatFn, LongDoubleFn);
+
+ return emitUnaryFloatFnCallHelper(Op, Name, B, Attrs);
+}
+
+static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
+ StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs,
const TargetLibraryInfo *TLI = nullptr) {
- assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
-
- Module *M = B.GetInsertBlock()->getModule();
- FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(),
- Op1->getType(), Op2->getType());
+ assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
+ Module *M = B.GetInsertBlock()->getModule();
+ FunctionCallee Callee = M->getOrInsertFunction(Name, Op1->getType(),
+ Op1->getType(), Op2->getType());
if (TLI != nullptr)
inferLibFuncAttributes(M, Name, *TLI);
- CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name);
-
- // The incoming attribute set may have come from a speculatable intrinsic, but
- // is being replaced with a library call which is not allowed to be
- // speculatable.
- CI->setAttributes(Attrs.removeAttribute(B.getContext(),
- AttributeList::FunctionIndex,
- Attribute::Speculatable));
- if (const Function *F =
- dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilderBase &B,
- const AttributeList &Attrs) {
- assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
-
- SmallString<20> NameBuffer;
- appendTypeSuffix(Op1, Name, NameBuffer);
-
- return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
-}
-
-Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
- const TargetLibraryInfo *TLI,
- LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilderBase &B,
- const AttributeList &Attrs) {
- // Get the name of the function according to TLI.
- StringRef Name = getFloatFnName(TLI, Op1->getType(),
- DoubleFn, FloatFn, LongDoubleFn);
-
+ CallInst *CI = B.CreateCall(Callee, { Op1, Op2 }, Name);
+
+ // The incoming attribute set may have come from a speculatable intrinsic, but
+ // is being replaced with a library call which is not allowed to be
+ // speculatable.
+ CI->setAttributes(Attrs.removeAttribute(B.getContext(),
+ AttributeList::FunctionIndex,
+ Attribute::Speculatable));
+ if (const Function *F =
+ dyn_cast<Function>(Callee.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
+ IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
+
+ SmallString<20> NameBuffer;
+ appendTypeSuffix(Op1, Name, NameBuffer);
+
+ return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
+}
+
+Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
+ const TargetLibraryInfo *TLI,
+ LibFunc DoubleFn, LibFunc FloatFn,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
+ const AttributeList &Attrs) {
+ // Get the name of the function according to TLI.
+ StringRef Name = getFloatFnName(TLI, Op1->getType(),
+ DoubleFn, FloatFn, LongDoubleFn);
+
return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs, TLI);
-}
-
-Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_putchar))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef PutCharName = TLI->getName(LibFunc_putchar);
- FunctionCallee PutChar =
- M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
- inferLibFuncAttributes(M, PutCharName, *TLI);
- CallInst *CI = B.CreateCall(PutChar,
- B.CreateIntCast(Char,
- B.getInt32Ty(),
- /*isSigned*/true,
- "chari"),
- PutCharName);
-
- if (const Function *F =
- dyn_cast<Function>(PutChar.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_puts))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef PutsName = TLI->getName(LibFunc_puts);
- FunctionCallee PutS =
- M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
- inferLibFuncAttributes(M, PutsName, *TLI);
- CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
- if (const Function *F =
- dyn_cast<Function>(PutS.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutcName = TLI->getName(LibFunc_fputc);
- FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(),
- B.getInt32Ty(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutcName, *TLI);
- Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
- "chari");
- CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputs))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutsName = TLI->getName(LibFunc_fputs);
- FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
- B.getInt8PtrTy(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutsName, *TLI);
- CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
- const DataLayout &DL, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fwrite))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FWriteName = TLI->getName(LibFunc_fwrite);
- FunctionCallee F = M->getOrInsertFunction(
- FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
-
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FWriteName, *TLI);
- CallInst *CI =
- B.CreateCall(F, {castToCStr(Ptr, B), Size,
- ConstantInt::get(DL.getIntPtrType(Context), 1), File});
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_malloc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef MallocName = TLI->getName(LibFunc_malloc);
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
- DL.getIntPtrType(Context));
- inferLibFuncAttributes(M, MallocName, *TLI);
- CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
-
- if (const Function *F =
- dyn_cast<Function>(Malloc.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
-
-Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilderBase &B, const TargetLibraryInfo &TLI) {
- if (!TLI.has(LibFunc_calloc))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef CallocName = TLI.getName(LibFunc_calloc);
- const DataLayout &DL = M->getDataLayout();
- IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
- FunctionCallee Calloc = M->getOrInsertFunction(
- CallocName, Attrs, B.getInt8PtrTy(), PtrType, PtrType);
- inferLibFuncAttributes(M, CallocName, TLI);
- CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
-
- if (const auto *F =
- dyn_cast<Function>(Calloc.getCallee()->stripPointerCasts()))
- CI->setCallingConv(F->getCallingConv());
-
- return CI;
-}
+}
+
+Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_putchar))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef PutCharName = TLI->getName(LibFunc_putchar);
+ FunctionCallee PutChar =
+ M->getOrInsertFunction(PutCharName, B.getInt32Ty(), B.getInt32Ty());
+ inferLibFuncAttributes(M, PutCharName, *TLI);
+ CallInst *CI = B.CreateCall(PutChar,
+ B.CreateIntCast(Char,
+ B.getInt32Ty(),
+ /*isSigned*/true,
+ "chari"),
+ PutCharName);
+
+ if (const Function *F =
+ dyn_cast<Function>(PutChar.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_puts))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef PutsName = TLI->getName(LibFunc_puts);
+ FunctionCallee PutS =
+ M->getOrInsertFunction(PutsName, B.getInt32Ty(), B.getInt8PtrTy());
+ inferLibFuncAttributes(M, PutsName, *TLI);
+ CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), PutsName);
+ if (const Function *F =
+ dyn_cast<Function>(PutS.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutcName = TLI->getName(LibFunc_fputc);
+ FunctionCallee F = M->getOrInsertFunction(FPutcName, B.getInt32Ty(),
+ B.getInt32Ty(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FPutcName, *TLI);
+ Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
+ "chari");
+ CallInst *CI = B.CreateCall(F, {Char, File}, FPutcName);
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fputs))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef FPutsName = TLI->getName(LibFunc_fputs);
+ FunctionCallee F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(),
+ B.getInt8PtrTy(), File->getType());
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FPutsName, *TLI);
+ CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsName);
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
+ const DataLayout &DL, const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_fwrite))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ StringRef FWriteName = TLI->getName(LibFunc_fwrite);
+ FunctionCallee F = M->getOrInsertFunction(
+ FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
+ DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
+ if (File->getType()->isPointerTy())
+ inferLibFuncAttributes(M, FWriteName, *TLI);
+ CallInst *CI =
+ B.CreateCall(F, {castToCStr(Ptr, B), Size,
+ ConstantInt::get(DL.getIntPtrType(Context), 1), File});
+
+ if (const Function *Fn =
+ dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(Fn->getCallingConv());
+ return CI;
+}
+
+Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (!TLI->has(LibFunc_malloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef MallocName = TLI->getName(LibFunc_malloc);
+ LLVMContext &Context = B.GetInsertBlock()->getContext();
+ FunctionCallee Malloc = M->getOrInsertFunction(MallocName, B.getInt8PtrTy(),
+ DL.getIntPtrType(Context));
+ inferLibFuncAttributes(M, MallocName, *TLI);
+ CallInst *CI = B.CreateCall(Malloc, Num, MallocName);
+
+ if (const Function *F =
+ dyn_cast<Function>(Malloc.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
+
+Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
+ IRBuilderBase &B, const TargetLibraryInfo &TLI) {
+ if (!TLI.has(LibFunc_calloc))
+ return nullptr;
+
+ Module *M = B.GetInsertBlock()->getModule();
+ StringRef CallocName = TLI.getName(LibFunc_calloc);
+ const DataLayout &DL = M->getDataLayout();
+ IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
+ FunctionCallee Calloc = M->getOrInsertFunction(
+ CallocName, Attrs, B.getInt8PtrTy(), PtrType, PtrType);
+ inferLibFuncAttributes(M, CallocName, TLI);
+ CallInst *CI = B.CreateCall(Calloc, {Num, Size}, CallocName);
+
+ if (const auto *F =
+ dyn_cast<Function>(Calloc.getCallee()->stripPointerCasts()))
+ CI->setCallingConv(F->getCallingConv());
+
+ return CI;
+}
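// Minimal usage sketch for the emitters in this file, assuming an IRBuilderBase
// B positioned at the insertion point and DL/TLI supplied by the surrounding
// pass (the variable names here are illustrative):
//
//   if (Value *Len = emitStrLen(Str, B, DL, &TLI))
//     ... use Len (the emitted strlen() call) ...
//
// Each emitter returns nullptr when TargetLibraryInfo reports the library
// routine as unavailable, so callers must be prepared to bail out.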
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
index 4299153e7b..833d042106 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -1,482 +1,482 @@
-//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains an optimization for div and rem on architectures that
-// execute short instructions significantly faster than longer instructions.
-// For example, on Intel Atom 32-bit divides are slow enough that during
-// runtime it is profitable to check the value of the operands, and if they are
-// positive and less than 256 use an unsigned 8-bit divide.
-//
-//===----------------------------------------------------------------------===//
-
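// A minimal sketch of the shape this pass aims to produce, written as C for
// clarity and assuming a 32-bit division bypassed through an 8-bit type:
//
//   uint32_t div(uint32_t a, uint32_t b) {
//     if (((a | b) & ~0xFFu) == 0)        // both operands fit in 8 bits
//       return (uint8_t)a / (uint8_t)b;   // cheap narrow divide
//     return a / b;                       // full-width divide
//   }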
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/KnownBits.h"
-#include <cassert>
-#include <cstdint>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "bypass-slow-division"
-
-namespace {
-
- struct QuotRemPair {
- Value *Quotient;
- Value *Remainder;
-
- QuotRemPair(Value *InQuotient, Value *InRemainder)
- : Quotient(InQuotient), Remainder(InRemainder) {}
- };
-
- /// A quotient and remainder, plus a BB from which they logically "originate".
- /// If you use Quotient or Remainder in a Phi node, you should use BB as its
- /// corresponding predecessor.
- struct QuotRemWithBB {
- BasicBlock *BB = nullptr;
- Value *Quotient = nullptr;
- Value *Remainder = nullptr;
- };
-
-using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
-using BypassWidthsTy = DenseMap<unsigned, unsigned>;
-using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
-
-enum ValueRange {
- /// Operand definitely fits into BypassType. No runtime checks are needed.
- VALRNG_KNOWN_SHORT,
- /// A runtime check is required, as value range is unknown.
- VALRNG_UNKNOWN,
- /// Operand is unlikely to fit into BypassType. The bypassing should be
- /// disabled.
- VALRNG_LIKELY_LONG
-};
-
-class FastDivInsertionTask {
- bool IsValidTask = false;
- Instruction *SlowDivOrRem = nullptr;
- IntegerType *BypassType = nullptr;
- BasicBlock *MainBB = nullptr;
-
- bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
- ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
- QuotRemWithBB createSlowBB(BasicBlock *Successor);
- QuotRemWithBB createFastBB(BasicBlock *Successor);
- QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
- BasicBlock *PhiBB);
- Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
- Optional<QuotRemPair> insertFastDivAndRem();
-
- bool isSignedOp() {
- return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
- SlowDivOrRem->getOpcode() == Instruction::SRem;
- }
-
- bool isDivisionOp() {
- return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
- SlowDivOrRem->getOpcode() == Instruction::UDiv;
- }
-
- Type *getSlowType() { return SlowDivOrRem->getType(); }
-
-public:
- FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
-
- Value *getReplacement(DivCacheTy &Cache);
-};
-
-} // end anonymous namespace
-
-FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
- const BypassWidthsTy &BypassWidths) {
- switch (I->getOpcode()) {
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- SlowDivOrRem = I;
- break;
- default:
- // I is not a div/rem operation.
- return;
- }
-
- // Skip division on vector types. Only optimize integer instructions.
- IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
- if (!SlowType)
- return;
-
- // Skip if this bitwidth is not bypassed.
- auto BI = BypassWidths.find(SlowType->getBitWidth());
- if (BI == BypassWidths.end())
- return;
-
- // Get type for div/rem instruction with bypass bitwidth.
- IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
- BypassType = BT;
-
- // The original basic block.
- MainBB = I->getParent();
-
- // The instruction is indeed a slow div or rem operation.
- IsValidTask = true;
-}
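// Illustrative example: with BypassWidths = {{32, 8}}, a 32-bit udiv/sdiv/
// urem/srem becomes a valid task and BypassType is set to i8, while a 64-bit
// division is skipped because 64 is not a key in the map.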
-
-/// Reuses the previously-computed quotient or remainder from the current BB if
-/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
-/// perform the optimization and caches the resulting quotient and remainder.
-/// If no replacement can be generated, nullptr is returned.
-Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
- // First, make sure that the task is valid.
- if (!IsValidTask)
- return nullptr;
-
- // Then, look for a value in Cache.
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
- DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
- auto CacheI = Cache.find(Key);
-
- if (CacheI == Cache.end()) {
- // If previous instance does not exist, try to insert fast div.
- Optional<QuotRemPair> OptResult = insertFastDivAndRem();
- // Bail out if insertFastDivAndRem has failed.
- if (!OptResult)
- return nullptr;
- CacheI = Cache.insert({Key, *OptResult}).first;
- }
-
- QuotRemPair &Value = CacheI->second;
- return isDivisionOp() ? Value.Quotient : Value.Remainder;
-}
-
-/// Check if a value looks like a hash.
-///
-/// The routine is expected to detect values computed using the most common hash
-/// algorithms. Typically, hash computations end with one of the following
-/// instructions:
-///
-/// 1) MUL with a constant wider than BypassType
-/// 2) XOR instruction
-///
-/// And even if we are wrong and the value is not a hash, it is still quite
-/// unlikely that such values will fit into BypassType.
-///
-/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
-/// It is implemented as a depth-first search for values that look neither long
-/// nor hash-like.
-bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I)
- return false;
-
- switch (I->getOpcode()) {
- case Instruction::Xor:
- return true;
- case Instruction::Mul: {
- // After Constant Hoisting pass, long constants may be represented as
- // bitcast instructions. As a result, some constants may look like an
- // instruction at first, and an additional check is necessary to find out if
- // an operand is actually a constant.
- Value *Op1 = I->getOperand(1);
- ConstantInt *C = dyn_cast<ConstantInt>(Op1);
- if (!C && isa<BitCastInst>(Op1))
- C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
- return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
- }
- case Instruction::PHI:
-    // Stop IR traversal in case of crazy input code. This limits recursion
- // depth.
- if (Visited.size() >= 16)
- return false;
- // Do not visit nodes that have been visited already. We return true because
- // it means that we couldn't find any value that doesn't look hash-like.
- if (!Visited.insert(I).second)
- return true;
- return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
- // Ignore undef values as they probably don't affect the division
- // operands.
- return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
- isa<UndefValue>(V);
- });
- default:
- return false;
- }
-}
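// Illustrative example of a value this heuristic flags: an FNV-style hash such
// as h = (h ^ c) * 16777619u ends in a multiply by a constant wider than an
// 8- or 16-bit BypassType, so any division it later feeds is classified as
// VALRNG_LIKELY_LONG and left on the slow path.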
-
-/// Check if an integer value fits into our bypass type.
-ValueRange FastDivInsertionTask::getValueRange(Value *V,
- VisitedSetTy &Visited) {
- unsigned ShortLen = BypassType->getBitWidth();
- unsigned LongLen = V->getType()->getIntegerBitWidth();
-
- assert(LongLen > ShortLen && "Value type must be wider than BypassType");
- unsigned HiBits = LongLen - ShortLen;
-
- const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
- KnownBits Known(LongLen);
-
- computeKnownBits(V, Known, DL);
-
- if (Known.countMinLeadingZeros() >= HiBits)
- return VALRNG_KNOWN_SHORT;
-
- if (Known.countMaxLeadingZeros() < HiBits)
- return VALRNG_LIKELY_LONG;
-
- // Long integer divisions are often used in hashtable implementations. It's
- // not worth bypassing such divisions because hash values are extremely
- // unlikely to have enough leading zeros. The call below tries to detect
- // values that are unlikely to fit BypassType (including hashes).
- if (isHashLikeValue(V, Visited))
- return VALRNG_LIKELY_LONG;
-
- return VALRNG_UNKNOWN;
-}
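// Worked example: for a 32-bit operand and an 8-bit BypassType, HiBits is
// 32 - 8 = 24. A value with at least 24 known leading zero bits (e.g. one
// produced by zext i8 ... to i32) is VALRNG_KNOWN_SHORT; a value known to have
// a set bit somewhere in those top 24 bits is VALRNG_LIKELY_LONG; everything
// else falls back to VALRNG_UNKNOWN and gets the runtime check.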
-
-/// Add new basic block for slow div and rem operations and put it before
-/// SuccessorBB.
-QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
- QuotRemWithBB DivRemPair;
- DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
- MainBB->getParent(), SuccessorBB);
- IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
-
- if (isSignedOp()) {
- DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
- DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
- } else {
- DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
- DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
- }
-
- Builder.CreateBr(SuccessorBB);
- return DivRemPair;
-}
-
-/// Add new basic block for fast div and rem operations and put it before
-/// SuccessorBB.
-QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
- QuotRemWithBB DivRemPair;
- DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
- MainBB->getParent(), SuccessorBB);
- IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
- Value *ShortDivisorV =
- Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
- Value *ShortDividendV =
- Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
-
- // udiv/urem because this optimization only handles positive numbers.
- Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
- Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
- DivRemPair.Quotient =
- Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
- DivRemPair.Remainder =
- Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
- Builder.CreateBr(SuccessorBB);
-
- return DivRemPair;
-}
-
-/// Creates Phi nodes for result of Div and Rem.
-QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
- QuotRemWithBB &RHS,
- BasicBlock *PhiBB) {
- IRBuilder<> Builder(PhiBB, PhiBB->begin());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
- PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
- QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
- QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
- PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
- RemPhi->addIncoming(LHS.Remainder, LHS.BB);
- RemPhi->addIncoming(RHS.Remainder, RHS.BB);
- return QuotRemPair(QuoPhi, RemPhi);
-}
-
-/// Creates a runtime check to test whether both the divisor and dividend fit
-/// into BypassType. The check is inserted at the end of MainBB. True return
-/// value means that the operands fit. Either of the operands may be NULL if it
-/// doesn't need a runtime check.
-Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
- assert((Op1 || Op2) && "Nothing to check");
- IRBuilder<> Builder(MainBB, MainBB->end());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- Value *OrV;
- if (Op1 && Op2)
- OrV = Builder.CreateOr(Op1, Op2);
- else
- OrV = Op1 ? Op1 : Op2;
-
- // BitMask is inverted to check if the operands are
- // larger than the bypass type
- uint64_t BitMask = ~BypassType->getBitMask();
- Value *AndV = Builder.CreateAnd(OrV, BitMask);
-
- // Compare operand values
- Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
- return Builder.CreateICmpEQ(AndV, ZeroV);
-}
-
-/// Substitutes the div/rem instruction with code that checks the value of the
-/// operands and uses a shorter-faster div/rem instruction when possible.
-Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
- Value *Dividend = SlowDivOrRem->getOperand(0);
- Value *Divisor = SlowDivOrRem->getOperand(1);
-
- VisitedSetTy SetL;
- ValueRange DividendRange = getValueRange(Dividend, SetL);
- if (DividendRange == VALRNG_LIKELY_LONG)
- return None;
-
- VisitedSetTy SetR;
- ValueRange DivisorRange = getValueRange(Divisor, SetR);
- if (DivisorRange == VALRNG_LIKELY_LONG)
- return None;
-
- bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
- bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
-
- if (DividendShort && DivisorShort) {
- // If both operands are known to be short then just replace the long
- // division with a short one in-place. Since we're not introducing control
- // flow in this case, narrowing the division is always a win, even if the
- // divisor is a constant (and will later get replaced by a multiplication).
-
- IRBuilder<> Builder(SlowDivOrRem);
- Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
- Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
- Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
- Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
- Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
- Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
- return QuotRemPair(ExtDiv, ExtRem);
- }
-
- if (isa<ConstantInt>(Divisor)) {
- // If the divisor is a constant, DAGCombiner will convert the division into
- // a multiplication by a magic constant. It isn't clear if it is worth
- // introducing control flow to get a narrower multiply.
- return None;
- }
-
- // After Constant Hoisting pass, long constants may be represented as
- // bitcast instructions. As a result, some constants may look like an
- // instruction at first, and an additional check is necessary to find out if
- // an operand is actually a constant.
- if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
- if (BCI->getParent() == SlowDivOrRem->getParent() &&
- isa<ConstantInt>(BCI->getOperand(0)))
- return None;
-
- IRBuilder<> Builder(MainBB, MainBB->end());
- Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
-
- if (DividendShort && !isSignedOp()) {
- // If the division is unsigned and Dividend is known to be short, then
- // either
- // 1) Divisor is less or equal to Dividend, and the result can be computed
- // with a short division.
- // 2) Divisor is greater than Dividend. In this case, no division is needed
- // at all: The quotient is 0 and the remainder is equal to Dividend.
- //
- // So instead of checking at runtime whether Divisor fits into BypassType,
- // we emit a runtime check to differentiate between these two cases. This
- // lets us entirely avoid a long div.
-
- // Split the basic block before the div/rem.
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
- // Remove the unconditional branch from MainBB to SuccessorBB.
- MainBB->getInstList().back().eraseFromParent();
- QuotRemWithBB Long;
- Long.BB = MainBB;
- Long.Quotient = ConstantInt::get(getSlowType(), 0);
- Long.Remainder = Dividend;
- QuotRemWithBB Fast = createFastBB(SuccessorBB);
- QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
- Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
- Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
- return Result;
- } else {
- // General case. Create both slow and fast div/rem pairs and choose one of
- // them at runtime.
-
- // Split the basic block before the div/rem.
- BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
- // Remove the unconditional branch from MainBB to SuccessorBB.
- MainBB->getInstList().back().eraseFromParent();
- QuotRemWithBB Fast = createFastBB(SuccessorBB);
- QuotRemWithBB Slow = createSlowBB(SuccessorBB);
- QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
- Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
- DivisorShort ? nullptr : Divisor);
- Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
- return Result;
- }
-}
-
-/// This optimization identifies DIV/REM instructions in a BB that can be
-/// profitably bypassed and carried out with a shorter, faster divide.
-bool llvm::bypassSlowDivision(BasicBlock *BB,
- const BypassWidthsTy &BypassWidths) {
- DivCacheTy PerBBDivCache;
-
- bool MadeChange = false;
- Instruction *Next = &*BB->begin();
- while (Next != nullptr) {
- // We may add instructions immediately after I, but we want to skip over
- // them.
- Instruction *I = Next;
- Next = Next->getNextNode();
-
- // Ignore dead code to save time and avoid bugs.
- if (I->hasNUses(0))
- continue;
-
- FastDivInsertionTask Task(I, BypassWidths);
- if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
- I->replaceAllUsesWith(Replacement);
- I->eraseFromParent();
- MadeChange = true;
- }
- }
-
- // Above we eagerly create divs and rems, as pairs, so that we can efficiently
- // create divrem machine instructions. Now erase any unused divs / rems so we
- // don't leave extra instructions sitting around.
- for (auto &KV : PerBBDivCache)
- for (Value *V : {KV.second.Quotient, KV.second.Remainder})
- RecursivelyDeleteTriviallyDeadInstructions(V);
-
- return MadeChange;
-}
+//===- BypassSlowDivision.cpp - Bypass slow division ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an optimization for div and rem on architectures that
+// execute short instructions significantly faster than longer instructions.
+// For example, on Intel Atom 32-bit divides are slow enough that it is
+// profitable to check the values of the operands at runtime and, if both are
+// positive and less than 256, use an unsigned 8-bit divide instead.
+//
+//===----------------------------------------------------------------------===//
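As a rough, standalone illustration (not part of this file) of the control flow the pass emits, the following C++ sketch mirrors the runtime check and the two paths for a 32-bit unsigned division with an 8-bit bypass width; the function name and the 0xFF mask are assumptions chosen for the example:

static unsigned bypassedUDiv32(unsigned Dividend, unsigned Divisor) {
  // Fast path: both operands fit into 8 bits, so an 8-bit divide suffices.
  if (((Dividend | Divisor) & ~0xFFu) == 0)
    return (unsigned char)Dividend / (unsigned char)Divisor;
  // Slow path: fall back to the full-width divide.
  return Dividend / Divisor;
}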
+
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/KnownBits.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "bypass-slow-division"
+
+namespace {
+
+ struct QuotRemPair {
+ Value *Quotient;
+ Value *Remainder;
+
+ QuotRemPair(Value *InQuotient, Value *InRemainder)
+ : Quotient(InQuotient), Remainder(InRemainder) {}
+ };
+
+ /// A quotient and remainder, plus a BB from which they logically "originate".
+ /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+ /// corresponding predecessor.
+ struct QuotRemWithBB {
+ BasicBlock *BB = nullptr;
+ Value *Quotient = nullptr;
+ Value *Remainder = nullptr;
+ };
+
+using DivCacheTy = DenseMap<DivRemMapKey, QuotRemPair>;
+using BypassWidthsTy = DenseMap<unsigned, unsigned>;
+using VisitedSetTy = SmallPtrSet<Instruction *, 4>;
+
+enum ValueRange {
+ /// Operand definitely fits into BypassType. No runtime checks are needed.
+ VALRNG_KNOWN_SHORT,
+ /// A runtime check is required, as value range is unknown.
+ VALRNG_UNKNOWN,
+ /// Operand is unlikely to fit into BypassType. The bypassing should be
+ /// disabled.
+ VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+ bool IsValidTask = false;
+ Instruction *SlowDivOrRem = nullptr;
+ IntegerType *BypassType = nullptr;
+ BasicBlock *MainBB = nullptr;
+
+ bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+ ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+ QuotRemWithBB createSlowBB(BasicBlock *Successor);
+ QuotRemWithBB createFastBB(BasicBlock *Successor);
+ QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+ BasicBlock *PhiBB);
+ Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+ Optional<QuotRemPair> insertFastDivAndRem();
+
+ bool isSignedOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::SRem;
+ }
+
+ bool isDivisionOp() {
+ return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+ SlowDivOrRem->getOpcode() == Instruction::UDiv;
+ }
+
+ Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+ FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+
+ Value *getReplacement(DivCacheTy &Cache);
+};
+
+} // end anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+ const BypassWidthsTy &BypassWidths) {
+ switch (I->getOpcode()) {
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ SlowDivOrRem = I;
+ break;
+ default:
+ // I is not a div/rem operation.
+ return;
+ }
+
+ // Skip division on vector types. Only optimize integer instructions.
+ IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+ if (!SlowType)
+ return;
+
+ // Skip if this bitwidth is not bypassed.
+ auto BI = BypassWidths.find(SlowType->getBitWidth());
+ if (BI == BypassWidths.end())
+ return;
+
+ // Get type for div/rem instruction with bypass bitwidth.
+ IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+ BypassType = BT;
+
+ // The original basic block.
+ MainBB = I->getParent();
+
+ // The instruction is indeed a slow div or rem operation.
+ IsValidTask = true;
+}
+
+/// Reuses a previously-computed quotient or remainder from the current BB if
+/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
+/// perform the optimization and caches the resulting quotient and remainder.
+/// If no replacement can be generated, nullptr is returned.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+ // First, make sure that the task is valid.
+ if (!IsValidTask)
+ return nullptr;
+
+ // Then, look for a value in Cache.
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ DivRemMapKey Key(isSignedOp(), Dividend, Divisor);
+ auto CacheI = Cache.find(Key);
+
+ if (CacheI == Cache.end()) {
+ // If previous instance does not exist, try to insert fast div.
+ Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+ // Bail out if insertFastDivAndRem has failed.
+ if (!OptResult)
+ return nullptr;
+ CacheI = Cache.insert({Key, *OptResult}).first;
+ }
+
+ QuotRemPair &Value = CacheI->second;
+ return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ switch (I->getOpcode()) {
+ case Instruction::Xor:
+ return true;
+ case Instruction::Mul: {
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ Value *Op1 = I->getOperand(1);
+ ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+ if (!C && isa<BitCastInst>(Op1))
+ C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+ return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+ }
+ case Instruction::PHI:
+ // Stop IR traversal in case of a crazy input code. This limits recursion
+ // depth.
+ if (Visited.size() >= 16)
+ return false;
+ // Do not visit nodes that have been visited already. We return true because
+ // it means that we couldn't find any value that doesn't look hash-like.
+ if (!Visited.insert(I).second)
+ return true;
+ return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+ // Ignore undef values as they probably don't affect the division
+ // operands.
+ return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+ isa<UndefValue>(V);
+ });
+ default:
+ return false;
+ }
+}
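For intuition about what this heuristic catches, here is a self-contained example (hypothetical input, not part of this file): a classic 32-bit FNV-1a string hash. The value that later feeds a urem ends in a multiply by 16777619, a constant wider than any typical bypass type, which is exactly the "MUL with a constant wider than BypassType" case handled above:

#include <cstddef>
#include <cstdint>

static uint32_t fnv1a(const char *S, std::size_t N) {
  uint32_t H = 2166136261u;            // FNV offset basis
  for (std::size_t I = 0; I < N; ++I) {
    H ^= static_cast<unsigned char>(S[I]);
    H *= 16777619u;                    // FNV prime, wider than i8/i16
  }
  return H;
}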
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+ VisitedSetTy &Visited) {
+ unsigned ShortLen = BypassType->getBitWidth();
+ unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+ assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+ unsigned HiBits = LongLen - ShortLen;
+
+ const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+ KnownBits Known(LongLen);
+
+ computeKnownBits(V, Known, DL);
+
+ if (Known.countMinLeadingZeros() >= HiBits)
+ return VALRNG_KNOWN_SHORT;
+
+ if (Known.countMaxLeadingZeros() < HiBits)
+ return VALRNG_LIKELY_LONG;
+
+ // Long integer divisions are often used in hashtable implementations. It's
+ // not worth bypassing such divisions because hash values are extremely
+ // unlikely to have enough leading zeros. The call below tries to detect
+ // values that are unlikely to fit BypassType (including hashes).
+ if (isHashLikeValue(V, Visited))
+ return VALRNG_LIKELY_LONG;
+
+ return VALRNG_UNKNOWN;
+}
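Outside the pass, the KNOWN_SHORT classification is just a known-bits query. A minimal sketch, assuming an integer-typed llvm::Value and the ValueTracking/KnownBits headers already pulled in above; the helper name is illustrative:

static bool knownToFitBypassType(const llvm::Value *V, unsigned ShortLen,
                                 const llvm::DataLayout &DL) {
  // True iff the top (LongLen - ShortLen) bits of V are known zero, i.e. the
  // value would be classified VALRNG_KNOWN_SHORT by getValueRange().
  unsigned LongLen = V->getType()->getIntegerBitWidth();
  llvm::KnownBits Known = llvm::computeKnownBits(V, DL);
  return LongLen > ShortLen &&
         Known.countMinLeadingZeros() >= LongLen - ShortLen;
}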
+
+/// Add new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ if (isSignedOp()) {
+ DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
+ } else {
+ DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
+ }
+
+ Builder.CreateBr(SuccessorBB);
+ return DivRemPair;
+}
+
+/// Add new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+ QuotRemWithBB DivRemPair;
+ DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+ MainBB->getParent(), SuccessorBB);
+ IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+ Value *ShortDivisorV =
+ Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+ Value *ShortDividendV =
+ Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+ // udiv/urem because this optimization only handles positive numbers.
+ Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+ Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+ DivRemPair.Quotient =
+ Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+ DivRemPair.Remainder =
+ Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+ Builder.CreateBr(SuccessorBB);
+
+ return DivRemPair;
+}
+
+/// Creates Phi nodes for result of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+ QuotRemWithBB &RHS,
+ BasicBlock *PhiBB) {
+ IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+ PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+ QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+ QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+ PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+ RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+ RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+ return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. True return
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+ assert((Op1 || Op2) && "Nothing to check");
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ Value *OrV;
+ if (Op1 && Op2)
+ OrV = Builder.CreateOr(Op1, Op2);
+ else
+ OrV = Op1 ? Op1 : Op2;
+
+ // BitMask is inverted to check if the operands are
+ // larger than the bypass type
+ uint64_t BitMask = ~BypassType->getBitMask();
+ Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+ // Compare operand values
+ Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+ return Builder.CreateICmpEQ(AndV, ZeroV);
+}
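For concreteness, with an i32 slow type and an i8 bypass type, getBitMask() is 0xFF and the emitted IR reduces to the predicate below (a hedged C++ restatement, not code from this file):

#include <cstdint>

static bool operandsFitIn8Bits(uint32_t Op1, uint32_t Op2) {
  // ~0xFF truncated to 32 bits is 0xFFFFFF00: true iff the upper 24 bits of
  // both operands are clear, so the 8-bit fast path is safe to take.
  return ((Op1 | Op2) & 0xFFFFFF00u) == 0;
}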
+
+/// Substitutes the div/rem instruction with code that checks the value of the
+/// operands and uses a shorter-faster div/rem instruction when possible.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+ Value *Dividend = SlowDivOrRem->getOperand(0);
+ Value *Divisor = SlowDivOrRem->getOperand(1);
+
+ VisitedSetTy SetL;
+ ValueRange DividendRange = getValueRange(Dividend, SetL);
+ if (DividendRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ VisitedSetTy SetR;
+ ValueRange DivisorRange = getValueRange(Divisor, SetR);
+ if (DivisorRange == VALRNG_LIKELY_LONG)
+ return None;
+
+ bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+ bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+ if (DividendShort && DivisorShort) {
+ // If both operands are known to be short then just replace the long
+ // division with a short one in-place. Since we're not introducing control
+ // flow in this case, narrowing the division is always a win, even if the
+ // divisor is a constant (and will later get replaced by a multiplication).
+
+ IRBuilder<> Builder(SlowDivOrRem);
+ Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+ Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+ Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+ Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+ Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+ Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+ return QuotRemPair(ExtDiv, ExtRem);
+ }
+
+ if (isa<ConstantInt>(Divisor)) {
+ // If the divisor is a constant, DAGCombiner will convert the division into
+ // a multiplication by a magic constant. It isn't clear if it is worth
+ // introducing control flow to get a narrower multiply.
+ return None;
+ }
+
+ // After Constant Hoisting pass, long constants may be represented as
+ // bitcast instructions. As a result, some constants may look like an
+ // instruction at first, and an additional check is necessary to find out if
+ // an operand is actually a constant.
+ if (auto *BCI = dyn_cast<BitCastInst>(Divisor))
+ if (BCI->getParent() == SlowDivOrRem->getParent() &&
+ isa<ConstantInt>(BCI->getOperand(0)))
+ return None;
+
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
+ if (DividendShort && !isSignedOp()) {
+ // If the division is unsigned and Dividend is known to be short, then
+ // either
+ // 1) Divisor is less or equal to Dividend, and the result can be computed
+ // with a short division.
+ // 2) Divisor is greater than Dividend. In this case, no division is needed
+ // at all: The quotient is 0 and the remainder is equal to Dividend.
+ //
+ // So instead of checking at runtime whether Divisor fits into BypassType,
+ // we emit a runtime check to differentiate between these two cases. This
+ // lets us entirely avoid a long div.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Long;
+ Long.BB = MainBB;
+ Long.Quotient = ConstantInt::get(getSlowType(), 0);
+ Long.Remainder = Dividend;
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+ Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+ return Result;
+ } else {
+ // General case. Create both slow and fast div/rem pairs and choose one of
+ // them at runtime.
+
+ // Split the basic block before the div/rem.
+ BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+ // Remove the unconditional branch from MainBB to SuccessorBB.
+ MainBB->getInstList().back().eraseFromParent();
+ QuotRemWithBB Fast = createFastBB(SuccessorBB);
+ QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+ QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+ Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+ DivisorShort ? nullptr : Divisor);
+ Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+ return Result;
+ }
+}
+
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+ const BypassWidthsTy &BypassWidths) {
+ DivCacheTy PerBBDivCache;
+
+ bool MadeChange = false;
+ Instruction *Next = &*BB->begin();
+ while (Next != nullptr) {
+ // We may add instructions immediately after I, but we want to skip over
+ // them.
+ Instruction *I = Next;
+ Next = Next->getNextNode();
+
+ // Ignore dead code to save time and avoid bugs.
+ if (I->hasNUses(0))
+ continue;
+
+ FastDivInsertionTask Task(I, BypassWidths);
+ if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ MadeChange = true;
+ }
+ }
+
+ // Above we eagerly create divs and rems, as pairs, so that we can efficiently
+ // create divrem machine instructions. Now erase any unused divs / rems so we
+ // don't leave extra instructions sitting around.
+ for (auto &KV : PerBBDivCache)
+ for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+ RecursivelyDeleteTriviallyDeadInstructions(V);
+
+ return MadeChange;
+}
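A hedged sketch of how a caller is expected to drive this entry point; the block-iteration pattern mirrors CodeGenPrepare-style users, while the 32 -> 8 mapping and the wrapper name are assumptions made for the example:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"

static bool bypassSlowDivisionInFunction(llvm::Function &F) {
  if (F.empty())
    return false;
  llvm::DenseMap<unsigned, unsigned> BypassWidths;
  BypassWidths[32] = 8; // bypass 32-bit div/rem with 8-bit udiv/urem
  bool Changed = false;
  llvm::BasicBlock *BB = &*F.begin();
  while (BB) {
    // bypassSlowDivision may split BB and insert new blocks; remember the
    // next original block first so the newly created ones are not revisited.
    llvm::BasicBlock *Next = BB->getNextNode();
    Changed |= llvm::bypassSlowDivision(BB, BypassWidths);
    BB = Next;
  }
  return Changed;
}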
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
index f3facac06f..b2763900e1 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -1,168 +1,168 @@
-//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// This file provides interfaces used to manipulate a call graph, regardless
-/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CallGraphUpdater.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-bool CallGraphUpdater::finalize() {
- if (!DeadFunctionsInComdats.empty()) {
- filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
- DeadFunctionsInComdats);
- DeadFunctions.append(DeadFunctionsInComdats.begin(),
- DeadFunctionsInComdats.end());
- }
-
- if (CG) {
- // First remove all references, e.g., outgoing via called functions. This is
- // necessary as we can delete functions that have circular references.
- for (Function *DeadFn : DeadFunctions) {
- DeadFn->removeDeadConstantUsers();
- CallGraphNode *DeadCGN = (*CG)[DeadFn];
- DeadCGN->removeAllCalledFunctions();
- CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN);
- DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
- }
-
- // Then remove the node and function from the module.
- for (Function *DeadFn : DeadFunctions) {
- CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn);
- assert(DeadCGN->getNumReferences() == 0 &&
- "References should have been handled by now");
- delete CG->removeFunctionFromModule(DeadCGN);
- }
- } else {
- // This is the code path for the new lazy call graph and for the case where
- // no call graph was provided.
- for (Function *DeadFn : DeadFunctions) {
- DeadFn->removeDeadConstantUsers();
- DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
-
- if (LCG && !ReplacedFunctions.count(DeadFn)) {
- // Taken mostly from the inliner:
- LazyCallGraph::Node &N = LCG->get(*DeadFn);
- auto *DeadSCC = LCG->lookupSCC(N);
- assert(DeadSCC && DeadSCC->size() == 1 &&
- &DeadSCC->begin()->getFunction() == DeadFn);
- auto &DeadRC = DeadSCC->getOuterRefSCC();
-
- FunctionAnalysisManager &FAM =
- AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG)
- .getManager();
-
- FAM.clear(*DeadFn, DeadFn->getName());
- AM->clear(*DeadSCC, DeadSCC->getName());
- LCG->removeDeadFunction(*DeadFn);
-
- // Mark the relevant parts of the call graph as invalid so we don't
- // visit them.
- UR->InvalidatedSCCs.insert(DeadSCC);
- UR->InvalidatedRefSCCs.insert(&DeadRC);
- }
-
- // The function is now really dead and detached from everything.
- DeadFn->eraseFromParent();
- }
- }
-
- bool Changed = !DeadFunctions.empty();
- DeadFunctionsInComdats.clear();
- DeadFunctions.clear();
- return Changed;
-}
-
-void CallGraphUpdater::reanalyzeFunction(Function &Fn) {
- if (CG) {
- CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn);
- OldCGN->removeAllCalledFunctions();
- CG->populateCallGraphNode(OldCGN);
- } else if (LCG) {
- LazyCallGraph::Node &N = LCG->get(Fn);
- LazyCallGraph::SCC *C = LCG->lookupSCC(N);
- updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM);
- }
-}
-
+//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides interfaces used to manipulate a call graph, regardless
+/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+bool CallGraphUpdater::finalize() {
+ if (!DeadFunctionsInComdats.empty()) {
+ filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
+ DeadFunctionsInComdats);
+ DeadFunctions.append(DeadFunctionsInComdats.begin(),
+ DeadFunctionsInComdats.end());
+ }
+
+ if (CG) {
+ // First remove all references, e.g., outgoing via called functions. This is
+ // necessary as we can delete functions that have circular references.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ CallGraphNode *DeadCGN = (*CG)[DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN);
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+ }
+
+ // Then remove the node and function from the module.
+ for (Function *DeadFn : DeadFunctions) {
+ CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn);
+ assert(DeadCGN->getNumReferences() == 0 &&
+ "References should have been handled by now");
+ delete CG->removeFunctionFromModule(DeadCGN);
+ }
+ } else {
+ // This is the code path for the new lazy call graph and for the case where
+ // no call graph was provided.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+
+ if (LCG && !ReplacedFunctions.count(DeadFn)) {
+ // Taken mostly from the inliner:
+ LazyCallGraph::Node &N = LCG->get(*DeadFn);
+ auto *DeadSCC = LCG->lookupSCC(N);
+ assert(DeadSCC && DeadSCC->size() == 1 &&
+ &DeadSCC->begin()->getFunction() == DeadFn);
+ auto &DeadRC = DeadSCC->getOuterRefSCC();
+
+ FunctionAnalysisManager &FAM =
+ AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG)
+ .getManager();
+
+ FAM.clear(*DeadFn, DeadFn->getName());
+ AM->clear(*DeadSCC, DeadSCC->getName());
+ LCG->removeDeadFunction(*DeadFn);
+
+ // Mark the relevant parts of the call graph as invalid so we don't
+ // visit them.
+ UR->InvalidatedSCCs.insert(DeadSCC);
+ UR->InvalidatedRefSCCs.insert(&DeadRC);
+ }
+
+ // The function is now really dead and detached from everything.
+ DeadFn->eraseFromParent();
+ }
+ }
+
+ bool Changed = !DeadFunctions.empty();
+ DeadFunctionsInComdats.clear();
+ DeadFunctions.clear();
+ return Changed;
+}
+
+void CallGraphUpdater::reanalyzeFunction(Function &Fn) {
+ if (CG) {
+ CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn);
+ OldCGN->removeAllCalledFunctions();
+ CG->populateCallGraphNode(OldCGN);
+ } else if (LCG) {
+ LazyCallGraph::Node &N = LCG->get(Fn);
+ LazyCallGraph::SCC *C = LCG->lookupSCC(N);
+ updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM);
+ }
+}
+
void CallGraphUpdater::registerOutlinedFunction(Function &OriginalFn,
Function &NewFn) {
- if (CG)
- CG->addToCallGraph(&NewFn);
- else if (LCG)
+ if (CG)
+ CG->addToCallGraph(&NewFn);
+ else if (LCG)
LCG->addSplitFunction(OriginalFn, NewFn);
-}
-
-void CallGraphUpdater::removeFunction(Function &DeadFn) {
- DeadFn.deleteBody();
- DeadFn.setLinkage(GlobalValue::ExternalLinkage);
- if (DeadFn.hasComdat())
- DeadFunctionsInComdats.push_back(&DeadFn);
- else
- DeadFunctions.push_back(&DeadFn);
-
- // For the old call graph we remove the function from the SCC right away.
- if (CG && !ReplacedFunctions.count(&DeadFn)) {
- CallGraphNode *DeadCGN = (*CG)[&DeadFn];
- DeadCGN->removeAllCalledFunctions();
- CGSCC->DeleteNode(DeadCGN);
- }
-}
-
-void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) {
- OldFn.removeDeadConstantUsers();
- ReplacedFunctions.insert(&OldFn);
- if (CG) {
- // Update the call graph for the newly promoted function.
- CallGraphNode *OldCGN = (*CG)[&OldFn];
- CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn);
- NewCGN->stealCalledFunctionsFrom(OldCGN);
- CG->ReplaceExternalCallEdge(OldCGN, NewCGN);
-
- // And update the SCC we're iterating as well.
- CGSCC->ReplaceNode(OldCGN, NewCGN);
- } else if (LCG) {
- // Directly substitute the functions in the call graph.
- LazyCallGraph::Node &OldLCGN = LCG->get(OldFn);
- SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn);
- }
- removeFunction(OldFn);
-}
-
-bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) {
- // This is only necessary in the (old) CG.
- if (!CG)
- return true;
-
- Function *Caller = OldCS.getCaller();
- CallGraphNode *NewCalleeNode =
- CG->getOrInsertFunction(NewCS.getCalledFunction());
- CallGraphNode *CallerNode = (*CG)[Caller];
- if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) {
- return CR.first && *CR.first == &OldCS;
- }))
- return false;
- CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
- return true;
-}
-
-void CallGraphUpdater::removeCallSite(CallBase &CS) {
- // This is only necessary in the (old) CG.
- if (!CG)
- return;
-
- Function *Caller = CS.getCaller();
- CallGraphNode *CallerNode = (*CG)[Caller];
- CallerNode->removeCallEdgeFor(CS);
-}
+}
+
+void CallGraphUpdater::removeFunction(Function &DeadFn) {
+ DeadFn.deleteBody();
+ DeadFn.setLinkage(GlobalValue::ExternalLinkage);
+ if (DeadFn.hasComdat())
+ DeadFunctionsInComdats.push_back(&DeadFn);
+ else
+ DeadFunctions.push_back(&DeadFn);
+
+ // For the old call graph we remove the function from the SCC right away.
+ if (CG && !ReplacedFunctions.count(&DeadFn)) {
+ CallGraphNode *DeadCGN = (*CG)[&DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CGSCC->DeleteNode(DeadCGN);
+ }
+}
+
+void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) {
+ OldFn.removeDeadConstantUsers();
+ ReplacedFunctions.insert(&OldFn);
+ if (CG) {
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *OldCGN = (*CG)[&OldFn];
+ CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn);
+ NewCGN->stealCalledFunctionsFrom(OldCGN);
+ CG->ReplaceExternalCallEdge(OldCGN, NewCGN);
+
+ // And update the SCC we're iterating as well.
+ CGSCC->ReplaceNode(OldCGN, NewCGN);
+ } else if (LCG) {
+ // Directly substitute the functions in the call graph.
+ LazyCallGraph::Node &OldLCGN = LCG->get(OldFn);
+ SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn);
+ }
+ removeFunction(OldFn);
+}
+
+bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return true;
+
+ Function *Caller = OldCS.getCaller();
+ CallGraphNode *NewCalleeNode =
+ CG->getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) {
+ return CR.first && *CR.first == &OldCS;
+ }))
+ return false;
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ return true;
+}
+
+void CallGraphUpdater::removeCallSite(CallBase &CS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return;
+
+ Function *Caller = CS.getCaller();
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ CallerNode->removeCallEdgeFor(CS);
+}
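For context, a minimal sketch of the intended usage from a new-pass-manager CGSCC pass; the run() signature and the initialize() overload follow the public header, while the pass name and the dead-function test are hypothetical stand-ins for real pass logic:

#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Transforms/Utils/CallGraphUpdater.h"

struct HypotheticalCleanupPass
    : llvm::PassInfoMixin<HypotheticalCleanupPass> {
  llvm::PreservedAnalyses run(llvm::LazyCallGraph::SCC &C,
                              llvm::CGSCCAnalysisManager &AM,
                              llvm::LazyCallGraph &CG,
                              llvm::CGSCCUpdateResult &UR) {
    llvm::CallGraphUpdater CGUpdater;
    CGUpdater.initialize(CG, C, AM, UR); // bind to the lazy call graph
    bool Changed = false;
    for (llvm::LazyCallGraph::Node &N : C) {
      llvm::Function &F = N.getFunction();
      if (F.isDefTriviallyDead()) {      // hypothetical pass-specific check
        CGUpdater.removeFunction(F);     // queued; erased in finalize()
        Changed = true;
      }
    }
    Changed |= CGUpdater.finalize();     // erase queued functions, update CG
    return Changed ? llvm::PreservedAnalyses::none()
                   : llvm::PreservedAnalyses::all();
  }
};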
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
index 9478516f98..bf08bf2747 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -1,595 +1,595 @@
-//===- CallPromotionUtils.cpp - Utilities for call promotion ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements utilities useful for promoting indirect call sites to
-// direct call sites.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CallPromotionUtils.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "call-promotion-utils"
-
-/// Fix-up phi nodes in an invoke instruction's normal destination.
-///
-/// After versioning an invoke instruction, values coming from the original
-/// block will now be coming from the "merge" block. For example, in the code
-/// below:
-///
-/// then_bb:
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// merge_bb:
-/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ]
-/// br %normal_dst
-///
-/// normal_dst:
-/// %t3 = phi i32 [ %x, %orig_bb ], ...
-///
-/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in
-/// "normal_dst" must be fixed to refer to "merge_bb":
-///
-/// normal_dst:
-/// %t3 = phi i32 [ %x, %merge_bb ], ...
-///
-static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
- BasicBlock *MergeBlock) {
- for (PHINode &Phi : Invoke->getNormalDest()->phis()) {
- int Idx = Phi.getBasicBlockIndex(OrigBlock);
- if (Idx == -1)
- continue;
- Phi.setIncomingBlock(Idx, MergeBlock);
- }
-}
-
-/// Fix-up phi nodes in an invoke instruction's unwind destination.
-///
-/// After versioning an invoke instruction, values coming from the original
-/// block will now be coming from either the "then" block or the "else" block.
-/// For example, in the code below:
-///
-/// then_bb:
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// unwind_dst:
-/// %t3 = phi i32 [ %x, %orig_bb ], ...
-///
-/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in
-/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb":
-///
-/// unwind_dst:
-/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ...
-///
-static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
- BasicBlock *ThenBlock,
- BasicBlock *ElseBlock) {
- for (PHINode &Phi : Invoke->getUnwindDest()->phis()) {
- int Idx = Phi.getBasicBlockIndex(OrigBlock);
- if (Idx == -1)
- continue;
- auto *V = Phi.getIncomingValue(Idx);
- Phi.setIncomingBlock(Idx, ThenBlock);
- Phi.addIncoming(V, ElseBlock);
- }
-}
-
-/// Create a phi node for the returned value of a call or invoke instruction.
-///
-/// After versioning a call or invoke instruction that returns a value, we have
-/// to merge the value of the original and new instructions. We do this by
-/// creating a phi node and replacing uses of the original instruction with this
-/// phi node.
-///
-/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is
-/// defined in "then_bb", we create the following phi node:
-///
-/// ; Uses of the original instruction are replaced by uses of the phi node.
-/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ],
-///
-static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
- BasicBlock *MergeBlock, IRBuilder<> &Builder) {
-
- if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())
- return;
-
- Builder.SetInsertPoint(&MergeBlock->front());
- PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);
+//===- CallPromotionUtils.cpp - Utilities for call promotion ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities useful for promoting indirect call sites to
+// direct call sites.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "call-promotion-utils"
+
+/// Fix-up phi nodes in an invoke instruction's normal destination.
+///
+/// After versioning an invoke instruction, values coming from the original
+/// block will now be coming from the "merge" block. For example, in the code
+/// below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// %t2 = phi i32 [ %t0, %then_bb ], [ %t1, %else_bb ]
+/// br %normal_dst
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "normal_dst", so the phi nodes in
+/// "normal_dst" must be fixed to refer to "merge_bb":
+///
+/// normal_dst:
+/// %t3 = phi i32 [ %x, %merge_bb ], ...
+///
+static void fixupPHINodeForNormalDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
+ BasicBlock *MergeBlock) {
+ for (PHINode &Phi : Invoke->getNormalDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
+ if (Idx == -1)
+ continue;
+ Phi.setIncomingBlock(Idx, MergeBlock);
+ }
+}
+
+/// Fix-up phi nodes in an invoke instruction's unwind destination.
+///
+/// After versioning an invoke instruction, values coming from the original
+/// block will now be coming from either the "then" block or the "else" block.
+/// For example, in the code below:
+///
+/// then_bb:
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %orig_bb ], ...
+///
+/// "orig_bb" is no longer a predecessor of "unwind_dst", so the phi nodes in
+/// "unwind_dst" must be fixed to refer to "then_bb" and "else_bb":
+///
+/// unwind_dst:
+/// %t3 = phi i32 [ %x, %then_bb ], [ %x, %else_bb ], ...
+///
+static void fixupPHINodeForUnwindDest(InvokeInst *Invoke, BasicBlock *OrigBlock,
+ BasicBlock *ThenBlock,
+ BasicBlock *ElseBlock) {
+ for (PHINode &Phi : Invoke->getUnwindDest()->phis()) {
+ int Idx = Phi.getBasicBlockIndex(OrigBlock);
+ if (Idx == -1)
+ continue;
+ auto *V = Phi.getIncomingValue(Idx);
+ Phi.setIncomingBlock(Idx, ThenBlock);
+ Phi.addIncoming(V, ElseBlock);
+ }
+}
+
+/// Create a phi node for the returned value of a call or invoke instruction.
+///
+/// After versioning a call or invoke instruction that returns a value, we have
+/// to merge the value of the original and new instructions. We do this by
+/// creating a phi node and replacing uses of the original instruction with this
+/// phi node.
+///
+/// For example, if \p OrigInst is defined in "else_bb" and \p NewInst is
+/// defined in "then_bb", we create the following phi node:
+///
+/// ; Uses of the original instruction are replaced by uses of the phi node.
+/// %t0 = phi i32 [ %orig_inst, %else_bb ], [ %new_inst, %then_bb ],
+///
+static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
+ BasicBlock *MergeBlock, IRBuilder<> &Builder) {
+
+ if (OrigInst->getType()->isVoidTy() || OrigInst->use_empty())
+ return;
+
+ Builder.SetInsertPoint(&MergeBlock->front());
+ PHINode *Phi = Builder.CreatePHI(OrigInst->getType(), 0);
SmallVector<User *, 16> UsersToUpdate(OrigInst->users());
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(OrigInst, Phi);
- Phi->addIncoming(OrigInst, OrigInst->getParent());
- Phi->addIncoming(NewInst, NewInst->getParent());
-}
-
-/// Cast a call or invoke instruction to the given type.
-///
-/// When promoting a call site, the return type of the call site might not match
-/// that of the callee. If this is the case, we have to cast the returned value
-/// to the correct type. The location of the cast depends on if we have a call
-/// or invoke instruction.
-///
-/// For example, if the call instruction below requires a bitcast after
-/// promotion:
-///
-/// orig_bb:
-/// %t0 = call i32 @func()
-/// ...
-///
-/// The bitcast is placed after the call instruction:
-///
-/// orig_bb:
-/// ; Uses of the original return value are replaced by uses of the bitcast.
-/// %t0 = call i32 @func()
-/// %t1 = bitcast i32 %t0 to ...
-/// ...
-///
-/// A similar transformation is performed for invoke instructions. However,
-/// since invokes are terminating, a new block is created for the bitcast. For
-/// example, if the invoke instruction below requires a bitcast after promotion:
-///
-/// orig_bb:
-/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
-///
-/// The edge between the original block and the invoke's normal destination is
-/// split, and the bitcast is placed there:
-///
-/// orig_bb:
-/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
-///
-/// split_bb:
-/// ; Uses of the original return value are replaced by uses of the bitcast.
-/// %t1 = bitcast i32 %t0 to ...
-/// br label %normal_dst
-///
-static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
-
- // Save the users of the calling instruction. These uses will be changed to
- // use the bitcast after we create it.
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(OrigInst, Phi);
+ Phi->addIncoming(OrigInst, OrigInst->getParent());
+ Phi->addIncoming(NewInst, NewInst->getParent());
+}
+
+/// Cast a call or invoke instruction to the given type.
+///
+/// When promoting a call site, the return type of the call site might not match
+/// that of the callee. If this is the case, we have to cast the returned value
+/// to the correct type. The location of the cast depends on if we have a call
+/// or invoke instruction.
+///
+/// For example, if the call instruction below requires a bitcast after
+/// promotion:
+///
+/// orig_bb:
+/// %t0 = call i32 @func()
+/// ...
+///
+/// The bitcast is placed after the call instruction:
+///
+/// orig_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t0 = call i32 @func()
+/// %t1 = bitcast i32 %t0 to ...
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, a new block is created for the bitcast. For
+/// example, if the invoke instruction below requires a bitcast after promotion:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %normal_dst unwind label %unwind_dst
+///
+/// The edge between the original block and the invoke's normal destination is
+/// split, and the bitcast is placed there:
+///
+/// orig_bb:
+/// %t0 = invoke i32 @func() to label %split_bb unwind label %unwind_dst
+///
+/// split_bb:
+/// ; Uses of the original return value are replaced by uses of the bitcast.
+/// %t1 = bitcast i32 %t0 to ...
+/// br label %normal_dst
+///
+static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
+
+ // Save the users of the calling instruction. These uses will be changed to
+ // use the bitcast after we create it.
SmallVector<User *, 16> UsersToUpdate(CB.users());
-
- // Determine an appropriate location to create the bitcast for the return
- // value. The location depends on if we have a call or invoke instruction.
- Instruction *InsertBefore = nullptr;
- if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
- InsertBefore =
- &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
- else
- InsertBefore = &*std::next(CB.getIterator());
-
- // Bitcast the return value to the correct type.
- auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
- if (RetBitCast)
- *RetBitCast = Cast;
-
- // Replace all the original uses of the calling instruction with the bitcast.
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(&CB, Cast);
-}
-
-/// Predicate and clone the given call site.
-///
-/// This function creates an if-then-else structure at the location of the call
-/// site. The "if" condition compares the call site's called value to the given
-/// callee. The original call site is moved into the "else" block, and a clone
-/// of the call site is placed in the "then" block. The cloned instruction is
-/// returned.
-///
-/// For example, the call instruction below:
-///
-/// orig_bb:
-/// %t0 = call i32 %ptr()
-/// ...
-///
-/// Is replaced by the following:
-///
-/// orig_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %else_bb
-///
-/// then_bb:
-/// ; The clone of the original call instruction is placed in the "then"
-/// ; block. It is not yet promoted.
-/// %t1 = call i32 %ptr()
-/// br merge_bb
-///
-/// else_bb:
-/// ; The original call instruction is moved to the "else" block.
-/// %t0 = call i32 %ptr()
-/// br merge_bb
-///
-/// merge_bb:
-/// ; Uses of the original call instruction are replaced by uses of the phi
-/// ; node.
-/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
-/// ...
-///
-/// A similar transformation is performed for invoke instructions. However,
-/// since invokes are terminating, more work is required. For example, the
-/// invoke instruction below:
-///
-/// orig_bb:
-/// %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst
-///
-/// Is replaced by the following:
-///
-/// orig_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %else_bb
-///
-/// then_bb:
-/// ; The clone of the original invoke instruction is placed in the "then"
-/// ; block, and its normal destination is set to the "merge" block. It is
-/// ; not yet promoted.
-/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// else_bb:
-/// ; The original invoke instruction is moved into the "else" block, and
-/// ; its normal destination is set to the "merge" block.
-/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
-///
-/// merge_bb:
-/// ; Uses of the original invoke instruction are replaced by uses of the
-/// ; phi node, and the merge block branches to the normal destination.
-/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
-/// br %normal_dst
-///
-/// An indirect musttail call is processed slightly differently in that:
-/// 1. No merge block is needed for the original and the cloned call site, since
-/// either one ends the flow. No phi node is needed either.
-/// 2. The return statement following the original call site is duplicated too
-/// and placed immediately after the cloned call site per the IR convention.
-///
-/// For example, the musttail call instruction below:
-///
-/// orig_bb:
-/// %t0 = musttail call i32 %ptr()
-/// ...
-///
-/// Is replaced by the following:
-///
-/// cond_bb:
-/// %cond = icmp eq i32 ()* %ptr, @func
-/// br i1 %cond, %then_bb, %orig_bb
-///
-/// then_bb:
-/// ; The clone of the original call instruction is placed in the "then"
-/// ; block. It is not yet promoted.
-/// %t1 = musttail call i32 %ptr()
-/// ret %t1
-///
-/// orig_bb:
-/// ; The original call instruction stays in its original block.
-/// %t0 = musttail call i32 %ptr()
-/// ret %t0
-static CallBase &versionCallSite(CallBase &CB, Value *Callee,
- MDNode *BranchWeights) {
-
- IRBuilder<> Builder(&CB);
- CallBase *OrigInst = &CB;
- BasicBlock *OrigBlock = OrigInst->getParent();
-
- // Create the compare. The called value and callee must have the same type to
- // be compared.
- if (CB.getCalledOperand()->getType() != Callee->getType())
- Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
- auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
-
- if (OrigInst->isMustTailCall()) {
- // Create an if-then structure. The original instruction stays in its block,
- // and a clone of the original instruction is placed in the "then" block.
- Instruction *ThenTerm =
- SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- ThenBlock->setName("if.true.direct_targ");
- CallBase *NewInst = cast<CallBase>(OrigInst->clone());
- NewInst->insertBefore(ThenTerm);
-
- // Place a clone of the optional bitcast after the new call site.
- Value *NewRetVal = NewInst;
- auto Next = OrigInst->getNextNode();
- if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
- assert(BitCast->getOperand(0) == OrigInst &&
- "bitcast following musttail call must use the call");
- auto NewBitCast = BitCast->clone();
- NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
- NewBitCast->insertBefore(ThenTerm);
- NewRetVal = NewBitCast;
- Next = BitCast->getNextNode();
- }
-
- // Place a clone of the return instruction after the new call site.
- ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
- assert(Ret && "musttail call must precede a ret with an optional bitcast");
- auto NewRet = Ret->clone();
- if (Ret->getReturnValue())
- NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
- NewRet->insertBefore(ThenTerm);
-
- // A return instruction is terminating, so we don't need the terminator
- // instruction just created.
- ThenTerm->eraseFromParent();
-
- return *NewInst;
- }
-
- // Create an if-then-else structure. The original instruction is moved into
- // the "else" block, and a clone of the original instruction is placed in the
- // "then" block.
- Instruction *ThenTerm = nullptr;
- Instruction *ElseTerm = nullptr;
- SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights);
- BasicBlock *ThenBlock = ThenTerm->getParent();
- BasicBlock *ElseBlock = ElseTerm->getParent();
- BasicBlock *MergeBlock = OrigInst->getParent();
-
- ThenBlock->setName("if.true.direct_targ");
- ElseBlock->setName("if.false.orig_indirect");
- MergeBlock->setName("if.end.icp");
-
- CallBase *NewInst = cast<CallBase>(OrigInst->clone());
- OrigInst->moveBefore(ElseTerm);
- NewInst->insertBefore(ThenTerm);
-
- // If the original call site is an invoke instruction, we have extra work to
- // do since invoke instructions are terminating. We have to fix-up phi nodes
- // in the invoke's normal and unwind destinations.
- if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {
- auto *NewInvoke = cast<InvokeInst>(NewInst);
-
- // Invoke instructions are terminating, so we don't need the terminator
- // instructions that were just created.
- ThenTerm->eraseFromParent();
- ElseTerm->eraseFromParent();
-
- // Branch from the "merge" block to the original normal destination.
- Builder.SetInsertPoint(MergeBlock);
- Builder.CreateBr(OrigInvoke->getNormalDest());
-
- // Fix-up phi nodes in the original invoke's normal and unwind destinations.
- fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock);
- fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
-
- // Now set the normal destinations of the invoke instructions to be the
- // "merge" block.
- OrigInvoke->setNormalDest(MergeBlock);
- NewInvoke->setNormalDest(MergeBlock);
- }
-
- // Create a phi node for the returned value of the call site.
- createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
-
- return *NewInst;
-}
-
-bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
- const char **FailureReason) {
- assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
-
- auto &DL = Callee->getParent()->getDataLayout();
-
- // Check the return type. The callee's return value type must be bitcast
- // compatible with the call site's type.
- Type *CallRetTy = CB.getType();
- Type *FuncRetTy = Callee->getReturnType();
- if (CallRetTy != FuncRetTy)
- if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
- if (FailureReason)
- *FailureReason = "Return type mismatch";
- return false;
- }
-
- // The number of formal arguments of the callee.
- unsigned NumParams = Callee->getFunctionType()->getNumParams();
-
- // The number of actual arguments in the call.
- unsigned NumArgs = CB.arg_size();
-
- // Check the number of arguments. The callee and call site must agree on the
- // number of arguments.
- if (NumArgs != NumParams && !Callee->isVarArg()) {
- if (FailureReason)
- *FailureReason = "The number of arguments mismatch";
- return false;
- }
-
- // Check the argument types. The callee's formal argument types must be
- // bitcast compatible with the corresponding actual argument types of the call
- // site.
- unsigned I = 0;
- for (; I < NumParams; ++I) {
- Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I);
- Type *ActualTy = CB.getArgOperand(I)->getType();
- if (FormalTy == ActualTy)
- continue;
- if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
- if (FailureReason)
- *FailureReason = "Argument type mismatch";
- return false;
- }
- }
- for (; I < NumArgs; I++) {
+
+  // Determine an appropriate location to create the bitcast for the return
+  // value. The location depends on whether we have a call or an invoke
+  // instruction.
+ Instruction *InsertBefore = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
+ InsertBefore =
+ &SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
+ else
+ InsertBefore = &*std::next(CB.getIterator());
+
+ // Bitcast the return value to the correct type.
+ auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
+ if (RetBitCast)
+ *RetBitCast = Cast;
+
+ // Replace all the original uses of the calling instruction with the bitcast.
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(&CB, Cast);
+}
+
+/// Predicate and clone the given call site.
+///
+/// This function creates an if-then-else structure at the location of the call
+/// site. The "if" condition compares the call site's called value to the given
+/// callee. The original call site is moved into the "else" block, and a clone
+/// of the call site is placed in the "then" block. The cloned instruction is
+/// returned.
+///
+/// For example, the call instruction below:
+///
+/// orig_bb:
+/// %t0 = call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = call i32 %ptr()
+/// br merge_bb
+///
+/// else_bb:
+/// ; The original call instruction is moved to the "else" block.
+/// %t0 = call i32 %ptr()
+/// br merge_bb
+///
+/// merge_bb:
+/// ; Uses of the original call instruction are replaced by uses of the phi
+/// ; node.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// ...
+///
+/// A similar transformation is performed for invoke instructions. However,
+/// since invokes are terminating, more work is required. For example, the
+/// invoke instruction below:
+///
+/// orig_bb:
+/// %t0 = invoke %ptr() to label %normal_dst unwind label %unwind_dst
+///
+/// Is replaced by the following:
+///
+/// orig_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %else_bb
+///
+/// then_bb:
+/// ; The clone of the original invoke instruction is placed in the "then"
+/// ; block, and its normal destination is set to the "merge" block. It is
+/// ; not yet promoted.
+/// %t1 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// else_bb:
+/// ; The original invoke instruction is moved into the "else" block, and
+/// ; its normal destination is set to the "merge" block.
+/// %t0 = invoke i32 %ptr() to label %merge_bb unwind label %unwind_dst
+///
+/// merge_bb:
+/// ; Uses of the original invoke instruction are replaced by uses of the
+/// ; phi node, and the merge block branches to the normal destination.
+/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
+/// br %normal_dst
+///
+/// An indirect musttail call is processed slightly differently in that:
+/// 1. No merge block is needed for the original and the cloned call sites,
+///    since either one ends the flow. No phi node is needed either.
+/// 2. The return statement following the original call site is duplicated too
+/// and placed immediately after the cloned call site per the IR convention.
+///
+/// For example, the musttail call instruction below:
+///
+/// orig_bb:
+/// %t0 = musttail call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// cond_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %orig_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = musttail call i32 %ptr()
+/// ret %t1
+///
+/// orig_bb:
+/// ; The original call instruction stays in its original block.
+/// %t0 = musttail call i32 %ptr()
+/// ret %t0
+static CallBase &versionCallSite(CallBase &CB, Value *Callee,
+ MDNode *BranchWeights) {
+
+ IRBuilder<> Builder(&CB);
+ CallBase *OrigInst = &CB;
+ BasicBlock *OrigBlock = OrigInst->getParent();
+
+ // Create the compare. The called value and callee must have the same type to
+ // be compared.
+ if (CB.getCalledOperand()->getType() != Callee->getType())
+ Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
+ auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
+
+ if (OrigInst->isMustTailCall()) {
+ // Create an if-then structure. The original instruction stays in its block,
+ // and a clone of the original instruction is placed in the "then" block.
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ ThenBlock->setName("if.true.direct_targ");
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+ NewInst->insertBefore(ThenTerm);
+
+ // Place a clone of the optional bitcast after the new call site.
+ Value *NewRetVal = NewInst;
+ auto Next = OrigInst->getNextNode();
+ if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
+ assert(BitCast->getOperand(0) == OrigInst &&
+ "bitcast following musttail call must use the call");
+ auto NewBitCast = BitCast->clone();
+ NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
+ NewBitCast->insertBefore(ThenTerm);
+ NewRetVal = NewBitCast;
+ Next = BitCast->getNextNode();
+ }
+
+ // Place a clone of the return instruction after the new call site.
+ ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
+ assert(Ret && "musttail call must precede a ret with an optional bitcast");
+ auto NewRet = Ret->clone();
+ if (Ret->getReturnValue())
+ NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
+ NewRet->insertBefore(ThenTerm);
+
+    // A return instruction is terminating, so we don't need the terminator
+ // instruction just created.
+ ThenTerm->eraseFromParent();
+
+ return *NewInst;
+ }
+
+ // Create an if-then-else structure. The original instruction is moved into
+ // the "else" block, and a clone of the original instruction is placed in the
+ // "then" block.
+ Instruction *ThenTerm = nullptr;
+ Instruction *ElseTerm = nullptr;
+ SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ BasicBlock *ElseBlock = ElseTerm->getParent();
+ BasicBlock *MergeBlock = OrigInst->getParent();
+
+ ThenBlock->setName("if.true.direct_targ");
+ ElseBlock->setName("if.false.orig_indirect");
+ MergeBlock->setName("if.end.icp");
+
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+ OrigInst->moveBefore(ElseTerm);
+ NewInst->insertBefore(ThenTerm);
+
+ // If the original call site is an invoke instruction, we have extra work to
+ // do since invoke instructions are terminating. We have to fix-up phi nodes
+ // in the invoke's normal and unwind destinations.
+ if (auto *OrigInvoke = dyn_cast<InvokeInst>(OrigInst)) {
+ auto *NewInvoke = cast<InvokeInst>(NewInst);
+
+ // Invoke instructions are terminating, so we don't need the terminator
+ // instructions that were just created.
+ ThenTerm->eraseFromParent();
+ ElseTerm->eraseFromParent();
+
+ // Branch from the "merge" block to the original normal destination.
+ Builder.SetInsertPoint(MergeBlock);
+ Builder.CreateBr(OrigInvoke->getNormalDest());
+
+ // Fix-up phi nodes in the original invoke's normal and unwind destinations.
+ fixupPHINodeForNormalDest(OrigInvoke, OrigBlock, MergeBlock);
+ fixupPHINodeForUnwindDest(OrigInvoke, MergeBlock, ThenBlock, ElseBlock);
+
+ // Now set the normal destinations of the invoke instructions to be the
+ // "merge" block.
+ OrigInvoke->setNormalDest(MergeBlock);
+ NewInvoke->setNormalDest(MergeBlock);
+ }
+
+ // Create a phi node for the returned value of the call site.
+ createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
+
+ return *NewInst;
+}
+
+bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
+ const char **FailureReason) {
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
+
+ auto &DL = Callee->getParent()->getDataLayout();
+
+ // Check the return type. The callee's return value type must be bitcast
+ // compatible with the call site's type.
+ Type *CallRetTy = CB.getType();
+ Type *FuncRetTy = Callee->getReturnType();
+ if (CallRetTy != FuncRetTy)
+ if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
+ if (FailureReason)
+ *FailureReason = "Return type mismatch";
+ return false;
+ }
+
+ // The number of formal arguments of the callee.
+ unsigned NumParams = Callee->getFunctionType()->getNumParams();
+
+ // The number of actual arguments in the call.
+ unsigned NumArgs = CB.arg_size();
+
+ // Check the number of arguments. The callee and call site must agree on the
+ // number of arguments.
+ if (NumArgs != NumParams && !Callee->isVarArg()) {
+ if (FailureReason)
+ *FailureReason = "The number of arguments mismatch";
+ return false;
+ }
+
+ // Check the argument types. The callee's formal argument types must be
+ // bitcast compatible with the corresponding actual argument types of the call
+ // site.
+ unsigned I = 0;
+ for (; I < NumParams; ++I) {
+ Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I);
+ Type *ActualTy = CB.getArgOperand(I)->getType();
+ if (FormalTy == ActualTy)
+ continue;
+ if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
+ if (FailureReason)
+ *FailureReason = "Argument type mismatch";
+ return false;
+ }
+ }
+ for (; I < NumArgs; I++) {
// Vararg functions can have more arguments than parameters.
- assert(Callee->isVarArg());
- if (CB.paramHasAttr(I, Attribute::StructRet)) {
+ assert(Callee->isVarArg());
+ if (CB.paramHasAttr(I, Attribute::StructRet)) {
if (FailureReason)
*FailureReason = "SRet arg to vararg function";
- return false;
- }
- }
-
- return true;
-}
-
-CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
- CastInst **RetBitCast) {
- assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
-
- // Set the called function of the call site to be the given callee (but don't
- // change the type).
- CB.setCalledOperand(Callee);
-
- // Since the call site will no longer be direct, we must clear metadata that
- // is only appropriate for indirect calls. This includes !prof and !callees
- // metadata.
- CB.setMetadata(LLVMContext::MD_prof, nullptr);
- CB.setMetadata(LLVMContext::MD_callees, nullptr);
-
- // If the function type of the call site matches that of the callee, no
- // additional work is required.
- if (CB.getFunctionType() == Callee->getFunctionType())
- return CB;
-
- // Save the return types of the call site and callee.
- Type *CallSiteRetTy = CB.getType();
- Type *CalleeRetTy = Callee->getReturnType();
-
- // Change the function type of the call site the match that of the callee.
- CB.mutateFunctionType(Callee->getFunctionType());
-
- // Inspect the arguments of the call site. If an argument's type doesn't
- // match the corresponding formal argument's type in the callee, bitcast it
- // to the correct type.
- auto CalleeType = Callee->getFunctionType();
- auto CalleeParamNum = CalleeType->getNumParams();
-
- LLVMContext &Ctx = Callee->getContext();
- const AttributeList &CallerPAL = CB.getAttributes();
- // The new list of argument attributes.
- SmallVector<AttributeSet, 4> NewArgAttrs;
- bool AttributeChanged = false;
-
- for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
- auto *Arg = CB.getArgOperand(ArgNo);
- Type *FormalTy = CalleeType->getParamType(ArgNo);
- Type *ActualTy = Arg->getType();
- if (FormalTy != ActualTy) {
- auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
- CB.setArgOperand(ArgNo, Cast);
-
- // Remove any incompatible attributes for the argument.
- AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
- ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
-
- // If byval is used, this must be a pointer type, and the byval type must
- // match the element type. Update it if present.
- if (ArgAttrs.getByValType()) {
- Type *NewTy = Callee->getParamByValType(ArgNo);
- ArgAttrs.addByValAttr(
- NewTy ? NewTy : cast<PointerType>(FormalTy)->getElementType());
- }
-
- NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
- AttributeChanged = true;
- } else
- NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
- }
-
- // If the return type of the call site doesn't match that of the callee, cast
- // the returned value to the appropriate type.
- // Remove any incompatible return value attribute.
- AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
- if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
- createRetBitCast(CB, CallSiteRetTy, RetBitCast);
- RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
- AttributeChanged = true;
- }
-
- // Set the new callsite attribute.
- if (AttributeChanged)
- CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
- AttributeSet::get(Ctx, RAttrs),
- NewArgAttrs));
-
- return CB;
-}
-
-CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
- MDNode *BranchWeights) {
-
- // Version the indirect call site. If the called value is equal to the given
- // callee, 'NewInst' will be executed, otherwise the original call site will
- // be executed.
- CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights);
-
- // Promote 'NewInst' so that it directly calls the desired function.
- return promoteCall(NewInst, Callee);
-}
-
-bool llvm::tryPromoteCall(CallBase &CB) {
- assert(!CB.getCalledFunction());
- Module *M = CB.getCaller()->getParent();
- const DataLayout &DL = M->getDataLayout();
- Value *Callee = CB.getCalledOperand();
-
- LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee);
- if (!VTableEntryLoad)
- return false; // Not a vtable entry load.
- Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
- APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
- Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
- DL, VTableOffset, /* AllowNonInbounds */ true);
- LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
- if (!VTablePtrLoad)
- return false; // Not a vtable load.
- Value *Object = VTablePtrLoad->getPointerOperand();
- APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
- Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
- DL, ObjectOffset, /* AllowNonInbounds */ true);
- if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
- // Not an Alloca or the offset isn't zero.
- return false;
-
- // Look for the vtable pointer store into the object by the ctor.
- BasicBlock::iterator BBI(VTablePtrLoad);
- Value *VTablePtr = FindAvailableLoadedValue(
- VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
- if (!VTablePtr)
- return false; // No vtable found.
- APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
- Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
- DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
- GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
- if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer()))
- // Not in the form of a global constant variable with an initializer.
- return false;
-
- Constant *VTableGVInitializer = GV->getInitializer();
- APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset;
- if (!(VTableGVOffset.getActiveBits() <= 64))
- return false; // Out of range.
- Constant *Ptr = getPointerAtOffset(VTableGVInitializer,
- VTableGVOffset.getZExtValue(),
- *M);
- if (!Ptr)
- return false; // No constant (function) pointer found.
- Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts());
- if (!DirectCallee)
- return false; // No function pointer found.
-
- if (!isLegalToPromote(CB, DirectCallee))
- return false;
-
- // Success.
- promoteCall(CB, DirectCallee);
- return true;
-}
-
-#undef DEBUG_TYPE
+ return false;
+ }
+ }
+
+ return true;
+}
+
+CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
+ CastInst **RetBitCast) {
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
+
+ // Set the called function of the call site to be the given callee (but don't
+ // change the type).
+ CB.setCalledOperand(Callee);
+
+  // Since the call site will no longer be indirect, we must clear metadata
+  // that is only appropriate for indirect calls. This includes !prof and
+  // !callees metadata.
+ CB.setMetadata(LLVMContext::MD_prof, nullptr);
+ CB.setMetadata(LLVMContext::MD_callees, nullptr);
+
+ // If the function type of the call site matches that of the callee, no
+ // additional work is required.
+ if (CB.getFunctionType() == Callee->getFunctionType())
+ return CB;
+
+ // Save the return types of the call site and callee.
+ Type *CallSiteRetTy = CB.getType();
+ Type *CalleeRetTy = Callee->getReturnType();
+
+  // Change the function type of the call site to match that of the callee.
+ CB.mutateFunctionType(Callee->getFunctionType());
+
+ // Inspect the arguments of the call site. If an argument's type doesn't
+ // match the corresponding formal argument's type in the callee, bitcast it
+ // to the correct type.
+ auto CalleeType = Callee->getFunctionType();
+ auto CalleeParamNum = CalleeType->getNumParams();
+
+ LLVMContext &Ctx = Callee->getContext();
+ const AttributeList &CallerPAL = CB.getAttributes();
+ // The new list of argument attributes.
+ SmallVector<AttributeSet, 4> NewArgAttrs;
+ bool AttributeChanged = false;
+
+ for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
+ auto *Arg = CB.getArgOperand(ArgNo);
+ Type *FormalTy = CalleeType->getParamType(ArgNo);
+ Type *ActualTy = Arg->getType();
+ if (FormalTy != ActualTy) {
+ auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
+ CB.setArgOperand(ArgNo, Cast);
+
+ // Remove any incompatible attributes for the argument.
+ AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
+ ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy));
+
+ // If byval is used, this must be a pointer type, and the byval type must
+ // match the element type. Update it if present.
+ if (ArgAttrs.getByValType()) {
+ Type *NewTy = Callee->getParamByValType(ArgNo);
+ ArgAttrs.addByValAttr(
+ NewTy ? NewTy : cast<PointerType>(FormalTy)->getElementType());
+ }
+
+ NewArgAttrs.push_back(AttributeSet::get(Ctx, ArgAttrs));
+ AttributeChanged = true;
+ } else
+ NewArgAttrs.push_back(CallerPAL.getParamAttributes(ArgNo));
+ }
+
+ // If the return type of the call site doesn't match that of the callee, cast
+ // the returned value to the appropriate type.
+ // Remove any incompatible return value attribute.
+ AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
+ if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
+ createRetBitCast(CB, CallSiteRetTy, RetBitCast);
+ RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
+ AttributeChanged = true;
+ }
+
+ // Set the new callsite attribute.
+ if (AttributeChanged)
+ CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
+ AttributeSet::get(Ctx, RAttrs),
+ NewArgAttrs));
+
+ return CB;
+}
+
+CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
+ MDNode *BranchWeights) {
+
+ // Version the indirect call site. If the called value is equal to the given
+ // callee, 'NewInst' will be executed, otherwise the original call site will
+ // be executed.
+ CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights);
+
+ // Promote 'NewInst' so that it directly calls the desired function.
+ return promoteCall(NewInst, Callee);
+}
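// A minimal usage sketch, not part of this patch: a caller that already has a
// likely target would typically pair isLegalToPromote() with
// promoteCallWithIfThenElse() as below. The helper name 'promoteIfLegal' and
// the 1000:1 branch weights are assumptions for the example.
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"

static bool promoteIfLegal(llvm::CallBase &CB, llvm::Function *LikelyCallee) {
  const char *Reason = nullptr;
  if (!llvm::isLegalToPromote(CB, LikelyCallee, &Reason))
    return false; // Reason names the problem, e.g. "Argument type mismatch".
  llvm::MDBuilder MDB(CB.getContext());
  llvm::MDNode *Weights =
      MDB.createBranchWeights(/*TrueWeight=*/1000, /*FalseWeight=*/1);
  // Version the call site and promote the hot arm to a direct call.
  llvm::promoteCallWithIfThenElse(CB, LikelyCallee, Weights);
  return true;
}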
+
+bool llvm::tryPromoteCall(CallBase &CB) {
+ assert(!CB.getCalledFunction());
+ Module *M = CB.getCaller()->getParent();
+ const DataLayout &DL = M->getDataLayout();
+ Value *Callee = CB.getCalledOperand();
+
+ LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee);
+ if (!VTableEntryLoad)
+ return false; // Not a vtable entry load.
+ Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
+ APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
+ Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffset, /* AllowNonInbounds */ true);
+ LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
+ if (!VTablePtrLoad)
+ return false; // Not a vtable load.
+ Value *Object = VTablePtrLoad->getPointerOperand();
+ APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
+ Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
+ DL, ObjectOffset, /* AllowNonInbounds */ true);
+ if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
+ // Not an Alloca or the offset isn't zero.
+ return false;
+
+ // Look for the vtable pointer store into the object by the ctor.
+ BasicBlock::iterator BBI(VTablePtrLoad);
+ Value *VTablePtr = FindAvailableLoadedValue(
+ VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
+ if (!VTablePtr)
+ return false; // No vtable found.
+ APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
+ Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
+ if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer()))
+ // Not in the form of a global constant variable with an initializer.
+ return false;
+
+ Constant *VTableGVInitializer = GV->getInitializer();
+ APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset;
+ if (!(VTableGVOffset.getActiveBits() <= 64))
+ return false; // Out of range.
+ Constant *Ptr = getPointerAtOffset(VTableGVInitializer,
+ VTableGVOffset.getZExtValue(),
+ *M);
+ if (!Ptr)
+ return false; // No constant (function) pointer found.
+ Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!DirectCallee)
+ return false; // No function pointer found.
+
+ if (!isLegalToPromote(CB, DirectCallee))
+ return false;
+
+ // Success.
+ promoteCall(CB, DirectCallee);
+ return true;
+}
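// A minimal usage sketch, not part of this patch ('devirtualizeLocalCalls' is
// an assumed helper name): offer every indirect call site in a function to
// tryPromoteCall(), which pattern-matches the vtable loads handled above.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"

static bool devirtualizeLocalCalls(llvm::Function &F) {
  bool Changed = false;
  for (llvm::Instruction &I : llvm::instructions(F))
    if (auto *CB = llvm::dyn_cast<llvm::CallBase>(&I))
      if (!CB->getCalledFunction()) // Only indirect call sites qualify.
        Changed |= llvm::tryPromoteCall(*CB);
  return Changed;
}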
+
+#undef DEBUG_TYPE
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
index 295fc67108..6b01c0c71d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeAliases.cpp
@@ -1,105 +1,105 @@
-//===- CanonicalizeAliases.cpp - ThinLTO Support: Canonicalize Aliases ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Currently this file implements partial alias canonicalization, to
-// flatten chains of aliases (also done by GlobalOpt, but not on for
-// O0 compiles). E.g.
-// @a = alias i8, i8 *@b
-// @b = alias i8, i8 *@g
-//
-// will be converted to:
-// @a = alias i8, i8 *@g <-- @a is now an alias to base object @g
-// @b = alias i8, i8 *@g
-//
-// Eventually this file will implement full alias canonicalation, so that
-// all aliasees are private anonymous values. E.g.
-// @a = alias i8, i8 *@g
-// @g = global i8 0
-//
-// will be converted to:
-// @0 = private global
-// @a = alias i8, i8* @0
-// @g = alias i8, i8* @0
-//
-// This simplifies optimization and ThinLTO linking of the original symbols.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-
-using namespace llvm;
-
-namespace {
-
-static Constant *canonicalizeAlias(Constant *C, bool &Changed) {
- if (auto *GA = dyn_cast<GlobalAlias>(C)) {
- auto *NewAliasee = canonicalizeAlias(GA->getAliasee(), Changed);
- if (NewAliasee != GA->getAliasee()) {
- GA->setAliasee(NewAliasee);
- Changed = true;
- }
- return NewAliasee;
- }
-
- auto *CE = dyn_cast<ConstantExpr>(C);
- if (!CE)
- return C;
-
- std::vector<Constant *> Ops;
- for (Use &U : CE->operands())
- Ops.push_back(canonicalizeAlias(cast<Constant>(U), Changed));
- return CE->getWithOperands(Ops);
-}
-
-/// Convert aliases to canonical form.
-static bool canonicalizeAliases(Module &M) {
- bool Changed = false;
- for (auto &GA : M.aliases())
- canonicalizeAlias(&GA, Changed);
- return Changed;
-}
-
-// Legacy pass that canonicalizes aliases.
-class CanonicalizeAliasesLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Canonicalize Aliases"; }
-
- explicit CanonicalizeAliasesLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return canonicalizeAliases(M); }
-};
-char CanonicalizeAliasesLegacyPass::ID = 0;
-
-} // anonymous namespace
-
-PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!canonicalizeAliases(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-INITIALIZE_PASS_END(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
- "Canonicalize aliases", false, false)
-
-namespace llvm {
-ModulePass *createCanonicalizeAliasesPass() {
- return new CanonicalizeAliasesLegacyPass();
-}
-} // namespace llvm
+//===- CanonicalizeAliases.cpp - ThinLTO Support: Canonicalize Aliases ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Currently this file implements partial alias canonicalization, to
+// flatten chains of aliases (also done by GlobalOpt, but not enabled for
+// O0 compiles). E.g.
+// @a = alias i8, i8 *@b
+// @b = alias i8, i8 *@g
+//
+// will be converted to:
+// @a = alias i8, i8 *@g <-- @a is now an alias to base object @g
+// @b = alias i8, i8 *@g
+//
+// Eventually this file will implement full alias canonicalization, so that
+// all aliasees are private anonymous values. E.g.
+// @a = alias i8, i8 *@g
+// @g = global i8 0
+//
+// will be converted to:
+// @0 = private global
+// @a = alias i8, i8* @0
+// @g = alias i8, i8* @0
+//
+// This simplifies optimization and ThinLTO linking of the original symbols.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+static Constant *canonicalizeAlias(Constant *C, bool &Changed) {
+ if (auto *GA = dyn_cast<GlobalAlias>(C)) {
+ auto *NewAliasee = canonicalizeAlias(GA->getAliasee(), Changed);
+ if (NewAliasee != GA->getAliasee()) {
+ GA->setAliasee(NewAliasee);
+ Changed = true;
+ }
+ return NewAliasee;
+ }
+
+ auto *CE = dyn_cast<ConstantExpr>(C);
+ if (!CE)
+ return C;
+
+ std::vector<Constant *> Ops;
+ for (Use &U : CE->operands())
+ Ops.push_back(canonicalizeAlias(cast<Constant>(U), Changed));
+ return CE->getWithOperands(Ops);
+}
+
+/// Convert aliases to canonical form.
+static bool canonicalizeAliases(Module &M) {
+ bool Changed = false;
+ for (auto &GA : M.aliases())
+ canonicalizeAlias(&GA, Changed);
+ return Changed;
+}
+
+// Legacy pass that canonicalizes aliases.
+class CanonicalizeAliasesLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Canonicalize Aliases"; }
+
+ explicit CanonicalizeAliasesLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return canonicalizeAliases(M); }
+};
+char CanonicalizeAliasesLegacyPass::ID = 0;
+
+} // anonymous namespace
+
+PreservedAnalyses CanonicalizeAliasesPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!canonicalizeAliases(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
+ "Canonicalize aliases", false, false)
+INITIALIZE_PASS_END(CanonicalizeAliasesLegacyPass, "canonicalize-aliases",
+ "Canonicalize aliases", false, false)
+
+namespace llvm {
+ModulePass *createCanonicalizeAliasesPass() {
+ return new CanonicalizeAliasesLegacyPass();
+}
+} // namespace llvm
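// A minimal driver sketch, not part of this file: running the new-PM pass over
// a module on its own (the function name 'flattenAliases' is an assumption).
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"

static void flattenAliases(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM); // Also registers pass instrumentation.
  llvm::ModulePassManager MPM;
  MPM.addPass(llvm::CanonicalizeAliasesPass());
  MPM.run(M, MAM);
}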
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
index 611efd8243..1f649fe6c7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -1,250 +1,250 @@
-//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass canonicalizes freeze instructions in a loop by pushing them out to
-// the preheader.
-//
-// loop:
-// i = phi init, i.next
-// i.next = add nsw i, 1
-// i.next.fr = freeze i.next // push this out of this loop
-// use(i.next.fr)
-// br i1 (i.next <= N), loop, exit
-// =>
-// init.fr = freeze init
-// loop:
-// i = phi init.fr, i.next
-// i.next = add i, 1 // nsw is dropped here
-// use(i.next)
-// br i1 (i.next <= N), loop, exit
-//
-// Removing freezes from these chains help scalar evolution successfully analyze
-// expressions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/IVUsers.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Transforms/Utils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "canon-freeze"
-
-namespace {
-
-class CanonicalizeFreezeInLoops : public LoopPass {
-public:
- static char ID;
-
- CanonicalizeFreezeInLoops();
-
-private:
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override;
-};
-
-class CanonicalizeFreezeInLoopsImpl {
- Loop *L;
- ScalarEvolution &SE;
- DominatorTree &DT;
-
- struct FrozenIndPHIInfo {
- // A freeze instruction that uses an induction phi
- FreezeInst *FI = nullptr;
- // The induction phi, step instruction, the operand idx of StepInst which is
- // a step value
- PHINode *PHI;
- BinaryOperator *StepInst;
- unsigned StepValIdx = 0;
-
- FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
- : PHI(PHI), StepInst(StepInst) {}
- };
-
- // Can freeze instruction be pushed into operands of I?
- // In order to do this, I should not create a poison after I's flags are
- // stripped.
- bool canHandleInst(const Instruction *I) {
- auto Opc = I->getOpcode();
- // If add/sub/mul, drop nsw/nuw flags.
- return Opc == Instruction::Add || Opc == Instruction::Sub ||
- Opc == Instruction::Mul;
- }
-
- void InsertFreezeAndForgetFromSCEV(Use &U);
-
-public:
- CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT)
- : L(L), SE(SE), DT(DT) {}
- bool run();
-};
-
-} // anonymous namespace
-
-// Given U = (value, user), replace value with freeze(value), and let
-// SCEV forget user. The inserted freeze is placed in the preheader.
-void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
- auto *PH = L->getLoopPreheader();
-
- auto *UserI = cast<Instruction>(U.getUser());
- auto *ValueToFr = U.get();
- assert(L->contains(UserI->getParent()) &&
- "Should not process an instruction that isn't inside the loop");
+//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass canonicalizes freeze instructions in a loop by pushing them out to
+// the preheader.
+//
+// loop:
+// i = phi init, i.next
+// i.next = add nsw i, 1
+// i.next.fr = freeze i.next // push this out of this loop
+// use(i.next.fr)
+// br i1 (i.next <= N), loop, exit
+// =>
+// init.fr = freeze init
+// loop:
+// i = phi init.fr, i.next
+// i.next = add i, 1 // nsw is dropped here
+// use(i.next)
+// br i1 (i.next <= N), loop, exit
+//
+// Removing freezes from these chains helps scalar evolution successfully
+// analyze expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "canon-freeze"
+
+namespace {
+
+class CanonicalizeFreezeInLoops : public LoopPass {
+public:
+ static char ID;
+
+ CanonicalizeFreezeInLoops();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+class CanonicalizeFreezeInLoopsImpl {
+ Loop *L;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+
+ struct FrozenIndPHIInfo {
+ // A freeze instruction that uses an induction phi
+ FreezeInst *FI = nullptr;
+    // The induction phi, the step instruction, and the operand index of
+    // StepInst that holds the step value.
+ PHINode *PHI;
+ BinaryOperator *StepInst;
+ unsigned StepValIdx = 0;
+
+ FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
+ : PHI(PHI), StepInst(StepInst) {}
+ };
+
+  // Can a freeze instruction be pushed into the operands of I?
+  // This is only legal if I cannot create poison once its poison-generating
+  // flags are stripped.
+ bool canHandleInst(const Instruction *I) {
+ auto Opc = I->getOpcode();
+ // If add/sub/mul, drop nsw/nuw flags.
+ return Opc == Instruction::Add || Opc == Instruction::Sub ||
+ Opc == Instruction::Mul;
+ }
+
+ void InsertFreezeAndForgetFromSCEV(Use &U);
+
+public:
+ CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT)
+ : L(L), SE(SE), DT(DT) {}
+ bool run();
+};
+
+} // anonymous namespace
+
+// Given U = (value, user), replace value with freeze(value), and let
+// SCEV forget user. The inserted freeze is placed in the preheader.
+void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
+ auto *PH = L->getLoopPreheader();
+
+ auto *UserI = cast<Instruction>(U.getUser());
+ auto *ValueToFr = U.get();
+ assert(L->contains(UserI->getParent()) &&
+ "Should not process an instruction that isn't inside the loop");
if (isGuaranteedNotToBeUndefOrPoison(ValueToFr, nullptr, UserI, &DT))
- return;
-
- LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n");
- LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n");
- LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
-
- U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
- PH->getTerminator()));
-
- SE.forgetValue(UserI);
-}
-
-bool CanonicalizeFreezeInLoopsImpl::run() {
- // The loop should be in LoopSimplify form.
- if (!L->isLoopSimplifyForm())
- return false;
-
- SmallVector<FrozenIndPHIInfo, 4> Candidates;
-
- for (auto &PHI : L->getHeader()->phis()) {
- InductionDescriptor ID;
- if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID))
- continue;
-
- LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n");
- FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp());
- if (!Info.StepInst || !canHandleInst(Info.StepInst)) {
- // The stepping instruction has unknown form.
- // Ignore this PHI.
- continue;
- }
-
- Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI;
- Value *StepV = Info.StepInst->getOperand(Info.StepValIdx);
- if (auto *StepI = dyn_cast<Instruction>(StepV)) {
- if (L->contains(StepI->getParent())) {
- // The step value is inside the loop. Freezing step value will introduce
- // another freeze into the loop, so skip this PHI.
- continue;
- }
- }
-
- auto Visit = [&](User *U) {
- if (auto *FI = dyn_cast<FreezeInst>(U)) {
- LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
- Info.FI = FI;
- Candidates.push_back(Info);
- }
- };
- for_each(PHI.users(), Visit);
- for_each(Info.StepInst->users(), Visit);
- }
-
- if (Candidates.empty())
- return false;
-
- SmallSet<PHINode *, 8> ProcessedPHIs;
- for (const auto &Info : Candidates) {
- PHINode *PHI = Info.PHI;
- if (!ProcessedPHIs.insert(Info.PHI).second)
- continue;
-
- BinaryOperator *StepI = Info.StepInst;
- assert(StepI && "Step instruction should have been found");
-
- // Drop flags from the step instruction.
+ return;
+
+ LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n");
+ LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n");
+ LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
+
+ U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
+ PH->getTerminator()));
+
+ SE.forgetValue(UserI);
+}
+
+bool CanonicalizeFreezeInLoopsImpl::run() {
+ // The loop should be in LoopSimplify form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ SmallVector<FrozenIndPHIInfo, 4> Candidates;
+
+ for (auto &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n");
+ FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp());
+ if (!Info.StepInst || !canHandleInst(Info.StepInst)) {
+      // The step instruction has an unknown form; ignore this PHI.
+ continue;
+ }
+
+ Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI;
+ Value *StepV = Info.StepInst->getOperand(Info.StepValIdx);
+ if (auto *StepI = dyn_cast<Instruction>(StepV)) {
+ if (L->contains(StepI->getParent())) {
+ // The step value is inside the loop. Freezing step value will introduce
+ // another freeze into the loop, so skip this PHI.
+ continue;
+ }
+ }
+
+ auto Visit = [&](User *U) {
+ if (auto *FI = dyn_cast<FreezeInst>(U)) {
+ LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
+ Info.FI = FI;
+ Candidates.push_back(Info);
+ }
+ };
+ for_each(PHI.users(), Visit);
+ for_each(Info.StepInst->users(), Visit);
+ }
+
+ if (Candidates.empty())
+ return false;
+
+ SmallSet<PHINode *, 8> ProcessedPHIs;
+ for (const auto &Info : Candidates) {
+ PHINode *PHI = Info.PHI;
+ if (!ProcessedPHIs.insert(Info.PHI).second)
+ continue;
+
+ BinaryOperator *StepI = Info.StepInst;
+ assert(StepI && "Step instruction should have been found");
+
+ // Drop flags from the step instruction.
if (!isGuaranteedNotToBeUndefOrPoison(StepI, nullptr, StepI, &DT)) {
- LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n");
- StepI->dropPoisonGeneratingFlags();
- SE.forgetValue(StepI);
- }
-
- InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx));
-
- unsigned OperandIdx =
- PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI);
- InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx));
- }
-
- // Finally, remove the old freeze instructions.
- for (const auto &Item : Candidates) {
- auto *FI = Item.FI;
- LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n");
- SE.forgetValue(FI);
- FI->replaceAllUsesWith(FI->getOperand(0));
- FI->eraseFromParent();
- }
-
- return true;
-}
-
-CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) {
- initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry());
-}
-
-void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequiredID(LoopSimplifyID);
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-}
-
-bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) {
- if (skipLoop(L))
- return false;
-
- auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run();
-}
-
-PreservedAnalyses
-CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run())
- return PreservedAnalyses::all();
-
- return getLoopPassPreservedAnalyses();
-}
-
-INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze",
- "Canonicalize Freeze Instructions in Loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze",
- "Canonicalize Freeze Instructions in Loops", false, false)
-
-Pass *llvm::createCanonicalizeFreezeInLoopsPass() {
- return new CanonicalizeFreezeInLoops();
-}
-
-char CanonicalizeFreezeInLoops::ID = 0;
+ LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n");
+ StepI->dropPoisonGeneratingFlags();
+ SE.forgetValue(StepI);
+ }
+
+ InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx));
+
+ unsigned OperandIdx =
+ PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI);
+ InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx));
+ }
+
+ // Finally, remove the old freeze instructions.
+ for (const auto &Item : Candidates) {
+ auto *FI = Item.FI;
+ LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n");
+ SE.forgetValue(FI);
+ FI->replaceAllUsesWith(FI->getOperand(0));
+ FI->eraseFromParent();
+ }
+
+ return true;
+}
+
+CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) {
+ initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry());
+}
+
+void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run();
+}
+
+PreservedAnalyses
+CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run())
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+
+Pass *llvm::createCanonicalizeFreezeInLoopsPass() {
+ return new CanonicalizeFreezeInLoops();
+}
+
+char CanonicalizeFreezeInLoops::ID = 0;
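// A minimal driver sketch, not part of this file: the new-PM pass is a loop
// pass, so the easiest way to run it standalone is by name through a textual
// pipeline, assuming the usual "canon-freeze" registration. The helper name
// 'canonicalizeFreezes' is an assumption.
#include "llvm/IR/Module.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Support/Error.h"

static void canonicalizeFreezes(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::ModulePassManager MPM;
  // "function(loop(...))" wraps the loop pass in the usual adaptors.
  llvm::cantFail(PB.parsePassPipeline(MPM, "function(loop(canon-freeze))"));
  MPM.run(M, MAM);
}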
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
index 7f34784c6f..6ab061510a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CloneFunction.cpp
@@ -1,198 +1,198 @@
-//===- CloneFunction.cpp - Clone a function into another function ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the CloneFunctionInto interface, which is used as the
-// low-level function cloner. This is used by the CloneFunction and function
-// inliner to do the dirty work of copying the body of a function around.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+//===- CloneFunction.cpp - Clone a function into another function ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneFunctionInto interface, which is used as the
+// low-level function cloner. This is used by the CloneFunction and function
+// inliner to do the dirty work of copying the body of a function around.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <map>
-using namespace llvm;
-
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <map>
+using namespace llvm;
+
#define DEBUG_TYPE "clone-function"
-/// See comments in Cloning.h.
-BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, Function *F,
- ClonedCodeInfo *CodeInfo,
- DebugInfoFinder *DIFinder) {
- DenseMap<const MDNode *, MDNode *> Cache;
- BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
- if (BB->hasName())
- NewBB->setName(BB->getName() + NameSuffix);
-
- bool hasCalls = false, hasDynamicAllocas = false;
- Module *TheModule = F ? F->getParent() : nullptr;
-
- // Loop over all instructions, and copy them over.
- for (const Instruction &I : *BB) {
- if (DIFinder && TheModule)
- DIFinder->processInstruction(*TheModule, I);
-
- Instruction *NewInst = I.clone();
- if (I.hasName())
- NewInst->setName(I.getName() + NameSuffix);
- NewBB->getInstList().push_back(NewInst);
- VMap[&I] = NewInst; // Add instruction map to value.
-
- hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
- if (!AI->isStaticAlloca()) {
- hasDynamicAllocas = true;
- }
- }
- }
-
- if (CodeInfo) {
- CodeInfo->ContainsCalls |= hasCalls;
- CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- }
- return NewBB;
-}
-
-// Clone OldFunc into NewFunc, transforming the old arguments into references to
-// VMap values.
-//
-void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst*> &Returns,
- const char *NameSuffix, ClonedCodeInfo *CodeInfo,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer) {
- assert(NameSuffix && "NameSuffix cannot be null!");
-
-#ifndef NDEBUG
- for (const Argument &I : OldFunc->args())
- assert(VMap.count(&I) && "No mapping from source argument specified!");
-#endif
-
- // Copy all attributes other than those stored in the AttributeList. We need
- // to remap the parameter indices of the AttributeList.
- AttributeList NewAttrs = NewFunc->getAttributes();
- NewFunc->copyAttributesFrom(OldFunc);
- NewFunc->setAttributes(NewAttrs);
-
- // Fix up the personality function that got copied over.
- if (OldFunc->hasPersonalityFn())
- NewFunc->setPersonalityFn(
- MapValue(OldFunc->getPersonalityFn(), VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer));
-
- SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
- AttributeList OldAttrs = OldFunc->getAttributes();
-
- // Clone any argument attributes that are present in the VMap.
- for (const Argument &OldArg : OldFunc->args()) {
- if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
- NewArgAttrs[NewArg->getArgNo()] =
- OldAttrs.getParamAttributes(OldArg.getArgNo());
- }
- }
-
- NewFunc->setAttributes(
- AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
- OldAttrs.getRetAttributes(), NewArgAttrs));
-
- bool MustCloneSP =
- OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
- DISubprogram *SP = OldFunc->getSubprogram();
- if (SP) {
- assert(!MustCloneSP || ModuleLevelChanges);
- // Add mappings for some DebugInfo nodes that we don't want duplicated
- // even if they're distinct.
- auto &MD = VMap.MD();
- MD[SP->getUnit()].reset(SP->getUnit());
- MD[SP->getType()].reset(SP->getType());
- MD[SP->getFile()].reset(SP->getFile());
- // If we're not cloning into the same module, no need to clone the
- // subprogram
- if (!MustCloneSP)
- MD[SP].reset(SP);
- }
-
+/// See comments in Cloning.h.
+BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, Function *F,
+ ClonedCodeInfo *CodeInfo,
+ DebugInfoFinder *DIFinder) {
+ DenseMap<const MDNode *, MDNode *> Cache;
+ BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F);
+ if (BB->hasName())
+ NewBB->setName(BB->getName() + NameSuffix);
+
+ bool hasCalls = false, hasDynamicAllocas = false;
+ Module *TheModule = F ? F->getParent() : nullptr;
+
+ // Loop over all instructions, and copy them over.
+ for (const Instruction &I : *BB) {
+ if (DIFinder && TheModule)
+ DIFinder->processInstruction(*TheModule, I);
+
+ Instruction *NewInst = I.clone();
+ if (I.hasName())
+ NewInst->setName(I.getName() + NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[&I] = NewInst; // Add instruction map to value.
+
+ hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (!AI->isStaticAlloca()) {
+ hasDynamicAllocas = true;
+ }
+ }
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ }
+ return NewBB;
+}
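// A minimal usage sketch, not part of this patch ('cloneAndRemap' is an
// assumed helper name): clone one block into its own function, then remap the
// clone's operands through VMap so intra-block references point at the cloned
// instructions. Values defined outside BB are left untouched thanks to
// RF_IgnoreMissingLocals; branch targets and successor phi nodes still need
// fixing before the clone is wired into the CFG.
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

static llvm::BasicBlock *cloneAndRemap(llvm::BasicBlock *BB) {
  llvm::ValueToValueMapTy VMap;
  llvm::BasicBlock *Copy =
      llvm::CloneBasicBlock(BB, VMap, ".clone", BB->getParent());
  for (llvm::Instruction &I : *Copy)
    llvm::RemapInstruction(&I, VMap,
                           llvm::RF_NoModuleLevelChanges |
                               llvm::RF_IgnoreMissingLocals);
  return Copy;
}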
+
+// Clone OldFunc into NewFunc, transforming the old arguments into references to
+// VMap values.
+//
+void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix, ClonedCodeInfo *CodeInfo,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+#ifndef NDEBUG
+ for (const Argument &I : OldFunc->args())
+ assert(VMap.count(&I) && "No mapping from source argument specified!");
+#endif
+
+ // Copy all attributes other than those stored in the AttributeList. We need
+ // to remap the parameter indices of the AttributeList.
+ AttributeList NewAttrs = NewFunc->getAttributes();
+ NewFunc->copyAttributesFrom(OldFunc);
+ NewFunc->setAttributes(NewAttrs);
+
+ // Fix up the personality function that got copied over.
+ if (OldFunc->hasPersonalityFn())
+ NewFunc->setPersonalityFn(
+ MapValue(OldFunc->getPersonalityFn(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer));
+
+ SmallVector<AttributeSet, 4> NewArgAttrs(NewFunc->arg_size());
+ AttributeList OldAttrs = OldFunc->getAttributes();
+
+ // Clone any argument attributes that are present in the VMap.
+ for (const Argument &OldArg : OldFunc->args()) {
+ if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
+ NewArgAttrs[NewArg->getArgNo()] =
+ OldAttrs.getParamAttributes(OldArg.getArgNo());
+ }
+ }
+
+ NewFunc->setAttributes(
+ AttributeList::get(NewFunc->getContext(), OldAttrs.getFnAttributes(),
+ OldAttrs.getRetAttributes(), NewArgAttrs));
+
+ bool MustCloneSP =
+ OldFunc->getParent() && OldFunc->getParent() == NewFunc->getParent();
+ DISubprogram *SP = OldFunc->getSubprogram();
+ if (SP) {
+ assert(!MustCloneSP || ModuleLevelChanges);
+ // Add mappings for some DebugInfo nodes that we don't want duplicated
+ // even if they're distinct.
+ auto &MD = VMap.MD();
+ MD[SP->getUnit()].reset(SP->getUnit());
+ MD[SP->getType()].reset(SP->getType());
+ MD[SP->getFile()].reset(SP->getFile());
+ // If we're not cloning into the same module, no need to clone the
+ // subprogram
+ if (!MustCloneSP)
+ MD[SP].reset(SP);
+ }
+
// Everything else beyond this point deals with function instructions,
// so if we are dealing with a function declaration, we're done.
if (OldFunc->isDeclaration())
return;
-
- // When we remap instructions, we want to avoid duplicating inlined
- // DISubprograms, so record all subprograms we find as we duplicate
- // instructions and then freeze them in the MD map.
- // We also record information about dbg.value and dbg.declare to avoid
- // duplicating the types.
- DebugInfoFinder DIFinder;
-
- // Loop over all of the basic blocks in the function, cloning them as
- // appropriate. Note that we save BE this way in order to handle cloning of
- // recursive functions into themselves.
- for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
- BI != BE; ++BI) {
- const BasicBlock &BB = *BI;
-
- // Create a new basic block and copy instructions into it!
- BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
- ModuleLevelChanges ? &DIFinder : nullptr);
-
- // Add basic block mapping.
- VMap[&BB] = CBB;
-
- // It is only legal to clone a function if a block address within that
- // function is never referenced outside of the function. Given that, we
- // want to map block addresses from the old function to block addresses in
- // the clone. (This is different from the generic ValueMapper
- // implementation, which generates an invalid blockaddress when
- // cloning a function.)
- if (BB.hasAddressTaken()) {
- Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
- const_cast<BasicBlock*>(&BB));
- VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
- }
-
- // Note return instructions for the caller.
- if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
- Returns.push_back(RI);
- }
-
- for (DISubprogram *ISP : DIFinder.subprograms())
- if (ISP != SP)
- VMap.MD()[ISP].reset(ISP);
-
- for (DICompileUnit *CU : DIFinder.compile_units())
- VMap.MD()[CU].reset(CU);
-
- for (DIType *Type : DIFinder.types())
- VMap.MD()[Type].reset(Type);
-
+
+ // When we remap instructions, we want to avoid duplicating inlined
+ // DISubprograms, so record all subprograms we find as we duplicate
+ // instructions and then freeze them in the MD map.
+ // We also record information about dbg.value and dbg.declare to avoid
+ // duplicating the types.
+ DebugInfoFinder DIFinder;
+
+ // Loop over all of the basic blocks in the function, cloning them as
+ // appropriate. Note that we save BE this way in order to handle cloning of
+ // recursive functions into themselves.
+ for (Function::const_iterator BI = OldFunc->begin(), BE = OldFunc->end();
+ BI != BE; ++BI) {
+ const BasicBlock &BB = *BI;
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB = CloneBasicBlock(&BB, VMap, NameSuffix, NewFunc, CodeInfo,
+ ModuleLevelChanges ? &DIFinder : nullptr);
+
+ // Add basic block mapping.
+ VMap[&BB] = CBB;
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ if (BB.hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(&BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, CBB);
+ }
+
+ // Note return instructions for the caller.
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(CBB->getTerminator()))
+ Returns.push_back(RI);
+ }
+
+ for (DISubprogram *ISP : DIFinder.subprograms())
+ if (ISP != SP)
+ VMap.MD()[ISP].reset(ISP);
+
+ for (DICompileUnit *CU : DIFinder.compile_units())
+ VMap.MD()[CU].reset(CU);
+
+ for (DIType *Type : DIFinder.types())
+ VMap.MD()[Type].reset(Type);
+
// Duplicate the metadata that is attached to the cloned function.
// Subprograms/CUs/types that were already mapped to themselves won't be
// duplicated.
@@ -206,684 +206,684 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
TypeMapper, Materializer));
}
- // Loop over all of the instructions in the function, fixing up operand
- // references as we go. This uses VMap to do all the hard work.
- for (Function::iterator BB =
- cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
- BE = NewFunc->end();
- BB != BE; ++BB)
- // Loop over all instructions, fixing each one as we find it...
- for (Instruction &II : *BB)
- RemapInstruction(&II, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer);
-
- // Register all DICompileUnits of the old parent module in the new parent module
- auto* OldModule = OldFunc->getParent();
- auto* NewModule = NewFunc->getParent();
- if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) {
- auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
- // Avoid multiple insertions of the same DICompileUnit to NMD.
- SmallPtrSet<const void*, 8> Visited;
- for (auto* Operand : NMD->operands())
- Visited.insert(Operand);
- for (auto* Unit : DIFinder.compile_units())
- // VMap.MD()[Unit] == Unit
- if (Visited.insert(Unit).second)
- NMD->addOperand(Unit);
- }
-}
-
-/// Return a copy of the specified function and add it to that function's
-/// module. Also, any references specified in the VMap are changed to refer to
-/// their mapped value instead of the original one. If any of the arguments to
-/// the function are in the VMap, the arguments are deleted from the resultant
-/// function. The VMap is updated to include mappings from all of the
-/// instructions and basic blocks in the function from their old to new values.
-///
-Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
- ClonedCodeInfo *CodeInfo) {
- std::vector<Type*> ArgTypes;
-
- // The user might be deleting arguments to the function by specifying them in
- // the VMap. If so, we need to not add the arguments to the arg ty vector
- //
- for (const Argument &I : F->args())
- if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
- ArgTypes.push_back(I.getType());
-
- // Create a new function type...
- FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
- ArgTypes, F->getFunctionType()->isVarArg());
-
- // Create the new function...
- Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
- F->getName(), F->getParent());
-
- // Loop over the arguments, copying the names of the mapped arguments over...
- Function::arg_iterator DestI = NewF->arg_begin();
- for (const Argument & I : F->args())
- if (VMap.count(&I) == 0) { // Is this argument preserved?
- DestI->setName(I.getName()); // Copy the name over...
- VMap[&I] = &*DestI++; // Add mapping to VMap
- }
-
- SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
- CodeInfo);
-
- return NewF;
-}
-
-
-
-namespace {
- /// This is a private class used to implement CloneAndPruneFunctionInto.
- struct PruningFunctionCloner {
- Function *NewFunc;
- const Function *OldFunc;
- ValueToValueMapTy &VMap;
- bool ModuleLevelChanges;
- const char *NameSuffix;
- ClonedCodeInfo *CodeInfo;
-
- public:
- PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
- ValueToValueMapTy &valueMap, bool moduleLevelChanges,
- const char *nameSuffix, ClonedCodeInfo *codeInfo)
- : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
- ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
- CodeInfo(codeInfo) {}
-
- /// The specified block is found to be reachable, clone it and
- /// anything that it can reach.
- void CloneBlock(const BasicBlock *BB,
- BasicBlock::const_iterator StartingInst,
- std::vector<const BasicBlock*> &ToClone);
- };
-}
-
-/// The specified block is found to be reachable, clone it and
-/// anything that it can reach.
-void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
- BasicBlock::const_iterator StartingInst,
- std::vector<const BasicBlock*> &ToClone){
- WeakTrackingVH &BBEntry = VMap[BB];
-
- // Have we already cloned this block?
- if (BBEntry) return;
-
- // Nope, clone it now.
- BasicBlock *NewBB;
- BBEntry = NewBB = BasicBlock::Create(BB->getContext());
- if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
-
- // It is only legal to clone a function if a block address within that
- // function is never referenced outside of the function. Given that, we
- // want to map block addresses from the old function to block addresses in
- // the clone. (This is different from the generic ValueMapper
- // implementation, which generates an invalid blockaddress when
- // cloning a function.)
- //
- // Note that we don't need to fix the mapping for unreachable blocks;
- // the default mapping there is safe.
- if (BB->hasAddressTaken()) {
- Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
- const_cast<BasicBlock*>(BB));
- VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
- }
-
- bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
-
- // Loop over all instructions, and copy them over, DCE'ing as we go. This
- // loop doesn't include the terminator.
- for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
- II != IE; ++II) {
-
- Instruction *NewInst = II->clone();
-
- // Eagerly remap operands to the newly cloned instruction, except for PHI
- // nodes for which we defer processing until we update the CFG.
- if (!isa<PHINode>(NewInst)) {
- RemapInstruction(NewInst, VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
-
- // If we can simplify this instruction to some other value, simply add
- // a mapping to that value rather than inserting a new instruction into
- // the basic block.
- if (Value *V =
- SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
- // On the off-chance that this simplifies to an instruction in the old
- // function, map it back into the new function.
- if (NewFunc != OldFunc)
- if (Value *MappedV = VMap.lookup(V))
- V = MappedV;
-
- if (!NewInst->mayHaveSideEffects()) {
- VMap[&*II] = V;
- NewInst->deleteValue();
- continue;
- }
- }
- }
-
- if (II->hasName())
- NewInst->setName(II->getName()+NameSuffix);
- VMap[&*II] = NewInst; // Add instruction map to value.
- NewBB->getInstList().push_back(NewInst);
- hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
-
- if (CodeInfo)
- if (auto *CB = dyn_cast<CallBase>(&*II))
- if (CB->hasOperandBundles())
- CodeInfo->OperandBundleCallSites.push_back(NewInst);
-
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
- if (isa<ConstantInt>(AI->getArraySize()))
- hasStaticAllocas = true;
- else
- hasDynamicAllocas = true;
- }
- }
-
- // Finally, clone over the terminator.
- const Instruction *OldTI = BB->getTerminator();
- bool TerminatorDone = false;
- if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
- if (BI->isConditional()) {
- // If the condition was a known constant in the callee...
- ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
- // Or is a known constant in the caller...
- if (!Cond) {
- Value *V = VMap.lookup(BI->getCondition());
- Cond = dyn_cast_or_null<ConstantInt>(V);
- }
-
- // Constant fold to uncond branch!
- if (Cond) {
- BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
- VMap[OldTI] = BranchInst::Create(Dest, NewBB);
- ToClone.push_back(Dest);
- TerminatorDone = true;
- }
- }
- } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
- // If switching on a value known constant in the caller.
- ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
- if (!Cond) { // Or known constant after constant prop in the callee...
- Value *V = VMap.lookup(SI->getCondition());
- Cond = dyn_cast_or_null<ConstantInt>(V);
- }
- if (Cond) { // Constant fold to uncond branch!
- SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
- BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
- VMap[OldTI] = BranchInst::Create(Dest, NewBB);
- ToClone.push_back(Dest);
- TerminatorDone = true;
- }
- }
-
- if (!TerminatorDone) {
- Instruction *NewInst = OldTI->clone();
- if (OldTI->hasName())
- NewInst->setName(OldTI->getName()+NameSuffix);
- NewBB->getInstList().push_back(NewInst);
- VMap[OldTI] = NewInst; // Add instruction map to value.
-
- if (CodeInfo)
- if (auto *CB = dyn_cast<CallBase>(OldTI))
- if (CB->hasOperandBundles())
- CodeInfo->OperandBundleCallSites.push_back(NewInst);
-
- // Recursively clone any reachable successor blocks.
+ // Loop over all of the instructions in the function, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ for (Function::iterator BB =
+ cast<BasicBlock>(VMap[&OldFunc->front()])->getIterator(),
+ BE = NewFunc->end();
+ BB != BE; ++BB)
+ // Loop over all instructions, fixing each one as we find it...
+ for (Instruction &II : *BB)
+ RemapInstruction(&II, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+
+ // Register all DICompileUnits of the old parent module in the new parent module
+ auto* OldModule = OldFunc->getParent();
+ auto* NewModule = NewFunc->getParent();
+ if (OldModule && NewModule && OldModule != NewModule && DIFinder.compile_unit_count()) {
+ auto* NMD = NewModule->getOrInsertNamedMetadata("llvm.dbg.cu");
+ // Avoid multiple insertions of the same DICompileUnit to NMD.
+ SmallPtrSet<const void*, 8> Visited;
+ for (auto* Operand : NMD->operands())
+ Visited.insert(Operand);
+ for (auto* Unit : DIFinder.compile_units())
+ // VMap.MD()[Unit] == Unit
+ if (Visited.insert(Unit).second)
+ NMD->addOperand(Unit);
+ }
+}
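
Usage sketch (illustrative, not taken from this commit): the calling convention CloneFunctionInto expects is that the destination function already exists and every source argument is pre-mapped in the VMap. The helper name copyFunctionVerbatim is an assumption; ModuleLevelChanges mirrors the F->getSubprogram() check used by CloneFunction below.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical helper: make a same-module, same-signature copy of F by
// driving CloneFunctionInto directly.
static Function *copyFunctionVerbatim(Function &F, const Twine &NewName) {
  Function *NewF = Function::Create(F.getFunctionType(), F.getLinkage(),
                                    F.getAddressSpace(), NewName, F.getParent());
  ValueToValueMapTy VMap;
  auto NewArgIt = NewF->arg_begin();
  for (Argument &Arg : F.args()) {
    NewArgIt->setName(Arg.getName());
    VMap[&Arg] = &*NewArgIt++; // Required: map every source argument.
  }
  SmallVector<ReturnInst *, 8> Returns;
  CloneFunctionInto(NewF, &F, VMap,
                    /*ModuleLevelChanges=*/F.getSubprogram() != nullptr,
                    Returns, /*NameSuffix=*/"");
  return NewF;
}
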
+
+/// Return a copy of the specified function and add it to that function's
+/// module. Also, any references specified in the VMap are changed to refer to
+/// their mapped value instead of the original one. If any of the arguments to
+/// the function are in the VMap, the arguments are deleted from the resultant
+/// function. The VMap is updated to include mappings from all of the
+/// instructions and basic blocks in the function from their old to new values.
+///
+Function *llvm::CloneFunction(Function *F, ValueToValueMapTy &VMap,
+ ClonedCodeInfo *CodeInfo) {
+ std::vector<Type*> ArgTypes;
+
+ // The user might be deleting arguments to the function by specifying them in
+ // the VMap. If so, we need to not add the arguments to the arg ty vector
+ //
+ for (const Argument &I : F->args())
+ if (VMap.count(&I) == 0) // Haven't mapped the argument to anything yet?
+ ArgTypes.push_back(I.getType());
+
+ // Create a new function type...
+ FunctionType *FTy = FunctionType::get(F->getFunctionType()->getReturnType(),
+ ArgTypes, F->getFunctionType()->isVarArg());
+
+ // Create the new function...
+ Function *NewF = Function::Create(FTy, F->getLinkage(), F->getAddressSpace(),
+ F->getName(), F->getParent());
+
+ // Loop over the arguments, copying the names of the mapped arguments over...
+ Function::arg_iterator DestI = NewF->arg_begin();
+ for (const Argument & I : F->args())
+ if (VMap.count(&I) == 0) { // Is this argument preserved?
+ DestI->setName(I.getName()); // Copy the name over...
+ VMap[&I] = &*DestI++; // Add mapping to VMap
+ }
+
+ SmallVector<ReturnInst*, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(NewF, F, VMap, F->getSubprogram() != nullptr, Returns, "",
+ CodeInfo);
+
+ return NewF;
+}
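
Usage sketch (illustrative, not part of this diff) of the argument-dropping behavior the doc comment above describes: pre-seeding the VMap with a constant for one parameter makes CloneFunction emit a copy without that parameter. The helper name specializeOnConstant is an assumption.

#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: specialize F on a constant value for its ArgNo-th parameter.
// Because that argument already appears in VMap, the clone is created
// without it, and its uses are rewritten to C.
static Function *specializeOnConstant(Function &F, unsigned ArgNo, Constant *C) {
  ValueToValueMapTy VMap;
  VMap[F.getArg(ArgNo)] = C; // Pre-mapped arguments are dropped from the clone.
  return CloneFunction(&F, VMap);
}
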
+
+
+
+namespace {
+ /// This is a private class used to implement CloneAndPruneFunctionInto.
+ struct PruningFunctionCloner {
+ Function *NewFunc;
+ const Function *OldFunc;
+ ValueToValueMapTy &VMap;
+ bool ModuleLevelChanges;
+ const char *NameSuffix;
+ ClonedCodeInfo *CodeInfo;
+
+ public:
+ PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
+ ValueToValueMapTy &valueMap, bool moduleLevelChanges,
+ const char *nameSuffix, ClonedCodeInfo *codeInfo)
+ : NewFunc(newFunc), OldFunc(oldFunc), VMap(valueMap),
+ ModuleLevelChanges(moduleLevelChanges), NameSuffix(nameSuffix),
+ CodeInfo(codeInfo) {}
+
+ /// The specified block is found to be reachable, clone it and
+ /// anything that it can reach.
+ void CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone);
+ };
+}
+
+/// The specified block is found to be reachable, clone it and
+/// anything that it can reach.
+void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
+ BasicBlock::const_iterator StartingInst,
+ std::vector<const BasicBlock*> &ToClone){
+ WeakTrackingVH &BBEntry = VMap[BB];
+
+ // Have we already cloned this block?
+ if (BBEntry) return;
+
+ // Nope, clone it now.
+ BasicBlock *NewBB;
+ BBEntry = NewBB = BasicBlock::Create(BB->getContext());
+ if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix);
+
+ // It is only legal to clone a function if a block address within that
+ // function is never referenced outside of the function. Given that, we
+ // want to map block addresses from the old function to block addresses in
+ // the clone. (This is different from the generic ValueMapper
+ // implementation, which generates an invalid blockaddress when
+ // cloning a function.)
+ //
+ // Note that we don't need to fix the mapping for unreachable blocks;
+ // the default mapping there is safe.
+ if (BB->hasAddressTaken()) {
+ Constant *OldBBAddr = BlockAddress::get(const_cast<Function*>(OldFunc),
+ const_cast<BasicBlock*>(BB));
+ VMap[OldBBAddr] = BlockAddress::get(NewFunc, NewBB);
+ }
+
+ bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+
+ // Loop over all instructions, and copy them over, DCE'ing as we go. This
+ // loop doesn't include the terminator.
+ for (BasicBlock::const_iterator II = StartingInst, IE = --BB->end();
+ II != IE; ++II) {
+
+ Instruction *NewInst = II->clone();
+
+ // Eagerly remap operands to the newly cloned instruction, except for PHI
+ // nodes for which we defer processing until we update the CFG.
+ if (!isa<PHINode>(NewInst)) {
+ RemapInstruction(NewInst, VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+
+ // If we can simplify this instruction to some other value, simply add
+ // a mapping to that value rather than inserting a new instruction into
+ // the basic block.
+ if (Value *V =
+ SimplifyInstruction(NewInst, BB->getModule()->getDataLayout())) {
+ // On the off-chance that this simplifies to an instruction in the old
+ // function, map it back into the new function.
+ if (NewFunc != OldFunc)
+ if (Value *MappedV = VMap.lookup(V))
+ V = MappedV;
+
+ if (!NewInst->mayHaveSideEffects()) {
+ VMap[&*II] = V;
+ NewInst->deleteValue();
+ continue;
+ }
+ }
+ }
+
+ if (II->hasName())
+ NewInst->setName(II->getName()+NameSuffix);
+ VMap[&*II] = NewInst; // Add instruction map to value.
+ NewBB->getInstList().push_back(NewInst);
+ hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
+
+ if (CodeInfo)
+ if (auto *CB = dyn_cast<CallBase>(&*II))
+ if (CB->hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
+ if (isa<ConstantInt>(AI->getArraySize()))
+ hasStaticAllocas = true;
+ else
+ hasDynamicAllocas = true;
+ }
+ }
+
+ // Finally, clone over the terminator.
+ const Instruction *OldTI = BB->getTerminator();
+ bool TerminatorDone = false;
+ if (const BranchInst *BI = dyn_cast<BranchInst>(OldTI)) {
+ if (BI->isConditional()) {
+ // If the condition was a known constant in the callee...
+ ConstantInt *Cond = dyn_cast<ConstantInt>(BI->getCondition());
+ // Or is a known constant in the caller...
+ if (!Cond) {
+ Value *V = VMap.lookup(BI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+
+ // Constant fold to uncond branch!
+ if (Cond) {
+ BasicBlock *Dest = BI->getSuccessor(!Cond->getZExtValue());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+ } else if (const SwitchInst *SI = dyn_cast<SwitchInst>(OldTI)) {
+ // If switching on a value known constant in the caller.
+ ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition());
+ if (!Cond) { // Or known constant after constant prop in the callee...
+ Value *V = VMap.lookup(SI->getCondition());
+ Cond = dyn_cast_or_null<ConstantInt>(V);
+ }
+ if (Cond) { // Constant fold to uncond branch!
+ SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
+ BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
+ VMap[OldTI] = BranchInst::Create(Dest, NewBB);
+ ToClone.push_back(Dest);
+ TerminatorDone = true;
+ }
+ }
+
+ if (!TerminatorDone) {
+ Instruction *NewInst = OldTI->clone();
+ if (OldTI->hasName())
+ NewInst->setName(OldTI->getName()+NameSuffix);
+ NewBB->getInstList().push_back(NewInst);
+ VMap[OldTI] = NewInst; // Add instruction map to value.
+
+ if (CodeInfo)
+ if (auto *CB = dyn_cast<CallBase>(OldTI))
+ if (CB->hasOperandBundles())
+ CodeInfo->OperandBundleCallSites.push_back(NewInst);
+
+ // Recursively clone any reachable successor blocks.
append_range(ToClone, successors(BB->getTerminator()));
- }
-
- if (CodeInfo) {
- CodeInfo->ContainsCalls |= hasCalls;
- CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
- BB != &BB->getParent()->front();
- }
-}
-
-/// This works like CloneAndPruneFunctionInto, except that it does not clone the
-/// entire function. Instead it starts at an instruction provided by the caller
-/// and copies (and prunes) only the code reachable from that instruction.
-void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
- const Instruction *StartingInst,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst *> &Returns,
- const char *NameSuffix,
- ClonedCodeInfo *CodeInfo) {
- assert(NameSuffix && "NameSuffix cannot be null!");
-
- ValueMapTypeRemapper *TypeMapper = nullptr;
- ValueMaterializer *Materializer = nullptr;
-
-#ifndef NDEBUG
- // If the cloning starts at the beginning of the function, verify that
- // the function arguments are mapped.
- if (!StartingInst)
- for (const Argument &II : OldFunc->args())
- assert(VMap.count(&II) && "No mapping from source argument specified!");
-#endif
-
- PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
- NameSuffix, CodeInfo);
- const BasicBlock *StartingBB;
- if (StartingInst)
- StartingBB = StartingInst->getParent();
- else {
- StartingBB = &OldFunc->getEntryBlock();
- StartingInst = &StartingBB->front();
- }
-
- // Clone the entry block, and anything recursively reachable from it.
- std::vector<const BasicBlock*> CloneWorklist;
- PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
- while (!CloneWorklist.empty()) {
- const BasicBlock *BB = CloneWorklist.back();
- CloneWorklist.pop_back();
- PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
- }
-
- // Loop over all of the basic blocks in the old function. If the block was
- // reachable, we have cloned it and the old block is now in the value map:
- // insert it into the new function in the right order. If not, ignore it.
- //
- // Defer PHI resolution until rest of function is resolved.
- SmallVector<const PHINode*, 16> PHIToResolve;
- for (const BasicBlock &BI : *OldFunc) {
- Value *V = VMap.lookup(&BI);
- BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
- if (!NewBB) continue; // Dead block.
-
- // Add the new block to the new function.
- NewFunc->getBasicBlockList().push_back(NewBB);
-
- // Handle PHI nodes specially, as we have to remove references to dead
- // blocks.
- for (const PHINode &PN : BI.phis()) {
- // PHI nodes may have been remapped to non-PHI nodes by the caller or
- // during the cloning process.
- if (isa<PHINode>(VMap[&PN]))
- PHIToResolve.push_back(&PN);
- else
- break;
- }
-
- // Finally, remap the terminator instructions, as those can't be remapped
- // until all BBs are mapped.
- RemapInstruction(NewBB->getTerminator(), VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
- TypeMapper, Materializer);
- }
-
- // Defer PHI resolution until rest of function is resolved, PHI resolution
- // requires the CFG to be up-to-date.
- for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
- const PHINode *OPN = PHIToResolve[phino];
- unsigned NumPreds = OPN->getNumIncomingValues();
- const BasicBlock *OldBB = OPN->getParent();
- BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
-
- // Map operands for blocks that are live and remove operands for blocks
- // that are dead.
- for (; phino != PHIToResolve.size() &&
- PHIToResolve[phino]->getParent() == OldBB; ++phino) {
- OPN = PHIToResolve[phino];
- PHINode *PN = cast<PHINode>(VMap[OPN]);
- for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
- Value *V = VMap.lookup(PN->getIncomingBlock(pred));
- if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
- Value *InVal = MapValue(PN->getIncomingValue(pred),
- VMap,
- ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
- assert(InVal && "Unknown input value?");
- PN->setIncomingValue(pred, InVal);
- PN->setIncomingBlock(pred, MappedBlock);
- } else {
- PN->removeIncomingValue(pred, false);
- --pred; // Revisit the next entry.
- --e;
- }
- }
- }
-
- // The loop above has removed PHI entries for those blocks that are dead
- // and has updated others. However, if a block is live (i.e. copied over)
- // but its terminator has been changed to not go to this block, then our
- // phi nodes will have invalid entries. Update the PHI nodes in this
- // case.
- PHINode *PN = cast<PHINode>(NewBB->begin());
- NumPreds = pred_size(NewBB);
- if (NumPreds != PN->getNumIncomingValues()) {
- assert(NumPreds < PN->getNumIncomingValues());
- // Count how many times each predecessor comes to this block.
- std::map<BasicBlock*, unsigned> PredCount;
- for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
- PI != E; ++PI)
- --PredCount[*PI];
-
- // Figure out how many entries to remove from each PHI.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- ++PredCount[PN->getIncomingBlock(i)];
-
- // At this point, the excess predecessor entries are positive in the
- // map. Loop over all of the PHIs and remove excess predecessor
- // entries.
- BasicBlock::iterator I = NewBB->begin();
- for (; (PN = dyn_cast<PHINode>(I)); ++I) {
- for (const auto &PCI : PredCount) {
- BasicBlock *Pred = PCI.first;
- for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
- PN->removeIncomingValue(Pred, false);
- }
- }
- }
-
- // If the loops above have made these phi nodes have 0 or 1 operand,
- // replace them with undef or the input value. We must do this for
- // correctness, because 0-operand phis are not valid.
- PN = cast<PHINode>(NewBB->begin());
- if (PN->getNumIncomingValues() == 0) {
- BasicBlock::iterator I = NewBB->begin();
- BasicBlock::const_iterator OldI = OldBB->begin();
- while ((PN = dyn_cast<PHINode>(I++))) {
- Value *NV = UndefValue::get(PN->getType());
- PN->replaceAllUsesWith(NV);
- assert(VMap[&*OldI] == PN && "VMap mismatch");
- VMap[&*OldI] = NV;
- PN->eraseFromParent();
- ++OldI;
- }
- }
- }
-
- // Make a second pass over the PHINodes now that all of them have been
- // remapped into the new function, simplifying the PHINode and performing any
- // recursive simplifications exposed. This will transparently update the
- // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
- // two PHINodes, the iteration over the old PHIs remains valid, and the
- // mapping will just map us to the new node (which may not even be a PHI
- // node).
- const DataLayout &DL = NewFunc->getParent()->getDataLayout();
- SmallSetVector<const Value *, 8> Worklist;
- for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
- if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
- Worklist.insert(PHIToResolve[Idx]);
-
- // Note that we must test the size on each iteration, the worklist can grow.
- for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
- const Value *OrigV = Worklist[Idx];
- auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
- if (!I)
- continue;
-
- // Skip over non-intrinsic callsites, we don't want to remove any nodes from
- // the CGSCC.
- CallBase *CB = dyn_cast<CallBase>(I);
- if (CB && CB->getCalledFunction() &&
- !CB->getCalledFunction()->isIntrinsic())
- continue;
-
- // See if this instruction simplifies.
- Value *SimpleV = SimplifyInstruction(I, DL);
- if (!SimpleV)
- continue;
-
- // Stash away all the uses of the old instruction so we can check them for
- // recursive simplifications after a RAUW. This is cheaper than checking all
- // uses of To on the recursive step in most cases.
- for (const User *U : OrigV->users())
- Worklist.insert(cast<Instruction>(U));
-
- // Replace the instruction with its simplified value.
- I->replaceAllUsesWith(SimpleV);
-
- // If the original instruction had no side effects, remove it.
- if (isInstructionTriviallyDead(I))
- I->eraseFromParent();
- else
- VMap[OrigV] = I;
- }
-
- // Now that the inlined function body has been fully constructed, go through
- // and zap unconditional fall-through branches. This happens all the time when
- // specializing code: code specialization turns conditional branches into
- // uncond branches, and this code folds them.
- Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
- Function::iterator I = Begin;
- while (I != NewFunc->end()) {
- // We need to simplify conditional branches and switches with a constant
- // operand. We try to prune these out when cloning, but if the
- // simplification required looking through PHI nodes, those are only
- // available after forming the full basic block. That may leave some here,
- // and we still want to prune the dead code as early as possible.
- //
- // Do the folding before we check if the block is dead since we want code
- // like
- // bb:
- // br i1 undef, label %bb, label %bb
- // to be simplified to
- // bb:
- // br label %bb
- // before we call I->getSinglePredecessor().
- ConstantFoldTerminator(&*I);
-
- // Check if this block has become dead during inlining or other
- // simplifications. Note that the first block will appear dead, as it has
- // not yet been wired up properly.
+ }
+
+ if (CodeInfo) {
+ CodeInfo->ContainsCalls |= hasCalls;
+ CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
+ CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
+ BB != &BB->getParent()->front();
+ }
+}
+
+/// This works like CloneAndPruneFunctionInto, except that it does not clone the
+/// entire function. Instead it starts at an instruction provided by the caller
+/// and copies (and prunes) only the code reachable from that instruction.
+void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
+ const Instruction *StartingInst,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo) {
+ assert(NameSuffix && "NameSuffix cannot be null!");
+
+ ValueMapTypeRemapper *TypeMapper = nullptr;
+ ValueMaterializer *Materializer = nullptr;
+
+#ifndef NDEBUG
+ // If the cloning starts at the beginning of the function, verify that
+ // the function arguments are mapped.
+ if (!StartingInst)
+ for (const Argument &II : OldFunc->args())
+ assert(VMap.count(&II) && "No mapping from source argument specified!");
+#endif
+
+ PruningFunctionCloner PFC(NewFunc, OldFunc, VMap, ModuleLevelChanges,
+ NameSuffix, CodeInfo);
+ const BasicBlock *StartingBB;
+ if (StartingInst)
+ StartingBB = StartingInst->getParent();
+ else {
+ StartingBB = &OldFunc->getEntryBlock();
+ StartingInst = &StartingBB->front();
+ }
+
+ // Clone the entry block, and anything recursively reachable from it.
+ std::vector<const BasicBlock*> CloneWorklist;
+ PFC.CloneBlock(StartingBB, StartingInst->getIterator(), CloneWorklist);
+ while (!CloneWorklist.empty()) {
+ const BasicBlock *BB = CloneWorklist.back();
+ CloneWorklist.pop_back();
+ PFC.CloneBlock(BB, BB->begin(), CloneWorklist);
+ }
+
+ // Loop over all of the basic blocks in the old function. If the block was
+ // reachable, we have cloned it and the old block is now in the value map:
+ // insert it into the new function in the right order. If not, ignore it.
+ //
+ // Defer PHI resolution until rest of function is resolved.
+ SmallVector<const PHINode*, 16> PHIToResolve;
+ for (const BasicBlock &BI : *OldFunc) {
+ Value *V = VMap.lookup(&BI);
+ BasicBlock *NewBB = cast_or_null<BasicBlock>(V);
+ if (!NewBB) continue; // Dead block.
+
+ // Add the new block to the new function.
+ NewFunc->getBasicBlockList().push_back(NewBB);
+
+ // Handle PHI nodes specially, as we have to remove references to dead
+ // blocks.
+ for (const PHINode &PN : BI.phis()) {
+ // PHI nodes may have been remapped to non-PHI nodes by the caller or
+ // during the cloning process.
+ if (isa<PHINode>(VMap[&PN]))
+ PHIToResolve.push_back(&PN);
+ else
+ break;
+ }
+
+ // Finally, remap the terminator instructions, as those can't be remapped
+ // until all BBs are mapped.
+ RemapInstruction(NewBB->getTerminator(), VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
+ TypeMapper, Materializer);
+ }
+
+ // Defer PHI resolution until rest of function is resolved, PHI resolution
+ // requires the CFG to be up-to-date.
+ for (unsigned phino = 0, e = PHIToResolve.size(); phino != e; ) {
+ const PHINode *OPN = PHIToResolve[phino];
+ unsigned NumPreds = OPN->getNumIncomingValues();
+ const BasicBlock *OldBB = OPN->getParent();
+ BasicBlock *NewBB = cast<BasicBlock>(VMap[OldBB]);
+
+ // Map operands for blocks that are live and remove operands for blocks
+ // that are dead.
+ for (; phino != PHIToResolve.size() &&
+ PHIToResolve[phino]->getParent() == OldBB; ++phino) {
+ OPN = PHIToResolve[phino];
+ PHINode *PN = cast<PHINode>(VMap[OPN]);
+ for (unsigned pred = 0, e = NumPreds; pred != e; ++pred) {
+ Value *V = VMap.lookup(PN->getIncomingBlock(pred));
+ if (BasicBlock *MappedBlock = cast_or_null<BasicBlock>(V)) {
+ Value *InVal = MapValue(PN->getIncomingValue(pred),
+ VMap,
+ ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges);
+ assert(InVal && "Unknown input value?");
+ PN->setIncomingValue(pred, InVal);
+ PN->setIncomingBlock(pred, MappedBlock);
+ } else {
+ PN->removeIncomingValue(pred, false);
+ --pred; // Revisit the next entry.
+ --e;
+ }
+ }
+ }
+
+ // The loop above has removed PHI entries for those blocks that are dead
+ // and has updated others. However, if a block is live (i.e. copied over)
+ // but its terminator has been changed to not go to this block, then our
+ // phi nodes will have invalid entries. Update the PHI nodes in this
+ // case.
+ PHINode *PN = cast<PHINode>(NewBB->begin());
+ NumPreds = pred_size(NewBB);
+ if (NumPreds != PN->getNumIncomingValues()) {
+ assert(NumPreds < PN->getNumIncomingValues());
+ // Count how many times each predecessor comes to this block.
+ std::map<BasicBlock*, unsigned> PredCount;
+ for (pred_iterator PI = pred_begin(NewBB), E = pred_end(NewBB);
+ PI != E; ++PI)
+ --PredCount[*PI];
+
+ // Figure out how many entries to remove from each PHI.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ ++PredCount[PN->getIncomingBlock(i)];
+
+ // At this point, the excess predecessor entries are positive in the
+ // map. Loop over all of the PHIs and remove excess predecessor
+ // entries.
+ BasicBlock::iterator I = NewBB->begin();
+ for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+ for (const auto &PCI : PredCount) {
+ BasicBlock *Pred = PCI.first;
+ for (unsigned NumToRemove = PCI.second; NumToRemove; --NumToRemove)
+ PN->removeIncomingValue(Pred, false);
+ }
+ }
+ }
+
+ // If the loops above have made these phi nodes have 0 or 1 operand,
+ // replace them with undef or the input value. We must do this for
+ // correctness, because 0-operand phis are not valid.
+ PN = cast<PHINode>(NewBB->begin());
+ if (PN->getNumIncomingValues() == 0) {
+ BasicBlock::iterator I = NewBB->begin();
+ BasicBlock::const_iterator OldI = OldBB->begin();
+ while ((PN = dyn_cast<PHINode>(I++))) {
+ Value *NV = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NV);
+ assert(VMap[&*OldI] == PN && "VMap mismatch");
+ VMap[&*OldI] = NV;
+ PN->eraseFromParent();
+ ++OldI;
+ }
+ }
+ }
+
+ // Make a second pass over the PHINodes now that all of them have been
+ // remapped into the new function, simplifying the PHINode and performing any
+ // recursive simplifications exposed. This will transparently update the
+ // WeakTrackingVH in the VMap. Notably, we rely on that so that if we coalesce
+ // two PHINodes, the iteration over the old PHIs remains valid, and the
+ // mapping will just map us to the new node (which may not even be a PHI
+ // node).
+ const DataLayout &DL = NewFunc->getParent()->getDataLayout();
+ SmallSetVector<const Value *, 8> Worklist;
+ for (unsigned Idx = 0, Size = PHIToResolve.size(); Idx != Size; ++Idx)
+ if (isa<PHINode>(VMap[PHIToResolve[Idx]]))
+ Worklist.insert(PHIToResolve[Idx]);
+
+ // Note that we must test the size on each iteration, the worklist can grow.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ const Value *OrigV = Worklist[Idx];
+ auto *I = dyn_cast_or_null<Instruction>(VMap.lookup(OrigV));
+ if (!I)
+ continue;
+
+ // Skip over non-intrinsic callsites, we don't want to remove any nodes from
+ // the CGSCC.
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (CB && CB->getCalledFunction() &&
+ !CB->getCalledFunction()->isIntrinsic())
+ continue;
+
+ // See if this instruction simplifies.
+ Value *SimpleV = SimplifyInstruction(I, DL);
+ if (!SimpleV)
+ continue;
+
+ // Stash away all the uses of the old instruction so we can check them for
+ // recursive simplifications after a RAUW. This is cheaper than checking all
+ // uses of To on the recursive step in most cases.
+ for (const User *U : OrigV->users())
+ Worklist.insert(cast<Instruction>(U));
+
+ // Replace the instruction with its simplified value.
+ I->replaceAllUsesWith(SimpleV);
+
+ // If the original instruction had no side effects, remove it.
+ if (isInstructionTriviallyDead(I))
+ I->eraseFromParent();
+ else
+ VMap[OrigV] = I;
+ }
+
+ // Now that the inlined function body has been fully constructed, go through
+ // and zap unconditional fall-through branches. This happens all the time when
+ // specializing code: code specialization turns conditional branches into
+ // uncond branches, and this code folds them.
+ Function::iterator Begin = cast<BasicBlock>(VMap[StartingBB])->getIterator();
+ Function::iterator I = Begin;
+ while (I != NewFunc->end()) {
+ // We need to simplify conditional branches and switches with a constant
+ // operand. We try to prune these out when cloning, but if the
+ // simplification required looking through PHI nodes, those are only
+ // available after forming the full basic block. That may leave some here,
+ // and we still want to prune the dead code as early as possible.
+ //
+ // Do the folding before we check if the block is dead since we want code
+ // like
+ // bb:
+ // br i1 undef, label %bb, label %bb
+ // to be simplified to
+ // bb:
+ // br label %bb
+ // before we call I->getSinglePredecessor().
+ ConstantFoldTerminator(&*I);
+
+ // Check if this block has become dead during inlining or other
+ // simplifications. Note that the first block will appear dead, as it has
+ // not yet been wired up properly.
if (I != Begin && (pred_empty(&*I) || I->getSinglePredecessor() == &*I)) {
- BasicBlock *DeadBB = &*I++;
- DeleteDeadBlock(DeadBB);
- continue;
- }
-
- BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
- if (!BI || BI->isConditional()) { ++I; continue; }
-
- BasicBlock *Dest = BI->getSuccessor(0);
- if (!Dest->getSinglePredecessor()) {
- ++I; continue;
- }
-
- // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
- // above should have zapped all of them..
- assert(!isa<PHINode>(Dest->begin()));
-
- // We know all single-entry PHI nodes in the inlined function have been
- // removed, so we just need to splice the blocks.
- BI->eraseFromParent();
-
- // Make all PHI nodes that referred to Dest now refer to I as their source.
- Dest->replaceAllUsesWith(&*I);
-
- // Move all the instructions in the succ to the pred.
- I->getInstList().splice(I->end(), Dest->getInstList());
-
- // Remove the dest block.
- Dest->eraseFromParent();
-
- // Do not increment I, iteratively merge all things this block branches to.
- }
-
- // Make a final pass over the basic blocks from the old function to gather
- // any return instructions which survived folding. We have to do this here
- // because we can iteratively remove and merge returns above.
- for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
- E = NewFunc->end();
- I != E; ++I)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
- Returns.push_back(RI);
-}
-
-
-/// This works exactly like CloneFunctionInto,
-/// except that it does some simple constant prop and DCE on the fly. The
-/// effect of this is to copy significantly less code in cases where (for
-/// example) a function call with constant arguments is inlined, and those
-/// constant arguments cause a significant amount of code in the callee to be
-/// dead. Since this doesn't produce an exact copy of the input, it can't be
-/// used for things like CloneFunction or CloneModule.
-void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
- ValueToValueMapTy &VMap,
- bool ModuleLevelChanges,
- SmallVectorImpl<ReturnInst*> &Returns,
- const char *NameSuffix,
- ClonedCodeInfo *CodeInfo,
- Instruction *TheCall) {
- CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
- ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
-}
-
-/// Remaps instructions in \p Blocks using the mapping in \p VMap.
-void llvm::remapInstructionsInBlocks(
- const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
- // Rewrite the code to refer to itself.
- for (auto *BB : Blocks)
- for (auto &Inst : *BB)
- RemapInstruction(&Inst, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-}
-
-/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
-/// Blocks.
-///
-/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
-/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
-Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
- Loop *OrigLoop, ValueToValueMapTy &VMap,
- const Twine &NameSuffix, LoopInfo *LI,
- DominatorTree *DT,
- SmallVectorImpl<BasicBlock *> &Blocks) {
- Function *F = OrigLoop->getHeader()->getParent();
- Loop *ParentLoop = OrigLoop->getParentLoop();
- DenseMap<Loop *, Loop *> LMap;
-
- Loop *NewLoop = LI->AllocateLoop();
- LMap[OrigLoop] = NewLoop;
- if (ParentLoop)
- ParentLoop->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
- assert(OrigPH && "No preheader");
- BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
- // To rename the loop PHIs.
- VMap[OrigPH] = NewPH;
- Blocks.push_back(NewPH);
-
- // Update LoopInfo.
- if (ParentLoop)
- ParentLoop->addBasicBlockToLoop(NewPH, *LI);
-
- // Update DominatorTree.
- DT->addNewBlock(NewPH, LoopDomBB);
-
- for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) {
- Loop *&NewLoop = LMap[CurLoop];
- if (!NewLoop) {
- NewLoop = LI->AllocateLoop();
-
- // Establish the parent/child relationship.
- Loop *OrigParent = CurLoop->getParentLoop();
- assert(OrigParent && "Could not find the original parent loop");
- Loop *NewParentLoop = LMap[OrigParent];
- assert(NewParentLoop && "Could not find the new parent loop");
-
- NewParentLoop->addChildLoop(NewLoop);
- }
- }
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- Loop *CurLoop = LI->getLoopFor(BB);
- Loop *&NewLoop = LMap[CurLoop];
- assert(NewLoop && "Expecting new loop to be allocated");
-
- BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
- VMap[BB] = NewBB;
-
- // Update LoopInfo.
- NewLoop->addBasicBlockToLoop(NewBB, *LI);
-
- // Add DominatorTree node. After seeing all blocks, update to correct
- // IDom.
- DT->addNewBlock(NewBB, NewPH);
-
- Blocks.push_back(NewBB);
- }
-
- for (BasicBlock *BB : OrigLoop->getBlocks()) {
- // Update loop headers.
- Loop *CurLoop = LI->getLoopFor(BB);
- if (BB == CurLoop->getHeader())
- LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB]));
-
- // Update DominatorTree.
- BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
- DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
- cast<BasicBlock>(VMap[IDomBB]));
- }
-
- // Move them physically from the end of the block list.
- F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
- NewPH);
- F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
- NewLoop->getHeader()->getIterator(), F->end());
-
- return NewLoop;
-}
-
-/// Duplicate non-Phi instructions from the beginning of block up to
-/// StopAt instruction into a split block between BB and its predecessor.
-BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
- BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
- ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) {
-
- assert(count(successors(PredBB), BB) == 1 &&
- "There must be a single edge between PredBB and BB!");
- // We are going to have to map operands from the original BB block to the new
- // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
- // account for entry from PredBB.
- BasicBlock::iterator BI = BB->begin();
- for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
-
- BasicBlock *NewBB = SplitEdge(PredBB, BB);
- NewBB->setName(PredBB->getName() + ".split");
- Instruction *NewTerm = NewBB->getTerminator();
-
- // FIXME: SplitEdge does not yet take a DTU, so we include the split edge
- // in the update set here.
- DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB},
- {DominatorTree::Insert, PredBB, NewBB},
- {DominatorTree::Insert, NewBB, BB}});
-
- // Clone the non-phi instructions of BB into NewBB, keeping track of the
- // mapping and using it to remap operands in the cloned instructions.
- // Stop once we see the terminator too. This covers the case where BB's
- // terminator gets replaced and StopAt == BB's terminator.
- for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
- Instruction *New = BI->clone();
- New->setName(BI->getName());
- New->insertBefore(NewTerm);
- ValueMapping[&*BI] = New;
-
- // Remap operands to patch up intra-block references.
- for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
- if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
- auto I = ValueMapping.find(Inst);
- if (I != ValueMapping.end())
- New->setOperand(i, I->second);
- }
- }
-
- return NewBB;
-}
+ BasicBlock *DeadBB = &*I++;
+ DeleteDeadBlock(DeadBB);
+ continue;
+ }
+
+ BranchInst *BI = dyn_cast<BranchInst>(I->getTerminator());
+ if (!BI || BI->isConditional()) { ++I; continue; }
+
+ BasicBlock *Dest = BI->getSuccessor(0);
+ if (!Dest->getSinglePredecessor()) {
+ ++I; continue;
+ }
+
+ // We shouldn't be able to get single-entry PHI nodes here, as instsimplify
+ // above should have zapped all of them..
+ assert(!isa<PHINode>(Dest->begin()));
+
+ // We know all single-entry PHI nodes in the inlined function have been
+ // removed, so we just need to splice the blocks.
+ BI->eraseFromParent();
+
+ // Make all PHI nodes that referred to Dest now refer to I as their source.
+ Dest->replaceAllUsesWith(&*I);
+
+ // Move all the instructions in the succ to the pred.
+ I->getInstList().splice(I->end(), Dest->getInstList());
+
+ // Remove the dest block.
+ Dest->eraseFromParent();
+
+ // Do not increment I, iteratively merge all things this block branches to.
+ }
+
+ // Make a final pass over the basic blocks from the old function to gather
+ // any return instructions which survived folding. We have to do this here
+ // because we can iteratively remove and merge returns above.
+ for (Function::iterator I = cast<BasicBlock>(VMap[StartingBB])->getIterator(),
+ E = NewFunc->end();
+ I != E; ++I)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(I->getTerminator()))
+ Returns.push_back(RI);
+}
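
Usage sketch (illustrative, not part of this diff): starting a pruned clone at an arbitrary instruction rather than at the entry block, the pattern used by partial inlining. The helper name cloneFromInstruction and the ".part" suffix are assumptions.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone only the code reachable from Start into NewFunc.
// VMap must already map any arguments of OldFunc that the reachable code
// uses, otherwise remapping will fail.
static void cloneFromInstruction(Function *NewFunc, const Function *OldFunc,
                                 const Instruction *Start,
                                 ValueToValueMapTy &VMap) {
  SmallVector<ReturnInst *, 4> Returns;
  CloneAndPruneIntoFromInst(NewFunc, OldFunc, Start, VMap,
                            /*ModuleLevelChanges=*/false, Returns, ".part",
                            /*CodeInfo=*/nullptr);
}
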
+
+
+/// This works exactly like CloneFunctionInto,
+/// except that it does some simple constant prop and DCE on the fly. The
+/// effect of this is to copy significantly less code in cases where (for
+/// example) a function call with constant arguments is inlined, and those
+/// constant arguments cause a significant amount of code in the callee to be
+/// dead. Since this doesn't produce an exact copy of the input, it can't be
+/// used for things like CloneFunction or CloneModule.
+void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
+ ValueToValueMapTy &VMap,
+ bool ModuleLevelChanges,
+ SmallVectorImpl<ReturnInst*> &Returns,
+ const char *NameSuffix,
+ ClonedCodeInfo *CodeInfo,
+ Instruction *TheCall) {
+ CloneAndPruneIntoFromInst(NewFunc, OldFunc, &OldFunc->front().front(), VMap,
+ ModuleLevelChanges, Returns, NameSuffix, CodeInfo);
+}
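
Usage sketch (illustrative, not part of this diff) of why the pruning variant exists: when the arguments are bound to constants in the VMap, branches are folded during cloning and unreachable blocks are never copied. The helper name clonePrunedWithConstants and the pre-created empty NewFunc are assumptions.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone Callee into the pre-created, empty NewFunc with each
// argument bound to a constant; the cloner constant-folds terminators
// against these values and prunes the dead code on the fly.
static void clonePrunedWithConstants(Function *NewFunc, const Function *Callee,
                                     ArrayRef<Constant *> ConstArgs) {
  ValueToValueMapTy VMap;
  unsigned Idx = 0;
  for (const Argument &Arg : Callee->args())
    VMap[&Arg] = ConstArgs[Idx++];
  SmallVector<ReturnInst *, 4> Returns;
  CloneAndPruneFunctionInto(NewFunc, Callee, VMap,
                            /*ModuleLevelChanges=*/false, Returns, ".i",
                            /*CodeInfo=*/nullptr, /*TheCall=*/nullptr);
  // Returns now holds the surviving return instructions of the clone.
}
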
+
+/// Remaps instructions in \p Blocks using the mapping in \p VMap.
+void llvm::remapInstructionsInBlocks(
+ const SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap) {
+ // Rewrite the code to refer to itself.
+ for (auto *BB : Blocks)
+ for (auto &Inst : *BB)
+ RemapInstruction(&Inst, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+}
+
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Insert the new blocks before block specified in \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+ Loop *OrigLoop, ValueToValueMapTy &VMap,
+ const Twine &NameSuffix, LoopInfo *LI,
+ DominatorTree *DT,
+ SmallVectorImpl<BasicBlock *> &Blocks) {
+ Function *F = OrigLoop->getHeader()->getParent();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+ DenseMap<Loop *, Loop *> LMap;
+
+ Loop *NewLoop = LI->AllocateLoop();
+ LMap[OrigLoop] = NewLoop;
+ if (ParentLoop)
+ ParentLoop->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+ assert(OrigPH && "No preheader");
+ BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+ // To rename the loop PHIs.
+ VMap[OrigPH] = NewPH;
+ Blocks.push_back(NewPH);
+
+ // Update LoopInfo.
+ if (ParentLoop)
+ ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+ // Update DominatorTree.
+ DT->addNewBlock(NewPH, LoopDomBB);
+
+ for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) {
+ Loop *&NewLoop = LMap[CurLoop];
+ if (!NewLoop) {
+ NewLoop = LI->AllocateLoop();
+
+ // Establish the parent/child relationship.
+ Loop *OrigParent = CurLoop->getParentLoop();
+ assert(OrigParent && "Could not find the original parent loop");
+ Loop *NewParentLoop = LMap[OrigParent];
+ assert(NewParentLoop && "Could not find the new parent loop");
+
+ NewParentLoop->addChildLoop(NewLoop);
+ }
+ }
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ Loop *CurLoop = LI->getLoopFor(BB);
+ Loop *&NewLoop = LMap[CurLoop];
+ assert(NewLoop && "Expecting new loop to be allocated");
+
+ BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+ VMap[BB] = NewBB;
+
+ // Update LoopInfo.
+ NewLoop->addBasicBlockToLoop(NewBB, *LI);
+
+ // Add DominatorTree node. After seeing all blocks, update to correct
+ // IDom.
+ DT->addNewBlock(NewBB, NewPH);
+
+ Blocks.push_back(NewBB);
+ }
+
+ for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ // Update loop headers.
+ Loop *CurLoop = LI->getLoopFor(BB);
+ if (BB == CurLoop->getHeader())
+ LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB]));
+
+ // Update DominatorTree.
+ BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
+ DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
+ cast<BasicBlock>(VMap[IDomBB]));
+ }
+
+ // Move them physically from the end of the block list.
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewPH);
+ F->getBasicBlockList().splice(Before->getIterator(), F->getBasicBlockList(),
+ NewLoop->getHeader()->getIterator(), F->end());
+
+ return NewLoop;
+}
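
Usage sketch (illustrative, not part of this diff) of the usual pairing: cloneLoopWithPreheader copies the blocks, then the caller runs remapInstructionsInBlocks so the copies refer to each other instead of to the original loop. The helper name cloneAndRemapLoop, the ".clone" suffix, and the choice of the original preheader as the dominating block are assumptions.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: clone L (including a fresh preheader) before InsertBefore,
// then rewrite the cloned instructions to use the cloned definitions.
static Loop *cloneAndRemapLoop(Loop *L, BasicBlock *InsertBefore,
                               LoopInfo *LI, DominatorTree *DT,
                               ValueToValueMapTy &VMap) {
  SmallVector<BasicBlock *, 8> Blocks;
  Loop *NewLoop = cloneLoopWithPreheader(InsertBefore, L->getLoopPreheader(),
                                         L, VMap, ".clone", LI, DT, Blocks);
  remapInstructionsInBlocks(Blocks, VMap); // Point cloned uses at cloned defs.
  return NewLoop;
}
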
+
+/// Duplicate non-Phi instructions from the beginning of block up to
+/// StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
+ BasicBlock *BB, BasicBlock *PredBB, Instruction *StopAt,
+ ValueToValueMapTy &ValueMapping, DomTreeUpdater &DTU) {
+
+ assert(count(successors(PredBB), BB) == 1 &&
+ "There must be a single edge between PredBB and BB!");
+ // We are going to have to map operands from the original BB block to the new
+ // copy of the block 'NewBB'. If there are PHI nodes in BB, evaluate them to
+ // account for entry from PredBB.
+ BasicBlock::iterator BI = BB->begin();
+ for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+ BasicBlock *NewBB = SplitEdge(PredBB, BB);
+ NewBB->setName(PredBB->getName() + ".split");
+ Instruction *NewTerm = NewBB->getTerminator();
+
+ // FIXME: SplitEdge does not yet take a DTU, so we include the split edge
+ // in the update set here.
+ DTU.applyUpdates({{DominatorTree::Delete, PredBB, BB},
+ {DominatorTree::Insert, PredBB, NewBB},
+ {DominatorTree::Insert, NewBB, BB}});
+
+ // Clone the non-phi instructions of BB into NewBB, keeping track of the
+ // mapping and using it to remap operands in the cloned instructions.
+ // Stop once we see the terminator too. This covers the case where BB's
+ // terminator gets replaced and StopAt == BB's terminator.
+ for (; StopAt != &*BI && BB->getTerminator() != &*BI; ++BI) {
+ Instruction *New = BI->clone();
+ New->setName(BI->getName());
+ New->insertBefore(NewTerm);
+ ValueMapping[&*BI] = New;
+
+ // Remap operands to patch up intra-block references.
+ for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+ if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+ auto I = ValueMapping.find(Inst);
+ if (I != ValueMapping.end())
+ New->setOperand(i, I->second);
+ }
+ }
+
+ return NewBB;
+}
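
Usage sketch (illustrative, not part of this diff) in the jump-threading style: duplicate BB's leading non-PHI instructions, up to but not including StopAt, into a new block on the PredBB to BB edge. The helper name duplicatePrefixOnEdge and the lazy DomTreeUpdater strategy are assumptions; pending dominator-tree updates flush when the updater goes out of scope.

#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

using namespace llvm;

// Hypothetical: split the PredBB -> BB edge and copy BB's prefix into the
// split block, collecting the old-to-new instruction mapping.
static BasicBlock *duplicatePrefixOnEdge(BasicBlock *BB, BasicBlock *PredBB,
                                         Instruction *StopAt,
                                         DominatorTree &DT) {
  DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
  ValueToValueMapTy Mapping; // Receives old -> new instruction mappings.
  return DuplicateInstructionsInSplitBetween(BB, PredBB, StopAt, Mapping, DTU);
}
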
void llvm::cloneNoAliasScopes(
ArrayRef<MDNode *> NoAliasDeclScopes,
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
index ae1c463b08..a6327bbf21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CloneModule.cpp
@@ -1,122 +1,122 @@
-//===- CloneModule.cpp - Clone an entire module ---------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the CloneModule interface which makes a copy of an
-// entire module.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
- const Comdat *SC = Src->getComdat();
- if (!SC)
- return;
- Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
- DC->setSelectionKind(SC->getSelectionKind());
- Dst->setComdat(DC);
-}
-
-/// This is not as easy as it might seem because we have to worry about making
-/// copies of global variables and functions, and making their (initializers and
-/// references, respectively) refer to the right globals.
-///
-std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
- // Create the value map that maps things from the old module over to the new
- // module.
- ValueToValueMapTy VMap;
- return CloneModule(M, VMap);
-}
-
-std::unique_ptr<Module> llvm::CloneModule(const Module &M,
- ValueToValueMapTy &VMap) {
- return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
-}
-
-std::unique_ptr<Module> llvm::CloneModule(
- const Module &M, ValueToValueMapTy &VMap,
- function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
- // First off, we need to create the new module.
- std::unique_ptr<Module> New =
- std::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
- New->setSourceFileName(M.getSourceFileName());
- New->setDataLayout(M.getDataLayout());
- New->setTargetTriple(M.getTargetTriple());
- New->setModuleInlineAsm(M.getModuleInlineAsm());
-
- // Loop over all of the global variables, making corresponding globals in the
- // new module. Here we add them to the VMap and to the new Module. We
- // don't worry about attributes or initializers, they will come later.
- //
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
- GlobalVariable *GV = new GlobalVariable(*New,
- I->getValueType(),
- I->isConstant(), I->getLinkage(),
- (Constant*) nullptr, I->getName(),
- (GlobalVariable*) nullptr,
- I->getThreadLocalMode(),
- I->getType()->getAddressSpace());
- GV->copyAttributesFrom(&*I);
- VMap[&*I] = GV;
- }
-
- // Loop over the functions in the module, making external functions as before
- for (const Function &I : M) {
- Function *NF =
- Function::Create(cast<FunctionType>(I.getValueType()), I.getLinkage(),
- I.getAddressSpace(), I.getName(), New.get());
- NF->copyAttributesFrom(&I);
- VMap[&I] = NF;
- }
-
- // Loop over the aliases in the module
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- if (!ShouldCloneDefinition(&*I)) {
- // An alias cannot act as an external reference, so we need to create
- // either a function or a global variable depending on the value type.
- // FIXME: Once pointee types are gone we can probably pick one or the
- // other.
- GlobalValue *GV;
- if (I->getValueType()->isFunctionTy())
- GV = Function::Create(cast<FunctionType>(I->getValueType()),
- GlobalValue::ExternalLinkage,
- I->getAddressSpace(), I->getName(), New.get());
- else
- GV = new GlobalVariable(
- *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
- nullptr, I->getName(), nullptr,
- I->getThreadLocalMode(), I->getType()->getAddressSpace());
- VMap[&*I] = GV;
- // We do not copy attributes (mainly because copying between different
- // kinds of globals is forbidden), but this is generally not required for
- // correctness.
- continue;
- }
- auto *GA = GlobalAlias::create(I->getValueType(),
- I->getType()->getPointerAddressSpace(),
- I->getLinkage(), I->getName(), New.get());
- GA->copyAttributesFrom(&*I);
- VMap[&*I] = GA;
- }
-
- // Now that all of the things that a global variable initializer can refer to
- // have been created, loop through and copy the global variable initializers
- // over... We also set the attributes on the global now.
- //
- for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
- I != E; ++I) {
+//===- CloneModule.cpp - Clone an entire module ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CloneModule interface which makes a copy of an
+// entire module.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+static void copyComdat(GlobalObject *Dst, const GlobalObject *Src) {
+ const Comdat *SC = Src->getComdat();
+ if (!SC)
+ return;
+ Comdat *DC = Dst->getParent()->getOrInsertComdat(SC->getName());
+ DC->setSelectionKind(SC->getSelectionKind());
+ Dst->setComdat(DC);
+}
+
+/// This is not as easy as it might seem because we have to worry about making
+/// copies of global variables and functions, and making their (initializers and
+/// references, respectively) refer to the right globals.
+///
+std::unique_ptr<Module> llvm::CloneModule(const Module &M) {
+ // Create the value map that maps things from the old module over to the new
+ // module.
+ ValueToValueMapTy VMap;
+ return CloneModule(M, VMap);
+}
+
+std::unique_ptr<Module> llvm::CloneModule(const Module &M,
+ ValueToValueMapTy &VMap) {
+ return CloneModule(M, VMap, [](const GlobalValue *GV) { return true; });
+}
+
+std::unique_ptr<Module> llvm::CloneModule(
+ const Module &M, ValueToValueMapTy &VMap,
+ function_ref<bool(const GlobalValue *)> ShouldCloneDefinition) {
+ // First off, we need to create the new module.
+ std::unique_ptr<Module> New =
+ std::make_unique<Module>(M.getModuleIdentifier(), M.getContext());
+ New->setSourceFileName(M.getSourceFileName());
+ New->setDataLayout(M.getDataLayout());
+ New->setTargetTriple(M.getTargetTriple());
+ New->setModuleInlineAsm(M.getModuleInlineAsm());
+
+ // Loop over all of the global variables, making corresponding globals in the
+ // new module. Here we add them to the VMap and to the new Module. We
+ // don't worry about attributes or initializers, they will come later.
+ //
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
+ GlobalVariable *GV = new GlobalVariable(*New,
+ I->getValueType(),
+ I->isConstant(), I->getLinkage(),
+ (Constant*) nullptr, I->getName(),
+ (GlobalVariable*) nullptr,
+ I->getThreadLocalMode(),
+ I->getType()->getAddressSpace());
+ GV->copyAttributesFrom(&*I);
+ VMap[&*I] = GV;
+ }
+
+ // Loop over the functions in the module, making external functions as before
+ for (const Function &I : M) {
+ Function *NF =
+ Function::Create(cast<FunctionType>(I.getValueType()), I.getLinkage(),
+ I.getAddressSpace(), I.getName(), New.get());
+ NF->copyAttributesFrom(&I);
+ VMap[&I] = NF;
+ }
+
+ // Loop over the aliases in the module
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ if (!ShouldCloneDefinition(&*I)) {
+ // An alias cannot act as an external reference, so we need to create
+ // either a function or a global variable depending on the value type.
+ // FIXME: Once pointee types are gone we can probably pick one or the
+ // other.
+ GlobalValue *GV;
+ if (I->getValueType()->isFunctionTy())
+ GV = Function::Create(cast<FunctionType>(I->getValueType()),
+ GlobalValue::ExternalLinkage,
+ I->getAddressSpace(), I->getName(), New.get());
+ else
+ GV = new GlobalVariable(
+ *New, I->getValueType(), false, GlobalValue::ExternalLinkage,
+ nullptr, I->getName(), nullptr,
+ I->getThreadLocalMode(), I->getType()->getAddressSpace());
+ VMap[&*I] = GV;
+ // We do not copy attributes (mainly because copying between different
+ // kinds of globals is forbidden), but this is generally not required for
+ // correctness.
+ continue;
+ }
+ auto *GA = GlobalAlias::create(I->getValueType(),
+ I->getType()->getPointerAddressSpace(),
+ I->getLinkage(), I->getName(), New.get());
+ GA->copyAttributesFrom(&*I);
+ VMap[&*I] = GA;
+ }
+
+ // Now that all of the things that a global variable initializer can refer to
+ // have been created, loop through and copy the global variable initializers
+ // over... We also set the attributes on the global now.
+ //
+ for (Module::const_global_iterator I = M.global_begin(), E = M.global_end();
+ I != E; ++I) {
GlobalVariable *GV = cast<GlobalVariable>(VMap[&*I]);
SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
@@ -125,91 +125,91 @@ std::unique_ptr<Module> llvm::CloneModule(
GV->addMetadata(MD.first,
*MapMetadata(MD.second, VMap, RF_MoveDistinctMDs));
- if (I->isDeclaration())
- continue;
-
- if (!ShouldCloneDefinition(&*I)) {
- // Skip after setting the correct linkage for an external reference.
- GV->setLinkage(GlobalValue::ExternalLinkage);
- continue;
- }
- if (I->hasInitializer())
- GV->setInitializer(MapValue(I->getInitializer(), VMap));
-
- copyComdat(GV, &*I);
- }
-
- // Similarly, copy over function bodies now...
- //
- for (const Function &I : M) {
- if (I.isDeclaration())
- continue;
-
- Function *F = cast<Function>(VMap[&I]);
- if (!ShouldCloneDefinition(&I)) {
- // Skip after setting the correct linkage for an external reference.
- F->setLinkage(GlobalValue::ExternalLinkage);
- // Personality function is not valid on a declaration.
- F->setPersonalityFn(nullptr);
- continue;
- }
-
- Function::arg_iterator DestI = F->arg_begin();
- for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
- ++J) {
- DestI->setName(J->getName());
- VMap[&*J] = &*DestI++;
- }
-
- SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
- CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
-
- if (I.hasPersonalityFn())
- F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
-
- copyComdat(F, &I);
- }
-
- // And aliases
- for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
- I != E; ++I) {
- // We already dealt with undefined aliases above.
- if (!ShouldCloneDefinition(&*I))
- continue;
- GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
- if (const Constant *C = I->getAliasee())
- GA->setAliasee(MapValue(C, VMap));
- }
-
- // And named metadata....
- const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu");
- for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
- E = M.named_metadata_end();
- I != E; ++I) {
- const NamedMDNode &NMD = *I;
- NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
- if (&NMD == LLVM_DBG_CU) {
- // Do not insert duplicate operands.
- SmallPtrSet<const void*, 8> Visited;
- for (const auto* Operand : NewNMD->operands())
- Visited.insert(Operand);
- for (const auto* Operand : NMD.operands()) {
- auto* MappedOperand = MapMetadata(Operand, VMap);
- if (Visited.insert(MappedOperand).second)
- NewNMD->addOperand(MappedOperand);
- }
- } else
- for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
- NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
- }
-
- return New;
-}
-
-extern "C" {
-
-LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
- return wrap(CloneModule(*unwrap(M)).release());
-}
-
-}
+ if (I->isDeclaration())
+ continue;
+
+ if (!ShouldCloneDefinition(&*I)) {
+ // Skip after setting the correct linkage for an external reference.
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ continue;
+ }
+ if (I->hasInitializer())
+ GV->setInitializer(MapValue(I->getInitializer(), VMap));
+
+ copyComdat(GV, &*I);
+ }
+
+ // Similarly, copy over function bodies now...
+ //
+ for (const Function &I : M) {
+ if (I.isDeclaration())
+ continue;
+
+ Function *F = cast<Function>(VMap[&I]);
+ if (!ShouldCloneDefinition(&I)) {
+ // Skip after setting the correct linkage for an external reference.
+ F->setLinkage(GlobalValue::ExternalLinkage);
+ // Personality function is not valid on a declaration.
+ F->setPersonalityFn(nullptr);
+ continue;
+ }
+
+ Function::arg_iterator DestI = F->arg_begin();
+ for (Function::const_arg_iterator J = I.arg_begin(); J != I.arg_end();
+ ++J) {
+ DestI->setName(J->getName());
+ VMap[&*J] = &*DestI++;
+ }
+
+ SmallVector<ReturnInst *, 8> Returns; // Ignore returns cloned.
+ CloneFunctionInto(F, &I, VMap, /*ModuleLevelChanges=*/true, Returns);
+
+ if (I.hasPersonalityFn())
+ F->setPersonalityFn(MapValue(I.getPersonalityFn(), VMap));
+
+ copyComdat(F, &I);
+ }
+
+ // And aliases
+ for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
+ I != E; ++I) {
+ // We already dealt with undefined aliases above.
+ if (!ShouldCloneDefinition(&*I))
+ continue;
+ GlobalAlias *GA = cast<GlobalAlias>(VMap[&*I]);
+ if (const Constant *C = I->getAliasee())
+ GA->setAliasee(MapValue(C, VMap));
+ }
+
+ // And named metadata....
+ const auto* LLVM_DBG_CU = M.getNamedMetadata("llvm.dbg.cu");
+ for (Module::const_named_metadata_iterator I = M.named_metadata_begin(),
+ E = M.named_metadata_end();
+ I != E; ++I) {
+ const NamedMDNode &NMD = *I;
+ NamedMDNode *NewNMD = New->getOrInsertNamedMetadata(NMD.getName());
+ if (&NMD == LLVM_DBG_CU) {
+ // Do not insert duplicate operands.
+ SmallPtrSet<const void*, 8> Visited;
+ for (const auto* Operand : NewNMD->operands())
+ Visited.insert(Operand);
+ for (const auto* Operand : NMD.operands()) {
+ auto* MappedOperand = MapMetadata(Operand, VMap);
+ if (Visited.insert(MappedOperand).second)
+ NewNMD->addOperand(MappedOperand);
+ }
+ } else
+ for (unsigned i = 0, e = NMD.getNumOperands(); i != e; ++i)
+ NewNMD->addOperand(MapMetadata(NMD.getOperand(i), VMap));
+ }
+
+ return New;
+}
+
+extern "C" {
+
+LLVMModuleRef LLVMCloneModule(LLVMModuleRef M) {
+ return wrap(CloneModule(*unwrap(M)).release());
+}
+
+}
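
For context on the overload set above: the three-argument CloneModule takes a ShouldCloneDefinition predicate, and globals filtered out by it keep only an external declaration in the copy. A minimal sketch of that use, not part of this commit; the "keep_" prefix is purely illustrative:

// Sketch only: clone M, keeping bodies/initializers only for globals whose
// names start with "keep_"; everything else is demoted to an external
// declaration by the !ShouldCloneDefinition paths above.
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <memory>
using namespace llvm;

std::unique_ptr<Module> cloneKeepingPrefixed(const Module &M) {
  ValueToValueMapTy VMap;
  return CloneModule(M, VMap, [](const GlobalValue *GV) {
    return GV->getName().startswith("keep_");
  });
}
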
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
index 03a371c093..390925a03b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CodeExtractor.cpp
@@ -1,540 +1,540 @@
-//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the interface to tear out a code region, such as an
-// individual loop or a parallel section, into a new function, replacing it with
-// a call to the new function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/BlockFrequency.h"
-#include "llvm/Support/BranchProbability.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <set>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-using ProfileCount = Function::ProfileCount;
-
-#define DEBUG_TYPE "code-extractor"
-
-// Provide a command-line option to aggregate function arguments into a struct
-// for functions produced by the code extractor. This is useful when converting
-// extracted functions to pthread-based code, as only one argument (void*) can
-// be passed in to pthread_create().
-static cl::opt<bool>
-AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
- cl::desc("Aggregate arguments to code-extracted functions"));
-
-/// Test whether a block is valid for extraction.
-static bool isBlockValidForExtraction(const BasicBlock &BB,
- const SetVector<BasicBlock *> &Result,
- bool AllowVarArgs, bool AllowAlloca) {
- // taking the address of a basic block moved to another function is illegal
- if (BB.hasAddressTaken())
- return false;
-
- // don't hoist code that uses another basicblock address, as it's likely to
- // lead to unexpected behavior, like cross-function jumps
- SmallPtrSet<User const *, 16> Visited;
- SmallVector<User const *, 16> ToVisit;
-
- for (Instruction const &Inst : BB)
- ToVisit.push_back(&Inst);
-
- while (!ToVisit.empty()) {
- User const *Curr = ToVisit.pop_back_val();
- if (!Visited.insert(Curr).second)
- continue;
- if (isa<BlockAddress const>(Curr))
- return false; // even a reference to self is likely to be not compatible
-
- if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
- continue;
-
- for (auto const &U : Curr->operands()) {
- if (auto *UU = dyn_cast<User>(U))
- ToVisit.push_back(UU);
- }
- }
-
- // If explicitly requested, allow vastart and alloca. For invoke instructions
- // verify that extraction is valid.
- for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
- if (isa<AllocaInst>(I)) {
- if (!AllowAlloca)
- return false;
- continue;
- }
-
- if (const auto *II = dyn_cast<InvokeInst>(I)) {
- // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
- // must be a part of the subgraph which is being extracted.
- if (auto *UBB = II->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- continue;
- }
-
- // All catch handlers of a catchswitch instruction as well as the unwind
- // destination must be in the subgraph.
- if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
- if (auto *UBB = CSI->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- for (auto *HBB : CSI->handlers())
- if (!Result.count(const_cast<BasicBlock*>(HBB)))
- return false;
- continue;
- }
-
- // Make sure that entire catch handler is within subgraph. It is sufficient
- // to check that catch return's block is in the list.
- if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
- for (const auto *U : CPI->users())
- if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
- if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
- return false;
- continue;
- }
-
- // And do similar checks for cleanup handler - the entire handler must be
- // in subgraph which is going to be extracted. For cleanup return should
- // additionally check that the unwind destination is also in the subgraph.
- if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
- for (const auto *U : CPI->users())
- if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
- if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
- return false;
- continue;
- }
- if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
- if (auto *UBB = CRI->getUnwindDest())
- if (!Result.count(UBB))
- return false;
- continue;
- }
-
- if (const CallInst *CI = dyn_cast<CallInst>(I)) {
- if (const Function *F = CI->getCalledFunction()) {
- auto IID = F->getIntrinsicID();
- if (IID == Intrinsic::vastart) {
- if (AllowVarArgs)
- continue;
- else
- return false;
- }
-
- // Currently, we miscompile outlined copies of eh_typeid_for. There are
- // proposals for fixing this in llvm.org/PR39545.
- if (IID == Intrinsic::eh_typeid_for)
- return false;
- }
- }
- }
-
- return true;
-}
-
-/// Build a set of blocks to extract if the input blocks are viable.
-static SetVector<BasicBlock *>
-buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AllowVarArgs, bool AllowAlloca) {
- assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
- SetVector<BasicBlock *> Result;
-
- // Loop over the blocks, adding them to our set-vector, and aborting with an
- // empty set if we encounter invalid blocks.
- for (BasicBlock *BB : BBs) {
- // If this block is dead, don't process it.
- if (DT && !DT->isReachableFromEntry(BB))
- continue;
-
- if (!Result.insert(BB))
- llvm_unreachable("Repeated basic blocks in extraction input");
- }
-
- LLVM_DEBUG(dbgs() << "Region front block: " << Result.front()->getName()
- << '\n');
-
- for (auto *BB : Result) {
- if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
- return {};
-
- // Make sure that the first block is not a landing pad.
- if (BB == Result.front()) {
- if (BB->isEHPad()) {
- LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
- return {};
- }
- continue;
- }
-
- // All blocks other than the first must not have predecessors outside of
- // the subgraph which is being extracted.
- for (auto *PBB : predecessors(BB))
- if (!Result.count(PBB)) {
- LLVM_DEBUG(dbgs() << "No blocks in this region may have entries from "
- "outside the region except for the first block!\n"
- << "Problematic source BB: " << BB->getName() << "\n"
- << "Problematic destination BB: " << PBB->getName()
- << "\n");
- return {};
- }
- }
-
- return Result;
-}
-
-CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
- bool AggregateArgs, BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, AssumptionCache *AC,
- bool AllowVarArgs, bool AllowAlloca,
- std::string Suffix)
- : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs),
- Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
- Suffix(Suffix) {}
-
-CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
- BlockFrequencyInfo *BFI,
- BranchProbabilityInfo *BPI, AssumptionCache *AC,
- std::string Suffix)
- : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
- BPI(BPI), AC(AC), AllowVarArgs(false),
- Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
- /* AllowVarArgs */ false,
- /* AllowAlloca */ false)),
- Suffix(Suffix) {}
-
-/// definedInRegion - Return true if the specified value is defined in the
-/// extracted region.
-static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (Blocks.count(I->getParent()))
- return true;
- return false;
-}
-
-/// definedInCaller - Return true if the specified value is defined in the
-/// function being code extracted, but not in the region being extracted.
-/// These values must be passed in as live-ins to the function.
-static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
- if (isa<Argument>(V)) return true;
- if (Instruction *I = dyn_cast<Instruction>(V))
- if (!Blocks.count(I->getParent()))
- return true;
- return false;
-}
-
-static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
- BasicBlock *CommonExitBlock = nullptr;
- auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
- for (auto *Succ : successors(Block)) {
- // Internal edges, ok.
- if (Blocks.count(Succ))
- continue;
- if (!CommonExitBlock) {
- CommonExitBlock = Succ;
- continue;
- }
- if (CommonExitBlock != Succ)
- return true;
- }
- return false;
- };
-
- if (any_of(Blocks, hasNonCommonExitSucc))
- return nullptr;
-
- return CommonExitBlock;
-}
-
-CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) {
- for (BasicBlock &BB : F) {
- for (Instruction &II : BB.instructionsWithoutDebug())
- if (auto *AI = dyn_cast<AllocaInst>(&II))
- Allocas.push_back(AI);
-
- findSideEffectInfoForBlock(BB);
- }
-}
-
-void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) {
- for (Instruction &II : BB.instructionsWithoutDebug()) {
- unsigned Opcode = II.getOpcode();
- Value *MemAddr = nullptr;
- switch (Opcode) {
- case Instruction::Store:
- case Instruction::Load: {
- if (Opcode == Instruction::Store) {
- StoreInst *SI = cast<StoreInst>(&II);
- MemAddr = SI->getPointerOperand();
- } else {
- LoadInst *LI = cast<LoadInst>(&II);
- MemAddr = LI->getPointerOperand();
- }
- // A global variable cannot be aliased with locals.
- if (dyn_cast<Constant>(MemAddr))
- break;
- Value *Base = MemAddr->stripInBoundsConstantOffsets();
- if (!isa<AllocaInst>(Base)) {
- SideEffectingBlocks.insert(&BB);
- return;
- }
- BaseMemAddrs[&BB].insert(Base);
- break;
- }
- default: {
- IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
- if (IntrInst) {
- if (IntrInst->isLifetimeStartOrEnd())
- break;
- SideEffectingBlocks.insert(&BB);
- return;
- }
- // Treat all the other cases conservatively if it has side effects.
- if (II.mayHaveSideEffects()) {
- SideEffectingBlocks.insert(&BB);
- return;
- }
- }
- }
- }
-}
-
-bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr(
- BasicBlock &BB, AllocaInst *Addr) const {
- if (SideEffectingBlocks.count(&BB))
- return true;
- auto It = BaseMemAddrs.find(&BB);
- if (It != BaseMemAddrs.end())
- return It->second.count(Addr);
- return false;
-}
-
-bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
- const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const {
- AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
- Function *Func = (*Blocks.begin())->getParent();
- for (BasicBlock &BB : *Func) {
- if (Blocks.count(&BB))
- continue;
- if (CEAC.doesBlockContainClobberOfAddr(BB, AI))
- return false;
- }
- return true;
-}
-
-BasicBlock *
-CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
- BasicBlock *SinglePredFromOutlineRegion = nullptr;
- assert(!Blocks.count(CommonExitBlock) &&
- "Expect a block outside the region!");
- for (auto *Pred : predecessors(CommonExitBlock)) {
- if (!Blocks.count(Pred))
- continue;
- if (!SinglePredFromOutlineRegion) {
- SinglePredFromOutlineRegion = Pred;
- } else if (SinglePredFromOutlineRegion != Pred) {
- SinglePredFromOutlineRegion = nullptr;
- break;
- }
- }
-
- if (SinglePredFromOutlineRegion)
- return SinglePredFromOutlineRegion;
-
-#ifndef NDEBUG
- auto getFirstPHI = [](BasicBlock *BB) {
- BasicBlock::iterator I = BB->begin();
- PHINode *FirstPhi = nullptr;
- while (I != BB->end()) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi)
- break;
- if (!FirstPhi) {
- FirstPhi = Phi;
- break;
- }
- }
- return FirstPhi;
- };
- // If there are any phi nodes, the single pred either exists or has already
- // been created before code extraction.
- assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
-#endif
-
- BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
- CommonExitBlock->getFirstNonPHI()->getIterator());
-
- for (auto PI = pred_begin(CommonExitBlock), PE = pred_end(CommonExitBlock);
- PI != PE;) {
- BasicBlock *Pred = *PI++;
- if (Blocks.count(Pred))
- continue;
- Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
- }
- // Now add the old exit block to the outline region.
- Blocks.insert(CommonExitBlock);
- return CommonExitBlock;
-}
-
-// Find the pair of lifetime markers for address 'Addr' that are either
-// defined inside the outline region or can legally be shrinkwrapped into the
-// outline region. If there are no other untracked uses of the address, return
-// the pair of markers if found; otherwise return a pair of nullptr.
-CodeExtractor::LifetimeMarkerInfo
-CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
- Instruction *Addr,
- BasicBlock *ExitBlock) const {
- LifetimeMarkerInfo Info;
-
- for (User *U : Addr->users()) {
- IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
- if (IntrInst) {
- // We don't model addresses with multiple start/end markers, but the
- // markers do not need to be in the region.
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
- if (Info.LifeStart)
- return {};
- Info.LifeStart = IntrInst;
- continue;
- }
- if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
- if (Info.LifeEnd)
- return {};
- Info.LifeEnd = IntrInst;
- continue;
- }
- // At this point, permit debug uses outside of the region.
- // This is fixed in a later call to fixupDebugInfoPostExtraction().
- if (isa<DbgInfoIntrinsic>(IntrInst))
- continue;
- }
- // Find untracked uses of the address, bail.
- if (!definedInRegion(Blocks, U))
- return {};
- }
-
- if (!Info.LifeStart || !Info.LifeEnd)
- return {};
-
- Info.SinkLifeStart = !definedInRegion(Blocks, Info.LifeStart);
- Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd);
- // Do legality check.
- if ((Info.SinkLifeStart || Info.HoistLifeEnd) &&
- !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr))
- return {};
-
- // Check to see if we have a place to do hoisting, if not, bail.
- if (Info.HoistLifeEnd && !ExitBlock)
- return {};
-
- return Info;
-}
-
-void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
- ValueSet &SinkCands, ValueSet &HoistCands,
- BasicBlock *&ExitBlock) const {
- Function *Func = (*Blocks.begin())->getParent();
- ExitBlock = getCommonExitBlock(Blocks);
-
- auto moveOrIgnoreLifetimeMarkers =
- [&](const LifetimeMarkerInfo &LMI) -> bool {
- if (!LMI.LifeStart)
- return false;
- if (LMI.SinkLifeStart) {
- LLVM_DEBUG(dbgs() << "Sinking lifetime.start: " << *LMI.LifeStart
- << "\n");
- SinkCands.insert(LMI.LifeStart);
- }
- if (LMI.HoistLifeEnd) {
- LLVM_DEBUG(dbgs() << "Hoisting lifetime.end: " << *LMI.LifeEnd << "\n");
- HoistCands.insert(LMI.LifeEnd);
- }
- return true;
- };
-
- // Look up allocas in the original function in CodeExtractorAnalysisCache, as
- // this is much faster than walking all the instructions.
- for (AllocaInst *AI : CEAC.getAllocas()) {
- BasicBlock *BB = AI->getParent();
- if (Blocks.count(BB))
- continue;
-
- // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca,
- // check whether it is actually still in the original function.
- Function *AIFunc = BB->getParent();
- if (AIFunc != Func)
- continue;
-
- LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock);
- bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo);
- if (Moved) {
- LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n");
- SinkCands.insert(AI);
- continue;
- }
-
+//===- CodeExtractor.cpp - Pull code region into a new function -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface to tear out a code region, such as an
+// individual loop or a parallel section, into a new function, replacing it with
+// a call to the new function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+using ProfileCount = Function::ProfileCount;
+
+#define DEBUG_TYPE "code-extractor"
+
+// Provide a command-line option to aggregate function arguments into a struct
+// for functions produced by the code extractor. This is useful when converting
+// extracted functions to pthread-based code, as only one argument (void*) can
+// be passed in to pthread_create().
+static cl::opt<bool>
+AggregateArgsOpt("aggregate-extracted-args", cl::Hidden,
+ cl::desc("Aggregate arguments to code-extracted functions"));
+
+/// Test whether a block is valid for extraction.
+static bool isBlockValidForExtraction(const BasicBlock &BB,
+ const SetVector<BasicBlock *> &Result,
+ bool AllowVarArgs, bool AllowAlloca) {
+ // taking the address of a basic block moved to another function is illegal
+ if (BB.hasAddressTaken())
+ return false;
+
+ // don't hoist code that uses another basicblock address, as it's likely to
+ // lead to unexpected behavior, like cross-function jumps
+ SmallPtrSet<User const *, 16> Visited;
+ SmallVector<User const *, 16> ToVisit;
+
+ for (Instruction const &Inst : BB)
+ ToVisit.push_back(&Inst);
+
+ while (!ToVisit.empty()) {
+ User const *Curr = ToVisit.pop_back_val();
+ if (!Visited.insert(Curr).second)
+ continue;
+ if (isa<BlockAddress const>(Curr))
+ return false; // even a reference to self is likely to be not compatible
+
+ if (isa<Instruction>(Curr) && cast<Instruction>(Curr)->getParent() != &BB)
+ continue;
+
+ for (auto const &U : Curr->operands()) {
+ if (auto *UU = dyn_cast<User>(U))
+ ToVisit.push_back(UU);
+ }
+ }
+
+ // If explicitly requested, allow vastart and alloca. For invoke instructions
+ // verify that extraction is valid.
+ for (BasicBlock::const_iterator I = BB.begin(), E = BB.end(); I != E; ++I) {
+ if (isa<AllocaInst>(I)) {
+ if (!AllowAlloca)
+ return false;
+ continue;
+ }
+
+ if (const auto *II = dyn_cast<InvokeInst>(I)) {
+ // Unwind destination (either a landingpad, catchswitch, or cleanuppad)
+ // must be a part of the subgraph which is being extracted.
+ if (auto *UBB = II->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ // All catch handlers of a catchswitch instruction as well as the unwind
+ // destination must be in the subgraph.
+ if (const auto *CSI = dyn_cast<CatchSwitchInst>(I)) {
+ if (auto *UBB = CSI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ for (auto *HBB : CSI->handlers())
+ if (!Result.count(const_cast<BasicBlock*>(HBB)))
+ return false;
+ continue;
+ }
+
+ // Make sure that entire catch handler is within subgraph. It is sufficient
+ // to check that catch return's block is in the list.
+ if (const auto *CPI = dyn_cast<CatchPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CatchReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+
+ // And do similar checks for cleanup handler - the entire handler must be
+ // in subgraph which is going to be extracted. For cleanup return should
+ // additionally check that the unwind destination is also in the subgraph.
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(I)) {
+ for (const auto *U : CPI->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ if (!Result.count(const_cast<BasicBlock*>(CRI->getParent())))
+ return false;
+ continue;
+ }
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(I)) {
+ if (auto *UBB = CRI->getUnwindDest())
+ if (!Result.count(UBB))
+ return false;
+ continue;
+ }
+
+ if (const CallInst *CI = dyn_cast<CallInst>(I)) {
+ if (const Function *F = CI->getCalledFunction()) {
+ auto IID = F->getIntrinsicID();
+ if (IID == Intrinsic::vastart) {
+ if (AllowVarArgs)
+ continue;
+ else
+ return false;
+ }
+
+ // Currently, we miscompile outlined copies of eh_typeid_for. There are
+ // proposals for fixing this in llvm.org/PR39545.
+ if (IID == Intrinsic::eh_typeid_for)
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+/// Build a set of blocks to extract if the input blocks are viable.
+static SetVector<BasicBlock *>
+buildExtractionBlockSet(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AllowVarArgs, bool AllowAlloca) {
+ assert(!BBs.empty() && "The set of blocks to extract must be non-empty");
+ SetVector<BasicBlock *> Result;
+
+ // Loop over the blocks, adding them to our set-vector, and aborting with an
+ // empty set if we encounter invalid blocks.
+ for (BasicBlock *BB : BBs) {
+ // If this block is dead, don't process it.
+ if (DT && !DT->isReachableFromEntry(BB))
+ continue;
+
+ if (!Result.insert(BB))
+ llvm_unreachable("Repeated basic blocks in extraction input");
+ }
+
+ LLVM_DEBUG(dbgs() << "Region front block: " << Result.front()->getName()
+ << '\n');
+
+ for (auto *BB : Result) {
+ if (!isBlockValidForExtraction(*BB, Result, AllowVarArgs, AllowAlloca))
+ return {};
+
+ // Make sure that the first block is not a landing pad.
+ if (BB == Result.front()) {
+ if (BB->isEHPad()) {
+ LLVM_DEBUG(dbgs() << "The first block cannot be an unwind block\n");
+ return {};
+ }
+ continue;
+ }
+
+ // All blocks other than the first must not have predecessors outside of
+ // the subgraph which is being extracted.
+ for (auto *PBB : predecessors(BB))
+ if (!Result.count(PBB)) {
+ LLVM_DEBUG(dbgs() << "No blocks in this region may have entries from "
+ "outside the region except for the first block!\n"
+ << "Problematic source BB: " << BB->getName() << "\n"
+ << "Problematic destination BB: " << PBB->getName()
+ << "\n");
+ return {};
+ }
+ }
+
+ return Result;
+}
+
+CodeExtractor::CodeExtractor(ArrayRef<BasicBlock *> BBs, DominatorTree *DT,
+ bool AggregateArgs, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, AssumptionCache *AC,
+ bool AllowVarArgs, bool AllowAlloca,
+ std::string Suffix)
+ : DT(DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), AC(AC), AllowVarArgs(AllowVarArgs),
+ Blocks(buildExtractionBlockSet(BBs, DT, AllowVarArgs, AllowAlloca)),
+ Suffix(Suffix) {}
+
+CodeExtractor::CodeExtractor(DominatorTree &DT, Loop &L, bool AggregateArgs,
+ BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, AssumptionCache *AC,
+ std::string Suffix)
+ : DT(&DT), AggregateArgs(AggregateArgs || AggregateArgsOpt), BFI(BFI),
+ BPI(BPI), AC(AC), AllowVarArgs(false),
+ Blocks(buildExtractionBlockSet(L.getBlocks(), &DT,
+ /* AllowVarArgs */ false,
+ /* AllowAlloca */ false)),
+ Suffix(Suffix) {}
+
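
For orientation, the constructors above are typically driven as follows; this is a sketch only, not part of this commit: F, L, DT and AC are placeholders for analyses the caller already holds, and extractCodeRegion / CodeExtractorAnalysisCache are the public entry points declared in CodeExtractor.h.

// Sketch only: outline the blocks of loop L from function F into a new
// function; returns the outlined function, or nullptr if extraction fails.
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/CodeExtractor.h"
using namespace llvm;

Function *outlineLoop(Function &F, DominatorTree &DT, Loop &L,
                      AssumptionCache *AC) {
  CodeExtractor CE(DT, L, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                   /*BPI=*/nullptr, AC, /*Suffix=*/"outlined");
  if (!CE.isEligible())
    return nullptr;
  CodeExtractorAnalysisCache CEAC(F);
  return CE.extractCodeRegion(CEAC);
}
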
+/// definedInRegion - Return true if the specified value is defined in the
+/// extracted region.
+static bool definedInRegion(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+/// definedInCaller - Return true if the specified value is defined in the
+/// function being code extracted, but not in the region being extracted.
+/// These values must be passed in as live-ins to the function.
+static bool definedInCaller(const SetVector<BasicBlock *> &Blocks, Value *V) {
+ if (isa<Argument>(V)) return true;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ if (!Blocks.count(I->getParent()))
+ return true;
+ return false;
+}
+
+static BasicBlock *getCommonExitBlock(const SetVector<BasicBlock *> &Blocks) {
+ BasicBlock *CommonExitBlock = nullptr;
+ auto hasNonCommonExitSucc = [&](BasicBlock *Block) {
+ for (auto *Succ : successors(Block)) {
+ // Internal edges, ok.
+ if (Blocks.count(Succ))
+ continue;
+ if (!CommonExitBlock) {
+ CommonExitBlock = Succ;
+ continue;
+ }
+ if (CommonExitBlock != Succ)
+ return true;
+ }
+ return false;
+ };
+
+ if (any_of(Blocks, hasNonCommonExitSucc))
+ return nullptr;
+
+ return CommonExitBlock;
+}
+
+CodeExtractorAnalysisCache::CodeExtractorAnalysisCache(Function &F) {
+ for (BasicBlock &BB : F) {
+ for (Instruction &II : BB.instructionsWithoutDebug())
+ if (auto *AI = dyn_cast<AllocaInst>(&II))
+ Allocas.push_back(AI);
+
+ findSideEffectInfoForBlock(BB);
+ }
+}
+
+void CodeExtractorAnalysisCache::findSideEffectInfoForBlock(BasicBlock &BB) {
+ for (Instruction &II : BB.instructionsWithoutDebug()) {
+ unsigned Opcode = II.getOpcode();
+ Value *MemAddr = nullptr;
+ switch (Opcode) {
+ case Instruction::Store:
+ case Instruction::Load: {
+ if (Opcode == Instruction::Store) {
+ StoreInst *SI = cast<StoreInst>(&II);
+ MemAddr = SI->getPointerOperand();
+ } else {
+ LoadInst *LI = cast<LoadInst>(&II);
+ MemAddr = LI->getPointerOperand();
+ }
+ // A global variable cannot be aliased with locals.
+ if (dyn_cast<Constant>(MemAddr))
+ break;
+ Value *Base = MemAddr->stripInBoundsConstantOffsets();
+ if (!isa<AllocaInst>(Base)) {
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ BaseMemAddrs[&BB].insert(Base);
+ break;
+ }
+ default: {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(&II);
+ if (IntrInst) {
+ if (IntrInst->isLifetimeStartOrEnd())
+ break;
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ // Treat all the other cases conservatively if it has side effects.
+ if (II.mayHaveSideEffects()) {
+ SideEffectingBlocks.insert(&BB);
+ return;
+ }
+ }
+ }
+ }
+}
+
+bool CodeExtractorAnalysisCache::doesBlockContainClobberOfAddr(
+ BasicBlock &BB, AllocaInst *Addr) const {
+ if (SideEffectingBlocks.count(&BB))
+ return true;
+ auto It = BaseMemAddrs.find(&BB);
+ if (It != BaseMemAddrs.end())
+ return It->second.count(Addr);
+ return false;
+}
+
+bool CodeExtractor::isLegalToShrinkwrapLifetimeMarkers(
+ const CodeExtractorAnalysisCache &CEAC, Instruction *Addr) const {
+ AllocaInst *AI = cast<AllocaInst>(Addr->stripInBoundsConstantOffsets());
+ Function *Func = (*Blocks.begin())->getParent();
+ for (BasicBlock &BB : *Func) {
+ if (Blocks.count(&BB))
+ continue;
+ if (CEAC.doesBlockContainClobberOfAddr(BB, AI))
+ return false;
+ }
+ return true;
+}
+
+BasicBlock *
+CodeExtractor::findOrCreateBlockForHoisting(BasicBlock *CommonExitBlock) {
+ BasicBlock *SinglePredFromOutlineRegion = nullptr;
+ assert(!Blocks.count(CommonExitBlock) &&
+ "Expect a block outside the region!");
+ for (auto *Pred : predecessors(CommonExitBlock)) {
+ if (!Blocks.count(Pred))
+ continue;
+ if (!SinglePredFromOutlineRegion) {
+ SinglePredFromOutlineRegion = Pred;
+ } else if (SinglePredFromOutlineRegion != Pred) {
+ SinglePredFromOutlineRegion = nullptr;
+ break;
+ }
+ }
+
+ if (SinglePredFromOutlineRegion)
+ return SinglePredFromOutlineRegion;
+
+#ifndef NDEBUG
+ auto getFirstPHI = [](BasicBlock *BB) {
+ BasicBlock::iterator I = BB->begin();
+ PHINode *FirstPhi = nullptr;
+ while (I != BB->end()) {
+ PHINode *Phi = dyn_cast<PHINode>(I);
+ if (!Phi)
+ break;
+ if (!FirstPhi) {
+ FirstPhi = Phi;
+ break;
+ }
+ }
+ return FirstPhi;
+ };
+ // If there are any phi nodes, the single pred either exists or has already
+ // been created before code extraction.
+ assert(!getFirstPHI(CommonExitBlock) && "Phi not expected");
+#endif
+
+ BasicBlock *NewExitBlock = CommonExitBlock->splitBasicBlock(
+ CommonExitBlock->getFirstNonPHI()->getIterator());
+
+ for (auto PI = pred_begin(CommonExitBlock), PE = pred_end(CommonExitBlock);
+ PI != PE;) {
+ BasicBlock *Pred = *PI++;
+ if (Blocks.count(Pred))
+ continue;
+ Pred->getTerminator()->replaceUsesOfWith(CommonExitBlock, NewExitBlock);
+ }
+ // Now add the old exit block to the outline region.
+ Blocks.insert(CommonExitBlock);
+ return CommonExitBlock;
+}
+
+// Find the pair of lifetime markers for address 'Addr' that are either
+// defined inside the outline region or can legally be shrinkwrapped into the
+// outline region. If there are no other untracked uses of the address, return
+// the pair of markers if found; otherwise return a pair of nullptr.
+CodeExtractor::LifetimeMarkerInfo
+CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
+ Instruction *Addr,
+ BasicBlock *ExitBlock) const {
+ LifetimeMarkerInfo Info;
+
+ for (User *U : Addr->users()) {
+ IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
+ if (IntrInst) {
+ // We don't model addresses with multiple start/end markers, but the
+ // markers do not need to be in the region.
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
+ if (Info.LifeStart)
+ return {};
+ Info.LifeStart = IntrInst;
+ continue;
+ }
+ if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
+ if (Info.LifeEnd)
+ return {};
+ Info.LifeEnd = IntrInst;
+ continue;
+ }
+ // At this point, permit debug uses outside of the region.
+ // This is fixed in a later call to fixupDebugInfoPostExtraction().
+ if (isa<DbgInfoIntrinsic>(IntrInst))
+ continue;
+ }
+ // Find untracked uses of the address, bail.
+ if (!definedInRegion(Blocks, U))
+ return {};
+ }
+
+ if (!Info.LifeStart || !Info.LifeEnd)
+ return {};
+
+ Info.SinkLifeStart = !definedInRegion(Blocks, Info.LifeStart);
+ Info.HoistLifeEnd = !definedInRegion(Blocks, Info.LifeEnd);
+ // Do legality check.
+ if ((Info.SinkLifeStart || Info.HoistLifeEnd) &&
+ !isLegalToShrinkwrapLifetimeMarkers(CEAC, Addr))
+ return {};
+
+ // Check to see if we have a place to do hoisting, if not, bail.
+ if (Info.HoistLifeEnd && !ExitBlock)
+ return {};
+
+ return Info;
+}
+
+void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
+ ValueSet &SinkCands, ValueSet &HoistCands,
+ BasicBlock *&ExitBlock) const {
+ Function *Func = (*Blocks.begin())->getParent();
+ ExitBlock = getCommonExitBlock(Blocks);
+
+ auto moveOrIgnoreLifetimeMarkers =
+ [&](const LifetimeMarkerInfo &LMI) -> bool {
+ if (!LMI.LifeStart)
+ return false;
+ if (LMI.SinkLifeStart) {
+ LLVM_DEBUG(dbgs() << "Sinking lifetime.start: " << *LMI.LifeStart
+ << "\n");
+ SinkCands.insert(LMI.LifeStart);
+ }
+ if (LMI.HoistLifeEnd) {
+ LLVM_DEBUG(dbgs() << "Hoisting lifetime.end: " << *LMI.LifeEnd << "\n");
+ HoistCands.insert(LMI.LifeEnd);
+ }
+ return true;
+ };
+
+ // Look up allocas in the original function in CodeExtractorAnalysisCache, as
+ // this is much faster than walking all the instructions.
+ for (AllocaInst *AI : CEAC.getAllocas()) {
+ BasicBlock *BB = AI->getParent();
+ if (Blocks.count(BB))
+ continue;
+
+ // As a prior call to extractCodeRegion() may have shrinkwrapped the alloca,
+ // check whether it is actually still in the original function.
+ Function *AIFunc = BB->getParent();
+ if (AIFunc != Func)
+ continue;
+
+ LifetimeMarkerInfo MarkerInfo = getLifetimeMarkers(CEAC, AI, ExitBlock);
+ bool Moved = moveOrIgnoreLifetimeMarkers(MarkerInfo);
+ if (Moved) {
+ LLVM_DEBUG(dbgs() << "Sinking alloca: " << *AI << "\n");
+ SinkCands.insert(AI);
+ continue;
+ }
+
// Find bitcasts in the outlined region that have lifetime marker users
// outside that region. Replace the lifetime marker use with an
// outside region bitcast to avoid unnecessary alloca/reload instructions
@@ -575,1235 +575,1235 @@ void CodeExtractor::findAllocas(const CodeExtractorAnalysisCache &CEAC,
I->replaceUsesOfWith(I->getOperand(1), CastI);
}
- // Follow any bitcasts.
- SmallVector<Instruction *, 2> Bitcasts;
- SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo;
- for (User *U : AI->users()) {
- if (U->stripInBoundsConstantOffsets() == AI) {
- Instruction *Bitcast = cast<Instruction>(U);
- LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock);
- if (LMI.LifeStart) {
- Bitcasts.push_back(Bitcast);
- BitcastLifetimeInfo.push_back(LMI);
- continue;
- }
- }
-
- // Found unknown use of AI.
- if (!definedInRegion(Blocks, U)) {
- Bitcasts.clear();
- break;
- }
- }
-
- // Either no bitcasts reference the alloca or there are unknown uses.
- if (Bitcasts.empty())
- continue;
-
- LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n");
- SinkCands.insert(AI);
- for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) {
- Instruction *BitcastAddr = Bitcasts[I];
- const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I];
- assert(LMI.LifeStart &&
- "Unsafe to sink bitcast without lifetime markers");
- moveOrIgnoreLifetimeMarkers(LMI);
- if (!definedInRegion(Blocks, BitcastAddr)) {
- LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr
- << "\n");
- SinkCands.insert(BitcastAddr);
- }
- }
- }
-}
-
-bool CodeExtractor::isEligible() const {
- if (Blocks.empty())
- return false;
- BasicBlock *Header = *Blocks.begin();
- Function *F = Header->getParent();
-
- // For functions with varargs, check that varargs handling is only done in the
- // outlined function, i.e. vastart and vaend are only used in outlined blocks.
- if (AllowVarArgs && F->getFunctionType()->isVarArg()) {
- auto containsVarArgIntrinsic = [](const Instruction &I) {
- if (const CallInst *CI = dyn_cast<CallInst>(&I))
- if (const Function *Callee = CI->getCalledFunction())
- return Callee->getIntrinsicID() == Intrinsic::vastart ||
- Callee->getIntrinsicID() == Intrinsic::vaend;
- return false;
- };
-
- for (auto &BB : *F) {
- if (Blocks.count(&BB))
- continue;
- if (llvm::any_of(BB, containsVarArgIntrinsic))
- return false;
- }
- }
- return true;
-}
-
-void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
- const ValueSet &SinkCands) const {
- for (BasicBlock *BB : Blocks) {
- // If a used value is defined outside the region, it's an input. If an
- // instruction is used outside the region, it's an output.
- for (Instruction &II : *BB) {
- for (auto &OI : II.operands()) {
- Value *V = OI;
- if (!SinkCands.count(V) && definedInCaller(Blocks, V))
- Inputs.insert(V);
- }
-
- for (User *U : II.users())
- if (!definedInRegion(Blocks, U)) {
- Outputs.insert(&II);
- break;
- }
- }
- }
-}
-
-/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside
-/// of the region, we need to split the entry block of the region so that the
-/// PHI node is easier to deal with.
-void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) {
- unsigned NumPredsFromRegion = 0;
- unsigned NumPredsOutsideRegion = 0;
-
- if (Header != &Header->getParent()->getEntryBlock()) {
- PHINode *PN = dyn_cast<PHINode>(Header->begin());
- if (!PN) return; // No PHI nodes.
-
- // If the header node contains any PHI nodes, check to see if there is more
- // than one entry from outside the region. If so, we need to sever the
- // header block into two.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (Blocks.count(PN->getIncomingBlock(i)))
- ++NumPredsFromRegion;
- else
- ++NumPredsOutsideRegion;
-
- // If there is one (or fewer) predecessor from outside the region, we don't
- // need to do anything special.
- if (NumPredsOutsideRegion <= 1) return;
- }
-
- // Otherwise, we need to split the header block into two pieces: one
- // containing PHI nodes merging values from outside of the region, and a
- // second that contains all of the code for the block and merges back any
- // incoming values from inside of the region.
- BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT);
-
- // We only want to code extract the second block now, and it becomes the new
- // header of the region.
- BasicBlock *OldPred = Header;
- Blocks.remove(OldPred);
- Blocks.insert(NewBB);
- Header = NewBB;
-
- // Okay, now we need to adjust the PHI nodes and any branches from within the
- // region to go to the new header block instead of the old header block.
- if (NumPredsFromRegion) {
- PHINode *PN = cast<PHINode>(OldPred->begin());
- // Loop over all of the predecessors of OldPred that are in the region,
- // changing them to branch to NewBB instead.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (Blocks.count(PN->getIncomingBlock(i))) {
- Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
- TI->replaceUsesOfWith(OldPred, NewBB);
- }
-
- // Okay, everything within the region is now branching to the right block, we
- // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
- BasicBlock::iterator AfterPHIs;
- for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
- PHINode *PN = cast<PHINode>(AfterPHIs);
- // Create a new PHI node in the new region, which has an incoming value
- // from OldPred of PN.
- PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
- PN->getName() + ".ce", &NewBB->front());
- PN->replaceAllUsesWith(NewPN);
- NewPN->addIncoming(PN, OldPred);
-
- // Loop over all of the incoming value in PN, moving them to NewPN if they
- // are from the extracted region.
- for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
- if (Blocks.count(PN->getIncomingBlock(i))) {
- NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
- PN->removeIncomingValue(i);
- --i;
- }
- }
- }
- }
-}
-
-/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from
-/// the outlined region, we split these PHIs in two: one with inputs from the
-/// region and another with the remaining incoming blocks; the first PHIs are
-/// then placed in the outlined region.
-void CodeExtractor::severSplitPHINodesOfExits(
- const SmallPtrSetImpl<BasicBlock *> &Exits) {
- for (BasicBlock *ExitBB : Exits) {
- BasicBlock *NewBB = nullptr;
-
- for (PHINode &PN : ExitBB->phis()) {
- // Find all incoming values from the outlining region.
- SmallVector<unsigned, 2> IncomingVals;
- for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
- if (Blocks.count(PN.getIncomingBlock(i)))
- IncomingVals.push_back(i);
-
- // Do not process PHI if there is one (or fewer) predecessor from region.
- // If PHI has exactly one predecessor from region, only this one incoming
- // will be replaced on codeRepl block, so it should be safe to skip PHI.
- if (IncomingVals.size() <= 1)
- continue;
-
- // Create block for new PHIs and add it to the list of outlined if it
- // wasn't done before.
- if (!NewBB) {
- NewBB = BasicBlock::Create(ExitBB->getContext(),
- ExitBB->getName() + ".split",
- ExitBB->getParent(), ExitBB);
+ // Follow any bitcasts.
+ SmallVector<Instruction *, 2> Bitcasts;
+ SmallVector<LifetimeMarkerInfo, 2> BitcastLifetimeInfo;
+ for (User *U : AI->users()) {
+ if (U->stripInBoundsConstantOffsets() == AI) {
+ Instruction *Bitcast = cast<Instruction>(U);
+ LifetimeMarkerInfo LMI = getLifetimeMarkers(CEAC, Bitcast, ExitBlock);
+ if (LMI.LifeStart) {
+ Bitcasts.push_back(Bitcast);
+ BitcastLifetimeInfo.push_back(LMI);
+ continue;
+ }
+ }
+
+ // Found unknown use of AI.
+ if (!definedInRegion(Blocks, U)) {
+ Bitcasts.clear();
+ break;
+ }
+ }
+
+ // Either no bitcasts reference the alloca or there are unknown uses.
+ if (Bitcasts.empty())
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Sinking alloca (via bitcast): " << *AI << "\n");
+ SinkCands.insert(AI);
+ for (unsigned I = 0, E = Bitcasts.size(); I != E; ++I) {
+ Instruction *BitcastAddr = Bitcasts[I];
+ const LifetimeMarkerInfo &LMI = BitcastLifetimeInfo[I];
+ assert(LMI.LifeStart &&
+ "Unsafe to sink bitcast without lifetime markers");
+ moveOrIgnoreLifetimeMarkers(LMI);
+ if (!definedInRegion(Blocks, BitcastAddr)) {
+ LLVM_DEBUG(dbgs() << "Sinking bitcast-of-alloca: " << *BitcastAddr
+ << "\n");
+ SinkCands.insert(BitcastAddr);
+ }
+ }
+ }
+}
+
+bool CodeExtractor::isEligible() const {
+ if (Blocks.empty())
+ return false;
+ BasicBlock *Header = *Blocks.begin();
+ Function *F = Header->getParent();
+
+ // For functions with varargs, check that varargs handling is only done in the
+ // outlined function, i.e. vastart and vaend are only used in outlined blocks.
+ if (AllowVarArgs && F->getFunctionType()->isVarArg()) {
+ auto containsVarArgIntrinsic = [](const Instruction &I) {
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (const Function *Callee = CI->getCalledFunction())
+ return Callee->getIntrinsicID() == Intrinsic::vastart ||
+ Callee->getIntrinsicID() == Intrinsic::vaend;
+ return false;
+ };
+
+ for (auto &BB : *F) {
+ if (Blocks.count(&BB))
+ continue;
+ if (llvm::any_of(BB, containsVarArgIntrinsic))
+ return false;
+ }
+ }
+ return true;
+}
+
+void CodeExtractor::findInputsOutputs(ValueSet &Inputs, ValueSet &Outputs,
+ const ValueSet &SinkCands) const {
+ for (BasicBlock *BB : Blocks) {
+ // If a used value is defined outside the region, it's an input. If an
+ // instruction is used outside the region, it's an output.
+ for (Instruction &II : *BB) {
+ for (auto &OI : II.operands()) {
+ Value *V = OI;
+ if (!SinkCands.count(V) && definedInCaller(Blocks, V))
+ Inputs.insert(V);
+ }
+
+ for (User *U : II.users())
+ if (!definedInRegion(Blocks, U)) {
+ Outputs.insert(&II);
+ break;
+ }
+ }
+ }
+}
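// Usage sketch (illustrative only, not part of the original source): the
// eligibility check and input/output discovery above are normally driven by a
// CodeExtractor client roughly as follows. This assumes the LLVM 12
// CodeExtractor interface declared in llvm/Transforms/Utils/CodeExtractor.h;
// the helper name outlineRegion is hypothetical.
static Function *outlineRegion(ArrayRef<BasicBlock *> Region, DominatorTree &DT,
                               AssumptionCache *AC) {
  // Flat (non-aggregated) arguments, no profile info.
  CodeExtractor CE(Region, &DT, /*AggregateArgs=*/false, /*BFI=*/nullptr,
                   /*BPI=*/nullptr, AC);
  if (!CE.isEligible())
    return nullptr;
  // The analysis cache is computed once per caller function and reused across
  // extractions from that function.
  CodeExtractorAnalysisCache CEAC(*Region.front()->getParent());
  // extractCodeRegion() runs findInputsOutputs() and the PHI-severing steps
  // internally and returns the newly outlined function.
  return CE.extractCodeRegion(CEAC);
}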
+
+/// severSplitPHINodesOfEntry - If a PHI node has multiple inputs from outside
+/// of the region, we need to split the entry block of the region so that the
+/// PHI node is easier to deal with.
+void CodeExtractor::severSplitPHINodesOfEntry(BasicBlock *&Header) {
+ unsigned NumPredsFromRegion = 0;
+ unsigned NumPredsOutsideRegion = 0;
+
+ if (Header != &Header->getParent()->getEntryBlock()) {
+ PHINode *PN = dyn_cast<PHINode>(Header->begin());
+ if (!PN) return; // No PHI nodes.
+
+ // If the header node contains any PHI nodes, check to see if there is more
+ // than one entry from outside the region. If so, we need to sever the
+ // header block into two.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i)))
+ ++NumPredsFromRegion;
+ else
+ ++NumPredsOutsideRegion;
+
+ // If there is one (or fewer) predecessor from outside the region, we don't
+ // need to do anything special.
+ if (NumPredsOutsideRegion <= 1) return;
+ }
+
+ // Otherwise, we need to split the header block into two pieces: one
+ // containing PHI nodes merging values from outside of the region, and a
+ // second that contains all of the code for the block and merges back any
+ // incoming values from inside of the region.
+ BasicBlock *NewBB = SplitBlock(Header, Header->getFirstNonPHI(), DT);
+
+ // We only want to code extract the second block now, and it becomes the new
+ // header of the region.
+ BasicBlock *OldPred = Header;
+ Blocks.remove(OldPred);
+ Blocks.insert(NewBB);
+ Header = NewBB;
+
+ // Okay, now we need to adjust the PHI nodes and any branches from within the
+ // region to go to the new header block instead of the old header block.
+ if (NumPredsFromRegion) {
+ PHINode *PN = cast<PHINode>(OldPred->begin());
+ // Loop over all of the predecessors of OldPred that are in the region,
+ // changing them to branch to NewBB instead.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ Instruction *TI = PN->getIncomingBlock(i)->getTerminator();
+ TI->replaceUsesOfWith(OldPred, NewBB);
+ }
+
+ // Okay, everything within the region is now branching to the right block; we
+ // just have to update the PHI nodes now, inserting PHI nodes into NewBB.
+ BasicBlock::iterator AfterPHIs;
+ for (AfterPHIs = OldPred->begin(); isa<PHINode>(AfterPHIs); ++AfterPHIs) {
+ PHINode *PN = cast<PHINode>(AfterPHIs);
+ // Create a new PHI node in the new region, which has an incoming value
+ // from OldPred of PN.
+ PHINode *NewPN = PHINode::Create(PN->getType(), 1 + NumPredsFromRegion,
+ PN->getName() + ".ce", &NewBB->front());
+ PN->replaceAllUsesWith(NewPN);
+ NewPN->addIncoming(PN, OldPred);
+
+ // Loop over all of the incoming values in PN, moving them to NewPN if they
+ // are from the extracted region.
+ for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
+ if (Blocks.count(PN->getIncomingBlock(i))) {
+ NewPN->addIncoming(PN->getIncomingValue(i), PN->getIncomingBlock(i));
+ PN->removeIncomingValue(i);
+ --i;
+ }
+ }
+ }
+ }
+}
+
+/// severSplitPHINodesOfExits - if PHI nodes in exit blocks have inputs from the
+/// outlined region, split each such PHI in two: one with the inputs from the
+/// region and another with the remaining incoming blocks; the first PHI is then
+/// placed inside the outlined region.
+void CodeExtractor::severSplitPHINodesOfExits(
+ const SmallPtrSetImpl<BasicBlock *> &Exits) {
+ for (BasicBlock *ExitBB : Exits) {
+ BasicBlock *NewBB = nullptr;
+
+ for (PHINode &PN : ExitBB->phis()) {
+ // Find all incoming values from the outlining region.
+ SmallVector<unsigned, 2> IncomingVals;
+ for (unsigned i = 0; i < PN.getNumIncomingValues(); ++i)
+ if (Blocks.count(PN.getIncomingBlock(i)))
+ IncomingVals.push_back(i);
+
+ // Do not process the PHI if it has one (or fewer) predecessors from the
+ // region. If the PHI has exactly one predecessor from the region, only that
+ // incoming block will be replaced by the codeRepl block, so it is safe to
+ // skip the PHI.
+ if (IncomingVals.size() <= 1)
+ continue;
+
+ // Create a block for the new PHIs and add it to the list of outlined blocks
+ // if that hasn't been done already.
+ if (!NewBB) {
+ NewBB = BasicBlock::Create(ExitBB->getContext(),
+ ExitBB->getName() + ".split",
+ ExitBB->getParent(), ExitBB);
SmallVector<BasicBlock *, 4> Preds(predecessors(ExitBB));
- for (BasicBlock *PredBB : Preds)
- if (Blocks.count(PredBB))
- PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
- BranchInst::Create(ExitBB, NewBB);
- Blocks.insert(NewBB);
- }
-
- // Split this PHI.
- PHINode *NewPN =
- PHINode::Create(PN.getType(), IncomingVals.size(),
- PN.getName() + ".ce", NewBB->getFirstNonPHI());
- for (unsigned i : IncomingVals)
- NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i));
- for (unsigned i : reverse(IncomingVals))
- PN.removeIncomingValue(i, false);
- PN.addIncoming(NewPN, NewBB);
- }
- }
-}
-
-void CodeExtractor::splitReturnBlocks() {
- for (BasicBlock *Block : Blocks)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
- BasicBlock *New =
- Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
- if (DT) {
- // Old dominates New. New node dominates all other nodes dominated
- // by Old.
- DomTreeNode *OldNode = DT->getNode(Block);
- SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
- OldNode->end());
-
- DomTreeNode *NewNode = DT->addNewBlock(New, Block);
-
- for (DomTreeNode *I : Children)
- DT->changeImmediateDominator(I, NewNode);
- }
- }
-}
-
-/// constructFunction - make a function based on inputs and outputs, as follows:
-/// f(in0, ..., inN, out0, ..., outN)
-Function *CodeExtractor::constructFunction(const ValueSet &inputs,
- const ValueSet &outputs,
- BasicBlock *header,
- BasicBlock *newRootNode,
- BasicBlock *newHeader,
- Function *oldFunction,
- Module *M) {
- LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
- LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
-
- // This function returns unsigned, outputs will go back by reference.
- switch (NumExitBlocks) {
- case 0:
- case 1: RetTy = Type::getVoidTy(header->getContext()); break;
- case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
- default: RetTy = Type::getInt16Ty(header->getContext()); break;
- }
-
- std::vector<Type *> paramTy;
-
- // Add the types of the input values to the function's argument list
- for (Value *value : inputs) {
- LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
- paramTy.push_back(value->getType());
- }
-
- // Add the types of the output values to the function's argument list.
- for (Value *output : outputs) {
- LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
- if (AggregateArgs)
- paramTy.push_back(output->getType());
- else
- paramTy.push_back(PointerType::getUnqual(output->getType()));
- }
-
- LLVM_DEBUG({
- dbgs() << "Function type: " << *RetTy << " f(";
- for (Type *i : paramTy)
- dbgs() << *i << ", ";
- dbgs() << ")\n";
- });
-
- StructType *StructTy = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- StructTy = StructType::get(M->getContext(), paramTy);
- paramTy.clear();
- paramTy.push_back(PointerType::getUnqual(StructTy));
- }
- FunctionType *funcType =
- FunctionType::get(RetTy, paramTy,
- AllowVarArgs && oldFunction->isVarArg());
-
- std::string SuffixToUse =
- Suffix.empty()
- ? (header->getName().empty() ? "extracted" : header->getName().str())
- : Suffix;
- // Create the new function
- Function *newFunction = Function::Create(
- funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
- oldFunction->getName() + "." + SuffixToUse, M);
- // If the old function is no-throw, so is the new one.
- if (oldFunction->doesNotThrow())
- newFunction->setDoesNotThrow();
-
- // Inherit the uwtable attribute if we need to.
- if (oldFunction->hasUWTable())
- newFunction->setHasUWTable();
-
- // Inherit all of the target dependent attributes and white-listed
- // target independent attributes.
- // (e.g. If the extracted region contains a call to an x86.sse
- // instruction we need to make sure that the extracted region has the
- // "target-features" attribute allowing it to be lowered.
- // FIXME: This should be changed to check to see if a specific
- // attribute can not be inherited.
- for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
- if (Attr.isStringAttribute()) {
- if (Attr.getKindAsString() == "thunk")
- continue;
- } else
- switch (Attr.getKindAsEnum()) {
- // Those attributes cannot be propagated safely. Explicitly list them
- // here so we get a warning if new attributes are added. This list also
- // includes non-function attributes.
- case Attribute::Alignment:
- case Attribute::AllocSize:
- case Attribute::ArgMemOnly:
- case Attribute::Builtin:
- case Attribute::ByVal:
- case Attribute::Convergent:
- case Attribute::Dereferenceable:
- case Attribute::DereferenceableOrNull:
- case Attribute::InAlloca:
- case Attribute::InReg:
- case Attribute::InaccessibleMemOnly:
- case Attribute::InaccessibleMemOrArgMemOnly:
- case Attribute::JumpTable:
- case Attribute::Naked:
- case Attribute::Nest:
- case Attribute::NoAlias:
- case Attribute::NoBuiltin:
- case Attribute::NoCapture:
- case Attribute::NoMerge:
- case Attribute::NoReturn:
- case Attribute::NoSync:
- case Attribute::NoUndef:
- case Attribute::None:
- case Attribute::NonNull:
- case Attribute::Preallocated:
- case Attribute::ReadNone:
- case Attribute::ReadOnly:
- case Attribute::Returned:
- case Attribute::ReturnsTwice:
- case Attribute::SExt:
- case Attribute::Speculatable:
- case Attribute::StackAlignment:
- case Attribute::StructRet:
- case Attribute::SwiftError:
- case Attribute::SwiftSelf:
- case Attribute::WillReturn:
- case Attribute::WriteOnly:
- case Attribute::ZExt:
- case Attribute::ImmArg:
+ for (BasicBlock *PredBB : Preds)
+ if (Blocks.count(PredBB))
+ PredBB->getTerminator()->replaceUsesOfWith(ExitBB, NewBB);
+ BranchInst::Create(ExitBB, NewBB);
+ Blocks.insert(NewBB);
+ }
+
+ // Split this PHI.
+ PHINode *NewPN =
+ PHINode::Create(PN.getType(), IncomingVals.size(),
+ PN.getName() + ".ce", NewBB->getFirstNonPHI());
+ for (unsigned i : IncomingVals)
+ NewPN->addIncoming(PN.getIncomingValue(i), PN.getIncomingBlock(i));
+ for (unsigned i : reverse(IncomingVals))
+ PN.removeIncomingValue(i, false);
+ PN.addIncoming(NewPN, NewBB);
+ }
+ }
+}
+
+void CodeExtractor::splitReturnBlocks() {
+ for (BasicBlock *Block : Blocks)
+ if (ReturnInst *RI = dyn_cast<ReturnInst>(Block->getTerminator())) {
+ BasicBlock *New =
+ Block->splitBasicBlock(RI->getIterator(), Block->getName() + ".ret");
+ if (DT) {
+ // Old dominates New. New node dominates all other nodes dominated
+ // by Old.
+ DomTreeNode *OldNode = DT->getNode(Block);
+ SmallVector<DomTreeNode *, 8> Children(OldNode->begin(),
+ OldNode->end());
+
+ DomTreeNode *NewNode = DT->addNewBlock(New, Block);
+
+ for (DomTreeNode *I : Children)
+ DT->changeImmediateDominator(I, NewNode);
+ }
+ }
+}
+
+/// constructFunction - make a function based on inputs and outputs, as follows:
+/// f(in0, ..., inN, out0, ..., outN)
+Function *CodeExtractor::constructFunction(const ValueSet &inputs,
+ const ValueSet &outputs,
+ BasicBlock *header,
+ BasicBlock *newRootNode,
+ BasicBlock *newHeader,
+ Function *oldFunction,
+ Module *M) {
+ LLVM_DEBUG(dbgs() << "inputs: " << inputs.size() << "\n");
+ LLVM_DEBUG(dbgs() << "outputs: " << outputs.size() << "\n");
+
+ // This function returns an unsigned value; outputs will go back by reference.
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: RetTy = Type::getVoidTy(header->getContext()); break;
+ case 2: RetTy = Type::getInt1Ty(header->getContext()); break;
+ default: RetTy = Type::getInt16Ty(header->getContext()); break;
+ }
+
+ std::vector<Type *> paramTy;
+
+ // Add the types of the input values to the function's argument list
+ for (Value *value : inputs) {
+ LLVM_DEBUG(dbgs() << "value used in func: " << *value << "\n");
+ paramTy.push_back(value->getType());
+ }
+
+ // Add the types of the output values to the function's argument list.
+ for (Value *output : outputs) {
+ LLVM_DEBUG(dbgs() << "instr used in func: " << *output << "\n");
+ if (AggregateArgs)
+ paramTy.push_back(output->getType());
+ else
+ paramTy.push_back(PointerType::getUnqual(output->getType()));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Function type: " << *RetTy << " f(";
+ for (Type *i : paramTy)
+ dbgs() << *i << ", ";
+ dbgs() << ")\n";
+ });
+
+ StructType *StructTy = nullptr;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ StructTy = StructType::get(M->getContext(), paramTy);
+ paramTy.clear();
+ paramTy.push_back(PointerType::getUnqual(StructTy));
+ }
+ FunctionType *funcType =
+ FunctionType::get(RetTy, paramTy,
+ AllowVarArgs && oldFunction->isVarArg());
+
+ std::string SuffixToUse =
+ Suffix.empty()
+ ? (header->getName().empty() ? "extracted" : header->getName().str())
+ : Suffix;
+ // Create the new function
+ Function *newFunction = Function::Create(
+ funcType, GlobalValue::InternalLinkage, oldFunction->getAddressSpace(),
+ oldFunction->getName() + "." + SuffixToUse, M);
+ // If the old function is no-throw, so is the new one.
+ if (oldFunction->doesNotThrow())
+ newFunction->setDoesNotThrow();
+
+ // Inherit the uwtable attribute if we need to.
+ if (oldFunction->hasUWTable())
+ newFunction->setHasUWTable();
+
+ // Inherit all of the target-dependent attributes and white-listed
+ // target-independent attributes.
+ // (E.g. if the extracted region contains a call to an x86.sse
+ // instruction, we need to make sure that the extracted region has the
+ // "target-features" attribute allowing it to be lowered.)
+ // FIXME: This should be changed to check whether a specific
+ // attribute cannot be inherited.
+ for (const auto &Attr : oldFunction->getAttributes().getFnAttributes()) {
+ if (Attr.isStringAttribute()) {
+ if (Attr.getKindAsString() == "thunk")
+ continue;
+ } else
+ switch (Attr.getKindAsEnum()) {
+ // Those attributes cannot be propagated safely. Explicitly list them
+ // here so we get a warning if new attributes are added. This list also
+ // includes non-function attributes.
+ case Attribute::Alignment:
+ case Attribute::AllocSize:
+ case Attribute::ArgMemOnly:
+ case Attribute::Builtin:
+ case Attribute::ByVal:
+ case Attribute::Convergent:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::InAlloca:
+ case Attribute::InReg:
+ case Attribute::InaccessibleMemOnly:
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ case Attribute::JumpTable:
+ case Attribute::Naked:
+ case Attribute::Nest:
+ case Attribute::NoAlias:
+ case Attribute::NoBuiltin:
+ case Attribute::NoCapture:
+ case Attribute::NoMerge:
+ case Attribute::NoReturn:
+ case Attribute::NoSync:
+ case Attribute::NoUndef:
+ case Attribute::None:
+ case Attribute::NonNull:
+ case Attribute::Preallocated:
+ case Attribute::ReadNone:
+ case Attribute::ReadOnly:
+ case Attribute::Returned:
+ case Attribute::ReturnsTwice:
+ case Attribute::SExt:
+ case Attribute::Speculatable:
+ case Attribute::StackAlignment:
+ case Attribute::StructRet:
+ case Attribute::SwiftError:
+ case Attribute::SwiftSelf:
+ case Attribute::WillReturn:
+ case Attribute::WriteOnly:
+ case Attribute::ZExt:
+ case Attribute::ImmArg:
case Attribute::ByRef:
- case Attribute::EndAttrKinds:
- case Attribute::EmptyKey:
- case Attribute::TombstoneKey:
- continue;
- // Those attributes should be safe to propagate to the extracted function.
- case Attribute::AlwaysInline:
- case Attribute::Cold:
+ case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
+ continue;
+ // Those attributes should be safe to propagate to the extracted function.
+ case Attribute::AlwaysInline:
+ case Attribute::Cold:
case Attribute::Hot:
- case Attribute::NoRecurse:
- case Attribute::InlineHint:
- case Attribute::MinSize:
+ case Attribute::NoRecurse:
+ case Attribute::InlineHint:
+ case Attribute::MinSize:
case Attribute::NoCallback:
- case Attribute::NoDuplicate:
- case Attribute::NoFree:
- case Attribute::NoImplicitFloat:
- case Attribute::NoInline:
- case Attribute::NonLazyBind:
- case Attribute::NoRedZone:
- case Attribute::NoUnwind:
- case Attribute::NullPointerIsValid:
- case Attribute::OptForFuzzing:
- case Attribute::OptimizeNone:
- case Attribute::OptimizeForSize:
- case Attribute::SafeStack:
- case Attribute::ShadowCallStack:
- case Attribute::SanitizeAddress:
- case Attribute::SanitizeMemory:
- case Attribute::SanitizeThread:
- case Attribute::SanitizeHWAddress:
- case Attribute::SanitizeMemTag:
- case Attribute::SpeculativeLoadHardening:
- case Attribute::StackProtect:
- case Attribute::StackProtectReq:
- case Attribute::StackProtectStrong:
- case Attribute::StrictFP:
- case Attribute::UWTable:
- case Attribute::NoCfCheck:
+ case Attribute::NoDuplicate:
+ case Attribute::NoFree:
+ case Attribute::NoImplicitFloat:
+ case Attribute::NoInline:
+ case Attribute::NonLazyBind:
+ case Attribute::NoRedZone:
+ case Attribute::NoUnwind:
+ case Attribute::NullPointerIsValid:
+ case Attribute::OptForFuzzing:
+ case Attribute::OptimizeNone:
+ case Attribute::OptimizeForSize:
+ case Attribute::SafeStack:
+ case Attribute::ShadowCallStack:
+ case Attribute::SanitizeAddress:
+ case Attribute::SanitizeMemory:
+ case Attribute::SanitizeThread:
+ case Attribute::SanitizeHWAddress:
+ case Attribute::SanitizeMemTag:
+ case Attribute::SpeculativeLoadHardening:
+ case Attribute::StackProtect:
+ case Attribute::StackProtectReq:
+ case Attribute::StackProtectStrong:
+ case Attribute::StrictFP:
+ case Attribute::UWTable:
+ case Attribute::NoCfCheck:
case Attribute::MustProgress:
case Attribute::NoProfile:
- break;
- }
-
- newFunction->addFnAttr(Attr);
- }
- newFunction->getBasicBlockList().push_back(newRootNode);
-
- // Create an iterator to name all of the arguments we inserted.
- Function::arg_iterator AI = newFunction->arg_begin();
-
- // Rewrite all users of the inputs in the extracted region to use the
- // arguments (or appropriate addressing into struct) instead.
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *RewriteVal;
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
- Instruction *TI = newFunction->begin()->getTerminator();
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
- RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
- "loadgep_" + inputs[i]->getName(), TI);
- } else
- RewriteVal = &*AI++;
-
- std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
- for (User *use : Users)
- if (Instruction *inst = dyn_cast<Instruction>(use))
- if (Blocks.count(inst->getParent()))
- inst->replaceUsesOfWith(inputs[i], RewriteVal);
- }
-
- // Set names for input and output arguments.
- if (!AggregateArgs) {
- AI = newFunction->arg_begin();
- for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
- AI->setName(inputs[i]->getName());
- for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
- AI->setName(outputs[i]->getName()+".out");
- }
-
- // Rewrite branches to basic blocks outside of the loop to new dummy blocks
- // within the new function. This must be done before we lose track of which
- // blocks were originally in the code region.
- std::vector<User *> Users(header->user_begin(), header->user_end());
- for (auto &U : Users)
- // The BasicBlock which contains the branch is not in the region
- // modify the branch target to a new block
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->isTerminator() && I->getFunction() == oldFunction &&
- !Blocks.count(I->getParent()))
- I->replaceUsesOfWith(header, newHeader);
-
- return newFunction;
-}
-
-/// Erase lifetime.start markers which reference inputs to the extraction
-/// region, and insert the referenced memory into \p LifetimesStart.
-///
-/// The extraction region is defined by a set of blocks (\p Blocks), and a set
-/// of allocas which will be moved from the caller function into the extracted
-/// function (\p SunkAllocas).
-static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
- const SetVector<Value *> &SunkAllocas,
- SetVector<Value *> &LifetimesStart) {
- for (BasicBlock *BB : Blocks) {
- for (auto It = BB->begin(), End = BB->end(); It != End;) {
- auto *II = dyn_cast<IntrinsicInst>(&*It);
- ++It;
- if (!II || !II->isLifetimeStartOrEnd())
- continue;
-
- // Get the memory operand of the lifetime marker. If the underlying
- // object is a sunk alloca, or is otherwise defined in the extraction
- // region, the lifetime marker must not be erased.
- Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
- if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
- continue;
-
- if (II->getIntrinsicID() == Intrinsic::lifetime_start)
- LifetimesStart.insert(Mem);
- II->eraseFromParent();
- }
- }
-}
-
-/// Insert lifetime start/end markers surrounding the call to the new function
-/// for objects defined in the caller.
-static void insertLifetimeMarkersSurroundingCall(
- Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
- CallInst *TheCall) {
- LLVMContext &Ctx = M->getContext();
- auto Int8PtrTy = Type::getInt8PtrTy(Ctx);
- auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
- Instruction *Term = TheCall->getParent()->getTerminator();
-
- // The memory argument to a lifetime marker must be a i8*. Cache any bitcasts
- // needed to satisfy this requirement so they may be reused.
- DenseMap<Value *, Value *> Bitcasts;
-
- // Emit lifetime markers for the pointers given in \p Objects. Insert the
- // markers before the call if \p InsertBefore, and after the call otherwise.
- auto insertMarkers = [&](Function *MarkerFunc, ArrayRef<Value *> Objects,
- bool InsertBefore) {
- for (Value *Mem : Objects) {
- assert((!isa<Instruction>(Mem) || cast<Instruction>(Mem)->getFunction() ==
- TheCall->getFunction()) &&
- "Input memory not defined in original function");
- Value *&MemAsI8Ptr = Bitcasts[Mem];
- if (!MemAsI8Ptr) {
- if (Mem->getType() == Int8PtrTy)
- MemAsI8Ptr = Mem;
- else
- MemAsI8Ptr =
- CastInst::CreatePointerCast(Mem, Int8PtrTy, "lt.cast", TheCall);
- }
-
- auto Marker = CallInst::Create(MarkerFunc, {NegativeOne, MemAsI8Ptr});
- if (InsertBefore)
- Marker->insertBefore(TheCall);
- else
- Marker->insertBefore(Term);
- }
- };
-
- if (!LifetimesStart.empty()) {
- auto StartFn = llvm::Intrinsic::getDeclaration(
- M, llvm::Intrinsic::lifetime_start, Int8PtrTy);
- insertMarkers(StartFn, LifetimesStart, /*InsertBefore=*/true);
- }
-
- if (!LifetimesEnd.empty()) {
- auto EndFn = llvm::Intrinsic::getDeclaration(
- M, llvm::Intrinsic::lifetime_end, Int8PtrTy);
- insertMarkers(EndFn, LifetimesEnd, /*InsertBefore=*/false);
- }
-}
-
-/// emitCallAndSwitchStatement - This method sets up the caller side by adding
-/// the call instruction, splitting any PHI nodes in the header block as
-/// necessary.
-CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
- BasicBlock *codeReplacer,
- ValueSet &inputs,
- ValueSet &outputs) {
- // Emit a call to the new function, passing in: *pointer to struct (if
- // aggregating parameters), or plan inputs and allocated memory for outputs
- std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
-
- Module *M = newFunction->getParent();
- LLVMContext &Context = M->getContext();
- const DataLayout &DL = M->getDataLayout();
- CallInst *call = nullptr;
-
- // Add inputs as params, or to be filled into the struct
- unsigned ArgNo = 0;
- SmallVector<unsigned, 1> SwiftErrorArgs;
- for (Value *input : inputs) {
- if (AggregateArgs)
- StructValues.push_back(input);
- else {
- params.push_back(input);
- if (input->isSwiftError())
- SwiftErrorArgs.push_back(ArgNo);
- }
- ++ArgNo;
- }
-
- // Create allocas for the outputs
- for (Value *output : outputs) {
- if (AggregateArgs) {
- StructValues.push_back(output);
- } else {
- AllocaInst *alloca =
- new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
- nullptr, output->getName() + ".loc",
- &codeReplacer->getParent()->front().front());
- ReloadOutputs.push_back(alloca);
- params.push_back(alloca);
- }
- }
-
- StructType *StructArgTy = nullptr;
- AllocaInst *Struct = nullptr;
- if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
- std::vector<Type *> ArgTypes;
- for (ValueSet::iterator v = StructValues.begin(),
- ve = StructValues.end(); v != ve; ++v)
- ArgTypes.push_back((*v)->getType());
-
- // Allocate a struct at the beginning of this function
- StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
- Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
- "structArg",
- &codeReplacer->getParent()->front().front());
- params.push_back(Struct);
-
- for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- new StoreInst(StructValues[i], GEP, codeReplacer);
- }
- }
-
- // Emit the call to the function
- call = CallInst::Create(newFunction, params,
- NumExitBlocks > 1 ? "targetBlock" : "");
- // Add debug location to the new call, if the original function has debug
- // info. In that case, the terminator of the entry block of the extracted
- // function contains the first debug location of the extracted function,
- // set in extractCodeRegion.
- if (codeReplacer->getParent()->getSubprogram()) {
- if (auto DL = newFunction->getEntryBlock().getTerminator()->getDebugLoc())
- call->setDebugLoc(DL);
- }
- codeReplacer->getInstList().push_back(call);
-
- // Set swifterror parameter attributes.
- for (unsigned SwiftErrArgNo : SwiftErrorArgs) {
- call->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
- newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
- }
-
- Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
- unsigned FirstOut = inputs.size();
- if (!AggregateArgs)
- std::advance(OutputArgBegin, inputs.size());
-
- // Reload the outputs passed in by reference.
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
- Value *Output = nullptr;
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
- codeReplacer->getInstList().push_back(GEP);
- Output = GEP;
- } else {
- Output = ReloadOutputs[i];
- }
- LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
- outputs[i]->getName() + ".reload",
- codeReplacer);
- Reloads.push_back(load);
- std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
- for (unsigned u = 0, e = Users.size(); u != e; ++u) {
- Instruction *inst = cast<Instruction>(Users[u]);
- if (!Blocks.count(inst->getParent()))
- inst->replaceUsesOfWith(outputs[i], load);
- }
- }
-
- // Now we can emit a switch statement using the call as a value.
- SwitchInst *TheSwitch =
- SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
- codeReplacer, 0, codeReplacer);
-
- // Since there may be multiple exits from the original region, make the new
- // function return an unsigned, switch on that number. This loop iterates
- // over all of the blocks in the extracted region, updating any terminator
- // instructions in the to-be-extracted region that branch to blocks that are
- // not in the region to be extracted.
- std::map<BasicBlock *, BasicBlock *> ExitBlockMap;
-
- unsigned switchVal = 0;
- for (BasicBlock *Block : Blocks) {
- Instruction *TI = Block->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!Blocks.count(TI->getSuccessor(i))) {
- BasicBlock *OldTarget = TI->getSuccessor(i);
- // add a new basic block which returns the appropriate value
- BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
- if (!NewTarget) {
- // If we don't already have an exit stub for this non-extracted
- // destination, create one now!
- NewTarget = BasicBlock::Create(Context,
- OldTarget->getName() + ".exitStub",
- newFunction);
- unsigned SuccNum = switchVal++;
-
- Value *brVal = nullptr;
- switch (NumExitBlocks) {
- case 0:
- case 1: break; // No value needed.
- case 2: // Conditional branch, return a bool
- brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
- break;
- default:
- brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
- break;
- }
-
- ReturnInst::Create(Context, brVal, NewTarget);
-
- // Update the switch instruction.
- TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
- SuccNum),
- OldTarget);
- }
-
- // rewrite the original branch instruction with this new target
- TI->setSuccessor(i, NewTarget);
- }
- }
-
- // Store the arguments right after the definition of output value.
- // This should be proceeded after creating exit stubs to be ensure that invoke
- // result restore will be placed in the outlined function.
- Function::arg_iterator OAI = OutputArgBegin;
- for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
- auto *OutI = dyn_cast<Instruction>(outputs[i]);
- if (!OutI)
- continue;
-
- // Find proper insertion point.
- BasicBlock::iterator InsertPt;
- // In case OutI is an invoke, we insert the store at the beginning in the
- // 'normal destination' BB. Otherwise we insert the store right after OutI.
- if (auto *InvokeI = dyn_cast<InvokeInst>(OutI))
- InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt();
- else if (auto *Phi = dyn_cast<PHINode>(OutI))
- InsertPt = Phi->getParent()->getFirstInsertionPt();
- else
- InsertPt = std::next(OutI->getIterator());
-
- Instruction *InsertBefore = &*InsertPt;
- assert((InsertBefore->getFunction() == newFunction ||
- Blocks.count(InsertBefore->getParent())) &&
- "InsertPt should be in new function");
- assert(OAI != newFunction->arg_end() &&
- "Number of output arguments should match "
- "the amount of defined values");
- if (AggregateArgs) {
- Value *Idx[2];
- Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
- Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
- GetElementPtrInst *GEP = GetElementPtrInst::Create(
- StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
- InsertBefore);
- new StoreInst(outputs[i], GEP, InsertBefore);
- // Since there should be only one struct argument aggregating
- // all the output values, we shouldn't increment OAI, which always
- // points to the struct argument, in this case.
- } else {
- new StoreInst(outputs[i], &*OAI, InsertBefore);
- ++OAI;
- }
- }
-
- // Now that we've done the deed, simplify the switch instruction.
- Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
- switch (NumExitBlocks) {
- case 0:
- // There are no successors (the block containing the switch itself), which
- // means that previously this was the last part of the function, and hence
- // this should be rewritten as a `ret'
-
- // Check if the function should return a value
- if (OldFnRetTy->isVoidTy()) {
- ReturnInst::Create(Context, nullptr, TheSwitch); // Return void
- } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
- // return what we have
- ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
- } else {
- // Otherwise we must have code extracted an unwind or something, just
- // return whatever we want.
- ReturnInst::Create(Context,
- Constant::getNullValue(OldFnRetTy), TheSwitch);
- }
-
- TheSwitch->eraseFromParent();
- break;
- case 1:
- // Only a single destination, change the switch into an unconditional
- // branch.
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
- TheSwitch->eraseFromParent();
- break;
- case 2:
- BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
- call, TheSwitch);
- TheSwitch->eraseFromParent();
- break;
- default:
- // Otherwise, make the default destination of the switch instruction be one
- // of the other successors.
- TheSwitch->setCondition(call);
- TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
- // Remove redundant case
- TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
- break;
- }
-
- // Insert lifetime markers around the reloads of any output values. The
- // allocas output values are stored in are only in-use in the codeRepl block.
- insertLifetimeMarkersSurroundingCall(M, ReloadOutputs, ReloadOutputs, call);
-
- return call;
-}
-
-void CodeExtractor::moveCodeToFunction(Function *newFunction) {
- Function *oldFunc = (*Blocks.begin())->getParent();
- Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
- Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
-
- for (BasicBlock *Block : Blocks) {
- // Delete the basic block from the old function, and the list of blocks
- oldBlocks.remove(Block);
-
- // Insert this basic block into the new function
- newBlocks.push_back(Block);
- }
-}
-
-void CodeExtractor::calculateNewCallTerminatorWeights(
- BasicBlock *CodeReplacer,
- DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
- BranchProbabilityInfo *BPI) {
- using Distribution = BlockFrequencyInfoImplBase::Distribution;
- using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
-
- // Update the branch weights for the exit block.
- Instruction *TI = CodeReplacer->getTerminator();
- SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
-
- // Block Frequency distribution with dummy node.
- Distribution BranchDist;
-
- SmallVector<BranchProbability, 4> EdgeProbabilities(
- TI->getNumSuccessors(), BranchProbability::getUnknown());
-
- // Add each of the frequencies of the successors.
- for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
- BlockNode ExitNode(i);
- uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
- if (ExitFreq != 0)
- BranchDist.addExit(ExitNode, ExitFreq);
- else
- EdgeProbabilities[i] = BranchProbability::getZero();
- }
-
- // Check for no total weight.
- if (BranchDist.Total == 0) {
- BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
- return;
- }
-
- // Normalize the distribution so that they can fit in unsigned.
- BranchDist.normalize();
-
- // Create normalized branch weights and set the metadata.
- for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
- const auto &Weight = BranchDist.Weights[I];
-
- // Get the weight and update the current BFI.
- BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
- BranchProbability BP(Weight.Amount, BranchDist.Total);
- EdgeProbabilities[Weight.TargetNode.Index] = BP;
- }
- BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
- TI->setMetadata(
- LLVMContext::MD_prof,
- MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
-}
-
-/// Erase debug info intrinsics which refer to values in \p F but aren't in
-/// \p F.
-static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
- for (Instruction &I : instructions(F)) {
- SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- for (DbgVariableIntrinsic *DVI : DbgUsers)
- if (DVI->getFunction() != &F)
- DVI->eraseFromParent();
- }
-}
-
-/// Fix up the debug info in the old and new functions by pointing line
-/// locations and debug intrinsics to the new subprogram scope, and by deleting
-/// intrinsics which point to values outside of the new function.
-static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
- CallInst &TheCall) {
- DISubprogram *OldSP = OldFunc.getSubprogram();
- LLVMContext &Ctx = OldFunc.getContext();
-
- if (!OldSP) {
- // Erase any debug info the new function contains.
- stripDebugInfo(NewFunc);
- // Make sure the old function doesn't contain any non-local metadata refs.
- eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
- return;
- }
-
- // Create a subprogram for the new function. Leave out a description of the
- // function arguments, as the parameters don't correspond to anything at the
- // source level.
- assert(OldSP->getUnit() && "Missing compile unit for subprogram");
+ break;
+ }
+
+ newFunction->addFnAttr(Attr);
+ }
+ newFunction->getBasicBlockList().push_back(newRootNode);
+
+ // Create an iterator to name all of the arguments we inserted.
+ Function::arg_iterator AI = newFunction->arg_begin();
+
+ // Rewrite all users of the inputs in the extracted region to use the
+ // arguments (or appropriate addressing into struct) instead.
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *RewriteVal;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(header->getContext()));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(header->getContext()), i);
+ Instruction *TI = newFunction->begin()->getTerminator();
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructTy, &*AI, Idx, "gep_" + inputs[i]->getName(), TI);
+ RewriteVal = new LoadInst(StructTy->getElementType(i), GEP,
+ "loadgep_" + inputs[i]->getName(), TI);
+ } else
+ RewriteVal = &*AI++;
+
+ std::vector<User *> Users(inputs[i]->user_begin(), inputs[i]->user_end());
+ for (User *use : Users)
+ if (Instruction *inst = dyn_cast<Instruction>(use))
+ if (Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(inputs[i], RewriteVal);
+ }
+
+ // Set names for input and output arguments.
+ if (!AggregateArgs) {
+ AI = newFunction->arg_begin();
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i, ++AI)
+ AI->setName(inputs[i]->getName());
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i, ++AI)
+ AI->setName(outputs[i]->getName()+".out");
+ }
+
+ // Rewrite branches to basic blocks outside of the loop to new dummy blocks
+ // within the new function. This must be done before we lose track of which
+ // blocks were originally in the code region.
+ std::vector<User *> Users(header->user_begin(), header->user_end());
+ for (auto &U : Users)
+ // If the BasicBlock which contains the branch is not in the region,
+ // modify the branch target to the new block.
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->isTerminator() && I->getFunction() == oldFunction &&
+ !Blocks.count(I->getParent()))
+ I->replaceUsesOfWith(header, newHeader);
+
+ return newFunction;
+}
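// A minimal sketch (not part of the original source, assuming the usual
// llvm/IR/DerivedTypes.h declarations) of the two signature shapes that
// constructFunction builds for one i32 input and one i32 output with a single
// exit block; the helper name buildToySignatures is hypothetical.
static void buildToySignatures(LLVMContext &Ctx) {
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *RetTy = Type::getVoidTy(Ctx); // NumExitBlocks <= 1 -> void return

  // Flat form (AggregateArgs == false): void f(i32 %in, i32* %out.out)
  FunctionType *FlatTy = FunctionType::get(
      RetTy, {I32, PointerType::getUnqual(I32)}, /*isVarArg=*/false);

  // Aggregate form (AggregateArgs == true): void f({ i32, i32 }* %structArg)
  StructType *STy = StructType::get(Ctx, {I32, I32});
  FunctionType *AggrTy = FunctionType::get(
      RetTy, {PointerType::getUnqual(STy)}, /*isVarArg=*/false);

  (void)FlatTy;
  (void)AggrTy;
}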
+
+/// Erase lifetime.start markers which reference inputs to the extraction
+/// region, and insert the referenced memory into \p LifetimesStart.
+///
+/// The extraction region is defined by a set of blocks (\p Blocks), and a set
+/// of allocas which will be moved from the caller function into the extracted
+/// function (\p SunkAllocas).
+static void eraseLifetimeMarkersOnInputs(const SetVector<BasicBlock *> &Blocks,
+ const SetVector<Value *> &SunkAllocas,
+ SetVector<Value *> &LifetimesStart) {
+ for (BasicBlock *BB : Blocks) {
+ for (auto It = BB->begin(), End = BB->end(); It != End;) {
+ auto *II = dyn_cast<IntrinsicInst>(&*It);
+ ++It;
+ if (!II || !II->isLifetimeStartOrEnd())
+ continue;
+
+ // Get the memory operand of the lifetime marker. If the underlying
+ // object is a sunk alloca, or is otherwise defined in the extraction
+ // region, the lifetime marker must not be erased.
+ Value *Mem = II->getOperand(1)->stripInBoundsOffsets();
+ if (SunkAllocas.count(Mem) || definedInRegion(Blocks, Mem))
+ continue;
+
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+ LifetimesStart.insert(Mem);
+ II->eraseFromParent();
+ }
+ }
+}
+
+/// Insert lifetime start/end markers surrounding the call to the new function
+/// for objects defined in the caller.
+static void insertLifetimeMarkersSurroundingCall(
+ Module *M, ArrayRef<Value *> LifetimesStart, ArrayRef<Value *> LifetimesEnd,
+ CallInst *TheCall) {
+ LLVMContext &Ctx = M->getContext();
+ auto Int8PtrTy = Type::getInt8PtrTy(Ctx);
+ auto NegativeOne = ConstantInt::getSigned(Type::getInt64Ty(Ctx), -1);
+ Instruction *Term = TheCall->getParent()->getTerminator();
+
+ // The memory argument to a lifetime marker must be an i8*. Cache any bitcasts
+ // needed to satisfy this requirement so they may be reused.
+ DenseMap<Value *, Value *> Bitcasts;
+
+ // Emit lifetime markers for the pointers given in \p Objects. Insert the
+ // markers before the call if \p InsertBefore, and after the call otherwise.
+ auto insertMarkers = [&](Function *MarkerFunc, ArrayRef<Value *> Objects,
+ bool InsertBefore) {
+ for (Value *Mem : Objects) {
+ assert((!isa<Instruction>(Mem) || cast<Instruction>(Mem)->getFunction() ==
+ TheCall->getFunction()) &&
+ "Input memory not defined in original function");
+ Value *&MemAsI8Ptr = Bitcasts[Mem];
+ if (!MemAsI8Ptr) {
+ if (Mem->getType() == Int8PtrTy)
+ MemAsI8Ptr = Mem;
+ else
+ MemAsI8Ptr =
+ CastInst::CreatePointerCast(Mem, Int8PtrTy, "lt.cast", TheCall);
+ }
+
+ auto Marker = CallInst::Create(MarkerFunc, {NegativeOne, MemAsI8Ptr});
+ if (InsertBefore)
+ Marker->insertBefore(TheCall);
+ else
+ Marker->insertBefore(Term);
+ }
+ };
+
+ if (!LifetimesStart.empty()) {
+ auto StartFn = llvm::Intrinsic::getDeclaration(
+ M, llvm::Intrinsic::lifetime_start, Int8PtrTy);
+ insertMarkers(StartFn, LifetimesStart, /*InsertBefore=*/true);
+ }
+
+ if (!LifetimesEnd.empty()) {
+ auto EndFn = llvm::Intrinsic::getDeclaration(
+ M, llvm::Intrinsic::lifetime_end, Int8PtrTy);
+ insertMarkers(EndFn, LifetimesEnd, /*InsertBefore=*/false);
+ }
+}
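// Illustrative sketch (not part of the original source): the same bracketing
// effect for a single alloca can be expressed with IRBuilder, assuming the
// standard llvm/IR/IRBuilder.h interface; the helper name
// bracketCallWithLifetime is hypothetical.
static void bracketCallWithLifetime(AllocaInst *AI, CallInst *TheCall) {
  IRBuilder<> B(TheCall);
  // lifetime.start right before the call; passing no size emits -1, matching
  // the NegativeOne constant used above.
  B.CreateLifetimeStart(AI);
  // lifetime.end after the call, just before the block terminator.
  B.SetInsertPoint(TheCall->getParent()->getTerminator());
  B.CreateLifetimeEnd(AI);
}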
+
+/// emitCallAndSwitchStatement - This method sets up the caller side by adding
+/// the call instruction, splitting any PHI nodes in the header block as
+/// necessary.
+CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
+ BasicBlock *codeReplacer,
+ ValueSet &inputs,
+ ValueSet &outputs) {
+ // Emit a call to the new function, passing in: a pointer to the struct (if
+ // aggregating parameters), or plain inputs and allocated memory for outputs.
+ std::vector<Value *> params, StructValues, ReloadOutputs, Reloads;
+
+ Module *M = newFunction->getParent();
+ LLVMContext &Context = M->getContext();
+ const DataLayout &DL = M->getDataLayout();
+ CallInst *call = nullptr;
+
+ // Add inputs as params, or to be filled into the struct
+ unsigned ArgNo = 0;
+ SmallVector<unsigned, 1> SwiftErrorArgs;
+ for (Value *input : inputs) {
+ if (AggregateArgs)
+ StructValues.push_back(input);
+ else {
+ params.push_back(input);
+ if (input->isSwiftError())
+ SwiftErrorArgs.push_back(ArgNo);
+ }
+ ++ArgNo;
+ }
+
+ // Create allocas for the outputs
+ for (Value *output : outputs) {
+ if (AggregateArgs) {
+ StructValues.push_back(output);
+ } else {
+ AllocaInst *alloca =
+ new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+ nullptr, output->getName() + ".loc",
+ &codeReplacer->getParent()->front().front());
+ ReloadOutputs.push_back(alloca);
+ params.push_back(alloca);
+ }
+ }
+
+ StructType *StructArgTy = nullptr;
+ AllocaInst *Struct = nullptr;
+ if (AggregateArgs && (inputs.size() + outputs.size() > 0)) {
+ std::vector<Type *> ArgTypes;
+ for (ValueSet::iterator v = StructValues.begin(),
+ ve = StructValues.end(); v != ve; ++v)
+ ArgTypes.push_back((*v)->getType());
+
+ // Allocate a struct at the beginning of this function
+ StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
+ Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+ "structArg",
+ &codeReplacer->getParent()->front().front());
+ params.push_back(Struct);
+
+ for (unsigned i = 0, e = inputs.size(); i != e; ++i) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ new StoreInst(StructValues[i], GEP, codeReplacer);
+ }
+ }
+
+ // Emit the call to the function
+ call = CallInst::Create(newFunction, params,
+ NumExitBlocks > 1 ? "targetBlock" : "");
+ // Add debug location to the new call, if the original function has debug
+ // info. In that case, the terminator of the entry block of the extracted
+ // function contains the first debug location of the extracted function,
+ // set in extractCodeRegion.
+ if (codeReplacer->getParent()->getSubprogram()) {
+ if (auto DL = newFunction->getEntryBlock().getTerminator()->getDebugLoc())
+ call->setDebugLoc(DL);
+ }
+ codeReplacer->getInstList().push_back(call);
+
+ // Set swifterror parameter attributes.
+ for (unsigned SwiftErrArgNo : SwiftErrorArgs) {
+ call->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
+ newFunction->addParamAttr(SwiftErrArgNo, Attribute::SwiftError);
+ }
+
+ Function::arg_iterator OutputArgBegin = newFunction->arg_begin();
+ unsigned FirstOut = inputs.size();
+ if (!AggregateArgs)
+ std::advance(OutputArgBegin, inputs.size());
+
+ // Reload the outputs passed in by reference.
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ Value *Output = nullptr;
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, Struct, Idx, "gep_reload_" + outputs[i]->getName());
+ codeReplacer->getInstList().push_back(GEP);
+ Output = GEP;
+ } else {
+ Output = ReloadOutputs[i];
+ }
+ LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
+ outputs[i]->getName() + ".reload",
+ codeReplacer);
+ Reloads.push_back(load);
+ std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
+ for (unsigned u = 0, e = Users.size(); u != e; ++u) {
+ Instruction *inst = cast<Instruction>(Users[u]);
+ if (!Blocks.count(inst->getParent()))
+ inst->replaceUsesOfWith(outputs[i], load);
+ }
+ }
+
+ // Now we can emit a switch statement using the call as a value.
+ SwitchInst *TheSwitch =
+ SwitchInst::Create(Constant::getNullValue(Type::getInt16Ty(Context)),
+ codeReplacer, 0, codeReplacer);
+
+ // Since there may be multiple exits from the original region, make the new
+ // function return an unsigned and switch on that number. This loop iterates
+ // over all of the blocks in the extracted region, updating any terminator
+ // instructions in the to-be-extracted region that branch to blocks that are
+ // not in the region to be extracted.
+ std::map<BasicBlock *, BasicBlock *> ExitBlockMap;
+
+ unsigned switchVal = 0;
+ for (BasicBlock *Block : Blocks) {
+ Instruction *TI = Block->getTerminator();
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+ if (!Blocks.count(TI->getSuccessor(i))) {
+ BasicBlock *OldTarget = TI->getSuccessor(i);
+ // add a new basic block which returns the appropriate value
+ BasicBlock *&NewTarget = ExitBlockMap[OldTarget];
+ if (!NewTarget) {
+ // If we don't already have an exit stub for this non-extracted
+ // destination, create one now!
+ NewTarget = BasicBlock::Create(Context,
+ OldTarget->getName() + ".exitStub",
+ newFunction);
+ unsigned SuccNum = switchVal++;
+
+ Value *brVal = nullptr;
+ switch (NumExitBlocks) {
+ case 0:
+ case 1: break; // No value needed.
+ case 2: // Conditional branch, return a bool
+ brVal = ConstantInt::get(Type::getInt1Ty(Context), !SuccNum);
+ break;
+ default:
+ brVal = ConstantInt::get(Type::getInt16Ty(Context), SuccNum);
+ break;
+ }
+
+ ReturnInst::Create(Context, brVal, NewTarget);
+
+ // Update the switch instruction.
+ TheSwitch->addCase(ConstantInt::get(Type::getInt16Ty(Context),
+ SuccNum),
+ OldTarget);
+ }
+
+ // rewrite the original branch instruction with this new target
+ TI->setSuccessor(i, NewTarget);
+ }
+ }
+
+ // Store the arguments right after the definition of the output value.
+ // This must happen after the exit stubs are created to ensure that the
+ // invoke result restore is placed inside the outlined function.
+ Function::arg_iterator OAI = OutputArgBegin;
+ for (unsigned i = 0, e = outputs.size(); i != e; ++i) {
+ auto *OutI = dyn_cast<Instruction>(outputs[i]);
+ if (!OutI)
+ continue;
+
+ // Find proper insertion point.
+ BasicBlock::iterator InsertPt;
+ // In case OutI is an invoke, we insert the store at the beginning in the
+ // 'normal destination' BB. Otherwise we insert the store right after OutI.
+ if (auto *InvokeI = dyn_cast<InvokeInst>(OutI))
+ InsertPt = InvokeI->getNormalDest()->getFirstInsertionPt();
+ else if (auto *Phi = dyn_cast<PHINode>(OutI))
+ InsertPt = Phi->getParent()->getFirstInsertionPt();
+ else
+ InsertPt = std::next(OutI->getIterator());
+
+ Instruction *InsertBefore = &*InsertPt;
+ assert((InsertBefore->getFunction() == newFunction ||
+ Blocks.count(InsertBefore->getParent())) &&
+ "InsertPt should be in new function");
+ assert(OAI != newFunction->arg_end() &&
+ "Number of output arguments should match "
+ "the amount of defined values");
+ if (AggregateArgs) {
+ Value *Idx[2];
+ Idx[0] = Constant::getNullValue(Type::getInt32Ty(Context));
+ Idx[1] = ConstantInt::get(Type::getInt32Ty(Context), FirstOut + i);
+ GetElementPtrInst *GEP = GetElementPtrInst::Create(
+ StructArgTy, &*OAI, Idx, "gep_" + outputs[i]->getName(),
+ InsertBefore);
+ new StoreInst(outputs[i], GEP, InsertBefore);
+ // Since there should be only one struct argument aggregating
+ // all the output values, we shouldn't increment OAI, which always
+ // points to the struct argument, in this case.
+ } else {
+ new StoreInst(outputs[i], &*OAI, InsertBefore);
+ ++OAI;
+ }
+ }
+
+ // Now that we've done the deed, simplify the switch instruction.
+ Type *OldFnRetTy = TheSwitch->getParent()->getParent()->getReturnType();
+ switch (NumExitBlocks) {
+ case 0:
+ // There are no successors (other than the block containing the switch
+ // itself), which means that previously this was the last part of the
+ // function, and hence this should be rewritten as a `ret'.
+
+ // Check if the function should return a value
+ if (OldFnRetTy->isVoidTy()) {
+ ReturnInst::Create(Context, nullptr, TheSwitch); // Return void
+ } else if (OldFnRetTy == TheSwitch->getCondition()->getType()) {
+ // return what we have
+ ReturnInst::Create(Context, TheSwitch->getCondition(), TheSwitch);
+ } else {
+ // Otherwise we must have code extracted an unwind or something; just
+ // return whatever we want.
+ ReturnInst::Create(Context,
+ Constant::getNullValue(OldFnRetTy), TheSwitch);
+ }
+
+ TheSwitch->eraseFromParent();
+ break;
+ case 1:
+ // Only a single destination, change the switch into an unconditional
+ // branch.
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ case 2:
+ BranchInst::Create(TheSwitch->getSuccessor(1), TheSwitch->getSuccessor(2),
+ call, TheSwitch);
+ TheSwitch->eraseFromParent();
+ break;
+ default:
+ // Otherwise, make the default destination of the switch instruction be one
+ // of the other successors.
+ TheSwitch->setCondition(call);
+ TheSwitch->setDefaultDest(TheSwitch->getSuccessor(NumExitBlocks));
+ // Remove redundant case
+ TheSwitch->removeCase(SwitchInst::CaseIt(TheSwitch, NumExitBlocks-1));
+ break;
+ }
+
+ // Insert lifetime markers around the reloads of any output values. The
+ // allocas that the output values are stored in are only in use in the
+ // codeRepl block.
+ insertLifetimeMarkersSurroundingCall(M, ReloadOutputs, ReloadOutputs, call);
+
+ return call;
+}
+
+void CodeExtractor::moveCodeToFunction(Function *newFunction) {
+ Function *oldFunc = (*Blocks.begin())->getParent();
+ Function::BasicBlockListType &oldBlocks = oldFunc->getBasicBlockList();
+ Function::BasicBlockListType &newBlocks = newFunction->getBasicBlockList();
+
+ for (BasicBlock *Block : Blocks) {
+ // Delete the basic block from the old function, and the list of blocks
+ oldBlocks.remove(Block);
+
+ // Insert this basic block into the new function
+ newBlocks.push_back(Block);
+ }
+}
+
+void CodeExtractor::calculateNewCallTerminatorWeights(
+ BasicBlock *CodeReplacer,
+ DenseMap<BasicBlock *, BlockFrequency> &ExitWeights,
+ BranchProbabilityInfo *BPI) {
+ using Distribution = BlockFrequencyInfoImplBase::Distribution;
+ using BlockNode = BlockFrequencyInfoImplBase::BlockNode;
+
+ // Update the branch weights for the exit block.
+ Instruction *TI = CodeReplacer->getTerminator();
+ SmallVector<unsigned, 8> BranchWeights(TI->getNumSuccessors(), 0);
+
+ // Block Frequency distribution with dummy node.
+ Distribution BranchDist;
+
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ TI->getNumSuccessors(), BranchProbability::getUnknown());
+
+ // Add each of the frequencies of the successors.
+ for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
+ BlockNode ExitNode(i);
+ uint64_t ExitFreq = ExitWeights[TI->getSuccessor(i)].getFrequency();
+ if (ExitFreq != 0)
+ BranchDist.addExit(ExitNode, ExitFreq);
+ else
+ EdgeProbabilities[i] = BranchProbability::getZero();
+ }
+
+ // Check for no total weight.
+ if (BranchDist.Total == 0) {
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
+ return;
+ }
+
+ // Normalize the distribution so that the weights can fit in unsigned integers.
+ BranchDist.normalize();
+
+ // Create normalized branch weights and set the metadata.
+ for (unsigned I = 0, E = BranchDist.Weights.size(); I < E; ++I) {
+ const auto &Weight = BranchDist.Weights[I];
+
+ // Get the weight and update the current BFI.
+ BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
+ BranchProbability BP(Weight.Amount, BranchDist.Total);
+ EdgeProbabilities[Weight.TargetNode.Index] = BP;
+ }
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
+ TI->setMetadata(
+ LLVMContext::MD_prof,
+ MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
+}
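// Standalone arithmetic sketch (not part of the original source; it does not
// use the BlockFrequencyInfoImplBase::Distribution class and assumes <vector>
// and <cstdint>): the normalization above boils down to scaling raw exit
// frequencies into weights over a common total. The function name
// normalizeExitFreqs and the scale constant are illustrative.
static std::vector<uint32_t>
normalizeExitFreqs(const std::vector<uint64_t> &Freqs) {
  const uint64_t Scale = 1u << 20; // arbitrary fixed-point scale for the sketch
  uint64_t Total = 0;
  for (uint64_t F : Freqs)
    Total += F;
  std::vector<uint32_t> Weights(Freqs.size(), 0);
  if (Total == 0)
    return Weights; // mirrors the "no total weight" early-out above
  for (size_t I = 0, E = Freqs.size(); I != E; ++I)
    // Assumes Freqs[I] * Scale fits in 64 bits; a production version would
    // guard against overflow.
    Weights[I] = static_cast<uint32_t>((Freqs[I] * Scale) / Total);
  return Weights;
}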
+
+/// Erase debug info intrinsics which refer to values in \p F but are not
+/// themselves located in \p F.
+static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
+ for (Instruction &I : instructions(F)) {
+ SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ for (DbgVariableIntrinsic *DVI : DbgUsers)
+ if (DVI->getFunction() != &F)
+ DVI->eraseFromParent();
+ }
+}
+
+/// Fix up the debug info in the old and new functions by pointing line
+/// locations and debug intrinsics to the new subprogram scope, and by deleting
+/// intrinsics which point to values outside of the new function.
+static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
+ CallInst &TheCall) {
+ DISubprogram *OldSP = OldFunc.getSubprogram();
+ LLVMContext &Ctx = OldFunc.getContext();
+
+ if (!OldSP) {
+ // Erase any debug info the new function contains.
+ stripDebugInfo(NewFunc);
+ // Make sure the old function doesn't contain any non-local metadata refs.
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+ return;
+ }
+
+ // Create a subprogram for the new function. Leave out a description of the
+ // function arguments, as the parameters don't correspond to anything at the
+ // source level.
+ assert(OldSP->getUnit() && "Missing compile unit for subprogram");
DIBuilder DIB(*OldFunc.getParent(), /*AllowUnresolved=*/false,
- OldSP->getUnit());
- auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
- DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
- DISubprogram::SPFlagOptimized |
- DISubprogram::SPFlagLocalToUnit;
- auto NewSP = DIB.createFunction(
- OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(),
- /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags);
- NewFunc.setSubprogram(NewSP);
-
- // Debug intrinsics in the new function need to be updated in one of two
- // ways:
- // 1) They need to be deleted, because they describe a value in the old
- // function.
- // 2) They need to point to fresh metadata, e.g. because they currently
- // point to a variable in the wrong scope.
- SmallDenseMap<DINode *, DINode *> RemappedMetadata;
- SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
- for (Instruction &I : instructions(NewFunc)) {
- auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
- if (!DII)
- continue;
-
- // Point the intrinsic to a fresh label within the new function.
- if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
- DILabel *OldLabel = DLI->getLabel();
- DINode *&NewLabel = RemappedMetadata[OldLabel];
- if (!NewLabel)
- NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(),
- OldLabel->getFile(), OldLabel->getLine());
- DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
- continue;
- }
-
- // If the location isn't a constant or an instruction, delete the
- // intrinsic.
- auto *DVI = cast<DbgVariableIntrinsic>(DII);
- Value *Location = DVI->getVariableLocation();
- if (!Location ||
- (!isa<Constant>(Location) && !isa<Instruction>(Location))) {
- DebugIntrinsicsToDelete.push_back(DVI);
- continue;
- }
-
- // If the variable location is an instruction but isn't in the new
- // function, delete the intrinsic.
- Instruction *LocationInst = dyn_cast<Instruction>(Location);
- if (LocationInst && LocationInst->getFunction() != &NewFunc) {
- DebugIntrinsicsToDelete.push_back(DVI);
- continue;
- }
-
- // Point the intrinsic to a fresh variable within the new function.
- DILocalVariable *OldVar = DVI->getVariable();
- DINode *&NewVar = RemappedMetadata[OldVar];
- if (!NewVar)
- NewVar = DIB.createAutoVariable(
- NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(),
- OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero,
- OldVar->getAlignInBits());
- DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar));
- }
- for (auto *DII : DebugIntrinsicsToDelete)
- DII->eraseFromParent();
- DIB.finalizeSubprogram(NewSP);
-
- // Fix up the scope information attached to the line locations in the new
- // function.
- for (Instruction &I : instructions(NewFunc)) {
- if (const DebugLoc &DL = I.getDebugLoc())
+ OldSP->getUnit());
+ auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+ DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
+ DISubprogram::SPFlagOptimized |
+ DISubprogram::SPFlagLocalToUnit;
+ auto NewSP = DIB.createFunction(
+ OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(),
+ /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags);
+ NewFunc.setSubprogram(NewSP);
+
+ // Debug intrinsics in the new function need to be updated in one of two
+ // ways:
+ // 1) They need to be deleted, because they describe a value in the old
+ // function.
+ // 2) They need to point to fresh metadata, e.g. because they currently
+ // point to a variable in the wrong scope.
+ SmallDenseMap<DINode *, DINode *> RemappedMetadata;
+ SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
+ for (Instruction &I : instructions(NewFunc)) {
+ auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
+ if (!DII)
+ continue;
+
+ // Point the intrinsic to a fresh label within the new function.
+ if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
+ DILabel *OldLabel = DLI->getLabel();
+ DINode *&NewLabel = RemappedMetadata[OldLabel];
+ if (!NewLabel)
+ NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(),
+ OldLabel->getFile(), OldLabel->getLine());
+ DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
+ continue;
+ }
+
+ // If the location isn't a constant or an instruction, delete the
+ // intrinsic.
+ auto *DVI = cast<DbgVariableIntrinsic>(DII);
+ Value *Location = DVI->getVariableLocation();
+ if (!Location ||
+ (!isa<Constant>(Location) && !isa<Instruction>(Location))) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // If the variable location is an instruction but isn't in the new
+ // function, delete the intrinsic.
+ Instruction *LocationInst = dyn_cast<Instruction>(Location);
+ if (LocationInst && LocationInst->getFunction() != &NewFunc) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // Point the intrinsic to a fresh variable within the new function.
+ DILocalVariable *OldVar = DVI->getVariable();
+ DINode *&NewVar = RemappedMetadata[OldVar];
+ if (!NewVar)
+ NewVar = DIB.createAutoVariable(
+ NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(),
+ OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero,
+ OldVar->getAlignInBits());
+ DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar));
+ }
+ for (auto *DII : DebugIntrinsicsToDelete)
+ DII->eraseFromParent();
+ DIB.finalizeSubprogram(NewSP);
+
+ // Fix up the scope information attached to the line locations in the new
+ // function.
+ for (Instruction &I : instructions(NewFunc)) {
+ if (const DebugLoc &DL = I.getDebugLoc())
I.setDebugLoc(DILocation::get(Ctx, DL.getLine(), DL.getCol(), NewSP));
-
- // Loop info metadata may contain line locations. Fix them up.
- auto updateLoopInfoLoc = [&Ctx,
- NewSP](const DILocation &Loc) -> DILocation * {
- return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP,
- nullptr);
- };
- updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
- }
- if (!TheCall.getDebugLoc())
+
+ // Loop info metadata may contain line locations. Fix them up.
+ auto updateLoopInfoLoc = [&Ctx,
+ NewSP](const DILocation &Loc) -> DILocation * {
+ return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP,
+ nullptr);
+ };
+ updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
+ }
+ if (!TheCall.getDebugLoc())
TheCall.setDebugLoc(DILocation::get(Ctx, 0, 0, OldSP));
-
- eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
-}
-
-Function *
-CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
- if (!isEligible())
- return nullptr;
-
- // Assumption: this is a single-entry code region, and the header is the first
- // block in the region.
- BasicBlock *header = *Blocks.begin();
- Function *oldFunction = header->getParent();
-
- // Calculate the entry frequency of the new function before we change the root
- // block.
- BlockFrequency EntryFreq;
- if (BFI) {
- assert(BPI && "Both BPI and BFI are required to preserve profile info");
- for (BasicBlock *Pred : predecessors(header)) {
- if (Blocks.count(Pred))
- continue;
- EntryFreq +=
- BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
- }
- }
-
- // Remove @llvm.assume calls that will be moved to the new function from the
- // old function's assumption cache.
- for (BasicBlock *Block : Blocks) {
- for (auto It = Block->begin(), End = Block->end(); It != End;) {
- Instruction *I = &*It;
- ++It;
-
- if (match(I, m_Intrinsic<Intrinsic::assume>())) {
- if (AC)
- AC->unregisterAssumption(cast<CallInst>(I));
- I->eraseFromParent();
- }
- }
- }
-
- // If we have any return instructions in the region, split those blocks so
- // that the return is not in the region.
- splitReturnBlocks();
-
- // Calculate the exit blocks for the extracted region and the total exit
- // weights for each of those blocks.
- DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
- SmallPtrSet<BasicBlock *, 1> ExitBlocks;
- for (BasicBlock *Block : Blocks) {
- for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
- ++SI) {
- if (!Blocks.count(*SI)) {
- // Update the branch weight for this successor.
- if (BFI) {
- BlockFrequency &BF = ExitWeights[*SI];
- BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
- }
- ExitBlocks.insert(*SI);
- }
- }
- }
- NumExitBlocks = ExitBlocks.size();
-
- // If we have to split PHI nodes of the entry or exit blocks, do so now.
- severSplitPHINodesOfEntry(header);
- severSplitPHINodesOfExits(ExitBlocks);
-
-  // This takes the place of the original loop
- BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
- "codeRepl", oldFunction,
- header);
-
- // The new function needs a root node because other nodes can branch to the
- // head of the region, but the entry node of a function cannot have preds.
- BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(),
- "newFuncRoot");
- auto *BranchI = BranchInst::Create(header);
- // If the original function has debug info, we have to add a debug location
- // to the new branch instruction from the artificial entry block.
- // We use the debug location of the first instruction in the extracted
- // blocks, as there is no other equivalent line in the source code.
- if (oldFunction->getSubprogram()) {
- any_of(Blocks, [&BranchI](const BasicBlock *BB) {
- return any_of(*BB, [&BranchI](const Instruction &I) {
- if (!I.getDebugLoc())
- return false;
- BranchI->setDebugLoc(I.getDebugLoc());
- return true;
- });
- });
- }
- newFuncRoot->getInstList().push_back(BranchI);
-
- ValueSet inputs, outputs, SinkingCands, HoistingCands;
- BasicBlock *CommonExit = nullptr;
- findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
- assert(HoistingCands.empty() || CommonExit);
-
- // Find inputs to, outputs from the code region.
- findInputsOutputs(inputs, outputs, SinkingCands);
-
- // Now sink all instructions which only have non-phi uses inside the region.
- // Group the allocas at the start of the block, so that any bitcast uses of
- // the allocas are well-defined.
- AllocaInst *FirstSunkAlloca = nullptr;
- for (auto *II : SinkingCands) {
- if (auto *AI = dyn_cast<AllocaInst>(II)) {
- AI->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt());
- if (!FirstSunkAlloca)
- FirstSunkAlloca = AI;
- }
- }
- assert((SinkingCands.empty() || FirstSunkAlloca) &&
- "Did not expect a sink candidate without any allocas");
- for (auto *II : SinkingCands) {
- if (!isa<AllocaInst>(II)) {
- cast<Instruction>(II)->moveAfter(FirstSunkAlloca);
- }
- }
-
- if (!HoistingCands.empty()) {
- auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
- Instruction *TI = HoistToBlock->getTerminator();
- for (auto *II : HoistingCands)
- cast<Instruction>(II)->moveBefore(TI);
- }
-
- // Collect objects which are inputs to the extraction region and also
- // referenced by lifetime start markers within it. The effects of these
- // markers must be replicated in the calling function to prevent the stack
- // coloring pass from merging slots which store input objects.
- ValueSet LifetimesStart;
- eraseLifetimeMarkersOnInputs(Blocks, SinkingCands, LifetimesStart);
-
- // Construct new function based on inputs/outputs & add allocas for all defs.
- Function *newFunction =
- constructFunction(inputs, outputs, header, newFuncRoot, codeReplacer,
- oldFunction, oldFunction->getParent());
-
- // Update the entry count of the function.
- if (BFI) {
- auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
- if (Count.hasValue())
- newFunction->setEntryCount(
- ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
- BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
- }
-
- CallInst *TheCall =
- emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
-
- moveCodeToFunction(newFunction);
-
- // Replicate the effects of any lifetime start/end markers which referenced
- // input objects in the extraction region by placing markers around the call.
- insertLifetimeMarkersSurroundingCall(
- oldFunction->getParent(), LifetimesStart.getArrayRef(), {}, TheCall);
-
- // Propagate personality info to the new function if there is one.
- if (oldFunction->hasPersonalityFn())
- newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
-
- // Update the branch weights for the exit block.
- if (BFI && NumExitBlocks > 1)
- calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
-
- // Loop over all of the PHI nodes in the header and exit blocks, and change
- // any references to the old incoming edge to be the new incoming edge.
- for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (!Blocks.count(PN->getIncomingBlock(i)))
- PN->setIncomingBlock(i, newFuncRoot);
- }
-
- for (BasicBlock *ExitBB : ExitBlocks)
- for (PHINode &PN : ExitBB->phis()) {
- Value *IncomingCodeReplacerVal = nullptr;
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- // Ignore incoming values from outside of the extracted region.
- if (!Blocks.count(PN.getIncomingBlock(i)))
- continue;
-
- // Ensure that there is only one incoming value from codeReplacer.
- if (!IncomingCodeReplacerVal) {
- PN.setIncomingBlock(i, codeReplacer);
- IncomingCodeReplacerVal = PN.getIncomingValue(i);
- } else
- assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
-                 "PHI has two incompatible incoming values from codeRepl");
- }
- }
-
- fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
-
- // Mark the new function `noreturn` if applicable. Terminators which resume
- // exception propagation are treated as returning instructions. This is to
- // avoid inserting traps after calls to outlined functions which unwind.
- bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
- const Instruction *Term = BB.getTerminator();
- return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
- });
- if (doesNotReturn)
- newFunction->setDoesNotReturn();
-
- LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
- newFunction->dump();
- report_fatal_error("verification of newFunction failed!");
- });
- LLVM_DEBUG(if (verifyFunction(*oldFunction))
- report_fatal_error("verification of oldFunction failed!"));
- LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC))
-                 report_fatal_error("Stale Assumption cache for old Function!"));
- return newFunction;
-}
-
-bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
- const Function &NewFunc,
- AssumptionCache *AC) {
- for (auto AssumeVH : AC->assumptions()) {
+
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+}
+
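The RemappedMetadata map above implements a look-up-or-create cache, so each old variable or label gets exactly one replacement node in the new scope. Here is a small standalone sketch of that memoization pattern with an invented Node type, not the real DILocalVariable/DILabel classes.

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for a debug metadata node; the type and field names are invented.
struct Node {
  std::string Name;
  std::string Scope;
};

int main() {
  Node VarX{"x", "old.fn"}, VarY{"y", "old.fn"};
  // Two intrinsics refer to "x" and one to "y", mirroring repeated users.
  std::vector<Node *> OldRefs = {&VarX, &VarY, &VarX};

  std::unordered_map<Node *, std::unique_ptr<Node>> Remapped;
  std::vector<Node *> NewRefs;
  for (Node *Old : OldRefs) {
    // Create the replacement node lazily, once per distinct old node, the
    // same memoization pattern as the RemappedMetadata map.
    std::unique_ptr<Node> &Slot = Remapped[Old];
    if (!Slot)
      Slot = std::make_unique<Node>(Node{Old->Name, "new.fn"});
    NewRefs.push_back(Slot.get());
  }

  for (Node *N : NewRefs)
    std::cout << N->Name << " rescoped to " << N->Scope << "\n";
  // Both users of "x" now share one freshly created node in the new scope.
  return 0;
}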
+Function *
+CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
+ if (!isEligible())
+ return nullptr;
+
+ // Assumption: this is a single-entry code region, and the header is the first
+ // block in the region.
+ BasicBlock *header = *Blocks.begin();
+ Function *oldFunction = header->getParent();
+
+ // Calculate the entry frequency of the new function before we change the root
+ // block.
+ BlockFrequency EntryFreq;
+ if (BFI) {
+ assert(BPI && "Both BPI and BFI are required to preserve profile info");
+ for (BasicBlock *Pred : predecessors(header)) {
+ if (Blocks.count(Pred))
+ continue;
+ EntryFreq +=
+ BFI->getBlockFreq(Pred) * BPI->getEdgeProbability(Pred, header);
+ }
+ }
+
+ // Remove @llvm.assume calls that will be moved to the new function from the
+ // old function's assumption cache.
+ for (BasicBlock *Block : Blocks) {
+ for (auto It = Block->begin(), End = Block->end(); It != End;) {
+ Instruction *I = &*It;
+ ++It;
+
+ if (match(I, m_Intrinsic<Intrinsic::assume>())) {
+ if (AC)
+ AC->unregisterAssumption(cast<CallInst>(I));
+ I->eraseFromParent();
+ }
+ }
+ }
+
+ // If we have any return instructions in the region, split those blocks so
+ // that the return is not in the region.
+ splitReturnBlocks();
+
+ // Calculate the exit blocks for the extracted region and the total exit
+ // weights for each of those blocks.
+ DenseMap<BasicBlock *, BlockFrequency> ExitWeights;
+ SmallPtrSet<BasicBlock *, 1> ExitBlocks;
+ for (BasicBlock *Block : Blocks) {
+ for (succ_iterator SI = succ_begin(Block), SE = succ_end(Block); SI != SE;
+ ++SI) {
+ if (!Blocks.count(*SI)) {
+ // Update the branch weight for this successor.
+ if (BFI) {
+ BlockFrequency &BF = ExitWeights[*SI];
+ BF += BFI->getBlockFreq(Block) * BPI->getEdgeProbability(Block, *SI);
+ }
+ ExitBlocks.insert(*SI);
+ }
+ }
+ }
+ NumExitBlocks = ExitBlocks.size();
+
+ // If we have to split PHI nodes of the entry or exit blocks, do so now.
+ severSplitPHINodesOfEntry(header);
+ severSplitPHINodesOfExits(ExitBlocks);
+
+  // This takes the place of the original loop
+ BasicBlock *codeReplacer = BasicBlock::Create(header->getContext(),
+ "codeRepl", oldFunction,
+ header);
+
+ // The new function needs a root node because other nodes can branch to the
+ // head of the region, but the entry node of a function cannot have preds.
+ BasicBlock *newFuncRoot = BasicBlock::Create(header->getContext(),
+ "newFuncRoot");
+ auto *BranchI = BranchInst::Create(header);
+ // If the original function has debug info, we have to add a debug location
+ // to the new branch instruction from the artificial entry block.
+ // We use the debug location of the first instruction in the extracted
+ // blocks, as there is no other equivalent line in the source code.
+ if (oldFunction->getSubprogram()) {
+ any_of(Blocks, [&BranchI](const BasicBlock *BB) {
+ return any_of(*BB, [&BranchI](const Instruction &I) {
+ if (!I.getDebugLoc())
+ return false;
+ BranchI->setDebugLoc(I.getDebugLoc());
+ return true;
+ });
+ });
+ }
+ newFuncRoot->getInstList().push_back(BranchI);
+
+ ValueSet inputs, outputs, SinkingCands, HoistingCands;
+ BasicBlock *CommonExit = nullptr;
+ findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
+ assert(HoistingCands.empty() || CommonExit);
+
+ // Find inputs to, outputs from the code region.
+ findInputsOutputs(inputs, outputs, SinkingCands);
+
+ // Now sink all instructions which only have non-phi uses inside the region.
+ // Group the allocas at the start of the block, so that any bitcast uses of
+ // the allocas are well-defined.
+ AllocaInst *FirstSunkAlloca = nullptr;
+ for (auto *II : SinkingCands) {
+ if (auto *AI = dyn_cast<AllocaInst>(II)) {
+ AI->moveBefore(*newFuncRoot, newFuncRoot->getFirstInsertionPt());
+ if (!FirstSunkAlloca)
+ FirstSunkAlloca = AI;
+ }
+ }
+ assert((SinkingCands.empty() || FirstSunkAlloca) &&
+ "Did not expect a sink candidate without any allocas");
+ for (auto *II : SinkingCands) {
+ if (!isa<AllocaInst>(II)) {
+ cast<Instruction>(II)->moveAfter(FirstSunkAlloca);
+ }
+ }
+
+ if (!HoistingCands.empty()) {
+ auto *HoistToBlock = findOrCreateBlockForHoisting(CommonExit);
+ Instruction *TI = HoistToBlock->getTerminator();
+ for (auto *II : HoistingCands)
+ cast<Instruction>(II)->moveBefore(TI);
+ }
+
+ // Collect objects which are inputs to the extraction region and also
+ // referenced by lifetime start markers within it. The effects of these
+ // markers must be replicated in the calling function to prevent the stack
+ // coloring pass from merging slots which store input objects.
+ ValueSet LifetimesStart;
+ eraseLifetimeMarkersOnInputs(Blocks, SinkingCands, LifetimesStart);
+
+ // Construct new function based on inputs/outputs & add allocas for all defs.
+ Function *newFunction =
+ constructFunction(inputs, outputs, header, newFuncRoot, codeReplacer,
+ oldFunction, oldFunction->getParent());
+
+ // Update the entry count of the function.
+ if (BFI) {
+ auto Count = BFI->getProfileCountFromFreq(EntryFreq.getFrequency());
+ if (Count.hasValue())
+ newFunction->setEntryCount(
+ ProfileCount(Count.getValue(), Function::PCT_Real)); // FIXME
+ BFI->setBlockFreq(codeReplacer, EntryFreq.getFrequency());
+ }
+
+ CallInst *TheCall =
+ emitCallAndSwitchStatement(newFunction, codeReplacer, inputs, outputs);
+
+ moveCodeToFunction(newFunction);
+
+ // Replicate the effects of any lifetime start/end markers which referenced
+ // input objects in the extraction region by placing markers around the call.
+ insertLifetimeMarkersSurroundingCall(
+ oldFunction->getParent(), LifetimesStart.getArrayRef(), {}, TheCall);
+
+ // Propagate personality info to the new function if there is one.
+ if (oldFunction->hasPersonalityFn())
+ newFunction->setPersonalityFn(oldFunction->getPersonalityFn());
+
+ // Update the branch weights for the exit block.
+ if (BFI && NumExitBlocks > 1)
+ calculateNewCallTerminatorWeights(codeReplacer, ExitWeights, BPI);
+
+ // Loop over all of the PHI nodes in the header and exit blocks, and change
+ // any references to the old incoming edge to be the new incoming edge.
+ for (BasicBlock::iterator I = header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (!Blocks.count(PN->getIncomingBlock(i)))
+ PN->setIncomingBlock(i, newFuncRoot);
+ }
+
+ for (BasicBlock *ExitBB : ExitBlocks)
+ for (PHINode &PN : ExitBB->phis()) {
+ Value *IncomingCodeReplacerVal = nullptr;
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+ // Ignore incoming values from outside of the extracted region.
+ if (!Blocks.count(PN.getIncomingBlock(i)))
+ continue;
+
+ // Ensure that there is only one incoming value from codeReplacer.
+ if (!IncomingCodeReplacerVal) {
+ PN.setIncomingBlock(i, codeReplacer);
+ IncomingCodeReplacerVal = PN.getIncomingValue(i);
+ } else
+ assert(IncomingCodeReplacerVal == PN.getIncomingValue(i) &&
+                 "PHI has two incompatible incoming values from codeRepl");
+ }
+ }
+
+ fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
+
+ // Mark the new function `noreturn` if applicable. Terminators which resume
+ // exception propagation are treated as returning instructions. This is to
+ // avoid inserting traps after calls to outlined functions which unwind.
+ bool doesNotReturn = none_of(*newFunction, [](const BasicBlock &BB) {
+ const Instruction *Term = BB.getTerminator();
+ return isa<ReturnInst>(Term) || isa<ResumeInst>(Term);
+ });
+ if (doesNotReturn)
+ newFunction->setDoesNotReturn();
+
+ LLVM_DEBUG(if (verifyFunction(*newFunction, &errs())) {
+ newFunction->dump();
+ report_fatal_error("verification of newFunction failed!");
+ });
+ LLVM_DEBUG(if (verifyFunction(*oldFunction))
+ report_fatal_error("verification of oldFunction failed!"));
+ LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC))
+                 report_fatal_error("Stale Assumption cache for old Function!"));
+ return newFunction;
+}
+
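The entry frequency computed at the top of extractCodeRegion is the sum, over predecessors outside the region, of the predecessor's block frequency times the probability of its edge into the header. A standalone numeric sketch with made-up frequencies and probabilities follows; PredInfo and its fields are invented stand-ins for the BFI/BPI queries.

#include <iostream>
#include <string>
#include <vector>

// BlockFreq plays the role of BFI->getBlockFreq(Pred) and EdgeProb the role
// of BPI->getEdgeProbability(Pred, header); both values are fabricated.
struct PredInfo {
  std::string Name;
  bool InRegion;
  double BlockFreq;
  double EdgeProb;
};

int main() {
  std::vector<PredInfo> Preds = {
      {"entry", false, 100.0, 0.25},
      {"loop.latch", true, 900.0, 1.00}, // inside the extracted region: skipped
      {"if.else", false, 40.0, 0.50},
  };

  double EntryFreq = 0.0;
  for (const PredInfo &P : Preds) {
    if (P.InRegion)
      continue; // only edges coming from outside the region feed the call
    EntryFreq += P.BlockFreq * P.EdgeProb;
  }

  // 100 * 0.25 + 40 * 0.5 = 45
  std::cout << "entry frequency of the extracted function ~= " << EntryFreq
            << "\n";
  return 0;
}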
+bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
+ const Function &NewFunc,
+ AssumptionCache *AC) {
+ for (auto AssumeVH : AC->assumptions()) {
auto *I = dyn_cast_or_null<CallInst>(AssumeVH);
- if (!I)
- continue;
-
- // There shouldn't be any llvm.assume intrinsics in the new function.
- if (I->getFunction() != &OldFunc)
- return true;
-
- // There shouldn't be any stale affected values in the assumption cache
- // that were previously in the old function, but that have now been moved
- // to the new function.
- for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) {
+ if (!I)
+ continue;
+
+ // There shouldn't be any llvm.assume intrinsics in the new function.
+ if (I->getFunction() != &OldFunc)
+ return true;
+
+ // There shouldn't be any stale affected values in the assumption cache
+ // that were previously in the old function, but that have now been moved
+ // to the new function.
+ for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) {
auto *AffectedCI = dyn_cast_or_null<CallInst>(AffectedValVH);
- if (!AffectedCI)
- continue;
- if (AffectedCI->getFunction() != &OldFunc)
- return true;
+ if (!AffectedCI)
+ continue;
+ if (AffectedCI->getFunction() != &OldFunc)
+ return true;
auto *AssumedInst = cast<Instruction>(AffectedCI->getOperand(0));
- if (AssumedInst->getFunction() != &OldFunc)
- return true;
- }
- }
- return false;
-}
+ if (AssumedInst->getFunction() != &OldFunc)
+ return true;
+ }
+ }
+ return false;
+}
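
verifyAssumptionCache reduces to an ownership check over cached handles: every surviving cached call, and everything it affects, must still live in the old function. A minimal sketch of that shape with invented types, not the real AssumptionCache API:

#include <iostream>
#include <string>
#include <vector>

// A cached call plus the name of the function it currently lives in.
struct CachedCall {
  std::string Owner;
};

// Any surviving entry owned by a function other than the expected one means
// the cache has gone stale, mirroring the checks in verifyAssumptionCache.
static bool hasStaleEntries(const std::vector<const CachedCall *> &Cache,
                            const std::string &ExpectedOwner) {
  for (const CachedCall *C : Cache) {
    if (!C)
      continue; // dropped (null) handles are harmless, as in the code above
    if (C->Owner != ExpectedOwner)
      return true;
  }
  return false;
}

int main() {
  CachedCall InOld{"old.fn"}, Moved{"extracted.fn"};
  std::vector<const CachedCall *> Cache = {&InOld, nullptr, &Moved};
  std::cout << (hasStaleEntries(Cache, "old.fn") ? "stale" : "ok") << "\n";
  return 0;
}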
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
index 0e2f64b1d4..ce982c7403 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CodeMoverUtils.cpp
@@ -1,364 +1,364 @@
-//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs movements on basic blocks and instructions
-// contained within a function.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CodeMoverUtils.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/PostDominators.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Dominators.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "codemover-utils"
-
-STATISTIC(HasDependences,
-          "Cannot move across instructions that have memory dependences");
-STATISTIC(MayThrowException, "Cannot move across instructions that may throw");
-STATISTIC(NotControlFlowEquivalent,
- "Instructions are not control flow equivalent");
-STATISTIC(NotMovedPHINode, "Movement of PHINodes is not supported");
-STATISTIC(NotMovedTerminator, "Movement of Terminators is not supported");
-
-namespace {
-/// Represent a control condition. A control condition is a condition of a
-/// terminator to decide which successors to execute. The pointer field
-/// represents the address of the condition of the terminator. The integer field
-/// is a bool; it is true when the basic block is executed when V is true. For
-/// example, in `br %cond, bb0, bb1`, %cond is a control condition of bb0 with
-/// the integer field equal to true, while %cond is a control condition of bb1
-/// with the integer field equal to false.
-using ControlCondition = PointerIntPair<Value *, 1, bool>;
-#ifndef NDEBUG
-raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) {
- OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false")
- << "]";
- return OS;
-}
-#endif
-
-/// Represent a set of control conditions required to execute ToBB from FromBB.
-class ControlConditions {
- using ConditionVectorTy = SmallVector<ControlCondition, 6>;
-
- /// A SmallVector of control conditions.
- ConditionVectorTy Conditions;
-
-public:
- /// Return a ControlConditions which stores all conditions required to execute
- /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the
- /// number of conditions to collect. Return None if not all conditions are
- /// collected successfully, or we hit the limit.
- static const Optional<ControlConditions>
- collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator,
- const DominatorTree &DT,
- const PostDominatorTree &PDT,
- unsigned MaxLookup = 6);
-
-  /// Return true if there are no control conditions required to execute ToBB
- /// from FromBB.
- bool isUnconditional() const { return Conditions.empty(); }
-
- /// Return a constant reference of Conditions.
- const ConditionVectorTy &getControlConditions() const { return Conditions; }
-
-  /// Add \p C to Conditions if no equivalent condition is already present.
-  /// Return true if it was inserted.
- bool addControlCondition(ControlCondition C);
-
- /// Return true if for all control conditions in Conditions, there exists an
- /// equivalent control condition in \p Other.Conditions.
- bool isEquivalent(const ControlConditions &Other) const;
-
- /// Return true if \p C1 and \p C2 are equivalent.
- static bool isEquivalent(const ControlCondition &C1,
- const ControlCondition &C2);
-
-private:
- ControlConditions() = default;
-
- static bool isEquivalent(const Value &V1, const Value &V2);
- static bool isInverse(const Value &V1, const Value &V2);
-};
-} // namespace
-
-static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA,
- const Instruction *InstB) {
- // Use ordered basic block in case the 2 instructions are in the same
- // block.
- if (InstA->getParent() == InstB->getParent())
- return InstA->comesBefore(InstB);
-
- DomTreeNode *DA = DT->getNode(InstA->getParent());
- DomTreeNode *DB = DT->getNode(InstB->getParent());
- return DA->getLevel() < DB->getLevel();
-}
-
-const Optional<ControlConditions> ControlConditions::collectControlConditions(
- const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT,
- const PostDominatorTree &PDT, unsigned MaxLookup) {
- assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB");
-
- ControlConditions Conditions;
- unsigned NumConditions = 0;
-
-  // BB is executed unconditionally from itself.
- if (&Dominator == &BB)
- return Conditions;
-
- const BasicBlock *CurBlock = &BB;
- // Walk up the dominator tree from the associated DT node for BB to the
- // associated DT node for Dominator.
- do {
- assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock");
- BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock();
- assert(DT.dominates(&Dominator, IDom) &&
- "Expecting Dominator to dominate IDom");
-
- // Limitation: can only handle branch instruction currently.
- const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator());
- if (!BI)
- return None;
-
- bool Inserted = false;
- if (PDT.dominates(CurBlock, IDom)) {
- LLVM_DEBUG(dbgs() << CurBlock->getName()
- << " is executed unconditionally from "
- << IDom->getName() << "\n");
- } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) {
- LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
- << *BI->getCondition() << "\" is true from "
- << IDom->getName() << "\n");
- Inserted = Conditions.addControlCondition(
- ControlCondition(BI->getCondition(), true));
- } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) {
- LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
- << *BI->getCondition() << "\" is false from "
- << IDom->getName() << "\n");
- Inserted = Conditions.addControlCondition(
- ControlCondition(BI->getCondition(), false));
- } else
- return None;
-
- if (Inserted)
- ++NumConditions;
-
- if (MaxLookup != 0 && NumConditions > MaxLookup)
- return None;
-
- CurBlock = IDom;
- } while (CurBlock != &Dominator);
-
- return Conditions;
-}
-
-bool ControlConditions::addControlCondition(ControlCondition C) {
- bool Inserted = false;
- if (none_of(Conditions, [&](ControlCondition &Exists) {
- return ControlConditions::isEquivalent(C, Exists);
- })) {
- Conditions.push_back(C);
- Inserted = true;
- }
-
- LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n");
- return Inserted;
-}
-
-bool ControlConditions::isEquivalent(const ControlConditions &Other) const {
- if (Conditions.empty() && Other.Conditions.empty())
- return true;
-
- if (Conditions.size() != Other.Conditions.size())
- return false;
-
- return all_of(Conditions, [&](const ControlCondition &C) {
- return any_of(Other.Conditions, [&](const ControlCondition &OtherC) {
- return ControlConditions::isEquivalent(C, OtherC);
- });
- });
-}
-
-bool ControlConditions::isEquivalent(const ControlCondition &C1,
- const ControlCondition &C2) {
- if (C1.getInt() == C2.getInt()) {
- if (isEquivalent(*C1.getPointer(), *C2.getPointer()))
- return true;
- } else if (isInverse(*C1.getPointer(), *C2.getPointer()))
- return true;
-
- return false;
-}
-
-// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between
-// Values.
-// Currently, isEquivalent relies on other passes to ensure equivalent
-// conditions have the same value, e.g. GVN.
-bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) {
- return &V1 == &V2;
-}
-
-bool ControlConditions::isInverse(const Value &V1, const Value &V2) {
- if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1))
- if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) {
- if (Cmp1->getPredicate() == Cmp2->getInversePredicate() &&
- Cmp1->getOperand(0) == Cmp2->getOperand(0) &&
- Cmp1->getOperand(1) == Cmp2->getOperand(1))
- return true;
-
- if (Cmp1->getPredicate() ==
- CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) &&
- Cmp1->getOperand(0) == Cmp2->getOperand(1) &&
- Cmp1->getOperand(1) == Cmp2->getOperand(0))
- return true;
- }
- return false;
-}
-
-bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT);
-}
-
-bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
- const DominatorTree &DT,
- const PostDominatorTree &PDT) {
- if (&BB0 == &BB1)
- return true;
-
- if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
- (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)))
- return true;
-
- // If the set of conditions required to execute BB0 and BB1 from their common
- // dominator are the same, then BB0 and BB1 are control flow equivalent.
- const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1);
- LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName()
- << " and " << BB1.getName() << " is "
- << CommonDominator->getName() << "\n");
-
- const Optional<ControlConditions> BB0Conditions =
- ControlConditions::collectControlConditions(BB0, *CommonDominator, DT,
- PDT);
- if (BB0Conditions == None)
- return false;
-
- const Optional<ControlConditions> BB1Conditions =
- ControlConditions::collectControlConditions(BB1, *CommonDominator, DT,
- PDT);
- if (BB1Conditions == None)
- return false;
-
- return BB0Conditions->isEquivalent(*BB1Conditions);
-}
-
-static bool reportInvalidCandidate(const Instruction &I,
- llvm::Statistic &Stat) {
- ++Stat;
- LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". "
- << Stat.getDesc());
- return false;
-}
-
-/// Collect all instructions in between \p StartInst and \p EndInst, and store
-/// them in \p InBetweenInsts.
-static void
-collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
- SmallPtrSetImpl<Instruction *> &InBetweenInsts) {
- assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty");
-
- /// Get the next instructions of \p I, and push them to \p WorkList.
- auto getNextInsts = [](Instruction &I,
- SmallPtrSetImpl<Instruction *> &WorkList) {
- if (Instruction *NextInst = I.getNextNode())
- WorkList.insert(NextInst);
- else {
- assert(I.isTerminator() && "Expecting a terminator instruction");
- for (BasicBlock *Succ : successors(&I))
- WorkList.insert(&Succ->front());
- }
- };
-
- SmallPtrSet<Instruction *, 10> WorkList;
- getNextInsts(StartInst, WorkList);
- while (!WorkList.empty()) {
- Instruction *CurInst = *WorkList.begin();
- WorkList.erase(CurInst);
-
- if (CurInst == &EndInst)
- continue;
-
- if (!InBetweenInsts.insert(CurInst).second)
- continue;
-
- getNextInsts(*CurInst, WorkList);
- }
-}
-
-bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
- DominatorTree &DT, const PostDominatorTree *PDT,
- DependenceInfo *DI) {
- // Skip tests when we don't have PDT or DI
- if (!PDT || !DI)
- return false;
-
- // Cannot move itself before itself.
- if (&I == &InsertPoint)
- return false;
-
- // Not moved.
- if (I.getNextNode() == &InsertPoint)
- return true;
-
- if (isa<PHINode>(I) || isa<PHINode>(InsertPoint))
- return reportInvalidCandidate(I, NotMovedPHINode);
-
- if (I.isTerminator())
- return reportInvalidCandidate(I, NotMovedTerminator);
-
- // TODO remove this limitation.
- if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
- return reportInvalidCandidate(I, NotControlFlowEquivalent);
-
- if (!DT.dominates(&InsertPoint, &I))
- for (const Use &U : I.uses())
- if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
- if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
- return false;
- if (!DT.dominates(&I, &InsertPoint))
- for (const Value *Op : I.operands())
- if (auto *OpInst = dyn_cast<Instruction>(Op))
- if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
- return false;
-
- DT.updateDFSNumbers();
- const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
- Instruction &StartInst = (MoveForward ? I : InsertPoint);
- Instruction &EndInst = (MoveForward ? InsertPoint : I);
- SmallPtrSet<Instruction *, 10> InstsToCheck;
- collectInstructionsInBetween(StartInst, EndInst, InstsToCheck);
- if (!MoveForward)
- InstsToCheck.insert(&InsertPoint);
-
-  // Check if there exist instructions which may throw, may synchronize, or may
- // never return, from I to InsertPoint.
- if (!isSafeToSpeculativelyExecute(&I))
+//===- CodeMoverUtils.cpp - CodeMover Utilities ----------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs movements on basic blocks and instructions
+// contained within a function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "codemover-utils"
+
+STATISTIC(HasDependences,
+          "Cannot move across instructions that have memory dependences");
+STATISTIC(MayThrowException, "Cannot move across instructions that may throw");
+STATISTIC(NotControlFlowEquivalent,
+ "Instructions are not control flow equivalent");
+STATISTIC(NotMovedPHINode, "Movement of PHINodes is not supported");
+STATISTIC(NotMovedTerminator, "Movement of Terminators is not supported");
+
+namespace {
+/// Represent a control condition. A control condition is a condition of a
+/// terminator to decide which successors to execute. The pointer field
+/// represents the address of the condition of the terminator. The integer field
+/// is a bool; it is true when the basic block is executed when V is true. For
+/// example, in `br %cond, bb0, bb1`, %cond is a control condition of bb0 with
+/// the integer field equal to true, while %cond is a control condition of bb1
+/// with the integer field equal to false.
+using ControlCondition = PointerIntPair<Value *, 1, bool>;
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) {
+ OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false")
+ << "]";
+ return OS;
+}
+#endif
+
+/// Represent a set of control conditions required to execute ToBB from FromBB.
+class ControlConditions {
+ using ConditionVectorTy = SmallVector<ControlCondition, 6>;
+
+ /// A SmallVector of control conditions.
+ ConditionVectorTy Conditions;
+
+public:
+ /// Return a ControlConditions which stores all conditions required to execute
+ /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the
+ /// number of conditions to collect. Return None if not all conditions are
+ /// collected successfully, or we hit the limit.
+ static const Optional<ControlConditions>
+ collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ unsigned MaxLookup = 6);
+
+  /// Return true if there are no control conditions required to execute ToBB
+ /// from FromBB.
+ bool isUnconditional() const { return Conditions.empty(); }
+
+ /// Return a constant reference of Conditions.
+ const ConditionVectorTy &getControlConditions() const { return Conditions; }
+
+  /// Add \p C to Conditions if no equivalent condition is already present.
+  /// Return true if it was inserted.
+ bool addControlCondition(ControlCondition C);
+
+ /// Return true if for all control conditions in Conditions, there exists an
+ /// equivalent control condition in \p Other.Conditions.
+ bool isEquivalent(const ControlConditions &Other) const;
+
+ /// Return true if \p C1 and \p C2 are equivalent.
+ static bool isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2);
+
+private:
+ ControlConditions() = default;
+
+ static bool isEquivalent(const Value &V1, const Value &V2);
+ static bool isInverse(const Value &V1, const Value &V2);
+};
+} // namespace
+
+static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA,
+ const Instruction *InstB) {
+ // Use ordered basic block in case the 2 instructions are in the same
+ // block.
+ if (InstA->getParent() == InstB->getParent())
+ return InstA->comesBefore(InstB);
+
+ DomTreeNode *DA = DT->getNode(InstA->getParent());
+ DomTreeNode *DB = DT->getNode(InstB->getParent());
+ return DA->getLevel() < DB->getLevel();
+}
+
+const Optional<ControlConditions> ControlConditions::collectControlConditions(
+ const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT,
+ const PostDominatorTree &PDT, unsigned MaxLookup) {
+ assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB");
+
+ ControlConditions Conditions;
+ unsigned NumConditions = 0;
+
+  // BB is executed unconditionally from itself.
+ if (&Dominator == &BB)
+ return Conditions;
+
+ const BasicBlock *CurBlock = &BB;
+ // Walk up the dominator tree from the associated DT node for BB to the
+ // associated DT node for Dominator.
+ do {
+ assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock");
+ BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock();
+ assert(DT.dominates(&Dominator, IDom) &&
+ "Expecting Dominator to dominate IDom");
+
+ // Limitation: can only handle branch instruction currently.
+ const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator());
+ if (!BI)
+ return None;
+
+ bool Inserted = false;
+ if (PDT.dominates(CurBlock, IDom)) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName()
+ << " is executed unconditionally from "
+ << IDom->getName() << "\n");
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is true from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), true));
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is false from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), false));
+ } else
+ return None;
+
+ if (Inserted)
+ ++NumConditions;
+
+ if (MaxLookup != 0 && NumConditions > MaxLookup)
+ return None;
+
+ CurBlock = IDom;
+ } while (CurBlock != &Dominator);
+
+ return Conditions;
+}
+
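collectControlConditions walks up the immediate-dominator chain and records, for each step that is not post-dominated, which branch condition must be true or false. The sketch below models that walk with a heavily simplified block structure; every type and helper in it is invented, and post-dominance is assumed to be precomputed into the ReachedOn field.

#include <algorithm>
#include <iostream>
#include <optional>
#include <string>
#include <utility>
#include <vector>

// Each block records its immediate dominator and, when the idom ends in a
// conditional branch, which arm must be taken to reach this block.
enum class Arm { Always, TrueArm, FalseArm };

struct Block {
  std::string Name;
  const Block *IDom = nullptr;
  std::string IDomCond; // name of the branch condition in the idom, if any
  Arm ReachedOn = Arm::Always;
};

using Condition = std::pair<std::string, bool>; // (value, required truth)

static std::optional<std::vector<Condition>>
collectConditions(const Block *BB, const Block *Dominator) {
  std::vector<Condition> Conds;
  for (const Block *Cur = BB; Cur != Dominator; Cur = Cur->IDom) {
    if (!Cur->IDom)
      return std::nullopt; // walked off the top without reaching Dominator
    if (Cur->ReachedOn == Arm::Always)
      continue; // executed unconditionally from its idom
    Condition C{Cur->IDomCond, Cur->ReachedOn == Arm::TrueArm};
    if (std::find(Conds.begin(), Conds.end(), C) == Conds.end())
      Conds.push_back(C); // deduplicate, like addControlCondition
  }
  return Conds;
}

int main() {
  Block Entry{"entry"};
  Block Then{"then", &Entry, "%cond", Arm::TrueArm};
  Block Inner{"inner", &Then, "%flag", Arm::FalseArm};

  if (auto Conds = collectConditions(&Inner, &Entry))
    for (const Condition &C : *Conds)
      std::cout << C.first << " must be " << (C.second ? "true" : "false")
                << "\n"; // %flag must be false, %cond must be true
  return 0;
}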
+bool ControlConditions::addControlCondition(ControlCondition C) {
+ bool Inserted = false;
+ if (none_of(Conditions, [&](ControlCondition &Exists) {
+ return ControlConditions::isEquivalent(C, Exists);
+ })) {
+ Conditions.push_back(C);
+ Inserted = true;
+ }
+
+ LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n");
+ return Inserted;
+}
+
+bool ControlConditions::isEquivalent(const ControlConditions &Other) const {
+ if (Conditions.empty() && Other.Conditions.empty())
+ return true;
+
+ if (Conditions.size() != Other.Conditions.size())
+ return false;
+
+ return all_of(Conditions, [&](const ControlCondition &C) {
+ return any_of(Other.Conditions, [&](const ControlCondition &OtherC) {
+ return ControlConditions::isEquivalent(C, OtherC);
+ });
+ });
+}
+
+bool ControlConditions::isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2) {
+ if (C1.getInt() == C2.getInt()) {
+ if (isEquivalent(*C1.getPointer(), *C2.getPointer()))
+ return true;
+ } else if (isInverse(*C1.getPointer(), *C2.getPointer()))
+ return true;
+
+ return false;
+}
+
+// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between
+// Values.
+// Currently, isEquivalent relies on other passes to ensure equivalent
+// conditions have the same value, e.g. GVN.
+bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) {
+ return &V1 == &V2;
+}
+
+bool ControlConditions::isInverse(const Value &V1, const Value &V2) {
+ if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1))
+ if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) {
+ if (Cmp1->getPredicate() == Cmp2->getInversePredicate() &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(0) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(1))
+ return true;
+
+ if (Cmp1->getPredicate() ==
+ CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(1) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(0))
+ return true;
+ }
+ return false;
+}
+
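isInverse recognizes two shapes: the same operands under the inverse predicate, or swapped operands under the swapped inverse predicate. Here is a standalone sketch with a toy predicate enum (not CmpInst::Predicate) showing both cases.

#include <iostream>
#include <string>

// A toy subset of comparison predicates; purely illustrative.
enum class Pred { SLT, SGT, SLE, SGE };

static Pred inverseOf(Pred P) {
  switch (P) {
  case Pred::SLT: return Pred::SGE;
  case Pred::SGE: return Pred::SLT;
  case Pred::SGT: return Pred::SLE;
  case Pred::SLE: return Pred::SGT;
  }
  return P;
}

static Pred swappedOf(Pred P) {
  switch (P) {
  case Pred::SLT: return Pred::SGT;
  case Pred::SGT: return Pred::SLT;
  case Pred::SLE: return Pred::SGE;
  case Pred::SGE: return Pred::SLE;
  }
  return P;
}

struct Cmp {
  Pred P;
  std::string LHS, RHS;
};

// Mirrors the two cases checked in isInverse above.
static bool isInverse(const Cmp &A, const Cmp &B) {
  if (A.P == inverseOf(B.P) && A.LHS == B.LHS && A.RHS == B.RHS)
    return true;
  if (A.P == swappedOf(inverseOf(B.P)) && A.LHS == B.RHS && A.RHS == B.LHS)
    return true;
  return false;
}

int main() {
  Cmp A{Pred::SLT, "x", "y"}; // x < y
  Cmp B{Pred::SGE, "x", "y"}; // x >= y, inverse of A with the same operands
  Cmp C{Pred::SLE, "y", "x"}; // y <= x, inverse of A in the swapped form
  std::cout << isInverse(A, B) << " " << isInverse(A, C) << "\n"; // 1 1
  return 0;
}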
+bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ return isControlFlowEquivalent(*I0.getParent(), *I1.getParent(), DT, PDT);
+}
+
+bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT) {
+ if (&BB0 == &BB1)
+ return true;
+
+ if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
+ (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)))
+ return true;
+
+ // If the set of conditions required to execute BB0 and BB1 from their common
+ // dominator are the same, then BB0 and BB1 are control flow equivalent.
+ const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1);
+ LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName()
+ << " and " << BB1.getName() << " is "
+ << CommonDominator->getName() << "\n");
+
+ const Optional<ControlConditions> BB0Conditions =
+ ControlConditions::collectControlConditions(BB0, *CommonDominator, DT,
+ PDT);
+ if (BB0Conditions == None)
+ return false;
+
+ const Optional<ControlConditions> BB1Conditions =
+ ControlConditions::collectControlConditions(BB1, *CommonDominator, DT,
+ PDT);
+ if (BB1Conditions == None)
+ return false;
+
+ return BB0Conditions->isEquivalent(*BB1Conditions);
+}
+
+static bool reportInvalidCandidate(const Instruction &I,
+ llvm::Statistic &Stat) {
+ ++Stat;
+ LLVM_DEBUG(dbgs() << "Unable to move instruction: " << I << ". "
+ << Stat.getDesc());
+ return false;
+}
+
+/// Collect all instructions in between \p StartInst and \p EndInst, and store
+/// them in \p InBetweenInsts.
+static void
+collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
+ SmallPtrSetImpl<Instruction *> &InBetweenInsts) {
+ assert(InBetweenInsts.empty() && "Expecting InBetweenInsts to be empty");
+
+ /// Get the next instructions of \p I, and push them to \p WorkList.
+ auto getNextInsts = [](Instruction &I,
+ SmallPtrSetImpl<Instruction *> &WorkList) {
+ if (Instruction *NextInst = I.getNextNode())
+ WorkList.insert(NextInst);
+ else {
+ assert(I.isTerminator() && "Expecting a terminator instruction");
+ for (BasicBlock *Succ : successors(&I))
+ WorkList.insert(&Succ->front());
+ }
+ };
+
+ SmallPtrSet<Instruction *, 10> WorkList;
+ getNextInsts(StartInst, WorkList);
+ while (!WorkList.empty()) {
+ Instruction *CurInst = *WorkList.begin();
+ WorkList.erase(CurInst);
+
+ if (CurInst == &EndInst)
+ continue;
+
+ if (!InBetweenInsts.insert(CurInst).second)
+ continue;
+
+ getNextInsts(*CurInst, WorkList);
+ }
+}
+
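collectInstructionsInBetween is a plain worklist walk that stops at the end instruction and deduplicates through the result set. The same traversal over a toy successor map, with invented names:

#include <iostream>
#include <map>
#include <set>
#include <vector>

// Toy program points: each point lists the points reachable in one step
// (the next instruction, or the first point of each successor block).
using Point = int;

static void collectInBetween(Point Start, Point End,
                             const std::map<Point, std::vector<Point>> &Next,
                             std::set<Point> &InBetween) {
  std::set<Point> WorkList(Next.at(Start).begin(), Next.at(Start).end());
  while (!WorkList.empty()) {
    Point Cur = *WorkList.begin();
    WorkList.erase(WorkList.begin());
    if (Cur == End)
      continue;                       // stop at the end point
    if (!InBetween.insert(Cur).second)
      continue;                       // already visited
    auto It = Next.find(Cur);
    if (It != Next.end())
      WorkList.insert(It->second.begin(), It->second.end());
  }
}

int main() {
  // 1 -> 2 -> {3,4}; 3 -> 5; 4 -> 5
  std::map<Point, std::vector<Point>> Next = {
      {1, {2}}, {2, {3, 4}}, {3, {5}}, {4, {5}}, {5, {}}};
  std::set<Point> InBetween;
  collectInBetween(1, 5, Next, InBetween);
  for (Point P : InBetween)
    std::cout << P << " "; // 2 3 4
  std::cout << "\n";
  return 0;
}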
+bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ // Skip tests when we don't have PDT or DI
+ if (!PDT || !DI)
+ return false;
+
+ // Cannot move itself before itself.
+ if (&I == &InsertPoint)
+ return false;
+
+ // Not moved.
+ if (I.getNextNode() == &InsertPoint)
+ return true;
+
+ if (isa<PHINode>(I) || isa<PHINode>(InsertPoint))
+ return reportInvalidCandidate(I, NotMovedPHINode);
+
+ if (I.isTerminator())
+ return reportInvalidCandidate(I, NotMovedTerminator);
+
+ // TODO remove this limitation.
+ if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
+ return reportInvalidCandidate(I, NotControlFlowEquivalent);
+
+ if (!DT.dominates(&InsertPoint, &I))
+ for (const Use &U : I.uses())
+ if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
+ if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
+ return false;
+ if (!DT.dominates(&I, &InsertPoint))
+ for (const Value *Op : I.operands())
+ if (auto *OpInst = dyn_cast<Instruction>(Op))
+ if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
+ return false;
+
+ DT.updateDFSNumbers();
+ const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
+ Instruction &StartInst = (MoveForward ? I : InsertPoint);
+ Instruction &EndInst = (MoveForward ? InsertPoint : I);
+ SmallPtrSet<Instruction *, 10> InstsToCheck;
+ collectInstructionsInBetween(StartInst, EndInst, InstsToCheck);
+ if (!MoveForward)
+ InstsToCheck.insert(&InsertPoint);
+
+  // Check if there exist instructions which may throw, may synchronize, or may
+ // never return, from I to InsertPoint.
+ if (!isSafeToSpeculativelyExecute(&I))
if (llvm::any_of(InstsToCheck, [](Instruction *I) {
if (I->mayThrow())
return true;
-
+
const CallBase *CB = dyn_cast<CallBase>(I);
if (!CB)
return false;
@@ -366,14 +366,14 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return true;
if (!CB->hasFnAttr(Attribute::NoSync))
return true;
-
+
return false;
})) {
- return reportInvalidCandidate(I, MayThrowException);
- }
-
- // Check if I has any output/flow/anti dependences with instructions from \p
- // StartInst to \p EndInst.
+ return reportInvalidCandidate(I, MayThrowException);
+ }
+
+ // Check if I has any output/flow/anti dependences with instructions from \p
+ // StartInst to \p EndInst.
if (llvm::any_of(InstsToCheck, [&DI, &I](Instruction *CurInst) {
auto DepResult = DI->depends(&I, CurInst, true);
if (DepResult && (DepResult->isOutput() || DepResult->isFlow() ||
@@ -381,45 +381,45 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return true;
return false;
}))
- return reportInvalidCandidate(I, HasDependences);
-
- return true;
-}
-
-bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
- DominatorTree &DT, const PostDominatorTree *PDT,
- DependenceInfo *DI) {
- return llvm::all_of(BB, [&](Instruction &I) {
- if (BB.getTerminator() == &I)
- return true;
-
- return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
- });
-}
-
-void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
- DominatorTree &DT,
- const PostDominatorTree &PDT,
- DependenceInfo &DI) {
- for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
- Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
- Instruction &I = *It;
- // Increment the iterator before modifying FromBB.
- ++It;
-
- if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
- I.moveBefore(MovePos);
- }
-}
-
-void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
- DominatorTree &DT,
- const PostDominatorTree &PDT,
- DependenceInfo &DI) {
- Instruction *MovePos = ToBB.getTerminator();
- while (FromBB.size() > 1) {
- Instruction &I = FromBB.front();
- if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
- I.moveBefore(MovePos);
- }
-}
+ return reportInvalidCandidate(I, HasDependences);
+
+ return true;
+}
+
+bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ return llvm::all_of(BB, [&](Instruction &I) {
+ if (BB.getTerminator() == &I)
+ return true;
+
+ return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
+ });
+}
+
+void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
+ for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
+ Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
+ Instruction &I = *It;
+ // Increment the iterator before modifying FromBB.
+ ++It;
+
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
+ I.moveBefore(MovePos);
+ }
+}
+
+void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
+ Instruction *MovePos = ToBB.getTerminator();
+ while (FromBB.size() > 1) {
+ Instruction &I = FromBB.front();
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
+ I.moveBefore(MovePos);
+ }
+}
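
moveInstructionsToTheBeginning advances its iterator before it moves the current instruction, so the move can never invalidate the iterator still in use. The same discipline is shown below on a plain std::list, simplified to a forward walk instead of LLVM's reverse ilist iteration; the Safe predicate is a made-up stand-in for isSafeToMoveBefore.

#include <iostream>
#include <iterator>
#include <list>

int main() {
  // Pretend the last element of From is the block terminator and stays put.
  std::list<int> From = {1, 2, 3, 4};
  std::list<int> To = {100};

  for (auto It = From.begin(); It != std::prev(From.end());) {
    auto Cur = It;
    ++It;                             // advance before mutating the list
    bool Safe = (*Cur % 2 == 1);      // stand-in for the safety check
    if (Safe)
      To.splice(To.end(), From, Cur); // the move cannot invalidate It
  }

  for (int V : From) std::cout << V << ' '; // 2 4
  std::cout << "| ";
  for (int V : To) std::cout << V << ' ';   // 100 1 3
  std::cout << '\n';
  return 0;
}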
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
index b973b7709b..069a86f6ab 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/CtorUtils.cpp
@@ -1,159 +1,159 @@
-//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines functions that are used to process llvm.global_ctors.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/CtorUtils.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "ctor_utils"
-
-using namespace llvm;
-
-/// Given a specified llvm.global_ctors list, remove the listed elements.
-static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
- // Filter out the initializer elements to remove.
- ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
- SmallVector<Constant *, 10> CAList;
- for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
- if (!CtorsToRemove.test(I))
- CAList.push_back(OldCA->getOperand(I));
-
- // Create the new array initializer.
- ArrayType *ATy =
- ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
- Constant *CA = ConstantArray::get(ATy, CAList);
-
- // If we didn't change the number of elements, don't create a new GV.
- if (CA->getType() == OldCA->getType()) {
- GCL->setInitializer(CA);
- return;
- }
-
- // Create the new global and insert it next to the existing list.
- GlobalVariable *NGV =
- new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
- CA, "", GCL->getThreadLocalMode());
- GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);
- NGV->takeName(GCL);
-
- // Nuke the old list, replacing any uses with the new one.
- if (!GCL->use_empty()) {
- Constant *V = NGV;
- if (V->getType() != GCL->getType())
- V = ConstantExpr::getBitCast(V, GCL->getType());
- GCL->replaceAllUsesWith(V);
- }
- GCL->eraseFromParent();
-}
-
-/// Given a llvm.global_ctors list that we can understand,
-/// return a list of the functions and null terminator as a vector.
-static std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
- if (GV->getInitializer()->isNullValue())
- return std::vector<Function *>();
- ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
- std::vector<Function *> Result;
- Result.reserve(CA->getNumOperands());
- for (auto &V : CA->operands()) {
- ConstantStruct *CS = cast<ConstantStruct>(V);
- Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
- }
- return Result;
-}
-
-/// Find the llvm.global_ctors list, verifying that all initializers have an
-/// init priority of 65535.
-static GlobalVariable *findGlobalCtors(Module &M) {
- GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
- if (!GV)
- return nullptr;
-
- // Verify that the initializer is simple enough for us to handle. We are
- // only allowed to optimize the initializer if it is unique.
- if (!GV->hasUniqueInitializer())
- return nullptr;
-
- if (isa<ConstantAggregateZero>(GV->getInitializer()))
- return GV;
- ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
-
- for (auto &V : CA->operands()) {
- if (isa<ConstantAggregateZero>(V))
- continue;
- ConstantStruct *CS = cast<ConstantStruct>(V);
- if (isa<ConstantPointerNull>(CS->getOperand(1)))
- continue;
-
- // Must have a function or null ptr.
- if (!isa<Function>(CS->getOperand(1)))
- return nullptr;
-
- // Init priority must be standard.
- ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
- if (CI->getZExtValue() != 65535)
- return nullptr;
- }
-
- return GV;
-}
-
-/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
-/// entries for which it returns true. Return true if anything changed.
-bool llvm::optimizeGlobalCtorsList(
- Module &M, function_ref<bool(Function *)> ShouldRemove) {
- GlobalVariable *GlobalCtors = findGlobalCtors(M);
- if (!GlobalCtors)
- return false;
-
- std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors);
- if (Ctors.empty())
- return false;
-
- bool MadeChange = false;
-
- // Loop over global ctors, optimizing them when we can.
- unsigned NumCtors = Ctors.size();
- BitVector CtorsToRemove(NumCtors);
- for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
- Function *F = Ctors[i];
- // Found a null terminator in the middle of the list, prune off the rest of
- // the list.
- if (!F)
- continue;
-
- LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
-
- // We cannot simplify external ctor functions.
- if (F->empty())
- continue;
-
- // If we can evaluate the ctor at compile time, do.
- if (ShouldRemove(F)) {
- Ctors[i] = nullptr;
- CtorsToRemove.set(i);
- NumCtors--;
- MadeChange = true;
- continue;
- }
- }
-
- if (!MadeChange)
- return false;
-
- removeGlobalCtors(GlobalCtors, CtorsToRemove);
- return true;
-}
+//===- CtorUtils.cpp - Helpers for working with global_ctors ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that are used to process llvm.global_ctors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CtorUtils.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "ctor_utils"
+
+using namespace llvm;
+
+/// Given a specified llvm.global_ctors list, remove the listed elements.
+static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemove) {
+ // Filter out the initializer elements to remove.
+ ConstantArray *OldCA = cast<ConstantArray>(GCL->getInitializer());
+ SmallVector<Constant *, 10> CAList;
+ for (unsigned I = 0, E = OldCA->getNumOperands(); I < E; ++I)
+ if (!CtorsToRemove.test(I))
+ CAList.push_back(OldCA->getOperand(I));
+
+ // Create the new array initializer.
+ ArrayType *ATy =
+ ArrayType::get(OldCA->getType()->getElementType(), CAList.size());
+ Constant *CA = ConstantArray::get(ATy, CAList);
+
+ // If we didn't change the number of elements, don't create a new GV.
+ if (CA->getType() == OldCA->getType()) {
+ GCL->setInitializer(CA);
+ return;
+ }
+
+ // Create the new global and insert it next to the existing list.
+ GlobalVariable *NGV =
+ new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(),
+ CA, "", GCL->getThreadLocalMode());
+ GCL->getParent()->getGlobalList().insert(GCL->getIterator(), NGV);
+ NGV->takeName(GCL);
+
+ // Nuke the old list, replacing any uses with the new one.
+ if (!GCL->use_empty()) {
+ Constant *V = NGV;
+ if (V->getType() != GCL->getType())
+ V = ConstantExpr::getBitCast(V, GCL->getType());
+ GCL->replaceAllUsesWith(V);
+ }
+ GCL->eraseFromParent();
+}
+
+/// Given a llvm.global_ctors list that we can understand,
+/// return a list of the functions and null terminator as a vector.
+static std::vector<Function *> parseGlobalCtors(GlobalVariable *GV) {
+ if (GV->getInitializer()->isNullValue())
+ return std::vector<Function *>();
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+ std::vector<Function *> Result;
+ Result.reserve(CA->getNumOperands());
+ for (auto &V : CA->operands()) {
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ Result.push_back(dyn_cast<Function>(CS->getOperand(1)));
+ }
+ return Result;
+}
+
+/// Find the llvm.global_ctors list, verifying that all initializers have an
+/// init priority of 65535.
+static GlobalVariable *findGlobalCtors(Module &M) {
+ GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
+ if (!GV)
+ return nullptr;
+
+ // Verify that the initializer is simple enough for us to handle. We are
+ // only allowed to optimize the initializer if it is unique.
+ if (!GV->hasUniqueInitializer())
+ return nullptr;
+
+ if (isa<ConstantAggregateZero>(GV->getInitializer()))
+ return GV;
+ ConstantArray *CA = cast<ConstantArray>(GV->getInitializer());
+
+ for (auto &V : CA->operands()) {
+ if (isa<ConstantAggregateZero>(V))
+ continue;
+ ConstantStruct *CS = cast<ConstantStruct>(V);
+ if (isa<ConstantPointerNull>(CS->getOperand(1)))
+ continue;
+
+ // Must have a function or null ptr.
+ if (!isa<Function>(CS->getOperand(1)))
+ return nullptr;
+
+ // Init priority must be standard.
+ ConstantInt *CI = cast<ConstantInt>(CS->getOperand(0));
+ if (CI->getZExtValue() != 65535)
+ return nullptr;
+ }
+
+ return GV;
+}
+
+/// Call "ShouldRemove" for every entry in M's global_ctor list and remove the
+/// entries for which it returns true. Return true if anything changed.
+bool llvm::optimizeGlobalCtorsList(
+ Module &M, function_ref<bool(Function *)> ShouldRemove) {
+ GlobalVariable *GlobalCtors = findGlobalCtors(M);
+ if (!GlobalCtors)
+ return false;
+
+ std::vector<Function *> Ctors = parseGlobalCtors(GlobalCtors);
+ if (Ctors.empty())
+ return false;
+
+ bool MadeChange = false;
+
+ // Loop over global ctors, optimizing them when we can.
+ unsigned NumCtors = Ctors.size();
+ BitVector CtorsToRemove(NumCtors);
+ for (unsigned i = 0; i != Ctors.size() && NumCtors > 0; ++i) {
+ Function *F = Ctors[i];
+ // Found a null terminator in the middle of the list, prune off the rest of
+ // the list.
+ if (!F)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Optimizing Global Constructor: " << *F << "\n");
+
+ // We cannot simplify external ctor functions.
+ if (F->empty())
+ continue;
+
+ // If we can evaluate the ctor at compile time, do.
+ if (ShouldRemove(F)) {
+ Ctors[i] = nullptr;
+ CtorsToRemove.set(i);
+ NumCtors--;
+ MadeChange = true;
+ continue;
+ }
+ }
+
+ if (!MadeChange)
+ return false;
+
+ removeGlobalCtors(GlobalCtors, CtorsToRemove);
+ return true;
+}
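A minimal caller sketch for the CtorUtils entry point above, assuming only the optimizeGlobalCtorsList signature from llvm/Transforms/Utils/CtorUtils.h as included by this file; the dropTrivialCtors helper and its "single ret block" predicate are illustrative and not part of this commit.

  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Transforms/Utils/CtorUtils.h"

  // Remove global constructors that provably do nothing: a single entry block
  // whose only instruction is a return. Returns true if llvm.global_ctors was
  // rewritten (entries pruned or the array replaced).
  static bool dropTrivialCtors(llvm::Module &M) {
    return llvm::optimizeGlobalCtorsList(M, [](llvm::Function *F) {
      // ShouldRemove callback: keep external and multi-block ctors.
      if (F->empty() || F->size() != 1)
        return false;
      const llvm::BasicBlock &Entry = F->getEntryBlock();
      return Entry.size() == 1 && llvm::isa<llvm::ReturnInst>(Entry.front());
    });
  }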
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
index 9cbea67a55..3e4d53c10d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Debugify.cpp
@@ -1,204 +1,204 @@
-//===- Debugify.cpp - Attach synthetic debug info to everything -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file This pass attaches synthetic debug info to everything. It can be used
-/// to create targeted tests for debug info preservation.
-///
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Debugify.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
+//===- Debugify.cpp - Attach synthetic debug info to everything -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This pass attaches synthetic debug info to everything. It can be used
+/// to create targeted tests for debug info preservation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Debugify.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassInstrumentation.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-
-using namespace llvm;
-
-namespace {
-
-cl::opt<bool> Quiet("debugify-quiet",
- cl::desc("Suppress verbose debugify output"));
-
-enum class Level {
- Locations,
- LocationsAndVariables
-};
-cl::opt<Level> DebugifyLevel(
- "debugify-level", cl::desc("Kind of debug info to add"),
- cl::values(clEnumValN(Level::Locations, "locations", "Locations only"),
- clEnumValN(Level::LocationsAndVariables, "location+variables",
- "Locations and Variables")),
- cl::init(Level::LocationsAndVariables));
-
-raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
-
-uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
- return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0;
-}
-
-bool isFunctionSkipped(Function &F) {
- return F.isDeclaration() || !F.hasExactDefinition();
-}
-
-/// Find the basic block's terminating instruction.
-///
-/// Special care is needed to handle musttail and deopt calls, as these behave
-/// like (but are in fact not) terminators.
-Instruction *findTerminatingInstruction(BasicBlock &BB) {
- if (auto *I = BB.getTerminatingMustTailCall())
- return I;
- if (auto *I = BB.getTerminatingDeoptimizeCall())
- return I;
- return BB.getTerminator();
-}
-} // end anonymous namespace
-
-bool llvm::applyDebugifyMetadata(
- Module &M, iterator_range<Module::iterator> Functions, StringRef Banner,
- std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) {
- // Skip modules with debug info.
- if (M.getNamedMetadata("llvm.dbg.cu")) {
- dbg() << Banner << "Skipping module with debug info\n";
- return false;
- }
-
- DIBuilder DIB(M);
- LLVMContext &Ctx = M.getContext();
- auto *Int32Ty = Type::getInt32Ty(Ctx);
-
- // Get a DIType which corresponds to Ty.
- DenseMap<uint64_t, DIType *> TypeCache;
- auto getCachedDIType = [&](Type *Ty) -> DIType * {
- uint64_t Size = getAllocSizeInBits(M, Ty);
- DIType *&DTy = TypeCache[Size];
- if (!DTy) {
- std::string Name = "ty" + utostr(Size);
- DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned);
- }
- return DTy;
- };
-
- unsigned NextLine = 1;
- unsigned NextVar = 1;
- auto File = DIB.createFile(M.getName(), "/");
- auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
- /*isOptimized=*/true, "", 0);
-
- // Visit each instruction.
- for (Function &F : Functions) {
- if (isFunctionSkipped(F))
- continue;
-
- bool InsertedDbgVal = false;
- auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
- DISubprogram::DISPFlags SPFlags =
- DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
- if (F.hasPrivateLinkage() || F.hasInternalLinkage())
- SPFlags |= DISubprogram::SPFlagLocalToUnit;
- auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
- SPType, NextLine, DINode::FlagZero, SPFlags);
- F.setSubprogram(SP);
-
- // Helper that inserts a dbg.value before \p InsertBefore, copying the
- // location (and possibly the type, if it's non-void) from \p TemplateInst.
- auto insertDbgVal = [&](Instruction &TemplateInst,
- Instruction *InsertBefore) {
- std::string Name = utostr(NextVar++);
- Value *V = &TemplateInst;
- if (TemplateInst.getType()->isVoidTy())
- V = ConstantInt::get(Int32Ty, 0);
- const DILocation *Loc = TemplateInst.getDebugLoc().get();
- auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
- getCachedDIType(V->getType()),
- /*AlwaysPreserve=*/true);
- DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
- InsertBefore);
- };
-
- for (BasicBlock &BB : F) {
- // Attach debug locations.
- for (Instruction &I : BB)
- I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
-
- if (DebugifyLevel < Level::LocationsAndVariables)
- continue;
-
- // Inserting debug values into EH pads can break IR invariants.
- if (BB.isEHPad())
- continue;
-
- // Find the terminating instruction, after which no debug values are
- // attached.
- Instruction *LastInst = findTerminatingInstruction(BB);
- assert(LastInst && "Expected basic block with a terminator");
-
- // Maintain an insertion point which can't be invalidated when updates
- // are made.
- BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
- assert(InsertPt != BB.end() && "Expected to find an insertion point");
- Instruction *InsertBefore = &*InsertPt;
-
- // Attach debug values.
- for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
- // Skip void-valued instructions.
- if (I->getType()->isVoidTy())
- continue;
-
- // Phis and EH pads must be grouped at the beginning of the block.
- // Only advance the insertion point when we finish visiting these.
- if (!isa<PHINode>(I) && !I->isEHPad())
- InsertBefore = I->getNextNode();
-
- insertDbgVal(*I, InsertBefore);
- InsertedDbgVal = true;
- }
- }
- // Make sure we emit at least one dbg.value, otherwise MachineDebugify may
- // not have anything to work with as it goes about inserting DBG_VALUEs.
- // (It's common for MIR tests to be written containing skeletal IR with
- // empty functions -- we're still interested in debugifying the MIR within
- // those tests, and this helps with that.)
- if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
- auto *Term = findTerminatingInstruction(F.getEntryBlock());
- insertDbgVal(*Term, Term);
- }
- if (ApplyToMF)
- ApplyToMF(DIB, F);
- DIB.finalizeSubprogram(SP);
- }
- DIB.finalize();
-
- // Track the number of distinct lines and variables.
- NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify");
- auto addDebugifyOperand = [&](unsigned N) {
- NMD->addOperand(MDNode::get(
- Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
- };
- addDebugifyOperand(NextLine - 1); // Original number of lines.
- addDebugifyOperand(NextVar - 1); // Original number of variables.
- assert(NMD->getNumOperands() == 2 &&
- "llvm.debugify should have exactly 2 operands!");
-
- // Claim that this synthetic debug info is valid.
- StringRef DIVersionKey = "Debug Info Version";
- if (!M.getModuleFlag(DIVersionKey))
- M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION);
-
- return true;
-}
-
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+namespace {
+
+cl::opt<bool> Quiet("debugify-quiet",
+ cl::desc("Suppress verbose debugify output"));
+
+enum class Level {
+ Locations,
+ LocationsAndVariables
+};
+cl::opt<Level> DebugifyLevel(
+ "debugify-level", cl::desc("Kind of debug info to add"),
+ cl::values(clEnumValN(Level::Locations, "locations", "Locations only"),
+ clEnumValN(Level::LocationsAndVariables, "location+variables",
+ "Locations and Variables")),
+ cl::init(Level::LocationsAndVariables));
+
+raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
+
+uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
+ return Ty->isSized() ? M.getDataLayout().getTypeAllocSizeInBits(Ty) : 0;
+}
+
+bool isFunctionSkipped(Function &F) {
+ return F.isDeclaration() || !F.hasExactDefinition();
+}
+
+/// Find the basic block's terminating instruction.
+///
+/// Special care is needed to handle musttail and deopt calls, as these behave
+/// like (but are in fact not) terminators.
+Instruction *findTerminatingInstruction(BasicBlock &BB) {
+ if (auto *I = BB.getTerminatingMustTailCall())
+ return I;
+ if (auto *I = BB.getTerminatingDeoptimizeCall())
+ return I;
+ return BB.getTerminator();
+}
+} // end anonymous namespace
+
+bool llvm::applyDebugifyMetadata(
+ Module &M, iterator_range<Module::iterator> Functions, StringRef Banner,
+ std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) {
+ // Skip modules with debug info.
+ if (M.getNamedMetadata("llvm.dbg.cu")) {
+ dbg() << Banner << "Skipping module with debug info\n";
+ return false;
+ }
+
+ DIBuilder DIB(M);
+ LLVMContext &Ctx = M.getContext();
+ auto *Int32Ty = Type::getInt32Ty(Ctx);
+
+ // Get a DIType which corresponds to Ty.
+ DenseMap<uint64_t, DIType *> TypeCache;
+ auto getCachedDIType = [&](Type *Ty) -> DIType * {
+ uint64_t Size = getAllocSizeInBits(M, Ty);
+ DIType *&DTy = TypeCache[Size];
+ if (!DTy) {
+ std::string Name = "ty" + utostr(Size);
+ DTy = DIB.createBasicType(Name, Size, dwarf::DW_ATE_unsigned);
+ }
+ return DTy;
+ };
+
+ unsigned NextLine = 1;
+ unsigned NextVar = 1;
+ auto File = DIB.createFile(M.getName(), "/");
+ auto CU = DIB.createCompileUnit(dwarf::DW_LANG_C, File, "debugify",
+ /*isOptimized=*/true, "", 0);
+
+ // Visit each instruction.
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
+ continue;
+
+ bool InsertedDbgVal = false;
+ auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+ DISubprogram::DISPFlags SPFlags =
+ DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
+ if (F.hasPrivateLinkage() || F.hasInternalLinkage())
+ SPFlags |= DISubprogram::SPFlagLocalToUnit;
+ auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
+ SPType, NextLine, DINode::FlagZero, SPFlags);
+ F.setSubprogram(SP);
+
+ // Helper that inserts a dbg.value before \p InsertBefore, copying the
+ // location (and possibly the type, if it's non-void) from \p TemplateInst.
+ auto insertDbgVal = [&](Instruction &TemplateInst,
+ Instruction *InsertBefore) {
+ std::string Name = utostr(NextVar++);
+ Value *V = &TemplateInst;
+ if (TemplateInst.getType()->isVoidTy())
+ V = ConstantInt::get(Int32Ty, 0);
+ const DILocation *Loc = TemplateInst.getDebugLoc().get();
+ auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
+ getCachedDIType(V->getType()),
+ /*AlwaysPreserve=*/true);
+ DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
+ InsertBefore);
+ };
+
+ for (BasicBlock &BB : F) {
+ // Attach debug locations.
+ for (Instruction &I : BB)
+ I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+
+ if (DebugifyLevel < Level::LocationsAndVariables)
+ continue;
+
+ // Inserting debug values into EH pads can break IR invariants.
+ if (BB.isEHPad())
+ continue;
+
+ // Find the terminating instruction, after which no debug values are
+ // attached.
+ Instruction *LastInst = findTerminatingInstruction(BB);
+ assert(LastInst && "Expected basic block with a terminator");
+
+ // Maintain an insertion point which can't be invalidated when updates
+ // are made.
+ BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
+ assert(InsertPt != BB.end() && "Expected to find an insertion point");
+ Instruction *InsertBefore = &*InsertPt;
+
+ // Attach debug values.
+ for (Instruction *I = &*BB.begin(); I != LastInst; I = I->getNextNode()) {
+ // Skip void-valued instructions.
+ if (I->getType()->isVoidTy())
+ continue;
+
+ // Phis and EH pads must be grouped at the beginning of the block.
+ // Only advance the insertion point when we finish visiting these.
+ if (!isa<PHINode>(I) && !I->isEHPad())
+ InsertBefore = I->getNextNode();
+
+ insertDbgVal(*I, InsertBefore);
+ InsertedDbgVal = true;
+ }
+ }
+ // Make sure we emit at least one dbg.value, otherwise MachineDebugify may
+ // not have anything to work with as it goes about inserting DBG_VALUEs.
+ // (It's common for MIR tests to be written containing skeletal IR with
+ // empty functions -- we're still interested in debugifying the MIR within
+ // those tests, and this helps with that.)
+ if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
+ auto *Term = findTerminatingInstruction(F.getEntryBlock());
+ insertDbgVal(*Term, Term);
+ }
+ if (ApplyToMF)
+ ApplyToMF(DIB, F);
+ DIB.finalizeSubprogram(SP);
+ }
+ DIB.finalize();
+
+ // Track the number of distinct lines and variables.
+ NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify");
+ auto addDebugifyOperand = [&](unsigned N) {
+ NMD->addOperand(MDNode::get(
+ Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
+ };
+ addDebugifyOperand(NextLine - 1); // Original number of lines.
+ addDebugifyOperand(NextVar - 1); // Original number of variables.
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
+
+ // Claim that this synthetic debug info is valid.
+ StringRef DIVersionKey = "Debug Info Version";
+ if (!M.getModuleFlag(DIVersionKey))
+ M.addModuleFlag(Module::Warning, DIVersionKey, DEBUG_METADATA_VERSION);
+
+ return true;
+}
+
static bool applyDebugify(Function &F) {
Module &M = *F.getParent();
auto FuncIt = F.getIterator();
@@ -211,270 +211,270 @@ static bool applyDebugify(Module &M) {
"ModuleDebugify: ", /*ApplyToMF=*/nullptr);
}
-bool llvm::stripDebugifyMetadata(Module &M) {
- bool Changed = false;
-
- // Remove the llvm.debugify module-level named metadata.
- NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
- if (DebugifyMD) {
- M.eraseNamedMetadata(DebugifyMD);
- Changed = true;
- }
-
- // Strip out all debug intrinsics and supporting metadata (subprograms, types,
- // variables, etc).
- Changed |= StripDebugInfo(M);
-
- // Strip out the dead dbg.value prototype.
- Function *DbgValF = M.getFunction("llvm.dbg.value");
- if (DbgValF) {
- assert(DbgValF->isDeclaration() && DbgValF->use_empty() &&
- "Not all debug info stripped?");
- DbgValF->eraseFromParent();
- Changed = true;
- }
-
- // Strip out the module-level Debug Info Version metadata.
- // FIXME: There must be an easier way to remove an operand from a NamedMDNode.
- NamedMDNode *NMD = M.getModuleFlagsMetadata();
- if (!NMD)
- return Changed;
+bool llvm::stripDebugifyMetadata(Module &M) {
+ bool Changed = false;
+
+ // Remove the llvm.debugify module-level named metadata.
+ NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
+ if (DebugifyMD) {
+ M.eraseNamedMetadata(DebugifyMD);
+ Changed = true;
+ }
+
+ // Strip out all debug intrinsics and supporting metadata (subprograms, types,
+ // variables, etc).
+ Changed |= StripDebugInfo(M);
+
+ // Strip out the dead dbg.value prototype.
+ Function *DbgValF = M.getFunction("llvm.dbg.value");
+ if (DbgValF) {
+ assert(DbgValF->isDeclaration() && DbgValF->use_empty() &&
+ "Not all debug info stripped?");
+ DbgValF->eraseFromParent();
+ Changed = true;
+ }
+
+ // Strip out the module-level Debug Info Version metadata.
+ // FIXME: There must be an easier way to remove an operand from a NamedMDNode.
+ NamedMDNode *NMD = M.getModuleFlagsMetadata();
+ if (!NMD)
+ return Changed;
SmallVector<MDNode *, 4> Flags(NMD->operands());
- NMD->clearOperands();
- for (MDNode *Flag : Flags) {
- MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1));
- if (Key->getString() == "Debug Info Version") {
- Changed = true;
- continue;
- }
- NMD->addOperand(Flag);
- }
- // If we left it empty we might as well remove it.
- if (NMD->getNumOperands() == 0)
- NMD->eraseFromParent();
-
- return Changed;
-}
-
-namespace {
-/// Return true if a mis-sized diagnostic is issued for \p DVI.
-bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
- // The size of a dbg.value's value operand should match the size of the
- // variable it corresponds to.
- //
- // TODO: This, along with a check for non-null value operands, should be
- // promoted to verifier failures.
- Value *V = DVI->getValue();
- if (!V)
- return false;
-
- // For now, don't try to interpret anything more complicated than an empty
- // DIExpression. Eventually we should try to handle OP_deref and fragments.
- if (DVI->getExpression()->getNumElements())
- return false;
-
- Type *Ty = V->getType();
- uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty);
- Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits();
- if (!ValueOperandSize || !DbgVarSize)
- return false;
-
- bool HasBadSize = false;
- if (Ty->isIntegerTy()) {
- auto Signedness = DVI->getVariable()->getSignedness();
- if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
- HasBadSize = ValueOperandSize < *DbgVarSize;
- } else {
- HasBadSize = ValueOperandSize != *DbgVarSize;
- }
-
- if (HasBadSize) {
- dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize
- << ", but its variable has size " << *DbgVarSize << ": ";
- DVI->print(dbg());
- dbg() << "\n";
- }
- return HasBadSize;
-}
-
-bool checkDebugifyMetadata(Module &M,
- iterator_range<Module::iterator> Functions,
- StringRef NameOfWrappedPass, StringRef Banner,
- bool Strip, DebugifyStatsMap *StatsMap) {
- // Skip modules without debugify metadata.
- NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
- if (!NMD) {
- dbg() << Banner << ": Skipping module without debugify metadata\n";
- return false;
- }
-
- auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
- return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
- ->getZExtValue();
- };
- assert(NMD->getNumOperands() == 2 &&
- "llvm.debugify should have exactly 2 operands!");
- unsigned OriginalNumLines = getDebugifyOperand(0);
- unsigned OriginalNumVars = getDebugifyOperand(1);
- bool HasErrors = false;
-
- // Track debug info loss statistics if able.
- DebugifyStatistics *Stats = nullptr;
- if (StatsMap && !NameOfWrappedPass.empty())
- Stats = &StatsMap->operator[](NameOfWrappedPass);
-
- BitVector MissingLines{OriginalNumLines, true};
- BitVector MissingVars{OriginalNumVars, true};
- for (Function &F : Functions) {
- if (isFunctionSkipped(F))
- continue;
-
- // Find missing lines.
- for (Instruction &I : instructions(F)) {
- if (isa<DbgValueInst>(&I) || isa<PHINode>(&I))
- continue;
-
- auto DL = I.getDebugLoc();
- if (DL && DL.getLine() != 0) {
- MissingLines.reset(DL.getLine() - 1);
- continue;
- }
-
- if (!DL) {
- dbg() << "WARNING: Instruction with empty DebugLoc in function ";
- dbg() << F.getName() << " --";
- I.print(dbg());
- dbg() << "\n";
- }
- }
-
- // Find missing variables and mis-sized debug values.
- for (Instruction &I : instructions(F)) {
- auto *DVI = dyn_cast<DbgValueInst>(&I);
- if (!DVI)
- continue;
-
- unsigned Var = ~0U;
- (void)to_integer(DVI->getVariable()->getName(), Var, 10);
- assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable");
- bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI);
- if (!HasBadSize)
- MissingVars.reset(Var - 1);
- HasErrors |= HasBadSize;
- }
- }
-
- // Print the results.
- for (unsigned Idx : MissingLines.set_bits())
- dbg() << "WARNING: Missing line " << Idx + 1 << "\n";
-
- for (unsigned Idx : MissingVars.set_bits())
- dbg() << "WARNING: Missing variable " << Idx + 1 << "\n";
-
- // Update DI loss statistics.
- if (Stats) {
- Stats->NumDbgLocsExpected += OriginalNumLines;
- Stats->NumDbgLocsMissing += MissingLines.count();
- Stats->NumDbgValuesExpected += OriginalNumVars;
- Stats->NumDbgValuesMissing += MissingVars.count();
- }
-
- dbg() << Banner;
- if (!NameOfWrappedPass.empty())
- dbg() << " [" << NameOfWrappedPass << "]";
- dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
-
- // Strip debugify metadata if required.
- if (Strip)
- return stripDebugifyMetadata(M);
-
- return false;
-}
-
-/// ModulePass for attaching synthetic debug info to everything, used with the
-/// legacy module pass manager.
-struct DebugifyModulePass : public ModulePass {
+ NMD->clearOperands();
+ for (MDNode *Flag : Flags) {
+ MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1));
+ if (Key->getString() == "Debug Info Version") {
+ Changed = true;
+ continue;
+ }
+ NMD->addOperand(Flag);
+ }
+ // If we left it empty we might as well remove it.
+ if (NMD->getNumOperands() == 0)
+ NMD->eraseFromParent();
+
+ return Changed;
+}
+
+namespace {
+/// Return true if a mis-sized diagnostic is issued for \p DVI.
+bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
+ // The size of a dbg.value's value operand should match the size of the
+ // variable it corresponds to.
+ //
+ // TODO: This, along with a check for non-null value operands, should be
+ // promoted to verifier failures.
+ Value *V = DVI->getValue();
+ if (!V)
+ return false;
+
+ // For now, don't try to interpret anything more complicated than an empty
+ // DIExpression. Eventually we should try to handle OP_deref and fragments.
+ if (DVI->getExpression()->getNumElements())
+ return false;
+
+ Type *Ty = V->getType();
+ uint64_t ValueOperandSize = getAllocSizeInBits(M, Ty);
+ Optional<uint64_t> DbgVarSize = DVI->getFragmentSizeInBits();
+ if (!ValueOperandSize || !DbgVarSize)
+ return false;
+
+ bool HasBadSize = false;
+ if (Ty->isIntegerTy()) {
+ auto Signedness = DVI->getVariable()->getSignedness();
+ if (Signedness && *Signedness == DIBasicType::Signedness::Signed)
+ HasBadSize = ValueOperandSize < *DbgVarSize;
+ } else {
+ HasBadSize = ValueOperandSize != *DbgVarSize;
+ }
+
+ if (HasBadSize) {
+ dbg() << "ERROR: dbg.value operand has size " << ValueOperandSize
+ << ", but its variable has size " << *DbgVarSize << ": ";
+ DVI->print(dbg());
+ dbg() << "\n";
+ }
+ return HasBadSize;
+}
+
+bool checkDebugifyMetadata(Module &M,
+ iterator_range<Module::iterator> Functions,
+ StringRef NameOfWrappedPass, StringRef Banner,
+ bool Strip, DebugifyStatsMap *StatsMap) {
+ // Skip modules without debugify metadata.
+ NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
+ if (!NMD) {
+ dbg() << Banner << ": Skipping module without debugify metadata\n";
+ return false;
+ }
+
+ auto getDebugifyOperand = [&](unsigned Idx) -> unsigned {
+ return mdconst::extract<ConstantInt>(NMD->getOperand(Idx)->getOperand(0))
+ ->getZExtValue();
+ };
+ assert(NMD->getNumOperands() == 2 &&
+ "llvm.debugify should have exactly 2 operands!");
+ unsigned OriginalNumLines = getDebugifyOperand(0);
+ unsigned OriginalNumVars = getDebugifyOperand(1);
+ bool HasErrors = false;
+
+ // Track debug info loss statistics if able.
+ DebugifyStatistics *Stats = nullptr;
+ if (StatsMap && !NameOfWrappedPass.empty())
+ Stats = &StatsMap->operator[](NameOfWrappedPass);
+
+ BitVector MissingLines{OriginalNumLines, true};
+ BitVector MissingVars{OriginalNumVars, true};
+ for (Function &F : Functions) {
+ if (isFunctionSkipped(F))
+ continue;
+
+ // Find missing lines.
+ for (Instruction &I : instructions(F)) {
+ if (isa<DbgValueInst>(&I) || isa<PHINode>(&I))
+ continue;
+
+ auto DL = I.getDebugLoc();
+ if (DL && DL.getLine() != 0) {
+ MissingLines.reset(DL.getLine() - 1);
+ continue;
+ }
+
+ if (!DL) {
+ dbg() << "WARNING: Instruction with empty DebugLoc in function ";
+ dbg() << F.getName() << " --";
+ I.print(dbg());
+ dbg() << "\n";
+ }
+ }
+
+ // Find missing variables and mis-sized debug values.
+ for (Instruction &I : instructions(F)) {
+ auto *DVI = dyn_cast<DbgValueInst>(&I);
+ if (!DVI)
+ continue;
+
+ unsigned Var = ~0U;
+ (void)to_integer(DVI->getVariable()->getName(), Var, 10);
+ assert(Var <= OriginalNumVars && "Unexpected name for DILocalVariable");
+ bool HasBadSize = diagnoseMisSizedDbgValue(M, DVI);
+ if (!HasBadSize)
+ MissingVars.reset(Var - 1);
+ HasErrors |= HasBadSize;
+ }
+ }
+
+ // Print the results.
+ for (unsigned Idx : MissingLines.set_bits())
+ dbg() << "WARNING: Missing line " << Idx + 1 << "\n";
+
+ for (unsigned Idx : MissingVars.set_bits())
+ dbg() << "WARNING: Missing variable " << Idx + 1 << "\n";
+
+ // Update DI loss statistics.
+ if (Stats) {
+ Stats->NumDbgLocsExpected += OriginalNumLines;
+ Stats->NumDbgLocsMissing += MissingLines.count();
+ Stats->NumDbgValuesExpected += OriginalNumVars;
+ Stats->NumDbgValuesMissing += MissingVars.count();
+ }
+
+ dbg() << Banner;
+ if (!NameOfWrappedPass.empty())
+ dbg() << " [" << NameOfWrappedPass << "]";
+ dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
+
+ // Strip debugify metadata if required.
+ if (Strip)
+ return stripDebugifyMetadata(M);
+
+ return false;
+}
+
+/// ModulePass for attaching synthetic debug info to everything, used with the
+/// legacy module pass manager.
+struct DebugifyModulePass : public ModulePass {
bool runOnModule(Module &M) override { return applyDebugify(M); }
-
- DebugifyModulePass() : ModulePass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-};
-
-/// FunctionPass for attaching synthetic debug info to instructions within a
-/// single function, used with the legacy module pass manager.
-struct DebugifyFunctionPass : public FunctionPass {
+
+ DebugifyModulePass() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+
+/// FunctionPass for attaching synthetic debug info to instructions within a
+/// single function, used with the legacy module pass manager.
+struct DebugifyFunctionPass : public FunctionPass {
bool runOnFunction(Function &F) override { return applyDebugify(F); }
-
- DebugifyFunctionPass() : FunctionPass(ID) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-};
-
-/// ModulePass for checking debug info inserted by -debugify, used with the
-/// legacy module pass manager.
-struct CheckDebugifyModulePass : public ModulePass {
- bool runOnModule(Module &M) override {
- return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass,
- "CheckModuleDebugify", Strip, StatsMap);
- }
-
- CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "",
- DebugifyStatsMap *StatsMap = nullptr)
- : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
- StatsMap(StatsMap) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-
-private:
- bool Strip;
- StringRef NameOfWrappedPass;
- DebugifyStatsMap *StatsMap;
-};
-
-/// FunctionPass for checking debug info inserted by -debugify-function, used
-/// with the legacy module pass manager.
-struct CheckDebugifyFunctionPass : public FunctionPass {
- bool runOnFunction(Function &F) override {
- Module &M = *F.getParent();
- auto FuncIt = F.getIterator();
- return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
- NameOfWrappedPass, "CheckFunctionDebugify",
- Strip, StatsMap);
- }
-
- CheckDebugifyFunctionPass(bool Strip = false,
- StringRef NameOfWrappedPass = "",
- DebugifyStatsMap *StatsMap = nullptr)
- : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
- StatsMap(StatsMap) {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- static char ID; // Pass identification.
-
-private:
- bool Strip;
- StringRef NameOfWrappedPass;
- DebugifyStatsMap *StatsMap;
-};
-
-} // end anonymous namespace
-
+
+ DebugifyFunctionPass() : FunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+};
+
+/// ModulePass for checking debug info inserted by -debugify, used with the
+/// legacy module pass manager.
+struct CheckDebugifyModulePass : public ModulePass {
+ bool runOnModule(Module &M) override {
+ return checkDebugifyMetadata(M, M.functions(), NameOfWrappedPass,
+ "CheckModuleDebugify", Strip, StatsMap);
+ }
+
+ CheckDebugifyModulePass(bool Strip = false, StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : ModulePass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
+};
+
+/// FunctionPass for checking debug info inserted by -debugify-function, used
+/// with the legacy module pass manager.
+struct CheckDebugifyFunctionPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ Module &M = *F.getParent();
+ auto FuncIt = F.getIterator();
+ return checkDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
+ NameOfWrappedPass, "CheckFunctionDebugify",
+ Strip, StatsMap);
+ }
+
+ CheckDebugifyFunctionPass(bool Strip = false,
+ StringRef NameOfWrappedPass = "",
+ DebugifyStatsMap *StatsMap = nullptr)
+ : FunctionPass(ID), Strip(Strip), NameOfWrappedPass(NameOfWrappedPass),
+ StatsMap(StatsMap) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ static char ID; // Pass identification.
+
+private:
+ bool Strip;
+ StringRef NameOfWrappedPass;
+ DebugifyStatsMap *StatsMap;
+};
+
+} // end anonymous namespace
+
void llvm::exportDebugifyStats(StringRef Path, const DebugifyStatsMap &Map) {
std::error_code EC;
raw_fd_ostream OS{Path, EC};
@@ -482,7 +482,7 @@ void llvm::exportDebugifyStats(StringRef Path, const DebugifyStatsMap &Map) {
errs() << "Could not open file: " << EC.message() << ", " << Path << '\n';
return;
}
-
+
OS << "Pass Name" << ',' << "# of missing debug values" << ','
<< "# of missing locations" << ',' << "Missing/Expected value ratio" << ','
<< "Missing/Expected location ratio" << '\n';
@@ -501,34 +501,34 @@ ModulePass *llvm::createDebugifyModulePass() {
}
FunctionPass *llvm::createDebugifyFunctionPass() {
- return new DebugifyFunctionPass();
-}
-
-PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
- applyDebugifyMetadata(M, M.functions(),
- "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
- return PreservedAnalyses::all();
-}
-
+ return new DebugifyFunctionPass();
+}
+
+PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
+ applyDebugifyMetadata(M, M.functions(),
+ "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
+ return PreservedAnalyses::all();
+}
+
ModulePass *llvm::createCheckDebugifyModulePass(bool Strip,
StringRef NameOfWrappedPass,
DebugifyStatsMap *StatsMap) {
- return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap);
-}
-
+ return new CheckDebugifyModulePass(Strip, NameOfWrappedPass, StatsMap);
+}
+
FunctionPass *
llvm::createCheckDebugifyFunctionPass(bool Strip, StringRef NameOfWrappedPass,
DebugifyStatsMap *StatsMap) {
- return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap);
-}
-
-PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M,
- ModuleAnalysisManager &) {
- checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false,
- nullptr);
- return PreservedAnalyses::all();
-}
-
+ return new CheckDebugifyFunctionPass(Strip, NameOfWrappedPass, StatsMap);
+}
+
+PreservedAnalyses NewPMCheckDebugifyPass::run(Module &M,
+ ModuleAnalysisManager &) {
+ checkDebugifyMetadata(M, M.functions(), "", "CheckModuleDebugify", false,
+ nullptr);
+ return PreservedAnalyses::all();
+}
+
static bool isIgnoredPass(StringRef PassID) {
return isSpecialPass(PassID, {"PassManager", "PassAdaptor",
"AnalysisManagerProxy", "PrintFunctionPass",
@@ -564,18 +564,18 @@ void DebugifyEachInstrumentation::registerCallbacks(
});
}
-char DebugifyModulePass::ID = 0;
-static RegisterPass<DebugifyModulePass> DM("debugify",
- "Attach debug info to everything");
-
-char CheckDebugifyModulePass::ID = 0;
-static RegisterPass<CheckDebugifyModulePass>
- CDM("check-debugify", "Check debug info from -debugify");
-
-char DebugifyFunctionPass::ID = 0;
-static RegisterPass<DebugifyFunctionPass> DF("debugify-function",
- "Attach debug info to a function");
-
-char CheckDebugifyFunctionPass::ID = 0;
-static RegisterPass<CheckDebugifyFunctionPass>
- CDF("check-debugify-function", "Check debug info from -debugify-function");
+char DebugifyModulePass::ID = 0;
+static RegisterPass<DebugifyModulePass> DM("debugify",
+ "Attach debug info to everything");
+
+char CheckDebugifyModulePass::ID = 0;
+static RegisterPass<CheckDebugifyModulePass>
+ CDM("check-debugify", "Check debug info from -debugify");
+
+char DebugifyFunctionPass::ID = 0;
+static RegisterPass<DebugifyFunctionPass> DF("debugify-function",
+ "Attach debug info to a function");
+
+char CheckDebugifyFunctionPass::ID = 0;
+static RegisterPass<CheckDebugifyFunctionPass>
+ CDF("check-debugify-function", "Check debug info from -debugify-function");
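A minimal harness sketch for the legacy-PM entry points above (createDebugifyModulePass and createCheckDebugifyModulePass, declared in llvm/Transforms/Utils/Debugify.h); runWithDebugify and the "pass-under-test" label are illustrative and not part of this commit.

  #include "llvm/IR/LegacyPassManager.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Pass.h"
  #include "llvm/Transforms/Utils/Debugify.h"

  // Bracket a pass with -debugify / -check-debugify so that the debug
  // locations and variables it drops get reported; Strip=true removes the
  // synthetic debug info afterwards, and per-pass loss counters accumulate
  // in Stats under the wrapped-pass name.
  static void runWithDebugify(llvm::Module &M, llvm::ModulePass *PassUnderTest,
                              llvm::DebugifyStatsMap &Stats) {
    llvm::legacy::PassManager PM; // takes ownership of the passes added below
    PM.add(llvm::createDebugifyModulePass());
    PM.add(PassUnderTest);
    PM.add(llvm::createCheckDebugifyModulePass(/*Strip=*/true,
                                               /*NameOfWrappedPass=*/"pass-under-test",
                                               &Stats));
    PM.run(M);
  }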
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
index fb9db4033c..5f53d794fe 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -1,153 +1,153 @@
-//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-/// DemoteRegToStack - This function takes a virtual register computed by an
-/// Instruction and replaces it with a slot in the stack frame, allocated via
-/// alloca. This allows the CFG to be changed around without fear of
-/// invalidating the SSA information for the value. It returns the pointer to
-/// the alloca inserted to create a stack slot for I.
-AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
- Instruction *AllocaPoint) {
- if (I.use_empty()) {
- I.eraseFromParent();
- return nullptr;
- }
-
- Function *F = I.getParent()->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- // Create a stack slot to hold the value.
- AllocaInst *Slot;
- if (AllocaPoint) {
- Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
- I.getName()+".reg2mem", AllocaPoint);
- } else {
- Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
- I.getName() + ".reg2mem", &F->getEntryBlock().front());
- }
-
- // We cannot demote invoke instructions to the stack if their normal edge
- // is critical. Therefore, split the critical edge and create a basic block
- // into which the store can be inserted.
- if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- if (!II->getNormalDest()->getSinglePredecessor()) {
- unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
- assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
- BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
- assert(BB && "Unable to split critical edge.");
- (void)BB;
- }
- }
-
- // Change all of the users of the instruction to read from the stack slot.
- while (!I.use_empty()) {
- Instruction *U = cast<Instruction>(I.user_back());
- if (PHINode *PN = dyn_cast<PHINode>(U)) {
- // If this is a PHI node, we can't insert a load of the value before the
- // use. Instead insert the load in the predecessor block corresponding
- // to the incoming value.
- //
- // Note that if there are multiple edges from a basic block to this PHI
- // node that we cannot have multiple loads. The problem is that the
- // resulting PHI node will have multiple values (from each load) coming in
- // from the same block, which is illegal SSA form. For this reason, we
- // keep track of and reuse loads we insert.
- DenseMap<BasicBlock*, Value*> Loads;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == &I) {
- Value *&V = Loads[PN->getIncomingBlock(i)];
- if (!V) {
- // Insert the load into the predecessor block
- V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
- VolatileLoads,
- PN->getIncomingBlock(i)->getTerminator());
- }
- PN->setIncomingValue(i, V);
- }
-
- } else {
- // If this is a normal instruction, just insert a load.
- Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
- VolatileLoads, U);
- U->replaceUsesOfWith(&I, V);
- }
- }
-
- // Insert stores of the computed value into the stack slot. We have to be
- // careful if I is an invoke instruction, because we can't insert the store
- // AFTER the terminator instruction.
- BasicBlock::iterator InsertPt;
- if (!I.isTerminator()) {
- InsertPt = ++I.getIterator();
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- /* empty */; // Don't insert before PHI nodes or landingpad instrs.
- } else {
- InvokeInst &II = cast<InvokeInst>(I);
- InsertPt = II.getNormalDest()->getFirstInsertionPt();
- }
-
- new StoreInst(&I, Slot, &*InsertPt);
- return Slot;
-}
-
-/// DemotePHIToStack - This function takes a virtual register computed by a PHI
-/// node and replaces it with a slot in the stack frame allocated via alloca.
-/// The PHI node is deleted. It returns the pointer to the alloca inserted.
-AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
- if (P->use_empty()) {
- P->eraseFromParent();
- return nullptr;
- }
-
- const DataLayout &DL = P->getModule()->getDataLayout();
-
- // Create a stack slot to hold the value.
- AllocaInst *Slot;
- if (AllocaPoint) {
- Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
- P->getName()+".reg2mem", AllocaPoint);
- } else {
- Function *F = P->getParent()->getParent();
- Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
- P->getName() + ".reg2mem",
- &F->getEntryBlock().front());
- }
-
- // Iterate over each operand inserting a store in each predecessor.
- for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
- assert(II->getParent() != P->getIncomingBlock(i) &&
- "Invoke edge not supported yet"); (void)II;
- }
- new StoreInst(P->getIncomingValue(i), Slot,
- P->getIncomingBlock(i)->getTerminator());
- }
-
- // Insert a load in place of the PHI and replace all uses.
- BasicBlock::iterator InsertPt = P->getIterator();
-
- for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
- /* empty */; // Don't insert before PHI nodes or landingpad instrs.
-
- Value *V =
- new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
- P->replaceAllUsesWith(V);
-
- // Delete PHI.
- P->eraseFromParent();
- return Slot;
-}
+//===- DemoteRegToStack.cpp - Move a virtual register to the stack --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+/// DemoteRegToStack - This function takes a virtual register computed by an
+/// Instruction and replaces it with a slot in the stack frame, allocated via
+/// alloca. This allows the CFG to be changed around without fear of
+/// invalidating the SSA information for the value. It returns the pointer to
+/// the alloca inserted to create a stack slot for I.
+AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
+ Instruction *AllocaPoint) {
+ if (I.use_empty()) {
+ I.eraseFromParent();
+ return nullptr;
+ }
+
+ Function *F = I.getParent()->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName()+".reg2mem", AllocaPoint);
+ } else {
+ Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+ I.getName() + ".reg2mem", &F->getEntryBlock().front());
+ }
+
+ // We cannot demote invoke instructions to the stack if their normal edge
+ // is critical. Therefore, split the critical edge and create a basic block
+ // into which the store can be inserted.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
+ if (!II->getNormalDest()->getSinglePredecessor()) {
+ unsigned SuccNum = GetSuccessorNumber(II->getParent(), II->getNormalDest());
+ assert(isCriticalEdge(II, SuccNum) && "Expected a critical edge!");
+ BasicBlock *BB = SplitCriticalEdge(II, SuccNum);
+ assert(BB && "Unable to split critical edge.");
+ (void)BB;
+ }
+ }
+
+ // Change all of the users of the instruction to read from the stack slot.
+ while (!I.use_empty()) {
+ Instruction *U = cast<Instruction>(I.user_back());
+ if (PHINode *PN = dyn_cast<PHINode>(U)) {
+ // If this is a PHI node, we can't insert a load of the value before the
+ // use. Instead insert the load in the predecessor block corresponding
+ // to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this PHI
+ // node that we cannot have multiple loads. The problem is that the
+ // resulting PHI node will have multiple values (from each load) coming in
+ // from the same block, which is illegal SSA form. For this reason, we
+ // keep track of and reuse loads we insert.
+ DenseMap<BasicBlock*, Value*> Loads;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &I) {
+ Value *&V = Loads[PN->getIncomingBlock(i)];
+ if (!V) {
+ // Insert the load into the predecessor block
+ V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
+ VolatileLoads,
+ PN->getIncomingBlock(i)->getTerminator());
+ }
+ PN->setIncomingValue(i, V);
+ }
+
+ } else {
+ // If this is a normal instruction, just insert a load.
+ Value *V = new LoadInst(I.getType(), Slot, I.getName() + ".reload",
+ VolatileLoads, U);
+ U->replaceUsesOfWith(&I, V);
+ }
+ }
+
+ // Insert stores of the computed value into the stack slot. We have to be
+ // careful if I is an invoke instruction, because we can't insert the store
+ // AFTER the terminator instruction.
+ BasicBlock::iterator InsertPt;
+ if (!I.isTerminator()) {
+ InsertPt = ++I.getIterator();
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+ } else {
+ InvokeInst &II = cast<InvokeInst>(I);
+ InsertPt = II.getNormalDest()->getFirstInsertionPt();
+ }
+
+ new StoreInst(&I, Slot, &*InsertPt);
+ return Slot;
+}
+
+/// DemotePHIToStack - This function takes a virtual register computed by a PHI
+/// node and replaces it with a slot in the stack frame allocated via alloca.
+/// The PHI node is deleted. It returns the pointer to the alloca inserted.
+AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
+ if (P->use_empty()) {
+ P->eraseFromParent();
+ return nullptr;
+ }
+
+ const DataLayout &DL = P->getModule()->getDataLayout();
+
+ // Create a stack slot to hold the value.
+ AllocaInst *Slot;
+ if (AllocaPoint) {
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName()+".reg2mem", AllocaPoint);
+ } else {
+ Function *F = P->getParent()->getParent();
+ Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+ P->getName() + ".reg2mem",
+ &F->getEntryBlock().front());
+ }
+
+ // Iterate over each operand inserting a store in each predecessor.
+ for (unsigned i = 0, e = P->getNumIncomingValues(); i < e; ++i) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(P->getIncomingValue(i))) {
+ assert(II->getParent() != P->getIncomingBlock(i) &&
+ "Invoke edge not supported yet"); (void)II;
+ }
+ new StoreInst(P->getIncomingValue(i), Slot,
+ P->getIncomingBlock(i)->getTerminator());
+ }
+
+ // Insert a load in place of the PHI and replace all uses.
+ BasicBlock::iterator InsertPt = P->getIterator();
+
+ for (; isa<PHINode>(InsertPt) || InsertPt->isEHPad(); ++InsertPt)
+ /* empty */; // Don't insert before PHI nodes or landingpad instrs.
+
+ Value *V =
+ new LoadInst(P->getType(), Slot, P->getName() + ".reload", &*InsertPt);
+ P->replaceAllUsesWith(V);
+
+ // Delete PHI.
+ P->eraseFromParent();
+ return Slot;
+}
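A minimal usage sketch for the two demotion helpers above (declared in llvm/Transforms/Utils/Local.h, which this file includes); demoteAllPHIs is an illustrative reg2mem-style helper, not part of this commit.

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/Instructions.h"
  #include "llvm/Transforms/Utils/Local.h"

  // Rewrite every PHI node in F through a stack slot. The PHIs are collected
  // first because DemotePHIToStack erases the node it is handed.
  static void demoteAllPHIs(llvm::Function &F) {
    llvm::SmallVector<llvm::PHINode *, 16> Phis;
    for (llvm::BasicBlock &BB : F)
      for (llvm::Instruction &I : BB)
        if (auto *PN = llvm::dyn_cast<llvm::PHINode>(&I))
          Phis.push_back(PN);
    for (llvm::PHINode *PN : Phis)
      llvm::DemotePHIToStack(PN, /*AllocaPoint=*/nullptr);
  }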
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 8171eb6b2c..26f8e21952 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -1,168 +1,168 @@
-//===- EntryExitInstrumenter.cpp - Function Entry/Exit Instrumentation ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-static void insertCall(Function &CurFn, StringRef Func,
- Instruction *InsertionPt, DebugLoc DL) {
- Module &M = *InsertionPt->getParent()->getParent()->getParent();
- LLVMContext &C = InsertionPt->getParent()->getContext();
-
- if (Func == "mcount" ||
- Func == ".mcount" ||
- Func == "llvm.arm.gnu.eabi.mcount" ||
- Func == "\01_mcount" ||
- Func == "\01mcount" ||
- Func == "__mcount" ||
- Func == "_mcount" ||
- Func == "__cyg_profile_func_enter_bare") {
- FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
- CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
- Call->setDebugLoc(DL);
- return;
- }
-
- if (Func == "__cyg_profile_func_enter" || Func == "__cyg_profile_func_exit") {
- Type *ArgTypes[] = {Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)};
-
- FunctionCallee Fn = M.getOrInsertFunction(
- Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false));
-
- Instruction *RetAddr = CallInst::Create(
- Intrinsic::getDeclaration(&M, Intrinsic::returnaddress),
- ArrayRef<Value *>(ConstantInt::get(Type::getInt32Ty(C), 0)), "",
- InsertionPt);
- RetAddr->setDebugLoc(DL);
-
- Value *Args[] = {ConstantExpr::getBitCast(&CurFn, Type::getInt8PtrTy(C)),
- RetAddr};
-
- CallInst *Call =
- CallInst::Create(Fn, ArrayRef<Value *>(Args), "", InsertionPt);
- Call->setDebugLoc(DL);
- return;
- }
-
- // We only know how to call a fixed set of instrumentation functions, because
- // they all expect different arguments, etc.
- report_fatal_error(Twine("Unknown instrumentation function: '") + Func + "'");
-}
-
-static bool runOnFunction(Function &F, bool PostInlining) {
- StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined"
- : "instrument-function-entry";
-
- StringRef ExitAttr = PostInlining ? "instrument-function-exit-inlined"
- : "instrument-function-exit";
-
- StringRef EntryFunc = F.getFnAttribute(EntryAttr).getValueAsString();
- StringRef ExitFunc = F.getFnAttribute(ExitAttr).getValueAsString();
-
- bool Changed = false;
-
- // If the attribute is specified, insert instrumentation and then "consume"
- // the attribute so that it's not inserted again if the pass should happen to
- // run later for some reason.
-
- if (!EntryFunc.empty()) {
- DebugLoc DL;
- if (auto SP = F.getSubprogram())
+//===- EntryExitInstrumenter.cpp - Function Entry/Exit Instrumentation ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+static void insertCall(Function &CurFn, StringRef Func,
+ Instruction *InsertionPt, DebugLoc DL) {
+ Module &M = *InsertionPt->getParent()->getParent()->getParent();
+ LLVMContext &C = InsertionPt->getParent()->getContext();
+
+ if (Func == "mcount" ||
+ Func == ".mcount" ||
+ Func == "llvm.arm.gnu.eabi.mcount" ||
+ Func == "\01_mcount" ||
+ Func == "\01mcount" ||
+ Func == "__mcount" ||
+ Func == "_mcount" ||
+ Func == "__cyg_profile_func_enter_bare") {
+ FunctionCallee Fn = M.getOrInsertFunction(Func, Type::getVoidTy(C));
+ CallInst *Call = CallInst::Create(Fn, "", InsertionPt);
+ Call->setDebugLoc(DL);
+ return;
+ }
+
+ if (Func == "__cyg_profile_func_enter" || Func == "__cyg_profile_func_exit") {
+ Type *ArgTypes[] = {Type::getInt8PtrTy(C), Type::getInt8PtrTy(C)};
+
+ FunctionCallee Fn = M.getOrInsertFunction(
+ Func, FunctionType::get(Type::getVoidTy(C), ArgTypes, false));
+
+ Instruction *RetAddr = CallInst::Create(
+ Intrinsic::getDeclaration(&M, Intrinsic::returnaddress),
+ ArrayRef<Value *>(ConstantInt::get(Type::getInt32Ty(C), 0)), "",
+ InsertionPt);
+ RetAddr->setDebugLoc(DL);
+
+ Value *Args[] = {ConstantExpr::getBitCast(&CurFn, Type::getInt8PtrTy(C)),
+ RetAddr};
+
+ CallInst *Call =
+ CallInst::Create(Fn, ArrayRef<Value *>(Args), "", InsertionPt);
+ Call->setDebugLoc(DL);
+ return;
+ }
+
+ // We only know how to call a fixed set of instrumentation functions, because
+ // they all expect different arguments, etc.
+ report_fatal_error(Twine("Unknown instrumentation function: '") + Func + "'");
+}
+
+static bool runOnFunction(Function &F, bool PostInlining) {
+ StringRef EntryAttr = PostInlining ? "instrument-function-entry-inlined"
+ : "instrument-function-entry";
+
+ StringRef ExitAttr = PostInlining ? "instrument-function-exit-inlined"
+ : "instrument-function-exit";
+
+ StringRef EntryFunc = F.getFnAttribute(EntryAttr).getValueAsString();
+ StringRef ExitFunc = F.getFnAttribute(ExitAttr).getValueAsString();
+
+ bool Changed = false;
+
+ // If the attribute is specified, insert instrumentation and then "consume"
+ // the attribute so that it's not inserted again if the pass should happen to
+ // run later for some reason.
+
+ if (!EntryFunc.empty()) {
+ DebugLoc DL;
+ if (auto SP = F.getSubprogram())
DL = DILocation::get(SP->getContext(), SP->getScopeLine(), 0, SP);
-
- insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
- Changed = true;
- F.removeAttribute(AttributeList::FunctionIndex, EntryAttr);
- }
-
- if (!ExitFunc.empty()) {
- for (BasicBlock &BB : F) {
- Instruction *T = BB.getTerminator();
- if (!isa<ReturnInst>(T))
- continue;
-
- // If T is preceded by a musttail call, that's the real terminator.
+
+ insertCall(F, EntryFunc, &*F.begin()->getFirstInsertionPt(), DL);
+ Changed = true;
+ F.removeAttribute(AttributeList::FunctionIndex, EntryAttr);
+ }
+
+ if (!ExitFunc.empty()) {
+ for (BasicBlock &BB : F) {
+ Instruction *T = BB.getTerminator();
+ if (!isa<ReturnInst>(T))
+ continue;
+
+ // If T is preceded by a musttail call, that's the real terminator.
if (CallInst *CI = BB.getTerminatingMustTailCall())
T = CI;
-
- DebugLoc DL;
- if (DebugLoc TerminatorDL = T->getDebugLoc())
- DL = TerminatorDL;
- else if (auto SP = F.getSubprogram())
+
+ DebugLoc DL;
+ if (DebugLoc TerminatorDL = T->getDebugLoc())
+ DL = TerminatorDL;
+ else if (auto SP = F.getSubprogram())
DL = DILocation::get(SP->getContext(), 0, 0, SP);
-
- insertCall(F, ExitFunc, T, DL);
- Changed = true;
- }
- F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
- }
-
- return Changed;
-}
-
-namespace {
-struct EntryExitInstrumenter : public FunctionPass {
- static char ID;
- EntryExitInstrumenter() : FunctionPass(ID) {
- initializeEntryExitInstrumenterPass(*PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, false); }
-};
-char EntryExitInstrumenter::ID = 0;
-
-struct PostInlineEntryExitInstrumenter : public FunctionPass {
- static char ID;
- PostInlineEntryExitInstrumenter() : FunctionPass(ID) {
- initializePostInlineEntryExitInstrumenterPass(
- *PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addPreserved<GlobalsAAWrapperPass>();
- }
- bool runOnFunction(Function &F) override { return ::runOnFunction(F, true); }
-};
-char PostInlineEntryExitInstrumenter::ID = 0;
-}
-
-INITIALIZE_PASS(
- EntryExitInstrumenter, "ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
- false, false)
-INITIALIZE_PASS(PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
- "Instrument function entry/exit with calls to e.g. mcount() "
- "(post inlining)",
- false, false)
-
-FunctionPass *llvm::createEntryExitInstrumenterPass() {
- return new EntryExitInstrumenter();
-}
-
-FunctionPass *llvm::createPostInlineEntryExitInstrumenterPass() {
- return new PostInlineEntryExitInstrumenter();
-}
-
-PreservedAnalyses
-llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
- runOnFunction(F, PostInlining);
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+
+ insertCall(F, ExitFunc, T, DL);
+ Changed = true;
+ }
+ F.removeAttribute(AttributeList::FunctionIndex, ExitAttr);
+ }
+
+ return Changed;
+}
+
+namespace {
+struct EntryExitInstrumenter : public FunctionPass {
+ static char ID;
+ EntryExitInstrumenter() : FunctionPass(ID) {
+ initializeEntryExitInstrumenterPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override { return ::runOnFunction(F, false); }
+};
+char EntryExitInstrumenter::ID = 0;
+
+struct PostInlineEntryExitInstrumenter : public FunctionPass {
+ static char ID;
+ PostInlineEntryExitInstrumenter() : FunctionPass(ID) {
+ initializePostInlineEntryExitInstrumenterPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
+ bool runOnFunction(Function &F) override { return ::runOnFunction(F, true); }
+};
+char PostInlineEntryExitInstrumenter::ID = 0;
+}
+
+INITIALIZE_PASS(
+ EntryExitInstrumenter, "ee-instrument",
+ "Instrument function entry/exit with calls to e.g. mcount() (pre inlining)",
+ false, false)
+INITIALIZE_PASS(PostInlineEntryExitInstrumenter, "post-inline-ee-instrument",
+ "Instrument function entry/exit with calls to e.g. mcount() "
+ "(post inlining)",
+ false, false)
+
+FunctionPass *llvm::createEntryExitInstrumenterPass() {
+ return new EntryExitInstrumenter();
+}
+
+FunctionPass *llvm::createPostInlineEntryExitInstrumenterPass() {
+ return new PostInlineEntryExitInstrumenter();
+}
+
+PreservedAnalyses
+llvm::EntryExitInstrumenterPass::run(Function &F, FunctionAnalysisManager &AM) {
+ runOnFunction(F, PostInlining);
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
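
A note on how the pass above is triggered: insertCall only runs for functions carrying the string attributes that runOnFunction reads and then strips. Below is a minimal sketch, not part of the patch, of how a caller might tag a function so the pre-inlining "ee-instrument" pass emits an mcount() call at entry; the helper name requestMcountInstrumentation is hypothetical.

#include "llvm/IR/Function.h"

// Hypothetical helper: mark F so that EntryExitInstrumenter later inserts a
// call to mcount() at the first insertion point of the entry block. The
// attribute value must be one of the hooks insertCall recognizes.
static void requestMcountInstrumentation(llvm::Function &F) {
  F.addFnAttr("instrument-function-entry", "mcount");
}
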
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
index d57669834a..accedd5b4e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -1,98 +1,98 @@
-//===- EscapeEnumerator.cpp -----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines a helper class that enumerates all possible exits from a function,
-// including exception handling.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/EscapeEnumerator.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/Local.h"
-
-using namespace llvm;
-
-static FunctionCallee getDefaultPersonalityFn(Module *M) {
- LLVMContext &C = M->getContext();
- Triple T(M->getTargetTriple());
- EHPersonality Pers = getDefaultEHPersonality(T);
- return M->getOrInsertFunction(getEHPersonalityName(Pers),
- FunctionType::get(Type::getInt32Ty(C), true));
-}
-
-IRBuilder<> *EscapeEnumerator::Next() {
- if (Done)
- return nullptr;
-
- // Find all 'return', 'resume', and 'unwind' instructions.
- while (StateBB != StateE) {
- BasicBlock *CurBB = &*StateBB++;
-
- // Branches and invokes do not escape, only unwind, resume, and return
- // do.
- Instruction *TI = CurBB->getTerminator();
- if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
- continue;
-
+//===- EscapeEnumerator.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines a helper class that enumerates all possible exits from a function,
+// including exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+static FunctionCallee getDefaultPersonalityFn(Module *M) {
+ LLVMContext &C = M->getContext();
+ Triple T(M->getTargetTriple());
+ EHPersonality Pers = getDefaultEHPersonality(T);
+ return M->getOrInsertFunction(getEHPersonalityName(Pers),
+ FunctionType::get(Type::getInt32Ty(C), true));
+}
+
+IRBuilder<> *EscapeEnumerator::Next() {
+ if (Done)
+ return nullptr;
+
+ // Find all 'return', 'resume', and 'unwind' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = &*StateBB++;
+
+ // Branches and invokes do not escape, only unwind, resume, and return
+ // do.
+ Instruction *TI = CurBB->getTerminator();
+ if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+ continue;
+
if (CallInst *CI = CurBB->getTerminatingMustTailCall())
TI = CI;
- Builder.SetInsertPoint(TI);
- return &Builder;
- }
-
- Done = true;
-
- if (!HandleExceptions)
- return nullptr;
-
- if (F.doesNotThrow())
- return nullptr;
-
- // Find all 'call' instructions that may throw.
+ Builder.SetInsertPoint(TI);
+ return &Builder;
+ }
+
+ Done = true;
+
+ if (!HandleExceptions)
+ return nullptr;
+
+ if (F.doesNotThrow())
+ return nullptr;
+
+ // Find all 'call' instructions that may throw.
   // We cannot transform calls with musttail tag.
- SmallVector<Instruction *, 16> Calls;
- for (BasicBlock &BB : F)
- for (Instruction &II : BB)
- if (CallInst *CI = dyn_cast<CallInst>(&II))
+ SmallVector<Instruction *, 16> Calls;
+ for (BasicBlock &BB : F)
+ for (Instruction &II : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&II))
if (!CI->doesNotThrow() && !CI->isMustTailCall())
- Calls.push_back(CI);
-
- if (Calls.empty())
- return nullptr;
-
- // Create a cleanup block.
- LLVMContext &C = F.getContext();
- BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
- Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
- if (!F.hasPersonalityFn()) {
- FunctionCallee PersFn = getDefaultPersonalityFn(F.getParent());
- F.setPersonalityFn(cast<Constant>(PersFn.getCallee()));
- }
-
- if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
- report_fatal_error("Scoped EH not supported");
- }
-
- LandingPadInst *LPad =
- LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
- LPad->setCleanup(true);
- ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
-
- // Transform the 'call' instructions into 'invoke's branching to the
- // cleanup block. Go in reverse order to make prettier BB names.
- SmallVector<Value *, 16> Args;
- for (unsigned I = Calls.size(); I != 0;) {
- CallInst *CI = cast<CallInst>(Calls[--I]);
- changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
- }
-
- Builder.SetInsertPoint(RI);
- return &Builder;
-}
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return nullptr;
+
+ // Create a cleanup block.
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C));
+ if (!F.hasPersonalityFn()) {
+ FunctionCallee PersFn = getDefaultPersonalityFn(F.getParent());
+ F.setPersonalityFn(cast<Constant>(PersFn.getCallee()));
+ }
+
+ if (isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn()))) {
+ report_fatal_error("Scoped EH not supported");
+ }
+
+ LandingPadInst *LPad =
+ LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = Calls.size(); I != 0;) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+ changeToInvokeAndSplitBasicBlock(CI, CleanupBB);
+ }
+
+ Builder.SetInsertPoint(RI);
+ return &Builder;
+}
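
For orientation, EscapeEnumerator::Next() above is normally driven in a loop until it returns null; each iteration hands back the member IRBuilder positioned at one function exit. A minimal usage sketch follows, not part of the patch; the wrapper visitEveryExit and the cleanup-block name are hypothetical.

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Transforms/Utils/EscapeEnumerator.h"

// Hypothetical driver: visit every return/resume, plus the synthesized
// cleanup landing pad created for calls that may throw.
static void visitEveryExit(llvm::Function &F) {
  llvm::EscapeEnumerator EE(F, "my.cleanup", /*HandleExceptions=*/true);
  while (llvm::IRBuilder<> *B = EE.Next()) {
    // B is already positioned before the exit instruction; emit per-exit
    // instrumentation here, e.g. via B->CreateCall(...).
    (void)B;
  }
}
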
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
index 4a7167069b..732b00635e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Evaluator.cpp
@@ -1,728 +1,728 @@
-//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Function evaluator for LLVM IR.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Evaluator.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <iterator>
-
-#define DEBUG_TYPE "evaluator"
-
-using namespace llvm;
-
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL);
-
-/// Return true if the specified constant can be handled by the code generator.
-/// We don't want to generate something like:
-/// void *X = &X/42;
-/// because the code generator doesn't have a relocation that can handle that.
-///
-/// This function should be called if C was not found (but just got inserted)
-/// in SimpleConstants to avoid having to rescan the same constants all the
-/// time.
-static bool
-isSimpleEnoughValueToCommitHelper(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // Simple global addresses are supported, do not allow dllimport or
- // thread-local globals.
- if (auto *GV = dyn_cast<GlobalValue>(C))
- return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
-
- // Simple integer, undef, constant aggregate zero, etc are all supported.
- if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
- return true;
-
- // Aggregate values are safe if all their elements are.
- if (isa<ConstantAggregate>(C)) {
- for (Value *Op : C->operands())
- if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
- return false;
- return true;
- }
-
- // We don't know exactly what relocations are allowed in constant expressions,
- // so we allow &global+constantoffset, which is safe and uniformly supported
- // across targets.
- ConstantExpr *CE = cast<ConstantExpr>(C);
- switch (CE->getOpcode()) {
- case Instruction::BitCast:
- // Bitcast is fine if the casted value is fine.
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::IntToPtr:
- case Instruction::PtrToInt:
- // int <=> ptr is fine if the int type is the same size as the
- // pointer type.
- if (DL.getTypeSizeInBits(CE->getType()) !=
- DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- // GEP is fine if it is simple + constant offset.
- case Instruction::GetElementPtr:
- for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
- if (!isa<ConstantInt>(CE->getOperand(i)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
-
- case Instruction::Add:
- // We allow simple+cst.
- if (!isa<ConstantInt>(CE->getOperand(1)))
- return false;
- return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
- }
- return false;
-}
-
-static inline bool
-isSimpleEnoughValueToCommit(Constant *C,
- SmallPtrSetImpl<Constant *> &SimpleConstants,
- const DataLayout &DL) {
- // If we already checked this constant, we win.
- if (!SimpleConstants.insert(C).second)
- return true;
- // Check the constant.
- return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
-}
-
-/// Return true if this constant is simple enough for us to understand. In
-/// particular, if it is a cast to anything other than from one pointer type to
-/// another pointer type, we punt. We basically just support direct accesses to
-/// globals and GEP's of globals. This should be kept up to date with
-/// CommitValueTo.
-static bool isSimpleEnoughPointerToCommit(Constant *C) {
- // Conservatively, avoid aggregate types. This is because we don't
- // want to worry about them partially overlapping other stores.
- if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
- return false;
-
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
- // Do not allow weak/*_odr/linkonce linkage or external globals.
- return GV->hasUniqueInitializer();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- // Handle a constantexpr gep.
- if (CE->getOpcode() == Instruction::GetElementPtr &&
- isa<GlobalVariable>(CE->getOperand(0)) &&
- cast<GEPOperator>(CE)->isInBounds()) {
- GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- if (!GV->hasUniqueInitializer())
- return false;
-
- // The first index must be zero.
- ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
- if (!CI || !CI->isZero()) return false;
-
- // The remaining indices must be compile-time known integers within the
- // notional bounds of the corresponding static array types.
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
-
- return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
-
- // A constantexpr bitcast from a pointer to another pointer is a no-op,
- // and we know how to evaluate it by moving the bitcast from the pointer
- // operand to the value operand.
- } else if (CE->getOpcode() == Instruction::BitCast &&
- isa<GlobalVariable>(CE->getOperand(0))) {
- // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
- // external globals.
- return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
- }
- }
-
- return false;
-}
-
-/// Apply 'Func' to Ptr. If this returns nullptr, introspect the pointer's
-/// type and walk down through the initial elements to obtain additional
-/// pointers to try. Returns the first non-null return value from Func, or
-/// nullptr if the type can't be introspected further.
-static Constant *
-evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
- const TargetLibraryInfo *TLI,
- std::function<Constant *(Constant *)> Func) {
- Constant *Val;
- while (!(Val = Func(Ptr))) {
+//===- Evaluator.cpp - LLVM IR evaluator ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Function evaluator for LLVM IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Evaluator.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <iterator>
+
+#define DEBUG_TYPE "evaluator"
+
+using namespace llvm;
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL);
+
+/// Return true if the specified constant can be handled by the code generator.
+/// We don't want to generate something like:
+/// void *X = &X/42;
+/// because the code generator doesn't have a relocation that can handle that.
+///
+/// This function should be called if C was not found (but just got inserted)
+/// in SimpleConstants to avoid having to rescan the same constants all the
+/// time.
+static bool
+isSimpleEnoughValueToCommitHelper(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // Simple global addresses are supported, do not allow dllimport or
+ // thread-local globals.
+ if (auto *GV = dyn_cast<GlobalValue>(C))
+ return !GV->hasDLLImportStorageClass() && !GV->isThreadLocal();
+
+ // Simple integer, undef, constant aggregate zero, etc are all supported.
+ if (C->getNumOperands() == 0 || isa<BlockAddress>(C))
+ return true;
+
+ // Aggregate values are safe if all their elements are.
+ if (isa<ConstantAggregate>(C)) {
+ for (Value *Op : C->operands())
+ if (!isSimpleEnoughValueToCommit(cast<Constant>(Op), SimpleConstants, DL))
+ return false;
+ return true;
+ }
+
+ // We don't know exactly what relocations are allowed in constant expressions,
+ // so we allow &global+constantoffset, which is safe and uniformly supported
+ // across targets.
+ ConstantExpr *CE = cast<ConstantExpr>(C);
+ switch (CE->getOpcode()) {
+ case Instruction::BitCast:
+ // Bitcast is fine if the casted value is fine.
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::IntToPtr:
+ case Instruction::PtrToInt:
+ // int <=> ptr is fine if the int type is the same size as the
+ // pointer type.
+ if (DL.getTypeSizeInBits(CE->getType()) !=
+ DL.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ // GEP is fine if it is simple + constant offset.
+ case Instruction::GetElementPtr:
+ for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
+ if (!isa<ConstantInt>(CE->getOperand(i)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+
+ case Instruction::Add:
+ // We allow simple+cst.
+ if (!isa<ConstantInt>(CE->getOperand(1)))
+ return false;
+ return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, DL);
+ }
+ return false;
+}
+
+static inline bool
+isSimpleEnoughValueToCommit(Constant *C,
+ SmallPtrSetImpl<Constant *> &SimpleConstants,
+ const DataLayout &DL) {
+ // If we already checked this constant, we win.
+ if (!SimpleConstants.insert(C).second)
+ return true;
+ // Check the constant.
+ return isSimpleEnoughValueToCommitHelper(C, SimpleConstants, DL);
+}
+
+/// Return true if this constant is simple enough for us to understand. In
+/// particular, if it is a cast to anything other than from one pointer type to
+/// another pointer type, we punt. We basically just support direct accesses to
+/// globals and GEP's of globals. This should be kept up to date with
+/// CommitValueTo.
+static bool isSimpleEnoughPointerToCommit(Constant *C) {
+ // Conservatively, avoid aggregate types. This is because we don't
+ // want to worry about them partially overlapping other stores.
+ if (!cast<PointerType>(C->getType())->getElementType()->isSingleValueType())
+ return false;
+
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C))
+ // Do not allow weak/*_odr/linkonce linkage or external globals.
+ return GV->hasUniqueInitializer();
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ // Handle a constantexpr gep.
+ if (CE->getOpcode() == Instruction::GetElementPtr &&
+ isa<GlobalVariable>(CE->getOperand(0)) &&
+ cast<GEPOperator>(CE)->isInBounds()) {
+ GlobalVariable *GV = cast<GlobalVariable>(CE->getOperand(0));
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ if (!GV->hasUniqueInitializer())
+ return false;
+
+ // The first index must be zero.
+ ConstantInt *CI = dyn_cast<ConstantInt>(*std::next(CE->op_begin()));
+ if (!CI || !CI->isZero()) return false;
+
+ // The remaining indices must be compile-time known integers within the
+ // notional bounds of the corresponding static array types.
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+
+ return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
+
+ // A constantexpr bitcast from a pointer to another pointer is a no-op,
+ // and we know how to evaluate it by moving the bitcast from the pointer
+ // operand to the value operand.
+ } else if (CE->getOpcode() == Instruction::BitCast &&
+ isa<GlobalVariable>(CE->getOperand(0))) {
+ // Do not allow weak/*_odr/linkonce/dllimport/dllexport linkage or
+ // external globals.
+ return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
+ }
+ }
+
+ return false;
+}
+
+/// Apply 'Func' to Ptr. If this returns nullptr, introspect the pointer's
+/// type and walk down through the initial elements to obtain additional
+/// pointers to try. Returns the first non-null return value from Func, or
+/// nullptr if the type can't be introspected further.
+static Constant *
+evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ std::function<Constant *(Constant *)> Func) {
+ Constant *Val;
+ while (!(Val = Func(Ptr))) {
// If Ty is a non-opaque struct, we can convert the pointer to the struct
- // into a pointer to its first member.
- // FIXME: This could be extended to support arrays as well.
- Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
+ // into a pointer to its first member.
+ // FIXME: This could be extended to support arrays as well.
+ Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
if (!isa<StructType>(Ty) || cast<StructType>(Ty)->isOpaque())
- break;
-
- IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
- Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
- Constant *const IdxList[] = {IdxZero, IdxZero};
-
- Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
- Ptr = ConstantFoldConstant(Ptr, DL, TLI);
- }
- return Val;
-}
-
-static Constant *getInitializer(Constant *C) {
- auto *GV = dyn_cast<GlobalVariable>(C);
- return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
-}
-
-/// Return the value that would be computed by a load from P after the stores
-/// reflected by 'memory' have been performed. If we can't decide, return null.
-Constant *Evaluator::ComputeLoadResult(Constant *P) {
- // If this memory location has been recently stored, use the stored value: it
- // is the most up-to-date.
+ break;
+
+ IntegerType *IdxTy = IntegerType::get(Ty->getContext(), 32);
+ Constant *IdxZero = ConstantInt::get(IdxTy, 0, false);
+ Constant *const IdxList[] = {IdxZero, IdxZero};
+
+ Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
+ Ptr = ConstantFoldConstant(Ptr, DL, TLI);
+ }
+ return Val;
+}
+
+static Constant *getInitializer(Constant *C) {
+ auto *GV = dyn_cast<GlobalVariable>(C);
+ return GV && GV->hasDefinitiveInitializer() ? GV->getInitializer() : nullptr;
+}
+
+/// Return the value that would be computed by a load from P after the stores
+/// reflected by 'memory' have been performed. If we can't decide, return null.
+Constant *Evaluator::ComputeLoadResult(Constant *P) {
+ // If this memory location has been recently stored, use the stored value: it
+ // is the most up-to-date.
auto findMemLoc = [this](Constant *Ptr) { return MutatedMemory.lookup(Ptr); };
-
- if (Constant *Val = findMemLoc(P))
- return Val;
-
- // Access it.
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
- if (GV->hasDefinitiveInitializer())
- return GV->getInitializer();
- return nullptr;
- }
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
- switch (CE->getOpcode()) {
- // Handle a constantexpr getelementptr.
- case Instruction::GetElementPtr:
- if (auto *I = getInitializer(CE->getOperand(0)))
- return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
- break;
- // Handle a constantexpr bitcast.
- case Instruction::BitCast:
- // We're evaluating a load through a pointer that was bitcast to a
- // different type. See if the "from" pointer has recently been stored.
- // If it hasn't, we may still be able to find a stored pointer by
- // introspecting the type.
- Constant *Val =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, findMemLoc);
- if (!Val)
- Val = getInitializer(CE->getOperand(0));
- if (Val)
- return ConstantFoldLoadThroughBitcast(
- Val, P->getType()->getPointerElementType(), DL);
- break;
- }
- }
-
- return nullptr; // don't know how to evaluate.
-}
-
-static Function *getFunction(Constant *C) {
- if (auto *Fn = dyn_cast<Function>(C))
- return Fn;
-
- if (auto *Alias = dyn_cast<GlobalAlias>(C))
- if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
- return Fn;
- return nullptr;
-}
-
-Function *
-Evaluator::getCalleeWithFormalArgs(CallBase &CB,
- SmallVectorImpl<Constant *> &Formals) {
- auto *V = CB.getCalledOperand();
- if (auto *Fn = getFunction(getVal(V)))
- return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
-
- auto *CE = dyn_cast<ConstantExpr>(V);
- if (!CE || CE->getOpcode() != Instruction::BitCast ||
- !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
- return nullptr;
-
- return dyn_cast<Function>(
- ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
-}
-
-bool Evaluator::getFormalParams(CallBase &CB, Function *F,
- SmallVectorImpl<Constant *> &Formals) {
- if (!F)
- return false;
-
- auto *FTy = F->getFunctionType();
- if (FTy->getNumParams() > CB.getNumArgOperands()) {
- LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
- return false;
- }
-
- auto ArgI = CB.arg_begin();
- for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
- ++ParI) {
- auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
- if (!ArgC) {
- LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
- return false;
- }
- Formals.push_back(ArgC);
- ++ArgI;
- }
- return true;
-}
-
-/// If call expression contains bitcast then we may need to cast
-/// evaluated return value to a type of the call expression.
-Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
- ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
- if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
- return RV;
-
- if (auto *FT =
- dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
- RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
- if (!RV)
- LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
- }
- return RV;
-}
-
-/// Evaluate all instructions in block BB, returning true if successful, false
-/// if we can't evaluate it. NextBB returns the next BB that control flows into,
-/// or null upon return.
-bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
- BasicBlock *&NextBB) {
- // This is the main evaluation loop.
- while (true) {
- Constant *InstResult = nullptr;
-
- LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
-
- if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
- if (!SI->isSimple()) {
- LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
- Constant *Ptr = getVal(SI->getOperand(1));
- Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
- if (Ptr != FoldedPtr) {
- LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
- Ptr = FoldedPtr;
- LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
- }
- if (!isSimpleEnoughPointerToCommit(Ptr)) {
- // If this is too complex for us to commit, reject it.
- LLVM_DEBUG(
- dbgs() << "Pointer is too complex for us to evaluate store.");
- return false;
- }
-
- Constant *Val = getVal(SI->getOperand(0));
-
- // If this might be too difficult for the backend to handle (e.g. the addr
- // of one global variable divided by another) then we can't commit it.
- if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
- LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
- << *Val << "\n");
- return false;
- }
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
- if (CE->getOpcode() == Instruction::BitCast) {
- LLVM_DEBUG(dbgs()
- << "Attempting to resolve bitcast on constant ptr.\n");
- // If we're evaluating a store through a bitcast, then we need
- // to pull the bitcast off the pointer type and push it onto the
- // stored value. In order to push the bitcast onto the stored value,
- // a bitcast from the pointer's element type to Val's type must be
- // legal. If it's not, we can try introspecting the type to find a
- // legal conversion.
-
- auto castValTy = [&](Constant *P) -> Constant * {
- Type *Ty = cast<PointerType>(P->getType())->getElementType();
- if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, Ty, DL)) {
- Ptr = P;
- return FV;
- }
- return nullptr;
- };
-
- Constant *NewVal =
- evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, castValTy);
- if (!NewVal) {
- LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
- "evaluate.\n");
- return false;
- }
-
- Val = NewVal;
- LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
- }
- }
-
- MutatedMemory[Ptr] = Val;
- } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
- InstResult = ConstantExpr::get(BO->getOpcode(),
- getVal(BO->getOperand(0)),
- getVal(BO->getOperand(1)));
- LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
- << *InstResult << "\n");
- } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
- InstResult = ConstantExpr::getCompare(CI->getPredicate(),
- getVal(CI->getOperand(0)),
- getVal(CI->getOperand(1)));
- LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
- << "\n");
- } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
- InstResult = ConstantExpr::getCast(CI->getOpcode(),
- getVal(CI->getOperand(0)),
- CI->getType());
- LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
- << "\n");
- } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
- InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
- getVal(SI->getOperand(1)),
- getVal(SI->getOperand(2)));
- LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
- << "\n");
- } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
- InstResult = ConstantExpr::getExtractValue(
- getVal(EVI->getAggregateOperand()), EVI->getIndices());
- LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
- << *InstResult << "\n");
- } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
- InstResult = ConstantExpr::getInsertValue(
- getVal(IVI->getAggregateOperand()),
- getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
- LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
- << *InstResult << "\n");
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
- Constant *P = getVal(GEP->getOperand(0));
- SmallVector<Constant*, 8> GEPOps;
- for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
- i != e; ++i)
- GEPOps.push_back(getVal(*i));
- InstResult =
- ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
- cast<GEPOperator>(GEP)->isInBounds());
- LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
- } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
- if (!LI->isSimple()) {
- LLVM_DEBUG(
- dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
- return false; // no volatile/atomic accesses.
- }
-
- Constant *Ptr = getVal(LI->getOperand(0));
- Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
- if (Ptr != FoldedPtr) {
- Ptr = FoldedPtr;
- LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
- "folding: "
- << *Ptr << "\n");
- }
- InstResult = ComputeLoadResult(Ptr);
- if (!InstResult) {
- LLVM_DEBUG(
- dbgs() << "Failed to compute load result. Can not evaluate load."
- "\n");
- return false; // Could not evaluate load.
- }
-
- LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
- } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
- if (AI->isArrayAllocation()) {
- LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
- return false; // Cannot handle array allocs.
- }
- Type *Ty = AI->getAllocatedType();
- AllocaTmps.push_back(std::make_unique<GlobalVariable>(
- Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
- AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
- AI->getType()->getPointerAddressSpace()));
- InstResult = AllocaTmps.back().get();
- LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
- } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
- CallBase &CB = *cast<CallBase>(&*CurInst);
-
- // Debug info can safely be ignored here.
- if (isa<DbgInfoIntrinsic>(CB)) {
- LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
- ++CurInst;
- continue;
- }
-
- // Cannot handle inline asm.
- if (CB.isInlineAsm()) {
- LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
- return false;
- }
-
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) {
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
- if (MSI->isVolatile()) {
- LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
- << "intrinsic.\n");
- return false;
- }
- Constant *Ptr = getVal(MSI->getDest());
- Constant *Val = getVal(MSI->getValue());
- Constant *DestVal = ComputeLoadResult(getVal(Ptr));
- if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
- // This memset is a no-op.
- LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
- ++CurInst;
- continue;
- }
- }
-
- if (II->isLifetimeStartOrEnd()) {
- LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
- ++CurInst;
- continue;
- }
-
- if (II->getIntrinsicID() == Intrinsic::invariant_start) {
- // We don't insert an entry into Values, as it doesn't have a
- // meaningful return value.
- if (!II->use_empty()) {
- LLVM_DEBUG(dbgs()
- << "Found unused invariant_start. Can't evaluate.\n");
- return false;
- }
- ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
- Value *PtrArg = getVal(II->getArgOperand(1));
- Value *Ptr = PtrArg->stripPointerCasts();
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
- Type *ElemTy = GV->getValueType();
- if (!Size->isMinusOne() &&
- Size->getValue().getLimitedValue() >=
- DL.getTypeStoreSize(ElemTy)) {
- Invariants.insert(GV);
- LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
- << *GV << "\n");
- } else {
- LLVM_DEBUG(dbgs()
- << "Found a global var, but can not treat it as an "
- "invariant.\n");
- }
- }
- // Continue even if we do nothing.
- ++CurInst;
- continue;
- } else if (II->getIntrinsicID() == Intrinsic::assume) {
- LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
- ++CurInst;
- continue;
- } else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
- LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
- ++CurInst;
- continue;
+
+ if (Constant *Val = findMemLoc(P))
+ return Val;
+
+ // Access it.
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(P)) {
+ if (GV->hasDefinitiveInitializer())
+ return GV->getInitializer();
+ return nullptr;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(P)) {
+ switch (CE->getOpcode()) {
+ // Handle a constantexpr getelementptr.
+ case Instruction::GetElementPtr:
+ if (auto *I = getInitializer(CE->getOperand(0)))
+ return ConstantFoldLoadThroughGEPConstantExpr(I, CE);
+ break;
+ // Handle a constantexpr bitcast.
+ case Instruction::BitCast:
+ // We're evaluating a load through a pointer that was bitcast to a
+ // different type. See if the "from" pointer has recently been stored.
+ // If it hasn't, we may still be able to find a stored pointer by
+ // introspecting the type.
+ Constant *Val =
+ evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, findMemLoc);
+ if (!Val)
+ Val = getInitializer(CE->getOperand(0));
+ if (Val)
+ return ConstantFoldLoadThroughBitcast(
+ Val, P->getType()->getPointerElementType(), DL);
+ break;
+ }
+ }
+
+ return nullptr; // don't know how to evaluate.
+}
+
+static Function *getFunction(Constant *C) {
+ if (auto *Fn = dyn_cast<Function>(C))
+ return Fn;
+
+ if (auto *Alias = dyn_cast<GlobalAlias>(C))
+ if (auto *Fn = dyn_cast<Function>(Alias->getAliasee()))
+ return Fn;
+ return nullptr;
+}
+
+Function *
+Evaluator::getCalleeWithFormalArgs(CallBase &CB,
+ SmallVectorImpl<Constant *> &Formals) {
+ auto *V = CB.getCalledOperand();
+ if (auto *Fn = getFunction(getVal(V)))
+ return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
+
+ auto *CE = dyn_cast<ConstantExpr>(V);
+ if (!CE || CE->getOpcode() != Instruction::BitCast ||
+ !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
+ return nullptr;
+
+ return dyn_cast<Function>(
+ ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
+}
+
+bool Evaluator::getFormalParams(CallBase &CB, Function *F,
+ SmallVectorImpl<Constant *> &Formals) {
+ if (!F)
+ return false;
+
+ auto *FTy = F->getFunctionType();
+ if (FTy->getNumParams() > CB.getNumArgOperands()) {
+ LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
+ return false;
+ }
+
+ auto ArgI = CB.arg_begin();
+ for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
+ ++ParI) {
+ auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
+ if (!ArgC) {
+ LLVM_DEBUG(dbgs() << "Can not convert function argument.\n");
+ return false;
+ }
+ Formals.push_back(ArgC);
+ ++ArgI;
+ }
+ return true;
+}
+
+/// If call expression contains bitcast then we may need to cast
+/// evaluated return value to a type of the call expression.
+Constant *Evaluator::castCallResultIfNeeded(Value *CallExpr, Constant *RV) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(CallExpr);
+ if (!RV || !CE || CE->getOpcode() != Instruction::BitCast)
+ return RV;
+
+ if (auto *FT =
+ dyn_cast<FunctionType>(CE->getType()->getPointerElementType())) {
+ RV = ConstantFoldLoadThroughBitcast(RV, FT->getReturnType(), DL);
+ if (!RV)
+ LLVM_DEBUG(dbgs() << "Failed to fold bitcast call expr\n");
+ }
+ return RV;
+}
+
+/// Evaluate all instructions in block BB, returning true if successful, false
+/// if we can't evaluate it. NextBB returns the next BB that control flows into,
+/// or null upon return.
+bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
+ BasicBlock *&NextBB) {
+ // This is the main evaluation loop.
+ while (true) {
+ Constant *InstResult = nullptr;
+
+ LLVM_DEBUG(dbgs() << "Evaluating Instruction: " << *CurInst << "\n");
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(CurInst)) {
+ if (!SI->isSimple()) {
+ LLVM_DEBUG(dbgs() << "Store is not simple! Can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+ Constant *Ptr = getVal(SI->getOperand(1));
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
+ LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
+ Ptr = FoldedPtr;
+ LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
+ }
+ if (!isSimpleEnoughPointerToCommit(Ptr)) {
+ // If this is too complex for us to commit, reject it.
+ LLVM_DEBUG(
+ dbgs() << "Pointer is too complex for us to evaluate store.");
+ return false;
+ }
+
+ Constant *Val = getVal(SI->getOperand(0));
+
+ // If this might be too difficult for the backend to handle (e.g. the addr
+ // of one global variable divided by another) then we can't commit it.
+ if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, DL)) {
+ LLVM_DEBUG(dbgs() << "Store value is too complex to evaluate store. "
+ << *Val << "\n");
+ return false;
+ }
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr)) {
+ if (CE->getOpcode() == Instruction::BitCast) {
+ LLVM_DEBUG(dbgs()
+ << "Attempting to resolve bitcast on constant ptr.\n");
+ // If we're evaluating a store through a bitcast, then we need
+ // to pull the bitcast off the pointer type and push it onto the
+ // stored value. In order to push the bitcast onto the stored value,
+ // a bitcast from the pointer's element type to Val's type must be
+ // legal. If it's not, we can try introspecting the type to find a
+ // legal conversion.
+
+ auto castValTy = [&](Constant *P) -> Constant * {
+ Type *Ty = cast<PointerType>(P->getType())->getElementType();
+ if (Constant *FV = ConstantFoldLoadThroughBitcast(Val, Ty, DL)) {
+ Ptr = P;
+ return FV;
+ }
+ return nullptr;
+ };
+
+ Constant *NewVal =
+ evaluateBitcastFromPtr(CE->getOperand(0), DL, TLI, castValTy);
+ if (!NewVal) {
+ LLVM_DEBUG(dbgs() << "Failed to bitcast constant ptr, can not "
+ "evaluate.\n");
+ return false;
+ }
+
+ Val = NewVal;
+ LLVM_DEBUG(dbgs() << "Evaluated bitcast: " << *Val << "\n");
+ }
+ }
+
+ MutatedMemory[Ptr] = Val;
+ } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
+ InstResult = ConstantExpr::get(BO->getOpcode(),
+ getVal(BO->getOperand(0)),
+ getVal(BO->getOperand(1)));
+ LLVM_DEBUG(dbgs() << "Found a BinaryOperator! Simplifying: "
+ << *InstResult << "\n");
+ } else if (CmpInst *CI = dyn_cast<CmpInst>(CurInst)) {
+ InstResult = ConstantExpr::getCompare(CI->getPredicate(),
+ getVal(CI->getOperand(0)),
+ getVal(CI->getOperand(1)));
+ LLVM_DEBUG(dbgs() << "Found a CmpInst! Simplifying: " << *InstResult
+ << "\n");
+ } else if (CastInst *CI = dyn_cast<CastInst>(CurInst)) {
+ InstResult = ConstantExpr::getCast(CI->getOpcode(),
+ getVal(CI->getOperand(0)),
+ CI->getType());
+ LLVM_DEBUG(dbgs() << "Found a Cast! Simplifying: " << *InstResult
+ << "\n");
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(CurInst)) {
+ InstResult = ConstantExpr::getSelect(getVal(SI->getOperand(0)),
+ getVal(SI->getOperand(1)),
+ getVal(SI->getOperand(2)));
+ LLVM_DEBUG(dbgs() << "Found a Select! Simplifying: " << *InstResult
+ << "\n");
+ } else if (auto *EVI = dyn_cast<ExtractValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getExtractValue(
+ getVal(EVI->getAggregateOperand()), EVI->getIndices());
+ LLVM_DEBUG(dbgs() << "Found an ExtractValueInst! Simplifying: "
+ << *InstResult << "\n");
+ } else if (auto *IVI = dyn_cast<InsertValueInst>(CurInst)) {
+ InstResult = ConstantExpr::getInsertValue(
+ getVal(IVI->getAggregateOperand()),
+ getVal(IVI->getInsertedValueOperand()), IVI->getIndices());
+ LLVM_DEBUG(dbgs() << "Found an InsertValueInst! Simplifying: "
+ << *InstResult << "\n");
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(CurInst)) {
+ Constant *P = getVal(GEP->getOperand(0));
+ SmallVector<Constant*, 8> GEPOps;
+ for (User::op_iterator i = GEP->op_begin() + 1, e = GEP->op_end();
+ i != e; ++i)
+ GEPOps.push_back(getVal(*i));
+ InstResult =
+ ConstantExpr::getGetElementPtr(GEP->getSourceElementType(), P, GEPOps,
+ cast<GEPOperator>(GEP)->isInBounds());
+ LLVM_DEBUG(dbgs() << "Found a GEP! Simplifying: " << *InstResult << "\n");
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(CurInst)) {
+ if (!LI->isSimple()) {
+ LLVM_DEBUG(
+ dbgs() << "Found a Load! Not a simple load, can not evaluate.\n");
+ return false; // no volatile/atomic accesses.
+ }
+
+ Constant *Ptr = getVal(LI->getOperand(0));
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
+ Ptr = FoldedPtr;
+ LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
+ "folding: "
+ << *Ptr << "\n");
+ }
+ InstResult = ComputeLoadResult(Ptr);
+ if (!InstResult) {
+ LLVM_DEBUG(
+ dbgs() << "Failed to compute load result. Can not evaluate load."
+ "\n");
+ return false; // Could not evaluate load.
+ }
+
+ LLVM_DEBUG(dbgs() << "Evaluated load: " << *InstResult << "\n");
+ } else if (AllocaInst *AI = dyn_cast<AllocaInst>(CurInst)) {
+ if (AI->isArrayAllocation()) {
+ LLVM_DEBUG(dbgs() << "Found an array alloca. Can not evaluate.\n");
+ return false; // Cannot handle array allocs.
+ }
+ Type *Ty = AI->getAllocatedType();
+ AllocaTmps.push_back(std::make_unique<GlobalVariable>(
+ Ty, false, GlobalValue::InternalLinkage, UndefValue::get(Ty),
+ AI->getName(), /*TLMode=*/GlobalValue::NotThreadLocal,
+ AI->getType()->getPointerAddressSpace()));
+ InstResult = AllocaTmps.back().get();
+ LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
+ } else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
+ CallBase &CB = *cast<CallBase>(&*CurInst);
+
+ // Debug info can safely be ignored here.
+ if (isa<DbgInfoIntrinsic>(CB)) {
+ LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
+ ++CurInst;
+ continue;
+ }
+
+ // Cannot handle inline asm.
+ if (CB.isInlineAsm()) {
+ LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
+ return false;
+ }
+
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) {
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
+ if (MSI->isVolatile()) {
+ LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
+ << "intrinsic.\n");
+ return false;
+ }
+ Constant *Ptr = getVal(MSI->getDest());
+ Constant *Val = getVal(MSI->getValue());
+ Constant *DestVal = ComputeLoadResult(getVal(Ptr));
+ if (Val->isNullValue() && DestVal && DestVal->isNullValue()) {
+ // This memset is a no-op.
+ LLVM_DEBUG(dbgs() << "Ignoring no-op memset.\n");
+ ++CurInst;
+ continue;
+ }
+ }
+
+ if (II->isLifetimeStartOrEnd()) {
+ LLVM_DEBUG(dbgs() << "Ignoring lifetime intrinsic.\n");
+ ++CurInst;
+ continue;
+ }
+
+ if (II->getIntrinsicID() == Intrinsic::invariant_start) {
+ // We don't insert an entry into Values, as it doesn't have a
+ // meaningful return value.
+ if (!II->use_empty()) {
+ LLVM_DEBUG(dbgs()
+ << "Found unused invariant_start. Can't evaluate.\n");
+ return false;
+ }
+ ConstantInt *Size = cast<ConstantInt>(II->getArgOperand(0));
+ Value *PtrArg = getVal(II->getArgOperand(1));
+ Value *Ptr = PtrArg->stripPointerCasts();
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ Type *ElemTy = GV->getValueType();
+ if (!Size->isMinusOne() &&
+ Size->getValue().getLimitedValue() >=
+ DL.getTypeStoreSize(ElemTy)) {
+ Invariants.insert(GV);
+ LLVM_DEBUG(dbgs() << "Found a global var that is an invariant: "
+ << *GV << "\n");
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "Found a global var, but can not treat it as an "
+ "invariant.\n");
+ }
+ }
+ // Continue even if we do nothing.
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::assume) {
+ LLVM_DEBUG(dbgs() << "Skipping assume intrinsic.\n");
+ ++CurInst;
+ continue;
+ } else if (II->getIntrinsicID() == Intrinsic::sideeffect) {
+ LLVM_DEBUG(dbgs() << "Skipping sideeffect intrinsic.\n");
+ ++CurInst;
+ continue;
} else if (II->getIntrinsicID() == Intrinsic::pseudoprobe) {
LLVM_DEBUG(dbgs() << "Skipping pseudoprobe intrinsic.\n");
++CurInst;
continue;
- }
-
- LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
- return false;
- }
-
- // Resolve function pointers.
- SmallVector<Constant *, 8> Formals;
- Function *Callee = getCalleeWithFormalArgs(CB, Formals);
- if (!Callee || Callee->isInterposable()) {
- LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
- return false; // Cannot resolve.
- }
-
- if (Callee->isDeclaration()) {
- // If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
- if (!InstResult)
- return false;
- LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
- << *InstResult << "\n");
- } else {
- LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
- return false;
- }
- } else {
- if (Callee->getFunctionType()->isVarArg()) {
- LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
- return false;
- }
-
- Constant *RetVal = nullptr;
-      // Execute the call; if successful, use the return value.
- ValueStack.emplace_back();
- if (!EvaluateFunction(Callee, RetVal, Formals)) {
- LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
- return false;
- }
- ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
- if (RetVal && !InstResult)
- return false;
-
- if (InstResult) {
- LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
- << *InstResult << "\n\n");
- } else {
- LLVM_DEBUG(dbgs()
- << "Successfully evaluated function. Result: 0\n\n");
- }
- }
- } else if (CurInst->isTerminator()) {
- LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
-
- if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
- if (BI->isUnconditional()) {
- NextBB = BI->getSuccessor(0);
- } else {
- ConstantInt *Cond =
- dyn_cast<ConstantInt>(getVal(BI->getCondition()));
- if (!Cond) return false; // Cannot determine.
-
- NextBB = BI->getSuccessor(!Cond->getZExtValue());
- }
- } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
- ConstantInt *Val =
- dyn_cast<ConstantInt>(getVal(SI->getCondition()));
- if (!Val) return false; // Cannot determine.
- NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
- } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
- Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
- if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
- NextBB = BA->getBasicBlock();
- else
- return false; // Cannot determine.
- } else if (isa<ReturnInst>(CurInst)) {
- NextBB = nullptr;
- } else {
- // invoke, unwind, resume, unreachable.
- LLVM_DEBUG(dbgs() << "Can not handle terminator.");
- return false; // Cannot handle this terminator.
- }
-
- // We succeeded at evaluating this block!
- LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
- return true;
- } else {
- // Did not know how to evaluate this!
- LLVM_DEBUG(
- dbgs() << "Failed to evaluate block due to unhandled instruction."
- "\n");
- return false;
- }
-
- if (!CurInst->use_empty()) {
- InstResult = ConstantFoldConstant(InstResult, DL, TLI);
- setVal(&*CurInst, InstResult);
- }
-
- // If we just processed an invoke, we finished evaluating the block.
- if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
- NextBB = II->getNormalDest();
- LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
- return true;
- }
-
- // Advance program counter.
- ++CurInst;
- }
-}
-
-/// Evaluate a call to function F, returning true if successful, false if we
-/// can't evaluate it. ActualArgs contains the actual argument values for the
-/// function.
-bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
- const SmallVectorImpl<Constant*> &ActualArgs) {
- // Check to see if this function is already executing (recursion). If so,
- // bail out. TODO: we might want to accept limited recursion.
- if (is_contained(CallStack, F))
- return false;
-
- CallStack.push_back(F);
-
- // Initialize arguments to the incoming values specified.
- unsigned ArgNo = 0;
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
- ++AI, ++ArgNo)
- setVal(&*AI, ActualArgs[ArgNo]);
-
- // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
- // we can only evaluate any one basic block at most once. This set keeps
- // track of what we have executed so we can detect recursive cases etc.
- SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
-
- // CurBB - The current basic block we're evaluating.
- BasicBlock *CurBB = &F->front();
-
- BasicBlock::iterator CurInst = CurBB->begin();
-
- while (true) {
- BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
- LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
-
- if (!EvaluateBlock(CurInst, NextBB))
- return false;
-
- if (!NextBB) {
- // Successfully running until there's no next block means that we found
-      // the return. Fill in the return value and pop the call stack.
- ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
- if (RI->getNumOperands())
- RetVal = getVal(RI->getOperand(0));
- CallStack.pop_back();
- return true;
- }
-
- // Okay, we succeeded in evaluating this control flow. See if we have
- // executed the new block before. If so, we have a looping function,
- // which we cannot evaluate in reasonable time.
- if (!ExecutedBlocks.insert(NextBB).second)
- return false; // looped!
-
- // Okay, we have never been in this block before. Check to see if there
- // are any PHI nodes. If so, evaluate them with information about where
- // we came from.
- PHINode *PN = nullptr;
- for (CurInst = NextBB->begin();
- (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
- setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
-
- // Advance to the next block.
- CurBB = NextBB;
- }
-}
+ }
+
+ LLVM_DEBUG(dbgs() << "Unknown intrinsic. Can not evaluate.\n");
+ return false;
+ }
+
+ // Resolve function pointers.
+ SmallVector<Constant *, 8> Formals;
+ Function *Callee = getCalleeWithFormalArgs(CB, Formals);
+ if (!Callee || Callee->isInterposable()) {
+ LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
+ return false; // Cannot resolve.
+ }
+
+ if (Callee->isDeclaration()) {
+ // If this is a function we can constant fold, do it.
+ if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
+ if (!InstResult)
+ return false;
+ LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
+ << *InstResult << "\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Can not constant fold function call.\n");
+ return false;
+ }
+ } else {
+ if (Callee->getFunctionType()->isVarArg()) {
+ LLVM_DEBUG(dbgs() << "Can not constant fold vararg function call.\n");
+ return false;
+ }
+
+ Constant *RetVal = nullptr;
+ // Execute the call, if successful, use the return value.
+ ValueStack.emplace_back();
+ if (!EvaluateFunction(Callee, RetVal, Formals)) {
+ LLVM_DEBUG(dbgs() << "Failed to evaluate function.\n");
+ return false;
+ }
+ ValueStack.pop_back();
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
+ if (RetVal && !InstResult)
+ return false;
+
+ if (InstResult) {
+ LLVM_DEBUG(dbgs() << "Successfully evaluated function. Result: "
+ << *InstResult << "\n\n");
+ } else {
+ LLVM_DEBUG(dbgs()
+ << "Successfully evaluated function. Result: 0\n\n");
+ }
+ }
+ } else if (CurInst->isTerminator()) {
+ LLVM_DEBUG(dbgs() << "Found a terminator instruction.\n");
+
+ if (BranchInst *BI = dyn_cast<BranchInst>(CurInst)) {
+ if (BI->isUnconditional()) {
+ NextBB = BI->getSuccessor(0);
+ } else {
+ ConstantInt *Cond =
+ dyn_cast<ConstantInt>(getVal(BI->getCondition()));
+ if (!Cond) return false; // Cannot determine.
+
+ NextBB = BI->getSuccessor(!Cond->getZExtValue());
+ }
+ } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurInst)) {
+ ConstantInt *Val =
+ dyn_cast<ConstantInt>(getVal(SI->getCondition()));
+ if (!Val) return false; // Cannot determine.
+ NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
+ Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
+ NextBB = BA->getBasicBlock();
+ else
+ return false; // Cannot determine.
+ } else if (isa<ReturnInst>(CurInst)) {
+ NextBB = nullptr;
+ } else {
+ // invoke, unwind, resume, unreachable.
+ LLVM_DEBUG(dbgs() << "Can not handle terminator.");
+ return false; // Cannot handle this terminator.
+ }
+
+ // We succeeded at evaluating this block!
+ LLVM_DEBUG(dbgs() << "Successfully evaluated block.\n");
+ return true;
+ } else {
+ // Did not know how to evaluate this!
+ LLVM_DEBUG(
+ dbgs() << "Failed to evaluate block due to unhandled instruction."
+ "\n");
+ return false;
+ }
+
+ if (!CurInst->use_empty()) {
+ InstResult = ConstantFoldConstant(InstResult, DL, TLI);
+ setVal(&*CurInst, InstResult);
+ }
+
+ // If we just processed an invoke, we finished evaluating the block.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CurInst)) {
+ NextBB = II->getNormalDest();
+ LLVM_DEBUG(dbgs() << "Found an invoke instruction. Finished Block.\n\n");
+ return true;
+ }
+
+ // Advance program counter.
+ ++CurInst;
+ }
+}
+
+/// Evaluate a call to function F, returning true if successful, false if we
+/// can't evaluate it. ActualArgs contains the actual arguments passed to the
+/// function.
+bool Evaluator::EvaluateFunction(Function *F, Constant *&RetVal,
+ const SmallVectorImpl<Constant*> &ActualArgs) {
+ // Check to see if this function is already executing (recursion). If so,
+ // bail out. TODO: we might want to accept limited recursion.
+ if (is_contained(CallStack, F))
+ return false;
+
+ CallStack.push_back(F);
+
+ // Initialize arguments to the incoming values specified.
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++ArgNo)
+ setVal(&*AI, ActualArgs[ArgNo]);
+
+  // ExecutedBlocks - We only handle non-looping, non-recursive code. As such,
+  // we evaluate any one basic block at most once. This set tracks the blocks
+  // we have already executed so we can detect loops and bail out.
+ SmallPtrSet<BasicBlock*, 32> ExecutedBlocks;
+
+ // CurBB - The current basic block we're evaluating.
+ BasicBlock *CurBB = &F->front();
+
+ BasicBlock::iterator CurInst = CurBB->begin();
+
+ while (true) {
+ BasicBlock *NextBB = nullptr; // Initialized to avoid compiler warnings.
+ LLVM_DEBUG(dbgs() << "Trying to evaluate BB: " << *CurBB << "\n");
+
+ if (!EvaluateBlock(CurInst, NextBB))
+ return false;
+
+ if (!NextBB) {
+ // Successfully running until there's no next block means that we found
+      // the return. Fill in the return value and pop the call stack.
+ ReturnInst *RI = cast<ReturnInst>(CurBB->getTerminator());
+ if (RI->getNumOperands())
+ RetVal = getVal(RI->getOperand(0));
+ CallStack.pop_back();
+ return true;
+ }
+
+ // Okay, we succeeded in evaluating this control flow. See if we have
+ // executed the new block before. If so, we have a looping function,
+ // which we cannot evaluate in reasonable time.
+ if (!ExecutedBlocks.insert(NextBB).second)
+ return false; // looped!
+
+ // Okay, we have never been in this block before. Check to see if there
+ // are any PHI nodes. If so, evaluate them with information about where
+ // we came from.
+ PHINode *PN = nullptr;
+ for (CurInst = NextBB->begin();
+ (PN = dyn_cast<PHINode>(CurInst)); ++CurInst)
+ setVal(PN, getVal(PN->getIncomingValueForBlock(CurBB)));
+
+ // Advance to the next block.
+ CurBB = NextBB;
+ }
+}
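For orientation, here is a minimal sketch of how the EvaluateFunction entry point above is typically driven by a caller such as GlobalOpt when it tries to fold a static constructor at compile time. The helper below is hypothetical, and the Evaluator constructor arguments (DataLayout, TargetLibraryInfo) are assumed from the class's usual interface rather than shown in this hunk.

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Evaluator.h"

// Sketch: returns true if F could be fully evaluated at compile time; on
// success RetVal holds the folded return value (null for a void return).
static bool tryFoldAtCompileTime(llvm::Function &F, const llvm::DataLayout &DL,
                                 const llvm::TargetLibraryInfo *TLI) {
  llvm::Evaluator Eval(DL, TLI);                 // assumed constructor signature
  llvm::Constant *RetVal = nullptr;
  llvm::SmallVector<llvm::Constant *, 0> NoArgs; // evaluate with no arguments
  return Eval.EvaluateFunction(&F, RetVal, NoArgs);
}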
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
index 29fa7f12d7..44af95eef6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FixIrreducible.cpp
@@ -1,336 +1,336 @@
-//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks
-// with control-flow edges incident from outside the SCC. This pass converts an
-// irreducible SCC into a natural loop by applying the following transformation:
-//
-// 1. Collect the set of headers H of the SCC.
-// 2. Collect the set of predecessors P of these headers. These may be inside as
-// well as outside the SCC.
-// 3. Create block N and redirect every edge from set P to set H through N.
-//
-// This converts the SCC into a natural loop with N as the header: N is the only
-// block with edges incident from outside the SCC, and all backedges in the SCC
-// are incident on N, i.e., for every backedge, the head now dominates the tail.
-//
-// INPUT CFG: The blocks A and B form an irreducible loop with two headers.
-//
-// Entry
-// / \
-// v v
-// A ----> B
-// ^ /|
-// `----' |
-// v
-// Exit
-//
-// OUTPUT CFG: Edges incident on A and B are now redirected through a
-// new block N, forming a natural loop consisting of N, A and B.
-//
-// Entry
-// |
-// v
-// .---> N <---.
-// / / \ \
-// | / \ |
-// \ v v /
-// `-- A B --'
-// |
-// v
-// Exit
-//
-// The transformation is applied to every maximal SCC that is not already
-// recognized as a loop. The pass operates on all maximal SCCs found in the
-// function body outside of any loop, as well as those found inside each loop,
-// including inside any newly created loops. This ensures that any SCC hidden
-// inside a maximal SCC is also transformed.
-//
-// The actual transformation is handled by function CreateControlFlowHub, which
-// takes a set of incoming blocks (the predecessors) and outgoing blocks (the
-// headers). The function also moves every PHINode in an outgoing block to the
-// hub. Since the hub dominates all the outgoing blocks, each such PHINode
-// continues to dominate its uses. Since every header in an SCC has at least two
-// predecessors, every value used in the header (or later) but defined in a
-// predecessor (or earlier) is represented by a PHINode in a header. Hence the
-// above handling of PHINodes is sufficient and no further processing is
-// required to restore SSA.
-//
-// Limitation: The pass cannot handle switch statements and indirect
-// branches. Both must be lowered to plain branches first.
-//
-//===----------------------------------------------------------------------===//
-
+//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks
+// with control-flow edges incident from outside the SCC. This pass converts an
+// irreducible SCC into a natural loop by applying the following transformation:
+//
+// 1. Collect the set of headers H of the SCC.
+// 2. Collect the set of predecessors P of these headers. These may be inside as
+// well as outside the SCC.
+// 3. Create block N and redirect every edge from set P to set H through N.
+//
+// This converts the SCC into a natural loop with N as the header: N is the only
+// block with edges incident from outside the SCC, and all backedges in the SCC
+// are incident on N, i.e., for every backedge, the head now dominates the tail.
+//
+// INPUT CFG: The blocks A and B form an irreducible loop with two headers.
+//
+// Entry
+// / \
+// v v
+// A ----> B
+// ^ /|
+// `----' |
+// v
+// Exit
+//
+// OUTPUT CFG: Edges incident on A and B are now redirected through a
+// new block N, forming a natural loop consisting of N, A and B.
+//
+// Entry
+// |
+// v
+// .---> N <---.
+// / / \ \
+// | / \ |
+// \ v v /
+// `-- A B --'
+// |
+// v
+// Exit
+//
+// The transformation is applied to every maximal SCC that is not already
+// recognized as a loop. The pass operates on all maximal SCCs found in the
+// function body outside of any loop, as well as those found inside each loop,
+// including inside any newly created loops. This ensures that any SCC hidden
+// inside a maximal SCC is also transformed.
+//
+// The actual transformation is handled by function CreateControlFlowHub, which
+// takes a set of incoming blocks (the predecessors) and outgoing blocks (the
+// headers). The function also moves every PHINode in an outgoing block to the
+// hub. Since the hub dominates all the outgoing blocks, each such PHINode
+// continues to dominate its uses. Since every header in an SCC has at least two
+// predecessors, every value used in the header (or later) but defined in a
+// predecessor (or earlier) is represented by a PHINode in a header. Hence the
+// above handling of PHINodes is sufficient and no further processing is
+// required to restore SSA.
+//
+// Limitation: The pass cannot handle switch statements and indirect
+// branches. Both must be lowered to plain branches first.
+//
+//===----------------------------------------------------------------------===//
+
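Given the limitation noted above, a hand-assembled pipeline must lower switch statements before this pass runs; getAnalysisUsage below expresses the same constraint through LowerSwitchID. The following is a hedged sketch using the legacy pass manager: the wrapper function is hypothetical, and the two create* entry points are assumed to be the ones declared in llvm/Transforms/Utils.h.

#include "llvm/IR/LegacyPassManager.h"
#include "llvm/Transforms/Utils.h"

// Sketch: run LowerSwitch first so FixIrreducible only ever sees plain
// conditional and unconditional branches.
static void addIrreducibleFixup(llvm::legacy::FunctionPassManager &FPM) {
  FPM.add(llvm::createLowerSwitchPass());
  FPM.add(llvm::createFixIrreduciblePass());
}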
#include "llvm/Transforms/Utils/FixIrreducible.h"
-#include "llvm/ADT/SCCIterator.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-#define DEBUG_TYPE "fix-irreducible"
-
-using namespace llvm;
-
-namespace {
-struct FixIrreducible : public FunctionPass {
- static char ID;
- FixIrreducible() : FunctionPass(ID) {
- initializeFixIrreduciblePass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreservedID(LowerSwitchID);
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
-char FixIrreducible::ID = 0;
-
-FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); }
-
-INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible",
- "Convert irreducible control-flow into natural loops",
- false /* Only looks at CFG */, false /* Analysis Pass */)
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "fix-irreducible"
+
+using namespace llvm;
+
+namespace {
+struct FixIrreducible : public FunctionPass {
+ static char ID;
+ FixIrreducible() : FunctionPass(ID) {
+ initializeFixIrreduciblePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char FixIrreducible::ID = 0;
+
+FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); }
+
+INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible",
- "Convert irreducible control-flow into natural loops",
- false /* Only looks at CFG */, false /* Analysis Pass */)
-
-// When a new loop is created, existing children of the parent loop may now be
-// fully inside the new loop. Reconnect these as children of the new loop.
-static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector()
- : LI.getTopLevelLoopsVector();
- // The new loop cannot be its own child, and any candidate is a
- // child iff its header is owned by the new loop. Move all the
- // children to a new vector.
- auto FirstChild = std::partition(
- CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
- return L == NewLoop || Blocks.count(L->getHeader()) == 0;
- });
- SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
- CandidateLoops.erase(FirstChild, CandidateLoops.end());
-
- for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) {
- auto Child = *II;
- LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName()
- << "\n");
- // TODO: A child loop whose header is also a header in the current
- // SCC gets destroyed since its backedges are removed. That may
- // not be necessary if we can retain such backedges.
- if (Headers.count(Child->getHeader())) {
- for (auto BB : Child->blocks()) {
- LI.changeLoopFor(BB, NewLoop);
- LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName()
- << "\n");
- }
- LI.destroy(Child);
- LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n");
- continue;
- }
-
- Child->setParentLoop(nullptr);
- NewLoop->addChildLoop(Child);
- LLVM_DEBUG(dbgs() << "added child loop to new loop\n");
- }
-}
-
-// Given a set of blocks and headers in an irreducible SCC, convert it into a
-// natural loop. Also insert this new loop at its appropriate place in the
-// hierarchy of loops.
-static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT,
- Loop *ParentLoop,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
-#ifndef NDEBUG
- // All headers are part of the SCC
- for (auto H : Headers) {
- assert(Blocks.count(H));
- }
-#endif
-
- SetVector<BasicBlock *> Predecessors;
- for (auto H : Headers) {
- for (auto P : predecessors(H)) {
- Predecessors.insert(P);
- }
- }
-
- LLVM_DEBUG(
- dbgs() << "Found predecessors:";
- for (auto P : Predecessors) {
- dbgs() << " " << P->getName();
- }
- dbgs() << "\n");
-
- // Redirect all the backedges through a "hub" consisting of a series
- // of guard blocks that manage the flow of control from the
- // predecessors to the headers.
- SmallVector<BasicBlock *, 8> GuardBlocks;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr");
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full));
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-#endif
-
- // Create a new loop from the now-transformed cycle
- auto NewLoop = LI.AllocateLoop();
- if (ParentLoop) {
- ParentLoop->addChildLoop(NewLoop);
- } else {
- LI.addTopLevelLoop(NewLoop);
- }
-
- // Add the guard blocks to the new loop. The first guard block is
- // the head of all the backedges, and it is the first to be inserted
- // in the loop. This ensures that it is recognized as the
- // header. Since the new loop is already in LoopInfo, the new blocks
- // are also propagated up the chain of parent loops.
- for (auto G : GuardBlocks) {
- LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n");
- NewLoop->addBasicBlockToLoop(G, LI);
- }
-
- // Add the SCC blocks to the new loop.
- for (auto BB : Blocks) {
- NewLoop->addBlockEntry(BB);
- if (LI.getLoopFor(BB) == ParentLoop) {
- LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName()
- << "\n");
- LI.changeLoopFor(BB, NewLoop);
- } else {
- LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n");
- }
- }
- LLVM_DEBUG(dbgs() << "header for new loop: "
- << NewLoop->getHeader()->getName() << "\n");
-
- reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers);
-
- NewLoop->verifyLoop();
- if (ParentLoop) {
- ParentLoop->verifyLoop();
- }
-#if defined(EXPENSIVE_CHECKS)
- LI.verify(DT);
-#endif // EXPENSIVE_CHECKS
-}
-
-namespace llvm {
-// Enable the graph traits required for traversing a Loop body.
-template <> struct GraphTraits<Loop> : LoopBodyTraits {};
-} // namespace llvm
-
-// Overloaded wrappers to go with the function template below.
-static BasicBlock *unwrapBlock(BasicBlock *B) { return B; }
-static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; }
-
-static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers);
-}
-
-static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L,
- SetVector<BasicBlock *> &Blocks,
- SetVector<BasicBlock *> &Headers) {
- createNaturalLoopInternal(LI, DT, &L, Blocks, Headers);
-}
-
-// Convert irreducible SCCs; Graph G may be a Function* or a Loop&.
-template <class Graph>
-static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) {
- bool Changed = false;
- for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) {
- if (Scc->size() < 2)
- continue;
- SetVector<BasicBlock *> Blocks;
- LLVM_DEBUG(dbgs() << "Found SCC:");
- for (auto N : *Scc) {
- auto BB = unwrapBlock(N);
- LLVM_DEBUG(dbgs() << " " << BB->getName());
- Blocks.insert(BB);
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- // Minor optimization: The SCC blocks are usually discovered in an order
- // that is the opposite of the order in which these blocks appear as branch
- // targets. This results in a lot of condition inversions in the control
- // flow out of the new ControlFlowHub, which can be mitigated if the orders
- // match. So we discover the headers using the reverse of the block order.
- SetVector<BasicBlock *> Headers;
- LLVM_DEBUG(dbgs() << "Found headers:");
- for (auto BB : reverse(Blocks)) {
- for (const auto P : predecessors(BB)) {
- // Skip unreachable predecessors.
- if (!DT.isReachableFromEntry(P))
- continue;
- if (!Blocks.count(P)) {
- LLVM_DEBUG(dbgs() << " " << BB->getName());
- Headers.insert(BB);
- break;
- }
- }
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- if (Headers.size() == 1) {
- assert(LI.isLoopHeader(Headers.front()));
- LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n");
- continue;
- }
- createNaturalLoop(LI, DT, G, Blocks, Headers);
- Changed = true;
- }
- return Changed;
-}
-
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// When a new loop is created, existing children of the parent loop may now be
+// fully inside the new loop. Reconnect these as children of the new loop.
+static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector()
+ : LI.getTopLevelLoopsVector();
+ // The new loop cannot be its own child, and any candidate is a
+ // child iff its header is owned by the new loop. Move all the
+ // children to a new vector.
+ auto FirstChild = std::partition(
+ CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
+ return L == NewLoop || Blocks.count(L->getHeader()) == 0;
+ });
+ SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
+ CandidateLoops.erase(FirstChild, CandidateLoops.end());
+
+ for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) {
+ auto Child = *II;
+ LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName()
+ << "\n");
+ // TODO: A child loop whose header is also a header in the current
+ // SCC gets destroyed since its backedges are removed. That may
+ // not be necessary if we can retain such backedges.
+ if (Headers.count(Child->getHeader())) {
+ for (auto BB : Child->blocks()) {
+ LI.changeLoopFor(BB, NewLoop);
+ LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName()
+ << "\n");
+ }
+ LI.destroy(Child);
+ LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n");
+ continue;
+ }
+
+ Child->setParentLoop(nullptr);
+ NewLoop->addChildLoop(Child);
+ LLVM_DEBUG(dbgs() << "added child loop to new loop\n");
+ }
+}
+
+// Given a set of blocks and headers in an irreducible SCC, convert it into a
+// natural loop. Also insert this new loop at its appropriate place in the
+// hierarchy of loops.
+static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT,
+ Loop *ParentLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+#ifndef NDEBUG
+ // All headers are part of the SCC
+ for (auto H : Headers) {
+ assert(Blocks.count(H));
+ }
+#endif
+
+ SetVector<BasicBlock *> Predecessors;
+ for (auto H : Headers) {
+ for (auto P : predecessors(H)) {
+ Predecessors.insert(P);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found predecessors:";
+ for (auto P : Predecessors) {
+ dbgs() << " " << P->getName();
+ }
+ dbgs() << "\n");
+
+ // Redirect all the backedges through a "hub" consisting of a series
+ // of guard blocks that manage the flow of control from the
+ // predecessors to the headers.
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr");
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+ // Create a new loop from the now-transformed cycle
+ auto NewLoop = LI.AllocateLoop();
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(NewLoop);
+ } else {
+ LI.addTopLevelLoop(NewLoop);
+ }
+
+ // Add the guard blocks to the new loop. The first guard block is
+ // the head of all the backedges, and it is the first to be inserted
+ // in the loop. This ensures that it is recognized as the
+ // header. Since the new loop is already in LoopInfo, the new blocks
+ // are also propagated up the chain of parent loops.
+ for (auto G : GuardBlocks) {
+ LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n");
+ NewLoop->addBasicBlockToLoop(G, LI);
+ }
+
+ // Add the SCC blocks to the new loop.
+ for (auto BB : Blocks) {
+ NewLoop->addBlockEntry(BB);
+ if (LI.getLoopFor(BB) == ParentLoop) {
+ LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName()
+ << "\n");
+ LI.changeLoopFor(BB, NewLoop);
+ } else {
+ LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << "header for new loop: "
+ << NewLoop->getHeader()->getName() << "\n");
+
+ reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers);
+
+ NewLoop->verifyLoop();
+ if (ParentLoop) {
+ ParentLoop->verifyLoop();
+ }
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+}
+
+namespace llvm {
+// Enable the graph traits required for traversing a Loop body.
+template <> struct GraphTraits<Loop> : LoopBodyTraits {};
+} // namespace llvm
+
+// Overloaded wrappers to go with the function template below.
+static BasicBlock *unwrapBlock(BasicBlock *B) { return B; }
+static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; }
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers);
+}
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, &L, Blocks, Headers);
+}
+
+// Convert irreducible SCCs; Graph G may be a Function* or a Loop&.
+template <class Graph>
+static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) {
+ bool Changed = false;
+ for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) {
+ if (Scc->size() < 2)
+ continue;
+ SetVector<BasicBlock *> Blocks;
+ LLVM_DEBUG(dbgs() << "Found SCC:");
+ for (auto N : *Scc) {
+ auto BB = unwrapBlock(N);
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Blocks.insert(BB);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Minor optimization: The SCC blocks are usually discovered in an order
+ // that is the opposite of the order in which these blocks appear as branch
+ // targets. This results in a lot of condition inversions in the control
+ // flow out of the new ControlFlowHub, which can be mitigated if the orders
+ // match. So we discover the headers using the reverse of the block order.
+ SetVector<BasicBlock *> Headers;
+ LLVM_DEBUG(dbgs() << "Found headers:");
+ for (auto BB : reverse(Blocks)) {
+ for (const auto P : predecessors(BB)) {
+ // Skip unreachable predecessors.
+ if (!DT.isReachableFromEntry(P))
+ continue;
+ if (!Blocks.count(P)) {
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Headers.insert(BB);
+ break;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (Headers.size() == 1) {
+ assert(LI.isLoopHeader(Headers.front()));
+ LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n");
+ continue;
+ }
+ createNaturalLoop(LI, DT, G, Blocks, Headers);
+ Changed = true;
+ }
+ return Changed;
+}
+
static bool FixIrreducibleImpl(Function &F, LoopInfo &LI, DominatorTree &DT) {
- LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
- << F.getName() << "\n");
-
- bool Changed = false;
- SmallVector<Loop *, 8> WorkList;
-
- LLVM_DEBUG(dbgs() << "visiting top-level\n");
- Changed |= makeReducible(LI, DT, &F);
-
- // Any SCCs reduced are now already in the list of top-level loops, so simply
- // add them all to the worklist.
+ LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
+ << F.getName() << "\n");
+
+ bool Changed = false;
+ SmallVector<Loop *, 8> WorkList;
+
+ LLVM_DEBUG(dbgs() << "visiting top-level\n");
+ Changed |= makeReducible(LI, DT, &F);
+
+ // Any SCCs reduced are now already in the list of top-level loops, so simply
+ // add them all to the worklist.
append_range(WorkList, LI);
-
- while (!WorkList.empty()) {
+
+ while (!WorkList.empty()) {
auto L = WorkList.pop_back_val();
- LLVM_DEBUG(dbgs() << "visiting loop with header "
- << L->getHeader()->getName() << "\n");
- Changed |= makeReducible(LI, DT, *L);
- // Any SCCs reduced are now already in the list of child loops, so simply
- // add them all to the worklist.
- WorkList.append(L->begin(), L->end());
- }
-
- return Changed;
-}
+ LLVM_DEBUG(dbgs() << "visiting loop with header "
+ << L->getHeader()->getName() << "\n");
+ Changed |= makeReducible(LI, DT, *L);
+ // Any SCCs reduced are now already in the list of child loops, so simply
+ // add them all to the worklist.
+ WorkList.append(L->begin(), L->end());
+ }
+
+ return Changed;
+}
bool FixIrreducible::runOnFunction(Function &F) {
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
index a24f9f8fd5..0098dcaeb0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FlattenCFG.cpp
@@ -1,545 +1,545 @@
-//===- FlattenCFG.cpp - Code to perform CFG flattening --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Reduce conditional branches in CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "flattencfg"
-
-namespace {
-
-class FlattenCFGOpt {
- AliasAnalysis *AA;
-
- /// Use parallel-and or parallel-or to generate conditions for
- /// conditional branches.
- bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
-
- /// If \param BB is the merge block of an if-region, attempt to merge
- /// the if-region with an adjacent if-region upstream if two if-regions
- /// contain identical instructions.
- bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
-
- /// Compare a pair of blocks: \p Block1 and \p Block2, which
- /// are from two if-regions, where \p Head2 is the entry block of the 2nd
- /// if-region. \returns true if \p Block1 and \p Block2 contain identical
- /// instructions, and have no memory reference alias with \p Head2.
- /// This is used as a legality check for merging if-regions.
- bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
- BasicBlock *Head2);
-
-public:
- FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
-
- bool run(BasicBlock *BB);
-};
-
-} // end anonymous namespace
-
-/// If \param [in] BB has more than one predecessor that is a conditional
-/// branch, attempt to use parallel and/or for the branch condition. \returns
-/// true on success.
-///
-/// Before:
-/// ......
-/// %cmp10 = fcmp une float %tmp1, %tmp2
-/// br i1 %cmp10, label %if.then, label %lor.rhs
-///
-/// lor.rhs:
-/// ......
-/// %cmp11 = fcmp une float %tmp3, %tmp4
-/// br i1 %cmp11, label %if.then, label %ifend
-///
-/// if.end: // the merge block
-/// ......
-///
-/// if.then: // has two predecessors, both of them contains conditional branch.
-/// ......
-/// br label %if.end;
-///
-/// After:
-/// ......
-/// %cmp10 = fcmp une float %tmp1, %tmp2
-/// ......
-/// %cmp11 = fcmp une float %tmp3, %tmp4
-/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode.
-/// br i1 %cmp12, label %if.then, label %ifend
-///
-/// if.end:
-/// ......
-///
-/// if.then:
-/// ......
-/// br label %if.end;
-///
-/// Current implementation handles two cases.
-/// Case 1: BB is on the else-path.
-///
-/// BB1
-/// / |
-/// BB2 |
-/// / \ |
-/// BB3 \ | where, BB1, BB2 contain conditional branches.
-/// \ | / BB3 contains unconditional branch.
-/// \ | / BB4 corresponds to BB which is also the merge.
-/// BB => BB4
-///
-///
-/// Corresponding source code:
-///
-/// if (a == b && c == d)
-/// statement; // BB3
-///
-/// Case 2: BB is on the then-path.
-///
-/// BB1
-/// / |
-/// | BB2
-/// \ / | where BB1, BB2 contain conditional branches.
-/// BB => BB3 | BB3 contains unconditiona branch and corresponds
-/// \ / to BB. BB4 is the merge.
-/// BB4
-///
-/// Corresponding source code:
-///
-/// if (a == b || c == d)
-/// statement; // BB3
-///
-/// In both cases, BB is the common successor of conditional branches.
-/// In Case 1, BB (BB4) has an unconditional branch (BB3) as
-/// its predecessor. In Case 2, BB (BB3) only has conditional branches
-/// as its predecessors.
-bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
- PHINode *PHI = dyn_cast<PHINode>(BB->begin());
- if (PHI)
- return false; // For simplicity, avoid cases containing PHI nodes.
-
- BasicBlock *LastCondBlock = nullptr;
- BasicBlock *FirstCondBlock = nullptr;
- BasicBlock *UnCondBlock = nullptr;
- int Idx = -1;
-
- // Check predecessors of \param BB.
- SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
- for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
- PI != PE; ++PI) {
- BasicBlock *Pred = *PI;
- BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
-
- // All predecessors should terminate with a branch.
- if (!PBI)
- return false;
-
- BasicBlock *PP = Pred->getSinglePredecessor();
-
- if (PBI->isUnconditional()) {
-      // Case 1: Pred (BB3) is an unconditional block; it should
-      // have a single predecessor (BB2) that is also a predecessor
-      // of \param BB (BB4) and should not have its address taken.
- // There should exist only one such unconditional
- // branch among the predecessors.
- if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
- Pred->hasAddressTaken())
- return false;
-
- UnCondBlock = Pred;
- continue;
- }
-
- // Only conditional branches are allowed beyond this point.
- assert(PBI->isConditional());
-
- // Condition's unique use should be the branch instruction.
- Value *PC = PBI->getCondition();
- if (!PC || !PC->hasOneUse())
- return false;
-
- if (PP && Preds.count(PP)) {
- // These are internal condition blocks to be merged from, e.g.,
- // BB2 in both cases.
- // Should not be address-taken.
- if (Pred->hasAddressTaken())
- return false;
-
- // Instructions in the internal condition blocks should be safe
- // to hoist up.
- for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator();
- BI != BE;) {
- Instruction *CI = &*BI++;
- if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
- return false;
- }
- } else {
- // This is the condition block to be merged into, e.g. BB1 in
- // both cases.
- if (FirstCondBlock)
- return false;
- FirstCondBlock = Pred;
- }
-
- // Find whether BB is uniformly on the true (or false) path
- // for all of its predecessors.
- BasicBlock *PS1 = PBI->getSuccessor(0);
- BasicBlock *PS2 = PBI->getSuccessor(1);
- BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
- int CIdx = (PS1 == BB) ? 0 : 1;
-
- if (Idx == -1)
- Idx = CIdx;
- else if (CIdx != Idx)
- return false;
-
- // PS is the successor which is not BB. Check successors to identify
- // the last conditional branch.
- if (Preds.count(PS) == 0) {
- // Case 2.
- LastCondBlock = Pred;
- } else {
- // Case 1
- BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
- if (BPS && BPS->isUnconditional()) {
- // Case 1: PS(BB3) should be an unconditional branch.
- LastCondBlock = Pred;
- }
- }
- }
-
- if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
- return false;
-
- Instruction *TBB = LastCondBlock->getTerminator();
- BasicBlock *PS1 = TBB->getSuccessor(0);
- BasicBlock *PS2 = TBB->getSuccessor(1);
- BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
- BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
-
- // If PS1 does not jump into PS2, but PS2 jumps into PS1,
- // attempt branch inversion.
- if (!PBI1 || !PBI1->isUnconditional() ||
- (PS1->getTerminator()->getSuccessor(0) != PS2)) {
- // Check whether PS2 jumps into PS1.
- if (!PBI2 || !PBI2->isUnconditional() ||
- (PS2->getTerminator()->getSuccessor(0) != PS1))
- return false;
-
- // Do branch inversion.
- BasicBlock *CurrBlock = LastCondBlock;
- bool EverChanged = false;
- for (; CurrBlock != FirstCondBlock;
- CurrBlock = CurrBlock->getSinglePredecessor()) {
- auto *BI = cast<BranchInst>(CurrBlock->getTerminator());
- auto *CI = dyn_cast<CmpInst>(BI->getCondition());
- if (!CI)
- continue;
-
- CmpInst::Predicate Predicate = CI->getPredicate();
- // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
- if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
- CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
- BI->swapSuccessors();
- EverChanged = true;
- }
- }
- return EverChanged;
- }
-
-  // PS1 must end in an unconditional branch.
- if (!PBI1 || !PBI1->isUnconditional())
- return false;
-
-  // PS2 should not contain a PHI node.
- PHI = dyn_cast<PHINode>(PS2->begin());
- if (PHI)
- return false;
-
- // Do the transformation.
- BasicBlock *CB;
- BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
- bool Iteration = true;
- IRBuilder<>::InsertPointGuard Guard(Builder);
- Value *PC = PBI->getCondition();
-
- do {
- CB = PBI->getSuccessor(1 - Idx);
- // Delete the conditional branch.
- FirstCondBlock->getInstList().pop_back();
- FirstCondBlock->getInstList()
- .splice(FirstCondBlock->end(), CB->getInstList());
- PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
- Value *CC = PBI->getCondition();
- // Merge conditions.
- Builder.SetInsertPoint(PBI);
- Value *NC;
- if (Idx == 0)
- // Case 2, use parallel or.
- NC = Builder.CreateOr(PC, CC);
- else
- // Case 1, use parallel and.
- NC = Builder.CreateAnd(PC, CC);
-
- PBI->replaceUsesOfWith(CC, NC);
- PC = NC;
- if (CB == LastCondBlock)
- Iteration = false;
- // Remove internal conditional branches.
- CB->dropAllReferences();
-    // Make CB unreachable and let downstream delete the block.
- new UnreachableInst(CB->getContext(), CB);
- } while (Iteration);
-
- LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
- return true;
-}
-
-/// Compare blocks from two if-regions, where \param Head2 is the entry of the
-/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare.
-/// \param Block2 is a block in the 2nd if-region to compare. \returns true if
-/// Block1 and Block2 have identical instructions and do not have
-/// memory reference alias with Head2.
-bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
- BasicBlock *Head2) {
- Instruction *PTI2 = Head2->getTerminator();
- Instruction *PBI2 = &Head2->front();
-
- // Check whether instructions in Block1 and Block2 are identical
- // and do not alias with instructions in Head2.
- BasicBlock::iterator iter1 = Block1->begin();
- BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();
- BasicBlock::iterator iter2 = Block2->begin();
- BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();
-
- while (true) {
- if (iter1 == end1) {
- if (iter2 != end2)
- return false;
- break;
- }
-
- if (!iter1->isIdenticalTo(&*iter2))
- return false;
-
- // Illegal to remove instructions with side effects except
- // non-volatile stores.
- if (iter1->mayHaveSideEffects()) {
- Instruction *CurI = &*iter1;
- StoreInst *SI = dyn_cast<StoreInst>(CurI);
- if (!SI || SI->isVolatile())
- return false;
- }
-
-    // For simplicity and speed, the data-dependency check can be
-    // skipped when the instruction does not read from memory.
- if (iter1->mayReadFromMemory())
- return false;
-
- if (iter1->mayWriteToMemory()) {
- for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
- if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
- // Check alias with Head2.
- if (!AA || AA->alias(&*iter1, &*BI))
- return false;
- }
- }
- }
- ++iter1;
- ++iter2;
- }
-
- return true;
-}
-
-/// Check whether \param BB is the merge block of an if-region. If yes, check
-/// whether there exists an adjacent if-region upstream, the two if-regions
-/// contain identical instructions and can be legally merged. \returns true if
-/// the two if-regions are merged.
-///
-/// From:
-/// if (a)
-/// statement;
-/// if (b)
-/// statement;
-///
-/// To:
-/// if (a || b)
-/// statement;
-///
-///
-/// And from:
-/// if (a)
-/// ;
-/// else
-/// statement;
-/// if (b)
-/// ;
-/// else
-/// statement;
-///
-/// To:
-/// if (a && b)
-/// ;
-/// else
-/// statement;
-///
-/// We always take the form of the first if-region. This means that if the
-/// statement in the first if-region is in the "then-path", while in the second
-/// if-region it is in the "else-path", then we convert the second to the first
-/// form, by inverting the condition and the branch successors. The same
-/// approach goes for the opposite case.
-bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
- BasicBlock *IfTrue2, *IfFalse2;
- Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
- Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
- if (!CInst2)
- return false;
-
- BasicBlock *SecondEntryBlock = CInst2->getParent();
- if (SecondEntryBlock->hasAddressTaken())
- return false;
-
- BasicBlock *IfTrue1, *IfFalse1;
- Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
- Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
- if (!CInst1)
- return false;
-
- BasicBlock *FirstEntryBlock = CInst1->getParent();
-
- // Either then-path or else-path should be empty.
- bool InvertCond2 = false;
- BinaryOperator::BinaryOps CombineOp;
- if (IfFalse1 == FirstEntryBlock) {
- // The else-path is empty, so we must use "or" operation to combine the
- // conditions.
- CombineOp = BinaryOperator::Or;
- if (IfFalse2 != SecondEntryBlock) {
- if (IfTrue2 != SecondEntryBlock)
- return false;
-
- InvertCond2 = true;
- std::swap(IfTrue2, IfFalse2);
- }
-
- if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock))
- return false;
- } else if (IfTrue1 == FirstEntryBlock) {
- // The then-path is empty, so we must use "and" operation to combine the
- // conditions.
- CombineOp = BinaryOperator::And;
- if (IfTrue2 != SecondEntryBlock) {
- if (IfFalse2 != SecondEntryBlock)
- return false;
-
- InvertCond2 = true;
- std::swap(IfTrue2, IfFalse2);
- }
-
- if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock))
- return false;
- } else
- return false;
-
- Instruction *PTI2 = SecondEntryBlock->getTerminator();
- Instruction *PBI2 = &SecondEntryBlock->front();
-
-  // Check that \param SecondEntryBlock is free of side effects and that its
-  // instructions are safe to speculate.
- for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
- Instruction *CI = &*BI;
- if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
- !isSafeToSpeculativelyExecute(CI))
- return false;
- }
-
- // Merge \param SecondEntryBlock into \param FirstEntryBlock.
- FirstEntryBlock->getInstList().pop_back();
- FirstEntryBlock->getInstList()
- .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
- BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
- assert(PBI->getCondition() == IfCond2);
- BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
- BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
- Builder.SetInsertPoint(PBI);
- if (InvertCond2) {
- // If this is a "cmp" instruction, only used for branching (and nowhere
- // else), then we can simply invert the predicate.
- auto Cmp2 = dyn_cast<CmpInst>(CInst2);
- if (Cmp2 && Cmp2->hasOneUse())
- Cmp2->setPredicate(Cmp2->getInversePredicate());
- else
- CInst2 = cast<Instruction>(Builder.CreateNot(CInst2));
- PBI->swapSuccessors();
- }
- Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2);
- PBI->replaceUsesOfWith(IfCond2, NC);
- Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
-
- // Handle PHI node to replace its predecessors to FirstEntryBlock.
- for (BasicBlock *Succ : successors(PBI)) {
- for (PHINode &Phi : Succ->phis()) {
- for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) {
- if (Phi.getIncomingBlock(i) == SecondEntryBlock)
- Phi.setIncomingBlock(i, FirstEntryBlock);
- }
- }
- }
-
- // Remove IfTrue1
- if (IfTrue1 != FirstEntryBlock) {
- IfTrue1->dropAllReferences();
- IfTrue1->eraseFromParent();
- }
-
- // Remove IfFalse1
- if (IfFalse1 != FirstEntryBlock) {
- IfFalse1->dropAllReferences();
- IfFalse1->eraseFromParent();
- }
-
- // Remove \param SecondEntryBlock
- SecondEntryBlock->dropAllReferences();
- SecondEntryBlock->eraseFromParent();
- LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
- return true;
-}
-
-bool FlattenCFGOpt::run(BasicBlock *BB) {
- assert(BB && BB->getParent() && "Block not embedded in function!");
- assert(BB->getTerminator() && "Degenerate basic block encountered!");
-
- IRBuilder<> Builder(BB);
-
- if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
- return true;
- return false;
-}
-
-/// FlattenCFG - This function is used to flatten a CFG. For
-/// example, it uses parallel-and and parallel-or mode to collapse
-/// if-conditions and merge if-regions with identical statements.
-bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) {
- return FlattenCFGOpt(AA).run(BB);
-}
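As a usage illustration for the FlattenCFG entry point just above, the driver below is a hypothetical sketch (the in-tree FlattenCFG pass under Transforms/Scalar owns the real iteration strategy): it rescans the function from the top after every successful flattening, because MergeIfRegion can erase blocks while FlattenParallelAndOr only strands them as unreachable.

#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/Local.h"

// Sketch: keep applying FlattenCFG until no block changes. Restarting the
// block scan after each change avoids touching blocks that were just erased.
static bool flattenAllBlocks(llvm::Function &F, llvm::AAResults *AA) {
  bool EverChanged = false;
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;
    for (llvm::BasicBlock &BB : F) {
      if (llvm::FlattenCFG(&BB, AA)) {
        LocalChange = true;
        EverChanged = true;
        break; // the CFG changed; rescan from the entry block
      }
    }
  }
  return EverChanged;
}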
+//===- FlattenCFG.cpp - Code to perform CFG flattening --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Reduce conditional branches in CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "flattencfg"
+
+namespace {
+
+class FlattenCFGOpt {
+ AliasAnalysis *AA;
+
+ /// Use parallel-and or parallel-or to generate conditions for
+ /// conditional branches.
+ bool FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder);
+
+ /// If \param BB is the merge block of an if-region, attempt to merge
+ /// the if-region with an adjacent if-region upstream if two if-regions
+ /// contain identical instructions.
+ bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
+
+ /// Compare a pair of blocks: \p Block1 and \p Block2, which
+ /// are from two if-regions, where \p Head2 is the entry block of the 2nd
+ /// if-region. \returns true if \p Block1 and \p Block2 contain identical
+ /// instructions, and have no memory reference alias with \p Head2.
+ /// This is used as a legality check for merging if-regions.
+ bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2);
+
+public:
+ FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
+
+ bool run(BasicBlock *BB);
+};
+
+} // end anonymous namespace
+
+/// If \param [in] BB has more than one predecessor that is a conditional
+/// branch, attempt to use parallel and/or for the branch condition. \returns
+/// true on success.
+///
+/// Before:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// br i1 %cmp10, label %if.then, label %lor.rhs
+///
+/// lor.rhs:
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// br i1 %cmp11, label %if.then, label %ifend
+///
+/// if.end: // the merge block
+/// ......
+///
+/// if.then: // has two predecessors, both of them contains conditional branch.
+/// ......
+/// br label %if.end;
+///
+/// After:
+/// ......
+/// %cmp10 = fcmp une float %tmp1, %tmp2
+/// ......
+/// %cmp11 = fcmp une float %tmp3, %tmp4
+/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode.
+/// br i1 %cmp12, label %if.then, label %ifend
+///
+/// if.end:
+/// ......
+///
+/// if.then:
+/// ......
+/// br label %if.end;
+///
+/// Current implementation handles two cases.
+/// Case 1: BB is on the else-path.
+///
+/// BB1
+/// / |
+/// BB2 |
+/// / \ |
+/// BB3 \ | where, BB1, BB2 contain conditional branches.
+/// \ | / BB3 contains unconditional branch.
+/// \ | / BB4 corresponds to BB which is also the merge.
+/// BB => BB4
+///
+///
+/// Corresponding source code:
+///
+/// if (a == b && c == d)
+/// statement; // BB3
+///
+/// Case 2: BB is on the then-path.
+///
+/// BB1
+/// / |
+/// | BB2
+/// \ / | where BB1, BB2 contain conditional branches.
+/// BB => BB3 | BB3 contains unconditiona branch and corresponds
+/// \ / to BB. BB4 is the merge.
+/// BB4
+///
+/// Corresponding source code:
+///
+/// if (a == b || c == d)
+/// statement; // BB3
+///
+/// In both cases, BB is the common successor of conditional branches.
+/// In Case 1, BB (BB4) has an unconditional branch (BB3) as
+/// its predecessor. In Case 2, BB (BB3) only has conditional branches
+/// as its predecessors.
+bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
+ PHINode *PHI = dyn_cast<PHINode>(BB->begin());
+ if (PHI)
+ return false; // For simplicity, avoid cases containing PHI nodes.
+
+ BasicBlock *LastCondBlock = nullptr;
+ BasicBlock *FirstCondBlock = nullptr;
+ BasicBlock *UnCondBlock = nullptr;
+ int Idx = -1;
+
+ // Check predecessors of \param BB.
+ SmallPtrSet<BasicBlock *, 16> Preds(pred_begin(BB), pred_end(BB));
+ for (SmallPtrSetIterator<BasicBlock *> PI = Preds.begin(), PE = Preds.end();
+ PI != PE; ++PI) {
+ BasicBlock *Pred = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(Pred->getTerminator());
+
+ // All predecessors should terminate with a branch.
+ if (!PBI)
+ return false;
+
+ BasicBlock *PP = Pred->getSinglePredecessor();
+
+ if (PBI->isUnconditional()) {
+      // Case 1: Pred (BB3) is an unconditional block; it should
+      // have a single predecessor (BB2) that is also a predecessor
+      // of \param BB (BB4) and should not have its address taken.
+ // There should exist only one such unconditional
+ // branch among the predecessors.
+ if (UnCondBlock || !PP || (Preds.count(PP) == 0) ||
+ Pred->hasAddressTaken())
+ return false;
+
+ UnCondBlock = Pred;
+ continue;
+ }
+
+ // Only conditional branches are allowed beyond this point.
+ assert(PBI->isConditional());
+
+ // Condition's unique use should be the branch instruction.
+ Value *PC = PBI->getCondition();
+ if (!PC || !PC->hasOneUse())
+ return false;
+
+ if (PP && Preds.count(PP)) {
+ // These are internal condition blocks to be merged from, e.g.,
+ // BB2 in both cases.
+ // Should not be address-taken.
+ if (Pred->hasAddressTaken())
+ return false;
+
+ // Instructions in the internal condition blocks should be safe
+ // to hoist up.
+ for (BasicBlock::iterator BI = Pred->begin(), BE = PBI->getIterator();
+ BI != BE;) {
+ Instruction *CI = &*BI++;
+ if (isa<PHINode>(CI) || !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+ } else {
+ // This is the condition block to be merged into, e.g. BB1 in
+ // both cases.
+ if (FirstCondBlock)
+ return false;
+ FirstCondBlock = Pred;
+ }
+
+ // Find whether BB is uniformly on the true (or false) path
+ // for all of its predecessors.
+ BasicBlock *PS1 = PBI->getSuccessor(0);
+ BasicBlock *PS2 = PBI->getSuccessor(1);
+ BasicBlock *PS = (PS1 == BB) ? PS2 : PS1;
+ int CIdx = (PS1 == BB) ? 0 : 1;
+
+ if (Idx == -1)
+ Idx = CIdx;
+ else if (CIdx != Idx)
+ return false;
+
+ // PS is the successor which is not BB. Check successors to identify
+ // the last conditional branch.
+ if (Preds.count(PS) == 0) {
+ // Case 2.
+ LastCondBlock = Pred;
+ } else {
+ // Case 1
+ BranchInst *BPS = dyn_cast<BranchInst>(PS->getTerminator());
+ if (BPS && BPS->isUnconditional()) {
+ // Case 1: PS(BB3) should be an unconditional branch.
+ LastCondBlock = Pred;
+ }
+ }
+ }
+
+ if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock))
+ return false;
+
+ Instruction *TBB = LastCondBlock->getTerminator();
+ BasicBlock *PS1 = TBB->getSuccessor(0);
+ BasicBlock *PS2 = TBB->getSuccessor(1);
+ BranchInst *PBI1 = dyn_cast<BranchInst>(PS1->getTerminator());
+ BranchInst *PBI2 = dyn_cast<BranchInst>(PS2->getTerminator());
+
+ // If PS1 does not jump into PS2, but PS2 jumps into PS1,
+ // attempt branch inversion.
+ if (!PBI1 || !PBI1->isUnconditional() ||
+ (PS1->getTerminator()->getSuccessor(0) != PS2)) {
+ // Check whether PS2 jumps into PS1.
+ if (!PBI2 || !PBI2->isUnconditional() ||
+ (PS2->getTerminator()->getSuccessor(0) != PS1))
+ return false;
+
+ // Do branch inversion.
+ BasicBlock *CurrBlock = LastCondBlock;
+ bool EverChanged = false;
+ for (; CurrBlock != FirstCondBlock;
+ CurrBlock = CurrBlock->getSinglePredecessor()) {
+ auto *BI = cast<BranchInst>(CurrBlock->getTerminator());
+ auto *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI)
+ continue;
+
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ // Canonicalize icmp_ne -> icmp_eq, fcmp_one -> fcmp_oeq
+ if ((Predicate == CmpInst::ICMP_NE) || (Predicate == CmpInst::FCMP_ONE)) {
+ CI->setPredicate(ICmpInst::getInversePredicate(Predicate));
+ BI->swapSuccessors();
+ EverChanged = true;
+ }
+ }
+ return EverChanged;
+ }
+
+  // PS1 must end in an unconditional branch.
+ if (!PBI1 || !PBI1->isUnconditional())
+ return false;
+
+  // PS2 should not contain a PHI node.
+ PHI = dyn_cast<PHINode>(PS2->begin());
+ if (PHI)
+ return false;
+
+ // Do the transformation.
+ BasicBlock *CB;
+ BranchInst *PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ bool Iteration = true;
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ Value *PC = PBI->getCondition();
+
+ do {
+ CB = PBI->getSuccessor(1 - Idx);
+ // Delete the conditional branch.
+ FirstCondBlock->getInstList().pop_back();
+ FirstCondBlock->getInstList()
+ .splice(FirstCondBlock->end(), CB->getInstList());
+ PBI = cast<BranchInst>(FirstCondBlock->getTerminator());
+ Value *CC = PBI->getCondition();
+ // Merge conditions.
+ Builder.SetInsertPoint(PBI);
+ Value *NC;
+ if (Idx == 0)
+ // Case 2, use parallel or.
+ NC = Builder.CreateOr(PC, CC);
+ else
+ // Case 1, use parallel and.
+ NC = Builder.CreateAnd(PC, CC);
+
+ PBI->replaceUsesOfWith(CC, NC);
+ PC = NC;
+ if (CB == LastCondBlock)
+ Iteration = false;
+ // Remove internal conditional branches.
+ CB->dropAllReferences();
+    // Make CB unreachable and let downstream delete the block.
+ new UnreachableInst(CB->getContext(), CB);
+ } while (Iteration);
+
+ LLVM_DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock);
+ return true;
+}
+
+/// Compare blocks from two if-regions, where \param Head2 is the entry of the
+/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare.
+/// \param Block2 is a block in the 2nd if-region to compare. \returns true if
+/// Block1 and Block2 have identical instructions and do not have
+/// memory reference alias with Head2.
+bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2) {
+ Instruction *PTI2 = Head2->getTerminator();
+ Instruction *PBI2 = &Head2->front();
+
+ // Check whether instructions in Block1 and Block2 are identical
+ // and do not alias with instructions in Head2.
+ BasicBlock::iterator iter1 = Block1->begin();
+ BasicBlock::iterator end1 = Block1->getTerminator()->getIterator();
+ BasicBlock::iterator iter2 = Block2->begin();
+ BasicBlock::iterator end2 = Block2->getTerminator()->getIterator();
+
+ while (true) {
+ if (iter1 == end1) {
+ if (iter2 != end2)
+ return false;
+ break;
+ }
+
+ if (!iter1->isIdenticalTo(&*iter2))
+ return false;
+
+ // Illegal to remove instructions with side effects except
+ // non-volatile stores.
+ if (iter1->mayHaveSideEffects()) {
+ Instruction *CurI = &*iter1;
+ StoreInst *SI = dyn_cast<StoreInst>(CurI);
+ if (!SI || SI->isVolatile())
+ return false;
+ }
+
+ // For simplicity and speed, the data dependency check can be
+ // skipped when the block contains no reads from memory.
+ if (iter1->mayReadFromMemory())
+ return false;
+
+ if (iter1->mayWriteToMemory()) {
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) {
+ // Check alias with Head2.
+ if (!AA || AA->alias(&*iter1, &*BI))
+ return false;
+ }
+ }
+ }
+ ++iter1;
+ ++iter2;
+ }
+
+ return true;
+}
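+
+// As a hypothetical example of the rules above: two candidate blocks that each
+// consist of the same non-volatile store, say `store i32 1, i32* @g`, compare
+// equal here, provided alias analysis is available and the second region's
+// entry block contains no memory access that may alias @g. A load anywhere in
+// Block1, or a volatile store, causes the comparison to fail.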
+
+/// Check whether \param BB is the merge block of an if-region. If yes, check
+/// whether there exists an adjacent if-region upstream such that the two
+/// if-regions contain identical instructions and can be legally merged.
+/// \returns true if the two if-regions are merged.
+///
+/// From:
+/// if (a)
+/// statement;
+/// if (b)
+/// statement;
+///
+/// To:
+/// if (a || b)
+/// statement;
+///
+///
+/// And from:
+/// if (a)
+/// ;
+/// else
+/// statement;
+/// if (b)
+/// ;
+/// else
+/// statement;
+///
+/// To:
+/// if (a && b)
+/// ;
+/// else
+/// statement;
+///
+/// We always take the form of the first if-region. This means that if the
+/// statement in the first if-region is in the "then-path" while in the second
+/// if-region it is in the "else-path", then we convert the second to the first
+/// form by inverting the condition and the branch successors. The same
+/// approach goes for the opposite case.
+bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
+ BasicBlock *IfTrue2, *IfFalse2;
+ Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
+ Instruction *CInst2 = dyn_cast_or_null<Instruction>(IfCond2);
+ if (!CInst2)
+ return false;
+
+ BasicBlock *SecondEntryBlock = CInst2->getParent();
+ if (SecondEntryBlock->hasAddressTaken())
+ return false;
+
+ BasicBlock *IfTrue1, *IfFalse1;
+ Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1);
+ Instruction *CInst1 = dyn_cast_or_null<Instruction>(IfCond1);
+ if (!CInst1)
+ return false;
+
+ BasicBlock *FirstEntryBlock = CInst1->getParent();
+
+ // Either then-path or else-path should be empty.
+ bool InvertCond2 = false;
+ BinaryOperator::BinaryOps CombineOp;
+ if (IfFalse1 == FirstEntryBlock) {
+ // The else-path is empty, so we must use "or" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::Or;
+ if (IfFalse2 != SecondEntryBlock) {
+ if (IfTrue2 != SecondEntryBlock)
+ return false;
+
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
+
+ if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock))
+ return false;
+ } else if (IfTrue1 == FirstEntryBlock) {
+ // The then-path is empty, so we must use "and" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::And;
+ if (IfTrue2 != SecondEntryBlock) {
+ if (IfFalse2 != SecondEntryBlock)
+ return false;
+
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
+
+ if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock))
+ return false;
+ } else
+ return false;
+
+ Instruction *PTI2 = SecondEntryBlock->getTerminator();
+ Instruction *PBI2 = &SecondEntryBlock->front();
+
+ // Check that \param SecondEntryBlock has no side effects and is safe to
+ // speculatively execute.
+ for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
+ Instruction *CI = &*BI;
+ if (isa<PHINode>(CI) || CI->mayHaveSideEffects() ||
+ !isSafeToSpeculativelyExecute(CI))
+ return false;
+ }
+
+ // Merge \param SecondEntryBlock into \param FirstEntryBlock.
+ FirstEntryBlock->getInstList().pop_back();
+ FirstEntryBlock->getInstList()
+ .splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
+ BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
+ assert(PBI->getCondition() == IfCond2);
+ BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
+ BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
+ Builder.SetInsertPoint(PBI);
+ if (InvertCond2) {
+ // If this is a "cmp" instruction, only used for branching (and nowhere
+ // else), then we can simply invert the predicate.
+ auto Cmp2 = dyn_cast<CmpInst>(CInst2);
+ if (Cmp2 && Cmp2->hasOneUse())
+ Cmp2->setPredicate(Cmp2->getInversePredicate());
+ else
+ CInst2 = cast<Instruction>(Builder.CreateNot(CInst2));
+ PBI->swapSuccessors();
+ }
+ Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2);
+ PBI->replaceUsesOfWith(IfCond2, NC);
+ Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
+
+ // Update PHI nodes in the successors, replacing the incoming block
+ // SecondEntryBlock with FirstEntryBlock.
+ for (BasicBlock *Succ : successors(PBI)) {
+ for (PHINode &Phi : Succ->phis()) {
+ for (unsigned i = 0, e = Phi.getNumIncomingValues(); i != e; ++i) {
+ if (Phi.getIncomingBlock(i) == SecondEntryBlock)
+ Phi.setIncomingBlock(i, FirstEntryBlock);
+ }
+ }
+ }
+
+ // Remove IfTrue1
+ if (IfTrue1 != FirstEntryBlock) {
+ IfTrue1->dropAllReferences();
+ IfTrue1->eraseFromParent();
+ }
+
+ // Remove IfFalse1
+ if (IfFalse1 != FirstEntryBlock) {
+ IfFalse1->dropAllReferences();
+ IfFalse1->eraseFromParent();
+ }
+
+ // Remove \param SecondEntryBlock
+ SecondEntryBlock->dropAllReferences();
+ SecondEntryBlock->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock);
+ return true;
+}
+
+bool FlattenCFGOpt::run(BasicBlock *BB) {
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ IRBuilder<> Builder(BB);
+
+ if (FlattenParallelAndOr(BB, Builder) || MergeIfRegion(BB, Builder))
+ return true;
+ return false;
+}
+
+/// FlattenCFG - Flatten a CFG. For example, it uses parallel-and and
+/// parallel-or patterns to collapse if-conditions and merges if-regions with
+/// identical statements.
+bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) {
+ return FlattenCFGOpt(AA).run(BB);
+}
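+
+// A minimal driver sketch (illustrative only: it assumes a Function &F and an
+// AAResults *AA are in scope, and it is not part of this file). Restarting
+// after every change is the conservative choice, since a successful flatten
+// may erase neighbouring blocks:
+//
+//   bool Changed = true;
+//   while (Changed) {
+//     Changed = false;
+//     for (BasicBlock &BB : F)
+//       if (llvm::FlattenCFG(&BB, AA)) {
+//         Changed = true;
+//         break;
+//       }
+//   }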
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
index df90e972b1..2696557a71 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionComparator.cpp
@@ -1,499 +1,499 @@
-//===- FunctionComparator.cpp - Function Comparator -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the FunctionComparator and GlobalNumberState classes
-// which are used by the MergeFunctions pass for comparing functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/FunctionComparator.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "functioncomparator"
-
-int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
- if (L < R)
- return -1;
- if (L > R)
- return 1;
- return 0;
-}
-
-int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
- if ((int)L < (int)R)
- return -1;
- if ((int)L > (int)R)
- return 1;
- return 0;
-}
-
-int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
- if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
- return Res;
- if (L.ugt(R))
- return 1;
- if (R.ugt(L))
- return -1;
- return 0;
-}
-
-int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
- // Floats are ordered first by semantics (i.e. float, double, half, etc.),
- // then by value interpreted as a bitstring (aka APInt).
- const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
- if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
- APFloat::semanticsPrecision(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
- APFloat::semanticsMaxExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
- APFloat::semanticsMinExponent(SR)))
- return Res;
- if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
- APFloat::semanticsSizeInBits(SR)))
- return Res;
- return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
-}
-
-int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
- // Prevent heavy comparison, compare sizes first.
- if (int Res = cmpNumbers(L.size(), R.size()))
- return Res;
-
- // Compare strings lexicographically only when it is necessary: only when
- // strings are equal in size.
- return L.compare(R);
-}
-
-int FunctionComparator::cmpAttrs(const AttributeList L,
- const AttributeList R) const {
- if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
- return Res;
-
- for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
- AttributeSet LAS = L.getAttributes(i);
- AttributeSet RAS = R.getAttributes(i);
- AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
- AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
- for (; LI != LE && RI != RE; ++LI, ++RI) {
- Attribute LA = *LI;
- Attribute RA = *RI;
- if (LA.isTypeAttribute() && RA.isTypeAttribute()) {
- if (LA.getKindAsEnum() != RA.getKindAsEnum())
- return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
-
- Type *TyL = LA.getValueAsType();
- Type *TyR = RA.getValueAsType();
+//===- FunctionComparator.cpp - Function Comparator -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionComparator and GlobalNumberState classes
+// which are used by the MergeFunctions pass for comparing functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionComparator.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "functioncomparator"
+
+int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
+ if (L < R)
+ return -1;
+ if (L > R)
+ return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
+ if ((int)L < (int)R)
+ return -1;
+ if ((int)L > (int)R)
+ return 1;
+ return 0;
+}
+
+int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
+ if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
+ return Res;
+ if (L.ugt(R))
+ return 1;
+ if (R.ugt(L))
+ return -1;
+ return 0;
+}
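+
+// Note that the ordering above is unsigned: for two hypothetical i32 constants,
+// the bit pattern of -1 (0xFFFFFFFF) compares greater than that of 1, so -1
+// orders after 1 here.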
+
+int FunctionComparator::cmpAPFloats(const APFloat &L, const APFloat &R) const {
+ // Floats are ordered first by semantics (i.e. float, double, half, etc.),
+ // then by value interpreted as a bitstring (aka APInt).
+ const fltSemantics &SL = L.getSemantics(), &SR = R.getSemantics();
+ if (int Res = cmpNumbers(APFloat::semanticsPrecision(SL),
+ APFloat::semanticsPrecision(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMaxExponent(SL),
+ APFloat::semanticsMaxExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsMinExponent(SL),
+ APFloat::semanticsMinExponent(SR)))
+ return Res;
+ if (int Res = cmpNumbers(APFloat::semanticsSizeInBits(SL),
+ APFloat::semanticsSizeInBits(SR)))
+ return Res;
+ return cmpAPInts(L.bitcastToAPInt(), R.bitcastToAPInt());
+}
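+
+// Under this ordering (illustrative), an APFloat with `float` semantics
+// compares less than one with `double` semantics, because the smaller
+// precision decides the first check, and two values with identical semantics
+// fall back to an unsigned comparison of their IEEE bit patterns via cmpAPInts.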
+
+int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
+ // Prevent heavy comparison, compare sizes first.
+ if (int Res = cmpNumbers(L.size(), R.size()))
+ return Res;
+
+ // Compare strings lexicographically only when it is necessary: only when
+ // strings are equal in size.
+ return L.compare(R);
+}
+
+int FunctionComparator::cmpAttrs(const AttributeList L,
+ const AttributeList R) const {
+ if (int Res = cmpNumbers(L.getNumAttrSets(), R.getNumAttrSets()))
+ return Res;
+
+ for (unsigned i = L.index_begin(), e = L.index_end(); i != e; ++i) {
+ AttributeSet LAS = L.getAttributes(i);
+ AttributeSet RAS = R.getAttributes(i);
+ AttributeSet::iterator LI = LAS.begin(), LE = LAS.end();
+ AttributeSet::iterator RI = RAS.begin(), RE = RAS.end();
+ for (; LI != LE && RI != RE; ++LI, ++RI) {
+ Attribute LA = *LI;
+ Attribute RA = *RI;
+ if (LA.isTypeAttribute() && RA.isTypeAttribute()) {
+ if (LA.getKindAsEnum() != RA.getKindAsEnum())
+ return cmpNumbers(LA.getKindAsEnum(), RA.getKindAsEnum());
+
+ Type *TyL = LA.getValueAsType();
+ Type *TyR = RA.getValueAsType();
if (TyL && TyR) {
if (int Res = cmpTypes(TyL, TyR))
return Res;
continue;
}
-
- // Two pointers, at least one null, so the comparison result is
- // independent of the value of a real pointer.
+
+ // Two pointers, at least one null, so the comparison result is
+ // independent of the value of a real pointer.
if (int Res = cmpNumbers((uint64_t)TyL, (uint64_t)TyR))
return Res;
continue;
- }
- if (LA < RA)
- return -1;
- if (RA < LA)
- return 1;
- }
- if (LI != LE)
- return 1;
- if (RI != RE)
- return -1;
- }
- return 0;
-}
-
-int FunctionComparator::cmpRangeMetadata(const MDNode *L,
- const MDNode *R) const {
- if (L == R)
- return 0;
- if (!L)
- return -1;
- if (!R)
- return 1;
- // Range metadata is a sequence of numbers. Make sure they are the same
- // sequence.
- // TODO: Note that as this is metadata, it is possible to drop and/or merge
- // this data when considering functions to merge. Thus this comparison would
- // return 0 (i.e. equivalent), but merging would become more complicated
- // because the ranges would need to be unioned. It is not likely that
- // functions differ ONLY in this metadata if they are actually the same
- // function semantically.
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
- for (size_t I = 0; I < L->getNumOperands(); ++I) {
- ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
- ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
- if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
- return Res;
- }
- return 0;
-}
-
-int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS,
- const CallBase &RCS) const {
- assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!");
-
- if (int Res =
- cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
- return Res;
-
- for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) {
- auto OBL = LCS.getOperandBundleAt(I);
- auto OBR = RCS.getOperandBundleAt(I);
-
- if (int Res = OBL.getTagName().compare(OBR.getTagName()))
- return Res;
-
- if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
- return Res;
- }
-
- return 0;
-}
-
-/// Constants comparison:
-/// 1. Check whether type of L constant could be losslessly bitcasted to R
-/// type.
-/// 2. Compare constant contents.
-/// For more details see declaration comments.
-int FunctionComparator::cmpConstants(const Constant *L,
- const Constant *R) const {
- Type *TyL = L->getType();
- Type *TyR = R->getType();
-
- // Check whether types are bitcastable. This part is just re-factored
- // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
- // we also pack into result which type is "less" for us.
- int TypesRes = cmpTypes(TyL, TyR);
- if (TypesRes != 0) {
- // Types are different, but check whether we can bitcast them.
- if (!TyL->isFirstClassType()) {
- if (TyR->isFirstClassType())
- return -1;
- // Neither TyL nor TyR are values of first class type. Return the result
- // of comparing the types
- return TypesRes;
- }
- if (!TyR->isFirstClassType()) {
- if (TyL->isFirstClassType())
- return 1;
- return TypesRes;
- }
-
- // Vector -> Vector conversions are always lossless if the two vector types
- // have the same size, otherwise not.
- unsigned TyLWidth = 0;
- unsigned TyRWidth = 0;
-
- if (auto *VecTyL = dyn_cast<VectorType>(TyL))
- TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize();
- if (auto *VecTyR = dyn_cast<VectorType>(TyR))
- TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize();
-
- if (TyLWidth != TyRWidth)
- return cmpNumbers(TyLWidth, TyRWidth);
-
- // Zero bit-width means neither TyL nor TyR are vectors.
- if (!TyLWidth) {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
- if (PTyL && PTyR) {
- unsigned AddrSpaceL = PTyL->getAddressSpace();
- unsigned AddrSpaceR = PTyR->getAddressSpace();
- if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
- return Res;
- }
- if (PTyL)
- return 1;
- if (PTyR)
- return -1;
-
- // TyL and TyR aren't vectors, nor pointers. We don't know how to
- // bitcast them.
- return TypesRes;
- }
- }
-
- // OK, types are bitcastable, now check constant contents.
-
- if (L->isNullValue() && R->isNullValue())
- return TypesRes;
- if (L->isNullValue() && !R->isNullValue())
- return 1;
- if (!L->isNullValue() && R->isNullValue())
- return -1;
-
- auto GlobalValueL = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(L));
- auto GlobalValueR = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(R));
- if (GlobalValueL && GlobalValueR) {
- return cmpGlobalValues(GlobalValueL, GlobalValueR);
- }
-
- if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
- return Res;
-
- if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
- const auto *SeqR = cast<ConstantDataSequential>(R);
- // This handles ConstantDataArray and ConstantDataVector. Note that we
- // compare the two raw data arrays, which might differ depending on the host
- // endianness. This isn't a problem though, because the endianness of a module
- // will affect the order of the constants, but this order is the same
- // for a given input module and host platform.
- return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
- }
-
- switch (L->getValueID()) {
- case Value::UndefValueVal:
+ }
+ if (LA < RA)
+ return -1;
+ if (RA < LA)
+ return 1;
+ }
+ if (LI != LE)
+ return 1;
+ if (RI != RE)
+ return -1;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpRangeMetadata(const MDNode *L,
+ const MDNode *R) const {
+ if (L == R)
+ return 0;
+ if (!L)
+ return -1;
+ if (!R)
+ return 1;
+ // Range metadata is a sequence of numbers. Make sure they are the same
+ // sequence.
+ // TODO: Note that as this is metadata, it is possible to drop and/or merge
+ // this data when considering functions to merge. Thus this comparison would
+ // return 0 (i.e. equivalent), but merging would become more complicated
+ // because the ranges would need to be unioned. It is not likely that
+ // functions differ ONLY in this metadata if they are actually the same
+ // function semantically.
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+ for (size_t I = 0; I < L->getNumOperands(); ++I) {
+ ConstantInt *LLow = mdconst::extract<ConstantInt>(L->getOperand(I));
+ ConstantInt *RLow = mdconst::extract<ConstantInt>(R->getOperand(I));
+ if (int Res = cmpAPInts(LLow->getValue(), RLow->getValue()))
+ return Res;
+ }
+ return 0;
+}
+
+int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS,
+ const CallBase &RCS) const {
+ assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!");
+
+ if (int Res =
+ cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
+ return Res;
+
+ for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) {
+ auto OBL = LCS.getOperandBundleAt(I);
+ auto OBR = RCS.getOperandBundleAt(I);
+
+ if (int Res = OBL.getTagName().compare(OBR.getTagName()))
+ return Res;
+
+ if (int Res = cmpNumbers(OBL.Inputs.size(), OBR.Inputs.size()))
+ return Res;
+ }
+
+ return 0;
+}
+
+/// Constants comparison:
+/// 1. Check whether type of L constant could be losslessly bitcasted to R
+/// type.
+/// 2. Compare constant contents.
+/// For more details see declaration comments.
+int FunctionComparator::cmpConstants(const Constant *L,
+ const Constant *R) const {
+ Type *TyL = L->getType();
+ Type *TyR = R->getType();
+
+ // Check whether types are bitcastable. This part is just re-factored
+ // Type::canLosslesslyBitCastTo method, but instead of returning true/false,
+ // we also pack into result which type is "less" for us.
+ int TypesRes = cmpTypes(TyL, TyR);
+ if (TypesRes != 0) {
+ // Types are different, but check whether we can bitcast them.
+ if (!TyL->isFirstClassType()) {
+ if (TyR->isFirstClassType())
+ return -1;
+ // Neither TyL nor TyR are values of first class type. Return the result
+ // of comparing the types
+ return TypesRes;
+ }
+ if (!TyR->isFirstClassType()) {
+ if (TyL->isFirstClassType())
+ return 1;
+ return TypesRes;
+ }
+
+ // Vector -> Vector conversions are always lossless if the two vector types
+ // have the same size, otherwise not.
+ unsigned TyLWidth = 0;
+ unsigned TyRWidth = 0;
+
+ if (auto *VecTyL = dyn_cast<VectorType>(TyL))
+ TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize();
+ if (auto *VecTyR = dyn_cast<VectorType>(TyR))
+ TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize();
+
+ if (TyLWidth != TyRWidth)
+ return cmpNumbers(TyLWidth, TyRWidth);
+
+ // Zero bit-width means neither TyL nor TyR are vectors.
+ if (!TyLWidth) {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+ if (PTyL && PTyR) {
+ unsigned AddrSpaceL = PTyL->getAddressSpace();
+ unsigned AddrSpaceR = PTyR->getAddressSpace();
+ if (int Res = cmpNumbers(AddrSpaceL, AddrSpaceR))
+ return Res;
+ }
+ if (PTyL)
+ return 1;
+ if (PTyR)
+ return -1;
+
+ // TyL and TyR aren't vectors, nor pointers. We don't know how to
+ // bitcast them.
+ return TypesRes;
+ }
+ }
+
+ // OK, types are bitcastable, now check constant contents.
+
+ if (L->isNullValue() && R->isNullValue())
+ return TypesRes;
+ if (L->isNullValue() && !R->isNullValue())
+ return 1;
+ if (!L->isNullValue() && R->isNullValue())
+ return -1;
+
+ auto GlobalValueL = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(L));
+ auto GlobalValueR = const_cast<GlobalValue *>(dyn_cast<GlobalValue>(R));
+ if (GlobalValueL && GlobalValueR) {
+ return cmpGlobalValues(GlobalValueL, GlobalValueR);
+ }
+
+ if (int Res = cmpNumbers(L->getValueID(), R->getValueID()))
+ return Res;
+
+ if (const auto *SeqL = dyn_cast<ConstantDataSequential>(L)) {
+ const auto *SeqR = cast<ConstantDataSequential>(R);
+ // This handles ConstantDataArray and ConstantDataVector. Note that we
+ // compare the two raw data arrays, which might differ depending on the host
+ // endianness. This isn't a problem though, because the endianness of a module
+ // will affect the order of the constants, but this order is the same
+ // for a given input module and host platform.
+ return cmpMem(SeqL->getRawDataValues(), SeqR->getRawDataValues());
+ }
+
+ switch (L->getValueID()) {
+ case Value::UndefValueVal:
case Value::PoisonValueVal:
- case Value::ConstantTokenNoneVal:
- return TypesRes;
- case Value::ConstantIntVal: {
- const APInt &LInt = cast<ConstantInt>(L)->getValue();
- const APInt &RInt = cast<ConstantInt>(R)->getValue();
- return cmpAPInts(LInt, RInt);
- }
- case Value::ConstantFPVal: {
- const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
- const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
- return cmpAPFloats(LAPF, RAPF);
- }
- case Value::ConstantArrayVal: {
- const ConstantArray *LA = cast<ConstantArray>(L);
- const ConstantArray *RA = cast<ConstantArray>(R);
- uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
- uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
- cast<Constant>(RA->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantStructVal: {
- const ConstantStruct *LS = cast<ConstantStruct>(L);
- const ConstantStruct *RS = cast<ConstantStruct>(R);
- unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (unsigned i = 0; i != NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
- cast<Constant>(RS->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantVectorVal: {
- const ConstantVector *LV = cast<ConstantVector>(L);
- const ConstantVector *RV = cast<ConstantVector>(R);
- unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements();
- if (int Res = cmpNumbers(NumElementsL, NumElementsR))
- return Res;
- for (uint64_t i = 0; i < NumElementsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
- cast<Constant>(RV->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::ConstantExprVal: {
- const ConstantExpr *LE = cast<ConstantExpr>(L);
- const ConstantExpr *RE = cast<ConstantExpr>(R);
- unsigned NumOperandsL = LE->getNumOperands();
- unsigned NumOperandsR = RE->getNumOperands();
- if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
- return Res;
- for (unsigned i = 0; i < NumOperandsL; ++i) {
- if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
- cast<Constant>(RE->getOperand(i))))
- return Res;
- }
- return 0;
- }
- case Value::BlockAddressVal: {
- const BlockAddress *LBA = cast<BlockAddress>(L);
- const BlockAddress *RBA = cast<BlockAddress>(R);
- if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
- return Res;
- if (LBA->getFunction() == RBA->getFunction()) {
- // They are BBs in the same function. Order by which comes first in the
- // BB order of the function. This order is deterministic.
- Function *F = LBA->getFunction();
- BasicBlock *LBB = LBA->getBasicBlock();
- BasicBlock *RBB = RBA->getBasicBlock();
- if (LBB == RBB)
- return 0;
- for (BasicBlock &BB : F->getBasicBlockList()) {
- if (&BB == LBB) {
- assert(&BB != RBB);
- return -1;
- }
- if (&BB == RBB)
- return 1;
- }
- llvm_unreachable("Basic Block Address does not point to a basic block in "
- "its function.");
- return -1;
- } else {
- // cmpValues said the functions are the same. So because they aren't
- // literally the same pointer, they must respectively be the left and
- // right functions.
- assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
- // cmpValues will tell us if these are equivalent BasicBlocks, in the
- // context of their respective functions.
- return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
- }
- }
- default: // Unknown constant, abort.
- LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
- llvm_unreachable("Constant ValueID not recognized.");
- return -1;
- }
-}
-
-int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
- uint64_t LNumber = GlobalNumbers->getNumber(L);
- uint64_t RNumber = GlobalNumbers->getNumber(R);
- return cmpNumbers(LNumber, RNumber);
-}
-
-/// cmpType - compares two types,
-/// defines total ordering among the types set.
-/// See method declaration comments for more details.
-int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
- PointerType *PTyL = dyn_cast<PointerType>(TyL);
- PointerType *PTyR = dyn_cast<PointerType>(TyR);
-
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- if (PTyL && PTyL->getAddressSpace() == 0)
- TyL = DL.getIntPtrType(TyL);
- if (PTyR && PTyR->getAddressSpace() == 0)
- TyR = DL.getIntPtrType(TyR);
-
- if (TyL == TyR)
- return 0;
-
- if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
- return Res;
-
- switch (TyL->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- case Type::IntegerTyID:
- return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
- cast<IntegerType>(TyR)->getBitWidth());
- // TyL == TyR would have returned true earlier, because types are uniqued.
- case Type::VoidTyID:
- case Type::FloatTyID:
- case Type::DoubleTyID:
- case Type::X86_FP80TyID:
- case Type::FP128TyID:
- case Type::PPC_FP128TyID:
- case Type::LabelTyID:
- case Type::MetadataTyID:
- case Type::TokenTyID:
- return 0;
-
- case Type::PointerTyID:
- assert(PTyL && PTyR && "Both types must be pointers here.");
- return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
-
- case Type::StructTyID: {
- StructType *STyL = cast<StructType>(TyL);
- StructType *STyR = cast<StructType>(TyR);
- if (STyL->getNumElements() != STyR->getNumElements())
- return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
-
- if (STyL->isPacked() != STyR->isPacked())
- return cmpNumbers(STyL->isPacked(), STyR->isPacked());
-
- for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
- if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::FunctionTyID: {
- FunctionType *FTyL = cast<FunctionType>(TyL);
- FunctionType *FTyR = cast<FunctionType>(TyR);
- if (FTyL->getNumParams() != FTyR->getNumParams())
- return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
-
- if (FTyL->isVarArg() != FTyR->isVarArg())
- return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
-
- if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
- return Res;
-
- for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
- if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
- return Res;
- }
- return 0;
- }
-
- case Type::ArrayTyID: {
- auto *STyL = cast<ArrayType>(TyL);
- auto *STyR = cast<ArrayType>(TyR);
- if (STyL->getNumElements() != STyR->getNumElements())
- return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
- return cmpTypes(STyL->getElementType(), STyR->getElementType());
- }
- case Type::FixedVectorTyID:
- case Type::ScalableVectorTyID: {
- auto *STyL = cast<VectorType>(TyL);
- auto *STyR = cast<VectorType>(TyR);
+ case Value::ConstantTokenNoneVal:
+ return TypesRes;
+ case Value::ConstantIntVal: {
+ const APInt &LInt = cast<ConstantInt>(L)->getValue();
+ const APInt &RInt = cast<ConstantInt>(R)->getValue();
+ return cmpAPInts(LInt, RInt);
+ }
+ case Value::ConstantFPVal: {
+ const APFloat &LAPF = cast<ConstantFP>(L)->getValueAPF();
+ const APFloat &RAPF = cast<ConstantFP>(R)->getValueAPF();
+ return cmpAPFloats(LAPF, RAPF);
+ }
+ case Value::ConstantArrayVal: {
+ const ConstantArray *LA = cast<ConstantArray>(L);
+ const ConstantArray *RA = cast<ConstantArray>(R);
+ uint64_t NumElementsL = cast<ArrayType>(TyL)->getNumElements();
+ uint64_t NumElementsR = cast<ArrayType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LA->getOperand(i)),
+ cast<Constant>(RA->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantStructVal: {
+ const ConstantStruct *LS = cast<ConstantStruct>(L);
+ const ConstantStruct *RS = cast<ConstantStruct>(R);
+ unsigned NumElementsL = cast<StructType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<StructType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (unsigned i = 0; i != NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LS->getOperand(i)),
+ cast<Constant>(RS->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantVectorVal: {
+ const ConstantVector *LV = cast<ConstantVector>(L);
+ const ConstantVector *RV = cast<ConstantVector>(R);
+ unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements();
+ if (int Res = cmpNumbers(NumElementsL, NumElementsR))
+ return Res;
+ for (uint64_t i = 0; i < NumElementsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LV->getOperand(i)),
+ cast<Constant>(RV->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::ConstantExprVal: {
+ const ConstantExpr *LE = cast<ConstantExpr>(L);
+ const ConstantExpr *RE = cast<ConstantExpr>(R);
+ unsigned NumOperandsL = LE->getNumOperands();
+ unsigned NumOperandsR = RE->getNumOperands();
+ if (int Res = cmpNumbers(NumOperandsL, NumOperandsR))
+ return Res;
+ for (unsigned i = 0; i < NumOperandsL; ++i) {
+ if (int Res = cmpConstants(cast<Constant>(LE->getOperand(i)),
+ cast<Constant>(RE->getOperand(i))))
+ return Res;
+ }
+ return 0;
+ }
+ case Value::BlockAddressVal: {
+ const BlockAddress *LBA = cast<BlockAddress>(L);
+ const BlockAddress *RBA = cast<BlockAddress>(R);
+ if (int Res = cmpValues(LBA->getFunction(), RBA->getFunction()))
+ return Res;
+ if (LBA->getFunction() == RBA->getFunction()) {
+ // They are BBs in the same function. Order by which comes first in the
+ // BB order of the function. This order is deterministic.
+ Function *F = LBA->getFunction();
+ BasicBlock *LBB = LBA->getBasicBlock();
+ BasicBlock *RBB = RBA->getBasicBlock();
+ if (LBB == RBB)
+ return 0;
+ for (BasicBlock &BB : F->getBasicBlockList()) {
+ if (&BB == LBB) {
+ assert(&BB != RBB);
+ return -1;
+ }
+ if (&BB == RBB)
+ return 1;
+ }
+ llvm_unreachable("Basic Block Address does not point to a basic block in "
+ "its function.");
+ return -1;
+ } else {
+ // cmpValues said the functions are the same. So because they aren't
+ // literally the same pointer, they must respectively be the left and
+ // right functions.
+ assert(LBA->getFunction() == FnL && RBA->getFunction() == FnR);
+ // cmpValues will tell us if these are equivalent BasicBlocks, in the
+ // context of their respective functions.
+ return cmpValues(LBA->getBasicBlock(), RBA->getBasicBlock());
+ }
+ }
+ default: // Unknown constant, abort.
+ LLVM_DEBUG(dbgs() << "Looking at valueID " << L->getValueID() << "\n");
+ llvm_unreachable("Constant ValueID not recognized.");
+ return -1;
+ }
+}
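+
+// A couple of illustrative (hypothetical) data points for cmpConstants: with
+// identical types, `i32 2` vs `i32 7` falls through to the ConstantIntVal case
+// and yields -1 from the unsigned APInt comparison, while a null constant
+// compares greater than any non-null constant of a bitcastable type because of
+// the isNullValue() checks above.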
+
+int FunctionComparator::cmpGlobalValues(GlobalValue *L, GlobalValue *R) const {
+ uint64_t LNumber = GlobalNumbers->getNumber(L);
+ uint64_t RNumber = GlobalNumbers->getNumber(R);
+ return cmpNumbers(LNumber, RNumber);
+}
+
+/// cmpType - compares two types,
+/// defines total ordering among the types set.
+/// See method declaration comments for more details.
+int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
+ PointerType *PTyL = dyn_cast<PointerType>(TyL);
+ PointerType *PTyR = dyn_cast<PointerType>(TyR);
+
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ if (PTyL && PTyL->getAddressSpace() == 0)
+ TyL = DL.getIntPtrType(TyL);
+ if (PTyR && PTyR->getAddressSpace() == 0)
+ TyR = DL.getIntPtrType(TyR);
+
+ if (TyL == TyR)
+ return 0;
+
+ if (int Res = cmpNumbers(TyL->getTypeID(), TyR->getTypeID()))
+ return Res;
+
+ switch (TyL->getTypeID()) {
+ default:
+ llvm_unreachable("Unknown type!");
+ case Type::IntegerTyID:
+ return cmpNumbers(cast<IntegerType>(TyL)->getBitWidth(),
+ cast<IntegerType>(TyR)->getBitWidth());
+ // TyL == TyR would have returned true earlier, because types are uniqued.
+ case Type::VoidTyID:
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ case Type::LabelTyID:
+ case Type::MetadataTyID:
+ case Type::TokenTyID:
+ return 0;
+
+ case Type::PointerTyID:
+ assert(PTyL && PTyR && "Both types must be pointers here.");
+ return cmpNumbers(PTyL->getAddressSpace(), PTyR->getAddressSpace());
+
+ case Type::StructTyID: {
+ StructType *STyL = cast<StructType>(TyL);
+ StructType *STyR = cast<StructType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+
+ if (STyL->isPacked() != STyR->isPacked())
+ return cmpNumbers(STyL->isPacked(), STyR->isPacked());
+
+ for (unsigned i = 0, e = STyL->getNumElements(); i != e; ++i) {
+ if (int Res = cmpTypes(STyL->getElementType(i), STyR->getElementType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::FunctionTyID: {
+ FunctionType *FTyL = cast<FunctionType>(TyL);
+ FunctionType *FTyR = cast<FunctionType>(TyR);
+ if (FTyL->getNumParams() != FTyR->getNumParams())
+ return cmpNumbers(FTyL->getNumParams(), FTyR->getNumParams());
+
+ if (FTyL->isVarArg() != FTyR->isVarArg())
+ return cmpNumbers(FTyL->isVarArg(), FTyR->isVarArg());
+
+ if (int Res = cmpTypes(FTyL->getReturnType(), FTyR->getReturnType()))
+ return Res;
+
+ for (unsigned i = 0, e = FTyL->getNumParams(); i != e; ++i) {
+ if (int Res = cmpTypes(FTyL->getParamType(i), FTyR->getParamType(i)))
+ return Res;
+ }
+ return 0;
+ }
+
+ case Type::ArrayTyID: {
+ auto *STyL = cast<ArrayType>(TyL);
+ auto *STyR = cast<ArrayType>(TyR);
+ if (STyL->getNumElements() != STyR->getNumElements())
+ return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
+ auto *STyL = cast<VectorType>(TyL);
+ auto *STyR = cast<VectorType>(TyR);
if (STyL->getElementCount().isScalable() !=
STyR->getElementCount().isScalable())
return cmpNumbers(STyL->getElementCount().isScalable(),
@@ -501,476 +501,476 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
if (STyL->getElementCount() != STyR->getElementCount())
return cmpNumbers(STyL->getElementCount().getKnownMinValue(),
STyR->getElementCount().getKnownMinValue());
- return cmpTypes(STyL->getElementType(), STyR->getElementType());
- }
- }
-}
-
-// Determine whether the two operations are the same except that pointer-to-A
-// and pointer-to-B are equivalent. This should be kept in sync with
-// Instruction::isSameOperationAs.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpOperations(const Instruction *L,
- const Instruction *R,
- bool &needToCmpOperands) const {
- needToCmpOperands = true;
- if (int Res = cmpValues(L, R))
- return Res;
-
- // Differences from Instruction::isSameOperationAs:
- // * replace type comparison with calls to cmpTypes.
- // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
- // * because of the above, we don't test for the tail bit on calls later on.
- if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
- return Res;
-
- if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
- needToCmpOperands = false;
- const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
- if (int Res =
- cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
- return Res;
- return cmpGEPs(GEPL, GEPR);
- }
-
- if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
- return Res;
-
- if (int Res = cmpTypes(L->getType(), R->getType()))
- return Res;
-
- if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
- R->getRawSubclassOptionalData()))
- return Res;
-
- // We have two instructions of identical opcode and #operands. Check to see
- // if all operands are the same type
- for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
- if (int Res =
- cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
- return Res;
- }
-
- // Check special state that is a part of some instructions.
- if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
- if (int Res = cmpTypes(AI->getAllocatedType(),
- cast<AllocaInst>(R)->getAllocatedType()))
- return Res;
- return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
- if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
- return Res;
- if (int Res = cmpNumbers(LI->getSyncScopeID(),
- cast<LoadInst>(R)->getSyncScopeID()))
- return Res;
- return cmpRangeMetadata(
- LI->getMetadata(LLVMContext::MD_range),
- cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
- }
- if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
- if (int Res =
- cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
- return Res;
- if (int Res =
- cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(SI->getSyncScopeID(),
- cast<StoreInst>(R)->getSyncScopeID());
- }
- if (const CmpInst *CI = dyn_cast<CmpInst>(L))
- return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
- if (auto *CBL = dyn_cast<CallBase>(L)) {
- auto *CBR = cast<CallBase>(R);
- if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv()))
- return Res;
- if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes()))
- return Res;
- if (int Res = cmpOperandBundlesSchema(*CBL, *CBR))
- return Res;
- if (const CallInst *CI = dyn_cast<CallInst>(L))
- if (int Res = cmpNumbers(CI->getTailCallKind(),
- cast<CallInst>(R)->getTailCallKind()))
- return Res;
- return cmpRangeMetadata(L->getMetadata(LLVMContext::MD_range),
- R->getMetadata(LLVMContext::MD_range));
- }
- if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
- ArrayRef<unsigned> LIndices = IVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- return 0;
- }
- if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
- ArrayRef<unsigned> LIndices = EVI->getIndices();
- ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
- if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
- return Res;
- for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
- return Res;
- }
- }
- if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
- if (int Res =
- cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(FI->getSyncScopeID(),
- cast<FenceInst>(R)->getSyncScopeID());
- }
- if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
- if (int Res = cmpNumbers(CXI->isVolatile(),
- cast<AtomicCmpXchgInst>(R)->isVolatile()))
- return Res;
- if (int Res =
- cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getSuccessOrdering(),
- cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
- return Res;
- if (int Res =
- cmpOrderings(CXI->getFailureOrdering(),
- cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
- return Res;
- return cmpNumbers(CXI->getSyncScopeID(),
- cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
- }
- if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
- if (int Res = cmpNumbers(RMWI->getOperation(),
- cast<AtomicRMWInst>(R)->getOperation()))
- return Res;
- if (int Res = cmpNumbers(RMWI->isVolatile(),
- cast<AtomicRMWInst>(R)->isVolatile()))
- return Res;
- if (int Res = cmpOrderings(RMWI->getOrdering(),
- cast<AtomicRMWInst>(R)->getOrdering()))
- return Res;
- return cmpNumbers(RMWI->getSyncScopeID(),
- cast<AtomicRMWInst>(R)->getSyncScopeID());
- }
- if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) {
- ArrayRef<int> LMask = SVI->getShuffleMask();
- ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask();
- if (int Res = cmpNumbers(LMask.size(), RMask.size()))
- return Res;
- for (size_t i = 0, e = LMask.size(); i != e; ++i) {
- if (int Res = cmpNumbers(LMask[i], RMask[i]))
- return Res;
- }
- }
- if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
- const PHINode *PNR = cast<PHINode>(R);
- // Ensure that in addition to the incoming values being identical
- // (checked by the caller of this function), the incoming blocks
- // are also identical.
- for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
- if (int Res =
- cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
- return Res;
- }
- }
- return 0;
-}
-
-// Determine whether two GEP operations perform the same underlying arithmetic.
-// Read method declaration comments for more details.
-int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
- const GEPOperator *GEPR) const {
- unsigned int ASL = GEPL->getPointerAddressSpace();
- unsigned int ASR = GEPR->getPointerAddressSpace();
-
- if (int Res = cmpNumbers(ASL, ASR))
- return Res;
-
- // When we have target data, we can reduce the GEP down to the value in bytes
- // added to the address.
- const DataLayout &DL = FnL->getParent()->getDataLayout();
- unsigned BitWidth = DL.getPointerSizeInBits(ASL);
- APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
- if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
- GEPR->accumulateConstantOffset(DL, OffsetR))
- return cmpAPInts(OffsetL, OffsetR);
- if (int Res =
- cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType()))
- return Res;
-
- if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
- return Res;
-
- for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
- if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
- return Res;
- }
-
- return 0;
-}
-
-int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
- const InlineAsm *R) const {
- // InlineAsm's are uniqued. If they are the same pointer, obviously they are
- // the same, otherwise compare the fields.
- if (L == R)
- return 0;
- if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
- return Res;
- if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
- return Res;
- if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
- return Res;
- if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
- return Res;
- if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
- return Res;
- if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
- return Res;
- assert(L->getFunctionType() != R->getFunctionType());
- return 0;
-}
-
-/// Compare two values used by the two functions under pair-wise comparison. If
-/// this is the first time the values are seen, they're added to the mapping so
-/// that we will detect mismatches on next use.
-/// See comments in declaration for more details.
-int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
- // Catch self-reference case.
- if (L == FnL) {
- if (R == FnR)
- return 0;
- return -1;
- }
- if (R == FnR) {
- if (L == FnL)
- return 0;
- return 1;
- }
-
- const Constant *ConstL = dyn_cast<Constant>(L);
- const Constant *ConstR = dyn_cast<Constant>(R);
- if (ConstL && ConstR) {
- if (L == R)
- return 0;
- return cmpConstants(ConstL, ConstR);
- }
-
- if (ConstL)
- return 1;
- if (ConstR)
- return -1;
-
- const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
- const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
-
- if (InlineAsmL && InlineAsmR)
- return cmpInlineAsm(InlineAsmL, InlineAsmR);
- if (InlineAsmL)
- return 1;
- if (InlineAsmR)
- return -1;
-
- auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
- RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
-
- return cmpNumbers(LeftSN.first->second, RightSN.first->second);
-}
-
-// Test whether two basic blocks have equivalent behaviour.
-int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
- const BasicBlock *BBR) const {
- BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
- BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
-
- do {
- bool needToCmpOperands = true;
- if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
- return Res;
- if (needToCmpOperands) {
- assert(InstL->getNumOperands() == InstR->getNumOperands());
-
- for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
- Value *OpL = InstL->getOperand(i);
- Value *OpR = InstR->getOperand(i);
- if (int Res = cmpValues(OpL, OpR))
- return Res;
- // cmpValues should ensure this is true.
- assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
- }
- }
-
- ++InstL;
- ++InstR;
- } while (InstL != InstLE && InstR != InstRE);
-
- if (InstL != InstLE && InstR == InstRE)
- return 1;
- if (InstL == InstLE && InstR != InstRE)
- return -1;
- return 0;
-}
-
-int FunctionComparator::compareSignature() const {
- if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
- return Res;
-
- if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
- return Res;
-
- if (FnL->hasGC()) {
- if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
- return Res;
-
- if (FnL->hasSection()) {
- if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
- return Res;
- }
-
- if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
- return Res;
-
- // TODO: if it's internal and only used in direct calls, we could handle this
- // case too.
- if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
- return Res;
-
- if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
- return Res;
-
- assert(FnL->arg_size() == FnR->arg_size() &&
- "Identically typed functions have different numbers of args!");
-
- // Visit the arguments so that they get enumerated in the order they're
- // passed in.
- for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
- ArgRI = FnR->arg_begin(),
- ArgLE = FnL->arg_end();
- ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
- if (cmpValues(&*ArgLI, &*ArgRI) != 0)
- llvm_unreachable("Arguments repeat!");
- }
- return 0;
-}
-
-// Test whether the two functions have equivalent behaviour.
-int FunctionComparator::compare() {
- beginCompare();
-
- if (int Res = compareSignature())
- return Res;
-
- // We do a CFG-ordered walk since the actual ordering of the blocks in the
- // linked list is immaterial. Our walk starts at the entry block for both
- // functions, then takes each block from each terminator in order. As an
- // artifact, this also means that unreachable blocks are ignored.
- SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
- SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
-
- FnLBBs.push_back(&FnL->getEntryBlock());
- FnRBBs.push_back(&FnR->getEntryBlock());
-
- VisitedBBs.insert(FnLBBs[0]);
- while (!FnLBBs.empty()) {
- const BasicBlock *BBL = FnLBBs.pop_back_val();
- const BasicBlock *BBR = FnRBBs.pop_back_val();
-
- if (int Res = cmpValues(BBL, BBR))
- return Res;
-
- if (int Res = cmpBasicBlocks(BBL, BBR))
- return Res;
-
- const Instruction *TermL = BBL->getTerminator();
- const Instruction *TermR = BBR->getTerminator();
-
- assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
- for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
- continue;
-
- FnLBBs.push_back(TermL->getSuccessor(i));
- FnRBBs.push_back(TermR->getSuccessor(i));
- }
- }
- return 0;
-}
-
-namespace {
-
-// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
-// hash of a sequence of 64bit ints, but the entire input does not need to be
-// available at once. This interface is necessary for functionHash because it
-// needs to accumulate the hash as the structure of the function is traversed
-// without saving these values to an intermediate buffer. This form of hashing
-// is not often needed, as usually the object to hash is just read from a
-// buffer.
-class HashAccumulator64 {
- uint64_t Hash;
-
-public:
- // Initialize to random constant, so the state isn't zero.
- HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
-
- void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
-
- // No finishing is required, because the entire hash value is used.
- uint64_t getHash() { return Hash; }
-};
-
-} // end anonymous namespace
-
-// A function hash is calculated by considering only the number of arguments and
-// whether a function is varargs, the order of basic blocks (given by the
-// successors of each basic block in depth first order), and the order of
-// opcodes of each instruction within each of these basic blocks. This mirrors
-// the strategy compare() uses to compare functions by walking the BBs in depth
-// first order and comparing each instruction in sequence. Because this hash
-// does not look at the operands, it is insensitive to things such as the
-// target of calls and the constants used in the function, which makes it useful
-// when possibly merging functions which are the same modulo constants and call
-// targets.
-FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
- HashAccumulator64 H;
- H.add(F.isVarArg());
- H.add(F.arg_size());
-
- SmallVector<const BasicBlock *, 8> BBs;
- SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
-
- // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
- // accumulating the hash of the function "structure." (BB and opcode sequence)
- BBs.push_back(&F.getEntryBlock());
- VisitedBBs.insert(BBs[0]);
- while (!BBs.empty()) {
- const BasicBlock *BB = BBs.pop_back_val();
- // This random value acts as a block header, as otherwise the partition of
- // opcodes into BBs wouldn't affect the hash, only the order of the opcodes
- H.add(45798);
- for (auto &Inst : *BB) {
- H.add(Inst.getOpcode());
- }
- const Instruction *Term = BB->getTerminator();
- for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
- if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
- continue;
- BBs.push_back(Term->getSuccessor(i));
- }
- }
- return H.getHash();
-}
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
+ }
+}
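+
+// One practical effect of the address-space handling at the top of cmpTypes
+// (assuming a DataLayout with 64-bit pointers): two distinct pointer types in
+// address space 0, say i8* and i32*, are both rewritten to the integer pointer
+// type i64 before comparison, so they are treated as equal here.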
+
+// Determine whether the two operations are the same except that pointer-to-A
+// and pointer-to-B are equivalent. This should be kept in sync with
+// Instruction::isSameOperationAs.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpOperations(const Instruction *L,
+ const Instruction *R,
+ bool &needToCmpOperands) const {
+ needToCmpOperands = true;
+ if (int Res = cmpValues(L, R))
+ return Res;
+
+ // Differences from Instruction::isSameOperationAs:
+ // * replace type comparison with calls to cmpTypes.
+ // * we test for I->getRawSubclassOptionalData (nuw/nsw/tail) at the top.
+ // * because of the above, we don't test for the tail bit on calls later on.
+ if (int Res = cmpNumbers(L->getOpcode(), R->getOpcode()))
+ return Res;
+
+ if (const GetElementPtrInst *GEPL = dyn_cast<GetElementPtrInst>(L)) {
+ needToCmpOperands = false;
+ const GetElementPtrInst *GEPR = cast<GetElementPtrInst>(R);
+ if (int Res =
+ cmpValues(GEPL->getPointerOperand(), GEPR->getPointerOperand()))
+ return Res;
+ return cmpGEPs(GEPL, GEPR);
+ }
+
+ if (int Res = cmpNumbers(L->getNumOperands(), R->getNumOperands()))
+ return Res;
+
+ if (int Res = cmpTypes(L->getType(), R->getType()))
+ return Res;
+
+ if (int Res = cmpNumbers(L->getRawSubclassOptionalData(),
+ R->getRawSubclassOptionalData()))
+ return Res;
+
+ // We have two instructions of identical opcode and #operands. Check to see
+ // if all operands are the same type
+ for (unsigned i = 0, e = L->getNumOperands(); i != e; ++i) {
+ if (int Res =
+ cmpTypes(L->getOperand(i)->getType(), R->getOperand(i)->getType()))
+ return Res;
+ }
+
+ // Check special state that is a part of some instructions.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(L)) {
+ if (int Res = cmpTypes(AI->getAllocatedType(),
+ cast<AllocaInst>(R)->getAllocatedType()))
+ return Res;
+ return cmpNumbers(AI->getAlignment(), cast<AllocaInst>(R)->getAlignment());
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(L)) {
+ if (int Res = cmpNumbers(LI->isVolatile(), cast<LoadInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(LI->getAlignment(), cast<LoadInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(LI->getOrdering(), cast<LoadInst>(R)->getOrdering()))
+ return Res;
+ if (int Res = cmpNumbers(LI->getSyncScopeID(),
+ cast<LoadInst>(R)->getSyncScopeID()))
+ return Res;
+ return cmpRangeMetadata(
+ LI->getMetadata(LLVMContext::MD_range),
+ cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
+ }
+ if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
+ if (int Res =
+ cmpNumbers(SI->isVolatile(), cast<StoreInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(SI->getAlignment(), cast<StoreInst>(R)->getAlignment()))
+ return Res;
+ if (int Res =
+ cmpOrderings(SI->getOrdering(), cast<StoreInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(SI->getSyncScopeID(),
+ cast<StoreInst>(R)->getSyncScopeID());
+ }
+ if (const CmpInst *CI = dyn_cast<CmpInst>(L))
+ return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
+ if (auto *CBL = dyn_cast<CallBase>(L)) {
+ auto *CBR = cast<CallBase>(R);
+ if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv()))
+ return Res;
+ if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes()))
+ return Res;
+ if (int Res = cmpOperandBundlesSchema(*CBL, *CBR))
+ return Res;
+ if (const CallInst *CI = dyn_cast<CallInst>(L))
+ if (int Res = cmpNumbers(CI->getTailCallKind(),
+ cast<CallInst>(R)->getTailCallKind()))
+ return Res;
+ return cmpRangeMetadata(L->getMetadata(LLVMContext::MD_range),
+ R->getMetadata(LLVMContext::MD_range));
+ }
+ if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = IVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<InsertValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ return 0;
+ }
+ if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(L)) {
+ ArrayRef<unsigned> LIndices = EVI->getIndices();
+ ArrayRef<unsigned> RIndices = cast<ExtractValueInst>(R)->getIndices();
+ if (int Res = cmpNumbers(LIndices.size(), RIndices.size()))
+ return Res;
+ for (size_t i = 0, e = LIndices.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LIndices[i], RIndices[i]))
+ return Res;
+ }
+ }
+ if (const FenceInst *FI = dyn_cast<FenceInst>(L)) {
+ if (int Res =
+ cmpOrderings(FI->getOrdering(), cast<FenceInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(FI->getSyncScopeID(),
+ cast<FenceInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(L)) {
+ if (int Res = cmpNumbers(CXI->isVolatile(),
+ cast<AtomicCmpXchgInst>(R)->isVolatile()))
+ return Res;
+ if (int Res =
+ cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getSuccessOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getSuccessOrdering()))
+ return Res;
+ if (int Res =
+ cmpOrderings(CXI->getFailureOrdering(),
+ cast<AtomicCmpXchgInst>(R)->getFailureOrdering()))
+ return Res;
+ return cmpNumbers(CXI->getSyncScopeID(),
+ cast<AtomicCmpXchgInst>(R)->getSyncScopeID());
+ }
+ if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(L)) {
+ if (int Res = cmpNumbers(RMWI->getOperation(),
+ cast<AtomicRMWInst>(R)->getOperation()))
+ return Res;
+ if (int Res = cmpNumbers(RMWI->isVolatile(),
+ cast<AtomicRMWInst>(R)->isVolatile()))
+ return Res;
+ if (int Res = cmpOrderings(RMWI->getOrdering(),
+ cast<AtomicRMWInst>(R)->getOrdering()))
+ return Res;
+ return cmpNumbers(RMWI->getSyncScopeID(),
+ cast<AtomicRMWInst>(R)->getSyncScopeID());
+ }
+ if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) {
+ ArrayRef<int> LMask = SVI->getShuffleMask();
+ ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask();
+ if (int Res = cmpNumbers(LMask.size(), RMask.size()))
+ return Res;
+ for (size_t i = 0, e = LMask.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LMask[i], RMask[i]))
+ return Res;
+ }
+ }
+ if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
+ const PHINode *PNR = cast<PHINode>(R);
+ // Ensure that in addition to the incoming values being identical
+ // (checked by the caller of this function), the incoming blocks
+ // are also identical.
+ for (unsigned i = 0, e = PNL->getNumIncomingValues(); i != e; ++i) {
+ if (int Res =
+ cmpValues(PNL->getIncomingBlock(i), PNR->getIncomingBlock(i)))
+ return Res;
+ }
+ }
+ return 0;
+}
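// Editor's aside, not part of the diff above: each cmpXxx helper returns a
// three-way result (negative/zero/positive), and the repeated
// "if (int Res = ...) return Res;" chains compose those results
// lexicographically, so the first differing field decides the ordering. A
// stand-alone sketch of the same idiom (hypothetical names, assumes <cstdint>):
static int threeWay(uint64_t L, uint64_t R) { return (L > R) - (L < R); }
static int cmpLexicographic(uint64_t AL, uint64_t AR,
                            uint64_t BL, uint64_t BR) {
  if (int Res = threeWay(AL, AR))
    return Res;            // the first key differs and decides the order
  return threeWay(BL, BR); // otherwise the second key decides
}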
+
+// Determine whether two GEP operations perform the same underlying arithmetic.
+// Read method declaration comments for more details.
+int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
+ const GEPOperator *GEPR) const {
+ unsigned int ASL = GEPL->getPointerAddressSpace();
+ unsigned int ASR = GEPR->getPointerAddressSpace();
+
+ if (int Res = cmpNumbers(ASL, ASR))
+ return Res;
+
+ // When we have target data, we can reduce the GEP down to the value in bytes
+ // added to the address.
+ const DataLayout &DL = FnL->getParent()->getDataLayout();
+ unsigned BitWidth = DL.getPointerSizeInBits(ASL);
+ APInt OffsetL(BitWidth, 0), OffsetR(BitWidth, 0);
+ if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
+ GEPR->accumulateConstantOffset(DL, OffsetR))
+ return cmpAPInts(OffsetL, OffsetR);
+ if (int Res =
+ cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType()))
+ return Res;
+
+ if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
+ return Res;
+
+ for (unsigned i = 0, e = GEPL->getNumOperands(); i != e; ++i) {
+ if (int Res = cmpValues(GEPL->getOperand(i), GEPR->getOperand(i)))
+ return Res;
+ }
+
+ return 0;
+}
+
+int FunctionComparator::cmpInlineAsm(const InlineAsm *L,
+ const InlineAsm *R) const {
+ // InlineAsm's are uniqued. If they are the same pointer, obviously they are
+ // the same, otherwise compare the fields.
+ if (L == R)
+ return 0;
+ if (int Res = cmpTypes(L->getFunctionType(), R->getFunctionType()))
+ return Res;
+ if (int Res = cmpMem(L->getAsmString(), R->getAsmString()))
+ return Res;
+ if (int Res = cmpMem(L->getConstraintString(), R->getConstraintString()))
+ return Res;
+ if (int Res = cmpNumbers(L->hasSideEffects(), R->hasSideEffects()))
+ return Res;
+ if (int Res = cmpNumbers(L->isAlignStack(), R->isAlignStack()))
+ return Res;
+ if (int Res = cmpNumbers(L->getDialect(), R->getDialect()))
+ return Res;
+ assert(L->getFunctionType() != R->getFunctionType());
+ return 0;
+}
+
+/// Compare two values used by the two functions under pair-wise comparison. If
+/// this is the first time the values are seen, they're added to the mapping so
+/// that we will detect mismatches on next use.
+/// See comments in declaration for more details.
+int FunctionComparator::cmpValues(const Value *L, const Value *R) const {
+ // Catch self-reference case.
+ if (L == FnL) {
+ if (R == FnR)
+ return 0;
+ return -1;
+ }
+ if (R == FnR) {
+ if (L == FnL)
+ return 0;
+ return 1;
+ }
+
+ const Constant *ConstL = dyn_cast<Constant>(L);
+ const Constant *ConstR = dyn_cast<Constant>(R);
+ if (ConstL && ConstR) {
+ if (L == R)
+ return 0;
+ return cmpConstants(ConstL, ConstR);
+ }
+
+ if (ConstL)
+ return 1;
+ if (ConstR)
+ return -1;
+
+ const InlineAsm *InlineAsmL = dyn_cast<InlineAsm>(L);
+ const InlineAsm *InlineAsmR = dyn_cast<InlineAsm>(R);
+
+ if (InlineAsmL && InlineAsmR)
+ return cmpInlineAsm(InlineAsmL, InlineAsmR);
+ if (InlineAsmL)
+ return 1;
+ if (InlineAsmR)
+ return -1;
+
+ auto LeftSN = sn_mapL.insert(std::make_pair(L, sn_mapL.size())),
+ RightSN = sn_mapR.insert(std::make_pair(R, sn_mapR.size()));
+
+ return cmpNumbers(LeftSN.first->second, RightSN.first->second);
+}
+
+// Test whether two basic blocks have equivalent behaviour.
+int FunctionComparator::cmpBasicBlocks(const BasicBlock *BBL,
+ const BasicBlock *BBR) const {
+ BasicBlock::const_iterator InstL = BBL->begin(), InstLE = BBL->end();
+ BasicBlock::const_iterator InstR = BBR->begin(), InstRE = BBR->end();
+
+ do {
+ bool needToCmpOperands = true;
+ if (int Res = cmpOperations(&*InstL, &*InstR, needToCmpOperands))
+ return Res;
+ if (needToCmpOperands) {
+ assert(InstL->getNumOperands() == InstR->getNumOperands());
+
+ for (unsigned i = 0, e = InstL->getNumOperands(); i != e; ++i) {
+ Value *OpL = InstL->getOperand(i);
+ Value *OpR = InstR->getOperand(i);
+ if (int Res = cmpValues(OpL, OpR))
+ return Res;
+ // cmpValues should ensure this is true.
+ assert(cmpTypes(OpL->getType(), OpR->getType()) == 0);
+ }
+ }
+
+ ++InstL;
+ ++InstR;
+ } while (InstL != InstLE && InstR != InstRE);
+
+ if (InstL != InstLE && InstR == InstRE)
+ return 1;
+ if (InstL == InstLE && InstR != InstRE)
+ return -1;
+ return 0;
+}
+
+int FunctionComparator::compareSignature() const {
+ if (int Res = cmpAttrs(FnL->getAttributes(), FnR->getAttributes()))
+ return Res;
+
+ if (int Res = cmpNumbers(FnL->hasGC(), FnR->hasGC()))
+ return Res;
+
+ if (FnL->hasGC()) {
+ if (int Res = cmpMem(FnL->getGC(), FnR->getGC()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->hasSection(), FnR->hasSection()))
+ return Res;
+
+ if (FnL->hasSection()) {
+ if (int Res = cmpMem(FnL->getSection(), FnR->getSection()))
+ return Res;
+ }
+
+ if (int Res = cmpNumbers(FnL->isVarArg(), FnR->isVarArg()))
+ return Res;
+
+ // TODO: if it's internal and only used in direct calls, we could handle this
+ // case too.
+ if (int Res = cmpNumbers(FnL->getCallingConv(), FnR->getCallingConv()))
+ return Res;
+
+ if (int Res = cmpTypes(FnL->getFunctionType(), FnR->getFunctionType()))
+ return Res;
+
+ assert(FnL->arg_size() == FnR->arg_size() &&
+ "Identically typed functions have different numbers of args!");
+
+ // Visit the arguments so that they get enumerated in the order they're
+ // passed in.
+ for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
+ ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
+ if (cmpValues(&*ArgLI, &*ArgRI) != 0)
+ llvm_unreachable("Arguments repeat!");
+ }
+ return 0;
+}
+
+// Test whether the two functions have equivalent behaviour.
+int FunctionComparator::compare() {
+ beginCompare();
+
+ if (int Res = compareSignature())
+ return Res;
+
+ // We do a CFG-ordered walk since the actual ordering of the blocks in the
+ // linked list is immaterial. Our walk starts at the entry block for both
+ // functions, then takes each block from each terminator in order. As an
+ // artifact, this also means that unreachable blocks are ignored.
+ SmallVector<const BasicBlock *, 8> FnLBBs, FnRBBs;
+ SmallPtrSet<const BasicBlock *, 32> VisitedBBs; // in terms of F1.
+
+ FnLBBs.push_back(&FnL->getEntryBlock());
+ FnRBBs.push_back(&FnR->getEntryBlock());
+
+ VisitedBBs.insert(FnLBBs[0]);
+ while (!FnLBBs.empty()) {
+ const BasicBlock *BBL = FnLBBs.pop_back_val();
+ const BasicBlock *BBR = FnRBBs.pop_back_val();
+
+ if (int Res = cmpValues(BBL, BBR))
+ return Res;
+
+ if (int Res = cmpBasicBlocks(BBL, BBR))
+ return Res;
+
+ const Instruction *TermL = BBL->getTerminator();
+ const Instruction *TermR = BBR->getTerminator();
+
+ assert(TermL->getNumSuccessors() == TermR->getNumSuccessors());
+ for (unsigned i = 0, e = TermL->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(TermL->getSuccessor(i)).second)
+ continue;
+
+ FnLBBs.push_back(TermL->getSuccessor(i));
+ FnRBBs.push_back(TermR->getSuccessor(i));
+ }
+ }
+ return 0;
+}
+
+namespace {
+
+// Accumulate the hash of a sequence of 64-bit integers. This is similar to a
+// hash of a sequence of 64-bit ints, but the entire input does not need to be
+// available at once. This interface is necessary for functionHash because it
+// needs to accumulate the hash as the structure of the function is traversed
+// without saving these values to an intermediate buffer. This form of hashing
+// is not often needed, as usually the object to hash is just read from a
+// buffer.
+class HashAccumulator64 {
+ uint64_t Hash;
+
+public:
+ // Initialize to random constant, so the state isn't zero.
+ HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
+
+ void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
+
+ // No finishing is required, because the entire hash value is used.
+ uint64_t getHash() { return Hash; }
+};
+
+} // end anonymous namespace
+
+// A function hash is calculated by considering only the number of arguments,
+// whether the function is varargs, the order of basic blocks (given by the
+// successors of each basic block in depth-first order), and the order of the
+// opcodes of the instructions within each of these basic blocks. This mirrors
+// the strategy compare() uses to compare functions by walking the BBs in
+// depth-first order and comparing each instruction in sequence. Because this
+// hash does not look at the operands, it is insensitive to things such as the
+// targets of calls and the constants used in the function, which makes it
+// useful when possibly merging functions that are the same modulo constants
+// and call targets.
+FunctionComparator::FunctionHash FunctionComparator::functionHash(Function &F) {
+ HashAccumulator64 H;
+ H.add(F.isVarArg());
+ H.add(F.arg_size());
+
+ SmallVector<const BasicBlock *, 8> BBs;
+ SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
+
+ // Walk the blocks in the same order as FunctionComparator::cmpBasicBlocks(),
+  // accumulating the hash of the function "structure" (BB and opcode sequence).
+ BBs.push_back(&F.getEntryBlock());
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
+ // This random value acts as a block header, as otherwise the partition of
+    // opcodes into BBs wouldn't affect the hash, only the order of the opcodes.
+ H.add(45798);
+ for (auto &Inst : *BB) {
+ H.add(Inst.getOpcode());
+ }
+ const Instruction *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
+ continue;
+ BBs.push_back(Term->getSuccessor(i));
+ }
+ }
+ return H.getHash();
+}
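Editor's aside on usage (not part of the diff): a MergeFunctions-style caller
would typically bucket candidates by the cheap functionHash() above and only
run the full structural compare() within a bucket. A minimal sketch, assuming
the FunctionComparator/GlobalNumberState API from
llvm/Transforms/Utils/FunctionComparator.h:

    #include "llvm/Transforms/Utils/FunctionComparator.h"
    using namespace llvm;

    // Returns true iff the two function bodies are structurally equivalent.
    static bool areFunctionsEquivalent(Function &F1, Function &F2,
                                       GlobalNumberState &GN) {
      // Cheap reject: equivalent bodies always hash equally, so different
      // hashes can never compare equal.
      if (FunctionComparator::functionHash(F1) !=
          FunctionComparator::functionHash(F2))
        return false;
      // Full three-way comparison; 0 means "equivalent".
      return FunctionComparator(&F1, &F2, &GN).compare() == 0;
    }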
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
index 28f69a8b6a..8df7ae9563 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -1,334 +1,334 @@
-//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the FunctionImportGlobalProcessing class, used
-// to perform the necessary global value handling for function importing.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/InstIterator.h"
-using namespace llvm;
-
-/// Checks if we should import SGV as a definition, otherwise import as a
-/// declaration.
-bool FunctionImportGlobalProcessing::doImportAsDefinition(
- const GlobalValue *SGV) {
- if (!isPerformingImport())
- return false;
-
- // Only import the globals requested for importing.
- if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
- return false;
-
- assert(!isa<GlobalAlias>(SGV) &&
- "Unexpected global alias in the import list.");
-
- // Otherwise yes.
- return true;
-}
-
-bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
- const GlobalValue *SGV, ValueInfo VI) {
- assert(SGV->hasLocalLinkage());
- // Both the imported references and the original local variable must
- // be promoted.
- if (!isPerformingImport() && !isModuleExporting())
- return false;
-
- if (isPerformingImport()) {
- assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
- !isNonRenamableLocal(*SGV)) &&
- "Attempting to promote non-renamable local");
- // We don't know for sure yet if we are importing this value (as either
- // a reference or a def), since we are simply walking all values in the
- // module. But by necessity if we end up importing it and it is local,
- // it must be promoted, so unconditionally promote all values in the
- // importing module.
- return true;
- }
-
- // When exporting, consult the index. We can have more than one local
- // with the same GUID, in the case of same-named locals in different but
- // same-named source files that were compiled in their respective directories
- // (so the source file name and resulting GUID is the same). Find the one
- // in this module.
- auto Summary = ImportIndex.findSummaryInModule(
- VI, SGV->getParent()->getModuleIdentifier());
- assert(Summary && "Missing summary for global value when exporting");
- auto Linkage = Summary->linkage();
- if (!GlobalValue::isLocalLinkage(Linkage)) {
- assert(!isNonRenamableLocal(*SGV) &&
- "Attempting to promote non-renamable local");
- return true;
- }
-
- return false;
-}
-
-#ifndef NDEBUG
-bool FunctionImportGlobalProcessing::isNonRenamableLocal(
- const GlobalValue &GV) const {
- if (!GV.hasLocalLinkage())
- return false;
- // This needs to stay in sync with the logic in buildModuleSummaryIndex.
- if (GV.hasSection())
- return true;
- if (Used.count(const_cast<GlobalValue *>(&GV)))
- return true;
- return false;
-}
-#endif
-
-std::string
-FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) {
- assert(SGV->hasLocalLinkage());
- // For locals that must be promoted to global scope, ensure that
- // the promoted name uniquely identifies the copy in the original module,
- // using the ID assigned during combined index creation.
- return ModuleSummaryIndex::getGlobalNameForLocal(
- SGV->getName(),
- ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
-}
-
-GlobalValue::LinkageTypes
-FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
- bool DoPromote) {
- // Any local variable that is referenced by an exported function needs
- // to be promoted to global scope. Since we don't currently know which
- // functions reference which local variables/functions, we must treat
- // all as potentially exported if this module is exporting anything.
- if (isModuleExporting()) {
- if (SGV->hasLocalLinkage() && DoPromote)
- return GlobalValue::ExternalLinkage;
- return SGV->getLinkage();
- }
-
- // Otherwise, if we aren't importing, no linkage change is needed.
- if (!isPerformingImport())
- return SGV->getLinkage();
-
- switch (SGV->getLinkage()) {
- case GlobalValue::LinkOnceODRLinkage:
- case GlobalValue::ExternalLinkage:
- // External and linkonce definitions are converted to available_externally
- // definitions upon import, so that they are available for inlining
- // and/or optimization, but are turned into declarations later
- // during the EliminateAvailableExternally pass.
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- // An imported external declaration stays external.
- return SGV->getLinkage();
-
- case GlobalValue::AvailableExternallyLinkage:
- // An imported available_externally definition converts
- // to external if imported as a declaration.
- if (!doImportAsDefinition(SGV))
- return GlobalValue::ExternalLinkage;
- // An imported available_externally declaration stays that way.
- return SGV->getLinkage();
-
- case GlobalValue::LinkOnceAnyLinkage:
- case GlobalValue::WeakAnyLinkage:
- // Can't import linkonce_any/weak_any definitions correctly, or we might
- // change the program semantics, since the linker will pick the first
- // linkonce_any/weak_any definition and importing would change the order
- // they are seen by the linker. The module linking caller needs to enforce
- // this.
- assert(!doImportAsDefinition(SGV));
- // If imported as a declaration, it becomes external_weak.
- return SGV->getLinkage();
-
- case GlobalValue::WeakODRLinkage:
- // For weak_odr linkage, there is a guarantee that all copies will be
- // equivalent, so the issue described above for weak_any does not exist,
- // and the definition can be imported. It can be treated similarly
- // to an imported externally visible global value.
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
-
- case GlobalValue::AppendingLinkage:
- // It would be incorrect to import an appending linkage variable,
- // since it would cause global constructors/destructors to be
- // executed multiple times. This should have already been handled
- // by linkIfNeeded, and we will assert in shouldLinkFromSource
- // if we try to import, so we simply return AppendingLinkage.
- return GlobalValue::AppendingLinkage;
-
- case GlobalValue::InternalLinkage:
- case GlobalValue::PrivateLinkage:
- // If we are promoting the local to global scope, it is handled
- // similarly to a normal externally visible global.
- if (DoPromote) {
- if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
- return GlobalValue::AvailableExternallyLinkage;
- else
- return GlobalValue::ExternalLinkage;
- }
- // A non-promoted imported local definition stays local.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
-
- case GlobalValue::ExternalWeakLinkage:
- // External weak doesn't apply to definitions, must be a declaration.
- assert(!doImportAsDefinition(SGV));
- // Linkage stays external_weak.
- return SGV->getLinkage();
-
- case GlobalValue::CommonLinkage:
- // Linkage stays common on definitions.
- // The ThinLTO pass will eventually force-import their definitions.
- return SGV->getLinkage();
- }
-
- llvm_unreachable("unknown linkage type");
-}
-
-void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
-
- ValueInfo VI;
- if (GV.hasName()) {
- VI = ImportIndex.getValueInfo(GV.getGUID());
- // Set synthetic function entry counts.
- if (VI && ImportIndex.hasSyntheticEntryCounts()) {
- if (Function *F = dyn_cast<Function>(&GV)) {
- if (!F->isDeclaration()) {
- for (auto &S : VI.getSummaryList()) {
- auto *FS = cast<FunctionSummary>(S->getBaseObject());
- if (FS->modulePath() == M.getModuleIdentifier()) {
- F->setEntryCount(Function::ProfileCount(FS->entryCount(),
- Function::PCT_Synthetic));
- break;
- }
- }
- }
- }
- }
- }
-
- // We should always have a ValueInfo (i.e. GV in index) for definitions when
- // we are exporting, and also when importing that value.
- assert(VI || GV.isDeclaration() ||
- (isPerformingImport() && !doImportAsDefinition(&GV)));
-
- // Mark read/write-only variables which can be imported with specific
- // attribute. We can't internalize them now because IRMover will fail
- // to link variable definitions to their external declarations during
- // ThinLTO import. We'll internalize read-only variables later, after
- // import is finished. See internalizeGVsAfterImport.
- //
- // If global value dead stripping is not enabled in summary then
- // propagateConstants hasn't been run. We can't internalize GV
- // in such case.
- if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) {
- if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
- // We can have more than one local with the same GUID, in the case of
- // same-named locals in different but same-named source files that were
- // compiled in their respective directories (so the source file name
- // and resulting GUID is the same). Find the one in this module.
- // Handle the case where there is no summary found in this module. That
- // can happen in the distributed ThinLTO backend, because the index only
- // contains summaries from the source modules if they are being imported.
- // We might have a non-null VI and get here even in that case if the name
- // matches one in this module (e.g. weak or appending linkage).
- auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
- ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier()));
- if (GVS &&
- (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) {
- V->addAttribute("thinlto-internalize");
- // Objects referenced by writeonly GV initializer should not be
- // promoted, because there is no any kind of read access to them
- // on behalf of this writeonly GV. To avoid promotion we convert
- // GV initializer to 'zeroinitializer'. This effectively drops
- // references in IR module (not in combined index), so we can
- // ignore them when computing import. We do not export references
- // of writeonly object. See computeImportForReferencedGlobals
- if (ImportIndex.isWriteOnly(GVS))
- V->setInitializer(Constant::getNullValue(V->getValueType()));
- }
- }
- }
-
- if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) {
- // Save the original name string before we rename GV below.
- auto Name = GV.getName().str();
- GV.setName(getPromotedName(&GV));
- GV.setLinkage(getLinkage(&GV, /* DoPromote */ true));
- assert(!GV.hasLocalLinkage());
- GV.setVisibility(GlobalValue::HiddenVisibility);
-
- // If we are renaming a COMDAT leader, ensure that we record the COMDAT
- // for later renaming as well. This is required for COFF.
- if (const auto *C = GV.getComdat())
- if (C->getName() == Name)
- RenamedComdats.try_emplace(C, M.getOrInsertComdat(GV.getName()));
- } else
- GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
-
- // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is
- // converted to a declaration, to disable direct access. Don't do this if GV
- // is implicitly dso_local due to a non-default visibility.
- if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() &&
- !GV.isImplicitDSOLocal()) {
- GV.setDSOLocal(false);
- } else if (VI && VI.isDSOLocal()) {
- // If all summaries are dso_local, symbol gets resolved to a known local
- // definition.
- GV.setDSOLocal(true);
- if (GV.hasDLLImportStorageClass())
- GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
- }
-
- // Remove functions imported as available externally defs from comdats,
- // as this is a declaration for the linker, and will be dropped eventually.
- // It is illegal for comdats to contain declarations.
- auto *GO = dyn_cast<GlobalObject>(&GV);
- if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
- // The IRMover should not have placed any imported declarations in
- // a comdat, so the only declaration that should be in a comdat
- // at this point would be a definition imported as available_externally.
- assert(GO->hasAvailableExternallyLinkage() &&
- "Expected comdat on definition (possibly available external)");
- GO->setComdat(nullptr);
- }
-}
-
-void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
- for (GlobalVariable &GV : M.globals())
- processGlobalForThinLTO(GV);
- for (Function &SF : M)
- processGlobalForThinLTO(SF);
- for (GlobalAlias &GA : M.aliases())
- processGlobalForThinLTO(GA);
-
- // Replace any COMDATS that required renaming (because the COMDAT leader was
- // promoted and renamed).
- if (!RenamedComdats.empty())
- for (auto &GO : M.global_objects())
- if (auto *C = GO.getComdat()) {
- auto Replacement = RenamedComdats.find(C);
- if (Replacement != RenamedComdats.end())
- GO.setComdat(Replacement->second);
- }
-}
-
-bool FunctionImportGlobalProcessing::run() {
- processGlobalsForThinLTO();
- return false;
-}
-
-bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
- bool ClearDSOLocalOnDeclarations,
- SetVector<GlobalValue *> *GlobalsToImport) {
- FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
- ClearDSOLocalOnDeclarations);
- return ThinLTOProcessing.run();
-}
+//===- lib/Transforms/Utils/FunctionImportUtils.cpp - Importing utilities -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the FunctionImportGlobalProcessing class, used
+// to perform the necessary global value handling for function importing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/FunctionImportUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/InstIterator.h"
+using namespace llvm;
+
+/// Checks if we should import SGV as a definition, otherwise import as a
+/// declaration.
+bool FunctionImportGlobalProcessing::doImportAsDefinition(
+ const GlobalValue *SGV) {
+ if (!isPerformingImport())
+ return false;
+
+ // Only import the globals requested for importing.
+ if (!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
+ return false;
+
+ assert(!isa<GlobalAlias>(SGV) &&
+ "Unexpected global alias in the import list.");
+
+ // Otherwise yes.
+ return true;
+}
+
+bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
+ const GlobalValue *SGV, ValueInfo VI) {
+ assert(SGV->hasLocalLinkage());
+ // Both the imported references and the original local variable must
+ // be promoted.
+ if (!isPerformingImport() && !isModuleExporting())
+ return false;
+
+ if (isPerformingImport()) {
+ assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+ !isNonRenamableLocal(*SGV)) &&
+ "Attempting to promote non-renamable local");
+ // We don't know for sure yet if we are importing this value (as either
+ // a reference or a def), since we are simply walking all values in the
+ // module. But by necessity if we end up importing it and it is local,
+ // it must be promoted, so unconditionally promote all values in the
+ // importing module.
+ return true;
+ }
+
+ // When exporting, consult the index. We can have more than one local
+ // with the same GUID, in the case of same-named locals in different but
+ // same-named source files that were compiled in their respective directories
+ // (so the source file name and resulting GUID is the same). Find the one
+ // in this module.
+ auto Summary = ImportIndex.findSummaryInModule(
+ VI, SGV->getParent()->getModuleIdentifier());
+ assert(Summary && "Missing summary for global value when exporting");
+ auto Linkage = Summary->linkage();
+ if (!GlobalValue::isLocalLinkage(Linkage)) {
+ assert(!isNonRenamableLocal(*SGV) &&
+ "Attempting to promote non-renamable local");
+ return true;
+ }
+
+ return false;
+}
+
+#ifndef NDEBUG
+bool FunctionImportGlobalProcessing::isNonRenamableLocal(
+ const GlobalValue &GV) const {
+ if (!GV.hasLocalLinkage())
+ return false;
+ // This needs to stay in sync with the logic in buildModuleSummaryIndex.
+ if (GV.hasSection())
+ return true;
+ if (Used.count(const_cast<GlobalValue *>(&GV)))
+ return true;
+ return false;
+}
+#endif
+
+std::string
+FunctionImportGlobalProcessing::getPromotedName(const GlobalValue *SGV) {
+ assert(SGV->hasLocalLinkage());
+ // For locals that must be promoted to global scope, ensure that
+ // the promoted name uniquely identifies the copy in the original module,
+ // using the ID assigned during combined index creation.
+ return ModuleSummaryIndex::getGlobalNameForLocal(
+ SGV->getName(),
+ ImportIndex.getModuleHash(SGV->getParent()->getModuleIdentifier()));
+}
+
+GlobalValue::LinkageTypes
+FunctionImportGlobalProcessing::getLinkage(const GlobalValue *SGV,
+ bool DoPromote) {
+ // Any local variable that is referenced by an exported function needs
+ // to be promoted to global scope. Since we don't currently know which
+ // functions reference which local variables/functions, we must treat
+ // all as potentially exported if this module is exporting anything.
+ if (isModuleExporting()) {
+ if (SGV->hasLocalLinkage() && DoPromote)
+ return GlobalValue::ExternalLinkage;
+ return SGV->getLinkage();
+ }
+
+ // Otherwise, if we aren't importing, no linkage change is needed.
+ if (!isPerformingImport())
+ return SGV->getLinkage();
+
+ switch (SGV->getLinkage()) {
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::ExternalLinkage:
+ // External and linkonce definitions are converted to available_externally
+ // definitions upon import, so that they are available for inlining
+ // and/or optimization, but are turned into declarations later
+ // during the EliminateAvailableExternally pass.
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ // An imported external declaration stays external.
+ return SGV->getLinkage();
+
+ case GlobalValue::AvailableExternallyLinkage:
+ // An imported available_externally definition converts
+ // to external if imported as a declaration.
+ if (!doImportAsDefinition(SGV))
+ return GlobalValue::ExternalLinkage;
+ // An imported available_externally declaration stays that way.
+ return SGV->getLinkage();
+
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ // Can't import linkonce_any/weak_any definitions correctly, or we might
+ // change the program semantics, since the linker will pick the first
+ // linkonce_any/weak_any definition and importing would change the order
+ // they are seen by the linker. The module linking caller needs to enforce
+ // this.
+ assert(!doImportAsDefinition(SGV));
+ // If imported as a declaration, it becomes external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::WeakODRLinkage:
+ // For weak_odr linkage, there is a guarantee that all copies will be
+ // equivalent, so the issue described above for weak_any does not exist,
+ // and the definition can be imported. It can be treated similarly
+ // to an imported externally visible global value.
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+
+ case GlobalValue::AppendingLinkage:
+ // It would be incorrect to import an appending linkage variable,
+ // since it would cause global constructors/destructors to be
+ // executed multiple times. This should have already been handled
+ // by linkIfNeeded, and we will assert in shouldLinkFromSource
+ // if we try to import, so we simply return AppendingLinkage.
+ return GlobalValue::AppendingLinkage;
+
+ case GlobalValue::InternalLinkage:
+ case GlobalValue::PrivateLinkage:
+ // If we are promoting the local to global scope, it is handled
+ // similarly to a normal externally visible global.
+ if (DoPromote) {
+ if (doImportAsDefinition(SGV) && !isa<GlobalAlias>(SGV))
+ return GlobalValue::AvailableExternallyLinkage;
+ else
+ return GlobalValue::ExternalLinkage;
+ }
+ // A non-promoted imported local definition stays local.
+    // The ThinLTO pass will eventually force-import its definition.
+ return SGV->getLinkage();
+
+ case GlobalValue::ExternalWeakLinkage:
+ // External weak doesn't apply to definitions, must be a declaration.
+ assert(!doImportAsDefinition(SGV));
+ // Linkage stays external_weak.
+ return SGV->getLinkage();
+
+ case GlobalValue::CommonLinkage:
+ // Linkage stays common on definitions.
+ // The ThinLTO pass will eventually force-import their definitions.
+ return SGV->getLinkage();
+ }
+
+ llvm_unreachable("unknown linkage type");
+}
+
+void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
+
+ ValueInfo VI;
+ if (GV.hasName()) {
+ VI = ImportIndex.getValueInfo(GV.getGUID());
+ // Set synthetic function entry counts.
+ if (VI && ImportIndex.hasSyntheticEntryCounts()) {
+ if (Function *F = dyn_cast<Function>(&GV)) {
+ if (!F->isDeclaration()) {
+ for (auto &S : VI.getSummaryList()) {
+ auto *FS = cast<FunctionSummary>(S->getBaseObject());
+ if (FS->modulePath() == M.getModuleIdentifier()) {
+ F->setEntryCount(Function::ProfileCount(FS->entryCount(),
+ Function::PCT_Synthetic));
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // We should always have a ValueInfo (i.e. GV in index) for definitions when
+ // we are exporting, and also when importing that value.
+ assert(VI || GV.isDeclaration() ||
+ (isPerformingImport() && !doImportAsDefinition(&GV)));
+
+ // Mark read/write-only variables which can be imported with specific
+ // attribute. We can't internalize them now because IRMover will fail
+ // to link variable definitions to their external declarations during
+ // ThinLTO import. We'll internalize read-only variables later, after
+ // import is finished. See internalizeGVsAfterImport.
+ //
+ // If global value dead stripping is not enabled in summary then
+ // propagateConstants hasn't been run. We can't internalize GV
+ // in such case.
+ if (!GV.isDeclaration() && VI && ImportIndex.withAttributePropagation()) {
+ if (GlobalVariable *V = dyn_cast<GlobalVariable>(&GV)) {
+ // We can have more than one local with the same GUID, in the case of
+ // same-named locals in different but same-named source files that were
+ // compiled in their respective directories (so the source file name
+ // and resulting GUID is the same). Find the one in this module.
+ // Handle the case where there is no summary found in this module. That
+ // can happen in the distributed ThinLTO backend, because the index only
+ // contains summaries from the source modules if they are being imported.
+ // We might have a non-null VI and get here even in that case if the name
+ // matches one in this module (e.g. weak or appending linkage).
+ auto *GVS = dyn_cast_or_null<GlobalVarSummary>(
+ ImportIndex.findSummaryInModule(VI, M.getModuleIdentifier()));
+ if (GVS &&
+ (ImportIndex.isReadOnly(GVS) || ImportIndex.isWriteOnly(GVS))) {
+ V->addAttribute("thinlto-internalize");
+        // Objects referenced by a writeonly GV initializer should not be
+        // promoted, because there is no read access to them on behalf of
+        // this writeonly GV. To avoid promotion we convert the GV
+        // initializer to 'zeroinitializer'. This effectively drops the
+        // references in the IR module (not in the combined index), so we can
+        // ignore them when computing the import. We do not export references
+        // of a writeonly object. See computeImportForReferencedGlobals.
+ if (ImportIndex.isWriteOnly(GVS))
+ V->setInitializer(Constant::getNullValue(V->getValueType()));
+ }
+ }
+ }
+
+ if (GV.hasLocalLinkage() && shouldPromoteLocalToGlobal(&GV, VI)) {
+ // Save the original name string before we rename GV below.
+ auto Name = GV.getName().str();
+ GV.setName(getPromotedName(&GV));
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ true));
+ assert(!GV.hasLocalLinkage());
+ GV.setVisibility(GlobalValue::HiddenVisibility);
+
+ // If we are renaming a COMDAT leader, ensure that we record the COMDAT
+ // for later renaming as well. This is required for COFF.
+ if (const auto *C = GV.getComdat())
+ if (C->getName() == Name)
+ RenamedComdats.try_emplace(C, M.getOrInsertComdat(GV.getName()));
+ } else
+ GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
+
+ // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is
+ // converted to a declaration, to disable direct access. Don't do this if GV
+ // is implicitly dso_local due to a non-default visibility.
+ if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() &&
+ !GV.isImplicitDSOLocal()) {
+ GV.setDSOLocal(false);
+ } else if (VI && VI.isDSOLocal()) {
+ // If all summaries are dso_local, symbol gets resolved to a known local
+ // definition.
+ GV.setDSOLocal(true);
+ if (GV.hasDLLImportStorageClass())
+ GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
+ }
+
+ // Remove functions imported as available externally defs from comdats,
+ // as this is a declaration for the linker, and will be dropped eventually.
+ // It is illegal for comdats to contain declarations.
+ auto *GO = dyn_cast<GlobalObject>(&GV);
+ if (GO && GO->isDeclarationForLinker() && GO->hasComdat()) {
+ // The IRMover should not have placed any imported declarations in
+ // a comdat, so the only declaration that should be in a comdat
+ // at this point would be a definition imported as available_externally.
+ assert(GO->hasAvailableExternallyLinkage() &&
+ "Expected comdat on definition (possibly available external)");
+ GO->setComdat(nullptr);
+ }
+}
+
+void FunctionImportGlobalProcessing::processGlobalsForThinLTO() {
+ for (GlobalVariable &GV : M.globals())
+ processGlobalForThinLTO(GV);
+ for (Function &SF : M)
+ processGlobalForThinLTO(SF);
+ for (GlobalAlias &GA : M.aliases())
+ processGlobalForThinLTO(GA);
+
+ // Replace any COMDATS that required renaming (because the COMDAT leader was
+ // promoted and renamed).
+ if (!RenamedComdats.empty())
+ for (auto &GO : M.global_objects())
+ if (auto *C = GO.getComdat()) {
+ auto Replacement = RenamedComdats.find(C);
+ if (Replacement != RenamedComdats.end())
+ GO.setComdat(Replacement->second);
+ }
+}
+
+bool FunctionImportGlobalProcessing::run() {
+ processGlobalsForThinLTO();
+ return false;
+}
+
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ bool ClearDSOLocalOnDeclarations,
+ SetVector<GlobalValue *> *GlobalsToImport) {
+ FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
+ ClearDSOLocalOnDeclarations);
+ return ThinLTOProcessing.run();
+}
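Editor's aside on usage (not part of the diff): renameModuleForThinLTO() above
is the entry point a ThinLTO backend calls to promote and rename locals before
importing or exporting. A minimal hedged sketch; the Module and combined
summary index are assumed to come from the surrounding LTO driver:

    #include "llvm/Transforms/Utils/FunctionImportUtils.h"

    // Prepare a module for ThinLTO: promote locals that may be referenced
    // across module boundaries. Passing nullptr for GlobalsToImport means we
    // are not importing into this module, only preparing it for export.
    static void prepareForThinLTO(llvm::Module &M,
                                  const llvm::ModuleSummaryIndex &Index) {
      llvm::renameModuleForThinLTO(M, Index,
                                   /*ClearDSOLocalOnDeclarations=*/false,
                                   /*GlobalsToImport=*/nullptr);
    }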
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
index 7220a86d3e..f782396be7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/GlobalStatus.cpp
@@ -1,194 +1,194 @@
-//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/GlobalStatus.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/AtomicOrdering.h"
-#include "llvm/Support/Casting.h"
-#include <algorithm>
-#include <cassert>
-
-using namespace llvm;
-
-/// Return the stronger of the two ordering. If the two orderings are acquire
-/// and release, then return AcquireRelease.
-///
-static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
- if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
- (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
- return AtomicOrdering::AcquireRelease;
- return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
-}
-
-/// It is safe to destroy a constant iff it is only used by constants itself.
-/// Note that constants cannot be cyclic, so this test is pretty easy to
-/// implement recursively.
-///
-bool llvm::isSafeToDestroyConstant(const Constant *C) {
- if (isa<GlobalValue>(C))
- return false;
-
- if (isa<ConstantData>(C))
- return false;
-
- for (const User *U : C->users())
- if (const Constant *CU = dyn_cast<Constant>(U)) {
- if (!isSafeToDestroyConstant(CU))
- return false;
- } else
- return false;
- return true;
-}
-
-static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
- SmallPtrSetImpl<const Value *> &VisitedUsers) {
- if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- if (GV->isExternallyInitialized())
- GS.StoredType = GlobalStatus::StoredOnce;
-
- for (const Use &U : V->uses()) {
- const User *UR = U.getUser();
- if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
- GS.HasNonInstructionUser = true;
-
- // If the result of the constantexpr isn't pointer type, then we won't
- // know to expect it in various places. Just reject early.
- if (!isa<PointerType>(CE->getType()))
- return true;
-
- // FIXME: Do we need to add constexpr selects to VisitedUsers?
- if (analyzeGlobalAux(CE, GS, VisitedUsers))
- return true;
- } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
- if (!GS.HasMultipleAccessingFunctions) {
- const Function *F = I->getParent()->getParent();
- if (!GS.AccessingFunction)
- GS.AccessingFunction = F;
- else if (GS.AccessingFunction != F)
- GS.HasMultipleAccessingFunctions = true;
- }
- if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
- GS.IsLoaded = true;
- // Don't hack on volatile loads.
- if (LI->isVolatile())
- return true;
- GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Don't allow a store OF the address, only stores TO the address.
- if (SI->getOperand(0) == V)
- return true;
-
- // Don't hack on volatile stores.
- if (SI->isVolatile())
- return true;
-
- GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
-
- // If this is a direct store to the global (i.e., the global is a scalar
- // value, not an aggregate), keep more specific information about
- // stores.
- if (GS.StoredType != GlobalStatus::Stored) {
- if (const GlobalVariable *GV =
- dyn_cast<GlobalVariable>(SI->getOperand(1))) {
- Value *StoredVal = SI->getOperand(0);
-
- if (Constant *C = dyn_cast<Constant>(StoredVal)) {
- if (C->isThreadDependent()) {
- // The stored value changes between threads; don't track it.
- return true;
- }
- }
-
- if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
- if (GS.StoredType < GlobalStatus::InitializerStored)
- GS.StoredType = GlobalStatus::InitializerStored;
- } else if (isa<LoadInst>(StoredVal) &&
- cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
- if (GS.StoredType < GlobalStatus::InitializerStored)
- GS.StoredType = GlobalStatus::InitializerStored;
- } else if (GS.StoredType < GlobalStatus::StoredOnce) {
- GS.StoredType = GlobalStatus::StoredOnce;
- GS.StoredOnceValue = StoredVal;
- } else if (GS.StoredType == GlobalStatus::StoredOnce &&
- GS.StoredOnceValue == StoredVal) {
- // noop.
- } else {
- GS.StoredType = GlobalStatus::Stored;
- }
- } else {
- GS.StoredType = GlobalStatus::Stored;
- }
- }
+//===-- GlobalStatus.cpp - Compute status info for globals -----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+
+using namespace llvm;
+
+/// Return the stronger of the two orderings. If the two orderings are acquire
+/// and release, then return AcquireRelease.
+///
+static AtomicOrdering strongerOrdering(AtomicOrdering X, AtomicOrdering Y) {
+ if ((X == AtomicOrdering::Acquire && Y == AtomicOrdering::Release) ||
+ (Y == AtomicOrdering::Acquire && X == AtomicOrdering::Release))
+ return AtomicOrdering::AcquireRelease;
+ return (AtomicOrdering)std::max((unsigned)X, (unsigned)Y);
+}
+
+/// It is safe to destroy a constant iff it is itself only used by constants.
+/// Note that constants cannot be cyclic, so this test is pretty easy to
+/// implement recursively.
+///
+bool llvm::isSafeToDestroyConstant(const Constant *C) {
+ if (isa<GlobalValue>(C))
+ return false;
+
+ if (isa<ConstantData>(C))
+ return false;
+
+ for (const User *U : C->users())
+ if (const Constant *CU = dyn_cast<Constant>(U)) {
+ if (!isSafeToDestroyConstant(CU))
+ return false;
+ } else
+ return false;
+ return true;
+}
+
+static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
+ SmallPtrSetImpl<const Value *> &VisitedUsers) {
+ if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->isExternallyInitialized())
+ GS.StoredType = GlobalStatus::StoredOnce;
+
+ for (const Use &U : V->uses()) {
+ const User *UR = U.getUser();
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(UR)) {
+ GS.HasNonInstructionUser = true;
+
+ // If the result of the constantexpr isn't pointer type, then we won't
+ // know to expect it in various places. Just reject early.
+ if (!isa<PointerType>(CE->getType()))
+ return true;
+
+ // FIXME: Do we need to add constexpr selects to VisitedUsers?
+ if (analyzeGlobalAux(CE, GS, VisitedUsers))
+ return true;
+ } else if (const Instruction *I = dyn_cast<Instruction>(UR)) {
+ if (!GS.HasMultipleAccessingFunctions) {
+ const Function *F = I->getParent()->getParent();
+ if (!GS.AccessingFunction)
+ GS.AccessingFunction = F;
+ else if (GS.AccessingFunction != F)
+ GS.HasMultipleAccessingFunctions = true;
+ }
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ GS.IsLoaded = true;
+ // Don't hack on volatile loads.
+ if (LI->isVolatile())
+ return true;
+ GS.Ordering = strongerOrdering(GS.Ordering, LI->getOrdering());
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Don't allow a store OF the address, only stores TO the address.
+ if (SI->getOperand(0) == V)
+ return true;
+
+ // Don't hack on volatile stores.
+ if (SI->isVolatile())
+ return true;
+
+ GS.Ordering = strongerOrdering(GS.Ordering, SI->getOrdering());
+
+ // If this is a direct store to the global (i.e., the global is a scalar
+ // value, not an aggregate), keep more specific information about
+ // stores.
+ if (GS.StoredType != GlobalStatus::Stored) {
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(SI->getOperand(1))) {
+ Value *StoredVal = SI->getOperand(0);
+
+ if (Constant *C = dyn_cast<Constant>(StoredVal)) {
+ if (C->isThreadDependent()) {
+ // The stored value changes between threads; don't track it.
+ return true;
+ }
+ }
+
+ if (GV->hasInitializer() && StoredVal == GV->getInitializer()) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (isa<LoadInst>(StoredVal) &&
+ cast<LoadInst>(StoredVal)->getOperand(0) == GV) {
+ if (GS.StoredType < GlobalStatus::InitializerStored)
+ GS.StoredType = GlobalStatus::InitializerStored;
+ } else if (GS.StoredType < GlobalStatus::StoredOnce) {
+ GS.StoredType = GlobalStatus::StoredOnce;
+ GS.StoredOnceValue = StoredVal;
+ } else if (GS.StoredType == GlobalStatus::StoredOnce &&
+ GS.StoredOnceValue == StoredVal) {
+ // noop.
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ } else {
+ GS.StoredType = GlobalStatus::Stored;
+ }
+ }
} else if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I) ||
isa<AddrSpaceCastInst>(I)) {
- // Skip over bitcasts and GEPs; we don't care about the type or offset
- // of the pointer.
- if (analyzeGlobalAux(I, GS, VisitedUsers))
- return true;
- } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
- // Look through selects and PHIs to find if the pointer is
- // conditionally accessed. Make sure we only visit an instruction
- // once; otherwise, we can get infinite recursion or exponential
- // compile time.
- if (VisitedUsers.insert(I).second)
- if (analyzeGlobalAux(I, GS, VisitedUsers))
- return true;
- } else if (isa<CmpInst>(I)) {
- GS.IsCompared = true;
- } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
- if (MTI->isVolatile())
- return true;
- if (MTI->getArgOperand(0) == V)
- GS.StoredType = GlobalStatus::Stored;
- if (MTI->getArgOperand(1) == V)
- GS.IsLoaded = true;
- } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
- assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
- if (MSI->isVolatile())
- return true;
- GS.StoredType = GlobalStatus::Stored;
- } else if (const auto *CB = dyn_cast<CallBase>(I)) {
- if (!CB->isCallee(&U))
- return true;
- GS.IsLoaded = true;
- } else {
- return true; // Any other non-load instruction might take address!
- }
- } else if (const Constant *C = dyn_cast<Constant>(UR)) {
- GS.HasNonInstructionUser = true;
- // We might have a dead and dangling constant hanging off of here.
- if (!isSafeToDestroyConstant(C))
- return true;
- } else {
- GS.HasNonInstructionUser = true;
- // Otherwise must be some other user.
- return true;
- }
- }
-
- return false;
-}
-
-GlobalStatus::GlobalStatus() = default;
-
-bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
- SmallPtrSet<const Value *, 16> VisitedUsers;
- return analyzeGlobalAux(V, GS, VisitedUsers);
-}
+ // Skip over bitcasts and GEPs; we don't care about the type or offset
+ // of the pointer.
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else if (isa<SelectInst>(I) || isa<PHINode>(I)) {
+ // Look through selects and PHIs to find if the pointer is
+ // conditionally accessed. Make sure we only visit an instruction
+ // once; otherwise, we can get infinite recursion or exponential
+ // compile time.
+ if (VisitedUsers.insert(I).second)
+ if (analyzeGlobalAux(I, GS, VisitedUsers))
+ return true;
+ } else if (isa<CmpInst>(I)) {
+ GS.IsCompared = true;
+ } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
+ if (MTI->isVolatile())
+ return true;
+ if (MTI->getArgOperand(0) == V)
+ GS.StoredType = GlobalStatus::Stored;
+ if (MTI->getArgOperand(1) == V)
+ GS.IsLoaded = true;
+ } else if (const MemSetInst *MSI = dyn_cast<MemSetInst>(I)) {
+ assert(MSI->getArgOperand(0) == V && "Memset only takes one pointer!");
+ if (MSI->isVolatile())
+ return true;
+ GS.StoredType = GlobalStatus::Stored;
+ } else if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (!CB->isCallee(&U))
+ return true;
+ GS.IsLoaded = true;
+ } else {
+ return true; // Any other non-load instruction might take address!
+ }
+ } else if (const Constant *C = dyn_cast<Constant>(UR)) {
+ GS.HasNonInstructionUser = true;
+ // We might have a dead and dangling constant hanging off of here.
+ if (!isSafeToDestroyConstant(C))
+ return true;
+ } else {
+ GS.HasNonInstructionUser = true;
+ // Otherwise must be some other user.
+ return true;
+ }
+ }
+
+ return false;
+}
+
+GlobalStatus::GlobalStatus() = default;
+
+bool GlobalStatus::analyzeGlobal(const Value *V, GlobalStatus &GS) {
+ SmallPtrSet<const Value *, 16> VisitedUsers;
+ return analyzeGlobalAux(V, GS, VisitedUsers);
+}
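Editor's aside on usage (not part of the diff): analyzeGlobal() returns true
when some user defeats the analysis, and false when GS has been filled in. A
minimal sketch of the GlobalOpt-style query, assuming the GlobalStatus API
from llvm/Transforms/Utils/GlobalStatus.h:

    #include "llvm/Transforms/Utils/GlobalStatus.h"
    using namespace llvm;

    // True iff the analysis succeeded and the global is stored to exactly
    // once beyond its initializer.
    static bool isStoredExactlyOnce(const GlobalVariable &GV) {
      GlobalStatus GS;
      if (GlobalStatus::analyzeGlobal(&GV, GS))
        return false; // address escapes or some user we cannot reason about
      return GS.StoredType == GlobalStatus::StoredOnce;
    }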
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
index 13f22440bb..4dbcbf80d3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/GuardUtils.cpp
@@ -1,126 +1,126 @@
-//===-- GuardUtils.cpp - Utils for work with guards -------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Utils that are used to perform transformations related to guards and their
-// conditions.
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/GuardUtils.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<uint32_t> PredicatePassBranchWeight(
- "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
- cl::desc("The probability of a guard failing is assumed to be the "
- "reciprocal of this value (default = 1 << 20)"));
-
-void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
- CallInst *Guard, bool UseWC) {
- OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt));
+//===-- GuardUtils.cpp - Utils for work with guards -------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Utils that are used to perform transformations related to guards and their
+// conditions.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/GuardUtils.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<uint32_t> PredicatePassBranchWeight(
+ "guards-predicate-pass-branch-weight", cl::Hidden, cl::init(1 << 20),
+ cl::desc("The probability of a guard failing is assumed to be the "
+ "reciprocal of this value (default = 1 << 20)"));
+
+void llvm::makeGuardControlFlowExplicit(Function *DeoptIntrinsic,
+ CallInst *Guard, bool UseWC) {
+ OperandBundleDef DeoptOB(*Guard->getOperandBundle(LLVMContext::OB_deopt));
SmallVector<Value *, 4> Args(drop_begin(Guard->args()));
-
- auto *CheckBB = Guard->getParent();
- auto *DeoptBlockTerm =
- SplitBlockAndInsertIfThen(Guard->getArgOperand(0), Guard, true);
-
- auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
-
- // SplitBlockAndInsertIfThen inserts control flow that branches to
- // DeoptBlockTerm if the condition is true. We want the opposite.
- CheckBI->swapSuccessors();
-
- CheckBI->getSuccessor(0)->setName("guarded");
- CheckBI->getSuccessor(1)->setName("deopt");
-
- if (auto *MD = Guard->getMetadata(LLVMContext::MD_make_implicit))
- CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
-
- MDBuilder MDB(Guard->getContext());
- CheckBI->setMetadata(LLVMContext::MD_prof,
- MDB.createBranchWeights(PredicatePassBranchWeight, 1));
-
- IRBuilder<> B(DeoptBlockTerm);
- auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
-
- if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
- B.CreateRetVoid();
- } else {
- DeoptCall->setName("deoptcall");
- B.CreateRet(DeoptCall);
- }
-
- DeoptCall->setCallingConv(Guard->getCallingConv());
- DeoptBlockTerm->eraseFromParent();
-
- if (UseWC) {
- // We want the guard to be expressed as explicit control flow, but still be
- // widenable. For that, we add Widenable Condition intrinsic call to the
- // guard's condition.
- IRBuilder<> B(CheckBI);
- auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition,
- {}, {}, nullptr, "widenable_cond");
- CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
- "exiplicit_guard_cond"));
- assert(isWidenableBranch(CheckBI) && "sanity check");
- }
-}
-
-
-void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
- assert(isWidenableBranch(WidenableBR) && "precondition");
-
-  // The temptingly trivial option is to produce something like this:
- // br (and oldcond, newcond) where oldcond is assumed to contain a widenable
- // condition, but that doesn't match the pattern parseWidenableBranch expects
- // so we have to be more sophisticated.
-
- Use *C, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
- if (!C) {
- // br (wc()), ... form
- IRBuilder<> B(WidenableBR);
- WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
- } else {
- // br (wc & C), ... form
- IRBuilder<> B(WidenableBR);
- C->set(B.CreateAnd(NewCond, C->get()));
- Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
- // Condition is only guaranteed to dominate branch
- WCAnd->moveBefore(WidenableBR);
- }
- assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
-}
-
-void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) {
- assert(isWidenableBranch(WidenableBR) && "precondition");
-
- Use *C, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
- if (!C) {
- // br (wc()), ... form
- IRBuilder<> B(WidenableBR);
- WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
- } else {
- // br (wc & C), ... form
- Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
- // Condition is only guaranteed to dominate branch
- WCAnd->moveBefore(WidenableBR);
- C->set(NewCond);
- }
- assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
-}
+
+ auto *CheckBB = Guard->getParent();
+ auto *DeoptBlockTerm =
+ SplitBlockAndInsertIfThen(Guard->getArgOperand(0), Guard, true);
+
+ auto *CheckBI = cast<BranchInst>(CheckBB->getTerminator());
+
+ // SplitBlockAndInsertIfThen inserts control flow that branches to
+ // DeoptBlockTerm if the condition is true. We want the opposite.
+ CheckBI->swapSuccessors();
+
+ CheckBI->getSuccessor(0)->setName("guarded");
+ CheckBI->getSuccessor(1)->setName("deopt");
+
+ if (auto *MD = Guard->getMetadata(LLVMContext::MD_make_implicit))
+ CheckBI->setMetadata(LLVMContext::MD_make_implicit, MD);
+
+ MDBuilder MDB(Guard->getContext());
+ CheckBI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(PredicatePassBranchWeight, 1));
+
+ IRBuilder<> B(DeoptBlockTerm);
+ auto *DeoptCall = B.CreateCall(DeoptIntrinsic, Args, {DeoptOB}, "");
+
+ if (DeoptIntrinsic->getReturnType()->isVoidTy()) {
+ B.CreateRetVoid();
+ } else {
+ DeoptCall->setName("deoptcall");
+ B.CreateRet(DeoptCall);
+ }
+
+ DeoptCall->setCallingConv(Guard->getCallingConv());
+ DeoptBlockTerm->eraseFromParent();
+
+ if (UseWC) {
+ // We want the guard to be expressed as explicit control flow, but still be
+ // widenable. For that, we add Widenable Condition intrinsic call to the
+ // guard's condition.
+ IRBuilder<> B(CheckBI);
+ auto *WC = B.CreateIntrinsic(Intrinsic::experimental_widenable_condition,
+ {}, {}, nullptr, "widenable_cond");
+ CheckBI->setCondition(B.CreateAnd(CheckBI->getCondition(), WC,
+ "exiplicit_guard_cond"));
+ assert(isWidenableBranch(CheckBI) && "sanity check");
+ }
+}
+
+
+void llvm::widenWidenableBranch(BranchInst *WidenableBR, Value *NewCond) {
+ assert(isWidenableBranch(WidenableBR) && "precondition");
+
+  // The temptingly trivial option is to produce something like this:
+ // br (and oldcond, newcond) where oldcond is assumed to contain a widenable
+ // condition, but that doesn't match the pattern parseWidenableBranch expects
+ // so we have to be more sophisticated.
+
+ Use *C, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
+ if (!C) {
+ // br (wc()), ... form
+ IRBuilder<> B(WidenableBR);
+ WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
+ } else {
+ // br (wc & C), ... form
+ IRBuilder<> B(WidenableBR);
+ C->set(B.CreateAnd(NewCond, C->get()));
+ Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
+ // Condition is only guaranteed to dominate branch
+ WCAnd->moveBefore(WidenableBR);
+ }
+ assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
+}
+
+void llvm::setWidenableBranchCond(BranchInst *WidenableBR, Value *NewCond) {
+ assert(isWidenableBranch(WidenableBR) && "precondition");
+
+ Use *C, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ parseWidenableBranch(WidenableBR, C, WC, IfTrueBB, IfFalseBB);
+ if (!C) {
+ // br (wc()), ... form
+ IRBuilder<> B(WidenableBR);
+ WidenableBR->setCondition(B.CreateAnd(NewCond, WC->get()));
+ } else {
+ // br (wc & C), ... form
+ Instruction *WCAnd = cast<Instruction>(WidenableBR->getCondition());
+ // Condition is only guaranteed to dominate branch
+ WCAnd->moveBefore(WidenableBR);
+ C->set(NewCond);
+ }
+ assert(isWidenableBranch(WidenableBR) && "preserve widenabiliy");
+}
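
A minimal usage sketch of the branch-widening utility above, assuming the
declarations from llvm/Analysis/GuardUtils.h and
llvm/Transforms/Utils/GuardUtils.h; the wrapper and the ExtraCheck value are
hypothetical.

#include "llvm/Analysis/GuardUtils.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/GuardUtils.h"

using namespace llvm;

// Hypothetical caller: fold one more runtime check into an existing
// widenable branch so its false (deopt) successor is taken when the check
// fails.
static void addCheckToWidenableBranch(BranchInst *BI, Value *ExtraCheck) {
  if (!isWidenableBranch(BI))
    return; // Only branches already using llvm.experimental.widenable.condition.
  // ANDs ExtraCheck into the condition while preserving the shape that
  // parseWidenableBranch recognizes.
  widenWidenableBranch(BI, ExtraCheck);
}
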
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
index 2626ebb942..a2b72e4e7f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -1,166 +1,166 @@
-//===- InjectTLIMappings.cpp - TLI to VFABI attribute injection ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Populates the VFABI attribute with the scalar-to-vector mappings
-// from the TargetLibraryInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+//===- InjectTLIMappings.cpp - TLI to VFABI attribute injection ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Populates the VFABI attribute with the scalar-to-vector mappings
+// from the TargetLibraryInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "inject-tli-mappings"
-
-STATISTIC(NumCallInjected,
- "Number of calls in which the mappings have been injected.");
-
-STATISTIC(NumVFDeclAdded,
- "Number of function declarations that have been added.");
-STATISTIC(NumCompUsedAdded,
- "Number of `@llvm.compiler.used` operands that have been added.");
-
-/// A helper function that adds the vector function declaration that
-/// vectorizes the CallInst CI with a vectorization factor of VF
-/// lanes. The TLI assumes that all parameters and the return type of
-/// CI (other than void) need to be widened to a VectorType of VF
-/// lanes.
-static void addVariantDeclaration(CallInst &CI, const unsigned VF,
- const StringRef VFName) {
- Module *M = CI.getModule();
-
- // Add function declaration.
- Type *RetTy = ToVectorTy(CI.getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI.arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- assert(!CI.getFunctionType()->isVarArg() &&
- "VarArg functions are not supported.");
- FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
- Function *VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFName, M);
- VectorF->copyAttributesFrom(CI.getCalledFunction());
- ++NumVFDeclAdded;
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
- << "` of type " << *(VectorF->getType()) << "\n");
-
- // Make function declaration (without a body) "sticky" in the IR by
- // listing it in the @llvm.compiler.used intrinsic.
- assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
- "only on declarations.");
- appendToCompilerUsed(*M, {VectorF});
- LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
- << "` to `@llvm.compiler.used`.\n");
- ++NumCompUsedAdded;
-}
-
-static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
- // This is needed to make sure we don't query the TLI for calls to
- // bitcast of function pointers, like `%call = call i32 (i32*, ...)
- // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
- // as such calls make the `isFunctionVectorizable` raise an
- // exception.
- if (CI.isNoBuiltin() || !CI.getCalledFunction())
- return;
-
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inject-tli-mappings"
+
+STATISTIC(NumCallInjected,
+ "Number of calls in which the mappings have been injected.");
+
+STATISTIC(NumVFDeclAdded,
+ "Number of function declarations that have been added.");
+STATISTIC(NumCompUsedAdded,
+ "Number of `@llvm.compiler.used` operands that have been added.");
+
+/// A helper function that adds the vector function declaration that
+/// vectorizes the CallInst CI with a vectorization factor of VF
+/// lanes. The TLI assumes that all parameters and the return type of
+/// CI (other than void) need to be widened to a VectorType of VF
+/// lanes.
+static void addVariantDeclaration(CallInst &CI, const unsigned VF,
+ const StringRef VFName) {
+ Module *M = CI.getModule();
+
+ // Add function declaration.
+ Type *RetTy = ToVectorTy(CI.getType(), VF);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI.arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ assert(!CI.getFunctionType()->isVarArg() &&
+ "VarArg functions are not supported.");
+ FunctionType *FTy = FunctionType::get(RetTy, Tys, /*isVarArg=*/false);
+ Function *VectorF =
+ Function::Create(FTy, Function::ExternalLinkage, VFName, M);
+ VectorF->copyAttributesFrom(CI.getCalledFunction());
+ ++NumVFDeclAdded;
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Added to the module: `" << VFName
+ << "` of type " << *(VectorF->getType()) << "\n");
+
+ // Make function declaration (without a body) "sticky" in the IR by
+ // listing it in the @llvm.compiler.used intrinsic.
+ assert(!VectorF->size() && "VFABI attribute requires `@llvm.compiler.used` "
+ "only on declarations.");
+ appendToCompilerUsed(*M, {VectorF});
+ LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Adding `" << VFName
+ << "` to `@llvm.compiler.used`.\n");
+ ++NumCompUsedAdded;
+}
+
+static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
+ // This is needed to make sure we don't query the TLI for calls to
+ // bitcast of function pointers, like `%call = call i32 (i32*, ...)
+ // bitcast (i32 (...)* @goo to i32 (i32*, ...)*)(i32* nonnull %i)`,
+ // as such calls make the `isFunctionVectorizable` raise an
+ // exception.
+ if (CI.isNoBuiltin() || !CI.getCalledFunction())
+ return;
+
StringRef ScalarName = CI.getCalledFunction()->getName();
- // Nothing to be done if the TLI thinks the function is not
- // vectorizable.
- if (!TLI.isFunctionVectorizable(ScalarName))
- return;
- SmallVector<std::string, 8> Mappings;
- VFABI::getVectorVariantNames(CI, Mappings);
- Module *M = CI.getModule();
- const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
- Mappings.end());
- // All VFs in the TLI are powers of 2.
- for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
- VF *= 2) {
- const std::string TLIName =
- std::string(TLI.getVectorizedFunction(ScalarName, VF));
- if (!TLIName.empty()) {
- std::string MangledName = VFABI::mangleTLIVectorName(
- TLIName, ScalarName, CI.getNumArgOperands(), VF);
- if (!OriginalSetOfMappings.count(MangledName)) {
- Mappings.push_back(MangledName);
- ++NumCallInjected;
- }
- Function *VariantF = M->getFunction(TLIName);
- if (!VariantF)
- addVariantDeclaration(CI, VF, TLIName);
- }
- }
-
- VFABI::setVectorVariantNames(&CI, Mappings);
-}
-
-static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
- for (auto &I : instructions(F))
- if (auto CI = dyn_cast<CallInst>(&I))
- addMappingsFromTLI(TLI, *CI);
- // Even if the pass adds IR attributes, the analyses are preserved.
- return false;
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// New pass manager implementation.
-////////////////////////////////////////////////////////////////////////////////
-PreservedAnalyses InjectTLIMappings::run(Function &F,
- FunctionAnalysisManager &AM) {
- const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- runImpl(TLI, F);
- // Even if the pass adds IR attributes, the analyses are preserved.
- return PreservedAnalyses::all();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Legacy PM Implementation.
-////////////////////////////////////////////////////////////////////////////////
-bool InjectTLIMappingsLegacy::runOnFunction(Function &F) {
- const TargetLibraryInfo &TLI =
- getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- return runImpl(TLI, F);
-}
-
-void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesCFG();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<LoopAccessLegacyAnalysis>();
- AU.addPreserved<DemandedBitsWrapperPass>();
- AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Legacy Pass manager initialization
-////////////////////////////////////////////////////////////////////////////////
-char InjectTLIMappingsLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE,
- "Inject TLI Mappings", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings",
- false, false)
-
-FunctionPass *llvm::createInjectTLIMappingsLegacyPass() {
- return new InjectTLIMappingsLegacy();
-}
+ // Nothing to be done if the TLI thinks the function is not
+ // vectorizable.
+ if (!TLI.isFunctionVectorizable(ScalarName))
+ return;
+ SmallVector<std::string, 8> Mappings;
+ VFABI::getVectorVariantNames(CI, Mappings);
+ Module *M = CI.getModule();
+ const SetVector<StringRef> OriginalSetOfMappings(Mappings.begin(),
+ Mappings.end());
+ // All VFs in the TLI are powers of 2.
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
+ VF *= 2) {
+ const std::string TLIName =
+ std::string(TLI.getVectorizedFunction(ScalarName, VF));
+ if (!TLIName.empty()) {
+ std::string MangledName = VFABI::mangleTLIVectorName(
+ TLIName, ScalarName, CI.getNumArgOperands(), VF);
+ if (!OriginalSetOfMappings.count(MangledName)) {
+ Mappings.push_back(MangledName);
+ ++NumCallInjected;
+ }
+ Function *VariantF = M->getFunction(TLIName);
+ if (!VariantF)
+ addVariantDeclaration(CI, VF, TLIName);
+ }
+ }
+
+ VFABI::setVectorVariantNames(&CI, Mappings);
+}
+
+static bool runImpl(const TargetLibraryInfo &TLI, Function &F) {
+ for (auto &I : instructions(F))
+ if (auto CI = dyn_cast<CallInst>(&I))
+ addMappingsFromTLI(TLI, *CI);
+ // Even if the pass adds IR attributes, the analyses are preserved.
+ return false;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// New pass manager implementation.
+////////////////////////////////////////////////////////////////////////////////
+PreservedAnalyses InjectTLIMappings::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ runImpl(TLI, F);
+ // Even if the pass adds IR attributes, the analyses are preserved.
+ return PreservedAnalyses::all();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy PM Implementation.
+////////////////////////////////////////////////////////////////////////////////
+bool InjectTLIMappingsLegacy::runOnFunction(Function &F) {
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return runImpl(TLI, F);
+}
+
+void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<LoopAccessLegacyAnalysis>();
+ AU.addPreserved<DemandedBitsWrapperPass>();
+ AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Legacy Pass manager initialization
+////////////////////////////////////////////////////////////////////////////////
+char InjectTLIMappingsLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(InjectTLIMappingsLegacy, DEBUG_TYPE,
+ "Inject TLI Mappings", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(InjectTLIMappingsLegacy, DEBUG_TYPE, "Inject TLI Mappings",
+ false, false)
+
+FunctionPass *llvm::createInjectTLIMappingsLegacyPass() {
+ return new InjectTLIMappingsLegacy();
+}
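
A minimal driver sketch for the new-PM pass defined above. The boilerplate is
standard pass-manager setup; the SVML vector library is only an example, and
the helper function itself is hypothetical. Without some vector library
configured in the TargetLibraryInfo, isFunctionVectorizable() never returns
true and the pass is effectively a no-op.

#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"

using namespace llvm;

// Hypothetical helper: attach VFABI variant names to the calls in F.
static void injectMappings(Function &F) {
  // Teach the TLI about a vector math library so calls such as sinf/expf
  // are reported as vectorizable.
  TargetLibraryInfoImpl TLII(Triple(F.getParent()->getTargetTriple()));
  TLII.addVectorizableFunctionsFromVecLib(TargetLibraryInfoImpl::SVML);

  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  // Register the customized TLI first; the default registration below does
  // not overwrite an analysis that is already present.
  FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); });

  PassBuilder PB;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  FPM.addPass(InjectTLIMappings());
  FPM.run(F, FAM);
}
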
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
index 1c5604f8a3..fb271a2118 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InlineFunction.cpp
@@ -1,782 +1,782 @@
-//===- InlineFunction.cpp - Code to perform function inlining -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements inlining of a function into a call site, resolving
-// parameters and the return value as appropriate.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <string>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using ProfileCount = Function::ProfileCount;
-
-static cl::opt<bool>
-EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
- cl::Hidden,
- cl::desc("Convert noalias attributes to metadata during inlining."));
-
+//===- InlineFunction.cpp - Code to perform function inlining -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements inlining of a function into a call site, resolving
+// parameters and the return value as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <string>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using ProfileCount = Function::ProfileCount;
+
+static cl::opt<bool>
+EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
+ cl::Hidden,
+ cl::desc("Convert noalias attributes to metadata during inlining."));
+
static cl::opt<bool>
UseNoAliasIntrinsic("use-noalias-intrinsic-during-inlining", cl::Hidden,
cl::ZeroOrMore, cl::init(true),
cl::desc("Use the llvm.experimental.noalias.scope.decl "
"intrinsic during inlining."));
-// Disabled by default, because the added alignment assumptions may increase
-// compile-time and block optimizations. This option is not suitable for use
-// with frontends that emit comprehensive parameter alignment annotations.
-static cl::opt<bool>
-PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
- cl::init(false), cl::Hidden,
- cl::desc("Convert align attributes to assumptions during inlining."));
-
-static cl::opt<bool> UpdateReturnAttributes(
- "update-return-attrs", cl::init(true), cl::Hidden,
- cl::desc("Update return attributes on calls within inlined body"));
-
-static cl::opt<unsigned> InlinerAttributeWindow(
- "max-inst-checked-for-throw-during-inlining", cl::Hidden,
- cl::desc("the maximum number of instructions analyzed for may throw during "
- "attribute inference in inlined body"),
- cl::init(4));
-
-namespace {
-
- /// A class for recording information about inlining a landing pad.
- class LandingPadInliningInfo {
- /// Destination of the invoke's unwind.
- BasicBlock *OuterResumeDest;
-
- /// Destination for the callee's resume.
- BasicBlock *InnerResumeDest = nullptr;
-
- /// LandingPadInst associated with the invoke.
- LandingPadInst *CallerLPad = nullptr;
-
- /// PHI for EH values from landingpad insts.
- PHINode *InnerEHValuesPHI = nullptr;
-
- SmallVector<Value*, 8> UnwindDestPHIValues;
-
- public:
- LandingPadInliningInfo(InvokeInst *II)
- : OuterResumeDest(II->getUnwindDest()) {
- // If there are PHI nodes in the unwind destination block, we need to keep
- // track of which values came into them from the invoke before removing
- // the edge from this block.
- BasicBlock *InvokeBB = II->getParent();
- BasicBlock::iterator I = OuterResumeDest->begin();
- for (; isa<PHINode>(I); ++I) {
- // Save the value to use for this edge.
- PHINode *PHI = cast<PHINode>(I);
- UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
- }
-
- CallerLPad = cast<LandingPadInst>(I);
- }
-
- /// The outer unwind destination is the target of
- /// unwind edges introduced for calls within the inlined function.
- BasicBlock *getOuterResumeDest() const {
- return OuterResumeDest;
- }
-
- BasicBlock *getInnerResumeDest();
-
- LandingPadInst *getLandingPadInst() const { return CallerLPad; }
-
- /// Forward the 'resume' instruction to the caller's landing pad block.
- /// When the landing pad block has only one predecessor, this is
- /// a simple branch. When there is more than one predecessor, we need to
- /// split the landing pad block after the landingpad instruction and jump
- /// to there.
- void forwardResume(ResumeInst *RI,
- SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
-
- /// Add incoming-PHI values to the unwind destination block for the given
- /// basic block, using the values for the original invoke's source block.
- void addIncomingPHIValuesFor(BasicBlock *BB) const {
- addIncomingPHIValuesForInto(BB, OuterResumeDest);
- }
-
- void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
- BasicBlock::iterator I = dest->begin();
- for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
- PHINode *phi = cast<PHINode>(I);
- phi->addIncoming(UnwindDestPHIValues[i], src);
- }
- }
- };
-
-} // end anonymous namespace
-
-/// Get or create a target for the branch from ResumeInsts.
-BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {
- if (InnerResumeDest) return InnerResumeDest;
-
- // Split the landing pad.
- BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();
- InnerResumeDest =
- OuterResumeDest->splitBasicBlock(SplitPoint,
- OuterResumeDest->getName() + ".body");
-
- // The number of incoming edges we expect to the inner landing pad.
- const unsigned PHICapacity = 2;
-
- // Create corresponding new PHIs for all the PHIs in the outer landing pad.
- Instruction *InsertPoint = &InnerResumeDest->front();
- BasicBlock::iterator I = OuterResumeDest->begin();
- for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
- PHINode *OuterPHI = cast<PHINode>(I);
- PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
- OuterPHI->getName() + ".lpad-body",
- InsertPoint);
- OuterPHI->replaceAllUsesWith(InnerPHI);
- InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
- }
-
- // Create a PHI for the exception values.
- InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
- "eh.lpad-body", InsertPoint);
- CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
- InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
-
- // All done.
- return InnerResumeDest;
-}
-
-/// Forward the 'resume' instruction to the caller's landing pad block.
-/// When the landing pad block has only one predecessor, this is a simple
-/// branch. When there is more than one predecessor, we need to split the
-/// landing pad block after the landingpad instruction and jump to there.
-void LandingPadInliningInfo::forwardResume(
- ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {
- BasicBlock *Dest = getInnerResumeDest();
- BasicBlock *Src = RI->getParent();
-
- BranchInst::Create(Dest, Src);
-
- // Update the PHIs in the destination. They were inserted in an order which
- // makes this work.
- addIncomingPHIValuesForInto(Src, Dest);
-
- InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
- RI->eraseFromParent();
-}
-
-/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
-static Value *getParentPad(Value *EHPad) {
- if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
- return FPI->getParentPad();
- return cast<CatchSwitchInst>(EHPad)->getParentPad();
-}
-
-using UnwindDestMemoTy = DenseMap<Instruction *, Value *>;
-
-/// Helper for getUnwindDestToken that does the descendant-ward part of
-/// the search.
-static Value *getUnwindDestTokenHelper(Instruction *EHPad,
- UnwindDestMemoTy &MemoMap) {
- SmallVector<Instruction *, 8> Worklist(1, EHPad);
-
- while (!Worklist.empty()) {
- Instruction *CurrentPad = Worklist.pop_back_val();
- // We only put pads on the worklist that aren't in the MemoMap. When
- // we find an unwind dest for a pad we may update its ancestors, but
- // the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
- // so they should never get updated while queued on the worklist.
- assert(!MemoMap.count(CurrentPad));
- Value *UnwindDestToken = nullptr;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
- if (CatchSwitch->hasUnwindDest()) {
- UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
- } else {
- // Catchswitch doesn't have a 'nounwind' variant, and one might be
- // annotated as "unwinds to caller" when really it's nounwind (see
- // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
- // parent's unwind dest from this. We can check its catchpads'
- // descendants, since they might include a cleanuppad with an
- // "unwinds to caller" cleanupret, which can be trusted.
- for (auto HI = CatchSwitch->handler_begin(),
- HE = CatchSwitch->handler_end();
- HI != HE && !UnwindDestToken; ++HI) {
- BasicBlock *HandlerBlock = *HI;
- auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
- for (User *Child : CatchPad->users()) {
- // Intentionally ignore invokes here -- since the catchswitch is
- // marked "unwind to caller", it would be a verifier error if it
- // contained an invoke which unwinds out of it, so any invoke we'd
- // encounter must unwind to some child of the catch.
- if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
- continue;
-
- Instruction *ChildPad = cast<Instruction>(Child);
- auto Memo = MemoMap.find(ChildPad);
- if (Memo == MemoMap.end()) {
- // Haven't figured out this child pad yet; queue it.
- Worklist.push_back(ChildPad);
- continue;
- }
- // We've already checked this child, but might have found that
- // it offers no proof either way.
- Value *ChildUnwindDestToken = Memo->second;
- if (!ChildUnwindDestToken)
- continue;
- // We already know the child's unwind dest, which can either
- // be ConstantTokenNone to indicate unwind to caller, or can
- // be another child of the catchpad. Only the former indicates
- // the unwind dest of the catchswitch.
- if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
- UnwindDestToken = ChildUnwindDestToken;
- break;
- }
- assert(getParentPad(ChildUnwindDestToken) == CatchPad);
- }
- }
- }
- } else {
- auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
- for (User *U : CleanupPad->users()) {
- if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
- if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
- UnwindDestToken = RetUnwindDest->getFirstNonPHI();
- else
- UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
- break;
- }
- Value *ChildUnwindDestToken;
- if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
- ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
- } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
- Instruction *ChildPad = cast<Instruction>(U);
- auto Memo = MemoMap.find(ChildPad);
- if (Memo == MemoMap.end()) {
- // Haven't resolved this child yet; queue it and keep searching.
- Worklist.push_back(ChildPad);
- continue;
- }
- // We've checked this child, but still need to ignore it if it
- // had no proof either way.
- ChildUnwindDestToken = Memo->second;
- if (!ChildUnwindDestToken)
- continue;
- } else {
- // Not a relevant user of the cleanuppad
- continue;
- }
- // In a well-formed program, the child/invoke must either unwind to
- // an(other) child of the cleanup, or exit the cleanup. In the
- // first case, continue searching.
- if (isa<Instruction>(ChildUnwindDestToken) &&
- getParentPad(ChildUnwindDestToken) == CleanupPad)
- continue;
- UnwindDestToken = ChildUnwindDestToken;
- break;
- }
- }
- // If we haven't found an unwind dest for CurrentPad, we may have queued its
- // children, so move on to the next in the worklist.
- if (!UnwindDestToken)
- continue;
-
- // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
- // any ancestors of CurrentPad up to but not including UnwindDestToken's
- // parent pad. Record this in the memo map, and check to see if the
- // original EHPad being queried is one of the ones exited.
- Value *UnwindParent;
- if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
- UnwindParent = getParentPad(UnwindPad);
- else
- UnwindParent = nullptr;
- bool ExitedOriginalPad = false;
- for (Instruction *ExitedPad = CurrentPad;
- ExitedPad && ExitedPad != UnwindParent;
- ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
- // Skip over catchpads since they just follow their catchswitches.
- if (isa<CatchPadInst>(ExitedPad))
- continue;
- MemoMap[ExitedPad] = UnwindDestToken;
- ExitedOriginalPad |= (ExitedPad == EHPad);
- }
-
- if (ExitedOriginalPad)
- return UnwindDestToken;
-
- // Continue the search.
- }
-
- // No definitive information is contained within this funclet.
- return nullptr;
-}
-
-/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
-/// return that pad instruction. If it unwinds to caller, return
-/// ConstantTokenNone. If it does not have a definitive unwind destination,
-/// return nullptr.
-///
-/// This routine gets invoked for calls in funclets in inlinees when inlining
-/// an invoke. Since many funclets don't have calls inside them, it's queried
-/// on-demand rather than building a map of pads to unwind dests up front.
-/// Determining a funclet's unwind dest may require recursively searching its
-/// descendants, and also ancestors and cousins if the descendants don't provide
-/// an answer. Since most funclets will have their unwind dest immediately
-/// available as the unwind dest of a catchswitch or cleanupret, this routine
-/// searches top-down from the given pad and then up. To avoid worst-case
-/// quadratic run-time given that approach, it uses a memo map to avoid
-/// re-processing funclet trees. The callers that rewrite the IR as they go
-/// take advantage of this, for correctness, by checking/forcing rewritten
-/// pads' entries to match the original callee view.
-static Value *getUnwindDestToken(Instruction *EHPad,
- UnwindDestMemoTy &MemoMap) {
- // Catchpads unwind to the same place as their catchswitch;
-  // redirect any queries on catchpads so the code below can
- // deal with just catchswitches and cleanuppads.
- if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
- EHPad = CPI->getCatchSwitch();
-
- // Check if we've already determined the unwind dest for this pad.
- auto Memo = MemoMap.find(EHPad);
- if (Memo != MemoMap.end())
- return Memo->second;
-
- // Search EHPad and, if necessary, its descendants.
- Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
- assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
- if (UnwindDestToken)
- return UnwindDestToken;
-
- // No information is available for this EHPad from itself or any of its
- // descendants. An unwind all the way out to a pad in the caller would
- // need also to agree with the unwind dest of the parent funclet, so
- // search up the chain to try to find a funclet with information. Put
- // null entries in the memo map to avoid re-processing as we go up.
- MemoMap[EHPad] = nullptr;
-#ifndef NDEBUG
- SmallPtrSet<Instruction *, 4> TempMemos;
- TempMemos.insert(EHPad);
-#endif
- Instruction *LastUselessPad = EHPad;
- Value *AncestorToken;
- for (AncestorToken = getParentPad(EHPad);
- auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
- AncestorToken = getParentPad(AncestorToken)) {
- // Skip over catchpads since they just follow their catchswitches.
- if (isa<CatchPadInst>(AncestorPad))
- continue;
- // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
- // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
- // call to getUnwindDestToken, that would mean that AncestorPad had no
- // information in itself, its descendants, or its ancestors. If that
- // were the case, then we should also have recorded the lack of information
- // for the descendant that we're coming from. So assert that we don't
- // find a null entry in the MemoMap for AncestorPad.
- assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
- auto AncestorMemo = MemoMap.find(AncestorPad);
- if (AncestorMemo == MemoMap.end()) {
- UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
- } else {
- UnwindDestToken = AncestorMemo->second;
- }
- if (UnwindDestToken)
- break;
- LastUselessPad = AncestorPad;
- MemoMap[LastUselessPad] = nullptr;
-#ifndef NDEBUG
- TempMemos.insert(LastUselessPad);
-#endif
- }
-
- // We know that getUnwindDestTokenHelper was called on LastUselessPad and
- // returned nullptr (and likewise for EHPad and any of its ancestors up to
- // LastUselessPad), so LastUselessPad has no information from below. Since
- // getUnwindDestTokenHelper must investigate all downward paths through
- // no-information nodes to prove that a node has no information like this,
- // and since any time it finds information it records it in the MemoMap for
- // not just the immediately-containing funclet but also any ancestors also
- // exited, it must be the case that, walking downward from LastUselessPad,
- // visiting just those nodes which have not been mapped to an unwind dest
- // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
- // they are just used to keep getUnwindDestTokenHelper from repeating work),
- // any node visited must have been exhaustively searched with no information
- // for it found.
- SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
- while (!Worklist.empty()) {
- Instruction *UselessPad = Worklist.pop_back_val();
- auto Memo = MemoMap.find(UselessPad);
- if (Memo != MemoMap.end() && Memo->second) {
- // Here the name 'UselessPad' is a bit of a misnomer, because we've found
- // that it is a funclet that does have information about unwinding to
- // a particular destination; its parent was a useless pad.
- // Since its parent has no information, the unwind edge must not escape
- // the parent, and must target a sibling of this pad. This local unwind
- // gives us no information about EHPad. Leave it and the subtree rooted
- // at it alone.
- assert(getParentPad(Memo->second) == getParentPad(UselessPad));
- continue;
- }
-    // We know we don't have information for UselessPad.  If it has an entry in
- // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
- // added on this invocation of getUnwindDestToken; if a previous invocation
- // recorded nullptr, it would have had to prove that the ancestors of
- // UselessPad, which include LastUselessPad, had no information, and that
- // in turn would have required proving that the descendants of
-    // LastUselessPad, which include EHPad, have no information about
- // LastUselessPad, which would imply that EHPad was mapped to nullptr in
- // the MemoMap on that invocation, which isn't the case if we got here.
- assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
- // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
- // information that we'd be contradicting by making a map entry for it
- // (which is something that getUnwindDestTokenHelper must have proved for
-    // us to get here). Just assert on its direct users here; the checks in
- // this downward walk at its descendants will verify that they don't have
- // any unwind edges that exit 'UselessPad' either (i.e. they either have no
- // unwind edges or unwind to a sibling).
- MemoMap[UselessPad] = UnwindDestToken;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
- assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
- for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
- auto *CatchPad = HandlerBlock->getFirstNonPHI();
- for (User *U : CatchPad->users()) {
- assert(
- (!isa<InvokeInst>(U) ||
- (getParentPad(
- cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
- CatchPad)) &&
- "Expected useless pad");
- if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
- Worklist.push_back(cast<Instruction>(U));
- }
- }
- } else {
- assert(isa<CleanupPadInst>(UselessPad));
- for (User *U : UselessPad->users()) {
- assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
- assert((!isa<InvokeInst>(U) ||
- (getParentPad(
- cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
- UselessPad)) &&
- "Expected useless pad");
- if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
- Worklist.push_back(cast<Instruction>(U));
- }
- }
- }
-
- return UnwindDestToken;
-}
-
-/// When we inline a basic block into an invoke,
-/// we have to turn all of the calls that can throw into invokes.
-/// This function analyzes BB to see if there are any calls, and if so,
-/// it rewrites them to be invokes that jump to InvokeDest and fills in the PHI
-/// nodes in that block with the values specified in InvokeDestPHIValues.
-static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
- BasicBlock *BB, BasicBlock *UnwindEdge,
- UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
- Instruction *I = &*BBI++;
-
- // We only need to check for function calls: inlined invoke
- // instructions require no special handling.
- CallInst *CI = dyn_cast<CallInst>(I);
-
- if (!CI || CI->doesNotThrow() || CI->isInlineAsm())
- continue;
-
- // We do not need to (and in fact, cannot) convert possibly throwing calls
- // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into
- // invokes. The caller's "segment" of the deoptimization continuation
- // attached to the newly inlined @llvm.experimental_deoptimize
- // (resp. @llvm.experimental.guard) call should contain the exception
- // handling logic, if any.
- if (auto *F = CI->getCalledFunction())
- if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
- F->getIntrinsicID() == Intrinsic::experimental_guard)
- continue;
-
- if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
- // This call is nested inside a funclet. If that funclet has an unwind
- // destination within the inlinee, then unwinding out of this call would
- // be UB. Rewriting this call to an invoke which targets the inlined
- // invoke's unwind dest would give the call's parent funclet multiple
- // unwind destinations, which is something that subsequent EH table
-      // generation can't handle and that the verifier rejects. So when we
- // see such a call, leave it as a call.
- auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
- Value *UnwindDestToken =
- getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
- if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
- continue;
-#ifndef NDEBUG
- Instruction *MemoKey;
- if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
- MemoKey = CatchPad->getCatchSwitch();
- else
- MemoKey = FuncletPad;
- assert(FuncletUnwindMap->count(MemoKey) &&
- (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
- "must get memoized to avoid confusing later searches");
-#endif // NDEBUG
- }
-
- changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
- return BB;
- }
- return nullptr;
-}
-
-/// If we inlined an invoke site, we need to convert calls
-/// in the body of the inlined function into invokes.
-///
-/// II is the invoke instruction being inlined. FirstNewBlock is the first
-/// block of the inlined code (the last block is the end of the function),
-/// and InlineCodeInfo is information about the code that got inlined.
-static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock,
- ClonedCodeInfo &InlinedCodeInfo) {
- BasicBlock *InvokeDest = II->getUnwindDest();
-
- Function *Caller = FirstNewBlock->getParent();
-
- // The inlined code is currently at the end of the function, scan from the
- // start of the inlined code to its end, checking for stuff we need to
- // rewrite.
- LandingPadInliningInfo Invoke(II);
-
- // Get all of the inlined landing pad instructions.
- SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
- for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end();
- I != E; ++I)
- if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
- InlinedLPads.insert(II->getLandingPadInst());
-
- // Append the clauses from the outer landing pad instruction into the inlined
- // landing pad instructions.
- LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
- for (LandingPadInst *InlinedLPad : InlinedLPads) {
- unsigned OuterNum = OuterLPad->getNumClauses();
- InlinedLPad->reserveClauses(OuterNum);
- for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
- InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
- if (OuterLPad->isCleanup())
- InlinedLPad->setCleanup(true);
- }
-
- for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
- BB != E; ++BB) {
- if (InlinedCodeInfo.ContainsCalls)
- if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
- &*BB, Invoke.getOuterResumeDest()))
- // Update any PHI nodes in the exceptional block to indicate that there
- // is now a new entry in them.
- Invoke.addIncomingPHIValuesFor(NewBB);
-
- // Forward any resumes that are remaining here.
- if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
- Invoke.forwardResume(RI, InlinedLPads);
- }
-
- // Now that everything is happy, we have one final detail. The PHI nodes in
- // the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
- // PHI node) now.
- InvokeDest->removePredecessor(II->getParent());
-}
-
-/// If we inlined an invoke site, we need to convert calls
-/// in the body of the inlined function into invokes.
-///
-/// II is the invoke instruction being inlined. FirstNewBlock is the first
-/// block of the inlined code (the last block is the end of the function),
-/// and InlineCodeInfo is information about the code that got inlined.
-static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
- ClonedCodeInfo &InlinedCodeInfo) {
- BasicBlock *UnwindDest = II->getUnwindDest();
- Function *Caller = FirstNewBlock->getParent();
-
- assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!");
-
- // If there are PHI nodes in the unwind destination block, we need to keep
- // track of which values came into them from the invoke before removing the
- // edge from this block.
- SmallVector<Value *, 8> UnwindDestPHIValues;
- BasicBlock *InvokeBB = II->getParent();
- for (Instruction &I : *UnwindDest) {
- // Save the value to use for this edge.
- PHINode *PHI = dyn_cast<PHINode>(&I);
- if (!PHI)
- break;
- UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
- }
-
- // Add incoming-PHI values to the unwind destination block for the given basic
- // block, using the values for the original invoke's source block.
- auto UpdatePHINodes = [&](BasicBlock *Src) {
- BasicBlock::iterator I = UnwindDest->begin();
- for (Value *V : UnwindDestPHIValues) {
- PHINode *PHI = cast<PHINode>(I);
- PHI->addIncoming(V, Src);
- ++I;
- }
- };
-
- // This connects all the instructions which 'unwind to caller' to the invoke
- // destination.
- UnwindDestMemoTy FuncletUnwindMap;
- for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
- BB != E; ++BB) {
- if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
- if (CRI->unwindsToCaller()) {
- auto *CleanupPad = CRI->getCleanupPad();
- CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
- CRI->eraseFromParent();
- UpdatePHINodes(&*BB);
- // Finding a cleanupret with an unwind destination would confuse
- // subsequent calls to getUnwindDestToken, so map the cleanuppad
- // to short-circuit any such calls and recognize this as an "unwind
- // to caller" cleanup.
- assert(!FuncletUnwindMap.count(CleanupPad) ||
- isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
- FuncletUnwindMap[CleanupPad] =
- ConstantTokenNone::get(Caller->getContext());
- }
- }
-
- Instruction *I = BB->getFirstNonPHI();
- if (!I->isEHPad())
- continue;
-
- Instruction *Replacement = nullptr;
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
- if (CatchSwitch->unwindsToCaller()) {
- Value *UnwindDestToken;
- if (auto *ParentPad =
- dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
- // This catchswitch is nested inside another funclet. If that
- // funclet has an unwind destination within the inlinee, then
- // unwinding out of this catchswitch would be UB. Rewriting this
- // catchswitch to unwind to the inlined invoke's unwind dest would
- // give the parent funclet multiple unwind destinations, which is
- // something that subsequent EH table generation can't handle and
-          // that the verifier rejects. So when we see such a call, leave it
- // as "unwind to caller".
- UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
- if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
- continue;
- } else {
- // This catchswitch has no parent to inherit constraints from, and
- // none of its descendants can have an unwind edge that exits it and
- // targets another funclet in the inlinee. It may or may not have a
- // descendant that definitively has an unwind to caller. In either
- // case, we'll have to assume that any unwinds out of it may need to
- // be routed to the caller, so treat it as though it has a definitive
- // unwind to caller.
- UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
- }
- auto *NewCatchSwitch = CatchSwitchInst::Create(
- CatchSwitch->getParentPad(), UnwindDest,
- CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
- CatchSwitch);
- for (BasicBlock *PadBB : CatchSwitch->handlers())
- NewCatchSwitch->addHandler(PadBB);
- // Propagate info for the old catchswitch over to the new one in
- // the unwind map. This also serves to short-circuit any subsequent
- // checks for the unwind dest of this catchswitch, which would get
- // confused if they found the outer handler in the callee.
- FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
- Replacement = NewCatchSwitch;
- }
- } else if (!isa<FuncletPadInst>(I)) {
- llvm_unreachable("unexpected EHPad!");
- }
-
- if (Replacement) {
- Replacement->takeName(I);
- I->replaceAllUsesWith(Replacement);
- I->eraseFromParent();
- UpdatePHINodes(&*BB);
- }
- }
-
- if (InlinedCodeInfo.ContainsCalls)
- for (Function::iterator BB = FirstNewBlock->getIterator(),
- E = Caller->end();
- BB != E; ++BB)
- if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
- &*BB, UnwindDest, &FuncletUnwindMap))
- // Update any PHI nodes in the exceptional block to indicate that there
- // is now a new entry in them.
- UpdatePHINodes(NewBB);
-
- // Now that everything is happy, we have one final detail. The PHI nodes in
- // the exception destination block still have entries due to the original
- // invoke instruction. Eliminate these entries (which might even delete the
- // PHI node) now.
- UnwindDest->removePredecessor(InvokeBB);
-}
-
+// Disabled by default, because the added alignment assumptions may increase
+// compile-time and block optimizations. This option is not suitable for use
+// with frontends that emit comprehensive parameter alignment annotations.
+static cl::opt<bool>
+PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
+ cl::init(false), cl::Hidden,
+ cl::desc("Convert align attributes to assumptions during inlining."));
+
+static cl::opt<bool> UpdateReturnAttributes(
+ "update-return-attrs", cl::init(true), cl::Hidden,
+ cl::desc("Update return attributes on calls within inlined body"));
+
+static cl::opt<unsigned> InlinerAttributeWindow(
+ "max-inst-checked-for-throw-during-inlining", cl::Hidden,
+ cl::desc("the maximum number of instructions analyzed for may throw during "
+ "attribute inference in inlined body"),
+ cl::init(4));
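The three options above are ordinary cl::opt flags, so they can be toggled from the opt command line when experimenting with the inliner. A possible invocation (file names and the chosen values are placeholders, not taken from this patch):

    opt -passes='cgscc(inline)' \
        -update-return-attrs=0 \
        -max-inst-checked-for-throw-during-inlining=8 \
        -preserve-alignment-assumptions-during-inlining=1 \
        -S input.ll -o inlined.ll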
+
+namespace {
+
+ /// A class for recording information about inlining a landing pad.
+ class LandingPadInliningInfo {
+ /// Destination of the invoke's unwind.
+ BasicBlock *OuterResumeDest;
+
+ /// Destination for the callee's resume.
+ BasicBlock *InnerResumeDest = nullptr;
+
+ /// LandingPadInst associated with the invoke.
+ LandingPadInst *CallerLPad = nullptr;
+
+ /// PHI for EH values from landingpad insts.
+ PHINode *InnerEHValuesPHI = nullptr;
+
+ SmallVector<Value*, 8> UnwindDestPHIValues;
+
+ public:
+ LandingPadInliningInfo(InvokeInst *II)
+ : OuterResumeDest(II->getUnwindDest()) {
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing
+ // the edge from this block.
+ BasicBlock *InvokeBB = II->getParent();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (; isa<PHINode>(I); ++I) {
+ // Save the value to use for this edge.
+ PHINode *PHI = cast<PHINode>(I);
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ CallerLPad = cast<LandingPadInst>(I);
+ }
+
+ /// The outer unwind destination is the target of
+ /// unwind edges introduced for calls within the inlined function.
+ BasicBlock *getOuterResumeDest() const {
+ return OuterResumeDest;
+ }
+
+ BasicBlock *getInnerResumeDest();
+
+ LandingPadInst *getLandingPadInst() const { return CallerLPad; }
+
+ /// Forward the 'resume' instruction to the caller's landing pad block.
+ /// When the landing pad block has only one predecessor, this is
+ /// a simple branch. When there is more than one predecessor, we need to
+ /// split the landing pad block after the landingpad instruction and jump
+ /// to there.
+ void forwardResume(ResumeInst *RI,
+ SmallPtrSetImpl<LandingPadInst*> &InlinedLPads);
+
+ /// Add incoming-PHI values to the unwind destination block for the given
+ /// basic block, using the values for the original invoke's source block.
+ void addIncomingPHIValuesFor(BasicBlock *BB) const {
+ addIncomingPHIValuesForInto(BB, OuterResumeDest);
+ }
+
+ void addIncomingPHIValuesForInto(BasicBlock *src, BasicBlock *dest) const {
+ BasicBlock::iterator I = dest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *phi = cast<PHINode>(I);
+ phi->addIncoming(UnwindDestPHIValues[i], src);
+ }
+ }
+ };
+
+} // end anonymous namespace
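A condensed sketch of how this helper is typically driven once the callee's blocks have been cloned into the caller; it mirrors the loop in HandleInlinedLandingPad further below, assuming II, FirstNewBlock, Caller and InlinedLPads are the corresponding local variables there:

    LandingPadInliningInfo Invoke(II);
    for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
         BB != E; ++BB)
      if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
        Invoke.forwardResume(RI, InlinedLPads); // turn 'resume' into a branch
                                                // to the caller's landing pad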
+
+/// Get or create a target for the branch from ResumeInsts.
+BasicBlock *LandingPadInliningInfo::getInnerResumeDest() {
+ if (InnerResumeDest) return InnerResumeDest;
+
+ // Split the landing pad.
+ BasicBlock::iterator SplitPoint = ++CallerLPad->getIterator();
+ InnerResumeDest =
+ OuterResumeDest->splitBasicBlock(SplitPoint,
+ OuterResumeDest->getName() + ".body");
+
+ // The number of incoming edges we expect to the inner landing pad.
+ const unsigned PHICapacity = 2;
+
+ // Create corresponding new PHIs for all the PHIs in the outer landing pad.
+ Instruction *InsertPoint = &InnerResumeDest->front();
+ BasicBlock::iterator I = OuterResumeDest->begin();
+ for (unsigned i = 0, e = UnwindDestPHIValues.size(); i != e; ++i, ++I) {
+ PHINode *OuterPHI = cast<PHINode>(I);
+ PHINode *InnerPHI = PHINode::Create(OuterPHI->getType(), PHICapacity,
+ OuterPHI->getName() + ".lpad-body",
+ InsertPoint);
+ OuterPHI->replaceAllUsesWith(InnerPHI);
+ InnerPHI->addIncoming(OuterPHI, OuterResumeDest);
+ }
+
+ // Create a PHI for the exception values.
+ InnerEHValuesPHI = PHINode::Create(CallerLPad->getType(), PHICapacity,
+ "eh.lpad-body", InsertPoint);
+ CallerLPad->replaceAllUsesWith(InnerEHValuesPHI);
+ InnerEHValuesPHI->addIncoming(CallerLPad, OuterResumeDest);
+
+ // All done.
+ return InnerResumeDest;
+}
+
+/// Forward the 'resume' instruction to the caller's landing pad block.
+/// When the landing pad block has only one predecessor, this is a simple
+/// branch. When there is more than one predecessor, we need to split the
+/// landing pad block after the landingpad instruction and jump to there.
+void LandingPadInliningInfo::forwardResume(
+ ResumeInst *RI, SmallPtrSetImpl<LandingPadInst *> &InlinedLPads) {
+ BasicBlock *Dest = getInnerResumeDest();
+ BasicBlock *Src = RI->getParent();
+
+ BranchInst::Create(Dest, Src);
+
+ // Update the PHIs in the destination. They were inserted in an order which
+ // makes this work.
+ addIncomingPHIValuesForInto(Src, Dest);
+
+ InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src);
+ RI->eraseFromParent();
+}
+
+/// Helper for getUnwindDestToken/getUnwindDestTokenHelper.
+static Value *getParentPad(Value *EHPad) {
+ if (auto *FPI = dyn_cast<FuncletPadInst>(EHPad))
+ return FPI->getParentPad();
+ return cast<CatchSwitchInst>(EHPad)->getParentPad();
+}
+
+using UnwindDestMemoTy = DenseMap<Instruction *, Value *>;
+
+/// Helper for getUnwindDestToken that does the descendant-ward part of
+/// the search.
+static Value *getUnwindDestTokenHelper(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ SmallVector<Instruction *, 8> Worklist(1, EHPad);
+
+ while (!Worklist.empty()) {
+ Instruction *CurrentPad = Worklist.pop_back_val();
+ // We only put pads on the worklist that aren't in the MemoMap. When
+ // we find an unwind dest for a pad we may update its ancestors, but
+ // the queue only ever contains uncles/great-uncles/etc. of CurrentPad,
+ // so they should never get updated while queued on the worklist.
+ assert(!MemoMap.count(CurrentPad));
+ Value *UnwindDestToken = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(CurrentPad)) {
+ if (CatchSwitch->hasUnwindDest()) {
+ UnwindDestToken = CatchSwitch->getUnwindDest()->getFirstNonPHI();
+ } else {
+ // Catchswitch doesn't have a 'nounwind' variant, and one might be
+ // annotated as "unwinds to caller" when really it's nounwind (see
+ // e.g. SimplifyCFGOpt::SimplifyUnreachable), so we can't infer the
+ // parent's unwind dest from this. We can check its catchpads'
+ // descendants, since they might include a cleanuppad with an
+ // "unwinds to caller" cleanupret, which can be trusted.
+ for (auto HI = CatchSwitch->handler_begin(),
+ HE = CatchSwitch->handler_end();
+ HI != HE && !UnwindDestToken; ++HI) {
+ BasicBlock *HandlerBlock = *HI;
+ auto *CatchPad = cast<CatchPadInst>(HandlerBlock->getFirstNonPHI());
+ for (User *Child : CatchPad->users()) {
+ // Intentionally ignore invokes here -- since the catchswitch is
+ // marked "unwind to caller", it would be a verifier error if it
+ // contained an invoke which unwinds out of it, so any invoke we'd
+ // encounter must unwind to some child of the catch.
+ if (!isa<CleanupPadInst>(Child) && !isa<CatchSwitchInst>(Child))
+ continue;
+
+ Instruction *ChildPad = cast<Instruction>(Child);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't figured out this child pad yet; queue it.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've already checked this child, but might have found that
+ // it offers no proof either way.
+ Value *ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ // We already know the child's unwind dest, which can either
+ // be ConstantTokenNone to indicate unwind to caller, or can
+ // be another child of the catchpad. Only the former indicates
+ // the unwind dest of the catchswitch.
+ if (isa<ConstantTokenNone>(ChildUnwindDestToken)) {
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ assert(getParentPad(ChildUnwindDestToken) == CatchPad);
+ }
+ }
+ }
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(CurrentPad);
+ for (User *U : CleanupPad->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ if (BasicBlock *RetUnwindDest = CleanupRet->getUnwindDest())
+ UnwindDestToken = RetUnwindDest->getFirstNonPHI();
+ else
+ UnwindDestToken = ConstantTokenNone::get(CleanupPad->getContext());
+ break;
+ }
+ Value *ChildUnwindDestToken;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ ChildUnwindDestToken = Invoke->getUnwindDest()->getFirstNonPHI();
+ } else if (isa<CleanupPadInst>(U) || isa<CatchSwitchInst>(U)) {
+ Instruction *ChildPad = cast<Instruction>(U);
+ auto Memo = MemoMap.find(ChildPad);
+ if (Memo == MemoMap.end()) {
+ // Haven't resolved this child yet; queue it and keep searching.
+ Worklist.push_back(ChildPad);
+ continue;
+ }
+ // We've checked this child, but still need to ignore it if it
+ // had no proof either way.
+ ChildUnwindDestToken = Memo->second;
+ if (!ChildUnwindDestToken)
+ continue;
+ } else {
+ // Not a relevant user of the cleanuppad
+ continue;
+ }
+ // In a well-formed program, the child/invoke must either unwind to
+ // an(other) child of the cleanup, or exit the cleanup. In the
+ // first case, continue searching.
+ if (isa<Instruction>(ChildUnwindDestToken) &&
+ getParentPad(ChildUnwindDestToken) == CleanupPad)
+ continue;
+ UnwindDestToken = ChildUnwindDestToken;
+ break;
+ }
+ }
+ // If we haven't found an unwind dest for CurrentPad, we may have queued its
+ // children, so move on to the next in the worklist.
+ if (!UnwindDestToken)
+ continue;
+
+ // Now we know that CurrentPad unwinds to UnwindDestToken. It also exits
+ // any ancestors of CurrentPad up to but not including UnwindDestToken's
+ // parent pad. Record this in the memo map, and check to see if the
+ // original EHPad being queried is one of the ones exited.
+ Value *UnwindParent;
+ if (auto *UnwindPad = dyn_cast<Instruction>(UnwindDestToken))
+ UnwindParent = getParentPad(UnwindPad);
+ else
+ UnwindParent = nullptr;
+ bool ExitedOriginalPad = false;
+ for (Instruction *ExitedPad = CurrentPad;
+ ExitedPad && ExitedPad != UnwindParent;
+ ExitedPad = dyn_cast<Instruction>(getParentPad(ExitedPad))) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(ExitedPad))
+ continue;
+ MemoMap[ExitedPad] = UnwindDestToken;
+ ExitedOriginalPad |= (ExitedPad == EHPad);
+ }
+
+ if (ExitedOriginalPad)
+ return UnwindDestToken;
+
+ // Continue the search.
+ }
+
+ // No definitive information is contained within this funclet.
+ return nullptr;
+}
+
+/// Given an EH pad, find where it unwinds. If it unwinds to an EH pad,
+/// return that pad instruction. If it unwinds to caller, return
+/// ConstantTokenNone. If it does not have a definitive unwind destination,
+/// return nullptr.
+///
+/// This routine gets invoked for calls in funclets in inlinees when inlining
+/// an invoke. Since many funclets don't have calls inside them, it's queried
+/// on-demand rather than building a map of pads to unwind dests up front.
+/// Determining a funclet's unwind dest may require recursively searching its
+/// descendants, and also ancestors and cousins if the descendants don't provide
+/// an answer. Since most funclets will have their unwind dest immediately
+/// available as the unwind dest of a catchswitch or cleanupret, this routine
+/// searches top-down from the given pad and then up. To avoid worst-case
+/// quadratic run-time given that approach, it uses a memo map to avoid
+/// re-processing funclet trees. The callers that rewrite the IR as they go
+/// take advantage of this, for correctness, by checking/forcing rewritten
+/// pads' entries to match the original callee view.
+static Value *getUnwindDestToken(Instruction *EHPad,
+ UnwindDestMemoTy &MemoMap) {
+ // Catchpads unwind to the same place as their catchswitch;
+  // redirect any queries on catchpads so the code below can
+ // deal with just catchswitches and cleanuppads.
+ if (auto *CPI = dyn_cast<CatchPadInst>(EHPad))
+ EHPad = CPI->getCatchSwitch();
+
+ // Check if we've already determined the unwind dest for this pad.
+ auto Memo = MemoMap.find(EHPad);
+ if (Memo != MemoMap.end())
+ return Memo->second;
+
+ // Search EHPad and, if necessary, its descendants.
+ Value *UnwindDestToken = getUnwindDestTokenHelper(EHPad, MemoMap);
+ assert((UnwindDestToken == nullptr) != (MemoMap.count(EHPad) != 0));
+ if (UnwindDestToken)
+ return UnwindDestToken;
+
+ // No information is available for this EHPad from itself or any of its
+ // descendants. An unwind all the way out to a pad in the caller would
+ // need also to agree with the unwind dest of the parent funclet, so
+ // search up the chain to try to find a funclet with information. Put
+ // null entries in the memo map to avoid re-processing as we go up.
+ MemoMap[EHPad] = nullptr;
+#ifndef NDEBUG
+ SmallPtrSet<Instruction *, 4> TempMemos;
+ TempMemos.insert(EHPad);
+#endif
+ Instruction *LastUselessPad = EHPad;
+ Value *AncestorToken;
+ for (AncestorToken = getParentPad(EHPad);
+ auto *AncestorPad = dyn_cast<Instruction>(AncestorToken);
+ AncestorToken = getParentPad(AncestorToken)) {
+ // Skip over catchpads since they just follow their catchswitches.
+ if (isa<CatchPadInst>(AncestorPad))
+ continue;
+ // If the MemoMap had an entry mapping AncestorPad to nullptr, since we
+ // haven't yet called getUnwindDestTokenHelper for AncestorPad in this
+ // call to getUnwindDestToken, that would mean that AncestorPad had no
+ // information in itself, its descendants, or its ancestors. If that
+ // were the case, then we should also have recorded the lack of information
+ // for the descendant that we're coming from. So assert that we don't
+ // find a null entry in the MemoMap for AncestorPad.
+ assert(!MemoMap.count(AncestorPad) || MemoMap[AncestorPad]);
+ auto AncestorMemo = MemoMap.find(AncestorPad);
+ if (AncestorMemo == MemoMap.end()) {
+ UnwindDestToken = getUnwindDestTokenHelper(AncestorPad, MemoMap);
+ } else {
+ UnwindDestToken = AncestorMemo->second;
+ }
+ if (UnwindDestToken)
+ break;
+ LastUselessPad = AncestorPad;
+ MemoMap[LastUselessPad] = nullptr;
+#ifndef NDEBUG
+ TempMemos.insert(LastUselessPad);
+#endif
+ }
+
+ // We know that getUnwindDestTokenHelper was called on LastUselessPad and
+ // returned nullptr (and likewise for EHPad and any of its ancestors up to
+ // LastUselessPad), so LastUselessPad has no information from below. Since
+ // getUnwindDestTokenHelper must investigate all downward paths through
+ // no-information nodes to prove that a node has no information like this,
+ // and since any time it finds information it records it in the MemoMap for
+ // not just the immediately-containing funclet but also any ancestors also
+ // exited, it must be the case that, walking downward from LastUselessPad,
+ // visiting just those nodes which have not been mapped to an unwind dest
+ // by getUnwindDestTokenHelper (the nullptr TempMemos notwithstanding, since
+ // they are just used to keep getUnwindDestTokenHelper from repeating work),
+ // any node visited must have been exhaustively searched with no information
+ // for it found.
+ SmallVector<Instruction *, 8> Worklist(1, LastUselessPad);
+ while (!Worklist.empty()) {
+ Instruction *UselessPad = Worklist.pop_back_val();
+ auto Memo = MemoMap.find(UselessPad);
+ if (Memo != MemoMap.end() && Memo->second) {
+ // Here the name 'UselessPad' is a bit of a misnomer, because we've found
+ // that it is a funclet that does have information about unwinding to
+ // a particular destination; its parent was a useless pad.
+ // Since its parent has no information, the unwind edge must not escape
+ // the parent, and must target a sibling of this pad. This local unwind
+ // gives us no information about EHPad. Leave it and the subtree rooted
+ // at it alone.
+ assert(getParentPad(Memo->second) == getParentPad(UselessPad));
+ continue;
+ }
+    // We know we don't have information for UselessPad. If it has an entry in
+ // the MemoMap (mapping it to nullptr), it must be one of the TempMemos
+ // added on this invocation of getUnwindDestToken; if a previous invocation
+ // recorded nullptr, it would have had to prove that the ancestors of
+ // UselessPad, which include LastUselessPad, had no information, and that
+ // in turn would have required proving that the descendants of
+    // LastUselessPad, which include EHPad, have no information about
+ // LastUselessPad, which would imply that EHPad was mapped to nullptr in
+ // the MemoMap on that invocation, which isn't the case if we got here.
+ assert(!MemoMap.count(UselessPad) || TempMemos.count(UselessPad));
+ // Assert as we enumerate users that 'UselessPad' doesn't have any unwind
+ // information that we'd be contradicting by making a map entry for it
+ // (which is something that getUnwindDestTokenHelper must have proved for
+    // us to get here). Just assert on its direct users here; the checks in
+ // this downward walk at its descendants will verify that they don't have
+ // any unwind edges that exit 'UselessPad' either (i.e. they either have no
+ // unwind edges or unwind to a sibling).
+ MemoMap[UselessPad] = UnwindDestToken;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(UselessPad)) {
+ assert(CatchSwitch->getUnwindDest() == nullptr && "Expected useless pad");
+ for (BasicBlock *HandlerBlock : CatchSwitch->handlers()) {
+ auto *CatchPad = HandlerBlock->getFirstNonPHI();
+ for (User *U : CatchPad->users()) {
+ assert(
+ (!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ CatchPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ } else {
+ assert(isa<CleanupPadInst>(UselessPad));
+ for (User *U : UselessPad->users()) {
+ assert(!isa<CleanupReturnInst>(U) && "Expected useless pad");
+ assert((!isa<InvokeInst>(U) ||
+ (getParentPad(
+ cast<InvokeInst>(U)->getUnwindDest()->getFirstNonPHI()) ==
+ UselessPad)) &&
+ "Expected useless pad");
+ if (isa<CatchSwitchInst>(U) || isa<CleanupPadInst>(U))
+ Worklist.push_back(cast<Instruction>(U));
+ }
+ }
+ }
+
+ return UnwindDestToken;
+}
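A short usage sketch of the contract described above; SomePad is a hypothetical funclet pad, and the memo map is shared across queries so repeated lookups stay cheap:

    UnwindDestMemoTy FuncletUnwindMap;  // kept alive across all queries
    if (Value *Tok = getUnwindDestToken(SomePad, FuncletUnwindMap)) {
      if (isa<ConstantTokenNone>(Tok)) {
        // SomePad definitively unwinds to the caller.
      } else {
        // Tok is the EH pad (first non-PHI) of SomePad's unwind destination.
      }
    } else {
      // No definitive unwind destination could be determined.
    }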
+
+/// When we inline a basic block into an invoke,
+/// we have to turn all of the calls that can throw into invokes.
+/// This function analyzes BB to see if there are any calls, and if so,
+/// it rewrites them to be invokes that unwind to UnwindEdge; callers then
+/// add the matching PHI entries in UnwindEdge for the new edges.
+static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
+ BasicBlock *BB, BasicBlock *UnwindEdge,
+ UnwindDestMemoTy *FuncletUnwindMap = nullptr) {
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+ Instruction *I = &*BBI++;
+
+ // We only need to check for function calls: inlined invoke
+ // instructions require no special handling.
+ CallInst *CI = dyn_cast<CallInst>(I);
+
+ if (!CI || CI->doesNotThrow() || CI->isInlineAsm())
+ continue;
+
+ // We do not need to (and in fact, cannot) convert possibly throwing calls
+ // to @llvm.experimental_deoptimize (resp. @llvm.experimental.guard) into
+ // invokes. The caller's "segment" of the deoptimization continuation
+ // attached to the newly inlined @llvm.experimental_deoptimize
+ // (resp. @llvm.experimental.guard) call should contain the exception
+ // handling logic, if any.
+ if (auto *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize ||
+ F->getIntrinsicID() == Intrinsic::experimental_guard)
+ continue;
+
+ if (auto FuncletBundle = CI->getOperandBundle(LLVMContext::OB_funclet)) {
+ // This call is nested inside a funclet. If that funclet has an unwind
+ // destination within the inlinee, then unwinding out of this call would
+ // be UB. Rewriting this call to an invoke which targets the inlined
+ // invoke's unwind dest would give the call's parent funclet multiple
+ // unwind destinations, which is something that subsequent EH table
+      // generation can't handle and that the verifier rejects. So when we
+ // see such a call, leave it as a call.
+ auto *FuncletPad = cast<Instruction>(FuncletBundle->Inputs[0]);
+ Value *UnwindDestToken =
+ getUnwindDestToken(FuncletPad, *FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+#ifndef NDEBUG
+ Instruction *MemoKey;
+ if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+ MemoKey = CatchPad->getCatchSwitch();
+ else
+ MemoKey = FuncletPad;
+ assert(FuncletUnwindMap->count(MemoKey) &&
+ (*FuncletUnwindMap)[MemoKey] == UnwindDestToken &&
+ "must get memoized to avoid confusing later searches");
+#endif // NDEBUG
+ }
+
+ changeToInvokeAndSplitBasicBlock(CI, UnwindEdge);
+ return BB;
+ }
+ return nullptr;
+}
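A minimal driver sketch for the return value: a non-null result means a call in BB was promoted to an invoke (and BB was split), so the caller must add PHI entries for the new unwind edge; the instructions that followed the promoted call now live in the split-off successor and are revisited when the outer loop reaches that block. UpdatePHIs stands for whatever PHI-updating callback the caller uses (see the real loops in the two Handle* functions below):

    for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
         BB != E; ++BB)
      if (BasicBlock *Split = HandleCallsInBlockInlinedThroughInvoke(
              &*BB, UnwindDest, &FuncletUnwindMap))
        UpdatePHIs(Split);  // the new invoke in 'Split' unwinds to UnwindDest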
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedLandingPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *InvokeDest = II->getUnwindDest();
+
+ Function *Caller = FirstNewBlock->getParent();
+
+ // The inlined code is currently at the end of the function, scan from the
+ // start of the inlined code to its end, checking for stuff we need to
+ // rewrite.
+ LandingPadInliningInfo Invoke(II);
+
+ // Get all of the inlined landing pad instructions.
+ SmallPtrSet<LandingPadInst*, 16> InlinedLPads;
+ for (Function::iterator I = FirstNewBlock->getIterator(), E = Caller->end();
+ I != E; ++I)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator()))
+ InlinedLPads.insert(II->getLandingPadInst());
+
+ // Append the clauses from the outer landing pad instruction into the inlined
+ // landing pad instructions.
+ LandingPadInst *OuterLPad = Invoke.getLandingPadInst();
+ for (LandingPadInst *InlinedLPad : InlinedLPads) {
+ unsigned OuterNum = OuterLPad->getNumClauses();
+ InlinedLPad->reserveClauses(OuterNum);
+ for (unsigned OuterIdx = 0; OuterIdx != OuterNum; ++OuterIdx)
+ InlinedLPad->addClause(OuterLPad->getClause(OuterIdx));
+ if (OuterLPad->isCleanup())
+ InlinedLPad->setCleanup(true);
+ }
+
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (InlinedCodeInfo.ContainsCalls)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, Invoke.getOuterResumeDest()))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ Invoke.addIncomingPHIValuesFor(NewBB);
+
+ // Forward any resumes that are remaining here.
+ if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator()))
+ Invoke.forwardResume(RI, InlinedLPads);
+ }
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ InvokeDest->removePredecessor(II->getParent());
+}
+
+/// If we inlined an invoke site, we need to convert calls
+/// in the body of the inlined function into invokes.
+///
+/// II is the invoke instruction being inlined. FirstNewBlock is the first
+/// block of the inlined code (the last block is the end of the function),
+/// and InlineCodeInfo is information about the code that got inlined.
+static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
+ ClonedCodeInfo &InlinedCodeInfo) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Function *Caller = FirstNewBlock->getParent();
+
+ assert(UnwindDest->getFirstNonPHI()->isEHPad() && "unexpected BasicBlock!");
+
+ // If there are PHI nodes in the unwind destination block, we need to keep
+ // track of which values came into them from the invoke before removing the
+ // edge from this block.
+ SmallVector<Value *, 8> UnwindDestPHIValues;
+ BasicBlock *InvokeBB = II->getParent();
+ for (Instruction &I : *UnwindDest) {
+ // Save the value to use for this edge.
+ PHINode *PHI = dyn_cast<PHINode>(&I);
+ if (!PHI)
+ break;
+ UnwindDestPHIValues.push_back(PHI->getIncomingValueForBlock(InvokeBB));
+ }
+
+ // Add incoming-PHI values to the unwind destination block for the given basic
+ // block, using the values for the original invoke's source block.
+ auto UpdatePHINodes = [&](BasicBlock *Src) {
+ BasicBlock::iterator I = UnwindDest->begin();
+ for (Value *V : UnwindDestPHIValues) {
+ PHINode *PHI = cast<PHINode>(I);
+ PHI->addIncoming(V, Src);
+ ++I;
+ }
+ };
+
+ // This connects all the instructions which 'unwind to caller' to the invoke
+ // destination.
+ UnwindDestMemoTy FuncletUnwindMap;
+ for (Function::iterator BB = FirstNewBlock->getIterator(), E = Caller->end();
+ BB != E; ++BB) {
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
+ if (CRI->unwindsToCaller()) {
+ auto *CleanupPad = CRI->getCleanupPad();
+ CleanupReturnInst::Create(CleanupPad, UnwindDest, CRI);
+ CRI->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ // Finding a cleanupret with an unwind destination would confuse
+ // subsequent calls to getUnwindDestToken, so map the cleanuppad
+ // to short-circuit any such calls and recognize this as an "unwind
+ // to caller" cleanup.
+ assert(!FuncletUnwindMap.count(CleanupPad) ||
+ isa<ConstantTokenNone>(FuncletUnwindMap[CleanupPad]));
+ FuncletUnwindMap[CleanupPad] =
+ ConstantTokenNone::get(Caller->getContext());
+ }
+ }
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ Instruction *Replacement = nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (CatchSwitch->unwindsToCaller()) {
+ Value *UnwindDestToken;
+ if (auto *ParentPad =
+ dyn_cast<Instruction>(CatchSwitch->getParentPad())) {
+ // This catchswitch is nested inside another funclet. If that
+ // funclet has an unwind destination within the inlinee, then
+ // unwinding out of this catchswitch would be UB. Rewriting this
+ // catchswitch to unwind to the inlined invoke's unwind dest would
+ // give the parent funclet multiple unwind destinations, which is
+ // something that subsequent EH table generation can't handle and
+          // that the verifier rejects. So when we see such a call, leave it
+ // as "unwind to caller".
+ UnwindDestToken = getUnwindDestToken(ParentPad, FuncletUnwindMap);
+ if (UnwindDestToken && !isa<ConstantTokenNone>(UnwindDestToken))
+ continue;
+ } else {
+ // This catchswitch has no parent to inherit constraints from, and
+ // none of its descendants can have an unwind edge that exits it and
+ // targets another funclet in the inlinee. It may or may not have a
+ // descendant that definitively has an unwind to caller. In either
+ // case, we'll have to assume that any unwinds out of it may need to
+ // be routed to the caller, so treat it as though it has a definitive
+ // unwind to caller.
+ UnwindDestToken = ConstantTokenNone::get(Caller->getContext());
+ }
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), UnwindDest,
+ CatchSwitch->getNumHandlers(), CatchSwitch->getName(),
+ CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+ // Propagate info for the old catchswitch over to the new one in
+ // the unwind map. This also serves to short-circuit any subsequent
+ // checks for the unwind dest of this catchswitch, which would get
+ // confused if they found the outer handler in the callee.
+ FuncletUnwindMap[NewCatchSwitch] = UnwindDestToken;
+ Replacement = NewCatchSwitch;
+ }
+ } else if (!isa<FuncletPadInst>(I)) {
+ llvm_unreachable("unexpected EHPad!");
+ }
+
+ if (Replacement) {
+ Replacement->takeName(I);
+ I->replaceAllUsesWith(Replacement);
+ I->eraseFromParent();
+ UpdatePHINodes(&*BB);
+ }
+ }
+
+ if (InlinedCodeInfo.ContainsCalls)
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB)
+ if (BasicBlock *NewBB = HandleCallsInBlockInlinedThroughInvoke(
+ &*BB, UnwindDest, &FuncletUnwindMap))
+ // Update any PHI nodes in the exceptional block to indicate that there
+ // is now a new entry in them.
+ UpdatePHINodes(NewBB);
+
+ // Now that everything is happy, we have one final detail. The PHI nodes in
+ // the exception destination block still have entries due to the original
+ // invoke instruction. Eliminate these entries (which might even delete the
+ // PHI node) now.
+ UnwindDest->removePredecessor(InvokeBB);
+}
+
/// When inlining a call site that has !llvm.mem.parallel_loop_access,
/// !llvm.access.group, !alias.scope or !noalias metadata, that metadata should
/// be propagated to all memory-accessing cloned instructions.
@@ -788,14 +788,14 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
MDNode *AliasScope = CB.getMetadata(LLVMContext::MD_alias_scope);
MDNode *NoAlias = CB.getMetadata(LLVMContext::MD_noalias);
if (!MemParallelLoopAccess && !AccessGroup && !AliasScope && !NoAlias)
- return;
-
+ return;
+
for (BasicBlock &BB : make_range(FStart, FEnd)) {
for (Instruction &I : BB) {
// This metadata is only relevant for instructions that access memory.
if (!I.mayReadOrWriteMemory())
continue;
-
+
if (MemParallelLoopAccess) {
        // TODO: This probably should not overwrite MemParallelLoopAccess.
MemParallelLoopAccess = MDNode::concatenate(
@@ -804,7 +804,7 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
I.setMetadata(LLVMContext::MD_mem_parallel_loop_access,
MemParallelLoopAccess);
}
-
+
if (AccessGroup)
I.setMetadata(LLVMContext::MD_access_group, uniteAccessGroups(
I.getMetadata(LLVMContext::MD_access_group), AccessGroup));
@@ -817,22 +817,22 @@ static void PropagateCallSiteMetadata(CallBase &CB, Function::iterator FStart,
I.setMetadata(LLVMContext::MD_noalias, MDNode::concatenate(
I.getMetadata(LLVMContext::MD_noalias), NoAlias));
}
- }
-}
-
+ }
+}
+
/// Utility for cloning !noalias and !alias.scope metadata. When a code region
/// using scoped alias metadata is inlined, the aliasing relationships may not
 /// hold between the two versions. It is necessary to create a deep clone of the
/// metadata, putting the two versions in separate scope domains.
class ScopedAliasMetadataDeepCloner {
using MetadataMap = DenseMap<const MDNode *, TrackingMDNodeRef>;
- SetVector<const MDNode *> MD;
+ SetVector<const MDNode *> MD;
MetadataMap MDMap;
void addRecursiveMetadataUses();
-
+
public:
ScopedAliasMetadataDeepCloner(const Function *F);
-
+
/// Create a new clone of the scoped alias metadata, which will be used by
/// subsequent remap() calls.
void clone();
@@ -847,59 +847,59 @@ ScopedAliasMetadataDeepCloner::ScopedAliasMetadataDeepCloner(
for (const BasicBlock &BB : *F) {
for (const Instruction &I : BB) {
if (const MDNode *M = I.getMetadata(LLVMContext::MD_alias_scope))
- MD.insert(M);
+ MD.insert(M);
if (const MDNode *M = I.getMetadata(LLVMContext::MD_noalias))
- MD.insert(M);
+ MD.insert(M);
// We also need to clone the metadata in noalias intrinsics.
if (const auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
MD.insert(Decl->getScopeList());
- }
+ }
}
addRecursiveMetadataUses();
}
-
+
void ScopedAliasMetadataDeepCloner::addRecursiveMetadataUses() {
- SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
- while (!Queue.empty()) {
- const MDNode *M = cast<MDNode>(Queue.pop_back_val());
+ SmallVector<const Metadata *, 16> Queue(MD.begin(), MD.end());
+ while (!Queue.empty()) {
+ const MDNode *M = cast<MDNode>(Queue.pop_back_val());
for (const Metadata *Op : M->operands())
if (const MDNode *OpMD = dyn_cast<MDNode>(Op))
if (MD.insert(OpMD))
Queue.push_back(OpMD);
- }
+ }
}
-
+
void ScopedAliasMetadataDeepCloner::clone() {
assert(MDMap.empty() && "clone() already called ?");
- SmallVector<TempMDTuple, 16> DummyNodes;
- for (const MDNode *I : MD) {
+ SmallVector<TempMDTuple, 16> DummyNodes;
+ for (const MDNode *I : MD) {
DummyNodes.push_back(MDTuple::getTemporary(I->getContext(), None));
- MDMap[I].reset(DummyNodes.back().get());
- }
-
- // Create new metadata nodes to replace the dummy nodes, replacing old
- // metadata references with either a dummy node or an already-created new
- // node.
+ MDMap[I].reset(DummyNodes.back().get());
+ }
+
+ // Create new metadata nodes to replace the dummy nodes, replacing old
+ // metadata references with either a dummy node or an already-created new
+ // node.
SmallVector<Metadata *, 4> NewOps;
- for (const MDNode *I : MD) {
+ for (const MDNode *I : MD) {
for (const Metadata *Op : I->operands()) {
if (const MDNode *M = dyn_cast<MDNode>(Op))
- NewOps.push_back(MDMap[M]);
- else
+ NewOps.push_back(MDMap[M]);
+ else
NewOps.push_back(const_cast<Metadata *>(Op));
- }
-
+ }
+
MDNode *NewM = MDNode::get(I->getContext(), NewOps);
- MDTuple *TempM = cast<MDTuple>(MDMap[I]);
- assert(TempM->isTemporary() && "Expected temporary node");
-
- TempM->replaceAllUsesWith(NewM);
+ MDTuple *TempM = cast<MDTuple>(MDMap[I]);
+ assert(TempM->isTemporary() && "Expected temporary node");
+
+ TempM->replaceAllUsesWith(NewM);
NewOps.clear();
- }
+ }
}
-
+
void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
Function::iterator FEnd) {
if (MDMap.empty())
@@ -912,71 +912,71 @@ void ScopedAliasMetadataDeepCloner::remap(Function::iterator FStart,
if (MDNode *M = I.getMetadata(LLVMContext::MD_alias_scope))
if (MDNode *MNew = MDMap.lookup(M))
I.setMetadata(LLVMContext::MD_alias_scope, MNew);
-
+
if (MDNode *M = I.getMetadata(LLVMContext::MD_noalias))
if (MDNode *MNew = MDMap.lookup(M))
I.setMetadata(LLVMContext::MD_noalias, MNew);
-
+
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
if (MDNode *MNew = MDMap.lookup(Decl->getScopeList()))
Decl->setScopeList(MNew);
}
- }
-}
-
-/// If the inlined function has noalias arguments,
-/// then add new alias scopes for each noalias argument, tag the mapped noalias
-/// parameters with noalias metadata specifying the new scope, and tag all
-/// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
- const DataLayout &DL, AAResults *CalleeAAR) {
- if (!EnableNoAliasConversion)
- return;
-
- const Function *CalledFunc = CB.getCalledFunction();
- SmallVector<const Argument *, 4> NoAliasArgs;
-
- for (const Argument &Arg : CalledFunc->args())
- if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
- NoAliasArgs.push_back(&Arg);
-
- if (NoAliasArgs.empty())
- return;
-
- // To do a good job, if a noalias variable is captured, we need to know if
- // the capture point dominates the particular use we're considering.
- DominatorTree DT;
- DT.recalculate(const_cast<Function&>(*CalledFunc));
-
- // noalias indicates that pointer values based on the argument do not alias
- // pointer values which are not based on it. So we add a new "scope" for each
- // noalias function argument. Accesses using pointers based on that argument
- // become part of that alias scope, accesses using pointers not based on that
- // argument are tagged as noalias with that scope.
-
- DenseMap<const Argument *, MDNode *> NewScopes;
- MDBuilder MDB(CalledFunc->getContext());
-
- // Create a new scope domain for this function.
- MDNode *NewDomain =
- MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
- for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
- const Argument *A = NoAliasArgs[i];
-
- std::string Name = std::string(CalledFunc->getName());
- if (A->hasName()) {
- Name += ": %";
- Name += A->getName();
- } else {
- Name += ": argument ";
- Name += utostr(i);
- }
-
- // Note: We always create a new anonymous root here. This is true regardless
- // of the linkage of the callee because the aliasing "scope" is not just a
- // property of the callee, but also all control dependencies in the caller.
- MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
- NewScopes.insert(std::make_pair(A, NewScope));
+ }
+}
+
+/// If the inlined function has noalias arguments,
+/// then add new alias scopes for each noalias argument, tag the mapped noalias
+/// parameters with noalias metadata specifying the new scope, and tag all
+/// non-derived loads, stores and memory intrinsics with the new alias scopes.
+static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
+ const DataLayout &DL, AAResults *CalleeAAR) {
+ if (!EnableNoAliasConversion)
+ return;
+
+ const Function *CalledFunc = CB.getCalledFunction();
+ SmallVector<const Argument *, 4> NoAliasArgs;
+
+ for (const Argument &Arg : CalledFunc->args())
+ if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
+ NoAliasArgs.push_back(&Arg);
+
+ if (NoAliasArgs.empty())
+ return;
+
+ // To do a good job, if a noalias variable is captured, we need to know if
+ // the capture point dominates the particular use we're considering.
+ DominatorTree DT;
+ DT.recalculate(const_cast<Function&>(*CalledFunc));
+
+ // noalias indicates that pointer values based on the argument do not alias
+ // pointer values which are not based on it. So we add a new "scope" for each
+ // noalias function argument. Accesses using pointers based on that argument
+ // become part of that alias scope, accesses using pointers not based on that
+ // argument are tagged as noalias with that scope.
+
+ DenseMap<const Argument *, MDNode *> NewScopes;
+ MDBuilder MDB(CalledFunc->getContext());
+
+ // Create a new scope domain for this function.
+ MDNode *NewDomain =
+ MDB.createAnonymousAliasScopeDomain(CalledFunc->getName());
+ for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
+ const Argument *A = NoAliasArgs[i];
+
+ std::string Name = std::string(CalledFunc->getName());
+ if (A->hasName()) {
+ Name += ": %";
+ Name += A->getName();
+ } else {
+ Name += ": argument ";
+ Name += utostr(i);
+ }
+
+ // Note: We always create a new anonymous root here. This is true regardless
+ // of the linkage of the callee because the aliasing "scope" is not just a
+ // property of the callee, but also all control dependencies in the caller.
+ MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+ NewScopes.insert(std::make_pair(A, NewScope));
if (UseNoAliasIntrinsic) {
// Introduce a llvm.experimental.noalias.scope.decl for the noalias
@@ -988,803 +988,803 @@ static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
// llvm.noalias intrinsic is introduced.
(void)NoAliasDecl;
}
- }
-
- // Iterate over all new instructions in the map; for all memory-access
- // instructions, add the alias scope metadata.
- for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
- VMI != VMIE; ++VMI) {
- if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
- if (!VMI->second)
- continue;
-
- Instruction *NI = dyn_cast<Instruction>(VMI->second);
- if (!NI)
- continue;
-
- bool IsArgMemOnlyCall = false, IsFuncCall = false;
- SmallVector<const Value *, 2> PtrArgs;
-
- if (const LoadInst *LI = dyn_cast<LoadInst>(I))
- PtrArgs.push_back(LI->getPointerOperand());
- else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
- PtrArgs.push_back(SI->getPointerOperand());
- else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
- PtrArgs.push_back(VAAI->getPointerOperand());
- else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
- PtrArgs.push_back(CXI->getPointerOperand());
- else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
- PtrArgs.push_back(RMWI->getPointerOperand());
- else if (const auto *Call = dyn_cast<CallBase>(I)) {
- // If we know that the call does not access memory, then we'll still
- // know that about the inlined clone of this call site, and we don't
- // need to add metadata.
- if (Call->doesNotAccessMemory())
- continue;
-
- IsFuncCall = true;
- if (CalleeAAR) {
- FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call);
- if (AAResults::onlyAccessesArgPointees(MRB))
- IsArgMemOnlyCall = true;
- }
-
- for (Value *Arg : Call->args()) {
- // We need to check the underlying objects of all arguments, not just
- // the pointer arguments, because we might be passing pointers as
- // integers, etc.
- // However, if we know that the call only accesses pointer arguments,
- // then we only need to check the pointer arguments.
- if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
- continue;
-
- PtrArgs.push_back(Arg);
- }
- }
-
- // If we found no pointers, then this instruction is not suitable for
- // pairing with an instruction to receive aliasing metadata.
-      // However, if this is a call, then we might just alias with none of the
- // noalias arguments.
- if (PtrArgs.empty() && !IsFuncCall)
- continue;
-
- // It is possible that there is only one underlying object, but you
- // need to go through several PHIs to see it, and thus could be
- // repeated in the Objects list.
- SmallPtrSet<const Value *, 4> ObjSet;
- SmallVector<Metadata *, 4> Scopes, NoAliases;
-
- SmallSetVector<const Argument *, 4> NAPtrArgs;
- for (const Value *V : PtrArgs) {
- SmallVector<const Value *, 4> Objects;
+ }
+
+ // Iterate over all new instructions in the map; for all memory-access
+ // instructions, add the alias scope metadata.
+ for (ValueToValueMapTy::iterator VMI = VMap.begin(), VMIE = VMap.end();
+ VMI != VMIE; ++VMI) {
+ if (const Instruction *I = dyn_cast<Instruction>(VMI->first)) {
+ if (!VMI->second)
+ continue;
+
+ Instruction *NI = dyn_cast<Instruction>(VMI->second);
+ if (!NI)
+ continue;
+
+ bool IsArgMemOnlyCall = false, IsFuncCall = false;
+ SmallVector<const Value *, 2> PtrArgs;
+
+ if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+ PtrArgs.push_back(LI->getPointerOperand());
+ else if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+ PtrArgs.push_back(SI->getPointerOperand());
+ else if (const VAArgInst *VAAI = dyn_cast<VAArgInst>(I))
+ PtrArgs.push_back(VAAI->getPointerOperand());
+ else if (const AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(I))
+ PtrArgs.push_back(CXI->getPointerOperand());
+ else if (const AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I))
+ PtrArgs.push_back(RMWI->getPointerOperand());
+ else if (const auto *Call = dyn_cast<CallBase>(I)) {
+ // If we know that the call does not access memory, then we'll still
+ // know that about the inlined clone of this call site, and we don't
+ // need to add metadata.
+ if (Call->doesNotAccessMemory())
+ continue;
+
+ IsFuncCall = true;
+ if (CalleeAAR) {
+ FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call);
+ if (AAResults::onlyAccessesArgPointees(MRB))
+ IsArgMemOnlyCall = true;
+ }
+
+ for (Value *Arg : Call->args()) {
+ // We need to check the underlying objects of all arguments, not just
+ // the pointer arguments, because we might be passing pointers as
+ // integers, etc.
+ // However, if we know that the call only accesses pointer arguments,
+ // then we only need to check the pointer arguments.
+ if (IsArgMemOnlyCall && !Arg->getType()->isPointerTy())
+ continue;
+
+ PtrArgs.push_back(Arg);
+ }
+ }
+
+ // If we found no pointers, then this instruction is not suitable for
+ // pairing with an instruction to receive aliasing metadata.
+      // However, if this is a call, then we might just alias with none of the
+ // noalias arguments.
+ if (PtrArgs.empty() && !IsFuncCall)
+ continue;
+
+ // It is possible that there is only one underlying object, but you
+ // need to go through several PHIs to see it, and thus could be
+ // repeated in the Objects list.
+ SmallPtrSet<const Value *, 4> ObjSet;
+ SmallVector<Metadata *, 4> Scopes, NoAliases;
+
+ SmallSetVector<const Argument *, 4> NAPtrArgs;
+ for (const Value *V : PtrArgs) {
+ SmallVector<const Value *, 4> Objects;
getUnderlyingObjects(V, Objects, /* LI = */ nullptr);
-
- for (const Value *O : Objects)
- ObjSet.insert(O);
- }
-
- // Figure out if we're derived from anything that is not a noalias
- // argument.
- bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
- for (const Value *V : ObjSet) {
- // Is this value a constant that cannot be derived from any pointer
- // value (we need to exclude constant expressions, for example, that
- // are formed from arithmetic on global symbols).
- bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
- isa<ConstantPointerNull>(V) ||
- isa<ConstantDataVector>(V) || isa<UndefValue>(V);
- if (IsNonPtrConst)
- continue;
-
- // If this is anything other than a noalias argument, then we cannot
- // completely describe the aliasing properties using alias.scope
- // metadata (and, thus, won't add any).
- if (const Argument *A = dyn_cast<Argument>(V)) {
- if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
- UsesAliasingPtr = true;
- } else {
- UsesAliasingPtr = true;
- }
-
- // If this is not some identified function-local object (which cannot
- // directly alias a noalias argument), or some other argument (which,
- // by definition, also cannot alias a noalias argument), then we could
-        // alias a noalias argument that has been captured.
- if (!isa<Argument>(V) &&
- !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
- CanDeriveViaCapture = true;
- }
-
- // A function call can always get captured noalias pointers (via other
- // parameters, globals, etc.).
- if (IsFuncCall && !IsArgMemOnlyCall)
- CanDeriveViaCapture = true;
-
- // First, we want to figure out all of the sets with which we definitely
-      // don't alias. Iterate over all noalias sets, and add those for which:
- // 1. The noalias argument is not in the set of objects from which we
- // definitely derive.
- // 2. The noalias argument has not yet been captured.
- // An arbitrary function that might load pointers could see captured
- // noalias arguments via other noalias arguments or globals, and so we
- // must always check for prior capture.
- for (const Argument *A : NoAliasArgs) {
- if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
- // It might be tempting to skip the
- // PointerMayBeCapturedBefore check if
- // A->hasNoCaptureAttr() is true, but this is
- // incorrect because nocapture only guarantees
- // that no copies outlive the function, not
- // that the value cannot be locally captured.
- !PointerMayBeCapturedBefore(A,
- /* ReturnCaptures */ false,
- /* StoreCaptures */ false, I, &DT)))
- NoAliases.push_back(NewScopes[A]);
- }
-
- if (!NoAliases.empty())
- NI->setMetadata(LLVMContext::MD_noalias,
- MDNode::concatenate(
- NI->getMetadata(LLVMContext::MD_noalias),
- MDNode::get(CalledFunc->getContext(), NoAliases)));
-
- // Next, we want to figure out all of the sets to which we might belong.
- // We might belong to a set if the noalias argument is in the set of
- // underlying objects. If there is some non-noalias argument in our list
- // of underlying objects, then we cannot add a scope because the fact
- // that some access does not alias with any set of our noalias arguments
- // cannot itself guarantee that it does not alias with this access
- // (because there is some pointer of unknown origin involved and the
- // other access might also depend on this pointer). We also cannot add
- // scopes to arbitrary functions unless we know they don't access any
- // non-parameter pointer-values.
- bool CanAddScopes = !UsesAliasingPtr;
- if (CanAddScopes && IsFuncCall)
- CanAddScopes = IsArgMemOnlyCall;
-
- if (CanAddScopes)
- for (const Argument *A : NoAliasArgs) {
- if (ObjSet.count(A))
- Scopes.push_back(NewScopes[A]);
- }
-
- if (!Scopes.empty())
- NI->setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(CalledFunc->getContext(), Scopes)));
- }
- }
-}
-
-static bool MayContainThrowingOrExitingCall(Instruction *Begin,
- Instruction *End) {
-
- assert(Begin->getParent() == End->getParent() &&
- "Expected to be in same basic block!");
- unsigned NumInstChecked = 0;
- // Check that all instructions in the range [Begin, End) are guaranteed to
- // transfer execution to successor.
- for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
- if (NumInstChecked++ > InlinerAttributeWindow ||
- !isGuaranteedToTransferExecutionToSuccessor(&I))
- return true;
- return false;
-}
-
-static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
-
- AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
- if (AB.empty())
- return AB;
- AttrBuilder Valid;
- // Only allow these white listed attributes to be propagated back to the
- // callee. This is because other attributes may only be valid on the call
- // itself, i.e. attributes such as signext and zeroext.
- if (auto DerefBytes = AB.getDereferenceableBytes())
- Valid.addDereferenceableAttr(DerefBytes);
- if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes())
- Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
- if (AB.contains(Attribute::NoAlias))
- Valid.addAttribute(Attribute::NoAlias);
- if (AB.contains(Attribute::NonNull))
- Valid.addAttribute(Attribute::NonNull);
- return Valid;
-}
-
-static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
- if (!UpdateReturnAttributes)
- return;
-
- AttrBuilder Valid = IdentifyValidAttributes(CB);
- if (Valid.empty())
- return;
- auto *CalledFunction = CB.getCalledFunction();
- auto &Context = CalledFunction->getContext();
-
- for (auto &BB : *CalledFunction) {
- auto *RI = dyn_cast<ReturnInst>(BB.getTerminator());
- if (!RI || !isa<CallBase>(RI->getOperand(0)))
- continue;
- auto *RetVal = cast<CallBase>(RI->getOperand(0));
- // Sanity check that the cloned RetVal exists and is a call, otherwise we
- // cannot add the attributes on the cloned RetVal.
- // Simplification during inlining could have transformed the cloned
- // instruction.
- auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
- if (!NewRetVal)
- continue;
- // Backward propagation of attributes to the returned value may be incorrect
- // if it is control flow dependent.
- // Consider:
- // @callee {
- // %rv = call @foo()
- // %rv2 = call @bar()
- // if (%rv2 != null)
- // return %rv2
- // if (%rv == null)
- // exit()
- // return %rv
- // }
- // caller() {
- // %val = call nonnull @callee()
- // }
- // Here we cannot add the nonnull attribute on either foo or bar. So, we
-    // limit the check to the case where RetVal and RI are in the same basic
-    // block and there are no throwing/exiting instructions between them.
- if (RI->getParent() != RetVal->getParent() ||
- MayContainThrowingOrExitingCall(RetVal, RI))
- continue;
- // Add to the existing attributes of NewRetVal, i.e. the cloned call
- // instruction.
- // NB! When we have the same attribute already existing on NewRetVal, but
- // with a differing value, the AttributeList's merge API honours the already
- // existing attribute value (i.e. attributes such as dereferenceable,
- // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
- AttributeList AL = NewRetVal->getAttributes();
- AttributeList NewAL =
- AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
- NewRetVal->setAttributes(NewAL);
- }
-}
-
-/// If the inlined function has non-byval align arguments, then
-/// add @llvm.assume-based alignment assumptions to preserve this information.
-static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) {
- if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
- return;
-
- AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller());
- auto &DL = CB.getCaller()->getParent()->getDataLayout();
-
- // To avoid inserting redundant assumptions, we should check for assumptions
- // already in the caller. To do this, we might need a DT of the caller.
- DominatorTree DT;
- bool DTCalculated = false;
-
- Function *CalledFunc = CB.getCalledFunction();
- for (Argument &Arg : CalledFunc->args()) {
- unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+
+ for (const Value *O : Objects)
+ ObjSet.insert(O);
+ }
+
+ // Figure out if we're derived from anything that is not a noalias
+ // argument.
+ bool CanDeriveViaCapture = false, UsesAliasingPtr = false;
+ for (const Value *V : ObjSet) {
+ // Is this value a constant that cannot be derived from any pointer
+ // value (we need to exclude constant expressions, for example, that
+ // are formed from arithmetic on global symbols).
+ bool IsNonPtrConst = isa<ConstantInt>(V) || isa<ConstantFP>(V) ||
+ isa<ConstantPointerNull>(V) ||
+ isa<ConstantDataVector>(V) || isa<UndefValue>(V);
+ if (IsNonPtrConst)
+ continue;
+
+ // If this is anything other than a noalias argument, then we cannot
+ // completely describe the aliasing properties using alias.scope
+ // metadata (and, thus, won't add any).
+ if (const Argument *A = dyn_cast<Argument>(V)) {
+ if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
+ UsesAliasingPtr = true;
+ } else {
+ UsesAliasingPtr = true;
+ }
+
+ // If this is not some identified function-local object (which cannot
+ // directly alias a noalias argument), or some other argument (which,
+ // by definition, also cannot alias a noalias argument), then we could
+        // alias a noalias argument that has been captured.
+ if (!isa<Argument>(V) &&
+ !isIdentifiedFunctionLocal(const_cast<Value*>(V)))
+ CanDeriveViaCapture = true;
+ }
+
+ // A function call can always get captured noalias pointers (via other
+ // parameters, globals, etc.).
+ if (IsFuncCall && !IsArgMemOnlyCall)
+ CanDeriveViaCapture = true;
+
+ // First, we want to figure out all of the sets with which we definitely
+      // don't alias. Iterate over all noalias sets, and add those for which:
+ // 1. The noalias argument is not in the set of objects from which we
+ // definitely derive.
+ // 2. The noalias argument has not yet been captured.
+ // An arbitrary function that might load pointers could see captured
+ // noalias arguments via other noalias arguments or globals, and so we
+ // must always check for prior capture.
+ for (const Argument *A : NoAliasArgs) {
+ if (!ObjSet.count(A) && (!CanDeriveViaCapture ||
+ // It might be tempting to skip the
+ // PointerMayBeCapturedBefore check if
+ // A->hasNoCaptureAttr() is true, but this is
+ // incorrect because nocapture only guarantees
+ // that no copies outlive the function, not
+ // that the value cannot be locally captured.
+ !PointerMayBeCapturedBefore(A,
+ /* ReturnCaptures */ false,
+ /* StoreCaptures */ false, I, &DT)))
+ NoAliases.push_back(NewScopes[A]);
+ }
+
+ if (!NoAliases.empty())
+ NI->setMetadata(LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ NI->getMetadata(LLVMContext::MD_noalias),
+ MDNode::get(CalledFunc->getContext(), NoAliases)));
+
+ // Next, we want to figure out all of the sets to which we might belong.
+ // We might belong to a set if the noalias argument is in the set of
+ // underlying objects. If there is some non-noalias argument in our list
+ // of underlying objects, then we cannot add a scope because the fact
+ // that some access does not alias with any set of our noalias arguments
+ // cannot itself guarantee that it does not alias with this access
+ // (because there is some pointer of unknown origin involved and the
+ // other access might also depend on this pointer). We also cannot add
+ // scopes to arbitrary functions unless we know they don't access any
+ // non-parameter pointer-values.
+ bool CanAddScopes = !UsesAliasingPtr;
+ if (CanAddScopes && IsFuncCall)
+ CanAddScopes = IsArgMemOnlyCall;
+
+ if (CanAddScopes)
+ for (const Argument *A : NoAliasArgs) {
+ if (ObjSet.count(A))
+ Scopes.push_back(NewScopes[A]);
+ }
+
+ if (!Scopes.empty())
+ NI->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(NI->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(CalledFunc->getContext(), Scopes)));
+ }
+ }
+}
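In isolation, the MDBuilder calls used above look like the following sketch; Ctx stands for the callee's LLVMContext and the names are placeholders:

    MDBuilder MDB(Ctx);
    MDNode *Domain = MDB.createAnonymousAliasScopeDomain("callee");
    MDNode *ScopeForA = MDB.createAnonymousAliasScope(Domain, "callee: %a");
    // Accesses derived only from the noalias argument %a get tagged with
    // !alias.scope ScopeForA; accesses provably not derived from %a get
    // tagged with !noalias ScopeForA.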
+
+static bool MayContainThrowingOrExitingCall(Instruction *Begin,
+ Instruction *End) {
+
+ assert(Begin->getParent() == End->getParent() &&
+ "Expected to be in same basic block!");
+ unsigned NumInstChecked = 0;
+ // Check that all instructions in the range [Begin, End) are guaranteed to
+ // transfer execution to successor.
+ for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
+ if (NumInstChecked++ > InlinerAttributeWindow ||
+ !isGuaranteedToTransferExecutionToSuccessor(&I))
+ return true;
+ return false;
+}
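+
+// A minimal sketch of what this window check guards against when it is used
+// by AddReturnAttributes below, assuming a callee body such as:
+//   %rv = call i8* @foo()
+//   call void @exit_if_null(i8* %rv)   ; may exit or unwind
+//   ret i8* %rv
+// A nonnull guarantee on the call site only covers values that actually reach
+// the caller; the intervening call may filter out null, so the attribute must
+// not be copied back onto %rv. Windows like this make the function return true.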
+
+static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
+
+ AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
+ if (AB.empty())
+ return AB;
+ AttrBuilder Valid;
+  // Only allow these whitelisted attributes to be propagated back to the
+ // callee. This is because other attributes may only be valid on the call
+ // itself, i.e. attributes such as signext and zeroext.
+ if (auto DerefBytes = AB.getDereferenceableBytes())
+ Valid.addDereferenceableAttr(DerefBytes);
+ if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes())
+ Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
+ if (AB.contains(Attribute::NoAlias))
+ Valid.addAttribute(Attribute::NoAlias);
+ if (AB.contains(Attribute::NonNull))
+ Valid.addAttribute(Attribute::NonNull);
+ return Valid;
+}
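+
+// For example (illustrative only): for a call site such as
+//   %r = call noalias nonnull dereferenceable(16) i8* @callee()
+// the builder returned here carries noalias, nonnull and dereferenceable(16),
+// whereas attributes like signext or zeroext on an integer-returning call
+// site would be dropped, since they only describe the call itself.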
+
+static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
+ if (!UpdateReturnAttributes)
+ return;
+
+ AttrBuilder Valid = IdentifyValidAttributes(CB);
+ if (Valid.empty())
+ return;
+ auto *CalledFunction = CB.getCalledFunction();
+ auto &Context = CalledFunction->getContext();
+
+ for (auto &BB : *CalledFunction) {
+ auto *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI || !isa<CallBase>(RI->getOperand(0)))
+ continue;
+ auto *RetVal = cast<CallBase>(RI->getOperand(0));
+    // Sanity check that the cloned RetVal exists and is a call; otherwise we
+ // cannot add the attributes on the cloned RetVal.
+ // Simplification during inlining could have transformed the cloned
+ // instruction.
+ auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
+ if (!NewRetVal)
+ continue;
+ // Backward propagation of attributes to the returned value may be incorrect
+ // if it is control flow dependent.
+ // Consider:
+ // @callee {
+ // %rv = call @foo()
+ // %rv2 = call @bar()
+ // if (%rv2 != null)
+ // return %rv2
+ // if (%rv == null)
+ // exit()
+ // return %rv
+ // }
+ // caller() {
+ // %val = call nonnull @callee()
+ // }
+    // Here we cannot add the nonnull attribute on either foo or bar. So, we
+    // only add the attributes when RetVal and RI are in the same basic block
+    // and there are no throwing/exiting instructions between them.
+ if (RI->getParent() != RetVal->getParent() ||
+ MayContainThrowingOrExitingCall(RetVal, RI))
+ continue;
+ // Add to the existing attributes of NewRetVal, i.e. the cloned call
+ // instruction.
+ // NB! When we have the same attribute already existing on NewRetVal, but
+ // with a differing value, the AttributeList's merge API honours the already
+ // existing attribute value (i.e. attributes such as dereferenceable,
+ // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
+ AttributeList AL = NewRetVal->getAttributes();
+ AttributeList NewAL =
+ AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
+ NewRetVal->setAttributes(NewAL);
+ }
+}
+
+/// If the inlined function has non-byval align arguments, then
+/// add @llvm.assume-based alignment assumptions to preserve this information.
+static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) {
+ if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
+ return;
+
+ AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller());
+ auto &DL = CB.getCaller()->getParent()->getDataLayout();
+
+ // To avoid inserting redundant assumptions, we should check for assumptions
+ // already in the caller. To do this, we might need a DT of the caller.
+ DominatorTree DT;
+ bool DTCalculated = false;
+
+ Function *CalledFunc = CB.getCalledFunction();
+ for (Argument &Arg : CalledFunc->args()) {
+ unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
if (Align && !Arg.hasPassPointeeByValueCopyAttr() && !Arg.hasNUses(0)) {
- if (!DTCalculated) {
- DT.recalculate(*CB.getCaller());
- DTCalculated = true;
- }
-
- // If we can already prove the asserted alignment in the context of the
- // caller, then don't bother inserting the assumption.
- Value *ArgVal = CB.getArgOperand(Arg.getArgNo());
- if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align)
- continue;
-
- CallInst *NewAsmp =
- IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align);
- AC->registerAssumption(NewAsmp);
- }
- }
-}
-
-/// Once we have cloned code over from a callee into the caller,
-/// update the specified callgraph to reflect the changes we made.
-/// Note that it's possible that not all code was copied over, so only
-/// some edges of the callgraph may remain.
-static void UpdateCallGraphAfterInlining(CallBase &CB,
- Function::iterator FirstNewBlock,
- ValueToValueMapTy &VMap,
- InlineFunctionInfo &IFI) {
- CallGraph &CG = *IFI.CG;
- const Function *Caller = CB.getCaller();
- const Function *Callee = CB.getCalledFunction();
- CallGraphNode *CalleeNode = CG[Callee];
- CallGraphNode *CallerNode = CG[Caller];
-
- // Since we inlined some uninlined call sites in the callee into the caller,
- // add edges from the caller to all of the callees of the callee.
- CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
-
- // Consider the case where CalleeNode == CallerNode.
- CallGraphNode::CalledFunctionsVector CallCache;
- if (CalleeNode == CallerNode) {
- CallCache.assign(I, E);
- I = CallCache.begin();
- E = CallCache.end();
- }
-
- for (; I != E; ++I) {
-    // Skip 'reference' call records.
- if (!I->first)
- continue;
-
- const Value *OrigCall = *I->first;
-
- ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
- // Only copy the edge if the call was inlined!
- if (VMI == VMap.end() || VMI->second == nullptr)
- continue;
-
- // If the call was inlined, but then constant folded, there is no edge to
- // add. Check for this case.
- auto *NewCall = dyn_cast<CallBase>(VMI->second);
- if (!NewCall)
- continue;
-
- // We do not treat intrinsic calls like real function calls because we
- // expect them to become inline code; do not add an edge for an intrinsic.
- if (NewCall->getCalledFunction() &&
- NewCall->getCalledFunction()->isIntrinsic())
- continue;
-
- // Remember that this call site got inlined for the client of
- // InlineFunction.
- IFI.InlinedCalls.push_back(NewCall);
-
- // It's possible that inlining the callsite will cause it to go from an
- // indirect to a direct call by resolving a function pointer. If this
- // happens, set the callee of the new call site to a more precise
- // destination. This can also happen if the call graph node of the caller
- // was just unnecessarily imprecise.
- if (!I->second->getFunction())
- if (Function *F = NewCall->getCalledFunction()) {
- // Indirect call site resolved to direct call.
- CallerNode->addCalledFunction(NewCall, CG[F]);
-
- continue;
- }
-
- CallerNode->addCalledFunction(NewCall, I->second);
- }
-
- // Update the call graph by deleting the edge from Callee to Caller. We must
- // do this after the loop above in case Caller and Callee are the same.
- CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
-}
-
-static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
- BasicBlock *InsertBlock,
- InlineFunctionInfo &IFI) {
- Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
- IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
-
- Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
-
- // Always generate a memcpy of alignment 1 here because we don't know
- // the alignment of the src pointer. Other optimizations can infer
- // better alignment.
- Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src,
- /*SrcAlign*/ Align(1), Size);
-}
-
-/// When inlining a call site that has a byval argument,
-/// we have to make the implicit memcpy explicit by adding it.
-static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
- const Function *CalledFunc,
- InlineFunctionInfo &IFI,
- unsigned ByValAlignment) {
- PointerType *ArgTy = cast<PointerType>(Arg->getType());
- Type *AggTy = ArgTy->getElementType();
-
- Function *Caller = TheCall->getFunction();
- const DataLayout &DL = Caller->getParent()->getDataLayout();
-
- // If the called function is readonly, then it could not mutate the caller's
- // copy of the byval'd memory. In this case, it is safe to elide the copy and
- // temporary.
- if (CalledFunc->onlyReadsMemory()) {
- // If the byval argument has a specified alignment that is greater than the
- // passed in pointer, then we either have to round up the input pointer or
- // give up on this transformation.
- if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
- return Arg;
-
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
-
- // If the pointer is already known to be sufficiently aligned, or if we can
- // round it up to a larger alignment, then we don't need a temporary.
- if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall,
- AC) >= ByValAlignment)
- return Arg;
-
- // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
- // for code quality, but rarely happens and is required for correctness.
- }
-
- // Create the alloca. If we have DataLayout, use nice alignment.
- Align Alignment(DL.getPrefTypeAlignment(AggTy));
-
- // If the byval had an alignment specified, we *must* use at least that
- // alignment, as it is required by the byval argument (and uses of the
- // pointer inside the callee).
- Alignment = max(Alignment, MaybeAlign(ByValAlignment));
-
- Value *NewAlloca =
- new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
- Arg->getName(), &*Caller->begin()->begin());
- IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
-
- // Uses of the argument in the function should use our new alloca
- // instead.
- return NewAlloca;
-}
-
-// Check whether this Value is used by a lifetime intrinsic.
-static bool isUsedByLifetimeMarker(Value *V) {
- for (User *U : V->users())
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U))
- if (II->isLifetimeStartOrEnd())
- return true;
- return false;
-}
-
-// Check whether the given alloca already has
-// lifetime.start or lifetime.end intrinsics.
-static bool hasLifetimeMarkers(AllocaInst *AI) {
- Type *Ty = AI->getType();
- Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(),
- Ty->getPointerAddressSpace());
- if (Ty == Int8PtrTy)
- return isUsedByLifetimeMarker(AI);
-
- // Do a scan to find all the casts to i8*.
- for (User *U : AI->users()) {
- if (U->getType() != Int8PtrTy) continue;
- if (U->stripPointerCasts() != AI) continue;
- if (isUsedByLifetimeMarker(U))
- return true;
- }
- return false;
-}
-
-/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
-/// block. Allocas used in inalloca calls and allocas of dynamic array size
-/// cannot be static.
-static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) {
- return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca();
-}
-
-/// Returns a DebugLoc for a new DILocation which is a clone of \p OrigDL
-/// inlined at \p InlinedAt. \p IANodes is an inlined-at cache.
-static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
- LLVMContext &Ctx,
- DenseMap<const MDNode *, MDNode *> &IANodes) {
- auto IA = DebugLoc::appendInlinedAt(OrigDL, InlinedAt, Ctx, IANodes);
+ if (!DTCalculated) {
+ DT.recalculate(*CB.getCaller());
+ DTCalculated = true;
+ }
+
+ // If we can already prove the asserted alignment in the context of the
+ // caller, then don't bother inserting the assumption.
+ Value *ArgVal = CB.getArgOperand(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align)
+ continue;
+
+ CallInst *NewAsmp =
+ IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align);
+ AC->registerAssumption(NewAsmp);
+ }
+ }
+}
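+
+// Illustrative sketch, assuming alignment assumptions are being preserved
+// (see the early return above): for a callee declared as
+//   define void @callee(i32* align 32 %p) { ... }
+// inlining a call @callee(i32* %q) emits an @llvm.assume-based alignment
+// assumption about %q in the caller, unless getKnownAlignment can already
+// prove that %q is 32-byte aligned at the call site.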
+
+/// Once we have cloned code over from a callee into the caller,
+/// update the specified callgraph to reflect the changes we made.
+/// Note that it's possible that not all code was copied over, so only
+/// some edges of the callgraph may remain.
+static void UpdateCallGraphAfterInlining(CallBase &CB,
+ Function::iterator FirstNewBlock,
+ ValueToValueMapTy &VMap,
+ InlineFunctionInfo &IFI) {
+ CallGraph &CG = *IFI.CG;
+ const Function *Caller = CB.getCaller();
+ const Function *Callee = CB.getCalledFunction();
+ CallGraphNode *CalleeNode = CG[Callee];
+ CallGraphNode *CallerNode = CG[Caller];
+
+ // Since we inlined some uninlined call sites in the callee into the caller,
+ // add edges from the caller to all of the callees of the callee.
+ CallGraphNode::iterator I = CalleeNode->begin(), E = CalleeNode->end();
+
+ // Consider the case where CalleeNode == CallerNode.
+ CallGraphNode::CalledFunctionsVector CallCache;
+ if (CalleeNode == CallerNode) {
+ CallCache.assign(I, E);
+ I = CallCache.begin();
+ E = CallCache.end();
+ }
+
+ for (; I != E; ++I) {
+    // Skip 'reference' call records.
+ if (!I->first)
+ continue;
+
+ const Value *OrigCall = *I->first;
+
+ ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
+ // Only copy the edge if the call was inlined!
+ if (VMI == VMap.end() || VMI->second == nullptr)
+ continue;
+
+ // If the call was inlined, but then constant folded, there is no edge to
+ // add. Check for this case.
+ auto *NewCall = dyn_cast<CallBase>(VMI->second);
+ if (!NewCall)
+ continue;
+
+ // We do not treat intrinsic calls like real function calls because we
+ // expect them to become inline code; do not add an edge for an intrinsic.
+ if (NewCall->getCalledFunction() &&
+ NewCall->getCalledFunction()->isIntrinsic())
+ continue;
+
+ // Remember that this call site got inlined for the client of
+ // InlineFunction.
+ IFI.InlinedCalls.push_back(NewCall);
+
+ // It's possible that inlining the callsite will cause it to go from an
+ // indirect to a direct call by resolving a function pointer. If this
+ // happens, set the callee of the new call site to a more precise
+ // destination. This can also happen if the call graph node of the caller
+ // was just unnecessarily imprecise.
+ if (!I->second->getFunction())
+ if (Function *F = NewCall->getCalledFunction()) {
+ // Indirect call site resolved to direct call.
+ CallerNode->addCalledFunction(NewCall, CG[F]);
+
+ continue;
+ }
+
+ CallerNode->addCalledFunction(NewCall, I->second);
+ }
+
+ // Update the call graph by deleting the edge from Callee to Caller. We must
+ // do this after the loop above in case Caller and Callee are the same.
+ CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
+}
+
+static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
+ BasicBlock *InsertBlock,
+ InlineFunctionInfo &IFI) {
+ Type *AggTy = cast<PointerType>(Src->getType())->getElementType();
+ IRBuilder<> Builder(InsertBlock, InsertBlock->begin());
+
+ Value *Size = Builder.getInt64(M->getDataLayout().getTypeStoreSize(AggTy));
+
+ // Always generate a memcpy of alignment 1 here because we don't know
+ // the alignment of the src pointer. Other optimizations can infer
+ // better alignment.
+ Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src,
+ /*SrcAlign*/ Align(1), Size);
+}
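+
+// Roughly, for a byval struct argument this emits (illustrative IR only):
+//   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %dst.i8,
+//                                        i8* align 1 %src.i8,
+//                                        i64 <store size>, i1 false)
+// at the top of InsertBlock, with both alignments deliberately left at 1 so
+// that later passes can infer something better, as noted above.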
+
+/// When inlining a call site that has a byval argument,
+/// we have to make the implicit memcpy explicit by adding it.
+static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
+ const Function *CalledFunc,
+ InlineFunctionInfo &IFI,
+ unsigned ByValAlignment) {
+ PointerType *ArgTy = cast<PointerType>(Arg->getType());
+ Type *AggTy = ArgTy->getElementType();
+
+ Function *Caller = TheCall->getFunction();
+ const DataLayout &DL = Caller->getParent()->getDataLayout();
+
+ // If the called function is readonly, then it could not mutate the caller's
+ // copy of the byval'd memory. In this case, it is safe to elide the copy and
+ // temporary.
+ if (CalledFunc->onlyReadsMemory()) {
+ // If the byval argument has a specified alignment that is greater than the
+ // passed in pointer, then we either have to round up the input pointer or
+ // give up on this transformation.
+ if (ByValAlignment <= 1) // 0 = unspecified, 1 = no particular alignment.
+ return Arg;
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+
+ // If the pointer is already known to be sufficiently aligned, or if we can
+ // round it up to a larger alignment, then we don't need a temporary.
+ if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall,
+ AC) >= ByValAlignment)
+ return Arg;
+
+ // Otherwise, we have to make a memcpy to get a safe alignment. This is bad
+ // for code quality, but rarely happens and is required for correctness.
+ }
+
+ // Create the alloca. If we have DataLayout, use nice alignment.
+ Align Alignment(DL.getPrefTypeAlignment(AggTy));
+
+ // If the byval had an alignment specified, we *must* use at least that
+ // alignment, as it is required by the byval argument (and uses of the
+ // pointer inside the callee).
+ Alignment = max(Alignment, MaybeAlign(ByValAlignment));
+
+ Value *NewAlloca =
+ new AllocaInst(AggTy, DL.getAllocaAddrSpace(), nullptr, Alignment,
+ Arg->getName(), &*Caller->begin()->begin());
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
+
+ // Uses of the argument in the function should use our new alloca
+ // instead.
+ return NewAlloca;
+}
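+
+// In short: if the callee only reads memory and the incoming pointer is (or
+// can be made) sufficiently aligned, the original pointer is reused and no
+// copy is made; otherwise a static alloca is created in the caller's entry
+// block and the actual memcpy into it is emitted later through
+// HandleByValArgumentInit above. For example, a readonly callee taking
+//   %struct.S* byval(%struct.S) align 8 %s
+// needs no temporary when the passed pointer is already known to be 8-byte
+// aligned.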
+
+// Check whether this Value is used by a lifetime intrinsic.
+static bool isUsedByLifetimeMarker(Value *V) {
+ for (User *U : V->users())
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U))
+ if (II->isLifetimeStartOrEnd())
+ return true;
+ return false;
+}
+
+// Check whether the given alloca already has
+// lifetime.start or lifetime.end intrinsics.
+static bool hasLifetimeMarkers(AllocaInst *AI) {
+ Type *Ty = AI->getType();
+ Type *Int8PtrTy = Type::getInt8PtrTy(Ty->getContext(),
+ Ty->getPointerAddressSpace());
+ if (Ty == Int8PtrTy)
+ return isUsedByLifetimeMarker(AI);
+
+ // Do a scan to find all the casts to i8*.
+ for (User *U : AI->users()) {
+ if (U->getType() != Int8PtrTy) continue;
+ if (U->stripPointerCasts() != AI) continue;
+ if (isUsedByLifetimeMarker(U))
+ return true;
+ }
+ return false;
+}
+
+/// Return the result of AI->isStaticAlloca() if AI were moved to the entry
+/// block. Allocas used in inalloca calls and allocas of dynamic array size
+/// cannot be static.
+static bool allocaWouldBeStaticInEntry(const AllocaInst *AI) {
+ return isa<Constant>(AI->getArraySize()) && !AI->isUsedWithInAlloca();
+}
+
+/// Returns a DebugLoc for a new DILocation which is a clone of \p OrigDL
+/// inlined at \p InlinedAt. \p IANodes is an inlined-at cache.
+static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
+ LLVMContext &Ctx,
+ DenseMap<const MDNode *, MDNode *> &IANodes) {
+ auto IA = DebugLoc::appendInlinedAt(OrigDL, InlinedAt, Ctx, IANodes);
return DILocation::get(Ctx, OrigDL.getLine(), OrigDL.getCol(),
OrigDL.getScope(), IA);
-}
-
-/// Update inlined instructions' line numbers to encode the location where
-/// these instructions are inlined.
-static void fixupLineNumbers(Function *Fn, Function::iterator FI,
- Instruction *TheCall, bool CalleeHasDebugInfo) {
- const DebugLoc &TheCallDL = TheCall->getDebugLoc();
- if (!TheCallDL)
- return;
-
- auto &Ctx = Fn->getContext();
- DILocation *InlinedAtNode = TheCallDL;
-
- // Create a unique call site, not to be confused with any other call from the
- // same location.
- InlinedAtNode = DILocation::getDistinct(
- Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
- InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
-
-  // Cache the inlined-at nodes as they're built so they are reused; without
-  // this, every instruction's inlined-at chain would become distinct from the
-  // others.
- DenseMap<const MDNode *, MDNode *> IANodes;
-
- // Check if we are not generating inline line tables and want to use
- // the call site location instead.
- bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
-
- for (; FI != Fn->end(); ++FI) {
- for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
- BI != BE; ++BI) {
- // Loop metadata needs to be updated so that the start and end locs
- // reference inlined-at locations.
- auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes](
- const DILocation &Loc) -> DILocation * {
- return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get();
- };
- updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc);
-
- if (!NoInlineLineTables)
- if (DebugLoc DL = BI->getDebugLoc()) {
- DebugLoc IDL =
- inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
- BI->setDebugLoc(IDL);
- continue;
- }
-
- if (CalleeHasDebugInfo && !NoInlineLineTables)
- continue;
-
- // If the inlined instruction has no line number, or if inline info
- // is not being generated, make it look as if it originates from the call
- // location. This is important for ((__always_inline, __nodebug__))
- // functions which must use caller location for all instructions in their
- // function body.
-
- // Don't update static allocas, as they may get moved later.
- if (auto *AI = dyn_cast<AllocaInst>(BI))
- if (allocaWouldBeStaticInEntry(AI))
- continue;
-
- BI->setDebugLoc(TheCallDL);
- }
-
- // Remove debug info intrinsics if we're not keeping inline info.
- if (NoInlineLineTables) {
- BasicBlock::iterator BI = FI->begin();
- while (BI != FI->end()) {
- if (isa<DbgInfoIntrinsic>(BI)) {
- BI = BI->eraseFromParent();
- continue;
- }
- ++BI;
- }
- }
-
- }
-}
-
-/// Update the block frequencies of the caller after a callee has been inlined.
-///
-/// Each block cloned into the caller has its block frequency scaled by the
-/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
-/// callee's entry block gets the same frequency as the callsite block and the
-/// relative frequencies of all cloned blocks remain the same after cloning.
-static void updateCallerBFI(BasicBlock *CallSiteBlock,
- const ValueToValueMapTy &VMap,
- BlockFrequencyInfo *CallerBFI,
- BlockFrequencyInfo *CalleeBFI,
- const BasicBlock &CalleeEntryBlock) {
- SmallPtrSet<BasicBlock *, 16> ClonedBBs;
- for (auto Entry : VMap) {
- if (!isa<BasicBlock>(Entry.first) || !Entry.second)
- continue;
- auto *OrigBB = cast<BasicBlock>(Entry.first);
- auto *ClonedBB = cast<BasicBlock>(Entry.second);
- uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
- if (!ClonedBBs.insert(ClonedBB).second) {
- // Multiple blocks in the callee might get mapped to one cloned block in
- // the caller since we prune the callee as we clone it. When that happens,
- // we want to use the maximum among the original blocks' frequencies.
- uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
- if (NewFreq > Freq)
- Freq = NewFreq;
- }
- CallerBFI->setBlockFreq(ClonedBB, Freq);
- }
- BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
- CallerBFI->setBlockFreqAndScale(
- EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
- ClonedBBs);
-}
-
-/// Update the branch metadata for cloned call instructions.
-static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
- const ProfileCount &CalleeEntryCount,
- const CallBase &TheCall, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *CallerBFI) {
- if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
- CalleeEntryCount.getCount() < 1)
- return;
- auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
- int64_t CallCount =
+}
+
+/// Update inlined instructions' line numbers to encode the location where
+/// these instructions are inlined.
+static void fixupLineNumbers(Function *Fn, Function::iterator FI,
+ Instruction *TheCall, bool CalleeHasDebugInfo) {
+ const DebugLoc &TheCallDL = TheCall->getDebugLoc();
+ if (!TheCallDL)
+ return;
+
+ auto &Ctx = Fn->getContext();
+ DILocation *InlinedAtNode = TheCallDL;
+
+ // Create a unique call site, not to be confused with any other call from the
+ // same location.
+ InlinedAtNode = DILocation::getDistinct(
+ Ctx, InlinedAtNode->getLine(), InlinedAtNode->getColumn(),
+ InlinedAtNode->getScope(), InlinedAtNode->getInlinedAt());
+
+  // Cache the inlined-at nodes as they're built so they are reused; without
+  // this, every instruction's inlined-at chain would become distinct from the
+  // others.
+ DenseMap<const MDNode *, MDNode *> IANodes;
+
+ // Check if we are not generating inline line tables and want to use
+ // the call site location instead.
+ bool NoInlineLineTables = Fn->hasFnAttribute("no-inline-line-tables");
+
+ for (; FI != Fn->end(); ++FI) {
+ for (BasicBlock::iterator BI = FI->begin(), BE = FI->end();
+ BI != BE; ++BI) {
+ // Loop metadata needs to be updated so that the start and end locs
+ // reference inlined-at locations.
+ auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes](
+ const DILocation &Loc) -> DILocation * {
+ return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get();
+ };
+ updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc);
+
+ if (!NoInlineLineTables)
+ if (DebugLoc DL = BI->getDebugLoc()) {
+ DebugLoc IDL =
+ inlineDebugLoc(DL, InlinedAtNode, BI->getContext(), IANodes);
+ BI->setDebugLoc(IDL);
+ continue;
+ }
+
+ if (CalleeHasDebugInfo && !NoInlineLineTables)
+ continue;
+
+ // If the inlined instruction has no line number, or if inline info
+ // is not being generated, make it look as if it originates from the call
+ // location. This is important for ((__always_inline, __nodebug__))
+ // functions which must use caller location for all instructions in their
+ // function body.
+
+ // Don't update static allocas, as they may get moved later.
+ if (auto *AI = dyn_cast<AllocaInst>(BI))
+ if (allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ BI->setDebugLoc(TheCallDL);
+ }
+
+ // Remove debug info intrinsics if we're not keeping inline info.
+ if (NoInlineLineTables) {
+ BasicBlock::iterator BI = FI->begin();
+ while (BI != FI->end()) {
+ if (isa<DbgInfoIntrinsic>(BI)) {
+ BI = BI->eraseFromParent();
+ continue;
+ }
+ ++BI;
+ }
+ }
+
+ }
+}
+
+/// Update the block frequencies of the caller after a callee has been inlined.
+///
+/// Each block cloned into the caller has its block frequency scaled by the
+/// ratio of CallSiteFreq/CalleeEntryFreq. This ensures that the cloned copy of
+/// callee's entry block gets the same frequency as the callsite block and the
+/// relative frequencies of all cloned blocks remain the same after cloning.
+static void updateCallerBFI(BasicBlock *CallSiteBlock,
+ const ValueToValueMapTy &VMap,
+ BlockFrequencyInfo *CallerBFI,
+ BlockFrequencyInfo *CalleeBFI,
+ const BasicBlock &CalleeEntryBlock) {
+ SmallPtrSet<BasicBlock *, 16> ClonedBBs;
+ for (auto Entry : VMap) {
+ if (!isa<BasicBlock>(Entry.first) || !Entry.second)
+ continue;
+ auto *OrigBB = cast<BasicBlock>(Entry.first);
+ auto *ClonedBB = cast<BasicBlock>(Entry.second);
+ uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+ if (!ClonedBBs.insert(ClonedBB).second) {
+ // Multiple blocks in the callee might get mapped to one cloned block in
+ // the caller since we prune the callee as we clone it. When that happens,
+ // we want to use the maximum among the original blocks' frequencies.
+ uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+ if (NewFreq > Freq)
+ Freq = NewFreq;
+ }
+ CallerBFI->setBlockFreq(ClonedBB, Freq);
+ }
+ BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
+ CallerBFI->setBlockFreqAndScale(
+ EntryClone, CallerBFI->getBlockFreq(CallSiteBlock).getFrequency(),
+ ClonedBBs);
+}
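+
+// Numeric sketch of the scaling described above: if the callee's entry block
+// has frequency 8, one of its blocks has frequency 4, and the call site block
+// in the caller has frequency 16, the cloned copy of that block ends up at
+// roughly 4 * 16 / 8 = 8 once setBlockFreqAndScale rescales the clones.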
+
+/// Update the branch metadata for cloned call instructions.
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+ const ProfileCount &CalleeEntryCount,
+ const CallBase &TheCall, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *CallerBFI) {
+ if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
+ CalleeEntryCount.getCount() < 1)
+ return;
+ auto CallSiteCount = PSI ? PSI->getProfileCount(TheCall, CallerBFI) : None;
+ int64_t CallCount =
std::min(CallSiteCount.getValueOr(0), CalleeEntryCount.getCount());
- updateProfileCallee(Callee, -CallCount, &VMap);
-}
-
-void llvm::updateProfileCallee(
- Function *Callee, int64_t entryDelta,
- const ValueMap<const Value *, WeakTrackingVH> *VMap) {
- auto CalleeCount = Callee->getEntryCount();
- if (!CalleeCount.hasValue())
- return;
-
- uint64_t priorEntryCount = CalleeCount.getCount();
- uint64_t newEntryCount;
-
- // Since CallSiteCount is an estimate, it could exceed the original callee
-  // count, so clamp the new count to 0 to guard against underflow.
- if (entryDelta < 0 && static_cast<uint64_t>(-entryDelta) > priorEntryCount)
- newEntryCount = 0;
- else
- newEntryCount = priorEntryCount + entryDelta;
-
-  // During inlining?
- if (VMap) {
- uint64_t cloneEntryCount = priorEntryCount - newEntryCount;
- for (auto Entry : *VMap)
- if (isa<CallInst>(Entry.first))
- if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
- CI->updateProfWeight(cloneEntryCount, priorEntryCount);
- }
-
- if (entryDelta) {
- Callee->setEntryCount(newEntryCount);
-
- for (BasicBlock &BB : *Callee)
- // No need to update the callsite if it is pruned during inlining.
- if (!VMap || VMap->count(&BB))
- for (Instruction &I : BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- CI->updateProfWeight(newEntryCount, priorEntryCount);
- }
-}
-
-/// This function inlines the called function into the basic block of the
-/// caller. This returns false if it is not possible to inline this call.
-/// The program is still in a well defined state if this occurs though.
-///
-/// Note that this only does one level of inlining. For example, if the
-/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
-/// exists in the instruction stream. Similarly this will inline a recursive
-/// function by one level.
-llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
- AAResults *CalleeAAR,
- bool InsertLifetime,
- Function *ForwardVarArgsTo) {
- assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
-
- // FIXME: we don't inline callbr yet.
- if (isa<CallBrInst>(CB))
- return InlineResult::failure("We don't inline callbr yet.");
-
- // If IFI has any state in it, zap it before we fill it in.
- IFI.reset();
-
- Function *CalledFunc = CB.getCalledFunction();
- if (!CalledFunc || // Can't inline external function or indirect
- CalledFunc->isDeclaration()) // call!
- return InlineResult::failure("external or indirect");
-
- // The inliner does not know how to inline through calls with operand bundles
- // in general ...
- if (CB.hasOperandBundles()) {
- for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
- uint32_t Tag = CB.getOperandBundleAt(i).getTagID();
- // ... but it knows how to inline through "deopt" operand bundles ...
- if (Tag == LLVMContext::OB_deopt)
- continue;
- // ... and "funclet" operand bundles.
- if (Tag == LLVMContext::OB_funclet)
- continue;
-
- return InlineResult::failure("unsupported operand bundle");
- }
- }
-
- // If the call to the callee cannot throw, set the 'nounwind' flag on any
- // calls that we inline.
- bool MarkNoUnwind = CB.doesNotThrow();
-
- BasicBlock *OrigBB = CB.getParent();
- Function *Caller = OrigBB->getParent();
-
- // GC poses two hazards to inlining, which only occur when the callee has GC:
- // 1. If the caller has no GC, then the callee's GC must be propagated to the
- // caller.
- // 2. If the caller has a differing GC, it is invalid to inline.
- if (CalledFunc->hasGC()) {
- if (!Caller->hasGC())
- Caller->setGC(CalledFunc->getGC());
- else if (CalledFunc->getGC() != Caller->getGC())
- return InlineResult::failure("incompatible GC");
- }
-
- // Get the personality function from the callee if it contains a landing pad.
- Constant *CalledPersonality =
- CalledFunc->hasPersonalityFn()
- ? CalledFunc->getPersonalityFn()->stripPointerCasts()
- : nullptr;
-
- // Find the personality function used by the landing pads of the caller. If it
- // exists, then check to see that it matches the personality function used in
- // the callee.
- Constant *CallerPersonality =
- Caller->hasPersonalityFn()
- ? Caller->getPersonalityFn()->stripPointerCasts()
- : nullptr;
- if (CalledPersonality) {
- if (!CallerPersonality)
- Caller->setPersonalityFn(CalledPersonality);
- // If the personality functions match, then we can perform the
- // inlining. Otherwise, we can't inline.
- // TODO: This isn't 100% true. Some personality functions are proper
- // supersets of others and can be used in place of the other.
- else if (CalledPersonality != CallerPersonality)
- return InlineResult::failure("incompatible personality");
- }
-
- // We need to figure out which funclet the callsite was in so that we may
- // properly nest the callee.
- Instruction *CallSiteEHPad = nullptr;
- if (CallerPersonality) {
- EHPersonality Personality = classifyEHPersonality(CallerPersonality);
- if (isScopedEHPersonality(Personality)) {
- Optional<OperandBundleUse> ParentFunclet =
- CB.getOperandBundle(LLVMContext::OB_funclet);
- if (ParentFunclet)
- CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
-
- // OK, the inlining site is legal. What about the target function?
-
- if (CallSiteEHPad) {
- if (Personality == EHPersonality::MSVC_CXX) {
- // The MSVC personality cannot tolerate catches getting inlined into
- // cleanup funclets.
- if (isa<CleanupPadInst>(CallSiteEHPad)) {
- // Ok, the call site is within a cleanuppad. Let's check the callee
- // for catchpads.
- for (const BasicBlock &CalledBB : *CalledFunc) {
- if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
- return InlineResult::failure("catch in cleanup funclet");
- }
- }
- } else if (isAsynchronousEHPersonality(Personality)) {
-          // SEH is even less tolerant; there may not be any sort of exceptional
- // funclet in the callee.
- for (const BasicBlock &CalledBB : *CalledFunc) {
- if (CalledBB.isEHPad())
- return InlineResult::failure("SEH in cleanup funclet");
- }
- }
- }
- }
- }
-
- // Determine if we are dealing with a call in an EHPad which does not unwind
- // to caller.
- bool EHPadForCallUnwindsLocally = false;
- if (CallSiteEHPad && isa<CallInst>(CB)) {
- UnwindDestMemoTy FuncletUnwindMap;
- Value *CallSiteUnwindDestToken =
- getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
-
- EHPadForCallUnwindsLocally =
- CallSiteUnwindDestToken &&
- !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
- }
-
- // Get an iterator to the last basic block in the function, which will have
- // the new function inlined after it.
- Function::iterator LastBlock = --Caller->end();
-
- // Make sure to capture all of the return instructions from the cloned
- // function.
- SmallVector<ReturnInst*, 8> Returns;
- ClonedCodeInfo InlinedFunctionInfo;
- Function::iterator FirstNewBlock;
-
- { // Scope to destroy VMap after cloning.
- ValueToValueMapTy VMap;
- // Keep a list of pair (dst, src) to emit byval initializations.
- SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
-
+ updateProfileCallee(Callee, -CallCount, &VMap);
+}
+
+void llvm::updateProfileCallee(
+ Function *Callee, int64_t entryDelta,
+ const ValueMap<const Value *, WeakTrackingVH> *VMap) {
+ auto CalleeCount = Callee->getEntryCount();
+ if (!CalleeCount.hasValue())
+ return;
+
+ uint64_t priorEntryCount = CalleeCount.getCount();
+ uint64_t newEntryCount;
+
+ // Since CallSiteCount is an estimate, it could exceed the original callee
+  // count, so clamp the new count to 0 to guard against underflow.
+ if (entryDelta < 0 && static_cast<uint64_t>(-entryDelta) > priorEntryCount)
+ newEntryCount = 0;
+ else
+ newEntryCount = priorEntryCount + entryDelta;
+
+  // During inlining?
+ if (VMap) {
+ uint64_t cloneEntryCount = priorEntryCount - newEntryCount;
+ for (auto Entry : *VMap)
+ if (isa<CallInst>(Entry.first))
+ if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+ CI->updateProfWeight(cloneEntryCount, priorEntryCount);
+ }
+
+ if (entryDelta) {
+ Callee->setEntryCount(newEntryCount);
+
+ for (BasicBlock &BB : *Callee)
+ // No need to update the callsite if it is pruned during inlining.
+ if (!VMap || VMap->count(&BB))
+ for (Instruction &I : BB)
+ if (CallInst *CI = dyn_cast<CallInst>(&I))
+ CI->updateProfWeight(newEntryCount, priorEntryCount);
+ }
+}
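+
+// Worked example of the arithmetic above: with a prior entry count of 100 and
+// entryDelta = -30 (a call site accounting for 30 calls was inlined), the
+// callee's entry count becomes 70; calls cloned into the caller have their
+// profile weights scaled by 30/100 and the calls remaining in the callee by
+// 70/100, so the two copies together still roughly account for the original
+// counts.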
+
+/// This function inlines the called function into the basic block of the
+/// caller. This returns false if it is not possible to inline this call.
+/// The program is still in a well defined state if this occurs though.
+///
+/// Note that this only does one level of inlining. For example, if the
+/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
+/// exists in the instruction stream. Similarly this will inline a recursive
+/// function by one level.
+llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
+ AAResults *CalleeAAR,
+ bool InsertLifetime,
+ Function *ForwardVarArgsTo) {
+ assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
+
+ // FIXME: we don't inline callbr yet.
+ if (isa<CallBrInst>(CB))
+ return InlineResult::failure("We don't inline callbr yet.");
+
+ // If IFI has any state in it, zap it before we fill it in.
+ IFI.reset();
+
+ Function *CalledFunc = CB.getCalledFunction();
+ if (!CalledFunc || // Can't inline external function or indirect
+ CalledFunc->isDeclaration()) // call!
+ return InlineResult::failure("external or indirect");
+
+ // The inliner does not know how to inline through calls with operand bundles
+ // in general ...
+ if (CB.hasOperandBundles()) {
+ for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
+ uint32_t Tag = CB.getOperandBundleAt(i).getTagID();
+ // ... but it knows how to inline through "deopt" operand bundles ...
+ if (Tag == LLVMContext::OB_deopt)
+ continue;
+ // ... and "funclet" operand bundles.
+ if (Tag == LLVMContext::OB_funclet)
+ continue;
+
+ return InlineResult::failure("unsupported operand bundle");
+ }
+ }
+
+ // If the call to the callee cannot throw, set the 'nounwind' flag on any
+ // calls that we inline.
+ bool MarkNoUnwind = CB.doesNotThrow();
+
+ BasicBlock *OrigBB = CB.getParent();
+ Function *Caller = OrigBB->getParent();
+
+ // GC poses two hazards to inlining, which only occur when the callee has GC:
+ // 1. If the caller has no GC, then the callee's GC must be propagated to the
+ // caller.
+ // 2. If the caller has a differing GC, it is invalid to inline.
+ if (CalledFunc->hasGC()) {
+ if (!Caller->hasGC())
+ Caller->setGC(CalledFunc->getGC());
+ else if (CalledFunc->getGC() != Caller->getGC())
+ return InlineResult::failure("incompatible GC");
+ }
+
+ // Get the personality function from the callee if it contains a landing pad.
+ Constant *CalledPersonality =
+ CalledFunc->hasPersonalityFn()
+ ? CalledFunc->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+
+ // Find the personality function used by the landing pads of the caller. If it
+ // exists, then check to see that it matches the personality function used in
+ // the callee.
+ Constant *CallerPersonality =
+ Caller->hasPersonalityFn()
+ ? Caller->getPersonalityFn()->stripPointerCasts()
+ : nullptr;
+ if (CalledPersonality) {
+ if (!CallerPersonality)
+ Caller->setPersonalityFn(CalledPersonality);
+ // If the personality functions match, then we can perform the
+ // inlining. Otherwise, we can't inline.
+ // TODO: This isn't 100% true. Some personality functions are proper
+ // supersets of others and can be used in place of the other.
+ else if (CalledPersonality != CallerPersonality)
+ return InlineResult::failure("incompatible personality");
+ }
+
+ // We need to figure out which funclet the callsite was in so that we may
+ // properly nest the callee.
+ Instruction *CallSiteEHPad = nullptr;
+ if (CallerPersonality) {
+ EHPersonality Personality = classifyEHPersonality(CallerPersonality);
+ if (isScopedEHPersonality(Personality)) {
+ Optional<OperandBundleUse> ParentFunclet =
+ CB.getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
+
+ // OK, the inlining site is legal. What about the target function?
+
+ if (CallSiteEHPad) {
+ if (Personality == EHPersonality::MSVC_CXX) {
+ // The MSVC personality cannot tolerate catches getting inlined into
+ // cleanup funclets.
+ if (isa<CleanupPadInst>(CallSiteEHPad)) {
+ // Ok, the call site is within a cleanuppad. Let's check the callee
+ // for catchpads.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
+ return InlineResult::failure("catch in cleanup funclet");
+ }
+ }
+ } else if (isAsynchronousEHPersonality(Personality)) {
+          // SEH is even less tolerant; there may not be any sort of exceptional
+ // funclet in the callee.
+ for (const BasicBlock &CalledBB : *CalledFunc) {
+ if (CalledBB.isEHPad())
+ return InlineResult::failure("SEH in cleanup funclet");
+ }
+ }
+ }
+ }
+ }
+
+ // Determine if we are dealing with a call in an EHPad which does not unwind
+ // to caller.
+ bool EHPadForCallUnwindsLocally = false;
+ if (CallSiteEHPad && isa<CallInst>(CB)) {
+ UnwindDestMemoTy FuncletUnwindMap;
+ Value *CallSiteUnwindDestToken =
+ getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
+
+ EHPadForCallUnwindsLocally =
+ CallSiteUnwindDestToken &&
+ !isa<ConstantTokenNone>(CallSiteUnwindDestToken);
+ }
+
+ // Get an iterator to the last basic block in the function, which will have
+ // the new function inlined after it.
+ Function::iterator LastBlock = --Caller->end();
+
+ // Make sure to capture all of the return instructions from the cloned
+ // function.
+ SmallVector<ReturnInst*, 8> Returns;
+ ClonedCodeInfo InlinedFunctionInfo;
+ Function::iterator FirstNewBlock;
+
+ { // Scope to destroy VMap after cloning.
+ ValueToValueMapTy VMap;
+ // Keep a list of pair (dst, src) to emit byval initializations.
+ SmallVector<std::pair<Value*, Value*>, 4> ByValInit;
+
// When inlining a function that contains noalias scope metadata,
// this metadata needs to be cloned so that the inlined blocks
// have different "unique scopes" at every call site.
@@ -1793,732 +1793,732 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
// callee.
ScopedAliasMetadataDeepCloner SAMetadataCloner(CB.getCalledFunction());
- auto &DL = Caller->getParent()->getDataLayout();
-
- // Calculate the vector of arguments to pass into the function cloner, which
- // matches up the formal to the actual argument values.
- auto AI = CB.arg_begin();
- unsigned ArgNo = 0;
- for (Function::arg_iterator I = CalledFunc->arg_begin(),
- E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
- Value *ActualArg = *AI;
-
-      // When byval arguments are actually inlined, we need to make the copy implied
- // by them explicit. However, we don't do this if the callee is readonly
- // or readnone, because the copy would be unneeded: the callee doesn't
- // modify the struct.
- if (CB.isByValArgument(ArgNo)) {
- ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
- CalledFunc->getParamAlignment(ArgNo));
- if (ActualArg != *AI)
- ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
- }
-
- VMap[&*I] = ActualArg;
- }
-
- // TODO: Remove this when users have been updated to the assume bundles.
- // Add alignment assumptions if necessary. We do this before the inlined
- // instructions are actually cloned into the caller so that we can easily
- // check what will be known at the start of the inlined code.
- AddAlignmentAssumptions(CB, IFI);
-
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
-
-    /// Preserve all attributes on the call and its parameters.
- salvageKnowledge(&CB, AC);
-
- // We want the inliner to prune the code as it copies. We would LOVE to
- // have no dead or constant instructions leftover after inlining occurs
- // (which can happen, e.g., because an argument was constant), but we'll be
- // happy with whatever the cloner can do.
- CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
- /*ModuleLevelChanges=*/false, Returns, ".i",
- &InlinedFunctionInfo, &CB);
- // Remember the first block that is newly cloned over.
- FirstNewBlock = LastBlock; ++FirstNewBlock;
-
- if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
- // Update the BFI of blocks cloned into the caller.
- updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
- CalledFunc->front());
-
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
- IFI.PSI, IFI.CallerBFI);
-
- // Inject byval arguments initialization.
- for (std::pair<Value*, Value*> &Init : ByValInit)
- HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
- &*FirstNewBlock, IFI);
-
- Optional<OperandBundleUse> ParentDeopt =
- CB.getOperandBundle(LLVMContext::OB_deopt);
- if (ParentDeopt) {
- SmallVector<OperandBundleDef, 2> OpDefs;
-
- for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
- CallBase *ICS = dyn_cast_or_null<CallBase>(VH);
- if (!ICS)
- continue; // instruction was DCE'd or RAUW'ed to undef
-
- OpDefs.clear();
-
- OpDefs.reserve(ICS->getNumOperandBundles());
-
- for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe;
- ++COBi) {
- auto ChildOB = ICS->getOperandBundleAt(COBi);
- if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
- // If the inlined call has other operand bundles, let them be
- OpDefs.emplace_back(ChildOB);
- continue;
- }
-
- // It may be useful to separate this logic (of handling operand
- // bundles) out to a separate "policy" component if this gets crowded.
- // Prepend the parent's deoptimization continuation to the newly
- // inlined call's deoptimization continuation.
- std::vector<Value *> MergedDeoptArgs;
- MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() +
- ChildOB.Inputs.size());
-
+ auto &DL = Caller->getParent()->getDataLayout();
+
+ // Calculate the vector of arguments to pass into the function cloner, which
+ // matches up the formal to the actual argument values.
+ auto AI = CB.arg_begin();
+ unsigned ArgNo = 0;
+ for (Function::arg_iterator I = CalledFunc->arg_begin(),
+ E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
+ Value *ActualArg = *AI;
+
+      // When byval arguments are actually inlined, we need to make the copy implied
+ // by them explicit. However, we don't do this if the callee is readonly
+ // or readnone, because the copy would be unneeded: the callee doesn't
+ // modify the struct.
+ if (CB.isByValArgument(ArgNo)) {
+ ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
+ CalledFunc->getParamAlignment(ArgNo));
+ if (ActualArg != *AI)
+ ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
+ }
+
+ VMap[&*I] = ActualArg;
+ }
+
+ // TODO: Remove this when users have been updated to the assume bundles.
+ // Add alignment assumptions if necessary. We do this before the inlined
+ // instructions are actually cloned into the caller so that we can easily
+ // check what will be known at the start of the inlined code.
+ AddAlignmentAssumptions(CB, IFI);
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+
+    /// Preserve all attributes on the call and its parameters.
+ salvageKnowledge(&CB, AC);
+
+ // We want the inliner to prune the code as it copies. We would LOVE to
+ // have no dead or constant instructions leftover after inlining occurs
+ // (which can happen, e.g., because an argument was constant), but we'll be
+ // happy with whatever the cloner can do.
+ CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
+ /*ModuleLevelChanges=*/false, Returns, ".i",
+ &InlinedFunctionInfo, &CB);
+ // Remember the first block that is newly cloned over.
+ FirstNewBlock = LastBlock; ++FirstNewBlock;
+
+ if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
+ // Update the BFI of blocks cloned into the caller.
+ updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
+ CalledFunc->front());
+
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
+ IFI.PSI, IFI.CallerBFI);
+
+ // Inject byval arguments initialization.
+ for (std::pair<Value*, Value*> &Init : ByValInit)
+ HandleByValArgumentInit(Init.first, Init.second, Caller->getParent(),
+ &*FirstNewBlock, IFI);
+
+ Optional<OperandBundleUse> ParentDeopt =
+ CB.getOperandBundle(LLVMContext::OB_deopt);
+ if (ParentDeopt) {
+ SmallVector<OperandBundleDef, 2> OpDefs;
+
+ for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
+ CallBase *ICS = dyn_cast_or_null<CallBase>(VH);
+ if (!ICS)
+ continue; // instruction was DCE'd or RAUW'ed to undef
+
+ OpDefs.clear();
+
+ OpDefs.reserve(ICS->getNumOperandBundles());
+
+ for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe;
+ ++COBi) {
+ auto ChildOB = ICS->getOperandBundleAt(COBi);
+ if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
+ // If the inlined call has other operand bundles, let them be
+ OpDefs.emplace_back(ChildOB);
+ continue;
+ }
+
+ // It may be useful to separate this logic (of handling operand
+ // bundles) out to a separate "policy" component if this gets crowded.
+ // Prepend the parent's deoptimization continuation to the newly
+ // inlined call's deoptimization continuation.
+ std::vector<Value *> MergedDeoptArgs;
+ MergedDeoptArgs.reserve(ParentDeopt->Inputs.size() +
+ ChildOB.Inputs.size());
+
llvm::append_range(MergedDeoptArgs, ParentDeopt->Inputs);
llvm::append_range(MergedDeoptArgs, ChildOB.Inputs);
-
- OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
- }
-
- Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
-
- // Note: the RAUW does the appropriate fixup in VMap, so we need to do
- // this even if the call returns void.
- ICS->replaceAllUsesWith(NewI);
-
- VH = nullptr;
- ICS->eraseFromParent();
- }
- }
-
- // Update the callgraph if requested.
- if (IFI.CG)
- UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI);
-
- // For 'nodebug' functions, the associated DISubprogram is always null.
- // Conservatively avoid propagating the callsite debug location to
- // instructions inlined from a function whose DISubprogram is not null.
- fixupLineNumbers(Caller, FirstNewBlock, &CB,
- CalledFunc->getSubprogram() != nullptr);
-
+
+ OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
+ }
+
+ Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
+
+ // Note: the RAUW does the appropriate fixup in VMap, so we need to do
+ // this even if the call returns void.
+ ICS->replaceAllUsesWith(NewI);
+
+ VH = nullptr;
+ ICS->eraseFromParent();
+ }
+ }
+
+ // Update the callgraph if requested.
+ if (IFI.CG)
+ UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI);
+
+ // For 'nodebug' functions, the associated DISubprogram is always null.
+ // Conservatively avoid propagating the callsite debug location to
+ // instructions inlined from a function whose DISubprogram is not null.
+ fixupLineNumbers(Caller, FirstNewBlock, &CB,
+ CalledFunc->getSubprogram() != nullptr);
+
// Now clone the inlined noalias scope metadata.
SAMetadataCloner.clone();
SAMetadataCloner.remap(FirstNewBlock, Caller->end());
-
- // Add noalias metadata if necessary.
- AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
-
- // Clone return attributes on the callsite into the calls within the inlined
- // function which feed into its return value.
- AddReturnAttributes(CB, VMap);
-
+
+ // Add noalias metadata if necessary.
+ AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
+
+ // Clone return attributes on the callsite into the calls within the inlined
+ // function which feed into its return value.
+ AddReturnAttributes(CB, VMap);
+
// Propagate metadata on the callsite if necessary.
PropagateCallSiteMetadata(CB, FirstNewBlock, Caller->end());
-
- // Register any cloned assumptions.
- if (IFI.GetAssumptionCache)
- for (BasicBlock &NewBlock :
- make_range(FirstNewBlock->getIterator(), Caller->end()))
- for (Instruction &I : NewBlock)
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- IFI.GetAssumptionCache(*Caller).registerAssumption(II);
- }
-
- // If there are any alloca instructions in the block that used to be the entry
- // block for the callee, move them to the entry block of the caller. First
- // calculate which instruction they should be inserted before. We insert the
- // instructions at the end of the current alloca list.
- {
- BasicBlock::iterator InsertPoint = Caller->begin()->begin();
- for (BasicBlock::iterator I = FirstNewBlock->begin(),
- E = FirstNewBlock->end(); I != E; ) {
- AllocaInst *AI = dyn_cast<AllocaInst>(I++);
- if (!AI) continue;
-
- // If the alloca is now dead, remove it. This often occurs due to code
- // specialization.
- if (AI->use_empty()) {
- AI->eraseFromParent();
- continue;
- }
-
- if (!allocaWouldBeStaticInEntry(AI))
- continue;
-
- // Keep track of the static allocas that we inline into the caller.
- IFI.StaticAllocas.push_back(AI);
-
- // Scan for the block of allocas that we can move over, and move them
- // all at once.
- while (isa<AllocaInst>(I) &&
- !cast<AllocaInst>(I)->use_empty() &&
- allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) {
- IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
- ++I;
- }
-
- // Transfer all of the allocas over in a block. Using splice means
- // that the instructions aren't removed from the symbol table, then
- // reinserted.
- Caller->getEntryBlock().getInstList().splice(
- InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
- }
- }
-
- SmallVector<Value*,4> VarArgsToForward;
- SmallVector<AttributeSet, 4> VarArgsAttrs;
- for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CB.getNumArgOperands(); i++) {
- VarArgsToForward.push_back(CB.getArgOperand(i));
- VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
- }
-
- bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
- if (InlinedFunctionInfo.ContainsCalls) {
- CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
- if (CallInst *CI = dyn_cast<CallInst>(&CB))
- CallSiteTailKind = CI->getTailCallKind();
-
- // For inlining purposes, the "notail" marker is the same as no marker.
- if (CallSiteTailKind == CallInst::TCK_NoTail)
- CallSiteTailKind = CallInst::TCK_None;
-
- for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
- ++BB) {
- for (auto II = BB->begin(); II != BB->end();) {
- Instruction &I = *II++;
- CallInst *CI = dyn_cast<CallInst>(&I);
- if (!CI)
- continue;
-
- // Forward varargs from inlined call site to calls to the
- // ForwardVarArgsTo function, if requested, and to musttail calls.
- if (!VarArgsToForward.empty() &&
- ((ForwardVarArgsTo &&
- CI->getCalledFunction() == ForwardVarArgsTo) ||
- CI->isMustTailCall())) {
- // Collect attributes for non-vararg parameters.
- AttributeList Attrs = CI->getAttributes();
- SmallVector<AttributeSet, 8> ArgAttrs;
- if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
- for (unsigned ArgNo = 0;
- ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
- ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
- }
-
- // Add VarArg attributes.
- ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
- Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
- Attrs.getRetAttributes(), ArgAttrs);
- // Add VarArgs to existing parameters.
- SmallVector<Value *, 6> Params(CI->arg_operands());
- Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
- CallInst *NewCI = CallInst::Create(
- CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
- NewCI->setDebugLoc(CI->getDebugLoc());
- NewCI->setAttributes(Attrs);
- NewCI->setCallingConv(CI->getCallingConv());
- CI->replaceAllUsesWith(NewCI);
- CI->eraseFromParent();
- CI = NewCI;
- }
-
- if (Function *F = CI->getCalledFunction())
- InlinedDeoptimizeCalls |=
- F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
-
- // We need to reduce the strength of any inlined tail calls. For
- // musttail, we have to avoid introducing potential unbounded stack
- // growth. For example, if functions 'f' and 'g' are mutually recursive
- // with musttail, we can inline 'g' into 'f' so long as we preserve
- // musttail on the cloned call to 'f'. If either the inlined call site
- // or the cloned call site is *not* musttail, the program already has
- // one frame of stack growth, so it's safe to remove musttail. Here is
- // a table of example transformations:
- //
- // f -> musttail g -> musttail f ==> f -> musttail f
- // f -> musttail g -> tail f ==> f -> tail f
- // f -> g -> musttail f ==> f -> f
- // f -> g -> tail f ==> f -> f
- //
- // Inlined notail calls should remain notail calls.
- CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
- if (ChildTCK != CallInst::TCK_NoTail)
- ChildTCK = std::min(CallSiteTailKind, ChildTCK);
- CI->setTailCallKind(ChildTCK);
- InlinedMustTailCalls |= CI->isMustTailCall();
-
- // Calls inlined through a 'nounwind' call site should be marked
- // 'nounwind'.
- if (MarkNoUnwind)
- CI->setDoesNotThrow();
- }
- }
- }
-
-  // Leave lifetime markers for the static allocas, scoping them to the
- // function we just inlined.
- if (InsertLifetime && !IFI.StaticAllocas.empty()) {
- IRBuilder<> builder(&FirstNewBlock->front());
- for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
- AllocaInst *AI = IFI.StaticAllocas[ai];
- // Don't mark swifterror allocas. They can't have bitcast uses.
- if (AI->isSwiftError())
- continue;
-
- // If the alloca is already scoped to something smaller than the whole
- // function then there's no need to add redundant, less accurate markers.
- if (hasLifetimeMarkers(AI))
- continue;
-
- // Try to determine the size of the allocation.
- ConstantInt *AllocaSize = nullptr;
- if (ConstantInt *AIArraySize =
- dyn_cast<ConstantInt>(AI->getArraySize())) {
- auto &DL = Caller->getParent()->getDataLayout();
- Type *AllocaType = AI->getAllocatedType();
+
+ // Register any cloned assumptions.
+ if (IFI.GetAssumptionCache)
+ for (BasicBlock &NewBlock :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBlock)
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ IFI.GetAssumptionCache(*Caller).registerAssumption(II);
+ }
+
+ // If there are any alloca instructions in the block that used to be the entry
+ // block for the callee, move them to the entry block of the caller. First
+ // calculate which instruction they should be inserted before. We insert the
+ // instructions at the end of the current alloca list.
+ {
+ BasicBlock::iterator InsertPoint = Caller->begin()->begin();
+ for (BasicBlock::iterator I = FirstNewBlock->begin(),
+ E = FirstNewBlock->end(); I != E; ) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(I++);
+ if (!AI) continue;
+
+ // If the alloca is now dead, remove it. This often occurs due to code
+ // specialization.
+ if (AI->use_empty()) {
+ AI->eraseFromParent();
+ continue;
+ }
+
+ if (!allocaWouldBeStaticInEntry(AI))
+ continue;
+
+ // Keep track of the static allocas that we inline into the caller.
+ IFI.StaticAllocas.push_back(AI);
+
+ // Scan for the block of allocas that we can move over, and move them
+ // all at once.
+ while (isa<AllocaInst>(I) &&
+ !cast<AllocaInst>(I)->use_empty() &&
+ allocaWouldBeStaticInEntry(cast<AllocaInst>(I))) {
+ IFI.StaticAllocas.push_back(cast<AllocaInst>(I));
+ ++I;
+ }
+
+ // Transfer all of the allocas over in a block. Using splice means
+ // that the instructions aren't removed from the symbol table, then
+ // reinserted.
+ Caller->getEntryBlock().getInstList().splice(
+ InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
+ }
+ }
+
+ SmallVector<Value*,4> VarArgsToForward;
+ SmallVector<AttributeSet, 4> VarArgsAttrs;
+ for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
+ i < CB.getNumArgOperands(); i++) {
+ VarArgsToForward.push_back(CB.getArgOperand(i));
+ VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
+ }
+
+ bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
+ if (InlinedFunctionInfo.ContainsCalls) {
+ CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
+ if (CallInst *CI = dyn_cast<CallInst>(&CB))
+ CallSiteTailKind = CI->getTailCallKind();
+
+ // For inlining purposes, the "notail" marker is the same as no marker.
+ if (CallSiteTailKind == CallInst::TCK_NoTail)
+ CallSiteTailKind = CallInst::TCK_None;
+
+ for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E;
+ ++BB) {
+ for (auto II = BB->begin(); II != BB->end();) {
+ Instruction &I = *II++;
+ CallInst *CI = dyn_cast<CallInst>(&I);
+ if (!CI)
+ continue;
+
+ // Forward varargs from inlined call site to calls to the
+ // ForwardVarArgsTo function, if requested, and to musttail calls.
+ if (!VarArgsToForward.empty() &&
+ ((ForwardVarArgsTo &&
+ CI->getCalledFunction() == ForwardVarArgsTo) ||
+ CI->isMustTailCall())) {
+ // Collect attributes for non-vararg parameters.
+ AttributeList Attrs = CI->getAttributes();
+ SmallVector<AttributeSet, 8> ArgAttrs;
+ if (!Attrs.isEmpty() || !VarArgsAttrs.empty()) {
+ for (unsigned ArgNo = 0;
+ ArgNo < CI->getFunctionType()->getNumParams(); ++ArgNo)
+ ArgAttrs.push_back(Attrs.getParamAttributes(ArgNo));
+ }
+
+ // Add VarArg attributes.
+ ArgAttrs.append(VarArgsAttrs.begin(), VarArgsAttrs.end());
+ Attrs = AttributeList::get(CI->getContext(), Attrs.getFnAttributes(),
+ Attrs.getRetAttributes(), ArgAttrs);
+ // Add VarArgs to existing parameters.
+ SmallVector<Value *, 6> Params(CI->arg_operands());
+ Params.append(VarArgsToForward.begin(), VarArgsToForward.end());
+ CallInst *NewCI = CallInst::Create(
+ CI->getFunctionType(), CI->getCalledOperand(), Params, "", CI);
+ NewCI->setDebugLoc(CI->getDebugLoc());
+ NewCI->setAttributes(Attrs);
+ NewCI->setCallingConv(CI->getCallingConv());
+ CI->replaceAllUsesWith(NewCI);
+ CI->eraseFromParent();
+ CI = NewCI;
+ }
+
+ if (Function *F = CI->getCalledFunction())
+ InlinedDeoptimizeCalls |=
+ F->getIntrinsicID() == Intrinsic::experimental_deoptimize;
+
+ // We need to reduce the strength of any inlined tail calls. For
+ // musttail, we have to avoid introducing potential unbounded stack
+ // growth. For example, if functions 'f' and 'g' are mutually recursive
+ // with musttail, we can inline 'g' into 'f' so long as we preserve
+ // musttail on the cloned call to 'f'. If either the inlined call site
+ // or the cloned call site is *not* musttail, the program already has
+ // one frame of stack growth, so it's safe to remove musttail. Here is
+ // a table of example transformations:
+ //
+ // f -> musttail g -> musttail f ==> f -> musttail f
+ // f -> musttail g -> tail f ==> f -> tail f
+ // f -> g -> musttail f ==> f -> f
+ // f -> g -> tail f ==> f -> f
+ //
+ // Inlined notail calls should remain notail calls.
+ CallInst::TailCallKind ChildTCK = CI->getTailCallKind();
+ if (ChildTCK != CallInst::TCK_NoTail)
+ ChildTCK = std::min(CallSiteTailKind, ChildTCK);
+ CI->setTailCallKind(ChildTCK);
+ InlinedMustTailCalls |= CI->isMustTailCall();
+
+ // Calls inlined through a 'nounwind' call site should be marked
+ // 'nounwind'.
+ if (MarkNoUnwind)
+ CI->setDoesNotThrow();
+ }
+ }
+ }
+
+  // Leave lifetime markers for the static allocas, scoping them to the
+ // function we just inlined.
+ if (InsertLifetime && !IFI.StaticAllocas.empty()) {
+ IRBuilder<> builder(&FirstNewBlock->front());
+ for (unsigned ai = 0, ae = IFI.StaticAllocas.size(); ai != ae; ++ai) {
+ AllocaInst *AI = IFI.StaticAllocas[ai];
+ // Don't mark swifterror allocas. They can't have bitcast uses.
+ if (AI->isSwiftError())
+ continue;
+
+ // If the alloca is already scoped to something smaller than the whole
+ // function then there's no need to add redundant, less accurate markers.
+ if (hasLifetimeMarkers(AI))
+ continue;
+
+ // Try to determine the size of the allocation.
+ ConstantInt *AllocaSize = nullptr;
+ if (ConstantInt *AIArraySize =
+ dyn_cast<ConstantInt>(AI->getArraySize())) {
+ auto &DL = Caller->getParent()->getDataLayout();
+ Type *AllocaType = AI->getAllocatedType();
TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
- uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
-
- // Don't add markers for zero-sized allocas.
- if (AllocaArraySize == 0)
- continue;
-
- // Check that array size doesn't saturate uint64_t and doesn't
- // overflow when it's multiplied by type size.
+ uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+
+ // Don't add markers for zero-sized allocas.
+ if (AllocaArraySize == 0)
+ continue;
+
+ // Check that array size doesn't saturate uint64_t and doesn't
+ // overflow when it's multiplied by type size.
if (!AllocaTypeSize.isScalable() &&
AllocaArraySize != std::numeric_limits<uint64_t>::max() &&
- std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
+ std::numeric_limits<uint64_t>::max() / AllocaArraySize >=
AllocaTypeSize.getFixedSize()) {
- AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
- AllocaArraySize * AllocaTypeSize);
- }
- }
-
- builder.CreateLifetimeStart(AI, AllocaSize);
- for (ReturnInst *RI : Returns) {
- // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
- // call and a return. The return kills all local allocas.
- if (InlinedMustTailCalls &&
- RI->getParent()->getTerminatingMustTailCall())
- continue;
- if (InlinedDeoptimizeCalls &&
- RI->getParent()->getTerminatingDeoptimizeCall())
- continue;
- IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
- }
- }
- }
-
- // If the inlined code contained dynamic alloca instructions, wrap the inlined
- // code with llvm.stacksave/llvm.stackrestore intrinsics.
- if (InlinedFunctionInfo.ContainsDynamicAllocas) {
- Module *M = Caller->getParent();
- // Get the two intrinsics we care about.
- Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
- Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
-
- // Insert the llvm.stacksave.
- CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())
- .CreateCall(StackSave, {}, "savedstack");
-
- // Insert a call to llvm.stackrestore before any return instructions in the
- // inlined function.
- for (ReturnInst *RI : Returns) {
- // Don't insert llvm.stackrestore calls between a musttail or deoptimize
- // call and a return. The return will restore the stack pointer.
- if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
- continue;
- if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
- continue;
- IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
- }
- }
-
- // If we are inlining for an invoke instruction, we must make sure to rewrite
- // any call instructions into invoke instructions. This is sensitive to which
- // funclet pads were top-level in the inlinee, so must be done before
- // rewriting the "parent pad" links.
- if (auto *II = dyn_cast<InvokeInst>(&CB)) {
- BasicBlock *UnwindDest = II->getUnwindDest();
- Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
- if (isa<LandingPadInst>(FirstNonPHI)) {
- HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
- } else {
- HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
- }
- }
-
- // Update the lexical scopes of the new funclets and callsites.
- // Anything that had 'none' as its parent is now nested inside the callsite's
- // EHPad.
-
- if (CallSiteEHPad) {
- for (Function::iterator BB = FirstNewBlock->getIterator(),
- E = Caller->end();
- BB != E; ++BB) {
- // Add bundle operands to any top-level call sites.
- SmallVector<OperandBundleDef, 1> OpBundles;
- for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
- CallBase *I = dyn_cast<CallBase>(&*BBI++);
- if (!I)
- continue;
-
- // Skip call sites which are nounwind intrinsics.
- auto *CalledFn =
- dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts());
- if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow())
- continue;
-
- // Skip call sites which already have a "funclet" bundle.
- if (I->getOperandBundle(LLVMContext::OB_funclet))
- continue;
-
- I->getOperandBundlesAsDefs(OpBundles);
- OpBundles.emplace_back("funclet", CallSiteEHPad);
-
- Instruction *NewInst = CallBase::Create(I, OpBundles, I);
- NewInst->takeName(I);
- I->replaceAllUsesWith(NewInst);
- I->eraseFromParent();
-
- OpBundles.clear();
- }
-
- // It is problematic if the inlinee has a cleanupret which unwinds to
- // caller and we inline it into a call site which doesn't unwind but into
- // an EH pad that does. Such an edge must be dynamically unreachable.
- // As such, we replace the cleanupret with unreachable.
- if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
- if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
- changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
-
- Instruction *I = BB->getFirstNonPHI();
- if (!I->isEHPad())
- continue;
-
- if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
- if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
- CatchSwitch->setParentPad(CallSiteEHPad);
- } else {
- auto *FPI = cast<FuncletPadInst>(I);
- if (isa<ConstantTokenNone>(FPI->getParentPad()))
- FPI->setParentPad(CallSiteEHPad);
- }
- }
- }
-
- if (InlinedDeoptimizeCalls) {
- // We need to at least remove the deoptimizing returns from the Return set,
- // so that the control flow from those returns does not get merged into the
- // caller (but terminate it instead). If the caller's return type does not
- // match the callee's return type, we also need to change the return type of
- // the intrinsic.
- if (Caller->getReturnType() == CB.getType()) {
+ AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+ AllocaArraySize * AllocaTypeSize);
+ }
+ }
+
+ builder.CreateLifetimeStart(AI, AllocaSize);
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.lifetime.end calls between a musttail or deoptimize
+ // call and a return. The return kills all local allocas.
+ if (InlinedMustTailCalls &&
+ RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls &&
+ RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateLifetimeEnd(AI, AllocaSize);
+ }
+ }
+ }
+
+ // If the inlined code contained dynamic alloca instructions, wrap the inlined
+ // code with llvm.stacksave/llvm.stackrestore intrinsics.
+ if (InlinedFunctionInfo.ContainsDynamicAllocas) {
+ Module *M = Caller->getParent();
+ // Get the two intrinsics we care about.
+ Function *StackSave = Intrinsic::getDeclaration(M, Intrinsic::stacksave);
+ Function *StackRestore=Intrinsic::getDeclaration(M,Intrinsic::stackrestore);
+
+ // Insert the llvm.stacksave.
+ CallInst *SavedPtr = IRBuilder<>(&*FirstNewBlock, FirstNewBlock->begin())
+ .CreateCall(StackSave, {}, "savedstack");
+
+ // Insert a call to llvm.stackrestore before any return instructions in the
+ // inlined function.
+ for (ReturnInst *RI : Returns) {
+ // Don't insert llvm.stackrestore calls between a musttail or deoptimize
+ // call and a return. The return will restore the stack pointer.
+ if (InlinedMustTailCalls && RI->getParent()->getTerminatingMustTailCall())
+ continue;
+ if (InlinedDeoptimizeCalls && RI->getParent()->getTerminatingDeoptimizeCall())
+ continue;
+ IRBuilder<>(RI).CreateCall(StackRestore, SavedPtr);
+ }
+ }
+
+ // If we are inlining for an invoke instruction, we must make sure to rewrite
+ // any call instructions into invoke instructions. This is sensitive to which
+ // funclet pads were top-level in the inlinee, so must be done before
+ // rewriting the "parent pad" links.
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
+ BasicBlock *UnwindDest = II->getUnwindDest();
+ Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
+ if (isa<LandingPadInst>(FirstNonPHI)) {
+ HandleInlinedLandingPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ } else {
+ HandleInlinedEHPad(II, &*FirstNewBlock, InlinedFunctionInfo);
+ }
+ }
+
+ // Update the lexical scopes of the new funclets and callsites.
+ // Anything that had 'none' as its parent is now nested inside the callsite's
+ // EHPad.
+
+ if (CallSiteEHPad) {
+ for (Function::iterator BB = FirstNewBlock->getIterator(),
+ E = Caller->end();
+ BB != E; ++BB) {
+ // Add bundle operands to any top-level call sites.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
+ CallBase *I = dyn_cast<CallBase>(&*BBI++);
+ if (!I)
+ continue;
+
+ // Skip call sites which are nounwind intrinsics.
+ auto *CalledFn =
+ dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow())
+ continue;
+
+ // Skip call sites which already have a "funclet" bundle.
+ if (I->getOperandBundle(LLVMContext::OB_funclet))
+ continue;
+
+ I->getOperandBundlesAsDefs(OpBundles);
+ OpBundles.emplace_back("funclet", CallSiteEHPad);
+
+ Instruction *NewInst = CallBase::Create(I, OpBundles, I);
+ NewInst->takeName(I);
+ I->replaceAllUsesWith(NewInst);
+ I->eraseFromParent();
+
+ OpBundles.clear();
+ }
+
+ // It is problematic if the inlinee has a cleanupret which unwinds to
+ // caller and we inline it into a call site which doesn't unwind but into
+ // an EH pad that does. Such an edge must be dynamically unreachable.
+ // As such, we replace the cleanupret with unreachable.
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(BB->getTerminator()))
+ if (CleanupRet->unwindsToCaller() && EHPadForCallUnwindsLocally)
+ changeToUnreachable(CleanupRet, /*UseLLVMTrap=*/false);
+
+ Instruction *I = BB->getFirstNonPHI();
+ if (!I->isEHPad())
+ continue;
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(I)) {
+ if (isa<ConstantTokenNone>(CatchSwitch->getParentPad()))
+ CatchSwitch->setParentPad(CallSiteEHPad);
+ } else {
+ auto *FPI = cast<FuncletPadInst>(I);
+ if (isa<ConstantTokenNone>(FPI->getParentPad()))
+ FPI->setParentPad(CallSiteEHPad);
+ }
+ }
+ }
+
+ if (InlinedDeoptimizeCalls) {
+ // We need to at least remove the deoptimizing returns from the Return set,
+ // so that the control flow from those returns does not get merged into the
+ // caller (but terminate it instead). If the caller's return type does not
+ // match the callee's return type, we also need to change the return type of
+ // the intrinsic.
+ if (Caller->getReturnType() == CB.getType()) {
llvm::erase_if(Returns, [](ReturnInst *RI) {
- return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
- });
- } else {
- SmallVector<ReturnInst *, 8> NormalReturns;
- Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
- Caller->getParent(), Intrinsic::experimental_deoptimize,
- {Caller->getReturnType()});
-
- for (ReturnInst *RI : Returns) {
- CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
- if (!DeoptCall) {
- NormalReturns.push_back(RI);
- continue;
- }
-
- // The calling convention on the deoptimize call itself may be bogus,
- // since the code we're inlining may have undefined behavior (and may
- // never actually execute at runtime); but all
- // @llvm.experimental.deoptimize declarations have to have the same
- // calling convention in a well-formed module.
- auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
- NewDeoptIntrinsic->setCallingConv(CallingConv);
- auto *CurBB = RI->getParent();
- RI->eraseFromParent();
-
+ return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
+ });
+ } else {
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ Function *NewDeoptIntrinsic = Intrinsic::getDeclaration(
+ Caller->getParent(), Intrinsic::experimental_deoptimize,
+ {Caller->getReturnType()});
+
+ for (ReturnInst *RI : Returns) {
+ CallInst *DeoptCall = RI->getParent()->getTerminatingDeoptimizeCall();
+ if (!DeoptCall) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+
+ // The calling convention on the deoptimize call itself may be bogus,
+ // since the code we're inlining may have undefined behavior (and may
+ // never actually execute at runtime); but all
+ // @llvm.experimental.deoptimize declarations have to have the same
+ // calling convention in a well-formed module.
+ auto CallingConv = DeoptCall->getCalledFunction()->getCallingConv();
+ NewDeoptIntrinsic->setCallingConv(CallingConv);
+ auto *CurBB = RI->getParent();
+ RI->eraseFromParent();
+
SmallVector<Value *, 4> CallArgs(DeoptCall->args());
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- DeoptCall->getOperandBundlesAsDefs(OpBundles);
- DeoptCall->eraseFromParent();
- assert(!OpBundles.empty() &&
- "Expected at least the deopt operand bundle");
-
- IRBuilder<> Builder(CurBB);
- CallInst *NewDeoptCall =
- Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
- NewDeoptCall->setCallingConv(CallingConv);
- if (NewDeoptCall->getType()->isVoidTy())
- Builder.CreateRetVoid();
- else
- Builder.CreateRet(NewDeoptCall);
- }
-
- // Leave behind the normal returns so we can merge control flow.
- std::swap(Returns, NormalReturns);
- }
- }
-
- // Handle any inlined musttail call sites. In order for a new call site to be
- // musttail, the source of the clone and the inlined call site must have been
- // musttail. Therefore it's safe to return without merging control into the
- // phi below.
- if (InlinedMustTailCalls) {
- // Check if we need to bitcast the result of any musttail calls.
- Type *NewRetTy = Caller->getReturnType();
- bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy;
-
- // Handle the returns preceded by musttail calls separately.
- SmallVector<ReturnInst *, 8> NormalReturns;
- for (ReturnInst *RI : Returns) {
- CallInst *ReturnedMustTail =
- RI->getParent()->getTerminatingMustTailCall();
- if (!ReturnedMustTail) {
- NormalReturns.push_back(RI);
- continue;
- }
- if (!NeedBitCast)
- continue;
-
- // Delete the old return and any preceding bitcast.
- BasicBlock *CurBB = RI->getParent();
- auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue());
- RI->eraseFromParent();
- if (OldCast)
- OldCast->eraseFromParent();
-
- // Insert a new bitcast and return with the right type.
- IRBuilder<> Builder(CurBB);
- Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy));
- }
-
- // Leave behind the normal returns so we can merge control flow.
- std::swap(Returns, NormalReturns);
- }
-
- // Now that all of the transforms on the inlined code have taken place but
- // before we splice the inlined code into the CFG and lose track of which
- // blocks were actually inlined, collect the call sites. We only do this if
- // call graph updates weren't requested, as those provide value handle based
- // tracking of inlined call sites instead.
- if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
- // Otherwise just collect the raw call sites that were inlined.
- for (BasicBlock &NewBB :
- make_range(FirstNewBlock->getIterator(), Caller->end()))
- for (Instruction &I : NewBB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- IFI.InlinedCallSites.push_back(CB);
- }
-
- // If we cloned in _exactly one_ basic block, and if that block ends in a
- // return instruction, we splice the body of the inlined callee directly into
- // the calling basic block.
- if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
- // Move all of the instructions right before the call.
- OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(),
- FirstNewBlock->begin(), FirstNewBlock->end());
- // Remove the cloned basic block.
- Caller->getBasicBlockList().pop_back();
-
- // If the call site was an invoke instruction, add a branch to the normal
- // destination.
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
- BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
- NewBr->setDebugLoc(Returns[0]->getDebugLoc());
- }
-
- // If the return instruction returned a value, replace uses of the call with
- // uses of the returned value.
- if (!CB.use_empty()) {
- ReturnInst *R = Returns[0];
- if (&CB == R->getReturnValue())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- else
- CB.replaceAllUsesWith(R->getReturnValue());
- }
- // Since we are now done with the Call/Invoke, we can delete it.
- CB.eraseFromParent();
-
- // Since we are now done with the return instruction, delete it also.
- Returns[0]->eraseFromParent();
-
- // We are now done with the inlining.
- return InlineResult::success();
- }
-
- // Otherwise, we have the normal case, of more than one block to inline or
- // multiple return sites.
-
- // We want to clone the entire callee function into the hole between the
- // "starter" and "ender" blocks. How we accomplish this depends on whether
- // this is an invoke instruction or a call instruction.
- BasicBlock *AfterCallBB;
- BranchInst *CreatedBranchToNormalDest = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
-
- // Add an unconditional branch to make this look like the CallInst case...
- CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
-
- // Split the basic block. This guarantees that no PHI nodes will have to be
-    // updated due to new incoming edges, and makes the invoke case more
- // symmetric to the call case.
- AfterCallBB =
- OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
- CalledFunc->getName() + ".exit");
-
- } else { // It's a call
- // If this is a call instruction, we need to split the basic block that
- // the call lives in.
- //
- AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(),
- CalledFunc->getName() + ".exit");
- }
-
- if (IFI.CallerBFI) {
- // Copy original BB's block frequency to AfterCallBB
- IFI.CallerBFI->setBlockFreq(
- AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
- }
-
- // Change the branch that used to go to AfterCallBB to branch to the first
- // basic block of the inlined function.
- //
- Instruction *Br = OrigBB->getTerminator();
- assert(Br && Br->getOpcode() == Instruction::Br &&
- "splitBasicBlock broken!");
- Br->setOperand(0, &*FirstNewBlock);
-
- // Now that the function is correct, make it a little bit nicer. In
- // particular, move the basic blocks inserted from the end of the function
- // into the space made by splitting the source basic block.
- Caller->getBasicBlockList().splice(AfterCallBB->getIterator(),
- Caller->getBasicBlockList(), FirstNewBlock,
- Caller->end());
-
- // Handle all of the return instructions that we just cloned in, and eliminate
- // any users of the original call/invoke instruction.
- Type *RTy = CalledFunc->getReturnType();
-
- PHINode *PHI = nullptr;
- if (Returns.size() > 1) {
- // The PHI node should go at the front of the new basic block to merge all
- // possible incoming values.
- if (!CB.use_empty()) {
- PHI = PHINode::Create(RTy, Returns.size(), CB.getName(),
- &AfterCallBB->front());
- // Anything that used the result of the function call should now use the
- // PHI node as their operand.
- CB.replaceAllUsesWith(PHI);
- }
-
- // Loop over all of the return instructions adding entries to the PHI node
- // as appropriate.
- if (PHI) {
- for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
- ReturnInst *RI = Returns[i];
- assert(RI->getReturnValue()->getType() == PHI->getType() &&
- "Ret value not consistent in function!");
- PHI->addIncoming(RI->getReturnValue(), RI->getParent());
- }
- }
-
- // Add a branch to the merge points and remove return instructions.
- DebugLoc Loc;
- for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
- ReturnInst *RI = Returns[i];
- BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
- Loc = RI->getDebugLoc();
- BI->setDebugLoc(Loc);
- RI->eraseFromParent();
- }
- // We need to set the debug location to *somewhere* inside the
- // inlined function. The line number may be nonsensical, but the
- // instruction will at least be associated with the right
- // function.
- if (CreatedBranchToNormalDest)
- CreatedBranchToNormalDest->setDebugLoc(Loc);
- } else if (!Returns.empty()) {
- // Otherwise, if there is exactly one return value, just replace anything
- // using the return value of the call with the computed value.
- if (!CB.use_empty()) {
- if (&CB == Returns[0]->getReturnValue())
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- else
- CB.replaceAllUsesWith(Returns[0]->getReturnValue());
- }
-
- // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
- BasicBlock *ReturnBB = Returns[0]->getParent();
- ReturnBB->replaceAllUsesWith(AfterCallBB);
-
- // Splice the code from the return block into the block that it will return
- // to, which contains the code that was after the call.
- AfterCallBB->getInstList().splice(AfterCallBB->begin(),
- ReturnBB->getInstList());
-
- if (CreatedBranchToNormalDest)
- CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
-
-    // Delete the return instruction and the now-empty ReturnBB.
- Returns[0]->eraseFromParent();
- ReturnBB->eraseFromParent();
- } else if (!CB.use_empty()) {
- // No returns, but something is using the return value of the call. Just
- // nuke the result.
- CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
- }
-
- // Since we are now done with the Call/Invoke, we can delete it.
- CB.eraseFromParent();
-
- // If we inlined any musttail calls and the original return is now
- // unreachable, delete it. It can only contain a bitcast and ret.
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ DeoptCall->getOperandBundlesAsDefs(OpBundles);
+ DeoptCall->eraseFromParent();
+ assert(!OpBundles.empty() &&
+ "Expected at least the deopt operand bundle");
+
+ IRBuilder<> Builder(CurBB);
+ CallInst *NewDeoptCall =
+ Builder.CreateCall(NewDeoptIntrinsic, CallArgs, OpBundles);
+ NewDeoptCall->setCallingConv(CallingConv);
+ if (NewDeoptCall->getType()->isVoidTy())
+ Builder.CreateRetVoid();
+ else
+ Builder.CreateRet(NewDeoptCall);
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+ }
+
+ // Handle any inlined musttail call sites. In order for a new call site to be
+ // musttail, the source of the clone and the inlined call site must have been
+ // musttail. Therefore it's safe to return without merging control into the
+ // phi below.
+ if (InlinedMustTailCalls) {
+ // Check if we need to bitcast the result of any musttail calls.
+ Type *NewRetTy = Caller->getReturnType();
+ bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy;
+
+ // Handle the returns preceded by musttail calls separately.
+ SmallVector<ReturnInst *, 8> NormalReturns;
+ for (ReturnInst *RI : Returns) {
+ CallInst *ReturnedMustTail =
+ RI->getParent()->getTerminatingMustTailCall();
+ if (!ReturnedMustTail) {
+ NormalReturns.push_back(RI);
+ continue;
+ }
+ if (!NeedBitCast)
+ continue;
+
+ // Delete the old return and any preceding bitcast.
+ BasicBlock *CurBB = RI->getParent();
+ auto *OldCast = dyn_cast_or_null<BitCastInst>(RI->getReturnValue());
+ RI->eraseFromParent();
+ if (OldCast)
+ OldCast->eraseFromParent();
+
+ // Insert a new bitcast and return with the right type.
+ IRBuilder<> Builder(CurBB);
+ Builder.CreateRet(Builder.CreateBitCast(ReturnedMustTail, NewRetTy));
+ }
+
+ // Leave behind the normal returns so we can merge control flow.
+ std::swap(Returns, NormalReturns);
+ }
+
+ // Now that all of the transforms on the inlined code have taken place but
+ // before we splice the inlined code into the CFG and lose track of which
+ // blocks were actually inlined, collect the call sites. We only do this if
+ // call graph updates weren't requested, as those provide value handle based
+ // tracking of inlined call sites instead.
+ if (InlinedFunctionInfo.ContainsCalls && !IFI.CG) {
+ // Otherwise just collect the raw call sites that were inlined.
+ for (BasicBlock &NewBB :
+ make_range(FirstNewBlock->getIterator(), Caller->end()))
+ for (Instruction &I : NewBB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ IFI.InlinedCallSites.push_back(CB);
+ }
+
+ // If we cloned in _exactly one_ basic block, and if that block ends in a
+ // return instruction, we splice the body of the inlined callee directly into
+ // the calling basic block.
+ if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
+ // Move all of the instructions right before the call.
+ OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(),
+ FirstNewBlock->begin(), FirstNewBlock->end());
+ // Remove the cloned basic block.
+ Caller->getBasicBlockList().pop_back();
+
+ // If the call site was an invoke instruction, add a branch to the normal
+ // destination.
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
+ NewBr->setDebugLoc(Returns[0]->getDebugLoc());
+ }
+
+ // If the return instruction returned a value, replace uses of the call with
+ // uses of the returned value.
+ if (!CB.use_empty()) {
+ ReturnInst *R = Returns[0];
+ if (&CB == R->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ else
+ CB.replaceAllUsesWith(R->getReturnValue());
+ }
+ // Since we are now done with the Call/Invoke, we can delete it.
+ CB.eraseFromParent();
+
+ // Since we are now done with the return instruction, delete it also.
+ Returns[0]->eraseFromParent();
+
+ // We are now done with the inlining.
+ return InlineResult::success();
+ }
+
+ // Otherwise, we have the normal case, of more than one block to inline or
+ // multiple return sites.
+
+ // We want to clone the entire callee function into the hole between the
+ // "starter" and "ender" blocks. How we accomplish this depends on whether
+ // this is an invoke instruction or a call instruction.
+ BasicBlock *AfterCallBB;
+ BranchInst *CreatedBranchToNormalDest = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+
+ // Add an unconditional branch to make this look like the CallInst case...
+ CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
+
+ // Split the basic block. This guarantees that no PHI nodes will have to be
+    // updated due to new incoming edges, and makes the invoke case more
+ // symmetric to the call case.
+ AfterCallBB =
+ OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
+ CalledFunc->getName() + ".exit");
+
+ } else { // It's a call
+ // If this is a call instruction, we need to split the basic block that
+ // the call lives in.
+ //
+ AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(),
+ CalledFunc->getName() + ".exit");
+ }
+
+ if (IFI.CallerBFI) {
+ // Copy original BB's block frequency to AfterCallBB
+ IFI.CallerBFI->setBlockFreq(
+ AfterCallBB, IFI.CallerBFI->getBlockFreq(OrigBB).getFrequency());
+ }
+
+ // Change the branch that used to go to AfterCallBB to branch to the first
+ // basic block of the inlined function.
+ //
+ Instruction *Br = OrigBB->getTerminator();
+ assert(Br && Br->getOpcode() == Instruction::Br &&
+ "splitBasicBlock broken!");
+ Br->setOperand(0, &*FirstNewBlock);
+
+ // Now that the function is correct, make it a little bit nicer. In
+ // particular, move the basic blocks inserted from the end of the function
+ // into the space made by splitting the source basic block.
+ Caller->getBasicBlockList().splice(AfterCallBB->getIterator(),
+ Caller->getBasicBlockList(), FirstNewBlock,
+ Caller->end());
+
+ // Handle all of the return instructions that we just cloned in, and eliminate
+ // any users of the original call/invoke instruction.
+ Type *RTy = CalledFunc->getReturnType();
+
+ PHINode *PHI = nullptr;
+ if (Returns.size() > 1) {
+ // The PHI node should go at the front of the new basic block to merge all
+ // possible incoming values.
+ if (!CB.use_empty()) {
+ PHI = PHINode::Create(RTy, Returns.size(), CB.getName(),
+ &AfterCallBB->front());
+ // Anything that used the result of the function call should now use the
+ // PHI node as their operand.
+ CB.replaceAllUsesWith(PHI);
+ }
+
+ // Loop over all of the return instructions adding entries to the PHI node
+ // as appropriate.
+ if (PHI) {
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ assert(RI->getReturnValue()->getType() == PHI->getType() &&
+ "Ret value not consistent in function!");
+ PHI->addIncoming(RI->getReturnValue(), RI->getParent());
+ }
+ }
+
+ // Add a branch to the merge points and remove return instructions.
+ DebugLoc Loc;
+ for (unsigned i = 0, e = Returns.size(); i != e; ++i) {
+ ReturnInst *RI = Returns[i];
+ BranchInst* BI = BranchInst::Create(AfterCallBB, RI);
+ Loc = RI->getDebugLoc();
+ BI->setDebugLoc(Loc);
+ RI->eraseFromParent();
+ }
+ // We need to set the debug location to *somewhere* inside the
+ // inlined function. The line number may be nonsensical, but the
+ // instruction will at least be associated with the right
+ // function.
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Loc);
+ } else if (!Returns.empty()) {
+ // Otherwise, if there is exactly one return value, just replace anything
+ // using the return value of the call with the computed value.
+ if (!CB.use_empty()) {
+ if (&CB == Returns[0]->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ else
+ CB.replaceAllUsesWith(Returns[0]->getReturnValue());
+ }
+
+ // Update PHI nodes that use the ReturnBB to use the AfterCallBB.
+ BasicBlock *ReturnBB = Returns[0]->getParent();
+ ReturnBB->replaceAllUsesWith(AfterCallBB);
+
+ // Splice the code from the return block into the block that it will return
+ // to, which contains the code that was after the call.
+ AfterCallBB->getInstList().splice(AfterCallBB->begin(),
+ ReturnBB->getInstList());
+
+ if (CreatedBranchToNormalDest)
+ CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc());
+
+    // Delete the return instruction and the now-empty ReturnBB.
+ Returns[0]->eraseFromParent();
+ ReturnBB->eraseFromParent();
+ } else if (!CB.use_empty()) {
+ // No returns, but something is using the return value of the call. Just
+ // nuke the result.
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
+ }
+
+ // Since we are now done with the Call/Invoke, we can delete it.
+ CB.eraseFromParent();
+
+ // If we inlined any musttail calls and the original return is now
+ // unreachable, delete it. It can only contain a bitcast and ret.
if (InlinedMustTailCalls && pred_empty(AfterCallBB))
- AfterCallBB->eraseFromParent();
-
- // We should always be able to fold the entry block of the function into the
- // single predecessor of the block...
- assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
- BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
-
- // Splice the code entry block into calling block, right before the
- // unconditional branch.
- CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
- OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());
-
- // Remove the unconditional branch.
- OrigBB->getInstList().erase(Br);
-
- // Now we can remove the CalleeEntry block, which is now empty.
- Caller->getBasicBlockList().erase(CalleeEntry);
-
- // If we inserted a phi node, check to see if it has a single value (e.g. all
- // the entries are the same or undef). If so, remove the PHI so it doesn't
- // block other optimizations.
- if (PHI) {
- AssumptionCache *AC =
- IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
- auto &DL = Caller->getParent()->getDataLayout();
- if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
- PHI->replaceAllUsesWith(V);
- PHI->eraseFromParent();
- }
- }
-
- return InlineResult::success();
-}
+ AfterCallBB->eraseFromParent();
+
+ // We should always be able to fold the entry block of the function into the
+ // single predecessor of the block...
+ assert(cast<BranchInst>(Br)->isUnconditional() && "splitBasicBlock broken!");
+ BasicBlock *CalleeEntry = cast<BranchInst>(Br)->getSuccessor(0);
+
+ // Splice the code entry block into calling block, right before the
+ // unconditional branch.
+ CalleeEntry->replaceAllUsesWith(OrigBB); // Update PHI nodes
+ OrigBB->getInstList().splice(Br->getIterator(), CalleeEntry->getInstList());
+
+ // Remove the unconditional branch.
+ OrigBB->getInstList().erase(Br);
+
+ // Now we can remove the CalleeEntry block, which is now empty.
+ Caller->getBasicBlockList().erase(CalleeEntry);
+
+ // If we inserted a phi node, check to see if it has a single value (e.g. all
+ // the entries are the same or undef). If so, remove the PHI so it doesn't
+ // block other optimizations.
+ if (PHI) {
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+ auto &DL = Caller->getParent()->getDataLayout();
+ if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
+ PHI->replaceAllUsesWith(V);
+ PHI->eraseFromParent();
+ }
+ }
+
+ return InlineResult::success();
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
index ad334034b0..f3499c9c8a 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/InstructionNamer.cpp
@@ -1,35 +1,35 @@
-//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a little utility pass that gives instructions names; this is mostly
-// useful when diffing the effect of an optimization because deleting an
-// unnamed instruction can change all other instruction numbering, making the
-// diff very noisy.
-//
-//===----------------------------------------------------------------------===//
-
+//===- InstructionNamer.cpp - Give anonymous instructions names -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that gives instructions names; this is mostly
+// useful when diffing the effect of an optimization because deleting an
+// unnamed instruction can change all other instruction numbering, making the
+// diff very noisy.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/InstructionNamer.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
-using namespace llvm;
-
-namespace {
+namespace {
void nameInstructions(Function &F) {
for (auto &Arg : F.args()) {
if (!Arg.hasName())
Arg.setName("arg");
}
-
+
for (BasicBlock &BB : F) {
if (!BB.hasName())
BB.setName("bb");
@@ -37,39 +37,39 @@ void nameInstructions(Function &F) {
for (Instruction &I : BB) {
if (!I.hasName() && !I.getType()->isVoidTy())
I.setName("i");
- }
+ }
}
}
-
+
struct InstNamer : public FunctionPass {
static char ID; // Pass identification, replacement for typeid
InstNamer() : FunctionPass(ID) {
initializeInstNamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &Info) const override {
Info.setPreservesAll();
}
-
+
bool runOnFunction(Function &F) override {
nameInstructions(F);
return true;
}
};
-
- char InstNamer::ID = 0;
+
+ char InstNamer::ID = 0;
} // namespace
-
-INITIALIZE_PASS(InstNamer, "instnamer",
- "Assign names to anonymous instructions", false, false)
-char &llvm::InstructionNamerID = InstNamer::ID;
-//===----------------------------------------------------------------------===//
-//
-// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
-//
-FunctionPass *llvm::createInstructionNamerPass() {
- return new InstNamer();
-}
+
+INITIALIZE_PASS(InstNamer, "instnamer",
+ "Assign names to anonymous instructions", false, false)
+char &llvm::InstructionNamerID = InstNamer::ID;
+//===----------------------------------------------------------------------===//
+//
+// InstructionNamer - Give any unnamed non-void instructions "tmp" names.
+//
+FunctionPass *llvm::createInstructionNamerPass() {
+ return new InstNamer();
+}
PreservedAnalyses InstructionNamerPass::run(Function &F,
FunctionAnalysisManager &FAM) {
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
index ffb56f2fbe..9082049c82 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/IntegerDivision.cpp
@@ -1,673 +1,673 @@
-//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains an implementation of 32bit and 64bit scalar integer
-// division for targets that don't have native support. It's largely derived
-// from compiler-rt's implementations of __udivsi3 and __udivmoddi4,
-// but hand-tuned for targets that prefer less control flow.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/IntegerDivision.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "integer-division"
-
-/// Generate code to compute the remainder of two signed integers. Returns the
-/// remainder, which will have the sign of the dividend. Builder's insert point
-/// should be pointing where the caller wants code generated, e.g. at the srem
-/// instruction. This will generate a urem in the process, and Builder's insert
-/// point will be pointing at the uren (if present, i.e. not folded), ready to
-/// be expanded if the user wishes
-static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
- ConstantInt *Shift;
-
- if (BitWidth == 64) {
- Shift = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Shift = Builder.getInt32(31);
- }
-
- // Following instructions are generated for both i32 (shift 31) and
- // i64 (shift 63).
-
- // ; %dividend_sgn = ashr i32 %dividend, 31
- // ; %divisor_sgn = ashr i32 %divisor, 31
- // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
- // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
- // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
- // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
- // ; %urem = urem i32 %dividend, %divisor
- // ; %xored = xor i32 %urem, %dividend_sgn
- // ; %srem = sub i32 %xored, %dividend_sgn
- Value *DividendSign = Builder.CreateAShr(Dividend, Shift);
- Value *DivisorSign = Builder.CreateAShr(Divisor, Shift);
- Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
- Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
- Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
- Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
- Value *URem = Builder.CreateURem(UDividend, UDivisor);
- Value *Xored = Builder.CreateXor(URem, DividendSign);
- Value *SRem = Builder.CreateSub(Xored, DividendSign);
-
- if (Instruction *URemInst = dyn_cast<Instruction>(URem))
- Builder.SetInsertPoint(URemInst);
-
- return SRem;
-}
-
-
-/// Generate code to compute the remainder of two unsigned integers. Returns the
-/// remainder. Builder's insert point should be pointing where the caller wants
-/// code generated, e.g. at the urem instruction. This will generate a udiv in
-/// the process, and Builder's insert point will be pointing at the udiv (if
-/// present, i.e. not folded), ready to be expanded if the user wishes
-static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // Remainder = Dividend - Quotient*Divisor
-
- // Following instructions are generated for both i32 and i64
-
- // ; %quotient = udiv i32 %dividend, %divisor
- // ; %product = mul i32 %divisor, %quotient
- // ; %remainder = sub i32 %dividend, %product
- Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
- Value *Product = Builder.CreateMul(Divisor, Quotient);
- Value *Remainder = Builder.CreateSub(Dividend, Product);
-
- if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
- Builder.SetInsertPoint(UDiv);
-
- return Remainder;
-}
-
-/// Generate code to divide two signed integers. Returns the quotient, rounded
-/// towards 0. Builder's insert point should be pointing where the caller wants
-/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
-/// the process, and Builder's insert point will be pointing at the udiv (if
-/// present, i.e. not folded), ready to be expanded if the user wishes.
-static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // Implementation taken from compiler-rt's __divsi3 and __divdi3
-
- unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
- ConstantInt *Shift;
-
- if (BitWidth == 64) {
- Shift = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Shift = Builder.getInt32(31);
- }
-
- // Following instructions are generated for both i32 (shift 31) and
- // i64 (shift 63).
-
- // ; %tmp = ashr i32 %dividend, 31
- // ; %tmp1 = ashr i32 %divisor, 31
- // ; %tmp2 = xor i32 %tmp, %dividend
- // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
- // ; %tmp3 = xor i32 %tmp1, %divisor
- // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
- // ; %q_sgn = xor i32 %tmp1, %tmp
- // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
- // ; %tmp4 = xor i32 %q_mag, %q_sgn
- // ; %q = sub i32 %tmp4, %q_sgn
- Value *Tmp = Builder.CreateAShr(Dividend, Shift);
- Value *Tmp1 = Builder.CreateAShr(Divisor, Shift);
- Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
- Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
- Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
- Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
- Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
- Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
- Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
- Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
-
- if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
- Builder.SetInsertPoint(UDiv);
-
- return Q;
-}
-
-/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers.
-/// Returns the quotient, rounded towards 0. Builder's insert point should
-/// point where the caller wants code generated, e.g. at the udiv instruction.
-static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
- IRBuilder<> &Builder) {
- // The basic algorithm can be found in the compiler-rt project's
- // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
- // that's been hand-tuned to lessen the amount of control flow involved.
-
- // Some helper values
- IntegerType *DivTy = cast<IntegerType>(Dividend->getType());
- unsigned BitWidth = DivTy->getBitWidth();
-
- ConstantInt *Zero;
- ConstantInt *One;
- ConstantInt *NegOne;
- ConstantInt *MSB;
-
- if (BitWidth == 64) {
- Zero = Builder.getInt64(0);
- One = Builder.getInt64(1);
- NegOne = ConstantInt::getSigned(DivTy, -1);
- MSB = Builder.getInt64(63);
- } else {
- assert(BitWidth == 32 && "Unexpected bit width");
- Zero = Builder.getInt32(0);
- One = Builder.getInt32(1);
- NegOne = ConstantInt::getSigned(DivTy, -1);
- MSB = Builder.getInt32(31);
- }
-
- ConstantInt *True = Builder.getTrue();
-
- BasicBlock *IBB = Builder.GetInsertBlock();
- Function *F = IBB->getParent();
- Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
- DivTy);
-
- // Our CFG is going to look like:
- // +---------------------+
- // | special-cases |
- // | ... |
- // +---------------------+
- // | |
- // | +----------+
- // | | bb1 |
- // | | ... |
- // | +----------+
- // | | |
- // | | +------------+
- // | | | preheader |
- // | | | ... |
- // | | +------------+
- // | | |
- // | | | +---+
- // | | | | |
- // | | +------------+ |
- // | | | do-while | |
- // | | | ... | |
- // | | +------------+ |
- // | | | | |
- // | +-----------+ +---+
- // | | loop-exit |
- // | | ... |
- // | +-----------+
- // | |
- // +-------+
- // | ... |
- // | end |
- // +-------+
- BasicBlock *SpecialCases = Builder.GetInsertBlock();
- SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
- BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
- "udiv-end");
- BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
- "udiv-loop-exit", F, End);
- BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
- "udiv-do-while", F, End);
- BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
- "udiv-preheader", F, End);
- BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
- "udiv-bb1", F, End);
-
- // We'll be overwriting the terminator to insert our extra blocks
- SpecialCases->getTerminator()->eraseFromParent();
-
- // Same instructions are generated for both i32 (msb 31) and i64 (msb 63).
-
- // First off, check for special cases: dividend or divisor is zero, divisor
- // is greater than dividend, and divisor is 1.
- // ; special-cases:
- // ; %ret0_1 = icmp eq i32 %divisor, 0
- // ; %ret0_2 = icmp eq i32 %dividend, 0
- // ; %ret0_3 = or i1 %ret0_1, %ret0_2
- // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
- // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
- // ; %sr = sub nsw i32 %tmp0, %tmp1
- // ; %ret0_4 = icmp ugt i32 %sr, 31
- // ; %ret0 = or i1 %ret0_3, %ret0_4
- // ; %retDividend = icmp eq i32 %sr, 31
- // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
- // ; %earlyRet = or i1 %ret0, %retDividend
- // ; br i1 %earlyRet, label %end, label %bb1
- Builder.SetInsertPoint(SpecialCases);
- Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
- Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
- Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
- Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True});
- Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
- Value *SR = Builder.CreateSub(Tmp0, Tmp1);
- Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB);
- Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
- Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
- Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
- Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
- Builder.CreateCondBr(EarlyRet, End, BB1);
-
- // ; bb1: ; preds = %special-cases
- // ; %sr_1 = add i32 %sr, 1
- // ; %tmp2 = sub i32 31, %sr
- // ; %q = shl i32 %dividend, %tmp2
- // ; %skipLoop = icmp eq i32 %sr_1, 0
- // ; br i1 %skipLoop, label %loop-exit, label %preheader
- Builder.SetInsertPoint(BB1);
- Value *SR_1 = Builder.CreateAdd(SR, One);
- Value *Tmp2 = Builder.CreateSub(MSB, SR);
- Value *Q = Builder.CreateShl(Dividend, Tmp2);
- Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
- Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
-
- // ; preheader: ; preds = %bb1
- // ; %tmp3 = lshr i32 %dividend, %sr_1
- // ; %tmp4 = add i32 %divisor, -1
- // ; br label %do-while
- Builder.SetInsertPoint(Preheader);
- Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
- Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
- Builder.CreateBr(DoWhile);
-
- // ; do-while: ; preds = %do-while, %preheader
- // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
- // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
- // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
- // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
- // ; %tmp5 = shl i32 %r_1, 1
- // ; %tmp6 = lshr i32 %q_2, 31
- // ; %tmp7 = or i32 %tmp5, %tmp6
- // ; %tmp8 = shl i32 %q_2, 1
- // ; %q_1 = or i32 %carry_1, %tmp8
- // ; %tmp9 = sub i32 %tmp4, %tmp7
- // ; %tmp10 = ashr i32 %tmp9, 31
- // ; %carry = and i32 %tmp10, 1
- // ; %tmp11 = and i32 %tmp10, %divisor
- // ; %r = sub i32 %tmp7, %tmp11
- // ; %sr_2 = add i32 %sr_3, -1
- // ; %tmp12 = icmp eq i32 %sr_2, 0
- // ; br i1 %tmp12, label %loop-exit, label %do-while
- Builder.SetInsertPoint(DoWhile);
- PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2);
- PHINode *SR_3 = Builder.CreatePHI(DivTy, 2);
- PHINode *R_1 = Builder.CreatePHI(DivTy, 2);
- PHINode *Q_2 = Builder.CreatePHI(DivTy, 2);
- Value *Tmp5 = Builder.CreateShl(R_1, One);
- Value *Tmp6 = Builder.CreateLShr(Q_2, MSB);
- Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
- Value *Tmp8 = Builder.CreateShl(Q_2, One);
- Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
- Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
- Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB);
- Value *Carry = Builder.CreateAnd(Tmp10, One);
- Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
- Value *R = Builder.CreateSub(Tmp7, Tmp11);
- Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
- Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
- Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
-
- // ; loop-exit: ; preds = %do-while, %bb1
- // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
- // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
- // ; %tmp13 = shl i32 %q_3, 1
- // ; %q_4 = or i32 %carry_2, %tmp13
- // ; br label %end
- Builder.SetInsertPoint(LoopExit);
- PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2);
- PHINode *Q_3 = Builder.CreatePHI(DivTy, 2);
- Value *Tmp13 = Builder.CreateShl(Q_3, One);
- Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
- Builder.CreateBr(End);
-
- // ; end: ; preds = %loop-exit, %special-cases
- // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
- // ; ret i32 %q_5
- Builder.SetInsertPoint(End, End->begin());
- PHINode *Q_5 = Builder.CreatePHI(DivTy, 2);
-
- // Populate the Phis, since all values have now been created. Our Phis were:
- // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
- Carry_1->addIncoming(Zero, Preheader);
- Carry_1->addIncoming(Carry, DoWhile);
- // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
- SR_3->addIncoming(SR_1, Preheader);
- SR_3->addIncoming(SR_2, DoWhile);
- // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
- R_1->addIncoming(Tmp3, Preheader);
- R_1->addIncoming(R, DoWhile);
- // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
- Q_2->addIncoming(Q, Preheader);
- Q_2->addIncoming(Q_1, DoWhile);
- // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
- Carry_2->addIncoming(Zero, BB1);
- Carry_2->addIncoming(Carry, DoWhile);
- // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
- Q_3->addIncoming(Q, BB1);
- Q_3->addIncoming(Q_1, DoWhile);
- // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
- Q_5->addIncoming(Q_4, LoopExit);
- Q_5->addIncoming(RetVal, SpecialCases);
-
- return Q_5;
-}
-
-/// Generate code to calculate the remainder of two integers, replacing Rem with
-/// the generated code. This currently generates code using the udiv expansion,
-/// but future work includes generating more specialized code, e.g. when more
-/// information about the operands is known. Implements both 32bit and 64bit
-/// scalar division.
-///
-/// Replace Rem with generated code.
-bool llvm::expandRemainder(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- IRBuilder<> Builder(Rem);
-
- assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported");
- assert((Rem->getType()->getIntegerBitWidth() == 32 ||
- Rem->getType()->getIntegerBitWidth() == 64) &&
- "Div of bitwidth other than 32 or 64 not supported");
-
- // First prepare the sign if it's a signed remainder
- if (Rem->getOpcode() == Instruction::SRem) {
- Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
- Rem->getOperand(1), Builder);
-
- // Check whether this is the insert point while Rem is still valid.
- bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
- Rem->replaceAllUsesWith(Remainder);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- // If we didn't actually generate an urem instruction, we're done
- // This happens for example if the input were constant. In this case the
- // Builder insertion point was unchanged
- if (IsInsertPoint)
- return true;
-
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
- Rem = BO;
- }
-
- Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
- Rem->getOperand(1),
- Builder);
-
- Rem->replaceAllUsesWith(Remainder);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- // Expand the udiv
- if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
- assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
- expandDivision(UDiv);
- }
-
- return true;
-}
-
-
-/// Generate code to divide two integers, replacing Div with the generated
-/// code. This currently generates code similarly to compiler-rt's
-/// implementations, but future work includes generating more specialized code
-/// when more information about the operands is known. Implements both
-/// 32bit and 64bit scalar division.
-///
-/// Replace Div with generated code.
-bool llvm::expandDivision(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- IRBuilder<> Builder(Div);
-
- assert(!Div->getType()->isVectorTy() && "Div over vectors not supported");
- assert((Div->getType()->getIntegerBitWidth() == 32 ||
- Div->getType()->getIntegerBitWidth() == 64) &&
- "Div of bitwidth other than 32 or 64 not supported");
-
- // First prepare the sign if it's a signed division
- if (Div->getOpcode() == Instruction::SDiv) {
- // Lower the code to unsigned division, and reset Div to point to the udiv.
- Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
- Div->getOperand(1), Builder);
-
- // Check whether this is the insert point while Div is still valid.
- bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
- Div->replaceAllUsesWith(Quotient);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- // If we didn't actually generate an udiv instruction, we're done
- // This happens for example if the input were constant. In this case the
- // Builder insertion point was unchanged
- if (IsInsertPoint)
- return true;
-
- BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
- Div = BO;
- }
-
- // Insert the unsigned division code
- Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
- Div->getOperand(1),
- Builder);
- Div->replaceAllUsesWith(Quotient);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return true;
-}
-
-/// Generate code to compute the remainder of two integers of bitwidth up to
-/// 32 bits. Uses the above routines and extends the inputs/truncates the
-/// outputs to operate in 32 bits; that is, these routines are good for targets
-/// that have no or very little support for smaller than 32 bit integer
-/// arithmetic.
-///
-/// Replace Rem with emulation code.
-bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- Type *RemTy = Rem->getType();
- assert(!RemTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
-
- assert(RemTyBitWidth <= 32 &&
- "Div of bitwidth greater than 32 not supported");
-
- if (RemTyBitWidth == 32)
- return expandRemainder(Rem);
-
- // If bitwidth smaller than 32 extend inputs, extend output and proceed
- // with 32 bit division.
- IRBuilder<> Builder(Rem);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtRem;
- Value *Trunc;
- Type *Int32Ty = Builder.getInt32Ty();
-
- if (Rem->getOpcode() == Instruction::SRem) {
- ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
- ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
- ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtRem, RemTy);
-
- Rem->replaceAllUsesWith(Trunc);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- return expandRemainder(cast<BinaryOperator>(ExtRem));
-}
-
-/// Generate code to compute the remainder of two integers of bitwidth up to
-/// 64 bits. Uses the above routines and extends the inputs/truncates the
-/// outputs to operate in 64 bits.
-///
-/// Replace Rem with emulation code.
-bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
- assert((Rem->getOpcode() == Instruction::SRem ||
- Rem->getOpcode() == Instruction::URem) &&
- "Trying to expand remainder from a non-remainder function");
-
- Type *RemTy = Rem->getType();
- assert(!RemTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
-
- assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported");
-
- if (RemTyBitWidth == 64)
- return expandRemainder(Rem);
-
- // If bitwidth smaller than 64 extend inputs, extend output and proceed
- // with 64 bit division.
- IRBuilder<> Builder(Rem);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtRem;
- Value *Trunc;
- Type *Int64Ty = Builder.getInt64Ty();
-
- if (Rem->getOpcode() == Instruction::SRem) {
- ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty);
- ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty);
- ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtRem, RemTy);
-
- Rem->replaceAllUsesWith(Trunc);
- Rem->dropAllReferences();
- Rem->eraseFromParent();
-
- return expandRemainder(cast<BinaryOperator>(ExtRem));
-}
-
-/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
-/// above routines and extends the inputs/truncates the outputs to operate
-/// in 32 bits; that is, these routines are good for targets that have no
-/// or very little support for smaller than 32 bit integer arithmetic.
-///
-/// Replace Div with emulation code.
-bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- Type *DivTy = Div->getType();
- assert(!DivTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
-
- assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");
-
- if (DivTyBitWidth == 32)
- return expandDivision(Div);
-
- // If bitwidth smaller than 32 extend inputs, extend output and proceed
- // with 32 bit division.
- IRBuilder<> Builder(Div);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtDiv;
- Value *Trunc;
- Type *Int32Ty = Builder.getInt32Ty();
-
- if (Div->getOpcode() == Instruction::SDiv) {
- ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
- ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
- ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
- ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
-
- Div->replaceAllUsesWith(Trunc);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return expandDivision(cast<BinaryOperator>(ExtDiv));
-}
-
-/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the
-/// above routines and extends the inputs/truncates the outputs to operate
-/// in 64 bits.
-///
-/// Replace Div with emulation code.
-bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
- assert((Div->getOpcode() == Instruction::SDiv ||
- Div->getOpcode() == Instruction::UDiv) &&
- "Trying to expand division from a non-division function");
-
- Type *DivTy = Div->getType();
- assert(!DivTy->isVectorTy() && "Div over vectors not supported");
-
- unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
-
- assert(DivTyBitWidth <= 64 &&
- "Div of bitwidth greater than 64 not supported");
-
- if (DivTyBitWidth == 64)
- return expandDivision(Div);
-
- // If bitwidth smaller than 64 extend inputs, extend output and proceed
- // with 64 bit division.
- IRBuilder<> Builder(Div);
-
- Value *ExtDividend;
- Value *ExtDivisor;
- Value *ExtDiv;
- Value *Trunc;
- Type *Int64Ty = Builder.getInt64Ty();
-
- if (Div->getOpcode() == Instruction::SDiv) {
- ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty);
- ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
- } else {
- ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty);
- ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty);
- ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
- }
- Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
-
- Div->replaceAllUsesWith(Trunc);
- Div->dropAllReferences();
- Div->eraseFromParent();
-
- return expandDivision(cast<BinaryOperator>(ExtDiv));
-}
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32bit and 64bit scalar integer
+// division for targets that don't have native support. It's largely derived
+// from compiler-rt's implementations of __udivsi3 and __udivmoddi4,
+// but hand-tuned for targets that prefer less control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "integer-division"
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the urem (if present, i.e. not folded), ready to
+/// be expanded if the user wishes
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %dividend_sgn = ashr i32 %dividend, 31
+ // ; %divisor_sgn = ashr i32 %divisor, 31
+ // ; %dvd_xor = xor i32 %dividend, %dividend_sgn
+ // ; %dvs_xor = xor i32 %divisor, %divisor_sgn
+ // ; %u_dividend = sub i32 %dvd_xor, %dividend_sgn
+ // ; %u_divisor = sub i32 %dvs_xor, %divisor_sgn
+ // ; %urem = urem i32 %dividend, %divisor
+ // ; %xored = xor i32 %urem, %dividend_sgn
+ // ; %srem = sub i32 %xored, %dividend_sgn
+ Value *DividendSign = Builder.CreateAShr(Dividend, Shift);
+ Value *DivisorSign = Builder.CreateAShr(Divisor, Shift);
+ Value *DvdXor = Builder.CreateXor(Dividend, DividendSign);
+ Value *DvsXor = Builder.CreateXor(Divisor, DivisorSign);
+ Value *UDividend = Builder.CreateSub(DvdXor, DividendSign);
+ Value *UDivisor = Builder.CreateSub(DvsXor, DivisorSign);
+ Value *URem = Builder.CreateURem(UDividend, UDivisor);
+ Value *Xored = Builder.CreateXor(URem, DividendSign);
+ Value *SRem = Builder.CreateSub(Xored, DividendSign);
+
+ if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+ Builder.SetInsertPoint(URemInst);
+
+ return SRem;
+}
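+
+// A worked i32 example of the sequence above (purely illustrative; nothing
+// below is emitted by this routine): for dividend = -7, divisor = 3,
+// ;   %dividend_sgn = -1,  %divisor_sgn = 0
+// ;   %u_dividend   = (-7 xor -1) - (-1) = 7,  %u_divisor = 3
+// ;   %urem         = 7 urem 3 = 1
+// ;   %srem         = (1 xor -1) - (-1) = -1
+// so -7 srem 3 == -1, carrying the sign of the dividend as documented.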
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Remainder = Dividend - Quotient*Divisor
+
+ // Following instructions are generated for both i32 and i64
+
+ // ; %quotient = udiv i32 %dividend, %divisor
+ // ; %product = mul i32 %divisor, %quotient
+ // ; %remainder = sub i32 %dividend, %product
+ Value *Quotient = Builder.CreateUDiv(Dividend, Divisor);
+ Value *Product = Builder.CreateMul(Divisor, Quotient);
+ Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+ Builder.SetInsertPoint(UDiv);
+
+ return Remainder;
+}
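+
+// For example (illustrative only): with dividend = 29 and divisor = 5 the
+// sequence above computes %quotient = 5, %product = 25 and %remainder = 4,
+// i.e. 29 urem 5 == 4; only the udiv is left for the caller to expand.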
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Implementation taken from compiler-rt's __divsi3 and __divdi3
+
+ unsigned BitWidth = Dividend->getType()->getIntegerBitWidth();
+ ConstantInt *Shift;
+
+ if (BitWidth == 64) {
+ Shift = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Shift = Builder.getInt32(31);
+ }
+
+ // Following instructions are generated for both i32 (shift 31) and
+ // i64 (shift 63).
+
+ // ; %tmp = ashr i32 %dividend, 31
+ // ; %tmp1 = ashr i32 %divisor, 31
+ // ; %tmp2 = xor i32 %tmp, %dividend
+ // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
+ // ; %tmp3 = xor i32 %tmp1, %divisor
+ // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
+ // ; %q_sgn = xor i32 %tmp1, %tmp
+ // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
+ // ; %tmp4 = xor i32 %q_mag, %q_sgn
+ // ; %q = sub i32 %tmp4, %q_sgn
+ Value *Tmp = Builder.CreateAShr(Dividend, Shift);
+ Value *Tmp1 = Builder.CreateAShr(Divisor, Shift);
+ Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
+ Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+ Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
+ Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+ Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
+ Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+ Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
+ Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+ Builder.SetInsertPoint(UDiv);
+
+ return Q;
+}
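+
+// A worked i32 example of the sign handling above (purely illustrative): for
+// dividend = -7, divisor = 3,
+// ;   %tmp = -1,  %tmp1 = 0
+// ;   %u_dvnd = 7,  %u_dvsr = 3,  %q_sgn = -1
+// ;   %q_mag  = 7 udiv 3 = 2
+// ;   %q      = (2 xor -1) - (-1) = -2
+// so -7 sdiv 3 == -2, rounded towards zero.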
+
+/// Generates code to divide two unsigned scalar 32-bit or 64-bit integers.
+/// Returns the quotient, rounded towards 0. Builder's insert point should
+/// point where the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // The basic algorithm can be found in the compiler-rt project's
+ // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+ // that's been hand-tuned to lessen the amount of control flow involved.
+
+ // Some helper values
+ IntegerType *DivTy = cast<IntegerType>(Dividend->getType());
+ unsigned BitWidth = DivTy->getBitWidth();
+
+ ConstantInt *Zero;
+ ConstantInt *One;
+ ConstantInt *NegOne;
+ ConstantInt *MSB;
+
+ if (BitWidth == 64) {
+ Zero = Builder.getInt64(0);
+ One = Builder.getInt64(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt64(63);
+ } else {
+ assert(BitWidth == 32 && "Unexpected bit width");
+ Zero = Builder.getInt32(0);
+ One = Builder.getInt32(1);
+ NegOne = ConstantInt::getSigned(DivTy, -1);
+ MSB = Builder.getInt32(31);
+ }
+
+ ConstantInt *True = Builder.getTrue();
+
+ BasicBlock *IBB = Builder.GetInsertBlock();
+ Function *F = IBB->getParent();
+ Function *CTLZ = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ DivTy);
+
+ // Our CFG is going to look like:
+ // +---------------------+
+ // | special-cases |
+ // | ... |
+ // +---------------------+
+ // | |
+ // | +----------+
+ // | | bb1 |
+ // | | ... |
+ // | +----------+
+ // | | |
+ // | | +------------+
+ // | | | preheader |
+ // | | | ... |
+ // | | +------------+
+ // | | |
+ // | | | +---+
+ // | | | | |
+ // | | +------------+ |
+ // | | | do-while | |
+ // | | | ... | |
+ // | | +------------+ |
+ // | | | | |
+ // | +-----------+ +---+
+ // | | loop-exit |
+ // | | ... |
+ // | +-----------+
+ // | |
+ // +-------+
+ // | ... |
+ // | end |
+ // +-------+
+ BasicBlock *SpecialCases = Builder.GetInsertBlock();
+ SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+ BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+ "udiv-end");
+ BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
+ "udiv-loop-exit", F, End);
+ BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
+ "udiv-do-while", F, End);
+ BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+ "udiv-preheader", F, End);
+ BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
+ "udiv-bb1", F, End);
+
+ // We'll be overwriting the terminator to insert our extra blocks
+ SpecialCases->getTerminator()->eraseFromParent();
+
+ // Same instructions are generated for both i32 (msb 31) and i64 (msb 63).
+
+ // First off, check for special cases: dividend or divisor is zero, divisor
+ // is greater than dividend, and divisor is 1.
+ // ; special-cases:
+ // ; %ret0_1 = icmp eq i32 %divisor, 0
+ // ; %ret0_2 = icmp eq i32 %dividend, 0
+ // ; %ret0_3 = or i1 %ret0_1, %ret0_2
+ // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+ // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+ // ; %sr = sub nsw i32 %tmp0, %tmp1
+ // ; %ret0_4 = icmp ugt i32 %sr, 31
+ // ; %ret0 = or i1 %ret0_3, %ret0_4
+ // ; %retDividend = icmp eq i32 %sr, 31
+ // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
+ // ; %earlyRet = or i1 %ret0, %retDividend
+ // ; br i1 %earlyRet, label %end, label %bb1
+ Builder.SetInsertPoint(SpecialCases);
+ Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
+ Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
+ Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
+ Value *Tmp0 = Builder.CreateCall(CTLZ, {Divisor, True});
+ Value *Tmp1 = Builder.CreateCall(CTLZ, {Dividend, True});
+ Value *SR = Builder.CreateSub(Tmp0, Tmp1);
+ Value *Ret0_4 = Builder.CreateICmpUGT(SR, MSB);
+ Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
+ Value *RetDividend = Builder.CreateICmpEQ(SR, MSB);
+ Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
+ Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
+ Builder.CreateCondBr(EarlyRet, End, BB1);
+
+ // ; bb1: ; preds = %special-cases
+ // ; %sr_1 = add i32 %sr, 1
+ // ; %tmp2 = sub i32 31, %sr
+ // ; %q = shl i32 %dividend, %tmp2
+ // ; %skipLoop = icmp eq i32 %sr_1, 0
+ // ; br i1 %skipLoop, label %loop-exit, label %preheader
+ Builder.SetInsertPoint(BB1);
+ Value *SR_1 = Builder.CreateAdd(SR, One);
+ Value *Tmp2 = Builder.CreateSub(MSB, SR);
+ Value *Q = Builder.CreateShl(Dividend, Tmp2);
+ Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+ Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+ // ; preheader: ; preds = %bb1
+ // ; %tmp3 = lshr i32 %dividend, %sr_1
+ // ; %tmp4 = add i32 %divisor, -1
+ // ; br label %do-while
+ Builder.SetInsertPoint(Preheader);
+ Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+ Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+ Builder.CreateBr(DoWhile);
+
+ // ; do-while: ; preds = %do-while, %preheader
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ // ; %tmp5 = shl i32 %r_1, 1
+ // ; %tmp6 = lshr i32 %q_2, 31
+ // ; %tmp7 = or i32 %tmp5, %tmp6
+ // ; %tmp8 = shl i32 %q_2, 1
+ // ; %q_1 = or i32 %carry_1, %tmp8
+ // ; %tmp9 = sub i32 %tmp4, %tmp7
+ // ; %tmp10 = ashr i32 %tmp9, 31
+ // ; %carry = and i32 %tmp10, 1
+ // ; %tmp11 = and i32 %tmp10, %divisor
+ // ; %r = sub i32 %tmp7, %tmp11
+ // ; %sr_2 = add i32 %sr_3, -1
+ // ; %tmp12 = icmp eq i32 %sr_2, 0
+ // ; br i1 %tmp12, label %loop-exit, label %do-while
+ Builder.SetInsertPoint(DoWhile);
+ PHINode *Carry_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *SR_3 = Builder.CreatePHI(DivTy, 2);
+ PHINode *R_1 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_2 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp5 = Builder.CreateShl(R_1, One);
+ Value *Tmp6 = Builder.CreateLShr(Q_2, MSB);
+ Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
+ Value *Tmp8 = Builder.CreateShl(Q_2, One);
+ Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
+ Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
+ Value *Tmp10 = Builder.CreateAShr(Tmp9, MSB);
+ Value *Carry = Builder.CreateAnd(Tmp10, One);
+ Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+ Value *R = Builder.CreateSub(Tmp7, Tmp11);
+ Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
+ Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+ Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+ // ; loop-exit: ; preds = %do-while, %bb1
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ // ; %tmp13 = shl i32 %q_3, 1
+ // ; %q_4 = or i32 %carry_2, %tmp13
+ // ; br label %end
+ Builder.SetInsertPoint(LoopExit);
+ PHINode *Carry_2 = Builder.CreatePHI(DivTy, 2);
+ PHINode *Q_3 = Builder.CreatePHI(DivTy, 2);
+ Value *Tmp13 = Builder.CreateShl(Q_3, One);
+ Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
+ Builder.CreateBr(End);
+
+ // ; end: ; preds = %loop-exit, %special-cases
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ // ; ret i32 %q_5
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Q_5 = Builder.CreatePHI(DivTy, 2);
+
+ // Populate the Phis, since all values have now been created. Our Phis were:
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ Carry_1->addIncoming(Zero, Preheader);
+ Carry_1->addIncoming(Carry, DoWhile);
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ SR_3->addIncoming(SR_1, Preheader);
+ SR_3->addIncoming(SR_2, DoWhile);
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ R_1->addIncoming(Tmp3, Preheader);
+ R_1->addIncoming(R, DoWhile);
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ Q_2->addIncoming(Q, Preheader);
+ Q_2->addIncoming(Q_1, DoWhile);
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ Carry_2->addIncoming(Zero, BB1);
+ Carry_2->addIncoming(Carry, DoWhile);
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ Q_3->addIncoming(Q, BB1);
+ Q_3->addIncoming(Q_1, DoWhile);
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ Q_5->addIncoming(Q_4, LoopExit);
+ Q_5->addIncoming(RetVal, SpecialCases);
+
+ return Q_5;
+}
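+
+// The control flow built above is easier to follow as a plain C sketch of the
+// same shift-and-subtract algorithm (a hand-written illustration of the i32
+// case, not code taken from compiler-rt or from this file):
+//
+//   uint32_t udiv32_sketch(uint32_t n, uint32_t d) {
+//     if (n == 0 || d == 0) return 0;          // special-cases block
+//     uint32_t sr = __builtin_clz(d) - __builtin_clz(n);
+//     if (sr > 31) return 0;                   // d > n: unsigned wrap made sr huge
+//     if (sr == 31) return n;                  // d == 1 and n uses bit 31
+//     ++sr;                                    // bb1 / preheader
+//     uint32_t q = n << (32 - sr), r = n >> sr, carry = 0;
+//     for (uint32_t i = sr; i != 0; --i) {     // do-while block
+//       r = (r << 1) | (q >> 31);              // shift the r:q pair left by one
+//       q = (q << 1) | carry;
+//       carry = (r >= d);                      // quotient bit for this step
+//       if (carry) r -= d;
+//     }
+//     return (q << 1) | carry;                 // loop-exit block
+//   }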
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands is known. Implements both 32bit and 64bit
+/// scalar division.
+///
+/// Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ IRBuilder<> Builder(Rem);
+
+ assert(!Rem->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Rem->getType()->getIntegerBitWidth() == 32 ||
+ Rem->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed remainder
+ if (Rem->getOpcode() == Instruction::SRem) {
+ Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Rem is still valid.
+ bool IsInsertPoint = Rem->getIterator() == Builder.GetInsertPoint();
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // If we didn't actually generate an urem instruction, we're done
+ // This happens for example if the input were constant. In this case the
+ // Builder insertion point was unchanged
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Rem = BO;
+ }
+
+ Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+ Rem->getOperand(1),
+ Builder);
+
+ Rem->replaceAllUsesWith(Remainder);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ // Expand the udiv
+ if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+ assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+ expandDivision(UDiv);
+ }
+
+ return true;
+}
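+
+// A minimal usage sketch for a hypothetical caller (assumes a Function &F whose
+// remainder instructions are all scalar i32/i64, and llvm/IR/InstIterator.h for
+// instructions()):
+//
+//   SmallVector<BinaryOperator *, 8> Rems;
+//   for (Instruction &I : instructions(F))
+//     if (auto *BO = dyn_cast<BinaryOperator>(&I))
+//       if (BO->getOpcode() == Instruction::SRem ||
+//           BO->getOpcode() == Instruction::URem)
+//         Rems.push_back(BO);        // collect first; expansion erases the rem
+//   for (BinaryOperator *Rem : Rems)
+//     expandRemainder(Rem);          // Rem is replaced and deleted here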
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similarly to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands is known. Implements both
+/// 32bit and 64bit scalar division.
+///
+/// Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ IRBuilder<> Builder(Div);
+
+ assert(!Div->getType()->isVectorTy() && "Div over vectors not supported");
+ assert((Div->getType()->getIntegerBitWidth() == 32 ||
+ Div->getType()->getIntegerBitWidth() == 64) &&
+ "Div of bitwidth other than 32 or 64 not supported");
+
+ // First prepare the sign if it's a signed division
+ if (Div->getOpcode() == Instruction::SDiv) {
+ // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1), Builder);
+
+ // Check whether this is the insert point while Div is still valid.
+ bool IsInsertPoint = Div->getIterator() == Builder.GetInsertPoint();
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ // If we didn't actually generate an udiv instruction, we're done
+ // This happens for example if the input were constant. In this case the
+ // Builder insertion point was unchanged
+ if (IsInsertPoint)
+ return true;
+
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ Div = BO;
+ }
+
+ // Insert the unsigned division code
+ Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1),
+ Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return true;
+}
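+
+// For instance (illustrative only), a lone
+// ;   %q = sdiv i32 %a, %b
+// is rewritten into the sign-stripping sequence from generateSignedDivisionCode
+// followed by the full shift-and-subtract expansion of the resulting udiv, and
+// the original sdiv is erased.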
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 32 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 32 bits; that is, these routines are good for targets
+/// that have no or very little support for smaller than 32 bit integer
+/// arithmetic.
+///
+/// Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo32Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 32 &&
+ "Div of bitwidth greater than 32 not supported");
+
+ if (RemTyBitWidth == 32)
+ return expandRemainder(Rem);
+
+ // If bitwidth smaller than 32 extend inputs, extend output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int32Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
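+
+// For example (illustrative only, with hand-picked names; the builder chooses
+// its own), an i8 remainder
+// ;   %r = urem i8 %a, %b
+// becomes
+// ;   %a32 = zext i8 %a to i32
+// ;   %b32 = zext i8 %b to i32
+// ;   %r32 = urem i32 %a32, %b32    ; subsequently expanded by expandRemainder
+// ;   %r8  = trunc i32 %r32 to i8
+// with all uses of the original %r rewritten to the truncated value.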
+
+/// Generate code to compute the remainder of two integers of bitwidth up to
+/// 64 bits. Uses the above routines and extends the inputs/truncates the
+/// outputs to operate in 64 bits.
+///
+/// Replace Rem with emulation code.
+bool llvm::expandRemainderUpTo64Bits(BinaryOperator *Rem) {
+ assert((Rem->getOpcode() == Instruction::SRem ||
+ Rem->getOpcode() == Instruction::URem) &&
+ "Trying to expand remainder from a non-remainder function");
+
+ Type *RemTy = Rem->getType();
+ assert(!RemTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned RemTyBitWidth = RemTy->getIntegerBitWidth();
+
+ assert(RemTyBitWidth <= 64 && "Div of bitwidth greater than 64 not supported");
+
+ if (RemTyBitWidth == 64)
+ return expandRemainder(Rem);
+
+ // If bitwidth smaller than 64 extend inputs, extend output and proceed
+ // with 64 bit division.
+ IRBuilder<> Builder(Rem);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtRem;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Rem->getOpcode() == Instruction::SRem) {
+ ExtDividend = Builder.CreateSExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateSRem(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Rem->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Rem->getOperand(1), Int64Ty);
+ ExtRem = Builder.CreateURem(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtRem, RemTy);
+
+ Rem->replaceAllUsesWith(Trunc);
+ Rem->dropAllReferences();
+ Rem->eraseFromParent();
+
+ return expandRemainder(cast<BinaryOperator>(ExtRem));
+}
+
+/// Generate code to divide two integers of bitwidth up to 32 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 32 bits; that is, these routines are good for targets that have no
+/// or very little support for smaller than 32 bit integer arithmetic.
+///
+/// Replace Div with emulation code.
+bool llvm::expandDivisionUpTo32Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 32 && "Div of bitwidth greater than 32 not supported");
+
+ if (DivTyBitWidth == 32)
+ return expandDivision(Div);
+
+ // If bitwidth smaller than 32 extend inputs, extend output and proceed
+ // with 32 bit division.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int32Ty = Builder.getInt32Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int32Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int32Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
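+
+// For example (illustrative only, with hand-picked names), an i16 division
+// ;   %q = sdiv i16 %a, %b
+// becomes
+// ;   %a32 = sext i16 %a to i32
+// ;   %b32 = sext i16 %b to i32
+// ;   %q32 = sdiv i32 %a32, %b32    ; subsequently expanded by expandDivision
+// ;   %q16 = trunc i32 %q32 to i16
+// with all uses of the original %q rewritten to the truncated value.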
+
+/// Generate code to divide two integers of bitwidth up to 64 bits. Uses the
+/// above routines and extends the inputs/truncates the outputs to operate
+/// in 64 bits.
+///
+/// Replace Div with emulation code.
+bool llvm::expandDivisionUpTo64Bits(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ Type *DivTy = Div->getType();
+ assert(!DivTy->isVectorTy() && "Div over vectors not supported");
+
+ unsigned DivTyBitWidth = DivTy->getIntegerBitWidth();
+
+ assert(DivTyBitWidth <= 64 &&
+ "Div of bitwidth greater than 64 not supported");
+
+ if (DivTyBitWidth == 64)
+ return expandDivision(Div);
+
+ // If bitwidth smaller than 64 extend inputs, extend output and proceed
+ // with 64 bit division.
+ IRBuilder<> Builder(Div);
+
+ Value *ExtDividend;
+ Value *ExtDivisor;
+ Value *ExtDiv;
+ Value *Trunc;
+ Type *Int64Ty = Builder.getInt64Ty();
+
+ if (Div->getOpcode() == Instruction::SDiv) {
+ ExtDividend = Builder.CreateSExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateSExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateSDiv(ExtDividend, ExtDivisor);
+ } else {
+ ExtDividend = Builder.CreateZExt(Div->getOperand(0), Int64Ty);
+ ExtDivisor = Builder.CreateZExt(Div->getOperand(1), Int64Ty);
+ ExtDiv = Builder.CreateUDiv(ExtDividend, ExtDivisor);
+ }
+ Trunc = Builder.CreateTrunc(ExtDiv, DivTy);
+
+ Div->replaceAllUsesWith(Trunc);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return expandDivision(cast<BinaryOperator>(ExtDiv));
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
index c632f11e46..7437701f53 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LCSSA.cpp
@@ -1,277 +1,277 @@
-//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass transforms loops by placing phi nodes at the end of the loops for
-// all values that are live across the loop boundary. For example, it turns
-// the left into the right code:
-//
-// for (...) for (...)
-// if (c) if (c)
-// X1 = ... X1 = ...
-// else else
-// X2 = ... X2 = ...
-// X3 = phi(X1, X2) X3 = phi(X1, X2)
-// ... = X3 + 4 X4 = phi(X3)
-// ... = X4 + 4
-//
-// This is still valid LLVM; the extra phi nodes are purely redundant, and will
-// be trivially eliminated by InstCombine. The major benefit of this
-// transformation is that it makes many other loop optimizations, such as
-// LoopUnswitching, simpler.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LCSSA.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
+//===-- LCSSA.cpp - Convert loops into loop-closed SSA form ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops by placing phi nodes at the end of the loops for
+// all values that are live across the loop boundary. For example, it turns
+// the left into the right code:
+//
+// for (...) for (...)
+// if (c) if (c)
+// X1 = ... X1 = ...
+// else else
+// X2 = ... X2 = ...
+// X3 = phi(X1, X2) X3 = phi(X1, X2)
+// ... = X3 + 4 X4 = phi(X3)
+// ... = X4 + 4
+//
+// This is still valid LLVM; the extra phi nodes are purely redundant, and will
+// be trivially eliminated by InstCombine. The major benefit of this
+// transformation is that it makes many other loop optimizations, such as
+// LoopUnswitching, simpler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LCSSA.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PredIteratorCache.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lcssa"
-
-STATISTIC(NumLCSSA, "Number of live out of a loop variables");
-
-#ifdef EXPENSIVE_CHECKS
-static bool VerifyLoopLCSSA = true;
-#else
-static bool VerifyLoopLCSSA = false;
-#endif
-static cl::opt<bool, true>
- VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
- cl::Hidden,
- cl::desc("Verify loop lcssa form (time consuming)"));
-
-/// Return true if the specified block is in the list.
-static bool isExitBlock(BasicBlock *BB,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- return is_contained(ExitBlocks, BB);
-}
-
-/// For every instruction from the worklist, check to see if it has any uses
-/// that are outside the current loop. If so, insert LCSSA PHI nodes and
-/// rewrite the uses.
-bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
- const DominatorTree &DT, const LoopInfo &LI,
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PredIteratorCache.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lcssa"
+
+STATISTIC(NumLCSSA, "Number of live out of a loop variables");
+
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyLoopLCSSA = true;
+#else
+static bool VerifyLoopLCSSA = false;
+#endif
+static cl::opt<bool, true>
+ VerifyLoopLCSSAFlag("verify-loop-lcssa", cl::location(VerifyLoopLCSSA),
+ cl::Hidden,
+ cl::desc("Verify loop lcssa form (time consuming)"));
+
+/// Return true if the specified block is in the list.
+static bool isExitBlock(BasicBlock *BB,
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
+ return is_contained(ExitBlocks, BB);
+}
+
+/// For every instruction from the worklist, check to see if it has any uses
+/// that are outside the current loop. If so, insert LCSSA PHI nodes and
+/// rewrite the uses.
+bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
+ const DominatorTree &DT, const LoopInfo &LI,
ScalarEvolution *SE, IRBuilderBase &Builder,
SmallVectorImpl<PHINode *> *PHIsToRemove) {
- SmallVector<Use *, 16> UsesToRewrite;
+ SmallVector<Use *, 16> UsesToRewrite;
SmallSetVector<PHINode *, 16> LocalPHIsToRemove;
- PredIteratorCache PredCache;
- bool Changed = false;
-
+ PredIteratorCache PredCache;
+ bool Changed = false;
+
IRBuilderBase::InsertPointGuard InsertPtGuard(Builder);
- // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
- // instructions within the same loops, computing the exit blocks is
- // expensive, and we're not mutating the loop structure.
- SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
-
- while (!Worklist.empty()) {
- UsesToRewrite.clear();
-
- Instruction *I = Worklist.pop_back_val();
- assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
- BasicBlock *InstBB = I->getParent();
- Loop *L = LI.getLoopFor(InstBB);
- assert(L && "Instruction belongs to a BB that's not part of a loop");
- if (!LoopExitBlocks.count(L))
- L->getExitBlocks(LoopExitBlocks[L]);
- assert(LoopExitBlocks.count(L));
- const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
-
- if (ExitBlocks.empty())
- continue;
-
- for (Use &U : I->uses()) {
- Instruction *User = cast<Instruction>(U.getUser());
- BasicBlock *UserBB = User->getParent();
+ // Cache the Loop ExitBlocks across this loop. We expect to get a lot of
+ // instructions within the same loops, computing the exit blocks is
+ // expensive, and we're not mutating the loop structure.
+ SmallDenseMap<Loop*, SmallVector<BasicBlock *,1>> LoopExitBlocks;
+
+ while (!Worklist.empty()) {
+ UsesToRewrite.clear();
+
+ Instruction *I = Worklist.pop_back_val();
+ assert(!I->getType()->isTokenTy() && "Tokens shouldn't be in the worklist");
+ BasicBlock *InstBB = I->getParent();
+ Loop *L = LI.getLoopFor(InstBB);
+ assert(L && "Instruction belongs to a BB that's not part of a loop");
+ if (!LoopExitBlocks.count(L))
+ L->getExitBlocks(LoopExitBlocks[L]);
+ assert(LoopExitBlocks.count(L));
+ const SmallVectorImpl<BasicBlock *> &ExitBlocks = LoopExitBlocks[L];
+
+ if (ExitBlocks.empty())
+ continue;
+
+ for (Use &U : I->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = User->getParent();
// For practical purposes, we consider that the use in a PHI
// occurs in the respective predecessor block. For more info,
// see the `phi` doc in LangRef and the LCSSA doc.
- if (auto *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(U);
-
- if (InstBB != UserBB && !L->contains(UserBB))
- UsesToRewrite.push_back(&U);
- }
-
- // If there are no uses outside the loop, exit with no change.
- if (UsesToRewrite.empty())
- continue;
-
- ++NumLCSSA; // We are applying the transformation
-
- // Invoke instructions are special in that their result value is not
- // available along their unwind edge. The code below tests to see whether
- // DomBB dominates the value, so adjust DomBB to the normal destination
- // block, which is effectively where the value is first usable.
- BasicBlock *DomBB = InstBB;
- if (auto *Inv = dyn_cast<InvokeInst>(I))
- DomBB = Inv->getNormalDest();
-
- const DomTreeNode *DomNode = DT.getNode(DomBB);
-
- SmallVector<PHINode *, 16> AddedPHIs;
- SmallVector<PHINode *, 8> PostProcessPHIs;
-
- SmallVector<PHINode *, 4> InsertedPHIs;
- SSAUpdater SSAUpdate(&InsertedPHIs);
- SSAUpdate.Initialize(I->getType(), I->getName());
-
- // Force re-computation of I, as some users now need to use the new PHI
- // node.
- if (SE)
- SE->forgetValue(I);
-
- // Insert the LCSSA phi's into all of the exit blocks dominated by the
- // value, and add them to the Phi's map.
- for (BasicBlock *ExitBB : ExitBlocks) {
- if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
- continue;
-
- // If we already inserted something for this BB, don't reprocess it.
- if (SSAUpdate.HasValueForBlock(ExitBB))
- continue;
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(U);
+
+ if (InstBB != UserBB && !L->contains(UserBB))
+ UsesToRewrite.push_back(&U);
+ }
+
+ // If there are no uses outside the loop, exit with no change.
+ if (UsesToRewrite.empty())
+ continue;
+
+ ++NumLCSSA; // We are applying the transformation
+
+ // Invoke instructions are special in that their result value is not
+ // available along their unwind edge. The code below tests to see whether
+ // DomBB dominates the value, so adjust DomBB to the normal destination
+ // block, which is effectively where the value is first usable.
+ BasicBlock *DomBB = InstBB;
+ if (auto *Inv = dyn_cast<InvokeInst>(I))
+ DomBB = Inv->getNormalDest();
+
+ const DomTreeNode *DomNode = DT.getNode(DomBB);
+
+ SmallVector<PHINode *, 16> AddedPHIs;
+ SmallVector<PHINode *, 8> PostProcessPHIs;
+
+ SmallVector<PHINode *, 4> InsertedPHIs;
+ SSAUpdater SSAUpdate(&InsertedPHIs);
+ SSAUpdate.Initialize(I->getType(), I->getName());
+
+ // Force re-computation of I, as some users now need to use the new PHI
+ // node.
+ if (SE)
+ SE->forgetValue(I);
+
+ // Insert the LCSSA phi's into all of the exit blocks dominated by the
+ // value, and add them to the Phi's map.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ if (!DT.dominates(DomNode, DT.getNode(ExitBB)))
+ continue;
+
+ // If we already inserted something for this BB, don't reprocess it.
+ if (SSAUpdate.HasValueForBlock(ExitBB))
+ continue;
Builder.SetInsertPoint(&ExitBB->front());
PHINode *PN = Builder.CreatePHI(I->getType(), PredCache.size(ExitBB),
I->getName() + ".lcssa");
- // Get the debug location from the original instruction.
- PN->setDebugLoc(I->getDebugLoc());
+ // Get the debug location from the original instruction.
+ PN->setDebugLoc(I->getDebugLoc());
// Add inputs from inside the loop for this PHI. This is valid
// because `I` dominates `ExitBB` (checked above). This implies
// that every incoming block/edge is dominated by `I` as well,
// i.e. we can add uses of `I` to those incoming edges/append to the incoming
// blocks without violating the SSA dominance property.
- for (BasicBlock *Pred : PredCache.get(ExitBB)) {
- PN->addIncoming(I, Pred);
-
- // If the exit block has a predecessor not within the loop, arrange for
- // the incoming value use corresponding to that predecessor to be
- // rewritten in terms of a different LCSSA PHI.
- if (!L->contains(Pred))
- UsesToRewrite.push_back(
- &PN->getOperandUse(PN->getOperandNumForIncomingValue(
- PN->getNumIncomingValues() - 1)));
- }
-
- AddedPHIs.push_back(PN);
-
- // Remember that this phi makes the value alive in this block.
- SSAUpdate.AddAvailableValue(ExitBB, PN);
-
- // LoopSimplify might fail to simplify some loops (e.g. when indirect
- // branches are involved). In such situations, it might happen that an
- // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
- // create PHIs in such an exit block, we are also inserting PHIs into L2's
- // can also have uses outside of L2. Remember all PHIs in such situations
- // so we can revisit them later on. FIXME: Remove this if indirectbr support
- // in LoopSimplify gets improved.
- // into LoopSimplify gets improved.
- if (auto *OtherLoop = LI.getLoopFor(ExitBB))
- if (!L->contains(OtherLoop))
- PostProcessPHIs.push_back(PN);
- }
-
- // Rewrite all uses outside the loop in terms of the new PHIs we just
- // inserted.
- for (Use *UseToRewrite : UsesToRewrite) {
- Instruction *User = cast<Instruction>(UseToRewrite->getUser());
- BasicBlock *UserBB = User->getParent();
+ for (BasicBlock *Pred : PredCache.get(ExitBB)) {
+ PN->addIncoming(I, Pred);
+
+ // If the exit block has a predecessor not within the loop, arrange for
+ // the incoming value use corresponding to that predecessor to be
+ // rewritten in terms of a different LCSSA PHI.
+ if (!L->contains(Pred))
+ UsesToRewrite.push_back(
+ &PN->getOperandUse(PN->getOperandNumForIncomingValue(
+ PN->getNumIncomingValues() - 1)));
+ }
+
+ AddedPHIs.push_back(PN);
+
+ // Remember that this phi makes the value alive in this block.
+ SSAUpdate.AddAvailableValue(ExitBB, PN);
+
+ // LoopSimplify might fail to simplify some loops (e.g. when indirect
+ // branches are involved). In such situations, it might happen that an
+ // exit for Loop L1 is the header of a disjoint Loop L2. Thus, when we
+ // create PHIs in such an exit block, we are also inserting PHIs into L2's
+ // can also have uses outside of L2. Remember all PHIs in such situations
+ // so we can revisit them later on. FIXME: Remove this if indirectbr support
+ // in LoopSimplify gets improved.
+ // into LoopSimplify gets improved.
+ if (auto *OtherLoop = LI.getLoopFor(ExitBB))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(PN);
+ }
+
+ // Rewrite all uses outside the loop in terms of the new PHIs we just
+ // inserted.
+ for (Use *UseToRewrite : UsesToRewrite) {
+ Instruction *User = cast<Instruction>(UseToRewrite->getUser());
+ BasicBlock *UserBB = User->getParent();
// For practical purposes, we consider that the use in a PHI
// occurs in the respective predecessor block. For more info,
// see the `phi` doc in LangRef and the LCSSA doc.
- if (auto *PN = dyn_cast<PHINode>(User))
- UserBB = PN->getIncomingBlock(*UseToRewrite);
-
+ if (auto *PN = dyn_cast<PHINode>(User))
+ UserBB = PN->getIncomingBlock(*UseToRewrite);
+
// If this use is in an exit block, rewrite to use the newly inserted PHI.
// This is required for correctness because SSAUpdate doesn't handle uses
// in the same block. It assumes the PHI we inserted is at the end of the
// block.
- if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
- UseToRewrite->set(&UserBB->front());
- continue;
- }
-
- // If we added a single PHI, it must dominate all uses and we can directly
- // rename it.
- if (AddedPHIs.size() == 1) {
- UseToRewrite->set(AddedPHIs[0]);
- continue;
- }
-
- // Otherwise, do full PHI insertion.
- SSAUpdate.RewriteUse(*UseToRewrite);
- }
-
- SmallVector<DbgValueInst *, 4> DbgValues;
- llvm::findDbgValues(DbgValues, I);
-
- // Update pre-existing debug value uses that reside outside the loop.
- auto &Ctx = I->getContext();
- for (auto DVI : DbgValues) {
- BasicBlock *UserBB = DVI->getParent();
- if (InstBB == UserBB || L->contains(UserBB))
- continue;
- // We currently only handle debug values residing in blocks that were
- // traversed while rewriting the uses. If we inserted just a single PHI,
- // we will handle all relevant debug values.
- Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
- : SSAUpdate.FindValueForBlock(UserBB);
- if (V)
- DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
- }
-
- // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
- // to post-process them to keep LCSSA form.
- for (PHINode *InsertedPN : InsertedPHIs) {
- if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
- if (!L->contains(OtherLoop))
- PostProcessPHIs.push_back(InsertedPN);
- }
-
- // Post process PHI instructions that were inserted into another disjoint
- // loop and update their exits properly.
- for (auto *PostProcessPN : PostProcessPHIs)
- if (!PostProcessPN->use_empty())
- Worklist.push_back(PostProcessPN);
-
- // Keep track of PHI nodes that we want to remove because they did not have
+ if (isa<PHINode>(UserBB->begin()) && isExitBlock(UserBB, ExitBlocks)) {
+ UseToRewrite->set(&UserBB->front());
+ continue;
+ }
+
+ // If we added a single PHI, it must dominate all uses and we can directly
+ // rename it.
+ if (AddedPHIs.size() == 1) {
+ UseToRewrite->set(AddedPHIs[0]);
+ continue;
+ }
+
+ // Otherwise, do full PHI insertion.
+ SSAUpdate.RewriteUse(*UseToRewrite);
+ }
+
+ SmallVector<DbgValueInst *, 4> DbgValues;
+ llvm::findDbgValues(DbgValues, I);
+
+ // Update pre-existing debug value uses that reside outside the loop.
+ auto &Ctx = I->getContext();
+ for (auto DVI : DbgValues) {
+ BasicBlock *UserBB = DVI->getParent();
+ if (InstBB == UserBB || L->contains(UserBB))
+ continue;
+ // We currently only handle debug values residing in blocks that were
+ // traversed while rewriting the uses. If we inserted just a single PHI,
+ // we will handle all relevant debug values.
+ Value *V = AddedPHIs.size() == 1 ? AddedPHIs[0]
+ : SSAUpdate.FindValueForBlock(UserBB);
+ if (V)
+ DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(V)));
+ }
+
+ // SSAUpdater might have inserted phi-nodes inside other loops. We'll need
+ // to post-process them to keep LCSSA form.
+ for (PHINode *InsertedPN : InsertedPHIs) {
+ if (auto *OtherLoop = LI.getLoopFor(InsertedPN->getParent()))
+ if (!L->contains(OtherLoop))
+ PostProcessPHIs.push_back(InsertedPN);
+ }
+
+ // Post process PHI instructions that were inserted into another disjoint
+ // loop and update their exits properly.
+ for (auto *PostProcessPN : PostProcessPHIs)
+ if (!PostProcessPN->use_empty())
+ Worklist.push_back(PostProcessPN);
+
+ // Keep track of PHI nodes that we want to remove because they did not have
// any uses rewritten.
- for (PHINode *PN : AddedPHIs)
- if (PN->use_empty())
+ for (PHINode *PN : AddedPHIs)
+ if (PN->use_empty())
LocalPHIsToRemove.insert(PN);
- Changed = true;
- }
+ Changed = true;
+ }
// Remove PHI nodes that did not have any uses rewritten or add them to
// PHIsToRemove, so the caller can remove them after some additional cleanup.
@@ -288,229 +288,229 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
if (PN->use_empty())
PN->eraseFromParent();
}
- return Changed;
-}
-
-// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
-static void computeBlocksDominatingExits(
- Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
- SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
- // We start from the exit blocks, as every block trivially dominates itself
- // (not strictly).
+ return Changed;
+}
+
+// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
+static void computeBlocksDominatingExits(
+ Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
+ // We start from the exit blocks, as every block trivially dominates itself
+ // (not strictly).
SmallVector<BasicBlock *, 8> BBWorklist(ExitBlocks);
-
- while (!BBWorklist.empty()) {
- BasicBlock *BB = BBWorklist.pop_back_val();
-
- // Check if this is a loop header. If this is the case, we're done.
- if (L.getHeader() == BB)
- continue;
-
- // Otherwise, add its immediate predecessor in the dominator tree to the
- // worklist, unless we visited it already.
- BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
-
- // Exit blocks can have an immediate dominator not belonging to the
- // loop. For an exit block to be immediately dominated by another block
- // outside the loop, it implies that not all paths from that dominator to
- // the exit block go through the loop.
- // Example:
- //
- // |---- A
- // | |
- // | B<--
- // | | |
- // |---> C --
- // |
- // D
- //
- // C is the exit block of the loop and it's immediately dominated by A,
- // which doesn't belong to the loop.
- if (!L.contains(IDomBB))
- continue;
-
- if (BlocksDominatingExits.insert(IDomBB))
- BBWorklist.push_back(IDomBB);
- }
-}
-
-bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
- ScalarEvolution *SE) {
- bool Changed = false;
-
-#ifdef EXPENSIVE_CHECKS
- // Verify all sub-loops are in LCSSA form already.
- for (Loop *SubLoop: L)
- assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
-#endif
-
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L.getExitBlocks(ExitBlocks);
- if (ExitBlocks.empty())
- return false;
-
- SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
-
- // We want to avoid use-scanning by leveraging dominance information.
- // If a block doesn't dominate any of the loop exits, then none of the values
- // defined in the loop can be used outside.
- // We compute the set of blocks fulfilling the conditions in advance by
- // walking the dominator tree upwards until we hit a loop header.
- computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
-
- SmallVector<Instruction *, 8> Worklist;
-
- // Look at all the instructions in the loop, checking to see if they have uses
- // outside the loop. If so, put them into the worklist to rewrite those uses.
- for (BasicBlock *BB : BlocksDominatingExits) {
- // Skip blocks that are part of any sub-loops; they must be in LCSSA
- // already.
- if (LI->getLoopFor(BB) != &L)
- continue;
- for (Instruction &I : *BB) {
- // Reject two common cases fast: instructions with no uses (like stores)
- // and instructions with one use that is in the same block as this.
- if (I.use_empty() ||
- (I.hasOneUse() && I.user_back()->getParent() == BB &&
- !isa<PHINode>(I.user_back())))
- continue;
-
- // Tokens cannot be used in PHI nodes, so we skip over them.
- // We can run into tokens which are live out of a loop with catchswitch
- // instructions in Windows EH if the catchswitch has one catchpad which
- // is inside the loop and another which is not.
- if (I.getType()->isTokenTy())
- continue;
-
- Worklist.push_back(&I);
- }
- }
-
+
+ while (!BBWorklist.empty()) {
+ BasicBlock *BB = BBWorklist.pop_back_val();
+
+ // Check if this is a loop header. If this is the case, we're done.
+ if (L.getHeader() == BB)
+ continue;
+
+ // Otherwise, add its immediate predecessor in the dominator tree to the
+ // worklist, unless we visited it already.
+ BasicBlock *IDomBB = DT.getNode(BB)->getIDom()->getBlock();
+
+ // Exit blocks can have an immediate dominator not belonging to the
+ // loop. For an exit block to be immediately dominated by another block
+ // outside the loop, it implies that not all paths from that dominator to
+ // the exit block go through the loop.
+ // Example:
+ //
+ // |---- A
+ // | |
+ // | B<--
+ // | | |
+ // |---> C --
+ // |
+ // D
+ //
+ // C is the exit block of the loop and it's immediately dominated by A,
+ // which doesn't belong to the loop.
+ if (!L.contains(IDomBB))
+ continue;
+
+ if (BlocksDominatingExits.insert(IDomBB))
+ BBWorklist.push_back(IDomBB);
+ }
+}
+
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+
+#ifdef EXPENSIVE_CHECKS
+ // Verify all sub-loops are in LCSSA form already.
+ for (Loop *SubLoop: L)
+ assert(SubLoop->isRecursivelyLCSSAForm(DT, *LI) && "Subloop not in LCSSA!");
+#endif
+
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L.getExitBlocks(ExitBlocks);
+ if (ExitBlocks.empty())
+ return false;
+
+ SmallSetVector<BasicBlock *, 8> BlocksDominatingExits;
+
+ // We want to avoid use-scanning by leveraging dominance information.
+ // If a block doesn't dominate any of the loop exits, then none of the values
+ // defined in the loop can be used outside.
+ // We compute the set of blocks fulfilling the conditions in advance by
+ // walking the dominator tree upwards until we hit a loop header.
+ computeBlocksDominatingExits(L, DT, ExitBlocks, BlocksDominatingExits);
+
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Look at all the instructions in the loop, checking to see if they have uses
+ // outside the loop. If so, put them into the worklist to rewrite those uses.
+ for (BasicBlock *BB : BlocksDominatingExits) {
+ // Skip blocks that are part of any sub-loops; they must be in LCSSA
+ // already.
+ if (LI->getLoopFor(BB) != &L)
+ continue;
+ for (Instruction &I : *BB) {
+ // Reject two common cases fast: instructions with no uses (like stores)
+ // and instructions with one use that is in the same block as this.
+ if (I.use_empty() ||
+ (I.hasOneUse() && I.user_back()->getParent() == BB &&
+ !isa<PHINode>(I.user_back())))
+ continue;
+
+ // Tokens cannot be used in PHI nodes, so we skip over them.
+ // We can run into tokens which are live out of a loop with catchswitch
+ // instructions in Windows EH if the catchswitch has one catchpad which
+ // is inside the loop and another which is not.
+ if (I.getType()->isTokenTy())
+ continue;
+
+ Worklist.push_back(&I);
+ }
+ }
+
IRBuilder<> Builder(L.getHeader()->getContext());
Changed = formLCSSAForInstructions(Worklist, DT, *LI, SE, Builder);
- // If we modified the code, remove any caches about the loop from SCEV to
- // avoid dangling entries.
- // FIXME: This is a big hammer, can we clear the cache more selectively?
- if (SE && Changed)
- SE->forgetLoop(&L);
-
- assert(L.isLCSSAForm(DT));
-
- return Changed;
-}
-
-/// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
- const LoopInfo *LI, ScalarEvolution *SE) {
- bool Changed = false;
-
- // Recurse depth-first through inner loops.
- for (Loop *SubLoop : L.getSubLoops())
- Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
-
- Changed |= formLCSSA(L, DT, LI, SE);
- return Changed;
-}
-
-/// Process all loops in the function, inner-most out.
-static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
- ScalarEvolution *SE) {
- bool Changed = false;
- for (auto &L : *LI)
- Changed |= formLCSSARecursively(*L, DT, LI, SE);
- return Changed;
-}
-
-namespace {
-struct LCSSAWrapperPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LCSSAWrapperPass() : FunctionPass(ID) {
- initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
- }
-
- // Cached analysis information for the current function.
- DominatorTree *DT;
- LoopInfo *LI;
- ScalarEvolution *SE;
-
- bool runOnFunction(Function &F) override;
- void verifyAnalysis() const override {
- // This check is very expensive. On loop-intensive compiles it may cause
- // up to a 10x slowdown. Currently it's disabled by default. LPPassManager
- // always does a limited form of the LCSSA verification. Similar reasoning
- // was used for the LoopInfo verifier.
- if (VerifyLoopLCSSA) {
- assert(all_of(*LI,
- [&](Loop *L) {
- return L->isRecursivelyLCSSAForm(*DT, *LI);
- }) &&
- "LCSSA form is broken!");
- }
- };
-
- /// This transformation requires natural loop information & requires that
- /// loop preheaders be inserted into the CFG. It maintains both of these,
- /// as well as the CFG. It also requires dominator information.
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
-
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreservedID(LoopSimplifyID);
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- AU.addPreserved<MemorySSAWrapperPass>();
-
- // This is needed to perform LCSSA verification inside LPPassManager
- AU.addRequired<LCSSAVerificationPass>();
- AU.addPreserved<LCSSAVerificationPass>();
- }
-};
-}
-
-char LCSSAWrapperPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
-INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
- false, false)
-
-Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
-char &llvm::LCSSAID = LCSSAWrapperPass::ID;
-
-/// Transform \p F into loop-closed SSA form.
-bool LCSSAWrapperPass::runOnFunction(Function &F) {
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- SE = SEWP ? &SEWP->getSE() : nullptr;
-
- return formLCSSAOnAllLoops(LI, *DT, SE);
-}
-
-PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
- if (!formLCSSAOnAllLoops(&LI, DT, SE))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- PA.preserve<SCEVAA>();
- PA.preserve<ScalarEvolutionAnalysis>();
- // BPI maps terminators to probabilities, since we don't modify the CFG, no
- // updates are needed to preserve it.
- PA.preserve<BranchProbabilityAnalysis>();
- PA.preserve<MemorySSAAnalysis>();
- return PA;
-}
+ // If we modified the code, remove any caches about the loop from SCEV to
+ // avoid dangling entries.
+ // FIXME: This is a big hammer, can we clear the cache more selectively?
+ if (SE && Changed)
+ SE->forgetLoop(&L);
+
+ assert(L.isLCSSAForm(DT));
+
+ return Changed;
+}
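// Illustrative sketch (not taken from the diffed sources): the net effect of
// formLCSSA on the IR is that a value defined inside the loop and used after
// it gets routed through a single-operand PHI in the exit block, e.g.
//
//   loop:                                   ; before
//     %v = add i32 %x, 1
//     br i1 %cond, label %loop, label %exit
//   exit:
//     %use = mul i32 %v, 2
//
//   exit:                                   ; after
//     %v.lcssa = phi i32 [ %v, %loop ]
//     %use = mul i32 %v.lcssa, 2
//
// Only uses outside the loop are rewritten; the in-loop definition of %v is
// left untouched.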
+
+/// Process a loop nest depth first.
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+ const LoopInfo *LI, ScalarEvolution *SE) {
+ bool Changed = false;
+
+ // Recurse depth-first through inner loops.
+ for (Loop *SubLoop : L.getSubLoops())
+ Changed |= formLCSSARecursively(*SubLoop, DT, LI, SE);
+
+ Changed |= formLCSSA(L, DT, LI, SE);
+ return Changed;
+}
+
+/// Process all loops in the function, inner-most out.
+static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
+ ScalarEvolution *SE) {
+ bool Changed = false;
+ for (auto &L : *LI)
+ Changed |= formLCSSARecursively(*L, DT, LI, SE);
+ return Changed;
+}
+
+namespace {
+struct LCSSAWrapperPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LCSSAWrapperPass() : FunctionPass(ID) {
+ initializeLCSSAWrapperPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // Cached analysis information for the current function.
+ DominatorTree *DT;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+
+ bool runOnFunction(Function &F) override;
+ void verifyAnalysis() const override {
+ // This check is very expensive. On loop-intensive compiles it may cause
+ // up to a 10x slowdown. Currently it's disabled by default. LPPassManager
+ // always does a limited form of the LCSSA verification. Similar reasoning
+ // was used for the LoopInfo verifier.
+ if (VerifyLoopLCSSA) {
+ assert(all_of(*LI,
+ [&](Loop *L) {
+ return L->isRecursivelyLCSSAForm(*DT, *LI);
+ }) &&
+ "LCSSA form is broken!");
+ }
+ };
+
+ /// This transformation requires natural loop information & requires that
+ /// loop preheaders be inserted into the CFG. It maintains both of these,
+ /// as well as the CFG. It also requires dominator information.
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+
+ // This is needed to perform LCSSA verification inside LPPassManager
+ AU.addRequired<LCSSAVerificationPass>();
+ AU.addPreserved<LCSSAVerificationPass>();
+ }
+};
+}
+
+char LCSSAWrapperPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LCSSAVerificationPass)
+INITIALIZE_PASS_END(LCSSAWrapperPass, "lcssa", "Loop-Closed SSA Form Pass",
+ false, false)
+
+Pass *llvm::createLCSSAPass() { return new LCSSAWrapperPass(); }
+char &llvm::LCSSAID = LCSSAWrapperPass::ID;
+
+/// Transform \p F into loop-closed SSA form.
+bool LCSSAWrapperPass::runOnFunction(Function &F) {
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ SE = SEWP ? &SEWP->getSE() : nullptr;
+
+ return formLCSSAOnAllLoops(LI, *DT, SE);
+}
+
+PreservedAnalyses LCSSAPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ if (!formLCSSAOnAllLoops(&LI, DT, SE))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ // BPI maps terminators to probabilities, since we don't modify the CFG, no
+ // updates are needed to preserve it.
+ PA.preserve<BranchProbabilityAnalysis>();
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
+}
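// Illustrative sketch (not taken from the diffed sources): under the new pass
// manager, LCSSAPass is scheduled like any other function pass; the pipeline
// wiring below is an assumption made for the example, not code from this file.
//
//   FunctionPassManager FPM;
//   FPM.addPass(LCSSAPass());
//   FPM.run(F, FAM);   // F: Function&, FAM: a populated FunctionAnalysisManager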
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
index 05446019c6..4c52fac6f7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LibCallsShrinkWrap.cpp
@@ -1,562 +1,562 @@
-//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass shrink-wraps a call to a function if the result is not used.
-// The call can set errno but is otherwise side effect free. For example:
-// sqrt(val);
-// is transformed to
-// if (val < 0)
-// sqrt(val);
-// Even if the result of a library call is not being used, the compiler cannot
-// safely delete the call because the function can set errno on error
-// conditions.
-// Note that in many functions, the error condition depends solely on the
-// incoming parameter. In this optimization, we can generate the condition that
-// would lead to errno being set and use it to shrink-wrap the call. Since the
-// chances of hitting the error condition are low, the runtime call is
-// effectively eliminated.
-//
-// These partially dead calls are usually results of C++ abstraction penalty
-// exposed by inlining.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstVisitor.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "libcalls-shrinkwrap"
-
-STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
-STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
-
-namespace {
-class LibCallsShrinkWrapLegacyPass : public FunctionPass {
-public:
- static char ID; // Pass identification, replacement for typeid
- explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
- initializeLibCallsShrinkWrapLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
- void getAnalysisUsage(AnalysisUsage &AU) const override;
- bool runOnFunction(Function &F) override;
-};
-}
-
-char LibCallsShrinkWrapLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
- "Conditionally eliminate dead library calls", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
- "Conditionally eliminate dead library calls", false, false)
-
-namespace {
-class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
-public:
- LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
- : TLI(TLI), DT(DT){};
- void visitCallInst(CallInst &CI) { checkCandidate(CI); }
- bool perform() {
- bool Changed = false;
- for (auto &CI : WorkList) {
- LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
- << "\n");
- if (perform(CI)) {
- Changed = true;
- LLVM_DEBUG(dbgs() << "Transformed\n");
- }
- }
- return Changed;
- }
-
-private:
- bool perform(CallInst *CI);
- void checkCandidate(CallInst &CI);
- void shrinkWrapCI(CallInst *CI, Value *Cond);
- bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
- bool performCallErrors(CallInst *CI, const LibFunc &Func);
- bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
- Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
- Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
- Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
-
- // Create an OR of two conditions.
- Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
- CmpInst::Predicate Cmp2, float Val2) {
- IRBuilder<> BBBuilder(CI);
- Value *Arg = CI->getArgOperand(0);
- auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
- auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
- return BBBuilder.CreateOr(Cond1, Cond2);
- }
-
- // Create a single condition using IRBuilder.
- Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
- float Val) {
- Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
- if (!Arg->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Arg->getType());
- return BBBuilder.CreateFCmp(Cmp, Arg, V);
- }
-
- // Create a single condition.
- Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
- IRBuilder<> BBBuilder(CI);
- Value *Arg = CI->getArgOperand(0);
- return createCond(BBBuilder, Arg, Cmp, Val);
- }
-
- const TargetLibraryInfo &TLI;
- DominatorTree *DT;
- SmallVector<CallInst *, 16> WorkList;
-};
-} // end anonymous namespace
-
-// Perform the transformation to calls with errno set by domain error.
-bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_acos: // DomainError: (x < -1 || x > 1)
- case LibFunc_acosf: // Same as acos
- case LibFunc_acosl: // Same as acos
- case LibFunc_asin: // DomainError: (x < -1 || x > 1)
- case LibFunc_asinf: // Same as asin
- case LibFunc_asinl: // Same as asin
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
- break;
- }
- case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
- case LibFunc_cosf: // Same as cos
- case LibFunc_cosl: // Same as cos
- case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
- case LibFunc_sinf: // Same as sin
- case LibFunc_sinl: // Same as sin
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
- -INFINITY);
- break;
- }
- case LibFunc_acosh: // DomainError: (x < 1)
- case LibFunc_acoshf: // Same as acosh
- case LibFunc_acoshl: // Same as acosh
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
- break;
- }
- case LibFunc_sqrt: // DomainError: (x < 0)
- case LibFunc_sqrtf: // Same as sqrt
- case LibFunc_sqrtl: // Same as sqrt
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
- break;
- }
- default:
- return false;
- }
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Perform the transformation to calls with errno set by range error.
-bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_cosh:
- case LibFunc_coshf:
- case LibFunc_coshl:
- case LibFunc_exp:
- case LibFunc_expf:
- case LibFunc_expl:
- case LibFunc_exp10:
- case LibFunc_exp10f:
- case LibFunc_exp10l:
- case LibFunc_exp2:
- case LibFunc_exp2f:
- case LibFunc_exp2l:
- case LibFunc_sinh:
- case LibFunc_sinhf:
- case LibFunc_sinhl: {
- Cond = generateTwoRangeCond(CI, Func);
- break;
- }
- case LibFunc_expm1: // RangeError: (709, inf)
- case LibFunc_expm1f: // RangeError: (88, inf)
- case LibFunc_expm1l: // RangeError: (11356, inf)
- {
- Cond = generateOneRangeCond(CI, Func);
- break;
- }
- default:
- return false;
- }
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Perform the transformation to calls with errno set by combination of errors.
-bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
- const LibFunc &Func) {
- Value *Cond = nullptr;
-
- switch (Func) {
- case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
- // PoleError: (x == -1 || x == 1)
- // Overall Cond: (x <= -1 || x >= 1)
- case LibFunc_atanhf: // Same as atanh
- case LibFunc_atanhl: // Same as atanh
- {
- ++NumWrappedTwoCond;
- Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
- break;
- }
- case LibFunc_log: // DomainError: (x < 0)
- // PoleError: (x == 0)
- // Overall Cond: (x <= 0)
- case LibFunc_logf: // Same as log
- case LibFunc_logl: // Same as log
- case LibFunc_log10: // Same as log
- case LibFunc_log10f: // Same as log
- case LibFunc_log10l: // Same as log
- case LibFunc_log2: // Same as log
- case LibFunc_log2f: // Same as log
- case LibFunc_log2l: // Same as log
- case LibFunc_logb: // Same as log
- case LibFunc_logbf: // Same as log
- case LibFunc_logbl: // Same as log
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
- break;
- }
- case LibFunc_log1p: // DomainError: (x < -1)
- // PoleError: (x == -1)
- // Overall Cond: (x <= -1)
- case LibFunc_log1pf: // Same as log1p
- case LibFunc_log1pl: // Same as log1p
- {
- ++NumWrappedOneCond;
- Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
- break;
- }
- case LibFunc_pow: // DomainError: x < 0 and y is noninteger
- // PoleError: x == 0 and y < 0
- // RangeError: overflow or underflow
- case LibFunc_powf:
- case LibFunc_powl: {
- Cond = generateCondForPow(CI, Func);
- if (Cond == nullptr)
- return false;
- break;
- }
- default:
- return false;
- }
- assert(Cond && "performCallErrors should not see an empty condition");
- shrinkWrapCI(CI, Cond);
- return true;
-}
-
-// Checks if CI is a candidate for shrink-wrapping and puts it into the work
-// list if so.
-void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
- if (CI.isNoBuiltin())
- return;
- // A possible improvement is to handle calls whose return value is used. If
- // there is an API for a fast libcall implementation that does not set
- // errno, we can use the same framework to direct/wrap the call to the fast
- // API in the error-free path, and leave the original call in the slow path.
- if (!CI.use_empty())
- return;
-
- LibFunc Func;
- Function *Callee = CI.getCalledFunction();
- if (!Callee)
- return;
- if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
- return;
-
- if (CI.getNumArgOperands() == 0)
- return;
- // TODO: Handle long double in other formats.
- Type *ArgType = CI.getArgOperand(0)->getType();
- if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
- ArgType->isX86_FP80Ty()))
- return;
-
- WorkList.push_back(&CI);
-}
-
-// Generate the upper bound condition for RangeError.
-Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
- const LibFunc &Func) {
- float UpperBound;
- switch (Func) {
- case LibFunc_expm1: // RangeError: (709, inf)
- UpperBound = 709.0f;
- break;
- case LibFunc_expm1f: // RangeError: (88, inf)
- UpperBound = 88.0f;
- break;
- case LibFunc_expm1l: // RangeError: (11356, inf)
- UpperBound = 11356.0f;
- break;
- default:
- llvm_unreachable("Unhandled library call!");
- }
-
- ++NumWrappedOneCond;
- return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
-}
-
-// Generate the lower and upper bound condition for RangeError.
-Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
- const LibFunc &Func) {
- float UpperBound, LowerBound;
- switch (Func) {
- case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
- case LibFunc_sinh: // Same as cosh
- LowerBound = -710.0f;
- UpperBound = 710.0f;
- break;
- case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
- case LibFunc_sinhf: // Same as coshf
- LowerBound = -89.0f;
- UpperBound = 89.0f;
- break;
- case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
- case LibFunc_sinhl: // Same as coshl
- LowerBound = -11357.0f;
- UpperBound = 11357.0f;
- break;
- case LibFunc_exp: // RangeError: (x < -745 || x > 709)
- LowerBound = -745.0f;
- UpperBound = 709.0f;
- break;
- case LibFunc_expf: // RangeError: (x < -103 || x > 88)
- LowerBound = -103.0f;
- UpperBound = 88.0f;
- break;
- case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
- LowerBound = -11399.0f;
- UpperBound = 11356.0f;
- break;
- case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
- LowerBound = -323.0f;
- UpperBound = 308.0f;
- break;
- case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
- LowerBound = -45.0f;
- UpperBound = 38.0f;
- break;
- case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
- LowerBound = -4950.0f;
- UpperBound = 4932.0f;
- break;
- case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
- LowerBound = -1074.0f;
- UpperBound = 1023.0f;
- break;
- case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
- LowerBound = -149.0f;
- UpperBound = 127.0f;
- break;
- case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
- LowerBound = -16445.0f;
- UpperBound = 11383.0f;
- break;
- default:
- llvm_unreachable("Unhandled library call!");
- }
-
- ++NumWrappedTwoCond;
- return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
- LowerBound);
-}
-
-// For pow(x,y), we only handle the following cases:
-// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
-// Cond is: (y > 127)
-// (2) x is a value coming from an integer type.
-// (2.1) if x's bit_size == 8
-// Cond: (x <= 0 || y > 128)
-// (2.2) if x's bit_size is 16
-// Cond: (x <= 0 || y > 64)
-// (2.3) if x's bit_size is 32
-// Cond: (x <= 0 || y > 32)
-// Support for powl(x,y) and powf(x,y) are TBD.
-//
-// Note that the condition can be more conservative than the actual condition
-// (i.e. we might invoke calls that will not set errno).
-//
-Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
- const LibFunc &Func) {
- // FIXME: LibFunc_powf and powl TBD.
- if (Func != LibFunc_pow) {
- LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
- return nullptr;
- }
-
- Value *Base = CI->getArgOperand(0);
- Value *Exp = CI->getArgOperand(1);
- IRBuilder<> BBBuilder(CI);
-
- // Constant Base case.
- if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
- double D = CF->getValueAPF().convertToDouble();
- if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
- LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
- return nullptr;
- }
-
- ++NumWrappedOneCond;
- Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
- if (!Exp->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Exp->getType());
- return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
- }
-
- // If the Base value is coming from an integer type.
- Instruction *I = dyn_cast<Instruction>(Base);
- if (!I) {
- LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
- return nullptr;
- }
- unsigned Opcode = I->getOpcode();
- if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
- unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
- float UpperV = 0.0f;
- if (BW == 8)
- UpperV = 128.0f;
- else if (BW == 16)
- UpperV = 64.0f;
- else if (BW == 32)
- UpperV = 32.0f;
- else {
- LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
- return nullptr;
- }
-
- ++NumWrappedTwoCond;
- Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
- Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
- if (!Exp->getType()->isFloatTy())
- V = ConstantExpr::getFPExtend(V, Exp->getType());
- if (!Base->getType()->isFloatTy())
- V0 = ConstantExpr::getFPExtend(V0, Exp->getType());
-
- Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
- Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
- return BBBuilder.CreateOr(Cond0, Cond);
- }
- LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
- return nullptr;
-}
-
-// Wrap conditions that can potentially generate errno to the library call.
-void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
- assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
- MDNode *BranchWeights =
- MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
-
- Instruction *NewInst =
- SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
- BasicBlock *CallBB = NewInst->getParent();
- CallBB->setName("cdce.call");
- BasicBlock *SuccBB = CallBB->getSingleSuccessor();
- assert(SuccBB && "The split block should have a single successor");
- SuccBB->setName("cdce.end");
- CI->removeFromParent();
- CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
- LLVM_DEBUG(dbgs() << "== Basic Block After ==");
- LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
- << *CallBB->getSingleSuccessor() << "\n");
-}
-
-// Perform the transformation to a single candidate.
-bool LibCallsShrinkWrap::perform(CallInst *CI) {
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- assert(Callee && "perform() should apply to a non-empty callee");
- TLI.getLibFunc(*Callee, Func);
- assert(Func && "perform() is not expecting an empty function");
-
- if (performCallDomainErrorOnly(CI, Func) || performCallRangeErrorOnly(CI, Func))
- return true;
- return performCallErrors(CI, Func);
-}
-
-void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<TargetLibraryInfoWrapperPass>();
-}
-
-static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
- DominatorTree *DT) {
- if (F.hasFnAttribute(Attribute::OptimizeForSize))
- return false;
- LibCallsShrinkWrap CCDCE(TLI, DT);
- CCDCE.visit(F);
- bool Changed = CCDCE.perform();
-
- // Verify the dominator tree after we've updated it locally.
- assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
- return Changed;
-}
-
-bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
- auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- return runImpl(F, TLI, DT);
-}
-
-namespace llvm {
-char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
-
-// Public interface to LibCallsShrinkWrap pass.
-FunctionPass *createLibCallsShrinkWrapPass() {
- return new LibCallsShrinkWrapLegacyPass();
-}
-
-PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- if (!runImpl(F, TLI, DT))
- return PreservedAnalyses::all();
- auto PA = PreservedAnalyses();
- PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- return PA;
-}
-}
+//===-- LibCallsShrinkWrap.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass shrink-wraps a call to a function if the result is not used.
+// The call can set errno but is otherwise side effect free. For example:
+// sqrt(val);
+// is transformed to
+// if (val < 0)
+// sqrt(val);
+// Even if the result of a library call is not being used, the compiler cannot
+// safely delete the call because the function can set errno on error
+// conditions.
+// Note that in many functions, the error condition depends solely on the
+// incoming parameter. In this optimization, we can generate the condition that
+// would lead to errno being set and use it to shrink-wrap the call. Since the
+// chances of hitting the error condition are low, the runtime call is
+// effectively eliminated.
+//
+// These partially dead calls are usually results of C++ abstraction penalty
+// exposed by inlining.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LibCallsShrinkWrap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstVisitor.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "libcalls-shrinkwrap"
+
+STATISTIC(NumWrappedOneCond, "Number of One-Condition Wrappers Inserted");
+STATISTIC(NumWrappedTwoCond, "Number of Two-Condition Wrappers Inserted");
+
+namespace {
+class LibCallsShrinkWrapLegacyPass : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LibCallsShrinkWrapLegacyPass() : FunctionPass(ID) {
+ initializeLibCallsShrinkWrapLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnFunction(Function &F) override;
+};
+}
+
+char LibCallsShrinkWrapLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(LibCallsShrinkWrapLegacyPass, "libcalls-shrinkwrap",
+ "Conditionally eliminate dead library calls", false, false)
+
+namespace {
+class LibCallsShrinkWrap : public InstVisitor<LibCallsShrinkWrap> {
+public:
+ LibCallsShrinkWrap(const TargetLibraryInfo &TLI, DominatorTree *DT)
+ : TLI(TLI), DT(DT){};
+ void visitCallInst(CallInst &CI) { checkCandidate(CI); }
+ bool perform() {
+ bool Changed = false;
+ for (auto &CI : WorkList) {
+ LLVM_DEBUG(dbgs() << "CDCE calls: " << CI->getCalledFunction()->getName()
+ << "\n");
+ if (perform(CI)) {
+ Changed = true;
+ LLVM_DEBUG(dbgs() << "Transformed\n");
+ }
+ }
+ return Changed;
+ }
+
+private:
+ bool perform(CallInst *CI);
+ void checkCandidate(CallInst &CI);
+ void shrinkWrapCI(CallInst *CI, Value *Cond);
+ bool performCallDomainErrorOnly(CallInst *CI, const LibFunc &Func);
+ bool performCallErrors(CallInst *CI, const LibFunc &Func);
+ bool performCallRangeErrorOnly(CallInst *CI, const LibFunc &Func);
+ Value *generateOneRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateTwoRangeCond(CallInst *CI, const LibFunc &Func);
+ Value *generateCondForPow(CallInst *CI, const LibFunc &Func);
+
+ // Create an OR of two conditions.
+ Value *createOrCond(CallInst *CI, CmpInst::Predicate Cmp, float Val,
+ CmpInst::Predicate Cmp2, float Val2) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ auto Cond2 = createCond(BBBuilder, Arg, Cmp2, Val2);
+ auto Cond1 = createCond(BBBuilder, Arg, Cmp, Val);
+ return BBBuilder.CreateOr(Cond1, Cond2);
+ }
+
+ // Create a single condition using IRBuilder.
+ Value *createCond(IRBuilder<> &BBBuilder, Value *Arg, CmpInst::Predicate Cmp,
+ float Val) {
+ Constant *V = ConstantFP::get(BBBuilder.getContext(), APFloat(Val));
+ if (!Arg->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Arg->getType());
+ return BBBuilder.CreateFCmp(Cmp, Arg, V);
+ }
+
+ // Create a single condition.
+ Value *createCond(CallInst *CI, CmpInst::Predicate Cmp, float Val) {
+ IRBuilder<> BBBuilder(CI);
+ Value *Arg = CI->getArgOperand(0);
+ return createCond(BBBuilder, Arg, Cmp, Val);
+ }
+
+ const TargetLibraryInfo &TLI;
+ DominatorTree *DT;
+ SmallVector<CallInst *, 16> WorkList;
+};
+} // end anonymous namespace
+
+// Perform the transformation to calls with errno set by domain error.
+bool LibCallsShrinkWrap::performCallDomainErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_acos: // DomainError: (x < -1 || x > 1)
+ case LibFunc_acosf: // Same as acos
+ case LibFunc_acosl: // Same as acos
+ case LibFunc_asin: // DomainError: (x < -1 || x > 1)
+ case LibFunc_asinf: // Same as asin
+ case LibFunc_asinl: // Same as asin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLT, -1.0f, CmpInst::FCMP_OGT, 1.0f);
+ break;
+ }
+ case LibFunc_cos: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_cosf: // Same as cos
+ case LibFunc_cosl: // Same as cos
+ case LibFunc_sin: // DomainError: (x == +inf || x == -inf)
+ case LibFunc_sinf: // Same as sin
+ case LibFunc_sinl: // Same as sin
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OEQ, INFINITY, CmpInst::FCMP_OEQ,
+ -INFINITY);
+ break;
+ }
+ case LibFunc_acosh: // DomainError: (x < 1)
+ case LibFunc_acoshf: // Same as acosh
+ case LibFunc_acoshl: // Same as acosh
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 1.0f);
+ break;
+ }
+ case LibFunc_sqrt: // DomainError: (x < 0)
+ case LibFunc_sqrtf: // Same as sqrt
+ case LibFunc_sqrtl: // Same as sqrt
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLT, 0.0f);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by range error.
+bool LibCallsShrinkWrap::performCallRangeErrorOnly(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_cosh:
+ case LibFunc_coshf:
+ case LibFunc_coshl:
+ case LibFunc_exp:
+ case LibFunc_expf:
+ case LibFunc_expl:
+ case LibFunc_exp10:
+ case LibFunc_exp10f:
+ case LibFunc_exp10l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ case LibFunc_exp2l:
+ case LibFunc_sinh:
+ case LibFunc_sinhf:
+ case LibFunc_sinhl: {
+ Cond = generateTwoRangeCond(CI, Func);
+ break;
+ }
+ case LibFunc_expm1: // RangeError: (709, inf)
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ {
+ Cond = generateOneRangeCond(CI, Func);
+ break;
+ }
+ default:
+ return false;
+ }
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Perform the transformation to calls with errno set by combination of errors.
+bool LibCallsShrinkWrap::performCallErrors(CallInst *CI,
+ const LibFunc &Func) {
+ Value *Cond = nullptr;
+
+ switch (Func) {
+ case LibFunc_atanh: // DomainError: (x < -1 || x > 1)
+ // PoleError: (x == -1 || x == 1)
+ // Overall Cond: (x <= -1 || x >= 1)
+ case LibFunc_atanhf: // Same as atanh
+ case LibFunc_atanhl: // Same as atanh
+ {
+ ++NumWrappedTwoCond;
+ Cond = createOrCond(CI, CmpInst::FCMP_OLE, -1.0f, CmpInst::FCMP_OGE, 1.0f);
+ break;
+ }
+ case LibFunc_log: // DomainError: (x < 0)
+ // PoleError: (x == 0)
+ // Overall Cond: (x <= 0)
+ case LibFunc_logf: // Same as log
+ case LibFunc_logl: // Same as log
+ case LibFunc_log10: // Same as log
+ case LibFunc_log10f: // Same as log
+ case LibFunc_log10l: // Same as log
+ case LibFunc_log2: // Same as log
+ case LibFunc_log2f: // Same as log
+ case LibFunc_log2l: // Same as log
+ case LibFunc_logb: // Same as log
+ case LibFunc_logbf: // Same as log
+ case LibFunc_logbl: // Same as log
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, 0.0f);
+ break;
+ }
+ case LibFunc_log1p: // DomainError: (x < -1)
+ // PoleError: (x == -1)
+ // Overall Cond: (x <= -1)
+ case LibFunc_log1pf: // Same as log1p
+ case LibFunc_log1pl: // Same as log1p
+ {
+ ++NumWrappedOneCond;
+ Cond = createCond(CI, CmpInst::FCMP_OLE, -1.0f);
+ break;
+ }
+ case LibFunc_pow: // DomainError: x < 0 and y is noninteger
+ // PoleError: x == 0 and y < 0
+ // RangeError: overflow or underflow
+ case LibFunc_powf:
+ case LibFunc_powl: {
+ Cond = generateCondForPow(CI, Func);
+ if (Cond == nullptr)
+ return false;
+ break;
+ }
+ default:
+ return false;
+ }
+ assert(Cond && "performCallErrors should not see an empty condition");
+ shrinkWrapCI(CI, Cond);
+ return true;
+}
+
+// Checks if CI is a candidate for shrink-wrapping and puts it into the work
+// list if so.
+void LibCallsShrinkWrap::checkCandidate(CallInst &CI) {
+ if (CI.isNoBuiltin())
+ return;
+ // A possible improvement is to handle calls whose return value is used. If
+ // there is an API for a fast libcall implementation that does not set
+ // errno, we can use the same framework to direct/wrap the call to the fast
+ // API in the error-free path, and leave the original call in the slow path.
+ if (!CI.use_empty())
+ return;
+
+ LibFunc Func;
+ Function *Callee = CI.getCalledFunction();
+ if (!Callee)
+ return;
+ if (!TLI.getLibFunc(*Callee, Func) || !TLI.has(Func))
+ return;
+
+ if (CI.getNumArgOperands() == 0)
+ return;
+ // TODO: Handle long double in other formats.
+ Type *ArgType = CI.getArgOperand(0)->getType();
+ if (!(ArgType->isFloatTy() || ArgType->isDoubleTy() ||
+ ArgType->isX86_FP80Ty()))
+ return;
+
+ WorkList.push_back(&CI);
+}
+
+// Generate the upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateOneRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound;
+ switch (Func) {
+ case LibFunc_expm1: // RangeError: (709, inf)
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expm1f: // RangeError: (88, inf)
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expm1l: // RangeError: (11356, inf)
+ UpperBound = 11356.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedOneCond;
+ return createCond(CI, CmpInst::FCMP_OGT, UpperBound);
+}
+
+// Generate the lower and upper bound condition for RangeError.
+Value *LibCallsShrinkWrap::generateTwoRangeCond(CallInst *CI,
+ const LibFunc &Func) {
+ float UpperBound, LowerBound;
+ switch (Func) {
+ case LibFunc_cosh: // RangeError: (x < -710 || x > 710)
+ case LibFunc_sinh: // Same as cosh
+ LowerBound = -710.0f;
+ UpperBound = 710.0f;
+ break;
+ case LibFunc_coshf: // RangeError: (x < -89 || x > 89)
+ case LibFunc_sinhf: // Same as coshf
+ LowerBound = -89.0f;
+ UpperBound = 89.0f;
+ break;
+ case LibFunc_coshl: // RangeError: (x < -11357 || x > 11357)
+ case LibFunc_sinhl: // Same as coshl
+ LowerBound = -11357.0f;
+ UpperBound = 11357.0f;
+ break;
+ case LibFunc_exp: // RangeError: (x < -745 || x > 709)
+ LowerBound = -745.0f;
+ UpperBound = 709.0f;
+ break;
+ case LibFunc_expf: // RangeError: (x < -103 || x > 88)
+ LowerBound = -103.0f;
+ UpperBound = 88.0f;
+ break;
+ case LibFunc_expl: // RangeError: (x < -11399 || x > 11356)
+ LowerBound = -11399.0f;
+ UpperBound = 11356.0f;
+ break;
+ case LibFunc_exp10: // RangeError: (x < -323 || x > 308)
+ LowerBound = -323.0f;
+ UpperBound = 308.0f;
+ break;
+ case LibFunc_exp10f: // RangeError: (x < -45 || x > 38)
+ LowerBound = -45.0f;
+ UpperBound = 38.0f;
+ break;
+ case LibFunc_exp10l: // RangeError: (x < -4950 || x > 4932)
+ LowerBound = -4950.0f;
+ UpperBound = 4932.0f;
+ break;
+ case LibFunc_exp2: // RangeError: (x < -1074 || x > 1023)
+ LowerBound = -1074.0f;
+ UpperBound = 1023.0f;
+ break;
+ case LibFunc_exp2f: // RangeError: (x < -149 || x > 127)
+ LowerBound = -149.0f;
+ UpperBound = 127.0f;
+ break;
+ case LibFunc_exp2l: // RangeError: (x < -16445 || x > 11383)
+ LowerBound = -16445.0f;
+ UpperBound = 11383.0f;
+ break;
+ default:
+ llvm_unreachable("Unhandled library call!");
+ }
+
+ ++NumWrappedTwoCond;
+ return createOrCond(CI, CmpInst::FCMP_OGT, UpperBound, CmpInst::FCMP_OLT,
+ LowerBound);
+}
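// Illustrative sketch (not taken from the diffed sources): for a discarded
// call to expf, the table above gives LowerBound = -103 and UpperBound = 88,
// so the guard built by createOrCond is roughly
//
//   if (x > 88.0f || x < -103.0f)
//     (void)expf(x);   // only the error-prone inputs still reach the libcall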
+
+// For pow(x,y), we only handle the following cases:
+// (1) x is a constant && (x >= 1) && (x < MaxUInt8)
+// Cond is: (y > 127)
+// (2) x is a value coming from an integer type.
+// (2.1) if x's bit_size == 8
+// Cond: (x <= 0 || y > 128)
+// (2.2) if x's bit_size is 16
+// Cond: (x <= 0 || y > 64)
+// (2.3) if x's bit_size is 32
+// Cond: (x <= 0 || y > 32)
+// Support for powl(x,y) and powf(x,y) are TBD.
+//
+// Note that the condition can be more conservative than the actual condition
+// (i.e. we might invoke calls that will not set errno).
+//
+Value *LibCallsShrinkWrap::generateCondForPow(CallInst *CI,
+ const LibFunc &Func) {
+ // FIXME: LibFunc_powf and powl TBD.
+ if (Func != LibFunc_pow) {
+ LLVM_DEBUG(dbgs() << "Not handled powf() and powl()\n");
+ return nullptr;
+ }
+
+ Value *Base = CI->getArgOperand(0);
+ Value *Exp = CI->getArgOperand(1);
+ IRBuilder<> BBBuilder(CI);
+
+ // Constant Base case.
+ if (ConstantFP *CF = dyn_cast<ConstantFP>(Base)) {
+ double D = CF->getValueAPF().convertToDouble();
+ if (D < 1.0f || D > APInt::getMaxValue(8).getZExtValue()) {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): constant base out of range\n");
+ return nullptr;
+ }
+
+ ++NumWrappedOneCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(127.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ return BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ }
+
+ // If the Base value is coming from an integer type.
+ Instruction *I = dyn_cast<Instruction>(Base);
+ if (!I) {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): FP type base\n");
+ return nullptr;
+ }
+ unsigned Opcode = I->getOpcode();
+ if (Opcode == Instruction::UIToFP || Opcode == Instruction::SIToFP) {
+ unsigned BW = I->getOperand(0)->getType()->getPrimitiveSizeInBits();
+ float UpperV = 0.0f;
+ if (BW == 8)
+ UpperV = 128.0f;
+ else if (BW == 16)
+ UpperV = 64.0f;
+ else if (BW == 32)
+ UpperV = 32.0f;
+ else {
+ LLVM_DEBUG(dbgs() << "Not handled pow(): type too wide\n");
+ return nullptr;
+ }
+
+ ++NumWrappedTwoCond;
+ Constant *V = ConstantFP::get(CI->getContext(), APFloat(UpperV));
+ Constant *V0 = ConstantFP::get(CI->getContext(), APFloat(0.0f));
+ if (!Exp->getType()->isFloatTy())
+ V = ConstantExpr::getFPExtend(V, Exp->getType());
+ if (!Base->getType()->isFloatTy())
+ V0 = ConstantExpr::getFPExtend(V0, Exp->getType());
+
+ Value *Cond = BBBuilder.CreateFCmp(CmpInst::FCMP_OGT, Exp, V);
+ Value *Cond0 = BBBuilder.CreateFCmp(CmpInst::FCMP_OLE, Base, V0);
+ return BBBuilder.CreateOr(Cond0, Cond);
+ }
+ LLVM_DEBUG(dbgs() << "Not handled pow(): base not from integer convert\n");
+ return nullptr;
+}
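// Illustrative sketch (not taken from the diffed sources): if the base reaches
// pow() through a sitofp/uitofp of an i16 value, case (2.2) above produces a
// guard equivalent to
//
//   if ((double)b <= 0.0 || y > 64.0)
//     (void)pow((double)b, y);
//
// i.e. the "(x <= 0 || y > 64)" condition listed in the comment above.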
+
+// Wrap conditions that can potentially generate errno to the library call.
+void LibCallsShrinkWrap::shrinkWrapCI(CallInst *CI, Value *Cond) {
+ assert(Cond != nullptr && "ShrinkWrapCI is not expecting an empty call inst");
+ MDNode *BranchWeights =
+ MDBuilder(CI->getContext()).createBranchWeights(1, 2000);
+
+ Instruction *NewInst =
+ SplitBlockAndInsertIfThen(Cond, CI, false, BranchWeights, DT);
+ BasicBlock *CallBB = NewInst->getParent();
+ CallBB->setName("cdce.call");
+ BasicBlock *SuccBB = CallBB->getSingleSuccessor();
+ assert(SuccBB && "The split block should have a single successor");
+ SuccBB->setName("cdce.end");
+ CI->removeFromParent();
+ CallBB->getInstList().insert(CallBB->getFirstInsertionPt(), CI);
+ LLVM_DEBUG(dbgs() << "== Basic Block After ==");
+ LLVM_DEBUG(dbgs() << *CallBB->getSinglePredecessor() << *CallBB
+ << *CallBB->getSingleSuccessor() << "\n");
+}
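// Illustrative sketch (not taken from the diffed sources): after shrinkWrapCI,
// the CFG around a candidate such as sqrt(val) looks roughly like
//
//   entry:
//     %cond = ...                      ; built by one of the generate*Cond helpers
//     br i1 %cond, label %cdce.call, label %cdce.end   ; branch_weights 1, 2000
//   cdce.call:                         ; cold path, reached only on error inputs
//     call double @sqrt(double %val)
//     br label %cdce.end
//   cdce.end:
//     ...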
+
+// Perform the transformation to a single candidate.
+bool LibCallsShrinkWrap::perform(CallInst *CI) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ assert(Callee && "perform() should apply to a non-empty callee");
+ TLI.getLibFunc(*Callee, Func);
+ assert(Func && "perform() is not expecting an empty function");
+
+ if (performCallDomainErrorOnly(CI, Func) || performCallRangeErrorOnly(CI, Func))
+ return true;
+ return performCallErrors(CI, Func);
+}
+
+void LibCallsShrinkWrapLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+}
+
+static bool runImpl(Function &F, const TargetLibraryInfo &TLI,
+ DominatorTree *DT) {
+ if (F.hasFnAttribute(Attribute::OptimizeForSize))
+ return false;
+ LibCallsShrinkWrap CCDCE(TLI, DT);
+ CCDCE.visit(F);
+ bool Changed = CCDCE.perform();
+
+ // Verify the dominator tree after we've updated it locally.
+ assert(!DT || DT->verify(DominatorTree::VerificationLevel::Fast));
+ return Changed;
+}
+
+bool LibCallsShrinkWrapLegacyPass::runOnFunction(Function &F) {
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ return runImpl(F, TLI, DT);
+}
+
+namespace llvm {
+char &LibCallsShrinkWrapPassID = LibCallsShrinkWrapLegacyPass::ID;
+
+// Public interface to LibCallsShrinkWrap pass.
+FunctionPass *createLibCallsShrinkWrapPass() {
+ return new LibCallsShrinkWrapLegacyPass();
+}
+
+PreservedAnalyses LibCallsShrinkWrapPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
+ if (!runImpl(F, TLI, DT))
+ return PreservedAnalyses::all();
+ auto PA = PreservedAnalyses();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
index 3223fd6f65..ae26058c21 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Local.cpp
@@ -1,98 +1,98 @@
-//===- Local.cpp - Functions to perform local transformations -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions perform various local transformations to the
-// program.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/AssumeBundleQueries.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/BinaryFormat/Dwarf.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <climits>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <utility>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "local"
-
-STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
+//===- Local.cpp - Functions to perform local transformations -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions perform various local transformations to the
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "local"
+
+STATISTIC(NumRemoved, "Number of unreachable basic blocks removed");
STATISTIC(NumPHICSEs, "Number of PHI's that got CSE'd");
-
+
static cl::opt<bool> PHICSEDebugHash(
"phicse-debug-hash",
#ifdef EXPENSIVE_CHECKS
@@ -110,51 +110,51 @@ static cl::opt<unsigned> PHICSENumPHISmallSize(
"When the basic block contains not more than this number of PHI nodes, "
"perform a (faster!) exhaustive search instead of set-driven one."));
-// Max recursion depth for collectBitParts used when detecting bswap and
-// bitreverse idioms
-static const unsigned BitPartRecursionMaxDepth = 64;
-
-//===----------------------------------------------------------------------===//
-// Local constant propagation.
-//
-
-/// ConstantFoldTerminator - If a terminator instruction is predicated on a
-/// constant value, convert it into an unconditional branch to the constant
-/// destination. This is a nontrivial operation because the successors of this
-/// basic block must have their PHI nodes updated.
-/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
-/// conditions and indirectbr addresses this might make dead if
-/// DeleteDeadConditions is true.
-bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
- const TargetLibraryInfo *TLI,
- DomTreeUpdater *DTU) {
- Instruction *T = BB->getTerminator();
- IRBuilder<> Builder(T);
-
- // Branch - See if we are conditional jumping on constant
- if (auto *BI = dyn_cast<BranchInst>(T)) {
- if (BI->isUnconditional()) return false; // Can't optimize uncond branch
-
- BasicBlock *Dest1 = BI->getSuccessor(0);
- BasicBlock *Dest2 = BI->getSuccessor(1);
-
- if (Dest2 == Dest1) { // Conditional branch to same location?
- // This branch matches something like this:
- // br bool %cond, label %Dest, label %Dest
- // and changes it into: br label %Dest
-
- // Let the basic block know that we are letting go of one copy of it.
- assert(BI->getParent() && "Terminator not inserted in block!");
- Dest1->removePredecessor(BI->getParent());
-
- // Replace the conditional branch with an unconditional one.
- Builder.CreateBr(Dest1);
- Value *Cond = BI->getCondition();
- BI->eraseFromParent();
- if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
- return true;
- }
+// Max recursion depth for collectBitParts used when detecting bswap and
+// bitreverse idioms
+static const unsigned BitPartRecursionMaxDepth = 64;
+
+//===----------------------------------------------------------------------===//
+// Local constant propagation.
+//
+
+/// ConstantFoldTerminator - If a terminator instruction is predicated on a
+/// constant value, convert it into an unconditional branch to the constant
+/// destination. This is a nontrivial operation because the successors of this
+/// basic block must have their PHI nodes updated.
+/// Also calls RecursivelyDeleteTriviallyDeadInstructions() on any branch/switch
+/// conditions and indirectbr addresses this might make dead if
+/// DeleteDeadConditions is true.
+bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
+ const TargetLibraryInfo *TLI,
+ DomTreeUpdater *DTU) {
+ Instruction *T = BB->getTerminator();
+ IRBuilder<> Builder(T);
+
+ // Branch - See if we are conditional jumping on constant
+ if (auto *BI = dyn_cast<BranchInst>(T)) {
+ if (BI->isUnconditional()) return false; // Can't optimize uncond branch
+
+ BasicBlock *Dest1 = BI->getSuccessor(0);
+ BasicBlock *Dest2 = BI->getSuccessor(1);
+
+ if (Dest2 == Dest1) { // Conditional branch to same location?
+ // This branch matches something like this:
+ // br bool %cond, label %Dest, label %Dest
+ // and changes it into: br label %Dest
+
+ // Let the basic block know that we are letting go of one copy of it.
+ assert(BI->getParent() && "Terminator not inserted in block!");
+ Dest1->removePredecessor(BI->getParent());
+
+ // Replace the conditional branch with an unconditional one.
+ Builder.CreateBr(Dest1);
+ Value *Cond = BI->getCondition();
+ BI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ return true;
+ }
if (auto *Cond = dyn_cast<ConstantInt>(BI->getCondition())) {
// Are we branching on constant?
@@ -174,109 +174,109 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
return true;
}
- return false;
- }
-
- if (auto *SI = dyn_cast<SwitchInst>(T)) {
- // If we are switching on a constant, we can convert the switch to an
- // unconditional branch.
- auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
- BasicBlock *DefaultDest = SI->getDefaultDest();
- BasicBlock *TheOnlyDest = DefaultDest;
-
- // If the default is unreachable, ignore it when searching for TheOnlyDest.
- if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
- SI->getNumCases() > 0) {
- TheOnlyDest = SI->case_begin()->getCaseSuccessor();
- }
-
+ return false;
+ }
+
+ if (auto *SI = dyn_cast<SwitchInst>(T)) {
+ // If we are switching on a constant, we can convert the switch to an
+ // unconditional branch.
+ auto *CI = dyn_cast<ConstantInt>(SI->getCondition());
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ BasicBlock *TheOnlyDest = DefaultDest;
+
+ // If the default is unreachable, ignore it when searching for TheOnlyDest.
+ if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
+ SI->getNumCases() > 0) {
+ TheOnlyDest = SI->case_begin()->getCaseSuccessor();
+ }
+
bool Changed = false;
- // Figure out which case it goes to.
- for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
- // Found case matching a constant operand?
- if (i->getCaseValue() == CI) {
- TheOnlyDest = i->getCaseSuccessor();
- break;
- }
-
- // Check to see if this branch is going to the same place as the default
- // dest. If so, eliminate it as an explicit compare.
- if (i->getCaseSuccessor() == DefaultDest) {
- MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
- unsigned NCases = SI->getNumCases();
- // Fold the case metadata into the default if there will be any branches
- // left, unless the metadata doesn't match the switch.
- if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) {
- // Collect branch weights into a vector.
- SmallVector<uint32_t, 8> Weights;
- for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
- ++MD_i) {
- auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
- Weights.push_back(CI->getValue().getZExtValue());
- }
- // Merge weight of this case to the default weight.
- unsigned idx = i->getCaseIndex();
- Weights[0] += Weights[idx+1];
- // Remove weight for this case.
- std::swap(Weights[idx+1], Weights.back());
- Weights.pop_back();
- SI->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(Weights));
- }
- // Remove this entry.
- BasicBlock *ParentBB = SI->getParent();
- DefaultDest->removePredecessor(ParentBB);
- i = SI->removeCase(i);
- e = SI->case_end();
+ // Figure out which case it goes to.
+ for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+ // Found case matching a constant operand?
+ if (i->getCaseValue() == CI) {
+ TheOnlyDest = i->getCaseSuccessor();
+ break;
+ }
+
+ // Check to see if this branch is going to the same place as the default
+ // dest. If so, eliminate it as an explicit compare.
+ if (i->getCaseSuccessor() == DefaultDest) {
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ unsigned NCases = SI->getNumCases();
+ // Fold the case metadata into the default if there will be any branches
+ // left, unless the metadata doesn't match the switch.
+ if (NCases > 1 && MD && MD->getNumOperands() == 2 + NCases) {
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ auto *CI = mdconst::extract<ConstantInt>(MD->getOperand(MD_i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ // Merge weight of this case to the default weight.
+ unsigned idx = i->getCaseIndex();
+ Weights[0] += Weights[idx+1];
+ // Remove weight for this case.
+ std::swap(Weights[idx+1], Weights.back());
+ Weights.pop_back();
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(Weights));
+ }
+ // Remove this entry.
+ BasicBlock *ParentBB = SI->getParent();
+ DefaultDest->removePredecessor(ParentBB);
+ i = SI->removeCase(i);
+ e = SI->case_end();
Changed = true;
- continue;
- }
-
- // Otherwise, check to see if the switch only branches to one destination.
- // We do this by reseting "TheOnlyDest" to null when we find two non-equal
- // destinations.
- if (i->getCaseSuccessor() != TheOnlyDest)
- TheOnlyDest = nullptr;
-
- // Increment this iterator as we haven't removed the case.
- ++i;
- }
-
- if (CI && !TheOnlyDest) {
- // Branching on a constant, but not any of the cases, go to the default
- // successor.
- TheOnlyDest = SI->getDefaultDest();
- }
-
- // If we found a single destination that we can fold the switch into, do so
- // now.
- if (TheOnlyDest) {
- // Insert the new branch.
- Builder.CreateBr(TheOnlyDest);
- BasicBlock *BB = SI->getParent();
-
+ continue;
+ }
+
+ // Otherwise, check to see if the switch only branches to one destination.
+ // We do this by reseting "TheOnlyDest" to null when we find two non-equal
+ // destinations.
+ if (i->getCaseSuccessor() != TheOnlyDest)
+ TheOnlyDest = nullptr;
+
+ // Increment this iterator as we haven't removed the case.
+ ++i;
+ }
+
+ if (CI && !TheOnlyDest) {
+ // Branching on a constant, but not any of the cases, go to the default
+ // successor.
+ TheOnlyDest = SI->getDefaultDest();
+ }
+
+ // If we found a single destination that we can fold the switch into, do so
+ // now.
+ if (TheOnlyDest) {
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+ BasicBlock *BB = SI->getParent();
+
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
- // Remove entries from PHI nodes which we no longer branch to...
+ // Remove entries from PHI nodes which we no longer branch to...
BasicBlock *SuccToKeep = TheOnlyDest;
- for (BasicBlock *Succ : successors(SI)) {
+ for (BasicBlock *Succ : successors(SI)) {
if (DTU && Succ != TheOnlyDest)
RemovedSuccessors.insert(Succ);
- // Found case matching a constant operand?
+ // Found case matching a constant operand?
if (Succ == SuccToKeep) {
SuccToKeep = nullptr; // Don't modify the first branch to TheOnlyDest
- } else {
- Succ->removePredecessor(BB);
- }
- }
-
- // Delete the old switch.
- Value *Cond = SI->getCondition();
- SI->eraseFromParent();
- if (DeleteDeadConditions)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
+ } else {
+ Succ->removePredecessor(BB);
+ }
+ }
+
+ // Delete the old switch.
+ Value *Cond = SI->getCondition();
+ SI->eraseFromParent();
+ if (DeleteDeadConditions)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, TLI);
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccessors.size());
@@ -284,86 +284,86 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- return true;
- }
-
- if (SI->getNumCases() == 1) {
- // Otherwise, we can fold this switch into a conditional branch
- // instruction if it has only one non-default destination.
- auto FirstCase = *SI->case_begin();
- Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
- FirstCase.getCaseValue(), "cond");
-
- // Insert the new branch.
- BranchInst *NewBr = Builder.CreateCondBr(Cond,
- FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
- MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
- if (MD && MD->getNumOperands() == 3) {
- ConstantInt *SICase =
- mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
- ConstantInt *SIDef =
- mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
- assert(SICase && SIDef);
- // The TrueWeight should be the weight for the single case of SI.
- NewBr->setMetadata(LLVMContext::MD_prof,
- MDBuilder(BB->getContext()).
- createBranchWeights(SICase->getValue().getZExtValue(),
- SIDef->getValue().getZExtValue()));
- }
-
- // Update make.implicit metadata to the newly-created conditional branch.
- MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit);
- if (MakeImplicitMD)
- NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD);
-
- // Delete the old switch.
- SI->eraseFromParent();
- return true;
- }
+ return true;
+ }
+
+ if (SI->getNumCases() == 1) {
+ // Otherwise, we can fold this switch into a conditional branch
+ // instruction if it has only one non-default destination.
+ auto FirstCase = *SI->case_begin();
+ Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
+ FirstCase.getCaseValue(), "cond");
+
+ // Insert the new branch.
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef =
+ mdconst::dyn_extract<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
+ }
+
+ // Update make.implicit metadata to the newly-created conditional branch.
+ MDNode *MakeImplicitMD = SI->getMetadata(LLVMContext::MD_make_implicit);
+ if (MakeImplicitMD)
+ NewBr->setMetadata(LLVMContext::MD_make_implicit, MakeImplicitMD);
+
+ // Delete the old switch.
+ SI->eraseFromParent();
+ return true;
+ }
return Changed;
- }
-
- if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {
- // indirectbr blockaddress(@F, @BB) -> br label @BB
- if (auto *BA =
- dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
- BasicBlock *TheOnlyDest = BA->getBasicBlock();
+ }
+
+ if (auto *IBI = dyn_cast<IndirectBrInst>(T)) {
+ // indirectbr blockaddress(@F, @BB) -> br label @BB
+ if (auto *BA =
+ dyn_cast<BlockAddress>(IBI->getAddress()->stripPointerCasts())) {
+ BasicBlock *TheOnlyDest = BA->getBasicBlock();
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
-
- // Insert the new branch.
- Builder.CreateBr(TheOnlyDest);
-
+
+ // Insert the new branch.
+ Builder.CreateBr(TheOnlyDest);
+
BasicBlock *SuccToKeep = TheOnlyDest;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
BasicBlock *DestBB = IBI->getDestination(i);
if (DTU && DestBB != TheOnlyDest)
RemovedSuccessors.insert(DestBB);
if (IBI->getDestination(i) == SuccToKeep) {
SuccToKeep = nullptr;
- } else {
+ } else {
DestBB->removePredecessor(BB);
- }
- }
- Value *Address = IBI->getAddress();
- IBI->eraseFromParent();
- if (DeleteDeadConditions)
- // Delete pointer cast instructions.
- RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
-
- // Also zap the blockaddress constant if there are no users remaining,
- // otherwise the destination is still marked as having its address taken.
- if (BA->use_empty())
- BA->destroyConstant();
-
- // If we didn't find our destination in the IBI successor list, then we
- // have undefined behavior. Replace the unconditional branch with an
- // 'unreachable' instruction.
+ }
+ }
+ Value *Address = IBI->getAddress();
+ IBI->eraseFromParent();
+ if (DeleteDeadConditions)
+ // Delete pointer cast instructions.
+ RecursivelyDeleteTriviallyDeadInstructions(Address, TLI);
+
+ // Also zap the blockaddress constant if there are no users remaining,
+ // otherwise the destination is still marked as having its address taken.
+ if (BA->use_empty())
+ BA->destroyConstant();
+
+ // If we didn't find our destination in the IBI successor list, then we
+ // have undefined behavior. Replace the unconditional branch with an
+ // 'unreachable' instruction.
if (SuccToKeep) {
- BB->getTerminator()->eraseFromParent();
- new UnreachableInst(BB->getContext(), BB);
- }
-
+ BB->getTerminator()->eraseFromParent();
+ new UnreachableInst(BB->getContext(), BB);
+ }
+
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccessors.size());
@@ -371,562 +371,562 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- return true;
- }
- }
-
- return false;
-}
-
-//===----------------------------------------------------------------------===//
-// Local dead code elimination.
-//
-
-/// isInstructionTriviallyDead - Return true if the result produced by the
-/// instruction is not used, and the instruction has no side effects.
-///
-bool llvm::isInstructionTriviallyDead(Instruction *I,
- const TargetLibraryInfo *TLI) {
- if (!I->use_empty())
- return false;
- return wouldInstructionBeTriviallyDead(I, TLI);
-}
-
-bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
- const TargetLibraryInfo *TLI) {
- if (I->isTerminator())
- return false;
-
- // We don't want the landingpad-like instructions removed by anything this
- // general.
- if (I->isEHPad())
- return false;
-
- // We don't want debug info removed by anything this general, unless
- // debug info is empty.
- if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
- if (DDI->getAddress())
- return false;
- return true;
- }
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
- if (DVI->getValue())
- return false;
- return true;
- }
- if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
- if (DLI->getLabel())
- return false;
- return true;
- }
-
+ return true;
+ }
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Local dead code elimination.
+//
+
+/// isInstructionTriviallyDead - Return true if the result produced by the
+/// instruction is not used, and the instruction has no side effects.
+///
+bool llvm::isInstructionTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (!I->use_empty())
+ return false;
+ return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+ const TargetLibraryInfo *TLI) {
+ if (I->isTerminator())
+ return false;
+
+ // We don't want the landingpad-like instructions removed by anything this
+ // general.
+ if (I->isEHPad())
+ return false;
+
+ // We don't want debug info removed by anything this general, unless
+ // debug info is empty.
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(I)) {
+ if (DDI->getAddress())
+ return false;
+ return true;
+ }
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(I)) {
+ if (DVI->getValue())
+ return false;
+ return true;
+ }
+ if (DbgLabelInst *DLI = dyn_cast<DbgLabelInst>(I)) {
+ if (DLI->getLabel())
+ return false;
+ return true;
+ }
+
if (!I->willReturn())
return false;
- if (!I->mayHaveSideEffects())
- return true;
-
- // Special case intrinsics that "may have side effects" but can be deleted
- // when dead.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- // Safe to delete llvm.stacksave and launder.invariant.group if dead.
- if (II->getIntrinsicID() == Intrinsic::stacksave ||
- II->getIntrinsicID() == Intrinsic::launder_invariant_group)
- return true;
-
- if (II->isLifetimeStartOrEnd()) {
- auto *Arg = II->getArgOperand(1);
- // Lifetime intrinsics are dead when their right-hand is undef.
- if (isa<UndefValue>(Arg))
- return true;
- // If the right-hand is an alloc, global, or argument and the only uses
- // are lifetime intrinsics then the intrinsics are dead.
- if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
- return llvm::all_of(Arg->uses(), [](Use &Use) {
- if (IntrinsicInst *IntrinsicUse =
- dyn_cast<IntrinsicInst>(Use.getUser()))
- return IntrinsicUse->isLifetimeStartOrEnd();
- return false;
- });
- return false;
- }
-
- // Assumptions are dead if their condition is trivially true. Guards on
- // true are operationally no-ops. In the future we can consider more
- // sophisticated tradeoffs for guards considering potential for check
- // widening, but for now we keep things simple.
- if ((II->getIntrinsicID() == Intrinsic::assume &&
- isAssumeWithEmptyBundle(*II)) ||
- II->getIntrinsicID() == Intrinsic::experimental_guard) {
- if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
- return !Cond->isZero();
-
- return false;
- }
- }
-
- if (isAllocLikeFn(I, TLI))
- return true;
-
- if (CallInst *CI = isFreeCall(I, TLI))
- if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
- return C->isNullValue() || isa<UndefValue>(C);
-
- if (auto *Call = dyn_cast<CallBase>(I))
- if (isMathLibCallNoop(Call, TLI))
- return true;
-
- return false;
-}
-
-/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
-/// trivially dead instruction, delete it. If that makes any of its operands
-/// trivially dead, delete them too, recursively. Return true if any
-/// instructions were deleted.
-bool llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ if (!I->mayHaveSideEffects())
+ return true;
+
+ // Special case intrinsics that "may have side effects" but can be deleted
+ // when dead.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // Safe to delete llvm.stacksave and launder.invariant.group if dead.
+ if (II->getIntrinsicID() == Intrinsic::stacksave ||
+ II->getIntrinsicID() == Intrinsic::launder_invariant_group)
+ return true;
+
+ if (II->isLifetimeStartOrEnd()) {
+ auto *Arg = II->getArgOperand(1);
+ // Lifetime intrinsics are dead when their right-hand is undef.
+ if (isa<UndefValue>(Arg))
+ return true;
+ // If the right-hand is an alloc, global, or argument and the only uses
+ // are lifetime intrinsics then the intrinsics are dead.
+ if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
+ return llvm::all_of(Arg->uses(), [](Use &Use) {
+ if (IntrinsicInst *IntrinsicUse =
+ dyn_cast<IntrinsicInst>(Use.getUser()))
+ return IntrinsicUse->isLifetimeStartOrEnd();
+ return false;
+ });
+ return false;
+ }
+
+ // Assumptions are dead if their condition is trivially true. Guards on
+ // true are operationally no-ops. In the future we can consider more
+ // sophisticated tradeoffs for guards considering potential for check
+ // widening, but for now we keep things simple.
+ if ((II->getIntrinsicID() == Intrinsic::assume &&
+ isAssumeWithEmptyBundle(*II)) ||
+ II->getIntrinsicID() == Intrinsic::experimental_guard) {
+ if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
+ return !Cond->isZero();
+
+ return false;
+ }
+ }
+
+ if (isAllocLikeFn(I, TLI))
+ return true;
+
+ if (CallInst *CI = isFreeCall(I, TLI))
+ if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
+ return C->isNullValue() || isa<UndefValue>(C);
+
+ if (auto *Call = dyn_cast<CallBase>(I))
+ if (isMathLibCallNoop(Call, TLI))
+ return true;
+
+ return false;
+}
+
+/// RecursivelyDeleteTriviallyDeadInstructions - If the specified value is a
+/// trivially dead instruction, delete it. If that makes any of its operands
+/// trivially dead, delete them too, recursively. Return true if any
+/// instructions were deleted.
+bool llvm::RecursivelyDeleteTriviallyDeadInstructions(
Value *V, const TargetLibraryInfo *TLI, MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I || !isInstructionTriviallyDead(I, TLI))
- return false;
-
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- DeadInsts.push_back(I);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || !isInstructionTriviallyDead(I, TLI))
+ return false;
+
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ DeadInsts.push_back(I);
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU,
AboutToDeleteCallback);
-
- return true;
-}
-
-bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
- SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
+
+ return true;
+}
+
+bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- unsigned S = 0, E = DeadInsts.size(), Alive = 0;
- for (; S != E; ++S) {
- auto *I = cast<Instruction>(DeadInsts[S]);
- if (!isInstructionTriviallyDead(I)) {
- DeadInsts[S] = nullptr;
- ++Alive;
- }
- }
- if (Alive == E)
- return false;
+ unsigned S = 0, E = DeadInsts.size(), Alive = 0;
+ for (; S != E; ++S) {
+ auto *I = cast<Instruction>(DeadInsts[S]);
+ if (!isInstructionTriviallyDead(I)) {
+ DeadInsts[S] = nullptr;
+ ++Alive;
+ }
+ }
+ if (Alive == E)
+ return false;
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU,
AboutToDeleteCallback);
- return true;
-}
-
-void llvm::RecursivelyDeleteTriviallyDeadInstructions(
- SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
+ return true;
+}
+
+void llvm::RecursivelyDeleteTriviallyDeadInstructions(
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
MemorySSAUpdater *MSSAU,
std::function<void(Value *)> AboutToDeleteCallback) {
- // Process the dead instruction list until empty.
- while (!DeadInsts.empty()) {
- Value *V = DeadInsts.pop_back_val();
- Instruction *I = cast_or_null<Instruction>(V);
- if (!I)
- continue;
- assert(isInstructionTriviallyDead(I, TLI) &&
- "Live instruction found in dead worklist!");
- assert(I->use_empty() && "Instructions with uses are not dead.");
-
- // Don't lose the debug info while deleting the instructions.
- salvageDebugInfo(*I);
-
+ // Process the dead instruction list until empty.
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = cast_or_null<Instruction>(V);
+ if (!I)
+ continue;
+ assert(isInstructionTriviallyDead(I, TLI) &&
+ "Live instruction found in dead worklist!");
+ assert(I->use_empty() && "Instructions with uses are not dead.");
+
+ // Don't lose the debug info while deleting the instructions.
+ salvageDebugInfo(*I);
+
if (AboutToDeleteCallback)
AboutToDeleteCallback(I);
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (Use &OpU : I->operands()) {
- Value *OpV = OpU.get();
- OpU.set(nullptr);
-
- if (!OpV->use_empty())
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- DeadInsts.push_back(OpI);
- }
- if (MSSAU)
- MSSAU->removeMemoryAccess(I);
-
- I->eraseFromParent();
- }
-}
-
-bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, I);
- for (auto *DII : DbgUsers) {
- Value *Undef = UndefValue::get(I->getType());
- DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
- ValueAsMetadata::get(Undef)));
- }
- return !DbgUsers.empty();
-}
-
-/// areAllUsesEqual - Check whether the uses of a value are all the same.
-/// This is similar to Instruction::hasOneUse() except this will also return
-/// true when there are no uses or multiple uses that all refer to the same
-/// value.
-static bool areAllUsesEqual(Instruction *I) {
- Value::user_iterator UI = I->user_begin();
- Value::user_iterator UE = I->user_end();
- if (UI == UE)
- return true;
-
- User *TheUse = *UI;
- for (++UI; UI != UE; ++UI) {
- if (*UI != TheUse)
- return false;
- }
- return true;
-}
-
-/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
-/// dead PHI node, due to being a def-use chain of single-use nodes that
-/// either forms a cycle or is terminated by a trivially dead instruction,
-/// delete it. If that makes any of its operands trivially dead, delete them
-/// too, recursively. Return true if a change was made.
-bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
- const TargetLibraryInfo *TLI,
- llvm::MemorySSAUpdater *MSSAU) {
- SmallPtrSet<Instruction*, 4> Visited;
- for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
- I = cast<Instruction>(*I->user_begin())) {
- if (I->use_empty())
- return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
-
- // If we find an instruction more than once, we're on a cycle that
- // won't prove fruitful.
- if (!Visited.insert(I).second) {
- // Break the cycle and delete the instruction and its operands.
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
- (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
- return true;
- }
- }
- return false;
-}
-
-static bool
-simplifyAndDCEInstruction(Instruction *I,
- SmallSetVector<Instruction *, 16> &WorkList,
- const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (isInstructionTriviallyDead(I, TLI)) {
- salvageDebugInfo(*I);
-
- // Null out all of the instruction's operands to see if any operand becomes
- // dead as we go.
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
- Value *OpV = I->getOperand(i);
- I->setOperand(i, nullptr);
-
- if (!OpV->use_empty() || I == OpV)
- continue;
-
- // If the operand is an instruction that became dead as we nulled out the
- // operand, and if it is 'trivially' dead, delete it in a future loop
- // iteration.
- if (Instruction *OpI = dyn_cast<Instruction>(OpV))
- if (isInstructionTriviallyDead(OpI, TLI))
- WorkList.insert(OpI);
- }
-
- I->eraseFromParent();
-
- return true;
- }
-
- if (Value *SimpleV = SimplifyInstruction(I, DL)) {
- // Add the users to the worklist. CAREFUL: an instruction can use itself,
- // in the case of a phi node.
- for (User *U : I->users()) {
- if (U != I) {
- WorkList.insert(cast<Instruction>(U));
- }
- }
-
- // Replace the instruction with its simplified value.
- bool Changed = false;
- if (!I->use_empty()) {
- I->replaceAllUsesWith(SimpleV);
- Changed = true;
- }
- if (isInstructionTriviallyDead(I, TLI)) {
- I->eraseFromParent();
- Changed = true;
- }
- return Changed;
- }
- return false;
-}
-
-/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
-/// simplify any instructions in it and recursively delete dead instructions.
-///
-/// This returns true if it changed the code, note that it can delete
-/// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
- const TargetLibraryInfo *TLI) {
- bool MadeChange = false;
- const DataLayout &DL = BB->getModule()->getDataLayout();
-
-#ifndef NDEBUG
- // In debug builds, ensure that the terminator of the block is never replaced
- // or deleted by these simplifications. The idea of simplification is that it
- // cannot introduce new instructions, and there is no way to replace the
- // terminator of a block without introducing a new instruction.
- AssertingVH<Instruction> TerminatorVH(&BB->back());
-#endif
-
- SmallSetVector<Instruction *, 16> WorkList;
- // Iterate over the original function, only adding insts to the worklist
- // if they actually need to be revisited. This avoids having to pre-init
- // the worklist with the entire function's worth of instructions.
- for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
- BI != E;) {
- assert(!BI->isTerminator());
- Instruction *I = &*BI;
- ++BI;
-
- // We're visiting this instruction now, so make sure it's not in the
- // worklist from an earlier visit.
- if (!WorkList.count(I))
- MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
- }
-
- while (!WorkList.empty()) {
- Instruction *I = WorkList.pop_back_val();
- MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
- }
- return MadeChange;
-}
-
-//===----------------------------------------------------------------------===//
-// Control Flow Graph Restructuring.
-//
-
-void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
- DomTreeUpdater *DTU) {
-
- // If BB has single-entry PHI nodes, fold them.
- while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
- Value *NewVal = PN->getIncomingValue(0);
- // Replace self referencing PHI with undef, it must be dead.
- if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
- PN->replaceAllUsesWith(NewVal);
- PN->eraseFromParent();
- }
-
- BasicBlock *PredBB = DestBB->getSinglePredecessor();
- assert(PredBB && "Block doesn't have a single predecessor!");
-
- bool ReplaceEntryBB = false;
- if (PredBB == &DestBB->getParent()->getEntryBlock())
- ReplaceEntryBB = true;
-
- // DTU updates: Collect all the edges that enter
- // PredBB. These dominator edges will be redirected to DestBB.
- SmallVector<DominatorTree::UpdateType, 32> Updates;
-
- if (DTU) {
- for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
- // This predecessor of PredBB may already have DestBB as a successor.
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (Use &OpU : I->operands()) {
+ Value *OpV = OpU.get();
+ OpU.set(nullptr);
+
+ if (!OpV->use_empty())
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ DeadInsts.push_back(OpI);
+ }
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I);
+
+ I->eraseFromParent();
+ }
+}
+
+bool llvm::replaceDbgUsesWithUndef(Instruction *I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, I);
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(I->getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+ return !DbgUsers.empty();
+}
+
+/// areAllUsesEqual - Check whether the uses of a value are all the same.
+/// This is similar to Instruction::hasOneUse() except this will also return
+/// true when there are no uses or multiple uses that all refer to the same
+/// value.
+static bool areAllUsesEqual(Instruction *I) {
+ Value::user_iterator UI = I->user_begin();
+ Value::user_iterator UE = I->user_end();
+ if (UI == UE)
+ return true;
+
+ User *TheUse = *UI;
+ for (++UI; UI != UE; ++UI) {
+ if (*UI != TheUse)
+ return false;
+ }
+ return true;
+}
+
+/// RecursivelyDeleteDeadPHINode - If the specified value is an effectively
+/// dead PHI node, due to being a def-use chain of single-use nodes that
+/// either forms a cycle or is terminated by a trivially dead instruction,
+/// delete it. If that makes any of its operands trivially dead, delete them
+/// too, recursively. Return true if a change was made.
+bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
+ const TargetLibraryInfo *TLI,
+ llvm::MemorySSAUpdater *MSSAU) {
+ SmallPtrSet<Instruction*, 4> Visited;
+ for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
+ I = cast<Instruction>(*I->user_begin())) {
+ if (I->use_empty())
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
+
+ // If we find an instruction more than once, we're on a cycle that
+ // won't prove fruitful.
+ if (!Visited.insert(I).second) {
+ // Break the cycle and delete the instruction and its operands.
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
+ return true;
+ }
+ }
+ return false;
+}
+
+static bool
+simplifyAndDCEInstruction(Instruction *I,
+ SmallSetVector<Instruction *, 16> &WorkList,
+ const DataLayout &DL,
+ const TargetLibraryInfo *TLI) {
+ if (isInstructionTriviallyDead(I, TLI)) {
+ salvageDebugInfo(*I);
+
+ // Null out all of the instruction's operands to see if any operand becomes
+ // dead as we go.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+ Value *OpV = I->getOperand(i);
+ I->setOperand(i, nullptr);
+
+ if (!OpV->use_empty() || I == OpV)
+ continue;
+
+ // If the operand is an instruction that became dead as we nulled out the
+ // operand, and if it is 'trivially' dead, delete it in a future loop
+ // iteration.
+ if (Instruction *OpI = dyn_cast<Instruction>(OpV))
+ if (isInstructionTriviallyDead(OpI, TLI))
+ WorkList.insert(OpI);
+ }
+
+ I->eraseFromParent();
+
+ return true;
+ }
+
+ if (Value *SimpleV = SimplifyInstruction(I, DL)) {
+ // Add the users to the worklist. CAREFUL: an instruction can use itself,
+ // in the case of a phi node.
+ for (User *U : I->users()) {
+ if (U != I) {
+ WorkList.insert(cast<Instruction>(U));
+ }
+ }
+
+ // Replace the instruction with its simplified value.
+ bool Changed = false;
+ if (!I->use_empty()) {
+ I->replaceAllUsesWith(SimpleV);
+ Changed = true;
+ }
+ if (isInstructionTriviallyDead(I, TLI)) {
+ I->eraseFromParent();
+ Changed = true;
+ }
+ return Changed;
+ }
+ return false;
+}
+
+/// SimplifyInstructionsInBlock - Scan the specified basic block and try to
+/// simplify any instructions in it and recursively delete dead instructions.
+///
+/// This returns true if it changed the code, note that it can delete
+/// instructions in other blocks as well in this block.
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB,
+ const TargetLibraryInfo *TLI) {
+ bool MadeChange = false;
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+
+#ifndef NDEBUG
+ // In debug builds, ensure that the terminator of the block is never replaced
+ // or deleted by these simplifications. The idea of simplification is that it
+ // cannot introduce new instructions, and there is no way to replace the
+ // terminator of a block without introducing a new instruction.
+ AssertingVH<Instruction> TerminatorVH(&BB->back());
+#endif
+
+ SmallSetVector<Instruction *, 16> WorkList;
+ // Iterate over the original function, only adding insts to the worklist
+ // if they actually need to be revisited. This avoids having to pre-init
+ // the worklist with the entire function's worth of instructions.
+ for (BasicBlock::iterator BI = BB->begin(), E = std::prev(BB->end());
+ BI != E;) {
+ assert(!BI->isTerminator());
+ Instruction *I = &*BI;
+ ++BI;
+
+ // We're visiting this instruction now, so make sure it's not in the
+ // worklist from an earlier visit.
+ if (!WorkList.count(I))
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ MadeChange |= simplifyAndDCEInstruction(I, WorkList, DL, TLI);
+ }
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Control Flow Graph Restructuring.
+//
+
+void llvm::MergeBasicBlockIntoOnlyPred(BasicBlock *DestBB,
+ DomTreeUpdater *DTU) {
+
+ // If BB has single-entry PHI nodes, fold them.
+ while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
+ Value *NewVal = PN->getIncomingValue(0);
+ // Replace self referencing PHI with undef, it must be dead.
+ if (NewVal == PN) NewVal = UndefValue::get(PN->getType());
+ PN->replaceAllUsesWith(NewVal);
+ PN->eraseFromParent();
+ }
+
+ BasicBlock *PredBB = DestBB->getSinglePredecessor();
+ assert(PredBB && "Block doesn't have a single predecessor!");
+
+ bool ReplaceEntryBB = false;
+ if (PredBB == &DestBB->getParent()->getEntryBlock())
+ ReplaceEntryBB = true;
+
+ // DTU updates: Collect all the edges that enter
+ // PredBB. These dominator edges will be redirected to DestBB.
+ SmallVector<DominatorTree::UpdateType, 32> Updates;
+
+ if (DTU) {
+ for (auto I = pred_begin(PredBB), E = pred_end(PredBB); I != E; ++I) {
+ // This predecessor of PredBB may already have DestBB as a successor.
if (!llvm::is_contained(successors(*I), DestBB))
- Updates.push_back({DominatorTree::Insert, *I, DestBB});
+ Updates.push_back({DominatorTree::Insert, *I, DestBB});
Updates.push_back({DominatorTree::Delete, *I, PredBB});
- }
+ }
Updates.push_back({DominatorTree::Delete, PredBB, DestBB});
- }
-
- // Zap anything that took the address of DestBB. Not doing this will give the
- // address an invalid value.
- if (DestBB->hasAddressTaken()) {
- BlockAddress *BA = BlockAddress::get(DestBB);
- Constant *Replacement =
- ConstantInt::get(Type::getInt32Ty(BA->getContext()), 1);
- BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
- BA->getType()));
- BA->destroyConstant();
- }
-
- // Anything that branched to PredBB now branches to DestBB.
- PredBB->replaceAllUsesWith(DestBB);
-
- // Splice all the instructions from PredBB to DestBB.
- PredBB->getTerminator()->eraseFromParent();
- DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
- new UnreachableInst(PredBB->getContext(), PredBB);
-
- // If the PredBB is the entry block of the function, move DestBB up to
- // become the entry block after we erase PredBB.
- if (ReplaceEntryBB)
- DestBB->moveAfter(PredBB);
-
- if (DTU) {
- assert(PredBB->getInstList().size() == 1 &&
- isa<UnreachableInst>(PredBB->getTerminator()) &&
- "The successor list of PredBB isn't empty before "
- "applying corresponding DTU updates.");
- DTU->applyUpdatesPermissive(Updates);
- DTU->deleteBB(PredBB);
- // Recalculation of DomTree is needed when updating a forward DomTree and
- // the Entry BB is replaced.
- if (ReplaceEntryBB && DTU->hasDomTree()) {
- // The entry block was removed and there is no external interface for
- // the dominator tree to be notified of this change. In this corner-case
- // we recalculate the entire tree.
- DTU->recalculate(*(DestBB->getParent()));
- }
- }
-
- else {
- PredBB->eraseFromParent(); // Nuke BB if DTU is nullptr.
- }
-}
-
-/// Return true if we can choose one of these values to use in place of the
-/// other. Note that we will always choose the non-undef value to keep.
-static bool CanMergeValues(Value *First, Value *Second) {
- return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
-}
-
-/// Return true if we can fold BB, an almost-empty BB ending in an unconditional
-/// branch to Succ, into Succ.
-///
-/// Assumption: Succ is the single successor for BB.
-static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
- assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
-
- LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
- << Succ->getName() << "\n");
- // Shortcut, if there is only a single predecessor it must be BB and merging
- // is always safe
- if (Succ->getSinglePredecessor()) return true;
-
- // Make a list of the predecessors of BB
- SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
-
- // Look at all the phi nodes in Succ, to see if they present a conflict when
- // merging these blocks
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- // If the incoming value from BB is again a PHINode in
- // BB which has the same incoming value for *PI as PN does, we can
- // merge the phi nodes and then the blocks can still be merged
- PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
- if (BBPN && BBPN->getParent() == BB) {
- for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
- BasicBlock *IBB = PN->getIncomingBlock(PI);
- if (BBPreds.count(IBB) &&
- !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
- PN->getIncomingValue(PI))) {
- LLVM_DEBUG(dbgs()
- << "Can't fold, phi node " << PN->getName() << " in "
- << Succ->getName() << " is conflicting with "
- << BBPN->getName() << " with regard to common predecessor "
- << IBB->getName() << "\n");
- return false;
- }
- }
- } else {
- Value* Val = PN->getIncomingValueForBlock(BB);
- for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
- // See if the incoming value for the common predecessor is equal to the
- // one for BB, in which case this phi node will not prevent the merging
- // of the block.
- BasicBlock *IBB = PN->getIncomingBlock(PI);
- if (BBPreds.count(IBB) &&
- !CanMergeValues(Val, PN->getIncomingValue(PI))) {
- LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
- << " in " << Succ->getName()
- << " is conflicting with regard to common "
- << "predecessor " << IBB->getName() << "\n");
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
-using PredBlockVector = SmallVector<BasicBlock *, 16>;
-using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
-
-/// Determines the value to use as the phi node input for a block.
-///
-/// Select between \p OldVal any value that we know flows from \p BB
-/// to a particular phi on the basis of which one (if either) is not
-/// undef. Update IncomingValues based on the selected value.
-///
-/// \param OldVal The value we are considering selecting.
-/// \param BB The block that the value flows in from.
-/// \param IncomingValues A map from block-to-value for other phi inputs
-/// that we have examined.
-///
-/// \returns the selected value.
-static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
- IncomingValueMap &IncomingValues) {
- if (!isa<UndefValue>(OldVal)) {
- assert((!IncomingValues.count(BB) ||
- IncomingValues.find(BB)->second == OldVal) &&
- "Expected OldVal to match incoming value from BB!");
-
- IncomingValues.insert(std::make_pair(BB, OldVal));
- return OldVal;
- }
-
- IncomingValueMap::const_iterator It = IncomingValues.find(BB);
- if (It != IncomingValues.end()) return It->second;
-
- return OldVal;
-}
-
-/// Create a map from block to value for the operands of a
-/// given phi.
-///
-/// Create a map from block to value for each non-undef value flowing
-/// into \p PN.
-///
-/// \param PN The phi we are collecting the map for.
-/// \param IncomingValues [out] The map from block to value for this phi.
-static void gatherIncomingValuesToPhi(PHINode *PN,
- IncomingValueMap &IncomingValues) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *BB = PN->getIncomingBlock(i);
- Value *V = PN->getIncomingValue(i);
-
- if (!isa<UndefValue>(V))
- IncomingValues.insert(std::make_pair(BB, V));
- }
-}
-
-/// Replace the incoming undef values to a phi with the values
-/// from a block-to-value map.
-///
-/// \param PN The phi we are replacing the undefs in.
-/// \param IncomingValues A map from block to value.
-static void replaceUndefValuesInPhi(PHINode *PN,
- const IncomingValueMap &IncomingValues) {
+ }
+
+ // Zap anything that took the address of DestBB. Not doing this will give the
+ // address an invalid value.
+ if (DestBB->hasAddressTaken()) {
+ BlockAddress *BA = BlockAddress::get(DestBB);
+ Constant *Replacement =
+ ConstantInt::get(Type::getInt32Ty(BA->getContext()), 1);
+ BA->replaceAllUsesWith(ConstantExpr::getIntToPtr(Replacement,
+ BA->getType()));
+ BA->destroyConstant();
+ }
+
+ // Anything that branched to PredBB now branches to DestBB.
+ PredBB->replaceAllUsesWith(DestBB);
+
+ // Splice all the instructions from PredBB to DestBB.
+ PredBB->getTerminator()->eraseFromParent();
+ DestBB->getInstList().splice(DestBB->begin(), PredBB->getInstList());
+ new UnreachableInst(PredBB->getContext(), PredBB);
+
+ // If the PredBB is the entry block of the function, move DestBB up to
+ // become the entry block after we erase PredBB.
+ if (ReplaceEntryBB)
+ DestBB->moveAfter(PredBB);
+
+ if (DTU) {
+ assert(PredBB->getInstList().size() == 1 &&
+ isa<UnreachableInst>(PredBB->getTerminator()) &&
+ "The successor list of PredBB isn't empty before "
+ "applying corresponding DTU updates.");
+ DTU->applyUpdatesPermissive(Updates);
+ DTU->deleteBB(PredBB);
+ // Recalculation of DomTree is needed when updating a forward DomTree and
+ // the Entry BB is replaced.
+ if (ReplaceEntryBB && DTU->hasDomTree()) {
+ // The entry block was removed and there is no external interface for
+ // the dominator tree to be notified of this change. In this corner-case
+ // we recalculate the entire tree.
+ DTU->recalculate(*(DestBB->getParent()));
+ }
+ }
+
+ else {
+ PredBB->eraseFromParent(); // Nuke BB if DTU is nullptr.
+ }
+}
+
+/// Return true if we can choose one of these values to use in place of the
+/// other. Note that we will always choose the non-undef value to keep.
+static bool CanMergeValues(Value *First, Value *Second) {
+ return First == Second || isa<UndefValue>(First) || isa<UndefValue>(Second);
+}
+
+/// Return true if we can fold BB, an almost-empty BB ending in an unconditional
+/// branch to Succ, into Succ.
+///
+/// Assumption: Succ is the single successor for BB.
+static bool CanPropagatePredecessorsForPHIs(BasicBlock *BB, BasicBlock *Succ) {
+ assert(*succ_begin(BB) == Succ && "Succ is not successor of BB!");
+
+ LLVM_DEBUG(dbgs() << "Looking to fold " << BB->getName() << " into "
+ << Succ->getName() << "\n");
+ // Shortcut, if there is only a single predecessor it must be BB and merging
+ // is always safe
+ if (Succ->getSinglePredecessor()) return true;
+
+ // Make a list of the predecessors of BB
+ SmallPtrSet<BasicBlock*, 16> BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Look at all the phi nodes in Succ, to see if they present a conflict when
+ // merging these blocks
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // If the incoming value from BB is again a PHINode in
+ // BB which has the same incoming value for *PI as PN does, we can
+ // merge the phi nodes and then the blocks can still be merged
+ PHINode *BBPN = dyn_cast<PHINode>(PN->getIncomingValueForBlock(BB));
+ if (BBPN && BBPN->getParent() == BB) {
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(BBPN->getIncomingValueForBlock(IBB),
+ PN->getIncomingValue(PI))) {
+ LLVM_DEBUG(dbgs()
+ << "Can't fold, phi node " << PN->getName() << " in "
+ << Succ->getName() << " is conflicting with "
+ << BBPN->getName() << " with regard to common predecessor "
+ << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ } else {
+ Value* Val = PN->getIncomingValueForBlock(BB);
+ for (unsigned PI = 0, PE = PN->getNumIncomingValues(); PI != PE; ++PI) {
+ // See if the incoming value for the common predecessor is equal to the
+ // one for BB, in which case this phi node will not prevent the merging
+ // of the block.
+ BasicBlock *IBB = PN->getIncomingBlock(PI);
+ if (BBPreds.count(IBB) &&
+ !CanMergeValues(Val, PN->getIncomingValue(PI))) {
+ LLVM_DEBUG(dbgs() << "Can't fold, phi node " << PN->getName()
+ << " in " << Succ->getName()
+ << " is conflicting with regard to common "
+ << "predecessor " << IBB->getName() << "\n");
+ return false;
+ }
+ }
+ }
+ }
+
+ return true;
+}
+
+using PredBlockVector = SmallVector<BasicBlock *, 16>;
+using IncomingValueMap = DenseMap<BasicBlock *, Value *>;
+
+/// Determines the value to use as the phi node input for a block.
+///
+/// Select between \p OldVal any value that we know flows from \p BB
+/// to a particular phi on the basis of which one (if either) is not
+/// undef. Update IncomingValues based on the selected value.
+///
+/// \param OldVal The value we are considering selecting.
+/// \param BB The block that the value flows in from.
+/// \param IncomingValues A map from block-to-value for other phi inputs
+/// that we have examined.
+///
+/// \returns the selected value.
+static Value *selectIncomingValueForBlock(Value *OldVal, BasicBlock *BB,
+ IncomingValueMap &IncomingValues) {
+ if (!isa<UndefValue>(OldVal)) {
+ assert((!IncomingValues.count(BB) ||
+ IncomingValues.find(BB)->second == OldVal) &&
+ "Expected OldVal to match incoming value from BB!");
+
+ IncomingValues.insert(std::make_pair(BB, OldVal));
+ return OldVal;
+ }
+
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+ if (It != IncomingValues.end()) return It->second;
+
+ return OldVal;
+}
+
+/// Create a map from block to value for the operands of a
+/// given phi.
+///
+/// Create a map from block to value for each non-undef value flowing
+/// into \p PN.
+///
+/// \param PN The phi we are collecting the map for.
+/// \param IncomingValues [out] The map from block to value for this phi.
+static void gatherIncomingValuesToPhi(PHINode *PN,
+ IncomingValueMap &IncomingValues) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V))
+ IncomingValues.insert(std::make_pair(BB, V));
+ }
+}
+
+/// Replace the incoming undef values to a phi with the values
+/// from a block-to-value map.
+///
+/// \param PN The phi we are replacing the undefs in.
+/// \param IncomingValues A map from block to value.
+static void replaceUndefValuesInPhi(PHINode *PN,
+ const IncomingValueMap &IncomingValues) {
SmallVector<unsigned> TrueUndefOps;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = PN->getIncomingValue(i);
-
- if (!isa<UndefValue>(V)) continue;
-
- BasicBlock *BB = PN->getIncomingBlock(i);
- IncomingValueMap::const_iterator It = IncomingValues.find(BB);
-
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+
+ if (!isa<UndefValue>(V)) continue;
+
+ BasicBlock *BB = PN->getIncomingBlock(i);
+ IncomingValueMap::const_iterator It = IncomingValues.find(BB);
+
// Keep track of undef/poison incoming values. Those must match, so we fix
// them up below if needed.
// Note: this is conservatively correct, but we could try harder and group
@@ -938,8 +938,8 @@ static void replaceUndefValuesInPhi(PHINode *PN,
// There is a defined value for this incoming block, so map this undef
// incoming value to the defined value.
- PN->setIncomingValue(i, It->second);
- }
+ PN->setIncomingValue(i, It->second);
+ }
// If there are both undef and poison values incoming, then convert those
// values to undef. It is invalid to have different values for the same
@@ -951,204 +951,204 @@ static void replaceUndefValuesInPhi(PHINode *PN,
for (unsigned i : TrueUndefOps)
PN->setIncomingValue(i, UndefValue::get(PN->getType()));
}
-}
-
-/// Replace a value flowing from a block to a phi with
-/// potentially multiple instances of that value flowing from the
-/// block's predecessors to the phi.
-///
-/// \param BB The block with the value flowing into the phi.
-/// \param BBPreds The predecessors of BB.
-/// \param PN The phi that we are updating.
-static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
- const PredBlockVector &BBPreds,
- PHINode *PN) {
- Value *OldVal = PN->removeIncomingValue(BB, false);
- assert(OldVal && "No entry in PHI for Pred BB!");
-
- IncomingValueMap IncomingValues;
-
- // We are merging two blocks - BB, and the block containing PN - and
- // as a result we need to redirect edges from the predecessors of BB
- // to go to the block containing PN, and update PN
- // accordingly. Since we allow merging blocks in the case where the
- // predecessor and successor blocks both share some predecessors,
- // and where some of those common predecessors might have undef
- // values flowing into PN, we want to rewrite those values to be
- // consistent with the non-undef values.
-
- gatherIncomingValuesToPhi(PN, IncomingValues);
-
- // If this incoming value is one of the PHI nodes in BB, the new entries
- // in the PHI node are the entries from the old PHI.
- if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
- PHINode *OldValPN = cast<PHINode>(OldVal);
- for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
- // Note that, since we are merging phi nodes and BB and Succ might
- // have common predecessors, we could end up with a phi node with
- // identical incoming branches. This will be cleaned up later (and
- // will trigger asserts if we try to clean it up now, without also
- // simplifying the corresponding conditional branch).
- BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
- Value *PredVal = OldValPN->getIncomingValue(i);
- Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
- IncomingValues);
-
- // And add a new incoming value for this predecessor for the
- // newly retargeted branch.
- PN->addIncoming(Selected, PredBB);
- }
- } else {
- for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
- // Update existing incoming values in PN for this
- // predecessor of BB.
- BasicBlock *PredBB = BBPreds[i];
- Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
- IncomingValues);
-
- // And add a new incoming value for this predecessor for the
- // newly retargeted branch.
- PN->addIncoming(Selected, PredBB);
- }
- }
-
- replaceUndefValuesInPhi(PN, IncomingValues);
-}
-
-bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
- DomTreeUpdater *DTU) {
- assert(BB != &BB->getParent()->getEntryBlock() &&
- "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
-
- // We can't eliminate infinite loops.
- BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
- if (BB == Succ) return false;
-
- // Check to see if merging these blocks would cause conflicts for any of the
- // phi nodes in BB or Succ. If not, we can safely merge.
- if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
-
- // Check for cases where Succ has multiple predecessors and a PHI node in BB
- // has uses which will not disappear when the PHI nodes are merged. It is
- // possible to handle such cases, but difficult: it requires checking whether
- // BB dominates Succ, which is non-trivial to calculate in the case where
- // Succ has multiple predecessors. Also, it requires checking whether
- // constructing the necessary self-referential PHI node doesn't introduce any
- // conflicts; this isn't too difficult, but the previous code for doing this
- // was incorrect.
- //
- // Note that if this check finds a live use, BB dominates Succ, so BB is
- // something like a loop pre-header (or rarely, a part of an irreducible CFG);
- // folding the branch isn't profitable in that case anyway.
- if (!Succ->getSinglePredecessor()) {
- BasicBlock::iterator BBI = BB->begin();
- while (isa<PHINode>(*BBI)) {
- for (Use &U : BBI->uses()) {
- if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) {
- if (PN->getIncomingBlock(U) != BB)
- return false;
- } else {
- return false;
- }
- }
- ++BBI;
- }
- }
-
- // We cannot fold the block if it's a branch to an already present callbr
- // successor because that creates duplicate successors.
- for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
- if (auto *CBI = dyn_cast<CallBrInst>((*I)->getTerminator())) {
- if (Succ == CBI->getDefaultDest())
- return false;
- for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i)
- if (Succ == CBI->getIndirectDest(i))
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
-
- SmallVector<DominatorTree::UpdateType, 32> Updates;
- if (DTU) {
- // All predecessors of BB will be moved to Succ.
+}
+
+/// Replace a value flowing from a block to a phi with
+/// potentially multiple instances of that value flowing from the
+/// block's predecessors to the phi.
+///
+/// \param BB The block with the value flowing into the phi.
+/// \param BBPreds The predecessors of BB.
+/// \param PN The phi that we are updating.
+static void redirectValuesFromPredecessorsToPhi(BasicBlock *BB,
+ const PredBlockVector &BBPreds,
+ PHINode *PN) {
+ Value *OldVal = PN->removeIncomingValue(BB, false);
+ assert(OldVal && "No entry in PHI for Pred BB!");
+
+ IncomingValueMap IncomingValues;
+
+ // We are merging two blocks - BB, and the block containing PN - and
+ // as a result we need to redirect edges from the predecessors of BB
+ // to go to the block containing PN, and update PN
+ // accordingly. Since we allow merging blocks in the case where the
+ // predecessor and successor blocks both share some predecessors,
+ // and where some of those common predecessors might have undef
+ // values flowing into PN, we want to rewrite those values to be
+ // consistent with the non-undef values.
+
+ gatherIncomingValuesToPhi(PN, IncomingValues);
+
+ // If this incoming value is one of the PHI nodes in BB, the new entries
+ // in the PHI node are the entries from the old PHI.
+ if (isa<PHINode>(OldVal) && cast<PHINode>(OldVal)->getParent() == BB) {
+ PHINode *OldValPN = cast<PHINode>(OldVal);
+ for (unsigned i = 0, e = OldValPN->getNumIncomingValues(); i != e; ++i) {
+ // Note that, since we are merging phi nodes and BB and Succ might
+ // have common predecessors, we could end up with a phi node with
+ // identical incoming branches. This will be cleaned up later (and
+ // will trigger asserts if we try to clean it up now, without also
+ // simplifying the corresponding conditional branch).
+ BasicBlock *PredBB = OldValPN->getIncomingBlock(i);
+ Value *PredVal = OldValPN->getIncomingValue(i);
+ Value *Selected = selectIncomingValueForBlock(PredVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ } else {
+ for (unsigned i = 0, e = BBPreds.size(); i != e; ++i) {
+ // Update existing incoming values in PN for this
+ // predecessor of BB.
+ BasicBlock *PredBB = BBPreds[i];
+ Value *Selected = selectIncomingValueForBlock(OldVal, PredBB,
+ IncomingValues);
+
+ // And add a new incoming value for this predecessor for the
+ // newly retargeted branch.
+ PN->addIncoming(Selected, PredBB);
+ }
+ }
+
+ replaceUndefValuesInPhi(PN, IncomingValues);
+}
+
+bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB,
+ DomTreeUpdater *DTU) {
+ assert(BB != &BB->getParent()->getEntryBlock() &&
+ "TryToSimplifyUncondBranchFromEmptyBlock called on entry block!");
+
+ // We can't eliminate infinite loops.
+ BasicBlock *Succ = cast<BranchInst>(BB->getTerminator())->getSuccessor(0);
+ if (BB == Succ) return false;
+
+ // Check to see if merging these blocks would cause conflicts for any of the
+ // phi nodes in BB or Succ. If not, we can safely merge.
+ if (!CanPropagatePredecessorsForPHIs(BB, Succ)) return false;
+
+ // Check for cases where Succ has multiple predecessors and a PHI node in BB
+ // has uses which will not disappear when the PHI nodes are merged. It is
+ // possible to handle such cases, but difficult: it requires checking whether
+ // BB dominates Succ, which is non-trivial to calculate in the case where
+ // Succ has multiple predecessors. Also, it requires checking whether
+ // constructing the necessary self-referential PHI node doesn't introduce any
+ // conflicts; this isn't too difficult, but the previous code for doing this
+ // was incorrect.
+ //
+ // Note that if this check finds a live use, BB dominates Succ, so BB is
+ // something like a loop pre-header (or rarely, a part of an irreducible CFG);
+ // folding the branch isn't profitable in that case anyway.
+ if (!Succ->getSinglePredecessor()) {
+ BasicBlock::iterator BBI = BB->begin();
+ while (isa<PHINode>(*BBI)) {
+ for (Use &U : BBI->uses()) {
+ if (PHINode* PN = dyn_cast<PHINode>(U.getUser())) {
+ if (PN->getIncomingBlock(U) != BB)
+ return false;
+ } else {
+ return false;
+ }
+ }
+ ++BBI;
+ }
+ }
+
+ // We cannot fold the block if it's a branch to an already present callbr
+ // successor because that creates duplicate successors.
+ for (auto I = pred_begin(BB), E = pred_end(BB); I != E; ++I) {
+ if (auto *CBI = dyn_cast<CallBrInst>((*I)->getTerminator())) {
+ if (Succ == CBI->getDefaultDest())
+ return false;
+ for (unsigned i = 0, e = CBI->getNumIndirectDests(); i != e; ++i)
+ if (Succ == CBI->getIndirectDest(i))
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Killing Trivial BB: \n" << *BB);
+
+ SmallVector<DominatorTree::UpdateType, 32> Updates;
+ if (DTU) {
+ // All predecessors of BB will be moved to Succ.
SmallSetVector<BasicBlock *, 8> Predecessors(pred_begin(BB), pred_end(BB));
Updates.reserve(Updates.size() + 2 * Predecessors.size());
for (auto *Predecessor : Predecessors) {
- // This predecessor of BB may already have Succ as a successor.
+ // This predecessor of BB may already have Succ as a successor.
if (!llvm::is_contained(successors(Predecessor), Succ))
Updates.push_back({DominatorTree::Insert, Predecessor, Succ});
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- }
+ }
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- if (isa<PHINode>(Succ->begin())) {
- // If there is more than one pred of succ, and there are PHI nodes in
- // the successor, then we need to add incoming edges for the PHI nodes
- //
- const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
-
- // Loop over all of the PHI nodes in the successor of BB.
- for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
- }
- }
-
- if (Succ->getSinglePredecessor()) {
- // BB is the only predecessor of Succ, so Succ will end up with exactly
- // the same predecessors BB had.
-
- // Copy over any phi, debug or lifetime instruction.
- BB->getTerminator()->eraseFromParent();
- Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(),
- BB->getInstList());
- } else {
- while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
- // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
- assert(PN->use_empty() && "There shouldn't be any uses here!");
- PN->eraseFromParent();
- }
- }
-
- // If the unconditional branch we replaced contains llvm.loop metadata, we
- // add the metadata to the branch instructions in the predecessors.
- unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
- Instruction *TI = BB->getTerminator();
- if (TI)
- if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *Pred = *PI;
- Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
- }
-
- // Everything that jumped to BB now goes to Succ.
- BB->replaceAllUsesWith(Succ);
- if (!Succ->hasName()) Succ->takeName(BB);
-
- // Clear the successor list of BB to match updates applying to DTU later.
- if (BB->getTerminator())
- BB->getInstList().pop_back();
- new UnreachableInst(BB->getContext(), BB);
- assert(succ_empty(BB) && "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
-
- if (DTU) {
+ }
+
+ if (isa<PHINode>(Succ->begin())) {
+ // If there is more than one pred of succ, and there are PHI nodes in
+ // the successor, then we need to add incoming edges for the PHI nodes
+ //
+ const PredBlockVector BBPreds(pred_begin(BB), pred_end(BB));
+
+ // Loop over all of the PHI nodes in the successor of BB.
+ for (BasicBlock::iterator I = Succ->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ redirectValuesFromPredecessorsToPhi(BB, BBPreds, PN);
+ }
+ }
+
+ if (Succ->getSinglePredecessor()) {
+ // BB is the only predecessor of Succ, so Succ will end up with exactly
+ // the same predecessors BB had.
+
+ // Copy over any phi, debug or lifetime instruction.
+ BB->getTerminator()->eraseFromParent();
+ Succ->getInstList().splice(Succ->getFirstNonPHI()->getIterator(),
+ BB->getInstList());
+ } else {
+ while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+ // We explicitly check for such uses in CanPropagatePredecessorsForPHIs.
+ assert(PN->use_empty() && "There shouldn't be any uses here!");
+ PN->eraseFromParent();
+ }
+ }
+
+ // If the unconditional branch we replaced contains llvm.loop metadata, we
+ // add the metadata to the branch instructions in the predecessors.
+ unsigned LoopMDKind = BB->getContext().getMDKindID("llvm.loop");
+ Instruction *TI = BB->getTerminator();
+ if (TI)
+ if (MDNode *LoopMD = TI->getMetadata(LoopMDKind))
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ Pred->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+ }
+
+ // Everything that jumped to BB now goes to Succ.
+ BB->replaceAllUsesWith(Succ);
+ if (!Succ->hasName()) Succ->takeName(BB);
+
+ // Clear the successor list of BB to match updates applying to DTU later.
+ if (BB->getTerminator())
+ BB->getInstList().pop_back();
+ new UnreachableInst(BB->getContext(), BB);
+ assert(succ_empty(BB) && "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
+
+ if (DTU) {
DTU->applyUpdates(Updates);
- DTU->deleteBB(BB);
- } else {
- BB->eraseFromParent(); // Delete the old basic block.
- }
- return true;
-}
-
+ DTU->deleteBB(BB);
+ } else {
+ BB->eraseFromParent(); // Delete the old basic block.
+ }
+ return true;
+}
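+// Illustrative usage sketch (not part of the original source): a caller that
+// walks a function and folds empty forwarding blocks. It assumes each
+// candidate block consists of nothing but an unconditional branch; the
+// implementation above accepts a null DomTreeUpdater.
+//
+//   for (BasicBlock &B : llvm::make_early_inc_range(F)) {
+//     if (&B == &F.getEntryBlock())
+//       continue;
+//     auto *BI = dyn_cast<BranchInst>(B.getTerminator());
+//     if (BI && BI->isUnconditional() && &B.front() == BI)
+//       TryToSimplifyUncondBranchFromEmptyBlock(&B, /*DTU=*/nullptr);
+//   }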
+
static bool EliminateDuplicatePHINodesNaiveImpl(BasicBlock *BB) {
- // This implementation doesn't currently consider undef operands
- // specially. Theoretically, two phis which are identical except for
- // one having an undef where the other doesn't could be collapsed.
-
+ // This implementation doesn't currently consider undef operands
+ // specially. Theoretically, two phis which are identical except for
+ // one having an undef where the other doesn't could be collapsed.
+
bool Changed = false;
// Examine each PHI.
@@ -1181,15 +1181,15 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
// specially. Theoretically, two phis which are identical except for
// one having an undef where the other doesn't could be collapsed.
- struct PHIDenseMapInfo {
- static PHINode *getEmptyKey() {
- return DenseMapInfo<PHINode *>::getEmptyKey();
- }
-
- static PHINode *getTombstoneKey() {
- return DenseMapInfo<PHINode *>::getTombstoneKey();
- }
-
+ struct PHIDenseMapInfo {
+ static PHINode *getEmptyKey() {
+ return DenseMapInfo<PHINode *>::getEmptyKey();
+ }
+
+ static PHINode *getTombstoneKey() {
+ return DenseMapInfo<PHINode *>::getTombstoneKey();
+ }
+
static bool isSentinel(PHINode *PN) {
return PN == getEmptyKey() || PN == getTombstoneKey();
}
@@ -1197,14 +1197,14 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
// WARNING: this logic must be kept in sync with
// Instruction::isIdenticalToWhenDefined()!
static unsigned getHashValueImpl(PHINode *PN) {
- // Compute a hash value on the operands. Instcombine will likely have
- // sorted them, which helps expose duplicates, but we have to check all
- // the operands to be safe in case instcombine hasn't run.
- return static_cast<unsigned>(hash_combine(
- hash_combine_range(PN->value_op_begin(), PN->value_op_end()),
- hash_combine_range(PN->block_begin(), PN->block_end())));
- }
-
+ // Compute a hash value on the operands. Instcombine will likely have
+ // sorted them, which helps expose duplicates, but we have to check all
+ // the operands to be safe in case instcombine hasn't run.
+ return static_cast<unsigned>(hash_combine(
+ hash_combine_range(PN->value_op_begin(), PN->value_op_end()),
+ hash_combine_range(PN->block_begin(), PN->block_end())));
+ }
+
static unsigned getHashValue(PHINode *PN) {
#ifndef NDEBUG
// If -phicse-debug-hash was specified, return a constant -- this
@@ -1219,9 +1219,9 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
static bool isEqualImpl(PHINode *LHS, PHINode *RHS) {
if (isSentinel(LHS) || isSentinel(RHS))
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
static bool isEqual(PHINode *LHS, PHINode *RHS) {
// These comparisons are nontrivial, so assert that equality implies
@@ -1231,33 +1231,33 @@ static bool EliminateDuplicatePHINodesSetBasedImpl(BasicBlock *BB) {
getHashValueImpl(LHS) == getHashValueImpl(RHS));
return Result;
}
- };
-
- // Set of unique PHINodes.
- DenseSet<PHINode *, PHIDenseMapInfo> PHISet;
+ };
+
+ // Set of unique PHINodes.
+ DenseSet<PHINode *, PHIDenseMapInfo> PHISet;
PHISet.reserve(4 * PHICSENumPHISmallSize);
-
- // Examine each PHI.
- bool Changed = false;
- for (auto I = BB->begin(); PHINode *PN = dyn_cast<PHINode>(I++);) {
- auto Inserted = PHISet.insert(PN);
- if (!Inserted.second) {
- // A duplicate. Replace this PHI with its duplicate.
+
+ // Examine each PHI.
+ bool Changed = false;
+ for (auto I = BB->begin(); PHINode *PN = dyn_cast<PHINode>(I++);) {
+ auto Inserted = PHISet.insert(PN);
+ if (!Inserted.second) {
+ // A duplicate. Replace this PHI with its duplicate.
++NumPHICSEs;
- PN->replaceAllUsesWith(*Inserted.first);
- PN->eraseFromParent();
- Changed = true;
-
- // The RAUW can change PHIs that we already visited. Start over from the
- // beginning.
- PHISet.clear();
- I = BB->begin();
- }
- }
-
- return Changed;
-}
-
+ PN->replaceAllUsesWith(*Inserted.first);
+ PN->eraseFromParent();
+ Changed = true;
+
+ // The RAUW can change PHIs that we already visited. Start over from the
+ // beginning.
+ PHISet.clear();
+ I = BB->begin();
+ }
+ }
+
+ return Changed;
+}
+
bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
if (
#ifndef NDEBUG
@@ -1267,7 +1267,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
return EliminateDuplicatePHINodesNaiveImpl(BB);
return EliminateDuplicatePHINodesSetBasedImpl(BB);
}
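+// Illustrative usage sketch (not part of the original source): run PHI CSE
+// over every block in a function and record whether anything changed.
+//
+//   bool Changed = false;
+//   for (BasicBlock &B : F)
+//     Changed |= EliminateDuplicatePHINodes(&B);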
-
+
/// If the specified pointer points to an object that we control, try to modify
/// the object's alignment to PrefAlign. Returns a minimum known alignment of
/// the value after the operation, which may be lower than PrefAlign.
@@ -1277,9 +1277,9 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// and allocation instructions to their preferred alignment from the beginning.
static Align tryEnforceAlignment(Value *V, Align PrefAlign,
const DataLayout &DL) {
- V = V->stripPointerCasts();
-
- if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ V = V->stripPointerCasts();
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
// TODO: Ideally, this function would not be called if PrefAlign is smaller
// than the current alignment, as the known bits calculation should have
// already taken it into account. However, this is not always the case,
@@ -1288,801 +1288,801 @@ static Align tryEnforceAlignment(Value *V, Align PrefAlign,
Align CurrentAlign = AI->getAlign();
if (PrefAlign <= CurrentAlign)
return CurrentAlign;
-
- // If the preferred alignment is greater than the natural stack alignment
- // then don't round up. This avoids dynamic stack realignment.
- if (DL.exceedsNaturalStackAlignment(PrefAlign))
+
+ // If the preferred alignment is greater than the natural stack alignment
+ // then don't round up. This avoids dynamic stack realignment.
+ if (DL.exceedsNaturalStackAlignment(PrefAlign))
return CurrentAlign;
- AI->setAlignment(PrefAlign);
- return PrefAlign;
- }
-
- if (auto *GO = dyn_cast<GlobalObject>(V)) {
- // TODO: as above, this shouldn't be necessary.
+ AI->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
+ if (auto *GO = dyn_cast<GlobalObject>(V)) {
+ // TODO: as above, this shouldn't be necessary.
Align CurrentAlign = GO->getPointerAlignment(DL);
if (PrefAlign <= CurrentAlign)
return CurrentAlign;
-
- // If there is a large requested alignment and we can, bump up the alignment
- // of the global. If the memory we set aside for the global may not be the
- // memory used by the final program then it is impossible for us to reliably
- // enforce the preferred alignment.
- if (!GO->canIncreaseAlignment())
+
+ // If there is a large requested alignment and we can, bump up the alignment
+ // of the global. If the memory we set aside for the global may not be the
+ // memory used by the final program then it is impossible for us to reliably
+ // enforce the preferred alignment.
+ if (!GO->canIncreaseAlignment())
return CurrentAlign;
-
- GO->setAlignment(PrefAlign);
- return PrefAlign;
- }
-
+
+ GO->setAlignment(PrefAlign);
+ return PrefAlign;
+ }
+
return Align(1);
-}
-
-Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign,
- const DataLayout &DL,
- const Instruction *CxtI,
- AssumptionCache *AC,
- const DominatorTree *DT) {
- assert(V->getType()->isPointerTy() &&
- "getOrEnforceKnownAlignment expects a pointer!");
-
- KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
- unsigned TrailZ = Known.countMinTrailingZeros();
-
- // Avoid trouble with ridiculously large TrailZ values, such as
- // those computed from a null pointer.
- // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent).
- TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent);
-
- Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
-
- if (PrefAlign && *PrefAlign > Alignment)
+}
+
+Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign,
+ const DataLayout &DL,
+ const Instruction *CxtI,
+ AssumptionCache *AC,
+ const DominatorTree *DT) {
+ assert(V->getType()->isPointerTy() &&
+ "getOrEnforceKnownAlignment expects a pointer!");
+
+ KnownBits Known = computeKnownBits(V, DL, 0, AC, CxtI, DT);
+ unsigned TrailZ = Known.countMinTrailingZeros();
+
+ // Avoid trouble with ridiculously large TrailZ values, such as
+ // those computed from a null pointer.
+ // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent).
+ TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent);
+
+ Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
+
+ if (PrefAlign && *PrefAlign > Alignment)
Alignment = std::max(Alignment, tryEnforceAlignment(V, *PrefAlign, DL));
-
- // We don't need to make any adjustment.
- return Alignment;
-}
-
-///===---------------------------------------------------------------------===//
-/// Dbg Intrinsic utilities
-///
-
-/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
-static bool PhiHasDebugValue(DILocalVariable *DIVar,
- DIExpression *DIExpr,
- PHINode *APN) {
- // Since we can't guarantee that the original dbg.declare intrinsic
- // is removed by LowerDbgDeclare(), we need to make sure that we are
- // not inserting the same dbg.value intrinsic over and over.
- SmallVector<DbgValueInst *, 1> DbgValues;
- findDbgValues(DbgValues, APN);
- for (auto *DVI : DbgValues) {
- assert(DVI->getValue() == APN);
- if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
- return true;
- }
- return false;
-}
-
-/// Check if the alloc size of \p ValTy is large enough to cover the variable
-/// (or fragment of the variable) described by \p DII.
-///
-/// This is primarily intended as a helper for the different
-/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
-/// converted describes an alloca'd variable, so we need to use the
-/// alloc size of the value when doing the comparison. E.g. an i1 value will be
-/// identified as covering an n-bit fragment, if the store size of i1 is at
-/// least n bits.
-static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
- const DataLayout &DL = DII->getModule()->getDataLayout();
+
+ // We don't need to make any adjustment.
+ return Alignment;
+}
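+// Illustrative usage sketch (not part of the original source): query, and if
+// profitable raise, the alignment of a load's pointer operand to 16 bytes.
+//
+//   Align Known = getOrEnforceKnownAlignment(LI->getPointerOperand(),
+//                                            MaybeAlign(16), DL, LI,
+//                                            /*AC=*/nullptr, /*DT=*/nullptr);
+//   bool CanUseAlignedOps = Known >= Align(16);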
+
+///===---------------------------------------------------------------------===//
+/// Dbg Intrinsic utilities
+///
+
+/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
+static bool PhiHasDebugValue(DILocalVariable *DIVar,
+ DIExpression *DIExpr,
+ PHINode *APN) {
+ // Since we can't guarantee that the original dbg.declare intrinsic
+ // is removed by LowerDbgDeclare(), we need to make sure that we are
+ // not inserting the same dbg.value intrinsic over and over.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ findDbgValues(DbgValues, APN);
+ for (auto *DVI : DbgValues) {
+ assert(DVI->getValue() == APN);
+ if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
+ return true;
+ }
+ return false;
+}
+
+/// Check if the alloc size of \p ValTy is large enough to cover the variable
+/// (or fragment of the variable) described by \p DII.
+///
+/// This is primarily intended as a helper for the different
+/// ConvertDebugDeclareToDebugValue functions. The dbg.declare/dbg.addr that is
+/// converted describes an alloca'd variable, so we need to use the
+/// alloc size of the value when doing the comparison. E.g. an i1 value will be
+/// identified as covering an n-bit fragment, if the store size of i1 is at
+/// least n bits.
+static bool valueCoversEntireFragment(Type *ValTy, DbgVariableIntrinsic *DII) {
+ const DataLayout &DL = DII->getModule()->getDataLayout();
TypeSize ValueSize = DL.getTypeAllocSizeInBits(ValTy);
if (Optional<uint64_t> FragmentSize = DII->getFragmentSizeInBits()) {
assert(!ValueSize.isScalable() &&
"Fragments don't work on scalable types.");
return ValueSize.getFixedSize() >= *FragmentSize;
}
- // We can't always calculate the size of the DI variable (e.g. if it is a
- // VLA). Try to use the size of the alloca that the dbg intrinsic describes
- // instead.
- if (DII->isAddressOfVariable())
- if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
+ // We can't always calculate the size of the DI variable (e.g. if it is a
+ // VLA). Try to use the size of the alloca that the dbg intrinsic describes
+ // instead.
+ if (DII->isAddressOfVariable())
+ if (auto *AI = dyn_cast_or_null<AllocaInst>(DII->getVariableLocation()))
if (Optional<TypeSize> FragmentSize = AI->getAllocationSizeInBits(DL)) {
assert(ValueSize.isScalable() == FragmentSize->isScalable() &&
"Both sizes should agree on the scalable flag.");
return TypeSize::isKnownGE(ValueSize, *FragmentSize);
}
- // Could not determine size of variable. Conservatively return false.
- return false;
-}
-
-/// Produce a DebugLoc to use for each dbg.declare/inst pair that is promoted
-/// to a dbg.value. Because no machine insts can come from debug intrinsics,
-/// only the scope and inlinedAt are significant. Zero line numbers are used in
-/// case this DebugLoc leaks into any adjacent instructions.
-static DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII, Instruction *Src) {
- // Original dbg.declare must have a location.
- DebugLoc DeclareLoc = DII->getDebugLoc();
- MDNode *Scope = DeclareLoc.getScope();
- DILocation *InlinedAt = DeclareLoc.getInlinedAt();
- // Produce an unknown location with the correct scope / inlinedAt fields.
+ // Could not determine size of variable. Conservatively return false.
+ return false;
+}
+
+/// Produce a DebugLoc to use for each dbg.declare/inst pair that is promoted
+/// to a dbg.value. Because no machine insts can come from debug intrinsics,
+/// only the scope and inlinedAt are significant. Zero line numbers are used in
+/// case this DebugLoc leaks into any adjacent instructions.
+static DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII, Instruction *Src) {
+ // Original dbg.declare must have a location.
+ DebugLoc DeclareLoc = DII->getDebugLoc();
+ MDNode *Scope = DeclareLoc.getScope();
+ DILocation *InlinedAt = DeclareLoc.getInlinedAt();
+ // Produce an unknown location with the correct scope / inlinedAt fields.
return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt);
-}
-
-/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
-/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- StoreInst *SI, DIBuilder &Builder) {
- assert(DII->isAddressOfVariable());
- auto *DIVar = DII->getVariable();
- assert(DIVar && "Missing variable");
- auto *DIExpr = DII->getExpression();
- Value *DV = SI->getValueOperand();
-
- DebugLoc NewLoc = getDebugValueLoc(DII, SI);
-
- if (!valueCoversEntireFragment(DV->getType(), DII)) {
- // FIXME: If storing to a part of the variable described by the dbg.declare,
- // then we want to insert a dbg.value for the corresponding fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- // For now, when there is a store to parts of the variable (but we do not
- // know which part) we insert a dbg.value intrinsic to indicate that we
- // know nothing about the variable's content.
- DV = UndefValue::get(DV->getType());
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
- return;
- }
-
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
-}
-
-/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
-/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- LoadInst *LI, DIBuilder &Builder) {
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
-
- if (!valueCoversEntireFragment(LI->getType(), DII)) {
- // FIXME: If only referring to a part of the variable described by the
- // dbg.declare, then we want to insert a dbg.value for the corresponding
- // fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- return;
- }
-
- DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
-
- // We are now tracking the loaded value instead of the address. In the
- // future if multi-location support is added to the IR, it might be
- // preferable to keep tracking both the loaded value and the original
- // address in case the alloca can not be elided.
- Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
- LI, DIVar, DIExpr, NewLoc, (Instruction *)nullptr);
- DbgValue->insertAfter(LI);
-}
-
-/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
-/// llvm.dbg.declare or llvm.dbg.addr intrinsic.
-void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
- PHINode *APN, DIBuilder &Builder) {
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
-
- if (PhiHasDebugValue(DIVar, DIExpr, APN))
- return;
-
- if (!valueCoversEntireFragment(APN->getType(), DII)) {
- // FIXME: If only referring to a part of the variable described by the
- // dbg.declare, then we want to insert a dbg.value for the corresponding
- // fragment.
- LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
- << *DII << '\n');
- return;
- }
-
- BasicBlock *BB = APN->getParent();
- auto InsertionPt = BB->getFirstInsertionPt();
-
- DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
-
- // The block may be a catchswitch block, which does not have a valid
- // insertion point.
- // FIXME: Insert dbg.value markers in the successors when appropriate.
- if (InsertionPt != BB->end())
- Builder.insertDbgValueIntrinsic(APN, DIVar, DIExpr, NewLoc, &*InsertionPt);
-}
-
-/// Determine whether this alloca is either a VLA or an array.
-static bool isArray(AllocaInst *AI) {
- return AI->isArrayAllocation() ||
- (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy());
-}
-
-/// Determine whether this alloca is a structure.
-static bool isStructure(AllocaInst *AI) {
- return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
-}
-
-/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into an appropriate
-/// set of llvm.dbg.value intrinsics.
-bool llvm::LowerDbgDeclare(Function &F) {
- bool Changed = false;
- DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
- SmallVector<DbgDeclareInst *, 4> Dbgs;
- for (auto &FI : F)
- for (Instruction &BI : FI)
- if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))
- Dbgs.push_back(DDI);
-
- if (Dbgs.empty())
- return Changed;
-
- for (auto &I : Dbgs) {
- DbgDeclareInst *DDI = I;
- AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
- // If this is an alloca for a scalar variable, insert a dbg.value
- // at each load and store to the alloca and erase the dbg.declare.
- // The dbg.values allow tracking a variable even if it is not
- // stored on the stack, while the dbg.declare can only describe
- // the stack slot (and at a lexical-scope granularity). Later
- // passes will attempt to elide the stack slot.
- if (!AI || isArray(AI) || isStructure(AI))
- continue;
-
- // A volatile load/store means that the alloca can't be elided anyway.
- if (llvm::any_of(AI->users(), [](User *U) -> bool {
- if (LoadInst *LI = dyn_cast<LoadInst>(U))
- return LI->isVolatile();
- if (StoreInst *SI = dyn_cast<StoreInst>(U))
- return SI->isVolatile();
- return false;
- }))
- continue;
-
- SmallVector<const Value *, 8> WorkList;
- WorkList.push_back(AI);
- while (!WorkList.empty()) {
- const Value *V = WorkList.pop_back_val();
- for (auto &AIUse : V->uses()) {
- User *U = AIUse.getUser();
- if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (AIUse.getOperandNo() == 1)
- ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
- } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
- ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
- } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
- // This is a call by-value or some other instruction that takes a
- // pointer to the variable. Insert a *value* intrinsic that describes
- // the variable by dereferencing the alloca.
- if (!CI->isLifetimeStartOrEnd()) {
- DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr);
- auto *DerefExpr =
- DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
- DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
- NewLoc, CI);
- }
- } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
- if (BI->getType()->isPointerTy())
- WorkList.push_back(BI);
- }
- }
- }
- DDI->eraseFromParent();
- Changed = true;
- }
-
- if (Changed)
- for (BasicBlock &BB : F)
- RemoveRedundantDbgInstrs(&BB);
-
- return Changed;
-}
-
-/// Propagate dbg.value intrinsics through the newly inserted PHIs.
-void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
- SmallVectorImpl<PHINode *> &InsertedPHIs) {
- assert(BB && "No BasicBlock to clone dbg.value(s) from.");
- if (InsertedPHIs.size() == 0)
- return;
-
- // Map existing PHI nodes to their dbg.values.
- ValueToValueMapTy DbgValueMap;
- for (auto &I : *BB) {
- if (auto DbgII = dyn_cast<DbgVariableIntrinsic>(&I)) {
- if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
- DbgValueMap.insert({Loc, DbgII});
- }
- }
- if (DbgValueMap.size() == 0)
- return;
-
- // Then iterate through the new PHIs and look to see if they use one of the
- // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
- // propagate the info through the new PHI.
- LLVMContext &C = BB->getContext();
- for (auto PHI : InsertedPHIs) {
- BasicBlock *Parent = PHI->getParent();
- // Avoid inserting an intrinsic into an EH block.
- if (Parent->getFirstNonPHI()->isEHPad())
- continue;
- auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
- for (auto VI : PHI->operand_values()) {
- auto V = DbgValueMap.find(VI);
- if (V != DbgValueMap.end()) {
- auto *DbgII = cast<DbgVariableIntrinsic>(V->second);
- Instruction *NewDbgII = DbgII->clone();
- NewDbgII->setOperand(0, PhiMAV);
- auto InsertionPt = Parent->getFirstInsertionPt();
- assert(InsertionPt != Parent->end() && "Ill-formed basic block");
- NewDbgII->insertBefore(&*InsertionPt);
- }
- }
- }
-}
-
-/// Finds all intrinsics declaring local variables as living in the memory that
-/// 'V' points to. This may include a mix of dbg.declare and
-/// dbg.addr intrinsics.
-TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return {};
- auto *L = LocalAsMetadata::getIfExists(V);
- if (!L)
- return {};
- auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
- if (!MDV)
- return {};
-
- TinyPtrVector<DbgVariableIntrinsic *> Declares;
- for (User *U : MDV->users()) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(U))
- if (DII->isAddressOfVariable())
- Declares.push_back(DII);
- }
-
- return Declares;
-}
-
-TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) {
- TinyPtrVector<DbgDeclareInst *> DDIs;
- for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V))
- if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI))
- DDIs.push_back(DDI);
- return DDIs;
-}
-
-void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return;
- if (auto *L = LocalAsMetadata::getIfExists(V))
- if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
- for (User *U : MDV->users())
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
- DbgValues.push_back(DVI);
-}
-
-void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
- Value *V) {
- // This function is hot. Check whether the value has any metadata to avoid a
- // DenseMap lookup.
- if (!V->isUsedByMetadata())
- return;
- if (auto *L = LocalAsMetadata::getIfExists(V))
- if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
- for (User *U : MDV->users())
- if (DbgVariableIntrinsic *DII = dyn_cast<DbgVariableIntrinsic>(U))
- DbgUsers.push_back(DII);
-}
-
-bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
- DIBuilder &Builder, uint8_t DIExprFlags,
- int Offset) {
- auto DbgAddrs = FindDbgAddrUses(Address);
- for (DbgVariableIntrinsic *DII : DbgAddrs) {
- DebugLoc Loc = DII->getDebugLoc();
- auto *DIVar = DII->getVariable();
- auto *DIExpr = DII->getExpression();
- assert(DIVar && "Missing variable");
- DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset);
- // Insert llvm.dbg.declare immediately before DII, and remove old
- // llvm.dbg.declare.
- Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII);
- DII->eraseFromParent();
- }
- return !DbgAddrs.empty();
-}
-
-static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
- DIBuilder &Builder, int Offset) {
- DebugLoc Loc = DVI->getDebugLoc();
- auto *DIVar = DVI->getVariable();
- auto *DIExpr = DVI->getExpression();
- assert(DIVar && "Missing variable");
-
- // This is an alloca-based llvm.dbg.value. The first thing it should do with
- // the alloca pointer is dereference it. Otherwise we don't know how to handle
- // it and give up.
- if (!DIExpr || DIExpr->getNumElements() < 1 ||
- DIExpr->getElement(0) != dwarf::DW_OP_deref)
- return;
-
- // Insert the offset before the first deref.
- // We could just change the offset argument of dbg.value, but it's unsigned...
- if (Offset)
- DIExpr = DIExpression::prepend(DIExpr, 0, Offset);
-
- Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI);
- DVI->eraseFromParent();
-}
-
-void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
- DIBuilder &Builder, int Offset) {
- if (auto *L = LocalAsMetadata::getIfExists(AI))
- if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
- for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
- Use &U = *UI++;
- if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
- replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
- }
-}
-
-/// Wrap \p V in a ValueAsMetadata instance.
-static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
- return MetadataAsValue::get(C, ValueAsMetadata::get(V));
-}
-
-/// Salvage debug information for \p I where possible; where that is not
-/// possible, mark the debug uses of \p I as undef.
-void llvm::salvageDebugInfo(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- salvageDebugInfoForDbgValues(I, DbgUsers);
-}
-
-void llvm::salvageDebugInfoForDbgValues(
- Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
- auto &Ctx = I.getContext();
- bool Salvaged = false;
- auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
-
- for (auto *DII : DbgUsers) {
- // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
- // are implicitly pointing out the value as a DWARF memory location
- // description.
- bool StackValue = isa<DbgValueInst>(DII);
-
- DIExpression *DIExpr =
- salvageDebugInfoImpl(I, DII->getExpression(), StackValue);
-
- // salvageDebugInfoImpl should fail either on the first element of DbgUsers
- // or on none of them.
- if (!DIExpr)
- break;
-
- DII->setOperand(0, wrapMD(I.getOperand(0)));
- DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
- LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
- Salvaged = true;
- }
-
- if (Salvaged)
- return;
-
- for (auto *DII : DbgUsers) {
- Value *Undef = UndefValue::get(I.getType());
- DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
- ValueAsMetadata::get(Undef)));
- }
-}
-
-DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
- DIExpression *SrcDIExpr,
- bool WithStackValue) {
- auto &M = *I.getModule();
- auto &DL = M.getDataLayout();
-
- // Apply a vector of opcodes to the source DIExpression.
- auto doSalvage = [&](SmallVectorImpl<uint64_t> &Ops) -> DIExpression * {
- DIExpression *DIExpr = SrcDIExpr;
- if (!Ops.empty()) {
- DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
- }
- return DIExpr;
- };
-
- // Apply the given offset to the source DIExpression.
- auto applyOffset = [&](uint64_t Offset) -> DIExpression * {
- SmallVector<uint64_t, 8> Ops;
- DIExpression::appendOffset(Ops, Offset);
- return doSalvage(Ops);
- };
-
- // initializer-list helper for applying operators to the source DIExpression.
- auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * {
- SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end());
- return doSalvage(Ops);
- };
-
- if (auto *CI = dyn_cast<CastInst>(&I)) {
- // No-op casts are irrelevant for debug info.
- if (CI->isNoopCast(DL))
- return SrcDIExpr;
-
- Type *Type = CI->getType();
- // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
- if (Type->isVectorTy() ||
- !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
- return nullptr;
-
- Value *FromValue = CI->getOperand(0);
- unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits();
- unsigned ToTypeBitSize = Type->getScalarSizeInBits();
-
- return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
- isa<SExtInst>(&I)));
- }
-
- if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- unsigned BitWidth =
- M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
- // Rewrite a constant GEP into a DIExpression.
- APInt Offset(BitWidth, 0);
- if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
- return applyOffset(Offset.getSExtValue());
- } else {
- return nullptr;
- }
- } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
- // Rewrite binary operations with constant integer operands.
- auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
- if (!ConstInt || ConstInt->getBitWidth() > 64)
- return nullptr;
-
- uint64_t Val = ConstInt->getSExtValue();
- switch (BI->getOpcode()) {
- case Instruction::Add:
- return applyOffset(Val);
- case Instruction::Sub:
- return applyOffset(-int64_t(Val));
- case Instruction::Mul:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
- case Instruction::SDiv:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
- case Instruction::SRem:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
- case Instruction::Or:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
- case Instruction::And:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
- case Instruction::Xor:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
- case Instruction::Shl:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
- case Instruction::LShr:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
- case Instruction::AShr:
- return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
- default:
- // TODO: Salvage constants from each kind of binop we know about.
- return nullptr;
- }
- // *Not* to do: we should not attempt to salvage load instructions,
- // because the validity and lifetime of a dbg.value containing
- // DW_OP_deref becomes difficult to analyze. See PR40628 for examples.
- }
- return nullptr;
-}
-
-/// A replacement for a dbg.value expression.
-using DbgValReplacement = Optional<DIExpression *>;
-
-/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
-/// possibly moving/undefing users to prevent use-before-def. Returns true if
-/// changes are made.
-static bool rewriteDebugUsers(
- Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
- function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr) {
- // Find debug users of From.
- SmallVector<DbgVariableIntrinsic *, 1> Users;
- findDbgUsers(Users, &From);
- if (Users.empty())
- return false;
-
- // Prevent use-before-def of To.
- bool Changed = false;
- SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
- if (isa<Instruction>(&To)) {
- bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
-
- for (auto *DII : Users) {
- // It's common to see a debug user between From and DomPoint. Move it
- // after DomPoint to preserve the variable update without any reordering.
- if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
- LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
- DII->moveAfter(&DomPoint);
- Changed = true;
-
- // Users which otherwise aren't dominated by the replacement value must
- // be salvaged or deleted.
- } else if (!DT.dominates(&DomPoint, DII)) {
- UndefOrSalvage.insert(DII);
- }
- }
- }
-
- // Update debug users without use-before-def risk.
- for (auto *DII : Users) {
- if (UndefOrSalvage.count(DII))
- continue;
-
- LLVMContext &Ctx = DII->getContext();
- DbgValReplacement DVR = RewriteExpr(*DII);
- if (!DVR)
- continue;
-
- DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
- DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
- LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
- Changed = true;
- }
-
- if (!UndefOrSalvage.empty()) {
- // Try to salvage the remaining debug users.
- salvageDebugInfo(From);
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Check if a bitcast from a value of type \p FromTy to type \p ToTy would
-/// losslessly preserve the bits and semantics of the value. This predicate is
-/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
-///
-/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
-/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
-/// and also does not allow lossless pointer <-> integer conversions.
-static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
- Type *ToTy) {
- // Trivially compatible types.
- if (FromTy == ToTy)
- return true;
-
- // Handle compatible pointer <-> integer conversions.
- if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
- bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
- bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
- !DL.isNonIntegralPointerType(ToTy);
- return SameSize && LosslessConversion;
- }
-
- // TODO: This is not exhaustive.
- return false;
-}
-
-bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
- Instruction &DomPoint, DominatorTree &DT) {
- // Exit early if From has no debug users.
- if (!From.isUsedByMetadata())
- return false;
-
- assert(&From != &To && "Can't replace something with itself");
-
- Type *FromTy = From.getType();
- Type *ToTy = To.getType();
-
- auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
- return DII.getExpression();
- };
-
- // Handle no-op conversions.
- Module &M = *From.getModule();
- const DataLayout &DL = M.getDataLayout();
- if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
- return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
-
- // Handle integer-to-integer widening and narrowing.
- // FIXME: Use DW_OP_convert when it's available everywhere.
- if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
- uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
- uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
- assert(FromBits != ToBits && "Unexpected no-op conversion");
-
- // When the width of the result grows, assume that a debugger will only
- // access the low `FromBits` bits when inspecting the source variable.
- if (FromBits < ToBits)
- return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
-
- // The width of the result has shrunk. Use sign/zero extension to describe
- // the source variable's high bits.
- auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
- DILocalVariable *Var = DII.getVariable();
-
- // Without knowing signedness, sign/zero extension isn't possible.
- auto Signedness = Var->getSignedness();
- if (!Signedness)
- return None;
-
- bool Signed = *Signedness == DIBasicType::Signedness::Signed;
- return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
- Signed);
- };
- return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
- }
-
- // TODO: Floating-point conversions, vectors.
- return false;
-}
-
+}
+
+/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value
+/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ StoreInst *SI, DIBuilder &Builder) {
+ assert(DII->isAddressOfVariable());
+ auto *DIVar = DII->getVariable();
+ assert(DIVar && "Missing variable");
+ auto *DIExpr = DII->getExpression();
+ Value *DV = SI->getValueOperand();
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, SI);
+
+ if (!valueCoversEntireFragment(DV->getType(), DII)) {
+ // FIXME: If storing to a part of the variable described by the dbg.declare,
+ // then we want to insert a dbg.value for the corresponding fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ // For now, when there is a store to parts of the variable (but we do not
+ // know which part) we insert a dbg.value intrinsic to indicate that we
+ // know nothing about the variable's content.
+ DV = UndefValue::get(DV->getType());
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+ return;
+ }
+
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+}
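+// Illustrative usage sketch (not part of the original source): before removing
+// a store SI to an alloca AI described by dbg.declare/dbg.addr, emit the
+// corresponding dbg.value so the variable remains observable. DIB is assumed
+// to be an existing DIBuilder for the module.
+//
+//   for (DbgVariableIntrinsic *DII : FindDbgAddrUses(AI))
+//     ConvertDebugDeclareToDebugValue(DII, SI, DIB);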
+
+/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
+/// that has an associated llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ LoadInst *LI, DIBuilder &Builder) {
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (!valueCoversEntireFragment(LI->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
+
+ // We are now tracking the loaded value instead of the address. In the
+ // future if multi-location support is added to the IR, it might be
+ // preferable to keep tracking both the loaded value and the original
+ // address in case the alloca can not be elided.
+ Instruction *DbgValue = Builder.insertDbgValueIntrinsic(
+ LI, DIVar, DIExpr, NewLoc, (Instruction *)nullptr);
+ DbgValue->insertAfter(LI);
+}
+
+/// Inserts a llvm.dbg.value intrinsic after a phi that has an associated
+/// llvm.dbg.declare or llvm.dbg.addr intrinsic.
+void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
+ PHINode *APN, DIBuilder &Builder) {
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+
+ if (PhiHasDebugValue(DIVar, DIExpr, APN))
+ return;
+
+ if (!valueCoversEntireFragment(APN->getType(), DII)) {
+ // FIXME: If only referring to a part of the variable described by the
+ // dbg.declare, then we want to insert a dbg.value for the corresponding
+ // fragment.
+ LLVM_DEBUG(dbgs() << "Failed to convert dbg.declare to dbg.value: "
+ << *DII << '\n');
+ return;
+ }
+
+ BasicBlock *BB = APN->getParent();
+ auto InsertionPt = BB->getFirstInsertionPt();
+
+ DebugLoc NewLoc = getDebugValueLoc(DII, nullptr);
+
+ // The block may be a catchswitch block, which does not have a valid
+ // insertion point.
+ // FIXME: Insert dbg.value markers in the successors when appropriate.
+ if (InsertionPt != BB->end())
+ Builder.insertDbgValueIntrinsic(APN, DIVar, DIExpr, NewLoc, &*InsertionPt);
+}
+
+/// Determine whether this alloca is either a VLA or an array.
+static bool isArray(AllocaInst *AI) {
+ return AI->isArrayAllocation() ||
+ (AI->getAllocatedType() && AI->getAllocatedType()->isArrayTy());
+}
+
+/// Determine whether this alloca is a structure.
+static bool isStructure(AllocaInst *AI) {
+ return AI->getAllocatedType() && AI->getAllocatedType()->isStructTy();
+}
+
+/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into an appropriate
+/// set of llvm.dbg.value intrinsics.
+bool llvm::LowerDbgDeclare(Function &F) {
+ bool Changed = false;
+ DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
+ SmallVector<DbgDeclareInst *, 4> Dbgs;
+ for (auto &FI : F)
+ for (Instruction &BI : FI)
+ if (auto DDI = dyn_cast<DbgDeclareInst>(&BI))
+ Dbgs.push_back(DDI);
+
+ if (Dbgs.empty())
+ return Changed;
+
+ for (auto &I : Dbgs) {
+ DbgDeclareInst *DDI = I;
+ AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress());
+ // If this is an alloca for a scalar variable, insert a dbg.value
+ // at each load and store to the alloca and erase the dbg.declare.
+ // The dbg.values allow tracking a variable even if it is not
+ // stored on the stack, while the dbg.declare can only describe
+ // the stack slot (and at a lexical-scope granularity). Later
+ // passes will attempt to elide the stack slot.
+ if (!AI || isArray(AI) || isStructure(AI))
+ continue;
+
+ // A volatile load/store means that the alloca can't be elided anyway.
+ if (llvm::any_of(AI->users(), [](User *U) -> bool {
+ if (LoadInst *LI = dyn_cast<LoadInst>(U))
+ return LI->isVolatile();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ return SI->isVolatile();
+ return false;
+ }))
+ continue;
+
+ SmallVector<const Value *, 8> WorkList;
+ WorkList.push_back(AI);
+ while (!WorkList.empty()) {
+ const Value *V = WorkList.pop_back_val();
+ for (auto &AIUse : V->uses()) {
+ User *U = AIUse.getUser();
+ if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AIUse.getOperandNo() == 1)
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ } else if (CallInst *CI = dyn_cast<CallInst>(U)) {
+ // This is a call by-value or some other instruction that takes a
+ // pointer to the variable. Insert a *value* intrinsic that describes
+ // the variable by dereferencing the alloca.
+ if (!CI->isLifetimeStartOrEnd()) {
+ DebugLoc NewLoc = getDebugValueLoc(DDI, nullptr);
+ auto *DerefExpr =
+ DIExpression::append(DDI->getExpression(), dwarf::DW_OP_deref);
+ DIB.insertDbgValueIntrinsic(AI, DDI->getVariable(), DerefExpr,
+ NewLoc, CI);
+ }
+ } else if (BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+ if (BI->getType()->isPointerTy())
+ WorkList.push_back(BI);
+ }
+ }
+ }
+ DDI->eraseFromParent();
+ Changed = true;
+ }
+
+ if (Changed)
+ for (BasicBlock &BB : F)
+ RemoveRedundantDbgInstrs(&BB);
+
+ return Changed;
+}
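+// Illustrative usage sketch (not part of the original source): lower every
+// dbg.declare in a module before running passes that are known to rewrite
+// allocas.
+//
+//   bool AnyLowered = false;
+//   for (Function &Fn : M)
+//     if (!Fn.isDeclaration())
+//       AnyLowered |= LowerDbgDeclare(Fn);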
+
+/// Propagate dbg.value intrinsics through the newly inserted PHIs.
+void llvm::insertDebugValuesForPHIs(BasicBlock *BB,
+ SmallVectorImpl<PHINode *> &InsertedPHIs) {
+ assert(BB && "No BasicBlock to clone dbg.value(s) from.");
+ if (InsertedPHIs.size() == 0)
+ return;
+
+ // Map existing PHI nodes to their dbg.values.
+ ValueToValueMapTy DbgValueMap;
+ for (auto &I : *BB) {
+ if (auto DbgII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+ if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+ DbgValueMap.insert({Loc, DbgII});
+ }
+ }
+ if (DbgValueMap.size() == 0)
+ return;
+
+ // Then iterate through the new PHIs and look to see if they use one of the
+ // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+ // propagate the info through the new PHI.
+ LLVMContext &C = BB->getContext();
+ for (auto PHI : InsertedPHIs) {
+ BasicBlock *Parent = PHI->getParent();
+ // Avoid inserting an intrinsic into an EH block.
+ if (Parent->getFirstNonPHI()->isEHPad())
+ continue;
+ auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+ for (auto VI : PHI->operand_values()) {
+ auto V = DbgValueMap.find(VI);
+ if (V != DbgValueMap.end()) {
+ auto *DbgII = cast<DbgVariableIntrinsic>(V->second);
+ Instruction *NewDbgII = DbgII->clone();
+ NewDbgII->setOperand(0, PhiMAV);
+ auto InsertionPt = Parent->getFirstInsertionPt();
+ assert(InsertionPt != Parent->end() && "Ill-formed basic block");
+ NewDbgII->insertBefore(&*InsertionPt);
+ }
+ }
+ }
+}
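+
+// Sketch (names are illustrative): if an existing PHI %old is described by
+//
+//   call void @llvm.dbg.value(metadata i32 %old, metadata !v, metadata !DIExpression())
+//
+// and a newly inserted PHI %new takes %old as an incoming value, a clone of
+// that dbg.value pointing at %new is inserted at the first insertion point of
+// %new's block, so the variable stays described along the new path.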
+
+/// Finds all intrinsics declaring local variables as living in the memory that
+/// 'V' points to. This may include a mix of dbg.declare and
+/// dbg.addr intrinsics.
+TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return {};
+ auto *L = LocalAsMetadata::getIfExists(V);
+ if (!L)
+ return {};
+ auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L);
+ if (!MDV)
+ return {};
+
+ TinyPtrVector<DbgVariableIntrinsic *> Declares;
+ for (User *U : MDV->users()) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(U))
+ if (DII->isAddressOfVariable())
+ Declares.push_back(DII);
+ }
+
+ return Declares;
+}
+
+TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) {
+ TinyPtrVector<DbgDeclareInst *> DDIs;
+ for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V))
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI))
+ DDIs.push_back(DDI);
+ return DDIs;
+}
+
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(U))
+ DbgValues.push_back(DVI);
+}
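+
+// A minimal usage sketch, assuming a value V is about to be rewritten to NewV
+// and Ctx is its LLVMContext (all three names are placeholders):
+//
+//   SmallVector<DbgValueInst *, 4> DbgValues;
+//   findDbgValues(DbgValues, V);
+//   for (DbgValueInst *DVI : DbgValues)
+//     DVI->setOperand(0, MetadataAsValue::get(Ctx, ValueAsMetadata::get(NewV)));
+//
+// findDbgUsers works the same way but also collects dbg.declare/dbg.addr users.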
+
+void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
+ Value *V) {
+ // This function is hot. Check whether the value has any metadata to avoid a
+ // DenseMap lookup.
+ if (!V->isUsedByMetadata())
+ return;
+ if (auto *L = LocalAsMetadata::getIfExists(V))
+ if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
+ for (User *U : MDV->users())
+ if (DbgVariableIntrinsic *DII = dyn_cast<DbgVariableIntrinsic>(U))
+ DbgUsers.push_back(DII);
+}
+
+bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
+ DIBuilder &Builder, uint8_t DIExprFlags,
+ int Offset) {
+ auto DbgAddrs = FindDbgAddrUses(Address);
+ for (DbgVariableIntrinsic *DII : DbgAddrs) {
+ DebugLoc Loc = DII->getDebugLoc();
+ auto *DIVar = DII->getVariable();
+ auto *DIExpr = DII->getExpression();
+ assert(DIVar && "Missing variable");
+ DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset);
+ // Insert llvm.dbg.declare immediately before DII, and remove old
+ // llvm.dbg.declare.
+ Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII);
+ DII->eraseFromParent();
+ }
+ return !DbgAddrs.empty();
+}
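+
+// Minimal usage sketch (OldAddress, NewAddress and M are placeholders): after
+// rewriting a variable's storage, repoint its dbg.declare/dbg.addr users at
+// the new address without adding an extra offset:
+//
+//   DIBuilder DIB(M, /*AllowUnresolved=*/false);
+//   replaceDbgDeclare(OldAddress, NewAddress, DIB, DIExpression::ApplyOffset, 0);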
+
+static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
+ DIBuilder &Builder, int Offset) {
+ DebugLoc Loc = DVI->getDebugLoc();
+ auto *DIVar = DVI->getVariable();
+ auto *DIExpr = DVI->getExpression();
+ assert(DIVar && "Missing variable");
+
+ // This is an alloca-based llvm.dbg.value. The first thing it should do with
+ // the alloca pointer is dereference it. Otherwise we don't know how to handle
+ // it and give up.
+ if (!DIExpr || DIExpr->getNumElements() < 1 ||
+ DIExpr->getElement(0) != dwarf::DW_OP_deref)
+ return;
+
+ // Insert the offset before the first deref.
+ // We could just change the offset argument of dbg.value, but it's unsigned...
+ if (Offset)
+ DIExpr = DIExpression::prepend(DIExpr, 0, Offset);
+
+ Builder.insertDbgValueIntrinsic(NewAddress, DIVar, DIExpr, Loc, DVI);
+ DVI->eraseFromParent();
+}
+
+void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+ DIBuilder &Builder, int Offset) {
+ if (auto *L = LocalAsMetadata::getIfExists(AI))
+ if (auto *MDV = MetadataAsValue::getIfExists(AI->getContext(), L))
+ for (auto UI = MDV->use_begin(), UE = MDV->use_end(); UI != UE;) {
+ Use &U = *UI++;
+ if (auto *DVI = dyn_cast<DbgValueInst>(U.getUser()))
+ replaceOneDbgValueForAlloca(DVI, NewAllocaAddress, Builder, Offset);
+ }
+}
+
+/// Wrap \p V in a MetadataAsValue (via ValueAsMetadata) instance.
+static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
+ return MetadataAsValue::get(C, ValueAsMetadata::get(V));
+}
+
+/// Salvage debug information for the debug users of \p I where possible;
+/// debug users that cannot be salvaged are set to undef.
+void llvm::salvageDebugInfo(Instruction &I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ salvageDebugInfoForDbgValues(I, DbgUsers);
+}
+
+void llvm::salvageDebugInfoForDbgValues(
+ Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
+ auto &Ctx = I.getContext();
+ bool Salvaged = false;
+ auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
+
+ for (auto *DII : DbgUsers) {
+ // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they
+ // already refer to the value via a DWARF memory location
+ // description.
+ bool StackValue = isa<DbgValueInst>(DII);
+
+ DIExpression *DIExpr =
+ salvageDebugInfoImpl(I, DII->getExpression(), StackValue);
+
+ // salvageDebugInfoImpl should either fail when examining the first
+ // element of DbgUsers, or it should succeed for all of them.
+ if (!DIExpr)
+ break;
+
+ DII->setOperand(0, wrapMD(I.getOperand(0)));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
+ LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ Salvaged = true;
+ }
+
+ if (Salvaged)
+ return;
+
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(I.getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
+}
+
+DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
+ DIExpression *SrcDIExpr,
+ bool WithStackValue) {
+ auto &M = *I.getModule();
+ auto &DL = M.getDataLayout();
+
+ // Apply a vector of opcodes to the source DIExpression.
+ auto doSalvage = [&](SmallVectorImpl<uint64_t> &Ops) -> DIExpression * {
+ DIExpression *DIExpr = SrcDIExpr;
+ if (!Ops.empty()) {
+ DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
+ }
+ return DIExpr;
+ };
+
+ // Apply the given offset to the source DIExpression.
+ auto applyOffset = [&](uint64_t Offset) -> DIExpression * {
+ SmallVector<uint64_t, 8> Ops;
+ DIExpression::appendOffset(Ops, Offset);
+ return doSalvage(Ops);
+ };
+
+ // initializer-list helper for applying operators to the source DIExpression.
+ auto applyOps = [&](ArrayRef<uint64_t> Opcodes) -> DIExpression * {
+ SmallVector<uint64_t, 8> Ops(Opcodes.begin(), Opcodes.end());
+ return doSalvage(Ops);
+ };
+
+ if (auto *CI = dyn_cast<CastInst>(&I)) {
+ // No-op casts are irrelevant for debug info.
+ if (CI->isNoopCast(DL))
+ return SrcDIExpr;
+
+ Type *Type = CI->getType();
+ // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
+ if (Type->isVectorTy() ||
+ !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
+ return nullptr;
+
+ Value *FromValue = CI->getOperand(0);
+ unsigned FromTypeBitSize = FromValue->getType()->getScalarSizeInBits();
+ unsigned ToTypeBitSize = Type->getScalarSizeInBits();
+
+ return applyOps(DIExpression::getExtOps(FromTypeBitSize, ToTypeBitSize,
+ isa<SExtInst>(&I)));
+ }
+
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ unsigned BitWidth =
+ M.getDataLayout().getIndexSizeInBits(GEP->getPointerAddressSpace());
+ // Rewrite a constant GEP into a DIExpression.
+ APInt Offset(BitWidth, 0);
+ if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+ return applyOffset(Offset.getSExtValue());
+ } else {
+ return nullptr;
+ }
+ } else if (auto *BI = dyn_cast<BinaryOperator>(&I)) {
+ // Rewrite binary operations with constant integer operands.
+ auto *ConstInt = dyn_cast<ConstantInt>(I.getOperand(1));
+ if (!ConstInt || ConstInt->getBitWidth() > 64)
+ return nullptr;
+
+ uint64_t Val = ConstInt->getSExtValue();
+ switch (BI->getOpcode()) {
+ case Instruction::Add:
+ return applyOffset(Val);
+ case Instruction::Sub:
+ return applyOffset(-int64_t(Val));
+ case Instruction::Mul:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mul});
+ case Instruction::SDiv:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_div});
+ case Instruction::SRem:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_mod});
+ case Instruction::Or:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_or});
+ case Instruction::And:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_and});
+ case Instruction::Xor:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_xor});
+ case Instruction::Shl:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shl});
+ case Instruction::LShr:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shr});
+ case Instruction::AShr:
+ return applyOps({dwarf::DW_OP_constu, Val, dwarf::DW_OP_shra});
+ default:
+ // TODO: Salvage constants from each kind of binop we know about.
+ return nullptr;
+ }
+ // *Not* to do: we should not attempt to salvage load instructions,
+ // because the validity and lifetime of a dbg.value containing
+ // DW_OP_deref becomes difficult to analyze. See PR40628 for examples.
+ }
+ return nullptr;
+}
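+
+// Illustration of the GEP case, with hypothetical values %base and !v. Given
+//
+//   %p = getelementptr inbounds i8, i8* %base, i64 16
+//   call void @llvm.dbg.value(metadata i8* %p, metadata !v, metadata !DIExpression())
+//
+// deleting %p can be compensated for by describing !v in terms of %base with
+// the constant offset folded into the expression, roughly
+//
+//   call void @llvm.dbg.value(metadata i8* %base, metadata !v,
+//                             metadata !DIExpression(DW_OP_plus_uconst, 16, DW_OP_stack_value))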
+
+/// A replacement for a dbg.value expression.
+using DbgValReplacement = Optional<DIExpression *>;
+
+/// Point debug users of \p From to \p To using exprs given by \p RewriteExpr,
+/// possibly moving/undefing users to prevent use-before-def. Returns true if
+/// changes are made.
+static bool rewriteDebugUsers(
+ Instruction &From, Value &To, Instruction &DomPoint, DominatorTree &DT,
+ function_ref<DbgValReplacement(DbgVariableIntrinsic &DII)> RewriteExpr) {
+ // Find debug users of From.
+ SmallVector<DbgVariableIntrinsic *, 1> Users;
+ findDbgUsers(Users, &From);
+ if (Users.empty())
+ return false;
+
+ // Prevent use-before-def of To.
+ bool Changed = false;
+ SmallPtrSet<DbgVariableIntrinsic *, 1> UndefOrSalvage;
+ if (isa<Instruction>(&To)) {
+ bool DomPointAfterFrom = From.getNextNonDebugInstruction() == &DomPoint;
+
+ for (auto *DII : Users) {
+ // It's common to see a debug user between From and DomPoint. Move it
+ // after DomPoint to preserve the variable update without any reordering.
+ if (DomPointAfterFrom && DII->getNextNonDebugInstruction() == &DomPoint) {
+ LLVM_DEBUG(dbgs() << "MOVE: " << *DII << '\n');
+ DII->moveAfter(&DomPoint);
+ Changed = true;
+
+ // Users which otherwise aren't dominated by the replacement value must
+ // be salvaged or deleted.
+ } else if (!DT.dominates(&DomPoint, DII)) {
+ UndefOrSalvage.insert(DII);
+ }
+ }
+ }
+
+ // Update debug users without use-before-def risk.
+ for (auto *DII : Users) {
+ if (UndefOrSalvage.count(DII))
+ continue;
+
+ LLVMContext &Ctx = DII->getContext();
+ DbgValReplacement DVR = RewriteExpr(*DII);
+ if (!DVR)
+ continue;
+
+ DII->setOperand(0, wrapValueInMetadata(Ctx, &To));
+ DII->setOperand(2, MetadataAsValue::get(Ctx, *DVR));
+ LLVM_DEBUG(dbgs() << "REWRITE: " << *DII << '\n');
+ Changed = true;
+ }
+
+ if (!UndefOrSalvage.empty()) {
+ // Try to salvage the remaining debug users.
+ salvageDebugInfo(From);
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// Check if a bitcast from a value of type \p FromTy to type \p ToTy would
+/// losslessly preserve the bits and semantics of the value. This predicate is
+/// symmetric, i.e. swapping \p FromTy and \p ToTy should give the same result.
+///
+/// Note that Type::canLosslesslyBitCastTo is not suitable here because it
+/// allows semantically inequivalent bitcasts, such as <2 x i64> -> <4 x i32>,
+/// and also does not allow lossless pointer <-> integer conversions.
+static bool isBitCastSemanticsPreserving(const DataLayout &DL, Type *FromTy,
+ Type *ToTy) {
+ // Trivially compatible types.
+ if (FromTy == ToTy)
+ return true;
+
+ // Handle compatible pointer <-> integer conversions.
+ if (FromTy->isIntOrPtrTy() && ToTy->isIntOrPtrTy()) {
+ bool SameSize = DL.getTypeSizeInBits(FromTy) == DL.getTypeSizeInBits(ToTy);
+ bool LosslessConversion = !DL.isNonIntegralPointerType(FromTy) &&
+ !DL.isNonIntegralPointerType(ToTy);
+ return SameSize && LosslessConversion;
+ }
+
+ // TODO: This is not exhaustive.
+ return false;
+}
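+
+// For illustration, on a typical 64-bit DataLayout with integral pointers:
+//   i64 <-> i8*            preserved (same size, lossless ptr/int conversion)
+//   i32 <-> i64            not preserved (different sizes)
+//   <2 x i64> -> <4 x i32> not handled (vector reinterpretation)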
+
+bool llvm::replaceAllDbgUsesWith(Instruction &From, Value &To,
+ Instruction &DomPoint, DominatorTree &DT) {
+ // Exit early if From has no debug users.
+ if (!From.isUsedByMetadata())
+ return false;
+
+ assert(&From != &To && "Can't replace something with itself");
+
+ Type *FromTy = From.getType();
+ Type *ToTy = To.getType();
+
+ auto Identity = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
+ return DII.getExpression();
+ };
+
+ // Handle no-op conversions.
+ Module &M = *From.getModule();
+ const DataLayout &DL = M.getDataLayout();
+ if (isBitCastSemanticsPreserving(DL, FromTy, ToTy))
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // Handle integer-to-integer widening and narrowing.
+ // FIXME: Use DW_OP_convert when it's available everywhere.
+ if (FromTy->isIntegerTy() && ToTy->isIntegerTy()) {
+ uint64_t FromBits = FromTy->getPrimitiveSizeInBits();
+ uint64_t ToBits = ToTy->getPrimitiveSizeInBits();
+ assert(FromBits != ToBits && "Unexpected no-op conversion");
+
+ // When the width of the result grows, assume that a debugger will only
+ // access the low `FromBits` bits when inspecting the source variable.
+ if (FromBits < ToBits)
+ return rewriteDebugUsers(From, To, DomPoint, DT, Identity);
+
+ // The width of the result has shrunk. Use sign/zero extension to describe
+ // the source variable's high bits.
+ auto SignOrZeroExt = [&](DbgVariableIntrinsic &DII) -> DbgValReplacement {
+ DILocalVariable *Var = DII.getVariable();
+
+ // Without knowing signedness, sign/zero extension isn't possible.
+ auto Signedness = Var->getSignedness();
+ if (!Signedness)
+ return None;
+
+ bool Signed = *Signedness == DIBasicType::Signedness::Signed;
+ return DIExpression::appendExt(DII.getExpression(), ToBits, FromBits,
+ Signed);
+ };
+ return rewriteDebugUsers(From, To, DomPoint, DT, SignOrZeroExt);
+ }
+
+ // TODO: Floating-point conversions, vectors.
+ return false;
+}
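+
+// Illustration of the narrowing case: replacing the debug uses of an i64 %wide
+// with an i32 %narrow (placeholder names). If the variable's type is known to
+// be signed, each dbg.value of %wide is rewritten to describe %narrow with
+// sign-extension ops appended via DIExpression::appendExt, so a debugger still
+// sees a 64-bit value; when the signedness is unknown, the rewrite callback
+// returns None and that debug use is left untouched.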
+
std::pair<unsigned, unsigned>
llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
- unsigned NumDeadInst = 0;
+ unsigned NumDeadInst = 0;
unsigned NumDeadDbgInst = 0;
- // Delete the instructions backwards, as it has a reduced likelihood of
- // having to update as many def-use and use-def chains.
- Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
- while (EndInst != &BB->front()) {
- // Delete the next to last instruction.
- Instruction *Inst = &*--EndInst->getIterator();
- if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
- Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
- if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
- EndInst = Inst;
- continue;
- }
+ // Delete the instructions backwards, as it has a reduced likelihood of
+ // having to update as many def-use and use-def chains.
+ Instruction *EndInst = BB->getTerminator(); // Last not to be deleted.
+ while (EndInst != &BB->front()) {
+ // Delete the next to last instruction.
+ Instruction *Inst = &*--EndInst->getIterator();
+ if (!Inst->use_empty() && !Inst->getType()->isTokenTy())
+ Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
+ if (Inst->isEHPad() || Inst->getType()->isTokenTy()) {
+ EndInst = Inst;
+ continue;
+ }
if (isa<DbgInfoIntrinsic>(Inst))
++NumDeadDbgInst;
else
- ++NumDeadInst;
- Inst->eraseFromParent();
- }
+ ++NumDeadInst;
+ Inst->eraseFromParent();
+ }
return {NumDeadInst, NumDeadDbgInst};
-}
-
-unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
- bool PreserveLCSSA, DomTreeUpdater *DTU,
- MemorySSAUpdater *MSSAU) {
- BasicBlock *BB = I->getParent();
-
- if (MSSAU)
- MSSAU->changeToUnreachable(I);
-
+}
+
+unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
+ bool PreserveLCSSA, DomTreeUpdater *DTU,
+ MemorySSAUpdater *MSSAU) {
+ BasicBlock *BB = I->getParent();
+
+ if (MSSAU)
+ MSSAU->changeToUnreachable(I);
+
SmallSetVector<BasicBlock *, 8> UniqueSuccessors;
- // Loop over all of the successors, removing BB's entry from any PHI
- // nodes.
- for (BasicBlock *Successor : successors(BB)) {
- Successor->removePredecessor(BB, PreserveLCSSA);
- if (DTU)
+ // Loop over all of the successors, removing BB's entry from any PHI
+ // nodes.
+ for (BasicBlock *Successor : successors(BB)) {
+ Successor->removePredecessor(BB, PreserveLCSSA);
+ if (DTU)
UniqueSuccessors.insert(Successor);
- }
- // Insert a call to llvm.trap right before this. This turns the undefined
- // behavior into a hard fail instead of falling through into random code.
- if (UseLLVMTrap) {
- Function *TrapFn =
- Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
- CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
- CallTrap->setDebugLoc(I->getDebugLoc());
- }
- auto *UI = new UnreachableInst(I->getContext(), I);
- UI->setDebugLoc(I->getDebugLoc());
-
- // All instructions after this are dead.
- unsigned NumInstrsRemoved = 0;
- BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
- while (BBI != BBE) {
- if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
- BB->getInstList().erase(BBI++);
- ++NumInstrsRemoved;
- }
+ }
+ // Insert a call to llvm.trap right before this. This turns the undefined
+ // behavior into a hard fail instead of falling through into random code.
+ if (UseLLVMTrap) {
+ Function *TrapFn =
+ Intrinsic::getDeclaration(BB->getParent()->getParent(), Intrinsic::trap);
+ CallInst *CallTrap = CallInst::Create(TrapFn, "", I);
+ CallTrap->setDebugLoc(I->getDebugLoc());
+ }
+ auto *UI = new UnreachableInst(I->getContext(), I);
+ UI->setDebugLoc(I->getDebugLoc());
+
+ // All instructions after this are dead.
+ unsigned NumInstrsRemoved = 0;
+ BasicBlock::iterator BBI = I->getIterator(), BBE = BB->end();
+ while (BBI != BBE) {
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BB->getInstList().erase(BBI++);
+ ++NumInstrsRemoved;
+ }
if (DTU) {
SmallVector<DominatorTree::UpdateType, 8> Updates;
Updates.reserve(UniqueSuccessors.size());
@@ -2090,892 +2090,892 @@ unsigned llvm::changeToUnreachable(Instruction *I, bool UseLLVMTrap,
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
DTU->applyUpdates(Updates);
}
- return NumInstrsRemoved;
-}
-
-CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
+ return NumInstrsRemoved;
+}
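+
+// Illustration: with UseLLVMTrap set, a block whose store is known to be
+// undefined behavior, e.g.
+//
+//   store i32 0, i32* null
+//   br label %next
+//
+// is rewritten from the store onwards into
+//
+//   call void @llvm.trap()
+//   unreachable
+//
+// and %next no longer lists this block as a predecessor (the corresponding
+// dominator-tree edges are deleted when a DomTreeUpdater is provided).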
+
+CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
SmallVector<Value *, 8> Args(II->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- CallInst *NewCall = CallInst::Create(II->getFunctionType(),
- II->getCalledOperand(), Args, OpBundles);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setAttributes(II->getAttributes());
- NewCall->setDebugLoc(II->getDebugLoc());
- NewCall->copyMetadata(*II);
-
- // If the invoke had profile metadata, try converting them for CallInst.
- uint64_t TotalWeight;
- if (NewCall->extractProfTotalWeight(TotalWeight)) {
- // Set the total weight if it fits into i32, otherwise reset.
- MDBuilder MDB(NewCall->getContext());
- auto NewWeights = uint32_t(TotalWeight) != TotalWeight
- ? nullptr
- : MDB.createBranchWeights({uint32_t(TotalWeight)});
- NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
- }
-
- return NewCall;
-}
-
-/// changeToCall - Convert the specified invoke into a normal call.
-void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
- CallInst *NewCall = createCallMatchingInvoke(II);
- NewCall->takeName(II);
- NewCall->insertBefore(II);
- II->replaceAllUsesWith(NewCall);
-
- // Follow the call by a branch to the normal destination.
- BasicBlock *NormalDestBB = II->getNormalDest();
- BranchInst::Create(NormalDestBB, II);
-
- // Update PHI nodes in the unwind destination
- BasicBlock *BB = II->getParent();
- BasicBlock *UnwindDestBB = II->getUnwindDest();
- UnwindDestBB->removePredecessor(BB);
- II->eraseFromParent();
- if (DTU)
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ CallInst *NewCall = CallInst::Create(II->getFunctionType(),
+ II->getCalledOperand(), Args, OpBundles);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ NewCall->copyMetadata(*II);
+
+ // If the invoke had profile metadata, try converting them for CallInst.
+ uint64_t TotalWeight;
+ if (NewCall->extractProfTotalWeight(TotalWeight)) {
+ // Set the total weight if it fits into i32, otherwise reset.
+ MDBuilder MDB(NewCall->getContext());
+ auto NewWeights = uint32_t(TotalWeight) != TotalWeight
+ ? nullptr
+ : MDB.createBranchWeights({uint32_t(TotalWeight)});
+ NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
+ }
+
+ return NewCall;
+}
+
+/// changeToCall - Convert the specified invoke into a normal call.
+void llvm::changeToCall(InvokeInst *II, DomTreeUpdater *DTU) {
+ CallInst *NewCall = createCallMatchingInvoke(II);
+ NewCall->takeName(II);
+ NewCall->insertBefore(II);
+ II->replaceAllUsesWith(NewCall);
+
+ // Follow the call by a branch to the normal destination.
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BranchInst::Create(NormalDestBB, II);
+
+ // Update PHI nodes in the unwind destination
+ BasicBlock *BB = II->getParent();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ UnwindDestBB->removePredecessor(BB);
+ II->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
-}
-
-BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
- BasicBlock *UnwindEdge) {
- BasicBlock *BB = CI->getParent();
-
- // Convert this function call into an invoke instruction. First, split the
- // basic block.
- BasicBlock *Split =
- BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
-
- // Delete the unconditional branch inserted by splitBasicBlock
- BB->getInstList().pop_back();
-
- // Create the new invoke instruction.
+}
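+
+// Illustration: changeToCall turns
+//
+//   invoke void @f() to label %normal unwind label %lpad
+//
+// into
+//
+//   call void @f()
+//   br label %normal
+//
+// and removes the block from %lpad's predecessor list (plus the matching
+// dominator-tree edge when a DomTreeUpdater is supplied). Calling convention,
+// attributes, and profile metadata are carried over by createCallMatchingInvoke.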
+
+BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
+ BasicBlock *UnwindEdge) {
+ BasicBlock *BB = CI->getParent();
+
+ // Convert this function call into an invoke instruction. First, split the
+ // basic block.
+ BasicBlock *Split =
+ BB->splitBasicBlock(CI->getIterator(), CI->getName() + ".noexc");
+
+ // Delete the unconditional branch inserted by splitBasicBlock
+ BB->getInstList().pop_back();
+
+ // Create the new invoke instruction.
SmallVector<Value *, 8> InvokeArgs(CI->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
-
- CI->getOperandBundlesAsDefs(OpBundles);
-
- // Note: we're round tripping operand bundles through memory here, and that
- // can potentially be avoided with a cleverer API design that we do not have
- // as of this time.
-
- InvokeInst *II =
- InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split,
- UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB);
- II->setDebugLoc(CI->getDebugLoc());
- II->setCallingConv(CI->getCallingConv());
- II->setAttributes(CI->getAttributes());
-
- // Make sure that anything using the call now uses the invoke! This also
- // updates the CallGraph if present, because it uses a WeakTrackingVH.
- CI->replaceAllUsesWith(II);
-
- // Delete the original call
- Split->getInstList().pop_front();
- return Split;
-}
-
-static bool markAliveBlocks(Function &F,
- SmallPtrSetImpl<BasicBlock *> &Reachable,
- DomTreeUpdater *DTU = nullptr) {
- SmallVector<BasicBlock*, 128> Worklist;
- BasicBlock *BB = &F.front();
- Worklist.push_back(BB);
- Reachable.insert(BB);
- bool Changed = false;
- do {
- BB = Worklist.pop_back_val();
-
- // Do a quick scan of the basic block, turning any obviously unreachable
- // instructions into LLVM unreachable insts. The instruction combining pass
- // canonicalizes unreachable insts into stores to null or undef.
- for (Instruction &I : *BB) {
- if (auto *CI = dyn_cast<CallInst>(&I)) {
- Value *Callee = CI->getCalledOperand();
- // Handle intrinsic calls.
- if (Function *F = dyn_cast<Function>(Callee)) {
- auto IntrinsicID = F->getIntrinsicID();
- // Assumptions that are known to be false are equivalent to
- // unreachable. Also, if the condition is undefined, then we make the
- // choice most beneficial to the optimizer, and choose that to also be
- // unreachable.
- if (IntrinsicID == Intrinsic::assume) {
- if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI, false, false, DTU);
- Changed = true;
- break;
- }
- } else if (IntrinsicID == Intrinsic::experimental_guard) {
- // A call to the guard intrinsic bails out of the current
- // compilation unit if the predicate passed to it is false. If the
- // predicate is a constant false, then we know the guard will bail
- // out of the current compile unconditionally, so all code following
- // it is dead.
- //
- // Note: unlike in llvm.assume, it is not "obviously profitable" for
- // guards to treat `undef` as `false` since a guard on `undef` can
- // still be useful for widening.
- if (match(CI->getArgOperand(0), m_Zero()))
- if (!isa<UnreachableInst>(CI->getNextNode())) {
- changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
- false, DTU);
- Changed = true;
- break;
- }
- }
- } else if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(CI->getFunction())) ||
- isa<UndefValue>(Callee)) {
- changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DTU);
- Changed = true;
- break;
- }
- if (CI->doesNotReturn() && !CI->isMustTailCall()) {
- // If we found a call to a no-return function, insert an unreachable
- // instruction after it. Make sure there isn't *already* one there
- // though.
- if (!isa<UnreachableInst>(CI->getNextNode())) {
- // Don't insert a call to llvm.trap right before the unreachable.
- changeToUnreachable(CI->getNextNode(), false, false, DTU);
- Changed = true;
- }
- break;
- }
- } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
- // Store to undef and store to null are undefined and used to signal
- // that they should be changed to unreachable by passes that can't
- // modify the CFG.
-
- // Don't touch volatile stores.
- if (SI->isVolatile()) continue;
-
- Value *Ptr = SI->getOperand(1);
-
- if (isa<UndefValue>(Ptr) ||
- (isa<ConstantPointerNull>(Ptr) &&
- !NullPointerIsDefined(SI->getFunction(),
- SI->getPointerAddressSpace()))) {
- changeToUnreachable(SI, true, false, DTU);
- Changed = true;
- break;
- }
- }
- }
-
- Instruction *Terminator = BB->getTerminator();
- if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
- // Turn invokes that call 'nounwind' functions into ordinary calls.
- Value *Callee = II->getCalledOperand();
- if ((isa<ConstantPointerNull>(Callee) &&
- !NullPointerIsDefined(BB->getParent())) ||
- isa<UndefValue>(Callee)) {
- changeToUnreachable(II, true, false, DTU);
- Changed = true;
- } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
- if (II->use_empty() && II->onlyReadsMemory()) {
- // jump to the normal destination branch.
- BasicBlock *NormalDestBB = II->getNormalDest();
- BasicBlock *UnwindDestBB = II->getUnwindDest();
- BranchInst::Create(NormalDestBB, II);
- UnwindDestBB->removePredecessor(II->getParent());
- II->eraseFromParent();
- if (DTU)
+ SmallVector<OperandBundleDef, 1> OpBundles;
+
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ // Note: we're round tripping operand bundles through memory here, and that
+ // can potentially be avoided with a cleverer API design that we do not have
+ // as of this time.
+
+ InvokeInst *II =
+ InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split,
+ UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB);
+ II->setDebugLoc(CI->getDebugLoc());
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+
+ // Make sure that anything using the call now uses the invoke! This also
+ // updates the CallGraph if present, because it uses a WeakTrackingVH.
+ CI->replaceAllUsesWith(II);
+
+ // Delete the original call
+ Split->getInstList().pop_front();
+ return Split;
+}
+
+static bool markAliveBlocks(Function &F,
+ SmallPtrSetImpl<BasicBlock *> &Reachable,
+ DomTreeUpdater *DTU = nullptr) {
+ SmallVector<BasicBlock*, 128> Worklist;
+ BasicBlock *BB = &F.front();
+ Worklist.push_back(BB);
+ Reachable.insert(BB);
+ bool Changed = false;
+ do {
+ BB = Worklist.pop_back_val();
+
+ // Do a quick scan of the basic block, turning any obviously unreachable
+ // instructions into LLVM unreachable insts. The instruction combining pass
+ // canonicalizes unreachable insts into stores to null or undef.
+ for (Instruction &I : *BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ Value *Callee = CI->getCalledOperand();
+ // Handle intrinsic calls.
+ if (Function *F = dyn_cast<Function>(Callee)) {
+ auto IntrinsicID = F->getIntrinsicID();
+ // Assumptions that are known to be false are equivalent to
+ // unreachable. Also, if the condition is undefined, then we make the
+ // choice most beneficial to the optimizer, and choose that to also be
+ // unreachable.
+ if (IntrinsicID == Intrinsic::assume) {
+ if (match(CI->getArgOperand(0), m_CombineOr(m_Zero(), m_Undef()))) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI, false, false, DTU);
+ Changed = true;
+ break;
+ }
+ } else if (IntrinsicID == Intrinsic::experimental_guard) {
+ // A call to the guard intrinsic bails out of the current
+ // compilation unit if the predicate passed to it is false. If the
+ // predicate is a constant false, then we know the guard will bail
+ // out of the current compile unconditionally, so all code following
+ // it is dead.
+ //
+ // Note: unlike in llvm.assume, it is not "obviously profitable" for
+ // guards to treat `undef` as `false` since a guard on `undef` can
+ // still be useful for widening.
+ if (match(CI->getArgOperand(0), m_Zero()))
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ changeToUnreachable(CI->getNextNode(), /*UseLLVMTrap=*/false,
+ false, DTU);
+ Changed = true;
+ break;
+ }
+ }
+ } else if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(CI->getFunction())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false, false, DTU);
+ Changed = true;
+ break;
+ }
+ if (CI->doesNotReturn() && !CI->isMustTailCall()) {
+ // If we found a call to a no-return function, insert an unreachable
+ // instruction after it. Make sure there isn't *already* one there
+ // though.
+ if (!isa<UnreachableInst>(CI->getNextNode())) {
+ // Don't insert a call to llvm.trap right before the unreachable.
+ changeToUnreachable(CI->getNextNode(), false, false, DTU);
+ Changed = true;
+ }
+ break;
+ }
+ } else if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ // Store to undef and store to null are undefined and used to signal
+ // that they should be changed to unreachable by passes that can't
+ // modify the CFG.
+
+ // Don't touch volatile stores.
+ if (SI->isVolatile()) continue;
+
+ Value *Ptr = SI->getOperand(1);
+
+ if (isa<UndefValue>(Ptr) ||
+ (isa<ConstantPointerNull>(Ptr) &&
+ !NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace()))) {
+ changeToUnreachable(SI, true, false, DTU);
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ Instruction *Terminator = BB->getTerminator();
+ if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
+ // Turn invokes that call 'nounwind' functions into ordinary calls.
+ Value *Callee = II->getCalledOperand();
+ if ((isa<ConstantPointerNull>(Callee) &&
+ !NullPointerIsDefined(BB->getParent())) ||
+ isa<UndefValue>(Callee)) {
+ changeToUnreachable(II, true, false, DTU);
+ Changed = true;
+ } else if (II->doesNotThrow() && canSimplifyInvokeNoUnwind(&F)) {
+ if (II->use_empty() && II->onlyReadsMemory()) {
+ // jump to the normal destination branch.
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ BasicBlock *UnwindDestBB = II->getUnwindDest();
+ BranchInst::Create(NormalDestBB, II);
+ UnwindDestBB->removePredecessor(II->getParent());
+ II->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDestBB}});
- } else
- changeToCall(II, DTU);
- Changed = true;
- }
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
- // Remove catchpads which cannot be reached.
- struct CatchPadDenseMapInfo {
- static CatchPadInst *getEmptyKey() {
- return DenseMapInfo<CatchPadInst *>::getEmptyKey();
- }
-
- static CatchPadInst *getTombstoneKey() {
- return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(CatchPadInst *CatchPad) {
- return static_cast<unsigned>(hash_combine_range(
- CatchPad->value_op_begin(), CatchPad->value_op_end()));
- }
-
- static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
- if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
- RHS == getEmptyKey() || RHS == getTombstoneKey())
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
- };
-
+ } else
+ changeToCall(II, DTU);
+ Changed = true;
+ }
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Terminator)) {
+ // Remove catchpads which cannot be reached.
+ struct CatchPadDenseMapInfo {
+ static CatchPadInst *getEmptyKey() {
+ return DenseMapInfo<CatchPadInst *>::getEmptyKey();
+ }
+
+ static CatchPadInst *getTombstoneKey() {
+ return DenseMapInfo<CatchPadInst *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(CatchPadInst *CatchPad) {
+ return static_cast<unsigned>(hash_combine_range(
+ CatchPad->value_op_begin(), CatchPad->value_op_end()));
+ }
+
+ static bool isEqual(CatchPadInst *LHS, CatchPadInst *RHS) {
+ if (LHS == getEmptyKey() || LHS == getTombstoneKey() ||
+ RHS == getEmptyKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+ };
+
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- // Set of unique CatchPads.
- SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
- CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
- HandlerSet;
- detail::DenseSetEmpty Empty;
- for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
- E = CatchSwitch->handler_end();
- I != E; ++I) {
- BasicBlock *HandlerBB = *I;
+ // Set of unique CatchPads.
+ SmallDenseMap<CatchPadInst *, detail::DenseSetEmpty, 4,
+ CatchPadDenseMapInfo, detail::DenseSetPair<CatchPadInst *>>
+ HandlerSet;
+ detail::DenseSetEmpty Empty;
+ for (CatchSwitchInst::handler_iterator I = CatchSwitch->handler_begin(),
+ E = CatchSwitch->handler_end();
+ I != E; ++I) {
+ BasicBlock *HandlerBB = *I;
++NumPerSuccessorCases[HandlerBB];
- auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
- if (!HandlerSet.insert({CatchPad, Empty}).second) {
+ auto *CatchPad = cast<CatchPadInst>(HandlerBB->getFirstNonPHI());
+ if (!HandlerSet.insert({CatchPad, Empty}).second) {
--NumPerSuccessorCases[HandlerBB];
- CatchSwitch->removeHandler(I);
- --I;
- --E;
- Changed = true;
- }
- }
+ CatchSwitch->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
if (I.second == 0)
Updates.push_back({DominatorTree::Delete, BB, I.first});
if (DTU)
DTU->applyUpdates(Updates);
- }
-
- Changed |= ConstantFoldTerminator(BB, true, nullptr, DTU);
- for (BasicBlock *Successor : successors(BB))
- if (Reachable.insert(Successor).second)
- Worklist.push_back(Successor);
- } while (!Worklist.empty());
- return Changed;
-}
-
-void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
- Instruction *TI = BB->getTerminator();
-
- if (auto *II = dyn_cast<InvokeInst>(TI)) {
- changeToCall(II, DTU);
- return;
- }
-
- Instruction *NewTI;
- BasicBlock *UnwindDest;
-
- if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
- NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
- UnwindDest = CRI->getUnwindDest();
- } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
- auto *NewCatchSwitch = CatchSwitchInst::Create(
- CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
- CatchSwitch->getName(), CatchSwitch);
- for (BasicBlock *PadBB : CatchSwitch->handlers())
- NewCatchSwitch->addHandler(PadBB);
-
- NewTI = NewCatchSwitch;
- UnwindDest = CatchSwitch->getUnwindDest();
- } else {
- llvm_unreachable("Could not find unwind successor");
- }
-
- NewTI->takeName(TI);
- NewTI->setDebugLoc(TI->getDebugLoc());
- UnwindDest->removePredecessor(BB);
- TI->replaceAllUsesWith(NewTI);
- TI->eraseFromParent();
- if (DTU)
+ }
+
+ Changed |= ConstantFoldTerminator(BB, true, nullptr, DTU);
+ for (BasicBlock *Successor : successors(BB))
+ if (Reachable.insert(Successor).second)
+ Worklist.push_back(Successor);
+ } while (!Worklist.empty());
+ return Changed;
+}
+
+void llvm::removeUnwindEdge(BasicBlock *BB, DomTreeUpdater *DTU) {
+ Instruction *TI = BB->getTerminator();
+
+ if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ changeToCall(II, DTU);
+ return;
+ }
+
+ Instruction *NewTI;
+ BasicBlock *UnwindDest;
+
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
+ NewTI = CleanupReturnInst::Create(CRI->getCleanupPad(), nullptr, CRI);
+ UnwindDest = CRI->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
+ auto *NewCatchSwitch = CatchSwitchInst::Create(
+ CatchSwitch->getParentPad(), nullptr, CatchSwitch->getNumHandlers(),
+ CatchSwitch->getName(), CatchSwitch);
+ for (BasicBlock *PadBB : CatchSwitch->handlers())
+ NewCatchSwitch->addHandler(PadBB);
+
+ NewTI = NewCatchSwitch;
+ UnwindDest = CatchSwitch->getUnwindDest();
+ } else {
+ llvm_unreachable("Could not find unwind successor");
+ }
+
+ NewTI->takeName(TI);
+ NewTI->setDebugLoc(TI->getDebugLoc());
+ UnwindDest->removePredecessor(BB);
+ TI->replaceAllUsesWith(NewTI);
+ TI->eraseFromParent();
+ if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnwindDest}});
-}
-
-/// removeUnreachableBlocks - Remove blocks that are not reachable, even
-/// if they are in a dead cycle. Return true if a change was made, false
-/// otherwise.
-bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
- MemorySSAUpdater *MSSAU) {
- SmallPtrSet<BasicBlock *, 16> Reachable;
- bool Changed = markAliveBlocks(F, Reachable, DTU);
-
- // If there are unreachable blocks in the CFG...
- if (Reachable.size() == F.size())
- return Changed;
-
- assert(Reachable.size() < F.size());
-
+}
+
+/// removeUnreachableBlocks - Remove blocks that are not reachable, even
+/// if they are in a dead cycle. Return true if a change was made, false
+/// otherwise.
+bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
+ MemorySSAUpdater *MSSAU) {
+ SmallPtrSet<BasicBlock *, 16> Reachable;
+ bool Changed = markAliveBlocks(F, Reachable, DTU);
+
+ // If there are unreachable blocks in the CFG...
+ if (Reachable.size() == F.size())
+ return Changed;
+
+ assert(Reachable.size() < F.size());
+
// Are there any blocks left to actually delete?
SmallSetVector<BasicBlock *, 8> BlocksToRemove;
- for (BasicBlock &BB : F) {
- // Skip reachable basic blocks
- if (Reachable.count(&BB))
- continue;
+ for (BasicBlock &BB : F) {
+ // Skip reachable basic blocks
+ if (Reachable.count(&BB))
+ continue;
// Skip already-deleted blocks
if (DTU && DTU->isBBPendingDeletion(&BB))
continue;
BlocksToRemove.insert(&BB);
- }
-
+ }
+
if (BlocksToRemove.empty())
return Changed;
Changed = true;
NumRemoved += BlocksToRemove.size();
- if (MSSAU)
+ if (MSSAU)
MSSAU->removeBlocks(BlocksToRemove);
-
+
// Loop over all of the basic blocks that are up for removal, dropping all of
- // their internal references. Update DTU if available.
- std::vector<DominatorTree::UpdateType> Updates;
+ // their internal references. Update DTU if available.
+ std::vector<DominatorTree::UpdateType> Updates;
for (auto *BB : BlocksToRemove) {
SmallSetVector<BasicBlock *, 8> UniqueSuccessors;
- for (BasicBlock *Successor : successors(BB)) {
+ for (BasicBlock *Successor : successors(BB)) {
// Only remove references to BB in reachable successors of BB.
if (Reachable.count(Successor))
- Successor->removePredecessor(BB);
- if (DTU)
+ Successor->removePredecessor(BB);
+ if (DTU)
UniqueSuccessors.insert(Successor);
- }
- BB->dropAllReferences();
- if (DTU) {
- Instruction *TI = BB->getTerminator();
- assert(TI && "Basic block should have a terminator");
- // Terminators like invoke can have users. We have to replace their users,
- // before removing them.
- if (!TI->use_empty())
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- TI->eraseFromParent();
- new UnreachableInst(BB->getContext(), BB);
- assert(succ_empty(BB) && "The successor list of BB isn't empty before "
- "applying corresponding DTU updates.");
+ }
+ BB->dropAllReferences();
+ if (DTU) {
+ Instruction *TI = BB->getTerminator();
+ assert(TI && "Basic block should have a terminator");
+ // Terminators like invoke can have users. We have to replace their users,
+ // before removing them.
+ if (!TI->use_empty())
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ TI->eraseFromParent();
+ new UnreachableInst(BB->getContext(), BB);
+ assert(succ_empty(BB) && "The successor list of BB isn't empty before "
+ "applying corresponding DTU updates.");
Updates.reserve(Updates.size() + UniqueSuccessors.size());
for (auto *UniqueSuccessor : UniqueSuccessors)
Updates.push_back({DominatorTree::Delete, BB, UniqueSuccessor});
- }
- }
-
- if (DTU) {
+ }
+ }
+
+ if (DTU) {
DTU->applyUpdates(Updates);
for (auto *BB : BlocksToRemove)
- DTU->deleteBB(BB);
- } else {
+ DTU->deleteBB(BB);
+ } else {
for (auto *BB : BlocksToRemove)
- BB->eraseFromParent();
- }
-
+ BB->eraseFromParent();
+ }
+
return Changed;
-}
-
-void llvm::combineMetadata(Instruction *K, const Instruction *J,
- ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
- SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
- K->dropUnknownNonDebugMetadata(KnownIDs);
- K->getAllMetadataOtherThanDebugLoc(Metadata);
- for (const auto &MD : Metadata) {
- unsigned Kind = MD.first;
- MDNode *JMD = J->getMetadata(Kind);
- MDNode *KMD = MD.second;
-
- switch (Kind) {
- default:
- K->setMetadata(Kind, nullptr); // Remove unknown metadata
- break;
- case LLVMContext::MD_dbg:
- llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
- case LLVMContext::MD_tbaa:
- K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
- break;
- case LLVMContext::MD_alias_scope:
- K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
- break;
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_mem_parallel_loop_access:
- K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
- break;
- case LLVMContext::MD_access_group:
- K->setMetadata(LLVMContext::MD_access_group,
- intersectAccessGroups(K, J));
- break;
- case LLVMContext::MD_range:
-
- // If K does move, use most generic range. Otherwise keep the range of
- // K.
- if (DoesKMove)
- // FIXME: If K does move, we should drop the range info and nonnull.
- // Currently this function is used with DoesKMove in passes
- // doing hoisting/sinking and the current behavior of using the
- // most generic range is correct in those cases.
- K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
- break;
- case LLVMContext::MD_fpmath:
- K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
- break;
- case LLVMContext::MD_invariant_load:
- // Only set the !invariant.load if it is present in both instructions.
- K->setMetadata(Kind, JMD);
- break;
- case LLVMContext::MD_nonnull:
- // If K does move, keep nonull if it is present in both instructions.
- if (DoesKMove)
- K->setMetadata(Kind, JMD);
- break;
- case LLVMContext::MD_invariant_group:
- // Preserve !invariant.group in K.
- break;
- case LLVMContext::MD_align:
- K->setMetadata(Kind,
- MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
- break;
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- K->setMetadata(Kind,
- MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
- break;
- case LLVMContext::MD_preserve_access_index:
- // Preserve !preserve.access.index in K.
- break;
- }
- }
- // Set !invariant.group from J if J has it. If both instructions have it
- // then we will just pick it from J - even when they are different.
- // Also make sure that K is load or store - f.e. combining bitcast with load
- // could produce bitcast with invariant.group metadata, which is invalid.
- // FIXME: we should try to preserve both invariant.group md if they are
- // different, but right now instruction can only have one invariant.group.
- if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group))
- if (isa<LoadInst>(K) || isa<StoreInst>(K))
- K->setMetadata(LLVMContext::MD_invariant_group, JMD);
-}
-
-void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
- bool KDominatesJ) {
- unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group, LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
- combineMetadata(K, J, KnownIDs, KDominatesJ);
-}
-
-void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
- SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
- Source.getAllMetadata(MD);
- MDBuilder MDB(Dest.getContext());
- Type *NewType = Dest.getType();
- const DataLayout &DL = Source.getModule()->getDataLayout();
- for (const auto &MDPair : MD) {
- unsigned ID = MDPair.first;
- MDNode *N = MDPair.second;
- // Note, essentially every kind of metadata should be preserved here! This
- // routine is supposed to clone a load instruction changing *only its type*.
- // The only metadata it makes sense to drop is metadata which is invalidated
- // when the pointer type changes. This should essentially never be the case
- // in LLVM, but we explicitly switch over only known metadata to be
- // conservatively correct. If you are adding metadata to LLVM which pertains
- // to loads, you almost certainly want to add it here.
- switch (ID) {
- case LLVMContext::MD_dbg:
- case LLVMContext::MD_tbaa:
- case LLVMContext::MD_prof:
- case LLVMContext::MD_fpmath:
- case LLVMContext::MD_tbaa_struct:
- case LLVMContext::MD_invariant_load:
- case LLVMContext::MD_alias_scope:
- case LLVMContext::MD_noalias:
- case LLVMContext::MD_nontemporal:
- case LLVMContext::MD_mem_parallel_loop_access:
- case LLVMContext::MD_access_group:
- // All of these directly apply.
- Dest.setMetadata(ID, N);
- break;
-
- case LLVMContext::MD_nonnull:
- copyNonnullMetadata(Source, N, Dest);
- break;
-
- case LLVMContext::MD_align:
- case LLVMContext::MD_dereferenceable:
- case LLVMContext::MD_dereferenceable_or_null:
- // These only directly apply if the new type is also a pointer.
- if (NewType->isPointerTy())
- Dest.setMetadata(ID, N);
- break;
-
- case LLVMContext::MD_range:
- copyRangeMetadata(DL, Source, N, Dest);
- break;
- }
- }
-}
-
-void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
- auto *ReplInst = dyn_cast<Instruction>(Repl);
- if (!ReplInst)
- return;
-
- // Patch the replacement so that it is not more restrictive than the value
- // being replaced.
- // Note that if 'I' is a load being replaced by some operation,
- // for example, by an arithmetic operation, then andIRFlags()
- // would just erase all math flags from the original arithmetic
- // operation, which is clearly not wanted and not needed.
- if (!isa<LoadInst>(I))
- ReplInst->andIRFlags(I);
-
- // FIXME: If both the original and replacement value are part of the
- // same control-flow region (meaning that the execution of one
- // guarantees the execution of the other), then we can combine the
- // noalias scopes here and do better than the general conservative
- // answer used in combineMetadata().
-
- // In general, GVN unifies expressions over different control-flow
- // regions, and so we need a conservative combination of the noalias
- // scopes.
- static const unsigned KnownIDs[] = {
- LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
- LLVMContext::MD_noalias, LLVMContext::MD_range,
- LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
- LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull,
- LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
- combineMetadata(ReplInst, I, KnownIDs, false);
-}
-
-template <typename RootType, typename DominatesFn>
-static unsigned replaceDominatedUsesWith(Value *From, Value *To,
- const RootType &Root,
- const DominatesFn &Dominates) {
- assert(From->getType() == To->getType());
-
- unsigned Count = 0;
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- if (!Dominates(Root, U))
- continue;
- U.set(To);
- LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
- << "' as " << *To << " in " << *U << "\n");
- ++Count;
- }
- return Count;
-}
-
-unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
- assert(From->getType() == To->getType());
- auto *BB = From->getParent();
- unsigned Count = 0;
-
- for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
- UI != UE;) {
- Use &U = *UI++;
- auto *I = cast<Instruction>(U.getUser());
- if (I->getParent() == BB)
- continue;
- U.set(To);
- ++Count;
- }
- return Count;
-}
-
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlockEdge &Root) {
- auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
- return DT.dominates(Root, U);
- };
- return ::replaceDominatedUsesWith(From, To, Root, Dominates);
-}
-
-unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
- DominatorTree &DT,
- const BasicBlock *BB) {
- auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
- auto *I = cast<Instruction>(U.getUser())->getParent();
- return DT.properlyDominates(BB, I);
- };
- return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
-}
-
-bool llvm::callsGCLeafFunction(const CallBase *Call,
- const TargetLibraryInfo &TLI) {
- // Check if the function is specifically marked as a gc leaf function.
- if (Call->hasFnAttr("gc-leaf-function"))
- return true;
- if (const Function *F = Call->getCalledFunction()) {
- if (F->hasFnAttribute("gc-leaf-function"))
- return true;
-
+}
+
+void llvm::combineMetadata(Instruction *K, const Instruction *J,
+ ArrayRef<unsigned> KnownIDs, bool DoesKMove) {
+ SmallVector<std::pair<unsigned, MDNode *>, 4> Metadata;
+ K->dropUnknownNonDebugMetadata(KnownIDs);
+ K->getAllMetadataOtherThanDebugLoc(Metadata);
+ for (const auto &MD : Metadata) {
+ unsigned Kind = MD.first;
+ MDNode *JMD = J->getMetadata(Kind);
+ MDNode *KMD = MD.second;
+
+ switch (Kind) {
+ default:
+ K->setMetadata(Kind, nullptr); // Remove unknown metadata
+ break;
+ case LLVMContext::MD_dbg:
+ llvm_unreachable("getAllMetadataOtherThanDebugLoc returned a MD_dbg");
+ case LLVMContext::MD_tbaa:
+ K->setMetadata(Kind, MDNode::getMostGenericTBAA(JMD, KMD));
+ break;
+ case LLVMContext::MD_alias_scope:
+ K->setMetadata(Kind, MDNode::getMostGenericAliasScope(JMD, KMD));
+ break;
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ K->setMetadata(Kind, MDNode::intersect(JMD, KMD));
+ break;
+ case LLVMContext::MD_access_group:
+ K->setMetadata(LLVMContext::MD_access_group,
+ intersectAccessGroups(K, J));
+ break;
+ case LLVMContext::MD_range:
+
+ // If K does move, use most generic range. Otherwise keep the range of
+ // K.
+ if (DoesKMove)
+ // FIXME: If K does move, we should drop the range info and nonnull.
+ // Currently this function is used with DoesKMove in passes
+ // doing hoisting/sinking and the current behavior of using the
+ // most generic range is correct in those cases.
+ K->setMetadata(Kind, MDNode::getMostGenericRange(JMD, KMD));
+ break;
+ case LLVMContext::MD_fpmath:
+ K->setMetadata(Kind, MDNode::getMostGenericFPMath(JMD, KMD));
+ break;
+ case LLVMContext::MD_invariant_load:
+ // Only set the !invariant.load if it is present in both instructions.
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_nonnull:
+ // If K does move, keep nonnull if it is present in both instructions.
+ if (DoesKMove)
+ K->setMetadata(Kind, JMD);
+ break;
+ case LLVMContext::MD_invariant_group:
+ // Preserve !invariant.group in K.
+ break;
+ case LLVMContext::MD_align:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ K->setMetadata(Kind,
+ MDNode::getMostGenericAlignmentOrDereferenceable(JMD, KMD));
+ break;
+ case LLVMContext::MD_preserve_access_index:
+ // Preserve !preserve.access.index in K.
+ break;
+ }
+ }
+ // Set !invariant.group from J if J has it. If both instructions have it
+ // then we will just pick it from J - even when they are different.
+ // Also make sure that K is a load or store - e.g. combining bitcast with load
+ // could produce bitcast with invariant.group metadata, which is invalid.
+ // FIXME: we should try to preserve both invariant.group md if they are
+ // different, but right now instruction can only have one invariant.group.
+ if (auto *JMD = J->getMetadata(LLVMContext::MD_invariant_group))
+ if (isa<LoadInst>(K) || isa<StoreInst>(K))
+ K->setMetadata(LLVMContext::MD_invariant_group, JMD);
+}
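+
+// Illustration: when hoisting one of two equivalent loads K and J with
+// DoesKMove=true, !range annotations such as !{i32 0, i32 10} on K and
+// !{i32 5, i32 20} on J are merged into the most generic range covering both,
+// while kinds that are not known to be safely mergeable are simply dropped
+// from K (the constant ranges here are made up for the example).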
+
+void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
+ bool KDominatesJ) {
+ unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_invariant_load, LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
+ combineMetadata(K, J, KnownIDs, KDominatesJ);
+}
+
+void llvm::copyMetadataForLoad(LoadInst &Dest, const LoadInst &Source) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MD;
+ Source.getAllMetadata(MD);
+ MDBuilder MDB(Dest.getContext());
+ Type *NewType = Dest.getType();
+ const DataLayout &DL = Source.getModule()->getDataLayout();
+ for (const auto &MDPair : MD) {
+ unsigned ID = MDPair.first;
+ MDNode *N = MDPair.second;
+ // Note, essentially every kind of metadata should be preserved here! This
+ // routine is supposed to clone a load instruction changing *only its type*.
+ // The only metadata it makes sense to drop is metadata which is invalidated
+ // when the pointer type changes. This should essentially never be the case
+ // in LLVM, but we explicitly switch over only known metadata to be
+ // conservatively correct. If you are adding metadata to LLVM which pertains
+ // to loads, you almost certainly want to add it here.
+ switch (ID) {
+ case LLVMContext::MD_dbg:
+ case LLVMContext::MD_tbaa:
+ case LLVMContext::MD_prof:
+ case LLVMContext::MD_fpmath:
+ case LLVMContext::MD_tbaa_struct:
+ case LLVMContext::MD_invariant_load:
+ case LLVMContext::MD_alias_scope:
+ case LLVMContext::MD_noalias:
+ case LLVMContext::MD_nontemporal:
+ case LLVMContext::MD_mem_parallel_loop_access:
+ case LLVMContext::MD_access_group:
+ // All of these directly apply.
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_nonnull:
+ copyNonnullMetadata(Source, N, Dest);
+ break;
+
+ case LLVMContext::MD_align:
+ case LLVMContext::MD_dereferenceable:
+ case LLVMContext::MD_dereferenceable_or_null:
+ // These only directly apply if the new type is also a pointer.
+ if (NewType->isPointerTy())
+ Dest.setMetadata(ID, N);
+ break;
+
+ case LLVMContext::MD_range:
+ copyRangeMetadata(DL, Source, N, Dest);
+ break;
+ }
+ }
+}
+
+void llvm::patchReplacementInstruction(Instruction *I, Value *Repl) {
+ auto *ReplInst = dyn_cast<Instruction>(Repl);
+ if (!ReplInst)
+ return;
+
+ // Patch the replacement so that it is not more restrictive than the value
+ // being replaced.
+ // Note that if 'I' is a load being replaced by some operation,
+ // for example, by an arithmetic operation, then andIRFlags()
+ // would just erase all math flags from the original arithmetic
+ // operation, which is clearly not wanted and not needed.
+ if (!isa<LoadInst>(I))
+ ReplInst->andIRFlags(I);
+
+ // FIXME: If both the original and replacement value are part of the
+ // same control-flow region (meaning that the execution of one
+ // guarantees the execution of the other), then we can combine the
+ // noalias scopes here and do better than the general conservative
+ // answer used in combineMetadata().
+
+ // In general, GVN unifies expressions over different control-flow
+ // regions, and so we need a conservative combination of the noalias
+ // scopes.
+ static const unsigned KnownIDs[] = {
+ LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
+ LLVMContext::MD_noalias, LLVMContext::MD_range,
+ LLVMContext::MD_fpmath, LLVMContext::MD_invariant_load,
+ LLVMContext::MD_invariant_group, LLVMContext::MD_nonnull,
+ LLVMContext::MD_access_group, LLVMContext::MD_preserve_access_index};
+ combineMetadata(ReplInst, I, KnownIDs, false);
+}
+
+template <typename RootType, typename DominatesFn>
+static unsigned replaceDominatedUsesWith(Value *From, Value *To,
+ const RootType &Root,
+ const DominatesFn &Dominates) {
+ assert(From->getType() == To->getType());
+
+ unsigned Count = 0;
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ if (!Dominates(Root, U))
+ continue;
+ U.set(To);
+ LLVM_DEBUG(dbgs() << "Replace dominated use of '" << From->getName()
+ << "' as " << *To << " in " << *U << "\n");
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceNonLocalUsesWith(Instruction *From, Value *To) {
+ assert(From->getType() == To->getType());
+ auto *BB = From->getParent();
+ unsigned Count = 0;
+
+ for (Value::use_iterator UI = From->use_begin(), UE = From->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *I = cast<Instruction>(U.getUser());
+ if (I->getParent() == BB)
+ continue;
+ U.set(To);
+ ++Count;
+ }
+ return Count;
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlockEdge &Root) {
+ auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) {
+ return DT.dominates(Root, U);
+ };
+ return ::replaceDominatedUsesWith(From, To, Root, Dominates);
+}
+
+unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To,
+ DominatorTree &DT,
+ const BasicBlock *BB) {
+ auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) {
+ auto *I = cast<Instruction>(U.getUser())->getParent();
+ return DT.properlyDominates(BB, I);
+ };
+ return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates);
+}
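For example (the helper and value names are made up for illustration), a pass that proves a value equal to a constant along one edge of a conditional branch can rewrite exactly the uses dominated by that edge and leave all other uses alone:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: on the edge Pred->Succ the branch condition proved X == C, so
    // every use dominated by that edge may use C directly (X and C must have
    // the same type).
    static unsigned propagateEdgeEquality(Value *X, Constant *C,
                                          DominatorTree &DT, BasicBlock *Pred,
                                          BasicBlock *Succ) {
      return replaceDominatedUsesWith(X, C, DT, BasicBlockEdge(Pred, Succ));
    }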
+
+bool llvm::callsGCLeafFunction(const CallBase *Call,
+ const TargetLibraryInfo &TLI) {
+ // Check if the function is specifically marked as a gc leaf function.
+ if (Call->hasFnAttr("gc-leaf-function"))
+ return true;
+ if (const Function *F = Call->getCalledFunction()) {
+ if (F->hasFnAttribute("gc-leaf-function"))
+ return true;
+
if (auto IID = F->getIntrinsicID()) {
- // Most LLVM intrinsics do not take safepoints.
- return IID != Intrinsic::experimental_gc_statepoint &&
+ // Most LLVM intrinsics do not take safepoints.
+ return IID != Intrinsic::experimental_gc_statepoint &&
IID != Intrinsic::experimental_deoptimize &&
IID != Intrinsic::memcpy_element_unordered_atomic &&
IID != Intrinsic::memmove_element_unordered_atomic;
}
- }
-
- // Lib calls can be materialized by some passes, and won't be
- // marked as 'gc-leaf-function.' All available Libcalls are
- // GC-leaf.
- LibFunc LF;
- if (TLI.getLibFunc(*Call, LF)) {
- return TLI.has(LF);
- }
-
- return false;
-}
-
-void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
- LoadInst &NewLI) {
- auto *NewTy = NewLI.getType();
-
- // This only directly applies if the new type is also a pointer.
- if (NewTy->isPointerTy()) {
- NewLI.setMetadata(LLVMContext::MD_nonnull, N);
- return;
- }
-
- // The only other translation we can do is to integral loads with !range
- // metadata.
- if (!NewTy->isIntegerTy())
- return;
-
- MDBuilder MDB(NewLI.getContext());
- const Value *Ptr = OldLI.getPointerOperand();
- auto *ITy = cast<IntegerType>(NewTy);
- auto *NullInt = ConstantExpr::getPtrToInt(
- ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
- auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
- NewLI.setMetadata(LLVMContext::MD_range,
- MDB.createRange(NonNullInt, NullInt));
-}
-
-void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
- MDNode *N, LoadInst &NewLI) {
- auto *NewTy = NewLI.getType();
-
- // Give up unless it is converted to a pointer where there is a single very
- // valuable mapping we can do reliably.
- // FIXME: It would be nice to propagate this in more ways, but the type
- // conversions make it hard.
- if (!NewTy->isPointerTy())
- return;
-
- unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy);
- if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
- MDNode *NN = MDNode::get(OldLI.getContext(), None);
- NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
- }
-}
-
-void llvm::dropDebugUsers(Instruction &I) {
- SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
- findDbgUsers(DbgUsers, &I);
- for (auto *DII : DbgUsers)
- DII->eraseFromParent();
-}
-
-void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
- BasicBlock *BB) {
- // Since we are moving the instructions out of their basic block, we do not
- // retain their original debug locations (DILocations) and debug intrinsic
- // instructions.
- //
- // Doing so would degrade the debugging experience and adversely affect the
- // accuracy of profiling information.
- //
- // Currently, when hoisting the instructions, we take the following actions:
- // - Remove their debug intrinsic instructions.
- // - Set their debug locations to the values from the insertion point.
- //
- // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
- // need to be deleted, is because there will not be any instructions with a
- // DILocation in either branch left after performing the transformation. We
- // can only insert a dbg.value after the two branches are joined again.
- //
- // See PR38762, PR39243 for more details.
- //
- // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
- // encode predicated DIExpressions that yield different results on different
- // code paths.
- for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
- Instruction *I = &*II;
- I->dropUnknownNonDebugMetadata();
- if (I->isUsedByMetadata())
- dropDebugUsers(*I);
- if (isa<DbgInfoIntrinsic>(I)) {
- // Remove DbgInfo Intrinsics.
- II = I->eraseFromParent();
- continue;
- }
- I->setDebugLoc(InsertPt->getDebugLoc());
- ++II;
- }
- DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
- BB->begin(),
- BB->getTerminator()->getIterator());
-}
-
-namespace {
-
-/// A potential constituent of a bitreverse or bswap expression. See
-/// collectBitParts for a fuller explanation.
-struct BitPart {
- BitPart(Value *P, unsigned BW) : Provider(P) {
- Provenance.resize(BW);
- }
-
- /// The Value that this is a bitreverse/bswap of.
- Value *Provider;
-
- /// The "provenance" of each bit. Provenance[A] = B means that bit A
- /// in Provider becomes bit B in the result of this expression.
- SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
-
- enum { Unset = -1 };
-};
-
-} // end anonymous namespace
-
-/// Analyze the specified subexpression and see if it is capable of providing
-/// pieces of a bswap or bitreverse. The subexpression provides a potential
+ }
+
+ // Lib calls can be materialized by some passes, and won't be
+ // marked as 'gc-leaf-function.' All available Libcalls are
+ // GC-leaf.
+ LibFunc LF;
+ if (TLI.getLibFunc(*Call, LF)) {
+ return TLI.has(LF);
+ }
+
+ return false;
+}
+
+void llvm::copyNonnullMetadata(const LoadInst &OldLI, MDNode *N,
+ LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // This only directly applies if the new type is also a pointer.
+ if (NewTy->isPointerTy()) {
+ NewLI.setMetadata(LLVMContext::MD_nonnull, N);
+ return;
+ }
+
+ // The only other translation we can do is to integral loads with !range
+ // metadata.
+ if (!NewTy->isIntegerTy())
+ return;
+
+ MDBuilder MDB(NewLI.getContext());
+ const Value *Ptr = OldLI.getPointerOperand();
+ auto *ITy = cast<IntegerType>(NewTy);
+ auto *NullInt = ConstantExpr::getPtrToInt(
+ ConstantPointerNull::get(cast<PointerType>(Ptr->getType())), ITy);
+ auto *NonNullInt = ConstantExpr::getAdd(NullInt, ConstantInt::get(ITy, 1));
+ NewLI.setMetadata(LLVMContext::MD_range,
+ MDB.createRange(NonNullInt, NullInt));
+}
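Concretely, when the new load is an i64 load and the null pointer value folds to the integer 0 (as it does on common targets), the !nonnull metadata becomes the wrapping range [1, 0), i.e. "any value except zero". A standalone sketch of that node construction, assuming an available LLVMContext Ctx:

    #include "llvm/IR/Constants.h"
    #include "llvm/IR/MDBuilder.h"
    #include "llvm/IR/Type.h"
    using namespace llvm;

    // Builds !range !{ i64 1, i64 0 }: the half-open wrapping interval [1, 0)
    // contains every 64-bit value except 0.
    static MDNode *makeNonNullRange(LLVMContext &Ctx) {
      MDBuilder MDB(Ctx);
      Type *I64 = Type::getInt64Ty(Ctx);
      return MDB.createRange(ConstantInt::get(I64, 1),
                             ConstantInt::get(I64, 0));
    }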
+
+void llvm::copyRangeMetadata(const DataLayout &DL, const LoadInst &OldLI,
+ MDNode *N, LoadInst &NewLI) {
+ auto *NewTy = NewLI.getType();
+
+ // Give up unless it is converted to a pointer where there is a single very
+ // valuable mapping we can do reliably.
+ // FIXME: It would be nice to propagate this in more ways, but the type
+ // conversions make it hard.
+ if (!NewTy->isPointerTy())
+ return;
+
+ unsigned BitWidth = DL.getPointerTypeSizeInBits(NewTy);
+ if (!getConstantRangeFromMetadata(*N).contains(APInt(BitWidth, 0))) {
+ MDNode *NN = MDNode::get(OldLI.getContext(), None);
+ NewLI.setMetadata(LLVMContext::MD_nonnull, NN);
+ }
+}
+
+void llvm::dropDebugUsers(Instruction &I) {
+ SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ for (auto *DII : DbgUsers)
+ DII->eraseFromParent();
+}
+
+void llvm::hoistAllInstructionsInto(BasicBlock *DomBlock, Instruction *InsertPt,
+ BasicBlock *BB) {
+ // Since we are moving the instructions out of their basic block, we do not
+ // retain their original debug locations (DILocations) and debug intrinsic
+ // instructions.
+ //
+ // Doing so would degrade the debugging experience and adversely affect the
+ // accuracy of profiling information.
+ //
+ // Currently, when hoisting the instructions, we take the following actions:
+ // - Remove their debug intrinsic instructions.
+ // - Set their debug locations to the values from the insertion point.
+ //
+ // As per PR39141 (comment #8), the more fundamental reason why the dbg.values
+ // need to be deleted, is because there will not be any instructions with a
+ // DILocation in either branch left after performing the transformation. We
+ // can only insert a dbg.value after the two branches are joined again.
+ //
+ // See PR38762, PR39243 for more details.
+ //
+ // TODO: Extend llvm.dbg.value to take more than one SSA Value (PR39141) to
+ // encode predicated DIExpressions that yield different results on different
+ // code paths.
+ for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); II != IE;) {
+ Instruction *I = &*II;
+ I->dropUnknownNonDebugMetadata();
+ if (I->isUsedByMetadata())
+ dropDebugUsers(*I);
+ if (isa<DbgInfoIntrinsic>(I)) {
+ // Remove DbgInfo Intrinsics.
+ II = I->eraseFromParent();
+ continue;
+ }
+ I->setDebugLoc(InsertPt->getDebugLoc());
+ ++II;
+ }
+ DomBlock->getInstList().splice(InsertPt->getIterator(), BB->getInstList(),
+ BB->begin(),
+ BB->getTerminator()->getIterator());
+}
+
+namespace {
+
+/// A potential constituent of a bitreverse or bswap expression. See
+/// collectBitParts for a fuller explanation.
+struct BitPart {
+ BitPart(Value *P, unsigned BW) : Provider(P) {
+ Provenance.resize(BW);
+ }
+
+ /// The Value that this is a bitreverse/bswap of.
+ Value *Provider;
+
+ /// The "provenance" of each bit. Provenance[A] = B means that bit A
+ /// in Provider becomes bit B in the result of this expression.
+ SmallVector<int8_t, 32> Provenance; // int8_t means max size is i128.
+
+ enum { Unset = -1 };
+};
+
+} // end anonymous namespace
+
+/// Analyze the specified subexpression and see if it is capable of providing
+/// pieces of a bswap or bitreverse. The subexpression provides a potential
/// piece of a bswap or bitreverse if it can be proved that each non-zero bit in
-/// the output of the expression came from a corresponding bit in some other
-/// value. This function is recursive, and the end result is a mapping of
-/// bitnumber to bitnumber. It is the caller's responsibility to validate that
-/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
-///
-/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
-/// that the expression deposits the low byte of %X into the high byte of the
-/// result and that all other bits are zero. This expression is accepted and a
-/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
-/// [0-7].
-///
+/// the output of the expression came from a corresponding bit in some other
+/// value. This function is recursive, and the end result is a mapping of
+/// bitnumber to bitnumber. It is the caller's responsibility to validate that
+/// the bitnumber to bitnumber mapping is correct for a bswap or bitreverse.
+///
+/// For example, if the current subexpression is "(shl i32 %X, 24)" then we know
+/// that the expression deposits the low byte of %X into the high byte of the
+/// result and that all other bits are zero. This expression is accepted and a
+/// BitPart is returned with Provider set to %X and Provenance[24-31] set to
+/// [0-7].
+///
/// For vector types, all analysis is performed at the per-element level. No
/// cross-element analysis is supported (shuffle/insertion/reduction), and all
/// constant masks must be splatted across all elements.
///
-/// To avoid revisiting values, the BitPart results are memoized into the
-/// provided map. To avoid unnecessary copying of BitParts, BitParts are
-/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
-/// store BitParts objects, not pointers. As we need the concept of a nullptr
-/// BitParts (Value has been analyzed and the analysis failed), we use an Optional
-/// type instead to provide the same functionality.
-///
-/// Because we pass around references into \c BPS, we must use a container that
-/// does not invalidate internal references (std::map instead of DenseMap).
-static const Optional<BitPart> &
-collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
- std::map<Value *, Optional<BitPart>> &BPS, int Depth) {
- auto I = BPS.find(V);
- if (I != BPS.end())
- return I->second;
-
- auto &Result = BPS[V] = None;
+/// To avoid revisiting values, the BitPart results are memoized into the
+/// provided map. To avoid unnecessary copying of BitParts, BitParts are
+/// constructed in-place in the \c BPS map. Because of this \c BPS needs to
+/// store BitParts objects, not pointers. As we need the concept of a nullptr
+/// BitParts (Value has been analyzed and the analysis failed), we use an Optional
+/// type instead to provide the same functionality.
+///
+/// Because we pass around references into \c BPS, we must use a container that
+/// does not invalidate internal references (std::map instead of DenseMap).
+static const Optional<BitPart> &
+collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
+ std::map<Value *, Optional<BitPart>> &BPS, int Depth) {
+ auto I = BPS.find(V);
+ if (I != BPS.end())
+ return I->second;
+
+ auto &Result = BPS[V] = None;
auto BitWidth = V->getType()->getScalarSizeInBits();
-
- // Prevent stack overflow by limiting the recursion depth
- if (Depth == BitPartRecursionMaxDepth) {
- LLVM_DEBUG(dbgs() << "collectBitParts max recursion depth reached.\n");
- return Result;
- }
-
+
+ // Prevent stack overflow by limiting the recursion depth
+ if (Depth == BitPartRecursionMaxDepth) {
+ LLVM_DEBUG(dbgs() << "collectBitParts max recursion depth reached.\n");
+ return Result;
+ }
+
if (auto *I = dyn_cast<Instruction>(V)) {
Value *X, *Y;
const APInt *C;
- // If this is an or instruction, it may be an inner node of the bswap.
+ // If this is an or instruction, it may be an inner node of the bswap.
if (match(V, m_Or(m_Value(X), m_Value(Y)))) {
const auto &A =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
const auto &B =
collectBitParts(Y, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!A || !B)
- return Result;
-
- // Try and merge the two together.
- if (!A->Provider || A->Provider != B->Provider)
- return Result;
-
- Result = BitPart(A->Provider, BitWidth);
+ if (!A || !B)
+ return Result;
+
+ // Try and merge the two together.
+ if (!A->Provider || A->Provider != B->Provider)
+ return Result;
+
+ Result = BitPart(A->Provider, BitWidth);
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx) {
if (A->Provenance[BitIdx] != BitPart::Unset &&
B->Provenance[BitIdx] != BitPart::Unset &&
A->Provenance[BitIdx] != B->Provenance[BitIdx])
- return Result = None;
-
+ return Result = None;
+
if (A->Provenance[BitIdx] == BitPart::Unset)
Result->Provenance[BitIdx] = B->Provenance[BitIdx];
- else
+ else
Result->Provenance[BitIdx] = A->Provenance[BitIdx];
- }
-
- return Result;
- }
-
- // If this is a logical shift by a constant, recurse then shift the result.
+ }
+
+ return Result;
+ }
+
+ // If this is a logical shift by a constant, recurse then shift the result.
if (match(V, m_LogicalShift(m_Value(X), m_APInt(C)))) {
const APInt &BitShift = *C;
- // Ensure the shift amount is defined.
+ // Ensure the shift amount is defined.
if (BitShift.uge(BitWidth))
- return Result;
-
+ return Result;
+
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
- Result = Res;
-
- // Perform the "shift" on BitProvenance.
- auto &P = Result->Provenance;
- if (I->getOpcode() == Instruction::Shl) {
+ if (!Res)
+ return Result;
+ Result = Res;
+
+ // Perform the "shift" on BitProvenance.
+ auto &P = Result->Provenance;
+ if (I->getOpcode() == Instruction::Shl) {
P.erase(std::prev(P.end(), BitShift.getZExtValue()), P.end());
P.insert(P.begin(), BitShift.getZExtValue(), BitPart::Unset);
- } else {
+ } else {
P.erase(P.begin(), std::next(P.begin(), BitShift.getZExtValue()));
P.insert(P.end(), BitShift.getZExtValue(), BitPart::Unset);
- }
-
- return Result;
- }
-
- // If this is a logical 'and' with a mask that clears bits, recurse then
- // unset the appropriate bits.
+ }
+
+ return Result;
+ }
+
+ // If this is a logical 'and' with a mask that clears bits, recurse then
+ // unset the appropriate bits.
if (match(V, m_And(m_Value(X), m_APInt(C)))) {
const APInt &AndMask = *C;
-
- // Check that the mask allows a multiple of 8 bits for a bswap, for an
- // early exit.
- unsigned NumMaskedBits = AndMask.countPopulation();
+
+ // Check that the mask allows a multiple of 8 bits for a bswap, for an
+ // early exit.
+ unsigned NumMaskedBits = AndMask.countPopulation();
if (!MatchBitReversals && (NumMaskedBits % 8) != 0)
- return Result;
-
+ return Result;
+
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
- Result = Res;
-
+ if (!Res)
+ return Result;
+ Result = Res;
+
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx)
- // If the AndMask is zero for this bit, clear the bit.
+ // If the AndMask is zero for this bit, clear the bit.
if (AndMask[BitIdx] == 0)
Result->Provenance[BitIdx] = BitPart::Unset;
- return Result;
- }
-
- // If this is a zext instruction zero extend the result.
+ return Result;
+ }
+
+ // If this is a zext instruction zero extend the result.
if (match(V, m_ZExt(m_Value(X)))) {
const auto &Res =
collectBitParts(X, MatchBSwaps, MatchBitReversals, BPS, Depth + 1);
- if (!Res)
- return Result;
-
- Result = BitPart(Res->Provider, BitWidth);
+ if (!Res)
+ return Result;
+
+ Result = BitPart(Res->Provider, BitWidth);
auto NarrowBitWidth = X->getType()->getScalarSizeInBits();
for (unsigned BitIdx = 0; BitIdx < NarrowBitWidth; ++BitIdx)
Result->Provenance[BitIdx] = Res->Provenance[BitIdx];
for (unsigned BitIdx = NarrowBitWidth; BitIdx < BitWidth; ++BitIdx)
Result->Provenance[BitIdx] = BitPart::Unset;
- return Result;
- }
+ return Result;
+ }
  // BITREVERSE - most likely due to us previously matching a partial
// bitreverse.
@@ -3037,58 +3037,58 @@ collectBitParts(Value *V, bool MatchBSwaps, bool MatchBitReversals,
Result->Provenance[BitIdx] = RHS->Provenance[BitIdx + StartBitRHS];
return Result;
}
- }
-
- // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
- // the input value to the bswap/bitreverse.
- Result = BitPart(V, BitWidth);
+ }
+
+ // Okay, we got to something that isn't a shift, 'or' or 'and'. This must be
+ // the input value to the bswap/bitreverse.
+ Result = BitPart(V, BitWidth);
for (unsigned BitIdx = 0; BitIdx < BitWidth; ++BitIdx)
Result->Provenance[BitIdx] = BitIdx;
- return Result;
-}
-
-static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
- unsigned BitWidth) {
- if (From % 8 != To % 8)
- return false;
- // Convert from bit indices to byte indices and check for a byte reversal.
- From >>= 3;
- To >>= 3;
- BitWidth >>= 3;
- return From == BitWidth - To - 1;
-}
-
-static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
- unsigned BitWidth) {
- return From == BitWidth - To - 1;
-}
-
-bool llvm::recognizeBSwapOrBitReverseIdiom(
- Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
- SmallVectorImpl<Instruction *> &InsertedInsts) {
- if (Operator::getOpcode(I) != Instruction::Or)
- return false;
- if (!MatchBSwaps && !MatchBitReversals)
- return false;
+ return Result;
+}
+
+static bool bitTransformIsCorrectForBSwap(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ if (From % 8 != To % 8)
+ return false;
+ // Convert from bit indices to byte indices and check for a byte reversal.
+ From >>= 3;
+ To >>= 3;
+ BitWidth >>= 3;
+ return From == BitWidth - To - 1;
+}
+
+static bool bitTransformIsCorrectForBitReverse(unsigned From, unsigned To,
+ unsigned BitWidth) {
+ return From == BitWidth - To - 1;
+}
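A couple of worked data points for the two predicates above (illustrative only; both functions are file-local, so this is given as commentary rather than as a caller):

    // i32 bswap reverses bytes, so bit 0 pairs with bit 24: both are bit 0 of
    // their byte, and byte 0 maps to byte 3 of 4.
    //   bitTransformIsCorrectForBSwap(0, 24, 32)      -> true
    //   bitTransformIsCorrectForBSwap(0, 25, 32)      -> false (0 % 8 != 25 % 8)
    // i32 bitreverse mirrors single bits, so bit 0 pairs with bit 31.
    //   bitTransformIsCorrectForBitReverse(0, 31, 32) -> true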
+
+bool llvm::recognizeBSwapOrBitReverseIdiom(
+ Instruction *I, bool MatchBSwaps, bool MatchBitReversals,
+ SmallVectorImpl<Instruction *> &InsertedInsts) {
+ if (Operator::getOpcode(I) != Instruction::Or)
+ return false;
+ if (!MatchBSwaps && !MatchBitReversals)
+ return false;
Type *ITy = I->getType();
if (!ITy->isIntOrIntVectorTy() || ITy->getScalarSizeInBits() > 128)
return false; // Can't do integer/elements > 128 bits.
-
+
Type *DemandedTy = ITy;
if (I->hasOneUse())
if (auto *Trunc = dyn_cast<TruncInst>(I->user_back()))
DemandedTy = Trunc->getType();
-
- // Try to find all the pieces corresponding to the bswap.
- std::map<Value *, Optional<BitPart>> BPS;
- auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS, 0);
- if (!Res)
- return false;
+
+ // Try to find all the pieces corresponding to the bswap.
+ std::map<Value *, Optional<BitPart>> BPS;
+ auto Res = collectBitParts(I, MatchBSwaps, MatchBitReversals, BPS, 0);
+ if (!Res)
+ return false;
ArrayRef<int8_t> BitProvenance = Res->Provenance;
assert(all_of(BitProvenance,
[](int8_t I) { return I == BitPart::Unset || 0 <= I; }) &&
"Illegal bit provenance index");
-
+
// If the upper bits are zero, then attempt to perform as a truncated op.
if (BitProvenance.back() == BitPart::Unset) {
while (!BitProvenance.empty() && BitProvenance.back() == BitPart::Unset)
@@ -3105,8 +3105,8 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
if (DemandedBW > ITy->getScalarSizeInBits())
return false;
- // Now, is the bit permutation correct for a bswap or a bitreverse? We can
- // only byteswap values with an even number of bytes.
+ // Now, is the bit permutation correct for a bswap or a bitreverse? We can
+ // only byteswap values with an even number of bytes.
APInt DemandedMask = APInt::getAllOnesValue(DemandedBW);
bool OKForBSwap = MatchBSwaps && (DemandedBW % 16) == 0;
bool OKForBitReverse = MatchBitReversals;
@@ -3120,16 +3120,16 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
DemandedBW);
OKForBitReverse &= bitTransformIsCorrectForBitReverse(BitProvenance[BitIdx],
BitIdx, DemandedBW);
- }
-
- Intrinsic::ID Intrin;
+ }
+
+ Intrinsic::ID Intrin;
if (OKForBSwap)
- Intrin = Intrinsic::bswap;
+ Intrin = Intrinsic::bswap;
else if (OKForBitReverse)
- Intrin = Intrinsic::bitreverse;
- else
- return false;
-
+ Intrin = Intrinsic::bitreverse;
+ else
+ return false;
+
Function *F = Intrinsic::getDeclaration(I->getModule(), Intrin, DemandedTy);
Value *Provider = Res->Provider;
@@ -3153,130 +3153,130 @@ bool llvm::recognizeBSwapOrBitReverseIdiom(
// We may need to zeroextend back to the result type.
if (ITy != Result->getType()) {
auto *ExtInst = CastInst::CreateIntegerCast(Result, ITy, false, "zext", I);
- InsertedInsts.push_back(ExtInst);
- }
-
- return true;
-}
-
-// CodeGen has special handling for some string functions that may replace
-// them with target-specific intrinsics. Since that'd skip our interceptors
-// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
-// we mark affected calls as NoBuiltin, which will disable optimization
-// in CodeGen.
-void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
- CallInst *CI, const TargetLibraryInfo *TLI) {
- Function *F = CI->getCalledFunction();
- LibFunc Func;
- if (F && !F->hasLocalLinkage() && F->hasName() &&
- TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
- !F->doesNotAccessMemory())
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
-}
-
-bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
- // We can't have a PHI with a metadata type.
- if (I->getOperand(OpIdx)->getType()->isMetadataTy())
- return false;
-
- // Early exit.
- if (!isa<Constant>(I->getOperand(OpIdx)))
- return true;
-
- switch (I->getOpcode()) {
- default:
- return true;
- case Instruction::Call:
- case Instruction::Invoke: {
- const auto &CB = cast<CallBase>(*I);
-
- // Can't handle inline asm. Skip it.
- if (CB.isInlineAsm())
- return false;
-
- // Constant bundle operands may need to retain their constant-ness for
- // correctness.
- if (CB.isBundleOperand(OpIdx))
- return false;
-
- if (OpIdx < CB.getNumArgOperands()) {
- // Some variadic intrinsics require constants in the variadic arguments,
- // which currently aren't markable as immarg.
- if (isa<IntrinsicInst>(CB) &&
- OpIdx >= CB.getFunctionType()->getNumParams()) {
- // This is known to be OK for stackmap.
- return CB.getIntrinsicID() == Intrinsic::experimental_stackmap;
- }
-
- // gcroot is a special case, since it requires a constant argument which
- // isn't also required to be a simple ConstantInt.
- if (CB.getIntrinsicID() == Intrinsic::gcroot)
- return false;
-
- // Some intrinsic operands are required to be immediates.
- return !CB.paramHasAttr(OpIdx, Attribute::ImmArg);
- }
-
- // It is never allowed to replace the call argument to an intrinsic, but it
- // may be possible for a call.
- return !isa<IntrinsicInst>(CB);
- }
- case Instruction::ShuffleVector:
- // Shufflevector masks are constant.
- return OpIdx != 2;
- case Instruction::Switch:
- case Instruction::ExtractValue:
- // All operands apart from the first are constant.
- return OpIdx == 0;
- case Instruction::InsertValue:
- // All operands apart from the first and the second are constant.
- return OpIdx < 2;
- case Instruction::Alloca:
- // Static allocas (constant size in the entry block) are handled by
- // prologue/epilogue insertion so they're free anyway. We definitely don't
- // want to make them non-constant.
- return !cast<AllocaInst>(I)->isStaticAlloca();
- case Instruction::GetElementPtr:
- if (OpIdx == 0)
- return true;
- gep_type_iterator It = gep_type_begin(I);
- for (auto E = std::next(It, OpIdx); It != E; ++It)
- if (It.isStruct())
- return false;
- return true;
- }
-}
-
-Value *llvm::invertCondition(Value *Condition) {
- // First: Check if it's a constant
- if (Constant *C = dyn_cast<Constant>(Condition))
- return ConstantExpr::getNot(C);
-
- // Second: If the condition is already inverted, return the original value
- Value *NotCondition;
- if (match(Condition, m_Not(m_Value(NotCondition))))
- return NotCondition;
-
- BasicBlock *Parent = nullptr;
- Instruction *Inst = dyn_cast<Instruction>(Condition);
- if (Inst)
- Parent = Inst->getParent();
- else if (Argument *Arg = dyn_cast<Argument>(Condition))
- Parent = &Arg->getParent()->getEntryBlock();
- assert(Parent && "Unsupported condition to invert");
-
- // Third: Check all the users for an invert
- for (User *U : Condition->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
- return I;
-
- // Last option: Create a new instruction
- auto *Inverted =
- BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
- if (Inst && !isa<PHINode>(Inst))
- Inverted->insertAfter(Inst);
- else
- Inverted->insertBefore(&*Parent->getFirstInsertionPt());
- return Inverted;
-}
+ InsertedInsts.push_back(ExtInst);
+ }
+
+ return true;
+}
+
+// CodeGen has special handling for some string functions that may replace
+// them with target-specific intrinsics. Since that'd skip our interceptors
+// in ASan/MSan/TSan/DFSan, and thus make us miss some memory accesses,
+// we mark affected calls as NoBuiltin, which will disable optimization
+// in CodeGen.
+void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
+ CallInst *CI, const TargetLibraryInfo *TLI) {
+ Function *F = CI->getCalledFunction();
+ LibFunc Func;
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
+ !F->doesNotAccessMemory())
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+}
+
+bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
+ // We can't have a PHI with a metadata type.
+ if (I->getOperand(OpIdx)->getType()->isMetadataTy())
+ return false;
+
+ // Early exit.
+ if (!isa<Constant>(I->getOperand(OpIdx)))
+ return true;
+
+ switch (I->getOpcode()) {
+ default:
+ return true;
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ const auto &CB = cast<CallBase>(*I);
+
+ // Can't handle inline asm. Skip it.
+ if (CB.isInlineAsm())
+ return false;
+
+ // Constant bundle operands may need to retain their constant-ness for
+ // correctness.
+ if (CB.isBundleOperand(OpIdx))
+ return false;
+
+ if (OpIdx < CB.getNumArgOperands()) {
+ // Some variadic intrinsics require constants in the variadic arguments,
+ // which currently aren't markable as immarg.
+ if (isa<IntrinsicInst>(CB) &&
+ OpIdx >= CB.getFunctionType()->getNumParams()) {
+ // This is known to be OK for stackmap.
+ return CB.getIntrinsicID() == Intrinsic::experimental_stackmap;
+ }
+
+ // gcroot is a special case, since it requires a constant argument which
+ // isn't also required to be a simple ConstantInt.
+ if (CB.getIntrinsicID() == Intrinsic::gcroot)
+ return false;
+
+ // Some intrinsic operands are required to be immediates.
+ return !CB.paramHasAttr(OpIdx, Attribute::ImmArg);
+ }
+
+ // It is never allowed to replace the call argument to an intrinsic, but it
+ // may be possible for a call.
+ return !isa<IntrinsicInst>(CB);
+ }
+ case Instruction::ShuffleVector:
+ // Shufflevector masks are constant.
+ return OpIdx != 2;
+ case Instruction::Switch:
+ case Instruction::ExtractValue:
+ // All operands apart from the first are constant.
+ return OpIdx == 0;
+ case Instruction::InsertValue:
+ // All operands apart from the first and the second are constant.
+ return OpIdx < 2;
+ case Instruction::Alloca:
+ // Static allocas (constant size in the entry block) are handled by
+ // prologue/epilogue insertion so they're free anyway. We definitely don't
+ // want to make them non-constant.
+ return !cast<AllocaInst>(I)->isStaticAlloca();
+ case Instruction::GetElementPtr:
+ if (OpIdx == 0)
+ return true;
+ gep_type_iterator It = gep_type_begin(I);
+ for (auto E = std::next(It, OpIdx); It != E; ++It)
+ if (It.isStruct())
+ return false;
+ return true;
+ }
+}
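A small illustrative caller (hypothetical, not part of this file): a sinking-style transform that wants to merge two similar instructions first collects the operand positions it would be allowed to turn into a PHI of differing values:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instruction.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: operands rejected by canReplaceOperandWithVariable (immarg
    // operands, shuffle masks, bundle operands, ...) must stay identical
    // across the instructions being merged.
    static SmallVector<unsigned, 4> mergeableOperandIndices(const Instruction *I) {
      SmallVector<unsigned, 4> Idxs;
      for (unsigned Op = 0, E = I->getNumOperands(); Op != E; ++Op)
        if (canReplaceOperandWithVariable(I, Op))
          Idxs.push_back(Op);
      return Idxs;
    }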
+
+Value *llvm::invertCondition(Value *Condition) {
+ // First: Check if it's a constant
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
+
+ // Second: If the condition is already inverted, return the original value
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
+
+ BasicBlock *Parent = nullptr;
+ Instruction *Inst = dyn_cast<Instruction>(Condition);
+ if (Inst)
+ Parent = Inst->getParent();
+ else if (Argument *Arg = dyn_cast<Argument>(Condition))
+ Parent = &Arg->getParent()->getEntryBlock();
+ assert(Parent && "Unsupported condition to invert");
+
+ // Third: Check all the users for an invert
+ for (User *U : Condition->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+ return I;
+
+ // Last option: Create a new instruction
+ auto *Inverted =
+ BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
+ if (Inst && !isa<PHINode>(Inst))
+ Inverted->insertAfter(Inst);
+ else
+ Inverted->insertBefore(&*Parent->getFirstInsertionPt());
+ return Inverted;
+}
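A minimal usage sketch for invertCondition() (the transform below is hypothetical): flipping a conditional branch can reuse an existing inverse of the condition instead of stacking fresh xor instructions, and then swap the branch targets so behaviour is unchanged:

    #include <cassert>
    #include "llvm/IR/Instructions.h"
    #include "llvm/Transforms/Utils/Local.h"
    using namespace llvm;

    // Sketch: branch on the inverted condition and swap the two successors.
    static void invertBranch(BranchInst &BI) {
      assert(BI.isConditional() && "expected a conditional branch");
      BI.setCondition(invertCondition(BI.getCondition()));
      BI.swapSuccessors();
    }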
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
index f83c968e91..b678efdc8d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -1,406 +1,406 @@
-//===----------------- LoopRotationUtils.cpp -----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides utilities to convert a loop into a loop with bottom test.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopRotationUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+//===----------------- LoopRotationUtils.cpp -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides utilities to convert a loop into a loop with bottom test.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopRotationUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-rotate"
-
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-rotate"
+
STATISTIC(NumNotRotatedDueToHeaderSize,
"Number of loops not rotated due to the header size");
-STATISTIC(NumRotated, "Number of loops rotated");
-
-static cl::opt<bool>
- MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
- cl::desc("Allow loop rotation multiple times in order to reach "
- "a better latch exit"));
-
-namespace {
-/// A simple loop rotation transformation.
-class LoopRotate {
- const unsigned MaxHeaderSize;
- LoopInfo *LI;
- const TargetTransformInfo *TTI;
- AssumptionCache *AC;
- DominatorTree *DT;
- ScalarEvolution *SE;
- MemorySSAUpdater *MSSAU;
- const SimplifyQuery &SQ;
- bool RotationOnly;
- bool IsUtilMode;
+STATISTIC(NumRotated, "Number of loops rotated");
+
+static cl::opt<bool>
+ MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
+ cl::desc("Allow loop rotation multiple times in order to reach "
+ "a better latch exit"));
+
+namespace {
+/// A simple loop rotation transformation.
+class LoopRotate {
+ const unsigned MaxHeaderSize;
+ LoopInfo *LI;
+ const TargetTransformInfo *TTI;
+ AssumptionCache *AC;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ MemorySSAUpdater *MSSAU;
+ const SimplifyQuery &SQ;
+ bool RotationOnly;
+ bool IsUtilMode;
bool PrepareForLTO;
-
-public:
- LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+
+public:
+ LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ DominatorTree *DT, ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
const SimplifyQuery &SQ, bool RotationOnly, bool IsUtilMode,
bool PrepareForLTO)
- : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
- MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
+ : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE),
+ MSSAU(MSSAU), SQ(SQ), RotationOnly(RotationOnly),
IsUtilMode(IsUtilMode), PrepareForLTO(PrepareForLTO) {}
- bool processLoop(Loop *L);
-
-private:
- bool rotateLoop(Loop *L, bool SimplifiedLatch);
- bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
-/// previously exist in the map, and the value was inserted.
-static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {
- bool Inserted = VM.insert({K, V}).second;
- assert(Inserted);
- (void)Inserted;
-}
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader. If there were uses of the values produced by
-/// these instructions that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values. Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
- BasicBlock *OrigPreheader,
- ValueToValueMapTy &ValueMap,
- SmallVectorImpl<PHINode*> *InsertedPHIs) {
- // Remove PHI node entries that are no longer live.
- BasicBlock::iterator I, E = OrigHeader->end();
- for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
- PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
- // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
- // as necessary.
- SSAUpdater SSA(InsertedPHIs);
- for (I = OrigHeader->begin(); I != E; ++I) {
- Value *OrigHeaderVal = &*I;
-
- // If there are no uses of the value (e.g. because it returns void), there
- // is nothing to rewrite.
- if (OrigHeaderVal->use_empty())
- continue;
-
- Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
- // The value now exists in two versions: the initial value in the preheader
- // and the loop "next" value in the original header.
- SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
- SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
- SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
- // Visit each use of the OrigHeader instruction.
- for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
- UE = OrigHeaderVal->use_end();
- UI != UE;) {
- // Grab the use before incrementing the iterator.
- Use &U = *UI;
-
- // Increment the iterator before removing the use from the list.
- ++UI;
-
- // SSAUpdater can't handle a non-PHI use in the same block as an
- // earlier def. We can easily handle those cases manually.
- Instruction *UserInst = cast<Instruction>(U.getUser());
- if (!isa<PHINode>(UserInst)) {
- BasicBlock *UserBB = UserInst->getParent();
-
- // The original users in the OrigHeader are already using the
- // original definitions.
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped.
- if (UserBB == OrigPreheader) {
- U = OrigPreHeaderVal;
- continue;
- }
- }
-
- // Anything else can be handled by SSAUpdater.
- SSA.RewriteUse(U);
- }
-
- // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
- // intrinsics.
- SmallVector<DbgValueInst *, 1> DbgValues;
- llvm::findDbgValues(DbgValues, OrigHeaderVal);
- for (auto &DbgValue : DbgValues) {
- // The original users in the OrigHeader are already using the original
- // definitions.
- BasicBlock *UserBB = DbgValue->getParent();
- if (UserBB == OrigHeader)
- continue;
-
- // Users in the OrigPreHeader need to use the value to which the
- // original definitions are mapped and anything else can be handled by
- // the SSAUpdater. To avoid adding PHINodes, check if the value is
- // available in UserBB, if not substitute undef.
- Value *NewVal;
- if (UserBB == OrigPreheader)
- NewVal = OrigPreHeaderVal;
- else if (SSA.HasValueForBlock(UserBB))
- NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
- else
- NewVal = UndefValue::get(OrigHeaderVal->getType());
- DbgValue->setOperand(0,
- MetadataAsValue::get(OrigHeaderVal->getContext(),
- ValueAsMetadata::get(NewVal)));
- }
- }
-}
-
-// Assuming both header and latch are exiting, look for a phi which is only
-// used outside the loop (via a LCSSA phi) in the exit from the header.
-// This means that rotating the loop can remove the phi.
-static bool profitableToRotateLoopExitingLatch(Loop *L) {
- BasicBlock *Header = L->getHeader();
- BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
- assert(BI && BI->isConditional() && "need header with conditional exit");
- BasicBlock *HeaderExit = BI->getSuccessor(0);
- if (L->contains(HeaderExit))
- HeaderExit = BI->getSuccessor(1);
-
- for (auto &Phi : Header->phis()) {
- // Look for uses of this phi in the loop/via exits other than the header.
- if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
- return cast<Instruction>(U)->getParent() != HeaderExit;
- }))
- continue;
- return true;
- }
- return false;
-}
-
-// Check that latch exit is deoptimizing (which means - very unlikely to happen)
-// and there is another exit from the loop which is non-deoptimizing.
-// If we rotate latch to that exit our loop has a better chance of being fully
-// canonical.
-//
-// It can give false positives in some rare cases.
-static bool canRotateDeoptimizingLatchExit(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "need latch");
- BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
- // Need normal exiting latch.
- if (!BI || !BI->isConditional())
- return false;
-
- BasicBlock *Exit = BI->getSuccessor(1);
- if (L->contains(Exit))
- Exit = BI->getSuccessor(0);
-
- // Latch exit is non-deoptimizing, no need to rotate.
- if (!Exit->getPostdominatingDeoptimizeCall())
- return false;
-
- SmallVector<BasicBlock *, 4> Exits;
- L->getUniqueExitBlocks(Exits);
- if (!Exits.empty()) {
- // There is at least one non-deoptimizing exit.
- //
- // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
- // as it can conservatively return false for deoptimizing exits with
- // complex enough control flow down to deoptimize call.
- //
- // That means here we can report success for a case where
- // all exits are deoptimizing but one of them has complex enough
- // control flow (e.g. with loops).
- //
- // That should be a very rare case and false positives for this function
- // have compile-time effect only.
- return any_of(Exits, [](const BasicBlock *BB) {
- return !BB->getPostdominatingDeoptimizeCall();
- });
- }
- return false;
-}
-
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-///
-/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
-/// so as to reach a suitable (non-deoptimizing) exit.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
- // If the loop has only one block then there is not much to rotate.
- if (L->getBlocks().size() == 1)
- return false;
-
- bool Rotated = false;
- do {
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return Rotated;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return Rotated;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return Rotated;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified. Or if we think it will be profitable.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
- !profitableToRotateLoopExitingLatch(L) &&
- !canRotateDeoptimizingLatchExit(L))
- return Rotated;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
+ bool processLoop(Loop *L);
+
+private:
+ bool rotateLoop(Loop *L, bool SimplifiedLatch);
+ bool simplifyLoopLatch(Loop *L);
+};
+} // end anonymous namespace
+
+/// Insert (K, V) pair into the ValueToValueMap, and verify the key did not
+/// previously exist in the map, and the value was inserted.
+static void InsertNewValueIntoMap(ValueToValueMapTy &VM, Value *K, Value *V) {
+ bool Inserted = VM.insert({K, V}).second;
+ assert(Inserted);
+ (void)Inserted;
+}
+/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
+/// old header into the preheader. If there were uses of the values produced by
+/// these instructions that were outside of the loop, we have to insert PHI nodes
+/// to merge the two values. Do this now.
+static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
+ BasicBlock *OrigPreheader,
+ ValueToValueMapTy &ValueMap,
+ SmallVectorImpl<PHINode*> *InsertedPHIs) {
+ // Remove PHI node entries that are no longer live.
+ BasicBlock::iterator I, E = OrigHeader->end();
+ for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
+
+ // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
+ // as necessary.
+ SSAUpdater SSA(InsertedPHIs);
+ for (I = OrigHeader->begin(); I != E; ++I) {
+ Value *OrigHeaderVal = &*I;
+
+ // If there are no uses of the value (e.g. because it returns void), there
+ // is nothing to rewrite.
+ if (OrigHeaderVal->use_empty())
+ continue;
+
+ Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
+
+ // The value now exists in two versions: the initial value in the preheader
+ // and the loop "next" value in the original header.
+ SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
+ SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
+ SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
+
+ // Visit each use of the OrigHeader instruction.
+ for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
+ UE = OrigHeaderVal->use_end();
+ UI != UE;) {
+ // Grab the use before incrementing the iterator.
+ Use &U = *UI;
+
+ // Increment the iterator before removing the use from the list.
+ ++UI;
+
+ // SSAUpdater can't handle a non-PHI use in the same block as an
+ // earlier def. We can easily handle those cases manually.
+ Instruction *UserInst = cast<Instruction>(U.getUser());
+ if (!isa<PHINode>(UserInst)) {
+ BasicBlock *UserBB = UserInst->getParent();
+
+ // The original users in the OrigHeader are already using the
+ // original definitions.
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped.
+ if (UserBB == OrigPreheader) {
+ U = OrigPreHeaderVal;
+ continue;
+ }
+ }
+
+ // Anything else can be handled by SSAUpdater.
+ SSA.RewriteUse(U);
+ }
+
+ // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
+ // intrinsics.
+ SmallVector<DbgValueInst *, 1> DbgValues;
+ llvm::findDbgValues(DbgValues, OrigHeaderVal);
+ for (auto &DbgValue : DbgValues) {
+ // The original users in the OrigHeader are already using the original
+ // definitions.
+ BasicBlock *UserBB = DbgValue->getParent();
+ if (UserBB == OrigHeader)
+ continue;
+
+ // Users in the OrigPreHeader need to use the value to which the
+ // original definitions are mapped and anything else can be handled by
+ // the SSAUpdater. To avoid adding PHINodes, check if the value is
+ // available in UserBB, if not substitute undef.
+ Value *NewVal;
+ if (UserBB == OrigPreheader)
+ NewVal = OrigPreHeaderVal;
+ else if (SSA.HasValueForBlock(UserBB))
+ NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
+ else
+ NewVal = UndefValue::get(OrigHeaderVal->getType());
+ DbgValue->setOperand(0,
+ MetadataAsValue::get(OrigHeaderVal->getContext(),
+ ValueAsMetadata::get(NewVal)));
+ }
+ }
+}
+
+// Assuming both header and latch are exiting, look for a phi which is only
+// used outside the loop (via a LCSSA phi) in the exit from the header.
+// This means that rotating the loop can remove the phi.
+static bool profitableToRotateLoopExitingLatch(Loop *L) {
+ BasicBlock *Header = L->getHeader();
+ BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
+ assert(BI && BI->isConditional() && "need header with conditional exit");
+ BasicBlock *HeaderExit = BI->getSuccessor(0);
+ if (L->contains(HeaderExit))
+ HeaderExit = BI->getSuccessor(1);
+
+ for (auto &Phi : Header->phis()) {
+ // Look for uses of this phi in the loop/via exits other than the header.
+ if (llvm::any_of(Phi.users(), [HeaderExit](const User *U) {
+ return cast<Instruction>(U)->getParent() != HeaderExit;
+ }))
+ continue;
+ return true;
+ }
+ return false;
+}
+
+// Check that latch exit is deoptimizing (which means - very unlikely to happen)
+// and there is another exit from the loop which is non-deoptimizing.
+// If we rotate latch to that exit our loop has a better chance of being fully
+// canonical.
+//
+// It can give false positives in some rare cases.
+static bool canRotateDeoptimizingLatchExit(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "need latch");
+ BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+ // Need normal exiting latch.
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Exit = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ Exit = BI->getSuccessor(0);
+
+ // Latch exit is non-deoptimizing, no need to rotate.
+ if (!Exit->getPostdominatingDeoptimizeCall())
+ return false;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ if (!Exits.empty()) {
+ // There is at least one non-deoptimizing exit.
+ //
+ // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
+ // as it can conservatively return false for deoptimizing exits with
+ // complex enough control flow down to deoptimize call.
+ //
+ // That means here we can report success for a case where
+ // all exits are deoptimizing but one of them has complex enough
+ // control flow (e.g. with loops).
+ //
+ // That should be a very rare case and false positives for this function
+ // have compile-time effect only.
+ return any_of(Exits, [](const BasicBlock *BB) {
+ return !BB->getPostdominatingDeoptimizeCall();
+ });
+ }
+ return false;
+}
+
+/// Rotate loop LP. Return true if the loop is rotated.
+///
+/// \param SimplifiedLatch is true if the latch was just folded into the final
+/// loop exit. In this case we may want to rotate even though the new latch is
+/// now an exiting branch. This rotation would have happened had the latch not
+/// been simplified. However, if SimplifiedLatch is false, then we avoid
+/// rotating loops in which the latch exits to avoid excessive or endless
+/// rotation. LoopRotate should be repeatable and converge to a canonical
+/// form. This property is satisfied because simplifying the loop latch can only
+/// happen once across multiple invocations of the LoopRotate pass.
+///
+/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
+/// so as to reach a suitable (non-deoptimizing) exit.
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+ // If the loop has only one block then there is not much to rotate.
+ if (L->getBlocks().size() == 1)
+ return false;
+
+ bool Rotated = false;
+ do {
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return Rotated;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return Rotated;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return Rotated;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified. Or if we think it will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !profitableToRotateLoopExitingLatch(L) &&
+ !canRotateDeoptimizingLatchExit(L))
+ return Rotated;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues, PrepareForLTO);
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(
- dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.convergent) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return Rotated;
- }
- if (Metrics.NumInsts > MaxHeaderSize) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
- << Metrics.NumInsts
- << " instructions, which is more than the threshold ("
- << MaxHeaderSize << " instructions): ";
- L->dump());
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.convergent) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
+ << Metrics.NumInsts
+ << " instructions, which is more than the threshold ("
+ << MaxHeaderSize << " instructions): ";
+ L->dump());
++NumNotRotatedDueToHeaderSize;
- return Rotated;
- }
+ return Rotated;
+ }
// When preparing for LTO, avoid rotating loops with calls that could be
// inlined during the LTO stage.
if (PrepareForLTO && Metrics.NumInlineCandidates > 0)
return Rotated;
- }
-
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
-
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader || !L->hasDedicatedExits())
- return Rotated;
-
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated. We should also invalidate
- // all outer loops because insertion and deletion of blocks that happens
- // during the rotation may violate invariants related to backedge taken
- // infos in them.
- if (SE)
- SE->forgetTopmostLoop(L);
-
- LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Find the new loop header. NewHeader is the header's one and only
- // successor that is inside the loop; the header's other successor is outside
- // the loop. Otherwise the loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap, ValueMapMSSA;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- InsertNewValueIntoMap(ValueMap, PN,
- PN->getIncomingValueForBlock(OrigPreheader));
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
-
+ }
+
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return Rotated;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Find the new loop header. NewHeader is the header's one and only
+ // successor that is inside the loop; the header's other successor is outside
+ // the loop. Otherwise the loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
+
// Remember the local noalias scope declarations in the header. After the
// rotation, they must be duplicated and the scope must be cloned. This
// avoids unwanted interaction across iterations.
@@ -409,66 +409,66 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
NoAliasDeclInstructions.push_back(Decl);
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
- }
-
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
- }
-
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifiable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
- }
- } else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
- if (MSSAU)
- InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
- }
- }
-
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
+
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
+
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
+
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifiable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
+ }
+
if (!NoAliasDeclInstructions.empty()) {
// There are noalias scope declarations:
// (general):
@@ -532,300 +532,300 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
}
}
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- for (BasicBlock *SuccBB : successors(OrigHeader))
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
- // Update MemorySSA before the rewrite call below changes the 1:1
- // instruction:cloned_instruction_or_value mapping.
- if (MSSAU) {
- InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
- MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
- ValueMapMSSA);
- }
-
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the edge to the OrigHeader that got removed.
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
-
- if (MSSAU) {
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) {
+ InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+ MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+ ValueMapMSSA);
+ }
+
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+ // the DT about the edge to the OrigHeader that got removed.
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+
+ if (MSSAU) {
MSSAU->applyUpdates(Updates, *DT, /*UpdateDT=*/true);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
} else {
DT->applyUpdates(Updates);
- }
- }
-
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- ExitPred->getTerminator()->isIndirectTerminator())
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
-
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
- }
-
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ }
+ }
+
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit) ||
+ ExitPred->getTerminator()->isIndirectTerminator())
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
+ }
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
+ } else {
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+ // Update MSSA too, if available.
+ if (MSSAU)
+ MSSAU->removeEdge(OrigPreheader, Exit);
+ }
+
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
BasicBlock *PredBB = OrigHeader->getUniquePredecessor();
bool DidMerge = MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
if (DidMerge)
RemoveRedundantDbgInstrs(PredBB);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
-
- ++NumRotated;
-
- Rotated = true;
- SimplifiedLatch = false;
-
- // Check whether the new latch is a deoptimizing exit and, if so, repeat the rotation.
- // A deoptimizing latch exit is not the typical case, so we simply loop again.
- // TODO: if it becomes a performance bottleneck extend rotation algorithm
- // to handle multiple rotations in one go.
- } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
-
-
- return true;
-}
-
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
- BasicBlock::iterator End, Loop *L) {
- bool seenIncrement = false;
- bool MultiExitLoop = false;
-
- if (!L->getExitingBlock())
- MultiExitLoop = true;
-
- for (BasicBlock::iterator I = Begin; I != End; ++I) {
-
- if (!isSafeToSpeculativelyExecute(&*I))
- return false;
-
- if (isa<DbgInfoIntrinsic>(I))
- continue;
-
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::GetElementPtr:
- // GEPs are cheap if all indices are constant.
- if (!cast<GEPOperator>(I)->hasAllConstantIndices())
- return false;
- // fall-thru to increment case
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr: {
- Value *IVOpnd =
- !isa<Constant>(I->getOperand(0))
- ? I->getOperand(0)
- : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
- if (!IVOpnd)
- return false;
-
- // If increment operand is used outside of the loop, this speculation
- // could cause extra live range interference.
- if (MultiExitLoop) {
- for (User *UseI : IVOpnd->users()) {
- auto *UserInst = cast<Instruction>(UseI);
- if (!L->contains(UserInst))
- return false;
- }
- }
-
- if (seenIncrement)
- return false;
- seenIncrement = true;
- break;
- }
- case Instruction::Trunc:
- case Instruction::ZExt:
- case Instruction::SExt:
- // ignore type conversions
- break;
- }
- }
- return true;
-}
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch || Latch->hasAddressTaken())
- return false;
-
- BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!Jmp || !Jmp->isUnconditional())
- return false;
-
- BasicBlock *LastExit = Latch->getSinglePredecessor();
- if (!LastExit || !L->isLoopExiting(LastExit))
- return false;
-
- BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
- if (!BI)
- return false;
-
- if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
- return false;
-
- LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
- << LastExit->getName() << "\n");
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr,
- /*PredecessorWithTwoSuccessors=*/true);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- return true;
-}
-
-/// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
- // Save the loop metadata.
- MDNode *LoopMD = L->getLoopID();
-
- bool SimplifiedLatch = false;
-
- // Simplify the loop latch before attempting to rotate the header
- // upward. Rotation may not be needed if the loop tail can be folded into the
- // loop exit.
- if (!RotationOnly)
- SimplifiedLatch = simplifyLoopLatch(L);
-
- bool MadeChange = rotateLoop(L, SimplifiedLatch);
- assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
- "Loop latch should be exiting after loop-rotate.");
-
- // Restore the loop metadata.
- // NB! We presume LoopRotation DOESN'T ADD its own metadata.
- if ((MadeChange || SimplifiedLatch) && LoopMD)
- L->setLoopID(LoopMD);
-
- return MadeChange || SimplifiedLatch;
-}
-
-
-/// The utility to convert a loop into a loop with bottom test.
-bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
- AssumptionCache *AC, DominatorTree *DT,
- ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
- const SimplifyQuery &SQ, bool RotationOnly = true,
- unsigned Threshold = unsigned(-1),
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+
+ ++NumRotated;
+
+ Rotated = true;
+ SimplifiedLatch = false;
+
+ // Check whether the new latch is a deoptimizing exit and, if so, repeat the rotation.
+ // A deoptimizing latch exit is not the typical case, so we simply loop again.
+ // TODO: if it becomes a performance bottleneck extend rotation algorithm
+ // to handle multiple rotations in one go.
+ } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
+
+
+ return true;
+}
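+
+// Illustrative sketch (schematic, not taken from a test case). Before
+// rotation the loop is top-tested:
+//
+//   preheader:        br label %header
+//   header:           ...  br i1 %cond, label %body, label %exit
+//   body ... latch:   br label %header
+//
+// After rotation the header's code and exiting branch are duplicated into the
+// preheader (acting as a guard), %body becomes the new header, and the old
+// header ends up as the bottom-tested, exiting latch.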
+
+/// Determine whether the instructions in this range may be safely and cheaply
+/// speculated. This is not an important enough situation to develop complex
+/// heuristics. We handle a single arithmetic instruction along with any type
+/// conversions.
+static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
+ BasicBlock::iterator End, Loop *L) {
+ bool seenIncrement = false;
+ bool MultiExitLoop = false;
+
+ if (!L->getExitingBlock())
+ MultiExitLoop = true;
+
+ for (BasicBlock::iterator I = Begin; I != End; ++I) {
+
+ if (!isSafeToSpeculativelyExecute(&*I))
+ return false;
+
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::GetElementPtr:
+ // GEPs are cheap if all indices are constant.
+ if (!cast<GEPOperator>(I)->hasAllConstantIndices())
+ return false;
+ // fall-thru to increment case
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr: {
+ Value *IVOpnd =
+ !isa<Constant>(I->getOperand(0))
+ ? I->getOperand(0)
+ : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
+ if (!IVOpnd)
+ return false;
+
+ // If increment operand is used outside of the loop, this speculation
+ // could cause extra live range interference.
+ if (MultiExitLoop) {
+ for (User *UseI : IVOpnd->users()) {
+ auto *UserInst = cast<Instruction>(UseI);
+ if (!L->contains(UserInst))
+ return false;
+ }
+ }
+
+ if (seenIncrement)
+ return false;
+ seenIncrement = true;
+ break;
+ }
+ case Instruction::Trunc:
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ // ignore type conversions
+ break;
+ }
+ }
+ return true;
+}
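+
+// Example (schematic): a latch consisting of a single post-increment, e.g.
+//
+//   latch:
+//     %iv.next = add nuw nsw i64 %iv, 1
+//     br label %header
+//
+// passes this check. Truncs/extends are ignored and a constant-index GEP may
+// stand in for the arithmetic op, but a load, a call, or a second
+// increment-like instruction causes the range to be rejected.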
+
+/// Fold the loop tail into the loop exit by speculating the loop tail
+/// instructions. Typically, this is a single post-increment. In the case of a
+/// simple 2-block loop, hoisting the increment can be much better than
+/// duplicating the entire loop header. In the case of loops with early exits,
+/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
+/// canonical form so downstream passes can handle it.
+///
+/// I don't believe this invalidates SCEV.
+bool LoopRotate::simplifyLoopLatch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch || Latch->hasAddressTaken())
+ return false;
+
+ BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!Jmp || !Jmp->isUnconditional())
+ return false;
+
+ BasicBlock *LastExit = Latch->getSinglePredecessor();
+ if (!LastExit || !L->isLoopExiting(LastExit))
+ return false;
+
+ BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
+ if (!BI)
+ return false;
+
+ if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
+ << LastExit->getName() << "\n");
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(Latch, &DTU, LI, MSSAU, nullptr,
+ /*PredecessorWithTwoSuccessors=*/true);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ return true;
+}
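+
+// Illustrative sketch (schematic): for a two-block loop
+//
+//   header:  ...  br i1 %cond, label %latch, label %exit
+//   latch:   %iv.next = add i64 %iv, 1
+//            br label %header
+//
+// the latch is merged into the exiting block, so the increment is speculated
+// on the exit path and the loop collapses to a single block that both exits
+// and latches.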
+
+/// Rotate \c L, and return true if any modification was made.
+bool LoopRotate::processLoop(Loop *L) {
+ // Save the loop metadata.
+ MDNode *LoopMD = L->getLoopID();
+
+ bool SimplifiedLatch = false;
+
+ // Simplify the loop latch before attempting to rotate the header
+ // upward. Rotation may not be needed if the loop tail can be folded into the
+ // loop exit.
+ if (!RotationOnly)
+ SimplifiedLatch = simplifyLoopLatch(L);
+
+ bool MadeChange = rotateLoop(L, SimplifiedLatch);
+ assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+ "Loop latch should be exiting after loop-rotate.");
+
+ // Restore the loop metadata.
+ // NB! We presume LoopRotation DOESN'T ADD its own metadata.
+ if ((MadeChange || SimplifiedLatch) && LoopMD)
+ L->setLoopID(LoopMD);
+
+ return MadeChange || SimplifiedLatch;
+}
+
+
+/// The utility to convert a loop into a loop with bottom test.
+bool llvm::LoopRotation(Loop *L, LoopInfo *LI, const TargetTransformInfo *TTI,
+ AssumptionCache *AC, DominatorTree *DT,
+ ScalarEvolution *SE, MemorySSAUpdater *MSSAU,
+ const SimplifyQuery &SQ, bool RotationOnly = true,
+ unsigned Threshold = unsigned(-1),
bool IsUtilMode = true, bool PrepareForLTO) {
- LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
+ LoopRotate LR(Threshold, LI, TTI, AC, DT, SE, MSSAU, SQ, RotationOnly,
IsUtilMode, PrepareForLTO);
- return LR.processLoop(L);
-}
+ return LR.processLoop(L);
+}
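+
+// Usage sketch (schematic; the analysis objects are assumed to be provided by
+// the caller, and SE/MSSAU may be null when unavailable):
+//
+//   bool Changed = llvm::LoopRotation(L, &LI, &TTI, &AC, &DT, &SE, &MSSAU, SQ,
+//                                     /*RotationOnly=*/false, /*Threshold=*/16,
+//                                     /*IsUtilMode=*/false,
+//                                     /*PrepareForLTO=*/false);
+//
+// With the default IsUtilMode=true and Threshold=unsigned(-1), rotation is
+// attempted even when the latch already exits and without a practical
+// header-size limit.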
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
index bb724747a2..2e104334ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopSimplify.cpp
@@ -1,946 +1,946 @@
-//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass performs several transformations to transform natural loops into a
-// simpler form, which makes subsequent analyses and transformations simpler and
-// more effective.
-//
-// Loop pre-header insertion guarantees that there is a single, non-critical
-// entry edge from outside of the loop to the loop header. This simplifies a
-// number of analyses and transformations, such as LICM.
-//
-// Loop exit-block insertion guarantees that all exit blocks from the loop
-// (blocks which are outside of the loop that have predecessors inside of the
-// loop) only have predecessors from inside of the loop (and are thus dominated
-// by the loop header). This simplifies transformations such as store-sinking
-// that are built into LICM.
-//
-// This pass also guarantees that loops will have exactly one backedge.
-//
-// Indirectbr instructions introduce several complications. If the loop
-// contains or is entered by an indirectbr instruction, it may not be possible
-// to transform the loop and make these guarantees. Client code should check
-// that these conditions are true before relying on them.
-//
-// Similar complications arise from callbr instructions, particularly in
-// asm-goto where blockaddress expressions are used.
-//
-// Note that the simplifycfg pass will clean up blocks which are split out but
-// end up being unnecessary, so usage of this pass should not pessimize
-// generated code.
-//
-// This pass obviously modifies the CFG, but updates loop information and
-// dominator information.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-simplify"
-
-STATISTIC(NumNested , "Number of nested loops split out");
-
-// If the new block is not already there, move it to right after one of the
-// 'outside' blocks. This prevents the preheader from being placed inside the
-// loop body, e.g. when the loop hasn't been rotated.
-static void placeSplitBlockCarefully(BasicBlock *NewBB,
- SmallVectorImpl<BasicBlock *> &SplitPreds,
- Loop *L) {
- // Check to see if NewBB is already well placed.
- Function::iterator BBI = --NewBB->getIterator();
- for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
- if (&*BBI == SplitPreds[i])
- return;
- }
-
- // If it isn't already after an outside block, move it after one. This is
- // always good as it makes the uncond branch from the outside block into a
- // fall-through.
-
- // Figure out *which* outside block to put this after. Prefer an outside
- // block that neighbors a BB actually in the loop.
- BasicBlock *FoundBB = nullptr;
- for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
- Function::iterator BBI = SplitPreds[i]->getIterator();
- if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {
- FoundBB = SplitPreds[i];
- break;
- }
- }
-
- // If our heuristic for a *good* bb to place this after doesn't find
- // anything, just pick something. It's likely better than leaving it within
- // the loop.
- if (!FoundBB)
- FoundBB = SplitPreds[0];
- NewBB->moveAfter(FoundBB);
-}
-
-/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
-/// preheader, this method is called to insert one. This method has two phases:
-/// preheader insertion and analysis updating.
-///
-BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
- LoopInfo *LI, MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA) {
- BasicBlock *Header = L->getHeader();
-
- // Compute the set of predecessors of the loop that are not in the loop.
- SmallVector<BasicBlock*, 8> OutsideBlocks;
- for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
- PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (!L->contains(P)) { // Coming in from outside the loop?
- // If the loop is branched to from an indirect terminator, we won't
- // be able to fully transform the loop, because it prohibits
- // edge splitting.
- if (P->getTerminator()->isIndirectTerminator())
- return nullptr;
-
- // Keep track of it.
- OutsideBlocks.push_back(P);
- }
- }
-
- // Split out the loop pre-header.
- BasicBlock *PreheaderBB;
- PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT,
- LI, MSSAU, PreserveLCSSA);
- if (!PreheaderBB)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
- << PreheaderBB->getName() << "\n");
-
- // Make sure that NewBB is put someplace intelligent, which doesn't mess up
- // code layout too horribly.
- placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
-
- return PreheaderBB;
-}
-
-/// Add the specified block, and all of its predecessors, to the specified set,
-/// if it's not already in there. Stop predecessor traversal when we reach
-/// StopBlock.
-static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
+//===- LoopSimplify.cpp - Loop Canonicalization Pass ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs several transformations to transform natural loops into a
+// simpler form, which makes subsequent analyses and transformations simpler and
+// more effective.
+//
+// Loop pre-header insertion guarantees that there is a single, non-critical
+// entry edge from outside of the loop to the loop header. This simplifies a
+// number of analyses and transformations, such as LICM.
+//
+// Loop exit-block insertion guarantees that all exit blocks from the loop
+// (blocks which are outside of the loop that have predecessors inside of the
+// loop) only have predecessors from inside of the loop (and are thus dominated
+// by the loop header). This simplifies transformations such as store-sinking
+// that are built into LICM.
+//
+// This pass also guarantees that loops will have exactly one backedge.
+//
+// Indirectbr instructions introduce several complications. If the loop
+// contains or is entered by an indirectbr instruction, it may not be possible
+// to transform the loop and make these guarantees. Client code should check
+// that these conditions are true before relying on them.
+//
+// Similar complications arise from callbr instructions, particularly in
+// asm-goto where blockaddress expressions are used.
+//
+// Note that the simplifycfg pass will clean up blocks which are split out but
+// end up being unnecessary, so usage of this pass should not pessimize
+// generated code.
+//
+// This pass obviously modifies the CFG, but updates loop information and
+// dominator information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-simplify"
+
+STATISTIC(NumNested , "Number of nested loops split out");
+
+// If the new block is not already there, move it to right after one of the
+// 'outside' blocks. This prevents the preheader from being placed inside the
+// loop body, e.g. when the loop hasn't been rotated.
+static void placeSplitBlockCarefully(BasicBlock *NewBB,
+ SmallVectorImpl<BasicBlock *> &SplitPreds,
+ Loop *L) {
+ // Check to see if NewBB is already well placed.
+ Function::iterator BBI = --NewBB->getIterator();
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ if (&*BBI == SplitPreds[i])
+ return;
+ }
+
+ // If it isn't already after an outside block, move it after one. This is
+ // always good as it makes the uncond branch from the outside block into a
+ // fall-through.
+
+ // Figure out *which* outside block to put this after. Prefer an outside
+ // block that neighbors a BB actually in the loop.
+ BasicBlock *FoundBB = nullptr;
+ for (unsigned i = 0, e = SplitPreds.size(); i != e; ++i) {
+ Function::iterator BBI = SplitPreds[i]->getIterator();
+ if (++BBI != NewBB->getParent()->end() && L->contains(&*BBI)) {
+ FoundBB = SplitPreds[i];
+ break;
+ }
+ }
+
+ // If our heuristic for a *good* bb to place this after doesn't find
+ // anything, just pick something. It's likely better than leaving it within
+ // the loop.
+ if (!FoundBB)
+ FoundBB = SplitPreds[0];
+ NewBB->moveAfter(FoundBB);
+}
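+
+// Example (schematic): given the layout
+//
+//   outside.pred, loop.header, loop.body, newbb
+//
+// where outside.pred now branches to newbb, the block just before newbb is a
+// loop block, so newbb is moved to follow outside.pred:
+//
+//   outside.pred, newbb, loop.header, loop.body
+//
+// turning outside.pred's unconditional branch into a fall-through.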
+
+/// InsertPreheaderForLoop - Once we discover that a loop doesn't have a
+/// preheader, this method is called to insert one. This method has two phases:
+/// preheader insertion and analysis updating.
+///
+BasicBlock *llvm::InsertPreheaderForLoop(Loop *L, DominatorTree *DT,
+ LoopInfo *LI, MemorySSAUpdater *MSSAU,
+ bool PreserveLCSSA) {
+ BasicBlock *Header = L->getHeader();
+
+ // Compute the set of predecessors of the loop that are not in the loop.
+ SmallVector<BasicBlock*, 8> OutsideBlocks;
+ for (pred_iterator PI = pred_begin(Header), PE = pred_end(Header);
+ PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P)) { // Coming in from outside the loop?
+ // If the loop is branched to from an indirect terminator, we won't
+ // be able to fully transform the loop, because it prohibits
+ // edge splitting.
+ if (P->getTerminator()->isIndirectTerminator())
+ return nullptr;
+
+ // Keep track of it.
+ OutsideBlocks.push_back(P);
+ }
+ }
+
+ // Split out the loop pre-header.
+ BasicBlock *PreheaderBB;
+ PreheaderBB = SplitBlockPredecessors(Header, OutsideBlocks, ".preheader", DT,
+ LI, MSSAU, PreserveLCSSA);
+ if (!PreheaderBB)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating pre-header "
+ << PreheaderBB->getName() << "\n");
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(PreheaderBB, OutsideBlocks, L);
+
+ return PreheaderBB;
+}
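+
+// Usage sketch (schematic; DT/LI assumed available, MSSAU may be null):
+//
+//   if (!L->getLoopPreheader())
+//     if (BasicBlock *PH = InsertPreheaderForLoop(L, DT, LI, /*MSSAU=*/nullptr,
+//                                                 /*PreserveLCSSA=*/true))
+//       LLVM_DEBUG(dbgs() << "created preheader " << PH->getName() << "\n");
+//
+// A null result means the preheader could not be created, e.g. because a
+// predecessor ends in an indirect terminator that prohibits edge splitting.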
+
+/// Add the specified block, and all of its predecessors, to the specified set,
+/// if it's not already in there. Stop predecessor traversal when we reach
+/// StopBlock.
+static void addBlockAndPredsToSet(BasicBlock *InputBB, BasicBlock *StopBlock,
SmallPtrSetImpl<BasicBlock *> &Blocks) {
- SmallVector<BasicBlock *, 8> Worklist;
- Worklist.push_back(InputBB);
- do {
- BasicBlock *BB = Worklist.pop_back_val();
- if (Blocks.insert(BB).second && BB != StopBlock)
- // If BB is not already processed and it is not a stop block then
- // insert its predecessor in the work list
+ SmallVector<BasicBlock *, 8> Worklist;
+ Worklist.push_back(InputBB);
+ do {
+ BasicBlock *BB = Worklist.pop_back_val();
+ if (Blocks.insert(BB).second && BB != StopBlock)
+ // If BB is not already processed and it is not a stop block then
+ // insert its predecessor in the work list
append_range(Worklist, predecessors(BB));
- } while (!Worklist.empty());
-}
-
-/// The first part of loop-nestification is to find a PHI node that tells
-/// us how to partition the loops.
-static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
- AssumptionCache *AC) {
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
- PHINode *PN = cast<PHINode>(I);
- ++I;
- if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
- // This is a degenerate PHI already, don't modify it!
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- continue;
- }
-
- // Scan this PHI node looking for a use of the PHI node by itself.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == PN &&
- L->contains(PN->getIncomingBlock(i)))
- // We found something tasty to remove.
- return PN;
- }
- return nullptr;
-}
-
-/// If this loop has multiple backedges, try to pull one of them out into
-/// a nested loop.
-///
-/// This is important for code that looks like
-/// this:
-///
-/// Loop:
-/// ...
-/// br cond, Loop, Next
-/// ...
-/// br cond2, Loop, Out
-///
-/// To identify this common case, we look at the PHI nodes in the header of the
-/// loop. PHI nodes with unchanging values on one backedge correspond to values
-/// that change in the "outer" loop, but not in the "inner" loop.
-///
-/// If we are able to separate out a loop, return the new outer loop that was
-/// created.
-///
-static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
- DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, bool PreserveLCSSA,
- AssumptionCache *AC, MemorySSAUpdater *MSSAU) {
- // Don't try to separate loops without a preheader.
- if (!Preheader)
- return nullptr;
-
- // Treat the presence of convergent functions conservatively. The
- // transformation is invalid if calls to certain convergent
- // functions (like an AMDGPU barrier) get included in the resulting
- // inner loop. But blocks meant for the inner loop will be
- // identified later at a point where it's too late to abort the
- // transformation. Also, the convergent attribute is not really
- // sufficient to express the semantics of functions that are
- // affected by this transformation. So we choose to back off if such
- // a function call is present until a better alternative becomes
- // available. This is similar to the conservative treatment of
- // convergent function calls in GVNHoist and JumpThreading.
- for (auto BB : L->blocks()) {
- for (auto &II : *BB) {
- if (auto CI = dyn_cast<CallBase>(&II)) {
- if (CI->isConvergent()) {
- return nullptr;
- }
- }
- }
- }
-
- // The header is not a landing pad; preheader insertion should ensure this.
- BasicBlock *Header = L->getHeader();
- assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
-
- PHINode *PN = findPHIToPartitionLoops(L, DT, AC);
- if (!PN) return nullptr; // No known way to partition.
-
- // Pull out all predecessors that have varying values in the loop. This
- // handles the case when a PHI node has multiple instances of itself as
- // arguments.
- SmallVector<BasicBlock*, 8> OuterLoopPreds;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- if (PN->getIncomingValue(i) != PN ||
- !L->contains(PN->getIncomingBlock(i))) {
- // We can't split indirect control flow edges.
- if (PN->getIncomingBlock(i)->getTerminator()->isIndirectTerminator())
- return nullptr;
- OuterLoopPreds.push_back(PN->getIncomingBlock(i));
- }
- }
- LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
-
- // If ScalarEvolution is around and knows anything about values in
- // this loop, tell it to forget them, because we're about to
- // substantially change it.
- if (SE)
- SE->forgetLoop(L);
-
- BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
- DT, LI, MSSAU, PreserveLCSSA);
-
- // Make sure that NewBB is put someplace intelligent, which doesn't mess up
- // code layout too horribly.
- placeSplitBlockCarefully(NewBB, OuterLoopPreds, L);
-
- // Create the new outer loop.
- Loop *NewOuter = LI->AllocateLoop();
-
- // Change the parent loop to use the outer loop as its child now.
- if (Loop *Parent = L->getParentLoop())
- Parent->replaceChildLoopWith(L, NewOuter);
- else
- LI->changeTopLevelLoop(L, NewOuter);
-
- // L is now a subloop of our outer loop.
- NewOuter->addChildLoop(L);
-
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- NewOuter->addBlockEntry(*I);
-
- // Now reset the header in L, which had been moved by
- // SplitBlockPredecessors for the outer loop.
- L->moveToHeader(Header);
-
- // Determine which blocks should stay in L and which should be moved out to
- // the Outer loop now.
+ } while (!Worklist.empty());
+}
+
+/// The first part of loop-nestification is to find a PHI node that tells
+/// us how to partition the loops.
+static PHINode *findPHIToPartitionLoops(Loop *L, DominatorTree *DT,
+ AssumptionCache *AC) {
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ) {
+ PHINode *PN = cast<PHINode>(I);
+ ++I;
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ // This is a degenerate PHI already, don't modify it!
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ continue;
+ }
+
+ // Scan this PHI node looking for a use of the PHI node by itself.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == PN &&
+ L->contains(PN->getIncomingBlock(i)))
+ // We found something tasty to remove.
+ return PN;
+ }
+ return nullptr;
+}
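+
+// Example (schematic): in a header such as
+//
+//   header:
+//     %x = phi i32 [ 0, %preheader ], [ %x, %latch1 ], [ %x.next, %latch2 ]
+//
+// %x is returned: it is unchanged along the %latch1 backedge but varies along
+// %latch2, so %latch1 can stay as the inner loop's backedge while %latch2 is
+// redirected to the new outer loop.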
+
+/// If this loop has multiple backedges, try to pull one of them out into
+/// a nested loop.
+///
+/// This is important for code that looks like
+/// this:
+///
+/// Loop:
+/// ...
+/// br cond, Loop, Next
+/// ...
+/// br cond2, Loop, Out
+///
+/// To identify this common case, we look at the PHI nodes in the header of the
+/// loop. PHI nodes with unchanging values on one backedge correspond to values
+/// that change in the "outer" loop, but not in the "inner" loop.
+///
+/// If we are able to separate out a loop, return the new outer loop that was
+/// created.
+///
+static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, bool PreserveLCSSA,
+ AssumptionCache *AC, MemorySSAUpdater *MSSAU) {
+ // Don't try to separate loops without a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // Treat the presence of convergent functions conservatively. The
+ // transformation is invalid if calls to certain convergent
+ // functions (like an AMDGPU barrier) get included in the resulting
+ // inner loop. But blocks meant for the inner loop will be
+ // identified later at a point where it's too late to abort the
+ // transformation. Also, the convergent attribute is not really
+ // sufficient to express the semantics of functions that are
+ // affected by this transformation. So we choose to back off if such
+ // a function call is present until a better alternative becomes
+ // available. This is similar to the conservative treatment of
+ // convergent function calls in GVNHoist and JumpThreading.
+ for (auto BB : L->blocks()) {
+ for (auto &II : *BB) {
+ if (auto CI = dyn_cast<CallBase>(&II)) {
+ if (CI->isConvergent()) {
+ return nullptr;
+ }
+ }
+ }
+ }
+
+ // The header is not a landing pad; preheader insertion should ensure this.
+ BasicBlock *Header = L->getHeader();
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ PHINode *PN = findPHIToPartitionLoops(L, DT, AC);
+ if (!PN) return nullptr; // No known way to partition.
+
+ // Pull out all predecessors that have varying values in the loop. This
+ // handles the case when a PHI node has multiple instances of itself as
+ // arguments.
+ SmallVector<BasicBlock*, 8> OuterLoopPreds;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ if (PN->getIncomingValue(i) != PN ||
+ !L->contains(PN->getIncomingBlock(i))) {
+ // We can't split indirect control flow edges.
+ if (PN->getIncomingBlock(i)->getTerminator()->isIndirectTerminator())
+ return nullptr;
+ OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+ }
+ }
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
+
+ // If ScalarEvolution is around and knows anything about values in
+ // this loop, tell it to forget them, because we're about to
+ // substantially change it.
+ if (SE)
+ SE->forgetLoop(L);
+
+ BasicBlock *NewBB = SplitBlockPredecessors(Header, OuterLoopPreds, ".outer",
+ DT, LI, MSSAU, PreserveLCSSA);
+
+ // Make sure that NewBB is put someplace intelligent, which doesn't mess up
+ // code layout too horribly.
+ placeSplitBlockCarefully(NewBB, OuterLoopPreds, L);
+
+ // Create the new outer loop.
+ Loop *NewOuter = LI->AllocateLoop();
+
+ // Change the parent loop to use the outer loop as its child now.
+ if (Loop *Parent = L->getParentLoop())
+ Parent->replaceChildLoopWith(L, NewOuter);
+ else
+ LI->changeTopLevelLoop(L, NewOuter);
+
+ // L is now a subloop of our outer loop.
+ NewOuter->addChildLoop(L);
+
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ NewOuter->addBlockEntry(*I);
+
+ // Now reset the header in L, which had been moved by
+ // SplitBlockPredecessors for the outer loop.
+ L->moveToHeader(Header);
+
+ // Determine which blocks should stay in L and which should be moved out to
+ // the Outer loop now.
SmallPtrSet<BasicBlock *, 4> BlocksInL;
for (BasicBlock *P : predecessors(Header)) {
- if (DT->dominates(Header, P))
- addBlockAndPredsToSet(P, Header, BlocksInL);
- }
-
- // Scan all of the loop children of L, moving them to OuterLoop if they are
- // not part of the inner loop.
- const std::vector<Loop*> &SubLoops = L->getSubLoops();
- for (size_t I = 0; I != SubLoops.size(); )
- if (BlocksInL.count(SubLoops[I]->getHeader()))
- ++I; // Loop remains in L
- else
- NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
-
- SmallVector<BasicBlock *, 8> OuterLoopBlocks;
- OuterLoopBlocks.push_back(NewBB);
- // Now that we know which blocks are in L and which need to be moved to
- // OuterLoop, move any blocks that need it.
- for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
- BasicBlock *BB = L->getBlocks()[i];
- if (!BlocksInL.count(BB)) {
- // Move this block to the parent, updating the exit blocks sets
- L->removeBlockFromLoop(BB);
- if ((*LI)[BB] == L) {
- LI->changeLoopFor(BB, NewOuter);
- OuterLoopBlocks.push_back(BB);
- }
- --i;
- }
- }
-
- // Split edges to exit blocks from the inner loop, if they emerged in the
- // process of separating the outer one.
- formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA);
-
- if (PreserveLCSSA) {
- // Fix LCSSA form for L. Some values, which previously were only used inside
- // L, can now be used in NewOuter loop. We need to insert phi-nodes for them
- // in corresponding exit blocks.
- // We don't need to form LCSSA recursively, because there cannot be uses
- // inside a newly created loop of defs from inner loops as those would
- // already be a use of an LCSSA phi node.
- formLCSSA(*L, *DT, LI, SE);
-
- assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
- "LCSSA is broken after separating nested loops!");
- }
-
- return NewOuter;
-}
-
-/// This method is called when the specified loop has more than one
-/// backedge in it.
-///
-/// If this occurs, revector all of these backedges to target a new basic block
-/// and have that block branch to the loop header. This ensures that loops
-/// have exactly one backedge.
-static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
- DominatorTree *DT, LoopInfo *LI,
- MemorySSAUpdater *MSSAU) {
- assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
-
- // Get information about the loop
- BasicBlock *Header = L->getHeader();
- Function *F = Header->getParent();
-
- // Unique backedge insertion currently depends on having a preheader.
- if (!Preheader)
- return nullptr;
-
- // The header is not an EH pad; preheader insertion should ensure this.
- assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
-
- // Figure out which basic blocks contain back-edges to the loop header.
- std::vector<BasicBlock*> BackedgeBlocks;
- for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
- BasicBlock *P = *I;
-
- // Indirect edges cannot be split, so we must fail if we find one.
- if (P->getTerminator()->isIndirectTerminator())
- return nullptr;
-
- if (P != Preheader) BackedgeBlocks.push_back(P);
- }
-
- // Create and insert the new backedge block...
- BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
- Header->getName() + ".backedge", F);
- BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
- BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
- << BEBlock->getName() << "\n");
-
- // Move the new backedge block to right after the last backedge block.
- Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
- F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
-
- // Now that the block has been inserted into the function, create PHI nodes in
- // the backedge block which correspond to any PHI nodes in the header block.
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
- PN->getName()+".be", BETerminator);
-
- // Loop over the PHI node, moving all entries except the one for the
- // preheader over to the new PHI node.
- unsigned PreheaderIdx = ~0U;
- bool HasUniqueIncomingValue = true;
- Value *UniqueValue = nullptr;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *IBB = PN->getIncomingBlock(i);
- Value *IV = PN->getIncomingValue(i);
- if (IBB == Preheader) {
- PreheaderIdx = i;
- } else {
- NewPN->addIncoming(IV, IBB);
- if (HasUniqueIncomingValue) {
- if (!UniqueValue)
- UniqueValue = IV;
- else if (UniqueValue != IV)
- HasUniqueIncomingValue = false;
- }
- }
- }
-
- // Delete all of the incoming values from the old PN except the preheader's
- assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
- if (PreheaderIdx != 0) {
- PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
- PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
- }
- // Nuke all entries except the zero'th.
- for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
- PN->removeIncomingValue(e-i, false);
-
- // Finally, add the newly constructed PHI node as the entry for the BEBlock.
- PN->addIncoming(NewPN, BEBlock);
-
- // As an optimization, if all incoming values in the new PhiNode (which is a
- // subset of the incoming values of the old PHI node) have the same value,
- // eliminate the PHI Node.
- if (HasUniqueIncomingValue) {
- NewPN->replaceAllUsesWith(UniqueValue);
- BEBlock->getInstList().erase(NewPN);
- }
- }
-
- // Now that all of the PHI nodes have been inserted and adjusted, modify the
- // backedge blocks to jump to the BEBlock instead of the header.
- // If one of the backedges has llvm.loop metadata attached, we remove
- // it from the backedge and add it to BEBlock.
- unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
- MDNode *LoopMD = nullptr;
- for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
- Instruction *TI = BackedgeBlocks[i]->getTerminator();
- if (!LoopMD)
- LoopMD = TI->getMetadata(LoopMDKind);
- TI->setMetadata(LoopMDKind, nullptr);
- TI->replaceSuccessorWith(Header, BEBlock);
- }
- BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
-
- //===--- Update all analyses which we must preserve now -----------------===//
-
- // Update Loop Information - we know that this block is now in the current
- // loop and all parent loops.
- L->addBasicBlockToLoop(BEBlock, *LI);
-
- // Update dominator information
- DT->splitBlock(BEBlock);
-
- if (MSSAU)
- MSSAU->updatePhisWhenInsertingUniqueBackedgeBlock(Header, Preheader,
- BEBlock);
-
- return BEBlock;
-}
-
-/// Simplify one loop and queue further loops for simplification.
-static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
- DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, AssumptionCache *AC,
- MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- bool Changed = false;
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
-ReprocessLoop:
-
- // Check to see that no blocks (other than the header) in this loop have
- // predecessors that are not in the loop. This is not valid for natural
- // loops, but can occur if the blocks are unreachable. Since they are
- // unreachable we can just shamelessly delete those CFG edges!
- for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
- BB != E; ++BB) {
- if (*BB == L->getHeader()) continue;
-
- SmallPtrSet<BasicBlock*, 4> BadPreds;
- for (pred_iterator PI = pred_begin(*BB),
- PE = pred_end(*BB); PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (!L->contains(P))
- BadPreds.insert(P);
- }
-
- // Delete each unique out-of-loop (and thus dead) predecessor.
- for (BasicBlock *P : BadPreds) {
-
- LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
- << P->getName() << "\n");
-
- // Zap the dead pred's terminator and replace it with unreachable.
- Instruction *TI = P->getTerminator();
- changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA,
- /*DTU=*/nullptr, MSSAU);
- Changed = true;
- }
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // If there are exiting blocks with branches on undef, resolve the undef in
- // the direction which will exit the loop. This will help simplify loop
- // trip count computations.
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (BasicBlock *ExitingBlock : ExitingBlocks)
- if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
- if (BI->isConditional()) {
- if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
-
- LLVM_DEBUG(dbgs()
- << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
- << ExitingBlock->getName() << "\n");
-
- BI->setCondition(ConstantInt::get(Cond->getType(),
- !L->contains(BI->getSuccessor(0))));
-
- Changed = true;
- }
- }
-
- // Does the loop already have a preheader? If so, don't insert one.
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) {
- Preheader = InsertPreheaderForLoop(L, DT, LI, MSSAU, PreserveLCSSA);
- if (Preheader)
- Changed = true;
- }
-
- // Next, check to make sure that all exit nodes of the loop only have
- // predecessors that are inside of the loop. This check guarantees that the
- // loop preheader/header will dominate the exit blocks. If the exit block has
- // predecessors from outside of the loop, split the edge now.
- if (formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA))
- Changed = true;
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- // If the header has more than two predecessors at this point (from the
- // preheader and from multiple backedges), we must adjust the loop.
- BasicBlock *LoopLatch = L->getLoopLatch();
- if (!LoopLatch) {
- // If this is really a nested loop, rip it out into a child loop. Don't do
- // this for loops with a giant number of backedges, just factor them into a
- // common backedge instead.
- if (L->getNumBackEdges() < 8) {
- if (Loop *OuterL = separateNestedLoop(L, Preheader, DT, LI, SE,
- PreserveLCSSA, AC, MSSAU)) {
- ++NumNested;
- // Enqueue the outer loop as it should be processed next in our
- // depth-first nest walk.
- Worklist.push_back(OuterL);
-
- // This is a big restructuring change, reprocess the whole loop.
- Changed = true;
- // GCC doesn't tail recursion eliminate this.
- // FIXME: It isn't clear we can't rely on LLVM to TRE this.
- goto ReprocessLoop;
- }
- }
-
- // If we either couldn't, or didn't want to, identify nesting of the loops,
- // insert a new block that all backedges target, then make it jump to the
- // loop header.
- LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI, MSSAU);
- if (LoopLatch)
- Changed = true;
- }
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- // Scan over the PHI nodes in the loop header. Since they now have only two
- // incoming values (the loop is canonicalized), we may have simplified the PHI
- // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
- PHINode *PN;
- for (BasicBlock::iterator I = L->getHeader()->begin();
- (PN = dyn_cast<PHINode>(I++)); )
- if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
- if (SE) SE->forgetValue(PN);
- if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- Changed = true;
- }
- }
-
- // If this loop has multiple exits and the exits all go to the same
- // block, attempt to merge the exits. This helps several passes, such
- // as LoopRotation, which do not support loops with multiple exits.
- // SimplifyCFG also does this (and this code uses the same utility
- // function), however this code is loop-aware, where SimplifyCFG is
- // not. That gives it the advantage of being able to hoist
- // loop-invariant instructions out of the way to open up more
- // opportunities, and the disadvantage of having the responsibility
- // to preserve dominator information.
- auto HasUniqueExitBlock = [&]() {
- BasicBlock *UniqueExit = nullptr;
- for (auto *ExitingBB : ExitingBlocks)
- for (auto *SuccBB : successors(ExitingBB)) {
- if (L->contains(SuccBB))
- continue;
-
- if (!UniqueExit)
- UniqueExit = SuccBB;
- else if (UniqueExit != SuccBB)
- return false;
- }
-
- return true;
- };
- if (HasUniqueExitBlock()) {
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- BasicBlock *ExitingBlock = ExitingBlocks[i];
- if (!ExitingBlock->getSinglePredecessor()) continue;
- BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
- if (!BI || !BI->isConditional()) continue;
- CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
- if (!CI || CI->getParent() != ExitingBlock) continue;
-
- // Attempt to hoist out all instructions except for the
- // comparison and the branch.
- bool AllInvariant = true;
- bool AnyInvariant = false;
- for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
- Instruction *Inst = &*I++;
- if (Inst == CI)
- continue;
- if (!L->makeLoopInvariant(
- Inst, AnyInvariant,
- Preheader ? Preheader->getTerminator() : nullptr, MSSAU)) {
- AllInvariant = false;
- break;
- }
- }
- if (AnyInvariant) {
- Changed = true;
- // The loop disposition of all SCEV expressions that depend on any
- // hoisted values have also changed.
- if (SE)
- SE->forgetLoopDispositions(L);
- }
- if (!AllInvariant) continue;
-
- // The block has now been cleared of all instructions except for
- // a comparison and a conditional branch. SimplifyCFG may be able
- // to fold it now.
+ if (DT->dominates(Header, P))
+ addBlockAndPredsToSet(P, Header, BlocksInL);
+ }
+
+ // Scan all of the loop children of L, moving them to OuterLoop if they are
+ // not part of the inner loop.
+ const std::vector<Loop*> &SubLoops = L->getSubLoops();
+ for (size_t I = 0; I != SubLoops.size(); )
+ if (BlocksInL.count(SubLoops[I]->getHeader()))
+ ++I; // Loop remains in L
+ else
+ NewOuter->addChildLoop(L->removeChildLoop(SubLoops.begin() + I));
+
+ SmallVector<BasicBlock *, 8> OuterLoopBlocks;
+ OuterLoopBlocks.push_back(NewBB);
+ // Now that we know which blocks are in L and which need to be moved to
+ // OuterLoop, move any blocks that need it.
+ for (unsigned i = 0; i != L->getBlocks().size(); ++i) {
+ BasicBlock *BB = L->getBlocks()[i];
+ if (!BlocksInL.count(BB)) {
+ // Move this block to the parent, updating the exit blocks sets
+ L->removeBlockFromLoop(BB);
+ if ((*LI)[BB] == L) {
+ LI->changeLoopFor(BB, NewOuter);
+ OuterLoopBlocks.push_back(BB);
+ }
+ --i;
+ }
+ }
+
+ // Split edges to exit blocks from the inner loop, if they emerged in the
+ // process of separating the outer one.
+ formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA);
+
+ if (PreserveLCSSA) {
+ // Fix LCSSA form for L. Some values, which previously were only used inside
+ // L, can now be used in NewOuter loop. We need to insert phi-nodes for them
+ // in corresponding exit blocks.
+ // We don't need to form LCSSA recursively, because there cannot be uses
+ // inside a newly created loop of defs from inner loops as those would
+ // already be a use of an LCSSA phi node.
+ formLCSSA(*L, *DT, LI, SE);
+
+ assert(NewOuter->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "LCSSA is broken after separating nested loops!");
+ }
+
+ return NewOuter;
+}
+
+/// This method is called when the specified loop has more than one
+/// backedge in it.
+///
+/// If this occurs, revector all of these backedges to target a new basic block
+/// and have that block branch to the loop header. This ensures that loops
+/// have exactly one backedge.
+static BasicBlock *insertUniqueBackedgeBlock(Loop *L, BasicBlock *Preheader,
+ DominatorTree *DT, LoopInfo *LI,
+ MemorySSAUpdater *MSSAU) {
+ assert(L->getNumBackEdges() > 1 && "Must have > 1 backedge!");
+
+ // Get information about the loop
+ BasicBlock *Header = L->getHeader();
+ Function *F = Header->getParent();
+
+ // Unique backedge insertion currently depends on having a preheader.
+ if (!Preheader)
+ return nullptr;
+
+ // The header is not an EH pad; preheader insertion should ensure this.
+ assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
+
+ // Figure out which basic blocks contain back-edges to the loop header.
+ std::vector<BasicBlock*> BackedgeBlocks;
+ for (pred_iterator I = pred_begin(Header), E = pred_end(Header); I != E; ++I){
+ BasicBlock *P = *I;
+
+ // Indirect edges cannot be split, so we must fail if we find one.
+ if (P->getTerminator()->isIndirectTerminator())
+ return nullptr;
+
+ if (P != Preheader) BackedgeBlocks.push_back(P);
+ }
+
+ // Create and insert the new backedge block...
+ BasicBlock *BEBlock = BasicBlock::Create(Header->getContext(),
+ Header->getName() + ".backedge", F);
+ BranchInst *BETerminator = BranchInst::Create(Header, BEBlock);
+ BETerminator->setDebugLoc(Header->getFirstNonPHI()->getDebugLoc());
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Inserting unique backedge block "
+ << BEBlock->getName() << "\n");
+
+ // Move the new backedge block to right after the last backedge block.
+ Function::iterator InsertPos = ++BackedgeBlocks.back()->getIterator();
+ F->getBasicBlockList().splice(InsertPos, F->getBasicBlockList(), BEBlock);
+
+ // Now that the block has been inserted into the function, create PHI nodes in
+ // the backedge block which correspond to any PHI nodes in the header block.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *PN = cast<PHINode>(I);
+ PHINode *NewPN = PHINode::Create(PN->getType(), BackedgeBlocks.size(),
+ PN->getName()+".be", BETerminator);
+
+ // Loop over the PHI node, moving all entries except the one for the
+ // preheader over to the new PHI node.
+ unsigned PreheaderIdx = ~0U;
+ bool HasUniqueIncomingValue = true;
+ Value *UniqueValue = nullptr;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *IBB = PN->getIncomingBlock(i);
+ Value *IV = PN->getIncomingValue(i);
+ if (IBB == Preheader) {
+ PreheaderIdx = i;
+ } else {
+ NewPN->addIncoming(IV, IBB);
+ if (HasUniqueIncomingValue) {
+ if (!UniqueValue)
+ UniqueValue = IV;
+ else if (UniqueValue != IV)
+ HasUniqueIncomingValue = false;
+ }
+ }
+ }
+
+ // Delete all of the incoming values from the old PN except the preheader's
+ assert(PreheaderIdx != ~0U && "PHI has no preheader entry??");
+ if (PreheaderIdx != 0) {
+ PN->setIncomingValue(0, PN->getIncomingValue(PreheaderIdx));
+ PN->setIncomingBlock(0, PN->getIncomingBlock(PreheaderIdx));
+ }
+ // Nuke all entries except the zero'th.
+ for (unsigned i = 0, e = PN->getNumIncomingValues()-1; i != e; ++i)
+ PN->removeIncomingValue(e-i, false);
+
+ // Finally, add the newly constructed PHI node as the entry for the BEBlock.
+ PN->addIncoming(NewPN, BEBlock);
+
+ // As an optimization, if all incoming values in the new PhiNode (which is a
+ // subset of the incoming values of the old PHI node) have the same value,
+ // eliminate the PHI Node.
+ if (HasUniqueIncomingValue) {
+ NewPN->replaceAllUsesWith(UniqueValue);
+ BEBlock->getInstList().erase(NewPN);
+ }
+ }
+
+ // Now that all of the PHI nodes have been inserted and adjusted, modify the
+ // backedge blocks to jump to the BEBlock instead of the header.
+ // If one of the backedges has llvm.loop metadata attached, we remove
+ // it from the backedge and add it to BEBlock.
+ unsigned LoopMDKind = BEBlock->getContext().getMDKindID("llvm.loop");
+ MDNode *LoopMD = nullptr;
+ for (unsigned i = 0, e = BackedgeBlocks.size(); i != e; ++i) {
+ Instruction *TI = BackedgeBlocks[i]->getTerminator();
+ if (!LoopMD)
+ LoopMD = TI->getMetadata(LoopMDKind);
+ TI->setMetadata(LoopMDKind, nullptr);
+ TI->replaceSuccessorWith(Header, BEBlock);
+ }
+ BEBlock->getTerminator()->setMetadata(LoopMDKind, LoopMD);
+
+ //===--- Update all analyses which we must preserve now -----------------===//
+
+ // Update Loop Information - we know that this block is now in the current
+ // loop and all parent loops.
+ L->addBasicBlockToLoop(BEBlock, *LI);
+
+ // Update dominator information
+ DT->splitBlock(BEBlock);
+
+ if (MSSAU)
+ MSSAU->updatePhisWhenInsertingUniqueBackedgeBlock(Header, Preheader,
+ BEBlock);
+
+ return BEBlock;
+}
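For intuition, here is a source-level shape that can reach the two canonicalizations above, sketched in plain C++ (the function and label names are made up): the block at `head` ends up with two backedges, and LoopSimplify will either split a nested loop out of it via separateNestedLoop or route both edges through a single `head.backedge` block via insertUniqueBackedgeBlock.

// Illustrative only; how the header PHIs look after earlier passes decides
// which of the two canonicalizations actually fires.
int twoBackedges(int n) {
  int i = 0, s = 0;
head:
  s += i;
  if (s & 1) {
    if (++i < n)
      goto head;            // backedge #1 into 'head'
  }
  if (++i < 2 * n)
    goto head;              // backedge #2 into 'head'
  return s;
}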
+
+/// Simplify one loop and queue further loops for simplification.
+static bool simplifyOneLoop(Loop *L, SmallVectorImpl<Loop *> &Worklist,
+ DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
+ bool Changed = false;
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ReprocessLoop:
+
+ // Check to see that no blocks (other than the header) in this loop have
+ // predecessors that are not in the loop. This is not valid for natural
+ // loops, but can occur if the blocks are unreachable. Since they are
+ // unreachable we can just shamelessly delete those CFG edges!
+ for (Loop::block_iterator BB = L->block_begin(), E = L->block_end();
+ BB != E; ++BB) {
+ if (*BB == L->getHeader()) continue;
+
+ SmallPtrSet<BasicBlock*, 4> BadPreds;
+ for (pred_iterator PI = pred_begin(*BB),
+ PE = pred_end(*BB); PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (!L->contains(P))
+ BadPreds.insert(P);
+ }
+
+ // Delete each unique out-of-loop (and thus dead) predecessor.
+ for (BasicBlock *P : BadPreds) {
+
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Deleting edge from dead predecessor "
+ << P->getName() << "\n");
+
+ // Zap the dead pred's terminator and replace it with unreachable.
+ Instruction *TI = P->getTerminator();
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false, PreserveLCSSA,
+ /*DTU=*/nullptr, MSSAU);
+ Changed = true;
+ }
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // If there are exiting blocks with branches on undef, resolve the undef in
+ // the direction which will exit the loop. This will help simplify loop
+ // trip count computations.
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (BasicBlock *ExitingBlock : ExitingBlocks)
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+ if (BI->isConditional()) {
+ if (UndefValue *Cond = dyn_cast<UndefValue>(BI->getCondition())) {
+
+ LLVM_DEBUG(dbgs()
+ << "LoopSimplify: Resolving \"br i1 undef\" to exit in "
+ << ExitingBlock->getName() << "\n");
+
+ BI->setCondition(ConstantInt::get(Cond->getType(),
+ !L->contains(BI->getSuccessor(0))));
+
+ Changed = true;
+ }
+ }
+
+ // Does the loop already have a preheader? If so, don't insert one.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) {
+ Preheader = InsertPreheaderForLoop(L, DT, LI, MSSAU, PreserveLCSSA);
+ if (Preheader)
+ Changed = true;
+ }
+
+ // Next, check to make sure that all exit nodes of the loop only have
+ // predecessors that are inside of the loop. This check guarantees that the
+ // loop preheader/header will dominate the exit blocks. If the exit block has
+ // predecessors from outside of the loop, split the edge now.
+ if (formDedicatedExitBlocks(L, DT, LI, MSSAU, PreserveLCSSA))
+ Changed = true;
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // If the header has more than two predecessors at this point (from the
+ // preheader and from multiple backedges), we must adjust the loop.
+ BasicBlock *LoopLatch = L->getLoopLatch();
+ if (!LoopLatch) {
+ // If this is really a nested loop, rip it out into a child loop. Don't do
+ // this for loops with a giant number of backedges, just factor them into a
+ // common backedge instead.
+ if (L->getNumBackEdges() < 8) {
+ if (Loop *OuterL = separateNestedLoop(L, Preheader, DT, LI, SE,
+ PreserveLCSSA, AC, MSSAU)) {
+ ++NumNested;
+ // Enqueue the outer loop as it should be processed next in our
+ // depth-first nest walk.
+ Worklist.push_back(OuterL);
+
+ // This is a big restructuring change, reprocess the whole loop.
+ Changed = true;
+ // GCC doesn't tail recursion eliminate this.
+ // FIXME: It isn't clear we can't rely on LLVM to TRE this.
+ goto ReprocessLoop;
+ }
+ }
+
+ // If we either couldn't, or didn't want to, identify nesting of the loops,
+ // insert a new block that all backedges target, then make it jump to the
+ // loop header.
+ LoopLatch = insertUniqueBackedgeBlock(L, Preheader, DT, LI, MSSAU);
+ if (LoopLatch)
+ Changed = true;
+ }
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Scan over the PHI nodes in the loop header. Since they now have only two
+ // incoming values (the loop is canonicalized), we may have simplified the PHI
+ // down to 'X = phi [X, Y]', which should be replaced with 'Y'.
+ PHINode *PN;
+ for (BasicBlock::iterator I = L->getHeader()->begin();
+ (PN = dyn_cast<PHINode>(I++)); )
+ if (Value *V = SimplifyInstruction(PN, {DL, nullptr, DT, AC})) {
+ if (SE) SE->forgetValue(PN);
+ if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ Changed = true;
+ }
+ }
+
+ // If this loop has multiple exits and the exits all go to the same
+ // block, attempt to merge the exits. This helps several passes, such
+ // as LoopRotation, which do not support loops with multiple exits.
+ // SimplifyCFG also does this (and this code uses the same utility
+ // function), however this code is loop-aware, where SimplifyCFG is
+ // not. That gives it the advantage of being able to hoist
+ // loop-invariant instructions out of the way to open up more
+ // opportunities, and the disadvantage of having the responsibility
+ // to preserve dominator information.
+ auto HasUniqueExitBlock = [&]() {
+ BasicBlock *UniqueExit = nullptr;
+ for (auto *ExitingBB : ExitingBlocks)
+ for (auto *SuccBB : successors(ExitingBB)) {
+ if (L->contains(SuccBB))
+ continue;
+
+ if (!UniqueExit)
+ UniqueExit = SuccBB;
+ else if (UniqueExit != SuccBB)
+ return false;
+ }
+
+ return true;
+ };
+ if (HasUniqueExitBlock()) {
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ BasicBlock *ExitingBlock = ExitingBlocks[i];
+ if (!ExitingBlock->getSinglePredecessor()) continue;
+ BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!BI || !BI->isConditional()) continue;
+ CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition());
+ if (!CI || CI->getParent() != ExitingBlock) continue;
+
+ // Attempt to hoist out all instructions except for the
+ // comparison and the branch.
+ bool AllInvariant = true;
+ bool AnyInvariant = false;
+ for (auto I = ExitingBlock->instructionsWithoutDebug().begin(); &*I != BI; ) {
+ Instruction *Inst = &*I++;
+ if (Inst == CI)
+ continue;
+ if (!L->makeLoopInvariant(
+ Inst, AnyInvariant,
+ Preheader ? Preheader->getTerminator() : nullptr, MSSAU)) {
+ AllInvariant = false;
+ break;
+ }
+ }
+ if (AnyInvariant) {
+ Changed = true;
+ // The loop disposition of all SCEV expressions that depend on any
+ // hoisted values have also changed.
+ if (SE)
+ SE->forgetLoopDispositions(L);
+ }
+ if (!AllInvariant) continue;
+
+ // The block has now been cleared of all instructions except for
+ // a comparison and a conditional branch. SimplifyCFG may be able
+ // to fold it now.
if (!FoldBranchToCommonDest(BI, /*DTU=*/nullptr, MSSAU))

- continue;
-
- // Success. The block is now dead, so remove it from the loop,
- // update the dominator tree and delete it.
- LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
- << ExitingBlock->getName() << "\n");
-
+ continue;
+
+ // Success. The block is now dead, so remove it from the loop,
+ // update the dominator tree and delete it.
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Eliminating exiting block "
+ << ExitingBlock->getName() << "\n");
+
assert(pred_empty(ExitingBlock));
- Changed = true;
- LI->removeBlock(ExitingBlock);
-
- DomTreeNode *Node = DT->getNode(ExitingBlock);
- while (!Node->isLeaf()) {
- DomTreeNode *Child = Node->back();
- DT->changeImmediateDominator(Child, Node->getIDom());
- }
- DT->eraseNode(ExitingBlock);
- if (MSSAU) {
- SmallSetVector<BasicBlock *, 8> ExitBlockSet;
- ExitBlockSet.insert(ExitingBlock);
- MSSAU->removeBlocks(ExitBlockSet);
- }
-
- BI->getSuccessor(0)->removePredecessor(
- ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
- BI->getSuccessor(1)->removePredecessor(
- ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
- ExitingBlock->eraseFromParent();
- }
- }
-
- // Changing exit conditions for blocks may affect exit counts of this loop and
-  // any of its parents, so we must invalidate the entire subtree if we've made
- // any changes.
- if (Changed && SE)
- SE->forgetTopmostLoop(L);
-
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
-
- return Changed;
-}
-
-bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
- ScalarEvolution *SE, AssumptionCache *AC,
- MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
- bool Changed = false;
-
-#ifndef NDEBUG
- // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
- // form.
- if (PreserveLCSSA) {
- assert(DT && "DT not available.");
- assert(LI && "LI not available.");
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Requested to preserve LCSSA, but it's already broken.");
- }
-#endif
-
- // Worklist maintains our depth-first queue of loops in this nest to process.
- SmallVector<Loop *, 4> Worklist;
- Worklist.push_back(L);
-
- // Walk the worklist from front to back, pushing newly found sub loops onto
- // the back. This will let us process loops from back to front in depth-first
- // order. We can use this simple process because loops form a tree.
- for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
- Loop *L2 = Worklist[Idx];
- Worklist.append(L2->begin(), L2->end());
- }
-
- while (!Worklist.empty())
- Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE,
- AC, MSSAU, PreserveLCSSA);
-
- return Changed;
-}
-
-namespace {
- struct LoopSimplify : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- LoopSimplify() : FunctionPass(ID) {
- initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
-
- // We need loop information to identify the loops...
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
-
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addPreservedID(LCSSAID);
- AU.addPreserved<DependenceAnalysisWrapperPass>();
- AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
- AU.addPreserved<BranchProbabilityInfoWrapperPass>();
- if (EnableMSSALoopDependency)
- AU.addPreserved<MemorySSAWrapperPass>();
- }
-
- /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
- void verifyAnalysis() const override;
- };
-}
-
-char LoopSimplify::ID = 0;
-INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
- "Canonicalize natural loops", false, false)
-
-// Publicly exposed interface to pass...
-char &llvm::LoopSimplifyID = LoopSimplify::ID;
-Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
-
-/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
-/// it in any convenient order) inserting preheaders...
-///
-bool LoopSimplify::runOnFunction(Function &F) {
- bool Changed = false;
- LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
- ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
- AssumptionCache *AC =
- &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- MemorySSA *MSSA = nullptr;
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (EnableMSSALoopDependency) {
- auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
- if (MSSAAnalysis) {
- MSSA = &MSSAAnalysis->getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
- }
-
- bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
-
- // Simplify each loop nest in the function.
+ Changed = true;
+ LI->removeBlock(ExitingBlock);
+
+ DomTreeNode *Node = DT->getNode(ExitingBlock);
+ while (!Node->isLeaf()) {
+ DomTreeNode *Child = Node->back();
+ DT->changeImmediateDominator(Child, Node->getIDom());
+ }
+ DT->eraseNode(ExitingBlock);
+ if (MSSAU) {
+ SmallSetVector<BasicBlock *, 8> ExitBlockSet;
+ ExitBlockSet.insert(ExitingBlock);
+ MSSAU->removeBlocks(ExitBlockSet);
+ }
+
+ BI->getSuccessor(0)->removePredecessor(
+ ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
+ BI->getSuccessor(1)->removePredecessor(
+ ExitingBlock, /* KeepOneInputPHIs */ PreserveLCSSA);
+ ExitingBlock->eraseFromParent();
+ }
+ }
+
+ // Changing exit conditions for blocks may affect exit counts of this loop and
+  // any of its parents, so we must invalidate the entire subtree if we've made
+ // any changes.
+ if (Changed && SE)
+ SE->forgetTopmostLoop(L);
+
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ return Changed;
+}
+
+bool llvm::simplifyLoop(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, AssumptionCache *AC,
+ MemorySSAUpdater *MSSAU, bool PreserveLCSSA) {
+ bool Changed = false;
+
+#ifndef NDEBUG
+ // If we're asked to preserve LCSSA, the loop nest needs to start in LCSSA
+ // form.
+ if (PreserveLCSSA) {
+ assert(DT && "DT not available.");
+ assert(LI && "LI not available.");
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Requested to preserve LCSSA, but it's already broken.");
+ }
+#endif
+
+ // Worklist maintains our depth-first queue of loops in this nest to process.
+ SmallVector<Loop *, 4> Worklist;
+ Worklist.push_back(L);
+
+ // Walk the worklist from front to back, pushing newly found sub loops onto
+ // the back. This will let us process loops from back to front in depth-first
+ // order. We can use this simple process because loops form a tree.
+ for (unsigned Idx = 0; Idx != Worklist.size(); ++Idx) {
+ Loop *L2 = Worklist[Idx];
+ Worklist.append(L2->begin(), L2->end());
+ }
+
+ while (!Worklist.empty())
+ Changed |= simplifyOneLoop(Worklist.pop_back_val(), Worklist, DT, LI, SE,
+ AC, MSSAU, PreserveLCSSA);
+
+ return Changed;
+}
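As a usage sketch (the wrapper name is ours, and it assumes the declaration from llvm/Transforms/Utils/LoopSimplify.h): utility code that wants canonical loops before running its own transform can call this entry point directly, passing null for any analyses it does not maintain.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"

// Canonicalize one loop nest in place; returns true if anything changed.
static bool canonicalizeNest(llvm::Loop *L, llvm::DominatorTree &DT,
                             llvm::LoopInfo &LI, llvm::AssumptionCache &AC) {
  return llvm::simplifyLoop(L, &DT, &LI, /*SE=*/nullptr, &AC,
                            /*MSSAU=*/nullptr, /*PreserveLCSSA=*/false);
}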
+
+namespace {
+ struct LoopSimplify : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ LoopSimplify() : FunctionPass(ID) {
+ initializeLoopSimplifyPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ // We need loop information to identify the loops...
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreservedID(LCSSAID);
+ AU.addPreserved<DependenceAnalysisWrapperPass>();
+ AU.addPreservedID(BreakCriticalEdgesID); // No critical edges added.
+ AU.addPreserved<BranchProbabilityInfoWrapperPass>();
+ if (EnableMSSALoopDependency)
+ AU.addPreserved<MemorySSAWrapperPass>();
+ }
+
+ /// verifyAnalysis() - Verify LoopSimplifyForm's guarantees.
+ void verifyAnalysis() const override;
+ };
+}
+
+char LoopSimplify::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(LoopSimplify, "loop-simplify",
+ "Canonicalize natural loops", false, false)
+
+// Publicly exposed interface to pass...
+char &llvm::LoopSimplifyID = LoopSimplify::ID;
+Pass *llvm::createLoopSimplifyPass() { return new LoopSimplify(); }
+
+/// runOnFunction - Run down all loops in the CFG (recursively, but we could do
+/// it in any convenient order) inserting preheaders...
+///
+bool LoopSimplify::runOnFunction(Function &F) {
+ bool Changed = false;
+ LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
+ ScalarEvolution *SE = SEWP ? &SEWP->getSE() : nullptr;
+ AssumptionCache *AC =
+ &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ MemorySSA *MSSA = nullptr;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (EnableMSSALoopDependency) {
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAAnalysis) {
+ MSSA = &MSSAAnalysis->getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+ }
+
+ bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+ // Simplify each loop nest in the function.
for (auto *L : *LI)
Changed |= simplifyLoop(L, DT, LI, SE, AC, MSSAU.get(), PreserveLCSSA);
-
-#ifndef NDEBUG
- if (PreserveLCSSA) {
- bool InLCSSA = all_of(
- *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
- assert(InLCSSA && "LCSSA is broken after loop-simplify.");
- }
-#endif
- return Changed;
-}
-
-PreservedAnalyses LoopSimplifyPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- bool Changed = false;
- LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
- AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F);
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSAAnalysis) {
- auto *MSSA = &MSSAAnalysis->getMSSA();
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
- }
-
-
- // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA
- // after simplifying the loops. MemorySSA is preserved if it exists.
+
+#ifndef NDEBUG
+ if (PreserveLCSSA) {
+ bool InLCSSA = all_of(
+ *LI, [&](Loop *L) { return L->isRecursivelyLCSSAForm(*DT, *LI); });
+ assert(InLCSSA && "LCSSA is broken after loop-simplify.");
+ }
+#endif
+ return Changed;
+}
+
+PreservedAnalyses LoopSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = false;
+ LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution *SE = AM.getCachedResult<ScalarEvolutionAnalysis>(F);
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *MSSAAnalysis = AM.getCachedResult<MemorySSAAnalysis>(F);
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSAAnalysis) {
+ auto *MSSA = &MSSAAnalysis->getMSSA();
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
+
+
+ // Note that we don't preserve LCSSA in the new PM, if you need it run LCSSA
+ // after simplifying the loops. MemorySSA is preserved if it exists.
for (auto *L : *LI)
- Changed |=
+ Changed |=
simplifyLoop(L, DT, LI, SE, AC, MSSAU.get(), /*PreserveLCSSA*/ false);
-
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<LoopAnalysis>();
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- PA.preserve<SCEVAA>();
- PA.preserve<ScalarEvolutionAnalysis>();
- PA.preserve<DependenceAnalysis>();
- if (MSSAAnalysis)
- PA.preserve<MemorySSAAnalysis>();
- // BPI maps conditional terminators to probabilities, LoopSimplify can insert
- // blocks, but it does so only by splitting existing blocks and edges. This
- // results in the interesting property that all new terminators inserted are
- // unconditional branches which do not appear in BPI. All deletions are
- // handled via ValueHandle callbacks w/in BPI.
- PA.preserve<BranchProbabilityAnalysis>();
- return PA;
-}
-
-// FIXME: Restore this code when we re-enable verification in verifyAnalysis
-// below.
-#if 0
-static void verifyLoop(Loop *L) {
- // Verify subloops.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- verifyLoop(*I);
-
- // It used to be possible to just assert L->isLoopSimplifyForm(), however
- // with the introduction of indirectbr, there are now cases where it's
- // not possible to transform a loop as necessary. We can at least check
- // that there is an indirectbr near any time there's trouble.
-
- // Indirectbr can interfere with preheader and unique backedge insertion.
- if (!L->getLoopPreheader() || !L->getLoopLatch()) {
- bool HasIndBrPred = false;
- for (pred_iterator PI = pred_begin(L->getHeader()),
- PE = pred_end(L->getHeader()); PI != PE; ++PI)
- if (isa<IndirectBrInst>((*PI)->getTerminator())) {
- HasIndBrPred = true;
- break;
- }
- assert(HasIndBrPred &&
- "LoopSimplify has no excuse for missing loop header info!");
- (void)HasIndBrPred;
- }
-
- // Indirectbr can interfere with exit block canonicalization.
- if (!L->hasDedicatedExits()) {
- bool HasIndBrExiting = false;
- SmallVector<BasicBlock*, 8> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
- HasIndBrExiting = true;
- break;
- }
- }
-
- assert(HasIndBrExiting &&
- "LoopSimplify has no excuse for missing exit block info!");
- (void)HasIndBrExiting;
- }
-}
-#endif
-
-void LoopSimplify::verifyAnalysis() const {
- // FIXME: This routine is being called mid-way through the loop pass manager
- // as loop passes destroy this analysis. That's actually fine, but we have no
- // way of expressing that here. Once all of the passes that destroy this are
- // hoisted out of the loop pass manager we can add back verification here.
-#if 0
- for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
- verifyLoop(*I);
-#endif
-}
+
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<SCEVAA>();
+ PA.preserve<ScalarEvolutionAnalysis>();
+ PA.preserve<DependenceAnalysis>();
+ if (MSSAAnalysis)
+ PA.preserve<MemorySSAAnalysis>();
+ // BPI maps conditional terminators to probabilities, LoopSimplify can insert
+ // blocks, but it does so only by splitting existing blocks and edges. This
+ // results in the interesting property that all new terminators inserted are
+ // unconditional branches which do not appear in BPI. All deletions are
+ // handled via ValueHandle callbacks w/in BPI.
+ PA.preserve<BranchProbabilityAnalysis>();
+ return PA;
+}
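A hedged sketch of driving this pass from client code under the new pass manager; the analysis-manager boilerplate is the standard PassBuilder setup and nothing here is specific to this file:

#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"

void runLoopSimplifyOn(llvm::Function &F) {
  llvm::PassBuilder PB;
  llvm::LoopAnalysisManager LAM;
  llvm::FunctionAnalysisManager FAM;
  llvm::CGSCCAnalysisManager CGAM;
  llvm::ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LoopSimplifyPass());
  FPM.run(F, FAM); // LCSSA is not preserved here; run LCSSA afterwards if needed.
}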
+
+// FIXME: Restore this code when we re-enable verification in verifyAnalysis
+// below.
+#if 0
+static void verifyLoop(Loop *L) {
+ // Verify subloops.
+ for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+ verifyLoop(*I);
+
+ // It used to be possible to just assert L->isLoopSimplifyForm(), however
+ // with the introduction of indirectbr, there are now cases where it's
+ // not possible to transform a loop as necessary. We can at least check
+ // that there is an indirectbr near any time there's trouble.
+
+ // Indirectbr can interfere with preheader and unique backedge insertion.
+ if (!L->getLoopPreheader() || !L->getLoopLatch()) {
+ bool HasIndBrPred = false;
+ for (pred_iterator PI = pred_begin(L->getHeader()),
+ PE = pred_end(L->getHeader()); PI != PE; ++PI)
+ if (isa<IndirectBrInst>((*PI)->getTerminator())) {
+ HasIndBrPred = true;
+ break;
+ }
+ assert(HasIndBrPred &&
+ "LoopSimplify has no excuse for missing loop header info!");
+ (void)HasIndBrPred;
+ }
+
+ // Indirectbr can interfere with exit block canonicalization.
+ if (!L->hasDedicatedExits()) {
+ bool HasIndBrExiting = false;
+ SmallVector<BasicBlock*, 8> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ if (isa<IndirectBrInst>((ExitingBlocks[i])->getTerminator())) {
+ HasIndBrExiting = true;
+ break;
+ }
+ }
+
+ assert(HasIndBrExiting &&
+ "LoopSimplify has no excuse for missing exit block info!");
+ (void)HasIndBrExiting;
+ }
+}
+#endif
+
+void LoopSimplify::verifyAnalysis() const {
+ // FIXME: This routine is being called mid-way through the loop pass manager
+ // as loop passes destroy this analysis. That's actually fine, but we have no
+ // way of expressing that here. Once all of the passes that destroy this are
+ // hoisted out of the loop pass manager we can add back verification here.
+#if 0
+ for (LoopInfo::iterator I = LI->begin(), E = LI->end(); I != E; ++I)
+ verifyLoop(*I);
+#endif
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
index 6426a5636f..d4cd574052 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnroll.cpp
@@ -1,367 +1,367 @@
-//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some loop unrolling utilities. It does not define any
-// actual pass or policy, but provides a single function to perform loop
-// unrolling.
-//
-// The process of unrolling can produce extraneous basic blocks linked with
-// unconditional branches. This will be corrected in the future.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/ilist_iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
+//===-- UnrollLoop.cpp - Loop unrolling utilities -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities. It does not define any
+// actual pass or policy, but provides a single function to perform loop
+// unrolling.
+//
+// The process of unrolling can produce extraneous basic blocks linked with
+// unconditional branches. This will be corrected in the future.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopPeel.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <assert.h>
-#include <type_traits>
-#include <vector>
-
-namespace llvm {
-class DataLayout;
-class Value;
-} // namespace llvm
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-// TODO: Should these be here or in LoopUnroll?
-STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
-STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
- "latch (completely or otherwise)");
-
-static cl::opt<bool>
-UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
- cl::desc("Allow runtime unrolled loops to be unrolled "
- "with epilog instead of prolog."));
-
-static cl::opt<bool>
-UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
- cl::desc("Verify domtree after unrolling"),
-#ifdef EXPENSIVE_CHECKS
- cl::init(true)
-#else
- cl::init(false)
-#endif
- );
-
-/// Check if unrolling created a situation where we need to insert phi nodes to
-/// preserve LCSSA form.
-/// \param Blocks is a vector of basic blocks representing unrolled loop.
-/// \param L is the outer loop.
-/// It's possible that some of the blocks are in L, and some are not. In this
-/// case, if there is a use outside L and its definition is inside L, we need to
-/// insert a phi-node, otherwise LCSSA will be broken.
-/// The function is just a helper function for llvm::UnrollLoop that returns
-/// true if this situation occurs, indicating that LCSSA needs to be fixed.
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <assert.h>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+class DataLayout;
+class Value;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+// TODO: Should these be here or in LoopUnroll?
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
+STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
+ "latch (completely or otherwise)");
+
+static cl::opt<bool>
+UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolled loops to be unrolled "
+ "with epilog instead of prolog."));
+
+static cl::opt<bool>
+UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
+ cl::desc("Verify domtree after unrolling"),
+#ifdef EXPENSIVE_CHECKS
+ cl::init(true)
+#else
+ cl::init(false)
+#endif
+ );
+
+/// Check if unrolling created a situation where we need to insert phi nodes to
+/// preserve LCSSA form.
+/// \param Blocks is a vector of basic blocks representing unrolled loop.
+/// \param L is the outer loop.
+/// It's possible that some of the blocks are in L, and some are not. In this
+/// case, if there is a use outside L and its definition is inside L, we need to
+/// insert a phi-node, otherwise LCSSA will be broken.
+/// The function is just a helper function for llvm::UnrollLoop that returns
+/// true if this situation occurs, indicating that LCSSA needs to be fixed.
static bool needToInsertPhisForLCSSA(Loop *L,
const std::vector<BasicBlock *> &Blocks,
- LoopInfo *LI) {
- for (BasicBlock *BB : Blocks) {
- if (LI->getLoopFor(BB) == L)
- continue;
- for (Instruction &I : *BB) {
- for (Use &U : I.operands()) {
+ LoopInfo *LI) {
+ for (BasicBlock *BB : Blocks) {
+ if (LI->getLoopFor(BB) == L)
+ continue;
+ for (Instruction &I : *BB) {
+ for (Use &U : I.operands()) {
if (const auto *Def = dyn_cast<Instruction>(U)) {
- Loop *DefLoop = LI->getLoopFor(Def->getParent());
- if (!DefLoop)
- continue;
- if (DefLoop->contains(L))
- return true;
- }
- }
- }
- }
- return false;
-}
-
-/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
-/// and adds a mapping from the original loop to the new loop to NewLoops.
-/// Returns nullptr if no new loop was created and a pointer to the
-/// original loop OriginalBB was part of otherwise.
-const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
- BasicBlock *ClonedBB, LoopInfo *LI,
- NewLoopsMap &NewLoops) {
- // Figure out which loop New is in.
- const Loop *OldLoop = LI->getLoopFor(OriginalBB);
- assert(OldLoop && "Should (at least) be in the loop being unrolled!");
-
- Loop *&NewLoop = NewLoops[OldLoop];
- if (!NewLoop) {
- // Found a new sub-loop.
- assert(OriginalBB == OldLoop->getHeader() &&
- "Header should be first in RPO");
-
- NewLoop = LI->AllocateLoop();
- Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
-
- if (NewLoopParent)
- NewLoopParent->addChildLoop(NewLoop);
- else
- LI->addTopLevelLoop(NewLoop);
-
- NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
- return OldLoop;
- } else {
- NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
- return nullptr;
- }
-}
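A hedged sketch of the calling pattern for this helper (the names and the single-iteration framing are ours; the real driver is UnrollLoop further below): each original block is cloned and immediately registered so LoopInfo stays consistent. The sketch assumes L has no subloops, so every clone is added back to L itself, and it shows only the LoopInfo bookkeeping, not the CFG and PHI rewiring that full unrolling performs.

#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Utils/ValueMapper.h"

static void cloneBlocksOnce(llvm::Loop *L, llvm::LoopInfo *LI,
                            llvm::ValueToValueMapTy &VMap) {
  llvm::NewLoopsMap NewLoops;
  NewLoops[L] = L; // clones of L's own blocks are added back to L
  for (llvm::BasicBlock *BB : L->getBlocks()) {
    llvm::BasicBlock *Clone =
        llvm::CloneBasicBlock(BB, VMap, ".unroll", BB->getParent());
    llvm::addClonedBlockToLoopInfo(BB, Clone, LI, NewLoops);
  }
}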
-
-/// The function chooses which type of unroll (epilog or prolog) is more
-/// profitable.
-/// Epilog unroll is more profitable when there is a PHI that starts from a
-/// constant. In this case epilog unrolling keeps the PHI starting from a
-/// constant, whereas prolog unrolling converts it to a non-constant.
-///
-/// loop:
-/// PN = PHI [I, Latch], [CI, PreHeader]
-/// I = foo(PN)
-/// ...
-///
-/// Epilog unroll case.
-/// loop:
-/// PN = PHI [I2, Latch], [CI, PreHeader]
-/// I1 = foo(PN)
-/// I2 = foo(I1)
-/// ...
-/// Prolog unroll case.
-/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
-/// loop:
-/// PN = PHI [I2, Latch], [NewPN, PreHeader]
-/// I1 = foo(PN)
-/// I2 = foo(I1)
-/// ...
-///
-static bool isEpilogProfitable(Loop *L) {
- BasicBlock *PreHeader = L->getLoopPreheader();
- BasicBlock *Header = L->getHeader();
- assert(PreHeader && Header);
- for (const PHINode &PN : Header->phis()) {
- if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
- return true;
- }
- return false;
-}
-
-/// Perform some cleanup and simplifications on loops after unrolling. It is
-/// useful to simplify the IV's in the new loop, as well as do a quick
-/// simplify/dce pass of the instructions.
-void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC,
- const TargetTransformInfo *TTI) {
- // Simplify any new induction variables in the partially unrolled loop.
- if (SE && SimplifyIVs) {
- SmallVector<WeakTrackingVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
-
- // Aggressively clean up dead instructions that simplifyLoopIVs already
- // identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty()) {
- Value *V = DeadInsts.pop_back_val();
- if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
- RecursivelyDeleteTriviallyDeadInstructions(Inst);
- }
- }
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
- for (BasicBlock *BB : L->getBlocks()) {
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
-
- if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
- if (LI->replacementPreservesLCSSAForm(Inst, V))
- Inst->replaceAllUsesWith(V);
- if (isInstructionTriviallyDead(Inst))
- BB->getInstList().erase(Inst);
- }
- }
-
- // TODO: after peeling or unrolling, previously loop variant conditions are
- // likely to fold to constants, eagerly propagating those here will require
- // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
- // appropriate.
-}
-
-/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
-/// can only fail when the loop's latch block is not terminated by a conditional
-/// branch instruction. However, if the trip count (and multiple) are not known,
-/// loop unrolling will mostly produce more code that is no faster.
-///
-/// TripCount is the upper bound of the iteration on which control exits
-/// LatchBlock. Control may exit the loop prior to TripCount iterations either
-/// via an early branch in another loop block or via the LatchBlock terminator. This
-/// is relaxed from the general definition of trip count which is the number of
-/// times the loop header executes. Note that UnrollLoop assumes that the loop
-/// counter test is in LatchBlock in order to remove unnecessary instances of
-/// the test. If control can exit the loop from the LatchBlock's terminator
-/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
-///
-/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
-/// needs to be preserved. It is needed when we use trip count upper bound to
-/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
-/// conditional branch needs to be preserved.
-///
-/// Similarly, TripMultiple divides the number of times that the LatchBlock may
-/// execute without exiting the loop.
-///
-/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
-/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
-/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
-/// iterations before branching into the unrolled loop. UnrollLoop will not
-/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
-/// AllowExpensiveTripCount is false.
-///
-/// If we want to perform PGO-based loop peeling, PeelCount is set to the
-/// number of iterations we want to peel off.
-///
-/// The LoopInfo Analysis that is passed will be kept consistent.
-///
-/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
-/// DominatorTree if they are non-null.
-///
-/// If RemainderLoop is non-null, it will receive the remainder loop (if
-/// required and not fully unrolled).
-LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
- ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC,
- const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE,
- bool PreserveLCSSA, Loop **RemainderLoop) {
-
+ Loop *DefLoop = LI->getLoopFor(Def->getParent());
+ if (!DefLoop)
+ continue;
+ if (DefLoop->contains(L))
+ return true;
+ }
+ }
+ }
+ }
+ return false;
+}
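+//
+// Illustrative example (editor's sketch, not from the original source): after
+// unrolling, a block that ends up outside %L may still use a value %x defined
+// in a loop that contains %L; preserving LCSSA then requires a PHI in the exit
+// block, e.g.
+//   %x.lcssa = phi i32 [ %x, %exiting.block ]
+// needToInsertPhisForLCSSA only detects that such a fix-up is needed; the
+// actual PHIs are inserted later via formLCSSARecursively.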
+
+/// Adds ClonedBB to LoopInfo, creates a new loop for ClonedBB if necessary
+/// and adds a mapping from the original loop to the new loop to NewLoops.
+/// Returns nullptr if no new loop was created, and otherwise returns a
+/// pointer to the original loop that OriginalBB was part of.
+const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
+ BasicBlock *ClonedBB, LoopInfo *LI,
+ NewLoopsMap &NewLoops) {
+ // Figure out which loop New is in.
+ const Loop *OldLoop = LI->getLoopFor(OriginalBB);
+ assert(OldLoop && "Should (at least) be in the loop being unrolled!");
+
+ Loop *&NewLoop = NewLoops[OldLoop];
+ if (!NewLoop) {
+ // Found a new sub-loop.
+ assert(OriginalBB == OldLoop->getHeader() &&
+ "Header should be first in RPO");
+
+ NewLoop = LI->AllocateLoop();
+ Loop *NewLoopParent = NewLoops.lookup(OldLoop->getParentLoop());
+
+ if (NewLoopParent)
+ NewLoopParent->addChildLoop(NewLoop);
+ else
+ LI->addTopLevelLoop(NewLoop);
+
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return OldLoop;
+ } else {
+ NewLoop->addBasicBlockToLoop(ClonedBB, *LI);
+ return nullptr;
+ }
+}
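+//
+// Usage sketch (editor's note, not in the original source): callers seed the
+// map with the loop being unrolled, e.g.
+//   SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+//   NewLoops[L] = L;
+// and then call addClonedBlockToLoopInfo for each cloned block in reverse
+// postorder, so a sub-loop header is always seen before the sub-loop's other
+// blocks and gets a freshly allocated Loop.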
+
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant. In this case the epilog will leave the PHI starting from the
+/// constant, but the prolog will convert it to a non-constant.
+///
+/// loop:
+/// PN = PHI [I, Latch], [CI, PreHeader]
+/// I = foo(PN)
+/// ...
+///
+/// Epilog unroll case.
+/// loop:
+/// PN = PHI [I2, Latch], [CI, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+/// Prolog unroll case.
+/// NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+/// PN = PHI [I2, Latch], [NewPN, PreHeader]
+/// I1 = foo(PN)
+/// I2 = foo(I1)
+/// ...
+///
+static bool isEpilogProfitable(Loop *L) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BasicBlock *Header = L->getHeader();
+ assert(PreHeader && Header);
+ for (const PHINode &PN : Header->phis()) {
+ if (isa<ConstantInt>(PN.getIncomingValueForBlock(PreHeader)))
+ return true;
+ }
+ return false;
+}
+
+/// Perform some cleanup and simplifications on loops after unrolling. It is
+/// useful to simplify the IV's in the new loop, as well as do a quick
+/// simplify/dce pass of the instructions.
+void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI) {
+ // Simplify any new induction variables in the partially unrolled loop.
+ if (SE && SimplifyIVs) {
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
+ simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
+
+ // Aggressively clean up dead instructions that simplifyLoopIVs already
+ // identified. Any remaining should be cleaned up below.
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+ }
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ for (BasicBlock *BB : L->getBlocks()) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *Inst = &*I++;
+
+ if (Value *V = SimplifyInstruction(Inst, {DL, nullptr, DT, AC}))
+ if (LI->replacementPreservesLCSSAForm(Inst, V))
+ Inst->replaceAllUsesWith(V);
+ if (isInstructionTriviallyDead(Inst))
+ BB->getInstList().erase(Inst);
+ }
+ }
+
+ // TODO: after peeling or unrolling, previously loop variant conditions are
+ // likely to fold to constants, eagerly propagating those here will require
+ // fewer cleanup passes to be run. Alternatively, a LoopEarlyCSE might be
+ // appropriate.
+}
+
+/// Unroll the given loop by Count. The loop must be in LCSSA form. Unrolling
+/// can only fail when the loop's latch block is not terminated by a conditional
+/// branch instruction. However, if the trip count (and multiple) are not known,
+/// loop unrolling will mostly produce more code that is no faster.
+///
+/// TripCount is the upper bound of the iteration on which control exits
+/// LatchBlock. Control may exit the loop prior to TripCount iterations either
+/// via an early branch in another loop block or via the LatchBlock terminator. This
+/// is relaxed from the general definition of trip count which is the number of
+/// times the loop header executes. Note that UnrollLoop assumes that the loop
+/// counter test is in LatchBlock in order to remove unnecessary instances of
+/// the test. If control can exit the loop from the LatchBlock's terminator
+/// prior to TripCount iterations, flag PreserveCondBr needs to be set.
+///
+/// PreserveCondBr indicates whether the conditional branch of the LatchBlock
+/// needs to be preserved. It is needed when we use trip count upper bound to
+/// fully unroll the loop. If PreserveOnlyFirst is also set then only the first
+/// conditional branch needs to be preserved.
+///
+/// Similarly, TripMultiple divides the number of times that the LatchBlock may
+/// execute without exiting the loop.
+///
+/// If AllowRuntime is true then UnrollLoop will consider unrolling loops that
+/// have a runtime (i.e. not compile time constant) trip count. Unrolling these
+/// loops requires an unroll "prologue" that runs "RuntimeTripCount % Count"
+/// iterations before branching into the unrolled loop. UnrollLoop will not
+/// runtime-unroll the loop if computing RuntimeTripCount will be expensive and
+/// AllowExpensiveTripCount is false.
+///
+/// If we want to perform PGO-based loop peeling, PeelCount is set to the
+/// number of iterations we want to peel off.
+///
+/// The LoopInfo Analysis that is passed will be kept consistent.
+///
+/// This utility preserves LoopInfo. It will also preserve ScalarEvolution and
+/// DominatorTree if they are non-null.
+///
+/// If RemainderLoop is non-null, it will receive the remainder loop (if
+/// required and not fully unrolled).
+LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
+ ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE,
+ bool PreserveLCSSA, Loop **RemainderLoop) {
+
if (!L->getLoopPreheader()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
if (!L->getLoopLatch()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- // Loops with indirectbr cannot be cloned.
- if (!L->isSafeToClone()) {
- LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
- return LoopUnrollResult::Unmodified;
- }
-
+ LLVM_DEBUG(dbgs() << " Can't unroll; loop exit-block-insertion failed.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ // Loops with indirectbr cannot be cloned.
+ if (!L->isSafeToClone()) {
+ LLVM_DEBUG(dbgs() << " Can't unroll; Loop body cannot be cloned.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
if (L->getHeader()->hasAddressTaken()) {
- // The loop-rotate pass can be helpful to avoid this in many cases.
- LLVM_DEBUG(
- dbgs() << " Won't unroll loop: address of header block is taken.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- if (ULO.TripCount != 0)
- LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
- if (ULO.TripMultiple != 1)
- LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
-
- // Effectively "DCE" unrolled iterations that are beyond the tripcount
- // and will never be executed.
- if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
- ULO.Count = ULO.TripCount;
-
- // Don't enter the unroll code if there is nothing to do.
- if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
- LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
- return LoopUnrollResult::Unmodified;
- }
-
- assert(ULO.Count > 0);
- assert(ULO.TripMultiple > 0);
- assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
-
- // Are we eliminating the loop control altogether?
- bool CompletelyUnroll = ULO.Count == ULO.TripCount;
-
- // We assume a run-time trip count if the compiler cannot
- // figure out the loop trip count and the unroll-runtime
- // flag is specified.
- bool RuntimeTripCount =
- (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
-
- assert((!RuntimeTripCount || !ULO.PeelCount) &&
- "Did not expect runtime trip-count unrolling "
- "and peeling for the same loop");
-
- bool Peeled = false;
- if (ULO.PeelCount) {
- Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
-
- // Successful peeling may result in a change in the loop preheader/trip
- // counts. If we later unroll the loop, we want these to be updated.
- if (Peeled) {
- // According to our guards and profitability checks the only
- // meaningful exit should be latch block. Other exits go to deopt,
- // so we do not worry about them.
- BasicBlock *ExitingBlock = L->getLoopLatch();
- assert(ExitingBlock && "Loop without exiting block?");
- assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
- ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
- ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
- }
- }
-
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ LLVM_DEBUG(
+ dbgs() << " Won't unroll loop: address of header block is taken.\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ if (ULO.TripCount != 0)
+ LLVM_DEBUG(dbgs() << " Trip Count = " << ULO.TripCount << "\n");
+ if (ULO.TripMultiple != 1)
+ LLVM_DEBUG(dbgs() << " Trip Multiple = " << ULO.TripMultiple << "\n");
+
+ // Effectively "DCE" unrolled iterations that are beyond the tripcount
+ // and will never be executed.
+ if (ULO.TripCount != 0 && ULO.Count > ULO.TripCount)
+ ULO.Count = ULO.TripCount;
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (ULO.TripCount == 0 && ULO.Count < 2 && ULO.PeelCount == 0) {
+ LLVM_DEBUG(dbgs() << "Won't unroll; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(ULO.Count > 0);
+ assert(ULO.TripMultiple > 0);
+ assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = ULO.Count == ULO.TripCount;
+
+ // We assume a run-time trip count if the compiler cannot
+ // figure out the loop trip count and the unroll-runtime
+ // flag is specified.
+ bool RuntimeTripCount =
+ (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime);
+
+ assert((!RuntimeTripCount || !ULO.PeelCount) &&
+ "Did not expect runtime trip-count unrolling "
+ "and peeling for the same loop");
+
+ bool Peeled = false;
+ if (ULO.PeelCount) {
+ Peeled = peelLoop(L, ULO.PeelCount, LI, SE, DT, AC, PreserveLCSSA);
+
+ // Successful peeling may result in a change in the loop preheader/trip
+ // counts. If we later unroll the loop, we want these to be updated.
+ if (Peeled) {
+ // According to our guards and profitability checks the only
+ // meaningful exit should be latch block. Other exits go to deopt,
+ // so we do not worry about them.
+ BasicBlock *ExitingBlock = L->getLoopLatch();
+ assert(ExitingBlock && "Loop without exiting block?");
+ assert(L->isLoopExiting(ExitingBlock) && "Latch is not exiting?");
+ ULO.TripCount = SE->getSmallConstantTripCount(L, ExitingBlock);
+ ULO.TripMultiple = SE->getSmallConstantTripMultiple(L, ExitingBlock);
+ }
+ }
+
// All these values should be taken only after peeling because they might have
// changed.
BasicBlock *Preheader = L->getLoopPreheader();
@@ -414,280 +414,280 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
dbgs() << " No single exiting block\n";
});
- // Loops containing convergent instructions must have a count that divides
- // their TripMultiple.
- LLVM_DEBUG(
- {
- bool HasConvergent = false;
- for (auto &BB : L->blocks())
- for (auto &I : *BB)
- if (auto *CB = dyn_cast<CallBase>(&I))
- HasConvergent |= CB->isConvergent();
- assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
- "Unroll count must divide trip multiple if loop contains a "
- "convergent operation.");
- });
-
- bool EpilogProfitability =
- UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
- : isEpilogProfitable(L);
-
- if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
- !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
- EpilogProfitability, ULO.UnrollRemainder,
- ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
- PreserveLCSSA, RemainderLoop)) {
- if (ULO.Force)
- RuntimeTripCount = false;
- else {
- LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
- "generated when assuming runtime trip count\n");
- return LoopUnrollResult::Unmodified;
- }
- }
-
- // If we know the trip count, we know the multiple...
- unsigned BreakoutTrip = 0;
- if (ULO.TripCount != 0) {
- BreakoutTrip = ULO.TripCount % ULO.Count;
- ULO.TripMultiple = 0;
- } else {
- // Figure out what multiple to use.
- BreakoutTrip = ULO.TripMultiple =
- (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
- }
-
- using namespace ore;
- // Report the unrolling decision.
- if (CompletelyUnroll) {
- LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
- << " with trip count " << ULO.TripCount << "!\n");
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
- L->getHeader())
- << "completely unrolled loop with "
- << NV("UnrollCount", ULO.TripCount) << " iterations";
- });
- } else if (ULO.PeelCount) {
- LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
- << " with iteration count " << ULO.PeelCount << "!\n");
- if (ORE)
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
- L->getHeader())
- << " peeled loop by " << NV("PeelCount", ULO.PeelCount)
- << " iterations";
- });
- } else {
- auto DiagBuilder = [&]() {
- OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
- L->getHeader());
- return Diag << "unrolled loop by a factor of "
- << NV("UnrollCount", ULO.Count);
- };
-
- LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
- << ULO.Count);
- if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
- LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
- if (ORE)
- ORE->emit([&]() {
- return DiagBuilder() << " with a breakout at trip "
- << NV("BreakoutTrip", BreakoutTrip);
- });
- } else if (ULO.TripMultiple != 1) {
- LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
- if (ORE)
- ORE->emit([&]() {
- return DiagBuilder()
- << " with " << NV("TripMultiple", ULO.TripMultiple)
- << " trips per branch";
- });
- } else if (RuntimeTripCount) {
- LLVM_DEBUG(dbgs() << " with run-time trip count");
- if (ORE)
- ORE->emit(
- [&]() { return DiagBuilder() << " with run-time trip count"; });
- }
- LLVM_DEBUG(dbgs() << "!\n");
- }
-
- // We are going to make changes to this loop. SCEV may be keeping cached info
- // about it, in particular about backedge taken count. The changes we make
- // are guaranteed to invalidate this information for our loop. It is tempting
- // to only invalidate the loop being unrolled, but it is incorrect as long as
- // all exiting branches from all inner loops have impact on the outer loops,
- // and if something changes inside them then any of outer loops may also
- // change. When we forget outermost loop, we also forget all contained loops
- // and this is what we need here.
- if (SE) {
- if (ULO.ForgetAllSCEV)
- SE->forgetAllLoops();
- else
- SE->forgetTopmostLoop(L);
- }
-
- if (!LatchIsExiting)
- ++NumUnrolledNotLatch;
- Optional<bool> ContinueOnTrue = None;
- BasicBlock *LoopExit = nullptr;
- if (ExitingBI) {
- ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
- LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
- }
-
- // For the first iteration of the loop, we should use the precloned values for
- // PHI nodes. Insert associations now.
- ValueToValueMapTy LastValueMap;
- std::vector<PHINode*> OrigPHINode;
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- OrigPHINode.push_back(cast<PHINode>(I));
- }
-
- std::vector<BasicBlock *> Headers;
- std::vector<BasicBlock *> ExitingBlocks;
- std::vector<BasicBlock *> ExitingSucc;
- std::vector<BasicBlock *> Latches;
- Headers.push_back(Header);
- Latches.push_back(LatchBlock);
- if (ExitingBI) {
- ExitingBlocks.push_back(ExitingBI->getParent());
- ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
- }
-
- // The current on-the-fly SSA update requires blocks to be processed in
- // reverse postorder so that LastValueMap contains the correct value at each
- // exit.
- LoopBlocksDFS DFS(L);
- DFS.perform(LI);
-
- // Stash the DFS iterators before adding blocks to the loop.
- LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
-
- std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
-
- // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
- // might break loop-simplified form for these loops (as they, e.g., would
- // share the same exit blocks). We'll keep track of loops for which we can
- // break this so that later we can re-simplify them.
- SmallSetVector<Loop *, 4> LoopsToSimplify;
- for (Loop *SubLoop : *L)
- LoopsToSimplify.insert(SubLoop);
-
- if (Header->getParent()->isDebugInfoForProfiling())
- for (BasicBlock *BB : L->getBlocks())
- for (Instruction &I : *BB)
- if (!isa<DbgInfoIntrinsic>(&I))
- if (const DILocation *DIL = I.getDebugLoc()) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
- if (NewDIL)
- I.setDebugLoc(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
-
+ // Loops containing convergent instructions must have a count that divides
+ // their TripMultiple.
+ LLVM_DEBUG(
+ {
+ bool HasConvergent = false;
+ for (auto &BB : L->blocks())
+ for (auto &I : *BB)
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ HasConvergent |= CB->isConvergent();
+ assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
+ "Unroll count must divide trip multiple if loop contains a "
+ "convergent operation.");
+ });
+
+ bool EpilogProfitability =
+ UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+ : isEpilogProfitable(L);
+
+ if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
+ !UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
+ EpilogProfitability, ULO.UnrollRemainder,
+ ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
+ PreserveLCSSA, RemainderLoop)) {
+ if (ULO.Force)
+ RuntimeTripCount = false;
+ else {
+ LLVM_DEBUG(dbgs() << "Won't unroll; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // If we know the trip count, we know the multiple...
+ unsigned BreakoutTrip = 0;
+ if (ULO.TripCount != 0) {
+ BreakoutTrip = ULO.TripCount % ULO.Count;
+ ULO.TripMultiple = 0;
+ } else {
+ // Figure out what multiple to use.
+ BreakoutTrip = ULO.TripMultiple =
+ (unsigned)GreatestCommonDivisor64(ULO.Count, ULO.TripMultiple);
+ }
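+  // Worked example (editor's note, not in the original source): with
+  // TripCount = 10 and Count = 4, BreakoutTrip = 10 % 4 = 2 and TripMultiple
+  // is cleared to 0; with an unknown trip count, Count = 4 and
+  // TripMultiple = 6, both BreakoutTrip and TripMultiple become gcd(4, 6) = 2.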
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLLING loop %" << Header->getName()
+ << " with trip count " << ULO.TripCount << "!\n");
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unrolled loop with "
+ << NV("UnrollCount", ULO.TripCount) << " iterations";
+ });
+ } else if (ULO.PeelCount) {
+ LLVM_DEBUG(dbgs() << "PEELING loop %" << Header->getName()
+ << " with iteration count " << ULO.PeelCount << "!\n");
+ if (ORE)
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Peeled", L->getStartLoc(),
+ L->getHeader())
+ << " peeled loop by " << NV("PeelCount", ULO.PeelCount)
+ << " iterations";
+ });
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unrolled loop by a factor of "
+ << NV("UnrollCount", ULO.Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLLING loop %" << Header->getName() << " by "
+ << ULO.Count);
+ if (ULO.TripMultiple == 0 || BreakoutTrip != ULO.TripMultiple) {
+ LLVM_DEBUG(dbgs() << " with a breakout at trip " << BreakoutTrip);
+ if (ORE)
+ ORE->emit([&]() {
+ return DiagBuilder() << " with a breakout at trip "
+ << NV("BreakoutTrip", BreakoutTrip);
+ });
+ } else if (ULO.TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << ULO.TripMultiple << " trips per branch");
+ if (ORE)
+ ORE->emit([&]() {
+ return DiagBuilder()
+ << " with " << NV("TripMultiple", ULO.TripMultiple)
+ << " trips per branch";
+ });
+ } else if (RuntimeTripCount) {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ if (ORE)
+ ORE->emit(
+ [&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ // We are going to make changes to this loop. SCEV may be keeping cached info
+ // about it, in particular about backedge taken count. The changes we make
+ // are guaranteed to invalidate this information for our loop. It is tempting
+ // to only invalidate the loop being unrolled, but it is incorrect as long as
+ // all exiting branches from all inner loops have impact on the outer loops,
+ // and if something changes inside them then any of outer loops may also
+ // change. When we forget outermost loop, we also forget all contained loops
+ // and this is what we need here.
+ if (SE) {
+ if (ULO.ForgetAllSCEV)
+ SE->forgetAllLoops();
+ else
+ SE->forgetTopmostLoop(L);
+ }
+
+ if (!LatchIsExiting)
+ ++NumUnrolledNotLatch;
+ Optional<bool> ContinueOnTrue = None;
+ BasicBlock *LoopExit = nullptr;
+ if (ExitingBI) {
+ ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
+ LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
+ }
+
+ // For the first iteration of the loop, we should use the precloned values for
+ // PHI nodes. Insert associations now.
+ ValueToValueMapTy LastValueMap;
+ std::vector<PHINode*> OrigPHINode;
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ OrigPHINode.push_back(cast<PHINode>(I));
+ }
+
+ std::vector<BasicBlock *> Headers;
+ std::vector<BasicBlock *> ExitingBlocks;
+ std::vector<BasicBlock *> ExitingSucc;
+ std::vector<BasicBlock *> Latches;
+ Headers.push_back(Header);
+ Latches.push_back(LatchBlock);
+ if (ExitingBI) {
+ ExitingBlocks.push_back(ExitingBI->getParent());
+ ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
+ }
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ std::vector<BasicBlock*> UnrolledLoopBlocks = L->getBlocks();
+
+ // Loop Unrolling might create new loops. While we do preserve LoopInfo, we
+ // might break loop-simplified form for these loops (as they, e.g., would
+ // share the same exit blocks). We'll keep track of loops for which we can
+ // break this so that later we can re-simplify them.
+ SmallSetVector<Loop *, 4> LoopsToSimplify;
+ for (Loop *SubLoop : *L)
+ LoopsToSimplify.insert(SubLoop);
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc()) {
+ auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(ULO.Count);
+ if (NewDIL)
+ I.setDebugLoc(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+
// Identify what noalias metadata is inside the loop: if it is inside the
// loop, the associated metadata must be cloned for each iteration.
SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
identifyNoAliasScopesToClone(L->getBlocks(), LoopLocalNoAliasDeclScopes);
- for (unsigned It = 1; It != ULO.Count; ++It) {
- SmallVector<BasicBlock *, 8> NewBlocks;
- SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
- NewLoops[L] = L;
-
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- ValueToValueMapTy VMap;
- BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
- Header->getParent()->getBasicBlockList().push_back(New);
-
- assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
- "Header should not be in a sub-loop");
- // Tell LI about New.
- const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
- if (OldLoop)
- LoopsToSimplify.insert(NewLoops[OldLoop]);
-
- if (*BB == Header)
- // Loop over all of the PHI nodes in the block, changing them to use
- // the incoming values from the previous block.
- for (PHINode *OrigPHI : OrigPHINode) {
- PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
- Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
- if (Instruction *InValI = dyn_cast<Instruction>(InVal))
- if (It > 1 && L->contains(InValI))
- InVal = LastValueMap[InValI];
- VMap[OrigPHI] = InVal;
- New->getInstList().erase(NewPHI);
- }
-
- // Update our running map of newest clones
- LastValueMap[*BB] = New;
- for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
- VI != VE; ++VI)
- LastValueMap[VI->first] = VI->second;
-
- // Add phi entries for newly created values to all exit blocks.
- for (BasicBlock *Succ : successors(*BB)) {
- if (L->contains(Succ))
- continue;
- for (PHINode &PHI : Succ->phis()) {
- Value *Incoming = PHI.getIncomingValueForBlock(*BB);
- ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
- if (It != LastValueMap.end())
- Incoming = It->second;
- PHI.addIncoming(Incoming, New);
- }
- }
- // Keep track of new headers and latches as we create them, so that
- // we can insert the proper branches later.
- if (*BB == Header)
- Headers.push_back(New);
- if (*BB == LatchBlock)
- Latches.push_back(New);
-
- // Keep track of the exiting block and its successor block contained in
- // the loop for the current iteration.
- if (ExitingBI) {
- if (*BB == ExitingBlocks[0])
- ExitingBlocks.push_back(New);
- if (*BB == ExitingSucc[0])
- ExitingSucc.push_back(New);
- }
-
- NewBlocks.push_back(New);
- UnrolledLoopBlocks.push_back(New);
-
- // Update DomTree: since we just copy the loop body, and each copy has a
- // dedicated entry block (copy of the header block), this header's copy
- // dominates all copied blocks. That means, dominance relations in the
- // copied body are the same as in the original body.
- if (DT) {
- if (*BB == Header)
- DT->addNewBlock(New, Latches[It - 1]);
- else {
- auto BBDomNode = DT->getNode(*BB);
- auto BBIDom = BBDomNode->getIDom();
- BasicBlock *OriginalBBIDom = BBIDom->getBlock();
- DT->addNewBlock(
- New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
- }
- }
- }
-
- // Remap all instructions in the most recent iteration
- remapInstructionsInBlocks(NewBlocks, LastValueMap);
- for (BasicBlock *NewBlock : NewBlocks) {
- for (Instruction &I : *NewBlock) {
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
+ for (unsigned It = 1; It != ULO.Count; ++It) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+ "Header should not be in a sub-loop");
+ // Tell LI about New.
+ const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (OldLoop)
+ LoopsToSimplify.insert(NewLoops[OldLoop]);
+
+ if (*BB == Header)
+ // Loop over all of the PHI nodes in the block, changing them to use
+ // the incoming values from the previous block.
+ for (PHINode *OrigPHI : OrigPHINode) {
+ PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
+ Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+ if (It > 1 && L->contains(InValI))
+ InVal = LastValueMap[InValI];
+ VMap[OrigPHI] = InVal;
+ New->getInstList().erase(NewPHI);
+ }
+
+ // Update our running map of newest clones
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI)
+ LastValueMap[VI->first] = VI->second;
+
+ // Add phi entries for newly created values to all exit blocks.
+ for (BasicBlock *Succ : successors(*BB)) {
+ if (L->contains(Succ))
+ continue;
+ for (PHINode &PHI : Succ->phis()) {
+ Value *Incoming = PHI.getIncomingValueForBlock(*BB);
+ ValueToValueMapTy::iterator It = LastValueMap.find(Incoming);
+ if (It != LastValueMap.end())
+ Incoming = It->second;
+ PHI.addIncoming(Incoming, New);
+ }
+ }
+ // Keep track of new headers and latches as we create them, so that
+ // we can insert the proper branches later.
+ if (*BB == Header)
+ Headers.push_back(New);
+ if (*BB == LatchBlock)
+ Latches.push_back(New);
+
+ // Keep track of the exiting block and its successor block contained in
+ // the loop for the current iteration.
+ if (ExitingBI) {
+ if (*BB == ExitingBlocks[0])
+ ExitingBlocks.push_back(New);
+ if (*BB == ExitingSucc[0])
+ ExitingSucc.push_back(New);
+ }
+
+ NewBlocks.push_back(New);
+ UnrolledLoopBlocks.push_back(New);
+
+ // Update DomTree: since we just copy the loop body, and each copy has a
+ // dedicated entry block (copy of the header block), this header's copy
+ // dominates all copied blocks. That means, dominance relations in the
+ // copied body are the same as in the original body.
+ if (DT) {
+ if (*BB == Header)
+ DT->addNewBlock(New, Latches[It - 1]);
+ else {
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
{
// Identify what other metadata depends on the cloned version. After
@@ -697,282 +697,282 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
cloneAndAdaptNoAliasScopes(LoopLocalNoAliasDeclScopes, NewBlocks,
Header->getContext(), ext);
}
- }
-
- // Loop over the PHI nodes in the original block, setting incoming values.
- for (PHINode *PN : OrigPHINode) {
- if (CompletelyUnroll) {
- PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
- Header->getInstList().erase(PN);
- } else if (ULO.Count > 1) {
- Value *InVal = PN->removeIncomingValue(LatchBlock, false);
- // If this value was defined in the loop, take the value defined by the
- // last iteration of the loop.
- if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
- if (L->contains(InValI))
- InVal = LastValueMap[InVal];
- }
- assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
- PN->addIncoming(InVal, Latches.back());
- }
- }
-
- auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
- bool NeedConditional, Optional<bool> ContinueOnTrue,
- bool IsDestLoopExit) {
- auto *Term = cast<BranchInst>(Src->getTerminator());
- if (NeedConditional) {
- // Update the conditional branch's successor for the following
- // iteration.
- assert(ContinueOnTrue.hasValue() &&
- "Expecting valid ContinueOnTrue when NeedConditional is true");
- Term->setSuccessor(!(*ContinueOnTrue), Dest);
- } else {
- // Remove phi operands at this loop exit
- if (!IsDestLoopExit) {
- BasicBlock *BB = Src;
- for (BasicBlock *Succ : successors(BB)) {
- // Preserve the incoming value from BB if we are jumping to the block
- // in the current loop.
- if (Succ == BlockInLoop)
- continue;
- for (PHINode &Phi : Succ->phis())
- Phi.removeIncomingValue(BB, false);
- }
- }
- // Replace the conditional branch with an unconditional one.
- BranchInst::Create(Dest, Term);
- Term->eraseFromParent();
- }
- };
-
- // Connect latches of the unrolled iterations to the headers of the next
- // iteration. If the latch is also the exiting block, the conditional branch
- // may have to be preserved.
- for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
- // The branch destination.
- unsigned j = (i + 1) % e;
- BasicBlock *Dest = Headers[j];
- bool NeedConditional = LatchIsExiting;
-
- if (LatchIsExiting) {
- if (RuntimeTripCount && j != 0)
- NeedConditional = false;
-
- // For a complete unroll, make the last iteration end with a branch
- // to the exit block.
- if (CompletelyUnroll) {
- if (j == 0)
- Dest = LoopExit;
- // If using trip count upper bound to completely unroll, we need to
- // keep the conditional branch except the last one because the loop
- // may exit after any iteration.
- assert(NeedConditional &&
- "NeedCondition cannot be modified by both complete "
- "unrolling and runtime unrolling");
- NeedConditional =
- (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
- } else if (j != BreakoutTrip &&
- (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
- // If we know the trip count or a multiple of it, we can safely use an
- // unconditional branch for some iterations.
- NeedConditional = false;
- }
- }
-
- setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
- Dest == LoopExit);
- }
-
- if (!LatchIsExiting) {
- // If the latch is not exiting, we may be able to simplify the conditional
- // branches in the unrolled exiting blocks.
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- // The branch destination.
- unsigned j = (i + 1) % e;
- bool NeedConditional = true;
-
- if (RuntimeTripCount && j != 0)
- NeedConditional = false;
-
- if (CompletelyUnroll)
- // We cannot drop the conditional branch for the last condition, as we
- // may have to execute the loop body depending on the condition.
- NeedConditional = j == 0 || ULO.PreserveCondBr;
- else if (j != BreakoutTrip &&
- (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
- // If we know the trip count or a multiple of it, we can safely use an
- // unconditional branch for some iterations.
- NeedConditional = false;
-
-      // Conditional branches from a non-latch exiting block have successors
- // either in the same loop iteration or outside the loop. The branches are
- // already correct.
- if (NeedConditional)
- continue;
- setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
- None, false);
- }
-
- // When completely unrolling, the last latch becomes unreachable.
- if (CompletelyUnroll) {
- BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
- new UnreachableInst(Term->getContext(), Term);
- Term->eraseFromParent();
- }
- }
-
- // Update dominators of blocks we might reach through exits.
- // Immediate dominator of such block might change, because we add more
- // routes which can lead to the exit: we can now reach it from the copied
- // iterations too.
- if (DT && ULO.Count > 1) {
- for (auto *BB : OriginalLoopBlocks) {
- auto *BBDomNode = DT->getNode(BB);
- SmallVector<BasicBlock *, 16> ChildrenToUpdate;
- for (auto *ChildDomNode : BBDomNode->children()) {
- auto *ChildBB = ChildDomNode->getBlock();
- if (!L->contains(ChildBB))
- ChildrenToUpdate.push_back(ChildBB);
- }
- BasicBlock *NewIDom;
- if (ExitingBI && BB == ExitingBlocks[0]) {
- // The latch is special because we emit unconditional branches in
- // some cases where the original loop contained a conditional branch.
- // Since the latch is always at the bottom of the loop, if the latch
- // dominated an exit before unrolling, the new dominator of that exit
- // must also be a latch. Specifically, the dominator is the first
- // latch which ends in a conditional branch, or the last latch if
- // there is no such latch.
-        // For loops exiting from a non-latch exiting block, we limit the
- // branch simplification to single exiting block loops.
- NewIDom = ExitingBlocks.back();
- for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
- Instruction *Term = ExitingBlocks[i]->getTerminator();
- if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
- NewIDom =
- DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
- break;
- }
- }
- } else {
- // The new idom of the block will be the nearest common dominator
- // of all copies of the previous idom. This is equivalent to the
- // nearest common dominator of the previous idom and the first latch,
- // which dominates all copies of the previous idom.
- NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
- }
- for (auto *ChildBB : ChildrenToUpdate)
- DT->changeImmediateDominator(ChildBB, NewIDom);
- }
- }
-
- assert(!DT || !UnrollVerifyDomtree ||
- DT->verify(DominatorTree::VerificationLevel::Fast));
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- // Merge adjacent basic blocks, if possible.
- for (BasicBlock *Latch : Latches) {
- BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
- assert((Term ||
- (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
- "Need a branch as terminator, except when fully unrolling with "
- "unconditional latch");
- if (Term && Term->isUnconditional()) {
- BasicBlock *Dest = Term->getSuccessor(0);
- BasicBlock *Fold = Dest->getUniquePredecessor();
- if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
- // Dest has been folded into Fold. Update our worklists accordingly.
- std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+ }
+
+ // Loop over the PHI nodes in the original block, setting incoming values.
+ for (PHINode *PN : OrigPHINode) {
+ if (CompletelyUnroll) {
+ PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+ Header->getInstList().erase(PN);
+ } else if (ULO.Count > 1) {
+ Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+ // If this value was defined in the loop, take the value defined by the
+ // last iteration of the loop.
+ if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+ if (L->contains(InValI))
+ InVal = LastValueMap[InVal];
+ }
+ assert(Latches.back() == LastValueMap[LatchBlock] && "bad last latch");
+ PN->addIncoming(InVal, Latches.back());
+ }
+ }
+
+ auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
+ bool NeedConditional, Optional<bool> ContinueOnTrue,
+ bool IsDestLoopExit) {
+ auto *Term = cast<BranchInst>(Src->getTerminator());
+ if (NeedConditional) {
+ // Update the conditional branch's successor for the following
+ // iteration.
+ assert(ContinueOnTrue.hasValue() &&
+ "Expecting valid ContinueOnTrue when NeedConditional is true");
+ Term->setSuccessor(!(*ContinueOnTrue), Dest);
+ } else {
+ // Remove phi operands at this loop exit
+ if (!IsDestLoopExit) {
+ BasicBlock *BB = Src;
+ for (BasicBlock *Succ : successors(BB)) {
+ // Preserve the incoming value from BB if we are jumping to the block
+ // in the current loop.
+ if (Succ == BlockInLoop)
+ continue;
+ for (PHINode &Phi : Succ->phis())
+ Phi.removeIncomingValue(BB, false);
+ }
+ }
+ // Replace the conditional branch with an unconditional one.
+ BranchInst::Create(Dest, Term);
+ Term->eraseFromParent();
+ }
+ };
+
+ // Connect latches of the unrolled iterations to the headers of the next
+ // iteration. If the latch is also the exiting block, the conditional branch
+ // may have to be preserved.
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ BasicBlock *Dest = Headers[j];
+ bool NeedConditional = LatchIsExiting;
+
+ if (LatchIsExiting) {
+ if (RuntimeTripCount && j != 0)
+ NeedConditional = false;
+
+ // For a complete unroll, make the last iteration end with a branch
+ // to the exit block.
+ if (CompletelyUnroll) {
+ if (j == 0)
+ Dest = LoopExit;
+ // If using trip count upper bound to completely unroll, we need to
+ // keep the conditional branch except the last one because the loop
+ // may exit after any iteration.
+ assert(NeedConditional &&
+ "NeedCondition cannot be modified by both complete "
+ "unrolling and runtime unrolling");
+ NeedConditional =
+ (ULO.PreserveCondBr && j && !(ULO.PreserveOnlyFirst && i != 0));
+ } else if (j != BreakoutTrip &&
+ (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0)) {
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ NeedConditional = false;
+ }
+ }
+
+ setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
+ Dest == LoopExit);
+ }
+
+ if (!LatchIsExiting) {
+ // If the latch is not exiting, we may be able to simplify the conditional
+ // branches in the unrolled exiting blocks.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ bool NeedConditional = true;
+
+ if (RuntimeTripCount && j != 0)
+ NeedConditional = false;
+
+ if (CompletelyUnroll)
+ // We cannot drop the conditional branch for the last condition, as we
+ // may have to execute the loop body depending on the condition.
+ NeedConditional = j == 0 || ULO.PreserveCondBr;
+ else if (j != BreakoutTrip &&
+ (ULO.TripMultiple == 0 || j % ULO.TripMultiple != 0))
+ // If we know the trip count or a multiple of it, we can safely use an
+ // unconditional branch for some iterations.
+ NeedConditional = false;
+
+      // Conditional branches from a non-latch exiting block have successors
+ // either in the same loop iteration or outside the loop. The branches are
+ // already correct.
+ if (NeedConditional)
+ continue;
+ setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
+ None, false);
+ }
+
+ // When completely unrolling, the last latch becomes unreachable.
+ if (CompletelyUnroll) {
+ BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
+ new UnreachableInst(Term->getContext(), Term);
+ Term->eraseFromParent();
+ }
+ }
+
+ // Update dominators of blocks we might reach through exits.
+ // Immediate dominator of such block might change, because we add more
+ // routes which can lead to the exit: we can now reach it from the copied
+ // iterations too.
+ if (DT && ULO.Count > 1) {
+ for (auto *BB : OriginalLoopBlocks) {
+ auto *BBDomNode = DT->getNode(BB);
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ for (auto *ChildDomNode : BBDomNode->children()) {
+ auto *ChildBB = ChildDomNode->getBlock();
+ if (!L->contains(ChildBB))
+ ChildrenToUpdate.push_back(ChildBB);
+ }
+ BasicBlock *NewIDom;
+ if (ExitingBI && BB == ExitingBlocks[0]) {
+ // The latch is special because we emit unconditional branches in
+ // some cases where the original loop contained a conditional branch.
+ // Since the latch is always at the bottom of the loop, if the latch
+ // dominated an exit before unrolling, the new dominator of that exit
+ // must also be a latch. Specifically, the dominator is the first
+ // latch which ends in a conditional branch, or the last latch if
+ // there is no such latch.
+        // For loops exiting from a non-latch exiting block, we limit the
+ // branch simplification to single exiting block loops.
+ NewIDom = ExitingBlocks.back();
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ Instruction *Term = ExitingBlocks[i]->getTerminator();
+ if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
+ NewIDom =
+ DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
+ break;
+ }
+ }
+ } else {
+ // The new idom of the block will be the nearest common dominator
+ // of all copies of the previous idom. This is equivalent to the
+ // nearest common dominator of the previous idom and the first latch,
+ // which dominates all copies of the previous idom.
+ NewIDom = DT->findNearestCommonDominator(BB, LatchBlock);
+ }
+ for (auto *ChildBB : ChildrenToUpdate)
+ DT->changeImmediateDominator(ChildBB, NewIDom);
+ }
+ }
+
+ assert(!DT || !UnrollVerifyDomtree ||
+ DT->verify(DominatorTree::VerificationLevel::Fast));
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ // Merge adjacent basic blocks, if possible.
+ for (BasicBlock *Latch : Latches) {
+ BranchInst *Term = dyn_cast<BranchInst>(Latch->getTerminator());
+ assert((Term ||
+ (CompletelyUnroll && !LatchIsExiting && Latch == Latches.back())) &&
+ "Need a branch as terminator, except when fully unrolling with "
+ "unconditional latch");
+ if (Term && Term->isUnconditional()) {
+ BasicBlock *Dest = Term->getSuccessor(0);
+ BasicBlock *Fold = Dest->getUniquePredecessor();
+ if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
+ // Dest has been folded into Fold. Update our worklists accordingly.
+ std::replace(Latches.begin(), Latches.end(), Dest, Fold);
llvm::erase_value(UnrolledLoopBlocks, Dest);
- }
- }
- }
- // Apply updates to the DomTree.
- DT = &DTU.getDomTree();
-
- // At this point, the code is well formed. We now simplify the unrolled loop,
- // doing constant propagation and dead code elimination as we go.
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
- SE, DT, AC, TTI);
-
- NumCompletelyUnrolled += CompletelyUnroll;
- ++NumUnrolled;
-
- Loop *OuterL = L->getParentLoop();
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
- // After complete unrolling most of the blocks should be contained in OuterL.
- // However, some of them might happen to be out of OuterL (e.g. if they
- // precede a loop exit). In this case we might need to insert PHI nodes in
- // order to preserve LCSSA form.
- // We don't need to check this if we already know that we need to fix LCSSA
- // form.
- // TODO: For now we just recompute LCSSA for the outer loop in this case, but
- // it should be possible to fix it in-place.
- if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
- NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
-
- // If we have a pass and a DominatorTree we should re-simplify impacted loops
- // to ensure subsequent analyses can rely on this form. We want to simplify
- // at least one layer outside of the loop that was unrolled so that any
- // changes to the parent loop exposed by the unrolling are considered.
- if (DT) {
- if (OuterL) {
- // OuterL includes all loops for which we can break loop-simplify, so
- // it's sufficient to simplify only it (it'll recursively simplify inner
- // loops too).
- if (NeedToFixLCSSA) {
- // LCSSA must be performed on the outermost affected loop. The unrolled
- // loop's last loop latch is guaranteed to be in the outermost loop
- // after LoopInfo's been updated by LoopInfo::erase.
- Loop *LatchLoop = LI->getLoopFor(Latches.back());
- Loop *FixLCSSALoop = OuterL;
- if (!FixLCSSALoop->contains(LatchLoop))
- while (FixLCSSALoop->getParentLoop() != LatchLoop)
- FixLCSSALoop = FixLCSSALoop->getParentLoop();
-
- formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
- } else if (PreserveLCSSA) {
- assert(OuterL->isLCSSAForm(*DT) &&
- "Loops should be in LCSSA form after loop-unroll.");
- }
-
- // TODO: That potentially might be compile-time expensive. We should try
- // to fix the loop-simplified form incrementally.
- simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
- } else {
- // Simplify loops for which we might've broken loop-simplify form.
- for (Loop *SubLoop : LoopsToSimplify)
- simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
- }
- }
-
- return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
- : LoopUnrollResult::PartiallyUnrolled;
-}
-
-/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
-/// node with the given name (for example, "llvm.loop.unroll.count"). If no
-/// such metadata node exists, then nullptr is returned.
-MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (!MD)
- continue;
-
- MDString *S = dyn_cast<MDString>(MD->getOperand(0));
- if (!S)
- continue;
-
- if (Name.equals(S->getString()))
- return MD;
- }
- return nullptr;
-}
+ }
+ }
+ }
+ // Apply updates to the DomTree.
+ DT = &DTU.getDomTree();
+
+ // At this point, the code is well formed. We now simplify the unrolled loop,
+ // doing constant propagation and dead code elimination as we go.
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
+ SE, DT, AC, TTI);
+
+ NumCompletelyUnrolled += CompletelyUnroll;
+ ++NumUnrolled;
+
+ Loop *OuterL = L->getParentLoop();
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+ // After complete unrolling most of the blocks should be contained in OuterL.
+ // However, some of them might happen to be out of OuterL (e.g. if they
+ // precede a loop exit). In this case we might need to insert PHI nodes in
+ // order to preserve LCSSA form.
+ // We don't need to check this if we already know that we need to fix LCSSA
+ // form.
+ // TODO: For now we just recompute LCSSA for the outer loop in this case, but
+ // it should be possible to fix it in-place.
+ if (PreserveLCSSA && OuterL && CompletelyUnroll && !NeedToFixLCSSA)
+ NeedToFixLCSSA |= ::needToInsertPhisForLCSSA(OuterL, UnrolledLoopBlocks, LI);
+
+ // If we have a pass and a DominatorTree we should re-simplify impacted loops
+ // to ensure subsequent analyses can rely on this form. We want to simplify
+ // at least one layer outside of the loop that was unrolled so that any
+ // changes to the parent loop exposed by the unrolling are considered.
+ if (DT) {
+ if (OuterL) {
+ // OuterL includes all loops for which we can break loop-simplify, so
+ // it's sufficient to simplify only it (it'll recursively simplify inner
+ // loops too).
+ if (NeedToFixLCSSA) {
+ // LCSSA must be performed on the outermost affected loop. The unrolled
+ // loop's last loop latch is guaranteed to be in the outermost loop
+ // after LoopInfo's been updated by LoopInfo::erase.
+ Loop *LatchLoop = LI->getLoopFor(Latches.back());
+ Loop *FixLCSSALoop = OuterL;
+ if (!FixLCSSALoop->contains(LatchLoop))
+ while (FixLCSSALoop->getParentLoop() != LatchLoop)
+ FixLCSSALoop = FixLCSSALoop->getParentLoop();
+
+ formLCSSARecursively(*FixLCSSALoop, *DT, LI, SE);
+ } else if (PreserveLCSSA) {
+ assert(OuterL->isLCSSAForm(*DT) &&
+ "Loops should be in LCSSA form after loop-unroll.");
+ }
+
+ // TODO: That potentially might be compile-time expensive. We should try
+ // to fix the loop-simplified form incrementally.
+ simplifyLoop(OuterL, DT, LI, SE, AC, nullptr, PreserveLCSSA);
+ } else {
+ // Simplify loops for which we might've broken loop-simplify form.
+ for (Loop *SubLoop : LoopsToSimplify)
+ simplifyLoop(SubLoop, DT, LI, SE, AC, nullptr, PreserveLCSSA);
+ }
+ }
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+/// Given an llvm.loop loop id metadata node, returns the loop hint metadata
+/// node with the given name (for example, "llvm.loop.unroll.count"). If no
+/// such metadata node exists, then nullptr is returned.
+MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (!MD)
+ continue;
+
+ MDString *S = dyn_cast<MDString>(MD->getOperand(0));
+ if (!S)
+ continue;
+
+ if (Name.equals(S->getString()))
+ return MD;
+ }
+ return nullptr;
+}
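+//
+// Illustrative usage (editor's sketch, not part of the original source): for
+// loop metadata such as
+//   !llvm.loop !0
+//   !0 = distinct !{!0, !1}
+//   !1 = !{!"llvm.loop.unroll.count", i32 4}
+// GetUnrollMetadata(!0, "llvm.loop.unroll.count") returns !1, and a caller can
+// read the count along these lines:
+//   if (MDNode *MD = GetUnrollMetadata(LoopID, "llvm.loop.unroll.count"))
+//     Count = mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue();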
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index 6dd14c591e..6e32a2b865 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -1,985 +1,985 @@
-//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements loop unroll and jam as a routine, much like
-// LoopUnroll.cpp implements loop unroll.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTree.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <assert.h>
-#include <memory>
-#include <type_traits>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll-and-jam"
-
-STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
-STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
-
-typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
-
-// Partition blocks in an outer/inner loop pair into blocks before and after
-// the loop
-static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
- BasicBlockSet &AftBlocks, DominatorTree &DT) {
- Loop *SubLoop = L.getSubLoops()[0];
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
-
- for (BasicBlock *BB : L.blocks()) {
- if (!SubLoop->contains(BB)) {
- if (DT.dominates(SubLoopLatch, BB))
- AftBlocks.insert(BB);
- else
- ForeBlocks.insert(BB);
- }
- }
-
- // Check that all blocks in ForeBlocks together dominate the subloop
- // TODO: This might ideally be done better with a dominator/postdominators.
- BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
- for (BasicBlock *BB : ForeBlocks) {
- if (BB == SubLoopPreHeader)
- continue;
- Instruction *TI = BB->getTerminator();
- for (BasicBlock *Succ : successors(TI))
- if (!ForeBlocks.count(Succ))
- return false;
- }
-
- return true;
-}
-
-/// Partition blocks in a loop nest into blocks before and after each inner
-/// loop.
-static bool partitionOuterLoopBlocks(
- Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
- DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
- DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
- JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
-
- for (Loop *L : Root.getLoopsInPreorder()) {
- if (L == &JamLoop)
- break;
-
- if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
- return false;
- }
-
- return true;
-}
-
-// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
-// than 2 levels loop.
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
- BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks,
- DominatorTree *DT) {
- SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
- return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
-}
-
-// Looks at the phi nodes in Header for values coming from Latch. For these
-// instructions and all their operands calls Visit on them, keeping going for
-// all the operands in AftBlocks. Returns false if Visit returns false,
-// otherwise returns true. This is used to process the instructions in the
-// Aft blocks that need to be moved before the subloop. It is used in two
-// places. One to check that the required set of instructions can be moved
-// before the loop. Then to collect the instructions to actually move in
-// moveHeaderPhiOperandsToForeBlocks.
-template <typename T>
-static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
- BasicBlockSet &AftBlocks, T Visit) {
- SmallVector<Instruction *, 8> Worklist;
- for (auto &Phi : Header->phis()) {
- Value *V = Phi.getIncomingValueForBlock(Latch);
- if (Instruction *I = dyn_cast<Instruction>(V))
- Worklist.push_back(I);
- }
-
- while (!Worklist.empty()) {
+//===-- LoopUnrollAndJam.cpp - Loop unrolling utilities -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements loop unroll and jam as a routine, much like
+// LoopUnroll.cpp implements loop unroll.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <assert.h>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll-and-jam"
+
+STATISTIC(NumUnrolledAndJammed, "Number of loops unroll and jammed");
+STATISTIC(NumCompletelyUnrolledAndJammed, "Number of loops unroll and jammed");
+
+typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
+
+// Partition blocks in an outer/inner loop pair into blocks before and after
+// the loop
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &AftBlocks, DominatorTree &DT) {
+ Loop *SubLoop = L.getSubLoops()[0];
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
+
+ for (BasicBlock *BB : L.blocks()) {
+ if (!SubLoop->contains(BB)) {
+ if (DT.dominates(SubLoopLatch, BB))
+ AftBlocks.insert(BB);
+ else
+ ForeBlocks.insert(BB);
+ }
+ }
+
+ // Check that all blocks in ForeBlocks together dominate the subloop
+ // TODO: This might ideally be done better with a dominator/postdominators.
+ BasicBlock *SubLoopPreHeader = SubLoop->getLoopPreheader();
+ for (BasicBlock *BB : ForeBlocks) {
+ if (BB == SubLoopPreHeader)
+ continue;
+ Instruction *TI = BB->getTerminator();
+ for (BasicBlock *Succ : successors(TI))
+ if (!ForeBlocks.count(Succ))
+ return false;
+ }
+
+ return true;
+}
+
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+ Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+ DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+ JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+ for (Loop *L : Root.getLoopsInPreorder()) {
+ if (L == &JamLoop)
+ break;
+
+ if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+ return false;
+ }
+
+ return true;
+}
+
+// TODO Remove when UnrollAndJamLoop changed to support unroll and jamming more
+// than 2 levels loop.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+ return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
+// Looks at the phi nodes in Header for values coming from Latch. For these
+// instructions and all their operands calls Visit on them, keeping going for
+// all the operands in AftBlocks. Returns false if Visit returns false,
+// otherwise returns true. This is used to process the instructions in the
+// Aft blocks that need to be moved before the subloop. It is used in two
+// places. One to check that the required set of instructions can be moved
+// before the loop. Then to collect the instructions to actually move in
+// moveHeaderPhiOperandsToForeBlocks.
+template <typename T>
+static bool processHeaderPhiOperands(BasicBlock *Header, BasicBlock *Latch,
+ BasicBlockSet &AftBlocks, T Visit) {
+ SmallVector<Instruction *, 8> Worklist;
+ for (auto &Phi : Header->phis()) {
+ Value *V = Phi.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ Worklist.push_back(I);
+ }
+
+ while (!Worklist.empty()) {
Instruction *I = Worklist.pop_back_val();
- if (!Visit(I))
- return false;
-
- if (AftBlocks.count(I->getParent()))
- for (auto &U : I->operands())
- if (Instruction *II = dyn_cast<Instruction>(U))
- Worklist.push_back(II);
- }
-
- return true;
-}
-
-// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
-static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
- BasicBlock *Latch,
- Instruction *InsertLoc,
- BasicBlockSet &AftBlocks) {
- // We need to ensure we move the instructions in the correct order,
- // starting with the earliest required instruction and moving forward.
- std::vector<Instruction *> Visited;
- processHeaderPhiOperands(Header, Latch, AftBlocks,
- [&Visited, &AftBlocks](Instruction *I) {
- if (AftBlocks.count(I->getParent()))
- Visited.push_back(I);
- return true;
- });
-
- // Move all instructions in program order to before the InsertLoc
- BasicBlock *InsertLocBB = InsertLoc->getParent();
- for (Instruction *I : reverse(Visited)) {
- if (I->getParent() != InsertLocBB)
- I->moveBefore(InsertLoc);
- }
-}
-
-/*
- This method performs Unroll and Jam. For a simple loop like:
- for (i = ..)
- Fore(i)
- for (j = ..)
- SubLoop(i, j)
- Aft(i)
-
- Instead of doing normal inner or outer unrolling, we do:
- for (i = .., i+=2)
- Fore(i)
- Fore(i+1)
- for (j = ..)
- SubLoop(i, j)
- SubLoop(i+1, j)
- Aft(i)
- Aft(i+1)
-
- So the outer loop is essetially unrolled and then the inner loops are fused
- ("jammed") together into a single loop. This can increase speed when there
- are loads in SubLoop that are invariant to i, as they become shared between
- the now jammed inner loops.
-
- We do this by spliting the blocks in the loop into Fore, Subloop and Aft.
- Fore blocks are those before the inner loop, Aft are those after. Normal
- Unroll code is used to copy each of these sets of blocks and the results are
- combined together into the final form above.
-
- isSafeToUnrollAndJam should be used prior to calling this to make sure the
- unrolling will be valid. Checking profitablility is also advisable.
-
- If EpilogueLoop is non-null, it receives the epilogue loop (if it was
- necessary to create one and not fully unrolled).
-*/
-LoopUnrollResult
-llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
- unsigned TripMultiple, bool UnrollRemainder,
- LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC, const TargetTransformInfo *TTI,
- OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
-
- // When we enter here we should have already checked that it is safe
- BasicBlock *Header = L->getHeader();
- assert(Header && "No header.");
- assert(L->getSubLoops().size() == 1);
- Loop *SubLoop = *L->begin();
-
- // Don't enter the unroll code if there is nothing to do.
- if (TripCount == 0 && Count < 2) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
- return LoopUnrollResult::Unmodified;
- }
-
- assert(Count > 0);
- assert(TripMultiple > 0);
- assert(TripCount == 0 || TripCount % TripMultiple == 0);
-
- // Are we eliminating the loop control altogether?
- bool CompletelyUnroll = (Count == TripCount);
-
- // We use the runtime remainder in cases where we don't know trip multiple
- if (TripMultiple == 1 || TripMultiple % Count != 0) {
- if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
- /*UseEpilogRemainder*/ true,
- UnrollRemainder, /*ForgetAllSCEV*/ false,
- LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
- "generated when assuming runtime trip count\n");
- return LoopUnrollResult::Unmodified;
- }
- }
-
- // Notify ScalarEvolution that the loop will be substantially changed,
- // if not outright eliminated.
- if (SE) {
- SE->forgetLoop(L);
- SE->forgetLoop(SubLoop);
- }
-
- using namespace ore;
- // Report the unrolling decision.
- if (CompletelyUnroll) {
- LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
- << Header->getName() << " with trip count " << TripCount
- << "!\n");
- ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
- L->getHeader())
- << "completely unroll and jammed loop with "
- << NV("UnrollCount", TripCount) << " iterations");
- } else {
- auto DiagBuilder = [&]() {
- OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
- L->getHeader());
- return Diag << "unroll and jammed loop by a factor of "
- << NV("UnrollCount", Count);
- };
-
- LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
- << " by " << Count);
- if (TripMultiple != 1) {
- LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
- ORE->emit([&]() {
- return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
- << " trips per branch";
- });
- } else {
- LLVM_DEBUG(dbgs() << " with run-time trip count");
- ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
- }
- LLVM_DEBUG(dbgs() << "!\n");
- }
-
- BasicBlock *Preheader = L->getLoopPreheader();
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(Preheader && "No preheader");
- assert(LatchBlock && "No latch block");
- BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
- assert(BI && !BI->isUnconditional());
- bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
- BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
- bool SubLoopContinueOnTrue = SubLoop->contains(
- SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
-
- // Partition blocks in an outer/inner loop pair into blocks before and after
- // the loop
- BasicBlockSet SubLoopBlocks;
- BasicBlockSet ForeBlocks;
- BasicBlockSet AftBlocks;
- partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
- DT);
-
- // We keep track of the entering/first and exiting/last block of each of
- // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
- // blocks easier.
- std::vector<BasicBlock *> ForeBlocksFirst;
- std::vector<BasicBlock *> ForeBlocksLast;
- std::vector<BasicBlock *> SubLoopBlocksFirst;
- std::vector<BasicBlock *> SubLoopBlocksLast;
- std::vector<BasicBlock *> AftBlocksFirst;
- std::vector<BasicBlock *> AftBlocksLast;
- ForeBlocksFirst.push_back(Header);
- ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
- SubLoopBlocksFirst.push_back(SubLoop->getHeader());
- SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
- AftBlocksFirst.push_back(SubLoop->getExitBlock());
- AftBlocksLast.push_back(L->getExitingBlock());
- // Maps Blocks[0] -> Blocks[It]
- ValueToValueMapTy LastValueMap;
-
- // Move any instructions from fore phi operands from AftBlocks into Fore.
- moveHeaderPhiOperandsToForeBlocks(
- Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
-
- // The current on-the-fly SSA update requires blocks to be processed in
- // reverse postorder so that LastValueMap contains the correct value at each
- // exit.
- LoopBlocksDFS DFS(L);
- DFS.perform(LI);
- // Stash the DFS iterators before adding blocks to the loop.
- LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
-
- if (Header->getParent()->isDebugInfoForProfiling())
- for (BasicBlock *BB : L->getBlocks())
- for (Instruction &I : *BB)
- if (!isa<DbgInfoIntrinsic>(&I))
- if (const DILocation *DIL = I.getDebugLoc()) {
- auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
- if (NewDIL)
- I.setDebugLoc(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
-
- // Copy all blocks
- for (unsigned It = 1; It != Count; ++It) {
- SmallVector<BasicBlock *, 8> NewBlocks;
- // Maps Blocks[It] -> Blocks[It-1]
- DenseMap<Value *, Value *> PrevItValueMap;
- SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
- NewLoops[L] = L;
- NewLoops[SubLoop] = SubLoop;
-
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- ValueToValueMapTy VMap;
- BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
- Header->getParent()->getBasicBlockList().push_back(New);
-
- // Tell LI about New.
- addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
-
- if (ForeBlocks.count(*BB)) {
- if (*BB == ForeBlocksFirst[0])
- ForeBlocksFirst.push_back(New);
- if (*BB == ForeBlocksLast[0])
- ForeBlocksLast.push_back(New);
- } else if (SubLoopBlocks.count(*BB)) {
- if (*BB == SubLoopBlocksFirst[0])
- SubLoopBlocksFirst.push_back(New);
- if (*BB == SubLoopBlocksLast[0])
- SubLoopBlocksLast.push_back(New);
- } else if (AftBlocks.count(*BB)) {
- if (*BB == AftBlocksFirst[0])
- AftBlocksFirst.push_back(New);
- if (*BB == AftBlocksLast[0])
- AftBlocksLast.push_back(New);
- } else {
- llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
- }
-
- // Update our running maps of newest clones
- PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
- LastValueMap[*BB] = New;
- for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
- VI != VE; ++VI) {
- PrevItValueMap[VI->second] =
- const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
- LastValueMap[VI->first] = VI->second;
- }
-
- NewBlocks.push_back(New);
-
- // Update DomTree:
- if (*BB == ForeBlocksFirst[0])
- DT->addNewBlock(New, ForeBlocksLast[It - 1]);
- else if (*BB == SubLoopBlocksFirst[0])
- DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
- else if (*BB == AftBlocksFirst[0])
- DT->addNewBlock(New, AftBlocksLast[It - 1]);
- else {
- // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
- // structure.
- auto BBDomNode = DT->getNode(*BB);
- auto BBIDom = BBDomNode->getIDom();
- BasicBlock *OriginalBBIDom = BBIDom->getBlock();
- assert(OriginalBBIDom);
- assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
- DT->addNewBlock(
- New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
- }
- }
-
- // Remap all instructions in the most recent iteration
- remapInstructionsInBlocks(NewBlocks, LastValueMap);
- for (BasicBlock *NewBlock : NewBlocks) {
- for (Instruction &I : *NewBlock) {
- if (auto *II = dyn_cast<IntrinsicInst>(&I))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- }
- }
-
- // Alter the ForeBlocks phi's, pointing them at the latest version of the
- // value from the previous iteration's phis
- for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
- Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
- assert(OldValue && "should have incoming edge from Aft[It]");
- Value *NewValue = OldValue;
- if (Value *PrevValue = PrevItValueMap[OldValue])
- NewValue = PrevValue;
-
- assert(Phi.getNumOperands() == 2);
- Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
- Phi.setIncomingValue(0, NewValue);
- Phi.removeIncomingValue(1);
- }
- }
-
- // Now that all the basic blocks for the unrolled iterations are in place,
- // finish up connecting the blocks and phi nodes. At this point LastValueMap
- // is the last unrolled iterations values.
-
- // Update Phis in BB from OldBB to point to NewBB and use the latest value
- // from LastValueMap
- auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
- BasicBlock *NewBB,
- ValueToValueMapTy &LastValueMap) {
- for (PHINode &Phi : BB->phis()) {
- for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
- if (Phi.getIncomingBlock(b) == OldBB) {
- Value *OldValue = Phi.getIncomingValue(b);
- if (Value *LastValue = LastValueMap[OldValue])
- Phi.setIncomingValue(b, LastValue);
- Phi.setIncomingBlock(b, NewBB);
- break;
- }
- }
- }
- };
- // Move all the phis from Src into Dest
- auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
- Instruction *insertPoint = Dest->getFirstNonPHI();
- while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
- Phi->moveBefore(insertPoint);
- };
-
- // Update the PHI values outside the loop to point to the last block
- updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
- LastValueMap);
-
- // Update ForeBlocks successors and phi nodes
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
-
- if (CompletelyUnroll) {
- while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
- Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
- Phi->getParent()->getInstList().erase(Phi);
- }
- } else {
- // Update the PHI values to point to the last aft block
- updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
- AftBlocksLast.back(), LastValueMap);
- }
-
- for (unsigned It = 1; It != Count; It++) {
- // Remap ForeBlock successors from previous iteration to this
- BranchInst *ForeTerm =
- cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
- assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
- ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
- }
-
- // Subloop successors and phis
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
- SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
- SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
+ if (!Visit(I))
+ return false;
+
+ if (AftBlocks.count(I->getParent()))
+ for (auto &U : I->operands())
+ if (Instruction *II = dyn_cast<Instruction>(U))
+ Worklist.push_back(II);
+ }
+
+ return true;
+}
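Editor's note: a minimal sketch of how processHeaderPhiOperands is typically driven (the lambda below is illustrative only, not the exact predicate used elsewhere in this file): the visitor rejects anything in the Aft blocks that cannot safely be hoisted above the subloop.

  // Illustrative check: can every Aft-block instruction feeding the header
  // phis be moved before the subloop?
  bool CanMove = processHeaderPhiOperands(
      Header, Latch, AftBlocks, [&AftBlocks](Instruction *I) {
        if (AftBlocks.count(I->getParent()) &&
            (I->mayHaveSideEffects() || I->mayReadOrWriteMemory()))
          return false; // this instruction pins the phi operand in Aft
        return true;    // keep walking its operands
      });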
+
+// Move the phi operands of Header from Latch out of AftBlocks to InsertLoc.
+static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
+ BasicBlock *Latch,
+ Instruction *InsertLoc,
+ BasicBlockSet &AftBlocks) {
+ // We need to ensure we move the instructions in the correct order,
+ // starting with the earliest required instruction and moving forward.
+ std::vector<Instruction *> Visited;
+ processHeaderPhiOperands(Header, Latch, AftBlocks,
+ [&Visited, &AftBlocks](Instruction *I) {
+ if (AftBlocks.count(I->getParent()))
+ Visited.push_back(I);
+ return true;
+ });
+
+ // Move all instructions in program order to before the InsertLoc
+ BasicBlock *InsertLocBB = InsertLoc->getParent();
+ for (Instruction *I : reverse(Visited)) {
+ if (I->getParent() != InsertLocBB)
+ I->moveBefore(InsertLoc);
+ }
+}
+
+/*
+ This method performs Unroll and Jam. For a simple loop like:
+ for (i = ..)
+ Fore(i)
+ for (j = ..)
+ SubLoop(i, j)
+ Aft(i)
+
+ Instead of doing normal inner or outer unrolling, we do:
+ for (i = .., i+=2)
+ Fore(i)
+ Fore(i+1)
+ for (j = ..)
+ SubLoop(i, j)
+ SubLoop(i+1, j)
+ Aft(i)
+ Aft(i+1)
+
+ So the outer loop is essentially unrolled and then the inner loops are fused
+ ("jammed") together into a single loop. This can increase speed when there
+ are loads in SubLoop that are invariant to i, as they become shared between
+ the now jammed inner loops.
+
+ We do this by splitting the blocks in the loop into Fore, Subloop and Aft.
+ Fore blocks are those before the inner loop, Aft are those after. Normal
+ Unroll code is used to copy each of these sets of blocks and the results are
+ combined together into the final form above.
+
+ isSafeToUnrollAndJam should be used prior to calling this to make sure the
+ unrolling will be valid. Checking profitability is also advisable.
+
+ If EpilogueLoop is non-null, it receives the epilogue loop (if it was
+ necessary to create one and not fully unrolled).
+*/
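Editor's note: a concrete, hypothetical instance of the load-sharing effect described above, assuming plain arrays A, B and C and an even N:

  // Hypothetical C-level illustration, unroll-and-jam by 2:
  //   for (i = 0; i < N; i += 2)
  //     for (j = 0; j < M; j++) {
  //       C[i]   += A[j] * B[i][j];
  //       C[i+1] += A[j] * B[i+1][j];  // A[j] does not depend on i, so the
  //     }                              // jammed body loads it once per j and
  //                                    // shares it between iterations i and i+1.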
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
+
+ // When we enter here we should have already checked that it is safe
+ BasicBlock *Header = L->getHeader();
+ assert(Header && "No header.");
+ assert(L->getSubLoops().size() == 1);
+ Loop *SubLoop = *L->begin();
+
+ // Don't enter the unroll code if there is nothing to do.
+ if (TripCount == 0 && Count < 2) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; almost nothing to do\n");
+ return LoopUnrollResult::Unmodified;
+ }
+
+ assert(Count > 0);
+ assert(TripMultiple > 0);
+ assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+ // Are we eliminating the loop control altogether?
+ bool CompletelyUnroll = (Count == TripCount);
+
+ // We use the runtime remainder in cases where we don't know the trip multiple
+ if (TripMultiple == 1 || TripMultiple % Count != 0) {
+ if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
+ /*UseEpilogRemainder*/ true,
+ UnrollRemainder, /*ForgetAllSCEV*/ false,
+ LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
+ "generated when assuming runtime trip count\n");
+ return LoopUnrollResult::Unmodified;
+ }
+ }
+
+ // Notify ScalarEvolution that the loop will be substantially changed,
+ // if not outright eliminated.
+ if (SE) {
+ SE->forgetLoop(L);
+ SE->forgetLoop(SubLoop);
+ }
+
+ using namespace ore;
+ // Report the unrolling decision.
+ if (CompletelyUnroll) {
+ LLVM_DEBUG(dbgs() << "COMPLETELY UNROLL AND JAMMING loop %"
+ << Header->getName() << " with trip count " << TripCount
+ << "!\n");
+ ORE->emit(OptimizationRemark(DEBUG_TYPE, "FullyUnrolled", L->getStartLoc(),
+ L->getHeader())
+ << "completely unroll and jammed loop with "
+ << NV("UnrollCount", TripCount) << " iterations");
+ } else {
+ auto DiagBuilder = [&]() {
+ OptimizationRemark Diag(DEBUG_TYPE, "PartialUnrolled", L->getStartLoc(),
+ L->getHeader());
+ return Diag << "unroll and jammed loop by a factor of "
+ << NV("UnrollCount", Count);
+ };
+
+ LLVM_DEBUG(dbgs() << "UNROLL AND JAMMING loop %" << Header->getName()
+ << " by " << Count);
+ if (TripMultiple != 1) {
+ LLVM_DEBUG(dbgs() << " with " << TripMultiple << " trips per branch");
+ ORE->emit([&]() {
+ return DiagBuilder() << " with " << NV("TripMultiple", TripMultiple)
+ << " trips per branch";
+ });
+ } else {
+ LLVM_DEBUG(dbgs() << " with run-time trip count");
+ ORE->emit([&]() { return DiagBuilder() << " with run-time trip count"; });
+ }
+ LLVM_DEBUG(dbgs() << "!\n");
+ }
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(Preheader && "No preheader");
+ assert(LatchBlock && "No latch block");
+ BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+ assert(BI && !BI->isUnconditional());
+ bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+ BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+ bool SubLoopContinueOnTrue = SubLoop->contains(
+ SubLoop->getLoopLatch()->getTerminator()->getSuccessor(0));
+
+ // Partition blocks in an outer/inner loop pair into blocks before and after
+ // the loop
+ BasicBlockSet SubLoopBlocks;
+ BasicBlockSet ForeBlocks;
+ BasicBlockSet AftBlocks;
+ partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks, AftBlocks,
+ DT);
+
+ // We keep track of the entering/first and exiting/last block of each of
+ // Fore/SubLoop/Aft in each iteration. This helps make the stapling up of
+ // blocks easier.
+ std::vector<BasicBlock *> ForeBlocksFirst;
+ std::vector<BasicBlock *> ForeBlocksLast;
+ std::vector<BasicBlock *> SubLoopBlocksFirst;
+ std::vector<BasicBlock *> SubLoopBlocksLast;
+ std::vector<BasicBlock *> AftBlocksFirst;
+ std::vector<BasicBlock *> AftBlocksLast;
+ ForeBlocksFirst.push_back(Header);
+ ForeBlocksLast.push_back(SubLoop->getLoopPreheader());
+ SubLoopBlocksFirst.push_back(SubLoop->getHeader());
+ SubLoopBlocksLast.push_back(SubLoop->getExitingBlock());
+ AftBlocksFirst.push_back(SubLoop->getExitBlock());
+ AftBlocksLast.push_back(L->getExitingBlock());
+ // Maps Blocks[0] -> Blocks[It]
+ ValueToValueMapTy LastValueMap;
+
+ // Move any instructions the fore phi operands depend on from AftBlocks into Fore.
+ moveHeaderPhiOperandsToForeBlocks(
+ Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
+
+ // The current on-the-fly SSA update requires blocks to be processed in
+ // reverse postorder so that LastValueMap contains the correct value at each
+ // exit.
+ LoopBlocksDFS DFS(L);
+ DFS.perform(LI);
+ // Stash the DFS iterators before adding blocks to the loop.
+ LoopBlocksDFS::RPOIterator BlockBegin = DFS.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = DFS.endRPO();
+
+ if (Header->getParent()->isDebugInfoForProfiling())
+ for (BasicBlock *BB : L->getBlocks())
+ for (Instruction &I : *BB)
+ if (!isa<DbgInfoIntrinsic>(&I))
+ if (const DILocation *DIL = I.getDebugLoc()) {
+ auto NewDIL = DIL->cloneByMultiplyingDuplicationFactor(Count);
+ if (NewDIL)
+ I.setDebugLoc(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+
+ // Copy all blocks
+ for (unsigned It = 1; It != Count; ++It) {
+ SmallVector<BasicBlock *, 8> NewBlocks;
+ // Maps Blocks[It] -> Blocks[It-1]
+ DenseMap<Value *, Value *> PrevItValueMap;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+ NewLoops[SubLoop] = SubLoop;
+
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ ValueToValueMapTy VMap;
+ BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
+ Header->getParent()->getBasicBlockList().push_back(New);
+
+ // Tell LI about New.
+ addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+
+ if (ForeBlocks.count(*BB)) {
+ if (*BB == ForeBlocksFirst[0])
+ ForeBlocksFirst.push_back(New);
+ if (*BB == ForeBlocksLast[0])
+ ForeBlocksLast.push_back(New);
+ } else if (SubLoopBlocks.count(*BB)) {
+ if (*BB == SubLoopBlocksFirst[0])
+ SubLoopBlocksFirst.push_back(New);
+ if (*BB == SubLoopBlocksLast[0])
+ SubLoopBlocksLast.push_back(New);
+ } else if (AftBlocks.count(*BB)) {
+ if (*BB == AftBlocksFirst[0])
+ AftBlocksFirst.push_back(New);
+ if (*BB == AftBlocksLast[0])
+ AftBlocksLast.push_back(New);
+ } else {
+ llvm_unreachable("BB being cloned should be in Fore/Sub/Aft");
+ }
+
+ // Update our running maps of newest clones
+ PrevItValueMap[New] = (It == 1 ? *BB : LastValueMap[*BB]);
+ LastValueMap[*BB] = New;
+ for (ValueToValueMapTy::iterator VI = VMap.begin(), VE = VMap.end();
+ VI != VE; ++VI) {
+ PrevItValueMap[VI->second] =
+ const_cast<Value *>(It == 1 ? VI->first : LastValueMap[VI->first]);
+ LastValueMap[VI->first] = VI->second;
+ }
+
+ NewBlocks.push_back(New);
+
+ // Update DomTree:
+ if (*BB == ForeBlocksFirst[0])
+ DT->addNewBlock(New, ForeBlocksLast[It - 1]);
+ else if (*BB == SubLoopBlocksFirst[0])
+ DT->addNewBlock(New, SubLoopBlocksLast[It - 1]);
+ else if (*BB == AftBlocksFirst[0])
+ DT->addNewBlock(New, AftBlocksLast[It - 1]);
+ else {
+ // Each set of blocks (Fore/Sub/Aft) will have the same internal domtree
+ // structure.
+ auto BBDomNode = DT->getNode(*BB);
+ auto BBIDom = BBDomNode->getIDom();
+ BasicBlock *OriginalBBIDom = BBIDom->getBlock();
+ assert(OriginalBBIDom);
+ assert(LastValueMap[cast<Value>(OriginalBBIDom)]);
+ DT->addNewBlock(
+ New, cast<BasicBlock>(LastValueMap[cast<Value>(OriginalBBIDom)]));
+ }
+ }
+
+ // Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
+ for (BasicBlock *NewBlock : NewBlocks) {
+ for (Instruction &I : *NewBlock) {
+ if (auto *II = dyn_cast<IntrinsicInst>(&I))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ }
+ }
+
+ // Alter the ForeBlocks phis, pointing them at the latest version of the
+ // value from the previous iteration's phis
+ for (PHINode &Phi : ForeBlocksFirst[It]->phis()) {
+ Value *OldValue = Phi.getIncomingValueForBlock(AftBlocksLast[It]);
+ assert(OldValue && "should have incoming edge from Aft[It]");
+ Value *NewValue = OldValue;
+ if (Value *PrevValue = PrevItValueMap[OldValue])
+ NewValue = PrevValue;
+
+ assert(Phi.getNumOperands() == 2);
+ Phi.setIncomingBlock(0, ForeBlocksLast[It - 1]);
+ Phi.setIncomingValue(0, NewValue);
+ Phi.removeIncomingValue(1);
+ }
+ }
+
+ // Now that all the basic blocks for the unrolled iterations are in place,
+ // finish up connecting the blocks and phi nodes. At this point LastValueMap
+ // is the last unrolled iteration's values.
+
+ // Update Phis in BB from OldBB to point to NewBB and use the latest value
+ // from LastValueMap
+ auto updatePHIBlocksAndValues = [](BasicBlock *BB, BasicBlock *OldBB,
+ BasicBlock *NewBB,
+ ValueToValueMapTy &LastValueMap) {
+ for (PHINode &Phi : BB->phis()) {
+ for (unsigned b = 0; b < Phi.getNumIncomingValues(); ++b) {
+ if (Phi.getIncomingBlock(b) == OldBB) {
+ Value *OldValue = Phi.getIncomingValue(b);
+ if (Value *LastValue = LastValueMap[OldValue])
+ Phi.setIncomingValue(b, LastValue);
+ Phi.setIncomingBlock(b, NewBB);
+ break;
+ }
+ }
+ }
+ };
+ // Move all the phis from Src into Dest
+ auto movePHIs = [](BasicBlock *Src, BasicBlock *Dest) {
+ Instruction *insertPoint = Dest->getFirstNonPHI();
+ while (PHINode *Phi = dyn_cast<PHINode>(Src->begin()))
+ Phi->moveBefore(insertPoint);
+ };
+
+ // Update the PHI values outside the loop to point to the last block
+ updatePHIBlocksAndValues(LoopExit, AftBlocksLast[0], AftBlocksLast.back(),
+ LastValueMap);
+
+ // Update ForeBlocks successors and phi nodes
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
+
+ if (CompletelyUnroll) {
+ while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
+ Phi->replaceAllUsesWith(Phi->getIncomingValueForBlock(Preheader));
+ Phi->getParent()->getInstList().erase(Phi);
+ }
+ } else {
+ // Update the PHI values to point to the last aft block
+ updatePHIBlocksAndValues(ForeBlocksFirst[0], AftBlocksLast[0],
+ AftBlocksLast.back(), LastValueMap);
+ }
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Remap ForeBlock successors from previous iteration to this
+ BranchInst *ForeTerm =
+ cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
+ }
+
+ // Subloop successors and phis
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast.back()->getTerminator());
+ SubTerm->setSuccessor(!SubLoopContinueOnTrue, SubLoopBlocksFirst[0]);
+ SubTerm->setSuccessor(SubLoopContinueOnTrue, AftBlocksFirst[0]);
SubLoopBlocksFirst[0]->replacePhiUsesWith(ForeBlocksLast[0],
ForeBlocksLast.back());
SubLoopBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
-
- for (unsigned It = 1; It != Count; It++) {
- // Replace the conditional branch of the previous iteration subloop with an
- // unconditional one to this one
- BranchInst *SubTerm =
- cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
- SubTerm->eraseFromParent();
-
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *SubTerm =
+ cast<BranchInst>(SubLoopBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(SubLoopBlocksFirst[It], SubTerm);
+ SubTerm->eraseFromParent();
+
SubLoopBlocksFirst[It]->replacePhiUsesWith(ForeBlocksLast[It],
ForeBlocksLast.back());
SubLoopBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
SubLoopBlocksLast.back());
- movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
- }
-
- // Aft blocks successors and phis
- BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
- if (CompletelyUnroll) {
- BranchInst::Create(LoopExit, AftTerm);
- AftTerm->eraseFromParent();
- } else {
- AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
- assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
- "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
- }
+ movePHIs(SubLoopBlocksFirst[It], SubLoopBlocksFirst[0]);
+ }
+
+ // Aft blocks successors and phis
+ BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ if (CompletelyUnroll) {
+ BranchInst::Create(LoopExit, AftTerm);
+ AftTerm->eraseFromParent();
+ } else {
+ AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
+ "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
+ }
AftBlocksFirst[0]->replacePhiUsesWith(SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
-
- for (unsigned It = 1; It != Count; It++) {
- // Replace the conditional branch of the previous iteration subloop with an
- // unconditional one to this one
- BranchInst *AftTerm =
- cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
- BranchInst::Create(AftBlocksFirst[It], AftTerm);
- AftTerm->eraseFromParent();
-
+
+ for (unsigned It = 1; It != Count; It++) {
+ // Replace the conditional branch of the previous iteration subloop with an
+ // unconditional one to this one
+ BranchInst *AftTerm =
+ cast<BranchInst>(AftBlocksLast[It - 1]->getTerminator());
+ BranchInst::Create(AftBlocksFirst[It], AftTerm);
+ AftTerm->eraseFromParent();
+
AftBlocksFirst[It]->replacePhiUsesWith(SubLoopBlocksLast[It],
SubLoopBlocksLast.back());
- movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
- }
-
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
- // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
- // new ones required.
- if (Count != 1) {
- SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
- SubLoopBlocksFirst[0]);
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
- SubLoopBlocksLast[0], AftBlocksFirst[0]);
-
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
- ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
- DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
- SubLoopBlocksLast.back(), AftBlocksFirst[0]);
- DTU.applyUpdatesPermissive(DTUpdates);
- }
-
- // Merge adjacent basic blocks, if possible.
- SmallPtrSet<BasicBlock *, 16> MergeBlocks;
- MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
- MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
- MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
-
- MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
-
- // Apply updates to the DomTree.
- DT = &DTU.getDomTree();
-
- // At this point, the code is well formed. We now do a quick sweep over the
- // inserted code, doing constant propagation and dead code elimination as we
- // go.
- simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
- TTI);
-
- NumCompletelyUnrolledAndJammed += CompletelyUnroll;
- ++NumUnrolledAndJammed;
-
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
-#ifndef NDEBUG
- // We shouldn't have done anything to break loop simplify form or LCSSA.
- Loop *OutestLoop = SubLoop->getParentLoop()
- ? SubLoop->getParentLoop()->getParentLoop()
- ? SubLoop->getParentLoop()->getParentLoop()
- : SubLoop->getParentLoop()
- : SubLoop;
- assert(DT->verify());
- LI->verify(*DT);
- assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
- if (!CompletelyUnroll)
- assert(L->isLoopSimplifyForm());
- assert(SubLoop->isLoopSimplifyForm());
- SE->verify();
-#endif
-
- return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
- : LoopUnrollResult::PartiallyUnrolled;
-}
-
-static bool getLoadsAndStores(BasicBlockSet &Blocks,
- SmallVector<Instruction *, 4> &MemInstr) {
- // Scan the BBs and collect legal loads and stores.
- // Returns false if non-simple loads/stores are found.
- for (BasicBlock *BB : Blocks) {
- for (Instruction &I : *BB) {
- if (auto *Ld = dyn_cast<LoadInst>(&I)) {
- if (!Ld->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (auto *St = dyn_cast<StoreInst>(&I)) {
- if (!St->isSimple())
- return false;
- MemInstr.push_back(&I);
- } else if (I.mayReadOrWriteMemory()) {
- return false;
- }
- }
- }
- return true;
-}
-
-static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, Dependence *D) {
- // UnrollLevel might carry the dependency Src --> Dst
- // Does a different loop after unrolling?
- for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
- ++CurLoopDepth) {
- auto JammedDir = D->getDirection(CurLoopDepth);
- if (JammedDir == Dependence::DVEntry::LT)
- return true;
-
- if (JammedDir & Dependence::DVEntry::GT)
- return false;
- }
-
- return true;
-}
-
-static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, Dependence *D) {
- // UnrollLevel might carry the dependency Dst --> Src
- for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
- ++CurLoopDepth) {
- auto JammedDir = D->getDirection(CurLoopDepth);
- if (JammedDir == Dependence::DVEntry::GT)
- return true;
-
- if (JammedDir & Dependence::DVEntry::LT)
- return false;
- }
-
- // Backward dependencies are only preserved if not interleaved.
- return Sequentialized;
-}
-
-// Check whether it is semantically safe Src and Dst considering any potential
-// dependency between them.
-//
-// @param UnrollLevel The level of the loop being unrolled
-// @param JamLevel The level of the loop being jammed; if Src and Dst are on
-// different levels, the outermost common loop counts as jammed level
-//
-// @return true if is safe and false if there is a dependency violation.
-static bool checkDependency(Instruction *Src, Instruction *Dst,
- unsigned UnrollLevel, unsigned JamLevel,
- bool Sequentialized, DependenceInfo &DI) {
- assert(UnrollLevel <= JamLevel &&
- "Expecting JamLevel to be at least UnrollLevel");
-
- if (Src == Dst)
- return true;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- return true;
-
- // Check whether unroll-and-jam may violate a dependency.
- // By construction, every dependency will be lexicographically non-negative
- // (if it was, it would violate the current execution order), such as
- // (0,0,>,*,*)
- // Unroll-and-jam changes the GT execution of two executions to the same
- // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
- // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
- // (0,0,>=,*,*)
- // Now, the dependency is not necessarily non-negative anymore, i.e.
- // unroll-and-jam may violate correctness.
- std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
- if (!D)
- return true;
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
- if (D->isConfused()) {
- LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
-
- // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
- // non-equal direction, then the locations accessed in the inner levels cannot
- // overlap in memory. We assumes the indexes never overlap into neighboring
- // dimensions.
- for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
- if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
- return true;
-
- auto UnrollDirection = D->getDirection(UnrollLevel);
-
- // If the distance carried by the unrolled loop is 0, then after unrolling
- // that distance will become non-zero resulting in non-overlapping accesses in
- // the inner loops.
- if (UnrollDirection == Dependence::DVEntry::EQ)
- return true;
-
- if (UnrollDirection & Dependence::DVEntry::LT &&
- !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
- Sequentialized, D.get()))
- return false;
-
- if (UnrollDirection & Dependence::DVEntry::GT &&
- !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
- Sequentialized, D.get()))
- return false;
-
- return true;
-}
-
-static bool
-checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
- const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
- const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
- DependenceInfo &DI, LoopInfo &LI) {
- SmallVector<BasicBlockSet, 8> AllBlocks;
- for (Loop *L : Root.getLoopsInPreorder())
- if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
- AllBlocks.push_back(ForeBlocksMap.lookup(L));
- AllBlocks.push_back(SubLoopBlocks);
- for (Loop *L : Root.getLoopsInPreorder())
- if (AftBlocksMap.find(L) != AftBlocksMap.end())
- AllBlocks.push_back(AftBlocksMap.lookup(L));
-
- unsigned LoopDepth = Root.getLoopDepth();
- SmallVector<Instruction *, 4> EarlierLoadsAndStores;
- SmallVector<Instruction *, 4> CurrentLoadsAndStores;
- for (BasicBlockSet &Blocks : AllBlocks) {
- CurrentLoadsAndStores.clear();
- if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
- return false;
-
- Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
- unsigned CurLoopDepth = CurLoop->getLoopDepth();
-
- for (auto *Earlier : EarlierLoadsAndStores) {
- Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
- unsigned EarlierDepth = EarlierLoop->getLoopDepth();
- unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
- for (auto *Later : CurrentLoadsAndStores) {
- if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
- DI))
- return false;
- }
- }
-
- size_t NumInsts = CurrentLoadsAndStores.size();
- for (size_t I = 0; I < NumInsts; ++I) {
- for (size_t J = I; J < NumInsts; ++J) {
- if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
- LoopDepth, CurLoopDepth, true, DI))
- return false;
- }
- }
-
- EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
- CurrentLoadsAndStores.end());
- }
- return true;
-}
-
-static bool isEligibleLoopForm(const Loop &Root) {
- // Root must have a child.
- if (Root.getSubLoops().size() != 1)
- return false;
-
- const Loop *L = &Root;
- do {
- // All loops in Root need to be in simplify and rotated form.
- if (!L->isLoopSimplifyForm())
- return false;
-
- if (!L->isRotatedForm())
- return false;
-
- if (L->getHeader()->hasAddressTaken()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
- return false;
- }
-
- unsigned SubLoopsSize = L->getSubLoops().size();
- if (SubLoopsSize == 0)
- return true;
-
- // Only one child is allowed.
- if (SubLoopsSize != 1)
- return false;
-
- L = L->getSubLoops()[0];
- } while (L);
-
- return true;
-}
-
-static Loop *getInnerMostLoop(Loop *L) {
- while (!L->getSubLoops().empty())
- L = L->getSubLoops()[0];
- return L;
-}
-
-bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
- DependenceInfo &DI, LoopInfo &LI) {
- if (!isEligibleLoopForm(*L)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
- return false;
- }
-
- /* We currently handle outer loops like this:
- |
- ForeFirst <------\ }
- Blocks | } ForeBlocks of L
- ForeLast | }
- | |
- ... |
- | |
- ForeFirst <----\ | }
- Blocks | | } ForeBlocks of a inner loop of L
- ForeLast | | }
- | | |
- JamLoopFirst <\ | | }
- Blocks | | | } JamLoopBlocks of the innermost loop
- JamLoopLast -/ | | }
- | | |
- AftFirst | | }
- Blocks | | } AftBlocks of a inner loop of L
- AftLast ------/ | }
- | |
- ... |
- | |
- AftFirst | }
- Blocks | } AftBlocks of L
- AftLast --------/ }
- |
-
- There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
- and AftBlocks, providing that there is one edge from Fores to SubLoops,
- one edge from SubLoops to Afts and a single outer loop exit (from Afts).
- In practice we currently limit Aft blocks to a single block, and limit
- things further in the profitablility checks of the unroll and jam pass.
-
- Because of the way we rearrange basic blocks, we also require that
- the Fore blocks of L on all unrolled iterations are safe to move before the
- blocks of the direct child of L of all iterations. So we require that the
- phi node looping operands of ForeHeader can be moved to at least the end of
- ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
- match up Phi's correctly.
-
- i.e. The old order of blocks used to be
- (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
- It needs to be safe to transform this to
- (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
-
- There are then a number of checks along the lines of no calls, no
- exceptions, inner loop IV is consistent, etc. Note that for loops requiring
- runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
- UnrollAndJamLoop if the trip count cannot be easily calculated.
- */
-
- // Split blocks into Fore/SubLoop/Aft based on dominators
- Loop *JamLoop = getInnerMostLoop(L);
- BasicBlockSet SubLoopBlocks;
- DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
- DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
- if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
- AftBlocksMap, DT)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
- return false;
- }
-
- // Aft blocks may need to move instructions to fore blocks, which becomes more
- // difficult if there are multiple (potentially conditionally executed)
- // blocks. For now we just exclude loops with multiple aft blocks.
- if (AftBlocksMap[L].size() != 1) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
- "multiple blocks after the loop\n");
- return false;
- }
-
- // Check inner loop backedge count is consistent on all iterations of the
- // outer loop
- if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
- return !hasIterationCountInvariantInParent(SubLoop, SE);
- })) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
- "not consistent on each iteration\n");
- return false;
- }
-
- // Check the loop safety info for exceptions.
- SimpleLoopSafetyInfo LSI;
- LSI.computeLoopSafetyInfo(L);
- if (LSI.anyBlockMayThrow()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
- return false;
- }
-
- // We've ruled out the easy stuff and now need to check that there are no
- // interdependencies which may prevent us from moving the:
- // ForeBlocks before Subloop and AftBlocks.
- // Subloop before AftBlocks.
- // ForeBlock phi operands before the subloop
-
- // Make sure we can move all instructions we need to before the subloop
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlockSet AftBlocks = AftBlocksMap[L];
- Loop *SubLoop = L->getSubLoops()[0];
- if (!processHeaderPhiOperands(
- Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
- if (SubLoop->contains(I->getParent()))
- return false;
- if (AftBlocks.count(I->getParent())) {
- // If we hit a phi node in afts we know we are done (probably
- // LCSSA)
- if (isa<PHINode>(I))
- return false;
- // Can't move instructions with side effects or memory
- // reads/writes
- if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
- return false;
- }
- // Keep going
- return true;
- })) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
- "instructions after subloop to before it\n");
- return false;
- }
-
- // Check for memory dependencies which prohibit the unrolling we are doing.
- // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
- // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
- if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
- LI)) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
- return false;
- }
-
- return true;
-}
+ movePHIs(AftBlocksFirst[It], AftBlocksFirst[0]);
+ }
+
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy);
+ // Dominator Tree. Remove the old links between Fore, Sub and Aft, adding the
+ // new ones required.
+ if (Count != 1) {
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete, ForeBlocksLast[0],
+ SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Delete,
+ SubLoopBlocksLast[0], AftBlocksFirst[0]);
+
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ ForeBlocksLast.back(), SubLoopBlocksFirst[0]);
+ DTUpdates.emplace_back(DominatorTree::UpdateKind::Insert,
+ SubLoopBlocksLast.back(), AftBlocksFirst[0]);
+ DTU.applyUpdatesPermissive(DTUpdates);
+ }
+
+ // Merge adjacent basic blocks, if possible.
+ SmallPtrSet<BasicBlock *, 16> MergeBlocks;
+ MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
+ MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
+ MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
+
+ MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
+
+ // Apply updates to the DomTree.
+ DT = &DTU.getDomTree();
+
+ // At this point, the code is well formed. We now do a quick sweep over the
+ // inserted code, doing constant propagation and dead code elimination as we
+ // go.
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
+ TTI);
+
+ NumCompletelyUnrolledAndJammed += CompletelyUnroll;
+ ++NumUnrolledAndJammed;
+
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
+#ifndef NDEBUG
+ // We shouldn't have done anything to break loop simplify form or LCSSA.
+ Loop *OutestLoop = SubLoop->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ : SubLoop->getParentLoop()
+ : SubLoop;
+ assert(DT->verify());
+ LI->verify(*DT);
+ assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
+ if (!CompletelyUnroll)
+ assert(L->isLoopSimplifyForm());
+ assert(SubLoop->isLoopSimplifyForm());
+ SE->verify();
+#endif
+
+ return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
+ : LoopUnrollResult::PartiallyUnrolled;
+}
+
+static bool getLoadsAndStores(BasicBlockSet &Blocks,
+ SmallVector<Instruction *, 4> &MemInstr) {
+ // Scan the BBs and collect legal loads and stores.
+ // Returns false if non-simple loads/stores are found.
+ for (BasicBlock *BB : Blocks) {
+ for (Instruction &I : *BB) {
+ if (auto *Ld = dyn_cast<LoadInst>(&I)) {
+ if (!Ld->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (auto *St = dyn_cast<StoreInst>(&I)) {
+ if (!St->isSimple())
+ return false;
+ MemInstr.push_back(&I);
+ } else if (I.mayReadOrWriteMemory()) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+  // The loop at UnrollLevel might carry the dependency Src --> Dst.
+  // Does a different loop carry it after unrolling?
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::LT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::GT)
+ return false;
+ }
+
+ return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+  // The loop at UnrollLevel might carry the dependency Dst --> Src.
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::GT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::LT)
+ return false;
+ }
+
+ // Backward dependencies are only preserved if not interleaved.
+ return Sequentialized;
+}
+
+// Check whether unroll-and-jam is semantically safe for Src and Dst, given any
+// potential dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as the jammed level
+//
+// @return true if it is safe and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, DependenceInfo &DI) {
+ assert(UnrollLevel <= JamLevel &&
+ "Expecting JamLevel to be at least UnrollLevel");
+
+ if (Src == Dst)
+ return true;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ return true;
+
+ // Check whether unroll-and-jam may violate a dependency.
+  // By construction, every dependency will be lexicographically non-negative
+  // (if it were not, it would violate the current execution order), such as
+ // (0,0,>,*,*)
+  // Unroll-and-jam moves two executions that were ordered GT at the chosen
+  // unroll level into the same iteration of that level. That is, a GT
+  // dependence becomes a GE dependence (or EQ, if we fully unrolled the loop)
+  // at the loop's position:
+ // (0,0,>=,*,*)
+ // Now, the dependency is not necessarily non-negative anymore, i.e.
+ // unroll-and-jam may violate correctness.
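+  // Illustrative sketch (not part of the original comment): with UnrollLevel=1
+  // and JamLevel=2, a store to A[i][j] later read as A[i-1][j+1] has direction
+  // vector (<,>). Unrolling i relaxes the first entry to <=, and once the two
+  // copies are jammed into one j-loop the second copy's read at inner index j
+  // needs the value the first copy only writes at index j+1, so the dependence
+  // is violated and must be rejected.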
+ std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+ if (!D)
+ return true;
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused()) {
+ LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
+ << " " << *Src << "\n"
+ << " " << *Dst << "\n");
+ return false;
+ }
+
+ // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+ // non-equal direction, then the locations accessed in the inner levels cannot
+  // overlap in memory. We assume the indexes never overlap into neighboring
+ // dimensions.
+ for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+ if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+ return true;
+
+ auto UnrollDirection = D->getDirection(UnrollLevel);
+
+ // If the distance carried by the unrolled loop is 0, then after unrolling
+ // that distance will become non-zero resulting in non-overlapping accesses in
+ // the inner loops.
+ if (UnrollDirection == Dependence::DVEntry::EQ)
+ return true;
+
+ if (UnrollDirection & Dependence::DVEntry::LT &&
+ !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ if (UnrollDirection & Dependence::DVEntry::GT &&
+ !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ return true;
+}
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+ const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+ DependenceInfo &DI, LoopInfo &LI) {
+ SmallVector<BasicBlockSet, 8> AllBlocks;
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+ AllBlocks.push_back(ForeBlocksMap.lookup(L));
+ AllBlocks.push_back(SubLoopBlocks);
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (AftBlocksMap.find(L) != AftBlocksMap.end())
+ AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+ unsigned LoopDepth = Root.getLoopDepth();
+ SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+ SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+ for (BasicBlockSet &Blocks : AllBlocks) {
+ CurrentLoadsAndStores.clear();
+ if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+ return false;
+
+ Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+ unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+ for (auto *Earlier : EarlierLoadsAndStores) {
+ Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+ unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+ unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+ for (auto *Later : CurrentLoadsAndStores) {
+ if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+ DI))
+ return false;
+ }
+ }
+
+ size_t NumInsts = CurrentLoadsAndStores.size();
+ for (size_t I = 0; I < NumInsts; ++I) {
+ for (size_t J = I; J < NumInsts; ++J) {
+ if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+ LoopDepth, CurLoopDepth, true, DI))
+ return false;
+ }
+ }
+
+ EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+ CurrentLoadsAndStores.end());
+ }
+ return true;
+}
+
+static bool isEligibleLoopForm(const Loop &Root) {
+  // Root must have exactly one child.
+ if (Root.getSubLoops().size() != 1)
+ return false;
+
+ const Loop *L = &Root;
+ do {
+ // All loops in Root need to be in simplify and rotated form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ if (!L->isRotatedForm())
+ return false;
+
+ if (L->getHeader()->hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+ return false;
+ }
+
+ unsigned SubLoopsSize = L->getSubLoops().size();
+ if (SubLoopsSize == 0)
+ return true;
+
+ // Only one child is allowed.
+ if (SubLoopsSize != 1)
+ return false;
+
+ L = L->getSubLoops()[0];
+ } while (L);
+
+ return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+ while (!L->getSubLoops().empty())
+ L = L->getSubLoops()[0];
+ return L;
+}
+
+bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
+ DependenceInfo &DI, LoopInfo &LI) {
+ if (!isEligibleLoopForm(*L)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+ return false;
+ }
+
+ /* We currently handle outer loops like this:
+ |
+ ForeFirst <------\ }
+ Blocks | } ForeBlocks of L
+ ForeLast | }
+ | |
+ ... |
+ | |
+ ForeFirst <----\ | }
+ Blocks | | } ForeBlocks of a inner loop of L
+ ForeLast | | }
+ | | |
+ JamLoopFirst <\ | | }
+ Blocks | | | } JamLoopBlocks of the innermost loop
+ JamLoopLast -/ | | }
+ | | |
+ AftFirst | | }
+ Blocks | | } AftBlocks of a inner loop of L
+ AftLast ------/ | }
+ | |
+ ... |
+ | |
+ AftFirst | }
+ Blocks | } AftBlocks of L
+ AftLast --------/ }
+ |
+
+     There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
+     and AftBlocks, provided that there is one edge from Fores to SubLoops,
+     one edge from SubLoops to Afts and a single outer loop exit (from Afts).
+     In practice we currently limit Aft blocks to a single block, and limit
+     things further in the profitability checks of the unroll and jam pass.
+
+ Because of the way we rearrange basic blocks, we also require that
+ the Fore blocks of L on all unrolled iterations are safe to move before the
+ blocks of the direct child of L of all iterations. So we require that the
+ phi node looping operands of ForeHeader can be moved to at least the end of
+ ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+ match up Phi's correctly.
+
+ i.e. The old order of blocks used to be
+ (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+ It needs to be safe to transform this to
+ (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
+
+ There are then a number of checks along the lines of no calls, no
+ exceptions, inner loop IV is consistent, etc. Note that for loops requiring
+ runtime unrolling, UnrollRuntimeLoopRemainder can also fail in
+ UnrollAndJamLoop if the trip count cannot be easily calculated.
+ */
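+
+  /* For illustration only (Fore/Sub/Aft are placeholders, not names from this
+     file), the simplest shape handled here is:
+       for (i = ...) {         // L
+         Fore(i);              // ForeBlocks of L
+         for (j = ...)         // the jammed inner loop
+           Sub(i, j);
+         Aft(i);               // AftBlocks of L, currently one block only
+       }
+     Unroll-and-jam by 2 yields Fore(i); Fore(i+1); one fused j loop running
+     Sub(i, j); Sub(i+1, j); and then Aft(i); Aft(i+1). */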
+
+ // Split blocks into Fore/SubLoop/Aft based on dominators
+ Loop *JamLoop = getInnerMostLoop(L);
+ BasicBlockSet SubLoopBlocks;
+ DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+ DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+ if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+ AftBlocksMap, DT)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
+ return false;
+ }
+
+ // Aft blocks may need to move instructions to fore blocks, which becomes more
+ // difficult if there are multiple (potentially conditionally executed)
+ // blocks. For now we just exclude loops with multiple aft blocks.
+ if (AftBlocksMap[L].size() != 1) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
+ "multiple blocks after the loop\n");
+ return false;
+ }
+
+  // Check that the inner loop backedge count is consistent on all iterations
+  // of the outer loop.
+ if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+ return !hasIterationCountInvariantInParent(SubLoop, SE);
+ })) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
+ "not consistent on each iteration\n");
+ return false;
+ }
+
+ // Check the loop safety info for exceptions.
+ SimpleLoopSafetyInfo LSI;
+ LSI.computeLoopSafetyInfo(L);
+ if (LSI.anyBlockMayThrow()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Something may throw\n");
+ return false;
+ }
+
+ // We've ruled out the easy stuff and now need to check that there are no
+ // interdependencies which may prevent us from moving the:
+ // ForeBlocks before Subloop and AftBlocks.
+ // Subloop before AftBlocks.
+ // ForeBlock phi operands before the subloop
+
+ // Make sure we can move all instructions we need to before the subloop
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlockSet AftBlocks = AftBlocksMap[L];
+ Loop *SubLoop = L->getSubLoops()[0];
+ if (!processHeaderPhiOperands(
+ Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
+ if (SubLoop->contains(I->getParent()))
+ return false;
+ if (AftBlocks.count(I->getParent())) {
+ // If we hit a phi node in afts we know we are done (probably
+ // LCSSA)
+ if (isa<PHINode>(I))
+ return false;
+ // Can't move instructions with side effects or memory
+ // reads/writes
+ if (I->mayHaveSideEffects() || I->mayReadOrWriteMemory())
+ return false;
+ }
+ // Keep going
+ return true;
+ })) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; can't move required "
+ "instructions after subloop to before it\n");
+ return false;
+ }
+
+ // Check for memory dependencies which prohibit the unrolling we are doing.
+ // Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
+ // there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
+ if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+ LI)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
+ return false;
+ }
+
+ return true;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 6f73c51db5..0abf62be15 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -1,510 +1,510 @@
-//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements some loop unrolling utilities for loops with run-time
-// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
-// trip counts.
-//
-// The functions in this file are used to generate extra code when the
-// run-time trip count modulo the unroll factor is not 0. When this is the
-// case, we need to generate code to execute these 'left over' iterations.
-//
-// The current strategy generates an if-then-else sequence prior to the
-// unrolled loop to execute the 'left over' iterations before or after the
-// unrolled loop.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
+//===-- UnrollLoopRuntime.cpp - Runtime Loop unrolling utilities ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements some loop unrolling utilities for loops with run-time
+// trip counts. See LoopUnroll.cpp for unrolling loops with compile-time
+// trip counts.
+//
+// The functions in this file are used to generate extra code when the
+// run-time trip count modulo the unroll factor is not 0. When this is the
+// case, we need to generate code to execute these 'left over' iterations.
+//
+// The current strategy generates an if-then-else sequence prior to the
+// unrolled loop to execute the 'left over' iterations before or after the
+// unrolled loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <algorithm>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "loop-unroll"
-
-STATISTIC(NumRuntimeUnrolled,
- "Number of loops unrolled with run-time trip counts");
-static cl::opt<bool> UnrollRuntimeMultiExit(
- "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
- cl::desc("Allow runtime unrolling for loops with multiple exits, when "
- "epilog is generated"));
-
-/// Connect the unrolling prolog code to the original loop.
-/// The unrolling prolog code contains code to execute the
-/// 'extra' iterations if the run-time trip count modulo the
-/// unroll count is non-zero.
-///
-/// This function performs the following:
-/// - Create PHI nodes at prolog end block to combine values
-/// that exit the prolog code and jump around the prolog.
-/// - Add a PHI operand to a PHI node at the loop exit block
-/// for values that exit the prolog and go around the loop.
-/// - Branch around the original loop if the trip count is less
-/// than the unroll factor.
-///
-static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
- BasicBlock *PrologExit,
- BasicBlock *OriginalLoopLatchExit,
- BasicBlock *PreHeader, BasicBlock *NewPreHeader,
- ValueToValueMapTy &VMap, DominatorTree *DT,
- LoopInfo *LI, bool PreserveLCSSA) {
- // Loop structure should be the following:
- // Preheader
- // PrologHeader
- // ...
- // PrologLatch
- // PrologExit
- // NewPreheader
- // Header
- // ...
- // Latch
- // LatchExit
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "Loop must have a latch");
- BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
-
- // Create a PHI node for each outgoing value from the original loop
- // (which means it is an outgoing value from the prolog code too).
- // The new PHI node is inserted in the prolog end basic block.
- // The new PHI node value is added as an operand of a PHI node in either
- // the loop header or the loop exit block.
- for (BasicBlock *Succ : successors(Latch)) {
- for (PHINode &PN : Succ->phis()) {
- // Add a new PHI node to the prolog end block and add the
- // appropriate incoming values.
- // TODO: This code assumes that the PrologExit (or the LatchExit block for
- // prolog loop) contains only one predecessor from the loop, i.e. the
- // PrologLatch. When supporting multiple-exiting block loops, we can have
- // two or more blocks that have the LatchExit as the target in the
- // original loop.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
- PrologExit->getFirstNonPHI());
- // Adding a value to the new PHI node from the original loop preheader.
- // This is the value that skips all the prolog code.
- if (L->contains(&PN)) {
- // Succ is loop header.
- NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
- PreHeader);
- } else {
- // Succ is LatchExit.
- NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
- }
-
- Value *V = PN.getIncomingValueForBlock(Latch);
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- if (L->contains(I)) {
- V = VMap.lookup(I);
- }
- }
- // Adding a value to the new PHI node from the last prolog block
- // that was created.
- NewPN->addIncoming(V, PrologLatch);
-
- // Update the existing PHI node operand with the value from the
- // new PHI node. How this is done depends on if the existing
- // PHI node is in the original loop block, or the exit block.
- if (L->contains(&PN))
- PN.setIncomingValueForBlock(NewPreHeader, NewPN);
- else
- PN.addIncoming(NewPN, PrologExit);
- }
- }
-
-  // Make sure that the created prolog loop is in simplified form
- SmallVector<BasicBlock *, 4> PrologExitPreds;
- Loop *PrologLoop = LI->getLoopFor(PrologLatch);
- if (PrologLoop) {
- for (BasicBlock *PredBB : predecessors(PrologExit))
- if (PrologLoop->contains(PredBB))
- PrologExitPreds.push_back(PredBB);
-
- SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- }
-
- // Create a branch around the original loop, which is taken if there are no
- // iterations remaining to be executed after running the prologue.
- Instruction *InsertPt = PrologExit->getTerminator();
- IRBuilder<> B(InsertPt);
-
- assert(Count != 0 && "nonsensical Count!");
-
- // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
- // This means %xtraiter is (BECount + 1) and all of the iterations of this
- // loop were executed by the prologue. Note that if BECount <u (Count - 1)
- // then (BECount + 1) cannot unsigned-overflow.
- Value *BrLoopExit =
- B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
- // Split the exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
- SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- // Add the branch to the exit block (around the unrolled loop)
- B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
- InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
-}
-
-/// Connect the unrolling epilog code to the original loop.
-/// The unrolling epilog code contains code to execute the
-/// 'extra' iterations if the run-time trip count modulo the
-/// unroll count is non-zero.
-///
-/// This function performs the following:
-/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
-/// - Create PHI nodes at the unrolling loop exit to combine
-/// values that exit the unrolling loop code and jump around it.
-/// - Update PHI operands in the epilog loop by the new PHI nodes
-/// - Branch around the epilog loop if extra iters (ModVal) is zero.
-///
-static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
- BasicBlock *Exit, BasicBlock *PreHeader,
- BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
- ValueToValueMapTy &VMap, DominatorTree *DT,
- LoopInfo *LI, bool PreserveLCSSA) {
- BasicBlock *Latch = L->getLoopLatch();
- assert(Latch && "Loop must have a latch");
- BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
-
- // Loop structure should be the following:
- //
- // PreHeader
- // NewPreHeader
- // Header
- // ...
- // Latch
- // NewExit (PN)
- // EpilogPreHeader
- // EpilogHeader
- // ...
- // EpilogLatch
- // Exit (EpilogPN)
-
- // Update PHI nodes at NewExit and Exit.
- for (PHINode &PN : NewExit->phis()) {
- // PN should be used in another PHI located in Exit block as
- // Exit was split by SplitBlockPredecessors into Exit and NewExit
-    // Basically it should look like:
- // NewExit:
- // PN = PHI [I, Latch]
- // ...
- // Exit:
- // EpilogPN = PHI [PN, EpilogPreHeader]
- //
-    // The incoming block is EpilogPreHeader instead of NewExit, because
-    // NewExit was split one more time to get EpilogPreHeader.
- assert(PN.hasOneUse() && "The phi should have 1 use");
- PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
- assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
-
- // Add incoming PreHeader from branch around the Loop
- PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
-
- Value *V = PN.getIncomingValueForBlock(Latch);
- Instruction *I = dyn_cast<Instruction>(V);
- if (I && L->contains(I))
- // If value comes from an instruction in the loop add VMap value.
- V = VMap.lookup(I);
-    // For an instruction defined outside the loop, a constant, or an undefined
-    // value, insert the value itself.
- EpilogPN->addIncoming(V, EpilogLatch);
-
- assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
- "EpilogPN should have EpilogPreHeader incoming block");
- // Change EpilogPreHeader incoming block to NewExit.
- EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
- NewExit);
- // Now PHIs should look like:
- // NewExit:
- // PN = PHI [I, Latch], [undef, PreHeader]
- // ...
- // Exit:
- // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
- }
-
- // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
- // Update corresponding PHI nodes in epilog loop.
- for (BasicBlock *Succ : successors(Latch)) {
- // Skip this as we already updated phis in exit blocks.
- if (!L->contains(Succ))
- continue;
- for (PHINode &PN : Succ->phis()) {
- // Add new PHI nodes to the loop exit block and update epilog
- // PHIs with the new PHI values.
- PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
- NewExit->getFirstNonPHI());
- // Adding a value to the new PHI node from the unrolling loop preheader.
- NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
- // Adding a value to the new PHI node from the unrolling loop latch.
- NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
-
- // Update the existing PHI node operand with the value from the new PHI
- // node. Corresponding instruction in epilog loop should be PHI.
- PHINode *VPN = cast<PHINode>(VMap[&PN]);
- VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
- }
- }
-
- Instruction *InsertPt = NewExit->getTerminator();
- IRBuilder<> B(InsertPt);
- Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
- assert(Exit && "Loop must have a single exit block only");
- // Split the epilogue exit to maintain loop canonicalization guarantees
- SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
- SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
- PreserveLCSSA);
- // Add the branch to the exit block (around the unrolling loop)
- B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
- InsertPt->eraseFromParent();
- if (DT)
- DT->changeImmediateDominator(Exit, NewExit);
-
- // Split the main loop exit to maintain canonicalization guarantees.
- SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
- SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
- PreserveLCSSA);
-}
-
-/// Create a clone of the blocks in a loop and connect them together.
-/// If CreateRemainderLoop is false, loop structure will not be cloned,
-/// otherwise a new loop will be created including all cloned blocks, and its
-/// induction variable counts NewIter down to 0.
-/// The cloned blocks should be inserted between InsertTop and InsertBot.
-/// If loop structure is cloned InsertTop should be new preheader, InsertBot
-/// new loop exit.
-/// Return the new cloned loop that is created when CreateRemainderLoop is true.
-static Loop *
-CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
- const bool UseEpilogRemainder, const bool UnrollRemainder,
- BasicBlock *InsertTop,
- BasicBlock *InsertBot, BasicBlock *Preheader,
- std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
- ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
- StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- Function *F = Header->getParent();
- LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
- LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
- Loop *ParentLoop = L->getParentLoop();
- NewLoopsMap NewLoops;
- NewLoops[ParentLoop] = ParentLoop;
- if (!CreateRemainderLoop)
- NewLoops[L] = ParentLoop;
-
- // For each block in the original loop, create a new copy,
- // and update the value map with the newly created values.
- for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
- BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
- NewBlocks.push_back(NewBB);
-
- // If we're unrolling the outermost loop, there's no remainder loop,
- // and this block isn't in a nested loop, then the new block is not
- // in any loop. Otherwise, add it to loopinfo.
- if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
- addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
-
- VMap[*BB] = NewBB;
- if (Header == *BB) {
- // For the first block, add a CFG connection to this newly
- // created block.
- InsertTop->getTerminator()->setSuccessor(0, NewBB);
- }
-
- if (DT) {
- if (Header == *BB) {
- // The header is dominated by the preheader.
- DT->addNewBlock(NewBB, InsertTop);
- } else {
- // Copy information from original loop to unrolled loop.
- BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
- DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
- }
- }
-
- if (Latch == *BB) {
- // For the last block, if CreateRemainderLoop is false, create a direct
- // jump to InsertBot. If not, create a loop back to cloned head.
- VMap.erase((*BB)->getTerminator());
- BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
- BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
- IRBuilder<> Builder(LatchBR);
- if (!CreateRemainderLoop) {
- Builder.CreateBr(InsertBot);
- } else {
- PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
- suffix + ".iter",
- FirstLoopBB->getFirstNonPHI());
- Value *IdxSub =
- Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".sub");
- Value *IdxCmp =
- Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
- Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
- NewIdx->addIncoming(NewIter, InsertTop);
- NewIdx->addIncoming(IdxSub, NewBB);
- }
- LatchBR->eraseFromParent();
- }
- }
-
- // Change the incoming values to the ones defined in the preheader or
- // cloned loop.
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
- if (!CreateRemainderLoop) {
- if (UseEpilogRemainder) {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- NewPHI->removeIncomingValue(Latch, false);
- } else {
- VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
- cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
- }
- } else {
- unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
- NewPHI->setIncomingBlock(idx, InsertTop);
- BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
- idx = NewPHI->getBasicBlockIndex(Latch);
- Value *InVal = NewPHI->getIncomingValue(idx);
- NewPHI->setIncomingBlock(idx, NewLatch);
- if (Value *V = VMap.lookup(InVal))
- NewPHI->setIncomingValue(idx, V);
- }
- }
- if (CreateRemainderLoop) {
- Loop *NewLoop = NewLoops[L];
- assert(NewLoop && "L should have been cloned");
- MDNode *LoopID = NewLoop->getLoopID();
-
- // Only add loop metadata if the loop is not going to be completely
- // unrolled.
- if (UnrollRemainder)
- return NewLoop;
-
- Optional<MDNode *> NewLoopID = makeFollowupLoopID(
- LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
- if (NewLoopID.hasValue()) {
- NewLoop->setLoopID(NewLoopID.getValue());
-
- // Do not setLoopAlreadyUnrolled if loop attributes have been defined
- // explicitly.
- return NewLoop;
- }
-
- // Add unroll disable metadata to disable future unrolling for this loop.
- NewLoop->setLoopAlreadyUnrolled();
- return NewLoop;
- }
- else
- return nullptr;
-}
-
-/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
-/// is populated with all the loop exit blocks other than the LatchExit block.
-static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
- bool PreserveLCSSA,
- bool UseEpilogRemainder) {
-
-  // We currently have some correctness constraints in unrolling a multi-exit
- // loop. Check for these below.
-
- // We rely on LCSSA form being preserved when the exit blocks are transformed.
- if (!PreserveLCSSA)
- return false;
-
- // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
- // UnrollRuntimeMultiExit is true. This will need updating the logic in
- // connectEpilog/connectProlog.
- if (!LatchExit->getSinglePredecessor()) {
- LLVM_DEBUG(
- dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
- "predecessor.\n");
- return false;
- }
- // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
- // and L is an inner loop. This is because in presence of multiple exits, the
- // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
- // outer loop. This is automatically handled in the prolog case, so we do not
- // have that bug in prolog generation.
- if (UseEpilogRemainder && L->getParentLoop())
- return false;
-
- // All constraints have been satisfied.
- return true;
-}
-
-/// Returns true if we can profitably unroll the multi-exit loop L. Currently,
-/// we return true only if UnrollRuntimeMultiExit is set to true.
-static bool canProfitablyUnrollMultiExitLoop(
- Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
- bool PreserveLCSSA, bool UseEpilogRemainder) {
-
-#if !defined(NDEBUG)
- assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- "Should be safe to unroll before checking profitability!");
-#endif
-
- // Priority goes to UnrollRuntimeMultiExit if it's supplied.
- if (UnrollRuntimeMultiExit.getNumOccurrences())
- return UnrollRuntimeMultiExit;
-
- // The main pain point with multi-exit loop unrolling is that once unrolled,
-  // we will not be able to merge all blocks into straight-line code.
- // There are branches within the unrolled loop that go to the OtherExits.
- // The second point is the increase in code size, but this is true
- // irrespective of multiple exits.
-
- // Note: Both the heuristics below are coarse grained. We are essentially
- // enabling unrolling of loops that have a single side exit other than the
- // normal LatchExit (i.e. exiting into a deoptimize block).
- // The heuristics considered are:
- // 1. low number of branches in the unrolled version.
- // 2. high predictability of these extra branches.
- // We avoid unrolling loops that have more than two exiting blocks. This
-  // limits the total number of branches in the unrolled loop to be at most
- // the unroll factor (since one of the exiting blocks is the latch block).
- SmallVector<BasicBlock*, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- if (ExitingBlocks.size() > 2)
- return false;
-
- // The second heuristic is that L has one exit other than the latchexit and
- // that exit is a deoptimize block. We know that deoptimize blocks are rarely
- // taken, which also implies the branch leading to the deoptimize block is
- // highly predictable.
- return (OtherExits.size() == 1 &&
- OtherExits[0]->getTerminatingDeoptimizeCall());
- // TODO: These can be fine-tuned further to consider code size or deopt states
- // that are captured by the deoptimize exit block.
- // Also, we can extend this to support more cases, if we actually
- // know of kinds of multiexit loops that would benefit from unrolling.
-}
-
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loop-unroll"
+
+STATISTIC(NumRuntimeUnrolled,
+ "Number of loops unrolled with run-time trip counts");
+static cl::opt<bool> UnrollRuntimeMultiExit(
+ "unroll-runtime-multi-exit", cl::init(false), cl::Hidden,
+ cl::desc("Allow runtime unrolling for loops with multiple exits, when "
+ "epilog is generated"));
+
+/// Connect the unrolling prolog code to the original loop.
+/// The unrolling prolog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Create PHI nodes at prolog end block to combine values
+/// that exit the prolog code and jump around the prolog.
+/// - Add a PHI operand to a PHI node at the loop exit block
+/// for values that exit the prolog and go around the loop.
+/// - Branch around the original loop if the trip count is less
+/// than the unroll factor.
+///
+static void ConnectProlog(Loop *L, Value *BECount, unsigned Count,
+ BasicBlock *PrologExit,
+ BasicBlock *OriginalLoopLatchExit,
+ BasicBlock *PreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ // Loop structure should be the following:
+ // Preheader
+ // PrologHeader
+ // ...
+ // PrologLatch
+ // PrologExit
+ // NewPreheader
+ // Header
+ // ...
+ // Latch
+ // LatchExit
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *PrologLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Create a PHI node for each outgoing value from the original loop
+ // (which means it is an outgoing value from the prolog code too).
+ // The new PHI node is inserted in the prolog end basic block.
+ // The new PHI node value is added as an operand of a PHI node in either
+ // the loop header or the loop exit block.
+ for (BasicBlock *Succ : successors(Latch)) {
+ for (PHINode &PN : Succ->phis()) {
+ // Add a new PHI node to the prolog end block and add the
+ // appropriate incoming values.
+ // TODO: This code assumes that the PrologExit (or the LatchExit block for
+ // prolog loop) contains only one predecessor from the loop, i.e. the
+ // PrologLatch. When supporting multiple-exiting block loops, we can have
+ // two or more blocks that have the LatchExit as the target in the
+ // original loop.
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
+ PrologExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the original loop preheader.
+ // This is the value that skips all the prolog code.
+ if (L->contains(&PN)) {
+ // Succ is loop header.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader),
+ PreHeader);
+ } else {
+ // Succ is LatchExit.
+ NewPN->addIncoming(UndefValue::get(PN.getType()), PreHeader);
+ }
+
+ Value *V = PN.getIncomingValueForBlock(Latch);
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (L->contains(I)) {
+ V = VMap.lookup(I);
+ }
+ }
+ // Adding a value to the new PHI node from the last prolog block
+ // that was created.
+ NewPN->addIncoming(V, PrologLatch);
+
+ // Update the existing PHI node operand with the value from the
+ // new PHI node. How this is done depends on if the existing
+ // PHI node is in the original loop block, or the exit block.
+ if (L->contains(&PN))
+ PN.setIncomingValueForBlock(NewPreHeader, NewPN);
+ else
+ PN.addIncoming(NewPN, PrologExit);
+ }
+ }
+
+  // Make sure that the created prolog loop is in simplified form
+ SmallVector<BasicBlock *, 4> PrologExitPreds;
+ Loop *PrologLoop = LI->getLoopFor(PrologLatch);
+ if (PrologLoop) {
+ for (BasicBlock *PredBB : predecessors(PrologExit))
+ if (PrologLoop->contains(PredBB))
+ PrologExitPreds.push_back(PredBB);
+
+ SplitBlockPredecessors(PrologExit, PrologExitPreds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ }
+
+ // Create a branch around the original loop, which is taken if there are no
+ // iterations remaining to be executed after running the prologue.
+ Instruction *InsertPt = PrologExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+
+ assert(Count != 0 && "nonsensical Count!");
+
+ // If BECount <u (Count - 1) then (BECount + 1) % Count == (BECount + 1)
+ // This means %xtraiter is (BECount + 1) and all of the iterations of this
+ // loop were executed by the prologue. Note that if BECount <u (Count - 1)
+ // then (BECount + 1) cannot unsigned-overflow.
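+  // For example (illustrative numbers): with Count = 4 and BECount = 2 the
+  // trip count is 3, %xtraiter is 3 % 4 == 3, the prologue runs all three
+  // iterations, and this branch skips the unrolled loop entirely.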
+ Value *BrLoopExit =
+ B.CreateICmpULT(BECount, ConstantInt::get(BECount->getType(), Count - 1));
+ // Split the exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock *, 4> Preds(predecessors(OriginalLoopLatchExit));
+ SplitBlockPredecessors(OriginalLoopLatchExit, Preds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolled loop)
+ B.CreateCondBr(BrLoopExit, OriginalLoopLatchExit, NewPreHeader);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(OriginalLoopLatchExit, PrologExit);
+}
+
+/// Connect the unrolling epilog code to the original loop.
+/// The unrolling epilog code contains code to execute the
+/// 'extra' iterations if the run-time trip count modulo the
+/// unroll count is non-zero.
+///
+/// This function performs the following:
+/// - Update PHI nodes at the unrolling loop exit and epilog loop exit
+/// - Create PHI nodes at the unrolling loop exit to combine
+/// values that exit the unrolling loop code and jump around it.
+/// - Update PHI operands in the epilog loop by the new PHI nodes
+/// - Branch around the epilog loop if extra iters (ModVal) is zero.
+///
+static void ConnectEpilog(Loop *L, Value *ModVal, BasicBlock *NewExit,
+ BasicBlock *Exit, BasicBlock *PreHeader,
+ BasicBlock *EpilogPreHeader, BasicBlock *NewPreHeader,
+ ValueToValueMapTy &VMap, DominatorTree *DT,
+ LoopInfo *LI, bool PreserveLCSSA) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "Loop must have a latch");
+ BasicBlock *EpilogLatch = cast<BasicBlock>(VMap[Latch]);
+
+ // Loop structure should be the following:
+ //
+ // PreHeader
+ // NewPreHeader
+ // Header
+ // ...
+ // Latch
+ // NewExit (PN)
+ // EpilogPreHeader
+ // EpilogHeader
+ // ...
+ // EpilogLatch
+ // Exit (EpilogPN)
+
+ // Update PHI nodes at NewExit and Exit.
+ for (PHINode &PN : NewExit->phis()) {
+ // PN should be used in another PHI located in Exit block as
+ // Exit was split by SplitBlockPredecessors into Exit and NewExit
+    // Basically it should look like:
+ // NewExit:
+ // PN = PHI [I, Latch]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, EpilogPreHeader]
+ //
+    // The incoming block is EpilogPreHeader instead of NewExit, because
+    // NewExit was split one more time to get EpilogPreHeader.
+ assert(PN.hasOneUse() && "The phi should have 1 use");
+ PHINode *EpilogPN = cast<PHINode>(PN.use_begin()->getUser());
+ assert(EpilogPN->getParent() == Exit && "EpilogPN should be in Exit block");
+
+ // Add incoming PreHeader from branch around the Loop
+ PN.addIncoming(UndefValue::get(PN.getType()), PreHeader);
+
+ Value *V = PN.getIncomingValueForBlock(Latch);
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (I && L->contains(I))
+ // If value comes from an instruction in the loop add VMap value.
+ V = VMap.lookup(I);
+    // For an instruction defined outside the loop, a constant, or an undefined
+    // value, insert the value itself.
+ EpilogPN->addIncoming(V, EpilogLatch);
+
+ assert(EpilogPN->getBasicBlockIndex(EpilogPreHeader) >= 0 &&
+ "EpilogPN should have EpilogPreHeader incoming block");
+ // Change EpilogPreHeader incoming block to NewExit.
+ EpilogPN->setIncomingBlock(EpilogPN->getBasicBlockIndex(EpilogPreHeader),
+ NewExit);
+ // Now PHIs should look like:
+ // NewExit:
+ // PN = PHI [I, Latch], [undef, PreHeader]
+ // ...
+ // Exit:
+ // EpilogPN = PHI [PN, NewExit], [VMap[I], EpilogLatch]
+ }
+
+ // Create PHI nodes at NewExit (from the unrolling loop Latch and PreHeader).
+ // Update corresponding PHI nodes in epilog loop.
+ for (BasicBlock *Succ : successors(Latch)) {
+ // Skip this as we already updated phis in exit blocks.
+ if (!L->contains(Succ))
+ continue;
+ for (PHINode &PN : Succ->phis()) {
+ // Add new PHI nodes to the loop exit block and update epilog
+ // PHIs with the new PHI values.
+ PHINode *NewPN = PHINode::Create(PN.getType(), 2, PN.getName() + ".unr",
+ NewExit->getFirstNonPHI());
+ // Adding a value to the new PHI node from the unrolling loop preheader.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(NewPreHeader), PreHeader);
+ // Adding a value to the new PHI node from the unrolling loop latch.
+ NewPN->addIncoming(PN.getIncomingValueForBlock(Latch), Latch);
+
+ // Update the existing PHI node operand with the value from the new PHI
+ // node. Corresponding instruction in epilog loop should be PHI.
+ PHINode *VPN = cast<PHINode>(VMap[&PN]);
+ VPN->setIncomingValueForBlock(EpilogPreHeader, NewPN);
+ }
+ }
+
+ Instruction *InsertPt = NewExit->getTerminator();
+ IRBuilder<> B(InsertPt);
+ Value *BrLoopExit = B.CreateIsNotNull(ModVal, "lcmp.mod");
+ assert(Exit && "Loop must have a single exit block only");
+ // Split the epilogue exit to maintain loop canonicalization guarantees
+ SmallVector<BasicBlock*, 4> Preds(predecessors(Exit));
+ SplitBlockPredecessors(Exit, Preds, ".epilog-lcssa", DT, LI, nullptr,
+ PreserveLCSSA);
+ // Add the branch to the exit block (around the unrolling loop)
+ B.CreateCondBr(BrLoopExit, EpilogPreHeader, Exit);
+ InsertPt->eraseFromParent();
+ if (DT)
+ DT->changeImmediateDominator(Exit, NewExit);
+
+ // Split the main loop exit to maintain canonicalization guarantees.
+ SmallVector<BasicBlock*, 4> NewExitPreds{Latch};
+ SplitBlockPredecessors(NewExit, NewExitPreds, ".loopexit", DT, LI, nullptr,
+ PreserveLCSSA);
+}
+
+/// Create a clone of the blocks in a loop and connect them together.
+/// If CreateRemainderLoop is false, loop structure will not be cloned,
+/// otherwise a new loop will be created including all cloned blocks, and its
+/// induction variable counts NewIter down to 0.
+/// The cloned blocks should be inserted between InsertTop and InsertBot.
+/// If loop structure is cloned InsertTop should be new preheader, InsertBot
+/// new loop exit.
+/// Return the new cloned loop that is created when CreateRemainderLoop is true.
+static Loop *
+CloneLoopBlocks(Loop *L, Value *NewIter, const bool CreateRemainderLoop,
+ const bool UseEpilogRemainder, const bool UnrollRemainder,
+ BasicBlock *InsertTop,
+ BasicBlock *InsertBot, BasicBlock *Preheader,
+ std::vector<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
+ ValueToValueMapTy &VMap, DominatorTree *DT, LoopInfo *LI) {
+ StringRef suffix = UseEpilogRemainder ? "epil" : "prol";
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ Function *F = Header->getParent();
+ LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
+ LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
+ Loop *ParentLoop = L->getParentLoop();
+ NewLoopsMap NewLoops;
+ NewLoops[ParentLoop] = ParentLoop;
+ if (!CreateRemainderLoop)
+ NewLoops[L] = ParentLoop;
+
+ // For each block in the original loop, create a new copy,
+ // and update the value map with the newly created values.
+ for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
+ BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
+ NewBlocks.push_back(NewBB);
+
+ // If we're unrolling the outermost loop, there's no remainder loop,
+ // and this block isn't in a nested loop, then the new block is not
+ // in any loop. Otherwise, add it to loopinfo.
+ if (CreateRemainderLoop || LI->getLoopFor(*BB) != L || ParentLoop)
+ addClonedBlockToLoopInfo(*BB, NewBB, LI, NewLoops);
+
+ VMap[*BB] = NewBB;
+ if (Header == *BB) {
+ // For the first block, add a CFG connection to this newly
+ // created block.
+ InsertTop->getTerminator()->setSuccessor(0, NewBB);
+ }
+
+ if (DT) {
+ if (Header == *BB) {
+ // The header is dominated by the preheader.
+ DT->addNewBlock(NewBB, InsertTop);
+ } else {
+ // Copy information from original loop to unrolled loop.
+ BasicBlock *IDomBB = DT->getNode(*BB)->getIDom()->getBlock();
+ DT->addNewBlock(NewBB, cast<BasicBlock>(VMap[IDomBB]));
+ }
+ }
+
+ if (Latch == *BB) {
+ // For the last block, if CreateRemainderLoop is false, create a direct
+ // jump to InsertBot. If not, create a loop back to cloned head.
+ VMap.erase((*BB)->getTerminator());
+ BasicBlock *FirstLoopBB = cast<BasicBlock>(VMap[Header]);
+ BranchInst *LatchBR = cast<BranchInst>(NewBB->getTerminator());
+ IRBuilder<> Builder(LatchBR);
+ if (!CreateRemainderLoop) {
+ Builder.CreateBr(InsertBot);
+ } else {
+ PHINode *NewIdx = PHINode::Create(NewIter->getType(), 2,
+ suffix + ".iter",
+ FirstLoopBB->getFirstNonPHI());
+ Value *IdxSub =
+ Builder.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".sub");
+ Value *IdxCmp =
+ Builder.CreateIsNotNull(IdxSub, NewIdx->getName() + ".cmp");
+ Builder.CreateCondBr(IdxCmp, FirstLoopBB, InsertBot);
+ NewIdx->addIncoming(NewIter, InsertTop);
+ NewIdx->addIncoming(IdxSub, NewBB);
+ }
+ LatchBR->eraseFromParent();
+ }
+ }
+
+ // Change the incoming values to the ones defined in the preheader or
+ // cloned loop.
+ for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+ PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
+ if (!CreateRemainderLoop) {
+ if (UseEpilogRemainder) {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ NewPHI->removeIncomingValue(Latch, false);
+ } else {
+ VMap[&*I] = NewPHI->getIncomingValueForBlock(Preheader);
+ cast<BasicBlock>(VMap[Header])->getInstList().erase(NewPHI);
+ }
+ } else {
+ unsigned idx = NewPHI->getBasicBlockIndex(Preheader);
+ NewPHI->setIncomingBlock(idx, InsertTop);
+ BasicBlock *NewLatch = cast<BasicBlock>(VMap[Latch]);
+ idx = NewPHI->getBasicBlockIndex(Latch);
+ Value *InVal = NewPHI->getIncomingValue(idx);
+ NewPHI->setIncomingBlock(idx, NewLatch);
+ if (Value *V = VMap.lookup(InVal))
+ NewPHI->setIncomingValue(idx, V);
+ }
+ }
+ if (CreateRemainderLoop) {
+ Loop *NewLoop = NewLoops[L];
+ assert(NewLoop && "L should have been cloned");
+ MDNode *LoopID = NewLoop->getLoopID();
+
+ // Only add loop metadata if the loop is not going to be completely
+ // unrolled.
+ if (UnrollRemainder)
+ return NewLoop;
+
+ Optional<MDNode *> NewLoopID = makeFollowupLoopID(
+ LoopID, {LLVMLoopUnrollFollowupAll, LLVMLoopUnrollFollowupRemainder});
+ if (NewLoopID.hasValue()) {
+ NewLoop->setLoopID(NewLoopID.getValue());
+
+ // Do not setLoopAlreadyUnrolled if loop attributes have been defined
+ // explicitly.
+ return NewLoop;
+ }
+
+ // Add unroll disable metadata to disable future unrolling for this loop.
+ NewLoop->setLoopAlreadyUnrolled();
+ return NewLoop;
+ }
+ else
+ return nullptr;
+}
+
+/// Returns true if we can safely unroll a multi-exit/exiting loop. OtherExits
+/// is populated with all the loop exit blocks other than the LatchExit block.
+static bool canSafelyUnrollMultiExitLoop(Loop *L, BasicBlock *LatchExit,
+ bool PreserveLCSSA,
+ bool UseEpilogRemainder) {
+
+  // We currently have some correctness constraints in unrolling a multi-exit
+ // loop. Check for these below.
+
+ // We rely on LCSSA form being preserved when the exit blocks are transformed.
+ if (!PreserveLCSSA)
+ return false;
+
+ // TODO: Support multiple exiting blocks jumping to the `LatchExit` when
+ // UnrollRuntimeMultiExit is true. This will need updating the logic in
+ // connectEpilog/connectProlog.
+ if (!LatchExit->getSinglePredecessor()) {
+ LLVM_DEBUG(
+ dbgs() << "Bailout for multi-exit handling when latch exit has >1 "
+ "predecessor.\n");
+ return false;
+ }
+ // FIXME: We bail out of multi-exit unrolling when epilog loop is generated
+ // and L is an inner loop. This is because in presence of multiple exits, the
+ // outer loop is incorrect: we do not add the EpilogPreheader and exit to the
+ // outer loop. This is automatically handled in the prolog case, so we do not
+ // have that bug in prolog generation.
+ if (UseEpilogRemainder && L->getParentLoop())
+ return false;
+
+ // All constraints have been satisfied.
+ return true;
+}
+
+/// Returns true if we can profitably unroll the multi-exit loop L. Currently,
+/// we return true only if UnrollRuntimeMultiExit is set to true.
+static bool canProfitablyUnrollMultiExitLoop(
+ Loop *L, SmallVectorImpl<BasicBlock *> &OtherExits, BasicBlock *LatchExit,
+ bool PreserveLCSSA, bool UseEpilogRemainder) {
+
+#if !defined(NDEBUG)
+ assert(canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder) &&
+ "Should be safe to unroll before checking profitability!");
+#endif
+
+ // Priority goes to UnrollRuntimeMultiExit if it's supplied.
+ if (UnrollRuntimeMultiExit.getNumOccurrences())
+ return UnrollRuntimeMultiExit;
+
+ // The main pain point with multi-exit loop unrolling is that once unrolled,
+  // we will not be able to merge all blocks into straight-line code.
+ // There are branches within the unrolled loop that go to the OtherExits.
+ // The second point is the increase in code size, but this is true
+ // irrespective of multiple exits.
+
+ // Note: Both the heuristics below are coarse grained. We are essentially
+ // enabling unrolling of loops that have a single side exit other than the
+ // normal LatchExit (i.e. exiting into a deoptimize block).
+ // The heuristics considered are:
+ // 1. low number of branches in the unrolled version.
+ // 2. high predictability of these extra branches.
+ // We avoid unrolling loops that have more than two exiting blocks. This
+  // limits the total number of branches in the unrolled loop to be at most
+ // the unroll factor (since one of the exiting blocks is the latch block).
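+  // For example (illustrative): a loop whose only non-latch exit leads to a
+  // block terminated by a call to @llvm.experimental.deoptimize passes both
+  // heuristics below and is considered profitable to unroll.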
+ SmallVector<BasicBlock*, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ if (ExitingBlocks.size() > 2)
+ return false;
+
+ // The second heuristic is that L has one exit other than the latchexit and
+ // that exit is a deoptimize block. We know that deoptimize blocks are rarely
+ // taken, which also implies the branch leading to the deoptimize block is
+ // highly predictable.
+ return (OtherExits.size() == 1 &&
+ OtherExits[0]->getTerminatingDeoptimizeCall());
+ // TODO: These can be fine-tuned further to consider code size or deopt states
+ // that are captured by the deoptimize exit block.
+ // Also, we can extend this to support more cases, if we actually
+ // know of kinds of multiexit loops that would benefit from unrolling.
+}
+
// Assign the maximum possible trip count as the back edge weight for the
// remainder loop if the original loop comes with a branch weight.
static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
@@ -531,459 +531,459 @@ static void updateLatchBranchWeightsForRemainderLoop(Loop *OrigLoop,
}
}
-/// Insert code in the prolog/epilog code when unrolling a loop with a
-/// run-time trip-count.
-///
-/// This method assumes that the loop unroll factor is total number
-/// of loop bodies in the loop after unrolling. (Some folks refer
-/// to the unroll factor as the number of *extra* copies added).
-/// We assume also that the loop unroll factor is a power-of-two. So, after
-/// unrolling the loop, the number of loop bodies executed is 2,
-/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
-/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
-/// the switch instruction is generated.
-///
-/// ***Prolog case***
-/// extraiters = tripcount % loopfactor
-/// if (extraiters == 0) jump Loop:
-/// else jump Prol:
-/// Prol: LoopBody;
-/// extraiters -= 1 // Omitted if unroll factor is 2.
-/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
-/// if (tripcount < loopfactor) jump End:
-/// Loop:
-/// ...
-/// End:
-///
-/// ***Epilog case***
-/// extraiters = tripcount % loopfactor
-/// if (tripcount < loopfactor) jump LoopExit:
-/// unroll_iters = tripcount - extraiters
-/// Loop: LoopBody; (executes unroll_iter times);
-/// unroll_iter -= 1
-/// if (unroll_iter != 0) jump Loop:
-/// LoopExit:
-/// if (extraiters == 0) jump EpilExit:
-/// Epil: LoopBody; (executes extraiters times)
-/// extraiters -= 1 // Omitted if unroll factor is 2.
-/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
-/// EpilExit:
-
-bool llvm::UnrollRuntimeLoopRemainder(
- Loop *L, unsigned Count, bool AllowExpensiveTripCount,
- bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
- LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
- const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
- LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
- LLVM_DEBUG(L->dump());
- LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
- : dbgs() << "Using prolog remainder.\n");
-
- // Make sure the loop is in canonical form.
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
- return false;
- }
-
- // Guaranteed by LoopSimplifyForm.
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *Header = L->getHeader();
-
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
-
- if (!LatchBR || LatchBR->isUnconditional()) {
- // The loop-rotate pass can be helpful to avoid this in many cases.
- LLVM_DEBUG(
- dbgs()
- << "Loop latch not terminated by a conditional branch.\n");
- return false;
- }
-
- unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
- BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
-
- if (L->contains(LatchExit)) {
- // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
- // targets of the Latch be an exit block out of the loop.
- LLVM_DEBUG(
- dbgs()
- << "One of the loop latch successors must be the exit block.\n");
- return false;
- }
-
- // These are exit blocks other than the target of the latch exiting block.
- SmallVector<BasicBlock *, 4> OtherExits;
- L->getUniqueNonLatchExitBlocks(OtherExits);
- bool isMultiExitUnrollingEnabled =
- canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
- UseEpilogRemainder) &&
- canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
- UseEpilogRemainder);
- // Support only single exit and exiting block unless multi-exit loop unrolling is enabled.
- if (!isMultiExitUnrollingEnabled &&
- (!L->getExitingBlock() || OtherExits.size())) {
- LLVM_DEBUG(
- dbgs()
- << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
- "enabled!\n");
- return false;
- }
- // Use Scalar Evolution to compute the trip count. This allows more loops to
- // be unrolled than relying on induction var simplification.
- if (!SE)
- return false;
-
- // Only unroll loops with a computable trip count, and the trip count needs
- // to be an int value (allowing a pointer type is a TODO item).
- // We calculate the backedge count by using getExitCount on the Latch block,
- // which is proven to be the only exiting block in this loop. This is same as
- // calculating getBackedgeTakenCount on the loop (which computes SCEV for all
- // exiting blocks).
- const SCEV *BECountSC = SE->getExitCount(L, Latch);
- if (isa<SCEVCouldNotCompute>(BECountSC) ||
- !BECountSC->getType()->isIntegerTy()) {
- LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
- return false;
- }
-
- unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
-
- // Add 1 since the backedge count doesn't include the first loop iteration.
- const SCEV *TripCountSC =
- SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
- if (isa<SCEVCouldNotCompute>(TripCountSC)) {
- LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
- return false;
- }
-
- BasicBlock *PreHeader = L->getLoopPreheader();
- BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
- const DataLayout &DL = Header->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "loop-unroll");
- if (!AllowExpensiveTripCount &&
- Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
- TTI, PreHeaderBR)) {
- LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
- return false;
- }
-
- // This constraint lets us deal with an overflowing trip count easily; see the
- // comment on ModVal below.
- if (Log2_32(Count) > BEWidth) {
- LLVM_DEBUG(
- dbgs()
- << "Count failed constraint on overflow trip count calculation.\n");
- return false;
- }
-
- // Loop structure is the following:
- //
- // PreHeader
- // Header
- // ...
- // Latch
- // LatchExit
-
- BasicBlock *NewPreHeader;
- BasicBlock *NewExit = nullptr;
- BasicBlock *PrologExit = nullptr;
- BasicBlock *EpilogPreHeader = nullptr;
- BasicBlock *PrologPreHeader = nullptr;
-
- if (UseEpilogRemainder) {
- // If epilog remainder
- // Split PreHeader to insert a branch around loop for unrolling.
- NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
- NewPreHeader->setName(PreHeader->getName() + ".new");
- // Split LatchExit to create phi nodes from branch above.
- SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
- NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
- nullptr, PreserveLCSSA);
- // NewExit gets its DebugLoc from LatchExit, which is not part of the
- // original Loop.
- // Fix this by setting Loop's DebugLoc to NewExit.
- auto *NewExitTerminator = NewExit->getTerminator();
- NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
- // Split NewExit to insert epilog remainder loop.
- EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
- EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
- } else {
- // If prolog remainder
- // Split the original preheader twice to insert prolog remainder loop
- PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
- PrologPreHeader->setName(Header->getName() + ".prol.preheader");
- PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
- DT, LI);
- PrologExit->setName(Header->getName() + ".prol.loopexit");
- // Split PrologExit to get NewPreHeader.
- NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
- NewPreHeader->setName(PreHeader->getName() + ".new");
- }
- // Loop structure should be the following:
- // Epilog Prolog
- //
- // PreHeader PreHeader
- // *NewPreHeader *PrologPreHeader
- // Header *PrologExit
- // ... *NewPreHeader
- // Latch Header
- // *NewExit ...
- // *EpilogPreHeader Latch
- // LatchExit LatchExit
-
- // Calculate conditions for branch around loop for unrolling
- // in epilog case and around prolog remainder loop in prolog case.
- // Compute the number of extra iterations required, which is:
- // extra iterations = run-time trip count % loop unroll factor
- PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
- Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
- PreHeaderBR);
- Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
- PreHeaderBR);
- IRBuilder<> B(PreHeaderBR);
- Value *ModVal;
- // Calculate ModVal = (BECount + 1) % Count.
- // Note that TripCount is BECount + 1.
- if (isPowerOf2_32(Count)) {
- // When Count is power of 2 we don't BECount for epilog case, however we'll
- // need it for a branch around unrolling loop for prolog case.
- ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
- // 1. There are no iterations to be run in the prolog/epilog loop.
- // OR
- // 2. The addition computing TripCount overflowed.
- //
- // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
- // the number of iterations that remain to be run in the original loop is a
- // multiple Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth (we
- // explicitly check this above).
- } else {
- // As (BECount + 1) can potentially unsigned overflow we count
- // (BECount % Count) + 1 which is overflow safe as BECount % Count < Count.
- Value *ModValTmp = B.CreateURem(BECount,
- ConstantInt::get(BECount->getType(),
- Count));
- Value *ModValAdd = B.CreateAdd(ModValTmp,
- ConstantInt::get(ModValTmp->getType(), 1));
- // At that point (BECount % Count) + 1 could be equal to Count.
- // To handle this case we need to take mod by Count one more time.
- ModVal = B.CreateURem(ModValAdd,
- ConstantInt::get(BECount->getType(), Count),
- "xtraiter");
- }
- Value *BranchVal =
- UseEpilogRemainder ? B.CreateICmpULT(BECount,
- ConstantInt::get(BECount->getType(),
- Count - 1)) :
- B.CreateIsNotNull(ModVal, "lcmp.mod");
- BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
- BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
- // Branch to either remainder (extra iterations) loop or unrolling loop.
- B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
- PreHeaderBR->eraseFromParent();
- if (DT) {
- if (UseEpilogRemainder)
- DT->changeImmediateDominator(NewExit, PreHeader);
- else
- DT->changeImmediateDominator(PrologExit, PreHeader);
- }
- Function *F = Header->getParent();
- // Get an ordered list of blocks in the loop to help with the ordering of the
- // cloned blocks in the prolog/epilog code
- LoopBlocksDFS LoopBlocks(L);
- LoopBlocks.perform(LI);
-
- //
- // For each extra loop iteration, create a copy of the loop's basic blocks
- // and generate a condition that branches to the copy depending on the
- // number of 'left over' iterations.
- //
- std::vector<BasicBlock *> NewBlocks;
- ValueToValueMapTy VMap;
-
- // For unroll factor 2 remainder loop will have 1 iterations.
- // Do not create 1 iteration loop.
- bool CreateRemainderLoop = (Count != 2);
-
- // Clone all the basic blocks in the loop. If Count is 2, we don't clone
- // the loop, otherwise we create a cloned loop to execute the extra
- // iterations. This function adds the appropriate CFG connections.
- BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
- BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
- Loop *remainderLoop = CloneLoopBlocks(
- L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
- InsertTop, InsertBot,
- NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
-
+/// Insert code in the prolog/epilog code when unrolling a loop with a
+/// run-time trip-count.
+///
+/// This method assumes that the loop unroll factor is the total number
+/// of loop bodies in the loop after unrolling. (Some folks refer
+/// to the unroll factor as the number of *extra* copies added).
+/// We assume also that the loop unroll factor is a power-of-two. So, after
+/// unrolling the loop, the number of loop bodies executed is 2,
+/// 4, 8, etc. Note - LLVM converts the if-then-sequence to a switch
+/// instruction in SimplifyCFG.cpp. Then, the backend decides how code for
+/// the switch instruction is generated.
+///
+/// ***Prolog case***
+/// extraiters = tripcount % loopfactor
+/// if (extraiters == 0) jump Loop:
+/// else jump Prol:
+/// Prol: LoopBody;
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Prol: // Omitted if unroll factor is 2.
+/// if (tripcount < loopfactor) jump End:
+/// Loop:
+/// ...
+/// End:
+///
+/// ***Epilog case***
+/// extraiters = tripcount % loopfactor
+/// if (tripcount < loopfactor) jump LoopExit:
+///        unroll_iter = tripcount - extraiters
+/// Loop: LoopBody; (executes unroll_iter times);
+/// unroll_iter -= 1
+/// if (unroll_iter != 0) jump Loop:
+/// LoopExit:
+/// if (extraiters == 0) jump EpilExit:
+/// Epil: LoopBody; (executes extraiters times)
+/// extraiters -= 1 // Omitted if unroll factor is 2.
+/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
+/// EpilExit:
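For concreteness, a small illustrative sketch of the arithmetic the pseudocode above describes (not part of the patch; the names are invented for the example):

    // Illustrative only: how the remainder split works for a power-of-two factor.
    constexpr unsigned TripCount = 11, LoopFactor = 4;
    constexpr unsigned ExtraIters = TripCount % LoopFactor;  // 3
    constexpr unsigned UnrollIters = TripCount - ExtraIters; // 8
    static_assert(ExtraIters == 3 && UnrollIters == 8, "remainder split");
    // Epilog case: the unrolled loop runs 8 / 4 = 2 times, then the epilog runs
    // the body 3 more times. Prolog case: the prolog runs 3 times first, then
    // the unrolled loop runs 8 / 4 = 2 times.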
+
+bool llvm::UnrollRuntimeLoopRemainder(
+ Loop *L, unsigned Count, bool AllowExpensiveTripCount,
+ bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
+ LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
+ LLVM_DEBUG(L->dump());
+ LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
+ : dbgs() << "Using prolog remainder.\n");
+
+ // Make sure the loop is in canonical form.
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Not in simplify form!\n");
+ return false;
+ }
+
+ // Guaranteed by LoopSimplifyForm.
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *Header = L->getHeader();
+
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+
+ if (!LatchBR || LatchBR->isUnconditional()) {
+ // The loop-rotate pass can be helpful to avoid this in many cases.
+ LLVM_DEBUG(
+ dbgs()
+ << "Loop latch not terminated by a conditional branch.\n");
+ return false;
+ }
+
+ unsigned ExitIndex = LatchBR->getSuccessor(0) == Header ? 1 : 0;
+ BasicBlock *LatchExit = LatchBR->getSuccessor(ExitIndex);
+
+ if (L->contains(LatchExit)) {
+ // Cloning the loop basic blocks (`CloneLoopBlocks`) requires that one of the
+ // targets of the Latch be an exit block out of the loop.
+ LLVM_DEBUG(
+ dbgs()
+ << "One of the loop latch successors must be the exit block.\n");
+ return false;
+ }
+
+ // These are exit blocks other than the target of the latch exiting block.
+ SmallVector<BasicBlock *, 4> OtherExits;
+ L->getUniqueNonLatchExitBlocks(OtherExits);
+ bool isMultiExitUnrollingEnabled =
+ canSafelyUnrollMultiExitLoop(L, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder) &&
+ canProfitablyUnrollMultiExitLoop(L, OtherExits, LatchExit, PreserveLCSSA,
+ UseEpilogRemainder);
+  // Support only single exit and exiting block unless multi-exit loop
+  // unrolling is enabled.
+ if (!isMultiExitUnrollingEnabled &&
+ (!L->getExitingBlock() || OtherExits.size())) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Multiple exit/exiting blocks in loop and multi-exit unrolling not "
+ "enabled!\n");
+ return false;
+ }
+ // Use Scalar Evolution to compute the trip count. This allows more loops to
+ // be unrolled than relying on induction var simplification.
+ if (!SE)
+ return false;
+
+ // Only unroll loops with a computable trip count, and the trip count needs
+ // to be an int value (allowing a pointer type is a TODO item).
+ // We calculate the backedge count by using getExitCount on the Latch block,
+  // which is proven to be the only exiting block in this loop. This is the
+  // same as calculating getBackedgeTakenCount on the loop (which computes
+  // SCEV for all exiting blocks).
+ const SCEV *BECountSC = SE->getExitCount(L, Latch);
+ if (isa<SCEVCouldNotCompute>(BECountSC) ||
+ !BECountSC->getType()->isIntegerTy()) {
+ LLVM_DEBUG(dbgs() << "Could not compute exit block SCEV\n");
+ return false;
+ }
+
+ unsigned BEWidth = cast<IntegerType>(BECountSC->getType())->getBitWidth();
+
+ // Add 1 since the backedge count doesn't include the first loop iteration.
+ const SCEV *TripCountSC =
+ SE->getAddExpr(BECountSC, SE->getConstant(BECountSC->getType(), 1));
+ if (isa<SCEVCouldNotCompute>(TripCountSC)) {
+ LLVM_DEBUG(dbgs() << "Could not compute trip count SCEV.\n");
+ return false;
+ }
+
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ const DataLayout &DL = Header->getModule()->getDataLayout();
+ SCEVExpander Expander(*SE, DL, "loop-unroll");
+ if (!AllowExpensiveTripCount &&
+ Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR)) {
+ LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
+ return false;
+ }
+
+ // This constraint lets us deal with an overflowing trip count easily; see the
+ // comment on ModVal below.
+ if (Log2_32(Count) > BEWidth) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Count failed constraint on overflow trip count calculation.\n");
+ return false;
+ }
+
+ // Loop structure is the following:
+ //
+ // PreHeader
+ // Header
+ // ...
+ // Latch
+ // LatchExit
+
+ BasicBlock *NewPreHeader;
+ BasicBlock *NewExit = nullptr;
+ BasicBlock *PrologExit = nullptr;
+ BasicBlock *EpilogPreHeader = nullptr;
+ BasicBlock *PrologPreHeader = nullptr;
+
+ if (UseEpilogRemainder) {
+ // If epilog remainder
+ // Split PreHeader to insert a branch around loop for unrolling.
+ NewPreHeader = SplitBlock(PreHeader, PreHeader->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ // Split LatchExit to create phi nodes from branch above.
+ SmallVector<BasicBlock*, 4> Preds(predecessors(LatchExit));
+ NewExit = SplitBlockPredecessors(LatchExit, Preds, ".unr-lcssa", DT, LI,
+ nullptr, PreserveLCSSA);
+ // NewExit gets its DebugLoc from LatchExit, which is not part of the
+ // original Loop.
+    // Fix this by setting NewExit's DebugLoc to that of the loop header.
+ auto *NewExitTerminator = NewExit->getTerminator();
+ NewExitTerminator->setDebugLoc(Header->getTerminator()->getDebugLoc());
+ // Split NewExit to insert epilog remainder loop.
+ EpilogPreHeader = SplitBlock(NewExit, NewExitTerminator, DT, LI);
+ EpilogPreHeader->setName(Header->getName() + ".epil.preheader");
+ } else {
+ // If prolog remainder
+ // Split the original preheader twice to insert prolog remainder loop
+ PrologPreHeader = SplitEdge(PreHeader, Header, DT, LI);
+ PrologPreHeader->setName(Header->getName() + ".prol.preheader");
+ PrologExit = SplitBlock(PrologPreHeader, PrologPreHeader->getTerminator(),
+ DT, LI);
+ PrologExit->setName(Header->getName() + ".prol.loopexit");
+ // Split PrologExit to get NewPreHeader.
+ NewPreHeader = SplitBlock(PrologExit, PrologExit->getTerminator(), DT, LI);
+ NewPreHeader->setName(PreHeader->getName() + ".new");
+ }
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // *NewPreHeader *PrologPreHeader
+ // Header *PrologExit
+ // ... *NewPreHeader
+ // Latch Header
+ // *NewExit ...
+ // *EpilogPreHeader Latch
+ // LatchExit LatchExit
+
+ // Calculate conditions for branch around loop for unrolling
+ // in epilog case and around prolog remainder loop in prolog case.
+ // Compute the number of extra iterations required, which is:
+ // extra iterations = run-time trip count % loop unroll factor
+ PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+ Value *TripCount = Expander.expandCodeFor(TripCountSC, TripCountSC->getType(),
+ PreHeaderBR);
+ Value *BECount = Expander.expandCodeFor(BECountSC, BECountSC->getType(),
+ PreHeaderBR);
+ IRBuilder<> B(PreHeaderBR);
+ Value *ModVal;
+ // Calculate ModVal = (BECount + 1) % Count.
+ // Note that TripCount is BECount + 1.
+ if (isPowerOf2_32(Count)) {
+    // When Count is a power of 2 we don't need BECount for the epilog case;
+    // however, we'll need it for a branch around the unrolling loop in the
+    // prolog case.
+ ModVal = B.CreateAnd(TripCount, Count - 1, "xtraiter");
+ // 1. There are no iterations to be run in the prolog/epilog loop.
+ // OR
+ // 2. The addition computing TripCount overflowed.
+ //
+ // If (2) is true, we know that TripCount really is (1 << BEWidth) and so
+    // the number of iterations that remain to be run in the original loop is
+    // a multiple of Count == (1 << Log2(Count)) because Log2(Count) <= BEWidth
+    // (we explicitly check this above).
+ } else {
+    // As (BECount + 1) can potentially overflow as an unsigned value, we
+    // compute (BECount % Count) + 1 instead, which is overflow safe because
+    // BECount % Count < Count.
+ Value *ModValTmp = B.CreateURem(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count));
+ Value *ModValAdd = B.CreateAdd(ModValTmp,
+ ConstantInt::get(ModValTmp->getType(), 1));
+ // At that point (BECount % Count) + 1 could be equal to Count.
+ // To handle this case we need to take mod by Count one more time.
+ ModVal = B.CreateURem(ModValAdd,
+ ConstantInt::get(BECount->getType(), Count),
+ "xtraiter");
+ }
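As a side note, a minimal standalone sketch (assumed names, not from the patch) of why the two URem instructions built above are equivalent to (BECount + 1) % Count while staying overflow safe:

    #include <cstdint>

    // Rem < Count, so Rem + 1 cannot wrap; the second modulo folds the
    // Rem + 1 == Count case back to 0, matching (BECount + 1) % Count.
    uint64_t xtraIter(uint64_t BECount, uint64_t Count) {
      uint64_t Rem = BECount % Count;
      return (Rem + 1) % Count;
    }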
+ Value *BranchVal =
+ UseEpilogRemainder ? B.CreateICmpULT(BECount,
+ ConstantInt::get(BECount->getType(),
+ Count - 1)) :
+ B.CreateIsNotNull(ModVal, "lcmp.mod");
+ BasicBlock *RemainderLoop = UseEpilogRemainder ? NewExit : PrologPreHeader;
+ BasicBlock *UnrollingLoop = UseEpilogRemainder ? NewPreHeader : PrologExit;
+ // Branch to either remainder (extra iterations) loop or unrolling loop.
+ B.CreateCondBr(BranchVal, RemainderLoop, UnrollingLoop);
+ PreHeaderBR->eraseFromParent();
+ if (DT) {
+ if (UseEpilogRemainder)
+ DT->changeImmediateDominator(NewExit, PreHeader);
+ else
+ DT->changeImmediateDominator(PrologExit, PreHeader);
+ }
+ Function *F = Header->getParent();
+ // Get an ordered list of blocks in the loop to help with the ordering of the
+ // cloned blocks in the prolog/epilog code
+ LoopBlocksDFS LoopBlocks(L);
+ LoopBlocks.perform(LI);
+
+ //
+ // For each extra loop iteration, create a copy of the loop's basic blocks
+ // and generate a condition that branches to the copy depending on the
+ // number of 'left over' iterations.
+ //
+ std::vector<BasicBlock *> NewBlocks;
+ ValueToValueMapTy VMap;
+
+  // For an unroll factor of 2 the remainder loop will have exactly 1
+  // iteration, so do not create a 1-iteration loop.
+ bool CreateRemainderLoop = (Count != 2);
+
+ // Clone all the basic blocks in the loop. If Count is 2, we don't clone
+ // the loop, otherwise we create a cloned loop to execute the extra
+ // iterations. This function adds the appropriate CFG connections.
+ BasicBlock *InsertBot = UseEpilogRemainder ? LatchExit : PrologExit;
+ BasicBlock *InsertTop = UseEpilogRemainder ? EpilogPreHeader : PrologPreHeader;
+ Loop *remainderLoop = CloneLoopBlocks(
+ L, ModVal, CreateRemainderLoop, UseEpilogRemainder, UnrollRemainder,
+ InsertTop, InsertBot,
+ NewPreHeader, NewBlocks, LoopBlocks, VMap, DT, LI);
+
// Assign the maximum possible trip count as the back edge weight for the
// remainder loop if the original loop comes with a branch weight.
if (remainderLoop && !UnrollRemainder)
updateLatchBranchWeightsForRemainderLoop(L, remainderLoop, Count);
- // Insert the cloned blocks into the function.
- F->getBasicBlockList().splice(InsertBot->getIterator(),
- F->getBasicBlockList(),
- NewBlocks[0]->getIterator(),
- F->end());
-
- // Now the loop blocks are cloned and the other exiting blocks from the
- // remainder are connected to the original Loop's exit blocks. The remaining
- // work is to update the phi nodes in the original loop, and take in the
- // values from the cloned region.
- for (auto *BB : OtherExits) {
- for (auto &II : *BB) {
-
- // Given we preserve LCSSA form, we know that the values used outside the
- // loop will be used through these phi nodes at the exit blocks that are
- // transformed below.
- if (!isa<PHINode>(II))
- break;
- PHINode *Phi = cast<PHINode>(&II);
- unsigned oldNumOperands = Phi->getNumIncomingValues();
- // Add the incoming values from the remainder code to the end of the phi
- // node.
- for (unsigned i =0; i < oldNumOperands; i++){
- Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
- // newVal can be a constant or derived from values outside the loop, and
- // hence need not have a VMap value. Also, since lookup already generated
- // a default "null" VMap entry for this value, we need to populate that
- // VMap entry correctly, with the mapped entry being itself.
- if (!newVal) {
- newVal = Phi->getIncomingValue(i);
- VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
- }
- Phi->addIncoming(newVal,
- cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
- }
- }
-#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
- for (BasicBlock *SuccBB : successors(BB)) {
- assert(!(any_of(OtherExits,
- [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
- SuccBB == LatchExit) &&
- "Breaks the definition of dedicated exits!");
- }
-#endif
- }
-
- // Update the immediate dominator of the exit blocks and blocks that are
- // reachable from the exit blocks. This is needed because we now have paths
- // from both the original loop and the remainder code reaching the exit
- // blocks. While the IDom of these exit blocks were from the original loop,
- // now the IDom is the preheader (which decides whether the original loop or
- // remainder code should run).
- if (DT && !L->getExitingBlock()) {
- SmallVector<BasicBlock *, 16> ChildrenToUpdate;
- // NB! We have to examine the dom children of all loop blocks, not just
- // those which are the IDom of the exit blocks. This is because blocks
- // reachable from the exit blocks can have their IDom as the nearest common
- // dominator of the exit blocks.
- for (auto *BB : L->blocks()) {
- auto *DomNodeBB = DT->getNode(BB);
- for (auto *DomChild : DomNodeBB->children()) {
- auto *DomChildBB = DomChild->getBlock();
- if (!L->contains(LI->getLoopFor(DomChildBB)))
- ChildrenToUpdate.push_back(DomChildBB);
- }
- }
- for (auto *BB : ChildrenToUpdate)
- DT->changeImmediateDominator(BB, PreHeader);
- }
-
- // Loop structure should be the following:
- // Epilog Prolog
- //
- // PreHeader PreHeader
- // NewPreHeader PrologPreHeader
- // Header PrologHeader
- // ... ...
- // Latch PrologLatch
- // NewExit PrologExit
- // EpilogPreHeader NewPreHeader
- // EpilogHeader Header
- // ... ...
- // EpilogLatch Latch
- // LatchExit LatchExit
-
- // Rewrite the cloned instruction operands to use the values created when the
- // clone is created.
- for (BasicBlock *BB : NewBlocks) {
- for (Instruction &I : *BB) {
- RemapInstruction(&I, VMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- }
- }
-
- if (UseEpilogRemainder) {
- // Connect the epilog code to the original loop and update the
- // PHI functions.
- ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
- EpilogPreHeader, NewPreHeader, VMap, DT, LI,
- PreserveLCSSA);
-
- // Update counter in loop for unrolling.
- // I should be multiply of Count.
- IRBuilder<> B2(NewPreHeader->getTerminator());
- Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
- BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
- B2.SetInsertPoint(LatchBR);
- PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
- Header->getFirstNonPHI());
- Value *IdxSub =
- B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
- NewIdx->getName() + ".nsub");
- Value *IdxCmp;
- if (LatchBR->getSuccessor(0) == Header)
- IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
- else
- IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
- NewIdx->addIncoming(TestVal, NewPreHeader);
- NewIdx->addIncoming(IdxSub, Latch);
- LatchBR->setCondition(IdxCmp);
- } else {
- // Connect the prolog code to the original loop and update the
- // PHI functions.
- ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
- NewPreHeader, VMap, DT, LI, PreserveLCSSA);
- }
-
- // If this loop is nested, then the loop unroller changes the code in the any
- // of its parent loops, so the Scalar Evolution pass needs to be run again.
- SE->forgetTopmostLoop(L);
-
- // Verify that the Dom Tree is correct.
-#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
- if (DT)
- assert(DT->verify(DominatorTree::VerificationLevel::Full));
-#endif
-
- // Canonicalize to LoopSimplifyForm both original and remainder loops. We
- // cannot rely on the LoopUnrollPass to do this because it only does
- // canonicalization for parent/subloops and not the sibling loops.
- if (OtherExits.size() > 0) {
- // Generate dedicated exit blocks for the original loop, to preserve
- // LoopSimplifyForm.
- formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
- // Generate dedicated exit blocks for the remainder loop if one exists, to
- // preserve LoopSimplifyForm.
- if (remainderLoop)
- formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
- }
-
- auto UnrollResult = LoopUnrollResult::Unmodified;
- if (remainderLoop && UnrollRemainder) {
- LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
- UnrollResult =
- UnrollLoop(remainderLoop,
- {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
- /*Force*/ false, /*AllowRuntime*/ false,
- /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
- /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
- /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
- LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
- }
-
- if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
- *ResultLoop = remainderLoop;
- NumRuntimeUnrolled++;
- return true;
-}
+ // Insert the cloned blocks into the function.
+ F->getBasicBlockList().splice(InsertBot->getIterator(),
+ F->getBasicBlockList(),
+ NewBlocks[0]->getIterator(),
+ F->end());
+
+ // Now the loop blocks are cloned and the other exiting blocks from the
+ // remainder are connected to the original Loop's exit blocks. The remaining
+ // work is to update the phi nodes in the original loop, and take in the
+ // values from the cloned region.
+ for (auto *BB : OtherExits) {
+ for (auto &II : *BB) {
+
+ // Given we preserve LCSSA form, we know that the values used outside the
+ // loop will be used through these phi nodes at the exit blocks that are
+ // transformed below.
+ if (!isa<PHINode>(II))
+ break;
+ PHINode *Phi = cast<PHINode>(&II);
+ unsigned oldNumOperands = Phi->getNumIncomingValues();
+ // Add the incoming values from the remainder code to the end of the phi
+ // node.
+      for (unsigned i = 0; i < oldNumOperands; i++) {
+ Value *newVal = VMap.lookup(Phi->getIncomingValue(i));
+ // newVal can be a constant or derived from values outside the loop, and
+ // hence need not have a VMap value. Also, since lookup already generated
+ // a default "null" VMap entry for this value, we need to populate that
+ // VMap entry correctly, with the mapped entry being itself.
+ if (!newVal) {
+ newVal = Phi->getIncomingValue(i);
+ VMap[Phi->getIncomingValue(i)] = Phi->getIncomingValue(i);
+ }
+ Phi->addIncoming(newVal,
+ cast<BasicBlock>(VMap[Phi->getIncomingBlock(i)]));
+ }
+ }
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+ for (BasicBlock *SuccBB : successors(BB)) {
+ assert(!(any_of(OtherExits,
+ [SuccBB](BasicBlock *EB) { return EB == SuccBB; }) ||
+ SuccBB == LatchExit) &&
+ "Breaks the definition of dedicated exits!");
+ }
+#endif
+ }
+
+ // Update the immediate dominator of the exit blocks and blocks that are
+ // reachable from the exit blocks. This is needed because we now have paths
+ // from both the original loop and the remainder code reaching the exit
+ // blocks. While the IDom of these exit blocks were from the original loop,
+ // now the IDom is the preheader (which decides whether the original loop or
+ // remainder code should run).
+ if (DT && !L->getExitingBlock()) {
+ SmallVector<BasicBlock *, 16> ChildrenToUpdate;
+ // NB! We have to examine the dom children of all loop blocks, not just
+ // those which are the IDom of the exit blocks. This is because blocks
+ // reachable from the exit blocks can have their IDom as the nearest common
+ // dominator of the exit blocks.
+ for (auto *BB : L->blocks()) {
+ auto *DomNodeBB = DT->getNode(BB);
+ for (auto *DomChild : DomNodeBB->children()) {
+ auto *DomChildBB = DomChild->getBlock();
+ if (!L->contains(LI->getLoopFor(DomChildBB)))
+ ChildrenToUpdate.push_back(DomChildBB);
+ }
+ }
+ for (auto *BB : ChildrenToUpdate)
+ DT->changeImmediateDominator(BB, PreHeader);
+ }
+
+ // Loop structure should be the following:
+ // Epilog Prolog
+ //
+ // PreHeader PreHeader
+ // NewPreHeader PrologPreHeader
+ // Header PrologHeader
+ // ... ...
+ // Latch PrologLatch
+ // NewExit PrologExit
+ // EpilogPreHeader NewPreHeader
+ // EpilogHeader Header
+ // ... ...
+ // EpilogLatch Latch
+ // LatchExit LatchExit
+
+ // Rewrite the cloned instruction operands to use the values created when the
+ // clone is created.
+ for (BasicBlock *BB : NewBlocks) {
+ for (Instruction &I : *BB) {
+ RemapInstruction(&I, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ }
+ }
+
+ if (UseEpilogRemainder) {
+ // Connect the epilog code to the original loop and update the
+ // PHI functions.
+ ConnectEpilog(L, ModVal, NewExit, LatchExit, PreHeader,
+ EpilogPreHeader, NewPreHeader, VMap, DT, LI,
+ PreserveLCSSA);
+
+    // Update the iteration counter in the unrolled loop; its value should be
+    // a multiple of Count.
+ IRBuilder<> B2(NewPreHeader->getTerminator());
+ Value *TestVal = B2.CreateSub(TripCount, ModVal, "unroll_iter");
+ BranchInst *LatchBR = cast<BranchInst>(Latch->getTerminator());
+ B2.SetInsertPoint(LatchBR);
+ PHINode *NewIdx = PHINode::Create(TestVal->getType(), 2, "niter",
+ Header->getFirstNonPHI());
+ Value *IdxSub =
+ B2.CreateSub(NewIdx, ConstantInt::get(NewIdx->getType(), 1),
+ NewIdx->getName() + ".nsub");
+ Value *IdxCmp;
+ if (LatchBR->getSuccessor(0) == Header)
+ IdxCmp = B2.CreateIsNotNull(IdxSub, NewIdx->getName() + ".ncmp");
+ else
+ IdxCmp = B2.CreateIsNull(IdxSub, NewIdx->getName() + ".ncmp");
+ NewIdx->addIncoming(TestVal, NewPreHeader);
+ NewIdx->addIncoming(IdxSub, Latch);
+ LatchBR->setCondition(IdxCmp);
+ } else {
+ // Connect the prolog code to the original loop and update the
+ // PHI functions.
+ ConnectProlog(L, BECount, Count, PrologExit, LatchExit, PreHeader,
+ NewPreHeader, VMap, DT, LI, PreserveLCSSA);
+ }
+
+  // If this loop is nested, then the loop unroller changes the code in any of
+  // its parent loops, so the Scalar Evolution pass needs to be run again.
+ SE->forgetTopmostLoop(L);
+
+ // Verify that the Dom Tree is correct.
+#if defined(EXPENSIVE_CHECKS) && !defined(NDEBUG)
+ if (DT)
+ assert(DT->verify(DominatorTree::VerificationLevel::Full));
+#endif
+
+ // Canonicalize to LoopSimplifyForm both original and remainder loops. We
+ // cannot rely on the LoopUnrollPass to do this because it only does
+ // canonicalization for parent/subloops and not the sibling loops.
+ if (OtherExits.size() > 0) {
+ // Generate dedicated exit blocks for the original loop, to preserve
+ // LoopSimplifyForm.
+ formDedicatedExitBlocks(L, DT, LI, nullptr, PreserveLCSSA);
+ // Generate dedicated exit blocks for the remainder loop if one exists, to
+ // preserve LoopSimplifyForm.
+ if (remainderLoop)
+ formDedicatedExitBlocks(remainderLoop, DT, LI, nullptr, PreserveLCSSA);
+ }
+
+ auto UnrollResult = LoopUnrollResult::Unmodified;
+ if (remainderLoop && UnrollRemainder) {
+ LLVM_DEBUG(dbgs() << "Unrolling remainder loop\n");
+ UnrollResult =
+ UnrollLoop(remainderLoop,
+ {/*Count*/ Count - 1, /*TripCount*/ Count - 1,
+ /*Force*/ false, /*AllowRuntime*/ false,
+ /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
+ /*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
+ /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
+ LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
+ }
+
+ if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
+ *ResultLoop = remainderLoop;
+ NumRuntimeUnrolled++;
+ return true;
+}
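To make the transformation above easier to picture, here is a rough source-level equivalent of the epilog form it produces, assuming an unroll factor of 4 and a trivial loop body; this sketch is illustrative only and does not appear in the patch:

    // Epilog-style runtime unrolling by 4 of "for (i = 0; i < N; ++i) A[i] += 1;"
    void unrolledWithEpilog(int *A, unsigned N) {
      unsigned XtraIter = N & 3;            // N % 4: epilog trip count
      unsigned I = 0;
      if (N >= 4) {                         // branch around the unrolled loop
        unsigned UnrollIter = N - XtraIter; // multiple of 4
        for (; I < UnrollIter; I += 4) {    // body duplicated 4 times
          A[I] += 1; A[I + 1] += 1; A[I + 2] += 1; A[I + 3] += 1;
        }
      }
      for (; I < N; ++I)                    // epilog: runs XtraIter times
        A[I] += 1;
    }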
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
index a220f9d25a..f0f423e981 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopUtils.cpp
@@ -1,307 +1,307 @@
-//===-- LoopUtils.cpp - Loop Utility functions -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines common loop utility functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PriorityWorklist.h"
-#include "llvm/ADT/ScopeExit.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/DomTreeUpdater.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/MustExecute.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-static cl::opt<bool> ForceReductionIntrinsic(
- "force-reduction-intrinsics", cl::Hidden,
- cl::desc("Force creating reduction intrinsics for testing."),
- cl::init(false));
-
-#define DEBUG_TYPE "loop-utils"
-
-static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
-static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
+//===-- LoopUtils.cpp - Loop Utility functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines common loop utility functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+static cl::opt<bool> ForceReductionIntrinsic(
+ "force-reduction-intrinsics", cl::Hidden,
+ cl::desc("Force creating reduction intrinsics for testing."),
+ cl::init(false));
+
+#define DEBUG_TYPE "loop-utils"
+
+static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
+static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
static const char *LLVMLoopMustProgress = "llvm.loop.mustprogress";
-
-bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
- MemorySSAUpdater *MSSAU,
- bool PreserveLCSSA) {
- bool Changed = false;
-
- // We re-use a vector for the in-loop predecesosrs.
- SmallVector<BasicBlock *, 4> InLoopPredecessors;
-
- auto RewriteExit = [&](BasicBlock *BB) {
- assert(InLoopPredecessors.empty() &&
- "Must start with an empty predecessors list!");
- auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
-
- // See if there are any non-loop predecessors of this exit block and
- // keep track of the in-loop predecessors.
- bool IsDedicatedExit = true;
- for (auto *PredBB : predecessors(BB))
- if (L->contains(PredBB)) {
- if (isa<IndirectBrInst>(PredBB->getTerminator()))
- // We cannot rewrite exiting edges from an indirectbr.
- return false;
- if (isa<CallBrInst>(PredBB->getTerminator()))
- // We cannot rewrite exiting edges from a callbr.
- return false;
-
- InLoopPredecessors.push_back(PredBB);
- } else {
- IsDedicatedExit = false;
- }
-
- assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
-
- // Nothing to do if this is already a dedicated exit.
- if (IsDedicatedExit)
- return false;
-
- auto *NewExitBB = SplitBlockPredecessors(
- BB, InLoopPredecessors, ".loopexit", DT, LI, MSSAU, PreserveLCSSA);
-
- if (!NewExitBB)
- LLVM_DEBUG(
- dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
- << *L << "\n");
- else
- LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
- << NewExitBB->getName() << "\n");
- return true;
- };
-
- // Walk the exit blocks directly rather than building up a data structure for
- // them, but only visit each one once.
- SmallPtrSet<BasicBlock *, 4> Visited;
- for (auto *BB : L->blocks())
- for (auto *SuccBB : successors(BB)) {
- // We're looking for exit blocks so skip in-loop successors.
- if (L->contains(SuccBB))
- continue;
-
- // Visit each exit block exactly once.
- if (!Visited.insert(SuccBB).second)
- continue;
-
- Changed |= RewriteExit(SuccBB);
- }
-
- return Changed;
-}
-
-/// Returns the instructions that use values defined in the loop.
-SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
- SmallVector<Instruction *, 8> UsedOutside;
-
- for (auto *Block : L->getBlocks())
- // FIXME: I believe that this could use copy_if if the Inst reference could
- // be adapted into a pointer.
- for (auto &Inst : *Block) {
- auto Users = Inst.users();
- if (any_of(Users, [&](User *U) {
- auto *Use = cast<Instruction>(U);
- return !L->contains(Use->getParent());
- }))
- UsedOutside.push_back(&Inst);
- }
-
- return UsedOutside;
-}
-
-void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
- // By definition, all loop passes need the LoopInfo analysis and the
- // Dominator tree it depends on. Because they all participate in the loop
- // pass manager, they must also preserve these.
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
-
- // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
- // here because users shouldn't directly get them from this header.
- extern char &LoopSimplifyID;
- extern char &LCSSAID;
- AU.addRequiredID(LoopSimplifyID);
- AU.addPreservedID(LoopSimplifyID);
- AU.addRequiredID(LCSSAID);
- AU.addPreservedID(LCSSAID);
- // This is used in the LPPassManager to perform LCSSA verification on passes
- // which preserve lcssa form
- AU.addRequired<LCSSAVerificationPass>();
- AU.addPreserved<LCSSAVerificationPass>();
-
- // Loop passes are designed to run inside of a loop pass manager which means
- // that any function analyses they require must be required by the first loop
- // pass in the manager (so that it is computed before the loop pass manager
- // runs) and preserved by all loop pasess in the manager. To make this
- // reasonably robust, the set needed for most loop passes is maintained here.
- // If your loop pass requires an analysis not listed here, you will need to
- // carefully audit the loop pass manager nesting structure that results.
- AU.addRequired<AAResultsWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<SCEVAAWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addPreserved<ScalarEvolutionWrapperPass>();
- // FIXME: When all loop passes preserve MemorySSA, it can be required and
- // preserved here instead of the individual handling in each pass.
-}
-
-/// Manually defined generic "LoopPass" dependency initialization. This is used
-/// to initialize the exact set of passes from above in \c
-/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
-/// with:
-///
-/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
-///
-/// As-if "LoopPass" were a pass.
-void llvm::initializeLoopPassPass(PassRegistry &Registry) {
- INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
- INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
- INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-}
-
-/// Create MDNode for input string.
-static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- Metadata *MDs[] = {
- MDString::get(Context, Name),
- ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
- return MDNode::get(Context, MDs);
-}
-
-/// Set input string into loop metadata by keeping other values intact.
-/// If the string is already in loop metadata update value if it is
-/// different.
-void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
- unsigned V) {
- SmallVector<Metadata *, 4> MDs(1);
- // If the loop already has metadata, retain it.
- MDNode *LoopID = TheLoop->getLoopID();
- if (LoopID) {
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
- // If it is of form key = value, try to parse it.
- if (Node->getNumOperands() == 2) {
- MDString *S = dyn_cast<MDString>(Node->getOperand(0));
- if (S && S->getString().equals(StringMD)) {
- ConstantInt *IntMD =
- mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
- if (IntMD && IntMD->getSExtValue() == V)
- // It is already in place. Do nothing.
- return;
- // We need to update the value, so just skip it here and it will
- // be added after copying other existed nodes.
- continue;
- }
- }
- MDs.push_back(Node);
- }
- }
- // Add new metadata.
- MDs.push_back(createStringMetadata(TheLoop, StringMD, V));
- // Replace current metadata node with new one.
- LLVMContext &Context = TheLoop->getHeader()->getContext();
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- TheLoop->setLoopID(NewLoopID);
-}
-
-/// Find string metadata for loop
-///
-/// If it has a value (e.g. {"llvm.distribute", 1} return the value as an
-/// operand or null otherwise. If the string metadata is not found return
-/// Optional's not-a-value.
-Optional<const MDOperand *> llvm::findStringMetadataForLoop(const Loop *TheLoop,
- StringRef Name) {
- MDNode *MD = findOptionMDForLoop(TheLoop, Name);
- if (!MD)
- return None;
- switch (MD->getNumOperands()) {
- case 1:
- return nullptr;
- case 2:
- return &MD->getOperand(1);
- default:
- llvm_unreachable("loop metadata has 0 or 1 operand");
- }
-}
-
-static Optional<bool> getOptionalBoolLoopAttribute(const Loop *TheLoop,
- StringRef Name) {
- MDNode *MD = findOptionMDForLoop(TheLoop, Name);
- if (!MD)
- return None;
- switch (MD->getNumOperands()) {
- case 1:
- // When the value is absent it is interpreted as 'attribute set'.
- return true;
- case 2:
- if (ConstantInt *IntMD =
- mdconst::extract_or_null<ConstantInt>(MD->getOperand(1).get()))
- return IntMD->getZExtValue();
- return true;
- }
- llvm_unreachable("unexpected number of options");
-}
-
+
+bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
+ MemorySSAUpdater *MSSAU,
+ bool PreserveLCSSA) {
+ bool Changed = false;
+
+  // We re-use a vector for the in-loop predecessors.
+ SmallVector<BasicBlock *, 4> InLoopPredecessors;
+
+ auto RewriteExit = [&](BasicBlock *BB) {
+ assert(InLoopPredecessors.empty() &&
+ "Must start with an empty predecessors list!");
+ auto Cleanup = make_scope_exit([&] { InLoopPredecessors.clear(); });
+
+ // See if there are any non-loop predecessors of this exit block and
+ // keep track of the in-loop predecessors.
+ bool IsDedicatedExit = true;
+ for (auto *PredBB : predecessors(BB))
+ if (L->contains(PredBB)) {
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from an indirectbr.
+ return false;
+ if (isa<CallBrInst>(PredBB->getTerminator()))
+ // We cannot rewrite exiting edges from a callbr.
+ return false;
+
+ InLoopPredecessors.push_back(PredBB);
+ } else {
+ IsDedicatedExit = false;
+ }
+
+ assert(!InLoopPredecessors.empty() && "Must have *some* loop predecessor!");
+
+ // Nothing to do if this is already a dedicated exit.
+ if (IsDedicatedExit)
+ return false;
+
+ auto *NewExitBB = SplitBlockPredecessors(
+ BB, InLoopPredecessors, ".loopexit", DT, LI, MSSAU, PreserveLCSSA);
+
+ if (!NewExitBB)
+ LLVM_DEBUG(
+ dbgs() << "WARNING: Can't create a dedicated exit block for loop: "
+ << *L << "\n");
+ else
+ LLVM_DEBUG(dbgs() << "LoopSimplify: Creating dedicated exit block "
+ << NewExitBB->getName() << "\n");
+ return true;
+ };
+
+ // Walk the exit blocks directly rather than building up a data structure for
+ // them, but only visit each one once.
+ SmallPtrSet<BasicBlock *, 4> Visited;
+ for (auto *BB : L->blocks())
+ for (auto *SuccBB : successors(BB)) {
+ // We're looking for exit blocks so skip in-loop successors.
+ if (L->contains(SuccBB))
+ continue;
+
+ // Visit each exit block exactly once.
+ if (!Visited.insert(SuccBB).second)
+ continue;
+
+ Changed |= RewriteExit(SuccBB);
+ }
+
+ return Changed;
+}
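A hypothetical call-site sketch for the helper above (the analysis objects are assumed to be available; nothing here is taken from the patch):

    // Give every exit block of L only in-loop predecessors.
    bool Changed = formDedicatedExitBlocks(L, &DT, &LI, /*MSSAU=*/nullptr,
                                           /*PreserveLCSSA=*/true);
    // Exits reached through indirectbr/callbr edges are left untouched.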
+
+/// Returns the instructions that use values defined in the loop.
+SmallVector<Instruction *, 8> llvm::findDefsUsedOutsideOfLoop(Loop *L) {
+ SmallVector<Instruction *, 8> UsedOutside;
+
+ for (auto *Block : L->getBlocks())
+ // FIXME: I believe that this could use copy_if if the Inst reference could
+ // be adapted into a pointer.
+ for (auto &Inst : *Block) {
+ auto Users = Inst.users();
+ if (any_of(Users, [&](User *U) {
+ auto *Use = cast<Instruction>(U);
+ return !L->contains(Use->getParent());
+ }))
+ UsedOutside.push_back(&Inst);
+ }
+
+ return UsedOutside;
+}
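A small hypothetical consumer of the helper above, assuming the usual DEBUG_TYPE machinery of this file:

    // Values defined in L but used outside it are the ones that need LCSSA
    // phi nodes at the loop exits.
    for (Instruction *I : findDefsUsedOutsideOfLoop(L))
      LLVM_DEBUG(dbgs() << "live-out: " << *I << "\n");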
+
+void llvm::getLoopAnalysisUsage(AnalysisUsage &AU) {
+ // By definition, all loop passes need the LoopInfo analysis and the
+ // Dominator tree it depends on. Because they all participate in the loop
+ // pass manager, they must also preserve these.
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+
+ // We must also preserve LoopSimplify and LCSSA. We locally access their IDs
+ // here because users shouldn't directly get them from this header.
+ extern char &LoopSimplifyID;
+ extern char &LCSSAID;
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequiredID(LCSSAID);
+ AU.addPreservedID(LCSSAID);
+ // This is used in the LPPassManager to perform LCSSA verification on passes
+  // This is used in the LPPassManager to perform LCSSA verification on passes
+  // which preserve LCSSA form.
+ AU.addPreserved<LCSSAVerificationPass>();
+
+ // Loop passes are designed to run inside of a loop pass manager which means
+ // that any function analyses they require must be required by the first loop
+ // pass in the manager (so that it is computed before the loop pass manager
+  // runs) and preserved by all loop passes in the manager. To make this
+ // reasonably robust, the set needed for most loop passes is maintained here.
+ // If your loop pass requires an analysis not listed here, you will need to
+ // carefully audit the loop pass manager nesting structure that results.
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ // FIXME: When all loop passes preserve MemorySSA, it can be required and
+ // preserved here instead of the individual handling in each pass.
+}
+
+/// Manually defined generic "LoopPass" dependency initialization. This is used
+/// to initialize the exact set of passes from above in \c
+/// getLoopAnalysisUsage. It can be used within a loop pass's initialization
+/// with:
+///
+/// INITIALIZE_PASS_DEPENDENCY(LoopPass)
+///
+/// As-if "LoopPass" were a pass.
+void llvm::initializeLoopPassPass(PassRegistry &Registry) {
+ INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+ INITIALIZE_PASS_DEPENDENCY(LCSSAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+}
+
+/// Create MDNode for input string.
+static MDNode *createStringMetadata(Loop *TheLoop, StringRef Name, unsigned V) {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ Metadata *MDs[] = {
+ MDString::get(Context, Name),
+ ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(Context), V))};
+ return MDNode::get(Context, MDs);
+}
+
+/// Set input string into loop metadata by keeping other values intact.
+/// If the string is already in the loop metadata, update its value if it
+/// differs.
+void llvm::addStringMetadataToLoop(Loop *TheLoop, const char *StringMD,
+ unsigned V) {
+ SmallVector<Metadata *, 4> MDs(1);
+ // If the loop already has metadata, retain it.
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (LoopID) {
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ MDNode *Node = cast<MDNode>(LoopID->getOperand(i));
+ // If it is of form key = value, try to parse it.
+ if (Node->getNumOperands() == 2) {
+ MDString *S = dyn_cast<MDString>(Node->getOperand(0));
+ if (S && S->getString().equals(StringMD)) {
+ ConstantInt *IntMD =
+ mdconst::extract_or_null<ConstantInt>(Node->getOperand(1));
+ if (IntMD && IntMD->getSExtValue() == V)
+ // It is already in place. Do nothing.
+ return;
+ // We need to update the value, so just skip it here and it will
+          // be added after copying the other existing nodes.
+ continue;
+ }
+ }
+ MDs.push_back(Node);
+ }
+ }
+ // Add new metadata.
+ MDs.push_back(createStringMetadata(TheLoop, StringMD, V));
+ // Replace current metadata node with new one.
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ TheLoop->setLoopID(NewLoopID);
+}
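A hypothetical usage sketch of the helper above (the attribute name and value are chosen only for illustration):

    // Creates or updates the {!"llvm.loop.unroll.count", i32 4} entry in the
    // loop's !llvm.loop metadata while keeping its other operands intact.
    addStringMetadataToLoop(L, "llvm.loop.unroll.count", 4);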
+
+/// Find string metadata for loop
+///
+/// If it has a value (e.g. {"llvm.distribute", 1}), return the value as an
+/// operand, or null otherwise. If the string metadata is not found, return
+/// Optional's not-a-value.
+Optional<const MDOperand *> llvm::findStringMetadataForLoop(const Loop *TheLoop,
+ StringRef Name) {
+ MDNode *MD = findOptionMDForLoop(TheLoop, Name);
+ if (!MD)
+ return None;
+ switch (MD->getNumOperands()) {
+ case 1:
+ return nullptr;
+ case 2:
+ return &MD->getOperand(1);
+ default:
+ llvm_unreachable("loop metadata has 0 or 1 operand");
+ }
+}
+
+static Optional<bool> getOptionalBoolLoopAttribute(const Loop *TheLoop,
+ StringRef Name) {
+ MDNode *MD = findOptionMDForLoop(TheLoop, Name);
+ if (!MD)
+ return None;
+ switch (MD->getNumOperands()) {
+ case 1:
+ // When the value is absent it is interpreted as 'attribute set'.
+ return true;
+ case 2:
+ if (ConstantInt *IntMD =
+ mdconst::extract_or_null<ConstantInt>(MD->getOperand(1).get()))
+ return IntMD->getZExtValue();
+ return true;
+ }
+ llvm_unreachable("unexpected number of options");
+}
+
bool llvm::getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name) {
- return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
-}
-
+ return getOptionalBoolLoopAttribute(TheLoop, Name).getValueOr(false);
+}
+
Optional<ElementCount>
llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) {
Optional<int> Width =
@@ -316,292 +316,292 @@ llvm::getOptionalElementCountLoopAttribute(Loop *TheLoop) {
return None;
}
-llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
- StringRef Name) {
- const MDOperand *AttrMD =
- findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr);
- if (!AttrMD)
- return None;
-
- ConstantInt *IntMD = mdconst::extract_or_null<ConstantInt>(AttrMD->get());
- if (!IntMD)
- return None;
-
- return IntMD->getSExtValue();
-}
-
-Optional<MDNode *> llvm::makeFollowupLoopID(
- MDNode *OrigLoopID, ArrayRef<StringRef> FollowupOptions,
- const char *InheritOptionsExceptPrefix, bool AlwaysNew) {
- if (!OrigLoopID) {
- if (AlwaysNew)
- return nullptr;
- return None;
- }
-
- assert(OrigLoopID->getOperand(0) == OrigLoopID);
-
- bool InheritAllAttrs = !InheritOptionsExceptPrefix;
- bool InheritSomeAttrs =
- InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0';
- SmallVector<Metadata *, 8> MDs;
- MDs.push_back(nullptr);
-
- bool Changed = false;
- if (InheritAllAttrs || InheritSomeAttrs) {
+llvm::Optional<int> llvm::getOptionalIntLoopAttribute(Loop *TheLoop,
+ StringRef Name) {
+ const MDOperand *AttrMD =
+ findStringMetadataForLoop(TheLoop, Name).getValueOr(nullptr);
+ if (!AttrMD)
+ return None;
+
+ ConstantInt *IntMD = mdconst::extract_or_null<ConstantInt>(AttrMD->get());
+ if (!IntMD)
+ return None;
+
+ return IntMD->getSExtValue();
+}
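A hypothetical read-back sketch pairing with the helpers above:

    // Query an integer-valued loop attribute; returns None if it is absent.
    if (Optional<int> Count =
            getOptionalIntLoopAttribute(TheLoop, "llvm.loop.unroll.count"))
      LLVM_DEBUG(dbgs() << "requested unroll count: " << *Count << "\n");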
+
+Optional<MDNode *> llvm::makeFollowupLoopID(
+ MDNode *OrigLoopID, ArrayRef<StringRef> FollowupOptions,
+ const char *InheritOptionsExceptPrefix, bool AlwaysNew) {
+ if (!OrigLoopID) {
+ if (AlwaysNew)
+ return nullptr;
+ return None;
+ }
+
+ assert(OrigLoopID->getOperand(0) == OrigLoopID);
+
+ bool InheritAllAttrs = !InheritOptionsExceptPrefix;
+ bool InheritSomeAttrs =
+ InheritOptionsExceptPrefix && InheritOptionsExceptPrefix[0] != '\0';
+ SmallVector<Metadata *, 8> MDs;
+ MDs.push_back(nullptr);
+
+ bool Changed = false;
+ if (InheritAllAttrs || InheritSomeAttrs) {
for (const MDOperand &Existing : drop_begin(OrigLoopID->operands())) {
- MDNode *Op = cast<MDNode>(Existing.get());
-
- auto InheritThisAttribute = [InheritSomeAttrs,
- InheritOptionsExceptPrefix](MDNode *Op) {
- if (!InheritSomeAttrs)
- return false;
-
- // Skip malformatted attribute metadata nodes.
- if (Op->getNumOperands() == 0)
- return true;
- Metadata *NameMD = Op->getOperand(0).get();
- if (!isa<MDString>(NameMD))
- return true;
- StringRef AttrName = cast<MDString>(NameMD)->getString();
-
- // Do not inherit excluded attributes.
- return !AttrName.startswith(InheritOptionsExceptPrefix);
- };
-
- if (InheritThisAttribute(Op))
- MDs.push_back(Op);
- else
- Changed = true;
- }
- } else {
- // Modified if we dropped at least one attribute.
- Changed = OrigLoopID->getNumOperands() > 1;
- }
-
- bool HasAnyFollowup = false;
- for (StringRef OptionName : FollowupOptions) {
- MDNode *FollowupNode = findOptionMDForLoopID(OrigLoopID, OptionName);
- if (!FollowupNode)
- continue;
-
- HasAnyFollowup = true;
+ MDNode *Op = cast<MDNode>(Existing.get());
+
+ auto InheritThisAttribute = [InheritSomeAttrs,
+ InheritOptionsExceptPrefix](MDNode *Op) {
+ if (!InheritSomeAttrs)
+ return false;
+
+ // Skip malformatted attribute metadata nodes.
+ if (Op->getNumOperands() == 0)
+ return true;
+ Metadata *NameMD = Op->getOperand(0).get();
+ if (!isa<MDString>(NameMD))
+ return true;
+ StringRef AttrName = cast<MDString>(NameMD)->getString();
+
+ // Do not inherit excluded attributes.
+ return !AttrName.startswith(InheritOptionsExceptPrefix);
+ };
+
+ if (InheritThisAttribute(Op))
+ MDs.push_back(Op);
+ else
+ Changed = true;
+ }
+ } else {
+ // Modified if we dropped at least one attribute.
+ Changed = OrigLoopID->getNumOperands() > 1;
+ }
+
+ bool HasAnyFollowup = false;
+ for (StringRef OptionName : FollowupOptions) {
+ MDNode *FollowupNode = findOptionMDForLoopID(OrigLoopID, OptionName);
+ if (!FollowupNode)
+ continue;
+
+ HasAnyFollowup = true;
for (const MDOperand &Option : drop_begin(FollowupNode->operands())) {
- MDs.push_back(Option.get());
- Changed = true;
- }
- }
-
- // Attributes of the followup loop were not specified explicitly, so signal to
- // the transformation pass to add suitable attributes.
- if (!AlwaysNew && !HasAnyFollowup)
- return None;
-
- // If no attributes were added or removed, the previous loop ID can be reused.
- if (!AlwaysNew && !Changed)
- return OrigLoopID;
-
- // No attributes is equivalent to having no !llvm.loop metadata at all.
- if (MDs.size() == 1)
- return nullptr;
-
- // Build the new loop ID.
- MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs);
- FollowupLoopID->replaceOperandWith(0, FollowupLoopID);
- return FollowupLoopID;
-}
-
-bool llvm::hasDisableAllTransformsHint(const Loop *L) {
- return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced);
-}
-
-bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
- return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
-}
-
+ MDs.push_back(Option.get());
+ Changed = true;
+ }
+ }
+
+ // Attributes of the followup loop were not specified explicitly, so signal to
+ // the transformation pass to add suitable attributes.
+ if (!AlwaysNew && !HasAnyFollowup)
+ return None;
+
+ // If no attributes were added or removed, the previous loop ID can be reused.
+ if (!AlwaysNew && !Changed)
+ return OrigLoopID;
+
+ // No attributes is equivalent to having no !llvm.loop metadata at all.
+ if (MDs.size() == 1)
+ return nullptr;
+
+ // Build the new loop ID.
+ MDTuple *FollowupLoopID = MDNode::get(OrigLoopID->getContext(), MDs);
+ FollowupLoopID->replaceOperandWith(0, FollowupLoopID);
+ return FollowupLoopID;
+}
+
+bool llvm::hasDisableAllTransformsHint(const Loop *L) {
+ return getBooleanLoopAttribute(L, LLVMLoopDisableNonforced);
+}
+
+bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
+ return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
+}
+
bool llvm::hasMustProgress(const Loop *L) {
return getBooleanLoopAttribute(L, LLVMLoopMustProgress);
}
-TransformationMode llvm::hasUnrollTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
- return TM_SuppressedByUser;
-
- Optional<int> Count =
- getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count");
- if (Count.hasValue())
- return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"))
- return TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable"))
- return TM_SuppressedByUser;
-
- Optional<int> Count =
- getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count");
- if (Count.hasValue())
- return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
- Optional<bool> Enable =
- getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable");
-
- if (Enable == false)
- return TM_SuppressedByUser;
-
+TransformationMode llvm::hasUnrollTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
+ return TM_SuppressedByUser;
+
+ Optional<int> Count =
+ getOptionalIntLoopAttribute(L, "llvm.loop.unroll.count");
+ if (Count.hasValue())
+ return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.enable"))
+ return TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll.full"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasUnrollAndJamTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.disable"))
+ return TM_SuppressedByUser;
+
+ Optional<int> Count =
+ getOptionalIntLoopAttribute(L, "llvm.loop.unroll_and_jam.count");
+ if (Count.hasValue())
+ return Count.getValue() == 1 ? TM_SuppressedByUser : TM_ForcedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.unroll_and_jam.enable"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasVectorizeTransformation(Loop *L) {
+ Optional<bool> Enable =
+ getOptionalBoolLoopAttribute(L, "llvm.loop.vectorize.enable");
+
+ if (Enable == false)
+ return TM_SuppressedByUser;
+
Optional<ElementCount> VectorizeWidth =
getOptionalElementCountLoopAttribute(L);
- Optional<int> InterleaveCount =
- getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
-
- // 'Forcing' vector width and interleave count to one effectively disables
- // this transformation.
+ Optional<int> InterleaveCount =
+ getOptionalIntLoopAttribute(L, "llvm.loop.interleave.count");
+
+ // 'Forcing' vector width and interleave count to one effectively disables
+ // this transformation.
if (Enable == true && VectorizeWidth && VectorizeWidth->isScalar() &&
InterleaveCount == 1)
- return TM_SuppressedByUser;
-
- if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
- return TM_Disable;
-
- if (Enable == true)
- return TM_ForcedByUser;
-
+ return TM_SuppressedByUser;
+
+ if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
+ return TM_Disable;
+
+ if (Enable == true)
+ return TM_ForcedByUser;
+
if ((VectorizeWidth && VectorizeWidth->isScalar()) && InterleaveCount == 1)
- return TM_Disable;
-
+ return TM_Disable;
+
if ((VectorizeWidth && VectorizeWidth->isVector()) || InterleaveCount > 1)
- return TM_Enable;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasDistributeTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable"))
- return TM_ForcedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) {
- if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable"))
- return TM_SuppressedByUser;
-
- if (hasDisableAllTransformsHint(L))
- return TM_Disable;
-
- return TM_Unspecified;
-}
-
-/// Does a BFS from a given node to all of its children inside a given loop.
-/// The returned vector of nodes includes the starting point.
-SmallVector<DomTreeNode *, 16>
-llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) {
- SmallVector<DomTreeNode *, 16> Worklist;
- auto AddRegionToWorklist = [&](DomTreeNode *DTN) {
- // Only include subregions in the top level loop.
- BasicBlock *BB = DTN->getBlock();
- if (CurLoop->contains(BB))
- Worklist.push_back(DTN);
- };
-
- AddRegionToWorklist(N);
-
- for (size_t I = 0; I < Worklist.size(); I++) {
- for (DomTreeNode *Child : Worklist[I]->children())
- AddRegionToWorklist(Child);
- }
-
- return Worklist;
-}
-
-void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
- LoopInfo *LI, MemorySSA *MSSA) {
- assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!");
- auto *Preheader = L->getLoopPreheader();
- assert(Preheader && "Preheader should exist!");
-
- std::unique_ptr<MemorySSAUpdater> MSSAU;
- if (MSSA)
- MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
-
- // Now that we know the removal is safe, remove the loop by changing the
- // branch from the preheader to go to the single exit block.
- //
- // Because we're deleting a large chunk of code at once, the sequence in which
- // we remove things is very important to avoid invalidation issues.
-
- // Tell ScalarEvolution that the loop is deleted. Do this before
- // deleting the loop so that ScalarEvolution can look at the loop
- // to determine what it needs to clean up.
- if (SE)
- SE->forgetLoop(L);
-
- auto *OldBr = dyn_cast<BranchInst>(Preheader->getTerminator());
- assert(OldBr && "Preheader must end with a branch");
- assert(OldBr->isUnconditional() && "Preheader must have a single successor");
- // Connect the preheader to the exit block. Keep the old edge to the header
- // around to perform the dominator tree update in two separate steps
- // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge
- // preheader -> header.
- //
- //
- // 0. Preheader 1. Preheader 2. Preheader
- // | | | |
- // V | V |
- // Header <--\ | Header <--\ | Header <--\
- // | | | | | | | | | | |
- // | V | | | V | | | V |
- // | Body --/ | | Body --/ | | Body --/
- // V V V V V
- // Exit Exit Exit
- //
- // By doing this in two separate steps we can perform the dominator tree
- // update without using the batch update API.
- //
- // Even when the loop is never executed, we cannot remove the edge from the
- // source block to the exit block. Consider the case where the unexecuted loop
- // branches back to an outer loop. If we deleted the loop and removed the edge
- // coming to this inner loop, this will break the outer loop structure (by
- // deleting the backedge of the outer loop). If the outer loop is indeed a
- // non-loop, it will be deleted in a future iteration of loop deletion pass.
- IRBuilder<> Builder(OldBr);
-
+ return TM_Enable;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasDistributeTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.distribute.enable"))
+ return TM_ForcedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+TransformationMode llvm::hasLICMVersioningTransformation(Loop *L) {
+ if (getBooleanLoopAttribute(L, "llvm.loop.licm_versioning.disable"))
+ return TM_SuppressedByUser;
+
+ if (hasDisableAllTransformsHint(L))
+ return TM_Disable;
+
+ return TM_Unspecified;
+}
+
+/// Does a BFS from a given node to all of its children inside a given loop.
+/// The returned vector of nodes includes the starting point.
+SmallVector<DomTreeNode *, 16>
+llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) {
+ SmallVector<DomTreeNode *, 16> Worklist;
+ auto AddRegionToWorklist = [&](DomTreeNode *DTN) {
+ // Only include subregions in the top level loop.
+ BasicBlock *BB = DTN->getBlock();
+ if (CurLoop->contains(BB))
+ Worklist.push_back(DTN);
+ };
+
+ AddRegionToWorklist(N);
+
+ for (size_t I = 0; I < Worklist.size(); I++) {
+ for (DomTreeNode *Child : Worklist[I]->children())
+ AddRegionToWorklist(Child);
+ }
+
+ return Worklist;
+}
+
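// [Editor's note, not part of the diff] A standalone model of the worklist
// pattern used by collectChildrenInLoop above: scan the vector by index while
// appending to it, which is a recursion- and queue-free BFS. ToyNode and
// collectChildren are illustrative stand-ins for DomTreeNode and the loop
// membership test, not LLVM APIs.
#include <cstddef>
#include <vector>

struct ToyNode {
  bool InLoop;                     // plays the role of CurLoop->contains(BB)
  std::vector<ToyNode *> Children; // plays the role of DomTreeNode::children()
};

inline std::vector<ToyNode *> collectChildren(ToyNode *Root) {
  std::vector<ToyNode *> Worklist;
  auto AddIfInLoop = [&](ToyNode *N) {
    if (N->InLoop)
      Worklist.push_back(N);
  };
  AddIfInLoop(Root);
  // Indexing (not iterators) makes it safe to grow the vector mid-walk.
  for (std::size_t I = 0; I < Worklist.size(); ++I)
    for (ToyNode *Child : Worklist[I]->Children)
      AddIfInLoop(Child);
  return Worklist; // includes the starting node, as the real helper documents
}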
+void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
+ LoopInfo *LI, MemorySSA *MSSA) {
+ assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!");
+ auto *Preheader = L->getLoopPreheader();
+ assert(Preheader && "Preheader should exist!");
+
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+
+ // Now that we know the removal is safe, remove the loop by changing the
+ // branch from the preheader to go to the single exit block.
+ //
+ // Because we're deleting a large chunk of code at once, the sequence in which
+ // we remove things is very important to avoid invalidation issues.
+
+ // Tell ScalarEvolution that the loop is deleted. Do this before
+ // deleting the loop so that ScalarEvolution can look at the loop
+ // to determine what it needs to clean up.
+ if (SE)
+ SE->forgetLoop(L);
+
+ auto *OldBr = dyn_cast<BranchInst>(Preheader->getTerminator());
+ assert(OldBr && "Preheader must end with a branch");
+ assert(OldBr->isUnconditional() && "Preheader must have a single successor");
+ // Connect the preheader to the exit block. Keep the old edge to the header
+ // around to perform the dominator tree update in two separate steps
+ // -- #1 insertion of the edge preheader -> exit and #2 deletion of the edge
+ // preheader -> header.
+ //
+ //
+ // 0. Preheader 1. Preheader 2. Preheader
+ // | | | |
+ // V | V |
+ // Header <--\ | Header <--\ | Header <--\
+ // | | | | | | | | | | |
+ // | V | | | V | | | V |
+ // | Body --/ | | Body --/ | | Body --/
+ // V V V V V
+ // Exit Exit Exit
+ //
+ // By doing this in two separate steps we can perform the dominator tree
+ // update without using the batch update API.
+ //
+ // Even when the loop is never executed, we cannot remove the edge from the
+ // source block to the exit block. Consider the case where the unexecuted loop
+ // branches back to an outer loop. If we deleted the loop and removed the edge
+ // coming to this inner loop, this will break the outer loop structure (by
+ // deleting the backedge of the outer loop). If the outer loop is indeed a
+ // non-loop, it will be deleted in a future iteration of loop deletion pass.
+ IRBuilder<> Builder(OldBr);
+
auto *ExitBlock = L->getUniqueExitBlock();
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
if (ExitBlock) {
assert(ExitBlock && "Should have a unique exit block!");
assert(L->hasDedicatedExits() && "Loop should have dedicated exits!");
-
+
Builder.CreateCondBr(Builder.getFalse(), L->getHeader(), ExitBlock);
// Remove the old branch. The conditional branch becomes a new terminator.
OldBr->eraseFromParent();
-
+
// Rewrite phis in the exit block to get their inputs from the Preheader
// instead of the exiting block.
for (PHINode &P : ExitBlock->phis()) {
@@ -623,7 +623,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
assert((P.getNumIncomingValues() == 1 &&
P.getIncomingBlock(PredIndex) == Preheader) &&
"Should have exactly one value and that's from the preheader!");
- }
+ }
if (DT) {
DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}});
@@ -647,25 +647,25 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
Builder.SetInsertPoint(OldBr);
Builder.CreateUnreachable();
Preheader->getTerminator()->eraseFromParent();
- }
-
- if (DT) {
- DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}});
- if (MSSA) {
- MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}},
- *DT);
- SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(),
- L->block_end());
- MSSAU->removeBlocks(DeadBlockSet);
- if (VerifyMemorySSA)
- MSSA->verifyMemorySSA();
- }
- }
-
- // Use a set to deduplicate and a vector to guarantee deterministic ordering.
- llvm::SmallDenseSet<std::pair<DIVariable *, DIExpression *>, 4> DeadDebugSet;
- llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
-
+ }
+
+ if (DT) {
+ DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}});
+ if (MSSA) {
+ MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}},
+ *DT);
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(),
+ L->block_end());
+ MSSAU->removeBlocks(DeadBlockSet);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+ }
+
+ // Use a set to deduplicate and a vector to guarantee deterministic ordering.
+ llvm::SmallDenseSet<std::pair<DIVariable *, DIExpression *>, 4> DeadDebugSet;
+ llvm::SmallVector<DbgVariableIntrinsic *, 4> DeadDebugInst;
+
if (ExitBlock) {
// Given LCSSA form is satisfied, we should not have users of instructions
// within the dead loop outside of the loop. However, LCSSA doesn't take
@@ -701,8 +701,8 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
continue;
DeadDebugSet.insert({DVI->getVariable(), DVI->getExpression()});
DeadDebugInst.push_back(DVI);
- }
-
+ }
+
// After the loop has been deleted all the values defined and modified
// inside the loop are going to be unavailable.
// Since debug values in the loop have been deleted, inserting an undef
@@ -718,49 +718,49 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
DVI->getVariable(), DVI->getExpression(),
DVI->getDebugLoc(), InsertDbgValueBefore);
}
-
- // Remove the block from the reference counting scheme, so that we can
- // delete it freely later.
- for (auto *Block : L->blocks())
- Block->dropAllReferences();
-
- if (MSSA && VerifyMemorySSA)
- MSSA->verifyMemorySSA();
-
- if (LI) {
- // Erase the instructions and the blocks without having to worry
- // about ordering because we already dropped the references.
- // NOTE: This iteration is safe because erasing the block does not remove
- // its entry from the loop's block list. We do that in the next section.
- for (Loop::block_iterator LpI = L->block_begin(), LpE = L->block_end();
- LpI != LpE; ++LpI)
- (*LpI)->eraseFromParent();
-
- // Finally, the blocks from loopinfo. This has to happen late because
- // otherwise our loop iterators won't work.
-
- SmallPtrSet<BasicBlock *, 8> blocks;
- blocks.insert(L->block_begin(), L->block_end());
- for (BasicBlock *BB : blocks)
- LI->removeBlock(BB);
-
- // The last step is to update LoopInfo now that we've eliminated this loop.
- // Note: LoopInfo::erase removes the given loop and relinks its subloops with
- // its parent, while removeLoop/removeChildLoop remove the given loop but do
- // not relink its subloops, which is what we want.
- if (Loop *ParentLoop = L->getParentLoop()) {
- Loop::iterator I = find(*ParentLoop, L);
- assert(I != ParentLoop->end() && "Couldn't find loop");
- ParentLoop->removeChildLoop(I);
- } else {
- Loop::iterator I = find(*LI, L);
- assert(I != LI->end() && "Couldn't find loop");
- LI->removeLoop(I);
- }
- LI->destroy(L);
- }
-}
-
+
+ // Remove the block from the reference counting scheme, so that we can
+ // delete it freely later.
+ for (auto *Block : L->blocks())
+ Block->dropAllReferences();
+
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
+ if (LI) {
+ // Erase the instructions and the blocks without having to worry
+ // about ordering because we already dropped the references.
+ // NOTE: This iteration is safe because erasing the block does not remove
+ // its entry from the loop's block list. We do that in the next section.
+ for (Loop::block_iterator LpI = L->block_begin(), LpE = L->block_end();
+ LpI != LpE; ++LpI)
+ (*LpI)->eraseFromParent();
+
+ // Finally, the blocks from loopinfo. This has to happen late because
+ // otherwise our loop iterators won't work.
+
+ SmallPtrSet<BasicBlock *, 8> blocks;
+ blocks.insert(L->block_begin(), L->block_end());
+ for (BasicBlock *BB : blocks)
+ LI->removeBlock(BB);
+
+ // The last step is to update LoopInfo now that we've eliminated this loop.
+ // Note: LoopInfo::erase removes the given loop and relinks its subloops with
+ // its parent, while removeLoop/removeChildLoop remove the given loop but do
+ // not relink its subloops, which is what we want.
+ if (Loop *ParentLoop = L->getParentLoop()) {
+ Loop::iterator I = find(*ParentLoop, L);
+ assert(I != ParentLoop->end() && "Couldn't find loop");
+ ParentLoop->removeChildLoop(I);
+ } else {
+ Loop::iterator I = find(*LI, L);
+ assert(I != LI->end() && "Couldn't find loop");
+ LI->removeLoop(I);
+ }
+ LI->destroy(L);
+ }
+}
+
static Loop *getOutermostLoop(Loop *L) {
while (Loop *Parent = L->getParentLoop())
L = Parent;
@@ -806,227 +806,227 @@ void llvm::breakLoopBackedge(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
}
-/// Checks if \p L has a single exit through the latch block, except possibly
-/// for "deoptimizing" exits. Returns the branch instruction terminating the
-/// loop latch if the above check is successful, nullptr otherwise.
-static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
- BasicBlock *Latch = L->getLoopLatch();
- if (!Latch)
- return nullptr;
-
- BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
- return nullptr;
-
- assert((LatchBR->getSuccessor(0) == L->getHeader() ||
- LatchBR->getSuccessor(1) == L->getHeader()) &&
- "At least one edge out of the latch must go to the header");
-
- SmallVector<BasicBlock *, 4> ExitBlocks;
- L->getUniqueNonLatchExitBlocks(ExitBlocks);
- if (any_of(ExitBlocks, [](const BasicBlock *EB) {
- return !EB->getTerminatingDeoptimizeCall();
- }))
- return nullptr;
-
- return LatchBR;
-}
-
-Optional<unsigned>
-llvm::getLoopEstimatedTripCount(Loop *L,
- unsigned *EstimatedLoopInvocationWeight) {
- // Support loops with an exiting latch where all other exits only
- // deoptimize.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
- if (!LatchBranch)
- return None;
-
- // To estimate the number of times the loop body was executed, we want to
- // know the number of times the backedge was taken, vs. the number of times
- // we exited the loop.
- uint64_t BackedgeTakenWeight, LatchExitWeight;
- if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
- return None;
-
- if (LatchBranch->getSuccessor(0) != L->getHeader())
- std::swap(BackedgeTakenWeight, LatchExitWeight);
-
- if (!LatchExitWeight)
- return None;
-
- if (EstimatedLoopInvocationWeight)
- *EstimatedLoopInvocationWeight = LatchExitWeight;
-
- // The estimated backedge-taken count is the ratio of the backedge-taken weight
- // to the weight of the loop-exiting edge, rounded to the nearest integer.
- uint64_t BackedgeTakenCount =
- llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
- // Estimated trip count is one plus estimated backedge taken count.
- return BackedgeTakenCount + 1;
-}
-
-bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
- unsigned EstimatedloopInvocationWeight) {
- // Support loops with an exiting latch where all other exits only
- // deoptimize.
- BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
- if (!LatchBranch)
- return false;
-
- // Calculate taken and exit weights.
- unsigned LatchExitWeight = 0;
- unsigned BackedgeTakenWeight = 0;
-
- if (EstimatedTripCount > 0) {
- LatchExitWeight = EstimatedloopInvocationWeight;
- BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
- }
-
- // Swap the weights if the backedge is taken when the condition is "false".
- if (LatchBranch->getSuccessor(0) != L->getHeader())
- std::swap(BackedgeTakenWeight, LatchExitWeight);
-
- MDBuilder MDB(LatchBranch->getContext());
-
- // Set/Update profile metadata.
- LatchBranch->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
-
- return true;
-}
-
-bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
- ScalarEvolution &SE) {
- Loop *OuterL = InnerLoop->getParentLoop();
- if (!OuterL)
- return true;
-
- // Get the backedge taken count for the inner loop
- BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
- const SCEV *InnerLoopBECountSC = SE.getExitCount(InnerLoop, InnerLoopLatch);
- if (isa<SCEVCouldNotCompute>(InnerLoopBECountSC) ||
- !InnerLoopBECountSC->getType()->isIntegerTy())
- return false;
-
- // Get whether count is invariant to the outer loop
- ScalarEvolution::LoopDisposition LD =
- SE.getLoopDisposition(InnerLoopBECountSC, OuterL);
- if (LD != ScalarEvolution::LoopInvariant)
- return false;
-
- return true;
-}
-
+/// Checks if \p L has a single exit through the latch block, except possibly
+/// for "deoptimizing" exits. Returns the branch instruction terminating the
+/// loop latch if the above check is successful, nullptr otherwise.
+static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ if (!Latch)
+ return nullptr;
+
+ BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
+ return nullptr;
+
+ assert((LatchBR->getSuccessor(0) == L->getHeader() ||
+ LatchBR->getSuccessor(1) == L->getHeader()) &&
+ "At least one edge out of the latch must go to the header");
+
+ SmallVector<BasicBlock *, 4> ExitBlocks;
+ L->getUniqueNonLatchExitBlocks(ExitBlocks);
+ if (any_of(ExitBlocks, [](const BasicBlock *EB) {
+ return !EB->getTerminatingDeoptimizeCall();
+ }))
+ return nullptr;
+
+ return LatchBR;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+ unsigned *EstimatedLoopInvocationWeight) {
+ // Support loops with an exiting latch where all other exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return None;
+
+ // To estimate the number of times the loop body was executed, we want to
+ // know the number of times the backedge was taken, vs. the number of times
+ // we exited the loop.
+ uint64_t BackedgeTakenWeight, LatchExitWeight;
+ if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+ return None;
+
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
+ std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+ if (!LatchExitWeight)
+ return None;
+
+ if (EstimatedLoopInvocationWeight)
+ *EstimatedLoopInvocationWeight = LatchExitWeight;
+
+ // The estimated backedge-taken count is the ratio of the backedge-taken weight
+ // to the weight of the loop-exiting edge, rounded to the nearest integer.
+ uint64_t BackedgeTakenCount =
+ llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
+ // Estimated trip count is one plus estimated backedge taken count.
+ return BackedgeTakenCount + 1;
+}
+
+bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+ unsigned EstimatedloopInvocationWeight) {
+ // Support loops with an exiting latch where all other exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return false;
+
+ // Calculate taken and exit weights.
+ unsigned LatchExitWeight = 0;
+ unsigned BackedgeTakenWeight = 0;
+
+ if (EstimatedTripCount > 0) {
+ LatchExitWeight = EstimatedloopInvocationWeight;
+ BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+ }
+
+ // Swap the weights if the backedge is taken when the condition is "false".
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
+ std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+ MDBuilder MDB(LatchBranch->getContext());
+
+ // Set/Update profile metadata.
+ LatchBranch->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+
+ return true;
+}
+
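// [Editor's note, not part of the diff] Worked example of the branch-weight
// arithmetic in getLoopEstimatedTripCount / setLoopEstimatedTripCount above,
// on plain integers. divideNearestLocal re-implements round-to-nearest
// division locally so the snippet stands alone; it is assumed to match
// llvm::divideNearest for these small, non-overflowing values.
#include <cassert>
#include <cstdint>

inline uint64_t divideNearestLocal(uint64_t Numerator, uint64_t Denominator) {
  return (Numerator + Denominator / 2) / Denominator;
}

inline void tripCountRoundTrip() {
  // !prof weights on the latch branch: backedge taken 990 times per 10 exits.
  uint64_t BackedgeTakenWeight = 990, LatchExitWeight = 10;

  // getLoopEstimatedTripCount: trip count = round(990 / 10) + 1 = 100.
  uint64_t EstimatedTripCount =
      divideNearestLocal(BackedgeTakenWeight, LatchExitWeight) + 1;
  assert(EstimatedTripCount == 100);

  // setLoopEstimatedTripCount with trip count 100 and invocation weight 10
  // reproduces the original weights: (100 - 1) * 10 = 990 vs. 10.
  uint64_t NewLatchExitWeight = 10;
  uint64_t NewBackedgeTakenWeight =
      (EstimatedTripCount - 1) * NewLatchExitWeight;
  assert(NewBackedgeTakenWeight == 990);
  (void)NewBackedgeTakenWeight;
}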
+bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
+ ScalarEvolution &SE) {
+ Loop *OuterL = InnerLoop->getParentLoop();
+ if (!OuterL)
+ return true;
+
+ // Get the backedge taken count for the inner loop
+ BasicBlock *InnerLoopLatch = InnerLoop->getLoopLatch();
+ const SCEV *InnerLoopBECountSC = SE.getExitCount(InnerLoop, InnerLoopLatch);
+ if (isa<SCEVCouldNotCompute>(InnerLoopBECountSC) ||
+ !InnerLoopBECountSC->getType()->isIntegerTy())
+ return false;
+
+ // Get whether count is invariant to the outer loop
+ ScalarEvolution::LoopDisposition LD =
+ SE.getLoopDisposition(InnerLoopBECountSC, OuterL);
+ if (LD != ScalarEvolution::LoopInvariant)
+ return false;
+
+ return true;
+}
+
Value *llvm::createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left,
Value *Right) {
CmpInst::Predicate Pred;
- switch (RK) {
- default:
- llvm_unreachable("Unknown min/max recurrence kind");
+ switch (RK) {
+ default:
+ llvm_unreachable("Unknown min/max recurrence kind");
case RecurKind::UMin:
Pred = CmpInst::ICMP_ULT;
- break;
+ break;
case RecurKind::UMax:
Pred = CmpInst::ICMP_UGT;
- break;
+ break;
case RecurKind::SMin:
Pred = CmpInst::ICMP_SLT;
- break;
+ break;
case RecurKind::SMax:
Pred = CmpInst::ICMP_SGT;
- break;
+ break;
case RecurKind::FMin:
Pred = CmpInst::FCMP_OLT;
- break;
+ break;
case RecurKind::FMax:
Pred = CmpInst::FCMP_OGT;
- break;
- }
-
- // We only match FP sequences that are 'fast', so we can unconditionally
- // set it on any generated instructions.
- IRBuilderBase::FastMathFlagGuard FMFG(Builder);
- FastMathFlags FMF;
- FMF.setFast();
- Builder.setFastMathFlags(FMF);
+ break;
+ }
+
+ // We only match FP sequences that are 'fast', so we can unconditionally
+ // set it on any generated instructions.
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+ FastMathFlags FMF;
+ FMF.setFast();
+ Builder.setFastMathFlags(FMF);
Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp");
- Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
- return Select;
-}
-
-// Helper to generate an ordered reduction.
+ Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+ return Select;
+}
+
+// Helper to generate an ordered reduction.
Value *llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
unsigned Op, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
-
- // Extract and apply reduction ops in ascending order:
- // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
- Value *Result = Acc;
- for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
- Value *Ext =
- Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
- Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
- "bin.rdx");
- } else {
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
+
+ // Extract and apply reduction ops in ascending order:
+ // e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ...) + Scl[VF-1]
+ Value *Result = Acc;
+ for (unsigned ExtractIdx = 0; ExtractIdx != VF; ++ExtractIdx) {
+ Value *Ext =
+ Builder.CreateExtractElement(Src, Builder.getInt32(ExtractIdx));
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ Result = Builder.CreateBinOp((Instruction::BinaryOps)Op, Result, Ext,
+ "bin.rdx");
+ } else {
assert(RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind) &&
- "Invalid min/max");
+ "Invalid min/max");
Result = createMinMaxOp(Builder, RdxKind, Result, Ext);
- }
-
- if (!RedOps.empty())
- propagateIRFlags(Result, RedOps);
- }
-
- return Result;
-}
-
-// Helper to generate a log2 shuffle reduction.
+ }
+
+ if (!RedOps.empty())
+ propagateIRFlags(Result, RedOps);
+ }
+
+ return Result;
+}
+
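// [Editor's note, not part of the diff] Scalar analogue of getOrderedReduction
// above: lanes are folded into the accumulator strictly in ascending index
// order, i.e. ((((Acc + Src[0]) + Src[1]) + ...) + Src[VF-1]), which preserves
// the floating-point association, unlike the log2 shuffle reduction further
// down. Illustrative sketch only, not an LLVM API.
#include <vector>

inline float orderedReduce(float Acc, const std::vector<float> &Src) {
  for (float Lane : Src)
    Acc = Acc + Lane; // one extract-and-add per lane, in order
  return Acc;
}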
+// Helper to generate a log2 shuffle reduction.
Value *llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src,
unsigned Op, RecurKind RdxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
- // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
- // and vector ops, reducing the set of values being computed by half each
- // round.
- assert(isPowerOf2_32(VF) &&
- "Reduction emission only supported for pow2 vectors!");
- Value *TmpVec = Src;
- SmallVector<int, 32> ShuffleMask(VF);
- for (unsigned i = VF; i != 1; i >>= 1) {
- // Move the upper half of the vector to the lower half.
- for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = i / 2 + j;
-
- // Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
-
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
+ // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+ // and vector ops, reducing the set of values being computed by half each
+ // round.
+ assert(isPowerOf2_32(VF) &&
+ "Reduction emission only supported for pow2 vectors!");
+ Value *TmpVec = Src;
+ SmallVector<int, 32> ShuffleMask(VF);
+ for (unsigned i = VF; i != 1; i >>= 1) {
+ // Move the upper half of the vector to the lower half.
+ for (unsigned j = 0; j != i / 2; ++j)
+ ShuffleMask[j] = i / 2 + j;
+
+ // Fill the rest of the mask with undef.
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
+
Value *Shuf = Builder.CreateShuffleVector(TmpVec, ShuffleMask, "rdx.shuf");
-
- if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
- // The builder propagates its fast-math-flags setting.
- TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
- "bin.rdx");
- } else {
+
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+ // The builder propagates its fast-math-flags setting.
+ TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+ "bin.rdx");
+ } else {
assert(RecurrenceDescriptor::isMinMaxRecurrenceKind(RdxKind) &&
- "Invalid min/max");
+ "Invalid min/max");
TmpVec = createMinMaxOp(Builder, RdxKind, TmpVec, Shuf);
- }
- if (!RedOps.empty())
- propagateIRFlags(TmpVec, RedOps);
-
- // We may compute the reassociated scalar ops in a way that does not
- // preserve nsw/nuw etc. Conservatively, drop those flags.
- if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
- ReductionInst->dropPoisonGeneratingFlags();
- }
- // The result is in the first element of the vector.
- return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-}
-
+ }
+ if (!RedOps.empty())
+ propagateIRFlags(TmpVec, RedOps);
+
+ // We may compute the reassociated scalar ops in a way that does not
+ // preserve nsw/nuw etc. Conservatively, drop those flags.
+ if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
+ ReductionInst->dropPoisonGeneratingFlags();
+ }
+ // The result is in the first element of the vector.
+ return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+}
+
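// [Editor's note, not part of the diff] Scalar model of the log2 shuffle
// reduction in getShuffleReduction above: each round folds the upper half of
// the live lanes into the lower half, so a power-of-two VF needs log2(VF)
// rounds and the result lands in element 0. A plain vector replaces the
// shuffle; halvingReduce is an illustrative name, not an LLVM API.
#include <cassert>
#include <cstddef>
#include <vector>

inline float halvingReduce(std::vector<float> Vec) {
  const std::size_t VF = Vec.size();
  assert(VF != 0 && (VF & (VF - 1)) == 0 &&
         "pow2 vectors only, as in the helper");
  for (std::size_t Width = VF; Width != 1; Width /= 2)
    for (std::size_t J = 0; J != Width / 2; ++J)
      Vec[J] = Vec[J] + Vec[Width / 2 + J]; // lane j += lane (Width/2 + j)
  return Vec[0]; // the reduced value is in the first element
}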
Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
const TargetTransformInfo *TTI,
Value *Src, RecurKind RdxKind,
@@ -1039,7 +1039,7 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
if (!ForceReductionIntrinsic &&
!TTI->useReductionIntrinsic(Opcode, Src->getType(), RdxFlags))
return getShuffleReduction(Builder, Src, Opcode, RdxKind, RedOps);
-
+
auto *SrcVecEltTy = cast<VectorType>(Src->getType())->getElementType();
switch (RdxKind) {
case RecurKind::Add:
@@ -1069,656 +1069,656 @@ Value *llvm::createSimpleTargetReduction(IRBuilderBase &Builder,
return Builder.CreateFPMaxReduce(Src);
case RecurKind::FMin:
return Builder.CreateFPMinReduce(Src);
- default:
- llvm_unreachable("Unhandled opcode");
- }
-}
-
-Value *llvm::createTargetReduction(IRBuilderBase &B,
- const TargetTransformInfo *TTI,
+ default:
+ llvm_unreachable("Unhandled opcode");
+ }
+}
+
+Value *llvm::createTargetReduction(IRBuilderBase &B,
+ const TargetTransformInfo *TTI,
RecurrenceDescriptor &Desc, Value *Src) {
- // TODO: Support in-order reductions based on the recurrence descriptor.
- // All ops in the reduction inherit fast-math-flags from the recurrence
- // descriptor.
- IRBuilderBase::FastMathFlagGuard FMFGuard(B);
- B.setFastMathFlags(Desc.getFastMathFlags());
+ // TODO: Support in-order reductions based on the recurrence descriptor.
+ // All ops in the reduction inherit fast-math-flags from the recurrence
+ // descriptor.
+ IRBuilderBase::FastMathFlagGuard FMFGuard(B);
+ B.setFastMathFlags(Desc.getFastMathFlags());
return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
-}
-
-void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
- auto *VecOp = dyn_cast<Instruction>(I);
- if (!VecOp)
- return;
- auto *Intersection = (OpValue == nullptr) ? dyn_cast<Instruction>(VL[0])
- : dyn_cast<Instruction>(OpValue);
- if (!Intersection)
- return;
- const unsigned Opcode = Intersection->getOpcode();
- VecOp->copyIRFlags(Intersection);
- for (auto *V : VL) {
- auto *Instr = dyn_cast<Instruction>(V);
- if (!Instr)
- continue;
- if (OpValue == nullptr || Opcode == Instr->getOpcode())
- VecOp->andIRFlags(V);
- }
-}
-
-bool llvm::isKnownNegativeInLoop(const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- const SCEV *Zero = SE.getZero(S->getType());
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, S, Zero);
-}
-
-bool llvm::isKnownNonNegativeInLoop(const SCEV *S, const Loop *L,
- ScalarEvolution &SE) {
- const SCEV *Zero = SE.getZero(S->getType());
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, S, Zero);
-}
-
-bool llvm::cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
- bool Signed) {
- unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
- APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
- APInt::getMinValue(BitWidth);
- auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, Predicate, S,
- SE.getConstant(Min));
-}
-
-bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
- bool Signed) {
- unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
- APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
- APInt::getMaxValue(BitWidth);
- auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
- return SE.isAvailableAtLoopEntry(S, L) &&
- SE.isLoopEntryGuardedByCond(L, Predicate, S,
- SE.getConstant(Max));
-}
-
-//===----------------------------------------------------------------------===//
-// rewriteLoopExitValues - Optimize IV users outside the loop.
-// As a side effect, reduces the amount of IV processing within the loop.
-//===----------------------------------------------------------------------===//
-
-// Return true if the SCEV expansion generated by the rewriter can replace the
-// original value. SCEV guarantees that it produces the same value, but the way
-// it is produced may be illegal IR. Ideally, this function will only be
-// called for verification.
-static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
- // If an SCEV expression subsumed multiple pointers, its expansion could
- // reassociate the GEP changing the base pointer. This is illegal because the
- // final address produced by a GEP chain must be inbounds relative to its
- // underlying object. Otherwise basic alias analysis, among other things,
- // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
- // producing an expression involving multiple pointers. Until then, we must
- // bail out here.
- //
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
+ auto *VecOp = dyn_cast<Instruction>(I);
+ if (!VecOp)
+ return;
+ auto *Intersection = (OpValue == nullptr) ? dyn_cast<Instruction>(VL[0])
+ : dyn_cast<Instruction>(OpValue);
+ if (!Intersection)
+ return;
+ const unsigned Opcode = Intersection->getOpcode();
+ VecOp->copyIRFlags(Intersection);
+ for (auto *V : VL) {
+ auto *Instr = dyn_cast<Instruction>(V);
+ if (!Instr)
+ continue;
+ if (OpValue == nullptr || Opcode == Instr->getOpcode())
+ VecOp->andIRFlags(V);
+ }
+}
+
+bool llvm::isKnownNegativeInLoop(const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(S->getType());
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SLT, S, Zero);
+}
+
+bool llvm::isKnownNonNegativeInLoop(const SCEV *S, const Loop *L,
+ ScalarEvolution &SE) {
+ const SCEV *Zero = SE.getZero(S->getType());
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, ICmpInst::ICMP_SGE, S, Zero);
+}
+
+bool llvm::cannotBeMinInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+ APInt Min = Signed ? APInt::getSignedMinValue(BitWidth) :
+ APInt::getMinValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, S,
+ SE.getConstant(Min));
+}
+
+bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
+ bool Signed) {
+ unsigned BitWidth = cast<IntegerType>(S->getType())->getBitWidth();
+ APInt Max = Signed ? APInt::getSignedMaxValue(BitWidth) :
+ APInt::getMaxValue(BitWidth);
+ auto Predicate = Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ return SE.isAvailableAtLoopEntry(S, L) &&
+ SE.isLoopEntryGuardedByCond(L, Predicate, S,
+ SE.getConstant(Max));
+}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+// Return true if the SCEV expansion generated by the rewriter can replace the
+// original value. SCEV guarantees that it produces the same value, but the way
+// it is produced may be illegal IR. Ideally, this function will only be
+// called for verification.
+static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
// Retrieve the pointer operand of the GEP. Don't use getUnderlyingObject
- // because it understands lcssa phis while SCEV does not.
- Value *FromPtr = FromVal;
- Value *ToPtr = ToVal;
- if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
- FromPtr = GEP->getPointerOperand();
-
- if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
- ToPtr = GEP->getPointerOperand();
-
- if (FromPtr != FromVal || ToPtr != ToVal) {
- // Quickly check the common case
- if (FromPtr == ToPtr)
- return true;
-
- // SCEV may have rewritten an expression that produces the GEP's pointer
- // operand. That's ok as long as the pointer operand has the same base
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
+ FromPtr = GEP->getPointerOperand();
+
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
+ ToPtr = GEP->getPointerOperand();
+
+ if (FromPtr != FromVal || ToPtr != ToVal) {
+ // Quickly check the common case
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
// pointer. Unlike getUnderlyingObject(), getPointerBase() will find the
- // base of a recurrence. This handles the case in which SCEV expansion
- // converts a pointer type recurrence into a nonrecurrent pointer base
- // indexed by an integer recurrence.
-
- // If the GEP base pointer is a vector of pointers, abort.
- if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
- return false;
-
- const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
- const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
- if (FromBase == ToBase)
- return true;
-
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
- << *FromBase << " != " << *ToBase << "\n");
-
- return false;
- }
- return true;
-}
-
-static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
- SmallPtrSet<const Instruction *, 8> Visited;
- SmallVector<const Instruction *, 8> WorkList;
- Visited.insert(I);
- WorkList.push_back(I);
- while (!WorkList.empty()) {
- const Instruction *Curr = WorkList.pop_back_val();
- // This use is outside the loop, nothing to do.
- if (!L->contains(Curr))
- continue;
- // Do we assume it is a "hard" use which will not be eliminated easily?
- if (Curr->mayHaveSideEffects())
- return true;
- // Otherwise, add all its users to worklist.
- for (auto U : Curr->users()) {
- auto *UI = cast<Instruction>(U);
- if (Visited.insert(UI).second)
- WorkList.push_back(UI);
- }
- }
- return false;
-}
-
-// Collect information about PHI nodes which can be transformed in
-// rewriteLoopExitValues.
-struct RewritePhi {
- PHINode *PN; // For which PHI node is this replacement?
- unsigned Ith; // For which incoming value?
- const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting.
- Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
- bool HighCost; // Is this expansion a high-cost?
-
- Value *Expansion = nullptr;
- bool ValidRewrite = false;
-
- RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
- bool H)
- : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
- HighCost(H) {}
-};
-
-// Check whether it is possible to delete the loop after rewriting exit
-// value. If it is possible, ignore ReplaceExitValue and do rewriting
-// aggressively.
-static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
- BasicBlock *Preheader = L->getLoopPreheader();
- // If there is no preheader, the loop will not be deleted.
- if (!Preheader)
- return false;
-
- // In the LoopDeletion pass, a loop can be deleted even when ExitingBlocks.size() > 1.
- // We sidestep the multiple-ExitingBlocks case for simplicity.
- // TODO: If we see a testcase where a loop with multiple ExitingBlocks can be
- // deleted after exit value rewriting, we can enhance the logic here.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
- return false;
-
- BasicBlock *ExitBlock = ExitBlocks[0];
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
-
- // If the Incoming value of P is found in RewritePhiSet, we know it
- // could be rewritten to use a loop invariant value in transformation
- // phase later. Skip it in the loop invariant check below.
- bool found = false;
- for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
- unsigned i = Phi.Ith;
- if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
- found = true;
- break;
- }
- }
-
- Instruction *I;
- if (!found && (I = dyn_cast<Instruction>(Incoming)))
- if (!L->hasLoopInvariantOperands(I))
- return false;
-
- ++BI;
- }
-
- for (auto *BB : L->blocks())
- if (llvm::any_of(*BB, [](Instruction &I) {
- return I.mayHaveSideEffects();
- }))
- return false;
-
- return true;
-}
-
-int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
- ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
- SCEVExpander &Rewriter, DominatorTree *DT,
- ReplaceExitVal ReplaceExitValue,
- SmallVector<WeakTrackingVH, 16> &DeadInsts) {
- // Check a pre-condition.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Indvars did not preserve LCSSA!");
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- SmallVector<RewritePhi, 8> RewritePhiSet;
- // Find all values that are computed inside the loop, but used outside of it.
- // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
- // the exit blocks of the loop to find them.
- for (BasicBlock *ExitBB : ExitBlocks) {
- // If there are no PHI nodes in this exit block, then no values defined
- // inside the loop are used on this path, skip it.
- PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
- if (!PN) continue;
-
- unsigned NumPreds = PN->getNumIncomingValues();
-
- // Iterate over all of the PHI nodes.
- BasicBlock::iterator BBI = ExitBB->begin();
- while ((PN = dyn_cast<PHINode>(BBI++))) {
- if (PN->use_empty())
- continue; // dead use, don't replace it
-
- if (!SE->isSCEVable(PN->getType()))
- continue;
-
- // It's necessary to tell ScalarEvolution about this explicitly so that
- // it can walk the def-use list and forget all SCEVs, as it may not be
- // watching the PHI itself. Once the new exit value is in place, there
- // may not be a def-use connection between the loop and every instruction
- // which got a SCEVAddRecExpr for that loop.
- SE->forgetValue(PN);
-
- // Iterate over all of the values in all the PHI nodes.
- for (unsigned i = 0; i != NumPreds; ++i) {
- // If the value being merged in is not an integer or is not defined
- // in the loop, skip it.
- Value *InVal = PN->getIncomingValue(i);
- if (!isa<Instruction>(InVal))
- continue;
-
- // If this pred is for a subloop, not L itself, skip it.
- if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
- continue; // The Block is in a subloop, skip it.
-
- // Check that InVal is defined in the loop.
- Instruction *Inst = cast<Instruction>(InVal);
- if (!L->contains(Inst))
- continue;
-
- // Okay, this instruction has a user outside of the current loop
- // and varies predictably *inside* the loop. Evaluate the value it
- // contains when the loop exits, if possible. We prefer to start with
- // expressions which are true for all exits (so as to maximize
- // expression reuse by the SCEVExpander), but resort to per-exit
- // evaluation if that fails.
- const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE)) {
- // TODO: This should probably be sunk into SCEV in some way; maybe a
- // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
- // most SCEV expressions and other recurrence types (e.g. shift
- // recurrences). Is there existing code we can reuse?
- const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
- if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
- if (AddRec->getLoop() == L)
- ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE))
- continue;
- }
-
- // Computing the value outside of the loop brings no benefit if it is
- // definitely used inside the loop in a way which cannot be optimized
- // away. Avoid doing so unless we know we have a value which computes
- // the ExitValue already. TODO: This should be merged into the SCEV
- // expander to leverage its knowledge of existing expressions.
- if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
- !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst))
- continue;
-
- // Check if expansions of this SCEV would count as being high cost.
- bool HighCost = Rewriter.isHighCostExpansion(
- ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst);
-
- // Note that we must not perform expansions until after we have queried *all*
- // the costs, because if we perform a temporary expansion in between, one that
- // we might not intend to keep, said expansion *may* affect the cost
- // calculation of the next SCEVs we query, and the next SCEV may erroneously
- // get a smaller cost.
-
- // Collect all the candidate PHINodes to be rewritten.
- RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost);
- }
- }
- }
-
- // Now that we've done preliminary filtering and billed all the SCEV's,
- // we can perform the last sanity check - the expansion must be valid.
- for (RewritePhi &Phi : RewritePhiSet) {
- Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
- Phi.ExpansionPoint);
-
- LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
- << *(Phi.Expansion) << '\n'
- << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
-
- // FIXME: isValidRewrite() is a hack. it should be an assert, eventually.
- Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
- if (!Phi.ValidRewrite) {
- DeadInsts.push_back(Phi.Expansion);
- continue;
- }
-
-#ifndef NDEBUG
- // If we reuse an instruction from a loop which is neither L nor one of
- // its containing loops, we end up breaking LCSSA form for this loop by
- // creating a new use of its instruction.
- if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
- if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
- if (EVL != L)
- assert(EVL->contains(L) && "LCSSA breach detected!");
-#endif
- }
-
- // TODO: after isValidRewrite() is an assertion, evaluate whether
- // it is beneficial to change how we calculate high-cost:
- // if we have SCEV 'A' which we know we will expand, should we calculate
- // the cost of other SCEV's after expanding SCEV 'A',
- // thus potentially giving cost bonus to those other SCEV's?
-
- bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
- int NumReplaced = 0;
-
- // Transformation.
- for (const RewritePhi &Phi : RewritePhiSet) {
- if (!Phi.ValidRewrite)
- continue;
-
- PHINode *PN = Phi.PN;
- Value *ExitVal = Phi.Expansion;
-
- // Only do the rewrite when the ExitValue can be expanded cheaply.
- // If LoopCanBeDel is true, rewrite exit value aggressively.
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
- DeadInsts.push_back(ExitVal);
- continue;
- }
-
- NumReplaced++;
- Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
- PN->setIncomingValue(Phi.Ith, ExitVal);
-
- // If this instruction is dead now, delete it. Don't do it now to avoid
- // invalidating iterators.
- if (isInstructionTriviallyDead(Inst, TLI))
- DeadInsts.push_back(Inst);
-
- // Replace PN with ExitVal if that is legal and does not break LCSSA.
- if (PN->getNumIncomingValues() == 1 &&
- LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
- PN->replaceAllUsesWith(ExitVal);
- PN->eraseFromParent();
- }
- }
-
- // The insertion point instruction may have been deleted; clear it out
- // so that the rewriter doesn't trip over it later.
- Rewriter.clearInsertPoint();
- return NumReplaced;
-}
-
-/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
-/// \p OrigLoop.
-void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
- Loop *RemainderLoop, uint64_t UF) {
- assert(UF > 0 && "Zero unrolled factor is not supported");
- assert(UnrolledLoop != RemainderLoop &&
- "Unrolled and Remainder loops are expected to be distinct");
-
- // Get number of iterations in the original scalar loop.
- unsigned OrigLoopInvocationWeight = 0;
- Optional<unsigned> OrigAverageTripCount =
- getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
- if (!OrigAverageTripCount)
- return;
-
- // Calculate number of iterations in unrolled loop.
- unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
- // Calculate number of iterations for remainder loop.
- unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
-
- setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
- OrigLoopInvocationWeight);
- setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
- OrigLoopInvocationWeight);
-}
-
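// [Editor's note, not part of the diff] Worked example of the trip-count split
// in setProfileInfoAfterUnrolling above, on plain integers: with an estimated
// trip count of 100 and unroll factor UF = 3, the unrolled loop is credited
// 100 / 3 = 33 iterations and the remainder loop 100 % 3 = 1.
#include <cassert>

inline void unrollProfileSplit() {
  unsigned OrigAverageTripCount = 100;
  unsigned UF = 3;
  unsigned UnrolledAverageTripCount = OrigAverageTripCount / UF;  // 33
  unsigned RemainderAverageTripCount = OrigAverageTripCount % UF; // 1
  assert(UnrolledAverageTripCount == 33 && RemainderAverageTripCount == 1);
  (void)UnrolledAverageTripCount;
  (void)RemainderAverageTripCount;
}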
-/// Utility that implements appending of loops onto a worklist.
-/// Loops are added in preorder (analogous to reverse postorder for trees),
-/// and the worklist is processed LIFO.
-template <typename RangeT>
-void llvm::appendReversedLoopsToWorklist(
- RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) {
- // We use an internal worklist to build up the preorder traversal without
- // recursion.
- SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
-
- // We walk the initial sequence of loops in reverse because we generally want
- // to visit defs before uses and the worklist is LIFO.
- for (Loop *RootL : Loops) {
- assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
- assert(PreOrderWorklist.empty() &&
- "Must start with an empty preorder walk worklist.");
- PreOrderWorklist.push_back(RootL);
- do {
- Loop *L = PreOrderWorklist.pop_back_val();
- PreOrderWorklist.append(L->begin(), L->end());
- PreOrderLoops.push_back(L);
- } while (!PreOrderWorklist.empty());
-
- Worklist.insert(std::move(PreOrderLoops));
- PreOrderLoops.clear();
- }
-}
-
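// [Editor's note, not part of the diff] Standalone model of the inner loop of
// appendReversedLoopsToWorklist above: an explicit LIFO worklist flattens one
// root's nest into preorder (each loop before its subloops) without recursion.
// Because the inner worklist is LIFO, siblings come out in reverse of the
// order they were appended. ToyLoop is an illustrative stand-in for llvm::Loop.
#include <vector>

struct ToyLoop {
  std::vector<ToyLoop *> SubLoops;
};

inline std::vector<ToyLoop *> preorderFlatten(ToyLoop *Root) {
  std::vector<ToyLoop *> PreOrder, Work;
  Work.push_back(Root);
  do {
    ToyLoop *L = Work.back();
    Work.pop_back();
    Work.insert(Work.end(), L->SubLoops.begin(), L->SubLoops.end());
    PreOrder.push_back(L); // a loop is recorded before any of its subloops
  } while (!Work.empty());
  return PreOrder; // this is what gets handed to the priority worklist
}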
-template <typename RangeT>
-void llvm::appendLoopsToWorklist(RangeT &&Loops,
- SmallPriorityWorklist<Loop *, 4> &Worklist) {
- appendReversedLoopsToWorklist(reverse(Loops), Worklist);
-}
-
-template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>(
- ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist);
-
-template void
-llvm::appendLoopsToWorklist<Loop &>(Loop &L,
- SmallPriorityWorklist<Loop *, 4> &Worklist);
-
-void llvm::appendLoopsToWorklist(LoopInfo &LI,
- SmallPriorityWorklist<Loop *, 4> &Worklist) {
- appendReversedLoopsToWorklist(LI, Worklist);
-}
-
-Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
- LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = *LI->AllocateLoop();
- if (PL)
- PL->addChildLoop(&New);
- else
- LI->addTopLevelLoop(&New);
-
- if (LPM)
- LPM->addLoop(New);
-
- // Add all of the blocks in L to the new loop.
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- if (LI->getLoopFor(*I) == L)
- New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
-
- // Add all of the subloops to the new loop.
- for (Loop *I : *L)
- cloneLoop(I, &New, VM, LI, LPM);
-
- return &New;
-}
-
-/// IR Values for the lower and upper bounds of a pointer evolution. We
-/// need to use value-handles because SCEV expansion can invalidate previously
-/// expanded values. Thus expansion of a pointer can invalidate the bounds for
-/// a previous one.
-struct PointerBounds {
- TrackingVH<Value> Start;
- TrackingVH<Value> End;
-};
-
-/// Expand code for the lower and upper bound of the pointer group \p CG
-/// in \p TheLoop. \return the values for the bounds.
-static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
- Loop *TheLoop, Instruction *Loc,
- SCEVExpander &Exp, ScalarEvolution *SE) {
- // TODO: Add helper to retrieve pointers to CG.
- Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
- const SCEV *Sc = SE->getSCEV(Ptr);
-
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- LLVMContext &Ctx = Loc->getContext();
-
- // Use this type for pointer arithmetic.
- Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
- if (SE->isLoopInvariant(Sc, TheLoop)) {
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
- << *Ptr << "\n");
- // Ptr could be in the loop body. If so, expand a new one at the correct
- // location.
- Instruction *Inst = dyn_cast<Instruction>(Ptr);
- Value *NewPtr = (Inst && TheLoop->contains(Inst))
- ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
- : Ptr;
- // We must return a half-open range, which means incrementing Sc.
- const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
- Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
- return {NewPtr, NewPtrPlusOne};
- } else {
- Value *Start = nullptr, *End = nullptr;
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
- Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
- End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
- LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
- << "\n");
- return {Start, End};
- }
-}
-
-/// Turns a collection of checks into a collection of expanded upper and
-/// lower bounds for both pointers in the check.
-static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
-expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
- Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
- SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
-
- // Here we're relying on the SCEV Expander's cache to only emit code for the
- // same bounds once.
- transform(PointerChecks, std::back_inserter(ChecksWithBounds),
- [&](const RuntimePointerCheck &Check) {
- PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
- Second =
- expandBounds(Check.second, L, Loc, Exp, SE);
- return std::make_pair(First, Second);
- });
-
- return ChecksWithBounds;
-}
-
-std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
- Instruction *Loc, Loop *TheLoop,
- const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
- ScalarEvolution *SE) {
- // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
- // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
- const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- SCEVExpander Exp(*SE, DL, "induction");
- auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
-
- LLVMContext &Ctx = Loc->getContext();
- Instruction *FirstInst = nullptr;
- IRBuilder<> ChkBuilder(Loc);
- // Our instructions might fold to a constant.
- Value *MemoryRuntimeCheck = nullptr;
-
- // FIXME: this helper is currently a duplicate of the one in
- // LoopVectorize.cpp.
- auto GetFirstInst = [](Instruction *FirstInst, Value *V,
- Instruction *Loc) -> Instruction * {
- if (FirstInst)
- return FirstInst;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : nullptr;
- return nullptr;
- };
-
- for (const auto &Check : ExpandedChecks) {
- const PointerBounds &A = Check.first, &B = Check.second;
- // Check if two pointers (A and B) conflict where conflict is computed as:
- // start(A) <= end(B) && start(B) <= end(A)
- unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
- unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
-
- assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
- (AS1 == A.End->getType()->getPointerAddressSpace()) &&
- "Trying to bounds check pointers with different address spaces");
-
- Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
- Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
- Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
-
- // [A|B].Start points to the first accessed byte under base [A|B].
- // [A|B].End points to the last accessed byte, plus one.
- // There is no conflict when the intervals are disjoint:
- // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
- //
- // bound0 = (B.Start < A.End)
- // bound1 = (A.Start < B.End)
- // IsConflict = bound0 & bound1
- Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
- FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
- Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
- FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
- Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
- if (MemoryRuntimeCheck) {
- IsConflict =
- ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
- FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
- }
- MemoryRuntimeCheck = IsConflict;
- }
-
- if (!MemoryRuntimeCheck)
- return std::make_pair(nullptr, nullptr);
-
- // We have to do this trickery because the IRBuilder might fold the check to a
-  // constant expression in which case there is no Instruction anchored in
-  // the block.
- Instruction *Check =
- BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
- ChkBuilder.Insert(Check, "memcheck.conflict");
- FirstInst = GetFirstInst(FirstInst, Check, Loc);
- return std::make_pair(FirstInst, Check);
-}
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true;
+}
+
+static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ SmallVector<const Instruction *, 8> WorkList;
+ Visited.insert(I);
+ WorkList.push_back(I);
+ while (!WorkList.empty()) {
+ const Instruction *Curr = WorkList.pop_back_val();
+ // This use is outside the loop, nothing to do.
+ if (!L->contains(Curr))
+ continue;
+    // Is this a "hard" use which we assume will not be eliminated easily?
+ if (Curr->mayHaveSideEffects())
+ return true;
+ // Otherwise, add all its users to worklist.
+ for (auto U : Curr->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (Visited.insert(UI).second)
+ WorkList.push_back(UI);
+ }
+ }
+ return false;
+}
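The traversal above is the standard visited-set plus LIFO worklist idiom. A minimal standalone sketch of the same shape, using a hypothetical Node type instead of llvm::Instruction (illustrative only, not part of the LLVM sources):

#include <unordered_set>
#include <vector>

// Hypothetical stand-in for llvm::Instruction; "Users" plays the role of the
// def-use edges walked by hasHardUserWithinLoop().
struct Node {
  bool InsideLoop = true;
  bool HasSideEffects = false;
  std::vector<Node *> Users;
};

// Does any transitive user that stays inside the loop have side effects?
bool hasHardUser(Node *Root) {
  std::unordered_set<Node *> Visited{Root};
  std::vector<Node *> WorkList{Root};
  while (!WorkList.empty()) {
    Node *Curr = WorkList.back();
    WorkList.pop_back();
    if (!Curr->InsideLoop)
      continue;                     // Uses outside the loop are irrelevant.
    if (Curr->HasSideEffects)
      return true;                  // A "hard" user that cannot be removed.
    for (Node *U : Curr->Users)
      if (Visited.insert(U).second) // Enqueue each user exactly once.
        WorkList.push_back(U);
  }
  return false;
}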
+
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN; // For which PHI node is this replacement?
+ unsigned Ith; // For which incoming value?
+ const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting.
+ Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
+ bool HighCost; // Is this expansion a high-cost?
+
+ Value *Expansion = nullptr;
+ bool ValidRewrite = false;
+
+ RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
+ bool H)
+ : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
+ HighCost(H) {}
+};
+
+// Check whether it is possible to delete the loop after rewriting exit
+// value. If it is possible, ignore ReplaceExitValue and do rewriting
+// aggressively.
+static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ // If there is no preheader, the loop will not be deleted.
+ if (!Preheader)
+ return false;
+
+  // The LoopDeletion pass can delete a loop even when ExitingBlocks.size() > 1;
+  // we only handle the single exiting block case here for simplicity.
+  // TODO: If we see a testcase where a loop with multiple exiting blocks can
+  // be deleted after exit value rewriting, we can enhance the logic here.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
+ return false;
+
+ BasicBlock *ExitBlock = ExitBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // If the Incoming value of P is found in RewritePhiSet, we know it
+    // could be rewritten to use a loop invariant value in the transformation
+ // phase later. Skip it in the loop invariant check below.
+ bool found = false;
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+ unsigned i = Phi.Ith;
+ if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+ found = true;
+ break;
+ }
+ }
+
+ Instruction *I;
+ if (!found && (I = dyn_cast<Instruction>(Incoming)))
+ if (!L->hasLoopInvariantOperands(I))
+ return false;
+
+ ++BI;
+ }
+
+ for (auto *BB : L->blocks())
+ if (llvm::any_of(*BB, [](Instruction &I) {
+ return I.mayHaveSideEffects();
+ }))
+ return false;
+
+ return true;
+}
+
+int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
+ ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter, DominatorTree *DT,
+ ReplaceExitVal ReplaceExitValue,
+ SmallVector<WeakTrackingVH, 16> &DeadInsts) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ if (!SE->isSCEVable(PN->getType()))
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not integer or is not defined
+ // in the loop, skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible. We prefer to start with
+ // expressions which are true for all exits (so as to maximize
+ // expression reuse by the SCEVExpander), but resort to per-exit
+ // evaluation if that fails.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE)) {
+ // TODO: This should probably be sunk into SCEV in some way; maybe a
+ // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
+ // most SCEV expressions and other recurrence types (e.g. shift
+ // recurrences). Is there existing code we can reuse?
+ const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
+ if (AddRec->getLoop() == L)
+ ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+ }
+
+ // Computing the value outside of the loop brings no benefit if it is
+        // definitely used inside the loop in a way which cannot be optimized
+ // away. Avoid doing so unless we know we have a value which computes
+ // the ExitValue already. TODO: This should be merged into SCEV
+ // expander to leverage its knowledge of existing expressions.
+ if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
+ !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst))
+ continue;
+
+ // Check if expansions of this SCEV would count as being high cost.
+ bool HighCost = Rewriter.isHighCostExpansion(
+ ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst);
+
+ // Note that we must not perform expansions until after
+        // we query *all* the costs, because if we perform a temporary expansion
+        // in between, one that we might not intend to keep, said expansion
+        // *may* affect the cost calculation of the next SCEVs we'll query,
+        // and the next SCEV may erroneously get a smaller cost.
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost);
+ }
+ }
+ }
+
+  // Now that we've done preliminary filtering and billed all the SCEVs,
+ // we can perform the last sanity check - the expansion must be valid.
+ for (RewritePhi &Phi : RewritePhiSet) {
+ Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
+ Phi.ExpansionPoint);
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
+ << *(Phi.Expansion) << '\n'
+ << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
+
+    // FIXME: isValidRewrite() is a hack. It should be an assert, eventually.
+ Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
+ if (!Phi.ValidRewrite) {
+ DeadInsts.push_back(Phi.Expansion);
+ continue;
+ }
+
+#ifndef NDEBUG
+ // If we reuse an instruction from a loop which is neither L nor one of
+ // its containing loops, we end up breaking LCSSA form for this loop by
+ // creating a new use of its instruction.
+ if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
+ if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
+ if (EVL != L)
+ assert(EVL->contains(L) && "LCSSA breach detected!");
+#endif
+ }
+
+ // TODO: after isValidRewrite() is an assertion, evaluate whether
+ // it is beneficial to change how we calculate high-cost:
+ // if we have SCEV 'A' which we know we will expand, should we calculate
+ // the cost of other SCEV's after expanding SCEV 'A',
+ // thus potentially giving cost bonus to those other SCEV's?
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+ int NumReplaced = 0;
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Expansion;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ NumReplaced++;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+ return NumReplaced;
+}
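As a hand-written illustration of what this rewrite buys (simplified to source level, not compiler output): a value that only escapes the loop through an LCSSA PHI is replaced by its closed-form value at the exit, after which the loop itself may become deletable.

// Before: the loop is kept alive only to compute the final induction value,
// which escapes through an LCSSA PHI in the exit block.
unsigned finalIndexLoop(unsigned N) {
  unsigned I = 0;
  for (; I < N; I += 2) {
    // no side effects
  }
  return I;
}

// After exit-value rewriting (conceptually): the SCEV of I evaluated at the
// exit replaces the PHI, and the now side-effect-free loop can be deleted.
// (Overflow at the very top of the unsigned range is ignored here.)
unsigned finalIndexClosedForm(unsigned N) {
  return (N + 1) / 2 * 2; // smallest multiple of 2 that is >= N
}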
+
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop.
+void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+ Loop *RemainderLoop, uint64_t UF) {
+ assert(UF > 0 && "Zero unrolled factor is not supported");
+ assert(UnrolledLoop != RemainderLoop &&
+ "Unrolled and Remainder loops are expected to distinct");
+
+ // Get number of iterations in the original scalar loop.
+ unsigned OrigLoopInvocationWeight = 0;
+ Optional<unsigned> OrigAverageTripCount =
+ getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
+ if (!OrigAverageTripCount)
+ return;
+
+ // Calculate number of iterations in unrolled loop.
+ unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
+ // Calculate number of iterations for remainder loop.
+ unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
+
+ setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
+ OrigLoopInvocationWeight);
+ setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
+ OrigLoopInvocationWeight);
+}
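A small sketch of the bookkeeping above, using a hypothetical helper rather than an LLVM API: the estimated trip count is split between the unrolled body and the remainder loop, while the invocation weight is carried over unchanged.

#include <cassert>
#include <cstdint>
#include <utility>

// Returns {unrolled trip count, remainder trip count} for an original
// estimated trip count and unroll factor UF, mirroring the code above.
std::pair<unsigned, unsigned> splitTripCount(unsigned OrigTripCount,
                                             std::uint64_t UF) {
  assert(UF > 0 && "Zero unroll factor is not supported");
  unsigned Unrolled = static_cast<unsigned>(OrigTripCount / UF);  // unrolled loop
  unsigned Remainder = static_cast<unsigned>(OrigTripCount % UF); // remainder loop
  return {Unrolled, Remainder};
}

// Example: splitTripCount(10, 4) == {2, 2}; splitTripCount(8, 4) == {2, 0}.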
+
+/// Utility that implements appending of loops onto a worklist.
+/// Loops are added in preorder (analogous to reverse postorder for trees),
+/// and the worklist is processed LIFO.
+template <typename RangeT>
+void llvm::appendReversedLoopsToWorklist(
+ RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ // We use an internal worklist to build up the preorder traversal without
+ // recursion.
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+
+ // We walk the initial sequence of loops in reverse because we generally want
+ // to visit defs before uses and the worklist is LIFO.
+ for (Loop *RootL : Loops) {
+ assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
+ assert(PreOrderWorklist.empty() &&
+ "Must start with an empty preorder walk worklist.");
+ PreOrderWorklist.push_back(RootL);
+ do {
+ Loop *L = PreOrderWorklist.pop_back_val();
+ PreOrderWorklist.append(L->begin(), L->end());
+ PreOrderLoops.push_back(L);
+ } while (!PreOrderWorklist.empty());
+
+ Worklist.insert(std::move(PreOrderLoops));
+ PreOrderLoops.clear();
+ }
+}
+
+template <typename RangeT>
+void llvm::appendLoopsToWorklist(RangeT &&Loops,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(reverse(Loops), Worklist);
+}
+
+template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>(
+ ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+template void
+llvm::appendLoopsToWorklist<Loop &>(Loop &L,
+ SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+void llvm::appendLoopsToWorklist(LoopInfo &LI,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(LI, Worklist);
+}
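For readers new to the explicit-stack preorder walk used above, a minimal sketch on a hypothetical Tree type (not llvm::Loop); note that siblings come back out last-first because the stack is LIFO.

#include <vector>

// Hypothetical tree node standing in for a loop nest.
struct Tree {
  std::vector<Tree *> Children;
};

// Iterative preorder walk with an explicit stack, the same shape as the
// PreOrderWorklist loop above. Each node is recorded before its children.
std::vector<Tree *> preorder(Tree *Root) {
  std::vector<Tree *> Order;
  std::vector<Tree *> Stack{Root};
  while (!Stack.empty()) {
    Tree *N = Stack.back();
    Stack.pop_back();
    Order.push_back(N);
    Stack.insert(Stack.end(), N->Children.begin(), N->Children.end());
  }
  return Order;
}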
+
+Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop &New = *LI->AllocateLoop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+
+ if (LPM)
+ LPM->addLoop(New);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *I : *L)
+ cloneLoop(I, &New, VM, LI, LPM);
+
+ return &New;
+}
+
+/// IR Values for the lower and upper bounds of a pointer evolution. We
+/// need to use value-handles because SCEV expansion can invalidate previously
+/// expanded values. Thus expansion of a pointer can invalidate the bounds for
+/// a previous one.
+struct PointerBounds {
+ TrackingVH<Value> Start;
+ TrackingVH<Value> End;
+};
+
+/// Expand code for the lower and upper bound of the pointer group \p CG
+/// in \p TheLoop. \return the values for the bounds.
+static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
+ Loop *TheLoop, Instruction *Loc,
+ SCEVExpander &Exp, ScalarEvolution *SE) {
+ // TODO: Add helper to retrieve pointers to CG.
+ Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = Loc->getContext();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
+ if (SE->isLoopInvariant(Sc, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
+ << *Ptr << "\n");
+ // Ptr could be in the loop body. If so, expand a new one at the correct
+ // location.
+ Instruction *Inst = dyn_cast<Instruction>(Ptr);
+ Value *NewPtr = (Inst && TheLoop->contains(Inst))
+ ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
+ : Ptr;
+ // We must return a half-open range, which means incrementing Sc.
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+ Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+ return {NewPtr, NewPtrPlusOne};
+ } else {
+ Value *Start = nullptr, *End = nullptr;
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
+ End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
+ LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
+ << "\n");
+ return {Start, End};
+ }
+}
+
+/// Turns a collection of checks into a collection of expanded upper and
+/// lower bounds for both pointers in the check.
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
+expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
+ Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
+ SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
+
+ // Here we're relying on the SCEV Expander's cache to only emit code for the
+ // same bounds once.
+ transform(PointerChecks, std::back_inserter(ChecksWithBounds),
+ [&](const RuntimePointerCheck &Check) {
+ PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
+ Second =
+ expandBounds(Check.second, L, Loc, Exp, SE);
+ return std::make_pair(First, Second);
+ });
+
+ return ChecksWithBounds;
+}
+
+std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
+ Instruction *Loc, Loop *TheLoop,
+ const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
+ ScalarEvolution *SE) {
+ // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
+ // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
+
+ LLVMContext &Ctx = Loc->getContext();
+ Instruction *FirstInst = nullptr;
+ IRBuilder<> ChkBuilder(Loc);
+ // Our instructions might fold to a constant.
+ Value *MemoryRuntimeCheck = nullptr;
+
+ // FIXME: this helper is currently a duplicate of the one in
+ // LoopVectorize.cpp.
+ auto GetFirstInst = [](Instruction *FirstInst, Value *V,
+ Instruction *Loc) -> Instruction * {
+ if (FirstInst)
+ return FirstInst;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == Loc->getParent() ? I : nullptr;
+ return nullptr;
+ };
+
+ for (const auto &Check : ExpandedChecks) {
+ const PointerBounds &A = Check.first, &B = Check.second;
+ // Check if two pointers (A and B) conflict where conflict is computed as:
+ // start(A) <= end(B) && start(B) <= end(A)
+ unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
+ unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
+
+ assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
+ (AS1 == A.End->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
+
+ // [A|B].Start points to the first accessed byte under base [A|B].
+ // [A|B].End points to the last accessed byte, plus one.
+ // There is no conflict when the intervals are disjoint:
+ // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+ //
+ // bound0 = (B.Start < A.End)
+ // bound1 = (A.Start < B.End)
+ // IsConflict = bound0 & bound1
+ Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
+ FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
+ Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
+ FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ if (MemoryRuntimeCheck) {
+ IsConflict =
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ }
+ MemoryRuntimeCheck = IsConflict;
+ }
+
+ if (!MemoryRuntimeCheck)
+ return std::make_pair(nullptr, nullptr);
+
+ // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in
+  // the block.
+ Instruction *Check =
+ BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
+ ChkBuilder.Insert(Check, "memcheck.conflict");
+ FirstInst = GetFirstInst(FirstInst, Check, Loc);
+ return std::make_pair(FirstInst, Check);
+}
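The reduction built above is the classic half-open interval overlap test. A standalone sketch with plain integers instead of expanded pointer bounds (illustrative only):

#include <cstdint>

// [StartA, EndA) and [StartB, EndB) may conflict iff each range starts before
// the other one ends -- the bound0 & bound1 computation emitted above.
bool mayConflict(std::uint64_t StartA, std::uint64_t EndA,
                 std::uint64_t StartB, std::uint64_t EndB) {
  bool Bound0 = StartA < EndB; // A starts before B ends
  bool Bound1 = StartB < EndA; // B starts before A ends
  return Bound0 && Bound1;     // disjoint ranges fail at least one bound
}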
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
index 43a9f270c0..599bd1feb2 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LoopVersioning.cpp
@@ -1,41 +1,41 @@
-//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines a utility class to perform loop versioning. The versioned
-// loop speculates that otherwise may-aliasing memory accesses don't overlap and
-// emits checks to prove this.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
+//===- LoopVersioning.cpp - Utility to version a loop ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a utility class to perform loop versioning. The versioned
+// loop speculates that otherwise may-aliasing memory accesses don't overlap and
+// emits checks to prove this.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-
-static cl::opt<bool>
- AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
- cl::Hidden,
- cl::desc("Add no-alias annotation for instructions that "
- "are disambiguated by memchecks"));
-
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ AnnotateNoAlias("loop-version-annotate-no-alias", cl::init(true),
+ cl::Hidden,
+ cl::desc("Add no-alias annotation for instructions that "
+ "are disambiguated by memchecks"));
+
LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
ArrayRef<RuntimePointerCheck> Checks, Loop *L,
LoopInfo *LI, DominatorTree *DT,
@@ -43,217 +43,217 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
: VersionedLoop(L), NonVersionedLoop(nullptr),
AliasChecks(Checks.begin(), Checks.end()),
Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
- SE(SE) {
+ SE(SE) {
assert(L->getUniqueExitBlock() && "No single exit block");
-}
-
-void LoopVersioning::versionLoop(
- const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+}
+
+void LoopVersioning::versionLoop(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
assert(VersionedLoop->isLoopSimplifyForm() &&
"Loop is not in loop-simplify form");
- Instruction *FirstCheckInst;
- Instruction *MemRuntimeCheck;
- Value *SCEVRuntimeCheck;
- Value *RuntimeCheck = nullptr;
-
- // Add the memcheck in the original preheader (this is empty initially).
- BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
- const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
- std::tie(FirstCheckInst, MemRuntimeCheck) =
- addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
- AliasChecks, RtPtrChecking.getSE());
-
- SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
- "scev.check");
- SCEVRuntimeCheck =
+ Instruction *FirstCheckInst;
+ Instruction *MemRuntimeCheck;
+ Value *SCEVRuntimeCheck;
+ Value *RuntimeCheck = nullptr;
+
+ // Add the memcheck in the original preheader (this is empty initially).
+ BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
+ const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
+ std::tie(FirstCheckInst, MemRuntimeCheck) =
+ addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
+ AliasChecks, RtPtrChecking.getSE());
+
+ SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
+ "scev.check");
+ SCEVRuntimeCheck =
Exp.expandCodeForPredicate(&Preds, RuntimeCheckBB->getTerminator());
- auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
-
- // Discard the SCEV runtime check if it is always true.
- if (CI && CI->isZero())
- SCEVRuntimeCheck = nullptr;
-
- if (MemRuntimeCheck && SCEVRuntimeCheck) {
- RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
- SCEVRuntimeCheck, "lver.safe");
- if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
- I->insertBefore(RuntimeCheckBB->getTerminator());
- } else
- RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
-
- assert(RuntimeCheck && "called even though we don't need "
- "any runtime checks");
-
- // Rename the block to make the IR more readable.
- RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() +
- ".lver.check");
-
- // Create empty preheader for the loop (and after cloning for the
- // non-versioned loop).
- BasicBlock *PH =
- SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI,
- nullptr, VersionedLoop->getHeader()->getName() + ".ph");
-
- // Clone the loop including the preheader.
- //
- // FIXME: This does not currently preserve SimplifyLoop because the exit
- // block is a join between the two loops.
- SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
- NonVersionedLoop =
- cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap,
- ".lver.orig", LI, DT, NonVersionedLoopBlocks);
- remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
-
- // Insert the conditional branch based on the result of the memchecks.
- Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
- BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
- VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
- OrigTerm->eraseFromParent();
-
- // The loops merge in the original exit block. This is now dominated by the
- // memchecking block.
- DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB);
-
- // Adds the necessary PHI nodes for the versioned loops based on the
- // loop-defined values used outside of the loop.
- addPHINodes(DefsUsedOutside);
+ auto *CI = dyn_cast<ConstantInt>(SCEVRuntimeCheck);
+
+ // Discard the SCEV runtime check if it is always true.
+ if (CI && CI->isZero())
+ SCEVRuntimeCheck = nullptr;
+
+ if (MemRuntimeCheck && SCEVRuntimeCheck) {
+ RuntimeCheck = BinaryOperator::Create(Instruction::Or, MemRuntimeCheck,
+ SCEVRuntimeCheck, "lver.safe");
+ if (auto *I = dyn_cast<Instruction>(RuntimeCheck))
+ I->insertBefore(RuntimeCheckBB->getTerminator());
+ } else
+ RuntimeCheck = MemRuntimeCheck ? MemRuntimeCheck : SCEVRuntimeCheck;
+
+ assert(RuntimeCheck && "called even though we don't need "
+ "any runtime checks");
+
+ // Rename the block to make the IR more readable.
+ RuntimeCheckBB->setName(VersionedLoop->getHeader()->getName() +
+ ".lver.check");
+
+ // Create empty preheader for the loop (and after cloning for the
+ // non-versioned loop).
+ BasicBlock *PH =
+ SplitBlock(RuntimeCheckBB, RuntimeCheckBB->getTerminator(), DT, LI,
+ nullptr, VersionedLoop->getHeader()->getName() + ".ph");
+
+ // Clone the loop including the preheader.
+ //
+ // FIXME: This does not currently preserve SimplifyLoop because the exit
+ // block is a join between the two loops.
+ SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+ NonVersionedLoop =
+ cloneLoopWithPreheader(PH, RuntimeCheckBB, VersionedLoop, VMap,
+ ".lver.orig", LI, DT, NonVersionedLoopBlocks);
+ remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+ // Insert the conditional branch based on the result of the memchecks.
+ Instruction *OrigTerm = RuntimeCheckBB->getTerminator();
+ BranchInst::Create(NonVersionedLoop->getLoopPreheader(),
+ VersionedLoop->getLoopPreheader(), RuntimeCheck, OrigTerm);
+ OrigTerm->eraseFromParent();
+
+ // The loops merge in the original exit block. This is now dominated by the
+ // memchecking block.
+ DT->changeImmediateDominator(VersionedLoop->getExitBlock(), RuntimeCheckBB);
+
+ // Adds the necessary PHI nodes for the versioned loops based on the
+ // loop-defined values used outside of the loop.
+ addPHINodes(DefsUsedOutside);
formDedicatedExitBlocks(NonVersionedLoop, DT, LI, nullptr, true);
formDedicatedExitBlocks(VersionedLoop, DT, LI, nullptr, true);
assert(NonVersionedLoop->isLoopSimplifyForm() &&
VersionedLoop->isLoopSimplifyForm() &&
"The versioned loops should be in simplify form.");
-}
-
-void LoopVersioning::addPHINodes(
- const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
- BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
- assert(PHIBlock && "No single successor to loop exit block");
- PHINode *PN;
-
- // First add a single-operand PHI for each DefsUsedOutside if one does not
-  // exist yet.
- for (auto *Inst : DefsUsedOutside) {
- // See if we have a single-operand PHI with the value defined by the
- // original loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- if (PN->getIncomingValue(0) == Inst)
- break;
- }
- // If not create it.
- if (!PN) {
- PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
- &PHIBlock->front());
- SmallVector<User*, 8> UsersToUpdate;
- for (User *U : Inst->users())
- if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
- UsersToUpdate.push_back(U);
- for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(Inst, PN);
- PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
- }
- }
-
- // Then for each PHI add the operand for the edge from the cloned loop.
- for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
- assert(PN->getNumOperands() == 1 &&
- "Exit block should only have on predecessor");
-
-    // If the definition was cloned, use that; otherwise use the same value.
- Value *ClonedValue = PN->getIncomingValue(0);
- auto Mapped = VMap.find(ClonedValue);
- if (Mapped != VMap.end())
- ClonedValue = Mapped->second;
-
- PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
- }
-}
-
-void LoopVersioning::prepareNoAliasMetadata() {
- // We need to turn the no-alias relation between pointer checking groups into
- // no-aliasing annotations between instructions.
- //
- // We accomplish this by mapping each pointer checking group (a set of
- // pointers memchecked together) to an alias scope and then also mapping each
- // group to the list of scopes it can't alias.
-
- const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
- LLVMContext &Context = VersionedLoop->getHeader()->getContext();
-
- // First allocate an aliasing scope for each pointer checking group.
- //
- // While traversing through the checking groups in the loop, also create a
- // reverse map from pointers to the pointer checking group they were assigned
- // to.
- MDBuilder MDB(Context);
- MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
-
- for (const auto &Group : RtPtrChecking->CheckingGroups) {
- GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
-
- for (unsigned PtrIdx : Group.Members)
- PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
- }
-
- // Go through the checks and for each pointer group, collect the scopes for
- // each non-aliasing pointer group.
- DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>>
- GroupToNonAliasingScopes;
-
- for (const auto &Check : AliasChecks)
- GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
-
- // Finally, transform the above to actually map to scope list which is what
- // the metadata uses.
-
- for (auto Pair : GroupToNonAliasingScopes)
- GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
-}
-
-void LoopVersioning::annotateLoopWithNoAlias() {
- if (!AnnotateNoAlias)
- return;
-
- // First prepare the maps.
- prepareNoAliasMetadata();
-
- // Add the scope and no-alias metadata to the instructions.
- for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
- annotateInstWithNoAlias(I);
- }
-}
-
-void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
- const Instruction *OrigInst) {
- if (!AnnotateNoAlias)
- return;
-
- LLVMContext &Context = VersionedLoop->getHeader()->getContext();
- const Value *Ptr = isa<LoadInst>(OrigInst)
- ? cast<LoadInst>(OrigInst)->getPointerOperand()
- : cast<StoreInst>(OrigInst)->getPointerOperand();
-
- // Find the group for the pointer and then add the scope metadata.
- auto Group = PtrToGroup.find(Ptr);
- if (Group != PtrToGroup.end()) {
- VersionedInst->setMetadata(
- LLVMContext::MD_alias_scope,
- MDNode::concatenate(
- VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
- MDNode::get(Context, GroupToScope[Group->second])));
-
- // Add the no-alias metadata.
- auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
- if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
- VersionedInst->setMetadata(
- LLVMContext::MD_noalias,
- MDNode::concatenate(
- VersionedInst->getMetadata(LLVMContext::MD_noalias),
- NonAliasingScopeList->second));
- }
-}
-
-namespace {
+}
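In source-level terms, versionLoop() produces the following control-flow shape (a hand-written sketch, not pass output; saxpy and the inlined overlap test are made up for illustration):

#include <cstdint>

void saxpy(float *A, const float *B, float K, unsigned N) {
  // Stand-in for the emitted runtime check ("lver.safe"): do the accessed
  // ranges [A, A+N) and [B, B+N) possibly overlap?
  auto Addr = [](const void *P) { return reinterpret_cast<std::uintptr_t>(P); };
  bool MayConflict = Addr(A) < Addr(B + N) && Addr(B) < Addr(A + N);
  if (!MayConflict) {
    for (unsigned I = 0; I < N; ++I) // versioned loop: ranges proven disjoint,
      A[I] += K * B[I];              // safe to optimize aggressively
  } else {
    for (unsigned I = 0; I < N; ++I) // fall back to the untouched original
      A[I] += K * B[I];
  }
}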
+
+void LoopVersioning::addPHINodes(
+ const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
+ BasicBlock *PHIBlock = VersionedLoop->getExitBlock();
+ assert(PHIBlock && "No single successor to loop exit block");
+ PHINode *PN;
+
+ // First add a single-operand PHI for each DefsUsedOutside if one does not
+  // exist yet.
+ for (auto *Inst : DefsUsedOutside) {
+ // See if we have a single-operand PHI with the value defined by the
+ // original loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ if (PN->getIncomingValue(0) == Inst)
+ break;
+ }
+ // If not create it.
+ if (!PN) {
+ PN = PHINode::Create(Inst->getType(), 2, Inst->getName() + ".lver",
+ &PHIBlock->front());
+ SmallVector<User*, 8> UsersToUpdate;
+ for (User *U : Inst->users())
+ if (!VersionedLoop->contains(cast<Instruction>(U)->getParent()))
+ UsersToUpdate.push_back(U);
+ for (User *U : UsersToUpdate)
+ U->replaceUsesOfWith(Inst, PN);
+ PN->addIncoming(Inst, VersionedLoop->getExitingBlock());
+ }
+ }
+
+ // Then for each PHI add the operand for the edge from the cloned loop.
+ for (auto I = PHIBlock->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+ assert(PN->getNumOperands() == 1 &&
+ "Exit block should only have on predecessor");
+
+    // If the definition was cloned, use that; otherwise use the same value.
+ Value *ClonedValue = PN->getIncomingValue(0);
+ auto Mapped = VMap.find(ClonedValue);
+ if (Mapped != VMap.end())
+ ClonedValue = Mapped->second;
+
+ PN->addIncoming(ClonedValue, NonVersionedLoop->getExitingBlock());
+ }
+}
+
+void LoopVersioning::prepareNoAliasMetadata() {
+ // We need to turn the no-alias relation between pointer checking groups into
+ // no-aliasing annotations between instructions.
+ //
+ // We accomplish this by mapping each pointer checking group (a set of
+ // pointers memchecked together) to an alias scope and then also mapping each
+ // group to the list of scopes it can't alias.
+
+ const RuntimePointerChecking *RtPtrChecking = LAI.getRuntimePointerChecking();
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+
+ // First allocate an aliasing scope for each pointer checking group.
+ //
+ // While traversing through the checking groups in the loop, also create a
+ // reverse map from pointers to the pointer checking group they were assigned
+ // to.
+ MDBuilder MDB(Context);
+ MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
+
+ for (const auto &Group : RtPtrChecking->CheckingGroups) {
+ GroupToScope[&Group] = MDB.createAnonymousAliasScope(Domain);
+
+ for (unsigned PtrIdx : Group.Members)
+ PtrToGroup[RtPtrChecking->getPointerInfo(PtrIdx).PointerValue] = &Group;
+ }
+
+ // Go through the checks and for each pointer group, collect the scopes for
+ // each non-aliasing pointer group.
+ DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>>
+ GroupToNonAliasingScopes;
+
+ for (const auto &Check : AliasChecks)
+ GroupToNonAliasingScopes[Check.first].push_back(GroupToScope[Check.second]);
+
+  // Finally, transform the above to actually map to a scope list, which is
+  // what the metadata uses.
+
+ for (auto Pair : GroupToNonAliasingScopes)
+ GroupToNonAliasingScopeList[Pair.first] = MDNode::get(Context, Pair.second);
+}
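A schematic of the bookkeeping in prepareNoAliasMetadata(), with hypothetical plain types instead of the MDNode machinery: one scope per checking group, and each runtime check contributes the second group's scope to the first group's no-alias list.

#include <map>
#include <utility>
#include <vector>

using Scope = int;          // stands in for an anonymous alias scope MDNode
struct Group { Scope S; };  // one pointer-checking group owns one scope

// Mirrors the GroupToNonAliasingScopes loop above: for every runtime check
// (First vs Second), Second's scope is added to First's no-alias list.
std::map<const Group *, std::vector<Scope>> collectNoAliasScopes(
    const std::vector<std::pair<const Group *, const Group *>> &Checks) {
  std::map<const Group *, std::vector<Scope>> NoAliasScopes;
  for (const auto &Check : Checks)
    NoAliasScopes[Check.first].push_back(Check.second->S);
  return NoAliasScopes;
}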
+
+void LoopVersioning::annotateLoopWithNoAlias() {
+ if (!AnnotateNoAlias)
+ return;
+
+ // First prepare the maps.
+ prepareNoAliasMetadata();
+
+ // Add the scope and no-alias metadata to the instructions.
+ for (Instruction *I : LAI.getDepChecker().getMemoryInstructions()) {
+ annotateInstWithNoAlias(I);
+ }
+}
+
+void LoopVersioning::annotateInstWithNoAlias(Instruction *VersionedInst,
+ const Instruction *OrigInst) {
+ if (!AnnotateNoAlias)
+ return;
+
+ LLVMContext &Context = VersionedLoop->getHeader()->getContext();
+ const Value *Ptr = isa<LoadInst>(OrigInst)
+ ? cast<LoadInst>(OrigInst)->getPointerOperand()
+ : cast<StoreInst>(OrigInst)->getPointerOperand();
+
+ // Find the group for the pointer and then add the scope metadata.
+ auto Group = PtrToGroup.find(Ptr);
+ if (Group != PtrToGroup.end()) {
+ VersionedInst->setMetadata(
+ LLVMContext::MD_alias_scope,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_alias_scope),
+ MDNode::get(Context, GroupToScope[Group->second])));
+
+ // Add the no-alias metadata.
+ auto NonAliasingScopeList = GroupToNonAliasingScopeList.find(Group->second);
+ if (NonAliasingScopeList != GroupToNonAliasingScopeList.end())
+ VersionedInst->setMetadata(
+ LLVMContext::MD_noalias,
+ MDNode::concatenate(
+ VersionedInst->getMetadata(LLVMContext::MD_noalias),
+ NonAliasingScopeList->second));
+ }
+}
+
+namespace {
bool runImpl(LoopInfo *LI, function_ref<const LoopAccessInfo &(Loop &)> GetLAA,
DominatorTree *DT, ScalarEvolution *SE) {
// Build up a worklist of inner-loops to version. This is necessary as the
@@ -288,59 +288,59 @@ bool runImpl(LoopInfo *LI, function_ref<const LoopAccessInfo &(Loop &)> GetLAA,
return Changed;
}
-/// Also expose this as a pass. Currently this is only used for
-/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
-/// array accesses from the loop.
+/// Also expose this as a pass. Currently this is only used for
+/// unit-testing. It adds all memchecks necessary to remove all may-aliasing
+/// array accesses from the loop.
class LoopVersioningLegacyPass : public FunctionPass {
-public:
+public:
LoopVersioningLegacyPass() : FunctionPass(ID) {
initializeLoopVersioningLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
auto GetLAA = [&](Loop &L) -> const LoopAccessInfo & {
return getAnalysis<LoopAccessLegacyAnalysis>().getInfo(&L);
};
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
-
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
return runImpl(LI, GetLAA, DT, SE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- }
-
- static char ID;
-};
-}
-
-#define LVER_OPTION "loop-versioning"
-#define DEBUG_TYPE LVER_OPTION
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ static char ID;
+};
+}
+
+#define LVER_OPTION "loop-versioning"
+#define DEBUG_TYPE LVER_OPTION
+
char LoopVersioningLegacyPass::ID;
-static const char LVer_name[] = "Loop Versioning";
-
+static const char LVer_name[] = "Loop Versioning";
+
INITIALIZE_PASS_BEGIN(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false,
false)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(LoopVersioningLegacyPass, LVER_OPTION, LVer_name, false,
false)
-
-namespace llvm {
+
+namespace llvm {
FunctionPass *createLoopVersioningLegacyPass() {
return new LoopVersioningLegacyPass();
-}
+}
PreservedAnalyses LoopVersioningPass::run(Function &F,
FunctionAnalysisManager &AM) {
@@ -365,5 +365,5 @@ PreservedAnalyses LoopVersioningPass::run(Function &F,
if (runImpl(&LI, GetLAA, &DT, &SE))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
-}
+}
} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
index 39f55a9ca6..fe0ff5899d 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerInvoke.cpp
@@ -1,97 +1,97 @@
-//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This transformation is designed for use by code generators which do not yet
-// support stack unwinding. This pass converts 'invoke' instructions to 'call'
-// instructions, so that any exception-handling 'landingpad' blocks become dead
-// code (which can be removed by running the '-simplifycfg' pass afterwards).
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LowerInvoke.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "lowerinvoke"
-
-STATISTIC(NumInvokes, "Number of invokes replaced");
-
-namespace {
- class LowerInvokeLegacyPass : public FunctionPass {
- public:
- static char ID; // Pass identification, replacement for typeid
- explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
- initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
- }
- bool runOnFunction(Function &F) override;
- };
-}
-
-char LowerInvokeLegacyPass::ID = 0;
-INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
- "Lower invoke and unwind, for unwindless code generators",
- false, false)
-
-static bool runImpl(Function &F) {
- bool Changed = false;
- for (BasicBlock &BB : F)
- if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+//===- LowerInvoke.cpp - Eliminate Invoke instructions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which do not yet
+// support stack unwinding. This pass converts 'invoke' instructions to 'call'
+// instructions, so that any exception-handling 'landingpad' blocks become dead
+// code (which can be removed by running the '-simplifycfg' pass afterwards).
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerInvoke.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lowerinvoke"
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+
+namespace {
+ class LowerInvokeLegacyPass : public FunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerInvokeLegacyPass() : FunctionPass(ID) {
+ initializeLowerInvokeLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+ };
+}
+
+char LowerInvokeLegacyPass::ID = 0;
+INITIALIZE_PASS(LowerInvokeLegacyPass, "lowerinvoke",
+ "Lower invoke and unwind, for unwindless code generators",
+ false, false)
+
+static bool runImpl(Function &F) {
+ bool Changed = false;
+ for (BasicBlock &BB : F)
+ if (InvokeInst *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
SmallVector<Value *, 16> CallArgs(II->args());
- SmallVector<OperandBundleDef, 1> OpBundles;
- II->getOperandBundlesAsDefs(OpBundles);
- // Insert a normal call instruction...
- CallInst *NewCall =
- CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
- CallArgs, OpBundles, "", II);
- NewCall->takeName(II);
- NewCall->setCallingConv(II->getCallingConv());
- NewCall->setAttributes(II->getAttributes());
- NewCall->setDebugLoc(II->getDebugLoc());
- II->replaceAllUsesWith(NewCall);
-
- // Insert an unconditional branch to the normal destination.
- BranchInst::Create(II->getNormalDest(), II);
-
- // Remove any PHI node entries from the exception destination.
- II->getUnwindDest()->removePredecessor(&BB);
-
- // Remove the invoke instruction now.
- BB.getInstList().erase(II);
-
- ++NumInvokes;
- Changed = true;
- }
- return Changed;
-}
-
-bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
- return runImpl(F);
-}
-
-namespace llvm {
-char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
-
-// Public Interface To the LowerInvoke pass.
-FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
-
-PreservedAnalyses LowerInvokePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- bool Changed = runImpl(F);
- if (!Changed)
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-}
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ // Insert a normal call instruction...
+ CallInst *NewCall =
+ CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
+ CallArgs, OpBundles, "", II);
+ NewCall->takeName(II);
+ NewCall->setCallingConv(II->getCallingConv());
+ NewCall->setAttributes(II->getAttributes());
+ NewCall->setDebugLoc(II->getDebugLoc());
+ II->replaceAllUsesWith(NewCall);
+
+ // Insert an unconditional branch to the normal destination.
+ BranchInst::Create(II->getNormalDest(), II);
+
+ // Remove any PHI node entries from the exception destination.
+ II->getUnwindDest()->removePredecessor(&BB);
+
+ // Remove the invoke instruction now.
+ BB.getInstList().erase(II);
+
+ ++NumInvokes;
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool LowerInvokeLegacyPass::runOnFunction(Function &F) {
+ return runImpl(F);
+}
+
+namespace llvm {
+char &LowerInvokePassID = LowerInvokeLegacyPass::ID;
+
+// Public Interface To the LowerInvoke pass.
+FunctionPass *createLowerInvokePass() { return new LowerInvokeLegacyPass(); }
+
+PreservedAnalyses LowerInvokePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = runImpl(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+}
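
The hunk above restores LowerInvoke unchanged in behaviour: each invoke is rewritten into a plain call, an unconditional branch to the normal destination, and the unwind destination loses its PHI entries for this block. As a hedged illustration only (not part of this patch), the new-PM LowerInvokePass restored here could be driven over a module roughly as follows; the helper name lowerAllInvokes and the way the Module is obtained are assumptions:

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/LowerInvoke.h"

// Hypothetical driver: run LowerInvokePass over every function definition in M.
static void lowerAllInvokes(llvm::Module &M) {
  llvm::PassBuilder PB;
  llvm::FunctionAnalysisManager FAM;
  PB.registerFunctionAnalyses(FAM);      // the pass needs no analyses of its own,
                                         // but the manager must still be populated
  llvm::FunctionPassManager FPM;
  FPM.addPass(llvm::LowerInvokePass());  // invoke -> call + br, as in runImpl() above
  for (llvm::Function &F : M)
    if (!F.isDeclaration())
      FPM.run(F, FAM);
}
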
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 389c5108cb..616b4e8eb0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -1,467 +1,467 @@
-//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-using namespace llvm;
-
-void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, ConstantInt *CopyLen,
- Align SrcAlign, Align DstAlign,
- bool SrcIsVolatile, bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
- // No need to expand zero length copies.
- if (CopyLen->isZero())
- return;
-
- BasicBlock *PreLoopBB = InsertBefore->getParent();
- BasicBlock *PostLoopBB = nullptr;
- Function *ParentFunc = PreLoopBB->getParent();
- LLVMContext &Ctx = PreLoopBB->getContext();
- const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
-
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- Type *TypeOfCopyLen = CopyLen->getType();
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
-
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
- uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
-
- if (LoopEndCount != 0) {
-    // Split the block and create the main copy loop.
- PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
- BasicBlock *LoopBB =
- BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
- PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
-
- IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
-
- // Cast the Src and Dst pointers to pointers to the loop operand type (if
- // needed).
- PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
- PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
- if (SrcAddr->getType() != SrcOpType) {
- SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
- }
- if (DstAddr->getType() != DstOpType) {
- DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
- }
-
- Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
- Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
- // Loop Body
- Value *SrcGEP =
- LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
- PartSrcAlign, SrcIsVolatile);
- Value *DstGEP =
- LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- // Create the loop branch condition.
- Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
- LoopBB, PostLoopBB);
- }
-
- uint64_t BytesCopied = LoopEndCount * LoopOpSize;
- uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
- if (RemainingBytes) {
- IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
- : InsertBefore);
-
- SmallVector<Type *, 5> RemainingOps;
- TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAS, DstAS, SrcAlign.value(),
- DstAlign.value());
-
- for (auto OpTy : RemainingOps) {
- Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
- Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
-
-      // Calculate the new index
- unsigned OperandSize = DL.getTypeStoreSize(OpTy);
- uint64_t GepIndex = BytesCopied / OperandSize;
- assert(GepIndex * OperandSize == BytesCopied &&
- "Division should have no Remainder!");
- // Cast source to operand type and load
- PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
- Value *CastedSrc = SrcAddr->getType() == SrcPtrType
- ? SrcAddr
- : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
- Value *SrcGEP = RBuilder.CreateInBoundsGEP(
- OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
- Value *Load =
- RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
-
- // Cast destination to operand type and store.
- PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
- Value *CastedDst = DstAddr->getType() == DstPtrType
- ? DstAddr
- : RBuilder.CreateBitCast(DstAddr, DstPtrType);
- Value *DstGEP = RBuilder.CreateInBoundsGEP(
- OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
- RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- BytesCopied += OperandSize;
- }
- }
- assert(BytesCopied == CopyLen->getZExtValue() &&
- "Bytes copied should match size in the call!");
-}
-
-void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
- Value *SrcAddr, Value *DstAddr,
- Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile,
- const TargetTransformInfo &TTI) {
- BasicBlock *PreLoopBB = InsertBefore->getParent();
- BasicBlock *PostLoopBB =
- PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
-
- Function *ParentFunc = PreLoopBB->getParent();
- const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
- LLVMContext &Ctx = PreLoopBB->getContext();
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
- Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
- Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
- unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
-
- IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
-
- PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
- PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
- if (SrcAddr->getType() != SrcOpType) {
- SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
- }
- if (DstAddr->getType() != DstOpType) {
- DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
- }
-
- // Calculate the loop trip count, and remaining bytes to copy after the loop.
- Type *CopyLenType = CopyLen->getType();
- IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
- assert(ILengthType &&
- "expected size argument to memcpy to be an integer type!");
- Type *Int8Type = Type::getInt8Ty(Ctx);
- bool LoopOpIsInt8 = LoopOpType == Int8Type;
- ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
- Value *RuntimeLoopCount = LoopOpIsInt8 ?
- CopyLen :
- PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
- BasicBlock *LoopBB =
- BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
- IRBuilder<> LoopBuilder(LoopBB);
-
- Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
- Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
-
- PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
- LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
-
- Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
- SrcIsVolatile);
- Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- if (!LoopOpIsInt8) {
-    // Compute the residual byte count and the bytes covered by the main loop.
- Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
- Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
-
- // Loop body for the residual copy.
- BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
- PreLoopBB->getParent(),
- PostLoopBB);
- // Residual loop header.
- BasicBlock *ResHeaderBB = BasicBlock::Create(
- Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
-
-    // Need to update the pre-loop basic block to branch to the correct place:
-    // branch to the main loop if the count is non-zero, branch to the residual
-    // loop if the copy size is smaller than one iteration of the main loop but
-    // non-zero, and finally branch past the residual loop if the memcpy
-    // size is zero.
- ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
- PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
- LoopBB, ResHeaderBB);
- PreLoopBB->getTerminator()->eraseFromParent();
-
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
- ResHeaderBB);
-
- // Determine if we need to branch to the residual loop or bypass it.
- IRBuilder<> RHBuilder(ResHeaderBB);
- RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
- ResLoopBB, PostLoopBB);
-
- // Copy the residual with single byte load/store loop.
- IRBuilder<> ResBuilder(ResLoopBB);
- PHINode *ResidualIndex =
- ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
- ResidualIndex->addIncoming(Zero, ResHeaderBB);
-
- Value *SrcAsInt8 =
- ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
- Value *DstAsInt8 =
- ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
- Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
- Value *SrcGEP =
- ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
- Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
- SrcIsVolatile);
- Value *DstGEP =
- ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
- ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
-
- Value *ResNewIndex =
- ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
- ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
-
- // Create the loop branch condition.
- ResBuilder.CreateCondBr(
- ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
- PostLoopBB);
- } else {
- // In this case the loop operand type was a byte, and there is no need for a
- // residual loop to copy the remaining memory after the main loop.
- // We do however need to patch up the control flow by creating the
- // terminators for the preloop block and the memcpy loop.
- ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
- PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
- LoopBB, PostLoopBB);
- PreLoopBB->getTerminator()->eraseFromParent();
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
- PostLoopBB);
- }
-}
-
-// Lower memmove to IR. memmove is required to correctly copy overlapping memory
-// regions; therefore, it has to check the relative positions of the source and
-// destination pointers and choose the copy direction accordingly.
-//
-// The code below is an IR rendition of this C function:
-//
-// void* memmove(void* dst, const void* src, size_t n) {
-// unsigned char* d = dst;
-// const unsigned char* s = src;
-// if (s < d) {
-// // copy backwards
-// while (n--) {
-// d[n] = s[n];
-// }
-// } else {
-// // copy forward
-// for (size_t i = 0; i < n; ++i) {
-// d[i] = s[i];
-// }
-// }
-// return dst;
-// }
-static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
- Value *DstAddr, Value *CopyLen, Align SrcAlign,
- Align DstAlign, bool SrcIsVolatile,
- bool DstIsVolatile) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = InsertBefore->getParent();
- Function *F = OrigBB->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
-
- Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
-
-  // Create a comparison of src and dst, based on which we jump to either
- // the forward-copy part of the function (if src >= dst) or the backwards-copy
- // part (if src < dst).
- // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
- // structure. Its block terminators (unconditional branches) are replaced by
- // the appropriate conditional branches when the loop is built.
- ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
- SrcAddr, DstAddr, "compare_src_dst");
- Instruction *ThenTerm, *ElseTerm;
- SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
- &ElseTerm);
-
- // Each part of the function consists of two blocks:
- // copy_backwards: used to skip the loop when n == 0
- // copy_backwards_loop: the actual backwards loop BB
- // copy_forward: used to skip the loop when n == 0
- // copy_forward_loop: the actual forward loop BB
- BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
- CopyBackwardsBB->setName("copy_backwards");
- BasicBlock *CopyForwardBB = ElseTerm->getParent();
- CopyForwardBB->setName("copy_forward");
- BasicBlock *ExitBB = InsertBefore->getParent();
- ExitBB->setName("memmove_done");
-
- unsigned PartSize = DL.getTypeStoreSize(EltTy);
- Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
- Align PartDstAlign(commonAlignment(DstAlign, PartSize));
-
- // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
- // between both backwards and forward copy clauses.
- ICmpInst *CompareN =
- new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
- ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
-
- // Copying backwards.
- BasicBlock *LoopBB =
- BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- Value *IndexPtr = LoopBuilder.CreateSub(
- LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateAlignedLoad(
- EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- PartSrcAlign, "element");
- LoopBuilder.CreateAlignedStore(
- Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
- PartDstAlign);
- LoopBuilder.CreateCondBr(
- LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
- ExitBB, LoopBB);
- LoopPhi->addIncoming(IndexPtr, LoopBB);
- LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
- BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
- ThenTerm->eraseFromParent();
-
- // Copying forward.
- BasicBlock *FwdLoopBB =
- BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
- IRBuilder<> FwdLoopBuilder(FwdLoopBB);
- PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
- Value *FwdElement =
- FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
- Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
- FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
- Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
- FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
- FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
- ExitBB, FwdLoopBB);
- FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
- FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
- BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
- ElseTerm->eraseFromParent();
-}
-
-static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
- Value *CopyLen, Value *SetValue, Align DstAlign,
- bool IsVolatile) {
- Type *TypeOfCopyLen = CopyLen->getType();
- BasicBlock *OrigBB = InsertBefore->getParent();
- Function *F = OrigBB->getParent();
- const DataLayout &DL = F->getParent()->getDataLayout();
- BasicBlock *NewBB =
- OrigBB->splitBasicBlock(InsertBefore, "split");
- BasicBlock *LoopBB
- = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
-
- IRBuilder<> Builder(OrigBB->getTerminator());
-
- // Cast pointer to the type of value getting stored
- unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- DstAddr = Builder.CreateBitCast(DstAddr,
- PointerType::get(SetValue->getType(), dstAS));
-
- Builder.CreateCondBr(
- Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
- LoopBB);
- OrigBB->getTerminator()->eraseFromParent();
-
- unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
- Align PartAlign(commonAlignment(DstAlign, PartSize));
-
- IRBuilder<> LoopBuilder(LoopBB);
- PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
- LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
- LoopBuilder.CreateAlignedStore(
- SetValue,
- LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
- PartAlign, IsVolatile);
-
- Value *NewIndex =
- LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
- LoopIndex->addIncoming(NewIndex, LoopBB);
-
- LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
- NewBB);
-}
-
-void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
- const TargetTransformInfo &TTI) {
- if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
- createMemCpyLoopKnownSize(
- /* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ CI,
- /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* TargetTransformInfo */ TTI);
- } else {
- createMemCpyLoopUnknownSize(
- /* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ Memcpy->getLength(),
- /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
-        /* TargetTransformInfo */ TTI);
- }
-}
-
-void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
- createMemMoveLoop(/* InsertBefore */ Memmove,
- /* SrcAddr */ Memmove->getRawSource(),
- /* DstAddr */ Memmove->getRawDest(),
- /* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
- /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
- /* SrcIsVolatile */ Memmove->isVolatile(),
- /* DstIsVolatile */ Memmove->isVolatile());
-}
-
-void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
- createMemSetLoop(/* InsertBefore */ Memset,
- /* DstAddr */ Memset->getRawDest(),
- /* CopyLen */ Memset->getLength(),
- /* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlign().valueOrOne(),
- Memset->isVolatile());
-}
+//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+using namespace llvm;
+
+void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, ConstantInt *CopyLen,
+ Align SrcAlign, Align DstAlign,
+ bool SrcIsVolatile, bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ // No need to expand zero length copies.
+ if (CopyLen->isZero())
+ return;
+
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB = nullptr;
+ Function *ParentFunc = PreLoopBB->getParent();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *TypeOfCopyLen = CopyLen->getType();
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+ uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
+
+ if (LoopEndCount != 0) {
+    // Split the block and create the main copy loop.
+ PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "load-store-loop", ParentFunc, PostLoopBB);
+ PreLoopBB->getTerminator()->setSuccessor(0, LoopBB);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ // Cast the Src and Dst pointers to pointers to the loop operand type (if
+ // needed).
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
+ // Loop Body
+ Value *SrcGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
+ PartSrcAlign, SrcIsVolatile);
+ Value *DstGEP =
+ LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ // Create the loop branch condition.
+ Constant *LoopEndCI = ConstantInt::get(TypeOfCopyLen, LoopEndCount);
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, LoopEndCI),
+ LoopBB, PostLoopBB);
+ }
+
+ uint64_t BytesCopied = LoopEndCount * LoopOpSize;
+ uint64_t RemainingBytes = CopyLen->getZExtValue() - BytesCopied;
+ if (RemainingBytes) {
+ IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
+ : InsertBefore);
+
+ SmallVector<Type *, 5> RemainingOps;
+ TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
+ SrcAS, DstAS, SrcAlign.value(),
+ DstAlign.value());
+
+ for (auto OpTy : RemainingOps) {
+ Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+ Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
+
+      // Calculate the new index
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
+ uint64_t GepIndex = BytesCopied / OperandSize;
+ assert(GepIndex * OperandSize == BytesCopied &&
+ "Division should have no Remainder!");
+ // Cast source to operand type and load
+ PointerType *SrcPtrType = PointerType::get(OpTy, SrcAS);
+ Value *CastedSrc = SrcAddr->getType() == SrcPtrType
+ ? SrcAddr
+ : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
+ Value *SrcGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ Value *Load =
+ RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
+
+ // Cast destination to operand type and store.
+ PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
+ Value *CastedDst = DstAddr->getType() == DstPtrType
+ ? DstAddr
+ : RBuilder.CreateBitCast(DstAddr, DstPtrType);
+ Value *DstGEP = RBuilder.CreateInBoundsGEP(
+ OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
+ RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ BytesCopied += OperandSize;
+ }
+ }
+ assert(BytesCopied == CopyLen->getZExtValue() &&
+ "Bytes copied should match size in the call!");
+}
+
+void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
+ Value *SrcAddr, Value *DstAddr,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile,
+ const TargetTransformInfo &TTI) {
+ BasicBlock *PreLoopBB = InsertBefore->getParent();
+ BasicBlock *PostLoopBB =
+ PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
+
+ Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
+ LLVMContext &Ctx = PreLoopBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
+
+ IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
+
+ PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
+ PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
+ if (SrcAddr->getType() != SrcOpType) {
+ SrcAddr = PLBuilder.CreateBitCast(SrcAddr, SrcOpType);
+ }
+ if (DstAddr->getType() != DstOpType) {
+ DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
+ }
+
+ // Calculate the loop trip count, and remaining bytes to copy after the loop.
+ Type *CopyLenType = CopyLen->getType();
+ IntegerType *ILengthType = dyn_cast<IntegerType>(CopyLenType);
+ assert(ILengthType &&
+ "expected size argument to memcpy to be an integer type!");
+ Type *Int8Type = Type::getInt8Ty(Ctx);
+ bool LoopOpIsInt8 = LoopOpType == Int8Type;
+ ConstantInt *CILoopOpSize = ConstantInt::get(ILengthType, LoopOpSize);
+ Value *RuntimeLoopCount = LoopOpIsInt8 ?
+ CopyLen :
+ PLBuilder.CreateUDiv(CopyLen, CILoopOpSize);
+ BasicBlock *LoopBB =
+ BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
+ LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
+
+ Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
+ Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ if (!LoopOpIsInt8) {
+    // Compute the residual byte count and the bytes covered by the main loop.
+ Value *RuntimeResidual = PLBuilder.CreateURem(CopyLen, CILoopOpSize);
+ Value *RuntimeBytesCopied = PLBuilder.CreateSub(CopyLen, RuntimeResidual);
+
+ // Loop body for the residual copy.
+ BasicBlock *ResLoopBB = BasicBlock::Create(Ctx, "loop-memcpy-residual",
+ PreLoopBB->getParent(),
+ PostLoopBB);
+ // Residual loop header.
+ BasicBlock *ResHeaderBB = BasicBlock::Create(
+ Ctx, "loop-memcpy-residual-header", PreLoopBB->getParent(), nullptr);
+
+    // Need to update the pre-loop basic block to branch to the correct place:
+    // branch to the main loop if the count is non-zero, branch to the residual
+    // loop if the copy size is smaller than one iteration of the main loop but
+    // non-zero, and finally branch past the residual loop if the memcpy
+    // size is zero.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, ResHeaderBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ ResHeaderBB);
+
+ // Determine if we need to branch to the residual loop or bypass it.
+ IRBuilder<> RHBuilder(ResHeaderBB);
+ RHBuilder.CreateCondBr(RHBuilder.CreateICmpNE(RuntimeResidual, Zero),
+ ResLoopBB, PostLoopBB);
+
+ // Copy the residual with single byte load/store loop.
+ IRBuilder<> ResBuilder(ResLoopBB);
+ PHINode *ResidualIndex =
+ ResBuilder.CreatePHI(CopyLenType, 2, "residual-loop-index");
+ ResidualIndex->addIncoming(Zero, ResHeaderBB);
+
+ Value *SrcAsInt8 =
+ ResBuilder.CreateBitCast(SrcAddr, PointerType::get(Int8Type, SrcAS));
+ Value *DstAsInt8 =
+ ResBuilder.CreateBitCast(DstAddr, PointerType::get(Int8Type, DstAS));
+ Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
+ Value *SrcGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
+ Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
+ Value *DstGEP =
+ ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
+ ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
+
+ Value *ResNewIndex =
+ ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
+ ResidualIndex->addIncoming(ResNewIndex, ResLoopBB);
+
+ // Create the loop branch condition.
+ ResBuilder.CreateCondBr(
+ ResBuilder.CreateICmpULT(ResNewIndex, RuntimeResidual), ResLoopBB,
+ PostLoopBB);
+ } else {
+ // In this case the loop operand type was a byte, and there is no need for a
+ // residual loop to copy the remaining memory after the main loop.
+ // We do however need to patch up the control flow by creating the
+ // terminators for the preloop block and the memcpy loop.
+ ConstantInt *Zero = ConstantInt::get(ILengthType, 0U);
+ PLBuilder.CreateCondBr(PLBuilder.CreateICmpNE(RuntimeLoopCount, Zero),
+ LoopBB, PostLoopBB);
+ PreLoopBB->getTerminator()->eraseFromParent();
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpULT(NewIndex, RuntimeLoopCount), LoopBB,
+ PostLoopBB);
+ }
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+// unsigned char* d = dst;
+// const unsigned char* s = src;
+// if (s < d) {
+// // copy backwards
+// while (n--) {
+// d[n] = s[n];
+// }
+// } else {
+// // copy forward
+// for (size_t i = 0; i < n; ++i) {
+// d[i] = s[i];
+// }
+// }
+// return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+
+ Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
+
+  // Create a comparison of src and dst, based on which we jump to either
+ // the forward-copy part of the function (if src >= dst) or the backwards-copy
+ // part (if src < dst).
+ // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+ // structure. Its block terminators (unconditional branches) are replaced by
+ // the appropriate conditional branches when the loop is built.
+ ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+ SrcAddr, DstAddr, "compare_src_dst");
+ Instruction *ThenTerm, *ElseTerm;
+ SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+ &ElseTerm);
+
+ // Each part of the function consists of two blocks:
+ // copy_backwards: used to skip the loop when n == 0
+ // copy_backwards_loop: the actual backwards loop BB
+ // copy_forward: used to skip the loop when n == 0
+ // copy_forward_loop: the actual forward loop BB
+ BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+ CopyBackwardsBB->setName("copy_backwards");
+ BasicBlock *CopyForwardBB = ElseTerm->getParent();
+ CopyForwardBB->setName("copy_forward");
+ BasicBlock *ExitBB = InsertBefore->getParent();
+ ExitBB->setName("memmove_done");
+
+ unsigned PartSize = DL.getTypeStoreSize(EltTy);
+ Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
+ Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+
+ // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+ // between both backwards and forward copy clauses.
+ ICmpInst *CompareN =
+ new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+ ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+ // Copying backwards.
+ BasicBlock *LoopBB =
+ BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ Value *IndexPtr = LoopBuilder.CreateSub(
+ LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+ Value *Element = LoopBuilder.CreateAlignedLoad(
+ EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
+ PartSrcAlign, "element");
+ LoopBuilder.CreateAlignedStore(
+ Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
+ PartDstAlign);
+ LoopBuilder.CreateCondBr(
+ LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+ ExitBB, LoopBB);
+ LoopPhi->addIncoming(IndexPtr, LoopBB);
+ LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+ BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+ ThenTerm->eraseFromParent();
+
+ // Copying forward.
+ BasicBlock *FwdLoopBB =
+ BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+ IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+ PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+ Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
+ Value *FwdElement =
+ FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
+ Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
+ FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
+ Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+ FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+ FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+ ExitBB, FwdLoopBB);
+ FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+ FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+ BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+ ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
+ Value *CopyLen, Value *SetValue, Align DstAlign,
+ bool IsVolatile) {
+ Type *TypeOfCopyLen = CopyLen->getType();
+ BasicBlock *OrigBB = InsertBefore->getParent();
+ Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ BasicBlock *NewBB =
+ OrigBB->splitBasicBlock(InsertBefore, "split");
+ BasicBlock *LoopBB
+ = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+ IRBuilder<> Builder(OrigBB->getTerminator());
+
+ // Cast pointer to the type of value getting stored
+ unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+ DstAddr = Builder.CreateBitCast(DstAddr,
+ PointerType::get(SetValue->getType(), dstAS));
+
+ Builder.CreateCondBr(
+ Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
+ LoopBB);
+ OrigBB->getTerminator()->eraseFromParent();
+
+ unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
+ Align PartAlign(commonAlignment(DstAlign, PartSize));
+
+ IRBuilder<> LoopBuilder(LoopBB);
+ PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+ LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+ LoopBuilder.CreateAlignedStore(
+ SetValue,
+ LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+ PartAlign, IsVolatile);
+
+ Value *NewIndex =
+ LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+ LoopIndex->addIncoming(NewIndex, LoopBB);
+
+ LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+ NewBB);
+}
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
+ const TargetTransformInfo &TTI) {
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+ createMemCpyLoopKnownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ CI,
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
+ } else {
+ createMemCpyLoopUnknownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+        /* TargetTransformInfo */ TTI);
+ }
+}
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+ createMemMoveLoop(/* InsertBefore */ Memmove,
+ /* SrcAddr */ Memmove->getRawSource(),
+ /* DstAddr */ Memmove->getRawDest(),
+ /* CopyLen */ Memmove->getLength(),
+ /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memmove->isVolatile(),
+ /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+ createMemSetLoop(/* InsertBefore */ Memset,
+ /* DstAddr */ Memset->getRawDest(),
+ /* CopyLen */ Memset->getLength(),
+ /* SetValue */ Memset->getValue(),
+ /* Alignment */ Memset->getDestAlign().valueOrOne(),
+ Memset->isVolatile());
+}
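
expandMemCpyAsLoop, expandMemMoveAsLoop and expandMemSetAsLoop above are the entry points a target calls when it wants a memory intrinsic expanded into an explicit load/store loop. A hedged usage sketch for the memcpy case, not taken from this patch: it assumes a TargetTransformInfo for the function is already available (for example via FAM.getResult<TargetIRAnalysis>(F)), and the helper name expandMemCpys is an assumption:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"

// Hypothetical helper: expand every llvm.memcpy in F into an explicit loop.
static void expandMemCpys(llvm::Function &F,
                          const llvm::TargetTransformInfo &TTI) {
  llvm::SmallVector<llvm::MemCpyInst *, 8> Worklist;
  for (llvm::BasicBlock &BB : F)
    for (llvm::Instruction &I : BB)
      if (auto *MC = llvm::dyn_cast<llvm::MemCpyInst>(&I))
        Worklist.push_back(MC);        // collect first: the expansion splits blocks
  for (llvm::MemCpyInst *MC : Worklist) {
    llvm::expandMemCpyAsLoop(MC, TTI); // picks the known- or unknown-size lowering above
    MC->eraseFromParent();             // the helper leaves the original intrinsic in place
  }
}

Callers of these helpers generally follow the same collect-then-expand pattern, since the expansion splits the containing basic block and would invalidate a plain instruction iterator.
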
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
index 20ceb21bee..ec8d7a7074 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/LowerSwitch.cpp
@@ -1,87 +1,87 @@
-//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// The LowerSwitch transformation rewrites switch instructions with a sequence
-// of branches, which allows targets to get away with not implementing the
-// switch instruction until it is convenient.
-//
-//===----------------------------------------------------------------------===//
-
+//===- LowerSwitch.cpp - Eliminate Switch instructions --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// The LowerSwitch transformation rewrites switch instructions with a sequence
+// of branches, which allows targets to get away with not implementing the
+// switch instruction until it is convenient.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/LowerSwitch.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instructions.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <limits>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "lower-switch"
-
-namespace {
-
- struct IntRange {
- int64_t Low, High;
- };
-
-} // end anonymous namespace
-
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lower-switch"
+
namespace {
-// Return true iff R is covered by Ranges.
+
+ struct IntRange {
+ int64_t Low, High;
+ };
+
+} // end anonymous namespace
+
+namespace {
+// Return true iff R is covered by Ranges.
bool IsInRanges(const IntRange &R, const std::vector<IntRange> &Ranges) {
- // Note: Ranges must be sorted, non-overlapping and non-adjacent.
-
- // Find the first range whose High field is >= R.High,
- // then check if the Low field is <= R.Low. If so, we
- // have a Range that covers R.
- auto I = llvm::lower_bound(
- Ranges, R, [](IntRange A, IntRange B) { return A.High < B.High; });
- return I != Ranges.end() && I->Low <= R.Low;
-}
-
+ // Note: Ranges must be sorted, non-overlapping and non-adjacent.
+
+ // Find the first range whose High field is >= R.High,
+ // then check if the Low field is <= R.Low. If so, we
+ // have a Range that covers R.
+ auto I = llvm::lower_bound(
+ Ranges, R, [](IntRange A, IntRange B) { return A.High < B.High; });
+ return I != Ranges.end() && I->Low <= R.Low;
+}
+
struct CaseRange {
ConstantInt *Low;
ConstantInt *High;
BasicBlock *BB;
-
+
CaseRange(ConstantInt *low, ConstantInt *high, BasicBlock *bb)
: Low(low), High(high), BB(bb) {}
};
-
+
using CaseVector = std::vector<CaseRange>;
using CaseItr = std::vector<CaseRange>::iterator;
-
+
/// The comparison function for sorting the switch case values in the vector.
/// WARNING: Case ranges should be disjoint!
struct CaseCmp {
@@ -89,66 +89,66 @@ struct CaseCmp {
const ConstantInt *CI1 = cast<const ConstantInt>(C1.Low);
const ConstantInt *CI2 = cast<const ConstantInt>(C2.High);
return CI1->getValue().slt(CI2->getValue());
- }
+ }
};
-
-/// Used for debugging purposes.
-LLVM_ATTRIBUTE_USED
+
+/// Used for debugging purposes.
+LLVM_ATTRIBUTE_USED
raw_ostream &operator<<(raw_ostream &O, const CaseVector &C) {
- O << "[";
-
+ O << "[";
+
for (CaseVector::const_iterator B = C.begin(), E = C.end(); B != E;) {
- O << "[" << B->Low->getValue() << ", " << B->High->getValue() << "]";
- if (++B != E)
- O << ", ";
- }
-
- return O << "]";
-}
-
-/// Update the first occurrence of the "switch statement" BB in the PHI
-/// node with the "new" BB. The other occurrences will:
-///
-/// 1) Be updated by subsequent calls to this function. Switch statements may
-/// have more than one outgoing edge into the same BB if they all have the same
-/// value. When the switch statement is converted these incoming edges are now
-/// coming from multiple BBs.
-/// 2) Be removed if subsequent incoming values now share the same case, i.e.,
-/// multiple outgoing edges are condensed into one. This is necessary to keep the
-/// number of phi values equal to the number of branches to SuccBB.
+ O << "[" << B->Low->getValue() << ", " << B->High->getValue() << "]";
+ if (++B != E)
+ O << ", ";
+ }
+
+ return O << "]";
+}
+
+/// Update the first occurrence of the "switch statement" BB in the PHI
+/// node with the "new" BB. The other occurrences will:
+///
+/// 1) Be updated by subsequent calls to this function. Switch statements may
+/// have more than one outgoing edge into the same BB if they all have the same
+/// value. When the switch statement is converted these incoming edges are now
+/// coming from multiple BBs.
+/// 2) Be removed if subsequent incoming values now share the same case, i.e.,
+/// multiple outgoing edges are condensed into one. This is necessary to keep the
+/// number of phi values equal to the number of branches to SuccBB.
void FixPhis(
BasicBlock *SuccBB, BasicBlock *OrigBB, BasicBlock *NewBB,
const unsigned NumMergedCases = std::numeric_limits<unsigned>::max()) {
- for (BasicBlock::iterator I = SuccBB->begin(),
- IE = SuccBB->getFirstNonPHI()->getIterator();
- I != IE; ++I) {
- PHINode *PN = cast<PHINode>(I);
-
- // Only update the first occurrence.
- unsigned Idx = 0, E = PN->getNumIncomingValues();
- unsigned LocalNumMergedCases = NumMergedCases;
- for (; Idx != E; ++Idx) {
- if (PN->getIncomingBlock(Idx) == OrigBB) {
- PN->setIncomingBlock(Idx, NewBB);
- break;
- }
- }
-
- // Remove additional occurrences coming from condensed cases and keep the
- // number of incoming values equal to the number of branches to SuccBB.
- SmallVector<unsigned, 8> Indices;
- for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
- if (PN->getIncomingBlock(Idx) == OrigBB) {
- Indices.push_back(Idx);
- LocalNumMergedCases--;
- }
-    // Remove incoming values in reverse order so that earlier removals do not
-    // invalidate the *successive* indices.
- for (unsigned III : llvm::reverse(Indices))
- PN->removeIncomingValue(III);
- }
-}
-
+ for (BasicBlock::iterator I = SuccBB->begin(),
+ IE = SuccBB->getFirstNonPHI()->getIterator();
+ I != IE; ++I) {
+ PHINode *PN = cast<PHINode>(I);
+
+ // Only update the first occurrence.
+ unsigned Idx = 0, E = PN->getNumIncomingValues();
+ unsigned LocalNumMergedCases = NumMergedCases;
+ for (; Idx != E; ++Idx) {
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ PN->setIncomingBlock(Idx, NewBB);
+ break;
+ }
+ }
+
+ // Remove additional occurrences coming from condensed cases and keep the
+ // number of incoming values equal to the number of branches to SuccBB.
+ SmallVector<unsigned, 8> Indices;
+ for (++Idx; LocalNumMergedCases > 0 && Idx < E; ++Idx)
+ if (PN->getIncomingBlock(Idx) == OrigBB) {
+ Indices.push_back(Idx);
+ LocalNumMergedCases--;
+ }
+    // Remove incoming values in reverse order so that earlier removals do not
+    // invalidate the *successive* indices.
+ for (unsigned III : llvm::reverse(Indices))
+ PN->removeIncomingValue(III);
+ }
+}
+
/// Create a new leaf block for the binary lookup tree. It checks if the
/// switch's value == the case's value. If not, then it jumps to the default
/// branch. At this point in the tree, the value can't be another valid case
@@ -213,312 +213,312 @@ BasicBlock *NewLeafBlock(CaseRange &Leaf, Value *Val, ConstantInt *LowerBound,
return NewLeaf;
}
-/// Convert the switch statement into a binary lookup of the case values.
-/// The function recursively builds this tree. LowerBound and UpperBound are
-/// used to keep track of the bounds for Val that have already been checked by
-/// a block emitted by one of the previous calls to switchConvert in the call
-/// stack.
+/// Convert the switch statement into a binary lookup of the case values.
+/// The function recursively builds this tree. LowerBound and UpperBound are
+/// used to keep track of the bounds for Val that have already been checked by
+/// a block emitted by one of the previous calls to switchConvert in the call
+/// stack.
BasicBlock *SwitchConvert(CaseItr Begin, CaseItr End, ConstantInt *LowerBound,
ConstantInt *UpperBound, Value *Val,
BasicBlock *Predecessor, BasicBlock *OrigBlock,
BasicBlock *Default,
const std::vector<IntRange> &UnreachableRanges) {
- assert(LowerBound && UpperBound && "Bounds must be initialized");
- unsigned Size = End - Begin;
-
- if (Size == 1) {
- // Check if the Case Range is perfectly squeezed in between
- // already checked Upper and Lower bounds. If it is then we can avoid
- // emitting the code that checks if the value actually falls in the range
- // because the bounds already tell us so.
- if (Begin->Low == LowerBound && Begin->High == UpperBound) {
- unsigned NumMergedCases = 0;
- NumMergedCases = UpperBound->getSExtValue() - LowerBound->getSExtValue();
+ assert(LowerBound && UpperBound && "Bounds must be initialized");
+ unsigned Size = End - Begin;
+
+ if (Size == 1) {
+ // Check if the Case Range is perfectly squeezed in between
+ // already checked Upper and Lower bounds. If it is then we can avoid
+ // emitting the code that checks if the value actually falls in the range
+ // because the bounds already tell us so.
+ if (Begin->Low == LowerBound && Begin->High == UpperBound) {
+ unsigned NumMergedCases = 0;
+ NumMergedCases = UpperBound->getSExtValue() - LowerBound->getSExtValue();
FixPhis(Begin->BB, OrigBlock, Predecessor, NumMergedCases);
- return Begin->BB;
- }
+ return Begin->BB;
+ }
return NewLeafBlock(*Begin, Val, LowerBound, UpperBound, OrigBlock,
- Default);
- }
-
- unsigned Mid = Size / 2;
- std::vector<CaseRange> LHS(Begin, Begin + Mid);
- LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
- std::vector<CaseRange> RHS(Begin + Mid, End);
- LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
-
- CaseRange &Pivot = *(Begin + Mid);
- LLVM_DEBUG(dbgs() << "Pivot ==> [" << Pivot.Low->getValue() << ", "
- << Pivot.High->getValue() << "]\n");
-
- // NewLowerBound here should never be the integer minimal value.
- // This is because it is computed from a case range that is never
- // the smallest, so there is always a case range that has at least
- // a smaller value.
- ConstantInt *NewLowerBound = Pivot.Low;
-
- // Because NewLowerBound is never the smallest representable integer
- // it is safe here to subtract one.
- ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
- NewLowerBound->getValue() - 1);
-
- if (!UnreachableRanges.empty()) {
- // Check if the gap between LHS's highest and NewLowerBound is unreachable.
- int64_t GapLow = LHS.back().High->getSExtValue() + 1;
- int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
- IntRange Gap = { GapLow, GapHigh };
- if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
- NewUpperBound = LHS.back().High;
- }
-
- LLVM_DEBUG(dbgs() << "LHS Bounds ==> [" << LowerBound->getSExtValue() << ", "
- << NewUpperBound->getSExtValue() << "]\n"
- << "RHS Bounds ==> [" << NewLowerBound->getSExtValue()
- << ", " << UpperBound->getSExtValue() << "]\n");
-
- // Create a new node that checks if the value is < pivot. Go to the
- // left branch if it is and right branch if not.
- Function* F = OrigBlock->getParent();
- BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
-
- ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
- Val, Pivot.Low, "Pivot");
-
+ Default);
+ }
+
+ unsigned Mid = Size / 2;
+ std::vector<CaseRange> LHS(Begin, Begin + Mid);
+ LLVM_DEBUG(dbgs() << "LHS: " << LHS << "\n");
+ std::vector<CaseRange> RHS(Begin + Mid, End);
+ LLVM_DEBUG(dbgs() << "RHS: " << RHS << "\n");
+
+ CaseRange &Pivot = *(Begin + Mid);
+ LLVM_DEBUG(dbgs() << "Pivot ==> [" << Pivot.Low->getValue() << ", "
+ << Pivot.High->getValue() << "]\n");
+
+ // NewLowerBound here should never be the integer minimal value.
+ // This is because it is computed from a case range that is never
+ // the smallest, so there is always a case range that has at least
+ // a smaller value.
+ ConstantInt *NewLowerBound = Pivot.Low;
+
+ // Because NewLowerBound is never the smallest representable integer
+ // it is safe here to subtract one.
+ ConstantInt *NewUpperBound = ConstantInt::get(NewLowerBound->getContext(),
+ NewLowerBound->getValue() - 1);
+
+ if (!UnreachableRanges.empty()) {
+ // Check if the gap between LHS's highest and NewLowerBound is unreachable.
+ int64_t GapLow = LHS.back().High->getSExtValue() + 1;
+ int64_t GapHigh = NewLowerBound->getSExtValue() - 1;
+ IntRange Gap = { GapLow, GapHigh };
+ if (GapHigh >= GapLow && IsInRanges(Gap, UnreachableRanges))
+ NewUpperBound = LHS.back().High;
+ }
+
+ LLVM_DEBUG(dbgs() << "LHS Bounds ==> [" << LowerBound->getSExtValue() << ", "
+ << NewUpperBound->getSExtValue() << "]\n"
+ << "RHS Bounds ==> [" << NewLowerBound->getSExtValue()
+ << ", " << UpperBound->getSExtValue() << "]\n");
+
+ // Create a new node that checks if the value is < pivot. Go to the
+ // left branch if it is and right branch if not.
+ Function* F = OrigBlock->getParent();
+ BasicBlock* NewNode = BasicBlock::Create(Val->getContext(), "NodeBlock");
+
+ ICmpInst* Comp = new ICmpInst(ICmpInst::ICMP_SLT,
+ Val, Pivot.Low, "Pivot");
+
BasicBlock *LBranch =
SwitchConvert(LHS.begin(), LHS.end(), LowerBound, NewUpperBound, Val,
NewNode, OrigBlock, Default, UnreachableRanges);
BasicBlock *RBranch =
SwitchConvert(RHS.begin(), RHS.end(), NewLowerBound, UpperBound, Val,
NewNode, OrigBlock, Default, UnreachableRanges);
-
- F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);
- NewNode->getInstList().push_back(Comp);
-
- BranchInst::Create(LBranch, RBranch, Comp, NewNode);
- return NewNode;
-}
-
-/// Transform the simple list of \p SI's cases into a list of CaseRanges, \p Cases.
-/// \post \p Cases will not contain references to \p SI's default BB.
-/// \returns Number of \p SI's cases that do not reference \p SI's default BB.
+
+ F->getBasicBlockList().insert(++OrigBlock->getIterator(), NewNode);
+ NewNode->getInstList().push_back(Comp);
+
+ BranchInst::Create(LBranch, RBranch, Comp, NewNode);
+ return NewNode;
+}
+
+/// Transform the simple list of \p SI's cases into a list of CaseRanges, \p Cases.
+/// \post \p Cases will not contain references to \p SI's default BB.
+/// \returns Number of \p SI's cases that do not reference \p SI's default BB.
unsigned Clusterify(CaseVector &Cases, SwitchInst *SI) {
- unsigned NumSimpleCases = 0;
-
- // Start with "simple" cases
- for (auto Case : SI->cases()) {
- if (Case.getCaseSuccessor() == SI->getDefaultDest())
- continue;
- Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
- Case.getCaseSuccessor()));
- ++NumSimpleCases;
- }
-
- llvm::sort(Cases, CaseCmp());
-
-  // Merge cases into clusters
- if (Cases.size() >= 2) {
- CaseItr I = Cases.begin();
- for (CaseItr J = std::next(I), E = Cases.end(); J != E; ++J) {
- int64_t nextValue = J->Low->getSExtValue();
- int64_t currentValue = I->High->getSExtValue();
- BasicBlock* nextBB = J->BB;
- BasicBlock* currentBB = I->BB;
-
- // If the two neighboring cases go to the same destination, merge them
- // into a single case.
- assert(nextValue > currentValue && "Cases should be strictly ascending");
- if ((nextValue == currentValue + 1) && (currentBB == nextBB)) {
- I->High = J->High;
- // FIXME: Combine branch weights.
- } else if (++I != J) {
- *I = *J;
- }
- }
- Cases.erase(std::next(I), Cases.end());
- }
-
- return NumSimpleCases;
-}
-
-/// Replace the specified switch instruction with a sequence of chained if-then
-/// insts in a balanced binary search.
+ unsigned NumSimpleCases = 0;
+
+ // Start with "simple" cases
+ for (auto Case : SI->cases()) {
+ if (Case.getCaseSuccessor() == SI->getDefaultDest())
+ continue;
+ Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+ ++NumSimpleCases;
+ }
+
+ llvm::sort(Cases, CaseCmp());
+
+  // Merge cases into clusters
+ if (Cases.size() >= 2) {
+ CaseItr I = Cases.begin();
+ for (CaseItr J = std::next(I), E = Cases.end(); J != E; ++J) {
+ int64_t nextValue = J->Low->getSExtValue();
+ int64_t currentValue = I->High->getSExtValue();
+ BasicBlock* nextBB = J->BB;
+ BasicBlock* currentBB = I->BB;
+
+ // If the two neighboring cases go to the same destination, merge them
+ // into a single case.
+ assert(nextValue > currentValue && "Cases should be strictly ascending");
+ if ((nextValue == currentValue + 1) && (currentBB == nextBB)) {
+ I->High = J->High;
+ // FIXME: Combine branch weights.
+ } else if (++I != J) {
+ *I = *J;
+ }
+ }
+ Cases.erase(std::next(I), Cases.end());
+ }
+
+ return NumSimpleCases;
+}
+
+/// Replace the specified switch instruction with a sequence of chained if-then
+/// insts in a balanced binary search.
void ProcessSwitchInst(SwitchInst *SI,
SmallPtrSetImpl<BasicBlock *> &DeleteList,
AssumptionCache *AC, LazyValueInfo *LVI) {
- BasicBlock *OrigBlock = SI->getParent();
- Function *F = OrigBlock->getParent();
- Value *Val = SI->getCondition(); // The value we are switching on...
- BasicBlock* Default = SI->getDefaultDest();
-
- // Don't handle unreachable blocks. If there are successors with phis, this
- // would leave them behind with missing predecessors.
- if ((OrigBlock != &F->getEntryBlock() && pred_empty(OrigBlock)) ||
- OrigBlock->getSinglePredecessor() == OrigBlock) {
- DeleteList.insert(OrigBlock);
- return;
- }
-
- // Prepare cases vector.
- CaseVector Cases;
- const unsigned NumSimpleCases = Clusterify(Cases, SI);
- LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
- << ". Total non-default cases: " << NumSimpleCases
- << "\nCase clusters: " << Cases << "\n");
-
- // If there is only the default destination, just branch.
- if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
- // Remove all the references from Default's PHIs to OrigBlock, but one.
+ BasicBlock *OrigBlock = SI->getParent();
+ Function *F = OrigBlock->getParent();
+ Value *Val = SI->getCondition(); // The value we are switching on...
+ BasicBlock* Default = SI->getDefaultDest();
+
+ // Don't handle unreachable blocks. If there are successors with phis, this
+ // would leave them behind with missing predecessors.
+ if ((OrigBlock != &F->getEntryBlock() && pred_empty(OrigBlock)) ||
+ OrigBlock->getSinglePredecessor() == OrigBlock) {
+ DeleteList.insert(OrigBlock);
+ return;
+ }
+
+ // Prepare cases vector.
+ CaseVector Cases;
+ const unsigned NumSimpleCases = Clusterify(Cases, SI);
+ LLVM_DEBUG(dbgs() << "Clusterify finished. Total clusters: " << Cases.size()
+ << ". Total non-default cases: " << NumSimpleCases
+ << "\nCase clusters: " << Cases << "\n");
+
+ // If there is only the default destination, just branch.
+ if (Cases.empty()) {
+ BranchInst::Create(Default, OrigBlock);
+ // Remove all the references from Default's PHIs to OrigBlock, but one.
FixPhis(Default, OrigBlock, OrigBlock);
- SI->eraseFromParent();
- return;
- }
-
- ConstantInt *LowerBound = nullptr;
- ConstantInt *UpperBound = nullptr;
- bool DefaultIsUnreachableFromSwitch = false;
-
- if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
- // Make the bounds tightly fitted around the case value range, because we
- // know that the value passed to the switch must be exactly one of the case
- // values.
- LowerBound = Cases.front().Low;
- UpperBound = Cases.back().High;
- DefaultIsUnreachableFromSwitch = true;
- } else {
-    // Constraining the range of the value being switched over helps eliminate
-    // unreachable BBs and minimize the number of `add` instructions
- // newLeafBlock ends up emitting. Running CorrelatedValuePropagation after
- // LowerSwitch isn't as good, and also much more expensive in terms of
- // compile time for the following reasons:
- // 1. it processes many kinds of instructions, not just switches;
- // 2. even if limited to icmp instructions only, it will have to process
- // roughly C icmp's per switch, where C is the number of cases in the
- // switch, while LowerSwitch only needs to call LVI once per switch.
- const DataLayout &DL = F->getParent()->getDataLayout();
- KnownBits Known = computeKnownBits(Val, DL, /*Depth=*/0, AC, SI);
- // TODO Shouldn't this create a signed range?
- ConstantRange KnownBitsRange =
- ConstantRange::fromKnownBits(Known, /*IsSigned=*/false);
+ SI->eraseFromParent();
+ return;
+ }
+
+ ConstantInt *LowerBound = nullptr;
+ ConstantInt *UpperBound = nullptr;
+ bool DefaultIsUnreachableFromSwitch = false;
+
+ if (isa<UnreachableInst>(Default->getFirstNonPHIOrDbg())) {
+ // Make the bounds tightly fitted around the case value range, because we
+ // know that the value passed to the switch must be exactly one of the case
+ // values.
+ LowerBound = Cases.front().Low;
+ UpperBound = Cases.back().High;
+ DefaultIsUnreachableFromSwitch = true;
+ } else {
+    // Constraining the range of the value being switched over helps eliminate
+    // unreachable BBs and minimize the number of `add` instructions
+ // newLeafBlock ends up emitting. Running CorrelatedValuePropagation after
+ // LowerSwitch isn't as good, and also much more expensive in terms of
+ // compile time for the following reasons:
+ // 1. it processes many kinds of instructions, not just switches;
+ // 2. even if limited to icmp instructions only, it will have to process
+ // roughly C icmp's per switch, where C is the number of cases in the
+ // switch, while LowerSwitch only needs to call LVI once per switch.
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ KnownBits Known = computeKnownBits(Val, DL, /*Depth=*/0, AC, SI);
+ // TODO Shouldn't this create a signed range?
+ ConstantRange KnownBitsRange =
+ ConstantRange::fromKnownBits(Known, /*IsSigned=*/false);
const ConstantRange LVIRange = LVI->getConstantRange(Val, SI);
- ConstantRange ValRange = KnownBitsRange.intersectWith(LVIRange);
- // We delegate removal of unreachable non-default cases to other passes. In
- // the unlikely event that some of them survived, we just conservatively
- // maintain the invariant that all the cases lie between the bounds. This
- // may, however, still render the default case effectively unreachable.
- APInt Low = Cases.front().Low->getValue();
- APInt High = Cases.back().High->getValue();
- APInt Min = APIntOps::smin(ValRange.getSignedMin(), Low);
- APInt Max = APIntOps::smax(ValRange.getSignedMax(), High);
-
- LowerBound = ConstantInt::get(SI->getContext(), Min);
- UpperBound = ConstantInt::get(SI->getContext(), Max);
- DefaultIsUnreachableFromSwitch = (Min + (NumSimpleCases - 1) == Max);
- }
-
- std::vector<IntRange> UnreachableRanges;
-
- if (DefaultIsUnreachableFromSwitch) {
- DenseMap<BasicBlock *, unsigned> Popularity;
- unsigned MaxPop = 0;
- BasicBlock *PopSucc = nullptr;
-
- IntRange R = {std::numeric_limits<int64_t>::min(),
- std::numeric_limits<int64_t>::max()};
- UnreachableRanges.push_back(R);
- for (const auto &I : Cases) {
- int64_t Low = I.Low->getSExtValue();
- int64_t High = I.High->getSExtValue();
-
- IntRange &LastRange = UnreachableRanges.back();
- if (LastRange.Low == Low) {
- // There is nothing left of the previous range.
- UnreachableRanges.pop_back();
- } else {
- // Terminate the previous range.
- assert(Low > LastRange.Low);
- LastRange.High = Low - 1;
- }
- if (High != std::numeric_limits<int64_t>::max()) {
- IntRange R = { High + 1, std::numeric_limits<int64_t>::max() };
- UnreachableRanges.push_back(R);
- }
-
- // Count popularity.
- int64_t N = High - Low + 1;
- unsigned &Pop = Popularity[I.BB];
- if ((Pop += N) > MaxPop) {
- MaxPop = Pop;
- PopSucc = I.BB;
- }
- }
-#ifndef NDEBUG
- /* UnreachableRanges should be sorted and the ranges non-adjacent. */
- for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
- I != E; ++I) {
- assert(I->Low <= I->High);
- auto Next = I + 1;
- if (Next != E) {
- assert(Next->Low > I->High);
- }
- }
-#endif
-
- // As the default block in the switch is unreachable, update the PHI nodes
- // (remove all of the references to the default block) to reflect this.
- const unsigned NumDefaultEdges = SI->getNumCases() + 1 - NumSimpleCases;
- for (unsigned I = 0; I < NumDefaultEdges; ++I)
- Default->removePredecessor(OrigBlock);
-
- // Use the most popular block as the new default, reducing the number of
- // cases.
- assert(MaxPop > 0 && PopSucc);
- Default = PopSucc;
+ ConstantRange ValRange = KnownBitsRange.intersectWith(LVIRange);
+ // We delegate removal of unreachable non-default cases to other passes. In
+ // the unlikely event that some of them survived, we just conservatively
+ // maintain the invariant that all the cases lie between the bounds. This
+ // may, however, still render the default case effectively unreachable.
+ APInt Low = Cases.front().Low->getValue();
+ APInt High = Cases.back().High->getValue();
+ APInt Min = APIntOps::smin(ValRange.getSignedMin(), Low);
+ APInt Max = APIntOps::smax(ValRange.getSignedMax(), High);
+
+ LowerBound = ConstantInt::get(SI->getContext(), Min);
+ UpperBound = ConstantInt::get(SI->getContext(), Max);
+ DefaultIsUnreachableFromSwitch = (Min + (NumSimpleCases - 1) == Max);
+ }
+
+ std::vector<IntRange> UnreachableRanges;
+
+ if (DefaultIsUnreachableFromSwitch) {
+ DenseMap<BasicBlock *, unsigned> Popularity;
+ unsigned MaxPop = 0;
+ BasicBlock *PopSucc = nullptr;
+
+ IntRange R = {std::numeric_limits<int64_t>::min(),
+ std::numeric_limits<int64_t>::max()};
+ UnreachableRanges.push_back(R);
+ for (const auto &I : Cases) {
+ int64_t Low = I.Low->getSExtValue();
+ int64_t High = I.High->getSExtValue();
+
+ IntRange &LastRange = UnreachableRanges.back();
+ if (LastRange.Low == Low) {
+ // There is nothing left of the previous range.
+ UnreachableRanges.pop_back();
+ } else {
+ // Terminate the previous range.
+ assert(Low > LastRange.Low);
+ LastRange.High = Low - 1;
+ }
+ if (High != std::numeric_limits<int64_t>::max()) {
+ IntRange R = { High + 1, std::numeric_limits<int64_t>::max() };
+ UnreachableRanges.push_back(R);
+ }
+
+ // Count popularity.
+ int64_t N = High - Low + 1;
+ unsigned &Pop = Popularity[I.BB];
+ if ((Pop += N) > MaxPop) {
+ MaxPop = Pop;
+ PopSucc = I.BB;
+ }
+ }
+#ifndef NDEBUG
+ /* UnreachableRanges should be sorted and the ranges non-adjacent. */
+ for (auto I = UnreachableRanges.begin(), E = UnreachableRanges.end();
+ I != E; ++I) {
+ assert(I->Low <= I->High);
+ auto Next = I + 1;
+ if (Next != E) {
+ assert(Next->Low > I->High);
+ }
+ }
+#endif
+
+ // As the default block in the switch is unreachable, update the PHI nodes
+ // (remove all of the references to the default block) to reflect this.
+ const unsigned NumDefaultEdges = SI->getNumCases() + 1 - NumSimpleCases;
+ for (unsigned I = 0; I < NumDefaultEdges; ++I)
+ Default->removePredecessor(OrigBlock);
+
+ // Use the most popular block as the new default, reducing the number of
+ // cases.
+ assert(MaxPop > 0 && PopSucc);
+ Default = PopSucc;
llvm::erase_if(Cases,
[PopSucc](const CaseRange &R) { return R.BB == PopSucc; });
-
- // If there are no cases left, just branch.
- if (Cases.empty()) {
- BranchInst::Create(Default, OrigBlock);
- SI->eraseFromParent();
- // As all the cases have been replaced with a single branch, only keep
- // one entry in the PHI nodes.
- for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I)
- PopSucc->removePredecessor(OrigBlock);
- return;
- }
-
- // If the condition was a PHI node with the switch block as a predecessor
- // removing predecessors may have caused the condition to be erased.
- // Getting the condition value again here protects against that.
- Val = SI->getCondition();
- }
-
- // Create a new, empty default block so that the new hierarchy of
-  // if-then statements goes to this and the PHI nodes are happy.
- BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
- F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
- BranchInst::Create(Default, NewDefault);
-
- BasicBlock *SwitchBlock =
+
+ // If there are no cases left, just branch.
+ if (Cases.empty()) {
+ BranchInst::Create(Default, OrigBlock);
+ SI->eraseFromParent();
+ // As all the cases have been replaced with a single branch, only keep
+ // one entry in the PHI nodes.
+ for (unsigned I = 0 ; I < (MaxPop - 1) ; ++I)
+ PopSucc->removePredecessor(OrigBlock);
+ return;
+ }
+
+ // If the condition was a PHI node with the switch block as a predecessor
+ // removing predecessors may have caused the condition to be erased.
+ // Getting the condition value again here protects against that.
+ Val = SI->getCondition();
+ }
+
+ // Create a new, empty default block so that the new hierarchy of
+  // if-then statements goes to this and the PHI nodes are happy.
+ BasicBlock *NewDefault = BasicBlock::Create(SI->getContext(), "NewDefault");
+ F->getBasicBlockList().insert(Default->getIterator(), NewDefault);
+ BranchInst::Create(Default, NewDefault);
+
+ BasicBlock *SwitchBlock =
SwitchConvert(Cases.begin(), Cases.end(), LowerBound, UpperBound, Val,
- OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
-
- // If there are entries in any PHI nodes for the default edge, make sure
- // to update them as well.
+ OrigBlock, OrigBlock, NewDefault, UnreachableRanges);
+
+ // If there are entries in any PHI nodes for the default edge, make sure
+ // to update them as well.
FixPhis(Default, OrigBlock, NewDefault);
-
- // Branch to our shiny new if-then stuff...
- BranchInst::Create(SwitchBlock, OrigBlock);
-
- // We are now done with the switch instruction, delete it.
- BasicBlock *OldDefault = SI->getDefaultDest();
- OrigBlock->getInstList().erase(SI);
-
- // If the Default block has no more predecessors just add it to DeleteList.
+
+ // Branch to our shiny new if-then stuff...
+ BranchInst::Create(SwitchBlock, OrigBlock);
+
+ // We are now done with the switch instruction, delete it.
+ BasicBlock *OldDefault = SI->getDefaultDest();
+ OrigBlock->getInstList().erase(SI);
+
+ // If the Default block has no more predecessors just add it to DeleteList.
if (pred_empty(OldDefault))
- DeleteList.insert(OldDefault);
-}
+ DeleteList.insert(OldDefault);
+}
bool LowerSwitch(Function &F, LazyValueInfo *LVI, AssumptionCache *AC) {
bool Changed = false;
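
The lowering above works in two stages: Clusterify sorts the non-default cases and folds adjacent values with the same destination into ranges, and SwitchConvert then recursively splits the ranges around a pivot, emitting one signed "less than" comparison per tree node. The standalone C++ sketch below (illustrative types and names only, no LLVM APIs) models that clustering and pivot recursion on plain integers; running it prints the balanced comparison tree the six example cases would lower to.

// Standalone sketch, not LLVM code: models how LowerSwitch clusters sorted
// switch cases and splits them around a pivot to build a balanced if-then
// tree. "CaseRange", "clusterify" and "lowerCases" are illustrative names.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

struct CaseRange {
  int64_t Low, High; // inclusive case-value range
  int Dest;          // stand-in for the successor basic block
};

// Merge neighbouring cases that are contiguous and share a destination,
// mirroring the Clusterify step above.
static std::vector<CaseRange> clusterify(std::vector<CaseRange> Cases) {
  std::sort(Cases.begin(), Cases.end(),
            [](const CaseRange &A, const CaseRange &B) { return A.Low < B.Low; });
  std::vector<CaseRange> Out;
  for (const CaseRange &C : Cases) {
    if (!Out.empty() && Out.back().High + 1 == C.Low && Out.back().Dest == C.Dest)
      Out.back().High = C.High; // extend the previous cluster
    else
      Out.push_back(C);
  }
  return Out;
}

// Print the comparison tree: pick the middle cluster as the pivot, test
// "value < pivot.Low", and recurse on each half, as SwitchConvert does.
static void lowerCases(const std::vector<CaseRange> &Cases, int Depth = 0) {
  std::string Indent(Depth * 2, ' ');
  if (Cases.size() == 1) {
    std::printf("%sleaf: [%lld, %lld] -> dest %d\n", Indent.c_str(),
                (long long)Cases[0].Low, (long long)Cases[0].High, Cases[0].Dest);
    return;
  }
  size_t Mid = Cases.size() / 2;
  std::printf("%sif (value < %lld)\n", Indent.c_str(), (long long)Cases[Mid].Low);
  lowerCases({Cases.begin(), Cases.begin() + Mid}, Depth + 1);
  std::printf("%selse\n", Indent.c_str());
  lowerCases({Cases.begin() + Mid, Cases.end()}, Depth + 1);
}

int main() {
  std::vector<CaseRange> Cases = {{1, 1, 10}, {2, 2, 10}, {3, 3, 11},
                                  {7, 7, 12}, {8, 8, 12}, {20, 20, 13}};
  lowerCases(clusterify(Cases));
}
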
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
index 6fd5672f08..5ad7aeb463 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Mem2Reg.cpp
@@ -1,116 +1,116 @@
-//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass is a simple pass wrapper around the PromoteMemToReg function call
-// exposed by the Utils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/PassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mem2reg"
-
-STATISTIC(NumPromoted, "Number of alloca's promoted");
-
-static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
- AssumptionCache &AC) {
- std::vector<AllocaInst *> Allocas;
- BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
- bool Changed = false;
-
- while (true) {
- Allocas.clear();
-
- // Find allocas that are safe to promote, by looking at all instructions in
- // the entry node
- for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
- if (isAllocaPromotable(AI))
- Allocas.push_back(AI);
-
- if (Allocas.empty())
- break;
-
- PromoteMemToReg(Allocas, DT, &AC);
- NumPromoted += Allocas.size();
- Changed = true;
- }
- return Changed;
-}
-
-PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- if (!promoteMemoryToRegister(F, DT, AC))
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
-
-namespace {
-
-struct PromoteLegacyPass : public FunctionPass {
- // Pass identification, replacement for typeid
- static char ID;
-
- PromoteLegacyPass() : FunctionPass(ID) {
- initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- // runOnFunction - To run this pass, first we calculate the alloca
- // instructions that are safe for promotion, then we promote each one.
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AssumptionCache &AC =
- getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- return promoteMemoryToRegister(F, DT, AC);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-char PromoteLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
- "Register",
- false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
- false, false)
-
-// createPromoteMemoryToRegister - Provide an entry point to create this pass.
-FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
- return new PromoteLegacyPass();
-}
+//===- Mem2Reg.cpp - The -mem2reg pass, a wrapper around the Utils lib ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is a simple pass wrapper around the PromoteMemToReg function call
+// exposed by the Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/Mem2Reg.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumPromoted, "Number of alloca's promoted");
+
+static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
+ AssumptionCache &AC) {
+ std::vector<AllocaInst *> Allocas;
+ BasicBlock &BB = F.getEntryBlock(); // Get the entry node for the function
+ bool Changed = false;
+
+ while (true) {
+ Allocas.clear();
+
+ // Find allocas that are safe to promote, by looking at all instructions in
+ // the entry node
+ for (BasicBlock::iterator I = BB.begin(), E = --BB.end(); I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) // Is it an alloca?
+ if (isAllocaPromotable(AI))
+ Allocas.push_back(AI);
+
+ if (Allocas.empty())
+ break;
+
+ PromoteMemToReg(Allocas, DT, &AC);
+ NumPromoted += Allocas.size();
+ Changed = true;
+ }
+ return Changed;
+}
+
+PreservedAnalyses PromotePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ if (!promoteMemoryToRegister(F, DT, AC))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
+
+namespace {
+
+struct PromoteLegacyPass : public FunctionPass {
+ // Pass identification, replacement for typeid
+ static char ID;
+
+ PromoteLegacyPass() : FunctionPass(ID) {
+ initializePromoteLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ // runOnFunction - To run this pass, first we calculate the alloca
+ // instructions that are safe for promotion, then we promote each one.
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ return promoteMemoryToRegister(F, DT, AC);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char PromoteLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(PromoteLegacyPass, "mem2reg", "Promote Memory to "
+ "Register",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(PromoteLegacyPass, "mem2reg", "Promote Memory to Register",
+ false, false)
+
+// createPromoteMemoryToRegister - Provide an entry point to create this pass.
+FunctionPass *llvm::createPromoteMemoryToRegisterPass() {
+ return new PromoteLegacyPass();
+}
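
promoteMemoryToRegister above is a fixed-point loop: scan the entry block for allocas that isAllocaPromotable accepts, hand the batch to PromoteMemToReg, and repeat until a scan finds nothing left. A minimal standalone sketch of that loop shape follows; Slot and collectPromotable are made-up stand-ins, not LLVM APIs.

// Standalone sketch of the fixed-point promotion loop: collect candidates,
// "promote" them, and stop once a pass over the slots finds no work.
#include <cstdio>
#include <vector>

struct Slot {
  int Id;
  bool Promotable; // stand-in for isAllocaPromotable(AI)
  bool Promoted;   // stand-in for the alloca having been rewritten to SSA form
};

// Stand-in for the scan over the entry block.
static std::vector<Slot *> collectPromotable(std::vector<Slot> &Slots) {
  std::vector<Slot *> Out;
  for (Slot &S : Slots)
    if (S.Promotable && !S.Promoted)
      Out.push_back(&S);
  return Out;
}

int main() {
  std::vector<Slot> Slots = {{0, true, false}, {1, false, false}, {2, true, false}};
  unsigned NumPromoted = 0; // mirrors the NumPromoted statistic
  bool Changed = false;

  while (true) {
    std::vector<Slot *> Work = collectPromotable(Slots);
    if (Work.empty())
      break; // fixed point: nothing promotable is left
    for (Slot *S : Work)
      S->Promoted = true; // stand-in for PromoteMemToReg(Work, DT, &AC)
    NumPromoted += (unsigned)Work.size();
    Changed = true;
  }
  std::printf("promoted %u slot(s), changed=%d\n", NumPromoted, (int)Changed);
}
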
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
index 477d0588f6..e350320e75 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/MetaRenamer.cpp
@@ -1,85 +1,85 @@
-//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass renames everything with metasyntactic names. The intent is to use
-// this pass after bugpoint reduction to conceal the nature of the original
-// program.
-//
-//===----------------------------------------------------------------------===//
-
+//===- MetaRenamer.cpp - Rename everything with metasyntactic names -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntactic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/MetaRenamer.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/TypeFinder.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-
-using namespace llvm;
-
-static const char *const metaNames[] = {
- // See http://en.wikipedia.org/wiki/Metasyntactic_variable
- "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
- "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
-};
-
-namespace {
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
+
+static const char *const metaNames[] = {
+ // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+ "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+ "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+};
+
+namespace {
// This PRNG is from the ISO C spec. It is intentionally simple and
// unsuitable for cryptographic use. We're just looking for enough
// variety to surprise and delight users.
struct PRNG {
unsigned long next;
-
+
void srand(unsigned int seed) { next = seed; }
-
+
int rand() {
next = next * 1103515245 + 12345;
return (unsigned int)(next / 65536) % 32768;
}
};
-
+
struct Renamer {
Renamer(unsigned int seed) { prng.srand(seed); }
-
+
const char *newName() {
return metaNames[prng.rand() % array_lengthof(metaNames)];
}
-
+
PRNG prng;
};
-
+
void MetaRename(Function &F) {
for (auto AI = F.arg_begin(), AE = F.arg_end(); AI != AE; ++AI)
if (!AI->getType()->isVoidTy())
AI->setName("arg");
-
+
for (auto &BB : F) {
BB.setName("bb");
-
+
for (auto &I : BB)
if (!I.getType()->isVoidTy())
I.setName("tmp");
}
}
-
+
void MetaRename(Module &M,
function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
// Seed our PRNG with simple additive sum of ModuleID. We're looking to
@@ -88,39 +88,39 @@ void MetaRename(Module &M,
unsigned int randSeed = 0;
for (auto C : M.getModuleIdentifier())
randSeed += C;
-
+
Renamer renamer(randSeed);
-
+
// Rename all aliases
for (auto AI = M.alias_begin(), AE = M.alias_end(); AI != AE; ++AI) {
StringRef Name = AI->getName();
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
continue;
-
+
AI->setName("alias");
}
-
+
// Rename all global variables
for (auto GI = M.global_begin(), GE = M.global_end(); GI != GE; ++GI) {
StringRef Name = GI->getName();
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
continue;
-
+
GI->setName("global");
}
-
+
// Rename all struct types
TypeFinder StructTypes;
StructTypes.run(M, true);
for (StructType *STy : StructTypes) {
if (STy->isLiteral() || STy->getName().empty())
continue;
-
+
SmallString<128> NameStorage;
STy->setName(
(Twine("struct.") + renamer.newName()).toStringRef(NameStorage));
}
-
+
// Rename all functions
for (auto &F : M) {
StringRef Name = F.getName();
@@ -130,29 +130,29 @@ void MetaRename(Module &M,
if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
GetTLI(F).getLibFunc(F, Tmp))
continue;
-
+
// Leave @main alone. The output of -metarenamer might be passed to
// lli for execution and the latter needs a main entry point.
if (Name != "main")
F.setName(renamer.newName());
-
+
MetaRename(F);
}
}
-
+
struct MetaRenamer : public ModulePass {
// Pass identification, replacement for typeid
static char ID;
-
+
MetaRenamer() : ModulePass(ID) {
initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
}
-
+
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.setPreservesAll();
}
-
+
bool runOnModule(Module &M) override {
auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
@@ -161,24 +161,24 @@ struct MetaRenamer : public ModulePass {
return true;
}
};
-
-} // end anonymous namespace
-
-char MetaRenamer::ID = 0;
-
-INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
- "Assign new names to everything", false, false)
-
-//===----------------------------------------------------------------------===//
-//
-// MetaRenamer - Rename everything with metasyntactic names.
-//
-ModulePass *llvm::createMetaRenamerPass() {
- return new MetaRenamer();
-}
+
+} // end anonymous namespace
+
+char MetaRenamer::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+ "Assign new names to everything", false, false)
+
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+ return new MetaRenamer();
+}
PreservedAnalyses MetaRenamerPass::run(Module &M, ModuleAnalysisManager &AM) {
FunctionAnalysisManager &FAM =
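
The renaming above is deterministic for a given module: the PRNG is the sample generator from the ISO C standard, seeded with the byte sum of the module identifier, and names are drawn from the fixed metasyntactic table. A standalone sketch of that scheme is below; the module identifier is invented and std::printf stands in for actually renaming IR objects.

// Standalone sketch of the MetaRenamer naming scheme: ISO C sample PRNG,
// additive seed from the module identifier, names from a fixed table.
#include <cstdio>
#include <string>

struct PRNG {
  unsigned long next = 0;
  void srand(unsigned int seed) { next = seed; }
  int rand() {
    next = next * 1103515245 + 12345;
    return (unsigned int)(next / 65536) % 32768;
  }
};

static const char *const MetaNames[] = {
    "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
    "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"};

int main() {
  std::string ModuleId = "example.ll"; // invented; the pass uses M.getModuleIdentifier()
  unsigned Seed = 0;
  for (char C : ModuleId) // same additive seeding as MetaRename(Module &)
    Seed += (unsigned char)C;

  PRNG Prng;
  Prng.srand(Seed);
  constexpr unsigned NumNames = sizeof(MetaNames) / sizeof(MetaNames[0]);
  // Emit a few struct names the way the pass would ("struct." + name).
  for (int I = 0; I < 4; ++I)
    std::printf("struct.%s\n", MetaNames[Prng.rand() % NumNames]);
}
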
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
index fbd6ddecbb..ef9f18a228 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ModuleUtils.cpp
@@ -1,320 +1,320 @@
-//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This family of functions performs manipulations on Modules.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "moduleutils"
-
-static void appendToGlobalArray(const char *Array, Module &M, Function *F,
- int Priority, Constant *Data) {
- IRBuilder<> IRB(M.getContext());
- FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
-
- // Get the current set of static global constructors and add the new ctor
- // to the list.
- SmallVector<Constant *, 16> CurrentCtors;
- StructType *EltTy = StructType::get(
- IRB.getInt32Ty(), PointerType::getUnqual(FnTy), IRB.getInt8PtrTy());
- if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
- if (Constant *Init = GVCtor->getInitializer()) {
- unsigned n = Init->getNumOperands();
- CurrentCtors.reserve(n + 1);
- for (unsigned i = 0; i != n; ++i)
- CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
- }
- GVCtor->eraseFromParent();
- }
-
- // Build a 3 field global_ctor entry. We don't take a comdat key.
- Constant *CSVals[3];
- CSVals[0] = IRB.getInt32(Priority);
- CSVals[1] = F;
- CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
- : Constant::getNullValue(IRB.getInt8PtrTy());
- Constant *RuntimeCtorInit =
- ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
-
- CurrentCtors.push_back(RuntimeCtorInit);
-
- // Create a new initializer.
- ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size());
- Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
-
- // Create the new global variable and replace all uses of
- // the old global variable with the new one.
- (void)new GlobalVariable(M, NewInit->getType(), false,
- GlobalValue::AppendingLinkage, NewInit, Array);
-}
-
-void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
- appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
-}
-
-void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
- appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
-}
-
-static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
- GlobalVariable *GV = M.getGlobalVariable(Name);
- SmallPtrSet<Constant *, 16> InitAsSet;
- SmallVector<Constant *, 16> Init;
- if (GV) {
- auto *CA = cast<ConstantArray>(GV->getInitializer());
- for (auto &Op : CA->operands()) {
- Constant *C = cast_or_null<Constant>(Op);
- if (InitAsSet.insert(C).second)
- Init.push_back(C);
- }
- GV->eraseFromParent();
- }
-
- Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
- for (auto *V : Values) {
- Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
- if (InitAsSet.insert(C).second)
- Init.push_back(C);
- }
-
- if (Init.empty())
- return;
-
- ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
- GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
- ConstantArray::get(ATy, Init), Name);
- GV->setSection("llvm.metadata");
-}
-
-void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
- appendToUsedList(M, "llvm.used", Values);
-}
-
-void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
- appendToUsedList(M, "llvm.compiler.used", Values);
-}
-
-FunctionCallee
-llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
- ArrayRef<Type *> InitArgTypes) {
- assert(!InitName.empty() && "Expected init function name");
- return M.getOrInsertFunction(
- InitName,
- FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
- AttributeList());
-}
-
-Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
- Function *Ctor = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalValue::InternalLinkage, CtorName, &M);
- BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
- ReturnInst::Create(M.getContext(), CtorBB);
- return Ctor;
-}
-
-std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
- Module &M, StringRef CtorName, StringRef InitName,
- ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
- StringRef VersionCheckName) {
- assert(!InitName.empty() && "Expected init function name");
- assert(InitArgs.size() == InitArgTypes.size() &&
- "Sanitizer's init function expects different number of arguments");
- FunctionCallee InitFunction =
- declareSanitizerInitFunction(M, InitName, InitArgTypes);
- Function *Ctor = createSanitizerCtor(M, CtorName);
- IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator());
- IRB.CreateCall(InitFunction, InitArgs);
- if (!VersionCheckName.empty()) {
- FunctionCallee VersionCheckFunction = M.getOrInsertFunction(
- VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
- AttributeList());
- IRB.CreateCall(VersionCheckFunction, {});
- }
- return std::make_pair(Ctor, InitFunction);
-}
-
-std::pair<Function *, FunctionCallee>
-llvm::getOrCreateSanitizerCtorAndInitFunctions(
- Module &M, StringRef CtorName, StringRef InitName,
- ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
- function_ref<void(Function *, FunctionCallee)> FunctionsCreatedCallback,
- StringRef VersionCheckName) {
- assert(!CtorName.empty() && "Expected ctor function name");
-
- if (Function *Ctor = M.getFunction(CtorName))
- // FIXME: Sink this logic into the module, similar to the handling of
- // globals. This will make moving to a concurrent model much easier.
- if (Ctor->arg_size() == 0 ||
- Ctor->getReturnType() == Type::getVoidTy(M.getContext()))
- return {Ctor, declareSanitizerInitFunction(M, InitName, InitArgTypes)};
-
- Function *Ctor;
- FunctionCallee InitFunction;
- std::tie(Ctor, InitFunction) = llvm::createSanitizerCtorAndInitFunctions(
- M, CtorName, InitName, InitArgTypes, InitArgs, VersionCheckName);
- FunctionsCreatedCallback(Ctor, InitFunction);
- return std::make_pair(Ctor, InitFunction);
-}
-
-Function *llvm::getOrCreateInitFunction(Module &M, StringRef Name) {
- assert(!Name.empty() && "Expected init function name");
- if (Function *F = M.getFunction(Name)) {
- if (F->arg_size() != 0 ||
- F->getReturnType() != Type::getVoidTy(M.getContext())) {
- std::string Err;
- raw_string_ostream Stream(Err);
- Stream << "Sanitizer interface function defined with wrong type: " << *F;
- report_fatal_error(Err);
- }
- return F;
- }
- Function *F =
- cast<Function>(M.getOrInsertFunction(Name, AttributeList(),
- Type::getVoidTy(M.getContext()))
- .getCallee());
-
- appendToGlobalCtors(M, F, 0);
-
- return F;
-}
-
-void llvm::filterDeadComdatFunctions(
- Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
- // Build a map from the comdat to the number of entries in that comdat we
- // think are dead. If this fully covers the comdat group, then the entire
- // group is dead. If we find another entry in the comdat group though, we'll
- // have to preserve the whole group.
- SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
- for (Function *F : DeadComdatFunctions) {
- Comdat *C = F->getComdat();
- assert(C && "Expected all input GVs to be in a comdat!");
- ComdatEntriesCovered[C] += 1;
- }
-
- auto CheckComdat = [&](Comdat &C) {
- auto CI = ComdatEntriesCovered.find(&C);
- if (CI == ComdatEntriesCovered.end())
- return;
-
- // If this could have been covered by a dead entry, just subtract one to
- // account for it.
- if (CI->second > 0) {
- CI->second -= 1;
- return;
- }
-
- // If we've already accounted for all the entries that were dead, the
- // entire comdat is alive so remove it from the map.
- ComdatEntriesCovered.erase(CI);
- };
-
- auto CheckAllComdats = [&] {
- for (Function &F : M.functions())
- if (Comdat *C = F.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalVariable &GV : M.globals())
- if (Comdat *C = GV.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- for (GlobalAlias &GA : M.aliases())
- if (Comdat *C = GA.getComdat()) {
- CheckComdat(*C);
- if (ComdatEntriesCovered.empty())
- return;
- }
- };
- CheckAllComdats();
-
- if (ComdatEntriesCovered.empty()) {
- DeadComdatFunctions.clear();
- return;
- }
-
- // Remove the entries that were not covering.
- erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
- return ComdatEntriesCovered.find(GV->getComdat()) ==
- ComdatEntriesCovered.end();
- });
-}
-
-std::string llvm::getUniqueModuleId(Module *M) {
- MD5 Md5;
- bool ExportsSymbols = false;
- auto AddGlobal = [&](GlobalValue &GV) {
- if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
- !GV.hasExternalLinkage() || GV.hasComdat())
- return;
- ExportsSymbols = true;
- Md5.update(GV.getName());
- Md5.update(ArrayRef<uint8_t>{0});
- };
-
- for (auto &F : *M)
- AddGlobal(F);
- for (auto &GV : M->globals())
- AddGlobal(GV);
- for (auto &GA : M->aliases())
- AddGlobal(GA);
- for (auto &IF : M->ifuncs())
- AddGlobal(IF);
-
- if (!ExportsSymbols)
- return "";
-
- MD5::MD5Result R;
- Md5.final(R);
-
- SmallString<32> Str;
- MD5::stringifyResult(R, Str);
- return ("$" + Str).str();
-}
-
-void VFABI::setVectorVariantNames(
- CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) {
- if (VariantMappings.empty())
- return;
-
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- for (const std::string &VariantMapping : VariantMappings)
- Out << VariantMapping << ",";
- // Get rid of the trailing ','.
- assert(!Buffer.str().empty() && "Must have at least one char.");
- Buffer.pop_back();
-
- Module *M = CI->getModule();
-#ifndef NDEBUG
- for (const std::string &VariantMapping : VariantMappings) {
- LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n");
- Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M);
- assert(VI.hasValue() && "Cannot add an invalid VFABI name.");
- assert(M->getNamedValue(VI.getValue().VectorName) &&
- "Cannot add variant to attribute: "
- "vector function declaration is missing.");
- }
-#endif
- CI->addAttribute(
- AttributeList::FunctionIndex,
- Attribute::get(M->getContext(), MappingsAttrName, Buffer.str()));
-}
+//===-- ModuleUtils.cpp - Functions to manipulate Modules -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This family of functions performs manipulations on Modules.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "moduleutils"
+
+static void appendToGlobalArray(const char *Array, Module &M, Function *F,
+ int Priority, Constant *Data) {
+ IRBuilder<> IRB(M.getContext());
+ FunctionType *FnTy = FunctionType::get(IRB.getVoidTy(), false);
+
+ // Get the current set of static global constructors and add the new ctor
+ // to the list.
+ SmallVector<Constant *, 16> CurrentCtors;
+ StructType *EltTy = StructType::get(
+ IRB.getInt32Ty(), PointerType::getUnqual(FnTy), IRB.getInt8PtrTy());
+ if (GlobalVariable *GVCtor = M.getNamedGlobal(Array)) {
+ if (Constant *Init = GVCtor->getInitializer()) {
+ unsigned n = Init->getNumOperands();
+ CurrentCtors.reserve(n + 1);
+ for (unsigned i = 0; i != n; ++i)
+ CurrentCtors.push_back(cast<Constant>(Init->getOperand(i)));
+ }
+ GVCtor->eraseFromParent();
+ }
+
+ // Build a 3 field global_ctor entry. We don't take a comdat key.
+ Constant *CSVals[3];
+ CSVals[0] = IRB.getInt32(Priority);
+ CSVals[1] = F;
+ CSVals[2] = Data ? ConstantExpr::getPointerCast(Data, IRB.getInt8PtrTy())
+ : Constant::getNullValue(IRB.getInt8PtrTy());
+ Constant *RuntimeCtorInit =
+ ConstantStruct::get(EltTy, makeArrayRef(CSVals, EltTy->getNumElements()));
+
+ CurrentCtors.push_back(RuntimeCtorInit);
+
+ // Create a new initializer.
+ ArrayType *AT = ArrayType::get(EltTy, CurrentCtors.size());
+ Constant *NewInit = ConstantArray::get(AT, CurrentCtors);
+
+ // Create the new global variable and replace all uses of
+ // the old global variable with the new one.
+ (void)new GlobalVariable(M, NewInit->getType(), false,
+ GlobalValue::AppendingLinkage, NewInit, Array);
+}
+
+void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_ctors", M, F, Priority, Data);
+}
+
+void llvm::appendToGlobalDtors(Module &M, Function *F, int Priority, Constant *Data) {
+ appendToGlobalArray("llvm.global_dtors", M, F, Priority, Data);
+}
+
+static void appendToUsedList(Module &M, StringRef Name, ArrayRef<GlobalValue *> Values) {
+ GlobalVariable *GV = M.getGlobalVariable(Name);
+ SmallPtrSet<Constant *, 16> InitAsSet;
+ SmallVector<Constant *, 16> Init;
+ if (GV) {
+ auto *CA = cast<ConstantArray>(GV->getInitializer());
+ for (auto &Op : CA->operands()) {
+ Constant *C = cast_or_null<Constant>(Op);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+ GV->eraseFromParent();
+ }
+
+ Type *Int8PtrTy = llvm::Type::getInt8PtrTy(M.getContext());
+ for (auto *V : Values) {
+ Constant *C = ConstantExpr::getBitCast(V, Int8PtrTy);
+ if (InitAsSet.insert(C).second)
+ Init.push_back(C);
+ }
+
+ if (Init.empty())
+ return;
+
+ ArrayType *ATy = ArrayType::get(Int8PtrTy, Init.size());
+ GV = new llvm::GlobalVariable(M, ATy, false, GlobalValue::AppendingLinkage,
+ ConstantArray::get(ATy, Init), Name);
+ GV->setSection("llvm.metadata");
+}
+
+void llvm::appendToUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.used", Values);
+}
+
+void llvm::appendToCompilerUsed(Module &M, ArrayRef<GlobalValue *> Values) {
+ appendToUsedList(M, "llvm.compiler.used", Values);
+}
+
+FunctionCallee
+llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes) {
+ assert(!InitName.empty() && "Expected init function name");
+ return M.getOrInsertFunction(
+ InitName,
+ FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+ AttributeList());
+}
+
+Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
+ Function *Ctor = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, CtorName, &M);
+ BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
+ ReturnInst::Create(M.getContext(), CtorBB);
+ return Ctor;
+}
+
+std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
+ Module &M, StringRef CtorName, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
+ StringRef VersionCheckName) {
+ assert(!InitName.empty() && "Expected init function name");
+ assert(InitArgs.size() == InitArgTypes.size() &&
+ "Sanitizer's init function expects different number of arguments");
+ FunctionCallee InitFunction =
+ declareSanitizerInitFunction(M, InitName, InitArgTypes);
+ Function *Ctor = createSanitizerCtor(M, CtorName);
+ IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator());
+ IRB.CreateCall(InitFunction, InitArgs);
+ if (!VersionCheckName.empty()) {
+ FunctionCallee VersionCheckFunction = M.getOrInsertFunction(
+ VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
+ AttributeList());
+ IRB.CreateCall(VersionCheckFunction, {});
+ }
+ return std::make_pair(Ctor, InitFunction);
+}
+
+std::pair<Function *, FunctionCallee>
+llvm::getOrCreateSanitizerCtorAndInitFunctions(
+ Module &M, StringRef CtorName, StringRef InitName,
+ ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
+ function_ref<void(Function *, FunctionCallee)> FunctionsCreatedCallback,
+ StringRef VersionCheckName) {
+ assert(!CtorName.empty() && "Expected ctor function name");
+
+ if (Function *Ctor = M.getFunction(CtorName))
+ // FIXME: Sink this logic into the module, similar to the handling of
+ // globals. This will make moving to a concurrent model much easier.
+ if (Ctor->arg_size() == 0 ||
+ Ctor->getReturnType() == Type::getVoidTy(M.getContext()))
+ return {Ctor, declareSanitizerInitFunction(M, InitName, InitArgTypes)};
+
+ Function *Ctor;
+ FunctionCallee InitFunction;
+ std::tie(Ctor, InitFunction) = llvm::createSanitizerCtorAndInitFunctions(
+ M, CtorName, InitName, InitArgTypes, InitArgs, VersionCheckName);
+ FunctionsCreatedCallback(Ctor, InitFunction);
+ return std::make_pair(Ctor, InitFunction);
+}
+
+Function *llvm::getOrCreateInitFunction(Module &M, StringRef Name) {
+ assert(!Name.empty() && "Expected init function name");
+ if (Function *F = M.getFunction(Name)) {
+ if (F->arg_size() != 0 ||
+ F->getReturnType() != Type::getVoidTy(M.getContext())) {
+ std::string Err;
+ raw_string_ostream Stream(Err);
+ Stream << "Sanitizer interface function defined with wrong type: " << *F;
+ report_fatal_error(Err);
+ }
+ return F;
+ }
+ Function *F =
+ cast<Function>(M.getOrInsertFunction(Name, AttributeList(),
+ Type::getVoidTy(M.getContext()))
+ .getCallee());
+
+ appendToGlobalCtors(M, F, 0);
+
+ return F;
+}
+
+void llvm::filterDeadComdatFunctions(
+ Module &M, SmallVectorImpl<Function *> &DeadComdatFunctions) {
+ // Build a map from the comdat to the number of entries in that comdat we
+ // think are dead. If this fully covers the comdat group, then the entire
+ // group is dead. If we find another entry in the comdat group though, we'll
+ // have to preserve the whole group.
+ SmallDenseMap<Comdat *, int, 16> ComdatEntriesCovered;
+ for (Function *F : DeadComdatFunctions) {
+ Comdat *C = F->getComdat();
+ assert(C && "Expected all input GVs to be in a comdat!");
+ ComdatEntriesCovered[C] += 1;
+ }
+
+ auto CheckComdat = [&](Comdat &C) {
+ auto CI = ComdatEntriesCovered.find(&C);
+ if (CI == ComdatEntriesCovered.end())
+ return;
+
+ // If this could have been covered by a dead entry, just subtract one to
+ // account for it.
+ if (CI->second > 0) {
+ CI->second -= 1;
+ return;
+ }
+
+ // If we've already accounted for all the entries that were dead, the
+ // entire comdat is alive so remove it from the map.
+ ComdatEntriesCovered.erase(CI);
+ };
+
+ auto CheckAllComdats = [&] {
+ for (Function &F : M.functions())
+ if (Comdat *C = F.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalVariable &GV : M.globals())
+ if (Comdat *C = GV.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ for (GlobalAlias &GA : M.aliases())
+ if (Comdat *C = GA.getComdat()) {
+ CheckComdat(*C);
+ if (ComdatEntriesCovered.empty())
+ return;
+ }
+ };
+ CheckAllComdats();
+
+ if (ComdatEntriesCovered.empty()) {
+ DeadComdatFunctions.clear();
+ return;
+ }
+
+ // Remove the entries that were not covering.
+ erase_if(DeadComdatFunctions, [&](GlobalValue *GV) {
+ return ComdatEntriesCovered.find(GV->getComdat()) ==
+ ComdatEntriesCovered.end();
+ });
+}
+
+std::string llvm::getUniqueModuleId(Module *M) {
+ MD5 Md5;
+ bool ExportsSymbols = false;
+ auto AddGlobal = [&](GlobalValue &GV) {
+ if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
+ !GV.hasExternalLinkage() || GV.hasComdat())
+ return;
+ ExportsSymbols = true;
+ Md5.update(GV.getName());
+ Md5.update(ArrayRef<uint8_t>{0});
+ };
+
+ for (auto &F : *M)
+ AddGlobal(F);
+ for (auto &GV : M->globals())
+ AddGlobal(GV);
+ for (auto &GA : M->aliases())
+ AddGlobal(GA);
+ for (auto &IF : M->ifuncs())
+ AddGlobal(IF);
+
+ if (!ExportsSymbols)
+ return "";
+
+ MD5::MD5Result R;
+ Md5.final(R);
+
+ SmallString<32> Str;
+ MD5::stringifyResult(R, Str);
+ return ("$" + Str).str();
+}
+
+void VFABI::setVectorVariantNames(
+ CallInst *CI, const SmallVector<std::string, 8> &VariantMappings) {
+ if (VariantMappings.empty())
+ return;
+
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ for (const std::string &VariantMapping : VariantMappings)
+ Out << VariantMapping << ",";
+ // Get rid of the trailing ','.
+ assert(!Buffer.str().empty() && "Must have at least one char.");
+ Buffer.pop_back();
+
+ Module *M = CI->getModule();
+#ifndef NDEBUG
+ for (const std::string &VariantMapping : VariantMappings) {
+ LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n");
+ Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M);
+ assert(VI.hasValue() && "Cannot add an invalid VFABI name.");
+ assert(M->getNamedValue(VI.getValue().VectorName) &&
+ "Cannot add variant to attribute: "
+ "vector function declaration is missing.");
+ }
+#endif
+ CI->addAttribute(
+ AttributeList::FunctionIndex,
+ Attribute::get(M->getContext(), MappingsAttrName, Buffer.str()));
+}
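
Of the helpers above, filterDeadComdatFunctions has the least obvious invariant: a dead function that lives in a comdat may only be dropped if every member of that comdat group is also known to be dead. It counts the dead entries per comdat, lets every comdat member in the module consume one count, and erases any comdat that runs out of counts (it has a live member). The standalone sketch below reproduces that bookkeeping with strings standing in for Comdat and Function; the names are invented.

// Standalone sketch of the comdat-coverage check: only fully-dead comdat
// groups remain in "Covered" at the end.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

int main() {
  // comdat name -> all members present in the module
  std::map<std::string, std::vector<std::string>> Comdats = {
      {"C1", {"f1", "f2"}},       // both members dead -> droppable
      {"C2", {"g1", "g2", "g3"}}, // g3 is still alive -> group must be kept
  };
  std::map<std::string, std::string> ComdatOf = {
      {"f1", "C1"}, {"f2", "C1"}, {"g1", "C2"}, {"g2", "C2"}, {"g3", "C2"}};

  std::vector<std::string> DeadComdatFunctions = {"f1", "f2", "g1", "g2"};

  // Step 1: count the dead entries per comdat.
  std::map<std::string, int> Covered;
  for (const std::string &F : DeadComdatFunctions)
    Covered[ComdatOf[F]] += 1;

  // Step 2: every member of every comdat in the module consumes one count; a
  // comdat that runs out of counts has a live member and is erased.
  for (auto &KV : Comdats)
    for (size_t I = 0, N = KV.second.size(); I != N; ++I) {
      auto It = Covered.find(KV.first);
      if (It == Covered.end())
        continue;
      if (It->second > 0)
        It->second -= 1;
      else
        Covered.erase(It);
    }

  // Step 3: keep only dead functions whose whole comdat is covered.
  for (const std::string &F : DeadComdatFunctions)
    if (Covered.count(ComdatOf[F]))
      std::printf("can drop %s\n", F.c_str());
    else
      std::printf("must keep %s (comdat %s has a live member)\n", F.c_str(),
                  ComdatOf[F].c_str());
}
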
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
index 1b036854fd..7083789267 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -1,120 +1,120 @@
-//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements naming anonymous globals to make sure they can be
-// referred to by ThinLTO.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/NameAnonGlobals.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-namespace {
-// Compute a "unique" hash for the module based on the names of the public
-// globals.
-class ModuleHasher {
- Module &TheModule;
- std::string TheHash;
-
-public:
- ModuleHasher(Module &M) : TheModule(M) {}
-
- /// Return the lazily computed hash.
- std::string &get() {
- if (!TheHash.empty())
- // Cache hit :)
- return TheHash;
-
- MD5 Hasher;
- for (auto &F : TheModule) {
- if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
- continue;
- auto Name = F.getName();
- Hasher.update(Name);
- }
- for (auto &GV : TheModule.globals()) {
- if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
- continue;
- auto Name = GV.getName();
- Hasher.update(Name);
- }
-
- // Now return the result.
- MD5::MD5Result Hash;
- Hasher.final(Hash);
- SmallString<32> Result;
- MD5::stringifyResult(Hash, Result);
- TheHash = std::string(Result.str());
- return TheHash;
- }
-};
-} // end anonymous namespace
-
-// Rename all the anon globals in the module
-bool llvm::nameUnamedGlobals(Module &M) {
- bool Changed = false;
- ModuleHasher ModuleHash(M);
- int count = 0;
- auto RenameIfNeed = [&](GlobalValue &GV) {
- if (GV.hasName())
- return;
- GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
- Changed = true;
- };
- for (auto &GO : M.global_objects())
- RenameIfNeed(GO);
- for (auto &GA : M.aliases())
- RenameIfNeed(GA);
-
- return Changed;
-}
-
-namespace {
-
-// Legacy pass that provides a name to every anonymous global.
-class NameAnonGlobalLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override { return "Name Anon Globals"; }
-
- explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
-
- bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
-};
-char NameAnonGlobalLegacyPass::ID = 0;
-
-} // anonymous namespace
-
-PreservedAnalyses NameAnonGlobalPass::run(Module &M,
- ModuleAnalysisManager &AM) {
- if (!nameUnamedGlobals(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
- "Provide a name to nameless globals", false, false)
-
-namespace llvm {
-ModulePass *createNameAnonGlobalPass() {
- return new NameAnonGlobalLegacyPass();
-}
-}
+//===- NameAnonGlobals.cpp - ThinLTO Support: Name Unnamed Globals --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements naming anonymous globals to make sure they can be
+// referred to by ThinLTO.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+namespace {
+// Compute a "unique" hash for the module based on the names of the public
+// globals.
+class ModuleHasher {
+ Module &TheModule;
+ std::string TheHash;
+
+public:
+ ModuleHasher(Module &M) : TheModule(M) {}
+
+ /// Return the lazily computed hash.
+ std::string &get() {
+ if (!TheHash.empty())
+ // Cache hit :)
+ return TheHash;
+
+ MD5 Hasher;
+ for (auto &F : TheModule) {
+ if (F.isDeclaration() || F.hasLocalLinkage() || !F.hasName())
+ continue;
+ auto Name = F.getName();
+ Hasher.update(Name);
+ }
+ for (auto &GV : TheModule.globals()) {
+ if (GV.isDeclaration() || GV.hasLocalLinkage() || !GV.hasName())
+ continue;
+ auto Name = GV.getName();
+ Hasher.update(Name);
+ }
+
+ // Now return the result.
+ MD5::MD5Result Hash;
+ Hasher.final(Hash);
+ SmallString<32> Result;
+ MD5::stringifyResult(Hash, Result);
+ TheHash = std::string(Result.str());
+ return TheHash;
+ }
+};
+} // end anonymous namespace
+
+// Rename all the anon globals in the module
+bool llvm::nameUnamedGlobals(Module &M) {
+ bool Changed = false;
+ ModuleHasher ModuleHash(M);
+ int count = 0;
+ auto RenameIfNeed = [&](GlobalValue &GV) {
+ if (GV.hasName())
+ return;
+ GV.setName(Twine("anon.") + ModuleHash.get() + "." + Twine(count++));
+ Changed = true;
+ };
+ for (auto &GO : M.global_objects())
+ RenameIfNeed(GO);
+ for (auto &GA : M.aliases())
+ RenameIfNeed(GA);
+
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that provides a name to every anonymous global.
+class NameAnonGlobalLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override { return "Name Anon Globals"; }
+
+ explicit NameAnonGlobalLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) override { return nameUnamedGlobals(M); }
+};
+char NameAnonGlobalLegacyPass::ID = 0;
+
+} // anonymous namespace
+
+PreservedAnalyses NameAnonGlobalPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!nameUnamedGlobals(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+INITIALIZE_PASS_END(NameAnonGlobalLegacyPass, "name-anon-globals",
+ "Provide a name to nameless globals", false, false)
+
+namespace llvm {
+ModulePass *createNameAnonGlobalPass() {
+ return new NameAnonGlobalLegacyPass();
+}
+}
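
Editorial aside, not part of the diff above: a minimal sketch of how the nameUnamedGlobals utility behaves, assuming LLVM 12 headers are available. The module name "demo" and the unnamed global are hypothetical; only the naming scheme ("anon." + module hash + "." + counter) comes from the source.

#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/Transforms/Utils/NameAnonGlobals.h"

int main() {
  llvm::LLVMContext Ctx;
  llvm::Module M("demo", Ctx);                 // hypothetical module
  auto *I32 = llvm::Type::getInt32Ty(Ctx);
  // An unnamed private global: a candidate for the renaming above.
  new llvm::GlobalVariable(M, I32, /*isConstant=*/true,
                           llvm::GlobalValue::PrivateLinkage,
                           llvm::ConstantInt::get(I32, 0));
  // After this call the global carries a name of the form
  // "anon.<md5-of-public-names>.0", so ThinLTO can refer to it.
  bool Changed = llvm::nameUnamedGlobals(M);
  return Changed ? 0 : 1;
}
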
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
index 8e2a2ba8de..3312a6f945 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/PredicateInfo.cpp
@@ -1,376 +1,376 @@
-//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------===//
-//
-// This file implements the PredicateInfo class.
-//
-//===----------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/PredicateInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/IR/AssemblyAnnotationWriter.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/DebugCounter.h"
-#include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Utils.h"
-#include <algorithm>
-#define DEBUG_TYPE "predicateinfo"
-using namespace llvm;
-using namespace PatternMatch;
-
-INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
- "PredicateInfo Printer", false, false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
- "PredicateInfo Printer", false, false)
-static cl::opt<bool> VerifyPredicateInfo(
- "verify-predicateinfo", cl::init(false), cl::Hidden,
- cl::desc("Verify PredicateInfo in legacy printer pass."));
-DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
- "Controls which variables are renamed with predicateinfo");
-
+//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Utils.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+ "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+ "verify-predicateinfo", cl::init(false), cl::Hidden,
+ cl::desc("Verify PredicateInfo in legacy printer pass."));
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+ "Controls which variables are renamed with predicateinfo");
+
// Maximum number of conditions considered for renaming for each branch/assume.
// This limits renaming of deep and/or chains.
static const unsigned MaxCondsPerBranch = 8;
-namespace {
-// Given a predicate info that is a type of branching terminator, get the
-// branching block.
-const BasicBlock *getBranchBlock(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Only branches and switches should have PHIOnly defs that "
- "require branch blocks.");
- return cast<PredicateWithEdge>(PB)->From;
-}
-
-// Given a predicate info that is a type of branching terminator, get the
-// branching terminator.
-static Instruction *getBranchTerminator(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Not a predicate info type we know how to get a terminator from.");
- return cast<PredicateWithEdge>(PB)->From->getTerminator();
-}
-
-// Given a predicate info that is a type of branching terminator, get the
-// edge this predicate info represents
-const std::pair<BasicBlock *, BasicBlock *>
-getBlockEdge(const PredicateBase *PB) {
- assert(isa<PredicateWithEdge>(PB) &&
- "Not a predicate info type we know how to get an edge from.");
- const auto *PEdge = cast<PredicateWithEdge>(PB);
- return std::make_pair(PEdge->From, PEdge->To);
-}
-}
-
-namespace llvm {
-enum LocalNum {
- // Operations that must appear first in the block.
- LN_First,
- // Operations that are somewhere in the middle of the block, and are sorted on
- // demand.
- LN_Middle,
- // Operations that must appear last in a block, like successor phi node uses.
- LN_Last
-};
-
-// Associate global and local DFS info with defs and uses, so we can sort them
-// into a global domination ordering.
-struct ValueDFS {
- int DFSIn = 0;
- int DFSOut = 0;
- unsigned int LocalNum = LN_Middle;
- // Only one of Def or Use will be set.
- Value *Def = nullptr;
- Use *U = nullptr;
- // Neither PInfo nor EdgeOnly participate in the ordering
- PredicateBase *PInfo = nullptr;
- bool EdgeOnly = false;
-};
-
-// Perform a strict weak ordering on instructions and arguments.
-static bool valueComesBefore(const Value *A, const Value *B) {
- auto *ArgA = dyn_cast_or_null<Argument>(A);
- auto *ArgB = dyn_cast_or_null<Argument>(B);
- if (ArgA && !ArgB)
- return true;
- if (ArgB && !ArgA)
- return false;
- if (ArgA && ArgB)
- return ArgA->getArgNo() < ArgB->getArgNo();
- return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
-}
-
-// This compares ValueDFS structures. Doing so allows us to walk the minimum
-// number of instructions necessary to compute our def/use ordering.
-struct ValueDFS_Compare {
- DominatorTree &DT;
- ValueDFS_Compare(DominatorTree &DT) : DT(DT) {}
-
- bool operator()(const ValueDFS &A, const ValueDFS &B) const {
- if (&A == &B)
- return false;
-    // The only case we can't directly compare them is when they are in the same
- // block, and both have localnum == middle. In that case, we have to use
- // comesbefore to see what the real ordering is, because they are in the
- // same basic block.
-
- assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) &&
- "Equal DFS-in numbers imply equal out numbers");
- bool SameBlock = A.DFSIn == B.DFSIn;
-
- // We want to put the def that will get used for a given set of phi uses,
- // before those phi uses.
- // So we sort by edge, then by def.
-    // Note that only phi node uses and defs can come last.
- if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
- return comparePHIRelated(A, B);
-
- bool isADef = A.Def;
- bool isBDef = B.Def;
- if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
- return std::tie(A.DFSIn, A.LocalNum, isADef) <
- std::tie(B.DFSIn, B.LocalNum, isBDef);
- return localComesBefore(A, B);
- }
-
- // For a phi use, or a non-materialized def, return the edge it represents.
- const std::pair<BasicBlock *, BasicBlock *>
- getBlockEdge(const ValueDFS &VD) const {
- if (!VD.Def && VD.U) {
- auto *PHI = cast<PHINode>(VD.U->getUser());
- return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
- }
- // This is really a non-materialized def.
- return ::getBlockEdge(VD.PInfo);
- }
-
- // For two phi related values, return the ordering.
- bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
- BasicBlock *ASrc, *ADest, *BSrc, *BDest;
- std::tie(ASrc, ADest) = getBlockEdge(A);
- std::tie(BSrc, BDest) = getBlockEdge(B);
-
-#ifndef NDEBUG
- // This function should only be used for values in the same BB, check that.
- DomTreeNode *DomASrc = DT.getNode(ASrc);
- DomTreeNode *DomBSrc = DT.getNode(BSrc);
- assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn &&
- "DFS numbers for A should match the ones of the source block");
- assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn &&
- "DFS numbers for B should match the ones of the source block");
- assert(A.DFSIn == B.DFSIn && "Values must be in the same block");
-#endif
- (void)ASrc;
- (void)BSrc;
-
- // Use DFS numbers to compare destination blocks, to guarantee a
- // deterministic order.
- DomTreeNode *DomADest = DT.getNode(ADest);
- DomTreeNode *DomBDest = DT.getNode(BDest);
- unsigned AIn = DomADest->getDFSNumIn();
- unsigned BIn = DomBDest->getDFSNumIn();
- bool isADef = A.Def;
- bool isBDef = B.Def;
- assert((!A.Def || !A.U) && (!B.Def || !B.U) &&
- "Def and U cannot be set at the same time");
- // Now sort by edge destination and then defs before uses.
- return std::tie(AIn, isADef) < std::tie(BIn, isBDef);
- }
-
- // Get the definition of an instruction that occurs in the middle of a block.
- Value *getMiddleDef(const ValueDFS &VD) const {
- if (VD.Def)
- return VD.Def;
- // It's possible for the defs and uses to be null. For branches, the local
-    // numbering will say the placed predicateinfos should go first (i.e.,
-    // LN_First), so we won't be in this function. For assumes, we will end
-    // up here, because we need to order the def we will place relative to the
- // assume. So for the purpose of ordering, we pretend the def is right
- // after the assume, because that is where we will insert the info.
- if (!VD.U) {
- assert(VD.PInfo &&
- "No def, no use, and no predicateinfo should not occur");
- assert(isa<PredicateAssume>(VD.PInfo) &&
- "Middle of block should only occur for assumes");
- return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
- }
- return nullptr;
- }
-
- // Return either the Def, if it's not null, or the user of the Use, if the def
- // is null.
- const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
- if (Def)
- return cast<Instruction>(Def);
- return cast<Instruction>(U->getUser());
- }
-
- // This performs the necessary local basic block ordering checks to tell
- // whether A comes before B, where both are in the same basic block.
- bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
- auto *ADef = getMiddleDef(A);
- auto *BDef = getMiddleDef(B);
-
- // See if we have real values or uses. If we have real values, we are
- // guaranteed they are instructions or arguments. No matter what, we are
- // guaranteed they are in the same block if they are instructions.
- auto *ArgA = dyn_cast_or_null<Argument>(ADef);
- auto *ArgB = dyn_cast_or_null<Argument>(BDef);
-
- if (ArgA || ArgB)
- return valueComesBefore(ArgA, ArgB);
-
- auto *AInst = getDefOrUser(ADef, A.U);
- auto *BInst = getDefOrUser(BDef, B.U);
- return valueComesBefore(AInst, BInst);
- }
-};
-
-class PredicateInfoBuilder {
- // Used to store information about each value we might rename.
- struct ValueInfo {
- SmallVector<PredicateBase *, 4> Infos;
- };
-
- PredicateInfo &PI;
- Function &F;
- DominatorTree &DT;
- AssumptionCache &AC;
-
- // This stores info about each operand or comparison result we make copies
-  // of. The real ValueInfos start at index 1; index 0 is unused so that we
- // can more easily detect invalid indexing.
- SmallVector<ValueInfo, 32> ValueInfos;
-
- // This gives the index into the ValueInfos array for a given Value. Because
- // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
- // whether it returned a valid result.
- DenseMap<Value *, unsigned int> ValueInfoNums;
-
- // The set of edges along which we can only handle phi uses, due to critical
- // edges.
- DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
-
- ValueInfo &getOrCreateValueInfo(Value *);
- const ValueInfo &getValueInfo(Value *) const;
-
- void processAssume(IntrinsicInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void processBranch(BranchInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void processSwitch(SwitchInst *, BasicBlock *,
- SmallVectorImpl<Value *> &OpsToRename);
- void renameUses(SmallVectorImpl<Value *> &OpsToRename);
- void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
- PredicateBase *PB);
-
- typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
- void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
- Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
- bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
- void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
-
-public:
- PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT,
- AssumptionCache &AC)
- : PI(PI), F(F), DT(DT), AC(AC) {
- // Push an empty operand info so that we can detect 0 as not finding one
- ValueInfos.resize(1);
- }
-
- void buildPredicateInfo();
-};
-
-bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
- const ValueDFS &VDUse) const {
- if (Stack.empty())
- return false;
- // If it's a phi only use, make sure it's for this phi node edge, and that the
- // use is in a phi node. If it's anything else, and the top of the stack is
- // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
- // the defs they must go with so that we can know it's time to pop the stack
- // when we hit the end of the phi uses for a given def.
- if (Stack.back().EdgeOnly) {
- if (!VDUse.U)
- return false;
- auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
- if (!PHI)
- return false;
- // Check edge
- BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
- if (EdgePred != getBranchBlock(Stack.back().PInfo))
- return false;
-
- // Use dominates, which knows how to handle edge dominance.
- return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
- }
-
- return (VDUse.DFSIn >= Stack.back().DFSIn &&
- VDUse.DFSOut <= Stack.back().DFSOut);
-}
-
-void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
- const ValueDFS &VD) {
- while (!Stack.empty() && !stackIsInScope(Stack, VD))
- Stack.pop_back();
-}
-
-// Convert the uses of Op into a vector of uses, associating global and local
-// DFS info with each one.
-void PredicateInfoBuilder::convertUsesToDFSOrdered(
- Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
- for (auto &U : Op->uses()) {
- if (auto *I = dyn_cast<Instruction>(U.getUser())) {
- ValueDFS VD;
- // Put the phi node uses in the incoming block.
- BasicBlock *IBlock;
- if (auto *PN = dyn_cast<PHINode>(I)) {
- IBlock = PN->getIncomingBlock(U);
- // Make phi node users appear last in the incoming block
- // they are from.
- VD.LocalNum = LN_Last;
- } else {
- // If it's not a phi node use, it is somewhere in the middle of the
- // block.
- IBlock = I->getParent();
- VD.LocalNum = LN_Middle;
- }
- DomTreeNode *DomNode = DT.getNode(IBlock);
- // It's possible our use is in an unreachable block. Skip it if so.
- if (!DomNode)
- continue;
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.U = &U;
- DFSOrderedSet.push_back(VD);
- }
- }
-}
-
+namespace {
+// Given a predicate info that is a type of branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Only branches and switches should have PHIOnly defs that "
+ "require branch blocks.");
+ return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get a terminator from.");
+ return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// edge this predicate info represents
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+ assert(isa<PredicateWithEdge>(PB) &&
+ "Not a predicate info type we know how to get an edge from.");
+ const auto *PEdge = cast<PredicateWithEdge>(PB);
+ return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+enum LocalNum {
+ // Operations that must appear first in the block.
+ LN_First,
+ // Operations that are somewhere in the middle of the block, and are sorted on
+ // demand.
+ LN_Middle,
+ // Operations that must appear last in a block, like successor phi node uses.
+ LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+ int DFSIn = 0;
+ int DFSOut = 0;
+ unsigned int LocalNum = LN_Middle;
+ // Only one of Def or Use will be set.
+ Value *Def = nullptr;
+ Use *U = nullptr;
+ // Neither PInfo nor EdgeOnly participate in the ordering
+ PredicateBase *PInfo = nullptr;
+ bool EdgeOnly = false;
+};
+
+// Perform a strict weak ordering on instructions and arguments.
+static bool valueComesBefore(const Value *A, const Value *B) {
+ auto *ArgA = dyn_cast_or_null<Argument>(A);
+ auto *ArgB = dyn_cast_or_null<Argument>(B);
+ if (ArgA && !ArgB)
+ return true;
+ if (ArgB && !ArgA)
+ return false;
+ if (ArgA && ArgB)
+ return ArgA->getArgNo() < ArgB->getArgNo();
+ return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
+}
+
+// This compares ValueDFS structures. Doing so allows us to walk the minimum
+// number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+ DominatorTree &DT;
+ ValueDFS_Compare(DominatorTree &DT) : DT(DT) {}
+
+ bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+ if (&A == &B)
+ return false;
+    // The only case we can't directly compare them is when they are in the same
+ // block, and both have localnum == middle. In that case, we have to use
+ // comesbefore to see what the real ordering is, because they are in the
+ // same basic block.
+
+ assert((A.DFSIn != B.DFSIn || A.DFSOut == B.DFSOut) &&
+ "Equal DFS-in numbers imply equal out numbers");
+ bool SameBlock = A.DFSIn == B.DFSIn;
+
+ // We want to put the def that will get used for a given set of phi uses,
+ // before those phi uses.
+ // So we sort by edge, then by def.
+    // Note that only phi node uses and defs can come last.
+ if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+ return comparePHIRelated(A, B);
+
+ bool isADef = A.Def;
+ bool isBDef = B.Def;
+ if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+ return std::tie(A.DFSIn, A.LocalNum, isADef) <
+ std::tie(B.DFSIn, B.LocalNum, isBDef);
+ return localComesBefore(A, B);
+ }
+
+ // For a phi use, or a non-materialized def, return the edge it represents.
+ const std::pair<BasicBlock *, BasicBlock *>
+ getBlockEdge(const ValueDFS &VD) const {
+ if (!VD.Def && VD.U) {
+ auto *PHI = cast<PHINode>(VD.U->getUser());
+ return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+ }
+ // This is really a non-materialized def.
+ return ::getBlockEdge(VD.PInfo);
+ }
+
+ // For two phi related values, return the ordering.
+ bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+ BasicBlock *ASrc, *ADest, *BSrc, *BDest;
+ std::tie(ASrc, ADest) = getBlockEdge(A);
+ std::tie(BSrc, BDest) = getBlockEdge(B);
+
+#ifndef NDEBUG
+ // This function should only be used for values in the same BB, check that.
+ DomTreeNode *DomASrc = DT.getNode(ASrc);
+ DomTreeNode *DomBSrc = DT.getNode(BSrc);
+ assert(DomASrc->getDFSNumIn() == (unsigned)A.DFSIn &&
+ "DFS numbers for A should match the ones of the source block");
+ assert(DomBSrc->getDFSNumIn() == (unsigned)B.DFSIn &&
+ "DFS numbers for B should match the ones of the source block");
+ assert(A.DFSIn == B.DFSIn && "Values must be in the same block");
+#endif
+ (void)ASrc;
+ (void)BSrc;
+
+ // Use DFS numbers to compare destination blocks, to guarantee a
+ // deterministic order.
+ DomTreeNode *DomADest = DT.getNode(ADest);
+ DomTreeNode *DomBDest = DT.getNode(BDest);
+ unsigned AIn = DomADest->getDFSNumIn();
+ unsigned BIn = DomBDest->getDFSNumIn();
+ bool isADef = A.Def;
+ bool isBDef = B.Def;
+ assert((!A.Def || !A.U) && (!B.Def || !B.U) &&
+ "Def and U cannot be set at the same time");
+ // Now sort by edge destination and then defs before uses.
+ return std::tie(AIn, isADef) < std::tie(BIn, isBDef);
+ }
+
+ // Get the definition of an instruction that occurs in the middle of a block.
+ Value *getMiddleDef(const ValueDFS &VD) const {
+ if (VD.Def)
+ return VD.Def;
+ // It's possible for the defs and uses to be null. For branches, the local
+    // numbering will say the placed predicateinfos should go first (i.e.,
+    // LN_First), so we won't be in this function. For assumes, we will end
+    // up here, because we need to order the def we will place relative to the
+ // assume. So for the purpose of ordering, we pretend the def is right
+ // after the assume, because that is where we will insert the info.
+ if (!VD.U) {
+ assert(VD.PInfo &&
+ "No def, no use, and no predicateinfo should not occur");
+ assert(isa<PredicateAssume>(VD.PInfo) &&
+ "Middle of block should only occur for assumes");
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
+ }
+ return nullptr;
+ }
+
+ // Return either the Def, if it's not null, or the user of the Use, if the def
+ // is null.
+ const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+ if (Def)
+ return cast<Instruction>(Def);
+ return cast<Instruction>(U->getUser());
+ }
+
+ // This performs the necessary local basic block ordering checks to tell
+ // whether A comes before B, where both are in the same basic block.
+ bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+ auto *ADef = getMiddleDef(A);
+ auto *BDef = getMiddleDef(B);
+
+ // See if we have real values or uses. If we have real values, we are
+ // guaranteed they are instructions or arguments. No matter what, we are
+ // guaranteed they are in the same block if they are instructions.
+ auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+ auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+ if (ArgA || ArgB)
+ return valueComesBefore(ArgA, ArgB);
+
+ auto *AInst = getDefOrUser(ADef, A.U);
+ auto *BInst = getDefOrUser(BDef, B.U);
+ return valueComesBefore(AInst, BInst);
+ }
+};
+
+class PredicateInfoBuilder {
+ // Used to store information about each value we might rename.
+ struct ValueInfo {
+ SmallVector<PredicateBase *, 4> Infos;
+ };
+
+ PredicateInfo &PI;
+ Function &F;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+
+ // This stores info about each operand or comparison result we make copies
+  // of. The real ValueInfos start at index 1; index 0 is unused so that we
+ // can more easily detect invalid indexing.
+ SmallVector<ValueInfo, 32> ValueInfos;
+
+ // This gives the index into the ValueInfos array for a given Value. Because
+ // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
+ // whether it returned a valid result.
+ DenseMap<Value *, unsigned int> ValueInfoNums;
+
+ // The set of edges along which we can only handle phi uses, due to critical
+ // edges.
+ DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
+
+ ValueInfo &getOrCreateValueInfo(Value *);
+ const ValueInfo &getValueInfo(Value *) const;
+
+ void processAssume(IntrinsicInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processBranch(BranchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processSwitch(SwitchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void renameUses(SmallVectorImpl<Value *> &OpsToRename);
+ void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB);
+
+ typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
+ void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
+ Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
+ bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
+ void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
+
+public:
+ PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : PI(PI), F(F), DT(DT), AC(AC) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ }
+
+ void buildPredicateInfo();
+};
+
+bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
+ if (Stack.empty())
+ return false;
+ // If it's a phi only use, make sure it's for this phi node edge, and that the
+ // use is in a phi node. If it's anything else, and the top of the stack is
+ // EdgeOnly, we need to pop the stack. We deliberately sort phi uses next to
+ // the defs they must go with so that we can know it's time to pop the stack
+ // when we hit the end of the phi uses for a given def.
+ if (Stack.back().EdgeOnly) {
+ if (!VDUse.U)
+ return false;
+ auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+ if (!PHI)
+ return false;
+ // Check edge
+ BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+ if (EdgePred != getBranchBlock(Stack.back().PInfo))
+ return false;
+
+ // Use dominates, which knows how to handle edge dominance.
+ return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+ }
+
+ return (VDUse.DFSIn >= Stack.back().DFSIn &&
+ VDUse.DFSOut <= Stack.back().DFSOut);
+}
+
+void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
+ while (!Stack.empty() && !stackIsInScope(Stack, VD))
+ Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfoBuilder::convertUsesToDFSOrdered(
+ Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+ for (auto &U : Op->uses()) {
+ if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+ ValueDFS VD;
+ // Put the phi node uses in the incoming block.
+ BasicBlock *IBlock;
+ if (auto *PN = dyn_cast<PHINode>(I)) {
+ IBlock = PN->getIncomingBlock(U);
+ // Make phi node users appear last in the incoming block
+ // they are from.
+ VD.LocalNum = LN_Last;
+ } else {
+ // If it's not a phi node use, it is somewhere in the middle of the
+ // block.
+ IBlock = I->getParent();
+ VD.LocalNum = LN_Middle;
+ }
+ DomTreeNode *DomNode = DT.getNode(IBlock);
+ // It's possible our use is in an unreachable block. Skip it if so.
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.U = &U;
+ DFSOrderedSet.push_back(VD);
+ }
+ }
+}
+
bool shouldRename(Value *V) {
// Only want real values, not constants. Additionally, operands with one use
// are only being used in the comparison, which means they will not be useful
@@ -378,33 +378,33 @@ bool shouldRename(Value *V) {
return (isa<Instruction>(V) || isa<Argument>(V)) && !V->hasOneUse();
}
-// Collect relevant operations from Comparison that we may want to insert copies
-// for.
-void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
- auto *Op0 = Comparison->getOperand(0);
- auto *Op1 = Comparison->getOperand(1);
- if (Op0 == Op1)
- return;
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+ auto *Op0 = Comparison->getOperand(0);
+ auto *Op1 = Comparison->getOperand(1);
+ if (Op0 == Op1)
+ return;
CmpOperands.push_back(Op0);
CmpOperands.push_back(Op1);
-}
-
-// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
-void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename,
- Value *Op, PredicateBase *PB) {
- auto &OperandInfo = getOrCreateValueInfo(Op);
- if (OperandInfo.Infos.empty())
- OpsToRename.push_back(Op);
- PI.AllInfos.push_back(PB);
- OperandInfo.Infos.push_back(PB);
-}
-
-// Process an assume instruction and place relevant operations we want to rename
-// into OpsToRename.
-void PredicateInfoBuilder::processAssume(
- IntrinsicInst *II, BasicBlock *AssumeBB,
- SmallVectorImpl<Value *> &OpsToRename) {
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename,
+ Value *Op, PredicateBase *PB) {
+ auto &OperandInfo = getOrCreateValueInfo(Op);
+ if (OperandInfo.Infos.empty())
+ OpsToRename.push_back(Op);
+ PI.AllInfos.push_back(PB);
+ OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfoBuilder::processAssume(
+ IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
SmallVector<Value *, 4> Worklist;
SmallPtrSet<Value *, 4> Visited;
Worklist.push_back(II->getOperand(0));
@@ -414,7 +414,7 @@ void PredicateInfoBuilder::processAssume(
continue;
if (Visited.size() > MaxCondsPerBranch)
break;
-
+
Value *Op0, *Op1;
if (match(Cond, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
Worklist.push_back(Op1);
@@ -430,19 +430,19 @@ void PredicateInfoBuilder::processAssume(
if (shouldRename(V)) {
auto *PA = new PredicateAssume(V, II, Cond);
addInfoFor(OpsToRename, V, PA);
- }
- }
- }
-}
-
-// Process a block terminating branch, and place relevant operations to be
-// renamed into OpsToRename.
-void PredicateInfoBuilder::processBranch(
- BranchInst *BI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
- BasicBlock *FirstBB = BI->getSuccessor(0);
- BasicBlock *SecondBB = BI->getSuccessor(1);
-
+ }
+ }
+ }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfoBuilder::processBranch(
+ BranchInst *BI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
+ BasicBlock *FirstBB = BI->getSuccessor(0);
+ BasicBlock *SecondBB = BI->getSuccessor(1);
+
for (BasicBlock *Succ : {FirstBB, SecondBB}) {
bool TakenEdge = Succ == FirstBB;
// Don't try to insert on a self-edge. This is mainly because we will
@@ -456,10 +456,10 @@ void PredicateInfoBuilder::processBranch(
while (!Worklist.empty()) {
Value *Cond = Worklist.pop_back_val();
if (!Visited.insert(Cond).second)
- continue;
+ continue;
if (Visited.size() > MaxCondsPerBranch)
break;
-
+
Value *Op0, *Op1;
if (TakenEdge ? match(Cond, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))
: match(Cond, m_LogicalOr(m_Value(Op0), m_Value(Op1)))) {
@@ -481,324 +481,324 @@ void PredicateInfoBuilder::processBranch(
EdgeUsesOnly.insert({BranchBB, Succ});
}
}
- }
- }
-}
-// Process a block terminating switch, and place relevant operations to be
-// renamed into OpsToRename.
-void PredicateInfoBuilder::processSwitch(
- SwitchInst *SI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
- Value *Op = SI->getCondition();
- if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
- return;
-
- // Remember how many outgoing edges there are to every successor.
- SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
- for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
- BasicBlock *TargetBlock = SI->getSuccessor(i);
- ++SwitchEdges[TargetBlock];
- }
-
- // Now propagate info for each case value
- for (auto C : SI->cases()) {
- BasicBlock *TargetBlock = C.getCaseSuccessor();
- if (SwitchEdges.lookup(TargetBlock) == 1) {
- PredicateSwitch *PS = new PredicateSwitch(
- Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
- addInfoFor(OpsToRename, Op, PS);
- if (!TargetBlock->getSinglePredecessor())
- EdgeUsesOnly.insert({BranchBB, TargetBlock});
- }
- }
-}
-
-// Build predicate info for our function
-void PredicateInfoBuilder::buildPredicateInfo() {
- DT.updateDFSNumbers();
- // Collect operands to rename from all conditional branch terminators, as well
- // as assume statements.
- SmallVector<Value *, 8> OpsToRename;
- for (auto DTN : depth_first(DT.getRootNode())) {
- BasicBlock *BranchBB = DTN->getBlock();
- if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
- if (!BI->isConditional())
- continue;
- // Can't insert conditional information if they all go to the same place.
- if (BI->getSuccessor(0) == BI->getSuccessor(1))
- continue;
- processBranch(BI, BranchBB, OpsToRename);
- } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
- processSwitch(SI, BranchBB, OpsToRename);
- }
- }
- for (auto &Assume : AC.assumptions()) {
- if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
- if (DT.isReachableFromEntry(II->getParent()))
- processAssume(II, II->getParent(), OpsToRename);
- }
- // Now rename all our operations.
- renameUses(OpsToRename);
-}
-
-// Create an ssa_copy declaration with custom mangling, because
-// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
-// all unnamed types get mangled to the same string. We use the pointer
-// to the type as name here, as it guarantees unique names for different
-// types and we remove the declarations when destroying PredicateInfo.
-// It is a workaround for PR38117, because solving it in a fully general way is
-// tricky (FIXME).
-static Function *getCopyDeclaration(Module *M, Type *Ty) {
- std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
- return cast<Function>(
- M->getOrInsertFunction(Name,
- getType(M->getContext(), Intrinsic::ssa_copy, Ty))
- .getCallee());
-}
-
-// Given the renaming stack, make all the operands currently on the stack real
-// by inserting them into the IR. Return the last operation's value.
-Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
- ValueDFSStack &RenameStack,
- Value *OrigOp) {
- // Find the first thing we have to materialize
- auto RevIter = RenameStack.rbegin();
- for (; RevIter != RenameStack.rend(); ++RevIter)
- if (RevIter->Def)
- break;
-
- size_t Start = RevIter - RenameStack.rbegin();
- // The maximum number of things we should be trying to materialize at once
-  // right now is 4, depending on whether we had an assume, a branch, and
-  // whether both used and-of conditions.
- for (auto RenameIter = RenameStack.end() - Start;
- RenameIter != RenameStack.end(); ++RenameIter) {
- auto *Op =
- RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
- ValueDFS &Result = *RenameIter;
- auto *ValInfo = Result.PInfo;
- ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
- ? OrigOp
- : (RenameStack.end() - Start - 1)->Def;
- // For edge predicates, we can just place the operand in the block before
- // the terminator. For assume, we have to place it right before the assume
- // to ensure we dominate all of our uses. Always insert right before the
- // relevant instruction (terminator, assume), so that we insert in proper
- // order in the case of multiple predicateinfo in the same block.
- if (isa<PredicateWithEdge>(ValInfo)) {
- IRBuilder<> B(getBranchTerminator(ValInfo));
- Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
- if (IF->users().empty())
- PI.CreatedDeclarations.insert(IF);
- CallInst *PIC =
- B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
- PI.PredicateMap.insert({PIC, ValInfo});
- Result.Def = PIC;
- } else {
- auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
- assert(PAssume &&
- "Should not have gotten here without it being an assume");
- // Insert the predicate directly after the assume. While it also holds
- // directly before it, assume(i1 true) is not a useful fact.
- IRBuilder<> B(PAssume->AssumeInst->getNextNode());
- Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
- if (IF->users().empty())
- PI.CreatedDeclarations.insert(IF);
- CallInst *PIC = B.CreateCall(IF, Op);
- PI.PredicateMap.insert({PIC, ValInfo});
- Result.Def = PIC;
- }
- }
- return RenameStack.back().Def;
-}
-
-// Instead of the standard SSA renaming algorithm, which is O(Number of
-// instructions), and walks the entire dominator tree, we walk only the defs +
-// uses. The standard SSA renaming algorithm does not really rely on the
-// dominator tree except to order the stack push/pops of the renaming stacks, so
-// that defs end up getting pushed before hitting the correct uses. This does
-// not require the dominator tree, only the *order* of the dominator tree. The
-// complete and correct ordering of the defs and uses in the dominator tree is
-// contained in the DFS numbering of the dominator tree. So we sort the defs and
-// uses into the DFS ordering, and then just use the renaming stack as per
-// normal, pushing when we hit a def (which is a predicateinfo instruction),
-// popping when we are out of the dfs scope for that def, and replacing any uses
-// with top of stack if it exists. In order to handle liveness without
-// propagating liveness info, we don't actually insert the predicateinfo
-// instruction def until we see a use that it would dominate. Once we see such
-// a use, we materialize the predicateinfo instruction in the right place and
-// use it.
-//
-// TODO: Use this algorithm to perform fast single-variable renaming in
-// promotememtoreg and memoryssa.
-void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
- ValueDFS_Compare Compare(DT);
- // Compute liveness, and rename in O(uses) per Op.
- for (auto *Op : OpsToRename) {
- LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
- unsigned Counter = 0;
- SmallVector<ValueDFS, 16> OrderedUses;
- const auto &ValueInfo = getValueInfo(Op);
- // Insert the possible copies into the def/use list.
- // They will become real copies if we find a real use for them, and never
- // created otherwise.
- for (auto &PossibleCopy : ValueInfo.Infos) {
- ValueDFS VD;
- // Determine where we are going to place the copy by the copy type.
-      // The predicate info for branches always comes first; it will get
-      // materialized in the split block at the top of the block.
-      // The predicate info for assumes will be somewhere in the middle;
-      // it will get materialized in front of the assume.
- if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
- VD.LocalNum = LN_Middle;
- DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
- if (!DomNode)
- continue;
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- OrderedUses.push_back(VD);
- } else if (isa<PredicateWithEdge>(PossibleCopy)) {
- // If we can only do phi uses, we treat it like it's in the branch
- // block, and handle it specially. We know that it goes last, and only
-        // dominates phi uses.
- auto BlockEdge = getBlockEdge(PossibleCopy);
- if (EdgeUsesOnly.count(BlockEdge)) {
- VD.LocalNum = LN_Last;
- auto *DomNode = DT.getNode(BlockEdge.first);
- if (DomNode) {
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- VD.EdgeOnly = true;
- OrderedUses.push_back(VD);
- }
- } else {
- // Otherwise, we are in the split block (even though we perform
- // insertion in the branch block).
- // Insert a possible copy at the split block and before the branch.
- VD.LocalNum = LN_First;
- auto *DomNode = DT.getNode(BlockEdge.second);
- if (DomNode) {
- VD.DFSIn = DomNode->getDFSNumIn();
- VD.DFSOut = DomNode->getDFSNumOut();
- VD.PInfo = PossibleCopy;
- OrderedUses.push_back(VD);
- }
- }
- }
- }
-
- convertUsesToDFSOrdered(Op, OrderedUses);
- // Here we require a stable sort because we do not bother to try to
- // assign an order to the operands the uses represent. Thus, two
- // uses in the same instruction do not have a strict sort order
- // currently and will be considered equal. We could get rid of the
- // stable sort by creating one if we wanted.
- llvm::stable_sort(OrderedUses, Compare);
- SmallVector<ValueDFS, 8> RenameStack;
-    // For each use, sorted into DFS order, push values and replace uses with
-    // the top of the stack, which will represent the reaching def.
- for (auto &VD : OrderedUses) {
- // We currently do not materialize copy over copy, but we should decide if
- // we want to.
- bool PossibleCopy = VD.PInfo != nullptr;
- if (RenameStack.empty()) {
- LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
- } else {
- LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
- << RenameStack.back().DFSIn << ","
- << RenameStack.back().DFSOut << ")\n");
- }
-
- LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
- << VD.DFSOut << ")\n");
-
- bool ShouldPush = (VD.Def || PossibleCopy);
- bool OutOfScope = !stackIsInScope(RenameStack, VD);
- if (OutOfScope || ShouldPush) {
- // Sync to our current scope.
- popStackUntilDFSScope(RenameStack, VD);
- if (ShouldPush) {
- RenameStack.push_back(VD);
- }
- }
- // If we get to this point, and the stack is empty we must have a use
- // with no renaming needed, just skip it.
- if (RenameStack.empty())
- continue;
- // Skip values, only want to rename the uses
- if (VD.Def || PossibleCopy)
- continue;
- if (!DebugCounter::shouldExecute(RenameCounter)) {
- LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
- continue;
- }
- ValueDFS &Result = RenameStack.back();
-
- // If the possible copy dominates something, materialize our stack up to
- // this point. This ensures every comparison that affects our operation
- // ends up with predicateinfo.
- if (!Result.Def)
- Result.Def = materializeStack(Counter, RenameStack, Op);
-
- LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
- << *VD.U->get() << " in " << *(VD.U->getUser())
- << "\n");
- assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
- "Predicateinfo def should have dominated this use");
- VD.U->set(Result.Def);
- }
- }
-}
-
-PredicateInfoBuilder::ValueInfo &
-PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
- auto OIN = ValueInfoNums.find(Operand);
- if (OIN == ValueInfoNums.end()) {
- // This will grow it
- ValueInfos.resize(ValueInfos.size() + 1);
- // This will use the new size and give us a 0 based number of the info
- auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
- assert(InsertResult.second && "Value info number already existed?");
- return ValueInfos[InsertResult.first->second];
- }
- return ValueInfos[OIN->second];
-}
-
-const PredicateInfoBuilder::ValueInfo &
-PredicateInfoBuilder::getValueInfo(Value *Operand) const {
- auto OINI = ValueInfoNums.lookup(Operand);
- assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
- assert(OINI < ValueInfos.size() &&
- "Value Info Number greater than size of Value Info Table");
- return ValueInfos[OINI];
-}
-
-PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
- AssumptionCache &AC)
- : F(F) {
- PredicateInfoBuilder Builder(*this, F, DT, AC);
- Builder.buildPredicateInfo();
-}
-
-// Remove all declarations we created. The PredicateInfo consumers are
-// responsible for removing the ssa_copy calls created.
-PredicateInfo::~PredicateInfo() {
-  // Collect function pointers in a set first, as SmallSet uses a SmallVector
- // internally and we have to remove the asserting value handles first.
- SmallPtrSet<Function *, 20> FunctionPtrs;
- for (auto &F : CreatedDeclarations)
- FunctionPtrs.insert(&*F);
- CreatedDeclarations.clear();
-
- for (Function *F : FunctionPtrs) {
- assert(F->user_begin() == F->user_end() &&
- "PredicateInfo consumer did not remove all SSA copies.");
- F->eraseFromParent();
- }
-}
-
+ }
+ }
+}
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfoBuilder::processSwitch(
+ SwitchInst *SI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
+ Value *Op = SI->getCondition();
+ if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+ return;
+
+ // Remember how many outgoing edges there are to every successor.
+ SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+ BasicBlock *TargetBlock = SI->getSuccessor(i);
+ ++SwitchEdges[TargetBlock];
+ }
+
+ // Now propagate info for each case value
+ for (auto C : SI->cases()) {
+ BasicBlock *TargetBlock = C.getCaseSuccessor();
+ if (SwitchEdges.lookup(TargetBlock) == 1) {
+ PredicateSwitch *PS = new PredicateSwitch(
+ Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+ addInfoFor(OpsToRename, Op, PS);
+ if (!TargetBlock->getSinglePredecessor())
+ EdgeUsesOnly.insert({BranchBB, TargetBlock});
+ }
+ }
+}
+
+// Build predicate info for our function
+void PredicateInfoBuilder::buildPredicateInfo() {
+ DT.updateDFSNumbers();
+ // Collect operands to rename from all conditional branch terminators, as well
+ // as assume statements.
+ SmallVector<Value *, 8> OpsToRename;
+ for (auto DTN : depth_first(DT.getRootNode())) {
+ BasicBlock *BranchBB = DTN->getBlock();
+ if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+ if (!BI->isConditional())
+ continue;
+ // Can't insert conditional information if they all go to the same place.
+ if (BI->getSuccessor(0) == BI->getSuccessor(1))
+ continue;
+ processBranch(BI, BranchBB, OpsToRename);
+ } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+ processSwitch(SI, BranchBB, OpsToRename);
+ }
+ }
+ for (auto &Assume : AC.assumptions()) {
+ if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+ if (DT.isReachableFromEntry(II->getParent()))
+ processAssume(II, II->getParent(), OpsToRename);
+ }
+ // Now rename all our operations.
+ renameUses(OpsToRename);
+}
+
+// Create an ssa_copy declaration with custom mangling, because
+// Intrinsic::getDeclaration does not handle overloaded unnamed types properly:
+// all unnamed types get mangled to the same string. We use the pointer
+// to the type as name here, as it guarantees unique names for different
+// types and we remove the declarations when destroying PredicateInfo.
+// It is a workaround for PR38117, because solving it in a fully general way is
+// tricky (FIXME).
+static Function *getCopyDeclaration(Module *M, Type *Ty) {
+ std::string Name = "llvm.ssa.copy." + utostr((uintptr_t) Ty);
+ return cast<Function>(
+ M->getOrInsertFunction(Name,
+ getType(M->getContext(), Intrinsic::ssa_copy, Ty))
+ .getCallee());
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR. Return the last operation's value.
+Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
+ // Find the first thing we have to materialize
+ auto RevIter = RenameStack.rbegin();
+ for (; RevIter != RenameStack.rend(); ++RevIter)
+ if (RevIter->Def)
+ break;
+
+ size_t Start = RevIter - RenameStack.rbegin();
+ // The maximum number of things we should be trying to materialize at once
+  // right now is 4, depending on whether we had an assume, a branch, and
+  // whether both used and-of conditions.
+ for (auto RenameIter = RenameStack.end() - Start;
+ RenameIter != RenameStack.end(); ++RenameIter) {
+ auto *Op =
+ RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+ ValueDFS &Result = *RenameIter;
+ auto *ValInfo = Result.PInfo;
+ ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
+ ? OrigOp
+ : (RenameStack.end() - Start - 1)->Def;
+ // For edge predicates, we can just place the operand in the block before
+ // the terminator. For assume, we have to place it right before the assume
+ // to ensure we dominate all of our uses. Always insert right before the
+ // relevant instruction (terminator, assume), so that we insert in proper
+ // order in the case of multiple predicateinfo in the same block.
+ if (isa<PredicateWithEdge>(ValInfo)) {
+ IRBuilder<> B(getBranchTerminator(ValInfo));
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->users().empty())
+ PI.CreatedDeclarations.insert(IF);
+ CallInst *PIC =
+ B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+ PI.PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ } else {
+ auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+ assert(PAssume &&
+ "Should not have gotten here without it being an assume");
+ // Insert the predicate directly after the assume. While it also holds
+ // directly before it, assume(i1 true) is not a useful fact.
+ IRBuilder<> B(PAssume->AssumeInst->getNextNode());
+ Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
+ if (IF->users().empty())
+ PI.CreatedDeclarations.insert(IF);
+ CallInst *PIC = B.CreateCall(IF, Op);
+ PI.PredicateMap.insert({PIC, ValInfo});
+ Result.Def = PIC;
+ }
+ }
+ return RenameStack.back().Def;
+}
+
+// Instead of the standard SSA renaming algorithm, which is O(Number of
+// instructions), and walks the entire dominator tree, we walk only the defs +
+// uses. The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses. This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the dfs scope for that def, and replacing any uses
+// with top of stack if it exists. In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate. Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// promotememtoreg and memoryssa.
+void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
+ ValueDFS_Compare Compare(DT);
+ // Compute liveness, and rename in O(uses) per Op.
+ for (auto *Op : OpsToRename) {
+ LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
+ unsigned Counter = 0;
+ SmallVector<ValueDFS, 16> OrderedUses;
+ const auto &ValueInfo = getValueInfo(Op);
+ // Insert the possible copies into the def/use list.
+ // They will become real copies if we find a real use for them, and never
+ // created otherwise.
+ for (auto &PossibleCopy : ValueInfo.Infos) {
+ ValueDFS VD;
+ // Determine where we are going to place the copy by the copy type.
+      // The predicate info for branches always comes first; it will get
+      // materialized in the split block at the top of the block.
+      // The predicate info for assumes will be somewhere in the middle;
+      // it will get materialized in front of the assume.
+ if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+ VD.LocalNum = LN_Middle;
+ DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+ if (!DomNode)
+ continue;
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+ // If we can only do phi uses, we treat it like it's in the branch
+ // block, and handle it specially. We know that it goes last, and only
+        // dominates phi uses.
+ auto BlockEdge = getBlockEdge(PossibleCopy);
+ if (EdgeUsesOnly.count(BlockEdge)) {
+ VD.LocalNum = LN_Last;
+ auto *DomNode = DT.getNode(BlockEdge.first);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ VD.EdgeOnly = true;
+ OrderedUses.push_back(VD);
+ }
+ } else {
+ // Otherwise, we are in the split block (even though we perform
+ // insertion in the branch block).
+ // Insert a possible copy at the split block and before the branch.
+ VD.LocalNum = LN_First;
+ auto *DomNode = DT.getNode(BlockEdge.second);
+ if (DomNode) {
+ VD.DFSIn = DomNode->getDFSNumIn();
+ VD.DFSOut = DomNode->getDFSNumOut();
+ VD.PInfo = PossibleCopy;
+ OrderedUses.push_back(VD);
+ }
+ }
+ }
+ }
+
+ convertUsesToDFSOrdered(Op, OrderedUses);
+ // Here we require a stable sort because we do not bother to try to
+ // assign an order to the operands the uses represent. Thus, two
+ // uses in the same instruction do not have a strict sort order
+ // currently and will be considered equal. We could get rid of the
+ // stable sort by creating one if we wanted.
+ llvm::stable_sort(OrderedUses, Compare);
+ SmallVector<ValueDFS, 8> RenameStack;
+    // For each use, sorted into DFS order, push values and replace uses with
+    // the top of the stack, which will represent the reaching def.
+ for (auto &VD : OrderedUses) {
+ // We currently do not materialize copy over copy, but we should decide if
+ // we want to.
+ bool PossibleCopy = VD.PInfo != nullptr;
+ if (RenameStack.empty()) {
+ LLVM_DEBUG(dbgs() << "Rename Stack is empty\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+ << RenameStack.back().DFSIn << ","
+ << RenameStack.back().DFSOut << ")\n");
+ }
+
+ LLVM_DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+ << VD.DFSOut << ")\n");
+
+ bool ShouldPush = (VD.Def || PossibleCopy);
+ bool OutOfScope = !stackIsInScope(RenameStack, VD);
+ if (OutOfScope || ShouldPush) {
+ // Sync to our current scope.
+ popStackUntilDFSScope(RenameStack, VD);
+ if (ShouldPush) {
+ RenameStack.push_back(VD);
+ }
+ }
+ // If we get to this point, and the stack is empty we must have a use
+ // with no renaming needed, just skip it.
+ if (RenameStack.empty())
+ continue;
+ // Skip values, only want to rename the uses
+ if (VD.Def || PossibleCopy)
+ continue;
+ if (!DebugCounter::shouldExecute(RenameCounter)) {
+ LLVM_DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+ continue;
+ }
+ ValueDFS &Result = RenameStack.back();
+
+ // If the possible copy dominates something, materialize our stack up to
+ // this point. This ensures every comparison that affects our operation
+ // ends up with predicateinfo.
+ if (!Result.Def)
+ Result.Def = materializeStack(Counter, RenameStack, Op);
+
+ LLVM_DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+ << *VD.U->get() << " in " << *(VD.U->getUser())
+ << "\n");
+ assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+ "Predicateinfo def should have dominated this use");
+ VD.U->set(Result.Def);
+ }
+ }
+}
+
+PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
+ auto OIN = ValueInfoNums.find(Operand);
+ if (OIN == ValueInfoNums.end()) {
+ // This will grow it
+ ValueInfos.resize(ValueInfos.size() + 1);
+ // This will use the new size and give us a 0 based number of the info
+ auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+ assert(InsertResult.second && "Value info number already existed?");
+ return ValueInfos[InsertResult.first->second];
+ }
+ return ValueInfos[OIN->second];
+}
+
+const PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getValueInfo(Value *Operand) const {
+ auto OINI = ValueInfoNums.lookup(Operand);
+ assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+ assert(OINI < ValueInfos.size() &&
+ "Value Info Number greater than size of Value Info Table");
+ return ValueInfos[OINI];
+}
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : F(F) {
+ PredicateInfoBuilder Builder(*this, F, DT, AC);
+ Builder.buildPredicateInfo();
+}
+
+// Remove all declarations we created. The PredicateInfo consumers are
+// responsible for removing the ssa_copy calls created.
+PredicateInfo::~PredicateInfo() {
+ // Collect function pointers in set first, as SmallSet uses a SmallVector
+ // internally and we have to remove the asserting value handles first.
+ SmallPtrSet<Function *, 20> FunctionPtrs;
+ for (auto &F : CreatedDeclarations)
+ FunctionPtrs.insert(&*F);
+ CreatedDeclarations.clear();
+
+ for (Function *F : FunctionPtrs) {
+ assert(F->user_begin() == F->user_end() &&
+ "PredicateInfo consumer did not remove all SSA copies.");
+ F->eraseFromParent();
+ }
+}
+
Optional<PredicateConstraint> PredicateBase::getConstraint() const {
switch (Type) {
case PT_Assume:
@@ -849,117 +849,117 @@ Optional<PredicateConstraint> PredicateBase::getConstraint() const {
llvm_unreachable("Unknown predicate type");
}
-void PredicateInfo::verifyPredicateInfo() const {}
-
-char PredicateInfoPrinterLegacyPass::ID = 0;
-
-PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
- : FunctionPass(ID) {
- initializePredicateInfoPrinterLegacyPassPass(
- *PassRegistry::getPassRegistry());
-}
-
-void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.setPreservesAll();
- AU.addRequiredTransitive<DominatorTreeWrapperPass>();
- AU.addRequired<AssumptionCacheTracker>();
-}
-
-// Replace ssa_copy calls created by PredicateInfo with their operand.
-static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
- for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
- Instruction *Inst = &*I++;
- const auto *PI = PredInfo.getPredicateInfoFor(Inst);
- auto *II = dyn_cast<IntrinsicInst>(Inst);
- if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
- continue;
-
- Inst->replaceAllUsesWith(II->getOperand(0));
- Inst->eraseFromParent();
- }
-}
-
-bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
- PredInfo->print(dbgs());
- if (VerifyPredicateInfo)
- PredInfo->verifyPredicateInfo();
-
- replaceCreatedSSACopys(*PredInfo, F);
- return false;
-}
-
-PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- OS << "PredicateInfo for function: " << F.getName() << "\n";
- auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
- PredInfo->print(OS);
-
- replaceCreatedSSACopys(*PredInfo, F);
- return PreservedAnalyses::all();
-}
-
-/// An assembly annotator class to print PredicateInfo information in
-/// comments.
-class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
- friend class PredicateInfo;
- const PredicateInfo *PredInfo;
-
-public:
- PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
-
- void emitBasicBlockStartAnnot(const BasicBlock *BB,
- formatted_raw_ostream &OS) override {}
-
- void emitInstructionAnnot(const Instruction *I,
- formatted_raw_ostream &OS) override {
- if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
- OS << "; Has predicate info\n";
- if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
- OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
- << " Comparison:" << *PB->Condition << " Edge: [";
- PB->From->printAsOperand(OS);
- OS << ",";
- PB->To->printAsOperand(OS);
- OS << "]";
- } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
- OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
- << " Switch:" << *PS->Switch << " Edge: [";
- PS->From->printAsOperand(OS);
- OS << ",";
- PS->To->printAsOperand(OS);
- OS << "]";
- } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
- OS << "; assume predicate info {"
- << " Comparison:" << *PA->Condition;
- }
- OS << ", RenamedOp: ";
- PI->RenamedOp->printAsOperand(OS, false);
- OS << " }\n";
- }
- }
-};
-
-void PredicateInfo::print(raw_ostream &OS) const {
- PredicateInfoAnnotatedWriter Writer(this);
- F.print(OS, &Writer);
-}
-
-void PredicateInfo::dump() const {
- PredicateInfoAnnotatedWriter Writer(this);
- F.print(dbgs(), &Writer);
-}
-
-PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
-
- return PreservedAnalyses::all();
-}
-}
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+ : FunctionPass(ID) {
+ initializePredicateInfoPrinterLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<AssumptionCacheTracker>();
+}
+
+// Replace ssa_copy calls created by PredicateInfo with their operand.
+static void replaceCreatedSSACopys(PredicateInfo &PredInfo, Function &F) {
+ for (auto I = inst_begin(F), E = inst_end(F); I != E;) {
+ Instruction *Inst = &*I++;
+ const auto *PI = PredInfo.getPredicateInfoFor(Inst);
+ auto *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!PI || !II || II->getIntrinsicID() != Intrinsic::ssa_copy)
+ continue;
+
+ Inst->replaceAllUsesWith(II->getOperand(0));
+ Inst->eraseFromParent();
+ }
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(dbgs());
+ if (VerifyPredicateInfo)
+ PredInfo->verifyPredicateInfo();
+
+ replaceCreatedSSACopys(*PredInfo, F);
+ return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ OS << "PredicateInfo for function: " << F.getName() << "\n";
+ auto PredInfo = std::make_unique<PredicateInfo>(F, DT, AC);
+ PredInfo->print(OS);
+
+ replaceCreatedSSACopys(*PredInfo, F);
+ return PreservedAnalyses::all();
+}
+
+/// An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+ friend class PredicateInfo;
+ const PredicateInfo *PredInfo;
+
+public:
+ PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+ void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) override {}
+
+ void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override {
+ if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+ OS << "; Has predicate info\n";
+ if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+ OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+ << " Comparison:" << *PB->Condition << " Edge: [";
+ PB->From->printAsOperand(OS);
+ OS << ",";
+ PB->To->printAsOperand(OS);
+ OS << "]";
+ } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+ OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+ << " Switch:" << *PS->Switch << " Edge: [";
+ PS->From->printAsOperand(OS);
+ OS << ",";
+ PS->To->printAsOperand(OS);
+ OS << "]";
+ } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+ OS << "; assume predicate info {"
+ << " Comparison:" << *PA->Condition;
+ }
+ OS << ", RenamedOp: ";
+ PI->RenamedOp->printAsOperand(OS, false);
+ OS << " }\n";
+ }
+ }
+};
+
+void PredicateInfo::print(raw_ostream &OS) const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+ PredicateInfoAnnotatedWriter Writer(this);
+ F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ std::make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+ return PreservedAnalyses::all();
+}
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index da5405593b..86bbb6a889 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -1,428 +1,428 @@
-//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file promotes memory references to be register references. It promotes
-// alloca instructions which only have loads and stores as uses. An alloca is
-// transformed by using iterated dominator frontiers to place PHI nodes, then
-// traversing the function in depth-first order to rewrite loads and stores as
-// appropriate.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DIBuilder.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "mem2reg"
-
-STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
-STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
-STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
-STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
-
-bool llvm::isAllocaPromotable(const AllocaInst *AI) {
- // Only allow direct and non-volatile loads and stores...
- for (const User *U : AI->users()) {
- if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
- // Note that atomic loads can be transformed; atomic semantics do
- // not have any meaning for a local alloca.
- if (LI->isVolatile())
- return false;
- } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
- if (SI->getOperand(0) == AI)
- return false; // Don't allow a store OF the AI, only INTO the AI.
- // Note that atomic stores can be transformed; atomic semantics do
- // not have any meaning for a local alloca.
- if (SI->isVolatile())
- return false;
- } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
+//===- PromoteMemoryToRegister.cpp - Convert allocas to registers ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file promotes memory references to be register references. It promotes
+// alloca instructions which only have loads and stores as uses. An alloca is
+// transformed by using iterated dominator frontiers to place PHI nodes, then
+// traversing the function in depth-first order to rewrite loads and stores as
+// appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mem2reg"
+
+STATISTIC(NumLocalPromoted, "Number of alloca's promoted within one block");
+STATISTIC(NumSingleStore, "Number of alloca's promoted with a single store");
+STATISTIC(NumDeadAlloca, "Number of dead alloca's removed");
+STATISTIC(NumPHIInsert, "Number of PHI nodes inserted");
+
+bool llvm::isAllocaPromotable(const AllocaInst *AI) {
+ // Only allow direct and non-volatile loads and stores...
+ for (const User *U : AI->users()) {
+ if (const LoadInst *LI = dyn_cast<LoadInst>(U)) {
+ // Note that atomic loads can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (LI->isVolatile())
+ return false;
+ } else if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (SI->getOperand(0) == AI)
+ return false; // Don't allow a store OF the AI, only INTO the AI.
+ // Note that atomic stores can be transformed; atomic semantics do
+ // not have any meaning for a local alloca.
+ if (SI->isVolatile())
+ return false;
+ } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
if (!II->isLifetimeStartOrEnd() && !II->isDroppable())
- return false;
- } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
+ return false;
+ } else if (const BitCastInst *BCI = dyn_cast<BitCastInst>(U)) {
if (!onlyUsedByLifetimeMarkersOrDroppableInsts(BCI))
- return false;
- } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
- if (!GEPI->hasAllZeroIndices())
- return false;
+ return false;
+ } else if (const GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(U)) {
+ if (!GEPI->hasAllZeroIndices())
+ return false;
if (!onlyUsedByLifetimeMarkersOrDroppableInsts(GEPI))
- return false;
+ return false;
} else if (const AddrSpaceCastInst *ASCI = dyn_cast<AddrSpaceCastInst>(U)) {
if (!onlyUsedByLifetimeMarkers(ASCI))
return false;
- } else {
- return false;
- }
- }
-
- return true;
-}
-
-namespace {
-
-struct AllocaInfo {
+ } else {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+namespace {
+
+struct AllocaInfo {
using DbgUserVec = SmallVector<DbgVariableIntrinsic *, 1>;
- SmallVector<BasicBlock *, 32> DefiningBlocks;
- SmallVector<BasicBlock *, 32> UsingBlocks;
-
- StoreInst *OnlyStore;
- BasicBlock *OnlyBlock;
- bool OnlyUsedInOneBlock;
-
+ SmallVector<BasicBlock *, 32> DefiningBlocks;
+ SmallVector<BasicBlock *, 32> UsingBlocks;
+
+ StoreInst *OnlyStore;
+ BasicBlock *OnlyBlock;
+ bool OnlyUsedInOneBlock;
+
DbgUserVec DbgUsers;
-
- void clear() {
- DefiningBlocks.clear();
- UsingBlocks.clear();
- OnlyStore = nullptr;
- OnlyBlock = nullptr;
- OnlyUsedInOneBlock = true;
+
+ void clear() {
+ DefiningBlocks.clear();
+ UsingBlocks.clear();
+ OnlyStore = nullptr;
+ OnlyBlock = nullptr;
+ OnlyUsedInOneBlock = true;
DbgUsers.clear();
- }
-
- /// Scan the uses of the specified alloca, filling in the AllocaInfo used
- /// by the rest of the pass to reason about the uses of this alloca.
- void AnalyzeAlloca(AllocaInst *AI) {
- clear();
-
- // As we scan the uses of the alloca instruction, keep track of stores,
- // and decide whether all of the loads and stores to the alloca are within
- // the same basic block.
+ }
+
+ /// Scan the uses of the specified alloca, filling in the AllocaInfo used
+ /// by the rest of the pass to reason about the uses of this alloca.
+ void AnalyzeAlloca(AllocaInst *AI) {
+ clear();
+
+ // As we scan the uses of the alloca instruction, keep track of stores,
+ // and decide whether all of the loads and stores to the alloca are within
+ // the same basic block.
for (User *U : AI->users()) {
Instruction *User = cast<Instruction>(U);
-
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- // Remember the basic blocks which define new values for the alloca
- DefiningBlocks.push_back(SI->getParent());
- OnlyStore = SI;
- } else {
- LoadInst *LI = cast<LoadInst>(User);
- // Otherwise it must be a load instruction, keep track of variable
- // reads.
- UsingBlocks.push_back(LI->getParent());
- }
-
- if (OnlyUsedInOneBlock) {
- if (!OnlyBlock)
- OnlyBlock = User->getParent();
- else if (OnlyBlock != User->getParent())
- OnlyUsedInOneBlock = false;
- }
- }
-
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ // Remember the basic blocks which define new values for the alloca
+ DefiningBlocks.push_back(SI->getParent());
+ OnlyStore = SI;
+ } else {
+ LoadInst *LI = cast<LoadInst>(User);
+ // Otherwise it must be a load instruction, keep track of variable
+ // reads.
+ UsingBlocks.push_back(LI->getParent());
+ }
+
+ if (OnlyUsedInOneBlock) {
+ if (!OnlyBlock)
+ OnlyBlock = User->getParent();
+ else if (OnlyBlock != User->getParent())
+ OnlyUsedInOneBlock = false;
+ }
+ }
+
findDbgUsers(DbgUsers, AI);
- }
-};
-
-/// Data package used by RenamePass().
-struct RenamePassData {
- using ValVector = std::vector<Value *>;
- using LocationVector = std::vector<DebugLoc>;
-
- RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
- : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
-
- BasicBlock *BB;
- BasicBlock *Pred;
- ValVector Values;
- LocationVector Locations;
-};
-
-/// This assigns and keeps a per-bb relative ordering of load/store
-/// instructions in the block that directly load or store an alloca.
-///
-/// This functionality is important because it avoids scanning large basic
-/// blocks multiple times when promoting many allocas in the same block.
-class LargeBlockInfo {
- /// For each instruction that we track, keep the index of the
- /// instruction.
- ///
- /// The index starts out as the number of the instruction from the start of
- /// the block.
- DenseMap<const Instruction *, unsigned> InstNumbers;
-
-public:
-
- /// This code only looks at accesses to allocas.
- static bool isInterestingInstruction(const Instruction *I) {
- return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
- (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
- }
-
- /// Get or calculate the index of the specified instruction.
- unsigned getInstructionIndex(const Instruction *I) {
- assert(isInterestingInstruction(I) &&
- "Not a load/store to/from an alloca?");
-
- // If we already have this instruction number, return it.
- DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
- if (It != InstNumbers.end())
- return It->second;
-
- // Scan the whole block to get the instruction. This accumulates
- // information for every interesting instruction in the block, in order to
-    // avoid gratuitous rescans.
- const BasicBlock *BB = I->getParent();
- unsigned InstNo = 0;
- for (const Instruction &BBI : *BB)
- if (isInterestingInstruction(&BBI))
- InstNumbers[&BBI] = InstNo++;
- It = InstNumbers.find(I);
-
- assert(It != InstNumbers.end() && "Didn't insert instruction?");
- return It->second;
- }
-
- void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
-
- void clear() { InstNumbers.clear(); }
-};
-
-struct PromoteMem2Reg {
- /// The alloca instructions being promoted.
- std::vector<AllocaInst *> Allocas;
-
- DominatorTree &DT;
- DIBuilder DIB;
-
- /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
- AssumptionCache *AC;
-
- const SimplifyQuery SQ;
-
- /// Reverse mapping of Allocas.
- DenseMap<AllocaInst *, unsigned> AllocaLookup;
-
- /// The PhiNodes we're adding.
- ///
- /// That map is used to simplify some Phi nodes as we iterate over it, so
- /// it should have deterministic iterators. We could use a MapVector, but
- /// since we already maintain a map from BasicBlock* to a stable numbering
- /// (BBNumbers), the DenseMap is more efficient (also supports removal).
- DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
-
- /// For each PHI node, keep track of which entry in Allocas it corresponds
- /// to.
- DenseMap<PHINode *, unsigned> PhiToAllocaMap;
-
- /// For each alloca, we keep track of the dbg.declare intrinsic that
- /// describes it, if any, so that we can convert it to a dbg.value
- /// intrinsic if the alloca gets promoted.
+ }
+};
+
+/// Data package used by RenamePass().
+struct RenamePassData {
+ using ValVector = std::vector<Value *>;
+ using LocationVector = std::vector<DebugLoc>;
+
+ RenamePassData(BasicBlock *B, BasicBlock *P, ValVector V, LocationVector L)
+ : BB(B), Pred(P), Values(std::move(V)), Locations(std::move(L)) {}
+
+ BasicBlock *BB;
+ BasicBlock *Pred;
+ ValVector Values;
+ LocationVector Locations;
+};
+
+/// This assigns and keeps a per-bb relative ordering of load/store
+/// instructions in the block that directly load or store an alloca.
+///
+/// This functionality is important because it avoids scanning large basic
+/// blocks multiple times when promoting many allocas in the same block.
+class LargeBlockInfo {
+ /// For each instruction that we track, keep the index of the
+ /// instruction.
+ ///
+ /// The index starts out as the number of the instruction from the start of
+ /// the block.
+ DenseMap<const Instruction *, unsigned> InstNumbers;
+
+public:
+
+ /// This code only looks at accesses to allocas.
+ static bool isInterestingInstruction(const Instruction *I) {
+ return (isa<LoadInst>(I) && isa<AllocaInst>(I->getOperand(0))) ||
+ (isa<StoreInst>(I) && isa<AllocaInst>(I->getOperand(1)));
+ }
+
+ /// Get or calculate the index of the specified instruction.
+ unsigned getInstructionIndex(const Instruction *I) {
+ assert(isInterestingInstruction(I) &&
+ "Not a load/store to/from an alloca?");
+
+ // If we already have this instruction number, return it.
+ DenseMap<const Instruction *, unsigned>::iterator It = InstNumbers.find(I);
+ if (It != InstNumbers.end())
+ return It->second;
+
+ // Scan the whole block to get the instruction. This accumulates
+ // information for every interesting instruction in the block, in order to
+    // avoid gratuitous rescans.
+ const BasicBlock *BB = I->getParent();
+ unsigned InstNo = 0;
+ for (const Instruction &BBI : *BB)
+ if (isInterestingInstruction(&BBI))
+ InstNumbers[&BBI] = InstNo++;
+ It = InstNumbers.find(I);
+
+ assert(It != InstNumbers.end() && "Didn't insert instruction?");
+ return It->second;
+ }
+
+ void deleteValue(const Instruction *I) { InstNumbers.erase(I); }
+
+ void clear() { InstNumbers.clear(); }
+};
+
+struct PromoteMem2Reg {
+ /// The alloca instructions being promoted.
+ std::vector<AllocaInst *> Allocas;
+
+ DominatorTree &DT;
+ DIBuilder DIB;
+
+ /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
+ AssumptionCache *AC;
+
+ const SimplifyQuery SQ;
+
+ /// Reverse mapping of Allocas.
+ DenseMap<AllocaInst *, unsigned> AllocaLookup;
+
+ /// The PhiNodes we're adding.
+ ///
+ /// That map is used to simplify some Phi nodes as we iterate over it, so
+ /// it should have deterministic iterators. We could use a MapVector, but
+ /// since we already maintain a map from BasicBlock* to a stable numbering
+ /// (BBNumbers), the DenseMap is more efficient (also supports removal).
+ DenseMap<std::pair<unsigned, unsigned>, PHINode *> NewPhiNodes;
+
+ /// For each PHI node, keep track of which entry in Allocas it corresponds
+ /// to.
+ DenseMap<PHINode *, unsigned> PhiToAllocaMap;
+
+ /// For each alloca, we keep track of the dbg.declare intrinsic that
+ /// describes it, if any, so that we can convert it to a dbg.value
+ /// intrinsic if the alloca gets promoted.
SmallVector<AllocaInfo::DbgUserVec, 8> AllocaDbgUsers;
-
- /// The set of basic blocks the renamer has already visited.
- SmallPtrSet<BasicBlock *, 16> Visited;
-
-  /// Contains a stable numbering of basic blocks to avoid non-deterministic
- /// behavior.
- DenseMap<BasicBlock *, unsigned> BBNumbers;
-
- /// Lazily compute the number of predecessors a block has.
- DenseMap<const BasicBlock *, unsigned> BBNumPreds;
-
-public:
- PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AssumptionCache *AC)
- : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
- DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
- AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
- nullptr, &DT, AC) {}
-
- void run();
-
-private:
- void RemoveFromAllocasList(unsigned &AllocaIdx) {
- Allocas[AllocaIdx] = Allocas.back();
- Allocas.pop_back();
- --AllocaIdx;
- }
-
- unsigned getNumPreds(const BasicBlock *BB) {
- unsigned &NP = BBNumPreds[BB];
- if (NP == 0)
- NP = pred_size(BB) + 1;
- return NP - 1;
- }
-
- void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
- void RenamePass(BasicBlock *BB, BasicBlock *Pred,
- RenamePassData::ValVector &IncVals,
- RenamePassData::LocationVector &IncLocs,
- std::vector<RenamePassData> &Worklist);
- bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
-};
-
-} // end anonymous namespace
-
-/// Given a LoadInst LI this adds assume(LI != null) after it.
-static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
- Function *AssumeIntrinsic =
- Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
- ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
- Constant::getNullValue(LI->getType()));
- LoadNotNull->insertAfter(LI);
- CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
- CI->insertAfter(LoadNotNull);
- AC->registerAssumption(CI);
-}
-
+
+ /// The set of basic blocks the renamer has already visited.
+ SmallPtrSet<BasicBlock *, 16> Visited;
+
+  /// Contains a stable numbering of basic blocks to avoid non-deterministic
+ /// behavior.
+ DenseMap<BasicBlock *, unsigned> BBNumbers;
+
+ /// Lazily compute the number of predecessors a block has.
+ DenseMap<const BasicBlock *, unsigned> BBNumPreds;
+
+public:
+ PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC)
+ : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
+ DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
+ AC(AC), SQ(DT.getRoot()->getParent()->getParent()->getDataLayout(),
+ nullptr, &DT, AC) {}
+
+ void run();
+
+private:
+ void RemoveFromAllocasList(unsigned &AllocaIdx) {
+ Allocas[AllocaIdx] = Allocas.back();
+ Allocas.pop_back();
+ --AllocaIdx;
+ }
+
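+  // BBNumPreds caches the predecessor count with a +1 bias so that the
+  // DenseMap's default value of 0 can mean "not computed yet", even for
+  // blocks that really have zero predecessors.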
+ unsigned getNumPreds(const BasicBlock *BB) {
+ unsigned &NP = BBNumPreds[BB];
+ if (NP == 0)
+ NP = pred_size(BB) + 1;
+ return NP - 1;
+ }
+
+ void ComputeLiveInBlocks(AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks);
+ void RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncVals,
+ RenamePassData::LocationVector &IncLocs,
+ std::vector<RenamePassData> &Worklist);
+ bool QueuePhiNode(BasicBlock *BB, unsigned AllocaIdx, unsigned &Version);
+};
+
+} // end anonymous namespace
+
+/// Given a LoadInst LI this adds assume(LI != null) after it.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+ Function *AssumeIntrinsic =
+ Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+ ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+ Constant::getNullValue(LI->getType()));
+ LoadNotNull->insertAfter(LI);
+ CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+ CI->insertAfter(LoadNotNull);
+ AC->registerAssumption(CI);
+}
+
static void removeIntrinsicUsers(AllocaInst *AI) {
- // Knowing that this alloca is promotable, we know that it's safe to kill all
- // instructions except for load and store.
-
+ // Knowing that this alloca is promotable, we know that it's safe to kill all
+ // instructions except for load and store.
+
for (auto UI = AI->use_begin(), UE = AI->use_end(); UI != UE;) {
Instruction *I = cast<Instruction>(UI->getUser());
Use &U = *UI;
- ++UI;
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- continue;
-
+ ++UI;
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ continue;
+
// Drop the use of AI in droppable instructions.
if (I->isDroppable()) {
I->dropDroppableUse(U);
continue;
}
- if (!I->getType()->isVoidTy()) {
- // The only users of this bitcast/GEP instruction are lifetime intrinsics.
- // Follow the use/def chain to erase them now instead of leaving it for
- // dead code elimination later.
+ if (!I->getType()->isVoidTy()) {
+ // The only users of this bitcast/GEP instruction are lifetime intrinsics.
+ // Follow the use/def chain to erase them now instead of leaving it for
+ // dead code elimination later.
for (auto UUI = I->use_begin(), UUE = I->use_end(); UUI != UUE;) {
Instruction *Inst = cast<Instruction>(UUI->getUser());
Use &UU = *UUI;
- ++UUI;
+ ++UUI;
// Drop the use of I in droppable instructions.
if (Inst->isDroppable()) {
Inst->dropDroppableUse(UU);
continue;
}
- Inst->eraseFromParent();
- }
- }
- I->eraseFromParent();
- }
-}
-
-/// Rewrite as many loads as possible given a single store.
-///
-/// When there is only a single store, we can use the domtree to trivially
-/// replace all of the dominated loads with the stored value. Do so, and return
-/// true if this has successfully promoted the alloca entirely. If this returns
-/// false there were some loads which were not dominated by the single store
-/// and thus must be phi-ed with undef. We fall back to the standard alloca
-/// promotion algorithm in that case.
-static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
- LargeBlockInfo &LBI, const DataLayout &DL,
- DominatorTree &DT, AssumptionCache *AC) {
- StoreInst *OnlyStore = Info.OnlyStore;
- bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
- BasicBlock *StoreBB = OnlyStore->getParent();
- int StoreIndex = -1;
-
- // Clear out UsingBlocks. We will reconstruct it here if needed.
- Info.UsingBlocks.clear();
-
+ Inst->eraseFromParent();
+ }
+ }
+ I->eraseFromParent();
+ }
+}
+
+/// Rewrite as many loads as possible given a single store.
+///
+/// When there is only a single store, we can use the domtree to trivially
+/// replace all of the dominated loads with the stored value. Do so, and return
+/// true if this has successfully promoted the alloca entirely. If this returns
+/// false there were some loads which were not dominated by the single store
+/// and thus must be phi-ed with undef. We fall back to the standard alloca
+/// promotion algorithm in that case.
+static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
+ LargeBlockInfo &LBI, const DataLayout &DL,
+ DominatorTree &DT, AssumptionCache *AC) {
+ StoreInst *OnlyStore = Info.OnlyStore;
+ bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
+ BasicBlock *StoreBB = OnlyStore->getParent();
+ int StoreIndex = -1;
+
+ // Clear out UsingBlocks. We will reconstruct it here if needed.
+ Info.UsingBlocks.clear();
+
for (User *U : make_early_inc_range(AI->users())) {
Instruction *UserInst = cast<Instruction>(U);
- if (UserInst == OnlyStore)
- continue;
- LoadInst *LI = cast<LoadInst>(UserInst);
-
- // Okay, if we have a load from the alloca, we want to replace it with the
- // only value stored to the alloca. We can do this if the value is
- // dominated by the store. If not, we use the rest of the mem2reg machinery
- // to insert the phi nodes as needed.
- if (!StoringGlobalVal) { // Non-instructions are always dominated.
- if (LI->getParent() == StoreBB) {
- // If we have a use that is in the same block as the store, compare the
- // indices of the two instructions to see which one came first. If the
- // load came before the store, we can't handle it.
- if (StoreIndex == -1)
- StoreIndex = LBI.getInstructionIndex(OnlyStore);
-
- if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
- // Can't handle this load, bail out.
- Info.UsingBlocks.push_back(StoreBB);
- continue;
- }
- } else if (!DT.dominates(StoreBB, LI->getParent())) {
- // If the load and store are in different blocks, use BB dominance to
- // check their relationships. If the store doesn't dom the use, bail
- // out.
- Info.UsingBlocks.push_back(LI->getParent());
- continue;
- }
- }
-
- // Otherwise, we *can* safely rewrite this load.
- Value *ReplVal = OnlyStore->getOperand(0);
- // If the replacement value is the load, this must occur in unreachable
- // code.
- if (ReplVal == LI)
- ReplVal = UndefValue::get(LI->getType());
-
- // If the load was marked as nonnull we don't want to lose
- // that information when we erase this Load. So we preserve
- // it with an assume.
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- LI->replaceAllUsesWith(ReplVal);
- LI->eraseFromParent();
- LBI.deleteValue(LI);
- }
-
- // Finally, after the scan, check to see if the store is all that is left.
- if (!Info.UsingBlocks.empty())
- return false; // If not, we'll have to fall back for the remainder.
-
- // Record debuginfo for the store and remove the declaration's
- // debuginfo.
+ if (UserInst == OnlyStore)
+ continue;
+ LoadInst *LI = cast<LoadInst>(UserInst);
+
+ // Okay, if we have a load from the alloca, we want to replace it with the
+ // only value stored to the alloca. We can do this if the value is
+ // dominated by the store. If not, we use the rest of the mem2reg machinery
+ // to insert the phi nodes as needed.
+ if (!StoringGlobalVal) { // Non-instructions are always dominated.
+ if (LI->getParent() == StoreBB) {
+ // If we have a use that is in the same block as the store, compare the
+ // indices of the two instructions to see which one came first. If the
+ // load came before the store, we can't handle it.
+ if (StoreIndex == -1)
+ StoreIndex = LBI.getInstructionIndex(OnlyStore);
+
+ if (unsigned(StoreIndex) > LBI.getInstructionIndex(LI)) {
+ // Can't handle this load, bail out.
+ Info.UsingBlocks.push_back(StoreBB);
+ continue;
+ }
+ } else if (!DT.dominates(StoreBB, LI->getParent())) {
+ // If the load and store are in different blocks, use BB dominance to
+ // check their relationships. If the store doesn't dom the use, bail
+ // out.
+ Info.UsingBlocks.push_back(LI->getParent());
+ continue;
+ }
+ }
+
+ // Otherwise, we *can* safely rewrite this load.
+ Value *ReplVal = OnlyStore->getOperand(0);
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ LI->replaceAllUsesWith(ReplVal);
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Finally, after the scan, check to see if the store is all that is left.
+ if (!Info.UsingBlocks.empty())
+ return false; // If not, we'll have to fall back for the remainder.
+
+ // Record debuginfo for the store and remove the declaration's
+ // debuginfo.
for (DbgVariableIntrinsic *DII : Info.DbgUsers) {
if (DII->isAddressOfVariable()) {
DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
@@ -431,594 +431,594 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
} else if (DII->getExpression()->startsWithDeref()) {
DII->eraseFromParent();
}
- }
- // Remove the (now dead) store and alloca.
- Info.OnlyStore->eraseFromParent();
- LBI.deleteValue(Info.OnlyStore);
-
- AI->eraseFromParent();
- return true;
-}
-
-/// Many allocas are only used within a single basic block. If this is the
-/// case, avoid traversing the CFG and inserting a lot of potentially useless
-/// PHI nodes by just performing a single linear pass over the basic block
-/// using the Alloca.
-///
-/// If we cannot promote this alloca (because it is read before it is written),
-/// return false. This is necessary in cases where, due to control flow, the
-/// alloca is undefined only on some control flow paths. e.g. code like
-/// this is correct in LLVM IR:
-/// // A is an alloca with no stores so far
-/// for (...) {
-/// int t = *A;
-/// if (!first_iteration)
-/// use(t);
-/// *A = 42;
-/// }
-static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
- LargeBlockInfo &LBI,
- const DataLayout &DL,
- DominatorTree &DT,
- AssumptionCache *AC) {
- // The trickiest case to handle is when we have large blocks. Because of this,
- // this code is optimized assuming that large blocks happen. This does not
- // significantly pessimize the small block case. This uses LargeBlockInfo to
- // make it efficient to get the index of various operations in the block.
-
- // Walk the use-def list of the alloca, getting the locations of all stores.
- using StoresByIndexTy = SmallVector<std::pair<unsigned, StoreInst *>, 64>;
- StoresByIndexTy StoresByIndex;
-
- for (User *U : AI->users())
- if (StoreInst *SI = dyn_cast<StoreInst>(U))
- StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
-
- // Sort the stores by their index, making it efficient to do a lookup with a
- // binary search.
- llvm::sort(StoresByIndex, less_first());
-
- // Walk all of the loads from this alloca, replacing them with the nearest
- // store above them, if any.
+ }
+ // Remove the (now dead) store and alloca.
+ Info.OnlyStore->eraseFromParent();
+ LBI.deleteValue(Info.OnlyStore);
+
+ AI->eraseFromParent();
+ return true;
+}
+
+/// Many allocas are only used within a single basic block. If this is the
+/// case, avoid traversing the CFG and inserting a lot of potentially useless
+/// PHI nodes by just performing a single linear pass over the basic block
+/// using the Alloca.
+///
+/// If we cannot promote this alloca (because it is read before it is written),
+/// return false. This is necessary in cases where, due to control flow, the
+/// alloca is undefined only on some control flow paths. e.g. code like
+/// this is correct in LLVM IR:
+/// // A is an alloca with no stores so far
+/// for (...) {
+/// int t = *A;
+/// if (!first_iteration)
+/// use(t);
+/// *A = 42;
+/// }
+static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
+ LargeBlockInfo &LBI,
+ const DataLayout &DL,
+ DominatorTree &DT,
+ AssumptionCache *AC) {
+ // The trickiest case to handle is when we have large blocks. Because of this,
+ // this code is optimized assuming that large blocks happen. This does not
+ // significantly pessimize the small block case. This uses LargeBlockInfo to
+ // make it efficient to get the index of various operations in the block.
+
+ // Walk the use-def list of the alloca, getting the locations of all stores.
+ using StoresByIndexTy = SmallVector<std::pair<unsigned, StoreInst *>, 64>;
+ StoresByIndexTy StoresByIndex;
+
+ for (User *U : AI->users())
+ if (StoreInst *SI = dyn_cast<StoreInst>(U))
+ StoresByIndex.push_back(std::make_pair(LBI.getInstructionIndex(SI), SI));
+
+ // Sort the stores by their index, making it efficient to do a lookup with a
+ // binary search.
+ llvm::sort(StoresByIndex, less_first());
+
+ // Walk all of the loads from this alloca, replacing them with the nearest
+ // store above them, if any.
for (User *U : make_early_inc_range(AI->users())) {
LoadInst *LI = dyn_cast<LoadInst>(U);
- if (!LI)
- continue;
-
- unsigned LoadIdx = LBI.getInstructionIndex(LI);
-
- // Find the nearest store that has a lower index than this load.
- StoresByIndexTy::iterator I = llvm::lower_bound(
- StoresByIndex,
- std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)),
- less_first());
- if (I == StoresByIndex.begin()) {
- if (StoresByIndex.empty())
- // If there are no stores, the load takes the undef value.
- LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
- else
- // There is no store before this load, bail out (load may be affected
- // by the following stores - see main comment).
- return false;
- } else {
- // Otherwise, there was a store before this load, the load takes its value.
- // Note, if the load was marked as nonnull we don't want to lose that
- // information when we erase it. So we preserve it with an assume.
- Value *ReplVal = std::prev(I)->second->getOperand(0);
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- // If the replacement value is the load, this must occur in unreachable
- // code.
- if (ReplVal == LI)
- ReplVal = UndefValue::get(LI->getType());
-
- LI->replaceAllUsesWith(ReplVal);
- }
-
- LI->eraseFromParent();
- LBI.deleteValue(LI);
- }
-
- // Remove the (now dead) stores and alloca.
- while (!AI->use_empty()) {
- StoreInst *SI = cast<StoreInst>(AI->user_back());
- // Record debuginfo for the store before removing it.
+ if (!LI)
+ continue;
+
+ unsigned LoadIdx = LBI.getInstructionIndex(LI);
+
+ // Find the nearest store that has a lower index than this load.
+ StoresByIndexTy::iterator I = llvm::lower_bound(
+ StoresByIndex,
+ std::make_pair(LoadIdx, static_cast<StoreInst *>(nullptr)),
+ less_first());
+ if (I == StoresByIndex.begin()) {
+ if (StoresByIndex.empty())
+ // If there are no stores, the load takes the undef value.
+ LI->replaceAllUsesWith(UndefValue::get(LI->getType()));
+ else
+ // There is no store before this load, bail out (load may be affected
+ // by the following stores - see main comment).
+ return false;
+ } else {
+ // Otherwise, there was a store before this load, the load takes its value.
+ // Note, if the load was marked as nonnull we don't want to lose that
+ // information when we erase it. So we preserve it with an assume.
+ Value *ReplVal = std::prev(I)->second->getOperand(0);
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(ReplVal, DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ // If the replacement value is the load, this must occur in unreachable
+ // code.
+ if (ReplVal == LI)
+ ReplVal = UndefValue::get(LI->getType());
+
+ LI->replaceAllUsesWith(ReplVal);
+ }
+
+ LI->eraseFromParent();
+ LBI.deleteValue(LI);
+ }
+
+ // Remove the (now dead) stores and alloca.
+ while (!AI->use_empty()) {
+ StoreInst *SI = cast<StoreInst>(AI->user_back());
+ // Record debuginfo for the store before removing it.
for (DbgVariableIntrinsic *DII : Info.DbgUsers) {
if (DII->isAddressOfVariable()) {
DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false);
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
}
- }
- SI->eraseFromParent();
- LBI.deleteValue(SI);
- }
-
- AI->eraseFromParent();
-
- // The alloca's debuginfo can be removed as well.
+ }
+ SI->eraseFromParent();
+ LBI.deleteValue(SI);
+ }
+
+ AI->eraseFromParent();
+
+ // The alloca's debuginfo can be removed as well.
for (DbgVariableIntrinsic *DII : Info.DbgUsers)
if (DII->isAddressOfVariable() || DII->getExpression()->startsWithDeref())
DII->eraseFromParent();
-
- ++NumLocalPromoted;
- return true;
-}
-
-void PromoteMem2Reg::run() {
- Function &F = *DT.getRoot()->getParent();
-
+
+ ++NumLocalPromoted;
+ return true;
+}
+
+void PromoteMem2Reg::run() {
+ Function &F = *DT.getRoot()->getParent();
+
AllocaDbgUsers.resize(Allocas.size());
-
- AllocaInfo Info;
- LargeBlockInfo LBI;
- ForwardIDFCalculator IDF(DT);
-
- for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
- AllocaInst *AI = Allocas[AllocaNum];
-
- assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
- assert(AI->getParent()->getParent() == &F &&
- "All allocas should be in the same function, which is same as DF!");
-
+
+ AllocaInfo Info;
+ LargeBlockInfo LBI;
+ ForwardIDFCalculator IDF(DT);
+
+ for (unsigned AllocaNum = 0; AllocaNum != Allocas.size(); ++AllocaNum) {
+ AllocaInst *AI = Allocas[AllocaNum];
+
+ assert(isAllocaPromotable(AI) && "Cannot promote non-promotable alloca!");
+ assert(AI->getParent()->getParent() == &F &&
+ "All allocas should be in the same function, which is same as DF!");
+
removeIntrinsicUsers(AI);
-
- if (AI->use_empty()) {
- // If there are no uses of the alloca, just delete it now.
- AI->eraseFromParent();
-
- // Remove the alloca from the Allocas list, since it has been processed
- RemoveFromAllocasList(AllocaNum);
- ++NumDeadAlloca;
- continue;
- }
-
- // Calculate the set of read and write-locations for each alloca. This is
- // analogous to finding the 'uses' and 'definitions' of each variable.
- Info.AnalyzeAlloca(AI);
-
- // If there is only a single store to this value, replace any loads of
- // it that are directly dominated by the definition with the value stored.
- if (Info.DefiningBlocks.size() == 1) {
- if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
- // The alloca has been processed, move on.
- RemoveFromAllocasList(AllocaNum);
- ++NumSingleStore;
- continue;
- }
- }
-
- // If the alloca is only read and written in one basic block, just perform a
- // linear sweep over the block to eliminate it.
- if (Info.OnlyUsedInOneBlock &&
- promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
- // The alloca has been processed, move on.
- RemoveFromAllocasList(AllocaNum);
- continue;
- }
-
- // If we haven't computed a numbering for the BB's in the function, do so
- // now.
- if (BBNumbers.empty()) {
- unsigned ID = 0;
- for (auto &BB : F)
- BBNumbers[&BB] = ID++;
- }
-
- // Remember the dbg.declare intrinsic describing this alloca, if any.
+
+ if (AI->use_empty()) {
+ // If there are no uses of the alloca, just delete it now.
+ AI->eraseFromParent();
+
+ // Remove the alloca from the Allocas list, since it has been processed
+ RemoveFromAllocasList(AllocaNum);
+ ++NumDeadAlloca;
+ continue;
+ }
+
+ // Calculate the set of read and write-locations for each alloca. This is
+ // analogous to finding the 'uses' and 'definitions' of each variable.
+ Info.AnalyzeAlloca(AI);
+
+ // If there is only a single store to this value, replace any loads of
+ // it that are directly dominated by the definition with the value stored.
+ if (Info.DefiningBlocks.size() == 1) {
+ if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ ++NumSingleStore;
+ continue;
+ }
+ }
+
+ // If the alloca is only read and written in one basic block, just perform a
+ // linear sweep over the block to eliminate it.
+ if (Info.OnlyUsedInOneBlock &&
+ promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC)) {
+ // The alloca has been processed, move on.
+ RemoveFromAllocasList(AllocaNum);
+ continue;
+ }
+
+ // If we haven't computed a numbering for the BB's in the function, do so
+ // now.
+ if (BBNumbers.empty()) {
+ unsigned ID = 0;
+ for (auto &BB : F)
+ BBNumbers[&BB] = ID++;
+ }
+
+ // Remember the dbg.declare intrinsic describing this alloca, if any.
if (!Info.DbgUsers.empty())
AllocaDbgUsers[AllocaNum] = Info.DbgUsers;
-
- // Keep the reverse mapping of the 'Allocas' array for the rename pass.
- AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
-
- // Unique the set of defining blocks for efficient lookup.
- SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
- Info.DefiningBlocks.end());
-
- // Determine which blocks the value is live in. These are blocks which lead
- // to uses.
- SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
- ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
-
- // At this point, we're committed to promoting the alloca using IDF's, and
- // the standard SSA construction algorithm. Determine which blocks need phi
- // nodes and see if we can optimize out some work by avoiding insertion of
- // dead phi nodes.
- IDF.setLiveInBlocks(LiveInBlocks);
- IDF.setDefiningBlocks(DefBlocks);
- SmallVector<BasicBlock *, 32> PHIBlocks;
- IDF.calculate(PHIBlocks);
- llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
- });
-
- unsigned CurrentVersion = 0;
- for (BasicBlock *BB : PHIBlocks)
- QueuePhiNode(BB, AllocaNum, CurrentVersion);
- }
-
- if (Allocas.empty())
- return; // All of the allocas must have been trivial!
-
- LBI.clear();
-
- // Set the incoming values for the basic block to be null values for all of
- // the alloca's. We do this in case there is a load of a value that has not
- // been stored yet. In this case, it will get this null value.
- RenamePassData::ValVector Values(Allocas.size());
- for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
- Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
-
- // When handling debug info, treat all incoming values as if they have unknown
- // locations until proven otherwise.
- RenamePassData::LocationVector Locations(Allocas.size());
-
- // Walks all basic blocks in the function performing the SSA rename algorithm
- // and inserting the phi nodes we marked as necessary
- std::vector<RenamePassData> RenamePassWorkList;
- RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
- std::move(Locations));
- do {
- RenamePassData RPD = std::move(RenamePassWorkList.back());
- RenamePassWorkList.pop_back();
- // RenamePass may add new worklist entries.
- RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
- } while (!RenamePassWorkList.empty());
-
- // The renamer uses the Visited set to avoid infinite loops. Clear it now.
- Visited.clear();
-
- // Remove the allocas themselves from the function.
- for (Instruction *A : Allocas) {
- // If there are any uses of the alloca instructions left, they must be in
- // unreachable basic blocks that were not processed by walking the dominator
- // tree. Just delete the users now.
- if (!A->use_empty())
- A->replaceAllUsesWith(UndefValue::get(A->getType()));
- A->eraseFromParent();
- }
-
-  // Remove alloca's dbg.declare intrinsics from the function.
+
+ // Keep the reverse mapping of the 'Allocas' array for the rename pass.
+ AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
+
+ // Unique the set of defining blocks for efficient lookup.
+ SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
+ Info.DefiningBlocks.end());
+
+ // Determine which blocks the value is live in. These are blocks which lead
+ // to uses.
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(AI, Info, DefBlocks, LiveInBlocks);
+
+ // At this point, we're committed to promoting the alloca using IDF's, and
+ // the standard SSA construction algorithm. Determine which blocks need phi
+ // nodes and see if we can optimize out some work by avoiding insertion of
+ // dead phi nodes.
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.setDefiningBlocks(DefBlocks);
+ SmallVector<BasicBlock *, 32> PHIBlocks;
+ IDF.calculate(PHIBlocks);
+ llvm::sort(PHIBlocks, [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+ });
+
+ unsigned CurrentVersion = 0;
+ for (BasicBlock *BB : PHIBlocks)
+ QueuePhiNode(BB, AllocaNum, CurrentVersion);
+ }
+
+ if (Allocas.empty())
+ return; // All of the allocas must have been trivial!
+
+ LBI.clear();
+
+ // Set the incoming values for the basic block to be null values for all of
+ // the alloca's. We do this in case there is a load of a value that has not
+ // been stored yet. In this case, it will get this null value.
+ RenamePassData::ValVector Values(Allocas.size());
+ for (unsigned i = 0, e = Allocas.size(); i != e; ++i)
+ Values[i] = UndefValue::get(Allocas[i]->getAllocatedType());
+
+ // When handling debug info, treat all incoming values as if they have unknown
+ // locations until proven otherwise.
+ RenamePassData::LocationVector Locations(Allocas.size());
+
+ // Walks all basic blocks in the function performing the SSA rename algorithm
+ // and inserting the phi nodes we marked as necessary
+ std::vector<RenamePassData> RenamePassWorkList;
+ RenamePassWorkList.emplace_back(&F.front(), nullptr, std::move(Values),
+ std::move(Locations));
+ do {
+ RenamePassData RPD = std::move(RenamePassWorkList.back());
+ RenamePassWorkList.pop_back();
+ // RenamePass may add new worklist entries.
+ RenamePass(RPD.BB, RPD.Pred, RPD.Values, RPD.Locations, RenamePassWorkList);
+ } while (!RenamePassWorkList.empty());
+
+ // The renamer uses the Visited set to avoid infinite loops. Clear it now.
+ Visited.clear();
+
+ // Remove the allocas themselves from the function.
+ for (Instruction *A : Allocas) {
+ // If there are any uses of the alloca instructions left, they must be in
+ // unreachable basic blocks that were not processed by walking the dominator
+ // tree. Just delete the users now.
+ if (!A->use_empty())
+ A->replaceAllUsesWith(UndefValue::get(A->getType()));
+ A->eraseFromParent();
+ }
+
+  // Remove alloca's dbg.declare intrinsics from the function.
for (auto &DbgUsers : AllocaDbgUsers) {
for (auto *DII : DbgUsers)
if (DII->isAddressOfVariable() || DII->getExpression()->startsWithDeref())
DII->eraseFromParent();
}
-
- // Loop over all of the PHI nodes and see if there are any that we can get
- // rid of because they merge all of the same incoming values. This can
- // happen due to undef values coming into the PHI nodes. This process is
- // iterative, because eliminating one PHI node can cause others to be removed.
- bool EliminatedAPHI = true;
- while (EliminatedAPHI) {
- EliminatedAPHI = false;
-
- // Iterating over NewPhiNodes is deterministic, so it is safe to try to
- // simplify and RAUW them as we go. If it was not, we could add uses to
- // the values we replace with in a non-deterministic order, thus creating
- // non-deterministic def->use chains.
- for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
- I = NewPhiNodes.begin(),
- E = NewPhiNodes.end();
- I != E;) {
- PHINode *PN = I->second;
-
- // If this PHI node merges one value and/or undefs, get the value.
- if (Value *V = SimplifyInstruction(PN, SQ)) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
- NewPhiNodes.erase(I++);
- EliminatedAPHI = true;
- continue;
- }
- ++I;
- }
- }
-
- // At this point, the renamer has added entries to PHI nodes for all reachable
- // code. Unfortunately, there may be unreachable blocks which the renamer
- // hasn't traversed. If this is the case, the PHI nodes may not
- // have incoming values for all predecessors. Loop over all PHI nodes we have
- // created, inserting undef values if they are missing any incoming values.
- for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
- I = NewPhiNodes.begin(),
- E = NewPhiNodes.end();
- I != E; ++I) {
- // We want to do this once per basic block. As such, only process a block
- // when we find the PHI that is the first entry in the block.
- PHINode *SomePHI = I->second;
- BasicBlock *BB = SomePHI->getParent();
- if (&BB->front() != SomePHI)
- continue;
-
-    // Only do work here if the PHI nodes are missing incoming values. We
- // know that all PHI nodes that were inserted in a block will have the same
- // number of incoming values, so we can just check any of them.
- if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
- continue;
-
- // Get the preds for BB.
+
+ // Loop over all of the PHI nodes and see if there are any that we can get
+ // rid of because they merge all of the same incoming values. This can
+ // happen due to undef values coming into the PHI nodes. This process is
+ // iterative, because eliminating one PHI node can cause others to be removed.
+ bool EliminatedAPHI = true;
+ while (EliminatedAPHI) {
+ EliminatedAPHI = false;
+
+ // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+ // simplify and RAUW them as we go. If it was not, we could add uses to
+ // the values we replace with in a non-deterministic order, thus creating
+ // non-deterministic def->use chains.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E;) {
+ PHINode *PN = I->second;
+
+ // If this PHI node merges one value and/or undefs, get the value.
+ if (Value *V = SimplifyInstruction(PN, SQ)) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
+ NewPhiNodes.erase(I++);
+ EliminatedAPHI = true;
+ continue;
+ }
+ ++I;
+ }
+ }
+
+ // At this point, the renamer has added entries to PHI nodes for all reachable
+ // code. Unfortunately, there may be unreachable blocks which the renamer
+ // hasn't traversed. If this is the case, the PHI nodes may not
+ // have incoming values for all predecessors. Loop over all PHI nodes we have
+ // created, inserting undef values if they are missing any incoming values.
+ for (DenseMap<std::pair<unsigned, unsigned>, PHINode *>::iterator
+ I = NewPhiNodes.begin(),
+ E = NewPhiNodes.end();
+ I != E; ++I) {
+ // We want to do this once per basic block. As such, only process a block
+ // when we find the PHI that is the first entry in the block.
+ PHINode *SomePHI = I->second;
+ BasicBlock *BB = SomePHI->getParent();
+ if (&BB->front() != SomePHI)
+ continue;
+
+    // Only do work here if the PHI nodes are missing incoming values. We
+ // know that all PHI nodes that were inserted in a block will have the same
+ // number of incoming values, so we can just check any of them.
+ if (SomePHI->getNumIncomingValues() == getNumPreds(BB))
+ continue;
+
+ // Get the preds for BB.
SmallVector<BasicBlock *, 16> Preds(predecessors(BB));
-
- // Ok, now we know that all of the PHI nodes are missing entries for some
- // basic blocks. Start by sorting the incoming predecessors for efficient
- // access.
- auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
- return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
- };
- llvm::sort(Preds, CompareBBNumbers);
-
- // Now we loop through all BB's which have entries in SomePHI and remove
- // them from the Preds list.
- for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
- // Do a log(n) search of the Preds list for the entry we want.
- SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
- Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
- assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
- "PHI node has entry for a block which is not a predecessor!");
-
- // Remove the entry
- Preds.erase(EntIt);
- }
-
- // At this point, the blocks left in the preds list must have dummy
- // entries inserted into every PHI nodes for the block. Update all the phi
- // nodes in this block that we are inserting (there could be phis before
- // mem2reg runs).
- unsigned NumBadPreds = SomePHI->getNumIncomingValues();
- BasicBlock::iterator BBI = BB->begin();
- while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
- SomePHI->getNumIncomingValues() == NumBadPreds) {
- Value *UndefVal = UndefValue::get(SomePHI->getType());
- for (BasicBlock *Pred : Preds)
- SomePHI->addIncoming(UndefVal, Pred);
- }
- }
-
- NewPhiNodes.clear();
-}
-
-/// Determine which blocks the value is live in.
-///
-/// These are blocks which lead to uses. Knowing this allows us to avoid
-/// inserting PHI nodes into blocks which don't lead to uses (thus, the
-/// inserted phi nodes would be dead).
-void PromoteMem2Reg::ComputeLiveInBlocks(
- AllocaInst *AI, AllocaInfo &Info,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
- // To determine liveness, we must iterate through the predecessors of blocks
- // where the def is live. Blocks are added to the worklist if we need to
- // check their predecessors. Start with all the using blocks.
- SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
- Info.UsingBlocks.end());
-
- // If any of the using blocks is also a definition block, check to see if the
- // definition occurs before or after the use. If it happens before the use,
- // the value isn't really live-in.
- for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
- BasicBlock *BB = LiveInBlockWorklist[i];
- if (!DefBlocks.count(BB))
- continue;
-
- // Okay, this is a block that both uses and defines the value. If the first
- // reference to the alloca is a def (store), then we know it isn't live-in.
- for (BasicBlock::iterator I = BB->begin();; ++I) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (SI->getOperand(1) != AI)
- continue;
-
- // We found a store to the alloca before a load. The alloca is not
- // actually live-in here.
- LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
- LiveInBlockWorklist.pop_back();
- --i;
- --e;
- break;
- }
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- // Okay, we found a load before a store to the alloca. It is actually
- // live into this block.
- if (LI->getOperand(0) == AI)
- break;
- }
- }
-
- // Now that we have a set of blocks where the phi is live-in, recursively add
- // their predecessors until we find the full region the value is live.
- while (!LiveInBlockWorklist.empty()) {
- BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
- // The block really is live in here, insert it into the set. If already in
- // the set, then it has already been processed.
- if (!LiveInBlocks.insert(BB).second)
- continue;
-
- // Since the value is live into BB, it is either defined in a predecessor or
- // live into it to. Add the preds to the worklist unless they are a
- // defining block.
- for (BasicBlock *P : predecessors(BB)) {
- // The value is not live into a predecessor if it defines the value.
- if (DefBlocks.count(P))
- continue;
-
- // Otherwise it is, add to the worklist.
- LiveInBlockWorklist.push_back(P);
- }
- }
-}
-
-/// Queue a phi-node to be added to a basic-block for a specific Alloca.
-///
-/// Returns true if there wasn't already a phi-node for that variable
-bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
- unsigned &Version) {
- // Look up the basic-block in question.
- PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
-
- // If the BB already has a phi node added for the i'th alloca then we're done!
- if (PN)
- return false;
-
- // Create a PhiNode using the dereferenced type... and add the phi-node to the
- // BasicBlock.
- PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
- Allocas[AllocaNo]->getName() + "." + Twine(Version++),
- &BB->front());
- ++NumPHIInsert;
- PhiToAllocaMap[PN] = AllocaNo;
- return true;
-}
-
-/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
-/// create a merged location incorporating \p DL, or to set \p DL directly.
-static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
- bool ApplyMergedLoc) {
- if (ApplyMergedLoc)
- PN->applyMergedLocation(PN->getDebugLoc(), DL);
- else
- PN->setDebugLoc(DL);
-}
-
-/// Recursively traverse the CFG of the function, renaming loads and
-/// stores to the allocas which we are promoting.
-///
-/// IncomingVals indicates what value each Alloca contains on exit from the
-/// predecessor block Pred.
-void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
- RenamePassData::ValVector &IncomingVals,
- RenamePassData::LocationVector &IncomingLocs,
- std::vector<RenamePassData> &Worklist) {
-NextIteration:
- // If we are inserting any phi nodes into this BB, they will already be in the
- // block.
- if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
- // If we have PHI nodes to update, compute the number of edges from Pred to
- // BB.
- if (PhiToAllocaMap.count(APN)) {
- // We want to be able to distinguish between PHI nodes being inserted by
- // this invocation of mem2reg from those phi nodes that already existed in
- // the IR before mem2reg was run. We determine that APN is being inserted
- // because it is missing incoming edges. All other PHI nodes being
- // inserted by this pass of mem2reg will have the same number of incoming
- // operands so far. Remember this count.
- unsigned NewPHINumOperands = APN->getNumOperands();
-
+
+ // Ok, now we know that all of the PHI nodes are missing entries for some
+ // basic blocks. Start by sorting the incoming predecessors for efficient
+ // access.
+ auto CompareBBNumbers = [this](BasicBlock *A, BasicBlock *B) {
+ return BBNumbers.find(A)->second < BBNumbers.find(B)->second;
+ };
+ llvm::sort(Preds, CompareBBNumbers);
+
+ // Now we loop through all BB's which have entries in SomePHI and remove
+ // them from the Preds list.
+ for (unsigned i = 0, e = SomePHI->getNumIncomingValues(); i != e; ++i) {
+ // Do a log(n) search of the Preds list for the entry we want.
+ SmallVectorImpl<BasicBlock *>::iterator EntIt = llvm::lower_bound(
+ Preds, SomePHI->getIncomingBlock(i), CompareBBNumbers);
+ assert(EntIt != Preds.end() && *EntIt == SomePHI->getIncomingBlock(i) &&
+ "PHI node has entry for a block which is not a predecessor!");
+
+ // Remove the entry
+ Preds.erase(EntIt);
+ }
+
+ // At this point, the blocks left in the preds list must have dummy
+    // entries inserted into every PHI node in the block. Update all the phi
+ // nodes in this block that we are inserting (there could be phis before
+ // mem2reg runs).
+ unsigned NumBadPreds = SomePHI->getNumIncomingValues();
+ BasicBlock::iterator BBI = BB->begin();
+ while ((SomePHI = dyn_cast<PHINode>(BBI++)) &&
+ SomePHI->getNumIncomingValues() == NumBadPreds) {
+ Value *UndefVal = UndefValue::get(SomePHI->getType());
+ for (BasicBlock *Pred : Preds)
+ SomePHI->addIncoming(UndefVal, Pred);
+ }
+ }
+
+ NewPhiNodes.clear();
+}
+
+/// Determine which blocks the value is live in.
+///
+/// These are blocks which lead to uses. Knowing this allows us to avoid
+/// inserting PHI nodes into blocks which don't lead to uses (thus, the
+/// inserted phi nodes would be dead).
+void PromoteMem2Reg::ComputeLiveInBlocks(
+ AllocaInst *AI, AllocaInfo &Info,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(Info.UsingBlocks.begin(),
+ Info.UsingBlocks.end());
+
+ // If any of the using blocks is also a definition block, check to see if the
+ // definition occurs before or after the use. If it happens before the use,
+ // the value isn't really live-in.
+ for (unsigned i = 0, e = LiveInBlockWorklist.size(); i != e; ++i) {
+ BasicBlock *BB = LiveInBlockWorklist[i];
+ if (!DefBlocks.count(BB))
+ continue;
+
+ // Okay, this is a block that both uses and defines the value. If the first
+ // reference to the alloca is a def (store), then we know it isn't live-in.
+ for (BasicBlock::iterator I = BB->begin();; ++I) {
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getOperand(1) != AI)
+ continue;
+
+ // We found a store to the alloca before a load. The alloca is not
+ // actually live-in here.
+ LiveInBlockWorklist[i] = LiveInBlockWorklist.back();
+ LiveInBlockWorklist.pop_back();
+ --i;
+ --e;
+ break;
+ }
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ // Okay, we found a load before a store to the alloca. It is actually
+ // live into this block.
+ if (LI->getOperand(0) == AI)
+ break;
+ }
+ }
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it to. Add the preds to the worklist unless they are a
+ // defining block.
+ for (BasicBlock *P : predecessors(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Queue a phi-node to be added to a basic-block for a specific Alloca.
+///
+/// Returns true if there wasn't already a phi-node for that variable
+bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
+ unsigned &Version) {
+ // Look up the basic-block in question.
+ PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
+
+ // If the BB already has a phi node added for the i'th alloca then we're done!
+ if (PN)
+ return false;
+
+ // Create a PhiNode using the dereferenced type... and add the phi-node to the
+ // BasicBlock.
+ PN = PHINode::Create(Allocas[AllocaNo]->getAllocatedType(), getNumPreds(BB),
+ Allocas[AllocaNo]->getName() + "." + Twine(Version++),
+ &BB->front());
+ ++NumPHIInsert;
+ PhiToAllocaMap[PN] = AllocaNo;
+ return true;
+}
+
+/// Update the debug location of a phi. \p ApplyMergedLoc indicates whether to
+/// create a merged location incorporating \p DL, or to set \p DL directly.
+static void updateForIncomingValueLocation(PHINode *PN, DebugLoc DL,
+ bool ApplyMergedLoc) {
+ if (ApplyMergedLoc)
+ PN->applyMergedLocation(PN->getDebugLoc(), DL);
+ else
+ PN->setDebugLoc(DL);
+}
+
+/// Recursively traverse the CFG of the function, renaming loads and
+/// stores to the allocas which we are promoting.
+///
+/// IncomingVals indicates what value each Alloca contains on exit from the
+/// predecessor block Pred.
+void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred,
+ RenamePassData::ValVector &IncomingVals,
+ RenamePassData::LocationVector &IncomingLocs,
+ std::vector<RenamePassData> &Worklist) {
+NextIteration:
+ // If we are inserting any phi nodes into this BB, they will already be in the
+ // block.
+ if (PHINode *APN = dyn_cast<PHINode>(BB->begin())) {
+ // If we have PHI nodes to update, compute the number of edges from Pred to
+ // BB.
+ if (PhiToAllocaMap.count(APN)) {
+ // We want to be able to distinguish between PHI nodes being inserted by
+ // this invocation of mem2reg from those phi nodes that already existed in
+ // the IR before mem2reg was run. We determine that APN is being inserted
+ // because it is missing incoming edges. All other PHI nodes being
+ // inserted by this pass of mem2reg will have the same number of incoming
+ // operands so far. Remember this count.
+ unsigned NewPHINumOperands = APN->getNumOperands();
+
unsigned NumEdges = llvm::count(successors(Pred), BB);
- assert(NumEdges && "Must be at least one edge from Pred to BB!");
-
- // Add entries for all the phis.
- BasicBlock::iterator PNI = BB->begin();
- do {
- unsigned AllocaNo = PhiToAllocaMap[APN];
-
- // Update the location of the phi node.
- updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
- APN->getNumIncomingValues() > 0);
-
- // Add N incoming values to the PHI node.
- for (unsigned i = 0; i != NumEdges; ++i)
- APN->addIncoming(IncomingVals[AllocaNo], Pred);
-
- // The currently active variable for this block is now the PHI.
- IncomingVals[AllocaNo] = APN;
+ assert(NumEdges && "Must be at least one edge from Pred to BB!");
+
+ // Add entries for all the phis.
+ BasicBlock::iterator PNI = BB->begin();
+ do {
+ unsigned AllocaNo = PhiToAllocaMap[APN];
+
+ // Update the location of the phi node.
+ updateForIncomingValueLocation(APN, IncomingLocs[AllocaNo],
+ APN->getNumIncomingValues() > 0);
+
+ // Add N incoming values to the PHI node.
+ for (unsigned i = 0; i != NumEdges; ++i)
+ APN->addIncoming(IncomingVals[AllocaNo], Pred);
+
+ // The currently active variable for this block is now the PHI.
+ IncomingVals[AllocaNo] = APN;
for (DbgVariableIntrinsic *DII : AllocaDbgUsers[AllocaNo])
if (DII->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DII, APN, DIB);
-
- // Get the next phi node.
- ++PNI;
- APN = dyn_cast<PHINode>(PNI);
- if (!APN)
- break;
-
- // Verify that it is missing entries. If not, it is not being inserted
- // by this mem2reg invocation so we want to ignore it.
- } while (APN->getNumOperands() == NewPHINumOperands);
- }
- }
-
- // Don't revisit blocks.
- if (!Visited.insert(BB).second)
- return;
-
- for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
- Instruction *I = &*II++; // get the instruction, increment iterator
-
- if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
- if (!Src)
- continue;
-
- DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
- if (AI == AllocaLookup.end())
- continue;
-
- Value *V = IncomingVals[AI->second];
-
- // If the load was marked as nonnull we don't want to lose
- // that information when we erase this Load. So we preserve
- // it with an assume.
- if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
- !isKnownNonZero(V, SQ.DL, 0, AC, LI, &DT))
- addAssumeNonNull(AC, LI);
-
- // Anything using the load now uses the current value.
- LI->replaceAllUsesWith(V);
- BB->getInstList().erase(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- // Delete this instruction and mark the name as the current holder of the
- // value
- AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
- if (!Dest)
- continue;
-
- DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
- if (ai == AllocaLookup.end())
- continue;
-
- // what value were we writing?
- unsigned AllocaNo = ai->second;
- IncomingVals[AllocaNo] = SI->getOperand(0);
-
- // Record debuginfo for the store before removing it.
- IncomingLocs[AllocaNo] = SI->getDebugLoc();
+
+ // Get the next phi node.
+ ++PNI;
+ APN = dyn_cast<PHINode>(PNI);
+ if (!APN)
+ break;
+
+ // Verify that it is missing entries. If not, it is not being inserted
+ // by this mem2reg invocation so we want to ignore it.
+ } while (APN->getNumOperands() == NewPHINumOperands);
+ }
+ }
+
+ // Don't revisit blocks.
+ if (!Visited.insert(BB).second)
+ return;
+
+ for (BasicBlock::iterator II = BB->begin(); !II->isTerminator();) {
+ Instruction *I = &*II++; // get the instruction, increment iterator
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ AllocaInst *Src = dyn_cast<AllocaInst>(LI->getPointerOperand());
+ if (!Src)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator AI = AllocaLookup.find(Src);
+ if (AI == AllocaLookup.end())
+ continue;
+
+ Value *V = IncomingVals[AI->second];
+
+ // If the load was marked as nonnull we don't want to lose
+ // that information when we erase this Load. So we preserve
+ // it with an assume.
+ if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+ !isKnownNonZero(V, SQ.DL, 0, AC, LI, &DT))
+ addAssumeNonNull(AC, LI);
+
+ // Anything using the load now uses the current value.
+ LI->replaceAllUsesWith(V);
+ BB->getInstList().erase(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ // Delete this instruction and mark the name as the current holder of the
+ // value
+ AllocaInst *Dest = dyn_cast<AllocaInst>(SI->getPointerOperand());
+ if (!Dest)
+ continue;
+
+ DenseMap<AllocaInst *, unsigned>::iterator ai = AllocaLookup.find(Dest);
+ if (ai == AllocaLookup.end())
+ continue;
+
+ // what value were we writing?
+ unsigned AllocaNo = ai->second;
+ IncomingVals[AllocaNo] = SI->getOperand(0);
+
+ // Record debuginfo for the store before removing it.
+ IncomingLocs[AllocaNo] = SI->getDebugLoc();
for (DbgVariableIntrinsic *DII : AllocaDbgUsers[ai->second])
if (DII->isAddressOfVariable())
ConvertDebugDeclareToDebugValue(DII, SI, DIB);
- BB->getInstList().erase(SI);
- }
- }
-
- // 'Recurse' to our successors.
- succ_iterator I = succ_begin(BB), E = succ_end(BB);
- if (I == E)
- return;
-
- // Keep track of the successors so we don't visit the same successor twice
- SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
-
- // Handle the first successor without using the worklist.
- VisitedSuccs.insert(*I);
- Pred = BB;
- BB = *I;
- ++I;
-
- for (; I != E; ++I)
- if (VisitedSuccs.insert(*I).second)
- Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
-
- goto NextIteration;
-}
-
-void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
- AssumptionCache *AC) {
- // If there is nothing to do, bail out...
- if (Allocas.empty())
- return;
-
- PromoteMem2Reg(Allocas, DT, AC).run();
-}
+ BB->getInstList().erase(SI);
+ }
+ }
+
+ // 'Recurse' to our successors.
+ succ_iterator I = succ_begin(BB), E = succ_end(BB);
+ if (I == E)
+ return;
+
+ // Keep track of the successors so we don't visit the same successor twice
+ SmallPtrSet<BasicBlock *, 8> VisitedSuccs;
+
+ // Handle the first successor without using the worklist.
+ VisitedSuccs.insert(*I);
+ Pred = BB;
+ BB = *I;
+ ++I;
+
+ for (; I != E; ++I)
+ if (VisitedSuccs.insert(*I).second)
+ Worklist.emplace_back(*I, Pred, IncomingVals, IncomingLocs);
+
+ goto NextIteration;
+}
+
+void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
+ AssumptionCache *AC) {
+ // If there is nothing to do, bail out...
+ if (Allocas.empty())
+ return;
+
+ PromoteMem2Reg(Allocas, DT, AC).run();
+}
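The PromoteMemToReg entry point above is normally driven the way the mem2reg pass does it: collect the allocas in the entry block that isAllocaPromotable accepts, then hand them to the promoter together with the dominator tree. A minimal sketch of such a driver follows; the helper name promoteEntryBlockAllocas and the way the analyses are obtained are illustrative only, not part of this patch.

#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
#include <vector>
using namespace llvm;

// Promote every promotable alloca in F's entry block; returns true if any
// alloca was rewritten into SSA values.
static bool promoteEntryBlockAllocas(Function &F, DominatorTree &DT,
                                     AssumptionCache &AC) {
  std::vector<AllocaInst *> Allocas;
  for (Instruction &I : F.getEntryBlock())
    if (auto *AI = dyn_cast<AllocaInst>(&I))
      if (isAllocaPromotable(AI))      // only direct loads/stores, no escapes
        Allocas.push_back(AI);
  if (Allocas.empty())
    return false;
  PromoteMemToReg(Allocas, DT, &AC);   // the routine defined above
  return true;
}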
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
index 509027119c..c210d1c460 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdater.cpp
@@ -1,481 +1,481 @@
-//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SSAUpdater class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
-#include <cassert>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ssaupdater"
-
-using AvailableValsTy = DenseMap<BasicBlock *, Value *>;
-
-static AvailableValsTy &getAvailableVals(void *AV) {
- return *static_cast<AvailableValsTy*>(AV);
-}
-
-SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode *> *NewPHI)
- : InsertedPHIs(NewPHI) {}
-
-SSAUpdater::~SSAUpdater() {
- delete static_cast<AvailableValsTy*>(AV);
-}
-
-void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
- if (!AV)
- AV = new AvailableValsTy();
- else
- getAvailableVals(AV).clear();
- ProtoType = Ty;
- ProtoName = std::string(Name);
-}
-
-bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
- return getAvailableVals(AV).count(BB);
-}
-
-Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
+//===- SSAUpdater.cpp - Unstructured SSA Update Tool ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdater class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+#include <cassert>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdater"
+
+using AvailableValsTy = DenseMap<BasicBlock *, Value *>;
+
+static AvailableValsTy &getAvailableVals(void *AV) {
+ return *static_cast<AvailableValsTy*>(AV);
+}
+
+SSAUpdater::SSAUpdater(SmallVectorImpl<PHINode *> *NewPHI)
+ : InsertedPHIs(NewPHI) {}
+
+SSAUpdater::~SSAUpdater() {
+ delete static_cast<AvailableValsTy*>(AV);
+}
+
+void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
+ if (!AV)
+ AV = new AvailableValsTy();
+ else
+ getAvailableVals(AV).clear();
+ ProtoType = Ty;
+ ProtoName = std::string(Name);
+}
+
+bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
+ return getAvailableVals(AV).count(BB);
+}
+
+Value *SSAUpdater::FindValueForBlock(BasicBlock *BB) const {
return getAvailableVals(AV).lookup(BB);
-}
-
-void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
- assert(ProtoType && "Need to initialize SSAUpdater");
- assert(ProtoType == V->getType() &&
- "All rewritten values must have the same type");
- getAvailableVals(AV)[BB] = V;
-}
-
-static bool IsEquivalentPHI(PHINode *PHI,
- SmallDenseMap<BasicBlock *, Value *, 8> &ValueMapping) {
- unsigned PHINumValues = PHI->getNumIncomingValues();
- if (PHINumValues != ValueMapping.size())
- return false;
-
- // Scan the phi to see if it matches.
- for (unsigned i = 0, e = PHINumValues; i != e; ++i)
- if (ValueMapping[PHI->getIncomingBlock(i)] !=
- PHI->getIncomingValue(i)) {
- return false;
- }
-
- return true;
-}
-
-Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
- Value *Res = GetValueAtEndOfBlockInternal(BB);
- return Res;
-}
-
-Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
- // If there is no definition of the renamed variable in this block, just use
- // GetValueAtEndOfBlock to do our work.
- if (!HasValueForBlock(BB))
- return GetValueAtEndOfBlock(BB);
-
- // Otherwise, we have the hard case. Get the live-in values for each
- // predecessor.
- SmallVector<std::pair<BasicBlock *, Value *>, 8> PredValues;
- Value *SingularValue = nullptr;
-
- // We can get our predecessor info by walking the pred_iterator list, but it
- // is relatively slow. If we already have PHI nodes in this block, walk one
- // of them to get the predecessor list instead.
- if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
- for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
- BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
- Value *PredVal = GetValueAtEndOfBlock(PredBB);
- PredValues.push_back(std::make_pair(PredBB, PredVal));
-
- // Compute SingularValue.
- if (i == 0)
- SingularValue = PredVal;
- else if (PredVal != SingularValue)
- SingularValue = nullptr;
- }
- } else {
- bool isFirstPred = true;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *PredBB = *PI;
- Value *PredVal = GetValueAtEndOfBlock(PredBB);
- PredValues.push_back(std::make_pair(PredBB, PredVal));
-
- // Compute SingularValue.
- if (isFirstPred) {
- SingularValue = PredVal;
- isFirstPred = false;
- } else if (PredVal != SingularValue)
- SingularValue = nullptr;
- }
- }
-
- // If there are no predecessors, just return undef.
- if (PredValues.empty())
- return UndefValue::get(ProtoType);
-
- // Otherwise, if all the merged values are the same, just use it.
- if (SingularValue)
- return SingularValue;
-
- // Otherwise, we do need a PHI: check to see if we already have one available
- // in this block that produces the right value.
- if (isa<PHINode>(BB->begin())) {
- SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),
- PredValues.end());
- for (PHINode &SomePHI : BB->phis()) {
- if (IsEquivalentPHI(&SomePHI, ValueMapping))
- return &SomePHI;
- }
- }
-
- // Ok, we have no way out, insert a new one now.
- PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
- ProtoName, &BB->front());
-
- // Fill in all the predecessors of the PHI.
- for (const auto &PredValue : PredValues)
- InsertedPHI->addIncoming(PredValue.second, PredValue.first);
-
- // See if the PHI node can be merged to a single value. This can happen in
- // loop cases when we get a PHI of itself and one other value.
- if (Value *V =
- SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
- InsertedPHI->eraseFromParent();
- return V;
- }
-
- // Set the DebugLoc of the inserted PHI, if available.
- DebugLoc DL;
- if (const Instruction *I = BB->getFirstNonPHI())
- DL = I->getDebugLoc();
- InsertedPHI->setDebugLoc(DL);
-
- // If the client wants to know about all new instructions, tell it.
- if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
-
- LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
- return InsertedPHI;
-}
-
-void SSAUpdater::RewriteUse(Use &U) {
- Instruction *User = cast<Instruction>(U.getUser());
-
- Value *V;
- if (PHINode *UserPN = dyn_cast<PHINode>(User))
- V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
- else
- V = GetValueInMiddleOfBlock(User->getParent());
-
- U.set(V);
-}
-
-void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
- Instruction *User = cast<Instruction>(U.getUser());
-
- Value *V;
- if (PHINode *UserPN = dyn_cast<PHINode>(User))
- V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
- else
- V = GetValueAtEndOfBlock(User->getParent());
-
- U.set(V);
-}
-
-namespace llvm {
-
-template<>
-class SSAUpdaterTraits<SSAUpdater> {
-public:
- using BlkT = BasicBlock;
- using ValT = Value *;
- using PhiT = PHINode;
- using BlkSucc_iterator = succ_iterator;
-
- static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
- static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
-
- class PHI_iterator {
- private:
- PHINode *PHI;
- unsigned idx;
-
- public:
- explicit PHI_iterator(PHINode *P) // begin iterator
- : PHI(P), idx(0) {}
- PHI_iterator(PHINode *P, bool) // end iterator
- : PHI(P), idx(PHI->getNumIncomingValues()) {}
-
- PHI_iterator &operator++() { ++idx; return *this; }
- bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
- bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
-
- Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
- BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
- };
-
- static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
- static PHI_iterator PHI_end(PhiT *PHI) {
- return PHI_iterator(PHI, true);
- }
-
- /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
- /// vector, set Info->NumPreds, and allocate space in Info->Preds.
- static void FindPredecessorBlocks(BasicBlock *BB,
- SmallVectorImpl<BasicBlock *> *Preds) {
- // We can get our predecessor info by walking the pred_iterator list,
- // but it is relatively slow. If we already have PHI nodes in this
- // block, walk one of them to get the predecessor list instead.
+}
+
+void SSAUpdater::AddAvailableValue(BasicBlock *BB, Value *V) {
+ assert(ProtoType && "Need to initialize SSAUpdater");
+ assert(ProtoType == V->getType() &&
+ "All rewritten values must have the same type");
+ getAvailableVals(AV)[BB] = V;
+}
+
+static bool IsEquivalentPHI(PHINode *PHI,
+ SmallDenseMap<BasicBlock *, Value *, 8> &ValueMapping) {
+ unsigned PHINumValues = PHI->getNumIncomingValues();
+ if (PHINumValues != ValueMapping.size())
+ return false;
+
+ // Scan the phi to see if it matches.
+ for (unsigned i = 0, e = PHINumValues; i != e; ++i)
+ if (ValueMapping[PHI->getIncomingBlock(i)] !=
+ PHI->getIncomingValue(i)) {
+ return false;
+ }
+
+ return true;
+}
+
+Value *SSAUpdater::GetValueAtEndOfBlock(BasicBlock *BB) {
+ Value *Res = GetValueAtEndOfBlockInternal(BB);
+ return Res;
+}
+
+Value *SSAUpdater::GetValueInMiddleOfBlock(BasicBlock *BB) {
+ // If there is no definition of the renamed variable in this block, just use
+ // GetValueAtEndOfBlock to do our work.
+ if (!HasValueForBlock(BB))
+ return GetValueAtEndOfBlock(BB);
+
+ // Otherwise, we have the hard case. Get the live-in values for each
+ // predecessor.
+ SmallVector<std::pair<BasicBlock *, Value *>, 8> PredValues;
+ Value *SingularValue = nullptr;
+
+ // We can get our predecessor info by walking the pred_iterator list, but it
+ // is relatively slow. If we already have PHI nodes in this block, walk one
+ // of them to get the predecessor list instead.
+ if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = SomePhi->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *PredBB = SomePhi->getIncomingBlock(i);
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (i == 0)
+ SingularValue = PredVal;
+ else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ } else {
+ bool isFirstPred = true;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBB = *PI;
+ Value *PredVal = GetValueAtEndOfBlock(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (isFirstPred) {
+ SingularValue = PredVal;
+ isFirstPred = false;
+ } else if (PredVal != SingularValue)
+ SingularValue = nullptr;
+ }
+ }
+
+ // If there are no predecessors, just return undef.
+ if (PredValues.empty())
+ return UndefValue::get(ProtoType);
+
+ // Otherwise, if all the merged values are the same, just use it.
+ if (SingularValue)
+ return SingularValue;
+
+ // Otherwise, we do need a PHI: check to see if we already have one available
+ // in this block that produces the right value.
+ if (isa<PHINode>(BB->begin())) {
+ SmallDenseMap<BasicBlock *, Value *, 8> ValueMapping(PredValues.begin(),
+ PredValues.end());
+ for (PHINode &SomePHI : BB->phis()) {
+ if (IsEquivalentPHI(&SomePHI, ValueMapping))
+ return &SomePHI;
+ }
+ }
+
+ // Ok, we have no way out, insert a new one now.
+ PHINode *InsertedPHI = PHINode::Create(ProtoType, PredValues.size(),
+ ProtoName, &BB->front());
+
+ // Fill in all the predecessors of the PHI.
+ for (const auto &PredValue : PredValues)
+ InsertedPHI->addIncoming(PredValue.second, PredValue.first);
+
+ // See if the PHI node can be merged to a single value. This can happen in
+ // loop cases when we get a PHI of itself and one other value.
+ if (Value *V =
+ SimplifyInstruction(InsertedPHI, BB->getModule()->getDataLayout())) {
+ InsertedPHI->eraseFromParent();
+ return V;
+ }
+
+ // Set the DebugLoc of the inserted PHI, if available.
+ DebugLoc DL;
+ if (const Instruction *I = BB->getFirstNonPHI())
+ DL = I->getDebugLoc();
+ InsertedPHI->setDebugLoc(DL);
+
+ // If the client wants to know about all new instructions, tell it.
+ if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+ LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ return InsertedPHI;
+}
+
+void SSAUpdater::RewriteUse(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueInMiddleOfBlock(User->getParent());
+
+ U.set(V);
+}
+
+void SSAUpdater::RewriteUseAfterInsertions(Use &U) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Value *V;
+ if (PHINode *UserPN = dyn_cast<PHINode>(User))
+ V = GetValueAtEndOfBlock(UserPN->getIncomingBlock(U));
+ else
+ V = GetValueAtEndOfBlock(User->getParent());
+
+ U.set(V);
+}
+
+namespace llvm {
+
+template<>
+class SSAUpdaterTraits<SSAUpdater> {
+public:
+ using BlkT = BasicBlock;
+ using ValT = Value *;
+ using PhiT = PHINode;
+ using BlkSucc_iterator = succ_iterator;
+
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return succ_begin(BB); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return succ_end(BB); }
+
+ class PHI_iterator {
+ private:
+ PHINode *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(PHINode *P) // begin iterator
+ : PHI(P), idx(0) {}
+ PHI_iterator(PHINode *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumIncomingValues()) {}
+
+ PHI_iterator &operator++() { ++idx; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+
+ Value *getIncomingValue() { return PHI->getIncomingValue(idx); }
+ BasicBlock *getIncomingBlock() { return PHI->getIncomingBlock(idx); }
+ };
+
+ static PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+ static PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of Info->BB into the Preds
+ /// vector, set Info->NumPreds, and allocate space in Info->Preds.
+ static void FindPredecessorBlocks(BasicBlock *BB,
+ SmallVectorImpl<BasicBlock *> *Preds) {
+ // We can get our predecessor info by walking the pred_iterator list,
+ // but it is relatively slow. If we already have PHI nodes in this
+ // block, walk one of them to get the predecessor list instead.
if (PHINode *SomePhi = dyn_cast<PHINode>(BB->begin()))
append_range(*Preds, SomePhi->blocks());
else
append_range(*Preds, predecessors(BB));
- }
-
- /// GetUndefVal - Get an undefined value of the same type as the value
- /// being handled.
- static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
- return UndefValue::get(Updater->ProtoType);
- }
-
- /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
- /// Reserve space for the operands but do not fill them in yet.
- static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
- SSAUpdater *Updater) {
- PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
- Updater->ProtoName, &BB->front());
- return PHI;
- }
-
- /// AddPHIOperand - Add the specified value as an operand of the PHI for
- /// the specified predecessor block.
- static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
- PHI->addIncoming(Val, Pred);
- }
-
- /// ValueIsPHI - Check if a value is a PHI.
- static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
- return dyn_cast<PHINode>(Val);
- }
-
- /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
- /// operands, i.e., it was just added.
- static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
- PHINode *PHI = ValueIsPHI(Val, Updater);
- if (PHI && PHI->getNumIncomingValues() == 0)
- return PHI;
- return nullptr;
- }
-
- /// GetPHIValue - For the specified PHI instruction, return the value
- /// that it defines.
- static Value *GetPHIValue(PHINode *PHI) {
- return PHI;
- }
-};
-
-} // end namespace llvm
-
-/// Check to see if AvailableVals has an entry for the specified BB and if so,
-/// return it. If not, construct SSA form by first calculating the required
-/// placement of PHIs and then inserting new PHIs where needed.
-Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
- AvailableValsTy &AvailableVals = getAvailableVals(AV);
- if (Value *V = AvailableVals[BB])
- return V;
-
- SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
- return Impl.GetValue(BB);
-}
-
-//===----------------------------------------------------------------------===//
-// LoadAndStorePromoter Implementation
-//===----------------------------------------------------------------------===//
-
-LoadAndStorePromoter::
-LoadAndStorePromoter(ArrayRef<const Instruction *> Insts,
- SSAUpdater &S, StringRef BaseName) : SSA(S) {
- if (Insts.empty()) return;
-
- const Value *SomeVal;
- if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
- SomeVal = LI;
- else
- SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
-
- if (BaseName.empty())
- BaseName = SomeVal->getName();
- SSA.Initialize(SomeVal->getType(), BaseName);
-}
-
-void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
- // First step: bucket up uses of the alloca by the block they occur in.
- // This is important because we have to handle multiple defs/uses in a block
- // ourselves: SSAUpdater is purely for cross-block references.
- DenseMap<BasicBlock *, TinyPtrVector<Instruction *>> UsesByBlock;
-
- for (Instruction *User : Insts)
- UsesByBlock[User->getParent()].push_back(User);
-
- // Okay, now we can iterate over all the blocks in the function with uses,
- // processing them. Keep track of which loads are loading a live-in value.
- // Walk the uses in the use-list order to be determinstic.
- SmallVector<LoadInst *, 32> LiveInLoads;
- DenseMap<Value *, Value *> ReplacedLoads;
-
- for (Instruction *User : Insts) {
- BasicBlock *BB = User->getParent();
- TinyPtrVector<Instruction *> &BlockUses = UsesByBlock[BB];
-
- // If this block has already been processed, ignore this repeat use.
- if (BlockUses.empty()) continue;
-
- // Okay, this is the first use in the block. If this block just has a
- // single user in it, we can rewrite it trivially.
- if (BlockUses.size() == 1) {
- // If it is a store, it is a trivial def of the value in the block.
- if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
- updateDebugInfo(SI);
- SSA.AddAvailableValue(BB, SI->getOperand(0));
- } else
- // Otherwise it is a load, queue it to rewrite as a live-in load.
- LiveInLoads.push_back(cast<LoadInst>(User));
- BlockUses.clear();
- continue;
- }
-
- // Otherwise, check to see if this block is all loads.
- bool HasStore = false;
- for (Instruction *I : BlockUses) {
- if (isa<StoreInst>(I)) {
- HasStore = true;
- break;
- }
- }
-
- // If so, we can queue them all as live in loads. We don't have an
- // efficient way to tell which on is first in the block and don't want to
- // scan large blocks, so just add all loads as live ins.
- if (!HasStore) {
- for (Instruction *I : BlockUses)
- LiveInLoads.push_back(cast<LoadInst>(I));
- BlockUses.clear();
- continue;
- }
-
- // Otherwise, we have mixed loads and stores (or just a bunch of stores).
- // Since SSAUpdater is purely for cross-block values, we need to determine
- // the order of these instructions in the block. If the first use in the
- // block is a load, then it uses the live in value. The last store defines
- // the live out value. We handle this by doing a linear scan of the block.
- Value *StoredValue = nullptr;
- for (Instruction &I : *BB) {
- if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
- // If this is a load from an unrelated pointer, ignore it.
- if (!isInstInList(L, Insts)) continue;
-
- // If we haven't seen a store yet, this is a live in use, otherwise
- // use the stored value.
- if (StoredValue) {
- replaceLoadWithValue(L, StoredValue);
- L->replaceAllUsesWith(StoredValue);
- ReplacedLoads[L] = StoredValue;
- } else {
- LiveInLoads.push_back(L);
- }
- continue;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- // If this is a store to an unrelated pointer, ignore it.
- if (!isInstInList(SI, Insts)) continue;
- updateDebugInfo(SI);
-
- // Remember that this is the active value in the block.
- StoredValue = SI->getOperand(0);
- }
- }
-
- // The last stored value that happened is the live-out for the block.
- assert(StoredValue && "Already checked that there is a store in block");
- SSA.AddAvailableValue(BB, StoredValue);
- BlockUses.clear();
- }
-
- // Okay, now we rewrite all loads that use live-in values in the loop,
- // inserting PHI nodes as necessary.
- for (LoadInst *ALoad : LiveInLoads) {
- Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
- replaceLoadWithValue(ALoad, NewVal);
-
- // Avoid assertions in unreachable code.
- if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
- ALoad->replaceAllUsesWith(NewVal);
- ReplacedLoads[ALoad] = NewVal;
- }
-
- // Allow the client to do stuff before we start nuking things.
- doExtraRewritesBeforeFinalDeletion();
-
- // Now that everything is rewritten, delete the old instructions from the
- // function. They should all be dead now.
- for (Instruction *User : Insts) {
- // If this is a load that still has uses, then the load must have been added
- // as a live value in the SSAUpdate data structure for a block (e.g. because
- // the loaded value was stored later). In this case, we need to recursively
- // propagate the updates until we get to the real value.
- if (!User->use_empty()) {
- Value *NewVal = ReplacedLoads[User];
- assert(NewVal && "not a replaced load?");
-
- // Propagate down to the ultimate replacee. The intermediately loads
- // could theoretically already have been deleted, so we don't want to
- // dereference the Value*'s.
- DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
- while (RLI != ReplacedLoads.end()) {
- NewVal = RLI->second;
- RLI = ReplacedLoads.find(NewVal);
- }
-
- replaceLoadWithValue(cast<LoadInst>(User), NewVal);
- User->replaceAllUsesWith(NewVal);
- }
-
- instructionDeleted(User);
- User->eraseFromParent();
- }
-}
-
-bool
-LoadAndStorePromoter::isInstInList(Instruction *I,
- const SmallVectorImpl<Instruction *> &Insts)
- const {
- return is_contained(Insts, I);
-}
+ }
+
+ /// GetUndefVal - Get an undefined value of the same type as the value
+ /// being handled.
+ static Value *GetUndefVal(BasicBlock *BB, SSAUpdater *Updater) {
+ return UndefValue::get(Updater->ProtoType);
+ }
+
+ /// CreateEmptyPHI - Create a new PHI instruction in the specified block.
+ /// Reserve space for the operands but do not fill them in yet.
+ static Value *CreateEmptyPHI(BasicBlock *BB, unsigned NumPreds,
+ SSAUpdater *Updater) {
+ PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds,
+ Updater->ProtoName, &BB->front());
+ return PHI;
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(PHINode *PHI, Value *Val, BasicBlock *Pred) {
+ PHI->addIncoming(Val, Pred);
+ }
+
+ /// ValueIsPHI - Check if a value is a PHI.
+ static PHINode *ValueIsPHI(Value *Val, SSAUpdater *Updater) {
+ return dyn_cast<PHINode>(Val);
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static PHINode *ValueIsNewPHI(Value *Val, SSAUpdater *Updater) {
+ PHINode *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->getNumIncomingValues() == 0)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the value
+ /// that it defines.
+ static Value *GetPHIValue(PHINode *PHI) {
+ return PHI;
+ }
+};
+
+} // end namespace llvm
+
+/// Check to see if AvailableVals has an entry for the specified BB and if so,
+/// return it. If not, construct SSA form by first calculating the required
+/// placement of PHIs and then inserting new PHIs where needed.
+Value *SSAUpdater::GetValueAtEndOfBlockInternal(BasicBlock *BB) {
+ AvailableValsTy &AvailableVals = getAvailableVals(AV);
+ if (Value *V = AvailableVals[BB])
+ return V;
+
+ SSAUpdaterImpl<SSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+ return Impl.GetValue(BB);
+}
+
+//===----------------------------------------------------------------------===//
+// LoadAndStorePromoter Implementation
+//===----------------------------------------------------------------------===//
+
+LoadAndStorePromoter::
+LoadAndStorePromoter(ArrayRef<const Instruction *> Insts,
+ SSAUpdater &S, StringRef BaseName) : SSA(S) {
+ if (Insts.empty()) return;
+
+ const Value *SomeVal;
+ if (const LoadInst *LI = dyn_cast<LoadInst>(Insts[0]))
+ SomeVal = LI;
+ else
+ SomeVal = cast<StoreInst>(Insts[0])->getOperand(0);
+
+ if (BaseName.empty())
+ BaseName = SomeVal->getName();
+ SSA.Initialize(SomeVal->getType(), BaseName);
+}
+
+void LoadAndStorePromoter::run(const SmallVectorImpl<Instruction *> &Insts) {
+ // First step: bucket up uses of the alloca by the block they occur in.
+ // This is important because we have to handle multiple defs/uses in a block
+ // ourselves: SSAUpdater is purely for cross-block references.
+ DenseMap<BasicBlock *, TinyPtrVector<Instruction *>> UsesByBlock;
+
+ for (Instruction *User : Insts)
+ UsesByBlock[User->getParent()].push_back(User);
+
+ // Okay, now we can iterate over all the blocks in the function with uses,
+ // processing them. Keep track of which loads are loading a live-in value.
+  // Walk the uses in the use-list order to be deterministic.
+ SmallVector<LoadInst *, 32> LiveInLoads;
+ DenseMap<Value *, Value *> ReplacedLoads;
+
+ for (Instruction *User : Insts) {
+ BasicBlock *BB = User->getParent();
+ TinyPtrVector<Instruction *> &BlockUses = UsesByBlock[BB];
+
+ // If this block has already been processed, ignore this repeat use.
+ if (BlockUses.empty()) continue;
+
+ // Okay, this is the first use in the block. If this block just has a
+ // single user in it, we can rewrite it trivially.
+ if (BlockUses.size() == 1) {
+ // If it is a store, it is a trivial def of the value in the block.
+ if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
+ updateDebugInfo(SI);
+ SSA.AddAvailableValue(BB, SI->getOperand(0));
+ } else
+ // Otherwise it is a load, queue it to rewrite as a live-in load.
+ LiveInLoads.push_back(cast<LoadInst>(User));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, check to see if this block is all loads.
+ bool HasStore = false;
+ for (Instruction *I : BlockUses) {
+ if (isa<StoreInst>(I)) {
+ HasStore = true;
+ break;
+ }
+ }
+
+ // If so, we can queue them all as live in loads. We don't have an
+    // efficient way to tell which one is first in the block and don't want to
+ // scan large blocks, so just add all loads as live ins.
+ if (!HasStore) {
+ for (Instruction *I : BlockUses)
+ LiveInLoads.push_back(cast<LoadInst>(I));
+ BlockUses.clear();
+ continue;
+ }
+
+ // Otherwise, we have mixed loads and stores (or just a bunch of stores).
+ // Since SSAUpdater is purely for cross-block values, we need to determine
+ // the order of these instructions in the block. If the first use in the
+ // block is a load, then it uses the live in value. The last store defines
+ // the live out value. We handle this by doing a linear scan of the block.
+ Value *StoredValue = nullptr;
+ for (Instruction &I : *BB) {
+ if (LoadInst *L = dyn_cast<LoadInst>(&I)) {
+ // If this is a load from an unrelated pointer, ignore it.
+ if (!isInstInList(L, Insts)) continue;
+
+ // If we haven't seen a store yet, this is a live in use, otherwise
+ // use the stored value.
+ if (StoredValue) {
+ replaceLoadWithValue(L, StoredValue);
+ L->replaceAllUsesWith(StoredValue);
+ ReplacedLoads[L] = StoredValue;
+ } else {
+ LiveInLoads.push_back(L);
+ }
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ // If this is a store to an unrelated pointer, ignore it.
+ if (!isInstInList(SI, Insts)) continue;
+ updateDebugInfo(SI);
+
+ // Remember that this is the active value in the block.
+ StoredValue = SI->getOperand(0);
+ }
+ }
+
+ // The last stored value that happened is the live-out for the block.
+ assert(StoredValue && "Already checked that there is a store in block");
+ SSA.AddAvailableValue(BB, StoredValue);
+ BlockUses.clear();
+ }
+
+ // Okay, now we rewrite all loads that use live-in values in the loop,
+ // inserting PHI nodes as necessary.
+ for (LoadInst *ALoad : LiveInLoads) {
+ Value *NewVal = SSA.GetValueInMiddleOfBlock(ALoad->getParent());
+ replaceLoadWithValue(ALoad, NewVal);
+
+ // Avoid assertions in unreachable code.
+ if (NewVal == ALoad) NewVal = UndefValue::get(NewVal->getType());
+ ALoad->replaceAllUsesWith(NewVal);
+ ReplacedLoads[ALoad] = NewVal;
+ }
+
+ // Allow the client to do stuff before we start nuking things.
+ doExtraRewritesBeforeFinalDeletion();
+
+ // Now that everything is rewritten, delete the old instructions from the
+ // function. They should all be dead now.
+ for (Instruction *User : Insts) {
+ // If this is a load that still has uses, then the load must have been added
+    // as a live value in the SSAUpdater data structure for a block (e.g. because
+ // the loaded value was stored later). In this case, we need to recursively
+ // propagate the updates until we get to the real value.
+ if (!User->use_empty()) {
+ Value *NewVal = ReplacedLoads[User];
+ assert(NewVal && "not a replaced load?");
+
+      // Propagate down to the ultimate replacee. The intermediate loads
+ // could theoretically already have been deleted, so we don't want to
+ // dereference the Value*'s.
+ DenseMap<Value*, Value*>::iterator RLI = ReplacedLoads.find(NewVal);
+ while (RLI != ReplacedLoads.end()) {
+ NewVal = RLI->second;
+ RLI = ReplacedLoads.find(NewVal);
+ }
+
+ replaceLoadWithValue(cast<LoadInst>(User), NewVal);
+ User->replaceAllUsesWith(NewVal);
+ }
+
+ instructionDeleted(User);
+ User->eraseFromParent();
+ }
+}
+
+bool
+LoadAndStorePromoter::isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction *> &Insts)
+ const {
+ return is_contained(Insts, I);
+}
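A compact sketch of how clients use the SSAUpdater interface implemented above: register the value that is live out of its defining block, then let RewriteUse materialize PHIs for the remaining uses. The helper rewriteCrossBlockUses and its parameters are illustrative only.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Use.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
using namespace llvm;

// Rewrite the given uses of Def so the IR stays in SSA form, inserting PHI
// nodes where a use is not dominated by a registered definition.
static void rewriteCrossBlockUses(Instruction *Def, ArrayRef<Use *> Uses) {
  SmallVector<PHINode *, 8> NewPHIs;            // receives any PHIs created
  SSAUpdater SSA(&NewPHIs);
  SSA.Initialize(Def->getType(), Def->getName());
  SSA.AddAvailableValue(Def->getParent(), Def); // live out of its own block
  for (Use *U : Uses)
    SSA.RewriteUse(*U);   // inserts PHIs on demand via GetValueInMiddleOfBlock
}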
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
index 40aa28c7c2..917d5e0a1e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SSAUpdaterBulk.cpp
@@ -1,190 +1,190 @@
-//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the SSAUpdaterBulk class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
-#include "llvm/Analysis/IteratedDominanceFrontier.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/Value.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ssaupdaterbulk"
-
-/// Helper function for finding a block which should have a value for the given
-/// user. For PHI-nodes this block is the corresponding predecessor, for other
-/// instructions it's their parent block.
-static BasicBlock *getUserBB(Use *U) {
- auto *User = cast<Instruction>(U->getUser());
-
- if (auto *UserPN = dyn_cast<PHINode>(User))
- return UserPN->getIncomingBlock(*U);
- else
- return User->getParent();
-}
-
-/// Add a new variable to the SSA rewriter. This needs to be called before
-/// AddAvailableValue or AddUse calls.
-unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
- unsigned Var = Rewrites.size();
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
- << *Ty << ", Name = " << Name << "\n");
- RewriteInfo RI(Name, Ty);
- Rewrites.push_back(RI);
- return Var;
-}
-
-/// Indicate that a rewritten value is available in the specified block with the
-/// specified value.
-void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
- assert(Var < Rewrites.size() && "Variable not found!");
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
- << ": added new available value" << *V << " in "
- << BB->getName() << "\n");
- Rewrites[Var].Defines[BB] = V;
-}
-
-/// Record a use of the symbolic value. This use will be updated with a
-/// rewritten value when RewriteAllUses is called.
-void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
- assert(Var < Rewrites.size() && "Variable not found!");
- LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
- << " in " << getUserBB(U)->getName() << "\n");
- Rewrites[Var].Uses.push_back(U);
-}
-
-/// Return true if the SSAUpdater already has a value for the specified variable
-/// in the specified block.
-bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
- return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
-}
-
- // Compute the value at the given block BB. We should either already know it, or
- // be able to reach it recursively by walking up the dominator tree.
-Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
- DominatorTree *DT) {
- if (!R.Defines.count(BB)) {
- if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
- BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
- Value *V = computeValueAt(IDom, R, DT);
- R.Defines[BB] = V;
- } else
- R.Defines[BB] = UndefValue::get(R.Ty);
- }
- return R.Defines[BB];
-}
-
-/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
-/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
-static void
-ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
- const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
- SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
- PredIteratorCache &PredCache) {
- // To determine liveness, we must iterate through the predecessors of blocks
- // where the def is live. Blocks are added to the worklist if we need to
- // check their predecessors. Start with all the using blocks.
- SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
- UsingBlocks.end());
-
- // Now that we have a set of blocks where the phi is live-in, recursively add
- // their predecessors until we find the full region the value is live.
- while (!LiveInBlockWorklist.empty()) {
- BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
-
- // The block really is live in here, insert it into the set. If already in
- // the set, then it has already been processed.
- if (!LiveInBlocks.insert(BB).second)
- continue;
-
- // Since the value is live into BB, it is either defined in a predecessor or
- // live into it too. Add the preds to the worklist unless they are a
- // defining block.
- for (BasicBlock *P : PredCache.get(BB)) {
- // The value is not live into a predecessor if it defines the value.
- if (DefBlocks.count(P))
- continue;
-
- // Otherwise it is, add to the worklist.
- LiveInBlockWorklist.push_back(P);
- }
- }
-}
-
-/// Perform all the necessary updates, including new PHI-nodes insertion and the
-/// requested uses update.
-void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
- SmallVectorImpl<PHINode *> *InsertedPHIs) {
- for (auto &R : Rewrites) {
- // Compute locations for new phi-nodes.
- // For that we need to initialize DefBlocks from definitions in R.Defines,
- // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
- // this set for computing iterated dominance frontier (IDF).
- // The IDF blocks are the blocks where we need to insert new phi-nodes.
- ForwardIDFCalculator IDF(*DT);
- LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
- << " use(s)\n");
-
- SmallPtrSet<BasicBlock *, 2> DefBlocks;
- for (auto &Def : R.Defines)
- DefBlocks.insert(Def.first);
- IDF.setDefiningBlocks(DefBlocks);
-
- SmallPtrSet<BasicBlock *, 2> UsingBlocks;
- for (Use *U : R.Uses)
- UsingBlocks.insert(getUserBB(U));
-
- SmallVector<BasicBlock *, 32> IDFBlocks;
- SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
- ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
- IDF.resetLiveInBlocks();
- IDF.setLiveInBlocks(LiveInBlocks);
- IDF.calculate(IDFBlocks);
-
- // We've computed IDF, now insert new phi-nodes there.
- SmallVector<PHINode *, 4> InsertedPHIsForVar;
- for (auto *FrontierBB : IDFBlocks) {
- IRBuilder<> B(FrontierBB, FrontierBB->begin());
- PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
- R.Defines[FrontierBB] = PN;
- InsertedPHIsForVar.push_back(PN);
- if (InsertedPHIs)
- InsertedPHIs->push_back(PN);
- }
-
- // Fill in arguments of the inserted PHIs.
- for (auto *PN : InsertedPHIsForVar) {
- BasicBlock *PBB = PN->getParent();
- for (BasicBlock *Pred : PredCache.get(PBB))
- PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
- }
-
- // Rewrite actual uses with the inserted definitions.
- SmallPtrSet<Use *, 4> ProcessedUses;
- for (Use *U : R.Uses) {
- if (!ProcessedUses.insert(U).second)
- continue;
- Value *V = computeValueAt(getUserBB(U), R, DT);
- Value *OldVal = U->get();
- assert(OldVal && "Invalid use!");
- // Notify users of the existing value that it is being replaced.
- if (OldVal != V && OldVal->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(OldVal, V);
- LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
- << "\n");
- U->set(V);
- }
- }
-}
+//===- SSAUpdaterBulk.cpp - Unstructured SSA Update Tool ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SSAUpdaterBulk class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
+#include "llvm/Analysis/IteratedDominanceFrontier.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/Value.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ssaupdaterbulk"
+
+/// Helper function for finding a block which should have a value for the given
+/// user. For PHI-nodes this block is the corresponding predecessor, for other
+/// instructions it's their parent block.
+static BasicBlock *getUserBB(Use *U) {
+ auto *User = cast<Instruction>(U->getUser());
+
+ if (auto *UserPN = dyn_cast<PHINode>(User))
+ return UserPN->getIncomingBlock(*U);
+ else
+ return User->getParent();
+}
+
+/// Add a new variable to the SSA rewriter. This needs to be called before
+/// AddAvailableValue or AddUse calls.
+unsigned SSAUpdaterBulk::AddVariable(StringRef Name, Type *Ty) {
+ unsigned Var = Rewrites.size();
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": initialized with Ty = "
+ << *Ty << ", Name = " << Name << "\n");
+ RewriteInfo RI(Name, Ty);
+ Rewrites.push_back(RI);
+ return Var;
+}
+
+/// Indicate that a rewritten value is available in the specified block with the
+/// specified value.
+void SSAUpdaterBulk::AddAvailableValue(unsigned Var, BasicBlock *BB, Value *V) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var
+ << ": added new available value" << *V << " in "
+ << BB->getName() << "\n");
+ Rewrites[Var].Defines[BB] = V;
+}
+
+/// Record a use of the symbolic value. This use will be updated with a
+/// rewritten value when RewriteAllUses is called.
+void SSAUpdaterBulk::AddUse(unsigned Var, Use *U) {
+ assert(Var < Rewrites.size() && "Variable not found!");
+ LLVM_DEBUG(dbgs() << "SSAUpdater: Var=" << Var << ": added a use" << *U->get()
+ << " in " << getUserBB(U)->getName() << "\n");
+ Rewrites[Var].Uses.push_back(U);
+}
+
+/// Return true if the SSAUpdater already has a value for the specified variable
+/// in the specified block.
+bool SSAUpdaterBulk::HasValueForBlock(unsigned Var, BasicBlock *BB) {
+ return (Var < Rewrites.size()) ? Rewrites[Var].Defines.count(BB) : false;
+}
+
+ // Compute the value at the given block BB. We should either already know it, or
+ // be able to reach it recursively by walking up the dominator tree.
+Value *SSAUpdaterBulk::computeValueAt(BasicBlock *BB, RewriteInfo &R,
+ DominatorTree *DT) {
+ if (!R.Defines.count(BB)) {
+ if (DT->isReachableFromEntry(BB) && PredCache.get(BB).size()) {
+ BasicBlock *IDom = DT->getNode(BB)->getIDom()->getBlock();
+ Value *V = computeValueAt(IDom, R, DT);
+ R.Defines[BB] = V;
+ } else
+ R.Defines[BB] = UndefValue::get(R.Ty);
+ }
+ return R.Defines[BB];
+}
+
+/// Given sets of UsingBlocks and DefBlocks, compute the set of LiveInBlocks.
+/// This is basically a subgraph limited by DefBlocks and UsingBlocks.
+static void
+ComputeLiveInBlocks(const SmallPtrSetImpl<BasicBlock *> &UsingBlocks,
+ const SmallPtrSetImpl<BasicBlock *> &DefBlocks,
+ SmallPtrSetImpl<BasicBlock *> &LiveInBlocks,
+ PredIteratorCache &PredCache) {
+ // To determine liveness, we must iterate through the predecessors of blocks
+ // where the def is live. Blocks are added to the worklist if we need to
+ // check their predecessors. Start with all the using blocks.
+ SmallVector<BasicBlock *, 64> LiveInBlockWorklist(UsingBlocks.begin(),
+ UsingBlocks.end());
+
+ // Now that we have a set of blocks where the phi is live-in, recursively add
+ // their predecessors until we find the full region the value is live.
+ while (!LiveInBlockWorklist.empty()) {
+ BasicBlock *BB = LiveInBlockWorklist.pop_back_val();
+
+ // The block really is live in here, insert it into the set. If already in
+ // the set, then it has already been processed.
+ if (!LiveInBlocks.insert(BB).second)
+ continue;
+
+ // Since the value is live into BB, it is either defined in a predecessor or
+ // live into it too. Add the preds to the worklist unless they are a
+ // defining block.
+ for (BasicBlock *P : PredCache.get(BB)) {
+ // The value is not live into a predecessor if it defines the value.
+ if (DefBlocks.count(P))
+ continue;
+
+ // Otherwise it is, add to the worklist.
+ LiveInBlockWorklist.push_back(P);
+ }
+ }
+}
+
+/// Perform all the necessary updates, including new PHI-nodes insertion and the
+/// requested uses update.
+void SSAUpdaterBulk::RewriteAllUses(DominatorTree *DT,
+ SmallVectorImpl<PHINode *> *InsertedPHIs) {
+ for (auto &R : Rewrites) {
+ // Compute locations for new phi-nodes.
+ // For that we need to initialize DefBlocks from definitions in R.Defines,
+ // UsingBlocks from uses in R.Uses, then compute LiveInBlocks, and then use
+ // this set for computing iterated dominance frontier (IDF).
+ // The IDF blocks are the blocks where we need to insert new phi-nodes.
+ ForwardIDFCalculator IDF(*DT);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: rewriting " << R.Uses.size()
+ << " use(s)\n");
+
+ SmallPtrSet<BasicBlock *, 2> DefBlocks;
+ for (auto &Def : R.Defines)
+ DefBlocks.insert(Def.first);
+ IDF.setDefiningBlocks(DefBlocks);
+
+ SmallPtrSet<BasicBlock *, 2> UsingBlocks;
+ for (Use *U : R.Uses)
+ UsingBlocks.insert(getUserBB(U));
+
+ SmallVector<BasicBlock *, 32> IDFBlocks;
+ SmallPtrSet<BasicBlock *, 32> LiveInBlocks;
+ ComputeLiveInBlocks(UsingBlocks, DefBlocks, LiveInBlocks, PredCache);
+ IDF.resetLiveInBlocks();
+ IDF.setLiveInBlocks(LiveInBlocks);
+ IDF.calculate(IDFBlocks);
+
+ // We've computed IDF, now insert new phi-nodes there.
+ SmallVector<PHINode *, 4> InsertedPHIsForVar;
+ for (auto *FrontierBB : IDFBlocks) {
+ IRBuilder<> B(FrontierBB, FrontierBB->begin());
+ PHINode *PN = B.CreatePHI(R.Ty, 0, R.Name);
+ R.Defines[FrontierBB] = PN;
+ InsertedPHIsForVar.push_back(PN);
+ if (InsertedPHIs)
+ InsertedPHIs->push_back(PN);
+ }
+
+ // Fill in arguments of the inserted PHIs.
+ for (auto *PN : InsertedPHIsForVar) {
+ BasicBlock *PBB = PN->getParent();
+ for (BasicBlock *Pred : PredCache.get(PBB))
+ PN->addIncoming(computeValueAt(Pred, R, DT), Pred);
+ }
+
+ // Rewrite actual uses with the inserted definitions.
+ SmallPtrSet<Use *, 4> ProcessedUses;
+ for (Use *U : R.Uses) {
+ if (!ProcessedUses.insert(U).second)
+ continue;
+ Value *V = computeValueAt(getUserBB(U), R, DT);
+ Value *OldVal = U->get();
+ assert(OldVal && "Invalid use!");
+ // Notify users of the existing value that it is being replaced.
+ if (OldVal != V && OldVal->hasValueHandle())
+ ValueHandleBase::ValueIsRAUWd(OldVal, V);
+ LLVM_DEBUG(dbgs() << "SSAUpdater: replacing " << *OldVal << " with " << *V
+ << "\n");
+ U->set(V);
+ }
+ }
+}
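
For reference, the calling convention this file implements looks roughly like the sketch below. The rewriteInBulk() wrapper and the value/block names are hypothetical; AddVariable, AddAvailableValue, AddUse and RewriteAllUses are the SSAUpdaterBulk methods shown above.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Transforms/Utils/SSAUpdaterBulk.h"
    using namespace llvm;

    // Rewrite a set of uses of one symbolic variable that has two reaching
    // definitions, letting the updater place any required phi-nodes at the IDF.
    void rewriteInBulk(DominatorTree &DT, BasicBlock *BB1, Value *V1,
                       BasicBlock *BB2, Value *V2, ArrayRef<Use *> Uses) {
      SSAUpdaterBulk Updater;
      unsigned Var = Updater.AddVariable("x", V1->getType());
      Updater.AddAvailableValue(Var, BB1, V1);
      Updater.AddAvailableValue(Var, BB2, V2);
      for (Use *U : Uses)
        Updater.AddUse(Var, U);
      SmallVector<PHINode *, 8> NewPHIs;
      Updater.RewriteAllUses(&DT, &NewPHIs); // phis inserted at the IDF, uses rewritten
    }
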
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
index acecebe646..a1313c77ed 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SanitizerStats.cpp
@@ -1,107 +1,107 @@
-//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Implements code generation for sanitizer statistics gathering.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SanitizerStats.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
- StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
- EmptyModuleStatsTy = makeModuleStatsTy();
-
- ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
- GlobalValue::InternalLinkage, nullptr);
-}
-
-ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
- return ArrayType::get(StatTy, Inits.size());
-}
-
-StructType *SanitizerStatReport::makeModuleStatsTy() {
- return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
- Type::getInt32Ty(M->getContext()),
- makeModuleStatsArrayTy()});
-}
-
-void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
- Function *F = B.GetInsertBlock()->getParent();
- Module *M = F->getParent();
- PointerType *Int8PtrTy = B.getInt8PtrTy();
- IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
- ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
-
- Inits.push_back(ConstantArray::get(
- StatTy,
- {Constant::getNullValue(Int8PtrTy),
- ConstantExpr::getIntToPtr(
- ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
- kSanitizerStatKindBits)),
- Int8PtrTy)}));
-
- FunctionType *StatReportTy =
- FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
- FunctionCallee StatReport =
- M->getOrInsertFunction("__sanitizer_stat_report", StatReportTy);
-
- auto InitAddr = ConstantExpr::getGetElementPtr(
- EmptyModuleStatsTy, ModuleStatsGV,
- ArrayRef<Constant *>{
- ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
- ConstantInt::get(IntPtrTy, Inits.size() - 1),
- });
- B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
-}
-
-void SanitizerStatReport::finish() {
- if (Inits.empty()) {
- ModuleStatsGV->eraseFromParent();
- return;
- }
-
- PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
- IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
- Type *VoidTy = Type::getVoidTy(M->getContext());
-
- // Create a new ModuleStatsGV to replace the old one. We can't just set the
- // old one's initializer because its type is different.
- auto NewModuleStatsGV = new GlobalVariable(
- *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
- ConstantStruct::getAnon(
- {Constant::getNullValue(Int8PtrTy),
- ConstantInt::get(Int32Ty, Inits.size()),
- ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
- ModuleStatsGV->replaceAllUsesWith(
- ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
- ModuleStatsGV->eraseFromParent();
-
- // Create a global constructor to register NewModuleStatsGV.
- auto F = Function::Create(FunctionType::get(VoidTy, false),
- GlobalValue::InternalLinkage, "", M);
- auto BB = BasicBlock::Create(M->getContext(), "", F);
- IRBuilder<> B(BB);
-
- FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
- FunctionCallee StatInit =
- M->getOrInsertFunction("__sanitizer_stat_init", StatInitTy);
-
- B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
- B.CreateRetVoid();
-
- appendToGlobalCtors(*M, F, 0);
-}
+//===- SanitizerStats.cpp - Sanitizer statistics gathering ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements code generation for sanitizer statistics gathering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SanitizerStats.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+SanitizerStatReport::SanitizerStatReport(Module *M) : M(M) {
+ StatTy = ArrayType::get(Type::getInt8PtrTy(M->getContext()), 2);
+ EmptyModuleStatsTy = makeModuleStatsTy();
+
+ ModuleStatsGV = new GlobalVariable(*M, EmptyModuleStatsTy, false,
+ GlobalValue::InternalLinkage, nullptr);
+}
+
+ArrayType *SanitizerStatReport::makeModuleStatsArrayTy() {
+ return ArrayType::get(StatTy, Inits.size());
+}
+
+StructType *SanitizerStatReport::makeModuleStatsTy() {
+ return StructType::get(M->getContext(), {Type::getInt8PtrTy(M->getContext()),
+ Type::getInt32Ty(M->getContext()),
+ makeModuleStatsArrayTy()});
+}
+
+void SanitizerStatReport::create(IRBuilder<> &B, SanitizerStatKind SK) {
+ Function *F = B.GetInsertBlock()->getParent();
+ Module *M = F->getParent();
+ PointerType *Int8PtrTy = B.getInt8PtrTy();
+ IntegerType *IntPtrTy = B.getIntPtrTy(M->getDataLayout());
+ ArrayType *StatTy = ArrayType::get(Int8PtrTy, 2);
+
+ Inits.push_back(ConstantArray::get(
+ StatTy,
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantExpr::getIntToPtr(
+ ConstantInt::get(IntPtrTy, uint64_t(SK) << (IntPtrTy->getBitWidth() -
+ kSanitizerStatKindBits)),
+ Int8PtrTy)}));
+
+ FunctionType *StatReportTy =
+ FunctionType::get(B.getVoidTy(), Int8PtrTy, false);
+ FunctionCallee StatReport =
+ M->getOrInsertFunction("__sanitizer_stat_report", StatReportTy);
+
+ auto InitAddr = ConstantExpr::getGetElementPtr(
+ EmptyModuleStatsTy, ModuleStatsGV,
+ ArrayRef<Constant *>{
+ ConstantInt::get(IntPtrTy, 0), ConstantInt::get(B.getInt32Ty(), 2),
+ ConstantInt::get(IntPtrTy, Inits.size() - 1),
+ });
+ B.CreateCall(StatReport, ConstantExpr::getBitCast(InitAddr, Int8PtrTy));
+}
+
+void SanitizerStatReport::finish() {
+ if (Inits.empty()) {
+ ModuleStatsGV->eraseFromParent();
+ return;
+ }
+
+ PointerType *Int8PtrTy = Type::getInt8PtrTy(M->getContext());
+ IntegerType *Int32Ty = Type::getInt32Ty(M->getContext());
+ Type *VoidTy = Type::getVoidTy(M->getContext());
+
+ // Create a new ModuleStatsGV to replace the old one. We can't just set the
+ // old one's initializer because its type is different.
+ auto NewModuleStatsGV = new GlobalVariable(
+ *M, makeModuleStatsTy(), false, GlobalValue::InternalLinkage,
+ ConstantStruct::getAnon(
+ {Constant::getNullValue(Int8PtrTy),
+ ConstantInt::get(Int32Ty, Inits.size()),
+ ConstantArray::get(makeModuleStatsArrayTy(), Inits)}));
+ ModuleStatsGV->replaceAllUsesWith(
+ ConstantExpr::getBitCast(NewModuleStatsGV, ModuleStatsGV->getType()));
+ ModuleStatsGV->eraseFromParent();
+
+ // Create a global constructor to register NewModuleStatsGV.
+ auto F = Function::Create(FunctionType::get(VoidTy, false),
+ GlobalValue::InternalLinkage, "", M);
+ auto BB = BasicBlock::Create(M->getContext(), "", F);
+ IRBuilder<> B(BB);
+
+ FunctionType *StatInitTy = FunctionType::get(VoidTy, Int8PtrTy, false);
+ FunctionCallee StatInit =
+ M->getOrInsertFunction("__sanitizer_stat_init", StatInitTy);
+
+ B.CreateCall(StatInit, ConstantExpr::getBitCast(NewModuleStatsGV, Int8PtrTy));
+ B.CreateRetVoid();
+
+ appendToGlobalCtors(*M, F, 0);
+}
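
The class above is driven by the sanitizer/CFI instrumentation passes; a hedged sketch of that usage pattern follows. The instrumentModule() wrapper and the CFIChecks list are illustrative only, while SanitizerStatReport::create()/finish() and the SanStat_CFI_VCall kind are declared in SanitizerStats.h.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Utils/SanitizerStats.h"
    using namespace llvm;

    // Emit one statistics counter bump before each check, then materialize the
    // per-module stats global and the constructor that registers it.
    static void instrumentModule(Module &M, ArrayRef<Instruction *> CFIChecks) {
      SanitizerStatReport Stats(&M);
      for (Instruction *I : CFIChecks) {
        IRBuilder<> B(I);                    // insert the call right before the check
        Stats.create(B, SanStat_CFI_VCall);  // adds a slot, calls __sanitizer_stat_report
      }
      Stats.finish();                        // builds the global and the __sanitizer_stat_init ctor
    }
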
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index 3aedba4753..6dbfb0b61f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -1,70 +1,70 @@
-//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the scalar evolution expander,
-// which is used to generate the code corresponding to a given scalar evolution
-// expression.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+//===- ScalarEvolutionExpander.cpp - Scalar Evolution Analysis ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the scalar evolution expander,
+// which is used to generate the code corresponding to a given scalar evolution
+// expression.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-
-using namespace llvm;
-
-cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
- "scev-cheap-expansion-budget", cl::Hidden, cl::init(4),
- cl::desc("When performing SCEV expansion only if it is cheap to do, this "
- "controls the budget that is considered cheap (default = 4)"));
-
-using namespace PatternMatch;
-
-/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
+
+using namespace llvm;
+
+cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
+ "scev-cheap-expansion-budget", cl::Hidden, cl::init(4),
+ cl::desc("When performing SCEV expansion only if it is cheap to do, this "
+ "controls the budget that is considered cheap (default = 4)"));
+
+using namespace PatternMatch;
+
+/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
/// reusing an existing cast if a suitable one (= dominating IP) exists, or
-/// creating a new one.
-Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
- Instruction::CastOps Op,
- BasicBlock::iterator IP) {
- // This function must be called with the builder having a valid insertion
- // point. It doesn't need to be the actual IP where the uses of the returned
- // cast will be added, but it must dominate such IP.
- // We use this precondition to produce a cast that will dominate all its
- // uses. In particular, this is crucial for the case where the builder's
- // insertion point *is* the point where we were asked to put the cast.
- // Since we don't know the builder's insertion point is actually
- // where the uses will be added (only that it dominates it), we are
- // not allowed to move it.
- BasicBlock::iterator BIP = Builder.GetInsertPoint();
-
- Instruction *Ret = nullptr;
-
- // Check to see if there is already a cast!
+/// creating a new one.
+Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
+ Instruction::CastOps Op,
+ BasicBlock::iterator IP) {
+ // This function must be called with the builder having a valid insertion
+ // point. It doesn't need to be the actual IP where the uses of the returned
+ // cast will be added, but it must dominate such IP.
+ // We use this precondition to produce a cast that will dominate all its
+ // uses. In particular, this is crucial for the case where the builder's
+ // insertion point *is* the point where we were asked to put the cast.
+ // Since we don't know the builder's insertion point is actually
+ // where the uses will be added (only that it dominates it), we are
+ // not allowed to move it.
+ BasicBlock::iterator BIP = Builder.GetInsertPoint();
+
+ Instruction *Ret = nullptr;
+
+ // Check to see if there is already a cast!
for (User *U : V->users()) {
if (U->getType() != Ty)
continue;
CastInst *CI = dyn_cast<CastInst>(U);
if (!CI || CI->getOpcode() != Op)
continue;
-
+
// Found a suitable cast that is at IP or comes before IP. Use it. Note that
// the cast must also properly dominate the Builder's insertion point.
if (IP->getParent() == CI->getParent() && &*BIP != CI &&
@@ -74,58 +74,58 @@ Value *SCEVExpander::ReuseOrCreateCast(Value *V, Type *Ty,
}
}
- // Create a new cast.
+ // Create a new cast.
if (!Ret) {
- Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP);
+ Ret = CastInst::Create(Op, V, Ty, V->getName(), &*IP);
rememberInstruction(Ret);
}
-
- // We assert at the end of the function since IP might point to an
- // instruction with different dominance properties than a cast
- // (an invoke for example) and not dominate BIP (but the cast does).
- assert(SE.DT.dominates(Ret, &*BIP));
-
- return Ret;
-}
-
+
+ // We assert at the end of the function since IP might point to an
+ // instruction with different dominance properties than a cast
+ // (an invoke for example) and not dominate BIP (but the cast does).
+ assert(SE.DT.dominates(Ret, &*BIP));
+
+ return Ret;
+}
+
BasicBlock::iterator
SCEVExpander::findInsertPointAfter(Instruction *I, Instruction *MustDominate) {
- BasicBlock::iterator IP = ++I->getIterator();
- if (auto *II = dyn_cast<InvokeInst>(I))
- IP = II->getNormalDest()->begin();
-
- while (isa<PHINode>(IP))
- ++IP;
-
- if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
- ++IP;
- } else if (isa<CatchSwitchInst>(IP)) {
+ BasicBlock::iterator IP = ++I->getIterator();
+ if (auto *II = dyn_cast<InvokeInst>(I))
+ IP = II->getNormalDest()->begin();
+
+ while (isa<PHINode>(IP))
+ ++IP;
+
+ if (isa<FuncletPadInst>(IP) || isa<LandingPadInst>(IP)) {
+ ++IP;
+ } else if (isa<CatchSwitchInst>(IP)) {
IP = MustDominate->getParent()->getFirstInsertionPt();
- } else {
- assert(!IP->isEHPad() && "unexpected eh pad!");
- }
-
+ } else {
+ assert(!IP->isEHPad() && "unexpected eh pad!");
+ }
+
// Adjust insert point to be after instructions inserted by the expander, so
// we can re-use already inserted instructions. Avoid skipping past the
// original \p MustDominate, in case it is an inserted instruction.
while (isInsertedInstruction(&*IP) && &*IP != MustDominate)
++IP;
- return IP;
-}
-
-/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
-/// which must be possible with a noop cast, doing what we can to share
-/// the casts.
-Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
- Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
- assert((Op == Instruction::BitCast ||
- Op == Instruction::PtrToInt ||
- Op == Instruction::IntToPtr) &&
- "InsertNoopCastOfTo cannot perform non-noop casts!");
- assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
- "InsertNoopCastOfTo cannot change sizes!");
-
+ return IP;
+}
+
+/// InsertNoopCastOfTo - Insert a cast of V to the specified type,
+/// which must be possible with a noop cast, doing what we can to share
+/// the casts.
+Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
+ Instruction::CastOps Op = CastInst::getCastOpcode(V, false, Ty, false);
+ assert((Op == Instruction::BitCast ||
+ Op == Instruction::PtrToInt ||
+ Op == Instruction::IntToPtr) &&
+ "InsertNoopCastOfTo cannot perform non-noop casts!");
+ assert(SE.getTypeSizeInBits(V->getType()) == SE.getTypeSizeInBits(Ty) &&
+ "InsertNoopCastOfTo cannot change sizes!");
+
// inttoptr only works for integral pointers. For non-integral pointers, we
// can create a GEP on i8* null with the integral value as index. Note that
// it is safe to use GEP of null instead of inttoptr here, because only
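
Most passes never call ReuseOrCreateCast or InsertNoopCastOfTo directly: they construct an SCEVExpander and call expandCodeFor, and the cast-reuse logic above runs underneath. A hedged sketch under that assumption (the emitTripCount() helper and its names are illustrative; expandCodeFor and getBackedgeTakenCount are the real entry points):

    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/Analysis/ScalarEvolution.h"
    #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
    using namespace llvm;

    // Expand the loop's backedge-taken count to IR at the preheader terminator.
    static Value *emitTripCount(Loop *L, ScalarEvolution &SE, const DataLayout &DL) {
      const SCEV *BTC = SE.getBackedgeTakenCount(L);
      BasicBlock *Preheader = L->getLoopPreheader();
      if (isa<SCEVCouldNotCompute>(BTC) || !Preheader)
        return nullptr;
      SCEVExpander Expander(SE, DL, "btc");
      // Any casts needed during expansion are reused or created by the code above.
      return Expander.expandCodeFor(BTC, BTC->getType(), Preheader->getTerminator());
    }
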
@@ -142,1070 +142,1070 @@ Value *SCEVExpander::InsertNoopCastOfTo(Value *V, Type *Ty) {
return Builder.CreateBitCast(GEP, Ty);
}
}
- // Short-circuit unnecessary bitcasts.
- if (Op == Instruction::BitCast) {
- if (V->getType() == Ty)
- return V;
- if (CastInst *CI = dyn_cast<CastInst>(V)) {
- if (CI->getOperand(0)->getType() == Ty)
- return CI->getOperand(0);
- }
- }
- // Short-circuit unnecessary inttoptr<->ptrtoint casts.
- if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
- if (CastInst *CI = dyn_cast<CastInst>(V))
- if ((CI->getOpcode() == Instruction::PtrToInt ||
- CI->getOpcode() == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(CI->getType()) ==
- SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
- return CI->getOperand(0);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if ((CE->getOpcode() == Instruction::PtrToInt ||
- CE->getOpcode() == Instruction::IntToPtr) &&
- SE.getTypeSizeInBits(CE->getType()) ==
- SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
- return CE->getOperand(0);
- }
-
- // Fold a cast of a constant.
- if (Constant *C = dyn_cast<Constant>(V))
- return ConstantExpr::getCast(Op, C, Ty);
-
- // Cast the argument at the beginning of the entry block, after
- // any bitcasts of other arguments.
- if (Argument *A = dyn_cast<Argument>(V)) {
- BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
- while ((isa<BitCastInst>(IP) &&
- isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
- cast<BitCastInst>(IP)->getOperand(0) != A) ||
- isa<DbgInfoIntrinsic>(IP))
- ++IP;
- return ReuseOrCreateCast(A, Ty, Op, IP);
- }
-
- // Cast the instruction immediately after the instruction.
- Instruction *I = cast<Instruction>(V);
+ // Short-circuit unnecessary bitcasts.
+ if (Op == Instruction::BitCast) {
+ if (V->getType() == Ty)
+ return V;
+ if (CastInst *CI = dyn_cast<CastInst>(V)) {
+ if (CI->getOperand(0)->getType() == Ty)
+ return CI->getOperand(0);
+ }
+ }
+ // Short-circuit unnecessary inttoptr<->ptrtoint casts.
+ if ((Op == Instruction::PtrToInt || Op == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(V->getType())) {
+ if (CastInst *CI = dyn_cast<CastInst>(V))
+ if ((CI->getOpcode() == Instruction::PtrToInt ||
+ CI->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CI->getType()) ==
+ SE.getTypeSizeInBits(CI->getOperand(0)->getType()))
+ return CI->getOperand(0);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if ((CE->getOpcode() == Instruction::PtrToInt ||
+ CE->getOpcode() == Instruction::IntToPtr) &&
+ SE.getTypeSizeInBits(CE->getType()) ==
+ SE.getTypeSizeInBits(CE->getOperand(0)->getType()))
+ return CE->getOperand(0);
+ }
+
+ // Fold a cast of a constant.
+ if (Constant *C = dyn_cast<Constant>(V))
+ return ConstantExpr::getCast(Op, C, Ty);
+
+ // Cast the argument at the beginning of the entry block, after
+ // any bitcasts of other arguments.
+ if (Argument *A = dyn_cast<Argument>(V)) {
+ BasicBlock::iterator IP = A->getParent()->getEntryBlock().begin();
+ while ((isa<BitCastInst>(IP) &&
+ isa<Argument>(cast<BitCastInst>(IP)->getOperand(0)) &&
+ cast<BitCastInst>(IP)->getOperand(0) != A) ||
+ isa<DbgInfoIntrinsic>(IP))
+ ++IP;
+ return ReuseOrCreateCast(A, Ty, Op, IP);
+ }
+
+ // Cast the instruction immediately after the instruction.
+ Instruction *I = cast<Instruction>(V);
BasicBlock::iterator IP = findInsertPointAfter(I, &*Builder.GetInsertPoint());
- return ReuseOrCreateCast(I, Ty, Op, IP);
-}
-
-/// InsertBinop - Insert the specified binary operator, doing a small amount
-/// of work to avoid inserting an obviously redundant operation, and hoisting
-/// to an outer loop when the opportunity is there and it is safe.
-Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
- Value *LHS, Value *RHS,
- SCEV::NoWrapFlags Flags, bool IsSafeToHoist) {
- // Fold a binop with constant operands.
- if (Constant *CLHS = dyn_cast<Constant>(LHS))
- if (Constant *CRHS = dyn_cast<Constant>(RHS))
- return ConstantExpr::get(Opcode, CLHS, CRHS);
-
- // Do a quick scan to see if we have this binop nearby. If so, reuse it.
- unsigned ScanLimit = 6;
- BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
- // Scanning starts from the last instruction before the insertion point.
- BasicBlock::iterator IP = Builder.GetInsertPoint();
- if (IP != BlockBegin) {
- --IP;
- for (; ScanLimit; --IP, --ScanLimit) {
- // Don't count dbg.value against the ScanLimit, to avoid perturbing the
- // generated code.
- if (isa<DbgInfoIntrinsic>(IP))
- ScanLimit++;
-
- auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
- // Ensure that no-wrap flags match.
- if (isa<OverflowingBinaryOperator>(I)) {
- if (I->hasNoSignedWrap() != (Flags & SCEV::FlagNSW))
- return true;
- if (I->hasNoUnsignedWrap() != (Flags & SCEV::FlagNUW))
- return true;
- }
- // Conservatively, do not use any instruction which has any exact
- // flags set.
- if (isa<PossiblyExactOperator>(I) && I->isExact())
- return true;
- return false;
- };
- if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
- IP->getOperand(1) == RHS && !canGenerateIncompatiblePoison(&*IP))
- return &*IP;
- if (IP == BlockBegin) break;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
- SCEVInsertPointGuard Guard(Builder, this);
-
- if (IsSafeToHoist) {
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break;
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
- }
-
- // If we haven't found this binop, insert it.
- Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
- BO->setDebugLoc(Loc);
- if (Flags & SCEV::FlagNUW)
- BO->setHasNoUnsignedWrap();
- if (Flags & SCEV::FlagNSW)
- BO->setHasNoSignedWrap();
-
- return BO;
-}
-
-/// FactorOutConstant - Test if S is divisible by Factor, using signed
-/// division. If so, update S with Factor divided out and return true.
-/// S need not be evenly divisible if a reasonable remainder can be
-/// computed.
-static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder,
- const SCEV *Factor, ScalarEvolution &SE,
- const DataLayout &DL) {
- // Everything is divisible by one.
- if (Factor->isOne())
- return true;
-
- // x/x == 1.
- if (S == Factor) {
- S = SE.getConstant(S->getType(), 1);
- return true;
- }
-
- // For a Constant, check for a multiple of the given factor.
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
- // 0/x == 0.
- if (C->isZero())
- return true;
- // Check for divisibility.
- if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) {
- ConstantInt *CI =
- ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt()));
- // If the quotient is zero and the remainder is non-zero, reject
- // the value at this scale. It will be considered for subsequent
- // smaller scales.
- if (!CI->isZero()) {
- const SCEV *Div = SE.getConstant(CI);
- S = Div;
- Remainder = SE.getAddExpr(
- Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt())));
- return true;
- }
- }
- }
-
- // In a Mul, check if there is a constant operand which is a multiple
- // of the given factor.
- if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
- // Size is known, check if there is a constant operand which is a multiple
- // of the given factor. If so, we can factor it.
- if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor))
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
- if (!C->getAPInt().srem(FC->getAPInt())) {
+ return ReuseOrCreateCast(I, Ty, Op, IP);
+}
+
+/// InsertBinop - Insert the specified binary operator, doing a small amount
+/// of work to avoid inserting an obviously redundant operation, and hoisting
+/// to an outer loop when the opportunity is there and it is safe.
+Value *SCEVExpander::InsertBinop(Instruction::BinaryOps Opcode,
+ Value *LHS, Value *RHS,
+ SCEV::NoWrapFlags Flags, bool IsSafeToHoist) {
+ // Fold a binop with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(LHS))
+ if (Constant *CRHS = dyn_cast<Constant>(RHS))
+ return ConstantExpr::get(Opcode, CLHS, CRHS);
+
+ // Do a quick scan to see if we have this binop nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+ // Scanning starts from the last instruction before the insertion point.
+ BasicBlock::iterator IP = Builder.GetInsertPoint();
+ if (IP != BlockBegin) {
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+ // generated code.
+ if (isa<DbgInfoIntrinsic>(IP))
+ ScanLimit++;
+
+ auto canGenerateIncompatiblePoison = [&Flags](Instruction *I) {
+ // Ensure that no-wrap flags match.
+ if (isa<OverflowingBinaryOperator>(I)) {
+ if (I->hasNoSignedWrap() != (Flags & SCEV::FlagNSW))
+ return true;
+ if (I->hasNoUnsignedWrap() != (Flags & SCEV::FlagNUW))
+ return true;
+ }
+ // Conservatively, do not use any instruction which has any exact
+ // flags set.
+ if (isa<PossiblyExactOperator>(I) && I->isExact())
+ return true;
+ return false;
+ };
+ if (IP->getOpcode() == (unsigned)Opcode && IP->getOperand(0) == LHS &&
+ IP->getOperand(1) == RHS && !canGenerateIncompatiblePoison(&*IP))
+ return &*IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ DebugLoc Loc = Builder.GetInsertPoint()->getDebugLoc();
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ if (IsSafeToHoist) {
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(LHS) || !L->isLoopInvariant(RHS)) break;
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+ }
+
+ // If we haven't found this binop, insert it.
+ Instruction *BO = cast<Instruction>(Builder.CreateBinOp(Opcode, LHS, RHS));
+ BO->setDebugLoc(Loc);
+ if (Flags & SCEV::FlagNUW)
+ BO->setHasNoUnsignedWrap();
+ if (Flags & SCEV::FlagNSW)
+ BO->setHasNoSignedWrap();
+
+ return BO;
+}
+
+/// FactorOutConstant - Test if S is divisible by Factor, using signed
+/// division. If so, update S with Factor divided out and return true.
+/// S need not be evenly divisible if a reasonable remainder can be
+/// computed.
+static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder,
+ const SCEV *Factor, ScalarEvolution &SE,
+ const DataLayout &DL) {
+ // Everything is divisible by one.
+ if (Factor->isOne())
+ return true;
+
+ // x/x == 1.
+ if (S == Factor) {
+ S = SE.getConstant(S->getType(), 1);
+ return true;
+ }
+
+ // For a Constant, check for a multiple of the given factor.
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) {
+ // 0/x == 0.
+ if (C->isZero())
+ return true;
+ // Check for divisibility.
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor)) {
+ ConstantInt *CI =
+ ConstantInt::get(SE.getContext(), C->getAPInt().sdiv(FC->getAPInt()));
+ // If the quotient is zero and the remainder is non-zero, reject
+ // the value at this scale. It will be considered for subsequent
+ // smaller scales.
+ if (!CI->isZero()) {
+ const SCEV *Div = SE.getConstant(CI);
+ S = Div;
+ Remainder = SE.getAddExpr(
+ Remainder, SE.getConstant(C->getAPInt().srem(FC->getAPInt())));
+ return true;
+ }
+ }
+ }
+
+ // In a Mul, check if there is a constant operand which is a multiple
+ // of the given factor.
+ if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+ // Size is known, check if there is a constant operand which is a multiple
+ // of the given factor. If so, we can factor it.
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor))
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+ if (!C->getAPInt().srem(FC->getAPInt())) {
SmallVector<const SCEV *, 4> NewMulOps(M->operands());
- NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
- S = SE.getMulExpr(NewMulOps);
- return true;
- }
- }
-
- // In an AddRec, check if both start and step are divisible.
- if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEV *Step = A->getStepRecurrence(SE);
- const SCEV *StepRem = SE.getConstant(Step->getType(), 0);
- if (!FactorOutConstant(Step, StepRem, Factor, SE, DL))
- return false;
- if (!StepRem->isZero())
- return false;
- const SCEV *Start = A->getStart();
- if (!FactorOutConstant(Start, Remainder, Factor, SE, DL))
- return false;
- S = SE.getAddRecExpr(Start, Step, A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW));
- return true;
- }
-
- return false;
-}
-
-/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs
-/// is the number of SCEVAddRecExprs present, which are kept at the end of
-/// the list.
-///
-static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
- Type *Ty,
- ScalarEvolution &SE) {
- unsigned NumAddRecs = 0;
- for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
- ++NumAddRecs;
- // Group Ops into non-addrecs and addrecs.
- SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs);
- SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end());
- // Let ScalarEvolution sort and simplify the non-addrecs list.
- const SCEV *Sum = NoAddRecs.empty() ?
- SE.getConstant(Ty, 0) :
- SE.getAddExpr(NoAddRecs);
- // If it returned an add, use the operands. Otherwise it simplified
- // the sum into a single value, so just use that.
- Ops.clear();
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum))
- Ops.append(Add->op_begin(), Add->op_end());
- else if (!Sum->isZero())
- Ops.push_back(Sum);
- // Then append the addrecs.
- Ops.append(AddRecs.begin(), AddRecs.end());
-}
-
-/// SplitAddRecs - Flatten a list of add operands, moving addrec start values
-/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}.
-/// This helps expose more opportunities for folding parts of the expressions
-/// into GEP indices.
-///
-static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
- Type *Ty,
- ScalarEvolution &SE) {
- // Find the addrecs.
- SmallVector<const SCEV *, 8> AddRecs;
- for (unsigned i = 0, e = Ops.size(); i != e; ++i)
- while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) {
- const SCEV *Start = A->getStart();
- if (Start->isZero()) break;
- const SCEV *Zero = SE.getConstant(Ty, 0);
- AddRecs.push_back(SE.getAddRecExpr(Zero,
- A->getStepRecurrence(SE),
- A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW)));
- if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
- Ops[i] = Zero;
- Ops.append(Add->op_begin(), Add->op_end());
- e += Add->getNumOperands();
- } else {
- Ops[i] = Start;
- }
- }
- if (!AddRecs.empty()) {
- // Add the addrecs onto the end of the list.
- Ops.append(AddRecs.begin(), AddRecs.end());
- // Resort the operand list, moving any constants to the front.
- SimplifyAddOperands(Ops, Ty, SE);
- }
-}
-
-/// expandAddToGEP - Expand an addition expression with a pointer type into
-/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps
-/// BasicAliasAnalysis and other passes analyze the result. See the rules
-/// for getelementptr vs. inttoptr in
-/// http://llvm.org/docs/LangRef.html#pointeraliasing
-/// for details.
-///
-/// Design note: The correctness of using getelementptr here depends on
-/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as
-/// they may introduce pointer arithmetic which may not be safely converted
-/// into getelementptr.
-///
-/// Design note: It might seem desirable for this function to be more
-/// loop-aware. If some of the indices are loop-invariant while others
-/// aren't, it might seem desirable to emit multiple GEPs, keeping the
-/// loop-invariant portions of the overall computation outside the loop.
-/// However, there are a few reasons this is not done here. Hoisting simple
-/// arithmetic is a low-level optimization that often isn't very
-/// important until late in the optimization process. In fact, passes
-/// like InstructionCombining will combine GEPs, even if it means
-/// pushing loop-invariant computation down into loops, so even if the
-/// GEPs were split here, the work would quickly be undone. The
-/// LoopStrengthReduction pass, which is usually run quite late (and
-/// after the last InstructionCombining pass), takes care of hoisting
-/// loop-invariant portions of expressions, after considering what
-/// can be folded using target addressing modes.
-///
-Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
- const SCEV *const *op_end,
- PointerType *PTy,
- Type *Ty,
- Value *V) {
- Type *OriginalElTy = PTy->getElementType();
- Type *ElTy = OriginalElTy;
- SmallVector<Value *, 4> GepIndices;
- SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
- bool AnyNonZeroIndices = false;
-
- // Split AddRecs up into parts as either of the parts may be usable
- // without the other.
- SplitAddRecs(Ops, Ty, SE);
-
- Type *IntIdxTy = DL.getIndexType(PTy);
-
- // Descend down the pointer's type and attempt to convert the other
- // operands into GEP indices, at each level. The first index in a GEP
- // indexes into the array implied by the pointer operand; the rest of
- // the indices index into the element or field type selected by the
- // preceding index.
- for (;;) {
- // If the scale size is not 0, attempt to factor out a scale for
- // array indexing.
- SmallVector<const SCEV *, 8> ScaledOps;
- if (ElTy->isSized()) {
- const SCEV *ElSize = SE.getSizeOfExpr(IntIdxTy, ElTy);
- if (!ElSize->isZero()) {
- SmallVector<const SCEV *, 8> NewOps;
- for (const SCEV *Op : Ops) {
- const SCEV *Remainder = SE.getConstant(Ty, 0);
- if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) {
- // Op now has ElSize factored out.
- ScaledOps.push_back(Op);
- if (!Remainder->isZero())
- NewOps.push_back(Remainder);
- AnyNonZeroIndices = true;
- } else {
- // The operand was not divisible, so add it to the list of operands
- // we'll scan next iteration.
- NewOps.push_back(Op);
- }
- }
- // If we made any changes, update Ops.
- if (!ScaledOps.empty()) {
- Ops = NewOps;
- SimplifyAddOperands(Ops, Ty, SE);
- }
- }
- }
-
- // Record the scaled array index for this level of the type. If
- // we didn't find any operands that could be factored, tentatively
- // assume that element zero was selected (since the zero offset
- // would obviously be folded away).
+ NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
+ S = SE.getMulExpr(NewMulOps);
+ return true;
+ }
+ }
+
+ // In an AddRec, check if both start and step are divisible.
+ if (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEV *Step = A->getStepRecurrence(SE);
+ const SCEV *StepRem = SE.getConstant(Step->getType(), 0);
+ if (!FactorOutConstant(Step, StepRem, Factor, SE, DL))
+ return false;
+ if (!StepRem->isZero())
+ return false;
+ const SCEV *Start = A->getStart();
+ if (!FactorOutConstant(Start, Remainder, Factor, SE, DL))
+ return false;
+ S = SE.getAddRecExpr(Start, Step, A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW));
+ return true;
+ }
+
+ return false;
+}
+
+/// SimplifyAddOperands - Sort and simplify a list of add operands. NumAddRecs
+/// is the number of SCEVAddRecExprs present, which are kept at the end of
+/// the list.
+///
+static void SimplifyAddOperands(SmallVectorImpl<const SCEV *> &Ops,
+ Type *Ty,
+ ScalarEvolution &SE) {
+ unsigned NumAddRecs = 0;
+ for (unsigned i = Ops.size(); i > 0 && isa<SCEVAddRecExpr>(Ops[i-1]); --i)
+ ++NumAddRecs;
+ // Group Ops into non-addrecs and addrecs.
+ SmallVector<const SCEV *, 8> NoAddRecs(Ops.begin(), Ops.end() - NumAddRecs);
+ SmallVector<const SCEV *, 8> AddRecs(Ops.end() - NumAddRecs, Ops.end());
+ // Let ScalarEvolution sort and simplify the non-addrecs list.
+ const SCEV *Sum = NoAddRecs.empty() ?
+ SE.getConstant(Ty, 0) :
+ SE.getAddExpr(NoAddRecs);
+ // If it returned an add, use the operands. Otherwise it simplified
+ // the sum into a single value, so just use that.
+ Ops.clear();
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Sum))
+ Ops.append(Add->op_begin(), Add->op_end());
+ else if (!Sum->isZero())
+ Ops.push_back(Sum);
+ // Then append the addrecs.
+ Ops.append(AddRecs.begin(), AddRecs.end());
+}
+
+/// SplitAddRecs - Flatten a list of add operands, moving addrec start values
+/// out to the top level. For example, convert {a + b,+,c} to a, b, {0,+,d}.
+/// This helps expose more opportunities for folding parts of the expressions
+/// into GEP indices.
+///
+static void SplitAddRecs(SmallVectorImpl<const SCEV *> &Ops,
+ Type *Ty,
+ ScalarEvolution &SE) {
+ // Find the addrecs.
+ SmallVector<const SCEV *, 8> AddRecs;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Ops[i])) {
+ const SCEV *Start = A->getStart();
+ if (Start->isZero()) break;
+ const SCEV *Zero = SE.getConstant(Ty, 0);
+ AddRecs.push_back(SE.getAddRecExpr(Zero,
+ A->getStepRecurrence(SE),
+ A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW)));
+ if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Start)) {
+ Ops[i] = Zero;
+ Ops.append(Add->op_begin(), Add->op_end());
+ e += Add->getNumOperands();
+ } else {
+ Ops[i] = Start;
+ }
+ }
+ if (!AddRecs.empty()) {
+ // Add the addrecs onto the end of the list.
+ Ops.append(AddRecs.begin(), AddRecs.end());
+ // Resort the operand list, moving any constants to the front.
+ SimplifyAddOperands(Ops, Ty, SE);
+ }
+}
+
+/// expandAddToGEP - Expand an addition expression with a pointer type into
+/// a GEP instead of using ptrtoint+arithmetic+inttoptr. This helps
+/// BasicAliasAnalysis and other passes analyze the result. See the rules
+/// for getelementptr vs. inttoptr in
+/// http://llvm.org/docs/LangRef.html#pointeraliasing
+/// for details.
+///
+/// Design note: The correctness of using getelementptr here depends on
+/// ScalarEvolution not recognizing inttoptr and ptrtoint operators, as
+/// they may introduce pointer arithmetic which may not be safely converted
+/// into getelementptr.
+///
+/// Design note: It might seem desirable for this function to be more
+/// loop-aware. If some of the indices are loop-invariant while others
+/// aren't, it might seem desirable to emit multiple GEPs, keeping the
+/// loop-invariant portions of the overall computation outside the loop.
+/// However, there are a few reasons this is not done here. Hoisting simple
+/// arithmetic is a low-level optimization that often isn't very
+/// important until late in the optimization process. In fact, passes
+/// like InstructionCombining will combine GEPs, even if it means
+/// pushing loop-invariant computation down into loops, so even if the
+/// GEPs were split here, the work would quickly be undone. The
+/// LoopStrengthReduction pass, which is usually run quite late (and
+/// after the last InstructionCombining pass), takes care of hoisting
+/// loop-invariant portions of expressions, after considering what
+/// can be folded using target addressing modes.
+///
+Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
+ const SCEV *const *op_end,
+ PointerType *PTy,
+ Type *Ty,
+ Value *V) {
+ Type *OriginalElTy = PTy->getElementType();
+ Type *ElTy = OriginalElTy;
+ SmallVector<Value *, 4> GepIndices;
+ SmallVector<const SCEV *, 8> Ops(op_begin, op_end);
+ bool AnyNonZeroIndices = false;
+
+ // Split AddRecs up into parts as either of the parts may be usable
+ // without the other.
+ SplitAddRecs(Ops, Ty, SE);
+
+ Type *IntIdxTy = DL.getIndexType(PTy);
+
+ // Descend down the pointer's type and attempt to convert the other
+ // operands into GEP indices, at each level. The first index in a GEP
+ // indexes into the array implied by the pointer operand; the rest of
+ // the indices index into the element or field type selected by the
+ // preceding index.
+ for (;;) {
+ // If the scale size is not 0, attempt to factor out a scale for
+ // array indexing.
+ SmallVector<const SCEV *, 8> ScaledOps;
+ if (ElTy->isSized()) {
+ const SCEV *ElSize = SE.getSizeOfExpr(IntIdxTy, ElTy);
+ if (!ElSize->isZero()) {
+ SmallVector<const SCEV *, 8> NewOps;
+ for (const SCEV *Op : Ops) {
+ const SCEV *Remainder = SE.getConstant(Ty, 0);
+ if (FactorOutConstant(Op, Remainder, ElSize, SE, DL)) {
+ // Op now has ElSize factored out.
+ ScaledOps.push_back(Op);
+ if (!Remainder->isZero())
+ NewOps.push_back(Remainder);
+ AnyNonZeroIndices = true;
+ } else {
+ // The operand was not divisible, so add it to the list of operands
+ // we'll scan next iteration.
+ NewOps.push_back(Op);
+ }
+ }
+ // If we made any changes, update Ops.
+ if (!ScaledOps.empty()) {
+ Ops = NewOps;
+ SimplifyAddOperands(Ops, Ty, SE);
+ }
+ }
+ }
+
+ // Record the scaled array index for this level of the type. If
+ // we didn't find any operands that could be factored, tentatively
+ // assume that element zero was selected (since the zero offset
+ // would obviously be folded away).
Value *Scaled =
ScaledOps.empty()
? Constant::getNullValue(Ty)
: expandCodeForImpl(SE.getAddExpr(ScaledOps), Ty, false);
- GepIndices.push_back(Scaled);
-
- // Collect struct field index operands.
- while (StructType *STy = dyn_cast<StructType>(ElTy)) {
- bool FoundFieldNo = false;
- // An empty struct has no fields.
- if (STy->getNumElements() == 0) break;
- // Field offsets are known. See if a constant offset falls within any of
- // the struct fields.
- if (Ops.empty())
- break;
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
- if (SE.getTypeSizeInBits(C->getType()) <= 64) {
- const StructLayout &SL = *DL.getStructLayout(STy);
- uint64_t FullOffset = C->getValue()->getZExtValue();
- if (FullOffset < SL.getSizeInBytes()) {
- unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
- GepIndices.push_back(
- ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx));
- ElTy = STy->getTypeAtIndex(ElIdx);
- Ops[0] =
- SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx));
- AnyNonZeroIndices = true;
- FoundFieldNo = true;
- }
- }
- // If no struct field offsets were found, tentatively assume that
- // field zero was selected (since the zero offset would obviously
- // be folded away).
- if (!FoundFieldNo) {
- ElTy = STy->getTypeAtIndex(0u);
- GepIndices.push_back(
- Constant::getNullValue(Type::getInt32Ty(Ty->getContext())));
- }
- }
-
- if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
- ElTy = ATy->getElementType();
- else
- // FIXME: Handle VectorType.
- // E.g., if ElTy is a scalable vector, then ElSize is not a compile-time
- // constant and therefore cannot be factored out. The generated IR is less
- // ideal, with the base 'V' cast to i8* and an ugly getelementptr over that.
- break;
- }
-
- // If none of the operands were convertible to proper GEP indices, cast
- // the base to i8* and do an ugly getelementptr with that. It's still
- // better than ptrtoint+arithmetic+inttoptr at least.
- if (!AnyNonZeroIndices) {
- // Cast the base to i8*.
- V = InsertNoopCastOfTo(V,
- Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace()));
-
- assert(!isa<Instruction>(V) ||
- SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint()));
-
- // Expand the operands for a plain byte offset.
+ GepIndices.push_back(Scaled);
+
+ // Collect struct field index operands.
+ while (StructType *STy = dyn_cast<StructType>(ElTy)) {
+ bool FoundFieldNo = false;
+ // An empty struct has no fields.
+ if (STy->getNumElements() == 0) break;
+ // Field offsets are known. See if a constant offset falls within any of
+ // the struct fields.
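+      // Illustrative example: assuming a typical data layout where { i32, i64 }
+      // has field offsets 0 and 8, a constant offset of 12 selects field 1 and
+      // leaves a remaining offset of 4 for the levels below.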
+ if (Ops.empty())
+ break;
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(Ops[0]))
+ if (SE.getTypeSizeInBits(C->getType()) <= 64) {
+ const StructLayout &SL = *DL.getStructLayout(STy);
+ uint64_t FullOffset = C->getValue()->getZExtValue();
+ if (FullOffset < SL.getSizeInBytes()) {
+ unsigned ElIdx = SL.getElementContainingOffset(FullOffset);
+ GepIndices.push_back(
+ ConstantInt::get(Type::getInt32Ty(Ty->getContext()), ElIdx));
+ ElTy = STy->getTypeAtIndex(ElIdx);
+ Ops[0] =
+ SE.getConstant(Ty, FullOffset - SL.getElementOffset(ElIdx));
+ AnyNonZeroIndices = true;
+ FoundFieldNo = true;
+ }
+ }
+ // If no struct field offsets were found, tentatively assume that
+ // field zero was selected (since the zero offset would obviously
+ // be folded away).
+ if (!FoundFieldNo) {
+ ElTy = STy->getTypeAtIndex(0u);
+ GepIndices.push_back(
+ Constant::getNullValue(Type::getInt32Ty(Ty->getContext())));
+ }
+ }
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
+ ElTy = ATy->getElementType();
+ else
+ // FIXME: Handle VectorType.
+        // E.g., if ElTy is a scalable vector, then ElSize is not a compile-time
+        // constant and therefore cannot be factored out. The generated IR is less
+        // ideal: base 'V' is cast to i8* and an ugly getelementptr is emitted over that.
+ break;
+ }
+
+ // If none of the operands were convertible to proper GEP indices, cast
+ // the base to i8* and do an ugly getelementptr with that. It's still
+ // better than ptrtoint+arithmetic+inttoptr at least.
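+  // The resulting IR has roughly this shape (value names are illustrative
+  // only):
+  //   %base.i8 = bitcast <ty>* %base to i8*
+  //   %uglygep = getelementptr i8, i8* %base.i8, i64 %byteoffset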
+ if (!AnyNonZeroIndices) {
+ // Cast the base to i8*.
+ V = InsertNoopCastOfTo(V,
+ Type::getInt8PtrTy(Ty->getContext(), PTy->getAddressSpace()));
+
+ assert(!isa<Instruction>(V) ||
+ SE.DT.dominates(cast<Instruction>(V), &*Builder.GetInsertPoint()));
+
+ // Expand the operands for a plain byte offset.
Value *Idx = expandCodeForImpl(SE.getAddExpr(Ops), Ty, false);
-
- // Fold a GEP with constant operands.
- if (Constant *CLHS = dyn_cast<Constant>(V))
- if (Constant *CRHS = dyn_cast<Constant>(Idx))
- return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ty->getContext()),
- CLHS, CRHS);
-
- // Do a quick scan to see if we have this GEP nearby. If so, reuse it.
- unsigned ScanLimit = 6;
- BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
- // Scanning starts from the last instruction before the insertion point.
- BasicBlock::iterator IP = Builder.GetInsertPoint();
- if (IP != BlockBegin) {
- --IP;
- for (; ScanLimit; --IP, --ScanLimit) {
- // Don't count dbg.value against the ScanLimit, to avoid perturbing the
- // generated code.
- if (isa<DbgInfoIntrinsic>(IP))
- ScanLimit++;
- if (IP->getOpcode() == Instruction::GetElementPtr &&
- IP->getOperand(0) == V && IP->getOperand(1) == Idx)
- return &*IP;
- if (IP == BlockBegin) break;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break;
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
-
- // Emit a GEP.
+
+ // Fold a GEP with constant operands.
+ if (Constant *CLHS = dyn_cast<Constant>(V))
+ if (Constant *CRHS = dyn_cast<Constant>(Idx))
+ return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ty->getContext()),
+ CLHS, CRHS);
+
+ // Do a quick scan to see if we have this GEP nearby. If so, reuse it.
+ unsigned ScanLimit = 6;
+ BasicBlock::iterator BlockBegin = Builder.GetInsertBlock()->begin();
+ // Scanning starts from the last instruction before the insertion point.
+ BasicBlock::iterator IP = Builder.GetInsertPoint();
+ if (IP != BlockBegin) {
+ --IP;
+ for (; ScanLimit; --IP, --ScanLimit) {
+ // Don't count dbg.value against the ScanLimit, to avoid perturbing the
+ // generated code.
+ if (isa<DbgInfoIntrinsic>(IP))
+ ScanLimit++;
+ if (IP->getOpcode() == Instruction::GetElementPtr &&
+ IP->getOperand(0) == V && IP->getOperand(1) == Idx)
+ return &*IP;
+ if (IP == BlockBegin) break;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(V) || !L->isLoopInvariant(Idx)) break;
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+
+ // Emit a GEP.
return Builder.CreateGEP(Builder.getInt8Ty(), V, Idx, "uglygep");
- }
-
- {
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Move the insertion point out of as many loops as we can.
- while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
- if (!L->isLoopInvariant(V)) break;
-
- bool AnyIndexNotLoopInvariant = any_of(
- GepIndices, [L](Value *Op) { return !L->isLoopInvariant(Op); });
-
- if (AnyIndexNotLoopInvariant)
- break;
-
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!Preheader) break;
-
- // Ok, move up a level.
- Builder.SetInsertPoint(Preheader->getTerminator());
- }
-
- // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
- // because ScalarEvolution may have changed the address arithmetic to
- // compute a value which is beyond the end of the allocated object.
- Value *Casted = V;
- if (V->getType() != PTy)
- Casted = InsertNoopCastOfTo(Casted, PTy);
- Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
- Ops.push_back(SE.getUnknown(GEP));
- }
-
- return expand(SE.getAddExpr(Ops));
-}
-
-Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty,
- Value *V) {
- const SCEV *const Ops[1] = {Op};
- return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V);
-}
-
-/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
-/// SCEV expansion. If they are nested, this is the most nested. If they are
-/// neighboring, pick the later.
-static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B,
- DominatorTree &DT) {
- if (!A) return B;
- if (!B) return A;
- if (A->contains(B)) return B;
- if (B->contains(A)) return A;
- if (DT.dominates(A->getHeader(), B->getHeader())) return B;
- if (DT.dominates(B->getHeader(), A->getHeader())) return A;
- return A; // Arbitrarily break the tie.
-}
-
-/// getRelevantLoop - Get the most relevant loop associated with the given
-/// expression, according to PickMostRelevantLoop.
-const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) {
- // Test whether we've already computed the most relevant loop for this SCEV.
- auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr));
- if (!Pair.second)
- return Pair.first->second;
-
- if (isa<SCEVConstant>(S))
- // A constant has no relevant loops.
- return nullptr;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
- if (const Instruction *I = dyn_cast<Instruction>(U->getValue()))
- return Pair.first->second = SE.LI.getLoopFor(I->getParent());
- // A non-instruction has no relevant loops.
- return nullptr;
- }
- if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) {
- const Loop *L = nullptr;
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
- L = AR->getLoop();
- for (const SCEV *Op : N->operands())
- L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT);
- return RelevantLoops[N] = L;
- }
- if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) {
- const Loop *Result = getRelevantLoop(C->getOperand());
- return RelevantLoops[C] = Result;
- }
- if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- const Loop *Result = PickMostRelevantLoop(
- getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT);
- return RelevantLoops[D] = Result;
- }
- llvm_unreachable("Unexpected SCEV type!");
-}
-
-namespace {
-
-/// LoopCompare - Compare loops by PickMostRelevantLoop.
-class LoopCompare {
- DominatorTree &DT;
-public:
- explicit LoopCompare(DominatorTree &dt) : DT(dt) {}
-
- bool operator()(std::pair<const Loop *, const SCEV *> LHS,
- std::pair<const Loop *, const SCEV *> RHS) const {
- // Keep pointer operands sorted at the end.
- if (LHS.second->getType()->isPointerTy() !=
- RHS.second->getType()->isPointerTy())
- return LHS.second->getType()->isPointerTy();
-
- // Compare loops with PickMostRelevantLoop.
- if (LHS.first != RHS.first)
- return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
-
- // If one operand is a non-constant negative and the other is not,
- // put the non-constant negative on the right so that a sub can
- // be used instead of a negate and add.
- if (LHS.second->isNonConstantNegative()) {
- if (!RHS.second->isNonConstantNegative())
- return false;
- } else if (RHS.second->isNonConstantNegative())
- return true;
-
- // Otherwise they are equivalent according to this comparison.
- return false;
- }
-};
-
-}
-
-Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
- // Collect all the add operands in a loop, along with their associated loops.
- // Iterate in reverse so that constants are emitted last, all else equal, and
- // so that pointer operands are inserted first, which the code below relies on
- // to form more involved GEPs.
- SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
-
- // Sort by loop. Use a stable sort so that constants follow non-constants and
- // pointer operands precede non-pointer operands.
- llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
-
- // Emit instructions to add all the operands. Hoist as much as possible
- // out of loops, and form meaningful getelementptrs where possible.
- Value *Sum = nullptr;
- for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) {
- const Loop *CurLoop = I->first;
- const SCEV *Op = I->second;
- if (!Sum) {
- // This is the first operand. Just expand it.
- Sum = expand(Op);
- ++I;
- } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
- // The running sum expression is a pointer. Try to form a getelementptr
- // at this level with that as the base.
- SmallVector<const SCEV *, 4> NewOps;
- for (; I != E && I->first == CurLoop; ++I) {
-        // If the operand is a SCEVUnknown of a non-instruction value, peek
-        // through it to enable more of it to be folded into the GEP.
- const SCEV *X = I->second;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
- if (!isa<Instruction>(U->getValue()))
- X = SE.getSCEV(U->getValue());
- NewOps.push_back(X);
- }
- Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
- } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
- // The running sum is an integer, and there's a pointer at this level.
-      // Try to form a getelementptr. If the running sum is an instruction,
-      // wrap it in a SCEVUnknown to avoid re-analyzing it.
- SmallVector<const SCEV *, 4> NewOps;
- NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
- SE.getSCEV(Sum));
- for (++I; I != E && I->first == CurLoop; ++I)
- NewOps.push_back(I->second);
- Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
- } else if (Op->isNonConstantNegative()) {
- // Instead of doing a negate and add, just do a subtract.
+ }
+
+ {
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Move the insertion point out of as many loops as we can.
+ while (const Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock())) {
+ if (!L->isLoopInvariant(V)) break;
+
+ bool AnyIndexNotLoopInvariant = any_of(
+ GepIndices, [L](Value *Op) { return !L->isLoopInvariant(Op); });
+
+ if (AnyIndexNotLoopInvariant)
+ break;
+
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!Preheader) break;
+
+ // Ok, move up a level.
+ Builder.SetInsertPoint(Preheader->getTerminator());
+ }
+
+ // Insert a pretty getelementptr. Note that this GEP is not marked inbounds,
+ // because ScalarEvolution may have changed the address arithmetic to
+ // compute a value which is beyond the end of the allocated object.
+ Value *Casted = V;
+ if (V->getType() != PTy)
+ Casted = InsertNoopCastOfTo(Casted, PTy);
+ Value *GEP = Builder.CreateGEP(OriginalElTy, Casted, GepIndices, "scevgep");
+ Ops.push_back(SE.getUnknown(GEP));
+ }
+
+ return expand(SE.getAddExpr(Ops));
+}
+
+Value *SCEVExpander::expandAddToGEP(const SCEV *Op, PointerType *PTy, Type *Ty,
+ Value *V) {
+ const SCEV *const Ops[1] = {Op};
+ return expandAddToGEP(Ops, Ops + 1, PTy, Ty, V);
+}
+
+/// PickMostRelevantLoop - Given two loops pick the one that's most relevant for
+/// SCEV expansion. If they are nested, this is the most nested. If they are
+/// neighboring, pick the later.
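+/// For example, if B is nested inside A, B is returned; for two sibling
+/// loops where A's header dominates B's header, B (the later loop) is
+/// returned.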
+static const Loop *PickMostRelevantLoop(const Loop *A, const Loop *B,
+ DominatorTree &DT) {
+ if (!A) return B;
+ if (!B) return A;
+ if (A->contains(B)) return B;
+ if (B->contains(A)) return A;
+ if (DT.dominates(A->getHeader(), B->getHeader())) return B;
+ if (DT.dominates(B->getHeader(), A->getHeader())) return A;
+ return A; // Arbitrarily break the tie.
+}
+
+/// getRelevantLoop - Get the most relevant loop associated with the given
+/// expression, according to PickMostRelevantLoop.
+const Loop *SCEVExpander::getRelevantLoop(const SCEV *S) {
+ // Test whether we've already computed the most relevant loop for this SCEV.
+ auto Pair = RelevantLoops.insert(std::make_pair(S, nullptr));
+ if (!Pair.second)
+ return Pair.first->second;
+
+ if (isa<SCEVConstant>(S))
+ // A constant has no relevant loops.
+ return nullptr;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) {
+ if (const Instruction *I = dyn_cast<Instruction>(U->getValue()))
+ return Pair.first->second = SE.LI.getLoopFor(I->getParent());
+ // A non-instruction has no relevant loops.
+ return nullptr;
+ }
+ if (const SCEVNAryExpr *N = dyn_cast<SCEVNAryExpr>(S)) {
+ const Loop *L = nullptr;
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S))
+ L = AR->getLoop();
+ for (const SCEV *Op : N->operands())
+ L = PickMostRelevantLoop(L, getRelevantLoop(Op), SE.DT);
+ return RelevantLoops[N] = L;
+ }
+ if (const SCEVCastExpr *C = dyn_cast<SCEVCastExpr>(S)) {
+ const Loop *Result = getRelevantLoop(C->getOperand());
+ return RelevantLoops[C] = Result;
+ }
+ if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ const Loop *Result = PickMostRelevantLoop(
+ getRelevantLoop(D->getLHS()), getRelevantLoop(D->getRHS()), SE.DT);
+ return RelevantLoops[D] = Result;
+ }
+ llvm_unreachable("Unexpected SCEV type!");
+}
+
+namespace {
+
+/// LoopCompare - Compare loops by PickMostRelevantLoop.
+class LoopCompare {
+ DominatorTree &DT;
+public:
+ explicit LoopCompare(DominatorTree &dt) : DT(dt) {}
+
+ bool operator()(std::pair<const Loop *, const SCEV *> LHS,
+ std::pair<const Loop *, const SCEV *> RHS) const {
+ // Keep pointer operands sorted at the end.
+ if (LHS.second->getType()->isPointerTy() !=
+ RHS.second->getType()->isPointerTy())
+ return LHS.second->getType()->isPointerTy();
+
+ // Compare loops with PickMostRelevantLoop.
+ if (LHS.first != RHS.first)
+ return PickMostRelevantLoop(LHS.first, RHS.first, DT) != LHS.first;
+
+ // If one operand is a non-constant negative and the other is not,
+ // put the non-constant negative on the right so that a sub can
+ // be used instead of a negate and add.
+ if (LHS.second->isNonConstantNegative()) {
+ if (!RHS.second->isNonConstantNegative())
+ return false;
+ } else if (RHS.second->isNonConstantNegative())
+ return true;
+
+ // Otherwise they are equivalent according to this comparison.
+ return false;
+ }
+};
+
+}
+
+Value *SCEVExpander::visitAddExpr(const SCEVAddExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+ // Collect all the add operands in a loop, along with their associated loops.
+ // Iterate in reverse so that constants are emitted last, all else equal, and
+ // so that pointer operands are inserted first, which the code below relies on
+ // to form more involved GEPs.
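+  // For instance, a sum such as (%ptr + %i + 16), where %ptr is a pointer,
+  // can be emitted as a single getelementptr with %i and 16 folded into the
+  // indices when the element type allows it (names are illustrative).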
+ SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+ for (std::reverse_iterator<SCEVAddExpr::op_iterator> I(S->op_end()),
+ E(S->op_begin()); I != E; ++I)
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+ // Sort by loop. Use a stable sort so that constants follow non-constants and
+ // pointer operands precede non-pointer operands.
+ llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
+
+ // Emit instructions to add all the operands. Hoist as much as possible
+ // out of loops, and form meaningful getelementptrs where possible.
+ Value *Sum = nullptr;
+ for (auto I = OpsAndLoops.begin(), E = OpsAndLoops.end(); I != E;) {
+ const Loop *CurLoop = I->first;
+ const SCEV *Op = I->second;
+ if (!Sum) {
+ // This is the first operand. Just expand it.
+ Sum = expand(Op);
+ ++I;
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Sum->getType())) {
+ // The running sum expression is a pointer. Try to form a getelementptr
+ // at this level with that as the base.
+ SmallVector<const SCEV *, 4> NewOps;
+ for (; I != E && I->first == CurLoop; ++I) {
+        // If the operand is a SCEVUnknown of a non-instruction value, peek
+        // through it to enable more of it to be folded into the GEP.
+ const SCEV *X = I->second;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(X))
+ if (!isa<Instruction>(U->getValue()))
+ X = SE.getSCEV(U->getValue());
+ NewOps.push_back(X);
+ }
+ Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, Sum);
+ } else if (PointerType *PTy = dyn_cast<PointerType>(Op->getType())) {
+ // The running sum is an integer, and there's a pointer at this level.
+      // Try to form a getelementptr. If the running sum is an instruction,
+      // wrap it in a SCEVUnknown to avoid re-analyzing it.
+ SmallVector<const SCEV *, 4> NewOps;
+ NewOps.push_back(isa<Instruction>(Sum) ? SE.getUnknown(Sum) :
+ SE.getSCEV(Sum));
+ for (++I; I != E && I->first == CurLoop; ++I)
+ NewOps.push_back(I->second);
+ Sum = expandAddToGEP(NewOps.begin(), NewOps.end(), PTy, Ty, expand(Op));
+ } else if (Op->isNonConstantNegative()) {
+ // Instead of doing a negate and add, just do a subtract.
Value *W = expandCodeForImpl(SE.getNegativeSCEV(Op), Ty, false);
- Sum = InsertNoopCastOfTo(Sum, Ty);
- Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true);
- ++I;
- } else {
- // A simple add.
+ Sum = InsertNoopCastOfTo(Sum, Ty);
+ Sum = InsertBinop(Instruction::Sub, Sum, W, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true);
+ ++I;
+ } else {
+ // A simple add.
Value *W = expandCodeForImpl(Op, Ty, false);
- Sum = InsertNoopCastOfTo(Sum, Ty);
- // Canonicalize a constant to the RHS.
- if (isa<Constant>(Sum)) std::swap(Sum, W);
- Sum = InsertBinop(Instruction::Add, Sum, W, S->getNoWrapFlags(),
- /*IsSafeToHoist*/ true);
- ++I;
- }
- }
-
- return Sum;
-}
-
-Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
- // Collect all the mul operands in a loop, along with their associated loops.
- // Iterate in reverse so that constants are emitted last, all else equal.
- SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
- for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
- E(S->op_begin()); I != E; ++I)
- OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
-
- // Sort by loop. Use a stable sort so that constants follow non-constants.
- llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
-
- // Emit instructions to mul all the operands. Hoist as much as possible
- // out of loops.
- Value *Prod = nullptr;
- auto I = OpsAndLoops.begin();
-
- // Expand the calculation of X pow N in the following manner:
- // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then:
- // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK).
- const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() {
- auto E = I;
- // Calculate how many times the same operand from the same loop is included
- // into this power.
- uint64_t Exponent = 0;
- const uint64_t MaxExponent = UINT64_MAX >> 1;
- // No one sane will ever try to calculate such huge exponents, but if we
- // need this, we stop on UINT64_MAX / 2 because we need to exit the loop
- // below when the power of 2 exceeds our Exponent, and we want it to be
- // 1u << 31 at most to not deal with unsigned overflow.
- while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) {
- ++Exponent;
- ++E;
- }
- assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?");
-
-    // Calculate powers with exponents 1, 2, 4, 8, etc., and multiply the ones
-    // that are needed into the result.
+ Sum = InsertNoopCastOfTo(Sum, Ty);
+ // Canonicalize a constant to the RHS.
+ if (isa<Constant>(Sum)) std::swap(Sum, W);
+ Sum = InsertBinop(Instruction::Add, Sum, W, S->getNoWrapFlags(),
+ /*IsSafeToHoist*/ true);
+ ++I;
+ }
+ }
+
+ return Sum;
+}
+
+Value *SCEVExpander::visitMulExpr(const SCEVMulExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
+ // Collect all the mul operands in a loop, along with their associated loops.
+ // Iterate in reverse so that constants are emitted last, all else equal.
+ SmallVector<std::pair<const Loop *, const SCEV *>, 8> OpsAndLoops;
+ for (std::reverse_iterator<SCEVMulExpr::op_iterator> I(S->op_end()),
+ E(S->op_begin()); I != E; ++I)
+ OpsAndLoops.push_back(std::make_pair(getRelevantLoop(*I), *I));
+
+ // Sort by loop. Use a stable sort so that constants follow non-constants.
+ llvm::stable_sort(OpsAndLoops, LoopCompare(SE.DT));
+
+ // Emit instructions to mul all the operands. Hoist as much as possible
+ // out of loops.
+ Value *Prod = nullptr;
+ auto I = OpsAndLoops.begin();
+
+ // Expand the calculation of X pow N in the following manner:
+ // Let N = P1 + P2 + ... + PK, where all P are powers of 2. Then:
+ // X pow N = (X pow P1) * (X pow P2) * ... * (X pow PK).
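+  // For example, for N = 13 = 8 + 4 + 1 this needs 3 squarings (X^2, X^4,
+  // X^8) plus 2 extra multiplies, instead of 12 multiplies for the naive
+  // expansion.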
+ const auto ExpandOpBinPowN = [this, &I, &OpsAndLoops, &Ty]() {
+ auto E = I;
+ // Calculate how many times the same operand from the same loop is included
+ // into this power.
+ uint64_t Exponent = 0;
+ const uint64_t MaxExponent = UINT64_MAX >> 1;
+ // No one sane will ever try to calculate such huge exponents, but if we
+ // need this, we stop on UINT64_MAX / 2 because we need to exit the loop
+ // below when the power of 2 exceeds our Exponent, and we want it to be
+ // 1u << 31 at most to not deal with unsigned overflow.
+ while (E != OpsAndLoops.end() && *I == *E && Exponent != MaxExponent) {
+ ++Exponent;
+ ++E;
+ }
+ assert(Exponent > 0 && "Trying to calculate a zeroth exponent of operand?");
+
+    // Calculate powers with exponents 1, 2, 4, 8, etc., and multiply the ones
+    // that are needed into the result.
Value *P = expandCodeForImpl(I->second, Ty, false);
- Value *Result = nullptr;
- if (Exponent & 1)
- Result = P;
- for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) {
- P = InsertBinop(Instruction::Mul, P, P, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true);
- if (Exponent & BinExp)
- Result = Result ? InsertBinop(Instruction::Mul, Result, P,
- SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ true)
- : P;
- }
-
- I = E;
- assert(Result && "Nothing was expanded?");
- return Result;
- };
-
- while (I != OpsAndLoops.end()) {
- if (!Prod) {
- // This is the first operand. Just expand it.
- Prod = ExpandOpBinPowN();
- } else if (I->second->isAllOnesValue()) {
- // Instead of doing a multiply by negative one, just do a negate.
- Prod = InsertNoopCastOfTo(Prod, Ty);
- Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod,
- SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
- ++I;
- } else {
- // A simple mul.
- Value *W = ExpandOpBinPowN();
- Prod = InsertNoopCastOfTo(Prod, Ty);
- // Canonicalize a constant to the RHS.
- if (isa<Constant>(Prod)) std::swap(Prod, W);
- const APInt *RHS;
- if (match(W, m_Power2(RHS))) {
- // Canonicalize Prod*(1<<C) to Prod<<C.
- assert(!Ty->isVectorTy() && "vector types are not SCEVable");
- auto NWFlags = S->getNoWrapFlags();
-        // Clear the nsw flag if the shl would produce a poison value.
- if (RHS->logBase2() == RHS->getBitWidth() - 1)
- NWFlags = ScalarEvolution::clearFlags(NWFlags, SCEV::FlagNSW);
- Prod = InsertBinop(Instruction::Shl, Prod,
- ConstantInt::get(Ty, RHS->logBase2()), NWFlags,
- /*IsSafeToHoist*/ true);
- } else {
- Prod = InsertBinop(Instruction::Mul, Prod, W, S->getNoWrapFlags(),
- /*IsSafeToHoist*/ true);
- }
- }
- }
-
- return Prod;
-}
-
-Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
-
+ Value *Result = nullptr;
+ if (Exponent & 1)
+ Result = P;
+ for (uint64_t BinExp = 2; BinExp <= Exponent; BinExp <<= 1) {
+ P = InsertBinop(Instruction::Mul, P, P, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true);
+ if (Exponent & BinExp)
+ Result = Result ? InsertBinop(Instruction::Mul, Result, P,
+ SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ true)
+ : P;
+ }
+
+ I = E;
+ assert(Result && "Nothing was expanded?");
+ return Result;
+ };
+
+ while (I != OpsAndLoops.end()) {
+ if (!Prod) {
+ // This is the first operand. Just expand it.
+ Prod = ExpandOpBinPowN();
+ } else if (I->second->isAllOnesValue()) {
+ // Instead of doing a multiply by negative one, just do a negate.
+ Prod = InsertNoopCastOfTo(Prod, Ty);
+ Prod = InsertBinop(Instruction::Sub, Constant::getNullValue(Ty), Prod,
+ SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
+ ++I;
+ } else {
+ // A simple mul.
+ Value *W = ExpandOpBinPowN();
+ Prod = InsertNoopCastOfTo(Prod, Ty);
+ // Canonicalize a constant to the RHS.
+ if (isa<Constant>(Prod)) std::swap(Prod, W);
+ const APInt *RHS;
+ if (match(W, m_Power2(RHS))) {
+ // Canonicalize Prod*(1<<C) to Prod<<C.
+ assert(!Ty->isVectorTy() && "vector types are not SCEVable");
+ auto NWFlags = S->getNoWrapFlags();
+        // Clear the nsw flag if the shl would produce a poison value.
+ if (RHS->logBase2() == RHS->getBitWidth() - 1)
+ NWFlags = ScalarEvolution::clearFlags(NWFlags, SCEV::FlagNSW);
+ Prod = InsertBinop(Instruction::Shl, Prod,
+ ConstantInt::get(Ty, RHS->logBase2()), NWFlags,
+ /*IsSafeToHoist*/ true);
+ } else {
+ Prod = InsertBinop(Instruction::Mul, Prod, W, S->getNoWrapFlags(),
+ /*IsSafeToHoist*/ true);
+ }
+ }
+ }
+
+ return Prod;
+}
+
+Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+
Value *LHS = expandCodeForImpl(S->getLHS(), Ty, false);
- if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
- const APInt &RHS = SC->getAPInt();
- if (RHS.isPowerOf2())
- return InsertBinop(Instruction::LShr, LHS,
- ConstantInt::get(Ty, RHS.logBase2()),
- SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
- }
-
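+  // If the divisor is a constant power of two, an unsigned divide is
+  // equivalent to a logical shift right, e.g. x /u 8 == x >> 3.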
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(S->getRHS())) {
+ const APInt &RHS = SC->getAPInt();
+ if (RHS.isPowerOf2())
+ return InsertBinop(Instruction::LShr, LHS,
+ ConstantInt::get(Ty, RHS.logBase2()),
+ SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true);
+ }
+
Value *RHS = expandCodeForImpl(S->getRHS(), Ty, false);
- return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap,
- /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS()));
-}
-
-/// Move parts of Base into Rest to leave Base with the minimal
-/// expression that provides a pointer operand suitable for a
-/// GEP expansion.
-static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
- ScalarEvolution &SE) {
- while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
- Base = A->getStart();
- Rest = SE.getAddExpr(Rest,
- SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
- A->getStepRecurrence(SE),
- A->getLoop(),
- A->getNoWrapFlags(SCEV::FlagNW)));
- }
- if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
- Base = A->getOperand(A->getNumOperands()-1);
+ return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap,
+ /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS()));
+}
+
+/// Move parts of Base into Rest to leave Base with the minimal
+/// expression that provides a pointer operand suitable for a
+/// GEP expansion.
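+/// For example, given Base = {%p,+,4} and Rest = 0, this produces Base = %p
+/// and Rest = {0,+,4}, leaving %p available as a GEP base pointer (%p is a
+/// placeholder name).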
+static void ExposePointerBase(const SCEV *&Base, const SCEV *&Rest,
+ ScalarEvolution &SE) {
+ while (const SCEVAddRecExpr *A = dyn_cast<SCEVAddRecExpr>(Base)) {
+ Base = A->getStart();
+ Rest = SE.getAddExpr(Rest,
+ SE.getAddRecExpr(SE.getConstant(A->getType(), 0),
+ A->getStepRecurrence(SE),
+ A->getLoop(),
+ A->getNoWrapFlags(SCEV::FlagNW)));
+ }
+ if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(Base)) {
+ Base = A->getOperand(A->getNumOperands()-1);
SmallVector<const SCEV *, 8> NewAddOps(A->operands());
- NewAddOps.back() = Rest;
- Rest = SE.getAddExpr(NewAddOps);
- ExposePointerBase(Base, Rest, SE);
- }
-}
-
-/// Determine if this is a well-behaved chain of instructions leading back to
-/// the PHI. If so, it may be reused by expanded expressions.
-bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
- const Loop *L) {
- if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
- (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
- return false;
- // If any of the operands don't dominate the insert position, bail.
- // Addrec operands are always loop-invariant, so this can only happen
- // if there are instructions which haven't been hoisted.
- if (L == IVIncInsertLoop) {
- for (User::op_iterator OI = IncV->op_begin()+1,
- OE = IncV->op_end(); OI != OE; ++OI)
- if (Instruction *OInst = dyn_cast<Instruction>(OI))
- if (!SE.DT.dominates(OInst, IVIncInsertPos))
- return false;
- }
- // Advance to the next instruction.
- IncV = dyn_cast<Instruction>(IncV->getOperand(0));
- if (!IncV)
- return false;
-
- if (IncV->mayHaveSideEffects())
- return false;
-
- if (IncV == PN)
- return true;
-
- return isNormalAddRecExprPHI(PN, IncV, L);
-}
-
-/// getIVIncOperand returns an induction variable increment's induction
-/// variable operand.
-///
-/// If allowScale is set, any type of GEP is allowed as long as the nonIV
-/// operands dominate InsertPos.
-///
-/// If allowScale is not set, ensure that a GEP increment conforms to one of the
-/// simple patterns generated by getAddRecExprPHILiterally and
-/// expandAddToGEP. If the pattern isn't recognized, return NULL.
-Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
- Instruction *InsertPos,
- bool allowScale) {
- if (IncV == InsertPos)
- return nullptr;
-
- switch (IncV->getOpcode()) {
- default:
- return nullptr;
- // Check for a simple Add/Sub or GEP of a loop invariant step.
- case Instruction::Add:
- case Instruction::Sub: {
- Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1));
- if (!OInst || SE.DT.dominates(OInst, InsertPos))
- return dyn_cast<Instruction>(IncV->getOperand(0));
- return nullptr;
- }
- case Instruction::BitCast:
- return dyn_cast<Instruction>(IncV->getOperand(0));
- case Instruction::GetElementPtr:
- for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) {
- if (isa<Constant>(*I))
- continue;
- if (Instruction *OInst = dyn_cast<Instruction>(*I)) {
- if (!SE.DT.dominates(OInst, InsertPos))
- return nullptr;
- }
- if (allowScale) {
- // allow any kind of GEP as long as it can be hoisted.
- continue;
- }
- // This must be a pointer addition of constants (pretty), which is already
- // handled, or some number of address-size elements (ugly). Ugly geps
- // have 2 operands. i1* is used by the expander to represent an
- // address-size element.
- if (IncV->getNumOperands() != 2)
- return nullptr;
- unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
- if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
- && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
- return nullptr;
- break;
- }
- return dyn_cast<Instruction>(IncV->getOperand(0));
- }
-}
-
-/// If the insert point of the current builder or any of the builders on the
-/// stack of saved builders has 'I' as its insert point, update it to point to
-/// the instruction after 'I'. This is intended to be used when the instruction
-/// 'I' is being moved. If this fixup is not done and 'I' is moved to a
-/// different block, the inconsistent insert point (with a mismatched
-/// Instruction and Block) can lead to an instruction being inserted in a block
-/// other than its parent.
-void SCEVExpander::fixupInsertPoints(Instruction *I) {
- BasicBlock::iterator It(*I);
- BasicBlock::iterator NewInsertPt = std::next(It);
- if (Builder.GetInsertPoint() == It)
- Builder.SetInsertPoint(&*NewInsertPt);
- for (auto *InsertPtGuard : InsertPointGuards)
- if (InsertPtGuard->GetInsertPoint() == It)
- InsertPtGuard->SetInsertPoint(NewInsertPt);
-}
-
-/// hoistIVInc - Attempt to hoist a simple IV increment above InsertPos to make
-/// it available to other uses in this loop. Recursively hoist any operands,
-/// until we reach a value that dominates InsertPos.
-bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
- if (SE.DT.dominates(IncV, InsertPos))
- return true;
-
- // InsertPos must itself dominate IncV so that IncV's new position satisfies
- // its existing users.
- if (isa<PHINode>(InsertPos) ||
- !SE.DT.dominates(InsertPos->getParent(), IncV->getParent()))
- return false;
-
- if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos))
- return false;
-
- // Check that the chain of IV operands leading back to Phi can be hoisted.
- SmallVector<Instruction*, 4> IVIncs;
- for(;;) {
- Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true);
- if (!Oper)
- return false;
- // IncV is safe to hoist.
- IVIncs.push_back(IncV);
- IncV = Oper;
- if (SE.DT.dominates(IncV, InsertPos))
- break;
- }
- for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
- fixupInsertPoints(*I);
- (*I)->moveBefore(InsertPos);
- }
- return true;
-}
-
-/// Determine if this cyclic phi is in a form that would have been generated by
-/// LSR. We don't care if the phi was actually expanded in this pass, as long
-/// as it is in a low-cost form, for example, no implied multiplication. This
-/// should match any patterns generated by getAddRecExprPHILiterally and
-/// expandAddToGEP.
-bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
- const Loop *L) {
- for(Instruction *IVOper = IncV;
- (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(),
- /*allowScale=*/false));) {
- if (IVOper == PN)
- return true;
- }
- return false;
-}
-
-/// expandIVInc - Expand an IV increment at Builder's current InsertPos.
-/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may
-/// need to materialize IV increments elsewhere to handle difficult situations.
-Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
- Type *ExpandTy, Type *IntTy,
- bool useSubtract) {
- Value *IncV;
- // If the PHI is a pointer, use a GEP, otherwise use an add or sub.
- if (ExpandTy->isPointerTy()) {
- PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
- // If the step isn't constant, don't use an implicitly scaled GEP, because
- // that would require a multiply inside the loop.
- if (!isa<ConstantInt>(StepV))
- GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
- GEPPtrTy->getAddressSpace());
- IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN);
+ NewAddOps.back() = Rest;
+ Rest = SE.getAddExpr(NewAddOps);
+ ExposePointerBase(Base, Rest, SE);
+ }
+}
+
+/// Determine if this is a well-behaved chain of instructions leading back to
+/// the PHI. If so, it may be reused by expanded expressions.
+bool SCEVExpander::isNormalAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ if (IncV->getNumOperands() == 0 || isa<PHINode>(IncV) ||
+ (isa<CastInst>(IncV) && !isa<BitCastInst>(IncV)))
+ return false;
+ // If any of the operands don't dominate the insert position, bail.
+ // Addrec operands are always loop-invariant, so this can only happen
+ // if there are instructions which haven't been hoisted.
+ if (L == IVIncInsertLoop) {
+ for (User::op_iterator OI = IncV->op_begin()+1,
+ OE = IncV->op_end(); OI != OE; ++OI)
+ if (Instruction *OInst = dyn_cast<Instruction>(OI))
+ if (!SE.DT.dominates(OInst, IVIncInsertPos))
+ return false;
+ }
+ // Advance to the next instruction.
+ IncV = dyn_cast<Instruction>(IncV->getOperand(0));
+ if (!IncV)
+ return false;
+
+ if (IncV->mayHaveSideEffects())
+ return false;
+
+ if (IncV == PN)
+ return true;
+
+ return isNormalAddRecExprPHI(PN, IncV, L);
+}
+
+/// getIVIncOperand returns an induction variable increment's induction
+/// variable operand.
+///
+/// If allowScale is set, any type of GEP is allowed as long as the nonIV
+/// operands dominate InsertPos.
+///
+/// If allowScale is not set, ensure that a GEP increment conforms to one of the
+/// simple patterns generated by getAddRecExprPHILiterally and
+/// expandAddToGEP. If the pattern isn't recognized, return NULL.
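+///
+/// Recognized simple forms include increments such as
+///   %iv.next = add %iv, %step
+///   %iv.next = getelementptr i8, i8* %iv, %step
+/// (value names here are illustrative only).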
+Instruction *SCEVExpander::getIVIncOperand(Instruction *IncV,
+ Instruction *InsertPos,
+ bool allowScale) {
+ if (IncV == InsertPos)
+ return nullptr;
+
+ switch (IncV->getOpcode()) {
+ default:
+ return nullptr;
+ // Check for a simple Add/Sub or GEP of a loop invariant step.
+ case Instruction::Add:
+ case Instruction::Sub: {
+ Instruction *OInst = dyn_cast<Instruction>(IncV->getOperand(1));
+ if (!OInst || SE.DT.dominates(OInst, InsertPos))
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ return nullptr;
+ }
+ case Instruction::BitCast:
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ case Instruction::GetElementPtr:
+ for (auto I = IncV->op_begin() + 1, E = IncV->op_end(); I != E; ++I) {
+ if (isa<Constant>(*I))
+ continue;
+ if (Instruction *OInst = dyn_cast<Instruction>(*I)) {
+ if (!SE.DT.dominates(OInst, InsertPos))
+ return nullptr;
+ }
+ if (allowScale) {
+ // allow any kind of GEP as long as it can be hoisted.
+ continue;
+ }
+ // This must be a pointer addition of constants (pretty), which is already
+ // handled, or some number of address-size elements (ugly). Ugly geps
+ // have 2 operands. i1* is used by the expander to represent an
+ // address-size element.
+ if (IncV->getNumOperands() != 2)
+ return nullptr;
+ unsigned AS = cast<PointerType>(IncV->getType())->getAddressSpace();
+ if (IncV->getType() != Type::getInt1PtrTy(SE.getContext(), AS)
+ && IncV->getType() != Type::getInt8PtrTy(SE.getContext(), AS))
+ return nullptr;
+ break;
+ }
+ return dyn_cast<Instruction>(IncV->getOperand(0));
+ }
+}
+
+/// If the insert point of the current builder or any of the builders on the
+/// stack of saved builders has 'I' as its insert point, update it to point to
+/// the instruction after 'I'. This is intended to be used when the instruction
+/// 'I' is being moved. If this fixup is not done and 'I' is moved to a
+/// different block, the inconsistent insert point (with a mismatched
+/// Instruction and Block) can lead to an instruction being inserted in a block
+/// other than its parent.
+void SCEVExpander::fixupInsertPoints(Instruction *I) {
+ BasicBlock::iterator It(*I);
+ BasicBlock::iterator NewInsertPt = std::next(It);
+ if (Builder.GetInsertPoint() == It)
+ Builder.SetInsertPoint(&*NewInsertPt);
+ for (auto *InsertPtGuard : InsertPointGuards)
+ if (InsertPtGuard->GetInsertPoint() == It)
+ InsertPtGuard->SetInsertPoint(NewInsertPt);
+}
+
+/// hoistIVInc - Attempt to hoist a simple IV increment above InsertPos to make
+/// it available to other uses in this loop. Recursively hoist any operands,
+/// until we reach a value that dominates InsertPos.
+bool SCEVExpander::hoistIVInc(Instruction *IncV, Instruction *InsertPos) {
+ if (SE.DT.dominates(IncV, InsertPos))
+ return true;
+
+ // InsertPos must itself dominate IncV so that IncV's new position satisfies
+ // its existing users.
+ if (isa<PHINode>(InsertPos) ||
+ !SE.DT.dominates(InsertPos->getParent(), IncV->getParent()))
+ return false;
+
+ if (!SE.LI.movementPreservesLCSSAForm(IncV, InsertPos))
+ return false;
+
+ // Check that the chain of IV operands leading back to Phi can be hoisted.
+ SmallVector<Instruction*, 4> IVIncs;
+ for(;;) {
+ Instruction *Oper = getIVIncOperand(IncV, InsertPos, /*allowScale*/true);
+ if (!Oper)
+ return false;
+ // IncV is safe to hoist.
+ IVIncs.push_back(IncV);
+ IncV = Oper;
+ if (SE.DT.dominates(IncV, InsertPos))
+ break;
+ }
+ for (auto I = IVIncs.rbegin(), E = IVIncs.rend(); I != E; ++I) {
+ fixupInsertPoints(*I);
+ (*I)->moveBefore(InsertPos);
+ }
+ return true;
+}
+
+/// Determine if this cyclic phi is in a form that would have been generated by
+/// LSR. We don't care if the phi was actually expanded in this pass, as long
+/// as it is in a low-cost form, for example, no implied multiplication. This
+/// should match any patterns generated by getAddRecExprPHILiterally and
+/// expandAddToGEP.
+bool SCEVExpander::isExpandedAddRecExprPHI(PHINode *PN, Instruction *IncV,
+ const Loop *L) {
+ for(Instruction *IVOper = IncV;
+ (IVOper = getIVIncOperand(IVOper, L->getLoopPreheader()->getTerminator(),
+ /*allowScale=*/false));) {
+ if (IVOper == PN)
+ return true;
+ }
+ return false;
+}
+
+/// expandIVInc - Expand an IV increment at Builder's current InsertPos.
+/// Typically this is the LatchBlock terminator or IVIncInsertPos, but we may
+/// need to materialize IV increments elsewhere to handle difficult situations.
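+///
+/// For an integer IV this emits, e.g., %x.iv.next = add %x.iv, %step (or a
+/// sub when useSubtract is set); for a pointer IV it emits a getelementptr
+/// instead. The value names are illustrative.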
+Value *SCEVExpander::expandIVInc(PHINode *PN, Value *StepV, const Loop *L,
+ Type *ExpandTy, Type *IntTy,
+ bool useSubtract) {
+ Value *IncV;
+ // If the PHI is a pointer, use a GEP, otherwise use an add or sub.
+ if (ExpandTy->isPointerTy()) {
+ PointerType *GEPPtrTy = cast<PointerType>(ExpandTy);
+ // If the step isn't constant, don't use an implicitly scaled GEP, because
+ // that would require a multiply inside the loop.
+ if (!isa<ConstantInt>(StepV))
+ GEPPtrTy = PointerType::get(Type::getInt1Ty(SE.getContext()),
+ GEPPtrTy->getAddressSpace());
+ IncV = expandAddToGEP(SE.getSCEV(StepV), GEPPtrTy, IntTy, PN);
if (IncV->getType() != PN->getType())
- IncV = Builder.CreateBitCast(IncV, PN->getType());
- } else {
- IncV = useSubtract ?
- Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") :
- Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next");
- }
- return IncV;
-}
-
-/// Hoist the addrec instruction chain rooted in the loop phi above the
-/// position. This routine assumes that this is possible (has been checked).
-void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
- Instruction *Pos, PHINode *LoopPhi) {
- do {
- if (DT->dominates(InstToHoist, Pos))
- break;
- // Make sure the increment is where we want it. But don't move it
- // down past a potential existing post-inc user.
- fixupInsertPoints(InstToHoist);
- InstToHoist->moveBefore(Pos);
- Pos = InstToHoist;
- InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
- } while (InstToHoist != LoopPhi);
-}
-
-/// Check whether we can cheaply express the requested SCEV in terms of
-/// the available PHI SCEV by truncation and/or inversion of the step.
-static bool canBeCheaplyTransformed(ScalarEvolution &SE,
- const SCEVAddRecExpr *Phi,
- const SCEVAddRecExpr *Requested,
- bool &InvertStep) {
- Type *PhiTy = SE.getEffectiveSCEVType(Phi->getType());
- Type *RequestedTy = SE.getEffectiveSCEVType(Requested->getType());
-
- if (RequestedTy->getIntegerBitWidth() > PhiTy->getIntegerBitWidth())
- return false;
-
-  // Try to truncate it if necessary.
- Phi = dyn_cast<SCEVAddRecExpr>(SE.getTruncateOrNoop(Phi, RequestedTy));
- if (!Phi)
- return false;
-
- // Check whether truncation will help.
- if (Phi == Requested) {
- InvertStep = false;
- return true;
- }
-
- // Check whether inverting will help: {R,+,-1} == R - {0,+,1}.
- if (SE.getAddExpr(Requested->getStart(),
- SE.getNegativeSCEV(Requested)) == Phi) {
- InvertStep = true;
- return true;
- }
-
- return false;
-}
-
-static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
- if (!isa<IntegerType>(AR->getType()))
- return false;
-
- unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
- Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy),
- SE.getSignExtendExpr(AR, WideTy));
- const SCEV *ExtendAfterOp =
- SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy);
- return ExtendAfterOp == OpAfterExtend;
-}
-
-static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
- if (!isa<IntegerType>(AR->getType()))
- return false;
-
- unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
- Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy),
- SE.getZeroExtendExpr(AR, WideTy));
- const SCEV *ExtendAfterOp =
- SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy);
- return ExtendAfterOp == OpAfterExtend;
-}
-
-/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
-/// the base addrec, which is the addrec without any non-loop-dominating
-/// values, and return the PHI.
-PHINode *
-SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
- const Loop *L,
- Type *ExpandTy,
- Type *IntTy,
- Type *&TruncTy,
- bool &InvertStep) {
- assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
-
- // Reuse a previously-inserted PHI, if present.
- BasicBlock *LatchBlock = L->getLoopLatch();
- if (LatchBlock) {
- PHINode *AddRecPhiMatch = nullptr;
- Instruction *IncV = nullptr;
- TruncTy = nullptr;
- InvertStep = false;
-
- // Only try partially matching scevs that need truncation and/or
- // step-inversion if we know this loop is outside the current loop.
- bool TryNonMatchingSCEV =
- IVIncInsertLoop &&
- SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader());
-
- for (PHINode &PN : L->getHeader()->phis()) {
- if (!SE.isSCEVable(PN.getType()))
- continue;
-
+ IncV = Builder.CreateBitCast(IncV, PN->getType());
+ } else {
+ IncV = useSubtract ?
+ Builder.CreateSub(PN, StepV, Twine(IVName) + ".iv.next") :
+ Builder.CreateAdd(PN, StepV, Twine(IVName) + ".iv.next");
+ }
+ return IncV;
+}
+
+/// Hoist the addrec instruction chain rooted in the loop phi above the
+/// position. This routine assumes that this is possible (has been checked).
+void SCEVExpander::hoistBeforePos(DominatorTree *DT, Instruction *InstToHoist,
+ Instruction *Pos, PHINode *LoopPhi) {
+ do {
+ if (DT->dominates(InstToHoist, Pos))
+ break;
+ // Make sure the increment is where we want it. But don't move it
+ // down past a potential existing post-inc user.
+ fixupInsertPoints(InstToHoist);
+ InstToHoist->moveBefore(Pos);
+ Pos = InstToHoist;
+ InstToHoist = cast<Instruction>(InstToHoist->getOperand(0));
+ } while (InstToHoist != LoopPhi);
+}
+
+/// Check whether we can cheaply express the requested SCEV in terms of
+/// the available PHI SCEV by truncation and/or inversion of the step.
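+/// For example, if the loop already has an i64 PHI for {0,+,1} and the
+/// requested expression is the i32 {0,+,1}, a single truncate of the PHI is
+/// enough, so this returns true with InvertStep == false.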
+static bool canBeCheaplyTransformed(ScalarEvolution &SE,
+ const SCEVAddRecExpr *Phi,
+ const SCEVAddRecExpr *Requested,
+ bool &InvertStep) {
+ Type *PhiTy = SE.getEffectiveSCEVType(Phi->getType());
+ Type *RequestedTy = SE.getEffectiveSCEVType(Requested->getType());
+
+ if (RequestedTy->getIntegerBitWidth() > PhiTy->getIntegerBitWidth())
+ return false;
+
+  // Try to truncate it if necessary.
+ Phi = dyn_cast<SCEVAddRecExpr>(SE.getTruncateOrNoop(Phi, RequestedTy));
+ if (!Phi)
+ return false;
+
+ // Check whether truncation will help.
+ if (Phi == Requested) {
+ InvertStep = false;
+ return true;
+ }
+
+ // Check whether inverting will help: {R,+,-1} == R - {0,+,1}.
+ if (SE.getAddExpr(Requested->getStart(),
+ SE.getNegativeSCEV(Requested)) == Phi) {
+ InvertStep = true;
+ return true;
+ }
+
+ return false;
+}
+
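+/// Return true if the increment of this addrec is known not to wrap in the
+/// signed sense: sign-extending after adding the step gives the same result
+/// as adding the sign-extended operands in a type twice as wide.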
+static bool IsIncrementNSW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+ if (!isa<IntegerType>(AR->getType()))
+ return false;
+
+ unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *OpAfterExtend = SE.getAddExpr(SE.getSignExtendExpr(Step, WideTy),
+ SE.getSignExtendExpr(AR, WideTy));
+ const SCEV *ExtendAfterOp =
+ SE.getSignExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+ return ExtendAfterOp == OpAfterExtend;
+}
+
+static bool IsIncrementNUW(ScalarEvolution &SE, const SCEVAddRecExpr *AR) {
+ if (!isa<IntegerType>(AR->getType()))
+ return false;
+
+ unsigned BitWidth = cast<IntegerType>(AR->getType())->getBitWidth();
+ Type *WideTy = IntegerType::get(AR->getType()->getContext(), BitWidth * 2);
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *OpAfterExtend = SE.getAddExpr(SE.getZeroExtendExpr(Step, WideTy),
+ SE.getZeroExtendExpr(AR, WideTy));
+ const SCEV *ExtendAfterOp =
+ SE.getZeroExtendExpr(SE.getAddExpr(AR, Step), WideTy);
+ return ExtendAfterOp == OpAfterExtend;
+}
+
+/// getAddRecExprPHILiterally - Helper for expandAddRecExprLiterally. Expand
+/// the base addrec, which is the addrec without any non-loop-dominating
+/// values, and return the PHI.
+PHINode *
+SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
+ const Loop *L,
+ Type *ExpandTy,
+ Type *IntTy,
+ Type *&TruncTy,
+ bool &InvertStep) {
+ assert((!IVIncInsertLoop||IVIncInsertPos) && "Uninitialized insert position");
+
+ // Reuse a previously-inserted PHI, if present.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ if (LatchBlock) {
+ PHINode *AddRecPhiMatch = nullptr;
+ Instruction *IncV = nullptr;
+ TruncTy = nullptr;
+ InvertStep = false;
+
+ // Only try partially matching scevs that need truncation and/or
+ // step-inversion if we know this loop is outside the current loop.
+ bool TryNonMatchingSCEV =
+ IVIncInsertLoop &&
+ SE.DT.properlyDominates(LatchBlock, IVIncInsertLoop->getHeader());
+
+ for (PHINode &PN : L->getHeader()->phis()) {
+ if (!SE.isSCEVable(PN.getType()))
+ continue;
+
      // We should not look for an incomplete PHI. Getting SCEV for an incomplete
// PHI has no meaning at all.
if (!PN.isComplete()) {
@@ -1214,232 +1214,232 @@ SCEVExpander::getAddRecExprPHILiterally(const SCEVAddRecExpr *Normalized,
continue;
}
- const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
- if (!PhiSCEV)
- continue;
-
- bool IsMatchingSCEV = PhiSCEV == Normalized;
- // We only handle truncation and inversion of phi recurrences for the
- // expanded expression if the expanded expression's loop dominates the
- // loop we insert to. Check now, so we can bail out early.
- if (!IsMatchingSCEV && !TryNonMatchingSCEV)
- continue;
-
- // TODO: this possibly can be reworked to avoid this cast at all.
- Instruction *TempIncV =
- dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
- if (!TempIncV)
- continue;
-
- // Check whether we can reuse this PHI node.
- if (LSRMode) {
- if (!isExpandedAddRecExprPHI(&PN, TempIncV, L))
- continue;
- if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos))
- continue;
- } else {
- if (!isNormalAddRecExprPHI(&PN, TempIncV, L))
- continue;
- }
-
- // Stop if we have found an exact match SCEV.
- if (IsMatchingSCEV) {
- IncV = TempIncV;
- TruncTy = nullptr;
- InvertStep = false;
- AddRecPhiMatch = &PN;
- break;
- }
-
- // Try whether the phi can be translated into the requested form
- // (truncated and/or offset by a constant).
- if ((!TruncTy || InvertStep) &&
- canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) {
-        // Record the phi node. But don't stop; we might find an exact match
-        // later.
- AddRecPhiMatch = &PN;
- IncV = TempIncV;
- TruncTy = SE.getEffectiveSCEVType(Normalized->getType());
- }
- }
-
- if (AddRecPhiMatch) {
- // Potentially, move the increment. We have made sure in
- // isExpandedAddRecExprPHI or hoistIVInc that this is possible.
- if (L == IVIncInsertLoop)
- hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch);
-
- // Ok, the add recurrence looks usable.
- // Remember this PHI, even in post-inc mode.
- InsertedValues.insert(AddRecPhiMatch);
- // Remember the increment.
- rememberInstruction(IncV);
+ const SCEVAddRecExpr *PhiSCEV = dyn_cast<SCEVAddRecExpr>(SE.getSCEV(&PN));
+ if (!PhiSCEV)
+ continue;
+
+ bool IsMatchingSCEV = PhiSCEV == Normalized;
+ // We only handle truncation and inversion of phi recurrences for the
+ // expanded expression if the expanded expression's loop dominates the
+ // loop we insert to. Check now, so we can bail out early.
+ if (!IsMatchingSCEV && !TryNonMatchingSCEV)
+ continue;
+
+ // TODO: this possibly can be reworked to avoid this cast at all.
+ Instruction *TempIncV =
+ dyn_cast<Instruction>(PN.getIncomingValueForBlock(LatchBlock));
+ if (!TempIncV)
+ continue;
+
+ // Check whether we can reuse this PHI node.
+ if (LSRMode) {
+ if (!isExpandedAddRecExprPHI(&PN, TempIncV, L))
+ continue;
+ if (L == IVIncInsertLoop && !hoistIVInc(TempIncV, IVIncInsertPos))
+ continue;
+ } else {
+ if (!isNormalAddRecExprPHI(&PN, TempIncV, L))
+ continue;
+ }
+
+ // Stop if we have found an exact match SCEV.
+ if (IsMatchingSCEV) {
+ IncV = TempIncV;
+ TruncTy = nullptr;
+ InvertStep = false;
+ AddRecPhiMatch = &PN;
+ break;
+ }
+
+ // Try whether the phi can be translated into the requested form
+ // (truncated and/or offset by a constant).
+ if ((!TruncTy || InvertStep) &&
+ canBeCheaplyTransformed(SE, PhiSCEV, Normalized, InvertStep)) {
+        // Record the phi node. But don't stop; we might find an exact match
+        // later.
+ AddRecPhiMatch = &PN;
+ IncV = TempIncV;
+ TruncTy = SE.getEffectiveSCEVType(Normalized->getType());
+ }
+ }
+
+ if (AddRecPhiMatch) {
+ // Potentially, move the increment. We have made sure in
+ // isExpandedAddRecExprPHI or hoistIVInc that this is possible.
+ if (L == IVIncInsertLoop)
+ hoistBeforePos(&SE.DT, IncV, IVIncInsertPos, AddRecPhiMatch);
+
+ // Ok, the add recurrence looks usable.
+ // Remember this PHI, even in post-inc mode.
+ InsertedValues.insert(AddRecPhiMatch);
+ // Remember the increment.
+ rememberInstruction(IncV);
// Those values were not actually inserted but re-used.
ReusedValues.insert(AddRecPhiMatch);
ReusedValues.insert(IncV);
- return AddRecPhiMatch;
- }
- }
-
- // Save the original insertion point so we can restore it when we're done.
- SCEVInsertPointGuard Guard(Builder, this);
-
- // Another AddRec may need to be recursively expanded below. For example, if
- // this AddRec is quadratic, the StepV may itself be an AddRec in this
- // loop. Remove this loop from the PostIncLoops set before expanding such
- // AddRecs. Otherwise, we cannot find a valid position for the step
- // (i.e. StepV can never dominate its loop header). Ideally, we could do
- // SavedIncLoops.swap(PostIncLoops), but we generally have a single element,
- // so it's not worth implementing SmallPtrSet::swap.
- PostIncLoopSet SavedPostIncLoops = PostIncLoops;
- PostIncLoops.clear();
-
- // Expand code for the start value into the loop preheader.
- assert(L->getLoopPreheader() &&
- "Can't expand add recurrences without a loop preheader!");
+ return AddRecPhiMatch;
+ }
+ }
+
+ // Save the original insertion point so we can restore it when we're done.
+ SCEVInsertPointGuard Guard(Builder, this);
+
+ // Another AddRec may need to be recursively expanded below. For example, if
+ // this AddRec is quadratic, the StepV may itself be an AddRec in this
+ // loop. Remove this loop from the PostIncLoops set before expanding such
+ // AddRecs. Otherwise, we cannot find a valid position for the step
+ // (i.e. StepV can never dominate its loop header). Ideally, we could do
+ // SavedIncLoops.swap(PostIncLoops), but we generally have a single element,
+ // so it's not worth implementing SmallPtrSet::swap.
+ PostIncLoopSet SavedPostIncLoops = PostIncLoops;
+ PostIncLoops.clear();
+
+ // Expand code for the start value into the loop preheader.
+ assert(L->getLoopPreheader() &&
+ "Can't expand add recurrences without a loop preheader!");
Value *StartV =
expandCodeForImpl(Normalized->getStart(), ExpandTy,
L->getLoopPreheader()->getTerminator(), false);
-
-  // StartV must have been inserted into L's preheader to dominate the new
- // phi.
- assert(!isa<Instruction>(StartV) ||
- SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(),
- L->getHeader()));
-
- // Expand code for the step value. Do this before creating the PHI so that PHI
- // reuse code doesn't see an incomplete PHI.
- const SCEV *Step = Normalized->getStepRecurrence(SE);
- // If the stride is negative, insert a sub instead of an add for the increment
- // (unless it's a constant, because subtracts of constants are canonicalized
- // to adds).
- bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
- if (useSubtract)
- Step = SE.getNegativeSCEV(Step);
- // Expand the step somewhere that dominates the loop header.
+
+ // StartV must have been inserted into L's preheader to dominate the new
+ // phi.
+ assert(!isa<Instruction>(StartV) ||
+ SE.DT.properlyDominates(cast<Instruction>(StartV)->getParent(),
+ L->getHeader()));
+
+ // Expand code for the step value. Do this before creating the PHI so that PHI
+ // reuse code doesn't see an incomplete PHI.
+ const SCEV *Step = Normalized->getStepRecurrence(SE);
+ // If the stride is negative, insert a sub instead of an add for the increment
+ // (unless it's a constant, because subtracts of constants are canonicalized
+ // to adds).
+ bool useSubtract = !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+ if (useSubtract)
+ Step = SE.getNegativeSCEV(Step);
+ // Expand the step somewhere that dominates the loop header.
Value *StepV = expandCodeForImpl(
Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false);
-
- // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if
- // we actually do emit an addition. It does not apply if we emit a
- // subtraction.
- bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized);
- bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized);
-
- // Create the PHI.
- BasicBlock *Header = L->getHeader();
- Builder.SetInsertPoint(Header, Header->begin());
- pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
- PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE),
- Twine(IVName) + ".iv");
-
- // Create the step instructions and populate the PHI.
- for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
- BasicBlock *Pred = *HPI;
-
- // Add a start value.
- if (!L->contains(Pred)) {
- PN->addIncoming(StartV, Pred);
- continue;
- }
-
- // Create a step value and add it to the PHI.
- // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the
- // instructions at IVIncInsertPos.
- Instruction *InsertPos = L == IVIncInsertLoop ?
- IVIncInsertPos : Pred->getTerminator();
- Builder.SetInsertPoint(InsertPos);
- Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
-
- if (isa<OverflowingBinaryOperator>(IncV)) {
- if (IncrementIsNUW)
- cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
- if (IncrementIsNSW)
- cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
- }
- PN->addIncoming(IncV, Pred);
- }
-
- // After expanding subexpressions, restore the PostIncLoops set so the caller
- // can ensure that IVIncrement dominates the current uses.
- PostIncLoops = SavedPostIncLoops;
-
- // Remember this PHI, even in post-inc mode.
- InsertedValues.insert(PN);
-
- return PN;
-}
-
-Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
- Type *STy = S->getType();
- Type *IntTy = SE.getEffectiveSCEVType(STy);
- const Loop *L = S->getLoop();
-
- // Determine a normalized form of this expression, which is the expression
- // before any post-inc adjustment is made.
- const SCEVAddRecExpr *Normalized = S;
- if (PostIncLoops.count(L)) {
- PostIncLoopSet Loops;
- Loops.insert(L);
- Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
- }
-
- // Strip off any non-loop-dominating component from the addrec start.
- const SCEV *Start = Normalized->getStart();
- const SCEV *PostLoopOffset = nullptr;
- if (!SE.properlyDominates(Start, L->getHeader())) {
- PostLoopOffset = Start;
- Start = SE.getConstant(Normalized->getType(), 0);
- Normalized = cast<SCEVAddRecExpr>(
- SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
- Normalized->getLoop(),
- Normalized->getNoWrapFlags(SCEV::FlagNW)));
- }
-
- // Strip off any non-loop-dominating component from the addrec step.
- const SCEV *Step = Normalized->getStepRecurrence(SE);
- const SCEV *PostLoopScale = nullptr;
- if (!SE.dominates(Step, L->getHeader())) {
- PostLoopScale = Step;
- Step = SE.getConstant(Normalized->getType(), 1);
- if (!Start->isZero()) {
- // The normalization below assumes that Start is constant zero, so if
- // it isn't, re-associate Start to PostLoopOffset.
- assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?");
- PostLoopOffset = Start;
- Start = SE.getConstant(Normalized->getType(), 0);
- }
- Normalized =
- cast<SCEVAddRecExpr>(SE.getAddRecExpr(
- Start, Step, Normalized->getLoop(),
- Normalized->getNoWrapFlags(SCEV::FlagNW)));
- }
-
- // Expand the core addrec. If we need post-loop scaling, force it to
- // expand to an integer type to avoid the need for additional casting.
- Type *ExpandTy = PostLoopScale ? IntTy : STy;
- // We can't use a pointer type for the addrec if the pointer type is
- // non-integral.
- Type *AddRecPHIExpandTy =
- DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy;
-
- // In some cases, we decide to reuse an existing phi node but need to truncate
- // it and/or invert the step.
- Type *TruncTy = nullptr;
- bool InvertStep = false;
- PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy,
- IntTy, TruncTy, InvertStep);
-
- // Accommodate post-inc mode, if necessary.
- Value *Result;
- if (!PostIncLoops.count(L))
- Result = PN;
- else {
- // In PostInc mode, use the post-incremented value.
- BasicBlock *LatchBlock = L->getLoopLatch();
- assert(LatchBlock && "PostInc mode requires a unique loop latch!");
- Result = PN->getIncomingValueForBlock(LatchBlock);
-
+
+ // The no-wrap behavior proved by IsIncrement(NUW|NSW) is only applicable if
+ // we actually do emit an addition. It does not apply if we emit a
+ // subtraction.
+ bool IncrementIsNUW = !useSubtract && IsIncrementNUW(SE, Normalized);
+ bool IncrementIsNSW = !useSubtract && IsIncrementNSW(SE, Normalized);
+
+ // Create the PHI.
+ BasicBlock *Header = L->getHeader();
+ Builder.SetInsertPoint(Header, Header->begin());
+ pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+ PHINode *PN = Builder.CreatePHI(ExpandTy, std::distance(HPB, HPE),
+ Twine(IVName) + ".iv");
+
+ // Create the step instructions and populate the PHI.
+ for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+ BasicBlock *Pred = *HPI;
+
+ // Add a start value.
+ if (!L->contains(Pred)) {
+ PN->addIncoming(StartV, Pred);
+ continue;
+ }
+
+ // Create a step value and add it to the PHI.
+ // If IVIncInsertLoop is non-null and equal to the addrec's loop, insert the
+ // instructions at IVIncInsertPos.
+ Instruction *InsertPos = L == IVIncInsertLoop ?
+ IVIncInsertPos : Pred->getTerminator();
+ Builder.SetInsertPoint(InsertPos);
+ Value *IncV = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+
+ if (isa<OverflowingBinaryOperator>(IncV)) {
+ if (IncrementIsNUW)
+ cast<BinaryOperator>(IncV)->setHasNoUnsignedWrap();
+ if (IncrementIsNSW)
+ cast<BinaryOperator>(IncV)->setHasNoSignedWrap();
+ }
+ PN->addIncoming(IncV, Pred);
+ }
+
+ // After expanding subexpressions, restore the PostIncLoops set so the caller
+ // can ensure that IVIncrement dominates the current uses.
+ PostIncLoops = SavedPostIncLoops;
+
+ // Remember this PHI, even in post-inc mode.
+ InsertedValues.insert(PN);
+
+ return PN;
+}
+
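As a rough mental model of what getAddRecExprPHILiterally materializes, the phi-plus-increment pair corresponds to the scalar loop below. This is a hedged plain C++ sketch with illustrative names (Start, Step, TripCount), not SCEVExpander or LLVM API code.

    // Pre-increment IV: the phi takes StartV on the preheader edge and the
    // newly created increment on every in-loop predecessor edge.
    long ivPhiModel(long Start, long Step, long TripCount) {
      long IV = Start;                  // phi incoming value from the preheader
      long LastUse = IV;
      for (long N = 0; N < TripCount; ++N) {
        LastUse = IV;                   // in-loop uses see the pre-incremented value
        IV += Step;                     // increment on the back edge; a sub is used
                                        // instead when the step is a non-constant
      }                                 // negative value (the useSubtract path)
      return LastUse;
    }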
+Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
+ Type *STy = S->getType();
+ Type *IntTy = SE.getEffectiveSCEVType(STy);
+ const Loop *L = S->getLoop();
+
+ // Determine a normalized form of this expression, which is the expression
+ // before any post-inc adjustment is made.
+ const SCEVAddRecExpr *Normalized = S;
+ if (PostIncLoops.count(L)) {
+ PostIncLoopSet Loops;
+ Loops.insert(L);
+ Normalized = cast<SCEVAddRecExpr>(normalizeForPostIncUse(S, Loops, SE));
+ }
+
+ // Strip off any non-loop-dominating component from the addrec start.
+ const SCEV *Start = Normalized->getStart();
+ const SCEV *PostLoopOffset = nullptr;
+ if (!SE.properlyDominates(Start, L->getHeader())) {
+ PostLoopOffset = Start;
+ Start = SE.getConstant(Normalized->getType(), 0);
+ Normalized = cast<SCEVAddRecExpr>(
+ SE.getAddRecExpr(Start, Normalized->getStepRecurrence(SE),
+ Normalized->getLoop(),
+ Normalized->getNoWrapFlags(SCEV::FlagNW)));
+ }
+
+ // Strip off any non-loop-dominating component from the addrec step.
+ const SCEV *Step = Normalized->getStepRecurrence(SE);
+ const SCEV *PostLoopScale = nullptr;
+ if (!SE.dominates(Step, L->getHeader())) {
+ PostLoopScale = Step;
+ Step = SE.getConstant(Normalized->getType(), 1);
+ if (!Start->isZero()) {
+ // The normalization below assumes that Start is constant zero, so if
+ // it isn't, re-associate Start to PostLoopOffset.
+ assert(!PostLoopOffset && "Start not-null but PostLoopOffset set?");
+ PostLoopOffset = Start;
+ Start = SE.getConstant(Normalized->getType(), 0);
+ }
+ Normalized =
+ cast<SCEVAddRecExpr>(SE.getAddRecExpr(
+ Start, Step, Normalized->getLoop(),
+ Normalized->getNoWrapFlags(SCEV::FlagNW)));
+ }
+
+ // Expand the core addrec. If we need post-loop scaling, force it to
+ // expand to an integer type to avoid the need for additional casting.
+ Type *ExpandTy = PostLoopScale ? IntTy : STy;
+ // We can't use a pointer type for the addrec if the pointer type is
+ // non-integral.
+ Type *AddRecPHIExpandTy =
+ DL.isNonIntegralPointerType(STy) ? Normalized->getType() : ExpandTy;
+
+ // In some cases, we decide to reuse an existing phi node but need to truncate
+ // it and/or invert the step.
+ Type *TruncTy = nullptr;
+ bool InvertStep = false;
+ PHINode *PN = getAddRecExprPHILiterally(Normalized, L, AddRecPHIExpandTy,
+ IntTy, TruncTy, InvertStep);
+
+ // Accommodate post-inc mode, if necessary.
+ Value *Result;
+ if (!PostIncLoops.count(L))
+ Result = PN;
+ else {
+ // In PostInc mode, use the post-incremented value.
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ assert(LatchBlock && "PostInc mode requires a unique loop latch!");
+ Result = PN->getIncomingValueForBlock(LatchBlock);
+
// We might be introducing a new use of the post-inc IV that is not poison
// safe, in which case we should drop poison generating flags. Only keep
// those flags for which SCEV has proven that they always hold.
@@ -1451,361 +1451,361 @@ Value *SCEVExpander::expandAddRecExprLiterally(const SCEVAddRecExpr *S) {
I->setHasNoSignedWrap(false);
}
- // For an expansion to use the postinc form, the client must call
- // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
- // or dominated by IVIncInsertPos.
- if (isa<Instruction>(Result) &&
- !SE.DT.dominates(cast<Instruction>(Result),
- &*Builder.GetInsertPoint())) {
- // The induction variable's postinc expansion does not dominate this use.
- // IVUsers tries to prevent this case, so it is rare. However, it can
- // happen when an IVUser outside the loop is not dominated by the latch
- // block. Adjusting IVIncInsertPos before expansion begins cannot handle
- // all cases. Consider a phi outside whose operand is replaced during
- // expansion with the value of the postinc user. Without fundamentally
- // changing the way postinc users are tracked, the only remedy is
- // inserting an extra IV increment. StepV might fold into PostLoopOffset,
- // but hopefully expandCodeFor handles that.
- bool useSubtract =
- !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
- if (useSubtract)
- Step = SE.getNegativeSCEV(Step);
- Value *StepV;
- {
- // Expand the step somewhere that dominates the loop header.
- SCEVInsertPointGuard Guard(Builder, this);
+ // For an expansion to use the postinc form, the client must call
+ // expandCodeFor with an InsertPoint that is either outside the PostIncLoop
+ // or dominated by IVIncInsertPos.
+ if (isa<Instruction>(Result) &&
+ !SE.DT.dominates(cast<Instruction>(Result),
+ &*Builder.GetInsertPoint())) {
+ // The induction variable's postinc expansion does not dominate this use.
+ // IVUsers tries to prevent this case, so it is rare. However, it can
+ // happen when an IVUser outside the loop is not dominated by the latch
+ // block. Adjusting IVIncInsertPos before expansion begins cannot handle
+ // all cases. Consider a phi outside whose operand is replaced during
+ // expansion with the value of the postinc user. Without fundamentally
+ // changing the way postinc users are tracked, the only remedy is
+ // inserting an extra IV increment. StepV might fold into PostLoopOffset,
+ // but hopefully expandCodeFor handles that.
+ bool useSubtract =
+ !ExpandTy->isPointerTy() && Step->isNonConstantNegative();
+ if (useSubtract)
+ Step = SE.getNegativeSCEV(Step);
+ Value *StepV;
+ {
+ // Expand the step somewhere that dominates the loop header.
+ SCEVInsertPointGuard Guard(Builder, this);
StepV = expandCodeForImpl(
Step, IntTy, &*L->getHeader()->getFirstInsertionPt(), false);
- }
- Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
- }
- }
-
- // We have decided to reuse an induction variable of a dominating loop. Apply
- // truncation and/or inversion of the step.
- if (TruncTy) {
- Type *ResTy = Result->getType();
- // Normalize the result type.
- if (ResTy != SE.getEffectiveSCEVType(ResTy))
- Result = InsertNoopCastOfTo(Result, SE.getEffectiveSCEVType(ResTy));
- // Truncate the result.
+ }
+ Result = expandIVInc(PN, StepV, L, ExpandTy, IntTy, useSubtract);
+ }
+ }
+
+ // We have decided to reuse an induction variable of a dominating loop. Apply
+ // truncation and/or inversion of the step.
+ if (TruncTy) {
+ Type *ResTy = Result->getType();
+ // Normalize the result type.
+ if (ResTy != SE.getEffectiveSCEVType(ResTy))
+ Result = InsertNoopCastOfTo(Result, SE.getEffectiveSCEVType(ResTy));
+ // Truncate the result.
if (TruncTy != Result->getType())
- Result = Builder.CreateTrunc(Result, TruncTy);
+ Result = Builder.CreateTrunc(Result, TruncTy);
- // Invert the result.
+ // Invert the result.
if (InvertStep)
Result = Builder.CreateSub(
expandCodeForImpl(Normalized->getStart(), TruncTy, false), Result);
- }
-
- // Re-apply any non-loop-dominating scale.
- if (PostLoopScale) {
- assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
- Result = InsertNoopCastOfTo(Result, IntTy);
- Result = Builder.CreateMul(Result,
+ }
+
+ // Re-apply any non-loop-dominating scale.
+ if (PostLoopScale) {
+ assert(S->isAffine() && "Can't linearly scale non-affine recurrences.");
+ Result = InsertNoopCastOfTo(Result, IntTy);
+ Result = Builder.CreateMul(Result,
expandCodeForImpl(PostLoopScale, IntTy, false));
- }
-
- // Re-apply any non-loop-dominating offset.
- if (PostLoopOffset) {
- if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
- if (Result->getType()->isIntegerTy()) {
+ }
+
+ // Re-apply any non-loop-dominating offset.
+ if (PostLoopOffset) {
+ if (PointerType *PTy = dyn_cast<PointerType>(ExpandTy)) {
+ if (Result->getType()->isIntegerTy()) {
Value *Base = expandCodeForImpl(PostLoopOffset, ExpandTy, false);
- Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base);
- } else {
- Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result);
- }
- } else {
- Result = InsertNoopCastOfTo(Result, IntTy);
+ Result = expandAddToGEP(SE.getUnknown(Result), PTy, IntTy, Base);
+ } else {
+ Result = expandAddToGEP(PostLoopOffset, PTy, IntTy, Result);
+ }
+ } else {
+ Result = InsertNoopCastOfTo(Result, IntTy);
Result = Builder.CreateAdd(
Result, expandCodeForImpl(PostLoopOffset, IntTy, false));
- }
- }
-
- return Result;
-}
-
-Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
- // In canonical mode we compute the addrec as an expression of a canonical IV
- // using evaluateAtIteration and expand the resulting SCEV expression. This
- // way we avoid introducing new IVs to carry on the computation of the addrec
- // throughout the loop.
- //
- // For nested addrecs evaluateAtIteration might need a canonical IV of a
- // type wider than the addrec itself. Emitting a canonical IV of the
- // proper type might produce non-legal types, for example expanding an i64
- // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall
- // back to non-canonical mode for nested addrecs.
- if (!CanonicalMode || (S->getNumOperands() > 2))
- return expandAddRecExprLiterally(S);
-
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
- const Loop *L = S->getLoop();
-
- // First check for an existing canonical IV in a suitable type.
- PHINode *CanonicalIV = nullptr;
- if (PHINode *PN = L->getCanonicalInductionVariable())
- if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
- CanonicalIV = PN;
-
- // Rewrite an AddRec in terms of the canonical induction variable, if
- // its type is narrower.
- if (CanonicalIV &&
- SE.getTypeSizeInBits(CanonicalIV->getType()) >
- SE.getTypeSizeInBits(Ty)) {
- SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
- for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
- NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
- Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
- S->getNoWrapFlags(SCEV::FlagNW)));
- BasicBlock::iterator NewInsertPt =
+ }
+ }
+
+ return Result;
+}
+
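The start/step stripping and the PostLoopScale/PostLoopOffset re-application above reduce to simple arithmetic on the recurrence. A minimal sketch, assuming both the start A and the step B of {A,+,B} fail to dominate the loop header (plain C++; the values stand in for expanded SCEVs):

    // Inside the loop only the core recurrence {0,+,1} (the iteration number N)
    // is expanded; the stripped-off parts are re-applied after the fact.
    long literalAddRecModel(long A, long B, long N) {
      long Core = N;            // value of {0,+,1} at iteration N
      long Scaled = Core * B;   // re-apply PostLoopScale
      return Scaled + A;        // re-apply PostLoopOffset
    }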
+Value *SCEVExpander::visitAddRecExpr(const SCEVAddRecExpr *S) {
+ // In canonical mode we compute the addrec as an expression of a canonical IV
+ // using evaluateAtIteration and expand the resulting SCEV expression. This
+ // way we avoid introducing new IVs to carry on the computation of the addrec
+ // throughout the loop.
+ //
+ // For nested addrecs evaluateAtIteration might need a canonical IV of a
+ // type wider than the addrec itself. Emitting a canonical IV of the
+ // proper type might produce non-legal types, for example expanding an i64
+ // {0,+,2,+,1} addrec would need an i65 canonical IV. To avoid this just fall
+ // back to non-canonical mode for nested addrecs.
+ if (!CanonicalMode || (S->getNumOperands() > 2))
+ return expandAddRecExprLiterally(S);
+
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
+ const Loop *L = S->getLoop();
+
+ // First check for an existing canonical IV in a suitable type.
+ PHINode *CanonicalIV = nullptr;
+ if (PHINode *PN = L->getCanonicalInductionVariable())
+ if (SE.getTypeSizeInBits(PN->getType()) >= SE.getTypeSizeInBits(Ty))
+ CanonicalIV = PN;
+
+ // Rewrite an AddRec in terms of the canonical induction variable, if
+ // its type is narrower.
+ if (CanonicalIV &&
+ SE.getTypeSizeInBits(CanonicalIV->getType()) >
+ SE.getTypeSizeInBits(Ty)) {
+ SmallVector<const SCEV *, 4> NewOps(S->getNumOperands());
+ for (unsigned i = 0, e = S->getNumOperands(); i != e; ++i)
+ NewOps[i] = SE.getAnyExtendExpr(S->op_begin()[i], CanonicalIV->getType());
+ Value *V = expand(SE.getAddRecExpr(NewOps, S->getLoop(),
+ S->getNoWrapFlags(SCEV::FlagNW)));
+ BasicBlock::iterator NewInsertPt =
findInsertPointAfter(cast<Instruction>(V), &*Builder.GetInsertPoint());
V = expandCodeForImpl(SE.getTruncateExpr(SE.getUnknown(V), Ty), nullptr,
&*NewInsertPt, false);
- return V;
- }
-
- // {X,+,F} --> X + {0,+,F}
- if (!S->getStart()->isZero()) {
+ return V;
+ }
+
+ // {X,+,F} --> X + {0,+,F}
+ if (!S->getStart()->isZero()) {
SmallVector<const SCEV *, 4> NewOps(S->operands());
- NewOps[0] = SE.getConstant(Ty, 0);
- const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
- S->getNoWrapFlags(SCEV::FlagNW));
-
- // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
- // comments on expandAddToGEP for details.
- const SCEV *Base = S->getStart();
- // Dig into the expression to find the pointer base for a GEP.
- const SCEV *ExposedRest = Rest;
- ExposePointerBase(Base, ExposedRest, SE);
- // If we found a pointer, expand the AddRec with a GEP.
- if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
- // Make sure the Base isn't something exotic, such as a multiplied
- // or divided pointer value. In those cases, the result type isn't
- // actually a pointer type.
- if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
- Value *StartV = expand(Base);
- assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
- return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
- }
- }
-
- // Just do a normal add. Pre-expand the operands to suppress folding.
- //
- // The LHS and RHS values are factored out of the expand call to make the
- // output independent of the argument evaluation order.
- const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart()));
- const SCEV *AddExprRHS = SE.getUnknown(expand(Rest));
- return expand(SE.getAddExpr(AddExprLHS, AddExprRHS));
- }
-
- // If we don't yet have a canonical IV, create one.
- if (!CanonicalIV) {
- // Create and insert the PHI node for the induction variable in the
- // specified loop.
- BasicBlock *Header = L->getHeader();
- pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
- CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
- &Header->front());
- rememberInstruction(CanonicalIV);
-
- SmallSet<BasicBlock *, 4> PredSeen;
- Constant *One = ConstantInt::get(Ty, 1);
- for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
- BasicBlock *HP = *HPI;
- if (!PredSeen.insert(HP).second) {
- // There must be an incoming value for each predecessor, even the
- // duplicates!
- CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP);
- continue;
- }
-
- if (L->contains(HP)) {
- // Insert a unit add instruction right before the terminator
- // corresponding to the back-edge.
- Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
- "indvar.next",
- HP->getTerminator());
- Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
- rememberInstruction(Add);
- CanonicalIV->addIncoming(Add, HP);
- } else {
- CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
- }
- }
- }
-
- // {0,+,1} --> Insert a canonical induction variable into the loop!
- if (S->isAffine() && S->getOperand(1)->isOne()) {
- assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
- "IVs with types different from the canonical IV should "
- "already have been handled!");
- return CanonicalIV;
- }
-
- // {0,+,F} --> {0,+,1} * F
-
- // If this is a simple linear addrec, emit it now as a special case.
- if (S->isAffine()) // {0,+,F} --> i*F
- return
- expand(SE.getTruncateOrNoop(
- SE.getMulExpr(SE.getUnknown(CanonicalIV),
- SE.getNoopOrAnyExtend(S->getOperand(1),
- CanonicalIV->getType())),
- Ty));
-
- // If this is a chain of recurrences, turn it into a closed form, using the
- // folders, then expandCodeFor the closed form. This allows the folders to
- // simplify the expression without having to build a bunch of special code
- // into this folder.
- const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV.
-
- // Promote S up to the canonical IV type, if the cast is foldable.
- const SCEV *NewS = S;
- const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
- if (isa<SCEVAddRecExpr>(Ext))
- NewS = Ext;
-
- const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
- //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
-
- // Truncate the result down to the original type, if needed.
- const SCEV *T = SE.getTruncateOrNoop(V, Ty);
- return expand(T);
-}
-
+ NewOps[0] = SE.getConstant(Ty, 0);
+ const SCEV *Rest = SE.getAddRecExpr(NewOps, L,
+ S->getNoWrapFlags(SCEV::FlagNW));
+
+ // Turn things like ptrtoint+arithmetic+inttoptr into GEP. See the
+ // comments on expandAddToGEP for details.
+ const SCEV *Base = S->getStart();
+ // Dig into the expression to find the pointer base for a GEP.
+ const SCEV *ExposedRest = Rest;
+ ExposePointerBase(Base, ExposedRest, SE);
+ // If we found a pointer, expand the AddRec with a GEP.
+ if (PointerType *PTy = dyn_cast<PointerType>(Base->getType())) {
+ // Make sure the Base isn't something exotic, such as a multiplied
+ // or divided pointer value. In those cases, the result type isn't
+ // actually a pointer type.
+ if (!isa<SCEVMulExpr>(Base) && !isa<SCEVUDivExpr>(Base)) {
+ Value *StartV = expand(Base);
+ assert(StartV->getType() == PTy && "Pointer type mismatch for GEP!");
+ return expandAddToGEP(ExposedRest, PTy, Ty, StartV);
+ }
+ }
+
+ // Just do a normal add. Pre-expand the operands to suppress folding.
+ //
+ // The LHS and RHS values are factored out of the expand call to make the
+ // output independent of the argument evaluation order.
+ const SCEV *AddExprLHS = SE.getUnknown(expand(S->getStart()));
+ const SCEV *AddExprRHS = SE.getUnknown(expand(Rest));
+ return expand(SE.getAddExpr(AddExprLHS, AddExprRHS));
+ }
+
+ // If we don't yet have a canonical IV, create one.
+ if (!CanonicalIV) {
+ // Create and insert the PHI node for the induction variable in the
+ // specified loop.
+ BasicBlock *Header = L->getHeader();
+ pred_iterator HPB = pred_begin(Header), HPE = pred_end(Header);
+ CanonicalIV = PHINode::Create(Ty, std::distance(HPB, HPE), "indvar",
+ &Header->front());
+ rememberInstruction(CanonicalIV);
+
+ SmallSet<BasicBlock *, 4> PredSeen;
+ Constant *One = ConstantInt::get(Ty, 1);
+ for (pred_iterator HPI = HPB; HPI != HPE; ++HPI) {
+ BasicBlock *HP = *HPI;
+ if (!PredSeen.insert(HP).second) {
+ // There must be an incoming value for each predecessor, even the
+ // duplicates!
+ CanonicalIV->addIncoming(CanonicalIV->getIncomingValueForBlock(HP), HP);
+ continue;
+ }
+
+ if (L->contains(HP)) {
+ // Insert a unit add instruction right before the terminator
+ // corresponding to the back-edge.
+ Instruction *Add = BinaryOperator::CreateAdd(CanonicalIV, One,
+ "indvar.next",
+ HP->getTerminator());
+ Add->setDebugLoc(HP->getTerminator()->getDebugLoc());
+ rememberInstruction(Add);
+ CanonicalIV->addIncoming(Add, HP);
+ } else {
+ CanonicalIV->addIncoming(Constant::getNullValue(Ty), HP);
+ }
+ }
+ }
+
+ // {0,+,1} --> Insert a canonical induction variable into the loop!
+ if (S->isAffine() && S->getOperand(1)->isOne()) {
+ assert(Ty == SE.getEffectiveSCEVType(CanonicalIV->getType()) &&
+ "IVs with types different from the canonical IV should "
+ "already have been handled!");
+ return CanonicalIV;
+ }
+
+ // {0,+,F} --> {0,+,1} * F
+
+ // If this is a simple linear addrec, emit it now as a special case.
+ if (S->isAffine()) // {0,+,F} --> i*F
+ return
+ expand(SE.getTruncateOrNoop(
+ SE.getMulExpr(SE.getUnknown(CanonicalIV),
+ SE.getNoopOrAnyExtend(S->getOperand(1),
+ CanonicalIV->getType())),
+ Ty));
+
+ // If this is a chain of recurrences, turn it into a closed form, using the
+ // folders, then expandCodeFor the closed form. This allows the folders to
+ // simplify the expression without having to build a bunch of special code
+ // into this folder.
+ const SCEV *IH = SE.getUnknown(CanonicalIV); // Get I as a "symbolic" SCEV.
+
+ // Promote S up to the canonical IV type, if the cast is foldable.
+ const SCEV *NewS = S;
+ const SCEV *Ext = SE.getNoopOrAnyExtend(S, CanonicalIV->getType());
+ if (isa<SCEVAddRecExpr>(Ext))
+ NewS = Ext;
+
+ const SCEV *V = cast<SCEVAddRecExpr>(NewS)->evaluateAtIteration(IH, SE);
+ //cerr << "Evaluated: " << *this << "\n to: " << *V << "\n";
+
+ // Truncate the result down to the original type, if needed.
+ const SCEV *T = SE.getTruncateOrNoop(V, Ty);
+ return expand(T);
+}
+
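evaluateAtIteration computes the standard binomial closed form of an add recurrence. visitAddRecExpr above only takes this path for affine recurrences (it falls back to literal expansion when there are more operands), but the quadratic case shows roughly why the comment above mentions needing an i65 canonical IV. A plain C++ sketch of the formulas, not the SCEV folders themselves:

    // Value of an add recurrence at iteration I of its loop:
    //   {0,+,F}      ->  I * F                      (affine case)
    //   {A,+,B,+,C}  ->  A + B*I + C*(I*(I-1)/2)    (quadratic case; e.g.
    //                                                {0,+,2,+,1} is 2*I + I*(I-1)/2)
    long evalAffine(long F, long I) { return I * F; }
    long evalQuadratic(long A, long B, long C, long I) {
      return A + B * I + C * (I * (I - 1) / 2);
    }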
Value *SCEVExpander::visitPtrToIntExpr(const SCEVPtrToIntExpr *S) {
Value *V =
expandCodeForImpl(S->getOperand(), S->getOperand()->getType(), false);
return Builder.CreatePtrToInt(V, S->getType());
}
-Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+Value *SCEVExpander::visitTruncateExpr(const SCEVTruncateExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateTrunc(V, Ty);
-}
-
-Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+}
+
+Value *SCEVExpander::visitZeroExtendExpr(const SCEVZeroExtendExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateZExt(V, Ty);
-}
-
-Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
- Type *Ty = SE.getEffectiveSCEVType(S->getType());
+}
+
+Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) {
+ Type *Ty = SE.getEffectiveSCEVType(S->getType());
Value *V = expandCodeForImpl(
S->getOperand(), SE.getEffectiveSCEVType(S->getOperand()->getType()),
false);
return Builder.CreateSExt(V, Ty);
-}
-
-Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands()-2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+}
+
+Value *SCEVExpander::visitSMaxExpr(const SCEVSMaxExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands()-2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpSGT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smax");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitUMaxExpr(const SCEVUMaxExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands()-1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands()-2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands() - 2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpUGT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umax");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitSMinExpr(const SCEVSMinExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands() - 2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpSLT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
-Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
- Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
- Type *Ty = LHS->getType();
- for (int i = S->getNumOperands() - 2; i >= 0; --i) {
- // In the case of mixed integer and pointer types, do the
- // rest of the comparisons as integer.
- Type *OpTy = S->getOperand(i)->getType();
- if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
- Ty = SE.getEffectiveSCEVType(Ty);
- LHS = InsertNoopCastOfTo(LHS, Ty);
- }
+ Value *ICmp = Builder.CreateICmpSLT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "smin");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
+Value *SCEVExpander::visitUMinExpr(const SCEVUMinExpr *S) {
+ Value *LHS = expand(S->getOperand(S->getNumOperands() - 1));
+ Type *Ty = LHS->getType();
+ for (int i = S->getNumOperands() - 2; i >= 0; --i) {
+ // In the case of mixed integer and pointer types, do the
+ // rest of the comparisons as integer.
+ Type *OpTy = S->getOperand(i)->getType();
+ if (OpTy->isIntegerTy() != Ty->isIntegerTy()) {
+ Ty = SE.getEffectiveSCEVType(Ty);
+ LHS = InsertNoopCastOfTo(LHS, Ty);
+ }
Value *RHS = expandCodeForImpl(S->getOperand(i), Ty, false);
- Value *ICmp = Builder.CreateICmpULT(LHS, RHS);
- Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin");
- LHS = Sel;
- }
- // In the case of mixed integer and pointer types, cast the
- // final result back to the pointer type.
- if (LHS->getType() != S->getType())
- LHS = InsertNoopCastOfTo(LHS, S->getType());
- return LHS;
-}
-
+ Value *ICmp = Builder.CreateICmpULT(LHS, RHS);
+ Value *Sel = Builder.CreateSelect(ICmp, LHS, RHS, "umin");
+ LHS = Sel;
+ }
+ // In the case of mixed integer and pointer types, cast the
+ // final result back to the pointer type.
+ if (LHS->getType() != S->getType())
+ LHS = InsertNoopCastOfTo(LHS, S->getType());
+ return LHS;
+}
+
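Each of the four min/max visitors above reduces an n-ary expression to a chain of compare-plus-select instructions. The same fold in plain C++, shown for signed max; the other three differ only in the comparison used (illustrative sketch, not expander code):

    long smaxChainModel(const long *Ops, int NumOps) {
      long Acc = Ops[NumOps - 1];              // start from the last operand
      for (int I = NumOps - 2; I >= 0; --I)
        Acc = (Acc > Ops[I]) ? Acc : Ops[I];   // icmp sgt + select, named "smax"
      return Acc;
    }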
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty,
Instruction *IP, bool Root) {
- setInsertPoint(IP);
+ setInsertPoint(IP);
Value *V = expandCodeForImpl(SH, Ty, Root);
return V;
-}
-
+}
+
Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
- // Expand the code for this SCEV.
- Value *V = expand(SH);
+ // Expand the code for this SCEV.
+ Value *V = expand(SH);
if (PreserveLCSSA) {
if (auto *Inst = dyn_cast<Instruction>(V)) {
@@ -1835,147 +1835,147 @@ Value *SCEVExpander::expandCodeForImpl(const SCEV *SH, Type *Ty, bool Root) {
}
InsertedExpressions[std::make_pair(SH, &*Builder.GetInsertPoint())] = V;
- if (Ty) {
- assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
- "non-trivial casts should be done with the SCEVs directly!");
- V = InsertNoopCastOfTo(V, Ty);
- }
- return V;
-}
-
-ScalarEvolution::ValueOffsetPair
-SCEVExpander::FindValueInExprValueMap(const SCEV *S,
- const Instruction *InsertPt) {
- SetVector<ScalarEvolution::ValueOffsetPair> *Set = SE.getSCEVValues(S);
- // If the expansion is not in CanonicalMode, and the SCEV contains any
- // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally.
- if (CanonicalMode || !SE.containsAddRecurrence(S)) {
- // If S is scConstant, it may be worse to reuse an existing Value.
- if (S->getSCEVType() != scConstant && Set) {
- // Choose a Value from the set which dominates the insertPt.
- // insertPt should be inside the Value's parent loop so as not to break
- // the LCSSA form.
- for (auto const &VOPair : *Set) {
- Value *V = VOPair.first;
- ConstantInt *Offset = VOPair.second;
- Instruction *EntInst = nullptr;
- if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) &&
- S->getType() == V->getType() &&
- EntInst->getFunction() == InsertPt->getFunction() &&
- SE.DT.dominates(EntInst, InsertPt) &&
- (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
- SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
- return {V, Offset};
- }
- }
- }
- return {nullptr, nullptr};
-}
-
-// The expansion of SCEV will either reuse a previous Value in ExprValueMap,
-// or expand the SCEV literally. Specifically, if the expansion is in LSRMode,
-// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded
-// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise,
-// the expansion will try to reuse Value from ExprValueMap, and only when it
-// fails, expand the SCEV literally.
-Value *SCEVExpander::expand(const SCEV *S) {
- // Compute an insertion point for this SCEV object. Hoist the instructions
- // as far out in the loop nest as possible.
- Instruction *InsertPt = &*Builder.GetInsertPoint();
-
- // We can move the insertion point only if there are no div or rem operations;
- // otherwise we risk moving it past the check for a zero denominator.
- auto SafeToHoist = [](const SCEV *S) {
- return !SCEVExprContains(S, [](const SCEV *S) {
- if (const auto *D = dyn_cast<SCEVUDivExpr>(S)) {
- if (const auto *SC = dyn_cast<SCEVConstant>(D->getRHS()))
- // Division by non-zero constants can be hoisted.
- return SC->getValue()->isZero();
- // All other divisions should not be moved as they may be
- // divisions by zero and should be kept within the
- // conditions of the surrounding loops that guard their
- // execution (see PR35406).
- return true;
- }
- return false;
- });
- };
- if (SafeToHoist(S)) {
- for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());;
- L = L->getParentLoop()) {
- if (SE.isLoopInvariant(S, L)) {
- if (!L) break;
- if (BasicBlock *Preheader = L->getLoopPreheader())
- InsertPt = Preheader->getTerminator();
- else
- // LSR sets the insertion point for AddRec start/step values to the
- // block start to simplify value reuse, even though it's an invalid
- // position. SCEVExpander must correct for this in all cases.
- InsertPt = &*L->getHeader()->getFirstInsertionPt();
- } else {
- // If the SCEV is computable at this level, insert it into the header
- // after the PHIs (and after any other instructions that we've inserted
- // there) so that it is guaranteed to dominate any user inside the loop.
- if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
- InsertPt = &*L->getHeader()->getFirstInsertionPt();
-
- while (InsertPt->getIterator() != Builder.GetInsertPoint() &&
- (isInsertedInstruction(InsertPt) ||
+ if (Ty) {
+ assert(SE.getTypeSizeInBits(Ty) == SE.getTypeSizeInBits(SH->getType()) &&
+ "non-trivial casts should be done with the SCEVs directly!");
+ V = InsertNoopCastOfTo(V, Ty);
+ }
+ return V;
+}
+
+ScalarEvolution::ValueOffsetPair
+SCEVExpander::FindValueInExprValueMap(const SCEV *S,
+ const Instruction *InsertPt) {
+ SetVector<ScalarEvolution::ValueOffsetPair> *Set = SE.getSCEVValues(S);
+ // If the expansion is not in CanonicalMode, and the SCEV contains any
+ // sub scAddRecExpr type SCEV, it is required to expand the SCEV literally.
+ if (CanonicalMode || !SE.containsAddRecurrence(S)) {
+ // If S is scConstant, it may be worse to reuse an existing Value.
+ if (S->getSCEVType() != scConstant && Set) {
+ // Choose a Value from the set which dominates the insertPt.
+ // insertPt should be inside the Value's parent loop so as not to break
+ // the LCSSA form.
+ for (auto const &VOPair : *Set) {
+ Value *V = VOPair.first;
+ ConstantInt *Offset = VOPair.second;
+ Instruction *EntInst = nullptr;
+ if (V && isa<Instruction>(V) && (EntInst = cast<Instruction>(V)) &&
+ S->getType() == V->getType() &&
+ EntInst->getFunction() == InsertPt->getFunction() &&
+ SE.DT.dominates(EntInst, InsertPt) &&
+ (SE.LI.getLoopFor(EntInst->getParent()) == nullptr ||
+ SE.LI.getLoopFor(EntInst->getParent())->contains(InsertPt)))
+ return {V, Offset};
+ }
+ }
+ }
+ return {nullptr, nullptr};
+}
+
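A hit from this lookup can carry a constant offset: the cached value equals the requested expression plus that offset, and expand() below compensates for it. For the integer case the compensation is a single subtraction (a sketch; the pointer case emits a GEP with a negated index instead):

    long reuseCachedValue(long CachedV, long Offset) {
      // Cached relation: CachedV == S + Offset, hence S == CachedV - Offset.
      return CachedV - Offset;
    }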
+// The expansion of SCEV will either reuse a previous Value in ExprValueMap,
+// or expand the SCEV literally. Specifically, if the expansion is in LSRMode,
+// and the SCEV contains any sub scAddRecExpr type SCEV, it will be expanded
+// literally, to prevent LSR's transformed SCEV from being reverted. Otherwise,
+// the expansion will try to reuse Value from ExprValueMap, and only when it
+// fails, expand the SCEV literally.
+Value *SCEVExpander::expand(const SCEV *S) {
+ // Compute an insertion point for this SCEV object. Hoist the instructions
+ // as far out in the loop nest as possible.
+ Instruction *InsertPt = &*Builder.GetInsertPoint();
+
+ // We can move the insertion point only if there are no div or rem operations;
+ // otherwise we risk moving it past the check for a zero denominator.
+ auto SafeToHoist = [](const SCEV *S) {
+ return !SCEVExprContains(S, [](const SCEV *S) {
+ if (const auto *D = dyn_cast<SCEVUDivExpr>(S)) {
+ if (const auto *SC = dyn_cast<SCEVConstant>(D->getRHS()))
+ // Division by non-zero constants can be hoisted.
+ return SC->getValue()->isZero();
+ // All other divisions should not be moved as they may be
+ // divisions by zero and should be kept within the
+ // conditions of the surrounding loops that guard their
+ // execution (see PR35406).
+ return true;
+ }
+ return false;
+ });
+ };
+ if (SafeToHoist(S)) {
+ for (Loop *L = SE.LI.getLoopFor(Builder.GetInsertBlock());;
+ L = L->getParentLoop()) {
+ if (SE.isLoopInvariant(S, L)) {
+ if (!L) break;
+ if (BasicBlock *Preheader = L->getLoopPreheader())
+ InsertPt = Preheader->getTerminator();
+ else
+ // LSR sets the insertion point for AddRec start/step values to the
+ // block start to simplify value reuse, even though it's an invalid
+ // position. SCEVExpander must correct for this in all cases.
+ InsertPt = &*L->getHeader()->getFirstInsertionPt();
+ } else {
+ // If the SCEV is computable at this level, insert it into the header
+ // after the PHIs (and after any other instructions that we've inserted
+ // there) so that it is guaranteed to dominate any user inside the loop.
+ if (L && SE.hasComputableLoopEvolution(S, L) && !PostIncLoops.count(L))
+ InsertPt = &*L->getHeader()->getFirstInsertionPt();
+
+ while (InsertPt->getIterator() != Builder.GetInsertPoint() &&
+ (isInsertedInstruction(InsertPt) ||
isa<DbgInfoIntrinsic>(InsertPt))) {
- InsertPt = &*std::next(InsertPt->getIterator());
+ InsertPt = &*std::next(InsertPt->getIterator());
}
- break;
- }
- }
- }
-
- // Check to see if we already expanded this here.
- auto I = InsertedExpressions.find(std::make_pair(S, InsertPt));
- if (I != InsertedExpressions.end())
- return I->second;
-
- SCEVInsertPointGuard Guard(Builder, this);
- Builder.SetInsertPoint(InsertPt);
-
- // Expand the expression into instructions.
- ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt);
- Value *V = VO.first;
-
- if (!V)
- V = visit(S);
- else if (VO.second) {
- if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
- Type *Ety = Vty->getPointerElementType();
- int64_t Offset = VO.second->getSExtValue();
- int64_t ESize = SE.getTypeSizeInBits(Ety);
- if ((Offset * 8) % ESize == 0) {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
- V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
- } else {
- ConstantInt *Idx =
- ConstantInt::getSigned(VO.second->getType(), -Offset);
- unsigned AS = Vty->getAddressSpace();
- V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
- V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
- "uglygep");
- V = Builder.CreateBitCast(V, Vty);
- }
- } else {
- V = Builder.CreateSub(V, VO.second);
- }
- }
- // Remember the expanded value for this SCEV at this location.
- //
- // This is independent of PostIncLoops. The mapped value simply materializes
- // the expression at this insertion point. If the mapped value happened to be
- // a postinc expansion, it could be reused by a non-postinc user, but only if
- // its insertion point was already at the head of the loop.
- InsertedExpressions[std::make_pair(S, InsertPt)] = V;
- return V;
-}
-
-void SCEVExpander::rememberInstruction(Value *I) {
+ break;
+ }
+ }
+ }
+
+ // Check to see if we already expanded this here.
+ auto I = InsertedExpressions.find(std::make_pair(S, InsertPt));
+ if (I != InsertedExpressions.end())
+ return I->second;
+
+ SCEVInsertPointGuard Guard(Builder, this);
+ Builder.SetInsertPoint(InsertPt);
+
+ // Expand the expression into instructions.
+ ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, InsertPt);
+ Value *V = VO.first;
+
+ if (!V)
+ V = visit(S);
+ else if (VO.second) {
+ if (PointerType *Vty = dyn_cast<PointerType>(V->getType())) {
+ Type *Ety = Vty->getPointerElementType();
+ int64_t Offset = VO.second->getSExtValue();
+ int64_t ESize = SE.getTypeSizeInBits(Ety);
+ if ((Offset * 8) % ESize == 0) {
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -(Offset * 8) / ESize);
+ V = Builder.CreateGEP(Ety, V, Idx, "scevgep");
+ } else {
+ ConstantInt *Idx =
+ ConstantInt::getSigned(VO.second->getType(), -Offset);
+ unsigned AS = Vty->getAddressSpace();
+ V = Builder.CreateBitCast(V, Type::getInt8PtrTy(SE.getContext(), AS));
+ V = Builder.CreateGEP(Type::getInt8Ty(SE.getContext()), V, Idx,
+ "uglygep");
+ V = Builder.CreateBitCast(V, Vty);
+ }
+ } else {
+ V = Builder.CreateSub(V, VO.second);
+ }
+ }
+ // Remember the expanded value for this SCEV at this location.
+ //
+ // This is independent of PostIncLoops. The mapped value simply materializes
+ // the expression at this insertion point. If the mapped value happened to be
+ // a postinc expansion, it could be reused by a non-postinc user, but only if
+ // its insertion point was already at the head of the loop.
+ InsertedExpressions[std::make_pair(S, InsertPt)] = V;
+ return V;
+}
+
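The SafeToHoist lambda above refuses to hoist divisions whose divisor is not a known non-zero constant, because hoisting could move the division above the guard that keeps it from dividing by zero (see PR35406). A minimal C++ illustration of the hazard, with hypothetical names:

    long guardedDivide(long Num, long Den) {
      long Result = 0;
      if (Den != 0)
        Result = Num / Den;   // safe: execution is guarded by Den != 0
      // Hoisting Num / Den above the guard would evaluate it even when
      // Den == 0, which is exactly what the check prevents.
      return Result;
    }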
+void SCEVExpander::rememberInstruction(Value *I) {
auto DoInsert = [this](Value *V) {
if (!PostIncLoops.empty())
InsertedPostIncValues.insert(V);
@@ -1983,10 +1983,10 @@ void SCEVExpander::rememberInstruction(Value *I) {
InsertedValues.insert(V);
};
DoInsert(I);
-
+
if (!PreserveLCSSA)
return;
-
+
if (auto *Inst = dyn_cast<Instruction>(I)) {
// A new instruction has been added, which might introduce new uses outside
// a defining loop. Fix LCSSA from for each operand of the new instruction,
@@ -1995,190 +1995,190 @@ void SCEVExpander::rememberInstruction(Value *I) {
OpIdx++)
fixupLCSSAFormFor(Inst, OpIdx);
}
-}
-
-/// replaceCongruentIVs - Check for congruent phis in this loop header and
-/// replace them with their most canonical representative. Return the number of
-/// phis eliminated.
-///
-/// This does not depend on any SCEVExpander state but should be used in
-/// the same context that SCEVExpander is used.
-unsigned
-SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
- SmallVectorImpl<WeakTrackingVH> &DeadInsts,
- const TargetTransformInfo *TTI) {
- // Find integer phis in order of increasing width.
- SmallVector<PHINode*, 8> Phis;
- for (PHINode &PN : L->getHeader()->phis())
- Phis.push_back(&PN);
-
- if (TTI)
- llvm::sort(Phis, [](Value *LHS, Value *RHS) {
- // Put pointers at the back and make sure pointer < pointer = false.
- if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
- return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
+}
+
+/// replaceCongruentIVs - Check for congruent phis in this loop header and
+/// replace them with their most canonical representative. Return the number of
+/// phis eliminated.
+///
+/// This does not depend on any SCEVExpander state but should be used in
+/// the same context that SCEVExpander is used.
+unsigned
+SCEVExpander::replaceCongruentIVs(Loop *L, const DominatorTree *DT,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts,
+ const TargetTransformInfo *TTI) {
+ // Find integer phis in order of increasing width.
+ SmallVector<PHINode*, 8> Phis;
+ for (PHINode &PN : L->getHeader()->phis())
+ Phis.push_back(&PN);
+
+ if (TTI)
+ llvm::sort(Phis, [](Value *LHS, Value *RHS) {
+ // Put pointers at the back and make sure pointer < pointer = false.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return RHS->getType()->isIntegerTy() && !LHS->getType()->isIntegerTy();
return RHS->getType()->getPrimitiveSizeInBits().getFixedSize() <
LHS->getType()->getPrimitiveSizeInBits().getFixedSize();
- });
-
- unsigned NumElim = 0;
- DenseMap<const SCEV *, PHINode *> ExprToIVMap;
- // Process phis from wide to narrow. Map wide phis to their truncation
- // so narrow phis can reuse them.
- for (PHINode *Phi : Phis) {
- auto SimplifyPHINode = [&](PHINode *PN) -> Value * {
- if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC}))
- return V;
- if (!SE.isSCEVable(PN->getType()))
- return nullptr;
- auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN));
- if (!Const)
- return nullptr;
- return Const->getValue();
- };
-
- // Fold constant phis. They may be congruent to other constant phis and
- // would confuse the logic below that expects proper IVs.
- if (Value *V = SimplifyPHINode(Phi)) {
- if (V->getType() != Phi->getType())
- continue;
- Phi->replaceAllUsesWith(V);
- DeadInsts.emplace_back(Phi);
- ++NumElim;
- DEBUG_WITH_TYPE(DebugType, dbgs()
- << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
- continue;
- }
-
- if (!SE.isSCEVable(Phi->getType()))
- continue;
-
- PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
- if (!OrigPhiRef) {
- OrigPhiRef = Phi;
- if (Phi->getType()->isIntegerTy() && TTI &&
- TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
- // This phi can be freely truncated to the narrowest phi type. Map the
- // truncated expression to it so it will be reused for narrow types.
- const SCEV *TruncExpr =
- SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType());
- ExprToIVMap[TruncExpr] = Phi;
- }
- continue;
- }
-
- // Replacing a pointer phi with an integer phi or vice-versa doesn't make
- // sense.
- if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy())
- continue;
-
- if (BasicBlock *LatchBlock = L->getLoopLatch()) {
- Instruction *OrigInc = dyn_cast<Instruction>(
- OrigPhiRef->getIncomingValueForBlock(LatchBlock));
- Instruction *IsomorphicInc =
- dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
-
- if (OrigInc && IsomorphicInc) {
- // If this phi has the same width but is more canonical, replace the
- // original with it. As part of the "more canonical" determination,
- // respect a prior decision to use an IV chain.
- if (OrigPhiRef->getType() == Phi->getType() &&
- !(ChainedPhis.count(Phi) ||
- isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) &&
- (ChainedPhis.count(Phi) ||
- isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
- std::swap(OrigPhiRef, Phi);
- std::swap(OrigInc, IsomorphicInc);
- }
- // Replacing the congruent phi is sufficient because acyclic
- // redundancy elimination, CSE/GVN, should handle the
- // rest. However, once SCEV proves that a phi is congruent,
- // it's often the head of an IV user cycle that is isomorphic
- // with the original phi. It's worth eagerly cleaning up the
- // common case of a single IV increment so that DeleteDeadPHIs
- // can remove cycles that had postinc uses.
- const SCEV *TruncExpr =
- SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
- if (OrigInc != IsomorphicInc &&
- TruncExpr == SE.getSCEV(IsomorphicInc) &&
- SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) &&
- hoistIVInc(OrigInc, IsomorphicInc)) {
- DEBUG_WITH_TYPE(DebugType,
- dbgs() << "INDVARS: Eliminated congruent iv.inc: "
- << *IsomorphicInc << '\n');
- Value *NewInc = OrigInc;
- if (OrigInc->getType() != IsomorphicInc->getType()) {
- Instruction *IP = nullptr;
- if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
- IP = &*PN->getParent()->getFirstInsertionPt();
- else
- IP = OrigInc->getNextNode();
-
- IRBuilder<> Builder(IP);
- Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
- NewInc = Builder.CreateTruncOrBitCast(
- OrigInc, IsomorphicInc->getType(), IVName);
- }
- IsomorphicInc->replaceAllUsesWith(NewInc);
- DeadInsts.emplace_back(IsomorphicInc);
- }
- }
- }
- DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
- << *Phi << '\n');
+ });
+
+ unsigned NumElim = 0;
+ DenseMap<const SCEV *, PHINode *> ExprToIVMap;
+ // Process phis from wide to narrow. Map wide phis to their truncation
+ // so narrow phis can reuse them.
+ for (PHINode *Phi : Phis) {
+ auto SimplifyPHINode = [&](PHINode *PN) -> Value * {
+ if (Value *V = SimplifyInstruction(PN, {DL, &SE.TLI, &SE.DT, &SE.AC}))
+ return V;
+ if (!SE.isSCEVable(PN->getType()))
+ return nullptr;
+ auto *Const = dyn_cast<SCEVConstant>(SE.getSCEV(PN));
+ if (!Const)
+ return nullptr;
+ return Const->getValue();
+ };
+
+ // Fold constant phis. They may be congruent to other constant phis and
+ // would confuse the logic below that expects proper IVs.
+ if (Value *V = SimplifyPHINode(Phi)) {
+ if (V->getType() != Phi->getType())
+ continue;
+ Phi->replaceAllUsesWith(V);
+ DeadInsts.emplace_back(Phi);
+ ++NumElim;
+ DEBUG_WITH_TYPE(DebugType, dbgs()
+ << "INDVARS: Eliminated constant iv: " << *Phi << '\n');
+ continue;
+ }
+
+ if (!SE.isSCEVable(Phi->getType()))
+ continue;
+
+ PHINode *&OrigPhiRef = ExprToIVMap[SE.getSCEV(Phi)];
+ if (!OrigPhiRef) {
+ OrigPhiRef = Phi;
+ if (Phi->getType()->isIntegerTy() && TTI &&
+ TTI->isTruncateFree(Phi->getType(), Phis.back()->getType())) {
+ // This phi can be freely truncated to the narrowest phi type. Map the
+ // truncated expression to it so it will be reused for narrow types.
+ const SCEV *TruncExpr =
+ SE.getTruncateExpr(SE.getSCEV(Phi), Phis.back()->getType());
+ ExprToIVMap[TruncExpr] = Phi;
+ }
+ continue;
+ }
+
+ // Replacing a pointer phi with an integer phi or vice-versa doesn't make
+ // sense.
+ if (OrigPhiRef->getType()->isPointerTy() != Phi->getType()->isPointerTy())
+ continue;
+
+ if (BasicBlock *LatchBlock = L->getLoopLatch()) {
+ Instruction *OrigInc = dyn_cast<Instruction>(
+ OrigPhiRef->getIncomingValueForBlock(LatchBlock));
+ Instruction *IsomorphicInc =
+ dyn_cast<Instruction>(Phi->getIncomingValueForBlock(LatchBlock));
+
+ if (OrigInc && IsomorphicInc) {
+ // If this phi has the same width but is more canonical, replace the
+ // original with it. As part of the "more canonical" determination,
+ // respect a prior decision to use an IV chain.
+ if (OrigPhiRef->getType() == Phi->getType() &&
+ !(ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(OrigPhiRef, OrigInc, L)) &&
+ (ChainedPhis.count(Phi) ||
+ isExpandedAddRecExprPHI(Phi, IsomorphicInc, L))) {
+ std::swap(OrigPhiRef, Phi);
+ std::swap(OrigInc, IsomorphicInc);
+ }
+ // Replacing the congruent phi is sufficient because acyclic
+ // redundancy elimination, CSE/GVN, should handle the
+ // rest. However, once SCEV proves that a phi is congruent,
+ // it's often the head of an IV user cycle that is isomorphic
+ // with the original phi. It's worth eagerly cleaning up the
+ // common case of a single IV increment so that DeleteDeadPHIs
+ // can remove cycles that had postinc uses.
+ const SCEV *TruncExpr =
+ SE.getTruncateOrNoop(SE.getSCEV(OrigInc), IsomorphicInc->getType());
+ if (OrigInc != IsomorphicInc &&
+ TruncExpr == SE.getSCEV(IsomorphicInc) &&
+ SE.LI.replacementPreservesLCSSAForm(IsomorphicInc, OrigInc) &&
+ hoistIVInc(OrigInc, IsomorphicInc)) {
+ DEBUG_WITH_TYPE(DebugType,
+ dbgs() << "INDVARS: Eliminated congruent iv.inc: "
+ << *IsomorphicInc << '\n');
+ Value *NewInc = OrigInc;
+ if (OrigInc->getType() != IsomorphicInc->getType()) {
+ Instruction *IP = nullptr;
+ if (PHINode *PN = dyn_cast<PHINode>(OrigInc))
+ IP = &*PN->getParent()->getFirstInsertionPt();
+ else
+ IP = OrigInc->getNextNode();
+
+ IRBuilder<> Builder(IP);
+ Builder.SetCurrentDebugLocation(IsomorphicInc->getDebugLoc());
+ NewInc = Builder.CreateTruncOrBitCast(
+ OrigInc, IsomorphicInc->getType(), IVName);
+ }
+ IsomorphicInc->replaceAllUsesWith(NewInc);
+ DeadInsts.emplace_back(IsomorphicInc);
+ }
+ }
+ }
+ DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Eliminated congruent iv: "
+ << *Phi << '\n');
DEBUG_WITH_TYPE(DebugType, dbgs() << "INDVARS: Original iv: "
<< *OrigPhiRef << '\n');
- ++NumElim;
- Value *NewIV = OrigPhiRef;
- if (OrigPhiRef->getType() != Phi->getType()) {
- IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
- Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
- NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
- }
- Phi->replaceAllUsesWith(NewIV);
- DeadInsts.emplace_back(Phi);
- }
- return NumElim;
-}
-
-Optional<ScalarEvolution::ValueOffsetPair>
-SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
- Loop *L) {
- using namespace llvm::PatternMatch;
-
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
-
- // Look for suitable value in simple conditions at the loop exits.
- for (BasicBlock *BB : ExitingBlocks) {
- ICmpInst::Predicate Pred;
- Instruction *LHS, *RHS;
-
- if (!match(BB->getTerminator(),
- m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
- m_BasicBlock(), m_BasicBlock())))
- continue;
-
- if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
- return ScalarEvolution::ValueOffsetPair(LHS, nullptr);
-
- if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
- return ScalarEvolution::ValueOffsetPair(RHS, nullptr);
- }
-
- // Use expand's logic which is used for reusing a previous Value in
- // ExprValueMap.
- ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
- if (VO.first)
- return VO;
-
- // There is potential to make this significantly smarter, but this simple
- // heuristic already gets some interesting cases.
-
- // Cannot find a suitable value.
- return None;
-}
-
+ ++NumElim;
+ Value *NewIV = OrigPhiRef;
+ if (OrigPhiRef->getType() != Phi->getType()) {
+ IRBuilder<> Builder(&*L->getHeader()->getFirstInsertionPt());
+ Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
+ NewIV = Builder.CreateTruncOrBitCast(OrigPhiRef, Phi->getType(), IVName);
+ }
+ Phi->replaceAllUsesWith(NewIV);
+ DeadInsts.emplace_back(Phi);
+ }
+ return NumElim;
+}
+
+Optional<ScalarEvolution::ValueOffsetPair>
+SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
+ Loop *L) {
+ using namespace llvm::PatternMatch;
+
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+
+ // Look for suitable value in simple conditions at the loop exits.
+ for (BasicBlock *BB : ExitingBlocks) {
+ ICmpInst::Predicate Pred;
+ Instruction *LHS, *RHS;
+
+ if (!match(BB->getTerminator(),
+ m_Br(m_ICmp(Pred, m_Instruction(LHS), m_Instruction(RHS)),
+ m_BasicBlock(), m_BasicBlock())))
+ continue;
+
+ if (SE.getSCEV(LHS) == S && SE.DT.dominates(LHS, At))
+ return ScalarEvolution::ValueOffsetPair(LHS, nullptr);
+
+ if (SE.getSCEV(RHS) == S && SE.DT.dominates(RHS, At))
+ return ScalarEvolution::ValueOffsetPair(RHS, nullptr);
+ }
+
+ // Use expand's logic which is used for reusing a previous Value in
+ // ExprValueMap.
+ ScalarEvolution::ValueOffsetPair VO = FindValueInExprValueMap(S, At);
+ if (VO.first)
+ return VO;
+
+ // There is potential to make this significantly smarter, but this simple
+ // heuristic already gets some interesting cases.
+
+ // Can not find suitable value.
+ return None;
+}
+
template<typename T> static int costAndCollectOperands(
const SCEVOperand &WorkItem, const TargetTransformInfo &TTI,
TargetTransformInfo::TargetCostKind CostKind,
@@ -2318,33 +2318,33 @@ template<typename T> static int costAndCollectOperands(
return Cost;
}
-bool SCEVExpander::isHighCostExpansionHelper(
+bool SCEVExpander::isHighCostExpansionHelper(
const SCEVOperand &WorkItem, Loop *L, const Instruction &At,
int &BudgetRemaining, const TargetTransformInfo &TTI,
SmallPtrSetImpl<const SCEV *> &Processed,
SmallVectorImpl<SCEVOperand> &Worklist) {
- if (BudgetRemaining < 0)
- return true; // Already run out of budget, give up.
-
+ if (BudgetRemaining < 0)
+ return true; // Already run out of budget, give up.
+
const SCEV *S = WorkItem.S;
- // Was the cost of expansion of this expression already accounted for?
+ // Was the cost of expansion of this expression already accounted for?
if (!isa<SCEVConstant>(S) && !Processed.insert(S).second)
- return false; // We have already accounted for this expression.
-
- // If we can find an existing value for this scev available at the point "At"
- // then consider the expression cheap.
- if (getRelatedExistingExpansion(S, &At, L))
- return false; // Consider the expression to be free.
-
+ return false; // We have already accounted for this expression.
+
+ // If we can find an existing value for this scev available at the point "At"
+ // then consider the expression cheap.
+ if (getRelatedExistingExpansion(S, &At, L))
+ return false; // Consider the expression to be free.
+
TargetTransformInfo::TargetCostKind CostKind =
L->getHeader()->getParent()->hasMinSize()
? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_RecipThroughput;
- switch (S->getSCEVType()) {
+ switch (S->getSCEVType()) {
case scCouldNotCompute:
llvm_unreachable("Attempt to use a SCEVCouldNotCompute object!");
- case scUnknown:
+ case scUnknown:
// Assume to be zero-cost.
return false;
case scConstant: {
@@ -2356,7 +2356,7 @@ bool SCEVExpander::isHighCostExpansionHelper(
BudgetRemaining -= TTI.getIntImmCostInst(
WorkItem.ParentOpcode, WorkItem.OperandIdx, Imm, Ty, CostKind);
return BudgetRemaining < 0;
- }
+ }
case scTruncate:
case scPtrToInt:
case scZeroExtend:
@@ -2364,27 +2364,27 @@ bool SCEVExpander::isHighCostExpansionHelper(
int Cost =
costAndCollectOperands<SCEVCastExpr>(WorkItem, TTI, CostKind, Worklist);
BudgetRemaining -= Cost;
- return false; // Will answer upon next entry into this function.
- }
+ return false; // Will answer upon next entry into this function.
+ }
case scUDivExpr: {
- // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
- // HowManyLessThans produced to compute a precise expression, rather than a
- // UDiv from the user's code. If we can't find a UDiv in the code with some
- // simple searching, we need to account for its cost.
-
- // At the beginning of this function we already tried to find an existing
- // value for plain 'S'. Now try to look up 'S + 1' since it is a common
- // pattern involving division. This is just a simple search heuristic.
- if (getRelatedExistingExpansion(
- SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L))
- return false; // Consider it to be free.
-
+ // UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
+ // HowManyLessThans produced to compute a precise expression, rather than a
+ // UDiv from the user's code. If we can't find a UDiv in the code with some
+ // simple searching, we need to account for its cost.
+
+ // At the beginning of this function we already tried to find an existing
+ // value for plain 'S'. Now try to look up 'S + 1' since it is a common
+ // pattern involving division. This is just a simple search heuristic.
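+ // For example, when S is a backedge-taken count of the form (%n /u 4), the
+ // loop guard frequently already materializes (%n /u 4) + 1 as the trip
+ // count, and the lookup below then treats this expansion as free.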
+ if (getRelatedExistingExpansion(
+ SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L))
+ return false; // Consider it to be free.
+
int Cost =
costAndCollectOperands<SCEVUDivExpr>(WorkItem, TTI, CostKind, Worklist);
- // Need to count the cost of this UDiv.
+ // Need to count the cost of this UDiv.
BudgetRemaining -= Cost;
- return false; // Will answer upon next entry into this function.
- }
+ return false; // Will answer upon next entry into this function.
+ }
case scAddExpr:
case scMulExpr:
case scUMaxExpr:
@@ -2392,14 +2392,14 @@ bool SCEVExpander::isHighCostExpansionHelper(
case scUMinExpr:
case scSMinExpr: {
assert(cast<SCEVNAryExpr>(S)->getNumOperands() > 1 &&
- "Nary expr should have more than 1 operand.");
- // The simple nary expr will require one less op (or pair of ops)
- // than the number of its terms.
+ "Nary expr should have more than 1 operand.");
+ // The simple nary expr will require one less op (or pair of ops)
+ // than the number of its terms.
int Cost =
costAndCollectOperands<SCEVNAryExpr>(WorkItem, TTI, CostKind, Worklist);
BudgetRemaining -= Cost;
return BudgetRemaining < 0;
- }
+ }
case scAddRecExpr: {
assert(cast<SCEVAddRecExpr>(S)->getNumOperands() >= 2 &&
"Polynomial should be at least linear");
@@ -2409,173 +2409,173 @@ bool SCEVExpander::isHighCostExpansionHelper(
}
}
llvm_unreachable("Unknown SCEV kind!");
-}
-
-Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
- Instruction *IP) {
- assert(IP);
- switch (Pred->getKind()) {
- case SCEVPredicate::P_Union:
- return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
- case SCEVPredicate::P_Equal:
- return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
- case SCEVPredicate::P_Wrap: {
- auto *AddRecPred = cast<SCEVWrapPredicate>(Pred);
- return expandWrapPredicate(AddRecPred, IP);
- }
- }
- llvm_unreachable("Unknown SCEV predicate type");
-}
-
-Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
- Instruction *IP) {
+}
+
+Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
+ Instruction *IP) {
+ assert(IP);
+ switch (Pred->getKind()) {
+ case SCEVPredicate::P_Union:
+ return expandUnionPredicate(cast<SCEVUnionPredicate>(Pred), IP);
+ case SCEVPredicate::P_Equal:
+ return expandEqualPredicate(cast<SCEVEqualPredicate>(Pred), IP);
+ case SCEVPredicate::P_Wrap: {
+ auto *AddRecPred = cast<SCEVWrapPredicate>(Pred);
+ return expandWrapPredicate(AddRecPred, IP);
+ }
+ }
+ llvm_unreachable("Unknown SCEV predicate type");
+}
+
+Value *SCEVExpander::expandEqualPredicate(const SCEVEqualPredicate *Pred,
+ Instruction *IP) {
Value *Expr0 =
expandCodeForImpl(Pred->getLHS(), Pred->getLHS()->getType(), IP, false);
Value *Expr1 =
expandCodeForImpl(Pred->getRHS(), Pred->getRHS()->getType(), IP, false);
-
- Builder.SetInsertPoint(IP);
- auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
- return I;
-}
-
-Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
- Instruction *Loc, bool Signed) {
- assert(AR->isAffine() && "Cannot generate RT check for "
- "non-affine expression");
-
- SCEVUnionPredicate Pred;
- const SCEV *ExitCount =
- SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
-
+
+ Builder.SetInsertPoint(IP);
+ auto *I = Builder.CreateICmpNE(Expr0, Expr1, "ident.check");
+ return I;
+}
+
+Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR,
+ Instruction *Loc, bool Signed) {
+ assert(AR->isAffine() && "Cannot generate RT check for "
+ "non-affine expression");
+
+ SCEVUnionPredicate Pred;
+ const SCEV *ExitCount =
+ SE.getPredicatedBackedgeTakenCount(AR->getLoop(), Pred);
+
assert(!isa<SCEVCouldNotCompute>(ExitCount) && "Invalid loop count");
-
- const SCEV *Step = AR->getStepRecurrence(SE);
- const SCEV *Start = AR->getStart();
-
- Type *ARTy = AR->getType();
- unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
- unsigned DstBits = SE.getTypeSizeInBits(ARTy);
-
- // The expression {Start,+,Step} has nusw/nssw if
- // Step < 0, Start - |Step| * Backedge <= Start
- // Step >= 0, Start + |Step| * Backedge > Start
- // and |Step| * Backedge doesn't unsigned overflow.
-
- IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
- Builder.SetInsertPoint(Loc);
+
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ const SCEV *Start = AR->getStart();
+
+ Type *ARTy = AR->getType();
+ unsigned SrcBits = SE.getTypeSizeInBits(ExitCount->getType());
+ unsigned DstBits = SE.getTypeSizeInBits(ARTy);
+
+ // The expression {Start,+,Step} has nusw/nssw if
+ // Step < 0, Start - |Step| * Backedge <= Start
+ // Step >= 0, Start + |Step| * Backedge > Start
+ // and |Step| * Backedge doesn't unsigned overflow.
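+ // For example, for an i8 AR with Start = 100, Step = -3 and a backedge
+ // count of 40, |Step| * Backedge = 120 and Start - 120 wraps as unsigned,
+ // so the NUSW (Signed == false) check built below reports overflow.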
+
+ IntegerType *CountTy = IntegerType::get(Loc->getContext(), SrcBits);
+ Builder.SetInsertPoint(Loc);
Value *TripCountVal = expandCodeForImpl(ExitCount, CountTy, Loc, false);
-
- IntegerType *Ty =
- IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
- Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
-
+
+ IntegerType *Ty =
+ IntegerType::get(Loc->getContext(), SE.getTypeSizeInBits(ARTy));
+ Type *ARExpandTy = DL.isNonIntegralPointerType(ARTy) ? ARTy : Ty;
+
Value *StepValue = expandCodeForImpl(Step, Ty, Loc, false);
Value *NegStepValue =
expandCodeForImpl(SE.getNegativeSCEV(Step), Ty, Loc, false);
Value *StartValue = expandCodeForImpl(Start, ARExpandTy, Loc, false);
-
- ConstantInt *Zero =
- ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
-
- Builder.SetInsertPoint(Loc);
- // Compute |Step|
- Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
- Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
-
- // Get the backedge taken count and truncate or extend it to the AR type.
- Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
- auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
- Intrinsic::umul_with_overflow, Ty);
-
- // Compute |Step| * Backedge
- CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
- Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
- Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
-
- // Compute:
- // Start + |Step| * Backedge < Start
- // Start - |Step| * Backedge > Start
- Value *Add = nullptr, *Sub = nullptr;
- if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
- const SCEV *MulS = SE.getSCEV(MulV);
- const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
- Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
- ARPtrTy);
- Sub = Builder.CreateBitCast(
- expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
- } else {
- Add = Builder.CreateAdd(StartValue, MulV);
- Sub = Builder.CreateSub(StartValue, MulV);
- }
-
- Value *EndCompareGT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
-
- Value *EndCompareLT = Builder.CreateICmp(
- Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
-
- // Select the answer based on the sign of Step.
- Value *EndCheck =
- Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
-
- // If the backedge taken count type is larger than the AR type,
- // check that we don't drop any bits by truncating it. If we are
- // dropping bits, then we have overflow (unless the step is zero).
- if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
- auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
- auto *BackedgeCheck =
- Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal,
- ConstantInt::get(Loc->getContext(), MaxVal));
- BackedgeCheck = Builder.CreateAnd(
- BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero));
-
- EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
- }
-
+
+ ConstantInt *Zero =
+ ConstantInt::get(Loc->getContext(), APInt::getNullValue(DstBits));
+
+ Builder.SetInsertPoint(Loc);
+ // Compute |Step|
+ Value *StepCompare = Builder.CreateICmp(ICmpInst::ICMP_SLT, StepValue, Zero);
+ Value *AbsStep = Builder.CreateSelect(StepCompare, NegStepValue, StepValue);
+
+ // Get the backedge taken count and truncate or extend it to the AR type.
+ Value *TruncTripCount = Builder.CreateZExtOrTrunc(TripCountVal, Ty);
+ auto *MulF = Intrinsic::getDeclaration(Loc->getModule(),
+ Intrinsic::umul_with_overflow, Ty);
+
+ // Compute |Step| * Backedge
+ CallInst *Mul = Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul");
+ Value *MulV = Builder.CreateExtractValue(Mul, 0, "mul.result");
+ Value *OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow");
+
+ // Compute:
+ // Start + |Step| * Backedge < Start
+ // Start - |Step| * Backedge > Start
+ Value *Add = nullptr, *Sub = nullptr;
+ if (PointerType *ARPtrTy = dyn_cast<PointerType>(ARExpandTy)) {
+ const SCEV *MulS = SE.getSCEV(MulV);
+ const SCEV *NegMulS = SE.getNegativeSCEV(MulS);
+ Add = Builder.CreateBitCast(expandAddToGEP(MulS, ARPtrTy, Ty, StartValue),
+ ARPtrTy);
+ Sub = Builder.CreateBitCast(
+ expandAddToGEP(NegMulS, ARPtrTy, Ty, StartValue), ARPtrTy);
+ } else {
+ Add = Builder.CreateAdd(StartValue, MulV);
+ Sub = Builder.CreateSub(StartValue, MulV);
+ }
+
+ Value *EndCompareGT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT, Sub, StartValue);
+
+ Value *EndCompareLT = Builder.CreateICmp(
+ Signed ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT, Add, StartValue);
+
+ // Select the answer based on the sign of Step.
+ Value *EndCheck =
+ Builder.CreateSelect(StepCompare, EndCompareGT, EndCompareLT);
+
+ // If the backedge taken count type is larger than the AR type,
+ // check that we don't drop any bits by truncating it. If we are
+ // dropping bits, then we have overflow (unless the step is zero).
+ if (SE.getTypeSizeInBits(CountTy) > SE.getTypeSizeInBits(Ty)) {
+ auto MaxVal = APInt::getMaxValue(DstBits).zext(SrcBits);
+ auto *BackedgeCheck =
+ Builder.CreateICmp(ICmpInst::ICMP_UGT, TripCountVal,
+ ConstantInt::get(Loc->getContext(), MaxVal));
+ BackedgeCheck = Builder.CreateAnd(
+ BackedgeCheck, Builder.CreateICmp(ICmpInst::ICMP_NE, StepValue, Zero));
+
+ EndCheck = Builder.CreateOr(EndCheck, BackedgeCheck);
+ }
+
return Builder.CreateOr(EndCheck, OfMul);
-}
-
-Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
- Instruction *IP) {
- const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr());
- Value *NSSWCheck = nullptr, *NUSWCheck = nullptr;
-
- // Add a check for NUSW
- if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW)
- NUSWCheck = generateOverflowCheck(A, IP, false);
-
- // Add a check for NSSW
- if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW)
- NSSWCheck = generateOverflowCheck(A, IP, true);
-
- if (NUSWCheck && NSSWCheck)
- return Builder.CreateOr(NUSWCheck, NSSWCheck);
-
- if (NUSWCheck)
- return NUSWCheck;
-
- if (NSSWCheck)
- return NSSWCheck;
-
- return ConstantInt::getFalse(IP->getContext());
-}
-
-Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
- Instruction *IP) {
- auto *BoolType = IntegerType::get(IP->getContext(), 1);
- Value *Check = ConstantInt::getNullValue(BoolType);
-
- // Loop over all checks in this set.
- for (auto Pred : Union->getPredicates()) {
- auto *NextCheck = expandCodeForPredicate(Pred, IP);
- Builder.SetInsertPoint(IP);
- Check = Builder.CreateOr(Check, NextCheck);
- }
-
- return Check;
-}
-
+}
+
+Value *SCEVExpander::expandWrapPredicate(const SCEVWrapPredicate *Pred,
+ Instruction *IP) {
+ const auto *A = cast<SCEVAddRecExpr>(Pred->getExpr());
+ Value *NSSWCheck = nullptr, *NUSWCheck = nullptr;
+
+ // Add a check for NUSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNUSW)
+ NUSWCheck = generateOverflowCheck(A, IP, false);
+
+ // Add a check for NSSW
+ if (Pred->getFlags() & SCEVWrapPredicate::IncrementNSSW)
+ NSSWCheck = generateOverflowCheck(A, IP, true);
+
+ if (NUSWCheck && NSSWCheck)
+ return Builder.CreateOr(NUSWCheck, NSSWCheck);
+
+ if (NUSWCheck)
+ return NUSWCheck;
+
+ if (NSSWCheck)
+ return NSSWCheck;
+
+ return ConstantInt::getFalse(IP->getContext());
+}
+
+Value *SCEVExpander::expandUnionPredicate(const SCEVUnionPredicate *Union,
+ Instruction *IP) {
+ auto *BoolType = IntegerType::get(IP->getContext(), 1);
+ Value *Check = ConstantInt::getNullValue(BoolType);
+
+ // Loop over all checks in this set.
+ for (auto Pred : Union->getPredicates()) {
+ auto *NextCheck = expandCodeForPredicate(Pred, IP);
+ Builder.SetInsertPoint(IP);
+ Check = Builder.CreateOr(Check, NextCheck);
+ }
+
+ return Check;
+}
+
Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
assert(PreserveLCSSA);
SmallVector<Instruction *, 1> ToUpdate;
@@ -2604,83 +2604,83 @@ Value *SCEVExpander::fixupLCSSAFormFor(Instruction *User, unsigned OpIdx) {
return User->getOperand(OpIdx);
}
-namespace {
-// Search for a SCEV subexpression that is not safe to expand. Any expression
-// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
-// UDiv expressions. We don't know if the UDiv is derived from an IR divide
-// instruction, but the important thing is that we prove the denominator is
-// nonzero before expansion.
-//
-// IVUsers already checks that IV-derived expressions are safe. So this check is
-// only needed when the expression includes some subexpression that is not IV
-// derived.
-//
-// Currently, we only allow division by a nonzero constant here. If this is
-// inadequate, we could easily allow division by SCEVUnknown by using
-// ValueTracking to check isKnownNonZero().
-//
-// We cannot generally expand recurrences unless the step dominates the loop
-// header. The expander handles the special case of affine recurrences by
-// scaling the recurrence outside the loop, but this technique isn't generally
-// applicable. Expanding a nested recurrence outside a loop requires computing
-// binomial coefficients. This could be done, but the recurrence has to be in a
-// perfectly reduced form, which can't be guaranteed.
-struct SCEVFindUnsafe {
- ScalarEvolution &SE;
- bool IsUnsafe;
-
- SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
-
- bool follow(const SCEV *S) {
- if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
- const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
- if (!SC || SC->getValue()->isZero()) {
- IsUnsafe = true;
- return false;
- }
- }
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEV *Step = AR->getStepRecurrence(SE);
- if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
- IsUnsafe = true;
- return false;
- }
- }
- return true;
- }
- bool isDone() const { return IsUnsafe; }
-};
-}
-
-namespace llvm {
-bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
- SCEVFindUnsafe Search(SE);
- visitAll(S, Search);
- return !Search.IsUnsafe;
-}
-
-bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
- ScalarEvolution &SE) {
- if (!isSafeToExpand(S, SE))
- return false;
- // We have to prove that the expanded site of S dominates InsertionPoint.
- // This is easy when not in the same block, but hard when S is an instruction
- // to be expanded somewhere inside the same block as our insertion point.
- // What we really need here is something analogous to an OrderedBasicBlock,
- // but for the moment, we paper over the problem by handling two common and
- // cheap to check cases.
- if (SE.properlyDominates(S, InsertionPoint->getParent()))
- return true;
- if (SE.dominates(S, InsertionPoint->getParent())) {
- if (InsertionPoint->getParent()->getTerminator() == InsertionPoint)
- return true;
- if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
- for (const Value *V : InsertionPoint->operand_values())
- if (V == U->getValue())
- return true;
- }
- return false;
-}
+namespace {
+// Search for a SCEV subexpression that is not safe to expand. Any expression
+// that may expand to a !isSafeToSpeculativelyExecute value is unsafe, namely
+// UDiv expressions. We don't know if the UDiv is derived from an IR divide
+// instruction, but the important thing is that we prove the denominator is
+// nonzero before expansion.
+//
+// IVUsers already checks that IV-derived expressions are safe. So this check is
+// only needed when the expression includes some subexpression that is not IV
+// derived.
+//
+// Currently, we only allow division by a nonzero constant here. If this is
+// inadequate, we could easily allow division by SCEVUnknown by using
+// ValueTracking to check isKnownNonZero().
+//
+// We cannot generally expand recurrences unless the step dominates the loop
+// header. The expander handles the special case of affine recurrences by
+// scaling the recurrence outside the loop, but this technique isn't generally
+// applicable. Expanding a nested recurrence outside a loop requires computing
+// binomial coefficients. This could be done, but the recurrence has to be in a
+// perfectly reduced form, which can't be guaranteed.
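+//
+// For example, a SCEVUDivExpr such as (%a /u %b) with a non-constant divisor
+// is rejected below, since the expanded udiv would not be safe to execute
+// speculatively if %b could be zero.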
+struct SCEVFindUnsafe {
+ ScalarEvolution &SE;
+ bool IsUnsafe;
+
+ SCEVFindUnsafe(ScalarEvolution &se): SE(se), IsUnsafe(false) {}
+
+ bool follow(const SCEV *S) {
+ if (const SCEVUDivExpr *D = dyn_cast<SCEVUDivExpr>(S)) {
+ const SCEVConstant *SC = dyn_cast<SCEVConstant>(D->getRHS());
+ if (!SC || SC->getValue()->isZero()) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+ const SCEV *Step = AR->getStepRecurrence(SE);
+ if (!AR->isAffine() && !SE.dominates(Step, AR->getLoop()->getHeader())) {
+ IsUnsafe = true;
+ return false;
+ }
+ }
+ return true;
+ }
+ bool isDone() const { return IsUnsafe; }
+};
+}
+
+namespace llvm {
+bool isSafeToExpand(const SCEV *S, ScalarEvolution &SE) {
+ SCEVFindUnsafe Search(SE);
+ visitAll(S, Search);
+ return !Search.IsUnsafe;
+}
+
+bool isSafeToExpandAt(const SCEV *S, const Instruction *InsertionPoint,
+ ScalarEvolution &SE) {
+ if (!isSafeToExpand(S, SE))
+ return false;
+ // We have to prove that the expanded site of S dominates InsertionPoint.
+ // This is easy when not in the same block, but hard when S is an instruction
+ // to be expanded somewhere inside the same block as our insertion point.
+ // What we really need here is something analogous to an OrderedBasicBlock,
+ // but for the moment, we paper over the problem by handling two common and
+ // cheap to check cases.
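+ // For example, if S is a SCEVUnknown wrapping %x and the insertion point
+ // already uses %x as an operand, %x is known to be available there.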
+ if (SE.properlyDominates(S, InsertionPoint->getParent()))
+ return true;
+ if (SE.dominates(S, InsertionPoint->getParent())) {
+ if (InsertionPoint->getParent()->getTerminator() == InsertionPoint)
+ return true;
+ if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
+ for (const Value *V : InsertionPoint->operand_values())
+ if (V == U->getValue())
+ return true;
+ }
+ return false;
+}
SCEVExpanderCleaner::~SCEVExpanderCleaner() {
// Result is used, nothing to remove.
@@ -2716,5 +2716,5 @@ SCEVExpanderCleaner::~SCEVExpanderCleaner() {
I->replaceAllUsesWith(UndefValue::get(I->getType()));
I->eraseFromParent();
}
-}
+}
}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
index af157e1a4d..de9560df97 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1,158 +1,158 @@
-//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Peephole optimize the CFG.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
+//===- SimplifyCFG.cpp - Code to perform CFG simplification ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Peephole optimize the CFG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/Sequence.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/GuardUtils.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/GuardUtils.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
#include "llvm/IR/ValueHandle.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <climits>
-#include <cstddef>
-#include <cstdint>
-#include <iterator>
-#include <map>
-#include <set>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define DEBUG_TYPE "simplifycfg"
-
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <map>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "simplifycfg"
+
cl::opt<bool> llvm::RequireAndPreserveDomTree(
"simplifycfg-require-and-preserve-domtree", cl::Hidden, cl::ZeroOrMore,
cl::init(false),
cl::desc("Temorary development switch used to gradually uplift SimplifyCFG "
"into preserving DomTree,"));
-// Chosen as 2 so as to be cheap, but still to have enough power to fold
-// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
-// To catch this, we need to fold a compare and a select, hence '2' being the
-// minimum reasonable default.
-static cl::opt<unsigned> PHINodeFoldingThreshold(
- "phi-node-folding-threshold", cl::Hidden, cl::init(2),
- cl::desc(
- "Control the amount of phi node folding to perform (default = 2)"));
-
-static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold(
- "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4),
- cl::desc("Control the maximal total instruction cost that we are willing "
- "to speculatively execute to fold a 2-entry PHI node into a "
- "select (default = 4)"));
-
-static cl::opt<bool> DupRet(
- "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
- cl::desc("Duplicate return instructions into unconditional branches"));
-
-static cl::opt<bool>
+// Chosen as 2 so as to be cheap, but still to have enough power to fold
+// a select, so the "clamp" idiom (of a min followed by a max) will be caught.
+// To catch this, we need to fold a compare and a select, hence '2' being the
+// minimum reasonable default.
+static cl::opt<unsigned> PHINodeFoldingThreshold(
+ "phi-node-folding-threshold", cl::Hidden, cl::init(2),
+ cl::desc(
+ "Control the amount of phi node folding to perform (default = 2)"));
+
+static cl::opt<unsigned> TwoEntryPHINodeFoldingThreshold(
+ "two-entry-phi-node-folding-threshold", cl::Hidden, cl::init(4),
+ cl::desc("Control the maximal total instruction cost that we are willing "
+ "to speculatively execute to fold a 2-entry PHI node into a "
+ "select (default = 4)"));
+
+static cl::opt<bool> DupRet(
+ "simplifycfg-dup-ret", cl::Hidden, cl::init(false),
+ cl::desc("Duplicate return instructions into unconditional branches"));
+
+static cl::opt<bool>
HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
cl::desc("Hoist common instructions up to the parent block"));
static cl::opt<bool>
- SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
- cl::desc("Sink common instructions down to the end block"));
-
-static cl::opt<bool> HoistCondStores(
- "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
- cl::desc("Hoist conditional stores if an unconditional store precedes"));
-
-static cl::opt<bool> MergeCondStores(
- "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
- cl::desc("Hoist conditional stores even if an unconditional store does not "
- "precede - hoist multiple conditional stores into a single "
- "predicated store"));
-
-static cl::opt<bool> MergeCondStoresAggressively(
- "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false),
- cl::desc("When merging conditional stores, do so even if the resultant "
- "basic blocks are unlikely to be if-converted as a result"));
-
-static cl::opt<bool> SpeculateOneExpensiveInst(
- "speculate-one-expensive-inst", cl::Hidden, cl::init(true),
- cl::desc("Allow exactly one expensive instruction to be speculatively "
- "executed"));
-
-static cl::opt<unsigned> MaxSpeculationDepth(
- "max-speculation-depth", cl::Hidden, cl::init(10),
- cl::desc("Limit maximum recursion depth when calculating costs of "
- "speculatively executed instructions"));
-
-static cl::opt<int>
-MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
- cl::desc("Max size of a block which is still considered "
- "small enough to thread through"));
-
+ SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
+
+static cl::opt<bool> HoistCondStores(
+ "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores if an unconditional store precedes"));
+
+static cl::opt<bool> MergeCondStores(
+ "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
+ cl::desc("Hoist conditional stores even if an unconditional store does not "
+ "precede - hoist multiple conditional stores into a single "
+ "predicated store"));
+
+static cl::opt<bool> MergeCondStoresAggressively(
+ "simplifycfg-merge-cond-stores-aggressively", cl::Hidden, cl::init(false),
+ cl::desc("When merging conditional stores, do so even if the resultant "
+ "basic blocks are unlikely to be if-converted as a result"));
+
+static cl::opt<bool> SpeculateOneExpensiveInst(
+ "speculate-one-expensive-inst", cl::Hidden, cl::init(true),
+ cl::desc("Allow exactly one expensive instruction to be speculatively "
+ "executed"));
+
+static cl::opt<unsigned> MaxSpeculationDepth(
+ "max-speculation-depth", cl::Hidden, cl::init(10),
+ cl::desc("Limit maximum recursion depth when calculating costs of "
+ "speculatively executed instructions"));
+
+static cl::opt<int>
+MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
+ cl::desc("Max size of a block which is still considered "
+ "small enough to thread through"));
+
// Two is chosen to allow one negation and a logical combine.
static cl::opt<unsigned>
BranchFoldThreshold("simplifycfg-branch-fold-threshold", cl::Hidden,
@@ -160,15 +160,15 @@ static cl::opt<unsigned>
cl::desc("Maximum cost of combining conditions when "
"folding branches"));
-STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
-STATISTIC(NumLinearMaps,
- "Number of switch instructions turned into linear mapping");
-STATISTIC(NumLookupTables,
- "Number of switch instructions turned into lookup tables");
-STATISTIC(
- NumLookupTablesHoles,
- "Number of switch instructions turned into lookup tables (holes checked)");
-STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
+STATISTIC(NumLinearMaps,
+ "Number of switch instructions turned into linear mapping");
+STATISTIC(NumLookupTables,
+ "Number of switch instructions turned into lookup tables");
+STATISTIC(
+ NumLookupTablesHoles,
+ "Number of switch instructions turned into lookup tables (holes checked)");
+STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
STATISTIC(NumFoldValueComparisonIntoPredecessors,
"Number of value comparisons folded into predecessor basic blocks");
STATISTIC(NumFoldBranchToCommonDest,
@@ -181,495 +181,495 @@ STATISTIC(NumHoistCommonInstrs,
STATISTIC(NumSinkCommonCode,
"Number of common instruction 'blocks' sunk down to the end block");
STATISTIC(NumSinkCommonInstrs,
- "Number of common instructions sunk down to the end block");
-STATISTIC(NumSpeculations, "Number of speculatively executed instructions");
+ "Number of common instructions sunk down to the end block");
+STATISTIC(NumSpeculations, "Number of speculatively executed instructions");
STATISTIC(NumInvokes,
"Number of invokes with empty resume blocks simplified into calls");
-
-namespace {
-
-// The first field contains the value that the switch produces when a certain
-// case group is selected, and the second field is a vector containing the
-// cases composing the case group.
-using SwitchCaseResultVectorTy =
- SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>;
-
-// The first field contains the phi node that generates a result of the switch
-// and the second field contains the value generated for a certain case in the
-// switch for that PHI.
-using SwitchCaseResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
-
-/// ValueEqualityComparisonCase - Represents a case of a switch.
-struct ValueEqualityComparisonCase {
- ConstantInt *Value;
- BasicBlock *Dest;
-
- ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
- : Value(Value), Dest(Dest) {}
-
- bool operator<(ValueEqualityComparisonCase RHS) const {
- // Comparing pointers is ok as we only rely on the order for uniquing.
- return Value < RHS.Value;
- }
-
- bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
-};
-
-class SimplifyCFGOpt {
- const TargetTransformInfo &TTI;
+
+namespace {
+
+// The first field contains the value that the switch produces when a certain
+// case group is selected, and the second field is a vector containing the
+// cases composing the case group.
+using SwitchCaseResultVectorTy =
+ SmallVector<std::pair<Constant *, SmallVector<ConstantInt *, 4>>, 2>;
+
+// The first field contains the phi node that generates a result of the switch
+// and the second field contains the value generated for a certain case in the
+// switch for that PHI.
+using SwitchCaseResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
+
+/// ValueEqualityComparisonCase - Represents a case of a switch.
+struct ValueEqualityComparisonCase {
+ ConstantInt *Value;
+ BasicBlock *Dest;
+
+ ValueEqualityComparisonCase(ConstantInt *Value, BasicBlock *Dest)
+ : Value(Value), Dest(Dest) {}
+
+ bool operator<(ValueEqualityComparisonCase RHS) const {
+ // Comparing pointers is ok as we only rely on the order for uniquing.
+ return Value < RHS.Value;
+ }
+
+ bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
+};
+
+class SimplifyCFGOpt {
+ const TargetTransformInfo &TTI;
DomTreeUpdater *DTU;
- const DataLayout &DL;
+ const DataLayout &DL;
ArrayRef<WeakVH> LoopHeaders;
- const SimplifyCFGOptions &Options;
- bool Resimplify;
-
- Value *isValueEqualityComparison(Instruction *TI);
- BasicBlock *GetValueEqualityComparisonCases(
- Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
- bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
- BasicBlock *Pred,
- IRBuilder<> &Builder);
+ const SimplifyCFGOptions &Options;
+ bool Resimplify;
+
+ Value *isValueEqualityComparison(Instruction *TI);
+ BasicBlock *GetValueEqualityComparisonCases(
+ Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases);
+ bool SimplifyEqualityComparisonWithOnlyPredecessor(Instruction *TI,
+ BasicBlock *Pred,
+ IRBuilder<> &Builder);
bool PerformValueComparisonIntoPredecessorFolding(Instruction *TI, Value *&CV,
Instruction *PTI,
IRBuilder<> &Builder);
- bool FoldValueComparisonIntoPredecessors(Instruction *TI,
- IRBuilder<> &Builder);
-
- bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
- bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
- bool simplifySingleResume(ResumeInst *RI);
- bool simplifyCommonResume(ResumeInst *RI);
- bool simplifyCleanupReturn(CleanupReturnInst *RI);
- bool simplifyUnreachable(UnreachableInst *UI);
- bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
- bool simplifyIndirectBr(IndirectBrInst *IBI);
- bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
- bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
- bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
- bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder);
-
- bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
- IRBuilder<> &Builder);
-
- bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
- bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const TargetTransformInfo &TTI);
- bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
- BasicBlock *TrueBB, BasicBlock *FalseBB,
- uint32_t TrueWeight, uint32_t FalseWeight);
- bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
- const DataLayout &DL);
- bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
- bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
- bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder);
-
-public:
+ bool FoldValueComparisonIntoPredecessors(Instruction *TI,
+ IRBuilder<> &Builder);
+
+ bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+ bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
+ bool simplifySingleResume(ResumeInst *RI);
+ bool simplifyCommonResume(ResumeInst *RI);
+ bool simplifyCleanupReturn(CleanupReturnInst *RI);
+ bool simplifyUnreachable(UnreachableInst *UI);
+ bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+ bool simplifyIndirectBr(IndirectBrInst *IBI);
+ bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
+ bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder);
+
+ bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
+ IRBuilder<> &Builder);
+
+ bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
+ bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI);
+ bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight, uint32_t FalseWeight);
+ bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ const DataLayout &DL);
+ bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
+ bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
+ bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder);
+
+public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, DomTreeUpdater *DTU,
const DataLayout &DL, ArrayRef<WeakVH> LoopHeaders,
- const SimplifyCFGOptions &Opts)
+ const SimplifyCFGOptions &Opts)
: TTI(TTI), DTU(DTU), DL(DL), LoopHeaders(LoopHeaders), Options(Opts) {
assert((!DTU || !DTU->hasPostDomTree()) &&
"SimplifyCFG is not yet capable of maintaining validity of a "
"PostDomTree, so don't ask for it.");
}
-
+
bool simplifyOnce(BasicBlock *BB);
bool simplifyOnceImpl(BasicBlock *BB);
- bool run(BasicBlock *BB);
-
- // Helper to set Resimplify and return change indication.
- bool requestResimplify() {
- Resimplify = true;
- return true;
- }
-};
-
-} // end anonymous namespace
-
-/// Return true if it is safe to merge these two
-/// terminator instructions together.
-static bool
-SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
- SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
- if (SI1 == SI2)
- return false; // Can't merge with self!
-
- // It is not safe to merge these two switch instructions if they have a common
- // successor, and if that successor has a PHI node, and if *that* PHI node has
- // conflicting incoming values from the two switch blocks.
- BasicBlock *SI1BB = SI1->getParent();
- BasicBlock *SI2BB = SI2->getParent();
-
- SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
- bool Fail = false;
- for (BasicBlock *Succ : successors(SI2BB))
- if (SI1Succs.count(Succ))
- for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- if (PN->getIncomingValueForBlock(SI1BB) !=
- PN->getIncomingValueForBlock(SI2BB)) {
- if (FailBlocks)
- FailBlocks->insert(Succ);
- Fail = true;
- }
- }
-
- return !Fail;
-}
-
-/// Update PHI nodes in Succ to indicate that there will now be entries in it
-/// from the 'NewPred' block. The values that will be flowing into the PHI nodes
-/// will be the same as those coming in from ExistPred, an existing predecessor
-/// of Succ.
-static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
- BasicBlock *ExistPred,
- MemorySSAUpdater *MSSAU = nullptr) {
- for (PHINode &PN : Succ->phis())
- PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);
- if (MSSAU)
- if (auto *MPhi = MSSAU->getMemorySSA()->getMemoryAccess(Succ))
- MPhi->addIncoming(MPhi->getIncomingValueForBlock(ExistPred), NewPred);
-}
-
-/// Compute an abstract "cost" of speculating the given instruction,
-/// which is assumed to be safe to speculate. TCC_Free means cheap,
-/// TCC_Basic means less cheap, and TCC_Expensive means prohibitively
-/// expensive.
-static unsigned ComputeSpeculationCost(const User *I,
- const TargetTransformInfo &TTI) {
- assert(isSafeToSpeculativelyExecute(I) &&
- "Instruction is not safe to speculatively execute!");
- return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
-}
-
-/// If we have a merge point of an "if condition" as accepted above,
-/// return true if the specified value dominates the block. We
-/// don't handle the true generality of domination here, just a special case
-/// which works well enough for us.
-///
-/// If AggressiveInsts is non-null, and if V does not dominate BB, we check to
-/// see if V (which must be an instruction) and its recursive operands
-/// that do not dominate BB have a combined cost lower than CostRemaining and
-/// are non-trapping. If both are true, the instruction is inserted into the
-/// set and true is returned.
-///
-/// The cost for most non-trapping instructions is defined as 1 except for
-/// Select whose cost is 2.
-///
-/// After this function returns, CostRemaining is decreased by the cost of
-/// V plus its non-dominating operands. If that cost is greater than
-/// CostRemaining, false is returned and CostRemaining is undefined.
-static bool DominatesMergePoint(Value *V, BasicBlock *BB,
- SmallPtrSetImpl<Instruction *> &AggressiveInsts,
- int &BudgetRemaining,
- const TargetTransformInfo &TTI,
- unsigned Depth = 0) {
- // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
- // so limit the recursion depth.
- // TODO: While this recursion limit does prevent pathological behavior, it
- // would be better to track visited instructions to avoid cycles.
- if (Depth == MaxSpeculationDepth)
- return false;
-
- Instruction *I = dyn_cast<Instruction>(V);
- if (!I) {
- // Non-instructions all dominate instructions, but not all constantexprs
- // can be executed unconditionally.
- if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
- if (C->canTrap())
- return false;
- return true;
- }
- BasicBlock *PBB = I->getParent();
-
- // We don't want to allow weird loops that might have the "if condition" in
- // the bottom of this block.
- if (PBB == BB)
- return false;
-
- // If this instruction is defined in a block that contains an unconditional
- // branch to BB, then it must be in the 'conditional' part of the "if
- // statement". If not, it definitely dominates the region.
- BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
- if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
- return true;
-
- // If we have seen this instruction before, don't count it again.
- if (AggressiveInsts.count(I))
- return true;
-
- // Okay, it looks like the instruction IS in the "condition". Check to
- // see if it's a cheap instruction to unconditionally compute, and if it
- // only uses stuff defined outside of the condition. If so, hoist it out.
- if (!isSafeToSpeculativelyExecute(I))
- return false;
-
- BudgetRemaining -= ComputeSpeculationCost(I, TTI);
-
- // Allow exactly one instruction to be speculated regardless of its cost
- // (as long as it is safe to do so).
- // This is intended to flatten the CFG even if the instruction is a division
- // or other expensive operation. The speculation of an expensive instruction
- // is expected to be undone in CodeGenPrepare if the speculation has not
- // enabled further IR optimizations.
- if (BudgetRemaining < 0 &&
- (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0))
- return false;
-
- // Okay, we can only really hoist these out if their operands do
- // not take us over the cost threshold.
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
- if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI,
- Depth + 1))
- return false;
- // Okay, it's safe to do this! Remember this instruction.
- AggressiveInsts.insert(I);
- return true;
-}
-
-/// Extract ConstantInt from value, looking through IntToPtr
-/// and PointerNullValue. Return NULL if value is not a constant int.
-static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
- // Normal constant int.
- ConstantInt *CI = dyn_cast<ConstantInt>(V);
- if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
- return CI;
-
- // This is some kind of pointer constant. Turn it into a pointer-sized
- // ConstantInt if possible.
- IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
-
- // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
- if (isa<ConstantPointerNull>(V))
- return ConstantInt::get(PtrTy, 0);
-
- // IntToPtr const int.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
- if (CE->getOpcode() == Instruction::IntToPtr)
- if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
- // The constant is very likely to have the right type already.
- if (CI->getType() == PtrTy)
- return CI;
- else
- return cast<ConstantInt>(
- ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
- }
- return nullptr;
-}
-
-namespace {
-
-/// Given a chain of or (||) or and (&&) comparisons of a value against a
-/// constant, this will try to recover the information required for a switch
-/// structure.
-/// It will depth-first traverse the chain of comparisons, looking for patterns
-/// like %a == 12 or %a < 4, and combine them to produce a set of integers
-/// representing the different cases for the switch.
-/// Note that if the chain is composed of '||' it will build the set of elements
-/// that match the comparisons (i.e. any of these values satisfies the chain),
-/// while for a chain of '&&' it will build the set of elements that make the
-/// test fail.
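-/// For example, gathering (%a == 2 || %a == 7 || %a == 9) produces
-/// CompValue = %a and Vals = {2, 7, 9}.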
-struct ConstantComparesGatherer {
- const DataLayout &DL;
-
- /// Value found for the switch comparison
- Value *CompValue = nullptr;
-
- /// Extra clause to be checked before the switch
- Value *Extra = nullptr;
-
- /// Set of integers to match in switch
- SmallVector<ConstantInt *, 8> Vals;
-
- /// Number of comparisons matched in the and/or chain
- unsigned UsedICmps = 0;
-
- /// Construct and compute the result for the comparison instruction Cond
- ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
- gather(Cond);
- }
-
- ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
- ConstantComparesGatherer &
- operator=(const ConstantComparesGatherer &) = delete;
-
-private:
- /// Try to set the current value used for the comparison; it succeeds only if
- /// it wasn't set before or if the new value is the same as the old one
- bool setValueOnce(Value *NewVal) {
- if (CompValue && CompValue != NewVal)
- return false;
- CompValue = NewVal;
- return (CompValue != nullptr);
- }
-
- /// Try to match Instruction "I" as a comparison against a constant and
- /// populate the array Vals with the set of values that match (or do not
- /// match depending on isEQ).
- /// Return false on failure. On success, the Value the comparison matched
- /// against is placed in CompValue.
- /// If CompValue is already set, the function is expected to fail if a match
- /// is found but the value compared to is different.
- bool matchInstruction(Instruction *I, bool isEQ) {
- // If this is an icmp against a constant, handle this as one of the cases.
- ICmpInst *ICI;
- ConstantInt *C;
- if (!((ICI = dyn_cast<ICmpInst>(I)) &&
- (C = GetConstantInt(I->getOperand(1), DL)))) {
- return false;
- }
-
- Value *RHSVal;
- const APInt *RHSC;
-
- // Pattern match a special case
- // (x & ~2^z) == y --> x == y || x == y|2^z
- // This undoes a transformation done by instcombine to fuse 2 compares.
- if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
- // It's a little bit hard to see why the following transformations are
- // correct. Here is a CVC3 program to verify them for 64-bit values:
-
- /*
- ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
- x : BITVECTOR(64);
- y : BITVECTOR(64);
- z : BITVECTOR(64);
- mask : BITVECTOR(64) = BVSHL(ONE, z);
- QUERY( (y & ~mask = y) =>
- ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
- );
- QUERY( (y | mask = y) =>
- ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
- );
- */
-
- // Please note that each pattern must be a dual implication (<--> or
- // iff). One directional implication can create spurious matches. If the
- // implication is only one-way, an unsatisfiable condition on the left
- // side can imply a satisfiable condition on the right side. Dual
- // implication ensures that satisfiable conditions are transformed to
- // other satisfiable conditions and unsatisfiable conditions are
- // transformed to other unsatisfiable conditions.
-
- // Here is a concrete example of an unsatisfiable condition on the left
- // implying a satisfiable condition on the right:
- //
- // mask = (1 << z)
- // (x & ~mask) == y --> (x == y || x == (y | mask))
- //
- // Substituting y = 3, z = 0 yields:
- // (x & -2) == 3 --> (x == 3 || x == 2)
-
- // Pattern match a special case:
- /*
- QUERY( (y & ~mask = y) =>
- ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
- );
- */
- if (match(ICI->getOperand(0),
- m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
- APInt Mask = ~*RHSC;
- if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(RHSVal))
- return false;
-
- Vals.push_back(C);
- Vals.push_back(
- ConstantInt::get(C->getContext(),
- C->getValue() | Mask));
- UsedICmps++;
- return true;
- }
- }
-
- // Pattern match a special case:
- /*
- QUERY( (y | mask = y) =>
- ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
- );
- */
- if (match(ICI->getOperand(0),
- m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
- APInt Mask = *RHSC;
- if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(RHSVal))
- return false;
-
- Vals.push_back(C);
- Vals.push_back(ConstantInt::get(C->getContext(),
- C->getValue() & ~Mask));
- UsedICmps++;
- return true;
- }
- }
-
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(ICI->getOperand(0)))
- return false;
-
- UsedICmps++;
- Vals.push_back(C);
- return ICI->getOperand(0);
- }
-
- // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
- ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
- ICI->getPredicate(), C->getValue());
-
- // Shift the range if the compare is fed by an add. This is the range
- // compare idiom as emitted by instcombine.
- Value *CandidateVal = I->getOperand(0);
- if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
- Span = Span.subtract(*RHSC);
- CandidateVal = RHSVal;
- }
-
- // If this is an and/!= check, then we are looking to build the set of
- // values that *don't* pass the and chain, i.e. to turn "x ugt 2" into
- // x != 0 && x != 1 && x != 2.
- if (!isEQ)
- Span = Span.inverse();
-
- // If there are a ton of values, we don't want to make a ginormous switch.
- if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
- return false;
- }
-
- // If we already have a value for the switch, it has to match!
- if (!setValueOnce(CandidateVal))
- return false;
-
- // Add all values from the range to the set
- for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
- Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
-
- UsedICmps++;
- return true;
- }
-
- /// Given a potentially 'or'd or 'and'd together collection of icmp
- /// eq/ne/lt/gt instructions that compare a value against a constant, extract
- /// the value being compared, and stick the list constants into the Vals
- /// vector.
- /// One "Extra" case is allowed to differ from the other.
- void gather(Value *V) {
+ bool run(BasicBlock *BB);
+
+ // Helper to set Resimplify and return change indication.
+ bool requestResimplify() {
+ Resimplify = true;
+ return true;
+ }
+};
+
+} // end anonymous namespace
+
+/// Return true if it is safe to merge these two
+/// terminator instructions together.
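+/// If FailBlocks is provided, any common successors whose PHI nodes have
+/// conflicting incoming values from the two blocks are collected into it.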
+static bool
+SafeToMergeTerminators(Instruction *SI1, Instruction *SI2,
+ SmallSetVector<BasicBlock *, 4> *FailBlocks = nullptr) {
+ if (SI1 == SI2)
+ return false; // Can't merge with self!
+
+ // It is not safe to merge these two switch instructions if they have a common
+ // successor, and if that successor has a PHI node, and if *that* PHI node has
+ // conflicting incoming values from the two switch blocks.
+ BasicBlock *SI1BB = SI1->getParent();
+ BasicBlock *SI2BB = SI2->getParent();
+
+ SmallPtrSet<BasicBlock *, 16> SI1Succs(succ_begin(SI1BB), succ_end(SI1BB));
+ bool Fail = false;
+ for (BasicBlock *Succ : successors(SI2BB))
+ if (SI1Succs.count(Succ))
+ for (BasicBlock::iterator BBI = Succ->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ if (PN->getIncomingValueForBlock(SI1BB) !=
+ PN->getIncomingValueForBlock(SI2BB)) {
+ if (FailBlocks)
+ FailBlocks->insert(Succ);
+ Fail = true;
+ }
+ }
+
+ return !Fail;
+}
+
+/// Update PHI nodes in Succ to indicate that there will now be entries in it
+/// from the 'NewPred' block. The values that will be flowing into the PHI nodes
+/// will be the same as those coming in from ExistPred, an existing predecessor
+/// of Succ.
+static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred,
+ BasicBlock *ExistPred,
+ MemorySSAUpdater *MSSAU = nullptr) {
+ for (PHINode &PN : Succ->phis())
+ PN.addIncoming(PN.getIncomingValueForBlock(ExistPred), NewPred);
+ if (MSSAU)
+ if (auto *MPhi = MSSAU->getMemorySSA()->getMemoryAccess(Succ))
+ MPhi->addIncoming(MPhi->getIncomingValueForBlock(ExistPred), NewPred);
+}
+
+/// Compute an abstract "cost" of speculating the given instruction,
+/// which is assumed to be safe to speculate. TCC_Free means cheap,
+/// TCC_Basic means less cheap, and TCC_Expensive means prohibitively
+/// expensive.
+static unsigned ComputeSpeculationCost(const User *I,
+ const TargetTransformInfo &TTI) {
+ assert(isSafeToSpeculativelyExecute(I) &&
+ "Instruction is not safe to speculatively execute!");
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
+}
+
+/// If we have a merge point of an "if condition" as accepted above,
+/// return true if the specified value dominates the block. We
+/// don't handle the true generality of domination here, just a special case
+/// which works well enough for us.
+///
+/// If V does not dominate BB, we check to see if V (which must be an
+/// instruction) and its recursive operands that do not dominate BB have a
+/// combined cost lower than BudgetRemaining and are non-trapping. If both are
+/// true, the instruction is inserted into the set and true is returned.
+///
+/// The cost of each instruction is estimated with ComputeSpeculationCost,
+/// i.e. its TTI speculation cost.
+///
+/// After this function returns, BudgetRemaining is decreased by the cost of
+/// V plus its non-dominating operands. If the budget is exhausted, false is
+/// returned and BudgetRemaining is undefined, except that a single expensive
+/// instruction may still be speculated at the top level.
+static bool DominatesMergePoint(Value *V, BasicBlock *BB,
+ SmallPtrSetImpl<Instruction *> &AggressiveInsts,
+ int &BudgetRemaining,
+ const TargetTransformInfo &TTI,
+ unsigned Depth = 0) {
+ // It is possible to hit a zero-cost cycle (phi/gep instructions for example),
+ // so limit the recursion depth.
+ // TODO: While this recursion limit does prevent pathological behavior, it
+ // would be better to track visited instructions to avoid cycles.
+ if (Depth == MaxSpeculationDepth)
+ return false;
+
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I) {
+ // Non-instructions all dominate instructions, but not all constantexprs
+ // can be executed unconditionally.
+ if (ConstantExpr *C = dyn_cast<ConstantExpr>(V))
+ if (C->canTrap())
+ return false;
+ return true;
+ }
+ BasicBlock *PBB = I->getParent();
+
+ // We don't want to allow weird loops that might have the "if condition" in
+ // the bottom of this block.
+ if (PBB == BB)
+ return false;
+
+ // If this instruction is defined in a block that contains an unconditional
+ // branch to BB, then it must be in the 'conditional' part of the "if
+ // statement". If not, it definitely dominates the region.
+ BranchInst *BI = dyn_cast<BranchInst>(PBB->getTerminator());
+ if (!BI || BI->isConditional() || BI->getSuccessor(0) != BB)
+ return true;
+
+ // If we have seen this instruction before, don't count it again.
+ if (AggressiveInsts.count(I))
+ return true;
+
+ // Okay, it looks like the instruction IS in the "condition". Check to
+ // see if it's a cheap instruction to unconditionally compute, and if it
+ // only uses stuff defined outside of the condition. If so, hoist it out.
+ if (!isSafeToSpeculativelyExecute(I))
+ return false;
+
+ BudgetRemaining -= ComputeSpeculationCost(I, TTI);
+
+ // Allow exactly one instruction to be speculated regardless of its cost
+ // (as long as it is safe to do so).
+ // This is intended to flatten the CFG even if the instruction is a division
+ // or other expensive operation. The speculation of an expensive instruction
+ // is expected to be undone in CodeGenPrepare if the speculation has not
+ // enabled further IR optimizations.
+ if (BudgetRemaining < 0 &&
+ (!SpeculateOneExpensiveInst || !AggressiveInsts.empty() || Depth > 0))
+ return false;
+
+ // Okay, we can only really hoist these out if their operands do
+ // not take us over the cost threshold.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i)
+ if (!DominatesMergePoint(*i, BB, AggressiveInsts, BudgetRemaining, TTI,
+ Depth + 1))
+ return false;
+ // Okay, it's safe to do this! Remember this instruction.
+ AggressiveInsts.insert(I);
+ return true;
+}
+
+/// Extract ConstantInt from value, looking through IntToPtr
+/// and PointerNullValue. Return NULL if value is not a constant int.
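+///
+/// For example, with 64-bit pointers, "i8* null" is mapped to "i64 0" and
+/// "inttoptr (i64 42 to i8*)" is mapped to "i64 42".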
+static ConstantInt *GetConstantInt(Value *V, const DataLayout &DL) {
+ // Normal constant int.
+ ConstantInt *CI = dyn_cast<ConstantInt>(V);
+ if (CI || !isa<Constant>(V) || !V->getType()->isPointerTy())
+ return CI;
+
+ // This is some kind of pointer constant. Turn it into a pointer-sized
+ // ConstantInt if possible.
+ IntegerType *PtrTy = cast<IntegerType>(DL.getIntPtrType(V->getType()));
+
+ // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
+ if (isa<ConstantPointerNull>(V))
+ return ConstantInt::get(PtrTy, 0);
+
+ // IntToPtr const int.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+ if (CE->getOpcode() == Instruction::IntToPtr)
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(CE->getOperand(0))) {
+ // The constant is very likely to have the right type already.
+ if (CI->getType() == PtrTy)
+ return CI;
+ else
+ return cast<ConstantInt>(
+ ConstantExpr::getIntegerCast(CI, PtrTy, /*isSigned=*/false));
+ }
+ return nullptr;
+}
+
+namespace {
+
+/// Given a chain of or (||) or and (&&) comparison of a value against a
+/// constant, this will try to recover the information required for a switch
+/// structure.
+/// It will depth-first traverse the chain of comparisons, looking for patterns
+/// like %a == 12 or %a < 4, and combine them to produce a set of integers
+/// representing the different cases for the switch.
+/// Note that if the chain is composed of '||' it will build the set of elements
+/// that match the comparisons (i.e. any of these values satisfies the chain),
+/// while for a chain of '&&' it will build the set of elements that make the
+/// test fail.
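+///
+/// For example, gathering the chain
+///   %c1 = icmp eq i32 %a, 12
+///   %c2 = icmp eq i32 %a, 42
+///   %or = or i1 %c1, %c2
+/// yields CompValue = %a, Vals = {12, 42} and UsedICmps = 2.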
+struct ConstantComparesGatherer {
+ const DataLayout &DL;
+
+ /// Value found for the switch comparison
+ Value *CompValue = nullptr;
+
+ /// Extra clause to be checked before the switch
+ Value *Extra = nullptr;
+
+ /// Set of integers to match in switch
+ SmallVector<ConstantInt *, 8> Vals;
+
+ /// Number of comparisons matched in the and/or chain
+ unsigned UsedICmps = 0;
+
+ /// Construct and compute the result for the comparison instruction Cond
+ ConstantComparesGatherer(Instruction *Cond, const DataLayout &DL) : DL(DL) {
+ gather(Cond);
+ }
+
+ ConstantComparesGatherer(const ConstantComparesGatherer &) = delete;
+ ConstantComparesGatherer &
+ operator=(const ConstantComparesGatherer &) = delete;
+
+private:
+ /// Try to set the current value used for the comparison; it succeeds only if
+ /// it wasn't set before or if the new value is the same as the old one.
+ bool setValueOnce(Value *NewVal) {
+ if (CompValue && CompValue != NewVal)
+ return false;
+ CompValue = NewVal;
+ return (CompValue != nullptr);
+ }
+
+ /// Try to match Instruction "I" as a comparison against a constant and
+ /// populate the array Vals with the set of values that match (or do not
+ /// match depending on isEQ).
+ /// Return false on failure. On success, the Value the comparison matched
+ /// against is placed in CompValue.
+ /// If CompValue is already set, the function is expected to fail if a match
+ /// is found but the value compared to is different.
+ bool matchInstruction(Instruction *I, bool isEQ) {
+ // If this is an icmp against a constant, handle this as one of the cases.
+ ICmpInst *ICI;
+ ConstantInt *C;
+ if (!((ICI = dyn_cast<ICmpInst>(I)) &&
+ (C = GetConstantInt(I->getOperand(1), DL)))) {
+ return false;
+ }
+
+ Value *RHSVal;
+ const APInt *RHSC;
+
+ // Pattern match a special case
+ // (x & ~2^z) == y --> x == y || x == y|2^z
+ // This undoes a transformation done by instcombine to fuse 2 compares.
+ if (ICI->getPredicate() == (isEQ ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE)) {
+ // It's a little bit hard to see why the following transformations are
+ // correct. Here is a CVC3 program to verify them for 64-bit values:
+
+ /*
+ ONE : BITVECTOR(64) = BVZEROEXTEND(0bin1, 63);
+ x : BITVECTOR(64);
+ y : BITVECTOR(64);
+ z : BITVECTOR(64);
+ mask : BITVECTOR(64) = BVSHL(ONE, z);
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+
+ // Please note that each pattern must be a dual implication (<--> or
+ // iff). One directional implication can create spurious matches. If the
+ // implication is only one-way, an unsatisfiable condition on the left
+ // side can imply a satisfiable condition on the right side. Dual
+ // implication ensures that satisfiable conditions are transformed to
+ // other satisfiable conditions and unsatisfiable conditions are
+ // transformed to other unsatisfiable conditions.
+
+ // Here is a concrete example of an unsatisfiable condition on the left
+ // implying a satisfiable condition on the right:
+ //
+ // mask = (1 << z)
+ // (x & ~mask) == y --> (x == y || x == (y | mask))
+ //
+ // Substituting y = 3, z = 0 yields:
+ // (x & -2) == 3 --> (x == 3 || x == 2)
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y & ~mask = y) =>
+ ((x & ~mask = y) <=> (x = y OR x = (y | mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_And(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = ~*RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() & ~Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(
+ ConstantInt::get(C->getContext(),
+ C->getValue() | Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // Pattern match a special case:
+ /*
+ QUERY( (y | mask = y) =>
+ ((x | mask = y) <=> (x = y OR x = (y & ~mask)))
+ );
+ */
+ if (match(ICI->getOperand(0),
+ m_Or(m_Value(RHSVal), m_APInt(RHSC)))) {
+ APInt Mask = *RHSC;
+ if (Mask.isPowerOf2() && (C->getValue() | Mask) == C->getValue()) {
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(RHSVal))
+ return false;
+
+ Vals.push_back(C);
+ Vals.push_back(ConstantInt::get(C->getContext(),
+ C->getValue() & ~Mask));
+ UsedICmps++;
+ return true;
+ }
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(ICI->getOperand(0)))
+ return false;
+
+ UsedICmps++;
+ Vals.push_back(C);
+ return ICI->getOperand(0);
+ }
+
+ // If we have "x ult 3", for example, then we can add 0,1,2 to the set.
+ ConstantRange Span = ConstantRange::makeAllowedICmpRegion(
+ ICI->getPredicate(), C->getValue());
+
+ // Shift the range if the compare is fed by an add. This is the range
+ // compare idiom as emitted by instcombine.
+ Value *CandidateVal = I->getOperand(0);
+ if (match(I->getOperand(0), m_Add(m_Value(RHSVal), m_APInt(RHSC)))) {
+ Span = Span.subtract(*RHSC);
+ CandidateVal = RHSVal;
+ }
+
+ // If this is an and/!= check, then we are looking to build the set of
+ // values that *don't* pass the and chain, i.e. to turn "x ugt 2" into
+ // x != 0 && x != 1 && x != 2.
+ if (!isEQ)
+ Span = Span.inverse();
+
+ // If there are a ton of values, we don't want to make a ginormous switch.
+ if (Span.isSizeLargerThan(8) || Span.isEmptySet()) {
+ return false;
+ }
+
+ // If we already have a value for the switch, it has to match!
+ if (!setValueOnce(CandidateVal))
+ return false;
+
+ // Add all values from the range to the set
+ for (APInt Tmp = Span.getLower(); Tmp != Span.getUpper(); ++Tmp)
+ Vals.push_back(ConstantInt::get(I->getContext(), Tmp));
+
+ UsedICmps++;
+ return true;
+ }
+
+ /// Given a potentially 'or'd or 'and'd together collection of icmp
+ /// eq/ne/lt/gt instructions that compare a value against a constant, extract
+ /// the value being compared, and stick the list constants into the Vals
+ /// vector.
+ /// One "Extra" case is allowed to differ from the other.
+ void gather(Value *V) {
bool isEQ = match(V, m_LogicalOr(m_Value(), m_Value()));
-
- // Keep a stack (SmallVector for efficiency) for depth-first traversal
- SmallVector<Value *, 8> DFT;
- SmallPtrSet<Value *, 8> Visited;
-
- // Initialize
- Visited.insert(V);
- DFT.push_back(V);
-
- while (!DFT.empty()) {
- V = DFT.pop_back_val();
-
- if (Instruction *I = dyn_cast<Instruction>(V)) {
- // If it is a || (or && depending on isEQ), process the operands.
+
+ // Keep a stack (SmallVector for efficiency) for depth-first traversal
+ SmallVector<Value *, 8> DFT;
+ SmallPtrSet<Value *, 8> Visited;
+
+ // Initialize
+ Visited.insert(V);
+ DFT.push_back(V);
+
+ while (!DFT.empty()) {
+ V = DFT.pop_back_val();
+
+ if (Instruction *I = dyn_cast<Instruction>(V)) {
+ // If it is a || (or && depending on isEQ), process the operands.
Value *Op0, *Op1;
if (isEQ ? match(I, m_LogicalOr(m_Value(Op0), m_Value(Op1)))
: match(I, m_LogicalAnd(m_Value(Op0), m_Value(Op1)))) {
@@ -678,245 +678,245 @@ private:
if (Visited.insert(Op0).second)
DFT.push_back(Op0);
- continue;
- }
-
- // Try to match the current instruction
- if (matchInstruction(I, isEQ))
- // Match succeed, continue the loop
- continue;
- }
-
- // One element of the sequence of || (or &&) could not be match as a
- // comparison against the same value as the others.
- // We allow only one "Extra" case to be checked before the switch
- if (!Extra) {
- Extra = V;
- continue;
- }
- // Failed to parse a proper sequence, abort now
- CompValue = nullptr;
- break;
- }
- }
-};
-
-} // end anonymous namespace
-
-static void EraseTerminatorAndDCECond(Instruction *TI,
- MemorySSAUpdater *MSSAU = nullptr) {
- Instruction *Cond = nullptr;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Cond = dyn_cast<Instruction>(SI->getCondition());
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- if (BI->isConditional())
- Cond = dyn_cast<Instruction>(BI->getCondition());
- } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
- Cond = dyn_cast<Instruction>(IBI->getAddress());
- }
-
- TI->eraseFromParent();
- if (Cond)
- RecursivelyDeleteTriviallyDeadInstructions(Cond, nullptr, MSSAU);
-}
-
-/// Return true if the specified terminator checks
-/// to see if a value is equal to constant integer value.
-Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
- Value *CV = nullptr;
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- // Do not permit merging of large switch instructions into their
- // predecessors unless there is only one predecessor.
- if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
- CV = SI->getCondition();
- } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
- if (BI->isConditional() && BI->getCondition()->hasOneUse())
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
- if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
- CV = ICI->getOperand(0);
- }
-
- // Unwrap any lossless ptrtoint cast.
- if (CV) {
- if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
- Value *Ptr = PTII->getPointerOperand();
- if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
- CV = Ptr;
- }
- }
- return CV;
-}
-
-/// Given a value comparison instruction,
-/// decode all of the 'cases' that it represents and return the 'default' block.
-BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
- Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
- if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
- Cases.reserve(SI->getNumCases());
- for (auto Case : SI->cases())
- Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
- Case.getCaseSuccessor()));
- return SI->getDefaultDest();
- }
-
- BranchInst *BI = cast<BranchInst>(TI);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
- Cases.push_back(ValueEqualityComparisonCase(
- GetConstantInt(ICI->getOperand(1), DL), Succ));
- return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
-}
-
-/// Given a vector of bb/value pairs, remove any entries
-/// in the list that match the specified block.
-static void
-EliminateBlockCases(BasicBlock *BB,
- std::vector<ValueEqualityComparisonCase> &Cases) {
+ continue;
+ }
+
+ // Try to match the current instruction
+ if (matchInstruction(I, isEQ))
+ // Match succeeded, continue the loop
+ continue;
+ }
+
+ // One element of the sequence of || (or &&) could not be matched as a
+ // comparison against the same value as the others.
+ // We allow only one "Extra" case to be checked before the switch
+ if (!Extra) {
+ Extra = V;
+ continue;
+ }
+ // Failed to parse a proper sequence, abort now
+ CompValue = nullptr;
+ break;
+ }
+ }
+};
+
+} // end anonymous namespace
+
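+/// Erase the terminator TI and, if its condition (or indirectbr address) has
+/// become trivially dead, recursively delete it as well.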
+static void EraseTerminatorAndDCECond(Instruction *TI,
+ MemorySSAUpdater *MSSAU = nullptr) {
+ Instruction *Cond = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cond = dyn_cast<Instruction>(SI->getCondition());
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional())
+ Cond = dyn_cast<Instruction>(BI->getCondition());
+ } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(TI)) {
+ Cond = dyn_cast<Instruction>(IBI->getAddress());
+ }
+
+ TI->eraseFromParent();
+ if (Cond)
+ RecursivelyDeleteTriviallyDeadInstructions(Cond, nullptr, MSSAU);
+}
+
+/// If the specified terminator checks whether a value is equal to a constant
+/// integer, return that value; otherwise return null.
+Value *SimplifyCFGOpt::isValueEqualityComparison(Instruction *TI) {
+ Value *CV = nullptr;
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ // Do not permit merging of large switch instructions into their
+ // predecessors unless there is only one predecessor.
+ if (!SI->getParent()->hasNPredecessorsOrMore(128 / SI->getNumSuccessors()))
+ CV = SI->getCondition();
+ } else if (BranchInst *BI = dyn_cast<BranchInst>(TI))
+ if (BI->isConditional() && BI->getCondition()->hasOneUse())
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(BI->getCondition())) {
+ if (ICI->isEquality() && GetConstantInt(ICI->getOperand(1), DL))
+ CV = ICI->getOperand(0);
+ }
+
+ // Unwrap any lossless ptrtoint cast.
+ if (CV) {
+ if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) {
+ Value *Ptr = PTII->getPointerOperand();
+ if (PTII->getType() == DL.getIntPtrType(Ptr->getType()))
+ CV = Ptr;
+ }
+ }
+ return CV;
+}
+
+/// Given a value comparison instruction,
+/// decode all of the 'cases' that it represents and return the 'default' block.
+BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
+ Instruction *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
+ if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+ Cases.reserve(SI->getNumCases());
+ for (auto Case : SI->cases())
+ Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+ Case.getCaseSuccessor()));
+ return SI->getDefaultDest();
+ }
+
+ BranchInst *BI = cast<BranchInst>(TI);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ BasicBlock *Succ = BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_NE);
+ Cases.push_back(ValueEqualityComparisonCase(
+ GetConstantInt(ICI->getOperand(1), DL), Succ));
+ return BI->getSuccessor(ICI->getPredicate() == ICmpInst::ICMP_EQ);
+}
+
+/// Given a vector of bb/value pairs, remove any entries
+/// in the list that match the specified block.
+static void
+EliminateBlockCases(BasicBlock *BB,
+ std::vector<ValueEqualityComparisonCase> &Cases) {
llvm::erase_value(Cases, BB);
-}
-
-/// Return true if there are any keys in C1 that exist in C2 as well.
-static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
- std::vector<ValueEqualityComparisonCase> &C2) {
- std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
-
- // Make V1 be smaller than V2.
- if (V1->size() > V2->size())
- std::swap(V1, V2);
-
- if (V1->empty())
- return false;
- if (V1->size() == 1) {
- // Just scan V2.
- ConstantInt *TheVal = (*V1)[0].Value;
- for (unsigned i = 0, e = V2->size(); i != e; ++i)
- if (TheVal == (*V2)[i].Value)
- return true;
- }
-
- // Otherwise, just sort both lists and compare element by element.
- array_pod_sort(V1->begin(), V1->end());
- array_pod_sort(V2->begin(), V2->end());
- unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
- while (i1 != e1 && i2 != e2) {
- if ((*V1)[i1].Value == (*V2)[i2].Value)
- return true;
- if ((*V1)[i1].Value < (*V2)[i2].Value)
- ++i1;
- else
- ++i2;
- }
- return false;
-}
-
-// Set branch weights on SwitchInst. This sets the metadata if there is at
-// least one non-zero weight.
-static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
- // Check that there is at least one non-zero weight. Otherwise, pass
- // nullptr to setMetadata which will erase the existing metadata.
- MDNode *N = nullptr;
- if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
- N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
- SI->setMetadata(LLVMContext::MD_prof, N);
-}
-
-// Similar to the above, but for branch and select instructions that take
-// exactly 2 weights.
-static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
- uint32_t FalseWeight) {
- assert(isa<BranchInst>(I) || isa<SelectInst>(I));
- // Check that there is at least one non-zero weight. Otherwise, pass
- // nullptr to setMetadata which will erase the existing metadata.
- MDNode *N = nullptr;
- if (TrueWeight || FalseWeight)
- N = MDBuilder(I->getParent()->getContext())
- .createBranchWeights(TrueWeight, FalseWeight);
- I->setMetadata(LLVMContext::MD_prof, N);
-}
-
-/// If TI is known to be a terminator instruction and its block is known to
-/// only have a single predecessor block, check to see if that predecessor is
-/// also a value comparison with the same value, and if that comparison
-/// determines the outcome of this comparison. If so, simplify TI. This does a
-/// very limited form of jump threading.
-bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
- Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
- Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
- if (!PredVal)
- return false; // Not a value comparison in predecessor.
-
- Value *ThisVal = isValueEqualityComparison(TI);
- assert(ThisVal && "This isn't a value comparison!!");
- if (ThisVal != PredVal)
- return false; // Different predicates.
-
- // TODO: Preserve branch weight metadata, similarly to how
- // FoldValueComparisonIntoPredecessors preserves it.
-
- // Find out information about when control will move from Pred to TI's block.
- std::vector<ValueEqualityComparisonCase> PredCases;
- BasicBlock *PredDef =
- GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
- EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
-
- // Find information about how control leaves this block.
- std::vector<ValueEqualityComparisonCase> ThisCases;
- BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
- EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
-
- // If TI's block is the default block from Pred's comparison, potentially
- // simplify TI based on this knowledge.
- if (PredDef == TI->getParent()) {
- // If we are here, we know that the value is none of those cases listed in
- // PredCases. If there are any cases in ThisCases that are in PredCases, we
- // can simplify TI.
- if (!ValuesOverlap(PredCases, ThisCases))
- return false;
-
- if (isa<BranchInst>(TI)) {
- // Okay, one of the successors of this condbr is dead. Convert it to a
- // uncond br.
- assert(ThisCases.size() == 1 && "Branch can only have one case!");
- // Insert the new branch.
- Instruction *NI = Builder.CreateBr(ThisDef);
- (void)NI;
-
- // Remove PHI node entries for the dead edge.
+}
+
+/// Return true if there are any keys in C1 that exist in C2 as well.
+static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
+ std::vector<ValueEqualityComparisonCase> &C2) {
+ std::vector<ValueEqualityComparisonCase> *V1 = &C1, *V2 = &C2;
+
+ // Make V1 be smaller than V2.
+ if (V1->size() > V2->size())
+ std::swap(V1, V2);
+
+ if (V1->empty())
+ return false;
+ if (V1->size() == 1) {
+ // Just scan V2.
+ ConstantInt *TheVal = (*V1)[0].Value;
+ for (unsigned i = 0, e = V2->size(); i != e; ++i)
+ if (TheVal == (*V2)[i].Value)
+ return true;
+ }
+
+ // Otherwise, just sort both lists and compare element by element.
+ array_pod_sort(V1->begin(), V1->end());
+ array_pod_sort(V2->begin(), V2->end());
+ unsigned i1 = 0, i2 = 0, e1 = V1->size(), e2 = V2->size();
+ while (i1 != e1 && i2 != e2) {
+ if ((*V1)[i1].Value == (*V2)[i2].Value)
+ return true;
+ if ((*V1)[i1].Value < (*V2)[i2].Value)
+ ++i1;
+ else
+ ++i2;
+ }
+ return false;
+}
+
+// Set branch weights on SwitchInst. This sets the metadata if there is at
+// least one non-zero weight.
+static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
+ // Check that there is at least one non-zero weight. Otherwise, pass
+ // nullptr to setMetadata which will erase the existing metadata.
+ MDNode *N = nullptr;
+ if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
+ N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
+ SI->setMetadata(LLVMContext::MD_prof, N);
+}
+
+// Similar to the above, but for branch and select instructions that take
+// exactly 2 weights.
+static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
+ uint32_t FalseWeight) {
+ assert(isa<BranchInst>(I) || isa<SelectInst>(I));
+ // Check that there is at least one non-zero weight. Otherwise, pass
+ // nullptr to setMetadata which will erase the existing metadata.
+ MDNode *N = nullptr;
+ if (TrueWeight || FalseWeight)
+ N = MDBuilder(I->getParent()->getContext())
+ .createBranchWeights(TrueWeight, FalseWeight);
+ I->setMetadata(LLVMContext::MD_prof, N);
+}
+
+/// If TI is known to be a terminator instruction and its block is known to
+/// only have a single predecessor block, check to see if that predecessor is
+/// also a value comparison with the same value, and if that comparison
+/// determines the outcome of this comparison. If so, simplify TI. This does a
+/// very limited form of jump threading.
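+///
+/// For example, if Pred ends in "switch i32 %x [... 5 -> other]" and TI's
+/// block is Pred's default destination, %x cannot be 5 here, so a TI of the
+/// form "br (icmp eq i32 %x, 5), %dead, %live" is rewritten to "br %live".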
+bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
+ Instruction *TI, BasicBlock *Pred, IRBuilder<> &Builder) {
+ Value *PredVal = isValueEqualityComparison(Pred->getTerminator());
+ if (!PredVal)
+ return false; // Not a value comparison in predecessor.
+
+ Value *ThisVal = isValueEqualityComparison(TI);
+ assert(ThisVal && "This isn't a value comparison!!");
+ if (ThisVal != PredVal)
+ return false; // Different predicates.
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Find out information about when control will move from Pred to TI's block.
+ std::vector<ValueEqualityComparisonCase> PredCases;
+ BasicBlock *PredDef =
+ GetValueEqualityComparisonCases(Pred->getTerminator(), PredCases);
+ EliminateBlockCases(PredDef, PredCases); // Remove default from cases.
+
+ // Find information about how control leaves this block.
+ std::vector<ValueEqualityComparisonCase> ThisCases;
+ BasicBlock *ThisDef = GetValueEqualityComparisonCases(TI, ThisCases);
+ EliminateBlockCases(ThisDef, ThisCases); // Remove default from cases.
+
+ // If TI's block is the default block from Pred's comparison, potentially
+ // simplify TI based on this knowledge.
+ if (PredDef == TI->getParent()) {
+ // If we are here, we know that the value is none of those cases listed in
+ // PredCases. If there are any cases in ThisCases that are in PredCases, we
+ // can simplify TI.
+ if (!ValuesOverlap(PredCases, ThisCases))
+ return false;
+
+ if (isa<BranchInst>(TI)) {
+ // Okay, one of the successors of this condbr is dead. Convert it to an
+ // uncond br.
+ assert(ThisCases.size() == 1 && "Branch can only have one case!");
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(ThisDef);
+ (void)NI;
+
+ // Remove PHI node entries for the dead edge.
ThisCases[0].Dest->removePredecessor(PredDef);
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
-
- EraseTerminatorAndDCECond(TI);
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorAndDCECond(TI);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Delete, PredDef, ThisCases[0].Dest}});
- return true;
- }
-
- SwitchInstProfUpdateWrapper SI = *cast<SwitchInst>(TI);
- // Okay, TI has cases that are statically dead, prune them away.
- SmallPtrSet<Constant *, 16> DeadCases;
- for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- DeadCases.insert(PredCases[i].Value);
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI);
-
+ return true;
+ }
+
+ SwitchInstProfUpdateWrapper SI = *cast<SwitchInst>(TI);
+ // Okay, TI has cases that are statically dead, prune them away.
+ SmallPtrSet<Constant *, 16> DeadCases;
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ DeadCases.insert(PredCases[i].Value);
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI);
+
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
- --i;
+ for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
+ --i;
auto *Successor = i->getCaseSuccessor();
++NumPerSuccessorCases[Successor];
- if (DeadCases.count(i->getCaseValue())) {
+ if (DeadCases.count(i->getCaseValue())) {
Successor->removePredecessor(PredDef);
- SI.removeCase(i);
+ SI.removeCase(i);
--NumPerSuccessorCases[Successor];
- }
- }
+ }
+ }
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
@@ -925,56 +925,56 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
if (DTU)
DTU->applyUpdates(Updates);
- LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
- return true;
- }
-
- // Otherwise, TI's block must correspond to some matched value. Find out
- // which value (or set of values) this is.
- ConstantInt *TIV = nullptr;
- BasicBlock *TIBB = TI->getParent();
- for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
- if (PredCases[i].Dest == TIBB) {
- if (TIV)
- return false; // Cannot handle multiple values coming to this block.
- TIV = PredCases[i].Value;
- }
- assert(TIV && "No edge from pred to succ?");
-
- // Okay, we found the one constant that our value can be if we get into TI's
- // BB. Find out which successor will unconditionally be branched to.
- BasicBlock *TheRealDest = nullptr;
- for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
- if (ThisCases[i].Value == TIV) {
- TheRealDest = ThisCases[i].Dest;
- break;
- }
-
- // If not handled by any explicit cases, it is handled by the default case.
- if (!TheRealDest)
- TheRealDest = ThisDef;
-
+ LLVM_DEBUG(dbgs() << "Leaving: " << *TI << "\n");
+ return true;
+ }
+
+ // Otherwise, TI's block must correspond to some matched value. Find out
+ // which value (or set of values) this is.
+ ConstantInt *TIV = nullptr;
+ BasicBlock *TIBB = TI->getParent();
+ for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
+ if (PredCases[i].Dest == TIBB) {
+ if (TIV)
+ return false; // Cannot handle multiple values coming to this block.
+ TIV = PredCases[i].Value;
+ }
+ assert(TIV && "No edge from pred to succ?");
+
+ // Okay, we found the one constant that our value can be if we get into TI's
+ // BB. Find out which successor will unconditionally be branched to.
+ BasicBlock *TheRealDest = nullptr;
+ for (unsigned i = 0, e = ThisCases.size(); i != e; ++i)
+ if (ThisCases[i].Value == TIV) {
+ TheRealDest = ThisCases[i].Dest;
+ break;
+ }
+
+ // If not handled by any explicit cases, it is handled by the default case.
+ if (!TheRealDest)
+ TheRealDest = ThisDef;
+
SmallSetVector<BasicBlock *, 2> RemovedSuccs;
- // Remove PHI node entries for dead edges.
- BasicBlock *CheckEdge = TheRealDest;
- for (BasicBlock *Succ : successors(TIBB))
+ // Remove PHI node entries for dead edges.
+ BasicBlock *CheckEdge = TheRealDest;
+ for (BasicBlock *Succ : successors(TIBB))
if (Succ != CheckEdge) {
if (Succ != TheRealDest)
RemovedSuccs.insert(Succ);
- Succ->removePredecessor(TIBB);
+ Succ->removePredecessor(TIBB);
} else
- CheckEdge = nullptr;
-
- // Insert the new branch.
- Instruction *NI = Builder.CreateBr(TheRealDest);
- (void)NI;
-
- LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
- << "Through successor TI: " << *TI << "Leaving: " << *NI
- << "\n");
-
- EraseTerminatorAndDCECond(TI);
+ CheckEdge = nullptr;
+
+ // Insert the new branch.
+ Instruction *NI = Builder.CreateBr(TheRealDest);
+ (void)NI;
+
+ LLVM_DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
+ << "Through successor TI: " << *TI << "Leaving: " << *NI
+ << "\n");
+
+ EraseTerminatorAndDCECond(TI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.reserve(RemovedSuccs.size());
@@ -982,86 +982,86 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
Updates.push_back({DominatorTree::Delete, TIBB, RemovedSucc});
DTU->applyUpdates(Updates);
}
- return true;
-}
-
-namespace {
-
-/// This class implements a stable ordering of constant
-/// integers that does not depend on their address. This is important for
-/// applications that sort ConstantInt's to ensure uniqueness.
-struct ConstantIntOrdering {
- bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
- return LHS->getValue().ult(RHS->getValue());
- }
-};
-
-} // end anonymous namespace
-
-static int ConstantIntSortPredicate(ConstantInt *const *P1,
- ConstantInt *const *P2) {
- const ConstantInt *LHS = *P1;
- const ConstantInt *RHS = *P2;
- if (LHS == RHS)
- return 0;
- return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
-}
-
-static inline bool HasBranchWeights(const Instruction *I) {
- MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
- if (ProfMD && ProfMD->getOperand(0))
- if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
- return MDS->getString().equals("branch_weights");
-
- return false;
-}
-
-/// Get the weights of a given terminator; the default weight is at the front
-/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
-/// metadata.
-static void GetBranchWeights(Instruction *TI,
- SmallVectorImpl<uint64_t> &Weights) {
- MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
- assert(MD);
- for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
- ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
- Weights.push_back(CI->getValue().getZExtValue());
- }
-
- // If TI is a conditional eq, the default case is the false case,
- // and the corresponding branch-weight data is at index 2. We swap the
- // default weight to be the first entry.
- if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
- assert(Weights.size() == 2);
- ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(Weights.front(), Weights.back());
- }
-}
-
-/// Keep halving the weights until all can fit in uint32_t.
-static void FitWeights(MutableArrayRef<uint64_t> Weights) {
- uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
- if (Max > UINT_MAX) {
- unsigned Offset = 32 - countLeadingZeros(Max);
- for (uint64_t &I : Weights)
- I >>= Offset;
- }
-}
-
+ return true;
+}
+
+namespace {
+
+/// This class implements a stable ordering of constant
+/// integers that does not depend on their address. This is important for
+/// applications that sort ConstantInt's to ensure uniqueness.
+struct ConstantIntOrdering {
+ bool operator()(const ConstantInt *LHS, const ConstantInt *RHS) const {
+ return LHS->getValue().ult(RHS->getValue());
+ }
+};
+
+} // end anonymous namespace
+
+static int ConstantIntSortPredicate(ConstantInt *const *P1,
+ ConstantInt *const *P2) {
+ const ConstantInt *LHS = *P1;
+ const ConstantInt *RHS = *P2;
+ if (LHS == RHS)
+ return 0;
+ return LHS->getValue().ult(RHS->getValue()) ? 1 : -1;
+}
+
+static inline bool HasBranchWeights(const Instruction *I) {
+ MDNode *ProfMD = I->getMetadata(LLVMContext::MD_prof);
+ if (ProfMD && ProfMD->getOperand(0))
+ if (MDString *MDS = dyn_cast<MDString>(ProfMD->getOperand(0)))
+ return MDS->getString().equals("branch_weights");
+
+ return false;
+}
+
+/// Get the weights of a given terminator; the default weight is at the front
+/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
+/// metadata.
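+///
+/// For example, for a conditional branch on "icmp eq" with branch weights
+/// {10, 90}, Weights becomes {90, 10}: the false edge is the default case,
+/// so its weight is moved to the front.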
+static void GetBranchWeights(Instruction *TI,
+ SmallVectorImpl<uint64_t> &Weights) {
+ MDNode *MD = TI->getMetadata(LLVMContext::MD_prof);
+ assert(MD);
+ for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+ ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(i));
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+
+ // If TI is a conditional eq, the default case is the false case,
+ // and the corresponding branch-weight data is at index 2. We swap the
+ // default weight to be the first entry.
+ if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+ assert(Weights.size() == 2);
+ ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(Weights.front(), Weights.back());
+ }
+}
+
+/// Keep halving the weights until all can fit in uint32_t.
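+/// For example, if the largest weight is 2^40, every weight is shifted right
+/// by 9 bits (equivalent to halving nine times) so the maximum fits in 32 bits.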
+static void FitWeights(MutableArrayRef<uint64_t> Weights) {
+ uint64_t Max = *std::max_element(Weights.begin(), Weights.end());
+ if (Max > UINT_MAX) {
+ unsigned Offset = 32 - countLeadingZeros(Max);
+ for (uint64_t &I : Weights)
+ I >>= Offset;
+ }
+}
+
static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
BasicBlock *BB, BasicBlock *PredBlock, ValueToValueMapTy &VMap) {
Instruction *PTI = PredBlock->getTerminator();
-
+
// If we have bonus instructions, clone them into the predecessor block.
// Note that there may be multiple predecessor blocks, so we cannot move
// bonus instructions to a predecessor block.
for (Instruction &BonusInst : *BB) {
if (isa<DbgInfoIntrinsic>(BonusInst) || BonusInst.isTerminator())
continue;
-
+
Instruction *NewBonusInst = BonusInst.clone();
-
+
if (PTI->getDebugLoc() != NewBonusInst->getDebugLoc()) {
// Unless the instruction has the same !dbg location as the original
// branch, drop it. When we fold the bonus instructions we want to make
@@ -1069,11 +1069,11 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
// dead code caused by folding dead branches.
NewBonusInst->setDebugLoc(DebugLoc());
}
-
+
RemapInstruction(NewBonusInst, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
VMap[&BonusInst] = NewBonusInst;
-
+
// If we moved a load, we cannot any longer claim any knowledge about
// its potential value. The previous information might have been valid
// only given the branch precondition.
@@ -1081,11 +1081,11 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
// semantics we don't understand. We *can* preserve !annotation, because
// it is tied to the instruction itself, not the value or position.
NewBonusInst->dropUnknownNonDebugMetadata(LLVMContext::MD_annotation);
-
+
PredBlock->getInstList().insert(PTI->getIterator(), NewBonusInst);
NewBonusInst->takeName(&BonusInst);
BonusInst.setName(NewBonusInst->getName() + ".old");
-
+
// Update (liveout) uses of bonus instructions,
// now that the bonus instruction has been cloned into predecessor.
SSAUpdater SSAUpdate;
@@ -1097,26 +1097,26 @@ static void CloneInstructionsIntoPredecessorBlockAndUpdateSSAUses(
SSAUpdate.RewriteUseAfterInsertions(U);
}
}
-
+
bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
Instruction *TI, Value *&CV, Instruction *PTI, IRBuilder<> &Builder) {
BasicBlock *BB = TI->getParent();
BasicBlock *Pred = PTI->getParent();
-
+
std::vector<DominatorTree::UpdateType> Updates;
-
+
// Figure out which 'cases' to copy from SI to PSI.
std::vector<ValueEqualityComparisonCase> BBCases;
BasicBlock *BBDefault = GetValueEqualityComparisonCases(TI, BBCases);
-
+
std::vector<ValueEqualityComparisonCase> PredCases;
BasicBlock *PredDefault = GetValueEqualityComparisonCases(PTI, PredCases);
-
+
// Based on whether the default edge from PTI goes to BB or not, fill in
// PredCases and PredDefault with the new switch cases we would like to
// build.
SmallMapVector<BasicBlock *, int, 8> NewSuccessors;
-
+
// Update the branch weight metadata along the way
SmallVector<uint64_t, 8> Weights;
bool PredHasWeights = HasBranchWeights(PTI);
@@ -1158,13 +1158,13 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
Weights[0] += Weights[i + 1];
std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
- }
-
+ }
+
PredCases.pop_back();
--i;
--e;
}
-
+
// Reconstruct the new switch statement we will be building.
if (PredDefault != BBDefault) {
PredDefault->removePredecessor(Pred);
@@ -1180,15 +1180,15 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
if (!PTIHandled.count(BBCases[i].Value) && BBCases[i].Dest != BBDefault) {
PredCases.push_back(BBCases[i]);
++NewSuccessors[BBCases[i].Dest];
- if (SuccHasWeights || PredHasWeights) {
+ if (SuccHasWeights || PredHasWeights) {
// The default weight is at index 0, so weight for the ith case
// should be at index i+1. Scale the cases from successor by
// PredDefaultWeight (Weights[0]).
Weights.push_back(Weights[0] * SuccWeights[i + 1]);
ValidTotalSuccWeight += SuccWeights[i + 1];
- }
+ }
}
-
+
if (SuccHasWeights || PredHasWeights) {
ValidTotalSuccWeight += SuccWeights[0];
// Scale the cases from predecessor by ValidTotalSuccWeight.
@@ -1206,19 +1206,19 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == BB) {
PTIHandled.insert(PredCases[i].Value);
-
+
if (PredHasWeights || SuccHasWeights) {
WeightsForHandled[PredCases[i].Value] = Weights[i + 1];
std::swap(Weights[i + 1], Weights.back());
Weights.pop_back();
}
-
+
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
--i;
--e;
}
-
+
// Okay, now we know which constants were sent to BB from the
// predecessor. Figure out where they will all go now.
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
@@ -1229,8 +1229,8 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
PredCases.push_back(BBCases[i]);
++NewSuccessors[BBCases[i].Dest];
PTIHandled.erase(BBCases[i].Value); // This constant is taken care of
- }
-
+ }
+
// If there are any constants vectored to BB that TI doesn't handle,
// they must go to the default destination of TI.
for (ConstantInt *I : PTIHandled) {
@@ -1240,7 +1240,7 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
++NewSuccessors[BBDefault];
}
}
-
+
// Okay, at this point, we know which new successor Pred will get. Make
// sure we update the number of entries in the PHI nodes for these
// successors.
@@ -1253,24 +1253,24 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
if (!is_contained(successors(Pred), NewSuccessor.first))
Updates.push_back({DominatorTree::Insert, Pred, NewSuccessor.first});
}
-
+
Builder.SetInsertPoint(PTI);
// Convert pointer to int before we switch.
if (CV->getType()->isPointerTy()) {
CV =
Builder.CreatePtrToInt(CV, DL.getIntPtrType(CV->getType()), "magicptr");
}
-
+
// Now that the successors are updated, create the new Switch instruction.
SwitchInst *NewSI = Builder.CreateSwitch(CV, PredDefault, PredCases.size());
NewSI->setDebugLoc(PTI->getDebugLoc());
for (ValueEqualityComparisonCase &V : PredCases)
NewSI->addCase(V.Value, V.Dest);
-
+
if (PredHasWeights || SuccHasWeights) {
// Halve the weights if any of them cannot fit in an uint32_t
FitWeights(Weights);
-
+
SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
setBranchWeights(NewSI, MDWeights);
@@ -1291,15 +1291,15 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
BranchInst::Create(InfLoopBlock, InfLoopBlock);
Updates.push_back({DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
- }
+ }
NewSI->setSuccessor(i, InfLoopBlock);
}
-
+
if (InfLoopBlock)
Updates.push_back({DominatorTree::Insert, Pred, InfLoopBlock});
-
+
Updates.push_back({DominatorTree::Delete, Pred, BB});
-
+
if (DTU)
DTU->applyUpdates(Updates);
@@ -1339,702 +1339,702 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(Instruction *TI,
if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split", DTU))
return false;
}
- }
+ }
PerformValueComparisonIntoPredecessorFolding(TI, CV, PTI, Builder);
Changed = true;
- }
- return Changed;
-}
-
-// If we would need to insert a select that uses the value of this invoke
-// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
-// can't hoist the invoke, as there is nowhere to put the select in this case.
-static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
- Instruction *I1, Instruction *I2) {
- for (BasicBlock *Succ : successors(BB1)) {
- for (const PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
- return false;
- }
- }
- }
- return true;
-}
-
+ }
+ return Changed;
+}
+
+// If we would need to insert a select that uses the value of this invoke
+// (comments in HoistThenElseCodeToIf explain why we would need to do this), we
+// can't hoist the invoke, as there is nowhere to put the select in this case.
+static bool isSafeToHoistInvoke(BasicBlock *BB1, BasicBlock *BB2,
+ Instruction *I1, Instruction *I2) {
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (const PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V != BB2V && (BB1V == I1 || BB2V == I2)) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false);
-
-/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
-/// in the two blocks up into the branch block. The caller of this function
-/// guarantees that BI's block dominates BB1 and BB2.
-bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
- const TargetTransformInfo &TTI) {
- // This does very trivial matching, with limited scanning, to find identical
- // instructions in the two blocks. In particular, we don't want to get into
- // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
- // such, we currently just scan for obviously identical instructions in an
- // identical order.
- BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
- BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
-
- BasicBlock::iterator BB1_Itr = BB1->begin();
- BasicBlock::iterator BB2_Itr = BB2->begin();
-
- Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
- // Skip debug info if it is not identical.
- DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
- DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
- if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
- while (isa<DbgInfoIntrinsic>(I1))
- I1 = &*BB1_Itr++;
- while (isa<DbgInfoIntrinsic>(I2))
- I2 = &*BB2_Itr++;
- }
- // FIXME: Can we define a safety predicate for CallBr?
- if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
- (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) ||
- isa<CallBrInst>(I1))
- return false;
-
- BasicBlock *BIParent = BI->getParent();
-
- bool Changed = false;
+
+/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
+/// in the two blocks up into the branch block. The caller of this function
+/// guarantees that BI's block dominates BB1 and BB2.
+bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
+ const TargetTransformInfo &TTI) {
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. In particular, we don't want to get into
+ // O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
+ // such, we currently just scan for obviously identical instructions in an
+ // identical order.
+ BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
+ BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
+
+ BasicBlock::iterator BB1_Itr = BB1->begin();
+ BasicBlock::iterator BB2_Itr = BB2->begin();
+
+ Instruction *I1 = &*BB1_Itr++, *I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ // FIXME: Can we define a safety predicate for CallBr?
+ if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
+ (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)) ||
+ isa<CallBrInst>(I1))
+ return false;
+
+ BasicBlock *BIParent = BI->getParent();
+
+ bool Changed = false;
auto _ = make_scope_exit([&]() {
if (Changed)
++NumHoistCommonCode;
});
- do {
- // If we are hoisting the terminator instruction, don't move one (making a
- // broken BB), instead clone it, and remove BI.
- if (I1->isTerminator())
- goto HoistTerminator;
-
- // If we're going to hoist a call, make sure that the two instructions we're
- // commoning/hoisting are both marked with musttail, or neither of them is
- // marked as such. Otherwise, we might end up in a situation where we hoist
- // from a block where the terminator is a `ret` to a block where the terminator
- // is a `br`, and `musttail` calls expect to be followed by a return.
- auto *C1 = dyn_cast<CallInst>(I1);
- auto *C2 = dyn_cast<CallInst>(I2);
- if (C1 && C2)
- if (C1->isMustTailCall() != C2->isMustTailCall())
- return Changed;
-
- if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
- return Changed;
-
- // If either of the two call sites has the nomerge attribute, stop hoisting.
- if (const auto *CB1 = dyn_cast<CallBase>(I1))
- if (CB1->cannotMerge())
- return Changed;
- if (const auto *CB2 = dyn_cast<CallBase>(I2))
- if (CB2->cannotMerge())
- return Changed;
-
- if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
- assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
- // The debug location is an integral part of a debug info intrinsic
- // and can't be separated from it or replaced. Instead of attempting
- // to merge locations, simply hoist both copies of the intrinsic.
- BIParent->getInstList().splice(BI->getIterator(),
- BB1->getInstList(), I1);
- BIParent->getInstList().splice(BI->getIterator(),
- BB2->getInstList(), I2);
- Changed = true;
- } else {
- // For a normal instruction, we just move one to right before the branch,
- // then replace all uses of the other with the first. Finally, we remove
- // the now redundant second instruction.
- BIParent->getInstList().splice(BI->getIterator(),
- BB1->getInstList(), I1);
- if (!I2->use_empty())
- I2->replaceAllUsesWith(I1);
- I1->andIRFlags(I2);
- unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
- LLVMContext::MD_range,
- LLVMContext::MD_fpmath,
- LLVMContext::MD_invariant_load,
- LLVMContext::MD_nonnull,
- LLVMContext::MD_invariant_group,
- LLVMContext::MD_align,
- LLVMContext::MD_dereferenceable,
- LLVMContext::MD_dereferenceable_or_null,
- LLVMContext::MD_mem_parallel_loop_access,
- LLVMContext::MD_access_group,
- LLVMContext::MD_preserve_access_index};
- combineMetadata(I1, I2, KnownIDs, true);
-
- // I1 and I2 are being combined into a single instruction. Its debug
- // location is the merged locations of the original instructions.
- I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- I2->eraseFromParent();
- Changed = true;
- }
+ do {
+ // If we are hoisting the terminator instruction, don't move it (that would
+ // make a broken BB); instead, clone it and remove BI.
+ if (I1->isTerminator())
+ goto HoistTerminator;
+
+ // If we're going to hoist a call, make sure that the two instructions we're
+ // commoning/hoisting are both marked with musttail, or neither of them is
+ // marked as such. Otherwise, we might end up in a situation where we hoist
+ // from a block where the terminator is a `ret` to a block where the terminator
+ // is a `br`, and `musttail` calls expect to be followed by a return.
+ auto *C1 = dyn_cast<CallInst>(I1);
+ auto *C2 = dyn_cast<CallInst>(I2);
+ if (C1 && C2)
+ if (C1->isMustTailCall() != C2->isMustTailCall())
+ return Changed;
+
+ if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
+ return Changed;
+
+ // If either call site has the nomerge attribute, stop hoisting.
+ if (const auto *CB1 = dyn_cast<CallBase>(I1))
+ if (CB1->cannotMerge())
+ return Changed;
+ if (const auto *CB2 = dyn_cast<CallBase>(I2))
+ if (CB2->cannotMerge())
+ return Changed;
+
+ if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
+ assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+ // The debug location is an integral part of a debug info intrinsic
+ // and can't be separated from it or replaced. Instead of attempting
+ // to merge locations, simply hoist both copies of the intrinsic.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB2->getInstList(), I2);
+ Changed = true;
+ } else {
+ // For a normal instruction, we just move one to right before the branch,
+ // then replace all uses of the other with the first. Finally, we remove
+ // the now redundant second instruction.
+ BIParent->getInstList().splice(BI->getIterator(),
+ BB1->getInstList(), I1);
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->andIRFlags(I2);
+ unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+ LLVMContext::MD_range,
+ LLVMContext::MD_fpmath,
+ LLVMContext::MD_invariant_load,
+ LLVMContext::MD_nonnull,
+ LLVMContext::MD_invariant_group,
+ LLVMContext::MD_align,
+ LLVMContext::MD_dereferenceable,
+ LLVMContext::MD_dereferenceable_or_null,
+ LLVMContext::MD_mem_parallel_loop_access,
+ LLVMContext::MD_access_group,
+ LLVMContext::MD_preserve_access_index};
+ combineMetadata(I1, I2, KnownIDs, true);
+
+ // I1 and I2 are being combined into a single instruction. Its debug
+ // location is the merged locations of the original instructions.
+ I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ I2->eraseFromParent();
+ Changed = true;
+ }
++NumHoistCommonInstrs;
-
- I1 = &*BB1_Itr++;
- I2 = &*BB2_Itr++;
- // Skip debug info if it is not identical.
- DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
- DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
- if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
- while (isa<DbgInfoIntrinsic>(I1))
- I1 = &*BB1_Itr++;
- while (isa<DbgInfoIntrinsic>(I2))
- I2 = &*BB2_Itr++;
- }
- } while (I1->isIdenticalToWhenDefined(I2));
-
- return true;
-
-HoistTerminator:
- // It may not be possible to hoist an invoke.
- // FIXME: Can we define a safety predicate for CallBr?
- if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
- return Changed;
-
- // TODO: callbr hoisting currently disabled pending further study.
- if (isa<CallBrInst>(I1))
- return Changed;
-
- for (BasicBlock *Succ : successors(BB1)) {
- for (PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V == BB2V)
- continue;
-
- // Check for passingValueIsAlwaysUndefined here because we would rather
- // eliminate undefined control flow than convert it to a select.
- if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
- passingValueIsAlwaysUndefined(BB2V, &PN))
- return Changed;
-
- if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
- return Changed;
- if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
- return Changed;
- }
- }
-
- // Okay, it is safe to hoist the terminator.
- Instruction *NT = I1->clone();
- BIParent->getInstList().insert(BI->getIterator(), NT);
- if (!NT->getType()->isVoidTy()) {
- I1->replaceAllUsesWith(NT);
- I2->replaceAllUsesWith(NT);
- NT->takeName(I1);
- }
+
+ I1 = &*BB1_Itr++;
+ I2 = &*BB2_Itr++;
+ // Skip debug info if it is not identical.
+ DbgInfoIntrinsic *DBI1 = dyn_cast<DbgInfoIntrinsic>(I1);
+ DbgInfoIntrinsic *DBI2 = dyn_cast<DbgInfoIntrinsic>(I2);
+ if (!DBI1 || !DBI2 || !DBI1->isIdenticalToWhenDefined(DBI2)) {
+ while (isa<DbgInfoIntrinsic>(I1))
+ I1 = &*BB1_Itr++;
+ while (isa<DbgInfoIntrinsic>(I2))
+ I2 = &*BB2_Itr++;
+ }
+ } while (I1->isIdenticalToWhenDefined(I2));
+
+ return true;
+
+HoistTerminator:
+ // It may not be possible to hoist an invoke.
+ // FIXME: Can we define a safety predicate for CallBr?
+ if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
+ return Changed;
+
+ // TODO: callbr hoisting currently disabled pending further study.
+ if (isa<CallBrInst>(I1))
+ return Changed;
+
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // Check for passingValueIsAlwaysUndefined here because we would rather
+ // eliminate undefined control flow than convert it to a select.
+ if (passingValueIsAlwaysUndefined(BB1V, &PN) ||
+ passingValueIsAlwaysUndefined(BB2V, &PN))
+ return Changed;
+
+ if (isa<ConstantExpr>(BB1V) && !isSafeToSpeculativelyExecute(BB1V))
+ return Changed;
+ if (isa<ConstantExpr>(BB2V) && !isSafeToSpeculativelyExecute(BB2V))
+ return Changed;
+ }
+ }
+
+ // Okay, it is safe to hoist the terminator.
+ Instruction *NT = I1->clone();
+ BIParent->getInstList().insert(BI->getIterator(), NT);
+ if (!NT->getType()->isVoidTy()) {
+ I1->replaceAllUsesWith(NT);
+ I2->replaceAllUsesWith(NT);
+ NT->takeName(I1);
+ }
Changed = true;
++NumHoistCommonInstrs;
-
- // Ensure terminator gets a debug location, even an unknown one, in case
- // it involves inlinable calls.
- NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
- // PHIs created below will adopt NT's merged DebugLoc.
- IRBuilder<NoFolder> Builder(NT);
-
- // Hoisting one of the terminators from our successor is a great thing.
- // Unfortunately, the successors of the if/else blocks may have PHI nodes in
- // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
- // nodes, so we insert a select instruction to compute the final result.
- std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
- for (BasicBlock *Succ : successors(BB1)) {
- for (PHINode &PN : Succ->phis()) {
- Value *BB1V = PN.getIncomingValueForBlock(BB1);
- Value *BB2V = PN.getIncomingValueForBlock(BB2);
- if (BB1V == BB2V)
- continue;
-
- // These values do not agree. Insert a select instruction before NT
- // that determines the right value.
- SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
- if (!SI) {
- // Propagate fast-math-flags from phi node to its replacement select.
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- if (isa<FPMathOperator>(PN))
- Builder.setFastMathFlags(PN.getFastMathFlags());
-
- SI = cast<SelectInst>(
- Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
- BB1V->getName() + "." + BB2V->getName(), BI));
- }
-
- // Make the PHI node use the select for all incoming values for BB1/BB2
- for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
- if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
- PN.setIncomingValue(i, SI);
- }
- }
-
+
+ // Ensure terminator gets a debug location, even an unknown one, in case
+ // it involves inlinable calls.
+ NT->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+ // PHIs created below will adopt NT's merged DebugLoc.
+ IRBuilder<NoFolder> Builder(NT);
+
+ // Hoisting one of the terminators from our successor is a great thing.
+ // Unfortunately, the successors of the if/else blocks may have PHI nodes in
+ // them. If they do, all PHI entries for BB1/BB2 must agree for all PHI
+ // nodes, so we insert a select instruction to compute the final result.
+ std::map<std::pair<Value *, Value *>, SelectInst *> InsertedSelects;
+ for (BasicBlock *Succ : successors(BB1)) {
+ for (PHINode &PN : Succ->phis()) {
+ Value *BB1V = PN.getIncomingValueForBlock(BB1);
+ Value *BB2V = PN.getIncomingValueForBlock(BB2);
+ if (BB1V == BB2V)
+ continue;
+
+ // These values do not agree. Insert a select instruction before NT
+ // that determines the right value.
+ SelectInst *&SI = InsertedSelects[std::make_pair(BB1V, BB2V)];
+ if (!SI) {
+ // Propagate fast-math-flags from phi node to its replacement select.
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ if (isa<FPMathOperator>(PN))
+ Builder.setFastMathFlags(PN.getFastMathFlags());
+
+ SI = cast<SelectInst>(
+ Builder.CreateSelect(BI->getCondition(), BB1V, BB2V,
+ BB1V->getName() + "." + BB2V->getName(), BI));
+ }
+
+ // Make the PHI node use the select for all incoming values for BB1/BB2
+ for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i)
+ if (PN.getIncomingBlock(i) == BB1 || PN.getIncomingBlock(i) == BB2)
+ PN.setIncomingValue(i, SI);
+ }
+ }
+
SmallVector<DominatorTree::UpdateType, 4> Updates;
- // Update any PHI nodes in our new successors.
+ // Update any PHI nodes in our new successors.
for (BasicBlock *Succ : successors(BB1)) {
- AddPredecessorToBlock(Succ, BIParent, BB1);
+ AddPredecessorToBlock(Succ, BIParent, BB1);
Updates.push_back({DominatorTree::Insert, BIParent, Succ});
}
for (BasicBlock *Succ : successors(BI))
Updates.push_back({DominatorTree::Delete, BIParent, Succ});
-
- EraseTerminatorAndDCECond(BI);
+
+ EraseTerminatorAndDCECond(BI);
if (DTU)
DTU->applyUpdates(Updates);
return Changed;
-}
-
-// Check lifetime markers.
-static bool isLifeTimeMarker(const Instruction *I) {
- if (auto II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- default:
- break;
- case Intrinsic::lifetime_start:
- case Intrinsic::lifetime_end:
- return true;
- }
- }
- return false;
-}
-
-// TODO: Refine this. This should avoid cases like turning constant memcpy sizes
-// into variables.
-static bool replacingOperandWithVariableIsCheap(const Instruction *I,
- int OpIdx) {
- return !isa<IntrinsicInst>(I);
-}
-
-// All instructions in Insts belong to different blocks that all unconditionally
-// branch to a common successor. Analyze each instruction and return true if it
-// would be possible to sink them into their successor, creating one common
-// instruction instead. For every value that would be required to be provided by
-// PHI node (because an operand varies in each input block), add to PHIOperands.
-static bool canSinkInstructions(
- ArrayRef<Instruction *> Insts,
- DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
- // Prune out obviously bad instructions to move. Each instruction must have
- // exactly zero or one use, and we check later that use is by a single, common
- // PHI instruction in the successor.
- bool HasUse = !Insts.front()->user_empty();
- for (auto *I : Insts) {
- // These instructions may change or break semantics if moved.
- if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
- I->getType()->isTokenTy())
- return false;
-
+}
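
For orientation, a minimal C++-level sketch of the transform implemented above (a hypothetical example, not code from this patch): identical leading instructions in the two arms are hoisted above the branch, and once the arms are fully commoned the hoisted terminator resolves any disagreeing PHI inputs with a select.

    void foo();  // assumed external; the identical call in both arms

    // Hypothetical input: both arms are identical except for the value
    // they feed into the join-point PHI.
    int before(bool c, int x, int y) {
      int r;
      if (c) { foo(); r = x; }
      else   { foo(); r = y; }
      return r;
    }

    // Rough shape after the transform: the common call is hoisted above
    // the branch, the terminator is cloned in its place, and the PHI
    // disagreement becomes a select.
    int after(bool c, int x, int y) {
      foo();             // hoisted common instruction
      return c ? x : y;  // select inserted for the differing PHI inputs
    }
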
+
+// Check lifetime markers.
+static bool isLifeTimeMarker(const Instruction *I) {
+ if (auto II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ return true;
+ }
+ }
+ return false;
+}
+
+// TODO: Refine this. This should avoid cases like turning constant memcpy sizes
+// into variables.
+static bool replacingOperandWithVariableIsCheap(const Instruction *I,
+ int OpIdx) {
+ return !isa<IntrinsicInst>(I);
+}
+
+// All instructions in Insts belong to different blocks that all unconditionally
+// branch to a common successor. Analyze each instruction and return true if it
+// would be possible to sink them into their successor, creating one common
+// instruction instead. For every value that would be required to be provided by
+// PHI node (because an operand varies in each input block), add to PHIOperands.
+static bool canSinkInstructions(
+ ArrayRef<Instruction *> Insts,
+ DenseMap<Instruction *, SmallVector<Value *, 4>> &PHIOperands) {
+ // Prune out obviously bad instructions to move. Each instruction must have
+ // exactly zero or one use, and we check later that use is by a single, common
+ // PHI instruction in the successor.
+ bool HasUse = !Insts.front()->user_empty();
+ for (auto *I : Insts) {
+ // These instructions may change or break semantics if moved.
+ if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
+ I->getType()->isTokenTy())
+ return false;
+
// Do not try to sink an instruction in an infinite loop - it can cause
// this algorithm to infinite loop.
if (I->getParent()->getSingleSuccessor() == I->getParent())
return false;
- // Conservatively return false if I is an inline-asm instruction. Sinking
- // and merging inline-asm instructions can potentially create arguments
- // that cannot satisfy the inline-asm constraints.
- // If the instruction has nomerge attribute, return false.
- if (const auto *C = dyn_cast<CallBase>(I))
- if (C->isInlineAsm() || C->cannotMerge())
- return false;
-
- // Each instruction must have zero or one use.
- if (HasUse && !I->hasOneUse())
- return false;
- if (!HasUse && !I->user_empty())
- return false;
- }
-
- const Instruction *I0 = Insts.front();
- for (auto *I : Insts)
- if (!I->isSameOperationAs(I0))
- return false;
-
- // All instructions in Insts are known to be the same opcode. If they have a
- // use, check that the only user is a PHI or in the same block as the
- // instruction, because if a user is in the same block as an instruction we're
- // contemplating sinking, it must already be determined to be sinkable.
- if (HasUse) {
- auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
- auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
- if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
- auto *U = cast<Instruction>(*I->user_begin());
- return (PNUse &&
- PNUse->getParent() == Succ &&
- PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
- U->getParent() == I->getParent();
- }))
- return false;
- }
-
- // Because SROA can't handle speculating stores of selects, try not to sink
- // loads, stores or lifetime markers of allocas when we'd have to create a
- // PHI for the address operand. Also, because it is likely that loads or
- // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink
- // them.
- // This can cause code churn which can have unintended consequences down
- // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
- // FIXME: This is a workaround for a deficiency in SROA - see
- // https://llvm.org/bugs/show_bug.cgi?id=30188
- if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
- if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts());
- }))
- return false;
- if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) {
- return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
- }))
- return false;
-
- for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
- Value *Op = I0->getOperand(OI);
- if (Op->getType()->isTokenTy())
- // Don't touch any operand of token type.
- return false;
-
- auto SameAsI0 = [&I0, OI](const Instruction *I) {
- assert(I->getNumOperands() == I0->getNumOperands());
- return I->getOperand(OI) == I0->getOperand(OI);
- };
- if (!all_of(Insts, SameAsI0)) {
- if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
- !canReplaceOperandWithVariable(I0, OI))
- // We can't create a PHI from this GEP.
- return false;
- // Don't create indirect calls! The called value is the final operand.
- if (isa<CallBase>(I0) && OI == OE - 1) {
- // FIXME: if the call was *already* indirect, we should do this.
- return false;
- }
- for (auto *I : Insts)
- PHIOperands[I].push_back(I->getOperand(OI));
- }
- }
- return true;
-}
-
+ // Conservatively return false if I is an inline-asm instruction. Sinking
+ // and merging inline-asm instructions can potentially create arguments
+ // that cannot satisfy the inline-asm constraints.
+ // If the instruction has nomerge attribute, return false.
+ if (const auto *C = dyn_cast<CallBase>(I))
+ if (C->isInlineAsm() || C->cannotMerge())
+ return false;
+
+ // Each instruction must have zero or one use.
+ if (HasUse && !I->hasOneUse())
+ return false;
+ if (!HasUse && !I->user_empty())
+ return false;
+ }
+
+ const Instruction *I0 = Insts.front();
+ for (auto *I : Insts)
+ if (!I->isSameOperationAs(I0))
+ return false;
+
+ // All instructions in Insts are known to be the same opcode. If they have a
+ // use, check that the only user is a PHI or in the same block as the
+ // instruction, because if a user is in the same block as an instruction we're
+ // contemplating sinking, it must already be determined to be sinkable.
+ if (HasUse) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ auto *Succ = I0->getParent()->getTerminator()->getSuccessor(0);
+ if (!all_of(Insts, [&PNUse,&Succ](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return (PNUse &&
+ PNUse->getParent() == Succ &&
+ PNUse->getIncomingValueForBlock(I->getParent()) == I) ||
+ U->getParent() == I->getParent();
+ }))
+ return false;
+ }
+
+ // Because SROA can't handle speculating stores of selects, try not to sink
+ // loads, stores or lifetime markers of allocas when we'd have to create a
+ // PHI for the address operand. Also, because it is likely that loads or
+ // stores of allocas will disappear when Mem2Reg/SROA is run, don't sink
+ // them.
+ // This can cause code churn which can have unintended consequences down
+ // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+ // FIXME: This is a workaround for a deficiency in SROA - see
+ // https://llvm.org/bugs/show_bug.cgi?id=30188
+ if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
+ }))
+ return false;
+ if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(0)->stripPointerCasts());
+ }))
+ return false;
+ if (isLifeTimeMarker(I0) && any_of(Insts, [](const Instruction *I) {
+ return isa<AllocaInst>(I->getOperand(1)->stripPointerCasts());
+ }))
+ return false;
+
+ for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
+ Value *Op = I0->getOperand(OI);
+ if (Op->getType()->isTokenTy())
+ // Don't touch any operand of token type.
+ return false;
+
+ auto SameAsI0 = [&I0, OI](const Instruction *I) {
+ assert(I->getNumOperands() == I0->getNumOperands());
+ return I->getOperand(OI) == I0->getOperand(OI);
+ };
+ if (!all_of(Insts, SameAsI0)) {
+ if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
+ !canReplaceOperandWithVariable(I0, OI))
+ // We can't create a PHI from this GEP.
+ return false;
+ // Don't create indirect calls! The called value is the final operand.
+ if (isa<CallBase>(I0) && OI == OE - 1) {
+ // FIXME: if the call was *already* indirect, we should do this.
+ return false;
+ }
+ for (auto *I : Insts)
+ PHIOperands[I].push_back(I->getOperand(OI));
+ }
+ }
+ return true;
+}
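
A hedged illustration of the use constraint enforced above (hypothetical C++ shapes, not from this patch): a candidate may have at most one use, and that use must either be the common PHI in the shared successor or stay inside the candidate's own block.

    extern int *p, *q;
    extern void consume(int);

    // Sinkable shape: each arm defines a value whose only use is the
    // merge at the join point, so the loads can be commoned behind one
    // PHI in the successor.
    int sinkable(bool c) {
      int x;
      if (c) x = *p;
      else   x = *q;
      return x;   // the single PHI use
    }

    // Not sinkable: the load in the first arm has two uses (the call and
    // the merge), so it fails the zero-or-one-use check above and is not
    // commoned.
    int not_sinkable(bool c) {
      int x;
      if (c) { x = *p; consume(x); }
      else   { x = *q; }
      return x;
    }
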
+
// Assuming canSinkInstructions(Blocks) has returned true, sink the last
-// instruction of every block in Blocks to their common successor, commoning
-// into one instruction.
-static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
- auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
-
+// instruction of every block in Blocks to their common successor, commoning
+// into one instruction.
+static bool sinkLastInstruction(ArrayRef<BasicBlock*> Blocks) {
+ auto *BBEnd = Blocks[0]->getTerminator()->getSuccessor(0);
+
// canSinkInstructions returning true guarantees that every block has at
- // least one non-terminator instruction.
- SmallVector<Instruction*,4> Insts;
- for (auto *BB : Blocks) {
- Instruction *I = BB->getTerminator();
- do {
- I = I->getPrevNode();
- } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
- if (!isa<DbgInfoIntrinsic>(I))
- Insts.push_back(I);
- }
-
- // The only checking we need to do now is that all users of all instructions
+ // least one non-terminator instruction.
+ SmallVector<Instruction*,4> Insts;
+ for (auto *BB : Blocks) {
+ Instruction *I = BB->getTerminator();
+ do {
+ I = I->getPrevNode();
+ } while (isa<DbgInfoIntrinsic>(I) && I != &BB->front());
+ if (!isa<DbgInfoIntrinsic>(I))
+ Insts.push_back(I);
+ }
+
+ // The only checking we need to do now is that all users of all instructions
// are the same PHI node. canSinkInstructions should have checked this but
// it is slightly over-aggressive - it gets confused by commutative
// instructions so double-check it here.
- Instruction *I0 = Insts.front();
- if (!I0->user_empty()) {
- auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
- if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
- auto *U = cast<Instruction>(*I->user_begin());
- return U == PNUse;
- }))
- return false;
- }
-
+ Instruction *I0 = Insts.front();
+ if (!I0->user_empty()) {
+ auto *PNUse = dyn_cast<PHINode>(*I0->user_begin());
+ if (!all_of(Insts, [&PNUse](const Instruction *I) -> bool {
+ auto *U = cast<Instruction>(*I->user_begin());
+ return U == PNUse;
+ }))
+ return false;
+ }
+
// We don't need to do any more checking here; canSinkInstructions should
- // have done it all for us.
- SmallVector<Value*, 4> NewOperands;
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
+ // have done it all for us.
+ SmallVector<Value*, 4> NewOperands;
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O) {
// This check is different to that in canSinkInstructions. There, we
- // cared about the global view once simplifycfg (and instcombine) have
- // completed - it takes into account PHIs that become trivially
- // simplifiable. However here we need a more local view; if an operand
- // differs we create a PHI and rely on instcombine to clean up the very
- // small mess we may make.
- bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
- return I->getOperand(O) != I0->getOperand(O);
- });
- if (!NeedPHI) {
- NewOperands.push_back(I0->getOperand(O));
- continue;
- }
-
- // Create a new PHI in the successor block and populate it.
- auto *Op = I0->getOperand(O);
- assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
- auto *PN = PHINode::Create(Op->getType(), Insts.size(),
- Op->getName() + ".sink", &BBEnd->front());
- for (auto *I : Insts)
- PN->addIncoming(I->getOperand(O), I->getParent());
- NewOperands.push_back(PN);
- }
-
- // Arbitrarily use I0 as the new "common" instruction; remap its operands
- // and move it to the start of the successor block.
- for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
- I0->getOperandUse(O).set(NewOperands[O]);
- I0->moveBefore(&*BBEnd->getFirstInsertionPt());
-
- // Update metadata and IR flags, and merge debug locations.
- for (auto *I : Insts)
- if (I != I0) {
- // The debug location for the "common" instruction is the merged locations
- // of all the commoned instructions. We start with the original location
- // of the "common" instruction and iteratively merge each location in the
- // loop below.
- // This is an N-way merge, which will be inefficient if I0 is a CallInst.
- // However, since an N-way merge for a CallInst is rare, we use the
- // simplified pairwise API instead of a dedicated N-way merge API.
- I0->applyMergedLocation(I0->getDebugLoc(), I->getDebugLoc());
- combineMetadataForCSE(I0, I, true);
- I0->andIRFlags(I);
- }
-
- if (!I0->user_empty()) {
- // canSinkInstructions checked that all instructions were used by
- // one and only one PHI node. Find that now, RAUW it to our common
- // instruction and nuke it.
- auto *PN = cast<PHINode>(*I0->user_begin());
- PN->replaceAllUsesWith(I0);
- PN->eraseFromParent();
- }
-
- // Finally nuke all instructions apart from the common instruction.
- for (auto *I : Insts)
- if (I != I0)
- I->eraseFromParent();
-
- return true;
-}
-
-namespace {
-
- // LockstepReverseIterator - Iterates through instructions
- // in a set of blocks in reverse order from the first non-terminator.
- // For example (assume all blocks have size n):
- // LockstepReverseIterator I([B1, B2, B3]);
- // *I-- = [B1[n], B2[n], B3[n]];
- // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
- // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
- // ...
- class LockstepReverseIterator {
- ArrayRef<BasicBlock*> Blocks;
- SmallVector<Instruction*,4> Insts;
- bool Fail;
-
- public:
- LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) {
- reset();
- }
-
- void reset() {
- Fail = false;
- Insts.clear();
- for (auto *BB : Blocks) {
- Instruction *Inst = BB->getTerminator();
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- if (!Inst) {
- // Block wasn't big enough.
- Fail = true;
- return;
- }
- Insts.push_back(Inst);
- }
- }
-
- bool isValid() const {
- return !Fail;
- }
-
- void operator--() {
- if (Fail)
- return;
- for (auto *&Inst : Insts) {
- for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
- Inst = Inst->getPrevNode();
- // Already at beginning of block.
- if (!Inst) {
- Fail = true;
- return;
- }
- }
- }
-
- ArrayRef<Instruction*> operator * () const {
- return Insts;
- }
- };
-
-} // end anonymous namespace
-
-/// Check whether BB's predecessors end with unconditional branches. If so,
-/// sink any common code from the predecessors to BB.
-/// We also allow one predecessor to end with a conditional branch (but no
-/// more than one).
+ // cared about the global view once simplifycfg (and instcombine) have
+ // completed - it takes into account PHIs that become trivially
+ // simplifiable. However here we need a more local view; if an operand
+ // differs we create a PHI and rely on instcombine to clean up the very
+ // small mess we may make.
+ bool NeedPHI = any_of(Insts, [&I0, O](const Instruction *I) {
+ return I->getOperand(O) != I0->getOperand(O);
+ });
+ if (!NeedPHI) {
+ NewOperands.push_back(I0->getOperand(O));
+ continue;
+ }
+
+ // Create a new PHI in the successor block and populate it.
+ auto *Op = I0->getOperand(O);
+ assert(!Op->getType()->isTokenTy() && "Can't PHI tokens!");
+ auto *PN = PHINode::Create(Op->getType(), Insts.size(),
+ Op->getName() + ".sink", &BBEnd->front());
+ for (auto *I : Insts)
+ PN->addIncoming(I->getOperand(O), I->getParent());
+ NewOperands.push_back(PN);
+ }
+
+ // Arbitrarily use I0 as the new "common" instruction; remap its operands
+ // and move it to the start of the successor block.
+ for (unsigned O = 0, E = I0->getNumOperands(); O != E; ++O)
+ I0->getOperandUse(O).set(NewOperands[O]);
+ I0->moveBefore(&*BBEnd->getFirstInsertionPt());
+
+ // Update metadata and IR flags, and merge debug locations.
+ for (auto *I : Insts)
+ if (I != I0) {
+ // The debug location for the "common" instruction is the merged locations
+ // of all the commoned instructions. We start with the original location
+ // of the "common" instruction and iteratively merge each location in the
+ // loop below.
+ // This is an N-way merge, which will be inefficient if I0 is a CallInst.
+ // However, since an N-way merge for a CallInst is rare, we use the
+ // simplified pairwise API instead of a dedicated N-way merge API.
+ I0->applyMergedLocation(I0->getDebugLoc(), I->getDebugLoc());
+ combineMetadataForCSE(I0, I, true);
+ I0->andIRFlags(I);
+ }
+
+ if (!I0->user_empty()) {
+ // canSinkInstructions checked that all instructions were used by
+ // one and only one PHI node. Find that now, RAUW it to our common
+ // instruction and nuke it.
+ auto *PN = cast<PHINode>(*I0->user_begin());
+ PN->replaceAllUsesWith(I0);
+ PN->eraseFromParent();
+ }
+
+ // Finally nuke all instructions apart from the common instruction.
+ for (auto *I : Insts)
+ if (I != I0)
+ I->eraseFromParent();
+
+ return true;
+}
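
To make the operand handling above concrete, a small hypothetical sketch (not from this patch): when the sunk instructions agree on the operation but differ in one operand, that operand is routed through a new PHI (named "<op>.sink" by the code above) in the common successor.

    extern int *slot;

    // Input: each predecessor ends with a store of a different value to
    // the same location.
    void before(bool c, int a, int b) {
      if (c) *slot = a;
      else   *slot = b;
    }

    // Shape after sinking: a single store remains in the join block and
    // the differing value operand is merged first (a PHI in IR, written
    // here as a conditional expression).
    void after(bool c, int a, int b) {
      int v = c ? a : b;  // stands in for the "<op>.sink" PHI
      *slot = v;          // the commoned store
    }
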
+
+namespace {
+
+ // LockstepReverseIterator - Iterates through instructions
+ // in a set of blocks in reverse order from the first non-terminator.
+ // For example (assume all blocks have size n):
+ // LockstepReverseIterator I([B1, B2, B3]);
+ // *I-- = [B1[n], B2[n], B3[n]];
+ // *I-- = [B1[n-1], B2[n-1], B3[n-1]];
+ // *I-- = [B1[n-2], B2[n-2], B3[n-2]];
+ // ...
+ class LockstepReverseIterator {
+ ArrayRef<BasicBlock*> Blocks;
+ SmallVector<Instruction*,4> Insts;
+ bool Fail;
+
+ public:
+ LockstepReverseIterator(ArrayRef<BasicBlock*> Blocks) : Blocks(Blocks) {
+ reset();
+ }
+
+ void reset() {
+ Fail = false;
+ Insts.clear();
+ for (auto *BB : Blocks) {
+ Instruction *Inst = BB->getTerminator();
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ if (!Inst) {
+ // Block wasn't big enough.
+ Fail = true;
+ return;
+ }
+ Insts.push_back(Inst);
+ }
+ }
+
+ bool isValid() const {
+ return !Fail;
+ }
+
+ void operator--() {
+ if (Fail)
+ return;
+ for (auto *&Inst : Insts) {
+ for (Inst = Inst->getPrevNode(); Inst && isa<DbgInfoIntrinsic>(Inst);)
+ Inst = Inst->getPrevNode();
+ // Already at beginning of block.
+ if (!Inst) {
+ Fail = true;
+ return;
+ }
+ }
+ }
+
+ ArrayRef<Instruction*> operator * () const {
+ return Insts;
+ }
+ };
+
+} // end anonymous namespace
+
+/// Check whether BB's predecessors end with unconditional branches. If so,
+/// sink any common code from the predecessors to BB.
+/// We also allow one predecessor to end with a conditional branch (but no
+/// more than one).
static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
DomTreeUpdater *DTU) {
- // We support two situations:
- // (1) all incoming arcs are unconditional
- // (2) one incoming arc is conditional
- //
- // (2) is very common in switch defaults and
- // else-if patterns;
- //
- // if (a) f(1);
- // else if (b) f(2);
- //
- // produces:
- //
- // [if]
- // / \
- // [f(1)] [if]
- // | | \
- // | | |
- // | [f(2)]|
- // \ | /
- // [ end ]
- //
- // [end] has two unconditional predecessor arcs and one conditional. The
- // conditional refers to the implicit empty 'else' arc. This conditional
- // arc can also be caused by an empty default block in a switch.
- //
- // In this case, we attempt to sink code from all *unconditional* arcs.
- // If we can sink instructions from these arcs (determined during the scan
- // phase below) we insert a common successor for all unconditional arcs and
- // connect that to [end], to enable sinking:
- //
- // [if]
- // / \
- // [x(1)] [if]
- // | | \
- // | | \
- // | [x(2)] |
- // \ / |
- // [sink.split] |
- // \ /
- // [ end ]
- //
- SmallVector<BasicBlock*,4> UnconditionalPreds;
- Instruction *Cond = nullptr;
- for (auto *B : predecessors(BB)) {
- auto *T = B->getTerminator();
- if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
- UnconditionalPreds.push_back(B);
- else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
- Cond = T;
- else
- return false;
- }
- if (UnconditionalPreds.size() < 2)
- return false;
-
- // We take a two-step approach to tail sinking. First we scan from the end of
- // each block upwards in lockstep. If the n'th instruction from the end of each
- // block can be sunk, those instructions are added to InstructionsToSink and we
- // carry on. If we can sink an instruction but need to PHI-merge some operands
- // (because they're not identical in each instruction) we add these to
- // PHIOperands.
- unsigned ScanIdx = 0;
- SmallPtrSet<Value*,4> InstructionsToSink;
- DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
- LockstepReverseIterator LRI(UnconditionalPreds);
- while (LRI.isValid() &&
- canSinkInstructions(*LRI, PHIOperands)) {
- LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
- << "\n");
- InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
- ++ScanIdx;
- --LRI;
- }
-
+ // We support two situations:
+ // (1) all incoming arcs are unconditional
+ // (2) one incoming arc is conditional
+ //
+ // (2) is very common in switch defaults and
+ // else-if patterns;
+ //
+ // if (a) f(1);
+ // else if (b) f(2);
+ //
+ // produces:
+ //
+ // [if]
+ // / \
+ // [f(1)] [if]
+ // | | \
+ // | | |
+ // | [f(2)]|
+ // \ | /
+ // [ end ]
+ //
+ // [end] has two unconditional predecessor arcs and one conditional. The
+ // conditional refers to the implicit empty 'else' arc. This conditional
+ // arc can also be caused by an empty default block in a switch.
+ //
+ // In this case, we attempt to sink code from all *unconditional* arcs.
+ // If we can sink instructions from these arcs (determined during the scan
+ // phase below) we insert a common successor for all unconditional arcs and
+ // connect that to [end], to enable sinking:
+ //
+ // [if]
+ // / \
+ // [x(1)] [if]
+ // | | \
+ // | | \
+ // | [x(2)] |
+ // \ / |
+ // [sink.split] |
+ // \ /
+ // [ end ]
+ //
+ SmallVector<BasicBlock*,4> UnconditionalPreds;
+ Instruction *Cond = nullptr;
+ for (auto *B : predecessors(BB)) {
+ auto *T = B->getTerminator();
+ if (isa<BranchInst>(T) && cast<BranchInst>(T)->isUnconditional())
+ UnconditionalPreds.push_back(B);
+ else if ((isa<BranchInst>(T) || isa<SwitchInst>(T)) && !Cond)
+ Cond = T;
+ else
+ return false;
+ }
+ if (UnconditionalPreds.size() < 2)
+ return false;
+
+ // We take a two-step approach to tail sinking. First we scan from the end of
+ // each block upwards in lockstep. If the n'th instruction from the end of each
+ // block can be sunk, those instructions are added to InstructionsToSink and we
+ // carry on. If we can sink an instruction but need to PHI-merge some operands
+ // (because they're not identical in each instruction) we add these to
+ // PHIOperands.
+ unsigned ScanIdx = 0;
+ SmallPtrSet<Value*,4> InstructionsToSink;
+ DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+ LockstepReverseIterator LRI(UnconditionalPreds);
+ while (LRI.isValid() &&
+ canSinkInstructions(*LRI, PHIOperands)) {
+ LLVM_DEBUG(dbgs() << "SINK: instruction can be sunk: " << *(*LRI)[0]
+ << "\n");
+ InstructionsToSink.insert((*LRI).begin(), (*LRI).end());
+ ++ScanIdx;
+ --LRI;
+ }
+
// If no instructions can be sunk, early-return.
if (ScanIdx == 0)
return false;
bool Changed = false;
- auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
- unsigned NumPHIdValues = 0;
- for (auto *I : *LRI)
- for (auto *V : PHIOperands[I])
- if (InstructionsToSink.count(V) == 0)
- ++NumPHIdValues;
- LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
- unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
- if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
- NumPHIInsts++;
-
- return NumPHIInsts <= 1;
- };
-
+ auto ProfitableToSinkInstruction = [&](LockstepReverseIterator &LRI) {
+ unsigned NumPHIdValues = 0;
+ for (auto *I : *LRI)
+ for (auto *V : PHIOperands[I])
+ if (InstructionsToSink.count(V) == 0)
+ ++NumPHIdValues;
+ LLVM_DEBUG(dbgs() << "SINK: #phid values: " << NumPHIdValues << "\n");
+ unsigned NumPHIInsts = NumPHIdValues / UnconditionalPreds.size();
+ if ((NumPHIdValues % UnconditionalPreds.size()) != 0)
+ NumPHIInsts++;
+
+ return NumPHIInsts <= 1;
+ };
+
if (Cond) {
- // Check if we would actually sink anything first! This mutates the CFG and
- // adds an extra block. The goal in doing this is to allow instructions that
- // couldn't be sunk before to be sunk - obviously, speculatable instructions
- // (such as trunc, add) can be sunk and predicated already. So we check that
- // we're going to sink at least one non-speculatable instruction.
- LRI.reset();
- unsigned Idx = 0;
- bool Profitable = false;
- while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
- if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
- Profitable = true;
- break;
- }
- --LRI;
- ++Idx;
- }
- if (!Profitable)
- return false;
-
- LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
- // We have a conditional edge and we're going to sink some instructions.
- // Insert a new block postdominating all blocks we're going to sink from.
+ // Check if we would actually sink anything first! This mutates the CFG and
+ // adds an extra block. The goal in doing this is to allow instructions that
+ // couldn't be sunk before to be sunk - obviously, speculatable instructions
+ // (such as trunc, add) can be sunk and predicated already. So we check that
+ // we're going to sink at least one non-speculatable instruction.
+ LRI.reset();
+ unsigned Idx = 0;
+ bool Profitable = false;
+ while (ProfitableToSinkInstruction(LRI) && Idx < ScanIdx) {
+ if (!isSafeToSpeculativelyExecute((*LRI)[0])) {
+ Profitable = true;
+ break;
+ }
+ --LRI;
+ ++Idx;
+ }
+ if (!Profitable)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SINK: Splitting edge\n");
+ // We have a conditional edge and we're going to sink some instructions.
+ // Insert a new block postdominating all blocks we're going to sink from.
if (!SplitBlockPredecessors(BB, UnconditionalPreds, ".sink.split", DTU))
- // Edges couldn't be split.
- return false;
- Changed = true;
- }
-
- // Now that we've analyzed all potential sinking candidates, perform the
- // actual sink. We iteratively sink the last non-terminator of the source
- // blocks into their common successor unless doing so would require too
- // many PHI instructions to be generated (currently only one PHI is allowed
- // per sunk instruction).
- //
- // We can use InstructionsToSink to discount values needing PHI-merging that will
- // actually be sunk in a later iteration. This allows us to be more
- // aggressive in what we sink. This does allow a false positive where we
- // sink presuming a later value will also be sunk, but stop half way through
- // and never actually sink it which means we produce more PHIs than intended.
- // This is unlikely in practice though.
+ // Edges couldn't be split.
+ return false;
+ Changed = true;
+ }
+
+ // Now that we've analyzed all potential sinking candidates, perform the
+ // actual sink. We iteratively sink the last non-terminator of the source
+ // blocks into their common successor unless doing so would require too
+ // many PHI instructions to be generated (currently only one PHI is allowed
+ // per sunk instruction).
+ //
+ // We can use InstructionsToSink to discount values needing PHI-merging that will
+ // actually be sunk in a later iteration. This allows us to be more
+ // aggressive in what we sink. This does allow a false positive where we
+ // sink presuming a later value will also be sunk, but stop half way through
+ // and never actually sink it which means we produce more PHIs than intended.
+ // This is unlikely in practice though.
unsigned SinkIdx = 0;
for (; SinkIdx != ScanIdx; ++SinkIdx) {
- LLVM_DEBUG(dbgs() << "SINK: Sink: "
- << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
- << "\n");
-
- // Because we've sunk every instruction in turn, the current instruction to
- // sink is always at index 0.
- LRI.reset();
- if (!ProfitableToSinkInstruction(LRI)) {
- // Too many PHIs would be created.
- LLVM_DEBUG(
- dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
- break;
- }
-
+ LLVM_DEBUG(dbgs() << "SINK: Sink: "
+ << *UnconditionalPreds[0]->getTerminator()->getPrevNode()
+ << "\n");
+
+ // Because we've sunk every instruction in turn, the current instruction to
+ // sink is always at index 0.
+ LRI.reset();
+ if (!ProfitableToSinkInstruction(LRI)) {
+ // Too many PHIs would be created.
+ LLVM_DEBUG(
+ dbgs() << "SINK: stopping here, too many PHIs would be created!\n");
+ break;
+ }
+
if (!sinkLastInstruction(UnconditionalPreds)) {
LLVM_DEBUG(
dbgs()
@@ -2043,76 +2043,76 @@ static bool SinkCommonCodeFromPredecessors(BasicBlock *BB,
}
NumSinkCommonInstrs++;
- Changed = true;
- }
+ Changed = true;
+ }
if (SinkIdx != 0)
++NumSinkCommonCode;
- return Changed;
-}
-
-/// Determine if we can hoist a sole store instruction out of a
-/// conditional block.
-///
-/// We are looking for code like the following:
-/// BrBB:
-/// store i32 %add, i32* %arrayidx2
-/// ... // No other stores or function calls (we could be calling a memory
-/// ... // function).
-/// %cmp = icmp ult %x, %y
-/// br i1 %cmp, label %EndBB, label %ThenBB
-/// ThenBB:
-/// store i32 %add5, i32* %arrayidx2
-/// br label EndBB
-/// EndBB:
-/// ...
-/// We are going to transform this into:
-/// BrBB:
-/// store i32 %add, i32* %arrayidx2
-/// ... //
-/// %cmp = icmp ult %x, %y
-/// %add.add5 = select i1 %cmp, i32 %add, %add5
-/// store i32 %add.add5, i32* %arrayidx2
-/// ...
-///
-/// \return The pointer to the value of the previous store if the store can be
-/// hoisted into the predecessor block. 0 otherwise.
-static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
- BasicBlock *StoreBB, BasicBlock *EndBB) {
- StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
- if (!StoreToHoist)
- return nullptr;
-
- // Volatile or atomic.
- if (!StoreToHoist->isSimple())
- return nullptr;
-
- Value *StorePtr = StoreToHoist->getPointerOperand();
-
- // Look for a store to the same pointer in BrBB.
- unsigned MaxNumInstToLookAt = 9;
+ return Changed;
+}
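
A hedged sketch of the profitability rule used above (hypothetical shapes, not from this patch): each sunk instruction is allowed to introduce at most one new PHI, so candidates whose operands differ in more than one position are skipped.

    // Profitable: only the stored value differs, so sinking the store
    // needs a single PHI for that operand.
    void profitable(bool c, int a, int b, int *p) {
      if (c) *p = a;
      else   *p = b;
    }

    // Not profitable: both the value and the pointer differ, which would
    // require two PHIs for one sunk store and trips the NumPHIInsts <= 1
    // limit checked in ProfitableToSinkInstruction.
    void unprofitable(bool c, int a, int b, int *p, int *q) {
      if (c) *p = a;
      else   *q = b;
    }
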
+
+/// Determine if we can hoist a sole store instruction out of a
+/// conditional block.
+///
+/// We are looking for code like the following:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... // No other stores or function calls (we could be calling a memory
+/// ... // function).
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// store i32 %add5, i32* %arrayidx2
+/// br label EndBB
+/// EndBB:
+/// ...
+/// We are going to transform this into:
+/// BrBB:
+/// store i32 %add, i32* %arrayidx2
+/// ... //
+/// %cmp = icmp ult %x, %y
+/// %add.add5 = select i1 %cmp, i32 %add, %add5
+/// store i32 %add.add5, i32* %arrayidx2
+/// ...
+///
+/// \return The pointer to the value of the previous store if the store can be
+/// hoisted into the predecessor block. 0 otherwise.
+static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
+ BasicBlock *StoreBB, BasicBlock *EndBB) {
+ StoreInst *StoreToHoist = dyn_cast<StoreInst>(I);
+ if (!StoreToHoist)
+ return nullptr;
+
+ // Volatile or atomic.
+ if (!StoreToHoist->isSimple())
+ return nullptr;
+
+ Value *StorePtr = StoreToHoist->getPointerOperand();
+
+ // Look for a store to the same pointer in BrBB.
+ unsigned MaxNumInstToLookAt = 9;
// Skip pseudo probe intrinsic calls which are not really killing any memory
// accesses.
for (Instruction &CurI : reverse(BrBB->instructionsWithoutDebug(true))) {
- if (!MaxNumInstToLookAt)
- break;
- --MaxNumInstToLookAt;
-
- // Could be calling an instruction that affects memory like free().
- if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
- return nullptr;
-
- if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
- // Found the previous store; make sure it stores to the same location.
- if (SI->getPointerOperand() == StorePtr)
- // Found the previous store, return its value operand.
- return SI->getValueOperand();
- return nullptr; // Unknown store.
- }
- }
-
- return nullptr;
-}
-
+ if (!MaxNumInstToLookAt)
+ break;
+ --MaxNumInstToLookAt;
+
+ // Could be calling an instruction that affects memory like free().
+ if (CurI.mayHaveSideEffects() && !isa<StoreInst>(CurI))
+ return nullptr;
+
+ if (auto *SI = dyn_cast<StoreInst>(&CurI)) {
+ // Found the previous store; make sure it stores to the same location.
+ if (SI->getPointerOperand() == StorePtr)
+ // Found the previous store, return its value operand.
+ return SI->getValueOperand();
+ return nullptr; // Unknown store.
+ }
+ }
+
+ return nullptr;
+}
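
A brief, hedged sketch of what this helper accepts versus rejects (hypothetical names and shapes): the conditional store may only be speculated when the predecessor already has a simple store to the same pointer and nothing in between may write memory.

    extern void may_write_memory();  // hypothetical: a call that may clobber *p

    // Accepted: BrBB already stores to *p and the scan above finds that
    // prior store, so the conditional store can later become an
    // unconditional store of a select.
    void ok(int *p, int add, int add5, bool cmp) {
      *p = add;             // prior store in BrBB
      if (cmp) *p = add5;   // -> *p = cmp ? add5 : add;
    }

    // Rejected: the intervening call may write memory, so the scan
    // returns nullptr and the store stays in the conditional block.
    void blocked(int *p, int add, int add5, bool cmp) {
      *p = add;
      may_write_memory();   // mayHaveSideEffects() and not a store
      if (cmp) *p = add5;
    }
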
+
/// Estimate the cost of the insertion(s) and check that the PHI nodes can be
/// converted to selects.
static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
@@ -2172,86 +2172,86 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
-/// Speculate a conditional basic block flattening the CFG.
-///
-/// Note that this is a very risky transform currently. Speculating
-/// instructions like this is most often not desirable. Instead, there is an MI
-/// pass which can do it with full awareness of the resource constraints.
-/// However, some cases are "obvious" and should be done directly. An example of
-/// this is speculating a single, reasonably cheap instruction.
-///
-/// There is only one distinct advantage to flattening the CFG at the IR level:
-/// it makes very common but simplistic optimizations such as are common in
-/// instcombine and the DAG combiner more powerful by removing CFG edges and
-/// modeling their effects with easier to reason about SSA value graphs.
-///
-///
-/// An illustration of this transform is turning this IR:
-/// \code
-/// BB:
-/// %cmp = icmp ult %x, %y
-/// br i1 %cmp, label %EndBB, label %ThenBB
-/// ThenBB:
-/// %sub = sub %x, %y
-/// br label %EndBB
-/// EndBB:
-/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
-/// ...
-/// \endcode
-///
-/// Into this IR:
-/// \code
-/// BB:
-/// %cmp = icmp ult %x, %y
-/// %sub = sub %x, %y
-/// %cond = select i1 %cmp, 0, %sub
-/// ...
-/// \endcode
-///
-/// \returns true if the conditional block is removed.
-bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const TargetTransformInfo &TTI) {
- // Be conservative for now. FP select instruction can often be expensive.
- Value *BrCond = BI->getCondition();
- if (isa<FCmpInst>(BrCond))
- return false;
-
- BasicBlock *BB = BI->getParent();
- BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
+/// Speculate a conditional basic block flattening the CFG.
+///
+/// Note that this is a very risky transform currently. Speculating
+/// instructions like this is most often not desirable. Instead, there is an MI
+/// pass which can do it with full awareness of the resource constraints.
+/// However, some cases are "obvious" and should be done directly. An example of
+/// this is speculating a single, reasonably cheap instruction.
+///
+/// There is only one distinct advantage to flattening the CFG at the IR level:
+/// it makes very common but simplistic optimizations such as are common in
+/// instcombine and the DAG combiner more powerful by removing CFG edges and
+/// modeling their effects with easier to reason about SSA value graphs.
+///
+///
+/// An illustration of this transform is turning this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// br i1 %cmp, label %EndBB, label %ThenBB
+/// ThenBB:
+/// %sub = sub %x, %y
+/// br label %EndBB
+/// EndBB:
+/// %phi = phi [ %sub, %ThenBB ], [ 0, %BB ]
+/// ...
+/// \endcode
+///
+/// Into this IR:
+/// \code
+/// BB:
+/// %cmp = icmp ult %x, %y
+/// %sub = sub %x, %y
+/// %cond = select i1 %cmp, 0, %sub
+/// ...
+/// \endcode
+///
+/// \returns true if the conditional block is removed.
+bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI) {
+ // Be conservative for now. FP select instruction can often be expensive.
+ Value *BrCond = BI->getCondition();
+ if (isa<FCmpInst>(BrCond))
+ return false;
+
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *EndBB = ThenBB->getTerminator()->getSuccessor(0);
int BudgetRemaining =
PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
-
- // If ThenBB is actually on the false edge of the conditional branch, remember
- // to swap the select operands later.
- bool Invert = false;
- if (ThenBB != BI->getSuccessor(0)) {
- assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
- Invert = true;
- }
- assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
-
- // Keep a count of how many times instructions are used within ThenBB when
- // they are candidates for sinking into ThenBB. Specifically:
- // - They are defined in BB, and
- // - They have no side effects, and
- // - All of their uses are in ThenBB.
- SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
-
- SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
-
- unsigned SpeculatedInstructions = 0;
- Value *SpeculatedStoreValue = nullptr;
- StoreInst *SpeculatedStore = nullptr;
- for (BasicBlock::iterator BBI = ThenBB->begin(),
- BBE = std::prev(ThenBB->end());
- BBI != BBE; ++BBI) {
- Instruction *I = &*BBI;
- // Skip debug info.
- if (isa<DbgInfoIntrinsic>(I)) {
- SpeculatedDbgIntrinsics.push_back(I);
- continue;
- }
-
+
+ // If ThenBB is actually on the false edge of the conditional branch, remember
+ // to swap the select operands later.
+ bool Invert = false;
+ if (ThenBB != BI->getSuccessor(0)) {
+ assert(ThenBB == BI->getSuccessor(1) && "No edge from 'if' block?");
+ Invert = true;
+ }
+ assert(EndBB == BI->getSuccessor(!Invert) && "No edge to end block");
+
+ // Keep a count of how many times instructions are used within ThenBB when
+ // they are candidates for sinking into ThenBB. Specifically:
+ // - They are defined in BB, and
+ // - They have no side effects, and
+ // - All of their uses are in ThenBB.
+ SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts;
+
+ SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
+
+ unsigned SpeculatedInstructions = 0;
+ Value *SpeculatedStoreValue = nullptr;
+ StoreInst *SpeculatedStore = nullptr;
+ for (BasicBlock::iterator BBI = ThenBB->begin(),
+ BBE = std::prev(ThenBB->end());
+ BBI != BBE; ++BBI) {
+ Instruction *I = &*BBI;
+ // Skip debug info.
+ if (isa<DbgInfoIntrinsic>(I)) {
+ SpeculatedDbgIntrinsics.push_back(I);
+ continue;
+ }
+
// Skip pseudo probes. The consequence is we lose track of the branch
// probability for ThenBB, which is fine since the optimization here takes
// place regardless of the branch probability.
@@ -2260,51 +2260,51 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
continue;
}
- // Only speculatively execute a single instruction (not counting the
- // terminator) for now.
- ++SpeculatedInstructions;
- if (SpeculatedInstructions > 1)
- return false;
-
- // Don't hoist the instruction if it's unsafe or expensive.
- if (!isSafeToSpeculativelyExecute(I) &&
- !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
- I, BB, ThenBB, EndBB))))
- return false;
- if (!SpeculatedStoreValue &&
- ComputeSpeculationCost(I, TTI) >
- PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
- return false;
-
- // Store the store speculation candidate.
- if (SpeculatedStoreValue)
- SpeculatedStore = cast<StoreInst>(I);
-
- // Do not hoist the instruction if any of its operands are defined but not
- // used in BB. The transformation will prevent the operand from
- // being sunk into the use block.
- for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
- Instruction *OpI = dyn_cast<Instruction>(*i);
- if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
- continue; // Not a candidate for sinking.
-
- ++SinkCandidateUseCounts[OpI];
- }
- }
-
- // Consider any sink candidates which are only used in ThenBB as costs for
- // speculation. Note that while we iterate over a DenseMap here, we are summing
- // and so iteration order isn't significant.
- for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
- I = SinkCandidateUseCounts.begin(),
- E = SinkCandidateUseCounts.end();
- I != E; ++I)
- if (I->first->hasNUses(I->second)) {
- ++SpeculatedInstructions;
- if (SpeculatedInstructions > 1)
- return false;
- }
-
+ // Only speculatively execute a single instruction (not counting the
+ // terminator) for now.
+ ++SpeculatedInstructions;
+ if (SpeculatedInstructions > 1)
+ return false;
+
+ // Don't hoist the instruction if it's unsafe or expensive.
+ if (!isSafeToSpeculativelyExecute(I) &&
+ !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
+ I, BB, ThenBB, EndBB))))
+ return false;
+ if (!SpeculatedStoreValue &&
+ ComputeSpeculationCost(I, TTI) >
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
+ return false;
+
+ // Store the store speculation candidate.
+ if (SpeculatedStoreValue)
+ SpeculatedStore = cast<StoreInst>(I);
+
+ // Do not hoist the instruction if any of its operands are defined but not
+ // used in BB. The transformation will prevent the operand from
+ // being sunk into the use block.
+ for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) {
+ Instruction *OpI = dyn_cast<Instruction>(*i);
+ if (!OpI || OpI->getParent() != BB || OpI->mayHaveSideEffects())
+ continue; // Not a candidate for sinking.
+
+ ++SinkCandidateUseCounts[OpI];
+ }
+ }
+
+ // Consider any sink candidates which are only used in ThenBB as costs for
+ // speculation. Note that while we iterate over a DenseMap here, we are summing
+ // and so iteration order isn't significant.
+ for (SmallDenseMap<Instruction *, unsigned, 4>::iterator
+ I = SinkCandidateUseCounts.begin(),
+ E = SinkCandidateUseCounts.end();
+ I != E; ++I)
+ if (I->first->hasNUses(I->second)) {
+ ++SpeculatedInstructions;
+ if (SpeculatedInstructions > 1)
+ return false;
+ }
+
// Check that we can insert the selects and that it's not too expensive to do
// so.
bool Convert = SpeculatedStore != nullptr;
@@ -2312,379 +2312,379 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
SpeculatedInstructions,
BudgetRemaining, TTI);
if (!Convert || BudgetRemaining < 0)
- return false;
-
- // If we get here, we can hoist the instruction and if-convert.
- LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
-
- // Insert a select of the value of the speculated store.
- if (SpeculatedStoreValue) {
- IRBuilder<NoFolder> Builder(BI);
- Value *TrueV = SpeculatedStore->getValueOperand();
- Value *FalseV = SpeculatedStoreValue;
- if (Invert)
- std::swap(TrueV, FalseV);
- Value *S = Builder.CreateSelect(
- BrCond, TrueV, FalseV, "spec.store.select", BI);
- SpeculatedStore->setOperand(0, S);
- SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
- SpeculatedStore->getDebugLoc());
- }
-
- // Metadata can be dependent on the condition we are hoisting above.
- // Conservatively strip all metadata on the instruction. Drop the debug loc
- // to avoid making it appear as if the condition is a constant, which would
- // be misleading while debugging.
- for (auto &I : *ThenBB) {
- if (!SpeculatedStoreValue || &I != SpeculatedStore)
- I.setDebugLoc(DebugLoc());
- I.dropUnknownNonDebugMetadata();
- }
-
- // Hoist the instructions.
- BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
- ThenBB->begin(), std::prev(ThenBB->end()));
-
- // Insert selects and rewrite the PHI operands.
- IRBuilder<NoFolder> Builder(BI);
- for (PHINode &PN : EndBB->phis()) {
- unsigned OrigI = PN.getBasicBlockIndex(BB);
- unsigned ThenI = PN.getBasicBlockIndex(ThenBB);
- Value *OrigV = PN.getIncomingValue(OrigI);
- Value *ThenV = PN.getIncomingValue(ThenI);
-
- // Skip PHIs which are trivial.
- if (OrigV == ThenV)
- continue;
-
- // Create a select whose true value is the speculatively executed value and
- // false value is the pre-existing value. Swap them if the branch
- // destinations were inverted.
- Value *TrueV = ThenV, *FalseV = OrigV;
- if (Invert)
- std::swap(TrueV, FalseV);
- Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI);
- PN.setIncomingValue(OrigI, V);
- PN.setIncomingValue(ThenI, V);
- }
-
- // Remove speculated dbg intrinsics.
- // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
- // dbg value for the different flows and inserting it after the select.
- for (Instruction *I : SpeculatedDbgIntrinsics)
- I->eraseFromParent();
-
- ++NumSpeculations;
- return true;
-}
-
-/// Return true if we can thread a branch across this block.
-static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
- int Size = 0;
-
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- if (Size > MaxSmallBlockSize)
- return false; // Don't clone large BB's.
+ return false;
+
+ // If we get here, we can hoist the instruction and if-convert.
+ LLVM_DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";);
+
+ // Insert a select of the value of the speculated store.
+ if (SpeculatedStoreValue) {
+ IRBuilder<NoFolder> Builder(BI);
+ Value *TrueV = SpeculatedStore->getValueOperand();
+ Value *FalseV = SpeculatedStoreValue;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *S = Builder.CreateSelect(
+ BrCond, TrueV, FalseV, "spec.store.select", BI);
+ SpeculatedStore->setOperand(0, S);
+ SpeculatedStore->applyMergedLocation(BI->getDebugLoc(),
+ SpeculatedStore->getDebugLoc());
+ }
+
+ // Metadata can be dependent on the condition we are hoisting above.
+ // Conservatively strip all metadata on the instruction. Drop the debug loc
+ // to avoid making it appear as if the condition is a constant, which would
+ // be misleading while debugging.
+ for (auto &I : *ThenBB) {
+ if (!SpeculatedStoreValue || &I != SpeculatedStore)
+ I.setDebugLoc(DebugLoc());
+ I.dropUnknownNonDebugMetadata();
+ }
+
+ // Hoist the instructions.
+ BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
+ ThenBB->begin(), std::prev(ThenBB->end()));
+
+ // Insert selects and rewrite the PHI operands.
+ IRBuilder<NoFolder> Builder(BI);
+ for (PHINode &PN : EndBB->phis()) {
+ unsigned OrigI = PN.getBasicBlockIndex(BB);
+ unsigned ThenI = PN.getBasicBlockIndex(ThenBB);
+ Value *OrigV = PN.getIncomingValue(OrigI);
+ Value *ThenV = PN.getIncomingValue(ThenI);
+
+ // Skip PHIs which are trivial.
+ if (OrigV == ThenV)
+ continue;
+
+ // Create a select whose true value is the speculatively executed value and
+ // false value is the pre-existing value. Swap them if the branch
+ // destinations were inverted.
+ Value *TrueV = ThenV, *FalseV = OrigV;
+ if (Invert)
+ std::swap(TrueV, FalseV);
+ Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI);
+ PN.setIncomingValue(OrigI, V);
+ PN.setIncomingValue(ThenI, V);
+ }
+
+ // Remove speculated dbg intrinsics.
+ // FIXME: Is it possible to do this in a more elegant way? Moving/merging the
+ // dbg value for the different flows and inserting it after the select.
+ for (Instruction *I : SpeculatedDbgIntrinsics)
+ I->eraseFromParent();
+
+ ++NumSpeculations;
+ return true;
+}
+
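
At the source level, the speculated-store case handled above amounts to replacing a guarded store with an unconditional store of a select. A minimal C++ sketch (illustrative names only, not code from this patch; it assumes the pointer is already known dereferenceable, as the pass checks):

    // Minimal sketch, assuming *p is known dereferenceable; names are invented.
    #include <cstdio>

    void before(bool c, int v, int *p) {
      if (c)
        *p = v;          // conditional store in the "then" block
    }

    void after(bool c, int v, int *p) {
      *p = c ? v : *p;   // "spec.store.select": the store runs unconditionally
    }

    int main() {
      int x = 1, y = 1;
      before(false, 9, &x);
      after(false, 9, &y);
      std::printf("%d %d\n", x, y);   // both stay 1
      return 0;
    }
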
+/// Return true if we can thread a branch across this block.
+static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
+ int Size = 0;
+
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ if (Size > MaxSmallBlockSize)
+ return false; // Don't clone large BB's.
// Can't fold blocks that contain noduplicate or convergent calls.
if (CallInst *CI = dyn_cast<CallInst>(&I))
if (CI->cannotDuplicate() || CI->isConvergent())
return false;
-    // We will delete PHIs while threading, so PHIs should not be accounted
-    // for in the block's size.
- if (!isa<PHINode>(I))
- ++Size;
-
- // We can only support instructions that do not define values that are
- // live outside of the current basic block.
- for (User *U : I.users()) {
- Instruction *UI = cast<Instruction>(U);
- if (UI->getParent() != BB || isa<PHINode>(UI))
- return false;
- }
-
- // Looks ok, continue checking.
- }
-
- return true;
-}
-
-/// If we have a conditional branch on a PHI node value that is defined in the
-/// same block as the branch and if any PHI entries are constants, thread edges
-/// corresponding to that entry to be branches to their ultimate destination.
+    // We will delete PHIs while threading, so PHIs should not be accounted
+    // for in the block's size.
+ if (!isa<PHINode>(I))
+ ++Size;
+
+ // We can only support instructions that do not define values that are
+ // live outside of the current basic block.
+ for (User *U : I.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != BB || isa<PHINode>(UI))
+ return false;
+ }
+
+ // Looks ok, continue checking.
+ }
+
+ return true;
+}
+
+/// If we have a conditional branch on a PHI node value that is defined in the
+/// same block as the branch and if any PHI entries are constants, thread edges
+/// corresponding to that entry to be branches to their ultimate destination.
static bool FoldCondBranchOnPHI(BranchInst *BI, DomTreeUpdater *DTU,
const DataLayout &DL, AssumptionCache *AC) {
- BasicBlock *BB = BI->getParent();
- PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
- // NOTE: we currently cannot transform this case if the PHI node is used
- // outside of the block.
- if (!PN || PN->getParent() != BB || !PN->hasOneUse())
- return false;
-
- // Degenerate case of a single entry PHI.
- if (PN->getNumIncomingValues() == 1) {
- FoldSingleEntryPHINodes(PN->getParent());
- return true;
- }
-
- // Now we know that this block has multiple preds and two succs.
- if (!BlockIsSimpleEnoughToThreadThrough(BB))
- return false;
-
- // Okay, this is a simple enough basic block. See if any phi values are
- // constants.
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
- if (!CB || !CB->getType()->isIntegerTy(1))
- continue;
-
- // Okay, we now know that all edges from PredBB should be revectored to
- // branch to RealDest.
- BasicBlock *PredBB = PN->getIncomingBlock(i);
- BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
-
- if (RealDest == BB)
- continue; // Skip self loops.
- // Skip if the predecessor's terminator is an indirect branch.
- if (isa<IndirectBrInst>(PredBB->getTerminator()))
- continue;
-
+ BasicBlock *BB = BI->getParent();
+ PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
+ // NOTE: we currently cannot transform this case if the PHI node is used
+ // outside of the block.
+ if (!PN || PN->getParent() != BB || !PN->hasOneUse())
+ return false;
+
+ // Degenerate case of a single entry PHI.
+ if (PN->getNumIncomingValues() == 1) {
+ FoldSingleEntryPHINodes(PN->getParent());
+ return true;
+ }
+
+ // Now we know that this block has multiple preds and two succs.
+ if (!BlockIsSimpleEnoughToThreadThrough(BB))
+ return false;
+
+ // Okay, this is a simple enough basic block. See if any phi values are
+ // constants.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i));
+ if (!CB || !CB->getType()->isIntegerTy(1))
+ continue;
+
+ // Okay, we now know that all edges from PredBB should be revectored to
+ // branch to RealDest.
+ BasicBlock *PredBB = PN->getIncomingBlock(i);
+ BasicBlock *RealDest = BI->getSuccessor(!CB->getZExtValue());
+
+ if (RealDest == BB)
+ continue; // Skip self loops.
+ // Skip if the predecessor's terminator is an indirect branch.
+ if (isa<IndirectBrInst>(PredBB->getTerminator()))
+ continue;
+
SmallVector<DominatorTree::UpdateType, 3> Updates;
- // The dest block might have PHI nodes, other predecessors and other
- // difficult cases. Instead of being smart about this, just insert a new
- // block that jumps to the destination block, effectively splitting
- // the edge we are about to create.
- BasicBlock *EdgeBB =
- BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
- RealDest->getParent(), RealDest);
- BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB);
+ // The dest block might have PHI nodes, other predecessors and other
+ // difficult cases. Instead of being smart about this, just insert a new
+ // block that jumps to the destination block, effectively splitting
+ // the edge we are about to create.
+ BasicBlock *EdgeBB =
+ BasicBlock::Create(BB->getContext(), RealDest->getName() + ".critedge",
+ RealDest->getParent(), RealDest);
+ BranchInst *CritEdgeBranch = BranchInst::Create(RealDest, EdgeBB);
Updates.push_back({DominatorTree::Insert, EdgeBB, RealDest});
- CritEdgeBranch->setDebugLoc(BI->getDebugLoc());
-
- // Update PHI nodes.
- AddPredecessorToBlock(RealDest, EdgeBB, BB);
-
- // BB may have instructions that are being threaded over. Clone these
- // instructions into EdgeBB. We know that there will be no uses of the
- // cloned instructions outside of EdgeBB.
- BasicBlock::iterator InsertPt = EdgeBB->begin();
- DenseMap<Value *, Value *> TranslateMap; // Track translated values.
- for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
- if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
- TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
- continue;
- }
- // Clone the instruction.
- Instruction *N = BBI->clone();
- if (BBI->hasName())
- N->setName(BBI->getName() + ".c");
-
- // Update operands due to translation.
- for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
- DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
- if (PI != TranslateMap.end())
- *i = PI->second;
- }
-
- // Check for trivial simplification.
- if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
- if (!BBI->use_empty())
- TranslateMap[&*BBI] = V;
- if (!N->mayHaveSideEffects()) {
- N->deleteValue(); // Instruction folded away, don't need actual inst
- N = nullptr;
- }
- } else {
- if (!BBI->use_empty())
- TranslateMap[&*BBI] = N;
- }
- if (N) {
- // Insert the new instruction into its new home.
- EdgeBB->getInstList().insert(InsertPt, N);
-
- // Register the new instruction with the assumption cache if necessary.
- if (AC && match(N, m_Intrinsic<Intrinsic::assume>()))
- AC->registerAssumption(cast<IntrinsicInst>(N));
- }
- }
-
- // Loop over all of the edges from PredBB to BB, changing them to branch
- // to EdgeBB instead.
- Instruction *PredBBTI = PredBB->getTerminator();
- for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
- if (PredBBTI->getSuccessor(i) == BB) {
- BB->removePredecessor(PredBB);
- PredBBTI->setSuccessor(i, EdgeBB);
- }
-
+ CritEdgeBranch->setDebugLoc(BI->getDebugLoc());
+
+ // Update PHI nodes.
+ AddPredecessorToBlock(RealDest, EdgeBB, BB);
+
+ // BB may have instructions that are being threaded over. Clone these
+ // instructions into EdgeBB. We know that there will be no uses of the
+ // cloned instructions outside of EdgeBB.
+ BasicBlock::iterator InsertPt = EdgeBB->begin();
+ DenseMap<Value *, Value *> TranslateMap; // Track translated values.
+ for (BasicBlock::iterator BBI = BB->begin(); &*BBI != BI; ++BBI) {
+ if (PHINode *PN = dyn_cast<PHINode>(BBI)) {
+ TranslateMap[PN] = PN->getIncomingValueForBlock(PredBB);
+ continue;
+ }
+ // Clone the instruction.
+ Instruction *N = BBI->clone();
+ if (BBI->hasName())
+ N->setName(BBI->getName() + ".c");
+
+ // Update operands due to translation.
+ for (User::op_iterator i = N->op_begin(), e = N->op_end(); i != e; ++i) {
+ DenseMap<Value *, Value *>::iterator PI = TranslateMap.find(*i);
+ if (PI != TranslateMap.end())
+ *i = PI->second;
+ }
+
+ // Check for trivial simplification.
+ if (Value *V = SimplifyInstruction(N, {DL, nullptr, nullptr, AC})) {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = V;
+ if (!N->mayHaveSideEffects()) {
+ N->deleteValue(); // Instruction folded away, don't need actual inst
+ N = nullptr;
+ }
+ } else {
+ if (!BBI->use_empty())
+ TranslateMap[&*BBI] = N;
+ }
+ if (N) {
+ // Insert the new instruction into its new home.
+ EdgeBB->getInstList().insert(InsertPt, N);
+
+ // Register the new instruction with the assumption cache if necessary.
+ if (AC && match(N, m_Intrinsic<Intrinsic::assume>()))
+ AC->registerAssumption(cast<IntrinsicInst>(N));
+ }
+ }
+
+ // Loop over all of the edges from PredBB to BB, changing them to branch
+ // to EdgeBB instead.
+ Instruction *PredBBTI = PredBB->getTerminator();
+ for (unsigned i = 0, e = PredBBTI->getNumSuccessors(); i != e; ++i)
+ if (PredBBTI->getSuccessor(i) == BB) {
+ BB->removePredecessor(PredBB);
+ PredBBTI->setSuccessor(i, EdgeBB);
+ }
+
Updates.push_back({DominatorTree::Insert, PredBB, EdgeBB});
Updates.push_back({DominatorTree::Delete, PredBB, BB});
if (DTU)
DTU->applyUpdates(Updates);
- // Recurse, simplifying any other constants.
+ // Recurse, simplifying any other constants.
return FoldCondBranchOnPHI(BI, DTU, DL, AC) || true;
- }
-
- return false;
-}
-
-/// Given a BB that starts with the specified two-entry PHI node,
-/// see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
+ }
+
+ return false;
+}
+
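
For orientation only, a hand-written C++ shape of what FoldCondBranchOnPHI exploits (identifiers are invented): along the edge where the phi feeding the branch is the constant true, the test is redundant, so that edge can be rerouted straight to its ultimate destination.

    #include <cstdio>

    static void taken()    { std::puts("taken"); }
    static void nottaken() { std::puts("not taken"); }

    void sketch(bool fromFastPath, bool computed) {
      bool cond = fromFastPath ? true : computed;  // two-entry phi on 'cond'
      if (cond)                                    // branch on that phi
        taken();
      else
        nottaken();
    }

    int main() {
      sketch(true, false);    // fast-path edge: always ends up in taken()
      sketch(false, false);   // other edge: still needs the test
      return 0;
    }
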
+/// Given a BB that starts with the specified two-entry PHI node,
+/// see if we can eliminate it.
+static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU, const DataLayout &DL) {
- // Ok, this is a two entry PHI node. Check to see if this is a simple "if
- // statement", which has a very simple dominance structure. Basically, we
- // are trying to find the condition that is being branched on, which
- // subsequently causes this merge to happen. We really want control
- // dependence information for this check, but simplifycfg can't keep it up
- // to date, and this catches most of the cases we care about anyway.
- BasicBlock *BB = PN->getParent();
-
- BasicBlock *IfTrue, *IfFalse;
- Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
- if (!IfCond ||
- // Don't bother if the branch will be constant folded trivially.
- isa<ConstantInt>(IfCond))
- return false;
-
- // Okay, we found that we can merge this two-entry phi node into a select.
- // Doing so would require us to fold *all* two entry phi nodes in this block.
- // At some point this becomes non-profitable (particularly if the target
- // doesn't support cmov's). Only do this transformation if there are two or
- // fewer PHI nodes in this block.
- unsigned NumPhis = 0;
- for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
- if (NumPhis > 2)
- return false;
-
- // Loop over the PHI's seeing if we can promote them all to select
- // instructions. While we are at it, keep track of the instructions
- // that need to be moved to the dominating block.
- SmallPtrSet<Instruction *, 4> AggressiveInsts;
- int BudgetRemaining =
- TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
-
+ // Ok, this is a two entry PHI node. Check to see if this is a simple "if
+ // statement", which has a very simple dominance structure. Basically, we
+ // are trying to find the condition that is being branched on, which
+ // subsequently causes this merge to happen. We really want control
+ // dependence information for this check, but simplifycfg can't keep it up
+ // to date, and this catches most of the cases we care about anyway.
+ BasicBlock *BB = PN->getParent();
+
+ BasicBlock *IfTrue, *IfFalse;
+ Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
+ if (!IfCond ||
+ // Don't bother if the branch will be constant folded trivially.
+ isa<ConstantInt>(IfCond))
+ return false;
+
+ // Okay, we found that we can merge this two-entry phi node into a select.
+ // Doing so would require us to fold *all* two entry phi nodes in this block.
+ // At some point this becomes non-profitable (particularly if the target
+ // doesn't support cmov's). Only do this transformation if there are two or
+ // fewer PHI nodes in this block.
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator I = BB->begin(); isa<PHINode>(I); ++NumPhis, ++I)
+ if (NumPhis > 2)
+ return false;
+
+ // Loop over the PHI's seeing if we can promote them all to select
+ // instructions. While we are at it, keep track of the instructions
+ // that need to be moved to the dominating block.
+ SmallPtrSet<Instruction *, 4> AggressiveInsts;
+ int BudgetRemaining =
+ TwoEntryPHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+
bool Changed = false;
- for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
- PHINode *PN = cast<PHINode>(II++);
- if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
- PN->replaceAllUsesWith(V);
- PN->eraseFromParent();
+ for (BasicBlock::iterator II = BB->begin(); isa<PHINode>(II);) {
+ PHINode *PN = cast<PHINode>(II++);
+ if (Value *V = SimplifyInstruction(PN, {DL, PN})) {
+ PN->replaceAllUsesWith(V);
+ PN->eraseFromParent();
Changed = true;
- continue;
- }
-
- if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts,
- BudgetRemaining, TTI) ||
- !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts,
- BudgetRemaining, TTI))
+ continue;
+ }
+
+ if (!DominatesMergePoint(PN->getIncomingValue(0), BB, AggressiveInsts,
+ BudgetRemaining, TTI) ||
+ !DominatesMergePoint(PN->getIncomingValue(1), BB, AggressiveInsts,
+ BudgetRemaining, TTI))
return Changed;
- }
-
- // If we folded the first phi, PN dangles at this point. Refresh it. If
- // we ran out of PHIs then we simplified them all.
- PN = dyn_cast<PHINode>(BB->begin());
- if (!PN)
- return true;
-
- // Return true if at least one of these is a 'not', and another is either
- // a 'not' too, or a constant.
- auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) {
- if (!match(V0, m_Not(m_Value())))
- std::swap(V0, V1);
- auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant());
- return match(V0, m_Not(m_Value())) && match(V1, Invertible);
- };
-
- // Don't fold i1 branches on PHIs which contain binary operators, unless one
-  // of the incoming values is a 'not' and another one is freely invertible.
- // These can often be turned into switches and other things.
- if (PN->getType()->isIntegerTy(1) &&
- (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
- isa<BinaryOperator>(PN->getIncomingValue(1)) ||
- isa<BinaryOperator>(IfCond)) &&
- !CanHoistNotFromBothValues(PN->getIncomingValue(0),
- PN->getIncomingValue(1)))
+ }
+
+ // If we folded the first phi, PN dangles at this point. Refresh it. If
+ // we ran out of PHIs then we simplified them all.
+ PN = dyn_cast<PHINode>(BB->begin());
+ if (!PN)
+ return true;
+
+ // Return true if at least one of these is a 'not', and another is either
+ // a 'not' too, or a constant.
+ auto CanHoistNotFromBothValues = [](Value *V0, Value *V1) {
+ if (!match(V0, m_Not(m_Value())))
+ std::swap(V0, V1);
+ auto Invertible = m_CombineOr(m_Not(m_Value()), m_AnyIntegralConstant());
+ return match(V0, m_Not(m_Value())) && match(V1, Invertible);
+ };
+
+ // Don't fold i1 branches on PHIs which contain binary operators, unless one
+  // of the incoming values is a 'not' and another one is freely invertible.
+ // These can often be turned into switches and other things.
+ if (PN->getType()->isIntegerTy(1) &&
+ (isa<BinaryOperator>(PN->getIncomingValue(0)) ||
+ isa<BinaryOperator>(PN->getIncomingValue(1)) ||
+ isa<BinaryOperator>(IfCond)) &&
+ !CanHoistNotFromBothValues(PN->getIncomingValue(0),
+ PN->getIncomingValue(1)))
return Changed;
-
- // If all PHI nodes are promotable, check to make sure that all instructions
- // in the predecessor blocks can be promoted as well. If not, we won't be able
- // to get rid of the control flow, so it's not worth promoting to select
- // instructions.
- BasicBlock *DomBlock = nullptr;
- BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
- BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
- if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
- IfBlock1 = nullptr;
- } else {
- DomBlock = *pred_begin(IfBlock1);
- for (BasicBlock::iterator I = IfBlock1->begin(); !I->isTerminator(); ++I)
+
+ // If all PHI nodes are promotable, check to make sure that all instructions
+ // in the predecessor blocks can be promoted as well. If not, we won't be able
+ // to get rid of the control flow, so it's not worth promoting to select
+ // instructions.
+ BasicBlock *DomBlock = nullptr;
+ BasicBlock *IfBlock1 = PN->getIncomingBlock(0);
+ BasicBlock *IfBlock2 = PN->getIncomingBlock(1);
+ if (cast<BranchInst>(IfBlock1->getTerminator())->isConditional()) {
+ IfBlock1 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock1);
+ for (BasicBlock::iterator I = IfBlock1->begin(); !I->isTerminator(); ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I) &&
!isa<PseudoProbeInst>(I)) {
- // This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control flow, so
- // the xform is not worth it.
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return Changed;
- }
- }
-
- if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
- IfBlock2 = nullptr;
- } else {
- DomBlock = *pred_begin(IfBlock2);
- for (BasicBlock::iterator I = IfBlock2->begin(); !I->isTerminator(); ++I)
+ }
+ }
+
+ if (cast<BranchInst>(IfBlock2->getTerminator())->isConditional()) {
+ IfBlock2 = nullptr;
+ } else {
+ DomBlock = *pred_begin(IfBlock2);
+ for (BasicBlock::iterator I = IfBlock2->begin(); !I->isTerminator(); ++I)
if (!AggressiveInsts.count(&*I) && !isa<DbgInfoIntrinsic>(I) &&
!isa<PseudoProbeInst>(I)) {
- // This is not an aggressive instruction that we can promote.
- // Because of this, we won't be able to get rid of the control flow, so
- // the xform is not worth it.
+ // This is not an aggressive instruction that we can promote.
+ // Because of this, we won't be able to get rid of the control flow, so
+ // the xform is not worth it.
return Changed;
- }
- }
- assert(DomBlock && "Failed to find root DomBlock");
-
- LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
- << " T: " << IfTrue->getName()
- << " F: " << IfFalse->getName() << "\n");
-
- // If we can still promote the PHI nodes after this gauntlet of tests,
- // do all of the PHI's now.
- Instruction *InsertPt = DomBlock->getTerminator();
- IRBuilder<NoFolder> Builder(InsertPt);
-
- // Move all 'aggressive' instructions, which are defined in the
- // conditional parts of the if's up to the dominating block.
- if (IfBlock1)
- hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
- if (IfBlock2)
- hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
-
- // Propagate fast-math-flags from phi nodes to replacement selects.
- IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
- while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
- if (isa<FPMathOperator>(PN))
- Builder.setFastMathFlags(PN->getFastMathFlags());
-
- // Change the PHI node into a select instruction.
- Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
- Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
-
- Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
- PN->replaceAllUsesWith(Sel);
- Sel->takeName(PN);
- PN->eraseFromParent();
- }
-
- // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
- // has been flattened. Change DomBlock to jump directly to our new block to
- // avoid other simplifycfg's kicking in on the diamond.
- Instruction *OldTI = DomBlock->getTerminator();
- Builder.SetInsertPoint(OldTI);
- Builder.CreateBr(BB);
+ }
+ }
+ assert(DomBlock && "Failed to find root DomBlock");
+
+ LLVM_DEBUG(dbgs() << "FOUND IF CONDITION! " << *IfCond
+ << " T: " << IfTrue->getName()
+ << " F: " << IfFalse->getName() << "\n");
+
+ // If we can still promote the PHI nodes after this gauntlet of tests,
+ // do all of the PHI's now.
+ Instruction *InsertPt = DomBlock->getTerminator();
+ IRBuilder<NoFolder> Builder(InsertPt);
+
+ // Move all 'aggressive' instructions, which are defined in the
+ // conditional parts of the if's up to the dominating block.
+ if (IfBlock1)
+ hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock1);
+ if (IfBlock2)
+ hoistAllInstructionsInto(DomBlock, InsertPt, IfBlock2);
+
+ // Propagate fast-math-flags from phi nodes to replacement selects.
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ if (isa<FPMathOperator>(PN))
+ Builder.setFastMathFlags(PN->getFastMathFlags());
+
+ // Change the PHI node into a select instruction.
+ Value *TrueVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfFalse);
+ Value *FalseVal = PN->getIncomingValue(PN->getIncomingBlock(0) == IfTrue);
+
+ Value *Sel = Builder.CreateSelect(IfCond, TrueVal, FalseVal, "", InsertPt);
+ PN->replaceAllUsesWith(Sel);
+ Sel->takeName(PN);
+ PN->eraseFromParent();
+ }
+
+ // At this point, IfBlock1 and IfBlock2 are both empty, so our if statement
+ // has been flattened. Change DomBlock to jump directly to our new block to
+ // avoid other simplifycfg's kicking in on the diamond.
+ Instruction *OldTI = DomBlock->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+ Builder.CreateBr(BB);
SmallVector<DominatorTree::UpdateType, 3> Updates;
if (DTU) {
@@ -2693,43 +2693,43 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
Updates.push_back({DominatorTree::Delete, DomBlock, Successor});
}
- OldTI->eraseFromParent();
+ OldTI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// If we found a conditional branch that goes to two returning blocks,
-/// try to merge them together into one return,
-/// introducing a select if the return values disagree.
-bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
- IRBuilder<> &Builder) {
+ return true;
+}
+
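
A minimal sketch of the diamond that FoldTwoEntryPHINode flattens (invented names, not from this patch): when both arms are cheap and speculatable, their instructions are hoisted into the dominating block and the two-entry phi becomes a select on the if-condition.

    #include <cstdio>

    int before(bool c, int a, int b) {
      int x;
      if (c)
        x = a + 1;       // "then" arm
      else
        x = b + 2;       // "else" arm
      return x;          // two-entry phi at the merge point
    }

    int after(bool c, int a, int b) {
      int t = a + 1;     // hoisted from the "then" arm
      int f = b + 2;     // hoisted from the "else" arm
      return c ? t : f;  // phi replaced by a select on the if-condition
    }

    int main() {
      std::printf("%d %d\n", before(true, 1, 2), after(true, 1, 2));   // 2 2
      return 0;
    }
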
+/// If we found a conditional branch that goes to two returning blocks,
+/// try to merge them together into one return,
+/// introducing a select if the return values disagree.
+bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
+ IRBuilder<> &Builder) {
auto *BB = BI->getParent();
- assert(BI->isConditional() && "Must be a conditional branch");
- BasicBlock *TrueSucc = BI->getSuccessor(0);
- BasicBlock *FalseSucc = BI->getSuccessor(1);
+ assert(BI->isConditional() && "Must be a conditional branch");
+ BasicBlock *TrueSucc = BI->getSuccessor(0);
+ BasicBlock *FalseSucc = BI->getSuccessor(1);
// NOTE: destinations may match, this could be degenerate uncond branch.
- ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
- ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
-
- // Check to ensure both blocks are empty (just a return) or optionally empty
- // with PHI nodes. If there are other instructions, merging would cause extra
- // computation on one path or the other.
- if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
- return false;
- if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
- return false;
-
- Builder.SetInsertPoint(BI);
- // Okay, we found a branch that is going to two return nodes. If
- // there is no return value for this function, just change the
- // branch into a return.
- if (FalseRet->getNumOperands() == 0) {
+ ReturnInst *TrueRet = cast<ReturnInst>(TrueSucc->getTerminator());
+ ReturnInst *FalseRet = cast<ReturnInst>(FalseSucc->getTerminator());
+
+ // Check to ensure both blocks are empty (just a return) or optionally empty
+ // with PHI nodes. If there are other instructions, merging would cause extra
+ // computation on one path or the other.
+ if (!TrueSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+ if (!FalseSucc->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ Builder.SetInsertPoint(BI);
+ // Okay, we found a branch that is going to two return nodes. If
+ // there is no return value for this function, just change the
+ // branch into a return.
+ if (FalseRet->getNumOperands() == 0) {
TrueSucc->removePredecessor(BB);
FalseSucc->removePredecessor(BB);
- Builder.CreateRetVoid();
- EraseTerminatorAndDCECond(BI);
+ Builder.CreateRetVoid();
+ EraseTerminatorAndDCECond(BI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Delete, BB, TrueSucc});
@@ -2737,62 +2737,62 @@ bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
Updates.push_back({DominatorTree::Delete, BB, FalseSucc});
DTU->applyUpdates(Updates);
}
- return true;
- }
-
- // Otherwise, figure out what the true and false return values are
- // so we can insert a new select instruction.
- Value *TrueValue = TrueRet->getReturnValue();
- Value *FalseValue = FalseRet->getReturnValue();
-
- // Unwrap any PHI nodes in the return blocks.
- if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
- if (TVPN->getParent() == TrueSucc)
+ return true;
+ }
+
+ // Otherwise, figure out what the true and false return values are
+ // so we can insert a new select instruction.
+ Value *TrueValue = TrueRet->getReturnValue();
+ Value *FalseValue = FalseRet->getReturnValue();
+
+ // Unwrap any PHI nodes in the return blocks.
+ if (PHINode *TVPN = dyn_cast_or_null<PHINode>(TrueValue))
+ if (TVPN->getParent() == TrueSucc)
TrueValue = TVPN->getIncomingValueForBlock(BB);
- if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
- if (FVPN->getParent() == FalseSucc)
+ if (PHINode *FVPN = dyn_cast_or_null<PHINode>(FalseValue))
+ if (FVPN->getParent() == FalseSucc)
FalseValue = FVPN->getIncomingValueForBlock(BB);
-
- // In order for this transformation to be safe, we must be able to
- // unconditionally execute both operands to the return. This is
- // normally the case, but we could have a potentially-trapping
- // constant expression that prevents this transformation from being
- // safe.
- if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
- if (TCV->canTrap())
- return false;
- if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
- if (FCV->canTrap())
- return false;
-
- // Okay, we collected all the mapped values and checked them for sanity, and
-  // decided to really do this transformation. First, update the CFG.
+
+ // In order for this transformation to be safe, we must be able to
+ // unconditionally execute both operands to the return. This is
+ // normally the case, but we could have a potentially-trapping
+ // constant expression that prevents this transformation from being
+ // safe.
+ if (ConstantExpr *TCV = dyn_cast_or_null<ConstantExpr>(TrueValue))
+ if (TCV->canTrap())
+ return false;
+ if (ConstantExpr *FCV = dyn_cast_or_null<ConstantExpr>(FalseValue))
+ if (FCV->canTrap())
+ return false;
+
+ // Okay, we collected all the mapped values and checked them for sanity, and
+  // decided to really do this transformation. First, update the CFG.
TrueSucc->removePredecessor(BB);
FalseSucc->removePredecessor(BB);
-
- // Insert select instructions where needed.
- Value *BrCond = BI->getCondition();
- if (TrueValue) {
- // Insert a select if the results differ.
- if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
- } else if (isa<UndefValue>(TrueValue)) {
- TrueValue = FalseValue;
- } else {
- TrueValue =
- Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
- }
- }
-
- Value *RI =
- !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
-
- (void)RI;
-
- LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
- << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: "
- << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc);
-
- EraseTerminatorAndDCECond(BI);
+
+ // Insert select instructions where needed.
+ Value *BrCond = BI->getCondition();
+ if (TrueValue) {
+ // Insert a select if the results differ.
+ if (TrueValue == FalseValue || isa<UndefValue>(FalseValue)) {
+ } else if (isa<UndefValue>(TrueValue)) {
+ TrueValue = FalseValue;
+ } else {
+ TrueValue =
+ Builder.CreateSelect(BrCond, TrueValue, FalseValue, "retval", BI);
+ }
+ }
+
+ Value *RI =
+ !TrueValue ? Builder.CreateRetVoid() : Builder.CreateRet(TrueValue);
+
+ (void)RI;
+
+ LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
+ << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: "
+ << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc);
+
+ EraseTerminatorAndDCECond(BI);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
Updates.push_back({DominatorTree::Delete, BB, TrueSucc});
@@ -2800,33 +2800,33 @@ bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
Updates.push_back({DominatorTree::Delete, BB, FalseSucc});
DTU->applyUpdates(Updates);
}
-
- return true;
-}
-
-/// Return true if either PBI or BI has branch weight available, and store
-/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
-/// not have branch weight, use 1:1 as its weight.
-static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
- uint64_t &PredTrueWeight,
- uint64_t &PredFalseWeight,
- uint64_t &SuccTrueWeight,
- uint64_t &SuccFalseWeight) {
- bool PredHasWeights =
- PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
- bool SuccHasWeights =
- BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
- if (PredHasWeights || SuccHasWeights) {
- if (!PredHasWeights)
- PredTrueWeight = PredFalseWeight = 1;
- if (!SuccHasWeights)
- SuccTrueWeight = SuccFalseWeight = 1;
- return true;
- } else {
- return false;
- }
-}
-
+
+ return true;
+}
+
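
Roughly, and only as an illustration of the shape handled here (names invented), a conditional branch to two return-only blocks collapses into a single return of a select when the returned values differ:

    #include <cstdio>

    int before(bool c, int a, int b) {
      if (c)
        return a;        // TrueSucc: nothing but a return
      return b;          // FalseSucc: nothing but a return
    }

    int after(bool c, int a, int b) {
      return c ? a : b;  // single return of a "retval" select
    }

    int main() {
      std::printf("%d %d\n", before(false, 1, 2), after(false, 1, 2));  // 2 2
      return 0;
    }
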
+/// Return true if either PBI or BI has branch weight available, and store
+/// the weights in {Pred|Succ}{True|False}Weight. If one of PBI and BI does
+/// not have branch weight, use 1:1 as its weight.
+static bool extractPredSuccWeights(BranchInst *PBI, BranchInst *BI,
+ uint64_t &PredTrueWeight,
+ uint64_t &PredFalseWeight,
+ uint64_t &SuccTrueWeight,
+ uint64_t &SuccFalseWeight) {
+ bool PredHasWeights =
+ PBI->extractProfMetadata(PredTrueWeight, PredFalseWeight);
+ bool SuccHasWeights =
+ BI->extractProfMetadata(SuccTrueWeight, SuccFalseWeight);
+ if (PredHasWeights || SuccHasWeights) {
+ if (!PredHasWeights)
+ PredTrueWeight = PredFalseWeight = 1;
+ if (!SuccHasWeights)
+ SuccTrueWeight = SuccFalseWeight = 1;
+ return true;
+ } else {
+ return false;
+ }
+}
+
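
A small standalone sketch of the 1:1 defaulting rule described above (this is not the LLVM API; std::optional merely stands in for the profile metadata): if only one of the two branches carries weights, the other is treated as 1:1 so the caller can still combine them.

    #include <cstdint>
    #include <cstdio>
    #include <optional>
    #include <utility>

    using Weights = std::optional<std::pair<uint64_t, uint64_t>>;  // (true, false)

    bool combineWeights(Weights Pred, Weights Succ, uint64_t &PT, uint64_t &PF,
                        uint64_t &ST, uint64_t &SF) {
      if (!Pred && !Succ)
        return false;                 // neither branch has weights
      const std::pair<uint64_t, uint64_t> OneToOne{1, 1};
      std::tie(PT, PF) = Pred.value_or(OneToOne);
      std::tie(ST, SF) = Succ.value_or(OneToOne);
      return true;
    }

    int main() {
      uint64_t PT, PF, ST, SF;
      if (combineWeights(std::pair<uint64_t, uint64_t>{90, 10}, std::nullopt,
                         PT, PF, ST, SF))
        std::printf("%llu:%llu %llu:%llu\n", (unsigned long long)PT,
                    (unsigned long long)PF, (unsigned long long)ST,
                    (unsigned long long)SF);   // 90:10 1:1
      return 0;
    }
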
// Determine if the two branches share a common destination,
// and deduce a glue that we need to use to join branch's conditions
// to arrive at the common destination.
@@ -2967,91 +2967,91 @@ static bool PerformBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
return true;
}
-/// If this basic block is simple enough, and if a predecessor branches to us
-/// and one of our successors, fold the block into the predecessor and use
-/// logical operations to pick the right destination.
+/// If this basic block is simple enough, and if a predecessor branches to us
+/// and one of our successors, fold the block into the predecessor and use
+/// logical operations to pick the right destination.
bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
MemorySSAUpdater *MSSAU,
const TargetTransformInfo *TTI,
- unsigned BonusInstThreshold) {
+ unsigned BonusInstThreshold) {
// If this block ends with an unconditional branch,
// let SpeculativelyExecuteBB() deal with it.
if (!BI->isConditional())
return false;
- BasicBlock *BB = BI->getParent();
-
- const unsigned PredCount = pred_size(BB);
-
- bool Changed = false;
-
+ BasicBlock *BB = BI->getParent();
+
+ const unsigned PredCount = pred_size(BB);
+
+ bool Changed = false;
+
TargetTransformInfo::TargetCostKind CostKind =
BB->getParent()->hasMinSize() ? TargetTransformInfo::TCK_CodeSize
: TargetTransformInfo::TCK_SizeAndLatency;
-
+
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
-
- if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
- Cond->getParent() != BB || !Cond->hasOneUse())
- return Changed;
-
- // Only allow this transformation if computing the condition doesn't involve
- // too many instructions and these involved instructions can be executed
- // unconditionally. We denote all involved instructions except the condition
- // as "bonus instructions", and only allow this transformation when the
- // number of the bonus instructions we'll need to create when cloning into
- // each predecessor does not exceed a certain threshold.
- unsigned NumBonusInsts = 0;
+
+ if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
+ Cond->getParent() != BB || !Cond->hasOneUse())
+ return Changed;
+
+ // Only allow this transformation if computing the condition doesn't involve
+ // too many instructions and these involved instructions can be executed
+ // unconditionally. We denote all involved instructions except the condition
+ // as "bonus instructions", and only allow this transformation when the
+ // number of the bonus instructions we'll need to create when cloning into
+ // each predecessor does not exceed a certain threshold.
+ unsigned NumBonusInsts = 0;
for (Instruction &I : *BB) {
// Don't check the branch condition comparison itself.
if (&I == Cond)
- continue;
+ continue;
// Ignore dbg intrinsics, and the terminator.
if (isa<DbgInfoIntrinsic>(I) || isa<BranchInst>(I))
continue;
// I must be safe to execute unconditionally.
if (!isSafeToSpeculativelyExecute(&I))
- return Changed;
-
- // Account for the cost of duplicating this instruction into each
- // predecessor.
- NumBonusInsts += PredCount;
- // Early exits once we reach the limit.
- if (NumBonusInsts > BonusInstThreshold)
- return Changed;
- }
-
- // Cond is known to be a compare or binary operator. Check to make sure that
- // neither operand is a potentially-trapping constant expression.
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
- if (CE->canTrap())
- return Changed;
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
- if (CE->canTrap())
- return Changed;
-
- // Finally, don't infinitely unroll conditional loops.
+ return Changed;
+
+ // Account for the cost of duplicating this instruction into each
+ // predecessor.
+ NumBonusInsts += PredCount;
+ // Early exits once we reach the limit.
+ if (NumBonusInsts > BonusInstThreshold)
+ return Changed;
+ }
+
+ // Cond is known to be a compare or binary operator. Check to make sure that
+ // neither operand is a potentially-trapping constant expression.
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
+ if (CE->canTrap())
+ return Changed;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
+ if (CE->canTrap())
+ return Changed;
+
+ // Finally, don't infinitely unroll conditional loops.
if (is_contained(successors(BB), BB))
- return Changed;
-
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *PredBlock = *PI;
- BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
-
- // Check that we have two conditional branches. If there is a PHI node in
- // the common successor, verify that the same value flows in from both
- // blocks.
+ return Changed;
+
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *PredBlock = *PI;
+ BranchInst *PBI = dyn_cast<BranchInst>(PredBlock->getTerminator());
+
+ // Check that we have two conditional branches. If there is a PHI node in
+ // the common successor, verify that the same value flows in from both
+ // blocks.
if (!PBI || PBI->isUnconditional() || !SafeToMergeTerminators(BI, PBI))
- continue;
-
- // Determine if the two branches share a common destination.
+ continue;
+
+ // Determine if the two branches share a common destination.
Instruction::BinaryOps Opc;
bool InvertPredCond;
if (auto Recepie = CheckIfCondBranchesShareCommonDestination(BI, PBI))
std::tie(Opc, InvertPredCond) = *Recepie;
else
continue;
-
+
// Check the cost of inserting the necessary logic before performing the
// transformation.
if (TTI) {
@@ -3060,712 +3060,712 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, DomTreeUpdater *DTU,
if (InvertPredCond && (!PBI->getCondition()->hasOneUse() ||
!isa<CmpInst>(PBI->getCondition())))
Cost += TTI->getArithmeticInstrCost(Instruction::Xor, Ty, CostKind);
-
+
if (Cost > BranchFoldThreshold)
- continue;
- }
-
+ continue;
+ }
+
return PerformBranchToCommonDestFolding(BI, PBI, DTU, MSSAU);
- }
- return Changed;
-}
-
-// If there is only one store in BB1 and BB2, return it, otherwise return
-// nullptr.
-static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) {
- StoreInst *S = nullptr;
- for (auto *BB : {BB1, BB2}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (S)
- // Multiple stores seen.
- return nullptr;
- else
- S = SI;
- }
- }
- return S;
-}
-
-static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
- Value *AlternativeV = nullptr) {
- // PHI is going to be a PHI node that allows the value V that is defined in
- // BB to be referenced in BB's only successor.
- //
- // If AlternativeV is nullptr, the only value we care about in PHI is V. It
- // doesn't matter to us what the other operand is (it'll never get used). We
- // could just create a new PHI with an undef incoming value, but that could
- // increase register pressure if EarlyCSE/InstCombine can't fold it with some
- // other PHI. So here we directly look for some PHI in BB's successor with V
- // as an incoming operand. If we find one, we use it, else we create a new
- // one.
- //
- // If AlternativeV is not nullptr, we care about both incoming values in PHI.
- // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV]
- // where OtherBB is the single other predecessor of BB's only successor.
- PHINode *PHI = nullptr;
- BasicBlock *Succ = BB->getSingleSuccessor();
-
- for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
- if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
- PHI = cast<PHINode>(I);
- if (!AlternativeV)
- break;
-
- assert(Succ->hasNPredecessors(2));
- auto PredI = pred_begin(Succ);
- BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
- if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
- break;
- PHI = nullptr;
- }
- if (PHI)
- return PHI;
-
- // If V is not an instruction defined in BB, just return it.
- if (!AlternativeV &&
- (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB))
- return V;
-
- PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front());
- PHI->addIncoming(V, BB);
- for (BasicBlock *PredBB : predecessors(Succ))
- if (PredBB != BB)
- PHI->addIncoming(
- AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
- return PHI;
-}
-
+ }
+ return Changed;
+}
+
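
A rough sketch of "folding into the predecessor" (hand-written, names invented): the predecessor's branch and this block's branch are glued into one condition, and the cheap "bonus" instruction feeding the condition is duplicated so it can execute unconditionally. The non-short-circuit & mirrors the single IR 'and' that replaces the two branches.

    #include <cstdio>

    static void commonDest() { std::puts("common dest"); }
    static void other()      { std::puts("other"); }

    void before(bool a, int x, int n) {
      if (a) {               // predecessor branch (PBI)
        int t = x + 1;       // "bonus" instruction feeding the condition
        if (t > n) {         // this block's branch (BI)
          commonDest();
          return;
        }
      }
      other();
    }

    void after(bool a, int x, int n) {
      int t = x + 1;         // bonus instruction hoisted into the predecessor
      if (a & (t > n)) {     // conditions joined, single branch
        commonDest();
        return;
      }
      other();
    }

    int main() {
      before(true, 5, 3);    // "common dest"
      after(true, 5, 3);     // "common dest"
      return 0;
    }
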
+// If there is only one store in BB1 and BB2, return it, otherwise return
+// nullptr.
+static StoreInst *findUniqueStoreInBlocks(BasicBlock *BB1, BasicBlock *BB2) {
+ StoreInst *S = nullptr;
+ for (auto *BB : {BB1, BB2}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (S)
+ // Multiple stores seen.
+ return nullptr;
+ else
+ S = SI;
+ }
+ }
+ return S;
+}
+
+static Value *ensureValueAvailableInSuccessor(Value *V, BasicBlock *BB,
+ Value *AlternativeV = nullptr) {
+ // PHI is going to be a PHI node that allows the value V that is defined in
+ // BB to be referenced in BB's only successor.
+ //
+ // If AlternativeV is nullptr, the only value we care about in PHI is V. It
+ // doesn't matter to us what the other operand is (it'll never get used). We
+ // could just create a new PHI with an undef incoming value, but that could
+ // increase register pressure if EarlyCSE/InstCombine can't fold it with some
+ // other PHI. So here we directly look for some PHI in BB's successor with V
+ // as an incoming operand. If we find one, we use it, else we create a new
+ // one.
+ //
+ // If AlternativeV is not nullptr, we care about both incoming values in PHI.
+ // PHI must be exactly: phi <ty> [ %BB, %V ], [ %OtherBB, %AlternativeV]
+ // where OtherBB is the single other predecessor of BB's only successor.
+ PHINode *PHI = nullptr;
+ BasicBlock *Succ = BB->getSingleSuccessor();
+
+ for (auto I = Succ->begin(); isa<PHINode>(I); ++I)
+ if (cast<PHINode>(I)->getIncomingValueForBlock(BB) == V) {
+ PHI = cast<PHINode>(I);
+ if (!AlternativeV)
+ break;
+
+ assert(Succ->hasNPredecessors(2));
+ auto PredI = pred_begin(Succ);
+ BasicBlock *OtherPredBB = *PredI == BB ? *++PredI : *PredI;
+ if (PHI->getIncomingValueForBlock(OtherPredBB) == AlternativeV)
+ break;
+ PHI = nullptr;
+ }
+ if (PHI)
+ return PHI;
+
+ // If V is not an instruction defined in BB, just return it.
+ if (!AlternativeV &&
+ (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != BB))
+ return V;
+
+ PHI = PHINode::Create(V->getType(), 2, "simplifycfg.merge", &Succ->front());
+ PHI->addIncoming(V, BB);
+ for (BasicBlock *PredBB : predecessors(Succ))
+ if (PredBB != BB)
+ PHI->addIncoming(
+ AlternativeV ? AlternativeV : UndefValue::get(V->getType()), PredBB);
+ return PHI;
+}
+
static bool mergeConditionalStoreToAddress(
BasicBlock *PTB, BasicBlock *PFB, BasicBlock *QTB, BasicBlock *QFB,
BasicBlock *PostBB, Value *Address, bool InvertPCond, bool InvertQCond,
DomTreeUpdater *DTU, const DataLayout &DL, const TargetTransformInfo &TTI) {
- // For every pointer, there must be exactly two stores, one coming from
- // PTB or PFB, and the other from QTB or QFB. We don't support more than one
- // store (to any address) in PTB,PFB or QTB,QFB.
- // FIXME: We could relax this restriction with a bit more work and performance
- // testing.
- StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB);
- StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB);
- if (!PStore || !QStore)
- return false;
-
- // Now check the stores are compatible.
- if (!QStore->isUnordered() || !PStore->isUnordered())
- return false;
-
- // Check that sinking the store won't cause program behavior changes. Sinking
- // the store out of the Q blocks won't change any behavior as we're sinking
- // from a block to its unconditional successor. But we're moving a store from
- // the P blocks down through the middle block (QBI) and past both QFB and QTB.
- // So we need to check that there are no aliasing loads or stores in
- // QBI, QTB and QFB. We also need to check there are no conflicting memory
- // operations between PStore and the end of its parent block.
- //
- // The ideal way to do this is to query AliasAnalysis, but we don't
- // preserve AA currently so that is dangerous. Be super safe and just
- // check there are no other memory operations at all.
- for (auto &I : *QFB->getSinglePredecessor())
- if (I.mayReadOrWriteMemory())
- return false;
- for (auto &I : *QFB)
- if (&I != QStore && I.mayReadOrWriteMemory())
- return false;
- if (QTB)
- for (auto &I : *QTB)
- if (&I != QStore && I.mayReadOrWriteMemory())
- return false;
- for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end();
- I != E; ++I)
- if (&*I != PStore && I->mayReadOrWriteMemory())
- return false;
-
- // If we're not in aggressive mode, we only optimize if we have some
- // confidence that by optimizing we'll allow P and/or Q to be if-converted.
- auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) {
- if (!BB)
- return true;
- // Heuristic: if the block can be if-converted/phi-folded and the
- // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
- // thread this store.
- int BudgetRemaining =
- PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
- for (auto &I : BB->instructionsWithoutDebug()) {
- // Consider terminator instruction to be free.
- if (I.isTerminator())
- continue;
-      // If this is one of the stores that we want to speculate out of this BB,
-      // then don't count its cost; consider it to be free.
- if (auto *S = dyn_cast<StoreInst>(&I))
-        if (llvm::is_contained(FreeStores, S))
- continue;
-      // Else, we have a white-list of instructions that we are okay speculating.
- if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))
- return false; // Not in white-list - not worthwhile folding.
- // And finally, if this is a non-free instruction that we are okay
- // speculating, ensure that we consider the speculation budget.
- BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
- if (BudgetRemaining < 0)
- return false; // Eagerly refuse to fold as soon as we're out of budget.
- }
- assert(BudgetRemaining >= 0 &&
- "When we run out of budget we will eagerly return from within the "
- "per-instruction loop.");
- return true;
- };
-
+ // For every pointer, there must be exactly two stores, one coming from
+ // PTB or PFB, and the other from QTB or QFB. We don't support more than one
+ // store (to any address) in PTB,PFB or QTB,QFB.
+ // FIXME: We could relax this restriction with a bit more work and performance
+ // testing.
+ StoreInst *PStore = findUniqueStoreInBlocks(PTB, PFB);
+ StoreInst *QStore = findUniqueStoreInBlocks(QTB, QFB);
+ if (!PStore || !QStore)
+ return false;
+
+ // Now check the stores are compatible.
+ if (!QStore->isUnordered() || !PStore->isUnordered())
+ return false;
+
+ // Check that sinking the store won't cause program behavior changes. Sinking
+ // the store out of the Q blocks won't change any behavior as we're sinking
+ // from a block to its unconditional successor. But we're moving a store from
+ // the P blocks down through the middle block (QBI) and past both QFB and QTB.
+ // So we need to check that there are no aliasing loads or stores in
+ // QBI, QTB and QFB. We also need to check there are no conflicting memory
+ // operations between PStore and the end of its parent block.
+ //
+ // The ideal way to do this is to query AliasAnalysis, but we don't
+ // preserve AA currently so that is dangerous. Be super safe and just
+ // check there are no other memory operations at all.
+ for (auto &I : *QFB->getSinglePredecessor())
+ if (I.mayReadOrWriteMemory())
+ return false;
+ for (auto &I : *QFB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ if (QTB)
+ for (auto &I : *QTB)
+ if (&I != QStore && I.mayReadOrWriteMemory())
+ return false;
+ for (auto I = BasicBlock::iterator(PStore), E = PStore->getParent()->end();
+ I != E; ++I)
+ if (&*I != PStore && I->mayReadOrWriteMemory())
+ return false;
+
+ // If we're not in aggressive mode, we only optimize if we have some
+ // confidence that by optimizing we'll allow P and/or Q to be if-converted.
+ auto IsWorthwhile = [&](BasicBlock *BB, ArrayRef<StoreInst *> FreeStores) {
+ if (!BB)
+ return true;
+ // Heuristic: if the block can be if-converted/phi-folded and the
+ // instructions inside are all cheap (arithmetic/GEPs), it's worthwhile to
+ // thread this store.
+ int BudgetRemaining =
+ PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic;
+ for (auto &I : BB->instructionsWithoutDebug()) {
+ // Consider terminator instruction to be free.
+ if (I.isTerminator())
+ continue;
+      // If this is one of the stores that we want to speculate out of this BB,
+      // then don't count its cost; consider it to be free.
+ if (auto *S = dyn_cast<StoreInst>(&I))
+        if (llvm::is_contained(FreeStores, S))
+ continue;
+      // Else, we have a white-list of instructions that we are okay speculating.
+ if (!isa<BinaryOperator>(I) && !isa<GetElementPtrInst>(I))
+ return false; // Not in white-list - not worthwhile folding.
+ // And finally, if this is a non-free instruction that we are okay
+ // speculating, ensure that we consider the speculation budget.
+ BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
+ if (BudgetRemaining < 0)
+ return false; // Eagerly refuse to fold as soon as we're out of budget.
+ }
+ assert(BudgetRemaining >= 0 &&
+ "When we run out of budget we will eagerly return from within the "
+ "per-instruction loop.");
+ return true;
+ };
+
const std::array<StoreInst *, 2> FreeStores = {PStore, QStore};
- if (!MergeCondStoresAggressively &&
- (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) ||
- !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores)))
- return false;
-
- // If PostBB has more than two predecessors, we need to split it so we can
- // sink the store.
- if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
- // We know that QFB's only successor is PostBB. And QFB has a single
- // predecessor. If QTB exists, then its only successor is also PostBB.
- // If QTB does not exist, then QFB's only predecessor has a conditional
- // branch to QFB and PostBB.
- BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
+ if (!MergeCondStoresAggressively &&
+ (!IsWorthwhile(PTB, FreeStores) || !IsWorthwhile(PFB, FreeStores) ||
+ !IsWorthwhile(QTB, FreeStores) || !IsWorthwhile(QFB, FreeStores)))
+ return false;
+
+ // If PostBB has more than two predecessors, we need to split it so we can
+ // sink the store.
+ if (std::next(pred_begin(PostBB), 2) != pred_end(PostBB)) {
+ // We know that QFB's only successor is PostBB. And QFB has a single
+ // predecessor. If QTB exists, then its only successor is also PostBB.
+ // If QTB does not exist, then QFB's only predecessor has a conditional
+ // branch to QFB and PostBB.
+ BasicBlock *TruePred = QTB ? QTB : QFB->getSinglePredecessor();
BasicBlock *NewBB =
SplitBlockPredecessors(PostBB, {QFB, TruePred}, "condstore.split", DTU);
- if (!NewBB)
- return false;
- PostBB = NewBB;
- }
-
- // OK, we're going to sink the stores to PostBB. The store has to be
- // conditional though, so first create the predicate.
- Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
- Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
- ->getCondition();
-
- Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
- PStore->getParent());
- Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(),
- QStore->getParent(), PPHI);
-
- IRBuilder<> QB(&*PostBB->getFirstInsertionPt());
-
- Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
- Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
-
- if (InvertPCond)
- PPred = QB.CreateNot(PPred);
- if (InvertQCond)
- QPred = QB.CreateNot(QPred);
- Value *CombinedPred = QB.CreateOr(PPred, QPred);
-
+ if (!NewBB)
+ return false;
+ PostBB = NewBB;
+ }
+
+ // OK, we're going to sink the stores to PostBB. The store has to be
+ // conditional though, so first create the predicate.
+ Value *PCond = cast<BranchInst>(PFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+ Value *QCond = cast<BranchInst>(QFB->getSinglePredecessor()->getTerminator())
+ ->getCondition();
+
+ Value *PPHI = ensureValueAvailableInSuccessor(PStore->getValueOperand(),
+ PStore->getParent());
+ Value *QPHI = ensureValueAvailableInSuccessor(QStore->getValueOperand(),
+ QStore->getParent(), PPHI);
+
+ IRBuilder<> QB(&*PostBB->getFirstInsertionPt());
+
+ Value *PPred = PStore->getParent() == PTB ? PCond : QB.CreateNot(PCond);
+ Value *QPred = QStore->getParent() == QTB ? QCond : QB.CreateNot(QCond);
+
+ if (InvertPCond)
+ PPred = QB.CreateNot(PPred);
+ if (InvertQCond)
+ QPred = QB.CreateNot(QPred);
+ Value *CombinedPred = QB.CreateOr(PPred, QPred);
+
auto *T = SplitBlockAndInsertIfThen(CombinedPred, &*QB.GetInsertPoint(),
/*Unreachable=*/false,
/*BranchWeights=*/nullptr, DTU);
- QB.SetInsertPoint(T);
- StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
- AAMDNodes AAMD;
- PStore->getAAMetadata(AAMD, /*Merge=*/false);
- PStore->getAAMetadata(AAMD, /*Merge=*/true);
- SI->setAAMetadata(AAMD);
- // Choose the minimum alignment. If we could prove both stores execute, we
-  // could use the biggest one. In this case, though, we only know that one of the
- // stores executes. And we don't know it's safe to take the alignment from a
- // store that doesn't execute.
- SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign()));
-
- QStore->eraseFromParent();
- PStore->eraseFromParent();
-
- return true;
-}
-
-static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
+ QB.SetInsertPoint(T);
+ StoreInst *SI = cast<StoreInst>(QB.CreateStore(QPHI, Address));
+ AAMDNodes AAMD;
+ PStore->getAAMetadata(AAMD, /*Merge=*/false);
+ PStore->getAAMetadata(AAMD, /*Merge=*/true);
+ SI->setAAMetadata(AAMD);
+ // Choose the minimum alignment. If we could prove both stores execute, we
+  // could use the biggest one. In this case, though, we only know that one of the
+ // stores executes. And we don't know it's safe to take the alignment from a
+ // store that doesn't execute.
+ SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign()));
+
+ QStore->eraseFromParent();
+ PStore->eraseFromParent();
+
+ return true;
+}
+
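
A rough source-level picture of the merge performed above (invented names; it assumes both stored values are already safe to compute up front, as the worthwhileness checks require): the two conditional stores to the same address become one store guarded by the OR of the conditions, with the stored value chosen by which path would have written last.

    #include <cstdio>

    void before(bool p, bool q, int a, int b, int *addr) {
      if (p)
        *addr = a;               // store on the P side
      if (q)
        *addr = b;               // store on the Q side (wins if both fire)
    }

    void after(bool p, bool q, int a, int b, int *addr) {
      if (p | q)                 // combined predicate
        *addr = q ? b : a;       // value selected by which path would have stored last
    }

    int main() {
      int x = 0, y = 0;
      before(true, false, 1, 2, &x);
      after(true, false, 1, 2, &y);
      std::printf("%d %d\n", x, y);   // 1 1
      return 0;
    }
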
+static bool mergeConditionalStores(BranchInst *PBI, BranchInst *QBI,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- // The intention here is to find diamonds or triangles (see below) where each
- // conditional block contains a store to the same address. Both of these
- // stores are conditional, so they can't be unconditionally sunk. But it may
- // be profitable to speculatively sink the stores into one merged store at the
- // end, and predicate the merged store on the union of the two conditions of
- // PBI and QBI.
- //
- // This can reduce the number of stores executed if both of the conditions are
- // true, and can allow the blocks to become small enough to be if-converted.
- // This optimization will also chain, so that ladders of test-and-set
- // sequences can be if-converted away.
- //
- // We only deal with simple diamonds or triangles:
- //
- // PBI or PBI or a combination of the two
- // / \ | \
- // PTB PFB | PFB
- // \ / | /
- // QBI QBI
- // / \ | \
- // QTB QFB | QFB
- // \ / | /
- // PostBB PostBB
- //
- // We model triangles as a type of diamond with a nullptr "true" block.
- // Triangles are canonicalized so that the fallthrough edge is represented by
- // a true condition, as in the diagram above.
- BasicBlock *PTB = PBI->getSuccessor(0);
- BasicBlock *PFB = PBI->getSuccessor(1);
- BasicBlock *QTB = QBI->getSuccessor(0);
- BasicBlock *QFB = QBI->getSuccessor(1);
- BasicBlock *PostBB = QFB->getSingleSuccessor();
-
- // Make sure we have a good guess for PostBB. If QTB's only successor is
- // QFB, then QFB is a better PostBB.
- if (QTB->getSingleSuccessor() == QFB)
- PostBB = QFB;
-
- // If we couldn't find a good PostBB, stop.
- if (!PostBB)
- return false;
-
- bool InvertPCond = false, InvertQCond = false;
- // Canonicalize fallthroughs to the true branches.
- if (PFB == QBI->getParent()) {
- std::swap(PFB, PTB);
- InvertPCond = true;
- }
- if (QFB == PostBB) {
- std::swap(QFB, QTB);
- InvertQCond = true;
- }
-
- // From this point on we can assume PTB or QTB may be fallthroughs but PFB
- // and QFB may not. Model fallthroughs as a nullptr block.
- if (PTB == QBI->getParent())
- PTB = nullptr;
- if (QTB == PostBB)
- QTB = nullptr;
-
- // Legality bailouts. We must have at least the non-fallthrough blocks and
- // the post-dominating block, and the non-fallthroughs must only have one
- // predecessor.
- auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
- return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
- };
- if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
- !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
- return false;
- if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
- (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
- return false;
- if (!QBI->getParent()->hasNUses(2))
- return false;
-
- // OK, this is a sequence of two diamonds or triangles.
- // Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
- SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
- for (auto *BB : {PTB, PFB}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
- PStoreAddresses.insert(SI->getPointerOperand());
- }
- for (auto *BB : {QTB, QFB}) {
- if (!BB)
- continue;
- for (auto &I : *BB)
- if (StoreInst *SI = dyn_cast<StoreInst>(&I))
- QStoreAddresses.insert(SI->getPointerOperand());
- }
-
- set_intersect(PStoreAddresses, QStoreAddresses);
- // set_intersect mutates PStoreAddresses in place. Rename it here to make it
- // clear what it contains.
- auto &CommonAddresses = PStoreAddresses;
-
- bool Changed = false;
- for (auto *Address : CommonAddresses)
+ const TargetTransformInfo &TTI) {
+ // The intention here is to find diamonds or triangles (see below) where each
+ // conditional block contains a store to the same address. Both of these
+ // stores are conditional, so they can't be unconditionally sunk. But it may
+ // be profitable to speculatively sink the stores into one merged store at the
+ // end, and predicate the merged store on the union of the two conditions of
+ // PBI and QBI.
+ //
+ // This can reduce the number of stores executed if both of the conditions are
+ // true, and can allow the blocks to become small enough to be if-converted.
+ // This optimization will also chain, so that ladders of test-and-set
+ // sequences can be if-converted away.
+ //
+ // We only deal with simple diamonds or triangles:
+ //
+ // PBI or PBI or a combination of the two
+ // / \ | \
+ // PTB PFB | PFB
+ // \ / | /
+ // QBI QBI
+ // / \ | \
+ // QTB QFB | QFB
+ // \ / | /
+ // PostBB PostBB
+ //
+ // We model triangles as a type of diamond with a nullptr "true" block.
+ // Triangles are canonicalized so that the fallthrough edge is represented by
+ // a true condition, as in the diagram above.
+ BasicBlock *PTB = PBI->getSuccessor(0);
+ BasicBlock *PFB = PBI->getSuccessor(1);
+ BasicBlock *QTB = QBI->getSuccessor(0);
+ BasicBlock *QFB = QBI->getSuccessor(1);
+ BasicBlock *PostBB = QFB->getSingleSuccessor();
+
+ // Make sure we have a good guess for PostBB. If QTB's only successor is
+ // QFB, then QFB is a better PostBB.
+ if (QTB->getSingleSuccessor() == QFB)
+ PostBB = QFB;
+
+ // If we couldn't find a good PostBB, stop.
+ if (!PostBB)
+ return false;
+
+ bool InvertPCond = false, InvertQCond = false;
+ // Canonicalize fallthroughs to the true branches.
+ if (PFB == QBI->getParent()) {
+ std::swap(PFB, PTB);
+ InvertPCond = true;
+ }
+ if (QFB == PostBB) {
+ std::swap(QFB, QTB);
+ InvertQCond = true;
+ }
+
+ // From this point on we can assume PTB or QTB may be fallthroughs but PFB
+ // and QFB may not. Model fallthroughs as a nullptr block.
+ if (PTB == QBI->getParent())
+ PTB = nullptr;
+ if (QTB == PostBB)
+ QTB = nullptr;
+
+ // Legality bailouts. We must have at least the non-fallthrough blocks and
+ // the post-dominating block, and the non-fallthroughs must only have one
+ // predecessor.
+ auto HasOnePredAndOneSucc = [](BasicBlock *BB, BasicBlock *P, BasicBlock *S) {
+ return BB->getSinglePredecessor() == P && BB->getSingleSuccessor() == S;
+ };
+ if (!HasOnePredAndOneSucc(PFB, PBI->getParent(), QBI->getParent()) ||
+ !HasOnePredAndOneSucc(QFB, QBI->getParent(), PostBB))
+ return false;
+ if ((PTB && !HasOnePredAndOneSucc(PTB, PBI->getParent(), QBI->getParent())) ||
+ (QTB && !HasOnePredAndOneSucc(QTB, QBI->getParent(), PostBB)))
+ return false;
+ if (!QBI->getParent()->hasNUses(2))
+ return false;
+
+ // OK, this is a sequence of two diamonds or triangles.
+ // Check if there are stores in PTB or PFB that are repeated in QTB or QFB.
+ SmallPtrSet<Value *, 4> PStoreAddresses, QStoreAddresses;
+ for (auto *BB : {PTB, PFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ PStoreAddresses.insert(SI->getPointerOperand());
+ }
+ for (auto *BB : {QTB, QFB}) {
+ if (!BB)
+ continue;
+ for (auto &I : *BB)
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ QStoreAddresses.insert(SI->getPointerOperand());
+ }
+
+ set_intersect(PStoreAddresses, QStoreAddresses);
+ // set_intersect mutates PStoreAddresses in place. Rename it here to make it
+ // clear what it contains.
+ auto &CommonAddresses = PStoreAddresses;
+
+ bool Changed = false;
+ for (auto *Address : CommonAddresses)
Changed |=
mergeConditionalStoreToAddress(PTB, PFB, QTB, QFB, PostBB, Address,
InvertPCond, InvertQCond, DTU, DL, TTI);
- return Changed;
-}
-
-/// If the previous block ended with a widenable branch, determine if reusing
-/// the target block is profitable and legal. This will have the effect of
-/// "widening" PBI, but doesn't require us to reason about hoisting safety.
+ return Changed;
+}
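As the comment above notes, the transform chains: a "ladder" of test-and-set blocks, each conditionally storing the same flag, can be merged rung by rung and eventually if-converted. A hypothetical input of that shape (names and constants invented) looks like:

  define void @ladder(i1 %c0, i1 %c1, i1 %c2, i8* %flag) {
  entry:
    br i1 %c0, label %set0, label %test1
  set0:
    store i8 1, i8* %flag
    br label %test1
  test1:
    br i1 %c1, label %set1, label %test2
  set1:
    store i8 1, i8* %flag
    br label %test2
  test2:
    br i1 %c2, label %set2, label %exit
  set2:
    store i8 1, i8* %flag
    br label %exit
  exit:
    ret void
  }

After repeated application, the three conditional stores collapse, roughly speaking, into a single store of 1 guarded by the OR of the three conditions.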
+
+/// If the previous block ended with a widenable branch, determine if reusing
+/// the target block is profitable and legal. This will have the effect of
+/// "widening" PBI, but doesn't require us to reason about hoisting safety.
static bool tryWidenCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
DomTreeUpdater *DTU) {
- // TODO: This can be generalized in two important ways:
- // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input
- // values from the PBI edge.
- // 2) We can sink side effecting instructions into BI's fallthrough
-  //    successor provided they don't contribute to the computation of
- // BI's condition.
- Value *CondWB, *WC;
- BasicBlock *IfTrueBB, *IfFalseBB;
- if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) ||
- IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor())
- return false;
- if (!IfFalseBB->phis().empty())
- return false; // TODO
- // Use lambda to lazily compute expensive condition after cheap ones.
- auto NoSideEffects = [](BasicBlock &BB) {
- return !llvm::any_of(BB, [](const Instruction &I) {
- return I.mayWriteToMemory() || I.mayHaveSideEffects();
- });
- };
- if (BI->getSuccessor(1) != IfFalseBB && // no inf looping
- BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability
- NoSideEffects(*BI->getParent())) {
+ // TODO: This can be generalized in two important ways:
+ // 1) We can allow phi nodes in IfFalseBB and simply reuse all the input
+ // values from the PBI edge.
+ // 2) We can sink side effecting instructions into BI's fallthrough
+  //    successor provided they don't contribute to the computation of
+ // BI's condition.
+ Value *CondWB, *WC;
+ BasicBlock *IfTrueBB, *IfFalseBB;
+ if (!parseWidenableBranch(PBI, CondWB, WC, IfTrueBB, IfFalseBB) ||
+ IfTrueBB != BI->getParent() || !BI->getParent()->getSinglePredecessor())
+ return false;
+ if (!IfFalseBB->phis().empty())
+ return false; // TODO
+ // Use lambda to lazily compute expensive condition after cheap ones.
+ auto NoSideEffects = [](BasicBlock &BB) {
+ return !llvm::any_of(BB, [](const Instruction &I) {
+ return I.mayWriteToMemory() || I.mayHaveSideEffects();
+ });
+ };
+ if (BI->getSuccessor(1) != IfFalseBB && // no inf looping
+ BI->getSuccessor(1)->getTerminatingDeoptimizeCall() && // profitability
+ NoSideEffects(*BI->getParent())) {
auto *OldSuccessor = BI->getSuccessor(1);
OldSuccessor->removePredecessor(BI->getParent());
- BI->setSuccessor(1, IfFalseBB);
+ BI->setSuccessor(1, IfFalseBB);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Insert, BI->getParent(), IfFalseBB},
{DominatorTree::Delete, BI->getParent(), OldSuccessor}});
- return true;
- }
- if (BI->getSuccessor(0) != IfFalseBB && // no inf looping
- BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability
- NoSideEffects(*BI->getParent())) {
+ return true;
+ }
+ if (BI->getSuccessor(0) != IfFalseBB && // no inf looping
+ BI->getSuccessor(0)->getTerminatingDeoptimizeCall() && // profitability
+ NoSideEffects(*BI->getParent())) {
auto *OldSuccessor = BI->getSuccessor(0);
OldSuccessor->removePredecessor(BI->getParent());
- BI->setSuccessor(0, IfFalseBB);
+ BI->setSuccessor(0, IfFalseBB);
if (DTU)
DTU->applyUpdates(
{{DominatorTree::Insert, BI->getParent(), IfFalseBB},
{DominatorTree::Delete, BI->getParent(), OldSuccessor}});
- return true;
- }
- return false;
-}
-
-/// If we have a conditional branch as a predecessor of another block,
-/// this function tries to simplify it. We know
-/// that PBI and BI are both conditional branches, and BI is in one of the
-/// successor blocks of PBI - PBI branches to BI.
-static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
+ return true;
+ }
+ return false;
+}
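A concrete (hypothetical) instance of the pattern this handles: PBI is a widenable branch guarding %guarded, BI's failing edge leads to a second deoptimizing block, and %guarded has no side effects. The intrinsic declarations are the standard guard/deoptimize ones; the remaining names are invented for illustration.

  declare i1 @llvm.experimental.widenable.condition()
  declare void @llvm.experimental.deoptimize.isVoid(...)

  define void @widen(i1 %c1, i1 %c2) {
  entry:
    %wc = call i1 @llvm.experimental.widenable.condition()
    %guard = and i1 %c1, %wc
    br i1 %guard, label %guarded, label %deopt    ; PBI (widenable branch)
  guarded:                                        ; BI, no side effects
    br i1 %c2, label %cont, label %deopt2
  cont:
    ret void
  deopt:                                          ; IfFalseBB
    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
    ret void
  deopt2:
    call void (...) @llvm.experimental.deoptimize.isVoid() [ "deopt"() ]
    ret void
  }

The transform retargets %guarded's failing edge from %deopt2 to %deopt, effectively widening the original guard to cover %c2 as well; %deopt2 is left without predecessors and gets cleaned up separately.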
+
+/// If we have a conditional branch as a predecessor of another block,
+/// this function tries to simplify it. We know
+/// that PBI and BI are both conditional branches, and BI is in one of the
+/// successor blocks of PBI - PBI branches to BI.
+static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
DomTreeUpdater *DTU,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- assert(PBI->isConditional() && BI->isConditional());
- BasicBlock *BB = BI->getParent();
-
- // If this block ends with a branch instruction, and if there is a
- // predecessor that ends on a branch of the same condition, make
- // this conditional branch redundant.
- if (PBI->getCondition() == BI->getCondition() &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
- // Okay, the outcome of this conditional branch is statically
- // knowable. If this block had a single pred, handle specially.
- if (BB->getSinglePredecessor()) {
- // Turn this into a branch on constant.
- bool CondIsTrue = PBI->getSuccessor(0) == BB;
- BI->setCondition(
- ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
- return true; // Nuke the branch on constant.
- }
-
- // Otherwise, if there are multiple predecessors, insert a PHI that merges
- // in the constant and simplify the block result. Subsequent passes of
- // simplifycfg will thread the block.
- if (BlockIsSimpleEnoughToThreadThrough(BB)) {
- pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
- PHINode *NewPN = PHINode::Create(
- Type::getInt1Ty(BB->getContext()), std::distance(PB, PE),
- BI->getCondition()->getName() + ".pr", &BB->front());
- // Okay, we're going to insert the PHI node. Since PBI is not the only
- // predecessor, compute the PHI'd conditional value for all of the preds.
- // Any predecessor where the condition is not computable we keep symbolic.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
- PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
- PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
- bool CondIsTrue = PBI->getSuccessor(0) == BB;
- NewPN->addIncoming(
- ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
- P);
- } else {
- NewPN->addIncoming(BI->getCondition(), P);
- }
- }
-
- BI->setCondition(NewPN);
- return true;
- }
- }
-
- // If the previous block ended with a widenable branch, determine if reusing
- // the target block is profitable and legal. This will have the effect of
-  // "widening" PBI, but doesn't require us to reason about hoisting safety.
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ assert(PBI->isConditional() && BI->isConditional());
+ BasicBlock *BB = BI->getParent();
+
+ // If this block ends with a branch instruction, and if there is a
+ // predecessor that ends on a branch of the same condition, make
+ // this conditional branch redundant.
+ if (PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ // Okay, the outcome of this conditional branch is statically
+ // knowable. If this block had a single pred, handle specially.
+ if (BB->getSinglePredecessor()) {
+ // Turn this into a branch on constant.
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ BI->setCondition(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue));
+ return true; // Nuke the branch on constant.
+ }
+
+ // Otherwise, if there are multiple predecessors, insert a PHI that merges
+ // in the constant and simplify the block result. Subsequent passes of
+ // simplifycfg will thread the block.
+ if (BlockIsSimpleEnoughToThreadThrough(BB)) {
+ pred_iterator PB = pred_begin(BB), PE = pred_end(BB);
+ PHINode *NewPN = PHINode::Create(
+ Type::getInt1Ty(BB->getContext()), std::distance(PB, PE),
+ BI->getCondition()->getName() + ".pr", &BB->front());
+ // Okay, we're going to insert the PHI node. Since PBI is not the only
+ // predecessor, compute the PHI'd conditional value for all of the preds.
+ // Any predecessor where the condition is not computable we keep symbolic.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if ((PBI = dyn_cast<BranchInst>(P->getTerminator())) && PBI != BI &&
+ PBI->isConditional() && PBI->getCondition() == BI->getCondition() &&
+ PBI->getSuccessor(0) != PBI->getSuccessor(1)) {
+ bool CondIsTrue = PBI->getSuccessor(0) == BB;
+ NewPN->addIncoming(
+ ConstantInt::get(Type::getInt1Ty(BB->getContext()), CondIsTrue),
+ P);
+ } else {
+ NewPN->addIncoming(BI->getCondition(), P);
+ }
+ }
+
+ BI->setCondition(NewPN);
+ return true;
+ }
+ }
+
+ // If the previous block ended with a widenable branch, determine if reusing
+ // the target block is profitable and legal. This will have the effect of
+  // "widening" PBI, but doesn't require us to reason about hoisting safety.
if (tryWidenCondBranchToCondBranch(PBI, BI, DTU))
- return true;
-
- if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
- if (CE->canTrap())
- return false;
-
- // If both branches are conditional and both contain stores to the same
- // address, remove the stores from the conditionals and create a conditional
- // merged store at the end.
+ return true;
+
+ if (auto *CE = dyn_cast<ConstantExpr>(BI->getCondition()))
+ if (CE->canTrap())
+ return false;
+
+ // If both branches are conditional and both contain stores to the same
+ // address, remove the stores from the conditionals and create a conditional
+ // merged store at the end.
if (MergeCondStores && mergeConditionalStores(PBI, BI, DTU, DL, TTI))
- return true;
-
- // If this is a conditional branch in an empty block, and if any
- // predecessors are a conditional branch to one of our destinations,
- // fold the conditions into logical ops and one cond br.
-
- // Ignore dbg intrinsics.
- if (&*BB->instructionsWithoutDebug().begin() != BI)
- return false;
-
- int PBIOp, BIOp;
- if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
- PBIOp = 0;
- BIOp = 0;
- } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
- PBIOp = 0;
- BIOp = 1;
- } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
- PBIOp = 1;
- BIOp = 0;
- } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
- PBIOp = 1;
- BIOp = 1;
- } else {
- return false;
- }
-
- // Check to make sure that the other destination of this branch
- // isn't BB itself. If so, this is an infinite loop that will
- // keep getting unwound.
- if (PBI->getSuccessor(PBIOp) == BB)
- return false;
-
- // Do not perform this transformation if it would require
- // insertion of a large number of select instructions. For targets
- // without predication/cmovs, this is a big pessimization.
-
- // Also do not perform this transformation if any phi node in the common
- // destination block can trap when reached by BB or PBB (PR17073). In that
- // case, it would be unsafe to hoist the operation into a select instruction.
-
- BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
+ return true;
+
+ // If this is a conditional branch in an empty block, and if any
+ // predecessors are a conditional branch to one of our destinations,
+ // fold the conditions into logical ops and one cond br.
+
+ // Ignore dbg intrinsics.
+ if (&*BB->instructionsWithoutDebug().begin() != BI)
+ return false;
+
+ int PBIOp, BIOp;
+ if (PBI->getSuccessor(0) == BI->getSuccessor(0)) {
+ PBIOp = 0;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(0) == BI->getSuccessor(1)) {
+ PBIOp = 0;
+ BIOp = 1;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(0)) {
+ PBIOp = 1;
+ BIOp = 0;
+ } else if (PBI->getSuccessor(1) == BI->getSuccessor(1)) {
+ PBIOp = 1;
+ BIOp = 1;
+ } else {
+ return false;
+ }
+
+ // Check to make sure that the other destination of this branch
+ // isn't BB itself. If so, this is an infinite loop that will
+ // keep getting unwound.
+ if (PBI->getSuccessor(PBIOp) == BB)
+ return false;
+
+ // Do not perform this transformation if it would require
+ // insertion of a large number of select instructions. For targets
+ // without predication/cmovs, this is a big pessimization.
+
+ // Also do not perform this transformation if any phi node in the common
+ // destination block can trap when reached by BB or PBB (PR17073). In that
+ // case, it would be unsafe to hoist the operation into a select instruction.
+
+ BasicBlock *CommonDest = PBI->getSuccessor(PBIOp);
BasicBlock *RemovedDest = PBI->getSuccessor(PBIOp ^ 1);
- unsigned NumPhis = 0;
- for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
- ++II, ++NumPhis) {
- if (NumPhis > 2) // Disable this xform.
- return false;
-
- PHINode *PN = cast<PHINode>(II);
- Value *BIV = PN->getIncomingValueForBlock(BB);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
- if (CE->canTrap())
- return false;
-
- unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
- Value *PBIV = PN->getIncomingValue(PBBIdx);
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
- if (CE->canTrap())
- return false;
- }
-
- // Finally, if everything is ok, fold the branches to logical ops.
- BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
-
- LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
- << "AND: " << *BI->getParent());
-
+ unsigned NumPhis = 0;
+ for (BasicBlock::iterator II = CommonDest->begin(); isa<PHINode>(II);
+ ++II, ++NumPhis) {
+ if (NumPhis > 2) // Disable this xform.
+ return false;
+
+ PHINode *PN = cast<PHINode>(II);
+ Value *BIV = PN->getIncomingValueForBlock(BB);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BIV))
+ if (CE->canTrap())
+ return false;
+
+ unsigned PBBIdx = PN->getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN->getIncomingValue(PBBIdx);
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(PBIV))
+ if (CE->canTrap())
+ return false;
+ }
+
+ // Finally, if everything is ok, fold the branches to logical ops.
+ BasicBlock *OtherDest = BI->getSuccessor(BIOp ^ 1);
+
+ LLVM_DEBUG(dbgs() << "FOLDING BRs:" << *PBI->getParent()
+ << "AND: " << *BI->getParent());
+
SmallVector<DominatorTree::UpdateType, 5> Updates;
- // If OtherDest *is* BB, then BB is a basic block with a single conditional
- // branch in it, where one edge (OtherDest) goes back to itself but the other
- // exits. We don't *know* that the program avoids the infinite loop
- // (even though that seems likely). If we do this xform naively, we'll end up
- // recursively unpeeling the loop. Since we know that (after the xform is
-  // done) the block *is* infinite if reached, we just make it an obviously
- // infinite loop with no cond branch.
- if (OtherDest == BB) {
-    // Insert it at the end of the function, because it's either dead code,
-    // or it won't matter if it's hot. :)
- BasicBlock *InfLoopBlock =
- BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
- BranchInst::Create(InfLoopBlock, InfLoopBlock);
+ // If OtherDest *is* BB, then BB is a basic block with a single conditional
+ // branch in it, where one edge (OtherDest) goes back to itself but the other
+ // exits. We don't *know* that the program avoids the infinite loop
+ // (even though that seems likely). If we do this xform naively, we'll end up
+ // recursively unpeeling the loop. Since we know that (after the xform is
+  // done) the block *is* infinite if reached, we just make it an obviously
+ // infinite loop with no cond branch.
+ if (OtherDest == BB) {
+    // Insert it at the end of the function, because it's either dead code,
+    // or it won't matter if it's hot. :)
+ BasicBlock *InfLoopBlock =
+ BasicBlock::Create(BB->getContext(), "infloop", BB->getParent());
+ BranchInst::Create(InfLoopBlock, InfLoopBlock);
Updates.push_back({DominatorTree::Insert, InfLoopBlock, InfLoopBlock});
- OtherDest = InfLoopBlock;
- }
-
- LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
-
- // BI may have other predecessors. Because of this, we leave
- // it alone, but modify PBI.
-
- // Make sure we get to CommonDest on True&True directions.
- Value *PBICond = PBI->getCondition();
- IRBuilder<NoFolder> Builder(PBI);
- if (PBIOp)
- PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
-
- Value *BICond = BI->getCondition();
- if (BIOp)
- BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
-
- // Merge the conditions.
- Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
-
- // Modify PBI to branch on the new condition to the new dests.
- PBI->setCondition(Cond);
- PBI->setSuccessor(0, CommonDest);
- PBI->setSuccessor(1, OtherDest);
-
+ OtherDest = InfLoopBlock;
+ }
+
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // BI may have other predecessors. Because of this, we leave
+ // it alone, but modify PBI.
+
+ // Make sure we get to CommonDest on True&True directions.
+ Value *PBICond = PBI->getCondition();
+ IRBuilder<NoFolder> Builder(PBI);
+ if (PBIOp)
+ PBICond = Builder.CreateNot(PBICond, PBICond->getName() + ".not");
+
+ Value *BICond = BI->getCondition();
+ if (BIOp)
+ BICond = Builder.CreateNot(BICond, BICond->getName() + ".not");
+
+ // Merge the conditions.
+ Value *Cond = Builder.CreateOr(PBICond, BICond, "brmerge");
+
+ // Modify PBI to branch on the new condition to the new dests.
+ PBI->setCondition(Cond);
+ PBI->setSuccessor(0, CommonDest);
+ PBI->setSuccessor(1, OtherDest);
+
Updates.push_back({DominatorTree::Insert, PBI->getParent(), OtherDest});
Updates.push_back({DominatorTree::Delete, PBI->getParent(), RemovedDest});
if (DTU)
DTU->applyUpdates(Updates);
- // Update branch weight for PBI.
- uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
- uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
- bool HasWeights =
- extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
- SuccTrueWeight, SuccFalseWeight);
- if (HasWeights) {
- PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
- SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
- // The weight to CommonDest should be PredCommon * SuccTotal +
- // PredOther * SuccCommon.
- // The weight to OtherDest should be PredOther * SuccOther.
- uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
- PredOther * SuccCommon,
- PredOther * SuccOther};
-    // Halve the weights if any of them cannot fit in a uint32_t
- FitWeights(NewWeights);
-
- setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
- }
-
-  // OtherDest may have phi nodes. If so, add entries from PBI's
- // block that are identical to the entries for BI's block.
- AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
-
- // We know that the CommonDest already had an edge from PBI to
- // it. If it has PHIs though, the PHIs may have different
- // entries for BB and PBI's BB. If so, insert a select to make
- // them agree.
- for (PHINode &PN : CommonDest->phis()) {
- Value *BIV = PN.getIncomingValueForBlock(BB);
- unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent());
- Value *PBIV = PN.getIncomingValue(PBBIdx);
- if (BIV != PBIV) {
- // Insert a select in PBI to pick the right value.
- SelectInst *NV = cast<SelectInst>(
- Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
- PN.setIncomingValue(PBBIdx, NV);
- // Although the select has the same condition as PBI, the original branch
- // weights for PBI do not apply to the new select because the select's
- // 'logical' edges are incoming edges of the phi that is eliminated, not
- // the outgoing edges of PBI.
- if (HasWeights) {
- uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
- uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
- uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
- uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
- // The weight to PredCommonDest should be PredCommon * SuccTotal.
- // The weight to PredOtherDest should be PredOther * SuccCommon.
- uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
- PredOther * SuccCommon};
-
- FitWeights(NewWeights);
-
- setBranchWeights(NV, NewWeights[0], NewWeights[1]);
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
- LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
-
- // This basic block is probably dead. We know it has at least
- // one fewer predecessor.
- return true;
-}
-
-// Simplifies a terminator by replacing it with a branch to TrueBB if Cond is
-// true or to FalseBB if Cond is false.
-// Takes care of updating the successors and removing the old terminator.
-// Also makes sure not to introduce new successors by assuming that edges to
-// non-successor TrueBBs and FalseBBs aren't reachable.
-bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
- Value *Cond, BasicBlock *TrueBB,
- BasicBlock *FalseBB,
- uint32_t TrueWeight,
- uint32_t FalseWeight) {
+ // Update branch weight for PBI.
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ uint64_t PredCommon, PredOther, SuccCommon, SuccOther;
+ bool HasWeights =
+ extractPredSuccWeights(PBI, BI, PredTrueWeight, PredFalseWeight,
+ SuccTrueWeight, SuccFalseWeight);
+ if (HasWeights) {
+ PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to CommonDest should be PredCommon * SuccTotal +
+ // PredOther * SuccCommon.
+ // The weight to OtherDest should be PredOther * SuccOther.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther) +
+ PredOther * SuccCommon,
+ PredOther * SuccOther};
+    // Halve the weights if any of them cannot fit in a uint32_t
+ FitWeights(NewWeights);
+
+ setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
+ }
+
+  // OtherDest may have phi nodes. If so, add entries from PBI's
+ // block that are identical to the entries for BI's block.
+ AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
+
+ // We know that the CommonDest already had an edge from PBI to
+ // it. If it has PHIs though, the PHIs may have different
+ // entries for BB and PBI's BB. If so, insert a select to make
+ // them agree.
+ for (PHINode &PN : CommonDest->phis()) {
+ Value *BIV = PN.getIncomingValueForBlock(BB);
+ unsigned PBBIdx = PN.getBasicBlockIndex(PBI->getParent());
+ Value *PBIV = PN.getIncomingValue(PBBIdx);
+ if (BIV != PBIV) {
+ // Insert a select in PBI to pick the right value.
+ SelectInst *NV = cast<SelectInst>(
+ Builder.CreateSelect(PBICond, PBIV, BIV, PBIV->getName() + ".mux"));
+ PN.setIncomingValue(PBBIdx, NV);
+ // Although the select has the same condition as PBI, the original branch
+ // weights for PBI do not apply to the new select because the select's
+ // 'logical' edges are incoming edges of the phi that is eliminated, not
+ // the outgoing edges of PBI.
+ if (HasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to PredCommonDest should be PredCommon * SuccTotal.
+ // The weight to PredOtherDest should be PredOther * SuccCommon.
+ uint64_t NewWeights[2] = {PredCommon * (SuccCommon + SuccOther),
+ PredOther * SuccCommon};
+
+ FitWeights(NewWeights);
+
+ setBranchWeights(NV, NewWeights[0], NewWeights[1]);
+ }
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "INTO: " << *PBI->getParent());
+ LLVM_DEBUG(dbgs() << *PBI->getParent()->getParent());
+
+ // This basic block is probably dead. We know it has at least
+ // one fewer predecessor.
+ return true;
+}
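Putting the pieces together, here is a minimal (hypothetical) example of the fold and of the weight formulas in the comments above: %bb contains only the second conditional branch, the two branches share %common, and both carry branch_weights. Names and numbers are invented for illustration.

  define void @fold(i1 %p, i1 %q) {
  entry:
    br i1 %p, label %common, label %bb, !prof !0
  bb:                                     ; only a conditional branch, no phis
    br i1 %q, label %common, label %other, !prof !1
  common:
    ret void
  other:
    ret void
  }
  !0 = !{!"branch_weights", i32 3, i32 1}   ; PredCommon = 3, PredOther = 1
  !1 = !{!"branch_weights", i32 5, i32 7}   ; SuccCommon = 5, SuccOther = 7

After the fold, %entry branches on (%p | %q) straight to %common/%other; the new weights are PredCommon * (SuccCommon + SuccOther) + PredOther * SuccCommon = 3*12 + 1*5 = 41 toward %common and PredOther * SuccOther = 1*7 = 7 toward %other.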
+
+// Simplifies a terminator by replacing it with a branch to TrueBB if Cond is
+// true or to FalseBB if Cond is false.
+// Takes care of updating the successors and removing the old terminator.
+// Also makes sure not to introduce new successors by assuming that edges to
+// non-successor TrueBBs and FalseBBs aren't reachable.
+bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
+ Value *Cond, BasicBlock *TrueBB,
+ BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight) {
auto *BB = OldTerm->getParent();
- // Remove any superfluous successor edges from the CFG.
- // First, figure out which successors to preserve.
- // If TrueBB and FalseBB are equal, only try to preserve one copy of that
- // successor.
- BasicBlock *KeepEdge1 = TrueBB;
- BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
-
+ // Remove any superfluous successor edges from the CFG.
+ // First, figure out which successors to preserve.
+ // If TrueBB and FalseBB are equal, only try to preserve one copy of that
+ // successor.
+ BasicBlock *KeepEdge1 = TrueBB;
+ BasicBlock *KeepEdge2 = TrueBB != FalseBB ? FalseBB : nullptr;
+
SmallSetVector<BasicBlock *, 2> RemovedSuccessors;
- // Then remove the rest.
- for (BasicBlock *Succ : successors(OldTerm)) {
- // Make sure only to keep exactly one copy of each edge.
- if (Succ == KeepEdge1)
- KeepEdge1 = nullptr;
- else if (Succ == KeepEdge2)
- KeepEdge2 = nullptr;
+ // Then remove the rest.
+ for (BasicBlock *Succ : successors(OldTerm)) {
+ // Make sure only to keep exactly one copy of each edge.
+ if (Succ == KeepEdge1)
+ KeepEdge1 = nullptr;
+ else if (Succ == KeepEdge2)
+ KeepEdge2 = nullptr;
else {
Succ->removePredecessor(BB,
- /*KeepOneInputPHIs=*/true);
+ /*KeepOneInputPHIs=*/true);
if (Succ != TrueBB && Succ != FalseBB)
RemovedSuccessors.insert(Succ);
}
- }
-
- IRBuilder<> Builder(OldTerm);
- Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
-
- // Insert an appropriate new terminator.
- if (!KeepEdge1 && !KeepEdge2) {
+ }
+
+ IRBuilder<> Builder(OldTerm);
+ Builder.SetCurrentDebugLocation(OldTerm->getDebugLoc());
+
+ // Insert an appropriate new terminator.
+ if (!KeepEdge1 && !KeepEdge2) {
if (TrueBB == FalseBB) {
- // We were only looking for one successor, and it was present.
- // Create an unconditional branch to it.
- Builder.CreateBr(TrueBB);
+ // We were only looking for one successor, and it was present.
+ // Create an unconditional branch to it.
+ Builder.CreateBr(TrueBB);
} else {
- // We found both of the successors we were looking for.
- // Create a conditional branch sharing the condition of the select.
- BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
- if (TrueWeight != FalseWeight)
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
- }
- } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
-    // Neither of the selected blocks was a successor, so this
- // terminator must be unreachable.
- new UnreachableInst(OldTerm->getContext(), OldTerm);
- } else {
- // One of the selected values was a successor, but the other wasn't.
- // Insert an unconditional branch to the one that was found;
- // the edge to the one that wasn't must be unreachable.
+ // We found both of the successors we were looking for.
+ // Create a conditional branch sharing the condition of the select.
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ if (TrueWeight != FalseWeight)
+ setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ }
+ } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
+    // Neither of the selected blocks was a successor, so this
+ // terminator must be unreachable.
+ new UnreachableInst(OldTerm->getContext(), OldTerm);
+ } else {
+ // One of the selected values was a successor, but the other wasn't.
+ // Insert an unconditional branch to the one that was found;
+ // the edge to the one that wasn't must be unreachable.
if (!KeepEdge1) {
- // Only TrueBB was found.
- Builder.CreateBr(TrueBB);
+ // Only TrueBB was found.
+ Builder.CreateBr(TrueBB);
} else {
- // Only FalseBB was found.
- Builder.CreateBr(FalseBB);
+ // Only FalseBB was found.
+ Builder.CreateBr(FalseBB);
}
- }
-
- EraseTerminatorAndDCECond(OldTerm);
+ }
+
+ EraseTerminatorAndDCECond(OldTerm);
if (DTU) {
SmallVector<DominatorTree::UpdateType, 2> Updates;
@@ -3775,326 +3775,326 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
DTU->applyUpdates(Updates);
}
- return true;
-}
-
-// Replaces
-// (switch (select cond, X, Y)) on constant X, Y
-// with a branch - conditional if X and Y lead to distinct BBs,
-// unconditional otherwise.
-bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI,
- SelectInst *Select) {
- // Check for constant integer values in the select.
- ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
- ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
- if (!TrueVal || !FalseVal)
- return false;
-
- // Find the relevant condition and destinations.
- Value *Condition = Select->getCondition();
- BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
- BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
-
- // Get weight for TrueBB and FalseBB.
- uint32_t TrueWeight = 0, FalseWeight = 0;
- SmallVector<uint64_t, 8> Weights;
- bool HasWeights = HasBranchWeights(SI);
- if (HasWeights) {
- GetBranchWeights(SI, Weights);
- if (Weights.size() == 1 + SI->getNumCases()) {
- TrueWeight =
- (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
- FalseWeight =
- (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
- }
- }
-
- // Perform the actual simplification.
- return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
- FalseWeight);
-}
-
-// Replaces
-// (indirectbr (select cond, blockaddress(@fn, BlockA),
-// blockaddress(@fn, BlockB)))
-// with
-// (br cond, BlockA, BlockB).
-bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI,
- SelectInst *SI) {
- // Check that both operands of the select are block addresses.
- BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
- BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
- if (!TBA || !FBA)
- return false;
-
- // Extract the actual blocks.
- BasicBlock *TrueBB = TBA->getBasicBlock();
- BasicBlock *FalseBB = FBA->getBasicBlock();
-
- // Perform the actual simplification.
- return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
- 0);
-}
-
-/// This is called when we find an icmp instruction
-/// (a seteq/setne with a constant) as the only instruction in a
-/// block that ends with an uncond branch. We are looking for a very specific
-/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. In
-/// this case, we merge the first two "or's of icmp" into a switch, but then the
-/// default value goes to an uncond block with a seteq in it, and we get something
-/// like:
-///
-/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ]
-/// DEFAULT:
-/// %tmp = icmp eq i8 %A, 92
-/// br label %end
-/// end:
-/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
-///
-/// We prefer to split the edge to 'end' so that there is a true/false entry to
-/// the PHI, merging the third icmp into the switch.
-bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
- ICmpInst *ICI, IRBuilder<> &Builder) {
- BasicBlock *BB = ICI->getParent();
-
- // If the block has any PHIs in it or the icmp has multiple uses, it is too
- // complex.
- if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
- return false;
-
- Value *V = ICI->getOperand(0);
- ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
-
- // The pattern we're looking for is where our only predecessor is a switch on
- // 'V' and this block is the default case for the switch. In this case we can
- // fold the compared value into the switch to simplify things.
- BasicBlock *Pred = BB->getSinglePredecessor();
- if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
- return false;
-
- SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
- if (SI->getCondition() != V)
- return false;
-
- // If BB is reachable on a non-default case, then we simply know the value of
- // V in this block. Substitute it and constant fold the icmp instruction
- // away.
- if (SI->getDefaultDest() != BB) {
- ConstantInt *VVal = SI->findCaseDest(BB);
- assert(VVal && "Should have a unique destination value");
- ICI->setOperand(0, VVal);
-
- if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
- ICI->replaceAllUsesWith(V);
- ICI->eraseFromParent();
- }
- // BB is now empty, so it is likely to simplify away.
- return requestResimplify();
- }
-
- // Ok, the block is reachable from the default dest. If the constant we're
- // comparing exists in one of the other edges, then we can constant fold ICI
- // and zap it.
- if (SI->findCaseValue(Cst) != SI->case_default()) {
- Value *V;
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- V = ConstantInt::getFalse(BB->getContext());
- else
- V = ConstantInt::getTrue(BB->getContext());
-
- ICI->replaceAllUsesWith(V);
- ICI->eraseFromParent();
- // BB is now empty, so it is likely to simplify away.
- return requestResimplify();
- }
-
- // The use of the icmp has to be in the 'end' block, by the only PHI node in
- // the block.
- BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
- PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back());
- if (PHIUse == nullptr || PHIUse != &SuccBlock->front() ||
- isa<PHINode>(++BasicBlock::iterator(PHIUse)))
- return false;
-
- // If the icmp is a SETEQ, then the default dest gets false, the new edge gets
- // true in the PHI.
- Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
- Constant *NewCst = ConstantInt::getFalse(BB->getContext());
-
- if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
- std::swap(DefaultCst, NewCst);
-
- // Replace ICI (which is used by the PHI for the default value) with true or
- // false depending on if it is EQ or NE.
- ICI->replaceAllUsesWith(DefaultCst);
- ICI->eraseFromParent();
-
+ return true;
+}
+
+// Replaces
+// (switch (select cond, X, Y)) on constant X, Y
+// with a branch - conditional if X and Y lead to distinct BBs,
+// unconditional otherwise.
+bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI,
+ SelectInst *Select) {
+ // Check for constant integer values in the select.
+ ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
+ ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
+ if (!TrueVal || !FalseVal)
+ return false;
+
+ // Find the relevant condition and destinations.
+ Value *Condition = Select->getCondition();
+ BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+ BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
+
+ // Get weight for TrueBB and FalseBB.
+ uint32_t TrueWeight = 0, FalseWeight = 0;
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ TrueWeight =
+ (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
+ FalseWeight =
+ (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
+ }
+ }
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB, TrueWeight,
+ FalseWeight);
+}
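For example (hypothetical IR, invented names), a switch whose operand is a select of two constants can only reach the two corresponding cases, so it can be rewritten as a conditional branch:

  define i32 @switch_on_select(i1 %c) {
  entry:
    %x = select i1 %c, i32 1, i32 4
    switch i32 %x, label %default [
      i32 1, label %one
      i32 4, label %four
    ]
  one:
    ret i32 10
  four:
    ret i32 40
  default:
    ret i32 0
  }

The switch becomes 'br i1 %c, label %one, label %four'; %default loses this predecessor, and any branch weights attached to the switch are carried over to the new branch per the code above.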
+
+// Replaces
+// (indirectbr (select cond, blockaddress(@fn, BlockA),
+// blockaddress(@fn, BlockB)))
+// with
+// (br cond, BlockA, BlockB).
+bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI,
+ SelectInst *SI) {
+ // Check that both operands of the select are block addresses.
+ BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
+ BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
+ if (!TBA || !FBA)
+ return false;
+
+ // Extract the actual blocks.
+ BasicBlock *TrueBB = TBA->getBasicBlock();
+ BasicBlock *FalseBB = FBA->getBasicBlock();
+
+ // Perform the actual simplification.
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB, 0,
+ 0);
+}
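Likewise for indirectbr (hypothetical IR, invented names): when the target is a select of two block addresses, the indirect branch is a conditional branch in disguise.

  define void @ibr_on_select(i1 %c) {
  entry:
    %t = select i1 %c, i8* blockaddress(@ibr_on_select, %a),
                       i8* blockaddress(@ibr_on_select, %b)
    indirectbr i8* %t, [label %a, label %b]
  a:
    ret void
  b:
    ret void
  }

This becomes 'br i1 %c, label %a, label %b', with no branch weights attached since none are available on the indirectbr.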
+
+/// This is called when we find an icmp instruction
+/// (a seteq/setne with a constant) as the only instruction in a
+/// block that ends with an uncond branch. We are looking for a very specific
+/// pattern that occurs when "A == 1 || A == 2 || A == 3" gets simplified. In
+/// this case, we merge the first two "or's of icmp" into a switch, but then the
+/// default value goes to an uncond block with a seteq in it, and we get something
+/// like:
+///
+/// switch i8 %A, label %DEFAULT [ i8 1, label %end i8 2, label %end ]
+/// DEFAULT:
+/// %tmp = icmp eq i8 %A, 92
+/// br label %end
+/// end:
+/// ... = phi i1 [ true, %entry ], [ %tmp, %DEFAULT ], [ true, %entry ]
+///
+/// We prefer to split the edge to 'end' so that there is a true/false entry to
+/// the PHI, merging the third icmp into the switch.
+bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
+ ICmpInst *ICI, IRBuilder<> &Builder) {
+ BasicBlock *BB = ICI->getParent();
+
+ // If the block has any PHIs in it or the icmp has multiple uses, it is too
+ // complex.
+ if (isa<PHINode>(BB->begin()) || !ICI->hasOneUse())
+ return false;
+
+ Value *V = ICI->getOperand(0);
+ ConstantInt *Cst = cast<ConstantInt>(ICI->getOperand(1));
+
+ // The pattern we're looking for is where our only predecessor is a switch on
+ // 'V' and this block is the default case for the switch. In this case we can
+ // fold the compared value into the switch to simplify things.
+ BasicBlock *Pred = BB->getSinglePredecessor();
+ if (!Pred || !isa<SwitchInst>(Pred->getTerminator()))
+ return false;
+
+ SwitchInst *SI = cast<SwitchInst>(Pred->getTerminator());
+ if (SI->getCondition() != V)
+ return false;
+
+ // If BB is reachable on a non-default case, then we simply know the value of
+ // V in this block. Substitute it and constant fold the icmp instruction
+ // away.
+ if (SI->getDefaultDest() != BB) {
+ ConstantInt *VVal = SI->findCaseDest(BB);
+ assert(VVal && "Should have a unique destination value");
+ ICI->setOperand(0, VVal);
+
+ if (Value *V = SimplifyInstruction(ICI, {DL, ICI})) {
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ }
+ // BB is now empty, so it is likely to simplify away.
+ return requestResimplify();
+ }
+
+ // Ok, the block is reachable from the default dest. If the constant we're
+ // comparing exists in one of the other edges, then we can constant fold ICI
+ // and zap it.
+ if (SI->findCaseValue(Cst) != SI->case_default()) {
+ Value *V;
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ V = ConstantInt::getFalse(BB->getContext());
+ else
+ V = ConstantInt::getTrue(BB->getContext());
+
+ ICI->replaceAllUsesWith(V);
+ ICI->eraseFromParent();
+ // BB is now empty, so it is likely to simplify away.
+ return requestResimplify();
+ }
+
+ // The use of the icmp has to be in the 'end' block, by the only PHI node in
+ // the block.
+ BasicBlock *SuccBlock = BB->getTerminator()->getSuccessor(0);
+ PHINode *PHIUse = dyn_cast<PHINode>(ICI->user_back());
+ if (PHIUse == nullptr || PHIUse != &SuccBlock->front() ||
+ isa<PHINode>(++BasicBlock::iterator(PHIUse)))
+ return false;
+
+ // If the icmp is a SETEQ, then the default dest gets false, the new edge gets
+ // true in the PHI.
+ Constant *DefaultCst = ConstantInt::getTrue(BB->getContext());
+ Constant *NewCst = ConstantInt::getFalse(BB->getContext());
+
+ if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+ std::swap(DefaultCst, NewCst);
+
+ // Replace ICI (which is used by the PHI for the default value) with true or
+ // false depending on if it is EQ or NE.
+ ICI->replaceAllUsesWith(DefaultCst);
+ ICI->eraseFromParent();
+
SmallVector<DominatorTree::UpdateType, 2> Updates;
- // Okay, the switch goes to this block on a default value. Add an edge from
- // the switch to the merge point on the compared value.
- BasicBlock *NewBB =
- BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
- {
- SwitchInstProfUpdateWrapper SIW(*SI);
- auto W0 = SIW.getSuccessorWeight(0);
- SwitchInstProfUpdateWrapper::CaseWeightOpt NewW;
- if (W0) {
- NewW = ((uint64_t(*W0) + 1) >> 1);
- SIW.setSuccessorWeight(0, *NewW);
- }
- SIW.addCase(Cst, NewBB, NewW);
+ // Okay, the switch goes to this block on a default value. Add an edge from
+ // the switch to the merge point on the compared value.
+ BasicBlock *NewBB =
+ BasicBlock::Create(BB->getContext(), "switch.edge", BB->getParent(), BB);
+ {
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ auto W0 = SIW.getSuccessorWeight(0);
+ SwitchInstProfUpdateWrapper::CaseWeightOpt NewW;
+ if (W0) {
+ NewW = ((uint64_t(*W0) + 1) >> 1);
+ SIW.setSuccessorWeight(0, *NewW);
+ }
+ SIW.addCase(Cst, NewBB, NewW);
Updates.push_back({DominatorTree::Insert, Pred, NewBB});
- }
-
- // NewBB branches to the phi block, add the uncond branch and the phi entry.
- Builder.SetInsertPoint(NewBB);
- Builder.SetCurrentDebugLocation(SI->getDebugLoc());
- Builder.CreateBr(SuccBlock);
+ }
+
+ // NewBB branches to the phi block, add the uncond branch and the phi entry.
+ Builder.SetInsertPoint(NewBB);
+ Builder.SetCurrentDebugLocation(SI->getDebugLoc());
+ Builder.CreateBr(SuccBlock);
Updates.push_back({DominatorTree::Insert, NewBB, SuccBlock});
- PHIUse->addIncoming(NewCst, NewBB);
+ PHIUse->addIncoming(NewCst, NewBB);
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// The specified branch is a conditional branch.
-/// Check to see if it is branching on an or/and chain of icmp instructions, and
-/// fold it into a switch instruction if so.
-bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI,
- IRBuilder<> &Builder,
- const DataLayout &DL) {
- Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
- if (!Cond)
- return false;
-
- // Change br (X == 0 | X == 1), T, F into a switch instruction.
- // If this is a bunch of seteq's or'd together, or if it's a bunch of
- // 'setne's and'ed together, collect them.
-
- // Try to gather values from a chain of and/or to be turned into a switch
- ConstantComparesGatherer ConstantCompare(Cond, DL);
- // Unpack the result
- SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
- Value *CompVal = ConstantCompare.CompValue;
- unsigned UsedICmps = ConstantCompare.UsedICmps;
- Value *ExtraCase = ConstantCompare.Extra;
-
- // If we didn't have a multiply compared value, fail.
- if (!CompVal)
- return false;
-
- // Avoid turning single icmps into a switch.
- if (UsedICmps <= 1)
- return false;
-
+ return true;
+}
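Continuing the sketch from the comment above this function (same pseudo-IR style, not a complete module): after the transform, the switch gains a case for 92 that feeds the PHI directly, the icmp disappears, and DEFAULT becomes empty.

  switch i8 %A, label %DEFAULT [
    i8 1, label %end
    i8 2, label %end
    i8 92, label %switch.edge
  ]
  DEFAULT:                       ; now empty, likely simplified away later
    br label %end
  switch.edge:
    br label %end
  end:
    ... = phi i1 [ true, %entry ], [ false, %DEFAULT ],
                 [ true, %entry ], [ true, %switch.edge ]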
+
+/// The specified branch is a conditional branch.
+/// Check to see if it is branching on an or/and chain of icmp instructions, and
+/// fold it into a switch instruction if so.
+bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI,
+ IRBuilder<> &Builder,
+ const DataLayout &DL) {
+ Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
+ if (!Cond)
+ return false;
+
+ // Change br (X == 0 | X == 1), T, F into a switch instruction.
+ // If this is a bunch of seteq's or'd together, or if it's a bunch of
+ // 'setne's and'ed together, collect them.
+
+ // Try to gather values from a chain of and/or to be turned into a switch
+ ConstantComparesGatherer ConstantCompare(Cond, DL);
+ // Unpack the result
+ SmallVectorImpl<ConstantInt *> &Values = ConstantCompare.Vals;
+ Value *CompVal = ConstantCompare.CompValue;
+ unsigned UsedICmps = ConstantCompare.UsedICmps;
+ Value *ExtraCase = ConstantCompare.Extra;
+
+ // If we didn't have a multiply compared value, fail.
+ if (!CompVal)
+ return false;
+
+ // Avoid turning single icmps into a switch.
+ if (UsedICmps <= 1)
+ return false;
+
bool TrueWhenEqual = match(Cond, m_LogicalOr(m_Value(), m_Value()));
-
- // There might be duplicate constants in the list, which the switch
-  // instruction can't handle; remove them now.
- array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
- Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
-
- // If Extra was used, we require at least two switch values to do the
- // transformation. A switch with one value is just a conditional branch.
- if (ExtraCase && Values.size() < 2)
- return false;
-
- // TODO: Preserve branch weight metadata, similarly to how
- // FoldValueComparisonIntoPredecessors preserves it.
-
- // Figure out which block is which destination.
- BasicBlock *DefaultBB = BI->getSuccessor(1);
- BasicBlock *EdgeBB = BI->getSuccessor(0);
- if (!TrueWhenEqual)
- std::swap(DefaultBB, EdgeBB);
-
- BasicBlock *BB = BI->getParent();
-
- // MSAN does not like undefs as branch condition which can be introduced
- // with "explicit branch".
- if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
- << " cases into SWITCH. BB is:\n"
- << *BB);
-
+
+ // There might be duplicate constants in the list, which the switch
+  // instruction can't handle; remove them now.
+ array_pod_sort(Values.begin(), Values.end(), ConstantIntSortPredicate);
+ Values.erase(std::unique(Values.begin(), Values.end()), Values.end());
+
+ // If Extra was used, we require at least two switch values to do the
+ // transformation. A switch with one value is just a conditional branch.
+ if (ExtraCase && Values.size() < 2)
+ return false;
+
+ // TODO: Preserve branch weight metadata, similarly to how
+ // FoldValueComparisonIntoPredecessors preserves it.
+
+ // Figure out which block is which destination.
+ BasicBlock *DefaultBB = BI->getSuccessor(1);
+ BasicBlock *EdgeBB = BI->getSuccessor(0);
+ if (!TrueWhenEqual)
+ std::swap(DefaultBB, EdgeBB);
+
+ BasicBlock *BB = BI->getParent();
+
+ // MSAN does not like undefs as branch condition which can be introduced
+ // with "explicit branch".
+ if (ExtraCase && BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting 'icmp' chain with " << Values.size()
+ << " cases into SWITCH. BB is:\n"
+ << *BB);
+
SmallVector<DominatorTree::UpdateType, 2> Updates;
- // If there are any extra values that couldn't be folded into the switch
- // then we evaluate them with an explicit branch first. Split the block
- // right before the condbr to handle it.
- if (ExtraCase) {
+ // If there are any extra values that couldn't be folded into the switch
+ // then we evaluate them with an explicit branch first. Split the block
+ // right before the condbr to handle it.
+ if (ExtraCase) {
BasicBlock *NewBB = SplitBlock(BB, BI, DTU, /*LI=*/nullptr,
/*MSSAU=*/nullptr, "switch.early.test");
- // Remove the uncond branch added to the old block.
- Instruction *OldTI = BB->getTerminator();
- Builder.SetInsertPoint(OldTI);
-
- if (TrueWhenEqual)
- Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
- else
- Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
-
- OldTI->eraseFromParent();
-
+ // Remove the uncond branch added to the old block.
+ Instruction *OldTI = BB->getTerminator();
+ Builder.SetInsertPoint(OldTI);
+
+ if (TrueWhenEqual)
+ Builder.CreateCondBr(ExtraCase, EdgeBB, NewBB);
+ else
+ Builder.CreateCondBr(ExtraCase, NewBB, EdgeBB);
+
+ OldTI->eraseFromParent();
+
Updates.push_back({DominatorTree::Insert, BB, EdgeBB});
- // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
- // for the edge we just added.
- AddPredecessorToBlock(EdgeBB, BB, NewBB);
-
- LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
- << "\nEXTRABB = " << *BB);
- BB = NewBB;
- }
-
- Builder.SetInsertPoint(BI);
- // Convert pointer to int before we switch.
- if (CompVal->getType()->isPointerTy()) {
- CompVal = Builder.CreatePtrToInt(
- CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
- }
-
- // Create the new switch instruction now.
- SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
-
- // Add all of the 'cases' to the switch instruction.
- for (unsigned i = 0, e = Values.size(); i != e; ++i)
- New->addCase(Values[i], EdgeBB);
-
- // We added edges from PI to the EdgeBB. As such, if there were any
- // PHI nodes in EdgeBB, they need entries to be added corresponding to
- // the number of edges added.
- for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
- PHINode *PN = cast<PHINode>(BBI);
- Value *InVal = PN->getIncomingValueForBlock(BB);
- for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
- PN->addIncoming(InVal, BB);
- }
-
- // Erase the old branch instruction.
- EraseTerminatorAndDCECond(BI);
+ // If there are PHI nodes in EdgeBB, then we need to add a new entry to them
+ // for the edge we just added.
+ AddPredecessorToBlock(EdgeBB, BB, NewBB);
+
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain unhandled condition: " << *ExtraCase
+ << "\nEXTRABB = " << *BB);
+ BB = NewBB;
+ }
+
+ Builder.SetInsertPoint(BI);
+ // Convert pointer to int before we switch.
+ if (CompVal->getType()->isPointerTy()) {
+ CompVal = Builder.CreatePtrToInt(
+ CompVal, DL.getIntPtrType(CompVal->getType()), "magicptr");
+ }
+
+ // Create the new switch instruction now.
+ SwitchInst *New = Builder.CreateSwitch(CompVal, DefaultBB, Values.size());
+
+ // Add all of the 'cases' to the switch instruction.
+ for (unsigned i = 0, e = Values.size(); i != e; ++i)
+ New->addCase(Values[i], EdgeBB);
+
+ // We added edges from PI to the EdgeBB. As such, if there were any
+ // PHI nodes in EdgeBB, they need entries to be added corresponding to
+ // the number of edges added.
+ for (BasicBlock::iterator BBI = EdgeBB->begin(); isa<PHINode>(BBI); ++BBI) {
+ PHINode *PN = cast<PHINode>(BBI);
+ Value *InVal = PN->getIncomingValueForBlock(BB);
+ for (unsigned i = 0, e = Values.size() - 1; i != e; ++i)
+ PN->addIncoming(InVal, BB);
+ }
+
+ // Erase the old branch instruction.
+ EraseTerminatorAndDCECond(BI);
if (DTU)
DTU->applyUpdates(Updates);
-
- LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
- return true;
-}
-
-bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
- if (isa<PHINode>(RI->getValue()))
- return simplifyCommonResume(RI);
- else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
- RI->getValue() == RI->getParent()->getFirstNonPHI())
- // The resume must unwind the exception that caused control to branch here.
- return simplifySingleResume(RI);
-
- return false;
-}
-
+
+ LLVM_DEBUG(dbgs() << " ** 'icmp' chain result is:\n" << *BB << '\n');
+ return true;
+}
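A minimal (hypothetical) before/after for this routine, with all compares against the same value and no leftover "extra" case; names are invented for illustration.

  define void @icmp_chain(i32 %x) {
  entry:
    %c1 = icmp eq i32 %x, 0
    %c2 = icmp eq i32 %x, 1
    %or1 = or i1 %c1, %c2
    %c3 = icmp eq i32 %x, 7
    %cond = or i1 %or1, %c3
    br i1 %cond, label %then, label %else
  then:
    ret void
  else:
    ret void
  }

The branch becomes 'switch i32 %x, label %else [ i32 0, label %then  i32 1, label %then  i32 7, label %then ]'. Had there been one extra non-constant compare, the block would first be split and that compare tested with an explicit branch, as the code above does.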
+
+bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+ if (isa<PHINode>(RI->getValue()))
+ return simplifyCommonResume(RI);
+ else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
+ RI->getValue() == RI->getParent()->getFirstNonPHI())
+ // The resume must unwind the exception that caused control to branch here.
+ return simplifySingleResume(RI);
+
+ return false;
+}
+
// Check if cleanup block is empty
static bool isCleanupBlockEmpty(iterator_range<BasicBlock::iterator> R) {
for (Instruction &I : R) {
@@ -4116,234 +4116,234 @@ static bool isCleanupBlockEmpty(iterator_range<BasicBlock::iterator> R) {
return true;
}
-// Simplify resume that is shared by several landing pads (phi of landing pad).
-bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) {
- BasicBlock *BB = RI->getParent();
-
+// Simplify resume that is shared by several landing pads (phi of landing pad).
+bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+
// Check that there are no other instructions except for debug and lifetime
// intrinsics between the phi's and resume instruction.
if (!isCleanupBlockEmpty(
make_range(RI->getParent()->getFirstNonPHI(), BB->getTerminator())))
return false;
-
- SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
- auto *PhiLPInst = cast<PHINode>(RI->getValue());
-
- // Check incoming blocks to see if any of them are trivial.
- for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
- Idx++) {
- auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
- auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
-
- // If the block has other successors, we can not delete it because
- // it has other dependents.
- if (IncomingBB->getUniqueSuccessor() != BB)
- continue;
-
- auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
- // Not the landing pad that caused the control to branch here.
- if (IncomingValue != LandingPad)
- continue;
-
+
+ SmallSetVector<BasicBlock *, 4> TrivialUnwindBlocks;
+ auto *PhiLPInst = cast<PHINode>(RI->getValue());
+
+ // Check incoming blocks to see if any of them are trivial.
+ for (unsigned Idx = 0, End = PhiLPInst->getNumIncomingValues(); Idx != End;
+ Idx++) {
+ auto *IncomingBB = PhiLPInst->getIncomingBlock(Idx);
+ auto *IncomingValue = PhiLPInst->getIncomingValue(Idx);
+
+ // If the block has other successors, we can not delete it because
+ // it has other dependents.
+ if (IncomingBB->getUniqueSuccessor() != BB)
+ continue;
+
+ auto *LandingPad = dyn_cast<LandingPadInst>(IncomingBB->getFirstNonPHI());
+ // Not the landing pad that caused the control to branch here.
+ if (IncomingValue != LandingPad)
+ continue;
+
if (isCleanupBlockEmpty(
make_range(LandingPad->getNextNode(), IncomingBB->getTerminator())))
- TrivialUnwindBlocks.insert(IncomingBB);
- }
-
- // If no trivial unwind blocks, don't do any simplifications.
- if (TrivialUnwindBlocks.empty())
- return false;
-
- // Turn all invokes that unwind here into calls.
- for (auto *TrivialBB : TrivialUnwindBlocks) {
- // Blocks that will be simplified should be removed from the phi node.
- // Note there could be multiple edges to the resume block, and we need
- // to remove them all.
- while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1)
- BB->removePredecessor(TrivialBB, true);
-
- for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB);
- PI != PE;) {
- BasicBlock *Pred = *PI++;
+ TrivialUnwindBlocks.insert(IncomingBB);
+ }
+
+ // If no trivial unwind blocks, don't do any simplifications.
+ if (TrivialUnwindBlocks.empty())
+ return false;
+
+ // Turn all invokes that unwind here into calls.
+ for (auto *TrivialBB : TrivialUnwindBlocks) {
+ // Blocks that will be simplified should be removed from the phi node.
+ // Note there could be multiple edges to the resume block, and we need
+ // to remove them all.
+ while (PhiLPInst->getBasicBlockIndex(TrivialBB) != -1)
+ BB->removePredecessor(TrivialBB, true);
+
+ for (pred_iterator PI = pred_begin(TrivialBB), PE = pred_end(TrivialBB);
+ PI != PE;) {
+ BasicBlock *Pred = *PI++;
removeUnwindEdge(Pred, DTU);
++NumInvokes;
- }
-
- // In each SimplifyCFG run, only the current processed block can be erased.
- // Otherwise, it will break the iteration of SimplifyCFG pass. So instead
- // of erasing TrivialBB, we only remove the branch to the common resume
- // block so that we can later erase the resume block since it has no
- // predecessors.
- TrivialBB->getTerminator()->eraseFromParent();
- new UnreachableInst(RI->getContext(), TrivialBB);
+ }
+
+ // In each SimplifyCFG run, only the current processed block can be erased.
+ // Otherwise, it will break the iteration of SimplifyCFG pass. So instead
+ // of erasing TrivialBB, we only remove the branch to the common resume
+ // block so that we can later erase the resume block since it has no
+ // predecessors.
+ TrivialBB->getTerminator()->eraseFromParent();
+ new UnreachableInst(RI->getContext(), TrivialBB);
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, TrivialBB, BB}});
- }
-
- // Delete the resume block if all its predecessors have been removed.
+ }
+
+ // Delete the resume block if all its predecessors have been removed.
if (pred_empty(BB)) {
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
}
-
- return !TrivialUnwindBlocks.empty();
-}
-
-// Simplify resume that is only used by a single (non-phi) landing pad.
-bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) {
- BasicBlock *BB = RI->getParent();
- auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI());
- assert(RI->getValue() == LPInst &&
- "Resume must unwind the exception that caused control to here");
-
- // Check that there are no other instructions except for debug intrinsics.
+
+ return !TrivialUnwindBlocks.empty();
+}
+
+// Simplify resume that is only used by a single (non-phi) landing pad.
+bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) {
+ BasicBlock *BB = RI->getParent();
+ auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI());
+ assert(RI->getValue() == LPInst &&
+ "Resume must unwind the exception that caused control to here");
+
+ // Check that there are no other instructions except for debug intrinsics.
if (!isCleanupBlockEmpty(
make_range<Instruction *>(LPInst->getNextNode(), RI)))
- return false;
-
- // Turn all invokes that unwind here into calls and delete the basic block.
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
- BasicBlock *Pred = *PI++;
+ return false;
+
+ // Turn all invokes that unwind here into calls and delete the basic block.
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ BasicBlock *Pred = *PI++;
removeUnwindEdge(Pred, DTU);
++NumInvokes;
- }
-
- // The landingpad is now unreachable. Zap it.
+ }
+
+ // The landingpad is now unreachable. Zap it.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- return true;
-}
-
+ return true;
+}
+
static bool removeEmptyCleanup(CleanupReturnInst *RI, DomTreeUpdater *DTU) {
- // If this is a trivial cleanup pad that executes no instructions, it can be
- // eliminated. If the cleanup pad continues to the caller, any predecessor
- // that is an EH pad will be updated to continue to the caller and any
- // predecessor that terminates with an invoke instruction will have its invoke
- // instruction converted to a call instruction. If the cleanup pad being
- // simplified does not continue to the caller, each predecessor will be
- // updated to continue to the unwind destination of the cleanup pad being
- // simplified.
- BasicBlock *BB = RI->getParent();
- CleanupPadInst *CPInst = RI->getCleanupPad();
- if (CPInst->getParent() != BB)
- // This isn't an empty cleanup.
- return false;
-
- // We cannot kill the pad if it has multiple uses. This typically arises
- // from unreachable basic blocks.
- if (!CPInst->hasOneUse())
- return false;
-
- // Check that there are no other instructions except for benign intrinsics.
+ // If this is a trivial cleanup pad that executes no instructions, it can be
+ // eliminated. If the cleanup pad continues to the caller, any predecessor
+ // that is an EH pad will be updated to continue to the caller and any
+ // predecessor that terminates with an invoke instruction will have its invoke
+ // instruction converted to a call instruction. If the cleanup pad being
+ // simplified does not continue to the caller, each predecessor will be
+ // updated to continue to the unwind destination of the cleanup pad being
+ // simplified.
+ BasicBlock *BB = RI->getParent();
+ CleanupPadInst *CPInst = RI->getCleanupPad();
+ if (CPInst->getParent() != BB)
+ // This isn't an empty cleanup.
+ return false;
+
+ // We cannot kill the pad if it has multiple uses. This typically arises
+ // from unreachable basic blocks.
+ if (!CPInst->hasOneUse())
+ return false;
+
+ // Check that there are no other instructions except for benign intrinsics.
if (!isCleanupBlockEmpty(
make_range<Instruction *>(CPInst->getNextNode(), RI)))
- return false;
-
- // If the cleanup return we are simplifying unwinds to the caller, this will
- // set UnwindDest to nullptr.
- BasicBlock *UnwindDest = RI->getUnwindDest();
- Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr;
-
- // We're about to remove BB from the control flow. Before we do, sink any
- // PHINodes into the unwind destination. Doing this before changing the
- // control flow avoids some potentially slow checks, since we can currently
- // be certain that UnwindDest and BB have no common predecessors (since they
- // are both EH pads).
- if (UnwindDest) {
- // First, go through the PHI nodes in UnwindDest and update any nodes that
- // reference the block we are removing
- for (BasicBlock::iterator I = UnwindDest->begin(),
- IE = DestEHPad->getIterator();
- I != IE; ++I) {
- PHINode *DestPN = cast<PHINode>(I);
-
- int Idx = DestPN->getBasicBlockIndex(BB);
- // Since BB unwinds to UnwindDest, it has to be in the PHI node.
- assert(Idx != -1);
- // This PHI node has an incoming value that corresponds to a control
- // path through the cleanup pad we are removing. If the incoming
- // value is in the cleanup pad, it must be a PHINode (because we
- // verified above that the block is otherwise empty). Otherwise, the
- // value is either a constant or a value that dominates the cleanup
- // pad being removed.
- //
- // Because BB and UnwindDest are both EH pads, all of their
- // predecessors must unwind to these blocks, and since no instruction
- // can have multiple unwind destinations, there will be no overlap in
- // incoming blocks between SrcPN and DestPN.
- Value *SrcVal = DestPN->getIncomingValue(Idx);
- PHINode *SrcPN = dyn_cast<PHINode>(SrcVal);
-
- // Remove the entry for the block we are deleting.
- DestPN->removeIncomingValue(Idx, false);
-
- if (SrcPN && SrcPN->getParent() == BB) {
- // If the incoming value was a PHI node in the cleanup pad we are
- // removing, we need to merge that PHI node's incoming values into
- // DestPN.
- for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
- SrcIdx != SrcE; ++SrcIdx) {
- DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
- SrcPN->getIncomingBlock(SrcIdx));
- }
- } else {
- // Otherwise, the incoming value came from above BB and
- // so we can just reuse it. We must associate all of BB's
- // predecessors with this value.
- for (auto *pred : predecessors(BB)) {
- DestPN->addIncoming(SrcVal, pred);
- }
- }
- }
-
- // Sink any remaining PHI nodes directly into UnwindDest.
- Instruction *InsertPt = DestEHPad;
- for (BasicBlock::iterator I = BB->begin(),
- IE = BB->getFirstNonPHI()->getIterator();
- I != IE;) {
- // The iterator must be incremented here because the instructions are
- // being moved to another block.
- PHINode *PN = cast<PHINode>(I++);
- if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB))
- // If the PHI node has no uses or all of its uses are in this basic
- // block (meaning they are debug or lifetime intrinsics), just leave
- // it. It will be erased when we erase BB below.
- continue;
-
- // Otherwise, sink this PHI node into UnwindDest.
- // Any predecessors to UnwindDest which are not already represented
- // must be back edges which inherit the value from the path through
- // BB. In this case, the PHI value must reference itself.
- for (auto *pred : predecessors(UnwindDest))
- if (pred != BB)
- PN->addIncoming(PN, pred);
- PN->moveBefore(InsertPt);
- }
- }
-
+ return false;
+
+ // If the cleanup return we are simplifying unwinds to the caller, this will
+ // set UnwindDest to nullptr.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ Instruction *DestEHPad = UnwindDest ? UnwindDest->getFirstNonPHI() : nullptr;
+
+ // We're about to remove BB from the control flow. Before we do, sink any
+ // PHINodes into the unwind destination. Doing this before changing the
+ // control flow avoids some potentially slow checks, since we can currently
+ // be certain that UnwindDest and BB have no common predecessors (since they
+ // are both EH pads).
+ if (UnwindDest) {
+ // First, go through the PHI nodes in UnwindDest and update any nodes that
+ // reference the block we are removing
+ for (BasicBlock::iterator I = UnwindDest->begin(),
+ IE = DestEHPad->getIterator();
+ I != IE; ++I) {
+ PHINode *DestPN = cast<PHINode>(I);
+
+ int Idx = DestPN->getBasicBlockIndex(BB);
+ // Since BB unwinds to UnwindDest, it has to be in the PHI node.
+ assert(Idx != -1);
+ // This PHI node has an incoming value that corresponds to a control
+ // path through the cleanup pad we are removing. If the incoming
+ // value is in the cleanup pad, it must be a PHINode (because we
+ // verified above that the block is otherwise empty). Otherwise, the
+ // value is either a constant or a value that dominates the cleanup
+ // pad being removed.
+ //
+ // Because BB and UnwindDest are both EH pads, all of their
+ // predecessors must unwind to these blocks, and since no instruction
+ // can have multiple unwind destinations, there will be no overlap in
+ // incoming blocks between SrcPN and DestPN.
+ Value *SrcVal = DestPN->getIncomingValue(Idx);
+ PHINode *SrcPN = dyn_cast<PHINode>(SrcVal);
+
+ // Remove the entry for the block we are deleting.
+ DestPN->removeIncomingValue(Idx, false);
+
+ if (SrcPN && SrcPN->getParent() == BB) {
+ // If the incoming value was a PHI node in the cleanup pad we are
+ // removing, we need to merge that PHI node's incoming values into
+ // DestPN.
+ for (unsigned SrcIdx = 0, SrcE = SrcPN->getNumIncomingValues();
+ SrcIdx != SrcE; ++SrcIdx) {
+ DestPN->addIncoming(SrcPN->getIncomingValue(SrcIdx),
+ SrcPN->getIncomingBlock(SrcIdx));
+ }
+ } else {
+ // Otherwise, the incoming value came from above BB and
+ // so we can just reuse it. We must associate all of BB's
+ // predecessors with this value.
+ for (auto *pred : predecessors(BB)) {
+ DestPN->addIncoming(SrcVal, pred);
+ }
+ }
+ }
+
+ // Sink any remaining PHI nodes directly into UnwindDest.
+ Instruction *InsertPt = DestEHPad;
+ for (BasicBlock::iterator I = BB->begin(),
+ IE = BB->getFirstNonPHI()->getIterator();
+ I != IE;) {
+ // The iterator must be incremented here because the instructions are
+ // being moved to another block.
+ PHINode *PN = cast<PHINode>(I++);
+ if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB))
+ // If the PHI node has no uses or all of its uses are in this basic
+ // block (meaning they are debug or lifetime intrinsics), just leave
+ // it. It will be erased when we erase BB below.
+ continue;
+
+ // Otherwise, sink this PHI node into UnwindDest.
+ // Any predecessors to UnwindDest which are not already represented
+ // must be back edges which inherit the value from the path through
+ // BB. In this case, the PHI value must reference itself.
+ for (auto *pred : predecessors(UnwindDest))
+ if (pred != BB)
+ PN->addIncoming(PN, pred);
+ PN->moveBefore(InsertPt);
+ }
+ }
+
std::vector<DominatorTree::UpdateType> Updates;
- for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
- // The iterator must be updated here because we are removing this pred.
- BasicBlock *PredBB = *PI++;
- if (UnwindDest == nullptr) {
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
+ // The iterator must be updated here because we are removing this pred.
+ BasicBlock *PredBB = *PI++;
+ if (UnwindDest == nullptr) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(PredBB, DTU);
++NumInvokes;
- } else {
- Instruction *TI = PredBB->getTerminator();
- TI->replaceUsesOfWith(BB, UnwindDest);
+ } else {
+ Instruction *TI = PredBB->getTerminator();
+ TI->replaceUsesOfWith(BB, UnwindDest);
Updates.push_back({DominatorTree::Insert, PredBB, UnwindDest});
Updates.push_back({DominatorTree::Delete, PredBB, BB});
- }
- }
-
+ }
+ }
+
if (DTU) {
DTU->applyUpdates(Updates);
DTU->deleteBB(BB);
@@ -4351,250 +4351,250 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI, DomTreeUpdater *DTU) {
// The cleanup pad is now unreachable. Zap it.
BB->eraseFromParent();
- return true;
-}
-
-// Try to merge two cleanuppads together.
-static bool mergeCleanupPad(CleanupReturnInst *RI) {
- // Skip any cleanuprets which unwind to caller, there is nothing to merge
- // with.
- BasicBlock *UnwindDest = RI->getUnwindDest();
- if (!UnwindDest)
- return false;
-
- // If this cleanupret isn't the only predecessor of this cleanuppad, it
- // wouldn't be safe to merge without code duplication.
- if (UnwindDest->getSinglePredecessor() != RI->getParent())
- return false;
-
- // Verify that our cleanuppad's unwind destination is another cleanuppad.
- auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
- if (!SuccessorCleanupPad)
- return false;
-
- CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
- // Replace any uses of the successor cleanuppad with the predecessor pad.
- // The only cleanuppad uses should be this cleanupret, its cleanupret and
- // funclet bundle operands.
- SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
- // Remove the old cleanuppad.
- SuccessorCleanupPad->eraseFromParent();
- // Now, we simply replace the cleanupret with a branch to the unwind
- // destination.
- BranchInst::Create(UnwindDest, RI->getParent());
- RI->eraseFromParent();
-
- return true;
-}
-
-bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) {
- // It is possible to transiently have an undef cleanuppad operand because we
- // have deleted some, but not all, dead blocks.
- // Eventually, this block will be deleted.
- if (isa<UndefValue>(RI->getOperand(0)))
- return false;
-
- if (mergeCleanupPad(RI))
- return true;
-
+ return true;
+}
+
+// Try to merge two cleanuppads together.
+static bool mergeCleanupPad(CleanupReturnInst *RI) {
+ // Skip any cleanuprets which unwind to caller, there is nothing to merge
+ // with.
+ BasicBlock *UnwindDest = RI->getUnwindDest();
+ if (!UnwindDest)
+ return false;
+
+ // If this cleanupret isn't the only predecessor of this cleanuppad, it
+ // wouldn't be safe to merge without code duplication.
+ if (UnwindDest->getSinglePredecessor() != RI->getParent())
+ return false;
+
+ // Verify that our cleanuppad's unwind destination is another cleanuppad.
+ auto *SuccessorCleanupPad = dyn_cast<CleanupPadInst>(&UnwindDest->front());
+ if (!SuccessorCleanupPad)
+ return false;
+
+ CleanupPadInst *PredecessorCleanupPad = RI->getCleanupPad();
+ // Replace any uses of the successor cleanuppad with the predecessor pad.
+ // The only cleanuppad uses should be this cleanupret, its cleanupret and
+ // funclet bundle operands.
+ SuccessorCleanupPad->replaceAllUsesWith(PredecessorCleanupPad);
+ // Remove the old cleanuppad.
+ SuccessorCleanupPad->eraseFromParent();
+ // Now, we simply replace the cleanupret with a branch to the unwind
+ // destination.
+ BranchInst::Create(UnwindDest, RI->getParent());
+ RI->eraseFromParent();
+
+ return true;
+}
+
+bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) {
+ // It is possible to transiently have an undef cleanuppad operand because we
+ // have deleted some, but not all, dead blocks.
+ // Eventually, this block will be deleted.
+ if (isa<UndefValue>(RI->getOperand(0)))
+ return false;
+
+ if (mergeCleanupPad(RI))
+ return true;
+
if (removeEmptyCleanup(RI, DTU))
- return true;
-
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
- BasicBlock *BB = RI->getParent();
- if (!BB->getFirstNonPHIOrDbg()->isTerminator())
- return false;
-
- // Find predecessors that end with branches.
- SmallVector<BasicBlock *, 8> UncondBranchPreds;
- SmallVector<BranchInst *, 8> CondBranchPreds;
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- BasicBlock *P = *PI;
- Instruction *PTI = P->getTerminator();
- if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
- if (BI->isUnconditional())
- UncondBranchPreds.push_back(P);
- else
- CondBranchPreds.push_back(BI);
- }
- }
-
- // If we found some, do the transformation!
- if (!UncondBranchPreds.empty() && DupRet) {
- while (!UncondBranchPreds.empty()) {
- BasicBlock *Pred = UncondBranchPreds.pop_back_val();
- LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
- << "INTO UNCOND BRANCH PRED: " << *Pred);
+ return true;
+
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
+ BasicBlock *BB = RI->getParent();
+ if (!BB->getFirstNonPHIOrDbg()->isTerminator())
+ return false;
+
+ // Find predecessors that end with branches.
+ SmallVector<BasicBlock *, 8> UncondBranchPreds;
+ SmallVector<BranchInst *, 8> CondBranchPreds;
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
+ BasicBlock *P = *PI;
+ Instruction *PTI = P->getTerminator();
+ if (BranchInst *BI = dyn_cast<BranchInst>(PTI)) {
+ if (BI->isUnconditional())
+ UncondBranchPreds.push_back(P);
+ else
+ CondBranchPreds.push_back(BI);
+ }
+ }
+
+ // If we found some, do the transformation!
+ if (!UncondBranchPreds.empty() && DupRet) {
+ while (!UncondBranchPreds.empty()) {
+ BasicBlock *Pred = UncondBranchPreds.pop_back_val();
+ LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
+ << "INTO UNCOND BRANCH PRED: " << *Pred);
(void)FoldReturnIntoUncondBranch(RI, BB, Pred, DTU);
- }
-
- // If we eliminated all predecessors of the block, delete the block now.
- if (pred_empty(BB)) {
- // We know there are no successors, so just nuke the block.
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (pred_empty(BB)) {
+ // We know there are no successors, so just nuke the block.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- }
-
- return true;
- }
-
- // Check out all of the conditional branches going to this return
- // instruction. If any of them just select between returns, change the
- // branch itself into a select/return pair.
- while (!CondBranchPreds.empty()) {
- BranchInst *BI = CondBranchPreds.pop_back_val();
-
- // Check to see if the non-BB successor is also a return block.
- if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
- isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
- SimplifyCondBranchToTwoReturns(BI, Builder))
- return true;
- }
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
- BasicBlock *BB = UI->getParent();
-
- bool Changed = false;
-
- // If there are any instructions immediately before the unreachable that can
- // be removed, do so.
- while (UI->getIterator() != BB->begin()) {
- BasicBlock::iterator BBI = UI->getIterator();
- --BBI;
- // Do not delete instructions that can have side effects which might cause
- // the unreachable to not be reachable; specifically, calls and volatile
- // operations may have this effect.
- if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
- break;
-
- if (BBI->mayHaveSideEffects()) {
- if (auto *SI = dyn_cast<StoreInst>(BBI)) {
- if (SI->isVolatile())
- break;
- } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
- if (LI->isVolatile())
- break;
- } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
- if (RMWI->isVolatile())
- break;
- } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
- if (CXI->isVolatile())
- break;
- } else if (isa<CatchPadInst>(BBI)) {
- // A catchpad may invoke exception object constructors and such, which
- // in some languages can be arbitrary code, so be conservative by
- // default.
- // For CoreCLR, it just involves a type test, so can be removed.
- if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
- EHPersonality::CoreCLR)
- break;
- } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
- !isa<LandingPadInst>(BBI)) {
- break;
- }
- // Note that deleting LandingPad's here is in fact okay, although it
- // involves a bit of subtle reasoning. If this inst is a LandingPad,
- // all the predecessors of this block will be the unwind edges of Invokes,
- // and we can therefore guarantee this block will be erased.
- }
-
- // Delete this instruction (any uses are guaranteed to be dead)
- if (!BBI->use_empty())
- BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
- BBI->eraseFromParent();
- Changed = true;
- }
-
- // If the unreachable instruction is the first in the block, take a gander
- // at all of the predecessors of this instruction, and simplify them.
- if (&BB->front() != UI)
- return Changed;
-
+ }
+
+ return true;
+ }
+
+ // Check out all of the conditional branches going to this return
+ // instruction. If any of them just select between returns, change the
+ // branch itself into a select/return pair.
+ while (!CondBranchPreds.empty()) {
+ BranchInst *BI = CondBranchPreds.pop_back_val();
+
+ // Check to see if the non-BB successor is also a return block.
+ if (isa<ReturnInst>(BI->getSuccessor(0)->getTerminator()) &&
+ isa<ReturnInst>(BI->getSuccessor(1)->getTerminator()) &&
+ SimplifyCondBranchToTwoReturns(BI, Builder))
+ return true;
+ }
+ return false;
+}
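simplifyReturn above performs two source-visible rewrites: a return block reached only through unconditional branches is duplicated into those predecessors, and a conditional branch whose two successors both return immediately is collapsed into a select feeding a single return. A hypothetical C++ sketch of the overall effect:

// Before: both arms branch to one shared return block (a phi plus a return).
int beforeReturnFolding(bool c, int a, int b) {
  int r;
  if (c)
    r = a;     // unconditional branch to the common return block
  else
    r = b;     // unconditional branch to the common return block
  return r;    // shared ReturnInst
}

// After: each path returns directly; here the two returns further fold into
// a select (SimplifyCondBranchToTwoReturns).
int afterReturnFolding(bool c, int a, int b) {
  return c ? a : b;
}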
+
+bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
+ BasicBlock *BB = UI->getParent();
+
+ bool Changed = false;
+
+ // If there are any instructions immediately before the unreachable that can
+ // be removed, do so.
+ while (UI->getIterator() != BB->begin()) {
+ BasicBlock::iterator BBI = UI->getIterator();
+ --BBI;
+ // Do not delete instructions that can have side effects which might cause
+ // the unreachable to not be reachable; specifically, calls and volatile
+ // operations may have this effect.
+ if (isa<CallInst>(BBI) && !isa<DbgInfoIntrinsic>(BBI))
+ break;
+
+ if (BBI->mayHaveSideEffects()) {
+ if (auto *SI = dyn_cast<StoreInst>(BBI)) {
+ if (SI->isVolatile())
+ break;
+ } else if (auto *LI = dyn_cast<LoadInst>(BBI)) {
+ if (LI->isVolatile())
+ break;
+ } else if (auto *RMWI = dyn_cast<AtomicRMWInst>(BBI)) {
+ if (RMWI->isVolatile())
+ break;
+ } else if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(BBI)) {
+ if (CXI->isVolatile())
+ break;
+ } else if (isa<CatchPadInst>(BBI)) {
+ // A catchpad may invoke exception object constructors and such, which
+ // in some languages can be arbitrary code, so be conservative by
+ // default.
+ // For CoreCLR, it just involves a type test, so can be removed.
+ if (classifyEHPersonality(BB->getParent()->getPersonalityFn()) !=
+ EHPersonality::CoreCLR)
+ break;
+ } else if (!isa<FenceInst>(BBI) && !isa<VAArgInst>(BBI) &&
+ !isa<LandingPadInst>(BBI)) {
+ break;
+ }
+ // Note that deleting LandingPad's here is in fact okay, although it
+ // involves a bit of subtle reasoning. If this inst is a LandingPad,
+ // all the predecessors of this block will be the unwind edges of Invokes,
+ // and we can therefore guarantee this block will be erased.
+ }
+
+ // Delete this instruction (any uses are guaranteed to be dead)
+ if (!BBI->use_empty())
+ BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
+ BBI->eraseFromParent();
+ Changed = true;
+ }
+
+ // If the unreachable instruction is the first in the block, take a gander
+ // at all of the predecessors of this instruction, and simplify them.
+ if (&BB->front() != UI)
+ return Changed;
+
std::vector<DominatorTree::UpdateType> Updates;
SmallSetVector<BasicBlock *, 8> Preds(pred_begin(BB), pred_end(BB));
- for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
+ for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
auto *Predecessor = Preds[i];
Instruction *TI = Predecessor->getTerminator();
- IRBuilder<> Builder(TI);
- if (auto *BI = dyn_cast<BranchInst>(TI)) {
+ IRBuilder<> Builder(TI);
+ if (auto *BI = dyn_cast<BranchInst>(TI)) {
// We could either have a proper unconditional branch,
// or a degenerate conditional branch with matching destinations.
if (all_of(BI->successors(),
[BB](auto *Successor) { return Successor == BB; })) {
- new UnreachableInst(TI->getContext(), TI);
- TI->eraseFromParent();
- Changed = true;
- } else {
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ } else {
assert(BI->isConditional() && "Can't get here with an uncond branch.");
- Value *Cond = BI->getCondition();
+ Value *Cond = BI->getCondition();
assert(BI->getSuccessor(0) != BI->getSuccessor(1) &&
"The destinations are guaranteed to be different here.");
- if (BI->getSuccessor(0) == BB) {
- Builder.CreateAssumption(Builder.CreateNot(Cond));
- Builder.CreateBr(BI->getSuccessor(1));
- } else {
- assert(BI->getSuccessor(1) == BB && "Incorrect CFG");
- Builder.CreateAssumption(Cond);
- Builder.CreateBr(BI->getSuccessor(0));
- }
- EraseTerminatorAndDCECond(BI);
- Changed = true;
- }
+ if (BI->getSuccessor(0) == BB) {
+ Builder.CreateAssumption(Builder.CreateNot(Cond));
+ Builder.CreateBr(BI->getSuccessor(1));
+ } else {
+ assert(BI->getSuccessor(1) == BB && "Incorrect CFG");
+ Builder.CreateAssumption(Cond);
+ Builder.CreateBr(BI->getSuccessor(0));
+ }
+ EraseTerminatorAndDCECond(BI);
+ Changed = true;
+ }
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- SwitchInstProfUpdateWrapper SU(*SI);
- for (auto i = SU->case_begin(), e = SU->case_end(); i != e;) {
- if (i->getCaseSuccessor() != BB) {
- ++i;
- continue;
- }
- BB->removePredecessor(SU->getParent());
- i = SU.removeCase(i);
- e = SU->case_end();
- Changed = true;
- }
+ } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
+ SwitchInstProfUpdateWrapper SU(*SI);
+ for (auto i = SU->case_begin(), e = SU->case_end(); i != e;) {
+ if (i->getCaseSuccessor() != BB) {
+ ++i;
+ continue;
+ }
+ BB->removePredecessor(SU->getParent());
+ i = SU.removeCase(i);
+ e = SU->case_end();
+ Changed = true;
+ }
// Note that the default destination can't be removed!
if (SI->getDefaultDest() != BB)
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
- if (II->getUnwindDest() == BB) {
+ } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
+ if (II->getUnwindDest() == BB) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(TI->getParent(), DTU);
- Changed = true;
- }
- } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
- if (CSI->getUnwindDest() == BB) {
+ Changed = true;
+ }
+ } else if (auto *CSI = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CSI->getUnwindDest() == BB) {
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
removeUnwindEdge(TI->getParent(), DTU);
- Changed = true;
- continue;
- }
-
- for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
- E = CSI->handler_end();
- I != E; ++I) {
- if (*I == BB) {
- CSI->removeHandler(I);
- --I;
- --E;
- Changed = true;
- }
- }
+ Changed = true;
+ continue;
+ }
+
+ for (CatchSwitchInst::handler_iterator I = CSI->handler_begin(),
+ E = CSI->handler_end();
+ I != E; ++I) {
+ if (*I == BB) {
+ CSI->removeHandler(I);
+ --I;
+ --E;
+ Changed = true;
+ }
+ }
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- if (CSI->getNumHandlers() == 0) {
- if (CSI->hasUnwindDest()) {
+ if (CSI->getNumHandlers() == 0) {
+ if (CSI->hasUnwindDest()) {
// Redirect all predecessors of the block containing CatchSwitchInst
// to instead branch to the CatchSwitchInst's unwind destination.
for (auto *PredecessorOfPredecessor : predecessors(Predecessor)) {
@@ -4604,66 +4604,66 @@ bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
{DominatorTree::Delete, PredecessorOfPredecessor, Predecessor});
}
Predecessor->replaceAllUsesWith(CSI->getUnwindDest());
- } else {
- // Rewrite all preds to unwind to caller (or from invoke to call).
+ } else {
+ // Rewrite all preds to unwind to caller (or from invoke to call).
if (DTU)
DTU->applyUpdates(Updates);
Updates.clear();
SmallVector<BasicBlock *, 8> EHPreds(predecessors(Predecessor));
- for (BasicBlock *EHPred : EHPreds)
+ for (BasicBlock *EHPred : EHPreds)
removeUnwindEdge(EHPred, DTU);
- }
- // The catchswitch is no longer reachable.
- new UnreachableInst(CSI->getContext(), CSI);
- CSI->eraseFromParent();
- Changed = true;
- }
+ }
+ // The catchswitch is no longer reachable.
+ new UnreachableInst(CSI->getContext(), CSI);
+ CSI->eraseFromParent();
+ Changed = true;
+ }
} else if (auto *CRI = dyn_cast<CleanupReturnInst>(TI)) {
(void)CRI;
assert(CRI->hasUnwindDest() && CRI->getUnwindDest() == BB &&
"Expected to always have an unwind to BB.");
Updates.push_back({DominatorTree::Delete, Predecessor, BB});
- new UnreachableInst(TI->getContext(), TI);
- TI->eraseFromParent();
- Changed = true;
- }
- }
-
+ new UnreachableInst(TI->getContext(), TI);
+ TI->eraseFromParent();
+ Changed = true;
+ }
+ }
+
if (DTU)
DTU->applyUpdates(Updates);
- // If this block is now dead, remove it.
- if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
- // We know there are no successors, so just nuke the block.
+ // If this block is now dead, remove it.
+ if (pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) {
+ // We know there are no successors, so just nuke the block.
if (DTU)
DTU->deleteBB(BB);
else
BB->eraseFromParent();
- return true;
- }
-
- return Changed;
-}
-
-static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
- assert(Cases.size() >= 1);
-
- array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
- for (size_t I = 1, E = Cases.size(); I != E; ++I) {
- if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
- return false;
- }
- return true;
-}
-
+ return true;
+ }
+
+ return Changed;
+}
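One of the rewrites above is worth picturing at the source level: a conditional branch whose taken side leads straight to unreachable is replaced by an llvm.assume on the (possibly negated) condition plus an unconditional branch to the surviving successor. A hypothetical illustration, using the Clang/GCC builtin:

int beforeUnreachableSucc(int *p) {
  if (p == nullptr)
    __builtin_unreachable();   // successor block is just 'unreachable'
  return *p;
}

// After the rewrite the IR effectively reads:
int afterUnreachableSucc(int *p) {
  // llvm.assume(p != nullptr)  -- CreateAssumption on the negated condition
  return *p;                    // single remaining successor
}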
+
+static bool CasesAreContiguous(SmallVectorImpl<ConstantInt *> &Cases) {
+ assert(Cases.size() >= 1);
+
+ array_pod_sort(Cases.begin(), Cases.end(), ConstantIntSortPredicate);
+ for (size_t I = 1, E = Cases.size(); I != E; ++I) {
+ if (Cases[I - 1]->getValue() != Cases[I]->getValue() + 1)
+ return false;
+ }
+ return true;
+}
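A standalone sketch of the contiguity test above, written over plain integers instead of ConstantInt and assuming, as the predecessor == successor + 1 comparison implies, that ConstantIntSortPredicate orders the values in descending order:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <vector>

static bool casesAreContiguous(std::vector<uint64_t> Cases) {
  // Sort descending, then require each element to be one less than the
  // element before it, e.g. {7, 5, 6} -> 7, 6, 5 -> contiguous.
  std::sort(Cases.begin(), Cases.end(), std::greater<uint64_t>());
  for (size_t I = 1, E = Cases.size(); I != E; ++I)
    if (Cases[I - 1] != Cases[I] + 1)
      return false;
  return true;
}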
+
static void createUnreachableSwitchDefault(SwitchInst *Switch,
DomTreeUpdater *DTU) {
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch default is dead.\n");
auto *BB = Switch->getParent();
BasicBlock *NewDefaultBlock = SplitBlockPredecessors(
Switch->getDefaultDest(), Switch->getParent(), "", DTU);
auto *OrigDefaultBlock = Switch->getDefaultDest();
- Switch->setDefaultDest(&*NewDefaultBlock);
+ Switch->setDefaultDest(&*NewDefaultBlock);
if (DTU)
DTU->applyUpdates({{DominatorTree::Insert, BB, &*NewDefaultBlock},
{DominatorTree::Delete, BB, OrigDefaultBlock}});
@@ -4671,200 +4671,200 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch,
SmallVector<DominatorTree::UpdateType, 2> Updates;
for (auto *Successor : successors(NewDefaultBlock))
Updates.push_back({DominatorTree::Delete, NewDefaultBlock, Successor});
- auto *NewTerminator = NewDefaultBlock->getTerminator();
- new UnreachableInst(Switch->getContext(), NewTerminator);
- EraseTerminatorAndDCECond(NewTerminator);
+ auto *NewTerminator = NewDefaultBlock->getTerminator();
+ new UnreachableInst(Switch->getContext(), NewTerminator);
+ EraseTerminatorAndDCECond(NewTerminator);
if (DTU)
DTU->applyUpdates(Updates);
-}
-
-/// Turn a switch with two reachable destinations into an integer range
-/// comparison and branch.
-bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
- IRBuilder<> &Builder) {
- assert(SI->getNumCases() > 1 && "Degenerate switch?");
-
- bool HasDefault =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
-
+}
+
+/// Turn a switch with two reachable destinations into an integer range
+/// comparison and branch.
+bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
+ IRBuilder<> &Builder) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+
auto *BB = SI->getParent();
- // Partition the cases into two sets with different destinations.
- BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
- BasicBlock *DestB = nullptr;
- SmallVector<ConstantInt *, 16> CasesA;
- SmallVector<ConstantInt *, 16> CasesB;
-
- for (auto Case : SI->cases()) {
- BasicBlock *Dest = Case.getCaseSuccessor();
- if (!DestA)
- DestA = Dest;
- if (Dest == DestA) {
- CasesA.push_back(Case.getCaseValue());
- continue;
- }
- if (!DestB)
- DestB = Dest;
- if (Dest == DestB) {
- CasesB.push_back(Case.getCaseValue());
- continue;
- }
- return false; // More than two destinations.
- }
-
- assert(DestA && DestB &&
- "Single-destination switch should have been folded.");
- assert(DestA != DestB);
- assert(DestB != SI->getDefaultDest());
- assert(!CasesB.empty() && "There must be non-default cases.");
- assert(!CasesA.empty() || HasDefault);
-
- // Figure out if one of the sets of cases form a contiguous range.
- SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
- BasicBlock *ContiguousDest = nullptr;
- BasicBlock *OtherDest = nullptr;
- if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
- ContiguousCases = &CasesA;
- ContiguousDest = DestA;
- OtherDest = DestB;
- } else if (CasesAreContiguous(CasesB)) {
- ContiguousCases = &CasesB;
- ContiguousDest = DestB;
- OtherDest = DestA;
- } else
- return false;
-
- // Start building the compare and branch.
-
- Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
- Constant *NumCases =
- ConstantInt::get(Offset->getType(), ContiguousCases->size());
-
- Value *Sub = SI->getCondition();
- if (!Offset->isNullValue())
- Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
-
- Value *Cmp;
- // If NumCases overflowed, then all possible values jump to the successor.
- if (NumCases->isNullValue() && !ContiguousCases->empty())
- Cmp = ConstantInt::getTrue(SI->getContext());
- else
- Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
- BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
-
- // Update weight for the newly-created conditional branch.
- if (HasBranchWeights(SI)) {
- SmallVector<uint64_t, 8> Weights;
- GetBranchWeights(SI, Weights);
- if (Weights.size() == 1 + SI->getNumCases()) {
- uint64_t TrueWeight = 0;
- uint64_t FalseWeight = 0;
- for (size_t I = 0, E = Weights.size(); I != E; ++I) {
- if (SI->getSuccessor(I) == ContiguousDest)
- TrueWeight += Weights[I];
- else
- FalseWeight += Weights[I];
- }
- while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
- TrueWeight /= 2;
- FalseWeight /= 2;
- }
- setBranchWeights(NewBI, TrueWeight, FalseWeight);
- }
- }
-
- // Prune obsolete incoming values off the successors' PHI nodes.
- for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = ContiguousCases->size();
- if (ContiguousDest == SI->getDefaultDest())
- ++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
- }
- for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
- unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
- if (OtherDest == SI->getDefaultDest())
- ++PreviousEdges;
- for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
- cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
- }
-
- // Clean up the default block - it may have phis or other instructions before
- // the unreachable terminator.
- if (!HasDefault)
+ // Partition the cases into two sets with different destinations.
+ BasicBlock *DestA = HasDefault ? SI->getDefaultDest() : nullptr;
+ BasicBlock *DestB = nullptr;
+ SmallVector<ConstantInt *, 16> CasesA;
+ SmallVector<ConstantInt *, 16> CasesB;
+
+ for (auto Case : SI->cases()) {
+ BasicBlock *Dest = Case.getCaseSuccessor();
+ if (!DestA)
+ DestA = Dest;
+ if (Dest == DestA) {
+ CasesA.push_back(Case.getCaseValue());
+ continue;
+ }
+ if (!DestB)
+ DestB = Dest;
+ if (Dest == DestB) {
+ CasesB.push_back(Case.getCaseValue());
+ continue;
+ }
+ return false; // More than two destinations.
+ }
+
+ assert(DestA && DestB &&
+ "Single-destination switch should have been folded.");
+ assert(DestA != DestB);
+ assert(DestB != SI->getDefaultDest());
+ assert(!CasesB.empty() && "There must be non-default cases.");
+ assert(!CasesA.empty() || HasDefault);
+
+ // Figure out if one of the sets of cases form a contiguous range.
+ SmallVectorImpl<ConstantInt *> *ContiguousCases = nullptr;
+ BasicBlock *ContiguousDest = nullptr;
+ BasicBlock *OtherDest = nullptr;
+ if (!CasesA.empty() && CasesAreContiguous(CasesA)) {
+ ContiguousCases = &CasesA;
+ ContiguousDest = DestA;
+ OtherDest = DestB;
+ } else if (CasesAreContiguous(CasesB)) {
+ ContiguousCases = &CasesB;
+ ContiguousDest = DestB;
+ OtherDest = DestA;
+ } else
+ return false;
+
+ // Start building the compare and branch.
+
+ Constant *Offset = ConstantExpr::getNeg(ContiguousCases->back());
+ Constant *NumCases =
+ ConstantInt::get(Offset->getType(), ContiguousCases->size());
+
+ Value *Sub = SI->getCondition();
+ if (!Offset->isNullValue())
+ Sub = Builder.CreateAdd(Sub, Offset, Sub->getName() + ".off");
+
+ Value *Cmp;
+ // If NumCases overflowed, then all possible values jump to the successor.
+ if (NumCases->isNullValue() && !ContiguousCases->empty())
+ Cmp = ConstantInt::getTrue(SI->getContext());
+ else
+ Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
+ BranchInst *NewBI = Builder.CreateCondBr(Cmp, ContiguousDest, OtherDest);
+
+ // Update weight for the newly-created conditional branch.
+ if (HasBranchWeights(SI)) {
+ SmallVector<uint64_t, 8> Weights;
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ uint64_t TrueWeight = 0;
+ uint64_t FalseWeight = 0;
+ for (size_t I = 0, E = Weights.size(); I != E; ++I) {
+ if (SI->getSuccessor(I) == ContiguousDest)
+ TrueWeight += Weights[I];
+ else
+ FalseWeight += Weights[I];
+ }
+ while (TrueWeight > UINT32_MAX || FalseWeight > UINT32_MAX) {
+ TrueWeight /= 2;
+ FalseWeight /= 2;
+ }
+ setBranchWeights(NewBI, TrueWeight, FalseWeight);
+ }
+ }
+
+ // Prune obsolete incoming values off the successors' PHI nodes.
+ for (auto BBI = ContiguousDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = ContiguousCases->size();
+ if (ContiguousDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+ for (auto BBI = OtherDest->begin(); isa<PHINode>(BBI); ++BBI) {
+ unsigned PreviousEdges = SI->getNumCases() - ContiguousCases->size();
+ if (OtherDest == SI->getDefaultDest())
+ ++PreviousEdges;
+ for (unsigned I = 0, E = PreviousEdges - 1; I != E; ++I)
+ cast<PHINode>(BBI)->removeIncomingValue(SI->getParent());
+ }
+
+ // Clean up the default block - it may have phis or other instructions before
+ // the unreachable terminator.
+ if (!HasDefault)
createUnreachableSwitchDefault(SI, DTU);
-
+
auto *UnreachableDefault = SI->getDefaultDest();
- // Drop the switch.
- SI->eraseFromParent();
-
+ // Drop the switch.
+ SI->eraseFromParent();
+
if (!HasDefault && DTU)
DTU->applyUpdates({{DominatorTree::Delete, BB, UnreachableDefault}});
- return true;
-}
-
-/// Compute masked bits for the condition of a switch
-/// and use it to remove dead cases.
+ return true;
+}
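The transformation above is the classic range-switch idiom: subtract the low bound of the contiguous case range (Offset is its negation) and compare unsigned-less-than against the number of cases. A source-level analogue with hypothetical values:

int beforeRangeSwitch(int x) {
  switch (x) {
  case 3: case 4: case 5: case 6:
    return 1;          // ContiguousDest
  default:
    return 0;          // OtherDest
  }
}

int afterRangeSwitch(int x) {
  // Sub = x + (-3); Cmp = icmp ult Sub, 4; branch on Cmp.
  return ((unsigned)x - 3u) < 4u ? 1 : 0;
}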
+
+/// Compute masked bits for the condition of a switch
+/// and use it to remove dead cases.
static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
AssumptionCache *AC,
- const DataLayout &DL) {
- Value *Cond = SI->getCondition();
- unsigned Bits = Cond->getType()->getIntegerBitWidth();
- KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
-
- // We can also eliminate cases by determining that their values are outside of
- // the limited range of the condition based on how many significant (non-sign)
- // bits are in the condition value.
- unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
- unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
-
- // Gather dead cases.
- SmallVector<ConstantInt *, 8> DeadCases;
+ const DataLayout &DL) {
+ Value *Cond = SI->getCondition();
+ unsigned Bits = Cond->getType()->getIntegerBitWidth();
+ KnownBits Known = computeKnownBits(Cond, DL, 0, AC, SI);
+
+ // We can also eliminate cases by determining that their values are outside of
+ // the limited range of the condition based on how many significant (non-sign)
+ // bits are in the condition value.
+ unsigned ExtraSignBits = ComputeNumSignBits(Cond, DL, 0, AC, SI) - 1;
+ unsigned MaxSignificantBitsInCond = Bits - ExtraSignBits;
+
+ // Gather dead cases.
+ SmallVector<ConstantInt *, 8> DeadCases;
SmallMapVector<BasicBlock *, int, 8> NumPerSuccessorCases;
- for (auto &Case : SI->cases()) {
+ for (auto &Case : SI->cases()) {
auto *Successor = Case.getCaseSuccessor();
++NumPerSuccessorCases[Successor];
- const APInt &CaseVal = Case.getCaseValue()->getValue();
- if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
- (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
- DeadCases.push_back(Case.getCaseValue());
+ const APInt &CaseVal = Case.getCaseValue()->getValue();
+ if (Known.Zero.intersects(CaseVal) || !Known.One.isSubsetOf(CaseVal) ||
+ (CaseVal.getMinSignedBits() > MaxSignificantBitsInCond)) {
+ DeadCases.push_back(Case.getCaseValue());
--NumPerSuccessorCases[Successor];
- LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
- << " is dead.\n");
- }
- }
-
- // If we can prove that the cases must cover all possible values, the
- // default destination becomes dead and we can remove it. If we know some
- // of the bits in the value, we can use that to more precisely compute the
- // number of possible unique case values.
- bool HasDefault =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
- const unsigned NumUnknownBits =
- Bits - (Known.Zero | Known.One).countPopulation();
- assert(NumUnknownBits <= Bits);
- if (HasDefault && DeadCases.empty() &&
- NumUnknownBits < 64 /* avoid overflow */ &&
- SI->getNumCases() == (1ULL << NumUnknownBits)) {
+ LLVM_DEBUG(dbgs() << "SimplifyCFG: switch case " << CaseVal
+ << " is dead.\n");
+ }
+ }
+
+ // If we can prove that the cases must cover all possible values, the
+ // default destination becomes dead and we can remove it. If we know some
+ // of the bits in the value, we can use that to more precisely compute the
+ // number of possible unique case values.
+ bool HasDefault =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const unsigned NumUnknownBits =
+ Bits - (Known.Zero | Known.One).countPopulation();
+ assert(NumUnknownBits <= Bits);
+ if (HasDefault && DeadCases.empty() &&
+ NumUnknownBits < 64 /* avoid overflow */ &&
+ SI->getNumCases() == (1ULL << NumUnknownBits)) {
createUnreachableSwitchDefault(SI, DTU);
- return true;
- }
-
- if (DeadCases.empty())
- return false;
-
- SwitchInstProfUpdateWrapper SIW(*SI);
- for (ConstantInt *DeadCase : DeadCases) {
- SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
- assert(CaseI != SI->case_default() &&
- "Case was not found. Probably mistake in DeadCases forming.");
- // Prune unused values from PHI nodes.
- CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
- SIW.removeCase(CaseI);
- }
-
+ return true;
+ }
+
+ if (DeadCases.empty())
+ return false;
+
+ SwitchInstProfUpdateWrapper SIW(*SI);
+ for (ConstantInt *DeadCase : DeadCases) {
+ SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+ assert(CaseI != SI->case_default() &&
+ "Case was not found. Probably mistake in DeadCases forming.");
+ // Prune unused values from PHI nodes.
+ CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+ SIW.removeCase(CaseI);
+ }
+
std::vector<DominatorTree::UpdateType> Updates;
for (const std::pair<BasicBlock *, int> &I : NumPerSuccessorCases)
if (I.second == 0)
@@ -4872,366 +4872,366 @@ static bool eliminateDeadSwitchCases(SwitchInst *SI, DomTreeUpdater *DTU,
if (DTU)
DTU->applyUpdates(Updates);
- return true;
-}
-
-/// If BB would be eligible for simplification by
-/// TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
-/// by an unconditional branch), look at the phi node for BB in the successor
-/// block and see if the incoming value is equal to CaseValue. If so, return
-/// the phi node, and set PhiIndex to BB's index in the phi node.
-static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
- BasicBlock *BB, int *PhiIndex) {
- if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
- return nullptr; // BB must be empty to be a candidate for simplification.
- if (!BB->getSinglePredecessor())
- return nullptr; // BB must be dominated by the switch.
-
- BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Branch || !Branch->isUnconditional())
- return nullptr; // Terminator must be unconditional branch.
-
- BasicBlock *Succ = Branch->getSuccessor(0);
-
- for (PHINode &PHI : Succ->phis()) {
- int Idx = PHI.getBasicBlockIndex(BB);
- assert(Idx >= 0 && "PHI has no entry for predecessor?");
-
- Value *InValue = PHI.getIncomingValue(Idx);
- if (InValue != CaseValue)
- continue;
-
- *PhiIndex = Idx;
- return &PHI;
- }
-
- return nullptr;
-}
-
-/// Try to forward the condition of a switch instruction to a phi node
-/// dominated by the switch, if that would mean that some of the destination
-/// blocks of the switch can be folded away. Return true if a change is made.
-static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
- using ForwardingNodesMap = DenseMap<PHINode *, SmallVector<int, 4>>;
-
- ForwardingNodesMap ForwardingNodes;
- BasicBlock *SwitchBlock = SI->getParent();
- bool Changed = false;
- for (auto &Case : SI->cases()) {
- ConstantInt *CaseValue = Case.getCaseValue();
- BasicBlock *CaseDest = Case.getCaseSuccessor();
-
- // Replace phi operands in successor blocks that are using the constant case
- // value rather than the switch condition variable:
- // switchbb:
- // switch i32 %x, label %default [
- // i32 17, label %succ
- // ...
- // succ:
- // %r = phi i32 ... [ 17, %switchbb ] ...
- // -->
- // %r = phi i32 ... [ %x, %switchbb ] ...
-
- for (PHINode &Phi : CaseDest->phis()) {
- // This only works if there is exactly 1 incoming edge from the switch to
- // a phi. If there is >1, that means multiple cases of the switch map to 1
- // value in the phi, and that phi value is not the switch condition. Thus,
- // this transform would not make sense (the phi would be invalid because
- // a phi can't have different incoming values from the same block).
- int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock);
- if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue &&
- count(Phi.blocks(), SwitchBlock) == 1) {
- Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());
- Changed = true;
- }
- }
-
- // Collect phi nodes that are indirectly using this switch's case constants.
- int PhiIdx;
- if (auto *Phi = FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIdx))
- ForwardingNodes[Phi].push_back(PhiIdx);
- }
-
- for (auto &ForwardingNode : ForwardingNodes) {
- PHINode *Phi = ForwardingNode.first;
- SmallVectorImpl<int> &Indexes = ForwardingNode.second;
- if (Indexes.size() < 2)
- continue;
-
- for (int Index : Indexes)
- Phi->setIncomingValue(Index, SI->getCondition());
- Changed = true;
- }
-
- return Changed;
-}
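A source-level rendering of the phi rewrite sketched in the comment above: where a case's incoming phi value equals the case constant, the value can be replaced by the switch condition itself, after which the case blocks become interchangeable and can fold away. Hypothetical values:

int beforeForwarding(int x) {
  int r;
  switch (x) {
  case 17: r = 17; break;   // phi operand equals the case value
  case 19: r = 19; break;
  default: r = 0;  break;
  }
  return r;
}

int afterForwarding(int x) {
  int r;
  switch (x) {
  case 17:                  // both cases now simply yield the condition
  case 19: r = x; break;
  default: r = 0;  break;
  }
  return r;
}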
-
-/// Return true if the backend will be able to handle
-/// initializing an array of constants like C.
-static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
- if (C->isThreadDependent())
- return false;
- if (C->isDLLImportDependent())
- return false;
-
- if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
- !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
- !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
- return false;
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
- if (!CE->isGEPWithNoNotionalOverIndexing())
- return false;
- if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
- return false;
- }
-
- if (!TTI.shouldBuildLookupTablesForConstant(C))
- return false;
-
- return true;
-}
-
-/// If V is a Constant, return it. Otherwise, try to look up
-/// its constant value in ConstantPool, returning 0 if it's not there.
-static Constant *
-LookupConstant(Value *V,
- const SmallDenseMap<Value *, Constant *> &ConstantPool) {
- if (Constant *C = dyn_cast<Constant>(V))
- return C;
- return ConstantPool.lookup(V);
-}
-
-/// Try to fold instruction I into a constant. This works for
-/// simple instructions such as binary operations where both operands are
-/// constant or can be replaced by constants from the ConstantPool. Returns the
-/// resulting constant on success, 0 otherwise.
-static Constant *
-ConstantFold(Instruction *I, const DataLayout &DL,
- const SmallDenseMap<Value *, Constant *> &ConstantPool) {
- if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
- Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
- if (!A)
- return nullptr;
- if (A->isAllOnesValue())
- return LookupConstant(Select->getTrueValue(), ConstantPool);
- if (A->isNullValue())
- return LookupConstant(Select->getFalseValue(), ConstantPool);
- return nullptr;
- }
-
- SmallVector<Constant *, 4> COps;
- for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
- if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
- COps.push_back(A);
- else
- return nullptr;
- }
-
- if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
- return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
- COps[1], DL);
- }
-
- return ConstantFoldInstOperands(I, COps, DL);
-}
-
-/// Try to determine the resulting constant values in phi nodes
-/// at the common destination basic block, *CommonDest, for one of the case
- /// destinations CaseDest corresponding to value CaseVal (0 for the default
-/// case), of a switch instruction SI.
-static bool
-GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
- BasicBlock **CommonDest,
- SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
- const DataLayout &DL, const TargetTransformInfo &TTI) {
- // The block from which we enter the common destination.
- BasicBlock *Pred = SI->getParent();
-
- // If CaseDest is empty except for some side-effect free instructions through
- // which we can constant-propagate the CaseVal, continue to its successor.
- SmallDenseMap<Value *, Constant *> ConstantPool;
- ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
- for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
- if (I.isTerminator()) {
- // If the terminator is a simple branch, continue to the next block.
- if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
- return false;
- Pred = CaseDest;
- CaseDest = I.getSuccessor(0);
- } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
- // Instruction is side-effect free and constant.
-
- // If the instruction has uses outside this block or a phi node slot for
- // the block, it is not safe to bypass the instruction since it would then
- // no longer dominate all its uses.
- for (auto &Use : I.uses()) {
- User *User = Use.getUser();
- if (Instruction *I = dyn_cast<Instruction>(User))
- if (I->getParent() == CaseDest)
- continue;
- if (PHINode *Phi = dyn_cast<PHINode>(User))
- if (Phi->getIncomingBlock(Use) == CaseDest)
- continue;
- return false;
- }
-
- ConstantPool.insert(std::make_pair(&I, C));
- } else {
- break;
- }
- }
-
- // If we did not have a CommonDest before, use the current one.
- if (!*CommonDest)
- *CommonDest = CaseDest;
- // If the destination isn't the common one, abort.
- if (CaseDest != *CommonDest)
- return false;
-
- // Get the values for this case from phi nodes in the destination block.
- for (PHINode &PHI : (*CommonDest)->phis()) {
- int Idx = PHI.getBasicBlockIndex(Pred);
- if (Idx == -1)
- continue;
-
- Constant *ConstVal =
- LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);
- if (!ConstVal)
- return false;
-
- // Be conservative about which kinds of constants we support.
- if (!ValidLookupTableConstant(ConstVal, TTI))
- return false;
-
- Res.push_back(std::make_pair(&PHI, ConstVal));
- }
-
- return Res.size() > 0;
-}
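Shown at the source level with hypothetical values, GetCaseResults pins the switch condition to the case constant, folds the case block's side-effect-free instructions, and records the constant each phi in the common destination would receive:

int caseResultsExample(int x) {
  int r;
  switch (x) {
  case 2:  r = x * 3 + 1; break;  // with x pinned to 2 this folds to 7
  case 5:  r = x - 4;     break;  // with x pinned to 5 this folds to 1
  default: r = 0;         break;
  }
  return r;                       // per-phi results: {2 -> 7, 5 -> 1, default -> 0}
}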
-
-// Helper function used to add CaseVal to the list of cases that generate
-// Result. Returns the updated number of cases that generate this result.
-static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *Result) {
- for (auto &I : UniqueResults) {
- if (I.first == Result) {
- I.second.push_back(CaseVal);
- return I.second.size();
- }
- }
- UniqueResults.push_back(
- std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
- return 1;
-}
-
-// Helper function that initializes a map containing
-// results for the PHI node of the common destination block for a switch
-// instruction. Returns false if multiple PHI nodes have been found or if
-// there is not a common destination block for the switch.
-static bool
-InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
- SwitchCaseResultVectorTy &UniqueResults,
- Constant *&DefaultResult, const DataLayout &DL,
- const TargetTransformInfo &TTI,
- uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
- for (auto &I : SI->cases()) {
- ConstantInt *CaseVal = I.getCaseValue();
-
- // Resulting value at phi nodes for this case value.
- SwitchCaseResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
- DL, TTI))
- return false;
-
- // Only one value per case is permitted.
- if (Results.size() > 1)
- return false;
-
- // Add the case->result mapping to UniqueResults.
- const uintptr_t NumCasesForResult =
- MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
-
- // Early out if there are too many cases for this result.
- if (NumCasesForResult > MaxCasesPerResult)
- return false;
-
- // Early out if there are too many unique results.
- if (UniqueResults.size() > MaxUniqueResults)
- return false;
-
- // Check the PHI consistency.
- if (!PHI)
- PHI = Results[0].first;
- else if (PHI != Results[0].first)
- return false;
- }
- // Find the default result value.
- SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
- BasicBlock *DefaultDest = SI->getDefaultDest();
- GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
- DL, TTI);
- // If the default value is not found, abort unless the default destination
- // is unreachable.
- DefaultResult =
- DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
- if ((!DefaultResult &&
- !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
- return false;
-
- return true;
-}
-
-// Helper function that checks if it is possible to transform a switch with only
-// two cases (or two cases + default) that produces a result into a select.
-// Example:
-// switch (a) {
-// case 10: %0 = icmp eq i32 %a, 10
-// return 10; %1 = select i1 %0, i32 10, i32 4
-// case 20: ----> %2 = icmp eq i32 %a, 20
-// return 2; %3 = select i1 %2, i32 2, i32 %1
-// default:
-// return 4;
-// }
-static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
- Constant *DefaultResult, Value *Condition,
- IRBuilder<> &Builder) {
- assert(ResultVector.size() == 2 &&
- "We should have exactly two unique results at this point");
- // If we are selecting between only two cases, transform into a simple
- // select or a two-way select if default is possible.
- if (ResultVector[0].second.size() == 1 &&
- ResultVector[1].second.size() == 1) {
- ConstantInt *const FirstCase = ResultVector[0].second[0];
- ConstantInt *const SecondCase = ResultVector[1].second[0];
-
- bool DefaultCanTrigger = DefaultResult;
- Value *SelectValue = ResultVector[1].first;
- if (DefaultCanTrigger) {
- Value *const ValueCompare =
- Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
- SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
- DefaultResult, "switch.select");
- }
- Value *const ValueCompare =
- Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
- return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
- SelectValue, "switch.select");
- }
-
- return nullptr;
-}
-
-// Helper function to cleanup a switch instruction that has been converted into
-// a select, fixing up PHI nodes and basic blocks.
-static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
- Value *SelectValue,
+ return true;
+}
+
+/// If BB would be eligible for simplification by
+/// TryToSimplifyUncondBranchFromEmptyBlock (i.e. it is empty and terminated
+/// by an unconditional branch), look at the phi node for BB in the successor
+/// block and see if the incoming value is equal to CaseValue. If so, return
+/// the phi node, and set PhiIndex to BB's index in the phi node.
+static PHINode *FindPHIForConditionForwarding(ConstantInt *CaseValue,
+ BasicBlock *BB, int *PhiIndex) {
+ if (BB->getFirstNonPHIOrDbg() != BB->getTerminator())
+ return nullptr; // BB must be empty to be a candidate for simplification.
+ if (!BB->getSinglePredecessor())
+ return nullptr; // BB must be dominated by the switch.
+
+ BranchInst *Branch = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Branch || !Branch->isUnconditional())
+ return nullptr; // Terminator must be unconditional branch.
+
+ BasicBlock *Succ = Branch->getSuccessor(0);
+
+ for (PHINode &PHI : Succ->phis()) {
+ int Idx = PHI.getBasicBlockIndex(BB);
+ assert(Idx >= 0 && "PHI has no entry for predecessor?");
+
+ Value *InValue = PHI.getIncomingValue(Idx);
+ if (InValue != CaseValue)
+ continue;
+
+ *PhiIndex = Idx;
+ return &PHI;
+ }
+
+ return nullptr;
+}
+
+/// Try to forward the condition of a switch instruction to a phi node
+/// dominated by the switch, if that would mean that some of the destination
+/// blocks of the switch can be folded away. Return true if a change is made.
+static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
+ using ForwardingNodesMap = DenseMap<PHINode *, SmallVector<int, 4>>;
+
+ ForwardingNodesMap ForwardingNodes;
+ BasicBlock *SwitchBlock = SI->getParent();
+ bool Changed = false;
+ for (auto &Case : SI->cases()) {
+ ConstantInt *CaseValue = Case.getCaseValue();
+ BasicBlock *CaseDest = Case.getCaseSuccessor();
+
+ // Replace phi operands in successor blocks that are using the constant case
+ // value rather than the switch condition variable:
+ // switchbb:
+ // switch i32 %x, label %default [
+ // i32 17, label %succ
+ // ...
+ // succ:
+ // %r = phi i32 ... [ 17, %switchbb ] ...
+ // -->
+ // %r = phi i32 ... [ %x, %switchbb ] ...
+
+ for (PHINode &Phi : CaseDest->phis()) {
+ // This only works if there is exactly 1 incoming edge from the switch to
+ // a phi. If there is >1, that means multiple cases of the switch map to 1
+ // value in the phi, and that phi value is not the switch condition. Thus,
+ // this transform would not make sense (the phi would be invalid because
+ // a phi can't have different incoming values from the same block).
+ int SwitchBBIdx = Phi.getBasicBlockIndex(SwitchBlock);
+ if (Phi.getIncomingValue(SwitchBBIdx) == CaseValue &&
+ count(Phi.blocks(), SwitchBlock) == 1) {
+ Phi.setIncomingValue(SwitchBBIdx, SI->getCondition());
+ Changed = true;
+ }
+ }
+
+ // Collect phi nodes that are indirectly using this switch's case constants.
+ int PhiIdx;
+ if (auto *Phi = FindPHIForConditionForwarding(CaseValue, CaseDest, &PhiIdx))
+ ForwardingNodes[Phi].push_back(PhiIdx);
+ }
+
+ for (auto &ForwardingNode : ForwardingNodes) {
+ PHINode *Phi = ForwardingNode.first;
+ SmallVectorImpl<int> &Indexes = ForwardingNode.second;
+ if (Indexes.size() < 2)
+ continue;
+
+ for (int Index : Indexes)
+ Phi->setIncomingValue(Index, SI->getCondition());
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// Return true if the backend will be able to handle
+/// initializing an array of constants like C.
+static bool ValidLookupTableConstant(Constant *C, const TargetTransformInfo &TTI) {
+ if (C->isThreadDependent())
+ return false;
+ if (C->isDLLImportDependent())
+ return false;
+
+ if (!isa<ConstantFP>(C) && !isa<ConstantInt>(C) &&
+ !isa<ConstantPointerNull>(C) && !isa<GlobalValue>(C) &&
+ !isa<UndefValue>(C) && !isa<ConstantExpr>(C))
+ return false;
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ if (!CE->isGEPWithNoNotionalOverIndexing())
+ return false;
+ if (!ValidLookupTableConstant(CE->getOperand(0), TTI))
+ return false;
+ }
+
+ if (!TTI.shouldBuildLookupTablesForConstant(C))
+ return false;
+
+ return true;
+}
+
+/// If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *
+LookupConstant(Value *V,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C;
+ return ConstantPool.lookup(V);
+}
+
+/// Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *
+ConstantFold(Instruction *I, const DataLayout &DL,
+ const SmallDenseMap<Value *, Constant *> &ConstantPool) {
+ if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+ Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+ if (!A)
+ return nullptr;
+ if (A->isAllOnesValue())
+ return LookupConstant(Select->getTrueValue(), ConstantPool);
+ if (A->isNullValue())
+ return LookupConstant(Select->getFalseValue(), ConstantPool);
+ return nullptr;
+ }
+
+ SmallVector<Constant *, 4> COps;
+ for (unsigned N = 0, E = I->getNumOperands(); N != E; ++N) {
+ if (Constant *A = LookupConstant(I->getOperand(N), ConstantPool))
+ COps.push_back(A);
+ else
+ return nullptr;
+ }
+
+ if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+ return ConstantFoldCompareInstOperands(Cmp->getPredicate(), COps[0],
+ COps[1], DL);
+ }
+
+ return ConstantFoldInstOperands(I, COps, DL);
+}
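A minimal standalone sketch of the operand folding above, in plain C++ with hypothetical names rather than the LLVM API: look each operand up in the constant pool and fold only when every operand is known.

#include <optional>
#include <string>
#include <unordered_map>

// Fold "LHS + RHS" when both operands are already known constants in Pool;
// otherwise report failure, mirroring the nullptr return above.
static std::optional<long> foldAdd(const std::string &LHS, const std::string &RHS,
                                   const std::unordered_map<std::string, long> &Pool) {
  auto L = Pool.find(LHS), R = Pool.find(RHS);
  if (L == Pool.end() || R == Pool.end())
    return std::nullopt; // An operand is not constant; nothing to fold.
  return L->second + R->second;
}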
+
+/// Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destinations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
+static bool
+GetCaseResults(SwitchInst *SI, ConstantInt *CaseVal, BasicBlock *CaseDest,
+ BasicBlock **CommonDest,
+ SmallVectorImpl<std::pair<PHINode *, Constant *>> &Res,
+ const DataLayout &DL, const TargetTransformInfo &TTI) {
+ // The block from which we enter the common destination.
+ BasicBlock *Pred = SI->getParent();
+
+ // If CaseDest is empty except for some side-effect free instructions through
+ // which we can constant-propagate the CaseVal, continue to its successor.
+ SmallDenseMap<Value *, Constant *> ConstantPool;
+ ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+ for (Instruction &I : CaseDest->instructionsWithoutDebug()) {
+ if (I.isTerminator()) {
+ // If the terminator is a simple branch, continue to the next block.
+ if (I.getNumSuccessors() != 1 || I.isExceptionalTerminator())
+ return false;
+ Pred = CaseDest;
+ CaseDest = I.getSuccessor(0);
+ } else if (Constant *C = ConstantFold(&I, DL, ConstantPool)) {
+ // Instruction is side-effect free and constant.
+
+ // If the instruction has uses outside this block or a phi node slot for
+ // the block, it is not safe to bypass the instruction since it would then
+ // no longer dominate all its uses.
+ for (auto &Use : I.uses()) {
+ User *User = Use.getUser();
+ if (Instruction *I = dyn_cast<Instruction>(User))
+ if (I->getParent() == CaseDest)
+ continue;
+ if (PHINode *Phi = dyn_cast<PHINode>(User))
+ if (Phi->getIncomingBlock(Use) == CaseDest)
+ continue;
+ return false;
+ }
+
+ ConstantPool.insert(std::make_pair(&I, C));
+ } else {
+ break;
+ }
+ }
+
+ // If we did not have a CommonDest before, use the current one.
+ if (!*CommonDest)
+ *CommonDest = CaseDest;
+ // If the destination isn't the common one, abort.
+ if (CaseDest != *CommonDest)
+ return false;
+
+ // Get the values for this case from phi nodes in the destination block.
+ for (PHINode &PHI : (*CommonDest)->phis()) {
+ int Idx = PHI.getBasicBlockIndex(Pred);
+ if (Idx == -1)
+ continue;
+
+ Constant *ConstVal =
+ LookupConstant(PHI.getIncomingValue(Idx), ConstantPool);
+ if (!ConstVal)
+ return false;
+
+ // Be conservative about which kinds of constants we support.
+ if (!ValidLookupTableConstant(ConstVal, TTI))
+ return false;
+
+ Res.push_back(std::make_pair(&PHI, ConstVal));
+ }
+
+ return Res.size() > 0;
+}
+
+// Helper function used to add CaseVal to the list of cases that generate
+// Result. Returns the updated number of cases that generate this result.
+static uintptr_t MapCaseToResult(ConstantInt *CaseVal,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *Result) {
+ for (auto &I : UniqueResults) {
+ if (I.first == Result) {
+ I.second.push_back(CaseVal);
+ return I.second.size();
+ }
+ }
+ UniqueResults.push_back(
+ std::make_pair(Result, SmallVector<ConstantInt *, 4>(1, CaseVal)));
+ return 1;
+}
+
+// Helper function that initializes a map containing
+// results for the PHI node of the common destination block for a switch
+// instruction. Returns false if multiple PHI nodes have been found or if
+// there is not a common destination block for the switch.
+static bool
+InitializeUniqueCases(SwitchInst *SI, PHINode *&PHI, BasicBlock *&CommonDest,
+ SwitchCaseResultVectorTy &UniqueResults,
+ Constant *&DefaultResult, const DataLayout &DL,
+ const TargetTransformInfo &TTI,
+ uintptr_t MaxUniqueResults, uintptr_t MaxCasesPerResult) {
+ for (auto &I : SI->cases()) {
+ ConstantInt *CaseVal = I.getCaseValue();
+
+ // Resulting value at phi nodes for this case value.
+ SwitchCaseResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, I.getCaseSuccessor(), &CommonDest, Results,
+ DL, TTI))
+ return false;
+
+ // Only one value per case is permitted.
+ if (Results.size() > 1)
+ return false;
+
+ // Add the case->result mapping to UniqueResults.
+ const uintptr_t NumCasesForResult =
+ MapCaseToResult(CaseVal, UniqueResults, Results.begin()->second);
+
+ // Early out if there are too many cases for this result.
+ if (NumCasesForResult > MaxCasesPerResult)
+ return false;
+
+ // Early out if there are too many unique results.
+ if (UniqueResults.size() > MaxUniqueResults)
+ return false;
+
+ // Check the PHI consistency.
+ if (!PHI)
+ PHI = Results[0].first;
+ else if (PHI != Results[0].first)
+ return false;
+ }
+ // Find the default result value.
+ SmallVector<std::pair<PHINode *, Constant *>, 1> DefaultResults;
+ BasicBlock *DefaultDest = SI->getDefaultDest();
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest, DefaultResults,
+ DL, TTI);
+ // If the default value is not found, abort unless the default destination
+ // is unreachable.
+ DefaultResult =
+ DefaultResults.size() == 1 ? DefaultResults.begin()->second : nullptr;
+ if ((!DefaultResult &&
+ !isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg())))
+ return false;
+
+ return true;
+}
+
+// Helper function that checks if it is possible to transform a switch with only
+// two cases (or two cases + default) that produces a result into a select.
+// Example:
+// switch (a) {
+// case 10: %0 = icmp eq i32 %a, 10
+// return 10; %1 = select i1 %0, i32 10, i32 4
+// case 20: ----> %2 = icmp eq i32 %a, 20
+// return 2; %3 = select i1 %2, i32 2, i32 %1
+// default:
+// return 4;
+// }
+static Value *ConvertTwoCaseSwitch(const SwitchCaseResultVectorTy &ResultVector,
+ Constant *DefaultResult, Value *Condition,
+ IRBuilder<> &Builder) {
+ assert(ResultVector.size() == 2 &&
+ "We should have exactly two unique results at this point");
+ // If we are selecting between only two cases, transform into a simple
+ // select or a two-way select if default is possible.
+ if (ResultVector[0].second.size() == 1 &&
+ ResultVector[1].second.size() == 1) {
+ ConstantInt *const FirstCase = ResultVector[0].second[0];
+ ConstantInt *const SecondCase = ResultVector[1].second[0];
+
+ bool DefaultCanTrigger = DefaultResult;
+ Value *SelectValue = ResultVector[1].first;
+ if (DefaultCanTrigger) {
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, SecondCase, "switch.selectcmp");
+ SelectValue = Builder.CreateSelect(ValueCompare, ResultVector[1].first,
+ DefaultResult, "switch.select");
+ }
+ Value *const ValueCompare =
+ Builder.CreateICmpEQ(Condition, FirstCase, "switch.selectcmp");
+ return Builder.CreateSelect(ValueCompare, ResultVector[0].first,
+ SelectValue, "switch.select");
+ }
+
+ return nullptr;
+}
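In C terms, the select chain produced for the commented example above is roughly the following (a sketch, not generated code):

int lowered(int a) {
  // switch (a) { case 10: return 10; case 20: return 2; default: return 4; }
  int inner = (a == 20) ? 2 : 4;  // second case vs. the default result
  return (a == 10) ? 10 : inner;  // first case vs. the inner select
}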
+
+// Helper function to cleanup a switch instruction that has been converted into
+// a select, fixing up PHI nodes and basic blocks.
+static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
+ Value *SelectValue,
IRBuilder<> &Builder,
DomTreeUpdater *DTU) {
std::vector<DominatorTree::UpdateType> Updates;
- BasicBlock *SelectBB = SI->getParent();
+ BasicBlock *SelectBB = SI->getParent();
BasicBlock *DestBB = PHI->getParent();
if (!is_contained(predecessors(DestBB), SelectBB))
@@ -5240,861 +5240,861 @@ static void RemoveSwitchAfterSelectConversion(SwitchInst *SI, PHINode *PHI,
// Remove the switch.
- while (PHI->getBasicBlockIndex(SelectBB) >= 0)
- PHI->removeIncomingValue(SelectBB);
- PHI->addIncoming(SelectValue, SelectBB);
-
- for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
- BasicBlock *Succ = SI->getSuccessor(i);
-
+ while (PHI->getBasicBlockIndex(SelectBB) >= 0)
+ PHI->removeIncomingValue(SelectBB);
+ PHI->addIncoming(SelectValue, SelectBB);
+
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
if (Succ == DestBB)
- continue;
- Succ->removePredecessor(SelectBB);
+ continue;
+ Succ->removePredecessor(SelectBB);
Updates.push_back({DominatorTree::Delete, SelectBB, Succ});
- }
- SI->eraseFromParent();
+ }
+ SI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
-}
-
-/// If the switch is only used to initialize one or more
-/// phi nodes in a common successor block with only two different
-/// constant values, replace the switch with select.
-static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
+}
+
+/// If the switch is only used to initialize one or more
+/// phi nodes in a common successor block with only two different
+/// constant values, replace the switch with select.
+static bool switchToSelect(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- Value *const Cond = SI->getCondition();
- PHINode *PHI = nullptr;
- BasicBlock *CommonDest = nullptr;
- Constant *DefaultResult;
- SwitchCaseResultVectorTy UniqueResults;
- // Collect all the cases that will deliver the same value from the switch.
- if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
- DL, TTI, 2, 1))
- return false;
- // A select chooses between at most two values.
- if (UniqueResults.size() != 2)
- return false;
- assert(PHI != nullptr && "PHI for value select not found");
-
- Builder.SetInsertPoint(SI);
- Value *SelectValue =
- ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
- if (SelectValue) {
+ const TargetTransformInfo &TTI) {
+ Value *const Cond = SI->getCondition();
+ PHINode *PHI = nullptr;
+ BasicBlock *CommonDest = nullptr;
+ Constant *DefaultResult;
+ SwitchCaseResultVectorTy UniqueResults;
+ // Collect all the cases that will deliver the same value from the switch.
+ if (!InitializeUniqueCases(SI, PHI, CommonDest, UniqueResults, DefaultResult,
+ DL, TTI, 2, 1))
+ return false;
+ // A select chooses between at most two values.
+ if (UniqueResults.size() != 2)
+ return false;
+ assert(PHI != nullptr && "PHI for value select not found");
+
+ Builder.SetInsertPoint(SI);
+ Value *SelectValue =
+ ConvertTwoCaseSwitch(UniqueResults, DefaultResult, Cond, Builder);
+ if (SelectValue) {
RemoveSwitchAfterSelectConversion(SI, PHI, SelectValue, Builder, DTU);
- return true;
- }
- // The switch couldn't be converted into a select.
- return false;
-}
-
-namespace {
-
-/// This class represents a lookup table that can be used to replace a switch.
-class SwitchLookupTable {
-public:
- /// Create a lookup table to use as a switch replacement with the contents
- /// of Values, using DefaultValue to fill any holes in the table.
- SwitchLookupTable(
- Module &M, uint64_t TableSize, ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
-
- /// Build instructions with Builder to retrieve the value at
- /// the position given by Index in the lookup table.
- Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
-
- /// Return true if a table with TableSize elements of
- /// type ElementType would fit in a target-legal register.
- static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
- Type *ElementType);
-
-private:
- // Depending on the contents of the table, it can be represented in
- // different ways.
- enum {
- // For tables where each element contains the same value, we just have to
- // store that single value and return it for each lookup.
- SingleValueKind,
-
- // For tables where there is a linear relationship between table index
- // and values. We calculate the result with a simple multiplication
- // and addition instead of a table lookup.
- LinearMapKind,
-
- // For small tables with integer elements, we can pack them into a bitmap
- // that fits into a target-legal register. Values are retrieved by
- // shift and mask operations.
- BitMapKind,
-
- // The table is stored as an array of values. Values are retrieved by load
- // instructions from the table.
- ArrayKind
- } Kind;
-
- // For SingleValueKind, this is the single value.
- Constant *SingleValue = nullptr;
-
- // For BitMapKind, this is the bitmap.
- ConstantInt *BitMap = nullptr;
- IntegerType *BitMapElementTy = nullptr;
-
- // For LinearMapKind, these are the constants used to derive the value.
- ConstantInt *LinearOffset = nullptr;
- ConstantInt *LinearMultiplier = nullptr;
-
- // For ArrayKind, this is the array.
- GlobalVariable *Array = nullptr;
-};
-
-} // end anonymous namespace
-
-SwitchLookupTable::SwitchLookupTable(
- Module &M, uint64_t TableSize, ConstantInt *Offset,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
- Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
- assert(Values.size() && "Can't build lookup table without values!");
- assert(TableSize >= Values.size() && "Can't fit values in table!");
-
- // If all values in the table are equal, this is that value.
- SingleValue = Values.begin()->second;
-
- Type *ValueType = Values.begin()->second->getType();
-
- // Build up the table contents.
- SmallVector<Constant *, 64> TableContents(TableSize);
- for (size_t I = 0, E = Values.size(); I != E; ++I) {
- ConstantInt *CaseVal = Values[I].first;
- Constant *CaseRes = Values[I].second;
- assert(CaseRes->getType() == ValueType);
-
- uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
- TableContents[Idx] = CaseRes;
-
- if (CaseRes != SingleValue)
- SingleValue = nullptr;
- }
-
- // Fill in any holes in the table with the default result.
- if (Values.size() < TableSize) {
- assert(DefaultValue &&
- "Need a default value to fill the lookup table holes.");
- assert(DefaultValue->getType() == ValueType);
- for (uint64_t I = 0; I < TableSize; ++I) {
- if (!TableContents[I])
- TableContents[I] = DefaultValue;
- }
-
- if (DefaultValue != SingleValue)
- SingleValue = nullptr;
- }
-
- // If each element in the table contains the same value, we only need to store
- // that single value.
- if (SingleValue) {
- Kind = SingleValueKind;
- return;
- }
-
- // Check if we can derive the value with a linear transformation from the
- // table index.
- if (isa<IntegerType>(ValueType)) {
- bool LinearMappingPossible = true;
- APInt PrevVal;
- APInt DistToPrev;
- assert(TableSize >= 2 && "Should be a SingleValue table.");
- // Check if there is the same distance between two consecutive values.
- for (uint64_t I = 0; I < TableSize; ++I) {
- ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
- if (!ConstVal) {
- // This is an undef. We could deal with it, but undefs in lookup tables
- // are very rare. It's probably not worth the additional complexity.
- LinearMappingPossible = false;
- break;
- }
- const APInt &Val = ConstVal->getValue();
- if (I != 0) {
- APInt Dist = Val - PrevVal;
- if (I == 1) {
- DistToPrev = Dist;
- } else if (Dist != DistToPrev) {
- LinearMappingPossible = false;
- break;
- }
- }
- PrevVal = Val;
- }
- if (LinearMappingPossible) {
- LinearOffset = cast<ConstantInt>(TableContents[0]);
- LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
- Kind = LinearMapKind;
- ++NumLinearMaps;
- return;
- }
- }
-
- // If the type is integer and the table fits in a register, build a bitmap.
- if (WouldFitInRegister(DL, TableSize, ValueType)) {
- IntegerType *IT = cast<IntegerType>(ValueType);
- APInt TableInt(TableSize * IT->getBitWidth(), 0);
- for (uint64_t I = TableSize; I > 0; --I) {
- TableInt <<= IT->getBitWidth();
- // Insert values into the bitmap. Undef values are set to zero.
- if (!isa<UndefValue>(TableContents[I - 1])) {
- ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
- TableInt |= Val->getValue().zext(TableInt.getBitWidth());
- }
- }
- BitMap = ConstantInt::get(M.getContext(), TableInt);
- BitMapElementTy = IT;
- Kind = BitMapKind;
- ++NumBitMaps;
- return;
- }
-
- // Store the table in an array.
- ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
- Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
-
- Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
- GlobalVariable::PrivateLinkage, Initializer,
- "switch.table." + FuncName);
- Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- // Set the alignment to that of the array elements. We will only be loading one
- // value out of it.
- Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType)));
- Kind = ArrayKind;
-}
-
-Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
- switch (Kind) {
- case SingleValueKind:
- return SingleValue;
- case LinearMapKind: {
- // Derive the result value from the input value.
- Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
- false, "switch.idx.cast");
- if (!LinearMultiplier->isOne())
- Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
- if (!LinearOffset->isZero())
- Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
- return Result;
- }
- case BitMapKind: {
- // Type of the bitmap (e.g. i59).
- IntegerType *MapTy = BitMap->getType();
-
- // Cast Index to the same type as the bitmap.
- // Note: The Index is <= the number of elements in the table, so
- // truncating it to the width of the bitmask is safe.
- Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
-
- // Multiply the shift amount by the element width.
- ShiftAmt = Builder.CreateMul(
- ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
- "switch.shiftamt");
-
- // Shift down.
- Value *DownShifted =
- Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
- // Mask off.
- return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
- }
- case ArrayKind: {
- // Make sure the table index will not overflow when treated as signed.
- IntegerType *IT = cast<IntegerType>(Index->getType());
- uint64_t TableSize =
- Array->getInitializer()->getType()->getArrayNumElements();
- if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
- Index = Builder.CreateZExt(
- Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
- "switch.tableidx.zext");
-
- Value *GEPIndices[] = {Builder.getInt32(0), Index};
- Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
- GEPIndices, "switch.gep");
- return Builder.CreateLoad(
- cast<ArrayType>(Array->getValueType())->getElementType(), GEP,
- "switch.load");
- }
- }
- llvm_unreachable("Unknown lookup table kind!");
-}
-
-bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
- uint64_t TableSize,
- Type *ElementType) {
- auto *IT = dyn_cast<IntegerType>(ElementType);
- if (!IT)
- return false;
- // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
- // are <= 15, we could try to narrow the type.
-
- // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
- if (TableSize >= UINT_MAX / IT->getBitWidth())
- return false;
- return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
-}
-
-/// Determine whether a lookup table should be built for this switch, based on
-/// the number of cases, size of the table, and the types of the results.
-static bool
-ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
- const TargetTransformInfo &TTI, const DataLayout &DL,
- const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
- if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
- return false; // TableSize overflowed, or mul below might overflow.
-
- bool AllTablesFitInRegister = true;
- bool HasIllegalType = false;
- for (const auto &I : ResultTypes) {
- Type *Ty = I.second;
-
- // Saturate this flag to true.
- HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
-
- // Saturate this flag to false.
- AllTablesFitInRegister =
- AllTablesFitInRegister &&
- SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
-
- // If both flags saturate, we're done. NOTE: This *only* works with
- // saturating flags, and all flags have to saturate first due to the
- // non-deterministic behavior of iterating over a dense map.
- if (HasIllegalType && !AllTablesFitInRegister)
- break;
- }
-
- // If each table would fit in a register, we should build it anyway.
- if (AllTablesFitInRegister)
- return true;
-
- // Don't build a table that doesn't fit in-register if it has illegal types.
- if (HasIllegalType)
- return false;
-
- // The table density should be at least 40%. This is the same criterion as for
- // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
- // FIXME: Find the best cut-off.
- return SI->getNumCases() * 10 >= TableSize * 4;
-}
-
-/// Try to reuse the switch table index compare. Following pattern:
-/// \code
-/// if (idx < tablesize)
-/// r = table[idx]; // table does not contain default_value
-/// else
-/// r = default_value;
-/// if (r != default_value)
-/// ...
-/// \endcode
-/// Is optimized to:
-/// \code
-/// cond = idx < tablesize;
-/// if (cond)
-/// r = table[idx];
-/// else
-/// r = default_value;
-/// if (cond)
-/// ...
-/// \endcode
-/// Jump threading will then eliminate the second if(cond).
-static void reuseTableCompare(
- User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
- Constant *DefaultValue,
- const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
- ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
- if (!CmpInst)
- return;
-
- // We require that the compare is in the same block as the phi so that jump
- // threading can do its work afterwards.
- if (CmpInst->getParent() != PhiBlock)
- return;
-
- Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
- if (!CmpOp1)
- return;
-
- Value *RangeCmp = RangeCheckBranch->getCondition();
- Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
- Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
-
- // Check if the compare with the default value is constant true or false.
- Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
- DefaultValue, CmpOp1, true);
- if (DefaultConst != TrueConst && DefaultConst != FalseConst)
- return;
-
- // Check if the compare with the case values is distinct from the default
- // compare result.
- for (auto ValuePair : Values) {
- Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
- ValuePair.second, CmpOp1, true);
- if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
- return;
- assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
- "Expect true or false as compare result.");
- }
-
- // Check if the branch instruction dominates the phi node. It's a simple
- // dominance check, but sufficient for our needs.
- // Although this check is invariant in the calling loops, it's better to do it
- // at this late stage. Practically we do it at most once for a switch.
- BasicBlock *BranchBlock = RangeCheckBranch->getParent();
- for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
- BasicBlock *Pred = *PI;
- if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
- return;
- }
-
- if (DefaultConst == FalseConst) {
- // The compare yields the same result. We can replace it.
- CmpInst->replaceAllUsesWith(RangeCmp);
- ++NumTableCmpReuses;
- } else {
- // The compare yields the same result, just inverted. We can replace it.
- Value *InvertedTableCmp = BinaryOperator::CreateXor(
- RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
- RangeCheckBranch);
- CmpInst->replaceAllUsesWith(InvertedTableCmp);
- ++NumTableCmpReuses;
- }
-}
-
-/// If the switch is only used to initialize one or more phi nodes in a common
-/// successor block with different constant values, replace the switch with
-/// lookup tables.
-static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
+ return true;
+ }
+ // The switch couldn't be converted into a select.
+ return false;
+}
+
+namespace {
+
+/// This class represents a lookup table that can be used to replace a switch.
+class SwitchLookupTable {
+public:
+ /// Create a lookup table to use as a switch replacement with the contents
+ /// of Values, using DefaultValue to fill any holes in the table.
+ SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName);
+
+ /// Build instructions with Builder to retrieve the value at
+ /// the position given by Index in the lookup table.
+ Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+ /// Return true if a table with TableSize elements of
+ /// type ElementType would fit in a target-legal register.
+ static bool WouldFitInRegister(const DataLayout &DL, uint64_t TableSize,
+ Type *ElementType);
+
+private:
+ // Depending on the contents of the table, it can be represented in
+ // different ways.
+ enum {
+ // For tables where each element contains the same value, we just have to
+ // store that single value and return it for each lookup.
+ SingleValueKind,
+
+ // For tables where there is a linear relationship between table index
+ // and values. We calculate the result with a simple multiplication
+ // and addition instead of a table lookup.
+ LinearMapKind,
+
+ // For small tables with integer elements, we can pack them into a bitmap
+ // that fits into a target-legal register. Values are retrieved by
+ // shift and mask operations.
+ BitMapKind,
+
+ // The table is stored as an array of values. Values are retrieved by load
+ // instructions from the table.
+ ArrayKind
+ } Kind;
+
+ // For SingleValueKind, this is the single value.
+ Constant *SingleValue = nullptr;
+
+ // For BitMapKind, this is the bitmap.
+ ConstantInt *BitMap = nullptr;
+ IntegerType *BitMapElementTy = nullptr;
+
+ // For LinearMapKind, these are the constants used to derive the value.
+ ConstantInt *LinearOffset = nullptr;
+ ConstantInt *LinearMultiplier = nullptr;
+
+ // For ArrayKind, this is the array.
+ GlobalVariable *Array = nullptr;
+};
+
+} // end anonymous namespace
+
+SwitchLookupTable::SwitchLookupTable(
+ Module &M, uint64_t TableSize, ConstantInt *Offset,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values,
+ Constant *DefaultValue, const DataLayout &DL, const StringRef &FuncName) {
+ assert(Values.size() && "Can't build lookup table without values!");
+ assert(TableSize >= Values.size() && "Can't fit values in table!");
+
+ // If all values in the table are equal, this is that value.
+ SingleValue = Values.begin()->second;
+
+ Type *ValueType = Values.begin()->second->getType();
+
+ // Build up the table contents.
+ SmallVector<Constant *, 64> TableContents(TableSize);
+ for (size_t I = 0, E = Values.size(); I != E; ++I) {
+ ConstantInt *CaseVal = Values[I].first;
+ Constant *CaseRes = Values[I].second;
+ assert(CaseRes->getType() == ValueType);
+
+ uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
+ TableContents[Idx] = CaseRes;
+
+ if (CaseRes != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // Fill in any holes in the table with the default result.
+ if (Values.size() < TableSize) {
+ assert(DefaultValue &&
+ "Need a default value to fill the lookup table holes.");
+ assert(DefaultValue->getType() == ValueType);
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ if (!TableContents[I])
+ TableContents[I] = DefaultValue;
+ }
+
+ if (DefaultValue != SingleValue)
+ SingleValue = nullptr;
+ }
+
+ // If each element in the table contains the same value, we only need to store
+ // that single value.
+ if (SingleValue) {
+ Kind = SingleValueKind;
+ return;
+ }
+
+ // Check if we can derive the value with a linear transformation from the
+ // table index.
+ if (isa<IntegerType>(ValueType)) {
+ bool LinearMappingPossible = true;
+ APInt PrevVal;
+ APInt DistToPrev;
+ assert(TableSize >= 2 && "Should be a SingleValue table.");
+ // Check if there is the same distance between two consecutive values.
+ for (uint64_t I = 0; I < TableSize; ++I) {
+ ConstantInt *ConstVal = dyn_cast<ConstantInt>(TableContents[I]);
+ if (!ConstVal) {
+ // This is an undef. We could deal with it, but undefs in lookup tables
+ // are very rare. It's probably not worth the additional complexity.
+ LinearMappingPossible = false;
+ break;
+ }
+ const APInt &Val = ConstVal->getValue();
+ if (I != 0) {
+ APInt Dist = Val - PrevVal;
+ if (I == 1) {
+ DistToPrev = Dist;
+ } else if (Dist != DistToPrev) {
+ LinearMappingPossible = false;
+ break;
+ }
+ }
+ PrevVal = Val;
+ }
+ if (LinearMappingPossible) {
+ LinearOffset = cast<ConstantInt>(TableContents[0]);
+ LinearMultiplier = ConstantInt::get(M.getContext(), DistToPrev);
+ Kind = LinearMapKind;
+ ++NumLinearMaps;
+ return;
+ }
+ }
+
+ // If the type is integer and the table fits in a register, build a bitmap.
+ if (WouldFitInRegister(DL, TableSize, ValueType)) {
+ IntegerType *IT = cast<IntegerType>(ValueType);
+ APInt TableInt(TableSize * IT->getBitWidth(), 0);
+ for (uint64_t I = TableSize; I > 0; --I) {
+ TableInt <<= IT->getBitWidth();
+ // Insert values into the bitmap. Undef values are set to zero.
+ if (!isa<UndefValue>(TableContents[I - 1])) {
+ ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+ TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+ }
+ }
+ BitMap = ConstantInt::get(M.getContext(), TableInt);
+ BitMapElementTy = IT;
+ Kind = BitMapKind;
+ ++NumBitMaps;
+ return;
+ }
+
+ // Store the table in an array.
+ ArrayType *ArrayTy = ArrayType::get(ValueType, TableSize);
+ Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
+
+ Array = new GlobalVariable(M, ArrayTy, /*isConstant=*/true,
+ GlobalVariable::PrivateLinkage, Initializer,
+ "switch.table." + FuncName);
+ Array->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ // Set the alignment to that of the array elements. We will only be loading one
+ // value out of it.
+ Array->setAlignment(Align(DL.getPrefTypeAlignment(ValueType)));
+ Kind = ArrayKind;
+}
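The LinearMapKind detection above amounts to checking for a constant stride between consecutive table entries; a minimal sketch on plain 64-bit integers (assumes at least two entries and no holes):

#include <cstdint>
#include <vector>

static bool isLinearMap(const std::vector<int64_t> &Table, int64_t &Offset,
                        int64_t &Stride) {
  Offset = Table[0];
  Stride = Table[1] - Table[0];
  for (size_t I = 2; I < Table.size(); ++I)
    if (Table[I] - Table[I - 1] != Stride)
      return false;        // Distances differ; fall back to another table kind.
  return true;             // Table[I] == Offset + I * Stride for every I.
}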
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+ switch (Kind) {
+ case SingleValueKind:
+ return SingleValue;
+ case LinearMapKind: {
+ // Derive the result value from the input value.
+ Value *Result = Builder.CreateIntCast(Index, LinearMultiplier->getType(),
+ false, "switch.idx.cast");
+ if (!LinearMultiplier->isOne())
+ Result = Builder.CreateMul(Result, LinearMultiplier, "switch.idx.mult");
+ if (!LinearOffset->isZero())
+ Result = Builder.CreateAdd(Result, LinearOffset, "switch.offset");
+ return Result;
+ }
+ case BitMapKind: {
+ // Type of the bitmap (e.g. i59).
+ IntegerType *MapTy = BitMap->getType();
+
+ // Cast Index to the same type as the bitmap.
+ // Note: The Index is <= the number of elements in the table, so
+ // truncating it to the width of the bitmask is safe.
+ Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+ // Multiply the shift amount by the element width.
+ ShiftAmt = Builder.CreateMul(
+ ShiftAmt, ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+ "switch.shiftamt");
+
+ // Shift down.
+ Value *DownShifted =
+ Builder.CreateLShr(BitMap, ShiftAmt, "switch.downshift");
+ // Mask off.
+ return Builder.CreateTrunc(DownShifted, BitMapElementTy, "switch.masked");
+ }
+ case ArrayKind: {
+ // Make sure the table index will not overflow when treated as signed.
+ IntegerType *IT = cast<IntegerType>(Index->getType());
+ uint64_t TableSize =
+ Array->getInitializer()->getType()->getArrayNumElements();
+ if (TableSize > (1ULL << (IT->getBitWidth() - 1)))
+ Index = Builder.CreateZExt(
+ Index, IntegerType::get(IT->getContext(), IT->getBitWidth() + 1),
+ "switch.tableidx.zext");
+
+ Value *GEPIndices[] = {Builder.getInt32(0), Index};
+ Value *GEP = Builder.CreateInBoundsGEP(Array->getValueType(), Array,
+ GEPIndices, "switch.gep");
+ return Builder.CreateLoad(
+ cast<ArrayType>(Array->getValueType())->getElementType(), GEP,
+ "switch.load");
+ }
+ }
+ llvm_unreachable("Unknown lookup table kind!");
+}
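For BitMapKind, the lookup above reduces to shift-and-mask arithmetic; a standalone sketch for element widths up to 64 bits (hypothetical helper, not the IRBuilder calls used above):

#include <cstdint>

static uint64_t bitmapLookup(uint64_t Map, uint64_t Index, unsigned ElemBits) {
  uint64_t Mask = ElemBits >= 64 ? ~0ULL : ((1ULL << ElemBits) - 1);
  return (Map >> (Index * ElemBits)) & Mask; // shift down, then mask off one element
}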
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout &DL,
+ uint64_t TableSize,
+ Type *ElementType) {
+ auto *IT = dyn_cast<IntegerType>(ElementType);
+ if (!IT)
+ return false;
+ // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+ // are <= 15, we could try to narrow the type.
+
+ // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
+ if (TableSize >= UINT_MAX / IT->getBitWidth())
+ return false;
+ return DL.fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// Determine whether a lookup table should be built for this switch, based on
+/// the number of cases, size of the table, and the types of the results.
+static bool
+ShouldBuildLookupTable(SwitchInst *SI, uint64_t TableSize,
+ const TargetTransformInfo &TTI, const DataLayout &DL,
+ const SmallDenseMap<PHINode *, Type *> &ResultTypes) {
+ if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+ return false; // TableSize overflowed, or mul below might overflow.
+
+ bool AllTablesFitInRegister = true;
+ bool HasIllegalType = false;
+ for (const auto &I : ResultTypes) {
+ Type *Ty = I.second;
+
+ // Saturate this flag to true.
+ HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+ // Saturate this flag to false.
+ AllTablesFitInRegister =
+ AllTablesFitInRegister &&
+ SwitchLookupTable::WouldFitInRegister(DL, TableSize, Ty);
+
+ // If both flags saturate, we're done. NOTE: This *only* works with
+ // saturating flags, and all flags have to saturate first due to the
+ // non-deterministic behavior of iterating over a dense map.
+ if (HasIllegalType && !AllTablesFitInRegister)
+ break;
+ }
+
+ // If each table would fit in a register, we should build it anyway.
+ if (AllTablesFitInRegister)
+ return true;
+
+ // Don't build a table that doesn't fit in-register if it has illegal types.
+ if (HasIllegalType)
+ return false;
+
+ // The table density should be at least 40%. This is the same criterion as for
+ // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+ // FIXME: Find the best cut-off.
+ return SI->getNumCases() * 10 >= TableSize * 4;
+}
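The final check above is the integer form of requiring at least 40% table density; as a sketch:

#include <cstdint>

static bool denseEnough(uint64_t NumCases, uint64_t TableSize) {
  // Equivalent to NumCases / TableSize >= 0.4, kept in integer arithmetic.
  // The caller already bounds TableSize, so the multiplications cannot overflow.
  return NumCases * 10 >= TableSize * 4;
}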
+
+/// Try to reuse the switch table index compare. Following pattern:
+/// \code
+/// if (idx < tablesize)
+/// r = table[idx]; // table does not contain default_value
+/// else
+/// r = default_value;
+/// if (r != default_value)
+/// ...
+/// \endcode
+/// Is optimized to:
+/// \code
+/// cond = idx < tablesize;
+/// if (cond)
+/// r = table[idx];
+/// else
+/// r = default_value;
+/// if (cond)
+/// ...
+/// \endcode
+/// Jump threading will then eliminate the second if(cond).
+static void reuseTableCompare(
+ User *PhiUser, BasicBlock *PhiBlock, BranchInst *RangeCheckBranch,
+ Constant *DefaultValue,
+ const SmallVectorImpl<std::pair<ConstantInt *, Constant *>> &Values) {
+ ICmpInst *CmpInst = dyn_cast<ICmpInst>(PhiUser);
+ if (!CmpInst)
+ return;
+
+ // We require that the compare is in the same block as the phi so that jump
+ // threading can do its work afterwards.
+ if (CmpInst->getParent() != PhiBlock)
+ return;
+
+ Constant *CmpOp1 = dyn_cast<Constant>(CmpInst->getOperand(1));
+ if (!CmpOp1)
+ return;
+
+ Value *RangeCmp = RangeCheckBranch->getCondition();
+ Constant *TrueConst = ConstantInt::getTrue(RangeCmp->getType());
+ Constant *FalseConst = ConstantInt::getFalse(RangeCmp->getType());
+
+ // Check if the compare with the default value is constant true or false.
+ Constant *DefaultConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ DefaultValue, CmpOp1, true);
+ if (DefaultConst != TrueConst && DefaultConst != FalseConst)
+ return;
+
+ // Check if the compare with the case values is distinct from the default
+ // compare result.
+ for (auto ValuePair : Values) {
+ Constant *CaseConst = ConstantExpr::getICmp(CmpInst->getPredicate(),
+ ValuePair.second, CmpOp1, true);
+ if (!CaseConst || CaseConst == DefaultConst || isa<UndefValue>(CaseConst))
+ return;
+ assert((CaseConst == TrueConst || CaseConst == FalseConst) &&
+ "Expect true or false as compare result.");
+ }
+
+ // Check if the branch instruction dominates the phi node. It's a simple
+ // dominance check, but sufficient for our needs.
+ // Although this check is invariant in the calling loops, it's better to do it
+ // at this late stage. Practically we do it at most once for a switch.
+ BasicBlock *BranchBlock = RangeCheckBranch->getParent();
+ for (auto PI = pred_begin(PhiBlock), E = pred_end(PhiBlock); PI != E; ++PI) {
+ BasicBlock *Pred = *PI;
+ if (Pred != BranchBlock && Pred->getUniquePredecessor() != BranchBlock)
+ return;
+ }
+
+ if (DefaultConst == FalseConst) {
+ // The compare yields the same result. We can replace it.
+ CmpInst->replaceAllUsesWith(RangeCmp);
+ ++NumTableCmpReuses;
+ } else {
+ // The compare yields the same result, just inverted. We can replace it.
+ Value *InvertedTableCmp = BinaryOperator::CreateXor(
+ RangeCmp, ConstantInt::get(RangeCmp->getType(), 1), "inverted.cmp",
+ RangeCheckBranch);
+ CmpInst->replaceAllUsesWith(InvertedTableCmp);
+ ++NumTableCmpReuses;
+ }
+}
+
+/// If the switch is only used to initialize one or more phi nodes in a common
+/// successor block with different constant values, replace the switch with
+/// lookup tables.
+static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
DomTreeUpdater *DTU, const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- assert(SI->getNumCases() > 1 && "Degenerate switch?");
-
+ const TargetTransformInfo &TTI) {
+ assert(SI->getNumCases() > 1 && "Degenerate switch?");
+
BasicBlock *BB = SI->getParent();
Function *Fn = BB->getParent();
- // Only build lookup table when we have a target that supports it or the
- // attribute is not set.
- if (!TTI.shouldBuildLookupTables() ||
- (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true"))
- return false;
-
- // FIXME: If the switch is too sparse for a lookup table, perhaps we could
- // split off a dense part and build a lookup table for that.
-
- // FIXME: This creates arrays of GEPs to constant strings, which means each
- // GEP needs a runtime relocation in PIC code. We should just build one big
- // string and lookup indices into that.
-
- // Ignore switches with less than three cases. Lookup tables will not make
- // them faster, so we don't analyze them.
- if (SI->getNumCases() < 3)
- return false;
-
- // Figure out the corresponding result for each case value and phi node in the
- // common destination, as well as the min and max case values.
- assert(!SI->cases().empty());
- SwitchInst::CaseIt CI = SI->case_begin();
- ConstantInt *MinCaseVal = CI->getCaseValue();
- ConstantInt *MaxCaseVal = CI->getCaseValue();
-
- BasicBlock *CommonDest = nullptr;
-
- using ResultListTy = SmallVector<std::pair<ConstantInt *, Constant *>, 4>;
- SmallDenseMap<PHINode *, ResultListTy> ResultLists;
-
- SmallDenseMap<PHINode *, Constant *> DefaultResults;
- SmallDenseMap<PHINode *, Type *> ResultTypes;
- SmallVector<PHINode *, 4> PHIs;
-
- for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
- ConstantInt *CaseVal = CI->getCaseValue();
- if (CaseVal->getValue().slt(MinCaseVal->getValue()))
- MinCaseVal = CaseVal;
- if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
- MaxCaseVal = CaseVal;
-
- // Resulting value at phi nodes for this case value.
- using ResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
- ResultsTy Results;
- if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
- Results, DL, TTI))
- return false;
-
- // Append the result from this case to the list for each phi.
- for (const auto &I : Results) {
- PHINode *PHI = I.first;
- Constant *Value = I.second;
- if (!ResultLists.count(PHI))
- PHIs.push_back(PHI);
- ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
- }
- }
-
- // Keep track of the result types.
- for (PHINode *PHI : PHIs) {
- ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
- }
-
- uint64_t NumResults = ResultLists[PHIs[0]].size();
- APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
- uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
- bool TableHasHoles = (NumResults < TableSize);
-
- // If the table has holes, we need a constant result for the default case
- // or a bitmask that fits in a register.
- SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
- bool HasDefaultResults =
- GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
- DefaultResultsList, DL, TTI);
-
- bool NeedMask = (TableHasHoles && !HasDefaultResults);
- if (NeedMask) {
- // As an extra penalty for the validity test we require more cases.
- if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
- return false;
- if (!DL.fitsInLegalInteger(TableSize))
- return false;
- }
-
- for (const auto &I : DefaultResultsList) {
- PHINode *PHI = I.first;
- Constant *Result = I.second;
- DefaultResults[PHI] = Result;
- }
-
- if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
- return false;
-
+ // Only build lookup table when we have a target that supports it or the
+ // attribute is not set.
+ if (!TTI.shouldBuildLookupTables() ||
+ (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true"))
+ return false;
+
+ // FIXME: If the switch is too sparse for a lookup table, perhaps we could
+ // split off a dense part and build a lookup table for that.
+
+ // FIXME: This creates arrays of GEPs to constant strings, which means each
+ // GEP needs a runtime relocation in PIC code. We should just build one big
+ // string and lookup indices into that.
+
+ // Ignore switches with less than three cases. Lookup tables will not make
+ // them faster, so we don't analyze them.
+ if (SI->getNumCases() < 3)
+ return false;
+
+ // Figure out the corresponding result for each case value and phi node in the
+ // common destination, as well as the min and max case values.
+ assert(!SI->cases().empty());
+ SwitchInst::CaseIt CI = SI->case_begin();
+ ConstantInt *MinCaseVal = CI->getCaseValue();
+ ConstantInt *MaxCaseVal = CI->getCaseValue();
+
+ BasicBlock *CommonDest = nullptr;
+
+ using ResultListTy = SmallVector<std::pair<ConstantInt *, Constant *>, 4>;
+ SmallDenseMap<PHINode *, ResultListTy> ResultLists;
+
+ SmallDenseMap<PHINode *, Constant *> DefaultResults;
+ SmallDenseMap<PHINode *, Type *> ResultTypes;
+ SmallVector<PHINode *, 4> PHIs;
+
+ for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
+ ConstantInt *CaseVal = CI->getCaseValue();
+ if (CaseVal->getValue().slt(MinCaseVal->getValue()))
+ MinCaseVal = CaseVal;
+ if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
+ MaxCaseVal = CaseVal;
+
+ // Resulting value at phi nodes for this case value.
+ using ResultsTy = SmallVector<std::pair<PHINode *, Constant *>, 4>;
+ ResultsTy Results;
+ if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
+ Results, DL, TTI))
+ return false;
+
+ // Append the result from this case to the list for each phi.
+ for (const auto &I : Results) {
+ PHINode *PHI = I.first;
+ Constant *Value = I.second;
+ if (!ResultLists.count(PHI))
+ PHIs.push_back(PHI);
+ ResultLists[PHI].push_back(std::make_pair(CaseVal, Value));
+ }
+ }
+
+ // Keep track of the result types.
+ for (PHINode *PHI : PHIs) {
+ ResultTypes[PHI] = ResultLists[PHI][0].second->getType();
+ }
+
+ uint64_t NumResults = ResultLists[PHIs[0]].size();
+ APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
+ uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
+ bool TableHasHoles = (NumResults < TableSize);
+
+ // If the table has holes, we need a constant result for the default case
+ // or a bitmask that fits in a register.
+ SmallVector<std::pair<PHINode *, Constant *>, 4> DefaultResultsList;
+ bool HasDefaultResults =
+ GetCaseResults(SI, nullptr, SI->getDefaultDest(), &CommonDest,
+ DefaultResultsList, DL, TTI);
+
+ bool NeedMask = (TableHasHoles && !HasDefaultResults);
+ if (NeedMask) {
+ // As an extra penalty for the validity test we require more cases.
+ if (SI->getNumCases() < 4) // FIXME: Find best threshold value (benchmark).
+ return false;
+ if (!DL.fitsInLegalInteger(TableSize))
+ return false;
+ }
+
+ for (const auto &I : DefaultResultsList) {
+ PHINode *PHI = I.first;
+ Constant *Result = I.second;
+ DefaultResults[PHI] = Result;
+ }
+
+ if (!ShouldBuildLookupTable(SI, TableSize, TTI, DL, ResultTypes))
+ return false;
+
std::vector<DominatorTree::UpdateType> Updates;
- // Create the BB that does the lookups.
- Module &Mod = *CommonDest->getParent()->getParent();
- BasicBlock *LookupBB = BasicBlock::Create(
- Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
-
- // Compute the table index value.
- Builder.SetInsertPoint(SI);
- Value *TableIndex;
- if (MinCaseVal->isNullValue())
- TableIndex = SI->getCondition();
- else
- TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
- "switch.tableidx");
-
- // Compute the maximum table size representable by the integer type we are
- // switching upon.
- unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
- uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
- assert(MaxTableSize >= TableSize &&
- "It is impossible for a switch to have more entries than the max "
- "representable value of its input integer type's size.");
-
- // If the default destination is unreachable, or if the lookup table covers
- // all values of the conditional variable, branch directly to the lookup table
- // BB. Otherwise, check that the condition is within the case range.
- const bool DefaultIsReachable =
- !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
- const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
- BranchInst *RangeCheckBranch = nullptr;
-
- if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
- Builder.CreateBr(LookupBB);
+ // Create the BB that does the lookups.
+ Module &Mod = *CommonDest->getParent()->getParent();
+ BasicBlock *LookupBB = BasicBlock::Create(
+ Mod.getContext(), "switch.lookup", CommonDest->getParent(), CommonDest);
+
+ // Compute the table index value.
+ Builder.SetInsertPoint(SI);
+ Value *TableIndex;
+ if (MinCaseVal->isNullValue())
+ TableIndex = SI->getCondition();
+ else
+ TableIndex = Builder.CreateSub(SI->getCondition(), MinCaseVal,
+ "switch.tableidx");
+
+ // Compute the maximum table size representable by the integer type we are
+ // switching upon.
+ unsigned CaseSize = MinCaseVal->getType()->getPrimitiveSizeInBits();
+ uint64_t MaxTableSize = CaseSize > 63 ? UINT64_MAX : 1ULL << CaseSize;
+ assert(MaxTableSize >= TableSize &&
+ "It is impossible for a switch to have more entries than the max "
+ "representable value of its input integer type's size.");
+
+ // If the default destination is unreachable, or if the lookup table covers
+ // all values of the conditional variable, branch directly to the lookup table
+ // BB. Otherwise, check that the condition is within the case range.
+ const bool DefaultIsReachable =
+ !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
+ const bool GeneratingCoveredLookupTable = (MaxTableSize == TableSize);
+ BranchInst *RangeCheckBranch = nullptr;
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ Builder.CreateBr(LookupBB);
Updates.push_back({DominatorTree::Insert, BB, LookupBB});
- // Note: We call removePredecessor later since we need to be able to get the
- // PHI value for the default case in case we're using a bit mask.
- } else {
- Value *Cmp = Builder.CreateICmpULT(
- TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
- RangeCheckBranch =
- Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
+ // Note: We call removePredecessor later since we need to be able to get the
+ // PHI value for the default case in case we're using a bit mask.
+ } else {
+ Value *Cmp = Builder.CreateICmpULT(
+ TableIndex, ConstantInt::get(MinCaseVal->getType(), TableSize));
+ RangeCheckBranch =
+ Builder.CreateCondBr(Cmp, LookupBB, SI->getDefaultDest());
Updates.push_back({DominatorTree::Insert, BB, LookupBB});
- }
-
- // Populate the BB that does the lookups.
- Builder.SetInsertPoint(LookupBB);
-
- if (NeedMask) {
- // Before doing the lookup, we do the hole check. The LookupBB is therefore
- // re-purposed to do the hole check, and we create a new LookupBB.
- BasicBlock *MaskBB = LookupBB;
- MaskBB->setName("switch.hole_check");
- LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
- CommonDest->getParent(), CommonDest);
-
- // Make the mask's bitwidth at least 8-bit and a power-of-2 to avoid
- // unnecessary illegal types.
- uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
- APInt MaskInt(TableSizePowOf2, 0);
- APInt One(TableSizePowOf2, 1);
- // Build bitmask; fill in a 1 bit for every case.
- const ResultListTy &ResultList = ResultLists[PHIs[0]];
- for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
- uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
- .getLimitedValue();
- MaskInt |= One << Idx;
- }
- ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
-
- // Get the TableIndex'th bit of the bitmask.
- // If this bit is 0 (meaning hole) jump to the default destination,
- // else continue with table lookup.
- IntegerType *MapTy = TableMask->getType();
- Value *MaskIndex =
- Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
- Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
- Value *LoBit = Builder.CreateTrunc(
- Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
- Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
+ }
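// Standalone sketch (plain C++, not the LLVM API) of the index and range-check
// logic above: rebase the condition onto a zero-based table index and decide
// whether the ULT guard is still needed. Helper names are illustrative only.
#include <cassert>
#include <cstdint>

struct IndexLowering {
  uint64_t TableIndex;  // condition rebased so that MinCaseVal maps to 0
  bool NeedsRangeCheck; // false when the table covers every possible input
};

static IndexLowering lowerSwitchIndex(uint64_t Cond, uint64_t MinCase,
                                      uint64_t TableSize, unsigned BitWidth,
                                      bool DefaultIsReachable) {
  uint64_t MaxTableSize = BitWidth > 63 ? UINT64_MAX : (1ULL << BitWidth);
  assert(MaxTableSize >= TableSize && "table larger than the input domain");
  bool Covered = (MaxTableSize == TableSize);
  uint64_t Index = Cond - MinCase; // mirrors CreateSub(Cond, MinCaseVal)
  // A conditional branch on (Index ult TableSize) is emitted only when the
  // default is reachable and the table does not cover the whole domain.
  return {Index, DefaultIsReachable && !Covered};
}

int main() {
  // Cases {5, 9, 13, 17} on an i32 condition: TableSize is 13, the table
  // cannot cover all 2^32 inputs, so the range check stays.
  IndexLowering L = lowerSwitchIndex(/*Cond=*/13, /*MinCase=*/5,
                                     /*TableSize=*/13, /*BitWidth=*/32,
                                     /*DefaultIsReachable=*/true);
  assert(L.TableIndex == 8 && L.NeedsRangeCheck);
  return 0;
}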
+
+ // Populate the BB that does the lookups.
+ Builder.SetInsertPoint(LookupBB);
+
+ if (NeedMask) {
+ // Before doing the lookup, we do the hole check. The LookupBB is therefore
+ // re-purposed to do the hole check, and we create a new LookupBB.
+ BasicBlock *MaskBB = LookupBB;
+ MaskBB->setName("switch.hole_check");
+ LookupBB = BasicBlock::Create(Mod.getContext(), "switch.lookup",
+ CommonDest->getParent(), CommonDest);
+
+ // Make the mask's bitwidth at least 8-bit and a power-of-2 to avoid
+ // unnecessary illegal types.
+ uint64_t TableSizePowOf2 = NextPowerOf2(std::max(7ULL, TableSize - 1ULL));
+ APInt MaskInt(TableSizePowOf2, 0);
+ APInt One(TableSizePowOf2, 1);
+ // Build bitmask; fill in a 1 bit for every case.
+ const ResultListTy &ResultList = ResultLists[PHIs[0]];
+ for (size_t I = 0, E = ResultList.size(); I != E; ++I) {
+ uint64_t Idx = (ResultList[I].first->getValue() - MinCaseVal->getValue())
+ .getLimitedValue();
+ MaskInt |= One << Idx;
+ }
+ ConstantInt *TableMask = ConstantInt::get(Mod.getContext(), MaskInt);
+
+ // Get the TableIndex'th bit of the bitmask.
+ // If this bit is 0 (meaning hole) jump to the default destination,
+ // else continue with table lookup.
+ IntegerType *MapTy = TableMask->getType();
+ Value *MaskIndex =
+ Builder.CreateZExtOrTrunc(TableIndex, MapTy, "switch.maskindex");
+ Value *Shifted = Builder.CreateLShr(TableMask, MaskIndex, "switch.shifted");
+ Value *LoBit = Builder.CreateTrunc(
+ Shifted, Type::getInt1Ty(Mod.getContext()), "switch.lobit");
+ Builder.CreateCondBr(LoBit, LookupBB, SI->getDefaultDest());
Updates.push_back({DominatorTree::Insert, MaskBB, LookupBB});
Updates.push_back({DominatorTree::Insert, MaskBB, SI->getDefaultDest()});
- Builder.SetInsertPoint(LookupBB);
+ Builder.SetInsertPoint(LookupBB);
AddPredecessorToBlock(SI->getDefaultDest(), MaskBB, BB);
- }
-
- if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
- // We cached PHINodes in PHIs. To avoid accessing deleted PHINodes later,
- // do not delete PHINodes here.
+ }
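// Standalone sketch of the hole-check bitmask built above, using a plain
// uint64_t instead of an APInt sized to NextPowerOf2(max(7, TableSize - 1)),
// so it assumes TableSize <= 64. A set bit marks a real case; a cleared bit
// is a hole, and the lookup falls through to the default destination.
#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t buildHoleMask(const std::vector<uint64_t> &CaseIndices) {
  uint64_t Mask = 0;
  for (uint64_t Idx : CaseIndices)
    Mask |= (uint64_t)1 << Idx; // fill in a 1 bit for every real case
  return Mask;
}

static bool tableHasEntry(uint64_t Mask, uint64_t TableIndex) {
  // Mirrors the LShr + trunc-to-i1 sequence above: shift the mask right by
  // the table index and test the low bit.
  return (Mask >> TableIndex) & 1;
}

int main() {
  uint64_t Mask = buildHoleMask({0, 2, 3}); // table of size 4 with one hole
  assert(Mask == 0b1101);
  assert(tableHasEntry(Mask, 2));  // real case: go on to the table lookup
  assert(!tableHasEntry(Mask, 1)); // hole: branch to the default destination
  return 0;
}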
+
+ if (!DefaultIsReachable || GeneratingCoveredLookupTable) {
+ // We cached PHINodes in PHIs. To avoid accessing deleted PHINodes later,
+ // do not delete PHINodes here.
SI->getDefaultDest()->removePredecessor(BB,
- /*KeepOneInputPHIs=*/true);
+ /*KeepOneInputPHIs=*/true);
Updates.push_back({DominatorTree::Delete, BB, SI->getDefaultDest()});
- }
-
- bool ReturnedEarly = false;
- for (PHINode *PHI : PHIs) {
- const ResultListTy &ResultList = ResultLists[PHI];
-
- // If using a bitmask, use any value to fill the lookup table holes.
- Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
- StringRef FuncName = Fn->getName();
- SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
- FuncName);
-
- Value *Result = Table.BuildLookup(TableIndex, Builder);
-
- // If the result is used to return immediately from the function, we want to
- // do that right here.
- if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) &&
- PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) {
- Builder.CreateRet(Result);
- ReturnedEarly = true;
- break;
- }
-
- // Do a small peephole optimization: re-use the switch table compare if
- // possible.
- if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
- BasicBlock *PhiBlock = PHI->getParent();
- // Search for compare instructions which use the phi.
- for (auto *User : PHI->users()) {
- reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
- }
- }
-
- PHI->addIncoming(Result, LookupBB);
- }
-
+ }
+
+ bool ReturnedEarly = false;
+ for (PHINode *PHI : PHIs) {
+ const ResultListTy &ResultList = ResultLists[PHI];
+
+ // If using a bitmask, use any value to fill the lookup table holes.
+ Constant *DV = NeedMask ? ResultLists[PHI][0].second : DefaultResults[PHI];
+ StringRef FuncName = Fn->getName();
+ SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultList, DV, DL,
+ FuncName);
+
+ Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+ // If the result is used to return immediately from the function, we want to
+ // do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->user_begin()) &&
+ PHI->user_back() == CommonDest->getFirstNonPHIOrDbg()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ break;
+ }
+
+ // Do a small peephole optimization: re-use the switch table compare if
+ // possible.
+ if (!TableHasHoles && HasDefaultResults && RangeCheckBranch) {
+ BasicBlock *PhiBlock = PHI->getParent();
+ // Search for compare instructions which use the phi.
+ for (auto *User : PHI->users()) {
+ reuseTableCompare(User, PhiBlock, RangeCheckBranch, DV, ResultList);
+ }
+ }
+
+ PHI->addIncoming(Result, LookupBB);
+ }
+
if (!ReturnedEarly) {
- Builder.CreateBr(CommonDest);
+ Builder.CreateBr(CommonDest);
Updates.push_back({DominatorTree::Insert, LookupBB, CommonDest});
}
-
- // Remove the switch.
+
+ // Remove the switch.
SmallSetVector<BasicBlock *, 8> RemovedSuccessors;
- for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
- BasicBlock *Succ = SI->getSuccessor(i);
-
- if (Succ == SI->getDefaultDest())
- continue;
+ for (unsigned i = 0, e = SI->getNumSuccessors(); i < e; ++i) {
+ BasicBlock *Succ = SI->getSuccessor(i);
+
+ if (Succ == SI->getDefaultDest())
+ continue;
Succ->removePredecessor(BB);
RemovedSuccessors.insert(Succ);
- }
- SI->eraseFromParent();
-
+ }
+ SI->eraseFromParent();
+
if (DTU) {
for (BasicBlock *RemovedSuccessor : RemovedSuccessors)
Updates.push_back({DominatorTree::Delete, BB, RemovedSuccessor});
DTU->applyUpdates(Updates);
}
- ++NumLookupTables;
- if (NeedMask)
- ++NumLookupTablesHoles;
- return true;
-}
-
-static bool isSwitchDense(ArrayRef<int64_t> Values) {
- // See also SelectionDAGBuilder::isDense(), which this function was based on.
- uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
- uint64_t Range = Diff + 1;
- uint64_t NumCases = Values.size();
- // 40% is the default density for building a jump table in optsize/minsize mode.
- uint64_t MinDensity = 40;
-
- return NumCases * 100 >= Range * MinDensity;
-}
-
-/// Try to transform a switch that has "holes" in it to a contiguous sequence
-/// of cases.
-///
-/// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
-/// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
-///
-/// This converts a sparse switch into a dense switch which allows better
-/// lowering and could also allow transforming into a lookup table.
-static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
- const DataLayout &DL,
- const TargetTransformInfo &TTI) {
- auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
- if (CondTy->getIntegerBitWidth() > 64 ||
- !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
- return false;
- // Only bother with this optimization if there are more than 3 switch cases;
- // SDAG will only bother creating jump tables for 4 or more cases.
- if (SI->getNumCases() < 4)
- return false;
-
- // This transform is agnostic to the signedness of the input or case values. We
- // can treat the case values as signed or unsigned. We can optimize more common
- // cases such as a sequence crossing zero {-4,0,4,8} if we interpret case values
- // as signed.
- SmallVector<int64_t,4> Values;
- for (auto &C : SI->cases())
- Values.push_back(C.getCaseValue()->getValue().getSExtValue());
- llvm::sort(Values);
-
- // If the switch is already dense, there's nothing useful to do here.
- if (isSwitchDense(Values))
- return false;
-
- // First, transform the values such that they start at zero and ascend.
- int64_t Base = Values[0];
- for (auto &V : Values)
- V -= (uint64_t)(Base);
-
- // Now we have signed numbers that have been shifted so that, given enough
- // precision, there are no negative values. Since the rest of the transform
- // is bitwise only, we switch now to an unsigned representation.
-
- // This transform can be done speculatively because it is so cheap - it
- // results in a single rotate operation being inserted.
- // FIXME: It's possible that optimizing a switch on powers of two might also
- // be beneficial - flag values are often powers of two and we could use a CLZ
- // as the key function.
-
- // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than
- // one element and LLVM disallows duplicate cases, Shift is guaranteed to be
- // less than 64.
- unsigned Shift = 64;
- for (auto &V : Values)
- Shift = std::min(Shift, countTrailingZeros((uint64_t)V));
- assert(Shift < 64);
- if (Shift > 0)
- for (auto &V : Values)
- V = (int64_t)((uint64_t)V >> Shift);
-
- if (!isSwitchDense(Values))
- // Transform didn't create a dense switch.
- return false;
-
- // The obvious transform is to shift the switch condition right and emit a
-  // check that the condition is actually cleanly divisible by the GCD, i.e.
-  // (C & ((1 << Shift) - 1)) == 0,
- // inserting a new CFG edge to handle the case where it didn't divide cleanly.
- //
- // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
- // shift and puts the shifted-off bits in the uppermost bits. If any of these
- // are nonzero then the switch condition will be very large and will hit the
- // default case.
-
- auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
- Builder.SetInsertPoint(SI);
- auto *ShiftC = ConstantInt::get(Ty, Shift);
- auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
- auto *LShr = Builder.CreateLShr(Sub, ShiftC);
- auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
- auto *Rot = Builder.CreateOr(LShr, Shl);
- SI->replaceUsesOfWith(SI->getCondition(), Rot);
-
- for (auto Case : SI->cases()) {
- auto *Orig = Case.getCaseValue();
- auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
- Case.setValue(
- cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
- }
- return true;
-}
-
-bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
- BasicBlock *BB = SI->getParent();
-
- if (isValueEqualityComparison(SI)) {
- // If we only have one predecessor, and if it is a branch on this value,
- // see if that predecessor totally determines the outcome of this switch.
- if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
- if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
- return requestResimplify();
-
- Value *Cond = SI->getCondition();
- if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
- if (SimplifySwitchOnSelect(SI, Select))
- return requestResimplify();
-
- // If the block only contains the switch, see if we can fold the block
- // away into any preds.
- if (SI == &*BB->instructionsWithoutDebug().begin())
- if (FoldValueComparisonIntoPredecessors(SI, Builder))
- return requestResimplify();
- }
-
- // Try to transform the switch into an icmp and a branch.
- if (TurnSwitchRangeIntoICmp(SI, Builder))
- return requestResimplify();
-
- // Remove unreachable cases.
+ ++NumLookupTables;
+ if (NeedMask)
+ ++NumLookupTablesHoles;
+ return true;
+}
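// Roughly the shape SwitchToLookupTable gives the code, written as plain C++
// for illustration; the pass itself emits IR and SwitchLookupTable may pick a
// single-value, linear-map, bitmap or array representation for the table.
#include <cassert>

// Before: a switch whose cases all feed the same PHI with constant results.
static int beforeTransform(unsigned X) {
  switch (X) {
  case 2: return 10;
  case 3: return 30;
  case 4: return 20;
  case 5: return 40;
  default: return -1;
  }
}

// After: rebase onto a zero-based index, range-check it, and read the result
// from a constant table (the "switch.lookup" block above does the read).
static int afterTransform(unsigned X) {
  static const int SwitchTable[4] = {10, 30, 20, 40};
  unsigned TableIndex = X - 2; // MinCaseVal == 2
  if (TableIndex < 4)          // the ULT range check against TableSize
    return SwitchTable[TableIndex];
  return -1;                   // default result
}

int main() {
  for (unsigned X = 0; X < 10; ++X)
    assert(beforeTransform(X) == afterTransform(X));
  return 0;
}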
+
+static bool isSwitchDense(ArrayRef<int64_t> Values) {
+ // See also SelectionDAGBuilder::isDense(), which this function was based on.
+ uint64_t Diff = (uint64_t)Values.back() - (uint64_t)Values.front();
+ uint64_t Range = Diff + 1;
+ uint64_t NumCases = Values.size();
+ // 40% is the default density for building a jump table in optsize/minsize mode.
+ uint64_t MinDensity = 40;
+
+ return NumCases * 100 >= Range * MinDensity;
+}
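// Concrete numbers for the density test above: case values {5, 9, 13, 17}
// span a range of 13 but supply only 4 cases, and 4 * 100 < 13 * 40, so the
// switch counts as sparse; once ReduceSwitchRange (below) rewrites them to
// {0, 1, 2, 3} the range is 4 and the same test passes. Standalone sketch,
// not the LLVM helper itself.
#include <cassert>
#include <cstdint>
#include <vector>

static bool isDense(const std::vector<int64_t> &SortedValues) {
  uint64_t Range =
      (uint64_t)SortedValues.back() - (uint64_t)SortedValues.front() + 1;
  return SortedValues.size() * 100 >= Range * 40; // MinDensity == 40%
}

int main() {
  assert(!isDense({5, 9, 13, 17})); // sparse: 400 < 520
  assert(isDense({0, 1, 2, 3}));    // dense after range reduction: 400 >= 160
  return 0;
}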
+
+/// Try to transform a switch that has "holes" in it to a contiguous sequence
+/// of cases.
+///
+/// A switch such as: switch(i) {case 5: case 9: case 13: case 17:} can be
+/// range-reduced to: switch ((i-5) / 4) {case 0: case 1: case 2: case 3:}.
+///
+/// This converts a sparse switch into a dense switch which allows better
+/// lowering and could also allow transforming into a lookup table.
+static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
+ const DataLayout &DL,
+ const TargetTransformInfo &TTI) {
+ auto *CondTy = cast<IntegerType>(SI->getCondition()->getType());
+ if (CondTy->getIntegerBitWidth() > 64 ||
+ !DL.fitsInLegalInteger(CondTy->getIntegerBitWidth()))
+ return false;
+ // Only bother with this optimization if there are more than 3 switch cases;
+ // SDAG will only bother creating jump tables for 4 or more cases.
+ if (SI->getNumCases() < 4)
+ return false;
+
+ // This transform is agnostic to the signedness of the input or case values. We
+ // can treat the case values as signed or unsigned. We can optimize more common
+ // cases such as a sequence crossing zero {-4,0,4,8} if we interpret case values
+ // as signed.
+ SmallVector<int64_t,4> Values;
+ for (auto &C : SI->cases())
+ Values.push_back(C.getCaseValue()->getValue().getSExtValue());
+ llvm::sort(Values);
+
+ // If the switch is already dense, there's nothing useful to do here.
+ if (isSwitchDense(Values))
+ return false;
+
+ // First, transform the values such that they start at zero and ascend.
+ int64_t Base = Values[0];
+ for (auto &V : Values)
+ V -= (uint64_t)(Base);
+
+ // Now we have signed numbers that have been shifted so that, given enough
+ // precision, there are no negative values. Since the rest of the transform
+ // is bitwise only, we switch now to an unsigned representation.
+
+ // This transform can be done speculatively because it is so cheap - it
+ // results in a single rotate operation being inserted.
+ // FIXME: It's possible that optimizing a switch on powers of two might also
+ // be beneficial - flag values are often powers of two and we could use a CLZ
+ // as the key function.
+
+ // countTrailingZeros(0) returns 64. As Values is guaranteed to have more than
+ // one element and LLVM disallows duplicate cases, Shift is guaranteed to be
+ // less than 64.
+ unsigned Shift = 64;
+ for (auto &V : Values)
+ Shift = std::min(Shift, countTrailingZeros((uint64_t)V));
+ assert(Shift < 64);
+ if (Shift > 0)
+ for (auto &V : Values)
+ V = (int64_t)((uint64_t)V >> Shift);
+
+ if (!isSwitchDense(Values))
+ // Transform didn't create a dense switch.
+ return false;
+
+ // The obvious transform is to shift the switch condition right and emit a
+  // check that the condition is actually cleanly divisible by the GCD, i.e.
+  // (C & ((1 << Shift) - 1)) == 0,
+ // inserting a new CFG edge to handle the case where it didn't divide cleanly.
+ //
+ // A cheaper way of doing this is a simple ROTR(C, Shift). This performs the
+ // shift and puts the shifted-off bits in the uppermost bits. If any of these
+ // are nonzero then the switch condition will be very large and will hit the
+ // default case.
+
+ auto *Ty = cast<IntegerType>(SI->getCondition()->getType());
+ Builder.SetInsertPoint(SI);
+ auto *ShiftC = ConstantInt::get(Ty, Shift);
+ auto *Sub = Builder.CreateSub(SI->getCondition(), ConstantInt::get(Ty, Base));
+ auto *LShr = Builder.CreateLShr(Sub, ShiftC);
+ auto *Shl = Builder.CreateShl(Sub, Ty->getBitWidth() - Shift);
+ auto *Rot = Builder.CreateOr(LShr, Shl);
+ SI->replaceUsesOfWith(SI->getCondition(), Rot);
+
+ for (auto Case : SI->cases()) {
+ auto *Orig = Case.getCaseValue();
+ auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
+ Case.setValue(
+ cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
+ }
+ return true;
+}
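// Standalone sketch of the rotate trick above for the cases {5, 9, 13, 17}:
// Base = 5 and Shift = 2, so each case becomes (V - 5) >> 2 and the condition
// is rewritten as a rotate-right of (Cond - 5) by 2. Any input whose low two
// bits are not zero after subtracting Base rotates into a huge value, misses
// every rewritten case and lands in the default destination, which is why no
// explicit divisibility check is needed. Plain C++, fixed 32-bit width,
// assumes 1 <= Shift <= 31 so neither shift below is undefined.
#include <cassert>
#include <cstdint>

static uint32_t rotateCondition(uint32_t Cond, uint32_t Base, unsigned Shift) {
  uint32_t Sub = Cond - Base;
  // Same as CreateOr(CreateLShr(Sub, Shift), CreateShl(Sub, 32 - Shift)).
  return (Sub >> Shift) | (Sub << (32 - Shift));
}

int main() {
  // The original case values map onto the dense set {0, 1, 2, 3}.
  assert(rotateCondition(5, 5, 2) == 0);
  assert(rotateCondition(9, 5, 2) == 1);
  assert(rotateCondition(13, 5, 2) == 2);
  assert(rotateCondition(17, 5, 2) == 3);
  // A value off the 4-stride grid rotates its low bits into the top bits.
  assert(rotateCondition(6, 5, 2) > 3);
  return 0;
}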
+
+bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
+ BasicBlock *BB = SI->getParent();
+
+ if (isValueEqualityComparison(SI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+ return requestResimplify();
+
+ Value *Cond = SI->getCondition();
+ if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+ if (SimplifySwitchOnSelect(SI, Select))
+ return requestResimplify();
+
+ // If the block only contains the switch, see if we can fold the block
+ // away into any preds.
+ if (SI == &*BB->instructionsWithoutDebug().begin())
+ if (FoldValueComparisonIntoPredecessors(SI, Builder))
+ return requestResimplify();
+ }
+
+ // Try to transform the switch into an icmp and a branch.
+ if (TurnSwitchRangeIntoICmp(SI, Builder))
+ return requestResimplify();
+
+ // Remove unreachable cases.
if (eliminateDeadSwitchCases(SI, DTU, Options.AC, DL))
- return requestResimplify();
-
+ return requestResimplify();
+
if (switchToSelect(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
-
- if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI))
- return requestResimplify();
-
- // The conversion from switch to lookup tables results in difficult-to-analyze
- // code and makes pruning branches much harder. This is a problem if the
- // switch expression itself can still be restricted as a result of inlining or
- // CVP. Therefore, only apply this transformation during late stages of the
- // optimisation pipeline.
- if (Options.ConvertSwitchToLookupTable &&
+ return requestResimplify();
+
+ if (Options.ForwardSwitchCondToPhi && ForwardSwitchConditionToPHI(SI))
+ return requestResimplify();
+
+ // The conversion from switch to lookup tables results in difficult-to-analyze
+ // code and makes pruning branches much harder. This is a problem if the
+ // switch expression itself can still be restricted as a result of inlining or
+ // CVP. Therefore, only apply this transformation during late stages of the
+ // optimisation pipeline.
+ if (Options.ConvertSwitchToLookupTable &&
SwitchToLookupTable(SI, Builder, DTU, DL, TTI))
- return requestResimplify();
-
- if (ReduceSwitchRange(SI, Builder, DL, TTI))
- return requestResimplify();
-
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
- BasicBlock *BB = IBI->getParent();
- bool Changed = false;
-
- // Eliminate redundant destinations.
- SmallPtrSet<Value *, 8> Succs;
+ return requestResimplify();
+
+ if (ReduceSwitchRange(SI, Builder, DL, TTI))
+ return requestResimplify();
+
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
+ BasicBlock *BB = IBI->getParent();
+ bool Changed = false;
+
+ // Eliminate redundant destinations.
+ SmallPtrSet<Value *, 8> Succs;
SmallSetVector<BasicBlock *, 8> RemovedSuccs;
- for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
- BasicBlock *Dest = IBI->getDestination(i);
- if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
+ for (unsigned i = 0, e = IBI->getNumDestinations(); i != e; ++i) {
+ BasicBlock *Dest = IBI->getDestination(i);
+ if (!Dest->hasAddressTaken() || !Succs.insert(Dest).second) {
if (!Dest->hasAddressTaken())
RemovedSuccs.insert(Dest);
- Dest->removePredecessor(BB);
- IBI->removeDestination(i);
- --i;
- --e;
- Changed = true;
- }
- }
-
+ Dest->removePredecessor(BB);
+ IBI->removeDestination(i);
+ --i;
+ --e;
+ Changed = true;
+ }
+ }
+
if (DTU) {
std::vector<DominatorTree::UpdateType> Updates;
Updates.reserve(RemovedSuccs.size());
@@ -6103,329 +6103,329 @@ bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
DTU->applyUpdates(Updates);
}
- if (IBI->getNumDestinations() == 0) {
- // If the indirectbr has no successors, change it to unreachable.
- new UnreachableInst(IBI->getContext(), IBI);
- EraseTerminatorAndDCECond(IBI);
- return true;
- }
-
- if (IBI->getNumDestinations() == 1) {
- // If the indirectbr has one successor, change it to a direct branch.
- BranchInst::Create(IBI->getDestination(0), IBI);
- EraseTerminatorAndDCECond(IBI);
- return true;
- }
-
- if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
- if (SimplifyIndirectBrOnSelect(IBI, SI))
- return requestResimplify();
- }
- return Changed;
-}
-
-/// Given a block with only a single landing pad and an unconditional branch,
-/// try to find another basic block which this one can be merged with. This
-/// handles cases where we have multiple invokes with unique landing pads, but
-/// a shared handler.
-///
-/// We specifically choose to not worry about merging non-empty blocks
-/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
-/// practice, the optimizer produces empty landing pad blocks quite frequently
-/// when dealing with exception dense code. (see: instcombine, gvn, if-else
-/// sinking in this file)
-///
-/// This is primarily a code size optimization. We need to avoid performing
-/// any transform which might inhibit optimization (such as our ability to
-/// specialize a particular handler via tail commoning). We do this by not
-/// merging any blocks which require us to introduce a phi. Since the same
-/// values are flowing through both blocks, we don't lose any ability to
-/// specialize. If anything, we make such specialization more likely.
-///
-/// TODO - This transformation could remove entries from a phi in the target
-/// block when the inputs in the phi are the same for the two blocks being
-/// merged. In some cases, this could result in removal of the PHI entirely.
-static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
+ if (IBI->getNumDestinations() == 0) {
+ // If the indirectbr has no successors, change it to unreachable.
+ new UnreachableInst(IBI->getContext(), IBI);
+ EraseTerminatorAndDCECond(IBI);
+ return true;
+ }
+
+ if (IBI->getNumDestinations() == 1) {
+ // If the indirectbr has one successor, change it to a direct branch.
+ BranchInst::Create(IBI->getDestination(0), IBI);
+ EraseTerminatorAndDCECond(IBI);
+ return true;
+ }
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
+ if (SimplifyIndirectBrOnSelect(IBI, SI))
+ return requestResimplify();
+ }
+ return Changed;
+}
+
+/// Given a block with only a single landing pad and an unconditional branch,
+/// try to find another basic block which this one can be merged with. This
+/// handles cases where we have multiple invokes with unique landing pads, but
+/// a shared handler.
+///
+/// We specifically choose to not worry about merging non-empty blocks
+/// here. That is a PRE/scheduling problem and is best solved elsewhere. In
+/// practice, the optimizer produces empty landing pad blocks quite frequently
+/// when dealing with exception dense code. (see: instcombine, gvn, if-else
+/// sinking in this file)
+///
+/// This is primarily a code size optimization. We need to avoid performing
+/// any transform which might inhibit optimization (such as our ability to
+/// specialize a particular handler via tail commoning). We do this by not
+/// merging any blocks which require us to introduce a phi. Since the same
+/// values are flowing through both blocks, we don't lose any ability to
+/// specialize. If anything, we make such specialization more likely.
+///
+/// TODO - This transformation could remove entries from a phi in the target
+/// block when the inputs in the phi are the same for the two blocks being
+/// merged. In some cases, this could result in removal of the PHI entirely.
+static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
BasicBlock *BB, DomTreeUpdater *DTU) {
- auto Succ = BB->getUniqueSuccessor();
- assert(Succ);
- // If there's a phi in the successor block, we'd likely have to introduce
- // a phi into the merged landing pad block.
- if (isa<PHINode>(*Succ->begin()))
- return false;
-
- for (BasicBlock *OtherPred : predecessors(Succ)) {
- if (BB == OtherPred)
- continue;
- BasicBlock::iterator I = OtherPred->begin();
- LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
- if (!LPad2 || !LPad2->isIdenticalTo(LPad))
- continue;
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
- BranchInst *BI2 = dyn_cast<BranchInst>(I);
- if (!BI2 || !BI2->isIdenticalTo(BI))
- continue;
-
+ auto Succ = BB->getUniqueSuccessor();
+ assert(Succ);
+ // If there's a phi in the successor block, we'd likely have to introduce
+ // a phi into the merged landing pad block.
+ if (isa<PHINode>(*Succ->begin()))
+ return false;
+
+ for (BasicBlock *OtherPred : predecessors(Succ)) {
+ if (BB == OtherPred)
+ continue;
+ BasicBlock::iterator I = OtherPred->begin();
+ LandingPadInst *LPad2 = dyn_cast<LandingPadInst>(I);
+ if (!LPad2 || !LPad2->isIdenticalTo(LPad))
+ continue;
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
+ BranchInst *BI2 = dyn_cast<BranchInst>(I);
+ if (!BI2 || !BI2->isIdenticalTo(BI))
+ continue;
+
std::vector<DominatorTree::UpdateType> Updates;
- // We've found an identical block. Update our predecessors to take that
- // path instead and make ourselves dead.
- SmallPtrSet<BasicBlock *, 16> Preds;
- Preds.insert(pred_begin(BB), pred_end(BB));
- for (BasicBlock *Pred : Preds) {
- InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
- assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
- "unexpected successor");
- II->setUnwindDest(OtherPred);
+ // We've found an identical block. Update our predecessors to take that
+ // path instead and make ourselves dead.
+ SmallPtrSet<BasicBlock *, 16> Preds;
+ Preds.insert(pred_begin(BB), pred_end(BB));
+ for (BasicBlock *Pred : Preds) {
+ InvokeInst *II = cast<InvokeInst>(Pred->getTerminator());
+ assert(II->getNormalDest() != BB && II->getUnwindDest() == BB &&
+ "unexpected successor");
+ II->setUnwindDest(OtherPred);
Updates.push_back({DominatorTree::Insert, Pred, OtherPred});
Updates.push_back({DominatorTree::Delete, Pred, BB});
- }
-
- // The debug info in OtherPred doesn't cover the merged control flow that
- // used to go through BB. We need to delete it or update it.
- for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
- Instruction &Inst = *I;
- I++;
- if (isa<DbgInfoIntrinsic>(Inst))
- Inst.eraseFromParent();
- }
-
- SmallPtrSet<BasicBlock *, 16> Succs;
- Succs.insert(succ_begin(BB), succ_end(BB));
- for (BasicBlock *Succ : Succs) {
- Succ->removePredecessor(BB);
+ }
+
+ // The debug info in OtherPred doesn't cover the merged control flow that
+ // used to go through BB. We need to delete it or update it.
+ for (auto I = OtherPred->begin(), E = OtherPred->end(); I != E;) {
+ Instruction &Inst = *I;
+ I++;
+ if (isa<DbgInfoIntrinsic>(Inst))
+ Inst.eraseFromParent();
+ }
+
+ SmallPtrSet<BasicBlock *, 16> Succs;
+ Succs.insert(succ_begin(BB), succ_end(BB));
+ for (BasicBlock *Succ : Succs) {
+ Succ->removePredecessor(BB);
Updates.push_back({DominatorTree::Delete, BB, Succ});
- }
-
- IRBuilder<> Builder(BI);
- Builder.CreateUnreachable();
- BI->eraseFromParent();
+ }
+
+ IRBuilder<> Builder(BI);
+ Builder.CreateUnreachable();
+ BI->eraseFromParent();
if (DTU)
DTU->applyUpdates(Updates);
- return true;
- }
- return false;
-}
-
-bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) {
- return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder)
- : simplifyCondBranch(Branch, Builder);
-}
-
-bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
- IRBuilder<> &Builder) {
- BasicBlock *BB = BI->getParent();
- BasicBlock *Succ = BI->getSuccessor(0);
-
- // If the Terminator is the only non-phi instruction, simplify the block.
- // If LoopHeader is provided, check if the block or its successor is a loop
- // header. (This is for early invocations before loop simplify and
- // vectorization to keep canonical loop forms for nested loops. These blocks
- // can be eliminated when the pass is invoked later in the back-end.)
-  // Note that if BB has only one predecessor then we do not introduce a new
- // backedge, so we can eliminate BB.
- bool NeedCanonicalLoop =
- Options.NeedCanonicalLoop &&
+ return true;
+ }
+ return false;
+}
+
+bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) {
+ return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder)
+ : simplifyCondBranch(Branch, Builder);
+}
+
+bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
+ IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+ BasicBlock *Succ = BI->getSuccessor(0);
+
+ // If the Terminator is the only non-phi instruction, simplify the block.
+ // If LoopHeader is provided, check if the block or its successor is a loop
+ // header. (This is for early invocations before loop simplify and
+ // vectorization to keep canonical loop forms for nested loops. These blocks
+ // can be eliminated when the pass is invoked later in the back-end.)
+  // Note that if BB has only one predecessor then we do not introduce a new
+ // backedge, so we can eliminate BB.
+ bool NeedCanonicalLoop =
+ Options.NeedCanonicalLoop &&
(!LoopHeaders.empty() && BB->hasNPredecessorsOrMore(2) &&
(is_contained(LoopHeaders, BB) || is_contained(LoopHeaders, Succ)));
- BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
- if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
+ BasicBlock::iterator I = BB->getFirstNonPHIOrDbg()->getIterator();
+ if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
!NeedCanonicalLoop && TryToSimplifyUncondBranchFromEmptyBlock(BB, DTU))
- return true;
-
- // If the only instruction in the block is a seteq/setne comparison against a
- // constant, try to simplify the block.
- if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
- if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
- if (I->isTerminator() &&
- tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
- return true;
- }
-
- // See if we can merge an empty landing pad block with another which is
- // equivalent.
- if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
- for (++I; isa<DbgInfoIntrinsic>(I); ++I)
- ;
+ return true;
+
+ // If the only instruction in the block is a seteq/setne comparison against a
+ // constant, try to simplify the block.
+ if (ICmpInst *ICI = dyn_cast<ICmpInst>(I))
+ if (ICI->isEquality() && isa<ConstantInt>(ICI->getOperand(1))) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
+ if (I->isTerminator() &&
+ tryToSimplifyUncondBranchWithICmpInIt(ICI, Builder))
+ return true;
+ }
+
+ // See if we can merge an empty landing pad block with another which is
+ // equivalent.
+ if (LandingPadInst *LPad = dyn_cast<LandingPadInst>(I)) {
+ for (++I; isa<DbgInfoIntrinsic>(I); ++I)
+ ;
if (I->isTerminator() && TryToMergeLandingPad(LPad, BI, BB, DTU))
- return true;
- }
-
- // If this basic block is ONLY a compare and a branch, and if a predecessor
- // branches to us and our successor, fold the comparison into the
- // predecessor and use logical operations to update the incoming value
- // for PHI nodes in common successor.
+ return true;
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and our successor, fold the comparison into the
+ // predecessor and use logical operations to update the incoming value
+ // for PHI nodes in common successor.
if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI,
Options.BonusInstThreshold))
- return requestResimplify();
- return false;
-}
-
-static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
- BasicBlock *PredPred = nullptr;
- for (auto *P : predecessors(BB)) {
- BasicBlock *PPred = P->getSinglePredecessor();
- if (!PPred || (PredPred && PredPred != PPred))
- return nullptr;
- PredPred = PPred;
- }
- return PredPred;
-}
-
-bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
- BasicBlock *BB = BI->getParent();
- if (!Options.SimplifyCondBranch)
- return false;
-
- // Conditional branch
- if (isValueEqualityComparison(BI)) {
- // If we only have one predecessor, and if it is a branch on this value,
- // see if that predecessor totally determines the outcome of this
- // switch.
- if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
- if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
- return requestResimplify();
-
- // This block must be empty, except for the setcond inst, if it exists.
- // Ignore dbg intrinsics.
- auto I = BB->instructionsWithoutDebug().begin();
- if (&*I == BI) {
- if (FoldValueComparisonIntoPredecessors(BI, Builder))
- return requestResimplify();
- } else if (&*I == cast<Instruction>(BI->getCondition())) {
- ++I;
- if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
- return requestResimplify();
- }
- }
-
- // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
- if (SimplifyBranchOnICmpChain(BI, Builder, DL))
- return true;
-
- // If this basic block has dominating predecessor blocks and the dominating
- // blocks' conditions imply BI's condition, we know the direction of BI.
- Optional<bool> Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL);
- if (Imp) {
- // Turn this into a branch on constant.
- auto *OldCond = BI->getCondition();
- ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext())
- : ConstantInt::getFalse(BB->getContext());
- BI->setCondition(TorF);
- RecursivelyDeleteTriviallyDeadInstructions(OldCond);
- return requestResimplify();
- }
-
- // If this basic block is ONLY a compare and a branch, and if a predecessor
- // branches to us and one of our successors, fold the comparison into the
- // predecessor and use logical operations to pick the right destination.
+ return requestResimplify();
+ return false;
+}
+
+static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
+ BasicBlock *PredPred = nullptr;
+ for (auto *P : predecessors(BB)) {
+ BasicBlock *PPred = P->getSinglePredecessor();
+ if (!PPred || (PredPred && PredPred != PPred))
+ return nullptr;
+ PredPred = PPred;
+ }
+ return PredPred;
+}
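// Toy model of the check above: every predecessor of BB must itself have
// exactly one predecessor, and all of those must be the same block, as in
// PredPred -> {Then, Else} -> BB, the diamond that mergeConditionalStores
// looks for. Uses a plain adjacency map instead of LLVM's CFG.
#include <cassert>
#include <map>
#include <string>
#include <vector>

using PredMap = std::map<std::string, std::vector<std::string>>;

static std::string commonGrandPredecessor(const PredMap &Preds,
                                          const std::string &BB) {
  std::string PredPred;
  for (const std::string &P : Preds.at(BB)) {
    const std::vector<std::string> &PP = Preds.at(P);
    if (PP.size() != 1 || (!PredPred.empty() && PredPred != PP[0]))
      return ""; // no single common source
    PredPred = PP[0];
  }
  return PredPred;
}

int main() {
  PredMap Preds = {{"entry", {}},
                   {"then", {"entry"}},
                   {"else", {"entry"}},
                   {"merge", {"then", "else"}}};
  assert(commonGrandPredecessor(Preds, "merge") == "entry");
  return 0;
}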
+
+bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+ BasicBlock *BB = BI->getParent();
+ if (!Options.SimplifyCondBranch)
+ return false;
+
+ // Conditional branch
+ if (isValueEqualityComparison(BI)) {
+ // If we only have one predecessor, and if it is a branch on this value,
+ // see if that predecessor totally determines the outcome of this
+ // switch.
+ if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+ if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
+ return requestResimplify();
+
+ // This block must be empty, except for the setcond inst, if it exists.
+ // Ignore dbg intrinsics.
+ auto I = BB->instructionsWithoutDebug().begin();
+ if (&*I == BI) {
+ if (FoldValueComparisonIntoPredecessors(BI, Builder))
+ return requestResimplify();
+ } else if (&*I == cast<Instruction>(BI->getCondition())) {
+ ++I;
+ if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
+ return requestResimplify();
+ }
+ }
+
+ // Try to turn "br (X == 0 | X == 1), T, F" into a switch instruction.
+ if (SimplifyBranchOnICmpChain(BI, Builder, DL))
+ return true;
+
+ // If this basic block has dominating predecessor blocks and the dominating
+ // blocks' conditions imply BI's condition, we know the direction of BI.
+ Optional<bool> Imp = isImpliedByDomCondition(BI->getCondition(), BI, DL);
+ if (Imp) {
+ // Turn this into a branch on constant.
+ auto *OldCond = BI->getCondition();
+ ConstantInt *TorF = *Imp ? ConstantInt::getTrue(BB->getContext())
+ : ConstantInt::getFalse(BB->getContext());
+ BI->setCondition(TorF);
+ RecursivelyDeleteTriviallyDeadInstructions(OldCond);
+ return requestResimplify();
+ }
+
+ // If this basic block is ONLY a compare and a branch, and if a predecessor
+ // branches to us and one of our successors, fold the comparison into the
+ // predecessor and use logical operations to pick the right destination.
if (FoldBranchToCommonDest(BI, DTU, /*MSSAU=*/nullptr, &TTI,
Options.BonusInstThreshold))
- return requestResimplify();
-
- // We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
- if (BI->getSuccessor(0)->getSinglePredecessor()) {
- if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ return requestResimplify();
+
+ // We have a conditional branch to two blocks that are only reachable
+ // from BI. We know that the condbr dominates the two blocks, so see if
+ // there is any identical code in the "then" and "else" blocks. If so, we
+ // can hoist it up to the branching block.
+ if (BI->getSuccessor(0)->getSinglePredecessor()) {
+ if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && Options.HoistCommonInsts)
if (HoistThenElseCodeToIf(BI, TTI))
return requestResimplify();
- } else {
- // If Successor #1 has multiple preds, we may be able to conditionally
- // execute Successor #0 if it branches to Successor #1.
- Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
- if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
- return requestResimplify();
- }
- } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
- // If Successor #0 has multiple preds, we may be able to conditionally
- // execute Successor #1 if it branches to Successor #0.
- Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
- if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
- if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
- return requestResimplify();
- }
-
- // If this is a branch on a phi node in the current block, thread control
- // through this block if any PHI node entries are constants.
- if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
- if (PN->getParent() == BI->getParent())
+ } else {
+ // If Successor #1 has multiple preds, we may be able to conditionally
+ // execute Successor #0 if it branches to Successor #1.
+ Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
+ if (Succ0TI->getNumSuccessors() == 1 &&
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0), TTI))
+ return requestResimplify();
+ }
+ } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
+ // If Successor #0 has multiple preds, we may be able to conditionally
+ // execute Successor #1 if it branches to Successor #0.
+ Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
+ if (Succ1TI->getNumSuccessors() == 1 &&
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1), TTI))
+ return requestResimplify();
+ }
+
+ // If this is a branch on a phi node in the current block, thread control
+ // through this block if any PHI node entries are constants.
+ if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
+ if (PN->getParent() == BI->getParent())
if (FoldCondBranchOnPHI(BI, DTU, DL, Options.AC))
- return requestResimplify();
-
- // Scan predecessor blocks for conditional branches.
- for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
- if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ return requestResimplify();
+
+ // Scan predecessor blocks for conditional branches.
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
if (SimplifyCondBranchToCondBranch(PBI, BI, DTU, DL, TTI))
- return requestResimplify();
-
- // Look for diamond patterns.
- if (MergeCondStores)
- if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
- if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
- if (PBI != BI && PBI->isConditional())
+ return requestResimplify();
+
+ // Look for diamond patterns.
+ if (MergeCondStores)
+ if (BasicBlock *PrevBB = allPredecessorsComeFromSameSource(BB))
+ if (BranchInst *PBI = dyn_cast<BranchInst>(PrevBB->getTerminator()))
+ if (PBI != BI && PBI->isConditional())
if (mergeConditionalStores(PBI, BI, DTU, DL, TTI))
- return requestResimplify();
-
- return false;
-}
-
-/// Check if passing a value to an instruction will cause undefined behavior.
+ return requestResimplify();
+
+ return false;
+}
+
+/// Check if passing a value to an instruction will cause undefined behavior.
static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified) {
- Constant *C = dyn_cast<Constant>(V);
- if (!C)
- return false;
-
- if (I->use_empty())
- return false;
-
- if (C->isNullValue() || isa<UndefValue>(C)) {
- // Only look at the first use, avoid hurting compile time with long uselists
- User *Use = *I->user_begin();
-
- // Now make sure that there are no instructions in between that can alter
- // control flow (eg. calls)
- for (BasicBlock::iterator
- i = ++BasicBlock::iterator(I),
- UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
- i != UI; ++i)
- if (i == I->getParent()->end() || i->mayHaveSideEffects())
- return false;
-
- // Look through GEPs. A load from a GEP derived from NULL is still undefined
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
+ Constant *C = dyn_cast<Constant>(V);
+ if (!C)
+ return false;
+
+ if (I->use_empty())
+ return false;
+
+ if (C->isNullValue() || isa<UndefValue>(C)) {
+ // Only look at the first use, avoid hurting compile time with long uselists
+ User *Use = *I->user_begin();
+
+ // Now make sure that there are no instructions in between that can alter
+ // control flow (eg. calls)
+ for (BasicBlock::iterator
+ i = ++BasicBlock::iterator(I),
+ UI = BasicBlock::iterator(dyn_cast<Instruction>(Use));
+ i != UI; ++i)
+ if (i == I->getParent()->end() || i->mayHaveSideEffects())
+ return false;
+
+ // Look through GEPs. A load from a GEP derived from NULL is still undefined
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Use))
if (GEP->getPointerOperand() == I) {
if (!GEP->isInBounds() || !GEP->hasAllZeroIndices())
PtrValueMayBeModified = true;
return passingValueIsAlwaysUndefined(V, GEP, PtrValueMayBeModified);
}
-
- // Look through bitcasts.
- if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
+
+ // Look through bitcasts.
+ if (BitCastInst *BC = dyn_cast<BitCastInst>(Use))
return passingValueIsAlwaysUndefined(V, BC, PtrValueMayBeModified);
-
- // Load from null is undefined.
- if (LoadInst *LI = dyn_cast<LoadInst>(Use))
- if (!LI->isVolatile())
- return !NullPointerIsDefined(LI->getFunction(),
- LI->getPointerAddressSpace());
-
- // Store to null is undefined.
- if (StoreInst *SI = dyn_cast<StoreInst>(Use))
- if (!SI->isVolatile())
- return (!NullPointerIsDefined(SI->getFunction(),
- SI->getPointerAddressSpace())) &&
- SI->getPointerOperand() == I;
-
+
+ // Load from null is undefined.
+ if (LoadInst *LI = dyn_cast<LoadInst>(Use))
+ if (!LI->isVolatile())
+ return !NullPointerIsDefined(LI->getFunction(),
+ LI->getPointerAddressSpace());
+
+ // Store to null is undefined.
+ if (StoreInst *SI = dyn_cast<StoreInst>(Use))
+ if (!SI->isVolatile())
+ return (!NullPointerIsDefined(SI->getFunction(),
+ SI->getPointerAddressSpace())) &&
+ SI->getPointerOperand() == I;
+
if (auto *CB = dyn_cast<CallBase>(Use)) {
if (C->isNullValue() && NullPointerIsDefined(CB->getFunction()))
return false;
@@ -6455,114 +6455,114 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValu
}
}
}
- }
- return false;
-}
-
-/// If BB has an incoming value that will always trigger undefined behavior
-/// (eg. null pointer dereference), remove the branch leading here.
+ }
+ return false;
+}
+
+/// If BB has an incoming value that will always trigger undefined behavior
+/// (eg. null pointer dereference), remove the branch leading here.
static bool removeUndefIntroducingPredecessor(BasicBlock *BB,
DomTreeUpdater *DTU) {
- for (PHINode &PHI : BB->phis())
- for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
- if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
+ for (PHINode &PHI : BB->phis())
+ for (unsigned i = 0, e = PHI.getNumIncomingValues(); i != e; ++i)
+ if (passingValueIsAlwaysUndefined(PHI.getIncomingValue(i), &PHI)) {
BasicBlock *Predecessor = PHI.getIncomingBlock(i);
Instruction *T = Predecessor->getTerminator();
- IRBuilder<> Builder(T);
- if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
+ IRBuilder<> Builder(T);
+ if (BranchInst *BI = dyn_cast<BranchInst>(T)) {
BB->removePredecessor(Predecessor);
- // Turn uncoditional branches into unreachables and remove the dead
- // destination from conditional branches.
- if (BI->isUnconditional())
- Builder.CreateUnreachable();
- else
- Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
- : BI->getSuccessor(0));
- BI->eraseFromParent();
+ // Turn uncoditional branches into unreachables and remove the dead
+ // destination from conditional branches.
+ if (BI->isUnconditional())
+ Builder.CreateUnreachable();
+ else
+ Builder.CreateBr(BI->getSuccessor(0) == BB ? BI->getSuccessor(1)
+ : BI->getSuccessor(0));
+ BI->eraseFromParent();
if (DTU)
DTU->applyUpdates({{DominatorTree::Delete, Predecessor, BB}});
- return true;
- }
- // TODO: SwitchInst.
- }
-
- return false;
-}
-
+ return true;
+ }
+ // TODO: SwitchInst.
+ }
+
+ return false;
+}
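// The source-level shape these two functions target, roughly: a PHI (modelled
// here by the ternary) that receives a null value from one predecessor and is
// then unconditionally dereferenced. The branch coming from the null-producing
// predecessor can be deleted, since executing it would be undefined behavior.
// The null path below is never taken at run time; it only shows the pattern.
#include <cassert>

static int x = 42;

static int readThrough(bool cond) {
  int *p = cond ? &x : nullptr; // in IR this can appear as a PHI of {&x, null}
  return *p;                    // unconditional load: the null path is UB
}

int main() {
  assert(readThrough(true) == 42);
  return 0;
}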
+
bool SimplifyCFGOpt::simplifyOnceImpl(BasicBlock *BB) {
- bool Changed = false;
-
- assert(BB && BB->getParent() && "Block not embedded in function!");
- assert(BB->getTerminator() && "Degenerate basic block encountered!");
-
- // Remove basic blocks that have no predecessors (except the entry block)...
-  // or that just have themselves as a predecessor. These are unreachable.
- if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
- BB->getSinglePredecessor() == BB) {
- LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
+ bool Changed = false;
+
+ assert(BB && BB->getParent() && "Block not embedded in function!");
+ assert(BB->getTerminator() && "Degenerate basic block encountered!");
+
+ // Remove basic blocks that have no predecessors (except the entry block)...
+  // or that just have themselves as a predecessor. These are unreachable.
+ if ((pred_empty(BB) && BB != &BB->getParent()->getEntryBlock()) ||
+ BB->getSinglePredecessor() == BB) {
+ LLVM_DEBUG(dbgs() << "Removing BB: \n" << *BB);
DeleteDeadBlock(BB, DTU);
- return true;
- }
-
- // Check to see if we can constant propagate this terminator instruction
- // away...
+ return true;
+ }
+
+ // Check to see if we can constant propagate this terminator instruction
+ // away...
Changed |= ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true,
/*TLI=*/nullptr, DTU);
-
- // Check for and eliminate duplicate PHI nodes in this block.
- Changed |= EliminateDuplicatePHINodes(BB);
-
- // Check for and remove branches that will always cause undefined behavior.
+
+ // Check for and eliminate duplicate PHI nodes in this block.
+ Changed |= EliminateDuplicatePHINodes(BB);
+
+ // Check for and remove branches that will always cause undefined behavior.
Changed |= removeUndefIntroducingPredecessor(BB, DTU);
-
- // Merge basic blocks into their predecessor if there is only one distinct
- // pred, and if there is only one distinct successor of the predecessor, and
- // if there are no PHI nodes.
+
+ // Merge basic blocks into their predecessor if there is only one distinct
+ // pred, and if there is only one distinct successor of the predecessor, and
+ // if there are no PHI nodes.
if (MergeBlockIntoPredecessor(BB, DTU))
- return true;
-
- if (SinkCommon && Options.SinkCommonInsts)
+ return true;
+
+ if (SinkCommon && Options.SinkCommonInsts)
Changed |= SinkCommonCodeFromPredecessors(BB, DTU);
-
- IRBuilder<> Builder(BB);
-
- if (Options.FoldTwoEntryPHINode) {
- // If there is a trivial two-entry PHI node in this basic block, and we can
- // eliminate it, do so now.
- if (auto *PN = dyn_cast<PHINode>(BB->begin()))
- if (PN->getNumIncomingValues() == 2)
+
+ IRBuilder<> Builder(BB);
+
+ if (Options.FoldTwoEntryPHINode) {
+ // If there is a trivial two-entry PHI node in this basic block, and we can
+ // eliminate it, do so now.
+ if (auto *PN = dyn_cast<PHINode>(BB->begin()))
+ if (PN->getNumIncomingValues() == 2)
Changed |= FoldTwoEntryPHINode(PN, TTI, DTU, DL);
- }
-
- Instruction *Terminator = BB->getTerminator();
- Builder.SetInsertPoint(Terminator);
- switch (Terminator->getOpcode()) {
- case Instruction::Br:
- Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder);
- break;
- case Instruction::Ret:
- Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder);
- break;
- case Instruction::Resume:
- Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder);
- break;
- case Instruction::CleanupRet:
- Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator));
- break;
- case Instruction::Switch:
- Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder);
- break;
- case Instruction::Unreachable:
- Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator));
- break;
- case Instruction::IndirectBr:
- Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator));
- break;
- }
-
- return Changed;
-}
-
+ }
+
+ Instruction *Terminator = BB->getTerminator();
+ Builder.SetInsertPoint(Terminator);
+ switch (Terminator->getOpcode()) {
+ case Instruction::Br:
+ Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder);
+ break;
+ case Instruction::Ret:
+ Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder);
+ break;
+ case Instruction::Resume:
+ Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder);
+ break;
+ case Instruction::CleanupRet:
+ Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator));
+ break;
+ case Instruction::Switch:
+ Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder);
+ break;
+ case Instruction::Unreachable:
+ Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator));
+ break;
+ case Instruction::IndirectBr:
+ Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator));
+ break;
+ }
+
+ return Changed;
+}
+
bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
bool Changed = simplifyOnceImpl(BB);
@@ -6574,30 +6574,30 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
return Changed;
}
-bool SimplifyCFGOpt::run(BasicBlock *BB) {
+bool SimplifyCFGOpt::run(BasicBlock *BB) {
assert((!RequireAndPreserveDomTree ||
(DTU &&
DTU->getDomTree().verify(DominatorTree::VerificationLevel::Full))) &&
"Original domtree is invalid?");
- bool Changed = false;
-
-  // Repeatedly simplify BB as long as resimplification is requested.
- do {
- Resimplify = false;
-
-    // Perform one round of simplification. The Resimplify flag will be set if
- // another iteration is requested.
- Changed |= simplifyOnce(BB);
- } while (Resimplify);
-
- return Changed;
-}
-
-bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+ bool Changed = false;
+
+  // Repeatedly simplify BB as long as resimplification is requested.
+ do {
+ Resimplify = false;
+
+    // Perform one round of simplification. The Resimplify flag will be set if
+ // another iteration is requested.
+ Changed |= simplifyOnce(BB);
+ } while (Resimplify);
+
+ return Changed;
+}
+
+bool llvm::simplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
DomTreeUpdater *DTU, const SimplifyCFGOptions &Options,
ArrayRef<WeakVH> LoopHeaders) {
return SimplifyCFGOpt(TTI, RequireAndPreserveDomTree ? DTU : nullptr,
BB->getModule()->getDataLayout(), LoopHeaders, Options)
- .run(BB);
-}
+ .run(BB);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
index d81357a967..290c04a7ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -1,973 +1,973 @@
-//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements induction variable simplification. It does
-// not define any actual pass or policy, but provides a single function to
-// simplify a loop's induction variables based on ScalarEvolution.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "indvars"
-
-STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
-STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
-STATISTIC(NumFoldedUser, "Number of IV users folded into a constant");
-STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
-STATISTIC(
- NumSimplifiedSDiv,
- "Number of IV signed division operations converted to unsigned division");
-STATISTIC(
- NumSimplifiedSRem,
- "Number of IV signed remainder operations converted to unsigned remainder");
-STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
-
-namespace {
- /// This is a utility for simplifying induction variables
- /// based on ScalarEvolution. It is the primary instrument of the
-  /// IndvarSimplify pass, but it may also be directly invoked to clean up after
- /// other loop passes that preserve SCEV.
- class SimplifyIndvar {
- Loop *L;
- LoopInfo *LI;
- ScalarEvolution *SE;
- DominatorTree *DT;
- const TargetTransformInfo *TTI;
- SCEVExpander &Rewriter;
- SmallVectorImpl<WeakTrackingVH> &DeadInsts;
-
- bool Changed;
-
- public:
- SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SCEVExpander &Rewriter,
- SmallVectorImpl<WeakTrackingVH> &Dead)
- : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
- DeadInsts(Dead), Changed(false) {
- assert(LI && "IV simplification requires LoopInfo");
- }
-
- bool hasChanged() const { return Changed; }
-
- /// Iteratively perform simplification on a worklist of users of the
- /// specified induction variable. This is the top-level driver that applies
- /// all simplifications to users of an IV.
- void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
-
- Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
-
- bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
- bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
-
- bool eliminateOverflowIntrinsic(WithOverflowInst *WO);
- bool eliminateSaturatingIntrinsic(SaturatingInst *SI);
- bool eliminateTrunc(TruncInst *TI);
- bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
- bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
- void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
- void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
- bool IsSigned);
- void replaceRemWithNumerator(BinaryOperator *Rem);
- void replaceRemWithNumeratorOrZero(BinaryOperator *Rem);
- void replaceSRemWithURem(BinaryOperator *Rem);
- bool eliminateSDiv(BinaryOperator *SDiv);
- bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
- bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
- };
-}
-
-/// Fold an IV operand into its use. This removes increments of an
-/// aligned IV when used by an instruction that ignores the low bits.
-///
-/// IVOperand is guaranteed SCEVable, but UseInst may not be.
-///
-/// Return the operand of IVOperand for this induction variable if IVOperand can
-/// be folded (in case more folding opportunities have been exposed).
-/// Otherwise return null.
-Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
- Value *IVSrc = nullptr;
- const unsigned OperIdx = 0;
- const SCEV *FoldedExpr = nullptr;
- bool MustDropExactFlag = false;
- switch (UseInst->getOpcode()) {
- default:
- return nullptr;
- case Instruction::UDiv:
- case Instruction::LShr:
- // We're only interested in the case where we know something about
- // the numerator and have a constant denominator.
- if (IVOperand != UseInst->getOperand(OperIdx) ||
- !isa<ConstantInt>(UseInst->getOperand(1)))
- return nullptr;
-
- // Attempt to fold a binary operator with constant operand.
- // e.g. ((I + 1) >> 2) => I >> 2
- if (!isa<BinaryOperator>(IVOperand)
- || !isa<ConstantInt>(IVOperand->getOperand(1)))
- return nullptr;
-
- IVSrc = IVOperand->getOperand(0);
- // IVSrc must be the (SCEVable) IV, since the other operand is const.
- assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
-
- ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
- if (UseInst->getOpcode() == Instruction::LShr) {
- // Get a constant for the divisor. See createSCEV.
- uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
- if (D->getValue().uge(BitWidth))
- return nullptr;
-
- D = ConstantInt::get(UseInst->getContext(),
- APInt::getOneBitSet(BitWidth, D->getZExtValue()));
- }
- FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
- // We might have 'exact' flag set at this point which will no longer be
- // correct after we make the replacement.
- if (UseInst->isExact() &&
- SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
- MustDropExactFlag = true;
- }
- // We have something that might fold its operand. Compare SCEVs.
- if (!SE->isSCEVable(UseInst->getType()))
- return nullptr;
-
- // Bypass the operand if SCEV can prove it has no effect.
- if (SE->getSCEV(UseInst) != FoldedExpr)
- return nullptr;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
- << " -> " << *UseInst << '\n');
-
- UseInst->setOperand(OperIdx, IVSrc);
- assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
-
- if (MustDropExactFlag)
- UseInst->dropPoisonGeneratingFlags();
-
- ++NumElimOperand;
- Changed = true;
- if (IVOperand->use_empty())
- DeadInsts.emplace_back(IVOperand);
- return IVSrc;
-}
-
-bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
- Value *IVOperand) {
- unsigned IVOperIdx = 0;
- ICmpInst::Predicate Pred = ICmp->getPredicate();
- if (IVOperand != ICmp->getOperand(0)) {
- // Swapped
- assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
- IVOperIdx = 1;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- // Get the SCEVs for the ICmp operands (in the specific context of the
- // current loop)
- const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
- const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
- const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
-
- auto *PN = dyn_cast<PHINode>(IVOperand);
- if (!PN)
- return false;
+//===-- SimplifyIndVar.cpp - Induction variable simplification ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements induction variable simplification. It does
+// not define any actual pass or policy, but provides a single function to
+// simplify a loop's induction variables based on ScalarEvolution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "indvars"
+
+STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
+STATISTIC(NumElimOperand, "Number of IV operands folded into a use");
+STATISTIC(NumFoldedUser, "Number of IV users folded into a constant");
+STATISTIC(NumElimRem , "Number of IV remainder operations eliminated");
+STATISTIC(
+ NumSimplifiedSDiv,
+ "Number of IV signed division operations converted to unsigned division");
+STATISTIC(
+ NumSimplifiedSRem,
+ "Number of IV signed remainder operations converted to unsigned remainder");
+STATISTIC(NumElimCmp , "Number of IV comparisons eliminated");
+
+namespace {
+ /// This is a utility for simplifying induction variables
+ /// based on ScalarEvolution. It is the primary instrument of the
+ /// IndvarSimplify pass, but it may also be directly invoked to clean up after
+ /// other loop passes that preserve SCEV.
+ class SimplifyIndvar {
+ Loop *L;
+ LoopInfo *LI;
+ ScalarEvolution *SE;
+ DominatorTree *DT;
+ const TargetTransformInfo *TTI;
+ SCEVExpander &Rewriter;
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts;
+
+ bool Changed;
+
+ public:
+ SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter,
+ SmallVectorImpl<WeakTrackingVH> &Dead)
+ : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
+ DeadInsts(Dead), Changed(false) {
+ assert(LI && "IV simplification requires LoopInfo");
+ }
+
+ bool hasChanged() const { return Changed; }
+
+ /// Iteratively perform simplification on a worklist of users of the
+ /// specified induction variable. This is the top-level driver that applies
+ /// all simplifications to users of an IV.
+ void simplifyUsers(PHINode *CurrIV, IVVisitor *V = nullptr);
+
+ Value *foldIVUser(Instruction *UseInst, Instruction *IVOperand);
+
+ bool eliminateIdentitySCEV(Instruction *UseInst, Instruction *IVOperand);
+ bool replaceIVUserWithLoopInvariant(Instruction *UseInst);
+
+ bool eliminateOverflowIntrinsic(WithOverflowInst *WO);
+ bool eliminateSaturatingIntrinsic(SaturatingInst *SI);
+ bool eliminateTrunc(TruncInst *TI);
+ bool eliminateIVUser(Instruction *UseInst, Instruction *IVOperand);
+ bool makeIVComparisonInvariant(ICmpInst *ICmp, Value *IVOperand);
+ void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
+ void simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned);
+ void replaceRemWithNumerator(BinaryOperator *Rem);
+ void replaceRemWithNumeratorOrZero(BinaryOperator *Rem);
+ void replaceSRemWithURem(BinaryOperator *Rem);
+ bool eliminateSDiv(BinaryOperator *SDiv);
+ bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
+ bool strengthenRightShift(BinaryOperator *BO, Value *IVOperand);
+ };
+}
+
+/// Fold an IV operand into its use. This removes increments of an
+/// aligned IV when used by an instruction that ignores the low bits.
+///
+/// IVOperand is guaranteed SCEVable, but UseInst may not be.
+///
+/// Return the operand of IVOperand for this induction variable if IVOperand can
+/// be folded (in case more folding opportunities have been exposed).
+/// Otherwise return null.
+Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) {
+ Value *IVSrc = nullptr;
+ const unsigned OperIdx = 0;
+ const SCEV *FoldedExpr = nullptr;
+ bool MustDropExactFlag = false;
+ switch (UseInst->getOpcode()) {
+ default:
+ return nullptr;
+ case Instruction::UDiv:
+ case Instruction::LShr:
+ // We're only interested in the case where we know something about
+ // the numerator and have a constant denominator.
+ if (IVOperand != UseInst->getOperand(OperIdx) ||
+ !isa<ConstantInt>(UseInst->getOperand(1)))
+ return nullptr;
+
+ // Attempt to fold a binary operator with constant operand.
+ // e.g. ((I + 1) >> 2) => I >> 2
+ if (!isa<BinaryOperator>(IVOperand)
+ || !isa<ConstantInt>(IVOperand->getOperand(1)))
+ return nullptr;
+
+ IVSrc = IVOperand->getOperand(0);
+ // IVSrc must be the (SCEVable) IV, since the other operand is const.
+ assert(SE->isSCEVable(IVSrc->getType()) && "Expect SCEVable IV operand");
+
+ ConstantInt *D = cast<ConstantInt>(UseInst->getOperand(1));
+ if (UseInst->getOpcode() == Instruction::LShr) {
+ // Get a constant for the divisor. See createSCEV.
+ uint32_t BitWidth = cast<IntegerType>(UseInst->getType())->getBitWidth();
+ if (D->getValue().uge(BitWidth))
+ return nullptr;
+
+ D = ConstantInt::get(UseInst->getContext(),
+ APInt::getOneBitSet(BitWidth, D->getZExtValue()));
+ }
+ FoldedExpr = SE->getUDivExpr(SE->getSCEV(IVSrc), SE->getSCEV(D));
+ // We might have 'exact' flag set at this point which will no longer be
+ // correct after we make the replacement.
+ if (UseInst->isExact() &&
+ SE->getSCEV(IVSrc) != SE->getMulExpr(FoldedExpr, SE->getSCEV(D)))
+ MustDropExactFlag = true;
+ }
+ // We have something that might fold its operand. Compare SCEVs.
+ if (!SE->isSCEVable(UseInst->getType()))
+ return nullptr;
+
+ // Bypass the operand if SCEV can prove it has no effect.
+ if (SE->getSCEV(UseInst) != FoldedExpr)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated IV operand: " << *IVOperand
+ << " -> " << *UseInst << '\n');
+
+ UseInst->setOperand(OperIdx, IVSrc);
+ assert(SE->getSCEV(UseInst) == FoldedExpr && "bad SCEV with folded oper");
+
+ if (MustDropExactFlag)
+ UseInst->dropPoisonGeneratingFlags();
+
+ ++NumElimOperand;
+ Changed = true;
+ if (IVOperand->use_empty())
+ DeadInsts.emplace_back(IVOperand);
+ return IVSrc;
+}
+
+bool SimplifyIndvar::makeIVComparisonInvariant(ICmpInst *ICmp,
+ Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands (in the specific context of the
+ // current loop)
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
+ const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
+
+ auto *PN = dyn_cast<PHINode>(IVOperand);
+ if (!PN)
+ return false;
auto LIP = SE->getLoopInvariantPredicate(Pred, S, X, L);
if (!LIP)
- return false;
+ return false;
ICmpInst::Predicate InvariantPredicate = LIP->Pred;
const SCEV *InvariantLHS = LIP->LHS;
const SCEV *InvariantRHS = LIP->RHS;
-
- // Rewrite the comparison to a loop invariant comparison if it can be done
- // cheaply, where cheaply means "we don't need to emit any new
- // instructions".
-
- SmallDenseMap<const SCEV*, Value*> CheapExpansions;
- CheapExpansions[S] = ICmp->getOperand(IVOperIdx);
- CheapExpansions[X] = ICmp->getOperand(1 - IVOperIdx);
-
- // TODO: Support multiple entry loops? (We currently bail out of these in
- // the IndVarSimplify pass)
- if (auto *BB = L->getLoopPredecessor()) {
- const int Idx = PN->getBasicBlockIndex(BB);
- if (Idx >= 0) {
- Value *Incoming = PN->getIncomingValue(Idx);
- const SCEV *IncomingS = SE->getSCEV(Incoming);
- CheapExpansions[IncomingS] = Incoming;
- }
- }
- Value *NewLHS = CheapExpansions[InvariantLHS];
- Value *NewRHS = CheapExpansions[InvariantRHS];
-
- if (!NewLHS)
- if (auto *ConstLHS = dyn_cast<SCEVConstant>(InvariantLHS))
- NewLHS = ConstLHS->getValue();
- if (!NewRHS)
- if (auto *ConstRHS = dyn_cast<SCEVConstant>(InvariantRHS))
- NewRHS = ConstRHS->getValue();
-
- if (!NewLHS || !NewRHS)
- // We could not find an existing value to replace either LHS or RHS.
- // Generating new instructions has subtler tradeoffs, so avoid doing that
- // for now.
- return false;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
- ICmp->setPredicate(InvariantPredicate);
- ICmp->setOperand(0, NewLHS);
- ICmp->setOperand(1, NewRHS);
- return true;
-}
-
-/// SimplifyIVUsers helper for eliminating useless
-/// comparisons against an induction variable.
-void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
- unsigned IVOperIdx = 0;
- ICmpInst::Predicate Pred = ICmp->getPredicate();
- ICmpInst::Predicate OriginalPred = Pred;
- if (IVOperand != ICmp->getOperand(0)) {
- // Swapped
- assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
- IVOperIdx = 1;
- Pred = ICmpInst::getSwappedPredicate(Pred);
- }
-
- // Get the SCEVs for the ICmp operands (in the specific context of the
- // current loop)
- const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
- const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
- const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
-
- // If the condition is always true or always false, replace it with
- // a constant value.
- if (SE->isKnownPredicate(Pred, S, X)) {
- ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
- DeadInsts.emplace_back(ICmp);
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
- } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
- ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
- DeadInsts.emplace_back(ICmp);
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
- } else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
- // fallthrough to end of function
- } else if (ICmpInst::isSigned(OriginalPred) &&
- SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
- // If we were unable to do anything above, all we can do is canonicalize
- // the comparison, hoping that this will open the door for other
- // optimizations. If we find that we are comparing two non-negative values,
- // we change the instruction's predicate to its unsigned version. Note that
- // we cannot rely on Pred here unless we check whether we have swapped it.
- assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
- LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
- << '\n');
- ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
- } else
- return;
-
- ++NumElimCmp;
- Changed = true;
-}
-
-bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
- // Get the SCEVs for the SDiv operands.
- auto *N = SE->getSCEV(SDiv->getOperand(0));
- auto *D = SE->getSCEV(SDiv->getOperand(1));
-
- // Simplify unnecessary loops away.
- const Loop *L = LI->getLoopFor(SDiv->getParent());
- N = SE->getSCEVAtScope(N, L);
- D = SE->getSCEVAtScope(D, L);
-
- // Replace sdiv by udiv if both of the operands are non-negative
- if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
- auto *UDiv = BinaryOperator::Create(
- BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
- SDiv->getName() + ".udiv", SDiv);
- UDiv->setIsExact(SDiv->isExact());
- SDiv->replaceAllUsesWith(UDiv);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
- ++NumSimplifiedSDiv;
- Changed = true;
- DeadInsts.push_back(SDiv);
- return true;
- }
-
- return false;
-}
-
-// i %s n -> i %u n if i >= 0 and n >= 0
-void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
- auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
- auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
- Rem->getName() + ".urem", Rem);
- Rem->replaceAllUsesWith(URem);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
- ++NumSimplifiedSRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-// i % n --> i if i is in [0,n).
-void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
- Rem->replaceAllUsesWith(Rem->getOperand(0));
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
- ++NumElimRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-// (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
-void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
- auto *T = Rem->getType();
- auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
- ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
- SelectInst *Sel =
- SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
- Rem->replaceAllUsesWith(Sel);
- LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
- ++NumElimRem;
- Changed = true;
- DeadInsts.emplace_back(Rem);
-}
-
-/// SimplifyIVUsers helper for eliminating useless remainder operations
-/// operating on an induction variable or replacing srem by urem.
-void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
- bool IsSigned) {
- auto *NValue = Rem->getOperand(0);
- auto *DValue = Rem->getOperand(1);
- // We're only interested in the case where we know something about
- // the numerator, unless it is a srem, because we want to replace srem by urem
- // in general.
- bool UsedAsNumerator = IVOperand == NValue;
- if (!UsedAsNumerator && !IsSigned)
- return;
-
- const SCEV *N = SE->getSCEV(NValue);
-
- // Simplify unnecessary loops away.
- const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
- N = SE->getSCEVAtScope(N, ICmpLoop);
-
- bool IsNumeratorNonNegative = !IsSigned || SE->isKnownNonNegative(N);
-
- // Do not proceed if the Numerator may be negative
- if (!IsNumeratorNonNegative)
- return;
-
- const SCEV *D = SE->getSCEV(DValue);
- D = SE->getSCEVAtScope(D, ICmpLoop);
-
- if (UsedAsNumerator) {
- auto LT = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
- if (SE->isKnownPredicate(LT, N, D)) {
- replaceRemWithNumerator(Rem);
- return;
- }
-
- auto *T = Rem->getType();
- const auto *NLessOne = SE->getMinusSCEV(N, SE->getOne(T));
- if (SE->isKnownPredicate(LT, NLessOne, D)) {
- replaceRemWithNumeratorOrZero(Rem);
- return;
- }
- }
-
- // Try to replace SRem with URem, if both N and D are known non-negative.
- // Since we had already check N, we only need to check D now
- // Since we have already checked N, we only need to check D now.
- return;
-
- replaceSRemWithURem(Rem);
-}
-
-static bool willNotOverflow(ScalarEvolution *SE, Instruction::BinaryOps BinOp,
- bool Signed, const SCEV *LHS, const SCEV *RHS) {
- const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *,
- SCEV::NoWrapFlags, unsigned);
- switch (BinOp) {
- default:
- llvm_unreachable("Unsupported binary op");
- case Instruction::Add:
- Operation = &ScalarEvolution::getAddExpr;
- break;
- case Instruction::Sub:
- Operation = &ScalarEvolution::getMinusSCEV;
- break;
- case Instruction::Mul:
- Operation = &ScalarEvolution::getMulExpr;
- break;
- }
-
- const SCEV *(ScalarEvolution::*Extension)(const SCEV *, Type *, unsigned) =
- Signed ? &ScalarEvolution::getSignExtendExpr
- : &ScalarEvolution::getZeroExtendExpr;
-
- // Check ext(LHS op RHS) == ext(LHS) op ext(RHS)
- auto *NarrowTy = cast<IntegerType>(LHS->getType());
- auto *WideTy =
- IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
-
- const SCEV *A =
- (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
- WideTy, 0);
- const SCEV *B =
- (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
- (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
- return A == B;
-}
-
-bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
- const SCEV *LHS = SE->getSCEV(WO->getLHS());
- const SCEV *RHS = SE->getSCEV(WO->getRHS());
- if (!willNotOverflow(SE, WO->getBinaryOp(), WO->isSigned(), LHS, RHS))
- return false;
-
- // Proved no overflow, nuke the overflow check and, if possible, the overflow
- // intrinsic as well.
-
- BinaryOperator *NewResult = BinaryOperator::Create(
- WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
-
- if (WO->isSigned())
- NewResult->setHasNoSignedWrap(true);
- else
- NewResult->setHasNoUnsignedWrap(true);
-
- SmallVector<ExtractValueInst *, 4> ToDelete;
-
- for (auto *U : WO->users()) {
- if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
- if (EVI->getIndices()[0] == 1)
- EVI->replaceAllUsesWith(ConstantInt::getFalse(WO->getContext()));
- else {
- assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
- EVI->replaceAllUsesWith(NewResult);
- }
- ToDelete.push_back(EVI);
- }
- }
-
- for (auto *EVI : ToDelete)
- EVI->eraseFromParent();
-
- if (WO->use_empty())
- WO->eraseFromParent();
-
+
+ // Rewrite the comparison to a loop invariant comparison if it can be done
+ // cheaply, where cheaply means "we don't need to emit any new
+ // instructions".
+
+ SmallDenseMap<const SCEV*, Value*> CheapExpansions;
+ CheapExpansions[S] = ICmp->getOperand(IVOperIdx);
+ CheapExpansions[X] = ICmp->getOperand(1 - IVOperIdx);
+
+ // TODO: Support multiple entry loops? (We currently bail out of these in
+ // the IndVarSimplify pass)
+ if (auto *BB = L->getLoopPredecessor()) {
+ const int Idx = PN->getBasicBlockIndex(BB);
+ if (Idx >= 0) {
+ Value *Incoming = PN->getIncomingValue(Idx);
+ const SCEV *IncomingS = SE->getSCEV(Incoming);
+ CheapExpansions[IncomingS] = Incoming;
+ }
+ }
+ Value *NewLHS = CheapExpansions[InvariantLHS];
+ Value *NewRHS = CheapExpansions[InvariantRHS];
+
+ if (!NewLHS)
+ if (auto *ConstLHS = dyn_cast<SCEVConstant>(InvariantLHS))
+ NewLHS = ConstLHS->getValue();
+ if (!NewRHS)
+ if (auto *ConstRHS = dyn_cast<SCEVConstant>(InvariantRHS))
+ NewRHS = ConstRHS->getValue();
+
+ if (!NewLHS || !NewRHS)
+ // We could not find an existing value to replace either LHS or RHS.
+ // Generating new instructions has subtler tradeoffs, so avoid doing that
+ // for now.
+ return false;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified comparison: " << *ICmp << '\n');
+ ICmp->setPredicate(InvariantPredicate);
+ ICmp->setOperand(0, NewLHS);
+ ICmp->setOperand(1, NewRHS);
+ return true;
+}
+
+/// SimplifyIVUsers helper for eliminating useless
+/// comparisons against an induction variable.
+void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
+ unsigned IVOperIdx = 0;
+ ICmpInst::Predicate Pred = ICmp->getPredicate();
+ ICmpInst::Predicate OriginalPred = Pred;
+ if (IVOperand != ICmp->getOperand(0)) {
+ // Swapped
+ assert(IVOperand == ICmp->getOperand(1) && "Can't find IVOperand");
+ IVOperIdx = 1;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ // Get the SCEVs for the ICmp operands (in the specific context of the
+ // current loop)
+ const Loop *ICmpLoop = LI->getLoopFor(ICmp->getParent());
+ const SCEV *S = SE->getSCEVAtScope(ICmp->getOperand(IVOperIdx), ICmpLoop);
+ const SCEV *X = SE->getSCEVAtScope(ICmp->getOperand(1 - IVOperIdx), ICmpLoop);
+
+ // If the condition is always true or always false, replace it with
+ // a constant value.
+ if (SE->isKnownPredicate(Pred, S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getTrue(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (SE->isKnownPredicate(ICmpInst::getInversePredicate(Pred), S, X)) {
+ ICmp->replaceAllUsesWith(ConstantInt::getFalse(ICmp->getContext()));
+ DeadInsts.emplace_back(ICmp);
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated comparison: " << *ICmp << '\n');
+ } else if (makeIVComparisonInvariant(ICmp, IVOperand)) {
+ // fallthrough to end of function
+ } else if (ICmpInst::isSigned(OriginalPred) &&
+ SE->isKnownNonNegative(S) && SE->isKnownNonNegative(X)) {
+ // If we were unable to do anything above, all we can do is canonicalize
+ // the comparison, hoping that this will open the door for other
+ // optimizations. If we find that we are comparing two non-negative values,
+ // we change the instruction's predicate to its unsigned version. Note that
+ // we cannot rely on Pred here unless we check whether we have swapped it.
+ assert(ICmp->getPredicate() == OriginalPred && "Predicate changed?");
+ LLVM_DEBUG(dbgs() << "INDVARS: Turn to unsigned comparison: " << *ICmp
+ << '\n');
+ ICmp->setPredicate(ICmpInst::getUnsignedPredicate(OriginalPred));
+ } else
+ return;
+
+ ++NumElimCmp;
+ Changed = true;
+}
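+
+// Illustrative sketch (hypothetical IR, not from an actual test): if SCEV
+// already knows the IV's range, a comparison such as
+//   %c = icmp ult i32 %iv, 100   ; SCEV proves %iv is always in [0, 10)
+// folds to the constant 'true' and the icmp is queued for deletion.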
+
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+ // Get the SCEVs for the SDiv operands.
+ auto *N = SE->getSCEV(SDiv->getOperand(0));
+ auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+ // Simplify unnecessary loops away.
+ const Loop *L = LI->getLoopFor(SDiv->getParent());
+ N = SE->getSCEVAtScope(N, L);
+ D = SE->getSCEVAtScope(D, L);
+
+ // Replace sdiv by udiv if both of the operands are non-negative
+ if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+ auto *UDiv = BinaryOperator::Create(
+ BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+ SDiv->getName() + ".udiv", SDiv);
+ UDiv->setIsExact(SDiv->isExact());
+ SDiv->replaceAllUsesWith(UDiv);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+ ++NumSimplifiedSDiv;
+ Changed = true;
+ DeadInsts.push_back(SDiv);
+ return true;
+ }
+
+ return false;
+}
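+
+// Illustrative sketch (hypothetical IR): with both operands proven
+// non-negative,
+//   %q = sdiv i32 %iv, %n
+// is rewritten to
+//   %q.udiv = udiv i32 %iv, %n   ; the 'exact' flag is carried over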
+
+// i %s n -> i %u n if i >= 0 and n >= 0
+void SimplifyIndvar::replaceSRemWithURem(BinaryOperator *Rem) {
+ auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
+ auto *URem = BinaryOperator::Create(BinaryOperator::URem, N, D,
+ Rem->getName() + ".urem", Rem);
+ Rem->replaceAllUsesWith(URem);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified srem: " << *Rem << '\n');
+ ++NumSimplifiedSRem;
+ Changed = true;
+ DeadInsts.emplace_back(Rem);
+}
+
+// i % n --> i if i is in [0,n).
+void SimplifyIndvar::replaceRemWithNumerator(BinaryOperator *Rem) {
+ Rem->replaceAllUsesWith(Rem->getOperand(0));
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
+ Changed = true;
+ DeadInsts.emplace_back(Rem);
+}
+
+// (i+1) % n --> (i+1)==n?0:(i+1) if i is in [0,n).
+void SimplifyIndvar::replaceRemWithNumeratorOrZero(BinaryOperator *Rem) {
+ auto *T = Rem->getType();
+ auto *N = Rem->getOperand(0), *D = Rem->getOperand(1);
+ ICmpInst *ICmp = new ICmpInst(Rem, ICmpInst::ICMP_EQ, N, D);
+ SelectInst *Sel =
+ SelectInst::Create(ICmp, ConstantInt::get(T, 0), N, "iv.rem", Rem);
+ Rem->replaceAllUsesWith(Sel);
+ LLVM_DEBUG(dbgs() << "INDVARS: Simplified rem: " << *Rem << '\n');
+ ++NumElimRem;
Changed = true;
- return true;
-}
-
-bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
- const SCEV *LHS = SE->getSCEV(SI->getLHS());
- const SCEV *RHS = SE->getSCEV(SI->getRHS());
- if (!willNotOverflow(SE, SI->getBinaryOp(), SI->isSigned(), LHS, RHS))
- return false;
-
- BinaryOperator *BO = BinaryOperator::Create(
- SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
- if (SI->isSigned())
- BO->setHasNoSignedWrap();
- else
- BO->setHasNoUnsignedWrap();
-
- SI->replaceAllUsesWith(BO);
- DeadInsts.emplace_back(SI);
- Changed = true;
- return true;
-}
-
-bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
- // It is always legal to replace
- // icmp <pred> i32 trunc(iv), n
- // with
- // icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate.
- // Or with
- // icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate.
- // Or with either of these if pred is an equality predicate.
- //
- // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
- // every comparison which uses trunc, it means that we can replace each of
- // them with comparison of iv against sext/zext(n). We no longer need trunc
- // after that.
- //
- // TODO: Should we do this if we can widen *some* comparisons, but not all
- // of them? Sometimes it is enough to enable other optimizations, but the
- // trunc instruction will stay in the loop.
- Value *IV = TI->getOperand(0);
- Type *IVTy = IV->getType();
- const SCEV *IVSCEV = SE->getSCEV(IV);
- const SCEV *TISCEV = SE->getSCEV(TI);
-
- // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we can
- // get rid of trunc
- bool DoesSExtCollapse = false;
- bool DoesZExtCollapse = false;
- if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
- DoesSExtCollapse = true;
- if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
- DoesZExtCollapse = true;
-
- // If neither the sext nor the zext collapses, it is not profitable to do any
- // transform. Bail.
- if (!DoesSExtCollapse && !DoesZExtCollapse)
- return false;
-
- // Collect users of the trunc that look like comparisons against invariants.
- // Bail if we find something different.
- SmallVector<ICmpInst *, 4> ICmpUsers;
- for (auto *U : TI->users()) {
- // We don't care about users in unreachable blocks.
- if (isa<Instruction>(U) &&
- !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
- continue;
- ICmpInst *ICI = dyn_cast<ICmpInst>(U);
- if (!ICI) return false;
- assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
- if (!(ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) &&
- !(ICI->getOperand(1) == TI && L->isLoopInvariant(ICI->getOperand(0))))
- return false;
- // If we cannot get rid of trunc, bail.
- if (ICI->isSigned() && !DoesSExtCollapse)
- return false;
- if (ICI->isUnsigned() && !DoesZExtCollapse)
- return false;
- // For equality, either signed or unsigned works.
- ICmpUsers.push_back(ICI);
- }
-
- auto CanUseZExt = [&](ICmpInst *ICI) {
- // Unsigned comparison can be widened as unsigned.
- if (ICI->isUnsigned())
- return true;
- // Is it profitable to do zext?
- if (!DoesZExtCollapse)
- return false;
- // For equality, we can safely zext both parts.
- if (ICI->isEquality())
- return true;
- // Otherwise we can only use zext when comparing two non-negative or two
- // negative values. But in practice, we will never pass the DoesZExtCollapse
- // check for a negative value, because zext(trunc(x)) is non-negative. So
- // it only makes sense to check for non-negativity here.
- const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
- const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
- return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
- };
- // Replace all comparisons against trunc with comparisons against IV.
- for (auto *ICI : ICmpUsers) {
- bool IsSwapped = L->isLoopInvariant(ICI->getOperand(0));
- auto *Op1 = IsSwapped ? ICI->getOperand(0) : ICI->getOperand(1);
- Instruction *Ext = nullptr;
- // For signed/unsigned predicate, replace the old comparison with comparison
- // of immediate IV against sext/zext of the invariant argument. If we can
- // use either sext or zext (i.e. we are dealing with equality predicate),
- // then prefer zext as a more canonical form.
- // TODO: If we see a signed comparison which can be turned into unsigned,
- // we can do it here for canonicalization purposes.
- ICmpInst::Predicate Pred = ICI->getPredicate();
- if (IsSwapped) Pred = ICmpInst::getSwappedPredicate(Pred);
- if (CanUseZExt(ICI)) {
- assert(DoesZExtCollapse && "Unprofitable zext?");
- Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
- Pred = ICmpInst::getUnsignedPredicate(Pred);
- } else {
- assert(DoesSExtCollapse && "Unprofitable sext?");
- Ext = new SExtInst(Op1, IVTy, "sext", ICI);
- assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
- }
- bool Changed;
- L->makeLoopInvariant(Ext, Changed);
- (void)Changed;
- ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
- ICI->replaceAllUsesWith(NewICI);
- DeadInsts.emplace_back(ICI);
- }
-
- // Trunc no longer needed.
- TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
- DeadInsts.emplace_back(TI);
- return true;
-}
-
-/// Eliminate an operation that consumes a simple IV and has no observable
-/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
-/// but UseInst may not be.
-bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
- Instruction *IVOperand) {
- if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
- eliminateIVComparison(ICmp, IVOperand);
- return true;
- }
- if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
- bool IsSRem = Bin->getOpcode() == Instruction::SRem;
- if (IsSRem || Bin->getOpcode() == Instruction::URem) {
- simplifyIVRemainder(Bin, IVOperand, IsSRem);
- return true;
- }
-
- if (Bin->getOpcode() == Instruction::SDiv)
- return eliminateSDiv(Bin);
- }
-
- if (auto *WO = dyn_cast<WithOverflowInst>(UseInst))
- if (eliminateOverflowIntrinsic(WO))
- return true;
-
- if (auto *SI = dyn_cast<SaturatingInst>(UseInst))
- if (eliminateSaturatingIntrinsic(SI))
- return true;
-
- if (auto *TI = dyn_cast<TruncInst>(UseInst))
- if (eliminateTrunc(TI))
- return true;
-
- if (eliminateIdentitySCEV(UseInst, IVOperand))
- return true;
-
- return false;
-}
-
-static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) {
- if (auto *BB = L->getLoopPreheader())
- return BB->getTerminator();
-
- return Hint;
-}
-
-/// Replace the UseInst with a loop invariant expression if it is safe.
-bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Get the symbolic expression for this instruction.
- const SCEV *S = SE->getSCEV(I);
-
- if (!SE->isLoopInvariant(S, L))
- return false;
-
- // Do not generate something ridiculous even if S is loop invariant.
- if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I))
- return false;
-
- auto *IP = GetLoopInvariantInsertPosition(L, I);
-
- if (!isSafeToExpandAt(S, IP, *SE)) {
- LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
- << " with non-speculable loop invariant: " << *S << '\n');
- return false;
- }
-
- auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
-
- I->replaceAllUsesWith(Invariant);
- LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
- << " with loop invariant: " << *S << '\n');
- ++NumFoldedUser;
- Changed = true;
- DeadInsts.emplace_back(I);
- return true;
-}
-
-/// Eliminate any operation that SCEV can prove is an identity function.
-bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
- Instruction *IVOperand) {
- if (!SE->isSCEVable(UseInst->getType()) ||
- (UseInst->getType() != IVOperand->getType()) ||
- (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
- return false;
-
- // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the
- // dominator tree, even if X is an operand to Y. For instance, in
- //
- // %iv = phi i32 {0,+,1}
- // br %cond, label %left, label %merge
- //
- // left:
- // %X = add i32 %iv, 0
- // br label %merge
- //
- // merge:
- // %M = phi (%X, %iv)
- //
- // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and
- // %M.replaceAllUsesWith(%X) would be incorrect.
-
- if (isa<PHINode>(UseInst))
- // If UseInst is not a PHI node then we know that IVOperand dominates
- // UseInst directly from the legality of SSA.
- if (!DT || !DT->dominates(IVOperand, UseInst))
- return false;
-
- if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
- return false;
-
- LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
-
- UseInst->replaceAllUsesWith(IVOperand);
- ++NumElimIdentity;
- Changed = true;
- DeadInsts.emplace_back(UseInst);
- return true;
-}
-
-/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
-/// unsigned-overflow. Returns true if anything changed, false otherwise.
-bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
- Value *IVOperand) {
- // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
- if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
- return false;
-
- if (BO->getOpcode() != Instruction::Add &&
- BO->getOpcode() != Instruction::Sub &&
- BO->getOpcode() != Instruction::Mul)
- return false;
-
- const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
- const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
- bool Changed = false;
-
- if (!BO->hasNoUnsignedWrap() &&
- willNotOverflow(SE, BO->getOpcode(), /* Signed */ false, LHS, RHS)) {
- BO->setHasNoUnsignedWrap();
- SE->forgetValue(BO);
- Changed = true;
- }
-
- if (!BO->hasNoSignedWrap() &&
- willNotOverflow(SE, BO->getOpcode(), /* Signed */ true, LHS, RHS)) {
- BO->setHasNoSignedWrap();
- SE->forgetValue(BO);
- Changed = true;
- }
-
- return Changed;
-}
-
-/// Annotate the Shr in (X << IVOperand) >> C as exact using the
-/// information from the IV's range. Returns true if anything changed, false
-/// otherwise.
-bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
- Value *IVOperand) {
- using namespace llvm::PatternMatch;
-
- if (BO->getOpcode() == Instruction::Shl) {
- bool Changed = false;
- ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
- for (auto *U : BO->users()) {
- const APInt *C;
- if (match(U,
- m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
- match(U,
- m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
- BinaryOperator *Shr = cast<BinaryOperator>(U);
- if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
- Shr->setIsExact(true);
- Changed = true;
- }
- }
- }
- return Changed;
- }
-
- return false;
-}
-
-/// Add all uses of Def to the current IV's worklist.
-static void pushIVUsers(
- Instruction *Def, Loop *L,
- SmallPtrSet<Instruction*,16> &Simplified,
- SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
-
- for (User *U : Def->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- // Avoid infinite or exponential worklist processing.
- // Also ensure unique worklist users.
- // If Def is a LoopPhi, it may not be in the Simplified set, so check for
- // self edges first.
- if (UI == Def)
- continue;
-
- // Only change the current Loop, do not change the other parts (e.g. other
- // Loops).
- if (!L->contains(UI))
- continue;
-
- // Do not push the same instruction more than once.
- if (!Simplified.insert(UI).second)
- continue;
-
- SimpleIVUsers.push_back(std::make_pair(UI, Def));
- }
-}
-
-/// Return true if this instruction generates a simple SCEV
-/// expression in terms of that IV.
-///
-/// This is similar to IVUsers' isInteresting() but processes each instruction
-/// non-recursively when the operand is already known to be a simpleIVUser.
-///
-static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
- if (!SE->isSCEVable(I->getType()))
- return false;
-
- // Get the symbolic expression for this instruction.
- const SCEV *S = SE->getSCEV(I);
-
- // Only consider affine recurrences.
- const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
- if (AR && AR->getLoop() == L)
- return true;
-
- return false;
-}
-
-/// Iteratively perform simplification on a worklist of users
-/// of the specified induction variable. Each successive simplification may push
-/// more users which may themselves be candidates for simplification.
-///
-/// This algorithm does not require IVUsers analysis. Instead, it simplifies
-/// instructions in-place during analysis. Rather than rewriting induction
-/// variables bottom-up from their users, it transforms a chain of IVUsers
-/// top-down, updating the IR only when it encounters a clear optimization
-/// opportunity.
-///
-/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
-///
-void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
- if (!SE->isSCEVable(CurrIV->getType()))
- return;
-
- // Instructions processed by SimplifyIndvar for CurrIV.
- SmallPtrSet<Instruction*,16> Simplified;
-
- // Use-def pairs of IV users waiting to be processed for CurrIV.
- SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
-
- // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
- // called multiple times for the same LoopPhi. This is the proper thing to
- // do for loop header phis that use each other.
- pushIVUsers(CurrIV, L, Simplified, SimpleIVUsers);
-
- while (!SimpleIVUsers.empty()) {
- std::pair<Instruction*, Instruction*> UseOper =
- SimpleIVUsers.pop_back_val();
- Instruction *UseInst = UseOper.first;
-
- // If a user of the IndVar is trivially dead, we prefer just to mark it dead
- // rather than try to do some complex analysis or transformation (such as
- // widening) based on it.
- // TODO: Propagate TLI and pass it here to handle more cases.
- if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
- DeadInsts.emplace_back(UseInst);
- continue;
- }
-
- // Bypass back edges to avoid extra work.
- if (UseInst == CurrIV) continue;
-
- // Try to replace UseInst with a loop invariant before any other
- // simplifications.
- if (replaceIVUserWithLoopInvariant(UseInst))
- continue;
-
- Instruction *IVOperand = UseOper.second;
- for (unsigned N = 0; IVOperand; ++N) {
- assert(N <= Simplified.size() && "runaway iteration");
-
- Value *NewOper = foldIVUser(UseInst, IVOperand);
- if (!NewOper)
- break; // done folding
- IVOperand = dyn_cast<Instruction>(NewOper);
- }
- if (!IVOperand)
- continue;
-
- if (eliminateIVUser(UseInst, IVOperand)) {
- pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
- continue;
- }
-
- if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
- if ((isa<OverflowingBinaryOperator>(BO) &&
- strengthenOverflowingOperation(BO, IVOperand)) ||
- (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
- // re-queue uses of the now modified binary operator and fall
- // through to the checks that remain.
- pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
- }
- }
-
- CastInst *Cast = dyn_cast<CastInst>(UseInst);
- if (V && Cast) {
- V->visitCast(Cast);
- continue;
- }
- if (isSimpleIVUser(UseInst, L, SE)) {
- pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
- }
- }
-}
-
-namespace llvm {
-
-void IVVisitor::anchor() { }
-
-/// Simplify instructions that use this induction variable
-/// by using ScalarEvolution to analyze the IV's recurrence.
-bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SmallVectorImpl<WeakTrackingVH> &Dead,
- SCEVExpander &Rewriter, IVVisitor *V) {
- SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
- Rewriter, Dead);
- SIV.simplifyUsers(CurrIV, V);
- return SIV.hasChanged();
-}
-
-/// Simplify users of induction variables within this
-/// loop. This does not actually change or add IVs.
-bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, const TargetTransformInfo *TTI,
- SmallVectorImpl<WeakTrackingVH> &Dead) {
- SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
-#ifndef NDEBUG
- Rewriter.setDebugType(DEBUG_TYPE);
-#endif
- bool Changed = false;
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- Changed |=
- simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
- }
- return Changed;
-}
-
-} // namespace llvm
+ DeadInsts.emplace_back(Rem);
+}
+
+/// SimplifyIVUsers helper for eliminating useless remainder operations
+/// operating on an induction variable or replacing srem by urem.
+void SimplifyIndvar::simplifyIVRemainder(BinaryOperator *Rem, Value *IVOperand,
+ bool IsSigned) {
+ auto *NValue = Rem->getOperand(0);
+ auto *DValue = Rem->getOperand(1);
+ // We're only interested in the case where we know something about
+ // the numerator, unless it is a srem, because we want to replace srem by urem
+ // in general.
+ bool UsedAsNumerator = IVOperand == NValue;
+ if (!UsedAsNumerator && !IsSigned)
+ return;
+
+ const SCEV *N = SE->getSCEV(NValue);
+
+ // Simplify unnecessary loops away.
+ const Loop *ICmpLoop = LI->getLoopFor(Rem->getParent());
+ N = SE->getSCEVAtScope(N, ICmpLoop);
+
+ bool IsNumeratorNonNegative = !IsSigned || SE->isKnownNonNegative(N);
+
+ // Do not proceed if the Numerator may be negative
+ if (!IsNumeratorNonNegative)
+ return;
+
+ const SCEV *D = SE->getSCEV(DValue);
+ D = SE->getSCEVAtScope(D, ICmpLoop);
+
+ if (UsedAsNumerator) {
+ auto LT = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
+ if (SE->isKnownPredicate(LT, N, D)) {
+ replaceRemWithNumerator(Rem);
+ return;
+ }
+
+ auto *T = Rem->getType();
+ const auto *NLessOne = SE->getMinusSCEV(N, SE->getOne(T));
+ if (SE->isKnownPredicate(LT, NLessOne, D)) {
+ replaceRemWithNumeratorOrZero(Rem);
+ return;
+ }
+ }
+
+ // Try to replace SRem with URem, if both N and D are known non-negative.
+ // Since we have already checked N, we only need to check D now.
+ if (!IsSigned || !SE->isKnownNonNegative(D))
+ return;
+
+ replaceSRemWithURem(Rem);
+}
+
+static bool willNotOverflow(ScalarEvolution *SE, Instruction::BinaryOps BinOp,
+ bool Signed, const SCEV *LHS, const SCEV *RHS) {
+ const SCEV *(ScalarEvolution::*Operation)(const SCEV *, const SCEV *,
+ SCEV::NoWrapFlags, unsigned);
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unsupported binary op");
+ case Instruction::Add:
+ Operation = &ScalarEvolution::getAddExpr;
+ break;
+ case Instruction::Sub:
+ Operation = &ScalarEvolution::getMinusSCEV;
+ break;
+ case Instruction::Mul:
+ Operation = &ScalarEvolution::getMulExpr;
+ break;
+ }
+
+ const SCEV *(ScalarEvolution::*Extension)(const SCEV *, Type *, unsigned) =
+ Signed ? &ScalarEvolution::getSignExtendExpr
+ : &ScalarEvolution::getZeroExtendExpr;
+
+ // Check ext(LHS op RHS) == ext(LHS) op ext(RHS)
+ auto *NarrowTy = cast<IntegerType>(LHS->getType());
+ auto *WideTy =
+ IntegerType::get(NarrowTy->getContext(), NarrowTy->getBitWidth() * 2);
+
+ const SCEV *A =
+ (SE->*Extension)((SE->*Operation)(LHS, RHS, SCEV::FlagAnyWrap, 0),
+ WideTy, 0);
+ const SCEV *B =
+ (SE->*Operation)((SE->*Extension)(LHS, WideTy, 0),
+ (SE->*Extension)(RHS, WideTy, 0), SCEV::FlagAnyWrap, 0);
+ return A == B;
+}
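+
+// For example, for an i8 signed add the check above widens to i16 and asks
+// whether sext(a + b) == sext(a) + sext(b) holds as SCEV expressions; if the
+// two widened forms agree, the narrow add cannot overflow.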
+
+bool SimplifyIndvar::eliminateOverflowIntrinsic(WithOverflowInst *WO) {
+ const SCEV *LHS = SE->getSCEV(WO->getLHS());
+ const SCEV *RHS = SE->getSCEV(WO->getRHS());
+ if (!willNotOverflow(SE, WO->getBinaryOp(), WO->isSigned(), LHS, RHS))
+ return false;
+
+ // Proved no overflow, nuke the overflow check and, if possible, the overflow
+ // intrinsic as well.
+
+ BinaryOperator *NewResult = BinaryOperator::Create(
+ WO->getBinaryOp(), WO->getLHS(), WO->getRHS(), "", WO);
+
+ if (WO->isSigned())
+ NewResult->setHasNoSignedWrap(true);
+ else
+ NewResult->setHasNoUnsignedWrap(true);
+
+ SmallVector<ExtractValueInst *, 4> ToDelete;
+
+ for (auto *U : WO->users()) {
+ if (auto *EVI = dyn_cast<ExtractValueInst>(U)) {
+ if (EVI->getIndices()[0] == 1)
+ EVI->replaceAllUsesWith(ConstantInt::getFalse(WO->getContext()));
+ else {
+ assert(EVI->getIndices()[0] == 0 && "Only two possibilities!");
+ EVI->replaceAllUsesWith(NewResult);
+ }
+ ToDelete.push_back(EVI);
+ }
+ }
+
+ for (auto *EVI : ToDelete)
+ EVI->eraseFromParent();
+
+ if (WO->use_empty())
+ WO->eraseFromParent();
+
+ Changed = true;
+ return true;
+}
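+
+// Illustrative sketch (hypothetical IR): when the add provably cannot
+// overflow,
+//   %s   = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %iv, i32 1)
+//   %val = extractvalue { i32, i1 } %s, 0
+//   %ovf = extractvalue { i32, i1 } %s, 1
+// collapses to a plain 'add nsw i32 %iv, 1', and uses of %ovf become 'false'.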
+
+bool SimplifyIndvar::eliminateSaturatingIntrinsic(SaturatingInst *SI) {
+ const SCEV *LHS = SE->getSCEV(SI->getLHS());
+ const SCEV *RHS = SE->getSCEV(SI->getRHS());
+ if (!willNotOverflow(SE, SI->getBinaryOp(), SI->isSigned(), LHS, RHS))
+ return false;
+
+ BinaryOperator *BO = BinaryOperator::Create(
+ SI->getBinaryOp(), SI->getLHS(), SI->getRHS(), SI->getName(), SI);
+ if (SI->isSigned())
+ BO->setHasNoSignedWrap();
+ else
+ BO->setHasNoUnsignedWrap();
+
+ SI->replaceAllUsesWith(BO);
+ DeadInsts.emplace_back(SI);
+ Changed = true;
+ return true;
+}
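+
+// Illustrative sketch (hypothetical IR): when saturation provably never
+// triggers,
+//   %r = call i32 @llvm.uadd.sat.i32(i32 %iv, i32 1)
+// is replaced by an ordinary 'add nuw i32 %iv, 1'.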
+
+bool SimplifyIndvar::eliminateTrunc(TruncInst *TI) {
+ // It is always legal to replace
+ // icmp <pred> i32 trunc(iv), n
+ // with
+ // icmp <pred> i64 sext(trunc(iv)), sext(n), if pred is signed predicate.
+ // Or with
+ // icmp <pred> i64 zext(trunc(iv)), zext(n), if pred is unsigned predicate.
+ // Or with either of these if pred is an equality predicate.
+ //
+ // If we can prove that iv == sext(trunc(iv)) or iv == zext(trunc(iv)) for
+ // every comparison which uses trunc, it means that we can replace each of
+ // them with comparison of iv against sext/zext(n). We no longer need trunc
+ // after that.
+ //
+ // TODO: Should we do this if we can widen *some* comparisons, but not all
+ // of them? Sometimes it is enough to enable other optimizations, but the
+ // trunc instruction will stay in the loop.
+ Value *IV = TI->getOperand(0);
+ Type *IVTy = IV->getType();
+ const SCEV *IVSCEV = SE->getSCEV(IV);
+ const SCEV *TISCEV = SE->getSCEV(TI);
+
+ // Check if iv == zext(trunc(iv)) and if iv == sext(trunc(iv)). If so, we can
+ // get rid of trunc
+ bool DoesSExtCollapse = false;
+ bool DoesZExtCollapse = false;
+ if (IVSCEV == SE->getSignExtendExpr(TISCEV, IVTy))
+ DoesSExtCollapse = true;
+ if (IVSCEV == SE->getZeroExtendExpr(TISCEV, IVTy))
+ DoesZExtCollapse = true;
+
+ // If neither the sext nor the zext collapses, it is not profitable to do any
+ // transform. Bail.
+ if (!DoesSExtCollapse && !DoesZExtCollapse)
+ return false;
+
+ // Collect users of the trunc that look like comparisons against invariants.
+ // Bail if we find something different.
+ SmallVector<ICmpInst *, 4> ICmpUsers;
+ for (auto *U : TI->users()) {
+ // We don't care about users in unreachable blocks.
+ if (isa<Instruction>(U) &&
+ !DT->isReachableFromEntry(cast<Instruction>(U)->getParent()))
+ continue;
+ ICmpInst *ICI = dyn_cast<ICmpInst>(U);
+ if (!ICI) return false;
+ assert(L->contains(ICI->getParent()) && "LCSSA form broken?");
+ if (!(ICI->getOperand(0) == TI && L->isLoopInvariant(ICI->getOperand(1))) &&
+ !(ICI->getOperand(1) == TI && L->isLoopInvariant(ICI->getOperand(0))))
+ return false;
+ // If we cannot get rid of trunc, bail.
+ if (ICI->isSigned() && !DoesSExtCollapse)
+ return false;
+ if (ICI->isUnsigned() && !DoesZExtCollapse)
+ return false;
+ // For equality, either signed or unsigned works.
+ ICmpUsers.push_back(ICI);
+ }
+
+ auto CanUseZExt = [&](ICmpInst *ICI) {
+ // Unsigned comparison can be widened as unsigned.
+ if (ICI->isUnsigned())
+ return true;
+ // Is it profitable to do zext?
+ if (!DoesZExtCollapse)
+ return false;
+ // For equality, we can safely zext both parts.
+ if (ICI->isEquality())
+ return true;
+ // Otherwise we can only use zext when comparing two non-negative or two
+ // negative values. But in practice, we will never pass the DoesZExtCollapse
+ // check for a negative value, because zext(trunc(x)) is non-negative. So
+ // it only makes sense to check for non-negativity here.
+ const SCEV *SCEVOP1 = SE->getSCEV(ICI->getOperand(0));
+ const SCEV *SCEVOP2 = SE->getSCEV(ICI->getOperand(1));
+ return SE->isKnownNonNegative(SCEVOP1) && SE->isKnownNonNegative(SCEVOP2);
+ };
+ // Replace all comparisons against trunc with comparisons against IV.
+ for (auto *ICI : ICmpUsers) {
+ bool IsSwapped = L->isLoopInvariant(ICI->getOperand(0));
+ auto *Op1 = IsSwapped ? ICI->getOperand(0) : ICI->getOperand(1);
+ Instruction *Ext = nullptr;
+ // For signed/unsigned predicate, replace the old comparison with comparison
+ // of immediate IV against sext/zext of the invariant argument. If we can
+ // use either sext or zext (i.e. we are dealing with equality predicate),
+ // then prefer zext as a more canonical form.
+ // TODO: If we see a signed comparison which can be turned into unsigned,
+ // we can do it here for canonicalization purposes.
+ ICmpInst::Predicate Pred = ICI->getPredicate();
+ if (IsSwapped) Pred = ICmpInst::getSwappedPredicate(Pred);
+ if (CanUseZExt(ICI)) {
+ assert(DoesZExtCollapse && "Unprofitable zext?");
+ Ext = new ZExtInst(Op1, IVTy, "zext", ICI);
+ Pred = ICmpInst::getUnsignedPredicate(Pred);
+ } else {
+ assert(DoesSExtCollapse && "Unprofitable sext?");
+ Ext = new SExtInst(Op1, IVTy, "sext", ICI);
+ assert(Pred == ICmpInst::getSignedPredicate(Pred) && "Must be signed!");
+ }
+ bool Changed;
+ L->makeLoopInvariant(Ext, Changed);
+ (void)Changed;
+ ICmpInst *NewICI = new ICmpInst(ICI, Pred, IV, Ext);
+ ICI->replaceAllUsesWith(NewICI);
+ DeadInsts.emplace_back(ICI);
+ }
+
+ // Trunc no longer needed.
+ TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+ DeadInsts.emplace_back(TI);
+ return true;
+}
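+
+// Illustrative sketch (hypothetical IR, %n loop-invariant): if SCEV proves
+// %iv == sext(trunc(%iv)), then
+//   %t = trunc i64 %iv to i32
+//   %c = icmp slt i32 %t, %n
+// becomes
+//   %n.ext = sext i32 %n to i64   ; hoisted out of the loop when possible
+//   %c     = icmp slt i64 %iv, %n.ext
+// and the trunc itself goes dead.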
+
+/// Eliminate an operation that consumes a simple IV and has no observable
+/// side-effect given the range of IV values. IVOperand is guaranteed SCEVable,
+/// but UseInst may not be.
+bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (ICmpInst *ICmp = dyn_cast<ICmpInst>(UseInst)) {
+ eliminateIVComparison(ICmp, IVOperand);
+ return true;
+ }
+ if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+ bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+ if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+ simplifyIVRemainder(Bin, IVOperand, IsSRem);
+ return true;
+ }
+
+ if (Bin->getOpcode() == Instruction::SDiv)
+ return eliminateSDiv(Bin);
+ }
+
+ if (auto *WO = dyn_cast<WithOverflowInst>(UseInst))
+ if (eliminateOverflowIntrinsic(WO))
+ return true;
+
+ if (auto *SI = dyn_cast<SaturatingInst>(UseInst))
+ if (eliminateSaturatingIntrinsic(SI))
+ return true;
+
+ if (auto *TI = dyn_cast<TruncInst>(UseInst))
+ if (eliminateTrunc(TI))
+ return true;
+
+ if (eliminateIdentitySCEV(UseInst, IVOperand))
+ return true;
+
+ return false;
+}
+
+static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) {
+ if (auto *BB = L->getLoopPreheader())
+ return BB->getTerminator();
+
+ return Hint;
+}
+
+/// Replace the UseInst with a loop invariant expression if it is safe.
+bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ if (!SE->isLoopInvariant(S, L))
+ return false;
+
+ // Do not generate something ridiculous even if S is loop invariant.
+ if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I))
+ return false;
+
+ auto *IP = GetLoopInvariantInsertPosition(L, I);
+
+ if (!isSafeToExpandAt(S, IP, *SE)) {
+ LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
+ << " with non-speculable loop invariant: " << *S << '\n');
+ return false;
+ }
+
+ auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
+
+ I->replaceAllUsesWith(Invariant);
+ LLVM_DEBUG(dbgs() << "INDVARS: Replace IV user: " << *I
+ << " with loop invariant: " << *S << '\n');
+ ++NumFoldedUser;
+ Changed = true;
+ DeadInsts.emplace_back(I);
+ return true;
+}
+
+/// Eliminate any operation that SCEV can prove is an identity function.
+bool SimplifyIndvar::eliminateIdentitySCEV(Instruction *UseInst,
+ Instruction *IVOperand) {
+ if (!SE->isSCEVable(UseInst->getType()) ||
+ (UseInst->getType() != IVOperand->getType()) ||
+ (SE->getSCEV(UseInst) != SE->getSCEV(IVOperand)))
+ return false;
+
+ // getSCEV(X) == getSCEV(Y) does not guarantee that X and Y are related in the
+ // dominator tree, even if X is an operand to Y. For instance, in
+ //
+ // %iv = phi i32 {0,+,1}
+ // br %cond, label %left, label %merge
+ //
+ // left:
+ // %X = add i32 %iv, 0
+ // br label %merge
+ //
+ // merge:
+ // %M = phi (%X, %iv)
+ //
+ // getSCEV(%M) == getSCEV(%X) == {0,+,1}, but %X does not dominate %M, and
+ // %M.replaceAllUsesWith(%X) would be incorrect.
+
+ if (isa<PHINode>(UseInst))
+ // If UseInst is not a PHI node then we know that IVOperand dominates
+ // UseInst directly from the legality of SSA.
+ if (!DT || !DT->dominates(IVOperand, UseInst))
+ return false;
+
+ if (!LI->replacementPreservesLCSSAForm(UseInst, IVOperand))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "INDVARS: Eliminated identity: " << *UseInst << '\n');
+
+ UseInst->replaceAllUsesWith(IVOperand);
+ ++NumElimIdentity;
+ Changed = true;
+ DeadInsts.emplace_back(UseInst);
+ return true;
+}
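+
+// Illustrative sketch (hypothetical IR): an identity such as
+//   %x = add i32 %iv, 0           ; getSCEV(%x) == getSCEV(%iv)
+// simply has all of its uses replaced with %iv, subject to the dominance and
+// LCSSA checks above.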
+
+/// Annotate BO with nsw / nuw if it provably does not signed-overflow /
+/// unsigned-overflow. Returns true if anything changed, false otherwise.
+bool SimplifyIndvar::strengthenOverflowingOperation(BinaryOperator *BO,
+ Value *IVOperand) {
+ // Fastpath: we don't have any work to do if `BO` is `nuw` and `nsw`.
+ if (BO->hasNoUnsignedWrap() && BO->hasNoSignedWrap())
+ return false;
+
+ if (BO->getOpcode() != Instruction::Add &&
+ BO->getOpcode() != Instruction::Sub &&
+ BO->getOpcode() != Instruction::Mul)
+ return false;
+
+ const SCEV *LHS = SE->getSCEV(BO->getOperand(0));
+ const SCEV *RHS = SE->getSCEV(BO->getOperand(1));
+ bool Changed = false;
+
+ if (!BO->hasNoUnsignedWrap() &&
+ willNotOverflow(SE, BO->getOpcode(), /* Signed */ false, LHS, RHS)) {
+ BO->setHasNoUnsignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+
+ if (!BO->hasNoSignedWrap() &&
+ willNotOverflow(SE, BO->getOpcode(), /* Signed */ true, LHS, RHS)) {
+ BO->setHasNoSignedWrap();
+ SE->forgetValue(BO);
+ Changed = true;
+ }
+
+ return Changed;
+}
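+
+// Illustrative sketch (hypothetical IR): if SCEV proves the operation can
+// wrap in neither sense,
+//   %inc = add i32 %iv, 1
+// is annotated in place as
+//   %inc = add nuw nsw i32 %iv, 1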
+
+/// Annotate the Shr in (X << IVOperand) >> C as exact using the
+/// information from the IV's range. Returns true if anything changed, false
+/// otherwise.
+bool SimplifyIndvar::strengthenRightShift(BinaryOperator *BO,
+ Value *IVOperand) {
+ using namespace llvm::PatternMatch;
+
+ if (BO->getOpcode() == Instruction::Shl) {
+ bool Changed = false;
+ ConstantRange IVRange = SE->getUnsignedRange(SE->getSCEV(IVOperand));
+ for (auto *U : BO->users()) {
+ const APInt *C;
+ if (match(U,
+ m_AShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C))) ||
+ match(U,
+ m_LShr(m_Shl(m_Value(), m_Specific(IVOperand)), m_APInt(C)))) {
+ BinaryOperator *Shr = cast<BinaryOperator>(U);
+ if (!Shr->isExact() && IVRange.getUnsignedMin().uge(*C)) {
+ Shr->setIsExact(true);
+ Changed = true;
+ }
+ }
+ }
+ return Changed;
+ }
+
+ return false;
+}
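A hypothetical example of the shr case: when the IV is never smaller than the right-shift amount, the shift only drops zero bits introduced by the shl, so it can be marked exact.

// For i in [3, 16), '(x << i) >> 3' only shifts out zero bits that the shl
// created, so the IR lshr is marked exact. Assumes y has at least 16 slots.
void shiftDemo(unsigned x, unsigned *y) {
  for (unsigned i = 3; i < 16; ++i)
    y[i] = (x << i) >> 3;
}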
+
+/// Add all uses of Def to the current IV's worklist.
+static void pushIVUsers(
+ Instruction *Def, Loop *L,
+ SmallPtrSet<Instruction*,16> &Simplified,
+ SmallVectorImpl< std::pair<Instruction*,Instruction*> > &SimpleIVUsers) {
+
+ for (User *U : Def->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Avoid infinite or exponential worklist processing.
+ // Also ensure unique worklist users.
+ // If Def is a LoopPhi, it may not be in the Simplified set, so check for
+ // self edges first.
+ if (UI == Def)
+ continue;
+
+ // Only change the current Loop, do not change the other parts (e.g. other
+ // Loops).
+ if (!L->contains(UI))
+ continue;
+
+ // Do not push the same instruction more than once.
+ if (!Simplified.insert(UI).second)
+ continue;
+
+ SimpleIVUsers.push_back(std::make_pair(UI, Def));
+ }
+}
+
+/// Return true if this instruction generates a simple SCEV
+/// expression in terms of that IV.
+///
+/// This is similar to IVUsers' isInteresting() but processes each instruction
+/// non-recursively when the operand is already known to be a simpleIVUser.
+///
+static bool isSimpleIVUser(Instruction *I, const Loop *L, ScalarEvolution *SE) {
+ if (!SE->isSCEVable(I->getType()))
+ return false;
+
+ // Get the symbolic expression for this instruction.
+ const SCEV *S = SE->getSCEV(I);
+
+ // Only consider affine recurrences.
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S);
+ if (AR && AR->getLoop() == L)
+ return true;
+
+ return false;
+}
+
+/// Iteratively perform simplification on a worklist of users
+/// of the specified induction variable. Each successive simplification may push
+/// more users which may themselves be candidates for simplification.
+///
+/// This algorithm does not require IVUsers analysis. Instead, it simplifies
+/// instructions in-place during analysis. Rather than rewriting induction
+/// variables bottom-up from their users, it transforms a chain of IVUsers
+/// top-down, updating the IR only when it encounters a clear optimization
+/// opportunity.
+///
+/// Once DisableIVRewrite is default, LSR will be the only client of IVUsers.
+///
+void SimplifyIndvar::simplifyUsers(PHINode *CurrIV, IVVisitor *V) {
+ if (!SE->isSCEVable(CurrIV->getType()))
+ return;
+
+ // Instructions processed by SimplifyIndvar for CurrIV.
+ SmallPtrSet<Instruction*,16> Simplified;
+
+  // Use-def pairs of IV users waiting to be processed for CurrIV.
+ SmallVector<std::pair<Instruction*, Instruction*>, 8> SimpleIVUsers;
+
+ // Push users of the current LoopPhi. In rare cases, pushIVUsers may be
+ // called multiple times for the same LoopPhi. This is the proper thing to
+ // do for loop header phis that use each other.
+ pushIVUsers(CurrIV, L, Simplified, SimpleIVUsers);
+
+ while (!SimpleIVUsers.empty()) {
+ std::pair<Instruction*, Instruction*> UseOper =
+ SimpleIVUsers.pop_back_val();
+ Instruction *UseInst = UseOper.first;
+
+ // If a user of the IndVar is trivially dead, we prefer just to mark it dead
+ // rather than try to do some complex analysis or transformation (such as
+    // widening) based on it.
+ // TODO: Propagate TLI and pass it here to handle more cases.
+ if (isInstructionTriviallyDead(UseInst, /* TLI */ nullptr)) {
+ DeadInsts.emplace_back(UseInst);
+ continue;
+ }
+
+ // Bypass back edges to avoid extra work.
+ if (UseInst == CurrIV) continue;
+
+ // Try to replace UseInst with a loop invariant before any other
+ // simplifications.
+ if (replaceIVUserWithLoopInvariant(UseInst))
+ continue;
+
+ Instruction *IVOperand = UseOper.second;
+ for (unsigned N = 0; IVOperand; ++N) {
+ assert(N <= Simplified.size() && "runaway iteration");
+
+ Value *NewOper = foldIVUser(UseInst, IVOperand);
+ if (!NewOper)
+ break; // done folding
+ IVOperand = dyn_cast<Instruction>(NewOper);
+ }
+ if (!IVOperand)
+ continue;
+
+ if (eliminateIVUser(UseInst, IVOperand)) {
+ pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
+ continue;
+ }
+
+ if (BinaryOperator *BO = dyn_cast<BinaryOperator>(UseInst)) {
+ if ((isa<OverflowingBinaryOperator>(BO) &&
+ strengthenOverflowingOperation(BO, IVOperand)) ||
+ (isa<ShlOperator>(BO) && strengthenRightShift(BO, IVOperand))) {
+ // re-queue uses of the now modified binary operator and fall
+ // through to the checks that remain.
+ pushIVUsers(IVOperand, L, Simplified, SimpleIVUsers);
+ }
+ }
+
+ CastInst *Cast = dyn_cast<CastInst>(UseInst);
+ if (V && Cast) {
+ V->visitCast(Cast);
+ continue;
+ }
+ if (isSimpleIVUser(UseInst, L, SE)) {
+ pushIVUsers(UseInst, L, Simplified, SimpleIVUsers);
+ }
+ }
+}
+
+namespace llvm {
+
+void IVVisitor::anchor() { }
+
+/// Simplify instructions that use this induction variable
+/// by using ScalarEvolution to analyze the IV's recurrence.
+bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead,
+ SCEVExpander &Rewriter, IVVisitor *V) {
+ SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
+ Rewriter, Dead);
+ SIV.simplifyUsers(CurrIV, V);
+ return SIV.hasChanged();
+}
+
+/// Simplify users of induction variables within this
+/// loop. This does not actually change or add IVs.
+bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead) {
+ SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
+#ifndef NDEBUG
+ Rewriter.setDebugType(DEBUG_TYPE);
+#endif
+ bool Changed = false;
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
+ Changed |=
+ simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
+ }
+ return Changed;
+}
+
+} // namespace llvm
//===----------------------------------------------------------------------===//
// Widen Induction Variables - Extend the width of an IV to cover its
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
index cbe7799239..f9a9dd237b 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -1,617 +1,617 @@
-//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the library calls simplifier. It does not implement
-// any pass, but can be used by other passes to do simplifications.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-static cl::opt<bool>
- EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
- cl::init(false),
- cl::desc("Enable unsafe double to float "
- "shrinking for math lib calls"));
-
-//===----------------------------------------------------------------------===//
-// Helper Functions
-//===----------------------------------------------------------------------===//
-
-static bool ignoreCallingConv(LibFunc Func) {
- return Func == LibFunc_abs || Func == LibFunc_labs ||
- Func == LibFunc_llabs || Func == LibFunc_strlen;
-}
-
-static bool isCallingConvCCompatible(CallInst *CI) {
- switch(CI->getCallingConv()) {
- default:
- return false;
- case llvm::CallingConv::C:
- return true;
- case llvm::CallingConv::ARM_APCS:
- case llvm::CallingConv::ARM_AAPCS:
- case llvm::CallingConv::ARM_AAPCS_VFP: {
-
- // The iOS ABI diverges from the standard in some cases, so for now don't
- // try to simplify those calls.
- if (Triple(CI->getModule()->getTargetTriple()).isiOS())
- return false;
-
- auto *FuncTy = CI->getFunctionType();
-
- if (!FuncTy->getReturnType()->isPointerTy() &&
- !FuncTy->getReturnType()->isIntegerTy() &&
- !FuncTy->getReturnType()->isVoidTy())
- return false;
-
- for (auto Param : FuncTy->params()) {
- if (!Param->isPointerTy() && !Param->isIntegerTy())
- return false;
- }
- return true;
- }
- }
- return false;
-}
-
-/// Return true if it is only used in equality comparisons with With.
-static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (IC->isEquality() && IC->getOperand(1) == With)
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-static bool callHasFloatingPointArgument(const CallInst *CI) {
- return any_of(CI->operands(), [](const Use &OI) {
- return OI->getType()->isFloatingPointTy();
- });
-}
-
-static bool callHasFP128Argument(const CallInst *CI) {
- return any_of(CI->operands(), [](const Use &OI) {
- return OI->getType()->isFP128Ty();
- });
-}
-
-static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
- if (Base < 2 || Base > 36)
- // handle special zero base
- if (Base != 0)
- return nullptr;
-
- char *End;
- std::string nptr = Str.str();
- errno = 0;
- long long int Result = strtoll(nptr.c_str(), &End, Base);
- if (errno)
- return nullptr;
-
- // if we assume all possible target locales are ASCII supersets,
- // then if strtoll successfully parses a number on the host,
- // it will also successfully parse the same way on the target
- if (*End != '\0')
- return nullptr;
-
- if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
- return nullptr;
-
- return ConstantInt::get(CI->getType(), Result);
-}
-
-static bool isOnlyUsedInComparisonWithZero(Value *V) {
- for (User *U : V->users()) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
- if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
- if (C->isNullValue())
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len,
- const DataLayout &DL) {
- if (!isOnlyUsedInComparisonWithZero(CI))
- return false;
-
- if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL))
- return false;
-
- if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory))
- return false;
-
- return true;
-}
-
-static void annotateDereferenceableBytes(CallInst *CI,
- ArrayRef<unsigned> ArgNos,
- uint64_t DereferenceableBytes) {
- const Function *F = CI->getCaller();
- if (!F)
- return;
- for (unsigned ArgNo : ArgNos) {
- uint64_t DerefBytes = DereferenceableBytes;
- unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
- if (!llvm::NullPointerIsDefined(F, AS) ||
- CI->paramHasAttr(ArgNo, Attribute::NonNull))
- DerefBytes = std::max(CI->getDereferenceableOrNullBytes(
- ArgNo + AttributeList::FirstArgIndex),
- DereferenceableBytes);
-
- if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) <
- DerefBytes) {
- CI->removeParamAttr(ArgNo, Attribute::Dereferenceable);
- if (!llvm::NullPointerIsDefined(F, AS) ||
- CI->paramHasAttr(ArgNo, Attribute::NonNull))
- CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull);
- CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes(
- CI->getContext(), DerefBytes));
- }
- }
-}
-
-static void annotateNonNullBasedOnAccess(CallInst *CI,
- ArrayRef<unsigned> ArgNos) {
- Function *F = CI->getCaller();
- if (!F)
- return;
-
- for (unsigned ArgNo : ArgNos) {
- if (CI->paramHasAttr(ArgNo, Attribute::NonNull))
- continue;
- unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
- if (llvm::NullPointerIsDefined(F, AS))
- continue;
-
- CI->addParamAttr(ArgNo, Attribute::NonNull);
- annotateDereferenceableBytes(CI, ArgNo, 1);
- }
-}
-
-static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos,
- Value *Size, const DataLayout &DL) {
- if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) {
- annotateNonNullBasedOnAccess(CI, ArgNos);
- annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue());
- } else if (isKnownNonZero(Size, DL)) {
- annotateNonNullBasedOnAccess(CI, ArgNos);
- const APInt *X, *Y;
- uint64_t DerefMin = 1;
- if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) {
- DerefMin = std::min(X->getZExtValue(), Y->getZExtValue());
- annotateDereferenceableBytes(CI, ArgNos, DerefMin);
- }
- }
-}
-
-//===----------------------------------------------------------------------===//
-// String and Memory Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
- // Extract some information from the instruction
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- annotateNonNullBasedOnAccess(CI, {0, 1});
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
- --Len; // Unbias length.
-
- // Handle the simple, do-nothing case: strcat(x, "") -> x
- if (Len == 0)
- return Dst;
-
- return emitStrLenMemCpy(Src, Dst, Len, B);
-}
-
-Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
- IRBuilderBase &B) {
- // We need to find the end of the destination string. That's where the
- // memory is to be moved to. We just generate a call to strlen.
- Value *DstLen = emitStrLen(Dst, B, DL, TLI);
- if (!DstLen)
- return nullptr;
-
- // Now that we have the destination's length, we must index into the
- // destination's pointer to get the actual memcpy destination (end of
- // the string .. we're concatenating).
- Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
-
- // We have enough information to now generate the memcpy call to do the
- // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
- B.CreateMemCpy(
- CpyDst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
- // Extract some information from the instruction.
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- uint64_t Len;
- annotateNonNullBasedOnAccess(CI, 0);
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, 1);
-
- // We don't do anything if length is not constant.
- ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size);
- if (LengthArg) {
- Len = LengthArg->getZExtValue();
- // strncat(x, c, 0) -> x
- if (!Len)
- return Dst;
- } else {
- return nullptr;
- }
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen) {
- annotateDereferenceableBytes(CI, 1, SrcLen);
- --SrcLen; // Unbias length.
- } else {
- return nullptr;
- }
-
- // strncat(x, "", c) -> x
- if (SrcLen == 0)
- return Dst;
-
- // We don't optimize this case.
- if (Len < SrcLen)
- return nullptr;
-
- // strncat(x, s, c) -> strcat(x, s)
- // s is constant so the strcat can be optimized further.
- return emitStrLenMemCpy(Src, Dst, SrcLen, B);
-}
-
-Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- Value *SrcStr = CI->getArgOperand(0);
- annotateNonNullBasedOnAccess(CI, 0);
-
- // If the second operand is non-constant, see if we can compute the length
- // of the input string and turn this into memchr.
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!CharC) {
- uint64_t Len = GetStringLength(SrcStr);
- if (Len)
- annotateDereferenceableBytes(CI, 0, Len);
- else
- return nullptr;
- if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
- return nullptr;
-
- return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
- B, DL, TLI);
- }
-
- // Otherwise, the character is a constant, see if the first argument is
- // a string literal. If so, we can constant fold.
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str)) {
- if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
- if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI))
- return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr");
- return nullptr;
- }
-
- // Compute the offset, make sure to handle the case when we're searching for
- // zero (a weird way to spell strlen).
- size_t I = (0xFF & CharC->getSExtValue()) == 0
- ? Str.size()
- : Str.find(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. strchr returns null.
- return Constant::getNullValue(CI->getType());
-
- // strchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
-}
-
-Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
- Value *SrcStr = CI->getArgOperand(0);
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- annotateNonNullBasedOnAccess(CI, 0);
-
- // Cannot fold anything if we're not looking for a constant.
- if (!CharC)
- return nullptr;
-
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str)) {
- // strrchr(s, 0) -> strchr(s, 0)
- if (CharC->isZero())
- return emitStrChr(SrcStr, '\0', B, TLI);
- return nullptr;
- }
-
- // Compute the offset.
- size_t I = (0xFF & CharC->getSExtValue()) == 0
- ? Str.size()
- : Str.rfind(CharC->getSExtValue());
- if (I == StringRef::npos) // Didn't find the char. Return null.
- return Constant::getNullValue(CI->getType());
-
- // strrchr(s+n,c) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
-}
-
-Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
- Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
- if (Str1P == Str2P) // strcmp(x,x) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strcmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2)
- return ConstantInt::get(CI->getType(), Str1.compare(Str2));
-
- if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
- return B.CreateNeg(B.CreateZExt(
- B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
- return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
- CI->getType());
-
- // strcmp(P, "x") -> memcmp(P, "x", 2)
- uint64_t Len1 = GetStringLength(Str1P);
- if (Len1)
- annotateDereferenceableBytes(CI, 0, Len1);
- uint64_t Len2 = GetStringLength(Str2P);
- if (Len2)
- annotateDereferenceableBytes(CI, 1, Len2);
-
- if (Len1 && Len2) {
- return emitMemCmp(Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- std::min(Len1, Len2)),
- B, DL, TLI);
- }
-
- // strcmp to memcmp
- if (!HasStr1 && HasStr2) {
- if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
- } else if (HasStr1 && !HasStr2) {
- if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
- Value *Str1P = CI->getArgOperand(0);
- Value *Str2P = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- if (Str1P == Str2P) // strncmp(x,x,n) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, {0, 1});
- // Get the length argument if it is constant.
- uint64_t Length;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
- Length = LengthArg->getZExtValue();
- else
- return nullptr;
-
- if (Length == 0) // strncmp(x,y,0) -> 0
- return ConstantInt::get(CI->getType(), 0);
-
- if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
- return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI);
-
- StringRef Str1, Str2;
- bool HasStr1 = getConstantStringInfo(Str1P, Str1);
- bool HasStr2 = getConstantStringInfo(Str2P, Str2);
-
- // strncmp(x, y) -> cnst (if both x and y are constant strings)
- if (HasStr1 && HasStr2) {
- StringRef SubStr1 = Str1.substr(0, Length);
- StringRef SubStr2 = Str2.substr(0, Length);
- return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
- }
-
- if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
- return B.CreateNeg(B.CreateZExt(
- B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
-
- if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
- return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
- CI->getType());
-
- uint64_t Len1 = GetStringLength(Str1P);
- if (Len1)
- annotateDereferenceableBytes(CI, 0, Len1);
- uint64_t Len2 = GetStringLength(Str2P);
- if (Len2)
- annotateDereferenceableBytes(CI, 1, Len2);
-
- // strncmp to memcmp
- if (!HasStr1 && HasStr2) {
- Len2 = std::min(Len2, Length);
- if (canTransformToMemCmp(CI, Str1P, Len2, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
- TLI);
- } else if (HasStr1 && !HasStr2) {
- Len1 = std::min(Len1, Length);
- if (canTransformToMemCmp(CI, Str2P, Len1, DL))
- return emitMemCmp(
- Str1P, Str2P,
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
- TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
- Value *Src = CI->getArgOperand(0);
- ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen && Size) {
- annotateDereferenceableBytes(CI, 0, SrcLen);
- if (SrcLen <= Size->getZExtValue() + 1)
- return emitStrDup(Src, B, TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) // strcpy(x,x) -> x
- return Src;
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- // We have enough information to now generate the memcpy call to do the
- // copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
- NewCI->setAttributes(CI->getAttributes());
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the library calls simplifier. It does not implement
+// any pass, but can be used by other passes to do simplifications.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+static cl::opt<bool>
+ EnableUnsafeFPShrink("enable-double-float-shrink", cl::Hidden,
+ cl::init(false),
+ cl::desc("Enable unsafe double to float "
+ "shrinking for math lib calls"));
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+static bool ignoreCallingConv(LibFunc Func) {
+ return Func == LibFunc_abs || Func == LibFunc_labs ||
+ Func == LibFunc_llabs || Func == LibFunc_strlen;
+}
+
+static bool isCallingConvCCompatible(CallInst *CI) {
+ switch(CI->getCallingConv()) {
+ default:
+ return false;
+ case llvm::CallingConv::C:
+ return true;
+ case llvm::CallingConv::ARM_APCS:
+ case llvm::CallingConv::ARM_AAPCS:
+ case llvm::CallingConv::ARM_AAPCS_VFP: {
+
+ // The iOS ABI diverges from the standard in some cases, so for now don't
+ // try to simplify those calls.
+ if (Triple(CI->getModule()->getTargetTriple()).isiOS())
+ return false;
+
+ auto *FuncTy = CI->getFunctionType();
+
+ if (!FuncTy->getReturnType()->isPointerTy() &&
+ !FuncTy->getReturnType()->isIntegerTy() &&
+ !FuncTy->getReturnType()->isVoidTy())
+ return false;
+
+ for (auto Param : FuncTy->params()) {
+ if (!Param->isPointerTy() && !Param->isIntegerTy())
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Return true if it is only used in equality comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (User *U : V->users()) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+ return any_of(CI->operands(), [](const Use &OI) {
+ return OI->getType()->isFloatingPointTy();
+ });
+}
+
+static bool callHasFP128Argument(const CallInst *CI) {
+ return any_of(CI->operands(), [](const Use &OI) {
+ return OI->getType()->isFP128Ty();
+ });
+}
+
+static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
+ if (Base < 2 || Base > 36)
+ // handle special zero base
+ if (Base != 0)
+ return nullptr;
+
+ char *End;
+ std::string nptr = Str.str();
+ errno = 0;
+ long long int Result = strtoll(nptr.c_str(), &End, Base);
+ if (errno)
+ return nullptr;
+
+ // if we assume all possible target locales are ASCII supersets,
+ // then if strtoll successfully parses a number on the host,
+ // it will also successfully parse the same way on the target
+ if (*End != '\0')
+ return nullptr;
+
+ if (!isIntN(CI->getType()->getPrimitiveSizeInBits(), Result))
+ return nullptr;
+
+ return ConstantInt::get(CI->getType(), Result);
+}
+
+static bool isOnlyUsedInComparisonWithZero(Value *V) {
+ for (User *U : V->users()) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
+static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len,
+ const DataLayout &DL) {
+ if (!isOnlyUsedInComparisonWithZero(CI))
+ return false;
+
+ if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL))
+ return false;
+
+ if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
+ return true;
+}
+
+static void annotateDereferenceableBytes(CallInst *CI,
+ ArrayRef<unsigned> ArgNos,
+ uint64_t DereferenceableBytes) {
+ const Function *F = CI->getCaller();
+ if (!F)
+ return;
+ for (unsigned ArgNo : ArgNos) {
+ uint64_t DerefBytes = DereferenceableBytes;
+ unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
+ if (!llvm::NullPointerIsDefined(F, AS) ||
+ CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ DerefBytes = std::max(CI->getDereferenceableOrNullBytes(
+ ArgNo + AttributeList::FirstArgIndex),
+ DereferenceableBytes);
+
+ if (CI->getDereferenceableBytes(ArgNo + AttributeList::FirstArgIndex) <
+ DerefBytes) {
+ CI->removeParamAttr(ArgNo, Attribute::Dereferenceable);
+ if (!llvm::NullPointerIsDefined(F, AS) ||
+ CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ CI->removeParamAttr(ArgNo, Attribute::DereferenceableOrNull);
+ CI->addParamAttr(ArgNo, Attribute::getWithDereferenceableBytes(
+ CI->getContext(), DerefBytes));
+ }
+ }
+}
+
+static void annotateNonNullBasedOnAccess(CallInst *CI,
+ ArrayRef<unsigned> ArgNos) {
+ Function *F = CI->getCaller();
+ if (!F)
+ return;
+
+ for (unsigned ArgNo : ArgNos) {
+ if (CI->paramHasAttr(ArgNo, Attribute::NonNull))
+ continue;
+ unsigned AS = CI->getArgOperand(ArgNo)->getType()->getPointerAddressSpace();
+ if (llvm::NullPointerIsDefined(F, AS))
+ continue;
+
+ CI->addParamAttr(ArgNo, Attribute::NonNull);
+ annotateDereferenceableBytes(CI, ArgNo, 1);
+ }
+}
+
+static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> ArgNos,
+ Value *Size, const DataLayout &DL) {
+ if (ConstantInt *LenC = dyn_cast<ConstantInt>(Size)) {
+ annotateNonNullBasedOnAccess(CI, ArgNos);
+ annotateDereferenceableBytes(CI, ArgNos, LenC->getZExtValue());
+ } else if (isKnownNonZero(Size, DL)) {
+ annotateNonNullBasedOnAccess(CI, ArgNos);
+ const APInt *X, *Y;
+ uint64_t DerefMin = 1;
+ if (match(Size, m_Select(m_Value(), m_APInt(X), m_APInt(Y)))) {
+ DerefMin = std::min(X->getZExtValue(), Y->getZExtValue());
+ annotateDereferenceableBytes(CI, ArgNos, DerefMin);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
+ // Extract some information from the instruction
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+ --Len; // Unbias length.
+
+ // Handle the simple, do-nothing case: strcat(x, "") -> x
+ if (Len == 0)
+ return Dst;
+
+ return emitStrLenMemCpy(Src, Dst, Len, B);
+}
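For reference, a sketch of the source-level effect of this fold (hypothetical demo; the rewrite happens on IR):

#include <cstring>
// Assumes buf is nul-terminated and has room for the appended bytes.
void strcatDemo(char *buf) {
  strcat(buf, "");     // folded away: the result is just 'buf'
  strcat(buf, "abc");  // lowered to strlen(buf) plus a 4-byte memcpy of "abc\0"
}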
+
+Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+ IRBuilderBase &B) {
+ // We need to find the end of the destination string. That's where the
+ // memory is to be moved to. We just generate a call to strlen.
+ Value *DstLen = emitStrLen(Dst, B, DL, TLI);
+ if (!DstLen)
+ return nullptr;
+
+ // Now that we have the destination's length, we must index into the
+ // destination's pointer to get the actual memcpy destination (end of
+ // the string .. we're concatenating).
+ Value *CpyDst = B.CreateGEP(B.getInt8Ty(), Dst, DstLen, "endptr");
+
+ // We have enough information to now generate the memcpy call to do the
+ // concatenation for us. Make a memcpy to copy the nul byte with align = 1.
+ B.CreateMemCpy(
+ CpyDst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
+ // Extract some information from the instruction.
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ uint64_t Len;
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, 1);
+
+ // We don't do anything if length is not constant.
+ ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size);
+ if (LengthArg) {
+ Len = LengthArg->getZExtValue();
+ // strncat(x, c, 0) -> x
+ if (!Len)
+ return Dst;
+ } else {
+ return nullptr;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen) {
+ annotateDereferenceableBytes(CI, 1, SrcLen);
+ --SrcLen; // Unbias length.
+ } else {
+ return nullptr;
+ }
+
+ // strncat(x, "", c) -> x
+ if (SrcLen == 0)
+ return Dst;
+
+ // We don't optimize this case.
+ if (Len < SrcLen)
+ return nullptr;
+
+ // strncat(x, s, c) -> strcat(x, s)
+ // s is constant so the strcat can be optimized further.
+ return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+}
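Likewise for strncat, assuming a constant bound and a constant source string (hypothetical demo):

#include <cstring>
// Assumes dst is nul-terminated and has room for the appended bytes.
char *strncatDemo(char *dst, const char *s) {
  strncat(dst, s, 0);            // folded to just 'dst'
  strncat(dst, "", 8);           // folded to just 'dst'
  return strncat(dst, "ab", 8);  // bound >= strlen("ab"), so lowered like strcat
}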
+
+Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ Value *SrcStr = CI->getArgOperand(0);
+ annotateNonNullBasedOnAccess(CI, 0);
+
+ // If the second operand is non-constant, see if we can compute the length
+ // of the input string and turn this into memchr.
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!CharC) {
+ uint64_t Len = GetStringLength(SrcStr);
+ if (Len)
+ annotateDereferenceableBytes(CI, 0, Len);
+ else
+ return nullptr;
+ if (!FT->getParamType(1)->isIntegerTy(32)) // memchr needs i32.
+ return nullptr;
+
+ return emitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len),
+ B, DL, TLI);
+ }
+
+ // Otherwise, the character is a constant, see if the first argument is
+ // a string literal. If so, we can constant fold.
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ if (CharC->isZero()) // strchr(p, 0) -> p + strlen(p)
+ if (Value *StrLen = emitStrLen(SrcStr, B, DL, TLI))
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, StrLen, "strchr");
+ return nullptr;
+ }
+
+ // Compute the offset, make sure to handle the case when we're searching for
+ // zero (a weird way to spell strlen).
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.find(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. strchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // strchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
+}
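The strchr folds, sketched at the source level (hypothetical demo):

#include <cstring>
bool strchrDemo(const char *p, int c) {
  const char *a = strchr("hello", 'l');  // constant-folded to "hello" + 2
  const char *b = strchr(p, 0);          // rewritten as p + strlen(p)
  const char *d = strchr("hello", c);    // rewritten as memchr("hello", c, 6)
  return a && b && d;
}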
+
+Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ annotateNonNullBasedOnAccess(CI, 0);
+
+ // Cannot fold anything if we're not looking for a constant.
+ if (!CharC)
+ return nullptr;
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str)) {
+ // strrchr(s, 0) -> strchr(s, 0)
+ if (CharC->isZero())
+ return emitStrChr(SrcStr, '\0', B, TLI);
+ return nullptr;
+ }
+
+ // Compute the offset.
+ size_t I = (0xFF & CharC->getSExtValue()) == 0
+ ? Str.size()
+ : Str.rfind(CharC->getSExtValue());
+ if (I == StringRef::npos) // Didn't find the char. Return null.
+ return Constant::getNullValue(CI->getType());
+
+ // strrchr(s+n,c) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
+}
+
+Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
+ Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+ if (Str1P == Str2P) // strcmp(x,x) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strcmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2)
+ return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+ if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+ return B.CreateNeg(B.CreateZExt(
+ B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+ return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
+ CI->getType());
+
+ // strcmp(P, "x") -> memcmp(P, "x", 2)
+ uint64_t Len1 = GetStringLength(Str1P);
+ if (Len1)
+ annotateDereferenceableBytes(CI, 0, Len1);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len2)
+ annotateDereferenceableBytes(CI, 1, Len2);
+
+ if (Len1 && Len2) {
+ return emitMemCmp(Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ std::min(Len1, Len2)),
+ B, DL, TLI);
+ }
+
+ // strcmp to memcmp
+ if (!HasStr1 && HasStr2) {
+ if (canTransformToMemCmp(CI, Str1P, Len2, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
+ TLI);
+ } else if (HasStr1 && !HasStr2) {
+ if (canTransformToMemCmp(CI, Str2P, Len1, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
+ TLI);
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
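The strcmp folds, sketched at the source level (hypothetical demo):

#include <cstring>
int strcmpDemo(const char *x) {
  int a = strcmp(x, x);        // folded to 0
  int b = strcmp("", x);       // folded to -(unsigned char)*x
  int c = strcmp(x, "");       // folded to (unsigned char)*x
  int d = strcmp("ab", "ac");  // constant-folded at compile time
  return a + b + c + d;
}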
+
+Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
+ Value *Str1P = CI->getArgOperand(0);
+ Value *Str2P = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ if (Str1P == Str2P) // strncmp(x,x,n) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ // Get the length argument if it is constant.
+ uint64_t Length;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
+ Length = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ if (Length == 0) // strncmp(x,y,0) -> 0
+ return ConstantInt::get(CI->getType(), 0);
+
+ if (Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+ return emitMemCmp(Str1P, Str2P, Size, B, DL, TLI);
+
+ StringRef Str1, Str2;
+ bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+ bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+ // strncmp(x, y) -> cnst (if both x and y are constant strings)
+ if (HasStr1 && HasStr2) {
+ StringRef SubStr1 = Str1.substr(0, Length);
+ StringRef SubStr2 = Str2.substr(0, Length);
+ return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+ }
+
+ if (HasStr1 && Str1.empty()) // strncmp("", x, n) -> -*x
+ return B.CreateNeg(B.CreateZExt(
+ B.CreateLoad(B.getInt8Ty(), Str2P, "strcmpload"), CI->getType()));
+
+ if (HasStr2 && Str2.empty()) // strncmp(x, "", n) -> *x
+ return B.CreateZExt(B.CreateLoad(B.getInt8Ty(), Str1P, "strcmpload"),
+ CI->getType());
+
+ uint64_t Len1 = GetStringLength(Str1P);
+ if (Len1)
+ annotateDereferenceableBytes(CI, 0, Len1);
+ uint64_t Len2 = GetStringLength(Str2P);
+ if (Len2)
+ annotateDereferenceableBytes(CI, 1, Len2);
+
+ // strncmp to memcmp
+ if (!HasStr1 && HasStr2) {
+ Len2 = std::min(Len2, Length);
+ if (canTransformToMemCmp(CI, Str1P, Len2, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len2), B, DL,
+ TLI);
+ } else if (HasStr1 && !HasStr2) {
+ Len1 = std::min(Len1, Length);
+ if (canTransformToMemCmp(CI, Str2P, Len1, DL))
+ return emitMemCmp(
+ Str1P, Str2P,
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len1), B, DL,
+ TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
+ Value *Src = CI->getArgOperand(0);
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen && Size) {
+ annotateDereferenceableBytes(CI, 0, SrcLen);
+ if (SrcLen <= Size->getZExtValue() + 1)
+ return emitStrDup(Src, B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) // strcpy(x,x) -> x
+ return Src;
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ CallInst *NewCI =
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
- if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
- Value *StrLen = emitStrLen(Src, B, DL, TLI);
- return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
- }
-
- // See if we can get the length of the input string.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- Type *PT = Callee->getFunctionType()->getParamType(0);
- Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
- Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
- ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
-
- // We have enough information to now generate the memcpy call to do the
- // copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
- NewCI->setAttributes(CI->getAttributes());
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+ if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // See if we can get the length of the input string.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ Value *LenV = ConstantInt::get(DL.getIntPtrType(PT), Len);
+ Value *DstEnd = B.CreateGEP(B.getInt8Ty(), Dst,
+ ConstantInt::get(DL.getIntPtrType(PT), Len - 1));
+
+ // We have enough information to now generate the memcpy call to do the
+ // copy for us. Make a memcpy to copy the nul byte with align = 1.
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return DstEnd;
-}
-
-Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- annotateNonNullBasedOnAccess(CI, 0);
- if (isKnownNonZero(Size, DL))
- annotateNonNullBasedOnAccess(CI, 1);
-
- uint64_t Len;
- if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
- Len = LengthArg->getZExtValue();
- else
- return nullptr;
-
- // strncpy(x, y, 0) -> x
- if (Len == 0)
- return Dst;
-
- // See if we can get the length of the input string.
- uint64_t SrcLen = GetStringLength(Src);
- if (SrcLen) {
- annotateDereferenceableBytes(CI, 1, SrcLen);
- --SrcLen; // Unbias length.
- } else {
- return nullptr;
- }
-
- if (SrcLen == 0) {
- // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
- CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1));
- AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
- NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
- CI->getContext(), 0, ArgAttrs));
- return Dst;
- }
-
+ return DstEnd;
+}
+
+Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (isKnownNonZero(Size, DL))
+ annotateNonNullBasedOnAccess(CI, 1);
+
+ uint64_t Len;
+ if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(Size))
+ Len = LengthArg->getZExtValue();
+ else
+ return nullptr;
+
+ // strncpy(x, y, 0) -> x
+ if (Len == 0)
+ return Dst;
+
+ // See if we can get the length of the input string.
+ uint64_t SrcLen = GetStringLength(Src);
+ if (SrcLen) {
+ annotateDereferenceableBytes(CI, 1, SrcLen);
+ --SrcLen; // Unbias length.
+ } else {
+ return nullptr;
+ }
+
+ if (SrcLen == 0) {
+ // strncpy(x, "", y) -> memset(align 1 x, '\0', y)
+ CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1));
+ AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
+ NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
+ CI->getContext(), 0, ArgAttrs));
+ return Dst;
+ }
+
// strncpy(a, "a", 4) - > memcpy(a, "a\0\0\0", 4)
if (Len > SrcLen + 1) {
if (Len <= 128) {
@@ -625,1034 +625,1034 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
return nullptr;
}
}
-
- Type *PT = Callee->getFunctionType()->getParamType(0);
- // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
- CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
- ConstantInt::get(DL.getIntPtrType(PT), Len));
- NewCI->setAttributes(CI->getAttributes());
+
+ Type *PT = Callee->getFunctionType()->getParamType(0);
+ // strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
+ ConstantInt::get(DL.getIntPtrType(PT), Len));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return Dst;
-}
-
-Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
- unsigned CharSize) {
- Value *Src = CI->getArgOperand(0);
-
- // Constant folding: strlen("xyz") -> 3
- if (uint64_t Len = GetStringLength(Src, CharSize))
- return ConstantInt::get(CI->getType(), Len - 1);
-
- // If s is a constant pointer pointing to a string literal, we can fold
- // strlen(s + x) to strlen(s) - x, when x is known to be in the range
- // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
- // We only try to simplify strlen when the pointer s points to an array
- // of i8. Otherwise, we would need to scale the offset x before doing the
- // subtraction. This will make the optimization more complex, and it's not
- // very useful because calling strlen for a pointer of other types is
- // very uncommon.
- if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
- if (!isGEPBasedOnPointerToString(GEP, CharSize))
- return nullptr;
-
- ConstantDataArraySlice Slice;
- if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
- uint64_t NullTermIdx;
- if (Slice.Array == nullptr) {
- NullTermIdx = 0;
- } else {
- NullTermIdx = ~((uint64_t)0);
- for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
- if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
- NullTermIdx = I;
- break;
- }
- }
- // If the string does not have '\0', leave it to strlen to compute
- // its length.
- if (NullTermIdx == ~((uint64_t)0))
- return nullptr;
- }
-
- Value *Offset = GEP->getOperand(2);
- KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
- Known.Zero.flipAllBits();
- uint64_t ArrSize =
- cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
-
- // KnownZero's bits are flipped, so zeros in KnownZero now represent
-      // bits known to be zeros in Offset, and ones in KnownZero represent
- // bits unknown in Offset. Therefore, Offset is known to be in range
- // [0, NullTermIdx] when the flipped KnownZero is non-negative and
- // unsigned-less-than NullTermIdx.
- //
- // If Offset is not provably in the range [0, NullTermIdx], we can still
- // optimize if we can prove that the program has undefined behavior when
- // Offset is outside that range. That is the case when GEP->getOperand(0)
- // is a pointer to an object whose memory extent is NullTermIdx+1.
- if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
- (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
- NullTermIdx == ArrSize - 1)) {
- Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
- return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
- Offset);
- }
- }
- }
-
- // strlen(x?"foo":"bars") --> x ? 3 : 4
- if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
- uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
- uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
- if (LenTrue && LenFalse) {
- ORE.emit([&]() {
- return OptimizationRemark("instcombine", "simplify-libcalls", CI)
- << "folded strlen(select) to select of constants";
- });
- return B.CreateSelect(SI->getCondition(),
- ConstantInt::get(CI->getType(), LenTrue - 1),
- ConstantInt::get(CI->getType(), LenFalse - 1));
- }
- }
-
- // strlen(x) != 0 --> *x != 0
- // strlen(x) == 0 --> *x == 0
- if (isOnlyUsedInZeroEqualityComparison(CI))
- return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"),
- CI->getType());
-
- return nullptr;
-}
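The strlen folds, sketched at the source level (hypothetical demo):

#include <cstddef>
#include <cstring>
size_t strlenDemo(const char *p, bool cond) {
  size_t a = strlen("xyz");                  // constant-folded to 3
  size_t b = strlen(cond ? "foo" : "bars");  // folded to cond ? 3 : 4
  bool   c = strlen(p) == 0;                 // folded to *p == 0
  return a + b + c;
}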
-
-Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeStringLength(CI, B, 8))
- return V;
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) {
- Module &M = *CI->getModule();
- unsigned WCharSize = TLI->getWCharSize(M) * 8;
- // We cannot perform this optimization without wchar_size metadata.
- if (WCharSize == 0)
- return nullptr;
-
- return optimizeStringLength(CI, B, WCharSize);
-}
-
-Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strpbrk(s, "") -> nullptr
- // strpbrk("", s) -> nullptr
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t I = S1.find_first_of(S2);
- if (I == StringRef::npos) // No match.
- return Constant::getNullValue(CI->getType());
-
- return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
- "strpbrk");
- }
-
- // strpbrk(s, "a") -> strchr(s, 'a')
- if (HasS2 && S2.size() == 1)
- return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
-
- return nullptr;
-}
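The strpbrk folds, sketched at the source level (hypothetical demo):

#include <cstring>
const char *strpbrkDemo(const char *s) {
  const char *a = strpbrk(s, "");          // folded to a null pointer
  const char *b = strpbrk(s, "a");         // rewritten as strchr(s, 'a')
  const char *c = strpbrk("hello", "lo");  // constant-folded to "hello" + 2
  return a ? a : (b ? b : c);
}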
-
-Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) {
- Value *EndPtr = CI->getArgOperand(1);
- if (isa<ConstantPointerNull>(EndPtr)) {
- // With a null EndPtr, this function won't capture the main argument.
- // It would be readonly too, except that it still may write to errno.
- CI->addParamAttr(0, Attribute::NoCapture);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strspn(s, "") -> 0
- // strspn("", s) -> 0
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_not_of(S2);
- if (Pos == StringRef::npos)
- Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strcspn("", s) -> 0
- if (HasS1 && S1.empty())
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_of(S2);
- if (Pos == StringRef::npos)
- Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- // strcspn(s, "") -> strlen(s)
- if (HasS2 && S2.empty())
- return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
-
- return nullptr;
-}
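The strspn/strcspn folds from the two helpers above, sketched at the source level (hypothetical demo):

#include <cstddef>
#include <cstring>
size_t spanDemo(const char *s) {
  size_t a = strspn(s, "");           // folded to 0
  size_t b = strcspn(s, "");          // rewritten as strlen(s)
  size_t c = strcspn("abcde", "dx");  // constant-folded to 3
  return a + b + c;
}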
-
-Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
- // fold strstr(x, x) -> x.
- if (CI->getArgOperand(0) == CI->getArgOperand(1))
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
- if (!StrLen)
- return nullptr;
- Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, DL, TLI);
- if (!StrNCmp)
- return nullptr;
- for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
- ICmpInst *Old = cast<ICmpInst>(*UI++);
- Value *Cmp =
- B.CreateICmp(Old->getPredicate(), StrNCmp,
- ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
- replaceAllUsesWith(Old, Cmp);
- }
- return CI;
- }
-
- // See if either input string is a constant string.
- StringRef SearchStr, ToFindStr;
- bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
- bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
- // fold strstr(x, "") -> x.
- if (HasStr2 && ToFindStr.empty())
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // If both strings are known, constant fold it.
- if (HasStr1 && HasStr2) {
- size_t Offset = SearchStr.find(ToFindStr);
-
- if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
- return Constant::getNullValue(CI->getType());
-
- // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = castToCStr(CI->getArgOperand(0), B);
- Result =
- B.CreateConstInBoundsGEP1_64(B.getInt8Ty(), Result, Offset, "strstr");
- return B.CreateBitCast(Result, CI->getType());
- }
-
- // fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
- return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
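The strstr folds, sketched at the source level (hypothetical demo):

#include <cstring>
const char *strstrDemo(const char *x) {
  const char *a = strstr(x, x);          // folded to x
  const char *b = strstr(x, "");         // folded to x
  const char *c = strstr("abcd", "bc");  // constant-folded to "abcd" + 1
  const char *d = strstr(x, "y");        // rewritten as strchr(x, 'y')
  return a && b && d ? c : nullptr;
}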
-
-Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) {
- if (isKnownNonZero(CI->getOperand(2), DL))
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) {
- Value *SrcStr = CI->getArgOperand(0);
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, 0, Size, DL);
- ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
-
- // memchr(x, y, 0) -> null
- if (LenC) {
- if (LenC->isZero())
- return Constant::getNullValue(CI->getType());
- } else {
- // From now on we need at least constant length and string.
- return nullptr;
- }
-
- StringRef Str;
- if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
- return nullptr;
-
- // Truncate the string to LenC. If Str is smaller than LenC we will still only
- // scan the string, as reading past the end of it is undefined and we can just
- // return null if we don't find the char.
- Str = Str.substr(0, LenC->getZExtValue());
-
- // If the char is variable but the input str and length are not we can turn
- // this memchr call into a simple bit field test. Of course this only works
- // when the return value is only checked against null.
- //
- // It would be really nice to reuse switch lowering here but we can't change
- // the CFG at this point.
- //
- // memchr("\r\n", C, 2) != nullptr -> (1 << C & ((1 << '\r') | (1 << '\n')))
- // != 0
- // after bounds check.
- if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
- unsigned char Max =
- *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
- reinterpret_cast<const unsigned char *>(Str.end()));
-
- // Make sure the bit field we're about to create fits in a register on the
- // target.
- // FIXME: On a 64 bit architecture this prevents us from using the
- // interesting range of alpha ascii chars. We could do better by emitting
- // two bitfields or shifting the range by 64 if no lower chars are used.
- if (!DL.fitsInLegalInteger(Max + 1))
- return nullptr;
-
- // For the bit field use a power-of-2 type with at least 8 bits to avoid
- // creating unnecessary illegal types.
- unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
-
- // Now build the bit field.
- APInt Bitfield(Width, 0);
- for (char C : Str)
- Bitfield.setBit((unsigned char)C);
- Value *BitfieldC = B.getInt(Bitfield);
-
- // Adjust width of "C" to the bitfield width, then mask off the high bits.
- Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
- C = B.CreateAnd(C, B.getIntN(Width, 0xFF));
-
- // First check that the bit field access is within bounds.
- Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
- "memchr.bounds");
-
- // Create code that checks if the given bit is set in the field.
- Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
- Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
-
- // Finally merge both checks and cast to pointer type. The inttoptr
- // implicitly zexts the i1 to intptr type.
- return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
- }
-
- // Check if all arguments are constants. If so, we can constant fold.
- if (!CharC)
- return nullptr;
-
- // Compute the offset.
- size_t I = Str.find(CharC->getSExtValue() & 0xFF);
- if (I == StringRef::npos) // Didn't find the char. memchr returns null.
- return Constant::getNullValue(CI->getType());
-
- // memchr(s+n,c,l) -> gep(s+n+i,c)
- return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr");
-}
-
-static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
- uint64_t Len, IRBuilderBase &B,
- const DataLayout &DL) {
- if (Len == 0) // memcmp(s1,s2,0) -> 0
- return Constant::getNullValue(CI->getType());
-
- // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
- if (Len == 1) {
- Value *LHSV =
- B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(LHS, B), "lhsc"),
- CI->getType(), "lhsv");
- Value *RHSV =
- B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(RHS, B), "rhsc"),
- CI->getType(), "rhsv");
- return B.CreateSub(LHSV, RHSV, "chardiff");
- }
-
- // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
- // TODO: The case where both inputs are constants does not need to be limited
- // to legal integers or equality comparison. See block below this.
- if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
- IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
- unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
-
- // First, see if we can fold either argument to a constant.
- Value *LHSV = nullptr;
- if (auto *LHSC = dyn_cast<Constant>(LHS)) {
- LHSC = ConstantExpr::getBitCast(LHSC, IntType->getPointerTo());
- LHSV = ConstantFoldLoadFromConstPtr(LHSC, IntType, DL);
- }
- Value *RHSV = nullptr;
- if (auto *RHSC = dyn_cast<Constant>(RHS)) {
- RHSC = ConstantExpr::getBitCast(RHSC, IntType->getPointerTo());
- RHSV = ConstantFoldLoadFromConstPtr(RHSC, IntType, DL);
- }
-
- // Don't generate unaligned loads. If either source is constant data,
- // alignment doesn't matter for that source because there is no load.
- if ((LHSV || getKnownAlignment(LHS, DL, CI) >= PrefAlignment) &&
- (RHSV || getKnownAlignment(RHS, DL, CI) >= PrefAlignment)) {
- if (!LHSV) {
- Type *LHSPtrTy =
- IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
- LHSV = B.CreateLoad(IntType, B.CreateBitCast(LHS, LHSPtrTy), "lhsv");
- }
- if (!RHSV) {
- Type *RHSPtrTy =
- IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
- RHSV = B.CreateLoad(IntType, B.CreateBitCast(RHS, RHSPtrTy), "rhsv");
- }
- return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
- }
- }
-
- // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const).
- // TODO: This is limited to i8 arrays.
- StringRef LHSStr, RHSStr;
- if (getConstantStringInfo(LHS, LHSStr) &&
- getConstantStringInfo(RHS, RHSStr)) {
- // Make sure we're not reading out-of-bounds memory.
- if (Len > LHSStr.size() || Len > RHSStr.size())
- return nullptr;
- // Fold the memcmp and normalize the result. This way we get consistent
- // results across multiple platforms.
- uint64_t Ret = 0;
- int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
- if (Cmp < 0)
- Ret = -1;
- else if (Cmp > 0)
- Ret = 1;
- return ConstantInt::get(CI->getType(), Ret);
- }
-
- return nullptr;
-}
-
-// Most simplifications for memcmp also apply to bcmp.
-Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
- IRBuilderBase &B) {
- Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
-
- if (LHS == RHS) // memcmp(s,s,x) -> 0
- return Constant::getNullValue(CI->getType());
-
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- // Handle constant lengths.
- ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
- if (!LenC)
- return nullptr;
-
- // memcmp(d,s,0) -> 0
- if (LenC->getZExtValue() == 0)
- return Constant::getNullValue(CI->getType());
-
- if (Value *Res =
- optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL))
- return Res;
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
- return V;
-
- // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
- // bcmp can be more efficient than memcmp because it only has to know that
- // there is a difference, not how different one is to the other.
- if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) {
- Value *LHS = CI->getArgOperand(0);
- Value *RHS = CI->getArgOperand(1);
- Value *Size = CI->getArgOperand(2);
- return emitBCmp(LHS, RHS, Size, B, DL, TLI);
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
- return optimizeMemCmpBCmpCommon(CI, B);
-}
-
-Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
- CI->getArgOperand(1), Align(1), Size);
- NewCI->setAttributes(CI->getAttributes());
+ return Dst;
+}
+
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
+ unsigned CharSize) {
+ Value *Src = CI->getArgOperand(0);
+
+ // Constant folding: strlen("xyz") -> 3
+ if (uint64_t Len = GetStringLength(Src, CharSize))
+ return ConstantInt::get(CI->getType(), Len - 1);
+
+ // If s is a constant pointer pointing to a string literal, we can fold
+ // strlen(s + x) to strlen(s) - x, when x is known to be in the range
+ // [0, strlen(s)] or the string has a single null terminator '\0' at the end.
+ // We only try to simplify strlen when the pointer s points to an array
+ // of i8. Otherwise, we would need to scale the offset x before doing the
+ // subtraction. This will make the optimization more complex, and it's not
+ // very useful because calling strlen for a pointer of other types is
+ // very uncommon.
+ if (GEPOperator *GEP = dyn_cast<GEPOperator>(Src)) {
+ if (!isGEPBasedOnPointerToString(GEP, CharSize))
+ return nullptr;
+
+ ConstantDataArraySlice Slice;
+ if (getConstantDataArrayInfo(GEP->getOperand(0), Slice, CharSize)) {
+ uint64_t NullTermIdx;
+ if (Slice.Array == nullptr) {
+ NullTermIdx = 0;
+ } else {
+ NullTermIdx = ~((uint64_t)0);
+ for (uint64_t I = 0, E = Slice.Length; I < E; ++I) {
+ if (Slice.Array->getElementAsInteger(I + Slice.Offset) == 0) {
+ NullTermIdx = I;
+ break;
+ }
+ }
+ // If the string does not have '\0', leave it to strlen to compute
+ // its length.
+ if (NullTermIdx == ~((uint64_t)0))
+ return nullptr;
+ }
+
+ Value *Offset = GEP->getOperand(2);
+ KnownBits Known = computeKnownBits(Offset, DL, 0, nullptr, CI, nullptr);
+ Known.Zero.flipAllBits();
+ uint64_t ArrSize =
+ cast<ArrayType>(GEP->getSourceElementType())->getNumElements();
+
+ // KnownZero's bits are flipped, so zeros in KnownZero now represent
+      // bits known to be zeros in Offset, and ones in KnownZero represent
+ // bits unknown in Offset. Therefore, Offset is known to be in range
+ // [0, NullTermIdx] when the flipped KnownZero is non-negative and
+ // unsigned-less-than NullTermIdx.
+ //
+ // If Offset is not provably in the range [0, NullTermIdx], we can still
+ // optimize if we can prove that the program has undefined behavior when
+ // Offset is outside that range. That is the case when GEP->getOperand(0)
+ // is a pointer to an object whose memory extent is NullTermIdx+1.
+ if ((Known.Zero.isNonNegative() && Known.Zero.ule(NullTermIdx)) ||
+ (GEP->isInBounds() && isa<GlobalVariable>(GEP->getOperand(0)) &&
+ NullTermIdx == ArrSize - 1)) {
+ Offset = B.CreateSExtOrTrunc(Offset, CI->getType());
+ return B.CreateSub(ConstantInt::get(CI->getType(), NullTermIdx),
+ Offset);
+ }
+ }
+ }
+
+ // strlen(x?"foo":"bars") --> x ? 3 : 4
+ if (SelectInst *SI = dyn_cast<SelectInst>(Src)) {
+ uint64_t LenTrue = GetStringLength(SI->getTrueValue(), CharSize);
+ uint64_t LenFalse = GetStringLength(SI->getFalseValue(), CharSize);
+ if (LenTrue && LenFalse) {
+ ORE.emit([&]() {
+ return OptimizationRemark("instcombine", "simplify-libcalls", CI)
+ << "folded strlen(select) to select of constants";
+ });
+ return B.CreateSelect(SI->getCondition(),
+ ConstantInt::get(CI->getType(), LenTrue - 1),
+ ConstantInt::get(CI->getType(), LenFalse - 1));
+ }
+ }
+
+ // strlen(x) != 0 --> *x != 0
+ // strlen(x) == 0 --> *x == 0
+ if (isOnlyUsedInZeroEqualityComparison(CI))
+ return B.CreateZExt(B.CreateLoad(B.getIntNTy(CharSize), Src, "strlenfirst"),
+ CI->getType());
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeStringLength(CI, B, 8))
+ return V;
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
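
A minimal source-level sketch of the strlen folds above: constant folding, the select-of-constants case, and the zero-equality rewrite. The strings and the runtime condition are chosen arbitrarily for illustration.

#include <cassert>
#include <cstring>

int main(int argc, char **) {
  // strlen("xyz") -> 3 (constant folding).
  assert(std::strlen("xyz") == 3);

  // strlen(x ? "foo" : "bars") -> x ? 3 : 4 (select of two constants).
  const char *S = argc > 1 ? "foo" : "bars";
  assert(std::strlen(S) == (argc > 1 ? 3u : 4u));

  // strlen(x) == 0 -> *x == 0: only the first byte needs to be inspected.
  assert((std::strlen(S) == 0) == (S[0] == '\0'));
  return 0;
}
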
+
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) {
+ Module &M = *CI->getModule();
+ unsigned WCharSize = TLI->getWCharSize(M) * 8;
+ // We cannot perform this optimization without wchar_size metadata.
+ if (WCharSize == 0)
+ return nullptr;
+
+ return optimizeStringLength(CI, B, WCharSize);
+}
+
+Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strpbrk(s, "") -> nullptr
+ // strpbrk("", s) -> nullptr
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t I = S1.find_first_of(S2);
+ if (I == StringRef::npos) // No match.
+ return Constant::getNullValue(CI->getType());
+
+ return B.CreateGEP(B.getInt8Ty(), CI->getArgOperand(0), B.getInt64(I),
+ "strpbrk");
+ }
+
+ // strpbrk(s, "a") -> strchr(s, 'a')
+ if (HasS2 && S2.size() == 1)
+ return emitStrChr(CI->getArgOperand(0), S2[0], B, TLI);
+
+ return nullptr;
+}
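
The strpbrk folds above, restated at the C level with made-up strings; the last line shows the single-character case that becomes a strchr call.

#include <cassert>
#include <cstring>

int main() {
  const char *S = "hello";
  assert(std::strpbrk(S, "") == nullptr);               // strpbrk(s, "") -> null
  assert(std::strpbrk("", S) == nullptr);               // strpbrk("", s) -> null
  assert(std::strpbrk(S, "lo") == S + 2);               // constant fold: first of {'l','o'}
  assert(std::strpbrk(S, "e") == std::strchr(S, 'e'));  // strpbrk(s, "a") -> strchr(s, 'a')
  return 0;
}
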
+
+Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) {
+ Value *EndPtr = CI->getArgOperand(1);
+ if (isa<ConstantPointerNull>(EndPtr)) {
+ // With a null EndPtr, this function won't capture the main argument.
+ // It would be readonly too, except that it still may write to errno.
+ CI->addParamAttr(0, Attribute::NoCapture);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strspn(s, "") -> 0
+ // strspn("", s) -> 0
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_not_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos)
+ Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (HasS2 && S2.empty())
+ return emitStrLen(CI->getArgOperand(0), B, DL, TLI);
+
+ return nullptr;
+}
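
A small check of the strspn/strcspn folds handled by the previous two functions; the inputs are arbitrary.

#include <cassert>
#include <cstring>

int main() {
  assert(std::strspn("abc", "") == 0);                    // strspn(s, "") -> 0
  assert(std::strspn("", "abc") == 0);                    // strspn("", s) -> 0
  assert(std::strspn("aabbcc", "ab") == 4);               // constant fold
  assert(std::strcspn("", "abc") == 0);                   // strcspn("", s) -> 0
  assert(std::strcspn("xyzab", "ab") == 3);               // constant fold
  assert(std::strcspn("xyz", "") == std::strlen("xyz"));  // strcspn(s, "") -> strlen(s)
  return 0;
}
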
+
+Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = emitStrLen(CI->getArgOperand(1), B, DL, TLI);
+ if (!StrLen)
+ return nullptr;
+ Value *StrNCmp = emitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, DL, TLI);
+ if (!StrNCmp)
+ return nullptr;
+ for (auto UI = CI->user_begin(), UE = CI->user_end(); UI != UE;) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp =
+ B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()), "cmp");
+ replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ size_t Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = castToCStr(CI->getArgOperand(0), B);
+ Result =
+ B.CreateConstInBoundsGEP1_64(B.getInt8Ty(), Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr = emitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : nullptr;
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
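
The strstr folds above at the source level with arbitrary strings; the strncmp rewrite for equality-only users is not shown because it is not observable from C.

#include <cassert>
#include <cstring>

int main() {
  const char *S = "abcd";
  assert(std::strstr(S, S) == S);                      // strstr(x, x) -> x
  assert(std::strstr(S, "") == S);                     // strstr(x, "") -> x
  assert(std::strstr(S, "bc") == S + 1);               // strstr("abcd", "bc") -> gep(S, 1)
  assert(std::strstr("foo", "bar") == nullptr);        // no match -> null
  assert(std::strstr(S, "c") == std::strchr(S, 'c'));  // strstr(x, "y") -> strchr(x, 'y')
  return 0;
}
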
+
+Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) {
+ if (isKnownNonZero(CI->getOperand(2), DL))
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) {
+ Value *SrcStr = CI->getArgOperand(0);
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, 0, Size, DL);
+ ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
+
+ // memchr(x, y, 0) -> null
+ if (LenC) {
+ if (LenC->isZero())
+ return Constant::getNullValue(CI->getType());
+ } else {
+ // From now on we need at least constant length and string.
+ return nullptr;
+ }
+
+ StringRef Str;
+ if (!getConstantStringInfo(SrcStr, Str, 0, /*TrimAtNul=*/false))
+ return nullptr;
+
+ // Truncate the string to LenC. If Str is smaller than LenC we will still only
+ // scan the string, as reading past the end of it is undefined and we can just
+ // return null if we don't find the char.
+ Str = Str.substr(0, LenC->getZExtValue());
+
+ // If the char is variable but the input str and length are not we can turn
+ // this memchr call into a simple bit field test. Of course this only works
+ // when the return value is only checked against null.
+ //
+ // It would be really nice to reuse switch lowering here but we can't change
+ // the CFG at this point.
+ //
+ // memchr("\r\n", C, 2) != nullptr -> (1 << C & ((1 << '\r') | (1 << '\n')))
+ // != 0
+ // after bounds check.
+ if (!CharC && !Str.empty() && isOnlyUsedInZeroEqualityComparison(CI)) {
+ unsigned char Max =
+ *std::max_element(reinterpret_cast<const unsigned char *>(Str.begin()),
+ reinterpret_cast<const unsigned char *>(Str.end()));
+
+ // Make sure the bit field we're about to create fits in a register on the
+ // target.
+ // FIXME: On a 64 bit architecture this prevents us from using the
+ // interesting range of alpha ascii chars. We could do better by emitting
+ // two bitfields or shifting the range by 64 if no lower chars are used.
+ if (!DL.fitsInLegalInteger(Max + 1))
+ return nullptr;
+
+ // For the bit field use a power-of-2 type with at least 8 bits to avoid
+ // creating unnecessary illegal types.
+ unsigned char Width = NextPowerOf2(std::max((unsigned char)7, Max));
+
+ // Now build the bit field.
+ APInt Bitfield(Width, 0);
+ for (char C : Str)
+ Bitfield.setBit((unsigned char)C);
+ Value *BitfieldC = B.getInt(Bitfield);
+
+ // Adjust width of "C" to the bitfield width, then mask off the high bits.
+ Value *C = B.CreateZExtOrTrunc(CI->getArgOperand(1), BitfieldC->getType());
+ C = B.CreateAnd(C, B.getIntN(Width, 0xFF));
+
+ // First check that the bit field access is within bounds.
+ Value *Bounds = B.CreateICmp(ICmpInst::ICMP_ULT, C, B.getIntN(Width, Width),
+ "memchr.bounds");
+
+ // Create code that checks if the given bit is set in the field.
+ Value *Shl = B.CreateShl(B.getIntN(Width, 1ULL), C);
+ Value *Bits = B.CreateIsNotNull(B.CreateAnd(Shl, BitfieldC), "memchr.bits");
+
+ // Finally merge both checks and cast to pointer type. The inttoptr
+ // implicitly zexts the i1 to intptr type.
+ return B.CreateIntToPtr(B.CreateAnd(Bounds, Bits, "memchr"), CI->getType());
+ }
+
+ // Check if all arguments are constants. If so, we can constant fold.
+ if (!CharC)
+ return nullptr;
+
+ // Compute the offset.
+ size_t I = Str.find(CharC->getSExtValue() & 0xFF);
+ if (I == StringRef::npos) // Didn't find the char. memchr returns null.
+ return Constant::getNullValue(CI->getType());
+
+ // memchr(s+n,c,l) -> gep(s+n+i,c)
+ return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "memchr");
+}
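
A sketch of the bit-field rewrite above for memchr("\r\n", c, 2) != nullptr with a variable c. The 64-bit mask and the helper name are illustrative only; the pass sizes the bit field from the string (a 16-bit field in this case) and emits the same bounds check before the shift.

#include <cassert>
#include <cstdint>
#include <cstring>

static bool memchrAsBitTest(unsigned C) {
  const uint64_t Mask = (1ull << '\r') | (1ull << '\n');  // bits 13 and 10
  return C < 64 && ((1ull << C) & Mask) != 0;             // bounds check, then bit test
}

int main() {
  const char *S = "\r\n";
  for (unsigned C = 0; C < 256; ++C)
    assert((std::memchr(S, (int)C, 2) != nullptr) == memchrAsBitTest(C));
  return 0;
}
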
+
+static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
+ uint64_t Len, IRBuilderBase &B,
+ const DataLayout &DL) {
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV =
+ B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV =
+ B.CreateZExt(B.CreateLoad(B.getInt8Ty(), castToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // memcmp(S1,S2,N/8)==0 -> (*(intN_t*)S1 != *(intN_t*)S2)==0
+ // TODO: The case where both inputs are constants does not need to be limited
+ // to legal integers or equality comparison. See block below this.
+ if (DL.isLegalInteger(Len * 8) && isOnlyUsedInZeroEqualityComparison(CI)) {
+ IntegerType *IntType = IntegerType::get(CI->getContext(), Len * 8);
+ unsigned PrefAlignment = DL.getPrefTypeAlignment(IntType);
+
+ // First, see if we can fold either argument to a constant.
+ Value *LHSV = nullptr;
+ if (auto *LHSC = dyn_cast<Constant>(LHS)) {
+ LHSC = ConstantExpr::getBitCast(LHSC, IntType->getPointerTo());
+ LHSV = ConstantFoldLoadFromConstPtr(LHSC, IntType, DL);
+ }
+ Value *RHSV = nullptr;
+ if (auto *RHSC = dyn_cast<Constant>(RHS)) {
+ RHSC = ConstantExpr::getBitCast(RHSC, IntType->getPointerTo());
+ RHSV = ConstantFoldLoadFromConstPtr(RHSC, IntType, DL);
+ }
+
+ // Don't generate unaligned loads. If either source is constant data,
+ // alignment doesn't matter for that source because there is no load.
+ if ((LHSV || getKnownAlignment(LHS, DL, CI) >= PrefAlignment) &&
+ (RHSV || getKnownAlignment(RHS, DL, CI) >= PrefAlignment)) {
+ if (!LHSV) {
+ Type *LHSPtrTy =
+ IntType->getPointerTo(LHS->getType()->getPointerAddressSpace());
+ LHSV = B.CreateLoad(IntType, B.CreateBitCast(LHS, LHSPtrTy), "lhsv");
+ }
+ if (!RHSV) {
+ Type *RHSPtrTy =
+ IntType->getPointerTo(RHS->getType()->getPointerAddressSpace());
+ RHSV = B.CreateLoad(IntType, B.CreateBitCast(RHS, RHSPtrTy), "rhsv");
+ }
+ return B.CreateZExt(B.CreateICmpNE(LHSV, RHSV), CI->getType(), "memcmp");
+ }
+ }
+
+ // Constant folding: memcmp(x, y, Len) -> constant (all arguments are const).
+ // TODO: This is limited to i8 arrays.
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return nullptr;
+ // Fold the memcmp and normalize the result. This way we get consistent
+ // results across multiple platforms.
+ uint64_t Ret = 0;
+ int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ if (Cmp < 0)
+ Ret = -1;
+ else if (Cmp > 0)
+ Ret = 1;
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return nullptr;
+}
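
The constant-size memcmp cases above, restated with arbitrary operands: length 0 folds to 0, length 1 reduces to a byte difference, an equality-only user becomes a single wide compare, and full constant folding normalizes the result to -1, 0, or 1.

#include <cassert>
#include <cstring>

int main() {
  const char A[] = "abcd", B[] = "abcf";
  assert(std::memcmp(A, B, 0) == 0);                      // memcmp(s1, s2, 0) -> 0
  assert((std::memcmp(A, B, 1) == 0) == (A[0] == B[0]));  // length 1 -> byte difference
  assert((std::memcmp(A, B, 4) == 0) == false);           // equality-only user: wide icmp
  int Cmp = std::memcmp(A, B, 4);
  int Normalized = (Cmp > 0) - (Cmp < 0);                 // what the constant fold materializes
  assert(Normalized == -1);                               // 'd' < 'f'
  return 0;
}
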
+
+// Most simplifications for memcmp also apply to bcmp.
+Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
+ IRBuilderBase &B) {
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ // Handle constant lengths.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(Size);
+ if (!LenC)
+ return nullptr;
+
+ // memcmp(d,s,0) -> 0
+ if (LenC->getZExtValue() == 0)
+ return Constant::getNullValue(CI->getType());
+
+ if (Value *Res =
+ optimizeMemCmpConstantSize(CI, LHS, RHS, LenC->getZExtValue(), B, DL))
+ return Res;
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
+ return V;
+
+ // memcmp(x, y, Len) == 0 -> bcmp(x, y, Len) == 0
+ // bcmp can be more efficient than memcmp because it only has to know that
+ // there is a difference, not how different one is to the other.
+ if (TLI->has(LibFunc_bcmp) && isOnlyUsedInZeroEqualityComparison(CI)) {
+ Value *LHS = CI->getArgOperand(0);
+ Value *RHS = CI->getArgOperand(1);
+ Value *Size = CI->getArgOperand(2);
+ return emitBCmp(LHS, RHS, Size, B, DL, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
+ return optimizeMemCmpBCmpCommon(CI, B);
+}
+
+Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ // memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
+ CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0);
- Value *Src = CI->getArgOperand(1);
- ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3));
- StringRef SrcStr;
- if (CI->use_empty() && Dst == Src)
- return Dst;
- // memccpy(d, s, c, 0) -> nullptr
- if (N) {
- if (N->isNullValue())
- return Constant::getNullValue(CI->getType());
- if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0,
- /*TrimAtNul=*/false) ||
- !StopChar)
- return nullptr;
- } else {
- return nullptr;
- }
-
- // Wrap arg 'c' of type int to char
- size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
- if (Pos == StringRef::npos) {
- if (N->getZExtValue() <= SrcStr.size()) {
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
- return Constant::getNullValue(CI->getType());
- }
- return nullptr;
- }
-
- Value *NewN =
- ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
- // memccpy -> llvm.memcpy
- B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
- return Pos + 1 <= N->getZExtValue()
- ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
- : Constant::getNullValue(CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
- Value *Dst = CI->getArgOperand(0);
- Value *N = CI->getArgOperand(2);
- // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N);
+ return CI->getArgOperand(0);
+}
+
+Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0);
+ Value *Src = CI->getArgOperand(1);
+ ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ ConstantInt *N = dyn_cast<ConstantInt>(CI->getArgOperand(3));
+ StringRef SrcStr;
+ if (CI->use_empty() && Dst == Src)
+ return Dst;
+ // memccpy(d, s, c, 0) -> nullptr
+ if (N) {
+ if (N->isNullValue())
+ return Constant::getNullValue(CI->getType());
+ if (!getConstantStringInfo(Src, SrcStr, /*Offset=*/0,
+ /*TrimAtNul=*/false) ||
+ !StopChar)
+ return nullptr;
+ } else {
+ return nullptr;
+ }
+
+ // Wrap arg 'c' of type int to char
+ size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
+ if (Pos == StringRef::npos) {
+ if (N->getZExtValue() <= SrcStr.size()) {
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
+ return Constant::getNullValue(CI->getType());
+ }
+ return nullptr;
+ }
+
+ Value *NewN =
+ ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
+ // memccpy -> llvm.memcpy
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
+ return Pos + 1 <= N->getZExtValue()
+ ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
+ : Constant::getNullValue(CI->getType());
+}
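
What the memccpy fold relies on, assuming a POSIX/C23 environment where memccpy is declared: the call copies at most n bytes, stops after the first occurrence of the stop character, and returns either a pointer one past that byte or null. The buffers and stop character below are arbitrary.

#include <cassert>
#include <cstring>  // memccpy is POSIX/C23; assumed to be declared here

int main() {
  const char Src[] = "hello world";
  char Dst[16] = {};
  void *End = memccpy(Dst, Src, ' ', sizeof Dst);
  assert(End == Dst + 6);                      // "hello " copied; result = Dst + Pos + 1
  assert(std::memcmp(Dst, "hello ", 6) == 0);

  char Dst2[4];
  assert(memccpy(Dst2, "abcd", 'z', 4) == nullptr);  // stop char not found -> memcpy + null
  return 0;
}
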
+
+Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
+ Value *Dst = CI->getArgOperand(0);
+ Value *N = CI->getArgOperand(2);
+ // mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n
+ CallInst *NewCI =
+ B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N);
// Propagate attributes, but memcpy has no return value, so make sure that
// any return attributes are compliant.
// TODO: Attach return value attributes to the 1st operand to preserve them?
- NewCI->setAttributes(CI->getAttributes());
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
-}
-
-Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
- CI->getArgOperand(1), Align(1), Size);
- NewCI->setAttributes(CI->getAttributes());
+ return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
+}
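
A sketch of what the mempcpy fold produces, written with plain memcpy since mempcpy itself is a glibc/POSIX extension; the helper name and buffer contents are made up.

#include <cassert>
#include <cstring>

static char *mempcpyLike(char *Dst, const char *Src, std::size_t N) {
  std::memcpy(Dst, Src, N);  // llvm.memcpy(align 1 Dst, align 1 Src, N)
  return Dst + N;            // mempcpy's result: one past the last byte written
}

int main() {
  char Buf[8] = {};
  char *End = mempcpyLike(Buf, "abc", 3);
  assert(End == Buf + 3 && std::memcmp(Buf, "abc", 3) == 0);
  return 0;
}
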
+
+Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ // memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
+ CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
- // This has to be a memset of zeros (bzero).
- auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
- if (!FillValue || FillValue->getZExtValue() != 0)
- return nullptr;
-
- // TODO: We should handle the case where the malloc has more than one use.
- // This is necessary to optimize common patterns such as when the result of
- // the malloc is checked against null or when a memset intrinsic is used in
- // place of a memset library call.
- auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
- if (!Malloc || !Malloc->hasOneUse())
- return nullptr;
-
- // Is the inner call really malloc()?
- Function *InnerCallee = Malloc->getCalledFunction();
- if (!InnerCallee)
- return nullptr;
-
- LibFunc Func;
- if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
- Func != LibFunc_malloc)
- return nullptr;
-
- // The memset must cover the same number of bytes that are malloc'd.
- if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
- return nullptr;
-
- // Replace the malloc with a calloc. We need the data layout to know what the
- // actual size of a 'size_t' parameter is.
- B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
- const DataLayout &DL = Malloc->getModule()->getDataLayout();
- IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
- if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
- Malloc->getArgOperand(0),
- Malloc->getAttributes(), B, *TLI)) {
- substituteInParent(Malloc, Calloc);
- return Calloc;
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
- Value *Size = CI->getArgOperand(2);
- annotateNonNullAndDereferenceable(CI, 0, Size, DL);
- if (isa<IntrinsicInst>(CI))
- return nullptr;
-
- if (auto *Calloc = foldMallocMemset(CI, B))
- return Calloc;
-
- // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+}
+
+/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
+Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
+ // This has to be a memset of zeros (bzero).
+ auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
+ if (!FillValue || FillValue->getZExtValue() != 0)
+ return nullptr;
+
+ // TODO: We should handle the case where the malloc has more than one use.
+ // This is necessary to optimize common patterns such as when the result of
+ // the malloc is checked against null or when a memset intrinsic is used in
+ // place of a memset library call.
+ auto *Malloc = dyn_cast<CallInst>(Memset->getArgOperand(0));
+ if (!Malloc || !Malloc->hasOneUse())
+ return nullptr;
+
+ // Is the inner call really malloc()?
+ Function *InnerCallee = Malloc->getCalledFunction();
+ if (!InnerCallee)
+ return nullptr;
+
+ LibFunc Func;
+ if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
+ Func != LibFunc_malloc)
+ return nullptr;
+
+ // The memset must cover the same number of bytes that are malloc'd.
+ if (Memset->getArgOperand(2) != Malloc->getArgOperand(0))
+ return nullptr;
+
+ // Replace the malloc with a calloc. We need the data layout to know what the
+ // actual size of a 'size_t' parameter is.
+ B.SetInsertPoint(Malloc->getParent(), ++Malloc->getIterator());
+ const DataLayout &DL = Malloc->getModule()->getDataLayout();
+ IntegerType *SizeType = DL.getIntPtrType(B.GetInsertBlock()->getContext());
+ if (Value *Calloc = emitCalloc(ConstantInt::get(SizeType, 1),
+ Malloc->getArgOperand(0),
+ Malloc->getAttributes(), B, *TLI)) {
+ substituteInParent(Malloc, Calloc);
+ return Calloc;
+ }
+
+ return nullptr;
+}
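
The source-level shape of the fold above, with an arbitrary size. The null check is kept only to avoid passing a null pointer to memset; as the TODO in the code notes, the extra use it introduces means this checked pattern itself is not yet folded.

#include <cstdlib>
#include <cstring>

int main() {
  const std::size_t N = 32;
  void *P = std::malloc(N);
  if (P)
    std::memset(P, 0, N);       // memset(malloc(n), 0, n) ...
  void *Q = std::calloc(1, N);  // ... behaves like calloc(1, n)
  std::free(P);
  std::free(Q);
  return 0;
}
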
+
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
+ Value *Size = CI->getArgOperand(2);
+ annotateNonNullAndDereferenceable(CI, 0, Size, DL);
+ if (isa<IntrinsicInst>(CI))
+ return nullptr;
+
+ if (auto *Calloc = foldMallocMemset(CI, B))
+ return Calloc;
+
+ // memset(p, v, n) -> llvm.memset(align 1 p, v, n)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
-}
-
-Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
- if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
- return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
-
- return nullptr;
-}
-
-//===----------------------------------------------------------------------===//
-// Math Library Optimizations
-//===----------------------------------------------------------------------===//
-
-// Replace a libcall \p CI with a call to intrinsic \p IID
-static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
- Intrinsic::ID IID) {
- // Propagate fast-math flags from the existing call to the new call.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- Module *M = CI->getModule();
- Value *V = CI->getArgOperand(0);
- Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
- CallInst *NewCall = B.CreateCall(F, V);
- NewCall->takeName(CI);
- return NewCall;
-}
-
-/// Return a variant of Val with float type.
-/// Currently this works in two cases: If Val is an FPExtension of a float
-/// value to something bigger, simply return the operand.
-/// If Val is a ConstantFP but can be converted to a float ConstantFP without
-/// loss of precision do so.
-static Value *valueHasFloatPrecision(Value *Val) {
- if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
- Value *Op = Cast->getOperand(0);
- if (Op->getType()->isFloatTy())
- return Op;
- }
- if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
- APFloat F = Const->getValueAPF();
- bool losesInfo;
- (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
- &losesInfo);
- if (!losesInfo)
- return ConstantFP::get(Const->getContext(), F);
- }
- return nullptr;
-}
-
-/// Shrink double -> float functions.
-static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isBinary, bool isPrecise = false) {
- Function *CalleeFn = CI->getCalledFunction();
- if (!CI->getType()->isDoubleTy() || !CalleeFn)
- return nullptr;
-
- // If not all the uses of the function are converted to float, then bail out.
- // This matters if the precision of the result is more important than the
- // precision of the arguments.
- if (isPrecise)
- for (User *U : CI->users()) {
- FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
- if (!Cast || !Cast->getType()->isFloatTy())
- return nullptr;
- }
-
- // If this is something like 'g((double) float)', convert to 'gf(float)'.
- Value *V[2];
- V[0] = valueHasFloatPrecision(CI->getArgOperand(0));
- V[1] = isBinary ? valueHasFloatPrecision(CI->getArgOperand(1)) : nullptr;
- if (!V[0] || (isBinary && !V[1]))
- return nullptr;
-
- // If call isn't an intrinsic, check that it isn't within a function with the
- // same name as the float version of this call, otherwise the result is an
- // infinite loop. For example, from MinGW-w64:
- //
- // float expf(float val) { return (float) exp((double) val); }
- StringRef CalleeName = CalleeFn->getName();
- bool IsIntrinsic = CalleeFn->isIntrinsic();
- if (!IsIntrinsic) {
- StringRef CallerName = CI->getFunction()->getName();
- if (!CallerName.empty() && CallerName.back() == 'f' &&
- CallerName.size() == (CalleeName.size() + 1) &&
- CallerName.startswith(CalleeName))
- return nullptr;
- }
-
- // Propagate the math semantics from the current function to the new function.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- // g((double) float) -> (double) gf(float)
- Value *R;
- if (IsIntrinsic) {
- Module *M = CI->getModule();
- Intrinsic::ID IID = CalleeFn->getIntrinsicID();
- Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
- R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]);
- } else {
- AttributeList CalleeAttrs = CalleeFn->getAttributes();
- R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs)
- : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs);
- }
- return B.CreateFPExt(R, B.getDoubleTy());
-}
-
-/// Shrink double -> float for unary functions.
-static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, false, isPrecise);
-}
-
-/// Shrink double -> float for binary functions.
-static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
- bool isPrecise = false) {
- return optimizeDoubleFP(CI, B, true, isPrecise);
-}
-
-// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
-Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
- if (!CI->isFast())
- return nullptr;
-
- // Propagate fast-math flags from the existing call to new instructions.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(CI->getFastMathFlags());
-
- Value *Real, *Imag;
- if (CI->getNumArgOperands() == 1) {
- Value *Op = CI->getArgOperand(0);
- assert(Op->getType()->isArrayTy() && "Unexpected signature for cabs!");
- Real = B.CreateExtractValue(Op, 0, "real");
- Imag = B.CreateExtractValue(Op, 1, "imag");
- } else {
- assert(CI->getNumArgOperands() == 2 && "Unexpected signature for cabs!");
- Real = CI->getArgOperand(0);
- Imag = CI->getArgOperand(1);
- }
-
- Value *RealReal = B.CreateFMul(Real, Real);
- Value *ImagImag = B.CreateFMul(Imag, Imag);
-
- Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt,
- CI->getType());
- return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs");
-}
-
-static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
- IRBuilderBase &B) {
- if (!isa<FPMathOperator>(Call))
- return nullptr;
-
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(Call->getFastMathFlags());
-
- // TODO: Can this be shared to also handle LLVM intrinsics?
- Value *X;
- switch (Func) {
- case LibFunc_sin:
- case LibFunc_sinf:
- case LibFunc_sinl:
- case LibFunc_tan:
- case LibFunc_tanf:
- case LibFunc_tanl:
- // sin(-X) --> -sin(X)
- // tan(-X) --> -tan(X)
- if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X)))))
- return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X));
- break;
- case LibFunc_cos:
- case LibFunc_cosf:
- case LibFunc_cosl:
- // cos(-X) --> cos(X)
- if (match(Call->getArgOperand(0), m_FNeg(m_Value(X))))
- return B.CreateCall(Call->getCalledFunction(), X, "cos");
- break;
- default:
- break;
- }
- return nullptr;
-}
-
-static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) {
- // Multiplications calculated using Addition Chains.
- // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
-
- assert(Exp != 0 && "Incorrect exponent 0 not handled");
-
- if (InnerChain[Exp])
- return InnerChain[Exp];
-
- static const unsigned AddChain[33][2] = {
- {0, 0}, // Unused.
- {0, 0}, // Unused (base case = pow1).
- {1, 1}, // Unused (pre-computed).
- {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
- {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
- {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
- {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
- {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
- };
-
- InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
- getPow(InnerChain, AddChain[Exp][1], B));
- return InnerChain[Exp];
-}
-
-// Return a properly extended 32-bit integer if the operation is an itofp.
-static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) {
- if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) {
- Value *Op = cast<Instruction>(I2F)->getOperand(0);
- // Make sure that the exponent fits inside an int32_t,
- // thus avoiding any range issues that FP has not.
- unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits();
- if (BitWidth < 32 ||
- (BitWidth == 32 && isa<SIToFPInst>(I2F)))
- return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty())
- : B.CreateZExt(Op, B.getInt32Ty());
- }
-
- return nullptr;
-}
-
-/// Use exp{,2}(x * y) for pow(exp{,2}(x), y);
-/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
-/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
-Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
- Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs; // Attributes are only meaningful on the original call
- Module *Mod = Pow->getModule();
- Type *Ty = Pow->getType();
- bool Ignored;
-
- // Evaluate special cases related to a nested function as the base.
-
- // pow(exp(x), y) -> exp(x * y)
- // pow(exp2(x), y) -> exp2(x * y)
- // If exp{,2}() is used only once, it is better to fold two transcendental
- // math functions into one. If used again, exp{,2}() would still have to be
- // called with the original argument, then keep both original transcendental
- // functions. However, this transformation is only safe with fully relaxed
- // math semantics, since, besides rounding differences, it changes overflow
- // and underflow behavior quite dramatically. For example:
- // pow(exp(1000), 0.001) = pow(inf, 0.001) = inf
- // Whereas:
- // exp(1000 * 0.001) = exp(1)
- // TODO: Loosen the requirement for fully relaxed math semantics.
- // TODO: Handle exp10() when more targets have it available.
- CallInst *BaseFn = dyn_cast<CallInst>(Base);
- if (BaseFn && BaseFn->hasOneUse() && BaseFn->isFast() && Pow->isFast()) {
- LibFunc LibFn;
-
- Function *CalleeFn = BaseFn->getCalledFunction();
- if (CalleeFn &&
- TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) {
- StringRef ExpName;
- Intrinsic::ID ID;
- Value *ExpFn;
- LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble;
-
- switch (LibFn) {
- default:
- return nullptr;
- case LibFunc_expf: case LibFunc_exp: case LibFunc_expl:
- ExpName = TLI->getName(LibFunc_exp);
- ID = Intrinsic::exp;
- LibFnFloat = LibFunc_expf;
- LibFnDouble = LibFunc_exp;
- LibFnLongDouble = LibFunc_expl;
- break;
- case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
- ExpName = TLI->getName(LibFunc_exp2);
- ID = Intrinsic::exp2;
- LibFnFloat = LibFunc_exp2f;
- LibFnDouble = LibFunc_exp2;
- LibFnLongDouble = LibFunc_exp2l;
- break;
- }
-
- // Create new exp{,2}() with the product as its argument.
- Value *FMul = B.CreateFMul(BaseFn->getArgOperand(0), Expo, "mul");
- ExpFn = BaseFn->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
- FMul, ExpName)
- : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
- LibFnLongDouble, B,
- BaseFn->getAttributes());
-
- // Since the new exp{,2}() is different from the original one, dead code
- // elimination cannot be trusted to remove it, since it may have side
- // effects (e.g., errno). When the only consumer for the original
- // exp{,2}() is pow(), then it has to be explicitly erased.
- substituteInParent(BaseFn, ExpFn);
- return ExpFn;
- }
- }
-
- // Evaluate special cases related to a constant base.
-
- const APFloat *BaseF;
- if (!match(Pow->getArgOperand(0), m_APFloat(BaseF)))
- return nullptr;
-
- // pow(2.0, itofp(x)) -> ldexp(1.0, x)
- if (match(Base, m_SpecificFP(2.0)) &&
- (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
- if (Value *ExpoI = getIntToFPVal(Expo, B))
- return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI,
- LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, Attrs);
- }
-
- // pow(2.0 ** n, x) -> exp2(n * x)
- if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
- APFloat BaseR = APFloat(1.0);
- BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored);
- BaseR = BaseR / *BaseF;
- bool IsInteger = BaseF->isInteger(), IsReciprocal = BaseR.isInteger();
- const APFloat *NF = IsReciprocal ? &BaseR : BaseF;
- APSInt NI(64, false);
- if ((IsInteger || IsReciprocal) &&
- NF->convertToInteger(NI, APFloat::rmTowardZero, &Ignored) ==
- APFloat::opOK &&
- NI > 1 && NI.isPowerOf2()) {
- double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0);
- Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul");
- if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
- else
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
- }
- }
-
- // pow(10.0, x) -> exp10(x)
- // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
- if (match(Base, m_SpecificFP(10.0)) &&
- hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
- return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
- LibFunc_exp10l, B, Attrs);
-
- // pow(x, y) -> exp2(log2(x) * y)
- if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
- !BaseF->isNegative()) {
- // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN.
- // Luckily optimizePow has already handled the x == 1 case.
- assert(!match(Base, m_FPOne()) &&
- "pow(1.0, y) should have been simplified earlier!");
-
- Value *Log = nullptr;
- if (Ty->isFloatTy())
- Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat()));
- else if (Ty->isDoubleTy())
- Log = ConstantFP::get(Ty, std::log2(BaseF->convertToDouble()));
-
- if (Log) {
- Value *FMul = B.CreateFMul(Log, Expo, "mul");
- if (Pow->doesNotAccessMemory())
- return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
- FMul, "exp2");
- else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
- return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
- LibFunc_exp2l, B, Attrs);
- }
- }
-
- return nullptr;
-}
-
-static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
- Module *M, IRBuilderBase &B,
- const TargetLibraryInfo *TLI) {
- // If errno is never set, then use the intrinsic for sqrt().
- if (NoErrno) {
- Function *SqrtFn =
- Intrinsic::getDeclaration(M, Intrinsic::sqrt, V->getType());
- return B.CreateCall(SqrtFn, V, "sqrt");
- }
-
- // Otherwise, use the libcall for sqrt().
- if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl))
- // TODO: We also should check that the target can in fact lower the sqrt()
- // libcall. We currently have no way to ask this question, so we ask if
- // the target has a sqrt() libcall, which is not exactly the same.
- return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
- LibFunc_sqrtl, B, Attrs);
-
- return nullptr;
-}
-
-/// Use square root in place of pow(x, +/-0.5).
-Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
- Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs; // Attributes are only meaningful on the original call
- Module *Mod = Pow->getModule();
- Type *Ty = Pow->getType();
-
- const APFloat *ExpoF;
- if (!match(Expo, m_APFloat(ExpoF)) ||
- (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)))
- return nullptr;
-
- // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step,
- // so that requires fast-math-flags (afn or reassoc).
- if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc()))
- return nullptr;
-
+ return CI->getArgOperand(0);
+}
+
+Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
+ if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
+ return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
+
+ return nullptr;
+}
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+// Replace a libcall \p CI with a call to intrinsic \p IID
+static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
+ Intrinsic::ID IID) {
+ // Propagate fast-math flags from the existing call to the new call.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Module *M = CI->getModule();
+ Value *V = CI->getArgOperand(0);
+ Function *F = Intrinsic::getDeclaration(M, IID, CI->getType());
+ CallInst *NewCall = B.CreateCall(F, V);
+ NewCall->takeName(CI);
+ return NewCall;
+}
+
+/// Return a variant of Val with float type.
+/// Currently this works in two cases: If Val is an FPExtension of a float
+/// value to something bigger, simply return the operand.
+/// If Val is a ConstantFP but can be converted to a float ConstantFP without
+/// loss of precision do so.
+static Value *valueHasFloatPrecision(Value *Val) {
+ if (FPExtInst *Cast = dyn_cast<FPExtInst>(Val)) {
+ Value *Op = Cast->getOperand(0);
+ if (Op->getType()->isFloatTy())
+ return Op;
+ }
+ if (ConstantFP *Const = dyn_cast<ConstantFP>(Val)) {
+ APFloat F = Const->getValueAPF();
+ bool losesInfo;
+ (void)F.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ if (!losesInfo)
+ return ConstantFP::get(Const->getContext(), F);
+ }
+ return nullptr;
+}
+
+/// Shrink double -> float functions.
+static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isBinary, bool isPrecise = false) {
+ Function *CalleeFn = CI->getCalledFunction();
+ if (!CI->getType()->isDoubleTy() || !CalleeFn)
+ return nullptr;
+
+ // If not all the uses of the function are converted to float, then bail out.
+ // This matters if the precision of the result is more important than the
+ // precision of the arguments.
+ if (isPrecise)
+ for (User *U : CI->users()) {
+ FPTruncInst *Cast = dyn_cast<FPTruncInst>(U);
+ if (!Cast || !Cast->getType()->isFloatTy())
+ return nullptr;
+ }
+
+ // If this is something like 'g((double) float)', convert to 'gf(float)'.
+ Value *V[2];
+ V[0] = valueHasFloatPrecision(CI->getArgOperand(0));
+ V[1] = isBinary ? valueHasFloatPrecision(CI->getArgOperand(1)) : nullptr;
+ if (!V[0] || (isBinary && !V[1]))
+ return nullptr;
+
+ // If call isn't an intrinsic, check that it isn't within a function with the
+ // same name as the float version of this call, otherwise the result is an
+ // infinite loop. For example, from MinGW-w64:
+ //
+ // float expf(float val) { return (float) exp((double) val); }
+ StringRef CalleeName = CalleeFn->getName();
+ bool IsIntrinsic = CalleeFn->isIntrinsic();
+ if (!IsIntrinsic) {
+ StringRef CallerName = CI->getFunction()->getName();
+ if (!CallerName.empty() && CallerName.back() == 'f' &&
+ CallerName.size() == (CalleeName.size() + 1) &&
+ CallerName.startswith(CalleeName))
+ return nullptr;
+ }
+
+ // Propagate the math semantics from the current function to the new function.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ // g((double) float) -> (double) gf(float)
+ Value *R;
+ if (IsIntrinsic) {
+ Module *M = CI->getModule();
+ Intrinsic::ID IID = CalleeFn->getIntrinsicID();
+ Function *Fn = Intrinsic::getDeclaration(M, IID, B.getFloatTy());
+ R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]);
+ } else {
+ AttributeList CalleeAttrs = CalleeFn->getAttributes();
+ R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], CalleeName, B, CalleeAttrs)
+ : emitUnaryFloatFnCall(V[0], CalleeName, B, CalleeAttrs);
+ }
+ return B.CreateFPExt(R, B.getDoubleTy());
+}
+
+/// Shrink double -> float for unary functions.
+static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isPrecise = false) {
+ return optimizeDoubleFP(CI, B, false, isPrecise);
+}
+
+/// Shrink double -> float for binary functions.
+static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
+ bool isPrecise = false) {
+ return optimizeDoubleFP(CI, B, true, isPrecise);
+}
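
A sketch of the double-to-float shrinking above: a double call whose argument was widened from float and whose result is truncated back to float becomes a call to the float variant. The function names are illustrative, and the two forms may differ in the last ulp, which is why the transform is gated on the precision checks above.

#include <cmath>

float viaDouble(float X) { return (float)std::exp((double)X); }  // g((double)float), then fptrunc
float shrunk(float X) { return std::exp(X); }                    // float overload, i.e. expf

int main() {
  volatile float Diff = viaDouble(1.5f) - shrunk(1.5f);  // typically 0, at most ~1 ulp
  (void)Diff;
  return 0;
}
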
+
+// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
+Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
+ if (!CI->isFast())
+ return nullptr;
+
+ // Propagate fast-math flags from the existing call to new instructions.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(CI->getFastMathFlags());
+
+ Value *Real, *Imag;
+ if (CI->getNumArgOperands() == 1) {
+ Value *Op = CI->getArgOperand(0);
+ assert(Op->getType()->isArrayTy() && "Unexpected signature for cabs!");
+ Real = B.CreateExtractValue(Op, 0, "real");
+ Imag = B.CreateExtractValue(Op, 1, "imag");
+ } else {
+ assert(CI->getNumArgOperands() == 2 && "Unexpected signature for cabs!");
+ Real = CI->getArgOperand(0);
+ Imag = CI->getArgOperand(1);
+ }
+
+ Value *RealReal = B.CreateFMul(Real, Real);
+ Value *ImagImag = B.CreateFMul(Imag, Imag);
+
+ Function *FSqrt = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::sqrt,
+ CI->getType());
+ return B.CreateCall(FSqrt, B.CreateFAdd(RealReal, ImagImag), "cabs");
+}
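
The cabs fold at the source level with an arbitrary operand. The plain sqrt form is only legal under fast-math because it can overflow or underflow where a hypot-style cabs would not.

#include <cassert>
#include <cmath>
#include <complex>

int main() {
  std::complex<double> Z(3.0, 4.0);
  double Folded = std::sqrt(Z.real() * Z.real() + Z.imag() * Z.imag());
  assert(std::fabs(Folded - std::abs(Z)) < 1e-12);  // both are 5.0 here
  return 0;
}
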
+
+static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
+ IRBuilderBase &B) {
+ if (!isa<FPMathOperator>(Call))
+ return nullptr;
+
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(Call->getFastMathFlags());
+
+ // TODO: Can this be shared to also handle LLVM intrinsics?
+ Value *X;
+ switch (Func) {
+ case LibFunc_sin:
+ case LibFunc_sinf:
+ case LibFunc_sinl:
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ // sin(-X) --> -sin(X)
+ // tan(-X) --> -tan(X)
+ if (match(Call->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X)))))
+ return B.CreateFNeg(B.CreateCall(Call->getCalledFunction(), X));
+ break;
+ case LibFunc_cos:
+ case LibFunc_cosf:
+ case LibFunc_cosl:
+ // cos(-X) --> cos(X)
+ if (match(Call->getArgOperand(0), m_FNeg(m_Value(X))))
+ return B.CreateCall(Call->getCalledFunction(), X, "cos");
+ break;
+ default:
+ break;
+ }
+ return nullptr;
+}
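
The reflections above, checked numerically for an arbitrary argument; a small tolerance is used since libm results are not required to be bit-identical under negation.

#include <cassert>
#include <cmath>

int main() {
  const double X = 0.75;
  assert(std::fabs(std::sin(-X) + std::sin(X)) < 1e-15);  // sin(-X) --> -sin(X)
  assert(std::fabs(std::tan(-X) + std::tan(X)) < 1e-15);  // tan(-X) --> -tan(X)
  assert(std::fabs(std::cos(-X) - std::cos(X)) < 1e-15);  // cos(-X) -->  cos(X)
  return 0;
}
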
+
+static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) {
+ // Multiplications calculated using Addition Chains.
+ // Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
+
+ assert(Exp != 0 && "Incorrect exponent 0 not handled");
+
+ if (InnerChain[Exp])
+ return InnerChain[Exp];
+
+ static const unsigned AddChain[33][2] = {
+ {0, 0}, // Unused.
+ {0, 0}, // Unused (base case = pow1).
+ {1, 1}, // Unused (pre-computed).
+ {1, 2}, {2, 2}, {2, 3}, {3, 3}, {2, 5}, {4, 4},
+ {1, 8}, {5, 5}, {1, 10}, {6, 6}, {4, 9}, {7, 7},
+ {3, 12}, {8, 8}, {8, 9}, {2, 16}, {1, 18}, {10, 10},
+ {6, 15}, {11, 11}, {3, 20}, {12, 12}, {8, 17}, {13, 13},
+ {3, 24}, {14, 14}, {4, 25}, {15, 15}, {3, 28}, {16, 16},
+ };
+
+ InnerChain[Exp] = B.CreateFMul(getPow(InnerChain, AddChain[Exp][0], B),
+ getPow(InnerChain, AddChain[Exp][1], B));
+ return InnerChain[Exp];
+}
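
A worked instance of the addition-chain table above for an arbitrarily chosen exponent of 13: AddChain[13] = {4, 9}, AddChain[9] = {1, 8}, AddChain[8] = {4, 4}, AddChain[4] = {2, 2}, so x^13 takes five multiplies instead of the twelve a naive repeated multiply would need.

#include <cassert>

static double pow13(double X) {
  double X2 = X * X;    // x^2
  double X4 = X2 * X2;  // x^4  = AddChain[4]  = {2, 2}
  double X8 = X4 * X4;  // x^8  = AddChain[8]  = {4, 4}
  double X9 = X * X8;   // x^9  = AddChain[9]  = {1, 8}
  return X4 * X9;       // x^13 = AddChain[13] = {4, 9}
}

int main() {
  assert(pow13(2.0) == 8192.0);     // 2^13
  assert(pow13(3.0) == 1594323.0);  // 3^13, exact in double
  return 0;
}
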
+
+// Return a properly extended 32-bit integer if the operation is an itofp.
+static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) {
+ if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) {
+ Value *Op = cast<Instruction>(I2F)->getOperand(0);
+    // Make sure that the exponent fits inside an int32_t,
+    // thus avoiding any range issues that FP does not have.
+ unsigned BitWidth = Op->getType()->getPrimitiveSizeInBits();
+ if (BitWidth < 32 ||
+ (BitWidth == 32 && isa<SIToFPInst>(I2F)))
+ return isa<SIToFPInst>(I2F) ? B.CreateSExt(Op, B.getInt32Ty())
+ : B.CreateZExt(Op, B.getInt32Ty());
+ }
+
+ return nullptr;
+}
+
+/// Use exp{,2}(x * y) for pow(exp{,2}(x), y);
+/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
+/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
+Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
+ Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ Module *Mod = Pow->getModule();
+ Type *Ty = Pow->getType();
+ bool Ignored;
+
+ // Evaluate special cases related to a nested function as the base.
+
+ // pow(exp(x), y) -> exp(x * y)
+ // pow(exp2(x), y) -> exp2(x * y)
+ // If exp{,2}() is used only once, it is better to fold two transcendental
+ // math functions into one. If it is used again, exp{,2}() still has to be
+ // called with the original argument, so both calls are kept. However,
+ // this transformation is only safe with fully relaxed
+ // math semantics, since, besides rounding differences, it changes overflow
+ // and underflow behavior quite dramatically. For example:
+ // pow(exp(1000), 0.001) = pow(inf, 0.001) = inf
+ // Whereas:
+ // exp(1000 * 0.001) = exp(1)
+ // TODO: Loosen the requirement for fully relaxed math semantics.
+ // TODO: Handle exp10() when more targets have it available.
+ CallInst *BaseFn = dyn_cast<CallInst>(Base);
+ if (BaseFn && BaseFn->hasOneUse() && BaseFn->isFast() && Pow->isFast()) {
+ LibFunc LibFn;
+
+ Function *CalleeFn = BaseFn->getCalledFunction();
+ if (CalleeFn &&
+ TLI->getLibFunc(CalleeFn->getName(), LibFn) && TLI->has(LibFn)) {
+ StringRef ExpName;
+ Intrinsic::ID ID;
+ Value *ExpFn;
+ LibFunc LibFnFloat, LibFnDouble, LibFnLongDouble;
+
+ switch (LibFn) {
+ default:
+ return nullptr;
+ case LibFunc_expf: case LibFunc_exp: case LibFunc_expl:
+ ExpName = TLI->getName(LibFunc_exp);
+ ID = Intrinsic::exp;
+ LibFnFloat = LibFunc_expf;
+ LibFnDouble = LibFunc_exp;
+ LibFnLongDouble = LibFunc_expl;
+ break;
+ case LibFunc_exp2f: case LibFunc_exp2: case LibFunc_exp2l:
+ ExpName = TLI->getName(LibFunc_exp2);
+ ID = Intrinsic::exp2;
+ LibFnFloat = LibFunc_exp2f;
+ LibFnDouble = LibFunc_exp2;
+ LibFnLongDouble = LibFunc_exp2l;
+ break;
+ }
+
+ // Create new exp{,2}() with the product as its argument.
+ Value *FMul = B.CreateFMul(BaseFn->getArgOperand(0), Expo, "mul");
+ ExpFn = BaseFn->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, ID, Ty),
+ FMul, ExpName)
+ : emitUnaryFloatFnCall(FMul, TLI, LibFnDouble, LibFnFloat,
+ LibFnLongDouble, B,
+ BaseFn->getAttributes());
+
+ // Since the new exp{,2}() is different from the original one, dead code
+ // elimination cannot be trusted to remove it, since it may have side
+ // effects (e.g., errno). When the only consumer for the original
+ // exp{,2}() is pow(), then it has to be explicitly erased.
+ substituteInParent(BaseFn, ExpFn);
+ return ExpFn;
+ }
+ }
+
+ // Evaluate special cases related to a constant base.
+
+ const APFloat *BaseF;
+ if (!match(Pow->getArgOperand(0), m_APFloat(BaseF)))
+ return nullptr;
+
+ // pow(2.0, itofp(x)) -> ldexp(1.0, x)
+ if (match(Base, m_SpecificFP(2.0)) &&
+ (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo)) &&
+ hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ if (Value *ExpoI = getIntToFPVal(Expo, B))
+ return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), ExpoI, TLI,
+ LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
+ B, Attrs);
+ }
+
+ // pow(2.0 ** n, x) -> exp2(n * x)
+ if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l)) {
+ APFloat BaseR = APFloat(1.0);
+ BaseR.convert(BaseF->getSemantics(), APFloat::rmTowardZero, &Ignored);
+ BaseR = BaseR / *BaseF;
+ bool IsInteger = BaseF->isInteger(), IsReciprocal = BaseR.isInteger();
+ const APFloat *NF = IsReciprocal ? &BaseR : BaseF;
+ APSInt NI(64, false);
+ if ((IsInteger || IsReciprocal) &&
+ NF->convertToInteger(NI, APFloat::rmTowardZero, &Ignored) ==
+ APFloat::opOK &&
+ NI > 1 && NI.isPowerOf2()) {
+ double N = NI.logBase2() * (IsReciprocal ? -1.0 : 1.0);
+ Value *FMul = B.CreateFMul(Expo, ConstantFP::get(Ty, N), "mul");
+ if (Pow->doesNotAccessMemory())
+ return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2");
+ else
+ return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs);
+ }
+ }
+
+ // pow(10.0, x) -> exp10(x)
+ // TODO: There is no exp10() intrinsic yet, but some day there shall be one.
+ if (match(Base, m_SpecificFP(10.0)) &&
+ hasFloatFn(TLI, Ty, LibFunc_exp10, LibFunc_exp10f, LibFunc_exp10l))
+ return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
+ LibFunc_exp10l, B, Attrs);
+
+ // pow(x, y) -> exp2(log2(x) * y)
+ if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
+ !BaseF->isNegative()) {
+ // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN.
+ // Luckily optimizePow has already handled the x == 1 case.
+ assert(!match(Base, m_FPOne()) &&
+ "pow(1.0, y) should have been simplified earlier!");
+
+ Value *Log = nullptr;
+ if (Ty->isFloatTy())
+ Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat()));
+ else if (Ty->isDoubleTy())
+ Log = ConstantFP::get(Ty, std::log2(BaseF->convertToDouble()));
+
+ if (Log) {
+ Value *FMul = B.CreateFMul(Log, Expo, "mul");
+ if (Pow->doesNotAccessMemory())
+ return B.CreateCall(Intrinsic::getDeclaration(Mod, Intrinsic::exp2, Ty),
+ FMul, "exp2");
+ else if (hasFloatFn(TLI, Ty, LibFunc_exp2, LibFunc_exp2f, LibFunc_exp2l))
+ return emitUnaryFloatFnCall(FMul, TLI, LibFunc_exp2, LibFunc_exp2f,
+ LibFunc_exp2l, B, Attrs);
+ }
+ }
+
+ return nullptr;
+}
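+
+ // For example, when both calls are fully fast and exp() has a single
+ // use, pow(exp(x), y) becomes exp(x * y). With a constant base,
+ // pow(8.0, x) becomes exp2(3.0 * x) since 8 == 2^3, pow(0.125, x)
+ // becomes exp2(-3.0 * x), and pow(10.0, x) becomes exp10(x), each
+ // subject to the target actually providing the library function.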
+
+static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
+ Module *M, IRBuilderBase &B,
+ const TargetLibraryInfo *TLI) {
+ // If errno is never set, then use the intrinsic for sqrt().
+ if (NoErrno) {
+ Function *SqrtFn =
+ Intrinsic::getDeclaration(M, Intrinsic::sqrt, V->getType());
+ return B.CreateCall(SqrtFn, V, "sqrt");
+ }
+
+ // Otherwise, use the libcall for sqrt().
+ if (hasFloatFn(TLI, V->getType(), LibFunc_sqrt, LibFunc_sqrtf, LibFunc_sqrtl))
+ // TODO: We also should check that the target can in fact lower the sqrt()
+ // libcall. We currently have no way to ask this question, so we ask if
+ // the target has a sqrt() libcall, which is not exactly the same.
+ return emitUnaryFloatFnCall(V, TLI, LibFunc_sqrt, LibFunc_sqrtf,
+ LibFunc_sqrtl, B, Attrs);
+
+ return nullptr;
+}
+
+/// Use square root in place of pow(x, +/-0.5).
+Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
+ Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ Module *Mod = Pow->getModule();
+ Type *Ty = Pow->getType();
+
+ const APFloat *ExpoF;
+ if (!match(Expo, m_APFloat(ExpoF)) ||
+ (!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)))
+ return nullptr;
+
+ // Converting pow(X, -0.5) to 1/sqrt(X) may introduce an extra rounding step,
+ // so that requires fast-math-flags (afn or reassoc).
+ if (ExpoF->isNegative() && (!Pow->hasApproxFunc() && !Pow->hasAllowReassoc()))
+ return nullptr;
+
// If we have a pow() library call (accesses memory) and we can't guarantee
// that the base is not an infinity, give up:
// pow(-Inf, 0.5) is optionally required to have a result of +Inf (not setting
@@ -1661,867 +1661,867 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
!isKnownNeverInfinity(Base, TLI))
return nullptr;
- Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI);
- if (!Sqrt)
- return nullptr;
-
- // Handle signed zero base by expanding to fabs(sqrt(x)).
- if (!Pow->hasNoSignedZeros()) {
- Function *FAbsFn = Intrinsic::getDeclaration(Mod, Intrinsic::fabs, Ty);
- Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs");
- }
-
- // Handle non-finite base by expanding to
- // (x == -infinity ? +infinity : sqrt(x)).
- if (!Pow->hasNoInfs()) {
- Value *PosInf = ConstantFP::getInfinity(Ty),
- *NegInf = ConstantFP::getInfinity(Ty, true);
- Value *FCmp = B.CreateFCmpOEQ(Base, NegInf, "isinf");
- Sqrt = B.CreateSelect(FCmp, PosInf, Sqrt);
- }
-
- // If the exponent is negative, then get the reciprocal.
- if (ExpoF->isNegative())
- Sqrt = B.CreateFDiv(ConstantFP::get(Ty, 1.0), Sqrt, "reciprocal");
-
- return Sqrt;
-}
-
-static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M,
- IRBuilderBase &B) {
- Value *Args[] = {Base, Expo};
- Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType());
- return B.CreateCall(F, Args);
-}
-
-Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
- Value *Base = Pow->getArgOperand(0);
- Value *Expo = Pow->getArgOperand(1);
- Function *Callee = Pow->getCalledFunction();
- StringRef Name = Callee->getName();
- Type *Ty = Pow->getType();
- Module *M = Pow->getModule();
- Value *Shrunk = nullptr;
- bool AllowApprox = Pow->hasApproxFunc();
- bool Ignored;
-
- // Propagate the math semantics from the call to any created instructions.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(Pow->getFastMathFlags());
-
- // Shrink pow() to powf() if the arguments are single precision,
- // unless the result is expected to be double precision.
- if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) &&
- hasFloatVersion(Name))
- Shrunk = optimizeBinaryDoubleFP(Pow, B, true);
-
- // Evaluate special cases related to the base.
-
- // pow(1.0, x) -> 1.0
- if (match(Base, m_FPOne()))
- return Base;
-
- if (Value *Exp = replacePowWithExp(Pow, B))
- return Exp;
-
- // Evaluate special cases related to the exponent.
-
- // pow(x, -1.0) -> 1.0 / x
- if (match(Expo, m_SpecificFP(-1.0)))
- return B.CreateFDiv(ConstantFP::get(Ty, 1.0), Base, "reciprocal");
-
- // pow(x, +/-0.0) -> 1.0
- if (match(Expo, m_AnyZeroFP()))
- return ConstantFP::get(Ty, 1.0);
-
- // pow(x, 1.0) -> x
- if (match(Expo, m_FPOne()))
- return Base;
-
- // pow(x, 2.0) -> x * x
- if (match(Expo, m_SpecificFP(2.0)))
- return B.CreateFMul(Base, Base, "square");
-
- if (Value *Sqrt = replacePowWithSqrt(Pow, B))
- return Sqrt;
-
- // pow(x, n) -> x * x * x * ...
- const APFloat *ExpoF;
+ Sqrt = getSqrtCall(Base, Attrs, Pow->doesNotAccessMemory(), Mod, B, TLI);
+ if (!Sqrt)
+ return nullptr;
+
+ // Handle signed zero base by expanding to fabs(sqrt(x)).
+ if (!Pow->hasNoSignedZeros()) {
+ Function *FAbsFn = Intrinsic::getDeclaration(Mod, Intrinsic::fabs, Ty);
+ Sqrt = B.CreateCall(FAbsFn, Sqrt, "abs");
+ }
+
+ // Handle non-finite base by expanding to
+ // (x == -infinity ? +infinity : sqrt(x)).
+ if (!Pow->hasNoInfs()) {
+ Value *PosInf = ConstantFP::getInfinity(Ty),
+ *NegInf = ConstantFP::getInfinity(Ty, true);
+ Value *FCmp = B.CreateFCmpOEQ(Base, NegInf, "isinf");
+ Sqrt = B.CreateSelect(FCmp, PosInf, Sqrt);
+ }
+
+ // If the exponent is negative, then get the reciprocal.
+ if (ExpoF->isNegative())
+ Sqrt = B.CreateFDiv(ConstantFP::get(Ty, 1.0), Sqrt, "reciprocal");
+
+ return Sqrt;
+}
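+
+ // For example, pow(x, 0.5) on a pow() that may set errno with a base not
+ // known to be non-infinite is left alone; otherwise it becomes sqrt(x),
+ // wrapped as fabs(sqrt(x)) without nsz, and further guarded as
+ // (x == -inf ? +inf : fabs(sqrt(x))) without ninf. pow(x, -0.5) takes
+ // the reciprocal of that result.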
+
+static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M,
+ IRBuilderBase &B) {
+ Value *Args[] = {Base, Expo};
+ Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType());
+ return B.CreateCall(F, Args);
+}
+
+Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
+ Value *Base = Pow->getArgOperand(0);
+ Value *Expo = Pow->getArgOperand(1);
+ Function *Callee = Pow->getCalledFunction();
+ StringRef Name = Callee->getName();
+ Type *Ty = Pow->getType();
+ Module *M = Pow->getModule();
+ Value *Shrunk = nullptr;
+ bool AllowApprox = Pow->hasApproxFunc();
+ bool Ignored;
+
+ // Propagate the math semantics from the call to any created instructions.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(Pow->getFastMathFlags());
+
+ // Shrink pow() to powf() if the arguments are single precision,
+ // unless the result is expected to be double precision.
+ if (UnsafeFPShrink && Name == TLI->getName(LibFunc_pow) &&
+ hasFloatVersion(Name))
+ Shrunk = optimizeBinaryDoubleFP(Pow, B, true);
+
+ // Evaluate special cases related to the base.
+
+ // pow(1.0, x) -> 1.0
+ if (match(Base, m_FPOne()))
+ return Base;
+
+ if (Value *Exp = replacePowWithExp(Pow, B))
+ return Exp;
+
+ // Evaluate special cases related to the exponent.
+
+ // pow(x, -1.0) -> 1.0 / x
+ if (match(Expo, m_SpecificFP(-1.0)))
+ return B.CreateFDiv(ConstantFP::get(Ty, 1.0), Base, "reciprocal");
+
+ // pow(x, +/-0.0) -> 1.0
+ if (match(Expo, m_AnyZeroFP()))
+ return ConstantFP::get(Ty, 1.0);
+
+ // pow(x, 1.0) -> x
+ if (match(Expo, m_FPOne()))
+ return Base;
+
+ // pow(x, 2.0) -> x * x
+ if (match(Expo, m_SpecificFP(2.0)))
+ return B.CreateFMul(Base, Base, "square");
+
+ if (Value *Sqrt = replacePowWithSqrt(Pow, B))
+ return Sqrt;
+
+ // pow(x, n) -> x * x * x * ...
+ const APFloat *ExpoF;
if (AllowApprox && match(Expo, m_APFloat(ExpoF)) &&
!ExpoF->isExactlyValue(0.5) && !ExpoF->isExactlyValue(-0.5)) {
- // We limit to a max of 7 multiplications, thus the maximum exponent is 32.
- // If the exponent is an integer+0.5 we generate a call to sqrt and an
- // additional fmul.
- // TODO: This whole transformation should be backend specific (e.g. some
- // backends might prefer libcalls or the limit for the exponent might
- // be different) and it should also consider optimizing for size.
- APFloat LimF(ExpoF->getSemantics(), 33),
- ExpoA(abs(*ExpoF));
- if (ExpoA < LimF) {
- // This transformation applies to integer or integer+0.5 exponents only.
- // For integer+0.5, we create a sqrt(Base) call.
- Value *Sqrt = nullptr;
- if (!ExpoA.isInteger()) {
- APFloat Expo2 = ExpoA;
- // To check if ExpoA is an integer + 0.5, we add it to itself. If there
- // is no floating point exception and the result is an integer, then
- // ExpoA == integer + 0.5
- if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK)
- return nullptr;
-
- if (!Expo2.isInteger())
- return nullptr;
-
- Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(),
- Pow->doesNotAccessMemory(), M, B, TLI);
+ // We limit to a max of 7 multiplications, thus the maximum exponent is 32.
+ // If the exponent is an integer+0.5 we generate a call to sqrt and an
+ // additional fmul.
+ // TODO: This whole transformation should be backend specific (e.g. some
+ // backends might prefer libcalls or the limit for the exponent might
+ // be different) and it should also consider optimizing for size.
+ APFloat LimF(ExpoF->getSemantics(), 33),
+ ExpoA(abs(*ExpoF));
+ if (ExpoA < LimF) {
+ // This transformation applies to integer or integer+0.5 exponents only.
+ // For integer+0.5, we create a sqrt(Base) call.
+ Value *Sqrt = nullptr;
+ if (!ExpoA.isInteger()) {
+ APFloat Expo2 = ExpoA;
+ // To check if ExpoA is an integer + 0.5, we add it to itself. If there
+ // is no floating point exception and the result is an integer, then
+ // ExpoA == integer + 0.5
+ if (Expo2.add(ExpoA, APFloat::rmNearestTiesToEven) != APFloat::opOK)
+ return nullptr;
+
+ if (!Expo2.isInteger())
+ return nullptr;
+
+ Sqrt = getSqrtCall(Base, Pow->getCalledFunction()->getAttributes(),
+ Pow->doesNotAccessMemory(), M, B, TLI);
if (!Sqrt)
return nullptr;
- }
-
- // We will memoize intermediate products of the Addition Chain.
- Value *InnerChain[33] = {nullptr};
- InnerChain[1] = Base;
- InnerChain[2] = B.CreateFMul(Base, Base, "square");
-
- // We cannot readily convert a non-double type (like float) to a double.
- // So we first convert it to something which could be converted to double.
- ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored);
- Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B);
-
- // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x).
- if (Sqrt)
- FMul = B.CreateFMul(FMul, Sqrt);
-
- // If the exponent is negative, then get the reciprocal.
- if (ExpoF->isNegative())
- FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal");
-
- return FMul;
- }
-
- APSInt IntExpo(32, /*isUnsigned=*/false);
- // powf(x, n) -> powi(x, n) if n is a constant signed integer value
- if (ExpoF->isInteger() &&
- ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
- APFloat::opOK) {
- return createPowWithIntegerExponent(
- Base, ConstantInt::get(B.getInt32Ty(), IntExpo), M, B);
- }
- }
-
- // powf(x, itofp(y)) -> powi(x, y)
- if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) {
- if (Value *ExpoI = getIntToFPVal(Expo, B))
- return createPowWithIntegerExponent(Base, ExpoI, M, B);
- }
-
- return Shrunk;
-}
-
-Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- AttributeList Attrs; // Attributes are only meaningful on the original call
- StringRef Name = Callee->getName();
- Value *Ret = nullptr;
- if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
- hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- Type *Ty = CI->getType();
- Value *Op = CI->getArgOperand(0);
-
- // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
- // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
- if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
- hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
- if (Value *Exp = getIntToFPVal(Op, B))
- return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
- LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, Attrs);
- }
-
- return Ret;
-}
-
-Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
- // If we can shrink the call to a float function rather than a double
- // function, do that first.
- Function *Callee = CI->getCalledFunction();
- StringRef Name = Callee->getName();
- if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
- if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
- return Ret;
-
- // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to
- // the intrinsics for improved optimization (for example, vectorization).
- // No-signed-zeros is implied by the definitions of fmax/fmin themselves.
- // From the C standard draft WG14/N1256:
- // "Ideally, fmax would be sensitive to the sign of zero, for example
- // fmax(-0.0, +0.0) would return +0; however, implementation in software
- // might be impractical."
- IRBuilderBase::FastMathFlagGuard Guard(B);
- FastMathFlags FMF = CI->getFastMathFlags();
- FMF.setNoSignedZeros();
- B.setFastMathFlags(FMF);
-
- Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum
- : Intrinsic::maxnum;
- Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType());
- return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
-}
-
-Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
- Function *LogFn = Log->getCalledFunction();
- AttributeList Attrs; // Attributes are only meaningful on the original call
- StringRef LogNm = LogFn->getName();
- Intrinsic::ID LogID = LogFn->getIntrinsicID();
- Module *Mod = Log->getModule();
- Type *Ty = Log->getType();
- Value *Ret = nullptr;
-
- if (UnsafeFPShrink && hasFloatVersion(LogNm))
- Ret = optimizeUnaryDoubleFP(Log, B, true);
-
- // The earlier call must also be 'fast' in order to do these transforms.
- CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0));
- if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse())
- return Ret;
-
- LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb;
-
- // This is only applicable to log(), log2(), log10().
- if (TLI->getLibFunc(LogNm, LogLb))
- switch (LogLb) {
- case LibFunc_logf:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_logl:
- LogID = Intrinsic::log;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- case LibFunc_log2f:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log2:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_log2l:
- LogID = Intrinsic::log2;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- case LibFunc_log10f:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- break;
- case LibFunc_log10:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- break;
- case LibFunc_log10l:
- LogID = Intrinsic::log10;
- ExpLb = LibFunc_expl;
- Exp2Lb = LibFunc_exp2l;
- Exp10Lb = LibFunc_exp10l;
- PowLb = LibFunc_powl;
- break;
- default:
- return Ret;
- }
- else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 ||
- LogID == Intrinsic::log10) {
- if (Ty->getScalarType()->isFloatTy()) {
- ExpLb = LibFunc_expf;
- Exp2Lb = LibFunc_exp2f;
- Exp10Lb = LibFunc_exp10f;
- PowLb = LibFunc_powf;
- } else if (Ty->getScalarType()->isDoubleTy()) {
- ExpLb = LibFunc_exp;
- Exp2Lb = LibFunc_exp2;
- Exp10Lb = LibFunc_exp10;
- PowLb = LibFunc_pow;
- } else
- return Ret;
- } else
- return Ret;
-
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(FastMathFlags::getFast());
-
- Intrinsic::ID ArgID = Arg->getIntrinsicID();
- LibFunc ArgLb = NotLibFunc;
- TLI->getLibFunc(*Arg, ArgLb);
-
- // log(pow(x,y)) -> y*log(x)
- if (ArgLb == PowLb || ArgID == Intrinsic::pow) {
- Value *LogX =
- Log->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
- Arg->getOperand(0), "log")
- : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs);
- Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul");
- // Since pow() may have side effects, e.g. errno,
- // dead code elimination may not be trusted to remove it.
- substituteInParent(Arg, MulY);
- return MulY;
- }
-
- // log(exp{,2,10}(y)) -> y*log({e,2,10})
- // TODO: There is no exp10() intrinsic yet.
- if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb ||
- ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) {
- Constant *Eul;
- if (ArgLb == ExpLb || ArgID == Intrinsic::exp)
- // FIXME: Add more precise value of e for long double.
- Eul = ConstantFP::get(Log->getType(), numbers::e);
- else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2)
- Eul = ConstantFP::get(Log->getType(), 2.0);
- else
- Eul = ConstantFP::get(Log->getType(), 10.0);
- Value *LogE = Log->doesNotAccessMemory()
- ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
- Eul, "log")
- : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
- Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
- // Since exp() may have side effects, e.g. errno,
- // dead code elimination may not be trusted to remove it.
- substituteInParent(Arg, MulY);
- return MulY;
- }
-
- return Ret;
-}
-
-Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
- // TODO: Once we have a way (other than checking for the existence of the
- // libcall) to tell whether our target can lower @llvm.sqrt, relax the
- // condition below.
- if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
- Callee->getIntrinsicID() == Intrinsic::sqrt))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- if (!CI->isFast())
- return Ret;
-
- Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
- if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
- return Ret;
-
- // We're looking for a repeated factor in a multiplication tree,
- // so we can do this fold: sqrt(x * x) -> fabs(x);
- // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y).
- Value *Op0 = I->getOperand(0);
- Value *Op1 = I->getOperand(1);
- Value *RepeatOp = nullptr;
- Value *OtherOp = nullptr;
- if (Op0 == Op1) {
- // Simple match: the operands of the multiply are identical.
- RepeatOp = Op0;
- } else {
- // Look for a more complicated pattern: one of the operands is itself
- // a multiply, so search for a common factor in that multiply.
- // Note: We don't bother looking any deeper than this first level or for
- // variations of this pattern because instcombine's visitFMUL and/or the
- // reassociation pass should give us this form.
- Value *OtherMul0, *OtherMul1;
- if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
- // Pattern: sqrt((x * y) * z)
- if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
- // Matched: sqrt((x * x) * z)
- RepeatOp = OtherMul0;
- OtherOp = Op1;
- }
- }
- }
- if (!RepeatOp)
- return Ret;
-
- // Fast math flags for any created instructions should match the sqrt
- // and multiply.
- IRBuilderBase::FastMathFlagGuard Guard(B);
- B.setFastMathFlags(I->getFastMathFlags());
-
- // If we found a repeated factor, hoist it out of the square root and
- // replace it with the fabs of that factor.
- Module *M = Callee->getParent();
- Type *ArgType = I->getType();
- Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
- Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
- if (OtherOp) {
- // If we found a non-repeated factor, we still need to get its square
- // root. We then multiply that by the value that was simplified out
- // of the square root calculation.
- Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
- Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
- return B.CreateFMul(FabsCall, SqrtCall);
- }
- return FabsCall;
-}
-
-// TODO: Generalize to handle any trig function and its inverse.
-Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- Value *Ret = nullptr;
- StringRef Name = Callee->getName();
- if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
- Ret = optimizeUnaryDoubleFP(CI, B, true);
-
- Value *Op1 = CI->getArgOperand(0);
- auto *OpC = dyn_cast<CallInst>(Op1);
- if (!OpC)
- return Ret;
-
- // Both calls must be 'fast' in order to remove them.
- if (!CI->isFast() || !OpC->isFast())
- return Ret;
-
- // tan(atan(x)) -> x
- // tanf(atanf(x)) -> x
- // tanl(atanl(x)) -> x
- LibFunc Func;
- Function *F = OpC->getCalledFunction();
- if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
- ((Func == LibFunc_atan && Callee->getName() == "tan") ||
- (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
- (Func == LibFunc_atanl && Callee->getName() == "tanl")))
- Ret = OpC->getArgOperand(0);
- return Ret;
-}
-
-static bool isTrigLibCall(CallInst *CI) {
- // We can only hope to do anything useful if we can ignore things like errno
- // and floating-point exceptions.
- // We already checked the prototype.
- return CI->hasFnAttr(Attribute::NoUnwind) &&
- CI->hasFnAttr(Attribute::ReadNone);
-}
-
-static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
- bool UseFloat, Value *&Sin, Value *&Cos,
- Value *&SinCos) {
- Type *ArgTy = Arg->getType();
- Type *ResTy;
- StringRef Name;
-
- Triple T(OrigCallee->getParent()->getTargetTriple());
- if (UseFloat) {
- Name = "__sincospif_stret";
-
- assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
- // x86_64 can't use {float, float} since that would be returned in both
- // xmm0 and xmm1, which isn't what a real struct would do.
- ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
- : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
- } else {
- Name = "__sincospi_stret";
- ResTy = StructType::get(ArgTy, ArgTy);
- }
-
- Module *M = OrigCallee->getParent();
- FunctionCallee Callee =
- M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
-
- if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
- // If the argument is an instruction, it must dominate all uses so put our
- // sincos call there.
- B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
- } else {
- // Otherwise (e.g. for a constant) the beginning of the function is as
- // good a place as any.
- BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
- B.SetInsertPoint(&EntryBB, EntryBB.begin());
- }
-
- SinCos = B.CreateCall(Callee, Arg, "sincospi");
-
- if (SinCos->getType()->isStructTy()) {
- Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
- Cos = B.CreateExtractValue(SinCos, 1, "cospi");
- } else {
- Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
- "sinpi");
- Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
- "cospi");
- }
-}
-
-Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
- // Make sure the prototype is as expected, otherwise the rest of the
- // function is probably invalid and likely to abort.
- if (!isTrigLibCall(CI))
- return nullptr;
-
- Value *Arg = CI->getArgOperand(0);
- SmallVector<CallInst *, 1> SinCalls;
- SmallVector<CallInst *, 1> CosCalls;
- SmallVector<CallInst *, 1> SinCosCalls;
-
- bool IsFloat = Arg->getType()->isFloatTy();
-
- // Look for all compatible sinpi, cospi and sincospi calls with the same
- // argument. If there are enough (in some sense) we can make the
- // substitution.
- Function *F = CI->getFunction();
- for (User *U : Arg->users())
- classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
-
- // It's only worthwhile if both sinpi and cospi are actually used.
+ }
+
+ // We will memoize intermediate products of the Addition Chain.
+ Value *InnerChain[33] = {nullptr};
+ InnerChain[1] = Base;
+ InnerChain[2] = B.CreateFMul(Base, Base, "square");
+
+ // We cannot readily convert a non-double type (like float) to a double.
+ // So we first convert it to something which could be converted to double.
+ ExpoA.convert(APFloat::IEEEdouble(), APFloat::rmTowardZero, &Ignored);
+ Value *FMul = getPow(InnerChain, ExpoA.convertToDouble(), B);
+
+ // Expand pow(x, y+0.5) to pow(x, y) * sqrt(x).
+ if (Sqrt)
+ FMul = B.CreateFMul(FMul, Sqrt);
+
+ // If the exponent is negative, then get the reciprocal.
+ if (ExpoF->isNegative())
+ FMul = B.CreateFDiv(ConstantFP::get(Ty, 1.0), FMul, "reciprocal");
+
+ return FMul;
+ }
+
+ APSInt IntExpo(32, /*isUnsigned=*/false);
+ // powf(x, n) -> powi(x, n) if n is a constant signed integer value
+ if (ExpoF->isInteger() &&
+ ExpoF->convertToInteger(IntExpo, APFloat::rmTowardZero, &Ignored) ==
+ APFloat::opOK) {
+ return createPowWithIntegerExponent(
+ Base, ConstantInt::get(B.getInt32Ty(), IntExpo), M, B);
+ }
+ }
+
+ // powf(x, itofp(y)) -> powi(x, y)
+ if (AllowApprox && (isa<SIToFPInst>(Expo) || isa<UIToFPInst>(Expo))) {
+ if (Value *ExpoI = getIntToFPVal(Expo, B))
+ return createPowWithIntegerExponent(Base, ExpoI, M, B);
+ }
+
+ return Shrunk;
+}
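+
+ // For example, pow(x, -1.0) becomes 1.0 / x and pow(x, 2.0) becomes
+ // x * x with no extra fast-math requirement, while under
+ // approximate-function math pow(x, 7.5) becomes (x^7 via the addition
+ // chain) * sqrt(x) and powf(x, sitofp(i)) becomes the powi intrinsic
+ // with a 32-bit exponent.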
+
+Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ StringRef Name = Callee->getName();
+ Value *Ret = nullptr;
+ if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
+ hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Type *Ty = CI->getType();
+ Value *Op = CI->getArgOperand(0);
+
+ // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x)) if sizeof(x) <= 32
+ // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x)) if sizeof(x) < 32
+ if ((isa<SIToFPInst>(Op) || isa<UIToFPInst>(Op)) &&
+ hasFloatFn(TLI, Ty, LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl)) {
+ if (Value *Exp = getIntToFPVal(Op, B))
+ return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
+ LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
+ B, Attrs);
+ }
+
+ return Ret;
+}
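+
+ // For example, exp2(sitofp i16 %n) becomes ldexp(1.0, sext %n to i32)
+ // when the target has ldexp; the exponent is known to fit in an int, and
+ // ldexp(1.0, n) computes 2^n exactly.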
+
+Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
+ // If we can shrink the call to a float function rather than a double
+ // function, do that first.
+ Function *Callee = CI->getCalledFunction();
+ StringRef Name = Callee->getName();
+ if ((Name == "fmin" || Name == "fmax") && hasFloatVersion(Name))
+ if (Value *Ret = optimizeBinaryDoubleFP(CI, B))
+ return Ret;
+
+ // The LLVM intrinsics minnum/maxnum correspond to fmin/fmax. Canonicalize to
+ // the intrinsics for improved optimization (for example, vectorization).
+ // No-signed-zeros is implied by the definitions of fmax/fmin themselves.
+ // From the C standard draft WG14/N1256:
+ // "Ideally, fmax would be sensitive to the sign of zero, for example
+ // fmax(-0.0, +0.0) would return +0; however, implementation in software
+ // might be impractical."
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ FastMathFlags FMF = CI->getFastMathFlags();
+ FMF.setNoSignedZeros();
+ B.setFastMathFlags(FMF);
+
+ Intrinsic::ID IID = Callee->getName().startswith("fmin") ? Intrinsic::minnum
+ : Intrinsic::maxnum;
+ Function *F = Intrinsic::getDeclaration(CI->getModule(), IID, CI->getType());
+ return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
+}
+
+Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
+ Function *LogFn = Log->getCalledFunction();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
+ StringRef LogNm = LogFn->getName();
+ Intrinsic::ID LogID = LogFn->getIntrinsicID();
+ Module *Mod = Log->getModule();
+ Type *Ty = Log->getType();
+ Value *Ret = nullptr;
+
+ if (UnsafeFPShrink && hasFloatVersion(LogNm))
+ Ret = optimizeUnaryDoubleFP(Log, B, true);
+
+ // The earlier call must also be 'fast' in order to do these transforms.
+ CallInst *Arg = dyn_cast<CallInst>(Log->getArgOperand(0));
+ if (!Log->isFast() || !Arg || !Arg->isFast() || !Arg->hasOneUse())
+ return Ret;
+
+ LibFunc LogLb, ExpLb, Exp2Lb, Exp10Lb, PowLb;
+
+ // This is only applicable to log(), log2(), log10().
+ if (TLI->getLibFunc(LogNm, LogLb))
+ switch (LogLb) {
+ case LibFunc_logf:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_logl:
+ LogID = Intrinsic::log;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ case LibFunc_log2f:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log2:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_log2l:
+ LogID = Intrinsic::log2;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ case LibFunc_log10f:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ break;
+ case LibFunc_log10:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ break;
+ case LibFunc_log10l:
+ LogID = Intrinsic::log10;
+ ExpLb = LibFunc_expl;
+ Exp2Lb = LibFunc_exp2l;
+ Exp10Lb = LibFunc_exp10l;
+ PowLb = LibFunc_powl;
+ break;
+ default:
+ return Ret;
+ }
+ else if (LogID == Intrinsic::log || LogID == Intrinsic::log2 ||
+ LogID == Intrinsic::log10) {
+ if (Ty->getScalarType()->isFloatTy()) {
+ ExpLb = LibFunc_expf;
+ Exp2Lb = LibFunc_exp2f;
+ Exp10Lb = LibFunc_exp10f;
+ PowLb = LibFunc_powf;
+ } else if (Ty->getScalarType()->isDoubleTy()) {
+ ExpLb = LibFunc_exp;
+ Exp2Lb = LibFunc_exp2;
+ Exp10Lb = LibFunc_exp10;
+ PowLb = LibFunc_pow;
+ } else
+ return Ret;
+ } else
+ return Ret;
+
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(FastMathFlags::getFast());
+
+ Intrinsic::ID ArgID = Arg->getIntrinsicID();
+ LibFunc ArgLb = NotLibFunc;
+ TLI->getLibFunc(*Arg, ArgLb);
+
+ // log(pow(x,y)) -> y*log(x)
+ if (ArgLb == PowLb || ArgID == Intrinsic::pow) {
+ Value *LogX =
+ Log->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
+ Arg->getOperand(0), "log")
+ : emitUnaryFloatFnCall(Arg->getOperand(0), LogNm, B, Attrs);
+ Value *MulY = B.CreateFMul(Arg->getArgOperand(1), LogX, "mul");
+ // Since pow() may have side effects, e.g. errno,
+ // dead code elimination may not be trusted to remove it.
+ substituteInParent(Arg, MulY);
+ return MulY;
+ }
+
+ // log(exp{,2,10}(y)) -> y*log({e,2,10})
+ // TODO: There is no exp10() intrinsic yet.
+ if (ArgLb == ExpLb || ArgLb == Exp2Lb || ArgLb == Exp10Lb ||
+ ArgID == Intrinsic::exp || ArgID == Intrinsic::exp2) {
+ Constant *Eul;
+ if (ArgLb == ExpLb || ArgID == Intrinsic::exp)
+ // FIXME: Add more precise value of e for long double.
+ Eul = ConstantFP::get(Log->getType(), numbers::e);
+ else if (ArgLb == Exp2Lb || ArgID == Intrinsic::exp2)
+ Eul = ConstantFP::get(Log->getType(), 2.0);
+ else
+ Eul = ConstantFP::get(Log->getType(), 10.0);
+ Value *LogE = Log->doesNotAccessMemory()
+ ? B.CreateCall(Intrinsic::getDeclaration(Mod, LogID, Ty),
+ Eul, "log")
+ : emitUnaryFloatFnCall(Eul, LogNm, B, Attrs);
+ Value *MulY = B.CreateFMul(Arg->getArgOperand(0), LogE, "mul");
+ // Since exp() may have side effects, e.g. errno,
+ // dead code elimination may not be trusted to remove it.
+ substituteInParent(Arg, MulY);
+ return MulY;
+ }
+
+ return Ret;
+}
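+
+ // For example, with both calls fast and a single-use argument,
+ // log2(pow(x, y)) becomes y * log2(x) and log(exp2(y)) becomes
+ // y * log(2.0); the original pow()/exp2() is erased explicitly because
+ // a possible errno write keeps it from being trivially dead.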
+
+Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ // TODO: Once we have a way (other than checking for the existence of the
+ // libcall) to tell whether our target can lower @llvm.sqrt, relax the
+ // condition below.
+ if (TLI->has(LibFunc_sqrtf) && (Callee->getName() == "sqrt" ||
+ Callee->getIntrinsicID() == Intrinsic::sqrt))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ if (!CI->isFast())
+ return Ret;
+
+ Instruction *I = dyn_cast<Instruction>(CI->getArgOperand(0));
+ if (!I || I->getOpcode() != Instruction::FMul || !I->isFast())
+ return Ret;
+
+ // We're looking for a repeated factor in a multiplication tree,
+ // so we can do this fold: sqrt(x * x) -> fabs(x);
+ // or this fold: sqrt((x * x) * y) -> fabs(x) * sqrt(y).
+ Value *Op0 = I->getOperand(0);
+ Value *Op1 = I->getOperand(1);
+ Value *RepeatOp = nullptr;
+ Value *OtherOp = nullptr;
+ if (Op0 == Op1) {
+ // Simple match: the operands of the multiply are identical.
+ RepeatOp = Op0;
+ } else {
+ // Look for a more complicated pattern: one of the operands is itself
+ // a multiply, so search for a common factor in that multiply.
+ // Note: We don't bother looking any deeper than this first level or for
+ // variations of this pattern because instcombine's visitFMUL and/or the
+ // reassociation pass should give us this form.
+ Value *OtherMul0, *OtherMul1;
+ if (match(Op0, m_FMul(m_Value(OtherMul0), m_Value(OtherMul1)))) {
+ // Pattern: sqrt((x * y) * z)
+ if (OtherMul0 == OtherMul1 && cast<Instruction>(Op0)->isFast()) {
+ // Matched: sqrt((x * x) * z)
+ RepeatOp = OtherMul0;
+ OtherOp = Op1;
+ }
+ }
+ }
+ if (!RepeatOp)
+ return Ret;
+
+ // Fast math flags for any created instructions should match the sqrt
+ // and multiply.
+ IRBuilderBase::FastMathFlagGuard Guard(B);
+ B.setFastMathFlags(I->getFastMathFlags());
+
+ // If we found a repeated factor, hoist it out of the square root and
+ // replace it with the fabs of that factor.
+ Module *M = Callee->getParent();
+ Type *ArgType = I->getType();
+ Function *Fabs = Intrinsic::getDeclaration(M, Intrinsic::fabs, ArgType);
+ Value *FabsCall = B.CreateCall(Fabs, RepeatOp, "fabs");
+ if (OtherOp) {
+ // If we found a non-repeated factor, we still need to get its square
+ // root. We then multiply that by the value that was simplified out
+ // of the square root calculation.
+ Function *Sqrt = Intrinsic::getDeclaration(M, Intrinsic::sqrt, ArgType);
+ Value *SqrtCall = B.CreateCall(Sqrt, OtherOp, "sqrt");
+ return B.CreateFMul(FabsCall, SqrtCall);
+ }
+ return FabsCall;
+}
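+
+ // For example, with fast math on both instructions, sqrt(x * x) becomes
+ // fabs(x) and sqrt((x * x) * y) becomes fabs(x) * sqrt(y); fabs is
+ // required because the square root of a square is the magnitude of x,
+ // not x itself.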
+
+// TODO: Generalize to handle any trig function and its inverse.
+Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ Value *Ret = nullptr;
+ StringRef Name = Callee->getName();
+ if (UnsafeFPShrink && Name == "tan" && hasFloatVersion(Name))
+ Ret = optimizeUnaryDoubleFP(CI, B, true);
+
+ Value *Op1 = CI->getArgOperand(0);
+ auto *OpC = dyn_cast<CallInst>(Op1);
+ if (!OpC)
+ return Ret;
+
+ // Both calls must be 'fast' in order to remove them.
+ if (!CI->isFast() || !OpC->isFast())
+ return Ret;
+
+ // tan(atan(x)) -> x
+ // tanf(atanf(x)) -> x
+ // tanl(atanl(x)) -> x
+ LibFunc Func;
+ Function *F = OpC->getCalledFunction();
+ if (F && TLI->getLibFunc(F->getName(), Func) && TLI->has(Func) &&
+ ((Func == LibFunc_atan && Callee->getName() == "tan") ||
+ (Func == LibFunc_atanf && Callee->getName() == "tanf") ||
+ (Func == LibFunc_atanl && Callee->getName() == "tanl")))
+ Ret = OpC->getArgOperand(0);
+ return Ret;
+}
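+
+ // For example, when both calls are fast, tan(atan(x)) folds to x; the
+ // float and long double variants fold only when the names pair up
+ // exactly (tanf with atanf, tanl with atanl).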
+
+static bool isTrigLibCall(CallInst *CI) {
+ // We can only hope to do anything useful if we can ignore things like errno
+ // and floating-point exceptions.
+ // We already checked the prototype.
+ return CI->hasFnAttr(Attribute::NoUnwind) &&
+ CI->hasFnAttr(Attribute::ReadNone);
+}
+
+static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
+ bool UseFloat, Value *&Sin, Value *&Cos,
+ Value *&SinCos) {
+ Type *ArgTy = Arg->getType();
+ Type *ResTy;
+ StringRef Name;
+
+ Triple T(OrigCallee->getParent()->getTargetTriple());
+ if (UseFloat) {
+ Name = "__sincospif_stret";
+
+ assert(T.getArch() != Triple::x86 && "x86 messy and unsupported for now");
+ // x86_64 can't use {float, float} since that would be returned in both
+ // xmm0 and xmm1, which isn't what a real struct would do.
+ ResTy = T.getArch() == Triple::x86_64
+ ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
+ : static_cast<Type *>(StructType::get(ArgTy, ArgTy));
+ } else {
+ Name = "__sincospi_stret";
+ ResTy = StructType::get(ArgTy, ArgTy);
+ }
+
+ Module *M = OrigCallee->getParent();
+ FunctionCallee Callee =
+ M->getOrInsertFunction(Name, OrigCallee->getAttributes(), ResTy, ArgTy);
+
+ if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
+ // If the argument is an instruction, it must dominate all uses so put our
+ // sincos call there.
+ B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+ } else {
+ // Otherwise (e.g. for a constant) the beginning of the function is as
+ // good a place as any.
+ BasicBlock &EntryBB = B.GetInsertBlock()->getParent()->getEntryBlock();
+ B.SetInsertPoint(&EntryBB, EntryBB.begin());
+ }
+
+ SinCos = B.CreateCall(Callee, Arg, "sincospi");
+
+ if (SinCos->getType()->isStructTy()) {
+ Sin = B.CreateExtractValue(SinCos, 0, "sinpi");
+ Cos = B.CreateExtractValue(SinCos, 1, "cospi");
+ } else {
+ Sin = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 0),
+ "sinpi");
+ Cos = B.CreateExtractElement(SinCos, ConstantInt::get(B.getInt32Ty(), 1),
+ "cospi");
+ }
+}
+
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
+ // Make sure the prototype is as expected, otherwise the rest of the
+ // function is probably invalid and likely to abort.
+ if (!isTrigLibCall(CI))
+ return nullptr;
+
+ Value *Arg = CI->getArgOperand(0);
+ SmallVector<CallInst *, 1> SinCalls;
+ SmallVector<CallInst *, 1> CosCalls;
+ SmallVector<CallInst *, 1> SinCosCalls;
+
+ bool IsFloat = Arg->getType()->isFloatTy();
+
+ // Look for all compatible sinpi, cospi and sincospi calls with the same
+ // argument. If there are enough (in some sense) we can make the
+ // substitution.
+ Function *F = CI->getFunction();
+ for (User *U : Arg->users())
+ classifyArgUse(U, F, IsFloat, SinCalls, CosCalls, SinCosCalls);
+
+ // It's only worthwhile if both sinpi and cospi are actually used.
if (SinCalls.empty() || CosCalls.empty())
- return nullptr;
-
- Value *Sin, *Cos, *SinCos;
- insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
-
- auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
- Value *Res) {
- for (CallInst *C : Calls)
- replaceAllUsesWith(C, Res);
- };
-
- replaceTrigInsts(SinCalls, Sin);
- replaceTrigInsts(CosCalls, Cos);
- replaceTrigInsts(SinCosCalls, SinCos);
-
- return nullptr;
-}
-
-void LibCallSimplifier::classifyArgUse(
- Value *Val, Function *F, bool IsFloat,
- SmallVectorImpl<CallInst *> &SinCalls,
- SmallVectorImpl<CallInst *> &CosCalls,
- SmallVectorImpl<CallInst *> &SinCosCalls) {
- CallInst *CI = dyn_cast<CallInst>(Val);
-
+ return nullptr;
+
+ Value *Sin, *Cos, *SinCos;
+ insertSinCosCall(B, CI->getCalledFunction(), Arg, IsFloat, Sin, Cos, SinCos);
+
+ auto replaceTrigInsts = [this](SmallVectorImpl<CallInst *> &Calls,
+ Value *Res) {
+ for (CallInst *C : Calls)
+ replaceAllUsesWith(C, Res);
+ };
+
+ replaceTrigInsts(SinCalls, Sin);
+ replaceTrigInsts(CosCalls, Cos);
+ replaceTrigInsts(SinCosCalls, SinCos);
+
+ return nullptr;
+}
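+
+ // For example, if sinpi(x) and cospi(x) both appear in the same function
+ // with the same argument, one __sincospi_stret(x) call is emitted and
+ // its two results replace all of the sin/cos uses; if only one of the
+ // two is present, nothing is changed.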
+
+void LibCallSimplifier::classifyArgUse(
+ Value *Val, Function *F, bool IsFloat,
+ SmallVectorImpl<CallInst *> &SinCalls,
+ SmallVectorImpl<CallInst *> &CosCalls,
+ SmallVectorImpl<CallInst *> &SinCosCalls) {
+ CallInst *CI = dyn_cast<CallInst>(Val);
+
if (!CI || CI->use_empty())
- return;
-
- // Don't consider calls in other functions.
- if (CI->getFunction() != F)
- return;
-
- Function *Callee = CI->getCalledFunction();
- LibFunc Func;
- if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
- !isTrigLibCall(CI))
- return;
-
- if (IsFloat) {
- if (Func == LibFunc_sinpif)
- SinCalls.push_back(CI);
- else if (Func == LibFunc_cospif)
- CosCalls.push_back(CI);
- else if (Func == LibFunc_sincospif_stret)
- SinCosCalls.push_back(CI);
- } else {
- if (Func == LibFunc_sinpi)
- SinCalls.push_back(CI);
- else if (Func == LibFunc_cospi)
- CosCalls.push_back(CI);
- else if (Func == LibFunc_sincospi_stret)
- SinCosCalls.push_back(CI);
- }
-}
-
-//===----------------------------------------------------------------------===//
-// Integer Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) {
- // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
- Value *Op = CI->getArgOperand(0);
- Type *ArgType = Op->getType();
- Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
- Intrinsic::cttz, ArgType);
- Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
- V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
- V = B.CreateIntCast(V, B.getInt32Ty(), false);
-
- Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
- return B.CreateSelect(Cond, V, B.getInt32(0));
-}
-
-Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) {
- // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
- Value *Op = CI->getArgOperand(0);
- Type *ArgType = Op->getType();
- Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
- Intrinsic::ctlz, ArgType);
- Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
- V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
- V);
- return B.CreateIntCast(V, CI->getType(), false);
-}
-
-Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) {
- // abs(x) -> x <s 0 ? -x : x
- // The negation has 'nsw' because abs of INT_MIN is undefined.
- Value *X = CI->getArgOperand(0);
- Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
- Value *NegX = B.CreateNSWNeg(X, "neg");
- return B.CreateSelect(IsNeg, NegX, X);
-}
-
-Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) {
- // isdigit(c) -> (c-'0') <u 10
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
- Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
- return B.CreateZExt(Op, CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) {
- // isascii(c) -> c <u 128
- Value *Op = CI->getArgOperand(0);
- Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
- return B.CreateZExt(Op, CI->getType());
-}
-
-Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) {
- // toascii(c) -> c & 0x7f
- return B.CreateAnd(CI->getArgOperand(0),
- ConstantInt::get(CI->getType(), 0x7F));
-}
-
-Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) {
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(0), Str))
- return nullptr;
-
- return convertStrToNumber(CI, Str, 10);
-}
-
-Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) {
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(0), Str))
- return nullptr;
-
- if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
- return nullptr;
-
- if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
- return convertStrToNumber(CI, Str, CInt->getSExtValue());
- }
-
- return nullptr;
-}
-
-//===----------------------------------------------------------------------===//
-// Formatting and IO Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
-
-Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
- int StreamArg) {
- Function *Callee = CI->getCalledFunction();
- // Error reporting calls should be cold, mark them as such.
- // This applies even to non-builtin calls: it is only a hint and applies to
- // functions that the frontend might not understand as builtins.
-
- // This heuristic was suggested in:
- // Improving Static Branch Prediction in a Compiler
- // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
- // Proceedings of PACT'98, Oct. 1998, IEEE
- if (!CI->hasFnAttr(Attribute::Cold) &&
- isReportingError(Callee, CI, StreamArg)) {
- CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
- }
-
- return nullptr;
-}
-
-static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
- if (!Callee || !Callee->isDeclaration())
- return false;
-
- if (StreamArg < 0)
- return true;
-
- // These functions might be considered cold, but only if their stream
- // argument is stderr.
-
- if (StreamArg >= (int)CI->getNumArgOperands())
- return false;
- LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
- if (!LI)
- return false;
- GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
- if (!GV || !GV->isDeclaration())
- return false;
- return GV->getName() == "stderr";
-}
-
-Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
- return nullptr;
-
- // Empty format string -> noop.
- if (FormatStr.empty()) // Tolerate printf's declared void.
- return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
-
- // Do not do any of the following transformations if the printf return value
- // is used, in general the printf return value is not compatible with either
- // putchar() or puts().
- if (!CI->use_empty())
- return nullptr;
-
- // printf("x") -> putchar('x'), even for "%" and "%%".
- if (FormatStr.size() == 1 || FormatStr == "%%")
- return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
-
- // printf("%s", "a") --> putchar('a')
- if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
- StringRef ChrStr;
- if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
- return nullptr;
- if (ChrStr.size() != 1)
- return nullptr;
- return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
- }
-
- // printf("foo\n") --> puts("foo")
- if (FormatStr[FormatStr.size() - 1] == '\n' &&
- FormatStr.find('%') == StringRef::npos) { // No format characters.
- // Create a string literal with no \n on it. We expect the constant merge
- // pass to be run after this pass, to merge duplicate strings.
- FormatStr = FormatStr.drop_back();
- Value *GV = B.CreateGlobalString(FormatStr, "str");
- return emitPutS(GV, B, TLI);
- }
-
- // Optimize specific format strings.
- // printf("%c", chr) --> putchar(chr)
- if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isIntegerTy())
- return emitPutChar(CI->getArgOperand(1), B, TLI);
-
- // printf("%s\n", str) --> puts(str)
- if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
- CI->getArgOperand(1)->getType()->isPointerTy())
- return emitPutS(CI->getArgOperand(1), B, TLI);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
-
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizePrintFString(CI, B)) {
- return V;
- }
-
- // printf(format, ...) -> iprintf(format, ...) if no floating point
- // arguments.
- if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee IPrintFFn =
- M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(IPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point
- // arguments.
- if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_printf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallPrintFFn);
- B.Insert(New);
- return New;
- }
-
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
- IRBuilderBase &B) {
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return nullptr;
-
- // If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 2) {
- // Make sure there's no % in the constant array. We could try to handle
- // %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // we found a format specifier, bail out.
-
- // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
- B.CreateMemCpy(
- CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
- return ConstantInt::get(CI->getType(), FormatStr.size());
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return nullptr;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
- if (!CI->getArgOperand(2)->getType()->isIntegerTy())
- return nullptr;
- Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
- Value *Ptr = castToCStr(CI->getArgOperand(0), B);
- B.CreateStore(V, Ptr);
- Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
- B.CreateStore(B.getInt8(0), Ptr);
-
- return ConstantInt::get(CI->getType(), 1);
- }
-
- if (FormatStr[1] == 's') {
- // sprintf(dest, "%s", str) -> llvm.memcpy(align 1 dest, align 1 str,
- // strlen(str)+1)
- if (!CI->getArgOperand(2)->getType()->isPointerTy())
- return nullptr;
-
+ return;
+
+ // Don't consider calls in other functions.
+ if (CI->getFunction() != F)
+ return;
+
+ Function *Callee = CI->getCalledFunction();
+ LibFunc Func;
+ if (!Callee || !TLI->getLibFunc(*Callee, Func) || !TLI->has(Func) ||
+ !isTrigLibCall(CI))
+ return;
+
+ if (IsFloat) {
+ if (Func == LibFunc_sinpif)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospif)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospif_stret)
+ SinCosCalls.push_back(CI);
+ } else {
+ if (Func == LibFunc_sinpi)
+ SinCalls.push_back(CI);
+ else if (Func == LibFunc_cospi)
+ CosCalls.push_back(CI);
+ else if (Func == LibFunc_sincospi_stret)
+ SinCosCalls.push_back(CI);
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) {
+ // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::cttz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getTrue()}, "cttz");
+ V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+ V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+ Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+ return B.CreateSelect(Cond, V, B.getInt32(0));
+}
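+
+ // For example, ffs(8) becomes cttz(8) + 1 == 4, and the final select
+ // handles ffs(0) == 0 because cttz is emitted with is_zero_undef set.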
+
+Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) {
+ // fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
+ Value *Op = CI->getArgOperand(0);
+ Type *ArgType = Op->getType();
+ Function *F = Intrinsic::getDeclaration(CI->getCalledFunction()->getParent(),
+ Intrinsic::ctlz, ArgType);
+ Value *V = B.CreateCall(F, {Op, B.getFalse()}, "ctlz");
+ V = B.CreateSub(ConstantInt::get(V->getType(), ArgType->getIntegerBitWidth()),
+ V);
+ return B.CreateIntCast(V, CI->getType(), false);
+}
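For reference, the ffs/fls lowerings above correspond to the following source-level C++ sketch. It assumes the GCC/Clang builtins __builtin_ctz and __builtin_clz (which Clang maps to llvm.cttz/llvm.ctlz); the explicit zero guards stand in for the selects, since the builtins are undefined at zero. Function names are illustrative only.

    #include <climits>

    int ffs_lowered(int x) {
      // ffs(x) -> x != 0 ? cttz(x) + 1 : 0
      return x != 0 ? __builtin_ctz(static_cast<unsigned>(x)) + 1 : 0;
    }

    int fls_lowered(int x) {
      // fls(x) -> bitwidth(x) - ctlz(x); the IR form relies on ctlz(0) == bitwidth,
      // so fls(0) folds to 0. The guard here only avoids the builtin's UB at zero.
      return x != 0
                 ? (int)(sizeof(int) * CHAR_BIT) - __builtin_clz(static_cast<unsigned>(x))
                 : 0;
    }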
+
+Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) {
+ // abs(x) -> x <s 0 ? -x : x
+ // The negation has 'nsw' because abs of INT_MIN is undefined.
+ Value *X = CI->getArgOperand(0);
+ Value *IsNeg = B.CreateICmpSLT(X, Constant::getNullValue(X->getType()));
+ Value *NegX = B.CreateNSWNeg(X, "neg");
+ return B.CreateSelect(IsNeg, NegX, X);
+}
+
+Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) {
+ // isdigit(c) -> (c-'0') <u 10
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+ Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) {
+ // isascii(c) -> c <u 128
+ Value *Op = CI->getArgOperand(0);
+ Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+ return B.CreateZExt(Op, CI->getType());
+}
+
+Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) {
+ // toascii(c) -> c & 0x7f
+ return B.CreateAnd(CI->getArgOperand(0),
+ ConstantInt::get(CI->getType(), 0x7F));
+}
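The four integer folds above have simple source-level equivalents; a minimal sketch in plain C++ (the function names are illustrative):

    int  abs_lowered(int x)     { return x < 0 ? -x : x; }            // -x is UB at INT_MIN, matching the nsw negation
    bool isdigit_lowered(int c) { return (unsigned)(c - '0') < 10u; }  // isdigit(c) -> (c-'0') <u 10
    bool isascii_lowered(int c) { return (unsigned)c < 128u; }         // isascii(c) -> c <u 128
    int  toascii_lowered(int c) { return c & 0x7f; }                   // toascii(c) -> c & 0x7f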
+
+Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ return convertStrToNumber(CI, Str, 10);
+}
+
+Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) {
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+ return nullptr;
+
+ if (!isa<ConstantPointerNull>(CI->getArgOperand(1)))
+ return nullptr;
+
+ if (ConstantInt *CInt = dyn_cast<ConstantInt>(CI->getArgOperand(2))) {
+ return convertStrToNumber(CI, Str, CInt->getSExtValue());
+ }
+
+ return nullptr;
+}
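A usage sketch of a call that optimizeStrtol() can fold: the conditions checked above require a constant string, a null endptr, and a constant base, which convertStrToNumber() then evaluates at compile time (assuming the value fits the result type). The function name below is illustrative.

    #include <cstdlib>

    long forty_two() {
      // Constant string, endptr == nullptr, constant base 10: folds to 42L.
      return strtol("42", nullptr, 10);
    }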
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
+
+Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
+ int StreamArg) {
+ Function *Callee = CI->getCalledFunction();
+  // Error reporting calls should be cold; mark them as such.
+ // This applies even to non-builtin calls: it is only a hint and applies to
+ // functions that the frontend might not understand as builtins.
+
+ // This heuristic was suggested in:
+ // Improving Static Branch Prediction in a Compiler
+ // Brian L. Deitrich, Ben-Chung Cheng, Wen-mei W. Hwu
+ // Proceedings of PACT'98, Oct. 1998, IEEE
+ if (!CI->hasFnAttr(Attribute::Cold) &&
+ isReportingError(Callee, CI, StreamArg)) {
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
+ }
+
+ return nullptr;
+}
+
+static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
+ if (!Callee || !Callee->isDeclaration())
+ return false;
+
+ if (StreamArg < 0)
+ return true;
+
+ // These functions might be considered cold, but only if their stream
+ // argument is stderr.
+
+ if (StreamArg >= (int)CI->getNumArgOperands())
+ return false;
+ LoadInst *LI = dyn_cast<LoadInst>(CI->getArgOperand(StreamArg));
+ if (!LI)
+ return false;
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(LI->getPointerOperand());
+ if (!GV || !GV->isDeclaration())
+ return false;
+ return GV->getName() == "stderr";
+}
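For illustration, the caller-side patterns this heuristic marks cold look roughly like the sketch below. fprintf is checked with StreamArg == 0, so its stream operand must be a load of the declared global stderr; perror is dispatched with a negative StreamArg and is treated as always cold. Names and the exact call shapes are illustrative assumptions, not code from the pass.

    #include <cstdio>

    void report_failure(const char *what) {
      // Both calls receive the 'cold' attribute from optimizeErrorReporting():
      fprintf(stderr, "error: %s\n", what);  // stream operand loads the global 'stderr'
      perror(what);                          // no stream argument to inspect
    }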
+
+Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+ return nullptr;
+
+ // Empty format string -> noop.
+ if (FormatStr.empty()) // Tolerate printf's declared void.
+ return CI->use_empty() ? (Value *)CI : ConstantInt::get(CI->getType(), 0);
+
+ // Do not do any of the following transformations if the printf return value
+ // is used, in general the printf return value is not compatible with either
+ // putchar() or puts().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // printf("x") -> putchar('x'), even for "%" and "%%".
+ if (FormatStr.size() == 1 || FormatStr == "%%")
+ return emitPutChar(B.getInt32(FormatStr[0]), B, TLI);
+
+ // printf("%s", "a") --> putchar('a')
+ if (FormatStr == "%s" && CI->getNumArgOperands() > 1) {
+ StringRef ChrStr;
+ if (!getConstantStringInfo(CI->getOperand(1), ChrStr))
+ return nullptr;
+ if (ChrStr.size() != 1)
+ return nullptr;
+ return emitPutChar(B.getInt32(ChrStr[0]), B, TLI);
+ }
+
+ // printf("foo\n") --> puts("foo")
+ if (FormatStr[FormatStr.size() - 1] == '\n' &&
+ FormatStr.find('%') == StringRef::npos) { // No format characters.
+ // Create a string literal with no \n on it. We expect the constant merge
+ // pass to be run after this pass, to merge duplicate strings.
+ FormatStr = FormatStr.drop_back();
+ Value *GV = B.CreateGlobalString(FormatStr, "str");
+ return emitPutS(GV, B, TLI);
+ }
+
+ // Optimize specific format strings.
+ // printf("%c", chr) --> putchar(chr)
+ if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isIntegerTy())
+ return emitPutChar(CI->getArgOperand(1), B, TLI);
+
+ // printf("%s\n", str) --> puts(str)
+ if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+ CI->getArgOperand(1)->getType()->isPointerTy())
+ return emitPutS(CI->getArgOperand(1), B, TLI);
+ return nullptr;
+}
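A before/after sketch of the printf rewrites above, assuming every return value is unused (the bail-out checked earlier); function names are illustrative:

    #include <cstdio>

    void before(const char *s, int c) {
      printf("hello\n");  // trailing '\n', no '%'   -> puts("hello")
      printf("%c", c);    // single integer operand  -> putchar(c)
      printf("%s\n", s);  // pointer operand         -> puts(s)
    }

    void after(const char *s, int c) {
      puts("hello");
      putchar(c);
      puts(s);
    }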
+
+Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
+
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizePrintFString(CI, B)) {
+ return V;
+ }
+
+ // printf(format, ...) -> iprintf(format, ...) if no floating point
+ // arguments.
+ if (TLI->has(LibFunc_iprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee IPrintFFn =
+ M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(IPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // printf(format, ...) -> __small_printf(format, ...) if no 128-bit floating point
+ // arguments.
+ if (TLI->has(LibFunc_small_printf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_printf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 2) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ // sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // sprintf(dest, "%s", str) -> llvm.memcpy(align 1 dest, align 1 str,
+ // strlen(str)+1)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+
if (CI->use_empty())
// sprintf(dest, "%s", str) -> strcpy(dest, str)
return emitStrCpy(CI->getArgOperand(0), CI->getArgOperand(2), B, TLI);
@@ -2546,775 +2546,775 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
if (OptForSize)
return nullptr;
- Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
- if (!Len)
- return nullptr;
- Value *IncLen =
- B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2),
- Align(1), IncLen);
-
- // The sprintf result is the unincremented number of bytes in the string.
- return B.CreateIntCast(Len, CI->getType(), false);
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizeSPrintFString(CI, B)) {
- return V;
- }
-
- // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
- // point arguments.
- if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee SIPrintFFn =
- M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SIPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit
- // floating point arguments.
- if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallSPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallSPrintFFn);
- B.Insert(New);
- return New;
- }
-
- annotateNonNullBasedOnAccess(CI, {0, 1});
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
- IRBuilderBase &B) {
-  // Check for a constant size argument.
- ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- if (!Size)
- return nullptr;
-
- uint64_t N = Size->getZExtValue();
- // Check for a fixed format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
- return nullptr;
-
- // If we just have a format string (nothing else crazy) transform it.
- if (CI->getNumArgOperands() == 3) {
- // Make sure there's no % in the constant array. We could try to handle
- // %% -> % in the future if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // we found a format specifier, bail out.
-
- if (N == 0)
- return ConstantInt::get(CI->getType(), FormatStr.size());
- else if (N < FormatStr.size() + 1)
- return nullptr;
-
- // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
- // strlen(fmt)+1)
- B.CreateMemCpy(
- CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()),
- FormatStr.size() + 1)); // Copy the null byte.
- return ConstantInt::get(CI->getType(), FormatStr.size());
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
- CI->getNumArgOperands() == 4) {
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- if (N == 0)
- return ConstantInt::get(CI->getType(), 1);
- else if (N == 1)
- return nullptr;
-
- // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
- if (!CI->getArgOperand(3)->getType()->isIntegerTy())
- return nullptr;
- Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
- Value *Ptr = castToCStr(CI->getArgOperand(0), B);
- B.CreateStore(V, Ptr);
- Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
- B.CreateStore(B.getInt8(0), Ptr);
-
- return ConstantInt::get(CI->getType(), 1);
- }
-
- if (FormatStr[1] == 's') {
- // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1)
- StringRef Str;
- if (!getConstantStringInfo(CI->getArgOperand(3), Str))
- return nullptr;
-
- if (N == 0)
- return ConstantInt::get(CI->getType(), Str.size());
- else if (N < Str.size() + 1)
- return nullptr;
-
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
- Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
-
- // The snprintf result is the unincremented number of bytes in the string.
- return ConstantInt::get(CI->getType(), Str.size());
- }
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) {
- if (Value *V = optimizeSnPrintFString(CI, B)) {
- return V;
- }
-
- if (isKnownNonZero(CI->getOperand(1), DL))
- annotateNonNullBasedOnAccess(CI, 0);
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
- IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 0);
-
- // All the optimizations depend on the format string.
- StringRef FormatStr;
- if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
- return nullptr;
-
- // Do not do any of the following transformations if the fprintf return
- // value is used, in general the fprintf return value is not compatible
- // with fwrite(), fputc() or fputs().
- if (!CI->use_empty())
- return nullptr;
-
- // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
- if (CI->getNumArgOperands() == 2) {
- // Could handle %% -> % if we cared.
- if (FormatStr.find('%') != StringRef::npos)
- return nullptr; // We found a format specifier.
-
- return emitFWrite(
- CI->getArgOperand(1),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
- CI->getArgOperand(0), B, DL, TLI);
- }
-
- // The remaining optimizations require the format string to be "%s" or "%c"
- // and have an extra operand.
- if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
- CI->getNumArgOperands() < 3)
- return nullptr;
-
- // Decode the second character of the format string.
- if (FormatStr[1] == 'c') {
- // fprintf(F, "%c", chr) --> fputc(chr, F)
- if (!CI->getArgOperand(2)->getType()->isIntegerTy())
- return nullptr;
- return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
- }
-
- if (FormatStr[1] == 's') {
- // fprintf(F, "%s", str) --> fputs(str, F)
- if (!CI->getArgOperand(2)->getType()->isPointerTy())
- return nullptr;
- return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
- Function *Callee = CI->getCalledFunction();
- FunctionType *FT = Callee->getFunctionType();
- if (Value *V = optimizeFPrintFString(CI, B)) {
- return V;
- }
-
- // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
- // floating point arguments.
- if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- FunctionCallee FIPrintFFn =
- M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(FIPrintFFn);
- B.Insert(New);
- return New;
- }
-
- // fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no
- // 128-bit floating point arguments.
- if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) {
- Module *M = B.GetInsertBlock()->getParent()->getParent();
- auto SmallFPrintFFn =
- M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf),
- FT, Callee->getAttributes());
- CallInst *New = cast<CallInst>(CI->clone());
- New->setCalledFunction(SmallFPrintFFn);
- B.Insert(New);
- return New;
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 3);
-
- // Get the element size and count.
- ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
- ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (SizeC && CountC) {
- uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
-
- // If this is writing zero records, remove the call (it's a noop).
- if (Bytes == 0)
- return ConstantInt::get(CI->getType(), 0);
-
- // If this is writing one byte, turn it into fputc.
-    // This optimisation is only valid if the return value is unused.
- if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
- Value *Char = B.CreateLoad(B.getInt8Ty(),
- castToCStr(CI->getArgOperand(0), B), "char");
- Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
- return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
- }
- }
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
- optimizeErrorReporting(CI, B, 1);
-
- // Don't rewrite fputs to fwrite when optimising for size because fwrite
- // requires more arguments and thus extra MOVs are required.
- bool OptForSize = CI->getFunction()->hasOptSize() ||
- llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI,
- PGSOQueryType::IRPass);
- if (OptForSize)
- return nullptr;
-
- // We can't optimize if return value is used.
- if (!CI->use_empty())
- return nullptr;
-
- // fputs(s,F) --> fwrite(s,strlen(s),1,F)
- uint64_t Len = GetStringLength(CI->getArgOperand(0));
- if (!Len)
- return nullptr;
-
- // Known to have no uses (see above).
- return emitFWrite(
- CI->getArgOperand(0),
- ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
- CI->getArgOperand(1), B, DL, TLI);
-}
-
-Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
- annotateNonNullBasedOnAccess(CI, 0);
- if (!CI->use_empty())
- return nullptr;
-
- // Check for a constant string.
- // puts("") -> putchar('\n')
- StringRef Str;
- if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty())
- return emitPutChar(B.getInt32('\n'), B, TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
- // bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
- return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
- Align(1), CI->getArgOperand(2));
-}
-
-bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
- LibFunc Func;
- SmallString<20> FloatFuncName = FuncName;
- FloatFuncName += 'f';
- if (TLI->getLibFunc(FloatFuncName, Func))
- return TLI->has(Func);
- return false;
-}
-
-Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
- IRBuilderBase &Builder) {
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- // Check for string/memory library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
- // Make sure we never change the calling convention.
- assert((ignoreCallingConv(Func) ||
- isCallingConvCCompatible(CI)) &&
- "Optimizing string/memory libcall would change the calling convention");
- switch (Func) {
- case LibFunc_strcat:
- return optimizeStrCat(CI, Builder);
- case LibFunc_strncat:
- return optimizeStrNCat(CI, Builder);
- case LibFunc_strchr:
- return optimizeStrChr(CI, Builder);
- case LibFunc_strrchr:
- return optimizeStrRChr(CI, Builder);
- case LibFunc_strcmp:
- return optimizeStrCmp(CI, Builder);
- case LibFunc_strncmp:
- return optimizeStrNCmp(CI, Builder);
- case LibFunc_strcpy:
- return optimizeStrCpy(CI, Builder);
- case LibFunc_stpcpy:
- return optimizeStpCpy(CI, Builder);
- case LibFunc_strncpy:
- return optimizeStrNCpy(CI, Builder);
- case LibFunc_strlen:
- return optimizeStrLen(CI, Builder);
- case LibFunc_strpbrk:
- return optimizeStrPBrk(CI, Builder);
- case LibFunc_strndup:
- return optimizeStrNDup(CI, Builder);
- case LibFunc_strtol:
- case LibFunc_strtod:
- case LibFunc_strtof:
- case LibFunc_strtoul:
- case LibFunc_strtoll:
- case LibFunc_strtold:
- case LibFunc_strtoull:
- return optimizeStrTo(CI, Builder);
- case LibFunc_strspn:
- return optimizeStrSpn(CI, Builder);
- case LibFunc_strcspn:
- return optimizeStrCSpn(CI, Builder);
- case LibFunc_strstr:
- return optimizeStrStr(CI, Builder);
- case LibFunc_memchr:
- return optimizeMemChr(CI, Builder);
- case LibFunc_memrchr:
- return optimizeMemRChr(CI, Builder);
- case LibFunc_bcmp:
- return optimizeBCmp(CI, Builder);
- case LibFunc_memcmp:
- return optimizeMemCmp(CI, Builder);
- case LibFunc_memcpy:
- return optimizeMemCpy(CI, Builder);
- case LibFunc_memccpy:
- return optimizeMemCCpy(CI, Builder);
- case LibFunc_mempcpy:
- return optimizeMemPCpy(CI, Builder);
- case LibFunc_memmove:
- return optimizeMemMove(CI, Builder);
- case LibFunc_memset:
- return optimizeMemSet(CI, Builder);
- case LibFunc_realloc:
- return optimizeRealloc(CI, Builder);
- case LibFunc_wcslen:
- return optimizeWcslen(CI, Builder);
- case LibFunc_bcopy:
- return optimizeBCopy(CI, Builder);
- default:
- break;
- }
- }
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
- LibFunc Func,
- IRBuilderBase &Builder) {
- // Don't optimize calls that require strict floating point semantics.
- if (CI->isStrictFP())
- return nullptr;
-
- if (Value *V = optimizeTrigReflections(CI, Func, Builder))
- return V;
-
- switch (Func) {
- case LibFunc_sinpif:
- case LibFunc_sinpi:
- case LibFunc_cospif:
- case LibFunc_cospi:
- return optimizeSinCosPi(CI, Builder);
- case LibFunc_powf:
- case LibFunc_pow:
- case LibFunc_powl:
- return optimizePow(CI, Builder);
- case LibFunc_exp2l:
- case LibFunc_exp2:
- case LibFunc_exp2f:
- return optimizeExp2(CI, Builder);
- case LibFunc_fabsf:
- case LibFunc_fabs:
- case LibFunc_fabsl:
- return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
- case LibFunc_sqrtf:
- case LibFunc_sqrt:
- case LibFunc_sqrtl:
- return optimizeSqrt(CI, Builder);
- case LibFunc_logf:
- case LibFunc_log:
- case LibFunc_logl:
- case LibFunc_log10f:
- case LibFunc_log10:
- case LibFunc_log10l:
- case LibFunc_log1pf:
- case LibFunc_log1p:
- case LibFunc_log1pl:
- case LibFunc_log2f:
- case LibFunc_log2:
- case LibFunc_log2l:
- case LibFunc_logbf:
- case LibFunc_logb:
- case LibFunc_logbl:
- return optimizeLog(CI, Builder);
- case LibFunc_tan:
- case LibFunc_tanf:
- case LibFunc_tanl:
- return optimizeTan(CI, Builder);
- case LibFunc_ceil:
- return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
- case LibFunc_floor:
- return replaceUnaryCall(CI, Builder, Intrinsic::floor);
- case LibFunc_round:
- return replaceUnaryCall(CI, Builder, Intrinsic::round);
- case LibFunc_roundeven:
- return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
- case LibFunc_nearbyint:
- return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
- case LibFunc_rint:
- return replaceUnaryCall(CI, Builder, Intrinsic::rint);
- case LibFunc_trunc:
- return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
- case LibFunc_acos:
- case LibFunc_acosh:
- case LibFunc_asin:
- case LibFunc_asinh:
- case LibFunc_atan:
- case LibFunc_atanh:
- case LibFunc_cbrt:
- case LibFunc_cosh:
- case LibFunc_exp:
- case LibFunc_exp10:
- case LibFunc_expm1:
- case LibFunc_cos:
- case LibFunc_sin:
- case LibFunc_sinh:
- case LibFunc_tanh:
- if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeUnaryDoubleFP(CI, Builder, true);
- return nullptr;
- case LibFunc_copysign:
- if (hasFloatVersion(CI->getCalledFunction()->getName()))
- return optimizeBinaryDoubleFP(CI, Builder);
- return nullptr;
- case LibFunc_fminf:
- case LibFunc_fmin:
- case LibFunc_fminl:
- case LibFunc_fmaxf:
- case LibFunc_fmax:
- case LibFunc_fmaxl:
- return optimizeFMinFMax(CI, Builder);
- case LibFunc_cabs:
- case LibFunc_cabsf:
- case LibFunc_cabsl:
- return optimizeCAbs(CI, Builder);
- default:
- return nullptr;
- }
-}
-
-Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
- // TODO: Split out the code below that operates on FP calls so that
-  // we can allow non-FP calls with the StrictFP attribute to be
- // optimized.
- if (CI->isNoBuiltin())
- return nullptr;
-
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- bool isCallingConvC = isCallingConvCCompatible(CI);
-
- SmallVector<OperandBundleDef, 2> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- IRBuilderBase::OperandBundlesGuard Guard(Builder);
- Builder.setDefaultOperandBundles(OpBundles);
-
- // Command-line parameter overrides instruction attribute.
- // This can't be moved to optimizeFloatingPointLibCall() because it may be
- // used by the intrinsic optimizations.
- if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
- UnsafeFPShrink = EnableUnsafeFPShrink;
- else if (isa<FPMathOperator>(CI) && CI->isFast())
- UnsafeFPShrink = true;
-
- // First, check for intrinsics.
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
- if (!isCallingConvC)
- return nullptr;
- // The FP intrinsics have corresponding constrained versions so we don't
- // need to check for the StrictFP attribute here.
- switch (II->getIntrinsicID()) {
- case Intrinsic::pow:
- return optimizePow(CI, Builder);
- case Intrinsic::exp2:
- return optimizeExp2(CI, Builder);
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- return optimizeLog(CI, Builder);
- case Intrinsic::sqrt:
- return optimizeSqrt(CI, Builder);
- // TODO: Use foldMallocMemset() with memset intrinsic.
- case Intrinsic::memset:
- return optimizeMemSet(CI, Builder);
- case Intrinsic::memcpy:
- return optimizeMemCpy(CI, Builder);
- case Intrinsic::memmove:
- return optimizeMemMove(CI, Builder);
- default:
- return nullptr;
- }
- }
-
- // Also try to simplify calls to fortified library functions.
- if (Value *SimplifiedFortifiedCI =
- FortifiedSimplifier.optimizeCall(CI, Builder)) {
- // Try to further simplify the result.
- CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
- if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
- // Ensure that SimplifiedCI's uses are complete, since some calls have
- // their uses analyzed.
- replaceAllUsesWith(CI, SimplifiedCI);
-
- // Set insertion point to SimplifiedCI to guarantee we reach all uses
- // we might replace later on.
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(SimplifiedCI);
- if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
- // If we were able to further simplify, remove the now redundant call.
- substituteInParent(SimplifiedCI, V);
- return V;
- }
- }
- return SimplifiedFortifiedCI;
- }
-
- // Then check for known library functions.
- if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
- // We never change the calling convention.
- if (!ignoreCallingConv(Func) && !isCallingConvC)
- return nullptr;
- if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
- return V;
- if (Value *V = optimizeFloatingPointLibCall(CI, Func, Builder))
- return V;
- switch (Func) {
- case LibFunc_ffs:
- case LibFunc_ffsl:
- case LibFunc_ffsll:
- return optimizeFFS(CI, Builder);
- case LibFunc_fls:
- case LibFunc_flsl:
- case LibFunc_flsll:
- return optimizeFls(CI, Builder);
- case LibFunc_abs:
- case LibFunc_labs:
- case LibFunc_llabs:
- return optimizeAbs(CI, Builder);
- case LibFunc_isdigit:
- return optimizeIsDigit(CI, Builder);
- case LibFunc_isascii:
- return optimizeIsAscii(CI, Builder);
- case LibFunc_toascii:
- return optimizeToAscii(CI, Builder);
- case LibFunc_atoi:
- case LibFunc_atol:
- case LibFunc_atoll:
- return optimizeAtoi(CI, Builder);
- case LibFunc_strtol:
- case LibFunc_strtoll:
- return optimizeStrtol(CI, Builder);
- case LibFunc_printf:
- return optimizePrintF(CI, Builder);
- case LibFunc_sprintf:
- return optimizeSPrintF(CI, Builder);
- case LibFunc_snprintf:
- return optimizeSnPrintF(CI, Builder);
- case LibFunc_fprintf:
- return optimizeFPrintF(CI, Builder);
- case LibFunc_fwrite:
- return optimizeFWrite(CI, Builder);
- case LibFunc_fputs:
- return optimizeFPuts(CI, Builder);
- case LibFunc_puts:
- return optimizePuts(CI, Builder);
- case LibFunc_perror:
- return optimizeErrorReporting(CI, Builder);
- case LibFunc_vfprintf:
- case LibFunc_fiprintf:
- return optimizeErrorReporting(CI, Builder, 0);
- default:
- return nullptr;
- }
- }
- return nullptr;
-}
-
-LibCallSimplifier::LibCallSimplifier(
- const DataLayout &DL, const TargetLibraryInfo *TLI,
- OptimizationRemarkEmitter &ORE,
- BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
- function_ref<void(Instruction *, Value *)> Replacer,
- function_ref<void(Instruction *)> Eraser)
- : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
- UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
-
-void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
- // Indirect through the replacer used in this instance.
- Replacer(I, With);
-}
-
-void LibCallSimplifier::eraseFromParent(Instruction *I) {
- Eraser(I);
-}
-
-// TODO:
-// Additional cases that we need to add to this file:
-//
-// cbrt:
-// * cbrt(expN(X)) -> expN(x/3)
-// * cbrt(sqrt(x)) -> pow(x,1/6)
-// * cbrt(cbrt(x)) -> pow(x,1/9)
-//
-// exp, expf, expl:
-// * exp(log(x)) -> x
-//
-// log, logf, logl:
-// * log(exp(x)) -> x
-// * log(exp(y)) -> y*log(e)
-// * log(exp10(y)) -> y*log(10)
-// * log(sqrt(x)) -> 0.5*log(x)
-//
-// pow, powf, powl:
-// * pow(sqrt(x),y) -> pow(x,y*0.5)
-// * pow(pow(x,y),z)-> pow(x,y*z)
-//
-// signbit:
-// * signbit(cnst) -> cnst'
-// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
-//
-// sqrt, sqrtf, sqrtl:
-// * sqrt(expN(x)) -> expN(x*0.5)
-// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
-// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
-//
-
-//===----------------------------------------------------------------------===//
-// Fortified Library Call Optimizations
-//===----------------------------------------------------------------------===//
-
-bool
-FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
- unsigned ObjSizeOp,
- Optional<unsigned> SizeOp,
- Optional<unsigned> StrOp,
- Optional<unsigned> FlagOp) {
- // If this function takes a flag argument, the implementation may use it to
- // perform extra checks. Don't fold into the non-checking variant.
- if (FlagOp) {
- ConstantInt *Flag = dyn_cast<ConstantInt>(CI->getArgOperand(*FlagOp));
- if (!Flag || !Flag->isZero())
- return false;
- }
-
- if (SizeOp && CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(*SizeOp))
- return true;
-
- if (ConstantInt *ObjSizeCI =
- dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
- if (ObjSizeCI->isMinusOne())
- return true;
- // If the object size wasn't -1 (unknown), bail out if we were asked to.
- if (OnlyLowerUnknownSize)
- return false;
- if (StrOp) {
- uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp));
- // If the length is 0 we don't know how long it is and so we can't
- // remove the check.
- if (Len)
- annotateDereferenceableBytes(CI, *StrOp, Len);
- else
- return false;
- return ObjSizeCI->getZExtValue() >= Len;
- }
-
- if (SizeOp) {
- if (ConstantInt *SizeCI =
- dyn_cast<ConstantInt>(CI->getArgOperand(*SizeOp)))
- return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
- }
- }
- return false;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI =
- B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
- Align(1), CI->getArgOperand(2));
- NewCI->setAttributes(CI->getAttributes());
+ Value *Len = emitStrLen(CI->getArgOperand(2), B, DL, TLI);
+ if (!Len)
+ return nullptr;
+ Value *IncLen =
+ B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2),
+ Align(1), IncLen);
+
+ // The sprintf result is the unincremented number of bytes in the string.
+ return B.CreateIntCast(Len, CI->getType(), false);
+ }
+ return nullptr;
+}
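A sketch of the "%s" path above when the sprintf result is used, so the strlen-plus-memcpy form (rather than strcpy) is emitted; the function name is illustrative:

    #include <cstring>

    int sprintf_s_lowered(char *dst, const char *src) {
      const std::size_t len = std::strlen(src);
      std::memcpy(dst, src, len + 1);  // copy the terminating NUL as well
      return static_cast<int>(len);    // sprintf returns the length, excluding the NUL
    }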
+
+Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeSPrintFString(CI, B)) {
+ return V;
+ }
+
+ // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+ // point arguments.
+ if (TLI->has(LibFunc_siprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee SIPrintFFn =
+ M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // sprintf(str, format, ...) -> __small_sprintf(str, format, ...) if no 128-bit
+ // floating point arguments.
+ if (TLI->has(LibFunc_small_sprintf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallSPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_sprintf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallSPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ annotateNonNullBasedOnAccess(CI, {0, 1});
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+  // Check for a constant size argument.
+ ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ if (!Size)
+ return nullptr;
+
+ uint64_t N = Size->getZExtValue();
+ // Check for a fixed format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(2), FormatStr))
+ return nullptr;
+
+ // If we just have a format string (nothing else crazy) transform it.
+ if (CI->getNumArgOperands() == 3) {
+ // Make sure there's no % in the constant array. We could try to handle
+ // %% -> % in the future if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // we found a format specifier, bail out.
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ else if (N < FormatStr.size() + 1)
+ return nullptr;
+
+ // snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
+ // strlen(fmt)+1)
+ B.CreateMemCpy(
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()),
+ FormatStr.size() + 1)); // Copy the null byte.
+ return ConstantInt::get(CI->getType(), FormatStr.size());
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() == 2 && FormatStr[0] == '%' &&
+ CI->getNumArgOperands() == 4) {
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), 1);
+ else if (N == 1)
+ return nullptr;
+
+ // snprintf(dst, size, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+ if (!CI->getArgOperand(3)->getType()->isIntegerTy())
+ return nullptr;
+ Value *V = B.CreateTrunc(CI->getArgOperand(3), B.getInt8Ty(), "char");
+ Value *Ptr = castToCStr(CI->getArgOperand(0), B);
+ B.CreateStore(V, Ptr);
+ Ptr = B.CreateGEP(B.getInt8Ty(), Ptr, B.getInt32(1), "nul");
+ B.CreateStore(B.getInt8(0), Ptr);
+
+ return ConstantInt::get(CI->getType(), 1);
+ }
+
+ if (FormatStr[1] == 's') {
+ // snprintf(dest, size, "%s", str) to llvm.memcpy(dest, str, len+1, 1)
+ StringRef Str;
+ if (!getConstantStringInfo(CI->getArgOperand(3), Str))
+ return nullptr;
+
+ if (N == 0)
+ return ConstantInt::get(CI->getType(), Str.size());
+ else if (N < Str.size() + 1)
+ return nullptr;
+
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
+ Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
+
+ // The snprintf result is the unincremented number of bytes in the string.
+ return ConstantInt::get(CI->getType(), Str.size());
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) {
+ if (Value *V = optimizeSnPrintFString(CI, B)) {
+ return V;
+ }
+
+ if (isKnownNonZero(CI->getOperand(1), DL))
+ annotateNonNullBasedOnAccess(CI, 0);
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 0);
+
+ // All the optimizations depend on the format string.
+ StringRef FormatStr;
+ if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+ return nullptr;
+
+ // Do not do any of the following transformations if the fprintf return
+ // value is used, in general the fprintf return value is not compatible
+ // with fwrite(), fputc() or fputs().
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+ if (CI->getNumArgOperands() == 2) {
+ // Could handle %% -> % if we cared.
+ if (FormatStr.find('%') != StringRef::npos)
+ return nullptr; // We found a format specifier.
+
+ return emitFWrite(
+ CI->getArgOperand(1),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), FormatStr.size()),
+ CI->getArgOperand(0), B, DL, TLI);
+ }
+
+ // The remaining optimizations require the format string to be "%s" or "%c"
+ // and have an extra operand.
+ if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+ CI->getNumArgOperands() < 3)
+ return nullptr;
+
+ // Decode the second character of the format string.
+ if (FormatStr[1] == 'c') {
+ // fprintf(F, "%c", chr) --> fputc(chr, F)
+ if (!CI->getArgOperand(2)->getType()->isIntegerTy())
+ return nullptr;
+ return emitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+
+ if (FormatStr[1] == 's') {
+ // fprintf(F, "%s", str) --> fputs(str, F)
+ if (!CI->getArgOperand(2)->getType()->isPointerTy())
+ return nullptr;
+ return emitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TLI);
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
+ Function *Callee = CI->getCalledFunction();
+ FunctionType *FT = Callee->getFunctionType();
+ if (Value *V = optimizeFPrintFString(CI, B)) {
+ return V;
+ }
+
+ // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+ // floating point arguments.
+ if (TLI->has(LibFunc_fiprintf) && !callHasFloatingPointArgument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ FunctionCallee FIPrintFFn =
+ M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(FIPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ // fprintf(stream, format, ...) -> __small_fprintf(stream, format, ...) if no
+ // 128-bit floating point arguments.
+ if (TLI->has(LibFunc_small_fprintf) && !callHasFP128Argument(CI)) {
+ Module *M = B.GetInsertBlock()->getParent()->getParent();
+ auto SmallFPrintFFn =
+ M->getOrInsertFunction(TLI->getName(LibFunc_small_fprintf),
+ FT, Callee->getAttributes());
+ CallInst *New = cast<CallInst>(CI->clone());
+ New->setCalledFunction(SmallFPrintFFn);
+ B.Insert(New);
+ return New;
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 3);
+
+ // Get the element size and count.
+ ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+ ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (SizeC && CountC) {
+ uint64_t Bytes = SizeC->getZExtValue() * CountC->getZExtValue();
+
+ // If this is writing zero records, remove the call (it's a noop).
+ if (Bytes == 0)
+ return ConstantInt::get(CI->getType(), 0);
+
+ // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid if the return value is unused.
+ if (Bytes == 1 && CI->use_empty()) { // fwrite(S,1,1,F) -> fputc(S[0],F)
+ Value *Char = B.CreateLoad(B.getInt8Ty(),
+ castToCStr(CI->getArgOperand(0), B), "char");
+ Value *NewCI = emitFPutC(Char, CI->getArgOperand(3), B, TLI);
+ return NewCI ? ConstantInt::get(CI->getType(), 1) : nullptr;
+ }
+ }
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
+ optimizeErrorReporting(CI, B, 1);
+
+ // Don't rewrite fputs to fwrite when optimising for size because fwrite
+ // requires more arguments and thus extra MOVs are required.
+ bool OptForSize = CI->getFunction()->hasOptSize() ||
+ llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI,
+ PGSOQueryType::IRPass);
+ if (OptForSize)
+ return nullptr;
+
+ // We can't optimize if return value is used.
+ if (!CI->use_empty())
+ return nullptr;
+
+ // fputs(s,F) --> fwrite(s,strlen(s),1,F)
+ uint64_t Len = GetStringLength(CI->getArgOperand(0));
+ if (!Len)
+ return nullptr;
+
+ // Known to have no uses (see above).
+ return emitFWrite(
+ CI->getArgOperand(0),
+ ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len - 1),
+ CI->getArgOperand(1), B, DL, TLI);
+}
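Sketches of the two I/O rewrites above, assuming the call results are unused and, for fputs, a string whose length is known; names are illustrative:

    #include <cstdio>

    void write_first_byte(const char *s, FILE *f) {
      // fwrite(s, 1, 1, f) -> fputc(s[0], f)
      fputc(s[0], f);
    }

    void write_banner(FILE *f) {
      // fputs("banner", f) -> fwrite("banner", 6, 1, f), since strlen("banner") == 6
      fwrite("banner", 6, 1, f);
    }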
+
+Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
+ annotateNonNullBasedOnAccess(CI, 0);
+ if (!CI->use_empty())
+ return nullptr;
+
+ // Check for a constant string.
+ // puts("") -> putchar('\n')
+ StringRef Str;
+ if (getConstantStringInfo(CI->getArgOperand(0), Str) && Str.empty())
+ return emitPutChar(B.getInt32('\n'), B, TLI);
+
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
+ // bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
+ return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
+ Align(1), CI->getArgOperand(2));
+}
+
+bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
+ LibFunc Func;
+ SmallString<20> FloatFuncName = FuncName;
+ FloatFuncName += 'f';
+ if (TLI->getLibFunc(FloatFuncName, Func))
+ return TLI->has(Func);
+ return false;
+}
+
+Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
+ IRBuilderBase &Builder) {
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ // Check for string/memory library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // Make sure we never change the calling convention.
+ assert((ignoreCallingConv(Func) ||
+ isCallingConvCCompatible(CI)) &&
+ "Optimizing string/memory libcall would change the calling convention");
+ switch (Func) {
+ case LibFunc_strcat:
+ return optimizeStrCat(CI, Builder);
+ case LibFunc_strncat:
+ return optimizeStrNCat(CI, Builder);
+ case LibFunc_strchr:
+ return optimizeStrChr(CI, Builder);
+ case LibFunc_strrchr:
+ return optimizeStrRChr(CI, Builder);
+ case LibFunc_strcmp:
+ return optimizeStrCmp(CI, Builder);
+ case LibFunc_strncmp:
+ return optimizeStrNCmp(CI, Builder);
+ case LibFunc_strcpy:
+ return optimizeStrCpy(CI, Builder);
+ case LibFunc_stpcpy:
+ return optimizeStpCpy(CI, Builder);
+ case LibFunc_strncpy:
+ return optimizeStrNCpy(CI, Builder);
+ case LibFunc_strlen:
+ return optimizeStrLen(CI, Builder);
+ case LibFunc_strpbrk:
+ return optimizeStrPBrk(CI, Builder);
+ case LibFunc_strndup:
+ return optimizeStrNDup(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtod:
+ case LibFunc_strtof:
+ case LibFunc_strtoul:
+ case LibFunc_strtoll:
+ case LibFunc_strtold:
+ case LibFunc_strtoull:
+ return optimizeStrTo(CI, Builder);
+ case LibFunc_strspn:
+ return optimizeStrSpn(CI, Builder);
+ case LibFunc_strcspn:
+ return optimizeStrCSpn(CI, Builder);
+ case LibFunc_strstr:
+ return optimizeStrStr(CI, Builder);
+ case LibFunc_memchr:
+ return optimizeMemChr(CI, Builder);
+ case LibFunc_memrchr:
+ return optimizeMemRChr(CI, Builder);
+ case LibFunc_bcmp:
+ return optimizeBCmp(CI, Builder);
+ case LibFunc_memcmp:
+ return optimizeMemCmp(CI, Builder);
+ case LibFunc_memcpy:
+ return optimizeMemCpy(CI, Builder);
+ case LibFunc_memccpy:
+ return optimizeMemCCpy(CI, Builder);
+ case LibFunc_mempcpy:
+ return optimizeMemPCpy(CI, Builder);
+ case LibFunc_memmove:
+ return optimizeMemMove(CI, Builder);
+ case LibFunc_memset:
+ return optimizeMemSet(CI, Builder);
+ case LibFunc_realloc:
+ return optimizeRealloc(CI, Builder);
+ case LibFunc_wcslen:
+ return optimizeWcslen(CI, Builder);
+ case LibFunc_bcopy:
+ return optimizeBCopy(CI, Builder);
+ default:
+ break;
+ }
+ }
+ return nullptr;
+}
+
+Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
+ LibFunc Func,
+ IRBuilderBase &Builder) {
+ // Don't optimize calls that require strict floating point semantics.
+ if (CI->isStrictFP())
+ return nullptr;
+
+ if (Value *V = optimizeTrigReflections(CI, Func, Builder))
+ return V;
+
+ switch (Func) {
+ case LibFunc_sinpif:
+ case LibFunc_sinpi:
+ case LibFunc_cospif:
+ case LibFunc_cospi:
+ return optimizeSinCosPi(CI, Builder);
+ case LibFunc_powf:
+ case LibFunc_pow:
+ case LibFunc_powl:
+ return optimizePow(CI, Builder);
+ case LibFunc_exp2l:
+ case LibFunc_exp2:
+ case LibFunc_exp2f:
+ return optimizeExp2(CI, Builder);
+ case LibFunc_fabsf:
+ case LibFunc_fabs:
+ case LibFunc_fabsl:
+ return replaceUnaryCall(CI, Builder, Intrinsic::fabs);
+ case LibFunc_sqrtf:
+ case LibFunc_sqrt:
+ case LibFunc_sqrtl:
+ return optimizeSqrt(CI, Builder);
+ case LibFunc_logf:
+ case LibFunc_log:
+ case LibFunc_logl:
+ case LibFunc_log10f:
+ case LibFunc_log10:
+ case LibFunc_log10l:
+ case LibFunc_log1pf:
+ case LibFunc_log1p:
+ case LibFunc_log1pl:
+ case LibFunc_log2f:
+ case LibFunc_log2:
+ case LibFunc_log2l:
+ case LibFunc_logbf:
+ case LibFunc_logb:
+ case LibFunc_logbl:
+ return optimizeLog(CI, Builder);
+ case LibFunc_tan:
+ case LibFunc_tanf:
+ case LibFunc_tanl:
+ return optimizeTan(CI, Builder);
+ case LibFunc_ceil:
+ return replaceUnaryCall(CI, Builder, Intrinsic::ceil);
+ case LibFunc_floor:
+ return replaceUnaryCall(CI, Builder, Intrinsic::floor);
+ case LibFunc_round:
+ return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_roundeven:
+ return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
+ case LibFunc_nearbyint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+ case LibFunc_rint:
+ return replaceUnaryCall(CI, Builder, Intrinsic::rint);
+ case LibFunc_trunc:
+ return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
+ case LibFunc_acos:
+ case LibFunc_acosh:
+ case LibFunc_asin:
+ case LibFunc_asinh:
+ case LibFunc_atan:
+ case LibFunc_atanh:
+ case LibFunc_cbrt:
+ case LibFunc_cosh:
+ case LibFunc_exp:
+ case LibFunc_exp10:
+ case LibFunc_expm1:
+ case LibFunc_cos:
+ case LibFunc_sin:
+ case LibFunc_sinh:
+ case LibFunc_tanh:
+ if (UnsafeFPShrink && hasFloatVersion(CI->getCalledFunction()->getName()))
+ return optimizeUnaryDoubleFP(CI, Builder, true);
+ return nullptr;
+ case LibFunc_copysign:
+ if (hasFloatVersion(CI->getCalledFunction()->getName()))
+ return optimizeBinaryDoubleFP(CI, Builder);
+ return nullptr;
+ case LibFunc_fminf:
+ case LibFunc_fmin:
+ case LibFunc_fminl:
+ case LibFunc_fmaxf:
+ case LibFunc_fmax:
+ case LibFunc_fmaxl:
+ return optimizeFMinFMax(CI, Builder);
+ case LibFunc_cabs:
+ case LibFunc_cabsf:
+ case LibFunc_cabsl:
+ return optimizeCAbs(CI, Builder);
+ default:
+ return nullptr;
+ }
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
+ // TODO: Split out the code below that operates on FP calls so that
+  // we can allow non-FP calls with the StrictFP attribute to be
+ // optimized.
+ if (CI->isNoBuiltin())
+ return nullptr;
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
+
+ // Command-line parameter overrides instruction attribute.
+ // This can't be moved to optimizeFloatingPointLibCall() because it may be
+ // used by the intrinsic optimizations.
+ if (EnableUnsafeFPShrink.getNumOccurrences() > 0)
+ UnsafeFPShrink = EnableUnsafeFPShrink;
+ else if (isa<FPMathOperator>(CI) && CI->isFast())
+ UnsafeFPShrink = true;
+
+ // First, check for intrinsics.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ if (!isCallingConvC)
+ return nullptr;
+ // The FP intrinsics have corresponding constrained versions so we don't
+ // need to check for the StrictFP attribute here.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::pow:
+ return optimizePow(CI, Builder);
+ case Intrinsic::exp2:
+ return optimizeExp2(CI, Builder);
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ return optimizeLog(CI, Builder);
+ case Intrinsic::sqrt:
+ return optimizeSqrt(CI, Builder);
+ // TODO: Use foldMallocMemset() with memset intrinsic.
+ case Intrinsic::memset:
+ return optimizeMemSet(CI, Builder);
+ case Intrinsic::memcpy:
+ return optimizeMemCpy(CI, Builder);
+ case Intrinsic::memmove:
+ return optimizeMemMove(CI, Builder);
+ default:
+ return nullptr;
+ }
+ }
+
+ // Also try to simplify calls to fortified library functions.
+ if (Value *SimplifiedFortifiedCI =
+ FortifiedSimplifier.optimizeCall(CI, Builder)) {
+ // Try to further simplify the result.
+ CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
+ if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
+ // Ensure that SimplifiedCI's uses are complete, since some calls have
+ // their uses analyzed.
+ replaceAllUsesWith(CI, SimplifiedCI);
+
+ // Set insertion point to SimplifiedCI to guarantee we reach all uses
+ // we might replace later on.
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(SimplifiedCI);
+ if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
+ // If we were able to further simplify, remove the now redundant call.
+ substituteInParent(SimplifiedCI, V);
+ return V;
+ }
+ }
+ return SimplifiedFortifiedCI;
+ }
+
+ // Then check for known library functions.
+ if (TLI->getLibFunc(*Callee, Func) && TLI->has(Func)) {
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+ if (Value *V = optimizeStringMemoryLibCall(CI, Builder))
+ return V;
+ if (Value *V = optimizeFloatingPointLibCall(CI, Func, Builder))
+ return V;
+ switch (Func) {
+ case LibFunc_ffs:
+ case LibFunc_ffsl:
+ case LibFunc_ffsll:
+ return optimizeFFS(CI, Builder);
+ case LibFunc_fls:
+ case LibFunc_flsl:
+ case LibFunc_flsll:
+ return optimizeFls(CI, Builder);
+ case LibFunc_abs:
+ case LibFunc_labs:
+ case LibFunc_llabs:
+ return optimizeAbs(CI, Builder);
+ case LibFunc_isdigit:
+ return optimizeIsDigit(CI, Builder);
+ case LibFunc_isascii:
+ return optimizeIsAscii(CI, Builder);
+ case LibFunc_toascii:
+ return optimizeToAscii(CI, Builder);
+ case LibFunc_atoi:
+ case LibFunc_atol:
+ case LibFunc_atoll:
+ return optimizeAtoi(CI, Builder);
+ case LibFunc_strtol:
+ case LibFunc_strtoll:
+ return optimizeStrtol(CI, Builder);
+ case LibFunc_printf:
+ return optimizePrintF(CI, Builder);
+ case LibFunc_sprintf:
+ return optimizeSPrintF(CI, Builder);
+ case LibFunc_snprintf:
+ return optimizeSnPrintF(CI, Builder);
+ case LibFunc_fprintf:
+ return optimizeFPrintF(CI, Builder);
+ case LibFunc_fwrite:
+ return optimizeFWrite(CI, Builder);
+ case LibFunc_fputs:
+ return optimizeFPuts(CI, Builder);
+ case LibFunc_puts:
+ return optimizePuts(CI, Builder);
+ case LibFunc_perror:
+ return optimizeErrorReporting(CI, Builder);
+ case LibFunc_vfprintf:
+ case LibFunc_fiprintf:
+ return optimizeErrorReporting(CI, Builder, 0);
+ default:
+ return nullptr;
+ }
+ }
+ return nullptr;
+}
+
+LibCallSimplifier::LibCallSimplifier(
+ const DataLayout &DL, const TargetLibraryInfo *TLI,
+ OptimizationRemarkEmitter &ORE,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
+ function_ref<void(Instruction *, Value *)> Replacer,
+ function_ref<void(Instruction *)> Eraser)
+ : FortifiedSimplifier(TLI), DL(DL), TLI(TLI), ORE(ORE), BFI(BFI), PSI(PSI),
+ UnsafeFPShrink(false), Replacer(Replacer), Eraser(Eraser) {}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) {
+ // Indirect through the replacer used in this instance.
+ Replacer(I, With);
+}
+
+void LibCallSimplifier::eraseFromParent(Instruction *I) {
+ Eraser(I);
+}
+
+// TODO:
+// Additional cases that we need to add to this file:
+//
+// cbrt:
+// * cbrt(expN(X)) -> expN(x/3)
+// * cbrt(sqrt(x)) -> pow(x,1/6)
+// * cbrt(cbrt(x)) -> pow(x,1/9)
+//
+// exp, expf, expl:
+// * exp(log(x)) -> x
+//
+// log, logf, logl:
+// * log(exp(x)) -> x
+// * log(exp(y)) -> y*log(e)
+// * log(exp10(y)) -> y*log(10)
+// * log(sqrt(x)) -> 0.5*log(x)
+//
+// pow, powf, powl:
+// * pow(sqrt(x),y) -> pow(x,y*0.5)
+// * pow(pow(x,y),z)-> pow(x,y*z)
+//
+// signbit:
+// * signbit(cnst) -> cnst'
+// * signbit(nncst) -> 0 (if pstv is a non-negative constant)
+//
+// sqrt, sqrtf, sqrtl:
+// * sqrt(expN(x)) -> expN(x*0.5)
+// * sqrt(Nroot(x)) -> pow(x,1/(2*N))
+// * sqrt(pow(x,y)) -> pow(|x|,y*0.5)
+//
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+bool
+FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
+ unsigned ObjSizeOp,
+ Optional<unsigned> SizeOp,
+ Optional<unsigned> StrOp,
+ Optional<unsigned> FlagOp) {
+ // If this function takes a flag argument, the implementation may use it to
+ // perform extra checks. Don't fold into the non-checking variant.
+ if (FlagOp) {
+ ConstantInt *Flag = dyn_cast<ConstantInt>(CI->getArgOperand(*FlagOp));
+ if (!Flag || !Flag->isZero())
+ return false;
+ }
+
+ if (SizeOp && CI->getArgOperand(ObjSizeOp) == CI->getArgOperand(*SizeOp))
+ return true;
+
+ if (ConstantInt *ObjSizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(ObjSizeOp))) {
+ if (ObjSizeCI->isMinusOne())
+ return true;
+ // If the object size wasn't -1 (unknown), bail out if we were asked to.
+ if (OnlyLowerUnknownSize)
+ return false;
+ if (StrOp) {
+ uint64_t Len = GetStringLength(CI->getArgOperand(*StrOp));
+ // If the length is 0 we don't know how long it is and so we can't
+ // remove the check.
+ if (Len)
+ annotateDereferenceableBytes(CI, *StrOp, Len);
+ else
+ return false;
+ return ObjSizeCI->getZExtValue() >= Len;
+ }
+
+ if (SizeOp) {
+ if (ConstantInt *SizeCI =
+ dyn_cast<ConstantInt>(CI->getArgOperand(*SizeOp)))
+ return ObjSizeCI->getZExtValue() >= SizeCI->getZExtValue();
+ }
+ }
+ return false;
+}
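For context, a hedged source-level sketch of a checked call this predicate would accept. It assumes the Clang/GCC builtins __builtin___memcpy_chk and __builtin_object_size; because the destination's object size (64) is a constant at least as large as the constant copy length (16), the checked call can be lowered to a plain memcpy. The function name is illustrative.

    void copy_header(char (&dst)[64], const char *src) {
      // ObjSizeOp constant 64 >= SizeOp constant 16, so the runtime check is dropped.
      __builtin___memcpy_chk(dst, src, 16, __builtin_object_size(dst, 0));
    }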
+
+Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ CallInst *NewCI =
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI =
- B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
- Align(1), CI->getArgOperand(2));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ CallInst *NewCI =
+ B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
- IRBuilderBase &B) {
- // TODO: Try foldMallocMemset() here.
-
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
- CI->getArgOperand(2), Align(1));
- NewCI->setAttributes(CI->getAttributes());
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
+ IRBuilderBase &B) {
+ // TODO: Try foldMallocMemset() here.
+
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
+ CI->getArgOperand(2), Align(1));
+ NewCI->setAttributes(CI->getAttributes());
NewCI->removeAttributes(AttributeList::ReturnIndex,
AttributeFuncs::typeIncompatible(NewCI->getType()));
- return CI->getArgOperand(0);
- }
- return nullptr;
-}
-
+ return CI->getArgOperand(0);
+ }
+ return nullptr;
+}
+
Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
IRBuilderBase &B) {
const DataLayout &DL = CI->getModule()->getDataLayout();
@@ -3331,233 +3331,233 @@ Value *FortifiedLibCallSimplifier::optimizeMemPCpyChk(CallInst *CI,
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
- IRBuilderBase &B,
- LibFunc Func) {
- const DataLayout &DL = CI->getModule()->getDataLayout();
- Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
- *ObjSize = CI->getArgOperand(2);
-
- // __stpcpy_chk(x,x,...) -> x+strlen(x)
- if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
- Value *StrLen = emitStrLen(Src, B, DL, TLI);
- return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
- }
-
- // If a) we don't have any length information, or b) we know this will
- // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
- // st[rp]cpy_chk call which may fail at runtime if the size is too long.
- // TODO: It might be nice to get a maximum length out of the possible
- // string lengths for varying.
- if (isFortifiedCallFoldable(CI, 2, None, 1)) {
- if (Func == LibFunc_strcpy_chk)
- return emitStrCpy(Dst, Src, B, TLI);
- else
- return emitStpCpy(Dst, Src, B, TLI);
- }
-
- if (OnlyLowerUnknownSize)
- return nullptr;
-
-  // Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
- uint64_t Len = GetStringLength(Src);
- if (Len)
- annotateDereferenceableBytes(CI, 1, Len);
- else
- return nullptr;
-
- Type *SizeTTy = DL.getIntPtrType(CI->getContext());
- Value *LenV = ConstantInt::get(SizeTTy, Len);
- Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
- // If the function was an __stpcpy_chk, and we were able to fold it into
- // a __memcpy_chk, we still need to return the correct end pointer.
- if (Ret && Func == LibFunc_stpcpy_chk)
- return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
- return Ret;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 1, None, 0))
- return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
- TLI);
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
- IRBuilderBase &B,
- LibFunc Func) {
- if (isFortifiedCallFoldable(CI, 3, 2)) {
- if (Func == LibFunc_strncpy_chk)
- return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
- else
- return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 4, 3))
- return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
+Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
+ IRBuilderBase &B,
+ LibFunc Func) {
+ const DataLayout &DL = CI->getModule()->getDataLayout();
+ Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
+ *ObjSize = CI->getArgOperand(2);
+
+ // __stpcpy_chk(x,x,...) -> x+strlen(x)
+ if (Func == LibFunc_stpcpy_chk && !OnlyLowerUnknownSize && Dst == Src) {
+ Value *StrLen = emitStrLen(Src, B, DL, TLI);
+ return StrLen ? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, StrLen) : nullptr;
+ }
+
+ // If a) we don't have any length information, or b) we know this will
+ // fit then just lower to a plain st[rp]cpy. Otherwise we'll keep our
+ // st[rp]cpy_chk call which may fail at runtime if the size is too long.
+ // TODO: It might be nice to get a maximum length out of the possible
+ // string lengths for varying.
+ if (isFortifiedCallFoldable(CI, 2, None, 1)) {
+ if (Func == LibFunc_strcpy_chk)
+ return emitStrCpy(Dst, Src, B, TLI);
+ else
+ return emitStpCpy(Dst, Src, B, TLI);
+ }
+
+ if (OnlyLowerUnknownSize)
+ return nullptr;
+
+  // Maybe we can still fold __st[rp]cpy_chk to __memcpy_chk.
+ uint64_t Len = GetStringLength(Src);
+ if (Len)
+ annotateDereferenceableBytes(CI, 1, Len);
+ else
+ return nullptr;
+
+ Type *SizeTTy = DL.getIntPtrType(CI->getContext());
+ Value *LenV = ConstantInt::get(SizeTTy, Len);
+ Value *Ret = emitMemCpyChk(Dst, Src, LenV, ObjSize, B, DL, TLI);
+ // If the function was an __stpcpy_chk, and we were able to fold it into
+ // a __memcpy_chk, we still need to return the correct end pointer.
+ if (Ret && Func == LibFunc_stpcpy_chk)
+ return B.CreateGEP(B.getInt8Ty(), Dst, ConstantInt::get(SizeTTy, Len - 1));
+ return Ret;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 1, None, 0))
+ return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
+ TLI);
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
+ IRBuilderBase &B,
+ LibFunc Func) {
+ if (isFortifiedCallFoldable(CI, 3, 2)) {
+ if (Func == LibFunc_strncpy_chk)
+ return emitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+ else
+ return emitStpNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 4, 3))
+ return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 5));
- return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), VariadicArgs, B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
+ return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), VariadicArgs, B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
SmallVector<Value *, 8> VariadicArgs(drop_begin(CI->args(), 4));
- return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
- B, TLI);
- }
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2))
- return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3))
- return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
- return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
- IRBuilderBase &B) {
- if (isFortifiedCallFoldable(CI, 2, None, None, 1))
- return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
- CI->getArgOperand(4), B, TLI);
-
- return nullptr;
-}
-
-Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI,
- IRBuilderBase &Builder) {
- // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
- // Some clang users checked for _chk libcall availability using:
- // __has_builtin(__builtin___memcpy_chk)
- // When compiling with -fno-builtin, this is always true.
- // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we
- // end up with fortified libcalls, which isn't acceptable in a freestanding
- // environment which only provides their non-fortified counterparts.
- //
- // Until we change clang and/or teach external users to check for availability
- // differently, disregard the "nobuiltin" attribute and TLI::has.
- //
- // PR23093.
-
- LibFunc Func;
- Function *Callee = CI->getCalledFunction();
- bool isCallingConvC = isCallingConvCCompatible(CI);
-
- SmallVector<OperandBundleDef, 2> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
-
- IRBuilderBase::OperandBundlesGuard Guard(Builder);
- Builder.setDefaultOperandBundles(OpBundles);
-
-  // First, check that this is a known library function and that the prototype
- // is correct.
- if (!TLI->getLibFunc(*Callee, Func))
- return nullptr;
-
- // We never change the calling convention.
- if (!ignoreCallingConv(Func) && !isCallingConvC)
- return nullptr;
-
- switch (Func) {
- case LibFunc_memcpy_chk:
- return optimizeMemCpyChk(CI, Builder);
+ return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
+ B, TLI);
+ }
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2))
+ return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3))
+ return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
+ return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 2, None, None, 1))
+ return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
+ CI->getArgOperand(4), B, TLI);
+
+ return nullptr;
+}
+
+Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI,
+ IRBuilderBase &Builder) {
+ // FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
+ // Some clang users checked for _chk libcall availability using:
+ // __has_builtin(__builtin___memcpy_chk)
+ // When compiling with -fno-builtin, this is always true.
+ // When passing -ffreestanding/-mkernel, which both imply -fno-builtin, we
+ // end up with fortified libcalls, which isn't acceptable in a freestanding
+ // environment which only provides their non-fortified counterparts.
+ //
+ // Until we change clang and/or teach external users to check for availability
+ // differently, disregard the "nobuiltin" attribute and TLI::has.
+ //
+ // PR23093.
+
+ LibFunc Func;
+ Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ SmallVector<OperandBundleDef, 2> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
+
+  // First, check that this is a known library function and that the prototype
+ // is correct.
+ if (!TLI->getLibFunc(*Callee, Func))
+ return nullptr;
+
+ // We never change the calling convention.
+ if (!ignoreCallingConv(Func) && !isCallingConvC)
+ return nullptr;
+
+ switch (Func) {
+ case LibFunc_memcpy_chk:
+ return optimizeMemCpyChk(CI, Builder);
case LibFunc_mempcpy_chk:
return optimizeMemPCpyChk(CI, Builder);
- case LibFunc_memmove_chk:
- return optimizeMemMoveChk(CI, Builder);
- case LibFunc_memset_chk:
- return optimizeMemSetChk(CI, Builder);
- case LibFunc_stpcpy_chk:
- case LibFunc_strcpy_chk:
- return optimizeStrpCpyChk(CI, Builder, Func);
- case LibFunc_strlen_chk:
- return optimizeStrLenChk(CI, Builder);
- case LibFunc_stpncpy_chk:
- case LibFunc_strncpy_chk:
- return optimizeStrpNCpyChk(CI, Builder, Func);
- case LibFunc_memccpy_chk:
- return optimizeMemCCpyChk(CI, Builder);
- case LibFunc_snprintf_chk:
- return optimizeSNPrintfChk(CI, Builder);
- case LibFunc_sprintf_chk:
- return optimizeSPrintfChk(CI, Builder);
- case LibFunc_strcat_chk:
- return optimizeStrCatChk(CI, Builder);
- case LibFunc_strlcat_chk:
- return optimizeStrLCat(CI, Builder);
- case LibFunc_strncat_chk:
- return optimizeStrNCatChk(CI, Builder);
- case LibFunc_strlcpy_chk:
- return optimizeStrLCpyChk(CI, Builder);
- case LibFunc_vsnprintf_chk:
- return optimizeVSNPrintfChk(CI, Builder);
- case LibFunc_vsprintf_chk:
- return optimizeVSPrintfChk(CI, Builder);
- default:
- break;
- }
- return nullptr;
-}
-
-FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
- const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
- : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
+ case LibFunc_memmove_chk:
+ return optimizeMemMoveChk(CI, Builder);
+ case LibFunc_memset_chk:
+ return optimizeMemSetChk(CI, Builder);
+ case LibFunc_stpcpy_chk:
+ case LibFunc_strcpy_chk:
+ return optimizeStrpCpyChk(CI, Builder, Func);
+ case LibFunc_strlen_chk:
+ return optimizeStrLenChk(CI, Builder);
+ case LibFunc_stpncpy_chk:
+ case LibFunc_strncpy_chk:
+ return optimizeStrpNCpyChk(CI, Builder, Func);
+ case LibFunc_memccpy_chk:
+ return optimizeMemCCpyChk(CI, Builder);
+ case LibFunc_snprintf_chk:
+ return optimizeSNPrintfChk(CI, Builder);
+ case LibFunc_sprintf_chk:
+ return optimizeSPrintfChk(CI, Builder);
+ case LibFunc_strcat_chk:
+ return optimizeStrCatChk(CI, Builder);
+ case LibFunc_strlcat_chk:
+ return optimizeStrLCat(CI, Builder);
+ case LibFunc_strncat_chk:
+ return optimizeStrNCatChk(CI, Builder);
+ case LibFunc_strlcpy_chk:
+ return optimizeStrLCpyChk(CI, Builder);
+ case LibFunc_vsnprintf_chk:
+ return optimizeVSNPrintfChk(CI, Builder);
+ case LibFunc_vsprintf_chk:
+ return optimizeVSPrintfChk(CI, Builder);
+ default:
+ break;
+ }
+ return nullptr;
+}
+
+FortifiedLibCallSimplifier::FortifiedLibCallSimplifier(
+ const TargetLibraryInfo *TLI, bool OnlyLowerUnknownSize)
+ : TLI(TLI), OnlyLowerUnknownSize(OnlyLowerUnknownSize) {}
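For readers coming to this file cold, a minimal driver for the class defined above looks roughly as follows; the surrounding instruction-visiting loop and the replace-and-erase step belong to the caller (this mirrors how instruction-combining passes consume it). It is a sketch assuming a TargetLibraryInfo for the current function is already at hand, not code from this patch.

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/SimplifyLibCalls.h"

using namespace llvm;

// Sketch: try to fold one fortified call site; returns true if it was replaced.
static bool foldFortifiedCall(CallInst *CI, const TargetLibraryInfo *TLI) {
  FortifiedLibCallSimplifier Simplifier(TLI, /*OnlyLowerUnknownSize=*/false);
  IRBuilder<> B(CI); // new instructions are inserted right before CI
  if (Value *With = Simplifier.optimizeCall(CI, B)) {
    CI->replaceAllUsesWith(With); // e.g. __memcpy_chk's uses now see dst
    CI->eraseFromParent();
    return true;
  }
  return false;
}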
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
index 02abd43851..beeb60698f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SizeOpts.cpp
@@ -1,111 +1,111 @@
-//===-- SizeOpts.cpp - code size optimization related code ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains some shared code size optimization related code.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SizeOpts.h"
-
-using namespace llvm;
-
-cl::opt<bool> EnablePGSO(
- "pgso", cl::Hidden, cl::init(true),
- cl::desc("Enable the profile guided size optimizations. "));
-
-cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
- "pgso-lwss-only", cl::Hidden, cl::init(true),
- cl::desc("Apply the profile guided size optimizations only "
- "if the working set size is large (except for cold code.)"));
-
-cl::opt<bool> PGSOColdCodeOnly(
- "pgso-cold-code-only", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code."));
-
-cl::opt<bool> PGSOColdCodeOnlyForInstrPGO(
- "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under instrumentation PGO."));
-
-cl::opt<bool> PGSOColdCodeOnlyForSamplePGO(
- "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under sample PGO."));
-
-cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO(
- "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false),
- cl::desc("Apply the profile guided size optimizations only "
- "to cold code under partial-profile sample PGO."));
-
-cl::opt<bool> ForcePGSO(
- "force-pgso", cl::Hidden, cl::init(false),
- cl::desc("Force the (profiled-guided) size optimizations. "));
-
-cl::opt<int> PgsoCutoffInstrProf(
- "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore,
- cl::desc("The profile guided size optimization profile summary cutoff "
- "for instrumentation profile."));
-
-cl::opt<int> PgsoCutoffSampleProf(
- "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
- cl::desc("The profile guided size optimization profile summary cutoff "
- "for sample profile."));
-
-namespace {
-struct BasicBlockBFIAdapter {
- static bool isFunctionColdInCallGraph(const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionColdInCallGraph(F, BFI);
- }
- static bool isFunctionHotInCallGraphNthPercentile(int CutOff,
- const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
- }
- static bool isFunctionColdInCallGraphNthPercentile(int CutOff,
- const Function *F,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo &BFI) {
- return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI);
- }
- static bool isColdBlock(const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isColdBlock(BB, BFI);
- }
- static bool isHotBlockNthPercentile(int CutOff,
- const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
- }
- static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB,
- ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI) {
- return PSI->isColdBlockNthPercentile(CutOff, BB, BFI);
- }
-};
-} // end anonymous namespace
-
-bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI,
- PGSOQueryType QueryType) {
- return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI,
- QueryType);
-}
-
-bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI,
- PGSOQueryType QueryType) {
- assert(BB);
- return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI,
- QueryType);
-}
+//===-- SizeOpts.cpp - code size optimization related code ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains some shared code size optimization related code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SizeOpts.h"
+
+using namespace llvm;
+
+cl::opt<bool> EnablePGSO(
+ "pgso", cl::Hidden, cl::init(true),
+ cl::desc("Enable the profile guided size optimizations. "));
+
+cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
+ "pgso-lwss-only", cl::Hidden, cl::init(true),
+ cl::desc("Apply the profile guided size optimizations only "
+ "if the working set size is large (except for cold code.)"));
+
+cl::opt<bool> PGSOColdCodeOnly(
+ "pgso-cold-code-only", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code."));
+
+cl::opt<bool> PGSOColdCodeOnlyForInstrPGO(
+ "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under instrumentation PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForSamplePGO(
+ "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under sample PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO(
+ "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under partial-profile sample PGO."));
+
+cl::opt<bool> ForcePGSO(
+ "force-pgso", cl::Hidden, cl::init(false),
+ cl::desc("Force the (profiled-guided) size optimizations. "));
+
+cl::opt<int> PgsoCutoffInstrProf(
+ "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore,
+ cl::desc("The profile guided size optimization profile summary cutoff "
+ "for instrumentation profile."));
+
+cl::opt<int> PgsoCutoffSampleProf(
+ "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
+ cl::desc("The profile guided size optimization profile summary cutoff "
+ "for sample profile."));
+
+namespace {
+struct BasicBlockBFIAdapter {
+ static bool isFunctionColdInCallGraph(const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionColdInCallGraph(F, BFI);
+ }
+ static bool isFunctionHotInCallGraphNthPercentile(int CutOff,
+ const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
+ }
+ static bool isFunctionColdInCallGraphNthPercentile(int CutOff,
+ const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI);
+ }
+ static bool isColdBlock(const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isColdBlock(BB, BFI);
+ }
+ static bool isHotBlockNthPercentile(int CutOff,
+ const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
+ }
+ static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isColdBlockNthPercentile(CutOff, BB, BFI);
+ }
+};
+} // end anonymous namespace
+
+bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ PGSOQueryType QueryType) {
+ return shouldFuncOptimizeForSizeImpl<BasicBlockBFIAdapter>(F, PSI, BFI,
+ QueryType);
+}
+
+bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI,
+ PGSOQueryType QueryType) {
+ assert(BB);
+ return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI,
+ QueryType);
+}
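The two wrappers above are what transforms call to let profile data veto size-increasing changes. A minimal consumer, assuming ProfileSummaryInfo and a (possibly lazily computed) BlockFrequencyInfo are already available in the pass, might look like the sketch below; the helper name is invented for illustration.

#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Transforms/Utils/SizeOpts.h"

using namespace llvm;

// Sketch: decide whether a size-increasing transform may run on F.
static bool mayGrowCode(const Function &F, ProfileSummaryInfo *PSI,
                        BlockFrequencyInfo *BFI) {
  // Typical call-site pattern: honour the optsize/minsize attributes and,
  // when a profile summary is available, the PGSO heuristics controlled by
  // the pgso-* flags defined in this file.
  bool OptForSize = F.hasOptSize() ||
                    shouldOptimizeForSize(&F, PSI, BFI, PGSOQueryType::IRPass);
  return !OptForSize;
}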
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
index eb27914fc7..e2c387cb89 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SplitModule.cpp
@@ -1,284 +1,284 @@
-//===- SplitModule.cpp - Split a module into partitions -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the function llvm::SplitModule, which splits a module
-// into multiple linkable partitions. It can be used to implement parallel code
-// generation for link-time optimization.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/SplitModule.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/EquivalenceClasses.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include <algorithm>
-#include <cassert>
-#include <iterator>
-#include <memory>
-#include <queue>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "split-module"
-
-namespace {
-
-using ClusterMapType = EquivalenceClasses<const GlobalValue *>;
-using ComdatMembersType = DenseMap<const Comdat *, const GlobalValue *>;
-using ClusterIDMapType = DenseMap<const GlobalValue *, unsigned>;
-
-} // end anonymous namespace
-
-static void addNonConstUser(ClusterMapType &GVtoClusterMap,
- const GlobalValue *GV, const User *U) {
- assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
-
- if (const Instruction *I = dyn_cast<Instruction>(U)) {
- const GlobalValue *F = I->getParent()->getParent();
- GVtoClusterMap.unionSets(GV, F);
- } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
- isa<GlobalVariable>(U)) {
- GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
- } else {
- llvm_unreachable("Underimplemented use case");
- }
-}
-
-// Adds all GlobalValue users of V to the same cluster as GV.
-static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
- const GlobalValue *GV, const Value *V) {
- for (auto *U : V->users()) {
- SmallVector<const User *, 4> Worklist;
- Worklist.push_back(U);
- while (!Worklist.empty()) {
- const User *UU = Worklist.pop_back_val();
- // For each constant that is not a GV (a pure const) recurse.
- if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
- Worklist.append(UU->user_begin(), UU->user_end());
- continue;
- }
- addNonConstUser(GVtoClusterMap, GV, UU);
- }
- }
-}
-
-// Find partitions for the module such that no locals need to be
-// globalized.
-// Try to pack those partitions into N files in a balanced way, since this
-// roughly equals thread balancing for the backend codegen step.
-static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
- unsigned N) {
- // At this point module should have the proper mix of globals and locals.
- // As we attempt to partition this module, we must not change any
- // locals to globals.
- LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
- << ")functions\n");
- ClusterMapType GVtoClusterMap;
- ComdatMembersType ComdatMembers;
-
- auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
- if (GV.isDeclaration())
- return;
-
- if (!GV.hasName())
- GV.setName("__llvmsplit_unnamed");
-
- // Comdat groups must not be partitioned. For comdat groups that contain
- // locals, record all their members here so we can keep them together.
- // Comdat groups that only contain external globals are already handled by
- // the MD5-based partitioning.
- if (const Comdat *C = GV.getComdat()) {
- auto &Member = ComdatMembers[C];
- if (Member)
- GVtoClusterMap.unionSets(Member, &GV);
- else
- Member = &GV;
- }
-
- // For aliases we should not separate them from their aliasees regardless
- // of linkage.
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
- if (const GlobalObject *Base = GIS->getBaseObject())
- GVtoClusterMap.unionSets(&GV, Base);
- }
-
- if (const Function *F = dyn_cast<Function>(&GV)) {
- for (const BasicBlock &BB : *F) {
- BlockAddress *BA = BlockAddress::lookup(&BB);
- if (!BA || !BA->isConstantUsed())
- continue;
- addAllGlobalValueUsers(GVtoClusterMap, F, BA);
- }
- }
-
- if (GV.hasLocalLinkage())
- addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
- };
-
- llvm::for_each(M->functions(), recordGVSet);
- llvm::for_each(M->globals(), recordGVSet);
- llvm::for_each(M->aliases(), recordGVSet);
-
-  // Assign all GVs to merged clusters while balancing the number of objects
-  // in each.
- auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
- const std::pair<unsigned, unsigned> &b) {
- if (a.second || b.second)
- return a.second > b.second;
- else
- return a.first > b.first;
- };
-
- std::priority_queue<std::pair<unsigned, unsigned>,
- std::vector<std::pair<unsigned, unsigned>>,
- decltype(CompareClusters)>
- BalancinQueue(CompareClusters);
- // Pre-populate priority queue with N slot blanks.
- for (unsigned i = 0; i < N; ++i)
- BalancinQueue.push(std::make_pair(i, 0));
-
- using SortType = std::pair<unsigned, ClusterMapType::iterator>;
-
- SmallVector<SortType, 64> Sets;
- SmallPtrSet<const GlobalValue *, 32> Visited;
-
- // To guarantee determinism, we have to sort SCC according to size.
- // When size is the same, use leader's name.
- for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
- E = GVtoClusterMap.end(); I != E; ++I)
- if (I->isLeader())
- Sets.push_back(
- std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
- GVtoClusterMap.member_end()), I));
-
- llvm::sort(Sets, [](const SortType &a, const SortType &b) {
- if (a.first == b.first)
- return a.second->getData()->getName() > b.second->getData()->getName();
- else
- return a.first > b.first;
- });
-
- for (auto &I : Sets) {
- unsigned CurrentClusterID = BalancinQueue.top().first;
- unsigned CurrentClusterSize = BalancinQueue.top().second;
- BalancinQueue.pop();
-
- LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
- << I.first << ") ----> " << I.second->getData()->getName()
- << "\n");
-
- for (ClusterMapType::member_iterator MI =
- GVtoClusterMap.findLeader(I.second);
- MI != GVtoClusterMap.member_end(); ++MI) {
- if (!Visited.insert(*MI).second)
- continue;
- LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
- << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
- Visited.insert(*MI);
- ClusterIDMap[*MI] = CurrentClusterID;
- CurrentClusterSize++;
- }
- // Add this set size to the number of entries in this cluster.
- BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
- }
-}
-
-static void externalize(GlobalValue *GV) {
- if (GV->hasLocalLinkage()) {
- GV->setLinkage(GlobalValue::ExternalLinkage);
- GV->setVisibility(GlobalValue::HiddenVisibility);
- }
-
- // Unnamed entities must be named consistently between modules. setName will
- // give a distinct name to each such entity.
- if (!GV->hasName())
- GV->setName("__llvmsplit_unnamed");
-}
-
-// Returns whether GV should be in partition (0-based) I of N.
-static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
- if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
- if (const GlobalObject *Base = GIS->getBaseObject())
- GV = Base;
-
- StringRef Name;
- if (const Comdat *C = GV->getComdat())
- Name = C->getName();
- else
- Name = GV->getName();
-
- // Partition by MD5 hash. We only need a few bits for evenness as the number
- // of partitions will generally be in the 1-2 figure range; the low 16 bits
- // are enough.
- MD5 H;
- MD5::MD5Result R;
- H.update(Name);
- H.final(R);
- return (R[0] | (R[1] << 8)) % N == I;
-}
-
-void llvm::SplitModule(
- std::unique_ptr<Module> M, unsigned N,
- function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
- bool PreserveLocals) {
- if (!PreserveLocals) {
- for (Function &F : *M)
- externalize(&F);
- for (GlobalVariable &GV : M->globals())
- externalize(&GV);
- for (GlobalAlias &GA : M->aliases())
- externalize(&GA);
- for (GlobalIFunc &GIF : M->ifuncs())
- externalize(&GIF);
- }
-
- // This performs splitting without a need for externalization, which might not
- // always be possible.
- ClusterIDMapType ClusterIDMap;
- findPartitions(M.get(), ClusterIDMap, N);
-
- // FIXME: We should be able to reuse M as the last partition instead of
- // cloning it.
- for (unsigned I = 0; I < N; ++I) {
- ValueToValueMapTy VMap;
- std::unique_ptr<Module> MPart(
- CloneModule(*M, VMap, [&](const GlobalValue *GV) {
- if (ClusterIDMap.count(GV))
- return (ClusterIDMap[GV] == I);
- else
- return isInPartition(GV, I, N);
- }));
- if (I != 0)
- MPart->setModuleInlineAsm("");
- ModuleCallback(std::move(MPart));
- }
-}
+//===- SplitModule.cpp - Split a module into partitions -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the function llvm::SplitModule, which splits a module
+// into multiple linkable partitions. It can be used to implement parallel code
+// generation for link-time optimization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SplitModule.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <queue>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "split-module"
+
+namespace {
+
+using ClusterMapType = EquivalenceClasses<const GlobalValue *>;
+using ComdatMembersType = DenseMap<const Comdat *, const GlobalValue *>;
+using ClusterIDMapType = DenseMap<const GlobalValue *, unsigned>;
+
+} // end anonymous namespace
+
+static void addNonConstUser(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const User *U) {
+ assert((!isa<Constant>(U) || isa<GlobalValue>(U)) && "Bad user");
+
+ if (const Instruction *I = dyn_cast<Instruction>(U)) {
+ const GlobalValue *F = I->getParent()->getParent();
+ GVtoClusterMap.unionSets(GV, F);
+ } else if (isa<GlobalIndirectSymbol>(U) || isa<Function>(U) ||
+ isa<GlobalVariable>(U)) {
+ GVtoClusterMap.unionSets(GV, cast<GlobalValue>(U));
+ } else {
+ llvm_unreachable("Underimplemented use case");
+ }
+}
+
+// Adds all GlobalValue users of V to the same cluster as GV.
+static void addAllGlobalValueUsers(ClusterMapType &GVtoClusterMap,
+ const GlobalValue *GV, const Value *V) {
+ for (auto *U : V->users()) {
+ SmallVector<const User *, 4> Worklist;
+ Worklist.push_back(U);
+ while (!Worklist.empty()) {
+ const User *UU = Worklist.pop_back_val();
+ // For each constant that is not a GV (a pure const) recurse.
+ if (isa<Constant>(UU) && !isa<GlobalValue>(UU)) {
+ Worklist.append(UU->user_begin(), UU->user_end());
+ continue;
+ }
+ addNonConstUser(GVtoClusterMap, GV, UU);
+ }
+ }
+}
+
+// Find partitions for the module such that no locals need to be
+// globalized.
+// Try to pack those partitions into N files in a balanced way, since this
+// roughly equals thread balancing for the backend codegen step.
+static void findPartitions(Module *M, ClusterIDMapType &ClusterIDMap,
+ unsigned N) {
+ // At this point module should have the proper mix of globals and locals.
+ // As we attempt to partition this module, we must not change any
+ // locals to globals.
+ LLVM_DEBUG(dbgs() << "Partition module with (" << M->size()
+ << ")functions\n");
+ ClusterMapType GVtoClusterMap;
+ ComdatMembersType ComdatMembers;
+
+ auto recordGVSet = [&GVtoClusterMap, &ComdatMembers](GlobalValue &GV) {
+ if (GV.isDeclaration())
+ return;
+
+ if (!GV.hasName())
+ GV.setName("__llvmsplit_unnamed");
+
+ // Comdat groups must not be partitioned. For comdat groups that contain
+ // locals, record all their members here so we can keep them together.
+ // Comdat groups that only contain external globals are already handled by
+ // the MD5-based partitioning.
+ if (const Comdat *C = GV.getComdat()) {
+ auto &Member = ComdatMembers[C];
+ if (Member)
+ GVtoClusterMap.unionSets(Member, &GV);
+ else
+ Member = &GV;
+ }
+
+ // For aliases we should not separate them from their aliasees regardless
+ // of linkage.
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(&GV)) {
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GVtoClusterMap.unionSets(&GV, Base);
+ }
+
+ if (const Function *F = dyn_cast<Function>(&GV)) {
+ for (const BasicBlock &BB : *F) {
+ BlockAddress *BA = BlockAddress::lookup(&BB);
+ if (!BA || !BA->isConstantUsed())
+ continue;
+ addAllGlobalValueUsers(GVtoClusterMap, F, BA);
+ }
+ }
+
+ if (GV.hasLocalLinkage())
+ addAllGlobalValueUsers(GVtoClusterMap, &GV, &GV);
+ };
+
+ llvm::for_each(M->functions(), recordGVSet);
+ llvm::for_each(M->globals(), recordGVSet);
+ llvm::for_each(M->aliases(), recordGVSet);
+
+  // Assign all GVs to merged clusters while balancing the number of objects
+  // in each.
+ auto CompareClusters = [](const std::pair<unsigned, unsigned> &a,
+ const std::pair<unsigned, unsigned> &b) {
+ if (a.second || b.second)
+ return a.second > b.second;
+ else
+ return a.first > b.first;
+ };
+
+ std::priority_queue<std::pair<unsigned, unsigned>,
+ std::vector<std::pair<unsigned, unsigned>>,
+ decltype(CompareClusters)>
+ BalancinQueue(CompareClusters);
+ // Pre-populate priority queue with N slot blanks.
+ for (unsigned i = 0; i < N; ++i)
+ BalancinQueue.push(std::make_pair(i, 0));
+
+ using SortType = std::pair<unsigned, ClusterMapType::iterator>;
+
+ SmallVector<SortType, 64> Sets;
+ SmallPtrSet<const GlobalValue *, 32> Visited;
+
+ // To guarantee determinism, we have to sort SCC according to size.
+ // When size is the same, use leader's name.
+ for (ClusterMapType::iterator I = GVtoClusterMap.begin(),
+ E = GVtoClusterMap.end(); I != E; ++I)
+ if (I->isLeader())
+ Sets.push_back(
+ std::make_pair(std::distance(GVtoClusterMap.member_begin(I),
+ GVtoClusterMap.member_end()), I));
+
+ llvm::sort(Sets, [](const SortType &a, const SortType &b) {
+ if (a.first == b.first)
+ return a.second->getData()->getName() > b.second->getData()->getName();
+ else
+ return a.first > b.first;
+ });
+
+ for (auto &I : Sets) {
+ unsigned CurrentClusterID = BalancinQueue.top().first;
+ unsigned CurrentClusterSize = BalancinQueue.top().second;
+ BalancinQueue.pop();
+
+ LLVM_DEBUG(dbgs() << "Root[" << CurrentClusterID << "] cluster_size("
+ << I.first << ") ----> " << I.second->getData()->getName()
+ << "\n");
+
+ for (ClusterMapType::member_iterator MI =
+ GVtoClusterMap.findLeader(I.second);
+ MI != GVtoClusterMap.member_end(); ++MI) {
+ if (!Visited.insert(*MI).second)
+ continue;
+ LLVM_DEBUG(dbgs() << "----> " << (*MI)->getName()
+ << ((*MI)->hasLocalLinkage() ? " l " : " e ") << "\n");
+ Visited.insert(*MI);
+ ClusterIDMap[*MI] = CurrentClusterID;
+ CurrentClusterSize++;
+ }
+ // Add this set size to the number of entries in this cluster.
+ BalancinQueue.push(std::make_pair(CurrentClusterID, CurrentClusterSize));
+ }
+}
+
+static void externalize(GlobalValue *GV) {
+ if (GV->hasLocalLinkage()) {
+ GV->setLinkage(GlobalValue::ExternalLinkage);
+ GV->setVisibility(GlobalValue::HiddenVisibility);
+ }
+
+ // Unnamed entities must be named consistently between modules. setName will
+ // give a distinct name to each such entity.
+ if (!GV->hasName())
+ GV->setName("__llvmsplit_unnamed");
+}
+
+// Returns whether GV should be in partition (0-based) I of N.
+static bool isInPartition(const GlobalValue *GV, unsigned I, unsigned N) {
+ if (auto *GIS = dyn_cast<GlobalIndirectSymbol>(GV))
+ if (const GlobalObject *Base = GIS->getBaseObject())
+ GV = Base;
+
+ StringRef Name;
+ if (const Comdat *C = GV->getComdat())
+ Name = C->getName();
+ else
+ Name = GV->getName();
+
+ // Partition by MD5 hash. We only need a few bits for evenness as the number
+ // of partitions will generally be in the 1-2 figure range; the low 16 bits
+ // are enough.
+ MD5 H;
+ MD5::MD5Result R;
+ H.update(Name);
+ H.final(R);
+ return (R[0] | (R[1] << 8)) % N == I;
+}
+
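The hash step in isInPartition above is what lets independently processed modules agree on a partition for the same (externalized) symbol without any coordination. Restated as a standalone helper (the function name is invented for illustration):

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"

using namespace llvm;

// Sketch: the same name maps to the same 0-based bucket in every module,
// which is why externalize() can rename locals and the pieces still link.
static unsigned partitionIndexForName(StringRef Name, unsigned N) {
  MD5 H;
  MD5::MD5Result R;
  H.update(Name);
  H.final(R);
  return (R[0] | (R[1] << 8)) % N; // low 16 bits suffice for small N
}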
+void llvm::SplitModule(
+ std::unique_ptr<Module> M, unsigned N,
+ function_ref<void(std::unique_ptr<Module> MPart)> ModuleCallback,
+ bool PreserveLocals) {
+ if (!PreserveLocals) {
+ for (Function &F : *M)
+ externalize(&F);
+ for (GlobalVariable &GV : M->globals())
+ externalize(&GV);
+ for (GlobalAlias &GA : M->aliases())
+ externalize(&GA);
+ for (GlobalIFunc &GIF : M->ifuncs())
+ externalize(&GIF);
+ }
+
+ // This performs splitting without a need for externalization, which might not
+ // always be possible.
+ ClusterIDMapType ClusterIDMap;
+ findPartitions(M.get(), ClusterIDMap, N);
+
+ // FIXME: We should be able to reuse M as the last partition instead of
+ // cloning it.
+ for (unsigned I = 0; I < N; ++I) {
+ ValueToValueMapTy VMap;
+ std::unique_ptr<Module> MPart(
+ CloneModule(*M, VMap, [&](const GlobalValue *GV) {
+ if (ClusterIDMap.count(GV))
+ return (ClusterIDMap[GV] == I);
+ else
+ return isInPartition(GV, I, N);
+ }));
+ if (I != 0)
+ MPart->setModuleInlineAsm("");
+ ModuleCallback(std::move(MPart));
+ }
+}
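A minimal driver for the entry point above, writing each partition to its own bitcode file, can look like the sketch below. The output naming and error handling are invented for illustration; the callback shape and the PreserveLocals flag are as declared in this file.

#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/SplitModule.h"
#include <memory>
#include <string>
#include <system_error>

using namespace llvm;

// Sketch: split M into N linkable pieces named out.0.bc ... out.<N-1>.bc.
static void splitToBitcode(std::unique_ptr<Module> M, unsigned N) {
  unsigned Part = 0;
  SplitModule(
      std::move(M), N,
      [&](std::unique_ptr<Module> MPart) {
        std::error_code EC;
        raw_fd_ostream OS("out." + std::to_string(Part++) + ".bc", EC,
                          sys::fs::OF_None);
        if (EC)
          report_fatal_error("cannot open output file: " + EC.message());
        WriteBitcodeToFile(*MPart, OS);
      },
      /*PreserveLocals=*/false);
}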
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
index 461edd8755..1fa574f04c 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -1,62 +1,62 @@
-//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is a little utility pass that removes the gc.relocates inserted by
-// RewriteStatepointsForGC. Note that the generated IR is incorrect,
-// but this is useful as a single pass in itself, for analysis of IR, without
-// the GC.relocates. The statepoint and gc.result intrinsics would still be
-// present.
-//===----------------------------------------------------------------------===//
-
+//===- StripGCRelocates.cpp - Remove gc.relocates inserted by RewriteStatePoints===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a little utility pass that removes the gc.relocates inserted by
+// RewriteStatepointsForGC. Note that the generated IR is incorrect,
+// but this is useful as a single pass in itself, for analysis of IR, without
+// the GC.relocates. The statepoint and gc.result intrinsics would still be
+// present.
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/StripGCRelocates.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Statepoint.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
static bool stripGCRelocates(Function &F) {
- // Nothing to do for declarations.
- if (F.isDeclaration())
- return false;
- SmallVector<GCRelocateInst *, 20> GCRelocates;
- // TODO: We currently do not handle gc.relocates that are in landing pads,
- // i.e. not bound to a single statepoint token.
- for (Instruction &I : instructions(F)) {
- if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
- if (isa<GCStatepointInst>(GCR->getOperand(0)))
- GCRelocates.push_back(GCR);
- }
- // All gc.relocates are bound to a single statepoint token. The order of
- // visiting gc.relocates for deletion does not matter.
- for (GCRelocateInst *GCRel : GCRelocates) {
- Value *OrigPtr = GCRel->getDerivedPtr();
- Value *ReplaceGCRel = OrigPtr;
-
-    // All gc_relocates are i8 addrspace(1)* typed; we need a bitcast from i8
-    // addrspace(1)* to the type of the OrigPtr if they are not the same.
- if (GCRel->getType() != OrigPtr->getType())
- ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
-
- // Replace all uses of gc.relocate and delete the gc.relocate
-    // There may be unnecessary bitcasts back to the OrigPtr type; an instcombine
- // pass would clear this up.
- GCRel->replaceAllUsesWith(ReplaceGCRel);
- GCRel->eraseFromParent();
- }
- return !GCRelocates.empty();
-}
-
+ // Nothing to do for declarations.
+ if (F.isDeclaration())
+ return false;
+ SmallVector<GCRelocateInst *, 20> GCRelocates;
+ // TODO: We currently do not handle gc.relocates that are in landing pads,
+ // i.e. not bound to a single statepoint token.
+ for (Instruction &I : instructions(F)) {
+ if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
+ if (isa<GCStatepointInst>(GCR->getOperand(0)))
+ GCRelocates.push_back(GCR);
+ }
+ // All gc.relocates are bound to a single statepoint token. The order of
+ // visiting gc.relocates for deletion does not matter.
+ for (GCRelocateInst *GCRel : GCRelocates) {
+ Value *OrigPtr = GCRel->getDerivedPtr();
+ Value *ReplaceGCRel = OrigPtr;
+
+ // All gc_relocates are i8 addrspace(1)* typed, we need a bitcast from i8
+    // All gc_relocates are i8 addrspace(1)* typed; we need a bitcast from i8
+    // addrspace(1)* to the type of the OrigPtr if they are not the same.
+ ReplaceGCRel = new BitCastInst(OrigPtr, GCRel->getType(), "cast", GCRel);
+
+ // Replace all uses of gc.relocate and delete the gc.relocate
+    // There may be unnecessary bitcasts back to the OrigPtr type; an instcombine
+ // pass would clear this up.
+ GCRel->replaceAllUsesWith(ReplaceGCRel);
+ GCRel->eraseFromParent();
+ }
+ return !GCRelocates.empty();
+}
+
PreservedAnalyses StripGCRelocates::run(Function &F,
FunctionAnalysisManager &AM) {
if (!stripGCRelocates(F))
@@ -84,5 +84,5 @@ char StripGCRelocatesLegacy::ID = 0;
} // namespace
INITIALIZE_PASS(StripGCRelocatesLegacy, "strip-gc-relocates",
- "Strip gc.relocates inserted through RewriteStatepointsForGC",
- true, false)
+ "Strip gc.relocates inserted through RewriteStatepointsForGC",
+ true, false)
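To run the utility above outside of opt, the new pass manager wrapper can be scheduled directly; the sketch below is standard new-PM boilerplate with nothing specific to this pass beyond adding StripGCRelocates() to the pipeline. (With opt itself, the legacy registration above exposes it under the -strip-gc-relocates name.)

#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Utils/StripGCRelocates.h"

using namespace llvm;

// Sketch: strip gc.relocates from every function in M using the new PM.
static void stripAllGCRelocates(Module &M) {
  PassBuilder PB;
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  ModulePassManager MPM;
  MPM.addPass(createModuleToFunctionPassAdaptor(StripGCRelocates()));
  MPM.run(M, MAM);
}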
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
index 0a45a21e43..10fda4df51 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/StripNonLineTableDebugInfo.cpp
@@ -1,48 +1,48 @@
-//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
+//===- StripNonLineTableDebugInfo.cpp -- Strip parts of Debug Info --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/StripNonLineTableDebugInfo.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
-namespace {
-
-/// This pass strips all debug info that is not related to line tables.
-/// The result will be the same as if the program were compiled with
-/// -gline-tables-only.
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
+namespace {
+
+/// This pass strips all debug info that is not related to line tables.
+/// The result will be the same as if the program were compiled with
+/// -gline-tables-only.
struct StripNonLineTableDebugLegacyPass : public ModulePass {
- static char ID; // Pass identification, replacement for typeid
+ static char ID; // Pass identification, replacement for typeid
StripNonLineTableDebugLegacyPass() : ModulePass(ID) {
initializeStripNonLineTableDebugLegacyPassPass(
*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
-
- bool runOnModule(Module &M) override {
- return llvm::stripNonLineTableDebugInfo(M);
- }
-};
-}
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ }
+
+ bool runOnModule(Module &M) override {
+ return llvm::stripNonLineTableDebugInfo(M);
+ }
+};
+}
+
char StripNonLineTableDebugLegacyPass::ID = 0;
INITIALIZE_PASS(StripNonLineTableDebugLegacyPass,
"strip-nonlinetable-debuginfo",
- "Strip all debug info except linetables", false, false)
-
+ "Strip all debug info except linetables", false, false)
+
ModulePass *llvm::createStripNonLineTableDebugLegacyPass() {
return new StripNonLineTableDebugLegacyPass();
-}
+}
PreservedAnalyses
StripNonLineTableDebugInfoPass::run(Module &M, ModuleAnalysisManager &AM) {
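Since the pass wrappers in this file all forward to a single utility, the -gline-tables-only effect can also be obtained programmatically with one call; a minimal sketch follows (helper name invented for illustration).

#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// Sketch: reduce M's debug info to what a line-tables-only build would keep.
// Returns true if anything was actually stripped.
static bool reduceToLineTables(Module &M) {
  return stripNonLineTableDebugInfo(M);
}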
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
index 17299dfaf5..ec4ea848a5 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -1,587 +1,587 @@
-//===- SymbolRewriter.cpp - Symbol Rewriter -------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// SymbolRewriter is a LLVM pass which can rewrite symbols transparently within
-// existing code. It is implemented as a compiler pass and is configured via a
-// YAML configuration file.
-//
-// The YAML configuration file format is as follows:
-//
-// RewriteMapFile := RewriteDescriptors
-// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
-// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
-// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
-// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
-// RewriteDescriptorType := Identifier
-// FieldIdentifier := Identifier
-// FieldValue := Identifier
-// Identifier := [0-9a-zA-Z]+
-//
-// Currently, the following descriptor types are supported:
-//
-// - function: (function rewriting)
-// + Source (original name of the function)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-// + Naked (boolean, whether the function is undecorated)
-// - global variable: (external linkage global variable rewriting)
-// + Source (original name of externally visible variable)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-// - global alias: (global alias rewriting)
-// + Source (original name of the aliased name)
-// + Target (explicit transformation)
-// + Transform (pattern transformation)
-//
-// Note that Source and exactly one of [Target, Transform] must be provided.
-//
-// New rewrite descriptors can be created. Adding a new rewrite descriptor
-// involves:
-//
-// a) extending the rewrite descriptor kind enumeration
-// (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
-// b) implementing the new descriptor
-// (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
-// c) extending the rewrite map parser
-// (<anonymous>::RewriteMapParser::parseEntry)
-//
-// Specify to rewrite the symbols using the `-rewrite-symbols` option, and
-// specify the map file to use for the rewriting via the `-rewrite-map-file`
-// option.
-//
-//===----------------------------------------------------------------------===//
-
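The grammar above is easier to see with a concrete map. The sketch below shows one explicit function rename in that format, saved to a file and passed via the -rewrite-map-file option mentioned above; the symbol names are hypothetical, and the exact accepted key casing should be confirmed against RewriteMapParser::parseEntry rather than taken from this illustration.

// Illustrative rewrite map (hypothetical symbol names). Saved as, say,
// map.yaml and used as: opt -rewrite-symbols -rewrite-map-file=map.yaml
//
//   function: {
//     source: coffee,
//     target: tea,
//   }
//
// "global variable:" and "global alias:" entries follow the same shape, and a
// "transform:" field replaces "target:" for pattern-based renames.
static const char ExampleRewriteMap[] =
    "function: {\n"
    "  source: coffee,\n"
    "  target: tea,\n"
    "}\n";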
-#include "llvm/Transforms/Utils/SymbolRewriter.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/ilist.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/IR/Comdat.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalAlias.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/YAMLParser.h"
-#include <memory>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-using namespace SymbolRewriter;
-
-#define DEBUG_TYPE "symbol-rewriter"
-
-static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
- cl::desc("Symbol Rewrite Map"),
- cl::value_desc("filename"),
- cl::Hidden);
-
-static void rewriteComdat(Module &M, GlobalObject *GO,
- const std::string &Source,
- const std::string &Target) {
- if (Comdat *CD = GO->getComdat()) {
- auto &Comdats = M.getComdatSymbolTable();
-
- Comdat *C = M.getOrInsertComdat(Target);
- C->setSelectionKind(CD->getSelectionKind());
- GO->setComdat(C);
-
- Comdats.erase(Comdats.find(Source));
- }
-}
-
-namespace {
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const>
-class ExplicitRewriteDescriptor : public RewriteDescriptor {
-public:
- const std::string Source;
- const std::string Target;
-
- ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
- : RewriteDescriptor(DT),
- Source(std::string(Naked ? StringRef("\01" + S.str()) : S)),
- Target(std::string(T)) {}
-
- bool performOnModule(Module &M) override;
-
- static bool classof(const RewriteDescriptor *RD) {
- return RD->getType() == DT;
- }
-};
-
-} // end anonymous namespace
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const>
-bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
- bool Changed = false;
- if (ValueType *S = (M.*Get)(Source)) {
- if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
- rewriteComdat(M, GO, Source, Target);
-
- if (Value *T = (M.*Get)(Target))
- S->setValueName(T->getValueName());
- else
- S->setName(Target);
-
- Changed = true;
- }
- return Changed;
-}
-
-namespace {
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const,
- iterator_range<typename iplist<ValueType>::iterator>
- (Module::*Iterator)()>
-class PatternRewriteDescriptor : public RewriteDescriptor {
-public:
- const std::string Pattern;
- const std::string Transform;
-
- PatternRewriteDescriptor(StringRef P, StringRef T)
- : RewriteDescriptor(DT), Pattern(std::string(P)),
- Transform(std::string(T)) {}
-
- bool performOnModule(Module &M) override;
-
- static bool classof(const RewriteDescriptor *RD) {
- return RD->getType() == DT;
- }
-};
-
-} // end anonymous namespace
-
-template <RewriteDescriptor::Type DT, typename ValueType,
- ValueType *(Module::*Get)(StringRef) const,
- iterator_range<typename iplist<ValueType>::iterator>
- (Module::*Iterator)()>
-bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
-performOnModule(Module &M) {
- bool Changed = false;
- for (auto &C : (M.*Iterator)()) {
- std::string Error;
-
- std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
- if (!Error.empty())
- report_fatal_error("unable to transform " + C.getName() + " in " +
- M.getModuleIdentifier() + ": " + Error);
-
- if (C.getName() == Name)
- continue;
-
- if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
- rewriteComdat(M, GO, std::string(C.getName()), Name);
-
- if (Value *V = (M.*Get)(Name))
- C.setValueName(V->getValueName());
- else
- C.setName(Name);
-
- Changed = true;
- }
- return Changed;
-}
-
-namespace {
-
-/// Represents a rewrite for an explicitly named (function) symbol. Both the
-/// source function name and target function name of the transformation are
-/// explicitly spelt out.
-using ExplicitRewriteFunctionDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
- &Module::getFunction>;
-
-/// Represents a rewrite for an explicitly named (global variable) symbol. Both
-/// the source variable name and target variable name are spelt out. This
-/// applies only to module level variables.
-using ExplicitRewriteGlobalVariableDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
- GlobalVariable, &Module::getGlobalVariable>;
-
-/// Represents a rewrite for an explicitly named global alias. Both the source
-/// and target name are explicitly spelt out.
-using ExplicitRewriteNamedAliasDescriptor =
- ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
- &Module::getNamedAlias>;
-
-/// Represents a rewrite for functions whose names match a regular expression
-/// pattern. A pattern for the function name and a transformation that
-/// determines the target function name together form the rewrite rule.
-using PatternRewriteFunctionDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
- &Module::getFunction, &Module::functions>;
-
-/// Represents a rewrite for a global variable based upon a matching pattern.
-/// Each global variable matching the provided pattern will be transformed as
-/// described in the transformation pattern for the target. Applies only to
-/// module level variables.
-using PatternRewriteGlobalVariableDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
- GlobalVariable, &Module::getGlobalVariable,
- &Module::globals>;
-
-/// PatternRewriteNamedAliasDescriptor - represents a rewrite for global
-/// aliases which match a given pattern. The provided transformation will be
-/// applied to each of the matching names.
-using PatternRewriteNamedAliasDescriptor =
- PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
- &Module::getNamedAlias, &Module::aliases>;
-
-} // end anonymous namespace
-
-bool RewriteMapParser::parse(const std::string &MapFile,
- RewriteDescriptorList *DL) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> Mapping =
- MemoryBuffer::getFile(MapFile);
-
- if (!Mapping)
- report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
- Mapping.getError().message());
-
- if (!parse(*Mapping, DL))
- report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
-
- return true;
-}
-
-bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
- RewriteDescriptorList *DL) {
- SourceMgr SM;
- yaml::Stream YS(MapFile->getBuffer(), SM);
-
- for (auto &Document : YS) {
- yaml::MappingNode *DescriptorList;
-
- // ignore empty documents
- if (isa<yaml::NullNode>(Document.getRoot()))
- continue;
-
- DescriptorList = dyn_cast<yaml::MappingNode>(Document.getRoot());
- if (!DescriptorList) {
- YS.printError(Document.getRoot(), "DescriptorList node must be a map");
- return false;
- }
-
- for (auto &Descriptor : *DescriptorList)
- if (!parseEntry(YS, Descriptor, DL))
- return false;
- }
-
- return true;
-}
-
-bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
- RewriteDescriptorList *DL) {
- yaml::ScalarNode *Key;
- yaml::MappingNode *Value;
- SmallString<32> KeyStorage;
- StringRef RewriteType;
-
- Key = dyn_cast<yaml::ScalarNode>(Entry.getKey());
- if (!Key) {
- YS.printError(Entry.getKey(), "rewrite type must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::MappingNode>(Entry.getValue());
- if (!Value) {
- YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
- return false;
- }
-
- RewriteType = Key->getValue(KeyStorage);
- if (RewriteType.equals("function"))
- return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global variable"))
- return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
- else if (RewriteType.equals("global alias"))
- return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
-
- YS.printError(Entry.getKey(), "unknown rewrite type");
- return false;
-}
-
-bool RewriteMapParser::
-parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- bool Naked = false;
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("naked")) {
- std::string Undecorated;
-
- Undecorated = std::string(Value->getValue(ValueStorage));
- Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
- } else {
- YS.printError(Field.getKey(), "unknown key for function");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- // TODO see if there is a more elegant solution to selecting the rewrite
- // descriptor type
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
- Source, Target, Naked));
- else
- DL->push_back(
- std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
-
- return true;
-}
-
-bool RewriteMapParser::
-parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor Key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else {
- YS.printError(Field.getKey(), "unknown Key for Global Variable");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
- Source, Target,
- /*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
- Source, Transform));
-
- return true;
-}
-
-bool RewriteMapParser::
-parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
- yaml::MappingNode *Descriptor,
- RewriteDescriptorList *DL) {
- std::string Source;
- std::string Target;
- std::string Transform;
-
- for (auto &Field : *Descriptor) {
- yaml::ScalarNode *Key;
- yaml::ScalarNode *Value;
- SmallString<32> KeyStorage;
- SmallString<32> ValueStorage;
- StringRef KeyValue;
-
- Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
- if (!Key) {
- YS.printError(Field.getKey(), "descriptor key must be a scalar");
- return false;
- }
-
- Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
- if (!Value) {
- YS.printError(Field.getValue(), "descriptor value must be a scalar");
- return false;
- }
-
- KeyValue = Key->getValue(KeyStorage);
- if (KeyValue.equals("source")) {
- std::string Error;
-
- Source = std::string(Value->getValue(ValueStorage));
- if (!Regex(Source).isValid(Error)) {
- YS.printError(Field.getKey(), "invalid regex: " + Error);
- return false;
- }
- } else if (KeyValue.equals("target")) {
- Target = std::string(Value->getValue(ValueStorage));
- } else if (KeyValue.equals("transform")) {
- Transform = std::string(Value->getValue(ValueStorage));
- } else {
- YS.printError(Field.getKey(), "unknown key for Global Alias");
- return false;
- }
- }
-
- if (Transform.empty() == Target.empty()) {
- YS.printError(Descriptor,
- "exactly one of transform or target must be specified");
- return false;
- }
-
- if (!Target.empty())
- DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
- Source, Target,
- /*Naked*/ false));
- else
- DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
- Source, Transform));
-
- return true;
-}
-
-namespace {
-
-class RewriteSymbolsLegacyPass : public ModulePass {
-public:
- static char ID; // Pass identification, replacement for typeid
-
- RewriteSymbolsLegacyPass();
- RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
-
- bool runOnModule(Module &M) override;
-
-private:
- RewriteSymbolPass Impl;
-};
-
-} // end anonymous namespace
-
-char RewriteSymbolsLegacyPass::ID = 0;
-
-RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID) {
- initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
-}
-
-RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
- SymbolRewriter::RewriteDescriptorList &DL)
- : ModulePass(ID), Impl(DL) {}
-
-bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
- return Impl.runImpl(M);
-}
-
-PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!runImpl(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-bool RewriteSymbolPass::runImpl(Module &M) {
- bool Changed;
-
- Changed = false;
- for (auto &Descriptor : Descriptors)
- Changed |= Descriptor->performOnModule(M);
-
- return Changed;
-}
-
-void RewriteSymbolPass::loadAndParseMapFiles() {
- const std::vector<std::string> MapFiles(RewriteMapFiles);
- SymbolRewriter::RewriteMapParser Parser;
-
- for (const auto &MapFile : MapFiles)
- Parser.parse(MapFile, &Descriptors);
-}
-
-INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
- false, false)
-
-ModulePass *llvm::createRewriteSymbolsPass() {
- return new RewriteSymbolsLegacyPass();
-}
-
-ModulePass *
-llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
- return new RewriteSymbolsLegacyPass(DL);
-}
+//===- SymbolRewriter.cpp - Symbol Rewriter -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// SymbolRewriter is an LLVM pass which can rewrite symbols transparently within
+// existing code. It is implemented as a compiler pass and is configured via a
+// YAML configuration file.
+//
+// The YAML configuration file format is as follows:
+//
+// RewriteMapFile := RewriteDescriptors
+// RewriteDescriptors := RewriteDescriptor | RewriteDescriptors
+// RewriteDescriptor := RewriteDescriptorType ':' '{' RewriteDescriptorFields '}'
+// RewriteDescriptorFields := RewriteDescriptorField | RewriteDescriptorFields
+// RewriteDescriptorField := FieldIdentifier ':' FieldValue ','
+// RewriteDescriptorType := Identifier
+// FieldIdentifier := Identifier
+// FieldValue := Identifier
+// Identifier := [0-9a-zA-Z]+
+//
+// Currently, the following descriptor types are supported:
+//
+// - function: (function rewriting)
+// + Source (original name of the function)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// + Naked (boolean, whether the function is undecorated)
+// - global variable: (external linkage global variable rewriting)
+// + Source (original name of externally visible variable)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+// - global alias: (global alias rewriting)
+// + Source (original name of the aliased name)
+// + Target (explicit transformation)
+// + Transform (pattern transformation)
+//
+// Note that source and exactly one of [Target, Transform] must be provided
+//
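+// For illustration only (this example is not part of the upstream header; the
+// symbol names are made up), a rewrite map consistent with the grammar and
+// descriptor fields above might look like:
+//
+//   function: {
+//     source: foo,
+//     target: bar,
+//   }
+//   global variable: {
+//     source: counter_(.*),
+//     transform: shared_\1,
+//   }
+//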
+// New rewrite descriptors can be created. Adding a new rewrite descriptor
+// involves:
+//
+// a) extending the rewrite descriptor kind enumeration
+// (<anonymous>::RewriteDescriptor::RewriteDescriptorType)
+// b) implementing the new descriptor
+// (c.f. <anonymous>::ExplicitRewriteFunctionDescriptor)
+// c) extending the rewrite map parser
+// (<anonymous>::RewriteMapParser::parseEntry)
+//
+// Specify to rewrite the symbols using the `-rewrite-symbols` option, and
+// specify the map file to use for the rewriting via the `-rewrite-map-file`
+// option.
+//
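+// For example, a plausible invocation (illustrative; the file names are made
+// up) would be:
+//
+//   opt -rewrite-symbols -rewrite-map-file=rewrite.map -S in.ll -o out.ll
+//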
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/IR/Comdat.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Regex.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace SymbolRewriter;
+
+#define DEBUG_TYPE "symbol-rewriter"
+
+static cl::list<std::string> RewriteMapFiles("rewrite-map-file",
+ cl::desc("Symbol Rewrite Map"),
+ cl::value_desc("filename"),
+ cl::Hidden);
+
+static void rewriteComdat(Module &M, GlobalObject *GO,
+ const std::string &Source,
+ const std::string &Target) {
+ if (Comdat *CD = GO->getComdat()) {
+ auto &Comdats = M.getComdatSymbolTable();
+
+ Comdat *C = M.getOrInsertComdat(Target);
+ C->setSelectionKind(CD->getSelectionKind());
+ GO->setComdat(C);
+
+ Comdats.erase(Comdats.find(Source));
+ }
+}
+
+namespace {
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const>
+class ExplicitRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Source;
+ const std::string Target;
+
+ ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
+ : RewriteDescriptor(DT),
+ Source(std::string(Naked ? StringRef("\01" + S.str()) : S)),
+ Target(std::string(T)) {}
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+} // end anonymous namespace
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const>
+bool ExplicitRewriteDescriptor<DT, ValueType, Get>::performOnModule(Module &M) {
+ bool Changed = false;
+ if (ValueType *S = (M.*Get)(Source)) {
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(S))
+ rewriteComdat(M, GO, Source, Target);
+
+ if (Value *T = (M.*Get)(Target))
+ S->setValueName(T->getValueName());
+ else
+ S->setName(Target);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+namespace {
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (Module::*Iterator)()>
+class PatternRewriteDescriptor : public RewriteDescriptor {
+public:
+ const std::string Pattern;
+ const std::string Transform;
+
+ PatternRewriteDescriptor(StringRef P, StringRef T)
+ : RewriteDescriptor(DT), Pattern(std::string(P)),
+ Transform(std::string(T)) {}
+
+ bool performOnModule(Module &M) override;
+
+ static bool classof(const RewriteDescriptor *RD) {
+ return RD->getType() == DT;
+ }
+};
+
+} // end anonymous namespace
+
+template <RewriteDescriptor::Type DT, typename ValueType,
+ ValueType *(Module::*Get)(StringRef) const,
+ iterator_range<typename iplist<ValueType>::iterator>
+ (Module::*Iterator)()>
+bool PatternRewriteDescriptor<DT, ValueType, Get, Iterator>::
+performOnModule(Module &M) {
+ bool Changed = false;
+ for (auto &C : (M.*Iterator)()) {
+ std::string Error;
+
+ std::string Name = Regex(Pattern).sub(Transform, C.getName(), &Error);
+ if (!Error.empty())
+ report_fatal_error("unable to transform " + C.getName() + " in " +
+ M.getModuleIdentifier() + ": " + Error);
+
+ if (C.getName() == Name)
+ continue;
+
+ if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
+ rewriteComdat(M, GO, std::string(C.getName()), Name);
+
+ if (Value *V = (M.*Get)(Name))
+ C.setValueName(V->getValueName());
+ else
+ C.setName(Name);
+
+ Changed = true;
+ }
+ return Changed;
+}
+
+namespace {
+
+/// Represents a rewrite for an explicitly named (function) symbol. Both the
+/// source function name and target function name of the transformation are
+/// explicitly spelt out.
+using ExplicitRewriteFunctionDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
+ &Module::getFunction>;
+
+/// Represents a rewrite for an explicitly named (global variable) symbol. Both
+/// the source variable name and target variable name are spelt out. This
+/// applies only to module level variables.
+using ExplicitRewriteGlobalVariableDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ GlobalVariable, &Module::getGlobalVariable>;
+
+/// Represents a rewrite for an explicitly named global alias. Both the source
+/// and target name are explicitly spelt out.
+using ExplicitRewriteNamedAliasDescriptor =
+ ExplicitRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
+ &Module::getNamedAlias>;
+
+/// Represents a rewrite for functions whose names match a regular expression
+/// pattern. A pattern for the function name and a transformation that
+/// determines the target function name together form the rewrite rule.
+using PatternRewriteFunctionDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::Function, Function,
+ &Module::getFunction, &Module::functions>;
+
+/// Represents a rewrite for a global variable based upon a matching pattern.
+/// Each global variable matching the provided pattern will be transformed as
+/// described in the transformation pattern for the target. Applies only to
+/// module level variables.
+using PatternRewriteGlobalVariableDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::GlobalVariable,
+ GlobalVariable, &Module::getGlobalVariable,
+ &Module::globals>;
+
+/// PatternRewriteNamedAliasDescriptor - represents a rewrite for global
+/// aliases which match a given pattern. The provided transformation will be
+/// applied to each of the matching names.
+using PatternRewriteNamedAliasDescriptor =
+ PatternRewriteDescriptor<RewriteDescriptor::Type::NamedAlias, GlobalAlias,
+ &Module::getNamedAlias, &Module::aliases>;
+
+} // end anonymous namespace
+
+bool RewriteMapParser::parse(const std::string &MapFile,
+ RewriteDescriptorList *DL) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> Mapping =
+ MemoryBuffer::getFile(MapFile);
+
+ if (!Mapping)
+ report_fatal_error("unable to read rewrite map '" + MapFile + "': " +
+ Mapping.getError().message());
+
+ if (!parse(*Mapping, DL))
+ report_fatal_error("unable to parse rewrite map '" + MapFile + "'");
+
+ return true;
+}
+
+bool RewriteMapParser::parse(std::unique_ptr<MemoryBuffer> &MapFile,
+ RewriteDescriptorList *DL) {
+ SourceMgr SM;
+ yaml::Stream YS(MapFile->getBuffer(), SM);
+
+ for (auto &Document : YS) {
+ yaml::MappingNode *DescriptorList;
+
+ // ignore empty documents
+ if (isa<yaml::NullNode>(Document.getRoot()))
+ continue;
+
+ DescriptorList = dyn_cast<yaml::MappingNode>(Document.getRoot());
+ if (!DescriptorList) {
+ YS.printError(Document.getRoot(), "DescriptorList node must be a map");
+ return false;
+ }
+
+ for (auto &Descriptor : *DescriptorList)
+ if (!parseEntry(YS, Descriptor, DL))
+ return false;
+ }
+
+ return true;
+}
+
+bool RewriteMapParser::parseEntry(yaml::Stream &YS, yaml::KeyValueNode &Entry,
+ RewriteDescriptorList *DL) {
+ yaml::ScalarNode *Key;
+ yaml::MappingNode *Value;
+ SmallString<32> KeyStorage;
+ StringRef RewriteType;
+
+ Key = dyn_cast<yaml::ScalarNode>(Entry.getKey());
+ if (!Key) {
+ YS.printError(Entry.getKey(), "rewrite type must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::MappingNode>(Entry.getValue());
+ if (!Value) {
+ YS.printError(Entry.getValue(), "rewrite descriptor must be a map");
+ return false;
+ }
+
+ RewriteType = Key->getValue(KeyStorage);
+ if (RewriteType.equals("function"))
+ return parseRewriteFunctionDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global variable"))
+ return parseRewriteGlobalVariableDescriptor(YS, Key, Value, DL);
+ else if (RewriteType.equals("global alias"))
+ return parseRewriteGlobalAliasDescriptor(YS, Key, Value, DL);
+
+ YS.printError(Entry.getKey(), "unknown rewrite type");
+ return false;
+}
+
+bool RewriteMapParser::
+parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ bool Naked = false;
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("naked")) {
+ std::string Undecorated;
+
+ Undecorated = std::string(Value->getValue(ValueStorage));
+ Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
+ } else {
+ YS.printError(Field.getKey(), "unknown key for function");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ // TODO see if there is a more elegant solution to selecting the rewrite
+ // descriptor type
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteFunctionDescriptor>(
+ Source, Target, Naked));
+ else
+ DL->push_back(
+ std::make_unique<PatternRewriteFunctionDescriptor>(Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor Key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else {
+ YS.printError(Field.getKey(), "unknown Key for Global Variable");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteGlobalVariableDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(std::make_unique<PatternRewriteGlobalVariableDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+bool RewriteMapParser::
+parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
+ yaml::MappingNode *Descriptor,
+ RewriteDescriptorList *DL) {
+ std::string Source;
+ std::string Target;
+ std::string Transform;
+
+ for (auto &Field : *Descriptor) {
+ yaml::ScalarNode *Key;
+ yaml::ScalarNode *Value;
+ SmallString<32> KeyStorage;
+ SmallString<32> ValueStorage;
+ StringRef KeyValue;
+
+ Key = dyn_cast<yaml::ScalarNode>(Field.getKey());
+ if (!Key) {
+ YS.printError(Field.getKey(), "descriptor key must be a scalar");
+ return false;
+ }
+
+ Value = dyn_cast<yaml::ScalarNode>(Field.getValue());
+ if (!Value) {
+ YS.printError(Field.getValue(), "descriptor value must be a scalar");
+ return false;
+ }
+
+ KeyValue = Key->getValue(KeyStorage);
+ if (KeyValue.equals("source")) {
+ std::string Error;
+
+ Source = std::string(Value->getValue(ValueStorage));
+ if (!Regex(Source).isValid(Error)) {
+ YS.printError(Field.getKey(), "invalid regex: " + Error);
+ return false;
+ }
+ } else if (KeyValue.equals("target")) {
+ Target = std::string(Value->getValue(ValueStorage));
+ } else if (KeyValue.equals("transform")) {
+ Transform = std::string(Value->getValue(ValueStorage));
+ } else {
+ YS.printError(Field.getKey(), "unknown key for Global Alias");
+ return false;
+ }
+ }
+
+ if (Transform.empty() == Target.empty()) {
+ YS.printError(Descriptor,
+ "exactly one of transform or target must be specified");
+ return false;
+ }
+
+ if (!Target.empty())
+ DL->push_back(std::make_unique<ExplicitRewriteNamedAliasDescriptor>(
+ Source, Target,
+ /*Naked*/ false));
+ else
+ DL->push_back(std::make_unique<PatternRewriteNamedAliasDescriptor>(
+ Source, Transform));
+
+ return true;
+}
+
+namespace {
+
+class RewriteSymbolsLegacyPass : public ModulePass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ RewriteSymbolsLegacyPass();
+ RewriteSymbolsLegacyPass(SymbolRewriter::RewriteDescriptorList &DL);
+
+ bool runOnModule(Module &M) override;
+
+private:
+ RewriteSymbolPass Impl;
+};
+
+} // end anonymous namespace
+
+char RewriteSymbolsLegacyPass::ID = 0;
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass() : ModulePass(ID) {
+ initializeRewriteSymbolsLegacyPassPass(*PassRegistry::getPassRegistry());
+}
+
+RewriteSymbolsLegacyPass::RewriteSymbolsLegacyPass(
+ SymbolRewriter::RewriteDescriptorList &DL)
+ : ModulePass(ID), Impl(DL) {}
+
+bool RewriteSymbolsLegacyPass::runOnModule(Module &M) {
+ return Impl.runImpl(M);
+}
+
+PreservedAnalyses RewriteSymbolPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!runImpl(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+bool RewriteSymbolPass::runImpl(Module &M) {
+ bool Changed;
+
+ Changed = false;
+ for (auto &Descriptor : Descriptors)
+ Changed |= Descriptor->performOnModule(M);
+
+ return Changed;
+}
+
+void RewriteSymbolPass::loadAndParseMapFiles() {
+ const std::vector<std::string> MapFiles(RewriteMapFiles);
+ SymbolRewriter::RewriteMapParser Parser;
+
+ for (const auto &MapFile : MapFiles)
+ Parser.parse(MapFile, &Descriptors);
+}
+
+INITIALIZE_PASS(RewriteSymbolsLegacyPass, "rewrite-symbols", "Rewrite Symbols",
+ false, false)
+
+ModulePass *llvm::createRewriteSymbolsPass() {
+ return new RewriteSymbolsLegacyPass();
+}
+
+ModulePass *
+llvm::createRewriteSymbolsPass(SymbolRewriter::RewriteDescriptorList &DL) {
+ return new RewriteSymbolsLegacyPass(DL);
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index 34e5b067a2..3631733713 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -1,60 +1,60 @@
-//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
+//===- UnifyFunctionExitNodes.cpp - Make all functions have a single exit -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
// This pass is used to ensure that functions have at most one return and one
// unreachable instruction in them.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils.h"
-using namespace llvm;
-
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils.h"
+using namespace llvm;
+
char UnifyFunctionExitNodesLegacyPass::ID = 0;
-
+
UnifyFunctionExitNodesLegacyPass::UnifyFunctionExitNodesLegacyPass()
: FunctionPass(ID) {
initializeUnifyFunctionExitNodesLegacyPassPass(
*PassRegistry::getPassRegistry());
-}
-
+}
+
INITIALIZE_PASS(UnifyFunctionExitNodesLegacyPass, "mergereturn",
- "Unify function exit nodes", false, false)
-
-Pass *llvm::createUnifyFunctionExitNodesPass() {
+ "Unify function exit nodes", false, false)
+
+Pass *llvm::createUnifyFunctionExitNodesPass() {
return new UnifyFunctionExitNodesLegacyPass();
-}
-
+}
+
void UnifyFunctionExitNodesLegacyPass::getAnalysisUsage(
AnalysisUsage &AU) const {
- // We preserve the non-critical-edgeness property
- AU.addPreservedID(BreakCriticalEdgesID);
- // This is a cluster of orthogonal Transforms
- AU.addPreservedID(LowerSwitchID);
-}
-
+ // We preserve the non-critical-edgeness property
+ AU.addPreservedID(BreakCriticalEdgesID);
+ // This is a cluster of orthogonal Transforms
+ AU.addPreservedID(LowerSwitchID);
+}
+
namespace {
bool unifyUnreachableBlocks(Function &F) {
std::vector<BasicBlock *> UnreachableBlocks;
- for (BasicBlock &I : F)
+ for (BasicBlock &I : F)
if (isa<UnreachableInst>(I.getTerminator()))
- UnreachableBlocks.push_back(&I);
-
+ UnreachableBlocks.push_back(&I);
+
if (UnreachableBlocks.size() <= 1)
return false;
-
+
BasicBlock *UnreachableBlock =
BasicBlock::Create(F.getContext(), "UnifiedUnreachableBlock", &F);
new UnreachableInst(F.getContext(), UnreachableBlock);
@@ -62,8 +62,8 @@ bool unifyUnreachableBlocks(Function &F) {
for (BasicBlock *BB : UnreachableBlocks) {
BB->getInstList().pop_back(); // Remove the unreachable inst.
BranchInst::Create(UnreachableBlock, BB);
- }
-
+ }
+
return true;
}
@@ -75,39 +75,39 @@ bool unifyReturnBlocks(Function &F) {
ReturningBlocks.push_back(&I);
if (ReturningBlocks.size() <= 1)
- return false;
-
+ return false;
+
// Insert a new basic block into the function, add PHI nodes (if the function
// returns values), and convert all of the return instructions into
// unconditional branches.
- BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
- "UnifiedReturnBlock", &F);
-
- PHINode *PN = nullptr;
- if (F.getReturnType()->isVoidTy()) {
- ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
- } else {
- // If the function doesn't return void... add a PHI node to the block...
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
- "UnifiedRetVal");
- NewRetBlock->getInstList().push_back(PN);
- ReturnInst::Create(F.getContext(), PN, NewRetBlock);
- }
-
- // Loop over all of the blocks, replacing the return instruction with an
- // unconditional branch.
- for (BasicBlock *BB : ReturningBlocks) {
- // Add an incoming element to the PHI node for every return instruction that
- // is merging into this new block...
- if (PN)
- PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
-
- BB->getInstList().pop_back(); // Remove the return insn
- BranchInst::Create(NewRetBlock, BB);
- }
-
- return true;
-}
+ BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(),
+ "UnifiedReturnBlock", &F);
+
+ PHINode *PN = nullptr;
+ if (F.getReturnType()->isVoidTy()) {
+ ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ } else {
+ // If the function doesn't return void... add a PHI node to the block...
+ PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ NewRetBlock->getInstList().push_back(PN);
+ ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ }
+
+ // Loop over all of the blocks, replacing the return instruction with an
+ // unconditional branch.
+ for (BasicBlock *BB : ReturningBlocks) {
+ // Add an incoming element to the PHI node for every return instruction that
+ // is merging into this new block...
+ if (PN)
+ PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+ BB->getInstList().pop_back(); // Remove the return insn
+ BranchInst::Create(NewRetBlock, BB);
+ }
+
+ return true;
+}
} // namespace
// Unify all exit nodes of the CFG by creating a new BasicBlock, and converting
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
index dc73534be7..0b718ed613 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -1,223 +1,223 @@
-//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// For each natural loop with multiple exit blocks, this pass creates a new
-// block N such that all exiting blocks now branch to N, and then control flow
-// is redistributed to all the original exit blocks.
-//
-// Limitation: This assumes that all terminators in the CFG are direct branches
-// (the "br" instruction). The presence of any other control flow
-// such as indirectbr, switch or callbr will cause an assert.
-//
-//===----------------------------------------------------------------------===//
-
+//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// For each natural loop with multiple exit blocks, this pass creates a new
+// block N such that all exiting blocks now branch to N, and then control flow
+// is redistributed to all the original exit blocks.
+//
+// Limitation: This assumes that all terminators in the CFG are direct branches
+// (the "br" instruction). The presence of any other control flow
+// such as indirectbr, switch or callbr will cause an assert.
+//
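+// For example (illustrative): a loop with exiting blocks %a and %b that branch
+// to exit blocks %x and %y is rewritten so that %a and %b both branch to a new
+// block (named "loop.exit" below), which then routes control to %x or %y
+// through guard blocks according to which exiting block was taken.
+//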
+//===----------------------------------------------------------------------===//
+
#include "llvm/Transforms/Utils/UnifyLoopExits.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Transforms/Utils.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-
-#define DEBUG_TYPE "unify-loop-exits"
-
-using namespace llvm;
-
-namespace {
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "unify-loop-exits"
+
+using namespace llvm;
+
+namespace {
struct UnifyLoopExitsLegacyPass : public FunctionPass {
- static char ID;
+ static char ID;
UnifyLoopExitsLegacyPass() : FunctionPass(ID) {
initializeUnifyLoopExitsLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequiredID(LowerSwitchID);
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addPreservedID(LowerSwitchID);
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- bool runOnFunction(Function &F) override;
-};
-} // namespace
-
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
char UnifyLoopExitsLegacyPass::ID = 0;
-
+
FunctionPass *llvm::createUnifyLoopExitsPass() {
return new UnifyLoopExitsLegacyPass();
}
-
+
INITIALIZE_PASS_BEGIN(UnifyLoopExitsLegacyPass, "unify-loop-exits",
- "Fixup each natural loop to have a single exit block",
- false /* Only looks at CFG */, false /* Analysis Pass */)
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
INITIALIZE_PASS_DEPENDENCY(LowerSwitchLegacyPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(UnifyLoopExitsLegacyPass, "unify-loop-exits",
- "Fixup each natural loop to have a single exit block",
- false /* Only looks at CFG */, false /* Analysis Pass */)
-
-// The current transform introduces new control flow paths which may break the
-// SSA requirement that every def must dominate all its uses. For example,
-// consider a value D defined inside the loop that is used by some instruction
-// U outside the loop. It follows that D dominates U, since the original
-// program has valid SSA form. After merging the exits, all paths from D to U
-// now flow through the unified exit block. In addition, there may be other
-// paths that do not pass through D, but now reach the unified exit
-// block. Thus, D no longer dominates U.
-//
-// Restore the dominance by creating a phi for each such D at the new unified
-// loop exit. But when doing this, ignore any uses U that are in the new unified
-// loop exit, since those were introduced specially when the block was created.
-//
-// The use of SSAUpdater seems like overkill for this operation. The location
-// for creating the new PHI is well-known, and also the set of incoming blocks
-// to the new PHI.
-static void restoreSSA(const DominatorTree &DT, const Loop *L,
- const SetVector<BasicBlock *> &Incoming,
- BasicBlock *LoopExitBlock) {
- using InstVector = SmallVector<Instruction *, 8>;
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// The current transform introduces new control flow paths which may break the
+// SSA requirement that every def must dominate all its uses. For example,
+// consider a value D defined inside the loop that is used by some instruction
+// U outside the loop. It follows that D dominates U, since the original
+// program has valid SSA form. After merging the exits, all paths from D to U
+// now flow through the unified exit block. In addition, there may be other
+// paths that do not pass through D, but now reach the unified exit
+// block. Thus, D no longer dominates U.
+//
+// Restore the dominance by creating a phi for each such D at the new unified
+// loop exit. But when doing this, ignore any uses U that are in the new unified
+// loop exit, since those were introduced specially when the block was created.
+//
+// The use of SSAUpdater seems like overkill for this operation. The location
+// for creating the new PHI is well-known, and also the set of incoming blocks
+// to the new PHI.
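+//
+// For illustration (made-up names): if %v is defined in exiting block %a and
+// used by an instruction %u outside the loop, the unified exit block receives
+//
+//   %v.moved = phi i32 [ %v, %a ], [ undef, %b ]
+//
+// with undef on the edges from exiting blocks that %v does not dominate, and
+// %u is rewritten to use %v.moved instead of %v.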
+static void restoreSSA(const DominatorTree &DT, const Loop *L,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *LoopExitBlock) {
+ using InstVector = SmallVector<Instruction *, 8>;
using IIMap = MapVector<Instruction *, InstVector>;
- IIMap ExternalUsers;
- for (auto BB : L->blocks()) {
- for (auto &I : *BB) {
- for (auto &U : I.uses()) {
- auto UserInst = cast<Instruction>(U.getUser());
- auto UserBlock = UserInst->getParent();
- if (UserBlock == LoopExitBlock)
- continue;
- if (L->contains(UserBlock))
- continue;
- LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
- << BB->getName() << ")"
- << ": " << UserInst->getName() << "("
- << UserBlock->getName() << ")"
- << "\n");
- ExternalUsers[&I].push_back(UserInst);
- }
- }
- }
-
- for (auto II : ExternalUsers) {
- // For each Def used outside the loop, create NewPhi in
- // LoopExitBlock. NewPhi receives Def only along exiting blocks that
- // dominate it, while the remaining values are undefined since those paths
- // didn't exist in the original CFG.
- auto Def = II.first;
- LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
- auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
- Def->getName() + ".moved",
- LoopExitBlock->getTerminator());
- for (auto In : Incoming) {
- LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
- if (Def->getParent() == In || DT.dominates(Def, In)) {
- LLVM_DEBUG(dbgs() << "dominated\n");
- NewPhi->addIncoming(Def, In);
- } else {
- LLVM_DEBUG(dbgs() << "not dominated\n");
- NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
- }
- }
-
- LLVM_DEBUG(dbgs() << "external users:");
- for (auto U : II.second) {
- LLVM_DEBUG(dbgs() << " " << U->getName());
- U->replaceUsesOfWith(Def, NewPhi);
- }
- LLVM_DEBUG(dbgs() << "\n");
- }
-}
-
-static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
- // To unify the loop exits, we need a list of the exiting blocks as
- // well as exit blocks. The functions for locating these lists both
- // traverse the entire loop body. It is more efficient to first
- // locate the exiting blocks and then examine their successors to
- // locate the exit blocks.
- SetVector<BasicBlock *> ExitingBlocks;
- SetVector<BasicBlock *> Exits;
-
- // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
- SmallVector<BasicBlock *, 8> Temp;
- L->getExitingBlocks(Temp);
- for (auto BB : Temp) {
- ExitingBlocks.insert(BB);
- for (auto S : successors(BB)) {
- auto SL = LI.getLoopFor(S);
- // A successor is not an exit if it is directly or indirectly in the
- // current loop.
- if (SL == L || L->contains(SL))
- continue;
- Exits.insert(S);
- }
- }
-
- LLVM_DEBUG(
- dbgs() << "Found exit blocks:";
- for (auto Exit : Exits) {
- dbgs() << " " << Exit->getName();
- }
- dbgs() << "\n";
-
- dbgs() << "Found exiting blocks:";
- for (auto EB : ExitingBlocks) {
- dbgs() << " " << EB->getName();
- }
- dbgs() << "\n";);
-
- if (Exits.size() <= 1) {
- LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
- return false;
- }
-
- SmallVector<BasicBlock *, 8> GuardBlocks;
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
- Exits, "loop.exit");
-
- restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
-
-#if defined(EXPENSIVE_CHECKS)
- assert(DT.verify(DominatorTree::VerificationLevel::Full));
-#else
- assert(DT.verify(DominatorTree::VerificationLevel::Fast));
-#endif // EXPENSIVE_CHECKS
- L->verifyLoop();
-
- // The guard blocks were created outside the loop, so they need to become
- // members of the parent loop.
- if (auto ParentLoop = L->getParentLoop()) {
- for (auto G : GuardBlocks) {
- ParentLoop->addBasicBlockToLoop(G, LI);
- }
- ParentLoop->verifyLoop();
- }
-
-#if defined(EXPENSIVE_CHECKS)
- LI.verify(DT);
-#endif // EXPENSIVE_CHECKS
-
- return true;
-}
-
+ IIMap ExternalUsers;
+ for (auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ for (auto &U : I.uses()) {
+ auto UserInst = cast<Instruction>(U.getUser());
+ auto UserBlock = UserInst->getParent();
+ if (UserBlock == LoopExitBlock)
+ continue;
+ if (L->contains(UserBlock))
+ continue;
+ LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
+ << BB->getName() << ")"
+ << ": " << UserInst->getName() << "("
+ << UserBlock->getName() << ")"
+ << "\n");
+ ExternalUsers[&I].push_back(UserInst);
+ }
+ }
+ }
+
+ for (auto II : ExternalUsers) {
+ // For each Def used outside the loop, create NewPhi in
+ // LoopExitBlock. NewPhi receives Def only along exiting blocks that
+ // dominate it, while the remaining values are undefined since those paths
+ // didn't exist in the original CFG.
+ auto Def = II.first;
+ LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
+ auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
+ Def->getName() + ".moved",
+ LoopExitBlock->getTerminator());
+ for (auto In : Incoming) {
+ LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
+ if (Def->getParent() == In || DT.dominates(Def, In)) {
+ LLVM_DEBUG(dbgs() << "dominated\n");
+ NewPhi->addIncoming(Def, In);
+ } else {
+ LLVM_DEBUG(dbgs() << "not dominated\n");
+ NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "external users:");
+ for (auto U : II.second) {
+ LLVM_DEBUG(dbgs() << " " << U->getName());
+ U->replaceUsesOfWith(Def, NewPhi);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+
+static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
+ // To unify the loop exits, we need a list of the exiting blocks as
+ // well as exit blocks. The functions for locating these lists both
+ // traverse the entire loop body. It is more efficient to first
+ // locate the exiting blocks and then examine their successors to
+ // locate the exit blocks.
+ SetVector<BasicBlock *> ExitingBlocks;
+ SetVector<BasicBlock *> Exits;
+
+ // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
+ SmallVector<BasicBlock *, 8> Temp;
+ L->getExitingBlocks(Temp);
+ for (auto BB : Temp) {
+ ExitingBlocks.insert(BB);
+ for (auto S : successors(BB)) {
+ auto SL = LI.getLoopFor(S);
+ // A successor is not an exit if it is directly or indirectly in the
+ // current loop.
+ if (SL == L || L->contains(SL))
+ continue;
+ Exits.insert(S);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found exit blocks:";
+ for (auto Exit : Exits) {
+ dbgs() << " " << Exit->getName();
+ }
+ dbgs() << "\n";
+
+ dbgs() << "Found exiting blocks:";
+ for (auto EB : ExitingBlocks) {
+ dbgs() << " " << EB->getName();
+ }
+ dbgs() << "\n";);
+
+ if (Exits.size() <= 1) {
+ LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
+ return false;
+ }
+
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
+ Exits, "loop.exit");
+
+ restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
+
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif // EXPENSIVE_CHECKS
+ L->verifyLoop();
+
+ // The guard blocks were created outside the loop, so they need to become
+ // members of the parent loop.
+ if (auto ParentLoop = L->getParentLoop()) {
+ for (auto G : GuardBlocks) {
+ ParentLoop->addBasicBlockToLoop(G, LI);
+ }
+ ParentLoop->verifyLoop();
+ }
+
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+
+ return true;
+}
+
static bool runImpl(LoopInfo &LI, DominatorTree &DT) {
-
- bool Changed = false;
- auto Loops = LI.getLoopsInPreorder();
- for (auto L : Loops) {
- LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: "
- << LI.getLoopDepth(L->getHeader()) << ")\n");
- Changed |= unifyLoopExits(DT, LI, L);
- }
- return Changed;
-}
+
+ bool Changed = false;
+ auto Loops = LI.getLoopsInPreorder();
+ for (auto L : Loops) {
+ LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: "
+ << LI.getLoopDepth(L->getHeader()) << ")\n");
+ Changed |= unifyLoopExits(DT, LI, L);
+ }
+ return Changed;
+}
bool UnifyLoopExitsLegacyPass::runOnFunction(Function &F) {
LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName()
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
index 3afa0b8c77..c57cec6be6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
@@ -1,48 +1,48 @@
-//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements unique naming of internal linkage symbols with option
-// -funique-internal-linkage-symbols.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h"
-#include "llvm/ADT/SmallString.h"
+//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements unique naming of internal linkage symbols with option
+// -funique-internal-linkage-symbols.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/MDBuilder.h"
-#include "llvm/IR/Module.h"
-#include "llvm/InitializePasses.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/MD5.h"
-#include "llvm/Transforms/Utils/ModuleUtils.h"
-
-using namespace llvm;
-
-static bool uniqueifyInternalLinkageNames(Module &M) {
- llvm::MD5 Md5;
- Md5.update(M.getSourceFileName());
- llvm::MD5::MD5Result R;
- Md5.final(R);
- SmallString<32> Str;
- llvm::MD5::stringifyResult(R, Str);
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static bool uniqueifyInternalLinkageNames(Module &M) {
+ llvm::MD5 Md5;
+ Md5.update(M.getSourceFileName());
+ llvm::MD5::MD5Result R;
+ Md5.final(R);
+ SmallString<32> Str;
+ llvm::MD5::stringifyResult(R, Str);
 // Convert the MD5 hash to decimal. Demangler suffixes can either contain
 // numbers or characters but not both.
APInt IntHash = APInt(128, Str.str(), 16);
// Prepend "__uniq" before the hash for tools like profilers to understand that
// this symbol is of internal linkage type.
std::string ModuleNameHash = (Twine(".__uniq.") + Twine(IntHash.toString(10, false))).str();
- bool Changed = false;
+ bool Changed = false;
MDBuilder MDB(M.getContext());
-
- // Append the module hash to all internal linkage functions.
- for (auto &F : M) {
- if (F.hasInternalLinkage()) {
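+ // For example (illustrative, made-up name): an internal function "log" is
+ // renamed to "log.__uniq.<decimal MD5 of the source file name>".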
- F.setName(F.getName() + ModuleNameHash);
+
+ // Append the module hash to all internal linkage functions.
+ for (auto &F : M) {
+ if (F.hasInternalLinkage()) {
+ F.setName(F.getName() + ModuleNameHash);
F.addFnAttr("sample-profile-suffix-elision-policy", "selected");
// Replace linkage names in the debug metadata.
if (DISubprogram *SP = F.getSubprogram()) {
@@ -55,64 +55,64 @@ static bool uniqueifyInternalLinkageNames(Module &M) {
}
}
}
- Changed = true;
- }
- }
-
- // Append the module hash to all internal linkage globals.
- for (auto &GV : M.globals()) {
- if (GV.hasInternalLinkage()) {
- GV.setName(GV.getName() + ModuleNameHash);
- Changed = true;
- }
- }
- return Changed;
-}
-
-namespace {
-
-// Legacy pass that provides a unique name to every internal linkage symbol.
-class UniqueInternalLinkageNamesLegacyPass : public ModulePass {
-
-public:
- /// Pass identification, replacement for typeid
- static char ID;
-
- /// Specify pass name for debug output
- StringRef getPassName() const override {
- return "Unique Internal Linkage Names";
- }
-
- explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) {
- initializeUniqueInternalLinkageNamesLegacyPassPass(
- *PassRegistry::getPassRegistry());
- }
-
- bool runOnModule(Module &M) override {
- return uniqueifyInternalLinkageNames(M);
- }
-};
-
-char UniqueInternalLinkageNamesLegacyPass::ID = 0;
-} // anonymous namespace
-
-PreservedAnalyses
-UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) {
- if (!uniqueifyInternalLinkageNames(M))
- return PreservedAnalyses::all();
-
- return PreservedAnalyses::none();
-}
-
-INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass,
- "unique-internal-linkage-names",
- "Uniqueify internal linkage names", false, false)
-INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass,
- "unique-internal-linkage-names",
- "Uniqueify Internal linkage names", false, false)
-
-namespace llvm {
-ModulePass *createUniqueInternalLinkageNamesPass() {
- return new UniqueInternalLinkageNamesLegacyPass();
-}
-} // namespace llvm
+ Changed = true;
+ }
+ }
+
+ // Append the module hash to all internal linkage globals.
+ for (auto &GV : M.globals()) {
+ if (GV.hasInternalLinkage()) {
+ GV.setName(GV.getName() + ModuleNameHash);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that appends a module-unique suffix to every internal linkage name.
+class UniqueInternalLinkageNamesLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override {
+ return "Unique Internal Linkage Names";
+ }
+
+ explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) {
+ initializeUniqueInternalLinkageNamesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ return uniqueifyInternalLinkageNames(M);
+ }
+};
+
+char UniqueInternalLinkageNamesLegacyPass::ID = 0;
+} // anonymous namespace
+
+PreservedAnalyses
+UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!uniqueifyInternalLinkageNames(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify internal linkage names", false, false)
+INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify Internal linkage names", false, false)
+
+namespace llvm {
+ModulePass *createUniqueInternalLinkageNamesPass() {
+ return new UniqueInternalLinkageNamesLegacyPass();
+}
+} // namespace llvm
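For readers skimming the diff above: the suffix is derived solely from the module's source file name (an MD5 hash rendered in decimal, since demangler suffixes may contain digits or letters but not both) and is appended to every internal-linkage function and global. A minimal standalone sketch of that computation, assuming LLVM 12's MD5/APInt APIs and a hypothetical helper name:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include <string>

// Illustrative only: mirrors the suffix computation in the pass above.
static std::string computeUniqSuffix(llvm::StringRef SourceFileName) {
  llvm::MD5 Md5;
  Md5.update(SourceFileName);          // hash only the module's source path
  llvm::MD5::MD5Result R;
  Md5.final(R);
  llvm::SmallString<32> Hex;
  llvm::MD5::stringifyResult(R, Hex);  // 32 hex characters
  llvm::APInt IntHash(128, Hex.str(), 16);
  // Decimal digits only, so demanglers treat the suffix as a plain number.
  return ".__uniq." + IntHash.toString(10, /*Signed=*/false);
}

A static function foo defined in a.cpp is then emitted as foo.__uniq.<decimal hash>, which keeps internal-linkage symbols from different translation units distinct in profiles.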
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
index b55bfc7d52..73c0532f3f 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/Utils.cpp
@@ -1,66 +1,66 @@
-//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the common initialization infrastructure for the
-// TransformUtils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Utils.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassRegistry.h"
-
-using namespace llvm;
-
-/// initializeTransformUtils - Initialize all passes in the TransformUtils
-/// library.
-void llvm::initializeTransformUtils(PassRegistry &Registry) {
- initializeAddDiscriminatorsLegacyPassPass(Registry);
- initializeAssumeSimplifyPassLegacyPassPass(Registry);
- initializeAssumeBuilderPassLegacyPassPass(Registry);
- initializeBreakCriticalEdgesPass(Registry);
- initializeCanonicalizeAliasesLegacyPassPass(Registry);
- initializeCanonicalizeFreezeInLoopsPass(Registry);
- initializeInstNamerPass(Registry);
- initializeLCSSAWrapperPassPass(Registry);
- initializeLibCallsShrinkWrapLegacyPassPass(Registry);
- initializeLoopSimplifyPass(Registry);
- initializeLowerInvokeLegacyPassPass(Registry);
+//===-- Utils.cpp - TransformUtils Infrastructure -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the common initialization infrastructure for the
+// TransformUtils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Utils.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeTransformUtils - Initialize all passes in the TransformUtils
+/// library.
+void llvm::initializeTransformUtils(PassRegistry &Registry) {
+ initializeAddDiscriminatorsLegacyPassPass(Registry);
+ initializeAssumeSimplifyPassLegacyPassPass(Registry);
+ initializeAssumeBuilderPassLegacyPassPass(Registry);
+ initializeBreakCriticalEdgesPass(Registry);
+ initializeCanonicalizeAliasesLegacyPassPass(Registry);
+ initializeCanonicalizeFreezeInLoopsPass(Registry);
+ initializeInstNamerPass(Registry);
+ initializeLCSSAWrapperPassPass(Registry);
+ initializeLibCallsShrinkWrapLegacyPassPass(Registry);
+ initializeLoopSimplifyPass(Registry);
+ initializeLowerInvokeLegacyPassPass(Registry);
initializeLowerSwitchLegacyPassPass(Registry);
- initializeNameAnonGlobalLegacyPassPass(Registry);
- initializePromoteLegacyPassPass(Registry);
+ initializeNameAnonGlobalLegacyPassPass(Registry);
+ initializePromoteLegacyPassPass(Registry);
initializeStripNonLineTableDebugLegacyPassPass(Registry);
initializeUnifyFunctionExitNodesLegacyPassPass(Registry);
- initializeMetaRenamerPass(Registry);
+ initializeMetaRenamerPass(Registry);
initializeStripGCRelocatesLegacyPass(Registry);
- initializePredicateInfoPrinterLegacyPassPass(Registry);
- initializeInjectTLIMappingsLegacyPass(Registry);
- initializeFixIrreduciblePass(Registry);
+ initializePredicateInfoPrinterLegacyPassPass(Registry);
+ initializeInjectTLIMappingsLegacyPass(Registry);
+ initializeFixIrreduciblePass(Registry);
initializeUnifyLoopExitsLegacyPassPass(Registry);
- initializeUniqueInternalLinkageNamesLegacyPassPass(Registry);
-}
-
-/// LLVMInitializeTransformUtils - C binding for initializeTransformUtils.
-void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
- initializeTransformUtils(*unwrap(R));
-}
-
-void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLowerSwitchPass());
-}
-
-void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createPromoteMemoryToRegisterPass());
-}
-
-void LLVMAddAddDiscriminatorsPass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createAddDiscriminatorsPass());
-}
+ initializeUniqueInternalLinkageNamesLegacyPassPass(Registry);
+}
+
+/// LLVMInitializeTransformUtils - C binding for initializeTransformUtils.
+void LLVMInitializeTransformUtils(LLVMPassRegistryRef R) {
+ initializeTransformUtils(*unwrap(R));
+}
+
+void LLVMAddLowerSwitchPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLowerSwitchPass());
+}
+
+void LLVMAddPromoteMemoryToRegisterPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createPromoteMemoryToRegisterPass());
+}
+
+void LLVMAddAddDiscriminatorsPass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createAddDiscriminatorsPass());
+}
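The three LLVMAdd* wrappers above are the TransformUtils passes exposed through the stable C API. A short sketch of how an embedding application might drive them, assuming the LLVM 12 C headers and a hypothetical runUtilsPasses helper (error handling omitted):

#include "llvm-c/Core.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/Utils.h"

static void runUtilsPasses(LLVMModuleRef M) {
  // Make sure the TransformUtils passes are registered first.
  LLVMInitializeTransformUtils(LLVMGetGlobalPassRegistry());

  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddLowerSwitchPass(PM);              // lower switch to branches
  LLVMAddPromoteMemoryToRegisterPass(PM);  // mem2reg
  LLVMAddAddDiscriminatorsPass(PM);        // DWARF discriminators
  LLVMRunPassManager(PM, M);
  LLVMDisposePassManager(PM);
}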
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
index b718ce8b4d..61cd8595a7 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/VNCoercion.cpp
@@ -1,57 +1,57 @@
-#include "llvm/Transforms/Utils/VNCoercion.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/Support/Debug.h"
-
-#define DEBUG_TYPE "vncoerce"
-
-namespace llvm {
-namespace VNCoercion {
-
-static bool isFirstClassAggregateOrScalableType(Type *Ty) {
- return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
-}
-
-/// Return true if coerceAvailableValueToLoadType will succeed.
-bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
- const DataLayout &DL) {
- Type *StoredTy = StoredVal->getType();
-
- if (StoredTy == LoadTy)
- return true;
-
- // If the loaded/stored value is a first class array/struct, or scalable type,
- // don't try to transform them. We need to be able to bitcast to integer.
- if (isFirstClassAggregateOrScalableType(LoadTy) ||
- isFirstClassAggregateOrScalableType(StoredTy))
- return false;
-
- uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
-
- // The store size must be byte-aligned to support future type casts.
- if (llvm::alignTo(StoreSize, 8) != StoreSize)
- return false;
-
- // The store has to be at least as big as the load.
- if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
- return false;
-
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+
+namespace llvm {
+namespace VNCoercion {
+
+static bool isFirstClassAggregateOrScalableType(Type *Ty) {
+ return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
+}
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+ const DataLayout &DL) {
+ Type *StoredTy = StoredVal->getType();
+
+ if (StoredTy == LoadTy)
+ return true;
+
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy) ||
+ isFirstClassAggregateOrScalableType(StoredTy))
+ return false;
+
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
+
+ // The store size must be byte-aligned to support future type casts.
+ if (llvm::alignTo(StoreSize, 8) != StoreSize)
+ return false;
+
+ // The store has to be at least as big as the load.
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
+ return false;
+
bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
bool LoadNI = DL.isNonIntegralPointerType(LoadTy->getScalarType());
- // Don't coerce non-integral pointers to integers or vice versa.
+ // Don't coerce non-integral pointers to integers or vice versa.
if (StoredNI != LoadNI) {
- // As a special case, allow coercion of memset used to initialize
- // an array w/null. Despite non-integral pointers not generally having a
- // specific bit pattern, we do assume null is zero.
- if (auto *CI = dyn_cast<Constant>(StoredVal))
- return CI->isNullValue();
- return false;
+ // As a special case, allow coercion of memset used to initialize
+ // an array w/null. Despite non-integral pointers not generally having a
+ // specific bit pattern, we do assume null is zero.
+ if (auto *CI = dyn_cast<Constant>(StoredVal))
+ return CI->isNullValue();
+ return false;
} else if (StoredNI && LoadNI &&
StoredTy->getPointerAddressSpace() !=
LoadTy->getPointerAddressSpace()) {
return false;
- }
+ }
// The implementation below uses inttoptr for vectors of unequal size; we
@@ -60,570 +60,570 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedSize())
return false;
- return true;
-}
-
-template <class T, class HelperClass>
-static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
- HelperClass &Helper,
- const DataLayout &DL) {
- assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
- "precondition violation - materialization can't fail");
- if (auto *C = dyn_cast<Constant>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- // If this is already the right type, just return it.
- Type *StoredValTy = StoredVal->getType();
-
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
-
- // If the store and reload are the same size, we can always reuse it.
- if (StoredValSize == LoadedValSize) {
- // Pointer to Pointer -> use bitcast.
- if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
- StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
- } else {
- // Convert source pointers to integers, which can be bitcast.
- if (StoredValTy->isPtrOrPtrVectorTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- Type *TypeToCastTo = LoadedTy;
- if (TypeToCastTo->isPtrOrPtrVectorTy())
- TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
- if (StoredValTy != TypeToCastTo)
- StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
-
- // Cast to pointer if the load needs a pointer type.
- if (LoadedTy->isPtrOrPtrVectorTy())
- StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- return StoredVal;
- }
- // If the loaded value is smaller than the available value, then we can
- // extract out a piece from it. If the available value is too small, then we
- // can't do anything.
- assert(StoredValSize >= LoadedValSize &&
- "canCoerceMustAliasedValueToLoad fail");
-
- // Convert source pointers to integers, which can be manipulated.
- if (StoredValTy->isPtrOrPtrVectorTy()) {
- StoredValTy = DL.getIntPtrType(StoredValTy);
- StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
- }
-
- // Convert vectors and fp to integer, which can be manipulated.
- if (!StoredValTy->isIntegerTy()) {
- StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
- StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
- }
-
- // If this is a big-endian system, we need to shift the value down to the low
- // bits so that a truncate will work.
- if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
- DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
- StoredVal = Helper.CreateLShr(
- StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
- }
-
- // Truncate the integer to the right size now.
- Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
- StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
-
- if (LoadedTy != NewIntTy) {
- // If the result is a pointer, inttoptr.
- if (LoadedTy->isPtrOrPtrVectorTy())
- StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
- else
- // Otherwise, bitcast.
- StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
- }
-
- if (auto *C = dyn_cast<Constant>(StoredVal))
- StoredVal = ConstantFoldConstant(C, DL);
-
- return StoredVal;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value. LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilderBase &IRB,
- const DataLayout &DL) {
- return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering memory write (store, memset, memcpy, memmove). This
-/// means that the write *may* provide bits used by the load but we can't be
-/// sure because the pointers don't must-alias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up. This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
- Value *WritePtr,
- uint64_t WriteSizeInBits,
- const DataLayout &DL) {
- // If the loaded/stored value is a first class array/struct, or scalable type,
- // don't try to transform them. We need to be able to bitcast to integer.
- if (isFirstClassAggregateOrScalableType(LoadTy))
- return -1;
-
- int64_t StoreOffset = 0, LoadOffset = 0;
- Value *StoreBase =
- GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
- Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
- if (StoreBase != LoadBase)
- return -1;
-
- // If the load and store are to the exact same address, they should have been
- // a must alias. AA must have gotten confused.
- // FIXME: Study to see if/when this happens. One case is forwarding a memset
- // to a load from the base of the memset.
-
- // If the load and store don't overlap at all, the store doesn't provide
- // anything to the load. In this case, they really don't alias at all, AA
- // must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
-
- if ((WriteSizeInBits & 7) | (LoadSize & 7))
- return -1;
- uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
- LoadSize /= 8;
-
- bool isAAFailure = false;
- if (StoreOffset < LoadOffset)
- isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
- else
- isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
-
- if (isAAFailure)
- return -1;
-
- // If the Load isn't completely contained within the stored bits, we don't
- // have all the bits to feed it. We could do something crazy in the future
- // (issue a smaller load then merge the bits in) but this seems unlikely to be
- // valuable.
- if (StoreOffset > LoadOffset ||
- StoreOffset + StoreSize < LoadOffset + LoadSize)
- return -1;
-
- // Okay, we can do this transformation. Return the number of bytes into the
- // store that the load is.
- return LoadOffset - StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
- StoreInst *DepSI, const DataLayout &DL) {
- auto *StoredVal = DepSI->getValueOperand();
-
- // Cannot handle reading from store of first-class aggregate or scalable type.
- if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
- return -1;
-
+ return true;
+}
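In short, coercion is allowed when both types have byte-sized layouts, the store is at least as wide as the load, and non-integral pointers only coerce from a literal null constant; first-class aggregates and scalable vectors are rejected outright. A hypothetical caller-side check, assuming the public VNCoercion.h header from LLVM 12:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

// Illustrative only: ask whether the value stored by SI could be reused to
// satisfy LI if the two accesses are known to must-alias.
static bool storeCanFeedLoad(llvm::StoreInst *SI, llvm::LoadInst *LI,
                             const llvm::DataLayout &DL) {
  return llvm::VNCoercion::canCoerceMustAliasedValueToLoad(
      SI->getValueOperand(), LI->getType(), DL);
}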
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+ "precondition violation - materialization can't fail");
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ // If this is already the right type, just return it.
+ Type *StoredValTy = StoredVal->getType();
+
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
+
+ // If the store and reload are the same size, we can always reuse it.
+ if (StoredValSize == LoadedValSize) {
+ // Pointer to Pointer -> use bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy() && LoadedTy->isPtrOrPtrVectorTy()) {
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ } else {
+ // Convert source pointers to integers, which can be bitcast.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ Type *TypeToCastTo = LoadedTy;
+ if (TypeToCastTo->isPtrOrPtrVectorTy())
+ TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+ if (StoredValTy != TypeToCastTo)
+ StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+ // Cast to pointer if the load needs a pointer type.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ return StoredVal;
+ }
+ // If the loaded value is smaller than the available value, then we can
+ // extract out a piece from it. If the available value is too small, then we
+ // can't do anything.
+ assert(StoredValSize >= LoadedValSize &&
+ "canCoerceMustAliasedValueToLoad fail");
+
+ // Convert source pointers to integers, which can be manipulated.
+ if (StoredValTy->isPtrOrPtrVectorTy()) {
+ StoredValTy = DL.getIntPtrType(StoredValTy);
+ StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+ }
+
+ // Convert vectors and fp to integer, which can be manipulated.
+ if (!StoredValTy->isIntegerTy()) {
+ StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+ StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+ }
+
+ // If this is a big-endian system, we need to shift the value down to the low
+ // bits so that a truncate will work.
+ if (DL.isBigEndian()) {
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+ DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
+ StoredVal = Helper.CreateLShr(
+ StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+ }
+
+ // Truncate the integer to the right size now.
+ Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+ StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+ if (LoadedTy != NewIntTy) {
+ // If the result is a pointer, inttoptr.
+ if (LoadedTy->isPtrOrPtrVectorTy())
+ StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+ else
+ // Otherwise, bitcast.
+ StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+ }
+
+ if (auto *C = dyn_cast<Constant>(StoredVal))
+ StoredVal = ConstantFoldConstant(C, DL);
+
+ return StoredVal;
+}
+
+/// If we saw a store of a value to memory, and
+/// then a load from a must-aliased pointer of a different type, try to coerce
+/// the stored value. LoadedTy is the type of the load we want to replace.
+/// IRB is IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+ IRBuilderBase &IRB,
+ const DataLayout &DL) {
+ return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove). This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up. This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+ Value *WritePtr,
+ uint64_t WriteSizeInBits,
+ const DataLayout &DL) {
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy))
+ return -1;
+
+ int64_t StoreOffset = 0, LoadOffset = 0;
+ Value *StoreBase =
+ GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+ Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+ if (StoreBase != LoadBase)
+ return -1;
+
+ // If the load and store are to the exact same address, they should have been
+ // a must alias. AA must have gotten confused.
+ // FIXME: Study to see if/when this happens. One case is forwarding a memset
+ // to a load from the base of the memset.
+
+ // If the load and store don't overlap at all, the store doesn't provide
+ // anything to the load. In this case, they really don't alias at all, AA
+ // must have gotten confused.
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
+
+ if ((WriteSizeInBits & 7) | (LoadSize & 7))
+ return -1;
+ uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+ LoadSize /= 8;
+
+ bool isAAFailure = false;
+ if (StoreOffset < LoadOffset)
+ isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+ else
+ isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+ if (isAAFailure)
+ return -1;
+
+ // If the Load isn't completely contained within the stored bits, we don't
+ // have all the bits to feed it. We could do something crazy in the future
+ // (issue a smaller load then merge the bits in) but this seems unlikely to be
+ // valuable.
+ if (StoreOffset > LoadOffset ||
+ StoreOffset + StoreSize < LoadOffset + LoadSize)
+ return -1;
+
+ // Okay, we can do this transformation. Return the number of bytes into the
+ // store that the load is.
+ return LoadOffset - StoreOffset;
+}
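The arithmetic above reduces to two interval checks on (offset, size) pairs measured in bytes from the common base pointer. A plain-integer sketch of the same test (illustrative only; the real code also rejects non-byte-sized accesses first):

#include <cstdint>

// Returns the byte offset of the load inside the store, or -1 if the store
// cannot supply all of the load's bytes.
static int64_t offsetOfLoadInStore(int64_t StoreOffset, uint64_t StoreSize,
                                   int64_t LoadOffset, uint64_t LoadSize) {
  // No overlap at all: alias analysis was overly conservative, give up.
  if (StoreOffset + (int64_t)StoreSize <= LoadOffset ||
      LoadOffset + (int64_t)LoadSize <= StoreOffset)
    return -1;
  // The load must be fully covered by the stored bytes.
  if (StoreOffset > LoadOffset ||
      StoreOffset + (int64_t)StoreSize < LoadOffset + (int64_t)LoadSize)
    return -1;
  return LoadOffset - StoreOffset;
}

For a 16-byte store at offset 0 and a 4-byte load at offset 4 this yields 4; move the load to offset 14 and it yields -1 because two of its bytes fall outside the store.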
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+ StoreInst *DepSI, const DataLayout &DL) {
+ auto *StoredVal = DepSI->getValueOperand();
+
+ // Cannot handle reading from store of first-class aggregate or scalable type.
+ if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
+ return -1;
+
if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
return -1;
-
- Value *StorePtr = DepSI->getPointerOperand();
- uint64_t StoreSize =
- DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
- DL);
-}
-
-/// Looks at a memory location for a load (specified by MemLocBase, Offs, and
-/// Size) and compares it against a load.
-///
-/// If the specified load could be safely widened to a larger integer load
-/// that is 1) still efficient, 2) safe for the target, and 3) would provide
-/// the specified memory location value, then this function returns the size
-/// in bytes of the load width to use. If not, this returns zero.
-static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
- int64_t MemLocOffs,
- unsigned MemLocSize,
- const LoadInst *LI) {
- // We can only extend simple integer loads.
- if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
- return 0;
-
- // Load widening is hostile to ThreadSanitizer: it may cause false positives
- // or make the reports more cryptic (access sizes are wrong).
- if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
- return 0;
-
- const DataLayout &DL = LI->getModule()->getDataLayout();
-
- // Get the base of this load.
- int64_t LIOffs = 0;
- const Value *LIBase =
- GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
-
- // If the two pointers are not based on the same pointer, we can't tell that
- // they are related.
- if (LIBase != MemLocBase)
- return 0;
-
- // Okay, the two values are based on the same pointer, but returned as
- // no-alias. This happens when we have things like two byte loads at "P+1"
- // and "P+3". Check to see if increasing the size of the "LI" load up to its
- // alignment (or the largest native integer type) will allow us to load all
- // the bits required by MemLoc.
-
- // If MemLoc is before LI, then no widening of LI will help us out.
- if (MemLocOffs < LIOffs)
- return 0;
-
- // Get the alignment of the load in bytes. We assume that it is safe to load
- // any legal integer up to this size without a problem. For example, if we're
- // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
- // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
- // to i16.
- unsigned LoadAlign = LI->getAlignment();
-
- int64_t MemLocEnd = MemLocOffs + MemLocSize;
-
- // If no amount of rounding up will let MemLoc fit into LI, then bail out.
- if (LIOffs + LoadAlign < MemLocEnd)
- return 0;
-
- // This is the size of the load to try. Start with the next larger power of
- // two.
- unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
- NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
-
- while (true) {
- // If this load size is bigger than our known alignment or would not fit
- // into a native integer register, then we fail.
- if (NewLoadByteSize > LoadAlign ||
- !DL.fitsInLegalInteger(NewLoadByteSize * 8))
- return 0;
-
- if (LIOffs + NewLoadByteSize > MemLocEnd &&
- (LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress) ||
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeHWAddress)))
- // We will be reading past the location accessed by the original program.
- // While this is safe in a regular build, Address Safety analysis tools
- // may start reporting false warnings. So, don't do widening.
- return 0;
-
- // If a load of this width would include all of MemLoc, then we succeed.
- if (LIOffs + NewLoadByteSize >= MemLocEnd)
- return NewLoadByteSize;
-
- NewLoadByteSize <<= 1;
- }
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load. See if
-/// the other load can feed into the second load.
-int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
- const DataLayout &DL) {
- // Cannot handle reading from a load of a first-class aggregate yet.
- if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
- return -1;
-
+
+ Value *StorePtr = DepSI->getPointerOperand();
+ uint64_t StoreSize =
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+ DL);
+}
+
+/// Looks at a memory location for a load (specified by MemLocBase, Offs, and
+/// Size) and compares it against a load.
+///
+/// If the specified load could be safely widened to a larger integer load
+/// that is 1) still efficient, 2) safe for the target, and 3) would provide
+/// the specified memory location value, then this function returns the size
+/// in bytes of the load width to use. If not, this returns zero.
+static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
+ int64_t MemLocOffs,
+ unsigned MemLocSize,
+ const LoadInst *LI) {
+ // We can only extend simple integer loads.
+ if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
+ return 0;
+
+ // Load widening is hostile to ThreadSanitizer: it may cause false positives
+ // or make the reports more cryptic (access sizes are wrong).
+ if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
+ return 0;
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ // Get the base of this load.
+ int64_t LIOffs = 0;
+ const Value *LIBase =
+ GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
+
+ // If the two pointers are not based on the same pointer, we can't tell that
+ // they are related.
+ if (LIBase != MemLocBase)
+ return 0;
+
+ // Okay, the two values are based on the same pointer, but returned as
+ // no-alias. This happens when we have things like two byte loads at "P+1"
+ // and "P+3". Check to see if increasing the size of the "LI" load up to its
+ // alignment (or the largest native integer type) will allow us to load all
+ // the bits required by MemLoc.
+
+ // If MemLoc is before LI, then no widening of LI will help us out.
+ if (MemLocOffs < LIOffs)
+ return 0;
+
+ // Get the alignment of the load in bytes. We assume that it is safe to load
+ // any legal integer up to this size without a problem. For example, if we're
+ // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
+ // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
+ // to i16.
+ unsigned LoadAlign = LI->getAlignment();
+
+ int64_t MemLocEnd = MemLocOffs + MemLocSize;
+
+ // If no amount of rounding up will let MemLoc fit into LI, then bail out.
+ if (LIOffs + LoadAlign < MemLocEnd)
+ return 0;
+
+ // This is the size of the load to try. Start with the next larger power of
+ // two.
+ unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
+ NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
+
+ while (true) {
+ // If this load size is bigger than our known alignment or would not fit
+ // into a native integer register, then we fail.
+ if (NewLoadByteSize > LoadAlign ||
+ !DL.fitsInLegalInteger(NewLoadByteSize * 8))
+ return 0;
+
+ if (LIOffs + NewLoadByteSize > MemLocEnd &&
+ (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress)))
+ // We will be reading past the location accessed by the original program.
+ // While this is safe in a regular build, Address Safety analysis tools
+ // may start reporting false warnings. So, don't do widening.
+ return 0;
+
+ // If a load of this width would include all of MemLoc, then we succeed.
+ if (LIOffs + NewLoadByteSize >= MemLocEnd)
+ return NewLoadByteSize;
+
+ NewLoadByteSize <<= 1;
+ }
+}
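Stripped of the sanitizer and legal-integer guards, the widening search is a doubling loop bounded by the load's known alignment. A plain-integer sketch under those simplifying assumptions (hypothetical helper, not the LLVM API):

#include <cstdint>
#include "llvm/Support/MathExtras.h" // llvm::NextPowerOf2

static unsigned widenedLoadBytes(int64_t LIOffs, unsigned LoadBytes,
                                 unsigned LoadAlign, int64_t MemLocEnd) {
  unsigned NewSize = llvm::NextPowerOf2(LoadBytes); // e.g. 2-byte load -> try 4
  while (true) {
    if (NewSize > LoadAlign)          // never read past the known alignment
      return 0;
    if (LIOffs + NewSize >= MemLocEnd)
      return NewSize;                 // wide enough to cover MemLoc
    NewSize <<= 1;
  }
}

For a 2-byte load that is 8-byte aligned at offset 0 and a MemLoc ending at offset 4, the first candidate is 4 bytes and it already covers the location.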
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load. See if
+/// the other load can feed into the second load.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+ const DataLayout &DL) {
+ // Cannot handle reading from a load of a first-class aggregate yet.
+ if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+ return -1;
+
if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
- return -1;
-
- Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
- int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
- if (R != -1)
- return R;
-
- // If we have a load/load clobber and DepLI can be widened to cover this load,
- // then we should widen it!
- int64_t LoadOffs = 0;
- const Value *LoadBase =
- GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
-
- unsigned Size =
- getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
- if (Size == 0)
- return -1;
-
- // Check non-obvious conditions enforced by MDA which we rely on for being
- // able to materialize this potentially available value
- assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
- assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
-}
-
-int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
- MemIntrinsic *MI, const DataLayout &DL) {
- // If the mem operation is a non-constant size, we can't handle it.
- ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
- if (!SizeCst)
- return -1;
- uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
-
- // If this is a memset, we just need to see if the offset is valid within the
- // size of the memset.
- if (MI->getIntrinsicID() == Intrinsic::memset) {
- if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
- auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
- if (!CI || !CI->isZero())
- return -1;
- }
- return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
- }
-
- // If we have a memcpy/memmove, the only case we can handle is if this is a
- // copy from constant memory. In that case, we can read directly from the
- // constant memory.
- MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
- Constant *Src = dyn_cast<Constant>(MTI->getSource());
- if (!Src)
- return -1;
-
+ return -1;
+
+ Value *DepPtr = DepLI->getPointerOperand();
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
+ int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+ if (R != -1)
+ return R;
+
+ // If we have a load/load clobber and DepLI can be widened to cover this load,
+ // then we should widen it!
+ int64_t LoadOffs = 0;
+ const Value *LoadBase =
+ GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+
+ unsigned Size =
+ getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
+ if (Size == 0)
+ return -1;
+
+ // Check non-obvious conditions enforced by MDA which we rely on for being
+ // able to materialize this potentially available value
+ assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+ MemIntrinsic *MI, const DataLayout &DL) {
+ // If the mem operation is a non-constant size, we can't handle it.
+ ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+ if (!SizeCst)
+ return -1;
+ uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+ // If this is a memset, we just need to see if the offset is valid within the
+ // size of the memset.
+ if (MI->getIntrinsicID() == Intrinsic::memset) {
+ if (DL.isNonIntegralPointerType(LoadTy->getScalarType())) {
+ auto *CI = dyn_cast<ConstantInt>(cast<MemSetInst>(MI)->getValue());
+ if (!CI || !CI->isZero())
+ return -1;
+ }
+ return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ }
+
+ // If we have a memcpy/memmove, the only case we can handle is if this is a
+ // copy from constant memory. In that case, we can read directly from the
+ // constant memory.
+ MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+ Constant *Src = dyn_cast<Constant>(MTI->getSource());
+ if (!Src)
+ return -1;
+
GlobalVariable *GV = dyn_cast<GlobalVariable>(getUnderlyingObject(Src));
- if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
- return -1;
-
- // See if the access is within the bounds of the transfer.
- int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
- MemSizeInBits, DL);
- if (Offset == -1)
- return Offset;
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
- return Offset;
- return -1;
-}
-
-template <class T, class HelperClass>
-static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
- HelperClass &Helper,
- const DataLayout &DL) {
- LLVMContext &Ctx = SrcVal->getType()->getContext();
-
- // If two pointers are in the same address space, they have the same size,
- // so we don't need to do any truncation, etc. This avoids introducing
- // ptrtoint instructions for pointers that may be non-integral.
- if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
- cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
- cast<PointerType>(LoadTy)->getAddressSpace()) {
- return SrcVal;
- }
-
- uint64_t StoreSize =
- (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
- // Compute which bits of the stored value are being used by the load. Convert
- // to an integer type to start with.
- if (SrcVal->getType()->isPtrOrPtrVectorTy())
- SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
- if (!SrcVal->getType()->isIntegerTy())
- SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
-
- // Shift the bits to the least significant depending on endianness.
- unsigned ShiftAmt;
- if (DL.isLittleEndian())
- ShiftAmt = Offset * 8;
- else
- ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
- if (ShiftAmt)
- SrcVal = Helper.CreateLShr(SrcVal,
- ConstantInt::get(SrcVal->getType(), ShiftAmt));
-
- if (LoadSize != StoreSize)
- SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
- IntegerType::get(Ctx, LoadSize * 8));
- return SrcVal;
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering store. This means that the store provides bits used by
-/// the load but the pointers don't must-alias. Check this case to see if
-/// there is anything more we can do before we give up.
-Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL) {
-
- IRBuilder<> Builder(InsertPt);
- SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
- return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
-}
-
-Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- ConstantFolder F;
- SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
- return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
-}
-
-/// This function is called when we have a memdep query of a load that ends up
-/// being a clobbering load. This means that the clobbering load *may* provide
-/// bits used by this load, but we can't be sure because the pointers don't
-/// must-alias.
-/// Check this case to see if there is anything more we can do before we give
-/// up.
-Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
- Instruction *InsertPt, const DataLayout &DL) {
- // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
- // widen SrcVal out to a larger load.
- unsigned SrcValStoreSize =
- DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
- if (Offset + LoadSize > SrcValStoreSize) {
- assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
- assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
- // If we have a load/load clobber and DepLI can be widened to cover this
- // load, then we should widen it to the next power-of-2 size that is big enough!
- unsigned NewLoadSize = Offset + LoadSize;
- if (!isPowerOf2_32(NewLoadSize))
- NewLoadSize = NextPowerOf2(NewLoadSize);
-
- Value *PtrVal = SrcVal->getPointerOperand();
- // Insert the new load after the old load. This ensures that subsequent
- // memdep queries will find the new load. We can't easily remove the old
- // load completely because it is already in the value numbering table.
- IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
- Type *DestTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
- Type *DestPTy =
- PointerType::get(DestTy, PtrVal->getType()->getPointerAddressSpace());
- Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
- PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
- LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal);
- NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(SrcVal->getAlign());
-
- LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
- LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
- // Replace uses of the original load with the wider load. On a big endian
- // system, we need to shift down to get the relevant bits.
- Value *RV = NewLoad;
- if (DL.isBigEndian())
- RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
- RV = Builder.CreateTrunc(RV, SrcVal->getType());
- SrcVal->replaceAllUsesWith(RV);
-
- SrcVal = NewLoad;
- }
-
- return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- unsigned SrcValStoreSize =
- DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
- if (Offset + LoadSize > SrcValStoreSize)
- return nullptr;
- return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
-}
-
-template <class T, class HelperClass>
-T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, HelperClass &Helper,
- const DataLayout &DL) {
- LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
-
- // We know that this method is only called when the mem transfer fully
- // provides the bits for the load.
- if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
- // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
- // independently of what the offset is.
- T *Val = cast<T>(MSI->getValue());
- if (LoadSize != 1)
- Val =
- Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
- T *OneElt = Val;
-
- // Splat the value out to the right number of bits.
- for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
- // If we can double the number of bytes set, do it.
- if (NumBytesSet * 2 <= LoadSize) {
- T *ShVal = Helper.CreateShl(
- Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
- Val = Helper.CreateOr(Val, ShVal);
- NumBytesSet <<= 1;
- continue;
- }
-
- // Otherwise insert one byte at a time.
- T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
- Val = Helper.CreateOr(OneElt, ShVal);
- ++NumBytesSet;
- }
-
- return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
- }
-
- // Otherwise, this is a memcpy/memmove from a constant global.
- MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
- Constant *Src = cast<Constant>(MTI->getSource());
-
- unsigned AS = Src->getType()->getPointerAddressSpace();
- // Otherwise, see if we can constant fold a load from the constant with the
- // offset applied as appropriate.
- if (Offset) {
- Src = ConstantExpr::getBitCast(Src,
- Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
- Src, OffsetCst);
- }
- Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
- return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, Instruction *InsertPt,
- const DataLayout &DL) {
- IRBuilder<> Builder(InsertPt);
- return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
- LoadTy, Builder, DL);
-}
-
-Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
- Type *LoadTy, const DataLayout &DL) {
- // The only case analyzeLoadFromClobberingMemInst cannot be converted to a
- // constant is when it's a memset of a non-constant.
- if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
- if (!isa<Constant>(MSI->getValue()))
- return nullptr;
- ConstantFolder F;
- return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
- LoadTy, F, DL);
-}
-} // namespace VNCoercion
-} // namespace llvm
+ if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ return -1;
+
+ // See if the access is within the bounds of the transfer.
+ int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+ MemSizeInBits, DL);
+ if (Offset == -1)
+ return Offset;
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+ return Offset;
+ return -1;
+}
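A typical GVN-style client runs this analysis first and, only when it returns a non-negative byte offset, asks the materialization helpers further down to build the forwarded value. A hypothetical caller, assuming the LLVM 12 headers:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils/VNCoercion.h"

// Illustrative only: forward a value from a clobbering memset/memcpy to Load.
static llvm::Value *tryForwardFromMemIntrinsic(llvm::LoadInst *Load,
                                               llvm::MemIntrinsic *MI,
                                               const llvm::DataLayout &DL) {
  using namespace llvm::VNCoercion;
  int Offset = analyzeLoadFromClobberingMemInst(
      Load->getType(), Load->getPointerOperand(), MI, DL);
  if (Offset == -1)
    return nullptr; // the intrinsic does not fully cover the load
  return getMemInstValueForLoad(MI, (unsigned)Offset, Load->getType(),
                                /*InsertPt=*/Load, DL);
}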
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+ HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+ // If two pointers are in the same address space, they have the same size,
+ // so we don't need to do any truncation, etc. This avoids introducing
+ // ptrtoint instructions for pointers that may be non-integral.
+ if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() &&
+ cast<PointerType>(SrcVal->getType())->getAddressSpace() ==
+ cast<PointerType>(LoadTy)->getAddressSpace()) {
+ return SrcVal;
+ }
+
+ uint64_t StoreSize =
+ (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
+ // Compute which bits of the stored value are being used by the load. Convert
+ // to an integer type to start with.
+ if (SrcVal->getType()->isPtrOrPtrVectorTy())
+ SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+ if (!SrcVal->getType()->isIntegerTy())
+ SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+ // Shift the bits to the least significant depending on endianness.
+ unsigned ShiftAmt;
+ if (DL.isLittleEndian())
+ ShiftAmt = Offset * 8;
+ else
+ ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+ if (ShiftAmt)
+ SrcVal = Helper.CreateLShr(SrcVal,
+ ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+ if (LoadSize != StoreSize)
+ SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+ IntegerType::get(Ctx, LoadSize * 8));
+ return SrcVal;
+}
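Concretely, the helper turns "which bytes of the store does the load read" into a logical shift whose amount depends on endianness: Offset * 8 on little-endian targets and (StoreSize - LoadSize - Offset) * 8 on big-endian ones. The same extraction on a plain 64-bit value (illustrative only, hypothetical helper name):

#include <cstdint>

// Returns the 32 bits that a 4-byte load at byte Offset into an 8-byte store
// would observe, given the store's value as an integer.
static uint32_t bytesAtOffset(uint64_t Stored, unsigned Offset, bool BigEndian) {
  const unsigned StoreSize = 8, LoadSize = 4;
  unsigned ShiftAmt = BigEndian ? (StoreSize - LoadSize - Offset) * 8
                                : Offset * 8;
  return static_cast<uint32_t>(Stored >> ShiftAmt); // the trunc step
}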
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store. This means that the store provides bits used by
+/// the load but the pointers don't must-alias. Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+
+ IRBuilder<> Builder(InsertPt);
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ ConstantFolder F;
+ SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+ return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load. This means that the clobbering load *may* provide
+/// bits used by this load, but we can't be sure because the pointers don't
+/// must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+ Instruction *InsertPt, const DataLayout &DL) {
+ // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+ // widen SrcVal out to a larger load.
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+ if (Offset + LoadSize > SrcValStoreSize) {
+ assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+ assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+ // If we have a load/load clobber and DepLI can be widened to cover this
+ // load, then we should widen it to the next power-of-2 size that is big enough!
+ unsigned NewLoadSize = Offset + LoadSize;
+ if (!isPowerOf2_32(NewLoadSize))
+ NewLoadSize = NextPowerOf2(NewLoadSize);
+
+ Value *PtrVal = SrcVal->getPointerOperand();
+ // Insert the new load after the old load. This ensures that subsequent
+ // memdep queries will find the new load. We can't easily remove the old
+ // load completely because it is already in the value numbering table.
+ IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+ Type *DestTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+ Type *DestPTy =
+ PointerType::get(DestTy, PtrVal->getType()->getPointerAddressSpace());
+ Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+ PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+ LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal);
+ NewLoad->takeName(SrcVal);
+ NewLoad->setAlignment(SrcVal->getAlign());
+
+ LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+ LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+ // Replace uses of the original load with the wider load. On a big endian
+ // system, we need to shift down to get the relevant bits.
+ Value *RV = NewLoad;
+ if (DL.isBigEndian())
+ RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+ RV = Builder.CreateTrunc(RV, SrcVal->getType());
+ SrcVal->replaceAllUsesWith(RV);
+
+ SrcVal = NewLoad;
+ }
+
+ return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
+ if (Offset + LoadSize > SrcValStoreSize)
+ return nullptr;
+ return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, HelperClass &Helper,
+ const DataLayout &DL) {
+ LLVMContext &Ctx = LoadTy->getContext();
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
+
+ // We know that this method is only called when the mem transfer fully
+ // provides the bits for the load.
+ if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+ // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+ // independently of what the offset is.
+ T *Val = cast<T>(MSI->getValue());
+ if (LoadSize != 1)
+ Val =
+ Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+ T *OneElt = Val;
+
+ // Splat the value out to the right number of bits.
+ for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+ // If we can double the number of bytes set, do it.
+ if (NumBytesSet * 2 <= LoadSize) {
+ T *ShVal = Helper.CreateShl(
+ Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+ Val = Helper.CreateOr(Val, ShVal);
+ NumBytesSet <<= 1;
+ continue;
+ }
+
+ // Otherwise insert one byte at a time.
+ T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+ Val = Helper.CreateOr(OneElt, ShVal);
+ ++NumBytesSet;
+ }
+
+ return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+ }
+
+ // Otherwise, this is a memcpy/memmove from a constant global.
+ MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+ Constant *Src = cast<Constant>(MTI->getSource());
+
+ unsigned AS = Src->getType()->getPointerAddressSpace();
+ // Otherwise, see if we can constant fold a load from the constant with the
+ // offset applied as appropriate.
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
+ Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+ return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
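The memset branch builds the splat by repeatedly doubling the populated bytes with a shift-and-or, falling back to appending one byte at a time when doubling would overshoot. The same loop on plain integers, assuming a load of 1 to 8 bytes (illustrative only):

#include <cstdint>

static uint64_t splatByte(uint8_t Byte, unsigned LoadSize) {
  uint64_t Val = Byte, OneElt = Byte;
  for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
    if (NumBytesSet * 2 <= LoadSize) {   // double the populated bytes
      Val |= Val << (NumBytesSet * 8);
      NumBytesSet <<= 1;
      continue;
    }
    Val = OneElt | (Val << 8);           // otherwise append one byte
    ++NumBytesSet;
  }
  return Val;
}

For example, splatByte(0xAB, 4) produces 0xABABABAB, matching what the IR-level loop builds with CreateShl/CreateOr.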
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, Instruction *InsertPt,
+ const DataLayout &DL) {
+ IRBuilder<> Builder(InsertPt);
+ return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+ LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+ Type *LoadTy, const DataLayout &DL) {
+ // The only case analyzeLoadFromClobberingMemInst cannot be converted to a
+ // constant is when it's a memset of a non-constant.
+ if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+ if (!isa<Constant>(MSI->getValue()))
+ return nullptr;
+ ConstantFolder F;
+ return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+ LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp b/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
index 1392ca041c..930e0b7ee0 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ValueMapper.cpp
@@ -1,906 +1,906 @@
-//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the MapValue function, which is shared by various parts of
-// the lib/Transforms/Utils library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalObject.h"
-#include "llvm/IR/GlobalIndirectSymbol.h"
-#include "llvm/IR/GlobalVariable.h"
-#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include <cassert>
-#include <limits>
-#include <memory>
-#include <utility>
-
-using namespace llvm;
-
-// Out of line method to get vtable etc for class.
-void ValueMapTypeRemapper::anchor() {}
-void ValueMaterializer::anchor() {}
-
-namespace {
-
-/// A basic block used in a BlockAddress whose function body is not yet
-/// materialized.
-struct DelayedBasicBlock {
- BasicBlock *OldBB;
- std::unique_ptr<BasicBlock> TempBB;
-
- DelayedBasicBlock(const BlockAddress &Old)
- : OldBB(Old.getBasicBlock()),
- TempBB(BasicBlock::Create(Old.getContext())) {}
-};
-
-struct WorklistEntry {
- enum EntryKind {
- MapGlobalInit,
- MapAppendingVar,
- MapGlobalIndirectSymbol,
- RemapFunction
- };
- struct GVInitTy {
- GlobalVariable *GV;
- Constant *Init;
- };
- struct AppendingGVTy {
- GlobalVariable *GV;
- Constant *InitPrefix;
- };
- struct GlobalIndirectSymbolTy {
- GlobalIndirectSymbol *GIS;
- Constant *Target;
- };
-
- unsigned Kind : 2;
- unsigned MCID : 29;
- unsigned AppendingGVIsOldCtorDtor : 1;
- unsigned AppendingGVNumNewMembers;
- union {
- GVInitTy GVInit;
- AppendingGVTy AppendingGV;
- GlobalIndirectSymbolTy GlobalIndirectSymbol;
- Function *RemapF;
- } Data;
-};
-
-struct MappingContext {
- ValueToValueMapTy *VM;
- ValueMaterializer *Materializer = nullptr;
-
- /// Construct a MappingContext with a value map and materializer.
- explicit MappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer = nullptr)
- : VM(&VM), Materializer(Materializer) {}
-};
-
-class Mapper {
- friend class MDNodeMapper;
-
-#ifndef NDEBUG
- DenseSet<GlobalValue *> AlreadyScheduled;
-#endif
-
- RemapFlags Flags;
- ValueMapTypeRemapper *TypeMapper;
- unsigned CurrentMCID = 0;
- SmallVector<MappingContext, 2> MCs;
- SmallVector<WorklistEntry, 4> Worklist;
- SmallVector<DelayedBasicBlock, 1> DelayedBBs;
- SmallVector<Constant *, 16> AppendingInits;
-
-public:
- Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
- : Flags(Flags), TypeMapper(TypeMapper),
- MCs(1, MappingContext(VM, Materializer)) {}
-
- /// ValueMapper should explicitly call \a flush() before destruction.
- ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
-
- bool hasWorkToDo() const { return !Worklist.empty(); }
-
- unsigned
- registerAlternateMappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer = nullptr) {
- MCs.push_back(MappingContext(VM, Materializer));
- return MCs.size() - 1;
- }
-
- void addFlags(RemapFlags Flags);
-
- void remapGlobalObjectMetadata(GlobalObject &GO);
-
- Value *mapValue(const Value *V);
- void remapInstruction(Instruction *I);
- void remapFunction(Function &F);
-
- Constant *mapConstant(const Constant *C) {
- return cast_or_null<Constant>(mapValue(C));
- }
-
- /// Map metadata.
- ///
- /// Find the mapping for MD. Guarantees that the return will be resolved
- /// (not an MDNode, or MDNode::isResolved() returns true).
- Metadata *mapMetadata(const Metadata *MD);
-
- void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
- unsigned MCID);
- void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID);
- void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target,
- unsigned MCID);
- void scheduleRemapFunction(Function &F, unsigned MCID);
-
- void flush();
-
-private:
- void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers);
-
- ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
- ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
-
- Value *mapBlockAddress(const BlockAddress &BA);
-
- /// Map metadata that doesn't require visiting operands.
- Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
-
- Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
- Metadata *mapToSelf(const Metadata *MD);
-};
-
-class MDNodeMapper {
- Mapper &M;
-
- /// Data about a node in \a UniquedGraph.
- struct Data {
- bool HasChanged = false;
- unsigned ID = std::numeric_limits<unsigned>::max();
- TempMDNode Placeholder;
- };
-
- /// A graph of uniqued nodes.
- struct UniquedGraph {
- SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
- SmallVector<MDNode *, 16> POT; // Post-order traversal.
-
- /// Propagate changed operands through the post-order traversal.
- ///
- /// Iteratively update \a Data::HasChanged for each node based on \a
- /// Data::HasChanged of its operands, until fixed point.
- void propagateChanges();
-
- /// Get a forward reference to a node to use as an operand.
- Metadata &getFwdReference(MDNode &Op);
- };
-
- /// Worklist of distinct nodes whose operands need to be remapped.
- SmallVector<MDNode *, 16> DistinctWorklist;
-
- // Storage for a UniquedGraph.
- SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
- SmallVector<MDNode *, 16> POTStorage;
-
-public:
- MDNodeMapper(Mapper &M) : M(M) {}
-
- /// Map a metadata node (and its transitive operands).
- ///
- /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
- /// algorithm handles distinct nodes and uniqued node subgraphs using
- /// different strategies.
- ///
- /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
- /// using \a mapDistinctNode(). Their mapping can always be computed
- /// immediately without visiting operands, even if their operands change.
- ///
- /// The mapping for uniqued nodes depends on whether their operands change.
- /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
- /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
- /// added to \a DistinctWorklist with \a mapDistinctNode().
- ///
- /// After mapping \c N itself, this function remaps the operands of the
- /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
- /// N has been mapped.
- Metadata *map(const MDNode &N);
-
-private:
- /// Map a top-level uniqued node and the uniqued subgraph underneath it.
- ///
- /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
- /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
- /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
- /// operands use the identity mapping.
- ///
- /// The algorithm works as follows:
- ///
- /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
- /// save the post-order traversal in the given \a UniquedGraph, tracking
- /// whether nodes' operands change.
- ///
- /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
- /// through the \a UniquedGraph until fixed point, following the rule
- /// that if a node changes, any node that references it must also change.
- ///
- /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
- /// (referencing new operands) where necessary.
- Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
-
- /// Try to map the operand of an \a MDNode.
- ///
- /// If \c Op is already mapped, return the mapping. If it's not an \a
- /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
- /// return the result of \a mapDistinctNode().
- ///
- /// \return None if \c Op is an unmapped uniqued \a MDNode.
- /// \post getMappedOp(Op) only returns None if this returns None.
- Optional<Metadata *> tryToMapOperand(const Metadata *Op);
-
- /// Map a distinct node.
- ///
- /// Return the mapping for the distinct node \c N, saving the result in \a
- /// DistinctWorklist for later remapping.
- ///
- /// \pre \c N is not yet mapped.
- /// \pre \c N.isDistinct().
- MDNode *mapDistinctNode(const MDNode &N);
-
- /// Get a previously mapped node.
- Optional<Metadata *> getMappedOp(const Metadata *Op) const;
-
- /// Create a post-order traversal of an unmapped uniqued node subgraph.
- ///
- /// This traverses the metadata graph deeply enough to map \c FirstN. It
- /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
- /// metadata that has already been mapped will not be part of the POT.
- ///
- /// Each node that has a changed operand from outside the graph (e.g., a
- /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
- /// is marked with \a Data::HasChanged.
- ///
- /// \return \c true if any nodes in \c G have \a Data::HasChanged.
- /// \post \c G.POT is a post-order traversal ending with \c FirstN.
- /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
- /// to change because of operands outside the graph.
- bool createPOT(UniquedGraph &G, const MDNode &FirstN);
-
- /// Visit the operands of a uniqued node in the POT.
- ///
- /// Visit the operands in the range from \c I to \c E, returning the first
- /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
- /// where to continue the loop through the operands.
- ///
- /// This sets \c HasChanged if any of the visited operands change.
- MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
- MDNode::op_iterator E, bool &HasChanged);
-
- /// Map all the nodes in the given uniqued graph.
- ///
- /// This visits all the nodes in \c G in post-order, using the identity
- /// mapping or creating a new node depending on \a Data::HasChanged.
- ///
- /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
- /// their operands outside of \c G.
- /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
- /// operands have changed.
- /// \post \a getMappedOp() returns the mapped node for every node in \c G.
- void mapNodesInPOT(UniquedGraph &G);
-
- /// Remap a node's operands using the given functor.
- ///
- /// Iterate through the operands of \c N and update them in place using \c
- /// mapOperand.
- ///
- /// \pre N.isDistinct() or N.isTemporary().
- template <class OperandMapper>
- void remapOperands(MDNode &N, OperandMapper mapOperand);
-};
-
-} // end anonymous namespace
-
-Value *Mapper::mapValue(const Value *V) {
- ValueToValueMapTy::iterator I = getVM().find(V);
-
- // If the value already exists in the map, use it.
- if (I != getVM().end()) {
- assert(I->second && "Unexpected null mapping");
- return I->second;
- }
-
- // If we have a materializer and it can materialize a value, use that.
- if (auto *Materializer = getMaterializer()) {
- if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
- getVM()[V] = NewV;
- return NewV;
- }
- }
-
- // Global values do not need to be seeded into the VM if they
- // are using the identity mapping.
- if (isa<GlobalValue>(V)) {
- if (Flags & RF_NullMapMissingGlobalValues)
- return nullptr;
- return getVM()[V] = const_cast<Value *>(V);
- }
-
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
- // Inline asm may need *type* remapping.
- FunctionType *NewTy = IA->getFunctionType();
- if (TypeMapper) {
- NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
-
- if (NewTy != IA->getFunctionType())
- V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
- IA->hasSideEffects(), IA->isAlignStack(),
- IA->getDialect());
- }
-
- return getVM()[V] = const_cast<Value *>(V);
- }
-
- if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
- const Metadata *MD = MDV->getMetadata();
-
- if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
- // Look through to grab the local value.
- if (Value *LV = mapValue(LAM->getValue())) {
- if (V == LAM->getValue())
- return const_cast<Value *>(V);
- return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
- }
-
- // FIXME: always return nullptr once Verifier::verifyDominatesUse()
- // ensures metadata operands only reference defined SSA values.
- return (Flags & RF_IgnoreMissingLocals)
- ? nullptr
- : MetadataAsValue::get(V->getContext(),
- MDTuple::get(V->getContext(), None));
- }
-
- // If this is a module-level metadata and we know that nothing at the module
- // level is changing, then use an identity mapping.
- if (Flags & RF_NoModuleLevelChanges)
- return getVM()[V] = const_cast<Value *>(V);
-
- // Map the metadata and turn it into a value.
- auto *MappedMD = mapMetadata(MD);
- if (MD == MappedMD)
- return getVM()[V] = const_cast<Value *>(V);
- return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
- }
-
- // Okay, this must either be a constant (which may or may not be mappable) or
- // something that is not in the mapping table.
- Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
- if (!C)
- return nullptr;
-
- if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
- return mapBlockAddress(*BA);
-
- auto mapValueOrNull = [this](Value *V) {
- auto Mapped = mapValue(V);
- assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
- "Unexpected null mapping for constant operand without "
- "NullMapMissingGlobalValues flag");
- return Mapped;
- };
-
- // Otherwise, we have some other constant to remap. Start by checking to see
- // if all operands have an identity remapping.
- unsigned OpNo = 0, NumOperands = C->getNumOperands();
- Value *Mapped = nullptr;
- for (; OpNo != NumOperands; ++OpNo) {
- Value *Op = C->getOperand(OpNo);
- Mapped = mapValueOrNull(Op);
- if (!Mapped)
- return nullptr;
- if (Mapped != Op)
- break;
- }
-
- // See if the type mapper wants to remap the type as well.
- Type *NewTy = C->getType();
- if (TypeMapper)
- NewTy = TypeMapper->remapType(NewTy);
-
- // If the result type and all operands match up, then just insert an identity
- // mapping.
- if (OpNo == NumOperands && NewTy == C->getType())
- return getVM()[V] = C;
-
- // Okay, we need to create a new constant. We've already processed some or
- // all of the operands, set them all up now.
- SmallVector<Constant*, 8> Ops;
- Ops.reserve(NumOperands);
- for (unsigned j = 0; j != OpNo; ++j)
- Ops.push_back(cast<Constant>(C->getOperand(j)));
-
- // If one of the operands mismatch, push it and the other mapped operands.
- if (OpNo != NumOperands) {
- Ops.push_back(cast<Constant>(Mapped));
-
- // Map the rest of the operands that aren't processed yet.
- for (++OpNo; OpNo != NumOperands; ++OpNo) {
- Mapped = mapValueOrNull(C->getOperand(OpNo));
- if (!Mapped)
- return nullptr;
- Ops.push_back(cast<Constant>(Mapped));
- }
- }
- Type *NewSrcTy = nullptr;
- if (TypeMapper)
- if (auto *GEPO = dyn_cast<GEPOperator>(C))
- NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
- return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
- if (isa<ConstantArray>(C))
- return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
- if (isa<ConstantStruct>(C))
- return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
- if (isa<ConstantVector>(C))
- return getVM()[V] = ConstantVector::get(Ops);
- // If this is a no-operand constant, it must be because the type was remapped.
- if (isa<UndefValue>(C))
- return getVM()[V] = UndefValue::get(NewTy);
- if (isa<ConstantAggregateZero>(C))
- return getVM()[V] = ConstantAggregateZero::get(NewTy);
- assert(isa<ConstantPointerNull>(C));
- return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
-}
-
-Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
- Function *F = cast<Function>(mapValue(BA.getFunction()));
-
- // F may not have materialized its initializer. In that case, create a
- // dummy basic block for now, and replace it once we've materialized all
- // the initializers.
- BasicBlock *BB;
- if (F->empty()) {
- DelayedBBs.push_back(DelayedBasicBlock(BA));
- BB = DelayedBBs.back().TempBB.get();
- } else {
- BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
- }
-
- return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
-}
-
-Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
- getVM().MD()[Key].reset(Val);
- return Val;
-}
-
-Metadata *Mapper::mapToSelf(const Metadata *MD) {
- return mapToMetadata(MD, const_cast<Metadata *>(MD));
-}
-
-Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
- if (!Op)
- return nullptr;
-
- if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
-#ifndef NDEBUG
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
- assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
- M.getVM().getMappedMD(Op)) &&
- "Expected Value to be memoized");
- else
- assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
- "Expected result to be memoized");
-#endif
- return *MappedOp;
- }
-
- const MDNode &N = *cast<MDNode>(Op);
- if (N.isDistinct())
- return mapDistinctNode(N);
- return None;
-}
-
-static Metadata *cloneOrBuildODR(const MDNode &N) {
- auto *CT = dyn_cast<DICompositeType>(&N);
- // If ODR type uniquing is enabled, we would have uniqued composite types
- // with identifiers during bitcode reading, so we can just use CT.
- if (CT && CT->getContext().isODRUniquingDebugTypes() &&
- CT->getIdentifier() != "")
- return const_cast<DICompositeType *>(CT);
- return MDNode::replaceWithDistinct(N.clone());
-}
-
-MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
- assert(N.isDistinct() && "Expected a distinct node");
- assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
- DistinctWorklist.push_back(
- cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
- ? M.mapToSelf(&N)
- : M.mapToMetadata(&N, cloneOrBuildODR(N))));
- return DistinctWorklist.back();
-}
-
-static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
- Value *MappedV) {
- if (CMD.getValue() == MappedV)
- return const_cast<ConstantAsMetadata *>(&CMD);
- return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
-}
-
-Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
- if (!Op)
- return nullptr;
-
- if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
- return *MappedOp;
-
- if (isa<MDString>(Op))
- return const_cast<Metadata *>(Op);
-
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
- return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
-
- return None;
-}
-
-Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
- auto Where = Info.find(&Op);
- assert(Where != Info.end() && "Expected a valid reference");
-
- auto &OpD = Where->second;
- if (!OpD.HasChanged)
- return Op;
-
- // Lazily construct a temporary node.
- if (!OpD.Placeholder)
- OpD.Placeholder = Op.clone();
-
- return *OpD.Placeholder;
-}
-
-template <class OperandMapper>
-void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
- assert(!N.isUniqued() && "Expected distinct or temporary nodes");
- for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
- Metadata *Old = N.getOperand(I);
- Metadata *New = mapOperand(Old);
-
- if (Old != New)
- N.replaceOperandWith(I, New);
- }
-}
-
-namespace {
-
-/// An entry in the worklist for the post-order traversal.
-struct POTWorklistEntry {
- MDNode *N; ///< Current node.
- MDNode::op_iterator Op; ///< Current operand of \c N.
-
- /// Keep a flag of whether operands have changed in the worklist to avoid
- /// hitting the map in \a UniquedGraph.
- bool HasChanged = false;
-
- POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
-};
-
-} // end anonymous namespace
-
-bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
- assert(G.Info.empty() && "Expected a fresh traversal");
- assert(FirstN.isUniqued() && "Expected uniqued node in POT");
-
- // Construct a post-order traversal of the uniqued subgraph under FirstN.
- bool AnyChanges = false;
- SmallVector<POTWorklistEntry, 16> Worklist;
- Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
- (void)G.Info[&FirstN];
- while (!Worklist.empty()) {
- // Start or continue the traversal through this node's operands.
- auto &WE = Worklist.back();
- if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
- // Push a new node to traverse first.
- Worklist.push_back(POTWorklistEntry(*N));
- continue;
- }
-
- // Push the node onto the POT.
- assert(WE.N->isUniqued() && "Expected only uniqued nodes");
- assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
- auto &D = G.Info[WE.N];
- AnyChanges |= D.HasChanged = WE.HasChanged;
- D.ID = G.POT.size();
- G.POT.push_back(WE.N);
-
- // Pop the node off the worklist.
- Worklist.pop_back();
- }
- return AnyChanges;
-}
-
-MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
- MDNode::op_iterator E, bool &HasChanged) {
- while (I != E) {
- Metadata *Op = *I++; // Increment even on early return.
- if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
- // Check if the operand changes.
- HasChanged |= Op != *MappedOp;
- continue;
- }
-
- // A uniqued metadata node.
- MDNode &OpN = *cast<MDNode>(Op);
- assert(OpN.isUniqued() &&
- "Only uniqued operands cannot be mapped immediately");
- if (G.Info.insert(std::make_pair(&OpN, Data())).second)
- return &OpN; // This is a new one. Return it.
- }
- return nullptr;
-}
-
-void MDNodeMapper::UniquedGraph::propagateChanges() {
- bool AnyChanges;
- do {
- AnyChanges = false;
- for (MDNode *N : POT) {
- auto &D = Info[N];
- if (D.HasChanged)
- continue;
-
- if (llvm::none_of(N->operands(), [&](const Metadata *Op) {
- auto Where = Info.find(Op);
- return Where != Info.end() && Where->second.HasChanged;
- }))
- continue;
-
- AnyChanges = D.HasChanged = true;
- }
- } while (AnyChanges);
-}
-
-void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
- // Construct uniqued nodes, building forward references as necessary.
- SmallVector<MDNode *, 16> CyclicNodes;
- for (auto *N : G.POT) {
- auto &D = G.Info[N];
- if (!D.HasChanged) {
- // The node hasn't changed.
- M.mapToSelf(N);
- continue;
- }
-
- // Remember whether this node had a placeholder.
- bool HadPlaceholder(D.Placeholder);
-
- // Clone the uniqued node and remap the operands.
- TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
- remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
- if (Optional<Metadata *> MappedOp = getMappedOp(Old))
- return *MappedOp;
- (void)D;
- assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
- return &G.getFwdReference(*cast<MDNode>(Old));
- });
-
- auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
- M.mapToMetadata(N, NewN);
-
- // Nodes that were referenced out of order in the POT are involved in a
- // uniquing cycle.
- if (HadPlaceholder)
- CyclicNodes.push_back(NewN);
- }
-
- // Resolve cycles.
- for (auto *N : CyclicNodes)
- if (!N->isResolved())
- N->resolveCycles();
-}
-
-Metadata *MDNodeMapper::map(const MDNode &N) {
- assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
- assert(!(M.Flags & RF_NoModuleLevelChanges) &&
- "MDNodeMapper::map assumes module-level changes");
-
- // Require resolved nodes whenever metadata might be remapped.
- assert(N.isResolved() && "Unexpected unresolved node");
-
- Metadata *MappedN =
- N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
- while (!DistinctWorklist.empty())
- remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
- if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
- return *MappedOp;
- return mapTopLevelUniquedNode(*cast<MDNode>(Old));
- });
- return MappedN;
-}
-
-Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
- assert(FirstN.isUniqued() && "Expected uniqued node");
-
- // Create a post-order traversal of uniqued nodes under FirstN.
- UniquedGraph G;
- if (!createPOT(G, FirstN)) {
- // Return early if no nodes have changed.
- for (const MDNode *N : G.POT)
- M.mapToSelf(N);
- return &const_cast<MDNode &>(FirstN);
- }
-
- // Update graph with all nodes that have changed.
- G.propagateChanges();
-
- // Map all the nodes in the graph.
- mapNodesInPOT(G);
-
- // Return the original node, remapped.
- return *getMappedOp(&FirstN);
-}
-
-Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
- // If the value already exists in the map, use it.
- if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
- return *NewMD;
-
- if (isa<MDString>(MD))
- return const_cast<Metadata *>(MD);
-
- // This is a module-level metadata. If nothing at the module level is
- // changing, use an identity mapping.
- if ((Flags & RF_NoModuleLevelChanges))
- return const_cast<Metadata *>(MD);
-
- if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
- // Don't memoize ConstantAsMetadata. Instead of lasting until the
- // LLVMContext is destroyed, they can be deleted when the GlobalValue they
- // reference is destructed. These aren't super common, so the extra
- // indirection isn't that expensive.
- return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
- }
-
- assert(isa<MDNode>(MD) && "Expected a metadata node");
-
- return None;
-}
-
-Metadata *Mapper::mapMetadata(const Metadata *MD) {
- assert(MD && "Expected valid metadata");
- assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
-
- if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
- return *NewMD;
-
- return MDNodeMapper(*this).map(*cast<MDNode>(MD));
-}
-
-void Mapper::flush() {
- // Flush out the worklist of global values.
- while (!Worklist.empty()) {
- WorklistEntry E = Worklist.pop_back_val();
- CurrentMCID = E.MCID;
- switch (E.Kind) {
- case WorklistEntry::MapGlobalInit:
- E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
- remapGlobalObjectMetadata(*E.Data.GVInit.GV);
- break;
- case WorklistEntry::MapAppendingVar: {
- unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
+//===- ValueMapper.cpp - Interface shared by lib/Transforms/Utils ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MapValue function, which is shared by various parts of
+// the lib/Transforms/Utils library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <cassert>
+#include <limits>
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+
+// Out of line method to get vtable etc for class.
+void ValueMapTypeRemapper::anchor() {}
+void ValueMaterializer::anchor() {}
+
+namespace {
+
+/// A basic block used in a BlockAddress whose function body is not yet
+/// materialized.
+struct DelayedBasicBlock {
+ BasicBlock *OldBB;
+ std::unique_ptr<BasicBlock> TempBB;
+
+ DelayedBasicBlock(const BlockAddress &Old)
+ : OldBB(Old.getBasicBlock()),
+ TempBB(BasicBlock::Create(Old.getContext())) {}
+};
+
+struct WorklistEntry {
+ enum EntryKind {
+ MapGlobalInit,
+ MapAppendingVar,
+ MapGlobalIndirectSymbol,
+ RemapFunction
+ };
+ struct GVInitTy {
+ GlobalVariable *GV;
+ Constant *Init;
+ };
+ struct AppendingGVTy {
+ GlobalVariable *GV;
+ Constant *InitPrefix;
+ };
+ struct GlobalIndirectSymbolTy {
+ GlobalIndirectSymbol *GIS;
+ Constant *Target;
+ };
+
+ unsigned Kind : 2;
+ unsigned MCID : 29;
+ unsigned AppendingGVIsOldCtorDtor : 1;
+ unsigned AppendingGVNumNewMembers;
+ union {
+ GVInitTy GVInit;
+ AppendingGVTy AppendingGV;
+ GlobalIndirectSymbolTy GlobalIndirectSymbol;
+ Function *RemapF;
+ } Data;
+};
+
+struct MappingContext {
+ ValueToValueMapTy *VM;
+ ValueMaterializer *Materializer = nullptr;
+
+ /// Construct a MappingContext with a value map and materializer.
+ explicit MappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr)
+ : VM(&VM), Materializer(Materializer) {}
+};
+
+class Mapper {
+ friend class MDNodeMapper;
+
+#ifndef NDEBUG
+ DenseSet<GlobalValue *> AlreadyScheduled;
+#endif
+
+ RemapFlags Flags;
+ ValueMapTypeRemapper *TypeMapper;
+ unsigned CurrentMCID = 0;
+ SmallVector<MappingContext, 2> MCs;
+ SmallVector<WorklistEntry, 4> Worklist;
+ SmallVector<DelayedBasicBlock, 1> DelayedBBs;
+ SmallVector<Constant *, 16> AppendingInits;
+
+public:
+ Mapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper, ValueMaterializer *Materializer)
+ : Flags(Flags), TypeMapper(TypeMapper),
+ MCs(1, MappingContext(VM, Materializer)) {}
+
+ /// ValueMapper should explicitly call \a flush() before destruction.
+ ~Mapper() { assert(!hasWorkToDo() && "Expected to be flushed"); }
+
+ bool hasWorkToDo() const { return !Worklist.empty(); }
+
+ unsigned
+ registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer = nullptr) {
+ MCs.push_back(MappingContext(VM, Materializer));
+ return MCs.size() - 1;
+ }
+
+ void addFlags(RemapFlags Flags);
+
+ void remapGlobalObjectMetadata(GlobalObject &GO);
+
+ Value *mapValue(const Value *V);
+ void remapInstruction(Instruction *I);
+ void remapFunction(Function &F);
+
+ Constant *mapConstant(const Constant *C) {
+ return cast_or_null<Constant>(mapValue(C));
+ }
+
+ /// Map metadata.
+ ///
+ /// Find the mapping for MD. Guarantees that the return will be resolved
+ /// (not an MDNode, or MDNode::isResolved() returns true).
+ Metadata *mapMetadata(const Metadata *MD);
+
+ void scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID);
+ void scheduleMapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID);
+ void scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS, Constant &Target,
+ unsigned MCID);
+ void scheduleRemapFunction(Function &F, unsigned MCID);
+
+ void flush();
+
+private:
+ void mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers);
+
+ ValueToValueMapTy &getVM() { return *MCs[CurrentMCID].VM; }
+ ValueMaterializer *getMaterializer() { return MCs[CurrentMCID].Materializer; }
+
+ Value *mapBlockAddress(const BlockAddress &BA);
+
+ /// Map metadata that doesn't require visiting operands.
+ Optional<Metadata *> mapSimpleMetadata(const Metadata *MD);
+
+ Metadata *mapToMetadata(const Metadata *Key, Metadata *Val);
+ Metadata *mapToSelf(const Metadata *MD);
+};
+
+class MDNodeMapper {
+ Mapper &M;
+
+ /// Data about a node in \a UniquedGraph.
+ struct Data {
+ bool HasChanged = false;
+ unsigned ID = std::numeric_limits<unsigned>::max();
+ TempMDNode Placeholder;
+ };
+
+ /// A graph of uniqued nodes.
+ struct UniquedGraph {
+ SmallDenseMap<const Metadata *, Data, 32> Info; // Node properties.
+ SmallVector<MDNode *, 16> POT; // Post-order traversal.
+
+ /// Propagate changed operands through the post-order traversal.
+ ///
+ /// Iteratively update \a Data::HasChanged for each node based on \a
+ /// Data::HasChanged of its operands, until fixed point.
+ void propagateChanges();
+
+ /// Get a forward reference to a node to use as an operand.
+ Metadata &getFwdReference(MDNode &Op);
+ };
+
+ /// Worklist of distinct nodes whose operands need to be remapped.
+ SmallVector<MDNode *, 16> DistinctWorklist;
+
+ // Storage for a UniquedGraph.
+ SmallDenseMap<const Metadata *, Data, 32> InfoStorage;
+ SmallVector<MDNode *, 16> POTStorage;
+
+public:
+ MDNodeMapper(Mapper &M) : M(M) {}
+
+ /// Map a metadata node (and its transitive operands).
+ ///
+ /// Map all the (unmapped) nodes in the subgraph under \c N. The iterative
+ /// algorithm handles distinct nodes and uniqued node subgraphs using
+ /// different strategies.
+ ///
+ /// Distinct nodes are immediately mapped and added to \a DistinctWorklist
+ /// using \a mapDistinctNode(). Their mapping can always be computed
+ /// immediately without visiting operands, even if their operands change.
+ ///
+ /// The mapping for uniqued nodes depends on whether their operands change.
+ /// \a mapTopLevelUniquedNode() traverses the transitive uniqued subgraph of
+ /// a node to calculate uniqued node mappings in bulk. Distinct leaves are
+ /// added to \a DistinctWorklist with \a mapDistinctNode().
+ ///
+ /// After mapping \c N itself, this function remaps the operands of the
+ /// distinct nodes in \a DistinctWorklist until the entire subgraph under \c
+ /// N has been mapped.
+ Metadata *map(const MDNode &N);
+
+private:
+ /// Map a top-level uniqued node and the uniqued subgraph underneath it.
+ ///
+ /// This builds up a post-order traversal of the (unmapped) uniqued subgraph
+ /// underneath \c FirstN and calculates the nodes' mapping. Each node uses
+ /// the identity mapping (\a Mapper::mapToSelf()) as long as all of its
+ /// operands use the identity mapping.
+ ///
+ /// The algorithm works as follows:
+ ///
+ /// 1. \a createPOT(): traverse the uniqued subgraph under \c FirstN and
+ /// save the post-order traversal in the given \a UniquedGraph, tracking
+ /// whether nodes' operands change.
+ ///
+ /// 2. \a UniquedGraph::propagateChanges(): propagate changed operands
+ /// through the \a UniquedGraph until fixed point, following the rule
+ /// that if a node changes, any node that references it must also change.
+ ///
+ /// 3. \a mapNodesInPOT(): map the uniqued nodes, creating new uniqued nodes
+ /// (referencing new operands) where necessary.
+ Metadata *mapTopLevelUniquedNode(const MDNode &FirstN);
+
+ /// Try to map the operand of an \a MDNode.
+ ///
+ /// If \c Op is already mapped, return the mapping. If it's not an \a
+ /// MDNode, compute and return the mapping. If it's a distinct \a MDNode,
+ /// return the result of \a mapDistinctNode().
+ ///
+ /// \return None if \c Op is an unmapped uniqued \a MDNode.
+ /// \post getMappedOp(Op) only returns None if this returns None.
+ Optional<Metadata *> tryToMapOperand(const Metadata *Op);
+
+ /// Map a distinct node.
+ ///
+ /// Return the mapping for the distinct node \c N, saving the result in \a
+ /// DistinctWorklist for later remapping.
+ ///
+ /// \pre \c N is not yet mapped.
+ /// \pre \c N.isDistinct().
+ MDNode *mapDistinctNode(const MDNode &N);
+
+ /// Get a previously mapped node.
+ Optional<Metadata *> getMappedOp(const Metadata *Op) const;
+
+ /// Create a post-order traversal of an unmapped uniqued node subgraph.
+ ///
+ /// This traverses the metadata graph deeply enough to map \c FirstN. It
+ /// uses \a tryToMapOperand() (via \a Mapper::mapSimpleMetadata()), so any
+ /// metadata that has already been mapped will not be part of the POT.
+ ///
+ /// Each node that has a changed operand from outside the graph (e.g., a
+ /// distinct node, an already-mapped uniqued node, or \a ConstantAsMetadata)
+ /// is marked with \a Data::HasChanged.
+ ///
+ /// \return \c true if any nodes in \c G have \a Data::HasChanged.
+ /// \post \c G.POT is a post-order traversal ending with \c FirstN.
+ /// \post \a Data::HasChanged in \c G.Info indicates whether any node needs
+ /// to change because of operands outside the graph.
+ bool createPOT(UniquedGraph &G, const MDNode &FirstN);
+
+ /// Visit the operands of a uniqued node in the POT.
+ ///
+ /// Visit the operands in the range from \c I to \c E, returning the first
+ /// uniqued node we find that isn't yet in \c G. \c I is always advanced to
+ /// where to continue the loop through the operands.
+ ///
+ /// This sets \c HasChanged if any of the visited operands change.
+ MDNode *visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged);
+
+ /// Map all the nodes in the given uniqued graph.
+ ///
+ /// This visits all the nodes in \c G in post-order, using the identity
+ /// mapping or creating a new node depending on \a Data::HasChanged.
+ ///
+ /// \pre \a getMappedOp() returns None for nodes in \c G, but not for any of
+ /// their operands outside of \c G.
+ /// \pre \a Data::HasChanged is true for a node in \c G iff any of its
+ /// operands have changed.
+ /// \post \a getMappedOp() returns the mapped node for every node in \c G.
+ void mapNodesInPOT(UniquedGraph &G);
+
+ /// Remap a node's operands using the given functor.
+ ///
+ /// Iterate through the operands of \c N and update them in place using \c
+ /// mapOperand.
+ ///
+ /// \pre N.isDistinct() or N.isTemporary().
+ template <class OperandMapper>
+ void remapOperands(MDNode &N, OperandMapper mapOperand);
+};
+
+} // end anonymous namespace
+
+Value *Mapper::mapValue(const Value *V) {
+ ValueToValueMapTy::iterator I = getVM().find(V);
+
+ // If the value already exists in the map, use it.
+ if (I != getVM().end()) {
+ assert(I->second && "Unexpected null mapping");
+ return I->second;
+ }
+
+ // If we have a materializer and it can materialize a value, use that.
+ if (auto *Materializer = getMaterializer()) {
+ if (Value *NewV = Materializer->materialize(const_cast<Value *>(V))) {
+ getVM()[V] = NewV;
+ return NewV;
+ }
+ }
+
+ // Global values do not need to be seeded into the VM if they
+ // are using the identity mapping.
+ if (isa<GlobalValue>(V)) {
+ if (Flags & RF_NullMapMissingGlobalValues)
+ return nullptr;
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(V)) {
+ // Inline asm may need *type* remapping.
+ FunctionType *NewTy = IA->getFunctionType();
+ if (TypeMapper) {
+ NewTy = cast<FunctionType>(TypeMapper->remapType(NewTy));
+
+ if (NewTy != IA->getFunctionType())
+ V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
+ IA->hasSideEffects(), IA->isAlignStack(),
+ IA->getDialect());
+ }
+
+ return getVM()[V] = const_cast<Value *>(V);
+ }
+
+ if (const auto *MDV = dyn_cast<MetadataAsValue>(V)) {
+ const Metadata *MD = MDV->getMetadata();
+
+ if (auto *LAM = dyn_cast<LocalAsMetadata>(MD)) {
+ // Look through to grab the local value.
+ if (Value *LV = mapValue(LAM->getValue())) {
+ if (V == LAM->getValue())
+ return const_cast<Value *>(V);
+ return MetadataAsValue::get(V->getContext(), ValueAsMetadata::get(LV));
+ }
+
+ // FIXME: always return nullptr once Verifier::verifyDominatesUse()
+ // ensures metadata operands only reference defined SSA values.
+ return (Flags & RF_IgnoreMissingLocals)
+ ? nullptr
+ : MetadataAsValue::get(V->getContext(),
+ MDTuple::get(V->getContext(), None));
+ }
+
+ // If this is a module-level metadata and we know that nothing at the module
+ // level is changing, then use an identity mapping.
+ if (Flags & RF_NoModuleLevelChanges)
+ return getVM()[V] = const_cast<Value *>(V);
+
+ // Map the metadata and turn it into a value.
+ auto *MappedMD = mapMetadata(MD);
+ if (MD == MappedMD)
+ return getVM()[V] = const_cast<Value *>(V);
+ return getVM()[V] = MetadataAsValue::get(V->getContext(), MappedMD);
+ }
+
+ // Okay, this must either be a constant (which may or may not be mappable) or
+ // something that is not in the mapping table.
+ Constant *C = const_cast<Constant*>(dyn_cast<Constant>(V));
+ if (!C)
+ return nullptr;
+
+ if (BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return mapBlockAddress(*BA);
+
+ auto mapValueOrNull = [this](Value *V) {
+ auto Mapped = mapValue(V);
+ assert((Mapped || (Flags & RF_NullMapMissingGlobalValues)) &&
+ "Unexpected null mapping for constant operand without "
+ "NullMapMissingGlobalValues flag");
+ return Mapped;
+ };
+
+ // Otherwise, we have some other constant to remap. Start by checking to see
+ // if all operands have an identity remapping.
+ unsigned OpNo = 0, NumOperands = C->getNumOperands();
+ Value *Mapped = nullptr;
+ for (; OpNo != NumOperands; ++OpNo) {
+ Value *Op = C->getOperand(OpNo);
+ Mapped = mapValueOrNull(Op);
+ if (!Mapped)
+ return nullptr;
+ if (Mapped != Op)
+ break;
+ }
+
+ // See if the type mapper wants to remap the type as well.
+ Type *NewTy = C->getType();
+ if (TypeMapper)
+ NewTy = TypeMapper->remapType(NewTy);
+
+ // If the result type and all operands match up, then just insert an identity
+ // mapping.
+ if (OpNo == NumOperands && NewTy == C->getType())
+ return getVM()[V] = C;
+
+ // Okay, we need to create a new constant. We've already processed some or
+ // all of the operands, set them all up now.
+ SmallVector<Constant*, 8> Ops;
+ Ops.reserve(NumOperands);
+ for (unsigned j = 0; j != OpNo; ++j)
+ Ops.push_back(cast<Constant>(C->getOperand(j)));
+
+ // If one of the operands mismatch, push it and the other mapped operands.
+ if (OpNo != NumOperands) {
+ Ops.push_back(cast<Constant>(Mapped));
+
+ // Map the rest of the operands that aren't processed yet.
+ for (++OpNo; OpNo != NumOperands; ++OpNo) {
+ Mapped = mapValueOrNull(C->getOperand(OpNo));
+ if (!Mapped)
+ return nullptr;
+ Ops.push_back(cast<Constant>(Mapped));
+ }
+ }
+ Type *NewSrcTy = nullptr;
+ if (TypeMapper)
+ if (auto *GEPO = dyn_cast<GEPOperator>(C))
+ NewSrcTy = TypeMapper->remapType(GEPO->getSourceElementType());
+
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C))
+ return getVM()[V] = CE->getWithOperands(Ops, NewTy, false, NewSrcTy);
+ if (isa<ConstantArray>(C))
+ return getVM()[V] = ConstantArray::get(cast<ArrayType>(NewTy), Ops);
+ if (isa<ConstantStruct>(C))
+ return getVM()[V] = ConstantStruct::get(cast<StructType>(NewTy), Ops);
+ if (isa<ConstantVector>(C))
+ return getVM()[V] = ConstantVector::get(Ops);
+ // If this is a no-operand constant, it must be because the type was remapped.
+ if (isa<UndefValue>(C))
+ return getVM()[V] = UndefValue::get(NewTy);
+ if (isa<ConstantAggregateZero>(C))
+ return getVM()[V] = ConstantAggregateZero::get(NewTy);
+ assert(isa<ConstantPointerNull>(C));
+ return getVM()[V] = ConstantPointerNull::get(cast<PointerType>(NewTy));
+}
+
+Value *Mapper::mapBlockAddress(const BlockAddress &BA) {
+ Function *F = cast<Function>(mapValue(BA.getFunction()));
+
+ // F may not have materialized its initializer. In that case, create a
+ // dummy basic block for now, and replace it once we've materialized all
+ // the initializers.
+ BasicBlock *BB;
+ if (F->empty()) {
+ DelayedBBs.push_back(DelayedBasicBlock(BA));
+ BB = DelayedBBs.back().TempBB.get();
+ } else {
+ BB = cast_or_null<BasicBlock>(mapValue(BA.getBasicBlock()));
+ }
+
+ return getVM()[&BA] = BlockAddress::get(F, BB ? BB : BA.getBasicBlock());
+}
+
+Metadata *Mapper::mapToMetadata(const Metadata *Key, Metadata *Val) {
+ getVM().MD()[Key].reset(Val);
+ return Val;
+}
+
+Metadata *Mapper::mapToSelf(const Metadata *MD) {
+ return mapToMetadata(MD, const_cast<Metadata *>(MD));
+}
+
+Optional<Metadata *> MDNodeMapper::tryToMapOperand(const Metadata *Op) {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.mapSimpleMetadata(Op)) {
+#ifndef NDEBUG
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ assert((!*MappedOp || M.getVM().count(CMD->getValue()) ||
+ M.getVM().getMappedMD(Op)) &&
+ "Expected Value to be memoized");
+ else
+ assert((isa<MDString>(Op) || M.getVM().getMappedMD(Op)) &&
+ "Expected result to be memoized");
+#endif
+ return *MappedOp;
+ }
+
+ const MDNode &N = *cast<MDNode>(Op);
+ if (N.isDistinct())
+ return mapDistinctNode(N);
+ return None;
+}
+
+static Metadata *cloneOrBuildODR(const MDNode &N) {
+ auto *CT = dyn_cast<DICompositeType>(&N);
+ // If ODR type uniquing is enabled, we would have uniqued composite types
+ // with identifiers during bitcode reading, so we can just use CT.
+ if (CT && CT->getContext().isODRUniquingDebugTypes() &&
+ CT->getIdentifier() != "")
+ return const_cast<DICompositeType *>(CT);
+ return MDNode::replaceWithDistinct(N.clone());
+}
+
+MDNode *MDNodeMapper::mapDistinctNode(const MDNode &N) {
+ assert(N.isDistinct() && "Expected a distinct node");
+ assert(!M.getVM().getMappedMD(&N) && "Expected an unmapped node");
+ DistinctWorklist.push_back(
+ cast<MDNode>((M.Flags & RF_MoveDistinctMDs)
+ ? M.mapToSelf(&N)
+ : M.mapToMetadata(&N, cloneOrBuildODR(N))));
+ return DistinctWorklist.back();
+}
+
+static ConstantAsMetadata *wrapConstantAsMetadata(const ConstantAsMetadata &CMD,
+ Value *MappedV) {
+ if (CMD.getValue() == MappedV)
+ return const_cast<ConstantAsMetadata *>(&CMD);
+ return MappedV ? ConstantAsMetadata::getConstant(MappedV) : nullptr;
+}
+
+Optional<Metadata *> MDNodeMapper::getMappedOp(const Metadata *Op) const {
+ if (!Op)
+ return nullptr;
+
+ if (Optional<Metadata *> MappedOp = M.getVM().getMappedMD(Op))
+ return *MappedOp;
+
+ if (isa<MDString>(Op))
+ return const_cast<Metadata *>(Op);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(Op))
+ return wrapConstantAsMetadata(*CMD, M.getVM().lookup(CMD->getValue()));
+
+ return None;
+}
+
+Metadata &MDNodeMapper::UniquedGraph::getFwdReference(MDNode &Op) {
+ auto Where = Info.find(&Op);
+ assert(Where != Info.end() && "Expected a valid reference");
+
+ auto &OpD = Where->second;
+ if (!OpD.HasChanged)
+ return Op;
+
+ // Lazily construct a temporary node.
+ if (!OpD.Placeholder)
+ OpD.Placeholder = Op.clone();
+
+ return *OpD.Placeholder;
+}
+
+template <class OperandMapper>
+void MDNodeMapper::remapOperands(MDNode &N, OperandMapper mapOperand) {
+ assert(!N.isUniqued() && "Expected distinct or temporary nodes");
+ for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) {
+ Metadata *Old = N.getOperand(I);
+ Metadata *New = mapOperand(Old);
+
+ if (Old != New)
+ N.replaceOperandWith(I, New);
+ }
+}
+
+namespace {
+
+/// An entry in the worklist for the post-order traversal.
+struct POTWorklistEntry {
+ MDNode *N; ///< Current node.
+ MDNode::op_iterator Op; ///< Current operand of \c N.
+
+ /// Keep a flag of whether operands have changed in the worklist to avoid
+ /// hitting the map in \a UniquedGraph.
+ bool HasChanged = false;
+
+ POTWorklistEntry(MDNode &N) : N(&N), Op(N.op_begin()) {}
+};
+
+} // end anonymous namespace
+
+bool MDNodeMapper::createPOT(UniquedGraph &G, const MDNode &FirstN) {
+ assert(G.Info.empty() && "Expected a fresh traversal");
+ assert(FirstN.isUniqued() && "Expected uniqued node in POT");
+
+ // Construct a post-order traversal of the uniqued subgraph under FirstN.
+ bool AnyChanges = false;
+ SmallVector<POTWorklistEntry, 16> Worklist;
+ Worklist.push_back(POTWorklistEntry(const_cast<MDNode &>(FirstN)));
+ (void)G.Info[&FirstN];
+ while (!Worklist.empty()) {
+ // Start or continue the traversal through this node's operands.
+ auto &WE = Worklist.back();
+ if (MDNode *N = visitOperands(G, WE.Op, WE.N->op_end(), WE.HasChanged)) {
+ // Push a new node to traverse first.
+ Worklist.push_back(POTWorklistEntry(*N));
+ continue;
+ }
+
+ // Push the node onto the POT.
+ assert(WE.N->isUniqued() && "Expected only uniqued nodes");
+ assert(WE.Op == WE.N->op_end() && "Expected to visit all operands");
+ auto &D = G.Info[WE.N];
+ AnyChanges |= D.HasChanged = WE.HasChanged;
+ D.ID = G.POT.size();
+ G.POT.push_back(WE.N);
+
+ // Pop the node off the worklist.
+ Worklist.pop_back();
+ }
+ return AnyChanges;
+}
+
+MDNode *MDNodeMapper::visitOperands(UniquedGraph &G, MDNode::op_iterator &I,
+ MDNode::op_iterator E, bool &HasChanged) {
+ while (I != E) {
+ Metadata *Op = *I++; // Increment even on early return.
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Op)) {
+ // Check if the operand changes.
+ HasChanged |= Op != *MappedOp;
+ continue;
+ }
+
+ // A uniqued metadata node.
+ MDNode &OpN = *cast<MDNode>(Op);
+ assert(OpN.isUniqued() &&
+ "Only uniqued operands cannot be mapped immediately");
+ if (G.Info.insert(std::make_pair(&OpN, Data())).second)
+ return &OpN; // This is a new one. Return it.
+ }
+ return nullptr;
+}
+
+void MDNodeMapper::UniquedGraph::propagateChanges() {
+ bool AnyChanges;
+ do {
+ AnyChanges = false;
+ for (MDNode *N : POT) {
+ auto &D = Info[N];
+ if (D.HasChanged)
+ continue;
+
+ if (llvm::none_of(N->operands(), [&](const Metadata *Op) {
+ auto Where = Info.find(Op);
+ return Where != Info.end() && Where->second.HasChanged;
+ }))
+ continue;
+
+ AnyChanges = D.HasChanged = true;
+ }
+ } while (AnyChanges);
+}
+
+void MDNodeMapper::mapNodesInPOT(UniquedGraph &G) {
+ // Construct uniqued nodes, building forward references as necessary.
+ SmallVector<MDNode *, 16> CyclicNodes;
+ for (auto *N : G.POT) {
+ auto &D = G.Info[N];
+ if (!D.HasChanged) {
+ // The node hasn't changed.
+ M.mapToSelf(N);
+ continue;
+ }
+
+ // Remember whether this node had a placeholder.
+ bool HadPlaceholder(D.Placeholder);
+
+ // Clone the uniqued node and remap the operands.
+ TempMDNode ClonedN = D.Placeholder ? std::move(D.Placeholder) : N->clone();
+ remapOperands(*ClonedN, [this, &D, &G](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = getMappedOp(Old))
+ return *MappedOp;
+ (void)D;
+ assert(G.Info[Old].ID > D.ID && "Expected a forward reference");
+ return &G.getFwdReference(*cast<MDNode>(Old));
+ });
+
+ auto *NewN = MDNode::replaceWithUniqued(std::move(ClonedN));
+ M.mapToMetadata(N, NewN);
+
+ // Nodes that were referenced out of order in the POT are involved in a
+ // uniquing cycle.
+ if (HadPlaceholder)
+ CyclicNodes.push_back(NewN);
+ }
+
+ // Resolve cycles.
+ for (auto *N : CyclicNodes)
+ if (!N->isResolved())
+ N->resolveCycles();
+}
+
+Metadata *MDNodeMapper::map(const MDNode &N) {
+ assert(DistinctWorklist.empty() && "MDNodeMapper::map is not recursive");
+ assert(!(M.Flags & RF_NoModuleLevelChanges) &&
+ "MDNodeMapper::map assumes module-level changes");
+
+ // Require resolved nodes whenever metadata might be remapped.
+ assert(N.isResolved() && "Unexpected unresolved node");
+
+ Metadata *MappedN =
+ N.isUniqued() ? mapTopLevelUniquedNode(N) : mapDistinctNode(N);
+ while (!DistinctWorklist.empty())
+ remapOperands(*DistinctWorklist.pop_back_val(), [this](Metadata *Old) {
+ if (Optional<Metadata *> MappedOp = tryToMapOperand(Old))
+ return *MappedOp;
+ return mapTopLevelUniquedNode(*cast<MDNode>(Old));
+ });
+ return MappedN;
+}
+
+Metadata *MDNodeMapper::mapTopLevelUniquedNode(const MDNode &FirstN) {
+ assert(FirstN.isUniqued() && "Expected uniqued node");
+
+ // Create a post-order traversal of uniqued nodes under FirstN.
+ UniquedGraph G;
+ if (!createPOT(G, FirstN)) {
+ // Return early if no nodes have changed.
+ for (const MDNode *N : G.POT)
+ M.mapToSelf(N);
+ return &const_cast<MDNode &>(FirstN);
+ }
+
+ // Update graph with all nodes that have changed.
+ G.propagateChanges();
+
+ // Map all the nodes in the graph.
+ mapNodesInPOT(G);
+
+ // Return the original node, remapped.
+ return *getMappedOp(&FirstN);
+}
+
+Optional<Metadata *> Mapper::mapSimpleMetadata(const Metadata *MD) {
+ // If the value already exists in the map, use it.
+ if (Optional<Metadata *> NewMD = getVM().getMappedMD(MD))
+ return *NewMD;
+
+ if (isa<MDString>(MD))
+ return const_cast<Metadata *>(MD);
+
+ // This is a module-level metadata. If nothing at the module level is
+ // changing, use an identity mapping.
+ if ((Flags & RF_NoModuleLevelChanges))
+ return const_cast<Metadata *>(MD);
+
+ if (auto *CMD = dyn_cast<ConstantAsMetadata>(MD)) {
+ // Don't memoize ConstantAsMetadata. Instead of lasting until the
+ // LLVMContext is destroyed, they can be deleted when the GlobalValue they
+ // reference is destructed. These aren't super common, so the extra
+ // indirection isn't that expensive.
+ return wrapConstantAsMetadata(*CMD, mapValue(CMD->getValue()));
+ }
+
+ assert(isa<MDNode>(MD) && "Expected a metadata node");
+
+ return None;
+}
+
+Metadata *Mapper::mapMetadata(const Metadata *MD) {
+ assert(MD && "Expected valid metadata");
+ assert(!isa<LocalAsMetadata>(MD) && "Unexpected local metadata");
+
+ if (Optional<Metadata *> NewMD = mapSimpleMetadata(MD))
+ return *NewMD;
+
+ return MDNodeMapper(*this).map(*cast<MDNode>(MD));
+}
+
+void Mapper::flush() {
+ // Flush out the worklist of global values.
+ while (!Worklist.empty()) {
+ WorklistEntry E = Worklist.pop_back_val();
+ CurrentMCID = E.MCID;
+ switch (E.Kind) {
+ case WorklistEntry::MapGlobalInit:
+ E.Data.GVInit.GV->setInitializer(mapConstant(E.Data.GVInit.Init));
+ remapGlobalObjectMetadata(*E.Data.GVInit.GV);
+ break;
+ case WorklistEntry::MapAppendingVar: {
+ unsigned PrefixSize = AppendingInits.size() - E.AppendingGVNumNewMembers;
       // A mapAppendingVariable call can change AppendingInits if the initializer
       // for the variable depends on another appending global; because of that,
       // the inits need to be extracted and updated before the call.
SmallVector<Constant *, 8> NewInits(
drop_begin(AppendingInits, PrefixSize));
AppendingInits.resize(PrefixSize);
- mapAppendingVariable(*E.Data.AppendingGV.GV,
- E.Data.AppendingGV.InitPrefix,
+ mapAppendingVariable(*E.Data.AppendingGV.GV,
+ E.Data.AppendingGV.InitPrefix,
E.AppendingGVIsOldCtorDtor, makeArrayRef(NewInits));
- break;
- }
- case WorklistEntry::MapGlobalIndirectSymbol:
- E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol(
- mapConstant(E.Data.GlobalIndirectSymbol.Target));
- break;
- case WorklistEntry::RemapFunction:
- remapFunction(*E.Data.RemapF);
- break;
- }
- }
- CurrentMCID = 0;
-
- // Finish logic for block addresses now that all global values have been
- // handled.
- while (!DelayedBBs.empty()) {
- DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
- BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
- DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
- }
-}
-
-void Mapper::remapInstruction(Instruction *I) {
- // Remap operands.
- for (Use &Op : I->operands()) {
- Value *V = mapValue(Op);
- // If we aren't ignoring missing entries, assert that something happened.
- if (V)
- Op = V;
- else
- assert((Flags & RF_IgnoreMissingLocals) &&
- "Referenced value not in value map!");
- }
-
- // Remap phi nodes' incoming blocks.
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- Value *V = mapValue(PN->getIncomingBlock(i));
- // If we aren't ignoring missing entries, assert that something happened.
- if (V)
- PN->setIncomingBlock(i, cast<BasicBlock>(V));
- else
- assert((Flags & RF_IgnoreMissingLocals) &&
- "Referenced block not in value map!");
- }
- }
-
- // Remap attached metadata.
- SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
- I->getAllMetadata(MDs);
- for (const auto &MI : MDs) {
- MDNode *Old = MI.second;
- MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
- if (New != Old)
- I->setMetadata(MI.first, New);
- }
-
- if (!TypeMapper)
- return;
-
- // If the instruction's type is being remapped, do so now.
- if (auto *CB = dyn_cast<CallBase>(I)) {
- SmallVector<Type *, 3> Tys;
- FunctionType *FTy = CB->getFunctionType();
- Tys.reserve(FTy->getNumParams());
- for (Type *Ty : FTy->params())
- Tys.push_back(TypeMapper->remapType(Ty));
- CB->mutateFunctionType(FunctionType::get(
- TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
-
- LLVMContext &C = CB->getContext();
- AttributeList Attrs = CB->getAttributes();
- for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
+ break;
+ }
+ case WorklistEntry::MapGlobalIndirectSymbol:
+ E.Data.GlobalIndirectSymbol.GIS->setIndirectSymbol(
+ mapConstant(E.Data.GlobalIndirectSymbol.Target));
+ break;
+ case WorklistEntry::RemapFunction:
+ remapFunction(*E.Data.RemapF);
+ break;
+ }
+ }
+ CurrentMCID = 0;
+
+ // Finish logic for block addresses now that all global values have been
+ // handled.
+ while (!DelayedBBs.empty()) {
+ DelayedBasicBlock DBB = DelayedBBs.pop_back_val();
+ BasicBlock *BB = cast_or_null<BasicBlock>(mapValue(DBB.OldBB));
+ DBB.TempBB->replaceAllUsesWith(BB ? BB : DBB.OldBB);
+ }
+}
+
+void Mapper::remapInstruction(Instruction *I) {
+ // Remap operands.
+ for (Use &Op : I->operands()) {
+ Value *V = mapValue(Op);
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ Op = V;
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced value not in value map!");
+ }
+
+ // Remap phi nodes' incoming blocks.
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = mapValue(PN->getIncomingBlock(i));
+ // If we aren't ignoring missing entries, assert that something happened.
+ if (V)
+ PN->setIncomingBlock(i, cast<BasicBlock>(V));
+ else
+ assert((Flags & RF_IgnoreMissingLocals) &&
+ "Referenced block not in value map!");
+ }
+ }
+
+ // Remap attached metadata.
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ I->getAllMetadata(MDs);
+ for (const auto &MI : MDs) {
+ MDNode *Old = MI.second;
+ MDNode *New = cast_or_null<MDNode>(mapMetadata(Old));
+ if (New != Old)
+ I->setMetadata(MI.first, New);
+ }
+
+ if (!TypeMapper)
+ return;
+
+ // If the instruction's type is being remapped, do so now.
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ SmallVector<Type *, 3> Tys;
+ FunctionType *FTy = CB->getFunctionType();
+ Tys.reserve(FTy->getNumParams());
+ for (Type *Ty : FTy->params())
+ Tys.push_back(TypeMapper->remapType(Ty));
+ CB->mutateFunctionType(FunctionType::get(
+ TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
+
+ LLVMContext &C = CB->getContext();
+ AttributeList Attrs = CB->getAttributes();
+ for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
for (Attribute::AttrKind TypedAttr :
{Attribute::ByVal, Attribute::StructRet, Attribute::ByRef}) {
if (Type *Ty = Attrs.getAttribute(i, TypedAttr).getValueAsType()) {
@@ -908,234 +908,234 @@ void Mapper::remapInstruction(Instruction *I) {
TypeMapper->remapType(Ty));
break;
}
- }
- }
- CB->setAttributes(Attrs);
- return;
- }
- if (auto *AI = dyn_cast<AllocaInst>(I))
- AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType()));
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- GEP->setSourceElementType(
- TypeMapper->remapType(GEP->getSourceElementType()));
- GEP->setResultElementType(
- TypeMapper->remapType(GEP->getResultElementType()));
- }
- I->mutateType(TypeMapper->remapType(I->getType()));
-}
-
-void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
- SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
- GO.getAllMetadata(MDs);
- GO.clearMetadata();
- for (const auto &I : MDs)
- GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
-}
-
-void Mapper::remapFunction(Function &F) {
- // Remap the operands.
- for (Use &Op : F.operands())
- if (Op)
- Op = mapValue(Op);
-
- // Remap the metadata attachments.
- remapGlobalObjectMetadata(F);
-
- // Remap the argument types.
- if (TypeMapper)
- for (Argument &A : F.args())
- A.mutateType(TypeMapper->remapType(A.getType()));
-
- // Remap the instructions.
- for (BasicBlock &BB : F)
- for (Instruction &I : BB)
- remapInstruction(&I);
-}
-
-void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers) {
- SmallVector<Constant *, 16> Elements;
- if (InitPrefix) {
- unsigned NumElements =
- cast<ArrayType>(InitPrefix->getType())->getNumElements();
- for (unsigned I = 0; I != NumElements; ++I)
- Elements.push_back(InitPrefix->getAggregateElement(I));
- }
-
- PointerType *VoidPtrTy;
- Type *EltTy;
- if (IsOldCtorDtor) {
- // FIXME: This upgrade is done during linking to support the C API. See
- // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
- VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
- auto &ST = *cast<StructType>(NewMembers.front()->getType());
- Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
- EltTy = StructType::get(GV.getContext(), Tys, false);
- }
-
- for (auto *V : NewMembers) {
- Constant *NewV;
- if (IsOldCtorDtor) {
- auto *S = cast<ConstantStruct>(V);
- auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
- auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
- Constant *Null = Constant::getNullValue(VoidPtrTy);
- NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
- } else {
- NewV = cast_or_null<Constant>(mapValue(V));
- }
- Elements.push_back(NewV);
- }
-
- GV.setInitializer(ConstantArray::get(
- cast<ArrayType>(GV.getType()->getElementType()), Elements));
-}
-
-void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
- unsigned MCID) {
- assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapGlobalInit;
- WE.MCID = MCID;
- WE.Data.GVInit.GV = &GV;
- WE.Data.GVInit.Init = &Init;
- Worklist.push_back(WE);
-}
-
-void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
- Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID) {
- assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapAppendingVar;
- WE.MCID = MCID;
- WE.Data.AppendingGV.GV = &GV;
- WE.Data.AppendingGV.InitPrefix = InitPrefix;
- WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
- WE.AppendingGVNumNewMembers = NewMembers.size();
- Worklist.push_back(WE);
- AppendingInits.append(NewMembers.begin(), NewMembers.end());
-}
-
-void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target, unsigned MCID) {
- assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::MapGlobalIndirectSymbol;
- WE.MCID = MCID;
- WE.Data.GlobalIndirectSymbol.GIS = &GIS;
- WE.Data.GlobalIndirectSymbol.Target = &Target;
- Worklist.push_back(WE);
-}
-
-void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
- assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
- assert(MCID < MCs.size() && "Invalid mapping context");
-
- WorklistEntry WE;
- WE.Kind = WorklistEntry::RemapFunction;
- WE.MCID = MCID;
- WE.Data.RemapF = &F;
- Worklist.push_back(WE);
-}
-
-void Mapper::addFlags(RemapFlags Flags) {
- assert(!hasWorkToDo() && "Expected to have flushed the worklist");
- this->Flags = this->Flags | Flags;
-}
-
-static Mapper *getAsMapper(void *pImpl) {
- return reinterpret_cast<Mapper *>(pImpl);
-}
-
-namespace {
-
-class FlushingMapper {
- Mapper &M;
-
-public:
- explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
- assert(!M.hasWorkToDo() && "Expected to be flushed");
- }
-
- ~FlushingMapper() { M.flush(); }
-
- Mapper *operator->() const { return &M; }
-};
-
-} // end anonymous namespace
-
-ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
- ValueMapTypeRemapper *TypeMapper,
- ValueMaterializer *Materializer)
- : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
-
-ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
-
-unsigned
-ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
- ValueMaterializer *Materializer) {
- return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
-}
-
-void ValueMapper::addFlags(RemapFlags Flags) {
- FlushingMapper(pImpl)->addFlags(Flags);
-}
-
-Value *ValueMapper::mapValue(const Value &V) {
- return FlushingMapper(pImpl)->mapValue(&V);
-}
-
-Constant *ValueMapper::mapConstant(const Constant &C) {
- return cast_or_null<Constant>(mapValue(C));
-}
-
-Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
- return FlushingMapper(pImpl)->mapMetadata(&MD);
-}
-
-MDNode *ValueMapper::mapMDNode(const MDNode &N) {
- return cast_or_null<MDNode>(mapMetadata(N));
-}
-
-void ValueMapper::remapInstruction(Instruction &I) {
- FlushingMapper(pImpl)->remapInstruction(&I);
-}
-
-void ValueMapper::remapFunction(Function &F) {
- FlushingMapper(pImpl)->remapFunction(F);
-}
-
-void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
- Constant &Init,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
-}
-
-void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
- Constant *InitPrefix,
- bool IsOldCtorDtor,
- ArrayRef<Constant *> NewMembers,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapAppendingVariable(
- GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
-}
-
-void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
- Constant &Target,
- unsigned MCID) {
- getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID);
-}
-
-void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
- getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
-}
+ }
+ }
+ CB->setAttributes(Attrs);
+ return;
+ }
+ if (auto *AI = dyn_cast<AllocaInst>(I))
+ AI->setAllocatedType(TypeMapper->remapType(AI->getAllocatedType()));
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ GEP->setSourceElementType(
+ TypeMapper->remapType(GEP->getSourceElementType()));
+ GEP->setResultElementType(
+ TypeMapper->remapType(GEP->getResultElementType()));
+ }
+ I->mutateType(TypeMapper->remapType(I->getType()));
+}
+
+void Mapper::remapGlobalObjectMetadata(GlobalObject &GO) {
+ SmallVector<std::pair<unsigned, MDNode *>, 8> MDs;
+ GO.getAllMetadata(MDs);
+ GO.clearMetadata();
+ for (const auto &I : MDs)
+ GO.addMetadata(I.first, *cast<MDNode>(mapMetadata(I.second)));
+}
+
+void Mapper::remapFunction(Function &F) {
+ // Remap the operands.
+ for (Use &Op : F.operands())
+ if (Op)
+ Op = mapValue(Op);
+
+ // Remap the metadata attachments.
+ remapGlobalObjectMetadata(F);
+
+ // Remap the argument types.
+ if (TypeMapper)
+ for (Argument &A : F.args())
+ A.mutateType(TypeMapper->remapType(A.getType()));
+
+ // Remap the instructions.
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ remapInstruction(&I);
+}
+
+void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers) {
+ SmallVector<Constant *, 16> Elements;
+ if (InitPrefix) {
+ unsigned NumElements =
+ cast<ArrayType>(InitPrefix->getType())->getNumElements();
+ for (unsigned I = 0; I != NumElements; ++I)
+ Elements.push_back(InitPrefix->getAggregateElement(I));
+ }
+
+ PointerType *VoidPtrTy;
+ Type *EltTy;
+ if (IsOldCtorDtor) {
+ // FIXME: This upgrade is done during linking to support the C API. See
+ // also IRLinker::linkAppendingVarProto() in IRMover.cpp.
+ VoidPtrTy = Type::getInt8Ty(GV.getContext())->getPointerTo();
+ auto &ST = *cast<StructType>(NewMembers.front()->getType());
+ Type *Tys[3] = {ST.getElementType(0), ST.getElementType(1), VoidPtrTy};
+ EltTy = StructType::get(GV.getContext(), Tys, false);
+ }
+
+ for (auto *V : NewMembers) {
+ Constant *NewV;
+ if (IsOldCtorDtor) {
+ auto *S = cast<ConstantStruct>(V);
+ auto *E1 = cast<Constant>(mapValue(S->getOperand(0)));
+ auto *E2 = cast<Constant>(mapValue(S->getOperand(1)));
+ Constant *Null = Constant::getNullValue(VoidPtrTy);
+ NewV = ConstantStruct::get(cast<StructType>(EltTy), E1, E2, Null);
+ } else {
+ NewV = cast_or_null<Constant>(mapValue(V));
+ }
+ Elements.push_back(NewV);
+ }
+
+ GV.setInitializer(ConstantArray::get(
+ cast<ArrayType>(GV.getType()->getElementType()), Elements));
+}
+
+void Mapper::scheduleMapGlobalInitializer(GlobalVariable &GV, Constant &Init,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalInit;
+ WE.MCID = MCID;
+ WE.Data.GVInit.GV = &GV;
+ WE.Data.GVInit.Init = &Init;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GV).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapAppendingVar;
+ WE.MCID = MCID;
+ WE.Data.AppendingGV.GV = &GV;
+ WE.Data.AppendingGV.InitPrefix = InitPrefix;
+ WE.AppendingGVIsOldCtorDtor = IsOldCtorDtor;
+ WE.AppendingGVNumNewMembers = NewMembers.size();
+ Worklist.push_back(WE);
+ AppendingInits.append(NewMembers.begin(), NewMembers.end());
+}
+
+void Mapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
+ Constant &Target, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&GIS).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::MapGlobalIndirectSymbol;
+ WE.MCID = MCID;
+ WE.Data.GlobalIndirectSymbol.GIS = &GIS;
+ WE.Data.GlobalIndirectSymbol.Target = &Target;
+ Worklist.push_back(WE);
+}
+
+void Mapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ assert(AlreadyScheduled.insert(&F).second && "Should not reschedule");
+ assert(MCID < MCs.size() && "Invalid mapping context");
+
+ WorklistEntry WE;
+ WE.Kind = WorklistEntry::RemapFunction;
+ WE.MCID = MCID;
+ WE.Data.RemapF = &F;
+ Worklist.push_back(WE);
+}
+
+void Mapper::addFlags(RemapFlags Flags) {
+ assert(!hasWorkToDo() && "Expected to have flushed the worklist");
+ this->Flags = this->Flags | Flags;
+}
+
+static Mapper *getAsMapper(void *pImpl) {
+ return reinterpret_cast<Mapper *>(pImpl);
+}
+
+namespace {
+
+class FlushingMapper {
+ Mapper &M;
+
+public:
+ explicit FlushingMapper(void *pImpl) : M(*getAsMapper(pImpl)) {
+ assert(!M.hasWorkToDo() && "Expected to be flushed");
+ }
+
+ ~FlushingMapper() { M.flush(); }
+
+ Mapper *operator->() const { return &M; }
+};
+
+} // end anonymous namespace
+
+ValueMapper::ValueMapper(ValueToValueMapTy &VM, RemapFlags Flags,
+ ValueMapTypeRemapper *TypeMapper,
+ ValueMaterializer *Materializer)
+ : pImpl(new Mapper(VM, Flags, TypeMapper, Materializer)) {}
+
+ValueMapper::~ValueMapper() { delete getAsMapper(pImpl); }
+
+unsigned
+ValueMapper::registerAlternateMappingContext(ValueToValueMapTy &VM,
+ ValueMaterializer *Materializer) {
+ return getAsMapper(pImpl)->registerAlternateMappingContext(VM, Materializer);
+}
+
+void ValueMapper::addFlags(RemapFlags Flags) {
+ FlushingMapper(pImpl)->addFlags(Flags);
+}
+
+Value *ValueMapper::mapValue(const Value &V) {
+ return FlushingMapper(pImpl)->mapValue(&V);
+}
+
+Constant *ValueMapper::mapConstant(const Constant &C) {
+ return cast_or_null<Constant>(mapValue(C));
+}
+
+Metadata *ValueMapper::mapMetadata(const Metadata &MD) {
+ return FlushingMapper(pImpl)->mapMetadata(&MD);
+}
+
+MDNode *ValueMapper::mapMDNode(const MDNode &N) {
+ return cast_or_null<MDNode>(mapMetadata(N));
+}
+
+void ValueMapper::remapInstruction(Instruction &I) {
+ FlushingMapper(pImpl)->remapInstruction(&I);
+}
+
+void ValueMapper::remapFunction(Function &F) {
+ FlushingMapper(pImpl)->remapFunction(F);
+}
+
+void ValueMapper::scheduleMapGlobalInitializer(GlobalVariable &GV,
+ Constant &Init,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalInitializer(GV, Init, MCID);
+}
+
+void ValueMapper::scheduleMapAppendingVariable(GlobalVariable &GV,
+ Constant *InitPrefix,
+ bool IsOldCtorDtor,
+ ArrayRef<Constant *> NewMembers,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapAppendingVariable(
+ GV, InitPrefix, IsOldCtorDtor, NewMembers, MCID);
+}
+
+void ValueMapper::scheduleMapGlobalIndirectSymbol(GlobalIndirectSymbol &GIS,
+ Constant &Target,
+ unsigned MCID) {
+ getAsMapper(pImpl)->scheduleMapGlobalIndirectSymbol(GIS, Target, MCID);
+}
+
+void ValueMapper::scheduleRemapFunction(Function &F, unsigned MCID) {
+ getAsMapper(pImpl)->scheduleRemapFunction(F, MCID);
+}
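
The ValueMapper entry points above share one idiom worth noting: the public class keeps only an opaque pImpl pointer, and every call that may leave scheduled work behind goes through the small FlushingMapper RAII wrapper, whose destructor flushes the Mapper's worklist before control returns to the caller. Below is a minimal standalone sketch of that flush-on-destruction idiom; the names (WorklistImpl, FlushingHandle, schedule, map) are hypothetical and are not part of the file diffed above.

// Illustrative sketch only -- hypothetical names, not code from ValueMapper.cpp.
// It shows the same idiom as FlushingMapper: a tiny RAII wrapper that forwards
// calls to an implementation object and flushes its pending work on scope exit.
#include <cassert>
#include <iostream>
#include <vector>

class WorklistImpl {
  std::vector<int> Pending;

public:
  void schedule(int Item) { Pending.push_back(Item); }
  bool hasWorkToDo() const { return !Pending.empty(); }
  void flush() {
    for (int Item : Pending)
      std::cout << "processed " << Item << "\n";
    Pending.clear();
  }
  int map(int Value) { return Value + 1; } // stand-in for mapValue()
};

class FlushingHandle {
  WorklistImpl &Impl;

public:
  explicit FlushingHandle(WorklistImpl &I) : Impl(I) {
    assert(!Impl.hasWorkToDo() && "expected to start flushed");
  }
  ~FlushingHandle() { Impl.flush(); } // drain pending work on scope exit
  WorklistImpl *operator->() const { return &Impl; }
};

int main() {
  WorklistImpl Impl;
  // Work scheduled through the temporary handle is flushed as soon as the
  // full expression ends, before the caller continues.
  FlushingHandle(Impl)->schedule(7); // prints "processed 7" when the handle dies
  int Mapped = FlushingHandle(Impl)->map(41);
  std::cout << "mapped to " << Mapped << "\n";
  return 0;
}

Because the wrapper is a temporary, its destructor runs at the end of the full expression, so any work scheduled during the forwarded call is flushed before the result is handed back; this is why the forwarding one-liners in the ValueMapper methods above are sufficient.
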
diff --git a/contrib/libs/llvm12/lib/Transforms/Utils/ya.make b/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
index f7869c85cd..c07d5d6db6 100644
--- a/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Utils/ya.make
@@ -1,104 +1,104 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Utils
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- AMDGPUEmitPrintf.cpp
- ASanStackFrameLayout.cpp
- AddDiscriminators.cpp
- AssumeBundleBuilder.cpp
- BasicBlockUtils.cpp
- BreakCriticalEdges.cpp
- BuildLibCalls.cpp
- BypassSlowDivision.cpp
- CallGraphUpdater.cpp
- CallPromotionUtils.cpp
- CanonicalizeAliases.cpp
- CanonicalizeFreezeInLoops.cpp
- CloneFunction.cpp
- CloneModule.cpp
- CodeExtractor.cpp
- CodeMoverUtils.cpp
- CtorUtils.cpp
- Debugify.cpp
- DemoteRegToStack.cpp
- EntryExitInstrumenter.cpp
- EscapeEnumerator.cpp
- Evaluator.cpp
- FixIrreducible.cpp
- FlattenCFG.cpp
- FunctionComparator.cpp
- FunctionImportUtils.cpp
- GlobalStatus.cpp
- GuardUtils.cpp
- InjectTLIMappings.cpp
- InlineFunction.cpp
- InstructionNamer.cpp
- IntegerDivision.cpp
- LCSSA.cpp
- LibCallsShrinkWrap.cpp
- Local.cpp
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ AMDGPUEmitPrintf.cpp
+ ASanStackFrameLayout.cpp
+ AddDiscriminators.cpp
+ AssumeBundleBuilder.cpp
+ BasicBlockUtils.cpp
+ BreakCriticalEdges.cpp
+ BuildLibCalls.cpp
+ BypassSlowDivision.cpp
+ CallGraphUpdater.cpp
+ CallPromotionUtils.cpp
+ CanonicalizeAliases.cpp
+ CanonicalizeFreezeInLoops.cpp
+ CloneFunction.cpp
+ CloneModule.cpp
+ CodeExtractor.cpp
+ CodeMoverUtils.cpp
+ CtorUtils.cpp
+ Debugify.cpp
+ DemoteRegToStack.cpp
+ EntryExitInstrumenter.cpp
+ EscapeEnumerator.cpp
+ Evaluator.cpp
+ FixIrreducible.cpp
+ FlattenCFG.cpp
+ FunctionComparator.cpp
+ FunctionImportUtils.cpp
+ GlobalStatus.cpp
+ GuardUtils.cpp
+ InjectTLIMappings.cpp
+ InlineFunction.cpp
+ InstructionNamer.cpp
+ IntegerDivision.cpp
+ LCSSA.cpp
+ LibCallsShrinkWrap.cpp
+ Local.cpp
LoopPeel.cpp
- LoopRotationUtils.cpp
- LoopSimplify.cpp
- LoopUnroll.cpp
- LoopUnrollAndJam.cpp
- LoopUnrollRuntime.cpp
- LoopUtils.cpp
- LoopVersioning.cpp
- LowerInvoke.cpp
- LowerMemIntrinsics.cpp
- LowerSwitch.cpp
+ LoopRotationUtils.cpp
+ LoopSimplify.cpp
+ LoopUnroll.cpp
+ LoopUnrollAndJam.cpp
+ LoopUnrollRuntime.cpp
+ LoopUtils.cpp
+ LoopVersioning.cpp
+ LowerInvoke.cpp
+ LowerMemIntrinsics.cpp
+ LowerSwitch.cpp
MatrixUtils.cpp
- Mem2Reg.cpp
- MetaRenamer.cpp
- ModuleUtils.cpp
- NameAnonGlobals.cpp
- PredicateInfo.cpp
- PromoteMemoryToRegister.cpp
- SSAUpdater.cpp
- SSAUpdaterBulk.cpp
- SanitizerStats.cpp
- ScalarEvolutionExpander.cpp
- SimplifyCFG.cpp
- SimplifyIndVar.cpp
- SimplifyLibCalls.cpp
- SizeOpts.cpp
- SplitModule.cpp
- StripGCRelocates.cpp
- StripNonLineTableDebugInfo.cpp
- SymbolRewriter.cpp
- UnifyFunctionExitNodes.cpp
- UnifyLoopExits.cpp
- UniqueInternalLinkageNames.cpp
- Utils.cpp
- VNCoercion.cpp
- ValueMapper.cpp
-)
-
-END()
+ Mem2Reg.cpp
+ MetaRenamer.cpp
+ ModuleUtils.cpp
+ NameAnonGlobals.cpp
+ PredicateInfo.cpp
+ PromoteMemoryToRegister.cpp
+ SSAUpdater.cpp
+ SSAUpdaterBulk.cpp
+ SanitizerStats.cpp
+ ScalarEvolutionExpander.cpp
+ SimplifyCFG.cpp
+ SimplifyIndVar.cpp
+ SimplifyLibCalls.cpp
+ SizeOpts.cpp
+ SplitModule.cpp
+ StripGCRelocates.cpp
+ StripNonLineTableDebugInfo.cpp
+ SymbolRewriter.cpp
+ UnifyFunctionExitNodes.cpp
+ UnifyLoopExits.cpp
+ UniqueInternalLinkageNames.cpp
+ Utils.cpp
+ VNCoercion.cpp
+ ValueMapper.cpp
+)
+
+END()
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7abf30b46c..6ec5590d76 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -1,1315 +1,1315 @@
-//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass merges loads/stores to/from sequential memory addresses into vector
-// loads/stores. Although there's nothing GPU-specific in here, this pass is
-// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
-//
-// (For simplicity below we talk about loads only, but everything also applies
-// to stores.)
-//
-// This pass is intended to be run late in the pipeline, after other
-// vectorization opportunities have been exploited. So the assumption here is
-// that immediately following our new vector load we'll need to extract out the
-// individual elements of the load, so we can operate on them individually.
-//
-// On CPUs this transformation is usually not beneficial, because extracting the
-// elements of a vector register is expensive on most architectures. It's
-// usually better just to load each element individually into its own scalar
-// register.
-//
-// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
-// "vector load" loads directly into a series of scalar registers. In effect,
-// extracting the elements of the vector is free. It's therefore always
-// beneficial to vectorize a sequence of loads on these architectures.
-//
-// Vectorizing (perhaps a better name might be "coalescing") loads can have
-// large performance impacts on GPU kernels, and opportunities for vectorizing
-// are common in GPU code. This pass tries very hard to find such
-// opportunities; its runtime is quadratic in the number of loads in a BB.
-//
-// Some CPU architectures, such as ARM, have instructions that load into
-// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
-// could use this pass (with some modifications), but currently it implements
-// its own pass to do something similar to what we do here.
-
-#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdlib>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "load-store-vectorizer"
-
-STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
-STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
-
-// FIXME: Assuming stack alignment of 4 is always good enough
-static const unsigned StackAdjustedAlignment = 4;
-
-namespace {
-
-/// ChainID is an arbitrary token that is allowed to be different only for the
-/// accesses that are guaranteed to be considered non-consecutive by
-/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
-/// together and reducing the number of instructions the main search operates on
-/// at a time, i.e. this is to reduce compile time and nothing else as the main
-/// search has O(n^2) time complexity. The underlying type of ChainID should not
-/// be relied upon.
-using ChainID = const Value *;
-using InstrList = SmallVector<Instruction *, 8>;
-using InstrListMap = MapVector<ChainID, InstrList>;
-
-class Vectorizer {
- Function &F;
- AliasAnalysis &AA;
- DominatorTree &DT;
- ScalarEvolution &SE;
- TargetTransformInfo &TTI;
- const DataLayout &DL;
- IRBuilder<> Builder;
-
-public:
- Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
- ScalarEvolution &SE, TargetTransformInfo &TTI)
- : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
- DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
-
- bool run();
-
-private:
- unsigned getPointerAddressSpace(Value *I);
-
- static const unsigned MaxDepth = 3;
-
- bool isConsecutiveAccess(Value *A, Value *B);
- bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth = 0) const;
- bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
- unsigned Depth) const;
- bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
- unsigned Depth) const;
-
- /// After vectorization, reorder the instructions that I depends on
- /// (the instructions defining its operands), to ensure they dominate I.
- void reorder(Instruction *I);
-
- /// Returns the first and the last instructions in Chain.
- std::pair<BasicBlock::iterator, BasicBlock::iterator>
- getBoundaryInstrs(ArrayRef<Instruction *> Chain);
-
- /// Erases the original instructions after vectorizing.
- void eraseInstructions(ArrayRef<Instruction *> Chain);
-
- /// "Legalize" the vector type that would be produced by combining \p
- /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
- /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
- /// expected to have more than 4 elements.
- std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
- splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
-
- /// Finds the largest prefix of Chain that's vectorizable, checking for
- /// intervening instructions which may affect the memory accessed by the
- /// instructions within Chain.
- ///
- /// The elements of \p Chain must be all loads or all stores and must be in
- /// address order.
- ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
-
- /// Collects load and store instructions to vectorize.
- std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
-
- /// Processes the collected instructions, the \p Map. The values of \p Map
- /// should be all loads or all stores.
- bool vectorizeChains(InstrListMap &Map);
-
- /// Finds the load/stores to consecutive memory addresses and vectorizes them.
- bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
-
- /// Vectorizes the load instructions in Chain.
- bool
- vectorizeLoadChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Vectorizes the store instructions in Chain.
- bool
- vectorizeStoreChain(ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
-
- /// Check if this load/store access is misaligned.
- bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- unsigned Alignment);
-};
-
-class LoadStoreVectorizerLegacyPass : public FunctionPass {
-public:
- static char ID;
-
- LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
- initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override;
-
- StringRef getPassName() const override {
- return "GPU Load and Store Vectorizer";
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-char LoadStoreVectorizerLegacyPass::ID = 0;
-
-INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
- "Vectorize load and Store instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
- "Vectorize load and store instructions", false, false)
-
-Pass *llvm::createLoadStoreVectorizerPass() {
- return new LoadStoreVectorizerLegacyPass();
-}
-
-bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
-
- AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- TargetTransformInfo &TTI =
- getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-
- Vectorizer V(F, AA, DT, SE, TTI);
- return V.run();
-}
-
-PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return PreservedAnalyses::all();
-
- AliasAnalysis &AA = AM.getResult<AAManager>(F);
- DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
- ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
-
- Vectorizer V(F, AA, DT, SE, TTI);
- bool Changed = V.run();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- return Changed ? PA : PreservedAnalyses::all();
-}
-
-// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
-// vectors of Instructions.
-static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
- SmallVector<Value *, 8> VL(IL.begin(), IL.end());
- propagateMetadata(I, VL);
-}
-
-// Vectorizer Implementation
-bool Vectorizer::run() {
- bool Changed = false;
-
- // Scan the blocks in the function in post order.
- for (BasicBlock *BB : post_order(&F)) {
- InstrListMap LoadRefs, StoreRefs;
- std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
- Changed |= vectorizeChains(LoadRefs);
- Changed |= vectorizeChains(StoreRefs);
- }
-
- return Changed;
-}
-
-unsigned Vectorizer::getPointerAddressSpace(Value *I) {
- if (LoadInst *L = dyn_cast<LoadInst>(I))
- return L->getPointerAddressSpace();
- if (StoreInst *S = dyn_cast<StoreInst>(I))
- return S->getPointerAddressSpace();
- return -1;
-}
-
-// FIXME: Merge with llvm::isConsecutiveAccess
-bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
- Value *PtrA = getLoadStorePointerOperand(A);
- Value *PtrB = getLoadStorePointerOperand(B);
- unsigned ASA = getPointerAddressSpace(A);
- unsigned ASB = getPointerAddressSpace(B);
-
- // Check that the address spaces match and that the pointers are valid.
- if (!PtrA || !PtrB || (ASA != ASB))
- return false;
-
- // Make sure that A and B are different pointers of the same size type.
- Type *PtrATy = PtrA->getType()->getPointerElementType();
- Type *PtrBTy = PtrB->getType()->getPointerElementType();
- if (PtrA == PtrB ||
- PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
- DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
- DL.getTypeStoreSize(PtrATy->getScalarType()) !=
- DL.getTypeStoreSize(PtrBTy->getScalarType()))
- return false;
-
- unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
- APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
-
- return areConsecutivePointers(PtrA, PtrB, Size);
-}
-
-bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
- APInt PtrDelta, unsigned Depth) const {
- unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
- APInt OffsetA(PtrBitWidth, 0);
- APInt OffsetB(PtrBitWidth, 0);
- PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
- PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
-
- unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
-
- if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
- return false;
-
- // In case we have to shrink the pointer,
- // stripAndAccumulateInBoundsConstantOffsets should properly handle a
- // possible overflow, and the value should fit into the smallest data type
- // used in the cast/gep chain.
- assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
- OffsetB.getMinSignedBits() <= NewPtrBitWidth);
-
- OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
- OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
- PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
-
- APInt OffsetDelta = OffsetB - OffsetA;
-
- // Check if they are based on the same pointer. That makes the offsets
- // sufficient.
- if (PtrA == PtrB)
- return OffsetDelta == PtrDelta;
-
- // Compute the necessary base pointer delta to have the necessary final delta
- // equal to the pointer delta requested.
- APInt BaseDelta = PtrDelta - OffsetDelta;
-
- // Compute the distance with SCEV between the base pointers.
- const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
- const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
- const SCEV *C = SE.getConstant(BaseDelta);
- const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
- if (X == PtrSCEVB)
- return true;
-
- // The above check will not catch the cases where one of the pointers is
- // factorized but the other one is not, such as (C + (S * (A + B))) vs
- // (AS + BS). Get the minus SCEV. That will allow re-combining the expressions
- // and getting the simplified difference.
- const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
- if (C == Dist)
- return true;
-
- // Sometimes even this doesn't work, because SCEV can't always see through
- // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
- // things the hard way.
- return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
-}
-
-bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
- APInt PtrDelta,
- unsigned Depth) const {
- auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
- auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
- if (!GEPA || !GEPB)
- return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
-
- // Look through GEPs after checking they're the same except for the last
- // index.
- if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
- GEPA->getPointerOperand() != GEPB->getPointerOperand())
- return false;
- gep_type_iterator GTIA = gep_type_begin(GEPA);
- gep_type_iterator GTIB = gep_type_begin(GEPB);
- for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
- if (GTIA.getOperand() != GTIB.getOperand())
- return false;
- ++GTIA;
- ++GTIB;
- }
-
- Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
- Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
- if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
- OpA->getType() != OpB->getType())
- return false;
-
- if (PtrDelta.isNegative()) {
- if (PtrDelta.isMinSignedValue())
- return false;
- PtrDelta.negate();
- std::swap(OpA, OpB);
- }
- uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
- if (PtrDelta.urem(Stride) != 0)
- return false;
- unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
- APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
-
- // Only look through a ZExt/SExt.
- if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
- return false;
-
- bool Signed = isa<SExtInst>(OpA);
-
- // At this point A could be a function parameter, i.e. not an instruction
- Value *ValA = OpA->getOperand(0);
- OpB = dyn_cast<Instruction>(OpB->getOperand(0));
- if (!OpB || ValA->getType() != OpB->getType())
- return false;
-
- // Now we need to prove that adding IdxDiff to ValA won't overflow.
- bool Safe = false;
- auto CheckFlags = [](Instruction *I, bool Signed) {
- BinaryOperator *BinOpI = cast<BinaryOperator>(I);
- return (Signed && BinOpI->hasNoSignedWrap()) ||
- (!Signed && BinOpI->hasNoUnsignedWrap());
- };
-
- // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
- // ValA, we're okay.
- if (OpB->getOpcode() == Instruction::Add &&
- isa<ConstantInt>(OpB->getOperand(1)) &&
- IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
- CheckFlags(OpB, Signed))
- Safe = true;
-
- // Second attempt: If both OpA and OpB is an add with NSW/NUW and with
- // the same LHS operand, we can guarantee that the transformation is safe
- // if we can prove that OpA won't overflow when IdxDiff added to the RHS
- // of OpA.
- // For example:
- // %tmp7 = add nsw i32 %tmp2, %v0
- // %tmp8 = sext i32 %tmp7 to i64
- // ...
- // %tmp11 = add nsw i32 %v0, 1
- // %tmp12 = add nsw i32 %tmp2, %tmp11
- // %tmp13 = sext i32 %tmp12 to i64
- //
- // Both %tmp7 and %tmp2 has the nsw flag and the first operand
- // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
- // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 has the
- // nsw flag.
- OpA = dyn_cast<Instruction>(ValA);
- if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
- OpB->getOpcode() == Instruction::Add &&
- OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
- CheckFlags(OpB, Signed)) {
- Value *RHSA = OpA->getOperand(1);
- Value *RHSB = OpB->getOperand(1);
- Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
- Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
- // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
- if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
- CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
- int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
- if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
- Safe = true;
- }
- // Match `x +nsw/nuw (y +nsw/nuw -Idx)` and `x +nsw/nuw (y +nsw/nuw x)`.
- if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
- CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
- int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
- if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
- Safe = true;
- }
- // Match `x +nsw/nuw (y +nsw/nuw c)` and
- // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
- if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
- OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
- CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
- isa<ConstantInt>(OpRHSB->getOperand(1))) {
- int64_t CstValA =
- cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
- int64_t CstValB =
- cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
- if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
- IdxDiff.getSExtValue() == (CstValB - CstValA))
- Safe = true;
- }
- }
-
- unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
-
- // Third attempt:
- // If all set bits of IdxDiff or any higher order bit other than the sign bit
- // are known to be zero in ValA, we can add Diff to it while guaranteeing no
- // overflow of any sort.
- if (!Safe) {
- OpA = dyn_cast<Instruction>(ValA);
- if (!OpA)
- return false;
- KnownBits Known(BitWidth);
- computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
- APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
- if (Signed)
- BitsAllowedToBeSet.clearBit(BitWidth - 1);
- if (BitsAllowedToBeSet.ult(IdxDiff))
- return false;
- }
-
- const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
- const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
- const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
- const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
- return X == OffsetSCEVB;
-}
-
-bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
- const APInt &PtrDelta,
- unsigned Depth) const {
- if (Depth++ == MaxDepth)
- return false;
-
- if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
- if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
- return SelectA->getCondition() == SelectB->getCondition() &&
- areConsecutivePointers(SelectA->getTrueValue(),
- SelectB->getTrueValue(), PtrDelta, Depth) &&
- areConsecutivePointers(SelectA->getFalseValue(),
- SelectB->getFalseValue(), PtrDelta, Depth);
- }
- }
- return false;
-}
-
-void Vectorizer::reorder(Instruction *I) {
- SmallPtrSet<Instruction *, 16> InstructionsToMove;
- SmallVector<Instruction *, 16> Worklist;
-
- Worklist.push_back(I);
- while (!Worklist.empty()) {
- Instruction *IW = Worklist.pop_back_val();
- int NumOperands = IW->getNumOperands();
- for (int i = 0; i < NumOperands; i++) {
- Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
- if (!IM || IM->getOpcode() == Instruction::PHI)
- continue;
-
- // If IM is in another BB, no need to move it, because this pass only
- // vectorizes instructions within one BB.
- if (IM->getParent() != I->getParent())
- continue;
-
- if (!IM->comesBefore(I)) {
- InstructionsToMove.insert(IM);
- Worklist.push_back(IM);
- }
- }
- }
-
- // All instructions to move should follow I. Start from I, not from begin().
- for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
- ++BBI) {
- if (!InstructionsToMove.count(&*BBI))
- continue;
- Instruction *IM = &*BBI;
- --BBI;
- IM->removeFromParent();
- IM->insertBefore(I);
- }
-}
-
-std::pair<BasicBlock::iterator, BasicBlock::iterator>
-Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
- Instruction *C0 = Chain[0];
- BasicBlock::iterator FirstInstr = C0->getIterator();
- BasicBlock::iterator LastInstr = C0->getIterator();
-
- BasicBlock *BB = C0->getParent();
- unsigned NumFound = 0;
- for (Instruction &I : *BB) {
- if (!is_contained(Chain, &I))
- continue;
-
- ++NumFound;
- if (NumFound == 1) {
- FirstInstr = I.getIterator();
- }
- if (NumFound == Chain.size()) {
- LastInstr = I.getIterator();
- break;
- }
- }
-
- // Range is [first, last).
- return std::make_pair(FirstInstr, ++LastInstr);
-}
-
-void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
- SmallVector<Instruction *, 16> Instrs;
- for (Instruction *I : Chain) {
- Value *PtrOperand = getLoadStorePointerOperand(I);
- assert(PtrOperand && "Instruction must have a pointer operand.");
- Instrs.push_back(I);
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
- Instrs.push_back(GEP);
- }
-
- // Erase instructions.
- for (Instruction *I : Instrs)
- if (I->use_empty())
- I->eraseFromParent();
-}
-
-std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
-Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
- unsigned ElementSizeBits) {
- unsigned ElementSizeBytes = ElementSizeBits / 8;
- unsigned SizeBytes = ElementSizeBytes * Chain.size();
- unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
- if (NumLeft == Chain.size()) {
- if ((NumLeft & 1) == 0)
- NumLeft /= 2; // Split even in half
- else
- --NumLeft; // Split off last element
- } else if (NumLeft == 0)
- NumLeft = 1;
- return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
-}
-
-ArrayRef<Instruction *>
-Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
- // These are in BB order, unlike Chain, which is in address order.
- SmallVector<Instruction *, 16> MemoryInstrs;
- SmallVector<Instruction *, 16> ChainInstrs;
-
- bool IsLoadChain = isa<LoadInst>(Chain[0]);
- LLVM_DEBUG({
- for (Instruction *I : Chain) {
- if (IsLoadChain)
- assert(isa<LoadInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- else
- assert(isa<StoreInst>(I) &&
- "All elements of Chain must be loads, or all must be stores.");
- }
- });
-
- for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- if (!is_contained(Chain, &I))
- MemoryInstrs.push_back(&I);
- else
- ChainInstrs.push_back(&I);
- } else if (isa<IntrinsicInst>(&I) &&
- cast<IntrinsicInst>(&I)->getIntrinsicID() ==
- Intrinsic::sideeffect) {
- // Ignore llvm.sideeffect calls.
+//===- LoadStoreVectorizer.cpp - GPU Load & Store Vectorizer --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass merges loads/stores to/from sequential memory addresses into vector
+// loads/stores. Although there's nothing GPU-specific in here, this pass is
+// motivated by the microarchitectural quirks of nVidia and AMD GPUs.
+//
+// (For simplicity below we talk about loads only, but everything also applies
+// to stores.)
+//
+// This pass is intended to be run late in the pipeline, after other
+// vectorization opportunities have been exploited. So the assumption here is
+// that immediately following our new vector load we'll need to extract out the
+// individual elements of the load, so we can operate on them individually.
+//
+// On CPUs this transformation is usually not beneficial, because extracting the
+// elements of a vector register is expensive on most architectures. It's
+// usually better just to load each element individually into its own scalar
+// register.
+//
+// However, nVidia and AMD GPUs don't have proper vector registers. Instead, a
+// "vector load" loads directly into a series of scalar registers. In effect,
+// extracting the elements of the vector is free. It's therefore always
+// beneficial to vectorize a sequence of loads on these architectures.
+//
+// Vectorizing (perhaps a better name might be "coalescing") loads can have
+// large performance impacts on GPU kernels, and opportunities for vectorizing
+// are common in GPU code. This pass tries very hard to find such
+// opportunities; its runtime is quadratic in the number of loads in a BB.
+//
+// Some CPU architectures, such as ARM, have instructions that load into
+// multiple scalar registers, similar to a GPU vectorized load. In theory ARM
+// could use this pass (with some modifications), but currently it implements
+// its own pass to do something similar to what we do here.
+
+#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "load-store-vectorizer"
+
+STATISTIC(NumVectorInstructions, "Number of vector accesses generated");
+STATISTIC(NumScalarsVectorized, "Number of scalar accesses vectorized");
+
+// FIXME: Assuming stack alignment of 4 is always good enough
+static const unsigned StackAdjustedAlignment = 4;
+
+namespace {
+
+/// ChainID is an arbitrary token that is allowed to be different only for the
+/// accesses that are guaranteed to be considered non-consecutive by
+/// Vectorizer::isConsecutiveAccess. It's used for grouping instructions
+/// together and reducing the number of instructions the main search operates on
+/// at a time, i.e. this is to reduce compile time and nothing else as the main
+/// search has O(n^2) time complexity. The underlying type of ChainID should not
+/// be relied upon.
+using ChainID = const Value *;
+using InstrList = SmallVector<Instruction *, 8>;
+using InstrListMap = MapVector<ChainID, InstrList>;
+
+class Vectorizer {
+ Function &F;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ ScalarEvolution &SE;
+ TargetTransformInfo &TTI;
+ const DataLayout &DL;
+ IRBuilder<> Builder;
+
+public:
+ Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT,
+ ScalarEvolution &SE, TargetTransformInfo &TTI)
+ : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI),
+ DL(F.getParent()->getDataLayout()), Builder(SE.getContext()) {}
+
+ bool run();
+
+private:
+ unsigned getPointerAddressSpace(Value *I);
+
+ static const unsigned MaxDepth = 3;
+
+ bool isConsecutiveAccess(Value *A, Value *B);
+ bool areConsecutivePointers(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth = 0) const;
+ bool lookThroughComplexAddresses(Value *PtrA, Value *PtrB, APInt PtrDelta,
+ unsigned Depth) const;
+ bool lookThroughSelects(Value *PtrA, Value *PtrB, const APInt &PtrDelta,
+ unsigned Depth) const;
+
+ /// After vectorization, reorder the instructions that I depends on
+ /// (the instructions defining its operands), to ensure they dominate I.
+ void reorder(Instruction *I);
+
+ /// Returns the first and the last instructions in Chain.
+ std::pair<BasicBlock::iterator, BasicBlock::iterator>
+ getBoundaryInstrs(ArrayRef<Instruction *> Chain);
+
+ /// Erases the original instructions after vectorizing.
+ void eraseInstructions(ArrayRef<Instruction *> Chain);
+
+ /// "Legalize" the vector type that would be produced by combining \p
+ /// ElementSizeBits elements in \p Chain. Break into two pieces such that the
+ /// total size of each piece is 1, 2 or a multiple of 4 bytes. \p Chain is
+ /// expected to have more than 4 elements.
+ std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+ splitOddVectorElts(ArrayRef<Instruction *> Chain, unsigned ElementSizeBits);
+
+ /// Finds the largest prefix of Chain that's vectorizable, checking for
+ /// intervening instructions which may affect the memory accessed by the
+ /// instructions within Chain.
+ ///
+ /// The elements of \p Chain must be all loads or all stores and must be in
+ /// address order.
+ ArrayRef<Instruction *> getVectorizablePrefix(ArrayRef<Instruction *> Chain);
+
+ /// Collects load and store instructions to vectorize.
+ std::pair<InstrListMap, InstrListMap> collectInstructions(BasicBlock *BB);
+
+ /// Processes the collected instructions, the \p Map. The values of \p Map
+ /// should be all loads or all stores.
+ bool vectorizeChains(InstrListMap &Map);
+
+ /// Finds the load/stores to consecutive memory addresses and vectorizes them.
+ bool vectorizeInstructions(ArrayRef<Instruction *> Instrs);
+
+ /// Vectorizes the load instructions in Chain.
+ bool
+ vectorizeLoadChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Vectorizes the store instructions in Chain.
+ bool
+ vectorizeStoreChain(ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed);
+
+ /// Check if this load/store access is misaligned.
+ bool accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment);
+};
+
+class LoadStoreVectorizerLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ LoadStoreVectorizerLegacyPass() : FunctionPass(ID) {
+ initializeLoadStoreVectorizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "GPU Load and Store Vectorizer";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+char LoadStoreVectorizerLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and Store instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(LoadStoreVectorizerLegacyPass, DEBUG_TYPE,
+ "Vectorize load and store instructions", false, false)
+
+Pass *llvm::createLoadStoreVectorizerPass() {
+ return new LoadStoreVectorizerLegacyPass();
+}
+
+bool LoadStoreVectorizerLegacyPass::runOnFunction(Function &F) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (skipFunction(F) || F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ return V.run();
+}
+
+PreservedAnalyses LoadStoreVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return PreservedAnalyses::all();
+
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ ScalarEvolution &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ TargetTransformInfo &TTI = AM.getResult<TargetIRAnalysis>(F);
+
+ Vectorizer V(F, AA, DT, SE, TTI);
+ bool Changed = V.run();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return Changed ? PA : PreservedAnalyses::all();
+}
+
+// The real propagateMetadata expects a SmallVector<Value*>, but we deal in
+// vectors of Instructions.
+static void propagateMetadata(Instruction *I, ArrayRef<Instruction *> IL) {
+ SmallVector<Value *, 8> VL(IL.begin(), IL.end());
+ propagateMetadata(I, VL);
+}
+
+// Vectorizer Implementation
+bool Vectorizer::run() {
+ bool Changed = false;
+
+ // Scan the blocks in the function in post order.
+ for (BasicBlock *BB : post_order(&F)) {
+ InstrListMap LoadRefs, StoreRefs;
+ std::tie(LoadRefs, StoreRefs) = collectInstructions(BB);
+ Changed |= vectorizeChains(LoadRefs);
+ Changed |= vectorizeChains(StoreRefs);
+ }
+
+ return Changed;
+}
+
+unsigned Vectorizer::getPointerAddressSpace(Value *I) {
+ if (LoadInst *L = dyn_cast<LoadInst>(I))
+ return L->getPointerAddressSpace();
+ if (StoreInst *S = dyn_cast<StoreInst>(I))
+ return S->getPointerAddressSpace();
+ return -1;
+}
+
+// FIXME: Merge with llvm::isConsecutiveAccess
+bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) {
+ Value *PtrA = getLoadStorePointerOperand(A);
+ Value *PtrB = getLoadStorePointerOperand(B);
+ unsigned ASA = getPointerAddressSpace(A);
+ unsigned ASB = getPointerAddressSpace(B);
+
+ // Check that the address spaces match and that the pointers are valid.
+ if (!PtrA || !PtrB || (ASA != ASB))
+ return false;
+
+ // Make sure that A and B are different pointers of the same size type.
+ Type *PtrATy = PtrA->getType()->getPointerElementType();
+ Type *PtrBTy = PtrB->getType()->getPointerElementType();
+ if (PtrA == PtrB ||
+ PtrATy->isVectorTy() != PtrBTy->isVectorTy() ||
+ DL.getTypeStoreSize(PtrATy) != DL.getTypeStoreSize(PtrBTy) ||
+ DL.getTypeStoreSize(PtrATy->getScalarType()) !=
+ DL.getTypeStoreSize(PtrBTy->getScalarType()))
+ return false;
+
+ unsigned PtrBitWidth = DL.getPointerSizeInBits(ASA);
+ APInt Size(PtrBitWidth, DL.getTypeStoreSize(PtrATy));
+
+ return areConsecutivePointers(PtrA, PtrB, Size);
+}
+
+bool Vectorizer::areConsecutivePointers(Value *PtrA, Value *PtrB,
+ APInt PtrDelta, unsigned Depth) const {
+ unsigned PtrBitWidth = DL.getPointerTypeSizeInBits(PtrA->getType());
+ APInt OffsetA(PtrBitWidth, 0);
+ APInt OffsetB(PtrBitWidth, 0);
+ PtrA = PtrA->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetA);
+ PtrB = PtrB->stripAndAccumulateInBoundsConstantOffsets(DL, OffsetB);
+
+ unsigned NewPtrBitWidth = DL.getTypeStoreSizeInBits(PtrA->getType());
+
+ if (NewPtrBitWidth != DL.getTypeStoreSizeInBits(PtrB->getType()))
+ return false;
+
+  // If the pointer had to be shrunk, stripAndAccumulateInBoundsConstantOffsets
+  // should have handled any possible overflow, and the accumulated offsets
+  // should fit into the smallest data type used in the cast/GEP chain.
+ assert(OffsetA.getMinSignedBits() <= NewPtrBitWidth &&
+ OffsetB.getMinSignedBits() <= NewPtrBitWidth);
+
+ OffsetA = OffsetA.sextOrTrunc(NewPtrBitWidth);
+ OffsetB = OffsetB.sextOrTrunc(NewPtrBitWidth);
+ PtrDelta = PtrDelta.sextOrTrunc(NewPtrBitWidth);
+
+ APInt OffsetDelta = OffsetB - OffsetA;
+
+ // Check if they are based on the same pointer. That makes the offsets
+ // sufficient.
+ if (PtrA == PtrB)
+ return OffsetDelta == PtrDelta;
+
+  // Compute the base pointer delta needed for the final delta to equal the
+  // requested pointer delta.
+ APInt BaseDelta = PtrDelta - OffsetDelta;
+
+ // Compute the distance with SCEV between the base pointers.
+ const SCEV *PtrSCEVA = SE.getSCEV(PtrA);
+ const SCEV *PtrSCEVB = SE.getSCEV(PtrB);
+ const SCEV *C = SE.getConstant(BaseDelta);
+ const SCEV *X = SE.getAddExpr(PtrSCEVA, C);
+ if (X == PtrSCEVB)
+ return true;
+
+  // The above check will not catch the cases where one of the pointers is
+  // factorized but the other one is not, such as (C + (S * (A + B))) vs
+  // (AS + BS). Take the minus SCEV instead; that allows the expressions to be
+  // re-combined and the difference to be simplified.
+ const SCEV *Dist = SE.getMinusSCEV(PtrSCEVB, PtrSCEVA);
+ if (C == Dist)
+ return true;
+
+ // Sometimes even this doesn't work, because SCEV can't always see through
+ // patterns that look like (gep (ext (add (shl X, C1), C2))). Try checking
+ // things the hard way.
+ return lookThroughComplexAddresses(PtrA, PtrB, BaseDelta, Depth);
+}
+
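+// Given two GEPs that share a base and all indices except the last one, try to
+// prove that their trailing indices differ by exactly PtrDelta / Stride
+// without overflow, using wrap flags, known bits, and SCEV.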
+bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
+ APInt PtrDelta,
+ unsigned Depth) const {
+ auto *GEPA = dyn_cast<GetElementPtrInst>(PtrA);
+ auto *GEPB = dyn_cast<GetElementPtrInst>(PtrB);
+ if (!GEPA || !GEPB)
+ return lookThroughSelects(PtrA, PtrB, PtrDelta, Depth);
+
+ // Look through GEPs after checking they're the same except for the last
+ // index.
+ if (GEPA->getNumOperands() != GEPB->getNumOperands() ||
+ GEPA->getPointerOperand() != GEPB->getPointerOperand())
+ return false;
+ gep_type_iterator GTIA = gep_type_begin(GEPA);
+ gep_type_iterator GTIB = gep_type_begin(GEPB);
+ for (unsigned I = 0, E = GEPA->getNumIndices() - 1; I < E; ++I) {
+ if (GTIA.getOperand() != GTIB.getOperand())
+ return false;
+ ++GTIA;
+ ++GTIB;
+ }
+
+ Instruction *OpA = dyn_cast<Instruction>(GTIA.getOperand());
+ Instruction *OpB = dyn_cast<Instruction>(GTIB.getOperand());
+ if (!OpA || !OpB || OpA->getOpcode() != OpB->getOpcode() ||
+ OpA->getType() != OpB->getType())
+ return false;
+
+ if (PtrDelta.isNegative()) {
+ if (PtrDelta.isMinSignedValue())
+ return false;
+ PtrDelta.negate();
+ std::swap(OpA, OpB);
+ }
+ uint64_t Stride = DL.getTypeAllocSize(GTIA.getIndexedType());
+ if (PtrDelta.urem(Stride) != 0)
+ return false;
+ unsigned IdxBitWidth = OpA->getType()->getScalarSizeInBits();
+ APInt IdxDiff = PtrDelta.udiv(Stride).zextOrSelf(IdxBitWidth);
+
+ // Only look through a ZExt/SExt.
+ if (!isa<SExtInst>(OpA) && !isa<ZExtInst>(OpA))
+ return false;
+
+ bool Signed = isa<SExtInst>(OpA);
+
+  // At this point A could be a function parameter, i.e. not an instruction.
+ Value *ValA = OpA->getOperand(0);
+ OpB = dyn_cast<Instruction>(OpB->getOperand(0));
+ if (!OpB || ValA->getType() != OpB->getType())
+ return false;
+
+ // Now we need to prove that adding IdxDiff to ValA won't overflow.
+ bool Safe = false;
+ auto CheckFlags = [](Instruction *I, bool Signed) {
+ BinaryOperator *BinOpI = cast<BinaryOperator>(I);
+ return (Signed && BinOpI->hasNoSignedWrap()) ||
+ (!Signed && BinOpI->hasNoUnsignedWrap());
+ };
+
+ // First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
+ // ValA, we're okay.
+ if (OpB->getOpcode() == Instruction::Add &&
+ isa<ConstantInt>(OpB->getOperand(1)) &&
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
+ CheckFlags(OpB, Signed))
+ Safe = true;
+
+  // Second attempt: if both OpA and OpB are adds with NSW/NUW and with the
+  // same LHS operand, we can guarantee that the transformation is safe if we
+  // can prove that OpA won't overflow when IdxDiff is added to the RHS of
+  // OpA.
+ // For example:
+ // %tmp7 = add nsw i32 %tmp2, %v0
+ // %tmp8 = sext i32 %tmp7 to i64
+ // ...
+ // %tmp11 = add nsw i32 %v0, 1
+ // %tmp12 = add nsw i32 %tmp2, %tmp11
+ // %tmp13 = sext i32 %tmp12 to i64
+ //
+  // Both %tmp7 and %tmp12 have the nsw flag, and their first operand
+  // is %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
+  // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
+  // nsw flag.
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
+ OpB->getOpcode() == Instruction::Add &&
+ OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
+ CheckFlags(OpB, Signed)) {
+ Value *RHSA = OpA->getOperand(1);
+ Value *RHSB = OpB->getOperand(1);
+ Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
+ Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
+ // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
+ if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
+ Safe = true;
+ }
+    // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
+ if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw c)` and
+ // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
+ if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
+ OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
+ isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstValA =
+ cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ int64_t CstValB =
+ cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
+ IdxDiff.getSExtValue() == (CstValB - CstValA))
+ Safe = true;
+ }
+ }
+
+ unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
+
+  // Third attempt:
+  // If every bit set in IdxDiff, and every higher-order bit other than the
+  // sign bit, is known to be zero in ValA, we can add IdxDiff to ValA while
+  // guaranteeing no overflow of any sort.
+ if (!Safe) {
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!OpA)
+ return false;
+ KnownBits Known(BitWidth);
+ computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT);
+ APInt BitsAllowedToBeSet = Known.Zero.zext(IdxDiff.getBitWidth());
+ if (Signed)
+ BitsAllowedToBeSet.clearBit(BitWidth - 1);
+ if (BitsAllowedToBeSet.ult(IdxDiff))
+ return false;
+ }
+
+ const SCEV *OffsetSCEVA = SE.getSCEV(ValA);
+ const SCEV *OffsetSCEVB = SE.getSCEV(OpB);
+ const SCEV *C = SE.getConstant(IdxDiff.trunc(BitWidth));
+ const SCEV *X = SE.getAddExpr(OffsetSCEVA, C);
+ return X == OffsetSCEVB;
+}
+
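+// If both pointers are selects on the same condition, they are consecutive
+// when both their true values and their false values are consecutive with the
+// same delta. The recursion is limited to MaxDepth.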
+bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
+ const APInt &PtrDelta,
+ unsigned Depth) const {
+ if (Depth++ == MaxDepth)
+ return false;
+
+ if (auto *SelectA = dyn_cast<SelectInst>(PtrA)) {
+ if (auto *SelectB = dyn_cast<SelectInst>(PtrB)) {
+ return SelectA->getCondition() == SelectB->getCondition() &&
+ areConsecutivePointers(SelectA->getTrueValue(),
+ SelectB->getTrueValue(), PtrDelta, Depth) &&
+ areConsecutivePointers(SelectA->getFalseValue(),
+ SelectB->getFalseValue(), PtrDelta, Depth);
+ }
+ }
+ return false;
+}
+
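+// Move the in-block instructions that I transitively depends on, but that
+// currently come after I, to just before I (PHI nodes are skipped).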
+void Vectorizer::reorder(Instruction *I) {
+ SmallPtrSet<Instruction *, 16> InstructionsToMove;
+ SmallVector<Instruction *, 16> Worklist;
+
+ Worklist.push_back(I);
+ while (!Worklist.empty()) {
+ Instruction *IW = Worklist.pop_back_val();
+ int NumOperands = IW->getNumOperands();
+ for (int i = 0; i < NumOperands; i++) {
+ Instruction *IM = dyn_cast<Instruction>(IW->getOperand(i));
+ if (!IM || IM->getOpcode() == Instruction::PHI)
+ continue;
+
+ // If IM is in another BB, no need to move it, because this pass only
+ // vectorizes instructions within one BB.
+ if (IM->getParent() != I->getParent())
+ continue;
+
+ if (!IM->comesBefore(I)) {
+ InstructionsToMove.insert(IM);
+ Worklist.push_back(IM);
+ }
+ }
+ }
+
+ // All instructions to move should follow I. Start from I, not from begin().
+ for (auto BBI = I->getIterator(), E = I->getParent()->end(); BBI != E;
+ ++BBI) {
+ if (!InstructionsToMove.count(&*BBI))
+ continue;
+ Instruction *IM = &*BBI;
+ --BBI;
+ IM->removeFromParent();
+ IM->insertBefore(I);
+ }
+}
+
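+// Locate the first and last members of Chain in program order and return
+// iterators spanning them as a half-open range.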
+std::pair<BasicBlock::iterator, BasicBlock::iterator>
+Vectorizer::getBoundaryInstrs(ArrayRef<Instruction *> Chain) {
+ Instruction *C0 = Chain[0];
+ BasicBlock::iterator FirstInstr = C0->getIterator();
+ BasicBlock::iterator LastInstr = C0->getIterator();
+
+ BasicBlock *BB = C0->getParent();
+ unsigned NumFound = 0;
+ for (Instruction &I : *BB) {
+ if (!is_contained(Chain, &I))
+ continue;
+
+ ++NumFound;
+ if (NumFound == 1) {
+ FirstInstr = I.getIterator();
+ }
+ if (NumFound == Chain.size()) {
+ LastInstr = I.getIterator();
+ break;
+ }
+ }
+
+ // Range is [first, last).
+ return std::make_pair(FirstInstr, ++LastInstr);
+}
+
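+// Erase each instruction in Chain, and any GEP feeding its pointer operand,
+// once it is left without uses.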
+void Vectorizer::eraseInstructions(ArrayRef<Instruction *> Chain) {
+ SmallVector<Instruction *, 16> Instrs;
+ for (Instruction *I : Chain) {
+ Value *PtrOperand = getLoadStorePointerOperand(I);
+ assert(PtrOperand && "Instruction must have a pointer operand.");
+ Instrs.push_back(I);
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(PtrOperand))
+ Instrs.push_back(GEP);
+ }
+
+ // Erase instructions.
+ for (Instruction *I : Instrs)
+ if (I->use_empty())
+ I->eraseFromParent();
+}
+
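+// Split Chain into two pieces, preferring a first piece that covers a multiple
+// of 4 bytes; if the whole chain already qualifies, split it in half (even
+// length) or peel off the last element (odd length).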
+std::pair<ArrayRef<Instruction *>, ArrayRef<Instruction *>>
+Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
+ unsigned ElementSizeBits) {
+ unsigned ElementSizeBytes = ElementSizeBits / 8;
+ unsigned SizeBytes = ElementSizeBytes * Chain.size();
+ unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
+ if (NumLeft == Chain.size()) {
+ if ((NumLeft & 1) == 0)
+ NumLeft /= 2; // Split even in half
+ else
+ --NumLeft; // Split off last element
+ } else if (NumLeft == 0)
+ NumLeft = 1;
+ return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
+}
+
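+// Return the longest prefix of Chain (which is in address order) that can be
+// vectorized without moving any member across an aliasing memory operation or
+// a may-throw instruction in the block.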
+ArrayRef<Instruction *>
+Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
+ // These are in BB order, unlike Chain, which is in address order.
+ SmallVector<Instruction *, 16> MemoryInstrs;
+ SmallVector<Instruction *, 16> ChainInstrs;
+
+ bool IsLoadChain = isa<LoadInst>(Chain[0]);
+ LLVM_DEBUG({
+ for (Instruction *I : Chain) {
+ if (IsLoadChain)
+ assert(isa<LoadInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ else
+ assert(isa<StoreInst>(I) &&
+ "All elements of Chain must be loads, or all must be stores.");
+ }
+ });
+
+ for (Instruction &I : make_range(getBoundaryInstrs(Chain))) {
+ if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
+ if (!is_contained(Chain, &I))
+ MemoryInstrs.push_back(&I);
+ else
+ ChainInstrs.push_back(&I);
+ } else if (isa<IntrinsicInst>(&I) &&
+ cast<IntrinsicInst>(&I)->getIntrinsicID() ==
+ Intrinsic::sideeffect) {
+ // Ignore llvm.sideeffect calls.
} else if (isa<IntrinsicInst>(&I) &&
cast<IntrinsicInst>(&I)->getIntrinsicID() ==
Intrinsic::pseudoprobe) {
// Ignore llvm.pseudoprobe calls.
- } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
- << '\n');
- break;
- } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
- LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
- << '\n');
- break;
- }
- }
-
- // Loop until we find an instruction in ChainInstrs that we can't vectorize.
- unsigned ChainInstrIdx = 0;
- Instruction *BarrierMemoryInstr = nullptr;
-
- for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
- Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
-
- // If a barrier memory instruction was found, chain instructions that follow
- // will not be added to the valid prefix.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
- break;
-
- // Check (in BB order) if any instruction prevents ChainInstr from being
- // vectorized. Find and store the first such "conflicting" instruction.
- for (Instruction *MemInstr : MemoryInstrs) {
- // If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
- break;
-
- auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
- auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
- if (MemLoad && ChainLoad)
- continue;
-
- // We can ignore the alias if the we have a load store pair and the load
- // is known to be invariant. The load cannot be clobbered by the store.
- auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
- return LI->hasMetadata(LLVMContext::MD_invariant_load);
- };
-
- // We can ignore the alias as long as the load comes before the store,
- // because that means we won't be moving the load past the store to
- // vectorize it (the vectorized load is inserted at the location of the
- // first load in the chain).
- if (isa<StoreInst>(MemInstr) && ChainLoad &&
- (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
- continue;
-
- // Same case, but in reverse.
- if (MemLoad && isa<StoreInst>(ChainInstr) &&
- (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
- continue;
-
- if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
- MemoryLocation::get(ChainInstr))) {
- LLVM_DEBUG({
- dbgs() << "LSV: Found alias:\n"
- " Aliasing instruction and pointer:\n"
- << " " << *MemInstr << '\n'
- << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
- << " Aliased instruction and pointer:\n"
- << " " << *ChainInstr << '\n'
- << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
- });
- // Save this aliasing memory instruction as a barrier, but allow other
- // instructions that precede the barrier to be vectorized with this one.
- BarrierMemoryInstr = MemInstr;
- break;
- }
- }
- // Continue the search only for store chains, since vectorizing stores that
- // precede an aliasing load is valid. Conversely, vectorizing loads is valid
- // up to an aliasing store, but should not pull loads from further down in
- // the basic block.
- if (IsLoadChain && BarrierMemoryInstr) {
- // The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(BarrierMemoryInstr->comesBefore(ChainInstr));
- break;
- }
- }
-
- // Find the largest prefix of Chain whose elements are all in
- // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
- // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
- // order.)
- SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
- ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
- unsigned ChainIdx = 0;
- for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
- if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
- break;
- }
- return Chain.slice(0, ChainIdx);
-}
-
+ } else if (IsLoadChain && (I.mayWriteToMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-write/throw operation: " << I
+ << '\n');
+ break;
+ } else if (!IsLoadChain && (I.mayReadOrWriteMemory() || I.mayThrow())) {
+ LLVM_DEBUG(dbgs() << "LSV: Found may-read/write/throw operation: " << I
+ << '\n');
+ break;
+ }
+ }
+
+ // Loop until we find an instruction in ChainInstrs that we can't vectorize.
+ unsigned ChainInstrIdx = 0;
+ Instruction *BarrierMemoryInstr = nullptr;
+
+ for (unsigned E = ChainInstrs.size(); ChainInstrIdx < E; ++ChainInstrIdx) {
+ Instruction *ChainInstr = ChainInstrs[ChainInstrIdx];
+
+ // If a barrier memory instruction was found, chain instructions that follow
+ // will not be added to the valid prefix.
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
+ break;
+
+ // Check (in BB order) if any instruction prevents ChainInstr from being
+ // vectorized. Find and store the first such "conflicting" instruction.
+ for (Instruction *MemInstr : MemoryInstrs) {
+ // If a barrier memory instruction was found, do not check past it.
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
+ break;
+
+ auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
+ auto *ChainLoad = dyn_cast<LoadInst>(ChainInstr);
+ if (MemLoad && ChainLoad)
+ continue;
+
+      // We can ignore the alias if we have a load/store pair and the load
+      // is known to be invariant: the load cannot be clobbered by the store.
+ auto IsInvariantLoad = [](const LoadInst *LI) -> bool {
+ return LI->hasMetadata(LLVMContext::MD_invariant_load);
+ };
+
+ // We can ignore the alias as long as the load comes before the store,
+ // because that means we won't be moving the load past the store to
+ // vectorize it (the vectorized load is inserted at the location of the
+ // first load in the chain).
+ if (isa<StoreInst>(MemInstr) && ChainLoad &&
+ (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
+ continue;
+
+ // Same case, but in reverse.
+ if (MemLoad && isa<StoreInst>(ChainInstr) &&
+ (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
+ continue;
+
+ if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
+ MemoryLocation::get(ChainInstr))) {
+ LLVM_DEBUG({
+ dbgs() << "LSV: Found alias:\n"
+ " Aliasing instruction and pointer:\n"
+ << " " << *MemInstr << '\n'
+ << " " << *getLoadStorePointerOperand(MemInstr) << '\n'
+ << " Aliased instruction and pointer:\n"
+ << " " << *ChainInstr << '\n'
+ << " " << *getLoadStorePointerOperand(ChainInstr) << '\n';
+ });
+ // Save this aliasing memory instruction as a barrier, but allow other
+ // instructions that precede the barrier to be vectorized with this one.
+ BarrierMemoryInstr = MemInstr;
+ break;
+ }
+ }
+ // Continue the search only for store chains, since vectorizing stores that
+ // precede an aliasing load is valid. Conversely, vectorizing loads is valid
+ // up to an aliasing store, but should not pull loads from further down in
+ // the basic block.
+ if (IsLoadChain && BarrierMemoryInstr) {
+ // The BarrierMemoryInstr is a store that precedes ChainInstr.
+ assert(BarrierMemoryInstr->comesBefore(ChainInstr));
+ break;
+ }
+ }
+
+ // Find the largest prefix of Chain whose elements are all in
+ // ChainInstrs[0, ChainInstrIdx). This is the largest vectorizable prefix of
+ // Chain. (Recall that Chain is in address order, but ChainInstrs is in BB
+ // order.)
+ SmallPtrSet<Instruction *, 8> VectorizableChainInstrs(
+ ChainInstrs.begin(), ChainInstrs.begin() + ChainInstrIdx);
+ unsigned ChainIdx = 0;
+ for (unsigned ChainLen = Chain.size(); ChainIdx < ChainLen; ++ChainIdx) {
+ if (!VectorizableChainInstrs.count(Chain[ChainIdx]))
+ break;
+ }
+ return Chain.slice(0, ChainIdx);
+}
+
static ChainID getChainID(const Value *Ptr) {
const Value *ObjPtr = getUnderlyingObject(Ptr);
- if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
- // The select's themselves are distinct instructions even if they share the
- // same condition and evaluate to consecutive pointers for true and false
- // values of the condition. Therefore using the select's themselves for
- // grouping instructions would put consecutive accesses into different lists
- // and they won't be even checked for being consecutive, and won't be
- // vectorized.
- return Sel->getCondition();
- }
- return ObjPtr;
-}
-
-std::pair<InstrListMap, InstrListMap>
-Vectorizer::collectInstructions(BasicBlock *BB) {
- InstrListMap LoadRefs;
- InstrListMap StoreRefs;
-
- for (Instruction &I : *BB) {
- if (!I.mayReadOrWriteMemory())
- continue;
-
- if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- if (!LI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeLoad(LI))
- continue;
-
- Type *Ty = LI->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- Value *Ptr = LI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- // Make sure all the users of a vector are constant-index extracts.
- if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
- // Save the load locations.
+ if (const auto *Sel = dyn_cast<SelectInst>(ObjPtr)) {
+    // The selects themselves are distinct instructions even if they share the
+    // same condition and evaluate to consecutive pointers for the true and
+    // false values of the condition. Therefore, using the selects themselves
+    // for grouping instructions would put consecutive accesses into different
+    // lists; they would never even be checked for being consecutive, and so
+    // would not be vectorized.
+ return Sel->getCondition();
+ }
+ return ObjPtr;
+}
+
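+// Walk BB and bucket the simple, target-legal loads and stores worth
+// considering into per-chain lists, keyed by the underlying object (or by the
+// condition of a select of consecutive pointers).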
+std::pair<InstrListMap, InstrListMap>
+Vectorizer::collectInstructions(BasicBlock *BB) {
+ InstrListMap LoadRefs;
+ InstrListMap StoreRefs;
+
+ for (Instruction &I : *BB) {
+ if (!I.mayReadOrWriteMemory())
+ continue;
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ if (!LI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeLoad(LI))
+ continue;
+
+ Type *Ty = LI->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized load/store
+      // and do not support casting between that integer type and a vector of
+      // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ Value *Ptr = LI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getLoadVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ // Make sure all the users of a vector are constant-index extracts.
+ if (isa<VectorType>(Ty) && !llvm::all_of(LI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save the load locations.
const ChainID ID = getChainID(Ptr);
- LoadRefs[ID].push_back(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
-
- // Skip if it's not legal.
- if (!TTI.isLegalToVectorizeStore(SI))
- continue;
-
- Type *Ty = SI->getValueOperand()->getType();
- if (!VectorType::isValidElementType(Ty->getScalarType()))
- continue;
-
- // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
- // functions are currently using an integer type for the vectorized
- // load/store, and does not support casting between the integer type and a
- // vector of pointers (e.g. i64 to <2 x i16*>)
- if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
- continue;
-
- // Skip weird non-byte sizes. They probably aren't worth the effort of
- // handling correctly.
- unsigned TySize = DL.getTypeSizeInBits(Ty);
- if ((TySize % 8) != 0)
- continue;
-
- Value *Ptr = SI->getPointerOperand();
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
-
- unsigned VF = VecRegSize / TySize;
- VectorType *VecTy = dyn_cast<VectorType>(Ty);
-
- // No point in looking at these if they're too big to vectorize.
- if (TySize > VecRegSize / 2 ||
- (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
- continue;
-
- if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
- const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
- return EEI && isa<ConstantInt>(EEI->getOperand(1));
- }))
- continue;
-
- // Save store location.
+ LoadRefs[ID].push_back(LI);
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+
+ // Skip if it's not legal.
+ if (!TTI.isLegalToVectorizeStore(SI))
+ continue;
+
+ Type *Ty = SI->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(Ty->getScalarType()))
+ continue;
+
+      // Skip vectors of pointers. The vectorizeLoadChain/vectorizeStoreChain
+      // functions currently use an integer type for the vectorized load/store
+      // and do not support casting between that integer type and a vector of
+      // pointers (e.g. i64 to <2 x i16*>).
+ if (Ty->isVectorTy() && Ty->isPtrOrPtrVectorTy())
+ continue;
+
+ // Skip weird non-byte sizes. They probably aren't worth the effort of
+ // handling correctly.
+ unsigned TySize = DL.getTypeSizeInBits(Ty);
+ if ((TySize % 8) != 0)
+ continue;
+
+ Value *Ptr = SI->getPointerOperand();
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+
+ unsigned VF = VecRegSize / TySize;
+ VectorType *VecTy = dyn_cast<VectorType>(Ty);
+
+ // No point in looking at these if they're too big to vectorize.
+ if (TySize > VecRegSize / 2 ||
+ (VecTy && TTI.getStoreVectorFactor(VF, TySize, TySize / 8, VecTy) == 0))
+ continue;
+
+ if (isa<VectorType>(Ty) && !llvm::all_of(SI->users(), [](const User *U) {
+ const ExtractElementInst *EEI = dyn_cast<ExtractElementInst>(U);
+ return EEI && isa<ConstantInt>(EEI->getOperand(1));
+ }))
+ continue;
+
+ // Save store location.
const ChainID ID = getChainID(Ptr);
- StoreRefs[ID].push_back(SI);
- }
- }
-
- return {LoadRefs, StoreRefs};
-}
-
-bool Vectorizer::vectorizeChains(InstrListMap &Map) {
- bool Changed = false;
-
- for (const std::pair<ChainID, InstrList> &Chain : Map) {
- unsigned Size = Chain.second.size();
- if (Size < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
-
- // Process the stores in chunks of 64.
- for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
- unsigned Len = std::min<unsigned>(CE - CI, 64);
- ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
- Changed |= vectorizeInstructions(Chunk);
- }
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
- LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
- << " instructions.\n");
- SmallVector<int, 16> Heads, Tails;
- int ConsecutiveChain[64];
-
- // Do a quadratic search on all of the given loads/stores and find all of the
- // pairs of loads/stores that follow each other.
- for (int i = 0, e = Instrs.size(); i < e; ++i) {
- ConsecutiveChain[i] = -1;
- for (int j = e - 1; j >= 0; --j) {
- if (i == j)
- continue;
-
- if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
- if (ConsecutiveChain[i] != -1) {
- int CurDistance = std::abs(ConsecutiveChain[i] - i);
- int NewDistance = std::abs(ConsecutiveChain[i] - j);
- if (j < i || NewDistance > CurDistance)
- continue; // Should not insert.
- }
-
- Tails.push_back(j);
- Heads.push_back(i);
- ConsecutiveChain[i] = j;
- }
- }
- }
-
- bool Changed = false;
- SmallPtrSet<Instruction *, 16> InstructionsProcessed;
-
- for (int Head : Heads) {
- if (InstructionsProcessed.count(Instrs[Head]))
- continue;
- bool LongerChainExists = false;
- for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
- if (Head == Tails[TIt] &&
- !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
- LongerChainExists = true;
- break;
- }
- if (LongerChainExists)
- continue;
-
- // We found an instr that starts a chain. Now follow the chain and try to
- // vectorize it.
- SmallVector<Instruction *, 16> Operands;
- int I = Head;
- while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
- if (InstructionsProcessed.count(Instrs[I]))
- break;
-
- Operands.push_back(Instrs[I]);
- I = ConsecutiveChain[I];
- }
-
- bool Vectorized = false;
- if (isa<LoadInst>(*Operands.begin()))
- Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
- else
- Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
-
- Changed |= Vectorized;
- }
-
- return Changed;
-}
-
-bool Vectorizer::vectorizeStoreChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- StoreInst *S0 = cast<StoreInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole store.
- Type *StoreTy = nullptr;
- for (Instruction *I : Chain) {
- StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
- if (StoreTy->isIntOrIntVectorTy())
- break;
-
- if (StoreTy->isPtrOrPtrVectorTy()) {
- StoreTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(StoreTy));
- break;
- }
- }
- assert(StoreTy && "Failed to find store type");
-
- unsigned Sz = DL.getTypeSizeInBits(StoreTy);
- unsigned AS = S0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = S0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
-
+ StoreRefs[ID].push_back(SI);
+ }
+ }
+
+ return {LoadRefs, StoreRefs};
+}
+
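+// Try to vectorize every chain in Map, processing each chain in chunks of at
+// most 64 instructions.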
+bool Vectorizer::vectorizeChains(InstrListMap &Map) {
+ bool Changed = false;
+
+ for (const std::pair<ChainID, InstrList> &Chain : Map) {
+ unsigned Size = Chain.second.size();
+ if (Size < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "LSV: Analyzing a chain of length " << Size << ".\n");
+
+    // Process the instructions in chunks of 64.
+ for (unsigned CI = 0, CE = Size; CI < CE; CI += 64) {
+ unsigned Len = std::min<unsigned>(CE - CI, 64);
+ ArrayRef<Instruction *> Chunk(&Chain.second[CI], Len);
+ Changed |= vectorizeInstructions(Chunk);
+ }
+ }
+
+ return Changed;
+}
+
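+// Find consecutive-access chains among Instrs with a quadratic search, then
+// hand each maximal chain to vectorizeLoadChain or vectorizeStoreChain.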
+bool Vectorizer::vectorizeInstructions(ArrayRef<Instruction *> Instrs) {
+ LLVM_DEBUG(dbgs() << "LSV: Vectorizing " << Instrs.size()
+ << " instructions.\n");
+ SmallVector<int, 16> Heads, Tails;
+ int ConsecutiveChain[64];
+
+ // Do a quadratic search on all of the given loads/stores and find all of the
+ // pairs of loads/stores that follow each other.
+ for (int i = 0, e = Instrs.size(); i < e; ++i) {
+ ConsecutiveChain[i] = -1;
+ for (int j = e - 1; j >= 0; --j) {
+ if (i == j)
+ continue;
+
+ if (isConsecutiveAccess(Instrs[i], Instrs[j])) {
+ if (ConsecutiveChain[i] != -1) {
+ int CurDistance = std::abs(ConsecutiveChain[i] - i);
+ int NewDistance = std::abs(ConsecutiveChain[i] - j);
+ if (j < i || NewDistance > CurDistance)
+ continue; // Should not insert.
+ }
+
+ Tails.push_back(j);
+ Heads.push_back(i);
+ ConsecutiveChain[i] = j;
+ }
+ }
+ }
+
+ bool Changed = false;
+ SmallPtrSet<Instruction *, 16> InstructionsProcessed;
+
+ for (int Head : Heads) {
+ if (InstructionsProcessed.count(Instrs[Head]))
+ continue;
+ bool LongerChainExists = false;
+ for (unsigned TIt = 0; TIt < Tails.size(); TIt++)
+ if (Head == Tails[TIt] &&
+ !InstructionsProcessed.count(Instrs[Heads[TIt]])) {
+ LongerChainExists = true;
+ break;
+ }
+ if (LongerChainExists)
+ continue;
+
+ // We found an instr that starts a chain. Now follow the chain and try to
+ // vectorize it.
+ SmallVector<Instruction *, 16> Operands;
+ int I = Head;
+ while (I != -1 && (is_contained(Tails, I) || is_contained(Heads, I))) {
+ if (InstructionsProcessed.count(Instrs[I]))
+ break;
+
+ Operands.push_back(Instrs[I]);
+ I = ConsecutiveChain[I];
+ }
+
+ bool Vectorized = false;
+ if (isa<LoadInst>(*Operands.begin()))
+ Vectorized = vectorizeLoadChain(Operands, &InstructionsProcessed);
+ else
+ Vectorized = vectorizeStoreChain(Operands, &InstructionsProcessed);
+
+ Changed |= Vectorized;
+ }
+
+ return Changed;
+}
+
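+// Vectorize a chain of consecutive stores: pick a common element type, check
+// legality and alignment (splitting the chain when necessary), then build one
+// wide vector from the scalar operands and emit a single vector store.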
+bool Vectorizer::vectorizeStoreChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ StoreInst *S0 = cast<StoreInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole store.
+ Type *StoreTy = nullptr;
+ for (Instruction *I : Chain) {
+ StoreTy = cast<StoreInst>(I)->getValueOperand()->getType();
+ if (StoreTy->isIntOrIntVectorTy())
+ break;
+
+ if (StoreTy->isPtrOrPtrVectorTy()) {
+ StoreTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(StoreTy));
+ break;
+ }
+ }
+ assert(StoreTy && "Failed to find store type");
+
+ unsigned Sz = DL.getTypeSizeInBits(StoreTy);
+ unsigned AS = S0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ Align Alignment = S0->getAlign();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+
FixedVectorType *VecTy;
auto *VecStoreTy = dyn_cast<FixedVectorType>(StoreTy);
- if (VecStoreTy)
- VecTy = FixedVectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
- else
- VecTy = FixedVectorType::get(StoreTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- return vectorizeStoreChain(Chain.slice(0, TargetVF),
- InstructionsProcessed) |
- vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Stores to vectorize:\n";
- for (Instruction *I : Chain)
- dbgs() << " " << *I << "\n";
- });
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the store is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
- if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
- }
-
- Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, S0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
- vectorizeStoreChain(Chains.second, InstructionsProcessed);
- }
-
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*Last);
-
- Value *Vec = UndefValue::get(VecTy);
-
- if (VecStoreTy) {
- unsigned VecWidth = VecStoreTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
- unsigned NewIdx = J + I * VecWidth;
- Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
- Builder.getInt32(J));
- if (Extract->getType() != StoreTy->getScalarType())
- Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
- Vec = Insert;
- }
- }
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- StoreInst *Store = cast<StoreInst>(Chain[I]);
- Value *Extract = Store->getValueOperand();
- if (Extract->getType() != StoreTy->getScalarType())
- Extract =
- Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
-
- Value *Insert =
- Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
- Vec = Insert;
- }
- }
-
- StoreInst *SI = Builder.CreateAlignedStore(
- Vec,
- Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
- Alignment);
- propagateMetadata(SI, Chain);
-
- eraseInstructions(Chain);
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::vectorizeLoadChain(
- ArrayRef<Instruction *> Chain,
- SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
- LoadInst *L0 = cast<LoadInst>(Chain[0]);
-
- // If the vector has an int element, default to int for the whole load.
- Type *LoadTy = nullptr;
- for (const auto &V : Chain) {
- LoadTy = cast<LoadInst>(V)->getType();
- if (LoadTy->isIntOrIntVectorTy())
- break;
-
- if (LoadTy->isPtrOrPtrVectorTy()) {
- LoadTy = Type::getIntNTy(F.getParent()->getContext(),
- DL.getTypeSizeInBits(LoadTy));
- break;
- }
- }
- assert(LoadTy && "Can't determine LoadInst type from chain");
-
- unsigned Sz = DL.getTypeSizeInBits(LoadTy);
- unsigned AS = L0->getPointerAddressSpace();
- unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
- unsigned VF = VecRegSize / Sz;
- unsigned ChainSize = Chain.size();
- Align Alignment = L0->getAlign();
-
- if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
-
- ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
- if (NewChain.empty()) {
- // No vectorization possible.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
- return false;
- }
- if (NewChain.size() == 1) {
- // Failed after the first instruction. Discard it and try the smaller chain.
- InstructionsProcessed->insert(NewChain.front());
- return false;
- }
-
- // Update Chain to the valid vectorizable subchain.
- Chain = NewChain;
- ChainSize = Chain.size();
-
- // Check if it's legal to vectorize this chain. If not, split the chain and
- // try again.
- unsigned EltSzInBytes = Sz / 8;
- unsigned SzInBytes = EltSzInBytes * ChainSize;
- VectorType *VecTy;
+ if (VecStoreTy)
+ VecTy = FixedVectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
+ else
+ VecTy = FixedVectorType::get(StoreTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getStoreVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeStoreChain(Chain.slice(0, TargetVF),
+ InstructionsProcessed) |
+ vectorizeStoreChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Stores to vectorize:\n";
+ for (Instruction *I : Chain)
+ dbgs() << " " << *I << "\n";
+ });
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the store is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
+ if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, S0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
+ }
+
+ if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+ vectorizeStoreChain(Chains.second, InstructionsProcessed);
+ }
+
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*Last);
+
+ Value *Vec = UndefValue::get(VecTy);
+
+ if (VecStoreTy) {
+ unsigned VecWidth = VecStoreTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ for (unsigned J = 0, NE = VecStoreTy->getNumElements(); J != NE; ++J) {
+ unsigned NewIdx = J + I * VecWidth;
+ Value *Extract = Builder.CreateExtractElement(Store->getValueOperand(),
+ Builder.getInt32(J));
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract = Builder.CreateBitCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(NewIdx));
+ Vec = Insert;
+ }
+ }
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ StoreInst *Store = cast<StoreInst>(Chain[I]);
+ Value *Extract = Store->getValueOperand();
+ if (Extract->getType() != StoreTy->getScalarType())
+ Extract =
+ Builder.CreateBitOrPointerCast(Extract, StoreTy->getScalarType());
+
+ Value *Insert =
+ Builder.CreateInsertElement(Vec, Extract, Builder.getInt32(I));
+ Vec = Insert;
+ }
+ }
+
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Vec,
+ Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+ Alignment);
+ propagateMetadata(SI, Chain);
+
+ eraseInstructions(Chain);
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
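+// Vectorize a chain of consecutive loads: emit one wide vector load and
+// replace each scalar load with extracts from it, splitting the chain when
+// legality or alignment checks fail.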
+bool Vectorizer::vectorizeLoadChain(
+ ArrayRef<Instruction *> Chain,
+ SmallPtrSet<Instruction *, 16> *InstructionsProcessed) {
+ LoadInst *L0 = cast<LoadInst>(Chain[0]);
+
+ // If the vector has an int element, default to int for the whole load.
+ Type *LoadTy = nullptr;
+ for (const auto &V : Chain) {
+ LoadTy = cast<LoadInst>(V)->getType();
+ if (LoadTy->isIntOrIntVectorTy())
+ break;
+
+ if (LoadTy->isPtrOrPtrVectorTy()) {
+ LoadTy = Type::getIntNTy(F.getParent()->getContext(),
+ DL.getTypeSizeInBits(LoadTy));
+ break;
+ }
+ }
+ assert(LoadTy && "Can't determine LoadInst type from chain");
+
+ unsigned Sz = DL.getTypeSizeInBits(LoadTy);
+ unsigned AS = L0->getPointerAddressSpace();
+ unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
+ unsigned VF = VecRegSize / Sz;
+ unsigned ChainSize = Chain.size();
+ Align Alignment = L0->getAlign();
+
+ if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+
+ ArrayRef<Instruction *> NewChain = getVectorizablePrefix(Chain);
+ if (NewChain.empty()) {
+ // No vectorization possible.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+ return false;
+ }
+ if (NewChain.size() == 1) {
+ // Failed after the first instruction. Discard it and try the smaller chain.
+ InstructionsProcessed->insert(NewChain.front());
+ return false;
+ }
+
+ // Update Chain to the valid vectorizable subchain.
+ Chain = NewChain;
+ ChainSize = Chain.size();
+
+ // Check if it's legal to vectorize this chain. If not, split the chain and
+ // try again.
+ unsigned EltSzInBytes = Sz / 8;
+ unsigned SzInBytes = EltSzInBytes * ChainSize;
+ VectorType *VecTy;
auto *VecLoadTy = dyn_cast<FixedVectorType>(LoadTy);
- if (VecLoadTy)
- VecTy = FixedVectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
- else
- VecTy = FixedVectorType::get(LoadTy, Chain.size());
-
- // If it's more than the max vector size or the target has a better
- // vector factor, break it into two pieces.
- unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
- if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
- LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
- " Creating two separate arrays.\n");
- return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
- vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
- }
-
- // We won't try again to vectorize the elements of the chain, regardless of
- // whether we succeed below.
- InstructionsProcessed->insert(Chain.begin(), Chain.end());
-
- // If the load is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
- if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
- }
-
- Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
- Align(StackAdjustedAlignment),
- DL, L0, nullptr, &DT);
- if (NewAlign >= Alignment)
- Alignment = NewAlign;
- else
- return false;
- }
-
- if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
- auto Chains = splitOddVectorElts(Chain, Sz);
- return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
- vectorizeLoadChain(Chains.second, InstructionsProcessed);
- }
-
- LLVM_DEBUG({
- dbgs() << "LSV: Loads to vectorize:\n";
- for (Instruction *I : Chain)
- I->dump();
- });
-
- // getVectorizablePrefix already computed getBoundaryInstrs. The value of
- // Last may have changed since then, but the value of First won't have. If it
- // matters, we could compute getBoundaryInstrs only once and reuse it here.
- BasicBlock::iterator First, Last;
- std::tie(First, Last) = getBoundaryInstrs(Chain);
- Builder.SetInsertPoint(&*First);
-
- Value *Bitcast =
- Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI =
- Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
- propagateMetadata(LI, Chain);
-
- if (VecLoadTy) {
- SmallVector<Instruction *, 16> InstrsToErase;
-
- unsigned VecWidth = VecLoadTy->getNumElements();
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- for (auto Use : Chain[I]->users()) {
- // All users of vector loads are ExtractElement instructions with
- // constant indices, otherwise we would have bailed before now.
- Instruction *UI = cast<Instruction>(Use);
- unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
- unsigned NewIdx = Idx + I * VecWidth;
- Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
- UI->getName());
- if (V->getType() != UI->getType())
- V = Builder.CreateBitCast(V, UI->getType());
-
- // Replace the old instruction.
- UI->replaceAllUsesWith(V);
- InstrsToErase.push_back(UI);
- }
- }
-
- // Bitcast might not be an Instruction, if the value being loaded is a
- // constant. In that case, no need to reorder anything.
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
-
- for (auto I : InstrsToErase)
- I->eraseFromParent();
- } else {
- for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
- Value *CV = Chain[I];
- Value *V =
- Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
- if (V->getType() != CV->getType()) {
- V = Builder.CreateBitOrPointerCast(V, CV->getType());
- }
-
- // Replace the old instruction.
- CV->replaceAllUsesWith(V);
- }
-
- if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
- reorder(BitcastInst);
- }
-
- eraseInstructions(Chain);
-
- ++NumVectorInstructions;
- NumScalarsVectorized += Chain.size();
- return true;
-}
-
-bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
- unsigned Alignment) {
- if (Alignment % SzInBytes == 0)
- return false;
-
- bool Fast = false;
- bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
- SzInBytes * 8, AddressSpace,
- Alignment, &Fast);
- LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
- << " and fast? " << Fast << "\n";);
- return !Allows || !Fast;
-}
+ if (VecLoadTy)
+ VecTy = FixedVectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
+ else
+ VecTy = FixedVectorType::get(LoadTy, Chain.size());
+
+ // If it's more than the max vector size or the target has a better
+ // vector factor, break it into two pieces.
+ unsigned TargetVF = TTI.getLoadVectorFactor(VF, Sz, SzInBytes, VecTy);
+ if (ChainSize > VF || (VF != TargetVF && TargetVF < ChainSize)) {
+ LLVM_DEBUG(dbgs() << "LSV: Chain doesn't match with the vector factor."
+ " Creating two separate arrays.\n");
+ return vectorizeLoadChain(Chain.slice(0, TargetVF), InstructionsProcessed) |
+ vectorizeLoadChain(Chain.slice(TargetVF), InstructionsProcessed);
+ }
+
+ // We won't try again to vectorize the elements of the chain, regardless of
+ // whether we succeed below.
+ InstructionsProcessed->insert(Chain.begin(), Chain.end());
+
+ // If the load is going to be misaligned, don't vectorize it.
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
+ if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, L0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
+ }
+
+ if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+ auto Chains = splitOddVectorElts(Chain, Sz);
+ return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+ vectorizeLoadChain(Chains.second, InstructionsProcessed);
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LSV: Loads to vectorize:\n";
+ for (Instruction *I : Chain)
+ I->dump();
+ });
+
+ // getVectorizablePrefix already computed getBoundaryInstrs. The value of
+ // Last may have changed since then, but the value of First won't have. If it
+ // matters, we could compute getBoundaryInstrs only once and reuse it here.
+ BasicBlock::iterator First, Last;
+ std::tie(First, Last) = getBoundaryInstrs(Chain);
+ Builder.SetInsertPoint(&*First);
+
+ Value *Bitcast =
+ Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
+ LoadInst *LI =
+ Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
+ propagateMetadata(LI, Chain);
+
+ if (VecLoadTy) {
+ SmallVector<Instruction *, 16> InstrsToErase;
+
+ unsigned VecWidth = VecLoadTy->getNumElements();
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ for (auto Use : Chain[I]->users()) {
+ // All users of vector loads are ExtractElement instructions with
+ // constant indices, otherwise we would have bailed before now.
+ Instruction *UI = cast<Instruction>(Use);
+ unsigned Idx = cast<ConstantInt>(UI->getOperand(1))->getZExtValue();
+ unsigned NewIdx = Idx + I * VecWidth;
+ Value *V = Builder.CreateExtractElement(LI, Builder.getInt32(NewIdx),
+ UI->getName());
+ if (V->getType() != UI->getType())
+ V = Builder.CreateBitCast(V, UI->getType());
+
+ // Replace the old instruction.
+ UI->replaceAllUsesWith(V);
+ InstrsToErase.push_back(UI);
+ }
+ }
+
+    // Bitcast might not be an Instruction if the pointer operand is a
+    // constant. In that case, there is nothing to reorder.
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+
+ for (auto I : InstrsToErase)
+ I->eraseFromParent();
+ } else {
+ for (unsigned I = 0, E = Chain.size(); I != E; ++I) {
+ Value *CV = Chain[I];
+ Value *V =
+ Builder.CreateExtractElement(LI, Builder.getInt32(I), CV->getName());
+ if (V->getType() != CV->getType()) {
+ V = Builder.CreateBitOrPointerCast(V, CV->getType());
+ }
+
+ // Replace the old instruction.
+ CV->replaceAllUsesWith(V);
+ }
+
+ if (Instruction *BitcastInst = dyn_cast<Instruction>(Bitcast))
+ reorder(BitcastInst);
+ }
+
+ eraseInstructions(Chain);
+
+ ++NumVectorInstructions;
+ NumScalarsVectorized += Chain.size();
+ return true;
+}
+
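+// An access is considered misaligned unless its alignment is a multiple of its
+// size, or the target reports that a misaligned access of this size in this
+// address space is both allowed and fast.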
+bool Vectorizer::accessIsMisaligned(unsigned SzInBytes, unsigned AddressSpace,
+ unsigned Alignment) {
+ if (Alignment % SzInBytes == 0)
+ return false;
+
+ bool Fast = false;
+ bool Allows = TTI.allowsMisalignedMemoryAccesses(F.getParent()->getContext(),
+ SzInBytes * 8, AddressSpace,
+ Alignment, &Fast);
+ LLVM_DEBUG(dbgs() << "LSV: Target said misaligned is allowed? " << Allows
+ << " and fast? " << Fast << "\n";);
+ return !Allows || !Fast;
+}
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 184340599b..b8c21a0e1c 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1,1099 +1,1099 @@
-//===- LoopVectorizationLegality.cpp --------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides loop vectorization legality analysis. Original code
-// resided in LoopVectorize.cpp for a long time.
-//
-// At this point, it is implemented as a utility class, not as an analysis
-// pass. It should be easy to create an analysis pass around it if there
-// is a need (but D45420 needs to happen first).
-//
-
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/LoopInfo.h"
+//===- LoopVectorizationLegality.cpp --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides loop vectorization legality analysis. Original code
+// resided in LoopVectorize.cpp for a long time.
+//
+// At this point, it is implemented as a utility class, not as an analysis
+// pass. It should be easy to create an analysis pass around it if there
+// is a need (but D45420 needs to happen first).
+//
+
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/PatternMatch.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-
-using namespace llvm;
-using namespace PatternMatch;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
-extern cl::opt<bool> EnableVPlanPredication;
-
-static cl::opt<bool>
- EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
- cl::desc("Enable if-conversion during vectorization."));
-
-static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
- "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum allowed number of runtime memory checks with a "
- "vectorize(enable) pragma."));
-
-static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
- "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed."));
-
-static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
- "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
- cl::desc("The maximum number of SCEV checks allowed with a "
- "vectorize(enable) pragma"));
-
-/// Maximum vectorization interleave count.
-static const unsigned MaxInterleaveFactor = 16;
-
-namespace llvm {
-
-bool LoopVectorizeHints::Hint::validate(unsigned Val) {
- switch (Kind) {
- case HK_WIDTH:
- return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
- case HK_UNROLL:
- return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
- case HK_FORCE:
- return (Val <= 1);
- case HK_ISVECTORIZED:
- case HK_PREDICATE:
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+extern cl::opt<bool> EnableVPlanPredication;
+
+static cl::opt<bool>
+ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
+
+static cl::opt<unsigned> PragmaVectorizeMemoryCheckThreshold(
+ "pragma-vectorize-memory-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum allowed number of runtime memory checks with a "
+ "vectorize(enable) pragma."));
+
+static cl::opt<unsigned> VectorizeSCEVCheckThreshold(
+ "vectorize-scev-check-threshold", cl::init(16), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed."));
+
+static cl::opt<unsigned> PragmaVectorizeSCEVCheckThreshold(
+ "pragma-vectorize-scev-check-threshold", cl::init(128), cl::Hidden,
+ cl::desc("The maximum number of SCEV checks allowed with a "
+ "vectorize(enable) pragma"));
+
+/// Maximum vectorization interleave count.
+static const unsigned MaxInterleaveFactor = 16;
+
+namespace llvm {
+
+bool LoopVectorizeHints::Hint::validate(unsigned Val) {
+ switch (Kind) {
+ case HK_WIDTH:
+ return isPowerOf2_32(Val) && Val <= VectorizerParams::MaxVectorWidth;
+ case HK_UNROLL:
+ return isPowerOf2_32(Val) && Val <= MaxInterleaveFactor;
+ case HK_FORCE:
+ return (Val <= 1);
+ case HK_ISVECTORIZED:
+ case HK_PREDICATE:
case HK_SCALABLE:
- return (Val == 0 || Val == 1);
- }
- return false;
-}
-
-LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
- bool InterleaveOnlyWhenForced,
- OptimizationRemarkEmitter &ORE)
- : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
- Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
- Force("vectorize.enable", FK_Undefined, HK_FORCE),
- IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
+ return (Val == 0 || Val == 1);
+ }
+ return false;
+}
+
+LoopVectorizeHints::LoopVectorizeHints(const Loop *L,
+ bool InterleaveOnlyWhenForced,
+ OptimizationRemarkEmitter &ORE)
+ : Width("vectorize.width", VectorizerParams::VectorizationFactor, HK_WIDTH),
+ Interleave("interleave.count", InterleaveOnlyWhenForced, HK_UNROLL),
+ Force("vectorize.enable", FK_Undefined, HK_FORCE),
+ IsVectorized("isvectorized", 0, HK_ISVECTORIZED),
Predicate("vectorize.predicate.enable", FK_Undefined, HK_PREDICATE),
Scalable("vectorize.scalable.enable", false, HK_SCALABLE), TheLoop(L),
- ORE(ORE) {
- // Populate values with existing loop metadata.
- getHintsFromMetadata();
-
- // force-vector-interleave overrides DisableInterleaving.
- if (VectorizerParams::isInterleaveForced())
- Interleave.Value = VectorizerParams::VectorizationInterleave;
-
- if (IsVectorized.Value != 1)
- // If the vectorization width and interleaving count are both 1 then
- // consider the loop to have been already vectorized because there's
- // nothing more that we can do.
+ ORE(ORE) {
+ // Populate values with existing loop metadata.
+ getHintsFromMetadata();
+
+ // force-vector-interleave overrides DisableInterleaving.
+ if (VectorizerParams::isInterleaveForced())
+ Interleave.Value = VectorizerParams::VectorizationInterleave;
+
+ if (IsVectorized.Value != 1)
+ // If the vectorization width and interleaving count are both 1 then
+ // consider the loop to have been already vectorized because there's
+ // nothing more that we can do.
IsVectorized.Value =
getWidth() == ElementCount::getFixed(1) && Interleave.Value == 1;
- LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
- << "LV: Interleaving disabled by the pass manager\n");
-}
-
-void LoopVectorizeHints::setAlreadyVectorized() {
- LLVMContext &Context = TheLoop->getHeader()->getContext();
-
- MDNode *IsVectorizedMD = MDNode::get(
- Context,
- {MDString::get(Context, "llvm.loop.isvectorized"),
- ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
- MDNode *LoopID = TheLoop->getLoopID();
- MDNode *NewLoopID =
- makePostTransformationMetadata(Context, LoopID,
- {Twine(Prefix(), "vectorize.").str(),
- Twine(Prefix(), "interleave.").str()},
- {IsVectorizedMD});
- TheLoop->setLoopID(NewLoopID);
-
- // Update internal cache.
- IsVectorized.Value = 1;
-}
-
-bool LoopVectorizeHints::allowVectorization(
- Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
- if (getForce() == LoopVectorizeHints::FK_Disabled) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
- emitRemarkWithHints();
- return false;
- }
-
- if (getIsVectorized() == 1) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
- // FIXME: Add interleave.disable metadata. This will allow
- // vectorize.disable to be used without disabling the pass and errors
- // to differentiate between disabled vectorization and a width of 1.
- ORE.emit([&]() {
- return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
- "AllDisabled", L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: vectorization and interleaving are "
- "explicitly disabled, or the loop has already been "
- "vectorized";
- });
- return false;
- }
-
- return true;
-}
-
-void LoopVectorizeHints::emitRemarkWithHints() const {
- using namespace ore;
-
- ORE.emit([&]() {
- if (Force.Value == LoopVectorizeHints::FK_Disabled)
- return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
- TheLoop->getStartLoc(),
- TheLoop->getHeader())
- << "loop not vectorized: vectorization is explicitly disabled";
- else {
- OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
- TheLoop->getStartLoc(), TheLoop->getHeader());
- R << "loop not vectorized";
- if (Force.Value == LoopVectorizeHints::FK_Enabled) {
- R << " (Force=" << NV("Force", true);
- if (Width.Value != 0)
+ LLVM_DEBUG(if (InterleaveOnlyWhenForced && Interleave.Value == 1) dbgs()
+ << "LV: Interleaving disabled by the pass manager\n");
+}
+
+void LoopVectorizeHints::setAlreadyVectorized() {
+ LLVMContext &Context = TheLoop->getHeader()->getContext();
+
+ MDNode *IsVectorizedMD = MDNode::get(
+ Context,
+ {MDString::get(Context, "llvm.loop.isvectorized"),
+ ConstantAsMetadata::get(ConstantInt::get(Context, APInt(32, 1)))});
+ MDNode *LoopID = TheLoop->getLoopID();
+ MDNode *NewLoopID =
+ makePostTransformationMetadata(Context, LoopID,
+ {Twine(Prefix(), "vectorize.").str(),
+ Twine(Prefix(), "interleave.").str()},
+ {IsVectorizedMD});
+ TheLoop->setLoopID(NewLoopID);
+
+ // Update internal cache.
+ IsVectorized.Value = 1;
+}
+
+bool LoopVectorizeHints::allowVectorization(
+ Function *F, Loop *L, bool VectorizeOnlyWhenForced) const {
+ if (getForce() == LoopVectorizeHints::FK_Disabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: #pragma vectorize disable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (VectorizeOnlyWhenForced && getForce() != LoopVectorizeHints::FK_Enabled) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: No #pragma vectorize enable.\n");
+ emitRemarkWithHints();
+ return false;
+ }
+
+ if (getIsVectorized() == 1) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Disabled/already vectorized.\n");
+ // FIXME: Add interleave.disable metadata. This will allow
+ // vectorize.disable to be used without disabling the pass and errors
+ // to differentiate between disabled vectorization and a width of 1.
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysis(vectorizeAnalysisPassName(),
+ "AllDisabled", L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: vectorization and interleaving are "
+ "explicitly disabled, or the loop has already been "
+ "vectorized";
+ });
+ return false;
+ }
+
+ return true;
+}
+
+void LoopVectorizeHints::emitRemarkWithHints() const {
+ using namespace ore;
+
+ ORE.emit([&]() {
+ if (Force.Value == LoopVectorizeHints::FK_Disabled)
+ return OptimizationRemarkMissed(LV_NAME, "MissedExplicitlyDisabled",
+ TheLoop->getStartLoc(),
+ TheLoop->getHeader())
+ << "loop not vectorized: vectorization is explicitly disabled";
+ else {
+ OptimizationRemarkMissed R(LV_NAME, "MissedDetails",
+ TheLoop->getStartLoc(), TheLoop->getHeader());
+ R << "loop not vectorized";
+ if (Force.Value == LoopVectorizeHints::FK_Enabled) {
+ R << " (Force=" << NV("Force", true);
+ if (Width.Value != 0)
R << ", Vector Width=" << NV("VectorWidth", getWidth());
- if (Interleave.Value != 0)
- R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
- R << ")";
- }
- return R;
- }
- });
-}
-
-const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
+ if (Interleave.Value != 0)
+ R << ", Interleave Count=" << NV("InterleaveCount", Interleave.Value);
+ R << ")";
+ }
+ return R;
+ }
+ });
+}
+
+const char *LoopVectorizeHints::vectorizeAnalysisPassName() const {
if (getWidth() == ElementCount::getFixed(1))
- return LV_NAME;
- if (getForce() == LoopVectorizeHints::FK_Disabled)
- return LV_NAME;
+ return LV_NAME;
+ if (getForce() == LoopVectorizeHints::FK_Disabled)
+ return LV_NAME;
if (getForce() == LoopVectorizeHints::FK_Undefined && getWidth().isZero())
- return LV_NAME;
- return OptimizationRemarkAnalysis::AlwaysPrint;
-}
-
-void LoopVectorizeHints::getHintsFromMetadata() {
- MDNode *LoopID = TheLoop->getLoopID();
- if (!LoopID)
- return;
-
- // First operand should refer to the loop id itself.
- assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
- assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
-
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- const MDString *S = nullptr;
- SmallVector<Metadata *, 4> Args;
-
- // The expected hint is either a MDString or a MDNode with the first
- // operand a MDString.
- if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
- if (!MD || MD->getNumOperands() == 0)
- continue;
- S = dyn_cast<MDString>(MD->getOperand(0));
- for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
- Args.push_back(MD->getOperand(i));
- } else {
- S = dyn_cast<MDString>(LoopID->getOperand(i));
- assert(Args.size() == 0 && "too many arguments for MDString");
- }
-
- if (!S)
- continue;
-
- // Check if the hint starts with the loop metadata prefix.
- StringRef Name = S->getString();
- if (Args.size() == 1)
- setHint(Name, Args[0]);
- }
-}
-
-void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
- if (!Name.startswith(Prefix()))
- return;
- Name = Name.substr(Prefix().size(), StringRef::npos);
-
- const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
- if (!C)
- return;
- unsigned Val = C->getZExtValue();
-
+ return LV_NAME;
+ return OptimizationRemarkAnalysis::AlwaysPrint;
+}
+
+void LoopVectorizeHints::getHintsFromMetadata() {
+ MDNode *LoopID = TheLoop->getLoopID();
+ if (!LoopID)
+ return;
+
+ // First operand should refer to the loop id itself.
+ assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
+ assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
+
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ const MDString *S = nullptr;
+ SmallVector<Metadata *, 4> Args;
+
+ // The expected hint is either a MDString or a MDNode with the first
+ // operand a MDString.
+ if (const MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i))) {
+ if (!MD || MD->getNumOperands() == 0)
+ continue;
+ S = dyn_cast<MDString>(MD->getOperand(0));
+ for (unsigned i = 1, ie = MD->getNumOperands(); i < ie; ++i)
+ Args.push_back(MD->getOperand(i));
+ } else {
+ S = dyn_cast<MDString>(LoopID->getOperand(i));
+ assert(Args.size() == 0 && "too many arguments for MDString");
+ }
+
+ if (!S)
+ continue;
+
+ // Check if the hint starts with the loop metadata prefix.
+ StringRef Name = S->getString();
+ if (Args.size() == 1)
+ setHint(Name, Args[0]);
+ }
+}
+
+void LoopVectorizeHints::setHint(StringRef Name, Metadata *Arg) {
+ if (!Name.startswith(Prefix()))
+ return;
+ Name = Name.substr(Prefix().size(), StringRef::npos);
+
+ const ConstantInt *C = mdconst::dyn_extract<ConstantInt>(Arg);
+ if (!C)
+ return;
+ unsigned Val = C->getZExtValue();
+
Hint *Hints[] = {&Width, &Interleave, &Force,
&IsVectorized, &Predicate, &Scalable};
- for (auto H : Hints) {
- if (Name == H->Name) {
- if (H->validate(Val))
- H->Value = Val;
- else
- LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
- break;
- }
- }
-}
-
-bool LoopVectorizationRequirements::doesNotMeet(
- Function *F, Loop *L, const LoopVectorizeHints &Hints) {
- const char *PassName = Hints.vectorizeAnalysisPassName();
- bool Failed = false;
- if (UnsafeAlgebraInst && !Hints.allowReordering()) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisFPCommute(
- PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
- UnsafeAlgebraInst->getParent())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "floating-point operations";
- });
- Failed = true;
- }
-
- // Test if runtime memcheck thresholds are exceeded.
- bool PragmaThresholdReached =
- NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
- bool ThresholdReached =
- NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
- if ((ThresholdReached && !Hints.allowReordering()) ||
- PragmaThresholdReached) {
- ORE.emit([&]() {
- return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
- L->getStartLoc(),
- L->getHeader())
- << "loop not vectorized: cannot prove it is safe to reorder "
- "memory operations";
- });
- LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
- Failed = true;
- }
-
- return Failed;
-}
-
-// Return true if the inner loop \p Lp is uniform with regard to the outer loop
-// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
-// executing the inner loop will execute the same iterations). This check is
-// very constrained for now but it will be relaxed in the future. \p Lp is
-// considered uniform if it meets all the following conditions:
-// 1) it has a canonical IV (starting from 0 and with stride 1),
-// 2) its latch terminator is a conditional branch and,
-// 3) its latch condition is a compare instruction whose operands are the
-// canonical IV and an OuterLp invariant.
-// This check doesn't take into account the uniformity of other conditions not
-// related to the loop latch because they don't affect the loop uniformity.
-//
- // NOTE: We decided to keep all these checks and their associated documentation
-// together so that we can easily have a picture of the current supported loop
-// nests. However, some of the current checks don't depend on \p OuterLp and
-// would be redundantly executed for each \p Lp if we invoked this function for
-// different candidate outer loops. This is not the case for now because we
-// don't currently have the infrastructure to evaluate multiple candidate outer
-// loops and \p OuterLp will be a fixed parameter while we only support explicit
-// outer loop vectorization. It's also very likely that these checks go away
-// before introducing the aforementioned infrastructure. However, if this is not
-// the case, we should move the \p OuterLp independent checks to a separate
-// function that is only executed once for each \p Lp.
-static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
- assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
-
- // If Lp is the outer loop, it's uniform by definition.
- if (Lp == OuterLp)
- return true;
- assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
-
- // 1.
- PHINode *IV = Lp->getCanonicalInductionVariable();
- if (!IV) {
- LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
- return false;
- }
-
- // 2.
- BasicBlock *Latch = Lp->getLoopLatch();
- auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
- if (!LatchBr || LatchBr->isUnconditional()) {
- LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
- return false;
- }
-
- // 3.
- auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
- if (!LatchCmp) {
- LLVM_DEBUG(
- dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
- return false;
- }
-
- Value *CondOp0 = LatchCmp->getOperand(0);
- Value *CondOp1 = LatchCmp->getOperand(1);
- Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
- if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
- !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
- LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
- return false;
- }
-
- return true;
-}
-
-// Return true if \p Lp and all its nested loops are uniform with regard to \p
-// OuterLp.
-static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
- if (!isUniformLoop(Lp, OuterLp))
- return false;
-
- // Check if nested loops are uniform.
- for (Loop *SubLp : *Lp)
- if (!isUniformLoopNest(SubLp, OuterLp))
- return false;
-
- return true;
-}
-
-/// Check whether it is safe to if-convert this phi node.
-///
-/// Phi nodes with constant expressions that can trap are not safe to if
-/// convert.
-static bool canIfConvertPHINodes(BasicBlock *BB) {
- for (PHINode &Phi : BB->phis()) {
- for (Value *V : Phi.incoming_values())
- if (auto *C = dyn_cast<Constant>(V))
- if (C->canTrap())
- return false;
- }
- return true;
-}
-
-static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
- if (Ty->isPointerTy())
- return DL.getIntPtrType(Ty);
-
- // It is possible that chars or shorts overflow when we ask for the loop's
- // trip count; work around this by changing the type size.
- if (Ty->getScalarSizeInBits() < 32)
- return Type::getInt32Ty(Ty->getContext());
-
- return Ty;
-}
-
-static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
- Ty0 = convertPointerToIntegerType(DL, Ty0);
- Ty1 = convertPointerToIntegerType(DL, Ty1);
- if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
- return Ty0;
- return Ty1;
-}
-
-/// Check that the instruction has outside loop users and is not an
-/// identified reduction variable.
-static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- // Reductions, Inductions and non-header phis are allowed to have exit users. All
- // other instructions must not have external users.
- if (!AllowedExit.count(Inst))
- // Check that all of the users of the instruction are inside the loop.
- for (User *U : Inst->users()) {
- Instruction *UI = cast<Instruction>(U);
- // This user may be a reduction exit value.
- if (!TheLoop->contains(UI)) {
- LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
- return true;
- }
- }
- return false;
-}
-
-int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
- const ValueToValueMap &Strides =
- getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
-
+ for (auto H : Hints) {
+ if (Name == H->Name) {
+ if (H->validate(Val))
+ H->Value = Val;
+ else
+ LLVM_DEBUG(dbgs() << "LV: ignoring invalid hint '" << Name << "'\n");
+ break;
+ }
+ }
+}
+
+bool LoopVectorizationRequirements::doesNotMeet(
+ Function *F, Loop *L, const LoopVectorizeHints &Hints) {
+ const char *PassName = Hints.vectorizeAnalysisPassName();
+ bool Failed = false;
+ if (UnsafeAlgebraInst && !Hints.allowReordering()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisFPCommute(
+ PassName, "CantReorderFPOps", UnsafeAlgebraInst->getDebugLoc(),
+ UnsafeAlgebraInst->getParent())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "floating-point operations";
+ });
+ Failed = true;
+ }
+
+ // Test if runtime memcheck thresholds are exceeded.
+ bool PragmaThresholdReached =
+ NumRuntimePointerChecks > PragmaVectorizeMemoryCheckThreshold;
+ bool ThresholdReached =
+ NumRuntimePointerChecks > VectorizerParams::RuntimeMemoryCheckThreshold;
+ if ((ThresholdReached && !Hints.allowReordering()) ||
+ PragmaThresholdReached) {
+ ORE.emit([&]() {
+ return OptimizationRemarkAnalysisAliasing(PassName, "CantReorderMemOps",
+ L->getStartLoc(),
+ L->getHeader())
+ << "loop not vectorized: cannot prove it is safe to reorder "
+ "memory operations";
+ });
+ LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
+ Failed = true;
+ }
+
+ return Failed;
+}
+
+// Return true if the inner loop \p Lp is uniform with regard to the outer loop
+// \p OuterLp (i.e., if the outer loop is vectorized, all the vector lanes
+// executing the inner loop will execute the same iterations). This check is
+// very constrained for now but it will be relaxed in the future. \p Lp is
+// considered uniform if it meets all the following conditions:
+// 1) it has a canonical IV (starting from 0 and with stride 1),
+// 2) its latch terminator is a conditional branch and,
+// 3) its latch condition is a compare instruction whose operands are the
+// canonical IV and an OuterLp invariant.
+// This check doesn't take into account the uniformity of other conditions not
+// related to the loop latch because they don't affect the loop uniformity.
+//
+ // NOTE: We decided to keep all these checks and their associated documentation
+// together so that we can easily have a picture of the current supported loop
+// nests. However, some of the current checks don't depend on \p OuterLp and
+// would be redundantly executed for each \p Lp if we invoked this function for
+// different candidate outer loops. This is not the case for now because we
+// don't currently have the infrastructure to evaluate multiple candidate outer
+// loops and \p OuterLp will be a fixed parameter while we only support explicit
+// outer loop vectorization. It's also very likely that these checks go away
+// before introducing the aforementioned infrastructure. However, if this is not
+// the case, we should move the \p OuterLp independent checks to a separate
+// function that is only executed once for each \p Lp.
+static bool isUniformLoop(Loop *Lp, Loop *OuterLp) {
+ assert(Lp->getLoopLatch() && "Expected loop with a single latch.");
+
+ // If Lp is the outer loop, it's uniform by definition.
+ if (Lp == OuterLp)
+ return true;
+ assert(OuterLp->contains(Lp) && "OuterLp must contain Lp.");
+
+ // 1.
+ PHINode *IV = Lp->getCanonicalInductionVariable();
+ if (!IV) {
+ LLVM_DEBUG(dbgs() << "LV: Canonical IV not found.\n");
+ return false;
+ }
+
+ // 2.
+ BasicBlock *Latch = Lp->getLoopLatch();
+ auto *LatchBr = dyn_cast<BranchInst>(Latch->getTerminator());
+ if (!LatchBr || LatchBr->isUnconditional()) {
+ LLVM_DEBUG(dbgs() << "LV: Unsupported loop latch branch.\n");
+ return false;
+ }
+
+ // 3.
+ auto *LatchCmp = dyn_cast<CmpInst>(LatchBr->getCondition());
+ if (!LatchCmp) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop latch condition is not a compare instruction.\n");
+ return false;
+ }
+
+ Value *CondOp0 = LatchCmp->getOperand(0);
+ Value *CondOp1 = LatchCmp->getOperand(1);
+ Value *IVUpdate = IV->getIncomingValueForBlock(Latch);
+ if (!(CondOp0 == IVUpdate && OuterLp->isLoopInvariant(CondOp1)) &&
+ !(CondOp1 == IVUpdate && OuterLp->isLoopInvariant(CondOp0))) {
+ LLVM_DEBUG(dbgs() << "LV: Loop latch condition is not uniform.\n");
+ return false;
+ }
+
+ return true;
+}
+
+// Return true if \p Lp and all its nested loops are uniform with regard to \p
+// OuterLp.
+static bool isUniformLoopNest(Loop *Lp, Loop *OuterLp) {
+ if (!isUniformLoop(Lp, OuterLp))
+ return false;
+
+ // Check if nested loops are uniform.
+ for (Loop *SubLp : *Lp)
+ if (!isUniformLoopNest(SubLp, OuterLp))
+ return false;
+
+ return true;
+}
+
+/// Check whether it is safe to if-convert this phi node.
+///
+/// Phi nodes with constant expressions that can trap are not safe to if
+/// convert.
+static bool canIfConvertPHINodes(BasicBlock *BB) {
+ for (PHINode &Phi : BB->phis()) {
+ for (Value *V : Phi.incoming_values())
+ if (auto *C = dyn_cast<Constant>(V))
+ if (C->canTrap())
+ return false;
+ }
+ return true;
+}
+
+static Type *convertPointerToIntegerType(const DataLayout &DL, Type *Ty) {
+ if (Ty->isPointerTy())
+ return DL.getIntPtrType(Ty);
+
+ // It is possible that chars or shorts overflow when we ask for the loop's
+ // trip count; work around this by changing the type size.
+ if (Ty->getScalarSizeInBits() < 32)
+ return Type::getInt32Ty(Ty->getContext());
+
+ return Ty;
+}
+
+static Type *getWiderType(const DataLayout &DL, Type *Ty0, Type *Ty1) {
+ Ty0 = convertPointerToIntegerType(DL, Ty0);
+ Ty1 = convertPointerToIntegerType(DL, Ty1);
+ if (Ty0->getScalarSizeInBits() > Ty1->getScalarSizeInBits())
+ return Ty0;
+ return Ty1;
+}
+
+/// Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ // Reductions, Inductions and non-header phis are allowed to have exit users. All
+ // other instructions must not have external users.
+ if (!AllowedExit.count(Inst))
+ // Check that all of the users of the instruction are inside the loop.
+ for (User *U : Inst->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(UI)) {
+ LLVM_DEBUG(dbgs() << "LV: Found an outside user for : " << *UI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ const ValueToValueMap &Strides =
+ getSymbolicStrides() ? *getSymbolicStrides() : ValueToValueMap();
+
Function *F = TheLoop->getHeader()->getParent();
bool OptForSize = F->hasOptSize() ||
llvm::shouldOptimizeForSize(TheLoop->getHeader(), PSI, BFI,
PGSOQueryType::IRPass);
bool CanAddPredicate = !OptForSize;
- int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
- if (Stride == 1 || Stride == -1)
- return Stride;
- return 0;
-}
-
-bool LoopVectorizationLegality::isUniform(Value *V) {
- return LAI->isUniform(V);
-}
-
-bool LoopVectorizationLegality::canVectorizeOuterLoop() {
+ int Stride = getPtrStride(PSE, Ptr, TheLoop, Strides, CanAddPredicate, false);
+ if (Stride == 1 || Stride == -1)
+ return Stride;
+ return 0;
+}
+
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return LAI->isUniform(V);
+}
+
+bool LoopVectorizationLegality::canVectorizeOuterLoop() {
assert(!TheLoop->isInnermost() && "We are not vectorizing an outer loop.");
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
-
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Check whether the BB terminator is a BranchInst. Any other terminator is
- // not supported yet.
- auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
- if (!Br) {
- reportVectorizationFailure("Unsupported basic block terminator",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check whether the BranchInst is a supported one. Only unconditional
- // branches, conditional branches with an outer loop invariant condition or
- // backedges are supported.
- // FIXME: We skip these checks when VPlan predication is enabled as we
- // want to allow divergent branches. This whole check will be removed
- // once VPlan predication is on by default.
- if (!EnableVPlanPredication && Br && Br->isConditional() &&
- !TheLoop->isLoopInvariant(Br->getCondition()) &&
- !LI->isLoopHeader(Br->getSuccessor(0)) &&
- !LI->isLoopHeader(Br->getSuccessor(1))) {
- reportVectorizationFailure("Unsupported conditional branch",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
- }
-
- // Check whether inner loops are uniform. At this point, we only support
- // simple outer-loop scenarios with uniform nested loops.
- if (!isUniformLoopNest(TheLoop /*loop nest*/,
- TheLoop /*context outer loop*/)) {
- reportVectorizationFailure("Outer loop contains divergent loops",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check whether we are able to set up outer loop induction.
- if (!setupOuterLoopInductions()) {
- reportVectorizationFailure("Unsupported outer loop Phi(s)",
- "Unsupported outer loop Phi(s)",
- "UnsupportedPhi", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- return Result;
-}
-
-void LoopVectorizationLegality::addInductionPhi(
- PHINode *Phi, const InductionDescriptor &ID,
- SmallPtrSetImpl<Value *> &AllowedExit) {
- Inductions[Phi] = ID;
-
- // In case this induction also comes with casts that we know we can ignore
- // in the vectorized loop body, record them here. All casts could be recorded
- // here for ignoring, but it suffices to record only the first (as it is the
- // only one that may be used outside the cast sequence).
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (!Casts.empty())
- InductionCastsToIgnore.insert(*Casts.begin());
-
- Type *PhiTy = Phi->getType();
- const DataLayout &DL = Phi->getModule()->getDataLayout();
-
- // Get the widest type.
- if (!PhiTy->isFloatingPointTy()) {
- if (!WidestIndTy)
- WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
- else
- WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
- }
-
- // Int inductions are special because we only allow one IV.
- if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
- ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
- isa<Constant>(ID.getStartValue()) &&
- cast<Constant>(ID.getStartValue())->isNullValue()) {
-
- // Use the phi node with the widest type as induction. Use the last
- // one if there are multiple (no good reason for doing this other
- // than it is expedient). We've checked that it begins at zero and
- // steps by one, so this is a canonical induction variable.
- if (!PrimaryInduction || PhiTy == WidestIndTy)
- PrimaryInduction = Phi;
- }
-
- // Both the PHI node itself, and the "post-increment" value feeding
- // back into the PHI node may have external users.
- // We can allow those uses, except if the SCEVs we have for them rely
- // on predicates that only hold within the loop, since allowing the exit
- // currently means re-using this SCEV outside the loop (see PR33706 for more
- // details).
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(Phi);
- AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
- }
-
- LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
-}
-
-bool LoopVectorizationLegality::setupOuterLoopInductions() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Returns true if a given Phi is a supported induction.
- auto isSupportedPhi = [&](PHINode &Phi) -> bool {
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
- ID.getKind() == InductionDescriptor::IK_IntInduction) {
- addInductionPhi(&Phi, ID, AllowedExit);
- return true;
- } else {
- // Bail out for any Phi in the outer loop header that is not a supported
- // induction.
- LLVM_DEBUG(
- dbgs()
- << "LV: Found unsupported PHI for outer loop vectorization.\n");
- return false;
- }
- };
-
- if (llvm::all_of(Header->phis(), isSupportedPhi))
- return true;
- else
- return false;
-}
-
-/// Checks if a function is scalarizable according to the TLI, in
-/// the sense that it should be vectorized and then expanded in
- /// multiple scalar calls. This is represented in the
-/// TLI via mappings that do not specify a vector name, as in the
-/// following example:
-///
-/// const VecDesc VecIntrinsics[] = {
-/// {"llvm.phx.abs.i32", "", 4}
-/// };
-static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
- const StringRef ScalarName = CI.getCalledFunction()->getName();
- bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
- // Check that all known VFs are not associated to a vector
- // function, i.e. the vector name is empty.
- if (Scalarize)
- for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
- VF <= WidestVF; VF *= 2) {
- Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
- }
- return Scalarize;
-}
-
-bool LoopVectorizationLegality::canVectorizeInstrs() {
- BasicBlock *Header = TheLoop->getHeader();
-
- // Look for the attribute signaling the absence of NaNs.
- Function &F = *Header->getParent();
- HasFunNoNaNAttr =
- F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
-
- // For each block in the loop.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // Scan the instructions in the block and look for hazards.
- for (Instruction &I : *BB) {
- if (auto *Phi = dyn_cast<PHINode>(&I)) {
- Type *PhiTy = Phi->getType();
- // Check that this PHI type is allowed.
- if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
- !PhiTy->isPointerTy()) {
- reportVectorizationFailure("Found a non-int non-pointer PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- return false;
- }
-
- // If this PHINode is not in the header block, then we know that we
- // can convert it to select during if-conversion. No need to check if
- // the PHIs in this block are induction or reduction variables.
- if (BB != Header) {
- // Non-header phi nodes that have outside uses can be vectorized. Add
- // them to the list of allowed exits.
- // Unsafe cyclic dependencies with header phis are identified during
- // legalization for reduction, induction and first order
- // recurrences.
- AllowedExit.insert(&I);
- continue;
- }
-
- // We only allow if-converted PHIs with exactly two incoming values.
- if (Phi->getNumIncomingValues() != 2) {
- reportVectorizationFailure("Found an invalid PHI",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop, Phi);
- return false;
- }
-
- RecurrenceDescriptor RedDes;
- if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
- DT)) {
- if (RedDes.hasUnsafeAlgebra())
- Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
- AllowedExit.insert(RedDes.getLoopExitInstr());
- Reductions[Phi] = RedDes;
- continue;
- }
-
- // TODO: Instead of recording the AllowedExit, it would be good to record the
- // complementary set: NotAllowedExit. These include (but may not be
- // limited to):
- // 1. Reduction phis as they represent the one-before-last value, which
- // is not available when vectorized
- // 2. Induction phis and increment when SCEV predicates cannot be used
- // outside the loop - see addInductionPhi
- // 3. Non-Phis with outside uses when SCEV predicates cannot be used
- // outside the loop - see call to hasOutsideLoopUser in the non-phi
- // handling below
- // 4. FirstOrderRecurrence phis that can possibly be handled by
- // extraction.
- // By recording these, we can then reason about ways to vectorize each
- // of these NotAllowedExit.
- InductionDescriptor ID;
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
- addInductionPhi(Phi, ID, AllowedExit);
- if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
- Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
- continue;
- }
-
- if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
- SinkAfter, DT)) {
- AllowedExit.insert(Phi);
- FirstOrderRecurrences.insert(Phi);
- continue;
- }
-
- // As a last resort, coerce the PHI to an AddRec expression
- // and re-try classifying it as an induction PHI.
- if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
- addInductionPhi(Phi, ID, AllowedExit);
- continue;
- }
-
- reportVectorizationFailure("Found an unidentified PHI",
- "value that could not be identified as "
- "reduction is used outside the loop",
- "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
- return false;
- } // end of PHI handling
-
- // We handle calls that:
- // * Are debug info intrinsics.
- // * Have a mapping to an IR intrinsic.
- // * Have a vector version available.
- auto *CI = dyn_cast<CallInst>(&I);
-
- if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
- !isa<DbgInfoIntrinsic>(CI) &&
- !(CI->getCalledFunction() && TLI &&
- (!VFDatabase::getMappings(*CI).empty() ||
- isTLIScalarize(*TLI, *CI)))) {
- // If the call is a recognized math library call, it is likely that
- // we can vectorize it given loosened floating-point constraints.
- LibFunc Func;
- bool IsMathLibCall =
- TLI && CI->getCalledFunction() &&
- CI->getType()->isFloatingPointTy() &&
- TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
- TLI->hasOptimizedCodeGen(Func);
-
- if (IsMathLibCall) {
- // TODO: Ideally, we should not use clang-specific language here,
- // but it's hard to provide meaningful yet generic advice.
- // Also, should this be guarded by allowExtraAnalysis() and/or be part
- // of the returned info from isFunctionVectorizable()?
- reportVectorizationFailure(
- "Found a non-intrinsic callsite",
- "library call cannot be vectorized. "
- "Try compiling with -fno-math-errno, -ffast-math, "
- "or similar flags",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- } else {
- reportVectorizationFailure("Found a non-intrinsic callsite",
- "call instruction cannot be vectorized",
- "CantVectorizeLibcall", ORE, TheLoop, CI);
- }
- return false;
- }
-
- // Some intrinsics have scalar arguments and should be the same in order for
- // them to be vectorized (i.e. loop invariant).
- if (CI) {
- auto *SE = PSE.getSE();
- Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
- if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
- if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
- reportVectorizationFailure("Found unvectorizable intrinsic",
- "intrinsic instruction cannot be vectorized",
- "CantVectorizeIntrinsic", ORE, TheLoop, CI);
- return false;
- }
- }
- }
-
- // Check that the instruction return type is vectorizable.
- // Also, we can't vectorize extractelement instructions.
- if ((!VectorType::isValidElementType(I.getType()) &&
- !I.getType()->isVoidTy()) ||
- isa<ExtractElementInst>(I)) {
- reportVectorizationFailure("Found unvectorizable type",
- "instruction return type cannot be vectorized",
- "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
- return false;
- }
-
- // Check that the stored type is vectorizable.
- if (auto *ST = dyn_cast<StoreInst>(&I)) {
- Type *T = ST->getValueOperand()->getType();
- if (!VectorType::isValidElementType(T)) {
- reportVectorizationFailure("Store instruction cannot be vectorized",
- "store instruction cannot be vectorized",
- "CantVectorizeStore", ORE, TheLoop, ST);
- return false;
- }
-
- // For nontemporal stores, check that a nontemporal vector version is
- // supported on the target.
- if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
- // Arbitrarily try a vector of 2 elements.
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check whether the BB terminator is a BranchInst. Any other terminator is
+ // not supported yet.
+ auto *Br = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!Br) {
+ reportVectorizationFailure("Unsupported basic block terminator",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether the BranchInst is a supported one. Only unconditional
+ // branches, conditional branches with an outer loop invariant condition or
+ // backedges are supported.
+ // FIXME: We skip these checks when VPlan predication is enabled as we
+ // want to allow divergent branches. This whole check will be removed
+ // once VPlan predication is on by default.
+ if (!EnableVPlanPredication && Br && Br->isConditional() &&
+ !TheLoop->isLoopInvariant(Br->getCondition()) &&
+ !LI->isLoopHeader(Br->getSuccessor(0)) &&
+ !LI->isLoopHeader(Br->getSuccessor(1))) {
+ reportVectorizationFailure("Unsupported conditional branch",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ }
+
+ // Check whether inner loops are uniform. At this point, we only support
+ // simple outer-loop scenarios with uniform nested loops.
+ if (!isUniformLoopNest(TheLoop /*loop nest*/,
+ TheLoop /*context outer loop*/)) {
+ reportVectorizationFailure("Outer loop contains divergent loops",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check whether we are able to set up outer loop induction.
+ if (!setupOuterLoopInductions()) {
+ reportVectorizationFailure("Unsupported outer loop Phi(s)",
+ "Unsupported outer loop Phi(s)",
+ "UnsupportedPhi", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
+
+void LoopVectorizationLegality::addInductionPhi(
+ PHINode *Phi, const InductionDescriptor &ID,
+ SmallPtrSetImpl<Value *> &AllowedExit) {
+ Inductions[Phi] = ID;
+
+ // In case this induction also comes with casts that we know we can ignore
+ // in the vectorized loop body, record them here. All casts could be recorded
+ // here for ignoring, but it suffices to record only the first (as it is the
+ // only one that may be used outside the cast sequence).
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (!Casts.empty())
+ InductionCastsToIgnore.insert(*Casts.begin());
+
+ Type *PhiTy = Phi->getType();
+ const DataLayout &DL = Phi->getModule()->getDataLayout();
+
+ // Get the widest type.
+ if (!PhiTy->isFloatingPointTy()) {
+ if (!WidestIndTy)
+ WidestIndTy = convertPointerToIntegerType(DL, PhiTy);
+ else
+ WidestIndTy = getWiderType(DL, PhiTy, WidestIndTy);
+ }
+
+ // Int inductions are special because we only allow one IV.
+ if (ID.getKind() == InductionDescriptor::IK_IntInduction &&
+ ID.getConstIntStepValue() && ID.getConstIntStepValue()->isOne() &&
+ isa<Constant>(ID.getStartValue()) &&
+ cast<Constant>(ID.getStartValue())->isNullValue()) {
+
+ // Use the phi node with the widest type as induction. Use the last
+ // one if there are multiple (no good reason for doing this other
+ // than it is expedient). We've checked that it begins at zero and
+ // steps by one, so this is a canonical induction variable.
+ if (!PrimaryInduction || PhiTy == WidestIndTy)
+ PrimaryInduction = Phi;
+ }
+
+ // Both the PHI node itself, and the "post-increment" value feeding
+ // back into the PHI node may have external users.
+ // We can allow those uses, except if the SCEVs we have for them rely
+ // on predicates that only hold within the loop, since allowing the exit
+ // currently means re-using this SCEV outside the loop (see PR33706 for more
+ // details).
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(Phi);
+ AllowedExit.insert(Phi->getIncomingValueForBlock(TheLoop->getLoopLatch()));
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Found an induction variable.\n");
+}
+
+bool LoopVectorizationLegality::setupOuterLoopInductions() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Returns true if a given Phi is a supported induction.
+ auto isSupportedPhi = [&](PHINode &Phi) -> bool {
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(&Phi, TheLoop, PSE, ID) &&
+ ID.getKind() == InductionDescriptor::IK_IntInduction) {
+ addInductionPhi(&Phi, ID, AllowedExit);
+ return true;
+ } else {
+ // Bail out for any Phi in the outer loop header that is not a supported
+ // induction.
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Found unsupported PHI for outer loop vectorization.\n");
+ return false;
+ }
+ };
+
+ if (llvm::all_of(Header->phis(), isSupportedPhi))
+ return true;
+ else
+ return false;
+}
+
+/// Checks if a function is scalarizable according to the TLI, in
+/// the sense that it should be vectorized and then expanded in
+ /// multiple scalar calls. This is represented in the
+/// TLI via mappings that do not specify a vector name, as in the
+/// following example:
+///
+/// const VecDesc VecIntrinsics[] = {
+/// {"llvm.phx.abs.i32", "", 4}
+/// };
+static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
+ const StringRef ScalarName = CI.getCalledFunction()->getName();
+ bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
+ // Check that all known VFs are not associated to a vector
+ // function, i.e. the vector name is empty.
+ if (Scalarize)
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
+ VF <= WidestVF; VF *= 2) {
+ Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
+ }
+ return Scalarize;
+}
+
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *Header = TheLoop->getHeader();
+
+ // Look for the attribute signaling the absence of NaNs.
+ Function &F = *Header->getParent();
+ HasFunNoNaNAttr =
+ F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true";
+
+ // For each block in the loop.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // Scan the instructions in the block and look for hazards.
+ for (Instruction &I : *BB) {
+ if (auto *Phi = dyn_cast<PHINode>(&I)) {
+ Type *PhiTy = Phi->getType();
+ // Check that this PHI type is allowed.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isFloatingPointTy() &&
+ !PhiTy->isPointerTy()) {
+ reportVectorizationFailure("Found a non-int non-pointer PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ return false;
+ }
+
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (BB != Header) {
+ // Non-header phi nodes that have outside uses can be vectorized. Add
+ // them to the list of allowed exits.
+ // Unsafe cyclic dependencies with header phis are identified during
+ // legalization for reduction, induction and first order
+ // recurrences.
+ AllowedExit.insert(&I);
+ continue;
+ }
+
+ // We only allow if-converted PHIs with exactly two incoming values.
+ if (Phi->getNumIncomingValues() != 2) {
+ reportVectorizationFailure("Found an invalid PHI",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop, Phi);
+ return false;
+ }
+
+ RecurrenceDescriptor RedDes;
+ if (RecurrenceDescriptor::isReductionPHI(Phi, TheLoop, RedDes, DB, AC,
+ DT)) {
+ if (RedDes.hasUnsafeAlgebra())
+ Requirements->addUnsafeAlgebraInst(RedDes.getUnsafeAlgebraInst());
+ AllowedExit.insert(RedDes.getLoopExitInstr());
+ Reductions[Phi] = RedDes;
+ continue;
+ }
+
+ // TODO: Instead of recording the AllowedExit, it would be good to record the
+ // complementary set: NotAllowedExit. These include (but may not be
+ // limited to):
+ // 1. Reduction phis as they represent the one-before-last value, which
+ // is not available when vectorized
+ // 2. Induction phis and increment when SCEV predicates cannot be used
+ // outside the loop - see addInductionPhi
+ // 3. Non-Phis with outside uses when SCEV predicates cannot be used
+ // outside the loop - see call to hasOutsideLoopUser in the non-phi
+ // handling below
+ // 4. FirstOrderRecurrence phis that can possibly be handled by
+ // extraction.
+ // By recording these, we can then reason about ways to vectorize each
+ // of these NotAllowedExit.
+ InductionDescriptor ID;
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ if (ID.hasUnsafeAlgebra() && !HasFunNoNaNAttr)
+ Requirements->addUnsafeAlgebraInst(ID.getUnsafeAlgebraInst());
+ continue;
+ }
+
+ if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
+ SinkAfter, DT)) {
+ AllowedExit.insert(Phi);
+ FirstOrderRecurrences.insert(Phi);
+ continue;
+ }
+
+ // As a last resort, coerce the PHI to an AddRec expression
+ // and re-try classifying it as an induction PHI.
+ if (InductionDescriptor::isInductionPHI(Phi, TheLoop, PSE, ID, true)) {
+ addInductionPhi(Phi, ID, AllowedExit);
+ continue;
+ }
+
+ reportVectorizationFailure("Found an unidentified PHI",
+ "value that could not be identified as "
+ "reduction is used outside the loop",
+ "NonReductionValueUsedOutsideLoop", ORE, TheLoop, Phi);
+ return false;
+ } // end of PHI handling
+
+ // We handle calls that:
+ // * Are debug info intrinsics.
+ // * Have a mapping to an IR intrinsic.
+ // * Have a vector version available.
+ auto *CI = dyn_cast<CallInst>(&I);
+
+ if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
+ !isa<DbgInfoIntrinsic>(CI) &&
+ !(CI->getCalledFunction() && TLI &&
+ (!VFDatabase::getMappings(*CI).empty() ||
+ isTLIScalarize(*TLI, *CI)))) {
+ // If the call is a recognized math library call, it is likely that
+ // we can vectorize it given loosened floating-point constraints.
+ LibFunc Func;
+ bool IsMathLibCall =
+ TLI && CI->getCalledFunction() &&
+ CI->getType()->isFloatingPointTy() &&
+ TLI->getLibFunc(CI->getCalledFunction()->getName(), Func) &&
+ TLI->hasOptimizedCodeGen(Func);
+
+ if (IsMathLibCall) {
+ // TODO: Ideally, we should not use clang-specific language here,
+ // but it's hard to provide meaningful yet generic advice.
+ // Also, should this be guarded by allowExtraAnalysis() and/or be part
+ // of the returned info from isFunctionVectorizable()?
+ reportVectorizationFailure(
+ "Found a non-intrinsic callsite",
+ "library call cannot be vectorized. "
+ "Try compiling with -fno-math-errno, -ffast-math, "
+ "or similar flags",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ } else {
+ reportVectorizationFailure("Found a non-intrinsic callsite",
+ "call instruction cannot be vectorized",
+ "CantVectorizeLibcall", ORE, TheLoop, CI);
+ }
+ return false;
+ }
+
+ // Some intrinsics have scalar arguments and should be the same in order for
+ // them to be vectorized (i.e. loop invariant).
+ if (CI) {
+ auto *SE = PSE.getSE();
+ Intrinsic::ID IntrinID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
+ if (hasVectorInstrinsicScalarOpd(IntrinID, i)) {
+ if (!SE->isLoopInvariant(PSE.getSCEV(CI->getOperand(i)), TheLoop)) {
+ reportVectorizationFailure("Found unvectorizable intrinsic",
+ "intrinsic instruction cannot be vectorized",
+ "CantVectorizeIntrinsic", ORE, TheLoop, CI);
+ return false;
+ }
+ }
+ }
+
+ // Check that the instruction return type is vectorizable.
+ // Also, we can't vectorize extractelement instructions.
+ if ((!VectorType::isValidElementType(I.getType()) &&
+ !I.getType()->isVoidTy()) ||
+ isa<ExtractElementInst>(I)) {
+ reportVectorizationFailure("Found unvectorizable type",
+ "instruction return type cannot be vectorized",
+ "CantVectorizeInstructionReturnType", ORE, TheLoop, &I);
+ return false;
+ }
+
+ // Check that the stored type is vectorizable.
+ if (auto *ST = dyn_cast<StoreInst>(&I)) {
+ Type *T = ST->getValueOperand()->getType();
+ if (!VectorType::isValidElementType(T)) {
+ reportVectorizationFailure("Store instruction cannot be vectorized",
+ "store instruction cannot be vectorized",
+ "CantVectorizeStore", ORE, TheLoop, ST);
+ return false;
+ }
+
+ // For nontemporal stores, check that a nontemporal vector version is
+ // supported on the target.
+ if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
+ // Arbitrarily try a vector of 2 elements.
auto *VecTy = FixedVectorType::get(T, /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of stored type");
- if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
- reportVectorizationFailure(
- "nontemporal store instruction cannot be vectorized",
- "nontemporal store instruction cannot be vectorized",
- "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
- return false;
- }
- }
-
- } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
- if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
- // For nontemporal loads, check that a nontemporal vector version is
- // supported on the target (arbitrarily try a vector of 2 elements).
+ assert(VecTy && "did not find vectorized version of stored type");
+ if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal store instruction cannot be vectorized",
+ "nontemporal store instruction cannot be vectorized",
+ "CantVectorizeNontemporalStore", ORE, TheLoop, ST);
+ return false;
+ }
+ }
+
+ } else if (auto *LD = dyn_cast<LoadInst>(&I)) {
+ if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
+ // For nontemporal loads, check that a nontemporal vector version is
+ // supported on the target (arbitrarily try a vector of 2 elements).
auto *VecTy = FixedVectorType::get(I.getType(), /*NumElts=*/2);
- assert(VecTy && "did not find vectorized version of load type");
- if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
- reportVectorizationFailure(
- "nontemporal load instruction cannot be vectorized",
- "nontemporal load instruction cannot be vectorized",
- "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
- return false;
- }
- }
-
- // FP instructions can allow unsafe algebra, thus vectorizable by
- // non-IEEE-754 compliant SIMD units.
- // This applies to floating-point math operations and calls, not memory
- // operations, shuffles, or casts, as they don't change precision or
- // semantics.
- } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
- !I.isFast()) {
- LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
- Hints->setPotentiallyUnsafe();
- }
-
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
- // We can safely vectorize loops where instructions within the loop are
- // used outside the loop only if the SCEV predicates within the loop are
- // the same as outside the loop. Allowing the exit means reusing the SCEV
- // outside the loop.
- if (PSE.getUnionPredicate().isAlwaysTrue()) {
- AllowedExit.insert(&I);
- continue;
- }
- reportVectorizationFailure("Value cannot be used outside the loop",
- "value cannot be used outside the loop",
- "ValueUsedOutsideLoop", ORE, TheLoop, &I);
- return false;
- }
- } // next instr.
- }
-
- if (!PrimaryInduction) {
- if (Inductions.empty()) {
- reportVectorizationFailure("Did not find one integer induction var",
- "loop induction variable could not be identified",
- "NoInductionVariable", ORE, TheLoop);
- return false;
- } else if (!WidestIndTy) {
- reportVectorizationFailure("Did not find one integer induction var",
- "integer loop induction variable could not be identified",
- "NoIntegerInductionVariable", ORE, TheLoop);
- return false;
- } else {
- LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
- }
- }
-
- // For first order recurrences, we use the previous value (incoming value from
- // the latch) to check if it dominates all users of the recurrence. Bail out
- // if we have to sink such an instruction for another recurrence, as the
- // dominance requirement may not hold after sinking.
- BasicBlock *LoopLatch = TheLoop->getLoopLatch();
- if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
- Instruction *V =
- cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
- return SinkAfter.find(V) != SinkAfter.end();
- }))
- return false;
-
- // Now we know the widest induction type, check if our found induction
- // is the same size. If it's not, unset it here and InnerLoopVectorizer
- // will create another.
- if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
- PrimaryInduction = nullptr;
-
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeMemory() {
- LAI = &(*GetLAA)(*TheLoop);
- const OptimizationRemarkAnalysis *LAR = LAI->getReport();
- if (LAR) {
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
- "loop not vectorized: ", *LAR);
- });
- }
- if (!LAI->canVectorizeMemory())
- return false;
-
- if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
- reportVectorizationFailure("Stores to a uniform address",
- "write to a loop invariant address could not be vectorized",
- "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
- return false;
- }
- Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
- PSE.addPredicate(LAI->getPSE().getUnionPredicate());
-
- return true;
-}
-
-bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
- Value *In0 = const_cast<Value *>(V);
- PHINode *PN = dyn_cast_or_null<PHINode>(In0);
- if (!PN)
- return false;
-
- return Inductions.count(PN);
-}
-
-bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
- auto *Inst = dyn_cast<Instruction>(V);
- return (Inst && InductionCastsToIgnore.count(Inst));
-}
-
-bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
- return isInductionPhi(V) || isCastedInductionVariable(V);
-}
-
-bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
- return FirstOrderRecurrences.count(Phi);
-}
-
-bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
- return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
-}
-
-bool LoopVectorizationLegality::blockCanBePredicated(
+ assert(VecTy && "did not find vectorized version of load type");
+ if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
+ reportVectorizationFailure(
+ "nontemporal load instruction cannot be vectorized",
+ "nontemporal load instruction cannot be vectorized",
+ "CantVectorizeNontemporalLoad", ORE, TheLoop, LD);
+ return false;
+ }
+ }
+
+ // FP instructions can allow unsafe algebra, so they can be vectorized by
+ // non-IEEE-754-compliant SIMD units.
+ // This applies to floating-point math operations and calls, not memory
+ // operations, shuffles, or casts, as they don't change precision or
+ // semantics.
+ } else if (I.getType()->isFloatingPointTy() && (CI || I.isBinaryOp()) &&
+ !I.isFast()) {
+ LLVM_DEBUG(dbgs() << "LV: Found FP op with unsafe algebra.\n");
+ Hints->setPotentiallyUnsafe();
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (hasOutsideLoopUser(TheLoop, &I, AllowedExit)) {
+ // We can safely vectorize loops where instructions within the loop are
+ // used outside the loop only if the SCEV predicates within the loop are
+ // the same as outside the loop. Allowing the exit means reusing the SCEV
+ // outside the loop.
+ if (PSE.getUnionPredicate().isAlwaysTrue()) {
+ AllowedExit.insert(&I);
+ continue;
+ }
+ reportVectorizationFailure("Value cannot be used outside the loop",
+ "value cannot be used outside the loop",
+ "ValueUsedOutsideLoop", ORE, TheLoop, &I);
+ return false;
+ }
+ } // next instr.
+ }
+
+ if (!PrimaryInduction) {
+ if (Inductions.empty()) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "loop induction variable could not be identified",
+ "NoInductionVariable", ORE, TheLoop);
+ return false;
+ } else if (!WidestIndTy) {
+ reportVectorizationFailure("Did not find one integer induction var",
+ "integer loop induction variable could not be identified",
+ "NoIntegerInductionVariable", ORE, TheLoop);
+ return false;
+ } else {
+ LLVM_DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ }
+ }
+
+ // For first order recurrences, we use the previous value (incoming value from
+ // the latch) to check if it dominates all users of the recurrence. Bail out
+ // if we have to sink such an instruction for another recurrence, as the
+ // dominance requirement may not hold after sinking.
+ BasicBlock *LoopLatch = TheLoop->getLoopLatch();
+ if (any_of(FirstOrderRecurrences, [LoopLatch, this](const PHINode *Phi) {
+ Instruction *V =
+ cast<Instruction>(Phi->getIncomingValueForBlock(LoopLatch));
+ return SinkAfter.find(V) != SinkAfter.end();
+ }))
+ return false;
+
+ // Now we know the widest induction type, check if our found induction
+ // is the same size. If it's not, unset it here and InnerLoopVectorizer
+ // will create another.
+ if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+ PrimaryInduction = nullptr;
+
+ return true;
+}
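
The nontemporal checks above only ask whether the target has some nontemporal vector form for the accessed type; a 2-element vector is an arbitrary representative width. A minimal standalone sketch of that probe follows, with TargetQueries as a hypothetical stand-in for the TargetTransformInfo hooks (not the LLVM API) and a placeholder alignment policy.

// Hypothetical stand-in for TTI->isLegalNTStore / TTI->isLegalNTLoad.
struct TargetQueries {
  // Placeholder policy: treat a nontemporal vector access as legal when the
  // scalar alignment covers the whole probed vector.
  bool isLegalNTStore(unsigned NumElts, unsigned ElemBytes, unsigned AlignBytes) const {
    return AlignBytes >= NumElts * ElemBytes;
  }
  bool isLegalNTLoad(unsigned NumElts, unsigned ElemBytes, unsigned AlignBytes) const {
    return AlignBytes >= NumElts * ElemBytes;
  }
};

// Mirrors the probe above: reject the loop if a nontemporal scalar access has
// no nontemporal vector counterpart; 2 elements is the arbitrary probe width.
bool nontemporalAccessVectorizable(const TargetQueries &TQ, bool IsStore,
                                   unsigned ElemBytes, unsigned AlignBytes) {
  const unsigned ProbeElts = 2;
  return IsStore ? TQ.isLegalNTStore(ProbeElts, ElemBytes, AlignBytes)
                 : TQ.isLegalNTLoad(ProbeElts, ElemBytes, AlignBytes);
}
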
+
+bool LoopVectorizationLegality::canVectorizeMemory() {
+ LAI = &(*GetLAA)(*TheLoop);
+ const OptimizationRemarkAnalysis *LAR = LAI->getReport();
+ if (LAR) {
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(Hints->vectorizeAnalysisPassName(),
+ "loop not vectorized: ", *LAR);
+ });
+ }
+ if (!LAI->canVectorizeMemory())
+ return false;
+
+ if (LAI->hasDependenceInvolvingLoopInvariantAddress()) {
+ reportVectorizationFailure("Stores to a uniform address",
+ "write to a loop invariant address could not be vectorized",
+ "CantVectorizeStoreToLoopInvariantAddress", ORE, TheLoop);
+ return false;
+ }
+ Requirements->addRuntimePointerChecks(LAI->getNumRuntimePointerChecks());
+ PSE.addPredicate(LAI->getPSE().getUnionPredicate());
+
+ return true;
+}
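
canVectorizeMemory above mostly forwards the verdict of LoopAccessInfo: its report (if any) is re-emitted as an optimization remark, a dependence involving a loop-invariant store address is a hard reject, and the runtime pointer checks it collected are added to this pass's requirements. A schematic of that gating, with MemoryAnalysisResult as a hypothetical stand-in for LoopAccessInfo:

#include <string>
#include <vector>

// Hypothetical stand-ins for LoopAccessInfo and the pass's requirements.
struct MemoryAnalysisResult {
  bool CanVectorizeMemory = false;
  bool HasStoreToLoopInvariantAddress = false;
  unsigned NumRuntimePointerChecks = 0;
  std::string Report;   // optional analysis remark text
};

struct VectorizationRequirements {
  unsigned RuntimePointerChecks = 0;
};

// Mirrors the flow above: surface the analysis report, reject on the unsafe
// dependence, otherwise record how many runtime checks the caller must emit.
bool memoryLegalityOk(const MemoryAnalysisResult &LAI,
                      VectorizationRequirements &Req,
                      std::vector<std::string> &Remarks) {
  if (!LAI.Report.empty())
    Remarks.push_back("loop not vectorized: " + LAI.Report);
  if (!LAI.CanVectorizeMemory)
    return false;
  if (LAI.HasStoreToLoopInvariantAddress)
    return false;   // write to a loop-invariant address could not be vectorized
  Req.RuntimePointerChecks += LAI.NumRuntimePointerChecks;
  return true;
}
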
+
+bool LoopVectorizationLegality::isInductionPhi(const Value *V) {
+ Value *In0 = const_cast<Value *>(V);
+ PHINode *PN = dyn_cast_or_null<PHINode>(In0);
+ if (!PN)
+ return false;
+
+ return Inductions.count(PN);
+}
+
+bool LoopVectorizationLegality::isCastedInductionVariable(const Value *V) {
+ auto *Inst = dyn_cast<Instruction>(V);
+ return (Inst && InductionCastsToIgnore.count(Inst));
+}
+
+bool LoopVectorizationLegality::isInductionVariable(const Value *V) {
+ return isInductionPhi(V) || isCastedInductionVariable(V);
+}
+
+bool LoopVectorizationLegality::isFirstOrderRecurrence(const PHINode *Phi) {
+ return FirstOrderRecurrences.count(Phi);
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ return LoopAccessInfo::blockNeedsPredication(BB, TheLoop, DT);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(
BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
SmallPtrSetImpl<const Instruction *> &MaskedOp,
SmallPtrSetImpl<Instruction *> &ConditionalAssumes) const {
- for (Instruction &I : *BB) {
- // Check that we don't have a constant expression that can trap as operand.
- for (Value *Operand : I.operands()) {
- if (auto *C = dyn_cast<Constant>(Operand))
- if (C->canTrap())
- return false;
- }
-
- // We can predicate blocks with calls to assume, as long as we drop them in
- // case we flatten the CFG via predication.
- if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
- ConditionalAssumes.insert(&I);
- continue;
- }
-
+ for (Instruction &I : *BB) {
+ // Check that we don't have a constant expression that can trap as operand.
+ for (Value *Operand : I.operands()) {
+ if (auto *C = dyn_cast<Constant>(Operand))
+ if (C->canTrap())
+ return false;
+ }
+
+ // We can predicate blocks with calls to assume, as long as we drop them in
+ // case we flatten the CFG via predication.
+ if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
+ ConditionalAssumes.insert(&I);
+ continue;
+ }
+
// Do not let llvm.experimental.noalias.scope.decl block the vectorization.
// TODO: there might be cases that it should block the vectorization. Let's
// ignore those for now.
if (isa<NoAliasScopeDeclInst>(&I))
continue;
- // We might be able to hoist the load.
- if (I.mayReadFromMemory()) {
- auto *LI = dyn_cast<LoadInst>(&I);
- if (!LI)
- return false;
- if (!SafePtrs.count(LI->getPointerOperand())) {
+ // We might be able to hoist the load.
+ if (I.mayReadFromMemory()) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ if (!LI)
+ return false;
+ if (!SafePtrs.count(LI->getPointerOperand())) {
MaskedOp.insert(LI);
- continue;
- }
- }
-
- if (I.mayWriteToMemory()) {
- auto *SI = dyn_cast<StoreInst>(&I);
- if (!SI)
- return false;
- // Predicated store requires some form of masking:
- // 1) masked store HW instruction,
- // 2) emulation via load-blend-store (only if safe and legal to do so,
- // be aware of the race conditions), or
- // 3) element-by-element predicate check and scalar store.
- MaskedOp.insert(SI);
- continue;
- }
- if (I.mayThrow())
- return false;
- }
-
- return true;
-}
-
-bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
- if (!EnableIfConversion) {
- reportVectorizationFailure("If-conversion is disabled",
- "if-conversion is disabled",
- "IfConversionDisabled",
- ORE, TheLoop);
- return false;
- }
-
- assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
-
- // A list of pointers which are known to be dereferenceable within scope of
- // the loop body for each iteration of the loop which executes. That is,
- // the memory pointed to can be dereferenced (with the access size implied by
- // the value's type) unconditionally within the loop header without
- // introducing a new fault.
- SmallPtrSet<Value *, 8> SafePointers;
-
- // Collect safe addresses.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB)) {
- for (Instruction &I : *BB)
- if (auto *Ptr = getLoadStorePointerOperand(&I))
- SafePointers.insert(Ptr);
- continue;
- }
-
- // For a block which requires predication, an address may be safe to access
- // in the loop w/o predication if we can prove dereferenceability facts
- // sufficient to ensure it'll never fault within the loop. For the moment,
- // we restrict this to loads; stores are more complicated due to
- // concurrency restrictions.
- ScalarEvolution &SE = *PSE.getSE();
- for (Instruction &I : *BB) {
- LoadInst *LI = dyn_cast<LoadInst>(&I);
+ continue;
+ }
+ }
+
+ if (I.mayWriteToMemory()) {
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (!SI)
+ return false;
+ // Predicated store requires some form of masking:
+ // 1) masked store HW instruction,
+ // 2) emulation via load-blend-store (only if safe and legal to do so,
+ // be aware of the race conditions), or
+ // 3) element-by-element predicate check and scalar store.
+ MaskedOp.insert(SI);
+ continue;
+ }
+ if (I.mayThrow())
+ return false;
+ }
+
+ return true;
+}
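
blockCanBePredicated reduces to a per-instruction classification: trapping constant operands reject the block, assumes are remembered so they can be dropped when the CFG is flattened, loads from pointers not proven safe and all stores are recorded as operations that will need a mask, and throwing instructions reject. A compact model of that classification (the types below are illustrative, not LLVM's; the reject cases for non-load reads and non-store writes are omitted for brevity):

// Illustrative per-instruction summary; not an LLVM type.
struct InstSummary {
  bool HasTrappingConstantOperand = false;
  bool IsAssume = false;
  bool IsLoad = false;
  bool LoadPointerKnownSafe = false;
  bool IsStore = false;
  bool MayThrow = false;
};

enum class PredicationAction { Reject, Drop, Mask, Keep };

// Mirrors the decision order above for an instruction in a block that will be
// flattened under a predicate.
PredicationAction classifyForPredication(const InstSummary &I) {
  if (I.HasTrappingConstantOperand)
    return PredicationAction::Reject;
  if (I.IsAssume)
    return PredicationAction::Drop;    // assume is dropped on flattening
  if (I.IsLoad && !I.LoadPointerKnownSafe)
    return PredicationAction::Mask;    // becomes a masked load
  if (I.IsStore)
    return PredicationAction::Mask;    // every predicated store is masked
  if (I.MayThrow)
    return PredicationAction::Reject;
  return PredicationAction::Keep;
}
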
+
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion) {
+ reportVectorizationFailure("If-conversion is disabled",
+ "if-conversion is disabled",
+ "IfConversionDisabled",
+ ORE, TheLoop);
+ return false;
+ }
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+
+ // A list of pointers which are known to be dereferenceable within scope of
+ // the loop body for each iteration of the loop which executes. That is,
+ // the memory pointed to can be dereferenced (with the access size implied by
+ // the value's type) unconditionally within the loop header without
+ // introducing a new fault.
+ SmallPtrSet<Value *, 8> SafePointers;
+
+ // Collect safe addresses.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB)) {
+ for (Instruction &I : *BB)
+ if (auto *Ptr = getLoadStorePointerOperand(&I))
+ SafePointers.insert(Ptr);
+ continue;
+ }
+
+ // For a block which requires predication, an address may be safe to access
+ // in the loop w/o predication if we can prove dereferenceability facts
+ // sufficient to ensure it'll never fault within the loop. For the moment,
+ // we restrict this to loads; stores are more complicated due to
+ // concurrency restrictions.
+ ScalarEvolution &SE = *PSE.getSE();
+ for (Instruction &I : *BB) {
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
if (LI && !LI->getType()->isVectorTy() && !mustSuppressSpeculation(*LI) &&
- isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
- SafePointers.insert(LI->getPointerOperand());
- }
- }
-
- // Collect the blocks that need predication.
- BasicBlock *Header = TheLoop->getHeader();
- for (BasicBlock *BB : TheLoop->blocks()) {
- // We don't support switch statements inside loops.
- if (!isa<BranchInst>(BB->getTerminator())) {
- reportVectorizationFailure("Loop contains a switch statement",
- "loop contains a switch statement",
- "LoopContainsSwitch", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
-
- // We must be able to predicate all blocks that need to be predicated.
- if (blockNeedsPredication(BB)) {
+ isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
+ SafePointers.insert(LI->getPointerOperand());
+ }
+ }
+
+ // Collect the blocks that need predication.
+ BasicBlock *Header = TheLoop->getHeader();
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // We don't support switch statements inside loops.
+ if (!isa<BranchInst>(BB->getTerminator())) {
+ reportVectorizationFailure("Loop contains a switch statement",
+ "loop contains a switch statement",
+ "LoopContainsSwitch", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB)) {
if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
ConditionalAssumes)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
- } else if (BB != Header && !canIfConvertPHINodes(BB)) {
- reportVectorizationFailure(
- "Control flow cannot be substituted for a select",
- "control flow cannot be substituted for a select",
- "NoCFGForSelect", ORE, TheLoop,
- BB->getTerminator());
- return false;
- }
- }
-
- // We can if-convert this loop.
- return true;
-}
-
-// Helper function to canVectorizeLoopNestCFG.
-bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
- bool UseVPlanNativePath) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ } else if (BB != Header && !canIfConvertPHINodes(BB)) {
+ reportVectorizationFailure(
+ "Control flow cannot be substituted for a select",
+ "control flow cannot be substituted for a select",
+ "NoCFGForSelect", ORE, TheLoop,
+ BB->getTerminator());
+ return false;
+ }
+ }
+
+ // We can if-convert this loop.
+ return true;
+}
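
The remark "control flow cannot be substituted for a select" names the transformation that if-conversion performs when it does succeed: a guarded statement in the loop body is flattened and its result is chosen with a select. In source terms, roughly:

// Before if-conversion: the guarded store gives the loop body two blocks.
void scale_positive(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] > 0.0f)
      a[i] = b[i] * 2.0f;
}

// After if-conversion (conceptually): one straight-line body per iteration,
// with the condition folded into a select. Note that a[i] is now read and
// written unconditionally, which is exactly why predicated stores are
// recorded in MaskedOp above: the real transformation must mask the store
// (or prove the access safe) rather than execute it blindly.
void scale_positive_flat(float *a, const float *b, int n) {
  for (int i = 0; i < n; ++i) {
    float scaled = b[i] * 2.0f;
    a[i] = (b[i] > 0.0f) ? scaled : a[i];
  }
}
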
+
+// Helper function to canVectorizeLoopNestCFG.
+bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
+ bool UseVPlanNativePath) {
assert((UseVPlanNativePath || Lp->isInnermost()) &&
- "VPlan-native path is not enabled.");
-
- // TODO: ORE should be improved to show more accurate information when an
- // outer loop can't be vectorized because a nested loop is not understood or
- // legal. Something like: "outer_loop_location: loop not vectorized:
- // (inner_loop_location) loop control flow is not understood by vectorizer".
-
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
-
- // We must have a loop in canonical form. Loops with indirectbr in them cannot
- // be canonicalized.
- if (!Lp->getLoopPreheader()) {
- reportVectorizationFailure("Loop doesn't have a legal pre-header",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We must have a single backedge.
- if (Lp->getNumBackEdges() != 1) {
- reportVectorizationFailure("The loop must have a single backedge",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
+ "VPlan-native path is not enabled.");
+
+ // TODO: ORE should be improved to show more accurate information when an
+ // outer loop can't be vectorized because a nested loop is not understood or
+ // legal. Something like: "outer_loop_location: loop not vectorized:
+ // (inner_loop_location) loop control flow is not understood by vectorizer".
+
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+
+ // We must have a loop in canonical form. Loops with indirectbr in them cannot
+ // be canonicalized.
+ if (!Lp->getLoopPreheader()) {
+ reportVectorizationFailure("Loop doesn't have a legal pre-header",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We must have a single backedge.
+ if (Lp->getNumBackEdges() != 1) {
+ reportVectorizationFailure("The loop must have a single backedge",
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
// We currently must have a single "exit block" after the loop. Note that
// multiple "exiting blocks" inside the loop are allowed, provided they all
// reach the single exit block.
@@ -1102,186 +1102,186 @@ bool LoopVectorizationLegality::canVectorizeLoopCFG(Loop *Lp,
// update logic in a number of places.
if (!Lp->getUniqueExitBlock()) {
reportVectorizationFailure("The loop must have a unique exit block",
- "loop control flow is not understood by vectorizer",
- "CFGNotUnderstood", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
- return Result;
-}
-
-bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
- Loop *Lp, bool UseVPlanNativePath) {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Recursively check whether the loop control flow of nested loops is
- // understood.
- for (Loop *SubLp : *Lp)
- if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- return Result;
-}
-
-bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
- // Store the result and return it at the end instead of exiting early, in case
- // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
- bool Result = true;
-
- bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
- // Check whether the loop-related control flow in the loop nest is expected by
- // vectorizer.
- if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // We need to have a loop header.
- LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
- << '\n');
-
- // Specific checks for outer loops. We skip the remaining legal checks at this
- // point because they don't support outer loops.
+ "loop control flow is not understood by vectorizer",
+ "CFGNotUnderstood", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+ return Result;
+}
+
+bool LoopVectorizationLegality::canVectorizeLoopNestCFG(
+ Loop *Lp, bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ if (!canVectorizeLoopCFG(Lp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Recursively check whether the loop control flow of nested loops is
+ // understood.
+ for (Loop *SubLp : *Lp)
+ if (!canVectorizeLoopNestCFG(SubLp, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ return Result;
+}
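
Both CFG routines above use the same reporting discipline: when allowExtraAnalysis permits it, a failed check only clears Result and the walk continues so that every rejection reason is reported; otherwise the first failure returns immediately. A minimal sketch of that pattern:

#include <functional>
#include <vector>

// Run a sequence of legality checks. With DoExtraAnalysis the caller gets a
// diagnostic for every failing check; without it the first failure wins.
bool runChecks(const std::vector<std::function<bool()>> &Checks,
               bool DoExtraAnalysis) {
  bool Result = true;
  for (const auto &Check : Checks) {
    if (Check())
      continue;
    if (DoExtraAnalysis)
      Result = false;   // remember the failure, keep diagnosing
    else
      return false;     // fail fast
  }
  return Result;
}
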
+
+bool LoopVectorizationLegality::canVectorize(bool UseVPlanNativePath) {
+ // Store the result and return it at the end instead of exiting early, in case
+ // allowExtraAnalysis is used to report multiple reasons for not vectorizing.
+ bool Result = true;
+
+ bool DoExtraAnalysis = ORE->allowExtraAnalysis(DEBUG_TYPE);
+ // Check whether the loop-related control flow in the loop nest is expected by
+ // vectorizer.
+ if (!canVectorizeLoopNestCFG(TheLoop, UseVPlanNativePath)) {
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // We need to have a loop header.
+ LLVM_DEBUG(dbgs() << "LV: Found a loop: " << TheLoop->getHeader()->getName()
+ << '\n');
+
+ // Specific checks for outer loops. We skip the remaining legal checks at this
+ // point because they don't support outer loops.
if (!TheLoop->isInnermost()) {
- assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
-
- if (!canVectorizeOuterLoop()) {
- reportVectorizationFailure("Unsupported outer loop",
- "unsupported outer loop",
- "UnsupportedOuterLoop",
- ORE, TheLoop);
- // TODO: Implement DoExtraAnalysis when subsequent legal checks support
- // outer loops.
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
- return Result;
- }
-
+ assert(UseVPlanNativePath && "VPlan-native path is not enabled.");
+
+ if (!canVectorizeOuterLoop()) {
+ reportVectorizationFailure("Unsupported outer loop",
+ "unsupported outer loop",
+ "UnsupportedOuterLoop",
+ ORE, TheLoop);
+ // TODO: Implement DoExtraAnalysis when subsequent legal checks support
+ // outer loops.
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this outer loop!\n");
+ return Result;
+ }
+
assert(TheLoop->isInnermost() && "Inner loop expected.");
- // Check if we can if-convert non-single-bb loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
- LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Check if we can vectorize the instructions and CFG in this loop.
- if (!canVectorizeInstrs()) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Go over each instruction and look at memory deps.
- if (!canVectorizeMemory()) {
- LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
- << (LAI->getRuntimePointerChecking()->Need
- ? " (with a runtime bound check)"
- : "")
- << "!\n");
-
- unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
- if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
- SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
-
- if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
- reportVectorizationFailure("Too many SCEV checks needed",
- "Too many SCEV assumptions need to be made and checked at runtime",
- "TooManySCEVRunTimeChecks", ORE, TheLoop);
- if (DoExtraAnalysis)
- Result = false;
- else
- return false;
- }
-
- // Okay! We've done all the tests. If any have failed, return false. Otherwise
- // we can vectorize, and at this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return true with
- // no restrictions.
- return Result;
-}
-
-bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
-
- LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
-
- SmallPtrSet<const Value *, 8> ReductionLiveOuts;
-
- for (auto &Reduction : getReductionVars())
- ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
-
- // TODO: handle non-reduction outside users when tail is folded by masking.
- for (auto *AE : AllowedExit) {
- // Check that all users of allowed exit values are inside the loop or
- // are the live-out of a reduction.
- if (ReductionLiveOuts.count(AE))
- continue;
- for (User *U : AE->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (TheLoop->contains(UI))
- continue;
+ // Check if we can if-convert non-single-bb loops.
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ LLVM_DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: We can vectorize this loop"
+ << (LAI->getRuntimePointerChecking()->Need
+ ? " (with a runtime bound check)"
+ : "")
+ << "!\n");
+
+ unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
+ if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
+ SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
+
+ if (PSE.getUnionPredicate().getComplexity() > SCEVThreshold) {
+ reportVectorizationFailure("Too many SCEV checks needed",
+ "Too many SCEV assumptions need to be made and checked at runtime",
+ "TooManySCEVRunTimeChecks", ORE, TheLoop);
+ if (DoExtraAnalysis)
+ Result = false;
+ else
+ return false;
+ }
+
+ // Okay! We've done all the tests. If any have failed, return false. Otherwise
+ // we can vectorize, and at this point we don't have any other mem analysis
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return Result;
+}
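
One detail of the final gate above: the budget for SCEV runtime checks is not fixed; a loop whose vectorization is forced by pragma is allowed the larger PragmaVectorizeSCEVCheckThreshold. A tiny sketch of that selection (the numeric defaults here are assumptions; in LLVM both thresholds are cl::opt command-line options):

// Illustrative defaults only; in LLVM these correspond to the
// VectorizeSCEVCheckThreshold / PragmaVectorizeSCEVCheckThreshold options.
constexpr unsigned DefaultSCEVCheckThreshold = 16;
constexpr unsigned PragmaSCEVCheckThreshold = 128;

// A loop the user explicitly asked to vectorize may pay for more runtime
// SCEV predicate checks before being rejected.
unsigned scevCheckBudget(bool ForcedByPragma) {
  return ForcedByPragma ? PragmaSCEVCheckThreshold : DefaultSCEVCheckThreshold;
}
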
+
+bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+
+ LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
+
+ SmallPtrSet<const Value *, 8> ReductionLiveOuts;
+
+ for (auto &Reduction : getReductionVars())
+ ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
+
+ // TODO: handle non-reduction outside users when tail is folded by masking.
+ for (auto *AE : AllowedExit) {
+ // Check that all users of allowed exit values are inside the loop or
+ // are the live-out of a reduction.
+ if (ReductionLiveOuts.count(AE))
+ continue;
+ for (User *U : AE->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (TheLoop->contains(UI))
+ continue;
LLVM_DEBUG(
dbgs()
<< "LV: Cannot fold tail by masking, loop has an outside user for "
<< *UI << "\n");
- return false;
- }
- }
-
- // The list of pointers that we can safely read and write to remains empty.
- SmallPtrSet<Value *, 8> SafePointers;
-
+ return false;
+ }
+ }
+
+ // The list of pointers that we can safely read and write to remains empty.
+ SmallPtrSet<Value *, 8> SafePointers;
+
SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
- // Check and mark all blocks for predication, including those that ordinarily
- // do not need predication such as the header block.
- for (BasicBlock *BB : TheLoop->blocks()) {
+ // Check and mark all blocks for predication, including those that ordinarily
+ // do not need predication such as the header block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
TmpConditionalAssumes)) {
LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
TmpConditionalAssumes.end());
- return true;
-}
-
-} // namespace llvm
+ return true;
+}
+
+} // namespace llvm
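
Folding the tail by masking means the vector loop itself absorbs the final partial iteration: rather than branching to a scalar epilogue, the last vector step runs with the lanes past the trip count disabled. In scalar terms the effect is roughly the following (VF = 4 is chosen only for illustration):

// Conceptual model of a VF = 4 loop whose tail is folded by masking: each
// lane is guarded by "i + lane < n", which plays the role of the loop mask,
// so no scalar remainder loop is required.
void add_one_tail_folded(int *a, int n) {
  const int VF = 4;
  for (int i = 0; i < n; i += VF)
    for (int lane = 0; lane < VF; ++lane)
      if (i + lane < n)        // lane mask: lanes past n are disabled
        a[i + lane] += 1;      // masked element update
}
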
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index 1938b1f0f8..19797e6f78 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -1,278 +1,278 @@
-//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides a LoopVectorizationPlanner class.
-/// LoopVectorizationPlanner drives the vectorization process after having
-/// passed Legality checks.
-/// The planner builds and optimizes the Vectorization Plans, which record the
-/// decisions on how to vectorize the given loop. In particular, they represent
-/// the control-flow of the vectorized version, the replication of instructions
-/// that are to be scalarized, and the interleaved access groups.
-///
-/// Also provides a VPlan-based builder utility analogous to IRBuilder.
-/// It provides an instruction-level API for generating VPInstructions while
-/// abstracting away the Recipe manipulation details.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
-#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
-
-#include "VPlan.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-
-namespace llvm {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class PredicatedScalarEvolution;
+//===- LoopVectorizationPlanner.h - Planner for LoopVectorization ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides a LoopVectorizationPlanner class.
+/// LoopVectorizationPlanner drives the vectorization process after having
+/// passed Legality checks.
+/// The planner builds and optimizes the Vectorization Plans, which record the
+/// decisions on how to vectorize the given loop. In particular, they represent
+/// the control-flow of the vectorized version, the replication of instructions
+/// that are to be scalarized, and the interleaved access groups.
+///
+/// Also provides a VPlan-based builder utility analogous to IRBuilder.
+/// It provides an instruction-level API for generating VPInstructions while
+/// abstracting away the Recipe manipulation details.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+#define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+
+#include "VPlan.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class PredicatedScalarEvolution;
class VPRecipeBuilder;
-
-/// VPlan-based builder utility analogous to IRBuilder.
-class VPBuilder {
- VPBasicBlock *BB = nullptr;
- VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
-
- VPInstruction *createInstruction(unsigned Opcode,
- ArrayRef<VPValue *> Operands) {
- VPInstruction *Instr = new VPInstruction(Opcode, Operands);
- if (BB)
- BB->insert(Instr, InsertPt);
- return Instr;
- }
-
- VPInstruction *createInstruction(unsigned Opcode,
- std::initializer_list<VPValue *> Operands) {
- return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
- }
-
-public:
- VPBuilder() {}
-
- /// Clear the insertion point: created instructions will not be inserted into
- /// a block.
- void clearInsertionPoint() {
- BB = nullptr;
- InsertPt = VPBasicBlock::iterator();
- }
-
- VPBasicBlock *getInsertBlock() const { return BB; }
- VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
-
- /// InsertPoint - A saved insertion point.
- class VPInsertPoint {
- VPBasicBlock *Block = nullptr;
- VPBasicBlock::iterator Point;
-
- public:
- /// Creates a new insertion point which doesn't point to anything.
- VPInsertPoint() = default;
-
- /// Creates a new insertion point at the given location.
- VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
- : Block(InsertBlock), Point(InsertPoint) {}
-
- /// Returns true if this insert point is set.
- bool isSet() const { return Block != nullptr; }
-
- VPBasicBlock *getBlock() const { return Block; }
- VPBasicBlock::iterator getPoint() const { return Point; }
- };
-
- /// Sets the current insert point to a previously-saved location.
- void restoreIP(VPInsertPoint IP) {
- if (IP.isSet())
- setInsertPoint(IP.getBlock(), IP.getPoint());
- else
- clearInsertionPoint();
- }
-
- /// This specifies that created VPInstructions should be appended to the end
- /// of the specified block.
- void setInsertPoint(VPBasicBlock *TheBB) {
- assert(TheBB && "Attempting to set a null insert point");
- BB = TheBB;
- InsertPt = BB->end();
- }
-
- /// This specifies that created instructions should be inserted at the
- /// specified point.
- void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
- BB = TheBB;
- InsertPt = IP;
- }
-
- /// Insert and return the specified instruction.
- VPInstruction *insert(VPInstruction *I) const {
- BB->insert(I, InsertPt);
- return I;
- }
-
- /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
- /// its underlying Instruction.
- VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
- NewVPInst->setUnderlyingValue(Inst);
- return NewVPInst;
- }
- VPValue *createNaryOp(unsigned Opcode,
- std::initializer_list<VPValue *> Operands,
- Instruction *Inst = nullptr) {
- return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
- }
-
- VPValue *createNot(VPValue *Operand) {
- return createInstruction(VPInstruction::Not, {Operand});
- }
-
- VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
- }
-
- VPValue *createOr(VPValue *LHS, VPValue *RHS) {
- return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
- }
-
+
+/// VPlan-based builder utility analogous to IRBuilder.
+class VPBuilder {
+ VPBasicBlock *BB = nullptr;
+ VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ ArrayRef<VPValue *> Operands) {
+ VPInstruction *Instr = new VPInstruction(Opcode, Operands);
+ if (BB)
+ BB->insert(Instr, InsertPt);
+ return Instr;
+ }
+
+ VPInstruction *createInstruction(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands) {
+ return createInstruction(Opcode, ArrayRef<VPValue *>(Operands));
+ }
+
+public:
+ VPBuilder() {}
+
+ /// Clear the insertion point: created instructions will not be inserted into
+ /// a block.
+ void clearInsertionPoint() {
+ BB = nullptr;
+ InsertPt = VPBasicBlock::iterator();
+ }
+
+ VPBasicBlock *getInsertBlock() const { return BB; }
+ VPBasicBlock::iterator getInsertPoint() const { return InsertPt; }
+
+ /// InsertPoint - A saved insertion point.
+ class VPInsertPoint {
+ VPBasicBlock *Block = nullptr;
+ VPBasicBlock::iterator Point;
+
+ public:
+ /// Creates a new insertion point which doesn't point to anything.
+ VPInsertPoint() = default;
+
+ /// Creates a new insertion point at the given location.
+ VPInsertPoint(VPBasicBlock *InsertBlock, VPBasicBlock::iterator InsertPoint)
+ : Block(InsertBlock), Point(InsertPoint) {}
+
+ /// Returns true if this insert point is set.
+ bool isSet() const { return Block != nullptr; }
+
+ VPBasicBlock *getBlock() const { return Block; }
+ VPBasicBlock::iterator getPoint() const { return Point; }
+ };
+
+ /// Sets the current insert point to a previously-saved location.
+ void restoreIP(VPInsertPoint IP) {
+ if (IP.isSet())
+ setInsertPoint(IP.getBlock(), IP.getPoint());
+ else
+ clearInsertionPoint();
+ }
+
+ /// This specifies that created VPInstructions should be appended to the end
+ /// of the specified block.
+ void setInsertPoint(VPBasicBlock *TheBB) {
+ assert(TheBB && "Attempting to set a null insert point");
+ BB = TheBB;
+ InsertPt = BB->end();
+ }
+
+ /// This specifies that created instructions should be inserted at the
+ /// specified point.
+ void setInsertPoint(VPBasicBlock *TheBB, VPBasicBlock::iterator IP) {
+ BB = TheBB;
+ InsertPt = IP;
+ }
+
+ /// Insert and return the specified instruction.
+ VPInstruction *insert(VPInstruction *I) const {
+ BB->insert(I, InsertPt);
+ return I;
+ }
+
+ /// Create an N-ary operation with \p Opcode, \p Operands and set \p Inst as
+ /// its underlying Instruction.
+ VPValue *createNaryOp(unsigned Opcode, ArrayRef<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ VPInstruction *NewVPInst = createInstruction(Opcode, Operands);
+ NewVPInst->setUnderlyingValue(Inst);
+ return NewVPInst;
+ }
+ VPValue *createNaryOp(unsigned Opcode,
+ std::initializer_list<VPValue *> Operands,
+ Instruction *Inst = nullptr) {
+ return createNaryOp(Opcode, ArrayRef<VPValue *>(Operands), Inst);
+ }
+
+ VPValue *createNot(VPValue *Operand) {
+ return createInstruction(VPInstruction::Not, {Operand});
+ }
+
+ VPValue *createAnd(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::And, {LHS, RHS});
+ }
+
+ VPValue *createOr(VPValue *LHS, VPValue *RHS) {
+ return createInstruction(Instruction::BinaryOps::Or, {LHS, RHS});
+ }
+
VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal) {
return createNaryOp(Instruction::Select, {Cond, TrueVal, FalseVal});
}
- //===--------------------------------------------------------------------===//
- // RAII helpers.
- //===--------------------------------------------------------------------===//
-
- /// RAII object that stores the current insertion point and restores it when
- /// the object is destroyed.
- class InsertPointGuard {
- VPBuilder &Builder;
- VPBasicBlock *Block;
- VPBasicBlock::iterator Point;
-
- public:
- InsertPointGuard(VPBuilder &B)
- : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
-
- InsertPointGuard(const InsertPointGuard &) = delete;
- InsertPointGuard &operator=(const InsertPointGuard &) = delete;
-
- ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
- };
-};
-
-/// TODO: The following VectorizationFactor was pulled out of
-/// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
-/// We need to streamline them.
-
-/// Information about vectorization costs
-struct VectorizationFactor {
- // Vector width with best cost
+ //===--------------------------------------------------------------------===//
+ // RAII helpers.
+ //===--------------------------------------------------------------------===//
+
+ /// RAII object that stores the current insertion point and restores it when
+ /// the object is destroyed.
+ class InsertPointGuard {
+ VPBuilder &Builder;
+ VPBasicBlock *Block;
+ VPBasicBlock::iterator Point;
+
+ public:
+ InsertPointGuard(VPBuilder &B)
+ : Builder(B), Block(B.getInsertBlock()), Point(B.getInsertPoint()) {}
+
+ InsertPointGuard(const InsertPointGuard &) = delete;
+ InsertPointGuard &operator=(const InsertPointGuard &) = delete;
+
+ ~InsertPointGuard() { Builder.restoreIP(VPInsertPoint(Block, Point)); }
+ };
+};
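
The builder above mirrors IRBuilder: the caller parks an insertion point in a VPBasicBlock, every create* call appends there, and InsertPointGuard restores the previous point when a scope ends. The standalone toy below shows the same discipline with a vector of strings standing in for a block (names are illustrative, not the VPlan API):

#include <cassert>
#include <string>
#include <vector>

// Toy "block" and builder that imitate the insert-point discipline above.
using Block = std::vector<std::string>;

class ToyBuilder {
  Block *BB = nullptr;

public:
  void setInsertPoint(Block *TheBB) { assert(TheBB); BB = TheBB; }
  Block *getInsertBlock() const { return BB; }
  void clearInsertionPoint() { BB = nullptr; }
  void createOp(const std::string &Op) { assert(BB); BB->push_back(Op); }

  // RAII guard: remembers the current block and restores it on destruction,
  // analogous to VPBuilder::InsertPointGuard.
  class InsertPointGuard {
    ToyBuilder &B;
    Block *Saved;

  public:
    explicit InsertPointGuard(ToyBuilder &B) : B(B), Saved(B.getInsertBlock()) {}
    InsertPointGuard(const InsertPointGuard &) = delete;
    InsertPointGuard &operator=(const InsertPointGuard &) = delete;
    ~InsertPointGuard() {
      if (Saved)
        B.setInsertPoint(Saved);
      else
        B.clearInsertionPoint();
    }
  };
};

// Usage: temporarily emit into a side block, then fall back automatically.
void example() {
  Block Main, Side;
  ToyBuilder B;
  B.setInsertPoint(&Main);
  B.createOp("and");
  {
    ToyBuilder::InsertPointGuard G(B);
    B.setInsertPoint(&Side);
    B.createOp("not");
  }                     // guard restores Main here
  B.createOp("or");     // lands in Main again
}
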
+
+/// TODO: The following VectorizationFactor was pulled out of
+/// LoopVectorizationCostModel class. LV also deals with
+/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// We need to streamline them.
+
+/// Information about vectorization costs
+struct VectorizationFactor {
+ // Vector width with best cost
ElementCount Width;
- // Cost of the loop with that width
- unsigned Cost;
-
- // Width 1 means no vectorization, cost 0 means uncomputed cost.
+ // Cost of the loop with that width
+ unsigned Cost;
+
+ // Width 1 means no vectorization, cost 0 means uncomputed cost.
static VectorizationFactor Disabled() {
return {ElementCount::getFixed(1), 0};
}
-
- bool operator==(const VectorizationFactor &rhs) const {
- return Width == rhs.Width && Cost == rhs.Cost;
- }
+
+ bool operator==(const VectorizationFactor &rhs) const {
+ return Width == rhs.Width && Cost == rhs.Cost;
+ }
bool operator!=(const VectorizationFactor &rhs) const {
return !(*this == rhs);
}
-};
-
-/// Planner drives the vectorization process after having passed
-/// Legality checks.
-class LoopVectorizationPlanner {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitability analysis.
- LoopVectorizationCostModel &CM;
-
- /// The interleaved access analysis.
- InterleavedAccessInfo &IAI;
-
- PredicatedScalarEvolution &PSE;
-
- SmallVector<VPlanPtr, 4> VPlans;
-
- /// This class is used to enable the VPlan to invoke a method of ILV. This is
- /// needed until the method is refactored out of ILV and becomes reusable.
- struct VPCallbackILV : public VPCallback {
- InnerLoopVectorizer &ILV;
-
- VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
-
- Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
- Value *getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) override;
- };
-
- /// A builder used to construct the current plan.
- VPBuilder Builder;
-
+};
+
+/// Planner drives the vectorization process after having passed
+/// Legality checks.
+class LoopVectorizationPlanner {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ /// The interleaved access analysis.
+ InterleavedAccessInfo &IAI;
+
+ PredicatedScalarEvolution &PSE;
+
+ SmallVector<VPlanPtr, 4> VPlans;
+
+ /// This class is used to enable the VPlan to invoke a method of ILV. This is
+ /// needed until the method is refactored out of ILV and becomes reusable.
+ struct VPCallbackILV : public VPCallback {
+ InnerLoopVectorizer &ILV;
+
+ VPCallbackILV(InnerLoopVectorizer &ILV) : ILV(ILV) {}
+
+ Value *getOrCreateVectorValues(Value *V, unsigned Part) override;
+ Value *getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) override;
+ };
+
+ /// A builder used to construct the current plan.
+ VPBuilder Builder;
+
/// The best number of elements of the vector types used in the
/// transformed loop. BestVF = None means that vectorization is
/// disabled.
Optional<ElementCount> BestVF = None;
- unsigned BestUF = 0;
-
-public:
- LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM,
- InterleavedAccessInfo &IAI,
- PredicatedScalarEvolution &PSE)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
- PSE(PSE) {}
-
- /// Plan how to best vectorize, return the best VF and its cost, or None if
- /// vectorization and interleaving should be avoided up front.
+ unsigned BestUF = 0;
+
+public:
+ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ InterleavedAccessInfo &IAI,
+ PredicatedScalarEvolution &PSE)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
+ PSE(PSE) {}
+
+ /// Plan how to best vectorize, return the best VF and its cost, or None if
+ /// vectorization and interleaving should be avoided up front.
Optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
-
- /// Use the VPlan-native path to plan how to best vectorize, return the best
- /// VF and its cost.
+
+ /// Use the VPlan-native path to plan how to best vectorize, return the best
+ /// VF and its cost.
VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
-
- /// Finalize the best decision and dispose of all other VPlans.
+
+ /// Finalize the best decision and dispose of all other VPlans.
void setBestPlan(ElementCount VF, unsigned UF);
-
- /// Generate the IR code for the body of the vectorized loop according to the
- /// best selected VPlan.
- void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
-
- void printPlans(raw_ostream &O) {
- for (const auto &Plan : VPlans)
- O << *Plan;
- }
-
+
+ /// Generate the IR code for the body of the vectorized loop according to the
+ /// best selected VPlan.
+ void executePlan(InnerLoopVectorizer &LB, DominatorTree *DT);
+
+ void printPlans(raw_ostream &O) {
+ for (const auto &Plan : VPlans)
+ O << *Plan;
+ }
+
/// Look through the existing plans and return true if we have one with all
/// the vectorization factors in question.
bool hasPlanWithVFs(const ArrayRef<ElementCount> VFs) const {
@@ -283,39 +283,39 @@ public:
});
}
- /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
- /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
- /// returned value holds for the entire \p Range.
- static bool
+ /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
+ /// \p Predicate on Range.Start, possibly decreasing Range.End such that the
+ /// returned value holds for the entire \p Range.
+ static bool
getDecisionAndClampRange(const std::function<bool(ElementCount)> &Predicate,
- VFRange &Range);
-
-protected:
- /// Collect the instructions from the original loop that would be trivially
- /// dead in the vectorized loop if generated.
- void collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop.
+ VFRange &Range);
+
+protected:
+ /// Collect the instructions from the original loop that would be trivially
+ /// dead in the vectorized loop if generated.
+ void collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop.
void buildVPlans(ElementCount MinVF, ElementCount MaxVF);
-
-private:
- /// Build a VPlan according to the information gathered by Legal. \return a
- /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
- /// exclusive, possibly decreasing \p Range.End.
- VPlanPtr buildVPlan(VFRange &Range);
-
- /// Build a VPlan using VPRecipes according to the information gathered by
- /// Legal. This method is only used for the legacy inner loop vectorizer.
- VPlanPtr buildVPlanWithVPRecipes(
+
+private:
+ /// Build a VPlan according to the information gathered by Legal. \return a
+ /// VPlan for vectorization factors \p Range.Start and up to \p Range.End
+ /// exclusive, possibly decreasing \p Range.End.
+ VPlanPtr buildVPlan(VFRange &Range);
+
+ /// Build a VPlan using VPRecipes according to the information gather by
+ /// Legal. This method is only used for the legacy inner loop vectorizer.
+ VPlanPtr buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const DenseMap<Instruction *, Instruction *> &SinkAfter);
-
- /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
- /// according to the information gathered by Legal when it checked if it is
- /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
+ const DenseMap<Instruction *, Instruction *> &SinkAfter);
+
+ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
+ /// according to the information gathered by Legal when it checked if it is
+ /// legal to vectorize the loop. This method creates VPlans using VPRecipes.
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
/// Adjust the recipes for any inloop reductions. The chain of instructions
@@ -324,8 +324,8 @@ private:
/// reduction chain.
void adjustRecipesForInLoopReductions(VPlanPtr &Plan,
VPRecipeBuilder &RecipeBuilder);
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H
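
The public interface above implies the call order used by the driver in LoopVectorize.cpp: plan() chooses a vectorization factor, setBestPlan() commits to a factor and unroll count, and executePlan() emits the vectorized body. The fragment below only sketches that order; it is not self-contained, the surrounding construction of the planner, InnerLoopVectorizer and analyses is assumed, and the unroll count of 1 is purely illustrative.

// Sketch only: assumes LVP, ILV, DT and the user hints were set up earlier,
// as LoopVectorize.cpp does before invoking the planner.
void runPlannerOnce(llvm::LoopVectorizationPlanner &LVP,
                    llvm::InnerLoopVectorizer &ILV, llvm::DominatorTree *DT,
                    llvm::ElementCount UserVF, unsigned UserIC) {
  llvm::Optional<llvm::VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
  // Width 1 means "do not vectorize" (see VectorizationFactor::Disabled()).
  if (!MaybeVF || MaybeVF->Width == llvm::ElementCount::getFixed(1))
    return;
  LVP.setBestPlan(MaybeVF->Width, /*UF=*/1); // unroll count of 1 is illustrative
  LVP.executePlan(ILV, DT);                  // generate IR from the chosen VPlan
}
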
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
index 60048bab64..b456a97aa4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1,180 +1,180 @@
-//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR.
-// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
-// of instructions in order to estimate the profitability of vectorization.
-//
-// The loop vectorizer combines consecutive loop iterations into a single
-// 'wide' iteration. After this transformation the index is incremented
-// by the SIMD vector width, and not by one.
-//
-// This pass has four parts:
-// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A unit that checks for the legality
-// of the vectorization.
-// 3. InnerLoopVectorizer - A unit that performs the actual
-// widening of instructions.
-// 4. LoopVectorizationCostModel - A unit that checks for the profitability
-// of vectorization. It decides on the optimal vector width, which
-// can be one, if vectorization is not profitable.
-//
-// There is a development effort going on to migrate loop vectorizer to the
-// VPlan infrastructure and to introduce outer loop vectorization support (see
-// docs/Proposal/VectorizationPlan.rst and
-// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
-// purpose, we temporarily introduced the VPlan-native vectorization path: an
-// alternative vectorization path that is natively implemented on top of the
-// VPlan infrastructure. See EnableVPlanNativePath for enabling.
-//
-//===----------------------------------------------------------------------===//
-//
-// The reduction-variable vectorization is based on the paper:
-// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
-//
-// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
-//
-// The interleaved access vectorization is based on the paper:
-// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
-// Data for SIMD
-//
-// Other ideas/concepts are from:
-// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
-//
-// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
-// Vectorizing Compilers.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
-#include "LoopVectorizationPlanner.h"
-#include "VPRecipeBuilder.h"
-#include "VPlan.h"
-#include "VPlanHCFGBuilder.h"
-#include "VPlanPredicator.h"
-#include "VPlanTransforms.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/BlockFrequencyInfo.h"
-#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/MemorySSA.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/DiagnosticInfo.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
+//
+// The loop vectorizer combines consecutive loop iterations into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has four parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+// There is a development effort going on to migrate the loop vectorizer to the
+// VPlan infrastructure and to introduce outer loop vectorization support (see
+// docs/Proposal/VectorizationPlan.rst and
+// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
+// purpose, we temporarily introduced the VPlan-native vectorization path: an
+// alternative vectorization path that is natively implemented on top of the
+// VPlan infrastructure. See EnableVPlanNativePath for enabling.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// The interleaved access vectorization is based on the paper:
+// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
+// Data for SIMD
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
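As a rough illustration of the widening described above, here is a standalone sketch (not LLVM code; the function names and the fixed vector width of 4 are assumptions for the example) of a scalar loop and its conceptually widened counterpart:

#include <cstddef>

// Hypothetical scalar loop: one element per iteration.
void scaleScalar(float *A, std::size_t N, float K) {
  for (std::size_t I = 0; I < N; ++I)
    A[I] *= K;
}

// Conceptually widened form with VF = 4: the induction variable now advances
// by the vector width, and a scalar remainder handles leftover iterations,
// mirroring the epilogue loop the vectorizer emits.
void scaleWidened(float *A, std::size_t N, float K) {
  constexpr std::size_t VF = 4;
  std::size_t I = 0;
  for (; I + VF <= N; I += VF)        // vector body, shown as unrolled scalars
    for (std::size_t Lane = 0; Lane < VF; ++Lane)
      A[I + Lane] *= K;
  for (; I < N; ++I)                  // scalar epilogue / remainder loop
    A[I] *= K;
}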
+
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
+#include "LoopVectorizationPlanner.h"
+#include "VPRecipeBuilder.h"
+#include "VPlan.h"
+#include "VPlanHCFGBuilder.h"
+#include "VPlanPredicator.h"
+#include "VPlanTransforms.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/InstructionCost.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/LoopVersioning.h"
-#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/SizeOpts.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <cstdlib>
-#include <functional>
-#include <iterator>
-#include <limits>
-#include <memory>
-#include <string>
-#include <tuple>
-#include <utility>
-
-using namespace llvm;
-
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <functional>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
#ifndef NDEBUG
const char VerboseDebug[] = DEBUG_TYPE "-verbose";
#endif
-/// @{
-/// Metadata attribute names
+/// @{
+/// Metadata attribute names
const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
const char LLVMLoopVectorizeFollowupVectorized[] =
- "llvm.loop.vectorize.followup_vectorized";
+ "llvm.loop.vectorize.followup_vectorized";
const char LLVMLoopVectorizeFollowupEpilogue[] =
- "llvm.loop.vectorize.followup_epilogue";
-/// @}
-
-STATISTIC(LoopsVectorized, "Number of loops vectorized");
-STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
+ "llvm.loop.vectorize.followup_epilogue";
+/// @}
+
+STATISTIC(LoopsVectorized, "Number of loops vectorized");
+STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
-
+
static cl::opt<bool> EnableEpilogueVectorization(
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
cl::desc("Enable vectorization of epilogue loops."));
@@ -190,14 +190,14 @@ static cl::opt<unsigned> EpilogueVectorizationMinVF(
cl::desc("Only loops with vectorization factor equal to or larger than "
"the specified value are considered for epilogue vectorization."));
-/// Loops with a known constant trip count below this number are vectorized only
-/// if no scalar iteration overheads are incurred.
-static cl::opt<unsigned> TinyTripCountVectorThreshold(
- "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
- cl::desc("Loops with a constant trip count that is smaller than this "
- "value are vectorized only if no scalar iteration overheads "
- "are incurred."));
-
+/// Loops with a known constant trip count below this number are vectorized only
+/// if no scalar iteration overheads are incurred.
+static cl::opt<unsigned> TinyTripCountVectorThreshold(
+ "vectorizer-min-trip-count", cl::init(16), cl::Hidden,
+ cl::desc("Loops with a constant trip count that is smaller than this "
+ "value are vectorized only if no scalar iteration overheads "
+ "are incurred."));
+
// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
// that predication is preferred, and this lists all options. I.e., the
// vectorizer will try to fold the tail-loop (epilogue) into the vector body
@@ -210,7 +210,7 @@ namespace PreferPredicateTy {
PredicateOrDontVectorize
};
} // namespace PreferPredicateTy
-
+
static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefer-predicate-over-epilogue",
cl::init(PreferPredicateTy::ScalarEpilogue),
@@ -229,97 +229,97 @@ static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
"prefers tail-folding, don't attempt vectorization if "
"tail-folding fails.")));
-static cl::opt<bool> MaximizeBandwidth(
- "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
- cl::desc("Maximize bandwidth when selecting vectorization factor which "
- "will be determined by the smallest type in loop."));
-
-static cl::opt<bool> EnableInterleavedMemAccesses(
- "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
- cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
-
-/// An interleave-group may need masking if it resides in a block that needs
+static cl::opt<bool> MaximizeBandwidth(
+ "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
+ cl::desc("Maximize bandwidth when selecting vectorization factor which "
+ "will be determined by the smallest type in loop."));
+
+static cl::opt<bool> EnableInterleavedMemAccesses(
+ "enable-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
+
+/// An interleave-group may need masking if it resides in a block that needs
/// predication, or in order to mask away gaps.
-static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
- "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
- cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
-
-static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
- "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
-    cl::desc("We don't interleave loops with an estimated constant trip count "
- "below this number"));
-
-static cl::opt<unsigned> ForceTargetNumScalarRegs(
- "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's number of scalar registers."));
-
-static cl::opt<unsigned> ForceTargetNumVectorRegs(
- "force-target-num-vector-regs", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's number of vector registers."));
-
-static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
- "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max interleave factor for "
- "scalar loops."));
-
-static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
- "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's max interleave factor for "
- "vectorized loops."));
-
-static cl::opt<unsigned> ForceTargetInstructionCost(
- "force-target-instruction-cost", cl::init(0), cl::Hidden,
- cl::desc("A flag that overrides the target's expected cost for "
- "an instruction to a single constant value. Mostly "
- "useful for getting consistent testing."));
-
+static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
+ "enable-masked-interleaved-mem-accesses", cl::init(false), cl::Hidden,
+ cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
+
+static cl::opt<unsigned> TinyTripCountInterleaveThreshold(
+ "tiny-trip-count-interleave-threshold", cl::init(128), cl::Hidden,
+    cl::desc("We don't interleave loops with an estimated constant trip count "
+ "below this number"));
+
+static cl::opt<unsigned> ForceTargetNumScalarRegs(
+ "force-target-num-scalar-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of scalar registers."));
+
+static cl::opt<unsigned> ForceTargetNumVectorRegs(
+ "force-target-num-vector-regs", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's number of vector registers."));
+
+static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
+ "force-target-max-scalar-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "scalar loops."));
+
+static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
+ "force-target-max-vector-interleave", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's max interleave factor for "
+ "vectorized loops."));
+
+static cl::opt<unsigned> ForceTargetInstructionCost(
+ "force-target-instruction-cost", cl::init(0), cl::Hidden,
+ cl::desc("A flag that overrides the target's expected cost for "
+ "an instruction to a single constant value. Mostly "
+ "useful for getting consistent testing."));
+
static cl::opt<bool> ForceTargetSupportsScalableVectors(
"force-target-supports-scalable-vectors", cl::init(false), cl::Hidden,
cl::desc(
"Pretend that scalable vectors are supported, even if the target does "
"not support them. This flag should only be used for testing."));
-static cl::opt<unsigned> SmallLoopCost(
- "small-loop-cost", cl::init(20), cl::Hidden,
- cl::desc(
- "The cost of a loop that is considered 'small' by the interleaver."));
-
-static cl::opt<bool> LoopVectorizeWithBlockFrequency(
- "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
- cl::desc("Enable the use of the block frequency analysis to access PGO "
- "heuristics minimizing code growth in cold regions and being more "
- "aggressive in hot regions."));
-
-// Runtime interleave loops for load/store throughput.
-static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
- "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
- cl::desc(
- "Enable runtime interleaving until load/store ports are saturated"));
-
+static cl::opt<unsigned> SmallLoopCost(
+ "small-loop-cost", cl::init(20), cl::Hidden,
+ cl::desc(
+ "The cost of a loop that is considered 'small' by the interleaver."));
+
+static cl::opt<bool> LoopVectorizeWithBlockFrequency(
+ "loop-vectorize-with-block-frequency", cl::init(true), cl::Hidden,
+ cl::desc("Enable the use of the block frequency analysis to access PGO "
+ "heuristics minimizing code growth in cold regions and being more "
+ "aggressive in hot regions."));
+
+// Runtime interleave loops for load/store throughput.
+static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
+ "enable-loadstore-runtime-interleave", cl::init(true), cl::Hidden,
+ cl::desc(
+ "Enable runtime interleaving until load/store ports are saturated"));
+
/// Interleave small loops with scalar reductions.
static cl::opt<bool> InterleaveSmallLoopScalarReduction(
"interleave-small-loop-scalar-reduction", cl::init(false), cl::Hidden,
cl::desc("Enable interleaving for loops with small iteration counts that "
"contain scalar reductions to expose ILP."));
-/// The number of stores in a loop that are allowed to need predication.
-static cl::opt<unsigned> NumberOfStoresToPredicate(
- "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
- cl::desc("Max number of stores to be predicated behind an if."));
-
-static cl::opt<bool> EnableIndVarRegisterHeur(
- "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
- cl::desc("Count the induction variable only once when interleaving"));
-
-static cl::opt<bool> EnableCondStoresVectorization(
- "enable-cond-stores-vec", cl::init(true), cl::Hidden,
- cl::desc("Enable if predication of stores during vectorization."));
-
-static cl::opt<unsigned> MaxNestedScalarReductionIC(
- "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
- cl::desc("The maximum interleave count to use when interleaving a scalar "
- "reduction in a nested loop."));
-
+/// The number of stores in a loop that are allowed to need predication.
+static cl::opt<unsigned> NumberOfStoresToPredicate(
+ "vectorize-num-stores-pred", cl::init(1), cl::Hidden,
+ cl::desc("Max number of stores to be predicated behind an if."));
+
+static cl::opt<bool> EnableIndVarRegisterHeur(
+ "enable-ind-var-reg-heur", cl::init(true), cl::Hidden,
+ cl::desc("Count the induction variable only once when interleaving"));
+
+static cl::opt<bool> EnableCondStoresVectorization(
+ "enable-cond-stores-vec", cl::init(true), cl::Hidden,
+ cl::desc("Enable if predication of stores during vectorization."));
+
+static cl::opt<unsigned> MaxNestedScalarReductionIC(
+ "max-nested-scalar-reduction-interleave", cl::init(2), cl::Hidden,
+ cl::desc("The maximum interleave count to use when interleaving a scalar "
+ "reduction in a nested loop."));
+
static cl::opt<bool>
PreferInLoopReductions("prefer-inloop-reductions", cl::init(false),
cl::Hidden,
@@ -331,135 +331,135 @@ static cl::opt<bool> PreferPredicatedReductionSelect(
cl::desc(
"Prefer predicating a reduction operation over an after loop select."));
-cl::opt<bool> EnableVPlanNativePath(
- "enable-vplan-native-path", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path with "
- "support for outer loop vectorization."));
-
-// FIXME: Remove this switch once we have divergence analysis. Currently we
-// assume divergent non-backedge branches when this switch is true.
-cl::opt<bool> EnableVPlanPredication(
- "enable-vplan-predication", cl::init(false), cl::Hidden,
- cl::desc("Enable VPlan-native vectorization path predicator with "
- "support for outer loop vectorization."));
-
-// This flag enables the stress testing of the VPlan H-CFG construction in the
-// VPlan-native vectorization path. It must be used in conjunction with
-// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
-// verification of the H-CFGs built.
-static cl::opt<bool> VPlanBuildStressTest(
- "vplan-build-stress-test", cl::init(false), cl::Hidden,
- cl::desc(
- "Build VPlan for every supported loop nest in the function and bail "
- "out right after the build (stress test the VPlan H-CFG construction "
- "in the VPlan-native vectorization path)."));
-
-cl::opt<bool> llvm::EnableLoopInterleaving(
- "interleave-loops", cl::init(true), cl::Hidden,
- cl::desc("Enable loop interleaving in Loop vectorization passes"));
-cl::opt<bool> llvm::EnableLoopVectorization(
- "vectorize-loops", cl::init(true), cl::Hidden,
- cl::desc("Run the Loop vectorization passes"));
-
-/// A helper function that returns the type of loaded or stored value.
-static Type *getMemInstValueType(Value *I) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Expected Load or Store instruction");
- if (auto *LI = dyn_cast<LoadInst>(I))
- return LI->getType();
- return cast<StoreInst>(I)->getValueOperand()->getType();
-}
-
-/// A helper function that returns true if the given type is irregular. The
-/// type is irregular if its allocated size doesn't equal the store size of an
+cl::opt<bool> EnableVPlanNativePath(
+ "enable-vplan-native-path", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path with "
+ "support for outer loop vectorization."));
+
+// FIXME: Remove this switch once we have divergence analysis. Currently we
+// assume divergent non-backedge branches when this switch is true.
+cl::opt<bool> EnableVPlanPredication(
+ "enable-vplan-predication", cl::init(false), cl::Hidden,
+ cl::desc("Enable VPlan-native vectorization path predicator with "
+ "support for outer loop vectorization."));
+
+// This flag enables the stress testing of the VPlan H-CFG construction in the
+// VPlan-native vectorization path. It must be used in conjunction with
+// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
+// verification of the H-CFGs built.
+static cl::opt<bool> VPlanBuildStressTest(
+ "vplan-build-stress-test", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Build VPlan for every supported loop nest in the function and bail "
+ "out right after the build (stress test the VPlan H-CFG construction "
+ "in the VPlan-native vectorization path)."));
+
+cl::opt<bool> llvm::EnableLoopInterleaving(
+ "interleave-loops", cl::init(true), cl::Hidden,
+ cl::desc("Enable loop interleaving in Loop vectorization passes"));
+cl::opt<bool> llvm::EnableLoopVectorization(
+ "vectorize-loops", cl::init(true), cl::Hidden,
+ cl::desc("Run the Loop vectorization passes"));
+
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Expected Load or Store instruction");
+ if (auto *LI = dyn_cast<LoadInst>(I))
+ return LI->getType();
+ return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns true if the given type is irregular. The
+/// type is irregular if its allocated size doesn't equal the store size of an
/// element of the corresponding vector type.
static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
// Determine if an array of N elements of type Ty is "bitcast compatible"
// with a <N x Ty> vector.
// This is only true if there is no padding between the array elements.
- return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
-}
-
-/// A helper function that returns the reciprocal of the block probability of
-/// predicated blocks. If we return X, we are assuming the predicated block
-/// will execute once for every X iterations of the loop header.
-///
-/// TODO: We should use actual block probability here, if available. Currently,
-/// we always assume predicated blocks have a 50% chance of executing.
-static unsigned getReciprocalPredBlockProb() { return 2; }
-
-/// A helper function that adds a 'fast' flag to floating-point operations.
-static Value *addFastMathFlag(Value *V) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
- return V;
-}
-
-static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
- if (isa<FPMathOperator>(V))
- cast<Instruction>(V)->setFastMathFlags(FMF);
- return V;
-}
-
-/// A helper function that returns an integer or floating-point constant with
-/// value C.
-static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
- return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
- : ConstantFP::get(Ty, C);
-}
-
-/// Returns "best known" trip count for the specified loop \p L as defined by
-/// the following procedure:
-/// 1) Returns exact trip count if it is known.
-/// 2) Returns expected trip count according to profile data if any.
-/// 3) Returns upper bound estimate if it is known.
-/// 4) Returns None if all of the above failed.
-static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
- // Check if exact trip count is known.
- if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
- return ExpectedTC;
-
- // Check if there is an expected trip count available from profile data.
- if (LoopVectorizeWithBlockFrequency)
- if (auto EstimatedTC = getLoopEstimatedTripCount(L))
- return EstimatedTC;
-
- // Check if upper bound estimate is known.
- if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
- return ExpectedTC;
-
- return None;
-}
-
-namespace llvm {
-
-/// InnerLoopVectorizer vectorizes loops which contain only one basic
-/// block to a specified vectorization factor (VF).
-/// This class performs the widening of scalars into vectors, or multiple
-/// scalars. This class also implements the following features:
-/// * It inserts an epilogue loop for handling loops that don't have iteration
-/// counts that are known to be a multiple of the vectorization factor.
-/// * It handles the code generation for reduction variables.
-/// * Scalarization (implementation using scalars) of un-vectorizable
-/// instructions.
-/// InnerLoopVectorizer does not perform any vectorization-legality
-/// checks, and relies on the caller to check for the different legality
-/// aspects. The InnerLoopVectorizer relies on the
-/// LoopVectorizationLegality class to provide information about the induction
-/// and reduction variables that were found for a given vectorization factor.
-class InnerLoopVectorizer {
-public:
- InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
+ return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
+}
+
+/// A helper function that returns the reciprocal of the block probability of
+/// predicated blocks. If we return X, we are assuming the predicated block
+/// will execute once for every X iterations of the loop header.
+///
+/// TODO: We should use actual block probability here, if available. Currently,
+/// we always assume predicated blocks have a 50% chance of executing.
+static unsigned getReciprocalPredBlockProb() { return 2; }
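To make the comment above concrete: if a predicated block is assumed to execute once every getReciprocalPredBlockProb() iterations of the loop header, a per-iteration cost estimate discounts that block's cost by the same factor. A minimal standalone sketch, with a hypothetical helper name and an assumed raw cost, not the actual cost model:

#include <cstdio>

static unsigned reciprocalPredBlockProbSketch() { return 2; } // 50% chance

int main() {
  unsigned BlockCost = 8; // assumed raw cost of the predicated block
  unsigned Discounted = BlockCost / reciprocalPredBlockProbSketch();
  std::printf("per-iteration contribution: %u\n", Discounted); // prints 4
  return 0;
}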
+
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+ return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+ if (isa<FPMathOperator>(V))
+ cast<Instruction>(V)->setFastMathFlags(FMF);
+ return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+ return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+ : ConstantFP::get(Ty, C);
+}
+
+/// Returns "best known" trip count for the specified loop \p L as defined by
+/// the following procedure:
+/// 1) Returns exact trip count if it is known.
+/// 2) Returns expected trip count according to profile data if any.
+/// 3) Returns upper bound estimate if it is known.
+/// 4) Returns None if all of the above failed.
+static Optional<unsigned> getSmallBestKnownTC(ScalarEvolution &SE, Loop *L) {
+ // Check if exact trip count is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantTripCount(L))
+ return ExpectedTC;
+
+ // Check if there is an expected trip count available from profile data.
+ if (LoopVectorizeWithBlockFrequency)
+ if (auto EstimatedTC = getLoopEstimatedTripCount(L))
+ return EstimatedTC;
+
+ // Check if upper bound estimate is known.
+ if (unsigned ExpectedTC = SE.getSmallConstantMaxTripCount(L))
+ return ExpectedTC;
+
+ return None;
+}
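The same fallback order (exact trip count, then profile-based estimate, then upper bound, then None) can be sketched without the SCEV and profile machinery; the stand-in parameters below are assumptions for illustration only:

#include <optional>

static std::optional<unsigned>
bestKnownTripCountSketch(unsigned ExactTC, std::optional<unsigned> ProfileTC,
                         unsigned MaxTC) {
  if (ExactTC)        // 0 is treated as "unknown", mirroring SCEV's convention
    return ExactTC;
  if (ProfileTC)      // expected trip count derived from profile data
    return ProfileTC;
  if (MaxTC)          // known upper bound estimate
    return MaxTC;
  return std::nullopt;
}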
+
+namespace llvm {
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found for a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
- unsigned UnrollFactor, LoopVectorizationLegality *LVL,
+ unsigned UnrollFactor, LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI)
- : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
- AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
- Builder(PSE.getSE()->getContext()),
+ : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
+ AC(AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
+ Builder(PSE.getSE()->getContext()),
VectorLoopValueMap(UnrollFactor, VecWidth), Legal(LVL), Cost(CM),
BFI(BFI), PSI(PSI) {
// Query this against the original loop and save it here because the profile
@@ -468,8 +468,8 @@ public:
OrigLoop->getHeader(), PSI, BFI, PGSOQueryType::IRPass);
}
- virtual ~InnerLoopVectorizer() = default;
-
+ virtual ~InnerLoopVectorizer() = default;
+
/// Create a new empty loop that will contain vectorized instructions later
/// on, while the old loop will be used as the scalar remainder. Control flow
/// is generated around the vectorized (and scalar epilogue) loops consisting
@@ -478,265 +478,265 @@ public:
/// In the case of epilogue vectorization, this function is overriden to
/// handle the more complex control flow around the loops.
virtual BasicBlock *createVectorizedLoopSkeleton();
-
- /// Widen a single instruction within the innermost loop.
+
+ /// Widen a single instruction within the innermost loop.
void widenInstruction(Instruction &I, VPValue *Def, VPUser &Operands,
- VPTransformState &State);
-
- /// Widen a single call instruction within the innermost loop.
+ VPTransformState &State);
+
+ /// Widen a single call instruction within the innermost loop.
void widenCallInstruction(CallInst &I, VPValue *Def, VPUser &ArgOperands,
- VPTransformState &State);
-
- /// Widen a single select instruction within the innermost loop.
+ VPTransformState &State);
+
+ /// Widen a single select instruction within the innermost loop.
void widenSelectInstruction(SelectInst &I, VPValue *VPDef, VPUser &Operands,
- bool InvariantCond, VPTransformState &State);
-
- /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
- void fixVectorizedLoop();
-
- // Return true if any runtime check is added.
- bool areSafetyChecksAdded() { return AddedSafetyChecks; }
-
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in the
- /// new unrolled loop, where UF is the unroll factor.
- using VectorParts = SmallVector<Value *, 2>;
-
- /// Vectorize a single GetElementPtrInst based on information gathered and
- /// decisions taken during planning.
+ bool InvariantCond, VPTransformState &State);
+
+ /// Fix the vectorized code, taking care of header phi's, live-outs, and more.
+ void fixVectorizedLoop();
+
+ // Return true if any runtime check is added.
+ bool areSafetyChecksAdded() { return AddedSafetyChecks; }
+
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in the
+ /// new unrolled loop, where UF is the unroll factor.
+ using VectorParts = SmallVector<Value *, 2>;
+
+ /// Vectorize a single GetElementPtrInst based on information gathered and
+ /// decisions taken during planning.
void widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Indices,
unsigned UF, ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
-
- /// Vectorize a single PHINode in a block. This method handles the induction
- /// variable canonicalization. It supports both VF = 1 for unrolled loops and
- /// arbitrary length vectors.
+ SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
+
+ /// Vectorize a single PHINode in a block. This method handles the induction
+ /// variable canonicalization. It supports both VF = 1 for unrolled loops and
+ /// arbitrary length vectors.
void widenPHIInstruction(Instruction *PN, RecurrenceDescriptor *RdxDesc,
Value *StartV, unsigned UF, ElementCount VF);
-
- /// A helper function to scalarize a single Instruction in the innermost loop.
- /// Generates a sequence of scalar instances for each lane between \p MinLane
- /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive. Uses the VPValue operands from \p Operands instead of \p
- /// Instr's operands.
- void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
- const VPIteration &Instance, bool IfPredicateInstr,
- VPTransformState &State);
-
- /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
- /// is provided, the integer induction variable will first be truncated to
- /// the corresponding type.
+
+ /// A helper function to scalarize a single Instruction in the innermost loop.
+ /// Generates a sequence of scalar instances for each lane between \p MinLane
+ /// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
+ /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// Instr's operands.
+ void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
+ const VPIteration &Instance, bool IfPredicateInstr,
+ VPTransformState &State);
+
+ /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+ /// is provided, the integer induction variable will first be truncated to
+ /// the corresponding type.
void widenIntOrFpInduction(PHINode *IV, Value *Start,
TruncInst *Trunc = nullptr);
-
- /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
- /// vector or scalar value on-demand if one is not yet available. When
- /// vectorizing a loop, we visit the definition of an instruction before its
- /// uses. When visiting the definition, we either vectorize or scalarize the
- /// instruction, creating an entry for it in the corresponding map. (In some
- /// cases, such as induction variables, we will create both vector and scalar
- /// entries.) Then, as we encounter uses of the definition, we derive values
- /// for each scalar or vector use unless such a value is already available.
- /// For example, if we scalarize a definition and one of its uses is vector,
- /// we build the required vector on-demand with an insertelement sequence
- /// when visiting the use. Otherwise, if the use is scalar, we can use the
- /// existing scalar definition.
- ///
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll index \p Part. If the value has already been vectorized,
- /// the corresponding vector entry in VectorLoopValueMap is returned. If,
- /// however, the value has a scalar entry in VectorLoopValueMap, we construct
- /// a new vector value on-demand by inserting the scalar values into a vector
- /// with an insertelement sequence. If the value has been neither vectorized
- /// nor scalarized, it must be loop invariant, so we simply broadcast the
- /// value into a vector.
- Value *getOrCreateVectorValue(Value *V, unsigned Part);
-
+
+ /// getOrCreateVectorValue and getOrCreateScalarValue coordinate to generate a
+ /// vector or scalar value on-demand if one is not yet available. When
+ /// vectorizing a loop, we visit the definition of an instruction before its
+ /// uses. When visiting the definition, we either vectorize or scalarize the
+ /// instruction, creating an entry for it in the corresponding map. (In some
+ /// cases, such as induction variables, we will create both vector and scalar
+ /// entries.) Then, as we encounter uses of the definition, we derive values
+ /// for each scalar or vector use unless such a value is already available.
+ /// For example, if we scalarize a definition and one of its uses is vector,
+ /// we build the required vector on-demand with an insertelement sequence
+ /// when visiting the use. Otherwise, if the use is scalar, we can use the
+ /// existing scalar definition.
+ ///
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll index \p Part. If the value has already been vectorized,
+ /// the corresponding vector entry in VectorLoopValueMap is returned. If,
+ /// however, the value has a scalar entry in VectorLoopValueMap, we construct
+ /// a new vector value on-demand by inserting the scalar values into a vector
+ /// with an insertelement sequence. If the value has been neither vectorized
+ /// nor scalarized, it must be loop invariant, so we simply broadcast the
+ /// value into a vector.
+ Value *getOrCreateVectorValue(Value *V, unsigned Part);
+
void setVectorValue(Value *Scalar, unsigned Part, Value *Vector) {
VectorLoopValueMap.setVectorValue(Scalar, Part, Vector);
}
- /// Return a value in the new loop corresponding to \p V from the original
- /// loop at unroll and vector indices \p Instance. If the value has been
- /// vectorized but not scalarized, the necessary extractelement instruction
- /// will be generated.
- Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
-
- /// Construct the vector value of a scalarized value \p V one lane at a time.
- void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
-
- /// Try to vectorize interleaved access group \p Group with the base address
- /// given in \p Addr, optionally masking the vector operations if \p
- /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
- /// values in the vectorized loop.
- void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ /// Return a value in the new loop corresponding to \p V from the original
+ /// loop at unroll and vector indices \p Instance. If the value has been
+ /// vectorized but not scalarized, the necessary extractelement instruction
+ /// will be generated.
+ Value *getOrCreateScalarValue(Value *V, const VPIteration &Instance);
+
+ /// Construct the vector value of a scalarized value \p V one lane at a time.
+ void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
+
+ /// Try to vectorize interleaved access group \p Group with the base address
+ /// given in \p Addr, optionally masking the vector operations if \p
+ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
+ /// values in the vectorized loop.
+ void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
ArrayRef<VPValue *> VPDefs,
- VPTransformState &State, VPValue *Addr,
+ VPTransformState &State, VPValue *Addr,
ArrayRef<VPValue *> StoredValues,
- VPValue *BlockInMask = nullptr);
-
- /// Vectorize Load and Store instructions with the base address given in \p
- /// Addr, optionally masking the vector operations if \p BlockInMask is
- /// non-null. Use \p State to translate given VPValues to IR values in the
- /// vectorized loop.
- void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
+ VPValue *BlockInMask = nullptr);
+
+ /// Vectorize Load and Store instructions with the base address given in \p
+ /// Addr, optionally masking the vector operations if \p BlockInMask is
+ /// non-null. Use \p State to translate given VPValues to IR values in the
+ /// vectorized loop.
+ void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
VPValue *Def, VPValue *Addr,
VPValue *StoredValue, VPValue *BlockInMask);
-
- /// Set the debug location in the builder using the debug location in
- /// the instruction.
- void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
-
- /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
- void fixNonInductionPHIs(void);
-
-protected:
- friend class LoopVectorizationPlanner;
-
- /// A small list of PHINodes.
- using PhiVector = SmallVector<PHINode *, 4>;
-
- /// A type for scalarized values in the new loop. Each value from the
- /// original loop, when scalarized, is represented by UF x VF scalar values
- /// in the new unrolled loop, where UF is the unroll factor and VF is the
- /// vectorization factor.
- using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
-
- /// Set up the values of the IVs correctly when exiting the vector loop.
- void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock);
-
- /// Create a new induction variable inside L.
- PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
- Value *Step, Instruction *DL);
-
- /// Handle all cross-iteration phis in the header.
- void fixCrossIterationPHIs();
-
- /// Fix a first-order recurrence. This is the second phase of vectorizing
- /// this phi node.
- void fixFirstOrderRecurrence(PHINode *Phi);
-
- /// Fix a reduction cross-iteration phi. This is the second phase of
- /// vectorizing this phi node.
- void fixReduction(PHINode *Phi);
-
- /// Clear NSW/NUW flags from reduction instructions if necessary.
- void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
-
+
+ /// Set the debug location in the builder using the debug location in
+ /// the instruction.
+ void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
+ /// Fix the non-induction PHIs in the OrigPHIsToFix vector.
+ void fixNonInductionPHIs(void);
+
+protected:
+ friend class LoopVectorizationPlanner;
+
+ /// A small list of PHINodes.
+ using PhiVector = SmallVector<PHINode *, 4>;
+
+ /// A type for scalarized values in the new loop. Each value from the
+ /// original loop, when scalarized, is represented by UF x VF scalar values
+ /// in the new unrolled loop, where UF is the unroll factor and VF is the
+ /// vectorization factor.
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+
+ /// Set up the values of the IVs correctly when exiting the vector loop.
+ void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock);
+
+ /// Create a new induction variable inside L.
+ PHINode *createInductionVariable(Loop *L, Value *Start, Value *End,
+ Value *Step, Instruction *DL);
+
+ /// Handle all cross-iteration phis in the header.
+ void fixCrossIterationPHIs();
+
+ /// Fix a first-order recurrence. This is the second phase of vectorizing
+ /// this phi node.
+ void fixFirstOrderRecurrence(PHINode *Phi);
+
+ /// Fix a reduction cross-iteration phi. This is the second phase of
+ /// vectorizing this phi node.
+ void fixReduction(PHINode *Phi);
+
+ /// Clear NSW/NUW flags from reduction instructions if necessary.
+ void clearReductionWrapFlags(RecurrenceDescriptor &RdxDesc);
+
/// Fixup the LCSSA phi nodes in the unique exit block. This simply
/// means we need to add the appropriate incoming value from the middle
/// block as exiting edges from the scalar epilogue loop (if present) are
/// already in place, and we exit the vector loop exclusively to the middle
/// block.
- void fixLCSSAPHIs();
-
- /// Iteratively sink the scalarized operands of a predicated instruction into
- /// the block that was created for it.
- void sinkScalarOperands(Instruction *PredInst);
-
- /// Shrinks vector element sizes to the smallest bitwidth they can be legally
- /// represented as.
- void truncateToMinimalBitwidths();
-
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
-  /// This is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- virtual Value *getBroadcastInstrs(Value *V);
-
- /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
-  /// to each vector element of Val. The sequence starts at StartIdx.
-  /// \p Opcode is relevant for FP induction variables.
- virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd);
-
- /// Compute scalar induction steps. \p ScalarIV is the scalar induction
- /// variable on which to base the steps, \p Step is the size of the step, and
- /// \p EntryVal is the value from the original loop that maps to the steps.
- /// Note that \p EntryVal doesn't have to be an induction variable - it
- /// can also be a truncate instruction.
- void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
- const InductionDescriptor &ID);
-
- /// Create a vector induction phi node based on an existing scalar one. \p
- /// EntryVal is the value from the original loop that maps to the vector phi
- /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
- /// truncate instruction, instead of widening the original IV, we widen a
- /// version of the IV truncated to \p EntryVal's type.
- void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+ void fixLCSSAPHIs();
+
+ /// Iteratively sink the scalarized operands of a predicated instruction into
+ /// the block that was created for it.
+ void sinkScalarOperands(Instruction *PredInst);
+
+ /// Shrinks vector element sizes to the smallest bitwidth they can be legally
+ /// represented as.
+ void truncateToMinimalBitwidths();
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// This is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ virtual Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds (StartIdx, StartIdx + Step, StartIdx + 2*Step, ...)
+  /// to each vector element of Val. The sequence starts at StartIdx.
+  /// \p Opcode is relevant for FP induction variables.
+ virtual Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd);
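A standalone sketch of the step-vector idea for the integer case, following the comment above (the fixed VF of 4 and the names are assumptions; the real helper also handles FP inductions via the Opcode parameter):

#include <array>

static std::array<long, 4> stepVectorSketch(long Broadcast, int StartIdx,
                                            long Step) {
  std::array<long, 4> Lanes{};
  for (int I = 0; I < 4; ++I)
    Lanes[I] = Broadcast + (StartIdx + I * Step); // lane I gets StartIdx + I*Step
  return Lanes;
}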
+
+ /// Compute scalar induction steps. \p ScalarIV is the scalar induction
+ /// variable on which to base the steps, \p Step is the size of the step, and
+ /// \p EntryVal is the value from the original loop that maps to the steps.
+ /// Note that \p EntryVal doesn't have to be an induction variable - it
+ /// can also be a truncate instruction.
+ void buildScalarSteps(Value *ScalarIV, Value *Step, Instruction *EntryVal,
+ const InductionDescriptor &ID);
+
+ /// Create a vector induction phi node based on an existing scalar one. \p
+ /// EntryVal is the value from the original loop that maps to the vector phi
+ /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+ /// truncate instruction, instead of widening the original IV, we widen a
+ /// version of the IV truncated to \p EntryVal's type.
+ void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
Value *Step, Value *Start,
Instruction *EntryVal);
-
- /// Returns true if an instruction \p I should be scalarized instead of
- /// vectorized for the chosen vectorization factor.
- bool shouldScalarizeInstruction(Instruction *I) const;
-
- /// Returns true if we should generate a scalar version of \p IV.
- bool needsScalarInduction(Instruction *IV) const;
-
- /// If there is a cast involved in the induction variable \p ID, which should
- /// be ignored in the vectorized loop body, this function records the
- /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
- /// cast. We had already proved that the casted Phi is equal to the uncasted
- /// Phi in the vectorized loop (under a runtime guard), and therefore
- /// there is no need to vectorize the cast - the same value can be used in the
- /// vector loop for both the Phi and the cast.
-  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
- /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
- ///
- /// \p EntryVal is the value from the original loop that maps to the vector
- /// phi node and is used to distinguish what is the IV currently being
- /// processed - original one (if \p EntryVal is a phi corresponding to the
- /// original IV) or the "newly-created" one based on the proof mentioned above
- /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
- /// latter case \p EntryVal is a TruncInst and we must not record anything for
- /// that IV, but it's error-prone to expect callers of this routine to care
- /// about that, hence this explicit parameter.
- void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
- const Instruction *EntryVal,
- Value *VectorLoopValue,
- unsigned Part,
- unsigned Lane = UINT_MAX);
-
- /// Generate a shuffle sequence that will reverse the vector Vec.
- virtual Value *reverseVector(Value *Vec);
-
- /// Returns (and creates if needed) the original loop trip count.
- Value *getOrCreateTripCount(Loop *NewLoop);
-
- /// Returns (and creates if needed) the trip count of the widened loop.
- Value *getOrCreateVectorTripCount(Loop *NewLoop);
-
- /// Returns a bitcasted value to the requested vector type.
- /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
- Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL);
-
- /// Emit a bypass check to see if the vector trip count is zero, including if
- /// it overflows.
- void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
-
- /// Emit a bypass check to see if all of the SCEV assumptions we've
- /// had to make are correct.
- void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
-
- /// Emit bypass checks to check any memory assumptions we may have made.
- void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
-
- /// Compute the transformed value of Index at offset StartValue using step
- /// StepValue.
- /// For integer induction, returns StartValue + Index * StepValue.
- /// For pointer induction, returns StartValue[Index * StepValue].
- /// FIXME: The newly created binary instructions should contain nsw/nuw
- /// flags, which can be found from the original scalar operations.
- Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
- const DataLayout &DL,
- const InductionDescriptor &ID) const;
-
+
+ /// Returns true if an instruction \p I should be scalarized instead of
+ /// vectorized for the chosen vectorization factor.
+ bool shouldScalarizeInstruction(Instruction *I) const;
+
+ /// Returns true if we should generate a scalar version of \p IV.
+ bool needsScalarInduction(Instruction *IV) const;
+
+ /// If there is a cast involved in the induction variable \p ID, which should
+ /// be ignored in the vectorized loop body, this function records the
+ /// VectorLoopValue of the respective Phi also as the VectorLoopValue of the
+ /// cast. We had already proved that the casted Phi is equal to the uncasted
+ /// Phi in the vectorized loop (under a runtime guard), and therefore
+ /// there is no need to vectorize the cast - the same value can be used in the
+ /// vector loop for both the Phi and the cast.
+  /// If \p VectorLoopValue is a scalarized value, \p Lane is also specified.
+ /// Otherwise, \p VectorLoopValue is a widened/vectorized value.
+ ///
+ /// \p EntryVal is the value from the original loop that maps to the vector
+ /// phi node and is used to distinguish what is the IV currently being
+ /// processed - original one (if \p EntryVal is a phi corresponding to the
+ /// original IV) or the "newly-created" one based on the proof mentioned above
+ /// (see also buildScalarSteps() and createVectorIntOrFPInductionPHI()). In the
+ /// latter case \p EntryVal is a TruncInst and we must not record anything for
+ /// that IV, but it's error-prone to expect callers of this routine to care
+ /// about that, hence this explicit parameter.
+ void recordVectorLoopValueForInductionCast(const InductionDescriptor &ID,
+ const Instruction *EntryVal,
+ Value *VectorLoopValue,
+ unsigned Part,
+ unsigned Lane = UINT_MAX);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ virtual Value *reverseVector(Value *Vec);
+
+ /// Returns (and creates if needed) the original loop trip count.
+ Value *getOrCreateTripCount(Loop *NewLoop);
+
+ /// Returns (and creates if needed) the trip count of the widened loop.
+ Value *getOrCreateVectorTripCount(Loop *NewLoop);
+
+ /// Returns a bitcasted value to the requested vector type.
+ /// Also handles bitcasts of vector<float> <-> vector<pointer> types.
+ Value *createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL);
+
+ /// Emit a bypass check to see if the vector trip count is zero, including if
+ /// it overflows.
+ void emitMinimumIterationCountCheck(Loop *L, BasicBlock *Bypass);
+
+ /// Emit a bypass check to see if all of the SCEV assumptions we've
+ /// had to make are correct.
+ void emitSCEVChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Emit bypass checks to check any memory assumptions we may have made.
+ void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
+
+ /// Compute the transformed value of Index at offset StartValue using step
+ /// StepValue.
+ /// For integer induction, returns StartValue + Index * StepValue.
+ /// For pointer induction, returns StartValue[Index * StepValue].
+ /// FIXME: The newly created binary instructions should contain nsw/nuw
+ /// flags, which can be found from the original scalar operations.
+ Value *emitTransformedIndex(IRBuilder<> &B, Value *Index, ScalarEvolution *SE,
+ const DataLayout &DL,
+ const InductionDescriptor &ID) const;
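For the integer-induction case documented above, the transformed index reduces to a single expression; a standalone sketch with illustrative types and name:

static long long transformedIndexSketch(long long StartValue, long long Index,
                                        long long StepValue) {
  // Integer induction: StartValue + Index * StepValue. Pointer inductions
  // instead index into StartValue by Index * StepValue.
  return StartValue + Index * StepValue;
}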
+
/// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
/// vector loop preheader, middle block and scalar preheader. Also
/// allocate a loop object for the new vector loop and return it.
@@ -759,137 +759,137 @@ protected:
/// the preheader of the completed vector loop.
BasicBlock *completeLoopSkeleton(Loop *L, MDNode *OrigLoopID);
- /// Add additional metadata to \p To that was not present on \p Orig.
- ///
- /// Currently this is used to add the noalias annotations based on the
- /// inserted memchecks. Use this for instructions that are *cloned* into the
- /// vector loop.
- void addNewMetadata(Instruction *To, const Instruction *Orig);
-
- /// Add metadata from one instruction to another.
- ///
- /// This includes both the original MDs from \p From and additional ones (\see
- /// addNewMetadata). Use this for *newly created* instructions in the vector
- /// loop.
- void addMetadata(Instruction *To, Instruction *From);
-
- /// Similar to the previous function but it adds the metadata to a
- /// vector of instructions.
- void addMetadata(ArrayRef<Value *> To, Instruction *From);
-
+ /// Add additional metadata to \p To that was not present on \p Orig.
+ ///
+ /// Currently this is used to add the noalias annotations based on the
+ /// inserted memchecks. Use this for instructions that are *cloned* into the
+ /// vector loop.
+ void addNewMetadata(Instruction *To, const Instruction *Orig);
+
+ /// Add metadata from one instruction to another.
+ ///
+ /// This includes both the original MDs from \p From and additional ones (\see
+ /// addNewMetadata). Use this for *newly created* instructions in the vector
+ /// loop.
+ void addMetadata(Instruction *To, Instruction *From);
+
+ /// Similar to the previous function but it adds the metadata to a
+ /// vector of instructions.
+ void addMetadata(ArrayRef<Value *> To, Instruction *From);
+
/// Allow subclasses to override and print debug traces before/after vplan
/// execution, when trace information is requested.
virtual void printDebugTracesAtStart(){};
virtual void printDebugTracesAtEnd(){};
- /// The original loop.
- Loop *OrigLoop;
-
- /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
- /// dynamic knowledge to simplify SCEV expressions and converts them to a
- /// more usable form.
- PredicatedScalarEvolution &PSE;
-
- /// Loop Info.
- LoopInfo *LI;
-
- /// Dominator Tree.
- DominatorTree *DT;
-
- /// Alias Analysis.
- AAResults *AA;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Target Transform Info.
- const TargetTransformInfo *TTI;
-
- /// Assumption Cache.
- AssumptionCache *AC;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- /// LoopVersioning. It's only set up (non-null) if memchecks were
- /// used.
- ///
- /// This is currently only used to add no-alias metadata based on the
-  /// memchecks. The actual versioning is performed manually.
- std::unique_ptr<LoopVersioning> LVer;
-
- /// The vectorization SIMD factor to use. Each vector will have this many
- /// vector elements.
+ /// The original loop.
+ Loop *OrigLoop;
+
+ /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
+ /// dynamic knowledge to simplify SCEV expressions and converts them to a
+ /// more usable form.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info.
+ LoopInfo *LI;
+
+ /// Dominator Tree.
+ DominatorTree *DT;
+
+ /// Alias Analysis.
+ AAResults *AA;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Target Transform Info.
+ const TargetTransformInfo *TTI;
+
+ /// Assumption Cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ /// LoopVersioning. It's only set up (non-null) if memchecks were
+ /// used.
+ ///
+ /// This is currently only used to add no-alias metadata based on the
+ /// memchecks. The actual versioning is performed manually.
+ std::unique_ptr<LoopVersioning> LVer;
+
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
ElementCount VF;
-
- /// The vectorization unroll factor to use. Each scalar is vectorized to this
- /// many different vector instructions.
- unsigned UF;
-
- /// The builder that we use.
- IRBuilder<> Builder;
-
- // --- Vectorization state ---
-
- /// The vector-loop preheader.
- BasicBlock *LoopVectorPreHeader;
-
- /// The scalar-loop preheader.
- BasicBlock *LoopScalarPreHeader;
-
- /// Middle Block between the vector and the scalar.
- BasicBlock *LoopMiddleBlock;
-
+
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+ /// The builder that we use.
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+
/// The (unique) ExitBlock of the scalar loop. Note that
/// there can be multiple exiting edges reaching this block.
- BasicBlock *LoopExitBlock;
-
- /// The vector loop body.
- BasicBlock *LoopVectorBody;
-
- /// The scalar loop body.
- BasicBlock *LoopScalarBody;
-
- /// A list of all bypass blocks. The first block is the entry of the loop.
- SmallVector<BasicBlock *, 4> LoopBypassBlocks;
-
- /// The new Induction variable which was added to the new block.
- PHINode *Induction = nullptr;
-
- /// The induction variable of the old basic block.
- PHINode *OldInduction = nullptr;
-
- /// Maps values from the original loop to their corresponding values in the
- /// vectorized loop. A key value can map to either vector values, scalar
- /// values or both kinds of values, depending on whether the key was
- /// vectorized and scalarized.
- VectorizerValueMap VectorLoopValueMap;
-
- /// Store instructions that were predicated.
- SmallVector<Instruction *, 4> PredicatedInstructions;
-
- /// Trip count of the original loop.
- Value *TripCount = nullptr;
-
- /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
- Value *VectorTripCount = nullptr;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
- /// The profitability analysis.
- LoopVectorizationCostModel *Cost;
-
- // Record whether runtime checks are added.
- bool AddedSafetyChecks = false;
-
- // Holds the end values for each induction variable. We save the end values
- // so we can later fix up the external users of the induction variables.
- DenseMap<PHINode *, Value *> IVEndValues;
-
- // Vector of original scalar PHIs whose corresponding widened PHIs need to be
- // fixed up at the end of vector code generation.
- SmallVector<PHINode *, 8> OrigPHIsToFix;
+ BasicBlock *LoopExitBlock;
+
+ /// The vector loop body.
+ BasicBlock *LoopVectorBody;
+
+ /// The scalar loop body.
+ BasicBlock *LoopScalarBody;
+
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction = nullptr;
+
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction = nullptr;
+
+ /// Maps values from the original loop to their corresponding values in the
+ /// vectorized loop. A key value can map to either vector values, scalar
+ /// values or both kinds of values, depending on whether the key was
+ /// vectorized and scalarized.
+ VectorizerValueMap VectorLoopValueMap;
+
+ /// Store instructions that were predicated.
+ SmallVector<Instruction *, 4> PredicatedInstructions;
+
+ /// Trip count of the original loop.
+ Value *TripCount = nullptr;
+
+ /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
+ Value *VectorTripCount = nullptr;
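+ // For illustration: with TripCount = 100, VF = 4 and UF = 2 this is
+ // 100 - 100 % 8 = 96, so the vector loop covers 96 iterations and the
+ // scalar remainder loop handles the last 4.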
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+ /// The profitability analysis.
+ LoopVectorizationCostModel *Cost;
+
+ // Record whether runtime checks are added.
+ bool AddedSafetyChecks = false;
+
+ // Holds the end values for each induction variable. We save the end values
+ // so we can later fix up the external users of the induction variables.
+ DenseMap<PHINode *, Value *> IVEndValues;
+
+ // Vector of original scalar PHIs whose corresponding widened PHIs need to be
+ // fixed up at the end of vector code generation.
+ SmallVector<PHINode *, 8> OrigPHIsToFix;
/// BFI and PSI are used to check for profile guided size optimizations.
BlockFrequencyInfo *BFI;
@@ -898,30 +898,30 @@ protected:
// Whether this loop should be optimized for size based on profile guided size
// optimizations.
bool OptForSizeBasedOnProfile;
-};
-
-class InnerLoopUnroller : public InnerLoopVectorizer {
-public:
- InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
- LoopInfo *LI, DominatorTree *DT,
- const TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
- LoopVectorizationLegality *LVL,
+};
+
+class InnerLoopUnroller : public InnerLoopVectorizer {
+public:
+ InnerLoopUnroller(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
+ LoopInfo *LI, DominatorTree *DT,
+ const TargetLibraryInfo *TLI,
+ const TargetTransformInfo *TTI, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, unsigned UnrollFactor,
+ LoopVectorizationLegality *LVL,
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI)
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
ElementCount::getFixed(1), UnrollFactor, LVL, CM,
BFI, PSI) {}
-
-private:
- Value *getBroadcastInstrs(Value *V) override;
- Value *getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps Opcode =
- Instruction::BinaryOpsEnd) override;
- Value *reverseVector(Value *Vec) override;
-};
-
+
+private:
+ Value *getBroadcastInstrs(Value *V) override;
+ Value *getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps Opcode =
+ Instruction::BinaryOpsEnd) override;
+ Value *reverseVector(Value *Vec) override;
+};
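+
+// A minimal sketch (hedged; the argument names reuse the constructor
+// declaration above): InnerLoopUnroller is InnerLoopVectorizer driven with a
+// fixed VF of 1, so only the unroll factor produces extra copies.
+//
+//   InnerLoopUnroller Unroller(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
+//                              /*UnrollFactor=*/4, LVL, CM, BFI, PSI);
+//   // Each original scalar instruction is emitted four times (interleaved)
+//   // rather than being widened into a vector instruction.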
+
/// Encapsulate information regarding vectorization of a loop and its epilogue.
/// This information is meant to be updated and used across two stages of
/// epilogue vectorization.
@@ -1044,88 +1044,88 @@ protected:
void printDebugTracesAtStart() override;
void printDebugTracesAtEnd() override;
};
-} // end namespace llvm
-
- /// Look for a meaningful debug location on the instruction or its
-/// operands.
-static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
- if (!I)
- return I;
-
- DebugLoc Empty;
- if (I->getDebugLoc() != Empty)
- return I;
-
- for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
- if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
- if (OpInst->getDebugLoc() != Empty)
- return OpInst;
- }
-
- return I;
-}
-
-void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
- if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
- const DILocation *DIL = Inst->getDebugLoc();
- if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
- !isa<DbgInfoIntrinsic>(Inst)) {
+} // end namespace llvm
+
+ /// Look for a meaningful debug location on the instruction or its
+/// operands.
+static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
+ if (!I)
+ return I;
+
+ DebugLoc Empty;
+ if (I->getDebugLoc() != Empty)
+ return I;
+
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end(); OI != OE; ++OI) {
+ if (Instruction *OpInst = dyn_cast<Instruction>(*OI))
+ if (OpInst->getDebugLoc() != Empty)
+ return OpInst;
+ }
+
+ return I;
+}
+
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+ if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+ const DILocation *DIL = Inst->getDebugLoc();
+ if (DIL && Inst->getFunction()->isDebugInfoForProfiling() &&
+ !isa<DbgInfoIntrinsic>(Inst)) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto NewDIL =
DIL->cloneByMultiplyingDuplicationFactor(UF * VF.getKnownMinValue());
- if (NewDIL)
- B.SetCurrentDebugLocation(NewDIL.getValue());
- else
- LLVM_DEBUG(dbgs()
- << "Failed to create new discriminator: "
- << DIL->getFilename() << " Line: " << DIL->getLine());
- }
- else
- B.SetCurrentDebugLocation(DIL);
- } else
- B.SetCurrentDebugLocation(DebugLoc());
-}
-
-/// Write a record \p DebugMsg about vectorization failure to the debug
-/// output stream. If \p I is passed, it is an instruction that prevents
-/// vectorization.
-#ifndef NDEBUG
-static void debugVectorizationFailure(const StringRef DebugMsg,
- Instruction *I) {
- dbgs() << "LV: Not vectorizing: " << DebugMsg;
- if (I != nullptr)
- dbgs() << " " << *I;
- else
- dbgs() << '.';
- dbgs() << '\n';
-}
-#endif
-
-/// Create an analysis remark that explains why vectorization failed
-///
-/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
-/// RemarkName is the identifier for the remark. If \p I is passed it is an
-/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
-/// the location of the remark. \return the remark object that can be
-/// streamed to.
-static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
- StringRef RemarkName, Loop *TheLoop, Instruction *I) {
- Value *CodeRegion = TheLoop->getHeader();
- DebugLoc DL = TheLoop->getStartLoc();
-
- if (I) {
- CodeRegion = I->getParent();
- // If there is no debug location attached to the instruction, revert to
- // using the loop's.
- if (I->getDebugLoc())
- DL = I->getDebugLoc();
- }
-
- OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
- R << "loop not vectorized: ";
- return R;
-}
-
+ if (NewDIL)
+ B.SetCurrentDebugLocation(NewDIL.getValue());
+ else
+ LLVM_DEBUG(dbgs()
+ << "Failed to create new discriminator: "
+ << DIL->getFilename() << " Line: " << DIL->getLine());
+ }
+ else
+ B.SetCurrentDebugLocation(DIL);
+ } else
+ B.SetCurrentDebugLocation(DebugLoc());
+}
+
+/// Write a record \p DebugMsg about vectorization failure to the debug
+/// output stream. If \p I is passed, it is an instruction that prevents
+/// vectorization.
+#ifndef NDEBUG
+static void debugVectorizationFailure(const StringRef DebugMsg,
+ Instruction *I) {
+ dbgs() << "LV: Not vectorizing: " << DebugMsg;
+ if (I != nullptr)
+ dbgs() << " " << *I;
+ else
+ dbgs() << '.';
+ dbgs() << '\n';
+}
+#endif
+
+/// Create an analysis remark that explains why vectorization failed
+///
+/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
+/// RemarkName is the identifier for the remark. If \p I is passed it is an
+/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
+/// the location of the remark. \return the remark object that can be
+/// streamed to.
+static OptimizationRemarkAnalysis createLVAnalysis(const char *PassName,
+ StringRef RemarkName, Loop *TheLoop, Instruction *I) {
+ Value *CodeRegion = TheLoop->getHeader();
+ DebugLoc DL = TheLoop->getStartLoc();
+
+ if (I) {
+ CodeRegion = I->getParent();
+ // If there is no debug location attached to the instruction, revert to
+ // using the loop's.
+ if (I->getDebugLoc())
+ DL = I->getDebugLoc();
+ }
+
+ OptimizationRemarkAnalysis R(PassName, RemarkName, DL, CodeRegion);
+ R << "loop not vectorized: ";
+ return R;
+}
+
/// Return a value for Step multiplied by VF.
static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
assert(isa<ConstantInt>(Step) && "Expected an integer step");
@@ -1135,427 +1135,427 @@ static Value *createStepForVF(IRBuilder<> &B, Constant *Step, ElementCount VF) {
return VF.isScalable() ? B.CreateVScale(StepVal) : StepVal;
}
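
// Worked example (a minimal sketch assuming the semantics described above;
// the values are hypothetical): with Step = 2 and a fixed VF of 8,
// createStepForVF returns the constant 16; with a scalable VF of 8 it returns
// 16 scaled by the runtime vscale, materialized through B.CreateVScale().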
-namespace llvm {
-
-void reportVectorizationFailure(const StringRef DebugMsg,
- const StringRef OREMsg, const StringRef ORETag,
- OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
- LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
- LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
- ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
- ORETag, TheLoop, I) << OREMsg);
-}
-
-} // end namespace llvm
-
-#ifndef NDEBUG
-/// \return string containing a file name and a line # for the given loop.
-static std::string getDebugLocString(const Loop *L) {
- std::string Result;
- if (L) {
- raw_string_ostream OS(Result);
- if (const DebugLoc LoopDbgLoc = L->getStartLoc())
- LoopDbgLoc.print(OS);
- else
- // Just print the module name.
- OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
- OS.flush();
- }
- return Result;
-}
-#endif
-
-void InnerLoopVectorizer::addNewMetadata(Instruction *To,
- const Instruction *Orig) {
- // If the loop was versioned with memchecks, add the corresponding no-alias
- // metadata.
- if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
- LVer->annotateInstWithNoAlias(To, Orig);
-}
-
-void InnerLoopVectorizer::addMetadata(Instruction *To,
- Instruction *From) {
- propagateMetadata(To, From);
- addNewMetadata(To, From);
-}
-
-void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
- Instruction *From) {
- for (Value *V : To) {
- if (Instruction *I = dyn_cast<Instruction>(V))
- addMetadata(I, From);
- }
-}
-
-namespace llvm {
-
-// Loop vectorization cost-model hints how the scalar epilogue loop should be
-// lowered.
-enum ScalarEpilogueLowering {
-
- // The default: allowing scalar epilogues.
- CM_ScalarEpilogueAllowed,
-
- // Vectorization with OptForSize: don't allow epilogues.
- CM_ScalarEpilogueNotAllowedOptSize,
-
- // A special case of vectorization with OptForSize: loops with a very small
- // trip count are considered for vectorization under OptForSize, thereby
- // making sure the cost of their loop body is dominant, free of runtime
- // guards and scalar iteration overheads.
- CM_ScalarEpilogueNotAllowedLowTripLoop,
-
- // Loop hint predicate indicating an epilogue is undesired.
+namespace llvm {
+
+void reportVectorizationFailure(const StringRef DebugMsg,
+ const StringRef OREMsg, const StringRef ORETag,
+ OptimizationRemarkEmitter *ORE, Loop *TheLoop, Instruction *I) {
+ LLVM_DEBUG(debugVectorizationFailure(DebugMsg, I));
+ LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
+ ORE->emit(createLVAnalysis(Hints.vectorizeAnalysisPassName(),
+ ORETag, TheLoop, I) << OREMsg);
+}
+
+} // end namespace llvm
+
+#ifndef NDEBUG
+/// \return string containing a file name and a line # for the given loop.
+static std::string getDebugLocString(const Loop *L) {
+ std::string Result;
+ if (L) {
+ raw_string_ostream OS(Result);
+ if (const DebugLoc LoopDbgLoc = L->getStartLoc())
+ LoopDbgLoc.print(OS);
+ else
+ // Just print the module name.
+ OS << L->getHeader()->getParent()->getParent()->getModuleIdentifier();
+ OS.flush();
+ }
+ return Result;
+}
+#endif
+
+void InnerLoopVectorizer::addNewMetadata(Instruction *To,
+ const Instruction *Orig) {
+ // If the loop was versioned with memchecks, add the corresponding no-alias
+ // metadata.
+ if (LVer && (isa<LoadInst>(Orig) || isa<StoreInst>(Orig)))
+ LVer->annotateInstWithNoAlias(To, Orig);
+}
+
+void InnerLoopVectorizer::addMetadata(Instruction *To,
+ Instruction *From) {
+ propagateMetadata(To, From);
+ addNewMetadata(To, From);
+}
+
+void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
+ Instruction *From) {
+ for (Value *V : To) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ addMetadata(I, From);
+ }
+}
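+
+// A minimal usage sketch (NewLoad/OldLoad are hypothetical names; the helpers
+// are the ones defined above): an instruction cloned into the vector body gets
+// the original metadata plus the memcheck-based noalias scopes.
+//
+//   Instruction *NewLoad = OldLoad->clone();
+//   Builder.Insert(NewLoad);
+//   addMetadata(NewLoad, OldLoad); // propagateMetadata + addNewMetadata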
+
+namespace llvm {
+
+// Loop vectorization cost-model hints how the scalar epilogue loop should be
+// lowered.
+enum ScalarEpilogueLowering {
+
+ // The default: allowing scalar epilogues.
+ CM_ScalarEpilogueAllowed,
+
+ // Vectorization with OptForSize: don't allow epilogues.
+ CM_ScalarEpilogueNotAllowedOptSize,
+
+ // A special case of vectorization with OptForSize: loops with a very small
+ // trip count are considered for vectorization under OptForSize, thereby
+ // making sure the cost of their loop body is dominant, free of runtime
+ // guards and scalar iteration overheads.
+ CM_ScalarEpilogueNotAllowedLowTripLoop,
+
+ // Loop hint predicate indicating an epilogue is undesired.
CM_ScalarEpilogueNotNeededUsePredicate,
// Directive indicating we must either tail fold or not vectorize
CM_ScalarEpilogueNotAllowedUsePredicate
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
- /// In many cases vectorization is not profitable. This can happen for
-/// a number of reasons. In this class we mainly attempt to predict the
-/// expected speedup/slowdowns due to the supported instruction set. We use the
-/// TargetTransformInfo to query the different backends for the cost of
-/// different operations.
-class LoopVectorizationCostModel {
-public:
- LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
- PredicatedScalarEvolution &PSE, LoopInfo *LI,
- LoopVectorizationLegality *Legal,
- const TargetTransformInfo &TTI,
- const TargetLibraryInfo *TLI, DemandedBits *DB,
- AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, const Function *F,
- const LoopVectorizeHints *Hints,
- InterleavedAccessInfo &IAI)
- : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
- TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
- Hints(Hints), InterleaveInfo(IAI) {}
-
- /// \return An upper bound for the vectorization factor, or None if
- /// vectorization and interleaving should be avoided up front.
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+ /// In many cases vectorization is not profitable. This can happen for
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
+ PredicatedScalarEvolution &PSE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI,
+ const TargetLibraryInfo *TLI, DemandedBits *DB,
+ AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, const Function *F,
+ const LoopVectorizeHints *Hints,
+ InterleavedAccessInfo &IAI)
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
+ TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
+ Hints(Hints), InterleaveInfo(IAI) {}
+
+ /// \return An upper bound for the vectorization factor, or None if
+ /// vectorization and interleaving should be avoided up front.
Optional<ElementCount> computeMaxVF(ElementCount UserVF, unsigned UserIC);
-
- /// \return True if runtime checks are required for vectorization, and false
- /// otherwise.
- bool runtimeChecksRequired();
-
- /// \return The most profitable vectorization factor and the cost of that VF.
- /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
- /// then this vectorization factor will be selected if vectorization is
- /// possible.
+
+ /// \return True if runtime checks are required for vectorization, and false
+ /// otherwise.
+ bool runtimeChecksRequired();
+
+ /// \return The most profitable vectorization factor and the cost of that VF.
+ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
VectorizationFactor selectVectorizationFactor(ElementCount MaxVF);
VectorizationFactor
selectEpilogueVectorizationFactor(const ElementCount MaxVF,
const LoopVectorizationPlanner &LVP);
-
- /// Setup cost-based decisions for user vectorization factor.
+
+ /// Setup cost-based decisions for user vectorization factor.
void selectUserVectorizationFactor(ElementCount UserVF) {
- collectUniformsAndScalars(UserVF);
- collectInstsToScalarize(UserVF);
- }
-
- /// \return The size (in bits) of the smallest and widest types in the code
- /// that needs to be vectorized. We ignore values that remain scalar such as
- /// 64 bit loop indices.
- std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
-
- /// \return The desired interleave count.
- /// If interleave count has been specified by metadata it will be returned.
- /// Otherwise, the interleave count is computed and returned. VF and LoopCost
- /// are the selected vectorization factor and the cost of the selected VF.
+ collectUniformsAndScalars(UserVF);
+ collectInstsToScalarize(UserVF);
+ }
+
+ /// \return The size (in bits) of the smallest and widest types in the code
+ /// that needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
+
+ /// \return The desired interleave count.
+ /// If interleave count has been specified by metadata it will be returned.
+ /// Otherwise, the interleave count is computed and returned. VF and LoopCost
+ /// are the selected vectorization factor and the cost of the selected VF.
unsigned selectInterleaveCount(ElementCount VF, unsigned LoopCost);
-
- /// A memory access instruction may be vectorized in more than one way.
- /// The form of the instruction after vectorization depends on cost.
- /// This function makes cost-based decisions for Load/Store instructions
- /// and collects them in a map. This decision map is used for building
- /// the lists of loop-uniform and loop-scalar instructions.
- /// The calculated cost is saved with the widening decision in order to
- /// avoid redundant calculations.
+
+ /// A memory access instruction may be vectorized in more than one way.
+ /// The form of the instruction after vectorization depends on cost.
+ /// This function makes cost-based decisions for Load/Store instructions
+ /// and collects them in a map. This decision map is used for building
+ /// the lists of loop-uniform and loop-scalar instructions.
+ /// The calculated cost is saved with the widening decision in order to
+ /// avoid redundant calculations.
void setCostBasedWideningDecision(ElementCount VF);
-
- /// A struct that represents some properties of the register usage
- /// of a loop.
- struct RegisterUsage {
- /// Holds the number of loop invariant values that are used in the loop.
- /// The key is ClassID of target-provided register class.
- SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
- /// Holds the maximum number of concurrent live intervals in the loop.
- /// The key is ClassID of target-provided register class.
- SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
- };
-
- /// \return Returns information about the register usages of the loop for the
- /// given vectorization factors.
+
+ /// A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ /// The key is ClassID of target-provided register class.
+ SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
+ };
+
+ /// \return Returns information about the register usages of the loop for the
+ /// given vectorization factors.
SmallVector<RegisterUsage, 8>
calculateRegisterUsage(ArrayRef<ElementCount> VFs);
-
- /// Collect values we want to ignore in the cost model.
- void collectValuesToIgnore();
-
+
+ /// Collect values we want to ignore in the cost model.
+ void collectValuesToIgnore();
+
/// Split reductions into those that happen in the loop, and those that happen
/// outside. In-loop reductions are collected into InLoopReductionChains.
void collectInLoopReductions();
- /// \returns The smallest bitwidth each instruction can be represented with.
- /// The vector equivalents of these instructions should be truncated to this
- /// type.
- const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
- return MinBWs;
- }
-
- /// \returns True if it is more profitable to scalarize instruction \p I for
- /// vectorization factor \p VF.
+ /// \returns The smallest bitwidth each instruction can be represented with.
+ /// The vector equivalents of these instructions should be truncated to this
+ /// type.
+ const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
+ return MinBWs;
+ }
+
+ /// \returns True if it is more profitable to scalarize instruction \p I for
+ /// vectorization factor \p VF.
bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
assert(VF.isVector() &&
"Profitable to scalarize relevant only for VF > 1.");
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto Scalars = InstsToScalarize.find(VF);
- assert(Scalars != InstsToScalarize.end() &&
- "VF not yet analyzed for scalarization profitability");
- return Scalars->second.find(I) != Scalars->second.end();
- }
-
- /// Returns true if \p I is known to be uniform after vectorization.
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto Scalars = InstsToScalarize.find(VF);
+ assert(Scalars != InstsToScalarize.end() &&
+ "VF not yet analyzed for scalarization profitability");
+ return Scalars->second.find(I) != Scalars->second.end();
+ }
+
+ /// Returns true if \p I is known to be uniform after vectorization.
bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
if (VF.isScalar())
- return true;
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto UniformsPerVF = Uniforms.find(VF);
- assert(UniformsPerVF != Uniforms.end() &&
- "VF not yet analyzed for uniformity");
- return UniformsPerVF->second.count(I);
- }
-
- /// Returns true if \p I is known to be scalar after vectorization.
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto UniformsPerVF = Uniforms.find(VF);
+ assert(UniformsPerVF != Uniforms.end() &&
+ "VF not yet analyzed for uniformity");
+ return UniformsPerVF->second.count(I);
+ }
+
+ /// Returns true if \p I is known to be scalar after vectorization.
bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
if (VF.isScalar())
- return true;
-
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return false;
-
- auto ScalarsPerVF = Scalars.find(VF);
- assert(ScalarsPerVF != Scalars.end() &&
- "Scalar values are not calculated for VF");
- return ScalarsPerVF->second.count(I);
- }
-
- /// \returns True if instruction \p I can be truncated to a smaller bitwidth
- /// for vectorization factor \p VF.
+ return true;
+
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return false;
+
+ auto ScalarsPerVF = Scalars.find(VF);
+ assert(ScalarsPerVF != Scalars.end() &&
+ "Scalar values are not calculated for VF");
+ return ScalarsPerVF->second.count(I);
+ }
+
+ /// \returns True if instruction \p I can be truncated to a smaller bitwidth
+ /// for vectorization factor \p VF.
bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
return VF.isVector() && MinBWs.find(I) != MinBWs.end() &&
- !isProfitableToScalarize(I, VF) &&
- !isScalarAfterVectorization(I, VF);
- }
-
- /// Decision that was taken during cost calculation for memory instruction.
- enum InstWidening {
- CM_Unknown,
- CM_Widen, // For consecutive accesses with stride +1.
- CM_Widen_Reverse, // For consecutive accesses with stride -1.
- CM_Interleave,
- CM_GatherScatter,
- CM_Scalarize
- };
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// instruction \p I and vector width \p VF.
+ !isProfitableToScalarize(I, VF) &&
+ !isScalarAfterVectorization(I, VF);
+ }
+
+ /// Decision that was taken during cost calculation for memory instruction.
+ enum InstWidening {
+ CM_Unknown,
+ CM_Widen, // For consecutive accesses with stride +1.
+ CM_Widen_Reverse, // For consecutive accesses with stride -1.
+ CM_Interleave,
+ CM_GatherScatter,
+ CM_Scalarize
+ };
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// instruction \p I and vector width \p VF.
void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
- }
-
- /// Save vectorization decision \p W and \p Cost taken by the cost model for
- /// interleaving group \p Grp and vector width \p VF.
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ }
+
+ /// Save vectorization decision \p W and \p Cost taken by the cost model for
+ /// interleaving group \p Grp and vector width \p VF.
void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
ElementCount VF, InstWidening W,
InstructionCost Cost) {
assert(VF.isVector() && "Expected VF >=2");
- /// Broadcast this decision to all instructions inside the group.
- /// But the cost will be assigned to one instruction only.
- for (unsigned i = 0; i < Grp->getFactor(); ++i) {
- if (auto *I = Grp->getMember(i)) {
- if (Grp->getInsertPos() == I)
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
- else
- WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
- }
- }
- }
-
- /// Return the cost model decision for the given instruction \p I and vector
- /// width \p VF. Return CM_Unknown if this instruction did not pass
- /// through the cost modeling.
+ /// Broadcast this decision to all instructions inside the group.
+ /// But the cost will be assigned to one instruction only.
+ for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+ if (auto *I = Grp->getMember(i)) {
+ if (Grp->getInsertPos() == I)
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+ else
+ WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+ }
+ }
+ }
+
+ /// Return the cost model decision for the given instruction \p I and vector
+ /// width \p VF. Return CM_Unknown if this instruction did not pass
+ /// through the cost modeling.
InstWidening getWideningDecision(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF to be a vector VF");
- // Cost model is not run in the VPlan-native path - return conservative
- // result until this changes.
- if (EnableVPlanNativePath)
- return CM_GatherScatter;
-
+ // Cost model is not run in the VPlan-native path - return conservative
+ // result until this changes.
+ if (EnableVPlanNativePath)
+ return CM_GatherScatter;
+
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- auto Itr = WideningDecisions.find(InstOnVF);
- if (Itr == WideningDecisions.end())
- return CM_Unknown;
- return Itr->second.first;
- }
-
- /// Return the vectorization cost for the given instruction \p I and vector
- /// width \p VF.
+ auto Itr = WideningDecisions.find(InstOnVF);
+ if (Itr == WideningDecisions.end())
+ return CM_Unknown;
+ return Itr->second.first;
+ }
+
+ /// Return the vectorization cost for the given instruction \p I and vector
+ /// width \p VF.
InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
assert(VF.isVector() && "Expected VF >=2");
std::pair<Instruction *, ElementCount> InstOnVF = std::make_pair(I, VF);
- assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
- "The cost is not calculated");
- return WideningDecisions[InstOnVF].second;
- }
-
- /// Return True if instruction \p I is an optimizable truncate whose operand
- /// is an induction variable. Such a truncate will be removed by adding a new
- /// induction variable with the destination type.
+ assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() &&
+ "The cost is not calculated");
+ return WideningDecisions[InstOnVF].second;
+ }
+
+ /// Return True if instruction \p I is an optimizable truncate whose operand
+ /// is an induction variable. Such a truncate will be removed by adding a new
+ /// induction variable with the destination type.
bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
- // If the instruction is not a truncate, return false.
- auto *Trunc = dyn_cast<TruncInst>(I);
- if (!Trunc)
- return false;
-
- // Get the source and destination types of the truncate.
- Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
- Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
-
- // If the truncate is free for the given types, return false. Replacing a
- // free truncate with an induction variable would add an induction variable
- // update instruction to each iteration of the loop. We exclude from this
- // check the primary induction variable since it will need an update
- // instruction regardless.
- Value *Op = Trunc->getOperand(0);
- if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
- return false;
-
- // If the truncated value is not an induction variable, return false.
- return Legal->isInductionPhi(Op);
- }
-
- /// Collects the instructions to scalarize for each predicated instruction in
- /// the loop.
+ // If the instruction is not a truncate, return false.
+ auto *Trunc = dyn_cast<TruncInst>(I);
+ if (!Trunc)
+ return false;
+
+ // Get the source and destination types of the truncate.
+ Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+ Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+ // If the truncate is free for the given types, return false. Replacing a
+ // free truncate with an induction variable would add an induction variable
+ // update instruction to each iteration of the loop. We exclude from this
+ // check the primary induction variable since it will need an update
+ // instruction regardless.
+ Value *Op = Trunc->getOperand(0);
+ if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+ return false;
+
+ // If the truncated value is not an induction variable, return false.
+ return Legal->isInductionPhi(Op);
+ }
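+
+ // For illustration (hypothetical IR): %t = trunc i64 %iv to i32, where %iv
+ // is a secondary induction PHI and the i64 -> i32 truncate is not free on
+ // the target, is optimizable here; the truncate is later replaced by a new
+ // i32 induction variable.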
+
+ /// Collects the instructions to scalarize for each predicated instruction in
+ /// the loop.
void collectInstsToScalarize(ElementCount VF);
-
- /// Collect Uniform and Scalar values for the given \p VF.
- /// The sets depend on CM decision for Load/Store instructions
- /// that may be vectorized as interleave, gather-scatter or scalarized.
+
+ /// Collect Uniform and Scalar values for the given \p VF.
+ /// The sets depend on CM decision for Load/Store instructions
+ /// that may be vectorized as interleave, gather-scatter or scalarized.
void collectUniformsAndScalars(ElementCount VF) {
- // Do the analysis once.
+ // Do the analysis once.
if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end())
- return;
- setCostBasedWideningDecision(VF);
- collectLoopUniforms(VF);
- collectLoopScalars(VF);
- }
-
- /// Returns true if the target machine supports masked store operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
- return Legal->isConsecutivePtr(Ptr) &&
- TTI.isLegalMaskedStore(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked load operation
- /// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
- return Legal->isConsecutivePtr(Ptr) &&
- TTI.isLegalMaskedLoad(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked scatter operation
- /// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedScatter(DataType, Alignment);
- }
-
- /// Returns true if the target machine supports masked gather operation
- /// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType, Align Alignment) {
- return TTI.isLegalMaskedGather(DataType, Alignment);
- }
-
- /// Returns true if the target machine can represent \p V as a masked gather
- /// or scatter operation.
- bool isLegalGatherOrScatter(Value *V) {
- bool LI = isa<LoadInst>(V);
- bool SI = isa<StoreInst>(V);
- if (!LI && !SI)
- return false;
- auto *Ty = getMemInstValueType(V);
- Align Align = getLoadStoreAlignment(V);
- return (LI && isLegalMaskedGather(Ty, Align)) ||
- (SI && isLegalMaskedScatter(Ty, Align));
- }
-
- /// Returns true if \p I is an instruction that will be scalarized with
- /// predication. Such instructions include conditional stores and
- /// instructions that may divide by zero.
- /// If a non-zero VF has been calculated, we check if I will be scalarized
- /// with predication for that VF.
+ return;
+ setCostBasedWideningDecision(VF);
+ collectLoopUniforms(VF);
+ collectLoopScalars(VF);
+ }
+
+ /// Returns true if the target machine supports masked store operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedStore(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked load operation
+ /// for the given \p DataType and kind of access to \p Ptr.
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
+ return Legal->isConsecutivePtr(Ptr) &&
+ TTI.isLegalMaskedLoad(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked scatter operation
+ /// for the given \p DataType.
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
+ return TTI.isLegalMaskedScatter(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine supports masked gather operation
+ /// for the given \p DataType.
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) {
+ return TTI.isLegalMaskedGather(DataType, Alignment);
+ }
+
+ /// Returns true if the target machine can represent \p V as a masked gather
+ /// or scatter operation.
+ bool isLegalGatherOrScatter(Value *V) {
+ bool LI = isa<LoadInst>(V);
+ bool SI = isa<StoreInst>(V);
+ if (!LI && !SI)
+ return false;
+ auto *Ty = getMemInstValueType(V);
+ Align Align = getLoadStoreAlignment(V);
+ return (LI && isLegalMaskedGather(Ty, Align)) ||
+ (SI && isLegalMaskedScatter(Ty, Align));
+ }
+
+ /// Returns true if \p I is an instruction that will be scalarized with
+ /// predication. Such instructions include conditional stores and
+ /// instructions that may divide by zero.
+ /// If a non-zero VF has been calculated, we check if I will be scalarized
+ /// with predication for that VF.
bool isScalarWithPredication(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- // Returns true if \p I is an instruction that will be predicated either
- // through scalar predication or masked load/store or masked gather/scatter.
- // Superset of instructions that return true for isScalarWithPredication.
- bool isPredicatedInst(Instruction *I) {
- if (!blockNeedsPredication(I->getParent()))
- return false;
- // Loads and stores that need some form of masked operation are predicated
- // instructions.
- if (isa<LoadInst>(I) || isa<StoreInst>(I))
- return Legal->isMaskRequired(I);
- return isScalarWithPredication(I);
- }
-
- /// Returns true if \p I is a memory instruction with consecutive memory
- /// access that can be widened.
+
+ // Returns true if \p I is an instruction that will be predicated either
+ // through scalar predication or masked load/store or masked gather/scatter.
+ // Superset of instructions that return true for isScalarWithPredication.
+ bool isPredicatedInst(Instruction *I) {
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ // Loads and stores that need some form of masked operation are predicated
+ // instructions.
+ if (isa<LoadInst>(I) || isa<StoreInst>(I))
+ return Legal->isMaskRequired(I);
+ return isScalarWithPredication(I);
+ }
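+
+ // For example: a store in a block that needs predication and for which
+ // Legal->isMaskRequired() holds is predicated, as is a udiv that
+ // isScalarWithPredication() reports because it might divide by zero.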
+
+ /// Returns true if \p I is a memory instruction with consecutive memory
+ /// access that can be widened.
bool
memoryInstructionCanBeWidened(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- /// Returns true if \p I is a memory instruction in an interleaved-group
- /// of memory accesses that can be vectorized with wide vector loads/stores
- /// and shuffles.
+
+ /// Returns true if \p I is a memory instruction in an interleaved-group
+ /// of memory accesses that can be vectorized with wide vector loads/stores
+ /// and shuffles.
bool
interleavedAccessCanBeWidened(Instruction *I,
ElementCount VF = ElementCount::getFixed(1));
-
- /// Check if \p Instr belongs to any interleaved access group.
- bool isAccessInterleaved(Instruction *Instr) {
- return InterleaveInfo.isInterleaved(Instr);
- }
-
- /// Get the interleaved access group that \p Instr belongs to.
- const InterleaveGroup<Instruction> *
- getInterleavedAccessGroup(Instruction *Instr) {
- return InterleaveInfo.getInterleaveGroup(Instr);
- }
-
+
+ /// Check if \p Instr belongs to any interleaved access group.
+ bool isAccessInterleaved(Instruction *Instr) {
+ return InterleaveInfo.isInterleaved(Instr);
+ }
+
+ /// Get the interleaved access group that \p Instr belongs to.
+ const InterleaveGroup<Instruction> *
+ getInterleavedAccessGroup(Instruction *Instr) {
+ return InterleaveInfo.getInterleaveGroup(Instr);
+ }
+
/// Returns true if we're required to use a scalar epilogue for at least
/// the final iteration of the original loop.
- bool requiresScalarEpilogue() const {
+ bool requiresScalarEpilogue() const {
if (!isScalarEpilogueAllowed())
return false;
// If we might exit from anywhere but the latch, must run the exiting
@@ -1563,21 +1563,21 @@ public:
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch())
return true;
return InterleaveInfo.requiresScalarEpilogue();
- }
-
- /// Returns true if a scalar epilogue is not allowed due to optsize or a
- /// loop hint annotation.
- bool isScalarEpilogueAllowed() const {
- return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
- }
-
- /// Returns true if all loop blocks should be masked to fold tail loop.
- bool foldTailByMasking() const { return FoldTailByMasking; }
-
- bool blockNeedsPredication(BasicBlock *BB) {
- return foldTailByMasking() || Legal->blockNeedsPredication(BB);
- }
-
+ }
+
+ /// Returns true if a scalar epilogue is not allowed due to optsize or a
+ /// loop hint annotation.
+ bool isScalarEpilogueAllowed() const {
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+ }
+
+ /// Returns true if all loop blocks should be masked to fold tail loop.
+ bool foldTailByMasking() const { return FoldTailByMasking; }
+
+ bool blockNeedsPredication(BasicBlock *BB) {
+ return foldTailByMasking() || Legal->blockNeedsPredication(BB);
+ }
+
/// A SmallMapVector to store the InLoop reduction op chains, mapping phi
/// nodes to the chain of instructions representing the reductions. Uses a
/// MapVector to ensure deterministic iteration order.
@@ -1594,143 +1594,143 @@ public:
return InLoopReductionChains.count(Phi);
}
- /// Estimate cost of an intrinsic call instruction CI if it were vectorized
- /// with factor VF. Return the cost of the instruction, including
- /// scalarization overhead if it's needed.
+ /// Estimate cost of an intrinsic call instruction CI if it were vectorized
+ /// with factor VF. Return the cost of the instruction, including
+ /// scalarization overhead if it's needed.
InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF);
-
- /// Estimate cost of a call instruction CI if it were vectorized with factor
- /// VF. Return the cost of the instruction, including scalarization overhead
- /// if it's needed. The flag NeedToScalarize shows if the call needs to be
- /// scalarized, i.e. either a vector version isn't available or it is too
- /// expensive.
+
+ /// Estimate cost of a call instruction CI if it were vectorized with factor
+ /// VF. Return the cost of the instruction, including scalarization overhead
+ /// if it's needed. The flag NeedToScalarize shows if the call needs to be
+ /// scalarized, i.e. either a vector version isn't available or it is too
+ /// expensive.
InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize);
-
- /// Invalidates decisions already taken by the cost model.
- void invalidateCostModelingDecisions() {
- WideningDecisions.clear();
- Uniforms.clear();
- Scalars.clear();
- }
-
-private:
- unsigned NumPredStores = 0;
-
- /// \return An upper bound for the vectorization factor, a power-of-2 larger
- /// than zero. One is returned if vectorization should best be avoided due
- /// to cost.
+
+ /// Invalidates decisions already taken by the cost model.
+ void invalidateCostModelingDecisions() {
+ WideningDecisions.clear();
+ Uniforms.clear();
+ Scalars.clear();
+ }
+
+private:
+ unsigned NumPredStores = 0;
+
+ /// \return An upper bound for the vectorization factor, a power-of-2 larger
+ /// than zero. One is returned if vectorization should best be avoided due
+ /// to cost.
ElementCount computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF);
-
- /// The vectorization cost is a combination of the cost itself and a boolean
- /// indicating whether any of the contributing operations will actually
- /// operate on vector values after type legalization in the backend. If this
- /// latter value is false, then all operations will be scalarized (i.e. no
- /// vectorization has actually taken place).
+
+ /// The vectorization cost is a combination of the cost itself and a boolean
+ /// indicating whether any of the contributing operations will actually
+ /// operate on vector values after type legalization in the backend. If this
+ /// latter value is false, then all operations will be scalarized (i.e. no
+ /// vectorization has actually taken place).
using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width.
+
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
VectorizationCostTy expectedCost(ElementCount VF);
-
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost-computation logic from getInstructionCost which provides
- /// the vector type as an output parameter.
+
+ /// The cost-computation logic from getInstructionCost which provides
+ /// the vector type as an output parameter.
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy);
-
+
/// Return the cost of instructions in an inloop reduction pattern, if I is
/// part of that pattern.
InstructionCost getReductionPatternCost(Instruction *I, ElementCount VF,
Type *VectorTy,
TTI::TargetCostKind CostKind);
- /// Calculate vectorization cost of memory instruction \p I.
+ /// Calculate vectorization cost of memory instruction \p I.
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for scalarized memory instruction.
+
+ /// The cost computation for scalarized memory instruction.
InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for interleaving group of memory instructions.
+
+ /// The cost computation for interleaving group of memory instructions.
InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for Gather/Scatter instruction.
+
+ /// The cost computation for Gather/Scatter instruction.
InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);
-
- /// The cost computation for widening instruction \p I with consecutive
- /// memory access.
+
+ /// The cost computation for widening instruction \p I with consecutive
+ /// memory access.
InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);
-
- /// The cost calculation for Load/Store instruction \p I with uniform pointer -
- /// Load: scalar load + broadcast.
- /// Store: scalar store + (loop invariant value stored? 0 : extract of last
- /// element)
+
+ /// The cost calculation for Load/Store instruction \p I with uniform pointer -
+ /// Load: scalar load + broadcast.
+ /// Store: scalar store + (loop invariant value stored? 0 : extract of last
+ /// element)
InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);
-
- /// Estimate the overhead of scalarizing an instruction. This is a
- /// convenience wrapper for the type-based getScalarizationOverhead API.
+
+ /// Estimate the overhead of scalarizing an instruction. This is a
+ /// convenience wrapper for the type-based getScalarizationOverhead API.
InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF);
-
- /// Returns whether the instruction is a load or store and will be emitted
- /// as a vector operation.
- bool isConsecutiveLoadOrStore(Instruction *I);
-
- /// Returns true if an artificially high cost for emulated masked memrefs
- /// should be used.
- bool useEmulatedMaskMemRefHack(Instruction *I);
-
- /// Map of scalar integer values to the smallest bitwidth they can be legally
- /// represented as. The vector equivalents of these values should be truncated
- /// to this type.
- MapVector<Instruction *, uint64_t> MinBWs;
-
- /// A type representing the costs for instructions if they were to be
- /// scalarized rather than vectorized. The entries are Instruction-Cost
- /// pairs.
+
+ /// Returns whether the instruction is a load or store and will be emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
+ /// Returns true if an artificially high cost for emulated masked memrefs
+ /// should be used.
+ bool useEmulatedMaskMemRefHack(Instruction *I);
+
+ /// Map of scalar integer values to the smallest bitwidth they can be legally
+ /// represented as. The vector equivalents of these values should be truncated
+ /// to this type.
+ MapVector<Instruction *, uint64_t> MinBWs;
+
+ /// A type representing the costs for instructions if they were to be
+ /// scalarized rather than vectorized. The entries are Instruction-Cost
+ /// pairs.
using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;
-
- /// A set containing all BasicBlocks that are known to be present after
- /// vectorization as a predicated block.
- SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
-
- /// Records whether it is allowed to have the original scalar loop execute at
- /// least once. This may be needed as a fallback loop in case runtime
- /// aliasing/dependence checks fail, or to handle the tail/remainder
- /// iterations when the trip count is unknown or doesn't divide by the VF,
- /// or as a peel-loop to handle gaps in interleave-groups.
- /// Under optsize and when the trip count is very small we don't allow any
- /// iterations to execute in the scalar loop.
- ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
-
- /// All blocks of loop are to be masked to fold tail of scalar iterations.
- bool FoldTailByMasking = false;
-
- /// A map holding scalar costs for different vectorization factors. The
- /// presence of a cost for an instruction in the mapping indicates that the
- /// instruction will be scalarized when vectorizing with the associated
- /// vectorization factor. The entries are VF-ScalarCostTy pairs.
+
+ /// A set containing all BasicBlocks that are known to be present after
+ /// vectorization as a predicated block.
+ SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
+ /// Records whether it is allowed to have the original scalar loop execute at
+ /// least once. This may be needed as a fallback loop in case runtime
+ /// aliasing/dependence checks fail, or to handle the tail/remainder
+ /// iterations when the trip count is unknown or doesn't divide by the VF,
+ /// or as a peel-loop to handle gaps in interleave-groups.
+ /// Under optsize and when the trip count is very small we don't allow any
+ /// iterations to execute in the scalar loop.
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+
+ /// All blocks of loop are to be masked to fold tail of scalar iterations.
+ bool FoldTailByMasking = false;
+
+ /// A map holding scalar costs for different vectorization factors. The
+ /// presence of a cost for an instruction in the mapping indicates that the
+ /// instruction will be scalarized when vectorizing with the associated
+ /// vectorization factor. The entries are VF-ScalarCostTy pairs.
DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;
-
- /// Holds the instructions known to be uniform after vectorization.
- /// The data is collected per VF.
+
+ /// Holds the instructions known to be uniform after vectorization.
+ /// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;
-
- /// Holds the instructions known to be scalar after vectorization.
- /// The data is collected per VF.
+
+ /// Holds the instructions known to be scalar after vectorization.
+ /// The data is collected per VF.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;
-
- /// Holds the instructions (address computations) that are forced to be
- /// scalarized.
+
+ /// Holds the instructions (address computations) that are forced to be
+ /// scalarized.
DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;
-
+
/// PHINodes of the reductions that should be expanded in-loop along with
/// their associated chains of reduction operations, in program order from top
/// (PHI) to bottom
@@ -1742,64 +1742,64 @@ private:
/// without having to loop through InLoopReductionChains.
DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;
- /// Returns the expected difference in cost from scalarizing the expression
- /// feeding a predicated instruction \p PredInst. The instructions to
- /// scalarize and their scalar costs are collected in \p ScalarCosts. A
- /// non-negative return value implies the expression will be scalarized.
- /// Currently, only single-use chains are considered for scalarization.
- int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
+ /// Returns the expected difference in cost from scalarizing the expression
+ /// feeding a predicated instruction \p PredInst. The instructions to
+ /// scalarize and their scalar costs are collected in \p ScalarCosts. A
+ /// non-negative return value implies the expression will be scalarized.
+ /// Currently, only single-use chains are considered for scalarization.
+ int computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts,
ElementCount VF);
-
- /// Collect the instructions that are uniform after vectorization. An
- /// instruction is uniform if we represent it with a single scalar value in
- /// the vectorized loop corresponding to each vector iteration. Examples of
- /// uniform instructions include pointer operands of consecutive or
- /// interleaved memory accesses. Note that although uniformity implies an
- /// instruction will be scalar, the reverse is not true. In general, a
- /// scalarized instruction will be represented by VF scalar values in the
- /// vectorized loop, each corresponding to an iteration of the original
- /// scalar loop.
+
+ /// Collect the instructions that are uniform after vectorization. An
+ /// instruction is uniform if we represent it with a single scalar value in
+ /// the vectorized loop corresponding to each vector iteration. Examples of
+ /// uniform instructions include pointer operands of consecutive or
+ /// interleaved memory accesses. Note that although uniformity implies an
+ /// instruction will be scalar, the reverse is not true. In general, a
+ /// scalarized instruction will be represented by VF scalar values in the
+ /// vectorized loop, each corresponding to an iteration of the original
+ /// scalar loop.
void collectLoopUniforms(ElementCount VF);
-
- /// Collect the instructions that are scalar after vectorization. An
- /// instruction is scalar if it is known to be uniform or will be scalarized
- /// during vectorization. Non-uniform scalarized instructions will be
- /// represented by VF values in the vectorized loop, each corresponding to an
- /// iteration of the original scalar loop.
+
+ /// Collect the instructions that are scalar after vectorization. An
+ /// instruction is scalar if it is known to be uniform or will be scalarized
+ /// during vectorization. Non-uniform scalarized instructions will be
+ /// represented by VF values in the vectorized loop, each corresponding to an
+ /// iteration of the original scalar loop.
void collectLoopScalars(ElementCount VF);
-
- /// Keeps cost model vectorization decision and cost for instructions.
- /// Right now it is used for memory instructions only.
+
+ /// Keeps cost model vectorization decision and cost for instructions.
+ /// Right now it is used for memory instructions only.
using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
std::pair<InstWidening, InstructionCost>>;
-
- DecisionList WideningDecisions;
-
- /// Returns true if \p V is expected to be vectorized and it needs to be
- /// extracted.
+
+ DecisionList WideningDecisions;
+
+ /// Returns true if \p V is expected to be vectorized and it needs to be
+ /// extracted.
bool needsExtract(Value *V, ElementCount VF) const {
- Instruction *I = dyn_cast<Instruction>(V);
+ Instruction *I = dyn_cast<Instruction>(V);
if (VF.isScalar() || !I || !TheLoop->contains(I) ||
TheLoop->isLoopInvariant(I))
- return false;
-
- // Assume we can vectorize V (and hence we need extraction) if the
- // scalars are not computed yet. This can happen, because it is called
- // via getScalarizationOverhead from setCostBasedWideningDecision, before
- // the scalars are collected. That should be a safe assumption in most
- // cases, because we check if the operands have vectorizable types
- // beforehand in LoopVectorizationLegality.
- return Scalars.find(VF) == Scalars.end() ||
- !isScalarAfterVectorization(I, VF);
- };
-
- /// Returns a range containing only operands needing to be extracted.
- SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
+ return false;
+
+ // Assume we can vectorize V (and hence we need extraction) if the
+ // scalars are not computed yet. This can happen, because it is called
+ // via getScalarizationOverhead from setCostBasedWideningDecision, before
+ // the scalars are collected. That should be a safe assumption in most
+ // cases, because we check if the operands have vectorizable types
+ // beforehand in LoopVectorizationLegality.
+ return Scalars.find(VF) == Scalars.end() ||
+ !isScalarAfterVectorization(I, VF);
+ };
+
+ /// Returns a range containing only operands needing to be extracted.
+ SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
ElementCount VF) {
- return SmallVector<Value *, 4>(make_filter_range(
- Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
- }
-
+ return SmallVector<Value *, 4>(make_filter_range(
+ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); }));
+ }
+
/// Determines if we have the infrastructure to vectorize loop \p L and its
/// epilogue, assuming the main loop is vectorized by \p VF.
bool isCandidateForEpilogueVectorization(const Loop &L,
@@ -1810,539 +1810,539 @@ private:
/// \p VF is the vectorization factor chosen for the original loop.
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
-public:
- /// The loop that we evaluate.
- Loop *TheLoop;
-
- /// Predicated scalar evolution analysis.
- PredicatedScalarEvolution &PSE;
-
- /// Loop Info analysis.
- LoopInfo *LI;
-
- /// Vectorization legality.
- LoopVectorizationLegality *Legal;
-
- /// Vector target information.
- const TargetTransformInfo &TTI;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// Demanded bits analysis.
- DemandedBits *DB;
-
- /// Assumption cache.
- AssumptionCache *AC;
-
- /// Interface to emit optimization remarks.
- OptimizationRemarkEmitter *ORE;
-
- const Function *TheFunction;
-
- /// Loop Vectorize Hint.
- const LoopVectorizeHints *Hints;
-
- /// The interleave access information contains groups of interleaved accesses
- /// with the same stride and close to each other.
- InterleavedAccessInfo &InterleaveInfo;
-
- /// Values to ignore in the cost model.
- SmallPtrSet<const Value *, 16> ValuesToIgnore;
-
- /// Values to ignore in the cost model when VF > 1.
- SmallPtrSet<const Value *, 16> VecValuesToIgnore;
+public:
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+
+ /// Predicated scalar evolution analysis.
+ PredicatedScalarEvolution &PSE;
+
+ /// Loop Info analysis.
+ LoopInfo *LI;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// Demanded bits analysis.
+ DemandedBits *DB;
+
+ /// Assumption cache.
+ AssumptionCache *AC;
+
+ /// Interface to emit optimization remarks.
+ OptimizationRemarkEmitter *ORE;
+
+ const Function *TheFunction;
+
+ /// Loop Vectorize Hint.
+ const LoopVectorizeHints *Hints;
+
+ /// The interleave access information contains groups of interleaved accesses
+ /// with the same stride and close to each other.
+ InterleavedAccessInfo &InterleaveInfo;
+
+ /// Values to ignore in the cost model.
+ SmallPtrSet<const Value *, 16> ValuesToIgnore;
+
+ /// Values to ignore in the cost model when VF > 1.
+ SmallPtrSet<const Value *, 16> VecValuesToIgnore;
/// Profitable vector factors.
SmallVector<VectorizationFactor, 8> ProfitableVFs;
-};
-
-} // end namespace llvm
-
-// Return true if \p OuterLp is an outer loop annotated with hints for explicit
-// vectorization. The loop needs to be annotated with #pragma omp simd
-// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
-// vector length information is not provided, vectorization is not considered
-// explicit. Interleave hints are not allowed either. These limitations will be
-// relaxed in the future.
-// Please, note that we are currently forced to abuse the pragma 'clang
-// vectorize' semantics. This pragma provides *auto-vectorization hints*
-// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
-// provides *explicit vectorization hints* (LV can bypass legal checks and
-// assume that vectorization is legal). However, both hints are implemented
-// using the same metadata (llvm.loop.vectorize, processed by
-// LoopVectorizeHints). This will be fixed in the future when the native IR
-// representation for pragma 'omp simd' is introduced.
-static bool isExplicitVecOuterLoop(Loop *OuterLp,
- OptimizationRemarkEmitter *ORE) {
+};
+
+} // end namespace llvm
+
+// Return true if \p OuterLp is an outer loop annotated with hints for explicit
+// vectorization. The loop needs to be annotated with #pragma omp simd
+// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
+// vector length information is not provided, vectorization is not considered
+// explicit. Interleave hints are not allowed either. These limitations will be
+// relaxed in the future.
+// Please, note that we are currently forced to abuse the pragma 'clang
+// vectorize' semantics. This pragma provides *auto-vectorization hints*
+// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
+// provides *explicit vectorization hints* (LV can bypass legal checks and
+// assume that vectorization is legal). However, both hints are implemented
+// using the same metadata (llvm.loop.vectorize, processed by
+// LoopVectorizeHints). This will be fixed in the future when the native IR
+// representation for pragma 'omp simd' is introduced.
+static bool isExplicitVecOuterLoop(Loop *OuterLp,
+ OptimizationRemarkEmitter *ORE) {
assert(!OuterLp->isInnermost() && "This is not an outer loop");
- LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
-
- // Only outer loops with an explicit vectorization hint are supported.
- // Unannotated outer loops are ignored.
- if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
- return false;
-
- Function *Fn = OuterLp->getHeader()->getParent();
- if (!Hints.allowVectorization(Fn, OuterLp,
- true /*VectorizeOnlyWhenForced*/)) {
- LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
- return false;
- }
-
- if (Hints.getInterleave() > 1) {
- // TODO: Interleave support is future work.
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
- "outer loops.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
- return true;
-}
-
-static void collectSupportedLoops(Loop &L, LoopInfo *LI,
- OptimizationRemarkEmitter *ORE,
- SmallVectorImpl<Loop *> &V) {
- // Collect inner loops and outer loops without irreducible control flow. For
- // now, only collect outer loops that have explicit vectorization hints. If we
- // are stress testing the VPlan H-CFG construction, we collect the outermost
- // loop of every loop nest.
+ LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
+
+ // Only outer loops with an explicit vectorization hint are supported.
+ // Unannotated outer loops are ignored.
+ if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
+ return false;
+
+ Function *Fn = OuterLp->getHeader()->getParent();
+ if (!Hints.allowVectorization(Fn, OuterLp,
+ true /*VectorizeOnlyWhenForced*/)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
+ return false;
+ }
+
+ if (Hints.getInterleave() > 1) {
+ // TODO: Interleave support is future work.
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
+ "outer loops.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ return true;
+}
+
+static void collectSupportedLoops(Loop &L, LoopInfo *LI,
+ OptimizationRemarkEmitter *ORE,
+ SmallVectorImpl<Loop *> &V) {
+ // Collect inner loops and outer loops without irreducible control flow. For
+ // now, only collect outer loops that have explicit vectorization hints. If we
+ // are stress testing the VPlan H-CFG construction, we collect the outermost
+ // loop of every loop nest.
if (L.isInnermost() || VPlanBuildStressTest ||
- (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
- LoopBlocksRPO RPOT(&L);
- RPOT.perform(LI);
- if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
- V.push_back(&L);
- // TODO: Collect inner loops inside marked outer loops in case
- // vectorization fails for the outer loop. Do not invoke
- // 'containsIrreducibleCFG' again for inner loops when the outer loop is
- // already known to be reducible. We can use an inherited attribute for
- // that.
- return;
- }
- }
- for (Loop *InnerL : L)
- collectSupportedLoops(*InnerL, LI, ORE, V);
-}
-
-namespace {
-
-/// The LoopVectorize Pass.
-struct LoopVectorize : public FunctionPass {
- /// Pass identification, replacement for typeid
- static char ID;
-
- LoopVectorizePass Impl;
-
- explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
- bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID),
- Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
- initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
- auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
-
- return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI).MadeAnyChange;
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<BlockFrequencyInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<LoopAccessLegacyAnalysis>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- }
-
- AU.addPreserved<BasicAAWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
- }
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel and LoopVectorizationPlanner.
-//===----------------------------------------------------------------------===//
-
-Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // We need to place the broadcast of invariant variables outside the loop,
- // but only if it's proven safe to do so. Else, broadcast will be inside
- // vector loop body.
- Instruction *Instr = dyn_cast<Instruction>(V);
- bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
- (!Instr ||
- DT->dominates(Instr->getParent(), LoopVectorPreHeader));
- // Place the code for broadcasting invariant variables in the new preheader.
- IRBuilder<>::InsertPointGuard Guard(Builder);
- if (SafeToHoist)
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
-
- // Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
-
- return Shuf;
-}
-
-void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+ (EnableVPlanNativePath && isExplicitVecOuterLoop(&L, ORE))) {
+ LoopBlocksRPO RPOT(&L);
+ RPOT.perform(LI);
+ if (!containsIrreducibleCFG<const BasicBlock *>(RPOT, *LI)) {
+ V.push_back(&L);
+ // TODO: Collect inner loops inside marked outer loops in case
+ // vectorization fails for the outer loop. Do not invoke
+ // 'containsIrreducibleCFG' again for inner loops when the outer loop is
+ // already known to be reducible. We can use an inherited attribute for
+ // that.
+ return;
+ }
+ }
+ for (Loop *InnerL : L)
+ collectSupportedLoops(*InnerL, LI, ORE, V);
+}
+
+namespace {
+
+/// The LoopVectorize Pass.
+struct LoopVectorize : public FunctionPass {
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ LoopVectorizePass Impl;
+
+ explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
+ bool VectorizeOnlyWhenForced = false)
+ : FunctionPass(ID),
+ Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
+ initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *LAA = &getAnalysis<LoopAccessLegacyAnalysis>();
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
+
+ return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
+ GetLAA, *ORE, PSI).MadeAnyChange;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LoopAccessLegacyAnalysis>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ }
+};
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
+//===----------------------------------------------------------------------===//
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
+ // We need to place the broadcast of invariant variables outside the loop,
+ // but only if it's proven safe to do so. Else, broadcast will be inside
+ // vector loop body.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool SafeToHoist = OrigLoop->isLoopInvariant(V) &&
+ (!Instr ||
+ DT->dominates(Instr->getParent(), LoopVectorPreHeader));
+ // Place the code for broadcasting invariant variables in the new preheader.
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+ if (SafeToHoist)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
+ // Broadcast the scalar into all locations in the vector.
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
+
+ return Shuf;
+}
+
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
const InductionDescriptor &II, Value *Step, Value *Start,
Instruction *EntryVal) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // Construct the initial value of the vector IV in the vector loop preheader
- auto CurrIP = Builder.saveIP();
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- if (isa<TruncInst>(EntryVal)) {
- assert(Start->getType()->isIntegerTy() &&
- "Truncation requires an integer type");
- auto *TruncType = cast<IntegerType>(EntryVal->getType());
- Step = Builder.CreateTrunc(Step, TruncType);
- Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
- }
- Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
- Value *SteppedStart =
- getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
-
- // We create vector phi nodes for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (Step->getType()->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = II.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Multiply the vectorization factor by the step using integer or
- // floating-point arithmetic as appropriate.
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // Construct the initial value of the vector IV in the vector loop preheader
+ auto CurrIP = Builder.saveIP();
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ if (isa<TruncInst>(EntryVal)) {
+ assert(Start->getType()->isIntegerTy() &&
+ "Truncation requires an integer type");
+ auto *TruncType = cast<IntegerType>(EntryVal->getType());
+ Step = Builder.CreateTrunc(Step, TruncType);
+ Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
+ }
+ Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
+ Value *SteppedStart =
+ getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+ // We create vector phi nodes for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (Step->getType()->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = II.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Multiply the vectorization factor by the step using integer or
+ // floating-point arithmetic as appropriate.
Value *ConstVF =
getSignedIntOrFpConstant(Step->getType(), VF.getKnownMinValue());
- Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
-
- // Create a vector splat to use in the induction update.
- //
- // FIXME: If the step is non-constant, we create the vector splat with
- // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
- // handle a constant vector splat.
+ Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+ // Create a vector splat to use in the induction update.
+ //
+ // FIXME: If the step is non-constant, we create the vector splat with
+ // IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+ // handle a constant vector splat.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Value *SplatVF = isa<Constant>(Mul)
? ConstantVector::getSplat(VF, cast<Constant>(Mul))
: Builder.CreateVectorSplat(VF, Mul);
- Builder.restoreIP(CurrIP);
-
- // We may need to add the step a number of times, depending on the unroll
- // factor. The last of those goes into the PHI.
- PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
- &*LoopVectorBody->getFirstInsertionPt());
- VecInd->setDebugLoc(EntryVal->getDebugLoc());
- Instruction *LastInduction = VecInd;
- for (unsigned Part = 0; Part < UF; ++Part) {
- VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
-
- if (isa<TruncInst>(EntryVal))
- addMetadata(LastInduction, EntryVal);
- recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
-
- LastInduction = cast<Instruction>(addFastMathFlag(
- Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
- LastInduction->setDebugLoc(EntryVal->getDebugLoc());
- }
-
- // Move the last step to the end of the latch block. This ensures consistent
- // placement of all induction updates.
- auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
- auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
- auto *ICmp = cast<Instruction>(Br->getCondition());
- LastInduction->moveBefore(ICmp);
- LastInduction->setName("vec.ind.next");
-
- VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
- VecInd->addIncoming(LastInduction, LoopVectorLatch);
-}
-
-bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
- return Cost->isScalarAfterVectorization(I, VF) ||
- Cost->isProfitableToScalarize(I, VF);
-}
-
-bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
- if (shouldScalarizeInstruction(IV))
- return true;
- auto isScalarInst = [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
- };
- return llvm::any_of(IV->users(), isScalarInst);
-}
-
-void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
- const InductionDescriptor &ID, const Instruction *EntryVal,
- Value *VectorLoopVal, unsigned Part, unsigned Lane) {
- assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
- "Expected either an induction phi-node or a truncate of it!");
-
- // This induction variable is not the phi from the original loop but the
- // newly-created IV based on the proof that casted Phi is equal to the
- // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
- // re-uses the same InductionDescriptor that original IV uses but we don't
- // have to do any recording in this case - that is done when original IV is
- // processed.
- if (isa<TruncInst>(EntryVal))
- return;
-
- const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
- if (Casts.empty())
- return;
- // Only the first Cast instruction in the Casts vector is of interest.
- // The rest of the Casts (if exist) have no uses outside the
- // induction update chain itself.
- Instruction *CastInst = *Casts.begin();
- if (Lane < UINT_MAX)
- VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
- else
- VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
-}
-
+ Builder.restoreIP(CurrIP);
+
+ // We may need to add the step a number of times, depending on the unroll
+ // factor. The last of those goes into the PHI.
+ PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
+ &*LoopVectorBody->getFirstInsertionPt());
+ VecInd->setDebugLoc(EntryVal->getDebugLoc());
+ Instruction *LastInduction = VecInd;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, LastInduction);
+
+ if (isa<TruncInst>(EntryVal))
+ addMetadata(LastInduction, EntryVal);
+ recordVectorLoopValueForInductionCast(II, EntryVal, LastInduction, Part);
+
+ LastInduction = cast<Instruction>(addFastMathFlag(
+ Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
+ LastInduction->setDebugLoc(EntryVal->getDebugLoc());
+ }
+
+ // Move the last step to the end of the latch block. This ensures consistent
+ // placement of all induction updates.
+ auto *LoopVectorLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch();
+ auto *Br = cast<BranchInst>(LoopVectorLatch->getTerminator());
+ auto *ICmp = cast<Instruction>(Br->getCondition());
+ LastInduction->moveBefore(ICmp);
+ LastInduction->setName("vec.ind.next");
+
+ VecInd->addIncoming(SteppedStart, LoopVectorPreHeader);
+ VecInd->addIncoming(LastInduction, LoopVectorLatch);
+}
+
+bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
+ return Cost->isScalarAfterVectorization(I, VF) ||
+ Cost->isProfitableToScalarize(I, VF);
+}
+
+bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
+ if (shouldScalarizeInstruction(IV))
+ return true;
+ auto isScalarInst = [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return (OrigLoop->contains(I) && shouldScalarizeInstruction(I));
+ };
+ return llvm::any_of(IV->users(), isScalarInst);
+}
+
+void InnerLoopVectorizer::recordVectorLoopValueForInductionCast(
+ const InductionDescriptor &ID, const Instruction *EntryVal,
+ Value *VectorLoopVal, unsigned Part, unsigned Lane) {
+ assert((isa<PHINode>(EntryVal) || isa<TruncInst>(EntryVal)) &&
+ "Expected either an induction phi-node or a truncate of it!");
+
+ // This induction variable is not the phi from the original loop but the
+ // newly-created IV based on the proof that casted Phi is equal to the
+ // uncasted Phi in the vectorized loop (under a runtime guard possibly). It
+ // re-uses the same InductionDescriptor that original IV uses but we don't
+ // have to do any recording in this case - that is done when original IV is
+ // processed.
+ if (isa<TruncInst>(EntryVal))
+ return;
+
+ const SmallVectorImpl<Instruction *> &Casts = ID.getCastInsts();
+ if (Casts.empty())
+ return;
+ // Only the first Cast instruction in the Casts vector is of interest.
+ // The rest of the Casts (if exist) have no uses outside the
+ // induction update chain itself.
+ Instruction *CastInst = *Casts.begin();
+ if (Lane < UINT_MAX)
+ VectorLoopValueMap.setScalarValue(CastInst, {Part, Lane}, VectorLoopVal);
+ else
+ VectorLoopValueMap.setVectorValue(CastInst, Part, VectorLoopVal);
+}
+
void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, Value *Start,
TruncInst *Trunc) {
- assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
- "Primary induction variable must have an integer type");
-
- auto II = Legal->getInductionVars().find(IV);
- assert(II != Legal->getInductionVars().end() && "IV is not an induction");
-
- auto ID = II->second;
- assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
-
- // The value from the original loop to which we are mapping the new induction
- // variable.
- Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
-
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- // Generate code for the induction step. Note that induction steps are
- // required to be loop-invariant
- auto CreateStepValue = [&](const SCEV *Step) -> Value * {
- assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
- "Induction step should be loop invariant");
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- return Exp.expandCodeFor(Step, Step->getType(),
- LoopVectorPreHeader->getTerminator());
- }
- return cast<SCEVUnknown>(Step)->getValue();
- };
-
- // The scalar value to broadcast. This is derived from the canonical
- // induction variable. If a truncation type is given, truncate the canonical
- // induction variable and step. Otherwise, derive these values from the
- // induction descriptor.
- auto CreateScalarIV = [&](Value *&Step) -> Value * {
- Value *ScalarIV = Induction;
- if (IV != OldInduction) {
- ScalarIV = IV->getType()->isIntegerTy()
- ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
- : Builder.CreateCast(Instruction::SIToFP, Induction,
- IV->getType());
- ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
- ScalarIV->setName("offset.idx");
- }
- if (Trunc) {
- auto *TruncType = cast<IntegerType>(Trunc->getType());
- assert(Step->getType()->isIntegerTy() &&
- "Truncation requires an integer step");
- ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
- Step = Builder.CreateTrunc(Step, TruncType);
- }
- return ScalarIV;
- };
-
- // Create the vector values from the scalar IV, in the absence of creating a
- // vector IV.
- auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
- Value *Broadcasted = getBroadcastInstrs(ScalarIV);
- for (unsigned Part = 0; Part < UF; ++Part) {
+ assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+ "Primary induction variable must have an integer type");
+
+ auto II = Legal->getInductionVars().find(IV);
+ assert(II != Legal->getInductionVars().end() && "IV is not an induction");
+
+ auto ID = II->second;
+ assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
+
+ // The value from the original loop to which we are mapping the new induction
+ // variable.
+ Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
+
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // Generate code for the induction step. Note that induction steps are
+ // required to be loop-invariant
+ auto CreateStepValue = [&](const SCEV *Step) -> Value * {
+ assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
+ "Induction step should be loop invariant");
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(),
+ LoopVectorPreHeader->getTerminator());
+ }
+ return cast<SCEVUnknown>(Step)->getValue();
+ };
+
+ // The scalar value to broadcast. This is derived from the canonical
+ // induction variable. If a truncation type is given, truncate the canonical
+ // induction variable and step. Otherwise, derive these values from the
+ // induction descriptor.
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = Induction;
+ if (IV != OldInduction) {
+ ScalarIV = IV->getType()->isIntegerTy()
+ ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+ : Builder.CreateCast(Instruction::SIToFP, Induction,
+ IV->getType());
+ ScalarIV = emitTransformedIndex(Builder, ScalarIV, PSE.getSE(), DL, ID);
+ ScalarIV->setName("offset.idx");
+ }
+ if (Trunc) {
+ auto *TruncType = cast<IntegerType>(Trunc->getType());
+ assert(Step->getType()->isIntegerTy() &&
+ "Truncation requires an integer step");
+ ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+ Step = Builder.CreateTrunc(Step, TruncType);
+ }
+ return ScalarIV;
+ };
+
+ // Create the vector values from the scalar IV, in the absence of creating a
+ // vector IV.
+ auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
+ Value *Broadcasted = getBroadcastInstrs(ScalarIV);
+ for (unsigned Part = 0; Part < UF; ++Part) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *EntryPart =
+ Value *EntryPart =
getStepVector(Broadcasted, VF.getKnownMinValue() * Part, Step,
ID.getInductionOpcode());
- VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
- if (Trunc)
- addMetadata(EntryPart, Trunc);
- recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
- }
- };
-
- // Now do the actual transformations, and start with creating the step value.
- Value *Step = CreateStepValue(ID.getStep());
+ VectorLoopValueMap.setVectorValue(EntryVal, Part, EntryPart);
+ if (Trunc)
+ addMetadata(EntryPart, Trunc);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
+ }
+ };
+
+ // Now do the actual transformations, and start with creating the step value.
+ Value *Step = CreateStepValue(ID.getStep());
if (VF.isZero() || VF.isScalar()) {
- Value *ScalarIV = CreateScalarIV(Step);
- CreateSplatIV(ScalarIV, Step);
- return;
- }
-
- // Determine if we want a scalar version of the induction variable. This is
- // true if the induction variable itself is not widened, or if it has at
- // least one user in the loop that is not widened.
- auto NeedsScalarIV = needsScalarInduction(EntryVal);
- if (!NeedsScalarIV) {
+ Value *ScalarIV = CreateScalarIV(Step);
+ CreateSplatIV(ScalarIV, Step);
+ return;
+ }
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = needsScalarInduction(EntryVal);
+ if (!NeedsScalarIV) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
- return;
- }
-
- // Try to create a new independent vector induction variable. If we can't
- // create the phi node, we will splat the scalar induction variable in each
- // loop iteration.
- if (!shouldScalarizeInstruction(EntryVal)) {
+ return;
+ }
+
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (!shouldScalarizeInstruction(EntryVal)) {
createVectorIntOrFpInductionPHI(ID, Step, Start, EntryVal);
- Value *ScalarIV = CreateScalarIV(Step);
- // Create scalar steps that can be used by instructions we will later
- // scalarize. Note that the addition of the scalar steps will not increase
- // the number of instructions in the loop in the common case prior to
- // InstCombine. We will be trading one vector extract for each scalar step.
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
- return;
- }
-
- // All IV users are scalar instructions, so only emit a scalar IV, not a
- // vectorised IV. Except when we tail-fold, then the splat IV feeds the
- // predicate used by the masked loads/stores.
- Value *ScalarIV = CreateScalarIV(Step);
- if (!Cost->isScalarEpilogueAllowed())
- CreateSplatIV(ScalarIV, Step);
- buildScalarSteps(ScalarIV, Step, EntryVal, ID);
-}
-
-Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps BinOp) {
- // Create and check the types.
+ Value *ScalarIV = CreateScalarIV(Step);
+ // Create scalar steps that can be used by instructions we will later
+ // scalarize. Note that the addition of the scalar steps will not increase
+ // the number of instructions in the loop in the common case prior to
+ // InstCombine. We will be trading one vector extract for each scalar step.
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ return;
+ }
+
+ // All IV users are scalar instructions, so only emit a scalar IV, not a
+ // vectorised IV. Except when we tail-fold, then the splat IV feeds the
+ // predicate used by the masked loads/stores.
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (!Cost->isScalarEpilogueAllowed())
+ CreateSplatIV(ScalarIV, Step);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+}
+
+Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // Create and check the types.
auto *ValVTy = cast<FixedVectorType>(Val->getType());
- int VLen = ValVTy->getNumElements();
-
- Type *STy = Val->getType()->getScalarType();
- assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
- "Induction Step must be an integer or FP");
- assert(Step->getType() == STy && "Step has wrong type");
-
- SmallVector<Constant *, 8> Indices;
-
- if (STy->isIntegerTy()) {
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(STy, StartIdx + i));
-
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
- assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
- Step = Builder.CreateVectorSplat(VLen, Step);
- assert(Step->getType() == Val->getType() && "Invalid step vec");
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- Step = Builder.CreateMul(Cv, Step);
- return Builder.CreateAdd(Val, Step, "induction");
- }
-
- // Floating point induction.
- assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
- "Binary Opcode should be specified for FP induction");
- // Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
-
- // Add the consecutive indices to the vector value.
- Constant *Cv = ConstantVector::get(Indices);
-
- Step = Builder.CreateVectorSplat(VLen, Step);
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
- Value *MulOp = Builder.CreateFMul(Cv, Step);
- if (isa<Instruction>(MulOp))
- // Have to check, MulOp may be a constant
- cast<Instruction>(MulOp)->setFastMathFlags(Flags);
-
- Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
- return BOp;
-}
-
-void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
- Instruction *EntryVal,
- const InductionDescriptor &ID) {
- // We shouldn't have to build scalar steps if we aren't vectorizing.
+ int VLen = ValVTy->getNumElements();
+
+ Type *STy = Val->getType()->getScalarType();
+ assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
+ "Induction Step must be an integer or FP");
+ assert(Step->getType() == STy && "Step has wrong type");
+
+ SmallVector<Constant *, 8> Indices;
+
+ if (STy->isIntegerTy()) {
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(STy, StartIdx + i));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+ assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
+ Step = Builder.CreateVectorSplat(VLen, Step);
+ assert(Step->getType() == Val->getType() && "Invalid step vec");
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ Step = Builder.CreateMul(Cv, Step);
+ return Builder.CreateAdd(Val, Step, "induction");
+ }
+
+ // Floating point induction.
+ assert((BinOp == Instruction::FAdd || BinOp == Instruction::FSub) &&
+ "Binary Opcode should be specified for FP induction");
+ // Create a vector of consecutive numbers from zero to VF.
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantFP::get(STy, (double)(StartIdx + i)));
+
+ // Add the consecutive indices to the vector value.
+ Constant *Cv = ConstantVector::get(Indices);
+
+ Step = Builder.CreateVectorSplat(VLen, Step);
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulOp = Builder.CreateFMul(Cv, Step);
+ if (isa<Instruction>(MulOp))
+ // Have to check, MulOp may be a constant
+ cast<Instruction>(MulOp)->setFastMathFlags(Flags);
+
+ Value *BOp = Builder.CreateBinOp(BinOp, Val, MulOp, "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+ return BOp;
+}
+
+void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
+ Instruction *EntryVal,
+ const InductionDescriptor &ID) {
+ // We shouldn't have to build scalar steps if we aren't vectorizing.
assert(VF.isVector() && "VF should be greater than one");
- // Get the value type and ensure it and the step have the same integer type.
- Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
- assert(ScalarIVTy == Step->getType() &&
- "Val and Step should have the same type");
-
- // We build scalar steps for both integer and floating-point induction
- // variables. Here, we determine the kind of arithmetic we will perform.
- Instruction::BinaryOps AddOp;
- Instruction::BinaryOps MulOp;
- if (ScalarIVTy->isIntegerTy()) {
- AddOp = Instruction::Add;
- MulOp = Instruction::Mul;
- } else {
- AddOp = ID.getInductionOpcode();
- MulOp = Instruction::FMul;
- }
-
- // Determine the number of scalars we need to generate for each unroll
- // iteration. If EntryVal is uniform, we only need to generate the first
- // lane. Otherwise, we generate all VF values.
- unsigned Lanes =
+ // Get the value type and ensure it and the step have the same integer type.
+ Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
+ assert(ScalarIVTy == Step->getType() &&
+ "Val and Step should have the same type");
+
+ // We build scalar steps for both integer and floating-point induction
+ // variables. Here, we determine the kind of arithmetic we will perform.
+ Instruction::BinaryOps AddOp;
+ Instruction::BinaryOps MulOp;
+ if (ScalarIVTy->isIntegerTy()) {
+ AddOp = Instruction::Add;
+ MulOp = Instruction::Mul;
+ } else {
+ AddOp = ID.getInductionOpcode();
+ MulOp = Instruction::FMul;
+ }
+
+ // Determine the number of scalars we need to generate for each unroll
+ // iteration. If EntryVal is uniform, we only need to generate the first
+ // lane. Otherwise, we generate all VF values.
+ unsigned Lanes =
Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF)
? 1
: VF.getKnownMinValue();
assert((!VF.isScalable() || Lanes == 1) &&
"Should never scalarize a scalable vector");
- // Compute the scalar steps and save the results in VectorLoopValueMap.
- for (unsigned Part = 0; Part < UF; ++Part) {
- for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
+ // Compute the scalar steps and save the results in VectorLoopValueMap.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
auto *IntStepTy = IntegerType::get(ScalarIVTy->getContext(),
ScalarIVTy->getScalarSizeInBits());
Value *StartIdx =
@@ -2356,685 +2356,685 @@ void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
assert((VF.isScalable() || isa<Constant>(StartIdx)) &&
"Expected StartIdx to be folded to a constant when VF is not "
"scalable");
- auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
- auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
- VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
- recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
- }
- }
-}
-
-Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't widen a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- // If we have a stride that is replaced by one, do it here. Defer this for
- // the VPlan-native path until we start running Legal checks in that path.
- if (!EnableVPlanNativePath && Legal->hasStride(V))
- V = ConstantInt::get(V->getType(), 1);
-
- // If we have a vector mapped to this value, return it.
- if (VectorLoopValueMap.hasVectorValue(V, Part))
- return VectorLoopValueMap.getVectorValue(V, Part);
-
- // If the value has not been vectorized, check if it has been scalarized
- // instead. If it has been scalarized, and we actually need the value in
- // vector form, we will construct the vector values on demand.
- if (VectorLoopValueMap.hasAnyScalarValue(V)) {
- Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
-
- // If we've scalarized a value, that value should be an instruction.
- auto *I = cast<Instruction>(V);
-
- // If we aren't vectorizing, we can just copy the scalar map values over to
- // the vector map.
+ auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+ auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
+ VectorLoopValueMap.setScalarValue(EntryVal, {Part, Lane}, Add);
+ recordVectorLoopValueForInductionCast(ID, EntryVal, Add, Part, Lane);
+ }
+ }
+}
+
+Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't widen a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ // If we have a stride that is replaced by one, do it here. Defer this for
+ // the VPlan-native path until we start running Legal checks in that path.
+ if (!EnableVPlanNativePath && Legal->hasStride(V))
+ V = ConstantInt::get(V->getType(), 1);
+
+ // If we have a vector mapped to this value, return it.
+ if (VectorLoopValueMap.hasVectorValue(V, Part))
+ return VectorLoopValueMap.getVectorValue(V, Part);
+
+ // If the value has not been vectorized, check if it has been scalarized
+ // instead. If it has been scalarized, and we actually need the value in
+ // vector form, we will construct the vector values on demand.
+ if (VectorLoopValueMap.hasAnyScalarValue(V)) {
+ Value *ScalarValue = VectorLoopValueMap.getScalarValue(V, {Part, 0});
+
+ // If we've scalarized a value, that value should be an instruction.
+ auto *I = cast<Instruction>(V);
+
+ // If we aren't vectorizing, we can just copy the scalar map values over to
+ // the vector map.
if (VF.isScalar()) {
- VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
- return ScalarValue;
- }
-
- // Get the last scalar instruction we generated for V and Part. If the value
- // is known to be uniform after vectorization, this corresponds to lane zero
- // of the Part unroll iteration. Otherwise, the last instruction is the one
- // we created for the last vector lane of the Part unroll iteration.
+ VectorLoopValueMap.setVectorValue(V, Part, ScalarValue);
+ return ScalarValue;
+ }
+
+ // Get the last scalar instruction we generated for V and Part. If the value
+ // is known to be uniform after vectorization, this corresponds to lane zero
+ // of the Part unroll iteration. Otherwise, the last instruction is the one
+ // we created for the last vector lane of the Part unroll iteration.
unsigned LastLane = Cost->isUniformAfterVectorization(I, VF)
? 0
: VF.getKnownMinValue() - 1;
assert((!VF.isScalable() || LastLane == 0) &&
"Scalable vectorization can't lead to any scalarized values.");
- auto *LastInst = cast<Instruction>(
- VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
-
- // Set the insert point after the last scalarized instruction. This ensures
- // the insertelement sequence will directly follow the scalar definitions.
- auto OldIP = Builder.saveIP();
- auto NewIP = std::next(BasicBlock::iterator(LastInst));
- Builder.SetInsertPoint(&*NewIP);
-
- // However, if we are vectorizing, we need to construct the vector values.
- // If the value is known to be uniform after vectorization, we can just
- // broadcast the scalar value corresponding to lane zero for each unroll
- // iteration. Otherwise, we construct the vector values using insertelement
- // instructions. Since the resulting vectors are stored in
- // VectorLoopValueMap, we will only generate the insertelements once.
- Value *VectorValue = nullptr;
- if (Cost->isUniformAfterVectorization(I, VF)) {
- VectorValue = getBroadcastInstrs(ScalarValue);
- VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
- } else {
+ auto *LastInst = cast<Instruction>(
+ VectorLoopValueMap.getScalarValue(V, {Part, LastLane}));
+
+ // Set the insert point after the last scalarized instruction. This ensures
+ // the insertelement sequence will directly follow the scalar definitions.
+ auto OldIP = Builder.saveIP();
+ auto NewIP = std::next(BasicBlock::iterator(LastInst));
+ Builder.SetInsertPoint(&*NewIP);
+
+ // However, if we are vectorizing, we need to construct the vector values.
+ // If the value is known to be uniform after vectorization, we can just
+ // broadcast the scalar value corresponding to lane zero for each unroll
+ // iteration. Otherwise, we construct the vector values using insertelement
+ // instructions. Since the resulting vectors are stored in
+ // VectorLoopValueMap, we will only generate the insertelements once.
+ Value *VectorValue = nullptr;
+ if (Cost->isUniformAfterVectorization(I, VF)) {
+ VectorValue = getBroadcastInstrs(ScalarValue);
+ VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
+ } else {
// Initialize packing with insertelements to start from poison.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(VectorType::get(V->getType(), VF));
VectorLoopValueMap.setVectorValue(V, Part, Poison);
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
- packScalarIntoVectorValue(V, {Part, Lane});
- VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
- }
- Builder.restoreIP(OldIP);
- return VectorValue;
- }
-
- // If this scalar is unknown, assume that it is a constant or that it is
- // loop invariant. Broadcast V and save the value for future uses.
- Value *B = getBroadcastInstrs(V);
- VectorLoopValueMap.setVectorValue(V, Part, B);
- return B;
-}
-
-Value *
-InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) {
- // If the value is not an instruction contained in the loop, it should
- // already be scalar.
- if (OrigLoop->isLoopInvariant(V))
- return V;
-
- assert(Instance.Lane > 0
- ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
- : true && "Uniform values only have lane zero");
-
- // If the value from the original loop has not been vectorized, it is
- // represented by UF x VF scalar values in the new loop. Return the requested
- // scalar value.
- if (VectorLoopValueMap.hasScalarValue(V, Instance))
- return VectorLoopValueMap.getScalarValue(V, Instance);
-
- // If the value has not been scalarized, get its entry in VectorLoopValueMap
- // for the given unroll part. If this entry is not a vector type (i.e., the
- // vectorization factor is one), there is no need to generate an
- // extractelement instruction.
- auto *U = getOrCreateVectorValue(V, Instance.Part);
- if (!U->getType()->isVectorTy()) {
+ packScalarIntoVectorValue(V, {Part, Lane});
+ VectorValue = VectorLoopValueMap.getVectorValue(V, Part);
+ }
+ Builder.restoreIP(OldIP);
+ return VectorValue;
+ }
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
+ Value *B = getBroadcastInstrs(V);
+ VectorLoopValueMap.setVectorValue(V, Part, B);
+ return B;
+}
+
+Value *
+InnerLoopVectorizer::getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) {
+ // If the value is not an instruction contained in the loop, it should
+ // already be scalar.
+ if (OrigLoop->isLoopInvariant(V))
+ return V;
+
+ assert(Instance.Lane > 0
+ ? !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+ : true && "Uniform values only have lane zero");
+
+ // If the value from the original loop has not been vectorized, it is
+ // represented by UF x VF scalar values in the new loop. Return the requested
+ // scalar value.
+ if (VectorLoopValueMap.hasScalarValue(V, Instance))
+ return VectorLoopValueMap.getScalarValue(V, Instance);
+
+ // If the value has not been scalarized, get its entry in VectorLoopValueMap
+ // for the given unroll part. If this entry is not a vector type (i.e., the
+ // vectorization factor is one), there is no need to generate an
+ // extractelement instruction.
+ auto *U = getOrCreateVectorValue(V, Instance.Part);
+ if (!U->getType()->isVectorTy()) {
assert(VF.isScalar() && "Value not scalarized has non-vector type");
- return U;
- }
-
- // Otherwise, the value from the original loop has been vectorized and is
- // represented by UF vector values. Extract and return the requested scalar
- // value from the appropriate vector lane.
- return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
-}
-
-void InnerLoopVectorizer::packScalarIntoVectorValue(
- Value *V, const VPIteration &Instance) {
- assert(V != Induction && "The new induction variable should not be used.");
- assert(!V->getType()->isVectorTy() && "Can't pack a vector");
- assert(!V->getType()->isVoidTy() && "Type does not produce a value");
-
- Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
- Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
- VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
- Builder.getInt32(Instance.Lane));
- VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
-}
-
-Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
- assert(Vec->getType()->isVectorTy() && "Invalid type");
+ return U;
+ }
+
+ // Otherwise, the value from the original loop has been vectorized and is
+ // represented by UF vector values. Extract and return the requested scalar
+ // value from the appropriate vector lane.
+ return Builder.CreateExtractElement(U, Builder.getInt32(Instance.Lane));
+}
+
+void InnerLoopVectorizer::packScalarIntoVectorValue(
+ Value *V, const VPIteration &Instance) {
+ assert(V != Induction && "The new induction variable should not be used.");
+ assert(!V->getType()->isVectorTy() && "Can't pack a vector");
+ assert(!V->getType()->isVoidTy() && "Type does not produce a value");
+
+ Value *ScalarInst = VectorLoopValueMap.getScalarValue(V, Instance);
+ Value *VectorValue = VectorLoopValueMap.getVectorValue(V, Instance.Part);
+ VectorValue = Builder.CreateInsertElement(VectorValue, ScalarInst,
+ Builder.getInt32(Instance.Lane));
+ VectorLoopValueMap.resetVectorValue(V, Instance.Part, VectorValue);
+}
+
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
assert(!VF.isScalable() && "Cannot reverse scalable vectors");
- SmallVector<int, 8> ShuffleMask;
+ SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i < VF.getKnownMinValue(); ++i)
ShuffleMask.push_back(VF.getKnownMinValue() - i - 1);
-
+
return Builder.CreateShuffleVector(Vec, ShuffleMask, "reverse");
-}
-
-// Return whether we allow using masked interleave-groups (for dealing with
-// strided loads/stores that reside in predicated blocks, or for dealing
-// with gaps).
-static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
- return EnableMaskedInterleavedMemAccesses;
-
- return TTI.enableMaskedInterleavedAccessVectorization();
-}
-
-// Try to vectorize the interleave group that \p Instr belongs to.
-//
-// E.g. Translate following interleaved load group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// R = Pic[i]; // Member of index 0
-// G = Pic[i+1]; // Member of index 1
-// B = Pic[i+2]; // Member of index 2
-// ... // do something to R, G, B
-// }
-// To:
-// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
+}
+
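
reverseVector above reduces to building the shuffle mask {VF-1, VF-2, ..., 0}. A minimal standalone sketch of that mask computation (plain C++ rather than the IRBuilder calls; the helper name is invented):

#include <cstdio>
#include <vector>

// Illustrative sketch, not the LLVM helper: the mask that reverses a
// VF-wide vector is simply {VF-1, VF-2, ..., 0}.
std::vector<int> buildReverseMask(unsigned VF) {
  std::vector<int> Mask;
  for (unsigned i = 0; i < VF; ++i)
    Mask.push_back(static_cast<int>(VF - i - 1));
  return Mask;
}

int main() {
  for (int Idx : buildReverseMask(8))
    std::printf("%d ", Idx);            // prints: 7 6 5 4 3 2 1 0
  std::printf("\n");
  return 0;
}
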
+// Return whether we allow using masked interleave-groups (for dealing with
+// strided loads/stores that reside in predicated blocks, or for dealing
+// with gaps).
+static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
+ return EnableMaskedInterleavedMemAccesses;
+
+ return TTI.enableMaskedInterleavedAccessVectorization();
+}
+
+// Try to vectorize the interleave group that \p Instr belongs to.
+//
+// E.g. Translate following interleaved load group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// R = Pic[i]; // Member of index 0
+// G = Pic[i+1]; // Member of index 1
+// B = Pic[i+2]; // Member of index 2
+// ... // do something to R, G, B
+// }
+// To:
+// %wide.vec = load <12 x i32> ; Read 4 tuples of R,G,B
// %R.vec = shuffle %wide.vec, poison, <0, 3, 6, 9> ; R elements
// %G.vec = shuffle %wide.vec, poison, <1, 4, 7, 10> ; G elements
// %B.vec = shuffle %wide.vec, poison, <2, 5, 8, 11> ; B elements
-//
-// Or translate following interleaved store group (factor = 3):
-// for (i = 0; i < N; i+=3) {
-// ... do something to R, G, B
-// Pic[i] = R; // Member of index 0
-// Pic[i+1] = G; // Member of index 1
-// Pic[i+2] = B; // Member of index 2
-// }
-// To:
-// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
+//
+// Or translate following interleaved store group (factor = 3):
+// for (i = 0; i < N; i+=3) {
+// ... do something to R, G, B
+// Pic[i] = R; // Member of index 0
+// Pic[i+1] = G; // Member of index 1
+// Pic[i+2] = B; // Member of index 2
+// }
+// To:
+// %R_G.vec = shuffle %R.vec, %G.vec, <0, 1, 2, ..., 7>
// %B_U.vec = shuffle %B.vec, poison, <0, 1, 2, 3, u, u, u, u>
-// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
-// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
-// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(
+// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
+// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
+void InnerLoopVectorizer::vectorizeInterleaveGroup(
const InterleaveGroup<Instruction> *Group, ArrayRef<VPValue *> VPDefs,
VPTransformState &State, VPValue *Addr, ArrayRef<VPValue *> StoredValues,
VPValue *BlockInMask) {
- Instruction *Instr = Group->getInsertPos();
- const DataLayout &DL = Instr->getModule()->getDataLayout();
-
- // Prepare for the vector type of the interleaved load/store.
- Type *ScalarTy = getMemInstValueType(Instr);
- unsigned InterleaveFactor = Group->getFactor();
+ Instruction *Instr = Group->getInsertPos();
+ const DataLayout &DL = Instr->getModule()->getDataLayout();
+
+ // Prepare for the vector type of the interleaved load/store.
+ Type *ScalarTy = getMemInstValueType(Instr);
+ unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *VecTy = VectorType::get(ScalarTy, VF * InterleaveFactor);
-
- // Prepare for the new pointers.
- SmallVector<Value *, 2> AddrParts;
- unsigned Index = Group->getIndex(Instr);
-
- // TODO: extend the masked interleaved-group support to reversed access.
- assert((!BlockInMask || !Group->isReverse()) &&
- "Reversed masked interleave-group not supported.");
-
- // If the group is reverse, adjust the index to refer to the last vector lane
- // instead of the first. We adjust the index from the first vector lane,
- // rather than directly getting the pointer for lane VF - 1, because the
- // pointer operand of the interleaved access is supposed to be uniform. For
- // uniform instructions, we're only required to generate a value for the
- // first vector lane in each unroll iteration.
+
+ // Prepare for the new pointers.
+ SmallVector<Value *, 2> AddrParts;
+ unsigned Index = Group->getIndex(Instr);
+
+ // TODO: extend the masked interleaved-group support to reversed access.
+ assert((!BlockInMask || !Group->isReverse()) &&
+ "Reversed masked interleave-group not supported.");
+
+ // If the group is reverse, adjust the index to refer to the last vector lane
+ // instead of the first. We adjust the index from the first vector lane,
+ // rather than directly getting the pointer for lane VF - 1, because the
+ // pointer operand of the interleaved access is supposed to be uniform. For
+ // uniform instructions, we're only required to generate a value for the
+ // first vector lane in each unroll iteration.
assert(!VF.isScalable() &&
"scalable vector reverse operation is not implemented");
- if (Group->isReverse())
+ if (Group->isReverse())
Index += (VF.getKnownMinValue() - 1) * Group->getFactor();
-
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *AddrPart = State.get(Addr, {Part, 0});
- setDebugLocFromInst(Builder, AddrPart);
-
- // Notice current instruction could be any index. Need to adjust the address
- // to the member of index 0.
- //
- // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
- // b = A[i]; // Member of index 0
-    // Current pointer points to A[i+1], adjust it to A[i].
- //
- // E.g. A[i+1] = a; // Member of index 1
- // A[i] = b; // Member of index 0
- // A[i+2] = c; // Member of index 2 (Current instruction)
-    // Current pointer points to A[i+2], adjust it to A[i].
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
- InBounds = gep->isInBounds();
- AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
- cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
-
- // Cast to the vector pointer type.
- unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
- Type *PtrTy = VecTy->getPointerTo(AddressSpace);
- AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
- }
-
- setDebugLocFromInst(Builder, Instr);
+
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *AddrPart = State.get(Addr, {Part, 0});
+ setDebugLocFromInst(Builder, AddrPart);
+
+ // Notice current instruction could be any index. Need to adjust the address
+ // to the member of index 0.
+ //
+ // E.g. a = A[i+1]; // Member of index 1 (Current instruction)
+ // b = A[i]; // Member of index 0
+    // Current pointer points to A[i+1], adjust it to A[i].
+ //
+ // E.g. A[i+1] = a; // Member of index 1
+ // A[i] = b; // Member of index 0
+ // A[i+2] = c; // Member of index 2 (Current instruction)
+    // Current pointer points to A[i+2], adjust it to A[i].
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(AddrPart->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+ AddrPart = Builder.CreateGEP(ScalarTy, AddrPart, Builder.getInt32(-Index));
+ cast<GetElementPtrInst>(AddrPart)->setIsInBounds(InBounds);
+
+ // Cast to the vector pointer type.
+ unsigned AddressSpace = AddrPart->getType()->getPointerAddressSpace();
+ Type *PtrTy = VecTy->getPointerTo(AddressSpace);
+ AddrParts.push_back(Builder.CreateBitCast(AddrPart, PtrTy));
+ }
+
+ setDebugLocFromInst(Builder, Instr);
Value *PoisonVec = PoisonValue::get(VecTy);
-
- Value *MaskForGaps = nullptr;
- if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
+
+ Value *MaskForGaps = nullptr;
+ if (Group->requiresScalarEpilogue() && !Cost->isScalarEpilogueAllowed()) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
MaskForGaps = createBitMaskForGaps(Builder, VF.getKnownMinValue(), *Group);
- assert(MaskForGaps && "Mask for Gaps is required but it is null");
- }
-
- // Vectorize the interleaved load group.
- if (isa<LoadInst>(Instr)) {
- // For each unroll part, create a wide load for the group.
- SmallVector<Value *, 2> NewLoads;
- for (unsigned Part = 0; Part < UF; Part++) {
- Instruction *NewLoad;
- if (BlockInMask || MaskForGaps) {
- assert(useMaskedInterleavedAccesses(*TTI) &&
- "masked interleaved groups are not allowed.");
- Value *GroupMask = MaskForGaps;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ assert(MaskForGaps && "Mask for Gaps is required but it is null");
+ }
+
+ // Vectorize the interleaved load group.
+ if (isa<LoadInst>(Instr)) {
+ // For each unroll part, create a wide load for the group.
+ SmallVector<Value *, 2> NewLoads;
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Instruction *NewLoad;
+ if (BlockInMask || MaskForGaps) {
+ assert(useMaskedInterleavedAccesses(*TTI) &&
+ "masked interleaved groups are not allowed.");
+ Value *GroupMask = MaskForGaps;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *ShuffledMask = Builder.CreateShuffleVector(
+ Value *ShuffledMask = Builder.CreateShuffleVector(
BlockInMaskPart,
createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
"interleaved.mask");
- GroupMask = MaskForGaps
- ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
- MaskForGaps)
- : ShuffledMask;
- }
- NewLoad =
- Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
+ GroupMask = MaskForGaps
+ ? Builder.CreateBinOp(Instruction::And, ShuffledMask,
+ MaskForGaps)
+ : ShuffledMask;
+ }
+ NewLoad =
+ Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
GroupMask, PoisonVec, "wide.masked.vec");
- }
- else
- NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlign(), "wide.vec");
- Group->addMetadata(NewLoad);
- NewLoads.push_back(NewLoad);
- }
-
- // For each member in the group, shuffle out the appropriate data from the
- // wide loads.
+ }
+ else
+ NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
+ Group->getAlign(), "wide.vec");
+ Group->addMetadata(NewLoad);
+ NewLoads.push_back(NewLoad);
+ }
+
+ // For each member in the group, shuffle out the appropriate data from the
+ // wide loads.
unsigned J = 0;
- for (unsigned I = 0; I < InterleaveFactor; ++I) {
- Instruction *Member = Group->getMember(I);
-
- // Skip the gaps in the group.
- if (!Member)
- continue;
-
+ for (unsigned I = 0; I < InterleaveFactor; ++I) {
+ Instruction *Member = Group->getMember(I);
+
+ // Skip the gaps in the group.
+ if (!Member)
+ continue;
+
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto StrideMask =
createStrideMask(I, InterleaveFactor, VF.getKnownMinValue());
- for (unsigned Part = 0; Part < UF; Part++) {
- Value *StridedVec = Builder.CreateShuffleVector(
+ for (unsigned Part = 0; Part < UF; Part++) {
+ Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], StrideMask, "strided.vec");
-
-    // If this member has a different type, cast the result type.
- if (Member->getType() != ScalarTy) {
+
+      // If this member has a different type, cast the result type.
+ if (Member->getType() != ScalarTy) {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
- StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
- }
-
- if (Group->isReverse())
- StridedVec = reverseVector(StridedVec);
-
+ StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
+ }
+
+ if (Group->isReverse())
+ StridedVec = reverseVector(StridedVec);
+
State.set(VPDefs[J], Member, StridedVec, Part);
- }
+ }
++J;
- }
- return;
- }
-
- // The sub vector type for current instruction.
+ }
+ return;
+ }
+
+ // The sub vector type for current instruction.
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
auto *SubVT = VectorType::get(ScalarTy, VF);
-
- // Vectorize the interleaved store group.
- for (unsigned Part = 0; Part < UF; Part++) {
- // Collect the stored vector from each member.
- SmallVector<Value *, 4> StoredVecs;
- for (unsigned i = 0; i < InterleaveFactor; i++) {
- // Interleaved store group doesn't allow a gap, so each index has a member
+
+ // Vectorize the interleaved store group.
+ for (unsigned Part = 0; Part < UF; Part++) {
+ // Collect the stored vector from each member.
+ SmallVector<Value *, 4> StoredVecs;
+ for (unsigned i = 0; i < InterleaveFactor; i++) {
+ // Interleaved store group doesn't allow a gap, so each index has a member
assert(Group->getMember(i) && "Fail to get a member from an interleaved store group");
-
+
Value *StoredVec = State.get(StoredValues[i], Part);
- if (Group->isReverse())
- StoredVec = reverseVector(StoredVec);
-
-    // If this member has a different type, cast it to a unified type.
-
- if (StoredVec->getType() != SubVT)
- StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
-
- StoredVecs.push_back(StoredVec);
- }
-
- // Concatenate all vectors into a wide vector.
- Value *WideVec = concatenateVectors(Builder, StoredVecs);
-
- // Interleave the elements in the wide vector.
+ if (Group->isReverse())
+ StoredVec = reverseVector(StoredVec);
+
+      // If this member has a different type, cast it to a unified type.
+
+ if (StoredVec->getType() != SubVT)
+ StoredVec = createBitOrPointerCast(StoredVec, SubVT, DL);
+
+ StoredVecs.push_back(StoredVec);
+ }
+
+ // Concatenate all vectors into a wide vector.
+ Value *WideVec = concatenateVectors(Builder, StoredVecs);
+
+ // Interleave the elements in the wide vector.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Value *IVec = Builder.CreateShuffleVector(
+ Value *IVec = Builder.CreateShuffleVector(
WideVec, createInterleaveMask(VF.getKnownMinValue(), InterleaveFactor),
- "interleaved.vec");
-
- Instruction *NewStoreInstr;
- if (BlockInMask) {
- Value *BlockInMaskPart = State.get(BlockInMask, Part);
- Value *ShuffledMask = Builder.CreateShuffleVector(
+ "interleaved.vec");
+
+ Instruction *NewStoreInstr;
+ if (BlockInMask) {
+ Value *BlockInMaskPart = State.get(BlockInMask, Part);
+ Value *ShuffledMask = Builder.CreateShuffleVector(
BlockInMaskPart,
createReplicatedMask(InterleaveFactor, VF.getKnownMinValue()),
- "interleaved.mask");
- NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
- }
- else
- NewStoreInstr =
- Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
-
- Group->addMetadata(NewStoreInstr);
- }
-}
-
+ "interleaved.mask");
+ NewStoreInstr = Builder.CreateMaskedStore(
+ IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
+ }
+ else
+ NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
+
+ Group->addMetadata(NewStoreInstr);
+ }
+}
+
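
The interleaved-access lowering above is driven by three shuffle masks: a strided mask that extracts member I from the wide load, a replicated mask that repeats each block-mask lane once per member, and an interleave mask that merges the per-member vectors before the wide store. The standalone sketch below (not part of the patch; the real helpers are LLVM's createStrideMask, createReplicatedMask and createInterleaveMask) reproduces that arithmetic for factor = 3 and VF = 4, matching the R,G,B example in the comment:

#include <cstdio>
#include <vector>

// Illustrative reimplementations of the mask arithmetic, not the LLVM helpers.

// Lanes {Start, Start+Stride, ...}: extracts member `Start` from the wide vector.
std::vector<int> strideMask(unsigned Start, unsigned Stride, unsigned VF) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    M.push_back(static_cast<int>(Start + i * Stride));
  return M;
}

// Repeats each of the VF block-mask lanes Factor times, e.g. {0,0,0,1,1,1,...}.
std::vector<int> replicatedMask(unsigned Factor, unsigned VF) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < Factor; ++j)
      M.push_back(static_cast<int>(i));
  return M;
}

// Interleaves NumVecs concatenated VF-wide vectors: {0, VF, 2*VF, 1, VF+1, ...}.
std::vector<int> interleaveMask(unsigned VF, unsigned NumVecs) {
  std::vector<int> M;
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < NumVecs; ++j)
      M.push_back(static_cast<int>(j * VF + i));
  return M;
}

static void print(const char *Name, const std::vector<int> &M) {
  std::printf("%s:", Name);
  for (int Idx : M)
    std::printf(" %d", Idx);
  std::printf("\n");
}

int main() {
  const unsigned Factor = 3, VF = 4;
  print("R members  ", strideMask(0, Factor, VF));    // 0 3 6 9
  print("G members  ", strideMask(1, Factor, VF));    // 1 4 7 10
  print("replicated ", replicatedMask(Factor, VF));   // 0 0 0 1 1 1 2 2 2 3 3 3
  print("interleaved", interleaveMask(VF, Factor));   // 0 4 8 1 5 9 2 6 10 3 7 11
  return 0;
}

The interleaved mask is exactly the <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> pattern shown in the store example above.
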
void InnerLoopVectorizer::vectorizeMemoryInstruction(
Instruction *Instr, VPTransformState &State, VPValue *Def, VPValue *Addr,
VPValue *StoredValue, VPValue *BlockInMask) {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Instr);
- StoreInst *SI = dyn_cast<StoreInst>(Instr);
-
- assert((LI || SI) && "Invalid Load/Store instruction");
- assert((!SI || StoredValue) && "No stored value provided for widened store");
- assert((!LI || !StoredValue) && "Stored value provided for widened load");
-
- LoopVectorizationCostModel::InstWidening Decision =
- Cost->getWideningDecision(Instr, VF);
- assert((Decision == LoopVectorizationCostModel::CM_Widen ||
- Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
- Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
- "CM decision is not to widen the memory instruction");
-
- Type *ScalarDataTy = getMemInstValueType(Instr);
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
+
+ LoopVectorizationCostModel::InstWidening Decision =
+ Cost->getWideningDecision(Instr, VF);
+ assert((Decision == LoopVectorizationCostModel::CM_Widen ||
+ Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
+ Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
+ "CM decision is not to widen the memory instruction");
+
+ Type *ScalarDataTy = getMemInstValueType(Instr);
auto *DataTy = VectorType::get(ScalarDataTy, VF);
- const Align Alignment = getLoadStoreAlignment(Instr);
-
- // Determine if the pointer operand of the access is either consecutive or
- // reverse consecutive.
- bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
- bool ConsecutiveStride =
- Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
- bool CreateGatherScatter =
- (Decision == LoopVectorizationCostModel::CM_GatherScatter);
-
- // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
- // gather/scatter. Otherwise Decision should have been to Scalarize.
- assert((ConsecutiveStride || CreateGatherScatter) &&
- "The instruction should be scalarized");
- (void)ConsecutiveStride;
-
- VectorParts BlockInMaskParts(UF);
- bool isMaskRequired = BlockInMask;
- if (isMaskRequired)
- for (unsigned Part = 0; Part < UF; ++Part)
- BlockInMaskParts[Part] = State.get(BlockInMask, Part);
-
- const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
- // Calculate the pointer for the specific unroll-part.
- GetElementPtrInst *PartPtr = nullptr;
-
- bool InBounds = false;
- if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
- InBounds = gep->isInBounds();
-
- if (Reverse) {
+ const Align Alignment = getLoadStoreAlignment(Instr);
+
+ // Determine if the pointer operand of the access is either consecutive or
+ // reverse consecutive.
+ bool Reverse = (Decision == LoopVectorizationCostModel::CM_Widen_Reverse);
+ bool ConsecutiveStride =
+ Reverse || (Decision == LoopVectorizationCostModel::CM_Widen);
+ bool CreateGatherScatter =
+ (Decision == LoopVectorizationCostModel::CM_GatherScatter);
+
+ // Either Ptr feeds a vector load/store, or a vector GEP should feed a vector
+ // gather/scatter. Otherwise Decision should have been to Scalarize.
+ assert((ConsecutiveStride || CreateGatherScatter) &&
+ "The instruction should be scalarized");
+ (void)ConsecutiveStride;
+
+ VectorParts BlockInMaskParts(UF);
+ bool isMaskRequired = BlockInMask;
+ if (isMaskRequired)
+ for (unsigned Part = 0; Part < UF; ++Part)
+ BlockInMaskParts[Part] = State.get(BlockInMask, Part);
+
+ const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
+ // Calculate the pointer for the specific unroll-part.
+ GetElementPtrInst *PartPtr = nullptr;
+
+ bool InBounds = false;
+ if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
+ InBounds = gep->isInBounds();
+
+ if (Reverse) {
assert(!VF.isScalable() &&
"Reversing vectors is not yet supported for scalable vectors.");
- // If the address is consecutive but reversed, then the
- // wide store needs to start at the last vector element.
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, Ptr, Builder.getInt32(-Part * VF.getKnownMinValue())));
- PartPtr->setIsInBounds(InBounds);
+ PartPtr->setIsInBounds(InBounds);
PartPtr = cast<GetElementPtrInst>(Builder.CreateGEP(
ScalarDataTy, PartPtr, Builder.getInt32(1 - VF.getKnownMinValue())));
- PartPtr->setIsInBounds(InBounds);
- if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
- BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
- } else {
+ PartPtr->setIsInBounds(InBounds);
+ if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
+ BlockInMaskParts[Part] = reverseVector(BlockInMaskParts[Part]);
+ } else {
Value *Increment = createStepForVF(Builder, Builder.getInt32(Part), VF);
- PartPtr = cast<GetElementPtrInst>(
+ PartPtr = cast<GetElementPtrInst>(
Builder.CreateGEP(ScalarDataTy, Ptr, Increment));
- PartPtr->setIsInBounds(InBounds);
- }
-
- unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
- return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
- };
-
- // Handle Stores:
- if (SI) {
- setDebugLocFromInst(Builder, SI);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Instruction *NewSI = nullptr;
- Value *StoredVal = State.get(StoredValue, Part);
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
- MaskPart);
- } else {
- if (Reverse) {
- // If we store to reverse consecutive memory locations, then we need
- // to reverse the order of elements in the stored value.
- StoredVal = reverseVector(StoredVal);
- // We don't want to update the value in the map as it might be used in
- // another expression. So don't call resetVectorValue(StoredVal).
- }
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
- if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
- BlockInMaskParts[Part]);
- else
- NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
- }
- addMetadata(NewSI, SI);
- }
- return;
- }
-
- // Handle loads.
- assert(LI && "Must have a load instruction");
- setDebugLocFromInst(Builder, LI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *NewLI;
- if (CreateGatherScatter) {
- Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
- Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
- nullptr, "wide.masked.gather");
- addMetadata(NewLI, LI);
- } else {
- auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
- if (isMaskRequired)
- NewLI = Builder.CreateMaskedLoad(
+ PartPtr->setIsInBounds(InBounds);
+ }
+
+ unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+ return Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
+ };
+
+ // Handle Stores:
+ if (SI) {
+ setDebugLocFromInst(Builder, SI);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *NewSI = nullptr;
+ Value *StoredVal = State.get(StoredValue, Part);
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
+ } else {
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations, then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal = reverseVector(StoredVal);
+ // We don't want to update the value in the map as it might be used in
+ // another expression. So don't call resetVectorValue(StoredVal).
+ }
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ if (isMaskRequired)
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
+ else
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
+ }
+ addMetadata(NewSI, SI);
+ }
+ return;
+ }
+
+ // Handle loads.
+ assert(LI && "Must have a load instruction");
+ setDebugLocFromInst(Builder, LI);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *NewLI;
+ if (CreateGatherScatter) {
+ Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
+ Value *VectorGep = State.get(Addr, Part);
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
+ nullptr, "wide.masked.gather");
+ addMetadata(NewLI, LI);
+ } else {
+ auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
+ if (isMaskRequired)
+ NewLI = Builder.CreateMaskedLoad(
VecPtr, Alignment, BlockInMaskParts[Part], PoisonValue::get(DataTy),
- "wide.masked.load");
- else
- NewLI =
- Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
-
- // Add metadata to the load, but setVectorValue to the reverse shuffle.
- addMetadata(NewLI, LI);
- if (Reverse)
- NewLI = reverseVector(NewLI);
- }
+ "wide.masked.load");
+ else
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
+
+ // Add metadata to the load, but setVectorValue to the reverse shuffle.
+ addMetadata(NewLI, LI);
+ if (Reverse)
+ NewLI = reverseVector(NewLI);
+ }
State.set(Def, Instr, NewLI, Part);
- }
-}
-
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
- const VPIteration &Instance,
- bool IfPredicateInstr,
- VPTransformState &State) {
- assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
-
+ }
+}
+
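
CreateVecPtr above encodes the per-part address arithmetic for consecutive accesses: part Part of a forward access starts at element Part*VF, while a reversed access first steps back by Part*VF elements and then by VF-1 more, so the wide load still reads upward in memory before the result is reversed. A standalone sketch of that offset calculation (plain C++, fixed VF and UF assumed, names invented):

#include <cstdio>

// Illustrative sketch, not LLVM code: the first scalar element (relative to
// the scalar pointer) covered by unroll part `Part` of a consecutive access.
long partStartElement(unsigned Part, unsigned VF, bool Reverse) {
  long P = static_cast<long>(Part), W = static_cast<long>(VF);
  if (!Reverse)
    return P * W;              // GEP Ptr, Part * VF
  // Reverse: GEP Ptr, -Part * VF followed by GEP ..., 1 - VF.
  return -P * W + 1 - W;
}

int main() {
  const unsigned VF = 4, UF = 2;
  for (unsigned Part = 0; Part < UF; ++Part)
    std::printf("forward part %u starts at %ld, reverse part %u starts at %ld\n",
                Part, partStartElement(Part, VF, false),
                Part, partStartElement(Part, VF, true));
  // forward: 0 and 4; reverse: -3 and -7 (each wide access then covers 4
  // elements upward and the loaded/stored value is reversed).
  return 0;
}
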
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
+ const VPIteration &Instance,
+ bool IfPredicateInstr,
+ VPTransformState &State) {
+ assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
+
// llvm.experimental.noalias.scope.decl intrinsics must only be duplicated for
// the first lane and part.
if (isa<NoAliasScopeDeclInst>(Instr))
if (Instance.Lane != 0 || Instance.Part != 0)
return;
- setDebugLocFromInst(Builder, Instr);
-
-  // Does this instruction return a value?
- bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
-
- // Replace the operands of the cloned instructions with their scalar
- // equivalents in the new loop.
- for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ setDebugLocFromInst(Builder, Instr);
+
+  // Does this instruction return a value?
+ bool IsVoidRetTy = Instr->getType()->isVoidTy();
+
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+
+ // Replace the operands of the cloned instructions with their scalar
+ // equivalents in the new loop.
+ for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
auto *Operand = dyn_cast<Instruction>(Instr->getOperand(op));
auto InputInstance = Instance;
if (!Operand || !OrigLoop->contains(Operand) ||
(Cost->isUniformAfterVectorization(Operand, State.VF)))
InputInstance.Lane = 0;
auto *NewOp = State.get(User.getOperand(op), InputInstance);
- Cloned->setOperand(op, NewOp);
- }
- addNewMetadata(Cloned, Instr);
-
- // Place the cloned scalar in the new loop.
- Builder.Insert(Cloned);
-
+ Cloned->setOperand(op, NewOp);
+ }
+ addNewMetadata(Cloned, Instr);
+
+ // Place the cloned scalar in the new loop.
+ Builder.Insert(Cloned);
+
  // TODO: Set result for VPValue of VPReplicateRecipe. This requires
// representing scalar values in VPTransformState. Add the cloned scalar to
// the scalar map entry.
- VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
-
-  // If we just cloned a new assumption, add it to the assumption cache.
- if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
-
- // End if-block.
- if (IfPredicateInstr)
- PredicatedInstructions.push_back(Cloned);
-}
-
-PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
- Value *End, Value *Step,
- Instruction *DL) {
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- // As we're just creating this loop, it's possible no latch exists
- // yet. If so, use the header as this will be a single block loop.
- if (!Latch)
- Latch = Header;
-
- IRBuilder<> Builder(&*Header->getFirstInsertionPt());
- Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
- setDebugLocFromInst(Builder, OldInst);
- auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
-
- Builder.SetInsertPoint(Latch->getTerminator());
- setDebugLocFromInst(Builder, OldInst);
-
- // Create i+1 and fill the PHINode.
- Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
- Induction->addIncoming(Start, L->getLoopPreheader());
- Induction->addIncoming(Next, Latch);
- // Create the compare.
- Value *ICmp = Builder.CreateICmpEQ(Next, End);
+ VectorLoopValueMap.setScalarValue(Instr, Instance, Cloned);
+
+  // If we just cloned a new assumption, add it to the assumption cache.
+ if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+
+ // End if-block.
+ if (IfPredicateInstr)
+ PredicatedInstructions.push_back(Cloned);
+}
+
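
The operand-rewiring loop above has one subtlety: a value that is uniform after vectorization only ever gets a lane-0 copy, so a (Part, Lane) request for such an operand is redirected to lane 0 of the same part. A toy sketch of that lookup rule (standalone C++, invented names, illustrative only):

#include <cstdio>
#include <map>
#include <utility>

// Illustrative sketch, not LLVM code: scalar copies of one original value,
// keyed by (Part, Lane).
using ScalarMap = std::map<std::pair<unsigned, unsigned>, int>;

int getScalar(const ScalarMap &Map, unsigned Part, unsigned Lane,
              bool UniformAfterVectorization) {
  if (UniformAfterVectorization)
    Lane = 0;                          // uniform values only materialize lane 0
  return Map.at({Part, Lane});
}

int main() {
  ScalarMap UniformPtr;                // e.g. a loop-invariant base pointer
  UniformPtr[{0, 0}] = 100;            // only lane 0 of each part exists

  ScalarMap PerLaneVal;                // e.g. a non-uniform arithmetic result
  PerLaneVal[{0, 0}] = 1;
  PerLaneVal[{0, 1}] = 2;

  std::printf("%d %d\n",
              getScalar(UniformPtr, 0, 3, /*Uniform=*/true),   // redirected to lane 0 -> 100
              getScalar(PerLaneVal, 0, 1, /*Uniform=*/false)); // 2
  return 0;
}
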
+PHINode *InnerLoopVectorizer::createInductionVariable(Loop *L, Value *Start,
+ Value *End, Value *Step,
+ Instruction *DL) {
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ // As we're just creating this loop, it's possible no latch exists
+ // yet. If so, use the header as this will be a single block loop.
+ if (!Latch)
+ Latch = Header;
+
+ IRBuilder<> Builder(&*Header->getFirstInsertionPt());
+ Instruction *OldInst = getDebugLocFromInstOrOperands(OldInduction);
+ setDebugLocFromInst(Builder, OldInst);
+ auto *Induction = Builder.CreatePHI(Start->getType(), 2, "index");
+
+ Builder.SetInsertPoint(Latch->getTerminator());
+ setDebugLocFromInst(Builder, OldInst);
+
+ // Create i+1 and fill the PHINode.
+ Value *Next = Builder.CreateAdd(Induction, Step, "index.next");
+ Induction->addIncoming(Start, L->getLoopPreheader());
+ Induction->addIncoming(Next, Latch);
+ // Create the compare.
+ Value *ICmp = Builder.CreateICmpEQ(Next, End);
Builder.CreateCondBr(ICmp, L->getUniqueExitBlock(), Header);
-
- // Now we have two terminators. Remove the old one from the block.
- Latch->getTerminator()->eraseFromParent();
-
- return Induction;
-}
-
-Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
- if (TripCount)
- return TripCount;
-
- assert(L && "Create Trip Count for null loop.");
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
- // Find the loop boundaries.
- ScalarEvolution *SE = PSE.getSE();
- const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
+
+ // Now we have two terminators. Remove the old one from the block.
+ Latch->getTerminator()->eraseFromParent();
+
+ return Induction;
+}
+
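
createInductionVariable emits the canonical counting loop that drives the vector body: an index phi starting at Start, an index.next = index + Step computed in the latch, and a compare against End that exits the loop. In scalar C++ terms the generated control flow is roughly the shape below (a sketch only, assuming Start, End and Step are plain integers and End is hit exactly, as the trip-count logic guarantees):

#include <cstdio>

int main() {
  // Illustrative sketch, not LLVM code; e.g. Step = VF * UF = 4.
  const long Start = 0, End = 16, Step = 4;

  long Index = Start;                    // the "index" phi
  do {
    std::printf("vector iteration at index %ld\n", Index);
    long IndexNext = Index + Step;       // "index.next"
    if (IndexNext == End)                // latch compare, exits the loop
      break;
    Index = IndexNext;                   // back-edge value of the phi
  } while (true);
  return 0;
}
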
+Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
+ if (TripCount)
+ return TripCount;
+
+ assert(L && "Create Trip Count for null loop.");
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+ // Find the loop boundaries.
+ ScalarEvolution *SE = PSE.getSE();
+ const SCEV *BackedgeTakenCount = PSE.getBackedgeTakenCount();
assert(!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
- "Invalid loop count");
-
- Type *IdxTy = Legal->getWidestInductionType();
- assert(IdxTy && "No type for induction");
-
- // The exit count might have the type of i64 while the phi is i32. This can
- // happen if we have an induction variable that is sign extended before the
- // compare. The only way that we get a backedge taken count is that the
- // induction variable was signed and as such will not overflow. In such a case
- // truncation is legal.
- if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
- IdxTy->getPrimitiveSizeInBits())
- BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
- BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
-
- // Get the total trip count from the count by adding 1.
- const SCEV *ExitCount = SE->getAddExpr(
- BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
-
- const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, DL, "induction");
-
- // Count holds the overall loop count (N).
- TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
- L->getLoopPreheader()->getTerminator());
-
- if (TripCount->getType()->isPointerTy())
- TripCount =
- CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
- L->getLoopPreheader()->getTerminator());
-
- return TripCount;
-}
-
-Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
- if (VectorTripCount)
- return VectorTripCount;
-
- Value *TC = getOrCreateTripCount(L);
- IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
-
- Type *Ty = TC->getType();
+ "Invalid loop count");
+
+ Type *IdxTy = Legal->getWidestInductionType();
+ assert(IdxTy && "No type for induction");
+
+ // The exit count might have the type of i64 while the phi is i32. This can
+ // happen if we have an induction variable that is sign extended before the
+ // compare. The only way that we get a backedge taken count is that the
+ // induction variable was signed and as such will not overflow. In such a case
+ // truncation is legal.
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
+ IdxTy->getPrimitiveSizeInBits())
+ BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
+ BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
+
+ // Get the total trip count from the count by adding 1.
+ const SCEV *ExitCount = SE->getAddExpr(
+ BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
+
+ const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, DL, "induction");
+
+ // Count holds the overall loop count (N).
+ TripCount = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ L->getLoopPreheader()->getTerminator());
+
+ if (TripCount->getType()->isPointerTy())
+ TripCount =
+ CastInst::CreatePointerCast(TripCount, IdxTy, "exitcount.ptrcnt.to.int",
+ L->getLoopPreheader()->getTerminator());
+
+ return TripCount;
+}
+
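
getOrCreateTripCount derives the trip count as backedge-taken-count + 1, after narrowing the backedge-taken count to the widest induction type when SCEV reports it in a wider type. A small numeric sketch of that derivation (plain integers, no SCEV; the values are invented):

#include <cstdint>
#include <cstdio>

int main() {
  // Illustrative sketch, not LLVM code. Backedge-taken count as SCEV might
  // report it, in a wider type (i64) than the induction variable (i32):
  // the loop took the backedge 99 times.
  uint64_t BackedgeTakenCount64 = 99;

  // Truncation is legal here because the sign-extended induction cannot
  // overflow, so the count fits the narrower induction type.
  uint32_t BackedgeTakenCount = static_cast<uint32_t>(BackedgeTakenCount64);

  // The trip count is one more than the number of taken backedges.
  uint32_t TripCount = BackedgeTakenCount + 1;

  std::printf("trip count N = %u\n", TripCount);   // 100
  return 0;
}
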
+Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
+ if (VectorTripCount)
+ return VectorTripCount;
+
+ Value *TC = getOrCreateTripCount(L);
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
+
+ Type *Ty = TC->getType();
// This is where we can make the step a runtime constant.
Value *Step = createStepForVF(Builder, ConstantInt::get(Ty, UF), VF);
-
- // If the tail is to be folded by masking, round the number of iterations N
- // up to a multiple of Step instead of rounding down. This is done by first
- // adding Step-1 and then rounding down. Note that it's ok if this addition
- // overflows: the vector induction variable will eventually wrap to zero given
- // that it starts at zero and its Step is a power of two; the loop will then
- // exit, with the last early-exit vector comparison also producing all-true.
- if (Cost->foldTailByMasking()) {
+
+ // If the tail is to be folded by masking, round the number of iterations N
+ // up to a multiple of Step instead of rounding down. This is done by first
+ // adding Step-1 and then rounding down. Note that it's ok if this addition
+ // overflows: the vector induction variable will eventually wrap to zero given
+ // that it starts at zero and its Step is a power of two; the loop will then
+ // exit, with the last early-exit vector comparison also producing all-true.
+ if (Cost->foldTailByMasking()) {
assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
- "VF*UF must be a power of 2 when folding tail by masking");
+ "VF*UF must be a power of 2 when folding tail by masking");
assert(!VF.isScalable() &&
"Tail folding not yet supported for scalable vectors");
TC = Builder.CreateAdd(
TC, ConstantInt::get(Ty, VF.getKnownMinValue() * UF - 1), "n.rnd.up");
- }
-
- // Now we need to generate the expression for the part of the loop that the
- // vectorized body will execute. This is equal to N - (N % Step) if scalar
- // iterations are not required for correctness, or N - Step, otherwise. Step
- // is equal to the vectorization factor (number of SIMD elements) times the
- // unroll factor (number of SIMD instructions).
- Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
-
+ }
+
+ // Now we need to generate the expression for the part of the loop that the
+ // vectorized body will execute. This is equal to N - (N % Step) if scalar
+ // iterations are not required for correctness, or N - Step, otherwise. Step
+ // is equal to the vectorization factor (number of SIMD elements) times the
+ // unroll factor (number of SIMD instructions).
+ Value *R = Builder.CreateURem(TC, Step, "n.mod.vf");
+
// There are two cases where we need to ensure (at least) the last iteration
// runs in the scalar remainder loop. Thus, if the step evenly divides
- // the trip count, we set the remainder to be equal to the step. If the step
- // does not evenly divide the trip count, no adjustment is necessary since
- // there will already be scalar iterations. Note that the minimum iterations
+ // the trip count, we set the remainder to be equal to the step. If the step
+ // does not evenly divide the trip count, no adjustment is necessary since
+ // there will already be scalar iterations. Note that the minimum iterations
// check ensures that N >= Step. The cases are:
// 1) If there is a non-reversed interleaved group that may speculatively
// access memory out-of-bounds.
@@ -3042,178 +3042,178 @@ Value *InnerLoopVectorizer::getOrCreateVectorTripCount(Loop *L) {
// the loop contains multiple exiting blocks, or a single exiting block
// which is not the latch.
if (VF.isVector() && Cost->requiresScalarEpilogue()) {
- auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
- R = Builder.CreateSelect(IsZero, Step, R);
- }
-
- VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
-
- return VectorTripCount;
-}
-
-Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
- const DataLayout &DL) {
- // Verify that V is a vector type with same number of elements as DstVTy.
+ auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
+ R = Builder.CreateSelect(IsZero, Step, R);
+ }
+
+ VectorTripCount = Builder.CreateSub(TC, R, "n.vec");
+
+ return VectorTripCount;
+}
+
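
The vector trip count computed above boils down to a few lines of integer arithmetic: optionally round N up to a multiple of Step = VF*UF when the tail is folded by masking, take the remainder N % Step, and bump a zero remainder up to a full Step when a scalar epilogue is required so the scalar loop always runs at least once. A standalone sketch of exactly that arithmetic (fixed-width VF assumed, function name invented):

#include <cassert>
#include <cstdio>

// Illustrative sketch of the "n.vec" computation, not LLVM code.
unsigned vectorTripCount(unsigned N, unsigned VF, unsigned UF,
                         bool FoldTailByMasking, bool RequiresScalarEpilogue) {
  unsigned Step = VF * UF;
  if (FoldTailByMasking) {
    assert((Step & (Step - 1)) == 0 && "VF*UF must be a power of 2");
    N += Step - 1;                       // "n.rnd.up": round up; wrap is benign
  }
  unsigned R = N % Step;                 // "n.mod.vf"
  if (RequiresScalarEpilogue && R == 0)
    R = Step;                            // keep at least one scalar iteration
  return N - R;                          // "n.vec"
}

int main() {
  // 100 iterations, VF=4, UF=2: 96 vector iterations plus a scalar remainder.
  std::printf("%u\n", vectorTripCount(100, 4, 2, false, false));  // 96
  // Evenly divisible trip count but a scalar epilogue is required: 96 -> 88.
  std::printf("%u\n", vectorTripCount(96, 4, 2, false, true));    // 88
  // Tail folded by masking: everything runs in the (rounded-up) vector loop.
  std::printf("%u\n", vectorTripCount(100, 4, 2, true, false));   // 104
  return 0;
}

The rounded-up case relies on the power-of-two Step noted in the assertion above, which is also why overflow of the "n.rnd.up" addition is harmless.
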
+Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
+ const DataLayout &DL) {
+ // Verify that V is a vector type with same number of elements as DstVTy.
auto *DstFVTy = cast<FixedVectorType>(DstVTy);
unsigned VF = DstFVTy->getNumElements();
auto *SrcVecTy = cast<FixedVectorType>(V->getType());
- assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
- Type *SrcElemTy = SrcVecTy->getElementType();
+ assert((VF == SrcVecTy->getNumElements()) && "Vector dimensions do not match");
+ Type *SrcElemTy = SrcVecTy->getElementType();
Type *DstElemTy = DstFVTy->getElementType();
- assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
- "Vector elements must have same size");
-
- // Do a direct cast if element types are castable.
- if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
+ assert((DL.getTypeSizeInBits(SrcElemTy) == DL.getTypeSizeInBits(DstElemTy)) &&
+ "Vector elements must have same size");
+
+ // Do a direct cast if element types are castable.
+ if (CastInst::isBitOrNoopPointerCastable(SrcElemTy, DstElemTy, DL)) {
return Builder.CreateBitOrPointerCast(V, DstFVTy);
- }
- // V cannot be directly casted to desired vector type.
- // May happen when V is a floating point vector but DstVTy is a vector of
- // pointers or vice-versa. Handle this using a two-step bitcast using an
- // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
- assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
- "Only one type should be a pointer type");
- assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
- "Only one type should be a floating point type");
- Type *IntTy =
- IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- auto *VecIntTy = FixedVectorType::get(IntTy, VF);
- Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
+ }
+ // V cannot be directly casted to desired vector type.
+ // May happen when V is a floating point vector but DstVTy is a vector of
+ // pointers or vice-versa. Handle this using a two-step bitcast using an
+ // intermediate Integer type for the bitcast i.e. Ptr <-> Int <-> Float.
+ assert((DstElemTy->isPointerTy() != SrcElemTy->isPointerTy()) &&
+ "Only one type should be a pointer type");
+ assert((DstElemTy->isFloatingPointTy() != SrcElemTy->isFloatingPointTy()) &&
+ "Only one type should be a floating point type");
+ Type *IntTy =
+ IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
+ auto *VecIntTy = FixedVectorType::get(IntTy, VF);
+ Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstFVTy);
-}
-
-void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
- BasicBlock *Bypass) {
- Value *Count = getOrCreateTripCount(L);
- // Reuse existing vector loop preheader for TC checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
- IRBuilder<> Builder(TCCheckBlock->getTerminator());
-
- // Generate code to check if the loop's trip count is less than VF * UF, or
- // equal to it in case a scalar epilogue is required; this implies that the
- // vector trip count is zero. This check also covers the case where adding one
- // to the backedge-taken count overflowed leading to an incorrect trip count
- // of zero. In this case we will also jump to the scalar loop.
- auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
- : ICmpInst::ICMP_ULT;
-
- // If tail is to be folded, vector loop takes care of all iterations.
- Value *CheckMinIters = Builder.getFalse();
+}
+
+void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
+ BasicBlock *Bypass) {
+ Value *Count = getOrCreateTripCount(L);
+ // Reuse existing vector loop preheader for TC checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
+ IRBuilder<> Builder(TCCheckBlock->getTerminator());
+
+ // Generate code to check if the loop's trip count is less than VF * UF, or
+ // equal to it in case a scalar epilogue is required; this implies that the
+ // vector trip count is zero. This check also covers the case where adding one
+ // to the backedge-taken count overflowed leading to an incorrect trip count
+ // of zero. In this case we will also jump to the scalar loop.
+ auto P = Cost->requiresScalarEpilogue() ? ICmpInst::ICMP_ULE
+ : ICmpInst::ICMP_ULT;
+
+ // If tail is to be folded, vector loop takes care of all iterations.
+ Value *CheckMinIters = Builder.getFalse();
if (!Cost->foldTailByMasking()) {
Value *Step =
createStepForVF(Builder, ConstantInt::get(Count->getType(), UF), VF);
CheckMinIters = Builder.CreateICmp(P, Count, Step, "min.iters.check");
}
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
-
- assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
- DT->getNode(Bypass)->getIDom()) &&
- "TC check is expected to dominate Bypass");
-
- // Update dominator for Bypass & LoopExit.
- DT->changeImmediateDominator(Bypass, TCCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
-
- ReplaceInstWithInst(
- TCCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
- LoopBypassBlocks.push_back(TCCheckBlock);
-}
-
-void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
- // Reuse existing vector loop preheader for SCEV checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
-
-  // Generate the code to check the SCEV assumptions that we made.
- // We want the new basic block to start at the first instruction in a
- // sequence of instructions that form a check.
- SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
- "scev.check");
- Value *SCEVCheck = Exp.expandCodeForPredicate(
- &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
-
- if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
- if (C->isZero())
- return;
-
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(TCCheckBlock, TCCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
+ assert(DT->properlyDominates(DT->getNode(TCCheckBlock),
+ DT->getNode(Bypass)->getIDom()) &&
+ "TC check is expected to dominate Bypass");
+
+ // Update dominator for Bypass & LoopExit.
+ DT->changeImmediateDominator(Bypass, TCCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
+
+ ReplaceInstWithInst(
+ TCCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters));
+ LoopBypassBlocks.push_back(TCCheckBlock);
+}
+
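
The bypass check above compares the scalar trip count against Step = VF*UF: with ULT the vector loop is skipped only when fewer than Step iterations remain, while ULE is used when a scalar epilogue is required so at least one scalar iteration is always left over; with tail folding the check is dropped entirely. A tiny sketch of that predicate selection (plain C++ rather than the IRBuilder calls, names invented):

#include <cstdio>

// Illustrative sketch, not LLVM code: should the scalar (bypass) loop be taken?
bool takeScalarLoop(unsigned long Count, unsigned VF, unsigned UF,
                    bool RequiresScalarEpilogue, bool FoldTailByMasking) {
  if (FoldTailByMasking)
    return false;                         // vector loop handles all iterations
  unsigned long Step = static_cast<unsigned long>(VF) * UF;
  return RequiresScalarEpilogue ? Count <= Step   // ICMP_ULE
                                : Count < Step;   // ICMP_ULT
}

int main() {
  std::printf("%d\n", takeScalarLoop(8, 4, 2, false, false)); // 0: exactly VF*UF is enough
  std::printf("%d\n", takeScalarLoop(8, 4, 2, true, false));  // 1: epilogue needs a spare iteration
  std::printf("%d\n", takeScalarLoop(3, 4, 2, false, true));  // 0: tail folding, no bypass
  return 0;
}
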
+void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
+ // Reuse existing vector loop preheader for SCEV checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const SCEVCheckBlock = LoopVectorPreHeader;
+
+  // Generate the code to check the SCEV assumptions that we made.
+ // We want the new basic block to start at the first instruction in a
+ // sequence of instructions that form a check.
+ SCEVExpander Exp(*PSE.getSE(), Bypass->getModule()->getDataLayout(),
+ "scev.check");
+ Value *SCEVCheck = Exp.expandCodeForPredicate(
+ &PSE.getUnionPredicate(), SCEVCheckBlock->getTerminator());
+
+ if (auto *C = dyn_cast<ConstantInt>(SCEVCheck))
+ if (C->isZero())
+ return;
+
assert(!(SCEVCheckBlock->getParent()->hasOptSize() ||
(OptForSizeBasedOnProfile &&
Cost->Hints->getForce() != LoopVectorizeHints::FK_Enabled)) &&
- "Cannot SCEV check stride or overflow when optimizing for size");
-
- SCEVCheckBlock->setName("vector.scevcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
- nullptr, "vector.ph");
-
- // Update dominator only if this is first RT check.
- if (LoopBypassBlocks.empty()) {
- DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
- }
-
- ReplaceInstWithInst(
- SCEVCheckBlock->getTerminator(),
- BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
- LoopBypassBlocks.push_back(SCEVCheckBlock);
- AddedSafetyChecks = true;
-}
-
-void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
- // VPlan-native path does not do any analysis for runtime checks currently.
- if (EnableVPlanNativePath)
- return;
-
- // Reuse existing vector loop preheader for runtime memory checks.
- // Note that new preheader block is generated for vector loop.
- BasicBlock *const MemCheckBlock = L->getLoopPreheader();
-
-  // Generate the code that checks at runtime if arrays overlap. We put the
- // checks into a separate block to make the more common case of few elements
- // faster.
- auto *LAI = Legal->getLAI();
- const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
- if (!RtPtrChecking.Need)
- return;
-
+ "Cannot SCEV check stride or overflow when optimizing for size");
+
+ SCEVCheckBlock->setName("vector.scevcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(SCEVCheckBlock, SCEVCheckBlock->getTerminator(), DT, LI,
+ nullptr, "vector.ph");
+
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
+ }
+
+ ReplaceInstWithInst(
+ SCEVCheckBlock->getTerminator(),
+ BranchInst::Create(Bypass, LoopVectorPreHeader, SCEVCheck));
+ LoopBypassBlocks.push_back(SCEVCheckBlock);
+ AddedSafetyChecks = true;
+}
+
+void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
+ // VPlan-native path does not do any analysis for runtime checks currently.
+ if (EnableVPlanNativePath)
+ return;
+
+ // Reuse existing vector loop preheader for runtime memory checks.
+ // Note that new preheader block is generated for vector loop.
+ BasicBlock *const MemCheckBlock = L->getLoopPreheader();
+
+  // Generate the code that checks at runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ auto *LAI = Legal->getLAI();
+ const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
+ if (!RtPtrChecking.Need)
+ return;
+
if (MemCheckBlock->getParent()->hasOptSize() || OptForSizeBasedOnProfile) {
- assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
- "Cannot emit memory checks when optimizing for size, unless forced "
- "to vectorize.");
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
- L->getStartLoc(), L->getHeader())
- << "Code-size may be reduced by not forcing "
- "vectorization, or by source-code modifications "
- "eliminating the need for runtime checks "
- "(e.g., adding 'restrict').";
- });
- }
-
- MemCheckBlock->setName("vector.memcheck");
- // Create new preheader for vector loop.
- LoopVectorPreHeader =
- SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
- "vector.ph");
-
+ assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
+ "Cannot emit memory checks when optimizing for size, unless forced "
+ "to vectorize.");
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
+ L->getStartLoc(), L->getHeader())
+ << "Code-size may be reduced by not forcing "
+ "vectorization, or by source-code modifications "
+ "eliminating the need for runtime checks "
+ "(e.g., adding 'restrict').";
+ });
+ }
+
+ MemCheckBlock->setName("vector.memcheck");
+ // Create new preheader for vector loop.
+ LoopVectorPreHeader =
+ SplitBlock(MemCheckBlock, MemCheckBlock->getTerminator(), DT, LI, nullptr,
+ "vector.ph");
+
auto *CondBranch = cast<BranchInst>(
Builder.CreateCondBr(Builder.getTrue(), Bypass, LoopVectorPreHeader));
ReplaceInstWithInst(MemCheckBlock->getTerminator(), CondBranch);
LoopBypassBlocks.push_back(MemCheckBlock);
AddedSafetyChecks = true;
- // Update dominator only if this is first RT check.
- if (LoopBypassBlocks.empty()) {
- DT->changeImmediateDominator(Bypass, MemCheckBlock);
- DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
- }
-
+ // Update dominator only if this is first RT check.
+ if (LoopBypassBlocks.empty()) {
+ DT->changeImmediateDominator(Bypass, MemCheckBlock);
+ DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
+ }
+
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
@@ -3222,128 +3222,128 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
"claimed checks are required");
CondBranch->setCondition(MemRuntimeCheck);
-
- // We currently don't use LoopVersioning for the actual loop cloning but we
- // still use it to add the noalias metadata.
+
+ // We currently don't use LoopVersioning for the actual loop cloning but we
+ // still use it to add the noalias metadata.
LVer = std::make_unique<LoopVersioning>(
*Legal->getLAI(),
Legal->getLAI()->getRuntimePointerChecking()->getChecks(), OrigLoop, LI,
DT, PSE.getSE());
- LVer->prepareNoAliasMetadata();
-}
-
-Value *InnerLoopVectorizer::emitTransformedIndex(
- IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
- const InductionDescriptor &ID) const {
-
- SCEVExpander Exp(*SE, DL, "induction");
- auto Step = ID.getStep();
- auto StartValue = ID.getStartValue();
- assert(Index->getType() == Step->getType() &&
- "Index type does not match StepValue type");
-
- // Note: the IR at this point is broken. We cannot use SE to create any new
- // SCEV and then expand it, hoping that SCEV's simplification will give us
-  // more optimal code. Unfortunately, attempting to do so on invalid IR may
- // lead to various SCEV crashes. So all we can do is to use builder and rely
- // on InstCombine for future simplifications. Here we handle some trivial
- // cases only.
- auto CreateAdd = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isZero())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isZero())
- return X;
- return B.CreateAdd(X, Y);
- };
-
- auto CreateMul = [&B](Value *X, Value *Y) {
- assert(X->getType() == Y->getType() && "Types don't match!");
- if (auto *CX = dyn_cast<ConstantInt>(X))
- if (CX->isOne())
- return Y;
- if (auto *CY = dyn_cast<ConstantInt>(Y))
- if (CY->isOne())
- return X;
- return B.CreateMul(X, Y);
- };
-
- // Get a suitable insert point for SCEV expansion. For blocks in the vector
- // loop, choose the end of the vector loop header (=LoopVectorBody), because
- // the DomTree is not kept up-to-date for additional blocks generated in the
- // vector loop. By using the header as insertion point, we guarantee that the
- // expanded instructions dominate all their uses.
- auto GetInsertPoint = [this, &B]() {
- BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
- if (InsertBB != LoopVectorBody &&
- LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
- return LoopVectorBody->getTerminator();
- return &*B.GetInsertPoint();
- };
- switch (ID.getKind()) {
- case InductionDescriptor::IK_IntInduction: {
- assert(Index->getType() == StartValue->getType() &&
- "Index type does not match StartValue type");
- if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
- return B.CreateSub(StartValue, Index);
- auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
- return CreateAdd(StartValue, Offset);
- }
- case InductionDescriptor::IK_PtrInduction: {
- assert(isa<SCEVConstant>(Step) &&
- "Expected constant step for pointer induction");
- return B.CreateGEP(
- StartValue->getType()->getPointerElementType(), StartValue,
- CreateMul(Index,
- Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
- }
- case InductionDescriptor::IK_FpInduction: {
- assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
- auto InductionBinOp = ID.getInductionBinOp();
- assert(InductionBinOp &&
- (InductionBinOp->getOpcode() == Instruction::FAdd ||
- InductionBinOp->getOpcode() == Instruction::FSub) &&
- "Original bin op should be defined for FP induction");
-
- Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
-
- // Floating point operations had to be 'fast' to enable the induction.
- FastMathFlags Flags;
- Flags.setFast();
-
- Value *MulExp = B.CreateFMul(StepValue, Index);
- if (isa<Instruction>(MulExp))
- // We have to check, the MulExp may be a constant.
- cast<Instruction>(MulExp)->setFastMathFlags(Flags);
-
- Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
- "induction");
- if (isa<Instruction>(BOp))
- cast<Instruction>(BOp)->setFastMathFlags(Flags);
-
- return BOp;
- }
- case InductionDescriptor::IK_NoInduction:
- return nullptr;
- }
- llvm_unreachable("invalid enum");
-}
-
+ LVer->prepareNoAliasMetadata();
+}
+
+Value *InnerLoopVectorizer::emitTransformedIndex(
+ IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
+ const InductionDescriptor &ID) const {
+
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto Step = ID.getStep();
+ auto StartValue = ID.getStartValue();
+ assert(Index->getType() == Step->getType() &&
+ "Index type does not match StepValue type");
+
+ // Note: the IR at this point is broken. We cannot use SE to create any new
+ // SCEV and then expand it, hoping that SCEV's simplification will give us
+  // more optimal code. Unfortunately, attempting to do so on invalid IR may
+ // lead to various SCEV crashes. So all we can do is to use builder and rely
+ // on InstCombine for future simplifications. Here we handle some trivial
+ // cases only.
+ auto CreateAdd = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isZero())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isZero())
+ return X;
+ return B.CreateAdd(X, Y);
+ };
+
+ auto CreateMul = [&B](Value *X, Value *Y) {
+ assert(X->getType() == Y->getType() && "Types don't match!");
+ if (auto *CX = dyn_cast<ConstantInt>(X))
+ if (CX->isOne())
+ return Y;
+ if (auto *CY = dyn_cast<ConstantInt>(Y))
+ if (CY->isOne())
+ return X;
+ return B.CreateMul(X, Y);
+ };
+
+ // Get a suitable insert point for SCEV expansion. For blocks in the vector
+ // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // the DomTree is not kept up-to-date for additional blocks generated in the
+ // vector loop. By using the header as insertion point, we guarantee that the
+ // expanded instructions dominate all their uses.
+ auto GetInsertPoint = [this, &B]() {
+ BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
+ if (InsertBB != LoopVectorBody &&
+ LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
+ return LoopVectorBody->getTerminator();
+ return &*B.GetInsertPoint();
+ };
+ switch (ID.getKind()) {
+ case InductionDescriptor::IK_IntInduction: {
+ assert(Index->getType() == StartValue->getType() &&
+ "Index type does not match StartValue type");
+ if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
+ return B.CreateSub(StartValue, Index);
+ auto *Offset = CreateMul(
+ Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
+ return CreateAdd(StartValue, Offset);
+ }
+ case InductionDescriptor::IK_PtrInduction: {
+ assert(isa<SCEVConstant>(Step) &&
+ "Expected constant step for pointer induction");
+ return B.CreateGEP(
+ StartValue->getType()->getPointerElementType(), StartValue,
+ CreateMul(Index,
+ Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+ }
+ case InductionDescriptor::IK_FpInduction: {
+ assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
+ auto InductionBinOp = ID.getInductionBinOp();
+ assert(InductionBinOp &&
+ (InductionBinOp->getOpcode() == Instruction::FAdd ||
+ InductionBinOp->getOpcode() == Instruction::FSub) &&
+ "Original bin op should be defined for FP induction");
+
+ Value *StepValue = cast<SCEVUnknown>(Step)->getValue();
+
+ // Floating point operations had to be 'fast' to enable the induction.
+ FastMathFlags Flags;
+ Flags.setFast();
+
+ Value *MulExp = B.CreateFMul(StepValue, Index);
+ if (isa<Instruction>(MulExp))
+      // We have to check because MulExp may be a constant.
+ cast<Instruction>(MulExp)->setFastMathFlags(Flags);
+
+ Value *BOp = B.CreateBinOp(InductionBinOp->getOpcode(), StartValue, MulExp,
+ "induction");
+ if (isa<Instruction>(BOp))
+ cast<Instruction>(BOp)->setFastMathFlags(Flags);
+
+ return BOp;
+ }
+ case InductionDescriptor::IK_NoInduction:
+ return nullptr;
+ }
+ llvm_unreachable("invalid enum");
+}
+
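A minimal scalar sketch of the mapping emitTransformedIndex builds for each induction kind, assuming a scalar Index and a constant Step; the helper names below are illustrative only, not LLVM APIs.

#include <cstdint>

// Integer induction: Start + Index * Step (a step of -1 folds to Start - Index).
int64_t transformedIntIndex(int64_t Start, int64_t Index, int64_t Step) {
  if (Step == -1)
    return Start - Index;
  return Start + Index * Step;
}

// Pointer induction: advance the start pointer by Index * Step elements,
// which is what the CreateGEP call above expresses in IR.
template <typename T>
T *transformedPtrIndex(T *Start, int64_t Index, int64_t Step) {
  return Start + Index * Step;
}

// FP induction: Start fadd/fsub (Step * Index), mirroring the original binop.
double transformedFpIndex(double Start, double Index, double Step, bool IsFSub) {
  double Mul = Step * Index;
  return IsFSub ? Start - Mul : Start + Mul;
}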
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
- LoopScalarBody = OrigLoop->getHeader();
- LoopVectorPreHeader = OrigLoop->getLoopPreheader();
+ LoopScalarBody = OrigLoop->getHeader();
+ LoopVectorPreHeader = OrigLoop->getLoopPreheader();
LoopExitBlock = OrigLoop->getUniqueExitBlock();
- assert(LoopExitBlock && "Must have an exit block");
- assert(LoopVectorPreHeader && "Invalid loop structure");
-
- LoopMiddleBlock =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+ assert(LoopExitBlock && "Must have an exit block");
+ assert(LoopVectorPreHeader && "Invalid loop structure");
+
+ LoopMiddleBlock =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
LI, nullptr, Twine(Prefix) + "middle.block");
- LoopScalarPreHeader =
- SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
+ LoopScalarPreHeader =
+ SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
nullptr, Twine(Prefix) + "scalar.ph");
// Set up branch from middle block to the exit and scalar preheader blocks.
@@ -3355,31 +3355,31 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
- // We intentionally don't let SplitBlock to update LoopInfo since
- // LoopVectorBody should belong to another loop than LoopVectorPreHeader.
- // LoopVectorBody is explicitly added to the correct place few lines later.
- LoopVectorBody =
- SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
+  // We intentionally don't let SplitBlock update LoopInfo, since
+  // LoopVectorBody should belong to a different loop than LoopVectorPreHeader.
+  // LoopVectorBody is explicitly added to the correct place a few lines later.
+ LoopVectorBody =
+ SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
nullptr, nullptr, Twine(Prefix) + "vector.body");
-
- // Update dominator for loop exit.
- DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
-
- // Create and register the new vector loop.
- Loop *Lp = LI->AllocateLoop();
- Loop *ParentLoop = OrigLoop->getParentLoop();
-
- // Insert the new loop into the loop nest and register the new basic blocks
- // before calling any utilities such as SCEV that require valid LoopInfo.
- if (ParentLoop) {
- ParentLoop->addChildLoop(Lp);
- } else {
- LI->addTopLevelLoop(Lp);
- }
- Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
+
+ // Update dominator for loop exit.
+ DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+ // Create and register the new vector loop.
+ Loop *Lp = LI->AllocateLoop();
+ Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks
+ // before calling any utilities such as SCEV that require valid LoopInfo.
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
+ } else {
+ LI->addTopLevelLoop(Lp);
+ }
+ Lp->addBasicBlockToLoop(LoopVectorBody, *LI);
return Lp;
}
-
+
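The block structure created here can be read as the following scalar model, a sketch assuming a fixed chunk size VFxUF > 0; the function and the element-wise work are placeholders, not the vectorizer's output.

#include <cstddef>

void skeletonModel(float *A, const float *B, size_t N, size_t VFxUF) {
  size_t I = 0;
  // vector.body: runs while a full VF*UF chunk of iterations remains.
  for (; I + VFxUF <= N; I += VFxUF)
    for (size_t J = 0; J < VFxUF; ++J) // stands in for one wide instruction
      A[I + J] = B[I + J] * 2.0f;
  // middle.block: decides whether the scalar remainder must run at all.
  // scalar loop: the original loop finishes the leftover iterations.
  for (; I < N; ++I)
    A[I] = B[I] * 2.0f;
}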
void InnerLoopVectorizer::createInductionResumeValues(
Loop *L, Value *VectorTripCount,
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -3387,37 +3387,37 @@ void InnerLoopVectorizer::createInductionResumeValues(
assert(((AdditionalBypass.first && AdditionalBypass.second) ||
(!AdditionalBypass.first && !AdditionalBypass.second)) &&
"Inconsistent information about additional bypass.");
- // We are going to resume the execution of the scalar loop.
- // Go over all of the induction variables that we found and fix the
- // PHIs that are left in the scalar version of the loop.
- // The starting values of PHI nodes depend on the counter of the last
- // iteration in the vectorized loop.
- // If we come from a bypass edge then we need to start from the original
- // start value.
- for (auto &InductionEntry : Legal->getInductionVars()) {
- PHINode *OrigPhi = InductionEntry.first;
- InductionDescriptor II = InductionEntry.second;
-
- // Create phi nodes to merge from the backedge-taken check block.
- PHINode *BCResumeVal =
- PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
- LoopScalarPreHeader->getTerminator());
- // Copy original phi DL over to the new one.
- BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
- Value *&EndValue = IVEndValues[OrigPhi];
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+ for (auto &InductionEntry : Legal->getInductionVars()) {
+ PHINode *OrigPhi = InductionEntry.first;
+ InductionDescriptor II = InductionEntry.second;
+
+ // Create phi nodes to merge from the backedge-taken check block.
+ PHINode *BCResumeVal =
+ PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
+ LoopScalarPreHeader->getTerminator());
+ // Copy original phi DL over to the new one.
+ BCResumeVal->setDebugLoc(OrigPhi->getDebugLoc());
+ Value *&EndValue = IVEndValues[OrigPhi];
Value *EndValueFromAdditionalBypass = AdditionalBypass.second;
- if (OrigPhi == OldInduction) {
- // We know what the end value is.
+ if (OrigPhi == OldInduction) {
+ // We know what the end value is.
EndValue = VectorTripCount;
- } else {
+ } else {
IRBuilder<> B(L->getLoopPreheader()->getTerminator());
- Type *StepType = II.getStep()->getType();
- Instruction::CastOps CastOp =
+ Type *StepType = II.getStep()->getType();
+ Instruction::CastOps CastOp =
CastInst::getCastOpcode(VectorTripCount, true, StepType, true);
Value *CRD = B.CreateCast(CastOp, VectorTripCount, StepType, "cast.crd");
- const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
- EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
- EndValue->setName("ind.end");
+ const DataLayout &DL = LoopScalarBody->getModule()->getDataLayout();
+ EndValue = emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
+ EndValue->setName("ind.end");
// Compute the end value for the additional bypass (if applicable).
if (AdditionalBypass.first) {
@@ -3430,84 +3430,84 @@ void InnerLoopVectorizer::createInductionResumeValues(
emitTransformedIndex(B, CRD, PSE.getSE(), DL, II);
EndValueFromAdditionalBypass->setName("ind.end");
}
- }
- // The new PHI merges the original incoming value, in case of a bypass,
- // or the value at the end of the vectorized loop.
- BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
-
- // Fix the scalar body counter (PHI node).
- // The old induction's phi node in the scalar body needs the truncated
- // value.
- for (BasicBlock *BB : LoopBypassBlocks)
- BCResumeVal->addIncoming(II.getStartValue(), BB);
+ }
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ BCResumeVal->addIncoming(EndValue, LoopMiddleBlock);
+
+ // Fix the scalar body counter (PHI node).
+ // The old induction's phi node in the scalar body needs the truncated
+ // value.
+ for (BasicBlock *BB : LoopBypassBlocks)
+ BCResumeVal->addIncoming(II.getStartValue(), BB);
if (AdditionalBypass.first)
BCResumeVal->setIncomingValueForBlock(AdditionalBypass.first,
EndValueFromAdditionalBypass);
- OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
- }
+ OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal);
+ }
}
-
+
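The bc.resume.val PHIs built above merge two values. A minimal sketch for a simple integer induction, assuming the end value is Start + VectorTripCount * Step; the names below are hypothetical.

#include <cstdint>

int64_t resumeValue(bool CameFromVectorLoop, int64_t Start, int64_t Step,
                    int64_t VectorTripCount) {
  // Corresponds to EndValue = emitTransformedIndex(B, CRD, ...) above.
  int64_t EndValue = Start + VectorTripCount * Step;
  // Corresponds to the bc.resume.val PHI: the end value when arriving from
  // the middle block, the original start value along any bypass edge.
  return CameFromVectorLoop ? EndValue : Start;
}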
BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
MDNode *OrigLoopID) {
assert(L && "Expected valid loop.");
-
+
// The trip counts should be cached by now.
Value *Count = getOrCreateTripCount(L);
Value *VectorTripCount = getOrCreateVectorTripCount(L);
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
- // Add a check in the middle block to see if we have completed
- // all of the iterations in the first vector loop.
- // If (N - N%VF) == N, then we *don't* need to run the remainder.
- // If tail is to be folded, we know we don't need to run the remainder.
- if (!Cost->foldTailByMasking()) {
+ // Add a check in the middle block to see if we have completed
+ // all of the iterations in the first vector loop.
+ // If (N - N%VF) == N, then we *don't* need to run the remainder.
+ // If tail is to be folded, we know we don't need to run the remainder.
+ if (!Cost->foldTailByMasking()) {
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
Count, VectorTripCount, "cmp.n",
LoopMiddleBlock->getTerminator());
-
+
// Here we use the same DebugLoc as the scalar loop latch terminator instead
- // of the corresponding compare because they may have ended up with
- // different line numbers and we want to avoid awkward line stepping while
- // debugging. Eg. if the compare has got a line number inside the loop.
+ // of the corresponding compare because they may have ended up with
+ // different line numbers and we want to avoid awkward line stepping while
+    // debugging, e.g. if the compare has a line number inside the loop.
CmpN->setDebugLoc(ScalarLatchTerm->getDebugLoc());
cast<BranchInst>(LoopMiddleBlock->getTerminator())->setCondition(CmpN);
- }
-
- // Get ready to start creating new instructions into the vectorized body.
+ }
+
+ // Get ready to start creating new instructions into the vectorized body.
assert(LoopVectorPreHeader == L->getLoopPreheader() &&
- "Inconsistent vector loop preheader");
- Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
-
- Optional<MDNode *> VectorizedLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupVectorized});
- if (VectorizedLoopID.hasValue()) {
+ "Inconsistent vector loop preheader");
+ Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+
+ Optional<MDNode *> VectorizedLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupVectorized});
+ if (VectorizedLoopID.hasValue()) {
L->setLoopID(VectorizedLoopID.getValue());
-
- // Do not setAlreadyVectorized if loop attributes have been defined
- // explicitly.
- return LoopVectorPreHeader;
- }
-
- // Keep all loop hints from the original loop on the vector loop (we'll
- // replace the vectorizer-specific hints below).
- if (MDNode *LID = OrigLoop->getLoopID())
+
+ // Do not setAlreadyVectorized if loop attributes have been defined
+ // explicitly.
+ return LoopVectorPreHeader;
+ }
+
+ // Keep all loop hints from the original loop on the vector loop (we'll
+ // replace the vectorizer-specific hints below).
+ if (MDNode *LID = OrigLoop->getLoopID())
L->setLoopID(LID);
-
+
LoopVectorizeHints Hints(L, true, *ORE);
- Hints.setAlreadyVectorized();
-
-#ifdef EXPENSIVE_CHECKS
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
- LI->verify(*DT);
-#endif
-
- return LoopVectorPreHeader;
-}
-
+ Hints.setAlreadyVectorized();
+
+#ifdef EXPENSIVE_CHECKS
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+ LI->verify(*DT);
+#endif
+
+ return LoopVectorPreHeader;
+}
+
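The cmp.n check added in the middle block reduces to the arithmetic below, shown as a sketch with plain integers standing in for the trip-count values and assuming VFxUF > 0.

#include <cstdint>

bool canSkipRemainder(uint64_t Count, uint64_t VFxUF) {
  uint64_t VectorTripCount = Count - Count % VFxUF; // N - N % VF
  return VectorTripCount == Count;                  // the "cmp.n" condition
}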
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
/*
In this function we generate a new loop. The new loop will contain
@@ -3593,376 +3593,376 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
return completeLoopSkeleton(Lp, OrigLoopID);
}
-// Fix up external users of the induction variable. At this point, we are
-// in LCSSA form, with all external PHIs that use the IV having one input value,
-// coming from the remainder loop. We need those PHIs to also have a correct
-// value for the IV when arriving directly from the middle block.
-void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
- const InductionDescriptor &II,
- Value *CountRoundDown, Value *EndValue,
- BasicBlock *MiddleBlock) {
- // There are two kinds of external IV usages - those that use the value
- // computed in the last iteration (the PHI) and those that use the penultimate
- // value (the value that feeds into the phi from the loop latch).
- // We allow both, but they, obviously, have different values.
-
+// Fix up external users of the induction variable. At this point, we are
+// in LCSSA form, with all external PHIs that use the IV having one input value,
+// coming from the remainder loop. We need those PHIs to also have a correct
+// value for the IV when arriving directly from the middle block.
+void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
+ const InductionDescriptor &II,
+ Value *CountRoundDown, Value *EndValue,
+ BasicBlock *MiddleBlock) {
+ // There are two kinds of external IV usages - those that use the value
+ // computed in the last iteration (the PHI) and those that use the penultimate
+ // value (the value that feeds into the phi from the loop latch).
+ // We allow both, but they, obviously, have different values.
+
assert(OrigLoop->getUniqueExitBlock() && "Expected a single exit block");
-
- DenseMap<Value *, Value *> MissingVals;
-
- // An external user of the last iteration's value should see the value that
- // the remainder loop uses to initialize its own IV.
- Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
- for (User *U : PostInc->users()) {
- Instruction *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
- MissingVals[UI] = EndValue;
- }
- }
-
- // An external user of the penultimate value need to see EndValue - Step.
- // The simplest way to get this is to recompute it from the constituent SCEVs,
- // that is Start + (Step * (CRD - 1)).
- for (User *U : OrigPhi->users()) {
- auto *UI = cast<Instruction>(U);
- if (!OrigLoop->contains(UI)) {
- const DataLayout &DL =
- OrigLoop->getHeader()->getModule()->getDataLayout();
- assert(isa<PHINode>(UI) && "Expected LCSSA form");
-
- IRBuilder<> B(MiddleBlock->getTerminator());
- Value *CountMinusOne = B.CreateSub(
- CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
- Value *CMO =
- !II.getStep()->getType()->isIntegerTy()
- ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
- II.getStep()->getType())
- : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
- CMO->setName("cast.cmo");
- Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
- Escape->setName("ind.escape");
- MissingVals[UI] = Escape;
- }
- }
-
- for (auto &I : MissingVals) {
- PHINode *PHI = cast<PHINode>(I.first);
- // One corner case we have to handle is two IVs "chasing" each-other,
- // that is %IV2 = phi [...], [ %IV1, %latch ]
- // In this case, if IV1 has an external use, we need to avoid adding both
- // "last value of IV1" and "penultimate value of IV2". So, verify that we
- // don't already have an incoming value for the middle block.
- if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
- PHI->addIncoming(I.second, MiddleBlock);
- }
-}
-
-namespace {
-
-struct CSEDenseMapInfo {
- static bool canHandle(const Instruction *I) {
- return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
- isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
- }
-
- static inline Instruction *getEmptyKey() {
- return DenseMapInfo<Instruction *>::getEmptyKey();
- }
-
- static inline Instruction *getTombstoneKey() {
- return DenseMapInfo<Instruction *>::getTombstoneKey();
- }
-
- static unsigned getHashValue(const Instruction *I) {
- assert(canHandle(I) && "Unknown instruction!");
- return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
- I->value_op_end()));
- }
-
- static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
- if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
- LHS == getTombstoneKey() || RHS == getTombstoneKey())
- return LHS == RHS;
- return LHS->isIdenticalTo(RHS);
- }
-};
-
-} // end anonymous namespace
-
-///Perform cse of induction variable instructions.
-static void cse(BasicBlock *BB) {
- // Perform simple cse.
- SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *In = &*I++;
-
- if (!CSEDenseMapInfo::canHandle(In))
- continue;
-
- // Check if we can replace this instruction with any of the
- // visited instructions.
- if (Instruction *V = CSEMap.lookup(In)) {
- In->replaceAllUsesWith(V);
- In->eraseFromParent();
- continue;
- }
-
- CSEMap[In] = In;
- }
-}
-
+
+ DenseMap<Value *, Value *> MissingVals;
+
+ // An external user of the last iteration's value should see the value that
+ // the remainder loop uses to initialize its own IV.
+ Value *PostInc = OrigPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch());
+ for (User *U : PostInc->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+ MissingVals[UI] = EndValue;
+ }
+ }
+
+  // An external user of the penultimate value needs to see EndValue - Step.
+ // The simplest way to get this is to recompute it from the constituent SCEVs,
+ // that is Start + (Step * (CRD - 1)).
+ for (User *U : OrigPhi->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (!OrigLoop->contains(UI)) {
+ const DataLayout &DL =
+ OrigLoop->getHeader()->getModule()->getDataLayout();
+ assert(isa<PHINode>(UI) && "Expected LCSSA form");
+
+ IRBuilder<> B(MiddleBlock->getTerminator());
+ Value *CountMinusOne = B.CreateSub(
+ CountRoundDown, ConstantInt::get(CountRoundDown->getType(), 1));
+ Value *CMO =
+ !II.getStep()->getType()->isIntegerTy()
+ ? B.CreateCast(Instruction::SIToFP, CountMinusOne,
+ II.getStep()->getType())
+ : B.CreateSExtOrTrunc(CountMinusOne, II.getStep()->getType());
+ CMO->setName("cast.cmo");
+ Value *Escape = emitTransformedIndex(B, CMO, PSE.getSE(), DL, II);
+ Escape->setName("ind.escape");
+ MissingVals[UI] = Escape;
+ }
+ }
+
+ for (auto &I : MissingVals) {
+ PHINode *PHI = cast<PHINode>(I.first);
+    // One corner case we have to handle is two IVs "chasing" each other,
+ // that is %IV2 = phi [...], [ %IV1, %latch ]
+ // In this case, if IV1 has an external use, we need to avoid adding both
+ // "last value of IV1" and "penultimate value of IV2". So, verify that we
+ // don't already have an incoming value for the middle block.
+ if (PHI->getBasicBlockIndex(MiddleBlock) == -1)
+ PHI->addIncoming(I.second, MiddleBlock);
+ }
+}
+
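A small numeric sketch of the two external IV values fixed up above, assuming an integer induction Start + k * Step and a rounded-down trip count CRD; the struct and function are hypothetical.

#include <cstdint>

struct ExternalIVValues {
  int64_t LastValue;        // seen by users of the post-increment value
  int64_t PenultimateValue; // seen by users of the PHI itself
};

ExternalIVValues externalIVValues(int64_t Start, int64_t Step, int64_t CRD) {
  // LastValue matches EndValue; PenultimateValue is Start + (CRD - 1) * Step.
  return {Start + CRD * Step, Start + (CRD - 1) * Step};
}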
+namespace {
+
+struct CSEDenseMapInfo {
+ static bool canHandle(const Instruction *I) {
+ return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+ isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
+ }
+
+ static inline Instruction *getEmptyKey() {
+ return DenseMapInfo<Instruction *>::getEmptyKey();
+ }
+
+ static inline Instruction *getTombstoneKey() {
+ return DenseMapInfo<Instruction *>::getTombstoneKey();
+ }
+
+ static unsigned getHashValue(const Instruction *I) {
+ assert(canHandle(I) && "Unknown instruction!");
+ return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
+ I->value_op_end()));
+ }
+
+ static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
+ if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
+ LHS == getTombstoneKey() || RHS == getTombstoneKey())
+ return LHS == RHS;
+ return LHS->isIdenticalTo(RHS);
+ }
+};
+
+} // end anonymous namespace
+
+/// Perform CSE of induction variable instructions.
+static void cse(BasicBlock *BB) {
+ // Perform simple cse.
+ SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
+ Instruction *In = &*I++;
+
+ if (!CSEDenseMapInfo::canHandle(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ if (Instruction *V = CSEMap.lookup(In)) {
+ In->replaceAllUsesWith(V);
+ In->eraseFromParent();
+ continue;
+ }
+
+ CSEMap[In] = In;
+ }
+}
+
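The CSE above hashes an instruction's opcode and operands and replaces later duplicates with the first occurrence. A self-contained sketch of the same idea, using strings as stand-ins for instructions.

#include <string>
#include <unordered_map>
#include <vector>

std::vector<std::string> simpleCSE(const std::vector<std::string> &Insts) {
  std::unordered_map<std::string, std::string> Seen; // mirrors CSEMap
  std::vector<std::string> Out;
  for (const std::string &In : Insts) {
    if (Seen.count(In))
      continue; // in IR this would be replaceAllUsesWith + eraseFromParent
    Seen.emplace(In, In);
    Out.push_back(In);
  }
  return Out;
}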
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF,
bool &NeedToScalarize) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Function *F = CI->getCalledFunction();
- Type *ScalarRetTy = CI->getType();
- SmallVector<Type *, 4> Tys, ScalarTys;
- for (auto &ArgOp : CI->arg_operands())
- ScalarTys.push_back(ArgOp->getType());
-
- // Estimate cost of scalarized vector call. The source operands are assumed
- // to be vectors, so we need to extract individual elements from there,
- // execute VF scalar calls, and then gather the result into the vector return
- // value.
+ Function *F = CI->getCalledFunction();
+ Type *ScalarRetTy = CI->getType();
+ SmallVector<Type *, 4> Tys, ScalarTys;
+ for (auto &ArgOp : CI->arg_operands())
+ ScalarTys.push_back(ArgOp->getType());
+
+ // Estimate cost of scalarized vector call. The source operands are assumed
+ // to be vectors, so we need to extract individual elements from there,
+ // execute VF scalar calls, and then gather the result into the vector return
+ // value.
InstructionCost ScalarCallCost =
TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys, TTI::TCK_RecipThroughput);
if (VF.isScalar())
- return ScalarCallCost;
-
- // Compute corresponding vector type for return value and arguments.
- Type *RetTy = ToVectorTy(ScalarRetTy, VF);
- for (Type *ScalarTy : ScalarTys)
- Tys.push_back(ToVectorTy(ScalarTy, VF));
-
- // Compute costs of unpacking argument values for the scalar calls and
- // packing the return values to a vector.
+ return ScalarCallCost;
+
+ // Compute corresponding vector type for return value and arguments.
+ Type *RetTy = ToVectorTy(ScalarRetTy, VF);
+ for (Type *ScalarTy : ScalarTys)
+ Tys.push_back(ToVectorTy(ScalarTy, VF));
+
+ // Compute costs of unpacking argument values for the scalar calls and
+ // packing the return values to a vector.
InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF);
-
+
InstructionCost Cost =
ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
-
- // If we can't emit a vector call for this function, then the currently found
- // cost is the cost we need to return.
- NeedToScalarize = true;
+
+ // If we can't emit a vector call for this function, then the currently found
+ // cost is the cost we need to return.
+ NeedToScalarize = true;
VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!TLI || CI->isNoBuiltin() || !VecFunc)
- return Cost;
-
- // If the corresponding vector cost is cheaper, return its cost.
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!TLI || CI->isNoBuiltin() || !VecFunc)
+ return Cost;
+
+ // If the corresponding vector cost is cheaper, return its cost.
InstructionCost VectorCallCost =
TTI.getCallInstrCost(nullptr, RetTy, Tys, TTI::TCK_RecipThroughput);
- if (VectorCallCost < Cost) {
- NeedToScalarize = false;
+ if (VectorCallCost < Cost) {
+ NeedToScalarize = false;
Cost = VectorCallCost;
- }
- return Cost;
-}
-
+ }
+ return Cost;
+}
+
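The cost comparison in getVectorCallCost amounts to the arithmetic below, a sketch that assumes the individual costs are already known as plain integers.

#include <cstdint>

uint64_t chooseCallCost(uint64_t ScalarCallCost, uint64_t VF,
                        uint64_t ScalarizationCost, bool HasVectorVariant,
                        uint64_t VectorCallCost, bool &NeedToScalarize) {
  // Scalarized cost: VF scalar calls plus extract/insert overhead.
  uint64_t Cost = ScalarCallCost * VF + ScalarizationCost;
  NeedToScalarize = true;
  // A vector library call is used only if it is strictly cheaper.
  if (HasVectorVariant && VectorCallCost < Cost) {
    NeedToScalarize = false;
    Cost = VectorCallCost;
  }
  return Cost;
}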
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
ElementCount VF) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- assert(ID && "Expected intrinsic call!");
-
- IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
- return TTI.getIntrinsicInstrCost(CostAttrs,
- TargetTransformInfo::TCK_RecipThroughput);
-}
-
-static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
-}
-
-static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
- auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
- return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
-}
-
-void InnerLoopVectorizer::truncateToMinimalBitwidths() {
- // For every instruction `I` in MinBWs, truncate the operands, create a
- // truncated version of `I` and reextend its result. InstCombine runs
- // later and will remove any ext/trunc pairs.
- SmallPtrSet<Value *, 4> Erased;
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
- // wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
- if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
- continue;
- Type *OriginalTy = I->getType();
- Type *ScalarTruncatedTy =
- IntegerType::get(OriginalTy->getContext(), KV.second);
- auto *TruncatedTy = FixedVectorType::get(
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ assert(ID && "Expected intrinsic call!");
+
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+ return TTI.getIntrinsicInstrCost(CostAttrs,
+ TargetTransformInfo::TCK_RecipThroughput);
+}
+
+static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
+ return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
+}
+
+static Type *largestIntegerVectorType(Type *T1, Type *T2) {
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
+ return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
+}
+
+void InnerLoopVectorizer::truncateToMinimalBitwidths() {
+ // For every instruction `I` in MinBWs, truncate the operands, create a
+ // truncated version of `I` and reextend its result. InstCombine runs
+ // later and will remove any ext/trunc pairs.
+ SmallPtrSet<Value *, 4> Erased;
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
+ continue;
+ Type *OriginalTy = I->getType();
+ Type *ScalarTruncatedTy =
+ IntegerType::get(OriginalTy->getContext(), KV.second);
+ auto *TruncatedTy = FixedVectorType::get(
ScalarTruncatedTy,
cast<FixedVectorType>(OriginalTy)->getNumElements());
- if (TruncatedTy == OriginalTy)
- continue;
-
- IRBuilder<> B(cast<Instruction>(I));
- auto ShrinkOperand = [&](Value *V) -> Value * {
- if (auto *ZI = dyn_cast<ZExtInst>(V))
- if (ZI->getSrcTy() == TruncatedTy)
- return ZI->getOperand(0);
- return B.CreateZExtOrTrunc(V, TruncatedTy);
- };
-
- // The actual instruction modification depends on the instruction type,
- // unfortunately.
- Value *NewI = nullptr;
- if (auto *BO = dyn_cast<BinaryOperator>(I)) {
- NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
- ShrinkOperand(BO->getOperand(1)));
-
- // Any wrapping introduced by shrinking this operation shouldn't be
- // considered undefined behavior. So, we can't unconditionally copy
- // arithmetic wrapping flags to NewI.
- cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
- } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
- NewI =
- B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
- ShrinkOperand(CI->getOperand(1)));
- } else if (auto *SI = dyn_cast<SelectInst>(I)) {
- NewI = B.CreateSelect(SI->getCondition(),
- ShrinkOperand(SI->getTrueValue()),
- ShrinkOperand(SI->getFalseValue()));
- } else if (auto *CI = dyn_cast<CastInst>(I)) {
- switch (CI->getOpcode()) {
- default:
- llvm_unreachable("Unhandled cast!");
- case Instruction::Trunc:
- NewI = ShrinkOperand(CI->getOperand(0));
- break;
- case Instruction::SExt:
- NewI = B.CreateSExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- case Instruction::ZExt:
- NewI = B.CreateZExtOrTrunc(
- CI->getOperand(0),
- smallestIntegerVectorType(OriginalTy, TruncatedTy));
- break;
- }
- } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
+ if (TruncatedTy == OriginalTy)
+ continue;
+
+ IRBuilder<> B(cast<Instruction>(I));
+ auto ShrinkOperand = [&](Value *V) -> Value * {
+ if (auto *ZI = dyn_cast<ZExtInst>(V))
+ if (ZI->getSrcTy() == TruncatedTy)
+ return ZI->getOperand(0);
+ return B.CreateZExtOrTrunc(V, TruncatedTy);
+ };
+
+ // The actual instruction modification depends on the instruction type,
+ // unfortunately.
+ Value *NewI = nullptr;
+ if (auto *BO = dyn_cast<BinaryOperator>(I)) {
+ NewI = B.CreateBinOp(BO->getOpcode(), ShrinkOperand(BO->getOperand(0)),
+ ShrinkOperand(BO->getOperand(1)));
+
+ // Any wrapping introduced by shrinking this operation shouldn't be
+ // considered undefined behavior. So, we can't unconditionally copy
+ // arithmetic wrapping flags to NewI.
+ cast<BinaryOperator>(NewI)->copyIRFlags(I, /*IncludeWrapFlags=*/false);
+ } else if (auto *CI = dyn_cast<ICmpInst>(I)) {
+ NewI =
+ B.CreateICmp(CI->getPredicate(), ShrinkOperand(CI->getOperand(0)),
+ ShrinkOperand(CI->getOperand(1)));
+ } else if (auto *SI = dyn_cast<SelectInst>(I)) {
+ NewI = B.CreateSelect(SI->getCondition(),
+ ShrinkOperand(SI->getTrueValue()),
+ ShrinkOperand(SI->getFalseValue()));
+ } else if (auto *CI = dyn_cast<CastInst>(I)) {
+ switch (CI->getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled cast!");
+ case Instruction::Trunc:
+ NewI = ShrinkOperand(CI->getOperand(0));
+ break;
+ case Instruction::SExt:
+ NewI = B.CreateSExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ case Instruction::ZExt:
+ NewI = B.CreateZExtOrTrunc(
+ CI->getOperand(0),
+ smallestIntegerVectorType(OriginalTy, TruncatedTy));
+ break;
+ }
+ } else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
auto Elements0 = cast<FixedVectorType>(SI->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements0));
+ auto *O0 = B.CreateZExtOrTrunc(
+ SI->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements0));
auto Elements1 = cast<FixedVectorType>(SI->getOperand(1)->getType())
->getNumElements();
- auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1),
- FixedVectorType::get(ScalarTruncatedTy, Elements1));
-
- NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
- } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
- // Don't do anything with the operands, just extend the result.
- continue;
- } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
+ auto *O1 = B.CreateZExtOrTrunc(
+ SI->getOperand(1),
+ FixedVectorType::get(ScalarTruncatedTy, Elements1));
+
+ NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
+ } else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
+ // Don't do anything with the operands, just extend the result.
+ continue;
+ } else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
auto Elements = cast<FixedVectorType>(IE->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
- auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
- NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
- } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
+ auto *O0 = B.CreateZExtOrTrunc(
+ IE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
+ auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
+ NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
+ } else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
auto Elements = cast<FixedVectorType>(EE->getOperand(0)->getType())
->getNumElements();
- auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0),
- FixedVectorType::get(ScalarTruncatedTy, Elements));
- NewI = B.CreateExtractElement(O0, EE->getOperand(2));
- } else {
- // If we don't know what to do, be conservative and don't do anything.
- continue;
- }
-
- // Lastly, extend the result.
- NewI->takeName(cast<Instruction>(I));
- Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
- I->replaceAllUsesWith(Res);
- cast<Instruction>(I)->eraseFromParent();
- Erased.insert(I);
- VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
- }
- }
-
- // We'll have created a bunch of ZExts that are now parentless. Clean up.
- for (const auto &KV : Cost->getMinimalBitwidths()) {
- // If the value wasn't vectorized, we must maintain the original scalar
- // type. The absence of the value from VectorLoopValueMap indicates that it
- // wasn't vectorized.
- if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
- continue;
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *I = getOrCreateVectorValue(KV.first, Part);
- ZExtInst *Inst = dyn_cast<ZExtInst>(I);
- if (Inst && Inst->use_empty()) {
- Value *NewI = Inst->getOperand(0);
- Inst->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
- }
- }
- }
-}
-
-void InnerLoopVectorizer::fixVectorizedLoop() {
- // Insert truncates and extends for any truncated instructions as hints to
- // InstCombine.
+ auto *O0 = B.CreateZExtOrTrunc(
+ EE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
+ NewI = B.CreateExtractElement(O0, EE->getOperand(2));
+ } else {
+ // If we don't know what to do, be conservative and don't do anything.
+ continue;
+ }
+
+ // Lastly, extend the result.
+ NewI->takeName(cast<Instruction>(I));
+ Value *Res = B.CreateZExtOrTrunc(NewI, OriginalTy);
+ I->replaceAllUsesWith(Res);
+ cast<Instruction>(I)->eraseFromParent();
+ Erased.insert(I);
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, Res);
+ }
+ }
+
+ // We'll have created a bunch of ZExts that are now parentless. Clean up.
+ for (const auto &KV : Cost->getMinimalBitwidths()) {
+ // If the value wasn't vectorized, we must maintain the original scalar
+ // type. The absence of the value from VectorLoopValueMap indicates that it
+ // wasn't vectorized.
+ if (!VectorLoopValueMap.hasAnyVectorValue(KV.first))
+ continue;
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *I = getOrCreateVectorValue(KV.first, Part);
+ ZExtInst *Inst = dyn_cast<ZExtInst>(I);
+ if (Inst && Inst->use_empty()) {
+ Value *NewI = Inst->getOperand(0);
+ Inst->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(KV.first, Part, NewI);
+ }
+ }
+ }
+}
+
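A scalar illustration of the minimal-bitwidth rewrite performed above, assuming the cost model has proven that only the low 8 bits of the result matter: the operation is done in the narrow type and the result is zero-extended back.

#include <cstdint>

uint32_t addInMinimalBitwidth(uint32_t A, uint32_t B) {
  uint8_t NarrowA = static_cast<uint8_t>(A);                   // trunc
  uint8_t NarrowB = static_cast<uint8_t>(B);                   // trunc
  uint8_t NarrowSum = static_cast<uint8_t>(NarrowA + NarrowB); // narrow op
  return static_cast<uint32_t>(NarrowSum);                     // zext back
}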
+void InnerLoopVectorizer::fixVectorizedLoop() {
+ // Insert truncates and extends for any truncated instructions as hints to
+ // InstCombine.
if (VF.isVector())
- truncateToMinimalBitwidths();
-
- // Fix widened non-induction PHIs by setting up the PHI operands.
- if (OrigPHIsToFix.size()) {
- assert(EnableVPlanNativePath &&
- "Unexpected non-induction PHIs for fixup in non VPlan-native path");
- fixNonInductionPHIs();
- }
-
- // At this point every instruction in the original loop is widened to a
- // vector form. Now we need to fix the recurrences in the loop. These PHI
- // nodes are currently empty because we did not want to introduce cycles.
- // This is the second stage of vectorizing recurrences.
- fixCrossIterationPHIs();
-
- // Forget the original basic block.
- PSE.getSE()->forgetLoop(OrigLoop);
-
- // Fix-up external users of the induction variables.
- for (auto &Entry : Legal->getInductionVars())
- fixupIVUsers(Entry.first, Entry.second,
- getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
- IVEndValues[Entry.first], LoopMiddleBlock);
-
- fixLCSSAPHIs();
- for (Instruction *PI : PredicatedInstructions)
- sinkScalarOperands(&*PI);
-
- // Remove redundant induction instructions.
- cse(LoopVectorBody);
-
- // Set/update profile weights for the vector and remainder loops as original
- // loop iterations are now distributed among them. Note that original loop
- // represented by LoopScalarBody becomes remainder loop after vectorization.
- //
- // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
- // end up getting slightly roughened result but that should be OK since
- // profile is not inherently precise anyway. Note also possible bypass of
- // vector code caused by legality checks is ignored, assigning all the weight
- // to the vector loop, optimistically.
+ truncateToMinimalBitwidths();
+
+ // Fix widened non-induction PHIs by setting up the PHI operands.
+ if (OrigPHIsToFix.size()) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected non-induction PHIs for fixup in non VPlan-native path");
+ fixNonInductionPHIs();
+ }
+
+ // At this point every instruction in the original loop is widened to a
+ // vector form. Now we need to fix the recurrences in the loop. These PHI
+ // nodes are currently empty because we did not want to introduce cycles.
+ // This is the second stage of vectorizing recurrences.
+ fixCrossIterationPHIs();
+
+ // Forget the original basic block.
+ PSE.getSE()->forgetLoop(OrigLoop);
+
+ // Fix-up external users of the induction variables.
+ for (auto &Entry : Legal->getInductionVars())
+ fixupIVUsers(Entry.first, Entry.second,
+ getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
+ IVEndValues[Entry.first], LoopMiddleBlock);
+
+ fixLCSSAPHIs();
+ for (Instruction *PI : PredicatedInstructions)
+ sinkScalarOperands(&*PI);
+
+ // Remove redundant induction instructions.
+ cse(LoopVectorBody);
+
+ // Set/update profile weights for the vector and remainder loops as original
+ // loop iterations are now distributed among them. Note that original loop
+ // represented by LoopScalarBody becomes remainder loop after vectorization.
+ //
+  // For cases like foldTailByMasking() and requiresScalarEpilogue() we may
+  // end up getting a slightly roughened result, but that should be OK since
+  // the profile is not inherently precise anyway. Note also that a possible
+  // bypass of the vector code caused by legality checks is ignored,
+  // optimistically assigning all the weight to the vector loop.
//
// For scalable vectorization we can't know at compile time how many iterations
// of the loop are handled in one vector iteration, so instead assume a pessimistic
@@ -3970,199 +3970,199 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
setProfileInfoAfterUnrolling(
LI->getLoopFor(LoopScalarBody), LI->getLoopFor(LoopVectorBody),
LI->getLoopFor(LoopScalarBody), VF.getKnownMinValue() * UF);
-}
-
-void InnerLoopVectorizer::fixCrossIterationPHIs() {
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #2: We now need to fix the recurrences by adding incoming edges to
- // the currently empty PHI nodes. At this point every instruction in the
- // original loop is widened to a vector form so we can use them to construct
- // the incoming edges.
- for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
- // Handle first-order recurrences and reductions that need to be fixed.
- if (Legal->isFirstOrderRecurrence(&Phi))
- fixFirstOrderRecurrence(&Phi);
- else if (Legal->isReductionVariable(&Phi))
- fixReduction(&Phi);
- }
-}
-
-void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
- // This is the second phase of vectorizing first-order recurrences. An
- // overview of the transformation is described below. Suppose we have the
- // following loop.
- //
- // for (int i = 0; i < n; ++i)
- // b[i] = a[i] - a[i - 1];
- //
- // There is a first-order recurrence on "a". For this loop, the shorthand
- // scalar IR looks like:
- //
- // scalar.ph:
- // s_init = a[-1]
- // br scalar.body
- //
- // scalar.body:
- // i = phi [0, scalar.ph], [i+1, scalar.body]
- // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
- // s2 = a[i]
- // b[i] = s2 - s1
- // br cond, scalar.body, ...
- //
- // In this example, s1 is a recurrence because it's value depends on the
- // previous iteration. In the first phase of vectorization, we created a
- // temporary value for s1. We now complete the vectorization and produce the
- // shorthand vector IR shown below (for VF = 4, UF = 1).
- //
- // vector.ph:
- // v_init = vector(..., ..., ..., a[-1])
- // br vector.body
- //
- // vector.body
- // i = phi [0, vector.ph], [i+4, vector.body]
- // v1 = phi [v_init, vector.ph], [v2, vector.body]
- // v2 = a[i, i+1, i+2, i+3];
- // v3 = vector(v1(3), v2(0, 1, 2))
- // b[i, i+1, i+2, i+3] = v2 - v3
- // br cond, vector.body, middle.block
- //
- // middle.block:
- // x = v2(3)
- // br scalar.ph
- //
- // scalar.ph:
- // s_init = phi [x, middle.block], [a[-1], otherwise]
- // br scalar.body
- //
- // After execution completes the vector loop, we extract the next value of
- // the recurrence (x) to use as the initial value in the scalar loop.
-
- // Get the original loop preheader and single loop latch.
- auto *Preheader = OrigLoop->getLoopPreheader();
- auto *Latch = OrigLoop->getLoopLatch();
-
- // Get the initial and previous values of the scalar recurrence.
- auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
- auto *Previous = Phi->getIncomingValueForBlock(Latch);
-
- // Create a vector from the initial value.
- auto *VectorInit = ScalarInit;
+}
+
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #2: We now need to fix the recurrences by adding incoming edges to
+ // the currently empty PHI nodes. At this point every instruction in the
+ // original loop is widened to a vector form so we can use them to construct
+ // the incoming edges.
+ for (PHINode &Phi : OrigLoop->getHeader()->phis()) {
+ // Handle first-order recurrences and reductions that need to be fixed.
+ if (Legal->isFirstOrderRecurrence(&Phi))
+ fixFirstOrderRecurrence(&Phi);
+ else if (Legal->isReductionVariable(&Phi))
+ fixReduction(&Phi);
+ }
+}
+
+void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
+ // This is the second phase of vectorizing first-order recurrences. An
+ // overview of the transformation is described below. Suppose we have the
+ // following loop.
+ //
+ // for (int i = 0; i < n; ++i)
+ // b[i] = a[i] - a[i - 1];
+ //
+ // There is a first-order recurrence on "a". For this loop, the shorthand
+ // scalar IR looks like:
+ //
+ // scalar.ph:
+ // s_init = a[-1]
+ // br scalar.body
+ //
+ // scalar.body:
+ // i = phi [0, scalar.ph], [i+1, scalar.body]
+ // s1 = phi [s_init, scalar.ph], [s2, scalar.body]
+ // s2 = a[i]
+ // b[i] = s2 - s1
+ // br cond, scalar.body, ...
+ //
+  // In this example, s1 is a recurrence because its value depends on the
+ // previous iteration. In the first phase of vectorization, we created a
+ // temporary value for s1. We now complete the vectorization and produce the
+ // shorthand vector IR shown below (for VF = 4, UF = 1).
+ //
+ // vector.ph:
+ // v_init = vector(..., ..., ..., a[-1])
+ // br vector.body
+ //
+ // vector.body
+ // i = phi [0, vector.ph], [i+4, vector.body]
+ // v1 = phi [v_init, vector.ph], [v2, vector.body]
+ // v2 = a[i, i+1, i+2, i+3];
+ // v3 = vector(v1(3), v2(0, 1, 2))
+ // b[i, i+1, i+2, i+3] = v2 - v3
+ // br cond, vector.body, middle.block
+ //
+ // middle.block:
+ // x = v2(3)
+ // br scalar.ph
+ //
+ // scalar.ph:
+ // s_init = phi [x, middle.block], [a[-1], otherwise]
+ // br scalar.body
+ //
+ // After execution completes the vector loop, we extract the next value of
+ // the recurrence (x) to use as the initial value in the scalar loop.
+
+ // Get the original loop preheader and single loop latch.
+ auto *Preheader = OrigLoop->getLoopPreheader();
+ auto *Latch = OrigLoop->getLoopLatch();
+
+ // Get the initial and previous values of the scalar recurrence.
+ auto *ScalarInit = Phi->getIncomingValueForBlock(Preheader);
+ auto *Previous = Phi->getIncomingValueForBlock(Latch);
+
+ // Create a vector from the initial value.
+ auto *VectorInit = ScalarInit;
if (VF.isVector()) {
- Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
- VectorInit = Builder.CreateInsertElement(
+ VectorInit = Builder.CreateInsertElement(
PoisonValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
Builder.getInt32(VF.getKnownMinValue() - 1), "vector.recur.init");
- }
-
- // We constructed a temporary phi node in the first phase of vectorization.
- // This phi node will eventually be deleted.
- Builder.SetInsertPoint(
- cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
-
- // Create a phi node for the new recurrence. The current value will either be
- // the initial value inserted into a vector or loop-varying vector value.
- auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
- VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
-
- // Get the vectorized previous value of the last part UF - 1. It appears last
- // among all unrolled iterations, due to the order of their construction.
- Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
-
- // Find and set the insertion point after the previous value if it is an
- // instruction.
- BasicBlock::iterator InsertPt;
- // Note that the previous value may have been constant-folded so it is not
- // guaranteed to be an instruction in the vector loop.
- // FIXME: Loop invariant values do not form recurrences. We should deal with
- // them earlier.
- if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
- InsertPt = LoopVectorBody->getFirstInsertionPt();
- else {
- Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
- if (isa<PHINode>(PreviousLastPart))
- // If the previous value is a phi node, we should insert after all the phi
- // nodes in the block containing the PHI to avoid breaking basic block
- // verification. Note that the basic block may be different to
- // LoopVectorBody, in case we predicate the loop.
- InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
- else
- InsertPt = ++PreviousInst->getIterator();
- }
- Builder.SetInsertPoint(&*InsertPt);
-
- // We will construct a vector for the recurrence by combining the values for
- // the current and previous iterations. This is the required shuffle mask.
+ }
+
+ // We constructed a temporary phi node in the first phase of vectorization.
+ // This phi node will eventually be deleted.
+ Builder.SetInsertPoint(
+ cast<Instruction>(VectorLoopValueMap.getVectorValue(Phi, 0)));
+
+ // Create a phi node for the new recurrence. The current value will either be
+ // the initial value inserted into a vector or loop-varying vector value.
+ auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
+ VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
+
+ // Get the vectorized previous value of the last part UF - 1. It appears last
+ // among all unrolled iterations, due to the order of their construction.
+ Value *PreviousLastPart = getOrCreateVectorValue(Previous, UF - 1);
+
+ // Find and set the insertion point after the previous value if it is an
+ // instruction.
+ BasicBlock::iterator InsertPt;
+ // Note that the previous value may have been constant-folded so it is not
+ // guaranteed to be an instruction in the vector loop.
+ // FIXME: Loop invariant values do not form recurrences. We should deal with
+ // them earlier.
+ if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousLastPart))
+ InsertPt = LoopVectorBody->getFirstInsertionPt();
+ else {
+ Instruction *PreviousInst = cast<Instruction>(PreviousLastPart);
+ if (isa<PHINode>(PreviousLastPart))
+ // If the previous value is a phi node, we should insert after all the phi
+ // nodes in the block containing the PHI to avoid breaking basic block
+ // verification. Note that the basic block may be different to
+ // LoopVectorBody, in case we predicate the loop.
+ InsertPt = PreviousInst->getParent()->getFirstInsertionPt();
+ else
+ InsertPt = ++PreviousInst->getIterator();
+ }
+ Builder.SetInsertPoint(&*InsertPt);
+
+ // We will construct a vector for the recurrence by combining the values for
+ // the current and previous iterations. This is the required shuffle mask.
assert(!VF.isScalable());
SmallVector<int, 8> ShuffleMask(VF.getKnownMinValue());
ShuffleMask[0] = VF.getKnownMinValue() - 1;
for (unsigned I = 1; I < VF.getKnownMinValue(); ++I)
ShuffleMask[I] = I + VF.getKnownMinValue() - 1;
-
- // The vector from which to take the initial value for the current iteration
- // (actual or unrolled). Initially, this is the vector phi node.
- Value *Incoming = VecPhi;
-
- // Shuffle the current and previous vector and update the vector parts.
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
- Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
+
+ // The vector from which to take the initial value for the current iteration
+ // (actual or unrolled). Initially, this is the vector phi node.
+ Value *Incoming = VecPhi;
+
+ // Shuffle the current and previous vector and update the vector parts.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
+ Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
auto *Shuffle =
VF.isVector()
? Builder.CreateShuffleVector(Incoming, PreviousPart, ShuffleMask)
: Incoming;
- PhiPart->replaceAllUsesWith(Shuffle);
- cast<Instruction>(PhiPart)->eraseFromParent();
- VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
- Incoming = PreviousPart;
- }
-
- // Fix the latch value of the new recurrence in the vector loop.
- VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
-
- // Extract the last vector element in the middle block. This will be the
- // initial value for the recurrence when jumping to the scalar loop.
- auto *ExtractForScalar = Incoming;
+ PhiPart->replaceAllUsesWith(Shuffle);
+ cast<Instruction>(PhiPart)->eraseFromParent();
+ VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
+ Incoming = PreviousPart;
+ }
+
+ // Fix the latch value of the new recurrence in the vector loop.
+ VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+
+ // Extract the last vector element in the middle block. This will be the
+ // initial value for the recurrence when jumping to the scalar loop.
+ auto *ExtractForScalar = Incoming;
if (VF.isVector()) {
- Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
- ExtractForScalar = Builder.CreateExtractElement(
+ Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
+ ExtractForScalar = Builder.CreateExtractElement(
ExtractForScalar, Builder.getInt32(VF.getKnownMinValue() - 1),
"vector.recur.extract");
- }
- // Extract the second last element in the middle block if the
- // Phi is used outside the loop. We need to extract the phi itself
- // and not the last element (the phi update in the current iteration). This
- // will be the value when jumping to the exit block from the LoopMiddleBlock,
- // when the scalar loop is not run at all.
- Value *ExtractForPhiUsedOutsideLoop = nullptr;
+ }
+ // Extract the second last element in the middle block if the
+ // Phi is used outside the loop. We need to extract the phi itself
+ // and not the last element (the phi update in the current iteration). This
+ // will be the value when jumping to the exit block from the LoopMiddleBlock,
+ // when the scalar loop is not run at all.
+ Value *ExtractForPhiUsedOutsideLoop = nullptr;
if (VF.isVector())
- ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
+ ExtractForPhiUsedOutsideLoop = Builder.CreateExtractElement(
Incoming, Builder.getInt32(VF.getKnownMinValue() - 2),
"vector.recur.extract.for.phi");
- // When loop is unrolled without vectorizing, initialize
- // ExtractForPhiUsedOutsideLoop with the value just prior to unrolled value of
- // `Incoming`. This is analogous to the vectorized case above: extracting the
- // second last element when VF > 1.
- else if (UF > 1)
- ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
-
- // Fix the initial value of the original recurrence in the scalar loop.
- Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
- auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
- for (auto *BB : predecessors(LoopScalarPreHeader)) {
- auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
- Start->addIncoming(Incoming, BB);
- }
-
- Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
- Phi->setName("scalar.recur");
-
- // Finally, fix users of the recurrence outside the loop. The users will need
- // either the last value of the scalar recurrence or the last value of the
- // vector recurrence we extracted in the middle block. Since the loop is in
- // LCSSA form, we just need to find all the phi nodes for the original scalar
- // recurrence in the exit block, and then add an edge for the middle block.
+  // When the loop is unrolled without vectorizing, initialize
+  // ExtractForPhiUsedOutsideLoop with the value just prior to the unrolled
+  // value of `Incoming`. This is analogous to the vectorized case above:
+  // extracting the second-to-last element when VF > 1.
+ else if (UF > 1)
+ ExtractForPhiUsedOutsideLoop = getOrCreateVectorValue(Previous, UF - 2);
+
+ // Fix the initial value of the original recurrence in the scalar loop.
+ Builder.SetInsertPoint(&*LoopScalarPreHeader->begin());
+ auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
+ for (auto *BB : predecessors(LoopScalarPreHeader)) {
+ auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
+ Start->addIncoming(Incoming, BB);
+ }
+
+ Phi->setIncomingValueForBlock(LoopScalarPreHeader, Start);
+ Phi->setName("scalar.recur");
+
+ // Finally, fix users of the recurrence outside the loop. The users will need
+ // either the last value of the scalar recurrence or the last value of the
+ // vector recurrence we extracted in the middle block. Since the loop is in
+ // LCSSA form, we just need to find all the phi nodes for the original scalar
+ // recurrence in the exit block, and then add an edge for the middle block.
// Note that LCSSA does not imply single entry when the original scalar loop
// had multiple exiting edges (as we always run the last iteration in the
// scalar epilogue); in that case, the exiting path through middle will be
@@ -4170,67 +4170,67 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (any_of(LCSSAPhi.incoming_values(),
[Phi](Value *V) { return V == Phi; }))
- LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
-}
-
-void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
-  // Get its reduction variable descriptor.
- assert(Legal->isReductionVariable(Phi) &&
- "Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
-
+ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
+}
+
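For orientation, here is a minimal standalone sketch (plain C++, not the LLVM API) of the lane choices described above, assuming VF = 4 and a toy array standing in for the widened recurrence update `Incoming`:

    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      int incoming[VF] = {10, 20, 30, 40}; // hypothetical recurrence updates for one vector iteration

      // The scalar epilogue resumes the recurrence from the last lane, while a
      // user of the phi outside the loop needs the value the phi held in the
      // final scalar iteration, i.e. the second-to-last lane.
      int extractForScalar = incoming[VF - 1];             // cf. "vector.recur.extract"
      int extractForPhiUsedOutsideLoop = incoming[VF - 2]; // cf. "vector.recur.extract.for.phi"
      std::printf("%d %d\n", extractForScalar, extractForPhiUsedOutsideLoop); // 40 30
      return 0;
    }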
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+  // Get its reduction variable descriptor.
+ assert(Legal->isReductionVariable(Phi) &&
+ "Unable to find the reduction variable");
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
+
RecurKind RK = RdxDesc.getRecurrenceKind();
- TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
- Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
- setDebugLocFromInst(Builder, ReductionStartValue);
+ TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+ Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+ setDebugLocFromInst(Builder, ReductionStartValue);
bool IsInLoopReductionPhi = Cost->isInLoopReduction(Phi);
-
- // This is the vector-clone of the value that leaves the loop.
- Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
-
- // Wrap flags are in general invalid after vectorization, clear them.
- clearReductionWrapFlags(RdxDesc);
-
- // Fix the vector-loop phi.
-
- // Reductions do not have to start at zero. They can start with
- // any loop invariant values.
- BasicBlock *Latch = OrigLoop->getLoopLatch();
- Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
- Value *Val = getOrCreateVectorValue(LoopVal, Part);
- cast<PHINode>(VecRdxPhi)
- ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
- }
-
- // Before each round, move the insertion point right between
- // the PHIs and the values we are going to write.
- // This allows us to write both PHINodes and the extractelement
- // instructions.
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
- setDebugLocFromInst(Builder, LoopExitInst);
-
- // If tail is folded by masking, the vector value to leave the loop should be
- // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
+
+ // This is the vector-clone of the value that leaves the loop.
+ Type *VecTy = getOrCreateVectorValue(LoopExitInst, 0)->getType();
+
+ // Wrap flags are in general invalid after vectorization, clear them.
+ clearReductionWrapFlags(RdxDesc);
+
+ // Fix the vector-loop phi.
+
+ // Reductions do not have to start at zero. They can start with
+ // any loop invariant values.
+ BasicBlock *Latch = OrigLoop->getLoopLatch();
+ Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecRdxPhi = getOrCreateVectorValue(Phi, Part);
+ Value *Val = getOrCreateVectorValue(LoopVal, Part);
+ cast<PHINode>(VecRdxPhi)
+ ->addIncoming(Val, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
+ }
+
+ // Before each round, move the insertion point right between
+ // the PHIs and the values we are going to write.
+ // This allows us to write both PHINodes and the extractelement
+ // instructions.
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+ setDebugLocFromInst(Builder, LoopExitInst);
+
+ // If tail is folded by masking, the vector value to leave the loop should be
+ // a Select choosing between the vectorized LoopExitInst and vectorized Phi,
// instead of the former. For an inloop reduction the reduction will already
// be predicated, and does not need to be handled here.
if (Cost->foldTailByMasking() && !IsInLoopReductionPhi) {
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *VecLoopExitInst =
- VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- Value *Sel = nullptr;
- for (User *U : VecLoopExitInst->users()) {
- if (isa<SelectInst>(U)) {
- assert(!Sel && "Reduction exit feeding two selects");
- Sel = U;
- } else
- assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
- }
- assert(Sel && "Reduction exit feeds no select");
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *VecLoopExitInst =
+ VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Sel = nullptr;
+ for (User *U : VecLoopExitInst->users()) {
+ if (isa<SelectInst>(U)) {
+ assert(!Sel && "Reduction exit feeding two selects");
+ Sel = U;
+ } else
+ assert(isa<PHINode>(U) && "Reduction exit must feed Phi's or select");
+ }
+ assert(Sel && "Reduction exit feeds no select");
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, Sel);
// If the target can create a predicated operator for the reduction at no
// extra cost in the loop (for example a predicated vadd), it can be
@@ -4246,140 +4246,140 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
VecRdxPhi->setIncomingValueForBlock(
LI->getLoopFor(LoopVectorBody)->getLoopLatch(), Sel);
}
- }
- }
-
- // If the vector reduction can be performed in a smaller type, we truncate
- // then extend the loop exit value to enable InstCombine to evaluate the
- // entire expression in the smaller type.
+ }
+ }
+
+ // If the vector reduction can be performed in a smaller type, we truncate
+ // then extend the loop exit value to enable InstCombine to evaluate the
+ // entire expression in the smaller type.
if (VF.isVector() && Phi->getType() != RdxDesc.getRecurrenceType()) {
assert(!IsInLoopReductionPhi && "Unexpected truncated inloop reduction!");
assert(!VF.isScalable() && "scalable vectors not yet supported.");
Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
- Builder.SetInsertPoint(
- LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
- VectorParts RdxParts(UF);
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
- : Builder.CreateZExt(Trunc, VecTy);
- for (Value::user_iterator UI = RdxParts[Part]->user_begin();
- UI != RdxParts[Part]->user_end();)
- if (*UI != Trunc) {
- (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
- RdxParts[Part] = Extnd;
- } else {
- ++UI;
- }
- }
- Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
- for (unsigned Part = 0; Part < UF; ++Part) {
- RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
- VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
- }
- }
-
- // Reduce all of the unrolled parts into a single vector.
- Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
+ Builder.SetInsertPoint(
+ LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
+ VectorParts RdxParts(UF);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ Value *Trunc = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+ : Builder.CreateZExt(Trunc, VecTy);
+ for (Value::user_iterator UI = RdxParts[Part]->user_begin();
+ UI != RdxParts[Part]->user_end();)
+ if (*UI != Trunc) {
+ (*UI++)->replaceUsesOfWith(RdxParts[Part], Extnd);
+ RdxParts[Part] = Extnd;
+ } else {
+ ++UI;
+ }
+ }
+ Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ RdxParts[Part] = Builder.CreateTrunc(RdxParts[Part], RdxVecTy);
+ VectorLoopValueMap.resetVectorValue(LoopExitInst, Part, RdxParts[Part]);
+ }
+ }
+
+ // Reduce all of the unrolled parts into a single vector.
+ Value *ReducedPartRdx = VectorLoopValueMap.getVectorValue(LoopExitInst, 0);
unsigned Op = RecurrenceDescriptor::getOpcode(RK);
-
- // The middle block terminator has already been assigned a DebugLoc here (the
- // OrigLoop's single latch terminator). We want the whole middle block to
- // appear to execute on this line because: (a) it is all compiler generated,
- // (b) these instructions are always executed after evaluating the latch
- // conditional branch, and (c) other passes may add new predecessors which
- // terminate on this line. This is the easiest way to ensure we don't
- // accidentally cause an extra step back into the loop while debugging.
- setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
- for (unsigned Part = 1; Part < UF; ++Part) {
- Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
- if (Op != Instruction::ICmp && Op != Instruction::FCmp)
- // Floating point operations had to be 'fast' to enable the reduction.
- ReducedPartRdx = addFastMathFlag(
- Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
- ReducedPartRdx, "bin.rdx"),
- RdxDesc.getFastMathFlags());
- else
+
+ // The middle block terminator has already been assigned a DebugLoc here (the
+ // OrigLoop's single latch terminator). We want the whole middle block to
+ // appear to execute on this line because: (a) it is all compiler generated,
+ // (b) these instructions are always executed after evaluating the latch
+ // conditional branch, and (c) other passes may add new predecessors which
+ // terminate on this line. This is the easiest way to ensure we don't
+ // accidentally cause an extra step back into the loop while debugging.
+ setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
+ for (unsigned Part = 1; Part < UF; ++Part) {
+ Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part);
+ if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+ // Floating point operations had to be 'fast' to enable the reduction.
+ ReducedPartRdx = addFastMathFlag(
+ Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
+ ReducedPartRdx, "bin.rdx"),
+ RdxDesc.getFastMathFlags());
+ else
ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
- }
-
+ }
+
// Create the reduction after the loop. Note that inloop reductions create the
// target reduction in the loop using a Reduction recipe.
if (VF.isVector() && !IsInLoopReductionPhi) {
- ReducedPartRdx =
+ ReducedPartRdx =
createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx);
- // If the reduction can be performed in a smaller type, we need to extend
- // the reduction to the wider type before we branch to the original loop.
- if (Phi->getType() != RdxDesc.getRecurrenceType())
- ReducedPartRdx =
- RdxDesc.isSigned()
- ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
- : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
- }
-
- // Create a phi node that merges control-flow from the backedge-taken check
- // block and the middle block.
- PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
- LoopScalarPreHeader->getTerminator());
- for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
- BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
- BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
- // Now, we need to fix the users of the reduction variable
- // inside and outside of the scalar remainder loop.
-
+ // If the reduction can be performed in a smaller type, we need to extend
+ // the reduction to the wider type before we branch to the original loop.
+ if (Phi->getType() != RdxDesc.getRecurrenceType())
+ ReducedPartRdx =
+ RdxDesc.isSigned()
+ ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+ : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+ }
+
+ // Create a phi node that merges control-flow from the backedge-taken check
+ // block and the middle block.
+ PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+ LoopScalarPreHeader->getTerminator());
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+ BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+ // Now, we need to fix the users of the reduction variable
+ // inside and outside of the scalar remainder loop.
+
// We know that the loop is in LCSSA form. We need to update the PHI nodes
// in the exit blocks. See comment on analogous loop in
  // fixFirstOrderRecurrence for a more complete explanation of the logic.
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
if (any_of(LCSSAPhi.incoming_values(),
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
- LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
+ LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx =
- Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
- assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
- // Pick the other block.
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
- Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
- Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-}
-
-void InnerLoopVectorizer::clearReductionWrapFlags(
- RecurrenceDescriptor &RdxDesc) {
+ int IncomingEdgeBlockIdx =
+ Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+ Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+ Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
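The per-part combining followed by a horizontal reduction can be pictured with ordinary arrays. A sketch under the assumptions UF = 2, VF = 4, an integer add reduction, and made-up partial sums:

    #include <array>
    #include <cstdio>
    #include <numeric>

    int main() {
      constexpr int UF = 2, VF = 4;
      // One partial accumulator vector per unroll part.
      std::array<std::array<int, VF>, UF> parts = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};

      // Step 1: fold the UF parts into a single vector (the "bin.rdx" chain).
      std::array<int, VF> rdx = parts[0];
      for (int part = 1; part < UF; ++part)
        for (int lane = 0; lane < VF; ++lane)
          rdx[lane] += parts[part][lane];

      // Step 2: horizontally reduce the surviving vector, which is what the
      // target reduction performs after the loop.
      int result = std::accumulate(rdx.begin(), rdx.end(), 0);
      std::printf("%d\n", result); // 36
      return 0;
    }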
+void InnerLoopVectorizer::clearReductionWrapFlags(
+ RecurrenceDescriptor &RdxDesc) {
RecurKind RK = RdxDesc.getRecurrenceKind();
if (RK != RecurKind::Add && RK != RecurKind::Mul)
- return;
-
- Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
- assert(LoopExitInstr && "null loop exit instruction");
- SmallVector<Instruction *, 8> Worklist;
- SmallPtrSet<Instruction *, 8> Visited;
- Worklist.push_back(LoopExitInstr);
- Visited.insert(LoopExitInstr);
-
- while (!Worklist.empty()) {
- Instruction *Cur = Worklist.pop_back_val();
- if (isa<OverflowingBinaryOperator>(Cur))
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *V = getOrCreateVectorValue(Cur, Part);
- cast<Instruction>(V)->dropPoisonGeneratingFlags();
- }
-
- for (User *U : Cur->users()) {
- Instruction *UI = cast<Instruction>(U);
- if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
- Visited.insert(UI).second)
- Worklist.push_back(UI);
- }
- }
-}
-
-void InnerLoopVectorizer::fixLCSSAPHIs() {
- for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
+ return;
+
+ Instruction *LoopExitInstr = RdxDesc.getLoopExitInstr();
+ assert(LoopExitInstr && "null loop exit instruction");
+ SmallVector<Instruction *, 8> Worklist;
+ SmallPtrSet<Instruction *, 8> Visited;
+ Worklist.push_back(LoopExitInstr);
+ Visited.insert(LoopExitInstr);
+
+ while (!Worklist.empty()) {
+ Instruction *Cur = Worklist.pop_back_val();
+ if (isa<OverflowingBinaryOperator>(Cur))
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *V = getOrCreateVectorValue(Cur, Part);
+ cast<Instruction>(V)->dropPoisonGeneratingFlags();
+ }
+
+ for (User *U : Cur->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if ((Cur != LoopExitInstr || OrigLoop->contains(UI->getParent())) &&
+ Visited.insert(UI).second)
+ Worklist.push_back(UI);
+ }
+ }
+}
+
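The traversal above is a standard worklist walk over the def-use graph. A simplified sketch with a toy node type; unlike the real code it skips the opcode and loop-membership checks and clears the flag on every reachable node:

    #include <cstdio>
    #include <set>
    #include <vector>

    struct Node {
      int id;
      bool hasWrapFlags = true; // stands in for nsw/nuw
      std::vector<Node *> users;
    };

    void clearWrapFlags(Node *loopExit) {
      std::vector<Node *> worklist{loopExit};
      std::set<Node *> visited{loopExit};
      while (!worklist.empty()) {
        Node *cur = worklist.back();
        worklist.pop_back();
        cur->hasWrapFlags = false; // analogous to dropPoisonGeneratingFlags()
        for (Node *user : cur->users)
          if (visited.insert(user).second)
            worklist.push_back(user);
      }
    }

    int main() {
      Node a{0}, b{1}, c{2};
      a.users = {&b};
      b.users = {&c};
      clearWrapFlags(&a);
      std::printf("%d %d %d\n", a.hasWrapFlags, b.hasWrapFlags, c.hasWrapFlags); // 0 0 0
      return 0;
    }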
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis()) {
if (LCSSAPhi.getBasicBlockIndex(LoopMiddleBlock) != -1)
// Some phis were already hand updated by the reduction and recurrence
// code above, leave them alone.
@@ -4401,206 +4401,206 @@ void InnerLoopVectorizer::fixLCSSAPHIs() {
Value *lastIncomingValue =
getOrCreateScalarValue(IncomingValue, { UF - 1, LastLane });
LCSSAPhi.addIncoming(lastIncomingValue, LoopMiddleBlock);
- }
-}
-
-void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
- // The basic block and loop containing the predicated instruction.
- auto *PredBB = PredInst->getParent();
- auto *VectorLoop = LI->getLoopFor(PredBB);
-
- // Initialize a worklist with the operands of the predicated instruction.
- SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
-
- // Holds instructions that we need to analyze again. An instruction may be
- // reanalyzed if we don't yet know if we can sink it or not.
- SmallVector<Instruction *, 8> InstsToReanalyze;
-
- // Returns true if a given use occurs in the predicated block. Phi nodes use
- // their operands in their corresponding predecessor blocks.
- auto isBlockOfUsePredicated = [&](Use &U) -> bool {
- auto *I = cast<Instruction>(U.getUser());
- BasicBlock *BB = I->getParent();
- if (auto *Phi = dyn_cast<PHINode>(I))
- BB = Phi->getIncomingBlock(
- PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
- return BB == PredBB;
- };
-
- // Iteratively sink the scalarized operands of the predicated instruction
-  // into the block we created for it. When an instruction is sunk, its
- // operands are then added to the worklist. The algorithm ends after one pass
- // through the worklist doesn't sink a single instruction.
- bool Changed;
- do {
- // Add the instructions that need to be reanalyzed to the worklist, and
- // reset the changed indicator.
- Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
- InstsToReanalyze.clear();
- Changed = false;
-
- while (!Worklist.empty()) {
- auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
-
- // We can't sink an instruction if it is a phi node, is already in the
- // predicated block, is not in the loop, or may have side effects.
- if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
- !VectorLoop->contains(I) || I->mayHaveSideEffects())
- continue;
-
- // It's legal to sink the instruction if all its uses occur in the
- // predicated block. Otherwise, there's nothing to do yet, and we may
- // need to reanalyze the instruction.
- if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
- InstsToReanalyze.push_back(I);
- continue;
- }
-
- // Move the instruction to the beginning of the predicated block, and add
-    // its operands to the worklist.
- I->moveBefore(&*PredBB->getFirstInsertionPt());
- Worklist.insert(I->op_begin(), I->op_end());
-
- // The sinking may have enabled other instructions to be sunk, so we will
- // need to iterate.
- Changed = true;
- }
- } while (Changed);
-}
-
-void InnerLoopVectorizer::fixNonInductionPHIs() {
- for (PHINode *OrigPhi : OrigPHIsToFix) {
- PHINode *NewPhi =
- cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
- unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
-
- SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
- predecessors(OrigPhi->getParent()));
- SmallVector<BasicBlock *, 2> VectorBBPredecessors(
- predecessors(NewPhi->getParent()));
- assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
- "Scalar and Vector BB should have the same number of predecessors");
-
- // The insertion point in Builder may be invalidated by the time we get
- // here. Force the Builder insertion point to something valid so that we do
- // not run into issues during insertion point restore in
- // getOrCreateVectorValue calls below.
- Builder.SetInsertPoint(NewPhi);
-
- // The predecessor order is preserved and we can rely on mapping between
- // scalar and vector block predecessors.
- for (unsigned i = 0; i < NumIncomingValues; ++i) {
- BasicBlock *NewPredBB = VectorBBPredecessors[i];
-
- // When looking up the new scalar/vector values to fix up, use incoming
- // values from original phi.
- Value *ScIncV =
- OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
-
- // Scalar incoming value may need a broadcast
- Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
- NewPhi->addIncoming(NewIncV, NewPredBB);
- }
- }
-}
-
+ }
+}
+
+void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
+ // The basic block and loop containing the predicated instruction.
+ auto *PredBB = PredInst->getParent();
+ auto *VectorLoop = LI->getLoopFor(PredBB);
+
+ // Initialize a worklist with the operands of the predicated instruction.
+ SetVector<Value *> Worklist(PredInst->op_begin(), PredInst->op_end());
+
+ // Holds instructions that we need to analyze again. An instruction may be
+ // reanalyzed if we don't yet know if we can sink it or not.
+ SmallVector<Instruction *, 8> InstsToReanalyze;
+
+ // Returns true if a given use occurs in the predicated block. Phi nodes use
+ // their operands in their corresponding predecessor blocks.
+ auto isBlockOfUsePredicated = [&](Use &U) -> bool {
+ auto *I = cast<Instruction>(U.getUser());
+ BasicBlock *BB = I->getParent();
+ if (auto *Phi = dyn_cast<PHINode>(I))
+ BB = Phi->getIncomingBlock(
+ PHINode::getIncomingValueNumForOperand(U.getOperandNo()));
+ return BB == PredBB;
+ };
+
+ // Iteratively sink the scalarized operands of the predicated instruction
+  // into the block we created for it. When an instruction is sunk, its
+ // operands are then added to the worklist. The algorithm ends after one pass
+ // through the worklist doesn't sink a single instruction.
+ bool Changed;
+ do {
+ // Add the instructions that need to be reanalyzed to the worklist, and
+ // reset the changed indicator.
+ Worklist.insert(InstsToReanalyze.begin(), InstsToReanalyze.end());
+ InstsToReanalyze.clear();
+ Changed = false;
+
+ while (!Worklist.empty()) {
+ auto *I = dyn_cast<Instruction>(Worklist.pop_back_val());
+
+ // We can't sink an instruction if it is a phi node, is already in the
+ // predicated block, is not in the loop, or may have side effects.
+ if (!I || isa<PHINode>(I) || I->getParent() == PredBB ||
+ !VectorLoop->contains(I) || I->mayHaveSideEffects())
+ continue;
+
+ // It's legal to sink the instruction if all its uses occur in the
+ // predicated block. Otherwise, there's nothing to do yet, and we may
+ // need to reanalyze the instruction.
+ if (!llvm::all_of(I->uses(), isBlockOfUsePredicated)) {
+ InstsToReanalyze.push_back(I);
+ continue;
+ }
+
+ // Move the instruction to the beginning of the predicated block, and add
+    // its operands to the worklist.
+ I->moveBefore(&*PredBB->getFirstInsertionPt());
+ Worklist.insert(I->op_begin(), I->op_end());
+
+ // The sinking may have enabled other instructions to be sunk, so we will
+ // need to iterate.
+ Changed = true;
+ }
+ } while (Changed);
+}
+
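The do/while structure above is a fixed-point iteration: anything that cannot be sunk yet is retried after other items move. A toy version with integers standing in for instructions and an invented "may sink once its predecessor has sunk" rule in place of the real "all uses are in the predicated block" test:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<int> pending = {3, 1, 2}; // hypothetical instruction ids
      std::vector<int> sunk;
      auto canSinkNow = [&](int v) {
        return v == 1 || std::count(sunk.begin(), sunk.end(), v - 1) > 0;
      };
      bool changed;
      do {
        changed = false;
        for (auto it = pending.begin(); it != pending.end();) {
          if (canSinkNow(*it)) {
            sunk.push_back(*it);
            it = pending.erase(it);
            changed = true; // sinking one item may enable others
          } else {
            ++it;
          }
        }
      } while (changed);
      for (int v : sunk)
        std::printf("%d ", v); // 1 2 3
      return 0;
    }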
+void InnerLoopVectorizer::fixNonInductionPHIs() {
+ for (PHINode *OrigPhi : OrigPHIsToFix) {
+ PHINode *NewPhi =
+ cast<PHINode>(VectorLoopValueMap.getVectorValue(OrigPhi, 0));
+ unsigned NumIncomingValues = OrigPhi->getNumIncomingValues();
+
+ SmallVector<BasicBlock *, 2> ScalarBBPredecessors(
+ predecessors(OrigPhi->getParent()));
+ SmallVector<BasicBlock *, 2> VectorBBPredecessors(
+ predecessors(NewPhi->getParent()));
+ assert(ScalarBBPredecessors.size() == VectorBBPredecessors.size() &&
+ "Scalar and Vector BB should have the same number of predecessors");
+
+ // The insertion point in Builder may be invalidated by the time we get
+ // here. Force the Builder insertion point to something valid so that we do
+ // not run into issues during insertion point restore in
+ // getOrCreateVectorValue calls below.
+ Builder.SetInsertPoint(NewPhi);
+
+ // The predecessor order is preserved and we can rely on mapping between
+ // scalar and vector block predecessors.
+ for (unsigned i = 0; i < NumIncomingValues; ++i) {
+ BasicBlock *NewPredBB = VectorBBPredecessors[i];
+
+ // When looking up the new scalar/vector values to fix up, use incoming
+ // values from original phi.
+ Value *ScIncV =
+ OrigPhi->getIncomingValueForBlock(ScalarBBPredecessors[i]);
+
+ // Scalar incoming value may need a broadcast
+ Value *NewIncV = getOrCreateVectorValue(ScIncV, 0);
+ NewPhi->addIncoming(NewIncV, NewPredBB);
+ }
+ }
+}
+
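The fix-up relies on one property: the scalar and vector blocks enumerate their predecessors in the same order, so incoming value i of the original phi can be paired with vector predecessor i. A small illustration with invented block names and values (the real code works with BasicBlock and Value pointers):

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int N = 2;
      std::array<const char *, N> scalarPreds = {"entry", "latch"};
      std::array<const char *, N> vectorPreds = {"vec.entry", "vec.latch"};
      std::array<int, N> incoming = {0, 7}; // incoming values of the original phi
      for (int i = 0; i < N; ++i)
        std::printf("new phi gets %d from %s (scalar pred %s)\n", incoming[i],
                    vectorPreds[i], scalarPreds[i]);
      return 0;
    }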
void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef,
VPUser &Operands, unsigned UF,
ElementCount VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant,
- VPTransformState &State) {
- // Construct a vector GEP by widening the operands of the scalar GEP as
- // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
- // results in a vector of pointers when at least one operand of the GEP
- // is vector-typed. Thus, to keep the representation compact, we only use
- // vector-typed operands for loop-varying values.
-
+ SmallBitVector &IsIndexLoopInvariant,
+ VPTransformState &State) {
+ // Construct a vector GEP by widening the operands of the scalar GEP as
+ // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+ // results in a vector of pointers when at least one operand of the GEP
+ // is vector-typed. Thus, to keep the representation compact, we only use
+ // vector-typed operands for loop-varying values.
+
if (VF.isVector() && IsPtrLoopInvariant && IsIndexLoopInvariant.all()) {
- // If we are vectorizing, but the GEP has only loop-invariant operands,
- // the GEP we build (by only using vector-typed operands for
- // loop-varying values) would be a scalar pointer. Thus, to ensure we
- // produce a vector of pointers, we need to either arbitrarily pick an
- // operand to broadcast, or broadcast a clone of the original GEP.
- // Here, we broadcast a clone of the original.
- //
- // TODO: If at some point we decide to scalarize instructions having
- // loop-invariant operands, this special case will no longer be
- // required. We would add the scalarization decision to
- // collectLoopScalars() and teach getVectorValue() to broadcast
- // the lane-zero scalar value.
- auto *Clone = Builder.Insert(GEP->clone());
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
+ // If we are vectorizing, but the GEP has only loop-invariant operands,
+ // the GEP we build (by only using vector-typed operands for
+ // loop-varying values) would be a scalar pointer. Thus, to ensure we
+ // produce a vector of pointers, we need to either arbitrarily pick an
+ // operand to broadcast, or broadcast a clone of the original GEP.
+ // Here, we broadcast a clone of the original.
+ //
+ // TODO: If at some point we decide to scalarize instructions having
+ // loop-invariant operands, this special case will no longer be
+ // required. We would add the scalarization decision to
+ // collectLoopScalars() and teach getVectorValue() to broadcast
+ // the lane-zero scalar value.
+ auto *Clone = Builder.Insert(GEP->clone());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *EntryPart = Builder.CreateVectorSplat(VF, Clone);
State.set(VPDef, GEP, EntryPart, Part);
- addMetadata(EntryPart, GEP);
- }
- } else {
- // If the GEP has at least one loop-varying operand, we are sure to
- // produce a vector of pointers. But if we are only unrolling, we want
- // to produce a scalar GEP for each unroll part. Thus, the GEP we
- // produce with the code below will be scalar (if VF == 1) or vector
- // (otherwise). Note that for the unroll-only case, we still maintain
- // values in the vector mapping with initVector, as we do for other
- // instructions.
- for (unsigned Part = 0; Part < UF; ++Part) {
- // The pointer operand of the new GEP. If it's loop-invariant, we
- // won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
- : State.get(Operands.getOperand(0), Part);
-
- // Collect all the indices for the new GEP. If any index is
- // loop-invariant, we won't broadcast it.
- SmallVector<Value *, 4> Indices;
- for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
- VPValue *Operand = Operands.getOperand(I);
- if (IsIndexLoopInvariant[I - 1])
- Indices.push_back(State.get(Operand, {0, 0}));
- else
- Indices.push_back(State.get(Operand, Part));
- }
-
- // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
- // but it should be a vector, otherwise.
- auto *NewGEP =
- GEP->isInBounds()
- ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
- Indices)
- : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
+ addMetadata(EntryPart, GEP);
+ }
+ } else {
+ // If the GEP has at least one loop-varying operand, we are sure to
+ // produce a vector of pointers. But if we are only unrolling, we want
+ // to produce a scalar GEP for each unroll part. Thus, the GEP we
+ // produce with the code below will be scalar (if VF == 1) or vector
+ // (otherwise). Note that for the unroll-only case, we still maintain
+ // values in the vector mapping with initVector, as we do for other
+ // instructions.
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // The pointer operand of the new GEP. If it's loop-invariant, we
+ // won't broadcast it.
+ auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
+ : State.get(Operands.getOperand(0), Part);
+
+ // Collect all the indices for the new GEP. If any index is
+ // loop-invariant, we won't broadcast it.
+ SmallVector<Value *, 4> Indices;
+ for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
+ VPValue *Operand = Operands.getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, {0, 0}));
+ else
+ Indices.push_back(State.get(Operand, Part));
+ }
+
+ // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+ // but it should be a vector, otherwise.
+ auto *NewGEP =
+ GEP->isInBounds()
+ ? Builder.CreateInBoundsGEP(GEP->getSourceElementType(), Ptr,
+ Indices)
+ : Builder.CreateGEP(GEP->getSourceElementType(), Ptr, Indices);
assert((VF.isScalar() || NewGEP->getType()->isVectorTy()) &&
- "NewGEP is not a pointer vector");
+ "NewGEP is not a pointer vector");
State.set(VPDef, GEP, NewGEP, Part);
- addMetadata(NewGEP, GEP);
- }
- }
-}
-
+ addMetadata(NewGEP, GEP);
+ }
+ }
+}
+
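The two branches above boil down to "splat one scalar address" versus "compute one address per lane". A sketch with plain pointers, assuming VF = 4 and made-up indices:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      int data[16] = {};
      int *base = data;

      // All operands loop-invariant: compute one scalar address and splat it
      // across the lanes (the "broadcast a clone of the original GEP" branch).
      std::array<int *, VF> splat;
      splat.fill(base + 5);

      // A loop-varying index: one address per lane, i.e. a vector of pointers
      // (the general branch, which widens only the varying operands).
      std::array<int, VF> idx = {0, 1, 2, 3};
      std::array<int *, VF> lanes;
      for (int l = 0; l < VF; ++l)
        lanes[l] = base + idx[l];

      std::printf("%td %td\n", splat[0] - data, lanes[3] - data); // 5 3
      return 0;
    }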
void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
RecurrenceDescriptor *RdxDesc,
Value *StartV, unsigned UF,
ElementCount VF) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- PHINode *P = cast<PHINode>(PN);
- if (EnableVPlanNativePath) {
- // Currently we enter here in the VPlan-native path for non-induction
- // PHIs where all control flow is uniform. We simply widen these PHIs.
- // Create a vector phi with no operands - the vector phi operands will be
- // set at the end of vector code generation.
- Type *VecTy =
+ PHINode *P = cast<PHINode>(PN);
+ if (EnableVPlanNativePath) {
+ // Currently we enter here in the VPlan-native path for non-induction
+ // PHIs where all control flow is uniform. We simply widen these PHIs.
+ // Create a vector phi with no operands - the vector phi operands will be
+ // set at the end of vector code generation.
+ Type *VecTy =
(VF.isScalar()) ? PN->getType() : VectorType::get(PN->getType(), VF);
- Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
- VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
- OrigPHIsToFix.push_back(P);
-
- return;
- }
-
- assert(PN->getParent() == OrigLoop->getHeader() &&
- "Non-header phis should have been handled elsewhere");
-
- // In order to support recurrences we need to be able to vectorize Phi nodes.
- // Phi nodes have cycles, so we need to vectorize them in two stages. This is
- // stage #1: We create a new vector PHI node with no incoming edges. We'll use
- // this value when we vectorize all of the instructions that use the PHI.
+ Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
+ VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
+ OrigPHIsToFix.push_back(P);
+
+ return;
+ }
+
+ assert(PN->getParent() == OrigLoop->getHeader() &&
+ "Non-header phis should have been handled elsewhere");
+
+ // In order to support recurrences we need to be able to vectorize Phi nodes.
+ // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+ // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+ // this value when we vectorize all of the instructions that use the PHI.
if (RdxDesc || Legal->isFirstOrderRecurrence(P)) {
Value *Iden = nullptr;
bool ScalarPHI =
@@ -4637,44 +4637,44 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
}
}
- for (unsigned Part = 0; Part < UF; ++Part) {
- // This is phase one of vectorizing PHIs.
- Value *EntryPart = PHINode::Create(
- VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
- VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // This is phase one of vectorizing PHIs.
+ Value *EntryPart = PHINode::Create(
+ VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
+ VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
if (StartV) {
// Make sure to add the reduction start value only to the
// first unroll part.
Value *StartVal = (Part == 0) ? StartV : Iden;
cast<PHINode>(EntryPart)->addIncoming(StartVal, LoopVectorPreHeader);
}
- }
- return;
- }
-
+ }
+ return;
+ }
+
assert(!Legal->isReductionVariable(P) &&
"reductions should be handled above");
- setDebugLocFromInst(Builder, P);
-
- // This PHINode must be an induction variable.
- // Make sure that we know about it.
- assert(Legal->getInductionVars().count(P) && "Not an induction variable");
-
- InductionDescriptor II = Legal->getInductionVars().lookup(P);
- const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-
- // FIXME: The newly created binary instructions should contain nsw/nuw flags,
- // which can be found from the original scalar operations.
- switch (II.getKind()) {
- case InductionDescriptor::IK_NoInduction:
- llvm_unreachable("Unknown induction");
- case InductionDescriptor::IK_IntInduction:
- case InductionDescriptor::IK_FpInduction:
- llvm_unreachable("Integer/fp induction is handled elsewhere.");
- case InductionDescriptor::IK_PtrInduction: {
- // Handle the pointer induction variable case.
- assert(P->getType()->isPointerTy() && "Unexpected type.");
+ setDebugLocFromInst(Builder, P);
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars().count(P) && "Not an induction variable");
+
+ InductionDescriptor II = Legal->getInductionVars().lookup(P);
+ const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+
+ // FIXME: The newly created binary instructions should contain nsw/nuw flags,
+ // which can be found from the original scalar operations.
+ switch (II.getKind()) {
+ case InductionDescriptor::IK_NoInduction:
+ llvm_unreachable("Unknown induction");
+ case InductionDescriptor::IK_IntInduction:
+ case InductionDescriptor::IK_FpInduction:
+ llvm_unreachable("Integer/fp induction is handled elsewhere.");
+ case InductionDescriptor::IK_PtrInduction: {
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
if (Cost->isScalarAfterVectorization(P, VF)) {
// This is the normalized GEP that starts counting at zero.
@@ -4695,9 +4695,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
SclrGep->setName("next.gep");
VectorLoopValueMap.setScalarValue(P, {Part, Lane}, SclrGep);
}
- }
+ }
return;
- }
+ }
assert(isa<SCEVConstant>(II.getStep()) &&
"Induction step not a SCEV constant!");
Type *PhiType = II.getStep()->getType();
@@ -4743,271 +4743,271 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
"vector.gep"));
VectorLoopValueMap.setVectorValue(P, Part, GEP);
}
- }
- }
-}
-
-/// A helper function for checking whether an integer division-related
-/// instruction may divide by zero (in which case it must be predicated if
-/// executed conditionally in the scalar code).
-/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
-/// Non-zero divisors that are non compile-time constants will not be
-/// converted into multiplication, so we will still end up scalarizing
-/// the division, but can do so w/o predication.
-static bool mayDivideByZero(Instruction &I) {
- assert((I.getOpcode() == Instruction::UDiv ||
- I.getOpcode() == Instruction::SDiv ||
- I.getOpcode() == Instruction::URem ||
- I.getOpcode() == Instruction::SRem) &&
- "Unexpected instruction");
- Value *Divisor = I.getOperand(1);
- auto *CInt = dyn_cast<ConstantInt>(Divisor);
- return !CInt || CInt->isZero();
-}
-
+ }
+ }
+}
+
+/// A helper function for checking whether an integer division-related
+/// instruction may divide by zero (in which case it must be predicated if
+/// executed conditionally in the scalar code).
+/// TODO: It may be worthwhile to generalize and check isKnownNonZero().
+/// Non-zero divisors that are non compile-time constants will not be
+/// converted into multiplication, so we will still end up scalarizing
+/// the division, but can do so w/o predication.
+static bool mayDivideByZero(Instruction &I) {
+ assert((I.getOpcode() == Instruction::UDiv ||
+ I.getOpcode() == Instruction::SDiv ||
+ I.getOpcode() == Instruction::URem ||
+ I.getOpcode() == Instruction::SRem) &&
+ "Unexpected instruction");
+ Value *Divisor = I.getOperand(1);
+ auto *CInt = dyn_cast<ConstantInt>(Divisor);
+ return !CInt || CInt->isZero();
+}
+
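The same decision in miniature: a divisor is safe only when it is a compile-time constant known to be non-zero; anything else (an unknown value, or the constant 0 itself) must be treated as possibly dividing by zero. A sketch using std::optional in place of a ConstantInt lookup:

    #include <cstdio>
    #include <optional>

    bool mayDivideByZero(std::optional<int> constantDivisor) {
      return !constantDivisor || *constantDivisor == 0;
    }

    int main() {
      std::printf("%d %d %d\n",
                  mayDivideByZero(std::nullopt), // unknown divisor -> 1
                  mayDivideByZero(0),            // constant zero   -> 1
                  mayDivideByZero(7));           // constant non-0  -> 0
      return 0;
    }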
void InnerLoopVectorizer::widenInstruction(Instruction &I, VPValue *Def,
VPUser &User,
- VPTransformState &State) {
- switch (I.getOpcode()) {
- case Instruction::Call:
- case Instruction::Br:
- case Instruction::PHI:
- case Instruction::GetElementPtr:
- case Instruction::Select:
- llvm_unreachable("This instruction is handled by a different recipe.");
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::FNeg:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen unops and binops.
- setDebugLocFromInst(Builder, &I);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 2> Ops;
- for (VPValue *VPOp : User.operands())
- Ops.push_back(State.get(VPOp, Part));
-
- Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
-
- if (auto *VecOp = dyn_cast<Instruction>(V))
- VecOp->copyIRFlags(&I);
-
- // Use this vector value for all users of the original instruction.
+ VPTransformState &State) {
+ switch (I.getOpcode()) {
+ case Instruction::Call:
+ case Instruction::Br:
+ case Instruction::PHI:
+ case Instruction::GetElementPtr:
+ case Instruction::Select:
+ llvm_unreachable("This instruction is handled by a different recipe.");
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::FNeg:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen unops and binops.
+ setDebugLocFromInst(Builder, &I);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 2> Ops;
+ for (VPValue *VPOp : User.operands())
+ Ops.push_back(State.get(VPOp, Part));
+
+ Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
+
+ if (auto *VecOp = dyn_cast<Instruction>(V))
+ VecOp->copyIRFlags(&I);
+
+ // Use this vector value for all users of the original instruction.
State.set(Def, &I, V, Part);
- addMetadata(V, &I);
- }
-
- break;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (I.getOpcode() == Instruction::FCmp);
- auto *Cmp = cast<CmpInst>(&I);
- setDebugLocFromInst(Builder, Cmp);
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *B = State.get(User.getOperand(1), Part);
- Value *C = nullptr;
- if (FCmp) {
- // Propagate fast math flags.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
- Builder.setFastMathFlags(Cmp->getFastMathFlags());
- C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- } else {
- C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- }
+ addMetadata(V, &I);
+ }
+
+ break;
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (I.getOpcode() == Instruction::FCmp);
+ auto *Cmp = cast<CmpInst>(&I);
+ setDebugLocFromInst(Builder, Cmp);
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *B = State.get(User.getOperand(1), Part);
+ Value *C = nullptr;
+ if (FCmp) {
+ // Propagate fast math flags.
+ IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ Builder.setFastMathFlags(Cmp->getFastMathFlags());
+ C = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ } else {
+ C = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ }
State.set(Def, &I, C, Part);
- addMetadata(C, &I);
- }
-
- break;
- }
-
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- auto *CI = cast<CastInst>(&I);
- setDebugLocFromInst(Builder, CI);
-
- /// Vectorize casts.
- Type *DestTy =
+ addMetadata(C, &I);
+ }
+
+ break;
+ }
+
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ auto *CI = cast<CastInst>(&I);
+ setDebugLocFromInst(Builder, CI);
+
+ /// Vectorize casts.
+ Type *DestTy =
(VF.isScalar()) ? CI->getType() : VectorType::get(CI->getType(), VF);
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = State.get(User.getOperand(0), Part);
- Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
State.set(Def, &I, Cast, Part);
- addMetadata(Cast, &I);
- }
- break;
- }
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
-}
-
+ addMetadata(Cast, &I);
+ }
+ break;
+ }
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
+
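"Just widen unops and binops" means the scalar opcode is re-emitted once per unroll part on whole vectors instead of once per scalar iteration. A sketch with UF = 2, VF = 4, and an integer add on invented operand values:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int UF = 2, VF = 4;
      std::array<std::array<int, VF>, UF> a = {{{1, 2, 3, 4}, {5, 6, 7, 8}}};
      std::array<std::array<int, VF>, UF> b = {{{10, 10, 10, 10}, {20, 20, 20, 20}}};
      std::array<std::array<int, VF>, UF> sum;

      for (int part = 0; part < UF; ++part)
        for (int lane = 0; lane < VF; ++lane)
          sum[part][lane] = a[part][lane] + b[part][lane];

      std::printf("%d %d\n", sum[0][0], sum[1][3]); // 11 28
      return 0;
    }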
void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPValue *Def,
VPUser &ArgOperands,
- VPTransformState &State) {
- assert(!isa<DbgInfoIntrinsic>(I) &&
- "DbgInfoIntrinsic should have been dropped during VPlan construction");
- setDebugLocFromInst(Builder, &I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
-
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
+ VPTransformState &State) {
+ assert(!isa<DbgInfoIntrinsic>(I) &&
+ "DbgInfoIntrinsic should have been dropped during VPlan construction");
+ setDebugLocFromInst(Builder, &I);
+
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
+
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
Tys.push_back(ToVectorTy(ArgOperand->getType(), VF.getKnownMinValue()));
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
InstructionCost CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? Cost->getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (auto &I : enumerate(ArgOperands.operands())) {
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- Value *Arg;
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
- Arg = State.get(I.value(), Part);
- else
- Arg = State.get(I.value(), {0, 0});
- Args.push_back(Arg);
- }
-
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (auto &I : enumerate(ArgOperands.operands())) {
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ Value *Arg;
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ Arg = State.get(I.value(), Part);
+ else
+ Arg = State.get(I.value(), {0, 0});
+ Args.push_back(Arg);
+ }
+
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
if (VF.isVector()) {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
}
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- assert(VectorF && "Can't retrieve vector intrinsic.");
- } else {
- // Use vector version of the function call.
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ assert(VectorF && "Can't retrieve vector intrinsic.");
+ } else {
+ // Use vector version of the function call.
const VFShape Shape = VFShape::get(*CI, VF, false /*HasGlobalPred*/);
-#ifndef NDEBUG
- assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
- "Can't create vector function.");
-#endif
- VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
- }
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
-
- if (isa<FPMathOperator>(V))
- V->copyFastMathFlags(CI);
-
+#ifndef NDEBUG
+ assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ "Can't create vector function.");
+#endif
+ VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ }
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
+
+ if (isa<FPMathOperator>(V))
+ V->copyFastMathFlags(CI);
+
State.set(Def, &I, V, Part);
- addMetadata(V, &I);
- }
-}
-
+ addMetadata(V, &I);
+ }
+}
+
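The intrinsic-versus-library-call decision is, at its core, a cost comparison. In miniature, with invented cost numbers and a hypothetical intrinsic id:

    #include <cstdio>

    int main() {
      unsigned id = 42;                     // hypothetical intrinsic id; 0 would mean "none"
      int callCost = 12, intrinsicCost = 8; // invented costs
      bool useVectorIntrinsic = id != 0 && intrinsicCost <= callCost;
      std::printf("use intrinsic: %d\n", useVectorIntrinsic); // 1
      return 0;
    }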
void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I, VPValue *VPDef,
- VPUser &Operands,
- bool InvariantCond,
- VPTransformState &State) {
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- auto *InvarCond =
- InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond =
- InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
- Value *Op0 = State.get(Operands.getOperand(1), Part);
- Value *Op1 = State.get(Operands.getOperand(2), Part);
- Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
+ VPUser &Operands,
+ bool InvariantCond,
+ VPTransformState &State) {
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond =
+ InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
+ Value *Op0 = State.get(Operands.getOperand(1), Part);
+ Value *Op1 = State.get(Operands.getOperand(2), Part);
+ Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
State.set(VPDef, &I, Sel, Part);
- addMetadata(Sel, &I);
- }
-}
-
+ addMetadata(Sel, &I);
+ }
+}
+
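The invariant-condition special case above can be pictured as "one scalar bool picks an entire vector operand", versus the general case where a per-lane mask selects element-wise. A sketch assuming VF = 4 and toy operand values:

    #include <array>
    #include <cstdio>

    int main() {
      constexpr int VF = 4;
      std::array<int, VF> x = {1, 2, 3, 4}, y = {9, 8, 7, 6}, out;

      // Invariant condition: lane 0 of the widened condition decides for all lanes.
      bool invarCond = true;
      out = invarCond ? x : y;

      // Varying condition: a per-lane mask selects element-wise.
      std::array<bool, VF> mask = {true, false, true, false};
      for (int l = 0; l < VF; ++l)
        out[l] = mask[l] ? x[l] : y[l];

      std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 1 8 3 6
      return 0;
    }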
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
- // We should not collect Scalars more than once per VF. Right now, this
- // function is called from collectUniformsAndScalars(), which already does
- // this check. Collecting Scalars for VF=1 does not make any sense.
+ // We should not collect Scalars more than once per VF. Right now, this
+ // function is called from collectUniformsAndScalars(), which already does
+ // this check. Collecting Scalars for VF=1 does not make any sense.
assert(VF.isVector() && Scalars.find(VF) == Scalars.end() &&
- "This function should not be visited twice for the same VF");
-
- SmallSetVector<Instruction *, 8> Worklist;
-
- // These sets are used to seed the analysis with pointers used by memory
- // accesses that will remain scalar.
- SmallSetVector<Instruction *, 8> ScalarPtrs;
- SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+ "This function should not be visited twice for the same VF");
+
+ SmallSetVector<Instruction *, 8> Worklist;
+
+ // These sets are used to seed the analysis with pointers used by memory
+ // accesses that will remain scalar.
+ SmallSetVector<Instruction *, 8> ScalarPtrs;
+ SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
auto *Latch = TheLoop->getLoopLatch();
-
- // A helper that returns true if the use of Ptr by MemAccess will be scalar.
- // The pointer operands of loads and stores will be scalar as long as the
- // memory access is not a gather or scatter operation. The value operand of a
- // store will remain scalar if the store is scalarized.
- auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
- InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
- if (auto *Store = dyn_cast<StoreInst>(MemAccess))
- if (Ptr == Store->getValueOperand())
- return WideningDecision == CM_Scalarize;
- assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
- "Ptr is neither a value or pointer operand");
- return WideningDecision != CM_GatherScatter;
- };
-
- // A helper that returns true if the given value is a bitcast or
- // getelementptr instruction contained in the loop.
- auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
- return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
- isa<GetElementPtrInst>(V)) &&
- !TheLoop->isLoopInvariant(V);
- };
-
+
+ // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+ // The pointer operands of loads and stores will be scalar as long as the
+ // memory access is not a gather or scatter operation. The value operand of a
+ // store will remain scalar if the store is scalarized.
+ auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+ InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+ if (Ptr == Store->getValueOperand())
+ return WideningDecision == CM_Scalarize;
+ assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
+ "Ptr is neither a value or pointer operand");
+ return WideningDecision != CM_GatherScatter;
+ };
+
+ // A helper that returns true if the given value is a bitcast or
+ // getelementptr instruction contained in the loop.
+ auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+ return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+ isa<GetElementPtrInst>(V)) &&
+ !TheLoop->isLoopInvariant(V);
+ };
+
auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
if (!isa<PHINode>(Ptr) ||
!Legal->getInductionVars().count(cast<PHINode>(Ptr)))
@@ -5023,7 +5023,7 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
// inserted into Worklist. If the use will be a scalar use, and the
// pointer is only used by memory accesses, we place the pointer in
// ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
- auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+ auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
if (isScalarPtrInduction(MemAccess, Ptr)) {
Worklist.insert(cast<Instruction>(Ptr));
Instruction *Update = cast<Instruction>(
@@ -5035,286 +5035,286 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
<< "\n");
return;
}
- // We only care about bitcast and getelementptr instructions contained in
- // the loop.
- if (!isLoopVaryingBitCastOrGEP(Ptr))
- return;
-
- // If the pointer has already been identified as scalar (e.g., if it was
- // also identified as uniform), there's nothing to do.
- auto *I = cast<Instruction>(Ptr);
- if (Worklist.count(I))
- return;
-
- // If the use of the pointer will be a scalar use, and all users of the
- // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
- // place the pointer in PossibleNonScalarPtrs.
- if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
- return isa<LoadInst>(U) || isa<StoreInst>(U);
- }))
- ScalarPtrs.insert(I);
- else
- PossibleNonScalarPtrs.insert(I);
- };
-
- // We seed the scalars analysis with three classes of instructions: (1)
+ // We only care about bitcast and getelementptr instructions contained in
+ // the loop.
+ if (!isLoopVaryingBitCastOrGEP(Ptr))
+ return;
+
+ // If the pointer has already been identified as scalar (e.g., if it was
+ // also identified as uniform), there's nothing to do.
+ auto *I = cast<Instruction>(Ptr);
+ if (Worklist.count(I))
+ return;
+
+ // If the use of the pointer will be a scalar use, and all users of the
+ // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+ // place the pointer in PossibleNonScalarPtrs.
+ if (isScalarUse(MemAccess, Ptr) && llvm::all_of(I->users(), [&](User *U) {
+ return isa<LoadInst>(U) || isa<StoreInst>(U);
+ }))
+ ScalarPtrs.insert(I);
+ else
+ PossibleNonScalarPtrs.insert(I);
+ };
+
+ // We seed the scalars analysis with three classes of instructions: (1)
// instructions marked uniform-after-vectorization and (2) bitcast,
// getelementptr and (pointer) phi instructions used by memory accesses
// requiring a scalar use.
- //
- // (1) Add to the worklist all instructions that have been identified as
- // uniform-after-vectorization.
- Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
-
- // (2) Add to the worklist all bitcast and getelementptr instructions used by
- // memory accesses requiring a scalar use. The pointer operands of loads and
-  // stores will be scalar as long as the memory access is not a gather or
- // scatter operation. The value operand of a store will remain scalar if the
- // store is scalarized.
- for (auto *BB : TheLoop->blocks())
- for (auto &I : *BB) {
- if (auto *Load = dyn_cast<LoadInst>(&I)) {
- evaluatePtrUse(Load, Load->getPointerOperand());
- } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
- evaluatePtrUse(Store, Store->getPointerOperand());
- evaluatePtrUse(Store, Store->getValueOperand());
- }
- }
- for (auto *I : ScalarPtrs)
- if (!PossibleNonScalarPtrs.count(I)) {
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
- Worklist.insert(I);
- }
-
- // Insert the forced scalars.
- // FIXME: Currently widenPHIInstruction() often creates a dead vector
- // induction variable when the PHI user is scalarized.
- auto ForcedScalar = ForcedScalars.find(VF);
- if (ForcedScalar != ForcedScalars.end())
- for (auto *I : ForcedScalar->second)
- Worklist.insert(I);
-
- // Expand the worklist by looking through any bitcasts and getelementptr
- // instructions we've already identified as scalar. This is similar to the
- // expansion step in collectLoopUniforms(); however, here we're only
- // expanding to include additional bitcasts and getelementptr instructions.
- unsigned Idx = 0;
- while (Idx != Worklist.size()) {
- Instruction *Dst = Worklist[Idx++];
- if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
- continue;
- auto *Src = cast<Instruction>(Dst->getOperand(0));
- if (llvm::all_of(Src->users(), [&](User *U) -> bool {
- auto *J = cast<Instruction>(U);
- return !TheLoop->contains(J) || Worklist.count(J) ||
- ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
- isScalarUse(J, Src));
- })) {
- Worklist.insert(Src);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
- }
- }
-
- // An induction variable will remain scalar if all users of the induction
- // variable and induction variable update remain scalar.
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
- // If tail-folding is applied, the primary induction variable will be used
- // to feed a vector compare.
- if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
- continue;
-
- // Determine if all users of the induction variable are scalar after
- // vectorization.
- auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
- });
- if (!ScalarInd)
- continue;
-
- // Determine if all users of the induction variable update instruction are
- // scalar after vectorization.
- auto ScalarIndUpdate =
- llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
- });
- if (!ScalarIndUpdate)
- continue;
-
- // The induction variable and its update instruction will remain scalar.
- Worklist.insert(Ind);
- Worklist.insert(IndUpdate);
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
- LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
- << "\n");
- }
-
- Scalars[VF].insert(Worklist.begin(), Worklist.end());
-}
-
+ //
+ // (1) Add to the worklist all instructions that have been identified as
+ // uniform-after-vectorization.
+ Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+ // (2) Add to the worklist all bitcast and getelementptr instructions used by
+ // memory accesses requiring a scalar use. The pointer operands of loads and
+ // stores will be scalar as long as the memory access is not a gather or
+ // scatter operation. The value operand of a store will remain scalar if the
+ // store is scalarized.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ if (auto *Load = dyn_cast<LoadInst>(&I)) {
+ evaluatePtrUse(Load, Load->getPointerOperand());
+ } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+ evaluatePtrUse(Store, Store->getPointerOperand());
+ evaluatePtrUse(Store, Store->getValueOperand());
+ }
+ }
+ for (auto *I : ScalarPtrs)
+ if (!PossibleNonScalarPtrs.count(I)) {
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+ Worklist.insert(I);
+ }
+
+ // Insert the forced scalars.
+ // FIXME: Currently widenPHIInstruction() often creates a dead vector
+ // induction variable when the PHI user is scalarized.
+ auto ForcedScalar = ForcedScalars.find(VF);
+ if (ForcedScalar != ForcedScalars.end())
+ for (auto *I : ForcedScalar->second)
+ Worklist.insert(I);
+
+ // Expand the worklist by looking through any bitcasts and getelementptr
+ // instructions we've already identified as scalar. This is similar to the
+ // expansion step in collectLoopUniforms(); however, here we're only
+ // expanding to include additional bitcasts and getelementptr instructions.
+ unsigned Idx = 0;
+ while (Idx != Worklist.size()) {
+ Instruction *Dst = Worklist[Idx++];
+ if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+ continue;
+ auto *Src = cast<Instruction>(Dst->getOperand(0));
+ if (llvm::all_of(Src->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
+ return !TheLoop->contains(J) || Worklist.count(J) ||
+ ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+ isScalarUse(J, Src));
+ })) {
+ Worklist.insert(Src);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+ }
+ }
+
+ // An induction variable will remain scalar if all users of the induction
+ // variable and induction variable update remain scalar.
+ for (auto &Induction : Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // If tail-folding is applied, the primary induction variable will be used
+ // to feed a vector compare.
+ if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
+ continue;
+
+ // Determine if all users of the induction variable are scalar after
+ // vectorization.
+ auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // scalar after vectorization.
+ auto ScalarIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
+ });
+ if (!ScalarIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain scalar.
+ Worklist.insert(Ind);
+ Worklist.insert(IndUpdate);
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+ LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
+ << "\n");
+ }
+
+ Scalars[VF].insert(Worklist.begin(), Worklist.end());
+}
+
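
The function above is a fixed-point worklist computation: seed a set, then repeatedly admit an operand once every one of its users is already acceptable. A minimal standalone sketch of that pattern, with a placeholder Node type and operandsOf/usersOf/isAcceptableUser callbacks standing in for Instruction operands, use lists and the scalar-use test (all assumptions, not LLVM API):

#include <cstddef>
#include <functional>
#include <set>
#include <vector>

using Node = int; // placeholder for Instruction *

// Generic worklist expansion: an operand is admitted once every one of its
// users is either already in the set or accepted by the extra filter.
std::set<Node> expandWorklist(
    const std::vector<Node> &Seeds,
    const std::function<std::vector<Node>(Node)> &operandsOf,
    const std::function<std::vector<Node>(Node)> &usersOf,
    const std::function<bool(Node)> &isAcceptableUser) {
  std::set<Node> InSet(Seeds.begin(), Seeds.end());
  std::vector<Node> Worklist(Seeds.begin(), Seeds.end());
  for (std::size_t Idx = 0; Idx != Worklist.size(); ++Idx) {
    for (Node Src : operandsOf(Worklist[Idx])) {
      if (InSet.count(Src))
        continue;
      bool AllUsersOk = true;
      for (Node U : usersOf(Src))
        if (!InSet.count(U) && !isAcceptableUser(U)) {
          AllUsersOk = false;
          break;
        }
      if (AllUsersOk) {
        InSet.insert(Src);
        Worklist.push_back(Src);
      }
    }
  }
  return InSet;
}
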
bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I,
ElementCount VF) {
- if (!blockNeedsPredication(I->getParent()))
- return false;
- switch(I->getOpcode()) {
- default:
- break;
- case Instruction::Load:
- case Instruction::Store: {
- if (!Legal->isMaskRequired(I))
- return false;
- auto *Ptr = getLoadStorePointerOperand(I);
- auto *Ty = getMemInstValueType(I);
- // We have already decided how to vectorize this instruction, get that
- // result.
+ if (!blockNeedsPredication(I->getParent()))
+ return false;
+ switch(I->getOpcode()) {
+ default:
+ break;
+ case Instruction::Load:
+ case Instruction::Store: {
+ if (!Legal->isMaskRequired(I))
+ return false;
+ auto *Ptr = getLoadStorePointerOperand(I);
+ auto *Ty = getMemInstValueType(I);
+ // We have already decided how to vectorize this instruction, get that
+ // result.
if (VF.isVector()) {
- InstWidening WideningDecision = getWideningDecision(I, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
- return WideningDecision == CM_Scalarize;
- }
- const Align Alignment = getLoadStoreAlignment(I);
- return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
- isLegalMaskedGather(Ty, Alignment))
- : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
- isLegalMaskedScatter(Ty, Alignment));
- }
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::SRem:
- case Instruction::URem:
- return mayDivideByZero(*I);
- }
- return false;
-}
-
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+ return WideningDecision == CM_Scalarize;
+ }
+ const Align Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
+ isLegalMaskedGather(Ty, Alignment))
+ : !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
+ isLegalMaskedScatter(Ty, Alignment));
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return mayDivideByZero(*I);
+ }
+ return false;
+}
+
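
The UDiv/SDiv/SRem/URem cases above are scalarized under predication because executing a division speculatively for masked-off lanes could trap. An illustrative source loop of that shape (not taken from the patch):

// The divide is guarded by the branch: vectorizing it unconditionally could
// divide by zero on lanes where b[i] == 0, so under predication the divide is
// scalarized and predicated instead.
void guardedDiv(int *a, const int *b, int n) {
  for (int i = 0; i < n; ++i)
    if (b[i] != 0)
      a[i] = a[i] / b[i];
}
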
bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
Instruction *I, ElementCount VF) {
- assert(isAccessInterleaved(I) && "Expecting interleaved access.");
- assert(getWideningDecision(I, VF) == CM_Unknown &&
- "Decision should not be set yet.");
- auto *Group = getInterleavedAccessGroup(I);
- assert(Group && "Must have a group.");
-
- // If the instruction's allocated size doesn't equal its type size, it
- // requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = getMemInstValueType(I);
+ assert(isAccessInterleaved(I) && "Expecting interleaved access.");
+ assert(getWideningDecision(I, VF) == CM_Unknown &&
+ "Decision should not be set yet.");
+ auto *Group = getInterleavedAccessGroup(I);
+ assert(Group && "Must have a group.");
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = getMemInstValueType(I);
if (hasIrregularType(ScalarTy, DL))
- return false;
-
- // Check if masking is required.
- // A Group may need masking for one of two reasons: it resides in a block that
- // needs predication, or it was decided to use masking to deal with gaps.
- bool PredicatedAccessRequiresMasking =
- Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
- bool AccessWithGapsRequiresMasking =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
- if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
- return true;
-
- // If masked interleaving is required, we expect that the user/target had
- // enabled it, because otherwise it either wouldn't have been created or
- // it should have been invalidated by the CostModel.
- assert(useMaskedInterleavedAccesses(TTI) &&
- "Masked interleave-groups for predicated accesses are not enabled.");
-
- auto *Ty = getMemInstValueType(I);
- const Align Alignment = getLoadStoreAlignment(I);
- return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
- : TTI.isLegalMaskedStore(Ty, Alignment);
-}
-
+ return false;
+
+ // Check if masking is required.
+ // A Group may need masking for one of two reasons: it resides in a block that
+ // needs predication, or it was decided to use masking to deal with gaps.
+ bool PredicatedAccessRequiresMasking =
+ Legal->blockNeedsPredication(I->getParent()) && Legal->isMaskRequired(I);
+ bool AccessWithGapsRequiresMasking =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+ if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
+ return true;
+
+ // If masked interleaving is required, we expect that the user/target had
+ // enabled it, because otherwise it either wouldn't have been created or
+ // it should have been invalidated by the CostModel.
+ assert(useMaskedInterleavedAccesses(TTI) &&
+ "Masked interleave-groups for predicated accesses are not enabled.");
+
+ auto *Ty = getMemInstValueType(I);
+ const Align Alignment = getLoadStoreAlignment(I);
+ return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
+ : TTI.isLegalMaskedStore(Ty, Alignment);
+}
+
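
The masking decision above boils down to two independent reasons plus a final legality query. A condensed restatement as a free-standing predicate, with every parameter standing in for the corresponding Legal/Group/TTI query (the irregular-type check is omitted for brevity):

// An interleave group can be widened when it needs no mask at all, or when
// masked interleaved accesses are enabled and the masked form is legal.
bool canWidenInterleaveGroup(bool BlockNeedsPredication, bool MaskRequired,
                             bool GroupNeedsScalarEpilogue,
                             bool ScalarEpilogueAllowed,
                             bool MaskedInterleaveEnabled, bool LegalMaskedOp) {
  bool PredicatedAccessRequiresMasking = BlockNeedsPredication && MaskRequired;
  bool AccessWithGapsRequiresMasking =
      GroupNeedsScalarEpilogue && !ScalarEpilogueAllowed;
  if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
    return true;
  return MaskedInterleaveEnabled && LegalMaskedOp;
}
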
bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
Instruction *I, ElementCount VF) {
- // Get and ensure we have a valid memory instruction.
- LoadInst *LI = dyn_cast<LoadInst>(I);
- StoreInst *SI = dyn_cast<StoreInst>(I);
- assert((LI || SI) && "Invalid memory instruction");
-
- auto *Ptr = getLoadStorePointerOperand(I);
-
- // In order to be widened, the pointer should be consecutive, first of all.
- if (!Legal->isConsecutivePtr(Ptr))
- return false;
-
- // If the instruction is a store located in a predicated block, it will be
- // scalarized.
- if (isScalarWithPredication(I))
- return false;
-
- // If the instruction's allocated size doesn't equal its type size, it
- // requires padding and will be scalarized.
- auto &DL = I->getModule()->getDataLayout();
- auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ // Get and ensure we have a valid memory instruction.
+ LoadInst *LI = dyn_cast<LoadInst>(I);
+ StoreInst *SI = dyn_cast<StoreInst>(I);
+ assert((LI || SI) && "Invalid memory instruction");
+
+ auto *Ptr = getLoadStorePointerOperand(I);
+
+ // In order to be widened, the pointer should be consecutive, first of all.
+ if (!Legal->isConsecutivePtr(Ptr))
+ return false;
+
+ // If the instruction is a store located in a predicated block, it will be
+ // scalarized.
+ if (isScalarWithPredication(I))
+ return false;
+
+ // If the instruction's allocated size doesn't equal its type size, it
+ // requires padding and will be scalarized.
+ auto &DL = I->getModule()->getDataLayout();
+ auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
if (hasIrregularType(ScalarTy, DL))
- return false;
-
- return true;
-}
-
+ return false;
+
+ return true;
+}
+
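
The allocated-size-versus-type-size test above is a padding check. A hedged standalone sketch: assuming the usual DataLayout rounding to the ABI alignment, an i32 needs no padding while an odd-sized integer such as i20 is allocated in a wider slot, so packing VF copies tightly into a vector would not match VF scalar allocations:

#include <cstdint>
#include <iostream>

// A type whose in-memory allocation is rounded up beyond its nominal bit
// width cannot be widened as a tightly packed vector (assumed rounding rule).
bool requiresPadding(uint64_t TypeSizeInBits, uint64_t AbiAlignBytes) {
  uint64_t SlotBits = 8 * AbiAlignBytes;
  uint64_t AllocBits = (TypeSizeInBits + SlotBits - 1) / SlotBits * SlotBits;
  return AllocBits != TypeSizeInBits;
}

int main() {
  std::cout << requiresPadding(32, 4) << '\n'; // i32: 0, can be widened
  std::cout << requiresPadding(20, 4) << '\n'; // i20: 1, will be scalarized
}
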
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
- // We should not collect Uniforms more than once per VF. Right now,
- // this function is called from collectUniformsAndScalars(), which
- // already does this check. Collecting Uniforms for VF=1 does not make any
- // sense.
-
+ // We should not collect Uniforms more than once per VF. Right now,
+ // this function is called from collectUniformsAndScalars(), which
+ // already does this check. Collecting Uniforms for VF=1 does not make any
+ // sense.
+
assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() &&
- "This function should not be visited twice for the same VF");
-
- // Visit the list of Uniforms. If we do not find any uniform value, we will
- // not analyze again. Uniforms.count(VF) will return 1.
- Uniforms[VF].clear();
-
- // We now know that the loop is vectorizable!
- // Collect instructions inside the loop that will remain uniform after
- // vectorization.
-
- // Global values, params and instructions outside of the current loop are out of
- // scope.
- auto isOutOfScope = [&](Value *V) -> bool {
- Instruction *I = dyn_cast<Instruction>(V);
- return (!I || !TheLoop->contains(I));
- };
-
- SetVector<Instruction *> Worklist;
- BasicBlock *Latch = TheLoop->getLoopLatch();
-
- // Instructions that are scalar with predication must not be considered
- // uniform after vectorization, because that would create an erroneous
- // replicating region where only a single instance out of VF should be formed.
- // TODO: optimize such rare cases if found important, see PR40816.
- auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
+ "This function should not be visited twice for the same VF");
+
+ // Visit the list of Uniforms. If we do not find any uniform value, we will
+ // not analyze again. Uniforms.count(VF) will return 1.
+ Uniforms[VF].clear();
+
+ // We now know that the loop is vectorizable!
+ // Collect instructions inside the loop that will remain uniform after
+ // vectorization.
+
+ // Global values, params and instructions outside of the current loop are out of
+ // scope.
+ auto isOutOfScope = [&](Value *V) -> bool {
+ Instruction *I = dyn_cast<Instruction>(V);
+ return (!I || !TheLoop->contains(I));
+ };
+
+ SetVector<Instruction *> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+
+ // Instructions that are scalar with predication must not be considered
+ // uniform after vectorization, because that would create an erroneous
+ // replicating region where only a single instance out of VF should be formed.
+ // TODO: optimize such rare cases if found important, see PR40816.
+ auto addToWorklistIfAllowed = [&](Instruction *I) -> void {
if (isOutOfScope(I)) {
LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
<< *I << "\n");
return;
}
- if (isScalarWithPredication(I, VF)) {
- LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
- << *I << "\n");
- return;
- }
- LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
- Worklist.insert(I);
- };
-
- // Start with the conditional branch. If the branch condition is an
- // instruction contained in the loop that is only used by the branch, it is
- // uniform.
- auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
- if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
- addToWorklistIfAllowed(Cmp);
-
+ if (isScalarWithPredication(I, VF)) {
+ LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: "
+ << *I << "\n");
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
+ Worklist.insert(I);
+ };
+
+ // Start with the conditional branch. If the branch condition is an
+ // instruction contained in the loop that is only used by the branch, it is
+ // uniform.
+ auto *Cmp = dyn_cast<Instruction>(Latch->getTerminator()->getOperand(0));
+ if (Cmp && TheLoop->contains(Cmp) && Cmp->hasOneUse())
+ addToWorklistIfAllowed(Cmp);
+
auto isUniformDecision = [&](Instruction *I, ElementCount VF) {
- InstWidening WideningDecision = getWideningDecision(I, VF);
- assert(WideningDecision != CM_Unknown &&
- "Widening decision should be ready at this moment");
-
+ InstWidening WideningDecision = getWideningDecision(I, VF);
+ assert(WideningDecision != CM_Unknown &&
+ "Widening decision should be ready at this moment");
+
// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(*I)) {
@@ -5322,10 +5322,10 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
return true;
}
- return (WideningDecision == CM_Widen ||
- WideningDecision == CM_Widen_Reverse ||
- WideningDecision == CM_Interleave);
- };
+ return (WideningDecision == CM_Widen ||
+ WideningDecision == CM_Widen_Reverse ||
+ WideningDecision == CM_Interleave);
+ };
// Returns true if Ptr is the pointer operand of a memory access instruction
@@ -5343,24 +5343,24 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
// Scan the loop for instructions which are either a) known to have only
// lane 0 demanded or b) are uses which demand only lane 0 of their operand.
- for (auto *BB : TheLoop->blocks())
- for (auto &I : *BB) {
- // If there's no pointer operand, there's nothing to do.
+ for (auto *BB : TheLoop->blocks())
+ for (auto &I : *BB) {
+ // If there's no pointer operand, there's nothing to do.
auto *Ptr = getLoadStorePointerOperand(&I);
- if (!Ptr)
- continue;
-
+ if (!Ptr)
+ continue;
+
// A uniform memory op is itself uniform. We exclude uniform stores
// here as they demand the last lane, not the first one.
if (isa<LoadInst>(I) && Legal->isUniformMemOp(I))
addToWorklistIfAllowed(&I);
-
+
if (isUniformDecision(&I, VF)) {
assert(isVectorizedMemAccessUse(&I, Ptr) && "consistency check");
HasUniformUse.insert(Ptr);
}
- }
-
+ }
+
// Add to the worklist any operands which have *only* uniform (e.g. lane 0
// demanding) users. Since loops are assumed to be in LCSSA form, this
// disallows uses outside the loop as well.
@@ -5375,156 +5375,156 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
if (UsersAreMemAccesses)
addToWorklistIfAllowed(I);
}
-
- // Expand Worklist in topological order: whenever a new instruction
- // is added, its users should already be inside Worklist. This ensures
- // a uniform instruction will only be used by uniform instructions.
- unsigned idx = 0;
- while (idx != Worklist.size()) {
- Instruction *I = Worklist[idx++];
-
- for (auto OV : I->operand_values()) {
- // isOutOfScope operands cannot be uniform instructions.
- if (isOutOfScope(OV))
- continue;
- // First order recurrence Phi's should typically be considered
- // non-uniform.
- auto *OP = dyn_cast<PHINode>(OV);
- if (OP && Legal->isFirstOrderRecurrence(OP))
- continue;
- // If all the users of the operand are uniform, then add the
- // operand into the uniform worklist.
- auto *OI = cast<Instruction>(OV);
- if (llvm::all_of(OI->users(), [&](User *U) -> bool {
- auto *J = cast<Instruction>(U);
+
+ // Expand Worklist in topological order: whenever a new instruction
+ // is added, its users should already be inside Worklist. This ensures
+ // a uniform instruction will only be used by uniform instructions.
+ unsigned idx = 0;
+ while (idx != Worklist.size()) {
+ Instruction *I = Worklist[idx++];
+
+ for (auto OV : I->operand_values()) {
+ // isOutOfScope operands cannot be uniform instructions.
+ if (isOutOfScope(OV))
+ continue;
+ // First order recurrence Phi's should typically be considered
+ // non-uniform.
+ auto *OP = dyn_cast<PHINode>(OV);
+ if (OP && Legal->isFirstOrderRecurrence(OP))
+ continue;
+ // If all the users of the operand are uniform, then add the
+ // operand into the uniform worklist.
+ auto *OI = cast<Instruction>(OV);
+ if (llvm::all_of(OI->users(), [&](User *U) -> bool {
+ auto *J = cast<Instruction>(U);
return Worklist.count(J) || isVectorizedMemAccessUse(J, OI);
- }))
- addToWorklistIfAllowed(OI);
- }
- }
-
- // For an instruction to be added into Worklist above, all its users inside
- // the loop should also be in Worklist. However, this condition cannot be
- // true for phi nodes that form a cyclic dependence. We must process phi
- // nodes separately. An induction variable will remain uniform if all users
- // of the induction variable and induction variable update remain uniform.
- // The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : Legal->getInductionVars()) {
- auto *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
-
- // Determine if all users of the induction variable are uniform after
- // vectorization.
- auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, Ind);
- });
- if (!UniformInd)
- continue;
-
- // Determine if all users of the induction variable update instruction are
- // uniform after vectorization.
- auto UniformIndUpdate =
- llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- auto *I = cast<Instruction>(U);
- return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
- isVectorizedMemAccessUse(I, IndUpdate);
- });
- if (!UniformIndUpdate)
- continue;
-
- // The induction variable and its update instruction will remain uniform.
- addToWorklistIfAllowed(Ind);
- addToWorklistIfAllowed(IndUpdate);
- }
-
- Uniforms[VF].insert(Worklist.begin(), Worklist.end());
-}
-
-bool LoopVectorizationCostModel::runtimeChecksRequired() {
- LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
-
- if (Legal->getRuntimePointerChecking()->Need) {
- reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
- "runtime pointer checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- if (!PSE.getUnionPredicate().getPredicates().empty()) {
- reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
- "runtime SCEV checks needed. Enable vectorization of this "
- "loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- // FIXME: Avoid specializing for stride==1 instead of bailing out.
- if (!Legal->getLAI()->getSymbolicStrides().empty()) {
- reportVectorizationFailure("Runtime stride check for small trip count",
- "runtime stride == 1 checks needed. Enable vectorization of "
- "this loop without such check by compiling with -Os/-Oz",
- "CantVersionLoopWithOptForSize", ORE, TheLoop);
- return true;
- }
-
- return false;
-}
-
+ }))
+ addToWorklistIfAllowed(OI);
+ }
+ }
+
+ // For an instruction to be added into Worklist above, all its users inside
+ // the loop should also be in Worklist. However, this condition cannot be
+ // true for phi nodes that form a cyclic dependence. We must process phi
+ // nodes separately. An induction variable will remain uniform if all users
+ // of the induction variable and induction variable update remain uniform.
+ // The code below handles both pointer and non-pointer induction variables.
+ for (auto &Induction : Legal->getInductionVars()) {
+ auto *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+
+ // Determine if all users of the induction variable are uniform after
+ // vectorization.
+ auto UniformInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, Ind);
+ });
+ if (!UniformInd)
+ continue;
+
+ // Determine if all users of the induction variable update instruction are
+ // uniform after vectorization.
+ auto UniformIndUpdate =
+ llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ auto *I = cast<Instruction>(U);
+ return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
+ isVectorizedMemAccessUse(I, IndUpdate);
+ });
+ if (!UniformIndUpdate)
+ continue;
+
+ // The induction variable and its update instruction will remain uniform.
+ addToWorklistIfAllowed(Ind);
+ addToWorklistIfAllowed(IndUpdate);
+ }
+
+ Uniforms[VF].insert(Worklist.begin(), Worklist.end());
+}
+
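
The induction-variable handling above is special because the PHI and its update form a two-node cycle, so neither can pass the generic all-users-already-uniform rule alone. A standalone sketch of the pairwise check (users outside the loop are ignored here for brevity; the integer ids and sets are illustrative stand-ins):

#include <set>
#include <vector>

// Ind and IndUpdate stay uniform only if, ignoring each other, every one of
// their users is already known to be uniform.
bool inductionPairStaysUniform(int Ind, int IndUpdate,
                               const std::vector<int> &IndUsers,
                               const std::vector<int> &IndUpdateUsers,
                               const std::set<int> &Uniform) {
  auto allOk = [&](const std::vector<int> &Users, int Other) {
    for (int U : Users)
      if (U != Other && !Uniform.count(U))
        return false;
    return true;
  };
  return allOk(IndUsers, IndUpdate) && allOk(IndUpdateUsers, Ind);
}
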
+bool LoopVectorizationCostModel::runtimeChecksRequired() {
+ LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
+
+ if (Legal->getRuntimePointerChecking()->Need) {
+ reportVectorizationFailure("Runtime ptr check is required with -Os/-Oz",
+ "runtime pointer checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ if (!PSE.getUnionPredicate().getPredicates().empty()) {
+ reportVectorizationFailure("Runtime SCEV check is required with -Os/-Oz",
+ "runtime SCEV checks needed. Enable vectorization of this "
+ "loop with '#pragma clang loop vectorize(enable)' when "
+ "compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ // FIXME: Avoid specializing for stride==1 instead of bailing out.
+ if (!Legal->getLAI()->getSymbolicStrides().empty()) {
+ reportVectorizationFailure("Runtime stride check for small trip count",
+ "runtime stride == 1 checks needed. Enable vectorization of "
+ "this loop without such check by compiling with -Os/-Oz",
+ "CantVersionLoopWithOptForSize", ORE, TheLoop);
+ return true;
+ }
+
+ return false;
+}
+
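
All three diagnostics above point at the same escape hatch when building with -Os/-Oz: opting the loop in explicitly. A minimal example of that opt-in (the function itself is illustrative):

// Under -Os/-Oz a loop that needs runtime pointer checks is normally rejected;
// the pragma re-enables vectorization for this specific loop.
void saxpy(float *a, const float *b, float k, int n) {
#pragma clang loop vectorize(enable)
  for (int i = 0; i < n; ++i)
    a[i] += k * b[i];
}
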
Optional<ElementCount>
LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
- if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
-    // TODO: It may be useful to do this since it's still likely to be dynamically
- // uniform if the target can skip.
- reportVectorizationFailure(
- "Not inserting runtime ptr check for divergent target",
- "runtime pointer checks needed. Not enabled for divergent target",
- "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
- return None;
- }
-
- unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
- LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
- if (TC == 1) {
- reportVectorizationFailure("Single iteration (non) loop",
- "loop trip count is one, irrelevant for vectorization",
- "SingleIterationLoop", ORE, TheLoop);
- return None;
- }
-
- switch (ScalarEpilogueStatus) {
- case CM_ScalarEpilogueAllowed:
+ if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
+    // TODO: It may be useful to do this since it's still likely to be dynamically
+ // uniform if the target can skip.
+ reportVectorizationFailure(
+ "Not inserting runtime ptr check for divergent target",
+ "runtime pointer checks needed. Not enabled for divergent target",
+ "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
+ return None;
+ }
+
+ unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+ LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+ if (TC == 1) {
+ reportVectorizationFailure("Single iteration (non) loop",
+ "loop trip count is one, irrelevant for vectorization",
+ "SingleIterationLoop", ORE, TheLoop);
+ return None;
+ }
+
+ switch (ScalarEpilogueStatus) {
+ case CM_ScalarEpilogueAllowed:
return computeFeasibleMaxVF(TC, UserVF);
case CM_ScalarEpilogueNotAllowedUsePredicate:
LLVM_FALLTHROUGH;
- case CM_ScalarEpilogueNotNeededUsePredicate:
- LLVM_DEBUG(
- dbgs() << "LV: vector predicate hint/switch found.\n"
- << "LV: Not allowing scalar epilogue, creating predicated "
- << "vector loop.\n");
- break;
- case CM_ScalarEpilogueNotAllowedLowTripLoop:
- // fallthrough as a special case of OptForSize
- case CM_ScalarEpilogueNotAllowedOptSize:
- if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
- LLVM_DEBUG(
- dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
- else
- LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
- << "count.\n");
-
- // Bail if runtime checks are required, which are not good when optimising
- // for size.
- if (runtimeChecksRequired())
- return None;
-
- break;
- }
-
+ case CM_ScalarEpilogueNotNeededUsePredicate:
+ LLVM_DEBUG(
+ dbgs() << "LV: vector predicate hint/switch found.\n"
+ << "LV: Not allowing scalar epilogue, creating predicated "
+ << "vector loop.\n");
+ break;
+ case CM_ScalarEpilogueNotAllowedLowTripLoop:
+ // fallthrough as a special case of OptForSize
+ case CM_ScalarEpilogueNotAllowedOptSize:
+ if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
+ LLVM_DEBUG(
+ dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
+ else
+ LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
+ << "count.\n");
+
+ // Bail if runtime checks are required, which are not good when optimising
+ // for size.
+ if (runtimeChecksRequired())
+ return None;
+
+ break;
+ }
+
// The only loops we can vectorize without a scalar epilogue, are loops with
// a bottom-test and a single exiting block. We'd have to handle the fact
// that not every instruction executes on the last iteration. This will
@@ -5541,18 +5541,18 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
- // Now try the tail folding
-
- // Invalidate interleave groups that require an epilogue if we can't mask
- // the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI)) {
- assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
- "No decisions should have been taken at this point");
- // Note: There is no need to invalidate any cost modeling decisions here, as
-    // none were taken so far.
- InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
- }
-
+ // Now try the tail folding
+
+ // Invalidate interleave groups that require an epilogue if we can't mask
+ // the interleave-group.
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+ "No decisions should have been taken at this point");
+ // Note: There is no need to invalidate any cost modeling decisions here, as
+    // none were taken so far.
+ InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ }
+
ElementCount MaxVF = computeFeasibleMaxVF(TC, UserVF);
assert(!MaxVF.isScalable() &&
"Scalable vectors do not yet support tail folding");
@@ -5569,20 +5569,20 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
const SCEV *Rem = SE->getURemExpr(
ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
if (Rem->isZero()) {
- // Accept MaxVF if we do not have a tail.
- LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
- return MaxVF;
- }
-
- // If we don't know the precise trip count, or if the trip count that we
- // found modulo the vectorization factor is not zero, try to fold the tail
- // by masking.
- // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
- if (Legal->prepareToFoldTailByMasking()) {
- FoldTailByMasking = true;
- return MaxVF;
- }
-
+ // Accept MaxVF if we do not have a tail.
+ LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+ return MaxVF;
+ }
+
+ // If we don't know the precise trip count, or if the trip count that we
+ // found modulo the vectorization factor is not zero, try to fold the tail
+ // by masking.
+ // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
+ if (Legal->prepareToFoldTailByMasking()) {
+ FoldTailByMasking = true;
+ return MaxVF;
+ }
+
// If there was a tail-folding hint/switch, but we can't fold the tail by
// masking, fallback to a vectorization with a scalar epilogue.
if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
@@ -5597,23 +5597,23 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
return None;
}
- if (TC == 0) {
- reportVectorizationFailure(
- "Unable to calculate the loop count due to complex control flow",
- "unable to calculate the loop count due to complex control flow",
- "UnknownLoopCountComplexCFG", ORE, TheLoop);
- return None;
- }
-
- reportVectorizationFailure(
- "Cannot optimize for size and vectorize at the same time.",
- "cannot optimize for size and vectorize at the same time. "
- "Enable vectorization of this loop with '#pragma clang loop "
- "vectorize(enable)' when compiling with -Os/-Oz",
- "NoTailLoopWithOptForSize", ORE, TheLoop);
- return None;
-}
-
+ if (TC == 0) {
+ reportVectorizationFailure(
+ "Unable to calculate the loop count due to complex control flow",
+ "unable to calculate the loop count due to complex control flow",
+ "UnknownLoopCountComplexCFG", ORE, TheLoop);
+ return None;
+ }
+
+ reportVectorizationFailure(
+ "Cannot optimize for size and vectorize at the same time.",
+ "cannot optimize for size and vectorize at the same time. "
+ "Enable vectorization of this loop with '#pragma clang loop "
+ "vectorize(enable)' when compiling with -Os/-Oz",
+ "NoTailLoopWithOptForSize", ORE, TheLoop);
+ return None;
+}
+
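
The tail check above takes the trip count modulo MaxVF * IC. A small worked sketch with assumed numbers (MaxVF = 8, IC = 2): a trip count of 1024 leaves no remainder and needs no tail, while 1000 leaves 8 iterations that must be handled by tail folding or a scalar epilogue:

#include <cstdint>
#include <iostream>

// Remainder of the trip count modulo MaxVF * IC decides whether a tail is
// left over after the vectorized iterations.
uint64_t tailIterations(uint64_t TripCount, uint64_t MaxVF, uint64_t IC) {
  return TripCount % (MaxVF * IC);
}

int main() {
  std::cout << tailIterations(1024, 8, 2) << '\n'; // 0 -> no tail remains
  std::cout << tailIterations(1000, 8, 2) << '\n'; // 8 -> fold tail or epilogue
}
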
ElementCount
LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
ElementCount UserVF) {
@@ -5641,24 +5641,24 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
Legal->isSafeForAnyVectorWidth())
return UserVF;
- MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
- unsigned SmallestType, WidestType;
- std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
- unsigned WidestRegister = TTI.getRegisterBitWidth(true);
-
- // Get the maximum safe dependence distance in bits computed by LAA.
- // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
- // the memory access that is most restrictive (involved in the smallest
- // dependence distance).
+ MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
+ unsigned SmallestType, WidestType;
+ std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
+ unsigned WidestRegister = TTI.getRegisterBitWidth(true);
+
+ // Get the maximum safe dependence distance in bits computed by LAA.
+ // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
+ // the memory access that is most restrictive (involved in the smallest
+ // dependence distance).
unsigned MaxSafeVectorWidthInBits = Legal->getMaxSafeVectorWidthInBits();
-
+
// If the user vectorization factor is legally unsafe, clamp it to a safe
// value. Otherwise, return as is.
if (UserVF.isNonZero() && !IgnoreScalableUserVF) {
unsigned MaxSafeElements =
PowerOf2Floor(MaxSafeVectorWidthInBits / WidestType);
ElementCount MaxSafeVF = ElementCount::getFixed(MaxSafeElements);
-
+
if (UserVF.isScalable()) {
Optional<unsigned> MaxVScale = TTI.getMaxVScale();
@@ -5707,71 +5707,71 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount,
WidestRegister = std::min(WidestRegister, MaxSafeVectorWidthInBits);
- // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
- // Note that both WidestRegister and WidestType may not be powers of 2.
- unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
-
- LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
- << " / " << WidestType << " bits.\n");
- LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
- << WidestRegister << " bits.\n");
-
+ // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+ // Note that both WidestRegister and WidestType may not be powers of 2.
+ unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
+
+ LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
+ << " / " << WidestType << " bits.\n");
+ LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
+ << WidestRegister << " bits.\n");
+
assert(MaxVectorSize <= WidestRegister &&
"Did not expect to pack so many elements"
" into one vector!");
- if (MaxVectorSize == 0) {
- LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
- MaxVectorSize = 1;
+ if (MaxVectorSize == 0) {
+ LLVM_DEBUG(dbgs() << "LV: The target has no vector registers.\n");
+ MaxVectorSize = 1;
return ElementCount::getFixed(MaxVectorSize);
- } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
- isPowerOf2_32(ConstTripCount)) {
- // We need to clamp the VF to be the ConstTripCount. There is no point in
- // choosing a higher viable VF as done in the loop below.
- LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
- << ConstTripCount << "\n");
- MaxVectorSize = ConstTripCount;
+ } else if (ConstTripCount && ConstTripCount < MaxVectorSize &&
+ isPowerOf2_32(ConstTripCount)) {
+ // We need to clamp the VF to be the ConstTripCount. There is no point in
+ // choosing a higher viable VF as done in the loop below.
+ LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to the constant trip count: "
+ << ConstTripCount << "\n");
+ MaxVectorSize = ConstTripCount;
return ElementCount::getFixed(MaxVectorSize);
- }
-
- unsigned MaxVF = MaxVectorSize;
- if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
- (MaximizeBandwidth && isScalarEpilogueAllowed())) {
- // Collect all viable vectorization factors larger than the default MaxVF
- // (i.e. MaxVectorSize).
+ }
+
+ unsigned MaxVF = MaxVectorSize;
+ if (TTI.shouldMaximizeVectorBandwidth(!isScalarEpilogueAllowed()) ||
+ (MaximizeBandwidth && isScalarEpilogueAllowed())) {
+ // Collect all viable vectorization factors larger than the default MaxVF
+ // (i.e. MaxVectorSize).
SmallVector<ElementCount, 8> VFs;
- unsigned NewMaxVectorSize = WidestRegister / SmallestType;
- for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
+ unsigned NewMaxVectorSize = WidestRegister / SmallestType;
+ for (unsigned VS = MaxVectorSize * 2; VS <= NewMaxVectorSize; VS *= 2)
VFs.push_back(ElementCount::getFixed(VS));
-
- // For each VF calculate its register usage.
- auto RUs = calculateRegisterUsage(VFs);
-
- // Select the largest VF which doesn't require more registers than existing
- // ones.
- for (int i = RUs.size() - 1; i >= 0; --i) {
- bool Selected = true;
- for (auto& pair : RUs[i].MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- if (pair.second > TargetNumRegisters)
- Selected = false;
- }
- if (Selected) {
+
+ // For each VF calculate its register usage.
+ auto RUs = calculateRegisterUsage(VFs);
+
+ // Select the largest VF which doesn't require more registers than existing
+ // ones.
+ for (int i = RUs.size() - 1; i >= 0; --i) {
+ bool Selected = true;
+ for (auto& pair : RUs[i].MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ if (pair.second > TargetNumRegisters)
+ Selected = false;
+ }
+ if (Selected) {
MaxVF = VFs[i].getKnownMinValue();
- break;
- }
- }
- if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
- if (MaxVF < MinVF) {
- LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
- << ") with target's minimum: " << MinVF << '\n');
- MaxVF = MinVF;
- }
- }
- }
+ break;
+ }
+ }
+ if (unsigned MinVF = TTI.getMinimumVF(SmallestType)) {
+ if (MaxVF < MinVF) {
+ LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
+ << ") with target's minimum: " << MinVF << '\n');
+ MaxVF = MinVF;
+ }
+ }
+ }
return ElementCount::getFixed(MaxVF);
-}
-
-VectorizationFactor
+}
+
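
The clamping above is straightforward integer arithmetic. A worked sketch under assumed inputs (256-bit widest register, 32-bit widest element type, constant trip count 4): the default bound is PowerOf2Floor(256 / 32) = 8, and the power-of-two trip count 4 clamps it down to 4:

#include <cstdint>
#include <iostream>

uint64_t powerOf2Floor(uint64_t V) {
  if (V == 0)
    return 0;
  uint64_t P = 1;
  while (P * 2 <= V)
    P *= 2;
  return P;
}

bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

// Same shape as the clamping above: start from register width over the widest
// element type, then clamp to a small power-of-two constant trip count.
uint64_t feasibleMaxVF(uint64_t WidestRegisterBits, uint64_t WidestTypeBits,
                       uint64_t ConstTripCount) {
  uint64_t MaxVectorSize = powerOf2Floor(WidestRegisterBits / WidestTypeBits);
  if (MaxVectorSize == 0)
    return 1;
  if (ConstTripCount && ConstTripCount < MaxVectorSize &&
      isPowerOf2(ConstTripCount))
    return ConstTripCount;
  return MaxVectorSize;
}

int main() { std::cout << feasibleMaxVF(256, 32, 4) << '\n'; } // prints 4
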
+VectorizationFactor
LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
// FIXME: This can be fixed for scalable vectors later, because at this stage
// the LoopVectorizer will only consider vectorizing a loop with scalable
@@ -5782,33 +5782,33 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
- unsigned Width = 1;
+ unsigned Width = 1;
const float ScalarCost = *ExpectedCost.getValue();
float Cost = ScalarCost;
-
- bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
+
+ bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
if (ForceVectorization && MaxVF.isVector()) {
- // Ignore scalar width, because the user explicitly wants vectorization.
- // Initialize cost to max so that VF = 2 is, at least, chosen during cost
- // evaluation.
- Cost = std::numeric_limits<float>::max();
- }
-
+ // Ignore scalar width, because the user explicitly wants vectorization.
+ // Initialize cost to max so that VF = 2 is, at least, chosen during cost
+ // evaluation.
+ Cost = std::numeric_limits<float>::max();
+ }
+
for (unsigned i = 2; i <= MaxVF.getFixedValue(); i *= 2) {
- // Notice that the vector loop needs to be executed fewer times, so
- // we need to divide the cost of the vector loop by the width of
- // the vector elements.
+ // Notice that the vector loop needs to be executed fewer times, so
+ // we need to divide the cost of the vector loop by the width of
+ // the vector elements.
VectorizationCostTy C = expectedCost(ElementCount::getFixed(i));
assert(C.first.isValid() && "Unexpected invalid cost for vector loop");
float VectorCost = *C.first.getValue() / (float)i;
- LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
- << " costs: " << (int)VectorCost << ".\n");
- if (!C.second && !ForceVectorization) {
- LLVM_DEBUG(
- dbgs() << "LV: Not considering vector loop of width " << i
- << " because it will not generate any vector instructions.\n");
- continue;
- }
+ LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i
+ << " costs: " << (int)VectorCost << ".\n");
+ if (!C.second && !ForceVectorization) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Not considering vector loop of width " << i
+ << " because it will not generate any vector instructions.\n");
+ continue;
+ }
// If profitable add it to ProfitableVF list.
if (VectorCost < ScalarCost) {
@@ -5816,29 +5816,29 @@ LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) {
{ElementCount::getFixed(i), (unsigned)VectorCost}));
}
- if (VectorCost < Cost) {
- Cost = VectorCost;
- Width = i;
- }
- }
-
- if (!EnableCondStoresVectorization && NumPredStores) {
- reportVectorizationFailure("There are conditional stores.",
- "store that is conditionally executed prevents vectorization",
- "ConditionalStore", ORE, TheLoop);
- Width = 1;
- Cost = ScalarCost;
- }
-
- LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
- << "LV: Vectorization seems to be not beneficial, "
- << "but was forced by a user.\n");
- LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ if (!EnableCondStoresVectorization && NumPredStores) {
+ reportVectorizationFailure("There are conditional stores.",
+ "store that is conditionally executed prevents vectorization",
+ "ConditionalStore", ORE, TheLoop);
+ Width = 1;
+ Cost = ScalarCost;
+ }
+
+ LLVM_DEBUG(if (ForceVectorization && Width > 1 && Cost >= ScalarCost) dbgs()
+ << "LV: Vectorization seems to be not beneficial, "
+ << "but was forced by a user.\n");
+ LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
VectorizationFactor Factor = {ElementCount::getFixed(Width),
(unsigned)(Width * Cost)};
- return Factor;
-}
-
+ return Factor;
+}
+
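
Width selection above minimizes cost per scalar lane. A sketch with assumed costs: scalar cost 10, cost 14 at VF = 2 (7 per lane), cost 16 at VF = 4 (4 per lane), so VF = 4 wins:

#include <cstddef>
#include <iostream>
#include <vector>

// Costs[i] is the assumed total loop-body cost at VF = 1 << i (Costs[0] is
// the scalar cost); pick the width with the lowest cost per scalar lane.
unsigned selectWidth(const std::vector<float> &Costs) {
  unsigned Width = 1, VF = 1;
  float Best = Costs[0];
  for (std::size_t i = 1; i < Costs.size(); ++i) {
    VF *= 2;
    float PerLane = Costs[i] / VF;
    if (PerLane < Best) {
      Best = PerLane;
      Width = VF;
    }
  }
  return Width;
}

int main() {
  std::cout << selectWidth({10.0f, 14.0f, 16.0f}) << '\n'; // prints 4
}
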
bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization(
const Loop &L, ElementCount VF) const {
// Cross iteration phis such as reductions need special handling and are
@@ -5959,163 +5959,163 @@ LoopVectorizationCostModel::selectEpilogueVectorizationFactor(
return Result;
}
-std::pair<unsigned, unsigned>
-LoopVectorizationCostModel::getSmallestAndWidestTypes() {
- unsigned MinWidth = -1U;
- unsigned MaxWidth = 8;
- const DataLayout &DL = TheFunction->getParent()->getDataLayout();
-
- // For each block.
- for (BasicBlock *BB : TheLoop->blocks()) {
- // For each instruction in the loop.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- Type *T = I.getType();
-
- // Skip ignored values.
- if (ValuesToIgnore.count(&I))
- continue;
-
- // Only examine Loads, Stores and PHINodes.
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
- continue;
-
- // Examine PHI nodes that are reduction variables. Update the type to
- // account for the recurrence type.
- if (auto *PN = dyn_cast<PHINode>(&I)) {
- if (!Legal->isReductionVariable(PN))
- continue;
- RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
+std::pair<unsigned, unsigned>
+LoopVectorizationCostModel::getSmallestAndWidestTypes() {
+ unsigned MinWidth = -1U;
+ unsigned MaxWidth = 8;
+ const DataLayout &DL = TheFunction->getParent()->getDataLayout();
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Type *T = I.getType();
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(&I))
+ continue;
+
+ // Only examine Loads, Stores and PHINodes.
+ if (!isa<LoadInst>(I) && !isa<StoreInst>(I) && !isa<PHINode>(I))
+ continue;
+
+ // Examine PHI nodes that are reduction variables. Update the type to
+ // account for the recurrence type.
+ if (auto *PN = dyn_cast<PHINode>(&I)) {
+ if (!Legal->isReductionVariable(PN))
+ continue;
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
if (PreferInLoopReductions ||
TTI.preferInLoopReduction(RdxDesc.getOpcode(),
RdxDesc.getRecurrenceType(),
TargetTransformInfo::ReductionFlags()))
continue;
- T = RdxDesc.getRecurrenceType();
- }
-
- // Examine the stored values.
- if (auto *ST = dyn_cast<StoreInst>(&I))
- T = ST->getValueOperand()->getType();
-
- // Ignore loaded pointer types and stored pointer types that are not
- // vectorizable.
- //
- // FIXME: The check here attempts to predict whether a load or store will
- // be vectorized. We only know this for certain after a VF has
- // been selected. Here, we assume that if an access can be
- // vectorized, it will be. We should also look at extending this
- // optimization to non-pointer types.
- //
- if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
- !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
- continue;
-
- MinWidth = std::min(MinWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
- MaxWidth = std::max(MaxWidth,
- (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
- }
- }
-
- return {MinWidth, MaxWidth};
-}
-
+ T = RdxDesc.getRecurrenceType();
+ }
+
+ // Examine the stored values.
+ if (auto *ST = dyn_cast<StoreInst>(&I))
+ T = ST->getValueOperand()->getType();
+
+ // Ignore loaded pointer types and stored pointer types that are not
+ // vectorizable.
+ //
+ // FIXME: The check here attempts to predict whether a load or store will
+ // be vectorized. We only know this for certain after a VF has
+ // been selected. Here, we assume that if an access can be
+ // vectorized, it will be. We should also look at extending this
+ // optimization to non-pointer types.
+ //
+ if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+ !isAccessInterleaved(&I) && !isLegalGatherOrScatter(&I))
+ continue;
+
+ MinWidth = std::min(MinWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ MaxWidth = std::max(MaxWidth,
+ (unsigned)DL.getTypeSizeInBits(T->getScalarType()));
+ }
+ }
+
+ return {MinWidth, MaxWidth};
+}
+
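
As a quick worked example of what the scan above yields: a loop that loads i8 elements and stores i32 results has {SmallestType, WidestType} = {8, 32}; the widest type bounds the default MaxVF, while the smallest type is what the maximize-bandwidth path divides by instead. The source shape (illustrative only) is:

// The i8 loads and i32 stores give {MinWidth, MaxWidth} = {8, 32} for the
// scan above.
void widen(const unsigned char *Src, unsigned int *Dst, int N) {
  for (int I = 0; I < N; ++I)
    Dst[I] = Src[I] * 3u;
}
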
unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
- unsigned LoopCost) {
- // -- The interleave heuristics --
- // We interleave the loop in order to expose ILP and reduce the loop overhead.
- // There are many micro-architectural considerations that we can't predict
- // at this level. For example, frontend pressure (on decode or fetch) due to
- // code size, or the number and capabilities of the execution ports.
- //
- // We use the following heuristics to select the interleave count:
- // 1. If the code has reductions, then we interleave to break the cross
- // iteration dependency.
- // 2. If the loop is really small, then we interleave to reduce the loop
- // overhead.
- // 3. We don't interleave if we think that we will spill registers to memory
- // due to the increased register pressure.
-
- if (!isScalarEpilogueAllowed())
- return 1;
-
- // We used the distance for the interleave count.
- if (Legal->getMaxSafeDepDistBytes() != -1U)
- return 1;
-
+ unsigned LoopCost) {
+ // -- The interleave heuristics --
+ // We interleave the loop in order to expose ILP and reduce the loop overhead.
+ // There are many micro-architectural considerations that we can't predict
+ // at this level. For example, frontend pressure (on decode or fetch) due to
+ // code size, or the number and capabilities of the execution ports.
+ //
+ // We use the following heuristics to select the interleave count:
+ // 1. If the code has reductions, then we interleave to break the cross
+ // iteration dependency.
+ // 2. If the loop is really small, then we interleave to reduce the loop
+ // overhead.
+ // 3. We don't interleave if we think that we will spill registers to memory
+ // due to the increased register pressure.
+
+ if (!isScalarEpilogueAllowed())
+ return 1;
+
+ // We used the distance for the interleave count.
+ if (Legal->getMaxSafeDepDistBytes() != -1U)
+ return 1;
+
auto BestKnownTC = getSmallBestKnownTC(*PSE.getSE(), TheLoop);
const bool HasReductions = !Legal->getReductionVars().empty();
- // Do not interleave loops with a relatively small known or estimated trip
+ // Do not interleave loops with a relatively small known or estimated trip
// count. But we will interleave when InterleaveSmallLoopScalarReduction is
// enabled, and the code has scalar reductions(HasReductions && VF = 1),
// because with the above conditions interleaving can expose ILP and break
// cross iteration dependences for reductions.
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) &&
!(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar()))
- return 1;
-
- RegisterUsage R = calculateRegisterUsage({VF})[0];
- // We divide by these constants so assume that we have at least one
- // instruction that uses at least one register.
- for (auto& pair : R.MaxLocalUsers) {
- pair.second = std::max(pair.second, 1U);
- }
-
- // We calculate the interleave count using the following formula.
- // Subtract the number of loop invariants from the number of available
- // registers. These registers are used by all of the interleaved instances.
- // Next, divide the remaining registers by the number of registers that is
- // required by the loop, in order to estimate how many parallel instances
- // fit without causing spills. All of this is rounded down if necessary to be
- // a power of two. We want power of two interleave count to simplify any
- // addressing operations or alignment considerations.
- // We also want power of two interleave counts to ensure that the induction
- // variable of the vector loop wraps to zero, when tail is folded by masking;
- // this currently happens when OptForSize, in which case IC is set to 1 above.
- unsigned IC = UINT_MAX;
-
- for (auto& pair : R.MaxLocalUsers) {
- unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
- LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
- << " registers of "
- << TTI.getRegisterClassName(pair.first) << " register class\n");
+ return 1;
+
+ RegisterUsage R = calculateRegisterUsage({VF})[0];
+ // We divide by these constants so assume that we have at least one
+ // instruction that uses at least one register.
+ for (auto& pair : R.MaxLocalUsers) {
+ pair.second = std::max(pair.second, 1U);
+ }
+
+ // We calculate the interleave count using the following formula.
+ // Subtract the number of loop invariants from the number of available
+ // registers. These registers are used by all of the interleaved instances.
+ // Next, divide the remaining registers by the number of registers that is
+ // required by the loop, in order to estimate how many parallel instances
+ // fit without causing spills. All of this is rounded down if necessary to be
+ // a power of two. We want power of two interleave count to simplify any
+ // addressing operations or alignment considerations.
+ // We also want power of two interleave counts to ensure that the induction
+ // variable of the vector loop wraps to zero, when tail is folded by masking;
+ // this currently happens when OptForSize, in which case IC is set to 1 above.
+ unsigned IC = UINT_MAX;
+
+ for (auto& pair : R.MaxLocalUsers) {
+ unsigned TargetNumRegisters = TTI.getNumberOfRegisters(pair.first);
+ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
+ << " registers of "
+ << TTI.getRegisterClassName(pair.first) << " register class\n");
if (VF.isScalar()) {
- if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumScalarRegs;
- } else {
- if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
- TargetNumRegisters = ForceTargetNumVectorRegs;
- }
- unsigned MaxLocalUsers = pair.second;
- unsigned LoopInvariantRegs = 0;
- if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
- LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
-
- unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
- // Don't count the induction variable as interleaved.
- if (EnableIndVarRegisterHeur) {
- TmpIC =
- PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
- std::max(1U, (MaxLocalUsers - 1)));
- }
-
- IC = std::min(IC, TmpIC);
- }
-
- // Clamp the interleave ranges to reasonable counts.
+ if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumScalarRegs;
+ } else {
+ if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
+ TargetNumRegisters = ForceTargetNumVectorRegs;
+ }
+ unsigned MaxLocalUsers = pair.second;
+ unsigned LoopInvariantRegs = 0;
+ if (R.LoopInvariantRegs.find(pair.first) != R.LoopInvariantRegs.end())
+ LoopInvariantRegs = R.LoopInvariantRegs[pair.first];
+
+ unsigned TmpIC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
+ // Don't count the induction variable as interleaved.
+ if (EnableIndVarRegisterHeur) {
+ TmpIC =
+ PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
+ std::max(1U, (MaxLocalUsers - 1)));
+ }
+
+ IC = std::min(IC, TmpIC);
+ }
+
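
The loop above applies IC = PowerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers) per register class and keeps the minimum. A standalone re-derivation under assumed numbers (16 registers, 2 loop-invariant, 5 local users):

#include <algorithm>
#include <iostream>

// Per-register-class interleave count from register pressure; the concrete
// numbers below are assumptions for illustration.
unsigned interleaveForClass(unsigned TargetNumRegisters,
                            unsigned LoopInvariantRegs,
                            unsigned MaxLocalUsers) {
  MaxLocalUsers = std::max(MaxLocalUsers, 1U);
  unsigned Avail = (TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers;
  unsigned IC = 1;
  while (IC * 2 <= Avail) // power-of-two floor
    IC *= 2;
  return IC;
}

int main() {
  // (16 - 2) / 5 = 2, and PowerOf2Floor(2) = 2, so two interleaved copies fit.
  std::cout << interleaveForClass(16, 2, 5) << '\n';
}
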
+ // Clamp the interleave ranges to reasonable counts.
unsigned MaxInterleaveCount =
TTI.getMaxInterleaveFactor(VF.getKnownMinValue());
-
- // Check if the user has overridden the max.
+
+ // Check if the user has overridden the max.
if (VF.isScalar()) {
- if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
- } else {
- if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
- MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
- }
-
- // If trip count is known or estimated compile time constant, limit the
+ if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
+ } else {
+ if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
+ MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
+ }
+
+ // If trip count is known or estimated compile time constant, limit the
// interleave count to be less than the trip count divided by VF, provided it
// is at least 1.
//
@@ -6125,24 +6125,24 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
// similar benefit as for fixed-width vectors. For now, we choose to leave
// the InterleaveCount as if vscale is '1', although if some information about
// the vector is known (e.g. min vector size), we can make a better decision.
- if (BestKnownTC) {
+ if (BestKnownTC) {
MaxInterleaveCount =
std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount);
// Make sure MaxInterleaveCount is greater than 0.
MaxInterleaveCount = std::max(1u, MaxInterleaveCount);
- }
-
+ }
+
assert(MaxInterleaveCount > 0 &&
"Maximum interleave count must be greater than 0");
-
- // Clamp the calculated IC to be between 1 and the max interleave count
- // that the target and trip count allow.
- if (IC > MaxInterleaveCount)
- IC = MaxInterleaveCount;
+
+  // Clamp the calculated IC to be between 1 and the max interleave count
+  // that the target and trip count allow.
+ if (IC > MaxInterleaveCount)
+ IC = MaxInterleaveCount;
else
// Make sure IC is greater than 0.
IC = std::max(1u, IC);
-
+
assert(IC > 0 && "Interleave count must be greater than 0.");
// If we did not calculate the cost for VF (because the user selected the VF)
@@ -6154,57 +6154,57 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
assert(LoopCost && "Non-zero loop cost expected");
- // Interleave if we vectorized this loop and there is a reduction that could
- // benefit from interleaving.
+ // Interleave if we vectorized this loop and there is a reduction that could
+ // benefit from interleaving.
if (VF.isVector() && HasReductions) {
- LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
- return IC;
- }
-
- // Note that if we've already vectorized the loop we will have done the
- // runtime check and so interleaving won't require further checks.
- bool InterleavingRequiresRuntimePointerCheck =
+ LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
+ return IC;
+ }
+
+ // Note that if we've already vectorized the loop we will have done the
+ // runtime check and so interleaving won't require further checks.
+ bool InterleavingRequiresRuntimePointerCheck =
(VF.isScalar() && Legal->getRuntimePointerChecking()->Need);
-
- // We want to interleave small loops in order to reduce the loop overhead and
- // potentially expose ILP opportunities.
+
+ // We want to interleave small loops in order to reduce the loop overhead and
+ // potentially expose ILP opportunities.
LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
<< "LV: IC is " << IC << '\n'
<< "LV: VF is " << VF << '\n');
const bool AggressivelyInterleaveReductions =
TTI.enableAggressiveInterleaving(HasReductions);
- if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
- // We assume that the cost overhead is 1 and we use the cost model
- // to estimate the cost of the loop and interleave until the cost of the
- // loop overhead is about 5% of the cost of the loop.
- unsigned SmallIC =
- std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
-
- // Interleave until store/load ports (estimated by max interleave count) are
- // saturated.
- unsigned NumStores = Legal->getNumStores();
- unsigned NumLoads = Legal->getNumLoads();
- unsigned StoresIC = IC / (NumStores ? NumStores : 1);
- unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
-
- // If we have a scalar reduction (vector reductions are already dealt with
- // by this point), we can increase the critical path length if the loop
- // we're interleaving is inside another loop. Limit, by default to 2, so the
- // critical path only gets increased by one reduction operation.
+ if (!InterleavingRequiresRuntimePointerCheck && LoopCost < SmallLoopCost) {
+ // We assume that the cost overhead is 1 and we use the cost model
+ // to estimate the cost of the loop and interleave until the cost of the
+ // loop overhead is about 5% of the cost of the loop.
+ unsigned SmallIC =
+ std::min(IC, (unsigned)PowerOf2Floor(SmallLoopCost / LoopCost));
+
+ // Interleave until store/load ports (estimated by max interleave count) are
+ // saturated.
+ unsigned NumStores = Legal->getNumStores();
+ unsigned NumLoads = Legal->getNumLoads();
+ unsigned StoresIC = IC / (NumStores ? NumStores : 1);
+ unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);
+
+ // If we have a scalar reduction (vector reductions are already dealt with
+ // by this point), we can increase the critical path length if the loop
+ // we're interleaving is inside another loop. Limit, by default to 2, so the
+ // critical path only gets increased by one reduction operation.
if (HasReductions && TheLoop->getLoopDepth() > 1) {
- unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
- SmallIC = std::min(SmallIC, F);
- StoresIC = std::min(StoresIC, F);
- LoadsIC = std::min(LoadsIC, F);
- }
-
- if (EnableLoadStoreRuntimeInterleave &&
- std::max(StoresIC, LoadsIC) > SmallIC) {
- LLVM_DEBUG(
- dbgs() << "LV: Interleaving to saturate store or load ports.\n");
- return std::max(StoresIC, LoadsIC);
- }
-
+ unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
+ SmallIC = std::min(SmallIC, F);
+ StoresIC = std::min(StoresIC, F);
+ LoadsIC = std::min(LoadsIC, F);
+ }
+
+ if (EnableLoadStoreRuntimeInterleave &&
+ std::max(StoresIC, LoadsIC) > SmallIC) {
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving to saturate store or load ports.\n");
+ return std::max(StoresIC, LoadsIC);
+ }
+
// If there are scalar reductions and TTI has enabled aggressive
// interleaving for reductions, we will interleave to expose ILP.
if (InterleaveSmallLoopScalarReduction && VF.isScalar() &&
@@ -6217,611 +6217,611 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
return SmallIC;
}
- }
-
- // Interleave if this is a large loop (small loops are already dealt with by
- // this point) that could benefit from interleaving.
+ }
+
+ // Interleave if this is a large loop (small loops are already dealt with by
+ // this point) that could benefit from interleaving.
if (AggressivelyInterleaveReductions) {
- LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
- return IC;
- }
-
- LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
- return 1;
-}
-
-SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
+ LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
+ return IC;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
+ return 1;
+}
+
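The selection above bounds the interleave count, per register class, by register pressure: the class's register budget minus registers held by loop-invariant values (and optionally the induction variable), divided by the peak number of simultaneously live in-loop values, rounded down to a power of two, and later clamped by the target's maximum interleave factor and the trip count. A minimal standalone sketch of that per-class bound, using illustrative names and standard C++ only (nothing here is LLVM API):

#include <algorithm>

// Round down to a power of two (0 maps to 0), mirroring PowerOf2Floor.
static unsigned powerOf2Floor(unsigned X) {
  unsigned P = 1;
  while (P <= X / 2)
    P *= 2;
  return X == 0 ? 0 : P;
}

// TargetNumRegisters: registers in this class; LoopInvariantRegs: values live
// across the whole loop; MaxLocalUsers: peak in-loop live values (from the
// register-usage analysis below); DiscountIndVar: do not count the induction
// variable as interleaved. Assumes the invariant count fits in the budget,
// as the pass does.
unsigned registerPressureIC(unsigned TargetNumRegisters,
                            unsigned LoopInvariantRegs,
                            unsigned MaxLocalUsers, bool DiscountIndVar) {
  MaxLocalUsers = std::max(1U, MaxLocalUsers);
  unsigned IC =
      powerOf2Floor((TargetNumRegisters - LoopInvariantRegs) / MaxLocalUsers);
  if (DiscountIndVar)
    IC = powerOf2Floor((TargetNumRegisters - LoopInvariantRegs - 1) /
                       std::max(1U, MaxLocalUsers - 1));
  return std::max(1U, IC);
}

For example, with an assumed budget of 32 registers, 2 loop-invariant values and a peak of 7 live values, the bound is PowerOf2Floor(30 / 7) = 4, which the code above then clamps against TTI.getMaxInterleaveFactor and the known or estimated trip count.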
+SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
- // This function calculates the register usage by measuring the highest number
- // of values that are alive at a single location. Obviously, this is a very
-  // rough estimation. We scan the loop in topological order and
- // assign a number to each instruction. We use RPO to ensure that defs are
- // met before their users. We assume that each instruction that has in-loop
- // users starts an interval. We record every time that an in-loop value is
- // used, so we have a list of the first and last occurrences of each
- // instruction. Next, we transpose this data structure into a multi map that
- // holds the list of intervals that *end* at a specific location. This multi
- // map allows us to perform a linear search. We scan the instructions linearly
- // and record each time that a new interval starts, by placing it in a set.
- // If we find this value in the multi-map then we remove it from the set.
- // The max register usage is the maximum size of the set.
- // We also search for instructions that are defined outside the loop, but are
- // used inside the loop. We need this number separately from the max-interval
- // usage number because when we unroll, loop-invariant values do not take
-  // more registers.
- LoopBlocksDFS DFS(TheLoop);
- DFS.perform(LI);
-
- RegisterUsage RU;
-
- // Each 'key' in the map opens a new interval. The values
- // of the map are the index of the 'last seen' usage of the
- // instruction that is the key.
- using IntervalMap = DenseMap<Instruction *, unsigned>;
-
- // Maps instruction to its index.
- SmallVector<Instruction *, 64> IdxToInstr;
- // Marks the end of each interval.
- IntervalMap EndPoint;
- // Saves the list of instruction indices that are used in the loop.
- SmallPtrSet<Instruction *, 8> Ends;
- // Saves the list of values that are used in the loop but are
- // defined outside the loop, such as arguments and constants.
- SmallPtrSet<Value *, 8> LoopInvariants;
-
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- IdxToInstr.push_back(&I);
-
- // Save the end location of each USE.
- for (Value *U : I.operands()) {
- auto *Instr = dyn_cast<Instruction>(U);
-
- // Ignore non-instruction values such as arguments, constants, etc.
- if (!Instr)
- continue;
-
- // If this instruction is outside the loop then record it and continue.
- if (!TheLoop->contains(Instr)) {
- LoopInvariants.insert(Instr);
- continue;
- }
-
- // Overwrite previous end points.
- EndPoint[Instr] = IdxToInstr.size();
- Ends.insert(Instr);
- }
- }
- }
-
- // Saves the list of intervals that end with the index in 'key'.
- using InstrList = SmallVector<Instruction *, 2>;
- DenseMap<unsigned, InstrList> TransposeEnds;
-
- // Transpose the EndPoints to a list of values that end at each index.
- for (auto &Interval : EndPoint)
- TransposeEnds[Interval.second].push_back(Interval.first);
-
- SmallPtrSet<Instruction *, 8> OpenIntervals;
- SmallVector<RegisterUsage, 8> RUs(VFs.size());
- SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
-
- LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
-
- // A lambda that gets the register usage for the given type and VF.
+ // This function calculates the register usage by measuring the highest number
+ // of values that are alive at a single location. Obviously, this is a very
+  // rough estimation. We scan the loop in topological order and
+ // assign a number to each instruction. We use RPO to ensure that defs are
+ // met before their users. We assume that each instruction that has in-loop
+ // users starts an interval. We record every time that an in-loop value is
+ // used, so we have a list of the first and last occurrences of each
+ // instruction. Next, we transpose this data structure into a multi map that
+ // holds the list of intervals that *end* at a specific location. This multi
+ // map allows us to perform a linear search. We scan the instructions linearly
+ // and record each time that a new interval starts, by placing it in a set.
+ // If we find this value in the multi-map then we remove it from the set.
+ // The max register usage is the maximum size of the set.
+ // We also search for instructions that are defined outside the loop, but are
+ // used inside the loop. We need this number separately from the max-interval
+ // usage number because when we unroll, loop-invariant values do not take
+  // more registers.
+ LoopBlocksDFS DFS(TheLoop);
+ DFS.perform(LI);
+
+ RegisterUsage RU;
+
+ // Each 'key' in the map opens a new interval. The values
+ // of the map are the index of the 'last seen' usage of the
+ // instruction that is the key.
+ using IntervalMap = DenseMap<Instruction *, unsigned>;
+
+ // Maps instruction to its index.
+ SmallVector<Instruction *, 64> IdxToInstr;
+ // Marks the end of each interval.
+ IntervalMap EndPoint;
+ // Saves the list of instruction indices that are used in the loop.
+ SmallPtrSet<Instruction *, 8> Ends;
+ // Saves the list of values that are used in the loop but are
+ // defined outside the loop, such as arguments and constants.
+ SmallPtrSet<Value *, 8> LoopInvariants;
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ IdxToInstr.push_back(&I);
+
+ // Save the end location of each USE.
+ for (Value *U : I.operands()) {
+ auto *Instr = dyn_cast<Instruction>(U);
+
+ // Ignore non-instruction values such as arguments, constants, etc.
+ if (!Instr)
+ continue;
+
+ // If this instruction is outside the loop then record it and continue.
+ if (!TheLoop->contains(Instr)) {
+ LoopInvariants.insert(Instr);
+ continue;
+ }
+
+ // Overwrite previous end points.
+ EndPoint[Instr] = IdxToInstr.size();
+ Ends.insert(Instr);
+ }
+ }
+ }
+
+ // Saves the list of intervals that end with the index in 'key'.
+ using InstrList = SmallVector<Instruction *, 2>;
+ DenseMap<unsigned, InstrList> TransposeEnds;
+
+ // Transpose the EndPoints to a list of values that end at each index.
+ for (auto &Interval : EndPoint)
+ TransposeEnds[Interval.second].push_back(Interval.first);
+
+ SmallPtrSet<Instruction *, 8> OpenIntervals;
+ SmallVector<RegisterUsage, 8> RUs(VFs.size());
+ SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
+
+ LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
+
+ // A lambda that gets the register usage for the given type and VF.
const auto &TTICapture = TTI;
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) {
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty))
- return 0U;
+ return 0U;
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
- };
-
- for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
- Instruction *I = IdxToInstr[i];
-
- // Remove all of the instructions that end at this location.
- InstrList &List = TransposeEnds[i];
- for (Instruction *ToRemove : List)
- OpenIntervals.erase(ToRemove);
-
- // Ignore instructions that are never used within the loop.
- if (!Ends.count(I))
- continue;
-
- // Skip ignored values.
- if (ValuesToIgnore.count(I))
- continue;
-
- // For each VF find the maximum usage of registers.
- for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
- // Count the number of live intervals.
- SmallMapVector<unsigned, unsigned, 4> RegUsage;
-
+ };
+
+ for (unsigned int i = 0, s = IdxToInstr.size(); i < s; ++i) {
+ Instruction *I = IdxToInstr[i];
+
+ // Remove all of the instructions that end at this location.
+ InstrList &List = TransposeEnds[i];
+ for (Instruction *ToRemove : List)
+ OpenIntervals.erase(ToRemove);
+
+ // Ignore instructions that are never used within the loop.
+ if (!Ends.count(I))
+ continue;
+
+ // Skip ignored values.
+ if (ValuesToIgnore.count(I))
+ continue;
+
+ // For each VF find the maximum usage of registers.
+ for (unsigned j = 0, e = VFs.size(); j < e; ++j) {
+ // Count the number of live intervals.
+ SmallMapVector<unsigned, unsigned, 4> RegUsage;
+
if (VFs[j].isScalar()) {
- for (auto Inst : OpenIntervals) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
- }
- } else {
- collectUniformsAndScalars(VFs[j]);
- for (auto Inst : OpenIntervals) {
- // Skip ignored values for VF > 1.
- if (VecValuesToIgnore.count(Inst))
- continue;
- if (isScalarAfterVectorization(Inst, VFs[j])) {
- unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = 1;
- else
- RegUsage[ClassID] += 1;
- } else {
- unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
- if (RegUsage.find(ClassID) == RegUsage.end())
- RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
- else
- RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
- }
- }
- }
-
- for (auto& pair : RegUsage) {
- if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
- MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
- else
- MaxUsages[j][pair.first] = pair.second;
- }
- }
-
- LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
- << OpenIntervals.size() << '\n');
-
- // Add the current instruction to the list of open intervals.
- OpenIntervals.insert(I);
- }
-
- for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
- SmallMapVector<unsigned, unsigned, 4> Invariant;
-
- for (auto Inst : LoopInvariants) {
+ for (auto Inst : OpenIntervals) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ }
+ } else {
+ collectUniformsAndScalars(VFs[j]);
+ for (auto Inst : OpenIntervals) {
+ // Skip ignored values for VF > 1.
+ if (VecValuesToIgnore.count(Inst))
+ continue;
+ if (isScalarAfterVectorization(Inst, VFs[j])) {
+ unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = 1;
+ else
+ RegUsage[ClassID] += 1;
+ } else {
+ unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType());
+ if (RegUsage.find(ClassID) == RegUsage.end())
+ RegUsage[ClassID] = GetRegUsage(Inst->getType(), VFs[j]);
+ else
+ RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]);
+ }
+ }
+ }
+
+ for (auto& pair : RegUsage) {
+ if (MaxUsages[j].find(pair.first) != MaxUsages[j].end())
+ MaxUsages[j][pair.first] = std::max(MaxUsages[j][pair.first], pair.second);
+ else
+ MaxUsages[j][pair.first] = pair.second;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "LV(REG): At #" << i << " Interval # "
+ << OpenIntervals.size() << '\n');
+
+ // Add the current instruction to the list of open intervals.
+ OpenIntervals.insert(I);
+ }
+
+ for (unsigned i = 0, e = VFs.size(); i < e; ++i) {
+ SmallMapVector<unsigned, unsigned, 4> Invariant;
+
+ for (auto Inst : LoopInvariants) {
unsigned Usage =
VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]);
unsigned ClassID =
TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType());
- if (Invariant.find(ClassID) == Invariant.end())
- Invariant[ClassID] = Usage;
- else
- Invariant[ClassID] += Usage;
- }
-
- LLVM_DEBUG({
- dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
- dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
- << " item\n";
- for (const auto &pair : MaxUsages[i]) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
- << " item\n";
- for (const auto &pair : Invariant) {
- dbgs() << "LV(REG): RegisterClass: "
- << TTI.getRegisterClassName(pair.first) << ", " << pair.second
- << " registers\n";
- }
- });
-
- RU.LoopInvariantRegs = Invariant;
- RU.MaxLocalUsers = MaxUsages[i];
- RUs[i] = RU;
- }
-
- return RUs;
-}
-
-bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
- // TODO: Cost model for emulated masked load/store is completely
- // broken. This hack guides the cost model to use an artificially
- // high enough value to practically disable vectorization with such
- // operations, except where previously deployed legality hack allowed
- // using very low cost values. This is to avoid regressions coming simply
- // from moving "masked load/store" check from legality to cost model.
- // Masked Load/Gather emulation was previously never allowed.
- // Limited number of Masked Store/Scatter emulation was allowed.
- assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
- return isa<LoadInst>(I) ||
- (isa<StoreInst>(I) &&
- NumPredStores > NumberOfStoresToPredicate);
-}
-
+ if (Invariant.find(ClassID) == Invariant.end())
+ Invariant[ClassID] = Usage;
+ else
+ Invariant[ClassID] += Usage;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "LV(REG): VF = " << VFs[i] << '\n';
+ dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size()
+ << " item\n";
+ for (const auto &pair : MaxUsages[i]) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
+ << " item\n";
+ for (const auto &pair : Invariant) {
+ dbgs() << "LV(REG): RegisterClass: "
+ << TTI.getRegisterClassName(pair.first) << ", " << pair.second
+ << " registers\n";
+ }
+ });
+
+ RU.LoopInvariantRegs = Invariant;
+ RU.MaxLocalUsers = MaxUsages[i];
+ RUs[i] = RU;
+ }
+
+ return RUs;
+}
+
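calculateRegisterUsage above is a live-interval sweep: instructions are numbered in reverse post order, each value's interval runs from its definition to its last in-loop use, the end points are transposed into per-index lists, and the peak size of the set of open intervals is the pressure estimate (loop-invariant inputs are tracked separately). A toy, self-contained version of that sweep, with plain integers standing in for instructions and none of the per-class TTI bookkeeping:

#include <algorithm>
#include <map>
#include <set>
#include <vector>

// Defs[i] is the value defined at position i (in RPO); Uses[i] lists the
// values read at position i. Returns the peak number of open intervals.
unsigned maxOpenIntervals(const std::vector<int> &Defs,
                          const std::vector<std::vector<int>> &Uses) {
  // Index of the last use of every value; later uses overwrite earlier ones.
  std::map<int, unsigned> EndPoint;
  for (unsigned i = 0; i < Uses.size(); ++i)
    for (int V : Uses[i])
      EndPoint[V] = i;

  // Transpose: for each index, the values whose interval ends there.
  std::map<unsigned, std::vector<int>> EndsAt;
  for (const auto &KV : EndPoint)
    EndsAt[KV.second].push_back(KV.first);

  std::set<int> Open;
  unsigned Max = 0;
  for (unsigned i = 0; i < Defs.size(); ++i) {
    for (int V : EndsAt[i])            // close intervals ending here
      Open.erase(V);
    if (!EndPoint.count(Defs[i]))      // value never used in-loop: ignore it
      continue;
    Max = std::max<unsigned>(Max, Open.size());
    Open.insert(Defs[i]);              // this definition opens a new interval
  }
  return Max;
}

In the pass itself the open set is additionally split per TTI register class, and vector values are weighted by GetRegUsage, but the interval bookkeeping follows this shape.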
+bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I){
+ // TODO: Cost model for emulated masked load/store is completely
+ // broken. This hack guides the cost model to use an artificially
+ // high enough value to practically disable vectorization with such
+ // operations, except where previously deployed legality hack allowed
+ // using very low cost values. This is to avoid regressions coming simply
+ // from moving "masked load/store" check from legality to cost model.
+ // Masked Load/Gather emulation was previously never allowed.
+ // Limited number of Masked Store/Scatter emulation was allowed.
+ assert(isPredicatedInst(I) && "Expecting a scalar emulated instruction");
+ return isa<LoadInst>(I) ||
+ (isa<StoreInst>(I) &&
+ NumPredStores > NumberOfStoresToPredicate);
+}
+
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
- // If we aren't vectorizing the loop, or if we've already collected the
- // instructions to scalarize, there's nothing to do. Collection may already
- // have occurred if we have a user-selected VF and are now computing the
- // expected cost for interleaving.
+ // If we aren't vectorizing the loop, or if we've already collected the
+ // instructions to scalarize, there's nothing to do. Collection may already
+ // have occurred if we have a user-selected VF and are now computing the
+ // expected cost for interleaving.
if (VF.isScalar() || VF.isZero() ||
InstsToScalarize.find(VF) != InstsToScalarize.end())
- return;
-
-  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
- // not profitable to scalarize any instructions, the presence of VF in the
- // map will indicate that we've analyzed it already.
- ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
-
- // Find all the instructions that are scalar with predication in the loop and
- // determine if it would be better to not if-convert the blocks they are in.
- // If so, we also record the instructions to scalarize.
- for (BasicBlock *BB : TheLoop->blocks()) {
- if (!blockNeedsPredication(BB))
- continue;
- for (Instruction &I : *BB)
- if (isScalarWithPredication(&I)) {
- ScalarCostsTy ScalarCosts;
- // Do not apply discount logic if hacked cost is needed
- // for emulated masked memrefs.
- if (!useEmulatedMaskMemRefHack(&I) &&
- computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
- ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
- // Remember that BB will remain after vectorization.
- PredicatedBBsAfterVectorization.insert(BB);
- }
- }
-}
-
-int LoopVectorizationCostModel::computePredInstDiscount(
+ return;
+
+  // Initialize a mapping for VF in InstsToScalarize. If we find that it's
+ // not profitable to scalarize any instructions, the presence of VF in the
+ // map will indicate that we've analyzed it already.
+ ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];
+
+ // Find all the instructions that are scalar with predication in the loop and
+ // determine if it would be better to not if-convert the blocks they are in.
+ // If so, we also record the instructions to scalarize.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ if (!blockNeedsPredication(BB))
+ continue;
+ for (Instruction &I : *BB)
+ if (isScalarWithPredication(&I)) {
+ ScalarCostsTy ScalarCosts;
+ // Do not apply discount logic if hacked cost is needed
+ // for emulated masked memrefs.
+ if (!useEmulatedMaskMemRefHack(&I) &&
+ computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
+ ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+ // Remember that BB will remain after vectorization.
+ PredicatedBBsAfterVectorization.insert(BB);
+ }
+ }
+}
+
+int LoopVectorizationCostModel::computePredInstDiscount(
Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
- assert(!isUniformAfterVectorization(PredInst, VF) &&
- "Instruction marked uniform-after-vectorization will be predicated");
-
- // Initialize the discount to zero, meaning that the scalar version and the
- // vector version cost the same.
+ assert(!isUniformAfterVectorization(PredInst, VF) &&
+ "Instruction marked uniform-after-vectorization will be predicated");
+
+ // Initialize the discount to zero, meaning that the scalar version and the
+ // vector version cost the same.
InstructionCost Discount = 0;
-
- // Holds instructions to analyze. The instructions we visit are mapped in
- // ScalarCosts. Those instructions are the ones that would be scalarized if
- // we find that the scalar version costs less.
- SmallVector<Instruction *, 8> Worklist;
-
- // Returns true if the given instruction can be scalarized.
- auto canBeScalarized = [&](Instruction *I) -> bool {
- // We only attempt to scalarize instructions forming a single-use chain
- // from the original predicated block that would otherwise be vectorized.
- // Although not strictly necessary, we give up on instructions we know will
- // already be scalar to avoid traversing chains that are unlikely to be
- // beneficial.
- if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
- isScalarAfterVectorization(I, VF))
- return false;
-
- // If the instruction is scalar with predication, it will be analyzed
- // separately. We ignore it within the context of PredInst.
- if (isScalarWithPredication(I))
- return false;
-
- // If any of the instruction's operands are uniform after vectorization,
- // the instruction cannot be scalarized. This prevents, for example, a
- // masked load from being scalarized.
- //
- // We assume we will only emit a value for lane zero of an instruction
- // marked uniform after vectorization, rather than VF identical values.
- // Thus, if we scalarize an instruction that uses a uniform, we would
- // create uses of values corresponding to the lanes we aren't emitting code
- // for. This behavior can be changed by allowing getScalarValue to clone
- // the lane zero values for uniforms rather than asserting.
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get()))
- if (isUniformAfterVectorization(J, VF))
- return false;
-
- // Otherwise, we can scalarize the instruction.
- return true;
- };
-
- // Compute the expected cost discount from scalarizing the entire expression
- // feeding the predicated instruction. We currently only consider expressions
- // that are single-use instruction chains.
- Worklist.push_back(PredInst);
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
-
- // If we've already analyzed the instruction, there's nothing to do.
- if (ScalarCosts.find(I) != ScalarCosts.end())
- continue;
-
- // Compute the cost of the vector instruction. Note that this cost already
- // includes the scalarization overhead of the predicated instruction.
+
+ // Holds instructions to analyze. The instructions we visit are mapped in
+ // ScalarCosts. Those instructions are the ones that would be scalarized if
+ // we find that the scalar version costs less.
+ SmallVector<Instruction *, 8> Worklist;
+
+ // Returns true if the given instruction can be scalarized.
+ auto canBeScalarized = [&](Instruction *I) -> bool {
+ // We only attempt to scalarize instructions forming a single-use chain
+ // from the original predicated block that would otherwise be vectorized.
+ // Although not strictly necessary, we give up on instructions we know will
+ // already be scalar to avoid traversing chains that are unlikely to be
+ // beneficial.
+ if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
+ isScalarAfterVectorization(I, VF))
+ return false;
+
+ // If the instruction is scalar with predication, it will be analyzed
+ // separately. We ignore it within the context of PredInst.
+ if (isScalarWithPredication(I))
+ return false;
+
+ // If any of the instruction's operands are uniform after vectorization,
+ // the instruction cannot be scalarized. This prevents, for example, a
+ // masked load from being scalarized.
+ //
+ // We assume we will only emit a value for lane zero of an instruction
+ // marked uniform after vectorization, rather than VF identical values.
+ // Thus, if we scalarize an instruction that uses a uniform, we would
+ // create uses of values corresponding to the lanes we aren't emitting code
+ // for. This behavior can be changed by allowing getScalarValue to clone
+ // the lane zero values for uniforms rather than asserting.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (isUniformAfterVectorization(J, VF))
+ return false;
+
+ // Otherwise, we can scalarize the instruction.
+ return true;
+ };
+
+ // Compute the expected cost discount from scalarizing the entire expression
+ // feeding the predicated instruction. We currently only consider expressions
+ // that are single-use instruction chains.
+ Worklist.push_back(PredInst);
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+
+ // If we've already analyzed the instruction, there's nothing to do.
+ if (ScalarCosts.find(I) != ScalarCosts.end())
+ continue;
+
+ // Compute the cost of the vector instruction. Note that this cost already
+ // includes the scalarization overhead of the predicated instruction.
InstructionCost VectorCost = getInstructionCost(I, VF).first;
-
- // Compute the cost of the scalarized instruction. This cost is the cost of
- // the instruction as if it wasn't if-converted and instead remained in the
- // predicated block. We will scale this cost by block probability after
- // computing the scalarization overhead.
+
+ // Compute the cost of the scalarized instruction. This cost is the cost of
+ // the instruction as if it wasn't if-converted and instead remained in the
+ // predicated block. We will scale this cost by block probability after
+ // computing the scalarization overhead.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
InstructionCost ScalarCost =
VF.getKnownMinValue() *
getInstructionCost(I, ElementCount::getFixed(1)).first;
-
- // Compute the scalarization overhead of needed insertelement instructions
- // and phi nodes.
- if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(I->getType(), VF)),
+
+ // Compute the scalarization overhead of needed insertelement instructions
+ // and phi nodes.
+ if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(I->getType(), VF)),
APInt::getAllOnesValue(VF.getKnownMinValue()), true, false);
assert(!VF.isScalable() && "scalable vectors not yet supported.");
ScalarCost +=
VF.getKnownMinValue() *
TTI.getCFInstrCost(Instruction::PHI, TTI::TCK_RecipThroughput);
- }
-
- // Compute the scalarization overhead of needed extractelement
- // instructions. For each of the instruction's operands, if the operand can
- // be scalarized, add it to the worklist; otherwise, account for the
- // overhead.
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get())) {
- assert(VectorType::isValidElementType(J->getType()) &&
- "Instruction has non-scalar type");
- if (canBeScalarized(J))
- Worklist.push_back(J);
+ }
+
+ // Compute the scalarization overhead of needed extractelement
+ // instructions. For each of the instruction's operands, if the operand can
+ // be scalarized, add it to the worklist; otherwise, account for the
+ // overhead.
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get())) {
+ assert(VectorType::isValidElementType(J->getType()) &&
+ "Instruction has non-scalar type");
+ if (canBeScalarized(J))
+ Worklist.push_back(J);
else if (needsExtract(J, VF)) {
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- ScalarCost += TTI.getScalarizationOverhead(
- cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(J->getType(), VF)),
APInt::getAllOnesValue(VF.getKnownMinValue()), false, true);
}
- }
-
- // Scale the total scalar cost by block probability.
- ScalarCost /= getReciprocalPredBlockProb();
-
- // Compute the discount. A non-negative discount means the vector version
- // of the instruction costs more, and scalarizing would be beneficial.
- Discount += VectorCost - ScalarCost;
- ScalarCosts[I] = ScalarCost;
- }
-
+ }
+
+ // Scale the total scalar cost by block probability.
+ ScalarCost /= getReciprocalPredBlockProb();
+
+ // Compute the discount. A non-negative discount means the vector version
+ // of the instruction costs more, and scalarizing would be beneficial.
+ Discount += VectorCost - ScalarCost;
+ ScalarCosts[I] = ScalarCost;
+ }
+
return *Discount.getValue();
-}
-
-LoopVectorizationCostModel::VectorizationCostTy
+}
+
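computePredInstDiscount above weighs the if-converted vector form of a predicated chain against a scalarized, predicated alternative: the scalar side pays VF copies of the scalar instruction plus insert/extract and phi overhead, and is then divided by getReciprocalPredBlockProb() (the reciprocal of the predicated block's execution probability, assumed to be 2 here). A toy calculation with made-up costs, only to show the arithmetic:

#include <cstdio>

int main() {
  unsigned VF = 4;
  int VectorCost = 12;           // cost of the predicated vector form (assumed)
  int ScalarInstrCost = 2;       // cost of one scalar copy (assumed)
  int ScalarizationOverhead = 6; // inserts/extracts and phis (assumed)
  int ReciprocalPredBlockProb = 2;

  int ScalarCost = (int(VF) * ScalarInstrCost + ScalarizationOverhead) /
                   ReciprocalPredBlockProb;   // (4*2 + 6) / 2 = 7
  int Discount = VectorCost - ScalarCost;     // 12 - 7 = 5
  std::printf("discount = %d -> scalarize: %s\n", Discount,
              Discount >= 0 ? "yes" : "no");
  return 0;
}

A non-negative total over the whole single-use chain means the scalarized version is the cheaper one, so its per-instruction costs are recorded in ScalarCosts and later consulted by isProfitableToScalarize.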
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::expectedCost(ElementCount VF) {
- VectorizationCostTy Cost;
-
- // For each block.
- for (BasicBlock *BB : TheLoop->blocks()) {
- VectorizationCostTy BlockCost;
-
- // For each instruction in the old loop.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- // Skip ignored values.
+ VectorizationCostTy Cost;
+
+ // For each block.
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ VectorizationCostTy BlockCost;
+
+ // For each instruction in the old loop.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ // Skip ignored values.
if (ValuesToIgnore.count(&I) ||
(VF.isVector() && VecValuesToIgnore.count(&I)))
- continue;
-
- VectorizationCostTy C = getInstructionCost(&I, VF);
-
- // Check if we should override the cost.
- if (ForceTargetInstructionCost.getNumOccurrences() > 0)
+ continue;
+
+ VectorizationCostTy C = getInstructionCost(&I, VF);
+
+ // Check if we should override the cost.
+ if (ForceTargetInstructionCost.getNumOccurrences() > 0)
C.first = InstructionCost(ForceTargetInstructionCost);
-
- BlockCost.first += C.first;
- BlockCost.second |= C.second;
- LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
- << " for VF " << VF << " For instruction: " << I
- << '\n');
- }
-
- // If we are vectorizing a predicated block, it will have been
- // if-converted. This means that the block's instructions (aside from
- // stores and instructions that may divide by zero) will now be
- // unconditionally executed. For the scalar case, we may not always execute
+
+ BlockCost.first += C.first;
+ BlockCost.second |= C.second;
+ LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
+ << " for VF " << VF << " For instruction: " << I
+ << '\n');
+ }
+
+ // If we are vectorizing a predicated block, it will have been
+ // if-converted. This means that the block's instructions (aside from
+ // stores and instructions that may divide by zero) will now be
+ // unconditionally executed. For the scalar case, we may not always execute
// the predicated block, if it is an if-else block. Thus, scale the block's
// cost by the probability of executing it. blockNeedsPredication from
// Legal is used so as to not include all blocks in tail folded loops.
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
- BlockCost.first /= getReciprocalPredBlockProb();
-
- Cost.first += BlockCost.first;
- Cost.second |= BlockCost.second;
- }
-
- return Cost;
-}
-
-/// Gets Address Access SCEV after verifying that the access pattern
-/// is loop invariant except the induction variable dependence.
-///
-/// This SCEV can be sent to the Target in order to estimate the address
-/// calculation cost.
-static const SCEV *getAddressAccessSCEV(
- Value *Ptr,
- LoopVectorizationLegality *Legal,
- PredicatedScalarEvolution &PSE,
- const Loop *TheLoop) {
-
- auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- if (!Gep)
- return nullptr;
-
- // We are looking for a gep with all loop invariant indices except for one
- // which should be an induction variable.
- auto SE = PSE.getSE();
- unsigned NumOperands = Gep->getNumOperands();
- for (unsigned i = 1; i < NumOperands; ++i) {
- Value *Opd = Gep->getOperand(i);
- if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
- !Legal->isInductionVariable(Opd))
- return nullptr;
- }
-
-  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
- return PSE.getSCEV(Ptr);
-}
-
-static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
- return Legal->hasStride(I->getOperand(0)) ||
- Legal->hasStride(I->getOperand(1));
-}
-
+ BlockCost.first /= getReciprocalPredBlockProb();
+
+ Cost.first += BlockCost.first;
+ Cost.second |= BlockCost.second;
+ }
+
+ return Cost;
+}
+
+/// Gets Address Access SCEV after verifying that the access pattern
+/// is loop invariant except the induction variable dependence.
+///
+/// This SCEV can be sent to the Target in order to estimate the address
+/// calculation cost.
+static const SCEV *getAddressAccessSCEV(
+ Value *Ptr,
+ LoopVectorizationLegality *Legal,
+ PredicatedScalarEvolution &PSE,
+ const Loop *TheLoop) {
+
+ auto *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!Gep)
+ return nullptr;
+
+ // We are looking for a gep with all loop invariant indices except for one
+ // which should be an induction variable.
+ auto SE = PSE.getSE();
+ unsigned NumOperands = Gep->getNumOperands();
+ for (unsigned i = 1; i < NumOperands; ++i) {
+ Value *Opd = Gep->getOperand(i);
+ if (!SE->isLoopInvariant(SE->getSCEV(Opd), TheLoop) &&
+ !Legal->isInductionVariable(Opd))
+ return nullptr;
+ }
+
+  // Now we know we have a GEP ptr, %inv, %ind, %inv. Return the Ptr SCEV.
+ return PSE.getSCEV(Ptr);
+}
+
+static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
+ return Legal->hasStride(I->getOperand(0)) ||
+ Legal->hasStride(I->getOperand(1));
+}
+
InstructionCost
LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
ElementCount VF) {
assert(VF.isVector() &&
"Scalarization cost of instruction implies vectorization.");
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- Type *ValTy = getMemInstValueType(I);
- auto SE = PSE.getSE();
-
- unsigned AS = getLoadStoreAddressSpace(I);
- Value *Ptr = getLoadStorePointerOperand(I);
- Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
- // Figure out whether the access is strided and get the stride value
- // if it's known in compile time
- const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
-
- // Get the cost of the scalar memory instruction and address computation.
+ Type *ValTy = getMemInstValueType(I);
+ auto SE = PSE.getSE();
+
+ unsigned AS = getLoadStoreAddressSpace(I);
+ Value *Ptr = getLoadStorePointerOperand(I);
+ Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+ // Figure out whether the access is strided and get the stride value
+ // if it's known in compile time
+ const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
+
+ // Get the cost of the scalar memory instruction and address computation.
InstructionCost Cost =
VF.getKnownMinValue() * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
-
- // Don't pass *I here, since it is scalar but will actually be part of a
- // vectorized loop where the user of it is a vectorized instruction.
- const Align Alignment = getLoadStoreAlignment(I);
+
+ // Don't pass *I here, since it is scalar but will actually be part of a
+ // vectorized loop where the user of it is a vectorized instruction.
+ const Align Alignment = getLoadStoreAlignment(I);
Cost += VF.getKnownMinValue() *
TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
AS, TTI::TCK_RecipThroughput);
-
- // Get the overhead of the extractelement and insertelement instructions
- // we might create due to scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // If we have a predicated store, it may not be executed for each vector
- // lane. Scale the cost by the probability of executing the predicated
- // block.
- if (isPredicatedInst(I)) {
- Cost /= getReciprocalPredBlockProb();
-
- if (useEmulatedMaskMemRefHack(I))
- // Artificially setting to a high enough value to practically disable
- // vectorization with such operations.
- Cost = 3000000;
- }
-
- return Cost;
-}
-
+
+ // Get the overhead of the extractelement and insertelement instructions
+ // we might create due to scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // If we have a predicated store, it may not be executed for each vector
+ // lane. Scale the cost by the probability of executing the predicated
+ // block.
+ if (isPredicatedInst(I)) {
+ Cost /= getReciprocalPredBlockProb();
+
+ if (useEmulatedMaskMemRefHack(I))
+ // Artificially setting to a high enough value to practically disable
+ // vectorization with such operations.
+ Cost = 3000000;
+ }
+
+ return Cost;
+}
+
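In outline, the scalarization cost above is VF scalar address computations plus VF scalar memory operations plus the insert/extract overhead of moving lanes in and out of vectors; a predicated access is scaled down by the block probability, unless the emulated-mask hack pins the cost high enough to rule the option out. A rough standalone rendering with placeholder inputs (real values come from TargetTransformInfo):

unsigned memScalarizationCost(unsigned VF, unsigned AddrCost,
                              unsigned ScalarMemOpCost,
                              unsigned ScalarizationOverhead, bool Predicated,
                              bool EmulatedMaskedAccess) {
  unsigned Cost = VF * AddrCost            // per-lane address computation
                  + VF * ScalarMemOpCost   // per-lane scalar load/store
                  + ScalarizationOverhead; // inserts/extracts around the lanes
  if (Predicated) {
    Cost /= 2; // reciprocal of the predicated-block probability (assumed 2)
    if (EmulatedMaskedAccess)
      Cost = 3000000; // artificially high: effectively forbids this choice
  }
  return Cost;
}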
InstructionCost
LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- Value *Ptr = getLoadStorePointerOperand(I);
- unsigned AS = getLoadStoreAddressSpace(I);
- int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Stride should be 1 or -1 for consecutive memory access");
- const Align Alignment = getLoadStoreAlignment(I);
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ Value *Ptr = getLoadStorePointerOperand(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Stride should be 1 or -1 for consecutive memory access");
+ const Align Alignment = getLoadStoreAlignment(I);
InstructionCost Cost = 0;
- if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind);
- else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
- CostKind, I);
-
- bool Reverse = ConsecutiveStride < 0;
- if (Reverse)
- Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- return Cost;
-}
-
+ if (Legal->isMaskRequired(I))
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind);
+ else
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind, I);
+
+ bool Reverse = ConsecutiveStride < 0;
+ if (Reverse)
+ Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ return Cost;
+}
+
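A consecutive (stride 1 or -1) access is costed above as one wide memory operation, masked if a mask is required, plus a reverse shuffle when the stride is negative. The same shape with placeholder costs:

unsigned consecutiveMemOpCost(bool MaskRequired, int ConsecutiveStride,
                              unsigned WideMemOpCost, unsigned MaskedMemOpCost,
                              unsigned ReverseShuffleCost) {
  unsigned Cost = MaskRequired ? MaskedMemOpCost : WideMemOpCost;
  if (ConsecutiveStride < 0)   // reverse access: lanes come back in reverse
    Cost += ReverseShuffleCost;
  return Cost;
}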
InstructionCost
LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
ElementCount VF) {
assert(Legal->isUniformMemOp(*I));
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
- enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
- if (isa<LoadInst>(I)) {
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
- CostKind) +
- TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
- }
- StoreInst *SI = cast<StoreInst>(I);
-
- bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
- CostKind) +
- (isLoopInvariantStoreValue
- ? 0
- : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ if (isa<LoadInst>(I)) {
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
+ CostKind) +
+ TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+ }
+ StoreInst *SI = cast<StoreInst>(I);
+
+ bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
+ CostKind) +
+ (isLoopInvariantStoreValue
+ ? 0
+ : TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
VF.getKnownMinValue() - 1));
-}
-
+}
+
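A uniform memory operation (loop-invariant address) is costed above as a single scalar access: a load pays one scalar load plus a broadcast to fill the lanes; a store pays one scalar store plus, when the stored value is not loop-invariant, one extract of the last lane. In outline, with placeholder costs:

unsigned uniformMemOpCost(bool IsLoad, bool StoreValueIsInvariant,
                          unsigned AddrCost, unsigned ScalarOpCost,
                          unsigned BroadcastCost, unsigned ExtractCost) {
  if (IsLoad)
    return AddrCost + ScalarOpCost + BroadcastCost; // load once, splat lanes
  return AddrCost + ScalarOpCost +
         (StoreValueIsInvariant ? 0u : ExtractCost); // extract last lane value
}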
InstructionCost
LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- const Align Alignment = getLoadStoreAlignment(I);
- const Value *Ptr = getLoadStorePointerOperand(I);
-
- return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(
- I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
- TargetTransformInfo::TCK_RecipThroughput, I);
-}
-
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
+
+ return TTI.getAddressComputationCost(VectorTy) +
+ TTI.getGatherScatterOpCost(
+ I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ TargetTransformInfo::TCK_RecipThroughput, I);
+}
+
InstructionCost
LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
ElementCount VF) {
- Type *ValTy = getMemInstValueType(I);
- auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
- unsigned AS = getLoadStoreAddressSpace(I);
-
- auto Group = getInterleavedAccessGroup(I);
- assert(Group && "Fail to get an interleaved access group.");
-
- unsigned InterleaveFactor = Group->getFactor();
+ Type *ValTy = getMemInstValueType(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ auto Group = getInterleavedAccessGroup(I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ unsigned InterleaveFactor = Group->getFactor();
assert(!VF.isScalable() && "scalable vectors not yet supported.");
auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
-
- // Holds the indices of existing members in an interleaved load group.
- // An interleaved store group doesn't need this as it doesn't allow gaps.
- SmallVector<unsigned, 4> Indices;
- if (isa<LoadInst>(I)) {
- for (unsigned i = 0; i < InterleaveFactor; i++)
- if (Group->getMember(i))
- Indices.push_back(i);
- }
-
- // Calculate the cost of the whole interleaved group.
- bool UseMaskForGaps =
- Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
+
+ // Holds the indices of existing members in an interleaved load group.
+ // An interleaved store group doesn't need this as it doesn't allow gaps.
+ SmallVector<unsigned, 4> Indices;
+ if (isa<LoadInst>(I)) {
+ for (unsigned i = 0; i < InterleaveFactor; i++)
+ if (Group->getMember(i))
+ Indices.push_back(i);
+ }
+
+ // Calculate the cost of the whole interleaved group.
+ bool UseMaskForGaps =
+ Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
- AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
-
- if (Group->isReverse()) {
- // TODO: Add support for reversed masked interleaved access.
- assert(!Legal->isMaskRequired(I) &&
- "Reverse masked interleaved access not supported.");
- Cost += Group->getNumMembers() *
- TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
- }
- return Cost;
-}
-
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
+ AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
+
+ if (Group->isReverse()) {
+ // TODO: Add support for reversed masked interleaved access.
+ assert(!Legal->isMaskRequired(I) &&
+ "Reverse masked interleaved access not supported.");
+ Cost += Group->getNumMembers() *
+ TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+ }
+ return Cost;
+}
+
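An interleave group is costed above as one wide access of VF * Factor elements; for loads only the indices of the members actually present are passed to TTI, gaps may force a mask, and a reversed group additionally pays one shuffle per member (masked reversed groups are rejected by the assert). Reduced to its shape with placeholder costs:

unsigned interleaveGroupCost(unsigned WideInterleavedOpCost,
                             unsigned NumMembers, bool Reverse,
                             unsigned ReverseShuffleCost) {
  unsigned Cost = WideInterleavedOpCost; // one operation covers the group
  if (Reverse)
    Cost += NumMembers * ReverseShuffleCost; // undo the reversal per member
  return Cost;
}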
InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
Instruction *I, ElementCount VF, Type *Ty, TTI::TargetCostKind CostKind) {
// Early exit for no inloop reductions
@@ -6935,270 +6935,270 @@ InstructionCost LoopVectorizationCostModel::getReductionPatternCost(
InstructionCost
LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
ElementCount VF) {
- // Calculate scalar cost only. Vectorization cost should be ready at this
- // moment.
+ // Calculate scalar cost only. Vectorization cost should be ready at this
+ // moment.
if (VF.isScalar()) {
- Type *ValTy = getMemInstValueType(I);
- const Align Alignment = getLoadStoreAlignment(I);
- unsigned AS = getLoadStoreAddressSpace(I);
-
- return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
- TTI::TCK_RecipThroughput, I);
- }
- return getWideningCost(I, VF);
-}
-
-LoopVectorizationCostModel::VectorizationCostTy
+ Type *ValTy = getMemInstValueType(I);
+ const Align Alignment = getLoadStoreAlignment(I);
+ unsigned AS = getLoadStoreAddressSpace(I);
+
+ return TTI.getAddressComputationCost(ValTy) +
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+ TTI::TCK_RecipThroughput, I);
+ }
+ return getWideningCost(I, VF);
+}
+
+LoopVectorizationCostModel::VectorizationCostTy
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ElementCount VF) {
- // If we know that this instruction will remain uniform, check the cost of
- // the scalar version.
- if (isUniformAfterVectorization(I, VF))
+ // If we know that this instruction will remain uniform, check the cost of
+ // the scalar version.
+ if (isUniformAfterVectorization(I, VF))
VF = ElementCount::getFixed(1);
-
+
if (VF.isVector() && isProfitableToScalarize(I, VF))
- return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
- // Forced scalars do not have any scalarization overhead.
- auto ForcedScalar = ForcedScalars.find(VF);
+ return VectorizationCostTy(InstsToScalarize[VF][I], false);
+
+ // Forced scalars do not have any scalarization overhead.
+ auto ForcedScalar = ForcedScalars.find(VF);
if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
- auto InstSet = ForcedScalar->second;
- if (InstSet.count(I))
+ auto InstSet = ForcedScalar->second;
+ if (InstSet.count(I))
return VectorizationCostTy(
(getInstructionCost(I, ElementCount::getFixed(1)).first *
VF.getKnownMinValue()),
false);
- }
-
- Type *VectorTy;
+ }
+
+ Type *VectorTy;
InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
- bool TypeNotScalarized =
+
+ bool TypeNotScalarized =
VF.isVector() && VectorTy->isVectorTy() &&
TTI.getNumberOfParts(VectorTy) < VF.getKnownMinValue();
- return VectorizationCostTy(C, TypeNotScalarized);
-}
-
+ return VectorizationCostTy(C, TypeNotScalarized);
+}
+
InstructionCost
LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
ElementCount VF) {
-
+
assert(!VF.isScalable() &&
"cannot compute scalarization overhead for scalable vectorization");
if (VF.isScalar())
- return 0;
-
+ return 0;
+
InstructionCost Cost = 0;
- Type *RetTy = ToVectorTy(I->getType(), VF);
- if (!RetTy->isVoidTy() &&
- (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
- Cost += TTI.getScalarizationOverhead(
+ Type *RetTy = ToVectorTy(I->getType(), VF);
+ if (!RetTy->isVoidTy() &&
+ (!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
+ Cost += TTI.getScalarizationOverhead(
cast<VectorType>(RetTy), APInt::getAllOnesValue(VF.getKnownMinValue()),
true, false);
-
- // Some targets keep addresses scalar.
- if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
- return Cost;
-
- // Some targets support efficient element stores.
- if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
- return Cost;
-
- // Collect operands to consider.
- CallInst *CI = dyn_cast<CallInst>(I);
- Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
-
- // Skip operands that do not require extraction/scalarization and do not incur
- // any overhead.
- return Cost + TTI.getOperandsScalarizationOverhead(
+
+ // Some targets keep addresses scalar.
+ if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
+ return Cost;
+
+ // Some targets support efficient element stores.
+ if (isa<StoreInst>(I) && TTI.supportsEfficientVectorElementLoadStore())
+ return Cost;
+
+ // Collect operands to consider.
+ CallInst *CI = dyn_cast<CallInst>(I);
+ Instruction::op_range Ops = CI ? CI->arg_operands() : I->operands();
+
+ // Skip operands that do not require extraction/scalarization and do not incur
+ // any overhead.
+ return Cost + TTI.getOperandsScalarizationOverhead(
filterExtractingOperands(Ops, VF), VF.getKnownMinValue());
-}
-
+}
+
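The scalarization overhead above charges for rebuilding a vector from the VF scalar results (skipped when the target handles element loads/stores efficiently or keeps addresses scalar) and for extracting each vector operand the scalar copies read. In outline, with assumed per-element costs:

unsigned scalarizationOverhead(unsigned VF, bool HasVectorResult,
                               unsigned NumExtractedOperands,
                               unsigned InsertCostPerElt,
                               unsigned ExtractCostPerElt) {
  unsigned Cost = 0;
  if (HasVectorResult)
    Cost += VF * InsertCostPerElt;                       // insert result lanes
  Cost += NumExtractedOperands * VF * ExtractCostPerElt; // extract operand lanes
  return Cost;
}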
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
if (VF.isScalar())
- return;
- NumPredStores = 0;
- for (BasicBlock *BB : TheLoop->blocks()) {
- // For each instruction in the old loop.
- for (Instruction &I : *BB) {
- Value *Ptr = getLoadStorePointerOperand(&I);
- if (!Ptr)
- continue;
-
- // TODO: We should generate better code and update the cost model for
- // predicated uniform stores. Today they are treated as any other
- // predicated store (see added test cases in
- // invariant-store-vectorization.ll).
- if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
- NumPredStores++;
-
+ return;
+ NumPredStores = 0;
+ for (BasicBlock *BB : TheLoop->blocks()) {
+ // For each instruction in the old loop.
+ for (Instruction &I : *BB) {
+ Value *Ptr = getLoadStorePointerOperand(&I);
+ if (!Ptr)
+ continue;
+
+ // TODO: We should generate better code and update the cost model for
+ // predicated uniform stores. Today they are treated as any other
+ // predicated store (see added test cases in
+ // invariant-store-vectorization.ll).
+ if (isa<StoreInst>(&I) && isScalarWithPredication(&I))
+ NumPredStores++;
+
if (Legal->isUniformMemOp(I)) {
- // TODO: Avoid replicating loads and stores instead of
- // relying on instcombine to remove them.
- // Load: Scalar load + broadcast
- // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
+ // TODO: Avoid replicating loads and stores instead of
+ // relying on instcombine to remove them.
+ // Load: Scalar load + broadcast
+ // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
InstructionCost Cost = getUniformMemOpCost(&I, VF);
- setWideningDecision(&I, VF, CM_Scalarize, Cost);
- continue;
- }
-
- // We assume that widening is the best solution when possible.
- if (memoryInstructionCanBeWidened(&I, VF)) {
+ setWideningDecision(&I, VF, CM_Scalarize, Cost);
+ continue;
+ }
+
+ // We assume that widening is the best solution when possible.
+ if (memoryInstructionCanBeWidened(&I, VF)) {
InstructionCost Cost = getConsecutiveMemOpCost(&I, VF);
- int ConsecutiveStride =
- Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
- assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
- "Expected consecutive stride.");
- InstWidening Decision =
- ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
- setWideningDecision(&I, VF, Decision, Cost);
- continue;
- }
-
- // Choose between Interleaving, Gather/Scatter or Scalarization.
+ int ConsecutiveStride =
+ Legal->isConsecutivePtr(getLoadStorePointerOperand(&I));
+ assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+ "Expected consecutive stride.");
+ InstWidening Decision =
+ ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
+ setWideningDecision(&I, VF, Decision, Cost);
+ continue;
+ }
+
+ // Choose between Interleaving, Gather/Scatter or Scalarization.
InstructionCost InterleaveCost = std::numeric_limits<int>::max();
- unsigned NumAccesses = 1;
- if (isAccessInterleaved(&I)) {
- auto Group = getInterleavedAccessGroup(&I);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Make one decision for the whole group.
- if (getWideningDecision(&I, VF) != CM_Unknown)
- continue;
-
- NumAccesses = Group->getNumMembers();
- if (interleavedAccessCanBeWidened(&I, VF))
- InterleaveCost = getInterleaveGroupCost(&I, VF);
- }
-
+ unsigned NumAccesses = 1;
+ if (isAccessInterleaved(&I)) {
+ auto Group = getInterleavedAccessGroup(&I);
+ assert(Group && "Fail to get an interleaved access group.");
+
+ // Make one decision for the whole group.
+ if (getWideningDecision(&I, VF) != CM_Unknown)
+ continue;
+
+ NumAccesses = Group->getNumMembers();
+ if (interleavedAccessCanBeWidened(&I, VF))
+ InterleaveCost = getInterleaveGroupCost(&I, VF);
+ }
+
InstructionCost GatherScatterCost =
- isLegalGatherOrScatter(&I)
- ? getGatherScatterCost(&I, VF) * NumAccesses
+ isLegalGatherOrScatter(&I)
+ ? getGatherScatterCost(&I, VF) * NumAccesses
: std::numeric_limits<int>::max();
-
+
InstructionCost ScalarizationCost =
- getMemInstScalarizationCost(&I, VF) * NumAccesses;
-
- // Choose better solution for the current VF,
- // write down this decision and use it during vectorization.
+ getMemInstScalarizationCost(&I, VF) * NumAccesses;
+
+ // Choose better solution for the current VF,
+ // write down this decision and use it during vectorization.
InstructionCost Cost;
- InstWidening Decision;
- if (InterleaveCost <= GatherScatterCost &&
- InterleaveCost < ScalarizationCost) {
- Decision = CM_Interleave;
- Cost = InterleaveCost;
- } else if (GatherScatterCost < ScalarizationCost) {
- Decision = CM_GatherScatter;
- Cost = GatherScatterCost;
- } else {
- Decision = CM_Scalarize;
- Cost = ScalarizationCost;
- }
- // If the instruction belongs to an interleave group, the whole group
- // receives the same decision. The whole group receives the cost, but
- // the cost will actually be assigned to one instruction.
- if (auto Group = getInterleavedAccessGroup(&I))
- setWideningDecision(Group, VF, Decision, Cost);
- else
- setWideningDecision(&I, VF, Decision, Cost);
- }
- }
-
- // Make sure that any load of address and any other address computation
- // remains scalar unless there is gather/scatter support. This avoids
- // inevitable extracts into address registers, and also has the benefit of
- // activating LSR more, since that pass can't optimize vectorized
- // addresses.
- if (TTI.prefersVectorizedAddressing())
- return;
-
- // Start with all scalar pointer uses.
- SmallPtrSet<Instruction *, 8> AddrDefs;
- for (BasicBlock *BB : TheLoop->blocks())
- for (Instruction &I : *BB) {
- Instruction *PtrDef =
- dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
- if (PtrDef && TheLoop->contains(PtrDef) &&
- getWideningDecision(&I, VF) != CM_GatherScatter)
- AddrDefs.insert(PtrDef);
- }
-
- // Add all instructions used to generate the addresses.
- SmallVector<Instruction *, 4> Worklist;
+ InstWidening Decision;
+ if (InterleaveCost <= GatherScatterCost &&
+ InterleaveCost < ScalarizationCost) {
+ Decision = CM_Interleave;
+ Cost = InterleaveCost;
+ } else if (GatherScatterCost < ScalarizationCost) {
+ Decision = CM_GatherScatter;
+ Cost = GatherScatterCost;
+ } else {
+ Decision = CM_Scalarize;
+ Cost = ScalarizationCost;
+ }
+ // If the instruction belongs to an interleave group, the whole group
+ // receives the same decision. The whole group receives the cost, but
+ // the cost will actually be assigned to one instruction.
+ if (auto Group = getInterleavedAccessGroup(&I))
+ setWideningDecision(Group, VF, Decision, Cost);
+ else
+ setWideningDecision(&I, VF, Decision, Cost);
+ }
+ }
+
+ // Make sure that any load of address and any other address computation
+ // remains scalar unless there is gather/scatter support. This avoids
+ // inevitable extracts into address registers, and also has the benefit of
+ // activating LSR more, since that pass can't optimize vectorized
+ // addresses.
+ if (TTI.prefersVectorizedAddressing())
+ return;
+
+ // Start with all scalar pointer uses.
+ SmallPtrSet<Instruction *, 8> AddrDefs;
+ for (BasicBlock *BB : TheLoop->blocks())
+ for (Instruction &I : *BB) {
+ Instruction *PtrDef =
+ dyn_cast_or_null<Instruction>(getLoadStorePointerOperand(&I));
+ if (PtrDef && TheLoop->contains(PtrDef) &&
+ getWideningDecision(&I, VF) != CM_GatherScatter)
+ AddrDefs.insert(PtrDef);
+ }
+
+ // Add all instructions used to generate the addresses.
+ SmallVector<Instruction *, 4> Worklist;
append_range(Worklist, AddrDefs);
- while (!Worklist.empty()) {
- Instruction *I = Worklist.pop_back_val();
- for (auto &Op : I->operands())
- if (auto *InstOp = dyn_cast<Instruction>(Op))
- if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
- AddrDefs.insert(InstOp).second)
- Worklist.push_back(InstOp);
- }
-
- for (auto *I : AddrDefs) {
- if (isa<LoadInst>(I)) {
- // Setting the desired widening decision should ideally be handled
- // by cost functions, but since this involves the task of finding out
- // if the loaded register is involved in an address computation, it is
- // instead changed here when we know this is the case.
- InstWidening Decision = getWideningDecision(I, VF);
- if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
- // Scalarize a widened load of address.
+ while (!Worklist.empty()) {
+ Instruction *I = Worklist.pop_back_val();
+ for (auto &Op : I->operands())
+ if (auto *InstOp = dyn_cast<Instruction>(Op))
+ if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(InstOp) &&
+ AddrDefs.insert(InstOp).second)
+ Worklist.push_back(InstOp);
+ }
+
+ for (auto *I : AddrDefs) {
+ if (isa<LoadInst>(I)) {
+ // Setting the desired widening decision should ideally be handled
+ // by cost functions, but since this involves the task of finding out
+ // if the loaded register is involved in an address computation, it is
+ // instead changed here when we know this is the case.
+ InstWidening Decision = getWideningDecision(I, VF);
+ if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
+ // Scalarize a widened load of address.
setWideningDecision(
I, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(I, ElementCount::getFixed(1))));
- else if (auto Group = getInterleavedAccessGroup(I)) {
- // Scalarize an interleave group of address loads.
- for (unsigned I = 0; I < Group->getFactor(); ++I) {
- if (Instruction *Member = Group->getMember(I))
+ else if (auto Group = getInterleavedAccessGroup(I)) {
+ // Scalarize an interleave group of address loads.
+ for (unsigned I = 0; I < Group->getFactor(); ++I) {
+ if (Instruction *Member = Group->getMember(I))
setWideningDecision(
Member, VF, CM_Scalarize,
(VF.getKnownMinValue() *
getMemoryInstructionCost(Member, ElementCount::getFixed(1))));
- }
- }
- } else
- // Make sure I gets scalarized and a cost estimate without
- // scalarization overhead.
- ForcedScalars[VF].insert(I);
- }
-}
-
+ }
+ }
+ } else
+ // Make sure I gets scalarized and a cost estimate without
+ // scalarization overhead.
+ ForcedScalars[VF].insert(I);
+ }
+}
+
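A minimal standalone sketch of the three-way comparison implemented above, with illustrative names only (Widening, pickMemoryWidening are not the LLVM API): interleaving wins a tie against gather/scatter, gather/scatter must be strictly cheaper than scalarization, and scalarization is the fallback.

#include <cstdint>

enum class Widening { Interleave, GatherScatter, Scalarize };

struct Choice {
  Widening Decision;
  uint64_t Cost;
};

// Mirrors the comparison above: ties between interleave and gather/scatter
// go to interleave; scalarization is chosen only when nothing beats it.
Choice pickMemoryWidening(uint64_t InterleaveCost, uint64_t GatherScatterCost,
                          uint64_t ScalarizationCost) {
  if (InterleaveCost <= GatherScatterCost && InterleaveCost < ScalarizationCost)
    return {Widening::Interleave, InterleaveCost};
  if (GatherScatterCost < ScalarizationCost)
    return {Widening::GatherScatter, GatherScatterCost};
  return {Widening::Scalarize, ScalarizationCost};
}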
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
Type *&VectorTy) {
- Type *RetTy = I->getType();
- if (canTruncateToMinimalBitwidth(I, VF))
- RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
- VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
- auto SE = PSE.getSE();
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- // TODO: We need to estimate the cost of intrinsic calls.
- switch (I->getOpcode()) {
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because the cost of GEPs in
- // vectorized code depends on whether the corresponding memory instruction
- // is scalarized or not. Therefore, we handle GEPs with the memory
- // instruction cost.
- return 0;
- case Instruction::Br: {
- // In cases of scalarized and predicated instructions, there will be VF
- // predicated blocks in the vectorized loop. Each branch around these
- // blocks also requires an extract of its vector compare i1 element.
- bool ScalarPredicatedBB = false;
- BranchInst *BI = cast<BranchInst>(I);
+ Type *RetTy = I->getType();
+ if (canTruncateToMinimalBitwidth(I, VF))
+ RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
+ VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
+ auto SE = PSE.getSE();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // TODO: We need to estimate the cost of intrinsic calls.
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because the cost of GEPs in
+ // vectorized code depends on whether the corresponding memory instruction
+ // is scalarized or not. Therefore, we handle GEPs with the memory
+ // instruction cost.
+ return 0;
+ case Instruction::Br: {
+ // In cases of scalarized and predicated instructions, there will be VF
+ // predicated blocks in the vectorized loop. Each branch around these
+ // blocks also requires an extract of its vector compare i1 element.
+ bool ScalarPredicatedBB = false;
+ BranchInst *BI = cast<BranchInst>(I);
if (VF.isVector() && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
- PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
- ScalarPredicatedBB = true;
-
- if (ScalarPredicatedBB) {
- // Return cost for branches around scalarized and predicated blocks.
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+ ScalarPredicatedBB = true;
+
+ if (ScalarPredicatedBB) {
+ // Return cost for branches around scalarized and predicated blocks.
assert(!VF.isScalable() && "scalable vectors not yet supported.");
- auto *Vec_i1Ty =
+ auto *Vec_i1Ty =
VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
return (TTI.getScalarizationOverhead(
Vec_i1Ty, APInt::getAllOnesValue(VF.getKnownMinValue()),
@@ -7206,86 +7206,86 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
(TTI.getCFInstrCost(Instruction::Br, CostKind) *
VF.getKnownMinValue()));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
- // The back-edge branch will remain, as will all scalar branches.
- return TTI.getCFInstrCost(Instruction::Br, CostKind);
- else
- // This branch will be eliminated by if-conversion.
- return 0;
- // Note: We currently assume zero cost for an unconditional branch inside
- // a predicated block since it will become a fall-through, although we
- // may decide in the future to call TTI for all branches.
- }
- case Instruction::PHI: {
- auto *Phi = cast<PHINode>(I);
-
- // First-order recurrences are replaced by vector shuffles inside the loop.
- // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
+ // The back-edge branch will remain, as will all scalar branches.
+ return TTI.getCFInstrCost(Instruction::Br, CostKind);
+ else
+ // This branch will be eliminated by if-conversion.
+ return 0;
+ // Note: We currently assume zero cost for an unconditional branch inside
+ // a predicated block since it will become a fall-through, although we
+ // may decide in the future to call TTI for all branches.
+ }
+ case Instruction::PHI: {
+ auto *Phi = cast<PHINode>(I);
+
+ // First-order recurrences are replaced by vector shuffles inside the loop.
+ // NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
if (VF.isVector() && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(
TargetTransformInfo::SK_ExtractSubvector, cast<VectorType>(VectorTy),
VF.getKnownMinValue() - 1, FixedVectorType::get(RetTy, 1));
-
- // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
- // converted into select instructions. We require N - 1 selects per phi
- // node, where N is the number of incoming values.
+
+ // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
+ // converted into select instructions. We require N - 1 selects per phi
+ // node, where N is the number of incoming values.
if (VF.isVector() && Phi->getParent() != TheLoop->getHeader())
- return (Phi->getNumIncomingValues() - 1) *
- TTI.getCmpSelInstrCost(
- Instruction::Select, ToVectorTy(Phi->getType(), VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ return (Phi->getNumIncomingValues() - 1) *
+ TTI.getCmpSelInstrCost(
+ Instruction::Select, ToVectorTy(Phi->getType(), VF),
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
CmpInst::BAD_ICMP_PREDICATE, CostKind);
-
- return TTI.getCFInstrCost(Instruction::PHI, CostKind);
- }
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::URem:
- case Instruction::SRem:
- // If we have a predicated instruction, it may not be executed for each
- // vector lane. Get the scalarization cost and scale this amount by the
- // probability of executing the predicated block. If the instruction is not
- // predicated, we fall through to the next case.
+
+ return TTI.getCFInstrCost(Instruction::PHI, CostKind);
+ }
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ // If we have a predicated instruction, it may not be executed for each
+ // vector lane. Get the scalarization cost and scale this amount by the
+ // probability of executing the predicated block. If the instruction is not
+ // predicated, we fall through to the next case.
if (VF.isVector() && isScalarWithPredication(I)) {
InstructionCost Cost = 0;
-
- // These instructions have a non-void type, so account for the phi nodes
- // that we will create. This cost is likely to be zero. The phi node
- // cost, if any, should be scaled by the block probability because it
- // models a copy at the end of each predicated block.
+
+ // These instructions have a non-void type, so account for the phi nodes
+ // that we will create. This cost is likely to be zero. The phi node
+ // cost, if any, should be scaled by the block probability because it
+ // models a copy at the end of each predicated block.
Cost += VF.getKnownMinValue() *
TTI.getCFInstrCost(Instruction::PHI, CostKind);
-
- // The cost of the non-predicated instruction.
+
+ // The cost of the non-predicated instruction.
Cost += VF.getKnownMinValue() *
TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
-
- // The cost of insertelement and extractelement instructions needed for
- // scalarization.
- Cost += getScalarizationOverhead(I, VF);
-
- // Scale the cost by the probability of executing the predicated blocks.
- // This assumes the predicated block for each vector lane is equally
- // likely.
- return Cost / getReciprocalPredBlockProb();
- }
- LLVM_FALLTHROUGH;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Since we will replace the stride by 1, the multiplication should go away.
- if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
- return 0;
+
+ // The cost of insertelement and extractelement instructions needed for
+ // scalarization.
+ Cost += getScalarizationOverhead(I, VF);
+
+ // Scale the cost by the probability of executing the predicated blocks.
+ // This assumes the predicated block for each vector lane is equally
+ // likely.
+ return Cost / getReciprocalPredBlockProb();
+ }
+ LLVM_FALLTHROUGH;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Since we will replace the stride by 1, the multiplication should go away.
+ if (I->getOpcode() == Instruction::Mul && isStrideMul(I, Legal))
+ return 0;
// Detect reduction patterns
InstructionCost RedCost;
@@ -7293,77 +7293,77 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
.isValid())
return RedCost;
- // Certain instructions can be cheaper to vectorize if they have a constant
- // second vector operand. One example of this is shifts on x86.
- Value *Op2 = I->getOperand(1);
- TargetTransformInfo::OperandValueProperties Op2VP;
- TargetTransformInfo::OperandValueKind Op2VK =
- TTI.getOperandInfo(Op2, Op2VP);
- if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
- Op2VK = TargetTransformInfo::OK_UniformValue;
-
- SmallVector<const Value *, 4> Operands(I->operand_values());
+ // Certain instructions can be cheaper to vectorize if they have a constant
+ // second vector operand. One example of this is shifts on x86.
+ Value *Op2 = I->getOperand(1);
+ TargetTransformInfo::OperandValueProperties Op2VP;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TTI.getOperandInfo(Op2, Op2VP);
+ if (Op2VK == TargetTransformInfo::OK_AnyValue && Legal->isUniform(Op2))
+ Op2VK = TargetTransformInfo::OK_UniformValue;
+
+ SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
- }
- case Instruction::FNeg: {
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
+ }
+ case Instruction::FNeg: {
assert(!VF.isScalable() && "VF is assumed to be non scalable.");
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
- return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, CostKind,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OK_AnyValue,
- TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
- I->getOperand(0), I);
- }
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
- bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
- Type *CondTy = SI->getCondition()->getType();
- if (!ScalarCond)
+ return N * TTI.getArithmeticInstrCost(
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
+ I->getOperand(0), I);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (!ScalarCond)
CondTy = VectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- Type *ValTy = I->getOperand(0)->getType();
- Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
- if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
- ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
- VectorTy = ToVectorTy(ValTy, VF);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ Instruction *Op0AsInstruction = dyn_cast<Instruction>(I->getOperand(0));
+ if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
+ ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
+ VectorTy = ToVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr,
CmpInst::BAD_ICMP_PREDICATE, CostKind, I);
- }
- case Instruction::Store:
- case Instruction::Load: {
+ }
+ case Instruction::Store:
+ case Instruction::Load: {
ElementCount Width = VF;
if (Width.isVector()) {
- InstWidening Decision = getWideningDecision(I, Width);
- assert(Decision != CM_Unknown &&
- "CM decision should be taken at this point");
- if (Decision == CM_Scalarize)
+ InstWidening Decision = getWideningDecision(I, Width);
+ assert(Decision != CM_Unknown &&
+ "CM decision should be taken at this point");
+ if (Decision == CM_Scalarize)
Width = ElementCount::getFixed(1);
- }
- VectorTy = ToVectorTy(getMemInstValueType(I), Width);
- return getMemoryInstructionCost(I, VF);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
+ }
+ VectorTy = ToVectorTy(getMemInstValueType(I), Width);
+ return getMemoryInstructionCost(I, VF);
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
// Computes the CastContextHint from a Load/Store instruction.
auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -7405,128 +7405,128 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
CCH = ComputeCCH(Load);
}
- // We optimize the truncation of induction variables having constant
- // integer steps. The cost of these truncations is the same as the scalar
- // operation.
- if (isOptimizableIVTruncate(I, VF)) {
- auto *Trunc = cast<TruncInst>(I);
- return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+ // We optimize the truncation of induction variables having constant
+ // integer steps. The cost of these truncations is the same as the scalar
+ // operation.
+ if (isOptimizableIVTruncate(I, VF)) {
+ auto *Trunc = cast<TruncInst>(I);
+ return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
Trunc->getSrcTy(), CCH, CostKind, Trunc);
- }
-
+ }
+
// Detect reduction patterns
InstructionCost RedCost;
if ((RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
.isValid())
return RedCost;
- Type *SrcScalarTy = I->getOperand(0)->getType();
- Type *SrcVecTy =
- VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
- if (canTruncateToMinimalBitwidth(I, VF)) {
- // This cast is going to be shrunk. This may remove the cast or it might
- // turn it into a slightly different cast. For example, if MinBW == 16,
- // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
- //
- // Calculate the modified src and dest types.
- Type *MinVecTy = VectorTy;
+ Type *SrcScalarTy = I->getOperand(0)->getType();
+ Type *SrcVecTy =
+ VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy;
+ if (canTruncateToMinimalBitwidth(I, VF)) {
+ // This cast is going to be shrunk. This may remove the cast or it might
+ // turn it into a slightly different cast. For example, if MinBW == 16,
+ // "zext i8 %1 to i32" becomes "zext i8 %1 to i16".
+ //
+ // Calculate the modified src and dest types.
+ Type *MinVecTy = VectorTy;
if (Opcode == Instruction::Trunc) {
- SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ SrcVecTy = smallestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ largestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
} else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
- SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
- VectorTy =
- smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
- }
- }
-
+ SrcVecTy = largestIntegerVectorType(SrcVecTy, MinVecTy);
+ VectorTy =
+ smallestIntegerVectorType(ToVectorTy(I->getType(), VF), MinVecTy);
+ }
+ }
+
assert(!VF.isScalable() && "VF is assumed to be non scalable");
unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1;
return N *
TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
- }
- case Instruction::Call: {
- bool NeedToScalarize;
- CallInst *CI = cast<CallInst>(I);
+ }
+ case Instruction::Call: {
+ bool NeedToScalarize;
+ CallInst *CI = cast<CallInst>(I);
InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
if (getVectorIntrinsicIDForCall(CI, TLI)) {
InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
return std::min(CallCost, IntrinsicCost);
}
- return CallCost;
- }
+ return CallCost;
+ }
case Instruction::ExtractValue:
return TTI.getInstructionCost(I, TTI::TCK_RecipThroughput);
- default:
- // The cost of executing VF copies of the scalar instruction. This opcode
- // is unknown. Assume that it is the same as 'mul'.
+ default:
+ // The cost of executing VF copies of the scalar instruction. This opcode
+ // is unknown. Assume that it is the same as 'mul'.
return VF.getKnownMinValue() * TTI.getArithmeticInstrCost(
Instruction::Mul, VectorTy, CostKind) +
- getScalarizationOverhead(I, VF);
- } // end of switch.
-}
-
-char LoopVectorize::ID = 0;
-
-static const char lv_name[] = "Loop Vectorization";
-
-INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
-
-namespace llvm {
-
-Pass *createLoopVectorizePass() { return new LoopVectorize(); }
-
-Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
- bool VectorizeOnlyWhenForced) {
- return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
-}
-
-} // end namespace llvm
-
-bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
- // Check if the pointer operand of a load or store instruction is
- // consecutive.
- if (auto *Ptr = getLoadStorePointerOperand(Inst))
- return Legal->isConsecutivePtr(Ptr);
- return false;
-}
-
-void LoopVectorizationCostModel::collectValuesToIgnore() {
- // Ignore ephemeral values.
- CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
-
- // Ignore type-promoting instructions we identified during reduction
- // detection.
- for (auto &Reduction : Legal->getReductionVars()) {
- RecurrenceDescriptor &RedDes = Reduction.second;
+ getScalarizationOverhead(I, VF);
+ } // end of switch.
+}
+
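The predicated div/rem costing in the switch above can be summarized by the following standalone sketch; the names are hypothetical, and the real cost model queries TTI for each component and uses its own fixed estimate for the block probability.

#include <cstdint>

// Cost of a predicated, scalarized div/rem: per-lane phi copies plus the
// per-lane arithmetic plus insert/extract traffic, scaled down by the
// probability that the predicated block runs (given as a reciprocal).
uint64_t predicatedScalarCost(uint64_t VF, uint64_t PhiCost, uint64_t ArithCost,
                              uint64_t ScalarizationOverhead,
                              uint64_t ReciprocalPredBlockProb) {
  uint64_t Cost = VF * PhiCost + VF * ArithCost + ScalarizationOverhead;
  return Cost / ReciprocalPredBlockProb;
}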
+char LoopVectorize::ID = 0;
+
+static const char lv_name[] = "Loop Vectorization";
+
+INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(BasicAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
+INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
+
+namespace llvm {
+
+Pass *createLoopVectorizePass() { return new LoopVectorize(); }
+
+Pass *createLoopVectorizePass(bool InterleaveOnlyWhenForced,
+ bool VectorizeOnlyWhenForced) {
+ return new LoopVectorize(InterleaveOnlyWhenForced, VectorizeOnlyWhenForced);
+}
+
+} // end namespace llvm
+
+bool LoopVectorizationCostModel::isConsecutiveLoadOrStore(Instruction *Inst) {
+ // Check if the pointer operand of a load or store instruction is
+ // consecutive.
+ if (auto *Ptr = getLoadStorePointerOperand(Inst))
+ return Legal->isConsecutivePtr(Ptr);
+ return false;
+}
+
+void LoopVectorizationCostModel::collectValuesToIgnore() {
+ // Ignore ephemeral values.
+ CodeMetrics::collectEphemeralValues(TheLoop, AC, ValuesToIgnore);
+
+ // Ignore type-promoting instructions we identified during reduction
+ // detection.
+ for (auto &Reduction : Legal->getReductionVars()) {
+ RecurrenceDescriptor &RedDes = Reduction.second;
const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
- VecValuesToIgnore.insert(Casts.begin(), Casts.end());
- }
- // Ignore type-casting instructions we identified during induction
- // detection.
- for (auto &Induction : Legal->getInductionVars()) {
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- VecValuesToIgnore.insert(Casts.begin(), Casts.end());
- }
-}
-
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+ // Ignore type-casting instructions we identified during induction
+ // detection.
+ for (auto &Induction : Legal->getInductionVars()) {
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ VecValuesToIgnore.insert(Casts.begin(), Casts.end());
+ }
+}
+
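The cast sets pulled from the reduction and induction descriptors above are simply merged into one ignore set so their cost is not counted separately; a sketch with standard containers follows (Instr is a stand-in type, not the LLVM class).

#include <unordered_set>
#include <vector>

struct Instr; // stand-in for an IR instruction

// Merge every recorded cast chain into a single set of values whose cost the
// vectorizer should not charge for on its own.
void collectCastsToIgnore(const std::vector<std::vector<const Instr *>> &CastChains,
                          std::unordered_set<const Instr *> &VecValuesToIgnore) {
  for (const auto &Casts : CastChains)
    VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}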
void LoopVectorizationCostModel::collectInLoopReductions() {
for (auto &Reduction : Legal->getReductionVars()) {
PHINode *Phi = Reduction.first;
@@ -7564,82 +7564,82 @@ void LoopVectorizationCostModel::collectInLoopReductions() {
}
}
-// TODO: we could return a pair of values that specify the max VF and
-// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
-// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
-// doesn't have a cost model that can choose which plan to execute if
-// more than one is generated.
-static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
- LoopVectorizationCostModel &CM) {
- unsigned WidestType;
- std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
- return WidestVectorRegBits / WidestType;
-}
-
-VectorizationFactor
+// TODO: we could return a pair of values that specify the max VF and
+// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
+// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
+// doesn't have a cost model that can choose which plan to execute if
+// more than one is generated.
+static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
+ LoopVectorizationCostModel &CM) {
+ unsigned WidestType;
+ std::tie(std::ignore, WidestType) = CM.getSmallestAndWidestTypes();
+ return WidestVectorRegBits / WidestType;
+}
+
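The VF computation above reduces to dividing the widest vector register by the widest scalar type used in the loop; a trivial, illustrative-only sketch:

#include <cstdint>

// Number of lanes = widest vector register / widest scalar element type,
// e.g. a 256-bit register over 32-bit elements gives VF = 8.
uint64_t vplanVF(uint64_t WidestVectorRegBits, uint64_t WidestTypeBits) {
  return WidestVectorRegBits / WidestTypeBits;
}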
+VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
assert(!UserVF.isScalable() && "scalable vectors not yet supported");
ElementCount VF = UserVF;
- // Outer loop handling: They may require CFG and instruction level
- // transformations before even evaluating whether vectorization is profitable.
- // Since we cannot modify the incoming IR, we need to build VPlan upfront in
- // the vectorization pipeline.
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
if (!OrigLoop->isInnermost()) {
- // If the user doesn't provide a vectorization factor, determine a
- // reasonable one.
+ // If the user doesn't provide a vectorization factor, determine a
+ // reasonable one.
if (UserVF.isZero()) {
VF = ElementCount::getFixed(
determineVPlanVF(TTI->getRegisterBitWidth(true /* Vector*/), CM));
- LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
-
- // Make sure we have a VF > 1 for stress testing.
+ LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");
+
+ // Make sure we have a VF > 1 for stress testing.
if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
- LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
- << "overriding computed VF.\n");
+ LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
+ << "overriding computed VF.\n");
VF = ElementCount::getFixed(4);
- }
- }
- assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+ }
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
"VF needs to be a power of two");
LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
<< "VF " << VF << " to build VPlans.\n");
- buildVPlans(VF, VF);
-
- // For VPlan build stress testing, we bail out after VPlan construction.
- if (VPlanBuildStressTest)
- return VectorizationFactor::Disabled();
-
+ buildVPlans(VF, VF);
+
+ // For VPlan build stress testing, we bail out after VPlan construction.
+ if (VPlanBuildStressTest)
+ return VectorizationFactor::Disabled();
+
return {VF, 0 /*Cost*/};
- }
-
- LLVM_DEBUG(
- dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
- "VPlan-native path.\n");
- return VectorizationFactor::Disabled();
-}
-
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
+ "VPlan-native path.\n");
+ return VectorizationFactor::Disabled();
+}
+
Optional<VectorizationFactor>
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
Optional<ElementCount> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
- if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
- return None;
-
- // Invalidate interleave groups if all blocks of loop will be predicated.
- if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
- !useMaskedInterleavedAccesses(*TTI)) {
- LLVM_DEBUG(
- dbgs()
- << "LV: Invalidate all interleaved groups due to fold-tail by masking "
- "which requires masked-interleaved support.\n");
- if (CM.InterleaveInfo.invalidateGroups())
- // Invalidating interleave groups also requires invalidating all decisions
- // based on them, which includes widening decisions and uniform and scalar
- // values.
- CM.invalidateCostModelingDecisions();
- }
-
+ if (!MaybeMaxVF) // Cases that should not be vectorized or interleaved.
+ return None;
+
+ // Invalidate interleave groups if all blocks of loop will be predicated.
+ if (CM.blockNeedsPredication(OrigLoop->getHeader()) &&
+ !useMaskedInterleavedAccesses(*TTI)) {
+ LLVM_DEBUG(
+ dbgs()
+ << "LV: Invalidate all interleaved groups due to fold-tail by masking "
+ "which requires masked-interleaved support.\n");
+ if (CM.InterleaveInfo.invalidateGroups())
+ // Invalidating interleave groups also requires invalidating all decisions
+ // based on them, which includes widening decisions and uniform and scalar
+ // values.
+ CM.invalidateCostModelingDecisions();
+ }
+
ElementCount MaxVF = MaybeMaxVF.getValue();
assert(MaxVF.isNonZero() && "MaxVF is zero.");
@@ -7654,59 +7654,59 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
<< " VF " << VF << ".\n");
assert(isPowerOf2_32(VF.getKnownMinValue()) &&
"VF needs to be a power of two");
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
CM.selectUserVectorizationFactor(VF);
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(VF, VF);
- LLVM_DEBUG(printPlans(dbgs()));
+ LLVM_DEBUG(printPlans(dbgs()));
return {{VF, 0}};
- }
-
+ }
+
assert(!MaxVF.isScalable() &&
"Scalable vectors not yet supported beyond this point");
-
+
for (ElementCount VF = ElementCount::getFixed(1);
ElementCount::isKnownLE(VF, MaxVF); VF *= 2) {
- // Collect Uniform and Scalar instructions after vectorization with VF.
- CM.collectUniformsAndScalars(VF);
-
- // Collect the instructions (and their associated costs) that will be more
- // profitable to scalarize.
+ // Collect Uniform and Scalar instructions after vectorization with VF.
+ CM.collectUniformsAndScalars(VF);
+
+ // Collect the instructions (and their associated costs) that will be more
+ // profitable to scalarize.
if (VF.isVector())
- CM.collectInstsToScalarize(VF);
- }
-
+ CM.collectInstsToScalarize(VF);
+ }
+
CM.collectInLoopReductions();
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF);
- LLVM_DEBUG(printPlans(dbgs()));
+ LLVM_DEBUG(printPlans(dbgs()));
if (MaxVF.isScalar())
- return VectorizationFactor::Disabled();
-
- // Select the optimal vectorization factor.
- return CM.selectVectorizationFactor(MaxVF);
-}
-
+ return VectorizationFactor::Disabled();
+
+ // Select the optimal vectorization factor.
+ return CM.selectVectorizationFactor(MaxVF);
+}
+
void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) {
- LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
- << '\n');
- BestVF = VF;
- BestUF = UF;
-
- erase_if(VPlans, [VF](const VPlanPtr &Plan) {
- return !Plan->hasVF(VF);
- });
- assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
-}
-
-void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
- DominatorTree *DT) {
- // Perform the actual loop transformation.
-
- // 1. Create a new empty loop. Unlink the old loop and connect the new one.
- VPCallbackILV CallbackILV(ILV);
-
+ LLVM_DEBUG(dbgs() << "Setting best plan to VF=" << VF << ", UF=" << UF
+ << '\n');
+ BestVF = VF;
+ BestUF = UF;
+
+ erase_if(VPlans, [VF](const VPlanPtr &Plan) {
+ return !Plan->hasVF(VF);
+ });
+ assert(VPlans.size() == 1 && "Best VF has not a single VPlan.");
+}
+
+void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
+ DominatorTree *DT) {
+ // Perform the actual loop transformation.
+
+ // 1. Create a new empty loop. Unlink the old loop and connect the new one.
+ VPCallbackILV CallbackILV(ILV);
+
assert(BestVF.hasValue() && "Vectorization Factor is missing");
VPTransformState State{*BestVF,
@@ -7718,34 +7718,34 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
ILV.VectorLoopValueMap,
&ILV,
CallbackILV};
- State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
- State.TripCount = ILV.getOrCreateTripCount(nullptr);
- State.CanonicalIV = ILV.Induction;
-
+ State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
+ State.TripCount = ILV.getOrCreateTripCount(nullptr);
+ State.CanonicalIV = ILV.Induction;
+
ILV.printDebugTracesAtStart();
- //===------------------------------------------------===//
- //
- // Notice: any optimization or new instruction that goes
- // into the code below should also be implemented in
- // the cost-model.
- //
- //===------------------------------------------------===//
-
- // 2. Copy and widen instructions from the old loop into the new loop.
- assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
- VPlans.front()->execute(&State);
-
- // 3. Fix the vectorized code: take care of header phi's, live-outs,
- // predication, updating analyses.
- ILV.fixVectorizedLoop();
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that goes
+ // into the code below should also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
+
+ // 2. Copy and widen instructions from the old loop into the new loop.
+ assert(VPlans.size() == 1 && "Not a single VPlan to execute.");
+ VPlans.front()->execute(&State);
+
+ // 3. Fix the vectorized code: take care of header phi's, live-outs,
+ // predication, updating analyses.
+ ILV.fixVectorizedLoop();
ILV.printDebugTracesAtEnd();
-}
-
-void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
+}
+
+void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
// We create new control-flow for the vectorized loop, so the original exit
// conditions will be dead after vectorization if they are only used by the
// terminator
@@ -7755,7 +7755,7 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
auto *Cmp = dyn_cast<Instruction>(BB->getTerminator()->getOperand(0));
if (!Cmp || !Cmp->hasOneUse())
continue;
-
+
// TODO: we should introduce a getUniqueExitingBlocks on Loop
if (!DeadInstructions.insert(Cmp).second)
continue;
@@ -7768,93 +7768,93 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
}
}
- // We create new "steps" for induction variable updates to which the original
- // induction variables map. An original update instruction will be dead if
- // all its users except the induction variable are dead.
+ // We create new "steps" for induction variable updates to which the original
+ // induction variables map. An original update instruction will be dead if
+ // all its users except the induction variable are dead.
auto *Latch = OrigLoop->getLoopLatch();
- for (auto &Induction : Legal->getInductionVars()) {
- PHINode *Ind = Induction.first;
- auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+ for (auto &Induction : Legal->getInductionVars()) {
+ PHINode *Ind = Induction.first;
+ auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
// If the tail is to be folded by masking, the primary induction variable,
// if exists, isn't dead: it will be used for masking. Don't kill it.
if (CM.foldTailByMasking() && IndUpdate == Legal->getPrimaryInduction())
continue;
- if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.count(cast<Instruction>(U));
- }))
- DeadInstructions.insert(IndUpdate);
-
- // We also record as "Dead" the type-casting instructions we had identified
- // during induction analysis. We don't need any handling for them in the
- // vectorized loop because we have proven that, under a proper runtime
- // test guarding the vectorized loop, the value of the phi, and the casted
- // value of the phi, are the same. The last instruction in this casting chain
- // will get its scalar/vector/widened def from the scalar/vector/widened def
- // of the respective phi node. Any other casts in the induction def-use chain
- // have no other uses outside the phi update chain, and will be ignored.
- InductionDescriptor &IndDes = Induction.second;
- const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
- DeadInstructions.insert(Casts.begin(), Casts.end());
- }
-}
-
-Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
-
-Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
-
-Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
- Instruction::BinaryOps BinOp) {
- // When unrolling and the VF is 1, we only need to add a simple scalar.
- Type *Ty = Val->getType();
- assert(!Ty->isVectorTy() && "Val must be a scalar");
-
- if (Ty->isFloatingPointTy()) {
- Constant *C = ConstantFP::get(Ty, (double)StartIdx);
-
- // Floating point operations had to be 'fast' to enable the unrolling.
- Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
- return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
- }
- Constant *C = ConstantInt::get(Ty, StartIdx);
- return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
-}
-
-static void AddRuntimeUnrollDisableMetaData(Loop *L) {
- SmallVector<Metadata *, 4> MDs;
- // Reserve first location for self reference to the LoopID metadata node.
- MDs.push_back(nullptr);
- bool IsUnrollMetadata = false;
- MDNode *LoopID = L->getLoopID();
- if (LoopID) {
- // First find existing loop unrolling disable metadata.
- for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
- auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
- if (MD) {
- const auto *S = dyn_cast<MDString>(MD->getOperand(0));
- IsUnrollMetadata =
- S && S->getString().startswith("llvm.loop.unroll.disable");
- }
- MDs.push_back(LoopID->getOperand(i));
- }
- }
-
- if (!IsUnrollMetadata) {
- // Add runtime unroll disable metadata.
- LLVMContext &Context = L->getHeader()->getContext();
- SmallVector<Metadata *, 1> DisableOperands;
- DisableOperands.push_back(
- MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
- MDNode *DisableNode = MDNode::get(Context, DisableOperands);
- MDs.push_back(DisableNode);
- MDNode *NewLoopID = MDNode::get(Context, MDs);
- // Set operand 0 to refer to the loop id itself.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- L->setLoopID(NewLoopID);
- }
-}
-
+ if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
+ }))
+ DeadInstructions.insert(IndUpdate);
+
+ // We also record as "Dead" the type-casting instructions we had identified
+ // during induction analysis. We don't need any handling for them in the
+ // vectorized loop because we have proven that, under a proper runtime
+ // test guarding the vectorized loop, the value of the phi, and the casted
+ // value of the phi, are the same. The last instruction in this casting chain
+ // will get its scalar/vector/widened def from the scalar/vector/widened def
+ // of the respective phi node. Any other casts in the induction def-use chain
+ // have no other uses outside the phi update chain, and will be ignored.
+ InductionDescriptor &IndDes = Induction.second;
+ const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
+ DeadInstructions.insert(Casts.begin(), Casts.end());
+ }
+}
+
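The "induction update is dead" test above boils down to an all-users check; here is a standalone sketch under the assumption that deadness of the other users is already known (Instr is a stand-in type, not the LLVM class).

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Instr; // stand-in for an IR instruction

// The update feeding the induction phi can be dropped when every user is
// either the phi itself or an instruction already known to be dead.
bool ivUpdateIsDead(const Instr *IndPhi, const std::vector<const Instr *> &Users,
                    const std::unordered_set<const Instr *> &Dead) {
  return std::all_of(Users.begin(), Users.end(), [&](const Instr *U) {
    return U == IndPhi || Dead.count(U) != 0;
  });
}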
+Value *InnerLoopUnroller::reverseVector(Value *Vec) { return Vec; }
+
+Value *InnerLoopUnroller::getBroadcastInstrs(Value *V) { return V; }
+
+Value *InnerLoopUnroller::getStepVector(Value *Val, int StartIdx, Value *Step,
+ Instruction::BinaryOps BinOp) {
+ // When unrolling and the VF is 1, we only need to add a simple scalar.
+ Type *Ty = Val->getType();
+ assert(!Ty->isVectorTy() && "Val must be a scalar");
+
+ if (Ty->isFloatingPointTy()) {
+ Constant *C = ConstantFP::get(Ty, (double)StartIdx);
+
+ // Floating point operations had to be 'fast' to enable the unrolling.
+ Value *MulOp = addFastMathFlag(Builder.CreateFMul(C, Step));
+ return addFastMathFlag(Builder.CreateBinOp(BinOp, Val, MulOp));
+ }
+ Constant *C = ConstantInt::get(Ty, StartIdx);
+ return Builder.CreateAdd(Val, Builder.CreateMul(C, Step), "induction");
+}
+
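For the scalar (VF == 1) unrolling case above, the step is just Val + StartIdx * Step; a sketch taking addition as the binary op (the real code also supports other ops and relies on fast-math for the FP case):

// Floating-point inductions: Val + StartIdx * Step, fast-math assumed.
double scalarFPStep(double Val, int StartIdx, double Step) {
  return Val + static_cast<double>(StartIdx) * Step;
}

// Integer inductions: the same formula in integer arithmetic.
long long scalarIntStep(long long Val, int StartIdx, long long Step) {
  return Val + static_cast<long long>(StartIdx) * Step;
}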
+static void AddRuntimeUnrollDisableMetaData(Loop *L) {
+ SmallVector<Metadata *, 4> MDs;
+ // Reserve first location for self reference to the LoopID metadata node.
+ MDs.push_back(nullptr);
+ bool IsUnrollMetadata = false;
+ MDNode *LoopID = L->getLoopID();
+ if (LoopID) {
+ // First find existing loop unrolling disable metadata.
+ for (unsigned i = 1, ie = LoopID->getNumOperands(); i < ie; ++i) {
+ auto *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ if (MD) {
+ const auto *S = dyn_cast<MDString>(MD->getOperand(0));
+ IsUnrollMetadata =
+ S && S->getString().startswith("llvm.loop.unroll.disable");
+ }
+ MDs.push_back(LoopID->getOperand(i));
+ }
+ }
+
+ if (!IsUnrollMetadata) {
+ // Add runtime unroll disable metadata.
+ LLVMContext &Context = L->getHeader()->getContext();
+ SmallVector<Metadata *, 1> DisableOperands;
+ DisableOperands.push_back(
+ MDString::get(Context, "llvm.loop.unroll.runtime.disable"));
+ MDNode *DisableNode = MDNode::get(Context, DisableOperands);
+ MDs.push_back(DisableNode);
+ MDNode *NewLoopID = MDNode::get(Context, MDs);
+ // Set operand 0 to refer to the loop id itself.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ L->setLoopID(NewLoopID);
+ }
+}
+
//===--------------------------------------------------------------------===//
// EpilogueVectorizerMainLoop
//===--------------------------------------------------------------------===//
@@ -8126,55 +8126,55 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
});
}
-bool LoopVectorizationPlanner::getDecisionAndClampRange(
+bool LoopVectorizationPlanner::getDecisionAndClampRange(
const std::function<bool(ElementCount)> &Predicate, VFRange &Range) {
assert(!Range.isEmpty() && "Trying to test an empty VF range.");
- bool PredicateAtRangeStart = Predicate(Range.Start);
-
+ bool PredicateAtRangeStart = Predicate(Range.Start);
+
for (ElementCount TmpVF = Range.Start * 2;
ElementCount::isKnownLT(TmpVF, Range.End); TmpVF *= 2)
- if (Predicate(TmpVF) != PredicateAtRangeStart) {
- Range.End = TmpVF;
- break;
- }
-
- return PredicateAtRangeStart;
-}
-
-/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
-/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
-/// of VF's starting at a given VF and extending it as much as possible. Each
-/// vectorization decision can potentially shorten this sub-range during
-/// buildVPlan().
+ if (Predicate(TmpVF) != PredicateAtRangeStart) {
+ Range.End = TmpVF;
+ break;
+ }
+
+ return PredicateAtRangeStart;
+}
+
+/// Build VPlans for the full range of feasible VF's = {\p MinVF, 2 * \p MinVF,
+/// 4 * \p MinVF, ..., \p MaxVF} by repeatedly building a VPlan for a sub-range
+/// of VF's starting at a given VF and extending it as much as possible. Each
+/// vectorization decision can potentially shorten this sub-range during
+/// buildVPlan().
void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF,
ElementCount MaxVF) {
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
VFRange SubRange = {VF, MaxVFPlusOne};
- VPlans.push_back(buildVPlan(SubRange));
- VF = SubRange.End;
- }
-}
-
-VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
- VPlanPtr &Plan) {
- assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
-
- // Look for cached value.
- std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
- EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
- if (ECEntryIt != EdgeMaskCache.end())
- return ECEntryIt->second;
-
- VPValue *SrcMask = createBlockInMask(Src, Plan);
-
- // The terminator has to be a branch inst!
- BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
- assert(BI && "Unexpected terminator found");
-
- if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
- return EdgeMaskCache[Edge] = SrcMask;
-
+ VPlans.push_back(buildVPlan(SubRange));
+ VF = SubRange.End;
+ }
+}
+
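A standalone sketch of the range-splitting scheme above, with illustrative names: a sub-range [Start, End) of power-of-two VFs is kept only while a per-VF predicate keeps giving the same answer; the first VF that flips the answer becomes the exclusive end of the sub-range, and the outer loop resumes there with the next plan. The predicate stands in for the per-recipe decisions that clamp the range in the real code.

#include <functional>
#include <vector>

struct VFRange {
  unsigned Start; // inclusive, a power of two
  unsigned End;   // exclusive
};

// Clamp Range.End at the first VF whose predicate answer differs from Start's.
bool decisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                           VFRange &Range) {
  bool AtStart = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != AtStart) {
      Range.End = VF;
      break;
    }
  return AtStart;
}

// Walk [MinVF, MaxVF], producing one sub-range per maximal run of VFs on
// which the predicate is uniform.
std::vector<VFRange> buildPlanRanges(unsigned MinVF, unsigned MaxVF,
                                     const std::function<bool(unsigned)> &Predicate) {
  std::vector<VFRange> Plans;
  for (unsigned VF = MinVF; VF < MaxVF + 1;) {
    VFRange SubRange = {VF, MaxVF + 1};
    decisionAndClampRange(Predicate, SubRange);
    Plans.push_back(SubRange);
    VF = SubRange.End; // resume where the previous sub-range was clamped
  }
  return Plans;
}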
+VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
+ VPlanPtr &Plan) {
+ assert(is_contained(predecessors(Dst), Src) && "Invalid edge");
+
+ // Look for cached value.
+ std::pair<BasicBlock *, BasicBlock *> Edge(Src, Dst);
+ EdgeMaskCacheTy::iterator ECEntryIt = EdgeMaskCache.find(Edge);
+ if (ECEntryIt != EdgeMaskCache.end())
+ return ECEntryIt->second;
+
+ VPValue *SrcMask = createBlockInMask(Src, Plan);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
+ return EdgeMaskCache[Edge] = SrcMask;
+
// If source is an exiting block, we know the exit edge is dynamically dead
// in the vector loop, and thus we don't need to restrict the mask. Avoid
// adding uses of an otherwise potentially dead instruction.
@@ -8182,11 +8182,11 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = Plan->getOrAddVPValue(BI->getCondition());
- assert(EdgeMask && "No Edge Mask found for condition");
-
- if (BI->getSuccessor(0) != Dst)
- EdgeMask = Builder.createNot(EdgeMask);
-
+ assert(EdgeMask && "No Edge Mask found for condition");
+
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.createNot(EdgeMask);
+
if (SrcMask) { // Otherwise block in-mask is all-one, no need to AND.
// The condition is 'SrcMask && EdgeMask', which is equivalent to
// 'select i1 SrcMask, i1 EdgeMask, i1 false'.
@@ -8196,44 +8196,44 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
ConstantInt::getFalse(BI->getCondition()->getType()));
EdgeMask = Builder.createSelect(SrcMask, EdgeMask, False);
}
-
- return EdgeMaskCache[Edge] = EdgeMask;
-}
-
-VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
- assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
-
- // Look for cached value.
- BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
- if (BCEntryIt != BlockMaskCache.end())
- return BCEntryIt->second;
-
- // All-one mask is modelled as no-mask following the convention for masked
- // load/store/gather/scatter. Initialize BlockMask to no-mask.
- VPValue *BlockMask = nullptr;
-
- if (OrigLoop->getHeader() == BB) {
- if (!CM.blockNeedsPredication(BB))
- return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
-
+
+ return EdgeMaskCache[Edge] = EdgeMask;
+}
+
+VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Look for cached value.
+ BlockMaskCacheTy::iterator BCEntryIt = BlockMaskCache.find(BB);
+ if (BCEntryIt != BlockMaskCache.end())
+ return BCEntryIt->second;
+
+ // All-one mask is modelled as no-mask following the convention for masked
+ // load/store/gather/scatter. Initialize BlockMask to no-mask.
+ VPValue *BlockMask = nullptr;
+
+ if (OrigLoop->getHeader() == BB) {
+ if (!CM.blockNeedsPredication(BB))
+ return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one.
+
// Create the block in mask as the first non-phi instruction in the block.
VPBuilder::InsertPointGuard Guard(Builder);
auto NewInsertionPoint = Builder.getInsertBlock()->getFirstNonPhi();
Builder.setInsertPoint(Builder.getInsertBlock(), NewInsertionPoint);
- // Introduce the early-exit compare IV <= BTC to form header block mask.
- // This is used instead of IV < TC because TC may wrap, unlike BTC.
- // Start by constructing the desired canonical IV.
- VPValue *IV = nullptr;
- if (Legal->getPrimaryInduction())
+ // Introduce the early-exit compare IV <= BTC to form header block mask.
+ // This is used instead of IV < TC because TC may wrap, unlike BTC.
+ // Start by constructing the desired canonical IV.
+ VPValue *IV = nullptr;
+ if (Legal->getPrimaryInduction())
IV = Plan->getOrAddVPValue(Legal->getPrimaryInduction());
- else {
- auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ else {
+ auto IVRecipe = new VPWidenCanonicalIVRecipe();
Builder.getInsertBlock()->insert(IVRecipe, NewInsertionPoint);
- IV = IVRecipe->getVPValue();
- }
- VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- bool TailFolded = !CM.isScalarEpilogueAllowed();
+ IV = IVRecipe->getVPValue();
+ }
+ VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
+ bool TailFolded = !CM.isScalarEpilogueAllowed();
if (TailFolded && CM.TTI.emitGetActiveLaneMask()) {
// While ActiveLaneMask is a binary op that consumes the loop tripcount
@@ -8242,320 +8242,320 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// happen.
BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV});
} else {
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
}
- return BlockMaskCache[BB] = BlockMask;
- }
-
- // This is the block mask. We OR all incoming edges.
- for (auto *Predecessor : predecessors(BB)) {
- VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
- if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
- return BlockMaskCache[BB] = EdgeMask;
-
- if (!BlockMask) { // BlockMask has its initialized nullptr value.
- BlockMask = EdgeMask;
- continue;
- }
-
- BlockMask = Builder.createOr(BlockMask, EdgeMask);
- }
-
- return BlockMaskCache[BB] = BlockMask;
-}
-
+ return BlockMaskCache[BB] = BlockMask;
+ }
+
+ // This is the block mask. We OR all incoming edges.
+ for (auto *Predecessor : predecessors(BB)) {
+ VPValue *EdgeMask = createEdgeMask(Predecessor, BB, Plan);
+ if (!EdgeMask) // Mask of predecessor is all-one so mask of block is too.
+ return BlockMaskCache[BB] = EdgeMask;
+
+ if (!BlockMask) { // BlockMask has its initialized nullptr value.
+ BlockMask = EdgeMask;
+ continue;
+ }
+
+ BlockMask = Builder.createOr(BlockMask, EdgeMask);
+ }
+
+ return BlockMaskCache[BB] = BlockMask;
+}
+
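An illustrative-only sketch of the mask construction above, with plain bools standing in for per-lane mask values: a block's mask is the OR of its incoming edge masks, an edge mask is the source block's mask AND'ed with the (possibly negated) branch condition, and "no mask" (all-ones) is modelled as an absent value. The header's real mask (IV <= BTC or an active-lane-mask) is omitted; it stays all-one here and cuts the recursion.

#include <map>
#include <optional>
#include <utility>
#include <vector>

struct Block {
  std::vector<std::pair<const Block *, bool>> Preds; // (predecessor, reached on true?)
  bool Cond = true; // stand-in for the predecessor's per-lane branch condition
};

using MaskCache = std::map<const Block *, std::optional<bool>>;

std::optional<bool> blockInMask(const Block *BB, const Block *Header, MaskCache &Cache);

// Edge mask: source mask AND'ed with the branch condition (negated for the
// false successor); an absent source mask means all-one.
std::optional<bool> edgeMask(const Block *Src, bool TakenOnTrue, const Block *Header,
                             MaskCache &Cache) {
  std::optional<bool> SrcMask = blockInMask(Src, Header, Cache);
  bool Edge = TakenOnTrue ? Src->Cond : !Src->Cond;
  if (!SrcMask)
    return Edge;            // select(SrcMask, Edge, false) with an all-one SrcMask
  return *SrcMask && Edge;
}

// Block mask: cached, all-one for the header here, otherwise the OR of all
// incoming edge masks.
std::optional<bool> blockInMask(const Block *BB, const Block *Header, MaskCache &Cache) {
  auto It = Cache.find(BB);
  if (It != Cache.end())
    return It->second;
  std::optional<bool> Mask; // nullopt models the all-one ("no") mask
  if (BB != Header) {
    Mask = false;
    for (const auto &P : BB->Preds) {
      std::optional<bool> E = edgeMask(P.first, P.second, Header, Cache);
      if (!E) {             // one all-one edge makes the whole block all-one
        Mask.reset();
        break;
      }
      Mask = *Mask || *E;
    }
  }
  return Cache[BB] = Mask;
}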
VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan) {
- assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
- "Must be called with either a load or store");
-
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Must be called with either a load or store");
+
auto willWiden = [&](ElementCount VF) -> bool {
if (VF.isScalar())
- return false;
- LoopVectorizationCostModel::InstWidening Decision =
- CM.getWideningDecision(I, VF);
- assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
- "CM decision should be taken at this point.");
- if (Decision == LoopVectorizationCostModel::CM_Interleave)
- return true;
- if (CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF))
- return false;
- return Decision != LoopVectorizationCostModel::CM_Scalarize;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
-
- VPValue *Mask = nullptr;
- if (Legal->isMaskRequired(I))
- Mask = createBlockInMask(I->getParent(), Plan);
-
- VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
- if (LoadInst *Load = dyn_cast<LoadInst>(I))
- return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
-
- StoreInst *Store = cast<StoreInst>(I);
- VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
- return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
-}
-
-VPWidenIntOrFpInductionRecipe *
+ return false;
+ LoopVectorizationCostModel::InstWidening Decision =
+ CM.getWideningDecision(I, VF);
+ assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+ "CM decision should be taken at this point.");
+ if (Decision == LoopVectorizationCostModel::CM_Interleave)
+ return true;
+ if (CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF))
+ return false;
+ return Decision != LoopVectorizationCostModel::CM_Scalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ VPValue *Mask = nullptr;
+ if (Legal->isMaskRequired(I))
+ Mask = createBlockInMask(I->getParent(), Plan);
+
+ VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
+ if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
+
+ StoreInst *Store = cast<StoreInst>(I);
+ VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
+ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
+}
+
+VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi, VPlan &Plan) const {
- // Check if this is an integer or fp induction. If so, build the recipe that
- // produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
return new VPWidenIntOrFpInductionRecipe(Phi, Start);
}
-
- return nullptr;
-}
-
-VPWidenIntOrFpInductionRecipe *
+
+ return nullptr;
+}
+
+VPWidenIntOrFpInductionRecipe *
VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
VPlan &Plan) const {
- // Optimize the special case where the source is a constant integer
- // induction variable. Notice that we can only optimize the 'trunc' case
- // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
- // (c) other casts depend on pointer size.
-
- // Determine whether \p K is a truncation based on an induction variable that
- // can be optimized.
- auto isOptimizableIVTruncate =
+ // Optimize the special case where the source is a constant integer
+ // induction variable. Notice that we can only optimize the 'trunc' case
+ // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
+ // (c) other casts depend on pointer size.
+
+ // Determine whether \p K is a truncation based on an induction variable that
+ // can be optimized.
+ auto isOptimizableIVTruncate =
[&](Instruction *K) -> std::function<bool(ElementCount)> {
return [=](ElementCount VF) -> bool {
return CM.isOptimizableIVTruncate(K, VF);
};
- };
-
- if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ };
+
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
isOptimizableIVTruncate(I), Range)) {
InductionDescriptor II =
Legal->getInductionVars().lookup(cast<PHINode>(I->getOperand(0)));
VPValue *Start = Plan.getOrAddVPValue(II.getStartValue());
- return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
+ return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
Start, I);
}
- return nullptr;
-}
-
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
- // We know that all PHIs in non-header blocks are converted into selects, so
- // we don't have to worry about the insertion order and we can just use the
- // builder. At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- SmallVector<VPValue *, 2> Operands;
- unsigned NumIncoming = Phi->getNumIncomingValues();
- for (unsigned In = 0; In < NumIncoming; In++) {
- VPValue *EdgeMask =
- createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
- assert((EdgeMask || NumIncoming == 1) &&
- "Multiple predecessors with one having a full mask");
- Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
- if (EdgeMask)
- Operands.push_back(EdgeMask);
- }
- return new VPBlendRecipe(Phi, Operands);
-}
-
-VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
- VPlan &Plan) const {
-
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ return nullptr;
+}
+
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
+ // We know that all PHIs in non-header blocks are converted into selects, so
+ // we don't have to worry about the insertion order and we can just use the
+ // builder. At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ SmallVector<VPValue *, 2> Operands;
+ unsigned NumIncoming = Phi->getNumIncomingValues();
+ for (unsigned In = 0; In < NumIncoming; In++) {
+ VPValue *EdgeMask =
+ createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
+ assert((EdgeMask || NumIncoming == 1) &&
+ "Multiple predecessors with one having a full mask");
+ Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
+ if (EdgeMask)
+ Operands.push_back(EdgeMask);
+ }
+ return new VPBlendRecipe(Phi, Operands);
+}
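
// The operands of the VPBlendRecipe built by tryToBlend are laid out as
// (incoming value, edge mask) pairs in incoming-block order, e.g.
// {In0, Mask0, In1, Mask1, ...}; a mask is only omitted when the phi has a single
// incoming edge whose mask is all-one. VPBlendRecipe::execute (further below) folds
// this operand list back into a chain of selects.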
+
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const {
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[this, CI](ElementCount VF) {
return CM.isScalarWithPredication(CI, VF);
},
- Range);
-
- if (IsPredicated)
- return nullptr;
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ Range);
+
+ if (IsPredicated)
+ return nullptr;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
ID == Intrinsic::pseudoprobe ||
ID == Intrinsic::experimental_noalias_scope_decl))
- return nullptr;
-
+ return nullptr;
+
auto willWiden = [&](ElementCount VF) -> bool {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize = false;
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we use Intrinsic or a usual Call for vectorized
+ // version of the instruction.
+ // Is it beneficial to perform intrinsic call compared to lib call?
+ bool NeedToScalarize = false;
InstructionCost CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
InstructionCost IntrinsicCost = ID ? CM.getVectorIntrinsicCost(CI, VF) : 0;
bool UseVectorIntrinsic = ID && IntrinsicCost <= CallCost;
assert(IntrinsicCost.isValid() && CallCost.isValid() &&
"Cannot have invalid costs while widening");
- return UseVectorIntrinsic || !NeedToScalarize;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return nullptr;
-
- return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
-}
-
-bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
- assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
- !isa<StoreInst>(I) && "Instruction should have been handled earlier");
- // Instruction should be widened, unless it is scalar after vectorization,
- // scalarization is profitable or it is predicated.
+ return UseVectorIntrinsic || !NeedToScalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
+}
+
+bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
+ assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
+ !isa<StoreInst>(I) && "Instruction should have been handled earlier");
+ // Instruction should be widened, unless it is scalar after vectorization,
+ // scalarization is profitable or it is predicated.
auto WillScalarize = [this, I](ElementCount VF) -> bool {
- return CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF) ||
- CM.isScalarWithPredication(I, VF);
- };
- return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
- Range);
-}
-
-VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
- auto IsVectorizableOpcode = [](unsigned Opcode) {
- switch (Opcode) {
- case Instruction::Add:
- case Instruction::And:
- case Instruction::AShr:
- case Instruction::BitCast:
- case Instruction::FAdd:
- case Instruction::FCmp:
- case Instruction::FDiv:
- case Instruction::FMul:
- case Instruction::FNeg:
- case Instruction::FPExt:
- case Instruction::FPToSI:
- case Instruction::FPToUI:
- case Instruction::FPTrunc:
- case Instruction::FRem:
- case Instruction::FSub:
- case Instruction::ICmp:
- case Instruction::IntToPtr:
- case Instruction::LShr:
- case Instruction::Mul:
- case Instruction::Or:
- case Instruction::PtrToInt:
- case Instruction::SDiv:
- case Instruction::Select:
- case Instruction::SExt:
- case Instruction::Shl:
- case Instruction::SIToFP:
- case Instruction::SRem:
- case Instruction::Sub:
- case Instruction::Trunc:
-  // instruction is uniform, in which case generate only the first lane for each
- case Instruction::UIToFP:
- case Instruction::URem:
- case Instruction::Xor:
- case Instruction::ZExt:
- return true;
- }
- return false;
- };
-
- if (!IsVectorizableOpcode(I->getOpcode()))
- return nullptr;
-
- // Success: widen this instruction.
- return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
-}
-
-VPBasicBlock *VPRecipeBuilder::handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan) {
- bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
+ return CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
+ };
+ return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
+ Range);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
+ auto IsVectorizableOpcode = [](unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ case Instruction::And:
+ case Instruction::AShr:
+ case Instruction::BitCast:
+ case Instruction::FAdd:
+ case Instruction::FCmp:
+ case Instruction::FDiv:
+ case Instruction::FMul:
+ case Instruction::FNeg:
+ case Instruction::FPExt:
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ case Instruction::FPTrunc:
+ case Instruction::FRem:
+ case Instruction::FSub:
+ case Instruction::ICmp:
+ case Instruction::IntToPtr:
+ case Instruction::LShr:
+ case Instruction::Mul:
+ case Instruction::Or:
+ case Instruction::PtrToInt:
+ case Instruction::SDiv:
+ case Instruction::Select:
+ case Instruction::SExt:
+ case Instruction::Shl:
+ case Instruction::SIToFP:
+ case Instruction::SRem:
+ case Instruction::Sub:
+ case Instruction::Trunc:
+ case Instruction::UDiv:
+ case Instruction::UIToFP:
+ case Instruction::URem:
+ case Instruction::Xor:
+ case Instruction::ZExt:
+ return true;
+ }
+ return false;
+ };
+
+ if (!IsVectorizableOpcode(I->getOpcode()))
+ return nullptr;
+
+ // Success: widen this instruction.
+ return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
+}
+
+VPBasicBlock *VPRecipeBuilder::handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan) {
+ bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
- Range);
-
- bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
+ Range);
+
+ bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](ElementCount VF) { return CM.isScalarWithPredication(I, VF); },
Range);
-
- auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
- IsUniform, IsPredicated);
- setRecipe(I, Recipe);
+
+ auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
+ IsUniform, IsPredicated);
+ setRecipe(I, Recipe);
Plan->addVPValue(I, Recipe);
-
- // Find if I uses a predicated instruction. If so, it will use its scalar
- // value. Avoid hoisting the insert-element which packs the scalar value into
- // a vector value, as that happens iff all users use the vector value.
- for (auto &Op : I->operands())
- if (auto *PredInst = dyn_cast<Instruction>(Op))
- if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
- PredInst2Recipe[PredInst]->setAlsoPack(false);
-
- // Finalize the recipe for Instr, first if it is not predicated.
- if (!IsPredicated) {
- LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
- VPBB->appendRecipe(Recipe);
- return VPBB;
- }
- LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
- assert(VPBB->getSuccessors().empty() &&
- "VPBB has successors when handling predicated replication.");
- // Record predicated instructions for above packing optimizations.
- PredInst2Recipe[I] = Recipe;
- VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
- VPBlockUtils::insertBlockAfter(Region, VPBB);
- auto *RegSucc = new VPBasicBlock();
- VPBlockUtils::insertBlockAfter(RegSucc, Region);
- return RegSucc;
-}
-
-VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
- VPRecipeBase *PredRecipe,
- VPlanPtr &Plan) {
- // Instructions marked for predication are replicated and placed under an
- // if-then construct to prevent side-effects.
-
- // Generate recipes to compute the block mask for this region.
- VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
-
- // Build the triangular if-then region.
- std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
- assert(Instr->getParent() && "Predicated instruction not in any basic block");
- auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
- auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
+
+ // Find if I uses a predicated instruction. If so, it will use its scalar
+ // value. Avoid hoisting the insert-element which packs the scalar value into
+ // a vector value, as that happens iff all users use the vector value.
+ for (auto &Op : I->operands())
+ if (auto *PredInst = dyn_cast<Instruction>(Op))
+ if (PredInst2Recipe.find(PredInst) != PredInst2Recipe.end())
+ PredInst2Recipe[PredInst]->setAlsoPack(false);
+
+ // Finalize the recipe for Instr, first if it is not predicated.
+ if (!IsPredicated) {
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
+ VPBB->appendRecipe(Recipe);
+ return VPBB;
+ }
+ LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
+ assert(VPBB->getSuccessors().empty() &&
+ "VPBB has successors when handling predicated replication.");
+ // Record predicated instructions for above packing optimizations.
+ PredInst2Recipe[I] = Recipe;
+ VPBlockBase *Region = createReplicateRegion(I, Recipe, Plan);
+ VPBlockUtils::insertBlockAfter(Region, VPBB);
+ auto *RegSucc = new VPBasicBlock();
+ VPBlockUtils::insertBlockAfter(RegSucc, Region);
+ return RegSucc;
+}
+
+VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
+ VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan) {
+ // Instructions marked for predication are replicated and placed under an
+ // if-then construct to prevent side-effects.
+
+ // Generate recipes to compute the block mask for this region.
+ VPValue *BlockInMask = createBlockInMask(Instr->getParent(), Plan);
+
+ // Build the triangular if-then region.
+ std::string RegionName = (Twine("pred.") + Instr->getOpcodeName()).str();
+ assert(Instr->getParent() && "Predicated instruction not in any basic block");
+ auto *BOMRecipe = new VPBranchOnMaskRecipe(BlockInMask);
+ auto *Entry = new VPBasicBlock(Twine(RegionName) + ".entry", BOMRecipe);
auto *PHIRecipe = Instr->getType()->isVoidTy()
? nullptr
: new VPPredInstPHIRecipe(Plan->getOrAddVPValue(Instr));
- auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
- auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
- VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
-
- // Note: first set Entry as region entry and then connect successors starting
- // from it in order, to propagate the "parent" of each VPBasicBlock.
- VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
- VPBlockUtils::connectBlocks(Pred, Exit);
-
- return Region;
-}
-
-VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
- VFRange &Range,
- VPlanPtr &Plan) {
- // First, check for specific widening recipes that deal with calls, memory
- // operations, inductions and Phi nodes.
- if (auto *CI = dyn_cast<CallInst>(Instr))
- return tryToWidenCall(CI, Range, *Plan);
-
- if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
- return tryToWidenMemory(Instr, Range, Plan);
-
- VPRecipeBase *Recipe;
- if (auto Phi = dyn_cast<PHINode>(Instr)) {
- if (Phi->getParent() != OrigLoop->getHeader())
- return tryToBlend(Phi, Plan);
+ auto *Exit = new VPBasicBlock(Twine(RegionName) + ".continue", PHIRecipe);
+ auto *Pred = new VPBasicBlock(Twine(RegionName) + ".if", PredRecipe);
+ VPRegionBlock *Region = new VPRegionBlock(Entry, Exit, RegionName, true);
+
+ // Note: first set Entry as region entry and then connect successors starting
+ // from it in order, to propagate the "parent" of each VPBasicBlock.
+ VPBlockUtils::insertTwoBlocksAfter(Pred, Exit, BlockInMask, Entry);
+ VPBlockUtils::connectBlocks(Pred, Exit);
+
+ return Region;
+}
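
// The region built by createReplicateRegion is a triangle: <name>.entry holds the
// BranchOnMask recipe and branches either into <name>.if, which holds the replicated
// (predicated) recipe, or straight to <name>.continue, which holds the PredInstPHI
// recipe merging the two paths whenever the instruction produces a value. Roughly:
//
//        pred.<opcode>.entry
//          /               \
//   pred.<opcode>.if        |
//          \               /
//        pred.<opcode>.continue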
+
+VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ // First, check for specific widening recipes that deal with calls, memory
+ // operations, inductions and Phi nodes.
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return tryToWidenCall(CI, Range, *Plan);
+
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return tryToWidenMemory(Instr, Range, Plan);
+
+ VPRecipeBase *Recipe;
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Phi->getParent() != OrigLoop->getHeader())
+ return tryToBlend(Phi, Plan);
if ((Recipe = tryToOptimizeInductionPHI(Phi, *Plan)))
- return Recipe;
+ return Recipe;
if (Legal->isReductionVariable(Phi)) {
RecurrenceDescriptor &RdxDesc = Legal->getReductionVars()[Phi];
@@ -8564,93 +8564,93 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
return new VPWidenPHIRecipe(Phi, RdxDesc, *StartV);
}
- return new VPWidenPHIRecipe(Phi);
- }
-
+ return new VPWidenPHIRecipe(Phi);
+ }
+
if (isa<TruncInst>(Instr) && (Recipe = tryToOptimizeInductionTruncate(
cast<TruncInst>(Instr), Range, *Plan)))
- return Recipe;
-
- if (!shouldWiden(Instr, Range))
- return nullptr;
-
- if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
- return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
- OrigLoop);
-
- if (auto *SI = dyn_cast<SelectInst>(Instr)) {
- bool InvariantCond =
- PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
- return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
- InvariantCond);
- }
-
- return tryToWiden(Instr, *Plan);
-}
-
+ return Recipe;
+
+ if (!shouldWiden(Instr, Range))
+ return nullptr;
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
+ return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
+ OrigLoop);
+
+ if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ bool InvariantCond =
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
+ return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
+ InvariantCond);
+ }
+
+ return tryToWiden(Instr, *Plan);
+}
+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
ElementCount MaxVF) {
assert(OrigLoop->isInnermost() && "Inner loop expected.");
-
- // Collect instructions from the original loop that will become trivially dead
- // in the vectorized loop. We don't need to vectorize these instructions. For
- // example, original induction update instructions can become dead because we
- // separately emit induction "steps" when generating code for the new loop.
- // Similarly, we create a new latch condition when setting up the structure
- // of the new loop, so the old one can become dead.
- SmallPtrSet<Instruction *, 4> DeadInstructions;
- collectTriviallyDeadInstructions(DeadInstructions);
-
- // Add assume instructions we need to drop to DeadInstructions, to prevent
- // them from being added to the VPlan.
-  // TODO: We only need to drop assumes in blocks that get flattened. If the
- // control flow is preserved, we should keep them.
- auto &ConditionalAssumes = Legal->getConditionalAssumes();
- DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
-
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
- // Dead instructions do not need sinking. Remove them from SinkAfter.
- for (Instruction *I : DeadInstructions)
- SinkAfter.erase(I);
-
+
+ // Collect instructions from the original loop that will become trivially dead
+ // in the vectorized loop. We don't need to vectorize these instructions. For
+ // example, original induction update instructions can become dead because we
+ // separately emit induction "steps" when generating code for the new loop.
+ // Similarly, we create a new latch condition when setting up the structure
+ // of the new loop, so the old one can become dead.
+ SmallPtrSet<Instruction *, 4> DeadInstructions;
+ collectTriviallyDeadInstructions(DeadInstructions);
+
+ // Add assume instructions we need to drop to DeadInstructions, to prevent
+ // them from being added to the VPlan.
+  // TODO: We only need to drop assumes in blocks that get flattened. If the
+ // control flow is preserved, we should keep them.
+ auto &ConditionalAssumes = Legal->getConditionalAssumes();
+ DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ // Dead instructions do not need sinking. Remove them from SinkAfter.
+ for (Instruction *I : DeadInstructions)
+ SinkAfter.erase(I);
+
auto MaxVFPlusOne = MaxVF.getWithIncrement(1);
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) {
VFRange SubRange = {VF, MaxVFPlusOne};
VPlans.push_back(
buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter));
- VF = SubRange.End;
- }
-}
-
-VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
+ VF = SubRange.End;
+ }
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Instruction *> &DeadInstructions,
- const DenseMap<Instruction *, Instruction *> &SinkAfter) {
-
- // Hold a mapping from predicated instructions to their recipes, in order to
- // fix their AlsoPack behavior if a user is determined to replicate and use a
- // scalar instead of vector value.
- DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
-
- SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
-
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
-
- // ---------------------------------------------------------------------------
- // Pre-construction: record ingredients whose recipes we'll need to further
- // process after constructing the initial VPlan.
- // ---------------------------------------------------------------------------
-
- // Mark instructions we'll need to sink later and their targets as
- // ingredients whose recipe we'll need to record.
- for (auto &Entry : SinkAfter) {
- RecipeBuilder.recordRecipeOf(Entry.first);
- RecipeBuilder.recordRecipeOf(Entry.second);
- }
+ const DenseMap<Instruction *, Instruction *> &SinkAfter) {
+
+ // Hold a mapping from predicated instructions to their recipes, in order to
+ // fix their AlsoPack behavior if a user is determined to replicate and use a
+ // scalar instead of vector value.
+ DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
+
+ SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
+
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
+
+ // ---------------------------------------------------------------------------
+ // Pre-construction: record ingredients whose recipes we'll need to further
+ // process after constructing the initial VPlan.
+ // ---------------------------------------------------------------------------
+
+ // Mark instructions we'll need to sink later and their targets as
+ // ingredients whose recipe we'll need to record.
+ for (auto &Entry : SinkAfter) {
+ RecipeBuilder.recordRecipeOf(Entry.first);
+ RecipeBuilder.recordRecipeOf(Entry.second);
+ }
for (auto &Reduction : CM.getInLoopReductionChains()) {
PHINode *Phi = Reduction.first;
RecurKind Kind = Legal->getReductionVars()[Phi].getRecurrenceKind();
const SmallVector<Instruction *, 4> &ReductionOperations = Reduction.second;
-
+
RecipeBuilder.recordRecipeOf(Phi);
for (auto &R : ReductionOperations) {
RecipeBuilder.recordRecipeOf(R);
@@ -8661,100 +8661,100 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
}
}
- // For each interleave group which is relevant for this (possibly trimmed)
- // Range, add it to the set of groups to be later applied to the VPlan and add
- // placeholders for its members' Recipes which we'll be replacing with a
- // single VPInterleaveRecipe.
- for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
+ // For each interleave group which is relevant for this (possibly trimmed)
+ // Range, add it to the set of groups to be later applied to the VPlan and add
+ // placeholders for its members' Recipes which we'll be replacing with a
+ // single VPInterleaveRecipe.
+ for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
auto applyIG = [IG, this](ElementCount VF) -> bool {
return (VF.isVector() && // Query is illegal for VF == 1
- CM.getWideningDecision(IG->getInsertPos(), VF) ==
- LoopVectorizationCostModel::CM_Interleave);
- };
- if (!getDecisionAndClampRange(applyIG, Range))
- continue;
- InterleaveGroups.insert(IG);
- for (unsigned i = 0; i < IG->getFactor(); i++)
- if (Instruction *Member = IG->getMember(i))
- RecipeBuilder.recordRecipeOf(Member);
- };
-
- // ---------------------------------------------------------------------------
- // Build initial VPlan: Scan the body of the loop in a topological order to
- // visit each basic block after having visited its predecessor basic blocks.
- // ---------------------------------------------------------------------------
-
- // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
- auto Plan = std::make_unique<VPlan>();
- VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- Plan->setEntry(VPBB);
-
- // Scan the body of the loop in a topological order to visit each basic block
- // after having visited its predecessor basic blocks.
- LoopBlocksDFS DFS(OrigLoop);
- DFS.perform(LI);
-
- for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
- // Relevant instructions from basic block BB will be grouped into VPRecipe
- // ingredients and fill a new VPBasicBlock.
- unsigned VPBBsForBB = 0;
- auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
- VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
- VPBB = FirstVPBBForBB;
- Builder.setInsertPoint(VPBB);
-
- // Introduce each ingredient into VPlan.
-    // TODO: Model and preserve debug intrinsics in VPlan.
- for (Instruction &I : BB->instructionsWithoutDebug()) {
- Instruction *Instr = &I;
-
- // First filter out irrelevant instructions, to ensure no recipes are
- // built for them.
- if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
- continue;
-
- if (auto Recipe =
- RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ CM.getWideningDecision(IG->getInsertPos(), VF) ==
+ LoopVectorizationCostModel::CM_Interleave);
+ };
+ if (!getDecisionAndClampRange(applyIG, Range))
+ continue;
+ InterleaveGroups.insert(IG);
+ for (unsigned i = 0; i < IG->getFactor(); i++)
+ if (Instruction *Member = IG->getMember(i))
+ RecipeBuilder.recordRecipeOf(Member);
+ };
+
+ // ---------------------------------------------------------------------------
+ // Build initial VPlan: Scan the body of the loop in a topological order to
+ // visit each basic block after having visited its predecessor basic blocks.
+ // ---------------------------------------------------------------------------
+
+ // Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ auto Plan = std::make_unique<VPlan>();
+ VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
+ Plan->setEntry(VPBB);
+
+ // Scan the body of the loop in a topological order to visit each basic block
+ // after having visited its predecessor basic blocks.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
+
+ for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
+ // Relevant instructions from basic block BB will be grouped into VPRecipe
+ // ingredients and fill a new VPBasicBlock.
+ unsigned VPBBsForBB = 0;
+ auto *FirstVPBBForBB = new VPBasicBlock(BB->getName());
+ VPBlockUtils::insertBlockAfter(FirstVPBBForBB, VPBB);
+ VPBB = FirstVPBBForBB;
+ Builder.setInsertPoint(VPBB);
+
+ // Introduce each ingredient into VPlan.
+    // TODO: Model and preserve debug intrinsics in VPlan.
+ for (Instruction &I : BB->instructionsWithoutDebug()) {
+ Instruction *Instr = &I;
+
+ // First filter out irrelevant instructions, to ensure no recipes are
+ // built for them.
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
+ continue;
+
+ if (auto Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
for (auto *Def : Recipe->definedValues()) {
auto *UV = Def->getUnderlyingValue();
Plan->addVPValue(UV, Def);
}
- RecipeBuilder.setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- continue;
- }
-
- // Otherwise, if all widening options failed, Instruction is to be
- // replicated. This may create a successor for VPBB.
- VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
- Instr, Range, VPBB, PredInst2Recipe, Plan);
- if (NextVPBB != VPBB) {
- VPBB = NextVPBB;
- VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
- : "");
- }
- }
- }
-
- // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
- // may also be empty, such as the last one VPBB, reflecting original
- // basic-blocks with no recipes.
- VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
- assert(PreEntry->empty() && "Expecting empty pre-entry block.");
- VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
- VPBlockUtils::disconnectBlocks(PreEntry, Entry);
- delete PreEntry;
-
- // ---------------------------------------------------------------------------
- // Transform initial VPlan: Apply previously taken decisions, in order, to
- // bring the VPlan to its final state.
- // ---------------------------------------------------------------------------
-
- // Apply Sink-After legal constraints.
- for (auto &Entry : SinkAfter) {
- VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
- VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ VPBB->appendRecipe(Recipe);
+ continue;
+ }
+
+ // Otherwise, if all widening options failed, Instruction is to be
+ // replicated. This may create a successor for VPBB.
+ VPBasicBlock *NextVPBB = RecipeBuilder.handleReplication(
+ Instr, Range, VPBB, PredInst2Recipe, Plan);
+ if (NextVPBB != VPBB) {
+ VPBB = NextVPBB;
+ VPBB->setName(BB->hasName() ? BB->getName() + "." + Twine(VPBBsForBB++)
+ : "");
+ }
+ }
+ }
+
+ // Discard empty dummy pre-entry VPBasicBlock. Note that other VPBasicBlocks
+ // may also be empty, such as the last one VPBB, reflecting original
+ // basic-blocks with no recipes.
+ VPBasicBlock *PreEntry = cast<VPBasicBlock>(Plan->getEntry());
+ assert(PreEntry->empty() && "Expecting empty pre-entry block.");
+ VPBlockBase *Entry = Plan->setEntry(PreEntry->getSingleSuccessor());
+ VPBlockUtils::disconnectBlocks(PreEntry, Entry);
+ delete PreEntry;
+
+ // ---------------------------------------------------------------------------
+ // Transform initial VPlan: Apply previously taken decisions, in order, to
+ // bring the VPlan to its final state.
+ // ---------------------------------------------------------------------------
+
+ // Apply Sink-After legal constraints.
+ for (auto &Entry : SinkAfter) {
+ VPRecipeBase *Sink = RecipeBuilder.getRecipe(Entry.first);
+ VPRecipeBase *Target = RecipeBuilder.getRecipe(Entry.second);
// If the target is in a replication region, make sure to move Sink to the
// block after it, not into the replication region itself.
if (auto *Region =
@@ -8767,26 +8767,26 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
continue;
}
}
- Sink->moveAfter(Target);
- }
-
- // Interleave memory: for each Interleave Group we marked earlier as relevant
- // for this VPlan, replace the Recipes widening its memory instructions with a
- // single VPInterleaveRecipe at its insertion point.
- for (auto IG : InterleaveGroups) {
- auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
- RecipeBuilder.getRecipe(IG->getInsertPos()));
+ Sink->moveAfter(Target);
+ }
+
+ // Interleave memory: for each Interleave Group we marked earlier as relevant
+ // for this VPlan, replace the Recipes widening its memory instructions with a
+ // single VPInterleaveRecipe at its insertion point.
+ for (auto IG : InterleaveGroups) {
+ auto *Recipe = cast<VPWidenMemoryInstructionRecipe>(
+ RecipeBuilder.getRecipe(IG->getInsertPos()));
SmallVector<VPValue *, 4> StoredValues;
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (auto *SI = dyn_cast_or_null<StoreInst>(IG->getMember(i)))
StoredValues.push_back(Plan->getOrAddVPValue(SI->getOperand(0)));
-
+
auto *VPIG = new VPInterleaveRecipe(IG, Recipe->getAddr(), StoredValues,
Recipe->getMask());
VPIG->insertBefore(Recipe);
unsigned J = 0;
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *Member = IG->getMember(i)) {
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *Member = IG->getMember(i)) {
if (!Member->getType()->isVoidTy()) {
VPValue *OriginalV = Plan->getVPValue(Member);
Plan->removeVPValueFor(Member);
@@ -8794,78 +8794,78 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
OriginalV->replaceAllUsesWith(VPIG->getVPValue(J));
J++;
}
- RecipeBuilder.getRecipe(Member)->eraseFromParent();
- }
- }
-
+ RecipeBuilder.getRecipe(Member)->eraseFromParent();
+ }
+ }
+
// Adjust the recipes for any inloop reductions.
if (Range.Start.isVector())
adjustRecipesForInLoopReductions(Plan, RecipeBuilder);
- // Finally, if tail is folded by masking, introduce selects between the phi
- // and the live-out instruction of each reduction, at the end of the latch.
+ // Finally, if tail is folded by masking, introduce selects between the phi
+ // and the live-out instruction of each reduction, at the end of the latch.
if (CM.foldTailByMasking() && !Legal->getReductionVars().empty()) {
- Builder.setInsertPoint(VPBB);
- auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : Legal->getReductionVars()) {
+ Builder.setInsertPoint(VPBB);
+ auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
+ for (auto &Reduction : Legal->getReductionVars()) {
if (CM.isInLoopReduction(Reduction.first))
continue;
VPValue *Phi = Plan->getOrAddVPValue(Reduction.first);
VPValue *Red = Plan->getOrAddVPValue(Reduction.second.getLoopExitInstr());
- Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
- }
- }
-
- std::string PlanName;
- raw_string_ostream RSO(PlanName);
+ Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
+ }
+ }
+
+ std::string PlanName;
+ raw_string_ostream RSO(PlanName);
ElementCount VF = Range.Start;
- Plan->addVF(VF);
- RSO << "Initial VPlan for VF={" << VF;
+ Plan->addVF(VF);
+ RSO << "Initial VPlan for VF={" << VF;
for (VF *= 2; ElementCount::isKnownLT(VF, Range.End); VF *= 2) {
- Plan->addVF(VF);
- RSO << "," << VF;
- }
- RSO << "},UF>=1";
- RSO.flush();
- Plan->setName(PlanName);
-
- return Plan;
-}
-
-VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
- // Outer loop handling: They may require CFG and instruction level
- // transformations before even evaluating whether vectorization is profitable.
- // Since we cannot modify the incoming IR, we need to build VPlan upfront in
- // the vectorization pipeline.
+ Plan->addVF(VF);
+ RSO << "," << VF;
+ }
+ RSO << "},UF>=1";
+ RSO.flush();
+ Plan->setName(PlanName);
+
+ return Plan;
+}
+
+VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
+ // Outer loop handling: They may require CFG and instruction level
+ // transformations before even evaluating whether vectorization is profitable.
+ // Since we cannot modify the incoming IR, we need to build VPlan upfront in
+ // the vectorization pipeline.
assert(!OrigLoop->isInnermost());
- assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
-
- // Create new empty VPlan
- auto Plan = std::make_unique<VPlan>();
-
- // Build hierarchical CFG
- VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
- HCFGBuilder.buildHierarchicalCFG();
-
+ assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
+
+ // Create new empty VPlan
+ auto Plan = std::make_unique<VPlan>();
+
+ // Build hierarchical CFG
+ VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
+ HCFGBuilder.buildHierarchicalCFG();
+
for (ElementCount VF = Range.Start; ElementCount::isKnownLT(VF, Range.End);
VF *= 2)
- Plan->addVF(VF);
-
- if (EnableVPlanPredication) {
- VPlanPredicator VPP(*Plan);
- VPP.predicate();
-
- // Avoid running transformation to recipes until masked code generation in
- // VPlan-native path is in place.
- return Plan;
- }
-
- SmallPtrSet<Instruction *, 1> DeadInstructions;
- VPlanTransforms::VPInstructionsToVPRecipes(
- OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
- return Plan;
-}
-
+ Plan->addVF(VF);
+
+ if (EnableVPlanPredication) {
+ VPlanPredicator VPP(*Plan);
+ VPP.predicate();
+
+ // Avoid running transformation to recipes until masked code generation in
+ // VPlan-native path is in place.
+ return Plan;
+ }
+
+ SmallPtrSet<Instruction *, 1> DeadInstructions;
+ VPlanTransforms::VPInstructionsToVPRecipes(
+ OrigLoop, Plan, Legal->getInductionVars(), DeadInstructions);
+ return Plan;
+}
+
// Adjust the recipes for any inloop reductions. The chain of instructions
// leading from the loop exit instr to the phi need to be converted to
// reductions, with one operand being vector and the other being the scalar
@@ -8927,109 +8927,109 @@ void LoopVectorizationPlanner::adjustRecipesForInLoopReductions(
}
}
-Value* LoopVectorizationPlanner::VPCallbackILV::
-getOrCreateVectorValues(Value *V, unsigned Part) {
- return ILV.getOrCreateVectorValue(V, Part);
-}
-
-Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
- Value *V, const VPIteration &Instance) {
- return ILV.getOrCreateScalarValue(V, Instance);
-}
-
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
- O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
- IG->getInsertPos()->printAsOperand(O, false);
- O << ", ";
- getAddr()->printAsOperand(O, SlotTracker);
- VPValue *Mask = getMask();
- if (Mask) {
- O << ", ";
- Mask->printAsOperand(O, SlotTracker);
- }
- for (unsigned i = 0; i < IG->getFactor(); ++i)
- if (Instruction *I = IG->getMember(i))
- O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
-}
-
-void VPWidenCallRecipe::execute(VPTransformState &State) {
+Value* LoopVectorizationPlanner::VPCallbackILV::
+getOrCreateVectorValues(Value *V, unsigned Part) {
+ return ILV.getOrCreateVectorValue(V, Part);
+}
+
+Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
+ Value *V, const VPIteration &Instance) {
+ return ILV.getOrCreateScalarValue(V, Instance);
+}
+
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+ IG->getInsertPos()->printAsOperand(O, false);
+ O << ", ";
+ getAddr()->printAsOperand(O, SlotTracker);
+ VPValue *Mask = getMask();
+ if (Mask) {
+ O << ", ";
+ Mask->printAsOperand(O, SlotTracker);
+ }
+ for (unsigned i = 0; i < IG->getFactor(); ++i)
+ if (Instruction *I = IG->getMember(i))
+ O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
+}
+
+void VPWidenCallRecipe::execute(VPTransformState &State) {
State.ILV->widenCallInstruction(*cast<CallInst>(getUnderlyingInstr()), this,
*this, State);
-}
-
-void VPWidenSelectRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenSelectRecipe::execute(VPTransformState &State) {
State.ILV->widenSelectInstruction(*cast<SelectInst>(getUnderlyingInstr()),
this, *this, InvariantCond, State);
-}
-
-void VPWidenRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenRecipe::execute(VPTransformState &State) {
State.ILV->widenInstruction(*getUnderlyingInstr(), this, *this, State);
-}
-
-void VPWidenGEPRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenGEPRecipe::execute(VPTransformState &State) {
State.ILV->widenGEP(cast<GetElementPtrInst>(getUnderlyingInstr()), this,
*this, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant, State);
-}
-
-void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Int or FP induction being replicated.");
+ IsIndexLoopInvariant, State);
+}
+
+void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Int or FP induction being replicated.");
State.ILV->widenIntOrFpInduction(IV, getStartValue()->getLiveInIRValue(),
Trunc);
-}
-
-void VPWidenPHIRecipe::execute(VPTransformState &State) {
+}
+
+void VPWidenPHIRecipe::execute(VPTransformState &State) {
Value *StartV =
getStartValue() ? getStartValue()->getLiveInIRValue() : nullptr;
State.ILV->widenPHIInstruction(Phi, RdxDesc, StartV, State.UF, State.VF);
-}
-
-void VPBlendRecipe::execute(VPTransformState &State) {
- State.ILV->setDebugLocFromInst(State.Builder, Phi);
- // We know that all PHIs in non-header blocks are converted into
- // selects, so we don't have to worry about the insertion order and we
- // can just use the builder.
- // At this point we generate the predication tree. There may be
- // duplications since this is a simple recursive scan, but future
- // optimizations will clean it up.
-
- unsigned NumIncoming = getNumIncomingValues();
-
- // Generate a sequence of selects of the form:
- // SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // SELECT(Mask1, In1,
- // In0)))
- // Note that Mask0 is never used: lanes for which no path reaches this phi and
- // are essentially undef are taken from In0.
- InnerLoopVectorizer::VectorParts Entry(State.UF);
- for (unsigned In = 0; In < NumIncoming; ++In) {
- for (unsigned Part = 0; Part < State.UF; ++Part) {
- // We might have single edge PHIs (blocks) - use an identity
- // 'select' for the first PHI operand.
- Value *In0 = State.get(getIncomingValue(In), Part);
- if (In == 0)
- Entry[Part] = In0; // Initialize with the first incoming value.
- else {
- // Select between the current value and the previous incoming edge
- // based on the incoming mask.
- Value *Cond = State.get(getMask(In), Part);
- Entry[Part] =
- State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
- }
- }
- }
- for (unsigned Part = 0; Part < State.UF; ++Part)
- State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
-}
-
-void VPInterleaveRecipe::execute(VPTransformState &State) {
- assert(!State.Instance && "Interleave group being replicated.");
+}
+
+void VPBlendRecipe::execute(VPTransformState &State) {
+ State.ILV->setDebugLocFromInst(State.Builder, Phi);
+ // We know that all PHIs in non-header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+
+ unsigned NumIncoming = getNumIncomingValues();
+
+ // Generate a sequence of selects of the form:
+ // SELECT(Mask3, In3,
+ // SELECT(Mask2, In2,
+ // SELECT(Mask1, In1,
+ // In0)))
+ // Note that Mask0 is never used: lanes for which no path reaches this phi and
+ // are essentially undef are taken from In0.
+ InnerLoopVectorizer::VectorParts Entry(State.UF);
+ for (unsigned In = 0; In < NumIncoming; ++In) {
+ for (unsigned Part = 0; Part < State.UF; ++Part) {
+ // We might have single edge PHIs (blocks) - use an identity
+ // 'select' for the first PHI operand.
+ Value *In0 = State.get(getIncomingValue(In), Part);
+ if (In == 0)
+ Entry[Part] = In0; // Initialize with the first incoming value.
+ else {
+ // Select between the current value and the previous incoming edge
+ // based on the incoming mask.
+ Value *Cond = State.get(getMask(In), Part);
+ Entry[Part] =
+ State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
+ }
+ }
+ }
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ State.ValueMap.setVectorValue(Phi, Part, Entry[Part]);
+}
+
+void VPInterleaveRecipe::execute(VPTransformState &State) {
+ assert(!State.Instance && "Interleave group being replicated.");
State.ILV->vectorizeInterleaveGroup(IG, definedValues(), State, getAddr(),
getStoredValues(), getMask());
-}
-
+}
+
void VPReductionRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Reduction being replicated.");
for (unsigned Part = 0; Part < State.UF; ++Part) {
@@ -9062,116 +9062,116 @@ void VPReductionRecipe::execute(VPTransformState &State) {
}
}
-void VPReplicateRecipe::execute(VPTransformState &State) {
- if (State.Instance) { // Generate a single instance.
+void VPReplicateRecipe::execute(VPTransformState &State) {
+ if (State.Instance) { // Generate a single instance.
assert(!State.VF.isScalable() && "Can't scalarize a scalable vector");
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this,
*State.Instance, IsPredicated, State);
- // Insert scalar instance packing it into a vector.
+ // Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF.isVector()) {
// If we're constructing lane 0, initialize to start from poison.
- if (State.Instance->Lane == 0) {
+ if (State.Instance->Lane == 0) {
assert(!State.VF.isScalable() && "VF is assumed to be non scalable.");
Value *Poison = PoisonValue::get(
VectorType::get(getUnderlyingValue()->getType(), State.VF));
State.ValueMap.setVectorValue(getUnderlyingInstr(),
State.Instance->Part, Poison);
- }
+ }
State.ILV->packScalarIntoVectorValue(getUnderlyingInstr(),
*State.Instance);
- }
- return;
- }
-
- // Generate scalar instances for all VF lanes of all UF parts, unless the
- // instruction is uniform inwhich case generate only the first lane for each
- // of the UF parts.
+ }
+ return;
+ }
+
+ // Generate scalar instances for all VF lanes of all UF parts, unless the
+  // instruction is uniform, in which case generate only the first lane for each
+ // of the UF parts.
unsigned EndLane = IsUniform ? 1 : State.VF.getKnownMinValue();
assert((!State.VF.isScalable() || IsUniform) &&
"Can't scalarize a scalable vector");
- for (unsigned Part = 0; Part < State.UF; ++Part)
- for (unsigned Lane = 0; Lane < EndLane; ++Lane)
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ for (unsigned Lane = 0; Lane < EndLane; ++Lane)
State.ILV->scalarizeInstruction(getUnderlyingInstr(), *this, {Part, Lane},
- IsPredicated, State);
-}
-
-void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Branch on Mask works only on single instance.");
-
- unsigned Part = State.Instance->Part;
- unsigned Lane = State.Instance->Lane;
-
- Value *ConditionBit = nullptr;
- VPValue *BlockInMask = getMask();
- if (BlockInMask) {
- ConditionBit = State.get(BlockInMask, Part);
- if (ConditionBit->getType()->isVectorTy())
- ConditionBit = State.Builder.CreateExtractElement(
- ConditionBit, State.Builder.getInt32(Lane));
- } else // Block in mask is all-one.
- ConditionBit = State.Builder.getTrue();
-
- // Replace the temporary unreachable terminator with a new conditional branch,
- // whose two destinations will be set later when they are created.
- auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional branch.");
- auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
-}
-
-void VPPredInstPHIRecipe::execute(VPTransformState &State) {
- assert(State.Instance && "Predicated instruction PHI works per instance.");
+ IsPredicated, State);
+}
+
+void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Branch on Mask works only on single instance.");
+
+ unsigned Part = State.Instance->Part;
+ unsigned Lane = State.Instance->Lane;
+
+ Value *ConditionBit = nullptr;
+ VPValue *BlockInMask = getMask();
+ if (BlockInMask) {
+ ConditionBit = State.get(BlockInMask, Part);
+ if (ConditionBit->getType()->isVectorTy())
+ ConditionBit = State.Builder.CreateExtractElement(
+ ConditionBit, State.Builder.getInt32(Lane));
+ } else // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
+
+ // Replace the temporary unreachable terminator with a new conditional branch,
+ // whose two destinations will be set later when they are created.
+ auto *CurrentTerminator = State.CFG.PrevBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional branch.");
+ auto *CondBr = BranchInst::Create(State.CFG.PrevBB, nullptr, ConditionBit);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+}
+
+void VPPredInstPHIRecipe::execute(VPTransformState &State) {
+ assert(State.Instance && "Predicated instruction PHI works per instance.");
Instruction *ScalarPredInst =
cast<Instruction>(State.get(getOperand(0), *State.Instance));
- BasicBlock *PredicatedBB = ScalarPredInst->getParent();
- BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
- assert(PredicatingBB && "Predicated block has no single predecessor.");
-
- // By current pack/unpack logic we need to generate only a single phi node: if
- // a vector value for the predicated instruction exists at this point it means
- // the instruction has vector users only, and a phi for the vector value is
- // needed. In this case the recipe of the predicated instruction is marked to
- // also do that packing, thereby "hoisting" the insert-element sequence.
- // Otherwise, a phi node for the scalar value is needed.
- unsigned Part = State.Instance->Part;
+ BasicBlock *PredicatedBB = ScalarPredInst->getParent();
+ BasicBlock *PredicatingBB = PredicatedBB->getSinglePredecessor();
+ assert(PredicatingBB && "Predicated block has no single predecessor.");
+
+ // By current pack/unpack logic we need to generate only a single phi node: if
+ // a vector value for the predicated instruction exists at this point it means
+ // the instruction has vector users only, and a phi for the vector value is
+ // needed. In this case the recipe of the predicated instruction is marked to
+ // also do that packing, thereby "hoisting" the insert-element sequence.
+ // Otherwise, a phi node for the scalar value is needed.
+ unsigned Part = State.Instance->Part;
Instruction *PredInst =
cast<Instruction>(getOperand(0)->getUnderlyingValue());
- if (State.ValueMap.hasVectorValue(PredInst, Part)) {
- Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
- InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
- PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
- VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
- VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
- State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
- } else {
- Type *PredInstType = PredInst->getType();
- PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
+ if (State.ValueMap.hasVectorValue(PredInst, Part)) {
+ Value *VectorValue = State.ValueMap.getVectorValue(PredInst, Part);
+ InsertElementInst *IEI = cast<InsertElementInst>(VectorValue);
+ PHINode *VPhi = State.Builder.CreatePHI(IEI->getType(), 2);
+ VPhi->addIncoming(IEI->getOperand(0), PredicatingBB); // Unmodified vector.
+ VPhi->addIncoming(IEI, PredicatedBB); // New vector with inserted element.
+ State.ValueMap.resetVectorValue(PredInst, Part, VPhi); // Update cache.
+ } else {
+ Type *PredInstType = PredInst->getType();
+ PHINode *Phi = State.Builder.CreatePHI(PredInstType, 2);
Phi->addIncoming(PoisonValue::get(ScalarPredInst->getType()), PredicatingBB);
- Phi->addIncoming(ScalarPredInst, PredicatedBB);
- State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
- }
-}
-
-void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
+ Phi->addIncoming(ScalarPredInst, PredicatedBB);
+ State.ValueMap.resetScalarValue(PredInst, *State.Instance, Phi);
+ }
+}
+
+void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
VPValue *StoredValue = isStore() ? getStoredValue() : nullptr;
State.ILV->vectorizeMemoryInstruction(&Ingredient, State,
StoredValue ? nullptr : getVPValue(),
getAddr(), StoredValue, getMask());
-}
-
-// Determine how to lower the scalar epilogue, which depends on 1) optimising
-// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
-// predication, and 4) a TTI hook that analyses whether the loop is suitable
-// for predication.
-static ScalarEpilogueLowering getScalarEpilogueLowering(
- Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
- BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
- AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- LoopVectorizationLegality &LVL) {
- // 1) OptSize takes precedence over all other options, i.e. if this is set,
- // don't look at hints or options, and don't request a scalar epilogue.
+}
+
+// Determine how to lower the scalar epilogue, which depends on 1) optimising
+// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
+// predication, and 4) a TTI hook that analyses whether the loop is suitable
+// for predication.
+static ScalarEpilogueLowering getScalarEpilogueLowering(
+ Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
+ AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ LoopVectorizationLegality &LVL) {
+ // 1) OptSize takes precedence over all other options, i.e. if this is set,
+ // don't look at hints or options, and don't request a scalar epilogue.
// (For PGSO, as shouldOptimizeForSize isn't currently accessible from
// LoopAccessInfo (due to code dependency and not being able to reliably get
// PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
@@ -9181,8 +9181,8 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
if (F->hasOptSize() || (llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI,
PGSOQueryType::IRPass) &&
Hints.getForce() != LoopVectorizeHints::FK_Enabled))
- return CM_ScalarEpilogueNotAllowedOptSize;
-
+ return CM_ScalarEpilogueNotAllowedOptSize;
+
// 2) If set, obey the directives
if (PreferPredicateOverEpilogue.getNumOccurrences()) {
switch (PreferPredicateOverEpilogue) {
@@ -9194,356 +9194,356 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
return CM_ScalarEpilogueNotAllowedUsePredicate;
};
}
-
+
// 3) If set, obey the hints
switch (Hints.getPredicate()) {
case LoopVectorizeHints::FK_Enabled:
return CM_ScalarEpilogueNotNeededUsePredicate;
case LoopVectorizeHints::FK_Disabled:
- return CM_ScalarEpilogueAllowed;
+ return CM_ScalarEpilogueAllowed;
};
-
+
// 4) if the TTI hook indicates this is profitable, request predication.
if (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT,
LVL.getLAI()))
- return CM_ScalarEpilogueNotNeededUsePredicate;
-
- return CM_ScalarEpilogueAllowed;
-}
-
+ return CM_ScalarEpilogueNotNeededUsePredicate;
+
+ return CM_ScalarEpilogueAllowed;
+}
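// A minimal illustration (not part of this patch, hypothetical helper name):
// step 3 above is what a source-level predication hint typically feeds.
// Assuming the usual clang pragma lowering, the loop below carries an
// FK_Enabled predicate hint, so getScalarEpilogueLowering() would return
// CM_ScalarEpilogueNotNeededUsePredicate unless OptSize (step 1) or the
// PreferPredicateOverEpilogue option (step 2) decides first.
void saxpy(float *X, const float *Y, float A, int N) {
#pragma clang loop vectorize_predicate(enable)
  for (int I = 0; I < N; ++I)
    X[I] = A * X[I] + Y[I];
}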
+
void VPTransformState::set(VPValue *Def, Value *IRDef, Value *V,
unsigned Part) {
set(Def, V, Part);
ILV->setVectorValue(IRDef, Part, V);
}
-// Process the loop in the VPlan-native vectorization path. This path builds
-// VPlan upfront in the vectorization pipeline, which allows applying
-// VPlan-to-VPlan transformations from the very beginning without modifying the
-// input LLVM IR.
-static bool processLoopInVPlanNativePath(
- Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
- LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
- TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
- OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
-
+// Process the loop in the VPlan-native vectorization path. This path builds
+// VPlan upfront in the vectorization pipeline, which allows applying
+// VPlan-to-VPlan transformations from the very beginning without modifying the
+// input LLVM IR.
+static bool processLoopInVPlanNativePath(
+ Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
+ LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
+ OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
+ ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+
if (isa<SCEVCouldNotCompute>(PSE.getBackedgeTakenCount())) {
- LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
- return false;
- }
- assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
- Function *F = L->getHeader()->getParent();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
-
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
-
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
- &Hints, IAI);
- // Use the planner for outer loop vectorization.
- // TODO: CM is not used at this point inside the planner. Turn CM into an
- // optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
-
- // Get user vectorization factor.
+ LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+ return false;
+ }
+ assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
+ Function *F = L->getHeader()->getParent();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
+
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL);
+
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
+ &Hints, IAI);
+ // Use the planner for outer loop vectorization.
+ // TODO: CM is not used at this point inside the planner. Turn CM into an
+ // optional argument if we don't need it in the future.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
+
+ // Get user vectorization factor.
ElementCount UserVF = Hints.getWidth();
-
- // Plan how to best vectorize, return the best VF and its cost.
- const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
-
- // If we are stress testing VPlan builds, do not attempt to generate vector
- // code. Masked vector code generation support will follow soon.
- // Also, do not attempt to vectorize if no vector code will be produced.
- if (VPlanBuildStressTest || EnableVPlanPredication ||
- VectorizationFactor::Disabled() == VF)
- return false;
-
- LVP.setBestPlan(VF.Width, 1);
-
- InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
+
+ // If we are stress testing VPlan builds, do not attempt to generate vector
+ // code. Masked vector code generation support will follow soon.
+ // Also, do not attempt to vectorize if no vector code will be produced.
+ if (VPlanBuildStressTest || EnableVPlanPredication ||
+ VectorizationFactor::Disabled() == VF)
+ return false;
+
+ LVP.setBestPlan(VF.Width, 1);
+
+ InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, 1, LVL,
&CM, BFI, PSI);
- LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
- << L->getHeader()->getParent()->getName() << "\"\n");
- LVP.executePlan(LB, DT);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
- return true;
-}
-
-LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
- : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
- !EnableLoopInterleaving),
- VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
- !EnableLoopVectorization) {}
-
-bool LoopVectorizePass::processLoop(Loop *L) {
+ LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
+ << L->getHeader()->getParent()->getName() << "\"\n");
+ LVP.executePlan(LB, DT);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+ return true;
+}
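// A minimal illustration (not part of this patch, hypothetical helper name) of
// the kind of loop nest this path exists for. Assuming the usual clang pragma
// lowering, the explicit vectorize(enable) hint on the *outer* loop, together
// with the EnableVPlanNativePath option, is what lets
// processLoopInVPlanNativePath() see a non-innermost loop at all.
void scaleRows(float **A, int Rows, int Cols) {
#pragma clang loop vectorize(enable)
  for (int I = 0; I < Rows; ++I)
    for (int J = 0; J < Cols; ++J)
      A[I][J] *= 2.0f;
}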
+
+LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
+ : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
+ !EnableLoopInterleaving),
+ VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
+ !EnableLoopVectorization) {}
+
+bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->isInnermost()) &&
- "VPlan-native path is not enabled. Only process inner loops.");
-
-#ifndef NDEBUG
- const std::string DebugLocStr = getDebugLocString(L);
-#endif /* NDEBUG */
-
- LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
- << L->getHeader()->getParent()->getName() << "\" from "
- << DebugLocStr << "\n");
-
- LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
-
- LLVM_DEBUG(
- dbgs() << "LV: Loop hints:"
- << " force="
- << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
- ? "disabled"
- : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
- ? "enabled"
- : "?"))
- << " width=" << Hints.getWidth()
- << " unroll=" << Hints.getInterleave() << "\n");
-
- // Function containing loop
- Function *F = L->getHeader()->getParent();
-
- // Looking at the diagnostic output is the only way to determine if a loop
- // was vectorized (other than looking at the IR or machine code), so it
- // is important to generate an optimization remark for each loop. Most of
- // these messages are generated as OptimizationRemarkAnalysis. Remarks
- // generated as OptimizationRemark and OptimizationRemarkMissed are
- // less verbose reporting vectorized loops and unvectorized loops that may
- // benefit from vectorization, respectively.
-
- if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
- LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
- return false;
- }
-
- PredicatedScalarEvolution PSE(*SE, *L);
-
- // Check if it is legal to vectorize the loop.
- LoopVectorizationRequirements Requirements(*ORE);
- LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
+ "VPlan-native path is not enabled. Only process inner loops.");
+
+#ifndef NDEBUG
+ const std::string DebugLocStr = getDebugLocString(L);
+#endif /* NDEBUG */
+
+ LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in \""
+ << L->getHeader()->getParent()->getName() << "\" from "
+ << DebugLocStr << "\n");
+
+ LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE);
+
+ LLVM_DEBUG(
+ dbgs() << "LV: Loop hints:"
+ << " force="
+ << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
+ ? "disabled"
+ : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
+ ? "enabled"
+ : "?"))
+ << " width=" << Hints.getWidth()
+ << " unroll=" << Hints.getInterleave() << "\n");
+
+ // Function containing loop
+ Function *F = L->getHeader()->getParent();
+
+ // Looking at the diagnostic output is the only way to determine if a loop
+ // was vectorized (other than looking at the IR or machine code), so it
+ // is important to generate an optimization remark for each loop. Most of
+ // these messages are generated as OptimizationRemarkAnalysis. Remarks
+ // generated as OptimizationRemark and OptimizationRemarkMissed are
+ // less verbose reporting vectorized loops and unvectorized loops that may
+ // benefit from vectorization, respectively.
+
+ if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
+ LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
+ return false;
+ }
+
+ PredicatedScalarEvolution PSE(*SE, *L);
+
+ // Check if it is legal to vectorize the loop.
+ LoopVectorizationRequirements Requirements(*ORE);
+ LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, AA, F, GetLAA, LI, ORE,
&Requirements, &Hints, DB, AC, BFI, PSI);
- if (!LVL.canVectorize(EnableVPlanNativePath)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
- // Check the function attributes and profiles to find out if this function
- // should be optimized for size.
- ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
- F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
-
- // Entrance to the VPlan-native vectorization path. Outer loops are processed
- // here. They may require CFG and instruction level transformations before
- // even evaluating whether vectorization is profitable. Since we cannot modify
- // the incoming IR, we need to build VPlan upfront in the vectorization
- // pipeline.
+ if (!LVL.canVectorize(EnableVPlanNativePath)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check the function attributes and profiles to find out if this function
+ // should be optimized for size.
+ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(
+ F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL);
+
+ // Entrance to the VPlan-native vectorization path. Outer loops are processed
+ // here. They may require CFG and instruction level transformations before
+ // even evaluating whether vectorization is profitable. Since we cannot modify
+ // the incoming IR, we need to build VPlan upfront in the vectorization
+ // pipeline.
if (!L->isInnermost())
- return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
- ORE, BFI, PSI, Hints);
-
+ return processLoopInVPlanNativePath(L, PSE, LI, DT, &LVL, TTI, TLI, DB, AC,
+ ORE, BFI, PSI, Hints);
+
assert(L->isInnermost() && "Inner loop expected.");
-
- // Check the loop for a trip count threshold: vectorize loops with a tiny trip
- // count by optimizing for size, to minimize overheads.
- auto ExpectedTC = getSmallBestKnownTC(*SE, L);
- if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
- LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
- << "This loop is worth vectorizing only if no scalar "
- << "iteration overheads are incurred.");
- if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
- LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
- else {
- LLVM_DEBUG(dbgs() << "\n");
- SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
- }
- }
-
- // Check the function attributes to see if implicit floats are allowed.
-  // FIXME: This check doesn't seem like it can be correct -- what if the loop is
- // an integer loop and the vector instructions selected are purely integer
- // vector instructions?
- if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
- reportVectorizationFailure(
- "Can't vectorize when the NoImplicitFloat attribute is used",
- "loop not vectorized due to NoImplicitFloat attribute",
- "NoImplicitFloat", ORE, L);
- Hints.emitRemarkWithHints();
- return false;
- }
-
- // Check if the target supports potentially unsafe FP vectorization.
- // FIXME: Add a check for the type of safety issue (denormal, signaling)
- // for the target we're vectorizing for, to make sure none of the
- // additional fp-math flags can help.
- if (Hints.isPotentiallyUnsafe() &&
- TTI->isFPVectorizationPotentiallyUnsafe()) {
- reportVectorizationFailure(
- "Potentially unsafe FP op prevents vectorization",
- "loop not vectorized due to unsafe FP support.",
- "UnsafeFP", ORE, L);
- Hints.emitRemarkWithHints();
- return false;
- }
-
- bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
- InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
-
- // If an override option has been passed in for interleaved accesses, use it.
- if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
- UseInterleaved = EnableInterleavedMemAccesses;
-
- // Analyze interleaved memory accesses.
- if (UseInterleaved) {
- IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
- }
-
- // Use the cost model.
- LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
- F, &Hints, IAI);
- CM.collectValuesToIgnore();
-
- // Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
-
- // Get user vectorization factor and interleave count.
+
+ // Check the loop for a trip count threshold: vectorize loops with a tiny trip
+ // count by optimizing for size, to minimize overheads.
+ auto ExpectedTC = getSmallBestKnownTC(*SE, L);
+ if (ExpectedTC && *ExpectedTC < TinyTripCountVectorThreshold) {
+ LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
+ << "This loop is worth vectorizing only if no scalar "
+ << "iteration overheads are incurred.");
+ if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
+ LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
+ else {
+ LLVM_DEBUG(dbgs() << "\n");
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
+ }
+ }
+
+ // Check the function attributes to see if implicit floats are allowed.
+  // FIXME: This check doesn't seem like it can be correct -- what if the loop is
+ // an integer loop and the vector instructions selected are purely integer
+ // vector instructions?
+ if (F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ reportVectorizationFailure(
+ "Can't vectorize when the NoImplicitFloat attribute is used",
+ "loop not vectorized due to NoImplicitFloat attribute",
+ "NoImplicitFloat", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ // Check if the target supports potentially unsafe FP vectorization.
+ // FIXME: Add a check for the type of safety issue (denormal, signaling)
+ // for the target we're vectorizing for, to make sure none of the
+ // additional fp-math flags can help.
+ if (Hints.isPotentiallyUnsafe() &&
+ TTI->isFPVectorizationPotentiallyUnsafe()) {
+ reportVectorizationFailure(
+ "Potentially unsafe FP op prevents vectorization",
+ "loop not vectorized due to unsafe FP support.",
+ "UnsafeFP", ORE, L);
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
+ bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
+ InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
+
+ // If an override option has been passed in for interleaved accesses, use it.
+ if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
+ UseInterleaved = EnableInterleavedMemAccesses;
+
+ // Analyze interleaved memory accesses.
+ if (UseInterleaved) {
+ IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI));
+ }
+
+ // Use the cost model.
+ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
+ F, &Hints, IAI);
+ CM.collectValuesToIgnore();
+
+ // Use the planner for vectorization.
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
+
+ // Get user vectorization factor and interleave count.
ElementCount UserVF = Hints.getWidth();
- unsigned UserIC = Hints.getInterleave();
-
- // Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
-
- VectorizationFactor VF = VectorizationFactor::Disabled();
- unsigned IC = 1;
-
- if (MaybeVF) {
- VF = *MaybeVF;
- // Select the interleave count.
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
- }
-
- // Identify the diagnostic messages that should be produced.
- std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
- bool VectorizeLoop = true, InterleaveLoop = true;
- if (Requirements.doesNotMeet(F, L, Hints)) {
- LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
- "requirements.\n");
- Hints.emitRemarkWithHints();
- return false;
- }
-
+ unsigned UserIC = Hints.getInterleave();
+
+ // Plan how to best vectorize, return the best VF and its cost.
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
+
+ VectorizationFactor VF = VectorizationFactor::Disabled();
+ unsigned IC = 1;
+
+ if (MaybeVF) {
+ VF = *MaybeVF;
+ // Select the interleave count.
+ IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
+ }
+
+ // Identify the diagnostic messages that should be produced.
+ std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
+ bool VectorizeLoop = true, InterleaveLoop = true;
+ if (Requirements.doesNotMeet(F, L, Hints)) {
+ LLVM_DEBUG(dbgs() << "LV: Not vectorizing: loop did not meet vectorization "
+ "requirements.\n");
+ Hints.emitRemarkWithHints();
+ return false;
+ }
+
if (VF.Width.isScalar()) {
- LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- VecDiagMsg = std::make_pair(
- "VectorizationNotBeneficial",
- "the cost-model indicates that vectorization is not beneficial");
- VectorizeLoop = false;
- }
-
- if (!MaybeVF && UserIC > 1) {
- // Tell the user interleaving was avoided up-front, despite being explicitly
- // requested.
- LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
- "interleaving should be avoided up front\n");
- IntDiagMsg = std::make_pair(
- "InterleavingAvoided",
- "Ignoring UserIC, because interleaving was avoided up front");
- InterleaveLoop = false;
- } else if (IC == 1 && UserIC <= 1) {
- // Tell the user interleaving is not beneficial.
- LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
- IntDiagMsg = std::make_pair(
- "InterleavingNotBeneficial",
- "the cost-model indicates that interleaving is not beneficial");
- InterleaveLoop = false;
- if (UserIC == 1) {
- IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
- IntDiagMsg.second +=
- " and is explicitly disabled or interleave count is set to 1";
- }
- } else if (IC > 1 && UserIC == 1) {
-    // Tell the user interleaving is beneficial, but it is explicitly disabled.
- LLVM_DEBUG(
- dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
- IntDiagMsg = std::make_pair(
- "InterleavingBeneficialButDisabled",
- "the cost-model indicates that interleaving is beneficial "
- "but is explicitly disabled or interleave count is set to 1");
- InterleaveLoop = false;
- }
-
- // Override IC if user provided an interleave count.
- IC = UserIC > 0 ? UserIC : IC;
-
- // Emit diagnostic messages, if any.
- const char *VAPassName = Hints.vectorizeAnalysisPassName();
- if (!VectorizeLoop && !InterleaveLoop) {
-    // Do not vectorize or interleave the loop.
- ORE->emit([&]() {
- return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second;
- });
- ORE->emit([&]() {
- return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second;
- });
- return false;
- } else if (!VectorizeLoop && InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << VecDiagMsg.second;
- });
- } else if (VectorizeLoop && !InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
- ORE->emit([&]() {
- return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
- L->getStartLoc(), L->getHeader())
- << IntDiagMsg.second;
- });
- } else if (VectorizeLoop && InterleaveLoop) {
- LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
- << ") in " << DebugLocStr << '\n');
- LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
- }
-
- LVP.setBestPlan(VF.Width, IC);
-
- using namespace ore;
- bool DisableRuntimeUnroll = false;
- MDNode *OrigLoopID = L->getLoopID();
-
- if (!VectorizeLoop) {
- assert(IC > 1 && "interleave count should not be 1 or 0");
- // If we decided that it is not legal to vectorize the loop, then
- // interleave it.
+ LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ VecDiagMsg = std::make_pair(
+ "VectorizationNotBeneficial",
+ "the cost-model indicates that vectorization is not beneficial");
+ VectorizeLoop = false;
+ }
+
+ if (!MaybeVF && UserIC > 1) {
+ // Tell the user interleaving was avoided up-front, despite being explicitly
+ // requested.
+ LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
+ "interleaving should be avoided up front\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingAvoided",
+ "Ignoring UserIC, because interleaving was avoided up front");
+ InterleaveLoop = false;
+ } else if (IC == 1 && UserIC <= 1) {
+ // Tell the user interleaving is not beneficial.
+ LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
+ IntDiagMsg = std::make_pair(
+ "InterleavingNotBeneficial",
+ "the cost-model indicates that interleaving is not beneficial");
+ InterleaveLoop = false;
+ if (UserIC == 1) {
+ IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
+ IntDiagMsg.second +=
+ " and is explicitly disabled or interleave count is set to 1";
+ }
+ } else if (IC > 1 && UserIC == 1) {
+    // Tell the user interleaving is beneficial, but it is explicitly disabled.
+ LLVM_DEBUG(
+ dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
+ IntDiagMsg = std::make_pair(
+ "InterleavingBeneficialButDisabled",
+ "the cost-model indicates that interleaving is beneficial "
+ "but is explicitly disabled or interleave count is set to 1");
+ InterleaveLoop = false;
+ }
+
+ // Override IC if user provided an interleave count.
+ IC = UserIC > 0 ? UserIC : IC;
+
+ // Emit diagnostic messages, if any.
+ const char *VAPassName = Hints.vectorizeAnalysisPassName();
+ if (!VectorizeLoop && !InterleaveLoop) {
+    // Do not vectorize or interleave the loop.
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ ORE->emit([&]() {
+ return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ return false;
+ } else if (!VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << VecDiagMsg.second;
+ });
+ } else if (VectorizeLoop && !InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ ORE->emit([&]() {
+ return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+ L->getStartLoc(), L->getHeader())
+ << IntDiagMsg.second;
+ });
+ } else if (VectorizeLoop && InterleaveLoop) {
+ LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
+ << ") in " << DebugLocStr << '\n');
+ LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
+ }
+
+ LVP.setBestPlan(VF.Width, IC);
+
+ using namespace ore;
+ bool DisableRuntimeUnroll = false;
+ MDNode *OrigLoopID = L->getLoopID();
+
+ if (!VectorizeLoop) {
+ assert(IC > 1 && "interleave count should not be 1 or 0");
+ // If we decided that it is not legal to vectorize the loop, then
+ // interleave it.
InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM,
BFI, PSI);
- LVP.executePlan(Unroller, DT);
-
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
- L->getHeader())
- << "interleaved loop (interleaved count: "
- << NV("InterleaveCount", IC) << ")";
- });
- } else {
- // If we decided that it is *legal* to vectorize the loop, then do it.
-
+ LVP.executePlan(Unroller, DT);
+
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
+ L->getHeader())
+ << "interleaved loop (interleaved count: "
+ << NV("InterleaveCount", IC) << ")";
+ });
+ } else {
+ // If we decided that it is *legal* to vectorize the loop, then do it.
+
// Consider vectorizing the epilogue too if it's profitable.
VectorizationFactor EpilogueVF =
CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
if (EpilogueVF.Width.isVector()) {
-
+
// The first pass vectorizes the main loop and creates a scalar epilogue
// to be vectorized by executing the plan (potentially with a different
// factor) again shortly afterwards.
@@ -9584,142 +9584,142 @@ bool LoopVectorizePass::processLoop(Loop *L) {
DisableRuntimeUnroll = true;
}
- // Report the vectorization decision.
- ORE->emit([&]() {
- return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
- L->getHeader())
- << "vectorized loop (vectorization width: "
- << NV("VectorizationFactor", VF.Width)
- << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
- });
- }
-
- Optional<MDNode *> RemainderLoopID =
- makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
- LLVMLoopVectorizeFollowupEpilogue});
- if (RemainderLoopID.hasValue()) {
- L->setLoopID(RemainderLoopID.getValue());
- } else {
- if (DisableRuntimeUnroll)
- AddRuntimeUnrollDisableMetaData(L);
-
- // Mark the loop as already vectorized to avoid vectorizing again.
- Hints.setAlreadyVectorized();
- }
-
- assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
- return true;
-}
-
-LoopVectorizeResult LoopVectorizePass::runImpl(
- Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
- DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
- std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
- OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
- SE = &SE_;
- LI = &LI_;
- TTI = &TTI_;
- DT = &DT_;
- BFI = &BFI_;
- TLI = TLI_;
- AA = &AA_;
- AC = &AC_;
- GetLAA = &GetLAA_;
- DB = &DB_;
- ORE = &ORE_;
- PSI = PSI_;
-
- // Don't attempt if
- // 1. the target claims to have no vector registers, and
- // 2. interleaving won't help ILP.
- //
- // The second condition is necessary because, even if the target has no
- // vector registers, loop vectorization may still enable scalar
- // interleaving.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
- TTI->getMaxInterleaveFactor(1) < 2)
- return LoopVectorizeResult(false, false);
-
- bool Changed = false, CFGChanged = false;
-
- // The vectorizer requires loops to be in simplified form.
- // Since simplification may add new inner loops, it has to run before the
- // legality and profitability checks. This means running the loop vectorizer
-  // will simplify all loops, regardless of whether anything ends up being
- // vectorized.
- for (auto &L : *LI)
- Changed |= CFGChanged |=
- simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
-
- // Build up a worklist of inner-loops to vectorize. This is necessary as
- // the act of vectorizing or partially unrolling a loop creates new loops
- // and can invalidate iterators across the loops.
- SmallVector<Loop *, 8> Worklist;
-
- for (Loop *L : *LI)
- collectSupportedLoops(*L, LI, ORE, Worklist);
-
- LoopsAnalyzed += Worklist.size();
-
- // Now walk the identified inner loops.
- while (!Worklist.empty()) {
- Loop *L = Worklist.pop_back_val();
-
- // For the inner loops we actually process, form LCSSA to simplify the
- // transform.
- Changed |= formLCSSARecursively(*L, *DT, LI, SE);
-
- Changed |= CFGChanged |= processLoop(L);
- }
-
- // Process each loop nest in the function.
- return LoopVectorizeResult(Changed, CFGChanged);
-}
-
-PreservedAnalyses LoopVectorizePass::run(Function &F,
- FunctionAnalysisManager &AM) {
- auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
- auto &LI = AM.getResult<LoopAnalysis>(F);
- auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
- auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
- auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- auto &AA = AM.getResult<AAManager>(F);
- auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
- auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- MemorySSA *MSSA = EnableMSSALoopDependency
- ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
- : nullptr;
-
- auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
- std::function<const LoopAccessInfo &(Loop &)> GetLAA =
- [&](Loop &L) -> const LoopAccessInfo & {
+ // Report the vectorization decision.
+ ORE->emit([&]() {
+ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(),
+ L->getHeader())
+ << "vectorized loop (vectorization width: "
+ << NV("VectorizationFactor", VF.Width)
+ << ", interleaved count: " << NV("InterleaveCount", IC) << ")";
+ });
+ }
+
+ Optional<MDNode *> RemainderLoopID =
+ makeFollowupLoopID(OrigLoopID, {LLVMLoopVectorizeFollowupAll,
+ LLVMLoopVectorizeFollowupEpilogue});
+ if (RemainderLoopID.hasValue()) {
+ L->setLoopID(RemainderLoopID.getValue());
+ } else {
+ if (DisableRuntimeUnroll)
+ AddRuntimeUnrollDisableMetaData(L);
+
+ // Mark the loop as already vectorized to avoid vectorizing again.
+ Hints.setAlreadyVectorized();
+ }
+
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
+ return true;
+}
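// A minimal illustration (not part of this patch, hypothetical helper name) of
// where the UserVF/UserIC values read above via Hints.getWidth() and
// Hints.getInterleave() typically come from; assuming the usual clang pragma
// lowering, this loop explicitly requests VF = 8 and IC = 2.
void scale(float *X, float A, int N) {
#pragma clang loop vectorize_width(8) interleave_count(2)
  for (int I = 0; I < N; ++I)
    X[I] *= A;
}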
+
+LoopVectorizeResult LoopVectorizePass::runImpl(
+ Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
+ DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
+ DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
+ std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
+ OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
+ SE = &SE_;
+ LI = &LI_;
+ TTI = &TTI_;
+ DT = &DT_;
+ BFI = &BFI_;
+ TLI = TLI_;
+ AA = &AA_;
+ AC = &AC_;
+ GetLAA = &GetLAA_;
+ DB = &DB_;
+ ORE = &ORE_;
+ PSI = PSI_;
+
+ // Don't attempt if
+ // 1. the target claims to have no vector registers, and
+ // 2. interleaving won't help ILP.
+ //
+ // The second condition is necessary because, even if the target has no
+ // vector registers, loop vectorization may still enable scalar
+ // interleaving.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
+ TTI->getMaxInterleaveFactor(1) < 2)
+ return LoopVectorizeResult(false, false);
+
+ bool Changed = false, CFGChanged = false;
+
+ // The vectorizer requires loops to be in simplified form.
+ // Since simplification may add new inner loops, it has to run before the
+ // legality and profitability checks. This means running the loop vectorizer
+  // will simplify all loops, regardless of whether anything ends up being
+ // vectorized.
+ for (auto &L : *LI)
+ Changed |= CFGChanged |=
+ simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
+
+ // Build up a worklist of inner-loops to vectorize. This is necessary as
+ // the act of vectorizing or partially unrolling a loop creates new loops
+ // and can invalidate iterators across the loops.
+ SmallVector<Loop *, 8> Worklist;
+
+ for (Loop *L : *LI)
+ collectSupportedLoops(*L, LI, ORE, Worklist);
+
+ LoopsAnalyzed += Worklist.size();
+
+ // Now walk the identified inner loops.
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+
+ // For the inner loops we actually process, form LCSSA to simplify the
+ // transform.
+ Changed |= formLCSSARecursively(*L, *DT, LI, SE);
+
+ Changed |= CFGChanged |= processLoop(L);
+ }
+
+ // Process each loop nest in the function.
+ return LoopVectorizeResult(Changed, CFGChanged);
+}
+
+PreservedAnalyses LoopVectorizePass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+ auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BFI = AM.getResult<BlockFrequencyAnalysis>(F);
+ auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &AC = AM.getResult<AssumptionAnalysis>(F);
+ auto &DB = AM.getResult<DemandedBitsAnalysis>(F);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ MemorySSA *MSSA = EnableMSSALoopDependency
+ ? &AM.getResult<MemorySSAAnalysis>(F).getMSSA()
+ : nullptr;
+
+ auto &LAM = AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+ std::function<const LoopAccessInfo &(Loop &)> GetLAA =
+ [&](Loop &L) -> const LoopAccessInfo & {
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE,
TLI, TTI, nullptr, MSSA};
- return LAM.getResult<LoopAccessAnalysis>(L, AR);
- };
- auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- ProfileSummaryInfo *PSI =
- MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- LoopVectorizeResult Result =
- runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
- if (!Result.MadeAnyChange)
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
-
- // We currently do not preserve loopinfo/dominator analyses with outer loop
- // vectorization. Until this is addressed, mark these analyses as preserved
- // only for non-VPlan-native path.
- // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
- if (!EnableVPlanNativePath) {
- PA.preserve<LoopAnalysis>();
- PA.preserve<DominatorTreeAnalysis>();
- }
- PA.preserve<BasicAA>();
- PA.preserve<GlobalsAA>();
- if (!Result.MadeCFGChange)
- PA.preserveSet<CFGAnalyses>();
- return PA;
-}
+ return LAM.getResult<LoopAccessAnalysis>(L, AR);
+ };
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ ProfileSummaryInfo *PSI =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ LoopVectorizeResult Result =
+ runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
+ if (!Result.MadeAnyChange)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+
+ // We currently do not preserve loopinfo/dominator analyses with outer loop
+ // vectorization. Until this is addressed, mark these analyses as preserved
+ // only for non-VPlan-native path.
+ // TODO: Preserve Loop and Dominator analyses for VPlan-native path.
+ if (!EnableVPlanNativePath) {
+ PA.preserve<LoopAnalysis>();
+ PA.preserve<DominatorTreeAnalysis>();
+ }
+ PA.preserve<BasicAA>();
+ PA.preserve<GlobalsAA>();
+ if (!Result.MadeCFGChange)
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
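// A minimal sketch (not part of this patch; addLoopVectorizer is a
// hypothetical helper) of how this new-PM entry point is typically reached:
// roughly what 'opt -passes=loop-vectorize' sets up for each function,
// assuming the standard LLVM 12 headers named below.
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"

void addLoopVectorizer(llvm::FunctionPassManager &FPM) {
  FPM.addPass(llvm::LoopVectorizePass(llvm::LoopVectorizeOptions()));
}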
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 94741c5c33..0b63019791 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1,243 +1,243 @@
-//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
-// stores that can be put together into vector-stores. Next, it attempts to
-// construct a vectorizable tree using the use-def chains. If a profitable tree
-// was found, the SLP vectorizer performs vectorization on the tree.
-//
-// The pass is inspired by the work described in the paper:
-// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
+//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the Bottom Up SLP vectorizer. It detects consecutive
+// stores that can be put together into vector-stores. Next, it attempts to
+// construct a vectorizable tree using the use-def chains. If a profitable tree
+// was found, the SLP vectorizer performs vectorization on the tree.
+//
+// The pass is inspired by the work described in the paper:
+// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks.
+//
+//===----------------------------------------------------------------------===//
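// A minimal illustration (not part of this patch, hypothetical helper name) of
// the consecutive-store pattern described above: subject to the cost model
// (see -slp-threshold below), SLP can typically rewrite these four scalar
// stores as a single <4 x float> store.
void pack4(float *Out, float A, float B, float C, float D) {
  Out[0] = A;
  Out[1] = B;
  Out[2] = C;
  Out[3] = D;
}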
+
+#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/iterator.h"
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/DemandedBits.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Constant.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugLoc.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/NoFolder.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/IR/ValueHandle.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DOTGraphTraits.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
#include "llvm/Support/InstructionCost.h"
-#include "llvm/Support/KnownBits.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/InjectTLIMappings.h"
-#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Vectorize.h"
-#include <algorithm>
-#include <cassert>
-#include <cstdint>
-#include <iterator>
-#include <memory>
-#include <set>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-using namespace slpvectorizer;
-
-#define SV_NAME "slp-vectorizer"
-#define DEBUG_TYPE "SLP"
-
-STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-
-cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
-
-static cl::opt<int>
- SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
- cl::desc("Only vectorize if you gain more than this "
- "number "));
-
-static cl::opt<bool>
-ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
- cl::desc("Attempt to vectorize horizontal reductions"));
-
-static cl::opt<bool> ShouldStartVectorizeHorAtStore(
- "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
- cl::desc(
- "Attempt to vectorize horizontal reductions feeding into a store"));
-
-static cl::opt<int>
-MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
- cl::desc("Attempt to vectorize for this register size in bits"));
-
+#include "llvm/Support/KnownBits.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <set>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+using namespace slpvectorizer;
+
+#define SV_NAME "slp-vectorizer"
+#define DEBUG_TYPE "SLP"
+
+STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
+
+cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
+
+static cl::opt<int>
+ SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
+ cl::desc("Only vectorize if you gain more than this "
+ "number "));
+
+static cl::opt<bool>
+ShouldVectorizeHor("slp-vectorize-hor", cl::init(true), cl::Hidden,
+ cl::desc("Attempt to vectorize horizontal reductions"));
+
+static cl::opt<bool> ShouldStartVectorizeHorAtStore(
+ "slp-vectorize-hor-store", cl::init(false), cl::Hidden,
+ cl::desc(
+ "Attempt to vectorize horizontal reductions feeding into a store"));
+
+static cl::opt<int>
+MaxVectorRegSizeOption("slp-max-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
static cl::opt<unsigned>
MaxVFOption("slp-max-vf", cl::init(0), cl::Hidden,
cl::desc("Maximum SLP vectorization factor (0=unlimited)"));
-static cl::opt<int>
-MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
- cl::desc("Maximum depth of the lookup for consecutive stores."));
-
-/// Limits the size of scheduling regions in a block.
-/// It avoids long compile times for _very_ large blocks where vector
-/// instructions are spread over a wide range.
-/// This limit is way higher than needed by real-world functions.
-static cl::opt<int>
-ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
- cl::desc("Limit the size of the SLP scheduling region per block"));
-
-static cl::opt<int> MinVectorRegSizeOption(
- "slp-min-reg-size", cl::init(128), cl::Hidden,
- cl::desc("Attempt to vectorize for this register size in bits"));
-
-static cl::opt<unsigned> RecursionMaxDepth(
- "slp-recursion-max-depth", cl::init(12), cl::Hidden,
- cl::desc("Limit the recursion depth when building a vectorizable tree"));
-
-static cl::opt<unsigned> MinTreeSize(
- "slp-min-tree-size", cl::init(3), cl::Hidden,
- cl::desc("Only vectorize small trees if they are fully vectorizable"));
-
-// The maximum depth that the look-ahead score heuristic will explore.
-// The higher this value, the higher the compilation time overhead.
-static cl::opt<int> LookAheadMaxDepth(
- "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
- cl::desc("The maximum look-ahead depth for operand reordering scores"));
-
-// The Look-ahead heuristic goes through the users of the bundle to calculate
-// the users' cost in getExternalUsesCost(). To avoid a compilation-time
-// increase, we limit the number of users visited to this value.
-static cl::opt<unsigned> LookAheadUsersBudget(
- "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
- cl::desc("The maximum number of users to visit while visiting the "
- "predecessors. This prevents compilation time increase."));
-
-static cl::opt<bool>
- ViewSLPTree("view-slp-tree", cl::Hidden,
- cl::desc("Display the SLP trees with Graphviz"));
-
-// Limit the number of alias checks. The limit is chosen so that
-// it has no negative effect on the llvm benchmarks.
-static const unsigned AliasedCheckLimit = 10;
-
-// Another limit for the alias checks: The maximum distance between load/store
-// instructions where alias checks are done.
-// This limit is useful for very large basic blocks.
-static const unsigned MaxMemDepDistance = 160;
-
-/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
-/// regions to be handled.
-static const int MinScheduleRegionSize = 16;
-
-/// Predicate for the element types that the SLP vectorizer supports.
-///
-/// The most important thing to filter here are types which are invalid in LLVM
-/// vectors. We also filter target specific types which have absolutely no
-/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
-/// avoids spending time checking the cost model and realizing that they will
-/// be inevitably scalarized.
-static bool isValidElementType(Type *Ty) {
- return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
- !Ty->isPPC_FP128Ty();
-}
-
-/// \returns true if all of the instructions in \p VL are in the same block or
-/// false otherwise.
-static bool allSameBlock(ArrayRef<Value *> VL) {
- Instruction *I0 = dyn_cast<Instruction>(VL[0]);
- if (!I0)
- return false;
- BasicBlock *BB = I0->getParent();
+static cl::opt<int>
+MaxStoreLookup("slp-max-store-lookup", cl::init(32), cl::Hidden,
+ cl::desc("Maximum depth of the lookup for consecutive stores."));
+
+/// Limits the size of scheduling regions in a block.
+/// It avoids long compile times for _very_ large blocks where vector
+/// instructions are spread over a wide range.
+/// This limit is way higher than needed by real-world functions.
+static cl::opt<int>
+ScheduleRegionSizeBudget("slp-schedule-budget", cl::init(100000), cl::Hidden,
+ cl::desc("Limit the size of the SLP scheduling region per block"));
+
+static cl::opt<int> MinVectorRegSizeOption(
+ "slp-min-reg-size", cl::init(128), cl::Hidden,
+ cl::desc("Attempt to vectorize for this register size in bits"));
+
+static cl::opt<unsigned> RecursionMaxDepth(
+ "slp-recursion-max-depth", cl::init(12), cl::Hidden,
+ cl::desc("Limit the recursion depth when building a vectorizable tree"));
+
+static cl::opt<unsigned> MinTreeSize(
+ "slp-min-tree-size", cl::init(3), cl::Hidden,
+ cl::desc("Only vectorize small trees if they are fully vectorizable"));
+
+// The maximum depth that the look-ahead score heuristic will explore.
+// The higher this value, the higher the compilation time overhead.
+static cl::opt<int> LookAheadMaxDepth(
+ "slp-max-look-ahead-depth", cl::init(2), cl::Hidden,
+ cl::desc("The maximum look-ahead depth for operand reordering scores"));
+
+// The Look-ahead heuristic goes through the users of the bundle to calculate
+// the users' cost in getExternalUsesCost(). To avoid a compilation-time
+// increase, we limit the number of users visited to this value.
+static cl::opt<unsigned> LookAheadUsersBudget(
+ "slp-look-ahead-users-budget", cl::init(2), cl::Hidden,
+ cl::desc("The maximum number of users to visit while visiting the "
+ "predecessors. This prevents compilation time increase."));
+
+static cl::opt<bool>
+ ViewSLPTree("view-slp-tree", cl::Hidden,
+ cl::desc("Display the SLP trees with Graphviz"));
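// A minimal illustration (not part of this patch, hypothetical helper name) of
// the horizontal-reduction shape that the slp-vectorize-hor option above
// enables by default: the chain of scalar adds can typically become a vector
// load followed by a reduction.
float hadd4(const float *X) {
  return X[0] + X[1] + X[2] + X[3];
}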
+
+// Limit the number of alias checks. The limit is chosen so that
+// it has no negative effect on the llvm benchmarks.
+static const unsigned AliasedCheckLimit = 10;
+
+// Another limit for the alias checks: The maximum distance between load/store
+// instructions where alias checks are done.
+// This limit is useful for very large basic blocks.
+static const unsigned MaxMemDepDistance = 160;
+
+/// If the ScheduleRegionSizeBudget is exhausted, we allow small scheduling
+/// regions to be handled.
+static const int MinScheduleRegionSize = 16;
+
+/// Predicate for the element types that the SLP vectorizer supports.
+///
+/// The most important thing to filter here are types which are invalid in LLVM
+/// vectors. We also filter target specific types which have absolutely no
+/// meaningful vectorization path such as x86_fp80 and ppc_f128. This just
+/// avoids spending time checking the cost model and realizing that they will
+/// be inevitably scalarized.
+static bool isValidElementType(Type *Ty) {
+ return VectorType::isValidElementType(Ty) && !Ty->isX86_FP80Ty() &&
+ !Ty->isPPC_FP128Ty();
+}
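// A minimal illustration (not part of this patch, hypothetical helper name):
// on typical x86-64 targets 'long double' lowers to x86_fp80, so
// isValidElementType() rejects it and the add below stays scalar without ever
// querying the cost model.
long double sum2(const long double *P) { return P[0] + P[1]; }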
+
+/// \returns true if all of the instructions in \p VL are in the same block or
+/// false otherwise.
+static bool allSameBlock(ArrayRef<Value *> VL) {
+ Instruction *I0 = dyn_cast<Instruction>(VL[0]);
+ if (!I0)
+ return false;
+ BasicBlock *BB = I0->getParent();
for (int I = 1, E = VL.size(); I < E; I++) {
auto *II = dyn_cast<Instruction>(VL[I]);
if (!II)
- return false;
-
+ return false;
+
if (BB != II->getParent())
- return false;
- }
- return true;
-}
-
-/// \returns True if all of the values in \p VL are constants (but not
-/// globals/constant expressions).
-static bool allConstant(ArrayRef<Value *> VL) {
- // Constant expressions and globals can't be vectorized like normal integer/FP
- // constants.
- for (Value *i : VL)
- if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
- return false;
- return true;
-}
-
-/// \returns True if all of the values in \p VL are identical.
-static bool isSplat(ArrayRef<Value *> VL) {
- for (unsigned i = 1, e = VL.size(); i < e; ++i)
- if (VL[i] != VL[0])
- return false;
- return true;
-}
-
+ return false;
+ }
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are constants (but not
+/// globals/constant expressions).
+static bool allConstant(ArrayRef<Value *> VL) {
+ // Constant expressions and globals can't be vectorized like normal integer/FP
+ // constants.
+ for (Value *i : VL)
+ if (!isa<Constant>(i) || isa<ConstantExpr>(i) || isa<GlobalValue>(i))
+ return false;
+ return true;
+}
+
+/// \returns True if all of the values in \p VL are identical.
+static bool isSplat(ArrayRef<Value *> VL) {
+ for (unsigned i = 1, e = VL.size(); i < e; ++i)
+ if (VL[i] != VL[0])
+ return false;
+ return true;
+}
+
/// \returns True if \p I is commutative, handles CmpInst and BinaryOperator.
-static bool isCommutative(Instruction *I) {
+static bool isCommutative(Instruction *I) {
if (auto *Cmp = dyn_cast<CmpInst>(I))
return Cmp->isCommutative();
if (auto *BO = dyn_cast<BinaryOperator>(I))
@@ -246,289 +246,289 @@ static bool isCommutative(Instruction *I) {
// we need to confirm that the caller code correctly handles Intrinsics
// for example (does not have 2 operands).
return false;
-}
-
-/// Checks if the vector of instructions can be represented as a shuffle, like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
-/// %x0x0 = mul i8 %x0, %x0
-/// %x3x3 = mul i8 %x3, %x3
-/// %y1y1 = mul i8 %y1, %y1
-/// %y2y2 = mul i8 %y2, %y2
+}
+
+/// Checks if the vector of instructions can be represented as a shuffle, like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %x0x0 = mul i8 %x0, %x0
+/// %x3x3 = mul i8 %x3, %x3
+/// %y1y1 = mul i8 %y1, %y1
+/// %y2y2 = mul i8 %y2, %y2
/// %ins1 = insertelement <4 x i8> poison, i8 %x0x0, i32 0
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
-/// ret <4 x i8> %ins4
-/// can be transformed into:
-/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
-/// i32 6>
-/// %2 = mul <4 x i8> %1, %1
-/// ret <4 x i8> %2
-/// We convert this initially to something like:
-/// %x0 = extractelement <4 x i8> %x, i32 0
-/// %x3 = extractelement <4 x i8> %x, i32 3
-/// %y1 = extractelement <4 x i8> %y, i32 1
-/// %y2 = extractelement <4 x i8> %y, i32 2
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+/// ret <4 x i8> %ins4
+/// can be transformed into:
+/// %1 = shufflevector <4 x i8> %x, <4 x i8> %y, <4 x i32> <i32 0, i32 3, i32 5,
+/// i32 6>
+/// %2 = mul <4 x i8> %1, %1
+/// ret <4 x i8> %2
+/// We convert this initially to something like:
+/// %x0 = extractelement <4 x i8> %x, i32 0
+/// %x3 = extractelement <4 x i8> %x, i32 3
+/// %y1 = extractelement <4 x i8> %y, i32 1
+/// %y2 = extractelement <4 x i8> %y, i32 2
/// %1 = insertelement <4 x i8> poison, i8 %x0, i32 0
-/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
-/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
-/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
-/// %5 = mul <4 x i8> %4, %4
-/// %6 = extractelement <4 x i8> %5, i32 0
+/// %2 = insertelement <4 x i8> %1, i8 %x3, i32 1
+/// %3 = insertelement <4 x i8> %2, i8 %y1, i32 2
+/// %4 = insertelement <4 x i8> %3, i8 %y2, i32 3
+/// %5 = mul <4 x i8> %4, %4
+/// %6 = extractelement <4 x i8> %5, i32 0
/// %ins1 = insertelement <4 x i8> poison, i8 %6, i32 0
-/// %7 = extractelement <4 x i8> %5, i32 1
-/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
-/// %8 = extractelement <4 x i8> %5, i32 2
-/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
-/// %9 = extractelement <4 x i8> %5, i32 3
-/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
-/// ret <4 x i8> %ins4
-/// InstCombiner transforms this into a shuffle and vector mul
-/// TODO: Can we split off and reuse the shuffle mask detection from
-/// TargetTransformInfo::getInstructionThroughput?
-static Optional<TargetTransformInfo::ShuffleKind>
-isShuffle(ArrayRef<Value *> VL) {
- auto *EI0 = cast<ExtractElementInst>(VL[0]);
+/// %7 = extractelement <4 x i8> %5, i32 1
+/// %ins2 = insertelement <4 x i8> %ins1, i8 %7, i32 1
+/// %8 = extractelement <4 x i8> %5, i32 2
+/// %ins3 = insertelement <4 x i8> %ins2, i8 %8, i32 2
+/// %9 = extractelement <4 x i8> %5, i32 3
+/// %ins4 = insertelement <4 x i8> %ins3, i8 %9, i32 3
+/// ret <4 x i8> %ins4
+/// InstCombiner transforms this into a shuffle and vector mul
+/// TODO: Can we split off and reuse the shuffle mask detection from
+/// TargetTransformInfo::getInstructionThroughput?
+static Optional<TargetTransformInfo::ShuffleKind>
+isShuffle(ArrayRef<Value *> VL) {
+ auto *EI0 = cast<ExtractElementInst>(VL[0]);
unsigned Size =
cast<FixedVectorType>(EI0->getVectorOperandType())->getNumElements();
- Value *Vec1 = nullptr;
- Value *Vec2 = nullptr;
- enum ShuffleMode { Unknown, Select, Permute };
- ShuffleMode CommonShuffleMode = Unknown;
- for (unsigned I = 0, E = VL.size(); I < E; ++I) {
- auto *EI = cast<ExtractElementInst>(VL[I]);
- auto *Vec = EI->getVectorOperand();
- // All vector operands must have the same number of vector elements.
+ Value *Vec1 = nullptr;
+ Value *Vec2 = nullptr;
+ enum ShuffleMode { Unknown, Select, Permute };
+ ShuffleMode CommonShuffleMode = Unknown;
+ for (unsigned I = 0, E = VL.size(); I < E; ++I) {
+ auto *EI = cast<ExtractElementInst>(VL[I]);
+ auto *Vec = EI->getVectorOperand();
+ // All vector operands must have the same number of vector elements.
if (cast<FixedVectorType>(Vec->getType())->getNumElements() != Size)
- return None;
- auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
- if (!Idx)
- return None;
- // Undefined behavior if Idx is negative or >= Size.
- if (Idx->getValue().uge(Size))
- continue;
- unsigned IntIdx = Idx->getValue().getZExtValue();
+ return None;
+ auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
+ if (!Idx)
+ return None;
+ // Undefined behavior if Idx is negative or >= Size.
+ if (Idx->getValue().uge(Size))
+ continue;
+ unsigned IntIdx = Idx->getValue().getZExtValue();
// We can extractelement from undef or poison vector.
- if (isa<UndefValue>(Vec))
- continue;
- // For correct shuffling we have to have at most 2 different vector operands
- // in all extractelement instructions.
- if (!Vec1 || Vec1 == Vec)
- Vec1 = Vec;
- else if (!Vec2 || Vec2 == Vec)
- Vec2 = Vec;
- else
- return None;
- if (CommonShuffleMode == Permute)
- continue;
- // If the extract index is not the same as the operation number, it is a
- // permutation.
- if (IntIdx != I) {
- CommonShuffleMode = Permute;
- continue;
- }
- CommonShuffleMode = Select;
- }
- // If we're not crossing lanes in different vectors, consider it as blending.
- if (CommonShuffleMode == Select && Vec2)
- return TargetTransformInfo::SK_Select;
- // If Vec2 was never used, we have a permutation of a single vector, otherwise
- // we have permutation of 2 vectors.
- return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
- : TargetTransformInfo::SK_PermuteSingleSrc;
-}
-
-namespace {
-
-/// Main data required for vectorization of instructions.
-struct InstructionsState {
- /// The very first instruction in the list with the main opcode.
- Value *OpValue = nullptr;
-
- /// The main/alternate instruction.
- Instruction *MainOp = nullptr;
- Instruction *AltOp = nullptr;
-
- /// The main/alternate opcodes for the list of instructions.
- unsigned getOpcode() const {
- return MainOp ? MainOp->getOpcode() : 0;
- }
-
- unsigned getAltOpcode() const {
- return AltOp ? AltOp->getOpcode() : 0;
- }
-
- /// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
-
- bool isOpcodeOrAlt(Instruction *I) const {
- unsigned CheckedOpcode = I->getOpcode();
- return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
- }
-
- InstructionsState() = delete;
- InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
- : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
-};
-
-} // end anonymous namespace
-
-/// Chooses the correct key for scheduling data. If \p Op has the same (or
-/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
-/// OpValue.
-static Value *isOneOf(const InstructionsState &S, Value *Op) {
- auto *I = dyn_cast<Instruction>(Op);
- if (I && S.isOpcodeOrAlt(I))
- return Op;
- return S.OpValue;
-}
-
-/// \returns true if \p Opcode is allowed as part of the main/alternate
-/// instruction for SLP vectorization.
-///
-/// Example of unsupported opcode is SDIV that can potentially cause UB if the
-/// "shuffled out" lane would result in division by zero.
-static bool isValidForAlternation(unsigned Opcode) {
- if (Instruction::isIntDivRem(Opcode))
- return false;
-
- return true;
-}
-
-/// \returns an analysis of the Instructions in \p VL, described by an
-/// InstructionsState: the opcode with which we suppose the whole list
-/// could be vectorized, even if its structure is diverse.
-static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
- unsigned BaseIndex = 0) {
- // Make sure these are all Instructions.
- if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
- return InstructionsState(VL[BaseIndex], nullptr, nullptr);
-
- bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
- bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
- unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
- unsigned AltOpcode = Opcode;
- unsigned AltIndex = BaseIndex;
-
- // Check for one alternate opcode from another BinaryOperator.
- // TODO - generalize to support all operators (types, calls etc.).
- for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
- unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
- if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
- if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
- isValidForAlternation(Opcode)) {
- AltOpcode = InstOpcode;
- AltIndex = Cnt;
- continue;
- }
- } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
- Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
- Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
- if (Ty0 == Ty1) {
- if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- if (Opcode == AltOpcode) {
- assert(isValidForAlternation(Opcode) &&
- isValidForAlternation(InstOpcode) &&
- "Cast isn't safe for alternation, logic needs to be updated!");
- AltOpcode = InstOpcode;
- AltIndex = Cnt;
- continue;
- }
- }
- } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
- continue;
- return InstructionsState(VL[BaseIndex], nullptr, nullptr);
- }
-
- return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
- cast<Instruction>(VL[AltIndex]));
-}
-
-/// \returns true if all of the values in \p VL have the same type or false
-/// otherwise.
-static bool allSameType(ArrayRef<Value *> VL) {
- Type *Ty = VL[0]->getType();
- for (int i = 1, e = VL.size(); i < e; i++)
- if (VL[i]->getType() != Ty)
- return false;
-
- return true;
-}
-
-/// \returns True if Extract{Value,Element} instruction extracts element Idx.
-static Optional<unsigned> getExtractIndex(Instruction *E) {
- unsigned Opcode = E->getOpcode();
- assert((Opcode == Instruction::ExtractElement ||
- Opcode == Instruction::ExtractValue) &&
- "Expected extractelement or extractvalue instruction.");
- if (Opcode == Instruction::ExtractElement) {
- auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
- if (!CI)
- return None;
- return CI->getZExtValue();
- }
- ExtractValueInst *EI = cast<ExtractValueInst>(E);
- if (EI->getNumIndices() != 1)
- return None;
- return *EI->idx_begin();
-}
-
-/// \returns True if in-tree use also needs extract. This refers to
-/// possible scalar operand in vectorized instruction.
-static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
- TargetLibraryInfo *TLI) {
- unsigned Opcode = UserInst->getOpcode();
- switch (Opcode) {
- case Instruction::Load: {
- LoadInst *LI = cast<LoadInst>(UserInst);
- return (LI->getPointerOperand() == Scalar);
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(UserInst);
- return (SI->getPointerOperand() == Scalar);
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(UserInst);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
- if (hasVectorInstrinsicScalarOpd(ID, i))
- return (CI->getArgOperand(i) == Scalar);
- }
- LLVM_FALLTHROUGH;
- }
- default:
- return false;
- }
-}
-
-/// \returns the AA location that is being accessed by the instruction.
+ if (isa<UndefValue>(Vec))
+ continue;
+ // For correct shuffling we have to have at most 2 different vector operands
+ // in all extractelement instructions.
+ if (!Vec1 || Vec1 == Vec)
+ Vec1 = Vec;
+ else if (!Vec2 || Vec2 == Vec)
+ Vec2 = Vec;
+ else
+ return None;
+ if (CommonShuffleMode == Permute)
+ continue;
+ // If the extract index is not the same as the operation number, it is a
+ // permutation.
+ if (IntIdx != I) {
+ CommonShuffleMode = Permute;
+ continue;
+ }
+ CommonShuffleMode = Select;
+ }
+ // If we're not crossing lanes in different vectors, consider it as blending.
+ if (CommonShuffleMode == Select && Vec2)
+ return TargetTransformInfo::SK_Select;
+ // If Vec2 was never used, we have a permutation of a single vector, otherwise
+ // we have permutation of 2 vectors.
+ return Vec2 ? TargetTransformInfo::SK_PermuteTwoSrc
+ : TargetTransformInfo::SK_PermuteSingleSrc;
+}
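
For illustration only, here is a minimal standalone sketch of the kind of bundle this classifier accepts, written against the LLVM 12 C++ API. The wrapper function, the module name, and "foo" are hypothetical and not part of this file; isShuffle itself is file-static, so the call is shown purely to indicate the expected result.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical driver: recreate the extracts from the doc comment above
// (%x lanes 0 and 3, %y lanes 1 and 2) and classify them.
void classifyExample(LLVMContext &Ctx) {
  Module M("isshuffle-sketch", Ctx);
  auto *VecTy = FixedVectorType::get(Type::getInt8Ty(Ctx), 4);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), {VecTy, VecTy}, false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "foo", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *X = F->getArg(0), *Y = F->getArg(1);
  SmallVector<Value *, 4> VL = {
      B.CreateExtractElement(X, B.getInt32(0)),  // lane 0 <- %x[0]
      B.CreateExtractElement(X, B.getInt32(3)),  // lane 1 <- %x[3] (crossed)
      B.CreateExtractElement(Y, B.getInt32(1)),  // lane 2 <- %y[1] (crossed)
      B.CreateExtractElement(Y, B.getInt32(2))}; // lane 3 <- %y[2] (crossed)
  // Two source vectors and lanes that do not match their positions, so the
  // classification above would be TargetTransformInfo::SK_PermuteTwoSrc.
  Optional<TargetTransformInfo::ShuffleKind> Kind = isShuffle(VL);
  (void)Kind;
  B.CreateRetVoid();
}
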
+
+namespace {
+
+/// Main data required for vectorization of instructions.
+struct InstructionsState {
+ /// The very first instruction in the list with the main opcode.
+ Value *OpValue = nullptr;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const { return getOpcode() != getAltOpcode(); }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return getOpcode() == CheckedOpcode || getAltOpcode() == CheckedOpcode;
+ }
+
+ InstructionsState() = delete;
+ InstructionsState(Value *OpValue, Instruction *MainOp, Instruction *AltOp)
+ : OpValue(OpValue), MainOp(MainOp), AltOp(AltOp) {}
+};
+
+} // end anonymous namespace
+
+/// Chooses the correct key for scheduling data. If \p Op has the same (or
+/// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is \p
+/// OpValue.
+static Value *isOneOf(const InstructionsState &S, Value *Op) {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && S.isOpcodeOrAlt(I))
+ return Op;
+ return S.OpValue;
+}
+
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// Example of unsupported opcode is SDIV that can potentially cause UB if the
+/// "shuffled out" lane would result in division by zero.
+static bool isValidForAlternation(unsigned Opcode) {
+ if (Instruction::isIntDivRem(Opcode))
+ return false;
+
+ return true;
+}
+
+/// \returns an analysis of the Instructions in \p VL, described by an
+/// InstructionsState: the opcode with which we suppose the whole list
+/// could be vectorized, even if its structure is diverse.
+static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
+ unsigned BaseIndex = 0) {
+ // Make sure these are all Instructions.
+ if (llvm::any_of(VL, [](Value *V) { return !isa<Instruction>(V); }))
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+
+ bool IsCastOp = isa<CastInst>(VL[BaseIndex]);
+ bool IsBinOp = isa<BinaryOperator>(VL[BaseIndex]);
+ unsigned Opcode = cast<Instruction>(VL[BaseIndex])->getOpcode();
+ unsigned AltOpcode = Opcode;
+ unsigned AltIndex = BaseIndex;
+
+ // Check for one alternate opcode from another BinaryOperator.
+ // TODO - generalize to support all operators (types, calls etc.).
+ for (int Cnt = 0, E = VL.size(); Cnt < E; Cnt++) {
+ unsigned InstOpcode = cast<Instruction>(VL[Cnt])->getOpcode();
+ if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
+ isValidForAlternation(Opcode)) {
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ } else if (IsCastOp && isa<CastInst>(VL[Cnt])) {
+ Type *Ty0 = cast<Instruction>(VL[BaseIndex])->getOperand(0)->getType();
+ Type *Ty1 = cast<Instruction>(VL[Cnt])->getOperand(0)->getType();
+ if (Ty0 == Ty1) {
+ if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ if (Opcode == AltOpcode) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
+ AltOpcode = InstOpcode;
+ AltIndex = Cnt;
+ continue;
+ }
+ }
+ } else if (InstOpcode == Opcode || InstOpcode == AltOpcode)
+ continue;
+ return InstructionsState(VL[BaseIndex], nullptr, nullptr);
+ }
+
+ return InstructionsState(VL[BaseIndex], cast<Instruction>(VL[BaseIndex]),
+ cast<Instruction>(VL[AltIndex]));
+}
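
As a companion sketch (again illustrative only; "bar", the wrapper, and the module name are hypothetical, and getSameOpcode is file-static), an alternating add/sub bundle and the state one would expect it to yield:

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
using namespace llvm;

// Hypothetical driver: build the bundle { add, sub, add, sub } over two i32
// arguments and inspect the resulting InstructionsState.
void altOpcodeExample(LLVMContext &Ctx) {
  Module M("same-opcode-sketch", Ctx);
  auto *I32 = Type::getInt32Ty(Ctx);
  auto *FTy = FunctionType::get(Type::getVoidTy(Ctx), {I32, I32}, false);
  Function *F = Function::Create(FTy, GlobalValue::ExternalLinkage, "bar", M);
  IRBuilder<> B(BasicBlock::Create(Ctx, "entry", F));
  Value *A = F->getArg(0), *C = F->getArg(1);
  SmallVector<Value *, 4> VL = {B.CreateAdd(A, C), B.CreateSub(A, C),
                                B.CreateAdd(A, C), B.CreateSub(A, C)};
  InstructionsState S = getSameOpcode(VL);
  // Expected: S.getOpcode() == Instruction::Add, S.getAltOpcode() ==
  // Instruction::Sub, and S.isAltShuffle() is true, i.e. the bundle can be
  // vectorized as an add and a sub that are blended with a shufflevector.
  (void)S;
  B.CreateRetVoid();
}
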
+
+/// \returns true if all of the values in \p VL have the same type or false
+/// otherwise.
+static bool allSameType(ArrayRef<Value *> VL) {
+ Type *Ty = VL[0]->getType();
+ for (int i = 1, e = VL.size(); i < e; i++)
+ if (VL[i]->getType() != Ty)
+ return false;
+
+ return true;
+}
+
+/// \returns True if Extract{Value,Element} instruction extracts element Idx.
+static Optional<unsigned> getExtractIndex(Instruction *E) {
+ unsigned Opcode = E->getOpcode();
+ assert((Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::ExtractValue) &&
+ "Expected extractelement or extractvalue instruction.");
+ if (Opcode == Instruction::ExtractElement) {
+ auto *CI = dyn_cast<ConstantInt>(E->getOperand(1));
+ if (!CI)
+ return None;
+ return CI->getZExtValue();
+ }
+ ExtractValueInst *EI = cast<ExtractValueInst>(E);
+ if (EI->getNumIndices() != 1)
+ return None;
+ return *EI->idx_begin();
+}
+
+/// \returns True if in-tree use also needs extract. This refers to
+/// possible scalar operand in vectorized instruction.
+static bool InTreeUserNeedToExtract(Value *Scalar, Instruction *UserInst,
+ TargetLibraryInfo *TLI) {
+ unsigned Opcode = UserInst->getOpcode();
+ switch (Opcode) {
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(UserInst);
+ return (LI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(UserInst);
+ return (SI->getPointerOperand() == Scalar);
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(UserInst);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ if (hasVectorInstrinsicScalarOpd(ID, i))
+ return (CI->getArgOperand(i) == Scalar);
+ }
+ LLVM_FALLTHROUGH;
+ }
+ default:
+ return false;
+ }
+}
+
+/// \returns the AA location that is being accessed by the instruction.
static MemoryLocation getLocation(Instruction *I, AAResults *AA) {
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return MemoryLocation::get(SI);
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return MemoryLocation::get(LI);
- return MemoryLocation();
-}
-
-/// \returns True if the instruction is not a volatile or atomic load/store.
-static bool isSimple(Instruction *I) {
- if (LoadInst *LI = dyn_cast<LoadInst>(I))
- return LI->isSimple();
- if (StoreInst *SI = dyn_cast<StoreInst>(I))
- return SI->isSimple();
- if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
- return !MI->isVolatile();
- return true;
-}
-
-namespace llvm {
-
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return MemoryLocation::get(SI);
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return MemoryLocation::get(LI);
+ return MemoryLocation();
+}
+
+/// \returns True if the instruction is not a volatile or atomic load/store.
+static bool isSimple(Instruction *I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->isSimple();
+ if (StoreInst *SI = dyn_cast<StoreInst>(I))
+ return SI->isSimple();
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I))
+ return !MI->isVolatile();
+ return true;
+}
+
+namespace llvm {
+
static void inversePermutation(ArrayRef<unsigned> Indices,
SmallVectorImpl<int> &Mask) {
Mask.clear();
@@ -538,98 +538,98 @@ static void inversePermutation(ArrayRef<unsigned> Indices,
Mask[Indices[I]] = I;
}
-namespace slpvectorizer {
-
-/// Bottom Up SLP Vectorizer.
-class BoUpSLP {
- struct TreeEntry;
- struct ScheduleData;
-
-public:
- using ValueList = SmallVector<Value *, 8>;
- using InstrList = SmallVector<Instruction *, 16>;
- using ValueSet = SmallPtrSet<Value *, 16>;
- using StoreList = SmallVector<StoreInst *, 8>;
- using ExtraValueToDebugLocsMap =
- MapVector<Value *, SmallVector<Instruction *, 2>>;
+namespace slpvectorizer {
+
+/// Bottom Up SLP Vectorizer.
+class BoUpSLP {
+ struct TreeEntry;
+ struct ScheduleData;
+
+public:
+ using ValueList = SmallVector<Value *, 8>;
+ using InstrList = SmallVector<Instruction *, 16>;
+ using ValueSet = SmallPtrSet<Value *, 16>;
+ using StoreList = SmallVector<StoreInst *, 8>;
+ using ExtraValueToDebugLocsMap =
+ MapVector<Value *, SmallVector<Instruction *, 2>>;
using OrdersType = SmallVector<unsigned, 4>;
-
- BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
+
+ BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
TargetLibraryInfo *TLi, AAResults *Aa, LoopInfo *Li,
- DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
- const DataLayout *DL, OptimizationRemarkEmitter *ORE)
- : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
- DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
- CodeMetrics::collectEphemeralValues(F, AC, EphValues);
- // Use the vector register size specified by the target unless overridden
- // by a command-line option.
- // TODO: It would be better to limit the vectorization factor based on
- // data type rather than just register size. For example, x86 AVX has
- // 256-bit registers, but it does not support integer operations
- // at that width (that requires AVX2).
- if (MaxVectorRegSizeOption.getNumOccurrences())
- MaxVecRegSize = MaxVectorRegSizeOption;
- else
- MaxVecRegSize = TTI->getRegisterBitWidth(true);
-
- if (MinVectorRegSizeOption.getNumOccurrences())
- MinVecRegSize = MinVectorRegSizeOption;
- else
- MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
- }
-
- /// Vectorize the tree that starts with the elements in \p VL.
- /// Returns the vectorized root.
- Value *vectorizeTree();
-
- /// Vectorize the tree but with the list of externally used values \p
-  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
- /// generated extractvalue instructions.
- Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
-
- /// \returns the cost incurred by unwanted spills and fills, caused by
- /// holding live values over call sites.
+ DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB,
+ const DataLayout *DL, OptimizationRemarkEmitter *ORE)
+ : F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC),
+ DB(DB), DL(DL), ORE(ORE), Builder(Se->getContext()) {
+ CodeMetrics::collectEphemeralValues(F, AC, EphValues);
+ // Use the vector register size specified by the target unless overridden
+ // by a command-line option.
+ // TODO: It would be better to limit the vectorization factor based on
+ // data type rather than just register size. For example, x86 AVX has
+ // 256-bit registers, but it does not support integer operations
+ // at that width (that requires AVX2).
+ if (MaxVectorRegSizeOption.getNumOccurrences())
+ MaxVecRegSize = MaxVectorRegSizeOption;
+ else
+ MaxVecRegSize = TTI->getRegisterBitWidth(true);
+
+ if (MinVectorRegSizeOption.getNumOccurrences())
+ MinVecRegSize = MinVectorRegSizeOption;
+ else
+ MinVecRegSize = TTI->getMinVectorRegisterBitWidth();
+ }
+
+ /// Vectorize the tree that starts with the elements in \p VL.
+ /// Returns the vectorized root.
+ Value *vectorizeTree();
+
+ /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+ /// generated extractvalue instructions.
+ Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
+
+ /// \returns the cost incurred by unwanted spills and fills, caused by
+ /// holding live values over call sites.
InstructionCost getSpillCost() const;
-
- /// \returns the vectorization cost of the subtree that starts at \p VL.
- /// A negative number means that this is profitable.
+
+ /// \returns the vectorization cost of the subtree that starts at \p VL.
+ /// A negative number means that this is profitable.
InstructionCost getTreeCost();
-
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
- void buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst = None);
-
- /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
- /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
-  /// into account (and updating it, if required) the list of externally used
- /// values stored in \p ExternallyUsedValues.
- void buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst = None);
-
- /// Clear the internal data structures that are created by 'buildTree'.
- void deleteTree() {
- VectorizableTree.clear();
- ScalarToTreeEntry.clear();
- MustGather.clear();
- ExternalUses.clear();
- NumOpsWantToKeepOrder.clear();
- NumOpsWantToKeepOriginalOrder = 0;
- for (auto &Iter : BlocksSchedules) {
- BlockScheduling *BS = Iter.second.get();
- BS->clear();
- }
- MinBWs.clear();
- }
-
- unsigned getTreeSize() const { return VectorizableTree.size(); }
-
- /// Perform LICM and CSE on the newly generated gather sequences.
- void optimizeGatherSequence();
-
- /// \returns The best order of instructions for vectorization.
- Optional<ArrayRef<unsigned>> bestOrder() const {
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
+ void buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+ /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+  /// into account (and updating it, if required) the list of externally used
+ /// values stored in \p ExternallyUsedValues.
+ void buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst = None);
+
+ /// Clear the internal data structures that are created by 'buildTree'.
+ void deleteTree() {
+ VectorizableTree.clear();
+ ScalarToTreeEntry.clear();
+ MustGather.clear();
+ ExternalUses.clear();
+ NumOpsWantToKeepOrder.clear();
+ NumOpsWantToKeepOriginalOrder = 0;
+ for (auto &Iter : BlocksSchedules) {
+ BlockScheduling *BS = Iter.second.get();
+ BS->clear();
+ }
+ MinBWs.clear();
+ }
+
+ unsigned getTreeSize() const { return VectorizableTree.size(); }
+
+ /// Perform LICM and CSE on the newly generated gather sequences.
+ void optimizeGatherSequence();
+
+ /// \returns The best order of instructions for vectorization.
+ Optional<ArrayRef<unsigned>> bestOrder() const {
assert(llvm::all_of(
NumOpsWantToKeepOrder,
[this](const decltype(NumOpsWantToKeepOrder)::value_type &D) {
@@ -638,19 +638,19 @@ public:
}) &&
"All orders must have the same size as number of instructions in "
"tree node.");
- auto I = std::max_element(
- NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
- [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
- const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
- return D1.second < D2.second;
- });
- if (I == NumOpsWantToKeepOrder.end() ||
- I->getSecond() <= NumOpsWantToKeepOriginalOrder)
- return None;
-
- return makeArrayRef(I->getFirst());
- }
-
+ auto I = std::max_element(
+ NumOpsWantToKeepOrder.begin(), NumOpsWantToKeepOrder.end(),
+ [](const decltype(NumOpsWantToKeepOrder)::value_type &D1,
+ const decltype(NumOpsWantToKeepOrder)::value_type &D2) {
+ return D1.second < D2.second;
+ });
+ if (I == NumOpsWantToKeepOrder.end() ||
+ I->getSecond() <= NumOpsWantToKeepOriginalOrder)
+ return None;
+
+ return makeArrayRef(I->getFirst());
+ }
+
/// Builds the correct order for root instructions.
/// If some leaves have the same instructions to be vectorized, we may
/// incorrectly evaluate the best order for the root node (it is built for the
@@ -726,267 +726,267 @@ public:
"All indices must be initialized");
}
- /// \return The vector element size in bits to use when vectorizing the
- /// expression tree ending at \p V. If V is a store, the size is the width of
- /// the stored value. Otherwise, the size is the width of the largest loaded
- /// value reaching V. This method is used by the vectorizer to calculate
- /// vectorization factors.
- unsigned getVectorElementSize(Value *V);
-
- /// Compute the minimum type sizes required to represent the entries in a
- /// vectorizable tree.
- void computeMinimumValueSizes();
-
- // \returns maximum vector register size as set by TTI or overridden by cl::opt.
- unsigned getMaxVecRegSize() const {
- return MaxVecRegSize;
- }
-
- // \returns minimum vector register size as set by cl::opt.
- unsigned getMinVecRegSize() const {
- return MinVecRegSize;
- }
-
+ /// \return The vector element size in bits to use when vectorizing the
+ /// expression tree ending at \p V. If V is a store, the size is the width of
+ /// the stored value. Otherwise, the size is the width of the largest loaded
+ /// value reaching V. This method is used by the vectorizer to calculate
+ /// vectorization factors.
+ unsigned getVectorElementSize(Value *V);
+
+ /// Compute the minimum type sizes required to represent the entries in a
+ /// vectorizable tree.
+ void computeMinimumValueSizes();
+
+ // \returns maximum vector register size as set by TTI or overridden by cl::opt.
+ unsigned getMaxVecRegSize() const {
+ return MaxVecRegSize;
+ }
+
+ // \returns minimum vector register size as set by cl::opt.
+ unsigned getMinVecRegSize() const {
+ return MinVecRegSize;
+ }
+
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
unsigned MaxVF = MaxVFOption.getNumOccurrences() ?
MaxVFOption : TTI->getMaximumVF(ElemWidth, Opcode);
return MaxVF ? MaxVF : UINT_MAX;
}
- /// Check if homogeneous aggregate is isomorphic to some VectorType.
- /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
- /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
- /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
- ///
- /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
- unsigned canMapToVector(Type *T, const DataLayout &DL) const;
-
- /// \returns True if the VectorizableTree is both tiny and not fully
- /// vectorizable. We do not vectorize such trees.
- bool isTreeTinyAndNotFullyVectorizable() const;
-
- /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
+ /// Check if homogeneous aggregate is isomorphic to some VectorType.
+ /// Accepts homogeneous multidimensional aggregate of scalars/vectors like
+ /// {[4 x i16], [4 x i16]}, { <2 x float>, <2 x float> },
+ /// {{{i16, i16}, {i16, i16}}, {{i16, i16}, {i16, i16}}} and so on.
+ ///
+ /// \returns number of elements in vector if isomorphism exists, 0 otherwise.
+ unsigned canMapToVector(Type *T, const DataLayout &DL) const;
+
+ /// \returns True if the VectorizableTree is both tiny and not fully
+ /// vectorizable. We do not vectorize such trees.
+ bool isTreeTinyAndNotFullyVectorizable() const;
+
+ /// Assume that a legal-sized 'or'-reduction of shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
bool isLoadCombineReductionCandidate(RecurKind RdxKind) const;
-
- /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
- /// can be load combined in the backend. Load combining may not be allowed in
- /// the IR optimizer, so we do not want to alter the pattern. For example,
- /// partially transforming a scalar bswap() pattern into vector code is
- /// effectively impossible for the backend to undo.
- /// TODO: If load combining is allowed in the IR optimizer, this analysis
- /// may not be necessary.
- bool isLoadCombineCandidate() const;
-
- OptimizationRemarkEmitter *getORE() { return ORE; }
-
- /// This structure holds any data we need about the edges being traversed
- /// during buildTree_rec(). We keep track of:
- /// (i) the user TreeEntry index, and
- /// (ii) the index of the edge.
- struct EdgeInfo {
- EdgeInfo() = default;
- EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
- : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
- /// The user TreeEntry.
- TreeEntry *UserTE = nullptr;
- /// The operand index of the use.
- unsigned EdgeIdx = UINT_MAX;
-#ifndef NDEBUG
- friend inline raw_ostream &operator<<(raw_ostream &OS,
- const BoUpSLP::EdgeInfo &EI) {
- EI.dump(OS);
- return OS;
- }
- /// Debug print.
- void dump(raw_ostream &OS) const {
- OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
- << " EdgeIdx:" << EdgeIdx << "}";
- }
- LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
-#endif
- };
-
- /// A helper data structure to hold the operands of a vector of instructions.
- /// This supports a fixed vector length for all operand vectors.
- class VLOperands {
- /// For each operand we need (i) the value, and (ii) the opcode that it
- /// would be attached to if the expression was in a left-linearized form.
- /// This is required to avoid illegal operand reordering.
- /// For example:
- /// \verbatim
- /// 0 Op1
- /// |/
- /// Op1 Op2 Linearized + Op2
- /// \ / ----------> |/
- /// - -
- ///
- /// Op1 - Op2 (0 + Op1) - Op2
- /// \endverbatim
- ///
- /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
- ///
- /// Another way to think of this is to track all the operations across the
- /// path from the operand all the way to the root of the tree and to
- /// calculate the operation that corresponds to this path. For example, the
- /// path from Op2 to the root crosses the RHS of the '-', therefore the
- /// corresponding operation is a '-' (which matches the one in the
- /// linearized tree, as shown above).
- ///
- /// For lack of a better term, we refer to this operation as Accumulated
- /// Path Operation (APO).
- struct OperandData {
- OperandData() = default;
- OperandData(Value *V, bool APO, bool IsUsed)
- : V(V), APO(APO), IsUsed(IsUsed) {}
- /// The operand value.
- Value *V = nullptr;
- /// TreeEntries only allow a single opcode, or an alternate sequence of
-      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
- /// APO. It is set to 'true' if 'V' is attached to an inverse operation
- /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
- /// (e.g., Add/Mul)
- bool APO = false;
- /// Helper data for the reordering function.
- bool IsUsed = false;
- };
-
- /// During operand reordering, we are trying to select the operand at lane
- /// that matches best with the operand at the neighboring lane. Our
- /// selection is based on the type of value we are looking for. For example,
- /// if the neighboring lane has a load, we need to look for a load that is
- /// accessing a consecutive address. These strategies are summarized in the
- /// 'ReorderingMode' enumerator.
- enum class ReorderingMode {
- Load, ///< Matching loads to consecutive memory addresses
- Opcode, ///< Matching instructions based on opcode (same or alternate)
- Constant, ///< Matching constants
- Splat, ///< Matching the same instruction multiple times (broadcast)
- Failed, ///< We failed to create a vectorizable group
- };
-
- using OperandDataVec = SmallVector<OperandData, 2>;
-
- /// A vector of operand vectors.
- SmallVector<OperandDataVec, 4> OpsVec;
-
- const DataLayout &DL;
- ScalarEvolution &SE;
- const BoUpSLP &R;
-
- /// \returns the operand data at \p OpIdx and \p Lane.
- OperandData &getData(unsigned OpIdx, unsigned Lane) {
- return OpsVec[OpIdx][Lane];
- }
-
- /// \returns the operand data at \p OpIdx and \p Lane. Const version.
- const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
- return OpsVec[OpIdx][Lane];
- }
-
- /// Clears the used flag for all entries.
- void clearUsed() {
- for (unsigned OpIdx = 0, NumOperands = getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane)
- OpsVec[OpIdx][Lane].IsUsed = false;
- }
-
- /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
- void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
- std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
- }
-
- // The hard-coded scores listed here are not very important. When computing
- // the scores of matching one sub-tree with another, we are basically
- // counting the number of values that are matching. So even if all scores
- // are set to 1, we would still get a decent matching result.
- // However, sometimes we have to break ties. For example we may have to
- // choose between matching loads vs matching opcodes. This is what these
- // scores are helping us with: they provide the order of preference.
-
- /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
- static const int ScoreConsecutiveLoads = 3;
- /// ExtractElementInst from same vector and consecutive indexes.
- static const int ScoreConsecutiveExtracts = 3;
- /// Constants.
- static const int ScoreConstants = 2;
- /// Instructions with the same opcode.
- static const int ScoreSameOpcode = 2;
-    /// Instructions with alt opcodes (e.g., add + sub).
- static const int ScoreAltOpcodes = 1;
- /// Identical instructions (a.k.a. splat or broadcast).
- static const int ScoreSplat = 1;
- /// Matching with an undef is preferable to failing.
- static const int ScoreUndef = 1;
- /// Score for failing to find a decent match.
- static const int ScoreFail = 0;
-    /// User external to the vectorized code.
- static const int ExternalUseCost = 1;
- /// The user is internal but in a different lane.
- static const int UserInDiffLaneCost = ExternalUseCost;
-
- /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
- static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
- ScalarEvolution &SE) {
- auto *LI1 = dyn_cast<LoadInst>(V1);
- auto *LI2 = dyn_cast<LoadInst>(V2);
- if (LI1 && LI2)
- return isConsecutiveAccess(LI1, LI2, DL, SE)
- ? VLOperands::ScoreConsecutiveLoads
- : VLOperands::ScoreFail;
-
- auto *C1 = dyn_cast<Constant>(V1);
- auto *C2 = dyn_cast<Constant>(V2);
- if (C1 && C2)
- return VLOperands::ScoreConstants;
-
-      // Extracts from consecutive indexes of the same vector score better, as
-      // the extracts could be optimized away.
- Value *EV;
- ConstantInt *Ex1Idx, *Ex2Idx;
- if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
- match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
- Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
- return VLOperands::ScoreConsecutiveExtracts;
-
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
- if (I1 && I2) {
- if (I1 == I2)
- return VLOperands::ScoreSplat;
- InstructionsState S = getSameOpcode({I1, I2});
- // Note: Only consider instructions with <= 2 operands to avoid
- // complexity explosion.
- if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
- return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
- : VLOperands::ScoreSameOpcode;
- }
-
- if (isa<UndefValue>(V2))
- return VLOperands::ScoreUndef;
-
- return VLOperands::ScoreFail;
- }
-
- /// Holds the values and their lane that are taking part in the look-ahead
- /// score calculation. This is used in the external uses cost calculation.
- SmallDenseMap<Value *, int> InLookAheadValues;
-
-    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
- /// either external to the vectorized code, or require shuffling.
- int getExternalUsesCost(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- int Cost = 0;
- std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
- for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
- Value *V = Values[Idx].first;
+
+ /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineCandidate() const;
+
+ OptimizationRemarkEmitter *getORE() { return ORE; }
+
+ /// This structure holds any data we need about the edges being traversed
+ /// during buildTree_rec(). We keep track of:
+ /// (i) the user TreeEntry index, and
+ /// (ii) the index of the edge.
+ struct EdgeInfo {
+ EdgeInfo() = default;
+ EdgeInfo(TreeEntry *UserTE, unsigned EdgeIdx)
+ : UserTE(UserTE), EdgeIdx(EdgeIdx) {}
+ /// The user TreeEntry.
+ TreeEntry *UserTE = nullptr;
+ /// The operand index of the use.
+ unsigned EdgeIdx = UINT_MAX;
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const BoUpSLP::EdgeInfo &EI) {
+ EI.dump(OS);
+ return OS;
+ }
+ /// Debug print.
+ void dump(raw_ostream &OS) const {
+ OS << "{User:" << (UserTE ? std::to_string(UserTE->Idx) : "null")
+ << " EdgeIdx:" << EdgeIdx << "}";
+ }
+ LLVM_DUMP_METHOD void dump() const { dump(dbgs()); }
+#endif
+ };
+
+ /// A helper data structure to hold the operands of a vector of instructions.
+ /// This supports a fixed vector length for all operand vectors.
+ class VLOperands {
+ /// For each operand we need (i) the value, and (ii) the opcode that it
+ /// would be attached to if the expression was in a left-linearized form.
+ /// This is required to avoid illegal operand reordering.
+ /// For example:
+ /// \verbatim
+ /// 0 Op1
+ /// |/
+ /// Op1 Op2 Linearized + Op2
+ /// \ / ----------> |/
+ /// - -
+ ///
+ /// Op1 - Op2 (0 + Op1) - Op2
+ /// \endverbatim
+ ///
+ /// Value Op1 is attached to a '+' operation, and Op2 to a '-'.
+ ///
+ /// Another way to think of this is to track all the operations across the
+ /// path from the operand all the way to the root of the tree and to
+ /// calculate the operation that corresponds to this path. For example, the
+ /// path from Op2 to the root crosses the RHS of the '-', therefore the
+ /// corresponding operation is a '-' (which matches the one in the
+ /// linearized tree, as shown above).
+ ///
+ /// For lack of a better term, we refer to this operation as Accumulated
+ /// Path Operation (APO).
+ struct OperandData {
+ OperandData() = default;
+ OperandData(Value *V, bool APO, bool IsUsed)
+ : V(V), APO(APO), IsUsed(IsUsed) {}
+ /// The operand value.
+ Value *V = nullptr;
+ /// TreeEntries only allow a single opcode, or an alternate sequence of
+      /// them (e.g., +, -). Therefore, we can safely use a boolean value for the
+ /// APO. It is set to 'true' if 'V' is attached to an inverse operation
+ /// in the left-linearized form (e.g., Sub/Div), and 'false' otherwise
+ /// (e.g., Add/Mul)
+ bool APO = false;
+ /// Helper data for the reordering function.
+ bool IsUsed = false;
+ };
+
+ /// During operand reordering, we are trying to select the operand at lane
+ /// that matches best with the operand at the neighboring lane. Our
+ /// selection is based on the type of value we are looking for. For example,
+ /// if the neighboring lane has a load, we need to look for a load that is
+ /// accessing a consecutive address. These strategies are summarized in the
+ /// 'ReorderingMode' enumerator.
+ enum class ReorderingMode {
+ Load, ///< Matching loads to consecutive memory addresses
+ Opcode, ///< Matching instructions based on opcode (same or alternate)
+ Constant, ///< Matching constants
+ Splat, ///< Matching the same instruction multiple times (broadcast)
+ Failed, ///< We failed to create a vectorizable group
+ };
+
+ using OperandDataVec = SmallVector<OperandData, 2>;
+
+ /// A vector of operand vectors.
+ SmallVector<OperandDataVec, 4> OpsVec;
+
+ const DataLayout &DL;
+ ScalarEvolution &SE;
+ const BoUpSLP &R;
+
+ /// \returns the operand data at \p OpIdx and \p Lane.
+ OperandData &getData(unsigned OpIdx, unsigned Lane) {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// \returns the operand data at \p OpIdx and \p Lane. Const version.
+ const OperandData &getData(unsigned OpIdx, unsigned Lane) const {
+ return OpsVec[OpIdx][Lane];
+ }
+
+ /// Clears the used flag for all entries.
+ void clearUsed() {
+ for (unsigned OpIdx = 0, NumOperands = getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane)
+ OpsVec[OpIdx][Lane].IsUsed = false;
+ }
+
+ /// Swap the operand at \p OpIdx1 with that one at \p OpIdx2.
+ void swap(unsigned OpIdx1, unsigned OpIdx2, unsigned Lane) {
+ std::swap(OpsVec[OpIdx1][Lane], OpsVec[OpIdx2][Lane]);
+ }
+
+ // The hard-coded scores listed here are not very important. When computing
+ // the scores of matching one sub-tree with another, we are basically
+ // counting the number of values that are matching. So even if all scores
+ // are set to 1, we would still get a decent matching result.
+ // However, sometimes we have to break ties. For example we may have to
+ // choose between matching loads vs matching opcodes. This is what these
+ // scores are helping us with: they provide the order of preference.
+
+ /// Loads from consecutive memory addresses, e.g. load(A[i]), load(A[i+1]).
+ static const int ScoreConsecutiveLoads = 3;
+ /// ExtractElementInst from same vector and consecutive indexes.
+ static const int ScoreConsecutiveExtracts = 3;
+ /// Constants.
+ static const int ScoreConstants = 2;
+ /// Instructions with the same opcode.
+ static const int ScoreSameOpcode = 2;
+    /// Instructions with alt opcodes (e.g., add + sub).
+ static const int ScoreAltOpcodes = 1;
+ /// Identical instructions (a.k.a. splat or broadcast).
+ static const int ScoreSplat = 1;
+ /// Matching with an undef is preferable to failing.
+ static const int ScoreUndef = 1;
+ /// Score for failing to find a decent match.
+ static const int ScoreFail = 0;
+    /// User external to the vectorized code.
+ static const int ExternalUseCost = 1;
+ /// The user is internal but in a different lane.
+ static const int UserInDiffLaneCost = ExternalUseCost;
+
+ /// \returns the score of placing \p V1 and \p V2 in consecutive lanes.
+ static int getShallowScore(Value *V1, Value *V2, const DataLayout &DL,
+ ScalarEvolution &SE) {
+ auto *LI1 = dyn_cast<LoadInst>(V1);
+ auto *LI2 = dyn_cast<LoadInst>(V2);
+ if (LI1 && LI2)
+ return isConsecutiveAccess(LI1, LI2, DL, SE)
+ ? VLOperands::ScoreConsecutiveLoads
+ : VLOperands::ScoreFail;
+
+ auto *C1 = dyn_cast<Constant>(V1);
+ auto *C2 = dyn_cast<Constant>(V2);
+ if (C1 && C2)
+ return VLOperands::ScoreConstants;
+
+      // Extracts from consecutive indexes of the same vector score better, as
+      // the extracts could be optimized away.
+ Value *EV;
+ ConstantInt *Ex1Idx, *Ex2Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
+ match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
+ Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
+ return VLOperands::ScoreConsecutiveExtracts;
+
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (I1 && I2) {
+ if (I1 == I2)
+ return VLOperands::ScoreSplat;
+ InstructionsState S = getSameOpcode({I1, I2});
+ // Note: Only consider instructions with <= 2 operands to avoid
+ // complexity explosion.
+ if (S.getOpcode() && S.MainOp->getNumOperands() <= 2)
+ return S.isAltShuffle() ? VLOperands::ScoreAltOpcodes
+ : VLOperands::ScoreSameOpcode;
+ }
+
+ if (isa<UndefValue>(V2))
+ return VLOperands::ScoreUndef;
+
+ return VLOperands::ScoreFail;
+ }
+
+ /// Holds the values and their lane that are taking part in the look-ahead
+ /// score calculation. This is used in the external uses cost calculation.
+ SmallDenseMap<Value *, int> InLookAheadValues;
+
+    /// \Returns the additional cost due to uses of \p LHS and \p RHS that are
+ /// either external to the vectorized code, or require shuffling.
+ int getExternalUsesCost(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ int Cost = 0;
+ std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
+ for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
+ Value *V = Values[Idx].first;
if (isa<Constant>(V)) {
// Since this is a function pass, it doesn't make semantic sense to
// walk the users of a subclass of Constant. The users could be in
@@ -995,776 +995,776 @@ public:
continue;
}
- // Calculate the absolute lane, using the minimum relative lane of LHS
- // and RHS as base and Idx as the offset.
- int Ln = std::min(LHS.second, RHS.second) + Idx;
- assert(Ln >= 0 && "Bad lane calculation");
- unsigned UsersBudget = LookAheadUsersBudget;
- for (User *U : V->users()) {
- if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
- // The user is in the VectorizableTree. Check if we need to insert.
- auto It = llvm::find(UserTE->Scalars, U);
- assert(It != UserTE->Scalars.end() && "U is in UserTE");
- int UserLn = std::distance(UserTE->Scalars.begin(), It);
- assert(UserLn >= 0 && "Bad lane");
- if (UserLn != Ln)
- Cost += UserInDiffLaneCost;
- } else {
- // Check if the user is in the look-ahead code.
- auto It2 = InLookAheadValues.find(U);
- if (It2 != InLookAheadValues.end()) {
- // The user is in the look-ahead code. Check the lane.
- if (It2->second != Ln)
- Cost += UserInDiffLaneCost;
- } else {
- // The user is neither in SLP tree nor in the look-ahead code.
- Cost += ExternalUseCost;
- }
- }
- // Limit the number of visited uses to cap compilation time.
- if (--UsersBudget == 0)
- break;
- }
- }
- return Cost;
- }
-
- /// Go through the operands of \p LHS and \p RHS recursively until \p
-    /// MaxLevel, and return the cumulative score. For example:
- /// \verbatim
- /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
- /// \ / \ / \ / \ /
- /// + + + +
- /// G1 G2 G3 G4
- /// \endverbatim
- /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
- /// each level recursively, accumulating the score. It starts from matching
- /// the additions at level 0, then moves on to the loads (level 1). The
- /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
- /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
- /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
- /// Please note that the order of the operands does not matter, as we
- /// evaluate the score of all profitable combinations of operands. In
- /// other words the score of G1 and G4 is the same as G1 and G2. This
- /// heuristic is based on ideas described in:
- /// Look-ahead SLP: Auto-vectorization in the presence of commutative
- /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
- /// Luís F. W. Góes
- int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS, int CurrLevel,
- int MaxLevel) {
-
- Value *V1 = LHS.first;
- Value *V2 = RHS.first;
- // Get the shallow score of V1 and V2.
- int ShallowScoreAtThisLevel =
- std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
- getExternalUsesCost(LHS, RHS));
- int Lane1 = LHS.second;
- int Lane2 = RHS.second;
-
- // If reached MaxLevel,
- // or if V1 and V2 are not instructions,
- // or if they are SPLAT,
- // or if they are not consecutive, early return the current cost.
- auto *I1 = dyn_cast<Instruction>(V1);
- auto *I2 = dyn_cast<Instruction>(V2);
- if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
- ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
- (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
- return ShallowScoreAtThisLevel;
- assert(I1 && I2 && "Should have early exited.");
-
- // Keep track of in-tree values for determining the external-use cost.
- InLookAheadValues[V1] = Lane1;
- InLookAheadValues[V2] = Lane2;
-
- // Contains the I2 operand indexes that got matched with I1 operands.
- SmallSet<unsigned, 4> Op2Used;
-
-      // Recursion towards the operands of I1 and I2. We are trying all possible
- // operand pairs, and keeping track of the best score.
- for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
- OpIdx1 != NumOperands1; ++OpIdx1) {
- // Try to pair op1I with the best operand of I2.
- int MaxTmpScore = 0;
- unsigned MaxOpIdx2 = 0;
- bool FoundBest = false;
- // If I2 is commutative try all combinations.
- unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
- unsigned ToIdx = isCommutative(I2)
- ? I2->getNumOperands()
- : std::min(I2->getNumOperands(), OpIdx1 + 1);
- assert(FromIdx <= ToIdx && "Bad index");
- for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
- // Skip operands already paired with OpIdx1.
- if (Op2Used.count(OpIdx2))
- continue;
- // Recursively calculate the cost at each level
- int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
- {I2->getOperand(OpIdx2), Lane2},
- CurrLevel + 1, MaxLevel);
- // Look for the best score.
- if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
- MaxTmpScore = TmpScore;
- MaxOpIdx2 = OpIdx2;
- FoundBest = true;
- }
- }
- if (FoundBest) {
- // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
- Op2Used.insert(MaxOpIdx2);
- ShallowScoreAtThisLevel += MaxTmpScore;
- }
- }
- return ShallowScoreAtThisLevel;
- }
-
- /// \Returns the look-ahead score, which tells us how much the sub-trees
-    /// rooted at \p LHS and \p RHS match: the more they match, the higher the
- /// score. This helps break ties in an informed way when we cannot decide on
- /// the order of the operands by just considering the immediate
- /// predecessors.
- int getLookAheadScore(const std::pair<Value *, int> &LHS,
- const std::pair<Value *, int> &RHS) {
- InLookAheadValues.clear();
- return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
- }
-
- // Search all operands in Ops[*][Lane] for the one that matches best
-    // Ops[OpIdx][LastLane] and return its operand index.
- // If no good match can be found, return None.
- Optional<unsigned>
- getBestOperand(unsigned OpIdx, int Lane, int LastLane,
- ArrayRef<ReorderingMode> ReorderingModes) {
- unsigned NumOperands = getNumOperands();
-
- // The operand of the previous lane at OpIdx.
- Value *OpLastLane = getData(OpIdx, LastLane).V;
-
- // Our strategy mode for OpIdx.
- ReorderingMode RMode = ReorderingModes[OpIdx];
-
- // The linearized opcode of the operand at OpIdx, Lane.
- bool OpIdxAPO = getData(OpIdx, Lane).APO;
-
- // The best operand index and its score.
- // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
- // are using the score to differentiate between the two.
- struct BestOpData {
- Optional<unsigned> Idx = None;
- unsigned Score = 0;
- } BestOp;
-
- // Iterate through all unused operands and look for the best.
- for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
- // Get the operand at Idx and Lane.
- OperandData &OpData = getData(Idx, Lane);
- Value *Op = OpData.V;
- bool OpAPO = OpData.APO;
-
- // Skip already selected operands.
- if (OpData.IsUsed)
- continue;
-
- // Skip if we are trying to move the operand to a position with a
- // different opcode in the linearized tree form. This would break the
- // semantics.
- if (OpAPO != OpIdxAPO)
- continue;
-
- // Look for an operand that matches the current mode.
- switch (RMode) {
- case ReorderingMode::Load:
- case ReorderingMode::Constant:
- case ReorderingMode::Opcode: {
- bool LeftToRight = Lane > LastLane;
- Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
- Value *OpRight = (LeftToRight) ? Op : OpLastLane;
- unsigned Score =
- getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
- if (Score > BestOp.Score) {
- BestOp.Idx = Idx;
- BestOp.Score = Score;
- }
- break;
- }
- case ReorderingMode::Splat:
- if (Op == OpLastLane)
- BestOp.Idx = Idx;
- break;
- case ReorderingMode::Failed:
- return None;
- }
- }
-
- if (BestOp.Idx) {
- getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
- return BestOp.Idx;
- }
- // If we could not find a good match return None.
- return None;
- }
-
- /// Helper for reorderOperandVecs. \Returns the lane that we should start
- /// reordering from. This is the one which has the least number of operands
- /// that can freely move about.
- unsigned getBestLaneToStartReordering() const {
- unsigned BestLane = 0;
- unsigned Min = UINT_MAX;
- for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
- ++Lane) {
- unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
- if (NumFreeOps < Min) {
- Min = NumFreeOps;
- BestLane = Lane;
- }
- }
- return BestLane;
- }
-
- /// \Returns the maximum number of operands that are allowed to be reordered
- /// for \p Lane. This is used as a heuristic for selecting the first lane to
- /// start operand reordering.
- unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
- unsigned CntTrue = 0;
- unsigned NumOperands = getNumOperands();
- // Operands with the same APO can be reordered. We therefore need to count
- // how many of them we have for each APO, like this: Cnt[APO] = x.
- // Since we only have two APOs, namely true and false, we can avoid using
- // a map. Instead we can simply count the number of operands that
- // correspond to one of them (in this case the 'true' APO), and calculate
- // the other by subtracting it from the total number of operands.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
- if (getData(OpIdx, Lane).APO)
- ++CntTrue;
- unsigned CntFalse = NumOperands - CntTrue;
- return std::max(CntTrue, CntFalse);
- }
-
- /// Go through the instructions in VL and append their operands.
- void appendOperandsOfVL(ArrayRef<Value *> VL) {
- assert(!VL.empty() && "Bad VL");
- assert((empty() || VL.size() == getNumLanes()) &&
- "Expected same number of lanes");
- assert(isa<Instruction>(VL[0]) && "Expected instruction");
- unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
- OpsVec.resize(NumOperands);
- unsigned NumLanes = VL.size();
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- OpsVec[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
- // Our tree has just 3 nodes: the root and two operands.
- // It is therefore trivial to get the APO. We only need to check the
- // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
- // RHS operand. The LHS operand of both add and sub is never attached
-          // to an inverse operation in the linearized form, therefore its APO
- // is false. The RHS is true only if VL[Lane] is an inverse operation.
-
- // Since operand reordering is performed on groups of commutative
- // operations or alternating sequences (e.g., +, -), we can safely
- // tell the inverse operations by checking commutativity.
- bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
- bool APO = (OpIdx == 0) ? false : IsInverseOperation;
- OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
- APO, false};
- }
- }
- }
-
- /// \returns the number of operands.
- unsigned getNumOperands() const { return OpsVec.size(); }
-
- /// \returns the number of lanes.
- unsigned getNumLanes() const { return OpsVec[0].size(); }
-
- /// \returns the operand value at \p OpIdx and \p Lane.
- Value *getValue(unsigned OpIdx, unsigned Lane) const {
- return getData(OpIdx, Lane).V;
- }
-
- /// \returns true if the data structure is empty.
- bool empty() const { return OpsVec.empty(); }
-
- /// Clears the data.
- void clear() { OpsVec.clear(); }
-
- /// \Returns true if there are enough operands identical to \p Op to fill
- /// the whole vector.
- /// Note: This modifies the 'IsUsed' flag, so a cleanUsed() must follow.
- bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
- bool OpAPO = getData(OpIdx, Lane).APO;
- for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
- if (Ln == Lane)
- continue;
- // This is set to true if we found a candidate for broadcast at Lane.
- bool FoundCandidate = false;
- for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
- OperandData &Data = getData(OpI, Ln);
- if (Data.APO != OpAPO || Data.IsUsed)
- continue;
- if (Data.V == Op) {
- FoundCandidate = true;
- Data.IsUsed = true;
- break;
- }
- }
- if (!FoundCandidate)
- return false;
- }
- return true;
- }
-
- public:
- /// Initialize with all the operands of the instruction vector \p RootVL.
- VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
- ScalarEvolution &SE, const BoUpSLP &R)
- : DL(DL), SE(SE), R(R) {
- // Append all the operands of RootVL.
- appendOperandsOfVL(RootVL);
- }
-
- /// \Returns a value vector with the operands across all lanes for the
-    /// operand at \p OpIdx.
- ValueList getVL(unsigned OpIdx) const {
- ValueList OpVL(OpsVec[OpIdx].size());
- assert(OpsVec[OpIdx].size() == getNumLanes() &&
- "Expected same num of lanes across all operands");
- for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
- OpVL[Lane] = OpsVec[OpIdx][Lane].V;
- return OpVL;
- }
-
- // Performs operand reordering for 2 or more operands.
- // The original operands are in OrigOps[OpIdx][Lane].
- // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
- void reorder() {
- unsigned NumOperands = getNumOperands();
- unsigned NumLanes = getNumLanes();
- // Each operand has its own mode. We are using this mode to help us select
- // the instructions for each lane, so that they match best with the ones
- // we have selected so far.
- SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
-
- // This is a greedy single-pass algorithm. We are going over each lane
- // once and deciding on the best order right away with no back-tracking.
- // However, in order to increase its effectiveness, we start with the lane
- // that has operands that can move the least. For example, given the
- // following lanes:
- // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
- // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
- // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
- // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
- // we will start at Lane 1, since the operands of the subtraction cannot
- // be reordered. Then we will visit the rest of the lanes in a circular
- // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
-
- // Find the first lane that we will start our search from.
- unsigned FirstLane = getBestLaneToStartReordering();
-
- // Initialize the modes.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- Value *OpLane0 = getValue(OpIdx, FirstLane);
- // Keep track if we have instructions with all the same opcode on one
- // side.
- if (isa<LoadInst>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Load;
- else if (isa<Instruction>(OpLane0)) {
- // Check if OpLane0 should be broadcast.
- if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
- ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- ReorderingModes[OpIdx] = ReorderingMode::Opcode;
- }
- else if (isa<Constant>(OpLane0))
- ReorderingModes[OpIdx] = ReorderingMode::Constant;
- else if (isa<Argument>(OpLane0))
- // Our best hope is a Splat. It may save some cost in some cases.
- ReorderingModes[OpIdx] = ReorderingMode::Splat;
- else
- // NOTE: This should be unreachable.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
- }
-
- // If the initial strategy fails for any of the operand indexes, then we
- // perform reordering again in a second pass. This helps avoid assigning
- // high priority to the failed strategy, and should improve reordering for
- // the non-failed operand indexes.
- for (int Pass = 0; Pass != 2; ++Pass) {
- // Skip the second pass if the first pass did not fail.
- bool StrategyFailed = false;
- // Mark all operand data as free to use.
- clearUsed();
- // We keep the original operand order for the FirstLane, so reorder the
- // rest of the lanes. We are visiting the nodes in a circular fashion,
- // using FirstLane as the center point and increasing the radius
- // distance.
- for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
- // Visit the lane on the right and then the lane on the left.
- for (int Direction : {+1, -1}) {
- int Lane = FirstLane + Direction * Distance;
- if (Lane < 0 || Lane >= (int)NumLanes)
- continue;
- int LastLane = Lane - Direction;
- assert(LastLane >= 0 && LastLane < (int)NumLanes &&
- "Out of bounds");
- // Look for a good match for each operand.
- for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
- // Search for the operand that matches SortedOps[OpIdx][Lane-1].
- Optional<unsigned> BestIdx =
- getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
- // By not selecting a value, we allow the operands that follow to
- // select a better matching value. We will get a non-null value in
- // the next run of getBestOperand().
- if (BestIdx) {
- // Swap the current operand with the one returned by
- // getBestOperand().
- swap(OpIdx, BestIdx.getValue(), Lane);
- } else {
- // We failed to find a best operand, set mode to 'Failed'.
- ReorderingModes[OpIdx] = ReorderingMode::Failed;
- // Enable the second pass.
- StrategyFailed = true;
- }
- }
- }
- }
- // Skip second pass if the strategy did not fail.
- if (!StrategyFailed)
- break;
- }
- }
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
- LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
- switch (RMode) {
- case ReorderingMode::Load:
- return "Load";
- case ReorderingMode::Opcode:
- return "Opcode";
- case ReorderingMode::Constant:
- return "Constant";
- case ReorderingMode::Splat:
- return "Splat";
- case ReorderingMode::Failed:
- return "Failed";
- }
- llvm_unreachable("Unimplemented Reordering Type");
- }
-
- LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
- raw_ostream &OS) {
- return OS << getModeStr(RMode);
- }
-
- /// Debug print.
- LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
- printMode(RMode, dbgs());
- }
-
- friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
- return printMode(RMode, OS);
- }
-
- LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
- const unsigned Indent = 2;
- unsigned Cnt = 0;
- for (const OperandDataVec &OpDataVec : OpsVec) {
- OS << "Operand " << Cnt++ << "\n";
- for (const OperandData &OpData : OpDataVec) {
- OS.indent(Indent) << "{";
- if (Value *V = OpData.V)
- OS << *V;
- else
- OS << "null";
- OS << ", APO:" << OpData.APO << "}\n";
- }
- OS << "\n";
- }
- return OS;
- }
-
- /// Debug print.
- LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
-#endif
- };
-
- /// Checks if the instruction is marked for deletion.
- bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
-
-  /// Marks the operands of the given values for later deletion by replacing
-  /// them with Undefs.
- void eraseInstructions(ArrayRef<Value *> AV);
-
- ~BoUpSLP();
-
-private:
-  /// Checks if all users of \p I are part of the vectorization tree.
- bool areAllUsersVectorized(Instruction *I) const;
-
- /// \returns the cost of the vectorizable entry.
+ // Calculate the absolute lane, using the minimum relative lane of LHS
+ // and RHS as base and Idx as the offset.
+ int Ln = std::min(LHS.second, RHS.second) + Idx;
+ assert(Ln >= 0 && "Bad lane calculation");
+ unsigned UsersBudget = LookAheadUsersBudget;
+ for (User *U : V->users()) {
+ if (const TreeEntry *UserTE = R.getTreeEntry(U)) {
+ // The user is in the VectorizableTree. Check if we need to insert.
+ auto It = llvm::find(UserTE->Scalars, U);
+ assert(It != UserTE->Scalars.end() && "U is in UserTE");
+ int UserLn = std::distance(UserTE->Scalars.begin(), It);
+ assert(UserLn >= 0 && "Bad lane");
+ if (UserLn != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // Check if the user is in the look-ahead code.
+ auto It2 = InLookAheadValues.find(U);
+ if (It2 != InLookAheadValues.end()) {
+ // The user is in the look-ahead code. Check the lane.
+ if (It2->second != Ln)
+ Cost += UserInDiffLaneCost;
+ } else {
+ // The user is neither in SLP tree nor in the look-ahead code.
+ Cost += ExternalUseCost;
+ }
+ }
+ // Limit the number of visited uses to cap compilation time.
+ if (--UsersBudget == 0)
+ break;
+ }
+ }
+ return Cost;
+ }
+
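For illustration only, a minimal standalone sketch of the cost terms used above; ToyUser and externalUsesCost are hypothetical names, and the simplified bookkeeping stands in for the real SLP tree and look-ahead sets:

  #include <vector>

  // Simplified stand-ins for the real SLP bookkeeping.
  struct ToyUser { bool InTree; bool InLookAhead; int Lane; };

  static constexpr int UserInDiffLaneCost = 1;
  static constexpr int ExternalUseCost = 1;
  static constexpr unsigned LookAheadUsersBudget = 2;

  // Charge a cost for each user of a value that cannot stay in lane Ln.
  int externalUsesCost(const std::vector<ToyUser> &Users, int Ln) {
    int Cost = 0;
    unsigned Budget = LookAheadUsersBudget;
    for (const ToyUser &U : Users) {
      if (U.InTree || U.InLookAhead) {
        if (U.Lane != Ln)
          Cost += UserInDiffLaneCost; // user sits in a different lane
      } else {
        Cost += ExternalUseCost;      // user is outside tree and look-ahead
      }
      if (--Budget == 0)
        break;                        // cap compile time, as above
    }
    return Cost;
  }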
+ /// Go through the operands of \p LHS and \p RHS recursively until \p
+    /// MaxLevel, and return the cumulative score. For example:
+ /// \verbatim
+ /// A[0] B[0] A[1] B[1] C[0] D[0] B[1] A[1]
+ /// \ / \ / \ / \ /
+ /// + + + +
+ /// G1 G2 G3 G4
+ /// \endverbatim
+ /// The getScoreAtLevelRec(G1, G2) function will try to match the nodes at
+ /// each level recursively, accumulating the score. It starts from matching
+ /// the additions at level 0, then moves on to the loads (level 1). The
+ /// score of G1 and G2 is higher than G1 and G3, because {A[0],A[1]} and
+ /// {B[0],B[1]} match with VLOperands::ScoreConsecutiveLoads, while
+ /// {A[0],C[0]} has a score of VLOperands::ScoreFail.
+ /// Please note that the order of the operands does not matter, as we
+ /// evaluate the score of all profitable combinations of operands. In
+    /// other words, the score of G1 and G4 is the same as that of G1 and G2. This
+ /// heuristic is based on ideas described in:
+ /// Look-ahead SLP: Auto-vectorization in the presence of commutative
+ /// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+ /// Luís F. W. Góes
+ int getScoreAtLevelRec(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS, int CurrLevel,
+ int MaxLevel) {
+
+ Value *V1 = LHS.first;
+ Value *V2 = RHS.first;
+ // Get the shallow score of V1 and V2.
+ int ShallowScoreAtThisLevel =
+ std::max((int)ScoreFail, getShallowScore(V1, V2, DL, SE) -
+ getExternalUsesCost(LHS, RHS));
+ int Lane1 = LHS.second;
+ int Lane2 = RHS.second;
+
+ // If reached MaxLevel,
+ // or if V1 and V2 are not instructions,
+ // or if they are SPLAT,
+ // or if they are not consecutive, early return the current cost.
+ auto *I1 = dyn_cast<Instruction>(V1);
+ auto *I2 = dyn_cast<Instruction>(V2);
+ if (CurrLevel == MaxLevel || !(I1 && I2) || I1 == I2 ||
+ ShallowScoreAtThisLevel == VLOperands::ScoreFail ||
+ (isa<LoadInst>(I1) && isa<LoadInst>(I2) && ShallowScoreAtThisLevel))
+ return ShallowScoreAtThisLevel;
+ assert(I1 && I2 && "Should have early exited.");
+
+ // Keep track of in-tree values for determining the external-use cost.
+ InLookAheadValues[V1] = Lane1;
+ InLookAheadValues[V2] = Lane2;
+
+ // Contains the I2 operand indexes that got matched with I1 operands.
+ SmallSet<unsigned, 4> Op2Used;
+
+      // Recursion towards the operands of I1 and I2. We are trying all possible
+ // operand pairs, and keeping track of the best score.
+ for (unsigned OpIdx1 = 0, NumOperands1 = I1->getNumOperands();
+ OpIdx1 != NumOperands1; ++OpIdx1) {
+        // Try to pair I1's operand at OpIdx1 with the best operand of I2.
+ int MaxTmpScore = 0;
+ unsigned MaxOpIdx2 = 0;
+ bool FoundBest = false;
+ // If I2 is commutative try all combinations.
+ unsigned FromIdx = isCommutative(I2) ? 0 : OpIdx1;
+ unsigned ToIdx = isCommutative(I2)
+ ? I2->getNumOperands()
+ : std::min(I2->getNumOperands(), OpIdx1 + 1);
+ assert(FromIdx <= ToIdx && "Bad index");
+ for (unsigned OpIdx2 = FromIdx; OpIdx2 != ToIdx; ++OpIdx2) {
+ // Skip operands already paired with OpIdx1.
+ if (Op2Used.count(OpIdx2))
+ continue;
+ // Recursively calculate the cost at each level
+ int TmpScore = getScoreAtLevelRec({I1->getOperand(OpIdx1), Lane1},
+ {I2->getOperand(OpIdx2), Lane2},
+ CurrLevel + 1, MaxLevel);
+ // Look for the best score.
+ if (TmpScore > VLOperands::ScoreFail && TmpScore > MaxTmpScore) {
+ MaxTmpScore = TmpScore;
+ MaxOpIdx2 = OpIdx2;
+ FoundBest = true;
+ }
+ }
+ if (FoundBest) {
+ // Pair {OpIdx1, MaxOpIdx2} was found to be best. Never revisit it.
+ Op2Used.insert(MaxOpIdx2);
+ ShallowScoreAtThisLevel += MaxTmpScore;
+ }
+ }
+ return ShallowScoreAtThisLevel;
+ }
+
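The recursion above can be sketched on a toy expression tree. Everything below (ToyNode, shallowScore, scoreAtLevel, the score constants) is hypothetical and only mimics the shape of getScoreAtLevelRec(), not its real scoring:

  #include <algorithm>
  #include <cstdlib>

  // Toy node: either a leaf holding an array index, or an op with two children.
  struct ToyNode {
    bool IsLeaf = false;
    int Index = 0;                       // leaf: position in some array
    const ToyNode *L = nullptr, *R = nullptr;
  };

  enum { ScoreFail = 0, ScoreConsecutive = 3, ScoreSameOp = 1 };

  // Shallow score: consecutive leaves score high, matching ops score low.
  static int shallowScore(const ToyNode *A, const ToyNode *B) {
    if (A->IsLeaf && B->IsLeaf)
      return std::abs(A->Index - B->Index) == 1 ? ScoreConsecutive : ScoreFail;
    if (!A->IsLeaf && !B->IsLeaf)
      return ScoreSameOp;
    return ScoreFail;
  }

  // Recurse into operands up to MaxLevel, pairing children and accumulating
  // the best score, in the spirit of getScoreAtLevelRec().
  static int scoreAtLevel(const ToyNode *A, const ToyNode *B, int Level,
                          int MaxLevel) {
    int Score = shallowScore(A, B);
    if (Level == MaxLevel || A->IsLeaf || B->IsLeaf || Score == ScoreFail)
      return Score;
    // The toy op is commutative, so try both child pairings and keep the best.
    int Straight = scoreAtLevel(A->L, B->L, Level + 1, MaxLevel) +
                   scoreAtLevel(A->R, B->R, Level + 1, MaxLevel);
    int Swapped  = scoreAtLevel(A->L, B->R, Level + 1, MaxLevel) +
                   scoreAtLevel(A->R, B->L, Level + 1, MaxLevel);
    return Score + std::max(Straight, Swapped);
  }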
+ /// \Returns the look-ahead score, which tells us how much the sub-trees
+    /// rooted at \p LHS and \p RHS match; the more they match, the higher the
+ /// score. This helps break ties in an informed way when we cannot decide on
+ /// the order of the operands by just considering the immediate
+ /// predecessors.
+ int getLookAheadScore(const std::pair<Value *, int> &LHS,
+ const std::pair<Value *, int> &RHS) {
+ InLookAheadValues.clear();
+ return getScoreAtLevelRec(LHS, RHS, 1, LookAheadMaxDepth);
+ }
+
+    // Search all operands in Ops[*][Lane] for the one that best matches
+    // Ops[OpIdx][LastLane] and return its operand index.
+ // If no good match can be found, return None.
+ Optional<unsigned>
+ getBestOperand(unsigned OpIdx, int Lane, int LastLane,
+ ArrayRef<ReorderingMode> ReorderingModes) {
+ unsigned NumOperands = getNumOperands();
+
+ // The operand of the previous lane at OpIdx.
+ Value *OpLastLane = getData(OpIdx, LastLane).V;
+
+ // Our strategy mode for OpIdx.
+ ReorderingMode RMode = ReorderingModes[OpIdx];
+
+ // The linearized opcode of the operand at OpIdx, Lane.
+ bool OpIdxAPO = getData(OpIdx, Lane).APO;
+
+ // The best operand index and its score.
+ // Sometimes we have more than one option (e.g., Opcode and Undefs), so we
+ // are using the score to differentiate between the two.
+ struct BestOpData {
+ Optional<unsigned> Idx = None;
+ unsigned Score = 0;
+ } BestOp;
+
+ // Iterate through all unused operands and look for the best.
+ for (unsigned Idx = 0; Idx != NumOperands; ++Idx) {
+ // Get the operand at Idx and Lane.
+ OperandData &OpData = getData(Idx, Lane);
+ Value *Op = OpData.V;
+ bool OpAPO = OpData.APO;
+
+ // Skip already selected operands.
+ if (OpData.IsUsed)
+ continue;
+
+ // Skip if we are trying to move the operand to a position with a
+ // different opcode in the linearized tree form. This would break the
+ // semantics.
+ if (OpAPO != OpIdxAPO)
+ continue;
+
+ // Look for an operand that matches the current mode.
+ switch (RMode) {
+ case ReorderingMode::Load:
+ case ReorderingMode::Constant:
+ case ReorderingMode::Opcode: {
+ bool LeftToRight = Lane > LastLane;
+ Value *OpLeft = (LeftToRight) ? OpLastLane : Op;
+ Value *OpRight = (LeftToRight) ? Op : OpLastLane;
+ unsigned Score =
+ getLookAheadScore({OpLeft, LastLane}, {OpRight, Lane});
+ if (Score > BestOp.Score) {
+ BestOp.Idx = Idx;
+ BestOp.Score = Score;
+ }
+ break;
+ }
+ case ReorderingMode::Splat:
+ if (Op == OpLastLane)
+ BestOp.Idx = Idx;
+ break;
+ case ReorderingMode::Failed:
+ return None;
+ }
+ }
+
+ if (BestOp.Idx) {
+ getData(BestOp.Idx.getValue(), Lane).IsUsed = true;
+ return BestOp.Idx;
+ }
+ // If we could not find a good match return None.
+ return None;
+ }
+
+    /// Helper for reorder(). \Returns the lane that we should start
+ /// reordering from. This is the one which has the least number of operands
+ /// that can freely move about.
+ unsigned getBestLaneToStartReordering() const {
+ unsigned BestLane = 0;
+ unsigned Min = UINT_MAX;
+ for (unsigned Lane = 0, NumLanes = getNumLanes(); Lane != NumLanes;
+ ++Lane) {
+ unsigned NumFreeOps = getMaxNumOperandsThatCanBeReordered(Lane);
+ if (NumFreeOps < Min) {
+ Min = NumFreeOps;
+ BestLane = Lane;
+ }
+ }
+ return BestLane;
+ }
+
+ /// \Returns the maximum number of operands that are allowed to be reordered
+ /// for \p Lane. This is used as a heuristic for selecting the first lane to
+ /// start operand reordering.
+ unsigned getMaxNumOperandsThatCanBeReordered(unsigned Lane) const {
+ unsigned CntTrue = 0;
+ unsigned NumOperands = getNumOperands();
+ // Operands with the same APO can be reordered. We therefore need to count
+ // how many of them we have for each APO, like this: Cnt[APO] = x.
+ // Since we only have two APOs, namely true and false, we can avoid using
+ // a map. Instead we can simply count the number of operands that
+ // correspond to one of them (in this case the 'true' APO), and calculate
+ // the other by subtracting it from the total number of operands.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx)
+ if (getData(OpIdx, Lane).APO)
+ ++CntTrue;
+ unsigned CntFalse = NumOperands - CntTrue;
+ return std::max(CntTrue, CntFalse);
+ }
+
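A standalone restatement of the counting trick, assuming a plain std::vector<bool> of APO flags per lane; the helper name is made up and this is not the class's real data layout:

  #include <algorithm>
  #include <vector>

  // For one lane, APO[OpIdx] mirrors OperandData::APO. Operands that share an
  // APO value may swap places, so the larger of the two groups is the number
  // of freely movable operands.
  unsigned maxReorderableOperands(const std::vector<bool> &APO) {
    unsigned CntTrue = static_cast<unsigned>(
        std::count(APO.begin(), APO.end(), true));
    unsigned CntFalse = static_cast<unsigned>(APO.size()) - CntTrue;
    return std::max(CntTrue, CntFalse);
  }
  // e.g. an 'add' lane {false, false} -> 2, a 'sub' lane {false, true} -> 1.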
+ /// Go through the instructions in VL and append their operands.
+ void appendOperandsOfVL(ArrayRef<Value *> VL) {
+ assert(!VL.empty() && "Bad VL");
+ assert((empty() || VL.size() == getNumLanes()) &&
+ "Expected same number of lanes");
+ assert(isa<Instruction>(VL[0]) && "Expected instruction");
+ unsigned NumOperands = cast<Instruction>(VL[0])->getNumOperands();
+ OpsVec.resize(NumOperands);
+ unsigned NumLanes = VL.size();
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ OpsVec[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ assert(isa<Instruction>(VL[Lane]) && "Expected instruction");
+ // Our tree has just 3 nodes: the root and two operands.
+ // It is therefore trivial to get the APO. We only need to check the
+ // opcode of VL[Lane] and whether the operand at OpIdx is the LHS or
+ // RHS operand. The LHS operand of both add and sub is never attached
+        // to an inverse operation in the linearized form, therefore its APO
+ // is false. The RHS is true only if VL[Lane] is an inverse operation.
+
+ // Since operand reordering is performed on groups of commutative
+ // operations or alternating sequences (e.g., +, -), we can safely
+ // tell the inverse operations by checking commutativity.
+ bool IsInverseOperation = !isCommutative(cast<Instruction>(VL[Lane]));
+ bool APO = (OpIdx == 0) ? false : IsInverseOperation;
+ OpsVec[OpIdx][Lane] = {cast<Instruction>(VL[Lane])->getOperand(OpIdx),
+ APO, false};
+ }
+ }
+ }
+
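The APO rule on its own fits in a few lines; a hedged sketch that uses a bare bool for commutativity instead of LLVM's isCommutative():

  // Linearized form: '+' is commutative, '-' is its inverse. The LHS (OpIdx 0)
  // is never attached to an inverse operation, so its APO is always false; the
  // RHS of a non-commutative op gets APO = true.
  bool operandAPO(bool OpIsCommutative, unsigned OpIdx) {
    bool IsInverseOperation = !OpIsCommutative;
    return OpIdx == 0 ? false : IsInverseOperation;
  }
  // operandAPO(/*'+'*/ true, 1)  == false
  // operandAPO(/*'-'*/ false, 1) == true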
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return OpsVec.size(); }
+
+ /// \returns the number of lanes.
+ unsigned getNumLanes() const { return OpsVec[0].size(); }
+
+ /// \returns the operand value at \p OpIdx and \p Lane.
+ Value *getValue(unsigned OpIdx, unsigned Lane) const {
+ return getData(OpIdx, Lane).V;
+ }
+
+ /// \returns true if the data structure is empty.
+ bool empty() const { return OpsVec.empty(); }
+
+ /// Clears the data.
+ void clear() { OpsVec.clear(); }
+
+ /// \Returns true if there are enough operands identical to \p Op to fill
+ /// the whole vector.
+    /// Note: This modifies the 'IsUsed' flag, so a clearUsed() must follow.
+ bool shouldBroadcast(Value *Op, unsigned OpIdx, unsigned Lane) {
+ bool OpAPO = getData(OpIdx, Lane).APO;
+ for (unsigned Ln = 0, Lns = getNumLanes(); Ln != Lns; ++Ln) {
+ if (Ln == Lane)
+ continue;
+ // This is set to true if we found a candidate for broadcast at Lane.
+ bool FoundCandidate = false;
+ for (unsigned OpI = 0, OpE = getNumOperands(); OpI != OpE; ++OpI) {
+ OperandData &Data = getData(OpI, Ln);
+ if (Data.APO != OpAPO || Data.IsUsed)
+ continue;
+ if (Data.V == Op) {
+ FoundCandidate = true;
+ Data.IsUsed = true;
+ break;
+ }
+ }
+ if (!FoundCandidate)
+ return false;
+ }
+ return true;
+ }
+
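A simplified, library-free version of the same broadcast check; ToyOperand and shouldBroadcastToy are made-up names, and plain ints stand in for Value pointers:

  #include <vector>

  struct ToyOperand { int V; bool APO; bool IsUsed; };

  // True if every other lane has an unused operand equal to Op with the same
  // APO, i.e. Op could fill the whole vector as a splat.
  bool shouldBroadcastToy(std::vector<std::vector<ToyOperand>> &Ops, // [OpIdx][Lane]
                          int Op, bool OpAPO, unsigned Lane) {
    unsigned NumLanes = Ops.empty() ? 0 : Ops[0].size();
    for (unsigned Ln = 0; Ln != NumLanes; ++Ln) {
      if (Ln == Lane)
        continue;
      bool Found = false;
      for (auto &PerOpIdx : Ops) {
        ToyOperand &Data = PerOpIdx[Ln];
        if (Data.APO != OpAPO || Data.IsUsed || Data.V != Op)
          continue;
        Data.IsUsed = true; // reserve it, exactly like the real code
        Found = true;
        break;
      }
      if (!Found)
        return false;
    }
    return true;
  }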
+ public:
+ /// Initialize with all the operands of the instruction vector \p RootVL.
+ VLOperands(ArrayRef<Value *> RootVL, const DataLayout &DL,
+ ScalarEvolution &SE, const BoUpSLP &R)
+ : DL(DL), SE(SE), R(R) {
+ // Append all the operands of RootVL.
+ appendOperandsOfVL(RootVL);
+ }
+
+ /// \Returns a value vector with the operands across all lanes for the
+    /// operand at \p OpIdx.
+ ValueList getVL(unsigned OpIdx) const {
+ ValueList OpVL(OpsVec[OpIdx].size());
+ assert(OpsVec[OpIdx].size() == getNumLanes() &&
+ "Expected same num of lanes across all operands");
+ for (unsigned Lane = 0, Lanes = getNumLanes(); Lane != Lanes; ++Lane)
+ OpVL[Lane] = OpsVec[OpIdx][Lane].V;
+ return OpVL;
+ }
+
+ // Performs operand reordering for 2 or more operands.
+ // The original operands are in OrigOps[OpIdx][Lane].
+ // The reordered operands are returned in 'SortedOps[OpIdx][Lane]'.
+ void reorder() {
+ unsigned NumOperands = getNumOperands();
+ unsigned NumLanes = getNumLanes();
+ // Each operand has its own mode. We are using this mode to help us select
+ // the instructions for each lane, so that they match best with the ones
+ // we have selected so far.
+ SmallVector<ReorderingMode, 2> ReorderingModes(NumOperands);
+
+ // This is a greedy single-pass algorithm. We are going over each lane
+ // once and deciding on the best order right away with no back-tracking.
+ // However, in order to increase its effectiveness, we start with the lane
+ // that has operands that can move the least. For example, given the
+ // following lanes:
+ // Lane 0 : A[0] = B[0] + C[0] // Visited 3rd
+ // Lane 1 : A[1] = C[1] - B[1] // Visited 1st
+ // Lane 2 : A[2] = B[2] + C[2] // Visited 2nd
+ // Lane 3 : A[3] = C[3] - B[3] // Visited 4th
+ // we will start at Lane 1, since the operands of the subtraction cannot
+ // be reordered. Then we will visit the rest of the lanes in a circular
+ // fashion. That is, Lanes 2, then Lane 0, and finally Lane 3.
+
+ // Find the first lane that we will start our search from.
+ unsigned FirstLane = getBestLaneToStartReordering();
+
+ // Initialize the modes.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ Value *OpLane0 = getValue(OpIdx, FirstLane);
+ // Keep track if we have instructions with all the same opcode on one
+ // side.
+ if (isa<LoadInst>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Load;
+ else if (isa<Instruction>(OpLane0)) {
+ // Check if OpLane0 should be broadcast.
+ if (shouldBroadcast(OpLane0, OpIdx, FirstLane))
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ ReorderingModes[OpIdx] = ReorderingMode::Opcode;
+ }
+ else if (isa<Constant>(OpLane0))
+ ReorderingModes[OpIdx] = ReorderingMode::Constant;
+ else if (isa<Argument>(OpLane0))
+ // Our best hope is a Splat. It may save some cost in some cases.
+ ReorderingModes[OpIdx] = ReorderingMode::Splat;
+ else
+ // NOTE: This should be unreachable.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ }
+
+ // If the initial strategy fails for any of the operand indexes, then we
+ // perform reordering again in a second pass. This helps avoid assigning
+ // high priority to the failed strategy, and should improve reordering for
+ // the non-failed operand indexes.
+ for (int Pass = 0; Pass != 2; ++Pass) {
+ // Skip the second pass if the first pass did not fail.
+ bool StrategyFailed = false;
+ // Mark all operand data as free to use.
+ clearUsed();
+ // We keep the original operand order for the FirstLane, so reorder the
+ // rest of the lanes. We are visiting the nodes in a circular fashion,
+ // using FirstLane as the center point and increasing the radius
+ // distance.
+ for (unsigned Distance = 1; Distance != NumLanes; ++Distance) {
+ // Visit the lane on the right and then the lane on the left.
+ for (int Direction : {+1, -1}) {
+ int Lane = FirstLane + Direction * Distance;
+ if (Lane < 0 || Lane >= (int)NumLanes)
+ continue;
+ int LastLane = Lane - Direction;
+ assert(LastLane >= 0 && LastLane < (int)NumLanes &&
+ "Out of bounds");
+ // Look for a good match for each operand.
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ // Search for the operand that matches SortedOps[OpIdx][Lane-1].
+ Optional<unsigned> BestIdx =
+ getBestOperand(OpIdx, Lane, LastLane, ReorderingModes);
+ // By not selecting a value, we allow the operands that follow to
+ // select a better matching value. We will get a non-null value in
+ // the next run of getBestOperand().
+ if (BestIdx) {
+ // Swap the current operand with the one returned by
+ // getBestOperand().
+ swap(OpIdx, BestIdx.getValue(), Lane);
+ } else {
+ // We failed to find a best operand, set mode to 'Failed'.
+ ReorderingModes[OpIdx] = ReorderingMode::Failed;
+ // Enable the second pass.
+ StrategyFailed = true;
+ }
+ }
+ }
+ }
+ // Skip second pass if the strategy did not fail.
+ if (!StrategyFailed)
+ break;
+ }
+ }
+
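The circular visiting order is easiest to see in isolation; a small hypothetical helper that only prints the order the Distance/Direction loops above would follow:

  #include <cstdio>
  #include <initializer_list>

  // Starting from FirstLane, visit lanes at increasing distance, right side
  // first, mirroring the Distance/Direction loops in reorder().
  void printVisitOrder(int FirstLane, int NumLanes) {
    std::printf("start: %d\n", FirstLane);
    for (int Distance = 1; Distance != NumLanes; ++Distance)
      for (int Direction : {+1, -1}) {
        int Lane = FirstLane + Direction * Distance;
        if (Lane < 0 || Lane >= NumLanes)
          continue;
        std::printf("visit: %d (prev %d)\n", Lane, Lane - Direction);
      }
  }
  // printVisitOrder(1, 4) visits lanes 2, 0, 3, matching the example above.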
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ LLVM_DUMP_METHOD static StringRef getModeStr(ReorderingMode RMode) {
+ switch (RMode) {
+ case ReorderingMode::Load:
+ return "Load";
+ case ReorderingMode::Opcode:
+ return "Opcode";
+ case ReorderingMode::Constant:
+ return "Constant";
+ case ReorderingMode::Splat:
+ return "Splat";
+ case ReorderingMode::Failed:
+ return "Failed";
+ }
+ llvm_unreachable("Unimplemented Reordering Type");
+ }
+
+ LLVM_DUMP_METHOD static raw_ostream &printMode(ReorderingMode RMode,
+ raw_ostream &OS) {
+ return OS << getModeStr(RMode);
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD static void dumpMode(ReorderingMode RMode) {
+ printMode(RMode, dbgs());
+ }
+
+ friend raw_ostream &operator<<(raw_ostream &OS, ReorderingMode RMode) {
+ return printMode(RMode, OS);
+ }
+
+ LLVM_DUMP_METHOD raw_ostream &print(raw_ostream &OS) const {
+ const unsigned Indent = 2;
+ unsigned Cnt = 0;
+ for (const OperandDataVec &OpDataVec : OpsVec) {
+ OS << "Operand " << Cnt++ << "\n";
+ for (const OperandData &OpData : OpDataVec) {
+ OS.indent(Indent) << "{";
+ if (Value *V = OpData.V)
+ OS << *V;
+ else
+ OS << "null";
+ OS << ", APO:" << OpData.APO << "}\n";
+ }
+ OS << "\n";
+ }
+ return OS;
+ }
+
+ /// Debug print.
+ LLVM_DUMP_METHOD void dump() const { print(dbgs()); }
+#endif
+ };
+
+ /// Checks if the instruction is marked for deletion.
+ bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Marks the operands of the given values for later deletion by replacing
+  /// them with Undefs.
+ void eraseInstructions(ArrayRef<Value *> AV);
+
+ ~BoUpSLP();
+
+private:
+  /// Checks if all users of \p I are part of the vectorization tree.
+ bool areAllUsersVectorized(Instruction *I) const;
+
+ /// \returns the cost of the vectorizable entry.
InstructionCost getEntryCost(TreeEntry *E);
-
- /// This is the recursive part of buildTree.
- void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
- const EdgeInfo &EI);
-
- /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
- /// be vectorized to use the original vector (or aggregate "bitcast" to a
- /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
- /// returns false, setting \p CurrentOrder to either an empty vector or a
-  /// non-identity permutation that allows reusing extract instructions.
- bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const;
-
- /// Vectorize a single entry in the tree.
- Value *vectorizeTree(TreeEntry *E);
-
- /// Vectorize a single entry in the tree, starting in \p VL.
- Value *vectorizeTree(ArrayRef<Value *> VL);
-
- /// \returns the scalarization cost for this type. Scalarization in this
- /// context means the creation of vectors from a group of scalars.
+
+ /// This is the recursive part of buildTree.
+ void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth,
+ const EdgeInfo &EI);
+
+ /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can
+ /// be vectorized to use the original vector (or aggregate "bitcast" to a
+ /// vector) and sets \p CurrentOrder to the identity permutation; otherwise
+ /// returns false, setting \p CurrentOrder to either an empty vector or a
+  /// non-identity permutation that allows reusing extract instructions.
+ bool canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const;
+
+ /// Vectorize a single entry in the tree.
+ Value *vectorizeTree(TreeEntry *E);
+
+ /// Vectorize a single entry in the tree, starting in \p VL.
+ Value *vectorizeTree(ArrayRef<Value *> VL);
+
+ /// \returns the scalarization cost for this type. Scalarization in this
+ /// context means the creation of vectors from a group of scalars.
InstructionCost
getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const;
-
- /// \returns the scalarization cost for this list of values. Assuming that
- /// this subtree gets vectorized, we may need to extract the values from the
- /// roots. This method calculates the cost of extracting the values.
+
+ /// \returns the scalarization cost for this list of values. Assuming that
+ /// this subtree gets vectorized, we may need to extract the values from the
+ /// roots. This method calculates the cost of extracting the values.
InstructionCost getGatherCost(ArrayRef<Value *> VL) const;
-
- /// Set the Builder insert point to one after the last instruction in
- /// the bundle
- void setInsertPointAfterBundle(TreeEntry *E);
-
- /// \returns a vector from a collection of scalars in \p VL.
+
+ /// Set the Builder insert point to one after the last instruction in
+ /// the bundle
+ void setInsertPointAfterBundle(TreeEntry *E);
+
+ /// \returns a vector from a collection of scalars in \p VL.
Value *gather(ArrayRef<Value *> VL);
-
- /// \returns whether the VectorizableTree is fully vectorizable and will
-  /// be beneficial even if the tree height is tiny.
- bool isFullyVectorizableTinyTree() const;
-
- /// Reorder commutative or alt operands to get better probability of
- /// generating vectorized code.
- static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R);
- struct TreeEntry {
- using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
- TreeEntry(VecTreeTy &Container) : Container(Container) {}
-
- /// \returns true if the scalars in VL are equal to this entry.
- bool isSame(ArrayRef<Value *> VL) const {
- if (VL.size() == Scalars.size())
- return std::equal(VL.begin(), VL.end(), Scalars.begin());
- return VL.size() == ReuseShuffleIndices.size() &&
- std::equal(
- VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, int Idx) { return V == Scalars[Idx]; });
- }
-
- /// A vector of scalars.
- ValueList Scalars;
-
- /// The Scalars are vectorized into this value. It is initialized to Null.
- Value *VectorizedValue = nullptr;
-
+
+ /// \returns whether the VectorizableTree is fully vectorizable and will
+  /// be beneficial even if the tree height is tiny.
+ bool isFullyVectorizableTinyTree() const;
+
+ /// Reorder commutative or alt operands to get better probability of
+ /// generating vectorized code.
+ static void reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE,
+ const BoUpSLP &R);
+ struct TreeEntry {
+ using VecTreeTy = SmallVector<std::unique_ptr<TreeEntry>, 8>;
+ TreeEntry(VecTreeTy &Container) : Container(Container) {}
+
+ /// \returns true if the scalars in VL are equal to this entry.
+ bool isSame(ArrayRef<Value *> VL) const {
+ if (VL.size() == Scalars.size())
+ return std::equal(VL.begin(), VL.end(), Scalars.begin());
+ return VL.size() == ReuseShuffleIndices.size() &&
+ std::equal(
+ VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
+ [this](Value *V, int Idx) { return V == Scalars[Idx]; });
+ }
+
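As a side note, the reuse-shuffle comparison above boils down to the following check; isSameToy is a hypothetical helper operating on toy int scalars:

  #include <algorithm>
  #include <cstddef>
  #include <vector>

  // VL matches the entry if it is either the scalar list itself or the scalar
  // list replayed through ReuseShuffleIndices.
  bool isSameToy(const std::vector<int> &Scalars,
                 const std::vector<int> &ReuseShuffleIndices,
                 const std::vector<int> &VL) {
    if (VL.size() == Scalars.size())
      return std::equal(VL.begin(), VL.end(), Scalars.begin());
    if (VL.size() != ReuseShuffleIndices.size())
      return false;
    for (std::size_t I = 0; I != VL.size(); ++I)
      if (VL[I] != Scalars[ReuseShuffleIndices[I]])
        return false;
    return true;
  }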
+ /// A vector of scalars.
+ ValueList Scalars;
+
+ /// The Scalars are vectorized into this value. It is initialized to Null.
+ Value *VectorizedValue = nullptr;
+
/// Do we need to gather this sequence or vectorize it
/// (either with vector instruction or with scatter/gather
/// intrinsics for store/load)?
enum EntryState { Vectorize, ScatterVectorize, NeedToGather };
- EntryState State;
-
- /// Does this sequence require some shuffling?
- SmallVector<int, 4> ReuseShuffleIndices;
-
- /// Does this entry require reordering?
+ EntryState State;
+
+ /// Does this sequence require some shuffling?
+ SmallVector<int, 4> ReuseShuffleIndices;
+
+ /// Does this entry require reordering?
SmallVector<unsigned, 4> ReorderIndices;
-
- /// Points back to the VectorizableTree.
- ///
-    /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
- /// to be a pointer and needs to be able to initialize the child iterator.
- /// Thus we need a reference back to the container to translate the indices
- /// to entries.
- VecTreeTy &Container;
-
- /// The TreeEntry index containing the user of this entry. We can actually
- /// have multiple users so the data structure is not truly a tree.
- SmallVector<EdgeInfo, 1> UserTreeIndices;
-
- /// The index of this treeEntry in VectorizableTree.
- int Idx = -1;
-
- private:
- /// The operands of each instruction in each lane Operands[op_index][lane].
- /// Note: This helps avoid the replication of the code that performs the
- /// reordering of operands during buildTree_rec() and vectorizeTree().
- SmallVector<ValueList, 2> Operands;
-
- /// The main/alternate instruction.
- Instruction *MainOp = nullptr;
- Instruction *AltOp = nullptr;
-
- public:
- /// Set this bundle's \p OpIdx'th operand to \p OpVL.
- void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
- if (Operands.size() < OpIdx + 1)
- Operands.resize(OpIdx + 1);
- assert(Operands[OpIdx].size() == 0 && "Already resized?");
- Operands[OpIdx].resize(Scalars.size());
- for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
- Operands[OpIdx][Lane] = OpVL[Lane];
- }
-
- /// Set the operands of this bundle in their original order.
- void setOperandsInOrder() {
- assert(Operands.empty() && "Already initialized?");
- auto *I0 = cast<Instruction>(Scalars[0]);
- Operands.resize(I0->getNumOperands());
- unsigned NumLanes = Scalars.size();
- for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
- OpIdx != NumOperands; ++OpIdx) {
- Operands[OpIdx].resize(NumLanes);
- for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- auto *I = cast<Instruction>(Scalars[Lane]);
- assert(I->getNumOperands() == NumOperands &&
- "Expected same number of operands");
- Operands[OpIdx][Lane] = I->getOperand(OpIdx);
- }
- }
- }
-
- /// \returns the \p OpIdx operand of this TreeEntry.
- ValueList &getOperand(unsigned OpIdx) {
- assert(OpIdx < Operands.size() && "Off bounds");
- return Operands[OpIdx];
- }
-
- /// \returns the number of operands.
- unsigned getNumOperands() const { return Operands.size(); }
-
- /// \return the single \p OpIdx operand.
- Value *getSingleOperand(unsigned OpIdx) const {
- assert(OpIdx < Operands.size() && "Off bounds");
- assert(!Operands[OpIdx].empty() && "No operand available");
- return Operands[OpIdx][0];
- }
-
- /// Some of the instructions in the list have alternate opcodes.
- bool isAltShuffle() const {
- return getOpcode() != getAltOpcode();
- }
-
- bool isOpcodeOrAlt(Instruction *I) const {
- unsigned CheckedOpcode = I->getOpcode();
- return (getOpcode() == CheckedOpcode ||
- getAltOpcode() == CheckedOpcode);
- }
-
- /// Chooses the correct key for scheduling data. If \p Op has the same (or
- /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
- /// \p OpValue.
- Value *isOneOf(Value *Op) const {
- auto *I = dyn_cast<Instruction>(Op);
- if (I && isOpcodeOrAlt(I))
- return Op;
- return MainOp;
- }
-
- void setOperations(const InstructionsState &S) {
- MainOp = S.MainOp;
- AltOp = S.AltOp;
- }
-
- Instruction *getMainOp() const {
- return MainOp;
- }
-
- Instruction *getAltOp() const {
- return AltOp;
- }
-
- /// The main/alternate opcodes for the list of instructions.
- unsigned getOpcode() const {
- return MainOp ? MainOp->getOpcode() : 0;
- }
-
- unsigned getAltOpcode() const {
- return AltOp ? AltOp->getOpcode() : 0;
- }
-
- /// Update operations state of this entry if reorder occurred.
- bool updateStateIfReorder() {
- if (ReorderIndices.empty())
- return false;
- InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
- setOperations(S);
- return true;
- }
-
-#ifndef NDEBUG
- /// Debug printer.
- LLVM_DUMP_METHOD void dump() const {
- dbgs() << Idx << ".\n";
- for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
- dbgs() << "Operand " << OpI << ":\n";
- for (const Value *V : Operands[OpI])
- dbgs().indent(2) << *V << "\n";
- }
- dbgs() << "Scalars: \n";
- for (Value *V : Scalars)
- dbgs().indent(2) << *V << "\n";
- dbgs() << "State: ";
- switch (State) {
- case Vectorize:
- dbgs() << "Vectorize\n";
- break;
+
+ /// Points back to the VectorizableTree.
+ ///
+    /// Only used for Graphviz right now. Unfortunately GraphTraits::NodeRef has
+ /// to be a pointer and needs to be able to initialize the child iterator.
+ /// Thus we need a reference back to the container to translate the indices
+ /// to entries.
+ VecTreeTy &Container;
+
+ /// The TreeEntry index containing the user of this entry. We can actually
+ /// have multiple users so the data structure is not truly a tree.
+ SmallVector<EdgeInfo, 1> UserTreeIndices;
+
+ /// The index of this treeEntry in VectorizableTree.
+ int Idx = -1;
+
+ private:
+ /// The operands of each instruction in each lane Operands[op_index][lane].
+ /// Note: This helps avoid the replication of the code that performs the
+ /// reordering of operands during buildTree_rec() and vectorizeTree().
+ SmallVector<ValueList, 2> Operands;
+
+ /// The main/alternate instruction.
+ Instruction *MainOp = nullptr;
+ Instruction *AltOp = nullptr;
+
+ public:
+ /// Set this bundle's \p OpIdx'th operand to \p OpVL.
+ void setOperand(unsigned OpIdx, ArrayRef<Value *> OpVL) {
+ if (Operands.size() < OpIdx + 1)
+ Operands.resize(OpIdx + 1);
+ assert(Operands[OpIdx].size() == 0 && "Already resized?");
+ Operands[OpIdx].resize(Scalars.size());
+ for (unsigned Lane = 0, E = Scalars.size(); Lane != E; ++Lane)
+ Operands[OpIdx][Lane] = OpVL[Lane];
+ }
+
+ /// Set the operands of this bundle in their original order.
+ void setOperandsInOrder() {
+ assert(Operands.empty() && "Already initialized?");
+ auto *I0 = cast<Instruction>(Scalars[0]);
+ Operands.resize(I0->getNumOperands());
+ unsigned NumLanes = Scalars.size();
+ for (unsigned OpIdx = 0, NumOperands = I0->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx) {
+ Operands[OpIdx].resize(NumLanes);
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ auto *I = cast<Instruction>(Scalars[Lane]);
+ assert(I->getNumOperands() == NumOperands &&
+ "Expected same number of operands");
+ Operands[OpIdx][Lane] = I->getOperand(OpIdx);
+ }
+ }
+ }
+
+ /// \returns the \p OpIdx operand of this TreeEntry.
+ ValueList &getOperand(unsigned OpIdx) {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ return Operands[OpIdx];
+ }
+
+ /// \returns the number of operands.
+ unsigned getNumOperands() const { return Operands.size(); }
+
+ /// \return the single \p OpIdx operand.
+ Value *getSingleOperand(unsigned OpIdx) const {
+ assert(OpIdx < Operands.size() && "Off bounds");
+ assert(!Operands[OpIdx].empty() && "No operand available");
+ return Operands[OpIdx][0];
+ }
+
+ /// Some of the instructions in the list have alternate opcodes.
+ bool isAltShuffle() const {
+ return getOpcode() != getAltOpcode();
+ }
+
+ bool isOpcodeOrAlt(Instruction *I) const {
+ unsigned CheckedOpcode = I->getOpcode();
+ return (getOpcode() == CheckedOpcode ||
+ getAltOpcode() == CheckedOpcode);
+ }
+
+ /// Chooses the correct key for scheduling data. If \p Op has the same (or
+ /// alternate) opcode as \p OpValue, the key is \p Op. Otherwise the key is
+ /// \p OpValue.
+ Value *isOneOf(Value *Op) const {
+ auto *I = dyn_cast<Instruction>(Op);
+ if (I && isOpcodeOrAlt(I))
+ return Op;
+ return MainOp;
+ }
+
+ void setOperations(const InstructionsState &S) {
+ MainOp = S.MainOp;
+ AltOp = S.AltOp;
+ }
+
+ Instruction *getMainOp() const {
+ return MainOp;
+ }
+
+ Instruction *getAltOp() const {
+ return AltOp;
+ }
+
+ /// The main/alternate opcodes for the list of instructions.
+ unsigned getOpcode() const {
+ return MainOp ? MainOp->getOpcode() : 0;
+ }
+
+ unsigned getAltOpcode() const {
+ return AltOp ? AltOp->getOpcode() : 0;
+ }
+
+ /// Update operations state of this entry if reorder occurred.
+ bool updateStateIfReorder() {
+ if (ReorderIndices.empty())
+ return false;
+ InstructionsState S = getSameOpcode(Scalars, ReorderIndices.front());
+ setOperations(S);
+ return true;
+ }
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dump() const {
+ dbgs() << Idx << ".\n";
+ for (unsigned OpI = 0, OpE = Operands.size(); OpI != OpE; ++OpI) {
+ dbgs() << "Operand " << OpI << ":\n";
+ for (const Value *V : Operands[OpI])
+ dbgs().indent(2) << *V << "\n";
+ }
+ dbgs() << "Scalars: \n";
+ for (Value *V : Scalars)
+ dbgs().indent(2) << *V << "\n";
+ dbgs() << "State: ";
+ switch (State) {
+ case Vectorize:
+ dbgs() << "Vectorize\n";
+ break;
case ScatterVectorize:
dbgs() << "ScatterVectorize\n";
break;
- case NeedToGather:
- dbgs() << "NeedToGather\n";
- break;
- }
- dbgs() << "MainOp: ";
- if (MainOp)
- dbgs() << *MainOp << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "AltOp: ";
- if (AltOp)
- dbgs() << *AltOp << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "VectorizedValue: ";
- if (VectorizedValue)
- dbgs() << *VectorizedValue << "\n";
- else
- dbgs() << "NULL\n";
- dbgs() << "ReuseShuffleIndices: ";
- if (ReuseShuffleIndices.empty())
+ case NeedToGather:
+ dbgs() << "NeedToGather\n";
+ break;
+ }
+ dbgs() << "MainOp: ";
+ if (MainOp)
+ dbgs() << *MainOp << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "AltOp: ";
+ if (AltOp)
+ dbgs() << *AltOp << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "VectorizedValue: ";
+ if (VectorizedValue)
+ dbgs() << *VectorizedValue << "\n";
+ else
+ dbgs() << "NULL\n";
+ dbgs() << "ReuseShuffleIndices: ";
+ if (ReuseShuffleIndices.empty())
dbgs() << "Empty";
- else
- for (unsigned ReuseIdx : ReuseShuffleIndices)
- dbgs() << ReuseIdx << ", ";
- dbgs() << "\n";
- dbgs() << "ReorderIndices: ";
- for (unsigned ReorderIdx : ReorderIndices)
- dbgs() << ReorderIdx << ", ";
- dbgs() << "\n";
- dbgs() << "UserTreeIndices: ";
- for (const auto &EInfo : UserTreeIndices)
- dbgs() << EInfo << ", ";
- dbgs() << "\n";
- }
-#endif
- };
-
+ else
+ for (unsigned ReuseIdx : ReuseShuffleIndices)
+ dbgs() << ReuseIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "ReorderIndices: ";
+ for (unsigned ReorderIdx : ReorderIndices)
+ dbgs() << ReorderIdx << ", ";
+ dbgs() << "\n";
+ dbgs() << "UserTreeIndices: ";
+ for (const auto &EInfo : UserTreeIndices)
+ dbgs() << EInfo << ", ";
+ dbgs() << "\n";
+ }
+#endif
+ };
+
#ifndef NDEBUG
void dumpTreeCosts(TreeEntry *E, InstructionCost ReuseShuffleCost,
InstructionCost VecCost,
@@ -1779,12 +1779,12 @@ private:
}
#endif
- /// Create a new VectorizableTree entry.
- TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
- const InstructionsState &S,
- const EdgeInfo &UserTreeIdx,
- ArrayRef<unsigned> ReuseShuffleIndices = None,
- ArrayRef<unsigned> ReorderIndices = None) {
+ /// Create a new VectorizableTree entry.
+ TreeEntry *newTreeEntry(ArrayRef<Value *> VL, Optional<ScheduleData *> Bundle,
+ const InstructionsState &S,
+ const EdgeInfo &UserTreeIdx,
+ ArrayRef<unsigned> ReuseShuffleIndices = None,
+ ArrayRef<unsigned> ReorderIndices = None) {
TreeEntry::EntryState EntryState =
Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather;
return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx,
@@ -1801,1097 +1801,1097 @@ private:
assert(((!Bundle && EntryState == TreeEntry::NeedToGather) ||
(Bundle && EntryState != TreeEntry::NeedToGather)) &&
"Need to vectorize gather entry?");
- VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
- TreeEntry *Last = VectorizableTree.back().get();
- Last->Idx = VectorizableTree.size() - 1;
- Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
+ VectorizableTree.push_back(std::make_unique<TreeEntry>(VectorizableTree));
+ TreeEntry *Last = VectorizableTree.back().get();
+ Last->Idx = VectorizableTree.size() - 1;
+ Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
Last->State = EntryState;
- Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
- ReuseShuffleIndices.end());
+ Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(),
+ ReuseShuffleIndices.end());
Last->ReorderIndices.append(ReorderIndices.begin(), ReorderIndices.end());
- Last->setOperations(S);
+ Last->setOperations(S);
if (Last->State != TreeEntry::NeedToGather) {
for (Value *V : VL) {
assert(!getTreeEntry(V) && "Scalar already in tree!");
ScalarToTreeEntry[V] = Last;
- }
- // Update the scheduler bundle to point to this TreeEntry.
- unsigned Lane = 0;
- for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
- BundleMember = BundleMember->NextInBundle) {
- BundleMember->TE = Last;
- BundleMember->Lane = Lane;
- ++Lane;
- }
- assert((!Bundle.getValue() || Lane == VL.size()) &&
- "Bundle and VL out of sync");
- } else {
- MustGather.insert(VL.begin(), VL.end());
- }
-
- if (UserTreeIdx.UserTE)
- Last->UserTreeIndices.push_back(UserTreeIdx);
-
- return Last;
- }
-
- /// -- Vectorization State --
- /// Holds all of the tree entries.
- TreeEntry::VecTreeTy VectorizableTree;
-
-#ifndef NDEBUG
- /// Debug printer.
- LLVM_DUMP_METHOD void dumpVectorizableTree() const {
- for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
- VectorizableTree[Id]->dump();
- dbgs() << "\n";
- }
- }
-#endif
-
+ }
+ // Update the scheduler bundle to point to this TreeEntry.
+ unsigned Lane = 0;
+ for (ScheduleData *BundleMember = Bundle.getValue(); BundleMember;
+ BundleMember = BundleMember->NextInBundle) {
+ BundleMember->TE = Last;
+ BundleMember->Lane = Lane;
+ ++Lane;
+ }
+ assert((!Bundle.getValue() || Lane == VL.size()) &&
+ "Bundle and VL out of sync");
+ } else {
+ MustGather.insert(VL.begin(), VL.end());
+ }
+
+ if (UserTreeIdx.UserTE)
+ Last->UserTreeIndices.push_back(UserTreeIdx);
+
+ return Last;
+ }
+
+ /// -- Vectorization State --
+ /// Holds all of the tree entries.
+ TreeEntry::VecTreeTy VectorizableTree;
+
+#ifndef NDEBUG
+ /// Debug printer.
+ LLVM_DUMP_METHOD void dumpVectorizableTree() const {
+ for (unsigned Id = 0, IdE = VectorizableTree.size(); Id != IdE; ++Id) {
+ VectorizableTree[Id]->dump();
+ dbgs() << "\n";
+ }
+ }
+#endif
+
TreeEntry *getTreeEntry(Value *V) { return ScalarToTreeEntry.lookup(V); }
-
- const TreeEntry *getTreeEntry(Value *V) const {
+
+ const TreeEntry *getTreeEntry(Value *V) const {
return ScalarToTreeEntry.lookup(V);
- }
-
- /// Maps a specific scalar to its tree entry.
- SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
-
- /// Maps a value to the proposed vectorizable size.
- SmallDenseMap<Value *, unsigned> InstrElementSize;
-
- /// A list of scalars that we found that we need to keep as scalars.
- ValueSet MustGather;
-
- /// This POD struct describes one external user in the vectorized tree.
- struct ExternalUser {
- ExternalUser(Value *S, llvm::User *U, int L)
- : Scalar(S), User(U), Lane(L) {}
-
- // Which scalar in our function.
- Value *Scalar;
-
- // Which user that uses the scalar.
- llvm::User *User;
-
- // Which lane does the scalar belong to.
- int Lane;
- };
- using UserList = SmallVector<ExternalUser, 16>;
-
- /// Checks if two instructions may access the same memory.
- ///
- /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
- /// is invariant in the calling loop.
- bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
- Instruction *Inst2) {
- // First check if the result is already in the cache.
- AliasCacheKey key = std::make_pair(Inst1, Inst2);
- Optional<bool> &result = AliasCache[key];
- if (result.hasValue()) {
- return result.getValue();
- }
- MemoryLocation Loc2 = getLocation(Inst2, AA);
- bool aliased = true;
- if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
- // Do the alias check.
- aliased = AA->alias(Loc1, Loc2);
- }
- // Store the result in the cache.
- result = aliased;
- return aliased;
- }
-
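The caching in isAliased() is plain memoization keyed on the ordered instruction pair; a rough sketch with std::map standing in for DenseMap and a dummy predicate standing in for AA->alias():

  #include <map>
  #include <utility>

  struct ToyInst { int Id; };

  // Memoize a pairwise query so repeated (A, B) lookups skip the expensive
  // alias check, keyed on the ordered pointer pair like AliasCacheKey.
  class ToyAliasCache {
    std::map<std::pair<const ToyInst *, const ToyInst *>, bool> Cache;

    static bool expensiveAliasCheck(const ToyInst *A, const ToyInst *B) {
      return A->Id == B->Id; // stand-in for the real alias query
    }

  public:
    bool isAliased(const ToyInst *A, const ToyInst *B) {
      auto Key = std::make_pair(A, B);
      auto It = Cache.find(Key);
      if (It != Cache.end())
        return It->second;          // cache hit
      bool Aliased = expensiveAliasCheck(A, B);
      Cache[Key] = Aliased;         // store the result for the next query
      return Aliased;
    }
  };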
- using AliasCacheKey = std::pair<Instruction *, Instruction *>;
-
- /// Cache for alias results.
- /// TODO: consider moving this to the AliasAnalysis itself.
- DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
-
- /// Removes an instruction from its block and eventually deletes it.
- /// It's like Instruction::eraseFromParent() except that the actual deletion
- /// is delayed until BoUpSLP is destructed.
- /// This is required to ensure that there are no incorrect collisions in the
- /// AliasCache, which can happen if a new instruction is allocated at the
- /// same address as a previously deleted instruction.
- void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
- auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
- It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
- }
-
- /// Temporary store for deleted instructions. Instructions will be deleted
- /// eventually when the BoUpSLP is destructed.
- DenseMap<Instruction *, bool> DeletedInstructions;
-
- /// A list of values that need to extracted out of the tree.
- /// This list holds pairs of (Internal Scalar : External User). External User
- /// can be nullptr, it means that this Internal Scalar will be used later,
- /// after vectorization.
- UserList ExternalUses;
-
- /// Values used only by @llvm.assume calls.
- SmallPtrSet<const Value *, 32> EphValues;
-
- /// Holds all of the instructions that we gathered.
- SetVector<Instruction *> GatherSeq;
-
- /// A list of blocks that we are going to CSE.
- SetVector<BasicBlock *> CSEBlocks;
-
- /// Contains all scheduling relevant data for an instruction.
- /// A ScheduleData either represents a single instruction or a member of an
- /// instruction bundle (= a group of instructions which is combined into a
- /// vector instruction).
- struct ScheduleData {
- // The initial value for the dependency counters. It means that the
- // dependencies are not calculated yet.
- enum { InvalidDeps = -1 };
-
- ScheduleData() = default;
-
- void init(int BlockSchedulingRegionID, Value *OpVal) {
- FirstInBundle = this;
- NextInBundle = nullptr;
- NextLoadStore = nullptr;
- IsScheduled = false;
- SchedulingRegionID = BlockSchedulingRegionID;
- UnscheduledDepsInBundle = UnscheduledDeps;
- clearDependencies();
- OpValue = OpVal;
- TE = nullptr;
- Lane = -1;
- }
-
- /// Returns true if the dependency information has been calculated.
- bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
-
- /// Returns true for single instructions and for bundle representatives
- /// (= the head of a bundle).
- bool isSchedulingEntity() const { return FirstInBundle == this; }
-
- /// Returns true if it represents an instruction bundle and not only a
- /// single instruction.
- bool isPartOfBundle() const {
- return NextInBundle != nullptr || FirstInBundle != this;
- }
-
- /// Returns true if it is ready for scheduling, i.e. it has no more
- /// unscheduled depending instructions/bundles.
- bool isReady() const {
- assert(isSchedulingEntity() &&
- "can't consider non-scheduling entity for ready list");
- return UnscheduledDepsInBundle == 0 && !IsScheduled;
- }
-
- /// Modifies the number of unscheduled dependencies, also updating it for
- /// the whole bundle.
- int incrementUnscheduledDeps(int Incr) {
- UnscheduledDeps += Incr;
- return FirstInBundle->UnscheduledDepsInBundle += Incr;
- }
-
- /// Sets the number of unscheduled dependencies to the number of
- /// dependencies.
- void resetUnscheduledDeps() {
- incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
- }
-
- /// Clears all dependency information.
- void clearDependencies() {
- Dependencies = InvalidDeps;
- resetUnscheduledDeps();
- MemoryDependencies.clear();
- }
-
- void dump(raw_ostream &os) const {
- if (!isSchedulingEntity()) {
- os << "/ " << *Inst;
- } else if (NextInBundle) {
- os << '[' << *Inst;
- ScheduleData *SD = NextInBundle;
- while (SD) {
- os << ';' << *SD->Inst;
- SD = SD->NextInBundle;
- }
- os << ']';
- } else {
- os << *Inst;
- }
- }
-
- Instruction *Inst = nullptr;
-
- /// Points to the head in an instruction bundle (and always to this for
- /// single instructions).
- ScheduleData *FirstInBundle = nullptr;
-
- /// Single linked list of all instructions in a bundle. Null if it is a
- /// single instruction.
- ScheduleData *NextInBundle = nullptr;
-
- /// Single linked list of all memory instructions (e.g. load, store, call)
- /// in the block - until the end of the scheduling region.
- ScheduleData *NextLoadStore = nullptr;
-
- /// The dependent memory instructions.
- /// This list is derived on demand in calculateDependencies().
- SmallVector<ScheduleData *, 4> MemoryDependencies;
-
- /// This ScheduleData is in the current scheduling region if this matches
- /// the current SchedulingRegionID of BlockScheduling.
- int SchedulingRegionID = 0;
-
- /// Used for getting a "good" final ordering of instructions.
- int SchedulingPriority = 0;
-
-    /// The number of dependencies. Consists of the number of users of the
- /// instruction plus the number of dependent memory instructions (if any).
- /// This value is calculated on demand.
- /// If InvalidDeps, the number of dependencies is not calculated yet.
- int Dependencies = InvalidDeps;
-
- /// The number of dependencies minus the number of dependencies of scheduled
- /// instructions. As soon as this is zero, the instruction/bundle gets ready
- /// for scheduling.
- /// Note that this is negative as long as Dependencies is not calculated.
- int UnscheduledDeps = InvalidDeps;
-
- /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
- /// single instructions.
- int UnscheduledDepsInBundle = InvalidDeps;
-
- /// True if this instruction is scheduled (or considered as scheduled in the
- /// dry-run).
- bool IsScheduled = false;
-
- /// Opcode of the current instruction in the schedule data.
- Value *OpValue = nullptr;
-
- /// The TreeEntry that this instruction corresponds to.
- TreeEntry *TE = nullptr;
-
- /// The lane of this node in the TreeEntry.
- int Lane = -1;
- };
-
-#ifndef NDEBUG
- friend inline raw_ostream &operator<<(raw_ostream &os,
- const BoUpSLP::ScheduleData &SD) {
- SD.dump(os);
- return os;
- }
-#endif
-
- friend struct GraphTraits<BoUpSLP *>;
- friend struct DOTGraphTraits<BoUpSLP *>;
-
- /// Contains all scheduling data for a basic block.
- struct BlockScheduling {
- BlockScheduling(BasicBlock *BB)
- : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
-
- void clear() {
- ReadyInsts.clear();
- ScheduleStart = nullptr;
- ScheduleEnd = nullptr;
- FirstLoadStoreInRegion = nullptr;
- LastLoadStoreInRegion = nullptr;
-
- // Reduce the maximum schedule region size by the size of the
- // previous scheduling run.
- ScheduleRegionSizeLimit -= ScheduleRegionSize;
- if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
- ScheduleRegionSizeLimit = MinScheduleRegionSize;
- ScheduleRegionSize = 0;
-
- // Make a new scheduling region, i.e. all existing ScheduleData is not
- // in the new region yet.
- ++SchedulingRegionID;
- }
-
- ScheduleData *getScheduleData(Value *V) {
- ScheduleData *SD = ScheduleDataMap[V];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
- return SD;
- return nullptr;
- }
-
- ScheduleData *getScheduleData(Value *V, Value *Key) {
- if (V == Key)
- return getScheduleData(V);
- auto I = ExtraScheduleDataMap.find(V);
- if (I != ExtraScheduleDataMap.end()) {
- ScheduleData *SD = I->second[Key];
- if (SD && SD->SchedulingRegionID == SchedulingRegionID)
- return SD;
- }
- return nullptr;
- }
-
- bool isInSchedulingRegion(ScheduleData *SD) const {
- return SD->SchedulingRegionID == SchedulingRegionID;
- }
-
- /// Marks an instruction as scheduled and puts all dependent ready
- /// instructions into the ready-list.
- template <typename ReadyListType>
- void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
- SD->IsScheduled = true;
- LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- if (BundleMember->Inst != BundleMember->OpValue) {
- BundleMember = BundleMember->NextInBundle;
- continue;
- }
- // Handle the def-use chain dependencies.
-
- // Decrement the unscheduled counter and insert to ready list if ready.
- auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
- doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
- if (OpDef && OpDef->hasValidDependencies() &&
- OpDef->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after
- // decrementing, so we can put the dependent instruction
- // into the ready list.
- ScheduleData *DepBundle = OpDef->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (def): " << *DepBundle << "\n");
- }
- });
- };
-
- // If BundleMember is a vector bundle, its operands may have been
-          // reordered during buildTree(). We therefore need to get its operands
- // through the TreeEntry.
- if (TreeEntry *TE = BundleMember->TE) {
- int Lane = BundleMember->Lane;
- assert(Lane >= 0 && "Lane not set");
-
- // Since vectorization tree is being built recursively this assertion
- // ensures that the tree entry has all operands set before reaching
- // this code. Couple of exceptions known at the moment are extracts
-          // this code. A couple of known exceptions at the moment are extracts,
-          // whose second (immediate) operand is not added. Since
-          // immediates do not affect scheduler behavior, this is considered
- auto *In = TE->getMainOp();
- assert(In &&
- (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
- In->getNumOperands() == TE->getNumOperands()) &&
- "Missed TreeEntry operands?");
- (void)In; // fake use to avoid build failure when assertions disabled
-
- for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
- OpIdx != NumOperands; ++OpIdx)
- if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
- DecrUnsched(I);
- } else {
- // If BundleMember is a stand-alone instruction, no operand reordering
- // has taken place, so we directly access its operands.
- for (Use &U : BundleMember->Inst->operands())
- if (auto *I = dyn_cast<Instruction>(U.get()))
- DecrUnsched(I);
- }
- // Handle the memory dependencies.
- for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
- if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
- // There are no more unscheduled dependencies after decrementing,
- // so we can put the dependent instruction into the ready list.
- ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
- assert(!DepBundle->IsScheduled &&
- "already scheduled bundle gets ready");
- ReadyList.insert(DepBundle);
- LLVM_DEBUG(dbgs()
- << "SLP: gets ready (mem): " << *DepBundle << "\n");
- }
- }
- BundleMember = BundleMember->NextInBundle;
- }
- }
-
- void doForAllOpcodes(Value *V,
- function_ref<void(ScheduleData *SD)> Action) {
- if (ScheduleData *SD = getScheduleData(V))
- Action(SD);
- auto I = ExtraScheduleDataMap.find(V);
- if (I != ExtraScheduleDataMap.end())
- for (auto &P : I->second)
- if (P.second->SchedulingRegionID == SchedulingRegionID)
- Action(P.second);
- }
-
-    /// Put all instructions that are ready for scheduling into the ReadyList.
- template <typename ReadyListType>
- void initialFillReadyList(ReadyListType &ReadyList) {
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [&](ScheduleData *SD) {
- if (SD->isSchedulingEntity() && SD->isReady()) {
- ReadyList.insert(SD);
- LLVM_DEBUG(dbgs()
- << "SLP: initially in ready list: " << *I << "\n");
- }
- });
- }
- }
-
- /// Checks if a bundle of instructions can be scheduled, i.e. has no
-    /// cyclic dependencies. This is only a dry-run; no instructions are
- /// actually moved at this stage.
- /// \returns the scheduling bundle. The returned Optional value is non-None
- /// if \p VL is allowed to be scheduled.
- Optional<ScheduleData *>
- tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S);
-
- /// Un-bundles a group of instructions.
- void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
-
- /// Allocates schedule data chunk.
- ScheduleData *allocateScheduleDataChunks();
-
- /// Extends the scheduling region so that V is inside the region.
- /// \returns true if the region size is within the limit.
- bool extendSchedulingRegion(Value *V, const InstructionsState &S);
-
- /// Initialize the ScheduleData structures for new instructions in the
- /// scheduling region.
- void initScheduleData(Instruction *FromI, Instruction *ToI,
- ScheduleData *PrevLoadStore,
- ScheduleData *NextLoadStore);
-
- /// Updates the dependency information of a bundle and of all instructions/
- /// bundles which depend on the original bundle.
- void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
- BoUpSLP *SLP);
-
-    /// Sets all instructions in the scheduling region to un-scheduled.
- void resetSchedule();
-
- BasicBlock *BB;
-
- /// Simple memory allocation for ScheduleData.
- std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
-
- /// The size of a ScheduleData array in ScheduleDataChunks.
- int ChunkSize;
-
- /// The allocator position in the current chunk, which is the last entry
- /// of ScheduleDataChunks.
- int ChunkPos;
-
- /// Attaches ScheduleData to Instruction.
- /// Note that the mapping survives during all vectorization iterations, i.e.
- /// ScheduleData structures are recycled.
- DenseMap<Value *, ScheduleData *> ScheduleDataMap;
-
- /// Attaches ScheduleData to Instruction with the leading key.
- DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
- ExtraScheduleDataMap;
-
- struct ReadyList : SmallVector<ScheduleData *, 8> {
- void insert(ScheduleData *SD) { push_back(SD); }
- };
-
- /// The ready-list for scheduling (only used for the dry-run).
- ReadyList ReadyInsts;
-
- /// The first instruction of the scheduling region.
- Instruction *ScheduleStart = nullptr;
-
- /// The first instruction _after_ the scheduling region.
- Instruction *ScheduleEnd = nullptr;
-
- /// The first memory accessing instruction in the scheduling region
- /// (can be null).
- ScheduleData *FirstLoadStoreInRegion = nullptr;
-
- /// The last memory accessing instruction in the scheduling region
- /// (can be null).
- ScheduleData *LastLoadStoreInRegion = nullptr;
-
- /// The current size of the scheduling region.
- int ScheduleRegionSize = 0;
-
- /// The maximum size allowed for the scheduling region.
- int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
-
- /// The ID of the scheduling region. For a new vectorization iteration this
-    /// is incremented, which "removes" all ScheduleData from the region.
- // Make sure that the initial SchedulingRegionID is greater than the
- // initial SchedulingRegionID in ScheduleData (which is 0).
- int SchedulingRegionID = 1;
- };
-
- /// Attaches the BlockScheduling structures to basic blocks.
- MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
-
- /// Performs the "real" scheduling. Done before vectorization is actually
- /// performed in a basic block.
- void scheduleBlock(BlockScheduling *BS);
-
- /// List of users to ignore during scheduling and that don't need extracting.
- ArrayRef<Value *> UserIgnoreList;
-
- /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
- /// sorted SmallVectors of unsigned.
- struct OrdersTypeDenseMapInfo {
- static OrdersType getEmptyKey() {
- OrdersType V;
- V.push_back(~1U);
- return V;
- }
-
- static OrdersType getTombstoneKey() {
- OrdersType V;
- V.push_back(~2U);
- return V;
- }
-
- static unsigned getHashValue(const OrdersType &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
- return LHS == RHS;
- }
- };
-
- /// Contains orders of operations along with the number of bundles that have
- /// operations in this order. It stores only those orders that require
-  /// reordering; if reordering is not required, it is counted using \a
- /// NumOpsWantToKeepOriginalOrder.
- DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
- /// Number of bundles that do not require reordering.
- unsigned NumOpsWantToKeepOriginalOrder = 0;
-
- // Analysis and block reference.
- Function *F;
- ScalarEvolution *SE;
- TargetTransformInfo *TTI;
- TargetLibraryInfo *TLI;
+ }
+
+ /// Maps a specific scalar to its tree entry.
+ SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+
+ /// Maps a value to the proposed vectorizable size.
+ SmallDenseMap<Value *, unsigned> InstrElementSize;
+
+ /// A list of scalars that we found that we need to keep as scalars.
+ ValueSet MustGather;
+
+ /// This POD struct describes one external user in the vectorized tree.
+ struct ExternalUser {
+ ExternalUser(Value *S, llvm::User *U, int L)
+ : Scalar(S), User(U), Lane(L) {}
+
+ // Which scalar in our function.
+ Value *Scalar;
+
+ // Which user that uses the scalar.
+ llvm::User *User;
+
+ // Which lane does the scalar belong to.
+ int Lane;
+ };
+ using UserList = SmallVector<ExternalUser, 16>;
+
+ /// Checks if two instructions may access the same memory.
+ ///
+ /// \p Loc1 is the location of \p Inst1. It is passed explicitly because it
+ /// is invariant in the calling loop.
+ bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1,
+ Instruction *Inst2) {
+ // First check if the result is already in the cache.
+ AliasCacheKey key = std::make_pair(Inst1, Inst2);
+ Optional<bool> &result = AliasCache[key];
+ if (result.hasValue()) {
+ return result.getValue();
+ }
+ MemoryLocation Loc2 = getLocation(Inst2, AA);
+ bool aliased = true;
+ if (Loc1.Ptr && Loc2.Ptr && isSimple(Inst1) && isSimple(Inst2)) {
+ // Do the alias check.
+ aliased = AA->alias(Loc1, Loc2);
+ }
+ // Store the result in the cache.
+ result = aliased;
+ return aliased;
+ }
+
+ using AliasCacheKey = std::pair<Instruction *, Instruction *>;
+
+ /// Cache for alias results.
+ /// TODO: consider moving this to the AliasAnalysis itself.
+ DenseMap<AliasCacheKey, Optional<bool>> AliasCache;
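// Illustrative sketch (not from the source): the caching pattern above, an
// Optional<bool> slot keyed by the instruction pair and filled lazily, shown as a
// self-contained C++17 analogue. All identifiers here are hypothetical, std::map
// and std::optional stand in for LLVM's DenseMap and Optional, and the expensive
// alias query is stubbed out.
#include <map>
#include <optional>
#include <utility>

struct PairQueryCache {
  using Key = std::pair<const void *, const void *>;
  std::map<Key, std::optional<bool>> Cache;

  // Stand-in for the expensive query (AA->alias(Loc1, Loc2) in the real code).
  bool expensiveQuery(const void *A, const void *B) const { return A == B; }

  bool query(const void *A, const void *B) {
    Key K{A, B};
    // operator[] default-constructs an empty optional on first use, mirroring how
    // AliasCache[key] yields an unset Optional<bool> above.
    std::optional<bool> &Slot = Cache[K];
    if (Slot.has_value())
      return *Slot;              // cache hit: no recomputation
    bool Result = expensiveQuery(A, B);
    Slot = Result;               // memoize the answer for later calls
    return Result;
  }
};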
+
+ /// Removes an instruction from its block and eventually deletes it.
+ /// It's like Instruction::eraseFromParent() except that the actual deletion
+ /// is delayed until BoUpSLP is destructed.
+ /// This is required to ensure that there are no incorrect collisions in the
+ /// AliasCache, which can happen if a new instruction is allocated at the
+ /// same address as a previously deleted instruction.
+ void eraseInstruction(Instruction *I, bool ReplaceOpsWithUndef = false) {
+ auto It = DeletedInstructions.try_emplace(I, ReplaceOpsWithUndef).first;
+ It->getSecond() = It->getSecond() && ReplaceOpsWithUndef;
+ }
+
+ /// Temporary store for deleted instructions. Instructions will be deleted
+ /// eventually when the BoUpSLP is destructed.
+ DenseMap<Instruction *, bool> DeletedInstructions;
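// Illustrative sketch (hypothetical names, std containers, nodes assumed to be
// heap-allocated): the deferred-deletion idea behind eraseInstruction() and
// DeletedInstructions. Nodes are only recorded for deletion, so their addresses
// stay unique until the owner is destroyed, which is what keeps pointer-keyed
// caches such as AliasCache free of accidental key reuse.
#include <unordered_map>

struct Node { int Id = 0; };

class DeferredDeleter {
  // Node -> "replace its operands with undef before deleting".
  std::unordered_map<Node *, bool> Pending;

public:
  void markForDeletion(Node *N, bool ReplaceOps = false) {
    auto It = Pending.emplace(N, ReplaceOps).first;
    // If the node was already marked, keep the flag true only when every request
    // asked for it (mirrors the logical-and update above).
    It->second = It->second && ReplaceOps;
  }

  ~DeferredDeleter() {
    for (auto &P : Pending)
      delete P.first; // the actual deletion happens only here, at the very end
  }
};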
+
+ /// A list of values that need to extracted out of the tree.
+ /// This list holds pairs of (Internal Scalar : External User). External User
+ /// can be nullptr, it means that this Internal Scalar will be used later,
+ /// after vectorization.
+ UserList ExternalUses;
+
+ /// Values used only by @llvm.assume calls.
+ SmallPtrSet<const Value *, 32> EphValues;
+
+ /// Holds all of the instructions that we gathered.
+ SetVector<Instruction *> GatherSeq;
+
+ /// A list of blocks that we are going to CSE.
+ SetVector<BasicBlock *> CSEBlocks;
+
+ /// Contains all scheduling relevant data for an instruction.
+ /// A ScheduleData either represents a single instruction or a member of an
+ /// instruction bundle (= a group of instructions which is combined into a
+ /// vector instruction).
+ struct ScheduleData {
+ // The initial value for the dependency counters. It means that the
+ // dependencies are not calculated yet.
+ enum { InvalidDeps = -1 };
+
+ ScheduleData() = default;
+
+ void init(int BlockSchedulingRegionID, Value *OpVal) {
+ FirstInBundle = this;
+ NextInBundle = nullptr;
+ NextLoadStore = nullptr;
+ IsScheduled = false;
+ SchedulingRegionID = BlockSchedulingRegionID;
+ UnscheduledDepsInBundle = UnscheduledDeps;
+ clearDependencies();
+ OpValue = OpVal;
+ TE = nullptr;
+ Lane = -1;
+ }
+
+ /// Returns true if the dependency information has been calculated.
+ bool hasValidDependencies() const { return Dependencies != InvalidDeps; }
+
+ /// Returns true for single instructions and for bundle representatives
+ /// (= the head of a bundle).
+ bool isSchedulingEntity() const { return FirstInBundle == this; }
+
+ /// Returns true if it represents an instruction bundle and not only a
+ /// single instruction.
+ bool isPartOfBundle() const {
+ return NextInBundle != nullptr || FirstInBundle != this;
+ }
+
+ /// Returns true if it is ready for scheduling, i.e. it has no more
+ /// unscheduled depending instructions/bundles.
+ bool isReady() const {
+ assert(isSchedulingEntity() &&
+ "can't consider non-scheduling entity for ready list");
+ return UnscheduledDepsInBundle == 0 && !IsScheduled;
+ }
+
+ /// Modifies the number of unscheduled dependencies, also updating it for
+ /// the whole bundle.
+ int incrementUnscheduledDeps(int Incr) {
+ UnscheduledDeps += Incr;
+ return FirstInBundle->UnscheduledDepsInBundle += Incr;
+ }
+
+ /// Sets the number of unscheduled dependencies to the number of
+ /// dependencies.
+ void resetUnscheduledDeps() {
+ incrementUnscheduledDeps(Dependencies - UnscheduledDeps);
+ }
+
+ /// Clears all dependency information.
+ void clearDependencies() {
+ Dependencies = InvalidDeps;
+ resetUnscheduledDeps();
+ MemoryDependencies.clear();
+ }
+
+ void dump(raw_ostream &os) const {
+ if (!isSchedulingEntity()) {
+ os << "/ " << *Inst;
+ } else if (NextInBundle) {
+ os << '[' << *Inst;
+ ScheduleData *SD = NextInBundle;
+ while (SD) {
+ os << ';' << *SD->Inst;
+ SD = SD->NextInBundle;
+ }
+ os << ']';
+ } else {
+ os << *Inst;
+ }
+ }
+
+ Instruction *Inst = nullptr;
+
+ /// Points to the head in an instruction bundle (and always to this for
+ /// single instructions).
+ ScheduleData *FirstInBundle = nullptr;
+
+    /// Singly linked list of all instructions in a bundle. Null if it is a
+ /// single instruction.
+ ScheduleData *NextInBundle = nullptr;
+
+    /// Singly linked list of all memory instructions (e.g. load, store, call)
+ /// in the block - until the end of the scheduling region.
+ ScheduleData *NextLoadStore = nullptr;
+
+ /// The dependent memory instructions.
+ /// This list is derived on demand in calculateDependencies().
+ SmallVector<ScheduleData *, 4> MemoryDependencies;
+
+ /// This ScheduleData is in the current scheduling region if this matches
+ /// the current SchedulingRegionID of BlockScheduling.
+ int SchedulingRegionID = 0;
+
+ /// Used for getting a "good" final ordering of instructions.
+ int SchedulingPriority = 0;
+
+    /// The number of dependencies. Consists of the number of users of the
+ /// instruction plus the number of dependent memory instructions (if any).
+ /// This value is calculated on demand.
+ /// If InvalidDeps, the number of dependencies is not calculated yet.
+ int Dependencies = InvalidDeps;
+
+ /// The number of dependencies minus the number of dependencies of scheduled
+ /// instructions. As soon as this is zero, the instruction/bundle gets ready
+ /// for scheduling.
+ /// Note that this is negative as long as Dependencies is not calculated.
+ int UnscheduledDeps = InvalidDeps;
+
+ /// The sum of UnscheduledDeps in a bundle. Equals to UnscheduledDeps for
+ /// single instructions.
+ int UnscheduledDepsInBundle = InvalidDeps;
+
+ /// True if this instruction is scheduled (or considered as scheduled in the
+ /// dry-run).
+ bool IsScheduled = false;
+
+ /// Opcode of the current instruction in the schedule data.
+ Value *OpValue = nullptr;
+
+ /// The TreeEntry that this instruction corresponds to.
+ TreeEntry *TE = nullptr;
+
+ /// The lane of this node in the TreeEntry.
+ int Lane = -1;
+ };
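// Illustrative toy model (standalone, hypothetical names): the counter protocol of
// ScheduleData. UnscheduledDeps tracks dependencies that are not yet scheduled, the
// bundle head aggregates the sum, and a node becomes "ready" exactly when the
// aggregated counter drops to zero.
#include <cassert>

struct ToySD {
  static constexpr int InvalidDeps = -1;
  ToySD *FirstInBundle = this;          // single instruction: its own bundle head
  int Dependencies = InvalidDeps;
  int UnscheduledDeps = InvalidDeps;
  int UnscheduledDepsInBundle = InvalidDeps;
  bool IsScheduled = false;

  int incrementUnscheduledDeps(int Incr) {
    UnscheduledDeps += Incr;
    return FirstInBundle->UnscheduledDepsInBundle += Incr;
  }
  bool isReady() const { return UnscheduledDepsInBundle == 0 && !IsScheduled; }
};

int main() {
  ToySD SD;
  SD.Dependencies = 2;                  // e.g. two producers feed this instruction
  SD.UnscheduledDeps = SD.Dependencies;
  SD.UnscheduledDepsInBundle = SD.UnscheduledDeps;
  SD.incrementUnscheduledDeps(-1);      // first producer got scheduled
  assert(!SD.isReady());
  SD.incrementUnscheduledDeps(-1);      // second producer got scheduled
  assert(SD.isReady());                 // the node can now enter the ready list
  return 0;
}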
+
+#ifndef NDEBUG
+ friend inline raw_ostream &operator<<(raw_ostream &os,
+ const BoUpSLP::ScheduleData &SD) {
+ SD.dump(os);
+ return os;
+ }
+#endif
+
+ friend struct GraphTraits<BoUpSLP *>;
+ friend struct DOTGraphTraits<BoUpSLP *>;
+
+ /// Contains all scheduling data for a basic block.
+ struct BlockScheduling {
+ BlockScheduling(BasicBlock *BB)
+ : BB(BB), ChunkSize(BB->size()), ChunkPos(ChunkSize) {}
+
+ void clear() {
+ ReadyInsts.clear();
+ ScheduleStart = nullptr;
+ ScheduleEnd = nullptr;
+ FirstLoadStoreInRegion = nullptr;
+ LastLoadStoreInRegion = nullptr;
+
+ // Reduce the maximum schedule region size by the size of the
+ // previous scheduling run.
+ ScheduleRegionSizeLimit -= ScheduleRegionSize;
+ if (ScheduleRegionSizeLimit < MinScheduleRegionSize)
+ ScheduleRegionSizeLimit = MinScheduleRegionSize;
+ ScheduleRegionSize = 0;
+
+ // Make a new scheduling region, i.e. all existing ScheduleData is not
+ // in the new region yet.
+ ++SchedulingRegionID;
+ }
+
+ ScheduleData *getScheduleData(Value *V) {
+ ScheduleData *SD = ScheduleDataMap[V];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ return nullptr;
+ }
+
+ ScheduleData *getScheduleData(Value *V, Value *Key) {
+ if (V == Key)
+ return getScheduleData(V);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end()) {
+ ScheduleData *SD = I->second[Key];
+ if (SD && SD->SchedulingRegionID == SchedulingRegionID)
+ return SD;
+ }
+ return nullptr;
+ }
+
+ bool isInSchedulingRegion(ScheduleData *SD) const {
+ return SD->SchedulingRegionID == SchedulingRegionID;
+ }
+
+ /// Marks an instruction as scheduled and puts all dependent ready
+ /// instructions into the ready-list.
+ template <typename ReadyListType>
+ void schedule(ScheduleData *SD, ReadyListType &ReadyList) {
+ SD->IsScheduled = true;
+ LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n");
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ if (BundleMember->Inst != BundleMember->OpValue) {
+ BundleMember = BundleMember->NextInBundle;
+ continue;
+ }
+ // Handle the def-use chain dependencies.
+
+ // Decrement the unscheduled counter and insert to ready list if ready.
+ auto &&DecrUnsched = [this, &ReadyList](Instruction *I) {
+ doForAllOpcodes(I, [&ReadyList](ScheduleData *OpDef) {
+ if (OpDef && OpDef->hasValidDependencies() &&
+ OpDef->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after
+ // decrementing, so we can put the dependent instruction
+ // into the ready list.
+ ScheduleData *DepBundle = OpDef->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (def): " << *DepBundle << "\n");
+ }
+ });
+ };
+
+ // If BundleMember is a vector bundle, its operands may have been
+          // reordered during buildTree(). We therefore need to get its operands
+ // through the TreeEntry.
+ if (TreeEntry *TE = BundleMember->TE) {
+ int Lane = BundleMember->Lane;
+ assert(Lane >= 0 && "Lane not set");
+
+            // Since the vectorization tree is built recursively, this assertion
+            // ensures that the tree entry has all operands set before reaching
+            // this code. A couple of known exceptions are extracts, whose second
+            // (immediate) operand is not added. Since immediates do not affect
+            // scheduler behavior, this is considered okay.
+ auto *In = TE->getMainOp();
+ assert(In &&
+ (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
+ (void)In; // fake use to avoid build failure when assertions disabled
+
+ for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
+ OpIdx != NumOperands; ++OpIdx)
+ if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
+ DecrUnsched(I);
+ } else {
+ // If BundleMember is a stand-alone instruction, no operand reordering
+ // has taken place, so we directly access its operands.
+ for (Use &U : BundleMember->Inst->operands())
+ if (auto *I = dyn_cast<Instruction>(U.get()))
+ DecrUnsched(I);
+ }
+ // Handle the memory dependencies.
+ for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
+ if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
+ // There are no more unscheduled dependencies after decrementing,
+ // so we can put the dependent instruction into the ready list.
+ ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
+ assert(!DepBundle->IsScheduled &&
+ "already scheduled bundle gets ready");
+ ReadyList.insert(DepBundle);
+ LLVM_DEBUG(dbgs()
+ << "SLP: gets ready (mem): " << *DepBundle << "\n");
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ }
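// Illustrative sketch (hypothetical names): the essence of schedule() above. For
// every node that gets scheduled, each dependent node's unscheduled counter is
// decremented, and a dependent whose counter reaches zero moves to the ready list;
// the real code does this separately for def-use and for memory dependencies.
#include <vector>

struct ToyNode {
  int UnscheduledDeps = 0;
  std::vector<ToyNode *> Dependents;    // nodes that wait on this one
  bool IsScheduled = false;
};

void scheduleNode(ToyNode *N, std::vector<ToyNode *> &ReadyList) {
  N->IsScheduled = true;
  for (ToyNode *Dep : N->Dependents)
    if (--Dep->UnscheduledDeps == 0)    // last outstanding dependency satisfied
      ReadyList.push_back(Dep);         // the dependent becomes schedulable
}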
+
+ void doForAllOpcodes(Value *V,
+ function_ref<void(ScheduleData *SD)> Action) {
+ if (ScheduleData *SD = getScheduleData(V))
+ Action(SD);
+ auto I = ExtraScheduleDataMap.find(V);
+ if (I != ExtraScheduleDataMap.end())
+ for (auto &P : I->second)
+ if (P.second->SchedulingRegionID == SchedulingRegionID)
+ Action(P.second);
+ }
+
+    /// Put all instructions that are ready for scheduling into the ReadyList.
+ template <typename ReadyListType>
+ void initialFillReadyList(ReadyListType &ReadyList) {
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ if (SD->isSchedulingEntity() && SD->isReady()) {
+ ReadyList.insert(SD);
+ LLVM_DEBUG(dbgs()
+ << "SLP: initially in ready list: " << *I << "\n");
+ }
+ });
+ }
+ }
+
+ /// Checks if a bundle of instructions can be scheduled, i.e. has no
+    /// cyclic dependencies. This is only a dry-run; no instructions are
+ /// actually moved at this stage.
+ /// \returns the scheduling bundle. The returned Optional value is non-None
+ /// if \p VL is allowed to be scheduled.
+ Optional<ScheduleData *>
+ tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S);
+
+ /// Un-bundles a group of instructions.
+ void cancelScheduling(ArrayRef<Value *> VL, Value *OpValue);
+
+ /// Allocates schedule data chunk.
+ ScheduleData *allocateScheduleDataChunks();
+
+ /// Extends the scheduling region so that V is inside the region.
+ /// \returns true if the region size is within the limit.
+ bool extendSchedulingRegion(Value *V, const InstructionsState &S);
+
+ /// Initialize the ScheduleData structures for new instructions in the
+ /// scheduling region.
+ void initScheduleData(Instruction *FromI, Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore);
+
+ /// Updates the dependency information of a bundle and of all instructions/
+ /// bundles which depend on the original bundle.
+ void calculateDependencies(ScheduleData *SD, bool InsertInReadyList,
+ BoUpSLP *SLP);
+
+ /// Sets all instruction in the scheduling region to un-scheduled.
+    /// Sets all instructions in the scheduling region to un-scheduled.
+
+ BasicBlock *BB;
+
+ /// Simple memory allocation for ScheduleData.
+ std::vector<std::unique_ptr<ScheduleData[]>> ScheduleDataChunks;
+
+ /// The size of a ScheduleData array in ScheduleDataChunks.
+ int ChunkSize;
+
+ /// The allocator position in the current chunk, which is the last entry
+ /// of ScheduleDataChunks.
+ int ChunkPos;
+
+ /// Attaches ScheduleData to Instruction.
+ /// Note that the mapping survives during all vectorization iterations, i.e.
+ /// ScheduleData structures are recycled.
+ DenseMap<Value *, ScheduleData *> ScheduleDataMap;
+
+ /// Attaches ScheduleData to Instruction with the leading key.
+ DenseMap<Value *, SmallDenseMap<Value *, ScheduleData *>>
+ ExtraScheduleDataMap;
+
+ struct ReadyList : SmallVector<ScheduleData *, 8> {
+ void insert(ScheduleData *SD) { push_back(SD); }
+ };
+
+ /// The ready-list for scheduling (only used for the dry-run).
+ ReadyList ReadyInsts;
+
+ /// The first instruction of the scheduling region.
+ Instruction *ScheduleStart = nullptr;
+
+ /// The first instruction _after_ the scheduling region.
+ Instruction *ScheduleEnd = nullptr;
+
+ /// The first memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *FirstLoadStoreInRegion = nullptr;
+
+ /// The last memory accessing instruction in the scheduling region
+ /// (can be null).
+ ScheduleData *LastLoadStoreInRegion = nullptr;
+
+ /// The current size of the scheduling region.
+ int ScheduleRegionSize = 0;
+
+ /// The maximum size allowed for the scheduling region.
+ int ScheduleRegionSizeLimit = ScheduleRegionSizeBudget;
+
+ /// The ID of the scheduling region. For a new vectorization iteration this
+    /// is incremented, which "removes" all ScheduleData from the region.
+ // Make sure that the initial SchedulingRegionID is greater than the
+ // initial SchedulingRegionID in ScheduleData (which is 0).
+ int SchedulingRegionID = 1;
+ };
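// Illustrative sketch (standalone, hypothetical names): the SchedulingRegionID trick
// used by clear() above. Instead of wiping per-instruction data, the region ID is
// bumped, and stale entries are filtered out on lookup by comparing IDs.
#include <unordered_map>

struct ToyData { int RegionID = 0; };

struct ToyRegion {
  // Starts at 1 so that a default-constructed ToyData (RegionID == 0) is never part
  // of the current region, mirroring the comment on SchedulingRegionID above.
  int CurrentRegionID = 1;
  std::unordered_map<int, ToyData> Map; // the key stands in for an instruction

  ToyData &getOrCreate(int Key) {
    ToyData &D = Map[Key];
    D.RegionID = CurrentRegionID;       // claim the entry for the current region
    return D;
  }
  ToyData *lookup(int Key) {
    ToyData &D = Map[Key];
    return D.RegionID == CurrentRegionID ? &D : nullptr; // stale entries ignored
  }
  void startNewRegion() { ++CurrentRegionID; } // "clears" every entry in O(1)
};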
+
+ /// Attaches the BlockScheduling structures to basic blocks.
+ MapVector<BasicBlock *, std::unique_ptr<BlockScheduling>> BlocksSchedules;
+
+ /// Performs the "real" scheduling. Done before vectorization is actually
+ /// performed in a basic block.
+ void scheduleBlock(BlockScheduling *BS);
+
+ /// List of users to ignore during scheduling and that don't need extracting.
+ ArrayRef<Value *> UserIgnoreList;
+
+ /// A DenseMapInfo implementation for holding DenseMaps and DenseSets of
+ /// sorted SmallVectors of unsigned.
+ struct OrdersTypeDenseMapInfo {
+ static OrdersType getEmptyKey() {
+ OrdersType V;
+ V.push_back(~1U);
+ return V;
+ }
+
+ static OrdersType getTombstoneKey() {
+ OrdersType V;
+ V.push_back(~2U);
+ return V;
+ }
+
+ static unsigned getHashValue(const OrdersType &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const OrdersType &LHS, const OrdersType &RHS) {
+ return LHS == RHS;
+ }
+ };
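// Illustrative note (not from the source): LLVM's DenseMap keeps keys inline, so a
// custom key type must supply two reserved values, an "empty" marker for unused
// buckets and a "tombstone" marker for erased ones, plus a hash and an equality
// predicate; the four static members above provide exactly that. The one-element
// vectors {~1U} and {~2U} are safe sentinels because a real order only contains
// small lane indices, so neither value can collide with a genuine key.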
+
+ /// Contains orders of operations along with the number of bundles that have
+ /// operations in this order. It stores only those orders that require
+  /// reordering; if reordering is not required, it is counted using \a
+ /// NumOpsWantToKeepOriginalOrder.
+ DenseMap<OrdersType, unsigned, OrdersTypeDenseMapInfo> NumOpsWantToKeepOrder;
+ /// Number of bundles that do not require reordering.
+ unsigned NumOpsWantToKeepOriginalOrder = 0;
+
+ // Analysis and block reference.
+ Function *F;
+ ScalarEvolution *SE;
+ TargetTransformInfo *TTI;
+ TargetLibraryInfo *TLI;
AAResults *AA;
- LoopInfo *LI;
- DominatorTree *DT;
- AssumptionCache *AC;
- DemandedBits *DB;
- const DataLayout *DL;
- OptimizationRemarkEmitter *ORE;
-
- unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
- unsigned MinVecRegSize; // Set by cl::opt (default: 128).
-
- /// Instruction builder to construct the vectorized tree.
- IRBuilder<> Builder;
-
- /// A map of scalar integer values to the smallest bit width with which they
- /// can legally be represented. The values map to (width, signed) pairs,
- /// where "width" indicates the minimum bit width and "signed" is True if the
- /// value must be signed-extended, rather than zero-extended, back to its
- /// original width.
- MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
-};
-
-} // end namespace slpvectorizer
-
-template <> struct GraphTraits<BoUpSLP *> {
- using TreeEntry = BoUpSLP::TreeEntry;
-
- /// NodeRef has to be a pointer per the GraphWriter.
- using NodeRef = TreeEntry *;
-
- using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
-
- /// Add the VectorizableTree to the index iterator to be able to return
- /// TreeEntry pointers.
- struct ChildIteratorType
- : public iterator_adaptor_base<
- ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
- ContainerTy &VectorizableTree;
-
- ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
- ContainerTy &VT)
- : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
-
- NodeRef operator*() { return I->UserTE; }
- };
-
- static NodeRef getEntryNode(BoUpSLP &R) {
- return R.VectorizableTree[0].get();
- }
-
- static ChildIteratorType child_begin(NodeRef N) {
- return {N->UserTreeIndices.begin(), N->Container};
- }
-
- static ChildIteratorType child_end(NodeRef N) {
- return {N->UserTreeIndices.end(), N->Container};
- }
-
- /// For the node iterator we just need to turn the TreeEntry iterator into a
- /// TreeEntry* iterator so that it dereferences to NodeRef.
- class nodes_iterator {
- using ItTy = ContainerTy::iterator;
- ItTy It;
-
- public:
- nodes_iterator(const ItTy &It2) : It(It2) {}
- NodeRef operator*() { return It->get(); }
- nodes_iterator operator++() {
- ++It;
- return *this;
- }
- bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
- };
-
- static nodes_iterator nodes_begin(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.begin());
- }
-
- static nodes_iterator nodes_end(BoUpSLP *R) {
- return nodes_iterator(R->VectorizableTree.end());
- }
-
- static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
-};
-
-template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
- using TreeEntry = BoUpSLP::TreeEntry;
-
- DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
-
- std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
- std::string Str;
- raw_string_ostream OS(Str);
- if (isSplat(Entry->Scalars)) {
- OS << "<splat> " << *Entry->Scalars[0];
- return Str;
- }
- for (auto V : Entry->Scalars) {
- OS << *V;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ AssumptionCache *AC;
+ DemandedBits *DB;
+ const DataLayout *DL;
+ OptimizationRemarkEmitter *ORE;
+
+ unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt.
+ unsigned MinVecRegSize; // Set by cl::opt (default: 128).
+
+ /// Instruction builder to construct the vectorized tree.
+ IRBuilder<> Builder;
+
+ /// A map of scalar integer values to the smallest bit width with which they
+ /// can legally be represented. The values map to (width, signed) pairs,
+ /// where "width" indicates the minimum bit width and "signed" is True if the
+ /// value must be signed-extended, rather than zero-extended, back to its
+ /// original width.
+ MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
+};
+
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ /// NodeRef has to be a pointer per the GraphWriter.
+ using NodeRef = TreeEntry *;
+
+ using ContainerTy = BoUpSLP::TreeEntry::VecTreeTy;
+
+ /// Add the VectorizableTree to the index iterator to be able to return
+ /// TreeEntry pointers.
+ struct ChildIteratorType
+ : public iterator_adaptor_base<
+ ChildIteratorType, SmallVector<BoUpSLP::EdgeInfo, 1>::iterator> {
+ ContainerTy &VectorizableTree;
+
+ ChildIteratorType(SmallVector<BoUpSLP::EdgeInfo, 1>::iterator W,
+ ContainerTy &VT)
+ : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+ NodeRef operator*() { return I->UserTE; }
+ };
+
+ static NodeRef getEntryNode(BoUpSLP &R) {
+ return R.VectorizableTree[0].get();
+ }
+
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->UserTreeIndices.begin(), N->Container};
+ }
+
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->UserTreeIndices.end(), N->Container};
+ }
+
+ /// For the node iterator we just need to turn the TreeEntry iterator into a
+ /// TreeEntry* iterator so that it dereferences to NodeRef.
+ class nodes_iterator {
+ using ItTy = ContainerTy::iterator;
+ ItTy It;
+
+ public:
+ nodes_iterator(const ItTy &It2) : It(It2) {}
+ NodeRef operator*() { return It->get(); }
+ nodes_iterator operator++() {
+ ++It;
+ return *this;
+ }
+ bool operator!=(const nodes_iterator &N2) const { return N2.It != It; }
+ };
+
+ static nodes_iterator nodes_begin(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.begin());
+ }
+
+ static nodes_iterator nodes_end(BoUpSLP *R) {
+ return nodes_iterator(R->VectorizableTree.end());
+ }
+
+ static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
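// Illustrative note (hedged, not from the source): specializing GraphTraits for
// BoUpSLP* is what lets generic LLVM graph utilities walk the vectorization tree
// without knowing its layout. A depth-first visit could be written roughly as
//
//   for (BoUpSLP::TreeEntry *TE : llvm::depth_first(&R)) // R is a BoUpSLP
//     visit(TE);
//
// with llvm::depth_first (llvm/ADT/DepthFirstIterator.h) resolving getEntryNode(),
// child_begin() and child_end() through this specialization; the DOTGraphTraits
// specialization below plays the same role for the generic DOT graph printer.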
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+ using TreeEntry = BoUpSLP::TreeEntry;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ if (isSplat(Entry->Scalars)) {
+ OS << "<splat> " << *Entry->Scalars[0];
+ return Str;
+ }
+ for (auto V : Entry->Scalars) {
+ OS << *V;
if (llvm::any_of(R->ExternalUses, [&](const BoUpSLP::ExternalUser &EU) {
return EU.Scalar == V;
}))
- OS << " <extract>";
- OS << "\n";
- }
- return Str;
- }
-
- static std::string getNodeAttributes(const TreeEntry *Entry,
- const BoUpSLP *) {
- if (Entry->State == TreeEntry::NeedToGather)
- return "color=red";
- return "";
- }
-};
-
-} // end namespace llvm
-
-BoUpSLP::~BoUpSLP() {
- for (const auto &Pair : DeletedInstructions) {
-    // Replace operands of ignored instructions with Undefs in case they were
- // marked for deletion.
- if (Pair.getSecond()) {
- Value *Undef = UndefValue::get(Pair.getFirst()->getType());
- Pair.getFirst()->replaceAllUsesWith(Undef);
- }
- Pair.getFirst()->dropAllReferences();
- }
- for (const auto &Pair : DeletedInstructions) {
- assert(Pair.getFirst()->use_empty() &&
- "trying to erase instruction with users.");
- Pair.getFirst()->eraseFromParent();
- }
+ OS << " <extract>";
+ OS << "\n";
+ }
+ return Str;
+ }
+
+ static std::string getNodeAttributes(const TreeEntry *Entry,
+ const BoUpSLP *) {
+ if (Entry->State == TreeEntry::NeedToGather)
+ return "color=red";
+ return "";
+ }
+};
+
+} // end namespace llvm
+
+BoUpSLP::~BoUpSLP() {
+ for (const auto &Pair : DeletedInstructions) {
+    // Replace operands of ignored instructions with Undefs in case they were
+ // marked for deletion.
+ if (Pair.getSecond()) {
+ Value *Undef = UndefValue::get(Pair.getFirst()->getType());
+ Pair.getFirst()->replaceAllUsesWith(Undef);
+ }
+ Pair.getFirst()->dropAllReferences();
+ }
+ for (const auto &Pair : DeletedInstructions) {
+ assert(Pair.getFirst()->use_empty() &&
+ "trying to erase instruction with users.");
+ Pair.getFirst()->eraseFromParent();
+ }
#ifdef EXPENSIVE_CHECKS
// If we could guarantee that this call is not extremely slow, we could
// remove the ifdef limitation (see PR47712).
- assert(!verifyFunction(*F, &dbgs()));
+ assert(!verifyFunction(*F, &dbgs()));
#endif
-}
-
-void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
- for (auto *V : AV) {
- if (auto *I = dyn_cast<Instruction>(V))
+}
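// Illustrative note (not from the source): the destructor deletes in two passes on
// purpose. Instructions queued for deletion may still use one another, so the first
// loop only drops operand references (after optionally replacing uses with undef);
// once no queued instruction holds a use of another, the second loop can erase them
// all with the use_empty() assertion holding.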
+
+void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
+ for (auto *V : AV) {
+ if (auto *I = dyn_cast<Instruction>(V))
eraseInstruction(I, /*ReplaceOpsWithUndef=*/true);
- };
-}
-
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ArrayRef<Value *> UserIgnoreLst) {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
-}
-
-void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
- ExtraValueToDebugLocsMap &ExternallyUsedValues,
- ArrayRef<Value *> UserIgnoreLst) {
- deleteTree();
- UserIgnoreList = UserIgnoreLst;
- if (!allSameType(Roots))
- return;
- buildTree_rec(Roots, 0, EdgeInfo());
-
- // Collect the values that we need to extract from the tree.
- for (auto &TEPtr : VectorizableTree) {
- TreeEntry *Entry = TEPtr.get();
-
- // No need to handle users of gathered values.
- if (Entry->State == TreeEntry::NeedToGather)
- continue;
-
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
- int FoundLane = Lane;
- if (!Entry->ReuseShuffleIndices.empty()) {
- FoundLane =
- std::distance(Entry->ReuseShuffleIndices.begin(),
- llvm::find(Entry->ReuseShuffleIndices, FoundLane));
- }
-
- // Check if the scalar is externally used as an extra arg.
- auto ExtI = ExternallyUsedValues.find(Scalar);
- if (ExtI != ExternallyUsedValues.end()) {
- LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
- << Lane << " from " << *Scalar << ".\n");
- ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
- }
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
-
- Instruction *UserInst = dyn_cast<Instruction>(U);
- if (!UserInst)
- continue;
-
- // Skip in-tree scalars that become vectors
- if (TreeEntry *UseEntry = getTreeEntry(U)) {
- Value *UseScalar = UseEntry->Scalars[0];
- // Some in-tree scalars will remain as scalar in vectorized
-            // Some in-tree scalars will remain as scalars in vectorized
- // be used.
- if (UseScalar != U ||
- !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
- LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
- << ".\n");
- assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
- continue;
- }
- }
-
- // Ignore users in the user ignore list.
- if (is_contained(UserIgnoreList, UserInst))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
- << Lane << " from " << *Scalar << ".\n");
- ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
- }
- }
- }
-}
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
- const EdgeInfo &UserTreeIdx) {
- assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
-
- InstructionsState S = getSameOpcode(VL);
- if (Depth == RecursionMaxDepth) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // Don't handle vectors.
- if (S.OpValue->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
- if (SI->getValueOperand()->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // If all of the operands are identical or constant we have a simple solution.
- if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // We now know that this is a vector of instructions of the same type from
- // the same block.
-
- // Don't vectorize ephemeral values.
- for (Value *V : VL) {
- if (EphValues.count(V)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is ephemeral.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // Check if this is a duplicate of another entry.
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
- if (!E->isSame(VL)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- // Record the reuse of the tree node. FIXME, currently this is only used to
- // properly draw the graph rather than for the actual vectorization.
- E->UserTreeIndices.push_back(UserTreeIdx);
- LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
- << ".\n");
- return;
- }
-
- // Check that none of the instructions in the bundle are already in the tree.
- for (Value *V : VL) {
- auto *I = dyn_cast<Instruction>(V);
- if (!I)
- continue;
- if (getTreeEntry(I)) {
- LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
- << ") is already in tree.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // If any of the scalars is marked as a value that needs to stay scalar, then
- // we need to gather the scalars.
- // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
- for (Value *V : VL) {
- if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
- LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- }
-
- // Check that all of the users of the scalars that we want to vectorize are
- // schedulable.
- auto *VL0 = cast<Instruction>(S.OpValue);
- BasicBlock *BB = VL0->getParent();
-
- if (!DT->isReachableFromEntry(BB)) {
- // Don't go into unreachable blocks. They may contain instructions with
- // dependency cycles which confuse the final scheduling.
- LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
-
- // Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second)
- UniqueValues.emplace_back(V);
- }
- size_t NumUniqueScalarValues = UniqueValues.size();
- if (NumUniqueScalarValues == VL.size()) {
- ReuseShuffleIndicies.clear();
- } else {
- LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
- if (NumUniqueScalarValues <= 1 ||
- !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
- LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
- return;
- }
- VL = UniqueValues;
- }
-
- auto &BSRef = BlocksSchedules[BB];
- if (!BSRef)
- BSRef = std::make_unique<BlockScheduling>(BB);
-
- BlockScheduling &BS = *BSRef.get();
-
- Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
- if (!Bundle) {
- LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
- assert((!BS.getScheduleData(VL0) ||
- !BS.getScheduleData(VL0)->isPartOfBundle()) &&
- "tryScheduleBundle should cancelScheduling on failure");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
-
- unsigned ShuffleOrOp = S.isAltShuffle() ?
- (unsigned) Instruction::ShuffleVector : S.getOpcode();
- switch (ShuffleOrOp) {
- case Instruction::PHI: {
- auto *PH = cast<PHINode>(VL0);
-
- // Check for terminator values (e.g. invoke).
+ };
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ArrayRef<Value *> UserIgnoreLst) {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+ ExtraValueToDebugLocsMap &ExternallyUsedValues,
+ ArrayRef<Value *> UserIgnoreLst) {
+ deleteTree();
+ UserIgnoreList = UserIgnoreLst;
+ if (!allSameType(Roots))
+ return;
+ buildTree_rec(Roots, 0, EdgeInfo());
+
+ // Collect the values that we need to extract from the tree.
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->State == TreeEntry::NeedToGather)
+ continue;
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+ int FoundLane = Lane;
+ if (!Entry->ReuseShuffleIndices.empty()) {
+ FoundLane =
+ std::distance(Entry->ReuseShuffleIndices.begin(),
+ llvm::find(Entry->ReuseShuffleIndices, FoundLane));
+ }
+
+ // Check if the scalar is externally used as an extra arg.
+ auto ExtI = ExternallyUsedValues.find(Scalar);
+ if (ExtI != ExternallyUsedValues.end()) {
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.emplace_back(Scalar, nullptr, FoundLane);
+ }
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
+
+ Instruction *UserInst = dyn_cast<Instruction>(U);
+ if (!UserInst)
+ continue;
+
+ // Skip in-tree scalars that become vectors
+ if (TreeEntry *UseEntry = getTreeEntry(U)) {
+ Value *UseScalar = UseEntry->Scalars[0];
+ // Some in-tree scalars will remain as scalar in vectorized
+            // Some in-tree scalars will remain as scalars in vectorized
+ // be used.
+ if (UseScalar != U ||
+ !InTreeUserNeedToExtract(Scalar, UserInst, TLI)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tInternal user will be removed:" << *U
+ << ".\n");
+ assert(UseEntry->State != TreeEntry::NeedToGather && "Bad state");
+ continue;
+ }
+ }
+
+ // Ignore users in the user ignore list.
+ if (is_contained(UserIgnoreList, UserInst))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *U << " from lane "
+ << Lane << " from " << *Scalar << ".\n");
+ ExternalUses.push_back(ExternalUser(Scalar, U, FoundLane));
+ }
+ }
+ }
+}
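// Illustrative note (not from the source): the loop above records every scalar that
// is still needed outside the tree. A scalar with a user that is not part of any
// vectorized bundle (or that is listed in ExternallyUsedValues) gets an ExternalUser
// entry together with its lane, so that an extractelement from the corresponding
// vector lane can be emitted later; in-tree users and users on the ignore list are
// skipped because they will be rewritten or removed anyway.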
+
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+ const EdgeInfo &UserTreeIdx) {
+ assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
+
+ InstructionsState S = getSameOpcode(VL);
+ if (Depth == RecursionMaxDepth) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Don't handle vectors.
+ if (S.OpValue->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(S.OpValue))
+ if (SI->getValueOperand()->getType()->isVectorTy()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // If all of the operands are identical or constant we have a simple solution.
+ if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !S.getOpcode()) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // We now know that this is a vector of instructions of the same type from
+ // the same block.
+
+ // Don't vectorize ephemeral values.
+ for (Value *V : VL) {
+ if (EphValues.count(V)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is ephemeral.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check if this is a duplicate of another entry.
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n");
+ if (!E->isSame(VL)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ // Record the reuse of the tree node. FIXME, currently this is only used to
+ // properly draw the graph rather than for the actual vectorization.
+ E->UserTreeIndices.push_back(UserTreeIdx);
+ LLVM_DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *S.OpValue
+ << ".\n");
+ return;
+ }
+
+ // Check that none of the instructions in the bundle are already in the tree.
+ for (Value *V : VL) {
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I)
+ continue;
+ if (getTreeEntry(I)) {
+ LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V
+ << ") is already in tree.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // If any of the scalars is marked as a value that needs to stay scalar, then
+ // we need to gather the scalars.
+ // The reduction nodes (stored in UserIgnoreList) also should stay scalar.
+ for (Value *V : VL) {
+ if (MustGather.count(V) || is_contained(UserIgnoreList, V)) {
+ LLVM_DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ }
+
+ // Check that all of the users of the scalars that we want to vectorize are
+ // schedulable.
+ auto *VL0 = cast<Instruction>(S.OpValue);
+ BasicBlock *BB = VL0->getParent();
+
+ if (!DT->isReachableFromEntry(BB)) {
+ // Don't go into unreachable blocks. They may contain instructions with
+ // dependency cycles which confuse the final scheduling.
+ LLVM_DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second)
+ UniqueValues.emplace_back(V);
+ }
+ size_t NumUniqueScalarValues = UniqueValues.size();
+ if (NumUniqueScalarValues == VL.size()) {
+ ReuseShuffleIndicies.clear();
+ } else {
+ LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n");
+ if (NumUniqueScalarValues <= 1 ||
+ !llvm::isPowerOf2_32(NumUniqueScalarValues)) {
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx);
+ return;
+ }
+ VL = UniqueValues;
+ }
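// Illustrative sketch (standalone, hypothetical names, std containers): the
// de-duplication above. Every value is mapped to the position of its first
// occurrence, which later becomes the mask of a reuse shuffle, and only the unique
// values are kept for the bundle.
#include <unordered_map>
#include <vector>

struct DedupResult {
  std::vector<int> UniqueVals;       // one entry per distinct value
  std::vector<unsigned> ReuseMask;   // original position -> index into UniqueVals
};

DedupResult dedup(const std::vector<int> &VL) {
  DedupResult R;
  std::unordered_map<int, unsigned> FirstPos;
  for (int V : VL) {
    auto Ins = FirstPos.try_emplace(V, (unsigned)R.UniqueVals.size());
    R.ReuseMask.push_back(Ins.first->second);
    if (Ins.second)
      R.UniqueVals.push_back(V);
  }
  return R;
}
// dedup({7, 9, 7, 5}) yields UniqueVals = {7, 9, 5} and ReuseMask = {0, 1, 0, 2}.
// The real code additionally requires the unique count to be a power of two greater
// than one; otherwise the bundle is gathered instead of shuffled.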
+
+ auto &BSRef = BlocksSchedules[BB];
+ if (!BSRef)
+ BSRef = std::make_unique<BlockScheduling>(BB);
+
+ BlockScheduling &BS = *BSRef.get();
+
+ Optional<ScheduleData *> Bundle = BS.tryScheduleBundle(VL, this, S);
+ if (!Bundle) {
+ LLVM_DEBUG(dbgs() << "SLP: We are not able to schedule this bundle!\n");
+ assert((!BS.getScheduleData(VL0) ||
+ !BS.getScheduleData(VL0)->isPartOfBundle()) &&
+ "tryScheduleBundle should cancelScheduling on failure");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
+
+ unsigned ShuffleOrOp = S.isAltShuffle() ?
+ (unsigned) Instruction::ShuffleVector : S.getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+
+ // Check for terminator values (e.g. invoke).
for (Value *V : VL)
for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- Instruction *Term = dyn_cast<Instruction>(
+ Instruction *Term = dyn_cast<Instruction>(
cast<PHINode>(V)->getIncomingValueForBlock(
PH->getIncomingBlock(I)));
- if (Term && Term->isTerminator()) {
- LLVM_DEBUG(dbgs()
- << "SLP: Need to swizzle PHINodes (terminator use).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- TreeEntry *TE =
- newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
-
- // Keeps the reordered operands to avoid code duplication.
- SmallVector<ValueList, 2> OperandsVec;
+ if (Term && Term->isTerminator()) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: Need to swizzle PHINodes (terminator use).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle, S, UserTreeIdx, ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
+
+ // Keeps the reordered operands to avoid code duplication.
+ SmallVector<ValueList, 2> OperandsVec;
for (unsigned I = 0, E = PH->getNumIncomingValues(); I < E; ++I) {
- ValueList Operands;
- // Prepare the operand vector.
+ ValueList Operands;
+ // Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<PHINode>(V)->getIncomingValueForBlock(
PH->getIncomingBlock(I)));
TE->setOperand(I, Operands);
- OperandsVec.push_back(Operands);
- }
- for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
- buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
- return;
- }
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- OrdersType CurrentOrder;
- bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
- if (Reuse) {
- LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
- ++NumOpsWantToKeepOriginalOrder;
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- if (!CurrentOrder.empty()) {
- LLVM_DEBUG({
- dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
- "with order";
- for (unsigned Idx : CurrentOrder)
- dbgs() << " " << Idx;
- dbgs() << "\n";
- });
- // Insert new order with initial value 0, if it does not exist,
- // otherwise return the iterator to the existing one.
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ OperandsVec.push_back(Operands);
+ }
+ for (unsigned OpIdx = 0, OpE = OperandsVec.size(); OpIdx != OpE; ++OpIdx)
+ buildTree_rec(OperandsVec[OpIdx], Depth + 1, {TE, OpIdx});
+ return;
+ }
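// Illustrative note (not from the source): for a bundle of PHIs the operands are
// grouped per incoming block rather than per operand position. For example, with
//   a = phi [x1, BB1], [y1, BB2]   and   b = phi [x2, BB1], [y2, BB2]
// the recursion continues on {x1, x2} (the BB1 operands) and on {y1, y2} (the BB2
// operands), so each child bundle again consists of values flowing in from a single
// predecessor block.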
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ OrdersType CurrentOrder;
+ bool Reuse = canReuseExtract(VL, VL0, CurrentOrder);
+ if (Reuse) {
+ LLVM_DEBUG(dbgs() << "SLP: Reusing or shuffling extract sequence.\n");
+ ++NumOpsWantToKeepOriginalOrder;
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ if (!CurrentOrder.empty()) {
+ LLVM_DEBUG({
+ dbgs() << "SLP: Reusing or shuffling of reordered extract sequence "
+ "with order";
+ for (unsigned Idx : CurrentOrder)
+ dbgs() << " " << Idx;
+ dbgs() << "\n";
+ });
+ // Insert new order with initial value 0, if it does not exist,
+ // otherwise return the iterator to the existing one.
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- // This is a special case, as it does not gather, but at the same time
- // we are not extending buildTree_rec() towards the operands.
- ValueList Op0;
- Op0.assign(VL.size(), VL0->getOperand(0));
- VectorizableTree.back()->setOperand(0, Op0);
- return;
- }
- LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- BS.cancelScheduling(VL, VL0);
- return;
- }
- case Instruction::Load: {
- // Check that a vectorized load would load the same memory as a scalar
- // load. For example, we don't want to vectorize loads that are smaller
- // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
- // treats loading/storing it as an i8 struct. If we vectorize loads/stores
- // from such a struct, we read/write packed bits disagreeing with the
- // unvectorized version.
- Type *ScalarTy = VL0->getType();
-
- if (DL->getTypeSizeInBits(ScalarTy) !=
- DL->getTypeAllocSizeInBits(ScalarTy)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
- return;
- }
-
- // Make sure all loads in the bundle are simple - we can't vectorize
- // atomic or volatile loads.
- SmallVector<Value *, 4> PointerOps(VL.size());
- auto POIter = PointerOps.begin();
- for (Value *V : VL) {
- auto *L = cast<LoadInst>(V);
- if (!L->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
- return;
- }
- *POIter = L->getPointerOperand();
- ++POIter;
- }
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
- // Check that the sorted loads are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
- if (CurrentOrder.empty()) {
-            // Original loads are consecutive and do not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
- } else {
- // Need to reorder.
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ // This is a special case, as it does not gather, but at the same time
+ // we are not extending buildTree_rec() towards the operands.
+ ValueList Op0;
+ Op0.assign(VL.size(), VL0->getOperand(0));
+ VectorizableTree.back()->setOperand(0, Op0);
+ return;
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n");
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ BS.cancelScheduling(VL, VL0);
+ return;
+ }
+ case Instruction::Load: {
+ // Check that a vectorized load would load the same memory as a scalar
+ // load. For example, we don't want to vectorize loads that are smaller
+ // than 8-bit. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+ // treats loading/storing it as an i8 struct. If we vectorize loads/stores
+ // from such a struct, we read/write packed bits disagreeing with the
+ // unvectorized version.
+ Type *ScalarTy = VL0->getType();
+
+ if (DL->getTypeSizeInBits(ScalarTy) !=
+ DL->getTypeAllocSizeInBits(ScalarTy)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+ return;
+ }
+
+ // Make sure all loads in the bundle are simple - we can't vectorize
+ // atomic or volatile loads.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ auto POIter = PointerOps.begin();
+ for (Value *V : VL) {
+ auto *L = cast<LoadInst>(V);
+ if (!L->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
+ return;
+ }
+ *POIter = L->getPointerOperand();
+ ++POIter;
+ }
+
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted loads are consecutive.
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+          // Original loads are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n");
+ } else {
+ // Need to reorder.
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
+ TE->setOperandsInOrder();
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled loads.\n");
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
+ }
+ return;
+ }
// Vectorizing non-consecutive loads with `llvm.masked.gather`.
TreeEntry *TE = newTreeEntry(VL, TreeEntry::ScatterVectorize, Bundle, S,
UserTreeIdx, ReuseShuffleIndicies);
@@ -2899,209 +2899,209 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
buildTree_rec(PointerOps, Depth + 1, {TE, 0});
LLVM_DEBUG(dbgs() << "SLP: added a vector of non-consecutive loads.\n");
return;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty != SrcTy || !isValidElementType(Ty)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering casts with different src types.\n");
- return;
- }
- }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Check that all of the compares have the same predicate.
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
- Type *ComparedTy = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- CmpInst *Cmp = cast<CmpInst>(V);
- if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
- Cmp->getOperand(0)->getType() != ComparedTy) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs()
- << "SLP: Gathering cmp with different predicate.\n");
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
-
- ValueList Left, Right;
- if (cast<CmpInst>(VL0)->isCommutative()) {
- // Commutative predicate - collect + sort operands of the instructions
- // so that each side is more likely to have the same opcode.
- assert(P0 == SwapP0 && "Commutative Predicate mismatch");
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- } else {
- // Collect operands - commute if it uses the swapped predicate.
- for (Value *V : VL) {
- auto *Cmp = cast<CmpInst>(V);
- Value *LHS = Cmp->getOperand(0);
- Value *RHS = Cmp->getOperand(1);
- if (Cmp->getPredicate() != P0)
- std::swap(LHS, RHS);
- Left.push_back(LHS);
- Right.push_back(RHS);
- }
- }
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
- case Instruction::Select:
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
-
- // Sort operands of the instructions so that each side is more likely to
- // have the same opcode.
- if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-consecutive loads.\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
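// Illustrative sketch only, not part of the LLVM sources: the consecutiveness
// test used in the Load case above (and again in the Store case below),
// restated over plain byte offsets instead of SCEVs. It mirrors the span
// check as written - only last-minus-first is examined, on offsets that are
// assumed to be already sorted - and the helper name is invented here.
#include <cstdint>
#include <vector>

static bool spansOneContiguousRun(const std::vector<uint64_t> &SortedOffsets,
                                  uint64_t ScalarSize) {
  if (SortedOffsets.size() < 2)
    return true;
  // Last minus first must equal (N - 1) * element size, e.g. offsets
  // {0, 4, 8, 12} with ScalarSize = 4 pass, while {0, 4, 8, 20} do not.
  return SortedOffsets.back() - SortedOffsets.front() ==
         (SortedOffsets.size() - 1) * ScalarSize;
}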
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *Ty = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty != SrcTy || !isValidElementType(Ty)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering casts with different src types.\n");
+ return;
+ }
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of casts.\n");
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
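// Illustrative sketch, not LLVM code: the cast case above (and several cases
// below) rebuild child bundles column-wise, i.e. operand position I of every
// scalar in VL becomes child bundle number I, which is what the per-operand
// buildTree_rec() calls recurse into. ToyInst and the helper are stand-ins.
#include <vector>

struct ToyInst {
  std::vector<int> Operands; // stand-in for an instruction's scalar operands
};

static std::vector<std::vector<int>>
collectOperandColumns(const std::vector<ToyInst> &Bundle, unsigned NumOperands) {
  std::vector<std::vector<int>> Columns(NumOperands);
  for (unsigned I = 0; I < NumOperands; ++I)
    for (const ToyInst &V : Bundle)
      Columns[I].push_back(V.Operands[I]); // column I feeds child bundle I
  return Columns;
}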
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Check that all of the compares have the same predicate.
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ CmpInst::Predicate SwapP0 = CmpInst::getSwappedPredicate(P0);
+ Type *ComparedTy = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ CmpInst *Cmp = cast<CmpInst>(V);
+ if ((Cmp->getPredicate() != P0 && Cmp->getPredicate() != SwapP0) ||
+ Cmp->getOperand(0)->getType() != ComparedTy) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs()
+ << "SLP: Gathering cmp with different predicate.\n");
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of compares.\n");
+
+ ValueList Left, Right;
+ if (cast<CmpInst>(VL0)->isCommutative()) {
+ // Commutative predicate - collect + sort operands of the instructions
+ // so that each side is more likely to have the same opcode.
+ assert(P0 == SwapP0 && "Commutative Predicate mismatch");
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ } else {
+ // Collect operands - commute if it uses the swapped predicate.
+ for (Value *V : VL) {
+ auto *Cmp = cast<CmpInst>(V);
+ Value *LHS = Cmp->getOperand(0);
+ Value *RHS = Cmp->getOperand(1);
+ if (Cmp->getPredicate() != P0)
+ std::swap(LHS, RHS);
+ Left.push_back(LHS);
+ Right.push_back(RHS);
+ }
+ }
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
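// Illustrative sketch, not LLVM code: in the non-commutative branch above,
// any lane written with the swapped predicate has its operands commuted so
// the whole bundle is phrased with the main predicate. ToyCmp and the helper
// are stand-ins for CmpInst and the collection loop.
#include <utility>
#include <vector>

struct ToyCmp {
  int Pred;     // stand-in for CmpInst::Predicate
  int LHS, RHS; // stand-ins for the two compare operands
};

static void splitCmpOperands(const std::vector<ToyCmp> &Bundle, int MainPred,
                             std::vector<int> &Left, std::vector<int> &Right) {
  for (const ToyCmp &C : Bundle) {
    int L = C.LHS, R = C.RHS;
    if (C.Pred != MainPred) // this lane uses the swapped predicate: commute
      std::swap(L, R);
    Left.push_back(L);
    Right.push_back(R);
  }
}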
+ case Instruction::Select:
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of un/bin op.\n");
+
+ // Sort operands of the instructions so that each side is more likely to
+ // have the same opcode.
+ if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::GetElementPtr: {
+ // We don't combine GEPs with complicated (nested) indexing.
+ for (Value *V : VL) {
+ if (cast<Instruction>(V)->getNumOperands() != 2) {
+ LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We can't combine several GEPs into one vector if they operate on
+ // different types.
+ Type *Ty0 = VL0->getOperand(0)->getType();
+ for (Value *V : VL) {
+ Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
+ if (Ty0 != CurTy) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (different types).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ // We don't combine GEPs with non-constant indexes.
+ Type *Ty1 = VL0->getOperand(1)->getType();
+ for (Value *V : VL) {
+ auto Op = cast<Instruction>(V)->getOperand(1);
+ if (!isa<ConstantInt>(Op) ||
+ (Op->getType() != Ty1 &&
+ Op->getType()->getScalarSizeInBits() >
+ DL->getIndexSizeInBits(
+ V->getType()->getPointerAddressSpace()))) {
+ LLVM_DEBUG(dbgs()
+ << "SLP: not-vectorizable GEP (non-constant indexes).\n");
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = 2; i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
for (Value *V : VL)
Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::GetElementPtr: {
- // We don't combine GEPs with complicated (nested) indexing.
- for (Value *V : VL) {
- if (cast<Instruction>(V)->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We can't combine several GEPs into one vector if they operate on
- // different types.
- Type *Ty0 = VL0->getOperand(0)->getType();
- for (Value *V : VL) {
- Type *CurTy = cast<Instruction>(V)->getOperand(0)->getType();
- if (Ty0 != CurTy) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (different types).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- // We don't combine GEPs with non-constant indexes.
- Type *Ty1 = VL0->getOperand(1)->getType();
- for (Value *V : VL) {
- auto Op = cast<Instruction>(V)->getOperand(1);
- if (!isa<ConstantInt>(Op) ||
- (Op->getType() != Ty1 &&
- Op->getType()->getScalarSizeInBits() >
- DL->getIndexSizeInBits(
- V->getType()->getPointerAddressSpace()))) {
- LLVM_DEBUG(dbgs()
- << "SLP: not-vectorizable GEP (non-constant indexes).\n");
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = 2; i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::Store: {
- // Check if the stores are consecutive or if we need to swizzle them.
- llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ case Instruction::Store: {
+ // Check if the stores are consecutive or if we need to swizzle them.
+ llvm::Type *ScalarTy = cast<StoreInst>(VL0)->getValueOperand()->getType();
// Avoid types that are padded when being allocated as scalars, while
// being packed together in a vector (such as i1).
if (DL->getTypeSizeInBits(ScalarTy) !=
@@ -3112,511 +3112,511 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
LLVM_DEBUG(dbgs() << "SLP: Gathering stores of non-packed type.\n");
return;
}
- // Make sure all stores in the bundle are simple - we can't vectorize
- // atomic or volatile stores.
- SmallVector<Value *, 4> PointerOps(VL.size());
- ValueList Operands(VL.size());
- auto POIter = PointerOps.begin();
- auto OIter = Operands.begin();
- for (Value *V : VL) {
- auto *SI = cast<StoreInst>(V);
- if (!SI->isSimple()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
- return;
- }
- *POIter = SI->getPointerOperand();
- *OIter = SI->getValueOperand();
- ++POIter;
- ++OIter;
- }
-
- OrdersType CurrentOrder;
- // Check the order of pointer operands.
- if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
- Value *Ptr0;
- Value *PtrN;
- if (CurrentOrder.empty()) {
- Ptr0 = PointerOps.front();
- PtrN = PointerOps.back();
- } else {
- Ptr0 = PointerOps[CurrentOrder.front()];
- PtrN = PointerOps[CurrentOrder.back()];
- }
- const SCEV *Scev0 = SE->getSCEV(Ptr0);
- const SCEV *ScevN = SE->getSCEV(PtrN);
- const auto *Diff =
- dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
- uint64_t Size = DL->getTypeAllocSize(ScalarTy);
- // Check that the sorted pointer operands are consecutive.
- if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
- if (CurrentOrder.empty()) {
-          // Original stores are consecutive and do not require reordering.
- ++NumOpsWantToKeepOriginalOrder;
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
- UserTreeIdx, ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
- } else {
- TreeEntry *TE =
- newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ // Make sure all stores in the bundle are simple - we can't vectorize
+ // atomic or volatile stores.
+ SmallVector<Value *, 4> PointerOps(VL.size());
+ ValueList Operands(VL.size());
+ auto POIter = PointerOps.begin();
+ auto OIter = Operands.begin();
+ for (Value *V : VL) {
+ auto *SI = cast<StoreInst>(V);
+ if (!SI->isSimple()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering non-simple stores.\n");
+ return;
+ }
+ *POIter = SI->getPointerOperand();
+ *OIter = SI->getValueOperand();
+ ++POIter;
+ ++OIter;
+ }
+
+ OrdersType CurrentOrder;
+ // Check the order of pointer operands.
+ if (llvm::sortPtrAccesses(PointerOps, *DL, *SE, CurrentOrder)) {
+ Value *Ptr0;
+ Value *PtrN;
+ if (CurrentOrder.empty()) {
+ Ptr0 = PointerOps.front();
+ PtrN = PointerOps.back();
+ } else {
+ Ptr0 = PointerOps[CurrentOrder.front()];
+ PtrN = PointerOps[CurrentOrder.back()];
+ }
+ const SCEV *Scev0 = SE->getSCEV(Ptr0);
+ const SCEV *ScevN = SE->getSCEV(PtrN);
+ const auto *Diff =
+ dyn_cast<SCEVConstant>(SE->getMinusSCEV(ScevN, Scev0));
+ uint64_t Size = DL->getTypeAllocSize(ScalarTy);
+ // Check that the sorted pointer operands are consecutive.
+ if (Diff && Diff->getAPInt() == (VL.size() - 1) * Size) {
+ if (CurrentOrder.empty()) {
+          // Original stores are consecutive and do not require reordering.
+ ++NumOpsWantToKeepOriginalOrder;
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S,
+ UserTreeIdx, ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of stores.\n");
+ } else {
+ TreeEntry *TE =
+ newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies, CurrentOrder);
- TE->setOperandsInOrder();
- buildTree_rec(Operands, Depth + 1, {TE, 0});
- LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
+ TE->setOperandsInOrder();
+ buildTree_rec(Operands, Depth + 1, {TE, 0});
+ LLVM_DEBUG(dbgs() << "SLP: added a vector of jumbled stores.\n");
findRootOrder(CurrentOrder);
++NumOpsWantToKeepOrder[CurrentOrder];
- }
- return;
- }
- }
-
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
- return;
- }
- case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic or
- // library function.
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- VFShape Shape = VFShape::get(
+ }
+ return;
+ }
+ }
+
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
+ return;
+ }
+ case Instruction::Call: {
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ VFShape Shape = VFShape::get(
*CI, ElementCount::getFixed(static_cast<unsigned int>(VL.size())),
- false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
-
- if (!VecFunc && !isTriviallyVectorizable(ID)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
- return;
- }
- Function *F = CI->getCalledFunction();
- unsigned NumArgs = CI->getNumArgOperands();
- SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
- for (unsigned j = 0; j != NumArgs; ++j)
- if (hasVectorInstrinsicScalarOpd(ID, j))
- ScalarArgs[j] = CI->getArgOperand(j);
- for (Value *V : VL) {
- CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != F ||
- getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
- (VecFunc &&
- VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
- !CI->hasIdenticalOperandBundleSchema(*CI2)) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
- << "\n");
- return;
- }
-      // Some intrinsics have scalar arguments, and those arguments must be
-      // identical across the bundle for the calls to be vectorized.
- for (unsigned j = 0; j != NumArgs; ++j) {
- if (hasVectorInstrinsicScalarOpd(ID, j)) {
- Value *A1J = CI2->getArgOperand(j);
- if (ScalarArgs[j] != A1J) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
- << " argument " << ScalarArgs[j] << "!=" << A1J
- << "\n");
- return;
- }
- }
- }
- // Verify that the bundle operands are identical between the two calls.
- if (CI->hasOperandBundles() &&
- !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
- CI->op_begin() + CI->getBundleOperandsEndIndex(),
- CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
- << *CI << "!=" << *V << '\n');
- return;
- }
- }
-
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL) {
- auto *CI2 = cast<CallInst>(V);
- Operands.push_back(CI2->getArgOperand(i));
- }
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- case Instruction::ShuffleVector: {
-      // If this is not an alternating sequence of opcodes (like add-sub),
-      // do not vectorize this instruction.
- if (!S.isAltShuffle()) {
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
- return;
- }
- TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
-
- // Reorder operands if reordering would enable vectorization.
- if (isa<BinaryOperator>(VL0)) {
- ValueList Left, Right;
- reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
- TE->setOperand(0, Left);
- TE->setOperand(1, Right);
- buildTree_rec(Left, Depth + 1, {TE, 0});
- buildTree_rec(Right, Depth + 1, {TE, 1});
- return;
- }
-
- TE->setOperandsInOrder();
- for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
- ValueList Operands;
- // Prepare the operand vector.
- for (Value *V : VL)
- Operands.push_back(cast<Instruction>(V)->getOperand(i));
-
- buildTree_rec(Operands, Depth + 1, {TE, i});
- }
- return;
- }
- default:
- BS.cancelScheduling(VL, VL0);
- newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
- ReuseShuffleIndicies);
- LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
- return;
- }
-}
-
-unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
- unsigned N = 1;
- Type *EltTy = T;
-
- while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
- isa<VectorType>(EltTy)) {
- if (auto *ST = dyn_cast<StructType>(EltTy)) {
- // Check that struct is homogeneous.
- for (const auto *Ty : ST->elements())
- if (Ty != *ST->element_begin())
- return 0;
- N *= ST->getNumElements();
- EltTy = *ST->element_begin();
- } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
- N *= AT->getNumElements();
- EltTy = AT->getElementType();
- } else {
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
+ return;
+ }
+ Function *F = CI->getCalledFunction();
+ unsigned NumArgs = CI->getNumArgOperands();
+ SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
+ for (unsigned j = 0; j != NumArgs; ++j)
+ if (hasVectorInstrinsicScalarOpd(ID, j))
+ ScalarArgs[j] = CI->getArgOperand(j);
+ for (Value *V : VL) {
+ CallInst *CI2 = dyn_cast<CallInst>(V);
+ if (!CI2 || CI2->getCalledFunction() != F ||
+ getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
+ !CI->hasIdenticalOperandBundleSchema(*CI2)) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *V
+ << "\n");
+ return;
+ }
+      // Some intrinsics have scalar arguments, and those arguments must be
+      // identical across the bundle for the calls to be vectorized.
+ for (unsigned j = 0; j != NumArgs; ++j) {
+ if (hasVectorInstrinsicScalarOpd(ID, j)) {
+ Value *A1J = CI2->getArgOperand(j);
+ if (ScalarArgs[j] != A1J) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
+ << " argument " << ScalarArgs[j] << "!=" << A1J
+ << "\n");
+ return;
+ }
+ }
+ }
+ // Verify that the bundle operands are identical between the two calls.
+ if (CI->hasOperandBundles() &&
+ !std::equal(CI->op_begin() + CI->getBundleOperandsStartIndex(),
+ CI->op_begin() + CI->getBundleOperandsEndIndex(),
+ CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:"
+ << *CI << "!=" << *V << '\n');
+ return;
+ }
+ }
+
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL) {
+ auto *CI2 = cast<CallInst>(V);
+ Operands.push_back(CI2->getArgOperand(i));
+ }
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
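// Illustrative sketch, not LLVM code: the per-argument check above requires
// that argument positions which stay scalar in the vector form carry the
// identical value in every call of the bundle; other positions may differ
// lane by lane. Container types and names here are stand-ins.
#include <set>
#include <vector>

static bool scalarArgsMatchAcrossBundle(
    const std::vector<std::vector<int>> &CallArgs,  // one arg list per call
    const std::set<unsigned> &ScalarArgPositions) { // positions kept scalar
  if (CallArgs.empty())
    return true;
  for (unsigned Pos : ScalarArgPositions)
    for (const std::vector<int> &Args : CallArgs)
      if (Args.at(Pos) != CallArgs.front().at(Pos))
        return false; // mismatching scalar argument: the bundle is gathered
  return true;
}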
+ case Instruction::ShuffleVector: {
+      // If this is not an alternating sequence of opcodes (like add-sub),
+      // do not vectorize this instruction.
+ if (!S.isAltShuffle()) {
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
+ return;
+ }
+ TreeEntry *TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
+
+ // Reorder operands if reordering would enable vectorization.
+ if (isa<BinaryOperator>(VL0)) {
+ ValueList Left, Right;
+ reorderInputsAccordingToOpcode(VL, Left, Right, *DL, *SE, *this);
+ TE->setOperand(0, Left);
+ TE->setOperand(1, Right);
+ buildTree_rec(Left, Depth + 1, {TE, 0});
+ buildTree_rec(Right, Depth + 1, {TE, 1});
+ return;
+ }
+
+ TE->setOperandsInOrder();
+ for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
+ ValueList Operands;
+ // Prepare the operand vector.
+ for (Value *V : VL)
+ Operands.push_back(cast<Instruction>(V)->getOperand(i));
+
+ buildTree_rec(Operands, Depth + 1, {TE, i});
+ }
+ return;
+ }
+ default:
+ BS.cancelScheduling(VL, VL0);
+ newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
+ ReuseShuffleIndicies);
+ LLVM_DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
+ return;
+ }
+}
+
+unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
+ unsigned N = 1;
+ Type *EltTy = T;
+
+ while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
+ isa<VectorType>(EltTy)) {
+ if (auto *ST = dyn_cast<StructType>(EltTy)) {
+ // Check that struct is homogeneous.
+ for (const auto *Ty : ST->elements())
+ if (Ty != *ST->element_begin())
+ return 0;
+ N *= ST->getNumElements();
+ EltTy = *ST->element_begin();
+ } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
+ N *= AT->getNumElements();
+ EltTy = AT->getElementType();
+ } else {
auto *VT = cast<FixedVectorType>(EltTy);
- N *= VT->getNumElements();
- EltTy = VT->getElementType();
- }
- }
-
- if (!isValidElementType(EltTy))
- return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
- if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
- return 0;
- return N;
-}
-
-bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
- SmallVectorImpl<unsigned> &CurrentOrder) const {
- Instruction *E0 = cast<Instruction>(OpValue);
- assert(E0->getOpcode() == Instruction::ExtractElement ||
- E0->getOpcode() == Instruction::ExtractValue);
- assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
- // Check if all of the extracts come from the same vector and from the
- // correct offset.
- Value *Vec = E0->getOperand(0);
-
- CurrentOrder.clear();
-
- // We have to extract from a vector/aggregate with the same number of elements.
- unsigned NElts;
- if (E0->getOpcode() == Instruction::ExtractValue) {
- const DataLayout &DL = E0->getModule()->getDataLayout();
- NElts = canMapToVector(Vec->getType(), DL);
- if (!NElts)
- return false;
- // Check if load can be rewritten as load of vector.
- LoadInst *LI = dyn_cast<LoadInst>(Vec);
- if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
- return false;
- } else {
+ N *= VT->getNumElements();
+ EltTy = VT->getElementType();
+ }
+ }
+
+ if (!isValidElementType(EltTy))
+ return 0;
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
+ if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
+ return 0;
+ return N;
+}
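// Worked example with illustrative numbers, not LLVM code: a homogeneous
// [4 x i32] aggregate flattens to N = 4 lanes of i32, i.e. a 4 * 32 = 128-bit
// candidate vector; canMapToVector() then accepts it only if that size lies
// within [MinVecRegSize, MaxVecRegSize] and equals the store size of the
// original aggregate type.
static_assert(4 * 32 == 128, "flattened [4 x i32] occupies 128 bits");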
+
+bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
+ SmallVectorImpl<unsigned> &CurrentOrder) const {
+ Instruction *E0 = cast<Instruction>(OpValue);
+ assert(E0->getOpcode() == Instruction::ExtractElement ||
+ E0->getOpcode() == Instruction::ExtractValue);
+ assert(E0->getOpcode() == getSameOpcode(VL).getOpcode() && "Invalid opcode");
+ // Check if all of the extracts come from the same vector and from the
+ // correct offset.
+ Value *Vec = E0->getOperand(0);
+
+ CurrentOrder.clear();
+
+ // We have to extract from a vector/aggregate with the same number of elements.
+ unsigned NElts;
+ if (E0->getOpcode() == Instruction::ExtractValue) {
+ const DataLayout &DL = E0->getModule()->getDataLayout();
+ NElts = canMapToVector(Vec->getType(), DL);
+ if (!NElts)
+ return false;
+ // Check if load can be rewritten as load of vector.
+ LoadInst *LI = dyn_cast<LoadInst>(Vec);
+ if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
+ return false;
+ } else {
NElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
- }
-
- if (NElts != VL.size())
- return false;
-
- // Check that all of the indices extract from the correct offset.
- bool ShouldKeepOrder = true;
- unsigned E = VL.size();
- // Assign to all items the initial value E + 1 so we can check if the extract
- // instruction index was used already.
- // Also, later we can check that all the indices are used and we have a
- // consecutive access in the extract instructions, by checking that no
- // element of CurrentOrder still has value E + 1.
- CurrentOrder.assign(E, E + 1);
- unsigned I = 0;
- for (; I < E; ++I) {
- auto *Inst = cast<Instruction>(VL[I]);
- if (Inst->getOperand(0) != Vec)
- break;
- Optional<unsigned> Idx = getExtractIndex(Inst);
- if (!Idx)
- break;
- const unsigned ExtIdx = *Idx;
- if (ExtIdx != I) {
- if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
- break;
- ShouldKeepOrder = false;
- CurrentOrder[ExtIdx] = I;
- } else {
- if (CurrentOrder[I] != E + 1)
- break;
- CurrentOrder[I] = I;
- }
- }
- if (I < E) {
- CurrentOrder.clear();
- return false;
- }
-
- return ShouldKeepOrder;
-}
-
-bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
+ }
+
+ if (NElts != VL.size())
+ return false;
+
+ // Check that all of the indices extract from the correct offset.
+ bool ShouldKeepOrder = true;
+ unsigned E = VL.size();
+ // Assign to all items the initial value E + 1 so we can check if the extract
+ // instruction index was used already.
+ // Also, later we can check that all the indices are used and we have a
+ // consecutive access in the extract instructions, by checking that no
+ // element of CurrentOrder still has value E + 1.
+ CurrentOrder.assign(E, E + 1);
+ unsigned I = 0;
+ for (; I < E; ++I) {
+ auto *Inst = cast<Instruction>(VL[I]);
+ if (Inst->getOperand(0) != Vec)
+ break;
+ Optional<unsigned> Idx = getExtractIndex(Inst);
+ if (!Idx)
+ break;
+ const unsigned ExtIdx = *Idx;
+ if (ExtIdx != I) {
+ if (ExtIdx >= E || CurrentOrder[ExtIdx] != E + 1)
+ break;
+ ShouldKeepOrder = false;
+ CurrentOrder[ExtIdx] = I;
+ } else {
+ if (CurrentOrder[I] != E + 1)
+ break;
+ CurrentOrder[I] = I;
+ }
+ }
+ if (I < E) {
+ CurrentOrder.clear();
+ return false;
+ }
+
+ return ShouldKeepOrder;
+}
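// Illustrative sketch, not LLVM code: the index bookkeeping above, restated
// over a plain vector of extract indices. E + 1 is the "lane not claimed yet"
// sentinel; a repeated or out-of-range index clears Order and returns false,
// otherwise Order records, for each source lane, the bundle position that
// extracts it, and the return value says whether that order is the identity.
#include <vector>

static bool extractsKeepOrder(const std::vector<unsigned> &ExtIndices,
                              std::vector<unsigned> &Order) {
  const unsigned E = static_cast<unsigned>(ExtIndices.size());
  Order.assign(E, E + 1);
  bool ShouldKeepOrder = true;
  for (unsigned I = 0; I < E; ++I) {
    const unsigned ExtIdx = ExtIndices[I];
    if (ExtIdx >= E || Order[ExtIdx] != E + 1) {
      Order.clear(); // duplicate or out-of-range index: cannot reuse the source
      return false;
    }
    if (ExtIdx != I)
      ShouldKeepOrder = false;
    Order[ExtIdx] = I;
  }
  return ShouldKeepOrder;
}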
+
+bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
return I->hasOneUse() || llvm::all_of(I->users(), [this](User *U) {
- return ScalarToTreeEntry.count(U) > 0;
- });
-}
-
+ return ScalarToTreeEntry.count(U) > 0;
+ });
+}
+
static std::pair<InstructionCost, InstructionCost>
getVectorCallCosts(CallInst *CI, FixedVectorType *VecTy,
TargetTransformInfo *TTI, TargetLibraryInfo *TLI) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getElementCount());
auto IntrinsicCost =
- TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
-
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
auto Shape = VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
VecTy->getNumElements())),
false /*HasGlobalPred*/);
- Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
auto LibCost = IntrinsicCost;
- if (!CI->isNoBuiltin() && VecFunc) {
- // Calculate the cost of the vector library call.
- SmallVector<Type *, 4> VecTys;
- for (Use &Arg : CI->args())
- VecTys.push_back(
- FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
-
- // If the corresponding vector call is cheaper, return its cost.
- LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
- TTI::TCK_RecipThroughput);
- }
- return {IntrinsicCost, LibCost};
-}
-
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
+ TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
+
InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
- ArrayRef<Value*> VL = E->Scalars;
-
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
- ScalarTy = CI->getOperand(0)->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-
- // If we have computed a smaller type for the expression, update VecTy so
- // that the costs will be accurate.
- if (MinBWs.count(VL[0]))
- VecTy = FixedVectorType::get(
- IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
-
- unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ ArrayRef<Value*> VL = E->Scalars;
+
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+ ScalarTy = CI->getOperand(0)->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
+ // If we have computed a smaller type for the expression, update VecTy so
+ // that the costs will be accurate.
+ if (MinBWs.count(VL[0]))
+ VecTy = FixedVectorType::get(
+ IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
+
+ unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
InstructionCost ReuseShuffleCost = 0;
- if (NeedToShuffleReuses) {
- ReuseShuffleCost =
- TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- if (E->State == TreeEntry::NeedToGather) {
- if (allConstant(VL))
- return 0;
- if (isSplat(VL)) {
- return ReuseShuffleCost +
- TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
- }
- if (E->getOpcode() == Instruction::ExtractElement &&
- allSameType(VL) && allSameBlock(VL)) {
- Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
- if (ShuffleKind.hasValue()) {
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost =
+ TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ if (E->State == TreeEntry::NeedToGather) {
+ if (allConstant(VL))
+ return 0;
+ if (isSplat(VL)) {
+ return ReuseShuffleCost +
+ TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0);
+ }
+ if (E->getOpcode() == Instruction::ExtractElement &&
+ allSameType(VL) && allSameBlock(VL)) {
+ Optional<TargetTransformInfo::ShuffleKind> ShuffleKind = isShuffle(VL);
+ if (ShuffleKind.hasValue()) {
InstructionCost Cost =
TTI->getShuffleCost(ShuffleKind.getValue(), VecTy);
- for (auto *V : VL) {
-        // If all users of the instruction are going to be vectorized and this
-        // instruction itself is not going to be vectorized, consider this
-        // instruction dead and remove its cost from the final cost of the
-        // vectorized tree.
- if (areAllUsersVectorized(cast<Instruction>(V)) &&
- !ScalarToTreeEntry.count(V)) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
- IO->getZExtValue());
- }
- }
- return ReuseShuffleCost + Cost;
- }
- }
- return ReuseShuffleCost + getGatherCost(VL);
- }
+ for (auto *V : VL) {
+        // If all users of the instruction are going to be vectorized and this
+        // instruction itself is not going to be vectorized, consider this
+        // instruction dead and remove its cost from the final cost of the
+        // vectorized tree.
+ if (areAllUsersVectorized(cast<Instruction>(V)) &&
+ !ScalarToTreeEntry.count(V)) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Cost -= TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy,
+ IO->getZExtValue());
+ }
+ }
+ return ReuseShuffleCost + Cost;
+ }
+ }
+ return ReuseShuffleCost + getGatherCost(VL);
+ }
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
- Instruction *VL0 = E->getMainOp();
- unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
- switch (ShuffleOrOp) {
- case Instruction::PHI:
- return 0;
-
- case Instruction::ExtractValue:
- case Instruction::ExtractElement: {
- if (NeedToShuffleReuses) {
- unsigned Idx = 0;
- for (unsigned I : E->ReuseShuffleIndices) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(VL[I])->getIndexOperand());
- Idx = IO->getZExtValue();
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
- } else {
- ReuseShuffleCost -= TTI->getVectorInstrCost(
- Instruction::ExtractElement, VecTy, Idx);
- ++Idx;
- }
- }
- Idx = ReuseShuffleNumbers;
- for (Value *V : VL) {
- if (ShuffleOrOp == Instruction::ExtractElement) {
- auto *IO = cast<ConstantInt>(
- cast<ExtractElementInst>(V)->getIndexOperand());
- Idx = IO->getZExtValue();
- } else {
- --Idx;
- }
- ReuseShuffleCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
- }
- }
+ assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
+ Instruction *VL0 = E->getMainOp();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ switch (ShuffleOrOp) {
+ case Instruction::PHI:
+ return 0;
+
+ case Instruction::ExtractValue:
+ case Instruction::ExtractElement: {
+ if (NeedToShuffleReuses) {
+ unsigned Idx = 0;
+ for (unsigned I : E->ReuseShuffleIndices) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(VL[I])->getIndexOperand());
+ Idx = IO->getZExtValue();
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ } else {
+ ReuseShuffleCost -= TTI->getVectorInstrCost(
+ Instruction::ExtractElement, VecTy, Idx);
+ ++Idx;
+ }
+ }
+ Idx = ReuseShuffleNumbers;
+ for (Value *V : VL) {
+ if (ShuffleOrOp == Instruction::ExtractElement) {
+ auto *IO = cast<ConstantInt>(
+ cast<ExtractElementInst>(V)->getIndexOperand());
+ Idx = IO->getZExtValue();
+ } else {
+ --Idx;
+ }
+ ReuseShuffleCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
+ }
+ }
InstructionCost DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
for (unsigned I = 0, E = VL.size(); I < E; ++I) {
Instruction *EI = cast<Instruction>(VL[I]);
-      // If all users are going to be vectorized, the instruction can be
-      // considered dead.
-      // The same holds if it has only one user: it will be vectorized for sure.
+        // If all users are going to be vectorized, the instruction can be
+        // considered dead.
+        // The same holds if it has only one user: it will be vectorized for sure.
if (areAllUsersVectorized(EI)) {
- // Take credit for instruction that will become dead.
+ // Take credit for instruction that will become dead.
if (EI->hasOneUse()) {
Instruction *Ext = EI->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
Ext->getOpcode(), Ext->getType(), VecTy, I);
- // Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
Ext->getOpcode(), Ext->getType(), EI->getType(),
TTI::getCastContextHint(Ext), CostKind, Ext);
- continue;
- }
- }
- DeadCost -=
+ continue;
+ }
+ }
+ DeadCost -=
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, I);
- }
- }
- return DeadCost;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcTy = VL0->getOperand(0)->getType();
+ }
+ }
+ return DeadCost;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VL0->getOperand(0)->getType();
InstructionCost ScalarEltCost =
TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
-
- // Calculate the cost of this instruction.
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+
+ // Calculate the cost of this instruction.
InstructionCost ScalarCost = VL.size() * ScalarEltCost;
-
- auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
+
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
InstructionCost VecCost = 0;
- // Check if the values are candidates to demote.
- if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
+ // Check if the values are candidates to demote.
+ if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost =
ReuseShuffleCost +
TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
TTI::getCastContextHint(VL0), CostKind, VL0);
- }
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return VecCost - ScalarCost;
- }
- case Instruction::FCmp:
- case Instruction::ICmp:
- case Instruction::Select: {
- // Calculate the cost of this instruction.
+ return VecCost - ScalarCost;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp:
+ case Instruction::Select: {
+ // Calculate the cost of this instruction.
InstructionCost ScalarEltCost =
TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy, Builder.getInt1Ty(),
CmpInst::BAD_ICMP_PREDICATE, CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
- auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
// Check if all entries in VL are either compares or selects with compares
@@ -3656,103 +3656,103 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
VecCost = std::min(VecCost, IntrinsicCost);
}
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::FNeg:
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Certain instructions can be cheaper to vectorize if they have a
- // constant second vector operand.
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
- TargetTransformInfo::OperandValueProperties Op1VP =
- TargetTransformInfo::OP_None;
- TargetTransformInfo::OperandValueProperties Op2VP =
- TargetTransformInfo::OP_PowerOf2;
-
- // If all operands are exactly the same ConstantInt then set the
- // operand kind to OK_UniformConstantValue.
- // If instead not all operands are constants, then set the operand kind
- // to OK_AnyValue. If all operands are constants but not the same,
- // then set the operand kind to OK_NonUniformConstantValue.
- ConstantInt *CInt0 = nullptr;
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- const Instruction *I = cast<Instruction>(VL[i]);
- unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
- ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
- if (!CInt) {
- Op2VK = TargetTransformInfo::OK_AnyValue;
- Op2VP = TargetTransformInfo::OP_None;
- break;
- }
- if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
- !CInt->getValue().isPowerOf2())
- Op2VP = TargetTransformInfo::OP_None;
- if (i == 0) {
- CInt0 = CInt;
- continue;
- }
- if (CInt0 != CInt)
- Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
- }
-
- SmallVector<const Value *, 4> Operands(VL0->operand_values());
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::FNeg:
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Certain instructions can be cheaper to vectorize if they have a
+ // constant second vector operand.
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+ TargetTransformInfo::OperandValueProperties Op1VP =
+ TargetTransformInfo::OP_None;
+ TargetTransformInfo::OperandValueProperties Op2VP =
+ TargetTransformInfo::OP_PowerOf2;
+
+ // If all operands are exactly the same ConstantInt then set the
+ // operand kind to OK_UniformConstantValue.
+ // If instead not all operands are constants, then set the operand kind
+ // to OK_AnyValue. If all operands are constants but not the same,
+ // then set the operand kind to OK_NonUniformConstantValue.
+ ConstantInt *CInt0 = nullptr;
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ const Instruction *I = cast<Instruction>(VL[i]);
+ unsigned OpIdx = isa<BinaryOperator>(I) ? 1 : 0;
+ ConstantInt *CInt = dyn_cast<ConstantInt>(I->getOperand(OpIdx));
+ if (!CInt) {
+ Op2VK = TargetTransformInfo::OK_AnyValue;
+ Op2VP = TargetTransformInfo::OP_None;
+ break;
+ }
+ if (Op2VP == TargetTransformInfo::OP_PowerOf2 &&
+ !CInt->getValue().isPowerOf2())
+ Op2VP = TargetTransformInfo::OP_None;
+ if (i == 0) {
+ CInt0 = CInt;
+ continue;
+ }
+ if (CInt0 != CInt)
+ Op2VK = TargetTransformInfo::OK_NonUniformConstantValue;
+ }
+
+ SmallVector<const Value *, 4> Operands(VL0->operand_values());
InstructionCost ScalarEltCost =
TTI->getArithmeticInstrCost(E->getOpcode(), ScalarTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost =
TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind, Op1VK,
Op2VK, Op1VP, Op2VP, Operands, VL0);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::GetElementPtr: {
- TargetTransformInfo::OperandValueKind Op1VK =
- TargetTransformInfo::OK_AnyValue;
- TargetTransformInfo::OperandValueKind Op2VK =
- TargetTransformInfo::OK_UniformConstantValue;
-
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::GetElementPtr: {
+ TargetTransformInfo::OperandValueKind Op1VK =
+ TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Op2VK =
+ TargetTransformInfo::OK_UniformConstantValue;
+
InstructionCost ScalarEltCost = TTI->getArithmeticInstrCost(
Instruction::Add, ScalarTy, CostKind, Op1VK, Op2VK);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecCost = TTI->getArithmeticInstrCost(
Instruction::Add, VecTy, CostKind, Op1VK, Op2VK);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- case Instruction::Load: {
- // Cost of wide load - cost of scalar loads.
- Align alignment = cast<LoadInst>(VL0)->getAlign();
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ case Instruction::Load: {
+ // Cost of wide load - cost of scalar loads.
+ Align alignment = cast<LoadInst>(VL0)->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Load, ScalarTy, alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecLdCost;
if (E->State == TreeEntry::Vectorize) {
@@ -3764,220 +3764,220 @@ InstructionCost BoUpSLP::getEntryCost(TreeEntry *E) {
Instruction::Load, VecTy, cast<LoadInst>(VL0)->getPointerOperand(),
/*VariableMask=*/false, alignment, CostKind, VL0);
}
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecLdCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecLdCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecLdCost, ScalarLdCost));
- return ReuseShuffleCost + VecLdCost - ScalarLdCost;
- }
- case Instruction::Store: {
- // We know that we can merge the stores. Calculate the cost.
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI =
- cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- Align Alignment = SI->getAlign();
+ return ReuseShuffleCost + VecLdCost - ScalarLdCost;
+ }
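// Worked example with made-up unit costs, illustrative only: for a bundle of
// four scalar loads at cost 1 each, ScalarLdCost = 4. If the 4-wide vector
// load costs 1 and neither a reuse shuffle nor a reorder shuffle is needed,
// the entry contributes 0 + 1 - 4 = -3, i.e. vectorizing this bundle is
// expected to save three cost units.
static_assert(0 + 1 - 4 == -3, "reuse + vector - scalar cost for the example");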
+ case Instruction::Store: {
+ // We know that we can merge the stores. Calculate the cost.
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI =
+ cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
+ Align Alignment = SI->getAlign();
InstructionCost ScalarEltCost = TTI->getMemoryOpCost(
Instruction::Store, ScalarTy, Alignment, 0, CostKind, VL0);
- if (NeedToShuffleReuses)
- ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ if (NeedToShuffleReuses)
+ ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
InstructionCost ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
InstructionCost VecStCost = TTI->getMemoryOpCost(
Instruction::Store, VecTy, Alignment, 0, CostKind, VL0);
- if (IsReorder) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- VecStCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
+ if (IsReorder) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ VecStCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecStCost, ScalarStCost));
- return ReuseShuffleCost + VecStCost - ScalarStCost;
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- // Calculate the cost of the scalar and vector calls.
+ return ReuseShuffleCost + VecStCost - ScalarStCost;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
IntrinsicCostAttributes CostAttrs(ID, *CI, ElementCount::getFixed(1), 1);
InstructionCost ScalarEltCost =
TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
- if (NeedToShuffleReuses) {
- ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
- }
+ if (NeedToShuffleReuses) {
+ ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
+ }
InstructionCost ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
-
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
InstructionCost VecCallCost =
std::min(VecCallCosts.first, VecCallCosts.second);
-
- LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
- << " (" << VecCallCost << "-" << ScalarCallCost << ")"
- << " for " << *CI << "\n");
-
- return ReuseShuffleCost + VecCallCost - ScalarCallCost;
- }
- case Instruction::ShuffleVector: {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
- "Invalid Shuffle Vector Operand");
+
+ LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
+ << " (" << VecCallCost << "-" << ScalarCallCost << ")"
+ << " for " << *CI << "\n");
+
+ return ReuseShuffleCost + VecCallCost - ScalarCallCost;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
InstructionCost ScalarCost = 0;
- if (NeedToShuffleReuses) {
- for (unsigned Idx : E->ReuseShuffleIndices) {
- Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
- }
- }
- for (Value *V : VL) {
- Instruction *I = cast<Instruction>(V);
- assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(I, CostKind);
- }
- // VecCost is equal to sum of the cost of creating 2 vectors
- // and the cost of creating shuffle.
+ if (NeedToShuffleReuses) {
+ for (unsigned Idx : E->ReuseShuffleIndices) {
+ Instruction *I = cast<Instruction>(VL[Idx]);
+ ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
+ }
+ }
+ for (Value *V : VL) {
+ Instruction *I = cast<Instruction>(V);
+ assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
+ }
+ // VecCost is equal to sum of the cost of creating 2 vectors
+ // and the cost of creating shuffle.
InstructionCost VecCost = 0;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
- CostKind);
- } else {
- Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
- Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
- auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+ CostKind);
+ } else {
+ Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
+ Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
TTI::CastContextHint::None, CostKind);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
TTI::CastContextHint::None, CostKind);
- }
- VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
+ }
+ VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
LLVM_DEBUG(dumpTreeCosts(E, ReuseShuffleCost, VecCost, ScalarCost));
- return ReuseShuffleCost + VecCost - ScalarCost;
- }
- default:
- llvm_unreachable("Unknown instruction");
- }
-}
-
-bool BoUpSLP::isFullyVectorizableTinyTree() const {
- LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
-                    << VectorizableTree.size() << " is fully vectorizable.\n");
-
- // We only handle trees of heights 1 and 2.
- if (VectorizableTree.size() == 1 &&
- VectorizableTree[0]->State == TreeEntry::Vectorize)
- return true;
-
- if (VectorizableTree.size() != 2)
- return false;
-
- // Handle splat and all-constants stores.
- if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
- (allConstant(VectorizableTree[1]->Scalars) ||
- isSplat(VectorizableTree[1]->Scalars)))
- return true;
-
- // Gathering cost would be too much for tiny trees.
- if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
- VectorizableTree[1]->State == TreeEntry::NeedToGather)
- return false;
-
- return true;
-}
-
-static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
- TargetTransformInfo *TTI) {
- // Look past the root to find a source value. Arbitrarily follow the
- // path through operand 0 of any 'or'. Also, peek through optional
+ return ReuseShuffleCost + VecCost - ScalarCost;
+ }
+ default:
+ llvm_unreachable("Unknown instruction");
+ }
+}
+
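Throughout getEntryCost above, bundles whose lanes repeat (a non-empty ReuseShuffleIndices) credit the scalar cost of the duplicated lanes back against the shuffle correction. Below is a minimal, self-contained sketch of that bookkeeping only, using plain integers in place of InstructionCost; the names are illustrative and are not the LLVM API.

#include <cassert>
#include <cstdio>

// Stand-in for the adjustment used in several cases above:
//   ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
// applied when numUniqueScalars values are widened to reuseShuffleNumbers
// lanes by repeating elements.
static long reuseShuffleAdjustment(long reuseShuffleNumbers,
                                   long numUniqueScalars,
                                   long scalarEltCost) {
  return -(reuseShuffleNumbers - numUniqueScalars) * scalarEltCost;
}

int main() {
  // 4 unique scalars feeding an 8-lane vector, scalar cost 1 per element:
  // the 4 duplicated lanes are credited back against the shuffle cost.
  long adjustment = reuseShuffleAdjustment(8, 4, 1);
  assert(adjustment == -4);
  std::printf("reuse shuffle adjustment = %ld\n", adjustment);
  return 0;
}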
+bool BoUpSLP::isFullyVectorizableTinyTree() const {
+ LLVM_DEBUG(dbgs() << "SLP: Check whether the tree with height "
+                    << VectorizableTree.size() << " is fully vectorizable.\n");
+
+ // We only handle trees of heights 1 and 2.
+ if (VectorizableTree.size() == 1 &&
+ VectorizableTree[0]->State == TreeEntry::Vectorize)
+ return true;
+
+ if (VectorizableTree.size() != 2)
+ return false;
+
+ // Handle splat and all-constants stores.
+ if (VectorizableTree[0]->State == TreeEntry::Vectorize &&
+ (allConstant(VectorizableTree[1]->Scalars) ||
+ isSplat(VectorizableTree[1]->Scalars)))
+ return true;
+
+ // Gathering cost would be too much for tiny trees.
+ if (VectorizableTree[0]->State == TreeEntry::NeedToGather ||
+ VectorizableTree[1]->State == TreeEntry::NeedToGather)
+ return false;
+
+ return true;
+}
+
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+ TargetTransformInfo *TTI) {
+ // Look past the root to find a source value. Arbitrarily follow the
+ // path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-multiple-of-8-bits.
- Value *ZextLoad = Root;
+ Value *ZextLoad = Root;
const APInt *ShAmtC;
- while (!isa<ConstantExpr>(ZextLoad) &&
- (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ while (!isa<ConstantExpr>(ZextLoad) &&
+ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
(match(ZextLoad, m_Shl(m_Value(), m_APInt(ShAmtC))) &&
ShAmtC->urem(8) == 0)))
- ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
-
- // Check if the input is an extended load of the required or/shift expression.
- Value *LoadPtr;
- if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
- return false;
-
- // Require that the total load bit width is a legal integer type.
- // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
- // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
- Type *SrcTy = LoadPtr->getType()->getPointerElementType();
- unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
- return false;
-
- // Everything matched - assume that we can fold the whole sequence using
- // load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
- << *(cast<Instruction>(Root)) << "\n");
-
- return true;
-}
-
+ ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
+
+ // Check if the input is an extended load of the required or/shift expression.
+ Value *LoadPtr;
+ if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ return false;
+
+ // Require that the total load bit width is a legal integer type.
+ // For example, <8 x i8> --> i64 is a legal integer on a 64-bit target.
+ // But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
+ Type *SrcTy = LoadPtr->getType()->getPointerElementType();
+ unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
+ if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
+ return false;
+
+ // Everything matched - assume that we can fold the whole sequence using
+ // load combining.
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
+ << *(cast<Instruction>(Root)) << "\n");
+
+ return true;
+}
+
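isLoadCombineCandidateImpl above peeks through 'or' and byte-aligned 'shl' nodes, requires the chain to end in a zext(load), and then checks that the combined bit width is a legal integer. The stand-alone sketch below models that walk on a toy expression tree; the Node type and the fixed 8/16/32/64-bit legality rule are assumptions made for illustration, not the pass's TTI query.

#include <cstdio>

// Toy expression node; the real code walks llvm::Value chains via PatternMatch.
struct Node {
  enum Kind { Or, Shl, ZExtLoad, Other } kind;
  Node *op0 = nullptr;   // the operand the walk follows (operand 0)
  unsigned shiftAmt = 0; // for Shl
  unsigned loadBits = 0; // for ZExtLoad: width of the loaded element
};

static bool isLoadCombineCandidate(const Node *root, unsigned numElts) {
  const Node *n = root;
  // Peek through 'or' and shift-left-by-a-multiple-of-8 nodes.
  while (n && (n->kind == Node::Or ||
               (n->kind == Node::Shl && n->shiftAmt % 8 == 0)))
    n = n->op0;
  // The walk must have moved and must end in zext(load).
  if (n == root || !n || n->kind != Node::ZExtLoad)
    return false;
  // Stand-in for TTI->isTypeLegal on the combined integer width.
  unsigned totalBits = n->loadBits * numElts;
  return totalBits == 8 || totalBits == 16 || totalBits == 32 ||
         totalBits == 64;
}

int main() {
  Node load{Node::ZExtLoad, nullptr, 0, 8}; // zext(load i8)
  Node shl{Node::Shl, &load, 8, 0};         // shl ..., 8
  Node orNode{Node::Or, &shl, 0, 0};        // or ..., ...
  std::printf("<8 x i8> candidate: %d\n", isLoadCombineCandidate(&orNode, 8));
  std::printf("<16 x i8> candidate: %d\n", isLoadCombineCandidate(&orNode, 16));
  return 0;
}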
bool BoUpSLP::isLoadCombineReductionCandidate(RecurKind RdxKind) const {
if (RdxKind != RecurKind::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
- return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
-}
-
-bool BoUpSLP::isLoadCombineCandidate() const {
- // Peek through a final sequence of stores and check if all operations are
- // likely to be load-combined.
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- for (Value *Scalar : VectorizableTree[0]->Scalars) {
- Value *X;
- if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
- !isLoadCombineCandidateImpl(X, NumElts, TTI))
- return false;
- }
- return true;
-}
-
-bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
- // We can vectorize the tree if its size is greater than or equal to the
- // minimum size specified by the MinTreeSize command line option.
- if (VectorizableTree.size() >= MinTreeSize)
- return false;
-
- // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
- // can vectorize it if we can prove it fully vectorizable.
- if (isFullyVectorizableTinyTree())
- return false;
-
- assert(VectorizableTree.empty()
- ? ExternalUses.empty()
- : true && "We shouldn't have any external users");
-
- // Otherwise, we can't vectorize the tree. It is both tiny and not fully
- // vectorizable.
- return true;
-}
-
+ return false;
+
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+ // Peek through a final sequence of stores and check if all operations are
+ // likely to be load-combined.
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ Value *X;
+ if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+ !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ return false;
+ }
+ return true;
+}
+
+bool BoUpSLP::isTreeTinyAndNotFullyVectorizable() const {
+ // We can vectorize the tree if its size is greater than or equal to the
+ // minimum size specified by the MinTreeSize command line option.
+ if (VectorizableTree.size() >= MinTreeSize)
+ return false;
+
+ // If we have a tiny tree (a tree whose size is less than MinTreeSize), we
+ // can vectorize it if we can prove it fully vectorizable.
+ if (isFullyVectorizableTinyTree())
+ return false;
+
+ assert(VectorizableTree.empty()
+ ? ExternalUses.empty()
+ : true && "We shouldn't have any external users");
+
+ // Otherwise, we can't vectorize the tree. It is both tiny and not fully
+ // vectorizable.
+ return true;
+}
+
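Taken together, the two predicates above say: trees at or above MinTreeSize are always attempted, while smaller trees must be provably profitable (a single vectorizable entry, a vectorizable root fed by a splat or all-constant operand, or two entries with no gathers). A condensed, LLVM-free model of that decision follows; the TinyTree record and its flag are simplifications for illustration only.

#include <cstdio>
#include <vector>

enum class State { Vectorize, NeedToGather };

// Reduced view of the tree: one state per entry plus a flag telling whether
// the second entry is a splat or all-constant operand of the first.
struct TinyTree {
  std::vector<State> entries;
  bool secondIsSplatOrAllConstant = false;
};

static bool fullyVectorizableTiny(const TinyTree &t) {
  if (t.entries.size() == 1 && t.entries[0] == State::Vectorize)
    return true;
  if (t.entries.size() != 2)
    return false;
  if (t.entries[0] == State::Vectorize && t.secondIsSplatOrAllConstant)
    return true;
  return t.entries[0] != State::NeedToGather &&
         t.entries[1] != State::NeedToGather;
}

static bool tinyAndNotFullyVectorizable(const TinyTree &t,
                                        unsigned minTreeSize) {
  if (t.entries.size() >= minTreeSize)
    return false;                    // large enough: always worth trying
  return !fullyVectorizableTiny(t);  // tiny: must be provably profitable
}

int main() {
  TinyTree splatStore{{State::Vectorize, State::NeedToGather}, true};
  TinyTree gatherPair{{State::Vectorize, State::NeedToGather}, false};
  std::printf("splat store rejected: %d\n",
              tinyAndNotFullyVectorizable(splatStore, 3)); // 0: kept
  std::printf("gather pair rejected: %d\n",
              tinyAndNotFullyVectorizable(gatherPair, 3)); // 1: rejected
  return 0;
}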
InstructionCost BoUpSLP::getSpillCost() const {
- // Walk from the bottom of the tree to the top, tracking which values are
- // live. When we see a call instruction that is not part of our tree,
- // query TTI to see if there is a cost to keeping values live over it
- // (for example, if spills and fills are required).
- unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
+ // Walk from the bottom of the tree to the top, tracking which values are
+ // live. When we see a call instruction that is not part of our tree,
+ // query TTI to see if there is a cost to keeping values live over it
+ // (for example, if spills and fills are required).
+ unsigned BundleWidth = VectorizableTree.front()->Scalars.size();
InstructionCost Cost = 0;
-
- SmallPtrSet<Instruction*, 4> LiveValues;
- Instruction *PrevInst = nullptr;
-
+
+ SmallPtrSet<Instruction*, 4> LiveValues;
+ Instruction *PrevInst = nullptr;
+
// The entries in VectorizableTree are not necessarily ordered by their
// position in basic blocks. Collect them and order them by dominance so later
// instructions are guaranteed to be visited first. For instructions in
@@ -3985,273 +3985,273 @@ InstructionCost BoUpSLP::getSpillCost() const {
// their order does not matter, as long as all instructions in a basic block
// are grouped together. Using dominance ensures a deterministic order.
SmallVector<Instruction *, 16> OrderedScalars;
- for (const auto &TEPtr : VectorizableTree) {
- Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
- if (!Inst)
- continue;
+ for (const auto &TEPtr : VectorizableTree) {
+ Instruction *Inst = dyn_cast<Instruction>(TEPtr->Scalars[0]);
+ if (!Inst)
+ continue;
OrderedScalars.push_back(Inst);
}
llvm::stable_sort(OrderedScalars, [this](Instruction *A, Instruction *B) {
return DT->dominates(B, A);
});
-
+
for (Instruction *Inst : OrderedScalars) {
- if (!PrevInst) {
- PrevInst = Inst;
- continue;
- }
-
- // Update LiveValues.
- LiveValues.erase(PrevInst);
- for (auto &J : PrevInst->operands()) {
- if (isa<Instruction>(&*J) && getTreeEntry(&*J))
- LiveValues.insert(cast<Instruction>(&*J));
- }
-
- LLVM_DEBUG({
- dbgs() << "SLP: #LV: " << LiveValues.size();
- for (auto *X : LiveValues)
- dbgs() << " " << X->getName();
- dbgs() << ", Looking at ";
- Inst->dump();
- });
-
- // Now find the sequence of instructions between PrevInst and Inst.
- unsigned NumCalls = 0;
- BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
- PrevInstIt =
- PrevInst->getIterator().getReverse();
- while (InstIt != PrevInstIt) {
- if (PrevInstIt == PrevInst->getParent()->rend()) {
- PrevInstIt = Inst->getParent()->rbegin();
- continue;
- }
-
- // Debug information does not impact spill cost.
- if ((isa<CallInst>(&*PrevInstIt) &&
- !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
- &*PrevInstIt != PrevInst)
- NumCalls++;
-
- ++PrevInstIt;
- }
-
- if (NumCalls) {
- SmallVector<Type*, 4> V;
- for (auto *II : LiveValues)
- V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
- Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
- }
-
- PrevInst = Inst;
- }
-
- return Cost;
-}
-
+ if (!PrevInst) {
+ PrevInst = Inst;
+ continue;
+ }
+
+ // Update LiveValues.
+ LiveValues.erase(PrevInst);
+ for (auto &J : PrevInst->operands()) {
+ if (isa<Instruction>(&*J) && getTreeEntry(&*J))
+ LiveValues.insert(cast<Instruction>(&*J));
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "SLP: #LV: " << LiveValues.size();
+ for (auto *X : LiveValues)
+ dbgs() << " " << X->getName();
+ dbgs() << ", Looking at ";
+ Inst->dump();
+ });
+
+ // Now find the sequence of instructions between PrevInst and Inst.
+ unsigned NumCalls = 0;
+ BasicBlock::reverse_iterator InstIt = ++Inst->getIterator().getReverse(),
+ PrevInstIt =
+ PrevInst->getIterator().getReverse();
+ while (InstIt != PrevInstIt) {
+ if (PrevInstIt == PrevInst->getParent()->rend()) {
+ PrevInstIt = Inst->getParent()->rbegin();
+ continue;
+ }
+
+ // Debug information does not impact spill cost.
+ if ((isa<CallInst>(&*PrevInstIt) &&
+ !isa<DbgInfoIntrinsic>(&*PrevInstIt)) &&
+ &*PrevInstIt != PrevInst)
+ NumCalls++;
+
+ ++PrevInstIt;
+ }
+
+ if (NumCalls) {
+ SmallVector<Type*, 4> V;
+ for (auto *II : LiveValues)
+ V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
+ Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
+ }
+
+ PrevInst = Inst;
+ }
+
+ return Cost;
+}
+
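getSpillCost above walks the tree scalars bottom-up and charges a cost whenever a non-debug call sits between two consecutive tree instructions while tree values are live across it. The sketch below models that walk over a single basic block; charging one unit per live value per call is a simplification of TTI->getCostOfKeepingLiveOverCall, and the Inst record is purely illustrative.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Inst {
  std::string name;
  bool isCall = false;
  std::vector<const Inst *> operands; // operands that are also tree scalars
};

// Walk the block bottom-up (mirroring the dominance-ordered walk above) and
// charge one unit per live tree value for every call between tree members.
static long spillCost(const std::vector<const Inst *> &block,
                      const std::set<const Inst *> &tree) {
  long cost = 0;
  std::set<const Inst *> live;  // tree values live at the current point
  bool seenTreeInst = false;
  for (auto it = block.rbegin(); it != block.rend(); ++it) {
    const Inst *inst = *it;
    if (tree.count(inst)) {
      live.erase(inst);                       // defined here, dead above this
      for (const Inst *op : inst->operands)
        if (tree.count(op))
          live.insert(op);                    // operands stay live above
      seenTreeInst = true;
    } else if (seenTreeInst && inst->isCall) {
      cost += static_cast<long>(live.size()); // values kept live over a call
    }
  }
  return cost;
}

int main() {
  Inst a{"a"}, callInst{"call", true}, b{"b"};
  b.operands = {&a};
  std::vector<const Inst *> block = {&a, &callInst, &b};
  std::printf("spill cost = %ld\n", spillCost(block, {&a, &b})); // prints 1
  return 0;
}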
InstructionCost BoUpSLP::getTreeCost() {
InstructionCost Cost = 0;
- LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
- << VectorizableTree.size() << ".\n");
-
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
- for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
- TreeEntry &TE = *VectorizableTree[I].get();
-
- // We create duplicate tree entries for gather sequences that have multiple
- // uses. However, we should not compute the cost of duplicate sequences.
- // For example, if we have a build vector (i.e., insertelement sequence)
- // that is used by more than one vector instruction, we only need to
- // compute the cost of the insertelement instructions once. The redundant
- // instructions will be eliminated by CSE.
- //
- // We should consider not creating duplicate tree entries for gather
- // sequences, and instead add additional edges to the tree representing
- // their uses. Since such an approach results in fewer total entries,
- // existing heuristics based on tree size may yield different results.
- //
- if (TE.State == TreeEntry::NeedToGather &&
- std::any_of(std::next(VectorizableTree.begin(), I + 1),
- VectorizableTree.end(),
- [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
- return EntryPtr->State == TreeEntry::NeedToGather &&
- EntryPtr->isSame(TE.Scalars);
- }))
- continue;
-
+ LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
+ << VectorizableTree.size() << ".\n");
+
+ unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
+
+ for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
+ TreeEntry &TE = *VectorizableTree[I].get();
+
+ // We create duplicate tree entries for gather sequences that have multiple
+ // uses. However, we should not compute the cost of duplicate sequences.
+ // For example, if we have a build vector (i.e., insertelement sequence)
+ // that is used by more than one vector instruction, we only need to
+ // compute the cost of the insertelement instructions once. The redundant
+ // instructions will be eliminated by CSE.
+ //
+ // We should consider not creating duplicate tree entries for gather
+ // sequences, and instead add additional edges to the tree representing
+ // their uses. Since such an approach results in fewer total entries,
+ // existing heuristics based on tree size may yield different results.
+ //
+ if (TE.State == TreeEntry::NeedToGather &&
+ std::any_of(std::next(VectorizableTree.begin(), I + 1),
+ VectorizableTree.end(),
+ [TE](const std::unique_ptr<TreeEntry> &EntryPtr) {
+ return EntryPtr->State == TreeEntry::NeedToGather &&
+ EntryPtr->isSame(TE.Scalars);
+ }))
+ continue;
+
InstructionCost C = getEntryCost(&TE);
Cost += C;
- LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
- << " for bundle that starts with " << *TE.Scalars[0]
+ LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C
+ << " for bundle that starts with " << *TE.Scalars[0]
<< ".\n"
<< "SLP: Current total cost = " << Cost << "\n");
- }
-
- SmallPtrSet<Value *, 16> ExtractCostCalculated;
+ }
+
+ SmallPtrSet<Value *, 16> ExtractCostCalculated;
InstructionCost ExtractCost = 0;
- for (ExternalUser &EU : ExternalUses) {
- // We only add extract cost once for the same scalar.
- if (!ExtractCostCalculated.insert(EU.Scalar).second)
- continue;
-
- // Uses by ephemeral values are free (because the ephemeral value will be
- // removed prior to code generation, and so the extraction will be
- // removed as well).
- if (EphValues.count(EU.User))
- continue;
-
- // If we plan to rewrite the tree in a smaller type, we will need to sign
- // extend the extracted value back to the original type. Here, we account
- // for the extract and the added cost of the sign extend if needed.
- auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto Extend =
- MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
- VecTy = FixedVectorType::get(MinTy, BundleWidth);
- ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
- VecTy, EU.Lane);
- } else {
- ExtractCost +=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
- }
- }
-
+ for (ExternalUser &EU : ExternalUses) {
+ // We only add extract cost once for the same scalar.
+ if (!ExtractCostCalculated.insert(EU.Scalar).second)
+ continue;
+
+ // Uses by ephemeral values are free (because the ephemeral value will be
+ // removed prior to code generation, and so the extraction will be
+ // removed as well).
+ if (EphValues.count(EU.User))
+ continue;
+
+ // If we plan to rewrite the tree in a smaller type, we will need to sign
+ // extend the extracted value back to the original type. Here, we account
+ // for the extract and the added cost of the sign extend if needed.
+ auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto Extend =
+ MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
+ VecTy = FixedVectorType::get(MinTy, BundleWidth);
+ ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
+ VecTy, EU.Lane);
+ } else {
+ ExtractCost +=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, EU.Lane);
+ }
+ }
+
InstructionCost SpillCost = getSpillCost();
- Cost += SpillCost + ExtractCost;
-
+ Cost += SpillCost + ExtractCost;
+
#ifndef NDEBUG
SmallString<256> Str;
- {
+ {
raw_svector_ostream OS(Str);
- OS << "SLP: Spill Cost = " << SpillCost << ".\n"
- << "SLP: Extract Cost = " << ExtractCost << ".\n"
- << "SLP: Total Cost = " << Cost << ".\n";
- }
- LLVM_DEBUG(dbgs() << Str);
- if (ViewSLPTree)
- ViewGraph(this, "SLP" + F->getName(), false, Str);
+ OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+ << "SLP: Extract Cost = " << ExtractCost << ".\n"
+ << "SLP: Total Cost = " << Cost << ".\n";
+ }
+ LLVM_DEBUG(dbgs() << Str);
+ if (ViewSLPTree)
+ ViewGraph(this, "SLP" + F->getName(), false, Str);
#endif
-
- return Cost;
-}
-
+
+ return Cost;
+}
+
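getTreeCost above sums the per-entry costs while costing duplicate gather sequences only once, then adds the extract and spill terms. A toy aggregation with plain integers in place of InstructionCost; the string key stands in for the isSame(TE.Scalars) comparison and is an assumption of this sketch.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

struct Entry {
  bool isGather;
  std::string scalarsKey; // stands in for "another gather isSame(TE.Scalars)"
  long cost;
};

// Sum per-entry costs, costing duplicate gather sequences only once (CSE
// removes the redundant ones), then add the extract and spill terms.
static long treeCost(const std::vector<Entry> &entries, long extractCost,
                     long spillCost) {
  long cost = 0;
  std::set<std::string> seenGathers;
  for (const Entry &e : entries) {
    if (e.isGather && !seenGathers.insert(e.scalarsKey).second)
      continue; // duplicate gather: already costed
    cost += e.cost;
  }
  return cost + extractCost + spillCost;
}

int main() {
  std::vector<Entry> entries = {
      {false, "", 4}, {true, "g0", 3}, {true, "g0", 3}};
  // 4 + 3 (second "g0" gather skipped) + extract 2 + spill 1 = 10
  std::printf("tree cost = %ld\n", treeCost(entries, 2, 1));
  return 0;
}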
InstructionCost
BoUpSLP::getGatherCost(FixedVectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
- unsigned NumElts = Ty->getNumElements();
- APInt DemandedElts = APInt::getNullValue(NumElts);
+ unsigned NumElts = Ty->getNumElements();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
for (unsigned I = 0; I < NumElts; ++I)
if (!ShuffledIndices.count(I))
DemandedElts.setBit(I);
InstructionCost Cost =
TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
/*Extract*/ false);
- if (!ShuffledIndices.empty())
- Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
- return Cost;
-}
-
+ if (!ShuffledIndices.empty())
+ Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
+ return Cost;
+}
+
InstructionCost BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
- // Find the type of the operands in VL.
- Type *ScalarTy = VL[0]->getType();
- if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
- ScalarTy = SI->getValueOperand()->getType();
- auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
- // Find the cost of inserting/extracting values from the vector.
- // Check if the same elements are inserted several times and count them as
- // shuffle candidates.
- DenseSet<unsigned> ShuffledElements;
- DenseSet<Value *> UniqueElements;
-  // Iterate in reverse order to consider insert elements with the highest cost.
- for (unsigned I = VL.size(); I > 0; --I) {
- unsigned Idx = I - 1;
- if (!UniqueElements.insert(VL[Idx]).second)
- ShuffledElements.insert(Idx);
- }
- return getGatherCost(VecTy, ShuffledElements);
-}
-
-// Perform operand reordering on the instructions in VL and return the reordered
-// operands in Left and Right.
-void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
- SmallVectorImpl<Value *> &Left,
- SmallVectorImpl<Value *> &Right,
- const DataLayout &DL,
- ScalarEvolution &SE,
- const BoUpSLP &R) {
- if (VL.empty())
- return;
- VLOperands Ops(VL, DL, SE, R);
- // Reorder the operands in place.
- Ops.reorder();
- Left = Ops.getVL(0);
- Right = Ops.getVL(1);
-}
-
-void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
- // Get the basic block this bundle is in. All instructions in the bundle
- // should be in this block.
- auto *Front = E->getMainOp();
- auto *BB = Front->getParent();
+ // Find the type of the operands in VL.
+ Type *ScalarTy = VL[0]->getType();
+ if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
+ ScalarTy = SI->getValueOperand()->getType();
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ // Find the cost of inserting/extracting values from the vector.
+ // Check if the same elements are inserted several times and count them as
+ // shuffle candidates.
+ DenseSet<unsigned> ShuffledElements;
+ DenseSet<Value *> UniqueElements;
+  // Iterate in reverse order to consider insert elements with the highest cost.
+ for (unsigned I = VL.size(); I > 0; --I) {
+ unsigned Idx = I - 1;
+ if (!UniqueElements.insert(VL[Idx]).second)
+ ShuffledElements.insert(Idx);
+ }
+ return getGatherCost(VecTy, ShuffledElements);
+}
+
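The reverse walk in getGatherCost above keeps the later occurrence of each repeated scalar (the more expensive insertelement lane) and marks earlier repeats as shuffle candidates. Below is a small stand-alone model of just that index bookkeeping, with strings standing in for Values and the actual cost query omitted.

#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Walk the bundle in reverse so the later (higher, more expensive) lane of
// each repeated value is kept and earlier repeats become shuffle candidates.
static std::set<unsigned> shuffledElements(const std::vector<std::string> &vl) {
  std::set<unsigned> shuffled;
  std::set<std::string> unique;
  for (auto i = vl.size(); i > 0; --i) {
    unsigned idx = static_cast<unsigned>(i - 1);
    if (!unique.insert(vl[idx]).second)
      shuffled.insert(idx); // value reappears later: fetch it via a shuffle
  }
  return shuffled;
}

int main() {
  for (unsigned idx : shuffledElements({"a", "b", "a", "c"}))
    std::printf("shuffled lane %u\n", idx); // prints: shuffled lane 0
  return 0;
}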
+// Perform operand reordering on the instructions in VL and return the reordered
+// operands in Left and Right.
+void BoUpSLP::reorderInputsAccordingToOpcode(ArrayRef<Value *> VL,
+ SmallVectorImpl<Value *> &Left,
+ SmallVectorImpl<Value *> &Right,
+ const DataLayout &DL,
+ ScalarEvolution &SE,
+ const BoUpSLP &R) {
+ if (VL.empty())
+ return;
+ VLOperands Ops(VL, DL, SE, R);
+ // Reorder the operands in place.
+ Ops.reorder();
+ Left = Ops.getVL(0);
+ Right = Ops.getVL(1);
+}
+
+void BoUpSLP::setInsertPointAfterBundle(TreeEntry *E) {
+ // Get the basic block this bundle is in. All instructions in the bundle
+ // should be in this block.
+ auto *Front = E->getMainOp();
+ auto *BB = Front->getParent();
assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool {
auto *I = cast<Instruction>(V);
return !E->isOpcodeOrAlt(I) || I->getParent() == BB;
}));
-
- // The last instruction in the bundle in program order.
- Instruction *LastInst = nullptr;
-
- // Find the last instruction. The common case should be that BB has been
- // scheduled, and the last instruction is VL.back(). So we start with
- // VL.back() and iterate over schedule data until we reach the end of the
- // bundle. The end of the bundle is marked by null ScheduleData.
- if (BlocksSchedules.count(BB)) {
- auto *Bundle =
- BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
- if (Bundle && Bundle->isPartOfBundle())
- for (; Bundle; Bundle = Bundle->NextInBundle)
- if (Bundle->OpValue == Bundle->Inst)
- LastInst = Bundle->Inst;
- }
-
- // LastInst can still be null at this point if there's either not an entry
- // for BB in BlocksSchedules or there's no ScheduleData available for
- // VL.back(). This can be the case if buildTree_rec aborts for various
- // reasons (e.g., the maximum recursion depth is reached, the maximum region
- // size is reached, etc.). ScheduleData is initialized in the scheduling
- // "dry-run".
- //
- // If this happens, we can still find the last instruction by brute force. We
- // iterate forwards from Front (inclusive) until we either see all
- // instructions in the bundle or reach the end of the block. If Front is the
- // last instruction in program order, LastInst will be set to Front, and we
- // will visit all the remaining instructions in the block.
- //
- // One of the reasons we exit early from buildTree_rec is to place an upper
- // bound on compile-time. Thus, taking an additional compile-time hit here is
- // not ideal. However, this should be exceedingly rare since it requires that
- // we both exit early from buildTree_rec and that the bundle be out-of-order
- // (causing us to iterate all the way to the end of the block).
- if (!LastInst) {
- SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
- for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
- if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
- LastInst = &I;
- if (Bundle.empty())
- break;
- }
- }
- assert(LastInst && "Failed to find last instruction in bundle");
-
- // Set the insertion point after the last instruction in the bundle. Set the
- // debug location to Front.
- Builder.SetInsertPoint(BB, ++LastInst->getIterator());
- Builder.SetCurrentDebugLocation(Front->getDebugLoc());
-}
-
+
+ // The last instruction in the bundle in program order.
+ Instruction *LastInst = nullptr;
+
+ // Find the last instruction. The common case should be that BB has been
+ // scheduled, and the last instruction is VL.back(). So we start with
+ // VL.back() and iterate over schedule data until we reach the end of the
+ // bundle. The end of the bundle is marked by null ScheduleData.
+ if (BlocksSchedules.count(BB)) {
+ auto *Bundle =
+ BlocksSchedules[BB]->getScheduleData(E->isOneOf(E->Scalars.back()));
+ if (Bundle && Bundle->isPartOfBundle())
+ for (; Bundle; Bundle = Bundle->NextInBundle)
+ if (Bundle->OpValue == Bundle->Inst)
+ LastInst = Bundle->Inst;
+ }
+
+ // LastInst can still be null at this point if there's either not an entry
+ // for BB in BlocksSchedules or there's no ScheduleData available for
+ // VL.back(). This can be the case if buildTree_rec aborts for various
+ // reasons (e.g., the maximum recursion depth is reached, the maximum region
+ // size is reached, etc.). ScheduleData is initialized in the scheduling
+ // "dry-run".
+ //
+ // If this happens, we can still find the last instruction by brute force. We
+ // iterate forwards from Front (inclusive) until we either see all
+ // instructions in the bundle or reach the end of the block. If Front is the
+ // last instruction in program order, LastInst will be set to Front, and we
+ // will visit all the remaining instructions in the block.
+ //
+ // One of the reasons we exit early from buildTree_rec is to place an upper
+ // bound on compile-time. Thus, taking an additional compile-time hit here is
+ // not ideal. However, this should be exceedingly rare since it requires that
+ // we both exit early from buildTree_rec and that the bundle be out-of-order
+ // (causing us to iterate all the way to the end of the block).
+ if (!LastInst) {
+ SmallPtrSet<Value *, 16> Bundle(E->Scalars.begin(), E->Scalars.end());
+ for (auto &I : make_range(BasicBlock::iterator(Front), BB->end())) {
+ if (Bundle.erase(&I) && E->isOpcodeOrAlt(&I))
+ LastInst = &I;
+ if (Bundle.empty())
+ break;
+ }
+ }
+ assert(LastInst && "Failed to find last instruction in bundle");
+
+ // Set the insertion point after the last instruction in the bundle. Set the
+ // debug location to Front.
+ Builder.SetInsertPoint(BB, ++LastInst->getIterator());
+ Builder.SetCurrentDebugLocation(Front->getDebugLoc());
+}
+
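When no schedule data is available, setInsertPointAfterBundle above falls back to a forward scan from the first bundle member, remembering the last member it sees and inserting right after it. The following LLVM-free sketch shows only that fallback on a block of named instructions; the isOpcodeOrAlt filter is omitted for brevity.

#include <cassert>
#include <cstddef>
#include <cstdio>
#include <set>
#include <string>
#include <vector>

// Scan forward from the first bundle member and remember the last bundle
// member seen; the vectorized bundle is inserted right after it.
static std::size_t lastBundleIndex(const std::vector<std::string> &block,
                                   std::set<std::string> bundle,
                                   std::size_t frontIndex) {
  std::size_t last = frontIndex;
  for (std::size_t i = frontIndex; i < block.size() && !bundle.empty(); ++i)
    if (bundle.erase(block[i]))
      last = i; // latest bundle member encountered so far
  return last;  // insertion point goes after block[last]
}

int main() {
  std::vector<std::string> block = {"x", "a", "y", "b", "z"};
  std::size_t last = lastBundleIndex(block, {"a", "b"}, 1);
  assert(last == 3);
  std::printf("insert after block[%zu] = %s\n", last, block[last].c_str());
  return 0;
}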
Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
Value *Val0 =
isa<StoreInst>(VL[0]) ? cast<StoreInst>(VL[0])->getValueOperand() : VL[0];
@@ -4274,337 +4274,337 @@ Value *BoUpSLP::gather(ArrayRef<Value *> VL) {
if (!Entry->ReuseShuffleIndices.empty()) {
FoundLane = std::distance(Entry->ReuseShuffleIndices.begin(),
find(Entry->ReuseShuffleIndices, FoundLane));
- }
+ }
ExternalUses.push_back(ExternalUser(Val, InsElt, FoundLane));
- }
- }
-
- return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
- InstructionsState S = getSameOpcode(VL);
- if (S.getOpcode()) {
- if (TreeEntry *E = getTreeEntry(S.OpValue)) {
- if (E->isSame(VL)) {
- Value *V = vectorizeTree(E);
- if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
- // We need to get the vectorized value but without shuffle.
- if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
- V = SV->getOperand(0);
- } else {
- // Reshuffle to get only unique values.
- SmallVector<int, 4> UniqueIdxs;
- SmallSet<int, 4> UsedIdxs;
- for (int Idx : E->ReuseShuffleIndices)
- if (UsedIdxs.insert(Idx).second)
- UniqueIdxs.emplace_back(Idx);
+ }
+ }
+
+ return Vec;
+}
+
+Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
+ InstructionsState S = getSameOpcode(VL);
+ if (S.getOpcode()) {
+ if (TreeEntry *E = getTreeEntry(S.OpValue)) {
+ if (E->isSame(VL)) {
+ Value *V = vectorizeTree(E);
+ if (VL.size() == E->Scalars.size() && !E->ReuseShuffleIndices.empty()) {
+ // We need to get the vectorized value but without shuffle.
+ if (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
+ V = SV->getOperand(0);
+ } else {
+ // Reshuffle to get only unique values.
+ SmallVector<int, 4> UniqueIdxs;
+ SmallSet<int, 4> UsedIdxs;
+ for (int Idx : E->ReuseShuffleIndices)
+ if (UsedIdxs.insert(Idx).second)
+ UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UniqueIdxs);
- }
- }
- return V;
- }
- }
- }
-
- // Check that every instruction appears once in this bundle.
- SmallVector<int, 4> ReuseShuffleIndicies;
- SmallVector<Value *, 4> UniqueValues;
- if (VL.size() > 2) {
- DenseMap<Value *, unsigned> UniquePositions;
- for (Value *V : VL) {
- auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
- ReuseShuffleIndicies.emplace_back(Res.first->second);
- if (Res.second || isa<Constant>(V))
- UniqueValues.emplace_back(V);
- }
-    // Do not shuffle a single element or if the number of unique values is
-    // not a power of 2.
- if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
- !llvm::isPowerOf2_32(UniqueValues.size()))
- ReuseShuffleIndicies.clear();
- else
- VL = UniqueValues;
- }
-
+ }
+ }
+ return V;
+ }
+ }
+ }
+
+ // Check that every instruction appears once in this bundle.
+ SmallVector<int, 4> ReuseShuffleIndicies;
+ SmallVector<Value *, 4> UniqueValues;
+ if (VL.size() > 2) {
+ DenseMap<Value *, unsigned> UniquePositions;
+ for (Value *V : VL) {
+ auto Res = UniquePositions.try_emplace(V, UniqueValues.size());
+ ReuseShuffleIndicies.emplace_back(Res.first->second);
+ if (Res.second || isa<Constant>(V))
+ UniqueValues.emplace_back(V);
+ }
+    // Do not shuffle a single element or if the number of unique values is
+    // not a power of 2.
+ if (UniqueValues.size() == VL.size() || UniqueValues.size() <= 1 ||
+ !llvm::isPowerOf2_32(UniqueValues.size()))
+ ReuseShuffleIndicies.clear();
+ else
+ VL = UniqueValues;
+ }
+
Value *Vec = gather(VL);
- if (!ReuseShuffleIndicies.empty()) {
+ if (!ReuseShuffleIndicies.empty()) {
Vec = Builder.CreateShuffleVector(Vec, ReuseShuffleIndicies, "shuffle");
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
return Vec;
-}
-
-Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
- IRBuilder<>::InsertPointGuard Guard(Builder);
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
- return E->VectorizedValue;
- }
-
- bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
- if (E->State == TreeEntry::NeedToGather) {
- setInsertPointAfterBundle(E);
+}
+
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
+ IRBuilder<>::InsertPointGuard Guard(Builder);
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
+ return E->VectorizedValue;
+ }
+
+ bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
+ if (E->State == TreeEntry::NeedToGather) {
+ setInsertPointAfterBundle(E);
Value *Vec = gather(E->Scalars);
- if (NeedToShuffleReuses) {
+ if (NeedToShuffleReuses) {
Vec = Builder.CreateShuffleVector(Vec, E->ReuseShuffleIndices, "shuffle");
if (auto *I = dyn_cast<Instruction>(Vec)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
- }
+ GatherSeq.insert(I);
+ CSEBlocks.insert(I->getParent());
+ }
+ }
E->VectorizedValue = Vec;
return Vec;
- }
-
+ }
+
assert((E->State == TreeEntry::Vectorize ||
E->State == TreeEntry::ScatterVectorize) &&
"Unhandled state");
- unsigned ShuffleOrOp =
- E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
+ unsigned ShuffleOrOp =
+ E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
Instruction *VL0 = E->getMainOp();
Type *ScalarTy = VL0->getType();
if (auto *Store = dyn_cast<StoreInst>(VL0))
ScalarTy = Store->getValueOperand()->getType();
auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
- switch (ShuffleOrOp) {
- case Instruction::PHI: {
- auto *PH = cast<PHINode>(VL0);
- Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
- Value *V = NewPhi;
+ switch (ShuffleOrOp) {
+ case Instruction::PHI: {
+ auto *PH = cast<PHINode>(VL0);
+ Builder.SetInsertPoint(PH->getParent()->getFirstNonPHI());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues());
+ Value *V = NewPhi;
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
-
- // PHINodes may have multiple entries from the same block. We want to
- // visit every block once.
- SmallPtrSet<BasicBlock*, 4> VisitedBBs;
-
- for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
- ValueList Operands;
- BasicBlock *IBB = PH->getIncomingBlock(i);
-
- if (!VisitedBBs.insert(IBB).second) {
- NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
- continue;
- }
-
- Builder.SetInsertPoint(IBB->getTerminator());
- Builder.SetCurrentDebugLocation(PH->getDebugLoc());
- Value *Vec = vectorizeTree(E->getOperand(i));
- NewPhi->addIncoming(Vec, IBB);
- }
-
- assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
- "Invalid number of incoming values");
- return V;
- }
-
- case Instruction::ExtractElement: {
- Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
+ E->VectorizedValue = V;
+
+ // PHINodes may have multiple entries from the same block. We want to
+ // visit every block once.
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+
+ for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
+ ValueList Operands;
+ BasicBlock *IBB = PH->getIncomingBlock(i);
+
+ if (!VisitedBBs.insert(IBB).second) {
+ NewPhi->addIncoming(NewPhi->getIncomingValueForBlock(IBB), IBB);
+ continue;
+ }
+
+ Builder.SetInsertPoint(IBB->getTerminator());
+ Builder.SetCurrentDebugLocation(PH->getDebugLoc());
+ Value *Vec = vectorizeTree(E->getOperand(i));
+ NewPhi->addIncoming(Vec, IBB);
+ }
+
+ assert(NewPhi->getNumIncomingValues() == PH->getNumIncomingValues() &&
+ "Invalid number of incoming values");
+ return V;
+ }
+
+ case Instruction::ExtractElement: {
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- return V;
- }
- case Instruction::ExtractValue: {
+ }
+ E->VectorizedValue = V;
+ return V;
+ }
+ case Instruction::ExtractValue: {
auto *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
+ Builder.SetInsertPoint(LI);
auto *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
- Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
NewV = Builder.CreateShuffleVector(NewV, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
NewV = Builder.CreateShuffleVector(NewV, E->ReuseShuffleIndices,
"shuffle");
- }
- E->VectorizedValue = NewV;
- return NewV;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- setInsertPointAfterBundle(E);
-
- Value *InVec = vectorizeTree(E->getOperand(0));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- auto *CI = cast<CastInst>(VL0);
- Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
+ }
+ E->VectorizedValue = NewV;
+ return NewV;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ setInsertPointAfterBundle(E);
+
+ Value *InVec = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ auto *CI = cast<CastInst>(VL0);
+ Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FCmp:
- case Instruction::ICmp: {
- setInsertPointAfterBundle(E);
-
- Value *L = vectorizeTree(E->getOperand(0));
- Value *R = vectorizeTree(E->getOperand(1));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V = Builder.CreateCmp(P0, L, R);
- propagateIRFlags(V, E->Scalars, VL0);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FCmp:
+ case Instruction::ICmp: {
+ setInsertPointAfterBundle(E);
+
+ Value *L = vectorizeTree(E->getOperand(0));
+ Value *R = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
+ Value *V = Builder.CreateCmp(P0, L, R);
+ propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::Select: {
- setInsertPointAfterBundle(E);
-
- Value *Cond = vectorizeTree(E->getOperand(0));
- Value *True = vectorizeTree(E->getOperand(1));
- Value *False = vectorizeTree(E->getOperand(2));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateSelect(Cond, True, False);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Select: {
+ setInsertPointAfterBundle(E);
+
+ Value *Cond = vectorizeTree(E->getOperand(0));
+ Value *True = vectorizeTree(E->getOperand(1));
+ Value *False = vectorizeTree(E->getOperand(2));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateSelect(Cond, True, False);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::FNeg: {
- setInsertPointAfterBundle(E);
-
- Value *Op = vectorizeTree(E->getOperand(0));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateUnOp(
- static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::FNeg: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op = vectorizeTree(E->getOperand(0));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateUnOp(
+ static_cast<Instruction::UnaryOps>(E->getOpcode()), Op);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- setInsertPointAfterBundle(E);
-
- Value *LHS = vectorizeTree(E->getOperand(0));
- Value *RHS = vectorizeTree(E->getOperand(1));
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
- RHS);
- propagateIRFlags(V, E->Scalars, VL0);
- if (auto *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ setInsertPointAfterBundle(E);
+
+ Value *LHS = vectorizeTree(E->getOperand(0));
+ Value *RHS = vectorizeTree(E->getOperand(1));
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS,
+ RHS);
+ propagateIRFlags(V, E->Scalars, VL0);
+ if (auto *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Load: {
- // Loads are inserted at the head of the tree because we don't want to
- // sink them all the way down past store instructions.
- bool IsReorder = E->updateStateIfReorder();
- if (IsReorder)
- VL0 = E->getMainOp();
- setInsertPointAfterBundle(E);
-
- LoadInst *LI = cast<LoadInst>(VL0);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Load: {
+ // Loads are inserted at the head of the tree because we don't want to
+ // sink them all the way down past store instructions.
+ bool IsReorder = E->updateStateIfReorder();
+ if (IsReorder)
+ VL0 = E->getMainOp();
+ setInsertPointAfterBundle(E);
+
+ LoadInst *LI = cast<LoadInst>(VL0);
Instruction *NewLI;
- unsigned AS = LI->getPointerAddressSpace();
+ unsigned AS = LI->getPointerAddressSpace();
Value *PO = LI->getPointerOperand();
if (E->State == TreeEntry::Vectorize) {
-
+
Value *VecPtr = Builder.CreateBitCast(PO, VecTy->getPointerTo(AS));
-
+
// The pointer operand uses an in-tree scalar so we add the new BitCast
// to ExternalUses list to make sure that an extract will be generated
// in the future.
if (getTreeEntry(PO))
ExternalUses.emplace_back(PO, cast<User>(VecPtr), 0);
-
+
NewLI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
} else {
assert(E->State == TreeEntry::ScatterVectorize && "Unhandled state");
@@ -4618,922 +4618,922 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
Value *V = propagateMetadata(NewLI, E->Scalars);
- if (IsReorder) {
- SmallVector<int, 4> Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ if (IsReorder) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, Mask, "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
+ }
+ if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::Store: {
- bool IsReorder = !E->ReorderIndices.empty();
- auto *SI = cast<StoreInst>(
- IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
- unsigned AS = SI->getPointerAddressSpace();
-
- setInsertPointAfterBundle(E);
-
- Value *VecValue = vectorizeTree(E->getOperand(0));
- if (IsReorder) {
- SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
- E->ReorderIndices.end());
+ }
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::Store: {
+ bool IsReorder = !E->ReorderIndices.empty();
+ auto *SI = cast<StoreInst>(
+ IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
+ unsigned AS = SI->getPointerAddressSpace();
+
+ setInsertPointAfterBundle(E);
+
+ Value *VecValue = vectorizeTree(E->getOperand(0));
+ if (IsReorder) {
+ SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(VecValue, Mask, "reorder_shuf");
- }
- Value *ScalarPtr = SI->getPointerOperand();
- Value *VecPtr = Builder.CreateBitCast(
- ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
- SI->getAlign());
-
- // The pointer operand uses an in-tree scalar, so add the new BitCast to
- // ExternalUses to make sure that an extract will be generated in the
- // future.
- if (getTreeEntry(ScalarPtr))
- ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
-
- Value *V = propagateMetadata(ST, E->Scalars);
+ }
+ Value *ScalarPtr = SI->getPointerOperand();
+ Value *VecPtr = Builder.CreateBitCast(
+ ScalarPtr, VecValue->getType()->getPointerTo(AS));
+ StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
+ SI->getAlign());
+
+ // The pointer operand uses an in-tree scalar, so add the new BitCast to
+ // ExternalUses to make sure that an extract will be generated in the
+ // future.
+ if (getTreeEntry(ScalarPtr))
+ ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
+
+ Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::GetElementPtr: {
- setInsertPointAfterBundle(E);
-
- Value *Op0 = vectorizeTree(E->getOperand(0));
-
- std::vector<Value *> OpVecs;
- for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
- ++j) {
- ValueList &VL = E->getOperand(j);
- // Need to cast all elements to the same type before vectorization to
-        // avoid a crash.
- Type *VL0Ty = VL0->getOperand(j)->getType();
- Type *Ty = llvm::all_of(
- VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
- ? VL0Ty
- : DL->getIndexType(cast<GetElementPtrInst>(VL0)
- ->getPointerOperandType()
- ->getScalarType());
- for (Value *&V : VL) {
- auto *CI = cast<ConstantInt>(V);
- V = ConstantExpr::getIntegerCast(CI, Ty,
- CI->getValue().isSignBitSet());
- }
- Value *OpVec = vectorizeTree(VL);
- OpVecs.push_back(OpVec);
- }
-
- Value *V = Builder.CreateGEP(
- cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
-
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::GetElementPtr: {
+ setInsertPointAfterBundle(E);
+
+ Value *Op0 = vectorizeTree(E->getOperand(0));
+
+ std::vector<Value *> OpVecs;
+ for (int j = 1, e = cast<GetElementPtrInst>(VL0)->getNumOperands(); j < e;
+ ++j) {
+ ValueList &VL = E->getOperand(j);
+ // Need to cast all elements to the same type before vectorization to
+        // avoid a crash.
+ Type *VL0Ty = VL0->getOperand(j)->getType();
+ Type *Ty = llvm::all_of(
+ VL, [VL0Ty](Value *V) { return VL0Ty == V->getType(); })
+ ? VL0Ty
+ : DL->getIndexType(cast<GetElementPtrInst>(VL0)
+ ->getPointerOperandType()
+ ->getScalarType());
+ for (Value *&V : VL) {
+ auto *CI = cast<ConstantInt>(V);
+ V = ConstantExpr::getIntegerCast(CI, Ty,
+ CI->getValue().isSignBitSet());
+ }
+ Value *OpVec = vectorizeTree(VL);
+ OpVecs.push_back(OpVec);
+ }
+
+ Value *V = Builder.CreateGEP(
+ cast<GetElementPtrInst>(VL0)->getSourceElementType(), Op0, OpVecs);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
+
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- case Instruction::Call: {
- CallInst *CI = cast<CallInst>(VL0);
- setInsertPointAfterBundle(E);
-
- Intrinsic::ID IID = Intrinsic::not_intrinsic;
- if (Function *FI = CI->getCalledFunction())
- IID = FI->getIntrinsicID();
-
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
-
- auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
- bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
- VecCallCosts.first <= VecCallCosts.second;
-
- Value *ScalarArg = nullptr;
- std::vector<Value *> OpVecs;
- for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
- ValueList OpVL;
- // Some intrinsics have scalar arguments. This argument should not be
- // vectorized.
- if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
- CallInst *CEI = cast<CallInst>(VL0);
- ScalarArg = CEI->getArgOperand(j);
- OpVecs.push_back(CEI->getArgOperand(j));
- continue;
- }
-
- Value *OpVec = vectorizeTree(E->getOperand(j));
- LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
- OpVecs.push_back(OpVec);
- }
-
- Function *CF;
- if (!UseIntrinsic) {
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ case Instruction::Call: {
+ CallInst *CI = cast<CallInst>(VL0);
+ setInsertPointAfterBundle(E);
+
+ Intrinsic::ID IID = Intrinsic::not_intrinsic;
+ if (Function *FI = CI->getCalledFunction())
+ IID = FI->getIntrinsicID();
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
+ Value *ScalarArg = nullptr;
+ std::vector<Value *> OpVecs;
+ for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
+ ValueList OpVL;
+ // Some intrinsics have scalar arguments. This argument should not be
+ // vectorized.
+ if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
+ CallInst *CEI = cast<CallInst>(VL0);
+ ScalarArg = CEI->getArgOperand(j);
+ OpVecs.push_back(CEI->getArgOperand(j));
+ continue;
+ }
+
+ Value *OpVec = vectorizeTree(E->getOperand(j));
+ LLVM_DEBUG(dbgs() << "SLP: OpVec[" << j << "]: " << *OpVec << "\n");
+ OpVecs.push_back(OpVec);
+ }
+
+ Function *CF;
+ if (!UseIntrinsic) {
VFShape Shape =
VFShape::get(*CI, ElementCount::getFixed(static_cast<unsigned>(
VecTy->getNumElements())),
false /*HasGlobalPred*/);
- CF = VFDatabase(*CI).getVectorizedFunction(Shape);
- } else {
- Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
- CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
- }
-
- SmallVector<OperandBundleDef, 1> OpBundles;
- CI->getOperandBundlesAsDefs(OpBundles);
- Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
-
- // The scalar argument uses an in-tree scalar so we add the new vectorized
- // call to ExternalUses list to make sure that an extract will be
- // generated in the future.
- if (ScalarArg && getTreeEntry(ScalarArg))
- ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
-
- propagateIRFlags(V, E->Scalars, VL0);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ }
+
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CI->getOperandBundlesAsDefs(OpBundles);
+ Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
+
+ // The scalar argument uses an in-tree scalar so we add the new vectorized
+      // call to the ExternalUses list to make sure that an extract will be
+ // generated in the future.
+ if (ScalarArg && getTreeEntry(ScalarArg))
+ ExternalUses.push_back(ExternalUser(ScalarArg, cast<User>(V), 0));
+
+ propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
- return V;
- }
- case Instruction::ShuffleVector: {
- assert(E->isAltShuffle() &&
- ((Instruction::isBinaryOp(E->getOpcode()) &&
- Instruction::isBinaryOp(E->getAltOpcode())) ||
- (Instruction::isCast(E->getOpcode()) &&
- Instruction::isCast(E->getAltOpcode()))) &&
- "Invalid Shuffle Vector Operand");
-
- Value *LHS = nullptr, *RHS = nullptr;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
- RHS = vectorizeTree(E->getOperand(1));
- } else {
- setInsertPointAfterBundle(E);
- LHS = vectorizeTree(E->getOperand(0));
- }
-
- if (E->VectorizedValue) {
- LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
- return E->VectorizedValue;
- }
-
- Value *V0, *V1;
- if (Instruction::isBinaryOp(E->getOpcode())) {
- V0 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
- V1 = Builder.CreateBinOp(
- static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
- } else {
- V0 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
- V1 = Builder.CreateCast(
- static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
- }
-
- // Create shuffle to take alternate operations from the vector.
- // Also, gather up main and alt scalar ops to propagate IR flags to
- // each vector operation.
- ValueList OpScalars, AltScalars;
- unsigned e = E->Scalars.size();
- SmallVector<int, 8> Mask(e);
- for (unsigned i = 0; i < e; ++i) {
- auto *OpInst = cast<Instruction>(E->Scalars[i]);
- assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
- if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = e + i;
- AltScalars.push_back(E->Scalars[i]);
- } else {
- Mask[i] = i;
- OpScalars.push_back(E->Scalars[i]);
- }
- }
-
- propagateIRFlags(V0, OpScalars);
- propagateIRFlags(V1, AltScalars);
-
- Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
- if (Instruction *I = dyn_cast<Instruction>(V))
- V = propagateMetadata(I, E->Scalars);
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+ return V;
+ }
+ case Instruction::ShuffleVector: {
+ assert(E->isAltShuffle() &&
+ ((Instruction::isBinaryOp(E->getOpcode()) &&
+ Instruction::isBinaryOp(E->getAltOpcode())) ||
+ (Instruction::isCast(E->getOpcode()) &&
+ Instruction::isCast(E->getAltOpcode()))) &&
+ "Invalid Shuffle Vector Operand");
+
+ Value *LHS = nullptr, *RHS = nullptr;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ RHS = vectorizeTree(E->getOperand(1));
+ } else {
+ setInsertPointAfterBundle(E);
+ LHS = vectorizeTree(E->getOperand(0));
+ }
+
+ if (E->VectorizedValue) {
+ LLVM_DEBUG(dbgs() << "SLP: Diamond merged for " << *VL0 << ".\n");
+ return E->VectorizedValue;
+ }
+
+ Value *V0, *V1;
+ if (Instruction::isBinaryOp(E->getOpcode())) {
+ V0 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getOpcode()), LHS, RHS);
+ V1 = Builder.CreateBinOp(
+ static_cast<Instruction::BinaryOps>(E->getAltOpcode()), LHS, RHS);
+ } else {
+ V0 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getOpcode()), LHS, VecTy);
+ V1 = Builder.CreateCast(
+ static_cast<Instruction::CastOps>(E->getAltOpcode()), LHS, VecTy);
+ }
+
+ // Create shuffle to take alternate operations from the vector.
+ // Also, gather up main and alt scalar ops to propagate IR flags to
+ // each vector operation.
+ ValueList OpScalars, AltScalars;
+ unsigned e = E->Scalars.size();
+ SmallVector<int, 8> Mask(e);
+ for (unsigned i = 0; i < e; ++i) {
+ auto *OpInst = cast<Instruction>(E->Scalars[i]);
+ assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
+ if (OpInst->getOpcode() == E->getAltOpcode()) {
+ Mask[i] = e + i;
+ AltScalars.push_back(E->Scalars[i]);
+ } else {
+ Mask[i] = i;
+ OpScalars.push_back(E->Scalars[i]);
+ }
+ }
+
+ propagateIRFlags(V0, OpScalars);
+ propagateIRFlags(V1, AltScalars);
+
+ Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses)
V = Builder.CreateShuffleVector(V, E->ReuseShuffleIndices, "shuffle");
- E->VectorizedValue = V;
- ++NumVectorInstructions;
-
- return V;
- }
- default:
- llvm_unreachable("unknown inst");
- }
- return nullptr;
-}
-
-Value *BoUpSLP::vectorizeTree() {
- ExtraValueToDebugLocsMap ExternallyUsedValues;
- return vectorizeTree(ExternallyUsedValues);
-}
-
-Value *
-BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
- // All blocks must be scheduled before any instructions are inserted.
- for (auto &BSIter : BlocksSchedules) {
- scheduleBlock(BSIter.second.get());
- }
-
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
-
- // If the vectorized tree can be rewritten in a smaller type, we truncate the
- // vectorized root. InstCombine will then rewrite the entire expression. We
- // sign extend the extracted values below.
- auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
- if (MinBWs.count(ScalarRoot)) {
- if (auto *I = dyn_cast<Instruction>(VectorRoot))
- Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
- auto BundleWidth = VectorizableTree[0]->Scalars.size();
- auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
- auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
- VectorizableTree[0]->VectorizedValue = Trunc;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
- << " values .\n");
-
- // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
- // specified by ScalarType.
- auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
- if (!MinBWs.count(ScalarRoot))
- return Ex;
- if (MinBWs[ScalarRoot].second)
- return Builder.CreateSExt(Ex, ScalarType);
- return Builder.CreateZExt(Ex, ScalarType);
- };
-
- // Extract all of the elements with the external uses.
- for (const auto &ExternalUse : ExternalUses) {
- Value *Scalar = ExternalUse.Scalar;
- llvm::User *User = ExternalUse.User;
-
- // Skip users that we already RAUW. This happens when one instruction
- // has multiple uses of the same value.
- if (User && !is_contained(Scalar->users(), User))
- continue;
- TreeEntry *E = getTreeEntry(Scalar);
- assert(E && "Invalid scalar");
+ E->VectorizedValue = V;
+ ++NumVectorInstructions;
+
+ return V;
+ }
+ default:
+ llvm_unreachable("unknown inst");
+ }
+ return nullptr;
+}
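
Illustrative sketch (not part of this commit): in the ShuffleVector case above, the mask selects lane i from V0 when scalar i uses the main opcode and lane e + i from V1 when it uses the alternate opcode. A standalone toy model of that mask construction, with made-up lane data:

#include <cstdio>
#include <vector>

int main() {
  // Toy stand-in for E->Scalars: true means the lane uses the alternate opcode.
  std::vector<bool> IsAltOp = {false, true, false, true};
  unsigned e = IsAltOp.size();
  std::vector<int> Mask(e);
  for (unsigned i = 0; i < e; ++i)
    // Lane i comes from V0 (mask value i) for the main opcode and from V1
    // (mask value e + i) for the alternate opcode.
    Mask[i] = IsAltOp[i] ? static_cast<int>(e + i) : static_cast<int>(i);
  for (int M : Mask)
    std::printf("%d ", M); // prints: 0 5 2 7
  std::printf("\n");
  return 0;
}
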
+
+Value *BoUpSLP::vectorizeTree() {
+ ExtraValueToDebugLocsMap ExternallyUsedValues;
+ return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
+ // All blocks must be scheduled before any instructions are inserted.
+ for (auto &BSIter : BlocksSchedules) {
+ scheduleBlock(BSIter.second.get());
+ }
+
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ auto *VectorRoot = vectorizeTree(VectorizableTree[0].get());
+
+ // If the vectorized tree can be rewritten in a smaller type, we truncate the
+ // vectorized root. InstCombine will then rewrite the entire expression. We
+ // sign extend the extracted values below.
+ auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
+ if (MinBWs.count(ScalarRoot)) {
+ if (auto *I = dyn_cast<Instruction>(VectorRoot))
+ Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
+ auto BundleWidth = VectorizableTree[0]->Scalars.size();
+ auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
+ auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
+ auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
+ VectorizableTree[0]->VectorizedValue = Trunc;
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Extracting " << ExternalUses.size()
+ << " values .\n");
+
+ // If necessary, sign-extend or zero-extend ScalarRoot to the larger type
+ // specified by ScalarType.
+ auto extend = [&](Value *ScalarRoot, Value *Ex, Type *ScalarType) {
+ if (!MinBWs.count(ScalarRoot))
+ return Ex;
+ if (MinBWs[ScalarRoot].second)
+ return Builder.CreateSExt(Ex, ScalarType);
+ return Builder.CreateZExt(Ex, ScalarType);
+ };
+
+ // Extract all of the elements with the external uses.
+ for (const auto &ExternalUse : ExternalUses) {
+ Value *Scalar = ExternalUse.Scalar;
+ llvm::User *User = ExternalUse.User;
+
+ // Skip users that we already RAUW. This happens when one instruction
+ // has multiple uses of the same value.
+ if (User && !is_contained(Scalar->users(), User))
+ continue;
+ TreeEntry *E = getTreeEntry(Scalar);
+ assert(E && "Invalid scalar");
assert(E->State != TreeEntry::NeedToGather &&
"Extracting from a gather list");
-
- Value *Vec = E->VectorizedValue;
- assert(Vec && "Can't find vectorizable value");
-
- Value *Lane = Builder.getInt32(ExternalUse.Lane);
-    // If User == nullptr, the Scalar is used as an extra argument. Generate
-    // an ExtractElement instruction and update the record for this scalar in
- // ExternallyUsedValues.
- if (!User) {
- assert(ExternallyUsedValues.count(Scalar) &&
- "Scalar with nullptr as an external user must be registered in "
- "ExternallyUsedValues map");
- if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
- } else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- }
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
- auto &Locs = ExternallyUsedValues[Scalar];
- ExternallyUsedValues.insert({Ex, Locs});
- ExternallyUsedValues.erase(Scalar);
- // Required to update internally referenced instructions.
- Scalar->replaceAllUsesWith(Ex);
- continue;
- }
-
- // Generate extracts for out-of-tree users.
- // Find the insertion point for the extractelement lane.
- if (auto *VecI = dyn_cast<Instruction>(Vec)) {
- if (PHINode *PH = dyn_cast<PHINode>(User)) {
- for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
- if (PH->getIncomingValue(i) == Scalar) {
- Instruction *IncomingTerminator =
- PH->getIncomingBlock(i)->getTerminator();
- if (isa<CatchSwitchInst>(IncomingTerminator)) {
- Builder.SetInsertPoint(VecI->getParent(),
- std::next(VecI->getIterator()));
- } else {
- Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
- }
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(PH->getIncomingBlock(i));
- PH->setOperand(i, Ex);
- }
- }
- } else {
- Builder.SetInsertPoint(cast<Instruction>(User));
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(cast<Instruction>(User)->getParent());
- User->replaceUsesOfWith(Scalar, Ex);
- }
- } else {
- Builder.SetInsertPoint(&F->getEntryBlock().front());
- Value *Ex = Builder.CreateExtractElement(Vec, Lane);
- Ex = extend(ScalarRoot, Ex, Scalar->getType());
- CSEBlocks.insert(&F->getEntryBlock());
- User->replaceUsesOfWith(Scalar, Ex);
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
- }
-
- // For each vectorized value:
- for (auto &TEPtr : VectorizableTree) {
- TreeEntry *Entry = TEPtr.get();
-
- // No need to handle users of gathered values.
- if (Entry->State == TreeEntry::NeedToGather)
- continue;
-
- assert(Entry->VectorizedValue && "Can't find vectorizable value");
-
- // For each lane:
- for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
- Value *Scalar = Entry->Scalars[Lane];
-
-#ifndef NDEBUG
- Type *Ty = Scalar->getType();
- if (!Ty->isVoidTy()) {
- for (User *U : Scalar->users()) {
- LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
-
- // It is legal to delete users in the ignorelist.
- assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
- "Deleting out-of-tree value");
- }
- }
-#endif
- LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
- eraseInstruction(cast<Instruction>(Scalar));
- }
- }
-
- Builder.ClearInsertionPoint();
- InstrElementSize.clear();
-
- return VectorizableTree[0]->VectorizedValue;
-}
-
-void BoUpSLP::optimizeGatherSequence() {
- LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
- << " gather sequences instructions.\n");
- // LICM InsertElementInst sequences.
- for (Instruction *I : GatherSeq) {
- if (isDeleted(I))
- continue;
-
- // Check if this block is inside a loop.
- Loop *L = LI->getLoopFor(I->getParent());
- if (!L)
- continue;
-
- // Check if it has a preheader.
- BasicBlock *PreHeader = L->getLoopPreheader();
- if (!PreHeader)
- continue;
-
-    // If the vector, or the element that we insert into it, is an
-    // instruction defined in this basic block, then we can't hoist
-    // this instruction.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (Op0 && L->contains(Op0))
- continue;
- if (Op1 && L->contains(Op1))
- continue;
-
- // We can hoist this instruction. Move it to the pre-header.
- I->moveBefore(PreHeader->getTerminator());
- }
-
- // Make a list of all reachable blocks in our CSE queue.
- SmallVector<const DomTreeNode *, 8> CSEWorkList;
- CSEWorkList.reserve(CSEBlocks.size());
- for (BasicBlock *BB : CSEBlocks)
- if (DomTreeNode *N = DT->getNode(BB)) {
- assert(DT->isReachableFromEntry(N));
- CSEWorkList.push_back(N);
- }
-
- // Sort blocks by domination. This ensures we visit a block after all blocks
- // dominating it are visited.
- llvm::stable_sort(CSEWorkList,
- [this](const DomTreeNode *A, const DomTreeNode *B) {
- return DT->properlyDominates(A, B);
- });
-
- // Perform O(N^2) search over the gather sequences and merge identical
- // instructions. TODO: We can further optimize this scan if we split the
- // instructions into different buckets based on the insert lane.
- SmallVector<Instruction *, 16> Visited;
- for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
+
+ Value *Vec = E->VectorizedValue;
+ assert(Vec && "Can't find vectorizable value");
+
+ Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as an extra argument. Generate
+    // an ExtractElement instruction and update the record for this scalar in
+ // ExternallyUsedValues.
+ if (!User) {
+ assert(ExternallyUsedValues.count(Scalar) &&
+ "Scalar with nullptr as an external user must be registered in "
+ "ExternallyUsedValues map");
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+ auto &Locs = ExternallyUsedValues[Scalar];
+ ExternallyUsedValues.insert({Ex, Locs});
+ ExternallyUsedValues.erase(Scalar);
+ // Required to update internally referenced instructions.
+ Scalar->replaceAllUsesWith(Ex);
+ continue;
+ }
+
+ // Generate extracts for out-of-tree users.
+ // Find the insertion point for the extractelement lane.
+ if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+ if (PHINode *PH = dyn_cast<PHINode>(User)) {
+ for (int i = 0, e = PH->getNumIncomingValues(); i != e; ++i) {
+ if (PH->getIncomingValue(i) == Scalar) {
+ Instruction *IncomingTerminator =
+ PH->getIncomingBlock(i)->getTerminator();
+ if (isa<CatchSwitchInst>(IncomingTerminator)) {
+ Builder.SetInsertPoint(VecI->getParent(),
+ std::next(VecI->getIterator()));
+ } else {
+ Builder.SetInsertPoint(PH->getIncomingBlock(i)->getTerminator());
+ }
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(PH->getIncomingBlock(i));
+ PH->setOperand(i, Ex);
+ }
+ }
+ } else {
+ Builder.SetInsertPoint(cast<Instruction>(User));
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(cast<Instruction>(User)->getParent());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+ } else {
+ Builder.SetInsertPoint(&F->getEntryBlock().front());
+ Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+ Ex = extend(ScalarRoot, Ex, Scalar->getType());
+ CSEBlocks.insert(&F->getEntryBlock());
+ User->replaceUsesOfWith(Scalar, Ex);
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Replaced:" << *User << ".\n");
+ }
+
+ // For each vectorized value:
+ for (auto &TEPtr : VectorizableTree) {
+ TreeEntry *Entry = TEPtr.get();
+
+ // No need to handle users of gathered values.
+ if (Entry->State == TreeEntry::NeedToGather)
+ continue;
+
+ assert(Entry->VectorizedValue && "Can't find vectorizable value");
+
+ // For each lane:
+ for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) {
+ Value *Scalar = Entry->Scalars[Lane];
+
+#ifndef NDEBUG
+ Type *Ty = Scalar->getType();
+ if (!Ty->isVoidTy()) {
+ for (User *U : Scalar->users()) {
+ LLVM_DEBUG(dbgs() << "SLP: \tvalidating user:" << *U << ".\n");
+
+ // It is legal to delete users in the ignorelist.
+ assert((getTreeEntry(U) || is_contained(UserIgnoreList, U)) &&
+ "Deleting out-of-tree value");
+ }
+ }
+#endif
+ LLVM_DEBUG(dbgs() << "SLP: \tErasing scalar:" << *Scalar << ".\n");
+ eraseInstruction(cast<Instruction>(Scalar));
+ }
+ }
+
+ Builder.ClearInsertionPoint();
+ InstrElementSize.clear();
+
+ return VectorizableTree[0]->VectorizedValue;
+}
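
Illustrative sketch (not part of this commit): the extend lambda above widens each extracted scalar back to its original type, using sext when MinBWs recorded that the sign bit may be set and zext otherwise. A toy scalar model of that choice; the 8-bit and 32-bit widths are made up:

#include <cstdint>
#include <cstdio>

// 'SignBitNeeded' plays the role of MinBWs[ScalarRoot].second: the value was
// computed in a narrow width and must be widened back without losing its sign.
static int32_t extendBack(uint8_t Narrow, bool SignBitNeeded) {
  if (SignBitNeeded)
    return static_cast<int32_t>(static_cast<int8_t>(Narrow)); // sext
  return static_cast<int32_t>(Narrow);                        // zext
}

int main() {
  std::printf("%d %d\n", extendBack(0xF0, true), extendBack(0xF0, false));
  // prints: -16 240
  return 0;
}
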
+
+void BoUpSLP::optimizeGatherSequence() {
+ LLVM_DEBUG(dbgs() << "SLP: Optimizing " << GatherSeq.size()
+ << " gather sequences instructions.\n");
+ // LICM InsertElementInst sequences.
+ for (Instruction *I : GatherSeq) {
+ if (isDeleted(I))
+ continue;
+
+ // Check if this block is inside a loop.
+ Loop *L = LI->getLoopFor(I->getParent());
+ if (!L)
+ continue;
+
+ // Check if it has a preheader.
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ if (!PreHeader)
+ continue;
+
+    // If the vector, or the element that we insert into it, is an
+    // instruction defined in this basic block, then we can't hoist
+    // this instruction.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (Op0 && L->contains(Op0))
+ continue;
+ if (Op1 && L->contains(Op1))
+ continue;
+
+ // We can hoist this instruction. Move it to the pre-header.
+ I->moveBefore(PreHeader->getTerminator());
+ }
+
+ // Make a list of all reachable blocks in our CSE queue.
+ SmallVector<const DomTreeNode *, 8> CSEWorkList;
+ CSEWorkList.reserve(CSEBlocks.size());
+ for (BasicBlock *BB : CSEBlocks)
+ if (DomTreeNode *N = DT->getNode(BB)) {
+ assert(DT->isReachableFromEntry(N));
+ CSEWorkList.push_back(N);
+ }
+
+ // Sort blocks by domination. This ensures we visit a block after all blocks
+ // dominating it are visited.
+ llvm::stable_sort(CSEWorkList,
+ [this](const DomTreeNode *A, const DomTreeNode *B) {
+ return DT->properlyDominates(A, B);
+ });
+
+ // Perform O(N^2) search over the gather sequences and merge identical
+ // instructions. TODO: We can further optimize this scan if we split the
+ // instructions into different buckets based on the insert lane.
+ SmallVector<Instruction *, 16> Visited;
+ for (auto I = CSEWorkList.begin(), E = CSEWorkList.end(); I != E; ++I) {
assert(*I &&
(I == CSEWorkList.begin() || !DT->dominates(*I, *std::prev(I))) &&
- "Worklist not sorted properly!");
- BasicBlock *BB = (*I)->getBlock();
- // For all instructions in blocks containing gather sequences:
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
- Instruction *In = &*it++;
- if (isDeleted(In))
- continue;
- if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
- continue;
-
- // Check if we can replace this instruction with any of the
- // visited instructions.
- for (Instruction *v : Visited) {
- if (In->isIdenticalTo(v) &&
- DT->dominates(v->getParent(), In->getParent())) {
- In->replaceAllUsesWith(v);
- eraseInstruction(In);
- In = nullptr;
- break;
- }
- }
- if (In) {
- assert(!is_contained(Visited, In));
- Visited.push_back(In);
- }
- }
- }
- CSEBlocks.clear();
- GatherSeq.clear();
-}
-
-// Groups the instructions into a bundle (which is then a single scheduling
-// entity) and schedules instructions until the bundle becomes ready.
-Optional<BoUpSLP::ScheduleData *>
-BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
- const InstructionsState &S) {
- if (isa<PHINode>(S.OpValue))
- return nullptr;
-
- // Initialize the instruction bundle.
- Instruction *OldScheduleEnd = ScheduleEnd;
- ScheduleData *PrevInBundle = nullptr;
- ScheduleData *Bundle = nullptr;
- bool ReSchedule = false;
- LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
-
- // Make sure that the scheduling region contains all
- // instructions of the bundle.
- for (Value *V : VL) {
- if (!extendSchedulingRegion(V, S))
- return None;
- }
-
- for (Value *V : VL) {
- ScheduleData *BundleMember = getScheduleData(V);
- assert(BundleMember &&
- "no ScheduleData for bundle member (maybe not in same basic block)");
- if (BundleMember->IsScheduled) {
-      // A bundle member was scheduled as a single instruction before and now
- // needs to be scheduled as part of the bundle. We just get rid of the
- // existing schedule.
- LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
- << " was already scheduled\n");
- ReSchedule = true;
- }
- assert(BundleMember->isSchedulingEntity() &&
- "bundle member already part of other bundle");
- if (PrevInBundle) {
- PrevInBundle->NextInBundle = BundleMember;
- } else {
- Bundle = BundleMember;
- }
- BundleMember->UnscheduledDepsInBundle = 0;
- Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
-
-    // Group the instructions into a bundle.
- BundleMember->FirstInBundle = Bundle;
- PrevInBundle = BundleMember;
- }
- if (ScheduleEnd != OldScheduleEnd) {
- // The scheduling region got new instructions at the lower end (or it is a
- // new region for the first bundle). This makes it necessary to
- // recalculate all dependencies.
- // It is seldom that this needs to be done a second time after adding the
- // initial bundle to the region.
- for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [](ScheduleData *SD) {
- SD->clearDependencies();
- });
- }
- ReSchedule = true;
- }
- if (ReSchedule) {
- resetSchedule();
- initialFillReadyList(ReadyInsts);
- }
- assert(Bundle && "Failed to find schedule bundle");
-
- LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
- << BB->getName() << "\n");
-
- calculateDependencies(Bundle, true, SLP);
-
- // Now try to schedule the new bundle. As soon as the bundle is "ready" it
- // means that there are no cyclic dependencies and we can schedule it.
-  // Note that it's important that we don't "schedule" the bundle yet (see
- // cancelScheduling).
- while (!Bundle->isReady() && !ReadyInsts.empty()) {
-
+ "Worklist not sorted properly!");
+ BasicBlock *BB = (*I)->getBlock();
+ // For all instructions in blocks containing gather sequences:
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e;) {
+ Instruction *In = &*it++;
+ if (isDeleted(In))
+ continue;
+ if (!isa<InsertElementInst>(In) && !isa<ExtractElementInst>(In))
+ continue;
+
+ // Check if we can replace this instruction with any of the
+ // visited instructions.
+ for (Instruction *v : Visited) {
+ if (In->isIdenticalTo(v) &&
+ DT->dominates(v->getParent(), In->getParent())) {
+ In->replaceAllUsesWith(v);
+ eraseInstruction(In);
+ In = nullptr;
+ break;
+ }
+ }
+ if (In) {
+ assert(!is_contained(Visited, In));
+ Visited.push_back(In);
+ }
+ }
+ }
+ CSEBlocks.clear();
+ GatherSeq.clear();
+}
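
Illustrative sketch (not part of this commit): optimizeGatherSequence above replaces an insert/extract with a previously seen identical instruction only when the earlier one dominates it, visiting blocks in dominance order. A toy model where "instructions" are strings and dominance is approximated by a precomputed depth:

#include <cstdio>
#include <string>
#include <vector>

struct ToyInst {
  std::string Text; // stands in for isIdenticalTo()
  int DomDepth;     // stands in for the position in the dominator tree
};

int main() {
  // Already sorted so that dominating blocks come first, as in the CSE loop.
  std::vector<ToyInst> Insts = {
      {"insertelement a", 0}, {"insertelement b", 1}, {"insertelement a", 2}};
  std::vector<const ToyInst *> Visited;
  for (const ToyInst &In : Insts) {
    const ToyInst *Repl = nullptr;
    for (const ToyInst *V : Visited)
      if (V->Text == In.Text && V->DomDepth <= In.DomDepth) {
        Repl = V; // an identical, dominating instruction already exists
        break;
      }
    if (Repl)
      std::printf("CSE: reuse '%s' from depth %d\n", Repl->Text.c_str(),
                  Repl->DomDepth);
    else
      Visited.push_back(&In);
  }
  return 0;
}
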
+
+// Groups the instructions into a bundle (which is then a single scheduling
+// entity) and schedules instructions until the bundle becomes ready.
+Optional<BoUpSLP::ScheduleData *>
+BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP,
+ const InstructionsState &S) {
+ if (isa<PHINode>(S.OpValue))
+ return nullptr;
+
+ // Initialize the instruction bundle.
+ Instruction *OldScheduleEnd = ScheduleEnd;
+ ScheduleData *PrevInBundle = nullptr;
+ ScheduleData *Bundle = nullptr;
+ bool ReSchedule = false;
+ LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.OpValue << "\n");
+
+ // Make sure that the scheduling region contains all
+ // instructions of the bundle.
+ for (Value *V : VL) {
+ if (!extendSchedulingRegion(V, S))
+ return None;
+ }
+
+ for (Value *V : VL) {
+ ScheduleData *BundleMember = getScheduleData(V);
+ assert(BundleMember &&
+ "no ScheduleData for bundle member (maybe not in same basic block)");
+ if (BundleMember->IsScheduled) {
+      // A bundle member was scheduled as a single instruction before and now
+ // needs to be scheduled as part of the bundle. We just get rid of the
+ // existing schedule.
+ LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember
+ << " was already scheduled\n");
+ ReSchedule = true;
+ }
+ assert(BundleMember->isSchedulingEntity() &&
+ "bundle member already part of other bundle");
+ if (PrevInBundle) {
+ PrevInBundle->NextInBundle = BundleMember;
+ } else {
+ Bundle = BundleMember;
+ }
+ BundleMember->UnscheduledDepsInBundle = 0;
+ Bundle->UnscheduledDepsInBundle += BundleMember->UnscheduledDeps;
+
+    // Group the instructions into a bundle.
+ BundleMember->FirstInBundle = Bundle;
+ PrevInBundle = BundleMember;
+ }
+ if (ScheduleEnd != OldScheduleEnd) {
+ // The scheduling region got new instructions at the lower end (or it is a
+ // new region for the first bundle). This makes it necessary to
+ // recalculate all dependencies.
+ // It is seldom that this needs to be done a second time after adding the
+ // initial bundle to the region.
+ for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [](ScheduleData *SD) {
+ SD->clearDependencies();
+ });
+ }
+ ReSchedule = true;
+ }
+ if (ReSchedule) {
+ resetSchedule();
+ initialFillReadyList(ReadyInsts);
+ }
+ assert(Bundle && "Failed to find schedule bundle");
+
+ LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << *Bundle << " in block "
+ << BB->getName() << "\n");
+
+ calculateDependencies(Bundle, true, SLP);
+
+ // Now try to schedule the new bundle. As soon as the bundle is "ready" it
+ // means that there are no cyclic dependencies and we can schedule it.
+  // Note that it's important that we don't "schedule" the bundle yet (see
+ // cancelScheduling).
+ while (!Bundle->isReady() && !ReadyInsts.empty()) {
+
ScheduleData *pickedSD = ReadyInsts.pop_back_val();
-
- if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
- schedule(pickedSD, ReadyInsts);
- }
- }
- if (!Bundle->isReady()) {
- cancelScheduling(VL, S.OpValue);
- return None;
- }
- return Bundle;
-}
-
-void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
- Value *OpValue) {
- if (isa<PHINode>(OpValue))
- return;
-
- ScheduleData *Bundle = getScheduleData(OpValue);
- LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
- assert(!Bundle->IsScheduled &&
- "Can't cancel bundle which is already scheduled");
- assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
- "tried to unbundle something which is not a bundle");
-
- // Un-bundle: make single instructions out of the bundle.
- ScheduleData *BundleMember = Bundle;
- while (BundleMember) {
- assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
- BundleMember->FirstInBundle = BundleMember;
- ScheduleData *Next = BundleMember->NextInBundle;
- BundleMember->NextInBundle = nullptr;
- BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
- if (BundleMember->UnscheduledDepsInBundle == 0) {
- ReadyInsts.insert(BundleMember);
- }
- BundleMember = Next;
- }
-}
-
-BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
- // Allocate a new ScheduleData for the instruction.
- if (ChunkPos >= ChunkSize) {
- ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
- ChunkPos = 0;
- }
- return &(ScheduleDataChunks.back()[ChunkPos++]);
-}
-
-bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
- const InstructionsState &S) {
- if (getScheduleData(V, isOneOf(S, V)))
- return true;
- Instruction *I = dyn_cast<Instruction>(V);
- assert(I && "bundle member must be an instruction");
- assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
- auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
- ScheduleData *ISD = getScheduleData(I);
- if (!ISD)
- return false;
- assert(isInSchedulingRegion(ISD) &&
- "ScheduleData not in scheduling region");
- ScheduleData *SD = allocateScheduleDataChunks();
- SD->Inst = I;
- SD->init(SchedulingRegionID, S.OpValue);
- ExtraScheduleDataMap[I][S.OpValue] = SD;
- return true;
- };
- if (CheckSheduleForI(I))
- return true;
- if (!ScheduleStart) {
- // It's the first instruction in the new region.
- initScheduleData(I, I->getNextNode(), nullptr, nullptr);
- ScheduleStart = I;
- ScheduleEnd = I->getNextNode();
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- assert(ScheduleEnd && "tried to vectorize a terminator?");
- LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
- return true;
- }
- // Search up and down at the same time, because we don't know if the new
- // instruction is above or below the existing scheduling region.
- BasicBlock::reverse_iterator UpIter =
- ++ScheduleStart->getIterator().getReverse();
- BasicBlock::reverse_iterator UpperEnd = BB->rend();
- BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
- BasicBlock::iterator LowerEnd = BB->end();
- while (true) {
- if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
- LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
- return false;
- }
-
- if (UpIter != UpperEnd) {
- if (&*UpIter == I) {
- initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
- ScheduleStart = I;
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
- << "\n");
- return true;
- }
- ++UpIter;
- }
- if (DownIter != LowerEnd) {
- if (&*DownIter == I) {
- initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
- nullptr);
- ScheduleEnd = I->getNextNode();
- if (isOneOf(S, I) != I)
- CheckSheduleForI(I);
- assert(ScheduleEnd && "tried to vectorize a terminator?");
- LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
- << "\n");
- return true;
- }
- ++DownIter;
- }
- assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
- "instruction not found in block");
- }
- return true;
-}
-
-void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
- Instruction *ToI,
- ScheduleData *PrevLoadStore,
- ScheduleData *NextLoadStore) {
- ScheduleData *CurrentLoadStore = PrevLoadStore;
- for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
- ScheduleData *SD = ScheduleDataMap[I];
- if (!SD) {
- SD = allocateScheduleDataChunks();
- ScheduleDataMap[I] = SD;
- SD->Inst = I;
- }
- assert(!isInSchedulingRegion(SD) &&
- "new ScheduleData already in scheduling region");
- SD->init(SchedulingRegionID, I);
-
- if (I->mayReadOrWriteMemory() &&
- (!isa<IntrinsicInst>(I) ||
+
+ if (pickedSD->isSchedulingEntity() && pickedSD->isReady()) {
+ schedule(pickedSD, ReadyInsts);
+ }
+ }
+ if (!Bundle->isReady()) {
+ cancelScheduling(VL, S.OpValue);
+ return None;
+ }
+ return Bundle;
+}
+
+void BoUpSLP::BlockScheduling::cancelScheduling(ArrayRef<Value *> VL,
+ Value *OpValue) {
+ if (isa<PHINode>(OpValue))
+ return;
+
+ ScheduleData *Bundle = getScheduleData(OpValue);
+ LLVM_DEBUG(dbgs() << "SLP: cancel scheduling of " << *Bundle << "\n");
+ assert(!Bundle->IsScheduled &&
+ "Can't cancel bundle which is already scheduled");
+ assert(Bundle->isSchedulingEntity() && Bundle->isPartOfBundle() &&
+ "tried to unbundle something which is not a bundle");
+
+ // Un-bundle: make single instructions out of the bundle.
+ ScheduleData *BundleMember = Bundle;
+ while (BundleMember) {
+ assert(BundleMember->FirstInBundle == Bundle && "corrupt bundle links");
+ BundleMember->FirstInBundle = BundleMember;
+ ScheduleData *Next = BundleMember->NextInBundle;
+ BundleMember->NextInBundle = nullptr;
+ BundleMember->UnscheduledDepsInBundle = BundleMember->UnscheduledDeps;
+ if (BundleMember->UnscheduledDepsInBundle == 0) {
+ ReadyInsts.insert(BundleMember);
+ }
+ BundleMember = Next;
+ }
+}
+
+BoUpSLP::ScheduleData *BoUpSLP::BlockScheduling::allocateScheduleDataChunks() {
+ // Allocate a new ScheduleData for the instruction.
+ if (ChunkPos >= ChunkSize) {
+ ScheduleDataChunks.push_back(std::make_unique<ScheduleData[]>(ChunkSize));
+ ChunkPos = 0;
+ }
+ return &(ScheduleDataChunks.back()[ChunkPos++]);
+}
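
Illustrative sketch (not part of this commit): allocateScheduleDataChunks above hands out ScheduleData slots from fixed-size arrays so that addresses stay stable while avoiding one heap allocation per instruction. A minimal chunked-arena model with a made-up Node type:

#include <cstddef>
#include <memory>
#include <vector>

struct Node { int Payload = 0; };

class ChunkArena {
  static constexpr std::size_t ChunkSize = 256;
  std::vector<std::unique_ptr<Node[]>> Chunks;
  std::size_t Pos = ChunkSize; // forces a new chunk on the first allocation

public:
  Node *allocate() {
    if (Pos >= ChunkSize) {
      Chunks.push_back(std::make_unique<Node[]>(ChunkSize));
      Pos = 0;
    }
    // Pointers into earlier chunks remain valid; only the vector of
    // unique_ptrs may reallocate, never the chunks themselves.
    return &Chunks.back()[Pos++];
  }
};

int main() {
  ChunkArena Arena;
  Node *A = Arena.allocate();
  Node *B = Arena.allocate();
  A->Payload = 1;
  B->Payload = 2;
  return A->Payload + B->Payload == 3 ? 0 : 1;
}
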
+
+bool BoUpSLP::BlockScheduling::extendSchedulingRegion(Value *V,
+ const InstructionsState &S) {
+ if (getScheduleData(V, isOneOf(S, V)))
+ return true;
+ Instruction *I = dyn_cast<Instruction>(V);
+ assert(I && "bundle member must be an instruction");
+ assert(!isa<PHINode>(I) && "phi nodes don't need to be scheduled");
+ auto &&CheckSheduleForI = [this, &S](Instruction *I) -> bool {
+ ScheduleData *ISD = getScheduleData(I);
+ if (!ISD)
+ return false;
+ assert(isInSchedulingRegion(ISD) &&
+ "ScheduleData not in scheduling region");
+ ScheduleData *SD = allocateScheduleDataChunks();
+ SD->Inst = I;
+ SD->init(SchedulingRegionID, S.OpValue);
+ ExtraScheduleDataMap[I][S.OpValue] = SD;
+ return true;
+ };
+ if (CheckSheduleForI(I))
+ return true;
+ if (!ScheduleStart) {
+ // It's the first instruction in the new region.
+ initScheduleData(I, I->getNextNode(), nullptr, nullptr);
+ ScheduleStart = I;
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: initialize schedule region to " << *I << "\n");
+ return true;
+ }
+ // Search up and down at the same time, because we don't know if the new
+ // instruction is above or below the existing scheduling region.
+ BasicBlock::reverse_iterator UpIter =
+ ++ScheduleStart->getIterator().getReverse();
+ BasicBlock::reverse_iterator UpperEnd = BB->rend();
+ BasicBlock::iterator DownIter = ScheduleEnd->getIterator();
+ BasicBlock::iterator LowerEnd = BB->end();
+ while (true) {
+ if (++ScheduleRegionSize > ScheduleRegionSizeLimit) {
+ LLVM_DEBUG(dbgs() << "SLP: exceeded schedule region size limit\n");
+ return false;
+ }
+
+ if (UpIter != UpperEnd) {
+ if (&*UpIter == I) {
+ initScheduleData(I, ScheduleStart, nullptr, FirstLoadStoreInRegion);
+ ScheduleStart = I;
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region start to " << *I
+ << "\n");
+ return true;
+ }
+ ++UpIter;
+ }
+ if (DownIter != LowerEnd) {
+ if (&*DownIter == I) {
+ initScheduleData(ScheduleEnd, I->getNextNode(), LastLoadStoreInRegion,
+ nullptr);
+ ScheduleEnd = I->getNextNode();
+ if (isOneOf(S, I) != I)
+ CheckSheduleForI(I);
+ assert(ScheduleEnd && "tried to vectorize a terminator?");
+ LLVM_DEBUG(dbgs() << "SLP: extend schedule region end to " << *I
+ << "\n");
+ return true;
+ }
+ ++DownIter;
+ }
+ assert((UpIter != UpperEnd || DownIter != LowerEnd) &&
+ "instruction not found in block");
+ }
+ return true;
+}
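
Illustrative sketch (not part of this commit): extendSchedulingRegion above searches upward and downward from the current region at the same time, grows the region toward whichever side the new instruction is found on, and gives up once a size limit is hit. A toy model over integer indices standing in for instructions, with made-up sizes:

#include <cstdio>

int main() {
  // Hypothetical block of 100 "instructions"; the current region is [40, 60).
  int RegionStart = 40, RegionEnd = 60;
  const int Target = 72;            // new instruction that must be included
  const int RegionSizeLimit = 50;

  int Up = RegionStart - 1, Down = RegionEnd, Size = RegionEnd - RegionStart;
  bool Found = false;
  while (Up >= 0 || Down < 100) {
    if (++Size > RegionSizeLimit) {
      std::printf("region size limit exceeded\n");
      return 1;
    }
    if (Up >= 0) {                  // walk one step upward
      if (Up == Target) { RegionStart = Target; Found = true; break; }
      --Up;
    }
    if (Down < 100) {               // walk one step downward
      if (Down == Target) { RegionEnd = Target + 1; Found = true; break; }
      ++Down;
    }
  }
  if (Found)
    std::printf("region is now [%d, %d)\n", RegionStart, RegionEnd);
  return 0;
}
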
+
+void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI,
+ Instruction *ToI,
+ ScheduleData *PrevLoadStore,
+ ScheduleData *NextLoadStore) {
+ ScheduleData *CurrentLoadStore = PrevLoadStore;
+ for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) {
+ ScheduleData *SD = ScheduleDataMap[I];
+ if (!SD) {
+ SD = allocateScheduleDataChunks();
+ ScheduleDataMap[I] = SD;
+ SD->Inst = I;
+ }
+ assert(!isInSchedulingRegion(SD) &&
+ "new ScheduleData already in scheduling region");
+ SD->init(SchedulingRegionID, I);
+
+ if (I->mayReadOrWriteMemory() &&
+ (!isa<IntrinsicInst>(I) ||
(cast<IntrinsicInst>(I)->getIntrinsicID() != Intrinsic::sideeffect &&
cast<IntrinsicInst>(I)->getIntrinsicID() !=
Intrinsic::pseudoprobe))) {
- // Update the linked list of memory accessing instructions.
- if (CurrentLoadStore) {
- CurrentLoadStore->NextLoadStore = SD;
- } else {
- FirstLoadStoreInRegion = SD;
- }
- CurrentLoadStore = SD;
- }
- }
- if (NextLoadStore) {
- if (CurrentLoadStore)
- CurrentLoadStore->NextLoadStore = NextLoadStore;
- } else {
- LastLoadStoreInRegion = CurrentLoadStore;
- }
-}
-
-void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
- bool InsertInReadyList,
- BoUpSLP *SLP) {
- assert(SD->isSchedulingEntity());
-
- SmallVector<ScheduleData *, 10> WorkList;
- WorkList.push_back(SD);
-
- while (!WorkList.empty()) {
+ // Update the linked list of memory accessing instructions.
+ if (CurrentLoadStore) {
+ CurrentLoadStore->NextLoadStore = SD;
+ } else {
+ FirstLoadStoreInRegion = SD;
+ }
+ CurrentLoadStore = SD;
+ }
+ }
+ if (NextLoadStore) {
+ if (CurrentLoadStore)
+ CurrentLoadStore->NextLoadStore = NextLoadStore;
+ } else {
+ LastLoadStoreInRegion = CurrentLoadStore;
+ }
+}
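
Illustrative sketch (not part of this commit): initScheduleData above threads all memory-accessing instructions of the region into a NextLoadStore chain so that the dependence walk can skip everything else. A toy model of building that chain:

#include <cstdio>
#include <vector>

struct ToySD {
  const char *Name;
  bool TouchesMemory;
  ToySD *NextLoadStore = nullptr;
};

int main() {
  std::vector<ToySD> Region = {
      {"load a", true}, {"add", false}, {"store b", true}, {"mul", false}};
  ToySD *Current = nullptr, *First = nullptr;
  for (ToySD &SD : Region) {
    if (!SD.TouchesMemory)
      continue;
    if (Current)
      Current->NextLoadStore = &SD; // append to the chain
    else
      First = &SD;                  // remember the first memory access
    Current = &SD;
  }
  for (ToySD *SD = First; SD; SD = SD->NextLoadStore)
    std::printf("%s\n", SD->Name);  // prints: load a, store b
  return 0;
}
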
+
+void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleData *SD,
+ bool InsertInReadyList,
+ BoUpSLP *SLP) {
+ assert(SD->isSchedulingEntity());
+
+ SmallVector<ScheduleData *, 10> WorkList;
+ WorkList.push_back(SD);
+
+ while (!WorkList.empty()) {
ScheduleData *SD = WorkList.pop_back_val();
-
- ScheduleData *BundleMember = SD;
- while (BundleMember) {
- assert(isInSchedulingRegion(BundleMember));
- if (!BundleMember->hasValidDependencies()) {
-
- LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
- << "\n");
- BundleMember->Dependencies = 0;
- BundleMember->resetUnscheduledDeps();
-
- // Handle def-use chain dependencies.
- if (BundleMember->OpValue != BundleMember->Inst) {
- ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- for (User *U : BundleMember->Inst->users()) {
- if (isa<Instruction>(U)) {
- ScheduleData *UseSD = getScheduleData(U);
- if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = UseSD->FirstInBundle;
- if (!DestBundle->IsScheduled)
- BundleMember->incrementUnscheduledDeps(1);
- if (!DestBundle->hasValidDependencies())
- WorkList.push_back(DestBundle);
- }
- } else {
- // I'm not sure if this can ever happen. But we need to be safe.
-              // This keeps the instruction/bundle from ever being scheduled
-              // and eventually disables vectorization.
- BundleMember->Dependencies++;
- BundleMember->incrementUnscheduledDeps(1);
- }
- }
- }
-
- // Handle the memory dependencies.
- ScheduleData *DepDest = BundleMember->NextLoadStore;
- if (DepDest) {
- Instruction *SrcInst = BundleMember->Inst;
- MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
- bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
- unsigned numAliased = 0;
- unsigned DistToSrc = 1;
-
- while (DepDest) {
- assert(isInSchedulingRegion(DepDest));
-
- // We have two limits to reduce the complexity:
- // 1) AliasedCheckLimit: It's a small limit to reduce calls to
- // SLP->isAliased (which is the expensive part in this loop).
- // 2) MaxMemDepDistance: It's for very large blocks and it aborts
- // the whole loop (even if the loop is fast, it's quadratic).
- // It's important for the loop break condition (see below) to
- // check this limit even between two read-only instructions.
- if (DistToSrc >= MaxMemDepDistance ||
- ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
- (numAliased >= AliasedCheckLimit ||
- SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
-
- // We increment the counter only if the locations are aliased
- // (instead of counting all alias checks). This gives a better
- // balance between reduced runtime and accurate dependencies.
- numAliased++;
-
- DepDest->MemoryDependencies.push_back(BundleMember);
- BundleMember->Dependencies++;
- ScheduleData *DestBundle = DepDest->FirstInBundle;
- if (!DestBundle->IsScheduled) {
- BundleMember->incrementUnscheduledDeps(1);
- }
- if (!DestBundle->hasValidDependencies()) {
- WorkList.push_back(DestBundle);
- }
- }
- DepDest = DepDest->NextLoadStore;
-
-            // Example explaining the loop break condition: Let's assume our
- // starting instruction is i0 and MaxMemDepDistance = 3.
- //
- // +--------v--v--v
- // i0,i1,i2,i3,i4,i5,i6,i7,i8
- // +--------^--^--^
- //
-            // MaxMemDepDistance lets us stop alias-checking at i3 and we add
-            // dependencies from i0 to i3, i4, ... (even if they are not aliased).
- // Previously we already added dependencies from i3 to i6,i7,i8
- // (because of MaxMemDepDistance). As we added a dependency from
- // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
- // and we can abort this loop at i6.
- if (DistToSrc >= 2 * MaxMemDepDistance)
- break;
- DistToSrc++;
- }
- }
- }
- BundleMember = BundleMember->NextInBundle;
- }
- if (InsertInReadyList && SD->isReady()) {
- ReadyInsts.push_back(SD);
- LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
- << "\n");
- }
- }
-}
-
-void BoUpSLP::BlockScheduling::resetSchedule() {
- assert(ScheduleStart &&
- "tried to reset schedule on block which has not been scheduled");
- for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
- doForAllOpcodes(I, [&](ScheduleData *SD) {
- assert(isInSchedulingRegion(SD) &&
- "ScheduleData not in scheduling region");
- SD->IsScheduled = false;
- SD->resetUnscheduledDeps();
- });
- }
- ReadyInsts.clear();
-}
-
-void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
- if (!BS->ScheduleStart)
- return;
-
- LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
-
- BS->resetSchedule();
-
- // For the real scheduling we use a more sophisticated ready-list: it is
- // sorted by the original instruction location. This lets the final schedule
- // be as close as possible to the original instruction order.
- struct ScheduleDataCompare {
- bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
- return SD2->SchedulingPriority < SD1->SchedulingPriority;
- }
- };
- std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
-
- // Ensure that all dependency data is updated and fill the ready-list with
- // initial instructions.
- int Idx = 0;
- int NumToSchedule = 0;
- for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
- I = I->getNextNode()) {
- BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
- assert(SD->isPartOfBundle() ==
- (getTreeEntry(SD->Inst) != nullptr) &&
- "scheduler and vectorizer bundle mismatch");
- SD->FirstInBundle->SchedulingPriority = Idx++;
- if (SD->isSchedulingEntity()) {
- BS->calculateDependencies(SD, false, this);
- NumToSchedule++;
- }
- });
- }
- BS->initialFillReadyList(ReadyInsts);
-
- Instruction *LastScheduledInst = BS->ScheduleEnd;
-
- // Do the "real" scheduling.
- while (!ReadyInsts.empty()) {
- ScheduleData *picked = *ReadyInsts.begin();
- ReadyInsts.erase(ReadyInsts.begin());
-
- // Move the scheduled instruction(s) to their dedicated places, if not
- // there yet.
- ScheduleData *BundleMember = picked;
- while (BundleMember) {
- Instruction *pickedInst = BundleMember->Inst;
- if (LastScheduledInst->getNextNode() != pickedInst) {
- BS->BB->getInstList().remove(pickedInst);
- BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
- pickedInst);
- }
- LastScheduledInst = pickedInst;
- BundleMember = BundleMember->NextInBundle;
- }
-
- BS->schedule(picked, ReadyInsts);
- NumToSchedule--;
- }
- assert(NumToSchedule == 0 && "could not schedule all instructions");
-
- // Avoid duplicate scheduling of the block.
- BS->ScheduleStart = nullptr;
-}
-
-unsigned BoUpSLP::getVectorElementSize(Value *V) {
+
+ ScheduleData *BundleMember = SD;
+ while (BundleMember) {
+ assert(isInSchedulingRegion(BundleMember));
+ if (!BundleMember->hasValidDependencies()) {
+
+ LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember
+ << "\n");
+ BundleMember->Dependencies = 0;
+ BundleMember->resetUnscheduledDeps();
+
+ // Handle def-use chain dependencies.
+ if (BundleMember->OpValue != BundleMember->Inst) {
+ ScheduleData *UseSD = getScheduleData(BundleMember->Inst);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ for (User *U : BundleMember->Inst->users()) {
+ if (isa<Instruction>(U)) {
+ ScheduleData *UseSD = getScheduleData(U);
+ if (UseSD && isInSchedulingRegion(UseSD->FirstInBundle)) {
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = UseSD->FirstInBundle;
+ if (!DestBundle->IsScheduled)
+ BundleMember->incrementUnscheduledDeps(1);
+ if (!DestBundle->hasValidDependencies())
+ WorkList.push_back(DestBundle);
+ }
+ } else {
+ // I'm not sure if this can ever happen. But we need to be safe.
+              // This keeps the instruction/bundle from ever being scheduled
+              // and eventually disables vectorization.
+ BundleMember->Dependencies++;
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ }
+ }
+
+ // Handle the memory dependencies.
+ ScheduleData *DepDest = BundleMember->NextLoadStore;
+ if (DepDest) {
+ Instruction *SrcInst = BundleMember->Inst;
+ MemoryLocation SrcLoc = getLocation(SrcInst, SLP->AA);
+ bool SrcMayWrite = BundleMember->Inst->mayWriteToMemory();
+ unsigned numAliased = 0;
+ unsigned DistToSrc = 1;
+
+ while (DepDest) {
+ assert(isInSchedulingRegion(DepDest));
+
+ // We have two limits to reduce the complexity:
+ // 1) AliasedCheckLimit: It's a small limit to reduce calls to
+ // SLP->isAliased (which is the expensive part in this loop).
+ // 2) MaxMemDepDistance: It's for very large blocks and it aborts
+ // the whole loop (even if the loop is fast, it's quadratic).
+ // It's important for the loop break condition (see below) to
+ // check this limit even between two read-only instructions.
+ if (DistToSrc >= MaxMemDepDistance ||
+ ((SrcMayWrite || DepDest->Inst->mayWriteToMemory()) &&
+ (numAliased >= AliasedCheckLimit ||
+ SLP->isAliased(SrcLoc, SrcInst, DepDest->Inst)))) {
+
+ // We increment the counter only if the locations are aliased
+ // (instead of counting all alias checks). This gives a better
+ // balance between reduced runtime and accurate dependencies.
+ numAliased++;
+
+ DepDest->MemoryDependencies.push_back(BundleMember);
+ BundleMember->Dependencies++;
+ ScheduleData *DestBundle = DepDest->FirstInBundle;
+ if (!DestBundle->IsScheduled) {
+ BundleMember->incrementUnscheduledDeps(1);
+ }
+ if (!DestBundle->hasValidDependencies()) {
+ WorkList.push_back(DestBundle);
+ }
+ }
+ DepDest = DepDest->NextLoadStore;
+
+            // Example explaining the loop break condition: Let's assume our
+ // starting instruction is i0 and MaxMemDepDistance = 3.
+ //
+ // +--------v--v--v
+ // i0,i1,i2,i3,i4,i5,i6,i7,i8
+ // +--------^--^--^
+ //
+            // MaxMemDepDistance lets us stop alias-checking at i3 and we add
+            // dependencies from i0 to i3, i4, ... (even if they are not aliased).
+ // Previously we already added dependencies from i3 to i6,i7,i8
+ // (because of MaxMemDepDistance). As we added a dependency from
+ // i0 to i3, we have transitive dependencies from i0 to i6,i7,i8
+ // and we can abort this loop at i6.
+ if (DistToSrc >= 2 * MaxMemDepDistance)
+ break;
+ DistToSrc++;
+ }
+ }
+ }
+ BundleMember = BundleMember->NextInBundle;
+ }
+ if (InsertInReadyList && SD->isReady()) {
+ ReadyInsts.push_back(SD);
+ LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD->Inst
+ << "\n");
+ }
+ }
+}
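
Illustrative sketch (not part of this commit): the dependence walk above caps its cost with AliasedCheckLimit (stop querying alias analysis after a few aliasing hits and conservatively record dependencies), MaxMemDepDistance (beyond this distance, record dependencies without checking), and 2 * MaxMemDepDistance (abort the walk; transitivity covers the rest). A toy model with a fabricated alias oracle and made-up limits, where every access is treated as potentially writing:

#include <cstdio>

// Fabricated alias oracle: here, every third access aliases the source.
static bool mayAlias(unsigned SrcIdx, unsigned DstIdx) {
  return (DstIdx - SrcIdx) % 3 == 0;
}

int main() {
  const unsigned AliasedCheckLimit = 2;
  const unsigned MaxMemDepDistance = 8;
  unsigned NumAliased = 0, DistToSrc = 1, DepsAdded = 0;

  for (unsigned Dst = 1; Dst < 64; ++Dst) {
    bool AddDep = DistToSrc >= MaxMemDepDistance ||
                  NumAliased >= AliasedCheckLimit || mayAlias(0, Dst);
    if (AddDep) {
      ++NumAliased; // counted only when a dependency is actually recorded
      ++DepsAdded;
    }
    if (DistToSrc >= 2 * MaxMemDepDistance)
      break;        // transitive deps already cover everything farther away
    ++DistToSrc;
  }
  std::printf("recorded %u dependencies\n", DepsAdded); // prints: recorded 12 dependencies
  return 0;
}
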
+
+void BoUpSLP::BlockScheduling::resetSchedule() {
+ assert(ScheduleStart &&
+ "tried to reset schedule on block which has not been scheduled");
+ for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
+ doForAllOpcodes(I, [&](ScheduleData *SD) {
+ assert(isInSchedulingRegion(SD) &&
+ "ScheduleData not in scheduling region");
+ SD->IsScheduled = false;
+ SD->resetUnscheduledDeps();
+ });
+ }
+ ReadyInsts.clear();
+}
+
+void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
+ if (!BS->ScheduleStart)
+ return;
+
+ LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");
+
+ BS->resetSchedule();
+
+ // For the real scheduling we use a more sophisticated ready-list: it is
+ // sorted by the original instruction location. This lets the final schedule
+ // be as close as possible to the original instruction order.
+ struct ScheduleDataCompare {
+ bool operator()(ScheduleData *SD1, ScheduleData *SD2) const {
+ return SD2->SchedulingPriority < SD1->SchedulingPriority;
+ }
+ };
+ std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;
+
+ // Ensure that all dependency data is updated and fill the ready-list with
+ // initial instructions.
+ int Idx = 0;
+ int NumToSchedule = 0;
+ for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
+ I = I->getNextNode()) {
+ BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
+ assert(SD->isPartOfBundle() ==
+ (getTreeEntry(SD->Inst) != nullptr) &&
+ "scheduler and vectorizer bundle mismatch");
+ SD->FirstInBundle->SchedulingPriority = Idx++;
+ if (SD->isSchedulingEntity()) {
+ BS->calculateDependencies(SD, false, this);
+ NumToSchedule++;
+ }
+ });
+ }
+ BS->initialFillReadyList(ReadyInsts);
+
+ Instruction *LastScheduledInst = BS->ScheduleEnd;
+
+ // Do the "real" scheduling.
+ while (!ReadyInsts.empty()) {
+ ScheduleData *picked = *ReadyInsts.begin();
+ ReadyInsts.erase(ReadyInsts.begin());
+
+ // Move the scheduled instruction(s) to their dedicated places, if not
+ // there yet.
+ ScheduleData *BundleMember = picked;
+ while (BundleMember) {
+ Instruction *pickedInst = BundleMember->Inst;
+ if (LastScheduledInst->getNextNode() != pickedInst) {
+ BS->BB->getInstList().remove(pickedInst);
+ BS->BB->getInstList().insert(LastScheduledInst->getIterator(),
+ pickedInst);
+ }
+ LastScheduledInst = pickedInst;
+ BundleMember = BundleMember->NextInBundle;
+ }
+
+ BS->schedule(picked, ReadyInsts);
+ NumToSchedule--;
+ }
+ assert(NumToSchedule == 0 && "could not schedule all instructions");
+
+ // Avoid duplicate scheduling of the block.
+ BS->ScheduleStart = nullptr;
+}
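
Illustrative sketch (not part of this commit): scheduleBlock above keeps the ready list in a std::set whose comparator orders bundles by SchedulingPriority, so *begin() is always the bundle the scheduler picks next. A toy model with the same comparator shape and made-up priorities:

#include <cstdio>
#include <set>

struct ToyBundle {
  int SchedulingPriority; // source-order index of the bundle head
  const char *Name;
};

// Same shape as ScheduleDataCompare above: *begin() is the bundle with the
// highest SchedulingPriority value.
struct CompareByPriority {
  bool operator()(const ToyBundle *A, const ToyBundle *B) const {
    return B->SchedulingPriority < A->SchedulingPriority;
  }
};

int main() {
  ToyBundle B0{0, "load bundle"}, B1{1, "add bundle"}, B2{2, "store bundle"};
  std::set<ToyBundle *, CompareByPriority> Ready = {&B0, &B1, &B2};
  while (!Ready.empty()) {
    const ToyBundle *Picked = *Ready.begin();
    Ready.erase(Ready.begin());
    std::printf("pick %s\n", Picked->Name); // store, then add, then load
  }
  return 0;
}
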
+
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value (or value
// truncated just before storing) without traversing the expression tree.
// This is the common case.
@@ -5543,891 +5543,891 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) {
else
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
}
-
- auto E = InstrElementSize.find(V);
- if (E != InstrElementSize.end())
- return E->second;
-
- // If V is not a store, we can traverse the expression tree to find loads
- // that feed it. The type of the loaded value may indicate a more suitable
- // width than V's type. We want to base the vector element size on the width
- // of memory operations where possible.
- SmallVector<Instruction *, 16> Worklist;
- SmallPtrSet<Instruction *, 16> Visited;
- if (auto *I = dyn_cast<Instruction>(V)) {
- Worklist.push_back(I);
- Visited.insert(I);
- }
-
- // Traverse the expression tree in bottom-up order looking for loads. If we
- // encounter an instruction we don't yet handle, we give up.
- auto MaxWidth = 0u;
- auto FoundUnknownInst = false;
- while (!Worklist.empty() && !FoundUnknownInst) {
- auto *I = Worklist.pop_back_val();
-
- // We should only be looking at scalar instructions here. If the current
- // instruction has a vector type, give up.
- auto *Ty = I->getType();
- if (isa<VectorType>(Ty))
- FoundUnknownInst = true;
-
- // If the current instruction is a load, update MaxWidth to reflect the
- // width of the loaded value.
- else if (isa<LoadInst>(I))
- MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
-
- // Otherwise, we need to visit the operands of the instruction. We only
- // handle the interesting cases from buildTree here. If an operand is an
- // instruction we haven't yet visited, we add it to the worklist.
- else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
- isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
- for (Use &U : I->operands())
- if (auto *J = dyn_cast<Instruction>(U.get()))
- if (Visited.insert(J).second)
- Worklist.push_back(J);
- }
-
- // If we don't yet handle the instruction, give up.
- else
- FoundUnknownInst = true;
- }
-
- int Width = MaxWidth;
- // If we didn't encounter a memory access in the expression tree, or if we
- // gave up for some reason, just return the width of V. Otherwise, return the
- // maximum width we found.
- if (!MaxWidth || FoundUnknownInst)
- Width = DL->getTypeSizeInBits(V->getType());
-
- for (Instruction *I : Visited)
- InstrElementSize[I] = Width;
-
- return Width;
-}
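
Illustrative sketch (not part of this commit): getVectorElementSize above walks the expression bottom-up through a visited set, records the widest load type it reaches, and falls back to the width of V's own type if it meets an instruction it does not handle. A toy dataflow model with made-up node kinds and widths:

#include <algorithm>
#include <cstdio>
#include <unordered_set>
#include <vector>

struct ToyNode {
  enum Kind { Load, Arith, Unknown } K;
  unsigned BitWidth;                  // meaningful for Load nodes only
  std::vector<ToyNode *> Operands;
};

int main() {
  ToyNode L8{ToyNode::Load, 8, {}}, L16{ToyNode::Load, 16, {}};
  ToyNode Add{ToyNode::Arith, 0, {&L8, &L16}};
  ToyNode Root{ToyNode::Arith, 0, {&Add}};
  const unsigned FallbackWidth = 32;  // width of Root's own type

  std::vector<ToyNode *> Worklist{&Root};
  std::unordered_set<ToyNode *> Visited{&Root};
  unsigned MaxWidth = 0;
  bool FoundUnknown = false;
  while (!Worklist.empty() && !FoundUnknown) {
    ToyNode *N = Worklist.back();
    Worklist.pop_back();
    if (N->K == ToyNode::Load)
      MaxWidth = std::max(MaxWidth, N->BitWidth);
    else if (N->K == ToyNode::Arith)
      for (ToyNode *Op : N->Operands)
        if (Visited.insert(Op).second)
          Worklist.push_back(Op);
    else
      FoundUnknown = true;
  }
  unsigned Width = (!MaxWidth || FoundUnknown) ? FallbackWidth : MaxWidth;
  std::printf("element size: %u bits\n", Width); // prints: element size: 16 bits
  return 0;
}
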
-
-// Determine if a value V in a vectorizable expression Expr can be demoted to a
-// smaller type with a truncation. We collect the values that will be demoted
-// in ToDemote and additional roots that require further investigation in Roots.
-static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
- SmallVectorImpl<Value *> &ToDemote,
- SmallVectorImpl<Value *> &Roots) {
- // We can always demote constants.
- if (isa<Constant>(V)) {
- ToDemote.push_back(V);
- return true;
- }
-
- // If the value is not an instruction in the expression with only one use, it
- // cannot be demoted.
- auto *I = dyn_cast<Instruction>(V);
- if (!I || !I->hasOneUse() || !Expr.count(I))
- return false;
-
- switch (I->getOpcode()) {
-
- // We can always demote truncations and extensions. Since truncations can
- // seed additional demotion, we save the truncated value.
- case Instruction::Trunc:
- Roots.push_back(I->getOperand(0));
- break;
- case Instruction::ZExt:
- case Instruction::SExt:
- break;
-
- // We can demote certain binary operations if we can demote both of their
- // operands.
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
- return false;
- break;
-
- // We can demote selects if we can demote their true and false values.
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
- !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
- return false;
- break;
- }
-
- // We can demote phis if we can demote all their incoming operands. Note that
- // we don't need to worry about cycles since we ensure single use above.
- case Instruction::PHI: {
- PHINode *PN = cast<PHINode>(I);
- for (Value *IncValue : PN->incoming_values())
- if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
- return false;
- break;
- }
-
- // Otherwise, conservatively give up.
- default:
- return false;
- }
-
- // Record the value that we can demote.
- ToDemote.push_back(V);
- return true;
-}
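
Illustrative sketch (not part of this commit): collectValuesToDemote above accepts constants, requires everything else to be a single-use instruction inside the expression, and recurses into the operands of the handful of opcodes it understands. A toy recursion that keeps only the single-use and opcode checks; the in-expression check and the Roots bookkeeping are omitted for brevity:

#include <cstdio>
#include <vector>

struct ToyVal {
  enum Kind { Constant, Trunc, Add, Other } K;
  int NumUses;
  std::vector<ToyVal *> Operands;
};

static bool canDemote(ToyVal *V, std::vector<ToyVal *> &ToDemote) {
  if (V->K == ToyVal::Constant) { // constants can always be demoted
    ToDemote.push_back(V);
    return true;
  }
  if (V->NumUses != 1)            // multi-use values cannot be rewritten
    return false;
  switch (V->K) {
  case ToyVal::Trunc:             // truncation seeds further demotion
    break;
  case ToyVal::Add:               // binops need both operands demotable
    for (ToyVal *Op : V->Operands)
      if (!canDemote(Op, ToDemote))
        return false;
    break;
  default:
    return false;                 // conservatively give up
  }
  ToDemote.push_back(V);
  return true;
}

int main() {
  ToyVal C{ToyVal::Constant, 1, {}}, T{ToyVal::Trunc, 1, {}};
  ToyVal A{ToyVal::Add, 1, {&C, &T}};
  std::vector<ToyVal *> ToDemote;
  std::printf("%s\n", canDemote(&A, ToDemote) ? "demotable" : "not demotable");
  return 0;
}
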
-
-void BoUpSLP::computeMinimumValueSizes() {
- // If there are no external uses, the expression tree must be rooted by a
- // store. We can't demote in-memory values, so there is nothing to do here.
- if (ExternalUses.empty())
- return;
-
- // We only attempt to truncate integer expressions.
- auto &TreeRoot = VectorizableTree[0]->Scalars;
- auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
- if (!TreeRootIT)
- return;
-
- // If the expression is not rooted by a store, these roots should have
- // external uses. We will rely on InstCombine to rewrite the expression in
- // the narrower type. However, InstCombine only rewrites single-use values.
- // This means that if a tree entry other than a root is used externally, it
- // must have multiple uses and InstCombine will not rewrite it. The code
- // below ensures that only the roots are used externally.
- SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
- for (auto &EU : ExternalUses)
- if (!Expr.erase(EU.Scalar))
- return;
- if (!Expr.empty())
- return;
-
- // Collect the scalar values of the vectorizable expression. We will use this
- // context to determine which values can be demoted. If we see a truncation,
- // we mark it as seeding another demotion.
- for (auto &EntryPtr : VectorizableTree)
- Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
-
- // Ensure the roots of the vectorizable tree don't form a cycle. They must
- // have a single external user that is not in the vectorizable tree.
- for (auto *Root : TreeRoot)
- if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
- return;
-
- // Conservatively determine if we can actually truncate the roots of the
- // expression. Collect the values that can be demoted in ToDemote and
- // additional roots that require investigating in Roots.
- SmallVector<Value *, 32> ToDemote;
- SmallVector<Value *, 4> Roots;
- for (auto *Root : TreeRoot)
- if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
- return;
-
- // The maximum bit width required to represent all the values that can be
- // demoted without loss of precision. It would be safe to truncate the roots
- // of the expression to this width.
- auto MaxBitWidth = 8u;
-
- // We first check if all the bits of the roots are demanded. If they're not,
- // we can truncate the roots to this narrower type.
- for (auto *Root : TreeRoot) {
- auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
- MaxBitWidth = std::max<unsigned>(
- Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
- }
-
- // True if the roots can be zero-extended back to their original type, rather
- // than sign-extended. We know that if the leading bits are not demanded, we
- // can safely zero-extend. So we initialize IsKnownPositive to True.
- bool IsKnownPositive = true;
-
- // If all the bits of the roots are demanded, we can try a little harder to
- // compute a narrower type. This can happen, for example, if the roots are
- // getelementptr indices. InstCombine promotes these indices to the pointer
- // width. Thus, all their bits are technically demanded even though the
- // address computation might be vectorized in a smaller type.
- //
- // We start by looking at each entry that can be demoted. We compute the
- // maximum bit width required to store the scalar by using ValueTracking to
- // compute the number of high-order bits we can truncate.
- if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
- llvm::all_of(TreeRoot, [](Value *R) {
- assert(R->hasOneUse() && "Root should have only one use!");
- return isa<GetElementPtrInst>(R->user_back());
- })) {
- MaxBitWidth = 8u;
-
- // Determine if the sign bit of all the roots is known to be zero. If not,
- // IsKnownPositive is set to False.
- IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
- KnownBits Known = computeKnownBits(R, *DL);
- return Known.isNonNegative();
- });
-
- // Determine the maximum number of bits required to store the scalar
- // values.
- for (auto *Scalar : ToDemote) {
- auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
- auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
- MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
- }
-
- // If we can't prove that the sign bit is zero, we must add one to the
- // maximum bit width to account for the unknown sign bit. This preserves
- // the existing sign bit so we can safely sign-extend the root back to the
- // original type. Otherwise, if we know the sign bit is zero, we will
- // zero-extend the root instead.
- //
- // FIXME: This is somewhat suboptimal, as there will be cases where adding
- // one to the maximum bit width will yield a larger-than-necessary
- // type. In general, we need to add an extra bit only if we can't
- // prove that the upper bit of the original type is equal to the
- // upper bit of the proposed smaller type. If these two bits are the
- // same (either zero or one) we know that sign-extending from the
- // smaller type will result in the same value. Here, since we can't
- // yet prove this, we are just making the proposed smaller type
- // larger to ensure correctness.
- if (!IsKnownPositive)
- ++MaxBitWidth;
- }
-
- // Round MaxBitWidth up to the next power-of-two.
- if (!isPowerOf2_64(MaxBitWidth))
- MaxBitWidth = NextPowerOf2(MaxBitWidth);
-
- // If the maximum bit width we compute is less than the width of the roots'
- // type, we can proceed with the narrowing. Otherwise, do nothing.
- if (MaxBitWidth >= TreeRootIT->getBitWidth())
- return;
-
- // If we can truncate the root, we must collect additional values that might
- // be demoted as a result. That is, those seeded by truncations we will
- // modify.
- while (!Roots.empty())
- collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
-
- // Finally, map the values we can demote to the maximum bit width we computed.
- for (auto *Scalar : ToDemote)
- MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
-}
-
-namespace {
-
-/// The SLPVectorizer Pass.
-struct SLPVectorizer : public FunctionPass {
- SLPVectorizerPass Impl;
-
- /// Pass identification, replacement for typeid
- static char ID;
-
- explicit SLPVectorizer() : FunctionPass(ID) {
- initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
- }
-
- bool doInitialization(Module &M) override {
- return false;
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
-
- auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
- auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
- auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
- auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
-
- return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- FunctionPass::getAnalysisUsage(AU);
- AU.addRequired<AssumptionCacheTracker>();
- AU.addRequired<ScalarEvolutionWrapperPass>();
- AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<DemandedBitsWrapperPass>();
- AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
- AU.addRequired<InjectTLIMappingsLegacy>();
- AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.setPreservesCFG();
- }
-};
-
-} // end anonymous namespace
-
-PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
- auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
- auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
- auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
- auto *AA = &AM.getResult<AAManager>(F);
- auto *LI = &AM.getResult<LoopAnalysis>(F);
- auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- auto *AC = &AM.getResult<AssumptionAnalysis>(F);
- auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
- auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
- if (!Changed)
- return PreservedAnalyses::all();
-
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<AAManager>();
- PA.preserve<GlobalsAA>();
- return PA;
-}
-
-bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
- TargetTransformInfo *TTI_,
+
+ auto E = InstrElementSize.find(V);
+ if (E != InstrElementSize.end())
+ return E->second;
+
+ // If V is not a store, we can traverse the expression tree to find loads
+ // that feed it. The type of the loaded value may indicate a more suitable
+ // width than V's type. We want to base the vector element size on the width
+ // of memory operations where possible.
+ SmallVector<Instruction *, 16> Worklist;
+ SmallPtrSet<Instruction *, 16> Visited;
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ Worklist.push_back(I);
+ Visited.insert(I);
+ }
+
+ // Traverse the expression tree in bottom-up order looking for loads. If we
+ // encounter an instruction we don't yet handle, we give up.
+ auto MaxWidth = 0u;
+ auto FoundUnknownInst = false;
+ while (!Worklist.empty() && !FoundUnknownInst) {
+ auto *I = Worklist.pop_back_val();
+
+ // We should only be looking at scalar instructions here. If the current
+ // instruction has a vector type, give up.
+ auto *Ty = I->getType();
+ if (isa<VectorType>(Ty))
+ FoundUnknownInst = true;
+
+ // If the current instruction is a load, update MaxWidth to reflect the
+ // width of the loaded value.
+ else if (isa<LoadInst>(I))
+ MaxWidth = std::max<unsigned>(MaxWidth, DL->getTypeSizeInBits(Ty));
+
+ // Otherwise, we need to visit the operands of the instruction. We only
+ // handle the interesting cases from buildTree here. If an operand is an
+ // instruction we haven't yet visited, we add it to the worklist.
+ else if (isa<PHINode>(I) || isa<CastInst>(I) || isa<GetElementPtrInst>(I) ||
+ isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
+ for (Use &U : I->operands())
+ if (auto *J = dyn_cast<Instruction>(U.get()))
+ if (Visited.insert(J).second)
+ Worklist.push_back(J);
+ }
+
+ // If we don't yet handle the instruction, give up.
+ else
+ FoundUnknownInst = true;
+ }
+
+ int Width = MaxWidth;
+ // If we didn't encounter a memory access in the expression tree, or if we
+ // gave up for some reason, just return the width of V. Otherwise, return the
+ // maximum width we found.
+ if (!MaxWidth || FoundUnknownInst)
+ Width = DL->getTypeSizeInBits(V->getType());
+
+ for (Instruction *I : Visited)
+ InstrElementSize[I] = Width;
+
+ return Width;
+}
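
The routine above sizes vector elements by the widest load feeding the expression rather than by V's own type, falling back to V's type when the walk meets something it does not understand. A minimal standalone sketch of the same worklist walk over a hypothetical Node type (illustrative only; not LLVM's IR classes):

    #include <algorithm>
    #include <unordered_set>
    #include <vector>

    struct Node {
      enum Kind { Load, Arithmetic, Other } K;
      unsigned Bits;                 // width in bits of the value this node produces
      std::vector<Node *> Operands;  // nodes feeding this one
    };

    // Bottom-up walk: remember the widest load seen; if an unknown node is hit,
    // fall back to the root's own width, mirroring the "give up" path above.
    unsigned elementSizeInBits(Node *Root) {
      std::vector<Node *> Worklist{Root};
      std::unordered_set<Node *> Visited{Root};
      unsigned MaxWidth = 0;
      bool FoundUnknown = false;
      while (!Worklist.empty() && !FoundUnknown) {
        Node *N = Worklist.back();
        Worklist.pop_back();
        if (N->K == Node::Load) {
          MaxWidth = std::max(MaxWidth, N->Bits);
        } else if (N->K == Node::Arithmetic) {
          for (Node *Op : N->Operands)
            if (Visited.insert(Op).second)
              Worklist.push_back(Op);
        } else {
          FoundUnknown = true;
        }
      }
      return (MaxWidth == 0 || FoundUnknown) ? Root->Bits : MaxWidth;
    }

In the real pass the fallback width and the cached results come from the DataLayout and the InstrElementSize map shown above.
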
+
+// Determine if a value V in a vectorizable expression Expr can be demoted to a
+// smaller type with a truncation. We collect the values that will be demoted
+// in ToDemote and additional roots that require investigating in Roots.
+static bool collectValuesToDemote(Value *V, SmallPtrSetImpl<Value *> &Expr,
+ SmallVectorImpl<Value *> &ToDemote,
+ SmallVectorImpl<Value *> &Roots) {
+ // We can always demote constants.
+ if (isa<Constant>(V)) {
+ ToDemote.push_back(V);
+ return true;
+ }
+
+ // If the value is not an instruction in the expression with only one use, it
+ // cannot be demoted.
+ auto *I = dyn_cast<Instruction>(V);
+ if (!I || !I->hasOneUse() || !Expr.count(I))
+ return false;
+
+ switch (I->getOpcode()) {
+
+ // We can always demote truncations and extensions. Since truncations can
+ // seed additional demotion, we save the truncated value.
+ case Instruction::Trunc:
+ Roots.push_back(I->getOperand(0));
+ break;
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ break;
+
+ // We can demote certain binary operations if we can demote both of their
+ // operands.
+ case Instruction::Add:
+ case Instruction::Sub:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ if (!collectValuesToDemote(I->getOperand(0), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(I->getOperand(1), Expr, ToDemote, Roots))
+ return false;
+ break;
+
+ // We can demote selects if we can demote their true and false values.
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ if (!collectValuesToDemote(SI->getTrueValue(), Expr, ToDemote, Roots) ||
+ !collectValuesToDemote(SI->getFalseValue(), Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // We can demote phis if we can demote all their incoming operands. Note that
+ // we don't need to worry about cycles since we ensure single use above.
+ case Instruction::PHI: {
+ PHINode *PN = cast<PHINode>(I);
+ for (Value *IncValue : PN->incoming_values())
+ if (!collectValuesToDemote(IncValue, Expr, ToDemote, Roots))
+ return false;
+ break;
+ }
+
+ // Otherwise, conservatively give up.
+ default:
+ return false;
+ }
+
+ // Record the value that we can demote.
+ ToDemote.push_back(V);
+ return true;
+}
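
As a toy illustration of the recursion (hypothetical node type, not LLVM's classes): for a single-use add whose operands are two zexts, all three values end up in ToDemote, while a trunc instead seeds Roots with its operand for a later pass.

    #include <vector>

    struct Expr {
      enum Kind { Constant, ZExt, SExt, Trunc, Add, Other } K;
      bool SingleUse = true;
      std::vector<Expr *> Operands;  // Trunc/ZExt/SExt use Operands[0]; Add uses both
    };

    // Same accept/reject shape as collectValuesToDemote above, over the toy type.
    bool collectToy(Expr *E, std::vector<Expr *> &ToDemote,
                    std::vector<Expr *> &Roots) {
      if (E->K == Expr::Constant) {
        ToDemote.push_back(E);
        return true;
      }
      if (!E->SingleUse)
        return false;
      switch (E->K) {
      case Expr::Trunc:
        Roots.push_back(E->Operands[0]);  // truncations seed further demotion
        break;
      case Expr::ZExt:
      case Expr::SExt:
        break;
      case Expr::Add:
        for (Expr *Op : E->Operands)
          if (!collectToy(Op, ToDemote, Roots))
            return false;
        break;
      default:
        return false;  // unknown operation: give up conservatively
      }
      ToDemote.push_back(E);
      return true;
    }
    // For add(zext(x), zext(y)) this yields ToDemote = {zext(x), zext(y), add}.
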
+
+void BoUpSLP::computeMinimumValueSizes() {
+ // If there are no external uses, the expression tree must be rooted by a
+ // store. We can't demote in-memory values, so there is nothing to do here.
+ if (ExternalUses.empty())
+ return;
+
+ // We only attempt to truncate integer expressions.
+ auto &TreeRoot = VectorizableTree[0]->Scalars;
+ auto *TreeRootIT = dyn_cast<IntegerType>(TreeRoot[0]->getType());
+ if (!TreeRootIT)
+ return;
+
+ // If the expression is not rooted by a store, these roots should have
+ // external uses. We will rely on InstCombine to rewrite the expression in
+ // the narrower type. However, InstCombine only rewrites single-use values.
+ // This means that if a tree entry other than a root is used externally, it
+ // must have multiple uses and InstCombine will not rewrite it. The code
+ // below ensures that only the roots are used externally.
+ SmallPtrSet<Value *, 32> Expr(TreeRoot.begin(), TreeRoot.end());
+ for (auto &EU : ExternalUses)
+ if (!Expr.erase(EU.Scalar))
+ return;
+ if (!Expr.empty())
+ return;
+
+ // Collect the scalar values of the vectorizable expression. We will use this
+ // context to determine which values can be demoted. If we see a truncation,
+ // we mark it as seeding another demotion.
+ for (auto &EntryPtr : VectorizableTree)
+ Expr.insert(EntryPtr->Scalars.begin(), EntryPtr->Scalars.end());
+
+ // Ensure the roots of the vectorizable tree don't form a cycle. They must
+ // have a single external user that is not in the vectorizable tree.
+ for (auto *Root : TreeRoot)
+ if (!Root->hasOneUse() || Expr.count(*Root->user_begin()))
+ return;
+
+ // Conservatively determine if we can actually truncate the roots of the
+ // expression. Collect the values that can be demoted in ToDemote and
+ // additional roots that require investigating in Roots.
+ SmallVector<Value *, 32> ToDemote;
+ SmallVector<Value *, 4> Roots;
+ for (auto *Root : TreeRoot)
+ if (!collectValuesToDemote(Root, Expr, ToDemote, Roots))
+ return;
+
+ // The maximum bit width required to represent all the values that can be
+ // demoted without loss of precision. It would be safe to truncate the roots
+ // of the expression to this width.
+ auto MaxBitWidth = 8u;
+
+ // We first check if all the bits of the roots are demanded. If they're not,
+ // we can truncate the roots to this narrower type.
+ for (auto *Root : TreeRoot) {
+ auto Mask = DB->getDemandedBits(cast<Instruction>(Root));
+ MaxBitWidth = std::max<unsigned>(
+ Mask.getBitWidth() - Mask.countLeadingZeros(), MaxBitWidth);
+ }
+
+ // True if the roots can be zero-extended back to their original type, rather
+ // than sign-extended. We know that if the leading bits are not demanded, we
+ // can safely zero-extend. So we initialize IsKnownPositive to True.
+ bool IsKnownPositive = true;
+
+ // If all the bits of the roots are demanded, we can try a little harder to
+ // compute a narrower type. This can happen, for example, if the roots are
+ // getelementptr indices. InstCombine promotes these indices to the pointer
+ // width. Thus, all their bits are technically demanded even though the
+ // address computation might be vectorized in a smaller type.
+ //
+ // We start by looking at each entry that can be demoted. We compute the
+ // maximum bit width required to store the scalar by using ValueTracking to
+ // compute the number of high-order bits we can truncate.
+ if (MaxBitWidth == DL->getTypeSizeInBits(TreeRoot[0]->getType()) &&
+ llvm::all_of(TreeRoot, [](Value *R) {
+ assert(R->hasOneUse() && "Root should have only one use!");
+ return isa<GetElementPtrInst>(R->user_back());
+ })) {
+ MaxBitWidth = 8u;
+
+ // Determine if the sign bit of all the roots is known to be zero. If not,
+ // IsKnownPositive is set to False.
+ IsKnownPositive = llvm::all_of(TreeRoot, [&](Value *R) {
+ KnownBits Known = computeKnownBits(R, *DL);
+ return Known.isNonNegative();
+ });
+
+ // Determine the maximum number of bits required to store the scalar
+ // values.
+ for (auto *Scalar : ToDemote) {
+ auto NumSignBits = ComputeNumSignBits(Scalar, *DL, 0, AC, nullptr, DT);
+ auto NumTypeBits = DL->getTypeSizeInBits(Scalar->getType());
+ MaxBitWidth = std::max<unsigned>(NumTypeBits - NumSignBits, MaxBitWidth);
+ }
+
+ // If we can't prove that the sign bit is zero, we must add one to the
+ // maximum bit width to account for the unknown sign bit. This preserves
+ // the existing sign bit so we can safely sign-extend the root back to the
+ // original type. Otherwise, if we know the sign bit is zero, we will
+ // zero-extend the root instead.
+ //
+ // FIXME: This is somewhat suboptimal, as there will be cases where adding
+ // one to the maximum bit width will yield a larger-than-necessary
+ // type. In general, we need to add an extra bit only if we can't
+ // prove that the upper bit of the original type is equal to the
+ // upper bit of the proposed smaller type. If these two bits are the
+ // same (either zero or one) we know that sign-extending from the
+ // smaller type will result in the same value. Here, since we can't
+ // yet prove this, we are just making the proposed smaller type
+ // larger to ensure correctness.
+ if (!IsKnownPositive)
+ ++MaxBitWidth;
+ }
+
+ // Round MaxBitWidth up to the next power-of-two.
+ if (!isPowerOf2_64(MaxBitWidth))
+ MaxBitWidth = NextPowerOf2(MaxBitWidth);
+
+ // If the maximum bit width we compute is less than the width of the roots'
+ // type, we can proceed with the narrowing. Otherwise, do nothing.
+ if (MaxBitWidth >= TreeRootIT->getBitWidth())
+ return;
+
+ // If we can truncate the root, we must collect additional values that might
+ // be demoted as a result. That is, those seeded by truncations we will
+ // modify.
+ while (!Roots.empty())
+ collectValuesToDemote(Roots.pop_back_val(), Expr, ToDemote, Roots);
+
+ // Finally, map the values we can demote to the maximum bit width we computed.
+ for (auto *Scalar : ToDemote)
+ MinBWs[Scalar] = std::make_pair(MaxBitWidth, !IsKnownPositive);
+}
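
A numeric walk-through of the width computation above, using hypothetical values (the real inputs come from DemandedBits and ValueTracking): the roots are i32 GEP indices, ValueTracking reports 22 known sign bits, and the sign bit cannot be proven zero, so the tree is demoted to i16 and sign-extended back.

    #include <algorithm>
    #include <cassert>

    // Plays the role of the llvm::NextPowerOf2 rounding in the code above.
    static unsigned roundUpToPowerOfTwo(unsigned N) {
      unsigned P = 1;
      while (P < N)
        P <<= 1;
      return P;
    }

    int main() {
      const unsigned RootBits = 32;  // the roots are i32
      unsigned MaxBitWidth = 8;      // floor used by the pass

      // Hypothetical ValueTracking result: 22 known sign bits, so only
      // 32 - 22 = 10 bits carry information.
      const unsigned NumSignBits = 22;
      MaxBitWidth = std::max(MaxBitWidth, RootBits - NumSignBits);  // 10

      // The sign bit is not provably zero, so keep one extra bit; sign-extending
      // the narrowed value then reproduces the original.
      const bool IsKnownPositive = false;
      if (!IsKnownPositive)
        ++MaxBitWidth;  // 11

      MaxBitWidth = roundUpToPowerOfTwo(MaxBitWidth);  // 16
      assert(MaxBitWidth < RootBits);  // 16 < 32: demotion to i16 is worthwhile
      return 0;
    }
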
+
+namespace {
+
+/// The SLPVectorizer Pass.
+struct SLPVectorizer : public FunctionPass {
+ SLPVectorizerPass Impl;
+
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ explicit SLPVectorizer() : FunctionPass(ID) {
+ initializeSLPVectorizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override {
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto *TLIP = getAnalysisIfAvailable<TargetLibraryInfoWrapperPass>();
+ auto *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
+ auto *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ auto *DB = &getAnalysis<DemandedBitsWrapperPass>().getDemandedBits();
+ auto *ORE = &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+ return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<DemandedBitsWrapperPass>();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.setPreservesCFG();
+ }
+};
+
+} // end anonymous namespace
+
+PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto *TTI = &AM.getResult<TargetIRAnalysis>(F);
+ auto *TLI = AM.getCachedResult<TargetLibraryAnalysis>(F);
+ auto *AA = &AM.getResult<AAManager>(F);
+ auto *LI = &AM.getResult<LoopAnalysis>(F);
+ auto *DT = &AM.getResult<DominatorTreeAnalysis>(F);
+ auto *AC = &AM.getResult<AssumptionAnalysis>(F);
+ auto *DB = &AM.getResult<DemandedBitsAnalysis>(F);
+ auto *ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
+ bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE);
+ if (!Changed)
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<GlobalsAA>();
+ return PA;
+}
+
+bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
+ TargetTransformInfo *TTI_,
TargetLibraryInfo *TLI_, AAResults *AA_,
- LoopInfo *LI_, DominatorTree *DT_,
- AssumptionCache *AC_, DemandedBits *DB_,
- OptimizationRemarkEmitter *ORE_) {
- if (!RunSLPVectorization)
- return false;
- SE = SE_;
- TTI = TTI_;
- TLI = TLI_;
- AA = AA_;
- LI = LI_;
- DT = DT_;
- AC = AC_;
- DB = DB_;
- DL = &F.getParent()->getDataLayout();
-
- Stores.clear();
- GEPs.clear();
- bool Changed = false;
-
- // If the target claims to have no vector registers don't attempt
- // vectorization.
- if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
- return false;
-
- // Don't vectorize when the attribute NoImplicitFloat is used.
- if (F.hasFnAttribute(Attribute::NoImplicitFloat))
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
-
- // Use the bottom-up SLP vectorizer to construct chains that start with
- // store instructions.
- BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
-
- // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
- // delete instructions.
-
- // Scan the blocks in the function in post order.
- for (auto BB : post_order(&F.getEntryBlock())) {
- collectSeedInstructions(BB);
-
- // Vectorize trees that end at stores.
- if (!Stores.empty()) {
- LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
- << " underlying objects.\n");
- Changed |= vectorizeStoreChains(R);
- }
-
- // Vectorize trees that end at reductions.
- Changed |= vectorizeChainsInBlock(BB, R);
-
- // Vectorize the index computations of getelementptr instructions. This
- // is primarily intended to catch gather-like idioms ending at
- // non-consecutive loads.
- if (!GEPs.empty()) {
- LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
- << " underlying objects.\n");
- Changed |= vectorizeGEPIndices(BB, R);
- }
- }
-
- if (Changed) {
- R.optimizeGatherSequence();
- LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
- unsigned Idx) {
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
- << "\n");
- const unsigned Sz = R.getVectorElementSize(Chain[0]);
- const unsigned MinVF = R.getMinVecRegSize() / Sz;
- unsigned VF = Chain.size();
-
- if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
- << "\n");
-
- R.buildTree(Chain);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: Handle orders of size less than number of elements in the vector.
- if (Order && Order->size() == Chain.size()) {
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
- llvm::transform(*Order, ReorderedOps.begin(),
- [Chain](const unsigned Idx) { return Chain[Idx]; });
- R.buildTree(ReorderedOps);
- }
- if (R.isTreeTinyAndNotFullyVectorizable())
- return false;
- if (R.isLoadCombineCandidate())
- return false;
-
- R.computeMinimumValueSizes();
-
+ LoopInfo *LI_, DominatorTree *DT_,
+ AssumptionCache *AC_, DemandedBits *DB_,
+ OptimizationRemarkEmitter *ORE_) {
+ if (!RunSLPVectorization)
+ return false;
+ SE = SE_;
+ TTI = TTI_;
+ TLI = TLI_;
+ AA = AA_;
+ LI = LI_;
+ DT = DT_;
+ AC = AC_;
+ DB = DB_;
+ DL = &F.getParent()->getDataLayout();
+
+ Stores.clear();
+ GEPs.clear();
+ bool Changed = false;
+
+ // If the target claims to have no vector registers don't attempt
+ // vectorization.
+ if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)))
+ return false;
+
+ // Don't vectorize when the attribute NoImplicitFloat is used.
+ if (F.hasFnAttribute(Attribute::NoImplicitFloat))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing blocks in " << F.getName() << ".\n");
+
+ // Use the bottom-up SLP vectorizer to construct chains that start with
+ // store instructions.
+ BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_);
+
+ // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to
+ // delete instructions.
+
+ // Scan the blocks in the function in post order.
+ for (auto BB : post_order(&F.getEntryBlock())) {
+ collectSeedInstructions(BB);
+
+ // Vectorize trees that end at stores.
+ if (!Stores.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found stores for " << Stores.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeStoreChains(R);
+ }
+
+ // Vectorize trees that end at reductions.
+ Changed |= vectorizeChainsInBlock(BB, R);
+
+ // Vectorize the index computations of getelementptr instructions. This
+ // is primarily intended to catch gather-like idioms ending at
+ // non-consecutive loads.
+ if (!GEPs.empty()) {
+ LLVM_DEBUG(dbgs() << "SLP: Found GEPs for " << GEPs.size()
+ << " underlying objects.\n");
+ Changed |= vectorizeGEPIndices(BB, R);
+ }
+ }
+
+ if (Changed) {
+ R.optimizeGatherSequence();
+ LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
+ unsigned Idx) {
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length " << Chain.size()
+ << "\n");
+ const unsigned Sz = R.getVectorElementSize(Chain[0]);
+ const unsigned MinVF = R.getMinVecRegSize() / Sz;
+ unsigned VF = Chain.size();
+
+ if (!isPowerOf2_32(Sz) || !isPowerOf2_32(VF) || VF < 2 || VF < MinVF)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << VF << " stores at offset " << Idx
+ << "\n");
+
+ R.buildTree(Chain);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: Handle orders of size less than number of elements in the vector.
+ if (Order && Order->size() == Chain.size()) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(Chain.rbegin(), Chain.rend());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [Chain](const unsigned Idx) { return Chain[Idx]; });
+ R.buildTree(ReorderedOps);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ return false;
+ if (R.isLoadCombineCandidate())
+ return false;
+
+ R.computeMinimumValueSizes();
+
InstructionCost Cost = R.getTreeCost();
-
+
LLVM_DEBUG(dbgs() << "SLP: Found cost = " << Cost << " for VF =" << VF << "\n");
- if (Cost < -SLPCostThreshold) {
+ if (Cost < -SLPCostThreshold) {
LLVM_DEBUG(dbgs() << "SLP: Decided to vectorize cost = " << Cost << "\n");
-
- using namespace ore;
-
- R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
- cast<StoreInst>(Chain[0]))
- << "Stores SLP vectorized with cost " << NV("Cost", Cost)
- << " and with tree size "
- << NV("TreeSize", R.getTreeSize()));
-
- R.vectorizeTree();
- return true;
- }
-
- return false;
-}
-
-bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
- BoUpSLP &R) {
- // We may run into multiple chains that merge into a single chain. We mark the
- // stores that we vectorized so that we don't visit the same store twice.
- BoUpSLP::ValueSet VectorizedStores;
- bool Changed = false;
-
- int E = Stores.size();
- SmallBitVector Tails(E, false);
- SmallVector<int, 16> ConsecutiveChain(E, E + 1);
- int MaxIter = MaxStoreLookup.getValue();
- int IterCnt;
- auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
- &ConsecutiveChain](int K, int Idx) {
- if (IterCnt >= MaxIter)
- return true;
- ++IterCnt;
- if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
- return false;
-
- Tails.set(Idx);
- ConsecutiveChain[K] = Idx;
- return true;
- };
- // Do a quadratic search on all of the given stores in reverse order and find
- // all of the pairs of stores that follow each other.
- for (int Idx = E - 1; Idx >= 0; --Idx) {
- // If a store has multiple consecutive store candidates, search according
- // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
- // This is because pairing with the immediately succeeding or preceding
- // candidate usually creates the best chance to find an SLP vectorization
- // opportunity.
- const int MaxLookDepth = std::max(E - Idx, Idx + 1);
- IterCnt = 0;
- for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
- if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
- (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
- break;
- }
-
- // For stores that start but don't end a link in the chain:
- for (int Cnt = E; Cnt > 0; --Cnt) {
- int I = Cnt - 1;
- if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
- continue;
- // We found a store instr that starts a chain. Now follow the chain and try
- // to vectorize it.
- BoUpSLP::ValueList Operands;
- // Collect the chain into a list.
- while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
- Operands.push_back(Stores[I]);
- // Move to the next value in the chain.
- I = ConsecutiveChain[I];
- }
-
- // Skip chains whose element size does not evenly divide the vector register size.
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
+
+ using namespace ore;
+
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized",
+ cast<StoreInst>(Chain[0]))
+ << "Stores SLP vectorized with cost " << NV("Cost", Cost)
+ << " and with tree size "
+ << NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ return true;
+ }
+
+ return false;
+}
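
A quick numeric check of the guard at the top of this routine, with hypothetical sizes: a 128-bit minimum vector register and 32-bit elements give MinVF = 4, so a two-store chain is rejected while a four-store chain is analyzed.

    #include <cassert>

    static bool isPow2(unsigned X) { return X != 0 && (X & (X - 1)) == 0; }

    int main() {
      const unsigned MinVecRegSizeBits = 128;  // assumed target minimum vector width
      const unsigned ElementSizeBits = 32;     // width from getVectorElementSize
      const unsigned MinVF = MinVecRegSizeBits / ElementSizeBits;  // 4

      auto worthAnalyzing = [&](unsigned VF) {
        return isPow2(ElementSizeBits) && isPow2(VF) && VF >= 2 && VF >= MinVF;
      };

      assert(!worthAnalyzing(2));  // too short to fill the 128-bit register
      assert(worthAnalyzing(4));   // exactly one vector register
      return 0;
    }
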
+
+bool SLPVectorizerPass::vectorizeStores(ArrayRef<StoreInst *> Stores,
+ BoUpSLP &R) {
+ // We may run into multiple chains that merge into a single chain. We mark the
+ // stores that we vectorized so that we don't visit the same store twice.
+ BoUpSLP::ValueSet VectorizedStores;
+ bool Changed = false;
+
+ int E = Stores.size();
+ SmallBitVector Tails(E, false);
+ SmallVector<int, 16> ConsecutiveChain(E, E + 1);
+ int MaxIter = MaxStoreLookup.getValue();
+ int IterCnt;
+ auto &&FindConsecutiveAccess = [this, &Stores, &Tails, &IterCnt, MaxIter,
+ &ConsecutiveChain](int K, int Idx) {
+ if (IterCnt >= MaxIter)
+ return true;
+ ++IterCnt;
+ if (!isConsecutiveAccess(Stores[K], Stores[Idx], *DL, *SE))
+ return false;
+
+ Tails.set(Idx);
+ ConsecutiveChain[K] = Idx;
+ return true;
+ };
+ // Do a quadratic search on all of the given stores in reverse order and find
+ // all of the pairs of stores that follow each other.
+ for (int Idx = E - 1; Idx >= 0; --Idx) {
+ // If a store has multiple consecutive store candidates, search according
+ // to the sequence: Idx-1, Idx+1, Idx-2, Idx+2, ...
+ // This is because pairing with the immediately succeeding or preceding
+ // candidate usually creates the best chance to find an SLP vectorization
+ // opportunity.
+ const int MaxLookDepth = std::max(E - Idx, Idx + 1);
+ IterCnt = 0;
+ for (int Offset = 1, F = MaxLookDepth; Offset < F; ++Offset)
+ if ((Idx >= Offset && FindConsecutiveAccess(Idx - Offset, Idx)) ||
+ (Idx + Offset < E && FindConsecutiveAccess(Idx + Offset, Idx)))
+ break;
+ }
+
+ // For stores that start but don't end a link in the chain:
+ for (int Cnt = E; Cnt > 0; --Cnt) {
+ int I = Cnt - 1;
+ if (ConsecutiveChain[I] == E + 1 || Tails.test(I))
+ continue;
+ // We found a store instr that starts a chain. Now follow the chain and try
+ // to vectorize it.
+ BoUpSLP::ValueList Operands;
+ // Collect the chain into a list.
+ while (I != E + 1 && !VectorizedStores.count(Stores[I])) {
+ Operands.push_back(Stores[I]);
+ // Move to the next value in the chain.
+ I = ConsecutiveChain[I];
+ }
+
+ // Skip chains whose element size does not evenly divide the vector register size.
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
unsigned EltSize = R.getVectorElementSize(Operands[0]);
- if (MaxVecRegSize % EltSize != 0)
- continue;
-
- unsigned MaxElts = MaxVecRegSize / EltSize;
- // FIXME: Is division-by-2 the correct step? Should we assert that the
- // register size is a power-of-2?
- unsigned StartIdx = 0;
- for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
- for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
- ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
- if (!VectorizedStores.count(Slice.front()) &&
- !VectorizedStores.count(Slice.back()) &&
- vectorizeStoreChain(Slice, R, Cnt)) {
- // Mark the vectorized stores so that we don't vectorize them again.
- VectorizedStores.insert(Slice.begin(), Slice.end());
- Changed = true;
- // If we vectorized initial block, no need to try to vectorize it
- // again.
- if (Cnt == StartIdx)
- StartIdx += Size;
- Cnt += Size;
- continue;
- }
- ++Cnt;
- }
- // Check if the whole array was vectorized already - exit.
- if (StartIdx >= Operands.size())
- break;
- }
- }
-
- return Changed;
-}
-
-void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
- // Initialize the collections. We will make a single pass over the block.
- Stores.clear();
- GEPs.clear();
-
- // Visit the store and getelementptr instructions in BB and organize them in
- // Stores and GEPs according to the underlying objects of their pointer
- // operands.
- for (Instruction &I : *BB) {
- // Ignore store instructions that are volatile or have a pointer operand
- // that doesn't point to a scalar type.
- if (auto *SI = dyn_cast<StoreInst>(&I)) {
- if (!SI->isSimple())
- continue;
- if (!isValidElementType(SI->getValueOperand()->getType()))
- continue;
+ if (MaxVecRegSize % EltSize != 0)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ // FIXME: Is division-by-2 the correct step? Should we assert that the
+ // register size is a power-of-2?
+ unsigned StartIdx = 0;
+ for (unsigned Size = llvm::PowerOf2Ceil(MaxElts); Size >= 2; Size /= 2) {
+ for (unsigned Cnt = StartIdx, E = Operands.size(); Cnt + Size <= E;) {
+ ArrayRef<Value *> Slice = makeArrayRef(Operands).slice(Cnt, Size);
+ if (!VectorizedStores.count(Slice.front()) &&
+ !VectorizedStores.count(Slice.back()) &&
+ vectorizeStoreChain(Slice, R, Cnt)) {
+ // Mark the vectorized stores so that we don't vectorize them again.
+ VectorizedStores.insert(Slice.begin(), Slice.end());
+ Changed = true;
+ // If we vectorized initial block, no need to try to vectorize it
+ // again.
+ if (Cnt == StartIdx)
+ StartIdx += Size;
+ Cnt += Size;
+ continue;
+ }
+ ++Cnt;
+ }
+ // Check if the whole array was vectorized already - exit.
+ if (StartIdx >= Operands.size())
+ break;
+ }
+ }
+
+ return Changed;
+}
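
The pairing search above visits candidates in the order Idx-1, Idx+1, Idx-2, Idx+2, ... so that the closest stores are tried first. A small sketch of just that probe order (toy predicate; the real loop also enforces the MaxIter budget and the consecutive-access check):

    #include <algorithm>
    #include <cstdio>
    #include <functional>

    // Visit candidate indices around Idx in the order Idx-1, Idx+1, Idx-2, ...
    // stopping at the first candidate the predicate accepts.
    static void probeNeighbors(int Idx, int Count,
                               const std::function<bool(int)> &Probe) {
      const int MaxLookDepth = std::max(Count - Idx, Idx + 1);
      for (int Offset = 1; Offset < MaxLookDepth; ++Offset) {
        if (Idx - Offset >= 0 && Probe(Idx - Offset))
          return;
        if (Idx + Offset < Count && Probe(Idx + Offset))
          return;
      }
    }

    int main() {
      // Prints "3 5 2 6 1 7 0" for Idx = 4 in an 8-element range.
      probeNeighbors(4, 8, [](int Candidate) {
        std::printf("%d ", Candidate);
        return false;  // never accept, so the full order is visible
      });
      std::printf("\n");
      return 0;
    }
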
+
+void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
+ // Initialize the collections. We will make a single pass over the block.
+ Stores.clear();
+ GEPs.clear();
+
+ // Visit the store and getelementptr instructions in BB and organize them in
+ // Stores and GEPs according to the underlying objects of their pointer
+ // operands.
+ for (Instruction &I : *BB) {
+ // Ignore store instructions that are volatile or have a pointer operand
+ // that doesn't point to a scalar type.
+ if (auto *SI = dyn_cast<StoreInst>(&I)) {
+ if (!SI->isSimple())
+ continue;
+ if (!isValidElementType(SI->getValueOperand()->getType()))
+ continue;
Stores[getUnderlyingObject(SI->getPointerOperand())].push_back(SI);
- }
-
- // Ignore getelementptr instructions that have more than one index, a
- // constant index, or a pointer operand that doesn't point to a scalar
- // type.
- else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- auto Idx = GEP->idx_begin()->get();
- if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
- continue;
- if (!isValidElementType(Idx->getType()))
- continue;
- if (GEP->getType()->isVectorTy())
- continue;
- GEPs[GEP->getPointerOperand()].push_back(GEP);
- }
- }
-}
-
-bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
- if (!A || !B)
- return false;
- Value *VL[] = {A, B};
- return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
-}
-
-bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- bool AllowReorder,
- ArrayRef<Value *> InsertUses) {
- if (VL.size() < 2)
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
- << VL.size() << ".\n");
-
- // Check that all of the parts are instructions of the same type;
- // we permit an alternate opcode via InstructionsState.
- InstructionsState S = getSameOpcode(VL);
- if (!S.getOpcode())
- return false;
-
- Instruction *I0 = cast<Instruction>(S.OpValue);
- // Make sure invalid types (including vector type) are rejected before
- // determining vectorization factor for scalar instructions.
- for (Value *V : VL) {
- Type *Ty = V->getType();
- if (!isValidElementType(Ty)) {
- // NOTE: the following will give the user an internal LLVM type name, which
- // may not be useful.
- R.getORE()->emit([&]() {
- std::string type_str;
- llvm::raw_string_ostream rso(type_str);
- Ty->print(rso);
- return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
- << "Cannot SLP vectorize list: type "
- << rso.str() + " is unsupported by vectorizer";
- });
- return false;
- }
- }
-
- unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ }
+
+ // Ignore getelementptr instructions that have more than one index, a
+ // constant index, or a pointer operand that doesn't point to a scalar
+ // type.
+ else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+ auto Idx = GEP->idx_begin()->get();
+ if (GEP->getNumIndices() > 1 || isa<Constant>(Idx))
+ continue;
+ if (!isValidElementType(Idx->getType()))
+ continue;
+ if (GEP->getType()->isVectorTy())
+ continue;
+ GEPs[GEP->getPointerOperand()].push_back(GEP);
+ }
+ }
+}
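
The collection above buckets seed stores by the underlying object of their pointer operand, so that later chain formation only compares stores into the same base object. A minimal sketch of that bucketing with a stand-in for getUnderlyingObject:

    #include <string>
    #include <unordered_map>
    #include <vector>

    struct ToyStore {
      std::string Base;  // stands in for getUnderlyingObject(pointer operand)
      int Offset;        // byte offset within the object
    };

    using SeedMap = std::unordered_map<std::string, std::vector<const ToyStore *>>;

    // One pass over the "block"; volatile or oddly typed stores would be
    // skipped here, as in the routine above.
    SeedMap collectSeeds(const std::vector<ToyStore> &Block) {
      SeedMap Stores;
      for (const ToyStore &S : Block)
        Stores[S.Base].push_back(&S);
      return Stores;
    }
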
+
+bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
+ if (!A || !B)
+ return false;
+ Value *VL[] = {A, B};
+ return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
+}
+
+bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
+ bool AllowReorder,
+ ArrayRef<Value *> InsertUses) {
+ if (VL.size() < 2)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
+ << VL.size() << ".\n");
+
+ // Check that all of the parts are instructions of the same type;
+ // we permit an alternate opcode via InstructionsState.
+ InstructionsState S = getSameOpcode(VL);
+ if (!S.getOpcode())
+ return false;
+
+ Instruction *I0 = cast<Instruction>(S.OpValue);
+ // Make sure invalid types (including vector type) are rejected before
+ // determining vectorization factor for scalar instructions.
+ for (Value *V : VL) {
+ Type *Ty = V->getType();
+ if (!isValidElementType(Ty)) {
+ // NOTE: the following will give the user an internal LLVM type name, which
+ // may not be useful.
+ R.getORE()->emit([&]() {
+ std::string type_str;
+ llvm::raw_string_ostream rso(type_str);
+ Ty->print(rso);
+ return OptimizationRemarkMissed(SV_NAME, "UnsupportedType", I0)
+ << "Cannot SLP vectorize list: type "
+ << rso.str() + " is unsupported by vectorizer";
+ });
+ return false;
+ }
+ }
+
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
MaxVF = std::min(R.getMaximumVF(Sz, S.getOpcode()), MaxVF);
- if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
- }
-
- bool Changed = false;
- bool CandidateFound = false;
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
+ bool Changed = false;
+ bool CandidateFound = false;
InstructionCost MinCost = SLPCostThreshold.getValue();
-
- bool CompensateUseCost =
- !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
- return V && isa<InsertElementInst>(V);
- });
- assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
- "Each scalar expected to have an associated InsertElement user.");
-
- unsigned NextInst = 0, MaxInst = VL.size();
- for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
- // No actual vectorization should happen if the number of parts is the same
- // as the provided vectorization factor (i.e. the scalar type is used for
- // vector code during codegen).
- auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
- if (TTI->getNumberOfParts(VecTy) == VF)
- continue;
- for (unsigned I = NextInst; I < MaxInst; ++I) {
- unsigned OpsWidth = 0;
-
- if (I + VF > MaxInst)
- OpsWidth = MaxInst - I;
- else
- OpsWidth = VF;
-
- if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
- break;
-
- ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
- // Check that a previous iteration of this loop did not delete the Value.
- if (llvm::any_of(Ops, [&R](Value *V) {
- auto *I = dyn_cast<Instruction>(V);
- return I && R.isDeleted(I);
- }))
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
- << "\n");
-
- R.buildTree(Ops);
- Optional<ArrayRef<unsigned>> Order = R.bestOrder();
- // TODO: check if we can allow reordering for more cases.
- if (AllowReorder && Order) {
- // TODO: reorder tree nodes without tree rebuilding.
- // Conceptually, there is nothing actually preventing us from trying to
- // reorder a larger list. In fact, we do exactly this when vectorizing
- // reductions. However, at this point, we only expect to get here when
- // there are exactly two operations.
- assert(Ops.size() == 2);
- Value *ReorderedOps[] = {Ops[1], Ops[0]};
- R.buildTree(ReorderedOps, None);
- }
- if (R.isTreeTinyAndNotFullyVectorizable())
- continue;
-
- R.computeMinimumValueSizes();
+
+ bool CompensateUseCost =
+ !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+ return V && isa<InsertElementInst>(V);
+ });
+ assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+ "Each scalar expected to have an associated InsertElement user.");
+
+ unsigned NextInst = 0, MaxInst = VL.size();
+ for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
+ // No actual vectorization should happen if the number of parts is the same
+ // as the provided vectorization factor (i.e. the scalar type is used for
+ // vector code during codegen).
+ auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
+ if (TTI->getNumberOfParts(VecTy) == VF)
+ continue;
+ for (unsigned I = NextInst; I < MaxInst; ++I) {
+ unsigned OpsWidth = 0;
+
+ if (I + VF > MaxInst)
+ OpsWidth = MaxInst - I;
+ else
+ OpsWidth = VF;
+
+ if (!isPowerOf2_32(OpsWidth) || OpsWidth < 2)
+ break;
+
+ ArrayRef<Value *> Ops = VL.slice(I, OpsWidth);
+ // Check that a previous iteration of this loop did not delete the Value.
+ if (llvm::any_of(Ops, [&R](Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ return I && R.isDeleted(I);
+ }))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing " << OpsWidth << " operations "
+ << "\n");
+
+ R.buildTree(Ops);
+ Optional<ArrayRef<unsigned>> Order = R.bestOrder();
+ // TODO: check if we can allow reordering for more cases.
+ if (AllowReorder && Order) {
+ // TODO: reorder tree nodes without tree rebuilding.
+ // Conceptually, there is nothing actually preventing us from trying to
+ // reorder a larger list. In fact, we do exactly this when vectorizing
+ // reductions. However, at this point, we only expect to get here when
+ // there are exactly two operations.
+ assert(Ops.size() == 2);
+ Value *ReorderedOps[] = {Ops[1], Ops[0]};
+ R.buildTree(ReorderedOps, None);
+ }
+ if (R.isTreeTinyAndNotFullyVectorizable())
+ continue;
+
+ R.computeMinimumValueSizes();
InstructionCost Cost = R.getTreeCost();
- CandidateFound = true;
- if (CompensateUseCost) {
- // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
- // rather than sum of single inserts as the latter may overestimate
- // cost. This work should imply improving cost estimation for extracts
- // that are added for external (to the vectorization tree) users, i.e. that
- // part should also switch to the same interface.
- // For example, the following case is projected code after SLP:
- // %4 = extractelement <4 x i64> %3, i32 0
+ CandidateFound = true;
+ if (CompensateUseCost) {
+ // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
+ // rather than sum of single inserts as the latter may overestimate
+ // cost. This work should imply improving cost estimation for extracts
+ // that are added for external (to the vectorization tree) users, i.e. that
+ // part should also switch to the same interface.
+ // For example, the following case is projected code after SLP:
+ // %4 = extractelement <4 x i64> %3, i32 0
// %v0 = insertelement <4 x i64> poison, i64 %4, i32 0
- // %5 = extractelement <4 x i64> %3, i32 1
- // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
- // %6 = extractelement <4 x i64> %3, i32 2
- // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
- // %7 = extractelement <4 x i64> %3, i32 3
- // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
- //
- // The extracts here are added by SLP to feed the users (the inserts) of
- // the original scalars and contribute to "ExtractCost" during cost
- // evaluation. The inserts in turn form a sequence that builds an
- // aggregate, which is detected by the findBuildAggregate routine.
- // SLP assumes that such a sequence will be optimized away later (by
- // InstCombine), so it tries to compensate ExtractCost with the cost of
- // the insert sequence.
- // The current per-element cost calculation is not quite accurate and
- // tends to bias the decision toward vectorization.
- // Switching to the TTI interface might help a bit.
- // Alternative solution could be pattern-match to detect a no-op or
- // shuffle.
+ // %5 = extractelement <4 x i64> %3, i32 1
+ // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+ // %6 = extractelement <4 x i64> %3, i32 2
+ // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+ // %7 = extractelement <4 x i64> %3, i32 3
+ // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+ //
+ // The extracts here are added by SLP to feed the users (the inserts) of
+ // the original scalars and contribute to "ExtractCost" during cost
+ // evaluation. The inserts in turn form a sequence that builds an
+ // aggregate, which is detected by the findBuildAggregate routine.
+ // SLP assumes that such a sequence will be optimized away later (by
+ // InstCombine), so it tries to compensate ExtractCost with the cost of
+ // the insert sequence.
+ // The current per-element cost calculation is not quite accurate and
+ // tends to bias the decision toward vectorization.
+ // Switching to the TTI interface might help a bit.
+ // Alternative solution could be pattern-match to detect a no-op or
+ // shuffle.
InstructionCost UserCost = 0;
- for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
- auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
- UserCost += TTI->getVectorInstrCost(
- Instruction::InsertElement, IE->getType(), CI->getZExtValue());
- }
- LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
- << ".\n");
- Cost -= UserCost;
- }
-
- MinCost = std::min(MinCost, Cost);
-
- if (Cost < -SLPCostThreshold) {
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
- R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
- cast<Instruction>(Ops[0]))
- << "SLP vectorized with cost " << ore::NV("Cost", Cost)
- << " and with tree size "
- << ore::NV("TreeSize", R.getTreeSize()));
-
- R.vectorizeTree();
- // Move to the next bundle.
- I += VF - 1;
- NextInst = I + 1;
- Changed = true;
- }
- }
- }
-
- if (!Changed && CandidateFound) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
- << "List vectorization was possible but not beneficial with cost "
- << ore::NV("Cost", MinCost) << " >= "
- << ore::NV("Treshold", -SLPCostThreshold);
- });
- } else if (!Changed) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
- << "Cannot SLP vectorize list: vectorization was impossible"
- << " with available vectorization factors";
- });
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
- return false;
-
- Value *P = I->getParent();
-
- // Vectorize in current basic block only.
- auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
- auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
- if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
- return false;
-
- // Try to vectorize the pair of operands.
- if (tryToVectorizePair(Op0, Op1, R))
- return true;
-
- auto *A = dyn_cast<BinaryOperator>(Op0);
- auto *B = dyn_cast<BinaryOperator>(Op1);
- // Try to skip B.
- if (B && B->hasOneUse()) {
- auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
- auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
- if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
- return true;
- if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
- return true;
- }
-
- // Try to skip A.
- if (A && A->hasOneUse()) {
- auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
- auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
- if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
- return true;
- if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
- return true;
- }
- return false;
-}
-
-namespace {
-
-/// Model horizontal reductions.
-///
+ for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+ auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ UserCost += TTI->getVectorInstrCost(
+ Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+ << ".\n");
+ Cost -= UserCost;
+ }
+
+ MinCost = std::min(MinCost, Cost);
+
+ if (Cost < -SLPCostThreshold) {
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n");
+ R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList",
+ cast<Instruction>(Ops[0]))
+ << "SLP vectorized with cost " << ore::NV("Cost", Cost)
+ << " and with tree size "
+ << ore::NV("TreeSize", R.getTreeSize()));
+
+ R.vectorizeTree();
+ // Move to the next bundle.
+ I += VF - 1;
+ NextInst = I + 1;
+ Changed = true;
+ }
+ }
+ }
+
+ if (!Changed && CandidateFound) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotBeneficial", I0)
+ << "List vectorization was possible but not beneficial with cost "
+ << ore::NV("Cost", MinCost) << " >= "
+ << ore::NV("Treshold", -SLPCostThreshold);
+ });
+ } else if (!Changed) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "NotPossible", I0)
+ << "Cannot SLP vectorize list: vectorization was impossible"
+ << " with available vectorization factors";
+ });
+ }
+ return Changed;
+}
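
To make the insert-cost compensation above concrete, a toy calculation with hypothetical numbers (the real per-lane costs come from TTI::getVectorInstrCost): the tree cost is charged for the extracts feeding external insertelement users, so the cost of that insert sequence is subtracted before the threshold comparison.

    #include <cassert>
    #include <vector>

    int main() {
      // Pretend the tree cost came out slightly positive because of the extracts
      // added for the external insertelement users.
      int TreeCost = 2;
      const int SLPCostThreshold = 0;  // vectorize when cost < -threshold

      // Hypothetical per-lane cost of each insertelement fed by the tree.
      const std::vector<int> InsertLaneCost = {1, 1, 1, 1};
      int UserCost = 0;
      for (int C : InsertLaneCost)
        UserCost += C;

      TreeCost -= UserCost;  // 2 - 4 = -2
      assert(TreeCost < -SLPCostThreshold && "compensation tips the decision");
      return 0;
    }
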
+
+bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I) && !isa<CmpInst>(I))
+ return false;
+
+ Value *P = I->getParent();
+
+ // Vectorize in current basic block only.
+ auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
+ auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
+ if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+ return false;
+
+ // Try to vectorize the pair of operands.
+ if (tryToVectorizePair(Op0, Op1, R))
+ return true;
+
+ auto *A = dyn_cast<BinaryOperator>(Op0);
+ auto *B = dyn_cast<BinaryOperator>(Op1);
+ // Try to skip B.
+ if (B && B->hasOneUse()) {
+ auto *B0 = dyn_cast<BinaryOperator>(B->getOperand(0));
+ auto *B1 = dyn_cast<BinaryOperator>(B->getOperand(1));
+ if (B0 && B0->getParent() == P && tryToVectorizePair(A, B0, R))
+ return true;
+ if (B1 && B1->getParent() == P && tryToVectorizePair(A, B1, R))
+ return true;
+ }
+
+ // Try to skip A.
+ if (A && A->hasOneUse()) {
+ auto *A0 = dyn_cast<BinaryOperator>(A->getOperand(0));
+ auto *A1 = dyn_cast<BinaryOperator>(A->getOperand(1));
+ if (A0 && A0->getParent() == P && tryToVectorizePair(A0, B, R))
+ return true;
+ if (A1 && A1->getParent() == P && tryToVectorizePair(A1, B, R))
+ return true;
+ }
+ return false;
+}
+
+namespace {
+
+/// Model horizontal reductions.
+///
/// A horizontal reduction is a tree of reduction instructions that has values
/// that can be put into a vector as its leaves. For example:
-///
-/// mul mul mul mul
-/// \ / \ /
-/// + +
-/// \ /
-/// +
+///
+/// mul mul mul mul
+/// \ / \ /
+/// + +
+/// \ /
+/// +
/// This tree has "mul" as its leaf values and "+" as its reduction
/// instructions. A reduction can feed into a store or a binary operation
-/// feeding a phi.
-/// ...
-/// \ /
-/// +
-/// |
-/// phi +=
-///
-/// Or:
-/// ...
-/// \ /
-/// +
-/// |
-/// *p =
-///
-class HorizontalReduction {
- using ReductionOpsType = SmallVector<Value *, 16>;
- using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
+/// feeding a phi.
+/// ...
+/// \ /
+/// +
+/// |
+/// phi +=
+///
+/// Or:
+/// ...
+/// \ /
+/// +
+/// |
+/// *p =
+///
+class HorizontalReduction {
+ using ReductionOpsType = SmallVector<Value *, 16>;
+ using ReductionOpsListType = SmallVector<ReductionOpsType, 2>;
ReductionOpsListType ReductionOps;
- SmallVector<Value *, 32> ReducedVals;
- // Use map vector to make stable output.
- MapVector<Instruction *, Value *> ExtraArgs;
+ SmallVector<Value *, 32> ReducedVals;
+ // Use map vector to make stable output.
+ MapVector<Instruction *, Value *> ExtraArgs;
WeakTrackingVH ReductionRoot;
/// The type of reduction operation.
RecurKind RdxKind;
-
+
/// Checks if instruction is associative and can be vectorized.
static bool isVectorizable(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return false;
if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
return true;
-
+
if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
// FP min/max are associative except for NaN and -0.0. We do not
// have to rule out -0.0 here because the intrinsic semantics do not
// specify a fixed result for it.
return I->getFastMathFlags().noNaNs();
- }
-
+ }
+
return I->isAssociative();
}
-
+
/// Checks if the ParentStackElem.first should be marked as a reduction
/// operation with an extra argument or as extra argument itself.
void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
@@ -6446,9 +6446,9 @@ class HorizontalReduction {
// We ran into something like:
// ParentStackElem.first += ... + ExtraArg + ...
ExtraArgs[ParentStackElem.first] = ExtraArg;
- }
+ }
}
-
+
/// Creates reduction operation with the current opcode.
static Value *createOp(IRBuilder<> &Builder, RecurKind Kind, Value *LHS,
Value *RHS, const Twine &Name) {
@@ -6467,28 +6467,28 @@ class HorizontalReduction {
return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
case RecurKind::FMin:
return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
-
+
case RecurKind::SMax: {
Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::SMin: {
Value *Cmp = Builder.CreateICmpSLT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::UMax: {
Value *Cmp = Builder.CreateICmpUGT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
case RecurKind::UMin: {
Value *Cmp = Builder.CreateICmpULT(LHS, RHS, Name);
return Builder.CreateSelect(Cmp, LHS, RHS, Name);
- }
+ }
default:
llvm_unreachable("Unknown reduction operation.");
- }
+ }
}
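
The integer min/max cases above are emitted as a compare followed by a select. Their semantics, written out in plain C++ for reference (a sketch of the resulting values only, not of the IRBuilder calls):

    #include <cassert>
    #include <cstdint>

    // select(cmp, LHS, RHS) shapes produced above, for 32-bit integers.
    static int32_t smax(int32_t A, int32_t B) { return A > B ? A : B; }
    static int32_t smin(int32_t A, int32_t B) { return A < B ? A : B; }
    static uint32_t umax(uint32_t A, uint32_t B) { return A > B ? A : B; }
    static uint32_t umin(uint32_t A, uint32_t B) { return A < B ? A : B; }

    int main() {
      assert(smax(-1, 1) == 1);
      assert(smin(-1, 1) == -1);
      assert(umax(0xFFFFFFFFu, 1u) == 0xFFFFFFFFu);  // unsigned order differs from signed
      assert(umin(0xFFFFFFFFu, 1u) == 1u);
      return 0;
    }
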
-
+
/// Creates reduction operation with the current opcode with the IR flags
/// from \p ReductionOps.
static Value *createOp(IRBuilder<> &Builder, RecurKind RdxKind, Value *LHS,
@@ -6500,7 +6500,7 @@ class HorizontalReduction {
propagateIRFlags(Sel->getCondition(), ReductionOps[0]);
propagateIRFlags(Op, ReductionOps[1]);
return Op;
- }
+ }
propagateIRFlags(Op, ReductionOps[0]);
return Op;
}
@@ -6513,12 +6513,12 @@ class HorizontalReduction {
if (auto *Sel = dyn_cast<SelectInst>(Op)) {
propagateIRFlags(Sel->getCondition(),
cast<SelectInst>(I)->getCondition());
- }
- }
+ }
+ }
propagateIRFlags(Op, I);
return Op;
}
-
+
static RecurKind getRdxKind(Instruction *I) {
assert(I && "Expected instruction for reduction matching");
TargetTransformInfo::ReductionFlags RdxFlags;
@@ -6536,12 +6536,12 @@ class HorizontalReduction {
return RecurKind::FAdd;
if (match(I, m_FMul(m_Value(), m_Value())))
return RecurKind::FMul;
-
+
if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
return RecurKind::FMax;
if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
return RecurKind::FMin;
-
+
if (match(I, m_SMax(m_Value(), m_Value())))
return RecurKind::SMax;
if (match(I, m_SMin(m_Value(), m_Value())))
@@ -6550,7 +6550,7 @@ class HorizontalReduction {
return RecurKind::UMax;
if (match(I, m_UMin(m_Value(), m_Value())))
return RecurKind::UMin;
-
+
if (auto *Select = dyn_cast<SelectInst>(I)) {
// Try harder: look for min/max pattern based on instructions producing
// same values such as: select ((cmp Inst1, Inst2), Inst1, Inst2).
@@ -6566,11 +6566,11 @@ class HorizontalReduction {
CmpInst::Predicate Pred;
Instruction *L1;
Instruction *L2;
-
+
Value *LHS = Select->getTrueValue();
Value *RHS = Select->getFalseValue();
Value *Cond = Select->getCondition();
-
+
// TODO: Support inverse predicates.
if (match(Cond, m_Cmp(Pred, m_Specific(LHS), m_Instruction(L2)))) {
if (!isa<ExtractElementInst>(RHS) ||
@@ -6587,8 +6587,8 @@ class HorizontalReduction {
!L1->isIdenticalTo(cast<Instruction>(LHS)) ||
!L2->isIdenticalTo(cast<Instruction>(RHS)))
return RecurKind::None;
- }
-
+ }
+
TargetTransformInfo::ReductionFlags RdxFlags;
switch (Pred) {
default:
@@ -6605,16 +6605,16 @@ class HorizontalReduction {
case CmpInst::ICMP_ULT:
case CmpInst::ICMP_ULE:
return RecurKind::UMin;
- }
- }
+ }
+ }
return RecurKind::None;
}
-
+
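Because the integer min/max kinds are matched through the compare-plus-select idiom handled above, a hypothetical scalar source that produces such icmp/select pairs could look like this sketch (illustrative only, not part of the patch):

// Each conditional below typically lowers to an icmp sgt + select pair,
// which the matcher above maps to RecurKind::SMax.
int max4(const int *a) {
  int m = a[0];
  m = a[1] > m ? a[1] : m;
  m = a[2] > m ? a[2] : m;
  m = a[3] > m ? a[3] : m;
  return m;
}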
/// Return true if this operation is a cmp+select idiom.
static bool isCmpSel(RecurKind Kind) {
return RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind);
}
-
+
/// Get the index of the first operand.
static unsigned getFirstOperandIndex(RecurKind Kind) {
// We allow calling this before 'Kind' is set, so handle that specially.
@@ -6622,12 +6622,12 @@ class HorizontalReduction {
return 0;
return isCmpSel(Kind) ? 1 : 0;
}
-
+
/// Total number of operands in the reduction operation.
static unsigned getNumberOfOperands(RecurKind Kind) {
return isCmpSel(Kind) ? 3 : 2;
}
-
+
/// Checks if the instruction is in basic block \p BB.
/// For a min/max reduction check that both compare and select are in \p BB.
static bool hasSameParent(RecurKind Kind, Instruction *I, BasicBlock *BB,
@@ -6635,10 +6635,10 @@ class HorizontalReduction {
if (IsRedOp && isCmpSel(Kind)) {
auto *Cmp = cast<Instruction>(cast<SelectInst>(I)->getCondition());
return I->getParent() == BB && Cmp && Cmp->getParent() == BB;
- }
+ }
return I->getParent() == BB;
- }
-
+ }
+
/// Expected number of uses for reduction operations/reduced values.
static bool hasRequiredNumberOfUses(RecurKind Kind, Instruction *I,
bool IsReductionOp) {
@@ -6648,11 +6648,11 @@ class HorizontalReduction {
return I->hasNUses(2) &&
(!IsReductionOp ||
cast<SelectInst>(I)->getCondition()->hasOneUse());
-
+
// Arithmetic reduction operation must be used once only.
return I->hasOneUse();
}
-
+
/// Initializes the list of reduction operations.
void initReductionOps(RecurKind Kind) {
if (isCmpSel(Kind))
@@ -6660,7 +6660,7 @@ class HorizontalReduction {
else
ReductionOps.assign(1, ReductionOpsType());
}
-
+
/// Add all reduction operations for the reduction instruction \p I.
void addReductionOps(RecurKind Kind, Instruction *I) {
assert(Kind != RecurKind::None && "Expected reduction operation.");
@@ -6669,9 +6669,9 @@ class HorizontalReduction {
ReductionOps[1].emplace_back(I);
} else {
ReductionOps[0].emplace_back(I);
- }
- }
-
+ }
+ }
+
static Value *getLHS(RecurKind Kind, Instruction *I) {
if (Kind == RecurKind::None)
return nullptr;
@@ -6683,90 +6683,90 @@ class HorizontalReduction {
return I->getOperand(getFirstOperandIndex(Kind) + 1);
}
-public:
- HorizontalReduction() = default;
-
- /// Try to find a reduction tree.
- bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
- assert((!Phi || is_contained(Phi->operands(), B)) &&
+public:
+ HorizontalReduction() = default;
+
+ /// Try to find a reduction tree.
+ bool matchAssociativeReduction(PHINode *Phi, Instruction *B) {
+ assert((!Phi || is_contained(Phi->operands(), B)) &&
"Phi needs to use the binary operator");
-
+
RdxKind = getRdxKind(B);
-
- // We could have a initial reductions that is not an add.
- // r *= v1 + v2 + v3 + v4
- // In such a case start looking for a tree rooted in the first '+'.
- if (Phi) {
+
+    // We could have an initial reduction that is not an add.
+    // r *= v1 + v2 + v3 + v4
+    // In such a case, start looking for a tree rooted in the first '+'.

+ if (Phi) {
if (getLHS(RdxKind, B) == Phi) {
- Phi = nullptr;
+ Phi = nullptr;
B = dyn_cast<Instruction>(getRHS(RdxKind, B));
if (!B)
return false;
RdxKind = getRdxKind(B);
} else if (getRHS(RdxKind, B) == Phi) {
- Phi = nullptr;
+ Phi = nullptr;
B = dyn_cast<Instruction>(getLHS(RdxKind, B));
if (!B)
return false;
RdxKind = getRdxKind(B);
- }
- }
-
+ }
+ }
+
if (!isVectorizable(RdxKind, B))
- return false;
-
+ return false;
+
// Analyze "regular" integer/FP types for reductions - no target-specific
// types or pointers.
- Type *Ty = B->getType();
+ Type *Ty = B->getType();
if (!isValidElementType(Ty) || Ty->isPointerTy())
- return false;
-
- ReductionRoot = B;
-
+ return false;
+
+ ReductionRoot = B;
+
// The opcode for leaf values that we perform a reduction on.
// For example: load(x) + load(y) + load(z) + fptoui(w)
// The leaf opcode for 'w' does not match, so we don't include it as a
// potential candidate for the reduction.
unsigned LeafOpcode = 0;
- // Post order traverse the reduction tree starting at B. We only handle true
- // trees containing only binary operators.
- SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
+ // Post order traverse the reduction tree starting at B. We only handle true
+ // trees containing only binary operators.
+ SmallVector<std::pair<Instruction *, unsigned>, 32> Stack;
Stack.push_back(std::make_pair(B, getFirstOperandIndex(RdxKind)));
initReductionOps(RdxKind);
- while (!Stack.empty()) {
- Instruction *TreeN = Stack.back().first;
+ while (!Stack.empty()) {
+ Instruction *TreeN = Stack.back().first;
unsigned EdgeToVisit = Stack.back().second++;
const RecurKind TreeRdxKind = getRdxKind(TreeN);
bool IsReducedValue = TreeRdxKind != RdxKind;
-
+
// Postorder visit.
if (IsReducedValue || EdgeToVisit == getNumberOfOperands(TreeRdxKind)) {
- if (IsReducedValue)
- ReducedVals.push_back(TreeN);
- else {
- auto I = ExtraArgs.find(TreeN);
- if (I != ExtraArgs.end() && !I->second) {
- // Check if TreeN is an extra argument of its parent operation.
- if (Stack.size() <= 1) {
- // TreeN can't be an extra argument as it is a root reduction
- // operation.
- return false;
- }
- // Yes, TreeN is an extra argument, do not add it to a list of
- // reduction operations.
- // Stack[Stack.size() - 2] always points to the parent operation.
- markExtraArg(Stack[Stack.size() - 2], TreeN);
- ExtraArgs.erase(TreeN);
- } else
+ if (IsReducedValue)
+ ReducedVals.push_back(TreeN);
+ else {
+ auto I = ExtraArgs.find(TreeN);
+ if (I != ExtraArgs.end() && !I->second) {
+ // Check if TreeN is an extra argument of its parent operation.
+ if (Stack.size() <= 1) {
+ // TreeN can't be an extra argument as it is a root reduction
+ // operation.
+ return false;
+ }
+ // Yes, TreeN is an extra argument, do not add it to a list of
+ // reduction operations.
+ // Stack[Stack.size() - 2] always points to the parent operation.
+ markExtraArg(Stack[Stack.size() - 2], TreeN);
+ ExtraArgs.erase(TreeN);
+ } else
addReductionOps(RdxKind, TreeN);
- }
- // Retract.
- Stack.pop_back();
- continue;
- }
-
- // Visit left or right.
+ }
+ // Retract.
+ Stack.pop_back();
+ continue;
+ }
+
+ // Visit left or right.
Value *EdgeVal = TreeN->getOperand(EdgeToVisit);
auto *I = dyn_cast<Instruction>(EdgeVal);
if (!I) {
@@ -6791,31 +6791,31 @@ public:
if (IsRdxInst) {
// We need to be able to reassociate the reduction operations.
if (!isVectorizable(EdgeRdxKind, I)) {
- // I is an extra argument for TreeN (its parent operation).
- markExtraArg(Stack.back(), I);
- continue;
- }
+ // I is an extra argument for TreeN (its parent operation).
+ markExtraArg(Stack.back(), I);
+ continue;
+ }
} else if (!LeafOpcode) {
LeafOpcode = I->getOpcode();
- }
+ }
Stack.push_back(std::make_pair(I, getFirstOperandIndex(EdgeRdxKind)));
continue;
- }
+ }
// I is an extra argument for TreeN (its parent operation).
markExtraArg(Stack.back(), I);
- }
- return true;
- }
-
+ }
+ return true;
+ }
+
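As a sketch of the extra-argument handling referenced above (illustrative only; the function is hypothetical): a leaf that does not match the other reduced values is recorded in ExtraArgs instead of being vectorized, and is folded back into the result later.

// The loads of a[0..3] become the reduced values; the parameter 'x' does not
// match their opcode, so matchAssociativeReduction records it as an extra
// argument of its parent '+' and tryToReduce re-attaches it as "op.extra".
int sum_plus_extra(const int *a, int x) {
  return a[0] + a[1] + x + a[2] + a[3];
}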
/// Attempt to vectorize the tree found by matchAssociativeReduction.
- bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
+ bool tryToReduce(BoUpSLP &V, TargetTransformInfo *TTI) {
// If there are a sufficient number of reduction values, reduce
// to a nearby power-of-2. We can safely generate oversized
- // vectors and rely on the backend to split them to legal sizes.
- unsigned NumReducedVals = ReducedVals.size();
- if (NumReducedVals < 4)
- return false;
-
+ // vectors and rely on the backend to split them to legal sizes.
+ unsigned NumReducedVals = ReducedVals.size();
+ if (NumReducedVals < 4)
+ return false;
+
// Intersect the fast-math-flags from all reduction operations.
FastMathFlags RdxFMF;
RdxFMF.set();
@@ -6825,33 +6825,33 @@ public:
RdxFMF &= FPMO->getFastMathFlags();
}
}
-
- IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
+
+ IRBuilder<> Builder(cast<Instruction>(ReductionRoot));
Builder.setFastMathFlags(RdxFMF);
-
- BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+
+ BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
// The same extra argument may be used several times, so log each attempt
- // to use it.
+ // to use it.
for (const std::pair<Instruction *, Value *> &Pair : ExtraArgs) {
- assert(Pair.first && "DebugLoc must be set.");
- ExternallyUsedValues[Pair.second].push_back(Pair.first);
- }
-
- // The compare instruction of a min/max is the insertion point for new
- // instructions and may be replaced with a new compare instruction.
- auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
- assert(isa<SelectInst>(RdxRootInst) &&
- "Expected min/max reduction to have select root instruction");
- Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
- assert(isa<Instruction>(ScalarCond) &&
- "Expected min/max reduction to have compare condition");
- return cast<Instruction>(ScalarCond);
- };
-
- // The reduction root is used as the insertion point for new instructions,
- // so set it as externally used to prevent it from being deleted.
- ExternallyUsedValues[ReductionRoot];
- SmallVector<Value *, 16> IgnoreList;
+ assert(Pair.first && "DebugLoc must be set.");
+ ExternallyUsedValues[Pair.second].push_back(Pair.first);
+ }
+
+ // The compare instruction of a min/max is the insertion point for new
+ // instructions and may be replaced with a new compare instruction.
+ auto getCmpForMinMaxReduction = [](Instruction *RdxRootInst) {
+ assert(isa<SelectInst>(RdxRootInst) &&
+ "Expected min/max reduction to have select root instruction");
+ Value *ScalarCond = cast<SelectInst>(RdxRootInst)->getCondition();
+ assert(isa<Instruction>(ScalarCond) &&
+ "Expected min/max reduction to have compare condition");
+ return cast<Instruction>(ScalarCond);
+ };
+
+ // The reduction root is used as the insertion point for new instructions,
+ // so set it as externally used to prevent it from being deleted.
+ ExternallyUsedValues[ReductionRoot];
+ SmallVector<Value *, 16> IgnoreList;
for (ReductionOpsType &RdxOp : ReductionOps)
IgnoreList.append(RdxOp.begin(), RdxOp.end());
@@ -6886,28 +6886,28 @@ public:
Value *VectorizedTree = nullptr;
unsigned i = 0;
- while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
+ while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
ArrayRef<Value *> VL(&ReducedVals[i], ReduxWidth);
- V.buildTree(VL, ExternallyUsedValues, IgnoreList);
- Optional<ArrayRef<unsigned>> Order = V.bestOrder();
+ V.buildTree(VL, ExternallyUsedValues, IgnoreList);
+ Optional<ArrayRef<unsigned>> Order = V.bestOrder();
if (Order) {
assert(Order->size() == VL.size() &&
"Order size must be the same as number of vectorized "
"instructions.");
- // TODO: reorder tree nodes without tree rebuilding.
- SmallVector<Value *, 4> ReorderedOps(VL.size());
- llvm::transform(*Order, ReorderedOps.begin(),
- [VL](const unsigned Idx) { return VL[Idx]; });
- V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
- }
- if (V.isTreeTinyAndNotFullyVectorizable())
- break;
+ // TODO: reorder tree nodes without tree rebuilding.
+ SmallVector<Value *, 4> ReorderedOps(VL.size());
+ llvm::transform(*Order, ReorderedOps.begin(),
+ [VL](const unsigned Idx) { return VL[Idx]; });
+ V.buildTree(ReorderedOps, ExternallyUsedValues, IgnoreList);
+ }
+ if (V.isTreeTinyAndNotFullyVectorizable())
+ break;
if (V.isLoadCombineReductionCandidate(RdxKind))
- break;
-
- V.computeMinimumValueSizes();
-
- // Estimate cost.
+ break;
+
+ V.computeMinimumValueSizes();
+
+ // Estimate cost.
InstructionCost TreeCost = V.getTreeCost();
InstructionCost ReductionCost =
getReductionCost(TTI, ReducedVals[i], ReduxWidth);
@@ -6916,7 +6916,7 @@ public:
LLVM_DEBUG(dbgs() << "Encountered invalid baseline cost.\n");
return false;
}
- if (Cost >= -SLPCostThreshold) {
+ if (Cost >= -SLPCostThreshold) {
V.getORE()->emit([&]() {
return OptimizationRemarkMissed(SV_NAME, "HorSLPNotBeneficial",
cast<Instruction>(VL[0]))
@@ -6926,91 +6926,91 @@ public:
<< ore::NV("Threshold", -SLPCostThreshold);
});
break;
- }
-
- LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
- << Cost << ". (HorRdx)\n");
- V.getORE()->emit([&]() {
+ }
+
+ LLVM_DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:"
+ << Cost << ". (HorRdx)\n");
+ V.getORE()->emit([&]() {
return OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction",
cast<Instruction>(VL[0]))
<< "Vectorized horizontal reduction with cost "
<< ore::NV("Cost", Cost) << " and with tree size "
<< ore::NV("TreeSize", V.getTreeSize());
- });
-
- // Vectorize a tree.
- DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
- Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
-
+ });
+
+ // Vectorize a tree.
+ DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
+ Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
+
// Emit a reduction. If the root is a select (min/max idiom), the insert
- // point is the compare condition of that select.
- Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
+ // point is the compare condition of that select.
+ Instruction *RdxRootInst = cast<Instruction>(ReductionRoot);
if (isCmpSel(RdxKind))
- Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
- else
- Builder.SetInsertPoint(RdxRootInst);
-
- Value *ReducedSubTree =
- emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
+ Builder.SetInsertPoint(getCmpForMinMaxReduction(RdxRootInst));
+ else
+ Builder.SetInsertPoint(RdxRootInst);
+
+ Value *ReducedSubTree =
+ emitReduction(VectorizedRoot, Builder, ReduxWidth, TTI);
if (!VectorizedTree) {
// Initialize the final value in the reduction.
VectorizedTree = ReducedSubTree;
} else {
// Update the final value in the reduction.
- Builder.SetCurrentDebugLocation(Loc);
+ Builder.SetCurrentDebugLocation(Loc);
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
ReducedSubTree, "op.rdx", ReductionOps);
}
- i += ReduxWidth;
- ReduxWidth = PowerOf2Floor(NumReducedVals - i);
- }
-
- if (VectorizedTree) {
- // Finish the reduction.
- for (; i < NumReducedVals; ++i) {
- auto *I = cast<Instruction>(ReducedVals[i]);
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ i += ReduxWidth;
+ ReduxWidth = PowerOf2Floor(NumReducedVals - i);
+ }
+
+ if (VectorizedTree) {
+ // Finish the reduction.
+ for (; i < NumReducedVals; ++i) {
+ auto *I = cast<Instruction>(ReducedVals[i]);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
VectorizedTree =
createOp(Builder, RdxKind, VectorizedTree, I, "", ReductionOps);
- }
- for (auto &Pair : ExternallyUsedValues) {
- // Add each externally used value to the final reduction.
- for (auto *I : Pair.second) {
- Builder.SetCurrentDebugLocation(I->getDebugLoc());
+ }
+ for (auto &Pair : ExternallyUsedValues) {
+ // Add each externally used value to the final reduction.
+ for (auto *I : Pair.second) {
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
VectorizedTree = createOp(Builder, RdxKind, VectorizedTree,
Pair.first, "op.extra", I);
- }
- }
-
- // Update users. For a min/max reduction that ends with a compare and
- // select, we also have to RAUW for the compare instruction feeding the
- // reduction root. That's because the original compare may have extra uses
- // besides the final select of the reduction.
+ }
+ }
+
+ // Update users. For a min/max reduction that ends with a compare and
+ // select, we also have to RAUW for the compare instruction feeding the
+ // reduction root. That's because the original compare may have extra uses
+ // besides the final select of the reduction.
if (isCmpSel(RdxKind)) {
- if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
- Instruction *ScalarCmp =
- getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
- ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
- }
- }
- ReductionRoot->replaceAllUsesWith(VectorizedTree);
-
- // Mark all scalar reduction ops for deletion, they are replaced by the
- // vector reductions.
- V.eraseInstructions(IgnoreList);
- }
- return VectorizedTree != nullptr;
- }
-
+ if (auto *VecSelect = dyn_cast<SelectInst>(VectorizedTree)) {
+ Instruction *ScalarCmp =
+ getCmpForMinMaxReduction(cast<Instruction>(ReductionRoot));
+ ScalarCmp->replaceAllUsesWith(VecSelect->getCondition());
+ }
+ }
+ ReductionRoot->replaceAllUsesWith(VectorizedTree);
+
+ // Mark all scalar reduction ops for deletion, they are replaced by the
+ // vector reductions.
+ V.eraseInstructions(IgnoreList);
+ }
+ return VectorizedTree != nullptr;
+ }
+
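The power-of-two width stepping used by tryToReduce can be illustrated with a small standalone sketch (hypothetical code that only mirrors the loop bounds above; it does not call LLVM, and the initial width is assumed here to be the largest power of two not exceeding the number of reduced values):

#include <cstdio>

// Largest power of two <= N, as llvm::PowerOf2Floor would return (0 for 0).
static unsigned powerOf2Floor(unsigned N) {
  if (N == 0)
    return 0;
  unsigned P = 1;
  while (P * 2 <= N)
    P *= 2;
  return P;
}

int main() {
  // With 7 reduced values: one width-4 vector step is emitted, the width then
  // drops to 2, which the loop rejects, and 3 values remain for the scalar tail.
  unsigned NumReducedVals = 7;
  unsigned i = 0;
  unsigned ReduxWidth = powerOf2Floor(NumReducedVals);
  while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
    std::printf("vector step of width %u at offset %u\n", ReduxWidth, i);
    i += ReduxWidth;
    ReduxWidth = powerOf2Floor(NumReducedVals - i);
  }
  std::printf("%u value(s) left for the scalar tail\n", NumReducedVals - i);
  return 0;
}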
unsigned numReductionValues() const { return ReducedVals.size(); }
-
-private:
- /// Calculate the cost of a reduction.
+
+private:
+ /// Calculate the cost of a reduction.
InstructionCost getReductionCost(TargetTransformInfo *TTI,
Value *FirstReducedVal,
unsigned ReduxWidth) {
- Type *ScalarTy = FirstReducedVal->getType();
+ Type *ScalarTy = FirstReducedVal->getType();
FixedVectorType *VectorTy = FixedVectorType::get(ScalarTy, ReduxWidth);
InstructionCost VectorCost, ScalarCost;
switch (RdxKind) {
@@ -7025,7 +7025,7 @@ private:
VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
/*IsPairwiseForm=*/false);
ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
- break;
+ break;
}
case RecurKind::FMax:
case RecurKind::FMin: {
@@ -7037,8 +7037,8 @@ private:
TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
CmpInst::makeCmpResultType(ScalarTy));
- break;
- }
+ break;
+ }
case RecurKind::SMax:
case RecurKind::SMin:
case RecurKind::UMax:
@@ -7051,36 +7051,36 @@ private:
/*IsPairwiseForm=*/false, IsUnsigned);
ScalarCost =
TTI->getCmpSelInstrCost(Instruction::ICmp, ScalarTy) +
- TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
- break;
+ TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ break;
}
default:
- llvm_unreachable("Expected arithmetic or min/max reduction operation");
- }
-
+ llvm_unreachable("Expected arithmetic or min/max reduction operation");
+ }
+
// Scalar cost is repeated for N-1 elements.
ScalarCost *= (ReduxWidth - 1);
LLVM_DEBUG(dbgs() << "SLP: Adding cost " << VectorCost - ScalarCost
- << " for reduction that starts with " << *FirstReducedVal
+ << " for reduction that starts with " << *FirstReducedVal
<< " (It is a splitting reduction)\n");
return VectorCost - ScalarCost;
- }
-
- /// Emit a horizontal reduction of the vectorized value.
- Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
- unsigned ReduxWidth, const TargetTransformInfo *TTI) {
- assert(VectorizedValue && "Need to have a vectorized tree node");
- assert(isPowerOf2_32(ReduxWidth) &&
- "We only handle power-of-two reductions for now");
-
+ }
+
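A rough numeric illustration of the comparison returned above (the costs are invented for the sketch and do not come from any real TTI cost model):

#include <cstdio>

int main() {
  // Hypothetical numbers: a width-8 vector reduction assumed to cost 5,
  // against the scalar chain it replaces (ReduxWidth - 1 scalar ops of cost 1).
  int ReduxWidth = 8;
  int VectorCost = 5;
  int ScalarCost = 1 * (ReduxWidth - 1);
  // A negative difference means the vector form is modelled as cheaper.
  std::printf("VectorCost - ScalarCost = %d\n", VectorCost - ScalarCost);
  return 0;
}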
+ /// Emit a horizontal reduction of the vectorized value.
+ Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
+ unsigned ReduxWidth, const TargetTransformInfo *TTI) {
+ assert(VectorizedValue && "Need to have a vectorized tree node");
+ assert(isPowerOf2_32(ReduxWidth) &&
+ "We only handle power-of-two reductions for now");
+
return createSimpleTargetReduction(Builder, TTI, VectorizedValue, RdxKind,
ReductionOps.back());
}
};
-
+
} // end anonymous namespace
-
+
static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
if (auto *IE = dyn_cast<InsertElementInst>(InsertInst))
return cast<FixedVectorType>(IE->getType())->getNumElements();
@@ -7105,10 +7105,10 @@ static Optional<unsigned> getAggregateSize(Instruction *InsertInst) {
return AggregateSize;
} else {
return None;
- }
+ }
} while (true);
}
-
+
static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
unsigned OperandOffset) {
unsigned OperandIndex = OperandOffset;
@@ -7120,8 +7120,8 @@ static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
return OperandIndex;
}
return None;
- }
-
+ }
+
auto *IV = cast<InsertValueInst>(InsertInst);
Type *CurrentType = IV->getType();
for (unsigned int Index : IV->indices()) {
@@ -7138,7 +7138,7 @@ static Optional<unsigned> getOperandIndex(Instruction *InsertInst,
}
return OperandIndex;
}
-
+
static bool findBuildAggregate_rec(Instruction *LastInsertInst,
TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
@@ -7169,28 +7169,28 @@ static bool findBuildAggregate_rec(Instruction *LastInsertInst,
return false;
}
-/// Recognize construction of vectors like
+/// Recognize construction of vectors like
/// %ra = insertelement <4 x float> poison, float %s0, i32 0
-/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
-/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
-/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
-/// starting from the last insertelement or insertvalue instruction.
-///
+/// %rb = insertelement <4 x float> %ra, float %s1, i32 1
+/// %rc = insertelement <4 x float> %rb, float %s2, i32 2
+/// %rd = insertelement <4 x float> %rc, float %s3, i32 3
+/// starting from the last insertelement or insertvalue instruction.
+///
/// Also recognize homogeneous aggregates like {<2 x float>, <2 x float>},
-/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
-/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
-///
-/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
-///
-/// \return true if it matches.
+/// {{float, float}, {float, float}}, [2 x {float, float}] and so on.
+/// See llvm/test/Transforms/SLPVectorizer/X86/pr42022.ll for examples.
+///
+/// Assume LastInsertInst is of InsertElementInst or InsertValueInst type.
+///
+/// \return true if it matches.
static bool findBuildAggregate(Instruction *LastInsertInst,
TargetTransformInfo *TTI,
- SmallVectorImpl<Value *> &BuildVectorOpds,
- SmallVectorImpl<Value *> &InsertElts) {
+ SmallVectorImpl<Value *> &BuildVectorOpds,
+ SmallVectorImpl<Value *> &InsertElts) {
- assert((isa<InsertElementInst>(LastInsertInst) ||
- isa<InsertValueInst>(LastInsertInst)) &&
- "Expected insertelement or insertvalue instruction!");
+ assert((isa<InsertElementInst>(LastInsertInst) ||
+ isa<InsertValueInst>(LastInsertInst)) &&
+ "Expected insertelement or insertvalue instruction!");
assert((BuildVectorOpds.empty() && InsertElts.empty()) &&
"Expected empty result vectors!");
@@ -7210,63 +7210,63 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
}
return false;
-}
-
-static bool PhiTypeSorterFunc(Value *V, Value *V2) {
- return V->getType() < V2->getType();
-}
-
-/// Try and get a reduction value from a phi node.
-///
-/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
-/// if they come from either \p ParentBB or a containing loop latch.
-///
-/// \returns A candidate reduction value if possible, or \code nullptr \endcode
-/// if not possible.
-static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
- BasicBlock *ParentBB, LoopInfo *LI) {
- // There are situations where the reduction value is not dominated by the
- // reduction phi. Vectorizing such cases has been reported to cause
- // miscompiles. See PR25787.
- auto DominatedReduxValue = [&](Value *R) {
- return isa<Instruction>(R) &&
- DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
- };
-
- Value *Rdx = nullptr;
-
- // Return the incoming value if it comes from the same BB as the phi node.
- if (P->getIncomingBlock(0) == ParentBB) {
- Rdx = P->getIncomingValue(0);
- } else if (P->getIncomingBlock(1) == ParentBB) {
- Rdx = P->getIncomingValue(1);
- }
-
- if (Rdx && DominatedReduxValue(Rdx))
- return Rdx;
-
- // Otherwise, check whether we have a loop latch to look at.
- Loop *BBL = LI->getLoopFor(ParentBB);
- if (!BBL)
- return nullptr;
- BasicBlock *BBLatch = BBL->getLoopLatch();
- if (!BBLatch)
- return nullptr;
-
- // There is a loop latch, return the incoming value if it comes from
- // that. This reduction pattern occasionally turns up.
- if (P->getIncomingBlock(0) == BBLatch) {
- Rdx = P->getIncomingValue(0);
- } else if (P->getIncomingBlock(1) == BBLatch) {
- Rdx = P->getIncomingValue(1);
- }
-
- if (Rdx && DominatedReduxValue(Rdx))
- return Rdx;
-
- return nullptr;
-}
-
+}
+
+static bool PhiTypeSorterFunc(Value *V, Value *V2) {
+ return V->getType() < V2->getType();
+}
+
+/// Try and get a reduction value from a phi node.
+///
+/// Given a phi node \p P in a block \p ParentBB, consider possible reductions
+/// if they come from either \p ParentBB or a containing loop latch.
+///
+/// \returns A candidate reduction value if possible, or \code nullptr \endcode
+/// if not possible.
+static Value *getReductionValue(const DominatorTree *DT, PHINode *P,
+ BasicBlock *ParentBB, LoopInfo *LI) {
+ // There are situations where the reduction value is not dominated by the
+ // reduction phi. Vectorizing such cases has been reported to cause
+ // miscompiles. See PR25787.
+ auto DominatedReduxValue = [&](Value *R) {
+ return isa<Instruction>(R) &&
+ DT->dominates(P->getParent(), cast<Instruction>(R)->getParent());
+ };
+
+ Value *Rdx = nullptr;
+
+ // Return the incoming value if it comes from the same BB as the phi node.
+ if (P->getIncomingBlock(0) == ParentBB) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == ParentBB) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ // Otherwise, check whether we have a loop latch to look at.
+ Loop *BBL = LI->getLoopFor(ParentBB);
+ if (!BBL)
+ return nullptr;
+ BasicBlock *BBLatch = BBL->getLoopLatch();
+ if (!BBLatch)
+ return nullptr;
+
+ // There is a loop latch, return the incoming value if it comes from
+ // that. This reduction pattern occasionally turns up.
+ if (P->getIncomingBlock(0) == BBLatch) {
+ Rdx = P->getIncomingValue(0);
+ } else if (P->getIncomingBlock(1) == BBLatch) {
+ Rdx = P->getIncomingValue(1);
+ }
+
+ if (Rdx && DominatedReduxValue(Rdx))
+ return Rdx;
+
+ return nullptr;
+}
+
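A sketch of the loop shape this helper is aimed at (illustrative only; the function is hypothetical). The accumulator becomes a phi in the loop header, and its incoming value from the loop latch is the add that getReductionValue returns as the candidate.

// 'sum' turns into a phi node; the incoming value coming from the loop latch
// is the '+' chain, which is what getReductionValue hands back for further
// horizontal-reduction matching.
float sumBlocks(const float *a, int n) {
  float sum = 0.0f;
  for (int i = 0; i + 4 <= n; i += 4)
    sum += a[i] + a[i + 1] + a[i + 2] + a[i + 3];
  return sum;
}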
static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
return true;
@@ -7277,263 +7277,263 @@ static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
return false;
}
-/// Attempt to reduce a horizontal reduction.
-/// If it is legal to match a horizontal reduction feeding the phi node \a P
-/// with reduction operators \a Root (or one of its operands) in a basic block
-/// \a BB, then check if it can be done. If horizontal reduction is not found
-/// and root instruction is a binary operation, vectorization of the operands is
-/// attempted.
-/// \returns true if a horizontal reduction was matched and reduced or operands
-/// of one of the binary instruction were vectorized.
-/// \returns false if a horizontal reduction was not matched (or not possible)
-/// or no vectorization of any binary operation feeding \a Root instruction was
-/// performed.
-static bool tryToVectorizeHorReductionOrInstOperands(
- PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI,
- const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
- if (!ShouldVectorizeHor)
- return false;
-
- if (!Root)
- return false;
-
- if (Root->getParent() != BB || isa<PHINode>(Root))
- return false;
- // Start analysis starting from Root instruction. If horizontal reduction is
- // found, try to vectorize it. If it is not a horizontal reduction or
- // vectorization is not possible or not effective, and currently analyzed
- // instruction is a binary operation, try to vectorize the operands, using
- // pre-order DFS traversal order. If the operands were not vectorized, repeat
- // the same procedure considering each operand as a possible root of the
- // horizontal reduction.
- // Interrupt the process if the Root instruction itself was vectorized or all
- // sub-trees not higher that RecursionMaxDepth were analyzed/vectorized.
- SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
- SmallPtrSet<Value *, 8> VisitedInstrs;
- bool Res = false;
- while (!Stack.empty()) {
- Instruction *Inst;
- unsigned Level;
- std::tie(Inst, Level) = Stack.pop_back_val();
+/// Attempt to reduce a horizontal reduction.
+/// If it is legal to match a horizontal reduction feeding the phi node \a P
+/// with reduction operators \a Root (or one of its operands) in a basic block
+/// \a BB, then check if it can be done. If horizontal reduction is not found
+/// and root instruction is a binary operation, vectorization of the operands is
+/// attempted.
+/// \returns true if a horizontal reduction was matched and reduced, or operands
+/// of one of the binary instructions were vectorized.
+/// \returns false if a horizontal reduction was not matched (or not possible)
+/// or no vectorization of any binary operation feeding \a Root instruction was
+/// performed.
+static bool tryToVectorizeHorReductionOrInstOperands(
+ PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI,
+ const function_ref<bool(Instruction *, BoUpSLP &)> Vectorize) {
+ if (!ShouldVectorizeHor)
+ return false;
+
+ if (!Root)
+ return false;
+
+ if (Root->getParent() != BB || isa<PHINode>(Root))
+ return false;
+  // Start the analysis from the Root instruction. If horizontal reduction is
+ // found, try to vectorize it. If it is not a horizontal reduction or
+ // vectorization is not possible or not effective, and currently analyzed
+ // instruction is a binary operation, try to vectorize the operands, using
+ // pre-order DFS traversal order. If the operands were not vectorized, repeat
+ // the same procedure considering each operand as a possible root of the
+ // horizontal reduction.
+ // Interrupt the process if the Root instruction itself was vectorized or all
+  // sub-trees not higher than RecursionMaxDepth were analyzed/vectorized.
+ SmallVector<std::pair<Instruction *, unsigned>, 8> Stack(1, {Root, 0});
+ SmallPtrSet<Value *, 8> VisitedInstrs;
+ bool Res = false;
+ while (!Stack.empty()) {
+ Instruction *Inst;
+ unsigned Level;
+ std::tie(Inst, Level) = Stack.pop_back_val();
Value *B0, *B1;
bool IsBinop = matchRdxBop(Inst, B0, B1);
bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
if (IsBinop || IsSelect) {
- HorizontalReduction HorRdx;
- if (HorRdx.matchAssociativeReduction(P, Inst)) {
- if (HorRdx.tryToReduce(R, TTI)) {
- Res = true;
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
- }
+ HorizontalReduction HorRdx;
+ if (HorRdx.matchAssociativeReduction(P, Inst)) {
+ if (HorRdx.tryToReduce(R, TTI)) {
+ Res = true;
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
if (P && IsBinop) {
Inst = dyn_cast<Instruction>(B0);
- if (Inst == P)
+ if (Inst == P)
Inst = dyn_cast<Instruction>(B1);
- if (!Inst) {
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- continue;
- }
- }
- }
- // Set P to nullptr to avoid re-analysis of phi node in
- // matchAssociativeReduction function unless this is the root node.
- P = nullptr;
- if (Vectorize(Inst, R)) {
- Res = true;
- continue;
- }
-
- // Try to vectorize operands.
- // Continue analysis for the instruction from the same basic block only to
- // save compile time.
- if (++Level < RecursionMaxDepth)
- for (auto *Op : Inst->operand_values())
- if (VisitedInstrs.insert(Op).second)
- if (auto *I = dyn_cast<Instruction>(Op))
- if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
- Stack.emplace_back(I, Level);
- }
- return Res;
-}
-
-bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
- BasicBlock *BB, BoUpSLP &R,
- TargetTransformInfo *TTI) {
+ if (!Inst) {
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ continue;
+ }
+ }
+ }
+ // Set P to nullptr to avoid re-analysis of phi node in
+ // matchAssociativeReduction function unless this is the root node.
+ P = nullptr;
+ if (Vectorize(Inst, R)) {
+ Res = true;
+ continue;
+ }
+
+ // Try to vectorize operands.
+ // Continue analysis for the instruction from the same basic block only to
+ // save compile time.
+ if (++Level < RecursionMaxDepth)
+ for (auto *Op : Inst->operand_values())
+ if (VisitedInstrs.insert(Op).second)
+ if (auto *I = dyn_cast<Instruction>(Op))
+ if (!isa<PHINode>(I) && !R.isDeleted(I) && I->getParent() == BB)
+ Stack.emplace_back(I, Level);
+ }
+ return Res;
+}
+
+bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
+ BasicBlock *BB, BoUpSLP &R,
+ TargetTransformInfo *TTI) {
auto *I = dyn_cast_or_null<Instruction>(V);
- if (!I)
- return false;
-
- if (!isa<BinaryOperator>(I))
- P = nullptr;
- // Try to match and vectorize a horizontal reduction.
- auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
- return tryToVectorize(I, R);
- };
- return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
- ExtraVectorization);
-}
-
-bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
- BasicBlock *BB, BoUpSLP &R) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- if (!R.canMapToVector(IVI->getType(), DL))
- return false;
-
- SmallVector<Value *, 16> BuildVectorOpds;
- SmallVector<Value *, 16> BuildVectorInsts;
+ if (!I)
+ return false;
+
+ if (!isa<BinaryOperator>(I))
+ P = nullptr;
+ // Try to match and vectorize a horizontal reduction.
+ auto &&ExtraVectorization = [this](Instruction *I, BoUpSLP &R) -> bool {
+ return tryToVectorize(I, R);
+ };
+ return tryToVectorizeHorReductionOrInstOperands(P, I, BB, R, TTI,
+ ExtraVectorization);
+}
+
+bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
+ BasicBlock *BB, BoUpSLP &R) {
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ if (!R.canMapToVector(IVI->getType(), DL))
+ return false;
+
+ SmallVector<Value *, 16> BuildVectorOpds;
+ SmallVector<Value *, 16> BuildVectorInsts;
if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
- return false;
-
- LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
- // Aggregate value is unlikely to be processed in vector register, we need to
- // extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
- BasicBlock *BB, BoUpSLP &R) {
- SmallVector<Value *, 16> BuildVectorInsts;
- SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
- (llvm::all_of(BuildVectorOpds,
- [](Value *V) { return isa<ExtractElementInst>(V); }) &&
- isShuffle(BuildVectorOpds)))
- return false;
-
- // Vectorize starting with the build vector operands ignoring the BuildVector
- // instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
- BuildVectorInsts);
-}
-
-bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
- BoUpSLP &R) {
- if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
- return true;
-
- bool OpsChanged = false;
- for (int Idx = 0; Idx < 2; ++Idx) {
- OpsChanged |=
- vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
- }
- return OpsChanged;
-}
-
-bool SLPVectorizerPass::vectorizeSimpleInstructions(
- SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
- bool OpsChanged = false;
- for (auto *I : reverse(Instructions)) {
- if (R.isDeleted(I))
- continue;
- if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
- OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
- else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
- OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
- else if (auto *CI = dyn_cast<CmpInst>(I))
- OpsChanged |= vectorizeCmpInst(CI, BB, R);
- }
- Instructions.clear();
- return OpsChanged;
-}
-
-bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
- bool Changed = false;
- SmallVector<Value *, 4> Incoming;
- SmallPtrSet<Value *, 16> VisitedInstrs;
-
- bool HaveVectorizedPhiNodes = true;
- while (HaveVectorizedPhiNodes) {
- HaveVectorizedPhiNodes = false;
-
- // Collect the incoming values from the PHIs.
- Incoming.clear();
- for (Instruction &I : *BB) {
- PHINode *P = dyn_cast<PHINode>(&I);
- if (!P)
- break;
-
- if (!VisitedInstrs.count(P) && !R.isDeleted(P))
- Incoming.push_back(P);
- }
-
- // Sort by type.
- llvm::stable_sort(Incoming, PhiTypeSorterFunc);
-
- // Try to vectorize elements base on their type.
- for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
- E = Incoming.end();
- IncIt != E;) {
-
- // Look for the next elements with the same type.
- SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
- while (SameTypeIt != E &&
+ return false;
+
+ LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
+  // An aggregate value is unlikely to be processed in a vector register; we need
+  // to extract scalars into scalar registers, so NeedExtraction is set true.
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
+}
+
+bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
+ BasicBlock *BB, BoUpSLP &R) {
+ SmallVector<Value *, 16> BuildVectorInsts;
+ SmallVector<Value *, 16> BuildVectorOpds;
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ (llvm::all_of(BuildVectorOpds,
+ [](Value *V) { return isa<ExtractElementInst>(V); }) &&
+ isShuffle(BuildVectorOpds)))
+ return false;
+
+ // Vectorize starting with the build vector operands ignoring the BuildVector
+ // instructions for the purpose of scheduling and user extraction.
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
+}
+
+bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
+ BoUpSLP &R) {
+ if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R))
+ return true;
+
+ bool OpsChanged = false;
+ for (int Idx = 0; Idx < 2; ++Idx) {
+ OpsChanged |=
+ vectorizeRootInstruction(nullptr, CI->getOperand(Idx), BB, R, TTI);
+ }
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeSimpleInstructions(
+ SmallVectorImpl<Instruction *> &Instructions, BasicBlock *BB, BoUpSLP &R) {
+ bool OpsChanged = false;
+ for (auto *I : reverse(Instructions)) {
+ if (R.isDeleted(I))
+ continue;
+ if (auto *LastInsertValue = dyn_cast<InsertValueInst>(I))
+ OpsChanged |= vectorizeInsertValueInst(LastInsertValue, BB, R);
+ else if (auto *LastInsertElem = dyn_cast<InsertElementInst>(I))
+ OpsChanged |= vectorizeInsertElementInst(LastInsertElem, BB, R);
+ else if (auto *CI = dyn_cast<CmpInst>(I))
+ OpsChanged |= vectorizeCmpInst(CI, BB, R);
+ }
+ Instructions.clear();
+ return OpsChanged;
+}
+
+bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
+ bool Changed = false;
+ SmallVector<Value *, 4> Incoming;
+ SmallPtrSet<Value *, 16> VisitedInstrs;
+
+ bool HaveVectorizedPhiNodes = true;
+ while (HaveVectorizedPhiNodes) {
+ HaveVectorizedPhiNodes = false;
+
+ // Collect the incoming values from the PHIs.
+ Incoming.clear();
+ for (Instruction &I : *BB) {
+ PHINode *P = dyn_cast<PHINode>(&I);
+ if (!P)
+ break;
+
+ if (!VisitedInstrs.count(P) && !R.isDeleted(P))
+ Incoming.push_back(P);
+ }
+
+ // Sort by type.
+ llvm::stable_sort(Incoming, PhiTypeSorterFunc);
+
+    // Try to vectorize elements based on their type.
+ for (SmallVector<Value *, 4>::iterator IncIt = Incoming.begin(),
+ E = Incoming.end();
+ IncIt != E;) {
+
+ // Look for the next elements with the same type.
+ SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ while (SameTypeIt != E &&
(*SameTypeIt)->getType() == (*IncIt)->getType()) {
- VisitedInstrs.insert(*SameTypeIt);
- ++SameTypeIt;
- }
-
- // Try to vectorize them.
- unsigned NumElts = (SameTypeIt - IncIt);
- LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
- << NumElts << ")\n");
- // The order in which the phi nodes appear in the program does not matter.
- // So allow tryToVectorizeList to reorder them if it is beneficial. This
- // is done when there are exactly two elements since tryToVectorizeList
- // asserts that there are only two values when AllowReorder is true.
- bool AllowReorder = NumElts == 2;
- if (NumElts > 1 &&
- tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
- // Success start over because instructions might have been changed.
- HaveVectorizedPhiNodes = true;
- Changed = true;
- break;
- }
-
- // Start over at the next instruction of a different type (or the end).
- IncIt = SameTypeIt;
- }
- }
-
- VisitedInstrs.clear();
-
- SmallVector<Instruction *, 8> PostProcessInstructions;
- SmallDenseSet<Instruction *, 4> KeyNodes;
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ VisitedInstrs.insert(*SameTypeIt);
+ ++SameTypeIt;
+ }
+
+ // Try to vectorize them.
+ unsigned NumElts = (SameTypeIt - IncIt);
+ LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize starting at PHIs ("
+ << NumElts << ")\n");
+ // The order in which the phi nodes appear in the program does not matter.
+ // So allow tryToVectorizeList to reorder them if it is beneficial. This
+ // is done when there are exactly two elements since tryToVectorizeList
+ // asserts that there are only two values when AllowReorder is true.
+ bool AllowReorder = NumElts == 2;
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
+        // Success, start over because instructions might have been changed.
+ HaveVectorizedPhiNodes = true;
+ Changed = true;
+ break;
+ }
+
+ // Start over at the next instruction of a different type (or the end).
+ IncIt = SameTypeIt;
+ }
+ }
+
+ VisitedInstrs.clear();
+
+ SmallVector<Instruction *, 8> PostProcessInstructions;
+ SmallDenseSet<Instruction *, 4> KeyNodes;
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
// Skip instructions with scalable type. The num of elements is unknown at
// compile-time for scalable type.
if (isa<ScalableVectorType>(it->getType()))
continue;
- // Skip instructions marked for the deletion.
- if (R.isDeleted(&*it))
- continue;
- // We may go through BB multiple times so skip the one we have checked.
- if (!VisitedInstrs.insert(&*it).second) {
+ // Skip instructions marked for the deletion.
+ if (R.isDeleted(&*it))
+ continue;
+ // We may go through BB multiple times so skip the one we have checked.
+ if (!VisitedInstrs.insert(&*it).second) {
if (it->use_empty() && KeyNodes.contains(&*it) &&
- vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- }
- continue;
- }
-
- if (isa<DbgInfoIntrinsic>(it))
- continue;
-
- // Try to vectorize reductions that use PHINodes.
- if (PHINode *P = dyn_cast<PHINode>(it)) {
- // Check that the PHI is a reduction PHI.
+ vectorizeSimpleInstructions(PostProcessInstructions, BB, R)) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ }
+ continue;
+ }
+
+ if (isa<DbgInfoIntrinsic>(it))
+ continue;
+
+ // Try to vectorize reductions that use PHINodes.
+ if (PHINode *P = dyn_cast<PHINode>(it)) {
+ // Check that the PHI is a reduction PHI.
if (P->getNumIncomingValues() == 2) {
// Try to match and vectorize a horizontal reduction.
if (vectorizeRootInstruction(P, getReductionValue(DT, P, BB, LI), BB, R,
@@ -7554,169 +7554,169 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
if (BB == P->getIncomingBlock(I) ||
!DT->isReachableFromEntry(P->getIncomingBlock(I)))
continue;
-
+
Changed |= vectorizeRootInstruction(nullptr, P->getIncomingValue(I),
P->getIncomingBlock(I), R, TTI);
- }
- continue;
- }
-
- // Ran into an instruction without users, like terminator, or function call
- // with ignored return value, store. Ignore unused instructions (basing on
- // instruction type, except for CallInst and InvokeInst).
- if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
- isa<InvokeInst>(it))) {
- KeyNodes.insert(&*it);
- bool OpsChanged = false;
- if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
- for (auto *V : it->operand_values()) {
- // Try to match and vectorize a horizontal reduction.
- OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
- }
- }
- // Start vectorization of post-process list of instructions from the
- // top-tree instructions to try to vectorize as many instructions as
- // possible.
- OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
- if (OpsChanged) {
- // We would like to start over since some instructions are deleted
- // and the iterator may become invalid value.
- Changed = true;
- it = BB->begin();
- e = BB->end();
- continue;
- }
- }
-
- if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
- isa<InsertValueInst>(it))
- PostProcessInstructions.push_back(&*it);
- }
-
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
- auto Changed = false;
- for (auto &Entry : GEPs) {
- // If the getelementptr list has fewer than two elements, there's nothing
- // to do.
- if (Entry.second.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
- << Entry.second.size() << ".\n");
-
- // Process the GEP list in chunks suitable for the target's supported
- // vector size. If a vector register can't hold 1 element, we are done. We
- // are trying to vectorize the index computations, so the maximum number of
- // elements is based on the size of the index expression, rather than the
- // size of the GEP itself (the target's pointer size).
- unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
- if (MaxVecRegSize < EltSize)
- continue;
-
- unsigned MaxElts = MaxVecRegSize / EltSize;
- for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
- auto Len = std::min<unsigned>(BE - BI, MaxElts);
+ }
+ continue;
+ }
+
+    // Ran into an instruction without users, such as a terminator, a store, or a
+    // function call with an ignored return value. Ignore unused instructions
+    // (based on instruction type, except for CallInst and InvokeInst).
+ if (it->use_empty() && (it->getType()->isVoidTy() || isa<CallInst>(it) ||
+ isa<InvokeInst>(it))) {
+ KeyNodes.insert(&*it);
+ bool OpsChanged = false;
+ if (ShouldStartVectorizeHorAtStore || !isa<StoreInst>(it)) {
+ for (auto *V : it->operand_values()) {
+ // Try to match and vectorize a horizontal reduction.
+ OpsChanged |= vectorizeRootInstruction(nullptr, V, BB, R, TTI);
+ }
+ }
+ // Start vectorization of post-process list of instructions from the
+ // top-tree instructions to try to vectorize as many instructions as
+ // possible.
+ OpsChanged |= vectorizeSimpleInstructions(PostProcessInstructions, BB, R);
+ if (OpsChanged) {
+ // We would like to start over since some instructions are deleted
+ // and the iterator may become invalid value.
+ Changed = true;
+ it = BB->begin();
+ e = BB->end();
+ continue;
+ }
+ }
+
+ if (isa<InsertElementInst>(it) || isa<CmpInst>(it) ||
+ isa<InsertValueInst>(it))
+ PostProcessInstructions.push_back(&*it);
+ }
+
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
+ auto Changed = false;
+ for (auto &Entry : GEPs) {
+ // If the getelementptr list has fewer than two elements, there's nothing
+ // to do.
+ if (Entry.second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a getelementptr list of length "
+ << Entry.second.size() << ".\n");
+
+ // Process the GEP list in chunks suitable for the target's supported
+ // vector size. If a vector register can't hold 1 element, we are done. We
+ // are trying to vectorize the index computations, so the maximum number of
+ // elements is based on the size of the index expression, rather than the
+ // size of the GEP itself (the target's pointer size).
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
+ unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
+ if (MaxVecRegSize < EltSize)
+ continue;
+
+ unsigned MaxElts = MaxVecRegSize / EltSize;
+ for (unsigned BI = 0, BE = Entry.second.size(); BI < BE; BI += MaxElts) {
+ auto Len = std::min<unsigned>(BE - BI, MaxElts);
ArrayRef<GetElementPtrInst *> GEPList(&Entry.second[BI], Len);
-
- // Initialize a set a candidate getelementptrs. Note that we use a
- // SetVector here to preserve program order. If the index computations
- // are vectorizable and begin with loads, we want to minimize the chance
- // of having to reorder them later.
- SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
-
- // Some of the candidates may have already been vectorized after we
- // initially collected them. If so, they are marked as deleted, so remove
- // them from the set of candidates.
- Candidates.remove_if(
- [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
-
- // Remove from the set of candidates all pairs of getelementptrs with
- // constant differences. Such getelementptrs are likely not good
- // candidates for vectorization in a bottom-up phase since one can be
- // computed from the other. We also ensure all candidate getelementptr
- // indices are unique.
- for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
- auto *GEPI = GEPList[I];
- if (!Candidates.count(GEPI))
- continue;
- auto *SCEVI = SE->getSCEV(GEPList[I]);
- for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
- auto *GEPJ = GEPList[J];
- auto *SCEVJ = SE->getSCEV(GEPList[J]);
- if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
- Candidates.remove(GEPI);
- Candidates.remove(GEPJ);
- } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
- Candidates.remove(GEPJ);
- }
- }
- }
-
- // We break out of the above computation as soon as we know there are
- // fewer than two candidates remaining.
- if (Candidates.size() < 2)
- continue;
-
- // Add the single, non-constant index of each candidate to the bundle. We
- // ensured the indices met these constraints when we originally collected
- // the getelementptrs.
- SmallVector<Value *, 16> Bundle(Candidates.size());
- auto BundleIndex = 0u;
- for (auto *V : Candidates) {
- auto *GEP = cast<GetElementPtrInst>(V);
- auto *GEPIdx = GEP->idx_begin()->get();
- assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
- Bundle[BundleIndex++] = GEPIdx;
- }
-
- // Try and vectorize the indices. We are currently only interested in
- // gather-like cases of the form:
- //
- // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
- //
- // where the loads of "a", the loads of "b", and the subtractions can be
- // performed in parallel. It's likely that detecting this pattern in a
- // bottom-up phase will be simpler and less costly than building a
- // full-blown top-down phase beginning at the consecutive loads.
- Changed |= tryToVectorizeList(Bundle, R);
- }
- }
- return Changed;
-}
-
-bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
- bool Changed = false;
- // Attempt to sort and vectorize each of the store-groups.
- for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
- ++it) {
- if (it->second.size() < 2)
- continue;
-
- LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
- << it->second.size() << ".\n");
-
- Changed |= vectorizeStores(it->second, R);
- }
- return Changed;
-}
-
-char SLPVectorizer::ID = 0;
-
-static const char lv_name[] = "SLP Vectorizer";
-
-INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
-INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
-INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
-
-Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
+
+      // Initialize a set of candidate getelementptrs. Note that we use a
+ // SetVector here to preserve program order. If the index computations
+ // are vectorizable and begin with loads, we want to minimize the chance
+ // of having to reorder them later.
+ SetVector<Value *> Candidates(GEPList.begin(), GEPList.end());
+
+ // Some of the candidates may have already been vectorized after we
+ // initially collected them. If so, they are marked as deleted, so remove
+ // them from the set of candidates.
+ Candidates.remove_if(
+ [&R](Value *I) { return R.isDeleted(cast<Instruction>(I)); });
+
+ // Remove from the set of candidates all pairs of getelementptrs with
+ // constant differences. Such getelementptrs are likely not good
+ // candidates for vectorization in a bottom-up phase since one can be
+ // computed from the other. We also ensure all candidate getelementptr
+ // indices are unique.
+ for (int I = 0, E = GEPList.size(); I < E && Candidates.size() > 1; ++I) {
+ auto *GEPI = GEPList[I];
+ if (!Candidates.count(GEPI))
+ continue;
+ auto *SCEVI = SE->getSCEV(GEPList[I]);
+ for (int J = I + 1; J < E && Candidates.size() > 1; ++J) {
+ auto *GEPJ = GEPList[J];
+ auto *SCEVJ = SE->getSCEV(GEPList[J]);
+ if (isa<SCEVConstant>(SE->getMinusSCEV(SCEVI, SCEVJ))) {
+ Candidates.remove(GEPI);
+ Candidates.remove(GEPJ);
+ } else if (GEPI->idx_begin()->get() == GEPJ->idx_begin()->get()) {
+ Candidates.remove(GEPJ);
+ }
+ }
+ }
+
+ // We break out of the above computation as soon as we know there are
+ // fewer than two candidates remaining.
+ if (Candidates.size() < 2)
+ continue;
+
+ // Add the single, non-constant index of each candidate to the bundle. We
+ // ensured the indices met these constraints when we originally collected
+ // the getelementptrs.
+ SmallVector<Value *, 16> Bundle(Candidates.size());
+ auto BundleIndex = 0u;
+ for (auto *V : Candidates) {
+ auto *GEP = cast<GetElementPtrInst>(V);
+ auto *GEPIdx = GEP->idx_begin()->get();
+ assert(GEP->getNumIndices() == 1 || !isa<Constant>(GEPIdx));
+ Bundle[BundleIndex++] = GEPIdx;
+ }
+
+ // Try and vectorize the indices. We are currently only interested in
+ // gather-like cases of the form:
+ //
+ // ... = g[a[0] - b[0]] + g[a[1] - b[1]] + ...
+ //
+ // where the loads of "a", the loads of "b", and the subtractions can be
+ // performed in parallel. It's likely that detecting this pattern in a
+ // bottom-up phase will be simpler and less costly than building a
+ // full-blown top-down phase beginning at the consecutive loads.
+ Changed |= tryToVectorizeList(Bundle, R);
+ }
+ }
+ return Changed;
+}
+
+bool SLPVectorizerPass::vectorizeStoreChains(BoUpSLP &R) {
+ bool Changed = false;
+ // Attempt to sort and vectorize each of the store-groups.
+ for (StoreListMap::iterator it = Stores.begin(), e = Stores.end(); it != e;
+ ++it) {
+ if (it->second.size() < 2)
+ continue;
+
+ LLVM_DEBUG(dbgs() << "SLP: Analyzing a store chain of length "
+ << it->second.size() << ".\n");
+
+ Changed |= vectorizeStores(it->second, R);
+ }
+ return Changed;
+}
+
+char SLPVectorizer::ID = 0;
+
+static const char lv_name[] = "SLP Vectorizer";
+
+INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
+INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
+
+Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
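
The comment above describes the gather-like pattern this bottom-up GEP-index vectorization targets. Below is a minimal standalone C++ sketch of that source pattern; it is not part of the LLVM tree, and the arrays g, a, and b are hypothetical illustration data. The loads of a, the loads of b, and the subtractions form the bundle SLP can vectorize, while the g[...] accesses themselves remain a gather.

// Minimal standalone sketch (not LLVM code) of the gather-like pattern the
// comment above describes: the loads of a[], the loads of b[], and the
// subtractions feeding the g[...] indices are independent, so a bottom-up
// SLP pass can bundle them even though the g[...] loads themselves gather.
#include <cstdio>

// All array names here are hypothetical illustration data, not LLVM APIs.
static int g[64], a[4], b[4];

int gatherSum() {
  // The index math (a[i] - b[i]) is the vectorizable bundle; g[...] gathers.
  return g[a[0] - b[0]] + g[a[1] - b[1]] + g[a[2] - b[2]] + g[a[3] - b[3]];
}

int main() {
  for (int i = 0; i < 64; ++i) g[i] = i;
  for (int i = 0; i < 4; ++i) { a[i] = i + 8; b[i] = i; }
  std::printf("%d\n", gatherSum()); // prints 32 (each index is 8, g[8] == 8)
  return 0;
}
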
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
index eebb58be8b..8737016760 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -1,161 +1,161 @@
-//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/IR/IRBuilder.h"
-
-namespace llvm {
-
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-class TargetLibraryInfo;
-
-/// Helper class to create VPRecipes from IR instructions.
-class VPRecipeBuilder {
- /// The loop that we evaluate.
- Loop *OrigLoop;
-
- /// Target Library Info.
- const TargetLibraryInfo *TLI;
-
- /// The legality analysis.
- LoopVectorizationLegality *Legal;
-
-  /// The profitability analysis.
- LoopVectorizationCostModel &CM;
-
- PredicatedScalarEvolution &PSE;
-
- VPBuilder &Builder;
-
- /// When we if-convert we need to create edge masks. We have to cache values
- /// so that we don't end up with exponential recursion/IR. Note that
- /// if-conversion currently takes place during VPlan-construction, so these
- /// caches are only used at that stage.
- using EdgeMaskCacheTy =
- DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
- using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
- EdgeMaskCacheTy EdgeMaskCache;
- BlockMaskCacheTy BlockMaskCache;
-
- // VPlan-VPlan transformations support: Hold a mapping from ingredients to
- // their recipe. To save on memory, only do so for selected ingredients,
- // marked by having a nullptr entry in this map.
- DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
-
- /// Check if \p I can be widened at the start of \p Range and possibly
- /// decrease the range such that the returned value holds for the entire \p
- /// Range. The function should not be called for memory instructions or calls.
- bool shouldWiden(Instruction *I, VFRange &Range) const;
-
-  /// Check if the load or store instruction \p I should be widened for \p
- /// Range.Start and potentially masked. Such instructions are handled by a
- /// recipe that takes an additional VPInstruction for the mask.
+//===- VPRecipeBuilder.h - Helper class to build recipes --------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class TargetLibraryInfo;
+
+/// Helper class to create VPRecipes from IR instructions.
+class VPRecipeBuilder {
+ /// The loop that we evaluate.
+ Loop *OrigLoop;
+
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
+ /// The legality analysis.
+ LoopVectorizationLegality *Legal;
+
+  /// The profitability analysis.
+ LoopVectorizationCostModel &CM;
+
+ PredicatedScalarEvolution &PSE;
+
+ VPBuilder &Builder;
+
+ /// When we if-convert we need to create edge masks. We have to cache values
+ /// so that we don't end up with exponential recursion/IR. Note that
+ /// if-conversion currently takes place during VPlan-construction, so these
+ /// caches are only used at that stage.
+ using EdgeMaskCacheTy =
+ DenseMap<std::pair<BasicBlock *, BasicBlock *>, VPValue *>;
+ using BlockMaskCacheTy = DenseMap<BasicBlock *, VPValue *>;
+ EdgeMaskCacheTy EdgeMaskCache;
+ BlockMaskCacheTy BlockMaskCache;
+
+ // VPlan-VPlan transformations support: Hold a mapping from ingredients to
+ // their recipe. To save on memory, only do so for selected ingredients,
+ // marked by having a nullptr entry in this map.
+ DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
+
+ /// Check if \p I can be widened at the start of \p Range and possibly
+ /// decrease the range such that the returned value holds for the entire \p
+ /// Range. The function should not be called for memory instructions or calls.
+ bool shouldWiden(Instruction *I, VFRange &Range) const;
+
+  /// Check if the load or store instruction \p I should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
VPRecipeBase *tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan);
-
-  /// Check if an induction recipe should be constructed for \p I. If so build and
- /// return it. If not, return null.
+
+  /// Check if an induction recipe should be constructed for \p I. If so build and
+ /// return it. If not, return null.
VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi,
VPlan &Plan) const;
-
- /// Optimize the special case where the operand of \p I is a constant integer
- /// induction variable.
- VPWidenIntOrFpInductionRecipe *
+
+ /// Optimize the special case where the operand of \p I is a constant integer
+ /// induction variable.
+ VPWidenIntOrFpInductionRecipe *
tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range,
VPlan &Plan) const;
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
-
- /// Handle call instructions. If \p CI can be widened for \p Range.Start,
- /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
- /// decision from \p Range.Start to \p Range.End.
- VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
- VPlan &Plan) const;
-
- /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
- /// if it can. The function should only be called if the cost-model indicates
- /// that widening should be performed.
- VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
-
-public:
- VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM,
- PredicatedScalarEvolution &PSE, VPBuilder &Builder)
- : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
- Builder(Builder) {}
-
-  /// Check if a recipe can be created for \p I within the given VF \p Range.
- /// If a recipe can be created, return it. Otherwise return nullptr.
- VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
- VPlanPtr &Plan);
-
- /// Set the recipe created for given ingredient. This operation is a no-op for
- /// ingredients that were not marked using a nullptr entry in the map.
- void setRecipe(Instruction *I, VPRecipeBase *R) {
- if (!Ingredient2Recipe.count(I))
- return;
- assert(Ingredient2Recipe[I] == nullptr &&
- "Recipe already set for ingredient");
- Ingredient2Recipe[I] = R;
- }
-
- /// A helper function that computes the predicate of the block BB, assuming
- /// that the header block of the loop is set to True. It returns the *entry*
- /// mask for the block BB.
- VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
-
- /// A helper function that computes the predicate of the edge between SRC
- /// and DST.
- VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
-
- /// Mark given ingredient for recording its recipe once one is created for
- /// it.
- void recordRecipeOf(Instruction *I) {
- assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) &&
- "Recipe already set for ingredient");
- Ingredient2Recipe[I] = nullptr;
- }
-
- /// Return the recipe created for given ingredient.
- VPRecipeBase *getRecipe(Instruction *I) {
- assert(Ingredient2Recipe.count(I) &&
-           "Recording this ingredient's recipe was not requested");
- assert(Ingredient2Recipe[I] != nullptr &&
- "Ingredient doesn't have a recipe");
- return Ingredient2Recipe[I];
- }
-
- /// Create a replicating region for instruction \p I that requires
- /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
- VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
- VPlanPtr &Plan);
-
-  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
- /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
- /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
- /// Region. Update the packing decision of predicated instructions if they
- /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
- /// \p Range.Start to \p Range.End.
- VPBasicBlock *handleReplication(
- Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
- DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
- VPlanPtr &Plan);
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
+
+ /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+ /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same
+ /// decision from \p Range.Start to \p Range.End.
+ VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const;
+
+ /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
+ /// if it can. The function should only be called if the cost-model indicates
+ /// that widening should be performed.
+ VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ PredicatedScalarEvolution &PSE, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
+ Builder(Builder) {}
+
+  /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, return it. Otherwise return nullptr.
+ VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan);
+
+ /// Set the recipe created for given ingredient. This operation is a no-op for
+ /// ingredients that were not marked using a nullptr entry in the map.
+ void setRecipe(Instruction *I, VPRecipeBase *R) {
+ if (!Ingredient2Recipe.count(I))
+ return;
+ assert(Ingredient2Recipe[I] == nullptr &&
+ "Recipe already set for ingredient");
+ Ingredient2Recipe[I] = R;
+ }
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VPValue *createBlockInMask(BasicBlock *BB, VPlanPtr &Plan);
+
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VPValue *createEdgeMask(BasicBlock *Src, BasicBlock *Dst, VPlanPtr &Plan);
+
+ /// Mark given ingredient for recording its recipe once one is created for
+ /// it.
+ void recordRecipeOf(Instruction *I) {
+ assert((!Ingredient2Recipe.count(I) || Ingredient2Recipe[I] == nullptr) &&
+ "Recipe already set for ingredient");
+ Ingredient2Recipe[I] = nullptr;
+ }
+
+ /// Return the recipe created for given ingredient.
+ VPRecipeBase *getRecipe(Instruction *I) {
+ assert(Ingredient2Recipe.count(I) &&
+           "Recording this ingredient's recipe was not requested");
+ assert(Ingredient2Recipe[I] != nullptr &&
+ "Ingredient doesn't have a recipe");
+ return Ingredient2Recipe[I];
+ }
+
+ /// Create a replicating region for instruction \p I that requires
+ /// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
+ VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
+ VPlanPtr &Plan);
+
+  /// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
+ /// is predicated. \return \p VPBB augmented with this new recipe if \p I is
+ /// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
+ /// Region. Update the packing decision of predicated instructions if they
+ /// feed \p I. Range.End may be decreased to ensure same recipe behavior from
+ /// \p Range.Start to \p Range.End.
+ VPBasicBlock *handleReplication(
+ Instruction *I, VFRange &Range, VPBasicBlock *VPBB,
+ DenseMap<Instruction *, VPReplicateRecipe *> &PredInst2Recipe,
+ VPlanPtr &Plan);
+};
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPRECIPEBUILDER_H
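
The Ingredient2Recipe comments above describe a map in which a nullptr entry marks an ingredient whose recipe should be recorded once created, and setRecipe is a no-op for anything not marked. The following is a simplified standalone mock of that sentinel idiom, assuming hypothetical names (RecipeTracker, Recipe); it is not the LLVM API, only an illustration of the recordRecipeOf / setRecipe / getRecipe contract sketched in the header.

// Simplified standalone mock (hypothetical names, not the LLVM API) of the
// nullptr-sentinel idiom described above: only ingredients explicitly marked
// via recordRecipeOf() ever get their recipe remembered by setRecipe().
#include <cassert>
#include <cstdio>
#include <unordered_map>

struct Recipe { int Id; };

class RecipeTracker {
  std::unordered_map<int, Recipe *> Ingredient2Recipe;

public:
  // Mark an ingredient: a nullptr entry means "record its recipe once created".
  void recordRecipeOf(int Ingredient) { Ingredient2Recipe[Ingredient] = nullptr; }

  // No-op for unmarked ingredients, mirroring the header's setRecipe().
  void setRecipe(int Ingredient, Recipe *R) {
    auto It = Ingredient2Recipe.find(Ingredient);
    if (It == Ingredient2Recipe.end())
      return;
    assert(It->second == nullptr && "Recipe already set for ingredient");
    It->second = R;
  }

  Recipe *getRecipe(int Ingredient) {
    auto It = Ingredient2Recipe.find(Ingredient);
    assert(It != Ingredient2Recipe.end() && It->second && "No recipe recorded");
    return It->second;
  }
};

int main() {
  RecipeTracker T;
  Recipe R{42};
  T.recordRecipeOf(7);  // 7 is marked; 13 is not.
  T.setRecipe(7, &R);
  T.setRecipe(13, &R);  // Silently ignored: 13 was never marked.
  std::printf("recipe for 7: %d\n", T.getRecipe(7)->Id);
  return 0;
}
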
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
index eaacde6f66..b26399e0ae 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.cpp
@@ -1,63 +1,63 @@
-//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This is the LLVM vectorization plan. It represents a candidate for
-/// vectorization, allowing one to plan and optimize how to vectorize a given loop
-/// before generating LLVM-IR.
-/// The vectorizer uses vectorization plans to estimate the costs of potential
-/// candidates and, if profitable, to execute the desired plan, generating vector
-/// LLVM-IR code.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
+//===- VPlan.cpp - Vectorizer Plan ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This is the LLVM vectorization plan. It represents a candidate for
+/// vectorization, allowing one to plan and optimize how to vectorize a given loop
+/// before generating LLVM-IR.
+/// The vectorizer uses vectorization plans to estimate the costs of potential
+/// candidates and, if profitable, to execute the desired plan, generating vector
+/// LLVM-IR code.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/IVDescriptors.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GenericDomTreeConstruction.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <iterator>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-extern cl::opt<bool> EnableVPlanNativePath;
-
-#define DEBUG_TYPE "vplan"
-
-raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
- const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
- VPSlotTracker SlotTracker(
- (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
- V.print(OS, SlotTracker);
- return OS;
-}
-
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTreeConstruction.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+extern cl::opt<bool> EnableVPlanNativePath;
+
+#define DEBUG_TYPE "vplan"
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
+ const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ V.print(OS, SlotTracker);
+ return OS;
+}
+
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
: SubclassID(SC), UnderlyingVal(UV), Def(Def) {
if (Def)
@@ -70,13 +70,13 @@ VPValue::~VPValue() {
Def->removeDefinedValue(this);
}
-void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
+void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
if (const VPRecipeBase *R = dyn_cast_or_null<VPRecipeBase>(Def))
R->print(OS, "", SlotTracker);
- else
- printAsOperand(OS, SlotTracker);
-}
-
+ else
+ printAsOperand(OS, SlotTracker);
+}
+
void VPValue::dump() const {
const VPRecipeBase *Instr = dyn_cast_or_null<VPRecipeBase>(this->Def);
VPSlotTracker SlotTracker(
@@ -121,91 +121,91 @@ VPUser *VPRecipeBase::toVPUser() {
return nullptr;
}
-// Get the top-most entry block of \p Start. This is the entry block of the
-// containing VPlan. This function is templated to support both const and non-const blocks
-template <typename T> static T *getPlanEntry(T *Start) {
- T *Next = Start;
- T *Current = Start;
- while ((Next = Next->getParent()))
- Current = Next;
-
- SmallSetVector<T *, 8> WorkList;
- WorkList.insert(Current);
-
- for (unsigned i = 0; i < WorkList.size(); i++) {
- T *Current = WorkList[i];
- if (Current->getNumPredecessors() == 0)
- return Current;
- auto &Predecessors = Current->getPredecessors();
- WorkList.insert(Predecessors.begin(), Predecessors.end());
- }
-
- llvm_unreachable("VPlan without any entry node without predecessors");
-}
-
-VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
-
-const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
-
-/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
-const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
- const VPBlockBase *Block = this;
- while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getEntry();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
- VPBlockBase *Block = this;
- while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getEntry();
- return cast<VPBasicBlock>(Block);
-}
-
-void VPBlockBase::setPlan(VPlan *ParentPlan) {
- assert(ParentPlan->getEntry() == this &&
- "Can only set plan on its entry block.");
- Plan = ParentPlan;
-}
-
-/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
-const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
- const VPBlockBase *Block = this;
- while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBasicBlock *VPBlockBase::getExitBasicBlock() {
- VPBlockBase *Block = this;
- while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- Block = Region->getExit();
- return cast<VPBasicBlock>(Block);
-}
-
-VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
- if (!Successors.empty() || !Parent)
- return this;
- assert(Parent->getExit() == this &&
- "Block w/o successors not the exit of its parent.");
- return Parent->getEnclosingBlockWithSuccessors();
-}
-
-VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
- if (!Predecessors.empty() || !Parent)
- return this;
- assert(Parent->getEntry() == this &&
- "Block w/o predecessors not the entry of its parent.");
- return Parent->getEnclosingBlockWithPredecessors();
-}
-
-void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
+// Get the top-most entry block of \p Start. This is the entry block of the
+// containing VPlan. This function is templated to support both const and non-const blocks
+template <typename T> static T *getPlanEntry(T *Start) {
+ T *Next = Start;
+ T *Current = Start;
+ while ((Next = Next->getParent()))
+ Current = Next;
+
+ SmallSetVector<T *, 8> WorkList;
+ WorkList.insert(Current);
+
+ for (unsigned i = 0; i < WorkList.size(); i++) {
+ T *Current = WorkList[i];
+ if (Current->getNumPredecessors() == 0)
+ return Current;
+ auto &Predecessors = Current->getPredecessors();
+ WorkList.insert(Predecessors.begin(), Predecessors.end());
+ }
+
+ llvm_unreachable("VPlan without any entry node without predecessors");
+}
+
+VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
+
+const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
+
+/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getEntry();
+ return cast<VPBasicBlock>(Block);
+}
+
+void VPBlockBase::setPlan(VPlan *ParentPlan) {
+ assert(ParentPlan->getEntry() == this &&
+ "Can only set plan on its entry block.");
+ Plan = ParentPlan;
+}
+
+/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
+const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
+ const VPBlockBase *Block = this;
+ while (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBasicBlock *VPBlockBase::getExitBasicBlock() {
+ VPBlockBase *Block = this;
+ while (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ Block = Region->getExit();
+ return cast<VPBasicBlock>(Block);
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithSuccessors() {
+ if (!Successors.empty() || !Parent)
+ return this;
+ assert(Parent->getExit() == this &&
+ "Block w/o successors not the exit of its parent.");
+ return Parent->getEnclosingBlockWithSuccessors();
+}
+
+VPBlockBase *VPBlockBase::getEnclosingBlockWithPredecessors() {
+ if (!Predecessors.empty() || !Parent)
+ return this;
+ assert(Parent->getEntry() == this &&
+ "Block w/o predecessors not the entry of its parent.");
+ return Parent->getEnclosingBlockWithPredecessors();
+}
+
+void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
SmallVector<VPBlockBase *, 8> Blocks(depth_first(Entry));
-
- for (VPBlockBase *Block : Blocks)
- delete Block;
-}
-
+
+ for (VPBlockBase *Block : Blocks)
+ delete Block;
+}
+
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
iterator It = begin();
while (It != end() && (isa<VPWidenPHIRecipe>(&*It) ||
@@ -237,123 +237,123 @@ Value *VPTransformState::get(VPValue *Def, const VPIteration &Instance) {
return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
}
-BasicBlock *
-VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
- // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
-  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
- BasicBlock *PrevBB = CFG.PrevBB;
- BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
- PrevBB->getParent(), CFG.LastBB);
- LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
-
- // Hook up the new basic block to its predecessors.
- for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
- VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
- auto &PredVPSuccessors = PredVPBB->getSuccessors();
- BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
-
- // In outer loop vectorization scenario, the predecessor BBlock may not yet
-    // be visited (backedge). Mark the VPBasicBlock for fixup at the end of
- // vectorization. We do not encounter this case in inner loop vectorization
- // as we start out by building a loop skeleton with the vector loop header
- // and latch blocks. As a result, we never enter this function for the
- // header block in the non VPlan-native path.
- if (!PredBB) {
- assert(EnableVPlanNativePath &&
- "Unexpected null predecessor in non VPlan-native path");
- CFG.VPBBsToFix.push_back(PredVPBB);
- continue;
- }
-
- assert(PredBB && "Predecessor basic-block not found building successor.");
- auto *PredBBTerminator = PredBB->getTerminator();
- LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
- if (isa<UnreachableInst>(PredBBTerminator)) {
- assert(PredVPSuccessors.size() == 1 &&
- "Predecessor ending w/o branch must have single successor.");
- PredBBTerminator->eraseFromParent();
- BranchInst::Create(NewBB, PredBB);
- } else {
- assert(PredVPSuccessors.size() == 2 &&
- "Predecessor ending with branch must have two successors.");
- unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
- assert(!PredBBTerminator->getSuccessor(idx) &&
- "Trying to reset an existing successor block.");
- PredBBTerminator->setSuccessor(idx, NewBB);
- }
- }
- return NewBB;
-}
-
-void VPBasicBlock::execute(VPTransformState *State) {
- bool Replica = State->Instance &&
- !(State->Instance->Part == 0 && State->Instance->Lane == 0);
- VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
- VPBlockBase *SingleHPred = nullptr;
- BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
-
- // 1. Create an IR basic block, or reuse the last one if possible.
- // The last IR basic block is reused, as an optimization, in three cases:
- // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
- // B. when the current VPBB has a single (hierarchical) predecessor which
- // is PrevVPBB and the latter has a single (hierarchical) successor; and
- // C. when the current VPBB is an entry of a region replica - where PrevVPBB
- // is the exit of this region from a previous instance, or the predecessor
- // of this region.
- if (PrevVPBB && /* A */
- !((SingleHPred = getSingleHierarchicalPredecessor()) &&
- SingleHPred->getExitBasicBlock() == PrevVPBB &&
- PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
- !(Replica && getPredecessors().empty())) { /* C */
- NewBB = createEmptyBasicBlock(State->CFG);
- State->Builder.SetInsertPoint(NewBB);
- // Temporarily terminate with unreachable until CFG is rewired.
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
-    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
- Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
- L->addBasicBlockToLoop(NewBB, *State->LI);
- State->CFG.PrevBB = NewBB;
- }
-
- // 2. Fill the IR basic block with IR instructions.
- LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
- << " in BB:" << NewBB->getName() << '\n');
-
- State->CFG.VPBB2IRBB[this] = NewBB;
- State->CFG.PrevVPBB = this;
-
- for (VPRecipeBase &Recipe : Recipes)
- Recipe.execute(*State);
-
- VPValue *CBV;
- if (EnableVPlanNativePath && (CBV = getCondBit())) {
- Value *IRCBV = CBV->getUnderlyingValue();
- assert(IRCBV && "Unexpected null underlying value for condition bit");
-
- // Condition bit value in a VPBasicBlock is used as the branch selector. In
- // the VPlan-native path case, since all branches are uniform we generate a
- // branch instruction using the condition value from vector lane 0 and dummy
- // successors. The successors are fixed later when the successor blocks are
- // visited.
- Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
- NewCond = State->Builder.CreateExtractElement(NewCond,
- State->Builder.getInt32(0));
-
- // Replace the temporary unreachable terminator with the new conditional
- // branch.
- auto *CurrentTerminator = NewBB->getTerminator();
- assert(isa<UnreachableInst>(CurrentTerminator) &&
- "Expected to replace unreachable terminator with conditional "
- "branch.");
- auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
- CondBr->setSuccessor(0, nullptr);
- ReplaceInstWithInst(CurrentTerminator, CondBr);
- }
-
- LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
-}
-
+BasicBlock *
+VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) {
+ // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks.
+  // Pred stands for Predecessor. Prev stands for Previous - last visited/created.
+ BasicBlock *PrevBB = CFG.PrevBB;
+ BasicBlock *NewBB = BasicBlock::Create(PrevBB->getContext(), getName(),
+ PrevBB->getParent(), CFG.LastBB);
+ LLVM_DEBUG(dbgs() << "LV: created " << NewBB->getName() << '\n');
+
+ // Hook up the new basic block to its predecessors.
+ for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) {
+ VPBasicBlock *PredVPBB = PredVPBlock->getExitBasicBlock();
+ auto &PredVPSuccessors = PredVPBB->getSuccessors();
+ BasicBlock *PredBB = CFG.VPBB2IRBB[PredVPBB];
+
+ // In outer loop vectorization scenario, the predecessor BBlock may not yet
+    // be visited (backedge). Mark the VPBasicBlock for fixup at the end of
+ // vectorization. We do not encounter this case in inner loop vectorization
+ // as we start out by building a loop skeleton with the vector loop header
+ // and latch blocks. As a result, we never enter this function for the
+ // header block in the non VPlan-native path.
+ if (!PredBB) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected null predecessor in non VPlan-native path");
+ CFG.VPBBsToFix.push_back(PredVPBB);
+ continue;
+ }
+
+ assert(PredBB && "Predecessor basic-block not found building successor.");
+ auto *PredBBTerminator = PredBB->getTerminator();
+ LLVM_DEBUG(dbgs() << "LV: draw edge from" << PredBB->getName() << '\n');
+ if (isa<UnreachableInst>(PredBBTerminator)) {
+ assert(PredVPSuccessors.size() == 1 &&
+ "Predecessor ending w/o branch must have single successor.");
+ PredBBTerminator->eraseFromParent();
+ BranchInst::Create(NewBB, PredBB);
+ } else {
+ assert(PredVPSuccessors.size() == 2 &&
+ "Predecessor ending with branch must have two successors.");
+ unsigned idx = PredVPSuccessors.front() == this ? 0 : 1;
+ assert(!PredBBTerminator->getSuccessor(idx) &&
+ "Trying to reset an existing successor block.");
+ PredBBTerminator->setSuccessor(idx, NewBB);
+ }
+ }
+ return NewBB;
+}
+
+void VPBasicBlock::execute(VPTransformState *State) {
+ bool Replica = State->Instance &&
+ !(State->Instance->Part == 0 && State->Instance->Lane == 0);
+ VPBasicBlock *PrevVPBB = State->CFG.PrevVPBB;
+ VPBlockBase *SingleHPred = nullptr;
+ BasicBlock *NewBB = State->CFG.PrevBB; // Reuse it if possible.
+
+ // 1. Create an IR basic block, or reuse the last one if possible.
+ // The last IR basic block is reused, as an optimization, in three cases:
+ // A. the first VPBB reuses the loop header BB - when PrevVPBB is null;
+ // B. when the current VPBB has a single (hierarchical) predecessor which
+ // is PrevVPBB and the latter has a single (hierarchical) successor; and
+ // C. when the current VPBB is an entry of a region replica - where PrevVPBB
+ // is the exit of this region from a previous instance, or the predecessor
+ // of this region.
+ if (PrevVPBB && /* A */
+ !((SingleHPred = getSingleHierarchicalPredecessor()) &&
+ SingleHPred->getExitBasicBlock() == PrevVPBB &&
+ PrevVPBB->getSingleHierarchicalSuccessor()) && /* B */
+ !(Replica && getPredecessors().empty())) { /* C */
+ NewBB = createEmptyBasicBlock(State->CFG);
+ State->Builder.SetInsertPoint(NewBB);
+ // Temporarily terminate with unreachable until CFG is rewired.
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+    // Register NewBB in its loop. In innermost loops it's the same for all BBs.
+ Loop *L = State->LI->getLoopFor(State->CFG.LastBB);
+ L->addBasicBlockToLoop(NewBB, *State->LI);
+ State->CFG.PrevBB = NewBB;
+ }
+
+ // 2. Fill the IR basic block with IR instructions.
+ LLVM_DEBUG(dbgs() << "LV: vectorizing VPBB:" << getName()
+ << " in BB:" << NewBB->getName() << '\n');
+
+ State->CFG.VPBB2IRBB[this] = NewBB;
+ State->CFG.PrevVPBB = this;
+
+ for (VPRecipeBase &Recipe : Recipes)
+ Recipe.execute(*State);
+
+ VPValue *CBV;
+ if (EnableVPlanNativePath && (CBV = getCondBit())) {
+ Value *IRCBV = CBV->getUnderlyingValue();
+ assert(IRCBV && "Unexpected null underlying value for condition bit");
+
+ // Condition bit value in a VPBasicBlock is used as the branch selector. In
+ // the VPlan-native path case, since all branches are uniform we generate a
+ // branch instruction using the condition value from vector lane 0 and dummy
+ // successors. The successors are fixed later when the successor blocks are
+ // visited.
+ Value *NewCond = State->Callback.getOrCreateVectorValues(IRCBV, 0);
+ NewCond = State->Builder.CreateExtractElement(NewCond,
+ State->Builder.getInt32(0));
+
+ // Replace the temporary unreachable terminator with the new conditional
+ // branch.
+ auto *CurrentTerminator = NewBB->getTerminator();
+ assert(isa<UnreachableInst>(CurrentTerminator) &&
+ "Expected to replace unreachable terminator with conditional "
+ "branch.");
+ auto *CondBr = BranchInst::Create(NewBB, nullptr, NewCond);
+ CondBr->setSuccessor(0, nullptr);
+ ReplaceInstWithInst(CurrentTerminator, CondBr);
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: filled BB:" << *NewBB);
+}
+
void VPBasicBlock::dropAllReferences(VPValue *NewValue) {
for (VPRecipeBase &R : Recipes) {
for (auto *Def : R.definedValues())
@@ -372,87 +372,87 @@ void VPRegionBlock::dropAllReferences(VPValue *NewValue) {
Block->dropAllReferences(NewValue);
}
-void VPRegionBlock::execute(VPTransformState *State) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
-
- if (!isReplicator()) {
- // Visit the VPBlocks connected to "this", starting from it.
- for (VPBlockBase *Block : RPOT) {
- if (EnableVPlanNativePath) {
- // The inner loop vectorization path does not represent loop preheader
- // and exit blocks as part of the VPlan. In the VPlan-native path, skip
- // vectorizing loop preheader block. In future, we may replace this
- // check with the check for loop preheader.
- if (Block->getNumPredecessors() == 0)
- continue;
-
- // Skip vectorizing loop exit block. In future, we may replace this
- // check with the check for loop exit.
- if (Block->getNumSuccessors() == 0)
- continue;
- }
-
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
- return;
- }
-
- assert(!State->Instance && "Replicating a Region with non-null instance.");
-
- // Enter replicating mode.
- State->Instance = {0, 0};
-
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
- State->Instance->Part = Part;
+void VPRegionBlock::execute(VPTransformState *State) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Entry);
+
+ if (!isReplicator()) {
+ // Visit the VPBlocks connected to "this", starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ if (EnableVPlanNativePath) {
+ // The inner loop vectorization path does not represent loop preheader
+ // and exit blocks as part of the VPlan. In the VPlan-native path, skip
+ // vectorizing loop preheader block. In future, we may replace this
+ // check with the check for loop preheader.
+ if (Block->getNumPredecessors() == 0)
+ continue;
+
+ // Skip vectorizing loop exit block. In future, we may replace this
+ // check with the check for loop exit.
+ if (Block->getNumSuccessors() == 0)
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ return;
+ }
+
+ assert(!State->Instance && "Replicating a Region with non-null instance.");
+
+ // Enter replicating mode.
+ State->Instance = {0, 0};
+
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part) {
+ State->Instance->Part = Part;
assert(!State->VF.isScalable() && "VF is assumed to be non scalable.");
for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF;
++Lane) {
- State->Instance->Lane = Lane;
- // Visit the VPBlocks connected to \p this, starting from it.
- for (VPBlockBase *Block : RPOT) {
- LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
- Block->execute(State);
- }
- }
- }
-
- // Exit replicating mode.
- State->Instance.reset();
-}
-
-void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insert(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
- assert(!Parent && "Recipe already in some VPBasicBlock");
- assert(InsertPos->getParent() &&
- "Insertion position not in any VPBasicBlock");
- Parent = InsertPos->getParent();
- Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
-}
-
-void VPRecipeBase::removeFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- getParent()->getRecipeList().remove(getIterator());
- Parent = nullptr;
-}
-
-iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
- assert(getParent() && "Recipe not in any VPBasicBlock");
- return getParent()->getRecipeList().erase(getIterator());
-}
-
-void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
- removeFromParent();
- insertAfter(InsertPos);
-}
-
+ State->Instance->Lane = Lane;
+ // Visit the VPBlocks connected to \p this, starting from it.
+ for (VPBlockBase *Block : RPOT) {
+ LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n');
+ Block->execute(State);
+ }
+ }
+ }
+
+ // Exit replicating mode.
+ State->Instance.reset();
+}
+
+void VPRecipeBase::insertBefore(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insert(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::insertAfter(VPRecipeBase *InsertPos) {
+ assert(!Parent && "Recipe already in some VPBasicBlock");
+ assert(InsertPos->getParent() &&
+ "Insertion position not in any VPBasicBlock");
+ Parent = InsertPos->getParent();
+ Parent->getRecipeList().insertAfter(InsertPos->getIterator(), this);
+}
+
+void VPRecipeBase::removeFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ getParent()->getRecipeList().remove(getIterator());
+ Parent = nullptr;
+}
+
+iplist<VPRecipeBase>::iterator VPRecipeBase::eraseFromParent() {
+ assert(getParent() && "Recipe not in any VPBasicBlock");
+ return getParent()->getRecipeList().erase(getIterator());
+}
+
+void VPRecipeBase::moveAfter(VPRecipeBase *InsertPos) {
+ removeFromParent();
+ insertAfter(InsertPos);
+}
+
void VPRecipeBase::moveBefore(VPBasicBlock &BB,
iplist<VPRecipeBase>::iterator I) {
assert(I == BB.end() || I->getParent() == &BB);
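
VPRegionBlock::execute above re-executes a replicating region once per (unroll part, vector lane) instance, while a non-replicating region is traversed a single time with widened recipes. The sketch below is a standalone paraphrase of that control structure under hypothetical names (executeRegion, Instance); it is not LLVM code and ignores scalable vectorization factors.

// Standalone sketch (hypothetical types, not the LLVM API) of the replication
// scheme in VPRegionBlock::execute: a replicating region is re-executed once
// per (unroll part, vector lane) instance, while a non-replicating region is
// executed a single time over all of its blocks.
#include <cstdio>
#include <functional>

struct Instance { unsigned Part, Lane; };

void executeRegion(bool IsReplicator, unsigned UF, unsigned VF,
                   const std::function<void(const Instance *)> &RunBlocks) {
  if (!IsReplicator) {
    RunBlocks(nullptr); // One widened execution, no per-lane instance.
    return;
  }
  for (unsigned Part = 0; Part < UF; ++Part)
    for (unsigned Lane = 0; Lane < VF; ++Lane) {
      Instance I{Part, Lane};
      RunBlocks(&I); // Scalar re-execution of the region for this instance.
    }
}

int main() {
  executeRegion(true, /*UF=*/2, /*VF=*/4, [](const Instance *I) {
    std::printf("replica part=%u lane=%u\n", I->Part, I->Lane);
  });
  return 0;
}
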
@@ -461,395 +461,395 @@ void VPRecipeBase::moveBefore(VPBasicBlock &BB,
BB.getRecipeList().insert(I, this);
}
-void VPInstruction::generateInstruction(VPTransformState &State,
- unsigned Part) {
- IRBuilder<> &Builder = State.Builder;
-
- if (Instruction::isBinaryOp(getOpcode())) {
- Value *A = State.get(getOperand(0), Part);
- Value *B = State.get(getOperand(1), Part);
- Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
- State.set(this, V, Part);
- return;
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not: {
- Value *A = State.get(getOperand(0), Part);
- Value *V = Builder.CreateNot(A);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ICmpULE: {
- Value *IV = State.get(getOperand(0), Part);
- Value *TC = State.get(getOperand(1), Part);
- Value *V = Builder.CreateICmpULE(IV, TC);
- State.set(this, V, Part);
- break;
- }
- case Instruction::Select: {
- Value *Cond = State.get(getOperand(0), Part);
- Value *Op1 = State.get(getOperand(1), Part);
- Value *Op2 = State.get(getOperand(2), Part);
- Value *V = Builder.CreateSelect(Cond, Op1, Op2);
- State.set(this, V, Part);
- break;
- }
- case VPInstruction::ActiveLaneMask: {
- // Get first lane of vector induction variable.
- Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+void VPInstruction::generateInstruction(VPTransformState &State,
+ unsigned Part) {
+ IRBuilder<> &Builder = State.Builder;
+
+ if (Instruction::isBinaryOp(getOpcode())) {
+ Value *A = State.get(getOperand(0), Part);
+ Value *B = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateBinOp((Instruction::BinaryOps)getOpcode(), A, B);
+ State.set(this, V, Part);
+ return;
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not: {
+ Value *A = State.get(getOperand(0), Part);
+ Value *V = Builder.CreateNot(A);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ICmpULE: {
+ Value *IV = State.get(getOperand(0), Part);
+ Value *TC = State.get(getOperand(1), Part);
+ Value *V = Builder.CreateICmpULE(IV, TC);
+ State.set(this, V, Part);
+ break;
+ }
+ case Instruction::Select: {
+ Value *Cond = State.get(getOperand(0), Part);
+ Value *Op1 = State.get(getOperand(1), Part);
+ Value *Op2 = State.get(getOperand(2), Part);
+ Value *V = Builder.CreateSelect(Cond, Op1, Op2);
+ State.set(this, V, Part);
+ break;
+ }
+ case VPInstruction::ActiveLaneMask: {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
// Get the original loop tripcount.
Value *ScalarTC = State.TripCount;
-
- auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
auto *PredTy = FixedVectorType::get(Int1Ty, State.VF.getKnownMinValue());
- Instruction *Call = Builder.CreateIntrinsic(
+ Instruction *Call = Builder.CreateIntrinsic(
Intrinsic::get_active_lane_mask, {PredTy, ScalarTC->getType()},
{VIVElem0, ScalarTC}, nullptr, "active.lane.mask");
- State.set(this, Call, Part);
- break;
- }
- default:
- llvm_unreachable("Unsupported opcode for instruction");
- }
-}
-
-void VPInstruction::execute(VPTransformState &State) {
- assert(!State.Instance && "VPInstruction executing an Instance");
- for (unsigned Part = 0; Part < State.UF; ++Part)
- generateInstruction(State, Part);
-}
-
+ State.set(this, Call, Part);
+ break;
+ }
+ default:
+ llvm_unreachable("Unsupported opcode for instruction");
+ }
+}
+
+void VPInstruction::execute(VPTransformState &State) {
+ assert(!State.Instance && "VPInstruction executing an Instance");
+ for (unsigned Part = 0; Part < State.UF; ++Part)
+ generateInstruction(State, Part);
+}
+
void VPInstruction::dump() const {
VPSlotTracker SlotTracker(getParent()->getPlan());
print(dbgs(), "", SlotTracker);
}
-void VPInstruction::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "EMIT ";
-
- if (hasResult()) {
- printAsOperand(O, SlotTracker);
- O << " = ";
- }
-
- switch (getOpcode()) {
- case VPInstruction::Not:
- O << "not";
- break;
- case VPInstruction::ICmpULE:
- O << "icmp ule";
- break;
- case VPInstruction::SLPLoad:
- O << "combined load";
- break;
- case VPInstruction::SLPStore:
- O << "combined store";
- break;
- case VPInstruction::ActiveLaneMask:
- O << "active lane mask";
- break;
-
- default:
- O << Instruction::getOpcodeName(getOpcode());
- }
-
- for (const VPValue *Operand : operands()) {
- O << " ";
- Operand->printAsOperand(O, SlotTracker);
- }
-}
-
-/// Generate the code inside the body of the vectorized loop. Assumes a single
-/// LoopVectorBody basic-block was created for this. Introduce additional
-/// basic-blocks as needed, and fill them all.
-void VPlan::execute(VPTransformState *State) {
- // -1. Check if the backedge taken count is needed, and if so build it.
- if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
- Value *TC = State->TripCount;
- IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
- auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
- "trip.count.minus.1");
- auto VF = State->VF;
- Value *VTCMO =
+
+ if (hasResult()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
+
+ switch (getOpcode()) {
+ case VPInstruction::Not:
+ O << "not";
+ break;
+ case VPInstruction::ICmpULE:
+ O << "icmp ule";
+ break;
+ case VPInstruction::SLPLoad:
+ O << "combined load";
+ break;
+ case VPInstruction::SLPStore:
+ O << "combined store";
+ break;
+ case VPInstruction::ActiveLaneMask:
+ O << "active lane mask";
+ break;
+
+ default:
+ O << Instruction::getOpcodeName(getOpcode());
+ }
+
+ for (const VPValue *Operand : operands()) {
+ O << " ";
+ Operand->printAsOperand(O, SlotTracker);
+ }
+}
+
+/// Generate the code inside the body of the vectorized loop. Assumes a single
+/// LoopVectorBody basic-block was created for this. Introduce additional
+/// basic-blocks as needed, and fill them all.
+void VPlan::execute(VPTransformState *State) {
+ // -1. Check if the backedge taken count is needed, and if so build it.
+ if (BackedgeTakenCount && BackedgeTakenCount->getNumUsers()) {
+ Value *TC = State->TripCount;
+ IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
+ auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
+ "trip.count.minus.1");
+ auto VF = State->VF;
+ Value *VTCMO =
VF.isScalar() ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
- for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
- State->set(BackedgeTakenCount, VTCMO, Part);
- }
-
- // 0. Set the reverse mapping from VPValues to Values for code generation.
- for (auto &Entry : Value2VPValue)
- State->VPValue2Value[Entry.second] = Entry.first;
-
- BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
- BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
- assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
-
- // 1. Make room to generate basic-blocks inside loop body if needed.
- BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
- VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
- Loop *L = State->LI->getLoopFor(VectorHeaderBB);
- L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
- // Remove the edge between Header and Latch to allow other connections.
- // Temporarily terminate with unreachable until CFG is rewired.
- // Note: this asserts the generated code's assumption that
- // getFirstInsertionPt() can be dereferenced into an Instruction.
- VectorHeaderBB->getTerminator()->eraseFromParent();
- State->Builder.SetInsertPoint(VectorHeaderBB);
- UnreachableInst *Terminator = State->Builder.CreateUnreachable();
- State->Builder.SetInsertPoint(Terminator);
-
- // 2. Generate code in loop body.
- State->CFG.PrevVPBB = nullptr;
- State->CFG.PrevBB = VectorHeaderBB;
- State->CFG.LastBB = VectorLatchBB;
-
- for (VPBlockBase *Block : depth_first(Entry))
- Block->execute(State);
-
-  // Set up branch terminator successors for VPBBs in VPBBsToFix based on
- // VPBB's successors.
- for (auto VPBB : State->CFG.VPBBsToFix) {
- assert(EnableVPlanNativePath &&
- "Unexpected VPBBsToFix in non VPlan-native path");
- BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
- assert(BB && "Unexpected null basic block for VPBB");
-
- unsigned Idx = 0;
- auto *BBTerminator = BB->getTerminator();
-
- for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
- VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
- BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
- ++Idx;
- }
- }
-
- // 3. Merge the temporary latch created with the last basic-block filled.
- BasicBlock *LastBB = State->CFG.PrevBB;
- // Connect LastBB to VectorLatchBB to facilitate their merge.
- assert((EnableVPlanNativePath ||
- isa<UnreachableInst>(LastBB->getTerminator())) &&
- "Expected InnerLoop VPlan CFG to terminate with unreachable");
- assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
- "Expected VPlan CFG to terminate with branch in NativePath");
- LastBB->getTerminator()->eraseFromParent();
- BranchInst::Create(VectorLatchBB, LastBB);
-
- // Merge LastBB with Latch.
- bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
- (void)Merged;
- assert(Merged && "Could not merge last basic block with latch.");
- VectorLatchBB = LastBB;
-
- // We do not attempt to preserve DT for outer loop vectorization currently.
- if (!EnableVPlanNativePath)
- updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
- L->getExitBlock());
-}
-
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD
-void VPlan::dump() const { dbgs() << *this << '\n'; }
-#endif
-
-void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
- BasicBlock *LoopLatchBB,
- BasicBlock *LoopExitBB) {
- BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
- assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
- // The vector body may be more than a single basic-block by this point.
- // Update the dominator tree information inside the vector body by propagating
- // it from header to latch, expecting only triangular control-flow, if any.
- BasicBlock *PostDomSucc = nullptr;
- for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
- // Get the list of successors of this block.
- std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
- assert(Succs.size() <= 2 &&
- "Basic block in vector loop has more than 2 successors.");
- PostDomSucc = Succs[0];
- if (Succs.size() == 1) {
- assert(PostDomSucc->getSinglePredecessor() &&
- "PostDom successor has more than one predecessor.");
- DT->addNewBlock(PostDomSucc, BB);
- continue;
- }
- BasicBlock *InterimSucc = Succs[1];
- if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
- PostDomSucc = Succs[1];
- InterimSucc = Succs[0];
- }
- assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
- "One successor of a basic block does not lead to the other.");
- assert(InterimSucc->getSinglePredecessor() &&
- "Interim successor has more than one predecessor.");
- assert(PostDomSucc->hasNPredecessors(2) &&
- "PostDom successor has more than two predecessors.");
- DT->addNewBlock(InterimSucc, BB);
- DT->addNewBlock(PostDomSucc, BB);
- }
- // Latch block is a new dominator for the loop exit.
- DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
- assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-}
-
-const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
- return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
- Twine(getOrCreateBID(Block));
-}
-
-const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
- const std::string &Name = Block->getName();
- if (!Name.empty())
- return Name;
- return "VPB" + Twine(getOrCreateBID(Block));
-}
-
-void VPlanPrinter::dump() {
- Depth = 1;
- bumpIndent(0);
- OS << "digraph VPlan {\n";
- OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
- if (!Plan.getName().empty())
- OS << "\\n" << DOT::EscapeString(Plan.getName());
- if (Plan.BackedgeTakenCount) {
- OS << ", where:\\n";
- Plan.BackedgeTakenCount->print(OS, SlotTracker);
- OS << " := BackedgeTakenCount";
- }
- OS << "\"]\n";
- OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
- OS << "edge [fontname=Courier, fontsize=30]\n";
- OS << "compound=true\n";
-
- for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
- dumpBlock(Block);
-
- OS << "}\n";
-}
-
-void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
- if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
- dumpBasicBlock(BasicBlock);
- else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- dumpRegion(Region);
- else
- llvm_unreachable("Unsupported kind of VPBlock.");
-}
-
-void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
- bool Hidden, const Twine &Label) {
- // Due to "dot" we print an edge between two regions as an edge between the
-  // exit basic block and the entry basic block of the respective regions.
- const VPBlockBase *Tail = From->getExitBasicBlock();
- const VPBlockBase *Head = To->getEntryBasicBlock();
- OS << Indent << getUID(Tail) << " -> " << getUID(Head);
- OS << " [ label=\"" << Label << '\"';
- if (Tail != From)
- OS << " ltail=" << getUID(From);
- if (Head != To)
- OS << " lhead=" << getUID(To);
- if (Hidden)
- OS << "; splines=none";
- OS << "]\n";
-}
-
-void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
- auto &Successors = Block->getSuccessors();
- if (Successors.size() == 1)
- drawEdge(Block, Successors.front(), false, "");
- else if (Successors.size() == 2) {
- drawEdge(Block, Successors.front(), false, "T");
- drawEdge(Block, Successors.back(), false, "F");
- } else {
- unsigned SuccessorNumber = 0;
- for (auto *Successor : Successors)
- drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
- }
-}
-
-void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
- OS << Indent << getUID(BasicBlock) << " [label =\n";
- bumpIndent(1);
- OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
- bumpIndent(1);
-
- // Dump the block predicate.
- const VPValue *Pred = BasicBlock->getPredicate();
- if (Pred) {
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
+ State->set(BackedgeTakenCount, VTCMO, Part);
+ }
+
+ // 0. Set the reverse mapping from VPValues to Values for code generation.
+ for (auto &Entry : Value2VPValue)
+ State->VPValue2Value[Entry.second] = Entry.first;
+
+ BasicBlock *VectorPreHeaderBB = State->CFG.PrevBB;
+ BasicBlock *VectorHeaderBB = VectorPreHeaderBB->getSingleSuccessor();
+ assert(VectorHeaderBB && "Loop preheader does not have a single successor.");
+
+ // 1. Make room to generate basic-blocks inside loop body if needed.
+ BasicBlock *VectorLatchBB = VectorHeaderBB->splitBasicBlock(
+ VectorHeaderBB->getFirstInsertionPt(), "vector.body.latch");
+ Loop *L = State->LI->getLoopFor(VectorHeaderBB);
+ L->addBasicBlockToLoop(VectorLatchBB, *State->LI);
+ // Remove the edge between Header and Latch to allow other connections.
+ // Temporarily terminate with unreachable until CFG is rewired.
+ // Note: this asserts the generated code's assumption that
+ // getFirstInsertionPt() can be dereferenced into an Instruction.
+ VectorHeaderBB->getTerminator()->eraseFromParent();
+ State->Builder.SetInsertPoint(VectorHeaderBB);
+ UnreachableInst *Terminator = State->Builder.CreateUnreachable();
+ State->Builder.SetInsertPoint(Terminator);
+
+ // 2. Generate code in loop body.
+ State->CFG.PrevVPBB = nullptr;
+ State->CFG.PrevBB = VectorHeaderBB;
+ State->CFG.LastBB = VectorLatchBB;
+
+ for (VPBlockBase *Block : depth_first(Entry))
+ Block->execute(State);
+
+ // Set up branch terminator successors for VPBBs in VPBBsToFix based on
+ // VPBB's successors.
+ for (auto VPBB : State->CFG.VPBBsToFix) {
+ assert(EnableVPlanNativePath &&
+ "Unexpected VPBBsToFix in non VPlan-native path");
+ BasicBlock *BB = State->CFG.VPBB2IRBB[VPBB];
+ assert(BB && "Unexpected null basic block for VPBB");
+
+ unsigned Idx = 0;
+ auto *BBTerminator = BB->getTerminator();
+
+ for (VPBlockBase *SuccVPBlock : VPBB->getHierarchicalSuccessors()) {
+ VPBasicBlock *SuccVPBB = SuccVPBlock->getEntryBasicBlock();
+ BBTerminator->setSuccessor(Idx, State->CFG.VPBB2IRBB[SuccVPBB]);
+ ++Idx;
+ }
+ }
+
+ // 3. Merge the temporary latch created with the last basic-block filled.
+ BasicBlock *LastBB = State->CFG.PrevBB;
+ // Connect LastBB to VectorLatchBB to facilitate their merge.
+ assert((EnableVPlanNativePath ||
+ isa<UnreachableInst>(LastBB->getTerminator())) &&
+ "Expected InnerLoop VPlan CFG to terminate with unreachable");
+ assert((!EnableVPlanNativePath || isa<BranchInst>(LastBB->getTerminator())) &&
+ "Expected VPlan CFG to terminate with branch in NativePath");
+ LastBB->getTerminator()->eraseFromParent();
+ BranchInst::Create(VectorLatchBB, LastBB);
+
+ // Merge LastBB with Latch.
+ bool Merged = MergeBlockIntoPredecessor(VectorLatchBB, nullptr, State->LI);
+ (void)Merged;
+ assert(Merged && "Could not merge last basic block with latch.");
+ VectorLatchBB = LastBB;
+
+ // We do not attempt to preserve DT for outer loop vectorization currently.
+ if (!EnableVPlanNativePath)
+ updateDominatorTree(State->DT, VectorPreHeaderBB, VectorLatchBB,
+ L->getExitBlock());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+void VPlan::dump() const { dbgs() << *this << '\n'; }
+#endif
+
+void VPlan::updateDominatorTree(DominatorTree *DT, BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopLatchBB,
+ BasicBlock *LoopExitBB) {
+ BasicBlock *LoopHeaderBB = LoopPreHeaderBB->getSingleSuccessor();
+ assert(LoopHeaderBB && "Loop preheader does not have a single successor.");
+ // The vector body may be more than a single basic-block by this point.
+ // Update the dominator tree information inside the vector body by propagating
+ // it from header to latch, expecting only triangular control-flow, if any.
+ BasicBlock *PostDomSucc = nullptr;
+ for (auto *BB = LoopHeaderBB; BB != LoopLatchBB; BB = PostDomSucc) {
+ // Get the list of successors of this block.
+ std::vector<BasicBlock *> Succs(succ_begin(BB), succ_end(BB));
+ assert(Succs.size() <= 2 &&
+ "Basic block in vector loop has more than 2 successors.");
+ PostDomSucc = Succs[0];
+ if (Succs.size() == 1) {
+ assert(PostDomSucc->getSinglePredecessor() &&
+ "PostDom successor has more than one predecessor.");
+ DT->addNewBlock(PostDomSucc, BB);
+ continue;
+ }
+ BasicBlock *InterimSucc = Succs[1];
+ if (PostDomSucc->getSingleSuccessor() == InterimSucc) {
+ PostDomSucc = Succs[1];
+ InterimSucc = Succs[0];
+ }
+ assert(InterimSucc->getSingleSuccessor() == PostDomSucc &&
+ "One successor of a basic block does not lead to the other.");
+ assert(InterimSucc->getSinglePredecessor() &&
+ "Interim successor has more than one predecessor.");
+ assert(PostDomSucc->hasNPredecessors(2) &&
+ "PostDom successor has more than two predecessors.");
+ DT->addNewBlock(InterimSucc, BB);
+ DT->addNewBlock(PostDomSucc, BB);
+ }
+ // Latch block is a new dominator for the loop exit.
+ DT->changeImmediateDominator(LoopExitBB, LoopLatchBB);
+ assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+}
+
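
The dominator update above only has to handle the shapes the vector body can contain at this point: straight-line blocks and simple triangles (a block branching to an interim block and a post-dominating block, with the interim block falling through to the latter). A minimal standalone sketch of that same idom-propagation walk, using toy block types instead of the LLVM API (all names here are hypothetical):

#include <cassert>
#include <map>
#include <utility>
#include <vector>

struct Block {
  const char *Name;
  std::vector<Block *> Succs;
};

int main() {
  // Header -> {Interim, PostDom}, Interim -> PostDom, PostDom -> Latch.
  Block Latch{"latch", {}};
  Block PostDom{"postdom", {&Latch}};
  Block Interim{"interim", {&PostDom}};
  Block Header{"header", {&Interim, &PostDom}};

  std::map<Block *, Block *> IDom; // block -> immediate dominator
  for (Block *BB = &Header; BB != &Latch;) {
    assert(BB->Succs.size() <= 2 && "only straight lines or triangles expected");
    Block *Post = BB->Succs[0];
    if (BB->Succs.size() == 1) { // straight line: the single successor is idom'd by BB
      IDom[Post] = BB;
      BB = Post;
      continue;
    }
    Block *Mid = BB->Succs[1];
    if (Post->Succs.size() == 1 && Post->Succs[0] == Mid)
      std::swap(Post, Mid); // normalize the triangle so that Mid flows into Post
    IDom[Mid] = BB;         // both triangle successors are idom'd by the branching block
    IDom[Post] = BB;
    BB = Post;
  }
  assert(IDom[&Interim] == &Header && IDom[&PostDom] == &Header);
  assert(IDom[&Latch] == &PostDom);
  return 0;
}
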
+const Twine VPlanPrinter::getUID(const VPBlockBase *Block) {
+ return (isa<VPRegionBlock>(Block) ? "cluster_N" : "N") +
+ Twine(getOrCreateBID(Block));
+}
+
+const Twine VPlanPrinter::getOrCreateName(const VPBlockBase *Block) {
+ const std::string &Name = Block->getName();
+ if (!Name.empty())
+ return Name;
+ return "VPB" + Twine(getOrCreateBID(Block));
+}
+
+void VPlanPrinter::dump() {
+ Depth = 1;
+ bumpIndent(0);
+ OS << "digraph VPlan {\n";
+ OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
+ if (!Plan.getName().empty())
+ OS << "\\n" << DOT::EscapeString(Plan.getName());
+ if (Plan.BackedgeTakenCount) {
+ OS << ", where:\\n";
+ Plan.BackedgeTakenCount->print(OS, SlotTracker);
+ OS << " := BackedgeTakenCount";
+ }
+ OS << "\"]\n";
+ OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
+ OS << "edge [fontname=Courier, fontsize=30]\n";
+ OS << "compound=true\n";
+
+ for (const VPBlockBase *Block : depth_first(Plan.getEntry()))
+ dumpBlock(Block);
+
+ OS << "}\n";
+}
+
+void VPlanPrinter::dumpBlock(const VPBlockBase *Block) {
+ if (const VPBasicBlock *BasicBlock = dyn_cast<VPBasicBlock>(Block))
+ dumpBasicBlock(BasicBlock);
+ else if (const VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ dumpRegion(Region);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
+void VPlanPrinter::drawEdge(const VPBlockBase *From, const VPBlockBase *To,
+ bool Hidden, const Twine &Label) {
+ // Due to "dot" we print an edge between two regions as an edge between the
+ // exit basic block and the entry basic block of the respective regions.
+ const VPBlockBase *Tail = From->getExitBasicBlock();
+ const VPBlockBase *Head = To->getEntryBasicBlock();
+ OS << Indent << getUID(Tail) << " -> " << getUID(Head);
+ OS << " [ label=\"" << Label << '\"';
+ if (Tail != From)
+ OS << " ltail=" << getUID(From);
+ if (Head != To)
+ OS << " lhead=" << getUID(To);
+ if (Hidden)
+ OS << "; splines=none";
+ OS << "]\n";
+}
+
+void VPlanPrinter::dumpEdges(const VPBlockBase *Block) {
+ auto &Successors = Block->getSuccessors();
+ if (Successors.size() == 1)
+ drawEdge(Block, Successors.front(), false, "");
+ else if (Successors.size() == 2) {
+ drawEdge(Block, Successors.front(), false, "T");
+ drawEdge(Block, Successors.back(), false, "F");
+ } else {
+ unsigned SuccessorNumber = 0;
+ for (auto *Successor : Successors)
+ drawEdge(Block, Successor, false, Twine(SuccessorNumber++));
+ }
+}
+
+void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
+ OS << Indent << getUID(BasicBlock) << " [label =\n";
+ bumpIndent(1);
+ OS << Indent << "\"" << DOT::EscapeString(BasicBlock->getName()) << ":\\n\"";
+ bumpIndent(1);
+
+ // Dump the block predicate.
+ const VPValue *Pred = BasicBlock->getPredicate();
+ if (Pred) {
OS << " +\n" << Indent << " \"BlockPredicate: \"";
- if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
- PredI->printAsOperand(OS, SlotTracker);
- OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
- << ")\\l\"";
- } else
- Pred->printAsOperand(OS, SlotTracker);
- }
-
- for (const VPRecipeBase &Recipe : *BasicBlock) {
+ if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
+ PredI->printAsOperand(OS, SlotTracker);
+ OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
+ << ")\\l\"";
+ } else
+ Pred->printAsOperand(OS, SlotTracker);
+ }
+
+ for (const VPRecipeBase &Recipe : *BasicBlock) {
OS << " +\n" << Indent << "\"";
- Recipe.print(OS, Indent, SlotTracker);
- OS << "\\l\"";
- }
-
- // Dump the condition bit.
- const VPValue *CBV = BasicBlock->getCondBit();
- if (CBV) {
- OS << " +\n" << Indent << " \"CondBit: ";
- if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
- CBI->printAsOperand(OS, SlotTracker);
- OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
- } else {
- CBV->printAsOperand(OS, SlotTracker);
- OS << "\"";
- }
- }
-
- bumpIndent(-2);
- OS << "\n" << Indent << "]\n";
- dumpEdges(BasicBlock);
-}
-
-void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
- OS << Indent << "subgraph " << getUID(Region) << " {\n";
- bumpIndent(1);
- OS << Indent << "fontname=Courier\n"
- << Indent << "label=\""
- << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
- << DOT::EscapeString(Region->getName()) << "\"\n";
- // Dump the blocks of the region.
- assert(Region->getEntry() && "Region contains no inner blocks.");
- for (const VPBlockBase *Block : depth_first(Region->getEntry()))
- dumpBlock(Block);
- bumpIndent(-1);
- OS << Indent << "}\n";
- dumpEdges(Region);
-}
-
+ Recipe.print(OS, Indent, SlotTracker);
+ OS << "\\l\"";
+ }
+
+ // Dump the condition bit.
+ const VPValue *CBV = BasicBlock->getCondBit();
+ if (CBV) {
+ OS << " +\n" << Indent << " \"CondBit: ";
+ if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
+ CBI->printAsOperand(OS, SlotTracker);
+ OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
+ } else {
+ CBV->printAsOperand(OS, SlotTracker);
+ OS << "\"";
+ }
+ }
+
+ bumpIndent(-2);
+ OS << "\n" << Indent << "]\n";
+ dumpEdges(BasicBlock);
+}
+
+void VPlanPrinter::dumpRegion(const VPRegionBlock *Region) {
+ OS << Indent << "subgraph " << getUID(Region) << " {\n";
+ bumpIndent(1);
+ OS << Indent << "fontname=Courier\n"
+ << Indent << "label=\""
+ << DOT::EscapeString(Region->isReplicator() ? "<xVFxUF> " : "<x1> ")
+ << DOT::EscapeString(Region->getName()) << "\"\n";
+ // Dump the blocks of the region.
+ assert(Region->getEntry() && "Region contains no inner blocks.");
+ for (const VPBlockBase *Block : depth_first(Region->getEntry()))
+ dumpBlock(Block);
+ bumpIndent(-1);
+ OS << Indent << "}\n";
+ dumpEdges(Region);
+}
+
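
For reference, the printer methods above emit plain "dot" text: one rectangular node per basic block, region clusters, and "T"/"F" labels on two-way edges. A standalone sketch of the general output shape for a block with two successors (block names and labels here are made up, not taken from the code above, and the layout is simplified):

#include <iostream>

int main() {
  std::cout << "digraph VPlan {\n";
  std::cout << "node [shape=rect, fontname=Courier, fontsize=30]\n";
  std::cout << "edge [fontname=Courier, fontsize=30]\n";
  std::cout << "compound=true\n";
  std::cout << " N0 [label =\n  \"for.body:\\n\"\n ]\n";
  std::cout << " N1 [label =\n  \"if.then:\\n\"\n ]\n";
  std::cout << " N2 [label =\n  \"if.else:\\n\"\n ]\n";
  // A block with exactly two successors gets "T"/"F" edge labels.
  std::cout << " N0 -> N1 [ label=\"T\"]\n";
  std::cout << " N0 -> N2 [ label=\"F\"]\n";
  std::cout << "}\n";
  return 0;
}
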
void VPlanPrinter::printAsIngredient(raw_ostream &O, const Value *V) {
- std::string IngredientString;
- raw_string_ostream RSO(IngredientString);
- if (auto *Inst = dyn_cast<Instruction>(V)) {
- if (!Inst->getType()->isVoidTy()) {
- Inst->printAsOperand(RSO, false);
- RSO << " = ";
- }
- RSO << Inst->getOpcodeName() << " ";
- unsigned E = Inst->getNumOperands();
- if (E > 0) {
- Inst->getOperand(0)->printAsOperand(RSO, false);
- for (unsigned I = 1; I < E; ++I)
- Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
- }
- } else // !Inst
- V->printAsOperand(RSO, false);
- RSO.flush();
- O << DOT::EscapeString(IngredientString);
-}
-
-void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ std::string IngredientString;
+ raw_string_ostream RSO(IngredientString);
+ if (auto *Inst = dyn_cast<Instruction>(V)) {
+ if (!Inst->getType()->isVoidTy()) {
+ Inst->printAsOperand(RSO, false);
+ RSO << " = ";
+ }
+ RSO << Inst->getOpcodeName() << " ";
+ unsigned E = Inst->getNumOperands();
+ if (E > 0) {
+ Inst->getOperand(0)->printAsOperand(RSO, false);
+ for (unsigned I = 1; I < E; ++I)
+ Inst->getOperand(I)->printAsOperand(RSO << ", ", false);
+ }
+ } else // !Inst
+ V->printAsOperand(RSO, false);
+ RSO.flush();
+ O << DOT::EscapeString(IngredientString);
+}
+
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-CALL ";
auto *CI = cast<CallInst>(getUnderlyingInstr());
@@ -863,10 +863,10 @@ void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
O << "call @" << CI->getCalledFunction()->getName() << "(";
printOperands(O, SlotTracker);
O << ")";
-}
-
-void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-SELECT ";
printAsOperand(O, SlotTracker);
O << " = select ";
@@ -876,66 +876,66 @@ void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
O << ", ";
getOperand(2)->printAsOperand(O, SlotTracker);
O << (InvariantCond ? " (condition is loop invariant)" : "");
-}
-
-void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = " << getUnderlyingInstr()->getOpcodeName() << " ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-INDUCTION";
- if (Trunc) {
- O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
- } else
- O << " " << VPlanIngredient(IV);
-}
-
-void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ if (Trunc) {
+ O << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
+ } else
+ O << " " << VPlanIngredient(IV);
+}
+
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-GEP ";
- O << (IsPtrLoopInvariant ? "Inv" : "Var");
- size_t IndicesNumber = IsIndexLoopInvariant.size();
- for (size_t I = 0; I < IndicesNumber; ++I)
- O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
+ O << (IsPtrLoopInvariant ? "Inv" : "Var");
+ size_t IndicesNumber = IsIndexLoopInvariant.size();
+ for (size_t I = 0; I < IndicesNumber; ++I)
+ O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
O << " ";
printAsOperand(O, SlotTracker);
O << " = getelementptr ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN-PHI " << VPlanIngredient(Phi);
-}
-
-void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "BLEND ";
- Phi->printAsOperand(O, false);
- O << " =";
- if (getNumIncomingValues() == 1) {
- // Not a User of any mask: not really blending, this is a
- // single-predecessor phi.
- O << " ";
- getIncomingValue(0)->printAsOperand(O, SlotTracker);
- } else {
- for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
- O << " ";
- getIncomingValue(I)->printAsOperand(O, SlotTracker);
- O << "/";
- getMask(I)->printAsOperand(O, SlotTracker);
- }
- }
-}
-
+ Phi->printAsOperand(O, false);
+ O << " =";
+ if (getNumIncomingValues() == 1) {
+ // Not a User of any mask: not really blending, this is a
+ // single-predecessor phi.
+ O << " ";
+ getIncomingValue(0)->printAsOperand(O, SlotTracker);
+ } else {
+ for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
+ O << " ";
+ getIncomingValue(I)->printAsOperand(O, SlotTracker);
+ O << "/";
+ getMask(I)->printAsOperand(O, SlotTracker);
+ }
+ }
+}
+
void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << "REDUCE ";
@@ -952,8 +952,8 @@ void VPReductionRecipe::print(raw_ostream &O, const Twine &Indent,
O << ")";
}
-void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << (IsUniform ? "CLONE " : "REPLICATE ");
if (!getUnderlyingInstr()->getType()->isVoidTy()) {
@@ -963,182 +963,182 @@ void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
O << Instruction::getOpcodeName(getUnderlyingInstr()->getOpcode()) << " ";
printOperands(O, SlotTracker);
- if (AlsoPack)
- O << " (S->V)";
-}
-
-void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ if (AlsoPack)
+ O << " (S->V)";
+}
+
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "PHI-PREDICATED-INSTRUCTION ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+}
+
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "WIDEN ";
if (!isStore()) {
getVPValue()->printAsOperand(O, SlotTracker);
O << " = ";
- }
+ }
O << Instruction::getOpcodeName(Ingredient.getOpcode()) << " ";
printOperands(O, SlotTracker);
-}
-
-void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
- Value *CanonicalIV = State.CanonicalIV;
- Type *STy = CanonicalIV->getType();
- IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+}
+
+void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
+ Value *CanonicalIV = State.CanonicalIV;
+ Type *STy = CanonicalIV->getType();
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
ElementCount VF = State.VF;
  assert(!VF.isScalable() && "the code following assumes non-scalable ECs");
Value *VStart = VF.isScalar()
- ? CanonicalIV
+ ? CanonicalIV
: Builder.CreateVectorSplat(VF.getKnownMinValue(),
CanonicalIV, "broadcast");
- for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
- SmallVector<Constant *, 8> Indices;
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
for (unsigned Lane = 0; Lane < VF.getKnownMinValue(); ++Lane)
Indices.push_back(
ConstantInt::get(STy, Part * VF.getKnownMinValue() + Lane));
- // If VF == 1, there is only one iteration in the loop above, thus the
- // element pushed back into Indices is ConstantInt::get(STy, Part)
+ // If VF == 1, there is only one iteration in the loop above, thus the
+ // element pushed back into Indices is ConstantInt::get(STy, Part)
Constant *VStep =
VF.isScalar() ? Indices.back() : ConstantVector::get(Indices);
- // Add the consecutive indices to the vector value.
- Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
- State.set(getVPValue(), CanonicalVectorIV, Part);
- }
-}
-
-void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const {
+ // Add the consecutive indices to the vector value.
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
+ State.set(getVPValue(), CanonicalVectorIV, Part);
+ }
+}
+
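
The loop above builds, for each unroll part, the constant step vector <Part*VF+0, ..., Part*VF+VF-1> and adds it to the broadcast canonical IV. A standalone sketch of the resulting lane values for an assumed VF = 4 and UF = 2, using plain integers in place of IR values:

#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 4, UF = 2;
  const long CanonicalIV = 8; // example scalar IV value at the start of this vector iteration
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::vector<long> VecIV;
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      VecIV.push_back(CanonicalIV + Part * VF + Lane); // VStart + VStep
    std::printf("vec.iv part %u:", Part);
    for (long V : VecIV)
      std::printf(" %ld", V);
    std::printf("\n"); // part 0: 8 9 10 11, part 1: 12 13 14 15
  }
  return 0;
}
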
+void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
O << "EMIT ";
- getVPValue()->printAsOperand(O, SlotTracker);
- O << " = WIDEN-CANONICAL-INDUCTION";
-}
-
-template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
-
-void VPValue::replaceAllUsesWith(VPValue *New) {
+ getVPValue()->printAsOperand(O, SlotTracker);
+ O << " = WIDEN-CANONICAL-INDUCTION";
+}
+
+template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
+
+void VPValue::replaceAllUsesWith(VPValue *New) {
for (unsigned J = 0; J < getNumUsers();) {
VPUser *User = Users[J];
unsigned NumUsers = getNumUsers();
- for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
- if (User->getOperand(I) == this)
- User->setOperand(I, New);
+ for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I)
+ if (User->getOperand(I) == this)
+ User->setOperand(I, New);
// If a user got removed after updating the current user, the next user to
// update will be moved to the current position, so we only need to
// increment the index if the number of users did not change.
if (NumUsers == getNumUsers())
J++;
}
-}
-
-void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
- if (const Value *UV = getUnderlyingValue()) {
- OS << "ir<";
- UV->printAsOperand(OS, false);
- OS << ">";
- return;
- }
-
- unsigned Slot = Tracker.getSlot(this);
- if (Slot == unsigned(-1))
- OS << "<badref>";
- else
- OS << "vp<%" << Tracker.getSlot(this) << ">";
-}
-
+}
+
+void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
+ if (const Value *UV = getUnderlyingValue()) {
+ OS << "ir<";
+ UV->printAsOperand(OS, false);
+ OS << ">";
+ return;
+ }
+
+ unsigned Slot = Tracker.getSlot(this);
+ if (Slot == unsigned(-1))
+ OS << "<badref>";
+ else
+ OS << "vp<%" << Tracker.getSlot(this) << ">";
+}
+
void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const {
interleaveComma(operands(), O, [&O, &SlotTracker](VPValue *Op) {
Op->printAsOperand(O, SlotTracker);
});
}
-void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
- Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
- for (VPBlockBase *Base : RPOT) {
- visitBlock(Base, Old2New, IAI);
- }
-}
-
-void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI) {
- if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
- for (VPRecipeBase &VPI : *VPBB) {
- assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
- auto *VPInst = cast<VPInstruction>(&VPI);
- auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
- auto *IG = IAI.getInterleaveGroup(Inst);
- if (!IG)
- continue;
-
- auto NewIGIter = Old2New.find(IG);
- if (NewIGIter == Old2New.end())
- Old2New[IG] = new InterleaveGroup<VPInstruction>(
- IG->getFactor(), IG->isReverse(), IG->getAlign());
-
- if (Inst == IG->getInsertPos())
- Old2New[IG]->setInsertPos(VPInst);
-
- InterleaveGroupMap[VPInst] = Old2New[IG];
- InterleaveGroupMap[VPInst]->insertMember(
- VPInst, IG->getIndex(Inst),
- Align(IG->isReverse() ? (-1) * int(IG->getFactor())
- : IG->getFactor()));
- }
- } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
- visitRegion(Region, Old2New, IAI);
- else
- llvm_unreachable("Unsupported kind of VPBlock.");
-}
-
-VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
- InterleavedAccessInfo &IAI) {
- Old2NewTy Old2New;
- visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
-}
-
-void VPSlotTracker::assignSlot(const VPValue *V) {
- assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
- Slots[V] = NextSlot++;
-}
-
-void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
- if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
- assignSlots(Region);
- else
- assignSlots(cast<VPBasicBlock>(VPBB));
-}
-
-void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
- ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
- for (const VPBlockBase *Block : RPOT)
- assignSlots(Block);
-}
-
-void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
- for (const VPRecipeBase &Recipe : *VPBB) {
+void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
+ Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ for (VPBlockBase *Base : RPOT) {
+ visitBlock(Base, Old2New, IAI);
+ }
+}
+
+void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI) {
+ if (VPBasicBlock *VPBB = dyn_cast<VPBasicBlock>(Block)) {
+ for (VPRecipeBase &VPI : *VPBB) {
+ assert(isa<VPInstruction>(&VPI) && "Can only handle VPInstructions");
+ auto *VPInst = cast<VPInstruction>(&VPI);
+ auto *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ auto *IG = IAI.getInterleaveGroup(Inst);
+ if (!IG)
+ continue;
+
+ auto NewIGIter = Old2New.find(IG);
+ if (NewIGIter == Old2New.end())
+ Old2New[IG] = new InterleaveGroup<VPInstruction>(
+ IG->getFactor(), IG->isReverse(), IG->getAlign());
+
+ if (Inst == IG->getInsertPos())
+ Old2New[IG]->setInsertPos(VPInst);
+
+ InterleaveGroupMap[VPInst] = Old2New[IG];
+ InterleaveGroupMap[VPInst]->insertMember(
+ VPInst, IG->getIndex(Inst),
+ Align(IG->isReverse() ? (-1) * int(IG->getFactor())
+ : IG->getFactor()));
+ }
+ } else if (VPRegionBlock *Region = dyn_cast<VPRegionBlock>(Block))
+ visitRegion(Region, Old2New, IAI);
+ else
+ llvm_unreachable("Unsupported kind of VPBlock.");
+}
+
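
visitBlock() above clones each IR interleave group lazily, the first time one of its members is seen, and re-inserts every member under its VPInstruction counterpart. A standalone sketch of that memoized clone-and-remap pattern, using toy group and member types rather than the LLVM InterleaveGroup API (all names hypothetical):

#include <cassert>
#include <initializer_list>
#include <map>
#include <vector>

struct OldMember { int Group; };            // stands in for an IR Instruction
struct NewMember { const OldMember *Old; }; // stands in for a VPInstruction
struct CloneGroup { std::vector<const NewMember *> Members; };

int main() {
  OldMember O0{0}, O1{0}, O2{1}; // two members in old group 0, one in old group 1
  NewMember N0{&O0}, N1{&O1}, N2{&O2};

  std::map<int, CloneGroup *> Old2New;              // old group id -> cloned group
  std::map<const NewMember *, CloneGroup *> GroupOf;

  for (const NewMember *N : {&N0, &N1, &N2}) {
    int OldGroup = N->Old->Group;
    if (!Old2New.count(OldGroup))        // clone the group on first use only
      Old2New[OldGroup] = new CloneGroup();
    GroupOf[N] = Old2New[OldGroup];
    Old2New[OldGroup]->Members.push_back(N); // re-insert the member under its new identity
  }

  assert(GroupOf[&N0] == GroupOf[&N1] && GroupOf[&N0] != GroupOf[&N2]);
  for (auto &KV : Old2New)
    delete KV.second;
  return 0;
}
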
+VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
+ InterleavedAccessInfo &IAI) {
+ Old2NewTy Old2New;
+ visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
+}
+
+void VPSlotTracker::assignSlot(const VPValue *V) {
+ assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
+ Slots[V] = NextSlot++;
+}
+
+void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
+ if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
+ assignSlots(Region);
+ else
+ assignSlots(cast<VPBasicBlock>(VPBB));
+}
+
+void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
+
+void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+ for (const VPRecipeBase &Recipe : *VPBB) {
for (VPValue *Def : Recipe.definedValues())
assignSlot(Def);
- }
-}
-
-void VPSlotTracker::assignSlots(const VPlan &Plan) {
-
- for (const VPValue *V : Plan.VPExternalDefs)
- assignSlot(V);
-
- for (const VPValue *V : Plan.VPCBVs)
- assignSlot(V);
-
- if (Plan.BackedgeTakenCount)
- assignSlot(Plan.BackedgeTakenCount);
-
- ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
- for (const VPBlockBase *Block : RPOT)
- assignSlots(Block);
-}
+ }
+}
+
+void VPSlotTracker::assignSlots(const VPlan &Plan) {
+
+ for (const VPValue *V : Plan.VPExternalDefs)
+ assignSlot(V);
+
+ for (const VPValue *V : Plan.VPCBVs)
+ assignSlot(V);
+
+ if (Plan.BackedgeTakenCount)
+ assignSlot(Plan.BackedgeTakenCount);
+
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
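
VPSlotTracker only matters for printing: VPValues with no underlying IR value receive a running slot number and print as vp<%N>, while values backed by IR print as ir<...>, matching printAsOperand() above. A small standalone sketch of that numbering scheme, with hypothetical toy types in place of VPValue and the tracker:

#include <iostream>
#include <map>
#include <string>
#include <vector>

struct ToyVPValue {
  std::string UnderlyingIRName; // empty if the value is VPlan-internal
};

int main() {
  ToyVPValue Ext{"%n"}, Def0{""}, Def1{""};
  std::vector<ToyVPValue *> PlanOrder{&Ext, &Def0, &Def1};

  std::map<ToyVPValue *, unsigned> Slots;
  unsigned NextSlot = 0;
  for (ToyVPValue *V : PlanOrder) // assignSlots: a single pass in plan order
    Slots[V] = NextSlot++;

  for (ToyVPValue *V : PlanOrder) {
    if (!V->UnderlyingIRName.empty())
      std::cout << "ir<" << V->UnderlyingIRName << ">\n"; // backed by IR
    else
      std::cout << "vp<%" << Slots[V] << ">\n";           // VPlan-internal
  }
  return 0;
}
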
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
index 1fd54e42f2..2cce127cd4 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlan.h
@@ -1,73 +1,73 @@
-//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// This file contains the declarations of the Vectorization Plan base classes:
-/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
-/// VPBlockBase, together implementing a Hierarchical CFG;
-/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
-/// treated as proper graphs for generic algorithms;
-/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
-/// within VPBasicBlocks;
-/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
-/// instruction;
-/// 5. The VPlan class holding a candidate for vectorization;
-/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
-/// These are documented in docs/VectorizationPlan.rst.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
-
-#include "VPlanLoopInfo.h"
-#include "VPlanValue.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/ADT/ilist.h"
-#include "llvm/ADT/ilist_node.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/IRBuilder.h"
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <map>
-#include <string>
-
-namespace llvm {
-
-class BasicBlock;
-class DominatorTree;
-class InnerLoopVectorizer;
-class LoopInfo;
-class raw_ostream;
+//===- VPlan.h - Represent A Vectorizer Plan --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file contains the declarations of the Vectorization Plan base classes:
+/// 1. VPBasicBlock and VPRegionBlock that inherit from a common pure virtual
+/// VPBlockBase, together implementing a Hierarchical CFG;
+/// 2. Specializations of GraphTraits that allow VPBlockBase graphs to be
+/// treated as proper graphs for generic algorithms;
+/// 3. Pure virtual VPRecipeBase serving as the base class for recipes contained
+/// within VPBasicBlocks;
+/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned
+/// instruction;
+/// 5. The VPlan class holding a candidate for vectorization;
+/// 6. The VPlanPrinter class providing a way to print a plan in dot format;
+/// These are documented in docs/VectorizationPlan.rst.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+#include "VPlanLoopInfo.h"
+#include "VPlanValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/IRBuilder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <map>
+#include <string>
+
+namespace llvm {
+
+class BasicBlock;
+class DominatorTree;
+class InnerLoopVectorizer;
+class LoopInfo;
+class raw_ostream;
class RecurrenceDescriptor;
-class Value;
-class VPBasicBlock;
-class VPRegionBlock;
-class VPlan;
-class VPlanSlp;
-
-/// A range of powers-of-2 vectorization factors with fixed start and
-/// adjustable end. The range includes start and excludes end, e.g.,:
-/// [1, 9) = {1, 2, 4, 8}
-struct VFRange {
- // A power of 2.
+class Value;
+class VPBasicBlock;
+class VPRegionBlock;
+class VPlan;
+class VPlanSlp;
+
+/// A range of powers-of-2 vectorization factors with fixed start and
+/// adjustable end. The range includes start and excludes end, e.g.,:
+/// [1, 9) = {1, 2, 4, 8}
+struct VFRange {
+ // A power of 2.
const ElementCount Start;
-
- // Need not be a power of 2. If End <= Start, the range is empty.
+
+ // Need not be a power of 2. If End <= Start, the range is empty.
ElementCount End;
bool isEmpty() const {
@@ -81,221 +81,221 @@ struct VFRange {
assert(isPowerOf2_32(Start.getKnownMinValue()) &&
"Expected Start to be a power of 2");
}
-};
-
-using VPlanPtr = std::unique_ptr<VPlan>;
-
-/// In what follows, the term "input IR" refers to code that is fed into the
-/// vectorizer whereas the term "output IR" refers to code that is generated by
-/// the vectorizer.
-
-/// VPIteration represents a single point in the iteration space of the output
-/// (vectorized and/or unrolled) IR loop.
-struct VPIteration {
- /// in [0..UF)
- unsigned Part;
-
- /// in [0..VF)
- unsigned Lane;
-};
-
-/// This is a helper struct for maintaining vectorization state. It's used for
-/// mapping values from the original loop to their corresponding values in
-/// the new loop. Two mappings are maintained: one for vectorized values and
-/// one for scalarized values. Vectorized values are represented with UF
-/// vector values in the new loop, and scalarized values are represented with
-/// UF x VF scalar values in the new loop. UF and VF are the unroll and
-/// vectorization factors, respectively.
-///
-/// Entries can be added to either map with setVectorValue and setScalarValue,
-/// which assert that an entry was not already added before. If an entry is to
-/// replace an existing one, call resetVectorValue and resetScalarValue. This is
-/// currently needed to modify the mapped values during "fix-up" operations that
-/// occur once the first phase of widening is complete. These operations include
-/// type truncation and the second phase of recurrence widening.
-///
-/// Entries from either map can be retrieved using the getVectorValue and
-/// getScalarValue functions, which assert that the desired value exists.
-struct VectorizerValueMap {
- friend struct VPTransformState;
-
-private:
- /// The unroll factor. Each entry in the vector map contains UF vector values.
- unsigned UF;
-
- /// The vectorization factor. Each entry in the scalar map contains UF x VF
- /// scalar values.
+};
+
+using VPlanPtr = std::unique_ptr<VPlan>;
+
+/// In what follows, the term "input IR" refers to code that is fed into the
+/// vectorizer whereas the term "output IR" refers to code that is generated by
+/// the vectorizer.
+
+/// VPIteration represents a single point in the iteration space of the output
+/// (vectorized and/or unrolled) IR loop.
+struct VPIteration {
+ /// in [0..UF)
+ unsigned Part;
+
+ /// in [0..VF)
+ unsigned Lane;
+};
+
+/// This is a helper struct for maintaining vectorization state. It's used for
+/// mapping values from the original loop to their corresponding values in
+/// the new loop. Two mappings are maintained: one for vectorized values and
+/// one for scalarized values. Vectorized values are represented with UF
+/// vector values in the new loop, and scalarized values are represented with
+/// UF x VF scalar values in the new loop. UF and VF are the unroll and
+/// vectorization factors, respectively.
+///
+/// Entries can be added to either map with setVectorValue and setScalarValue,
+/// which assert that an entry was not already added before. If an entry is to
+/// replace an existing one, call resetVectorValue and resetScalarValue. This is
+/// currently needed to modify the mapped values during "fix-up" operations that
+/// occur once the first phase of widening is complete. These operations include
+/// type truncation and the second phase of recurrence widening.
+///
+/// Entries from either map can be retrieved using the getVectorValue and
+/// getScalarValue functions, which assert that the desired value exists.
+struct VectorizerValueMap {
+ friend struct VPTransformState;
+
+private:
+ /// The unroll factor. Each entry in the vector map contains UF vector values.
+ unsigned UF;
+
+ /// The vectorization factor. Each entry in the scalar map contains UF x VF
+ /// scalar values.
ElementCount VF;
-
- /// The vector and scalar map storage. We use std::map and not DenseMap
- /// because insertions to DenseMap invalidate its iterators.
- using VectorParts = SmallVector<Value *, 2>;
- using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
- std::map<Value *, VectorParts> VectorMapStorage;
- std::map<Value *, ScalarParts> ScalarMapStorage;
-
-public:
- /// Construct an empty map with the given unroll and vectorization factors.
+
+ /// The vector and scalar map storage. We use std::map and not DenseMap
+ /// because insertions to DenseMap invalidate its iterators.
+ using VectorParts = SmallVector<Value *, 2>;
+ using ScalarParts = SmallVector<SmallVector<Value *, 4>, 2>;
+ std::map<Value *, VectorParts> VectorMapStorage;
+ std::map<Value *, ScalarParts> ScalarMapStorage;
+
+public:
+ /// Construct an empty map with the given unroll and vectorization factors.
VectorizerValueMap(unsigned UF, ElementCount VF) : UF(UF), VF(VF) {}
-
- /// \return True if the map has any vector entry for \p Key.
- bool hasAnyVectorValue(Value *Key) const {
- return VectorMapStorage.count(Key);
- }
-
- /// \return True if the map has a vector entry for \p Key and \p Part.
- bool hasVectorValue(Value *Key, unsigned Part) const {
- assert(Part < UF && "Queried Vector Part is too large.");
- if (!hasAnyVectorValue(Key))
- return false;
- const VectorParts &Entry = VectorMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
- return Entry[Part] != nullptr;
- }
-
- /// \return True if the map has any scalar entry for \p Key.
- bool hasAnyScalarValue(Value *Key) const {
- return ScalarMapStorage.count(Key);
- }
-
- /// \return True if the map has a scalar entry for \p Key and \p Instance.
- bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
- assert(Instance.Part < UF && "Queried Scalar Part is too large.");
+
+ /// \return True if the map has any vector entry for \p Key.
+ bool hasAnyVectorValue(Value *Key) const {
+ return VectorMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a vector entry for \p Key and \p Part.
+ bool hasVectorValue(Value *Key, unsigned Part) const {
+ assert(Part < UF && "Queried Vector Part is too large.");
+ if (!hasAnyVectorValue(Key))
+ return false;
+ const VectorParts &Entry = VectorMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "VectorParts has wrong dimensions.");
+ return Entry[Part] != nullptr;
+ }
+
+ /// \return True if the map has any scalar entry for \p Key.
+ bool hasAnyScalarValue(Value *Key) const {
+ return ScalarMapStorage.count(Key);
+ }
+
+ /// \return True if the map has a scalar entry for \p Key and \p Instance.
+ bool hasScalarValue(Value *Key, const VPIteration &Instance) const {
+ assert(Instance.Part < UF && "Queried Scalar Part is too large.");
assert(Instance.Lane < VF.getKnownMinValue() &&
"Queried Scalar Lane is too large.");
- if (!hasAnyScalarValue(Key))
- return false;
- const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
- assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
+ if (!hasAnyScalarValue(Key))
+ return false;
+ const ScalarParts &Entry = ScalarMapStorage.find(Key)->second;
+ assert(Entry.size() == UF && "ScalarParts has wrong dimensions.");
assert(Entry[Instance.Part].size() == VF.getKnownMinValue() &&
- "ScalarParts has wrong dimensions.");
- return Entry[Instance.Part][Instance.Lane] != nullptr;
- }
-
- /// Retrieve the existing vector value that corresponds to \p Key and
- /// \p Part.
- Value *getVectorValue(Value *Key, unsigned Part) {
- assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
- return VectorMapStorage[Key][Part];
- }
-
- /// Retrieve the existing scalar value that corresponds to \p Key and
- /// \p Instance.
- Value *getScalarValue(Value *Key, const VPIteration &Instance) {
- assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
- return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
- }
-
- /// Set a vector value associated with \p Key and \p Part. Assumes such a
- /// value is not already set. If it is, use resetVectorValue() instead.
- void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
- if (!VectorMapStorage.count(Key)) {
- VectorParts Entry(UF);
- VectorMapStorage[Key] = Entry;
- }
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
- /// value is not already set.
- void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
- assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
- if (!ScalarMapStorage.count(Key)) {
- ScalarParts Entry(UF);
- // TODO: Consider storing uniform values only per-part, as they occupy
- // lane 0 only, keeping the other VF-1 redundant entries null.
- for (unsigned Part = 0; Part < UF; ++Part)
+ "ScalarParts has wrong dimensions.");
+ return Entry[Instance.Part][Instance.Lane] != nullptr;
+ }
+
+ /// Retrieve the existing vector value that corresponds to \p Key and
+ /// \p Part.
+ Value *getVectorValue(Value *Key, unsigned Part) {
+ assert(hasVectorValue(Key, Part) && "Getting non-existent value.");
+ return VectorMapStorage[Key][Part];
+ }
+
+ /// Retrieve the existing scalar value that corresponds to \p Key and
+ /// \p Instance.
+ Value *getScalarValue(Value *Key, const VPIteration &Instance) {
+ assert(hasScalarValue(Key, Instance) && "Getting non-existent value.");
+ return ScalarMapStorage[Key][Instance.Part][Instance.Lane];
+ }
+
+ /// Set a vector value associated with \p Key and \p Part. Assumes such a
+ /// value is not already set. If it is, use resetVectorValue() instead.
+ void setVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(!hasVectorValue(Key, Part) && "Vector value already set for part");
+ if (!VectorMapStorage.count(Key)) {
+ VectorParts Entry(UF);
+ VectorMapStorage[Key] = Entry;
+ }
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Set a scalar value associated with \p Key and \p Instance. Assumes such a
+ /// value is not already set.
+ void setScalarValue(Value *Key, const VPIteration &Instance, Value *Scalar) {
+ assert(!hasScalarValue(Key, Instance) && "Scalar value already set");
+ if (!ScalarMapStorage.count(Key)) {
+ ScalarParts Entry(UF);
+ // TODO: Consider storing uniform values only per-part, as they occupy
+ // lane 0 only, keeping the other VF-1 redundant entries null.
+ for (unsigned Part = 0; Part < UF; ++Part)
Entry[Part].resize(VF.getKnownMinValue(), nullptr);
- ScalarMapStorage[Key] = Entry;
- }
- ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
- }
-
- /// Reset the vector value associated with \p Key for the given \p Part.
- /// This function can be used to update values that have already been
- /// vectorized. This is the case for "fix-up" operations including type
- /// truncation and the second phase of recurrence vectorization.
- void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
- assert(hasVectorValue(Key, Part) && "Vector value not set for part");
- VectorMapStorage[Key][Part] = Vector;
- }
-
- /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
- /// This function can be used to update values that have already been
- /// scalarized. This is the case for "fix-up" operations including scalar phi
- /// nodes for scalarized and predicated instructions.
- void resetScalarValue(Value *Key, const VPIteration &Instance,
- Value *Scalar) {
- assert(hasScalarValue(Key, Instance) &&
- "Scalar value not set for part and lane");
- ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
- }
-};
-
-/// This class is used to enable the VPlan to invoke a method of ILV. This is
-/// needed until the method is refactored out of ILV and becomes reusable.
-struct VPCallback {
- virtual ~VPCallback() {}
- virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
- virtual Value *getOrCreateScalarValue(Value *V,
- const VPIteration &Instance) = 0;
-};
-
-/// VPTransformState holds information passed down when "executing" a VPlan,
-/// needed for generating the output IR.
-struct VPTransformState {
+ ScalarMapStorage[Key] = Entry;
+ }
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+
+ /// Reset the vector value associated with \p Key for the given \p Part.
+ /// This function can be used to update values that have already been
+ /// vectorized. This is the case for "fix-up" operations including type
+ /// truncation and the second phase of recurrence vectorization.
+ void resetVectorValue(Value *Key, unsigned Part, Value *Vector) {
+ assert(hasVectorValue(Key, Part) && "Vector value not set for part");
+ VectorMapStorage[Key][Part] = Vector;
+ }
+
+ /// Reset the scalar value associated with \p Key for \p Part and \p Lane.
+ /// This function can be used to update values that have already been
+ /// scalarized. This is the case for "fix-up" operations including scalar phi
+ /// nodes for scalarized and predicated instructions.
+ void resetScalarValue(Value *Key, const VPIteration &Instance,
+ Value *Scalar) {
+ assert(hasScalarValue(Key, Instance) &&
+ "Scalar value not set for part and lane");
+ ScalarMapStorage[Key][Instance.Part][Instance.Lane] = Scalar;
+ }
+};
+
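
As a concrete reading of the comment above: per original Value, the vector map holds UF entries (one per unroll part) and the scalar map holds UF x VF entries (one per part and lane). A standalone sketch of that layout, with illustrative sizes and strings standing in for IR values:

#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  const unsigned UF = 2, VF = 4;
  using VectorParts = std::vector<std::string>;              // UF entries
  using ScalarParts = std::vector<std::vector<std::string>>; // UF x VF entries

  std::map<std::string, VectorParts> VectorMap;
  std::map<std::string, ScalarParts> ScalarMap;

  // setVectorValue("%a", Part, ...): one widened value per unroll part.
  VectorMap["%a"] = VectorParts(UF);
  VectorMap["%a"][0] = "%a.vec.part0";
  VectorMap["%a"][1] = "%a.vec.part1";

  // setScalarValue("%a", {Part, Lane}, ...): one scalar value per (part, lane).
  ScalarMap["%a"] = ScalarParts(UF, std::vector<std::string>(VF));
  ScalarMap["%a"][1][2] = "%a.part1.lane2";

  assert(VectorMap["%a"].size() == UF);
  assert(ScalarMap["%a"].size() == UF && ScalarMap["%a"][0].size() == VF);
  assert(ScalarMap["%a"][1][2] == "%a.part1.lane2");
  return 0;
}
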
+/// This class is used to enable the VPlan to invoke a method of ILV. This is
+/// needed until the method is refactored out of ILV and becomes reusable.
+struct VPCallback {
+ virtual ~VPCallback() {}
+ virtual Value *getOrCreateVectorValues(Value *V, unsigned Part) = 0;
+ virtual Value *getOrCreateScalarValue(Value *V,
+ const VPIteration &Instance) = 0;
+};
+
+/// VPTransformState holds information passed down when "executing" a VPlan,
+/// needed for generating the output IR.
+struct VPTransformState {
VPTransformState(ElementCount VF, unsigned UF, Loop *OrigLoop, LoopInfo *LI,
DominatorTree *DT, IRBuilder<> &Builder,
VectorizerValueMap &ValueMap, InnerLoopVectorizer *ILV,
VPCallback &Callback)
: VF(VF), UF(UF), Instance(), OrigLoop(OrigLoop), LI(LI), DT(DT),
Builder(Builder), ValueMap(ValueMap), ILV(ILV), Callback(Callback) {}
-
- /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
+
+ /// The chosen Vectorization and Unroll Factors of the loop being vectorized.
ElementCount VF;
- unsigned UF;
-
- /// Hold the indices to generate specific scalar instructions. Null indicates
- /// that all instances are to be generated, using either scalar or vector
- /// instructions.
- Optional<VPIteration> Instance;
-
- struct DataState {
- /// A type for vectorized values in the new loop. Each value from the
- /// original loop, when vectorized, is represented by UF vector values in
- /// the new unrolled loop, where UF is the unroll factor.
- typedef SmallVector<Value *, 2> PerPartValuesTy;
-
- DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
+ unsigned UF;
+
+ /// Hold the indices to generate specific scalar instructions. Null indicates
+ /// that all instances are to be generated, using either scalar or vector
+ /// instructions.
+ Optional<VPIteration> Instance;
+
+ struct DataState {
+ /// A type for vectorized values in the new loop. Each value from the
+ /// original loop, when vectorized, is represented by UF vector values in
+ /// the new unrolled loop, where UF is the unroll factor.
+ typedef SmallVector<Value *, 2> PerPartValuesTy;
+
+ DenseMap<VPValue *, PerPartValuesTy> PerPartOutput;
using ScalarsPerPartValuesTy = SmallVector<SmallVector<Value *, 4>, 2>;
DenseMap<VPValue *, ScalarsPerPartValuesTy> PerPartScalars;
- } Data;
-
- /// Get the generated Value for a given VPValue and a given Part. Note that
- /// as some Defs are still created by ILV and managed in its ValueMap, this
- /// method will delegate the call to ILV in such cases in order to provide
- /// callers a consistent API.
- /// \see set.
- Value *get(VPValue *Def, unsigned Part) {
- // If Values have been set for this Def return the one relevant for \p Part.
- if (Data.PerPartOutput.count(Def))
- return Data.PerPartOutput[Def][Part];
- // Def is managed by ILV: bring the Values from ValueMap.
- return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
- }
-
- /// Get the generated Value for a given VPValue and given Part and Lane.
+ } Data;
+
+ /// Get the generated Value for a given VPValue and a given Part. Note that
+ /// as some Defs are still created by ILV and managed in its ValueMap, this
+ /// method will delegate the call to ILV in such cases in order to provide
+ /// callers a consistent API.
+ /// \see set.
+ Value *get(VPValue *Def, unsigned Part) {
+ // If Values have been set for this Def return the one relevant for \p Part.
+ if (Data.PerPartOutput.count(Def))
+ return Data.PerPartOutput[Def][Part];
+ // Def is managed by ILV: bring the Values from ValueMap.
+ return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
+ }
+
+ /// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(VPValue *Def, const VPIteration &Instance);
-
+
bool hasVectorValue(VPValue *Def, unsigned Part) {
auto I = Data.PerPartOutput.find(Def);
return I != Data.PerPartOutput.end() && Part < I->second.size() &&
I->second[Part];
- }
-
+ }
+
bool hasScalarValue(VPValue *Def, VPIteration Instance) {
auto I = Data.PerPartScalars.find(Def);
if (I == Data.PerPartScalars.end())
@@ -305,16 +305,16 @@ struct VPTransformState {
I->second[Instance.Part][Instance.Lane];
}
- /// Set the generated Value for a given VPValue and a given Part.
- void set(VPValue *Def, Value *V, unsigned Part) {
- if (!Data.PerPartOutput.count(Def)) {
- DataState::PerPartValuesTy Entry(UF);
- Data.PerPartOutput[Def] = Entry;
- }
- Data.PerPartOutput[Def][Part] = V;
- }
+ /// Set the generated Value for a given VPValue and a given Part.
+ void set(VPValue *Def, Value *V, unsigned Part) {
+ if (!Data.PerPartOutput.count(Def)) {
+ DataState::PerPartValuesTy Entry(UF);
+ Data.PerPartOutput[Def] = Entry;
+ }
+ Data.PerPartOutput[Def][Part] = V;
+ }
void set(VPValue *Def, Value *IRDef, Value *V, unsigned Part);
-
+
void set(VPValue *Def, Value *V, const VPIteration &Instance) {
auto Iter = Data.PerPartScalars.insert({Def, {}});
auto &PerPartVec = Iter.first->second;
@@ -326,364 +326,364 @@ struct VPTransformState {
Scalars[Instance.Lane] = V;
}
- /// Hold state information used when constructing the CFG of the output IR,
- /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
- struct CFGState {
- /// The previous VPBasicBlock visited. Initially set to null.
- VPBasicBlock *PrevVPBB = nullptr;
-
- /// The previous IR BasicBlock created or used. Initially set to the new
- /// header BasicBlock.
- BasicBlock *PrevBB = nullptr;
-
- /// The last IR BasicBlock in the output IR. Set to the new latch
- /// BasicBlock, used for placing the newly created BasicBlocks.
- BasicBlock *LastBB = nullptr;
-
- /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
- /// of replication, maps the BasicBlock of the last replica created.
- SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
-
- /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
- /// up at the end of vector code generation.
- SmallVector<VPBasicBlock *, 8> VPBBsToFix;
-
- CFGState() = default;
- } CFG;
-
+ /// Hold state information used when constructing the CFG of the output IR,
+ /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks.
+ struct CFGState {
+ /// The previous VPBasicBlock visited. Initially set to null.
+ VPBasicBlock *PrevVPBB = nullptr;
+
+ /// The previous IR BasicBlock created or used. Initially set to the new
+ /// header BasicBlock.
+ BasicBlock *PrevBB = nullptr;
+
+ /// The last IR BasicBlock in the output IR. Set to the new latch
+ /// BasicBlock, used for placing the newly created BasicBlocks.
+ BasicBlock *LastBB = nullptr;
+
+ /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case
+ /// of replication, maps the BasicBlock of the last replica created.
+ SmallDenseMap<VPBasicBlock *, BasicBlock *> VPBB2IRBB;
+
+ /// Vector of VPBasicBlocks whose terminator instruction needs to be fixed
+ /// up at the end of vector code generation.
+ SmallVector<VPBasicBlock *, 8> VPBBsToFix;
+
+ CFGState() = default;
+ } CFG;
+
/// Hold a pointer to the original loop.
Loop *OrigLoop;
- /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
- LoopInfo *LI;
-
- /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
- DominatorTree *DT;
-
- /// Hold a reference to the IRBuilder used to generate output IR code.
- IRBuilder<> &Builder;
-
- /// Hold a reference to the Value state information used when generating the
- /// Values of the output IR.
- VectorizerValueMap &ValueMap;
-
- /// Hold a reference to a mapping between VPValues in VPlan and original
- /// Values they correspond to.
- VPValue2ValueTy VPValue2Value;
-
- /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
- Value *CanonicalIV = nullptr;
-
- /// Hold the trip count of the scalar loop.
- Value *TripCount = nullptr;
-
- /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
- InnerLoopVectorizer *ILV;
-
- VPCallback &Callback;
-};
-
-/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
-/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
-class VPBlockBase {
- friend class VPBlockUtils;
-
- const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
-
- /// An optional name for the block.
- std::string Name;
-
- /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
- /// it is a topmost VPBlockBase.
- VPRegionBlock *Parent = nullptr;
-
- /// List of predecessor blocks.
- SmallVector<VPBlockBase *, 1> Predecessors;
-
- /// List of successor blocks.
- SmallVector<VPBlockBase *, 1> Successors;
-
- /// Successor selector, null for zero or single successor blocks.
- VPValue *CondBit = nullptr;
-
- /// Current block predicate - null if the block does not need a predicate.
- VPValue *Predicate = nullptr;
-
- /// VPlan containing the block. Can only be set on the entry block of the
- /// plan.
- VPlan *Plan = nullptr;
-
- /// Add \p Successor as the last successor to this block.
- void appendSuccessor(VPBlockBase *Successor) {
- assert(Successor && "Cannot add nullptr successor!");
- Successors.push_back(Successor);
- }
-
- /// Add \p Predecessor as the last predecessor to this block.
- void appendPredecessor(VPBlockBase *Predecessor) {
- assert(Predecessor && "Cannot add nullptr predecessor!");
- Predecessors.push_back(Predecessor);
- }
-
- /// Remove \p Predecessor from the predecessors of this block.
- void removePredecessor(VPBlockBase *Predecessor) {
+ /// Hold a pointer to LoopInfo to register new basic blocks in the loop.
+ LoopInfo *LI;
+
+ /// Hold a pointer to Dominator Tree to register new basic blocks in the loop.
+ DominatorTree *DT;
+
+ /// Hold a reference to the IRBuilder used to generate output IR code.
+ IRBuilder<> &Builder;
+
+ /// Hold a reference to the Value state information used when generating the
+ /// Values of the output IR.
+ VectorizerValueMap &ValueMap;
+
+ /// Hold a reference to a mapping between VPValues in VPlan and original
+ /// Values they correspond to.
+ VPValue2ValueTy VPValue2Value;
+
+ /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
+ Value *CanonicalIV = nullptr;
+
+ /// Hold the trip count of the scalar loop.
+ Value *TripCount = nullptr;
+
+ /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods.
+ InnerLoopVectorizer *ILV;
+
+ VPCallback &Callback;
+};
+
+/// VPBlockBase is the building block of the Hierarchical Control-Flow Graph.
+/// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock.
+class VPBlockBase {
+ friend class VPBlockUtils;
+
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ /// An optional name for the block.
+ std::string Name;
+
+ /// The immediate VPRegionBlock which this VPBlockBase belongs to, or null if
+ /// it is a topmost VPBlockBase.
+ VPRegionBlock *Parent = nullptr;
+
+ /// List of predecessor blocks.
+ SmallVector<VPBlockBase *, 1> Predecessors;
+
+ /// List of successor blocks.
+ SmallVector<VPBlockBase *, 1> Successors;
+
+ /// Successor selector, null for zero or single successor blocks.
+ VPValue *CondBit = nullptr;
+
+ /// Current block predicate - null if the block does not need a predicate.
+ VPValue *Predicate = nullptr;
+
+ /// VPlan containing the block. Can only be set on the entry block of the
+ /// plan.
+ VPlan *Plan = nullptr;
+
+ /// Add \p Successor as the last successor to this block.
+ void appendSuccessor(VPBlockBase *Successor) {
+ assert(Successor && "Cannot add nullptr successor!");
+ Successors.push_back(Successor);
+ }
+
+ /// Add \p Predecessor as the last predecessor to this block.
+ void appendPredecessor(VPBlockBase *Predecessor) {
+ assert(Predecessor && "Cannot add nullptr predecessor!");
+ Predecessors.push_back(Predecessor);
+ }
+
+ /// Remove \p Predecessor from the predecessors of this block.
+ void removePredecessor(VPBlockBase *Predecessor) {
auto Pos = find(Predecessors, Predecessor);
- assert(Pos && "Predecessor does not exist");
- Predecessors.erase(Pos);
- }
-
- /// Remove \p Successor from the successors of this block.
- void removeSuccessor(VPBlockBase *Successor) {
+ assert(Pos && "Predecessor does not exist");
+ Predecessors.erase(Pos);
+ }
+
+ /// Remove \p Successor from the successors of this block.
+ void removeSuccessor(VPBlockBase *Successor) {
auto Pos = find(Successors, Successor);
- assert(Pos && "Successor does not exist");
- Successors.erase(Pos);
- }
-
-protected:
- VPBlockBase(const unsigned char SC, const std::string &N)
- : SubclassID(SC), Name(N) {}
-
-public:
- /// An enumeration for keeping track of the concrete subclasses of VPBlockBase
- /// that are actually instantiated. Values of this enumeration are kept in the
- /// SubclassID field of the VPBlockBase objects. They are used for concrete
- /// type identification.
- using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
-
- using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
-
- virtual ~VPBlockBase() = default;
-
- const std::string &getName() const { return Name; }
-
- void setName(const Twine &newName) { Name = newName.str(); }
-
- /// \return an ID for the concrete type of this object.
- /// This is used to implement the classof checks. This should not be used
- /// for any other purpose, as the values may change as LLVM evolves.
- unsigned getVPBlockID() const { return SubclassID; }
-
- VPRegionBlock *getParent() { return Parent; }
- const VPRegionBlock *getParent() const { return Parent; }
-
- /// \return A pointer to the plan containing the current block.
- VPlan *getPlan();
- const VPlan *getPlan() const;
-
- /// Sets the pointer of the plan containing the block. The block must be the
- /// entry block into the VPlan.
- void setPlan(VPlan *ParentPlan);
-
- void setParent(VPRegionBlock *P) { Parent = P; }
-
- /// \return the VPBasicBlock that is the entry of this VPBlockBase,
- /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
- /// VPBlockBase is a VPBasicBlock, it is returned.
- const VPBasicBlock *getEntryBasicBlock() const;
- VPBasicBlock *getEntryBasicBlock();
-
- /// \return the VPBasicBlock that is the exit of this VPBlockBase,
- /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
- /// VPBlockBase is a VPBasicBlock, it is returned.
- const VPBasicBlock *getExitBasicBlock() const;
- VPBasicBlock *getExitBasicBlock();
-
- const VPBlocksTy &getSuccessors() const { return Successors; }
- VPBlocksTy &getSuccessors() { return Successors; }
-
- const VPBlocksTy &getPredecessors() const { return Predecessors; }
- VPBlocksTy &getPredecessors() { return Predecessors; }
-
- /// \return the successor of this VPBlockBase if it has a single successor.
- /// Otherwise return a null pointer.
- VPBlockBase *getSingleSuccessor() const {
- return (Successors.size() == 1 ? *Successors.begin() : nullptr);
- }
-
- /// \return the predecessor of this VPBlockBase if it has a single
- /// predecessor. Otherwise return a null pointer.
- VPBlockBase *getSinglePredecessor() const {
- return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
- }
-
- size_t getNumSuccessors() const { return Successors.size(); }
- size_t getNumPredecessors() const { return Predecessors.size(); }
-
- /// An Enclosing Block of a block B is any block containing B, including B
- /// itself. \return the closest enclosing block starting from "this", which
- /// has successors. \return the root enclosing block if all enclosing blocks
- /// have no successors.
- VPBlockBase *getEnclosingBlockWithSuccessors();
-
- /// \return the closest enclosing block starting from "this", which has
- /// predecessors. \return the root enclosing block if all enclosing blocks
- /// have no predecessors.
- VPBlockBase *getEnclosingBlockWithPredecessors();
-
- /// \return the successors either attached directly to this VPBlockBase or, if
- /// this VPBlockBase is the exit block of a VPRegionBlock and has no
- /// successors of its own, search recursively for the first enclosing
- /// VPRegionBlock that has successors and return them. If no such
- /// VPRegionBlock exists, return the (empty) successors of the topmost
- /// VPBlockBase reached.
- const VPBlocksTy &getHierarchicalSuccessors() {
- return getEnclosingBlockWithSuccessors()->getSuccessors();
- }
-
- /// \return the hierarchical successor of this VPBlockBase if it has a single
- /// hierarchical successor. Otherwise return a null pointer.
- VPBlockBase *getSingleHierarchicalSuccessor() {
- return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
- }
-
- /// \return the predecessors either attached directly to this VPBlockBase or,
- /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
- /// predecessors of its own, search recursively for the first enclosing
- /// VPRegionBlock that has predecessors and return them. If no such
- /// VPRegionBlock exists, return the (empty) predecessors of the topmost
- /// VPBlockBase reached.
- const VPBlocksTy &getHierarchicalPredecessors() {
- return getEnclosingBlockWithPredecessors()->getPredecessors();
- }
-
- /// \return the hierarchical predecessor of this VPBlockBase if it has a
- /// single hierarchical predecessor. Otherwise return a null pointer.
- VPBlockBase *getSingleHierarchicalPredecessor() {
- return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
- }
-
- /// \return the condition bit selecting the successor.
- VPValue *getCondBit() { return CondBit; }
-
- const VPValue *getCondBit() const { return CondBit; }
-
- void setCondBit(VPValue *CV) { CondBit = CV; }
-
- VPValue *getPredicate() { return Predicate; }
-
- const VPValue *getPredicate() const { return Predicate; }
-
- void setPredicate(VPValue *Pred) { Predicate = Pred; }
-
- /// Set a given VPBlockBase \p Successor as the single successor of this
- /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
- /// This VPBlockBase must have no successors.
- void setOneSuccessor(VPBlockBase *Successor) {
- assert(Successors.empty() && "Setting one successor when others exist.");
- appendSuccessor(Successor);
- }
-
- /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
- /// successors of this VPBlockBase. \p Condition is set as the successor
- /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
- /// IfFalse. This VPBlockBase must have no successors.
- void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition) {
- assert(Successors.empty() && "Setting two successors when others exist.");
- assert(Condition && "Setting two successors without condition!");
- CondBit = Condition;
- appendSuccessor(IfTrue);
- appendSuccessor(IfFalse);
- }
-
- /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
- /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
- /// as successor of any VPBasicBlock in \p NewPreds.
- void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
- assert(Predecessors.empty() && "Block predecessors already set.");
- for (auto *Pred : NewPreds)
- appendPredecessor(Pred);
- }
-
- /// Remove all the predecessors of this block.
- void clearPredecessors() { Predecessors.clear(); }
-
- /// Remove all the successors of this block and set its condition bit to null.
- void clearSuccessors() {
- Successors.clear();
- CondBit = nullptr;
- }
-
- /// The method which generates the output IR that corresponds to this
- /// VPBlockBase, thereby "executing" the VPlan.
- virtual void execute(struct VPTransformState *State) = 0;
-
- /// Delete all blocks reachable from a given VPBlockBase, inclusive.
- static void deleteCFG(VPBlockBase *Entry);
-
- void printAsOperand(raw_ostream &OS, bool PrintType) const {
- OS << getName();
- }
-
- void print(raw_ostream &OS) const {
- // TODO: Only printing VPBB name for now since we only have dot printing
- // support for VPInstructions/Recipes.
- printAsOperand(OS, false);
- }
-
- /// Return true if it is legal to hoist instructions into this block.
- bool isLegalToHoistInto() {
- // There are currently no constraints that prevent an instruction from being
- // hoisted into a VPBlockBase.
- return true;
- }
+ assert(Pos && "Successor does not exist");
+ Successors.erase(Pos);
+ }
+
+protected:
+ VPBlockBase(const unsigned char SC, const std::string &N)
+ : SubclassID(SC), Name(N) {}
+
+public:
+ /// An enumeration for keeping track of the concrete subclasses of VPBlockBase
+ /// that are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPBlockBase objects. They are used for concrete
+ /// type identification.
+ using VPBlockTy = enum { VPBasicBlockSC, VPRegionBlockSC };
+
+ using VPBlocksTy = SmallVectorImpl<VPBlockBase *>;
+
+ virtual ~VPBlockBase() = default;
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPBlockID() const { return SubclassID; }
+
+ VPRegionBlock *getParent() { return Parent; }
+ const VPRegionBlock *getParent() const { return Parent; }
+
+ /// \return A pointer to the plan containing the current block.
+ VPlan *getPlan();
+ const VPlan *getPlan() const;
+
+ /// Sets the pointer of the plan containing the block. The block must be the
+ /// entry block into the VPlan.
+ void setPlan(VPlan *ParentPlan);
+
+ void setParent(VPRegionBlock *P) { Parent = P; }
+
+ /// \return the VPBasicBlock that is the entry of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getEntryBasicBlock() const;
+ VPBasicBlock *getEntryBasicBlock();
+
+ /// \return the VPBasicBlock that is the exit of this VPBlockBase,
+ /// recursively, if the latter is a VPRegionBlock. Otherwise, if this
+ /// VPBlockBase is a VPBasicBlock, it is returned.
+ const VPBasicBlock *getExitBasicBlock() const;
+ VPBasicBlock *getExitBasicBlock();
+
+ const VPBlocksTy &getSuccessors() const { return Successors; }
+ VPBlocksTy &getSuccessors() { return Successors; }
+
+ const VPBlocksTy &getPredecessors() const { return Predecessors; }
+ VPBlocksTy &getPredecessors() { return Predecessors; }
+
+ /// \return the successor of this VPBlockBase if it has a single successor.
+ /// Otherwise return a null pointer.
+ VPBlockBase *getSingleSuccessor() const {
+ return (Successors.size() == 1 ? *Successors.begin() : nullptr);
+ }
+
+ /// \return the predecessor of this VPBlockBase if it has a single
+ /// predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSinglePredecessor() const {
+ return (Predecessors.size() == 1 ? *Predecessors.begin() : nullptr);
+ }
+
+ size_t getNumSuccessors() const { return Successors.size(); }
+ size_t getNumPredecessors() const { return Predecessors.size(); }
+
+ /// An Enclosing Block of a block B is any block containing B, including B
+ /// itself. \return the closest enclosing block starting from "this", which
+ /// has successors. \return the root enclosing block if all enclosing blocks
+ /// have no successors.
+ VPBlockBase *getEnclosingBlockWithSuccessors();
+
+ /// \return the closest enclosing block starting from "this", which has
+ /// predecessors. \return the root enclosing block if all enclosing blocks
+ /// have no predecessors.
+ VPBlockBase *getEnclosingBlockWithPredecessors();
+
+ /// \return the successors either attached directly to this VPBlockBase or, if
+ /// this VPBlockBase is the exit block of a VPRegionBlock and has no
+ /// successors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has successors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) successors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalSuccessors() {
+ return getEnclosingBlockWithSuccessors()->getSuccessors();
+ }
+
+ /// \return the hierarchical successor of this VPBlockBase if it has a single
+ /// hierarchical successor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalSuccessor() {
+ return getEnclosingBlockWithSuccessors()->getSingleSuccessor();
+ }
+
+ /// \return the predecessors either attached directly to this VPBlockBase or,
+ /// if this VPBlockBase is the entry block of a VPRegionBlock and has no
+ /// predecessors of its own, search recursively for the first enclosing
+ /// VPRegionBlock that has predecessors and return them. If no such
+ /// VPRegionBlock exists, return the (empty) predecessors of the topmost
+ /// VPBlockBase reached.
+ const VPBlocksTy &getHierarchicalPredecessors() {
+ return getEnclosingBlockWithPredecessors()->getPredecessors();
+ }
+
+ /// \return the hierarchical predecessor of this VPBlockBase if it has a
+ /// single hierarchical predecessor. Otherwise return a null pointer.
+ VPBlockBase *getSingleHierarchicalPredecessor() {
+ return getEnclosingBlockWithPredecessors()->getSinglePredecessor();
+ }
+
+ /// \return the condition bit selecting the successor.
+ VPValue *getCondBit() { return CondBit; }
+
+ const VPValue *getCondBit() const { return CondBit; }
+
+ void setCondBit(VPValue *CV) { CondBit = CV; }
+
+ VPValue *getPredicate() { return Predicate; }
+
+ const VPValue *getPredicate() const { return Predicate; }
+
+ void setPredicate(VPValue *Pred) { Predicate = Pred; }
+
+ /// Set a given VPBlockBase \p Successor as the single successor of this
+ /// VPBlockBase. This VPBlockBase is not added as predecessor of \p Successor.
+ /// This VPBlockBase must have no successors.
+ void setOneSuccessor(VPBlockBase *Successor) {
+ assert(Successors.empty() && "Setting one successor when others exist.");
+ appendSuccessor(Successor);
+ }
+
+ /// Set two given VPBlockBases \p IfTrue and \p IfFalse to be the two
+ /// successors of this VPBlockBase. \p Condition is set as the successor
+ /// selector. This VPBlockBase is not added as predecessor of \p IfTrue or \p
+ /// IfFalse. This VPBlockBase must have no successors.
+ void setTwoSuccessors(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition) {
+ assert(Successors.empty() && "Setting two successors when others exist.");
+ assert(Condition && "Setting two successors without condition!");
+ CondBit = Condition;
+ appendSuccessor(IfTrue);
+ appendSuccessor(IfFalse);
+ }
+
+ /// Set each VPBasicBlock in \p NewPreds as predecessor of this VPBlockBase.
+ /// This VPBlockBase must have no predecessors. This VPBlockBase is not added
+ /// as successor of any VPBasicBlock in \p NewPreds.
+ void setPredecessors(ArrayRef<VPBlockBase *> NewPreds) {
+ assert(Predecessors.empty() && "Block predecessors already set.");
+ for (auto *Pred : NewPreds)
+ appendPredecessor(Pred);
+ }
+
+ /// Remove all the predecessors of this block.
+ void clearPredecessors() { Predecessors.clear(); }
+
+ /// Remove all the successors of this block and set its condition bit to null.
+ void clearSuccessors() {
+ Successors.clear();
+ CondBit = nullptr;
+ }
+
+ /// The method which generates the output IR that corresponds to this
+ /// VPBlockBase, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState *State) = 0;
+
+ /// Delete all blocks reachable from a given VPBlockBase, inclusive.
+ static void deleteCFG(VPBlockBase *Entry);
+
+ void printAsOperand(raw_ostream &OS, bool PrintType) const {
+ OS << getName();
+ }
+
+ void print(raw_ostream &OS) const {
+ // TODO: Only printing VPBB name for now since we only have dot printing
+ // support for VPInstructions/Recipes.
+ printAsOperand(OS, false);
+ }
+
+ /// Return true if it is legal to hoist instructions into this block.
+ bool isLegalToHoistInto() {
+ // There are currently no constraints that prevent an instruction from being
+ // hoisted into a VPBlockBase.
+ return true;
+ }
/// Replace all operands of VPUsers in the block with \p NewValue and also
/// replaces all uses of VPValues defined in the block with NewValue.
virtual void dropAllReferences(VPValue *NewValue) = 0;
-};
-
-/// VPRecipeBase is a base class modeling a sequence of one or more output IR
+};
+
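// Editor's sketch, not part of this header: wiring a small diamond CFG with
// the VPBlockBase API declared above. Assumes the full VPlan.h declarations
// (VPBasicBlock is defined further down) are visible, e.g. from a .cpp that
// includes this header. Block names are placeholders and 'Cond' stands for a
// VPValue the caller already owns.
static void buildDiamondSketch(VPValue *Cond) {
  VPBasicBlock *Entry = new VPBasicBlock("entry");
  VPBasicBlock *Then = new VPBasicBlock("then");
  VPBasicBlock *Else = new VPBasicBlock("else");
  VPBasicBlock *Exit = new VPBasicBlock("exit");

  // Successor edges only; these calls do not update predecessor lists.
  Entry->setTwoSuccessors(Then, Else, Cond); // Cond becomes the CondBit.
  Then->setOneSuccessor(Exit);
  Else->setOneSuccessor(Exit);

  // Predecessor edges are maintained separately.
  Then->setPredecessors({Entry});
  Else->setPredecessors({Entry});
  Exit->setPredecessors({Then, Else});

  // Delete every block reachable from Entry, inclusive.
  VPBlockBase::deleteCFG(Entry);
}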
+/// VPRecipeBase is a base class modeling a sequence of one or more output IR
/// instructions. VPRecipeBase owns the VPValues it defines through VPDef
/// and is responsible for deleting its defined values. Single-value
/// VPRecipeBases that also inherit from VPValue must make sure to inherit from
/// VPRecipeBase before VPValue.
class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock>,
public VPDef {
- friend VPBasicBlock;
- friend class VPBlockUtils;
-
-
- /// Each VPRecipe belongs to a single VPBasicBlock.
- VPBasicBlock *Parent = nullptr;
-
-public:
+ friend VPBasicBlock;
+ friend class VPBlockUtils;
+
+
+ /// Each VPRecipe belongs to a single VPBasicBlock.
+ VPBasicBlock *Parent = nullptr;
+
+public:
VPRecipeBase(const unsigned char SC) : VPDef(SC) {}
- virtual ~VPRecipeBase() = default;
-
- /// \return the VPBasicBlock which this VPRecipe belongs to.
- VPBasicBlock *getParent() { return Parent; }
- const VPBasicBlock *getParent() const { return Parent; }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPRecipe, thereby "executing" the VPlan.
- virtual void execute(struct VPTransformState &State) = 0;
-
- /// Insert an unlinked recipe into a basic block immediately before
- /// the specified recipe.
- void insertBefore(VPRecipeBase *InsertPos);
-
- /// Insert an unlinked Recipe into a basic block immediately after
- /// the specified Recipe.
- void insertAfter(VPRecipeBase *InsertPos);
-
- /// Unlink this recipe from its current VPBasicBlock and insert it into
- /// the VPBasicBlock that MovePos lives in, right after MovePos.
- void moveAfter(VPRecipeBase *MovePos);
-
+ virtual ~VPRecipeBase() = default;
+
+ /// \return the VPBasicBlock which this VPRecipe belongs to.
+ VPBasicBlock *getParent() { return Parent; }
+ const VPBasicBlock *getParent() const { return Parent; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRecipe, thereby "executing" the VPlan.
+ virtual void execute(struct VPTransformState &State) = 0;
+
+ /// Insert an unlinked recipe into a basic block immediately before
+ /// the specified recipe.
+ void insertBefore(VPRecipeBase *InsertPos);
+
+ /// Insert an unlinked Recipe into a basic block immediately after
+ /// the specified Recipe.
+ void insertAfter(VPRecipeBase *InsertPos);
+
+ /// Unlink this recipe from its current VPBasicBlock and insert it into
+ /// the VPBasicBlock that MovePos lives in, right after MovePos.
+ void moveAfter(VPRecipeBase *MovePos);
+
/// Unlink this recipe and insert into BB before I.
///
/// \pre I is a valid iterator into BB.
void moveBefore(VPBasicBlock &BB, iplist<VPRecipeBase>::iterator I);
- /// This method unlinks 'this' from the containing basic block, but does not
- /// delete it.
- void removeFromParent();
-
- /// This method unlinks 'this' from the containing basic block and deletes it.
- ///
- /// \returns an iterator pointing to the element after the erased one
- iplist<VPRecipeBase>::iterator eraseFromParent();
+ /// This method unlinks 'this' from the containing basic block, but does not
+ /// delete it.
+ void removeFromParent();
+
+ /// This method unlinks 'this' from the containing basic block and deletes it.
+ ///
+ /// \returns an iterator pointing to the element after the erased one
+ iplist<VPRecipeBase>::iterator eraseFromParent();
/// Returns a pointer to a VPUser, if the recipe inherits from VPUser or
/// nullptr otherwise.
@@ -703,8 +703,8 @@ public:
// All VPDefs are also VPRecipeBases.
return true;
}
-};
-
+};
+
inline bool VPUser::classof(const VPDef *Def) {
return Def->getVPDefID() == VPRecipeBase::VPInstructionSC ||
Def->getVPDefID() == VPRecipeBase::VPWidenSC ||
@@ -719,39 +719,39 @@ inline bool VPUser::classof(const VPDef *Def) {
Def->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
}
-/// This is a concrete Recipe that models a single VPlan-level instruction.
-/// While, as with any Recipe, it may generate a sequence of IR instructions when
-/// executed, these instructions would always form a single-def expression as
-/// the VPInstruction is also a single def-use vertex.
+/// This is a concrete Recipe that models a single VPlan-level instruction.
+/// While, as with any Recipe, it may generate a sequence of IR instructions when
+/// executed, these instructions would always form a single-def expression as
+/// the VPInstruction is also a single def-use vertex.
class VPInstruction : public VPRecipeBase, public VPUser, public VPValue {
- friend class VPlanSlp;
-
-public:
- /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
- enum {
- Not = Instruction::OtherOpsEnd + 1,
- ICmpULE,
- SLPLoad,
- SLPStore,
- ActiveLaneMask,
- };
-
-private:
- typedef unsigned char OpcodeTy;
- OpcodeTy Opcode;
-
- /// Utility method serving execute(): generates a single instance of the
- /// modeled instruction.
- void generateInstruction(VPTransformState &State, unsigned Part);
-
-protected:
- void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
-
-public:
- VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
+ friend class VPlanSlp;
+
+public:
+ /// VPlan opcodes, extending LLVM IR with idiomatic instructions.
+ enum {
+ Not = Instruction::OtherOpsEnd + 1,
+ ICmpULE,
+ SLPLoad,
+ SLPStore,
+ ActiveLaneMask,
+ };
+
+private:
+ typedef unsigned char OpcodeTy;
+ OpcodeTy Opcode;
+
+ /// Utility method serving execute(): generates a single instance of the
+ /// modeled instruction.
+ void generateInstruction(VPTransformState &State, unsigned Part);
+
+protected:
+ void setUnderlyingInstr(Instruction *I) { setUnderlyingValue(I); }
+
+public:
+ VPInstruction(unsigned Opcode, ArrayRef<VPValue *> Operands)
: VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser(Operands),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {}
-
+
VPInstruction(unsigned Opcode, ArrayRef<VPInstruction *> Operands)
: VPRecipeBase(VPRecipeBase::VPInstructionSC), VPUser({}),
VPValue(VPValue::VPVInstructionSC, nullptr, this), Opcode(Opcode) {
@@ -759,195 +759,195 @@ public:
addOperand(I->getVPValue());
}
- VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
- : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPValue *V) {
+ VPInstruction(unsigned Opcode, std::initializer_list<VPValue *> Operands)
+ : VPInstruction(Opcode, ArrayRef<VPValue *>(Operands)) {}
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVInstructionSC;
- }
-
- VPInstruction *clone() const {
- SmallVector<VPValue *, 2> Operands(operands());
- return new VPInstruction(Opcode, Operands);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+
+ VPInstruction *clone() const {
+ SmallVector<VPValue *, 2> Operands(operands());
+ return new VPInstruction(Opcode, Operands);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *R) {
return R->getVPDefID() == VPRecipeBase::VPInstructionSC;
- }
-
- unsigned getOpcode() const { return Opcode; }
-
- /// Generate the instruction.
- /// TODO: We currently execute only per-part unless a specific instance is
- /// provided.
- void execute(VPTransformState &State) override;
-
+ }
+
+ unsigned getOpcode() const { return Opcode; }
+
+ /// Generate the instruction.
+ /// TODO: We currently execute only per-part unless a specific instance is
+ /// provided.
+ void execute(VPTransformState &State) override;
+
/// Print the VPInstruction to \p O.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
/// Print the VPInstruction to dbgs() (for debugging).
void dump() const;
-
- /// Return true if this instruction may modify memory.
- bool mayWriteToMemory() const {
- // TODO: we can use attributes of the called function to rule out memory
- // modifications.
- return Opcode == Instruction::Store || Opcode == Instruction::Call ||
- Opcode == Instruction::Invoke || Opcode == SLPStore;
- }
-
- bool hasResult() const {
- // CallInst may or may not have a result, depending on the called function.
- // Conservatively assume calls have results for now.
- switch (getOpcode()) {
- case Instruction::Ret:
- case Instruction::Br:
- case Instruction::Store:
- case Instruction::Switch:
- case Instruction::IndirectBr:
- case Instruction::Resume:
- case Instruction::CatchRet:
- case Instruction::Unreachable:
- case Instruction::Fence:
- case Instruction::AtomicRMW:
- return false;
- default:
- return true;
- }
- }
-};
-
-/// VPWidenRecipe is a recipe for producing a copy of vector type for its
-/// ingredient. This recipe covers most of the traditional vectorization cases
-/// where each ingredient transforms into a vectorized version of itself.
+
+ /// Return true if this instruction may modify memory.
+ bool mayWriteToMemory() const {
+ // TODO: we can use attributes of the called function to rule out memory
+ // modifications.
+ return Opcode == Instruction::Store || Opcode == Instruction::Call ||
+ Opcode == Instruction::Invoke || Opcode == SLPStore;
+ }
+
+ bool hasResult() const {
+ // CallInst may or may not have a result, depending on the called function.
+ // Conservatively assume calls have results for now.
+ switch (getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::Br:
+ case Instruction::Store:
+ case Instruction::Switch:
+ case Instruction::IndirectBr:
+ case Instruction::Resume:
+ case Instruction::CatchRet:
+ case Instruction::Unreachable:
+ case Instruction::Fence:
+ case Instruction::AtomicRMW:
+ return false;
+ default:
+ return true;
+ }
+ }
+};
+
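// Editor's sketch, not part of this header: creating a VPInstruction with one
// of the VPlan-specific opcodes above and appending it to a VPBasicBlock.
// 'IV' and 'BTC' are placeholders for VPValues a real plan already owns
// (e.g. an induction value and a backedge-taken count); assumes VPlan.h is
// fully visible, as from a .cpp file that includes it.
static VPInstruction *emitActiveLaneMaskSketch(VPBasicBlock *VPBB, VPValue *IV,
                                               VPValue *BTC) {
  auto *ALM = new VPInstruction(VPInstruction::ActiveLaneMask, {IV, BTC});
  VPBB->appendRecipe(ALM); // The block now owns the recipe.
  assert(ALM->hasResult() && "ActiveLaneMask defines a value");
  return ALM;              // Usable as a VPValue operand by later recipes.
}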
+/// VPWidenRecipe is a recipe for producing a copy of vector type for its
+/// ingredient. This recipe covers most of the traditional vectorization cases
+/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeBase, public VPValue, public VPUser {
-public:
- template <typename IterT>
- VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
+public:
+ template <typename IterT>
+ VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
: VPRecipeBase(VPRecipeBase::VPWidenSC),
VPValue(VPValue::VPVWidenSC, &I, this), VPUser(Operands) {}
-
- ~VPWidenRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenSC;
- }
+ }
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVWidenSC;
}
-
- /// Produce widened copies of all Ingredients.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for widening Call instructions.
+
+ /// Produce widened copies of all Ingredients.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening Call instructions.
class VPWidenCallRecipe : public VPRecipeBase, public VPUser, public VPValue {
-
-public:
- template <typename IterT>
- VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+
+public:
+ template <typename IterT>
+ VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
: VPRecipeBase(VPRecipeBase::VPWidenCallSC), VPUser(CallArguments),
VPValue(VPValue::VPVWidenCallSC, &I, this) {}
-
- ~VPWidenCallRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenCallRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenCallSC;
- }
-
- /// Produce a widened version of the call instruction.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for widening select instructions.
+ }
+
+ /// Produce a widened version of the call instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening select instructions.
class VPWidenSelectRecipe : public VPRecipeBase, public VPUser, public VPValue {
-
- /// Is the condition of the select loop invariant?
- bool InvariantCond;
-
-public:
- template <typename IterT>
- VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
- bool InvariantCond)
+
+ /// Is the condition of the select loop invariant?
+ bool InvariantCond;
+
+public:
+ template <typename IterT>
+ VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
+ bool InvariantCond)
: VPRecipeBase(VPRecipeBase::VPWidenSelectSC), VPUser(Operands),
VPValue(VPValue::VPVWidenSelectSC, &I, this),
- InvariantCond(InvariantCond) {}
-
- ~VPWidenSelectRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ InvariantCond(InvariantCond) {}
+
+ ~VPWidenSelectRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenSelectSC;
- }
-
- /// Produce a widened version of the select instruction.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for handling GEP instructions.
+ }
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeBase,
public VPUser,
public VPValue {
- bool IsPtrLoopInvariant;
- SmallBitVector IsIndexLoopInvariant;
-
-public:
- template <typename IterT>
+ bool IsPtrLoopInvariant;
+ SmallBitVector IsIndexLoopInvariant;
+
+public:
+ template <typename IterT>
VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands)
: VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
VPValue(VPWidenGEPSC, GEP, this),
IsIndexLoopInvariant(GEP->getNumIndices(), false) {}
template <typename IterT>
- VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
- Loop *OrigLoop)
+ VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
+ Loop *OrigLoop)
: VPRecipeBase(VPRecipeBase::VPWidenGEPSC), VPUser(Operands),
VPValue(VPValue::VPVWidenGEPSC, GEP, this),
- IsIndexLoopInvariant(GEP->getNumIndices(), false) {
- IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
- for (auto Index : enumerate(GEP->indices()))
- IsIndexLoopInvariant[Index.index()] =
- OrigLoop->isLoopInvariant(Index.value().get());
- }
- ~VPWidenGEPRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ IsIndexLoopInvariant(GEP->getNumIndices(), false) {
+ IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
+ for (auto Index : enumerate(GEP->indices()))
+ IsIndexLoopInvariant[Index.index()] =
+ OrigLoop->isLoopInvariant(Index.value().get());
+ }
+ ~VPWidenGEPRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenGEPSC;
- }
-
- /// Generate the gep nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector and scalar values.
+ }
+
+ /// Generate the gep nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for handling phi nodes of integer and floating-point inductions,
+/// producing their vector and scalar values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPUser {
- PHINode *IV;
- TruncInst *Trunc;
-
-public:
+ PHINode *IV;
+ TruncInst *Trunc;
+
+public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
TruncInst *Trunc = nullptr)
: VPRecipeBase(VPWidenIntOrFpInductionSC), VPUser({Start}), IV(IV),
@@ -957,35 +957,35 @@ public:
else
new VPValue(IV, this);
}
- ~VPWidenIntOrFpInductionRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPWidenIntOrFpInductionRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenIntOrFpInductionSC;
- }
-
- /// Generate the vectorized and scalarized versions of the phi node as
- /// needed by their users.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ }
+
+ /// Generate the vectorized and scalarized versions of the phi node as
+ /// needed by their users.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Returns the start value of the induction.
VPValue *getStartValue() { return getOperand(0); }
-};
-
-/// A recipe for handling all phi nodes except for integer and FP inductions.
+};
+
+/// A recipe for handling all phi nodes except for integer and FP inductions.
/// For reduction PHIs, RdxDesc must point to the corresponding recurrence
/// descriptor and the start value is the first operand of the recipe.
class VPWidenPHIRecipe : public VPRecipeBase, public VPUser {
- PHINode *Phi;
-
+ PHINode *Phi;
+
/// Descriptor for a reduction PHI.
RecurrenceDescriptor *RdxDesc = nullptr;
-public:
+public:
/// Create a new VPWidenPHIRecipe for the reduction \p Phi described by \p
/// RdxDesc.
VPWidenPHIRecipe(PHINode *Phi, RecurrenceDescriptor &RdxDesc, VPValue &Start)
@@ -998,78 +998,78 @@ public:
VPWidenPHIRecipe(PHINode *Phi) : VPRecipeBase(VPWidenPHISC), Phi(Phi) {
new VPValue(Phi, this);
}
- ~VPWidenPHIRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPWidenPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenPHISC;
- }
-
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ }
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Returns the start value of the phi, if it is a reduction.
VPValue *getStartValue() {
return getNumOperands() == 0 ? nullptr : getOperand(0);
}
-};
-
-/// A recipe for vectorizing a phi-node as a sequence of mask-based select
-/// instructions.
+};
+
+/// A recipe for vectorizing a phi-node as a sequence of mask-based select
+/// instructions.
class VPBlendRecipe : public VPRecipeBase, public VPUser {
- PHINode *Phi;
-
+ PHINode *Phi;
+
public:
- /// The blend operation is a User of the incoming values and of their
- /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
- /// might be incoming with a full mask for which there is no VPValue.
- VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
+ /// The blend operation is a User of the incoming values and of their
+ /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
+ /// might be incoming with a full mask for which there is no VPValue.
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
: VPRecipeBase(VPBlendSC), VPUser(Operands), Phi(Phi) {
new VPValue(Phi, this);
- assert(Operands.size() > 0 &&
- ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
- "Expected either a single incoming value or a positive even number "
- "of operands");
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ assert(Operands.size() > 0 &&
+ ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
+ "Expected either a single incoming value or a positive even number "
+ "of operands");
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPBlendSC;
- }
-
- /// Return the number of incoming values, taking into account that a single
- /// incoming value has no mask.
+ }
+
+ /// Return the number of incoming values, taking into account that a single
+ /// incoming value has no mask.
unsigned getNumIncomingValues() const { return (getNumOperands() + 1) / 2; }
-
- /// Return incoming value number \p Idx.
+
+ /// Return incoming value number \p Idx.
VPValue *getIncomingValue(unsigned Idx) const { return getOperand(Idx * 2); }
-
- /// Return mask number \p Idx.
+
+ /// Return mask number \p Idx.
VPValue *getMask(unsigned Idx) const { return getOperand(Idx * 2 + 1); }
-
- /// Generate the phi/select nodes.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
+
+ /// Generate the phi/select nodes.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
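// Editor's sketch, not part of this header: the operand layout of a
// VPBlendRecipe is [I0, M0, I1, M1, ...], so incoming value i is operand 2*i
// and its mask is operand 2*i+1. All arguments are placeholders owned by the
// caller.
static VPBlendRecipe *makeBlendSketch(PHINode *Phi, VPValue *I0, VPValue *M0,
                                      VPValue *I1, VPValue *M1) {
  VPValue *Ops[] = {I0, M0, I1, M1};
  auto *Blend = new VPBlendRecipe(Phi, Ops);
  assert(Blend->getNumIncomingValues() == 2 && "two (value, mask) pairs");
  assert(Blend->getIncomingValue(1) == I1 && Blend->getMask(1) == M1);
  return Blend;
}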
+/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles. The first operand of a
/// VPInterleave recipe is the address, followed by the stored values, followed
/// by an optional mask.
class VPInterleaveRecipe : public VPRecipeBase, public VPUser {
- const InterleaveGroup<Instruction> *IG;
-
+ const InterleaveGroup<Instruction> *IG;
+
bool HasMask = false;
-public:
- VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
+public:
+ VPInterleaveRecipe(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
ArrayRef<VPValue *> StoredValues, VPValue *Mask)
: VPRecipeBase(VPInterleaveSC), VPUser(Addr), IG(IG) {
for (unsigned i = 0; i < IG->getFactor(); ++i)
@@ -1085,26 +1085,26 @@ public:
HasMask = true;
addOperand(Mask);
}
- }
- ~VPInterleaveRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+ ~VPInterleaveRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPInterleaveSC;
- }
-
- /// Return the address accessed by this recipe.
- VPValue *getAddr() const {
+ }
+
+ /// Return the address accessed by this recipe.
+ VPValue *getAddr() const {
return getOperand(0); // Address is the 1st, mandatory operand.
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ // Mask is optional and therefore the last, currently 2nd operand.
return HasMask ? getOperand(getNumOperands() - 1) : nullptr;
- }
-
+ }
+
/// Return the VPValues stored by this interleave group. If it is a load
/// interleave group, return an empty ArrayRef.
ArrayRef<VPValue *> getStoredValues() const {
@@ -1114,16 +1114,16 @@ public:
.slice(1, getNumOperands() - (HasMask ? 2 : 1));
}
- /// Generate the wide load or store, and shuffles.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-
- const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
-};
-
+ /// Generate the wide load or store, and shuffles.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+
+ const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
+};
+
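// Editor's sketch, not part of this header: a VPInterleaveRecipe's operands
// are laid out as [Addr, StoredValue0, ..., optional Mask]. Building the
// InterleaveGroup itself is out of scope; every argument is assumed to be
// owned by the caller, and Mask may be nullptr for an unmasked group.
static VPInterleaveRecipe *
makeInterleaveSketch(const InterleaveGroup<Instruction> *IG, VPValue *Addr,
                     ArrayRef<VPValue *> StoredValues, VPValue *Mask) {
  auto *IGRec = new VPInterleaveRecipe(IG, Addr, StoredValues, Mask);
  assert(IGRec->getAddr() == Addr && "address is always operand 0");
  assert((Mask != nullptr) == (IGRec->getMask() != nullptr) &&
         "getMask() returns nullptr exactly when the mask is absent (full)");
  return IGRec;
}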
/// A recipe to represent inloop reduction operations, performing a reduction on
/// a vector operand into a scalar value, and adding the result to a chain.
/// The Operands are {ChainOp, VecOp, [Condition]}.
@@ -1174,976 +1174,976 @@ public:
}
};
-/// VPReplicateRecipe replicates a given instruction producing multiple scalar
-/// copies of the original scalar type, one per lane, instead of producing a
-/// single copy of widened type for all lanes. If the instruction is known to be
-/// uniform only one copy, per lane zero, will be generated.
+/// VPReplicateRecipe replicates a given instruction producing multiple scalar
+/// copies of the original scalar type, one per lane, instead of producing a
+/// single copy of widened type for all lanes. If the instruction is known to be
+/// uniform only one copy, per lane zero, will be generated.
class VPReplicateRecipe : public VPRecipeBase, public VPUser, public VPValue {
- /// Indicator if only a single replica per lane is needed.
- bool IsUniform;
-
- /// Indicator if the replicas are also predicated.
- bool IsPredicated;
-
- /// Indicator if the scalar values should also be packed into a vector.
- bool AlsoPack;
-
-public:
- template <typename IterT>
- VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
- bool IsUniform, bool IsPredicated = false)
+ /// Indicator if only a single replica per lane is needed.
+ bool IsUniform;
+
+ /// Indicator if the replicas are also predicated.
+ bool IsPredicated;
+
+ /// Indicator if the scalar values should also be packed into a vector.
+ bool AlsoPack;
+
+public:
+ template <typename IterT>
+ VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
+ bool IsUniform, bool IsPredicated = false)
: VPRecipeBase(VPReplicateSC), VPUser(Operands),
VPValue(VPVReplicateSC, I, this), IsUniform(IsUniform),
IsPredicated(IsPredicated) {
- // Retain the previous behavior of predicateInstructions(), where an
- // insert-element of a predicated instruction got hoisted into the
- // predicated basic block iff it was its only user. This is achieved by
- // having predicated instructions also pack their values into a vector by
- // default unless they have a replicated user which uses their scalar value.
- AlsoPack = IsPredicated && !I->use_empty();
- }
-
- ~VPReplicateRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ // Retain the previous behavior of predicateInstructions(), where an
+ // insert-element of a predicated instruction got hoisted into the
+ // predicated basic block iff it was its only user. This is achieved by
+ // having predicated instructions also pack their values into a vector by
+ // default unless they have a replicated user which uses their scalar value.
+ AlsoPack = IsPredicated && !I->use_empty();
+ }
+
+ ~VPReplicateRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPReplicateSC;
- }
-
+ }
+
static inline bool classof(const VPValue *V) {
return V->getVPValueID() == VPValue::VPVReplicateSC;
}
- /// Generate replicas of the desired Ingredient. Replicas will be generated
- /// for all parts and lanes unless a specific part and lane are specified in
- /// the \p State.
- void execute(VPTransformState &State) override;
-
- void setAlsoPack(bool Pack) { AlsoPack = Pack; }
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
+ /// Generate replicas of the desired Ingredient. Replicas will be generated
+ /// for all parts and lanes unless a specific part and lane are specified in
+ /// the \p State.
+ void execute(VPTransformState &State) override;
+
+ void setAlsoPack(bool Pack) { AlsoPack = Pack; }
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
bool isUniform() const { return IsUniform; }
-};
-
-/// A recipe for generating conditional branches on the bits of a mask.
+};
+
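// Editor's sketch, not part of this header: building a uniform replicate
// recipe. With IsUniform = true only a lane-0 copy is generated; a predicated
// recipe whose ingredient has users additionally packs its scalars into a
// vector (AlsoPack) unless setAlsoPack(false) is called later. 'I' and its
// operand range are placeholders owned by the caller.
template <typename IterT>
static VPReplicateRecipe *makeUniformReplicateSketch(Instruction *I,
                                                     iterator_range<IterT> Ops) {
  auto *Rep = new VPReplicateRecipe(I, Ops, /*IsUniform=*/true);
  assert(Rep->isUniform() && "only lane 0 will be generated");
  return Rep;
}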
+/// A recipe for generating conditional branches on the bits of a mask.
class VPBranchOnMaskRecipe : public VPRecipeBase, public VPUser {
-public:
- VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
- if (BlockInMask) // nullptr means all-one mask.
+public:
+ VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
+ if (BlockInMask) // nullptr means all-one mask.
addOperand(BlockInMask);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPBranchOnMaskSC;
- }
-
- /// Generate the extraction of the appropriate bit from the block mask and the
- /// conditional branch.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override {
- O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
- if (VPValue *Mask = getMask())
+ }
+
+ /// Generate the extraction of the appropriate bit from the block mask and the
+ /// conditional branch.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override {
+ O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
+ if (VPValue *Mask = getMask())
Mask->printAsOperand(O, SlotTracker);
- else
- O << " All-One";
- O << "\\l\"";
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
+ else
+ O << " All-One";
+ O << "\\l\"";
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
assert(getNumOperands() <= 1 && "should have either 0 or 1 operands");
- // Mask is optional.
+ // Mask is optional.
return getNumOperands() == 1 ? getOperand(0) : nullptr;
- }
-};
-
-/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
-/// control converges back from a Branch-on-Mask. The phi nodes are needed in
-/// order to merge values that are set under such a branch and feed their uses.
-/// The phi nodes can be scalar or vector depending on the users of the value.
-/// This recipe works in concert with VPBranchOnMaskRecipe.
+ }
+};
+
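// Editor's sketch, not part of this header: a null BlockInMask means the
// branch is taken under an all-ones mask, and getMask() reports that by
// returning nullptr. 'Mask' is a placeholder that may legitimately be null.
static void branchOnMaskSketch(VPBasicBlock *VPBB, VPValue *Mask) {
  auto *BOM = new VPBranchOnMaskRecipe(Mask); // Mask == nullptr is allowed.
  assert((Mask != nullptr) == (BOM->getMask() != nullptr));
  VPBB->appendRecipe(BOM);
}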
+/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
+/// control converges back from a Branch-on-Mask. The phi nodes are needed in
+/// order to merge values that are set under such a branch and feed their uses.
+/// The phi nodes can be scalar or vector depending on the users of the value.
+/// This recipe works in concert with VPBranchOnMaskRecipe.
class VPPredInstPHIRecipe : public VPRecipeBase, public VPUser {
-
-public:
- /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
- /// nodes after merging back from a Branch-on-Mask.
+
+public:
+ /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs phi
+ /// nodes after merging back from a Branch-on-Mask.
VPPredInstPHIRecipe(VPValue *PredV)
: VPRecipeBase(VPPredInstPHISC), VPUser(PredV) {
new VPValue(PredV->getUnderlyingValue(), this);
}
- ~VPPredInstPHIRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ ~VPPredInstPHIRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPPredInstPHISC;
- }
-
- /// Generates phi nodes for live-outs as needed to retain SSA form.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A Recipe for widening load/store operations.
-/// The recipe uses the following VPValues:
-/// - For load: Address, optional mask
-/// - For store: Address, stored value, optional mask
-/// TODO: We currently execute only per-part unless a specific instance is
-/// provided.
+ }
+
+ /// Generates phi nodes for live-outs as needed to retain SSA form.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A Recipe for widening load/store operations.
+/// The recipe uses the following VPValues:
+/// - For load: Address, optional mask
+/// - For store: Address, stored value, optional mask
+/// TODO: We currently execute only per-part unless a specific instance is
+/// provided.
class VPWidenMemoryInstructionRecipe : public VPRecipeBase,
public VPUser {
Instruction &Ingredient;
-
- void setMask(VPValue *Mask) {
- if (!Mask)
- return;
+
+ void setMask(VPValue *Mask) {
+ if (!Mask)
+ return;
addOperand(Mask);
- }
-
- bool isMasked() const {
+ }
+
+ bool isMasked() const {
return isStore() ? getNumOperands() == 3 : getNumOperands() == 2;
- }
-
-public:
- VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
+ }
+
+public:
+ VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
: VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr}),
Ingredient(Load) {
new VPValue(VPValue::VPVMemoryInstructionSC, &Load, this);
- setMask(Mask);
- }
-
- VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
- VPValue *StoredValue, VPValue *Mask)
+ setMask(Mask);
+ }
+
+ VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+ VPValue *StoredValue, VPValue *Mask)
: VPRecipeBase(VPWidenMemoryInstructionSC), VPUser({Addr, StoredValue}),
Ingredient(Store) {
- setMask(Mask);
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+ setMask(Mask);
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenMemoryInstructionSC;
- }
-
- /// Return the address accessed by this recipe.
- VPValue *getAddr() const {
+ }
+
+ /// Return the address accessed by this recipe.
+ VPValue *getAddr() const {
return getOperand(0); // Address is the 1st, mandatory operand.
- }
-
- /// Return the mask used by this recipe. Note that a full mask is represented
- /// by a nullptr.
- VPValue *getMask() const {
- // Mask is optional and therefore the last operand.
+ }
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ // Mask is optional and therefore the last operand.
return isMasked() ? getOperand(getNumOperands() - 1) : nullptr;
- }
-
+ }
+
/// Returns true if this recipe is a store.
bool isStore() const { return isa<StoreInst>(Ingredient); }
- /// Return the value stored by this recipe.
- VPValue *getStoredValue() const {
+ /// Return the value stored by this recipe.
+ VPValue *getStoredValue() const {
assert(isStore() && "Stored value only available for store instructions");
return getOperand(1); // Stored value is the 2nd, mandatory operand.
- }
-
- /// Generate the wide load/store.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// A Recipe for widening the canonical induction variable of the vector loop.
-class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+ }
+
+ /// Generate the wide load/store.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
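// Editor's sketch, not part of this header: the two operand layouts of a
// VPWidenMemoryInstructionRecipe are {Addr, Mask?} for a load and
// {Addr, StoredValue, Mask?} for a store; getMask() is nullptr when the
// access is unmasked. The IR instruction and VPValues are placeholders.
static VPWidenMemoryInstructionRecipe *
widenStoreSketch(StoreInst &Store, VPValue *Addr, VPValue *StoredValue,
                 VPValue *Mask) {
  auto *Rec = new VPWidenMemoryInstructionRecipe(Store, Addr, StoredValue, Mask);
  assert(Rec->isStore() && Rec->getAddr() == Addr &&
         Rec->getStoredValue() == StoredValue);
  return Rec;
}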
+/// A Recipe for widening the canonical induction variable of the vector loop.
+class VPWidenCanonicalIVRecipe : public VPRecipeBase {
public:
VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {
new VPValue(nullptr, this);
}
-
- ~VPWidenCanonicalIVRecipe() override = default;
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
+
+ ~VPWidenCanonicalIVRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *D) {
return D->getVPDefID() == VPRecipeBase::VPWidenCanonicalIVSC;
- }
-
- /// Generate a canonical vector induction variable of the vector loop, with
- /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
- /// step = <VF*UF, VF*UF, ..., VF*UF>.
- void execute(VPTransformState &State) override;
-
- /// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent,
- VPSlotTracker &SlotTracker) const override;
-};
-
-/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
-/// holds a sequence of zero or more VPRecipes, each representing a sequence of
-/// output IR instructions.
-class VPBasicBlock : public VPBlockBase {
-public:
- using RecipeListTy = iplist<VPRecipeBase>;
-
-private:
- /// The VPRecipes held in the order of output instructions to generate.
- RecipeListTy Recipes;
-
-public:
- VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
- : VPBlockBase(VPBasicBlockSC, Name.str()) {
- if (Recipe)
- appendRecipe(Recipe);
- }
-
- ~VPBasicBlock() override { Recipes.clear(); }
-
- /// Instruction iterators...
- using iterator = RecipeListTy::iterator;
- using const_iterator = RecipeListTy::const_iterator;
- using reverse_iterator = RecipeListTy::reverse_iterator;
- using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
-
- //===--------------------------------------------------------------------===//
- /// Recipe iterator methods
- ///
- inline iterator begin() { return Recipes.begin(); }
- inline const_iterator begin() const { return Recipes.begin(); }
- inline iterator end() { return Recipes.end(); }
- inline const_iterator end() const { return Recipes.end(); }
-
- inline reverse_iterator rbegin() { return Recipes.rbegin(); }
- inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
- inline reverse_iterator rend() { return Recipes.rend(); }
- inline const_reverse_iterator rend() const { return Recipes.rend(); }
-
- inline size_t size() const { return Recipes.size(); }
- inline bool empty() const { return Recipes.empty(); }
- inline const VPRecipeBase &front() const { return Recipes.front(); }
- inline VPRecipeBase &front() { return Recipes.front(); }
- inline const VPRecipeBase &back() const { return Recipes.back(); }
- inline VPRecipeBase &back() { return Recipes.back(); }
-
- /// Returns a reference to the list of recipes.
- RecipeListTy &getRecipeList() { return Recipes; }
-
- /// Returns a pointer to a member of the recipe list.
- static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
- return &VPBasicBlock::Recipes;
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPBlockBase *V) {
- return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
- }
-
- void insert(VPRecipeBase *Recipe, iterator InsertPt) {
- assert(Recipe && "No recipe to append.");
- assert(!Recipe->Parent && "Recipe already in VPlan");
- Recipe->Parent = this;
- Recipes.insert(InsertPt, Recipe);
- }
-
- /// Augment the existing recipes of a VPBasicBlock with an additional
- /// \p Recipe as the last recipe.
- void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPBasicBlock, thereby "executing" the VPlan.
- void execute(struct VPTransformState *State) override;
-
+ }
+
+ /// Generate a canonical vector induction variable of the vector loop, with
+ /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
+ /// step = <VF*UF, VF*UF, ..., VF*UF>.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
+/// holds a sequence of zero or more VPRecipes, each representing a sequence of
+/// output IR instructions.
+class VPBasicBlock : public VPBlockBase {
+public:
+ using RecipeListTy = iplist<VPRecipeBase>;
+
+private:
+ /// The VPRecipes held in the order of output instructions to generate.
+ RecipeListTy Recipes;
+
+public:
+ VPBasicBlock(const Twine &Name = "", VPRecipeBase *Recipe = nullptr)
+ : VPBlockBase(VPBasicBlockSC, Name.str()) {
+ if (Recipe)
+ appendRecipe(Recipe);
+ }
+
+ ~VPBasicBlock() override { Recipes.clear(); }
+
+ /// Instruction iterators...
+ using iterator = RecipeListTy::iterator;
+ using const_iterator = RecipeListTy::const_iterator;
+ using reverse_iterator = RecipeListTy::reverse_iterator;
+ using const_reverse_iterator = RecipeListTy::const_reverse_iterator;
+
+ //===--------------------------------------------------------------------===//
+ /// Recipe iterator methods
+ ///
+ inline iterator begin() { return Recipes.begin(); }
+ inline const_iterator begin() const { return Recipes.begin(); }
+ inline iterator end() { return Recipes.end(); }
+ inline const_iterator end() const { return Recipes.end(); }
+
+ inline reverse_iterator rbegin() { return Recipes.rbegin(); }
+ inline const_reverse_iterator rbegin() const { return Recipes.rbegin(); }
+ inline reverse_iterator rend() { return Recipes.rend(); }
+ inline const_reverse_iterator rend() const { return Recipes.rend(); }
+
+ inline size_t size() const { return Recipes.size(); }
+ inline bool empty() const { return Recipes.empty(); }
+ inline const VPRecipeBase &front() const { return Recipes.front(); }
+ inline VPRecipeBase &front() { return Recipes.front(); }
+ inline const VPRecipeBase &back() const { return Recipes.back(); }
+ inline VPRecipeBase &back() { return Recipes.back(); }
+
+ /// Returns a reference to the list of recipes.
+ RecipeListTy &getRecipeList() { return Recipes; }
+
+ /// Returns a pointer to a member of the recipe list.
+ static RecipeListTy VPBasicBlock::*getSublistAccess(VPRecipeBase *) {
+ return &VPBasicBlock::Recipes;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPBasicBlockSC;
+ }
+
+ void insert(VPRecipeBase *Recipe, iterator InsertPt) {
+ assert(Recipe && "No recipe to append.");
+ assert(!Recipe->Parent && "Recipe already in VPlan");
+ Recipe->Parent = this;
+ Recipes.insert(InsertPt, Recipe);
+ }
+
+ /// Augment the existing recipes of a VPBasicBlock with an additional
+ /// \p Recipe as the last recipe.
+ void appendRecipe(VPRecipeBase *Recipe) { insert(Recipe, end()); }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPBasicBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
+
/// Return the position of the first non-phi node recipe in the block.
iterator getFirstNonPhi();
void dropAllReferences(VPValue *NewValue) override;
-private:
- /// Create an IR BasicBlock to hold the output instructions generated by this
- /// VPBasicBlock, and return it. Update the CFGState accordingly.
- BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
-};
-
-/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
-/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
-/// A VPRegionBlock may indicate that its contents are to be replicated several
-/// times. This is designed to support predicated scalarization, in which a
-/// scalar if-then code structure needs to be generated VF * UF times. Having
-/// this replication indicator helps to keep a single model for multiple
-/// candidate VF's. The actual replication takes place only once the desired VF
-/// and UF have been determined.
-class VPRegionBlock : public VPBlockBase {
- /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
- VPBlockBase *Entry;
-
- /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
- VPBlockBase *Exit;
-
- /// An indicator whether this region is to generate multiple replicated
- /// instances of output IR corresponding to its VPBlockBases.
- bool IsReplicator;
-
-public:
- VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
- const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
- IsReplicator(IsReplicator) {
- assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
- assert(Exit->getSuccessors().empty() && "Exit block has successors.");
- Entry->setParent(this);
- Exit->setParent(this);
- }
- VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
- : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
- IsReplicator(IsReplicator) {}
-
- ~VPRegionBlock() override {
+private:
+ /// Create an IR BasicBlock to hold the output instructions generated by this
+ /// VPBasicBlock, and return it. Update the CFGState accordingly.
+ BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG);
+};
+
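// Usage sketch: a minimal walk over a VPBasicBlock's recipes, relying only on
// the iterator interface declared above. The helper name countRecipes is
// hypothetical; equivalently, VPBasicBlock::size() returns the same value.
static unsigned countRecipes(const VPBasicBlock &VPBB) {
  unsigned NumRecipes = 0;
  // Range-based iteration visits recipes in output-IR order.
  for (const VPRecipeBase &R : VPBB) {
    (void)R;
    ++NumRecipes;
  }
  return NumRecipes;
}
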
+/// VPRegionBlock represents a collection of VPBasicBlocks and VPRegionBlocks
+/// which form a Single-Entry-Single-Exit subgraph of the output IR CFG.
+/// A VPRegionBlock may indicate that its contents are to be replicated several
+/// times. This is designed to support predicated scalarization, in which a
+/// scalar if-then code structure needs to be generated VF * UF times. Having
+/// this replication indicator helps to keep a single model for multiple
+/// candidate VF's. The actual replication takes place only once the desired VF
+/// and UF have been determined.
+class VPRegionBlock : public VPBlockBase {
+ /// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Entry;
+
+ /// Hold the Single Exit of the SESE region modelled by the VPRegionBlock.
+ VPBlockBase *Exit;
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool IsReplicator;
+
+public:
+ VPRegionBlock(VPBlockBase *Entry, VPBlockBase *Exit,
+ const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(Entry), Exit(Exit),
+ IsReplicator(IsReplicator) {
+ assert(Entry->getPredecessors().empty() && "Entry block has predecessors.");
+ assert(Exit->getSuccessors().empty() && "Exit block has successors.");
+ Entry->setParent(this);
+ Exit->setParent(this);
+ }
+ VPRegionBlock(const std::string &Name = "", bool IsReplicator = false)
+ : VPBlockBase(VPRegionBlockSC, Name), Entry(nullptr), Exit(nullptr),
+ IsReplicator(IsReplicator) {}
+
+ ~VPRegionBlock() override {
if (Entry) {
VPValue DummyValue;
Entry->dropAllReferences(&DummyValue);
- deleteCFG(Entry);
+ deleteCFG(Entry);
}
- }
-
- /// Method to support type inquiry through isa, cast, and dyn_cast.
- static inline bool classof(const VPBlockBase *V) {
- return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
- }
-
- const VPBlockBase *getEntry() const { return Entry; }
- VPBlockBase *getEntry() { return Entry; }
-
- /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
- /// EntryBlock must have no predecessors.
- void setEntry(VPBlockBase *EntryBlock) {
- assert(EntryBlock->getPredecessors().empty() &&
- "Entry block cannot have predecessors.");
- Entry = EntryBlock;
- EntryBlock->setParent(this);
- }
-
- // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
- // specific interface of llvm::Function, instead of using
- // GraphTraits::getEntryNode. We should add a new template parameter to
- // DominatorTreeBase representing the Graph type.
- VPBlockBase &front() const { return *Entry; }
-
- const VPBlockBase *getExit() const { return Exit; }
- VPBlockBase *getExit() { return Exit; }
-
- /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
- /// ExitBlock must have no successors.
- void setExit(VPBlockBase *ExitBlock) {
- assert(ExitBlock->getSuccessors().empty() &&
- "Exit block cannot have successors.");
- Exit = ExitBlock;
- ExitBlock->setParent(this);
- }
-
- /// An indicator whether this region is to generate multiple replicated
- /// instances of output IR corresponding to its VPBlockBases.
- bool isReplicator() const { return IsReplicator; }
-
- /// The method which generates the output IR instructions that correspond to
- /// this VPRegionBlock, thereby "executing" the VPlan.
- void execute(struct VPTransformState *State) override;
+ }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPBlockBase *V) {
+ return V->getVPBlockID() == VPBlockBase::VPRegionBlockSC;
+ }
+
+ const VPBlockBase *getEntry() const { return Entry; }
+ VPBlockBase *getEntry() { return Entry; }
+
+ /// Set \p EntryBlock as the entry VPBlockBase of this VPRegionBlock. \p
+ /// EntryBlock must have no predecessors.
+ void setEntry(VPBlockBase *EntryBlock) {
+ assert(EntryBlock->getPredecessors().empty() &&
+ "Entry block cannot have predecessors.");
+ Entry = EntryBlock;
+ EntryBlock->setParent(this);
+ }
+
+ // FIXME: DominatorTreeBase is doing 'A->getParent()->front()'. 'front' is a
+ // specific interface of llvm::Function, instead of using
+ // GraphTraits::getEntryNode. We should add a new template parameter to
+ // DominatorTreeBase representing the Graph type.
+ VPBlockBase &front() const { return *Entry; }
+
+ const VPBlockBase *getExit() const { return Exit; }
+ VPBlockBase *getExit() { return Exit; }
+
+ /// Set \p ExitBlock as the exit VPBlockBase of this VPRegionBlock. \p
+ /// ExitBlock must have no successors.
+ void setExit(VPBlockBase *ExitBlock) {
+ assert(ExitBlock->getSuccessors().empty() &&
+ "Exit block cannot have successors.");
+ Exit = ExitBlock;
+ ExitBlock->setParent(this);
+ }
+
+ /// An indicator whether this region is to generate multiple replicated
+ /// instances of output IR corresponding to its VPBlockBases.
+ bool isReplicator() const { return IsReplicator; }
+
+ /// The method which generates the output IR instructions that correspond to
+ /// this VPRegionBlock, thereby "executing" the VPlan.
+ void execute(struct VPTransformState *State) override;
void dropAllReferences(VPValue *NewValue) override;
-};
-
-//===----------------------------------------------------------------------===//
-// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
-//===----------------------------------------------------------------------===//
-
-// The following set of template specializations implement GraphTraits to treat
-// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
-// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
-// VPBlockBase is a VPRegionBlock, this specialization provides access to its
-// successors/predecessors but not to the blocks inside the region.
-
-template <> struct GraphTraits<VPBlockBase *> {
- using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getSuccessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getSuccessors().end();
- }
-};
-
-template <> struct GraphTraits<const VPBlockBase *> {
- using NodeRef = const VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
-
- static NodeRef getEntryNode(NodeRef N) { return N; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getSuccessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getSuccessors().end();
- }
-};
-
-// Inverse order specialization for VPBlockBases. Predecessors are used instead
-// of successors for the inverse traversal.
-template <> struct GraphTraits<Inverse<VPBlockBase *>> {
- using NodeRef = VPBlockBase *;
- using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
-
- static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
-
- static inline ChildIteratorType child_begin(NodeRef N) {
- return N->getPredecessors().begin();
- }
-
- static inline ChildIteratorType child_end(NodeRef N) {
- return N->getPredecessors().end();
- }
-};
-
-// The following set of template specializations implement GraphTraits to
-// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
-// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
-// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
-// there won't be automatic recursion into other VPBlockBases that turn out to be
-// VPRegionBlocks.
-
-template <>
-struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
- using GraphRef = VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getEntry());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-template <>
-struct GraphTraits<const VPRegionBlock *>
- : public GraphTraits<const VPBlockBase *> {
- using GraphRef = const VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getEntry());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-template <>
-struct GraphTraits<Inverse<VPRegionBlock *>>
- : public GraphTraits<Inverse<VPBlockBase *>> {
- using GraphRef = VPRegionBlock *;
- using nodes_iterator = df_iterator<NodeRef>;
-
- static NodeRef getEntryNode(Inverse<GraphRef> N) {
- return N.Graph->getExit();
- }
-
- static nodes_iterator nodes_begin(GraphRef N) {
- return nodes_iterator::begin(N->getExit());
- }
-
- static nodes_iterator nodes_end(GraphRef N) {
- // df_iterator::end() returns an empty iterator so the node used doesn't
- // matter.
- return nodes_iterator::end(N);
- }
-};
-
-/// VPlan models a candidate for vectorization, encoding various decisions taken
-/// to produce efficient output IR, including which branches, basic-blocks and
-/// output IR instructions to generate, and their cost. VPlan holds a
-/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
-/// VPBlock.
-class VPlan {
- friend class VPlanPrinter;
- friend class VPSlotTracker;
-
- /// Hold the single entry to the Hierarchical CFG of the VPlan.
- VPBlockBase *Entry;
-
- /// Holds the VFs applicable to this VPlan.
+};
+
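// Usage sketch (hypothetical helper): building a replicating region from two
// fresh blocks. VPBlockUtils::connectBlocks, declared further below in this
// header, wires the entry to the exit; the constructor then adopts both blocks
// as children of the new region.
static VPRegionBlock *makeReplicateRegion() {
  VPBasicBlock *Entry = new VPBasicBlock("pred.store.entry");
  VPBasicBlock *Exit = new VPBasicBlock("pred.store.exit");
  // Both blocks still have a null parent, so the parent-matching assert holds.
  VPBlockUtils::connectBlocks(Entry, Exit);
  return new VPRegionBlock(Entry, Exit, "pred.store", /*IsReplicator=*/true);
}
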
+//===----------------------------------------------------------------------===//
+// GraphTraits specializations for VPlan Hierarchical Control-Flow Graphs //
+//===----------------------------------------------------------------------===//
+
+// The following set of template specializations implement GraphTraits to treat
+// any VPBlockBase as a node in a graph of VPBlockBases. It's important to note
+// that VPBlockBase traits don't recurse into VPRegionBlocks, i.e., if the
+// VPBlockBase is a VPRegionBlock, this specialization provides access to its
+// successors/predecessors but not to the blocks inside the region.
+
+template <> struct GraphTraits<VPBlockBase *> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+template <> struct GraphTraits<const VPBlockBase *> {
+ using NodeRef = const VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::const_iterator;
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getSuccessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getSuccessors().end();
+ }
+};
+
+// Inverse order specialization for VPBlockBases. Predecessors are used instead
+// of successors for the inverse traversal.
+template <> struct GraphTraits<Inverse<VPBlockBase *>> {
+ using NodeRef = VPBlockBase *;
+ using ChildIteratorType = SmallVectorImpl<VPBlockBase *>::iterator;
+
+ static NodeRef getEntryNode(Inverse<NodeRef> B) { return B.Graph; }
+
+ static inline ChildIteratorType child_begin(NodeRef N) {
+ return N->getPredecessors().begin();
+ }
+
+ static inline ChildIteratorType child_end(NodeRef N) {
+ return N->getPredecessors().end();
+ }
+};
+
+// The following set of template specializations implement GraphTraits to
+// treat VPRegionBlock as a graph and recurse inside its nodes. It's important
+// to note that the blocks inside the VPRegionBlock are treated as VPBlockBases
+// (i.e., no dyn_cast is performed, VPBlockBases specialization is used), so
+// there won't be automatic recursion into other VPBlockBases that turn out to be
+// VPRegionBlocks.
+
+template <>
+struct GraphTraits<VPRegionBlock *> : public GraphTraits<VPBlockBase *> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<const VPRegionBlock *>
+ : public GraphTraits<const VPBlockBase *> {
+ using GraphRef = const VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(GraphRef N) { return N->getEntry(); }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getEntry());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
+template <>
+struct GraphTraits<Inverse<VPRegionBlock *>>
+ : public GraphTraits<Inverse<VPBlockBase *>> {
+ using GraphRef = VPRegionBlock *;
+ using nodes_iterator = df_iterator<NodeRef>;
+
+ static NodeRef getEntryNode(Inverse<GraphRef> N) {
+ return N.Graph->getExit();
+ }
+
+ static nodes_iterator nodes_begin(GraphRef N) {
+ return nodes_iterator::begin(N->getExit());
+ }
+
+ static nodes_iterator nodes_end(GraphRef N) {
+ // df_iterator::end() returns an empty iterator so the node used doesn't
+ // matter.
+ return nodes_iterator::end(N);
+ }
+};
+
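// Usage sketch: with the GraphTraits specializations above, the generic
// llvm/ADT graph iterators apply directly to VPBlockBases. The hypothetical
// helper below counts the blocks reachable from a region's entry; nested
// regions count as single nodes because these traits do not recurse into them.
static unsigned countShallowBlocks(const VPRegionBlock *Region) {
  unsigned NumBlocks = 0;
  for (const VPBlockBase *Block : depth_first(Region->getEntry())) {
    (void)Block;
    ++NumBlocks;
  }
  return NumBlocks;
}
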
+/// VPlan models a candidate for vectorization, encoding various decisions taken
+/// to produce efficient output IR, including which branches, basic-blocks and
+/// output IR instructions to generate, and their cost. VPlan holds a
+/// Hierarchical-CFG of VPBasicBlocks and VPRegionBlocks rooted at an Entry
+/// VPBlock.
+class VPlan {
+ friend class VPlanPrinter;
+ friend class VPSlotTracker;
+
+ /// Hold the single entry to the Hierarchical CFG of the VPlan.
+ VPBlockBase *Entry;
+
+ /// Holds the VFs applicable to this VPlan.
SmallSetVector<ElementCount, 2> VFs;
-
- /// Holds the name of the VPlan, for printing.
- std::string Name;
-
- /// Holds all the external definitions created for this VPlan.
- // TODO: Introduce a specific representation for external definitions in
- // VPlan. External definitions must be immutable and hold a pointer to their
- // underlying IR that will be used to implement their structural comparison
- // (operators '==' and '<').
- SmallPtrSet<VPValue *, 16> VPExternalDefs;
-
- /// Represents the backedge taken count of the original loop, for folding
- /// the tail.
- VPValue *BackedgeTakenCount = nullptr;
-
- /// Holds a mapping between Values and their corresponding VPValue inside
- /// VPlan.
- Value2VPValueTy Value2VPValue;
-
+
+ /// Holds the name of the VPlan, for printing.
+ std::string Name;
+
+ /// Holds all the external definitions created for this VPlan.
+ // TODO: Introduce a specific representation for external definitions in
+ // VPlan. External definitions must be immutable and hold a pointer to their
+ // underlying IR that will be used to implement their structural comparison
+ // (operators '==' and '<').
+ SmallPtrSet<VPValue *, 16> VPExternalDefs;
+
+ /// Represents the backedge taken count of the original loop, for folding
+ /// the tail.
+ VPValue *BackedgeTakenCount = nullptr;
+
+ /// Holds a mapping between Values and their corresponding VPValue inside
+ /// VPlan.
+ Value2VPValueTy Value2VPValue;
+
/// Contains all VPValues that been allocated by addVPValue directly and need
/// to be free when the plan's destructor is called.
SmallVector<VPValue *, 16> VPValuesToFree;
- /// Holds the VPLoopInfo analysis for this VPlan.
- VPLoopInfo VPLInfo;
-
- /// Holds the condition bit values built during VPInstruction to VPRecipe transformation.
- SmallVector<VPValue *, 4> VPCBVs;
-
-public:
- VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
- if (Entry)
- Entry->setPlan(this);
- }
-
- ~VPlan() {
+ /// Holds the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo VPLInfo;
+
+ /// Holds the condition bit values built during VPInstruction to VPRecipe transformation.
+ SmallVector<VPValue *, 4> VPCBVs;
+
+public:
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+ if (Entry)
+ Entry->setPlan(this);
+ }
+
+ ~VPlan() {
if (Entry) {
VPValue DummyValue;
for (VPBlockBase *Block : depth_first(Entry))
Block->dropAllReferences(&DummyValue);
- VPBlockBase::deleteCFG(Entry);
+ VPBlockBase::deleteCFG(Entry);
}
for (VPValue *VPV : VPValuesToFree)
delete VPV;
- if (BackedgeTakenCount)
- delete BackedgeTakenCount;
- for (VPValue *Def : VPExternalDefs)
- delete Def;
- for (VPValue *CBV : VPCBVs)
- delete CBV;
- }
-
- /// Generate the IR code for this VPlan.
- void execute(struct VPTransformState *State);
-
- VPBlockBase *getEntry() { return Entry; }
- const VPBlockBase *getEntry() const { return Entry; }
-
- VPBlockBase *setEntry(VPBlockBase *Block) {
- Entry = Block;
- Block->setPlan(this);
- return Entry;
- }
-
- /// The backedge taken count of the original loop.
- VPValue *getOrCreateBackedgeTakenCount() {
- if (!BackedgeTakenCount)
- BackedgeTakenCount = new VPValue();
- return BackedgeTakenCount;
- }
-
+ if (BackedgeTakenCount)
+ delete BackedgeTakenCount;
+ for (VPValue *Def : VPExternalDefs)
+ delete Def;
+ for (VPValue *CBV : VPCBVs)
+ delete CBV;
+ }
+
+ /// Generate the IR code for this VPlan.
+ void execute(struct VPTransformState *State);
+
+ VPBlockBase *getEntry() { return Entry; }
+ const VPBlockBase *getEntry() const { return Entry; }
+
+ VPBlockBase *setEntry(VPBlockBase *Block) {
+ Entry = Block;
+ Block->setPlan(this);
+ return Entry;
+ }
+
+ /// The backedge taken count of the original loop.
+ VPValue *getOrCreateBackedgeTakenCount() {
+ if (!BackedgeTakenCount)
+ BackedgeTakenCount = new VPValue();
+ return BackedgeTakenCount;
+ }
+
void addVF(ElementCount VF) { VFs.insert(VF); }
-
+
bool hasVF(ElementCount VF) { return VFs.count(VF); }
-
- const std::string &getName() const { return Name; }
-
- void setName(const Twine &newName) { Name = newName.str(); }
-
- /// Add \p VPVal to the pool of external definitions if it's not already
- /// in the pool.
- void addExternalDef(VPValue *VPVal) {
- VPExternalDefs.insert(VPVal);
- }
-
- /// Add \p CBV to the vector of condition bit values.
- void addCBV(VPValue *CBV) {
- VPCBVs.push_back(CBV);
- }
-
- void addVPValue(Value *V) {
- assert(V && "Trying to add a null Value to VPlan");
- assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
+
+ const std::string &getName() const { return Name; }
+
+ void setName(const Twine &newName) { Name = newName.str(); }
+
+ /// Add \p VPVal to the pool of external definitions if it's not already
+ /// in the pool.
+ void addExternalDef(VPValue *VPVal) {
+ VPExternalDefs.insert(VPVal);
+ }
+
+ /// Add \p CBV to the vector of condition bit values.
+ void addCBV(VPValue *CBV) {
+ VPCBVs.push_back(CBV);
+ }
+
+ void addVPValue(Value *V) {
+ assert(V && "Trying to add a null Value to VPlan");
+ assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
VPValue *VPV = new VPValue(V);
Value2VPValue[V] = VPV;
VPValuesToFree.push_back(VPV);
- }
-
+ }
+
void addVPValue(Value *V, VPValue *VPV) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
Value2VPValue[V] = VPV;
}
- VPValue *getVPValue(Value *V) {
- assert(V && "Trying to get the VPValue of a null Value");
- assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
- return Value2VPValue[V];
- }
-
- VPValue *getOrAddVPValue(Value *V) {
- assert(V && "Trying to get or add the VPValue of a null Value");
- if (!Value2VPValue.count(V))
- addVPValue(V);
- return getVPValue(V);
- }
-
+ VPValue *getVPValue(Value *V) {
+ assert(V && "Trying to get the VPValue of a null Value");
+ assert(Value2VPValue.count(V) && "Value does not exist in VPlan");
+ return Value2VPValue[V];
+ }
+
+ VPValue *getOrAddVPValue(Value *V) {
+ assert(V && "Trying to get or add the VPValue of a null Value");
+ if (!Value2VPValue.count(V))
+ addVPValue(V);
+ return getVPValue(V);
+ }
+
void removeVPValueFor(Value *V) { Value2VPValue.erase(V); }
- /// Return the VPLoopInfo analysis for this VPlan.
- VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
- const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
-
- /// Dump the plan to stderr (for debugging).
- void dump() const;
-
- /// Returns a range mapping the values in the range \p Operands to their
- /// corresponding VPValues.
- iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
- mapToVPValues(User::op_range Operands) {
- std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
- return getOrAddVPValue(Op);
- };
- return map_range(Operands, Fn);
- }
-
-private:
- /// Add to the given dominator tree the header block and every new basic block
- /// that was created between it and the latch block, inclusive.
- static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
- BasicBlock *LoopPreHeaderBB,
- BasicBlock *LoopExitBB);
-};
-
-/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
-/// indented and follows the dot format.
-class VPlanPrinter {
- friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan);
- friend inline raw_ostream &operator<<(raw_ostream &OS,
- const struct VPlanIngredient &I);
-
-private:
- raw_ostream &OS;
- const VPlan &Plan;
- unsigned Depth = 0;
- unsigned TabWidth = 2;
- std::string Indent;
- unsigned BID = 0;
- SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
-
- VPSlotTracker SlotTracker;
-
- VPlanPrinter(raw_ostream &O, const VPlan &P)
- : OS(O), Plan(P), SlotTracker(&P) {}
-
- /// Handle indentation.
- void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
-
- /// Print a given \p Block of the Plan.
- void dumpBlock(const VPBlockBase *Block);
-
- /// Print the information related to the CFG edges going out of a given
- /// \p Block, followed by printing the successor blocks themselves.
- void dumpEdges(const VPBlockBase *Block);
-
- /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
- /// its successor blocks.
- void dumpBasicBlock(const VPBasicBlock *BasicBlock);
-
- /// Print a given \p Region of the Plan.
- void dumpRegion(const VPRegionBlock *Region);
-
- unsigned getOrCreateBID(const VPBlockBase *Block) {
- return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
- }
-
- const Twine getOrCreateName(const VPBlockBase *Block);
-
- const Twine getUID(const VPBlockBase *Block);
-
- /// Print the information related to a CFG edge between two VPBlockBases.
- void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
- const Twine &Label);
-
- void dump();
-
+ /// Return the VPLoopInfo analysis for this VPlan.
+ VPLoopInfo &getVPLoopInfo() { return VPLInfo; }
+ const VPLoopInfo &getVPLoopInfo() const { return VPLInfo; }
+
+ /// Dump the plan to stderr (for debugging).
+ void dump() const;
+
+ /// Returns a range mapping the values in the range \p Operands to their
+ /// corresponding VPValues.
+ iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+ mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ return getOrAddVPValue(Op);
+ };
+ return map_range(Operands, Fn);
+ }
+
+private:
+ /// Add to the given dominator tree the header block and every new basic block
+ /// that was created between it and the latch block, inclusive.
+ static void updateDominatorTree(DominatorTree *DT, BasicBlock *LoopLatchBB,
+ BasicBlock *LoopPreHeaderBB,
+ BasicBlock *LoopExitBB);
+};
+
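// Usage sketch (all names hypothetical): a VPlan owns its hierarchical CFG and
// any VPValues created through addVPValue/getOrAddVPValue, so a caller only
// needs to manage the lifetime of the plan object itself.
static std::unique_ptr<VPlan> makeTrivialPlan(Value *LiveIn) {
  auto Plan = std::make_unique<VPlan>(new VPBasicBlock("vector.body"));
  Plan->setName("TrivialPlan");
  Plan->addVF(ElementCount::getFixed(4)); // Record one candidate VF.
  // Wrap an incoming IR value; the plan frees the resulting VPValue.
  (void)Plan->getOrAddVPValue(LiveIn);
  return Plan;
}
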
+/// VPlanPrinter prints a given VPlan to a given output stream. The printing is
+/// indented and follows the dot format.
+class VPlanPrinter {
+ friend inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan);
+ friend inline raw_ostream &operator<<(raw_ostream &OS,
+ const struct VPlanIngredient &I);
+
+private:
+ raw_ostream &OS;
+ const VPlan &Plan;
+ unsigned Depth = 0;
+ unsigned TabWidth = 2;
+ std::string Indent;
+ unsigned BID = 0;
+ SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
+
+ VPSlotTracker SlotTracker;
+
+ VPlanPrinter(raw_ostream &O, const VPlan &P)
+ : OS(O), Plan(P), SlotTracker(&P) {}
+
+ /// Handle indentation.
+ void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
+
+ /// Print a given \p Block of the Plan.
+ void dumpBlock(const VPBlockBase *Block);
+
+ /// Print the information related to the CFG edges going out of a given
+ /// \p Block, followed by printing the successor blocks themselves.
+ void dumpEdges(const VPBlockBase *Block);
+
+ /// Print a given \p BasicBlock, including its VPRecipes, followed by printing
+ /// its successor blocks.
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock);
+
+ /// Print a given \p Region of the Plan.
+ void dumpRegion(const VPRegionBlock *Region);
+
+ unsigned getOrCreateBID(const VPBlockBase *Block) {
+ return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++;
+ }
+
+ const Twine getOrCreateName(const VPBlockBase *Block);
+
+ const Twine getUID(const VPBlockBase *Block);
+
+ /// Print the information related to a CFG edge between two VPBlockBases.
+ void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden,
+ const Twine &Label);
+
+ void dump();
+
static void printAsIngredient(raw_ostream &O, const Value *V);
-};
-
-struct VPlanIngredient {
+};
+
+struct VPlanIngredient {
const Value *V;
-
+
VPlanIngredient(const Value *V) : V(V) {}
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
- VPlanPrinter::printAsIngredient(OS, I.V);
- return OS;
-}
-
-inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
- VPlanPrinter Printer(OS, Plan);
- Printer.dump();
- return OS;
-}
-
-//===----------------------------------------------------------------------===//
-// VPlan Utilities
-//===----------------------------------------------------------------------===//
-
-/// Class that provides utilities for VPBlockBases in VPlan.
-class VPBlockUtils {
-public:
- VPBlockUtils() = delete;
-
- /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
- /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
- /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
- /// has more than one successor, its conditional bit is propagated to \p
- /// NewBlock. \p NewBlock must have neither successors nor predecessors.
- static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
- assert(NewBlock->getSuccessors().empty() &&
- "Can't insert new block with successors.");
- // TODO: move successors from BlockPtr to NewBlock when this functionality
- // is necessary. For now, setOneSuccessor will assert if BlockPtr
- // already has successors.
- BlockPtr->setOneSuccessor(NewBlock);
- NewBlock->setPredecessors({BlockPtr});
- NewBlock->setParent(BlockPtr->getParent());
- }
-
- /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
- /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
- /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
- /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
- /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
- /// must have neither successors nor predecessors.
- static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
- VPValue *Condition, VPBlockBase *BlockPtr) {
- assert(IfTrue->getSuccessors().empty() &&
- "Can't insert IfTrue with successors.");
- assert(IfFalse->getSuccessors().empty() &&
- "Can't insert IfFalse with successors.");
- BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
- IfTrue->setPredecessors({BlockPtr});
- IfFalse->setPredecessors({BlockPtr});
- IfTrue->setParent(BlockPtr->getParent());
- IfFalse->setParent(BlockPtr->getParent());
- }
-
- /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
- /// the successors of \p From and \p From to the predecessors of \p To. Both
- /// VPBlockBases must have the same parent, which can be null. Both
- /// VPBlockBases can be already connected to other VPBlockBases.
- static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
- assert((From->getParent() == To->getParent()) &&
- "Can't connect two block with different parents");
- assert(From->getNumSuccessors() < 2 &&
- "Blocks can't have more than two successors.");
- From->appendSuccessor(To);
- To->appendPredecessor(From);
- }
-
- /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
- /// from the successors of \p From and \p From from the predecessors of \p To.
- static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
- assert(To && "Successor to disconnect is null.");
- From->removeSuccessor(To);
- To->removePredecessor(From);
- }
-
- /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
- static bool isBackEdge(const VPBlockBase *FromBlock,
- const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
- assert(FromBlock->getParent() == ToBlock->getParent() &&
- FromBlock->getParent() && "Must be in same region");
- const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
- const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
- if (!FromLoop || !ToLoop || FromLoop != ToLoop)
- return false;
-
- // A back-edge is a branch from the loop latch to its header.
- return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
- }
-
- /// Returns true if \p Block is a loop latch
- static bool blockIsLoopLatch(const VPBlockBase *Block,
- const VPLoopInfo *VPLInfo) {
- if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
- return ParentVPL->isLoopLatch(Block);
-
- return false;
- }
-
- /// Count and return the number of successors of \p PredBlock excluding any
- /// backedges.
- static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
- VPLoopInfo *VPLI) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
- if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
- Count++;
- }
- return Count;
- }
-};
-
-class VPInterleavedAccessInfo {
- DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
- InterleaveGroupMap;
-
- /// Type for mapping of instruction based interleave groups to VPInstruction
- /// interleave groups
- using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
- InterleaveGroup<VPInstruction> *>;
-
- /// Recursively traverse \p Region and populate VPlan based interleave groups
- /// based on \p IAI.
- void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI);
- /// Recursively traverse \p Block and populate VPlan based interleave groups
- /// based on \p IAI.
- void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
- InterleavedAccessInfo &IAI);
-
-public:
- VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
-
- ~VPInterleavedAccessInfo() {
- SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
- // Avoid releasing a pointer twice.
- for (auto &I : InterleaveGroupMap)
- DelSet.insert(I.second);
- for (auto *Ptr : DelSet)
- delete Ptr;
- }
-
- /// Get the interleave group that \p Instr belongs to.
- ///
- /// \returns nullptr if \p Instr doesn't belong to such a group.
- InterleaveGroup<VPInstruction> *
- getInterleaveGroup(VPInstruction *Instr) const {
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlanIngredient &I) {
+ VPlanPrinter::printAsIngredient(OS, I.V);
+ return OS;
+}
+
+inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) {
+ VPlanPrinter Printer(OS, Plan);
+ Printer.dump();
+ return OS;
+}
+
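// Usage sketch: with the stream operators above, rendering a candidate plan in
// dot form is a one-liner (the helper name is hypothetical; VPlan::dump()
// provides similar output on stderr for debugging).
static void printPlanInDotFormat(raw_ostream &OS, const VPlan &Plan) {
  OS << Plan; // Delegates to VPlanPrinter, which walks the hierarchical CFG.
}
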
+//===----------------------------------------------------------------------===//
+// VPlan Utilities
+//===----------------------------------------------------------------------===//
+
+/// Class that provides utilities for VPBlockBases in VPlan.
+class VPBlockUtils {
+public:
+ VPBlockUtils() = delete;
+
+ /// Insert disconnected VPBlockBase \p NewBlock after \p BlockPtr. Add \p
+ /// NewBlock as successor of \p BlockPtr and \p BlockPtr as predecessor of \p
+ /// NewBlock, and propagate \p BlockPtr parent to \p NewBlock. If \p BlockPtr
+ /// has more than one successor, its conditional bit is propagated to \p
+ /// NewBlock. \p NewBlock must have neither successors nor predecessors.
+ static void insertBlockAfter(VPBlockBase *NewBlock, VPBlockBase *BlockPtr) {
+ assert(NewBlock->getSuccessors().empty() &&
+ "Can't insert new block with successors.");
+ // TODO: move successors from BlockPtr to NewBlock when this functionality
+ // is necessary. For now, setOneSuccessor will assert if BlockPtr
+ // already has successors.
+ BlockPtr->setOneSuccessor(NewBlock);
+ NewBlock->setPredecessors({BlockPtr});
+ NewBlock->setParent(BlockPtr->getParent());
+ }
+
+ /// Insert disconnected VPBlockBases \p IfTrue and \p IfFalse after \p
+ /// BlockPtr. Add \p IfTrue and \p IfFalse as successors of \p BlockPtr and \p
+ /// BlockPtr as predecessor of \p IfTrue and \p IfFalse. Propagate \p BlockPtr
+ /// parent to \p IfTrue and \p IfFalse. \p Condition is set as the successor
+ /// selector. \p BlockPtr must have no successors and \p IfTrue and \p IfFalse
+ /// must have neither successors nor predecessors.
+ static void insertTwoBlocksAfter(VPBlockBase *IfTrue, VPBlockBase *IfFalse,
+ VPValue *Condition, VPBlockBase *BlockPtr) {
+ assert(IfTrue->getSuccessors().empty() &&
+ "Can't insert IfTrue with successors.");
+ assert(IfFalse->getSuccessors().empty() &&
+ "Can't insert IfFalse with successors.");
+ BlockPtr->setTwoSuccessors(IfTrue, IfFalse, Condition);
+ IfTrue->setPredecessors({BlockPtr});
+ IfFalse->setPredecessors({BlockPtr});
+ IfTrue->setParent(BlockPtr->getParent());
+ IfFalse->setParent(BlockPtr->getParent());
+ }
+
+ /// Connect VPBlockBases \p From and \p To bi-directionally. Append \p To to
+ /// the successors of \p From and \p From to the predecessors of \p To. Both
+ /// VPBlockBases must have the same parent, which can be null. Both
+ /// VPBlockBases can be already connected to other VPBlockBases.
+ static void connectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert((From->getParent() == To->getParent()) &&
+ "Can't connect two block with different parents");
+ assert(From->getNumSuccessors() < 2 &&
+ "Blocks can't have more than two successors.");
+ From->appendSuccessor(To);
+ To->appendPredecessor(From);
+ }
+
+ /// Disconnect VPBlockBases \p From and \p To bi-directionally. Remove \p To
+ /// from the successors of \p From and \p From from the predecessors of \p To.
+ static void disconnectBlocks(VPBlockBase *From, VPBlockBase *To) {
+ assert(To && "Successor to disconnect is null.");
+ From->removeSuccessor(To);
+ To->removePredecessor(From);
+ }
+
+ /// Returns true if the edge \p FromBlock -> \p ToBlock is a back-edge.
+ static bool isBackEdge(const VPBlockBase *FromBlock,
+ const VPBlockBase *ToBlock, const VPLoopInfo *VPLI) {
+ assert(FromBlock->getParent() == ToBlock->getParent() &&
+ FromBlock->getParent() && "Must be in same region");
+ const VPLoop *FromLoop = VPLI->getLoopFor(FromBlock);
+ const VPLoop *ToLoop = VPLI->getLoopFor(ToBlock);
+ if (!FromLoop || !ToLoop || FromLoop != ToLoop)
+ return false;
+
+ // A back-edge is a branch from the loop latch to its header.
+ return ToLoop->isLoopLatch(FromBlock) && ToBlock == ToLoop->getHeader();
+ }
+
+ /// Returns true if \p Block is a loop latch
+ static bool blockIsLoopLatch(const VPBlockBase *Block,
+ const VPLoopInfo *VPLInfo) {
+ if (const VPLoop *ParentVPL = VPLInfo->getLoopFor(Block))
+ return ParentVPL->isLoopLatch(Block);
+
+ return false;
+ }
+
+ /// Count and return the number of successors of \p PredBlock excluding any
+ /// backedges.
+ static unsigned countSuccessorsNoBE(VPBlockBase *PredBlock,
+ VPLoopInfo *VPLI) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : PredBlock->getSuccessors()) {
+ if (!VPBlockUtils::isBackEdge(PredBlock, SuccBlock, VPLI))
+ Count++;
+ }
+ return Count;
+ }
+};
+
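// Usage sketch (hypothetical helper): stitching a two-way split followed by a
// join using the utilities above. All blocks are assumed to be newly created
// and disconnected, so the parent-matching asserts in connectBlocks hold once
// the merge block has been adopted into the same parent region.
static void buildDiamond(VPBlockBase *BlockPtr, VPValue *Condition,
                         VPBasicBlock *IfTrue, VPBasicBlock *IfFalse,
                         VPBasicBlock *Merge) {
  // Gives BlockPtr two successors selected by Condition and re-parents them.
  VPBlockUtils::insertTwoBlocksAfter(IfTrue, IfFalse, Condition, BlockPtr);
  Merge->setParent(BlockPtr->getParent());
  VPBlockUtils::connectBlocks(IfTrue, Merge);
  VPBlockUtils::connectBlocks(IfFalse, Merge);
}
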
+class VPInterleavedAccessInfo {
+ DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
+ InterleaveGroupMap;
+
+ /// Type for mapping of instruction based interleave groups to VPInstruction
+ /// interleave groups
+ using Old2NewTy = DenseMap<InterleaveGroup<Instruction> *,
+ InterleaveGroup<VPInstruction> *>;
+
+ /// Recursively traverse \p Region and populate VPlan based interleave groups
+ /// based on \p IAI.
+ void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+ /// Recursively traverse \p Block and populate VPlan based interleave groups
+ /// based on \p IAI.
+ void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
+ InterleavedAccessInfo &IAI);
+
+public:
+ VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI);
+
+ ~VPInterleavedAccessInfo() {
+ SmallPtrSet<InterleaveGroup<VPInstruction> *, 4> DelSet;
+ // Avoid releasing a pointer twice.
+ for (auto &I : InterleaveGroupMap)
+ DelSet.insert(I.second);
+ for (auto *Ptr : DelSet)
+ delete Ptr;
+ }
+
+ /// Get the interleave group that \p Instr belongs to.
+ ///
+ /// \returns nullptr if \p Instr doesn't belong to such a group.
+ InterleaveGroup<VPInstruction> *
+ getInterleaveGroup(VPInstruction *Instr) const {
return InterleaveGroupMap.lookup(Instr);
- }
-};
-
-/// Class that maps (parts of) an existing VPlan to trees of combined
-/// VPInstructions.
-class VPlanSlp {
- enum class OpMode { Failed, Load, Opcode };
-
- /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
- /// DenseMap keys.
- struct BundleDenseMapInfo {
- static SmallVector<VPValue *, 4> getEmptyKey() {
- return {reinterpret_cast<VPValue *>(-1)};
- }
-
- static SmallVector<VPValue *, 4> getTombstoneKey() {
- return {reinterpret_cast<VPValue *>(-2)};
- }
-
- static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
- return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
- }
-
- static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
- const SmallVector<VPValue *, 4> &RHS) {
- return LHS == RHS;
- }
- };
-
- /// Mapping of values in the original VPlan to a combined VPInstruction.
- DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
- BundleToCombined;
-
- VPInterleavedAccessInfo &IAI;
-
- /// Basic block to operate on. For now, only instructions in a single BB are
- /// considered.
- const VPBasicBlock &BB;
-
- /// Indicates whether we managed to combine all visited instructions or not.
- bool CompletelySLP = true;
-
- /// Width of the widest combined bundle in bits.
- unsigned WidestBundleBits = 0;
-
- using MultiNodeOpTy =
- typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
-
- // Input operand bundles for the current multi node. Each multi node operand
- // bundle contains values not matching the multi node's opcode. They will
- // be reordered in reorderMultiNodeOps, once we have completed building a
- // multi node.
- SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
-
- /// Indicates whether we are building a multi node currently.
- bool MultiNodeActive = false;
-
- /// Check if we can vectorize Operands together.
- bool areVectorizable(ArrayRef<VPValue *> Operands) const;
-
- /// Add combined instruction \p New for the bundle \p Operands.
- void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
-
- /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
- VPInstruction *markFailed();
-
- /// Reorder operands in the multi node to maximize sequential memory access
- /// and commutative operations.
- SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
-
- /// Choose the best candidate to use for the lane after \p Last. The set of
- /// candidates to choose from are values with an opcode matching \p Last's
- /// or loads consecutive to \p Last.
- std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
- SmallPtrSetImpl<VPValue *> &Candidates,
- VPInterleavedAccessInfo &IAI);
-
- /// Print bundle \p Values to dbgs().
- void dumpBundle(ArrayRef<VPValue *> Values);
-
-public:
- VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
-
+ }
+};
+
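// Usage sketch: a thin query over the mapping maintained above; the helper
// name is hypothetical.
static bool isInterleaved(const VPInterleavedAccessInfo &VPIAI,
                          VPInstruction *VPInst) {
  // getInterleaveGroup returns nullptr for instructions outside any group.
  return VPIAI.getInterleaveGroup(VPInst) != nullptr;
}
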
+/// Class that maps (parts of) an existing VPlan to trees of combined
+/// VPInstructions.
+class VPlanSlp {
+ enum class OpMode { Failed, Load, Opcode };
+
+ /// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
+ /// DenseMap keys.
+ struct BundleDenseMapInfo {
+ static SmallVector<VPValue *, 4> getEmptyKey() {
+ return {reinterpret_cast<VPValue *>(-1)};
+ }
+
+ static SmallVector<VPValue *, 4> getTombstoneKey() {
+ return {reinterpret_cast<VPValue *>(-2)};
+ }
+
+ static unsigned getHashValue(const SmallVector<VPValue *, 4> &V) {
+ return static_cast<unsigned>(hash_combine_range(V.begin(), V.end()));
+ }
+
+ static bool isEqual(const SmallVector<VPValue *, 4> &LHS,
+ const SmallVector<VPValue *, 4> &RHS) {
+ return LHS == RHS;
+ }
+ };
+
+ /// Mapping of values in the original VPlan to a combined VPInstruction.
+ DenseMap<SmallVector<VPValue *, 4>, VPInstruction *, BundleDenseMapInfo>
+ BundleToCombined;
+
+ VPInterleavedAccessInfo &IAI;
+
+ /// Basic block to operate on. For now, only instructions in a single BB are
+ /// considered.
+ const VPBasicBlock &BB;
+
+ /// Indicates whether we managed to combine all visited instructions or not.
+ bool CompletelySLP = true;
+
+ /// Width of the widest combined bundle in bits.
+ unsigned WidestBundleBits = 0;
+
+ using MultiNodeOpTy =
+ typename std::pair<VPInstruction *, SmallVector<VPValue *, 4>>;
+
+ // Input operand bundles for the current multi node. Each multi node operand
+ // bundle contains values not matching the multi node's opcode. They will
+ // be reordered in reorderMultiNodeOps, once we have completed building a
+ // multi node.
+ SmallVector<MultiNodeOpTy, 4> MultiNodeOps;
+
+ /// Indicates whether we are building a multi node currently.
+ bool MultiNodeActive = false;
+
+ /// Check if we can vectorize Operands together.
+ bool areVectorizable(ArrayRef<VPValue *> Operands) const;
+
+ /// Add combined instruction \p New for the bundle \p Operands.
+ void addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New);
+
+ /// Indicate we hit a bundle we failed to combine. Returns nullptr for now.
+ VPInstruction *markFailed();
+
+ /// Reorder operands in the multi node to maximize sequential memory access
+ /// and commutative operations.
+ SmallVector<MultiNodeOpTy, 4> reorderMultiNodeOps();
+
+ /// Choose the best candidate to use for the lane after \p Last. The set of
+ /// candidates to choose from are values with an opcode matching \p Last's
+ /// or loads consecutive to \p Last.
+ std::pair<OpMode, VPValue *> getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI);
+
+ /// Print bundle \p Values to dbgs().
+ void dumpBundle(ArrayRef<VPValue *> Values);
+
+public:
+ VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {}
+
~VPlanSlp() = default;
-
- /// Tries to build an SLP tree rooted at \p Operands and returns a
- /// VPInstruction combining \p Operands, if they can be combined.
- VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
-
- /// Return the width of the widest combined bundle in bits.
- unsigned getWidestBundleBits() const { return WidestBundleBits; }
-
- /// Return true if all visited instructions can be combined.
- bool isCompletelySLP() const { return CompletelySLP; }
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
+
+ /// Tries to build an SLP tree rooted at \p Operands and returns a
+ /// VPInstruction combining \p Operands, if they can be combined.
+ VPInstruction *buildGraph(ArrayRef<VPValue *> Operands);
+
+ /// Return the width of the widest combined bundle in bits.
+ unsigned getWidestBundleBits() const { return WidestBundleBits; }
+
+ /// Return true if all visited instructions can be combined.
+ bool isCompletelySLP() const { return CompletelySLP; }
+};
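
// Usage sketch (hypothetical helper): driving VPlanSlp over a seed bundle
// taken from a single VPBasicBlock. buildGraph returns the combined
// VPInstruction, and may return a failure marker treated here as "no SLP tree".
static VPInstruction *trySLPBundle(VPInterleavedAccessInfo &IAI,
                                   VPBasicBlock &BB,
                                   ArrayRef<VPValue *> SeedBundle) {
  VPlanSlp Slp(IAI, BB);
  VPInstruction *Combined = Slp.buildGraph(SeedBundle);
  if (!Combined || !Slp.isCompletelySLP())
    return nullptr;
  return Combined;
}
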
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 2087e620f7..a42ebc9ee9 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -1,41 +1,41 @@
-//===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements dominator tree analysis for a single level of a VPlan's
-/// H-CFG.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
-
-#include "VPlan.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/IR/Dominators.h"
-
-namespace llvm {
-
-/// Template specialization of the standard LLVM dominator tree utility for
-/// VPBlockBases.
-using VPDominatorTree = DomTreeBase<VPBlockBase>;
-
-using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
-
-/// Template specializations of GraphTraits for VPDomTreeNode.
-template <>
-struct GraphTraits<VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<VPDomTreeNode,
- VPDomTreeNode::const_iterator> {};
-
-template <>
-struct GraphTraits<const VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<const VPDomTreeNode,
- VPDomTreeNode::const_iterator> {};
-} // namespace llvm
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+//===-- VPlanDominatorTree.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements dominator tree analysis for a single level of a VPlan's
+/// H-CFG.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
+
+#include "VPlan.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/IR/Dominators.h"
+
+namespace llvm {
+
+/// Template specialization of the standard LLVM dominator tree utility for
+/// VPBlockBases.
+using VPDominatorTree = DomTreeBase<VPBlockBase>;
+
+using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
+
+/// Template specializations of GraphTraits for VPDomTreeNode.
+template <>
+struct GraphTraits<VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
+
+template <>
+struct GraphTraits<const VPDomTreeNode *>
+ : public DomTreeGraphTraitsBase<const VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
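
// Usage sketch (hypothetical helper): dominance between two sibling
// VPBlockBases of the same region, assuming the generic dominator-tree
// construction templates are available in the translation unit.
static bool regionDominates(VPRegionBlock *Region, VPBlockBase *A,
                            VPBlockBase *B) {
  VPDominatorTree VPDT;
  VPDT.recalculate(*Region); // Builds the tree for one level of the H-CFG.
  return VPDT.dominates(A, B);
}
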
+} // namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANDOMINATORTREE_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
index f54b8958ae..df96f67288 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp
@@ -1,354 +1,354 @@
-//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the construction of a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR. This construction comprises the following
-/// components and steps:
-//
-/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
-/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
-/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
-/// in the plain CFG.
-/// NOTE: At this point, there is a direct correspondence between all the
-/// VPBasicBlocks created for the initial plain CFG and the incoming
-/// BasicBlocks. However, this might change in the future.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanHCFGBuilder.h"
-#include "LoopVectorizationPlanner.h"
-#include "llvm/Analysis/LoopIterator.h"
-
-#define DEBUG_TYPE "loop-vectorize"
-
-using namespace llvm;
-
-namespace {
-// Class that is used to build the plain CFG for the incoming IR.
-class PlainCFGBuilder {
-private:
- // The outermost loop of the input loop nest considered for vectorization.
- Loop *TheLoop;
-
- // Loop Info analysis.
- LoopInfo *LI;
-
- // Vectorization plan that we are working on.
- VPlan &Plan;
-
- // Output Top Region.
- VPRegionBlock *TopRegion = nullptr;
-
- // Builder of the VPlan instruction-level representation.
- VPBuilder VPIRBuilder;
-
- // NOTE: The following maps are intentionally destroyed after the plain CFG
- // construction because subsequent VPlan-to-VPlan transformation may
- // invalidate them.
- // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
- DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
- // Map incoming Value definitions to their newly-created VPValues.
- DenseMap<Value *, VPValue *> IRDef2VPValue;
-
- // Hold phi nodes that need to be fixed once the plain CFG has been built.
- SmallVector<PHINode *, 8> PhisToFix;
-
- // Utility functions.
- void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
- void fixPhiNodes();
- VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
-#ifndef NDEBUG
- bool isExternalDef(Value *Val);
-#endif
- VPValue *getOrCreateVPOperand(Value *IRVal);
- void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
-
-public:
- PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
- : TheLoop(Lp), LI(LI), Plan(P) {}
-
- // Build the plain CFG and return its Top Region.
- VPRegionBlock *buildPlainCFG();
-};
-} // anonymous namespace
-
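// Usage sketch (hypothetical free function): a caller drives the builder by
// constructing it over the outermost loop and installing the resulting Top
// Region as the VPlan entry.
static VPRegionBlock *buildPlainCFGFor(Loop *Lp, LoopInfo *LI, VPlan &Plan) {
  PlainCFGBuilder PCFGBuilder(Lp, LI, Plan);
  VPRegionBlock *TopRegion = PCFGBuilder.buildPlainCFG();
  Plan.setEntry(TopRegion);
  return TopRegion;
}
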
-// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
-// must have no predecessors.
-void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
- SmallVector<VPBlockBase *, 8> VPBBPreds;
- // Collect VPBB predecessors.
- for (BasicBlock *Pred : predecessors(BB))
- VPBBPreds.push_back(getOrCreateVPBB(Pred));
-
- VPBB->setPredecessors(VPBBPreds);
-}
-
-// Add operands to VPInstructions representing phi nodes from the input IR.
-void PlainCFGBuilder::fixPhiNodes() {
- for (auto *Phi : PhisToFix) {
- assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
- VPValue *VPVal = IRDef2VPValue[Phi];
- assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
- auto *VPPhi = cast<VPInstruction>(VPVal);
- assert(VPPhi->getNumOperands() == 0 &&
- "Expected VPInstruction with no operands.");
-
- for (Value *Op : Phi->operands())
- VPPhi->addOperand(getOrCreateVPOperand(Op));
- }
-}
-
-// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
-// existing one if it was already created.
-VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
- auto BlockIt = BB2VPBB.find(BB);
- if (BlockIt != BB2VPBB.end())
- // Retrieve existing VPBB.
- return BlockIt->second;
-
- // Create new VPBB.
- LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
- VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
- BB2VPBB[BB] = VPBB;
- VPBB->setParent(TopRegion);
- return VPBB;
-}
-
-#ifndef NDEBUG
-// Return true if \p Val is considered an external definition. An external
-// definition is either:
-// 1. A Value that is not an Instruction. This will be refined in the future.
-// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
-// i.e., is not part of: a) the loop nest, b) outermost loop PH, and c)
-// outermost loop exits.
-bool PlainCFGBuilder::isExternalDef(Value *Val) {
- // All the Values that are not Instructions are considered external
- // definitions for now.
- Instruction *Inst = dyn_cast<Instruction>(Val);
- if (!Inst)
- return true;
-
- BasicBlock *InstParent = Inst->getParent();
- assert(InstParent && "Expected instruction parent.");
-
- // Check whether Instruction definition is in loop PH.
- BasicBlock *PH = TheLoop->getLoopPreheader();
- assert(PH && "Expected loop pre-header.");
-
- if (InstParent == PH)
- // Instruction definition is in outermost loop PH.
- return false;
-
- // Check whether Instruction definition is in the loop exit.
- BasicBlock *Exit = TheLoop->getUniqueExitBlock();
- assert(Exit && "Expected loop with single exit.");
- if (InstParent == Exit) {
- // Instruction definition is in outermost loop exit.
- return false;
- }
-
- // Check whether Instruction definition is in loop body.
- return !TheLoop->contains(Inst);
-}
-#endif
-
-// Create a new VPValue or retrieve an existing one for the Instruction's
-// operand \p IRVal. This function must only be used to create/retrieve VPValues
-// for *Instruction's operands* and not to create regular VPInstructions. For
-// the latter, please, look at 'createVPInstructionsForVPBB'.
-VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
- auto VPValIt = IRDef2VPValue.find(IRVal);
- if (VPValIt != IRDef2VPValue.end())
- // Operand has an associated VPInstruction or VPValue that was previously
- // created.
- return VPValIt->second;
-
- // Operand doesn't have a previously created VPInstruction/VPValue. This
- // means that operand is:
- // A) a definition external to VPlan,
- // B) any other Value without specific representation in VPlan.
- // For now, we use VPValue to represent A and B and classify both as external
- // definitions. We may introduce specific VPValue subclasses for them in the
- // future.
- assert(isExternalDef(IRVal) && "Expected external definition as operand.");
-
- // A and B: Create VPValue and add it to the pool of external definitions and
- // to the Value->VPValue map.
- VPValue *NewVPVal = new VPValue(IRVal);
- Plan.addExternalDef(NewVPVal);
- IRDef2VPValue[IRVal] = NewVPVal;
- return NewVPVal;
-}
-
-// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
-// counterpart. This function must be invoked in RPO so that the operands of a
-// VPInstruction in \p BB have been visited before (except for Phi nodes).
-void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
- BasicBlock *BB) {
- VPIRBuilder.setInsertPoint(VPBB);
- for (Instruction &InstRef : *BB) {
- Instruction *Inst = &InstRef;
-
- // There shouldn't be any VPValue for Inst at this point. Otherwise, we
- // visited Inst when we shouldn't, breaking the RPO traversal order.
- assert(!IRDef2VPValue.count(Inst) &&
- "Instruction shouldn't have been visited.");
-
- if (auto *Br = dyn_cast<BranchInst>(Inst)) {
- // Branch instruction is not explicitly represented in VPlan but we need
- // to represent its condition bit when it's conditional.
- if (Br->isConditional())
- getOrCreateVPOperand(Br->getCondition());
-
- // Skip the rest of the Instruction processing for Branch instructions.
- continue;
- }
-
- VPInstruction *NewVPInst;
- if (auto *Phi = dyn_cast<PHINode>(Inst)) {
- // Phi node operands may not have been visited at this point. We create
- // an empty VPInstruction that we will fix once the whole plain CFG has
- // been built.
- NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
- Inst->getOpcode(), {} /*No operands*/, Inst));
- PhisToFix.push_back(Phi);
- } else {
- // Translate LLVM-IR operands into VPValue operands and set them in the
- // new VPInstruction.
- SmallVector<VPValue *, 4> VPOperands;
- for (Value *Op : Inst->operands())
- VPOperands.push_back(getOrCreateVPOperand(Op));
-
- // Build VPInstruction for any arbitrary Instruction without specific
- // representation in VPlan.
- NewVPInst = cast<VPInstruction>(
- VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
- }
-
- IRDef2VPValue[Inst] = NewVPInst;
- }
-}
-
-// Main interface to build the plain CFG.
-VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
- // 1. Create the Top Region. It will be the parent of all VPBBs.
- TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
-
- // 2. Scan the body of the loop in a topological order to visit each basic
- // block after having visited its predecessor basic blocks. Create a VPBB for
- // each BB and link it to its successor and predecessor VPBBs. Note that
- // predecessors must be set in the same order as they are in the incoming IR.
- // Otherwise, there might be problems with existing phi nodes and algorithms
- // based on predecessor traversal.
-
- // Loop PH needs to be explicitly visited since it's not taken into account by
- // LoopBlocksDFS.
- BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
- assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
- "Unexpected loop preheader");
- VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
- createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
- // Create empty VPBB for Loop H so that we can link PH->H.
- VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
- // Preheader's predecessors will be set during the loop RPO traversal below.
- PreheaderVPBB->setOneSuccessor(HeaderVPBB);
-
- LoopBlocksRPO RPO(TheLoop);
- RPO.perform(LI);
-
- for (BasicBlock *BB : RPO) {
- // Create or retrieve the VPBasicBlock for this BB and create its
- // VPInstructions.
- VPBasicBlock *VPBB = getOrCreateVPBB(BB);
- createVPInstructionsForVPBB(VPBB, BB);
-
- // Set VPBB successors. We create empty VPBBs for successors if they don't
- // exist already. Recipes will be created when the successor is visited
- // during the RPO traversal.
- Instruction *TI = BB->getTerminator();
- assert(TI && "Terminator expected.");
- unsigned NumSuccs = TI->getNumSuccessors();
-
- if (NumSuccs == 1) {
- VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB && "VPBB Successor not found.");
- VPBB->setOneSuccessor(SuccVPBB);
- } else if (NumSuccs == 2) {
- VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
- assert(SuccVPBB0 && "Successor 0 not found.");
- VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
- assert(SuccVPBB1 && "Successor 1 not found.");
-
- // Get VPBB's condition bit.
- assert(isa<BranchInst>(TI) && "Unsupported terminator!");
- auto *Br = cast<BranchInst>(TI);
- Value *BrCond = Br->getCondition();
- // Look up the branch condition to get the corresponding VPValue
- // representing the condition bit in VPlan (which may be in another VPBB).
- assert(IRDef2VPValue.count(BrCond) &&
- "Missing condition bit in IRDef2VPValue!");
- VPValue *VPCondBit = IRDef2VPValue[BrCond];
-
- // Link successors using condition bit.
- VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
- } else
- llvm_unreachable("Number of successors not supported.");
-
- // Set VPBB predecessors in the same order as they are in the incoming BB.
- setVPBBPredsFromBB(VPBB, BB);
- }
-
- // 3. Process outermost loop exit. We created an empty VPBB for the loop
- // single exit BB during the RPO traversal of the loop body but Instructions
- // weren't visited because it's not part of the loop.
- BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
- assert(LoopExitBB && "Loops with multiple exits are not supported.");
- VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
- createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
- // Loop exit was already set as successor of the loop exiting BB.
- // We only set its predecessor VPBB now.
- setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
-
- // 4. The whole CFG has been built at this point so all the input Values must
- // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
- // VPlan operands.
- fixPhiNodes();
-
- // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
- // Top Region entry and exit.
- TopRegion->setEntry(PreheaderVPBB);
- TopRegion->setExit(LoopExitVPBB);
- return TopRegion;
-}
-
-VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
- PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
- return PCFGBuilder.buildPlainCFG();
-}
-
-// Public interface to build a H-CFG.
-void VPlanHCFGBuilder::buildHierarchicalCFG() {
- // Build Top Region enclosing the plain CFG and set it as VPlan entry.
- VPRegionBlock *TopRegion = buildPlainCFG();
- Plan.setEntry(TopRegion);
- LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
-
- Verifier.verifyHierarchicalCFG(TopRegion);
-
- // Compute plain CFG dom tree for VPLInfo.
- VPDomTree.recalculate(*TopRegion);
- LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
- VPDomTree.print(dbgs()));
-
- // Compute VPLInfo and keep it in Plan.
- VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
- VPLInfo.analyze(VPDomTree);
- LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
- VPLInfo.print(dbgs()));
-}
+//===-- VPlanHCFGBuilder.cpp ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the construction of a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR. This construction comprises the following
+/// components and steps:
+//
+/// 1. PlainCFGBuilder class: builds a plain VPBasicBlock-based CFG that
+/// faithfully represents the CFG in the incoming IR. A VPRegionBlock (Top
+/// Region) is created to enclose and serve as parent of all the VPBasicBlocks
+/// in the plain CFG.
+/// NOTE: At this point, there is a direct correspondence between all the
+/// VPBasicBlocks created for the initial plain CFG and the incoming
+/// BasicBlocks. However, this might change in the future.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanHCFGBuilder.h"
+#include "LoopVectorizationPlanner.h"
+#include "llvm/Analysis/LoopIterator.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+namespace {
+// Class that is used to build the plain CFG for the incoming IR.
+class PlainCFGBuilder {
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // Vectorization plan that we are working on.
+ VPlan &Plan;
+
+ // Output Top Region.
+ VPRegionBlock *TopRegion = nullptr;
+
+ // Builder of the VPlan instruction-level representation.
+ VPBuilder VPIRBuilder;
+
+ // NOTE: The following maps are intentionally destroyed after the plain CFG
+ // construction because subsequent VPlan-to-VPlan transformation may
+ // invalidate them.
+ // Map incoming BasicBlocks to their newly-created VPBasicBlocks.
+ DenseMap<BasicBlock *, VPBasicBlock *> BB2VPBB;
+ // Map incoming Value definitions to their newly-created VPValues.
+ DenseMap<Value *, VPValue *> IRDef2VPValue;
+
+ // Hold phi nodes that need to be fixed once the plain CFG has been built.
+ SmallVector<PHINode *, 8> PhisToFix;
+
+ // Utility functions.
+ void setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB);
+ void fixPhiNodes();
+ VPBasicBlock *getOrCreateVPBB(BasicBlock *BB);
+#ifndef NDEBUG
+ bool isExternalDef(Value *Val);
+#endif
+ VPValue *getOrCreateVPOperand(Value *IRVal);
+ void createVPInstructionsForVPBB(VPBasicBlock *VPBB, BasicBlock *BB);
+
+public:
+ PlainCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ // Build the plain CFG and return its Top Region.
+ VPRegionBlock *buildPlainCFG();
+};
+} // anonymous namespace
+
+// Set predecessors of \p VPBB in the same order as they are in \p BB. \p VPBB
+// must have no predecessors.
+void PlainCFGBuilder::setVPBBPredsFromBB(VPBasicBlock *VPBB, BasicBlock *BB) {
+ SmallVector<VPBlockBase *, 8> VPBBPreds;
+ // Collect VPBB predecessors.
+ for (BasicBlock *Pred : predecessors(BB))
+ VPBBPreds.push_back(getOrCreateVPBB(Pred));
+
+ VPBB->setPredecessors(VPBBPreds);
+}
+
+// Add operands to VPInstructions representing phi nodes from the input IR.
+void PlainCFGBuilder::fixPhiNodes() {
+ for (auto *Phi : PhisToFix) {
+ assert(IRDef2VPValue.count(Phi) && "Missing VPInstruction for PHINode.");
+ VPValue *VPVal = IRDef2VPValue[Phi];
+ assert(isa<VPInstruction>(VPVal) && "Expected VPInstruction for phi node.");
+ auto *VPPhi = cast<VPInstruction>(VPVal);
+ assert(VPPhi->getNumOperands() == 0 &&
+ "Expected VPInstruction with no operands.");
+
+ for (Value *Op : Phi->operands())
+ VPPhi->addOperand(getOrCreateVPOperand(Op));
+ }
+}
+
+// Create a new empty VPBasicBlock for an incoming BasicBlock or retrieve an
+// existing one if it was already created.
+VPBasicBlock *PlainCFGBuilder::getOrCreateVPBB(BasicBlock *BB) {
+ auto BlockIt = BB2VPBB.find(BB);
+ if (BlockIt != BB2VPBB.end())
+ // Retrieve existing VPBB.
+ return BlockIt->second;
+
+ // Create new VPBB.
+ LLVM_DEBUG(dbgs() << "Creating VPBasicBlock for " << BB->getName() << "\n");
+ VPBasicBlock *VPBB = new VPBasicBlock(BB->getName());
+ BB2VPBB[BB] = VPBB;
+ VPBB->setParent(TopRegion);
+ return VPBB;
+}
+
+#ifndef NDEBUG
+// Return true if \p Val is considered an external definition. An external
+// definition is either:
+// 1. A Value that is not an Instruction. This will be refined in the future.
+// 2. An Instruction that is outside of the CFG snippet represented in VPlan,
+// i.e., is not part of: a) the loop nest, b) outermost loop PH, and c)
+// outermost loop exits.
+bool PlainCFGBuilder::isExternalDef(Value *Val) {
+ // All the Values that are not Instructions are considered external
+ // definitions for now.
+ Instruction *Inst = dyn_cast<Instruction>(Val);
+ if (!Inst)
+ return true;
+
+ BasicBlock *InstParent = Inst->getParent();
+ assert(InstParent && "Expected instruction parent.");
+
+ // Check whether Instruction definition is in loop PH.
+ BasicBlock *PH = TheLoop->getLoopPreheader();
+ assert(PH && "Expected loop pre-header.");
+
+ if (InstParent == PH)
+ // Instruction definition is in outermost loop PH.
+ return false;
+
+ // Check whether Instruction definition is in the loop exit.
+ BasicBlock *Exit = TheLoop->getUniqueExitBlock();
+ assert(Exit && "Expected loop with single exit.");
+ if (InstParent == Exit) {
+ // Instruction definition is in outermost loop exit.
+ return false;
+ }
+
+ // Check whether Instruction definition is in loop body.
+ return !TheLoop->contains(Inst);
+}
+#endif
+
+// Create a new VPValue or retrieve an existing one for the Instruction's
+// operand \p IRVal. This function must only be used to create/retrieve VPValues
+// for *Instruction's operands* and not to create regular VPInstructions. For
+// the latter, please, look at 'createVPInstructionsForVPBB'.
+VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
+ auto VPValIt = IRDef2VPValue.find(IRVal);
+ if (VPValIt != IRDef2VPValue.end())
+ // Operand has an associated VPInstruction or VPValue that was previously
+ // created.
+ return VPValIt->second;
+
+ // Operand doesn't have a previously created VPInstruction/VPValue. This
+ // means that operand is:
+ // A) a definition external to VPlan,
+ // B) any other Value without specific representation in VPlan.
+ // For now, we use VPValue to represent A and B and classify both as external
+ // definitions. We may introduce specific VPValue subclasses for them in the
+ // future.
+ assert(isExternalDef(IRVal) && "Expected external definition as operand.");
+
+ // A and B: Create VPValue and add it to the pool of external definitions and
+ // to the Value->VPValue map.
+ VPValue *NewVPVal = new VPValue(IRVal);
+ Plan.addExternalDef(NewVPVal);
+ IRDef2VPValue[IRVal] = NewVPVal;
+ return NewVPVal;
+}
+
+// Create new VPInstructions in a VPBasicBlock, given its BasicBlock
+// counterpart. This function must be invoked in RPO so that the operands of a
+// VPInstruction in \p BB have been visited before (except for Phi nodes).
+void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
+ BasicBlock *BB) {
+ VPIRBuilder.setInsertPoint(VPBB);
+ for (Instruction &InstRef : *BB) {
+ Instruction *Inst = &InstRef;
+
+ // There shouldn't be any VPValue for Inst at this point. Otherwise, we
+ // visited Inst when we shouldn't, breaking the RPO traversal order.
+ assert(!IRDef2VPValue.count(Inst) &&
+ "Instruction shouldn't have been visited.");
+
+ if (auto *Br = dyn_cast<BranchInst>(Inst)) {
+ // Branch instruction is not explicitly represented in VPlan but we need
+ // to represent its condition bit when it's conditional.
+ if (Br->isConditional())
+ getOrCreateVPOperand(Br->getCondition());
+
+ // Skip the rest of the Instruction processing for Branch instructions.
+ continue;
+ }
+
+ VPInstruction *NewVPInst;
+ if (auto *Phi = dyn_cast<PHINode>(Inst)) {
+ // Phi node operands may not have been visited at this point. We create
+ // an empty VPInstruction that we will fix once the whole plain CFG has
+ // been built.
+ NewVPInst = cast<VPInstruction>(VPIRBuilder.createNaryOp(
+ Inst->getOpcode(), {} /*No operands*/, Inst));
+ PhisToFix.push_back(Phi);
+ } else {
+ // Translate LLVM-IR operands into VPValue operands and set them in the
+ // new VPInstruction.
+ SmallVector<VPValue *, 4> VPOperands;
+ for (Value *Op : Inst->operands())
+ VPOperands.push_back(getOrCreateVPOperand(Op));
+
+ // Build VPInstruction for any arbitrary Instruction without specific
+ // representation in VPlan.
+ NewVPInst = cast<VPInstruction>(
+ VPIRBuilder.createNaryOp(Inst->getOpcode(), VPOperands, Inst));
+ }
+
+ IRDef2VPValue[Inst] = NewVPInst;
+ }
+}
+
+// Main interface to build the plain CFG.
+VPRegionBlock *PlainCFGBuilder::buildPlainCFG() {
+ // 1. Create the Top Region. It will be the parent of all VPBBs.
+ TopRegion = new VPRegionBlock("TopRegion", false /*isReplicator*/);
+
+ // 2. Scan the body of the loop in a topological order to visit each basic
+ // block after having visited its predecessor basic blocks. Create a VPBB for
+ // each BB and link it to its successor and predecessor VPBBs. Note that
+ // predecessors must be set in the same order as they are in the incoming IR.
+ // Otherwise, there might be problems with existing phi nodes and algorithms
+ // based on predecessor traversal.
+
+ // Loop PH needs to be explicitly visited since it's not taken into account by
+ // LoopBlocksDFS.
+ BasicBlock *PreheaderBB = TheLoop->getLoopPreheader();
+ assert((PreheaderBB->getTerminator()->getNumSuccessors() == 1) &&
+ "Unexpected loop preheader");
+ VPBasicBlock *PreheaderVPBB = getOrCreateVPBB(PreheaderBB);
+ createVPInstructionsForVPBB(PreheaderVPBB, PreheaderBB);
+ // Create empty VPBB for Loop H so that we can link PH->H.
+ VPBlockBase *HeaderVPBB = getOrCreateVPBB(TheLoop->getHeader());
+ // Preheader's predecessors will be set during the loop RPO traversal below.
+ PreheaderVPBB->setOneSuccessor(HeaderVPBB);
+
+ LoopBlocksRPO RPO(TheLoop);
+ RPO.perform(LI);
+
+ for (BasicBlock *BB : RPO) {
+ // Create or retrieve the VPBasicBlock for this BB and create its
+ // VPInstructions.
+ VPBasicBlock *VPBB = getOrCreateVPBB(BB);
+ createVPInstructionsForVPBB(VPBB, BB);
+
+ // Set VPBB successors. We create empty VPBBs for successors if they don't
+ // exist already. Recipes will be created when the successor is visited
+ // during the RPO traversal.
+ Instruction *TI = BB->getTerminator();
+ assert(TI && "Terminator expected.");
+ unsigned NumSuccs = TI->getNumSuccessors();
+
+ if (NumSuccs == 1) {
+ VPBasicBlock *SuccVPBB = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB && "VPBB Successor not found.");
+ VPBB->setOneSuccessor(SuccVPBB);
+ } else if (NumSuccs == 2) {
+ VPBasicBlock *SuccVPBB0 = getOrCreateVPBB(TI->getSuccessor(0));
+ assert(SuccVPBB0 && "Successor 0 not found.");
+ VPBasicBlock *SuccVPBB1 = getOrCreateVPBB(TI->getSuccessor(1));
+ assert(SuccVPBB1 && "Successor 1 not found.");
+
+ // Get VPBB's condition bit.
+ assert(isa<BranchInst>(TI) && "Unsupported terminator!");
+ auto *Br = cast<BranchInst>(TI);
+ Value *BrCond = Br->getCondition();
+ // Look up the branch condition to get the corresponding VPValue
+ // representing the condition bit in VPlan (which may be in another VPBB).
+ assert(IRDef2VPValue.count(BrCond) &&
+ "Missing condition bit in IRDef2VPValue!");
+ VPValue *VPCondBit = IRDef2VPValue[BrCond];
+
+ // Link successors using condition bit.
+ VPBB->setTwoSuccessors(SuccVPBB0, SuccVPBB1, VPCondBit);
+ } else
+ llvm_unreachable("Number of successors not supported.");
+
+ // Set VPBB predecessors in the same order as they are in the incoming BB.
+ setVPBBPredsFromBB(VPBB, BB);
+ }
+
+ // 3. Process outermost loop exit. We created an empty VPBB for the loop
+ // single exit BB during the RPO traversal of the loop body but Instructions
+ // weren't visited because it's not part of the loop.
+ BasicBlock *LoopExitBB = TheLoop->getUniqueExitBlock();
+ assert(LoopExitBB && "Loops with multiple exits are not supported.");
+ VPBasicBlock *LoopExitVPBB = BB2VPBB[LoopExitBB];
+ createVPInstructionsForVPBB(LoopExitVPBB, LoopExitBB);
+ // Loop exit was already set as successor of the loop exiting BB.
+ // We only set its predecessor VPBB now.
+ setVPBBPredsFromBB(LoopExitVPBB, LoopExitBB);
+
+ // 4. The whole CFG has been built at this point so all the input Values must
+ // have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
+ // VPlan operands.
+ fixPhiNodes();
+
+ // 5. Final Top Region setup. Set outermost loop pre-header and single exit as
+ // Top Region entry and exit.
+ TopRegion->setEntry(PreheaderVPBB);
+ TopRegion->setExit(LoopExitVPBB);
+ return TopRegion;
+}
+
+VPRegionBlock *VPlanHCFGBuilder::buildPlainCFG() {
+ PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
+ return PCFGBuilder.buildPlainCFG();
+}
+
+// Public interface to build a H-CFG.
+void VPlanHCFGBuilder::buildHierarchicalCFG() {
+ // Build Top Region enclosing the plain CFG and set it as VPlan entry.
+ VPRegionBlock *TopRegion = buildPlainCFG();
+ Plan.setEntry(TopRegion);
+ LLVM_DEBUG(Plan.setName("HCFGBuilder: Plain CFG\n"); dbgs() << Plan);
+
+ Verifier.verifyHierarchicalCFG(TopRegion);
+
+ // Compute plain CFG dom tree for VPLInfo.
+ VPDomTree.recalculate(*TopRegion);
+ LLVM_DEBUG(dbgs() << "Dominator Tree after building the plain CFG.\n";
+ VPDomTree.print(dbgs()));
+
+ // Compute VPLInfo and keep it in Plan.
+ VPLoopInfo &VPLInfo = Plan.getVPLoopInfo();
+ VPLInfo.analyze(VPDomTree);
+ LLVM_DEBUG(dbgs() << "VPLoop Info After buildPlainCFG:\n";
+ VPLInfo.print(dbgs()));
+}
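For orientation, here is a minimal sketch of how this builder is typically driven from the VPlan-native planning path. The surrounding setup (the outermost loop OrigLoop, its LoopInfo LI, and ownership of the VPlan) is assumed for illustration and is not part of the change shown above.

  // Sketch only: OrigLoop and LI are assumed to be the loop selected for
  // vectorization and its LoopInfo analysis.
  auto Plan = std::make_unique<VPlan>();
  VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
  // Builds the plain CFG, wraps it in the Top Region, sets it as the VPlan
  // entry, and computes VPDomTree/VPLoopInfo as implemented above.
  HCFGBuilder.buildHierarchicalCFG();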
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ba611ede14..238ee7e634 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -1,71 +1,71 @@
-//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanHCFGBuilder class which contains the public
-/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
-/// (H-CFG) for an incoming IR.
-///
-/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
-/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
-/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
-/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
-/// other than the Top Region will have a parent VPRegionBlock and allows us
-/// to easily add more nodes before/after the main vector loop (such as the
-/// reduction epilogue).
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
-
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-#include "VPlanVerifier.h"
-
-namespace llvm {
-
-class Loop;
-class VPlanTestBase;
-
-/// Main class to build the VPlan H-CFG for an incoming IR.
-class VPlanHCFGBuilder {
- friend VPlanTestBase;
-
-private:
- // The outermost loop of the input loop nest considered for vectorization.
- Loop *TheLoop;
-
- // Loop Info analysis.
- LoopInfo *LI;
-
- // The VPlan that will contain the H-CFG we are building.
- VPlan &Plan;
-
- // VPlan verifier utility.
- VPlanVerifier Verifier;
-
- // Dominator analysis for VPlan plain CFG to be used in the
- // construction of the H-CFG. This analysis is no longer valid once regions
- // are introduced.
- VPDominatorTree VPDomTree;
-
- /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
- /// enclosing the plain CFG.
- VPRegionBlock *buildPlainCFG();
-
-public:
- VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
- : TheLoop(Lp), LI(LI), Plan(P) {}
-
- /// Build H-CFG for TheLoop and update Plan accordingly.
- void buildHierarchicalCFG();
-};
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+//===-- VPlanHCFGBuilder.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanHCFGBuilder class which contains the public
+/// interface (buildHierarchicalCFG) to build a VPlan-based Hierarchical CFG
+/// (H-CFG) for an incoming IR.
+///
+/// A H-CFG in VPlan is a control-flow graph whose nodes are VPBasicBlocks
+/// and/or VPRegionBlocks (i.e., other H-CFGs). The outermost H-CFG of a VPlan
+/// consists of a VPRegionBlock, denoted Top Region, which encloses any other
+/// VPBlockBase in the H-CFG. This guarantees that any VPBlockBase in the H-CFG
+/// other than the Top Region will have a parent VPRegionBlock and allows us
+/// to easily add more nodes before/after the main vector loop (such as the
+/// reduction epilogue).
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
+
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+#include "VPlanVerifier.h"
+
+namespace llvm {
+
+class Loop;
+class VPlanTestBase;
+
+/// Main class to build the VPlan H-CFG for an incoming IR.
+class VPlanHCFGBuilder {
+ friend VPlanTestBase;
+
+private:
+ // The outermost loop of the input loop nest considered for vectorization.
+ Loop *TheLoop;
+
+ // Loop Info analysis.
+ LoopInfo *LI;
+
+ // The VPlan that will contain the H-CFG we are building.
+ VPlan &Plan;
+
+ // VPlan verifier utility.
+ VPlanVerifier Verifier;
+
+ // Dominator analysis for VPlan plain CFG to be used in the
+ // construction of the H-CFG. This analysis is no longer valid once regions
+ // are introduced.
+ VPDominatorTree VPDomTree;
+
+ /// Build plain CFG for TheLoop. Return a new VPRegionBlock (TopRegion)
+ /// enclosing the plain CFG.
+ VPRegionBlock *buildPlainCFG();
+
+public:
+ VPlanHCFGBuilder(Loop *Lp, LoopInfo *LI, VPlan &P)
+ : TheLoop(Lp), LI(LI), Plan(P) {}
+
+ /// Build H-CFG for TheLoop and update Plan accordingly.
+ void buildHierarchicalCFG();
+};
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VPLANHCFGBUILDER_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
index 4b9933630f..5208f2d58e 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanLoopInfo.h
@@ -1,44 +1,44 @@
-//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
-/// specialization of LoopInfoBase for VPBlockBase. VPLoop is a specialization
-/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
-/// information can be found in VectorizationPlanner.rst.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
-
-#include "llvm/Analysis/LoopInfoImpl.h"
-
-namespace llvm {
-class VPBlockBase;
-
-/// Hold analysis information for every loop detected by VPLoopInfo. It is an
-/// instantiation of LoopBase.
-class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
-private:
- friend class LoopInfoBase<VPBlockBase, VPLoop>;
- explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
-};
-
-/// VPLoopInfo provides analysis of natural loops for VPBlockBase-based
-/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
-// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
-// is the same as the incoming IR CFG. If it's more efficient than running the
-// whole loop detection algorithm, we may want to create a mechanism to
-// translate LoopInfo into VPLoopInfo. However, that would require significant
-// changes in LoopInfoBase class.
-typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo;
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+//===-- VPLoopInfo.h --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines VPLoopInfo analysis and VPLoop class. VPLoopInfo is a
+/// specialization of LoopInfoBase for VPBlockBase. VPLoop is a specialization
+/// of LoopBase that is used to hold loop metadata from VPLoopInfo. Further
+/// information can be found in VectorizationPlanner.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
+
+#include "llvm/Analysis/LoopInfoImpl.h"
+
+namespace llvm {
+class VPBlockBase;
+
+/// Hold analysis information for every loop detected by VPLoopInfo. It is an
+/// instantiation of LoopBase.
+class VPLoop : public LoopBase<VPBlockBase, VPLoop> {
+private:
+ friend class LoopInfoBase<VPBlockBase, VPLoop>;
+ explicit VPLoop(VPBlockBase *VPB) : LoopBase<VPBlockBase, VPLoop>(VPB) {}
+};
+
+/// VPLoopInfo provides analysis of natural loops for VPBlockBase-based
+/// Hierarchical CFG. It is a specialization of LoopInfoBase class.
+// TODO: VPLoopInfo is initially computed on top of the VPlan plain CFG, which
+// is the same as the incoming IR CFG. If it's more efficient than running the
+// whole loop detection algorithm, we may want to create a mechanism to
+// translate LoopInfo into VPLoopInfo. However, that would require significant
+// changes in LoopInfoBase class.
+typedef LoopInfoBase<VPBlockBase, VPLoop> VPLoopInfo;
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLOOPINFO_H
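Since VPLoopInfo is a plain LoopInfoBase instantiation, the usual LoopInfoBase/LoopBase queries apply to VPBlockBases once the analysis has been populated (as buildHierarchicalCFG does via VPLInfo.analyze(VPDomTree)). A hedged sketch; Plan and Block are assumed to exist in the caller:

  // Sketch only: Block is some VPBlockBase of the plain CFG.
  VPLoopInfo &VPLI = Plan.getVPLoopInfo();
  if (VPLoop *L = VPLI.getLoopFor(Block)) {
    bool IsHeader = VPLI.isLoopHeader(Block); // LoopInfoBase query
    unsigned Depth = L->getLoopDepth();       // LoopBase query
    (void)IsHeader; (void)Depth;
  }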
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
index 4151d85df2..ac3b3505dc 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.cpp
@@ -1,248 +1,248 @@
-//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanPredicator.h"
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-#define DEBUG_TYPE "VPlanPredicator"
-
-using namespace llvm;
-
-// Generate VPInstructions at the beginning of CurrBB that calculate the
-// predicate being propagated from PredBB to CurrBB depending on the edge type
-// between them. For example if:
-// i. PredBB is controlled by predicate %BP, and
-// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
-// bit value %CBV, then this function will generate the following two
-// VPInstructions at the start of CurrBB:
-// %IntermediateVal = not %CBV
-// %FinalVal = and %BP %IntermediateVal
-// It returns %FinalVal.
-VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
- VPBasicBlock *CurrBB) {
- VPValue *CBV = PredBB->getCondBit();
-
- // Set the intermediate value - this is either 'CBV', or 'not CBV'
- // depending on the edge type.
- EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
- VPValue *IntermediateVal = nullptr;
- switch (ET) {
- case EdgeType::TRUE_EDGE:
- // CurrBB is the true successor of PredBB - nothing to do here.
- IntermediateVal = CBV;
- break;
-
- case EdgeType::FALSE_EDGE:
- // CurrBB is the False successor of PredBB - compute not of CBV.
- IntermediateVal = Builder.createNot(CBV);
- break;
- }
-
- // Now AND intermediate value with PredBB's block predicate if it has one.
- VPValue *BP = PredBB->getPredicate();
- if (BP)
- return Builder.createAnd(BP, IntermediateVal);
- else
- return IntermediateVal;
-}
-
-// Generate a tree of ORs for all IncomingPredicates in WorkList.
-// Note: This function destroys the original Worklist.
-//
-// P1 P2 P3 P4 P5
-// \ / \ / /
-// OR1 OR2 /
-// \ | /
-// \ +/-+
-// \ / |
-// OR3 |
-// \ |
-// OR4 <- Returns this
-// |
-//
-// The algorithm uses a worklist of predicates as its main data structure.
-// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
-// (in this example OR1), and push it back. In this example the worklist
-// contains {P3, P4, P5, OR1}.
-// The process iterates until we have only one element in the Worklist (OR4).
-// The last element is the root predicate which is returned.
-VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
- if (Worklist.empty())
- return nullptr;
-
- // The worklist initially contains all the leaf nodes. Initialize the tree
- // using them.
- while (Worklist.size() >= 2) {
- // Pop a pair of values from the front.
- VPValue *LHS = Worklist.front();
- Worklist.pop_front();
- VPValue *RHS = Worklist.front();
- Worklist.pop_front();
-
- // Create an OR of these values.
- VPValue *Or = Builder.createOr(LHS, RHS);
-
- // Push OR to the back of the worklist.
- Worklist.push_back(Or);
- }
-
- assert(Worklist.size() == 1 && "Expected 1 item in worklist");
-
- // The root is the last node in the worklist.
- VPValue *Root = Worklist.front();
-
- // This root needs to replace the existing block predicate. This is done in
- // the caller function.
- return Root;
-}
-
-// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
-VPlanPredicator::EdgeType
-VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
- VPBlockBase *ToBlock) {
- unsigned Count = 0;
- for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
- if (SuccBlock == ToBlock) {
- assert(Count < 2 && "Switch not supported currently");
- return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
- }
- Count++;
- }
-
- llvm_unreachable("Broken getEdgeTypeBetween");
-}
-
-// Generate all predicates needed for CurrBlock by going through its immediate
-// predecessor blocks.
-void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region) {
- // Blocks that dominate region exit inherit the predicate from the region.
- // Return after setting the predicate.
- if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
- VPValue *RegionBP = Region->getPredicate();
- CurrBlock->setPredicate(RegionBP);
- return;
- }
-
- // Collect all incoming predicates in a worklist.
- std::list<VPValue *> IncomingPredicates;
-
- // Set the builder's insertion point to the top of the current BB
- VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
- Builder.setInsertPoint(CurrBB, CurrBB->begin());
-
- // For each predecessor, generate the VPInstructions required for
- // computing 'BP AND (not) CBV' at the top of CurrBB.
- // Collect the outcome of this calculation for all predecessors
- // into IncomingPredicates.
- for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
- // Skip back-edges
- if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
- continue;
-
- VPValue *IncomingPredicate = nullptr;
- unsigned NumPredSuccsNoBE =
- VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
-
- // If there is an unconditional branch to the currBB, then we don't create
- // edge predicates. We use the predecessor's block predicate instead.
- if (NumPredSuccsNoBE == 1)
- IncomingPredicate = PredBlock->getPredicate();
- else if (NumPredSuccsNoBE == 2) {
- // Emit recipes into CurrBlock if required
- assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
- IncomingPredicate =
- getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
- } else
- llvm_unreachable("FIXME: switch statement ?");
-
- if (IncomingPredicate)
- IncomingPredicates.push_back(IncomingPredicate);
- }
-
- // Logically OR all incoming predicates by building the Predicate Tree.
- VPValue *Predicate = genPredicateTree(IncomingPredicates);
-
- // Now update the block's predicate with the new one.
- CurrBlock->setPredicate(Predicate);
-}
-
-// Generate all predicates needed for Region.
-void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
- VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
-
- // Generate edge predicates and append them to the block predicate. RPO is
- // necessary since the predecessor blocks' block predicate needs to be set
- // before the current block's block predicate can be computed.
+//===-- VPlanPredicator.cpp -------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanPredicator.h"
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "VPlanPredicator"
+
+using namespace llvm;
+
+// Generate VPInstructions at the beginning of CurrBB that calculate the
+// predicate being propagated from PredBB to CurrBB depending on the edge type
+// between them. For example if:
+// i. PredBB is controlled by predicate %BP, and
+// ii. The edge PredBB->CurrBB is the false edge, controlled by the condition
+// bit value %CBV, then this function will generate the following two
+// VPInstructions at the start of CurrBB:
+// %IntermediateVal = not %CBV
+// %FinalVal = and %BP %IntermediateVal
+// It returns %FinalVal.
+VPValue *VPlanPredicator::getOrCreateNotPredicate(VPBasicBlock *PredBB,
+ VPBasicBlock *CurrBB) {
+ VPValue *CBV = PredBB->getCondBit();
+
+ // Set the intermediate value - this is either 'CBV', or 'not CBV'
+ // depending on the edge type.
+ EdgeType ET = getEdgeTypeBetween(PredBB, CurrBB);
+ VPValue *IntermediateVal = nullptr;
+ switch (ET) {
+ case EdgeType::TRUE_EDGE:
+ // CurrBB is the true successor of PredBB - nothing to do here.
+ IntermediateVal = CBV;
+ break;
+
+ case EdgeType::FALSE_EDGE:
+ // CurrBB is the False successor of PredBB - compute not of CBV.
+ IntermediateVal = Builder.createNot(CBV);
+ break;
+ }
+
+ // Now AND intermediate value with PredBB's block predicate if it has one.
+ VPValue *BP = PredBB->getPredicate();
+ if (BP)
+ return Builder.createAnd(BP, IntermediateVal);
+ else
+ return IntermediateVal;
+}
+
+// Generate a tree of ORs for all IncomingPredicates in WorkList.
+// Note: This function destroys the original Worklist.
+//
+// P1 P2 P3 P4 P5
+// \ / \ / /
+// OR1 OR2 /
+// \ | /
+// \ +/-+
+// \ / |
+// OR3 |
+// \ |
+// OR4 <- Returns this
+// |
+//
+// The algorithm uses a worklist of predicates as its main data structure.
+// We pop a pair of values from the front (e.g. P1 and P2), generate an OR
+// (in this example OR1), and push it back. In this example the worklist
+// contains {P3, P4, P5, OR1}.
+// The process iterates until we have only one element in the Worklist (OR4).
+// The last element is the root predicate which is returned.
+VPValue *VPlanPredicator::genPredicateTree(std::list<VPValue *> &Worklist) {
+ if (Worklist.empty())
+ return nullptr;
+
+ // The worklist initially contains all the leaf nodes. Initialize the tree
+ // using them.
+ while (Worklist.size() >= 2) {
+ // Pop a pair of values from the front.
+ VPValue *LHS = Worklist.front();
+ Worklist.pop_front();
+ VPValue *RHS = Worklist.front();
+ Worklist.pop_front();
+
+ // Create an OR of these values.
+ VPValue *Or = Builder.createOr(LHS, RHS);
+
+ // Push OR to the back of the worklist.
+ Worklist.push_back(Or);
+ }
+
+ assert(Worklist.size() == 1 && "Expected 1 item in worklist");
+
+ // The root is the last node in the worklist.
+ VPValue *Root = Worklist.front();
+
+ // This root needs to replace the existing block predicate. This is done in
+ // the caller function.
+ return Root;
+}
+
+// Return whether the edge FromBlock -> ToBlock is a TRUE_EDGE or FALSE_EDGE
+VPlanPredicator::EdgeType
+VPlanPredicator::getEdgeTypeBetween(VPBlockBase *FromBlock,
+ VPBlockBase *ToBlock) {
+ unsigned Count = 0;
+ for (VPBlockBase *SuccBlock : FromBlock->getSuccessors()) {
+ if (SuccBlock == ToBlock) {
+ assert(Count < 2 && "Switch not supported currently");
+ return (Count == 0) ? EdgeType::TRUE_EDGE : EdgeType::FALSE_EDGE;
+ }
+ Count++;
+ }
+
+ llvm_unreachable("Broken getEdgeTypeBetween");
+}
+
+// Generate all predicates needed for CurrBlock by going through its immediate
+// predecessor blocks.
+void VPlanPredicator::createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region) {
+ // Blocks that dominate region exit inherit the predicate from the region.
+ // Return after setting the predicate.
+ if (VPDomTree.dominates(CurrBlock, Region->getExit())) {
+ VPValue *RegionBP = Region->getPredicate();
+ CurrBlock->setPredicate(RegionBP);
+ return;
+ }
+
+ // Collect all incoming predicates in a worklist.
+ std::list<VPValue *> IncomingPredicates;
+
+ // Set the builder's insertion point to the top of the current BB
+ VPBasicBlock *CurrBB = cast<VPBasicBlock>(CurrBlock->getEntryBasicBlock());
+ Builder.setInsertPoint(CurrBB, CurrBB->begin());
+
+ // For each predecessor, generate the VPInstructions required for
+ // computing 'BP AND (not) CBV' at the top of CurrBB.
+ // Collect the outcome of this calculation for all predecessors
+ // into IncomingPredicates.
+ for (VPBlockBase *PredBlock : CurrBlock->getPredecessors()) {
+ // Skip back-edges
+ if (VPBlockUtils::isBackEdge(PredBlock, CurrBlock, VPLI))
+ continue;
+
+ VPValue *IncomingPredicate = nullptr;
+ unsigned NumPredSuccsNoBE =
+ VPBlockUtils::countSuccessorsNoBE(PredBlock, VPLI);
+
+ // If there is an unconditional branch to the currBB, then we don't create
+ // edge predicates. We use the predecessor's block predicate instead.
+ if (NumPredSuccsNoBE == 1)
+ IncomingPredicate = PredBlock->getPredicate();
+ else if (NumPredSuccsNoBE == 2) {
+ // Emit recipes into CurrBlock if required
+ assert(isa<VPBasicBlock>(PredBlock) && "Only BBs have multiple exits");
+ IncomingPredicate =
+ getOrCreateNotPredicate(cast<VPBasicBlock>(PredBlock), CurrBB);
+ } else
+ llvm_unreachable("FIXME: switch statement ?");
+
+ if (IncomingPredicate)
+ IncomingPredicates.push_back(IncomingPredicate);
+ }
+
+ // Logically OR all incoming predicates by building the Predicate Tree.
+ VPValue *Predicate = genPredicateTree(IncomingPredicates);
+
+ // Now update the block's predicate with the new one.
+ CurrBlock->setPredicate(Predicate);
+}
+
+// Generate all predicates needed for Region.
+void VPlanPredicator::predicateRegionRec(VPRegionBlock *Region) {
+ VPBasicBlock *EntryBlock = cast<VPBasicBlock>(Region->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(EntryBlock);
+
+ // Generate edge predicates and append them to the block predicate. RPO is
+ // necessary since the predecessor blocks' block predicate needs to be set
+ // before the current block's block predicate can be computed.
for (VPBlockBase *Block : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
- createOrPropagatePredicates(Block, Region);
- }
-}
-
-// Linearize the CFG within Region.
-// TODO: Predication and linearization need RPOT for every region.
-// This traversal is expensive. Since predication is not adding new
-// blocks, we should be able to compute RPOT once in predication and
-// reuse it here. This becomes even more important once we have nested
-// regions.
-void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
- ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
- VPBlockBase *PrevBlock = nullptr;
-
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(Block) && "Nested region not expected");
+ createOrPropagatePredicates(Block, Region);
+ }
+}
+
+// Linearize the CFG within Region.
+// TODO: Predication and linearization need RPOT for every region.
+// This traversal is expensive. Since predication is not adding new
+// blocks, we should be able to compute RPOT once in predication and
+// reuse it here. This becomes even more important once we have nested
+// regions.
+void VPlanPredicator::linearizeRegionRec(VPRegionBlock *Region) {
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(Region->getEntry());
+ VPBlockBase *PrevBlock = nullptr;
+
for (VPBlockBase *CurrBlock : RPOT) {
- // TODO: Handle nested regions once we start generating the same.
- assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
-
- // Linearize control flow by adding an unconditional edge between PrevBlock
- // and CurrBlock skipping loop headers and latches to keep intact loop
- // header predecessors and loop latch successors.
- if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
- !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
-
- LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
- << CurrBlock->getName() << "\n");
-
- PrevBlock->clearSuccessors();
- CurrBlock->clearPredecessors();
- VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
- }
-
- PrevBlock = CurrBlock;
- }
-}
-
-// Entry point. The driver function for the predicator.
-void VPlanPredicator::predicate(void) {
- // Predicate the blocks within Region.
- predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-
- // Linearize the blocks within Region.
- linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
-}
-
-VPlanPredicator::VPlanPredicator(VPlan &Plan)
- : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
- // FIXME: Predicator is currently computing the dominator information for the
- // top region. Once we start storing dominator information in a VPRegionBlock,
- // we can avoid this recalculation.
- VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
-}
+ // TODO: Handle nested regions once we start generating the same.
+ assert(!isa<VPRegionBlock>(CurrBlock) && "Nested region not expected");
+
+ // Linearize control flow by adding an unconditional edge between PrevBlock
+ // and CurrBlock skipping loop headers and latches to keep intact loop
+ // header predecessors and loop latch successors.
+ if (PrevBlock && !VPLI->isLoopHeader(CurrBlock) &&
+ !VPBlockUtils::blockIsLoopLatch(PrevBlock, VPLI)) {
+
+ LLVM_DEBUG(dbgs() << "Linearizing: " << PrevBlock->getName() << "->"
+ << CurrBlock->getName() << "\n");
+
+ PrevBlock->clearSuccessors();
+ CurrBlock->clearPredecessors();
+ VPBlockUtils::connectBlocks(PrevBlock, CurrBlock);
+ }
+
+ PrevBlock = CurrBlock;
+ }
+}
+
+// Entry point. The driver function for the predicator.
+void VPlanPredicator::predicate(void) {
+ // Predicate the blocks within Region.
+ predicateRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+
+ // Linearize the blocks within Region.
+ linearizeRegionRec(cast<VPRegionBlock>(Plan.getEntry()));
+}
+
+VPlanPredicator::VPlanPredicator(VPlan &Plan)
+ : Plan(Plan), VPLI(&(Plan.getVPLoopInfo())) {
+ // FIXME: Predicator is currently computing the dominator information for the
+ // top region. Once we start storing dominator information in a VPRegionBlock,
+ // we can avoid this recalculation.
+ VPDomTree.recalculate(*(cast<VPRegionBlock>(Plan.getEntry())));
+}
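To make the pairing order of genPredicateTree concrete, here is a small standalone illustration of the same worklist scheme using strings in place of VPValues and the OR recipes; it is purely a reading aid for the code above, not part of the change.

  #include <list>
  #include <string>
  std::string foldPredicates() {
    // Worklist starts as {P1, P2, P3, P4, P5}; "|" stands in for createOr.
    std::list<std::string> WL = {"P1", "P2", "P3", "P4", "P5"};
    while (WL.size() >= 2) {
      std::string LHS = WL.front(); WL.pop_front();
      std::string RHS = WL.front(); WL.pop_front();
      WL.push_back("(" + LHS + " | " + RHS + ")");
    }
    // Returns "((P3 | P4) | (P5 | (P1 | P2)))": OR1=(P1|P2), OR2=(P3|P4),
    // OR3=(P5|OR1), OR4=(OR2|OR3), i.e. OR4 in the diagram above.
    return WL.front();
  }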
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
index 5dac70d090..692afd2978 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanPredicator.h
@@ -1,74 +1,74 @@
-//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the VPlanPredicator class which contains the public
-/// interfaces to predicate and linearize the VPlan region.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
-
-#include "LoopVectorizationPlanner.h"
-#include "VPlan.h"
-#include "VPlanDominatorTree.h"
-
-namespace llvm {
-
-class VPlanPredicator {
-private:
- enum class EdgeType {
- TRUE_EDGE,
- FALSE_EDGE,
- };
-
- // VPlan being predicated.
- VPlan &Plan;
-
- // VPLoopInfo for Plan's HCFG.
- VPLoopInfo *VPLI;
-
- // Dominator tree for Plan's HCFG.
- VPDominatorTree VPDomTree;
-
- // VPlan builder used to generate VPInstructions for block predicates.
- VPBuilder Builder;
-
- /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
- /// \p ToBlock is either the unconditional successor or the conditional true
- /// successor of \p FromBlock and FALSE_EDGE otherwise.
- EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
-
- /// Create and return VPValue corresponding to the predicate for the edge from
- /// \p PredBB to \p CurrentBlock.
- VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
-
- /// Generate and return the result of ORing all the predicate VPValues in \p
- /// Worklist.
- VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
-
- /// Create or propagate predicate for \p CurrBlock in region \p Region using
- /// predicate(s) of its predecessor(s)
- void createOrPropagatePredicates(VPBlockBase *CurrBlock,
- VPRegionBlock *Region);
-
- /// Predicate the CFG within \p Region.
- void predicateRegionRec(VPRegionBlock *Region);
-
- /// Linearize the CFG within \p Region.
- void linearizeRegionRec(VPRegionBlock *Region);
-
-public:
- VPlanPredicator(VPlan &Plan);
-
- /// Predicate Plan's HCFG.
- void predicate(void);
-};
-} // end namespace llvm
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+//===-- VPlanPredicator.h ---------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the VPlanPredicator class which contains the public
+/// interfaces to predicate and linearize the VPlan region.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
+
+#include "LoopVectorizationPlanner.h"
+#include "VPlan.h"
+#include "VPlanDominatorTree.h"
+
+namespace llvm {
+
+class VPlanPredicator {
+private:
+ enum class EdgeType {
+ TRUE_EDGE,
+ FALSE_EDGE,
+ };
+
+ // VPlan being predicated.
+ VPlan &Plan;
+
+ // VPLoopInfo for Plan's HCFG.
+ VPLoopInfo *VPLI;
+
+ // Dominator tree for Plan's HCFG.
+ VPDominatorTree VPDomTree;
+
+ // VPlan builder used to generate VPInstructions for block predicates.
+ VPBuilder Builder;
+
+ /// Get the type of edge from \p FromBlock to \p ToBlock. Returns TRUE_EDGE if
+ /// \p ToBlock is either the unconditional successor or the conditional true
+ /// successor of \p FromBlock and FALSE_EDGE otherwise.
+ EdgeType getEdgeTypeBetween(VPBlockBase *FromBlock, VPBlockBase *ToBlock);
+
+ /// Create and return VPValue corresponding to the predicate for the edge from
+ /// \p PredBB to \p CurrentBlock.
+ VPValue *getOrCreateNotPredicate(VPBasicBlock *PredBB, VPBasicBlock *CurrBB);
+
+ /// Generate and return the result of ORing all the predicate VPValues in \p
+ /// Worklist.
+ VPValue *genPredicateTree(std::list<VPValue *> &Worklist);
+
+ /// Create or propagate predicate for \p CurrBlock in region \p Region using
+ /// predicate(s) of its predecessor(s)
+ void createOrPropagatePredicates(VPBlockBase *CurrBlock,
+ VPRegionBlock *Region);
+
+ /// Predicate the CFG within \p Region.
+ void predicateRegionRec(VPRegionBlock *Region);
+
+ /// Linearize the CFG within \p Region.
+ void linearizeRegionRec(VPRegionBlock *Region);
+
+public:
+ VPlanPredicator(VPlan &Plan);
+
+ /// Predicate Plan's HCFG.
+ void predicate(void);
+};
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_PREDICATOR_H
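For context on the header above: the doc comment on genPredicateTree only states that it ORs all
predicate VPValues in the worklist. Below is a minimal standalone sketch of that folding, assuming
a generic predicate type PredT and a MakeOr callback; both are illustrative placeholders, not the
VPlan API and not part of this diff.

#include <functional>
#include <list>

// Fold a worklist of predicates into a single OR of all entries by repeatedly
// combining the two front elements and appending the combined result, until
// only one predicate remains.
template <typename PredT>
PredT genPredicateTreeSketch(std::list<PredT> Worklist,
                             const std::function<PredT(PredT, PredT)> &MakeOr) {
  if (Worklist.empty())
    return PredT{};
  while (Worklist.size() > 1) {
    PredT LHS = Worklist.front();
    Worklist.pop_front();
    PredT RHS = Worklist.front();
    Worklist.pop_front();
    Worklist.push_back(MakeOr(LHS, RHS));
  }
  return Worklist.front();
}

// Example: genPredicateTreeSketch<bool>({P0, P1, P2}, [](bool A, bool B) { return A || B; })
// evaluates to P0 || P1 || P2.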
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
index b2a5a7688d..6f21bf4429 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanSLP.cpp
@@ -1,473 +1,473 @@
-//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// This file implements SLP analysis based on VPlan. The analysis is based on
-/// the ideas described in
-///
-/// Look-ahead SLP: auto-vectorization in the presence of commutative
-/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
-/// Luís F. W. Góes
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/InstrTypes.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/GraphWriter.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include <cassert>
-#include <iterator>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-
-#define DEBUG_TYPE "vplan-slp"
-
-// Number of levels to look ahead when re-ordering multi node operands.
-static unsigned LookaheadMaxDepth = 5;
-
-VPInstruction *VPlanSlp::markFailed() {
- // FIXME: Currently this is used to signal we hit instructions we cannot
- // trivially SLP'ize.
- CompletelySLP = false;
- return nullptr;
-}
-
-void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
- if (all_of(Operands, [](VPValue *V) {
- return cast<VPInstruction>(V)->getUnderlyingInstr();
- })) {
- unsigned BundleSize = 0;
- for (VPValue *V : Operands) {
- Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
- assert(!T->isVectorTy() && "Only scalar types supported for now");
- BundleSize += T->getScalarSizeInBits();
- }
- WidestBundleBits = std::max(WidestBundleBits, BundleSize);
- }
-
- auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
- assert(Res.second &&
- "Already created a combined instruction for the operand bundle");
- (void)Res;
-}
-
-bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
- // Currently we only support VPInstructions.
- if (!all_of(Operands, [](VPValue *Op) {
- return Op && isa<VPInstruction>(Op) &&
- cast<VPInstruction>(Op)->getUnderlyingInstr();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
- return false;
- }
-
- // Check if opcodes and type width agree for all instructions in the bundle.
- // FIXME: Differing widths/opcodes can be handled by inserting additional
- // instructions.
- // FIXME: Deal with non-primitive types.
- const Instruction *OriginalInstr =
- cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
- unsigned Opcode = OriginalInstr->getOpcode();
- unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
- if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
- const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
- return I->getOpcode() == Opcode &&
- I->getType()->getPrimitiveSizeInBits() == Width;
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
- return false;
- }
-
- // For now, all operands must be defined in the same BB.
- if (any_of(Operands, [this](VPValue *Op) {
- return cast<VPInstruction>(Op)->getParent() != &this->BB;
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
- return false;
- }
-
- if (any_of(Operands,
- [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
- LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
- return false;
- }
-
- // For loads, check that there are no instructions writing to memory in
- // between them.
- // TODO: we only have to forbid instructions writing to memory that could
- // interfere with any of the loads in the bundle
- if (Opcode == Instruction::Load) {
- unsigned LoadsSeen = 0;
- VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
- for (auto &I : *Parent) {
- auto *VPI = cast<VPInstruction>(&I);
- if (VPI->getOpcode() == Instruction::Load &&
+//===- VPlanSLP.cpp - SLP Analysis based on VPlan -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// This file implements SLP analysis based on VPlan. The analysis is based on
+/// the ideas described in
+///
+/// Look-ahead SLP: auto-vectorization in the presence of commutative
+/// operations, CGO 2018 by Vasileios Porpodas, Rodrigo C. O. Rocha,
+/// Luís F. W. Góes
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vplan-slp"
+
+// Number of levels to look ahead when re-ordering multi node operands.
+static unsigned LookaheadMaxDepth = 5;
+
+VPInstruction *VPlanSlp::markFailed() {
+ // FIXME: Currently this is used to signal we hit instructions we cannot
+ // trivially SLP'ize.
+ CompletelySLP = false;
+ return nullptr;
+}
+
+void VPlanSlp::addCombined(ArrayRef<VPValue *> Operands, VPInstruction *New) {
+ if (all_of(Operands, [](VPValue *V) {
+ return cast<VPInstruction>(V)->getUnderlyingInstr();
+ })) {
+ unsigned BundleSize = 0;
+ for (VPValue *V : Operands) {
+ Type *T = cast<VPInstruction>(V)->getUnderlyingInstr()->getType();
+ assert(!T->isVectorTy() && "Only scalar types supported for now");
+ BundleSize += T->getScalarSizeInBits();
+ }
+ WidestBundleBits = std::max(WidestBundleBits, BundleSize);
+ }
+
+ auto Res = BundleToCombined.try_emplace(to_vector<4>(Operands), New);
+ assert(Res.second &&
+ "Already created a combined instruction for the operand bundle");
+ (void)Res;
+}
+
+bool VPlanSlp::areVectorizable(ArrayRef<VPValue *> Operands) const {
+ // Currently we only support VPInstructions.
+ if (!all_of(Operands, [](VPValue *Op) {
+ return Op && isa<VPInstruction>(Op) &&
+ cast<VPInstruction>(Op)->getUnderlyingInstr();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: not all operands are VPInstructions\n");
+ return false;
+ }
+
+ // Check if opcodes and type width agree for all instructions in the bundle.
+ // FIXME: Differing widths/opcodes can be handled by inserting additional
+ // instructions.
+ // FIXME: Deal with non-primitive types.
+ const Instruction *OriginalInstr =
+ cast<VPInstruction>(Operands[0])->getUnderlyingInstr();
+ unsigned Opcode = OriginalInstr->getOpcode();
+ unsigned Width = OriginalInstr->getType()->getPrimitiveSizeInBits();
+ if (!all_of(Operands, [Opcode, Width](VPValue *Op) {
+ const Instruction *I = cast<VPInstruction>(Op)->getUnderlyingInstr();
+ return I->getOpcode() == Opcode &&
+ I->getType()->getPrimitiveSizeInBits() == Width;
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: Opcodes do not agree \n");
+ return false;
+ }
+
+ // For now, all operands must be defined in the same BB.
+ if (any_of(Operands, [this](VPValue *Op) {
+ return cast<VPInstruction>(Op)->getParent() != &this->BB;
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: operands in different BBs\n");
+ return false;
+ }
+
+ if (any_of(Operands,
+ [](VPValue *Op) { return Op->hasMoreThanOneUniqueUser(); })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: Some operands have multiple users.\n");
+ return false;
+ }
+
+ // For loads, check that there are no instructions writing to memory in
+ // between them.
+ // TODO: we only have to forbid instructions writing to memory that could
+ // interfere with any of the loads in the bundle
+ if (Opcode == Instruction::Load) {
+ unsigned LoadsSeen = 0;
+ VPBasicBlock *Parent = cast<VPInstruction>(Operands[0])->getParent();
+ for (auto &I : *Parent) {
+ auto *VPI = cast<VPInstruction>(&I);
+ if (VPI->getOpcode() == Instruction::Load &&
llvm::is_contained(Operands, VPI))
- LoadsSeen++;
-
- if (LoadsSeen == Operands.size())
- break;
- if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
- LLVM_DEBUG(
- dbgs() << "VPSLP: instruction modifying memory between loads\n");
- return false;
- }
- }
-
- if (!all_of(Operands, [](VPValue *Op) {
- return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
- ->isSimple();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
- return false;
- }
- }
-
- if (Opcode == Instruction::Store)
- if (!all_of(Operands, [](VPValue *Op) {
- return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
- ->isSimple();
- })) {
- LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
- return false;
- }
-
- return true;
-}
-
-static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
- unsigned OperandIndex) {
- SmallVector<VPValue *, 4> Operands;
- for (VPValue *V : Values) {
+ LoadsSeen++;
+
+ if (LoadsSeen == Operands.size())
+ break;
+ if (LoadsSeen > 0 && VPI->mayWriteToMemory()) {
+ LLVM_DEBUG(
+ dbgs() << "VPSLP: instruction modifying memory between loads\n");
+ return false;
+ }
+ }
+
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<LoadInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple loads are supported.\n");
+ return false;
+ }
+ }
+
+ if (Opcode == Instruction::Store)
+ if (!all_of(Operands, [](VPValue *Op) {
+ return cast<StoreInst>(cast<VPInstruction>(Op)->getUnderlyingInstr())
+ ->isSimple();
+ })) {
+ LLVM_DEBUG(dbgs() << "VPSLP: only simple stores are supported.\n");
+ return false;
+ }
+
+ return true;
+}
+
+static SmallVector<VPValue *, 4> getOperands(ArrayRef<VPValue *> Values,
+ unsigned OperandIndex) {
+ SmallVector<VPValue *, 4> Operands;
+ for (VPValue *V : Values) {
// Currently we only support VPInstructions.
auto *U = cast<VPInstruction>(V);
- Operands.push_back(U->getOperand(OperandIndex));
- }
- return Operands;
-}
-
-static bool areCommutative(ArrayRef<VPValue *> Values) {
- return Instruction::isCommutative(
- cast<VPInstruction>(Values[0])->getOpcode());
-}
-
-static SmallVector<SmallVector<VPValue *, 4>, 4>
-getOperands(ArrayRef<VPValue *> Values) {
- SmallVector<SmallVector<VPValue *, 4>, 4> Result;
- auto *VPI = cast<VPInstruction>(Values[0]);
-
- switch (VPI->getOpcode()) {
- case Instruction::Load:
- llvm_unreachable("Loads terminate a tree, no need to get operands");
- case Instruction::Store:
- Result.push_back(getOperands(Values, 0));
- break;
- default:
- for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
- Result.push_back(getOperands(Values, I));
- break;
- }
-
- return Result;
-}
-
-/// Returns the common opcode of Values, or None if they do not all agree.
-static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
- unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
- if (any_of(Values, [Opcode](VPValue *V) {
- return cast<VPInstruction>(V)->getOpcode() != Opcode;
- }))
- return None;
- return {Opcode};
-}
-
-/// Returns true if A and B are loads or stores accessing consecutive memory,
-/// or are any other instructions with identical opcodes.
-static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
- VPInterleavedAccessInfo &IAI) {
- if (A->getOpcode() != B->getOpcode())
- return false;
-
- if (A->getOpcode() != Instruction::Load &&
- A->getOpcode() != Instruction::Store)
- return true;
- auto *GA = IAI.getInterleaveGroup(A);
- auto *GB = IAI.getInterleaveGroup(B);
-
- return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
-}
-
-/// Implements getLAScore from Listing 7 in the paper.
-/// Traverses and compares operands of V1 and V2 to MaxLevel.
-static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
- VPInterleavedAccessInfo &IAI) {
+ Operands.push_back(U->getOperand(OperandIndex));
+ }
+ return Operands;
+}
+
+static bool areCommutative(ArrayRef<VPValue *> Values) {
+ return Instruction::isCommutative(
+ cast<VPInstruction>(Values[0])->getOpcode());
+}
+
+static SmallVector<SmallVector<VPValue *, 4>, 4>
+getOperands(ArrayRef<VPValue *> Values) {
+ SmallVector<SmallVector<VPValue *, 4>, 4> Result;
+ auto *VPI = cast<VPInstruction>(Values[0]);
+
+ switch (VPI->getOpcode()) {
+ case Instruction::Load:
+ llvm_unreachable("Loads terminate a tree, no need to get operands");
+ case Instruction::Store:
+ Result.push_back(getOperands(Values, 0));
+ break;
+ default:
+ for (unsigned I = 0, NumOps = VPI->getNumOperands(); I < NumOps; ++I)
+ Result.push_back(getOperands(Values, I));
+ break;
+ }
+
+ return Result;
+}
+
+/// Returns the common opcode of Values, or None if they do not all agree.
+static Optional<unsigned> getOpcode(ArrayRef<VPValue *> Values) {
+ unsigned Opcode = cast<VPInstruction>(Values[0])->getOpcode();
+ if (any_of(Values, [Opcode](VPValue *V) {
+ return cast<VPInstruction>(V)->getOpcode() != Opcode;
+ }))
+ return None;
+ return {Opcode};
+}
+
+/// Returns true if A and B are loads or stores accessing consecutive memory,
+/// or are any other instructions with identical opcodes.
+static bool areConsecutiveOrMatch(VPInstruction *A, VPInstruction *B,
+ VPInterleavedAccessInfo &IAI) {
+ if (A->getOpcode() != B->getOpcode())
+ return false;
+
+ if (A->getOpcode() != Instruction::Load &&
+ A->getOpcode() != Instruction::Store)
+ return true;
+ auto *GA = IAI.getInterleaveGroup(A);
+ auto *GB = IAI.getInterleaveGroup(B);
+
+ return GA && GB && GA == GB && GA->getIndex(A) + 1 == GB->getIndex(B);
+}
+
+/// Implements getLAScore from Listing 7 in the paper.
+/// Traverses and compares operands of V1 and V2 to MaxLevel.
+static unsigned getLAScore(VPValue *V1, VPValue *V2, unsigned MaxLevel,
+ VPInterleavedAccessInfo &IAI) {
auto *I1 = dyn_cast<VPInstruction>(V1);
auto *I2 = dyn_cast<VPInstruction>(V2);
// Currently we only support VPInstructions.
if (!I1 || !I2)
- return 0;
-
- if (MaxLevel == 0)
+ return 0;
+
+ if (MaxLevel == 0)
return (unsigned)areConsecutiveOrMatch(I1, I2, IAI);
-
- unsigned Score = 0;
+
+ unsigned Score = 0;
for (unsigned I = 0, EV1 = I1->getNumOperands(); I < EV1; ++I)
for (unsigned J = 0, EV2 = I2->getNumOperands(); J < EV2; ++J)
Score +=
getLAScore(I1->getOperand(I), I2->getOperand(J), MaxLevel - 1, IAI);
- return Score;
-}
-
-std::pair<VPlanSlp::OpMode, VPValue *>
-VPlanSlp::getBest(OpMode Mode, VPValue *Last,
- SmallPtrSetImpl<VPValue *> &Candidates,
- VPInterleavedAccessInfo &IAI) {
- assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
- "Currently we only handle load and commutative opcodes");
- LLVM_DEBUG(dbgs() << " getBest\n");
-
- SmallVector<VPValue *, 4> BestCandidates;
- LLVM_DEBUG(dbgs() << " Candidates for "
- << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
- for (auto *Candidate : Candidates) {
- auto *LastI = cast<VPInstruction>(Last);
- auto *CandidateI = cast<VPInstruction>(Candidate);
- if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
- LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
- << " ");
- BestCandidates.push_back(Candidate);
- }
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- if (BestCandidates.empty())
- return {OpMode::Failed, nullptr};
-
- if (BestCandidates.size() == 1)
- return {Mode, BestCandidates[0]};
-
- VPValue *Best = nullptr;
- unsigned BestScore = 0;
- for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
- unsigned PrevScore = ~0u;
- bool AllSame = true;
-
- // FIXME: Avoid visiting the same operands multiple times.
- for (auto *Candidate : BestCandidates) {
- unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
- if (PrevScore == ~0u)
- PrevScore = Score;
- if (PrevScore != Score)
- AllSame = false;
- PrevScore = Score;
-
- if (Score > BestScore) {
- BestScore = Score;
- Best = Candidate;
- }
- }
- if (!AllSame)
- break;
- }
- LLVM_DEBUG(dbgs() << "Found best "
- << *cast<VPInstruction>(Best)->getUnderlyingInstr()
- << "\n");
- Candidates.erase(Best);
-
- return {Mode, Best};
-}
-
-SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
- SmallVector<MultiNodeOpTy, 4> FinalOrder;
- SmallVector<OpMode, 4> Mode;
- FinalOrder.reserve(MultiNodeOps.size());
- Mode.reserve(MultiNodeOps.size());
-
- LLVM_DEBUG(dbgs() << "Reordering multinode\n");
-
- for (auto &Operands : MultiNodeOps) {
- FinalOrder.push_back({Operands.first, {Operands.second[0]}});
- if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
- Instruction::Load)
- Mode.push_back(OpMode::Load);
- else
- Mode.push_back(OpMode::Opcode);
- }
-
- for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
- LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
- SmallPtrSet<VPValue *, 4> Candidates;
- LLVM_DEBUG(dbgs() << " Candidates ");
- for (auto Ops : MultiNodeOps) {
- LLVM_DEBUG(
- dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
- << " ");
- Candidates.insert(Ops.second[Lane]);
- }
- LLVM_DEBUG(dbgs() << "\n");
-
- for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
- LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
- if (Mode[Op] == OpMode::Failed)
- continue;
-
- VPValue *Last = FinalOrder[Op].second[Lane - 1];
- std::pair<OpMode, VPValue *> Res =
- getBest(Mode[Op], Last, Candidates, IAI);
- if (Res.second)
- FinalOrder[Op].second.push_back(Res.second);
- else
- // TODO: handle this case
- FinalOrder[Op].second.push_back(markFailed());
- }
- }
-
- return FinalOrder;
-}
-
-void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
- dbgs() << " Ops: ";
- for (auto Op : Values) {
- if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
- if (auto *Instr = VPInstr->getUnderlyingInstr()) {
- dbgs() << *Instr << " | ";
- continue;
- }
- dbgs() << " nullptr | ";
- }
- dbgs() << "\n";
-}
-
-VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
- assert(!Values.empty() && "Need some operands!");
-
- // If we already visited this instruction bundle, re-use the existing node
- auto I = BundleToCombined.find(to_vector<4>(Values));
- if (I != BundleToCombined.end()) {
-#ifndef NDEBUG
- // Check that the resulting graph is a tree. If we re-use a node, this means
- // its values have multiple users. We only allow this if all users of each
- // value are the same instruction.
- for (auto *V : Values) {
- auto UI = V->user_begin();
- auto *FirstUser = *UI++;
- while (UI != V->user_end()) {
- assert(*UI == FirstUser && "Currently we only support SLP trees.");
- UI++;
- }
- }
-#endif
- return I->second;
- }
-
- // Dump inputs
- LLVM_DEBUG({
- dbgs() << "buildGraph: ";
- dumpBundle(Values);
- });
-
- if (!areVectorizable(Values))
- return markFailed();
-
- assert(getOpcode(Values) && "Opcodes for all values must match");
- unsigned ValuesOpcode = getOpcode(Values).getValue();
-
- SmallVector<VPValue *, 4> CombinedOperands;
- if (areCommutative(Values)) {
- bool MultiNodeRoot = !MultiNodeActive;
- MultiNodeActive = true;
- for (auto &Operands : getOperands(Values)) {
- LLVM_DEBUG({
- dbgs() << " Visiting Commutative";
- dumpBundle(Operands);
- });
-
- auto OperandsOpcode = getOpcode(Operands);
- if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
- LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
- CombinedOperands.push_back(buildGraph(Operands));
- } else {
- LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
- // Create a dummy VPInstruction, which we will replace later with the
- // re-ordered operand.
- VPInstruction *Op = new VPInstruction(0, {});
- CombinedOperands.push_back(Op);
- MultiNodeOps.emplace_back(Op, Operands);
- }
- }
-
- if (MultiNodeRoot) {
- LLVM_DEBUG(dbgs() << "Reorder \n");
- MultiNodeActive = false;
-
- auto FinalOrder = reorderMultiNodeOps();
-
- MultiNodeOps.clear();
- for (auto &Ops : FinalOrder) {
- VPInstruction *NewOp = buildGraph(Ops.second);
- Ops.first->replaceAllUsesWith(NewOp);
- for (unsigned i = 0; i < CombinedOperands.size(); i++)
- if (CombinedOperands[i] == Ops.first)
- CombinedOperands[i] = NewOp;
- delete Ops.first;
- Ops.first = NewOp;
- }
- LLVM_DEBUG(dbgs() << "Found final order\n");
- }
- } else {
- LLVM_DEBUG(dbgs() << " NonCommuntative\n");
- if (ValuesOpcode == Instruction::Load)
- for (VPValue *V : Values)
- CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
- else
- for (auto &Operands : getOperands(Values))
- CombinedOperands.push_back(buildGraph(Operands));
- }
-
- unsigned Opcode;
- switch (ValuesOpcode) {
- case Instruction::Load:
- Opcode = VPInstruction::SLPLoad;
- break;
- case Instruction::Store:
- Opcode = VPInstruction::SLPStore;
- break;
- default:
- Opcode = ValuesOpcode;
- break;
- }
-
- if (!CompletelySLP)
- return markFailed();
-
- assert(CombinedOperands.size() > 0 && "Need at least one operand");
- auto *VPI = new VPInstruction(Opcode, CombinedOperands);
- VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
-
+ return Score;
+}
+
+std::pair<VPlanSlp::OpMode, VPValue *>
+VPlanSlp::getBest(OpMode Mode, VPValue *Last,
+ SmallPtrSetImpl<VPValue *> &Candidates,
+ VPInterleavedAccessInfo &IAI) {
+ assert((Mode == OpMode::Load || Mode == OpMode::Opcode) &&
+ "Currently we only handle load and commutative opcodes");
+ LLVM_DEBUG(dbgs() << " getBest\n");
+
+ SmallVector<VPValue *, 4> BestCandidates;
+ LLVM_DEBUG(dbgs() << " Candidates for "
+ << *cast<VPInstruction>(Last)->getUnderlyingInstr() << " ");
+ for (auto *Candidate : Candidates) {
+ auto *LastI = cast<VPInstruction>(Last);
+ auto *CandidateI = cast<VPInstruction>(Candidate);
+ if (areConsecutiveOrMatch(LastI, CandidateI, IAI)) {
+ LLVM_DEBUG(dbgs() << *cast<VPInstruction>(Candidate)->getUnderlyingInstr()
+ << " ");
+ BestCandidates.push_back(Candidate);
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (BestCandidates.empty())
+ return {OpMode::Failed, nullptr};
+
+ if (BestCandidates.size() == 1)
+ return {Mode, BestCandidates[0]};
+
+ VPValue *Best = nullptr;
+ unsigned BestScore = 0;
+ for (unsigned Depth = 1; Depth < LookaheadMaxDepth; Depth++) {
+ unsigned PrevScore = ~0u;
+ bool AllSame = true;
+
+ // FIXME: Avoid visiting the same operands multiple times.
+ for (auto *Candidate : BestCandidates) {
+ unsigned Score = getLAScore(Last, Candidate, Depth, IAI);
+ if (PrevScore == ~0u)
+ PrevScore = Score;
+ if (PrevScore != Score)
+ AllSame = false;
+ PrevScore = Score;
+
+ if (Score > BestScore) {
+ BestScore = Score;
+ Best = Candidate;
+ }
+ }
+ if (!AllSame)
+ break;
+ }
+ LLVM_DEBUG(dbgs() << "Found best "
+ << *cast<VPInstruction>(Best)->getUnderlyingInstr()
+ << "\n");
+ Candidates.erase(Best);
+
+ return {Mode, Best};
+}
+
+SmallVector<VPlanSlp::MultiNodeOpTy, 4> VPlanSlp::reorderMultiNodeOps() {
+ SmallVector<MultiNodeOpTy, 4> FinalOrder;
+ SmallVector<OpMode, 4> Mode;
+ FinalOrder.reserve(MultiNodeOps.size());
+ Mode.reserve(MultiNodeOps.size());
+
+ LLVM_DEBUG(dbgs() << "Reordering multinode\n");
+
+ for (auto &Operands : MultiNodeOps) {
+ FinalOrder.push_back({Operands.first, {Operands.second[0]}});
+ if (cast<VPInstruction>(Operands.second[0])->getOpcode() ==
+ Instruction::Load)
+ Mode.push_back(OpMode::Load);
+ else
+ Mode.push_back(OpMode::Opcode);
+ }
+
+ for (unsigned Lane = 1, E = MultiNodeOps[0].second.size(); Lane < E; ++Lane) {
+ LLVM_DEBUG(dbgs() << " Finding best value for lane " << Lane << "\n");
+ SmallPtrSet<VPValue *, 4> Candidates;
+ LLVM_DEBUG(dbgs() << " Candidates ");
+ for (auto Ops : MultiNodeOps) {
+ LLVM_DEBUG(
+ dbgs() << *cast<VPInstruction>(Ops.second[Lane])->getUnderlyingInstr()
+ << " ");
+ Candidates.insert(Ops.second[Lane]);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ for (unsigned Op = 0, E = MultiNodeOps.size(); Op < E; ++Op) {
+ LLVM_DEBUG(dbgs() << " Checking " << Op << "\n");
+ if (Mode[Op] == OpMode::Failed)
+ continue;
+
+ VPValue *Last = FinalOrder[Op].second[Lane - 1];
+ std::pair<OpMode, VPValue *> Res =
+ getBest(Mode[Op], Last, Candidates, IAI);
+ if (Res.second)
+ FinalOrder[Op].second.push_back(Res.second);
+ else
+ // TODO: handle this case
+ FinalOrder[Op].second.push_back(markFailed());
+ }
+ }
+
+ return FinalOrder;
+}
+
+void VPlanSlp::dumpBundle(ArrayRef<VPValue *> Values) {
+ dbgs() << " Ops: ";
+ for (auto Op : Values) {
+ if (auto *VPInstr = cast_or_null<VPInstruction>(Op))
+ if (auto *Instr = VPInstr->getUnderlyingInstr()) {
+ dbgs() << *Instr << " | ";
+ continue;
+ }
+ dbgs() << " nullptr | ";
+ }
+ dbgs() << "\n";
+}
+
+VPInstruction *VPlanSlp::buildGraph(ArrayRef<VPValue *> Values) {
+ assert(!Values.empty() && "Need some operands!");
+
+ // If we already visited this instruction bundle, re-use the existing node
+ auto I = BundleToCombined.find(to_vector<4>(Values));
+ if (I != BundleToCombined.end()) {
+#ifndef NDEBUG
+ // Check that the resulting graph is a tree. If we re-use a node, this means
+ // its values have multiple users. We only allow this if all users of each
+ // value are the same instruction.
+ for (auto *V : Values) {
+ auto UI = V->user_begin();
+ auto *FirstUser = *UI++;
+ while (UI != V->user_end()) {
+ assert(*UI == FirstUser && "Currently we only support SLP trees.");
+ UI++;
+ }
+ }
+#endif
+ return I->second;
+ }
+
+ // Dump inputs
+ LLVM_DEBUG({
+ dbgs() << "buildGraph: ";
+ dumpBundle(Values);
+ });
+
+ if (!areVectorizable(Values))
+ return markFailed();
+
+ assert(getOpcode(Values) && "Opcodes for all values must match");
+ unsigned ValuesOpcode = getOpcode(Values).getValue();
+
+ SmallVector<VPValue *, 4> CombinedOperands;
+ if (areCommutative(Values)) {
+ bool MultiNodeRoot = !MultiNodeActive;
+ MultiNodeActive = true;
+ for (auto &Operands : getOperands(Values)) {
+ LLVM_DEBUG({
+ dbgs() << " Visiting Commutative";
+ dumpBundle(Operands);
+ });
+
+ auto OperandsOpcode = getOpcode(Operands);
+ if (OperandsOpcode && OperandsOpcode == getOpcode(Values)) {
+ LLVM_DEBUG(dbgs() << " Same opcode, continue building\n");
+ CombinedOperands.push_back(buildGraph(Operands));
+ } else {
+ LLVM_DEBUG(dbgs() << " Adding multinode Ops\n");
+ // Create a dummy VPInstruction, which we will replace later with the
+ // re-ordered operand.
+ VPInstruction *Op = new VPInstruction(0, {});
+ CombinedOperands.push_back(Op);
+ MultiNodeOps.emplace_back(Op, Operands);
+ }
+ }
+
+ if (MultiNodeRoot) {
+ LLVM_DEBUG(dbgs() << "Reorder \n");
+ MultiNodeActive = false;
+
+ auto FinalOrder = reorderMultiNodeOps();
+
+ MultiNodeOps.clear();
+ for (auto &Ops : FinalOrder) {
+ VPInstruction *NewOp = buildGraph(Ops.second);
+ Ops.first->replaceAllUsesWith(NewOp);
+ for (unsigned i = 0; i < CombinedOperands.size(); i++)
+ if (CombinedOperands[i] == Ops.first)
+ CombinedOperands[i] = NewOp;
+ delete Ops.first;
+ Ops.first = NewOp;
+ }
+ LLVM_DEBUG(dbgs() << "Found final order\n");
+ }
+ } else {
+ LLVM_DEBUG(dbgs() << " NonCommuntative\n");
+ if (ValuesOpcode == Instruction::Load)
+ for (VPValue *V : Values)
+ CombinedOperands.push_back(cast<VPInstruction>(V)->getOperand(0));
+ else
+ for (auto &Operands : getOperands(Values))
+ CombinedOperands.push_back(buildGraph(Operands));
+ }
+
+ unsigned Opcode;
+ switch (ValuesOpcode) {
+ case Instruction::Load:
+ Opcode = VPInstruction::SLPLoad;
+ break;
+ case Instruction::Store:
+ Opcode = VPInstruction::SLPStore;
+ break;
+ default:
+ Opcode = ValuesOpcode;
+ break;
+ }
+
+ if (!CompletelySLP)
+ return markFailed();
+
+ assert(CombinedOperands.size() > 0 && "Need at least one operand");
+ auto *VPI = new VPInstruction(Opcode, CombinedOperands);
+ VPI->setUnderlyingInstr(cast<VPInstruction>(Values[0])->getUnderlyingInstr());
+
LLVM_DEBUG(dbgs() << "Create VPInstruction " << *VPI << " "
<< *cast<VPInstruction>(Values[0]) << "\n");
- addCombined(Values, VPI);
- return VPI;
-}
+ addCombined(Values, VPI);
+ return VPI;
+}
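The recursive look-ahead scoring in getLAScore above is easier to see on a toy node type. Here is a
hedged sketch with LANode standing in for VPInstruction; the struct and the opcode-equality base
case are simplifying assumptions made only for illustration.

#include <vector>

// Toy node: an opcode plus operand links, standing in for VPInstruction.
struct LANode {
  unsigned Opcode;
  std::vector<const LANode *> Ops;
};

// Score 1 when two nodes match at the bottom of the look-ahead window;
// otherwise recurse over every operand pair and accumulate the scores,
// mirroring the shape of getLAScore above.
static unsigned lookAheadScore(const LANode *A, const LANode *B,
                               unsigned MaxLevel) {
  if (!A || !B)
    return 0;
  if (MaxLevel == 0)
    return A->Opcode == B->Opcode ? 1u : 0u;
  unsigned Score = 0;
  for (const LANode *OpA : A->Ops)
    for (const LANode *OpB : B->Ops)
      Score += lookAheadScore(OpA, OpB, MaxLevel - 1);
  return Score;
}

getBest then prefers the candidate with the highest such score, probing one level deeper
(up to LookaheadMaxDepth) while the candidates remain tied.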
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 640ca7160b..1a54603faf 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1,91 +1,91 @@
-//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file implements a set of utility VPlan to VPlan transformations.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanTransforms.h"
-#include "llvm/ADT/PostOrderIterator.h"
-
-using namespace llvm;
-
-void VPlanTransforms::VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
-
- auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
- ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
-
- // Condition bit VPValues get deleted during transformation to VPRecipes.
- // Create new VPValues and save away as condition bits. These will be deleted
- // after finalizing the vector IR basic blocks.
- for (VPBlockBase *Base : RPOT) {
- VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- if (auto *CondBit = VPBB->getCondBit()) {
- auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
- VPBB->setCondBit(NCondBit);
- Plan->addCBV(NCondBit);
- }
- }
- for (VPBlockBase *Base : RPOT) {
- // Do not widen instructions in pre-header and exit blocks.
- if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
- continue;
-
- VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- // Introduce each ingredient into VPlan.
- for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
- VPRecipeBase *Ingredient = &*I++;
- // Can only handle VPInstructions.
- VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
- Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
- if (DeadInstructions.count(Inst)) {
+//===-- VPlanTransforms.cpp - Utility VPlan to VPlan transforms -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a set of utility VPlan to VPlan transformations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanTransforms.h"
+#include "llvm/ADT/PostOrderIterator.h"
+
+using namespace llvm;
+
+void VPlanTransforms::VPInstructionsToVPRecipes(
+ Loop *OrigLoop, VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList &Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+
+ auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
+ ReversePostOrderTraversal<VPBlockBase *> RPOT(TopRegion->getEntry());
+
+ // Condition bit VPValues get deleted during transformation to VPRecipes.
+ // Create new VPValues and save them away as condition bits. These will be deleted
+ // after finalizing the vector IR basic blocks.
+ for (VPBlockBase *Base : RPOT) {
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ if (auto *CondBit = VPBB->getCondBit()) {
+ auto *NCondBit = new VPValue(CondBit->getUnderlyingValue());
+ VPBB->setCondBit(NCondBit);
+ Plan->addCBV(NCondBit);
+ }
+ }
+ for (VPBlockBase *Base : RPOT) {
+ // Do not widen instructions in pre-header and exit blocks.
+ if (Base->getNumPredecessors() == 0 || Base->getNumSuccessors() == 0)
+ continue;
+
+ VPBasicBlock *VPBB = Base->getEntryBasicBlock();
+ // Introduce each ingredient into VPlan.
+ for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
+ VPRecipeBase *Ingredient = &*I++;
+ // Can only handle VPInstructions.
+ VPInstruction *VPInst = cast<VPInstruction>(Ingredient);
+ Instruction *Inst = cast<Instruction>(VPInst->getUnderlyingValue());
+ if (DeadInstructions.count(Inst)) {
VPValue DummyValue;
VPInst->replaceAllUsesWith(&DummyValue);
- Ingredient->eraseFromParent();
- continue;
- }
-
- VPRecipeBase *NewRecipe = nullptr;
- // Create VPWidenMemoryInstructionRecipe for loads and stores.
- if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
- NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- nullptr /*Mask*/);
- else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
- NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
- Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
- else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
- InductionDescriptor II = Inductions.lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction) {
+ Ingredient->eraseFromParent();
+ continue;
+ }
+
+ VPRecipeBase *NewRecipe = nullptr;
+ // Create VPWidenMemoryInstructionRecipe for loads and stores.
+ if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ nullptr /*Mask*/);
+ else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
+ else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
+ InductionDescriptor II = Inductions.lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction) {
VPValue *Start = Plan->getOrAddVPValue(II.getStartValue());
NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start);
- } else
- NewRecipe = new VPWidenPHIRecipe(Phi);
- } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
- NewRecipe = new VPWidenGEPRecipe(
- GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
- } else
- NewRecipe =
- new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
-
- NewRecipe->insertBefore(Ingredient);
+ } else
+ NewRecipe = new VPWidenPHIRecipe(Phi);
+ } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
+ NewRecipe = new VPWidenGEPRecipe(
+ GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+ } else
+ NewRecipe =
+ new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
+
+ NewRecipe->insertBefore(Ingredient);
if (NewRecipe->getNumDefinedValues() == 1)
VPInst->replaceAllUsesWith(NewRecipe->getVPValue());
else
assert(NewRecipe->getNumDefinedValues() == 0 &&
"Only recpies with zero or one defined values expected");
- Ingredient->eraseFromParent();
- }
- }
-}
+ Ingredient->eraseFromParent();
+ }
+ }
+}
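To summarize the dispatch performed by VPInstructionsToVPRecipes above, the following sketch
classifies an ingredient's IR instruction into the kind of widening recipe that replaces it.
RecipeKind is an illustrative enum, not part of the transform; the real code constructs the
VPWiden*Recipe objects directly.

#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

// Recipe kinds in the order the transform above checks for them.
enum class RecipeKind { Memory, IntOrFpInduction, Phi, GEP, Widen };

static RecipeKind classifyIngredient(const llvm::Instruction &I,
                                     bool IsIntOrFpInduction) {
  using namespace llvm;
  if (isa<LoadInst>(I) || isa<StoreInst>(I))
    return RecipeKind::Memory;                 // VPWidenMemoryInstructionRecipe
  if (isa<PHINode>(I))
    return IsIntOrFpInduction
               ? RecipeKind::IntOrFpInduction  // VPWidenIntOrFpInductionRecipe
               : RecipeKind::Phi;              // VPWidenPHIRecipe
  if (isa<GetElementPtrInst>(I))
    return RecipeKind::GEP;                    // VPWidenGEPRecipe
  return RecipeKind::Widen;                    // VPWidenRecipe
}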
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
index 15cc7d355f..4b20e8b4e3 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -1,33 +1,33 @@
-//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file provides utility VPlan to VPlan transformations.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
-
-#include "VPlan.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
-
-namespace llvm {
-
-struct VPlanTransforms {
- /// Replaces the VPInstructions in \p Plan with corresponding
- /// widen recipes.
- static void VPInstructionsToVPRecipes(
- Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList &Inductions,
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+//===- VPlanTransforms.h - Utility VPlan to VPlan transforms --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file provides utility VPlan to VPlan transformations.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
+
+#include "VPlan.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
+
+namespace llvm {
+
+struct VPlanTransforms {
+ /// Replaces the VPInstructions in \p Plan with corresponding
+ /// widen recipes.
+ static void VPInstructionsToVPRecipes(
+ Loop *OrigLoop, VPlanPtr &Plan,
+ LoopVectorizationLegality::InductionList &Inductions,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANTRANSFORMS_H
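A hypothetical call site for the interface declared above; the arguments are assumed to be supplied
by the loop-vectorizer driver and are only forwarded here, so nothing beyond the declared signature
is relied upon.

#include "VPlanTransforms.h"

// Replace the plain VPInstructions in Plan with widening recipes in place.
static void lowerToRecipes(
    llvm::Loop *OrigLoop, llvm::VPlanPtr &Plan,
    llvm::LoopVectorizationLegality::InductionList &Inductions,
    llvm::SmallPtrSetImpl<llvm::Instruction *> &DeadInstructions) {
  llvm::VPlanTransforms::VPInstructionsToVPRecipes(OrigLoop, Plan, Inductions,
                                                   DeadInstructions);
}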
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
index dbf04d3707..ed572ca366 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanValue.h
@@ -1,93 +1,93 @@
-//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file contains the declarations of the entities induced by Vectorization
-/// Plans, e.g. the instructions the VPlan intends to generate if executed.
-/// VPlan models the following entities:
+//===- VPlanValue.h - Represent Values in Vectorizer Plan -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the declarations of the entities induced by Vectorization
+/// Plans, e.g. the instructions the VPlan intends to generate if executed.
+/// VPlan models the following entities:
/// VPValue VPUser VPDef
///    |      |
///   VPInstruction
-/// These are documented in docs/VectorizationPlan.rst.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
-
-#include "llvm/ADT/DenseMap.h"
+/// These are documented in docs/VectorizationPlan.rst.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/TinyPtrVector.h"
-#include "llvm/ADT/iterator_range.h"
-
-namespace llvm {
-
-// Forward declarations.
-class raw_ostream;
-class Value;
+#include "llvm/ADT/iterator_range.h"
+
+namespace llvm {
+
+// Forward declarations.
+class raw_ostream;
+class Value;
class VPDef;
-class VPSlotTracker;
-class VPUser;
+class VPSlotTracker;
+class VPUser;
class VPRecipeBase;
class VPWidenMemoryInstructionRecipe;
-
-// This is the base class of the VPlan Def/Use graph, used for modeling the data
-// flow into, within and out of the VPlan. VPValues can stand for live-ins
-// coming from the input IR, instructions which VPlan will generate if executed
-// and live-outs which the VPlan will need to fix accordingly.
-class VPValue {
- friend class VPBuilder;
+
+// This is the base class of the VPlan Def/Use graph, used for modeling the data
+// flow into, within and out of the VPlan. VPValues can stand for live-ins
+// coming from the input IR, instructions which VPlan will generate if executed
+// and live-outs which the VPlan will need to fix accordingly.
+class VPValue {
+ friend class VPBuilder;
friend class VPDef;
friend class VPInstruction;
- friend struct VPlanTransforms;
- friend class VPBasicBlock;
- friend class VPInterleavedAccessInfo;
- friend class VPSlotTracker;
+ friend struct VPlanTransforms;
+ friend class VPBasicBlock;
+ friend class VPInterleavedAccessInfo;
+ friend class VPSlotTracker;
friend class VPRecipeBase;
friend class VPWidenMemoryInstructionRecipe;
-
- const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
-
- SmallVector<VPUser *, 1> Users;
-
-protected:
- // Hold the underlying Value, if any, attached to this VPValue.
- Value *UnderlyingVal;
-
+
+ const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
+
+ SmallVector<VPUser *, 1> Users;
+
+protected:
+ // Hold the underlying Value, if any, attached to this VPValue.
+ Value *UnderlyingVal;
+
/// Pointer to the VPDef that defines this VPValue. If it is nullptr, the
/// VPValue is not defined by any recipe modeled in VPlan.
VPDef *Def;
-
+
VPValue(const unsigned char SC, Value *UV = nullptr, VPDef *Def = nullptr);
- // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
- // the front-end and back-end of VPlan so that the middle-end is as
- // independent as possible of the underlying IR. We grant access to the
- // underlying IR using friendship. In that way, we should be able to use VPlan
- // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
- // back-end and analysis information for the new IR.
-
- // Set \p Val as the underlying Value of this VPValue.
- void setUnderlyingValue(Value *Val) {
- assert(!UnderlyingVal && "Underlying Value is already set.");
- UnderlyingVal = Val;
- }
-
-public:
+ // DESIGN PRINCIPLE: Access to the underlying IR must be strictly limited to
+ // the front-end and back-end of VPlan so that the middle-end is as
+ // independent as possible of the underlying IR. We grant access to the
+ // underlying IR using friendship. In that way, we should be able to use VPlan
+ // for multiple underlying IRs (Polly?) by providing a new VPlan front-end,
+ // back-end and analysis information for the new IR.
+
+ // Set \p Val as the underlying Value of this VPValue.
+ void setUnderlyingValue(Value *Val) {
+ assert(!UnderlyingVal && "Underlying Value is already set.");
+ UnderlyingVal = Val;
+ }
+
+public:
/// Return the underlying Value attached to this VPValue.
Value *getUnderlyingValue() { return UnderlyingVal; }
const Value *getUnderlyingValue() const { return UnderlyingVal; }
- /// An enumeration for keeping track of the concrete subclass of VPValue that
- /// are actually instantiated. Values of this enumeration are kept in the
- /// SubclassID field of the VPValue objects. They are used for concrete
- /// type identification.
+ /// An enumeration for keeping track of the concrete subclass of VPValue that
+ /// are actually instantiated. Values of this enumeration are kept in the
+ /// SubclassID field of the VPValue objects. They are used for concrete
+ /// type identification.
enum {
VPValueSC,
VPVInstructionSC,
@@ -99,28 +99,28 @@ public:
VPVWidenGEPSC,
VPVWidenSelectSC,
};
-
+
VPValue(Value *UV = nullptr, VPDef *Def = nullptr)
: VPValue(VPValueSC, UV, Def) {}
- VPValue(const VPValue &) = delete;
- VPValue &operator=(const VPValue &) = delete;
-
+ VPValue(const VPValue &) = delete;
+ VPValue &operator=(const VPValue &) = delete;
+
virtual ~VPValue();
- /// \return an ID for the concrete type of this object.
- /// This is used to implement the classof checks. This should not be used
- /// for any other purpose, as the values may change as LLVM evolves.
- unsigned getVPValueID() const { return SubclassID; }
-
- void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
- void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
-
+ /// \return an ID for the concrete type of this object.
+ /// This is used to implement the classof checks. This should not be used
+ /// for any other purpose, as the values may change as LLVM evolves.
+ unsigned getVPValueID() const { return SubclassID; }
+
+ void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
+ void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
+
/// Dump the value to stderr (for debugging).
void dump() const;
- unsigned getNumUsers() const { return Users.size(); }
- void addUser(VPUser &User) { Users.push_back(&User); }
-
+ unsigned getNumUsers() const { return Users.size(); }
+ void addUser(VPUser &User) { Users.push_back(&User); }
+
/// Remove a single \p User from the list of users.
void removeUser(VPUser &User) {
bool Found = false;
@@ -137,33 +137,33 @@ public:
});
}
- typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
- typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
- typedef iterator_range<user_iterator> user_range;
- typedef iterator_range<const_user_iterator> const_user_range;
-
- user_iterator user_begin() { return Users.begin(); }
- const_user_iterator user_begin() const { return Users.begin(); }
- user_iterator user_end() { return Users.end(); }
- const_user_iterator user_end() const { return Users.end(); }
- user_range users() { return user_range(user_begin(), user_end()); }
- const_user_range users() const {
- return const_user_range(user_begin(), user_end());
- }
-
- /// Returns true if the value has more than one unique user.
- bool hasMoreThanOneUniqueUser() {
- if (getNumUsers() == 0)
- return false;
-
- // Check if all users match the first user.
- auto Current = std::next(user_begin());
- while (Current != user_end() && *user_begin() == *Current)
- Current++;
- return Current != user_end();
- }
-
- void replaceAllUsesWith(VPValue *New);
+ typedef SmallVectorImpl<VPUser *>::iterator user_iterator;
+ typedef SmallVectorImpl<VPUser *>::const_iterator const_user_iterator;
+ typedef iterator_range<user_iterator> user_range;
+ typedef iterator_range<const_user_iterator> const_user_range;
+
+ user_iterator user_begin() { return Users.begin(); }
+ const_user_iterator user_begin() const { return Users.begin(); }
+ user_iterator user_end() { return Users.end(); }
+ const_user_iterator user_end() const { return Users.end(); }
+ user_range users() { return user_range(user_begin(), user_end()); }
+ const_user_range users() const {
+ return const_user_range(user_begin(), user_end());
+ }
+
+ /// Returns true if the value has more than one unique user.
+ bool hasMoreThanOneUniqueUser() {
+ if (getNumUsers() == 0)
+ return false;
+
+ // Check if all users match the first user.
+ auto Current = std::next(user_begin());
+ while (Current != user_end() && *user_begin() == *Current)
+ Current++;
+ return Current != user_end();
+ }
+
+ void replaceAllUsesWith(VPValue *New);
VPDef *getDef() { return Def; }
@@ -175,77 +175,77 @@ public:
"VPValue is not a live-in; it is defined by a VPDef inside a VPlan");
return getUnderlyingValue();
}
-};
-
-typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
-typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
-
-raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
-
-/// This class augments VPValue with operands which provide the inverse def-use
-/// edges from VPValue's users to their defs.
+};
+
+typedef DenseMap<Value *, VPValue *> Value2VPValueTy;
+typedef DenseMap<VPValue *, Value *> VPValue2ValueTy;
+
+raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
+
+/// This class augments VPValue with operands which provide the inverse def-use
+/// edges from VPValue's users to their defs.
class VPUser {
- SmallVector<VPValue *, 2> Operands;
-
-protected:
+ SmallVector<VPValue *, 2> Operands;
+
+protected:
/// Print the operands to \p O.
void printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const;
public:
VPUser() {}
VPUser(ArrayRef<VPValue *> Operands) {
- for (VPValue *Operand : Operands)
- addOperand(Operand);
- }
-
- VPUser(std::initializer_list<VPValue *> Operands)
- : VPUser(ArrayRef<VPValue *>(Operands)) {}
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+ VPUser(std::initializer_list<VPValue *> Operands)
+ : VPUser(ArrayRef<VPValue *>(Operands)) {}
template <typename IterT> VPUser(iterator_range<IterT> Operands) {
- for (VPValue *Operand : Operands)
- addOperand(Operand);
- }
-
- VPUser(const VPUser &) = delete;
- VPUser &operator=(const VPUser &) = delete;
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
+ VPUser(const VPUser &) = delete;
+ VPUser &operator=(const VPUser &) = delete;
virtual ~VPUser() {
for (VPValue *Op : operands())
Op->removeUser(*this);
- }
-
- void addOperand(VPValue *Operand) {
- Operands.push_back(Operand);
- Operand->addUser(*this);
- }
-
- unsigned getNumOperands() const { return Operands.size(); }
- inline VPValue *getOperand(unsigned N) const {
- assert(N < Operands.size() && "Operand index out of bounds");
- return Operands[N];
- }
-
+ }
+
+ void addOperand(VPValue *Operand) {
+ Operands.push_back(Operand);
+ Operand->addUser(*this);
+ }
+
+ unsigned getNumOperands() const { return Operands.size(); }
+ inline VPValue *getOperand(unsigned N) const {
+ assert(N < Operands.size() && "Operand index out of bounds");
+ return Operands[N];
+ }
+
void setOperand(unsigned I, VPValue *New) {
Operands[I]->removeUser(*this);
Operands[I] = New;
New->addUser(*this);
}
-
- typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
- typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
- typedef iterator_range<operand_iterator> operand_range;
- typedef iterator_range<const_operand_iterator> const_operand_range;
-
- operand_iterator op_begin() { return Operands.begin(); }
- const_operand_iterator op_begin() const { return Operands.begin(); }
- operand_iterator op_end() { return Operands.end(); }
- const_operand_iterator op_end() const { return Operands.end(); }
- operand_range operands() { return operand_range(op_begin(), op_end()); }
- const_operand_range operands() const {
- return const_operand_range(op_begin(), op_end());
- }
+
+ typedef SmallVectorImpl<VPValue *>::iterator operand_iterator;
+ typedef SmallVectorImpl<VPValue *>::const_iterator const_operand_iterator;
+ typedef iterator_range<operand_iterator> operand_range;
+ typedef iterator_range<const_operand_iterator> const_operand_range;
+
+ operand_iterator op_begin() { return Operands.begin(); }
+ const_operand_iterator op_begin() const { return Operands.begin(); }
+ operand_iterator op_end() { return Operands.end(); }
+ const_operand_iterator op_end() const { return Operands.end(); }
+ operand_range operands() { return operand_range(op_begin(), op_end()); }
+ const_operand_range operands() const {
+ return const_operand_range(op_begin(), op_end());
+ }
/// Method to support type inquiry through isa, cast, and dyn_cast.
static inline bool classof(const VPDef *Recipe);
-};
+};
/// This class augments a recipe with a set of VPValues defined by the recipe.
/// It allows recipes to define zero, one or multiple VPValues. A VPDef owns
@@ -346,38 +346,38 @@ public:
VPSlotTracker &SlotTracker) const = 0;
};
-class VPlan;
-class VPBasicBlock;
-class VPRegionBlock;
-
-/// This class can be used to assign consecutive numbers to all VPValues in a
-/// VPlan and allows querying the numbering for printing, similar to the
-/// ModuleSlotTracker for IR values.
-class VPSlotTracker {
- DenseMap<const VPValue *, unsigned> Slots;
- unsigned NextSlot = 0;
-
- void assignSlots(const VPBlockBase *VPBB);
- void assignSlots(const VPRegionBlock *Region);
- void assignSlots(const VPBasicBlock *VPBB);
- void assignSlot(const VPValue *V);
-
- void assignSlots(const VPlan &Plan);
-
-public:
+class VPlan;
+class VPBasicBlock;
+class VPRegionBlock;
+
+/// This class can be used to assign consecutive numbers to all VPValues in a
+/// VPlan and allows querying the numbering for printing, similar to the
+/// ModuleSlotTracker for IR values.
+class VPSlotTracker {
+ DenseMap<const VPValue *, unsigned> Slots;
+ unsigned NextSlot = 0;
+
+ void assignSlots(const VPBlockBase *VPBB);
+ void assignSlots(const VPRegionBlock *Region);
+ void assignSlots(const VPBasicBlock *VPBB);
+ void assignSlot(const VPValue *V);
+
+ void assignSlots(const VPlan &Plan);
+
+public:
VPSlotTracker(const VPlan *Plan = nullptr) {
- if (Plan)
- assignSlots(*Plan);
- }
-
- unsigned getSlot(const VPValue *V) const {
- auto I = Slots.find(V);
- if (I == Slots.end())
- return -1;
- return I->second;
- }
-};
-
-} // namespace llvm
-
-#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
+ if (Plan)
+ assignSlots(*Plan);
+ }
+
+ unsigned getSlot(const VPValue *V) const {
+ auto I = Slots.find(V);
+ if (I == Slots.end())
+ return -1;
+ return I->second;
+ }
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H
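The check documented in VPValue::hasMoreThanOneUniqueUser above can be restated over a plain
container. A standalone sketch, not using the VPlan classes:

#include <vector>

// A value has more than one *unique* user exactly when some entry in its user
// list differs from the first one; repeated uses by the same user do not count.
template <typename UserT>
bool hasMoreThanOneUniqueUserSketch(const std::vector<UserT *> &Users) {
  if (Users.empty())
    return false;
  for (UserT *U : Users)
    if (U != Users.front())
      return true;
  return false;
}

A value used three times by the same VPUser therefore still counts as having a single unique user,
which is what allows VPlanSlp::buildGraph to re-use a combined node for repeated bundles.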
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
index 7a602fb146..6eec8d14de 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -1,130 +1,130 @@
-//===-- VPlanVerifier.cpp -------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file defines the class VPlanVerifier, which contains utility functions
-/// to check the consistency and invariants of a VPlan.
-///
-//===----------------------------------------------------------------------===//
-
-#include "VPlanVerifier.h"
-#include "VPlan.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Support/CommandLine.h"
-
-#define DEBUG_TYPE "loop-vectorize"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
- cl::Hidden,
- cl::desc("Verify VPlan H-CFG."));
-
-#ifndef NDEBUG
-/// Utility function that checks whether \p VPBlockVec has duplicate
-/// VPBlockBases.
-static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
- SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
- for (const auto *Block : VPBlockVec) {
- if (VPBlockSet.count(Block))
- return true;
- VPBlockSet.insert(Block);
- }
- return false;
-}
-#endif
-
-/// Helper function that verifies the CFG invariants of the VPBlockBases within
-/// \p Region. Checks in this function are generic for VPBlockBases. They are
-/// not specific for VPBasicBlocks or VPRegionBlocks.
-static void verifyBlocksInRegion(const VPRegionBlock *Region) {
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
- // Check block's parent.
- assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
-
- // Check block's condition bit.
- if (VPB->getNumSuccessors() > 1)
- assert(VPB->getCondBit() && "Missing condition bit!");
- else
- assert(!VPB->getCondBit() && "Unexpected condition bit!");
-
- // Check block's successors.
- const auto &Successors = VPB->getSuccessors();
- // There must be only one instance of a successor in block's successor list.
- // TODO: This won't work for switch statements.
- assert(!hasDuplicates(Successors) &&
- "Multiple instances of the same successor.");
-
- for (const VPBlockBase *Succ : Successors) {
- // There must be a bi-directional link between block and successor.
- const auto &SuccPreds = Succ->getPredecessors();
+//===-- VPlanVerifier.cpp -------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class VPlanVerifier, which contains utility functions
+/// to check the consistency and invariants of a VPlan.
+///
+//===----------------------------------------------------------------------===//
+
+#include "VPlanVerifier.h"
+#include "VPlan.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Support/CommandLine.h"
+
+#define DEBUG_TYPE "loop-vectorize"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableHCFGVerifier("vplan-verify-hcfg", cl::init(false),
+ cl::Hidden,
+ cl::desc("Verify VPlan H-CFG."));
+
+#ifndef NDEBUG
+/// Utility function that checks whether \p VPBlockVec has duplicate
+/// VPBlockBases.
+static bool hasDuplicates(const SmallVectorImpl<VPBlockBase *> &VPBlockVec) {
+ SmallDenseSet<const VPBlockBase *, 8> VPBlockSet;
+ for (const auto *Block : VPBlockVec) {
+ if (VPBlockSet.count(Block))
+ return true;
+ VPBlockSet.insert(Block);
+ }
+ return false;
+}
+#endif
+
+/// Helper function that verifies the CFG invariants of the VPBlockBases within
+/// \p Region. Checks in this function are generic for VPBlockBases. They are
+/// not specific for VPBasicBlocks or VPRegionBlocks.
+static void verifyBlocksInRegion(const VPRegionBlock *Region) {
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ // Check block's parent.
+ assert(VPB->getParent() == Region && "VPBlockBase has wrong parent");
+
+ // Check block's condition bit.
+ if (VPB->getNumSuccessors() > 1)
+ assert(VPB->getCondBit() && "Missing condition bit!");
+ else
+ assert(!VPB->getCondBit() && "Unexpected condition bit!");
+
+ // Check block's successors.
+ const auto &Successors = VPB->getSuccessors();
+ // There must be only one instance of a successor in block's successor list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Successors) &&
+ "Multiple instances of the same successor.");
+
+ for (const VPBlockBase *Succ : Successors) {
+ // There must be a bi-directional link between block and successor.
+ const auto &SuccPreds = Succ->getPredecessors();
assert(llvm::is_contained(SuccPreds, VPB) && "Missing predecessor link.");
- (void)SuccPreds;
- }
-
- // Check block's predecessors.
- const auto &Predecessors = VPB->getPredecessors();
- // There must be only one instance of a predecessor in block's predecessor
- // list.
- // TODO: This won't work for switch statements.
- assert(!hasDuplicates(Predecessors) &&
- "Multiple instances of the same predecessor.");
-
- for (const VPBlockBase *Pred : Predecessors) {
- // Block and predecessor must be inside the same region.
- assert(Pred->getParent() == VPB->getParent() &&
- "Predecessor is not in the same region.");
-
- // There must be a bi-directional link between block and predecessor.
- const auto &PredSuccs = Pred->getSuccessors();
+ (void)SuccPreds;
+ }
+
+ // Check block's predecessors.
+ const auto &Predecessors = VPB->getPredecessors();
+ // There must be only one instance of a predecessor in block's predecessor
+ // list.
+ // TODO: This won't work for switch statements.
+ assert(!hasDuplicates(Predecessors) &&
+ "Multiple instances of the same predecessor.");
+
+ for (const VPBlockBase *Pred : Predecessors) {
+ // Block and predecessor must be inside the same region.
+ assert(Pred->getParent() == VPB->getParent() &&
+ "Predecessor is not in the same region.");
+
+ // There must be a bi-directional link between block and predecessor.
+ const auto &PredSuccs = Pred->getSuccessors();
assert(llvm::is_contained(PredSuccs, VPB) && "Missing successor link.");
- (void)PredSuccs;
- }
- }
-}
-
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
-static void verifyRegion(const VPRegionBlock *Region) {
- const VPBlockBase *Entry = Region->getEntry();
- const VPBlockBase *Exit = Region->getExit();
-
- // Entry and Exit shouldn't have any predecessor/successor, respectively.
- assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
- assert(!Exit->getNumSuccessors() && "Region exit has successors.");
- (void)Entry;
- (void)Exit;
-
- verifyBlocksInRegion(Region);
-}
-
-/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
-/// VPBlockBases. Recurse inside nested VPRegionBlocks.
-static void verifyRegionRec(const VPRegionBlock *Region) {
- verifyRegion(Region);
-
- // Recurse inside nested regions.
- for (const VPBlockBase *VPB :
- make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
- df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
- if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
- verifyRegionRec(SubRegion);
- }
-}
-
-void VPlanVerifier::verifyHierarchicalCFG(
- const VPRegionBlock *TopRegion) const {
- if (!EnableHCFGVerifier)
- return;
-
- LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
- assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
- verifyRegionRec(TopRegion);
-}
+ (void)PredSuccs;
+ }
+ }
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Do not recurse inside nested VPRegionBlocks.
+static void verifyRegion(const VPRegionBlock *Region) {
+ const VPBlockBase *Entry = Region->getEntry();
+ const VPBlockBase *Exit = Region->getExit();
+
+ // Entry and Exit shouldn't have any predecessor/successor, respectively.
+ assert(!Entry->getNumPredecessors() && "Region entry has predecessors.");
+ assert(!Exit->getNumSuccessors() && "Region exit has successors.");
+ (void)Entry;
+ (void)Exit;
+
+ verifyBlocksInRegion(Region);
+}
+
+/// Verify the CFG invariants of VPRegionBlock \p Region and its nested
+/// VPBlockBases. Recurse inside nested VPRegionBlocks.
+static void verifyRegionRec(const VPRegionBlock *Region) {
+ verifyRegion(Region);
+
+ // Recurse inside nested regions.
+ for (const VPBlockBase *VPB :
+ make_range(df_iterator<const VPBlockBase *>::begin(Region->getEntry()),
+ df_iterator<const VPBlockBase *>::end(Region->getExit()))) {
+ if (const auto *SubRegion = dyn_cast<VPRegionBlock>(VPB))
+ verifyRegionRec(SubRegion);
+ }
+}
+
+void VPlanVerifier::verifyHierarchicalCFG(
+ const VPRegionBlock *TopRegion) const {
+ if (!EnableHCFGVerifier)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Verifying VPlan H-CFG.\n");
+ assert(!TopRegion->getParent() && "VPlan Top Region should have no parent.");
+ verifyRegionRec(TopRegion);
+}
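
The verifier restored above asserts, block by block, that successor/predecessor lists contain no duplicates and that every edge is recorded on both of its endpoints. A rough standalone sketch of those two checks on a toy adjacency-list block type follows; the Block struct is a made-up stand-in for VPBlockBase, not LLVM code.

// Sketch of the "no duplicate successors/predecessors" and "bi-directional
// link" invariants checked by VPlanVerifier, on a toy block type.
#include <algorithm>
#include <cassert>
#include <unordered_set>
#include <vector>

struct Block {
  std::vector<Block *> Succs;
  std::vector<Block *> Preds;
};

static bool hasDuplicates(const std::vector<Block *> &Vec) {
  std::unordered_set<const Block *> Seen;
  for (const Block *B : Vec)
    if (!Seen.insert(B).second)
      return true;
  return false;
}

static void verifyBlock(const Block *B) {
  assert(!hasDuplicates(B->Succs) && "Multiple instances of the same successor.");
  assert(!hasDuplicates(B->Preds) && "Multiple instances of the same predecessor.");
  // Every successor must list B as a predecessor, and vice versa.
  for (const Block *Succ : B->Succs)
    assert(std::find(Succ->Preds.begin(), Succ->Preds.end(), B) !=
               Succ->Preds.end() && "Missing predecessor link.");
  for (const Block *Pred : B->Preds)
    assert(std::find(Pred->Succs.begin(), Pred->Succs.end(), B) !=
               Pred->Succs.end() && "Missing successor link.");
}

int main() {
  Block A, B;
  A.Succs.push_back(&B);
  B.Preds.push_back(&A); // bi-directional link is in place
  verifyBlock(&A);
  verifyBlock(&B);
  return 0;
}
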
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
index 75a92a8d12..8e8de44164 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -1,41 +1,41 @@
-//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file declares the class VPlanVerifier, which contains utility functions
-/// to check the consistency of a VPlan. This includes the following kinds of
-/// invariants:
-///
-/// 1. Region/Block invariants:
-/// - Region's entry/exit block must have no predecessors/successors,
-/// respectively.
-/// - Block's parent must be the region immediately containing the block.
-/// - Linked blocks must have a bi-directional link (successor/predecessor).
-/// - All predecessors/successors of a block must belong to the same region.
-/// - Blocks must have no duplicated successor/predecessor.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-
-namespace llvm {
-class VPRegionBlock;
-
-/// Struct with utility functions that can be used to check the consistency and
-/// invariants of a VPlan, including the components of its H-CFG.
-struct VPlanVerifier {
- /// Verify the invariants of the H-CFG starting from \p TopRegion. The
- /// verification process comprises the following steps:
- /// 1. Region/Block verification: Check the Region/Block verification
- /// invariants for every region in the H-CFG.
- void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
-};
-} // namespace llvm
-
-#endif //LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+//===-- VPlanVerifier.h -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file declares the class VPlanVerifier, which contains utility functions
+/// to check the consistency of a VPlan. This includes the following kinds of
+/// invariants:
+///
+/// 1. Region/Block invariants:
+/// - Region's entry/exit block must have no predecessors/successors,
+/// respectively.
+/// - Block's parent must be the region immediately containing the block.
+/// - Linked blocks must have a bi-directional link (successor/predecessor).
+/// - All predecessors/successors of a block must belong to the same region.
+/// - Blocks must have no duplicated successor/predecessor.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
+
+namespace llvm {
+class VPRegionBlock;
+
+/// Struct with utility functions that can be used to check the consistency and
+/// invariants of a VPlan, including the components of its H-CFG.
+struct VPlanVerifier {
+ /// Verify the invariants of the H-CFG starting from \p TopRegion. The
+ /// verification process comprises the following steps:
+ /// 1. Region/Block verification: Check the Region/Block verification
+ /// invariants for every region in the H-CFG.
+ void verifyHierarchicalCFG(const VPRegionBlock *TopRegion) const;
+};
+} // namespace llvm
+
+#endif //LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
index 815b5eadbd..787f146bdd 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1,96 +1,96 @@
-//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This pass optimizes scalar/vector interactions using target cost models. The
-// transforms implemented here may not fit in traditional loop-based or SLP
-// vectorization passes.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize/VectorCombine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/GlobalsModRef.h"
+//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes scalar/vector interactions using target cost models. The
+// transforms implemented here may not fit in traditional loop-based or SLP
+// vectorization passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/VectorUtils.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/PatternMatch.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-#define DEBUG_TYPE "vector-combine"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "vector-combine"
STATISTIC(NumVecLoad, "Number of vector loads formed");
-STATISTIC(NumVecCmp, "Number of vector compares formed");
-STATISTIC(NumVecBO, "Number of vector binops formed");
-STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
-STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
-STATISTIC(NumScalarBO, "Number of scalar binops formed");
-STATISTIC(NumScalarCmp, "Number of scalar compares formed");
-
-static cl::opt<bool> DisableVectorCombine(
- "disable-vector-combine", cl::init(false), cl::Hidden,
- cl::desc("Disable all vector combine transforms"));
-
-static cl::opt<bool> DisableBinopExtractShuffle(
- "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
- cl::desc("Disable binop extract to shuffle transforms"));
-
-static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
-
-namespace {
-class VectorCombine {
-public:
- VectorCombine(Function &F, const TargetTransformInfo &TTI,
- const DominatorTree &DT)
- : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
-
- bool run();
-
-private:
- Function &F;
- IRBuilder<> Builder;
- const TargetTransformInfo &TTI;
- const DominatorTree &DT;
-
+STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
+STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
+STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+
+static cl::opt<bool> DisableVectorCombine(
+ "disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
+
+static cl::opt<bool> DisableBinopExtractShuffle(
+ "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
+ cl::desc("Disable binop extract to shuffle transforms"));
+
+static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
+
+namespace {
+class VectorCombine {
+public:
+ VectorCombine(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+
+ bool run();
+
+private:
+ Function &F;
+ IRBuilder<> Builder;
+ const TargetTransformInfo &TTI;
+ const DominatorTree &DT;
+
bool vectorizeLoadInsert(Instruction &I);
- ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1,
- unsigned PreferredExtractIndex) const;
- bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- unsigned Opcode,
- ExtractElementInst *&ConvertToShuffle,
- unsigned PreferredExtractIndex);
- void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
- void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- Instruction &I);
- bool foldExtractExtract(Instruction &I);
- bool foldBitcastShuf(Instruction &I);
- bool scalarizeBinopOrCmp(Instruction &I);
- bool foldExtractedCmps(Instruction &I);
-};
-} // namespace
-
-static void replaceValue(Value &Old, Value &New) {
- Old.replaceAllUsesWith(&New);
- New.takeName(&Old);
-}
-
+ ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex) const;
+ bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex);
+ void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ bool foldExtractExtract(Instruction &I);
+ bool foldBitcastShuf(Instruction &I);
+ bool scalarizeBinopOrCmp(Instruction &I);
+ bool foldExtractedCmps(Instruction &I);
+};
+} // namespace
+
+static void replaceValue(Value &Old, Value &New) {
+ Old.replaceAllUsesWith(&New);
+ New.takeName(&Old);
+}
+
bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
@@ -223,628 +223,628 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
return true;
}
-/// Determine which, if any, of the inputs should be replaced by a shuffle
-/// followed by extract from a different index.
-ExtractElementInst *VectorCombine::getShuffleExtract(
- ExtractElementInst *Ext0, ExtractElementInst *Ext1,
- unsigned PreferredExtractIndex = InvalidIndex) const {
- assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
- isa<ConstantInt>(Ext1->getIndexOperand()) &&
- "Expected constant extract indexes");
-
- unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
- unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
-
- // If the extract indexes are identical, no shuffle is needed.
- if (Index0 == Index1)
- return nullptr;
-
- Type *VecTy = Ext0->getVectorOperand()->getType();
- assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
+/// Determine which, if any, of the inputs should be replaced by a shuffle
+/// followed by extract from a different index.
+ExtractElementInst *VectorCombine::getShuffleExtract(
+ ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex = InvalidIndex) const {
+ assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
+ isa<ConstantInt>(Ext1->getIndexOperand()) &&
+ "Expected constant extract indexes");
+
+ unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
+ unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+
+ // If the extract indexes are identical, no shuffle is needed.
+ if (Index0 == Index1)
+ return nullptr;
+
+ Type *VecTy = Ext0->getVectorOperand()->getType();
+ assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
InstructionCost Cost0 =
TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
InstructionCost Cost1 =
TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
-
+
// If both costs are invalid no shuffle is needed
if (!Cost0.isValid() && !Cost1.isValid())
return nullptr;
- // We are extracting from 2 different indexes, so one operand must be shuffled
- // before performing a vector operation and/or extract. The more expensive
- // extract will be replaced by a shuffle.
- if (Cost0 > Cost1)
- return Ext0;
- if (Cost1 > Cost0)
- return Ext1;
-
- // If the costs are equal and there is a preferred extract index, shuffle the
- // opposite operand.
- if (PreferredExtractIndex == Index0)
- return Ext1;
- if (PreferredExtractIndex == Index1)
- return Ext0;
-
- // Otherwise, replace the extract with the higher index.
- return Index0 > Index1 ? Ext0 : Ext1;
-}
-
-/// Compare the relative costs of 2 extracts followed by scalar operation vs.
-/// vector operation(s) followed by extract. Return true if the existing
-/// instructions are cheaper than a vector alternative. Otherwise, return false
-/// and if one of the extracts should be transformed to a shufflevector, set
-/// \p ConvertToShuffle to that extract instruction.
-bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1,
- unsigned Opcode,
- ExtractElementInst *&ConvertToShuffle,
- unsigned PreferredExtractIndex) {
- assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
- isa<ConstantInt>(Ext1->getOperand(1)) &&
- "Expected constant extract indexes");
- Type *ScalarTy = Ext0->getType();
- auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
+ // We are extracting from 2 different indexes, so one operand must be shuffled
+ // before performing a vector operation and/or extract. The more expensive
+ // extract will be replaced by a shuffle.
+ if (Cost0 > Cost1)
+ return Ext0;
+ if (Cost1 > Cost0)
+ return Ext1;
+
+ // If the costs are equal and there is a preferred extract index, shuffle the
+ // opposite operand.
+ if (PreferredExtractIndex == Index0)
+ return Ext1;
+ if (PreferredExtractIndex == Index1)
+ return Ext0;
+
+ // Otherwise, replace the extract with the higher index.
+ return Index0 > Index1 ? Ext0 : Ext1;
+}
+
+/// Compare the relative costs of 2 extracts followed by scalar operation vs.
+/// vector operation(s) followed by extract. Return true if the existing
+/// instructions are cheaper than a vector alternative. Otherwise, return false
+/// and if one of the extracts should be transformed to a shufflevector, set
+/// \p ConvertToShuffle to that extract instruction.
+bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex) {
+ assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
+ isa<ConstantInt>(Ext1->getOperand(1)) &&
+ "Expected constant extract indexes");
+ Type *ScalarTy = Ext0->getType();
+ auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
InstructionCost ScalarOpCost, VectorOpCost;
-
- // Get cost estimates for scalar and vector versions of the operation.
- bool IsBinOp = Instruction::isBinaryOp(Opcode);
- if (IsBinOp) {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
- } else {
- assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
- "Expected a compare");
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
- CmpInst::makeCmpResultType(ScalarTy));
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
- CmpInst::makeCmpResultType(VecTy));
- }
-
- // Get cost estimates for the extract elements. These costs will factor into
- // both sequences.
- unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
- unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
-
+
+ // Get cost estimates for scalar and vector versions of the operation.
+ bool IsBinOp = Instruction::isBinaryOp(Opcode);
+ if (IsBinOp) {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ } else {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Expected a compare");
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
+ CmpInst::makeCmpResultType(VecTy));
+ }
+
+ // Get cost estimates for the extract elements. These costs will factor into
+ // both sequences.
+ unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
+ unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+
InstructionCost Extract0Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
InstructionCost Extract1Cost =
- TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
-
- // A more expensive extract will always be replaced by a splat shuffle.
- // For example, if Ext0 is more expensive:
- // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
- // extelt (opcode (splat V0, Ext0), V1), Ext1
- // TODO: Evaluate whether that always results in lowest cost. Alternatively,
- // check the cost of creating a broadcast shuffle and shuffling both
- // operands to element 0.
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+
+ // A more expensive extract will always be replaced by a splat shuffle.
+ // For example, if Ext0 is more expensive:
+ // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
+ // extelt (opcode (splat V0, Ext0), V1), Ext1
+ // TODO: Evaluate whether that always results in lowest cost. Alternatively,
+ // check the cost of creating a broadcast shuffle and shuffling both
+ // operands to element 0.
InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
-
- // Extra uses of the extracts mean that we include those costs in the
- // vector total because those instructions will not be eliminated.
+
+ // Extra uses of the extracts mean that we include those costs in the
+ // vector total because those instructions will not be eliminated.
InstructionCost OldCost, NewCost;
- if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
- // Handle a special case. If the 2 extracts are identical, adjust the
- // formulas to account for that. The extra use charge allows for either the
- // CSE'd pattern or an unoptimized form with identical values:
- // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
- bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
- : !Ext0->hasOneUse() || !Ext1->hasOneUse();
- OldCost = CheapExtractCost + ScalarOpCost;
- NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
- } else {
- // Handle the general case. Each extract is actually a different value:
- // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
- OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
- NewCost = VectorOpCost + CheapExtractCost +
- !Ext0->hasOneUse() * Extract0Cost +
- !Ext1->hasOneUse() * Extract1Cost;
- }
-
- ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
- if (ConvertToShuffle) {
- if (IsBinOp && DisableBinopExtractShuffle)
- return true;
-
- // If we are extracting from 2 different indexes, then one operand must be
- // shuffled before performing the vector operation. The shuffle mask is
- // undefined except for 1 lane that is being translated to the remaining
- // extraction lane. Therefore, it is a splat shuffle. Ex:
- // ShufMask = { undef, undef, 0, undef }
- // TODO: The cost model has an option for a "broadcast" shuffle
- // (splat-from-element-0), but no option for a more general splat.
- NewCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
-
- // Aggressively form a vector op if the cost is equal because the transform
- // may enable further optimization.
- // Codegen can reverse this transform (scalarize) if it was not profitable.
- return OldCost < NewCost;
-}
-
-/// Create a shuffle that translates (shifts) 1 element from the input vector
-/// to a new element location.
-static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
- unsigned NewIndex, IRBuilder<> &Builder) {
- // The shuffle mask is undefined except for 1 lane that is being translated
- // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
- // ShufMask = { 2, undef, undef, undef }
- auto *VecTy = cast<FixedVectorType>(Vec->getType());
- SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
- ShufMask[NewIndex] = OldIndex;
+ if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+ // Handle a special case. If the 2 extracts are identical, adjust the
+ // formulas to account for that. The extra use charge allows for either the
+ // CSE'd pattern or an unoptimized form with identical values:
+ // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
+ bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+ : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+ OldCost = CheapExtractCost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
+ } else {
+ // Handle the general case. Each extract is actually a different value:
+ // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
+ OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost +
+ !Ext0->hasOneUse() * Extract0Cost +
+ !Ext1->hasOneUse() * Extract1Cost;
+ }
+
+ ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
+ if (ConvertToShuffle) {
+ if (IsBinOp && DisableBinopExtractShuffle)
+ return true;
+
+ // If we are extracting from 2 different indexes, then one operand must be
+ // shuffled before performing the vector operation. The shuffle mask is
+ // undefined except for 1 lane that is being translated to the remaining
+ // extraction lane. Therefore, it is a splat shuffle. Ex:
+ // ShufMask = { undef, undef, 0, undef }
+ // TODO: The cost model has an option for a "broadcast" shuffle
+ // (splat-from-element-0), but no option for a more general splat.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+
+ // Aggressively form a vector op if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ return OldCost < NewCost;
+}
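
isExtractExtractCheap, restored above, boils down to comparing the cost of two extracts plus one scalar op against one vector op plus the cheaper extract, with extracts that have extra uses also charged to the new sequence. A simplified arithmetic sketch of that decision follows; the integer costs are invented stand-ins for the TargetTransformInfo queries, not real cost-model values.

// Simplified version of the cost comparison in isExtractExtractCheap.
#include <algorithm>
#include <cstdio>

struct Costs {
  int ScalarOp, VectorOp; // scalar vs. vector flavor of the same opcode
  int Extract0, Extract1; // cost of each extractelement
};

// Returns true if keeping the scalar form (two extracts + scalar op) is
// cheaper, i.e. the transform to "vector op + extract" should be skipped.
static bool extractExtractIsCheap(const Costs &C, bool Ext0HasOneUse,
                                  bool Ext1HasOneUse) {
  int CheapExtract = std::min(C.Extract0, C.Extract1);
  int OldCost = C.Extract0 + C.Extract1 + C.ScalarOp;
  // Extracts with extra uses survive the transform, so their cost is also
  // charged to the new sequence.
  int NewCost = C.VectorOp + CheapExtract +
                (Ext0HasOneUse ? 0 : C.Extract0) +
                (Ext1HasOneUse ? 0 : C.Extract1);
  // Ties go to the vector form: it may enable further optimization, and
  // codegen can scalarize it back if it turns out unprofitable.
  return OldCost < NewCost;
}

int main() {
  Costs C{/*ScalarOp=*/1, /*VectorOp=*/1, /*Extract0=*/2, /*Extract1=*/2};
  std::printf("%d\n", extractExtractIsCheap(C, true, true)); // 0: vectorize
  return 0;
}
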
+
+/// Create a shuffle that translates (shifts) 1 element from the input vector
+/// to a new element location.
+static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
+ unsigned NewIndex, IRBuilder<> &Builder) {
+ // The shuffle mask is undefined except for 1 lane that is being translated
+ // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
+ // ShufMask = { 2, undef, undef, undef }
+ auto *VecTy = cast<FixedVectorType>(Vec->getType());
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ ShufMask[NewIndex] = OldIndex;
return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
-}
-
-/// Given an extract element instruction with constant index operand, shuffle
-/// the source vector (shift the scalar element) to a NewIndex for extraction.
-/// Return null if the input can be constant folded, so that we are not creating
-/// unnecessary instructions.
-static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
- unsigned NewIndex,
- IRBuilder<> &Builder) {
- // If the extract can be constant-folded, this code is unsimplified. Defer
- // to other passes to handle that.
- Value *X = ExtElt->getVectorOperand();
- Value *C = ExtElt->getIndexOperand();
- assert(isa<ConstantInt>(C) && "Expected a constant index operand");
- if (isa<Constant>(X))
- return nullptr;
-
- Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
- NewIndex, Builder);
- return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
-}
-
-/// Try to reduce extract element costs by converting scalar compares to vector
-/// compares followed by extract.
-/// cmp (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
- assert(isa<CmpInst>(&I) && "Expected a compare");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
-
- // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
- ++NumVecCmp;
- CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
- Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
- Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
-}
-
-/// Try to reduce extract element costs by converting scalar binops to vector
-/// binops followed by extract.
-/// bo (ext0 V0, C), (ext1 V1, C)
-void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
- ExtractElementInst *Ext1, Instruction &I) {
- assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
- assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
- cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
- "Expected matching constant extract indexes");
-
- // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
- ++NumVecBO;
- Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
- Value *VecBO =
- Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
-
- // All IR flags are safe to back-propagate because any potential poison
- // created in unused vector elements is discarded by the extract.
- if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
- VecBOInst->copyIRFlags(&I);
-
- Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
- replaceValue(I, *NewExt);
-}
-
-/// Match an instruction with extracted vector operands.
-bool VectorCombine::foldExtractExtract(Instruction &I) {
- // It is not safe to transform things like div, urem, etc. because we may
- // create undefined behavior when executing those on unknown vector elements.
- if (!isSafeToSpeculativelyExecute(&I))
- return false;
-
- Instruction *I0, *I1;
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
- if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
- !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
- return false;
-
- Value *V0, *V1;
- uint64_t C0, C1;
- if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
- !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
- V0->getType() != V1->getType())
- return false;
-
- // If the scalar value 'I' is going to be re-inserted into a vector, then try
- // to create an extract to that same element. The extract/insert can be
- // reduced to a "select shuffle".
- // TODO: If we add a larger pattern match that starts from an insert, this
- // probably becomes unnecessary.
- auto *Ext0 = cast<ExtractElementInst>(I0);
- auto *Ext1 = cast<ExtractElementInst>(I1);
- uint64_t InsertIndex = InvalidIndex;
- if (I.hasOneUse())
- match(I.user_back(),
- m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
-
- ExtractElementInst *ExtractToChange;
- if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
- InsertIndex))
- return false;
-
- if (ExtractToChange) {
- unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
- ExtractElementInst *NewExtract =
- translateExtract(ExtractToChange, CheapExtractIdx, Builder);
- if (!NewExtract)
- return false;
- if (ExtractToChange == Ext0)
- Ext0 = NewExtract;
- else
- Ext1 = NewExtract;
- }
-
- if (Pred != CmpInst::BAD_ICMP_PREDICATE)
- foldExtExtCmp(Ext0, Ext1, I);
- else
- foldExtExtBinop(Ext0, Ext1, I);
-
- return true;
-}
-
-/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
-/// destination type followed by shuffle. This can enable further transforms by
-/// moving bitcasts or shuffles together.
-bool VectorCombine::foldBitcastShuf(Instruction &I) {
- Value *V;
- ArrayRef<int> Mask;
- if (!match(&I, m_BitCast(
- m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
- return false;
-
+}
+
+/// Given an extract element instruction with constant index operand, shuffle
+/// the source vector (shift the scalar element) to a NewIndex for extraction.
+/// Return null if the input can be constant folded, so that we are not creating
+/// unnecessary instructions.
+static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
+ unsigned NewIndex,
+ IRBuilder<> &Builder) {
+ // If the extract can be constant-folded, this code is unsimplified. Defer
+ // to other passes to handle that.
+ Value *X = ExtElt->getVectorOperand();
+ Value *C = ExtElt->getIndexOperand();
+ assert(isa<ConstantInt>(C) && "Expected a constant index operand");
+ if (isa<Constant>(X))
+ return nullptr;
+
+ Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
+ NewIndex, Builder);
+ return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
+}
+
+/// Try to reduce extract element costs by converting scalar compares to vector
+/// compares followed by extract.
+/// cmp (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<CmpInst>(&I) && "Expected a compare");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
+ ++NumVecCmp;
+ CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
+ Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+/// bo (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
+ ++NumVecBO;
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecBO =
+ Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+
+ // All IR flags are safe to back-propagate because any potential poison
+ // created in unused vector elements is discarded by the extract.
+ if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
+ VecBOInst->copyIRFlags(&I);
+
+ Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Match an instruction with extracted vector operands.
+bool VectorCombine::foldExtractExtract(Instruction &I) {
+ // It is not safe to transform things like div, urem, etc. because we may
+ // create undefined behavior when executing those on unknown vector elements.
+ if (!isSafeToSpeculativelyExecute(&I))
+ return false;
+
+ Instruction *I0, *I1;
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
+ !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
+ return false;
+
+ Value *V0, *V1;
+ uint64_t C0, C1;
+ if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
+ !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
+ V0->getType() != V1->getType())
+ return false;
+
+ // If the scalar value 'I' is going to be re-inserted into a vector, then try
+ // to create an extract to that same element. The extract/insert can be
+ // reduced to a "select shuffle".
+ // TODO: If we add a larger pattern match that starts from an insert, this
+ // probably becomes unnecessary.
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ uint64_t InsertIndex = InvalidIndex;
+ if (I.hasOneUse())
+ match(I.user_back(),
+ m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
+
+ ExtractElementInst *ExtractToChange;
+ if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
+ InsertIndex))
+ return false;
+
+ if (ExtractToChange) {
+ unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
+ ExtractElementInst *NewExtract =
+ translateExtract(ExtractToChange, CheapExtractIdx, Builder);
+ if (!NewExtract)
+ return false;
+ if (ExtractToChange == Ext0)
+ Ext0 = NewExtract;
+ else
+ Ext1 = NewExtract;
+ }
+
+ if (Pred != CmpInst::BAD_ICMP_PREDICATE)
+ foldExtExtCmp(Ext0, Ext1, I);
+ else
+ foldExtExtBinop(Ext0, Ext1, I);
+
+ return true;
+}
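
The fold above rewrites "opcode (extelt V0, C), (extelt V1, C)" into "extelt (opcode V0, V1), C". It is sound because extracting a lane and then applying the scalar operation equals applying the operation lane-wise and extracting the same lane; a small demonstration of that identity on plain arrays (illustrative only) follows.

// Demonstrates, on plain arrays, the identity exploited by foldExtractExtract:
//   add (extract V0, C), (extract V1, C) == extract (add V0, V1), C
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> V0{1, 2, 3, 4}, V1{10, 20, 30, 40};
  unsigned C = 2; // constant extract index

  // Scalar form: extract both lanes, then operate.
  int Scalar = V0[C] + V1[C];

  // Vector form: operate lane-wise, then extract the same lane.
  std::array<int, 4> Sum{};
  for (unsigned I = 0; I != 4; ++I)
    Sum[I] = V0[I] + V1[I];
  int FromVector = Sum[C];

  assert(Scalar == FromVector); // 33 either way
  return 0;
}
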
+
+/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
+/// destination type followed by shuffle. This can enable further transforms by
+/// moving bitcasts or shuffles together.
+bool VectorCombine::foldBitcastShuf(Instruction &I) {
+ Value *V;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_BitCast(
+ m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
+ return false;
+
// 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
// scalable type is unknown; Second, we cannot reason if the narrowed shuffle
// mask for scalable type is a splat or not.
// 2) Disallow non-vector casts and length-changing shuffles.
- // TODO: We could allow any shuffle.
+ // TODO: We could allow any shuffle.
auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
auto *SrcTy = dyn_cast<FixedVectorType>(V->getType());
if (!SrcTy || !DestTy || I.getOperand(0)->getType() != SrcTy)
- return false;
-
- // The new shuffle must not cost more than the old shuffle. The bitcast is
- // moved ahead of the shuffle, so assume that it has the same cost as before.
+ return false;
+
+ // The new shuffle must not cost more than the old shuffle. The bitcast is
+ // moved ahead of the shuffle, so assume that it has the same cost as before.
InstructionCost DestCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy);
InstructionCost SrcCost =
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy);
if (DestCost > SrcCost || !DestCost.isValid())
- return false;
-
- unsigned DestNumElts = DestTy->getNumElements();
- unsigned SrcNumElts = SrcTy->getNumElements();
- SmallVector<int, 16> NewMask;
- if (SrcNumElts <= DestNumElts) {
- // The bitcast is from wide to narrow/equal elements. The shuffle mask can
- // always be expanded to the equivalent form choosing narrower elements.
- assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = DestNumElts / SrcNumElts;
- narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
- } else {
- // The bitcast is from narrow elements to wide elements. The shuffle mask
- // must choose consecutive elements to allow casting first.
- assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
- unsigned ScaleFactor = SrcNumElts / DestNumElts;
- if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
- return false;
- }
- // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
- ++NumShufOfBitcast;
- Value *CastV = Builder.CreateBitCast(V, DestTy);
+ return false;
+
+ unsigned DestNumElts = DestTy->getNumElements();
+ unsigned SrcNumElts = SrcTy->getNumElements();
+ SmallVector<int, 16> NewMask;
+ if (SrcNumElts <= DestNumElts) {
+ // The bitcast is from wide to narrow/equal elements. The shuffle mask can
+ // always be expanded to the equivalent form choosing narrower elements.
+ assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = DestNumElts / SrcNumElts;
+ narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
+ } else {
+ // The bitcast is from narrow elements to wide elements. The shuffle mask
+ // must choose consecutive elements to allow casting first.
+ assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = SrcNumElts / DestNumElts;
+ if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
+ return false;
+ }
+ // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+ ++NumShufOfBitcast;
+ Value *CastV = Builder.CreateBitCast(V, DestTy);
Value *Shuf = Builder.CreateShuffleVector(CastV, NewMask);
- replaceValue(I, *Shuf);
- return true;
-}
-
-/// Match a vector binop or compare instruction with at least one inserted
-/// scalar operand and convert to scalar binop/cmp followed by insertelement.
-bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
- CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
- Value *Ins0, *Ins1;
- if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
- !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
- return false;
-
- // Do not convert the vector condition of a vector select into a scalar
- // condition. That may cause problems for codegen because of differences in
- // boolean formats and register-file transfers.
- // TODO: Can we account for that in the cost model?
- bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
- if (IsCmp)
- for (User *U : I.users())
- if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
- return false;
-
- // Match against one or both scalar values being inserted into constant
- // vectors:
- // vec_op VecC0, (inselt VecC1, V1, Index)
- // vec_op (inselt VecC0, V0, Index), VecC1
- // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
- // TODO: Deal with mismatched index constants and variable indexes?
- Constant *VecC0 = nullptr, *VecC1 = nullptr;
- Value *V0 = nullptr, *V1 = nullptr;
- uint64_t Index0 = 0, Index1 = 0;
- if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
- m_ConstantInt(Index0))) &&
- !match(Ins0, m_Constant(VecC0)))
- return false;
- if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
- m_ConstantInt(Index1))) &&
- !match(Ins1, m_Constant(VecC1)))
- return false;
-
- bool IsConst0 = !V0;
- bool IsConst1 = !V1;
- if (IsConst0 && IsConst1)
- return false;
- if (!IsConst0 && !IsConst1 && Index0 != Index1)
- return false;
-
- // Bail for single insertion if it is a load.
- // TODO: Handle this once getVectorInstrCost can cost for load/stores.
- auto *I0 = dyn_cast_or_null<Instruction>(V0);
- auto *I1 = dyn_cast_or_null<Instruction>(V1);
- if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
- (IsConst1 && I0 && I0->mayReadFromMemory()))
- return false;
-
- uint64_t Index = IsConst0 ? Index1 : Index0;
- Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
- Type *VecTy = I.getType();
- assert(VecTy->isVectorTy() &&
- (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
- (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
- ScalarTy->isPointerTy()) &&
- "Unexpected types for insert element into binop or cmp");
-
- unsigned Opcode = I.getOpcode();
+ replaceValue(I, *Shuf);
+ return true;
+}
+
+/// Match a vector binop or compare instruction with at least one inserted
+/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ Value *Ins0, *Ins1;
+ if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
+ return false;
+
+ // Do not convert the vector condition of a vector select into a scalar
+ // condition. That may cause problems for codegen because of differences in
+ // boolean formats and register-file transfers.
+ // TODO: Can we account for that in the cost model?
+ bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
+ if (IsCmp)
+ for (User *U : I.users())
+ if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
+ return false;
+
+ // Match against one or both scalar values being inserted into constant
+ // vectors:
+ // vec_op VecC0, (inselt VecC1, V1, Index)
+ // vec_op (inselt VecC0, V0, Index), VecC1
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
+ // TODO: Deal with mismatched index constants and variable indexes?
+ Constant *VecC0 = nullptr, *VecC1 = nullptr;
+ Value *V0 = nullptr, *V1 = nullptr;
+ uint64_t Index0 = 0, Index1 = 0;
+ if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index0))) &&
+ !match(Ins0, m_Constant(VecC0)))
+ return false;
+ if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+ m_ConstantInt(Index1))) &&
+ !match(Ins1, m_Constant(VecC1)))
+ return false;
+
+ bool IsConst0 = !V0;
+ bool IsConst1 = !V1;
+ if (IsConst0 && IsConst1)
+ return false;
+ if (!IsConst0 && !IsConst1 && Index0 != Index1)
+ return false;
+
+ // Bail for single insertion if it is a load.
+ // TODO: Handle this once getVectorInstrCost can cost for load/stores.
+ auto *I0 = dyn_cast_or_null<Instruction>(V0);
+ auto *I1 = dyn_cast_or_null<Instruction>(V1);
+ if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
+ (IsConst1 && I0 && I0->mayReadFromMemory()))
+ return false;
+
+ uint64_t Index = IsConst0 ? Index1 : Index0;
+ Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
+ Type *VecTy = I.getType();
+ assert(VecTy->isVectorTy() &&
+ (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
+ (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
+ ScalarTy->isPointerTy()) &&
+ "Unexpected types for insert element into binop or cmp");
+
+ unsigned Opcode = I.getOpcode();
InstructionCost ScalarOpCost, VectorOpCost;
- if (IsCmp) {
- ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
- } else {
- ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
- VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
- }
-
- // Get cost estimate for the insert element. This cost will factor into
- // both sequences.
+ if (IsCmp) {
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+ } else {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ }
+
+ // Get cost estimate for the insert element. This cost will factor into
+ // both sequences.
InstructionCost InsertCost =
- TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
InstructionCost OldCost =
(IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) + VectorOpCost;
InstructionCost NewCost = ScalarOpCost + InsertCost +
(IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
(IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
-
- // We want to scalarize unless the vector variant actually has lower cost.
+
+ // We want to scalarize unless the vector variant actually has lower cost.
if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
- // inselt NewVecC, (scalar_op V0, V1), Index
- if (IsCmp)
- ++NumScalarCmp;
- else
- ++NumScalarBO;
-
- // For constant cases, extract the scalar element, this should constant fold.
- if (IsConst0)
- V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
- if (IsConst1)
- V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
-
- Value *Scalar =
- IsCmp ? Builder.CreateCmp(Pred, V0, V1)
- : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
-
- Scalar->setName(I.getName() + ".scalar");
-
- // All IR flags are safe to back-propagate. There is no potential for extra
- // poison to be created by the scalar instruction.
- if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
- ScalarInst->copyIRFlags(&I);
-
- // Fold the vector constants in the original vectors into a new base vector.
- Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
- : ConstantExpr::get(Opcode, VecC0, VecC1);
- Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
- replaceValue(I, *Insert);
- return true;
-}
-
-/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
-/// a vector into vector operations followed by extract. Note: The SLP pass
-/// may miss this pattern because of implementation problems.
-bool VectorCombine::foldExtractedCmps(Instruction &I) {
- // We are looking for a scalar binop of booleans.
- // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
- if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
- return false;
-
- // The compare predicates should match, and each compare should have a
- // constant operand.
- // TODO: Relax the one-use constraints.
- Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
- Instruction *I0, *I1;
- Constant *C0, *C1;
- CmpInst::Predicate P0, P1;
- if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
- !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
- P0 != P1)
- return false;
-
- // The compare operands must be extracts of the same vector with constant
- // extract indexes.
- // TODO: Relax the one-use constraints.
- Value *X;
- uint64_t Index0, Index1;
- if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
- !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
- return false;
-
- auto *Ext0 = cast<ExtractElementInst>(I0);
- auto *Ext1 = cast<ExtractElementInst>(I1);
- ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
- if (!ConvertToShuf)
- return false;
-
- // The original scalar pattern is:
- // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
- CmpInst::Predicate Pred = P0;
- unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
- : Instruction::ICmp;
- auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
- if (!VecTy)
- return false;
-
+ return false;
+
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+ // inselt NewVecC, (scalar_op V0, V1), Index
+ if (IsCmp)
+ ++NumScalarCmp;
+ else
+ ++NumScalarBO;
+
+ // For constant cases, extract the scalar element, this should constant fold.
+ if (IsConst0)
+ V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
+ if (IsConst1)
+ V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+
+ Value *Scalar =
+ IsCmp ? Builder.CreateCmp(Pred, V0, V1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+
+ Scalar->setName(I.getName() + ".scalar");
+
+ // All IR flags are safe to back-propagate. There is no potential for extra
+ // poison to be created by the scalar instruction.
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+ ScalarInst->copyIRFlags(&I);
+
+ // Fold the vector constants in the original vectors into a new base vector.
+ Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
+ : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+ replaceValue(I, *Insert);
+ return true;
+}
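
scalarizeBinopOrCmp, restored above, replaces a vector binop/cmp whose operands are scalars inserted into constant vectors with the scalar operation plus a single insertelement into the constant-folded base vector. The equivalence it relies on, demonstrated on plain arrays (illustrative only, not the pass's code), is sketched below.

// Demonstrates the rewrite done by scalarizeBinopOrCmp on plain arrays:
//   add (insert VecC0, V0, Index), VecC1
//     == insert (add VecC0, VecC1), (V0 + VecC1[Index]), Index
#include <array>
#include <cassert>

int main() {
  std::array<int, 4> VecC0{1, 2, 3, 4}, VecC1{10, 20, 30, 40};
  int V0 = 100;       // the inserted scalar
  unsigned Index = 1; // constant insert index

  // Original form: materialize the insert, then do the vector add.
  std::array<int, 4> Ins0 = VecC0;
  Ins0[Index] = V0;
  std::array<int, 4> Old{};
  for (unsigned I = 0; I != 4; ++I)
    Old[I] = Ins0[I] + VecC1[I];

  // Scalarized form: constant-fold the vector add of VecC0 and VecC1, then
  // insert the single recomputed lane V0 + VecC1[Index].
  std::array<int, 4> New{};
  for (unsigned I = 0; I != 4; ++I)
    New[I] = VecC0[I] + VecC1[I];
  New[Index] = V0 + VecC1[Index];

  assert(Old == New);
  return 0;
}
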
+
+/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
+/// a vector into vector operations followed by extract. Note: The SLP pass
+/// may miss this pattern because of implementation problems.
+bool VectorCombine::foldExtractedCmps(Instruction &I) {
+ // We are looking for a scalar binop of booleans.
+ // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
+ if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+ return false;
+
+ // The compare predicates should match, and each compare should have a
+ // constant operand.
+ // TODO: Relax the one-use constraints.
+ Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
+ Instruction *I0, *I1;
+ Constant *C0, *C1;
+ CmpInst::Predicate P0, P1;
+ if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
+ !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
+ P0 != P1)
+ return false;
+
+ // The compare operands must be extracts of the same vector with constant
+ // extract indexes.
+ // TODO: Relax the one-use constraints.
+ Value *X;
+ uint64_t Index0, Index1;
+ if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
+ !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+ return false;
+
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ if (!ConvertToShuf)
+ return false;
+
+ // The original scalar pattern is:
+ // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
+ CmpInst::Predicate Pred = P0;
+ unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
+ : Instruction::ICmp;
+ auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!VecTy)
+ return false;
+
InstructionCost OldCost =
TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
- OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
- OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
- OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
-
- // The proposed vector pattern is:
- // vcmp = cmp Pred X, VecC
- // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
- int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
- int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
- auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+ OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+ OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+
+ // The proposed vector pattern is:
+ // vcmp = cmp Pred X, VecC
+ // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
+ int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
+ int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
+ auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
InstructionCost NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
- NewCost +=
- TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
- NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
- NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
-
- // Aggressively form vector ops if the cost is equal because the transform
- // may enable further optimization.
- // Codegen can reverse this transform (scalarize) if it was not profitable.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+
+ // Aggressively form vector ops if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
if (OldCost < NewCost || !NewCost.isValid())
- return false;
-
- // Create a vector constant from the 2 scalar constants.
- SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
- UndefValue::get(VecTy->getElementType()));
- CmpC[Index0] = C0;
- CmpC[Index1] = C1;
- Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
-
- Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
- Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
- VCmp, Shuf);
- Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
- replaceValue(I, *NewExt);
- ++NumVecCmpBO;
- return true;
-}
-
-/// This is the entry point for all transforms. Pass manager differences are
-/// handled in the callers of this function.
-bool VectorCombine::run() {
- if (DisableVectorCombine)
- return false;
-
+ return false;
+
+ // Create a vector constant from the 2 scalar constants.
+ SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
+ UndefValue::get(VecTy->getElementType()));
+ CmpC[Index0] = C0;
+ CmpC[Index1] = C1;
+ Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
+
+ Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
+ Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+ VCmp, Shuf);
+ Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
+ replaceValue(I, *NewExt);
+ ++NumVecCmpBO;
+ return true;
+}
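
An illustrative sketch of this fold, again as comments; the vector type, predicate, indices, and constants are hypothetical, and which extract index is kept depends on the getShuffleExtract cost query:

    // Before: a scalar 'and' of two compares of extracted lanes of %x.
    //   %e0 = extractelement <4 x i32> %x, i32 0
    //   %e3 = extractelement <4 x i32> %x, i32 3
    //   %c0 = icmp sgt i32 %e0, 42
    //   %c3 = icmp sgt i32 %e3, 7
    //   %r  = and i1 %c0, %c3
    // After (roughly): one vector compare against a constant built from C0/C1,
    // a shuffle that shifts the expensive lane onto the cheap one, a <4 x i1>
    // 'and', and a single extract of the cheap lane.
    //   %vcmp = icmp sgt <4 x i32> %x, <i32 42, i32 undef, i32 undef, i32 7>
    //   %shuf = shufflevector <4 x i1> %vcmp, <4 x i1> undef, <4 x i32> <i32 3, i32 undef, i32 undef, i32 undef>
    //   %vand = and <4 x i1> %vcmp, %shuf
    //   %r    = extractelement <4 x i1> %vand, i32 0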
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+bool VectorCombine::run() {
+ if (DisableVectorCombine)
+ return false;
+
// Don't attempt vectorization if the target does not support vectors.
if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
return false;
- bool MadeChange = false;
- for (BasicBlock &BB : F) {
- // Ignore unreachable basic blocks.
- if (!DT.isReachableFromEntry(&BB))
- continue;
- // Do not delete instructions under here and invalidate the iterator.
- // Walk the block forwards to enable simple iterative chains of transforms.
- // TODO: It could be more efficient to remove dead instructions
- // iteratively in this loop rather than waiting until the end.
- for (Instruction &I : BB) {
- if (isa<DbgInfoIntrinsic>(I))
- continue;
- Builder.SetInsertPoint(&I);
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block forwards to enable simple iterative chains of transforms.
+ // TODO: It could be more efficient to remove dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ Builder.SetInsertPoint(&I);
MadeChange |= vectorizeLoadInsert(I);
- MadeChange |= foldExtractExtract(I);
- MadeChange |= foldBitcastShuf(I);
- MadeChange |= scalarizeBinopOrCmp(I);
- MadeChange |= foldExtractedCmps(I);
- }
- }
-
- // We're done with transforms, so remove dead instructions.
- if (MadeChange)
- for (BasicBlock &BB : F)
- SimplifyInstructionsInBlock(&BB);
-
- return MadeChange;
-}
-
-// Pass manager boilerplate below here.
-
-namespace {
-class VectorCombineLegacyPass : public FunctionPass {
-public:
- static char ID;
- VectorCombineLegacyPass() : FunctionPass(ID) {
- initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
- AU.addPreserved<DominatorTreeWrapperPass>();
- AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<AAResultsWrapperPass>();
- AU.addPreserved<BasicAAWrapperPass>();
- FunctionPass::getAnalysisUsage(AU);
- }
-
- bool runOnFunction(Function &F) override {
- if (skipFunction(F))
- return false;
- auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- VectorCombine Combiner(F, TTI, DT);
- return Combiner.run();
- }
-};
-} // namespace
-
-char VectorCombineLegacyPass::ID = 0;
-INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false,
- false)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
- "Optimize scalar/vector ops", false, false)
-Pass *llvm::createVectorCombinePass() {
- return new VectorCombineLegacyPass();
-}
-
-PreservedAnalyses VectorCombinePass::run(Function &F,
- FunctionAnalysisManager &FAM) {
- TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
- DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
- VectorCombine Combiner(F, TTI, DT);
- if (!Combiner.run())
- return PreservedAnalyses::all();
- PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
- PA.preserve<GlobalsAA>();
- PA.preserve<AAManager>();
- PA.preserve<BasicAA>();
- return PA;
-}
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldBitcastShuf(I);
+ MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+class VectorCombineLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ VectorCombineLegacyPass() : FunctionPass(ID) {
+ initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ VectorCombine Combiner(F, TTI, DT);
+ return Combiner.run();
+ }
+};
+} // namespace
+
+char VectorCombineLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false, false)
+Pass *llvm::createVectorCombinePass() {
+ return new VectorCombineLegacyPass();
+}
+
+PreservedAnalyses VectorCombinePass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ VectorCombine Combiner(F, TTI, DT);
+ if (!Combiner.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ return PA;
+}
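
For reference, a minimal sketch of invoking this pass from C++ through the new pass manager; it assumes `F` is a Function and `FAM` is a FunctionAnalysisManager already populated with the analyses queried in run() above (TargetIRAnalysis, DominatorTreeAnalysis):

    #include "llvm/IR/PassManager.h"
    #include "llvm/Transforms/Vectorize/VectorCombine.h"

    // Sketch only: returns true if VectorCombine reported a change to F.
    static bool runVectorCombineOnce(llvm::Function &F,
                                     llvm::FunctionAnalysisManager &FAM) {
      llvm::FunctionPassManager FPM;
      FPM.addPass(llvm::VectorCombinePass());
      return !FPM.run(F, FAM).areAllPreserved();
    }

From the command line, the legacy wrapper registered above is exposed as -vector-combine; under the new pass manager the same transform should be reachable as -passes=vector-combine, though that registration is not part of this diff.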
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp b/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
index 7d3314b0d2..0296a995ad 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,42 +1,42 @@
-//===-- Vectorize.cpp -----------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
-// implements several vectorization transformations over the LLVM intermediate
-// representation, including the C bindings for that library.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm-c/Initialization.h"
-#include "llvm-c/Transforms/Vectorize.h"
-#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/InitializePasses.h"
-
-using namespace llvm;
-
-/// Initialize all passes linked into the Vectorization library.
-void llvm::initializeVectorization(PassRegistry &Registry) {
- initializeLoopVectorizePass(Registry);
- initializeSLPVectorizerPass(Registry);
- initializeLoadStoreVectorizerLegacyPassPass(Registry);
- initializeVectorCombineLegacyPassPass(Registry);
-}
-
-void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
- initializeVectorization(*unwrap(R));
-}
-
-void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createLoopVectorizePass());
-}
-
-void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
- unwrap(PM)->add(createSLPVectorizerPass());
-}
+//===-- Vectorize.cpp -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements common infrastructure for libLLVMVectorizeOpts.a, which
+// implements several vectorization transformations over the LLVM intermediate
+// representation, including the C bindings for that library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm-c/Initialization.h"
+#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+/// Initialize all passes linked into the Vectorization library.
+void llvm::initializeVectorization(PassRegistry &Registry) {
+ initializeLoopVectorizePass(Registry);
+ initializeSLPVectorizerPass(Registry);
+ initializeLoadStoreVectorizerLegacyPassPass(Registry);
+ initializeVectorCombineLegacyPassPass(Registry);
+}
+
+void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
+ initializeVectorization(*unwrap(R));
+}
+
+void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createLoopVectorizePass());
+}
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+ unwrap(PM)->add(createSLPVectorizerPass());
+}
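
A minimal usage sketch for the C bindings above; `M` is a hypothetical LLVMModuleRef, and the legacy pass manager C API from llvm-c/Core.h is assumed:

    #include "llvm-c/Core.h"
    #include "llvm-c/Transforms/Vectorize.h"

    // Sketch: schedule the vectorizers on a module via the legacy C API.
    static void vectorizeModule(LLVMModuleRef M) {
      LLVMPassManagerRef PM = LLVMCreatePassManager();
      LLVMAddLoopVectorizePass(PM); // declared in llvm-c/Transforms/Vectorize.h
      LLVMAddSLPVectorizePass(PM);
      LLVMRunPassManager(PM, M);    // runs all scheduled passes over M
      LLVMDisposePassManager(PM);
    }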
diff --git a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
index df7cc36ebe..a68c667bde 100644
--- a/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
+++ b/contrib/libs/llvm12/lib/Transforms/Vectorize/ya.make
@@ -1,46 +1,46 @@
-# Generated by devtools/yamaker.
-
-LIBRARY()
-
+# Generated by devtools/yamaker.
+
+LIBRARY()
+
OWNER(
orivej
g:cpp-contrib
)
-
+
LICENSE(Apache-2.0 WITH LLVM-exception)
LICENSE_TEXTS(.yandex_meta/licenses.list.txt)
-PEERDIR(
+PEERDIR(
contrib/libs/llvm12
contrib/libs/llvm12/include
contrib/libs/llvm12/lib/Analysis
contrib/libs/llvm12/lib/IR
contrib/libs/llvm12/lib/Support
contrib/libs/llvm12/lib/Transforms/Utils
-)
-
+)
+
ADDINCL(
contrib/libs/llvm12/lib/Transforms/Vectorize
)
-
-NO_COMPILER_WARNINGS()
-
-NO_UTIL()
-
-SRCS(
- LoadStoreVectorizer.cpp
- LoopVectorizationLegality.cpp
- LoopVectorize.cpp
- SLPVectorizer.cpp
- VPlan.cpp
- VPlanHCFGBuilder.cpp
- VPlanPredicator.cpp
- VPlanSLP.cpp
- VPlanTransforms.cpp
- VPlanVerifier.cpp
- VectorCombine.cpp
- Vectorize.cpp
-)
-
-END()
+
+NO_COMPILER_WARNINGS()
+
+NO_UTIL()
+
+SRCS(
+ LoadStoreVectorizer.cpp
+ LoopVectorizationLegality.cpp
+ LoopVectorize.cpp
+ SLPVectorizer.cpp
+ VPlan.cpp
+ VPlanHCFGBuilder.cpp
+ VPlanPredicator.cpp
+ VPlanSLP.cpp
+ VPlanTransforms.cpp
+ VPlanVerifier.cpp
+ VectorCombine.cpp
+ Vectorize.cpp
+)
+
+END()
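
For context, a consumer of this library inside the same build system would declare the dependency with the PEERDIR convention used above; the path below mirrors this module and is shown only as an illustration:

    PEERDIR(
        contrib/libs/llvm12/lib/Transforms/Vectorize
    )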